diff --git a/.asf.yaml b/.asf.yaml
index aab8c1e6df2df..b719a495bd735 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -41,6 +41,7 @@ github:
     - sql
   enabled_merge_buttons:
     squash: true
+    squash_commit_message: PR_TITLE_AND_DESC
     merge: false
     rebase: false
   features:
@@ -50,11 +51,29 @@ github:
     main:
       required_pull_request_reviews:
         required_approving_review_count: 1
+    # needs to be updated as part of the release process
+    # .asf.yaml doesn't support wildcard branch protection rules, only exact branch names
+    # https://github.com/apache/infrastructure-asfyaml?tab=readme-ov-file#branch-protection
+    # these branches protection blocks autogenerated during release process which is described in
+    # https://github.com/apache/datafusion/tree/main/dev/release#2-add-a-protection-to-release-candidate-branch
+    branch-50:
+      required_pull_request_reviews:
+        required_approving_review_count: 1
+    branch-51:
+      required_pull_request_reviews:
+        required_approving_review_count: 1
+    branch-52:
+      required_pull_request_reviews:
+        required_approving_review_count: 1
   pull_requests:
     # enable updating head branches of pull requests
     allow_update_branch: true
+    allow_auto_merge: true
+    # auto-delete head branches after being merged
+    del_branch_on_merge: true
 
 # publishes the content of the `asf-site` branch to
 # https://datafusion.apache.org/
 publish:
   whoami: asf-site
+
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 9dd627b01abed..49aacd118e19b 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -4,10 +4,12 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
     # Remove imagemagick due to https://security-tracker.debian.org/tracker/CVE-2019-10131
     && apt-get purge -y imagemagick imagemagick-6-common
 
-# Add protoc
-# https://datafusion.apache.org/contributor-guide/getting_started.html#protoc-installation
-RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v25.1/protoc-25.1-linux-x86_64.zip \
-    && unzip protoc-25.1-linux-x86_64.zip -d $HOME/.local \
-    && rm protoc-25.1-linux-x86_64.zip
+# setup the containers WORKDIR so npm install works
+# https://stackoverflow.com/questions/57534295/npm-err-tracker-idealtree-already-exists-while-creating-the-docker-image-for
+WORKDIR /root
 
-ENV PATH="$PATH:$HOME/.local/bin"
\ No newline at end of file
+# Add protoc, npm, prettier
+# https://datafusion.apache.org/contributor-guide/development_environment.html#protoc-installation
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends protobuf-compiler libprotobuf-dev npm nodejs\
+    && rm -rf /var/lib/apt/lists/*
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index a886cbd74c23a..ac5f082113117 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -1,5 +1,6 @@
 name: Bug report
 description: Create a report to help us improve
+type: Bug
 labels: bug
 body:
   - type: textarea
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
index 2542b28dcae8a..955e59d74d08b 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -1,5 +1,6 @@
 name: Feature request
 description: Suggest an idea for this project
+type: Feature
 labels: enhancement
 body:
   - type: textarea
diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml
index 22d2f2187dd07..6228370c955a9 100644
--- a/.github/actions/setup-builder/action.yaml
+++ b/.github/actions/setup-builder/action.yaml
@@ -46,3 +46,17 @@ runs:
       # https://github.com/actions/checkout/issues/766
       shell: bash
       run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+    - name: Remove unnecessary preinstalled software
+      shell: bash
+      run: |
+        echo "Disk space before cleanup:"
+        df -h 
+        apt-get clean
+        # remove tool cache: about 8.5GB (github has host /opt/hostedtoolcache mounted as /__t)
+        rm -rf /__t/* || true
+        # remove Haskell runtime: about 6.3GB (host /usr/local/.ghcup)
+        rm -rf /host/usr/local/.ghcup || true
+        # remove Android library: about 7.8GB (host /usr/local/lib/android)
+        rm -rf /host/usr/local/lib/android || true
+        echo "Disk space after cleanup:"
+        df -h
\ No newline at end of file
diff --git a/.github/actions/setup-macos-aarch64-builder/action.yaml b/.github/actions/setup-macos-aarch64-builder/action.yaml
index 288799a284b01..b62370447adea 100644
--- a/.github/actions/setup-macos-aarch64-builder/action.yaml
+++ b/.github/actions/setup-macos-aarch64-builder/action.yaml
@@ -44,6 +44,8 @@ runs:
         rustup default stable
         rustup component add rustfmt
     - name: Setup rust cache
-      uses: Swatinem/rust-cache@v2
+      uses: Swatinem/rust-cache@f13886b937689c021905a6b90929199931d60db1  # v2.8.1
+      with:
+           save-if: ${{ github.ref_name == 'main' }}
     - name: Configure rust runtime env
       uses: ./.github/actions/setup-rust-runtime        
diff --git a/.github/actions/setup-macos-builder/action.yaml b/.github/actions/setup-macos-builder/action.yaml
deleted file mode 100644
index fffdab160b043..0000000000000
--- a/.github/actions/setup-macos-builder/action.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Prepare Rust Builder for MacOS
-description: 'Prepare Rust Build Environment for MacOS'
-inputs:
-  rust-version:
-    description: 'version of rust to install (e.g. stable)'
-    required: true
-    default: 'stable'
-runs:
-  using: "composite"
-  steps:
-    - name: Install protobuf compiler
-      shell: bash
-      run: |
-        mkdir -p $HOME/d/protoc
-        cd $HOME/d/protoc
-        export PROTO_ZIP="protoc-29.1-osx-x86_64.zip"
-        curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.1/$PROTO_ZIP
-        unzip $PROTO_ZIP
-        echo "$HOME/d/protoc/bin" >> $GITHUB_PATH
-        export PATH=$PATH:$HOME/d/protoc/bin
-        protoc --version
-    - name: Setup Rust toolchain
-      shell: bash
-      run: |
-        rustup update stable
-        rustup toolchain install stable
-        rustup default stable
-        rustup component add rustfmt
-    - name: Configure rust runtime env
-      uses: ./.github/actions/setup-rust-runtime        
diff --git a/.github/actions/setup-rust-runtime/action.yaml b/.github/actions/setup-rust-runtime/action.yaml
index b6fb2c898bf2f..e0341de93b83d 100644
--- a/.github/actions/setup-rust-runtime/action.yaml
+++ b/.github/actions/setup-rust-runtime/action.yaml
@@ -20,10 +20,6 @@ description: 'Setup Rust Runtime Environment'
 runs:
   using: "composite"
   steps:
-    # https://github.com/apache/datafusion/issues/15535
-    # disabled because neither version nor git hash works with apache github policy
-    #- name: Run sccache-cache
-    #  uses: mozilla-actions/sccache-action@65101d47ea8028ed0c98a1cdea8dd9182e9b5133 # v0.0.8
     - name: Configure runtime env
       shell: bash
       # do not produce debug symbols to keep memory usage down
@@ -32,11 +28,6 @@ runs:
       # 
       # Set debuginfo=line-tables-only as debuginfo=0 causes immensely slow build
       # See for more details: https://github.com/rust-lang/rust/issues/119560
-      #
-      # readd the following to the run below once sccache-cache is re-enabled
-      # echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
-      # echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
       run: |
         echo "RUST_BACKTRACE=1" >> $GITHUB_ENV
         echo "RUSTFLAGS=-C debuginfo=line-tables-only -C incremental=false" >> $GITHUB_ENV
-     
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 7c2b7e3a5458c..2cd4bdfdd7923 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,9 +20,10 @@ updates:
   - package-ecosystem: cargo
     directory: "/"
     schedule:
-      interval: daily
+      interval: weekly
     target-branch: main
     labels: [auto-dependencies]
+    open-pull-requests-limit: 15
     ignore:
       # major version bumps of arrow* and parquet are handled manually
       - dependency-name: "arrow*"
@@ -44,9 +45,31 @@ updates:
         patterns:
           - "prost*"
           - "pbjson*"
+
+        # Catch-all: group only minor/patch into a single PR,
+        # excluding deps we want always separate (and excluding arrow/parquet which have their own group)
+      all-other-cargo-deps:
+        applies-to: version-updates
+        patterns:
+          - "*"
+        exclude-patterns:
+          - "arrow*"
+          - "parquet"
+          - "object_store"
+          - "sqlparser"
+          - "prost*"
+          - "pbjson*"
+        update-types:
+          - "minor"
+          - "patch"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "daily"
+      interval: "weekly"
     open-pull-requests-limit: 10
     labels: [auto-dependencies]
+  - package-ecosystem: "pip"
+    directory: "/docs"
+    schedule:
+      interval: "weekly"
+    labels: [auto-dependencies]
diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 491fa27c2a56a..281f600d6766a 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -23,25 +23,29 @@ concurrency:
 
 on:
   push:
+    branches:
+      - main
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
-    branches:
-      - main
 
   pull_request:
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
+  
+  merge_group:
 
 jobs:
   security_audit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Install cargo-audit
-        run: cargo install cargo-audit
+        uses: taiki-e/install-action@de6bbd1333b8f331563d54a051e542c7dfef81c3  # v2.68.34
+        with:
+          tool: cargo-audit
       - name: Run audit check
-        # Ignored until https://github.com/apache/datafusion/issues/15571
-        # ignored py03 warning until arrow 55 upgrade
-        run: cargo audit --ignore RUSTSEC-2024-0370 --ignore RUSTSEC-2025-0020
+        # Note: you can ignore specific RUSTSEC issues using the `--ignore` flag ,for example:
+        # run: cargo audit --ignore RUSTSEC-2026-0001
+        run: cargo audit --ignore RUSTSEC-2024-0014
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000000000..d42c2b4aa8d39
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+  schedule:
+    - cron: '16 4 * * 1'
+
+permissions:
+  contents: read
+
+jobs:
+  analyze:
+    name: Analyze Actions
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      security-events: write
+      packages: read
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
+
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4
+      with:
+        languages: actions
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@b1bff81932f5cdfc8695c7752dcee935dcd061c8 # v4
+      with:
+        category: "/language:actions"
diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml
index a577725fed4b9..3b2cc243d4967 100644
--- a/.github/workflows/dependencies.yml
+++ b/.github/workflows/dependencies.yml
@@ -23,6 +23,8 @@ concurrency:
 
 on:
   push:
+    branches-ignore:
+      - 'gh-readonly-queue/**'
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
@@ -30,6 +32,7 @@ on:
     paths:
       - "**/Cargo.toml"
       - "**/Cargo.lock"
+  merge_group:
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
@@ -41,7 +44,7 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -53,3 +56,14 @@ jobs:
         run: |
           cd dev/depcheck
           cargo run
+
+  detect-unused-dependencies:
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - name: Install cargo-machete
+        run: cargo install cargo-machete --version ^0.9 --locked
+      - name: Detect unused dependencies
+        run: cargo machete --with-metadata
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index aa4bd862e09e4..2fec343650914 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -16,7 +16,12 @@
 # under the License.
 
 name: Dev
-on: [push, pull_request]
+on:
+  push:
+    branches-ignore:
+      - 'gh-readonly-queue/**'
+  pull_request:
+  merge_group:
 
 concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
@@ -27,25 +32,36 @@ jobs:
     runs-on: ubuntu-latest
     name: Check License Header
     steps:
-      - uses: actions/checkout@v4
-      - uses: korandoru/hawkeye@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - name: Install HawkEye
+      # This CI job is bound by installation time, use `--profile dev` to speed it up
+        run: cargo install hawkeye --version 6.2.0 --locked --profile dev
+      - name: Run license header check
+        run: ci/scripts/license_header.sh
 
   prettier:
     name: Use prettier to check formatting of documents
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-node@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238  # v6.2.0
         with:
           node-version: "20"
       - name: Prettier check
-        run: |
-          # if you encounter error, rerun the command below and commit the changes
-          #
-          # ignore subproject CHANGELOG.md because they are machine generated
-          npx prettier@2.7.1 --write \
-            '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md' \
-            '!datafusion/CHANGELOG.md' \
-            README.md \
-            CONTRIBUTING.md
-          git diff --exit-code
+      # if you encounter error, see instructions inside the script
+        run: ci/scripts/doc_prettier_check.sh
+
+  typos:
+    name: Spell Check with Typos
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+      # Version fixed on purpose. It uses heuristics to detect typos, so upgrading 
+      # it may cause checks to fail more often.
+      # We can upgrade it manually once a while.
+      - name: Install typos-cli
+        run: cargo install typos-cli --locked --version 1.37.0
+      - name: Run typos check
+        run: ci/scripts/typos_check.sh
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 5f1b2c1395982..63add4dacc812 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -32,32 +32,31 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout docs sources
-        uses: actions/checkout@v4
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
 
       - name: Checkout asf-site branch
-        uses: actions/checkout@v4
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           ref: asf-site
           path: asf-site
 
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
+      - name: Setup uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7.6.0
 
       - name: Install dependencies
+        run: uv sync --package datafusion-docs
+      - name: Install dependency graph tooling
         run: |
           set -x
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r docs/requirements.txt
+          sudo apt-get update
+          sudo apt-get install -y graphviz
+          cargo install cargo-depgraph --version ^1.6 --locked
 
       - name: Build docs
         run: |
           set -x
-          source venv/bin/activate
           cd docs
-          ./build.sh
+          uv run --package datafusion-docs ./build.sh
 
       - name: Copy & push the generated HTML
         run: |
diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml
index 8d11cdf9d39bb..cc5b9a1e44bb5 100644
--- a/.github/workflows/docs_pr.yaml
+++ b/.github/workflows/docs_pr.yaml
@@ -40,24 +40,22 @@ jobs:
     name: Test doc build
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
+      - name: Setup uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7.6.0
       - name: Install doc dependencies
+        run: uv sync --package datafusion-docs
+      - name: Install dependency graph tooling
         run: |
           set -x
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r docs/requirements.txt
+          sudo apt-get update
+          sudo apt-get install -y graphviz
+          cargo install cargo-depgraph --version ^1.6 --locked
       - name: Build docs html and check for warnings
         run: |
           set -x
-          source venv/bin/activate
           cd docs
-          ./build.sh # fails on errors
-
+          uv run --package datafusion-docs ./build.sh # fails on errors
diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml
index 0ccecfc44fd64..c2aa96d92edc0 100644
--- a/.github/workflows/extended.yml
+++ b/.github/workflows/extended.yml
@@ -32,6 +32,19 @@ on:
   push:
     branches:
       - main
+      # support extended test suite for release candidate branches,
+      # it is not expected to have many changes in these branches,
+      # so running extended tests is not a burden
+      - 'branch-*'
+  # Also run for changes to some critical areas that are most likely
+  # to trigger errors in extended tests
+  pull_request:
+    branches: [ '**' ]
+    paths:
+      - 'datafusion/physical*/**/*.rs'
+      - 'datafusion/expr*/**/*.rs'
+      - 'datafusion/optimizer/**/*.rs'
+      - 'datafusion-testing'
   workflow_dispatch:
     inputs:
       pr_number:
@@ -53,10 +66,11 @@ jobs:
   # Check crate compiles and base cargo check passes
   linux-build-lib:
     name: linux build test
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     # note: do not use amd/rust container to preserve disk space
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
           submodules: true
@@ -67,7 +81,9 @@ jobs:
           source $HOME/.cargo/env
           rustup toolchain install
       - name: Install Protobuf Compiler
-        run: sudo apt-get install -y protobuf-compiler
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y protobuf-compiler
       - name: Prepare cargo build
         run: |
           cargo check --profile ci --all-targets
@@ -77,23 +93,27 @@ jobs:
   linux-test-extended:
     name: cargo test 'extended_tests' (amd64)
     needs: [linux-build-lib]
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion,spot=false', github.run_id) || 'ubuntu-latest' }}
+    # spot=false because the tests are long, https://runs-on.com/configuration/spot-instances/#disable-spot-pricing
     # note: do not use amd/rust container to preserve disk space
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
           submodules: true
           fetch-depth: 1
       - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be  # v1.3.1
       - name: Install Rust
         run: |
           curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
           source $HOME/.cargo/env
           rustup toolchain install
       - name: Install Protobuf Compiler
-        run: sudo apt-get install -y protobuf-compiler
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y protobuf-compiler
       # For debugging, test binaries can be large.
       - name: Show available disk space
         run: |
@@ -111,7 +131,7 @@ jobs:
             --lib \
             --tests \
             --bins \
-            --features avro,json,backtrace,extended_tests,recursive_protection
+            --features avro,json,backtrace,extended_tests,recursive_protection,parquet_encryption
       - name: Verify Working Directory Clean
         run: git diff --exit-code
       - name: Cleanup
@@ -120,11 +140,12 @@ jobs:
   # Check answers are correct when hash values collide
   hash-collisions:
     name: cargo test hash collisions (amd64)
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
           submodules: true
@@ -136,16 +157,18 @@ jobs:
       - name: Run tests
         run: |
           cd datafusion
-          cargo test  --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro
+          cargo test  --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --exclude datafusion-cli --workspace --lib --tests --features=force_hash_collisions,avro
           cargo clean
 
   sqllogictest-sqlite:
     name: "Run sqllogictests with the sqlite test suite"
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=48,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion,spot=false', github.run_id) || 'ubuntu-latest' }}
+    # spot=false because the tests are long, https://runs-on.com/configuration/spot-instances/#disable-spot-pricing
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
           submodules: true
@@ -156,47 +179,4 @@ jobs:
           rust-version: stable
       - name: Run sqllogictest
         run: |
-          cargo test --features backtrace --profile release-nonlto --test sqllogictests -- --include-sqlite
-          cargo clean
-
-  # If the workflow was triggered by the PR comment (through pr_comment_commands.yml action) we need to manually update check status to display in UI
-  update-check-status:
-    needs: [linux-build-lib, linux-test-extended, hash-collisions, sqllogictest-sqlite]
-    runs-on: ubuntu-latest
-    if: ${{ always() && github.event_name == 'workflow_dispatch' }}
-    steps:
-      - name: Determine workflow status
-        id: status
-        run: |
-          if [[ "${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
-            echo "workflow_status=failure" >> $GITHUB_OUTPUT
-            echo "conclusion=failure" >> $GITHUB_OUTPUT
-          else
-            echo "workflow_status=completed" >> $GITHUB_OUTPUT
-            echo "conclusion=success" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Update check run
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const workflowRunUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
-
-            await github.rest.checks.update({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              check_run_id: ${{ github.event.inputs.check_run_id }},
-              status: 'completed',
-              conclusion: '${{ steps.status.outputs.conclusion }}',
-              output: {
-                title: '${{ steps.status.outputs.conclusion == 'success' && 'Extended Tests Passed' || 'Extended Tests Failed' }}',
-                summary: `Extended tests have completed with status: ${{ steps.status.outputs.conclusion }}.\n\n[View workflow run](${workflowRunUrl})`
-              },
-              details_url: workflowRunUrl
-            });
-
-
-
-
-
+          cargo test --features backtrace,parquet_encryption --profile ci-optimized --test sqllogictests -- --include-sqlite
\ No newline at end of file
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 8b251552d3b2d..a575b39577477 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -39,14 +39,12 @@ jobs:
       contents: read
       pull-requests: write
     steps:
-      - uses: actions/checkout@v4
-
       - name: Assign GitHub labels
         if: |
           github.event_name == 'pull_request_target' &&
             (github.event.action == 'opened' ||
              github.event.action == 'synchronize')
-        uses: actions/labeler@v5.0.0
+        uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b  # v6.0.1
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           configuration-path: .github/workflows/labeler/labeler-config.yml
diff --git a/.github/workflows/labeler/labeler-config.yml b/.github/workflows/labeler/labeler-config.yml
index e408130725215..0e492b6f3f6dc 100644
--- a/.github/workflows/labeler/labeler-config.yml
+++ b/.github/workflows/labeler/labeler-config.yml
@@ -58,11 +58,11 @@ execution:
 
 datasource:
   - changed-files:
-      - any-glob-to-any-file: ['datafusion/datasource/**/*', 'datafusion/datasource-avro/**/*', 'datafusion/datasource-csv/**/*', 'datafusion/datasource-json/**/*', 'datafusion/datasource-parquet/**/*']
+      - any-glob-to-any-file: ['datafusion/datasource/**/*', 'datafusion/datasource-avro/**/*', 'datafusion/datasource-arrow/**/*', 'datafusion/datasource-csv/**/*', 'datafusion/datasource-json/**/*', 'datafusion/datasource-parquet/**/*']
 
 functions:
   - changed-files:
-      - any-glob-to-any-file: ['datafusion/functions/**/*', 'datafusion/functions-aggregate/**/*', 'datafusion/functions-aggregate-common', 'datafusion/functions-nested', 'datafusion/functions-table/**/*', 'datafusion/functions-window/**/*', 'datafusion/functions-window-common/**/*']
+      - any-glob-to-any-file: ['datafusion/functions/**/*', 'datafusion/functions-aggregate/**/*', 'datafusion/functions-aggregate-common/**/*', 'datafusion/functions-nested/**/*', 'datafusion/functions-table/**/*', 'datafusion/functions-window/**/*', 'datafusion/functions-window-common/**/*']
 
 
 optimizer:
diff --git a/.github/workflows/large_files.yml b/.github/workflows/large_files.yml
index aa96d55a0d851..12b7bae76ab32 100644
--- a/.github/workflows/large_files.yml
+++ b/.github/workflows/large_files.yml
@@ -23,12 +23,13 @@ concurrency:
 
 on:
   pull_request:
+  merge_group:
 
 jobs:
   check-files:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           fetch-depth: 0
       - name: Check size of new Git objects
@@ -38,7 +39,16 @@ jobs:
           MAX_FILE_SIZE_BYTES: 1048576
         shell: bash
         run: |
-          git rev-list --objects ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} \
+          if [ "${{ github.event_name }}" = "merge_group" ]; then
+            # For merge queue, compare against the base branch
+            base_sha="${{ github.event.merge_group.base_sha }}"
+            head_sha="${{ github.event.merge_group.head_sha }}"
+          else
+            # For pull requests
+            base_sha="${{ github.event.pull_request.base.sha }}"
+            head_sha="${{ github.event.pull_request.head.sha }}"
+          fi
+          git rev-list --objects ${base_sha}..${head_sha} \
             > pull-request-objects.txt
           exit_code=0
           while read -r id path; do
diff --git a/.github/workflows/pr_comment_commands.yml b/.github/workflows/pr_comment_commands.yml
deleted file mode 100644
index 6aa6caaf34d02..0000000000000
--- a/.github/workflows/pr_comment_commands.yml
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: PR commands
-
-on:
-  issue_comment:
-    types: [created]
-
-permissions:
-  contents: read
-  pull-requests: write
-  actions: write
-  checks: write
-
-jobs:
-  # Starts the extended_tests on a PR branch when someone leaves a `Run extended tests` comment
-  run_extended_tests:
-    runs-on: ubuntu-latest
-    if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && contains(github.event.comment.body, 'Run extended tests') }}
-    steps:
-      - name: Dispatch extended tests for a PR branch with comment
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            // Get PR details to fetch the branch name
-            const { data: pullRequest } = await github.rest.pulls.get({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                pull_number: context.payload.issue.number
-            });
-
-            // Extract the branch name
-            const branchName = pullRequest.head.ref;
-            const headSha = pullRequest.head.sha;
-            const workflowRunsUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions?query=workflow%3A%22Datafusion+extended+tests%22+branch%3A${branchName}`;
-
-            // Create a check run that links to the Actions tab so the run will be visible in GitHub UI
-            const check = await github.rest.checks.create({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: 'Extended Tests',
-              head_sha: headSha,
-              status: 'in_progress',
-              output: {
-                title: 'Extended Tests Running',
-                summary: `Extended tests have been triggered for this PR.\n\n[View workflow runs](${workflowRunsUrl})`
-              },
-              details_url: workflowRunsUrl
-            });
-
-            // Dispatch the workflow with the PR branch name
-            await github.rest.actions.createWorkflowDispatch({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              workflow_id: 'extended.yml',
-              ref: 'main',
-              inputs: {
-                pr_number: context.payload.issue.number.toString(),
-                check_run_id: check.data.id.toString(),
-                pr_head_sha: headSha
-              }
-            });
-
-      - name: Add reaction to comment
-        uses: actions/github-script@v7
-        with:
-          script: |
-            await github.rest.reactions.createForIssueComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              comment_id: context.payload.comment.id,
-              content: 'rocket'
-            });
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 2463b04b33738..f7452ee603b1c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# For some actions, we use Runs-On to run them on ASF infrastructure: https://datafusion.apache.org/contributor-guide/#ci-runners
+
 name: Rust
 
 concurrency:
@@ -23,6 +25,8 @@ concurrency:
 
 on:
   push:
+    branches-ignore:
+      - 'gh-readonly-queue/**'
     paths-ignore:
       - "docs/**"
       - "**.md"
@@ -34,31 +38,30 @@ on:
       - "**.md"
       - ".github/ISSUE_TEMPLATE/**"
       - ".github/pull_request_template.md"
+  merge_group:
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
 
 jobs:
-  # Check license header
-  license-header-check:
-    runs-on: ubuntu-latest
-    name: Check License Header
-    steps:
-      - uses: actions/checkout@v4
-      - uses: korandoru/hawkeye@v6
-
   # Check crate compiles and base cargo check passes
   linux-build-lib:
     name: linux build test
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          shared-key: "amd-ci-check" # this job uses it's own cache becase check has a separate cache and we need it to be fast as it blocks other jobs
+          save-if: ${{ github.ref_name == 'main' }}
       - name: Prepare cargo build
         run: |
           # Adding `--locked` here to assert that the `Cargo.lock` file is up to
@@ -77,7 +80,7 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -98,15 +101,20 @@ jobs:
   linux-datafusion-substrait-features:
     name: cargo check datafusion-substrait features
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          save-if: false # set in linux-test
+          shared-key: "amd-ci"
       - name: Check datafusion-substrait (default features)
         run: cargo check --profile ci --all-targets -p datafusion-substrait
         #
@@ -130,11 +138,12 @@ jobs:
   linux-datafusion-proto-features:
     name: cargo check datafusion-proto features
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -161,15 +170,21 @@ jobs:
   linux-cargo-check-datafusion:
     name: cargo check datafusion features
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          save-if: false # set in linux-test
+          shared-key: "amd-ci"
       - name: Check datafusion (default features)
         run: cargo check --profile ci --all-targets -p datafusion
       #
@@ -199,18 +214,20 @@ jobs:
         run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions
       - name: Check datafusion (parquet)
         run: cargo check --profile ci --no-default-features -p datafusion --features=parquet
-      - name: Check datafusion (pyarrow)
-        run: cargo check --profile ci --no-default-features -p datafusion --features=pyarrow
       - name: Check datafusion (regex_expressions)
         run: cargo check --profile ci --no-default-features -p datafusion --features=regex_expressions
       - name: Check datafusion (recursive_protection)
         run: cargo check --profile ci --no-default-features -p datafusion --features=recursive_protection
       - name: Check datafusion (serde)
         run: cargo check --profile ci --no-default-features -p datafusion --features=serde
+      - name: Check datafusion (sql)
+        run: cargo check --profile ci --no-default-features -p datafusion --features=sql
       - name: Check datafusion (string_expressions)
         run: cargo check --profile ci --no-default-features -p datafusion --features=string_expressions
       - name: Check datafusion (unicode_expressions)
         run: cargo check --profile ci --no-default-features -p datafusion --features=unicode_expressions
+      - name: Check parquet encryption (parquet_encryption)
+        run: cargo check --profile ci --no-default-features -p datafusion --features=parquet_encryption
 
   # Check datafusion-functions crate features
   #
@@ -223,7 +240,7 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -254,16 +271,26 @@ jobs:
   linux-test:
     name: cargo test (amd64)
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
+    container:
+      image: amd64/rust
+      volumes:
+        - /usr/local:/host/usr/local
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
       - name: Setup Rust toolchain
-        run: rustup toolchain install stable
-      - name: Install Protobuf Compiler
-        run: sudo apt-get install -y protobuf-compiler
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+         save-if: ${{ github.ref_name == 'main' }}
+         shared-key: "amd-ci"
       - name: Run tests (excluding doctests and datafusion-cli)
         env:
           RUST_BACKTRACE: 1
@@ -278,34 +305,37 @@ jobs:
             --lib \
             --tests \
             --bins \
-            --features serde,avro,json,backtrace,integration-tests
+            --features serde,avro,json,backtrace,integration-tests,parquet_encryption
       - name: Verify Working Directory Clean
         run: git diff --exit-code
+        # Check no temporary directories created during test.
+        # `false/` folder is excuded for rust cache.
+      - name: Verify Working Directory Clean (No Untracked Files)
+        run: |
+          STATUS="$(git status --porcelain | sed -e '/^?? false\/$/d' -e '/^?? false$/d')"
+          if [ -n "$STATUS" ]; then
+            echo "$STATUS"
+            exit 1
+          fi
 
   # datafusion-cli tests
   linux-test-datafusion-cli:
     name: cargo test datafusion-cli (amd64)
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
       - name: Setup Rust toolchain
         run: rustup toolchain install stable
-      - name: Setup Minio - S3-compatible storage
-        run: |
-          docker run -d --name minio-container \
-            -p 9000:9000 \
-            -e MINIO_ROOT_USER=TEST-DataFusionLogin -e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \
-            -v $(pwd)/datafusion/core/tests/data:/source quay.io/minio/minio \
-            server /data
-          docker exec minio-container /bin/sh -c "\
-            mc ready local
-            mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \
-            mc mb localminio/data && \
-            mc cp -r /source/* localminio/data"
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          save-if: false # set in linux-test
+          shared-key: "amd-ci"
       - name: Run tests (excluding doctests)
         env:
           RUST_BACKTRACE: 1
@@ -314,22 +344,20 @@ jobs:
           AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword
           TEST_STORAGE_INTEGRATION: 1
           AWS_ALLOW_HTTP: true
-        run: cargo test --profile ci -p datafusion-cli  --lib --tests --bins
+        run: cargo test --features backtrace --profile ci -p datafusion-cli  --lib --tests --bins
       - name: Verify Working Directory Clean
         run: git diff --exit-code
-      - name: Minio Output
-        if: ${{ !cancelled() }}
-        run: docker logs minio-container
 
 
   linux-test-example:
     name: cargo examples (amd64)
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -337,6 +365,11 @@ jobs:
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          save-if: ${{ github.ref_name == 'main' }}
+          shared-key: "amd-ci-linux-test-example"
       - name: Run examples
         run: |
           # test datafusion-sql examples
@@ -350,11 +383,12 @@ jobs:
   linux-test-doc:
     name: cargo test doc (amd64)
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -371,11 +405,12 @@ jobs:
   linux-rustdoc:
     name: cargo doc
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -387,7 +422,7 @@ jobs:
     name: build and run with wasm-pack
     runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup for wasm32
         run: |
           rustup target add wasm32-unknown-unknown
@@ -396,23 +431,27 @@ jobs:
           sudo apt-get update -qq
           sudo apt-get install -y -qq clang
       - name: Setup wasm-pack
-        run: |
-          cargo install wasm-pack
+        uses: taiki-e/install-action@de6bbd1333b8f331563d54a051e542c7dfef81c3  # v2.68.34
+        with:
+          tool: wasm-pack
       - name: Run tests with headless mode
         working-directory: ./datafusion/wasmtest
         run: |
-          RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack test --headless --firefox
-          RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack test --headless --chrome --chromedriver $CHROMEWEBDRIVER/chromedriver
+          # debuginfo=none because CI tests weren't completing successfully after this upstream PR:
+          # https://github.com/wasm-bindgen/wasm-bindgen/pull/4635
+          RUSTFLAGS='--cfg getrandom_backend="wasm_js" -C debuginfo=none' wasm-pack test --headless --firefox
+          RUSTFLAGS='--cfg getrandom_backend="wasm_js" -C debuginfo=none' wasm-pack test --headless --chrome --chromedriver $CHROMEWEBDRIVER/chromedriver
 
   # verify that the benchmark queries return the correct results
   verify-benchmark-results:
     name: verify benchmark results (amd64)
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -434,14 +473,14 @@ jobs:
           export RUST_MIN_STACK=20971520
           export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data`
           cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1
-          INCLUDE_TPCH=true cargo test --features backtrace --profile ci --package datafusion-sqllogictest --test sqllogictests
+          INCLUDE_TPCH=true cargo test --features backtrace,parquet_encryption --profile ci --package datafusion-sqllogictest --test sqllogictests
       - name: Verify Working Directory Clean
         run: git diff --exit-code
 
   sqllogictest-postgres:
     name: "Run sqllogictest with Postgres runner"
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     services:
@@ -459,7 +498,8 @@ jobs:
           --health-timeout 5s
           --health-retries 5
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -476,6 +516,29 @@ jobs:
           POSTGRES_HOST: postgres
           POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
 
+  sqllogictest-substrait:
+    name: "Run sqllogictest in Substrait round-trip mode"
+    needs: linux-build-lib
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
+    container:
+      image: amd64/rust
+    steps:
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Run sqllogictest
+        # TODO: Right now several tests are failing in Substrait round-trip mode, so this
+        #  command cannot be run for all the .slt files. Run it for just one that works (limit.slt)
+        #  until most of the tickets in https://github.com/apache/datafusion/issues/16248 are addressed
+        #  and this command can be run without filters.
+        run: cargo test --test sqllogictests -- --substrait-round-trip limit.slt
+
   #  Temporarily commenting out the Windows flow, the reason is enormously slow running build
   #  Waiting for new Windows 2025 github runner
   #  Details: https://github.com/apache/datafusion/issues/13726
@@ -495,27 +558,11 @@ jobs:
   #          export PATH=$PATH:$HOME/d/protoc/bin
   #          cargo test --lib --tests --bins --features avro,json,backtrace
 
-  # Commenting out intel mac build as so few users would ever use it
-  # Details: https://github.com/apache/datafusion/issues/13846
-  #  macos:
-  #    name: cargo test (macos)
-  #    runs-on: macos-latest
-  #    steps:
-  #      - uses: actions/checkout@v4
-  #        with:
-  #          submodules: true
-  #          fetch-depth: 1
-  #      - name: Setup Rust toolchain
-  #        uses: ./.github/actions/setup-macos-builder
-  #      - name: Run tests (excluding doctests)
-  #        shell: bash
-  #        run: cargo test run --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace
-
   macos-aarch64:
     name: cargo test (macos-aarch64)
-    runs-on: macos-14
+    runs-on: macos-15
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -525,37 +572,13 @@ jobs:
         shell: bash
         run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests
 
-  test-datafusion-pyarrow:
-    name: cargo test pyarrow (amd64)
-    needs: linux-build-lib
-    runs-on: ubuntu-latest
-    container:
-      image: amd64/rust:bullseye # Use the bullseye tag image which comes with python3.9
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-      - name: Install PyArrow
-        run: |
-          echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV
-          apt-get update
-          apt-get install python3-pip -y
-          python3 -m pip install pyarrow
-      - name: Setup Rust toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: stable
-      - name: Run datafusion-common tests
-        run: cargo test --profile ci -p datafusion-common --features=pyarrow
-
   vendor:
     name: Verify Vendored Code
     runs-on: ubuntu-latest
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -572,7 +595,7 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -627,11 +650,12 @@ jobs:
   clippy:
     name: clippy
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -641,6 +665,11 @@ jobs:
           rust-version: stable
       - name: Install Clippy
         run: rustup component add clippy
+      - name: Rust Dependency Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4  # v2.9.1
+        with:
+          save-if: ${{ github.ref_name == 'main' }}
+          shared-key: "amd-ci-clippy"
       - name: Run clippy
         run: ci/scripts/rust_clippy.sh
 
@@ -651,7 +680,7 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -668,11 +697,12 @@ jobs:
   config-docs-check:
     name: check configs.md and ***_functions.md is up-to-date
     needs: linux-build-lib
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
         with:
           submodules: true
           fetch-depth: 1
@@ -680,7 +710,7 @@ jobs:
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238  # v6.2.0
         with:
           node-version: "20"
       - name: Check if configs.md has been modified
@@ -693,11 +723,38 @@ jobs:
           # If you encounter an error, run './dev/update_function_docs.sh' and commit
           ./dev/update_function_docs.sh
           git diff --exit-code
-      - name: Check if runtime_configs.md has been modified
+
+# This job ensures `datafusion-examples/README.md` stays in sync with the source code:
+# 1. Generates README automatically using the Rust examples docs generator
+#    (parsing documentation from `examples/<group>/main.rs`)
+# 2. Formats the generated Markdown using DataFusion's standard Prettier setup
+# 3. Compares the result against the committed README.md and fails if out-of-date
+  examples-docs-check:
+    name: check example README is up-to-date
+    needs: linux-build-lib
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          submodules: true
+          fetch-depth: 1
+          
+      - name: Mark repository as safe for git
+        # Required for git commands inside container (avoids "dubious ownership" error)
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Set up Node.js (required for prettier)
+        # doc_prettier_check.sh uses npx to run prettier for Markdown formatting
+        uses: actions/setup-node@v6
+        with:
+          node-version: '18'
+
+      - name: Run examples docs check script
         run: |
-          # If you encounter an error, run './dev/update_runtime_config_docs.sh' and commit
-          ./dev/update_runtime_config_docs.sh
-          git diff --exit-code
+          bash ci/scripts/check_examples_docs.sh
 
   # Verify MSRV for the crates which are directly used by other projects:
   # - datafusion
@@ -710,11 +767,14 @@ jobs:
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
       - name: Install cargo-msrv
-        run: cargo install cargo-msrv
+        uses: taiki-e/install-action@de6bbd1333b8f331563d54a051e542c7dfef81c3  # v2.68.34
+        with:
+          tool: cargo-msrv
+
       - name: Check datafusion
         working-directory: datafusion/core
         run: |
@@ -724,10 +784,15 @@ jobs:
           # `rust-version` key of `Cargo.toml`.
           #
           # To reproduce:
-          # 1. Install the version of Rust that is failing. Example:
-          #    rustup install 1.80.1
-          # 2. Run the command that failed with that version. Example:
-          #    cargo +1.80.1 check -p datafusion
+          # 1. Install the version of Rust that is failing.
+          # 2. Run the command that failed with that version.
+          #
+          # Example:
+          #    # MSRV looks like "1.80.0" and is specified in Cargo.toml. We can read the value with the following command:
+          #    msrv="$(cargo metadata --format-version=1 | jq '.packages[] | select( .name == "datafusion" ) | .rust_version' -r)"
+          #    echo "MSRV: ${msrv}"
+          #    rustup install "${msrv}"
+          #    cargo "+${msrv}" check
           #
           # To resolve, either:
           # 1. Change your code to use older Rust features,
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 2312526824a91..ec7f54ec24dbc 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -27,7 +27,7 @@ jobs:
       issues: write
       pull-requests: write
     steps:
-      - uses: actions/stale@v9
+      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f  # v10.2.0
         with:
           stale-pr-message: "Thank you for your contribution. Unfortunately, this pull request is stale because it has been open 60 days with no activity. Please remove the stale label or comment or this will be closed in 7 days."
           days-before-pr-stale: 60
diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml
index 86dc190add1d1..ffb5f728e04c1 100644
--- a/.github/workflows/take.yml
+++ b/.github/workflows/take.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Assign the issue via a `take` comment
+name: Assign/unassign the issue via `take` or `untake` comment
 on:
   issue_comment:
     types: created
@@ -26,16 +26,30 @@ permissions:
 jobs:
   issue_assign:
     runs-on: ubuntu-latest
-    if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
+    if: (!github.event.issue.pull_request) && (github.event.comment.body == 'take' || github.event.comment.body == 'untake')
     concurrency:
       group: ${{ github.actor }}-issue-assign
     steps:
-      - run: |
-          CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s)
-          if [ "$CODE" -eq "204" ]
+      - name: Take or untake issue
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          USER_LOGIN: ${{ github.event.comment.user.login }}
+          REPO: ${{ github.repository }}
+          TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [ "$COMMENT_BODY" == "take" ]
           then
-            echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
-            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
-          else
-            echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
+            CODE=$(curl -H "Authorization: token $TOKEN" -LI https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees/$USER_LOGIN -o /dev/null -w '%{http_code}\n' -s)
+            if [ "$CODE" -eq "204" ]
+            then
+              echo "Assigning issue $ISSUE_NUMBER to $USER_LOGIN"
+              curl -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees
+            else
+              echo "Cannot assign issue $ISSUE_NUMBER to $USER_LOGIN"
+            fi
+          elif [ "$COMMENT_BODY" == "untake" ]
+          then
+            echo "Unassigning issue $ISSUE_NUMBER from $USER_LOGIN"
+            curl -X DELETE -H "Authorization: token $TOKEN" -H "Content-Type: application/json" -d "{\"assignees\": [\"$USER_LOGIN\"]}" https://api.github.com/repos/$REPO/issues/$ISSUE_NUMBER/assignees
           fi
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 4ae32925d908e..8466a72adaec8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@ docker_cache
 *.orig
 .*.swp
 .*.swo
+*.pending-snap
 
 venv/*
 
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000000000..eeedbd8bc45ec
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,34 @@
+# Agent Guidelines for Apache DataFusion
+
+## Developer Documentation
+
+- [Contributor Guide](docs/source/contributor-guide/index.md)
+- [Architecture Guide](docs/source/contributor-guide/architecture.md)
+
+## Before Committing
+
+Before committing any changes, you **must** run the following checks and fix any issues:
+
+```bash
+cargo fmt --all
+cargo clippy --all-targets --all-features -- -D warnings
+```
+
+- `cargo fmt` ensures consistent code formatting across the project.
+- `cargo clippy` catches common mistakes and enforces idiomatic Rust patterns. All warnings must be resolved (treated as errors via `-D warnings`).
+
+Do not commit code that fails either of these checks.
+
+## Testing
+
+Run relevant tests before submitting changes:
+
+```bash
+cargo test --all-features
+```
+
+For SQL logic tests:
+
+```bash
+cargo test -p datafusion-sqllogictest
+```
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 120000
index 0000000000000..47dc3e3d863cf
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index f918b3ae2663d..5cef3742dfd18 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "abi_stable"
@@ -14,7 +14,7 @@ dependencies = [
  "core_extensions",
  "crossbeam-channel",
  "generational-arena",
- "libloading 0.7.4",
+ "libloading",
  "lock_api",
  "parking_lot",
  "paste",
@@ -50,37 +50,11 @@ dependencies = [
  "core_extensions",
 ]
 
-[[package]]
-name = "addr2line"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
-dependencies = [
- "gimli",
-]
-
 [[package]]
 name = "adler2"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
-
-[[package]]
-name = "adler32"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
-
-[[package]]
-name = "ahash"
-version = "0.7.8"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9"
-dependencies = [
- "getrandom 0.2.16",
- "once_cell",
- "version_check",
-]
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
 
 [[package]]
 name = "ahash"
@@ -90,7 +64,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
 dependencies = [
  "cfg-if",
  "const-random",
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "once_cell",
  "version_check",
  "zerocopy",
@@ -98,9 +72,9 @@ dependencies = [
 
 [[package]]
 name = "aho-corasick"
-version = "1.1.3"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
 dependencies = [
  "memchr",
 ]
@@ -121,16 +95,19 @@ dependencies = [
 ]
 
 [[package]]
-name = "allocator-api2"
-version = "0.2.21"
+name = "alloca"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4"
+dependencies = [
+ "cc",
+]
 
 [[package]]
-name = "android-tzdata"
-version = "0.1.1"
+name = "allocator-api2"
+version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
 
 [[package]]
 name = "android_system_properties"
@@ -149,12 +126,27 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
 [[package]]
 name = "anstream"
-version = "0.6.18"
+version = "0.6.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
+dependencies = [
+ "anstyle",
+ "anstyle-parse 0.2.7",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
 dependencies = [
  "anstyle",
- "anstyle-parse",
+ "anstyle-parse 1.0.0",
  "anstyle-query",
  "anstyle-wincon",
  "colorchoice",
@@ -164,74 +156,92 @@ dependencies = [
 
 [[package]]
 name = "anstyle"
-version = "1.0.10"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
 
 [[package]]
 name = "anstyle-parse"
-version = "0.2.6"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
 dependencies = [
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle-query"
-version = "1.1.2"
+version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "anstyle-wincon"
-version = "3.0.7"
+version = "3.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
- "once_cell",
- "windows-sys 0.59.0",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "anyhow"
-version = "1.0.98"
+version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
 
 [[package]]
 name = "apache-avro"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13"
+checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf"
 dependencies = [
  "bigdecimal",
- "bzip2 0.4.4",
+ "bon",
+ "bzip2",
  "crc32fast",
  "digest",
- "libflate",
+ "liblzma",
  "log",
+ "miniz_oxide",
  "num-bigint",
  "quad-rand",
- "rand 0.8.5",
+ "rand 0.9.2",
  "regex-lite",
  "serde",
  "serde_bytes",
  "serde_json",
  "snap",
- "strum",
- "strum_macros",
- "thiserror 1.0.69",
- "typed-builder",
+ "strum 0.27.2",
+ "strum_macros 0.27.2",
+ "thiserror",
  "uuid",
- "xz2",
  "zstd",
 ]
 
+[[package]]
+name = "ar_archive_writer"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b"
+dependencies = [
+ "object",
+]
+
 [[package]]
 name = "arrayref"
 version = "0.3.9"
@@ -246,9 +256,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87"
+checksum = "602268ce9f569f282cedb9a9f6bac569b680af47b9b077d515900c03c5d190da"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -264,61 +274,64 @@ dependencies = [
  "arrow-select",
  "arrow-string",
  "half",
- "pyo3",
- "rand 0.9.1",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "arrow-arith"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575"
+checksum = "cd53c6bf277dea91f136ae8e3a5d7041b44b5e489e244e637d00ae302051f56f"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
  "chrono",
- "num",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-array"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f"
+checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e"
 dependencies = [
- "ahash 0.8.12",
+ "ahash",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
  "chrono",
  "chrono-tz",
  "half",
- "hashbrown 0.15.3",
- "num",
+ "hashbrown 0.16.1",
+ "num-complex",
+ "num-integer",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-buffer"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce"
+checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20"
 dependencies = [
  "bytes",
  "half",
- "num",
+ "num-bigint",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-cast"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff"
+checksum = "89fb245db6b0e234ed8e15b644edb8664673fefe630575e94e62cd9d489a8a26"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
+ "arrow-ord",
  "arrow-schema",
  "arrow-select",
  "atoi",
@@ -327,15 +340,15 @@ dependencies = [
  "comfy-table",
  "half",
  "lexical-core",
- "num",
+ "num-traits",
  "ryu",
 ]
 
 [[package]]
 name = "arrow-csv"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a"
+checksum = "d374882fb465a194462527c0c15a93aa19a554cf690a6b77a26b2a02539937a7"
 dependencies = [
  "arrow-array",
  "arrow-cast",
@@ -343,27 +356,27 @@ dependencies = [
  "chrono",
  "csv",
  "csv-core",
- "lazy_static",
  "regex",
 ]
 
 [[package]]
 name = "arrow-data"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d"
+checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
  "half",
- "num",
+ "num-integer",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-flight"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91efc67a4f5a438833dd76ef674745c80f6f6b9a428a3b440cbfbf74e32867e6"
+checksum = "b4f5cdf00ee0003ba0768d3575d0afc47d736b29673b14c3c228fdffa9a3fb29"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -384,27 +397,30 @@ dependencies = [
  "prost",
  "prost-types",
  "tonic",
+ "tonic-prost",
 ]
 
 [[package]]
 name = "arrow-ipc"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e"
+checksum = "7968c2e5210c41f4909b2ef76f6e05e172b99021c2def5edf3cc48fdd39d1d6c"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
+ "arrow-select",
  "flatbuffers",
  "lz4_flex",
+ "zstd",
 ]
 
 [[package]]
 name = "arrow-json"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033"
+checksum = "92111dba5bf900f443488e01f00d8c4ddc2f47f5c50039d18120287b580baa22"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -413,20 +429,22 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half",
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
+ "itoa",
  "lexical-core",
  "memchr",
- "num",
- "serde",
+ "num-traits",
+ "ryu",
+ "serde_core",
  "serde_json",
  "simdutf8",
 ]
 
 [[package]]
 name = "arrow-ord"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7"
+checksum = "211136cb253577ee1a6665f741a13136d4e563f64f5093ffd6fb837af90b9495"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -437,9 +455,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8"
+checksum = "8e0f20145f9f5ea3fe383e2ba7a7487bf19be36aa9dbf5dd6a1f92f657179663"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -450,34 +468,35 @@ dependencies = [
 
 [[package]]
 name = "arrow-schema"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b"
+checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "serde",
+ "serde_core",
  "serde_json",
 ]
 
 [[package]]
 name = "arrow-select"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4"
+checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325"
 dependencies = [
- "ahash 0.8.12",
+ "ahash",
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
- "num",
+ "num-traits",
 ]
 
 [[package]]
 name = "arrow-string"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd"
+checksum = "e1eab1208bc4fe55d768cdc9b9f3d9df5a794cdb3ee2586bf89f9b30dc31ad8c"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -485,7 +504,7 @@ dependencies = [
  "arrow-schema",
  "arrow-select",
  "memchr",
- "num",
+ "num-traits",
  "regex",
  "regex-syntax",
 ]
@@ -503,36 +522,31 @@ dependencies = [
 ]
 
 [[package]]
-name = "assert_cmd"
-version = "2.0.17"
+name = "astral-tokio-tar"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bd389a4b2970a01282ee455294913c0a43724daedcd1a24c3eb0ec1c1320b66"
+checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5"
 dependencies = [
- "anstyle",
- "bstr",
- "doc-comment",
+ "filetime",
+ "futures-core",
  "libc",
- "predicates",
- "predicates-core",
- "predicates-tree",
- "wait-timeout",
+ "portable-atomic",
+ "rustc-hash",
+ "tokio",
+ "tokio-stream",
+ "xattr",
 ]
 
 [[package]]
 name = "async-compression"
-version = "0.4.19"
+version = "0.4.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c"
+checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1"
 dependencies = [
- "bzip2 0.5.2",
- "flate2",
- "futures-core",
- "memchr",
+ "compression-codecs",
+ "compression-core",
  "pin-project-lite",
  "tokio",
- "xz2",
- "zstd",
- "zstd-safe",
 ]
 
 [[package]]
@@ -552,7 +566,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -574,18 +588,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "async-trait"
-version = "0.1.88"
+version = "0.1.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -605,15 +619,15 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
 
 [[package]]
 name = "autocfg"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "aws-config"
-version = "1.6.3"
+version = "1.8.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02a18fd934af6ae7ca52410d4548b98eb895aab0f1ea417d168d85db1434a141"
+checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -630,8 +644,8 @@ dependencies = [
  "bytes",
  "fastrand",
  "hex",
- "http 1.3.1",
- "ring",
+ "http 1.4.0",
+ "sha1",
  "time",
  "tokio",
  "tracing",
@@ -641,9 +655,9 @@ dependencies = [
 
 [[package]]
 name = "aws-credential-types"
-version = "1.2.3"
+version = "1.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "687bc16bc431a8533fe0097c7f0182874767f920989d7260950172ae8e3c4465"
+checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
@@ -653,9 +667,9 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-rs"
-version = "1.13.1"
+version = "1.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93fcc8f365936c834db5514fc45aee5b1202d677e6b40e48468aaaa8183ca8c7"
+checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf"
 dependencies = [
  "aws-lc-sys",
  "zeroize",
@@ -663,11 +677,10 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-sys"
-version = "0.29.0"
+version = "0.38.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61b1d86e7705efe1be1b569bab41d4fa1e14e220b60a160f78de2db687add079"
+checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e"
 dependencies = [
- "bindgen",
  "cc",
  "cmake",
  "dunce",
@@ -676,9 +689,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.5.7"
+version = "1.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c4063282c69991e57faab9e5cb21ae557e59f5b0fb285c196335243df8dc25c"
+checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -689,9 +702,10 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
+ "bytes-utils",
  "fastrand",
- "http 0.2.12",
- "http-body 0.4.6",
+ "http 1.4.0",
+ "http-body 1.0.1",
  "percent-encoding",
  "pin-project-lite",
  "tracing",
@@ -700,15 +714,16 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.70.0"
+version = "1.96.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83447efb7179d8e2ad2afb15ceb9c113debbc2ecdf109150e338e2e28b86190b"
+checksum = "f64a6eded248c6b453966e915d32aeddb48ea63ad17932682774eb026fbef5b1"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -716,21 +731,23 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.71.0"
+version = "1.98.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f9bfbbda5e2b9fe330de098f14558ee8b38346408efe9f2e9cee82dc1636a4"
+checksum = "db96d720d3c622fcbe08bae1c4b04a72ce6257d8b0584cb5418da00ae20a344f"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -738,21 +755,23 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.71.0"
+version = "1.100.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e17b984a66491ec08b4f4097af8911251db79296b3e4a763060b45805746264f"
+checksum = "fafbdda43b93f57f699c5dfe8328db590b967b8a820a13ccdd6687355dfcc7ca"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-query",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
@@ -761,15 +780,16 @@ dependencies = [
  "aws-types",
  "fastrand",
  "http 0.2.12",
+ "http 1.4.0",
  "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sigv4"
-version = "1.3.2"
+version = "1.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3734aecf9ff79aa401a6ca099d076535ab465ff76b46440cf567c8e70b65dc13"
+checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-http",
@@ -780,7 +800,7 @@ dependencies = [
  "hex",
  "hmac",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "percent-encoding",
  "sha2",
  "time",
@@ -789,9 +809,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "1.2.5"
+version = "1.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c"
+checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -800,18 +820,19 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.62.1"
+version = "0.63.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99335bec6cdc50a346fda1437f9fefe33abf8c99060739a546a16457f2862ca9"
+checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231"
 dependencies = [
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "bytes",
  "bytes-utils",
  "futures-core",
- "http 0.2.12",
- "http 1.3.1",
- "http-body 0.4.6",
+ "futures-util",
+ "http 1.4.0",
+ "http-body 1.0.1",
+ "http-body-util",
  "percent-encoding",
  "pin-project-lite",
  "pin-utils",
@@ -820,15 +841,15 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http-client"
-version = "1.0.2"
+version = "1.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e44697a9bded898dcd0b1cb997430d949b87f4f8940d91023ae9062bf218250"
+checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "h2",
- "http 1.3.1",
+ "http 1.4.0",
  "hyper",
  "hyper-rustls",
  "hyper-util",
@@ -837,33 +858,34 @@ dependencies = [
  "rustls-native-certs",
  "rustls-pki-types",
  "tokio",
- "tower 0.5.2",
+ "tokio-rustls",
+ "tower",
  "tracing",
 ]
 
 [[package]]
 name = "aws-smithy-json"
-version = "0.61.3"
+version = "0.62.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92144e45819cae7dc62af23eac5a038a58aa544432d2102609654376a900bd07"
+checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a"
 dependencies = [
  "aws-smithy-types",
 ]
 
 [[package]]
 name = "aws-smithy-observability"
-version = "0.1.3"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9364d5989ac4dd918e5cc4c4bdcc61c9be17dcd2586ea7f69e348fc7c6cab393"
+checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c"
 dependencies = [
  "aws-smithy-runtime-api",
 ]
 
 [[package]]
 name = "aws-smithy-query"
-version = "0.60.7"
+version = "0.60.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
+checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd"
 dependencies = [
  "aws-smithy-types",
  "urlencoding",
@@ -871,9 +893,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.8.3"
+version = "1.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14302f06d1d5b7d333fd819943075b13d27c7700b414f574c3c35859bfb55d5e"
+checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -884,9 +906,10 @@ dependencies = [
  "bytes",
  "fastrand",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 0.4.6",
  "http-body 1.0.1",
+ "http-body-util",
  "pin-project-lite",
  "pin-utils",
  "tokio",
@@ -895,15 +918,15 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.8.0"
+version = "1.11.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1e5d9e3a80a18afa109391fb5ad09c3daf887b516c6fd805a157c6ea7994a57"
+checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
  "bytes",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "pin-project-lite",
  "tokio",
  "tracing",
@@ -912,15 +935,15 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.3.1"
+version = "1.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40076bd09fadbc12d5e026ae080d0930defa606856186e31d83ccc6a255eeaf3"
+checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c"
 dependencies = [
  "base64-simd",
  "bytes",
  "bytes-utils",
  "http 0.2.12",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 0.4.6",
  "http-body 1.0.1",
  "http-body-util",
@@ -935,18 +958,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.9"
+version = "0.60.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc"
+checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "1.3.7"
+version = "1.3.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a322fec39e4df22777ed3ad8ea868ac2f94cd15e1a55f6ee8d8d6305057689a"
+checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
@@ -958,15 +981,14 @@ dependencies = [
 
 [[package]]
 name = "axum"
-version = "0.7.9"
+version = "0.8.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
 dependencies = [
- "async-trait",
  "axum-core",
  "bytes",
  "futures-util",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "itoa",
@@ -975,49 +997,31 @@ dependencies = [
  "mime",
  "percent-encoding",
  "pin-project-lite",
- "rustversion",
- "serde",
+ "serde_core",
  "sync_wrapper",
- "tower 0.5.2",
+ "tower",
  "tower-layer",
  "tower-service",
 ]
 
 [[package]]
 name = "axum-core"
-version = "0.4.5"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
 dependencies = [
- "async-trait",
  "bytes",
- "futures-util",
- "http 1.3.1",
+ "futures-core",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "mime",
  "pin-project-lite",
- "rustversion",
  "sync_wrapper",
  "tower-layer",
  "tower-service",
 ]
 
-[[package]]
-name = "backtrace"
-version = "0.3.75"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002"
-dependencies = [
- "addr2line",
- "cfg-if",
- "libc",
- "miniz_oxide",
- "object",
- "rustc-demangle",
- "windows-targets 0.52.6",
-]
-
 [[package]]
 name = "base64"
 version = "0.21.7"
@@ -1042,9 +1046,9 @@ dependencies = [
 
 [[package]]
 name = "bigdecimal"
-version = "0.4.8"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013"
+checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695"
 dependencies = [
  "autocfg",
  "libm",
@@ -1054,52 +1058,11 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "bindgen"
-version = "0.69.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
-dependencies = [
- "bitflags 2.9.1",
- "cexpr",
- "clang-sys",
- "itertools 0.12.1",
- "lazy_static",
- "lazycell",
- "log",
- "prettyplease",
- "proc-macro2",
- "quote",
- "regex",
- "rustc-hash 1.1.0",
- "shlex",
- "syn 2.0.101",
- "which",
-]
-
-[[package]]
-name = "bitflags"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
-
 [[package]]
 name = "bitflags"
-version = "2.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
-
-[[package]]
-name = "bitvec"
-version = "1.0.1"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
-dependencies = [
- "funty",
- "radium",
- "tap",
- "wyz",
-]
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
 
 [[package]]
 name = "blake2"
@@ -1112,15 +1075,16 @@ dependencies = [
 
 [[package]]
 name = "blake3"
-version = "1.8.2"
+version = "1.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0"
+checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
 dependencies = [
  "arrayref",
  "arrayvec",
  "cc",
  "cfg-if",
  "constant_time_eq",
+ "cpufeatures",
 ]
 
 [[package]]
@@ -1134,18 +1098,21 @@ dependencies = [
 
 [[package]]
 name = "bollard"
-version = "0.18.1"
+version = "0.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97ccca1260af6a459d75994ad5acc1651bcabcbdbc41467cc9786519ab854c30"
+checksum = "ee04c4c84f1f811b017f2fbb7dd8815c976e7ca98593de9c1e2afad0f636bff4"
 dependencies = [
+ "async-stream",
  "base64 0.22.1",
+ "bitflags",
+ "bollard-buildkit-proto",
  "bollard-stubs",
  "bytes",
  "futures-core",
  "futures-util",
  "hex",
  "home",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body-util",
  "hyper",
  "hyper-named-pipe",
@@ -1153,63 +1120,86 @@ dependencies = [
  "hyper-util",
  "hyperlocal",
  "log",
+ "num",
  "pin-project-lite",
+ "rand 0.9.2",
  "rustls",
  "rustls-native-certs",
- "rustls-pemfile",
  "rustls-pki-types",
  "serde",
  "serde_derive",
  "serde_json",
- "serde_repr",
  "serde_urlencoded",
- "thiserror 2.0.12",
+ "thiserror",
+ "time",
  "tokio",
+ "tokio-stream",
  "tokio-util",
+ "tonic",
  "tower-service",
  "url",
  "winapi",
 ]
 
+[[package]]
+name = "bollard-buildkit-proto"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad"
+dependencies = [
+ "prost",
+ "prost-types",
+ "tonic",
+ "tonic-prost",
+ "ureq",
+]
+
 [[package]]
 name = "bollard-stubs"
-version = "1.47.1-rc.27.3.1"
+version = "1.52.1-rc.29.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f179cfbddb6e77a5472703d4b30436bff32929c0aa8a9008ecf23d1d3cdd0da"
+checksum = "0f0a8ca8799131c1837d1282c3f81f31e76ceb0ce426e04a7fe1ccee3287c066"
 dependencies = [
+ "base64 0.22.1",
+ "bollard-buildkit-proto",
+ "bytes",
+ "prost",
  "serde",
+ "serde_json",
  "serde_repr",
- "serde_with",
+ "time",
 ]
 
 [[package]]
-name = "borsh"
-version = "1.5.7"
+name = "bon"
+version = "3.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad8646f98db542e39fc66e68a20b2144f6a732636df7c2354e74645faaa433ce"
+checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe"
 dependencies = [
- "borsh-derive",
- "cfg_aliases",
+ "bon-macros",
+ "rustversion",
 ]
 
 [[package]]
-name = "borsh-derive"
-version = "1.5.7"
+name = "bon-macros"
+version = "3.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd1d3c0c2f5833f22386f252fe8ed005c7f59fdcddeef025c01b4c3b9fd9ac3"
+checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c"
 dependencies = [
- "once_cell",
- "proc-macro-crate",
+ "darling",
+ "ident_case",
+ "prettyplease",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "rustversion",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "brotli"
-version = "8.0.1"
+version = "8.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d"
+checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
 dependencies = [
  "alloc-no-stdlib",
  "alloc-stdlib",
@@ -1228,42 +1218,19 @@ dependencies = [
 
 [[package]]
 name = "bstr"
-version = "1.12.0"
+version = "1.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
+checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
 dependencies = [
  "memchr",
- "regex-automata",
  "serde",
 ]
 
 [[package]]
 name = "bumpalo"
-version = "3.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
-
-[[package]]
-name = "bytecheck"
-version = "0.6.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2"
-dependencies = [
- "bytecheck_derive",
- "ptr_meta",
- "simdutf8",
-]
-
-[[package]]
-name = "bytecheck_derive"
-version = "0.6.12"
+version = "3.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
 
 [[package]]
 name = "byteorder"
@@ -1273,9 +1240,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.10.1"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
 [[package]]
 name = "bytes-utils"
@@ -1289,31 +1256,11 @@ dependencies = [
 
 [[package]]
 name = "bzip2"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
-dependencies = [
- "bzip2-sys",
- "libc",
-]
-
-[[package]]
-name = "bzip2"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
-dependencies = [
- "bzip2-sys",
-]
-
-[[package]]
-name = "bzip2-sys"
-version = "0.1.13+1.0.8"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
+checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c"
 dependencies = [
- "cc",
- "pkg-config",
+ "libbz2-rs-sys",
 ]
 
 [[package]]
@@ -1324,29 +1271,21 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
 [[package]]
 name = "cc"
-version = "1.2.23"
+version = "1.2.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766"
+checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423"
 dependencies = [
+ "find-msvc-tools",
  "jobserver",
  "libc",
  "shlex",
 ]
 
-[[package]]
-name = "cexpr"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
-dependencies = [
- "nom",
-]
-
 [[package]]
 name = "cfg-if"
-version = "1.0.0"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
 [[package]]
 name = "cfg_aliases"
@@ -1356,11 +1295,10 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
 [[package]]
 name = "chrono"
-version = "0.4.41"
+version = "0.4.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
- "android-tzdata",
  "iana-time-zone",
  "js-sys",
  "num-traits",
@@ -1371,23 +1309,12 @@ dependencies = [
 
 [[package]]
 name = "chrono-tz"
-version = "0.10.3"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3"
+checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3"
 dependencies = [
  "chrono",
- "chrono-tz-build",
- "phf",
-]
-
-[[package]]
-name = "chrono-tz-build"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402"
-dependencies = [
- "parse-zoneinfo",
- "phf_codegen",
+ "phf 0.12.1",
 ]
 
 [[package]]
@@ -1417,33 +1344,11 @@ dependencies = [
  "half",
 ]
 
-[[package]]
-name = "clang-sys"
-version = "1.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
-dependencies = [
- "glob",
- "libc",
- "libloading 0.8.7",
-]
-
-[[package]]
-name = "clap"
-version = "2.34.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
-dependencies = [
- "bitflags 1.3.2",
- "textwrap",
- "unicode-width 0.1.14",
-]
-
 [[package]]
 name = "clap"
-version = "4.5.39"
+version = "4.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f"
+checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -1451,11 +1356,11 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.5.39"
+version = "4.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
 dependencies = [
- "anstream",
+ "anstream 1.0.0",
  "anstyle",
  "clap_lex",
  "strsim",
@@ -1463,56 +1368,77 @@ dependencies = [
 
 [[package]]
 name = "clap_derive"
-version = "4.5.32"
+version = "4.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7"
+checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "clap_lex"
-version = "0.7.4"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
 
 [[package]]
 name = "clipboard-win"
-version = "5.4.0"
+version = "5.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15efe7a882b08f34e38556b14f2fb3daa98769d06c7f0c1b076dfd0d983bc892"
+checksum = "bde03770d3df201d4fb868f2c9c59e66a3e4e2bd06692a0fe701e7103c7e84d4"
 dependencies = [
  "error-code",
 ]
 
 [[package]]
 name = "cmake"
-version = "0.1.54"
+version = "0.1.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
+checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
 dependencies = [
  "cc",
 ]
 
 [[package]]
 name = "colorchoice"
-version = "1.0.3"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
 
 [[package]]
 name = "comfy-table"
-version = "7.1.4"
+version = "7.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a"
+checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47"
 dependencies = [
  "unicode-segmentation",
- "unicode-width 0.2.0",
+ "unicode-width 0.2.2",
+]
+
+[[package]]
+name = "compression-codecs"
+version = "0.4.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7"
+dependencies = [
+ "bzip2",
+ "compression-core",
+ "flate2",
+ "liblzma",
+ "memchr",
+ "zstd",
+ "zstd-safe",
 ]
 
+[[package]]
+name = "compression-core"
+version = "0.4.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
+
 [[package]]
 name = "console"
 version = "0.15.11"
@@ -1522,10 +1448,21 @@ dependencies = [
  "encode_unicode",
  "libc",
  "once_cell",
- "unicode-width 0.2.0",
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "console"
+version = "0.16.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "unicode-width 0.2.2",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "console_error_panic_hook"
 version = "0.1.7"
@@ -1551,28 +1488,31 @@ version = "0.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
 dependencies = [
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "once_cell",
  "tiny-keccak",
 ]
 
 [[package]]
 name = "const_panic"
-version = "0.2.12"
+version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2459fc9262a1aa204eb4b5764ad4f189caec88aea9634389c0a25f8be7f6265e"
+checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652"
+dependencies = [
+ "typewit",
+]
 
 [[package]]
 name = "constant_time_eq"
-version = "0.3.1"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
+checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
 
 [[package]]
 name = "core-foundation"
-version = "0.10.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -1584,29 +1524,20 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
-[[package]]
-name = "core2"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "core_extensions"
-version = "1.5.3"
+version = "1.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee"
+checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003"
 dependencies = [
  "core_extensions_proc_macros",
 ]
 
 [[package]]
 name = "core_extensions_proc_macros"
-version = "1.5.3"
+version = "1.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6"
+checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea"
 
 [[package]]
 name = "cpufeatures"
@@ -1619,35 +1550,34 @@ dependencies = [
 
 [[package]]
 name = "crc32fast"
-version = "1.4.2"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
 dependencies = [
  "cfg-if",
 ]
 
 [[package]]
 name = "criterion"
-version = "0.5.1"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3"
 dependencies = [
+ "alloca",
  "anes",
  "cast",
  "ciborium",
- "clap 4.5.39",
+ "clap",
  "criterion-plot",
  "futures",
- "is-terminal",
- "itertools 0.10.5",
+ "itertools 0.13.0",
  "num-traits",
- "once_cell",
  "oorandom",
+ "page_size",
  "plotters",
  "rayon",
  "regex",
  "serde",
- "serde_derive",
  "serde_json",
  "tinytemplate",
  "tokio",
@@ -1656,12 +1586,12 @@ dependencies = [
 
 [[package]]
 name = "criterion-plot"
-version = "0.5.0"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea"
 dependencies = [
  "cast",
- "itertools 0.10.5",
+ "itertools 0.13.0",
 ]
 
 [[package]]
@@ -1692,6 +1622,15 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -1700,15 +1639,15 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
 [[package]]
 name = "crunchy"
-version = "0.2.3"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
 
 [[package]]
 name = "crypto-common"
-version = "0.1.6"
+version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
 dependencies = [
  "generic-array",
  "typenum",
@@ -1716,30 +1655,30 @@ dependencies = [
 
 [[package]]
 name = "csv"
-version = "1.3.1"
+version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
+checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
 dependencies = [
  "csv-core",
  "itoa",
  "ryu",
- "serde",
+ "serde_core",
 ]
 
 [[package]]
 name = "csv-core"
-version = "0.1.12"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d"
+checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
 name = "ctor"
-version = "0.4.2"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4735f265ba6a1188052ca32d461028a7d1125868be18e287e756019da7607b5"
+checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
 dependencies = [
  "ctor-proc-macro",
  "dtor",
@@ -1747,15 +1686,21 @@ dependencies = [
 
 [[package]]
 name = "ctor-proc-macro"
-version = "0.0.5"
+version = "0.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
+
+[[package]]
+name = "cty"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f211af61d8efdd104f96e57adf5e426ba1bc3ed7a4ead616e15e5881fd79c4d"
+checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35"
 
 [[package]]
 name = "darling"
-version = "0.20.11"
+version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee"
+checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d"
 dependencies = [
  "darling_core",
  "darling_macro",
@@ -1763,35 +1708,28 @@ dependencies = [
 
 [[package]]
 name = "darling_core"
-version = "0.20.11"
+version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e"
+checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0"
 dependencies = [
- "fnv",
  "ident_case",
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "darling_macro"
-version = "0.20.11"
+version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
+checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
-[[package]]
-name = "dary_heap"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
-
 [[package]]
 name = "dashmap"
 version = "6.1.0"
@@ -1808,14 +1746,13 @@ dependencies = [
 
 [[package]]
 name = "datafusion"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
- "arrow-ipc",
  "arrow-schema",
  "async-trait",
  "bytes",
- "bzip2 0.5.2",
+ "bzip2",
  "chrono",
  "criterion",
  "ctor",
@@ -1825,6 +1762,7 @@ dependencies = [
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
+ "datafusion-datasource-arrow",
  "datafusion-datasource-avro",
  "datafusion-datasource-csv",
  "datafusion-datasource-json",
@@ -1842,6 +1780,7 @@ dependencies = [
  "datafusion-macros",
  "datafusion-optimizer",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-optimizer",
  "datafusion-physical-plan",
@@ -1851,16 +1790,19 @@ dependencies = [
  "env_logger",
  "flate2",
  "futures",
+ "glob",
  "insta",
  "itertools 0.14.0",
+ "liblzma",
  "log",
- "nix",
+ "nix 0.31.2",
  "object_store",
  "parking_lot",
  "parquet",
- "paste",
- "rand 0.9.1",
+ "pretty_assertions",
+ "rand 0.9.2",
  "rand_distr",
+ "recursive",
  "regex",
  "rstest",
  "serde",
@@ -1872,37 +1814,39 @@ dependencies = [
  "tokio",
  "url",
  "uuid",
- "xz2",
  "zstd",
 ]
 
 [[package]]
 name = "datafusion-benchmarks"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "async-trait",
+ "bytes",
+ "clap",
  "datafusion",
  "datafusion-common",
  "datafusion-proto",
  "env_logger",
  "futures",
+ "libmimalloc-sys",
  "log",
  "mimalloc",
  "object_store",
  "parquet",
- "rand 0.9.1",
+ "rand 0.9.2",
+ "regex",
  "serde",
  "serde_json",
  "snmalloc-rs",
- "structopt",
- "test-utils",
  "tokio",
  "tokio-util",
 ]
 
 [[package]]
 name = "datafusion-catalog"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1915,7 +1859,6 @@ dependencies = [
  "datafusion-physical-expr",
  "datafusion-physical-plan",
  "datafusion-session",
- "datafusion-sql",
  "futures",
  "itertools 0.14.0",
  "log",
@@ -1926,75 +1869,78 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog-listing"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
  "datafusion-catalog",
  "datafusion-common",
  "datafusion-datasource",
+ "datafusion-datasource-parquet",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
- "datafusion-session",
  "futures",
+ "itertools 0.14.0",
  "log",
  "object_store",
- "tokio",
 ]
 
 [[package]]
 name = "datafusion-cli"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
- "assert_cmd",
  "async-trait",
  "aws-config",
  "aws-credential-types",
- "clap 4.5.39",
+ "chrono",
+ "clap",
  "ctor",
  "datafusion",
+ "datafusion-common",
  "dirs",
  "env_logger",
  "futures",
  "insta",
  "insta-cmd",
+ "log",
  "mimalloc",
  "object_store",
  "parking_lot",
  "parquet",
- "predicates",
  "regex",
  "rstest",
  "rustyline",
+ "testcontainers-modules",
  "tokio",
  "url",
 ]
 
 [[package]]
 name = "datafusion-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "apache-avro",
  "arrow",
  "arrow-ipc",
- "base64 0.22.1",
  "chrono",
+ "criterion",
+ "foldhash 0.2.0",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "hex",
+ "indexmap 2.13.0",
  "insta",
+ "itertools 0.14.0",
  "libc",
  "log",
  "object_store",
  "parquet",
- "paste",
- "pyo3",
- "rand 0.9.1",
+ "rand 0.9.2",
  "recursive",
  "sqlparser",
  "tokio",
@@ -2003,7 +1949,7 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common-runtime"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "futures",
  "log",
@@ -2012,78 +1958,95 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-compression",
  "async-trait",
  "bytes",
- "bzip2 0.5.2",
+ "bzip2",
  "chrono",
  "criterion",
+ "crossbeam-queue",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
  "flate2",
  "futures",
  "glob",
+ "insta",
  "itertools 0.14.0",
+ "liblzma",
  "log",
  "object_store",
- "parquet",
- "rand 0.9.1",
+ "rand 0.9.2",
  "tempfile",
  "tokio",
  "tokio-util",
  "url",
- "xz2",
  "zstd",
 ]
 
 [[package]]
-name = "datafusion-datasource-avro"
-version = "47.0.0"
+name = "datafusion-datasource-arrow"
+version = "52.3.0"
 dependencies = [
- "apache-avro",
  "arrow",
+ "arrow-ipc",
  "async-trait",
  "bytes",
  "chrono",
- "datafusion-catalog",
  "datafusion-common",
+ "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
- "datafusion-physical-expr",
+ "datafusion-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
+ "datafusion-session",
+ "futures",
+ "itertools 0.14.0",
+ "object_store",
+ "tokio",
+]
+
+[[package]]
+name = "datafusion-datasource-avro"
+version = "52.3.0"
+dependencies = [
+ "apache-avro",
+ "arrow",
+ "async-trait",
+ "bytes",
+ "datafusion-common",
+ "datafusion-datasource",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
  "futures",
  "num-traits",
  "object_store",
- "rstest",
  "serde_json",
- "tokio",
 ]
 
 [[package]]
 name = "datafusion-datasource-csv"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
  "bytes",
- "datafusion-catalog",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
- "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
@@ -2095,18 +2058,16 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-json"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
  "bytes",
- "datafusion-catalog",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
- "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
  "datafusion-session",
@@ -2114,27 +2075,31 @@ dependencies = [
  "object_store",
  "serde_json",
  "tokio",
+ "tokio-stream",
 ]
 
 [[package]]
 name = "datafusion-datasource-parquet"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
  "bytes",
  "chrono",
- "datafusion-catalog",
+ "criterion",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
- "datafusion-functions-aggregate",
+ "datafusion-functions",
+ "datafusion-functions-aggregate-common",
+ "datafusion-functions-nested",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-physical-expr-common",
- "datafusion-physical-optimizer",
  "datafusion-physical-plan",
+ "datafusion-pruning",
  "datafusion-session",
  "futures",
  "itertools 0.14.0",
@@ -2142,34 +2107,45 @@ dependencies = [
  "object_store",
  "parking_lot",
  "parquet",
- "rand 0.9.1",
+ "tempfile",
  "tokio",
 ]
 
 [[package]]
 name = "datafusion-doc"
-version = "47.0.0"
+version = "52.3.0"
 
 [[package]]
 name = "datafusion-examples"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "arrow-flight",
  "arrow-schema",
  "async-trait",
+ "base64 0.22.1",
  "bytes",
  "dashmap",
  "datafusion",
- "datafusion-ffi",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-physical-expr-adapter",
  "datafusion-proto",
+ "datafusion-sql",
  "env_logger",
  "futures",
+ "insta",
  "log",
  "mimalloc",
- "nix",
+ "nix 0.31.2",
+ "nom",
  "object_store",
  "prost",
+ "rand 0.9.2",
+ "serde",
+ "serde_json",
+ "strum 0.28.0",
+ "strum_macros 0.28.0",
  "tempfile",
  "test-utils",
  "tokio",
@@ -2182,28 +2158,33 @@ dependencies = [
 
 [[package]]
 name = "datafusion-execution"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "arrow-buffer",
+ "async-trait",
  "chrono",
  "dashmap",
  "datafusion-common",
  "datafusion-expr",
+ "datafusion-physical-expr-common",
  "futures",
  "insta",
  "log",
  "object_store",
  "parking_lot",
- "rand 0.9.1",
+ "parquet",
+ "rand 0.9.2",
  "tempfile",
  "url",
 ]
 
 [[package]]
 name = "datafusion-expr"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "async-trait",
  "chrono",
  "ctor",
  "datafusion-common",
@@ -2213,9 +2194,9 @@ dependencies = [
  "datafusion-functions-window-common",
  "datafusion-physical-expr-common",
  "env_logger",
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
  "insta",
- "paste",
+ "itertools 0.14.0",
  "recursive",
  "serde_json",
  "sqlparser",
@@ -2223,18 +2204,18 @@ dependencies = [
 
 [[package]]
 name = "datafusion-expr-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "datafusion-common",
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
+ "insta",
  "itertools 0.14.0",
- "paste",
 ]
 
 [[package]]
 name = "datafusion-ffi"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "abi_stable",
  "arrow",
@@ -2242,7 +2223,22 @@ dependencies = [
  "async-ffi",
  "async-trait",
  "datafusion",
+ "datafusion-catalog",
+ "datafusion-common",
+ "datafusion-datasource",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-functions",
+ "datafusion-functions-aggregate",
+ "datafusion-functions-aggregate-common",
+ "datafusion-functions-table",
+ "datafusion-functions-window",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
  "datafusion-proto",
+ "datafusion-proto-common",
+ "datafusion-session",
  "doc-comment",
  "futures",
  "log",
@@ -2253,7 +2249,7 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -2261,18 +2257,23 @@ dependencies = [
  "blake2",
  "blake3",
  "chrono",
+ "chrono-tz",
  "criterion",
+ "ctor",
  "datafusion-common",
  "datafusion-doc",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
  "datafusion-macros",
+ "env_logger",
  "hex",
  "itertools 0.14.0",
  "log",
  "md-5",
- "rand 0.9.1",
+ "memchr",
+ "num-traits",
+ "rand 0.9.2",
  "regex",
  "sha2",
  "tokio",
@@ -2282,9 +2283,8 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "arrow",
  "criterion",
  "datafusion-common",
@@ -2295,28 +2295,28 @@ dependencies = [
  "datafusion-macros",
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
+ "foldhash 0.2.0",
  "half",
  "log",
- "paste",
- "rand 0.9.1",
+ "num-traits",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "datafusion-functions-aggregate-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "arrow",
  "criterion",
  "datafusion-common",
  "datafusion-expr-common",
  "datafusion-physical-expr-common",
- "rand 0.9.1",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "datafusion-functions-nested"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "arrow-ord",
@@ -2325,19 +2325,22 @@ dependencies = [
  "datafusion-doc",
  "datafusion-execution",
  "datafusion-expr",
+ "datafusion-expr-common",
  "datafusion-functions",
  "datafusion-functions-aggregate",
+ "datafusion-functions-aggregate-common",
  "datafusion-macros",
  "datafusion-physical-expr-common",
+ "hashbrown 0.16.1",
  "itertools 0.14.0",
+ "itoa",
  "log",
- "paste",
- "rand 0.9.1",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "datafusion-functions-table"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -2346,14 +2349,14 @@ dependencies = [
  "datafusion-expr",
  "datafusion-physical-plan",
  "parking_lot",
- "paste",
 ]
 
 [[package]]
 name = "datafusion-functions-window"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "criterion",
  "datafusion-common",
  "datafusion-doc",
  "datafusion-expr",
@@ -2362,12 +2365,11 @@ dependencies = [
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "log",
- "paste",
 ]
 
 [[package]]
 name = "datafusion-functions-window-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -2375,16 +2377,16 @@ dependencies = [
 
 [[package]]
 name = "datafusion-macros"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "datafusion-expr",
+ "datafusion-doc",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "datafusion-optimizer"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -2393,13 +2395,14 @@ dependencies = [
  "ctor",
  "datafusion-common",
  "datafusion-expr",
+ "datafusion-expr-common",
  "datafusion-functions-aggregate",
  "datafusion-functions-window",
  "datafusion-functions-window-common",
  "datafusion-physical-expr",
  "datafusion-sql",
  "env_logger",
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
  "insta",
  "itertools 0.14.0",
  "log",
@@ -2410,9 +2413,8 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "arrow",
  "criterion",
  "datafusion-common",
@@ -2422,101 +2424,137 @@ dependencies = [
  "datafusion-functions-aggregate-common",
  "datafusion-physical-expr-common",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "indexmap 2.13.0",
  "insta",
  "itertools 0.14.0",
- "log",
- "paste",
- "petgraph 0.8.1",
- "rand 0.9.1",
+ "parking_lot",
+ "petgraph",
+ "rand 0.9.2",
+ "recursive",
  "rstest",
+ "tokio",
+]
+
+[[package]]
+name = "datafusion-physical-expr-adapter"
+version = "52.3.0"
+dependencies = [
+ "arrow",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-functions",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "itertools 0.14.0",
 ]
 
 [[package]]
 name = "datafusion-physical-expr-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "arrow",
+ "chrono",
+ "criterion",
  "datafusion-common",
  "datafusion-expr-common",
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
+ "indexmap 2.13.0",
  "itertools 0.14.0",
+ "parking_lot",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "datafusion-physical-optimizer"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "datafusion-common",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
- "datafusion-functions-nested",
+ "datafusion-functions",
+ "datafusion-functions-window",
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
+ "datafusion-pruning",
  "insta",
  "itertools 0.14.0",
- "log",
  "recursive",
+ "tokio",
 ]
 
 [[package]]
 name = "datafusion-physical-plan"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
- "ahash 0.8.12",
  "arrow",
  "arrow-ord",
  "arrow-schema",
  "async-trait",
- "chrono",
  "criterion",
  "datafusion-common",
  "datafusion-common-runtime",
  "datafusion-execution",
  "datafusion-expr",
+ "datafusion-functions",
  "datafusion-functions-aggregate",
+ "datafusion-functions-aggregate-common",
  "datafusion-functions-window",
  "datafusion-functions-window-common",
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "futures",
  "half",
- "hashbrown 0.14.5",
- "indexmap 2.9.0",
+ "hashbrown 0.16.1",
+ "indexmap 2.13.0",
  "insta",
  "itertools 0.14.0",
  "log",
+ "num-traits",
  "parking_lot",
  "pin-project-lite",
- "rand 0.9.1",
+ "rand 0.9.2",
  "rstest",
  "rstest_reuse",
- "tempfile",
  "tokio",
 ]
 
 [[package]]
 name = "datafusion-proto"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "async-trait",
  "chrono",
  "datafusion",
+ "datafusion-catalog",
+ "datafusion-catalog-listing",
  "datafusion-common",
+ "datafusion-datasource",
+ "datafusion-datasource-arrow",
+ "datafusion-datasource-avro",
+ "datafusion-datasource-csv",
+ "datafusion-datasource-json",
+ "datafusion-datasource-parquet",
+ "datafusion-execution",
  "datafusion-expr",
  "datafusion-functions",
  "datafusion-functions-aggregate",
+ "datafusion-functions-table",
  "datafusion-functions-window-common",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
  "datafusion-proto-common",
  "doc-comment",
  "object_store",
- "pbjson",
+ "pbjson 0.9.0",
+ "pretty_assertions",
  "prost",
+ "rand 0.9.2",
  "serde",
  "serde_json",
  "tokio",
@@ -2524,59 +2562,79 @@ dependencies = [
 
 [[package]]
 name = "datafusion-proto-common"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "datafusion-common",
  "doc-comment",
- "pbjson",
+ "pbjson 0.9.0",
  "prost",
  "serde",
- "serde_json",
 ]
 
 [[package]]
-name = "datafusion-session"
-version = "47.0.0"
+name = "datafusion-pruning"
+version = "52.3.0"
 dependencies = [
  "arrow",
- "async-trait",
- "dashmap",
  "datafusion-common",
- "datafusion-common-runtime",
- "datafusion-execution",
+ "datafusion-datasource",
  "datafusion-expr",
+ "datafusion-expr-common",
+ "datafusion-functions-nested",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
  "datafusion-physical-plan",
- "datafusion-sql",
- "futures",
+ "insta",
  "itertools 0.14.0",
  "log",
- "object_store",
+]
+
+[[package]]
+name = "datafusion-session"
+version = "52.3.0"
+dependencies = [
+ "async-trait",
+ "datafusion-common",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-physical-plan",
  "parking_lot",
- "tokio",
 ]
 
 [[package]]
 name = "datafusion-spark"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
+ "bigdecimal",
+ "chrono",
+ "crc32fast",
+ "criterion",
+ "datafusion",
  "datafusion-catalog",
  "datafusion-common",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-functions",
- "datafusion-macros",
+ "datafusion-functions-aggregate",
+ "datafusion-functions-nested",
  "log",
+ "percent-encoding",
+ "rand 0.9.2",
+ "serde_json",
+ "sha1",
+ "sha2",
+ "url",
 ]
 
 [[package]]
 name = "datafusion-sql"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "bigdecimal",
+ "chrono",
  "ctor",
  "datafusion-common",
  "datafusion-expr",
@@ -2585,10 +2643,10 @@ dependencies = [
  "datafusion-functions-nested",
  "datafusion-functions-window",
  "env_logger",
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
  "insta",
+ "itertools 0.14.0",
  "log",
- "paste",
  "recursive",
  "regex",
  "rstest",
@@ -2597,16 +2655,17 @@ dependencies = [
 
 [[package]]
 name = "datafusion-sqllogictest"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "arrow",
  "async-trait",
  "bigdecimal",
  "bytes",
  "chrono",
- "clap 4.5.39",
+ "clap",
  "datafusion",
  "datafusion-spark",
+ "datafusion-substrait",
  "env_logger",
  "futures",
  "half",
@@ -2614,28 +2673,27 @@ dependencies = [
  "itertools 0.14.0",
  "log",
  "object_store",
- "postgres-protocol",
  "postgres-types",
- "rust_decimal",
+ "regex",
  "sqllogictest",
  "sqlparser",
  "tempfile",
- "testcontainers",
  "testcontainers-modules",
- "thiserror 2.0.12",
+ "thiserror",
  "tokio",
  "tokio-postgres",
 ]
 
 [[package]]
 name = "datafusion-substrait"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
  "async-recursion",
  "async-trait",
  "chrono",
  "datafusion",
  "datafusion-functions-aggregate",
+ "half",
  "insta",
  "itertools 0.14.0",
  "object_store",
@@ -2649,8 +2707,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-wasmtest"
-version = "47.0.0"
+version = "52.3.0"
 dependencies = [
+ "bytes",
  "chrono",
  "console_error_panic_hook",
  "datafusion",
@@ -2660,8 +2719,8 @@ dependencies = [
  "datafusion-optimizer",
  "datafusion-physical-plan",
  "datafusion-sql",
- "getrandom 0.3.3",
- "insta",
+ "futures",
+ "getrandom 0.3.4",
  "object_store",
  "tokio",
  "url",
@@ -2671,19 +2730,19 @@ dependencies = [
 
 [[package]]
 name = "deranged"
-version = "0.4.0"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
 dependencies = [
  "powerfmt",
- "serde",
+ "serde_core",
 ]
 
 [[package]]
-name = "difflib"
-version = "0.4.0"
+name = "diff"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
+checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
 
 [[package]]
 name = "digest"
@@ -2714,7 +2773,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2725,14 +2784,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "doc-comment"
-version = "0.3.3"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
+checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9"
 
 [[package]]
 name = "docker_credential"
@@ -2747,18 +2806,18 @@ dependencies = [
 
 [[package]]
 name = "dtor"
-version = "0.0.6"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97cbdf2ad6846025e8e25df05171abfb30e3ababa12ee0a0e44b9bbe570633a8"
+checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
 dependencies = [
  "dtor-proc-macro",
 ]
 
 [[package]]
 name = "dtor-proc-macro"
-version = "0.0.5"
+version = "0.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7454e41ff9012c00d53cf7f475c5e3afa3b91b7c90568495495e8d9bf47a1055"
+checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
 
 [[package]]
 name = "dunce"
@@ -2768,9 +2827,9 @@ checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
 
 [[package]]
 name = "dyn-clone"
-version = "1.0.19"
+version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005"
+checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
 
 [[package]]
 name = "educe"
@@ -2781,7 +2840,7 @@ dependencies = [
  "enum-ordinalize",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2804,29 +2863,29 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
 
 [[package]]
 name = "enum-ordinalize"
-version = "4.3.0"
+version = "4.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea0dcfa4e54eeb516fe454635a95753ddd39acda650ce703031c6973e315dd5"
+checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0"
 dependencies = [
  "enum-ordinalize-derive",
 ]
 
 [[package]]
 name = "enum-ordinalize-derive"
-version = "4.3.1"
+version = "4.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff"
+checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "env_filter"
-version = "0.1.3"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
+checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f"
 dependencies = [
  "log",
  "regex",
@@ -2834,11 +2893,11 @@ dependencies = [
 
 [[package]]
 name = "env_logger"
-version = "0.11.8"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
+checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d"
 dependencies = [
- "anstream",
+ "anstream 0.6.21",
  "anstyle",
  "env_filter",
  "jiff",
@@ -2853,12 +2912,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
 
 [[package]]
 name = "errno"
-version = "0.3.12"
+version = "0.3.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2875,13 +2934,12 @@ checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6"
 
 [[package]]
 name = "etcetera"
-version = "0.10.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26c7b13d0780cb82722fd59f6f57f925e143427e4a75313a6c77243bf5326ae6"
+checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96"
 dependencies = [
  "cfg-if",
- "home",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2903,10 +2961,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78"
 dependencies = [
  "cfg-if",
- "rustix 1.0.7",
+ "rustix",
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "ferroid"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb330bbd4cb7a5b9f559427f06f98a4f853a137c8298f3bd3f8ca57663e21986"
+dependencies = [
+ "portable-atomic",
+ "rand 0.9.2",
+ "web-time",
+]
+
 [[package]]
 name = "ffi_example_table_provider"
 version = "0.1.0"
@@ -2939,16 +3008,21 @@ dependencies = [
 
 [[package]]
 name = "filetime"
-version = "0.2.25"
+version = "0.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
 dependencies = [
  "cfg-if",
  "libc",
  "libredox",
- "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
 [[package]]
 name = "fixedbitset"
 version = "0.5.7"
@@ -2957,32 +3031,23 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
 
 [[package]]
 name = "flatbuffers"
-version = "25.2.10"
+version = "25.12.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
+checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "rustc_version",
 ]
 
 [[package]]
 name = "flate2"
-version = "1.1.1"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
- "libz-rs-sys",
  "miniz_oxide",
-]
-
-[[package]]
-name = "float-cmp"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8"
-dependencies = [
- "num-traits",
+ "zlib-rs",
 ]
 
 [[package]]
@@ -2997,20 +3062,26 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "form_urlencoded"
-version = "1.2.1"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
 dependencies = [
  "percent-encoding",
 ]
 
 [[package]]
 name = "fs-err"
-version = "3.1.0"
+version = "3.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f89bda4c2a21204059a977ed3bfe746677dfd137b83c339e702b0ac91d482aa"
+checksum = "73fde052dbfc920003cfd2c8e2c6e6d4cc7c1091538c3a24226cec0665ab08c0"
 dependencies = [
  "autocfg",
 ]
@@ -3021,17 +3092,11 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
 
-[[package]]
-name = "funty"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
-
 [[package]]
 name = "futures"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -3044,9 +3109,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -3054,15 +3119,15 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
 
 [[package]]
 name = "futures-executor"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -3071,32 +3136,32 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
 
 [[package]]
 name = "futures-macro"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "futures-sink"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
 
 [[package]]
 name = "futures-task"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
 
 [[package]]
 name = "futures-timer"
@@ -3106,9 +3171,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
 
 [[package]]
 name = "futures-util"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -3118,7 +3183,6 @@ dependencies = [
  "futures-task",
  "memchr",
  "pin-project-lite",
- "pin-utils",
  "slab",
 ]
 
@@ -3126,7 +3190,7 @@ dependencies = [
 name = "gen"
 version = "0.1.0"
 dependencies = [
- "pbjson-build",
+ "pbjson-build 0.9.0",
  "prost-build",
 ]
 
@@ -3134,7 +3198,7 @@ dependencies = [
 name = "gen-common"
 version = "0.1.0"
 dependencies = [
- "pbjson-build",
+ "pbjson-build 0.9.0",
  "prost-build",
 ]
 
@@ -3159,48 +3223,55 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.2.16"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
 dependencies = [
  "cfg-if",
  "js-sys",
  "libc",
- "wasi 0.11.0+wasi-snapshot-preview1",
+ "wasi 0.11.1+wasi-snapshot-preview1",
  "wasm-bindgen",
 ]
 
 [[package]]
 name = "getrandom"
-version = "0.3.3"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
  "js-sys",
  "libc",
- "r-efi",
- "wasi 0.14.2+wasi-0.2.4",
+ "r-efi 5.3.0",
+ "wasip2",
  "wasm-bindgen",
 ]
 
 [[package]]
-name = "gimli"
-version = "0.31.1"
+name = "getrandom"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "wasip2",
+ "wasip3",
+]
 
 [[package]]
 name = "glob"
-version = "0.3.2"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
 
 [[package]]
 name = "globset"
-version = "0.4.16"
+version = "0.4.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5"
+checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3"
 dependencies = [
  "aho-corasick",
  "bstr",
@@ -3211,17 +3282,17 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.4.10"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9421a676d1b147b16b82c9225157dc629087ef8ec4d5e2960f9437a90dac0a5"
+checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54"
 dependencies = [
  "atomic-waker",
  "bytes",
  "fnv",
  "futures-core",
  "futures-sink",
- "http 1.3.1",
- "indexmap 2.9.0",
+ "http 1.4.0",
+ "indexmap 2.13.0",
  "slab",
  "tokio",
  "tokio-util",
@@ -3230,13 +3301,16 @@ dependencies = [
 
 [[package]]
 name = "half"
-version = "2.6.0"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
 dependencies = [
  "cfg-if",
  "crunchy",
  "num-traits",
+ "rand 0.9.2",
+ "rand_distr",
+ "zerocopy",
 ]
 
 [[package]]
@@ -3244,38 +3318,31 @@ name = "hashbrown"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
-dependencies = [
- "ahash 0.7.8",
-]
 
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
-dependencies = [
- "ahash 0.8.12",
- "allocator-api2",
-]
 
 [[package]]
 name = "hashbrown"
-version = "0.15.3"
+version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash",
+ "foldhash 0.1.5",
 ]
 
 [[package]]
-name = "heck"
-version = "0.3.3"
+name = "hashbrown"
+version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
 dependencies = [
- "unicode-segmentation",
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
 ]
 
 [[package]]
@@ -3284,12 +3351,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
-[[package]]
-name = "hermit-abi"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08"
-
 [[package]]
 name = "hex"
 version = "0.4.3"
@@ -3307,11 +3368,11 @@ dependencies = [
 
 [[package]]
 name = "home"
-version = "0.5.11"
+version = "0.5.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf"
+checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -3327,12 +3388,11 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "1.3.1"
+version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
 dependencies = [
  "bytes",
- "fnv",
  "itoa",
 ]
 
@@ -3354,7 +3414,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
 dependencies = [
  "bytes",
- "http 1.3.1",
+ "http 1.4.0",
 ]
 
 [[package]]
@@ -3365,7 +3425,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
 dependencies = [
  "bytes",
  "futures-core",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "pin-project-lite",
 ]
@@ -3384,26 +3444,28 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 
 [[package]]
 name = "humantime"
-version = "2.2.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f"
+checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"
 
 [[package]]
 name = "hyper"
-version = "1.6.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
+checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
 dependencies = [
+ "atomic-waker",
  "bytes",
  "futures-channel",
- "futures-util",
+ "futures-core",
  "h2",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "httparse",
  "httpdate",
  "itoa",
  "pin-project-lite",
+ "pin-utils",
  "smallvec",
  "tokio",
  "want",
@@ -3426,12 +3488,11 @@ dependencies = [
 
 [[package]]
 name = "hyper-rustls"
-version = "0.27.5"
+version = "0.27.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
 dependencies = [
- "futures-util",
- "http 1.3.1",
+ "http 1.4.0",
  "hyper",
  "hyper-util",
  "rustls",
@@ -3457,17 +3518,20 @@ dependencies = [
 
 [[package]]
 name = "hyper-util"
-version = "0.1.12"
+version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
 dependencies = [
+ "base64 0.22.1",
  "bytes",
  "futures-channel",
  "futures-util",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "hyper",
+ "ipnet",
  "libc",
+ "percent-encoding",
  "pin-project-lite",
  "socket2",
  "tokio",
@@ -3492,9 +3556,9 @@ dependencies = [
 
 [[package]]
 name = "iana-time-zone"
-version = "0.1.63"
+version = "0.1.65"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8"
+checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
@@ -3516,9 +3580,9 @@ dependencies = [
 
 [[package]]
 name = "icu_collections"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47"
+checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
 dependencies = [
  "displaydoc",
  "potential_utf",
@@ -3529,9 +3593,9 @@ dependencies = [
 
 [[package]]
 name = "icu_locale_core"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a"
+checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
 dependencies = [
  "displaydoc",
  "litemap",
@@ -3542,11 +3606,10 @@ dependencies = [
 
 [[package]]
 name = "icu_normalizer"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979"
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
 dependencies = [
- "displaydoc",
  "icu_collections",
  "icu_normalizer_data",
  "icu_properties",
@@ -3557,42 +3620,38 @@ dependencies = [
 
 [[package]]
 name = "icu_normalizer_data"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
+checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
 
 [[package]]
 name = "icu_properties"
-version = "2.0.1"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b"
+checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
 dependencies = [
- "displaydoc",
  "icu_collections",
  "icu_locale_core",
  "icu_properties_data",
  "icu_provider",
- "potential_utf",
  "zerotrie",
  "zerovec",
 ]
 
 [[package]]
 name = "icu_properties_data"
-version = "2.0.1"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632"
+checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
 
 [[package]]
 name = "icu_provider"
-version = "2.0.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af"
+checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
 dependencies = [
  "displaydoc",
  "icu_locale_core",
- "stable_deref_trait",
- "tinystr",
  "writeable",
  "yoke",
  "zerofrom",
@@ -3600,6 +3659,12 @@ dependencies = [
  "zerovec",
 ]
 
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -3608,9 +3673,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
 
 [[package]]
 name = "idna"
-version = "1.0.3"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
 dependencies = [
  "idna_adapter",
  "smallvec",
@@ -3640,46 +3705,42 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.9.0"
+version = "2.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
+checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
 dependencies = [
  "equivalent",
- "hashbrown 0.15.3",
+ "hashbrown 0.16.1",
  "serde",
+ "serde_core",
 ]
 
 [[package]]
 name = "indicatif"
-version = "0.17.11"
+version = "0.18.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
+checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
 dependencies = [
- "console",
- "number_prefix",
+ "console 0.16.3",
  "portable-atomic",
- "unicode-width 0.2.0",
+ "unicode-width 0.2.2",
+ "unit-prefix",
  "web-time",
 ]
 
-[[package]]
-name = "indoc"
-version = "2.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
-
 [[package]]
 name = "insta"
-version = "1.43.1"
+version = "1.46.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "154934ea70c58054b556dd430b99a98c2a7ff5309ac9891597e339b5c28f4371"
+checksum = "e82db8c87c7f1ccecb34ce0c24399b8a73081427f3c7c50a5d597925356115e4"
 dependencies = [
- "console",
+ "console 0.15.11",
  "globset",
  "once_cell",
  "regex",
  "serde",
  "similar",
+ "tempfile",
  "walkdir",
 ]
 
@@ -3702,44 +3763,25 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
 
 [[package]]
 name = "ipnet"
-version = "2.11.0"
+version = "2.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
+checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
 
 [[package]]
-name = "is-terminal"
-version = "0.4.16"
+name = "iri-string"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
+checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
 dependencies = [
- "hermit-abi",
- "libc",
- "windows-sys 0.59.0",
+ "memchr",
+ "serde",
 ]
 
 [[package]]
 name = "is_terminal_polyfill"
-version = "1.70.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
-
-[[package]]
-name = "itertools"
-version = "0.10.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
-dependencies = [
- "either",
-]
-
-[[package]]
-name = "itertools"
-version = "0.12.1"
+version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
 [[package]]
 name = "itertools"
@@ -3761,49 +3803,49 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.15"
+version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
 
 [[package]]
 name = "jiff"
-version = "0.2.14"
+version = "0.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a194df1107f33c79f4f93d02c80798520551949d59dfad22b6157048a88cca93"
+checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359"
 dependencies = [
  "jiff-static",
  "log",
  "portable-atomic",
  "portable-atomic-util",
- "serde",
+ "serde_core",
 ]
 
 [[package]]
 name = "jiff-static"
-version = "0.2.14"
+version = "0.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c6e1db7ed32c6c71b759497fae34bf7933636f75a251b9e736555da426f6442"
+checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "jobserver"
-version = "0.1.33"
+version = "0.1.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "libc",
 ]
 
 [[package]]
 name = "js-sys"
-version = "0.3.77"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
  "once_cell",
  "wasm-bindgen",
@@ -3816,16 +3858,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
 [[package]]
-name = "lazycell"
-version = "1.3.0"
+name = "leb128fmt"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
 
 [[package]]
 name = "lexical-core"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958"
+checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594"
 dependencies = [
  "lexical-parse-float",
  "lexical-parse-integer",
@@ -3836,84 +3878,59 @@ dependencies = [
 
 [[package]]
 name = "lexical-parse-float"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2"
+checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56"
 dependencies = [
  "lexical-parse-integer",
  "lexical-util",
- "static_assertions",
 ]
 
 [[package]]
 name = "lexical-parse-integer"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e"
+checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34"
 dependencies = [
  "lexical-util",
- "static_assertions",
 ]
 
 [[package]]
 name = "lexical-util"
-version = "1.0.6"
+version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3"
-dependencies = [
- "static_assertions",
-]
+checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17"
 
 [[package]]
 name = "lexical-write-float"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd"
+checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361"
 dependencies = [
  "lexical-util",
  "lexical-write-integer",
- "static_assertions",
 ]
 
 [[package]]
 name = "lexical-write-integer"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978"
+checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df"
 dependencies = [
  "lexical-util",
- "static_assertions",
 ]
 
 [[package]]
-name = "libc"
-version = "0.2.172"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
-
-[[package]]
-name = "libflate"
-version = "2.1.0"
+name = "libbz2-rs-sys"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
-dependencies = [
- "adler32",
- "core2",
- "crc32fast",
- "dary_heap",
- "libflate_lz77",
-]
+checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
 
 [[package]]
-name = "libflate_lz77"
-version = "2.1.0"
+name = "libc"
+version = "0.2.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
-dependencies = [
- "core2",
- "hashbrown 0.14.5",
- "rle-decode-fast",
-]
+checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
 
 [[package]]
 name = "libloading"
@@ -3926,96 +3943,92 @@ dependencies = [
 ]
 
 [[package]]
-name = "libloading"
-version = "0.8.7"
+name = "liblzma"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c"
+checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899"
 dependencies = [
- "cfg-if",
- "windows-targets 0.53.0",
+ "liblzma-sys",
+]
+
+[[package]]
+name = "liblzma-sys"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
 ]
 
 [[package]]
 name = "libm"
-version = "0.2.15"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
 [[package]]
 name = "libmimalloc-sys"
-version = "0.1.42"
+version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec9d6fac27761dabcd4ee73571cdb06b7022dc99089acbe5435691edffaac0f4"
+checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870"
 dependencies = [
  "cc",
+ "cty",
  "libc",
 ]
 
 [[package]]
 name = "libredox"
-version = "0.1.3"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "libc",
- "redox_syscall 0.5.12",
+ "plain",
+ "redox_syscall 0.7.3",
 ]
 
 [[package]]
 name = "libtest-mimic"
-version = "0.8.1"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33"
+checksum = "14e6ba06f0ade6e504aff834d7c34298e5155c6baca353cc6a4aaff2f9fd7f33"
 dependencies = [
- "anstream",
+ "anstream 1.0.0",
  "anstyle",
- "clap 4.5.39",
+ "clap",
  "escape8259",
 ]
 
-[[package]]
-name = "libz-rs-sys"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a"
-dependencies = [
- "zlib-rs",
-]
-
 [[package]]
 name = "linux-raw-sys"
-version = "0.4.15"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
 
 [[package]]
-name = "linux-raw-sys"
-version = "0.9.4"
+name = "litemap"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
-
-[[package]]
-name = "litemap"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
 
 [[package]]
 name = "lock_api"
-version = "0.4.12"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
 dependencies = [
- "autocfg",
  "scopeguard",
 ]
 
 [[package]]
 name = "log"
-version = "0.4.27"
+version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
 
 [[package]]
 name = "lru-slab"
@@ -4025,29 +4038,18 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
 [[package]]
 name = "lz4_flex"
-version = "0.11.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
-dependencies = [
- "twox-hash 1.6.3",
-]
-
-[[package]]
-name = "lzma-sys"
-version = "0.1.20"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
+checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746"
 dependencies = [
- "cc",
- "libc",
- "pkg-config",
+ "twox-hash",
 ]
 
 [[package]]
 name = "matchit"
-version = "0.7.3"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
 
 [[package]]
 name = "md-5"
@@ -4061,24 +4063,15 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
-
-[[package]]
-name = "memoffset"
-version = "0.9.1"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
-dependencies = [
- "autocfg",
-]
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
 [[package]]
 name = "mimalloc"
-version = "0.1.46"
+version = "0.1.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "995942f432bbb4822a7e9c3faa87a695185b0d09273ba85f097b54f4e458f2af"
+checksum = "e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8"
 dependencies = [
  "libmimalloc-sys",
 ]
@@ -4091,38 +4084,33 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
 
 [[package]]
 name = "minicov"
-version = "0.3.7"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f27fe9f1cc3c22e1687f9446c2083c4c5fc7f0bcf1c7a86bdbded14985895b4b"
+checksum = "4869b6a491569605d66d3952bcdf03df789e5b536e5f0cf7758a7f08a55ae24d"
 dependencies = [
  "cc",
  "walkdir",
 ]
 
-[[package]]
-name = "minimal-lexical"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
-
 [[package]]
 name = "miniz_oxide"
-version = "0.8.8"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
 dependencies = [
  "adler2",
+ "simd-adler32",
 ]
 
 [[package]]
 name = "mio"
-version = "1.0.3"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
+checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc"
 dependencies = [
  "libc",
- "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.52.0",
+ "wasi 0.11.1+wasi-snapshot-preview1",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4146,45 +4134,49 @@ version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "cfg-if",
  "cfg_aliases",
  "libc",
 ]
 
 [[package]]
-name = "nom"
-version = "7.1.3"
+name = "nix"
+version = "0.31.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3"
 dependencies = [
- "memchr",
- "minimal-lexical",
+ "bitflags",
+ "cfg-if",
+ "cfg_aliases",
+ "libc",
 ]
 
 [[package]]
-name = "normalize-line-endings"
-version = "0.3.0"
+name = "nom"
+version = "8.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be"
+checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
+dependencies = [
+ "memchr",
+]
 
 [[package]]
 name = "ntapi"
-version = "0.4.1"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
+checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
 dependencies = [
  "winapi",
 ]
 
 [[package]]
 name = "nu-ansi-term"
-version = "0.46.0"
+version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "overload",
- "winapi",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4223,9 +4215,9 @@ dependencies = [
 
 [[package]]
 name = "num-conv"
-version = "0.1.0"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050"
 
 [[package]]
 name = "num-integer"
@@ -4268,45 +4260,48 @@ dependencies = [
  "libm",
 ]
 
-[[package]]
-name = "number_prefix"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
-
 [[package]]
 name = "objc2-core-foundation"
-version = "0.3.1"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166"
+checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
 ]
 
 [[package]]
 name = "objc2-io-kit"
-version = "0.3.1"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a"
+checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15"
 dependencies = [
  "libc",
  "objc2-core-foundation",
 ]
 
+[[package]]
+name = "objc2-system-configuration"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7216bd11cbda54ccabcab84d523dc93b858ec75ecfb3a7d89513fa22464da396"
+dependencies = [
+ "objc2-core-foundation",
+]
+
 [[package]]
 name = "object"
-version = "0.36.7"
+version = "0.37.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
+checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
 name = "object_store"
-version = "0.12.1"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d94ac16b433c0ccf75326388c893d2835ab7457ea35ab8ba5d745c053ef5fa16"
+checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4314,7 +4309,7 @@ dependencies = [
  "chrono",
  "form_urlencoded",
  "futures",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body-util",
  "humantime",
  "hyper",
@@ -4323,14 +4318,14 @@ dependencies = [
  "parking_lot",
  "percent-encoding",
  "quick-xml",
- "rand 0.9.1",
+ "rand 0.9.2",
  "reqwest",
  "ring",
- "rustls-pemfile",
+ "rustls-pki-types",
  "serde",
  "serde_json",
  "serde_urlencoded",
- "thiserror 2.0.12",
+ "thiserror",
  "tokio",
  "tracing",
  "url",
@@ -4341,9 +4336,15 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.21.3"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
 [[package]]
 name = "oorandom"
@@ -4353,9 +4354,9 @@ checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
 
 [[package]]
 name = "openssl-probe"
-version = "0.1.6"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 
 [[package]]
 name = "option-ext"
@@ -4379,22 +4380,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
 
 [[package]]
-name = "overload"
-version = "0.1.1"
+name = "owo-colors"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d"
 
 [[package]]
-name = "owo-colors"
-version = "4.2.1"
+name = "page_size"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26995317201fa17f3656c36716aed4a7c81743a9634ac4c99c0eeda495db0cec"
+checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da"
+dependencies = [
+ "libc",
+ "winapi",
+]
 
 [[package]]
 name = "parking_lot"
-version = "0.12.3"
+version = "0.12.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
 dependencies = [
  "lock_api",
  "parking_lot_core",
@@ -4402,27 +4407,26 @@ dependencies = [
 
 [[package]]
 name = "parking_lot_core"
-version = "0.9.10"
+version = "0.9.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.5.12",
+ "redox_syscall 0.5.18",
  "smallvec",
- "windows-targets 0.52.6",
+ "windows-link",
 ]
 
 [[package]]
 name = "parquet"
-version = "55.1.0"
+version = "58.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231"
+checksum = "3f491d0ef1b510194426ee67ddc18a9b747ef3c42050c19322a2cd2e1666c29b"
 dependencies = [
- "ahash 0.8.12",
+ "ahash",
  "arrow-array",
  "arrow-buffer",
- "arrow-cast",
  "arrow-data",
  "arrow-ipc",
  "arrow-schema",
@@ -4434,18 +4438,20 @@ dependencies = [
  "flate2",
  "futures",
  "half",
- "hashbrown 0.15.3",
+ "hashbrown 0.16.1",
  "lz4_flex",
- "num",
  "num-bigint",
+ "num-integer",
+ "num-traits",
  "object_store",
  "paste",
+ "ring",
  "seq-macro",
  "simdutf8",
  "snap",
  "thrift",
  "tokio",
- "twox-hash 2.1.0",
+ "twox-hash",
  "zstd",
 ]
 
@@ -4471,16 +4477,7 @@ dependencies = [
  "regex",
  "regex-syntax",
  "structmeta",
- "syn 2.0.101",
-]
-
-[[package]]
-name = "parse-zoneinfo"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24"
-dependencies = [
- "regex",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4491,36 +4488,58 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
 [[package]]
 name = "pbjson"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68"
+checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3"
 dependencies = [
- "base64 0.21.7",
+ "base64 0.22.1",
+ "serde",
+]
+
+[[package]]
+name = "pbjson"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8edd1efdd8ab23ba9cb9ace3d9987a72663d5d7c9f74fa00b51d6213645cf6c"
+dependencies = [
+ "base64 0.22.1",
  "serde",
 ]
 
 [[package]]
 name = "pbjson-build"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9"
+checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095"
 dependencies = [
- "heck 0.5.0",
- "itertools 0.13.0",
+ "heck",
+ "itertools 0.14.0",
+ "prost",
+ "prost-types",
+]
+
+[[package]]
+name = "pbjson-build"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ed4d5c6ae95e08ac768883c8401cf0e8deb4e6e1d6a4e1fd3d2ec4f0ec63200"
+dependencies = [
+ "heck",
+ "itertools 0.14.0",
  "prost",
  "prost-types",
 ]
 
 [[package]]
 name = "pbjson-types"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887"
+checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526"
 dependencies = [
  "bytes",
  "chrono",
- "pbjson",
- "pbjson-build",
+ "pbjson 0.8.0",
+ "pbjson-build 0.8.0",
  "prost",
  "prost-build",
  "serde",
@@ -4528,95 +4547,84 @@ dependencies = [
 
 [[package]]
 name = "percent-encoding"
-version = "2.3.1"
+version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
 
 [[package]]
 name = "petgraph"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
-dependencies = [
- "fixedbitset",
- "indexmap 2.9.0",
-]
-
-[[package]]
-name = "petgraph"
-version = "0.8.1"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a98c6720655620a521dcc722d0ad66cd8afd5d86e34a89ef691c50b7b24de06"
+checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
 dependencies = [
  "fixedbitset",
- "hashbrown 0.15.3",
- "indexmap 2.9.0",
+ "hashbrown 0.15.5",
+ "indexmap 2.13.0",
  "serde",
 ]
 
 [[package]]
 name = "phf"
-version = "0.11.3"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
 dependencies = [
- "phf_shared",
+ "phf_shared 0.12.1",
 ]
 
 [[package]]
-name = "phf_codegen"
-version = "0.11.3"
+name = "phf"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
 dependencies = [
- "phf_generator",
- "phf_shared",
+ "phf_shared 0.13.1",
+ "serde",
 ]
 
 [[package]]
-name = "phf_generator"
-version = "0.11.3"
+name = "phf_shared"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981"
 dependencies = [
- "phf_shared",
- "rand 0.8.5",
+ "siphasher",
 ]
 
 [[package]]
 name = "phf_shared"
-version = "0.11.3"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
 dependencies = [
  "siphasher",
 ]
 
 [[package]]
 name = "pin-project"
-version = "1.1.10"
+version = "1.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
 dependencies = [
  "pin-project-internal",
 ]
 
 [[package]]
 name = "pin-project-internal"
-version = "1.1.10"
+version = "1.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "pin-project-lite"
-version = "0.2.16"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
 
 [[package]]
 name = "pin-utils"
@@ -4630,6 +4638,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "plotters"
 version = "0.3.7"
@@ -4660,36 +4674,36 @@ dependencies = [
 
 [[package]]
 name = "portable-atomic"
-version = "1.11.0"
+version = "1.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
 
 [[package]]
 name = "portable-atomic-util"
-version = "0.2.4"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
+checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
 dependencies = [
  "portable-atomic",
 ]
 
 [[package]]
 name = "postgres-derive"
-version = "0.4.6"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69700ea4603c5ef32d447708e6a19cd3e8ac197a000842e97f527daea5e4175f"
+checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "postgres-protocol"
-version = "0.6.8"
+version = "0.6.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76ff0abab4a9b844b93ef7b81f1efc0a366062aaef2cd702c76256b5dc075c54"
+checksum = "3ee9dd5fe15055d2b6806f4736aa0c9637217074e224bbec46d4041b91bb9491"
 dependencies = [
  "base64 0.22.1",
  "byteorder",
@@ -4698,16 +4712,16 @@ dependencies = [
  "hmac",
  "md-5",
  "memchr",
- "rand 0.9.1",
+ "rand 0.9.2",
  "sha2",
  "stringprep",
 ]
 
 [[package]]
 name = "postgres-types"
-version = "0.2.9"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "613283563cd90e1dfc3518d548caee47e0e725455ed619881f5cf21f36de4b48"
+checksum = "54b858f82211e84682fecd373f68e1ceae642d8d751a1ebd13f33de6257b3e20"
 dependencies = [
  "bytes",
  "chrono",
@@ -4718,9 +4732,9 @@ dependencies = [
 
 [[package]]
 name = "potential_utf"
-version = "0.1.2"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585"
+checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
 dependencies = [
  "zerovec",
 ]
@@ -4741,92 +4755,48 @@ dependencies = [
 ]
 
 [[package]]
-name = "predicates"
-version = "3.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573"
-dependencies = [
- "anstyle",
- "difflib",
- "float-cmp",
- "normalize-line-endings",
- "predicates-core",
- "regex",
-]
-
-[[package]]
-name = "predicates-core"
-version = "1.0.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa"
-
-[[package]]
-name = "predicates-tree"
-version = "1.0.12"
+name = "pretty_assertions"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c"
+checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d"
 dependencies = [
- "predicates-core",
- "termtree",
+ "diff",
+ "yansi",
 ]
 
 [[package]]
 name = "prettyplease"
-version = "0.2.32"
+version = "0.2.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
 dependencies = [
  "proc-macro2",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "proc-macro-crate"
-version = "3.3.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
+checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f"
 dependencies = [
  "toml_edit",
 ]
 
-[[package]]
-name = "proc-macro-error"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
-dependencies = [
- "proc-macro-error-attr",
- "proc-macro2",
- "quote",
- "syn 1.0.109",
- "version_check",
-]
-
-[[package]]
-name = "proc-macro-error-attr"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
-dependencies = [
- "proc-macro2",
- "quote",
- "version_check",
-]
-
 [[package]]
 name = "proc-macro2"
-version = "1.0.95"
+version = "1.0.106"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "prost"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
+checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
 dependencies = [
  "bytes",
  "prost-derive",
@@ -4834,42 +4804,41 @@ dependencies = [
 
 [[package]]
 name = "prost-build"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
+checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "itertools 0.14.0",
  "log",
  "multimap",
- "once_cell",
- "petgraph 0.7.1",
+ "petgraph",
  "prettyplease",
  "prost",
  "prost-types",
  "regex",
- "syn 2.0.101",
+ "syn 2.0.117",
  "tempfile",
 ]
 
 [[package]]
 name = "prost-derive"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
+checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
 dependencies = [
  "anyhow",
  "itertools 0.14.0",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "prost-types"
-version = "0.13.5"
+version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
+checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7"
 dependencies = [
  "prost",
 ]
@@ -4885,96 +4854,14 @@ dependencies = [
 
 [[package]]
 name = "psm"
-version = "0.1.26"
+version = "0.1.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f"
+checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8"
 dependencies = [
+ "ar_archive_writer",
  "cc",
 ]
 
-[[package]]
-name = "ptr_meta"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1"
-dependencies = [
- "ptr_meta_derive",
-]
-
-[[package]]
-name = "ptr_meta_derive"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
-[[package]]
-name = "pyo3"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219"
-dependencies = [
- "cfg-if",
- "indoc",
- "libc",
- "memoffset",
- "once_cell",
- "portable-atomic",
- "pyo3-build-config",
- "pyo3-ffi",
- "pyo3-macros",
- "unindent",
-]
-
-[[package]]
-name = "pyo3-build-config"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999"
-dependencies = [
- "once_cell",
- "target-lexicon",
-]
-
-[[package]]
-name = "pyo3-ffi"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33"
-dependencies = [
- "libc",
- "pyo3-build-config",
-]
-
-[[package]]
-name = "pyo3-macros"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9"
-dependencies = [
- "proc-macro2",
- "pyo3-macros-backend",
- "quote",
- "syn 2.0.101",
-]
-
-[[package]]
-name = "pyo3-macros-backend"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a"
-dependencies = [
- "heck 0.5.0",
- "proc-macro2",
- "pyo3-build-config",
- "quote",
- "syn 2.0.101",
-]
-
 [[package]]
 name = "quad-rand"
 version = "0.2.3"
@@ -4983,9 +4870,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40"
 
 [[package]]
 name = "quick-xml"
-version = "0.37.5"
+version = "0.38.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
+checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
 dependencies = [
  "memchr",
  "serde",
@@ -4993,19 +4880,19 @@ dependencies = [
 
 [[package]]
 name = "quinn"
-version = "0.11.8"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
 dependencies = [
  "bytes",
  "cfg_aliases",
  "pin-project-lite",
  "quinn-proto",
  "quinn-udp",
- "rustc-hash 2.1.1",
+ "rustc-hash",
  "rustls",
  "socket2",
- "thiserror 2.0.12",
+ "thiserror",
  "tokio",
  "tracing",
  "web-time",
@@ -5013,20 +4900,20 @@ dependencies = [
 
 [[package]]
 name = "quinn-proto"
-version = "0.11.12"
+version = "0.11.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
 dependencies = [
  "bytes",
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "lru-slab",
- "rand 0.9.1",
+ "rand 0.9.2",
  "ring",
- "rustc-hash 2.1.1",
+ "rustc-hash",
  "rustls",
  "rustls-pki-types",
  "slab",
- "thiserror 2.0.12",
+ "thiserror",
  "tinyvec",
  "tracing",
  "web-time",
@@ -5034,38 +4921,38 @@ dependencies = [
 
 [[package]]
 name = "quinn-udp"
-version = "0.5.12"
+version = "0.5.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee4e529991f949c5e25755532370b8af5d114acae52326361d68d47af64aa842"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
 dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.40"
+version = "1.0.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
 dependencies = [
  "proc-macro2",
 ]
 
 [[package]]
 name = "r-efi"
-version = "5.2.0"
+version = "5.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
 
 [[package]]
-name = "radium"
-version = "0.7.0"
+name = "r-efi"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
 
 [[package]]
 name = "radix_trie"
@@ -5090,12 +4977,12 @@ dependencies = [
 
 [[package]]
 name = "rand"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
 dependencies = [
  "rand_chacha 0.9.0",
- "rand_core 0.9.3",
+ "rand_core 0.9.5",
 ]
 
 [[package]]
@@ -5115,7 +5002,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
 dependencies = [
  "ppv-lite86",
- "rand_core 0.9.3",
+ "rand_core 0.9.5",
 ]
 
 [[package]]
@@ -5124,16 +5011,16 @@ version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
 dependencies = [
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
 ]
 
 [[package]]
 name = "rand_core"
-version = "0.9.3"
+version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
 ]
 
 [[package]]
@@ -5143,14 +5030,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
 dependencies = [
  "num-traits",
- "rand 0.9.1",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "rayon"
-version = "1.10.0"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
 dependencies = [
  "either",
  "rayon-core",
@@ -5158,9 +5045,9 @@ dependencies = [
 
 [[package]]
 name = "rayon-core"
-version = "1.12.1"
+version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
 dependencies = [
  "crossbeam-deque",
  "crossbeam-utils",
@@ -5183,43 +5070,63 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
 dependencies = [
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "redox_syscall"
-version = "0.3.5"
+version = "0.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags",
 ]
 
 [[package]]
 name = "redox_syscall"
-version = "0.5.12"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af"
+checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
 ]
 
 [[package]]
 name = "redox_users"
-version = "0.5.0"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b"
+checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac"
 dependencies = [
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "libredox",
- "thiserror 2.0.12",
+ "thiserror",
+]
+
+[[package]]
+name = "ref-cast"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
+dependencies = [
+ "ref-cast-impl",
+]
+
+[[package]]
+name = "ref-cast-impl"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "regex"
-version = "1.11.1"
+version = "1.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -5229,9 +5136,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.9"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -5240,23 +5147,23 @@ dependencies = [
 
 [[package]]
 name = "regex-lite"
-version = "0.1.6"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
+checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
 
 [[package]]
 name = "regex-syntax"
-version = "0.8.5"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
 
 [[package]]
 name = "regress"
-version = "0.10.3"
+version = "0.10.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ef7fa9ed0256d64a688a3747d0fef7a88851c18a5e1d57f115f38ec2e09366"
+checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48"
 dependencies = [
- "hashbrown 0.15.3",
+ "hashbrown 0.16.1",
  "memchr",
 ]
 
@@ -5266,15 +5173,6 @@ version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2"
 
-[[package]]
-name = "rend"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c"
-dependencies = [
- "bytecheck",
-]
-
 [[package]]
 name = "repr_offset"
 version = "0.2.2"
@@ -5286,32 +5184,28 @@ dependencies = [
 
 [[package]]
 name = "reqwest"
-version = "0.12.15"
+version = "0.12.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb"
+checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
 dependencies = [
  "base64 0.22.1",
  "bytes",
  "futures-core",
  "futures-util",
  "h2",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "hyper",
  "hyper-rustls",
  "hyper-util",
- "ipnet",
  "js-sys",
  "log",
- "mime",
- "once_cell",
  "percent-encoding",
  "pin-project-lite",
  "quinn",
  "rustls",
  "rustls-native-certs",
- "rustls-pemfile",
  "rustls-pki-types",
  "serde",
  "serde_json",
@@ -5320,14 +5214,14 @@ dependencies = [
  "tokio",
  "tokio-rustls",
  "tokio-util",
- "tower 0.5.2",
+ "tower",
+ "tower-http",
  "tower-service",
  "url",
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "wasm-streams",
  "web-sys",
- "windows-registry",
 ]
 
 [[package]]
@@ -5338,64 +5232,28 @@ checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
 dependencies = [
  "cc",
  "cfg-if",
- "getrandom 0.2.16",
+ "getrandom 0.2.17",
  "libc",
  "untrusted",
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "rkyv"
-version = "0.7.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b"
-dependencies = [
- "bitvec",
- "bytecheck",
- "bytes",
- "hashbrown 0.12.3",
- "ptr_meta",
- "rend",
- "rkyv_derive",
- "seahash",
- "tinyvec",
- "uuid",
-]
-
-[[package]]
-name = "rkyv_derive"
-version = "0.7.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
-[[package]]
-name = "rle-decode-fast"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
-
 [[package]]
 name = "rstest"
-version = "0.25.0"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fc39292f8613e913f7df8fa892b8944ceb47c247b78e1b1ae2f09e019be789d"
+checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49"
 dependencies = [
  "futures-timer",
  "futures-util",
  "rstest_macros",
- "rustc_version",
 ]
 
 [[package]]
 name = "rstest_macros"
-version = "0.25.0"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746"
+checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0"
 dependencies = [
  "cfg-if",
  "glob",
@@ -5405,7 +5263,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.101",
+ "syn 2.0.117",
  "unicode-ident",
 ]
 
@@ -5417,38 +5275,9 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14"
 dependencies = [
  "quote",
  "rand 0.8.5",
- "syn 2.0.101",
-]
-
-[[package]]
-name = "rust_decimal"
-version = "1.37.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "faa7de2ba56ac291bd90c6b9bece784a52ae1411f9506544b3eae36dd2356d50"
-dependencies = [
- "arrayvec",
- "borsh",
- "bytes",
- "num-traits",
- "postgres-types",
- "rand 0.8.5",
- "rkyv",
- "serde",
- "serde_json",
+ "syn 2.0.117",
 ]
 
-[[package]]
-name = "rustc-demangle"
-version = "0.1.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
-
-[[package]]
-name = "rustc-hash"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
-
 [[package]]
 name = "rustc-hash"
 version = "2.1.1"
@@ -5466,37 +5295,25 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
-dependencies = [
- "bitflags 2.9.1",
- "errno",
- "libc",
- "linux-raw-sys 0.4.15",
- "windows-sys 0.59.0",
-]
-
-[[package]]
-name = "rustix"
-version = "1.0.7"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "errno",
  "libc",
- "linux-raw-sys 0.9.4",
- "windows-sys 0.59.0",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "rustls"
-version = "0.23.27"
+version = "0.23.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321"
+checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
 dependencies = [
  "aws-lc-rs",
+ "log",
  "once_cell",
  "ring",
  "rustls-pki-types",
@@ -5507,9 +5324,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-native-certs"
-version = "0.8.1"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3"
+checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
 dependencies = [
  "openssl-probe",
  "rustls-pki-types",
@@ -5517,20 +5334,11 @@ dependencies = [
  "security-framework",
 ]
 
-[[package]]
-name = "rustls-pemfile"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
-dependencies = [
- "rustls-pki-types",
-]
-
 [[package]]
 name = "rustls-pki-types"
-version = "1.12.0"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79"
+checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
 dependencies = [
  "web-time",
  "zeroize",
@@ -5538,9 +5346,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-webpki"
-version = "0.103.3"
+version = "0.103.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53"
 dependencies = [
  "aws-lc-rs",
  "ring",
@@ -5550,17 +5358,17 @@ dependencies = [
 
 [[package]]
 name = "rustversion"
-version = "1.0.20"
+version = "1.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
 
 [[package]]
 name = "rustyline"
-version = "16.0.0"
+version = "17.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62fd9ca5ebc709e8535e8ef7c658eb51457987e48c98ead2be482172accc408d"
+checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "cfg-if",
  "clipboard-win",
  "fd-lock",
@@ -5568,19 +5376,19 @@ dependencies = [
  "libc",
  "log",
  "memchr",
- "nix",
+ "nix 0.30.1",
  "radix_trie",
  "unicode-segmentation",
- "unicode-width 0.2.0",
+ "unicode-width 0.2.2",
  "utf8parse",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
 name = "ryu"
-version = "1.0.20"
+version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
 
 [[package]]
 name = "same-file"
@@ -5593,11 +5401,11 @@ dependencies = [
 
 [[package]]
 name = "schannel"
-version = "0.1.27"
+version = "0.1.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
+checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5612,6 +5420,30 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "schemars"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemars"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "schemars_derive"
 version = "0.8.22"
@@ -5621,7 +5453,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "serde_derive_internals",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5630,19 +5462,13 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
-[[package]]
-name = "seahash"
-version = "4.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
-
 [[package]]
 name = "security-framework"
-version = "3.2.0"
+version = "3.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316"
+checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
 dependencies = [
- "bitflags 2.9.1",
+ "bitflags",
  "core-foundation",
  "core-foundation-sys",
  "libc",
@@ -5651,9 +5477,9 @@ dependencies = [
 
 [[package]]
 name = "security-framework-sys"
-version = "2.14.0"
+version = "2.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32"
+checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -5661,11 +5487,12 @@ dependencies = [
 
 [[package]]
 name = "semver"
-version = "1.0.26"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
 dependencies = [
  "serde",
+ "serde_core",
 ]
 
 [[package]]
@@ -5676,31 +5503,42 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
 
 [[package]]
 name = "serde"
-version = "1.0.219"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
 dependencies = [
+ "serde_core",
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_bytes"
-version = "0.11.17"
+version = "0.11.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8437fd221bde2d4ca316d61b90e337e9e702b3820b87d63caa9ba6c02bd06d96"
+checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8"
 dependencies = [
  "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.219"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5711,19 +5549,21 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.140"
+version = "1.0.149"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
 dependencies = [
+ "indexmap 2.13.0",
  "itoa",
  "memchr",
- "ryu",
  "serde",
+ "serde_core",
+ "zmij",
 ]
 
 [[package]]
@@ -5734,19 +5574,19 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "serde_tokenstream"
-version = "0.2.2"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1"
+checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69"
 dependencies = [
  "proc-macro2",
  "quote",
  "serde",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5763,17 +5603,18 @@ dependencies = [
 
 [[package]]
 name = "serde_with"
-version = "3.12.0"
+version = "3.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa"
+checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f"
 dependencies = [
  "base64 0.22.1",
  "chrono",
  "hex",
  "indexmap 1.9.3",
- "indexmap 2.9.0",
- "serde",
- "serde_derive",
+ "indexmap 2.13.0",
+ "schemars 0.9.0",
+ "schemars 1.2.1",
+ "serde_core",
  "serde_json",
  "serde_with_macros",
  "time",
@@ -5781,14 +5622,14 @@ dependencies = [
 
 [[package]]
 name = "serde_with_macros"
-version = "3.12.0"
+version = "3.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e"
+checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65"
 dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -5797,13 +5638,24 @@ version = "0.9.34+deprecated"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
 dependencies = [
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
  "itoa",
  "ryu",
  "serde",
  "unsafe-libyaml",
 ]
 
+[[package]]
+name = "sha1"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
 [[package]]
 name = "sha2"
 version = "0.10.9"
@@ -5832,13 +5684,20 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
 [[package]]
 name = "signal-hook-registry"
-version = "1.4.5"
+version = "1.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
 dependencies = [
+ "errno",
  "libc",
 ]
 
+[[package]]
+name = "simd-adler32"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
+
 [[package]]
 name = "simdutf8"
 version = "0.1.5"
@@ -5853,24 +5712,21 @@ checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa"
 
 [[package]]
 name = "siphasher"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
+checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
 
 [[package]]
 name = "slab"
-version = "0.4.9"
+version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
-dependencies = [
- "autocfg",
-]
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
 
 [[package]]
 name = "smallvec"
-version = "1.15.0"
+version = "1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 
 [[package]]
 name = "snap"
@@ -5898,19 +5754,19 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.9"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
+checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "sqllogictest"
-version = "0.28.2"
+version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94181af64007792bd1ab6d22023fbe86c2ccc50c1031b5bac554b5d057597e7b"
+checksum = "d03b2262a244037b0b510edbd25a8e6c9fb8d73ee0237fc6cc95a54c16f94a82"
 dependencies = [
  "async-trait",
  "educe",
@@ -5927,15 +5783,15 @@ dependencies = [
  "similar",
  "subst",
  "tempfile",
- "thiserror 2.0.12",
+ "thiserror",
  "tracing",
 ]
 
 [[package]]
 name = "sqlparser"
-version = "0.55.0"
+version = "0.61.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11"
+checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7"
 dependencies = [
  "log",
  "recursive",
@@ -5944,26 +5800,26 @@ dependencies = [
 
 [[package]]
 name = "sqlparser_derive"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c"
+checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "stable_deref_trait"
-version = "1.2.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
 [[package]]
 name = "stacker"
-version = "0.1.21"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b"
+checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013"
 dependencies = [
  "cc",
  "cfg-if",
@@ -5972,12 +5828,6 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
-[[package]]
-name = "static_assertions"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
-
 [[package]]
 name = "stringprep"
 version = "0.1.5"
@@ -6004,7 +5854,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "structmeta-derive",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6015,50 +5865,43 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
-name = "structopt"
-version = "0.3.26"
+name = "strum"
+version = "0.27.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
-dependencies = [
- "clap 2.34.0",
- "lazy_static",
- "structopt-derive",
-]
+checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
 
 [[package]]
-name = "structopt-derive"
-version = "0.4.18"
+name = "strum"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
+
+[[package]]
+name = "strum_macros"
+version = "0.27.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
+checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
 dependencies = [
- "heck 0.3.3",
- "proc-macro-error",
+ "heck",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.117",
 ]
 
-[[package]]
-name = "strum"
-version = "0.26.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
-
 [[package]]
 name = "strum_macros"
-version = "0.26.4"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
- "rustversion",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -6073,13 +5916,14 @@ dependencies = [
 
 [[package]]
 name = "substrait"
-version = "0.56.0"
+version = "0.63.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13de2e20128f2a018dab1cfa30be83ae069219a65968c6f89df66ad124de2397"
+checksum = "e620ff4d5c02fd6f7752931aa74b16a26af66a63022cc1ad412c77edbe0bab47"
 dependencies = [
- "heck 0.5.0",
- "pbjson",
- "pbjson-build",
+ "heck",
+ "indexmap 2.13.0",
+ "pbjson 0.8.0",
+ "pbjson-build 0.8.0",
  "pbjson-types",
  "prettyplease",
  "prost",
@@ -6087,12 +5931,12 @@ dependencies = [
  "prost-types",
  "protobuf-src",
  "regress",
- "schemars",
+ "schemars 0.8.22",
  "semver",
  "serde",
  "serde_json",
  "serde_yaml",
- "syn 2.0.101",
+ "syn 2.0.117",
  "typify",
  "walkdir",
 ]
@@ -6116,9 +5960,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.101"
+version = "2.0.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -6142,14 +5986,14 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "sysinfo"
-version = "0.35.1"
+version = "0.38.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79251336d17c72d9762b8b54be4befe38d2db56fbbc0241396d70f173c39d47a"
+checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f"
 dependencies = [
  "libc",
  "memchr",
@@ -6159,37 +6003,19 @@ dependencies = [
  "windows",
 ]
 
-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
-[[package]]
-name = "target-lexicon"
-version = "0.13.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
-
 [[package]]
 name = "tempfile"
-version = "3.20.0"
+version = "3.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
 dependencies = [
  "fastrand",
- "getrandom 0.3.3",
+ "getrandom 0.4.2",
  "once_cell",
- "rustix 1.0.7",
- "windows-sys 0.59.0",
+ "rustix",
+ "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "termtree"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
-
 [[package]]
 name = "test-utils"
 version = "0.1.0"
@@ -6198,23 +6024,26 @@ dependencies = [
  "chrono-tz",
  "datafusion-common",
  "env_logger",
- "rand 0.9.1",
+ "rand 0.9.2",
 ]
 
 [[package]]
 name = "testcontainers"
-version = "0.24.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23bb7577dca13ad86a78e8271ef5d322f37229ec83b8d98da6d996c588a1ddb1"
+checksum = "c1c0624faaa317c56d6d19136580be889677259caf5c897941c6f446b4655068"
 dependencies = [
+ "astral-tokio-tar",
  "async-trait",
  "bollard",
- "bollard-stubs",
  "bytes",
  "docker_credential",
  "either",
  "etcetera",
+ "ferroid",
  "futures",
+ "http 1.4.0",
+ "itertools 0.14.0",
  "log",
  "memchr",
  "parse-display",
@@ -6222,80 +6051,49 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_with",
- "thiserror 2.0.12",
+ "thiserror",
  "tokio",
  "tokio-stream",
- "tokio-tar",
  "tokio-util",
  "url",
 ]
 
 [[package]]
 name = "testcontainers-modules"
-version = "0.12.1"
+version = "0.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac95cde96549fc19c6bf19ef34cc42bd56e264c1cb97e700e21555be0ecf9e2"
+checksum = "e5985fde5befe4ffa77a052e035e16c2da86e8bae301baa9f9904ad3c494d357"
 dependencies = [
  "testcontainers",
 ]
 
-[[package]]
-name = "textwrap"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
-dependencies = [
- "unicode-width 0.1.14",
-]
-
 [[package]]
 name = "thiserror"
-version = "1.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
-dependencies = [
- "thiserror-impl 1.0.69",
-]
-
-[[package]]
-name = "thiserror"
-version = "2.0.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
-dependencies = [
- "thiserror-impl 2.0.12",
-]
-
-[[package]]
-name = "thiserror-impl"
-version = "1.0.69"
+version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
 dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.101",
+ "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "2.0.12"
+version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "thread_local"
-version = "1.1.8"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
 dependencies = [
  "cfg-if",
- "once_cell",
 ]
 
 [[package]]
@@ -6311,30 +6109,30 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.41"
+version = "0.3.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
 dependencies = [
  "deranged",
  "itoa",
  "num-conv",
  "powerfmt",
- "serde",
+ "serde_core",
  "time-core",
  "time-macros",
 ]
 
 [[package]]
 name = "time-core"
-version = "0.1.4"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
 
 [[package]]
 name = "time-macros"
-version = "0.2.22"
+version = "0.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
 dependencies = [
  "num-conv",
  "time-core",
@@ -6351,9 +6149,9 @@ dependencies = [
 
 [[package]]
 name = "tinystr"
-version = "0.8.1"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b"
+checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
 dependencies = [
  "displaydoc",
  "zerovec",
@@ -6371,9 +6169,9 @@ dependencies = [
 
 [[package]]
 name = "tinyvec"
-version = "1.9.0"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71"
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
 dependencies = [
  "tinyvec_macros",
 ]
@@ -6386,11 +6184,10 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.45.1"
+version = "1.50.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779"
+checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
 dependencies = [
- "backtrace",
  "bytes",
  "libc",
  "mio",
@@ -6399,25 +6196,25 @@ dependencies = [
  "signal-hook-registry",
  "socket2",
  "tokio-macros",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "tokio-macros"
-version = "2.5.0"
+version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
+checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "tokio-postgres"
-version = "0.7.13"
+version = "0.7.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c95d533c83082bb6490e0189acaa0bbeef9084e60471b696ca6988cd0541fb0"
+checksum = "dcea47c8f71744367793f16c2db1f11cb859d28f436bdb4ca9193eb1f787ee42"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -6428,11 +6225,11 @@ dependencies = [
  "log",
  "parking_lot",
  "percent-encoding",
- "phf",
+ "phf 0.13.1",
  "pin-project-lite",
  "postgres-protocol",
  "postgres-types",
- "rand 0.9.1",
+ "rand 0.9.2",
  "socket2",
  "tokio",
  "tokio-util",
@@ -6441,9 +6238,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-rustls"
-version = "0.26.2"
+version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
 dependencies = [
  "rustls",
  "tokio",
@@ -6451,35 +6248,21 @@ dependencies = [
 
 [[package]]
 name = "tokio-stream"
-version = "0.1.17"
+version = "0.1.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
 dependencies = [
  "futures-core",
  "pin-project-lite",
  "tokio",
-]
-
-[[package]]
-name = "tokio-tar"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
-dependencies = [
- "filetime",
- "futures-core",
- "libc",
- "redox_syscall 0.3.5",
- "tokio",
- "tokio-stream",
- "xattr",
+ "tokio-util",
 ]
 
 [[package]]
 name = "tokio-util"
-version = "0.7.15"
+version = "0.7.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
 dependencies = [
  "bytes",
  "futures-core",
@@ -6490,34 +6273,46 @@ dependencies = [
 
 [[package]]
 name = "toml_datetime"
-version = "0.6.9"
+version = "1.0.0+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3"
+checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e"
+dependencies = [
+ "serde_core",
+]
 
 [[package]]
 name = "toml_edit"
-version = "0.22.26"
+version = "0.25.4+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e"
+checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2"
 dependencies = [
- "indexmap 2.9.0",
+ "indexmap 2.13.0",
  "toml_datetime",
+ "toml_parser",
+ "winnow",
+]
+
+[[package]]
+name = "toml_parser"
+version = "1.0.9+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
+dependencies = [
  "winnow",
 ]
 
 [[package]]
 name = "tonic"
-version = "0.12.3"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
+checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec"
 dependencies = [
- "async-stream",
  "async-trait",
  "axum",
  "base64 0.22.1",
  "bytes",
  "h2",
- "http 1.3.1",
+ "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
  "hyper",
@@ -6525,29 +6320,39 @@ dependencies = [
  "hyper-util",
  "percent-encoding",
  "pin-project",
- "prost",
  "socket2",
+ "sync_wrapper",
  "tokio",
  "tokio-stream",
- "tower 0.4.13",
+ "tower",
  "tower-layer",
  "tower-service",
  "tracing",
 ]
 
 [[package]]
-name = "tower"
-version = "0.4.13"
+name = "tonic-prost"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
+checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309"
 dependencies = [
- "futures-core",
- "futures-util",
- "indexmap 1.9.3",
- "pin-project",
+ "bytes",
+ "prost",
+ "tonic",
+]
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "indexmap 2.13.0",
  "pin-project-lite",
- "rand 0.8.5",
  "slab",
+ "sync_wrapper",
  "tokio",
  "tokio-util",
  "tower-layer",
@@ -6556,16 +6361,19 @@ dependencies = [
 ]
 
 [[package]]
-name = "tower"
-version = "0.5.2"
+name = "tower-http"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
 dependencies = [
- "futures-core",
+ "bitflags",
+ "bytes",
  "futures-util",
+ "http 1.4.0",
+ "http-body 1.0.1",
+ "iri-string",
  "pin-project-lite",
- "sync_wrapper",
- "tokio",
+ "tower",
  "tower-layer",
  "tower-service",
 ]
@@ -6584,9 +6392,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
 
 [[package]]
 name = "tracing"
-version = "0.1.41"
+version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
 dependencies = [
  "pin-project-lite",
  "tracing-attributes",
@@ -6595,20 +6403,20 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.28"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.33"
+version = "0.1.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
 dependencies = [
  "once_cell",
  "valuable",
@@ -6627,9 +6435,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.19"
+version = "0.3.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
 dependencies = [
  "nu-ansi-term",
  "sharded-slab",
@@ -6662,19 +6470,9 @@ checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a"
 
 [[package]]
 name = "twox-hash"
-version = "1.6.3"
+version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
-dependencies = [
- "cfg-if",
- "static_assertions",
-]
-
-[[package]]
-name = "twox-hash"
-version = "2.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
+checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c"
 
 [[package]]
 name = "typed-arena"
@@ -6683,36 +6481,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
 
 [[package]]
-name = "typed-builder"
-version = "0.19.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600"
-dependencies = [
- "typed-builder-macro",
-]
-
-[[package]]
-name = "typed-builder-macro"
-version = "0.19.1"
+name = "typenum"
+version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.101",
-]
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
 
 [[package]]
-name = "typenum"
-version = "1.18.0"
+name = "typewit"
+version = "1.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
+checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71"
 
 [[package]]
 name = "typify"
-version = "0.4.1"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcc5bec3cdff70fd542e579aa2e52967833e543a25fae0d14579043d2e868a50"
+checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629"
 dependencies = [
  "typify-impl",
  "typify-macro",
@@ -6720,38 +6504,38 @@ dependencies = [
 
 [[package]]
 name = "typify-impl"
-version = "0.4.1"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b52a67305054e1da6f3d99ad94875dcd0c7c49adbd17b4b64f0eefb7ae5bf8ab"
+checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "log",
  "proc-macro2",
  "quote",
  "regress",
- "schemars",
+ "schemars 0.8.22",
  "semver",
  "serde",
  "serde_json",
- "syn 2.0.101",
- "thiserror 2.0.12",
+ "syn 2.0.117",
+ "thiserror",
  "unicode-ident",
 ]
 
 [[package]]
 name = "typify-macro"
-version = "0.4.1"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ff5799be156e4f635c348c6051d165e1c59997827155133351a8c4d333d9841"
+checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a"
 dependencies = [
  "proc-macro2",
  "quote",
- "schemars",
+ "schemars 0.8.22",
  "semver",
  "serde",
  "serde_json",
  "serde_tokenstream",
- "syn 2.0.101",
+ "syn 2.0.117",
  "typify-impl",
 ]
 
@@ -6763,24 +6547,24 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.18"
+version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.24"
+version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
+checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
 dependencies = [
  "tinyvec",
 ]
 
 [[package]]
 name = "unicode-properties"
-version = "0.1.3"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
+checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
 
 [[package]]
 name = "unicode-segmentation"
@@ -6796,15 +6580,21 @@ checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
 
 [[package]]
 name = "unicode-width"
-version = "0.2.0"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
 
 [[package]]
-name = "unindent"
-version = "0.2.4"
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "unit-prefix"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
+checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
 
 [[package]]
 name = "unsafe-libyaml"
@@ -6818,16 +6608,44 @@ version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
 
+[[package]]
+name = "ureq"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc"
+dependencies = [
+ "base64 0.22.1",
+ "log",
+ "percent-encoding",
+ "rustls",
+ "rustls-pki-types",
+ "ureq-proto",
+ "utf-8",
+]
+
+[[package]]
+name = "ureq-proto"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
+dependencies = [
+ "base64 0.22.1",
+ "http 1.4.0",
+ "httparse",
+ "log",
+]
+
 [[package]]
 name = "url"
-version = "2.5.4"
+version = "2.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
 dependencies = [
  "form_urlencoded",
  "idna",
  "percent-encoding",
  "serde",
+ "serde_derive",
 ]
 
 [[package]]
@@ -6836,6 +6654,12 @@ version = "2.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
 
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+
 [[package]]
 name = "utf8_iter"
 version = "1.0.4"
@@ -6850,13 +6674,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
 [[package]]
 name = "uuid"
-version = "1.17.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d"
+checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.4.2",
  "js-sys",
- "serde",
+ "serde_core",
  "wasm-bindgen",
 ]
 
@@ -6878,15 +6702,6 @@ version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
 
-[[package]]
-name = "wait-timeout"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "walkdir"
 version = "2.5.0"
@@ -6908,58 +6723,67 @@ dependencies = [
 
 [[package]]
 name = "wasi"
-version = "0.11.0+wasi-snapshot-preview1"
+version = "0.11.1+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
 
 [[package]]
 name = "wasi"
-version = "0.14.2+wasi-0.2.4"
+version = "0.14.7+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
+dependencies = [
+ "wasip2",
+]
+
+[[package]]
+name = "wasip2"
+version = "1.0.2+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
 dependencies = [
- "wit-bindgen-rt",
+ "wit-bindgen",
 ]
 
 [[package]]
 name = "wasite"
-version = "0.1.0"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
+checksum = "66fe902b4a6b8028a753d5424909b764ccf79b7a209eac9bf97e59cda9f71a42"
+dependencies = [
+ "wasi 0.14.7+wasi-0.2.4",
+]
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.100"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
  "cfg-if",
  "once_cell",
  "rustversion",
  "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.100"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
-dependencies = [
- "bumpalo",
- "log",
- "proc-macro2",
- "quote",
- "syn 2.0.101",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.50"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
  "cfg-if",
+ "futures-util",
  "js-sys",
  "once_cell",
  "wasm-bindgen",
@@ -6968,9 +6792,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.100"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -6978,48 +6802,85 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.100"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
+ "bumpalo",
  "proc-macro2",
  "quote",
- "syn 2.0.101",
- "wasm-bindgen-backend",
+ "syn 2.0.117",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.100"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "wasm-bindgen-test"
-version = "0.3.50"
+version = "0.3.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "66c8d5e33ca3b6d9fa3b4676d774c5778031d27a578c2b007f905acf816152c3"
+checksum = "6311c867385cc7d5602463b31825d454d0837a3aba7cdb5e56d5201792a3f7fe"
 dependencies = [
+ "async-trait",
+ "cast",
  "js-sys",
+ "libm",
  "minicov",
+ "nu-ansi-term",
+ "num-traits",
+ "oorandom",
+ "serde",
+ "serde_json",
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "wasm-bindgen-test-macro",
+ "wasm-bindgen-test-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-test-macro"
-version = "0.3.50"
+version = "0.3.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17d5042cc5fa009658f9a7333ef24291b1291a25b6382dd68862a7f3b969f69b"
+checksum = "67008cdde4769831958536b0f11b3bdd0380bde882be17fff9c2f34bb4549abd"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "wasm-bindgen-test-shared"
+version = "0.2.114"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfe29135b180b72b04c74aa97b2b4a2ef275161eff9a6c7955ea9eaedc7e1d4e"
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.13.0",
+ "wasm-encoder",
+ "wasmparser",
 ]
 
 [[package]]
@@ -7035,11 +6896,23 @@ dependencies = [
  "web-sys",
 ]
 
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap 2.13.0",
+ "semver",
+]
+
 [[package]]
 name = "web-sys"
-version = "0.3.77"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7055,25 +6928,15 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "which"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
-dependencies = [
- "either",
- "home",
- "once_cell",
- "rustix 0.38.44",
-]
-
 [[package]]
 name = "whoami"
-version = "1.6.0"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6994d13118ab492c3c80c1f81928718159254c53c472bf9ce36f8dae4add02a7"
+checksum = "d6a5b12f9df4f978d2cfdb1bd3bac52433f44393342d7ee9c25f5a1c14c0f45d"
 dependencies = [
- "redox_syscall 0.5.12",
+ "libc",
+ "libredox",
+ "objc2-system-configuration",
  "wasite",
  "web-sys",
 ]
@@ -7096,11 +6959,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
 
 [[package]]
 name = "winapi-util"
-version = "0.1.9"
+version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7111,44 +6974,43 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
 [[package]]
 name = "windows"
-version = "0.61.1"
+version = "0.62.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5ee8f3d025738cb02bad7868bbb5f8a6327501e870bf51f1b455b0a2454a419"
+checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580"
 dependencies = [
  "windows-collections",
  "windows-core",
  "windows-future",
- "windows-link",
  "windows-numerics",
 ]
 
 [[package]]
 name = "windows-collections"
-version = "0.2.0"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8"
+checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610"
 dependencies = [
  "windows-core",
 ]
 
 [[package]]
 name = "windows-core"
-version = "0.61.2"
+version = "0.62.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
+checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
 dependencies = [
  "windows-implement",
  "windows-interface",
  "windows-link",
  "windows-result",
- "windows-strings 0.4.2",
+ "windows-strings",
 ]
 
 [[package]]
 name = "windows-future"
-version = "0.2.1"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e"
+checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb"
 dependencies = [
  "windows-core",
  "windows-link",
@@ -7157,96 +7019,94 @@ dependencies = [
 
 [[package]]
 name = "windows-implement"
-version = "0.60.0"
+version = "0.60.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "windows-interface"
-version = "0.59.1"
+version = "0.59.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
+checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "windows-link"
-version = "0.1.1"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
 
 [[package]]
 name = "windows-numerics"
-version = "0.2.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
+checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26"
 dependencies = [
  "windows-core",
  "windows-link",
 ]
 
 [[package]]
-name = "windows-registry"
-version = "0.4.0"
+name = "windows-result"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
 dependencies = [
- "windows-result",
- "windows-strings 0.3.1",
- "windows-targets 0.53.0",
+ "windows-link",
 ]
 
 [[package]]
-name = "windows-result"
-version = "0.3.4"
+name = "windows-strings"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
 dependencies = [
  "windows-link",
 ]
 
 [[package]]
-name = "windows-strings"
-version = "0.3.1"
+name = "windows-sys"
+version = "0.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
 dependencies = [
- "windows-link",
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
-name = "windows-strings"
-version = "0.4.2"
+name = "windows-sys"
+version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
- "windows-link",
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
 name = "windows-sys"
-version = "0.52.0"
+version = "0.60.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
 dependencies = [
- "windows-targets 0.52.6",
+ "windows-targets 0.53.5",
 ]
 
 [[package]]
 name = "windows-sys"
-version = "0.59.0"
+version = "0.61.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
 dependencies = [
- "windows-targets 0.52.6",
+ "windows-link",
 ]
 
 [[package]]
@@ -7267,25 +7127,26 @@ dependencies = [
 
 [[package]]
 name = "windows-targets"
-version = "0.53.0"
+version = "0.53.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b"
+checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
 dependencies = [
- "windows_aarch64_gnullvm 0.53.0",
- "windows_aarch64_msvc 0.53.0",
- "windows_i686_gnu 0.53.0",
- "windows_i686_gnullvm 0.53.0",
- "windows_i686_msvc 0.53.0",
- "windows_x86_64_gnu 0.53.0",
- "windows_x86_64_gnullvm 0.53.0",
- "windows_x86_64_msvc 0.53.0",
+ "windows-link",
+ "windows_aarch64_gnullvm 0.53.1",
+ "windows_aarch64_msvc 0.53.1",
+ "windows_i686_gnu 0.53.1",
+ "windows_i686_gnullvm 0.53.1",
+ "windows_i686_msvc 0.53.1",
+ "windows_x86_64_gnu 0.53.1",
+ "windows_x86_64_gnullvm 0.53.1",
+ "windows_x86_64_msvc 0.53.1",
 ]
 
 [[package]]
 name = "windows-threading"
-version = "0.1.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6"
+checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37"
 dependencies = [
  "windows-link",
 ]
@@ -7298,9 +7159,9 @@ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
+checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
 
 [[package]]
 name = "windows_aarch64_msvc"
@@ -7310,9 +7171,9 @@ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
+checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
 
 [[package]]
 name = "windows_i686_gnu"
@@ -7322,9 +7183,9 @@ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
+checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
 
 [[package]]
 name = "windows_i686_gnullvm"
@@ -7334,9 +7195,9 @@ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
 [[package]]
 name = "windows_i686_gnullvm"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
+checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
 
 [[package]]
 name = "windows_i686_msvc"
@@ -7346,9 +7207,9 @@ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
+checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
 
 [[package]]
 name = "windows_x86_64_gnu"
@@ -7358,9 +7219,9 @@ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
+checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
@@ -7370,9 +7231,9 @@ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
+checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
 
 [[package]]
 name = "windows_x86_64_msvc"
@@ -7382,51 +7243,121 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.53.0"
+version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
 
 [[package]]
 name = "winnow"
-version = "0.7.10"
+version = "0.7.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06928c8748d81b05c9be96aad92e1b6ff01833332f281e8cfca3be4b35fc9ec"
+checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
-name = "wit-bindgen-rt"
-version = "0.39.0"
+name = "wit-bindgen"
+version = "0.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
 dependencies = [
- "bitflags 2.9.1",
+ "wit-bindgen-rust-macro",
 ]
 
 [[package]]
-name = "writeable"
-version = "0.6.1"
+name = "wit-bindgen-core"
+version = "0.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
 
 [[package]]
-name = "wyz"
-version = "0.5.1"
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap 2.13.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap 2.13.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
 dependencies = [
- "tap",
+ "anyhow",
+ "id-arena",
+ "indexmap 2.13.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
 ]
 
+[[package]]
+name = "writeable"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
+
 [[package]]
 name = "xattr"
-version = "1.5.0"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d65cbf2f12c15564212d48f4e3dfb87923d25d611f2aed18f4cb23f0413d89e"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
 dependencies = [
  "libc",
- "rustix 1.0.7",
+ "rustix",
 ]
 
 [[package]]
@@ -7436,21 +7367,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4"
 
 [[package]]
-name = "xz2"
-version = "0.1.7"
+name = "yansi"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
-dependencies = [
- "lzma-sys",
-]
+checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
 
 [[package]]
 name = "yoke"
-version = "0.8.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc"
+checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
 dependencies = [
- "serde",
  "stable_deref_trait",
  "yoke-derive",
  "zerofrom",
@@ -7458,34 +7385,34 @@ dependencies = [
 
 [[package]]
 name = "yoke-derive"
-version = "0.8.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6"
+checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "synstructure",
 ]
 
 [[package]]
 name = "zerocopy"
-version = "0.8.25"
+version = "0.8.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
+checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.25"
+version = "0.8.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
+checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -7505,21 +7432,21 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
  "synstructure",
 ]
 
 [[package]]
 name = "zeroize"
-version = "1.8.1"
+version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
 
 [[package]]
 name = "zerotrie"
-version = "0.2.2"
+version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595"
+checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
 dependencies = [
  "displaydoc",
  "yoke",
@@ -7528,9 +7455,9 @@ dependencies = [
 
 [[package]]
 name = "zerovec"
-version = "0.11.2"
+version = "0.11.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428"
+checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
 dependencies = [
  "yoke",
  "zerofrom",
@@ -7539,20 +7466,26 @@ dependencies = [
 
 [[package]]
 name = "zerovec-derive"
-version = "0.11.1"
+version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f"
+checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.101",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "zlib-rs"
-version = "0.5.0"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
 
 [[package]]
 name = "zstd"
@@ -7574,9 +7507,9 @@ dependencies = [
 
 [[package]]
 name = "zstd-sys"
-version = "2.0.15+zstd.1.5.7"
+version = "2.0.16+zstd.1.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
 dependencies = [
  "cc",
  "pkg-config",
diff --git a/Cargo.toml b/Cargo.toml
index 79bb2f3cc602d..08d585d3ef906 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ members = [
     "datafusion/catalog",
     "datafusion/catalog-listing",
     "datafusion/datasource",
+    "datafusion/datasource-arrow",
     "datafusion/datasource-avro",
     "datafusion/datasource-csv",
     "datafusion/datasource-json",
@@ -40,8 +41,10 @@ members = [
     "datafusion/functions-window-common",
     "datafusion/optimizer",
     "datafusion/physical-expr",
+    "datafusion/physical-expr-adapter",
     "datafusion/physical-expr-common",
     "datafusion/physical-optimizer",
+    "datafusion/pruning",
     "datafusion/physical-plan",
     "datafusion/proto",
     "datafusion/proto/gen",
@@ -68,15 +71,15 @@ resolver = "2"
 
 [workspace.package]
 authors = ["Apache DataFusion <dev@datafusion.apache.org>"]
-edition = "2021"
+edition = "2024"
 homepage = "https://datafusion.apache.org"
 license = "Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/apache/datafusion"
 # Define Minimum Supported Rust Version (MSRV)
-rust-version = "1.82.0"
+rust-version = "1.88.0"
 # Define DataFusion version
-version = "47.0.0"
+version = "52.3.0"
 
 [workspace.dependencies]
 # We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -84,100 +87,166 @@ version = "47.0.0"
 # for the inherited dependency but cannot do the reverse (override from true to false).
 #
 # See for more details: https://github.com/rust-lang/cargo/issues/11329
-ahash = { version = "0.8", default-features = false, features = [
-    "runtime-rng",
-] }
-apache-avro = { version = "0.17", default-features = false }
-arrow = { version = "55.1.0", features = [
+apache-avro = { version = "0.21", default-features = false }
+arrow = { version = "58.0.0", features = [
     "prettyprint",
     "chrono-tz",
 ] }
-arrow-buffer = { version = "55.0.0", default-features = false }
-arrow-flight = { version = "55.1.0", features = [
+arrow-buffer = { version = "58.0.0", default-features = false }
+arrow-flight = { version = "58.0.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "55.0.0", default-features = false, features = [
+arrow-ipc = { version = "58.0.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "55.0.0", default-features = false }
-arrow-schema = { version = "55.0.0", default-features = false }
-async-trait = "0.1.88"
+arrow-ord = { version = "58.0.0", default-features = false }
+arrow-schema = { version = "58.0.0", default-features = false }
+async-trait = "0.1.89"
 bigdecimal = "0.4.8"
-bytes = "1.10"
-chrono = { version = "0.4.41", default-features = false }
-criterion = "0.5.1"
-ctor = "0.4.0"
+bytes = "1.11"
+bzip2 = "0.6.1"
+chrono = { version = "0.4.44", default-features = false }
+criterion = "0.8"
+ctor = "0.6.3"
 dashmap = "6.0.1"
-datafusion = { path = "datafusion/core", version = "47.0.0", default-features = false }
-datafusion-catalog = { path = "datafusion/catalog", version = "47.0.0" }
-datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "47.0.0" }
-datafusion-common = { path = "datafusion/common", version = "47.0.0", default-features = false }
-datafusion-common-runtime = { path = "datafusion/common-runtime", version = "47.0.0" }
-datafusion-datasource = { path = "datafusion/datasource", version = "47.0.0", default-features = false }
-datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "47.0.0", default-features = false }
-datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "47.0.0", default-features = false }
-datafusion-datasource-json = { path = "datafusion/datasource-json", version = "47.0.0", default-features = false }
-datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "47.0.0", default-features = false }
-datafusion-doc = { path = "datafusion/doc", version = "47.0.0" }
-datafusion-execution = { path = "datafusion/execution", version = "47.0.0" }
-datafusion-expr = { path = "datafusion/expr", version = "47.0.0" }
-datafusion-expr-common = { path = "datafusion/expr-common", version = "47.0.0" }
-datafusion-ffi = { path = "datafusion/ffi", version = "47.0.0" }
-datafusion-functions = { path = "datafusion/functions", version = "47.0.0" }
-datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "47.0.0" }
-datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "47.0.0" }
-datafusion-functions-nested = { path = "datafusion/functions-nested", version = "47.0.0" }
-datafusion-functions-table = { path = "datafusion/functions-table", version = "47.0.0" }
-datafusion-functions-window = { path = "datafusion/functions-window", version = "47.0.0" }
-datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "47.0.0" }
-datafusion-macros = { path = "datafusion/macros", version = "47.0.0" }
-datafusion-optimizer = { path = "datafusion/optimizer", version = "47.0.0", default-features = false }
-datafusion-physical-expr = { path = "datafusion/physical-expr", version = "47.0.0", default-features = false }
-datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "47.0.0", default-features = false }
-datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "47.0.0" }
-datafusion-physical-plan = { path = "datafusion/physical-plan", version = "47.0.0" }
-datafusion-proto = { path = "datafusion/proto", version = "47.0.0" }
-datafusion-proto-common = { path = "datafusion/proto-common", version = "47.0.0" }
-datafusion-session = { path = "datafusion/session", version = "47.0.0" }
-datafusion-spark = { path = "datafusion/spark", version = "47.0.0" }
-datafusion-sql = { path = "datafusion/sql", version = "47.0.0" }
+datafusion = { path = "datafusion/core", version = "52.3.0", default-features = false }
+datafusion-catalog = { path = "datafusion/catalog", version = "52.3.0" }
+datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "52.3.0" }
+datafusion-common = { path = "datafusion/common", version = "52.3.0", default-features = false }
+datafusion-common-runtime = { path = "datafusion/common-runtime", version = "52.3.0" }
+datafusion-datasource = { path = "datafusion/datasource", version = "52.3.0", default-features = false }
+datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "52.3.0", default-features = false }
+datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "52.3.0", default-features = false }
+datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "52.3.0", default-features = false }
+datafusion-datasource-json = { path = "datafusion/datasource-json", version = "52.3.0", default-features = false }
+datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "52.3.0", default-features = false }
+datafusion-doc = { path = "datafusion/doc", version = "52.3.0" }
+datafusion-execution = { path = "datafusion/execution", version = "52.3.0", default-features = false }
+datafusion-expr = { path = "datafusion/expr", version = "52.3.0", default-features = false }
+datafusion-expr-common = { path = "datafusion/expr-common", version = "52.3.0" }
+datafusion-ffi = { path = "datafusion/ffi", version = "52.3.0" }
+datafusion-functions = { path = "datafusion/functions", version = "52.3.0" }
+datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "52.3.0" }
+datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "52.3.0" }
+datafusion-functions-nested = { path = "datafusion/functions-nested", version = "52.3.0", default-features = false }
+datafusion-functions-table = { path = "datafusion/functions-table", version = "52.3.0" }
+datafusion-functions-window = { path = "datafusion/functions-window", version = "52.3.0" }
+datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "52.3.0" }
+datafusion-macros = { path = "datafusion/macros", version = "52.3.0" }
+datafusion-optimizer = { path = "datafusion/optimizer", version = "52.3.0", default-features = false }
+datafusion-physical-expr = { path = "datafusion/physical-expr", version = "52.3.0", default-features = false }
+datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "52.3.0", default-features = false }
+datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "52.3.0", default-features = false }
+datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "52.3.0" }
+datafusion-physical-plan = { path = "datafusion/physical-plan", version = "52.3.0" }
+datafusion-proto = { path = "datafusion/proto", version = "52.3.0" }
+datafusion-proto-common = { path = "datafusion/proto-common", version = "52.3.0" }
+datafusion-pruning = { path = "datafusion/pruning", version = "52.3.0" }
+datafusion-session = { path = "datafusion/session", version = "52.3.0" }
+datafusion-spark = { path = "datafusion/spark", version = "52.3.0" }
+datafusion-sql = { path = "datafusion/sql", version = "52.3.0" }
+datafusion-substrait = { path = "datafusion/substrait", version = "52.3.0" }
+
 doc-comment = "0.3"
 env_logger = "0.11"
+flate2 = "1.1.9"
 futures = "0.3"
-half = { version = "2.6.0", default-features = false }
-hashbrown = { version = "0.14.5", features = ["raw"] }
-indexmap = "2.9.0"
+glob = "0.3.0"
+half = { version = "2.7.0", default-features = false }
+hashbrown = { version = "0.16.1" }
+hex = { version = "0.4.3" }
+indexmap = "2.13.0"
+insta = { version = "1.46.3", features = ["glob", "filters"] }
 itertools = "0.14"
+itoa = "1.0"
+liblzma = { version = "0.4.6", features = ["static"] }
 log = "^0.4"
-object_store = { version = "0.12.0", default-features = false }
+memchr = "2.8.0"
+num-traits = { version = "0.2" }
+object_store = { version = "0.13.1", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "55.1.0", default-features = false, features = [
+parquet = { version = "58.0.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",
 ] }
-pbjson = { version = "0.7.0" }
-pbjson-types = "0.7"
+pbjson = { version = "0.9.0" }
+pbjson-types = "0.9"
 # Should match arrow-flight's version of prost.
-insta = { version = "1.43.1", features = ["glob", "filters"] }
-prost = "0.13.1"
+prost = "0.14.1"
 rand = "0.9"
 recursive = "0.1.1"
-regex = "1.8"
-rstest = "0.25.0"
+regex = "1.12"
+rstest = "0.26.1"
 serde_json = "1"
-sqlparser = { version = "0.55.0", features = ["visitor"] }
+sha2 = "^0.10.9"
+sqlparser = { version = "0.61.0", default-features = false, features = ["std", "visitor"] }
+strum = "0.28.0"
+strum_macros = "0.28.0"
 tempfile = "3"
-tokio = { version = "1.45", features = ["macros", "rt", "sync"] }
-url = "2.5.4"
+testcontainers-modules = { version = "0.15" }
+tokio = { version = "1.48", features = ["macros", "rt", "sync"] }
+tokio-stream = "0.1"
+tokio-util = "0.7"
+url = "2.5.7"
+uuid = "1.21"
+zstd = { version = "0.13", default-features = false }
+
+[workspace.lints.clippy]
+# Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml)
+large_futures = "warn"
+used_underscore_binding = "warn"
+or_fun_call = "warn"
+unnecessary_lazy_evaluations = "warn"
+uninlined_format_args = "warn"
+inefficient_to_string = "warn"
+# https://github.com/apache/datafusion/issues/18503
+needless_pass_by_value = "warn"
+# https://github.com/apache/datafusion/issues/18881
+allow_attributes = "warn"
+assigning_clones = "warn"
+
+[workspace.lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = [
+    'cfg(datafusion_coop, values("tokio", "tokio_fallback", "per_stream"))',
+    "cfg(tarpaulin)",
+    "cfg(tarpaulin_include)",
+] }
+unused_qualifications = "deny"
 
+# --------------------
+# Compilation Profiles
+# --------------------
+#  A Cargo profile is a preset for the compiler/linker knobs that trade off:
+# - Build time: how quickly code compiles and links
+# - Runtime performance: how fast the resulting binaries execute
+# - Binary size: how large the executables end up
+# - Debuggability: how much debug information is preserved for debugging and profiling
+#
+# Profiles available:
+# - dev: default debug build; fastest to compile, slowest to run, full debug info
+#     for everyday development.
+#     Run: cargo run
+# - release: optimized build; slowest to compile, fastest to run, smallest
+#     binaries for public releases.
+#     Run: cargo run --release
+# - release-nonlto: skips LTO, so it builds quicker while staying close to
+#     release performance. It is useful when developing performance optimizations.
+#     Run: cargo run --profile release-nonlto
+# - profiling: inherits release optimizations but retains debug info to support
+#     profiling tools and flamegraphs.
+#     Run: cargo run --profile profiling
+# - ci: derived from `dev` but disables incremental builds and strips dependency
+#     symbols to keep CI artifacts small and reproducible.
+#     Run: cargo run --profile ci
+#
+# If you want to optimize compilation, the `compile_profile` benchmark can be useful.
+# See `benchmarks/README.md` for more details.
 [profile.release]
 codegen-units = 1
 lto = true
 strip = true      # Eliminate debug information to minimize binary size
 
-# the release profile takes a long time to build so we can use this profile during development to save time
-# cargo build --profile release-nonlto
 [profile.release-nonlto]
 codegen-units = 16
 debug-assertions = false
@@ -189,32 +258,27 @@ overflow-checks = false
 rpath = false
 strip = false            # Retain debug info for flamegraphs
 
+[profile.ci-optimized]
+inherits = "release"
+codegen-units = 16
+lto = "thin"
+strip = true
+
 [profile.ci]
+debug = false
 inherits = "dev"
 incremental = false
 
-# ci turns off debug info, etc. for dependencies to allow for smaller binaries making caching more effective
+# This rule applies to every package except workspace members (dependencies
+# such as `arrow` and `tokio`). It disables debug info and related features on
+# dependencies so their binaries stay smaller, improving cache reuse.
 [profile.ci.package."*"]
 debug = false
 debug-assertions = false
 strip = "debuginfo"
 incremental = false
 
-# release inherited profile keeping debug information and symbols
-# for mem/cpu profiling
 [profile.profiling]
 inherits = "release"
 debug = true
 strip = false
-
-[workspace.lints.clippy]
-# Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml)
-large_futures = "warn"
-used_underscore_binding = "warn"
-or_fun_call = "warn"
-unnecessary_lazy_evaluations = "warn"
-uninlined_format_args = "warn"
-
-[workspace.lints.rust]
-unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
-unused_qualifications = "deny"
diff --git a/NOTICE.txt b/NOTICE.txt
index 7f3c80d606c07..0bd2d52368fea 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,5 +1,5 @@
 Apache DataFusion
-Copyright 2019-2025 The Apache Software Foundation
+Copyright 2019-2026 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
diff --git a/README.md b/README.md
index c142d8f366b2e..630d4295bd427 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@
 [![Build Status][actions-badge]][actions-url]
 ![Commit Activity][commit-activity-badge]
 [![Open Issues][open-issues-badge]][open-issues-url]
+[![Pending PRs][pending-pr-badge]][pending-pr-url]
 [![Discord chat][discord-badge]][discord-url]
 [![Linkedin][linkedin-badge]][linkedin-url]
 ![Crates.io MSRV][msrv-badge]
@@ -39,6 +40,8 @@
 [commit-activity-badge]: https://img.shields.io/github/commit-activity/m/apache/datafusion
 [open-issues-badge]: https://img.shields.io/github/issues-raw/apache/datafusion
 [open-issues-url]: https://github.com/apache/datafusion/issues
+[pending-pr-badge]: https://img.shields.io/github/issues-search/apache/datafusion?query=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess&label=Pending%20PRs&logo=github
+[pending-pr-url]: https://github.com/apache/datafusion/pulls?q=is%3Apr+is%3Aopen+draft%3Afalse+review%3Arequired+status%3Asuccess+sort%3Aupdated-desc
 [linkedin-badge]: https://img.shields.io/badge/Follow-Linkedin-blue
 [linkedin-url]: https://www.linkedin.com/company/apache-datafusion/
 [msrv-badge]: https://img.shields.io/crates/msrv/datafusion?label=Min%20Rust%20Version
@@ -55,18 +58,16 @@ DataFusion is an extensible query engine written in [Rust] that
 uses [Apache Arrow] as its in-memory format.
 
 This crate provides libraries and binaries for developers building fast and
-feature rich database and analytic systems, customized to particular workloads.
+feature-rich database and analytic systems, customized for particular workloads.
 See [use cases] for examples. The following related subprojects target end users:
 
 - [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame
   queries.
-- [DataFusion Ray](https://github.com/apache/datafusion-ray/) provides a distributed version of DataFusion that scales
-  out on Ray clusters.
 - [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on
   DataFusion.
 
 "Out of the box,"
-DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance],
+DataFusion offers [SQL](https://datafusion.apache.org/user-guide/sql/index.html) and [DataFrame](https://datafusion.apache.org/user-guide/dataframe.html) APIs, excellent [performance],
 built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and
 a great community.
 
@@ -83,7 +84,7 @@ See the [Architecture] section for more details.
 [performance]: https://benchmark.clickhouse.com/
 [architecture]: https://datafusion.apache.org/contributor-guide/architecture.html
 
-Here are links to some important information
+Here are links to important resources:
 
 - [Project Site](https://datafusion.apache.org/)
 - [Installation](https://datafusion.apache.org/user-guide/cli/installation.html)
@@ -96,8 +97,8 @@ Here are links to some important information
 
 ## What can you do with this crate?
 
-DataFusion is great for building projects such as domain specific query engines, new database platforms and data pipelines, query languages and more.
-It lets you start quickly from a fully working engine, and then customize those features specific to your use. [Click Here](https://datafusion.apache.org/user-guide/introduction.html#known-users) to see a list known users.
+DataFusion is great for building projects such as domain-specific query engines, new database platforms and data pipelines, query languages and more.
+It lets you start quickly from a fully working engine, and then customize those features specific to your needs. See the [list of known users](https://datafusion.apache.org/user-guide/introduction.html#known-users).
 
 ## Contributing to DataFusion
 
@@ -114,14 +115,15 @@ This crate has several [features] which can be specified in your `Cargo.toml`.
 
 Default features:
 
-- `nested_expressions`: functions for working with nested type function such as `array_to_string`
+- `nested_expressions`: functions for working with nested types such as `array_to_string`
 - `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd`
 - `crypto_expressions`: cryptographic functions such as `md5` and `sha256`
 - `datetime_expressions`: date and time functions such as `to_timestamp`
 - `encoding_expressions`: `encode` and `decode` functions
 - `parquet`: support for reading the [Apache Parquet] format
+- `sql`: support for SQL parsing and planning
 - `regex_expressions`: regular expression functions, such as `regexp_match`
-- `unicode_expressions`: Include unicode aware functions such as `character_length`
+- `unicode_expressions`: include Unicode-aware functions such as `character_length`
 - `unparser`: enables support to reverse LogicalPlans back into SQL
 - `recursive_protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection.
 
@@ -129,11 +131,12 @@ Optional features:
 
 - `avro`: support for reading the [Apache Avro] format
 - `backtrace`: include backtrace information in error messages
-- `pyarrow`: conversions between PyArrow and DataFusion types
+- `parquet_encryption`: support for using [Parquet Modular Encryption]
 - `serde`: enable arrow-schema's `serde` feature
 
 [apache avro]: https://avro.apache.org/
 [apache parquet]: https://parquet.apache.org/
+[parquet modular encryption]: https://parquet.apache.org/docs/file-format/data-pages/encryption/
 
 ## DataFusion API Evolution and Deprecation Guidelines
 
@@ -141,7 +144,7 @@ Public methods in Apache DataFusion evolve over time: while we try to maintain a
 stable API, we also improve the API over time. As a result, we typically
 deprecate methods before removing them, according to the [deprecation guidelines].
 
-[deprecation guidelines]: https://datafusion.apache.org/library-user-guide/api-health.html
+[deprecation guidelines]: https://datafusion.apache.org/contributor-guide/api-health.html
 
 ## Dependencies and `Cargo.lock`
 
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index f9c198597b74c..56f7704309780 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -26,6 +26,9 @@ repository = { workspace = true }
 license = { workspace = true }
 rust-version = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -33,25 +36,29 @@ workspace = true
 ci = []
 default = ["mimalloc"]
 snmalloc = ["snmalloc-rs"]
+mimalloc_extended = ["libmimalloc-sys/extended"]
 
 [dependencies]
 arrow = { workspace = true }
+async-trait = "0.1"
+bytes = { workspace = true }
+clap = { version = "4.5.60", features = ["derive"] }
 datafusion = { workspace = true, default-features = true }
 datafusion-common = { workspace = true, default-features = true }
 env_logger = { workspace = true }
 futures = { workspace = true }
+libmimalloc-sys = { version = "0.1", optional = true }
 log = { workspace = true }
 mimalloc = { version = "0.1", optional = true, default-features = false }
 object_store = { workspace = true }
 parquet = { workspace = true, default-features = true }
 rand = { workspace = true }
-serde = { version = "1.0.219", features = ["derive"] }
+regex.workspace = true
+serde = { version = "1.0.228", features = ["derive"] }
 serde_json = { workspace = true }
 snmalloc-rs = { version = "0.3", optional = true }
-structopt = { version = "0.3", default-features = false }
-test-utils = { path = "../test-utils/", version = "0.1.0" }
 tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
-tokio-util = { version = "0.7.15" }
+tokio-util = { version = "0.7.17" }
 
 [dev-dependencies]
 datafusion-proto = { workspace = true }
diff --git a/benchmarks/README.md b/benchmarks/README.md
index b19b3385afc83..3aa4f4bb8640c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -87,6 +87,38 @@ To run for specific query, for example Q21
 ./bench.sh run tpch10 21
 ```
 
+## Compile profile benchmark
+
+Generate the data required for the compile profile helper (TPC-H SF=1):
+
+```shell
+./bench.sh data compile_profile
+```
+
+Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `release-nonlto`):
+
+```shell
+./bench.sh run compile_profile
+```
+
+Limit the run to a single profile:
+
+```shell
+./bench.sh run compile_profile dev
+```
+
+Or specify a subset of profiles:
+
+```shell
+./bench.sh run compile_profile dev release
+```
+
+You can also invoke the helper directly if you need to customise arguments further:
+
+```shell
+./benchmarks/compile_profile.py --profiles dev release --data /path/to/tpch_sf1
+```
+
 ## Benchmark with modified configurations
 
 ### Select join algorithm
@@ -114,6 +146,19 @@ To verify that datafusion picked up your configuration, run the benchmarks with
 
 ## Comparing performance of main and a branch
 
+For TPC-H
+```shell
+./benchmarks/compare_tpch.sh main mybranch
+```
+
+For TPC-DS. 
+To get data in `DATA_DIR` for TPCDS, please follow instructions in `./benchmarks/bench.sh data tcpds` 
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/compare_tpcds.sh main mybranch
+```
+
+Alternatively, you can compare manually following the example below
+
 ```shell
 git checkout main
 
@@ -195,6 +240,23 @@ Benchmark tpch_mem.json
 └──────────────┴──────────────┴──────────────┴───────────────┘
 ```
 
+## Comparing performance of main and a PR
+
+### TPCDS
+
+Considering you already have TPCDS data locally
+
+```shell
+export DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/
+export PR_NUMBER=19464
+git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER
+git checkout main
+git pull
+./benchmarks/compare_tpcds.sh main pr-$PR_NUMBER
+```
+
+Note: if `gh` is installed, you can also run `gh pr checkout $PR_NUMBER` instead of `git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER`
+
 ### Running Benchmarks Manually
 
 Assuming data is in the `data` directory, the `tpch` benchmark can be run with a command like this:
@@ -210,28 +272,11 @@ See the help for more details.
 You can enable `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`. For example:
 
 ```shell
-cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
-```
-
-The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
-(generated by the `dbgen` utility) to CSV and Parquet.
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet
+cargo run --release --features "mimalloc" --bin dfbench tpch --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
 ```
 
 Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
 
-#### Sorted Conversion
-
-The TPCH tables generated by the dbgen utility are sorted by their first column (their primary key for most tables, the `l_orderkey` column for the `lineitem` table.)
-
-To preserve this sorted order information during conversion (useful for benchmarking execution on pre-sorted data) include the `--sort` flag:
-
-```bash
-cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-sorted-parquet --format parquet --sort
-```
-
 ### Comparing results between runs
 
 Any `dfbench` execution with `-o <dir>` argument will produce a
@@ -321,6 +366,72 @@ FLAGS:
 ...
 ```
 
+# Profiling Memory Stats for each benchmark query
+
+The `mem_profile` program wraps benchmark execution to measure memory usage statistics, such as peak RSS. It runs each benchmark query in a separate subprocess, capturing the child process’s stdout to print structured output.
+
+Subcommands supported by mem_profile are the subset of those in `dfbench`.
+Currently supported benchmarks include: Clickbench, H2o, Imdb, SortTpch, Tpch, TPCDS
+
+Before running benchmarks, `mem_profile` automatically compiles the benchmark binary (`dfbench`) using `cargo build`. Note that the build profile used for `dfbench` is not tied to the profile used for running `mem_profile` itself. We can explicitly specify the desired build profile using the `--bench-profile` option (e.g. release-nonlto). By prebuilding the binary and running each query in a separate process, we can ensure accurate memory statistics.
+
+Currently, `mem_profile` only supports `mimalloc` as the memory allocator, since it relies on `mimalloc`'s API to collect memory statistics.
+
+Because it runs the compiled binary directly from the target directory, make sure your working directory is the top-level datafusion/ directory, where the target/ is also located.
+
+The benchmark subcommand (e.g., `tpch`) and all following arguments are passed directly to `dfbench`. Be sure to specify `--bench-profile` before the benchmark subcommand.
+
+Example:
+
+```shell
+datafusion$ cargo run --profile release-nonlto --bin mem_profile -- --bench-profile release-nonlto tpch --path benchmarks/data/tpch_sf1 --partitions 4 --format parquet
+```
+
+Example Output:
+
+```
+Query     Time (ms)     Peak RSS  Peak Commit  Major Page Faults
+----------------------------------------------------------------
+1            503.42     283.4 MB       3.0 GB                  0
+2            431.09     240.7 MB       3.0 GB                  0
+3            594.28     350.1 MB       3.0 GB                  0
+4            468.90     462.4 MB       3.0 GB                  0
+5            653.58     385.4 MB       3.0 GB                  0
+6            296.79     247.3 MB       2.0 GB                  0
+7            662.32     652.4 MB       3.0 GB                  0
+8            702.48     396.0 MB       3.0 GB                  0
+9            774.21     611.5 MB       3.0 GB                  0
+10           733.62     397.9 MB       3.0 GB                  0
+11           271.71     209.6 MB       3.0 GB                  0
+12           512.60     212.5 MB       2.0 GB                  0
+13           507.83     381.5 MB       2.0 GB                  0
+14           420.89     313.5 MB       3.0 GB                  0
+15           539.97     288.0 MB       2.0 GB                  0
+16           370.91     229.8 MB       3.0 GB                  0
+17           758.33     467.0 MB       2.0 GB                  0
+18          1112.32     638.9 MB       3.0 GB                  0
+19           712.72     280.9 MB       2.0 GB                  0
+20           620.64     402.9 MB       2.9 GB                  0
+21           971.63     388.9 MB       2.9 GB                  0
+22           404.50     164.8 MB       2.0 GB                  0
+```
+
+## Reported Metrics
+
+When running benchmarks, `mem_profile` collects several memory-related statistics using the mimalloc API:
+
+- Peak RSS (Resident Set Size):
+  The maximum amount of physical memory used by the process.
+  This is a process-level metric collected via OS-specific mechanisms and is not mimalloc-specific.
+
+- Peak Commit:
+  The peak amount of memory committed by the allocator (i.e., total virtual memory reserved).
+  This is mimalloc-specific. It gives a more allocator-aware view of memory usage than RSS.
+
+- Major Page Faults:
+  The number of major page faults triggered during execution.
+  This metric is obtained from the operating system and is not mimalloc-specific.
+
 # Writing a new benchmark
 
 ## Creating or downloading data outside of the benchmark
@@ -379,37 +490,6 @@ Your benchmark should create and use an instance of `BenchmarkRun` defined in `b
 
 The output of `dfbench` help includes a description of each benchmark, which is reproduced here for convenience.
 
-## Cancellation
-
-Test performance of cancelling queries.
-
-Queries in DataFusion should stop executing "quickly" after they are
-cancelled (the output stream is dropped).
-
-The queries are executed on a synthetic dataset generated during
-the benchmark execution that is an anonymized version of a
-real-world data set.
-
-The query is an anonymized version of a real-world query, and the
-test starts the query then cancels it and reports how long it takes
-for the runtime to fully exit.
-
-Example output:
-
-```
-Using 7 files found on disk
-Starting to load data into in-memory object store
-Done loading data into in-memory object store
-in main, sleeping
-Starting spawned
-Creating logical plan...
-Creating physical plan...
-Executing physical plan...
-Getting results...
-cancelling thread
-done dropping runtime in 83.531417ms
-```
-
 ## ClickBench
 
 The ClickBench[1] benchmarks are widely cited in the industry and
@@ -510,6 +590,14 @@ See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.
 ./bench.sh run sort_tpch
 ```
 
+### TopK TPCH
+
+In addition, topk_tpch is available from the bench.sh script:
+
+```bash
+./bench.sh run topk_tpch
+```
+
 ## IMDB
 
 Run Join Order Benchmark (JOB) on IMDB dataset.
@@ -532,6 +620,34 @@ This benchmarks is derived from the [TPC-H][1] version
 [2]: https://github.com/databricks/tpch-dbgen.git,
 [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
 
+## TPCDS
+
+Run the tpcds benchmark.
+
+For data please clone `datafusion-benchmarks` repo which contains the predefined parquet data with SF1.
+
+```shell
+git clone https://github.com/apache/datafusion-benchmarks
+```
+
+Then run the benchmark with the following command:
+
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/bench.sh run tpcds
+```
+
+Alternatively benchmark the specific query
+
+```shell
+DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/bench.sh run tpcds 30
+```
+
+More help
+
+```shell
+cargo run --release --bin dfbench -- tpcds --help
+```
+
 ## External Aggregation
 
 Run the benchmark for aggregations with limited memory.
@@ -672,3 +788,115 @@ For example, to run query 1 with the small data generated above:
 ```bash
 cargo run --release --bin dfbench -- h2o --join-paths ./benchmarks/data/h2o/J1_1e7_NA_0.csv,./benchmarks/data/h2o/J1_1e7_1e1_0.csv,./benchmarks/data/h2o/J1_1e7_1e4_0.csv,./benchmarks/data/h2o/J1_1e7_1e7_NA.csv --queries-path ./benchmarks/queries/h2o/window.sql --query 1
 ```
+
+# Micro-Benchmarks
+
+## Nested Loop Join
+
+This benchmark focuses on the performance of queries with nested loop joins, minimizing other overheads such as scanning data sources or evaluating predicates.
+
+Different queries are included to test nested loop joins under various workloads.
+
+### Example Run
+
+```bash
+# No need to generate data: this benchmark uses table function `range()` as the data source
+
+./bench.sh run nlj
+```
+
+## Hash Join
+
+This benchmark focuses on the performance of queries with hash joins, minimizing other overheads such as scanning data sources or evaluating predicates.
+
+Several queries are included to test hash joins under various workloads.
+
+### Example Run
+
+```bash
+# No need to generate data: this benchmark uses table function `range()` as the data source
+
+./bench.sh run hj
+```
+
+## Sort Merge Join
+
+This benchmark focuses on the performance of queries with sort merge joins joins, minimizing other overheads such as scanning data sources or evaluating predicates.
+
+Several queries are included to test sort merge joins under various workloads.
+
+### Example Run
+
+```bash
+# No need to generate data: this benchmark uses table function `range()` as the data source
+
+./bench.sh run smj
+```
+## Cancellation
+
+Test performance of cancelling queries.
+
+Queries in DataFusion should stop executing "quickly" after they are
+cancelled (the output stream is dropped).
+
+The queries are executed on a synthetic dataset generated during
+the benchmark execution that is an anonymized version of a
+real-world data set.
+
+The query is an anonymized version of a real-world query, and the
+test starts the query then cancels it and reports how long it takes
+for the runtime to fully exit.
+
+Example output:
+
+```
+Using 7 files found on disk
+Starting to load data into in-memory object store
+Done loading data into in-memory object store
+in main, sleeping
+Starting spawned
+Creating logical plan...
+Creating physical plan...
+Executing physical plan...
+Getting results...
+cancelling thread
+done dropping runtime in 83.531417ms
+```
+
+## Sorted Data Benchmarks
+
+### Data Sorted ClickBench
+
+Benchmark for queries on pre-sorted data to test sort order optimization.
+This benchmark uses a subset of the ClickBench dataset (hits.parquet, ~14GB) that has been pre-sorted by the EventTime column. The queries are designed to test DataFusion's performance when the data is already sorted as is common in timeseries workloads.
+
+The benchmark includes queries that:
+- Scan pre-sorted data with ORDER BY clauses that match the sort order
+- Test reverse scans on sorted data
+- Verify the performance result
+
+#### Generating Sorted Data
+
+The sorted dataset is automatically generated from the ClickBench partitioned dataset. You can configure the memory used during the sorting process with the `DATAFUSION_MEMORY_GB` environment variable. The default memory limit is 12GB.
+```bash
+./bench.sh data clickbench_sorted
+```
+
+To create the sorted dataset, for example with 16GB of memory, run:
+
+```bash
+DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted
+```
+
+This command will:
+1. Download the ClickBench partitioned dataset if not present
+2. Sort hits.parquet by EventTime in ascending order
+3. Save the sorted file as hits_sorted.parquet
+
+#### Running the Benchmark
+
+```bash
+./bench.sh run clickbench_sorted
+```
+
+This runs queries against the pre-sorted dataset with the `--sorted-by EventTime` flag, which informs DataFusion that the data is pre-sorted, allowing it to optimize away redundant sort operations.
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 6f8cac2b6bfd5..0fc6ede3b3af4 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -28,6 +28,12 @@ set -e
 # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 
+# Execute command and also print it, for debugging purposes
+debug_run() {
+    set -x
+    "$@"
+    set +x
+}
 
 # Set Defaults
 COMMAND=
@@ -36,71 +42,113 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
 DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
 CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
 PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
-VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}
+SIMULATE_LATENCY=${SIMULATE_LATENCY:-false}
+
+# Build latency arg based on SIMULATE_LATENCY setting
+LATENCY_ARG=""
+if [ "$SIMULATE_LATENCY" = "true" ]; then
+    LATENCY_ARG="--simulate-latency"
+fi
 
 usage() {
     echo "
 Orchestrates running benchmarks against DataFusion checkouts
 
 Usage:
-$0 data [benchmark] [query]
-$0 run [benchmark]
+$0 data [benchmark]
+$0 run [benchmark] [query]
 $0 compare <branch1> <branch2>
-$0 venv
+$0 compare_detail <branch1> <branch2>
 
-**********
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 Examples:
-**********
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 # Create the datasets for all benchmarks in $DATA_DIR
 ./bench.sh data
 
 # Run the 'tpch' benchmark on the datafusion checkout in /source/datafusion
 DATAFUSION_DIR=/source/datafusion ./bench.sh run tpch
 
-**********
-* Commands
-**********
-data:         Generates or downloads data needed for benchmarking
-run:          Runs the named benchmark
-compare:      Compares results from benchmark runs
-venv:         Creates new venv (unless already exists) and installs compare's requirements into it
-
-**********
-* Benchmarks
-**********
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Commands
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+data:            Generates or downloads data needed for benchmarking
+run:             Runs the named benchmark
+compare:         Compares fastest results from benchmark runs
+compare_detail:  Compares minimum, average (±stddev), and maximum results from benchmark runs
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Benchmarks
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Run all of the following benchmarks
 all(default): Data/Run/Compare for all benchmarks
+
+# TPC-H Benchmarks
 tpch:                   TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
+tpch_csv:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single csv file per table, hash join
 tpch_mem:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
 tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
+tpch_csv10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join
 tpch_mem10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
-cancellation:           How long cancelling a query takes
-parquet:                Benchmark of parquet reader's filtering speed
-sort:                   Benchmark of sorting speed
-sort_tpch:              Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
+
+# TPC-DS Benchmarks
+tpcds:                  TPCDS inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
+
+# Extended TPC-H Benchmarks
+sort_tpch:              Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=1)
+sort_tpch10:            Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=10)
+topk_tpch:              Benchmark of top-k (sorting with limit) queries on TPC-H dataset (SF=1)
+external_aggr:          External aggregation benchmark on TPC-H dataset (SF=1)
+
+# ClickBench Benchmarks
 clickbench_1:           ClickBench queries against a single parquet file
-clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
+clickbench_partitioned: ClickBench queries against partitioned (100 files) parquet
+clickbench_pushdown:    ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
 clickbench_extended:    ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
-external_aggr:          External aggregation benchmark
-h2o_small:              h2oai benchmark with small dataset (1e7 rows) for groupby,  default file format is csv
-h2o_medium:             h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
-h2o_big:                h2oai benchmark with large dataset (1e9 rows) for groupby,  default file format is csv
-h2o_small_join:         h2oai benchmark with small dataset (1e7 rows) for join,  default file format is csv
-h2o_medium_join:        h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
-h2o_big_join:           h2oai benchmark with large dataset (1e9 rows) for join,  default file format is csv
-h2o_small_window:       Extended h2oai benchmark with small dataset (1e7 rows) for window,  default file format is csv
-h2o_medium_window:      Extended h2oai benchmark with medium dataset (1e8 rows) for window, default file format is csv
-h2o_big_window:         Extended h2oai benchmark with large dataset (1e9 rows) for window,  default file format is csv
+
+# Sorted Data Benchmarks (ORDER BY Optimization)
+clickbench_sorted:     ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
+
+# H2O.ai Benchmarks (Group By, Join, Window)
+h2o_small:                      h2oai benchmark with small dataset (1e7 rows) for groupby,  default file format is csv
+h2o_medium:                     h2oai benchmark with medium dataset (1e8 rows) for groupby, default file format is csv
+h2o_big:                        h2oai benchmark with large dataset (1e9 rows) for groupby,  default file format is csv
+h2o_small_join:                 h2oai benchmark with small dataset (1e7 rows) for join,  default file format is csv
+h2o_medium_join:                h2oai benchmark with medium dataset (1e8 rows) for join, default file format is csv
+h2o_big_join:                   h2oai benchmark with large dataset (1e9 rows) for join,  default file format is csv
+h2o_small_window:               Extended h2oai benchmark with small dataset (1e7 rows) for window,  default file format is csv
+h2o_medium_window:              Extended h2oai benchmark with medium dataset (1e8 rows) for window, default file format is csv
+h2o_big_window:                 Extended h2oai benchmark with large dataset (1e9 rows) for window,  default file format is csv
+h2o_small_parquet:              h2oai benchmark with small dataset (1e7 rows) for groupby,  file format is parquet
+h2o_medium_parquet:             h2oai benchmark with medium dataset (1e8 rows) for groupby, file format is parquet
+h2o_big_parquet:                h2oai benchmark with large dataset (1e9 rows) for groupby,  file format is parquet
+h2o_small_join_parquet:         h2oai benchmark with small dataset (1e7 rows) for join,  file format is parquet
+h2o_medium_join_parquet:        h2oai benchmark with medium dataset (1e8 rows) for join, file format is parquet
+h2o_big_join_parquet:           h2oai benchmark with large dataset (1e9 rows) for join,  file format is parquet
+h2o_small_window_parquet:       Extended h2oai benchmark with small dataset (1e7 rows) for window,  file format is parquet
+h2o_medium_window_parquet:      Extended h2oai benchmark with medium dataset (1e8 rows) for window, file format is parquet
+h2o_big_window_parquet:         Extended h2oai benchmark with large dataset (1e9 rows) for window,  file format is parquet
+
+# Join Order Benchmark (IMDB)
 imdb:                   Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
 
-**********
-* Supported Configuration (Environment Variables)
-**********
+# Micro-Benchmarks (specific operators and features)
+cancellation:           How long cancelling a query takes
+nlj:                    Benchmark for simple nested loop joins, testing various join scenarios
+hj:                     Benchmark for simple hash joins, testing various join scenarios
+smj:                    Benchmark for simple sort merge joins, testing various join scenarios
+compile_profile:        Compile and execute TPC-H across selected Cargo profiles, reporting timing and binary size
+
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Supported Configuration (Environment Variables)
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 DATA_DIR            directory to store datasets
 CARGO_COMMAND       command that runs the benchmark binary
 DATAFUSION_DIR      directory to use (default $DATAFUSION_DIR)
 RESULTS_NAME        folder where the benchmark files are stored
 PREFER_HASH_JOIN    Prefer hash join algorithm (default true)
-VENV_PATH           Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
+SIMULATE_LATENCY    Simulate object store latency to mimic S3 (default false)
 DATAFUSION_*        Set the given datafusion configuration
 "
     exit 1
@@ -152,8 +200,8 @@ main() {
             echo "***************************"
             case "$BENCHMARK" in
                 all)
-                    data_tpch "1"
-                    data_tpch "10"
+                    data_tpch "1" "parquet"
+                    data_tpch "10" "parquet"
                     data_h2o "SMALL"
                     data_h2o "MEDIUM"
                     data_h2o "BIG"
@@ -163,20 +211,28 @@ main() {
                     data_clickbench_1
                     data_clickbench_partitioned
                     data_imdb
+                    # nlj uses range() function, no data generation needed
                     ;;
                 tpch)
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 tpch_mem)
-                    # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
+                    ;;
+                tpch_csv)
+                    data_tpch "1" "csv"
                     ;;
                 tpch10)
-                    data_tpch "10"
+                    data_tpch "10" "parquet"
                     ;;
                 tpch_mem10)
-                    # same data as for tpch10
-                    data_tpch "10"
+                    data_tpch "10" "parquet"
+                    ;;
+                tpch_csv10)
+                    data_tpch "10" "csv"
+                    ;;
+                tpcds)
+                    data_tpcds
                     ;;
                 clickbench_1)
                     data_clickbench_1
@@ -184,6 +240,9 @@ main() {
                 clickbench_partitioned)
                     data_clickbench_partitioned
                     ;;
+                clickbench_pushdown)
+                    data_clickbench_partitioned # same data as clickbench_partitioned
+                    ;;
                 clickbench_extended)
                     data_clickbench_1
                     ;;
@@ -218,13 +277,66 @@ main() {
                 h2o_big_window)
                     data_h2o_join "BIG" "CSV"
                     ;;
+                h2o_small_parquet)
+                    data_h2o "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_parquet)
+                    data_h2o "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_parquet)
+                    data_h2o "BIG" "PARQUET"
+                    ;;
+                h2o_small_join_parquet)
+                    data_h2o_join "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_join_parquet)
+                    data_h2o_join "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_join_parquet)
+                    data_h2o_join "BIG" "PARQUET"
+                    ;;
+                # h2o window benchmark uses the same data as the h2o join
+                h2o_small_window_parquet)
+                    data_h2o_join "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_window_parquet)
+                    data_h2o_join "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_window_parquet)
+                    data_h2o_join "BIG" "PARQUET"
+                    ;;
                 external_aggr)
                     # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
                     ;;
                 sort_tpch)
                     # same data as for tpch
-                    data_tpch "1"
+                    data_tpch "1" "parquet"
+                    ;;
+                sort_tpch10)
+                    # same data as for tpch10
+                    data_tpch "10" "parquet"
+                    ;;
+                topk_tpch)
+                    # same data as for tpch
+                    data_tpch "1" "parquet"
+                    ;;
+                nlj)
+                    # nlj uses range() function, no data generation needed
+                    echo "NLJ benchmark does not require data generation"
+                    ;;
+                hj)
+                    data_tpch "10" "parquet"
+                    ;;
+                smj)
+                    # smj uses range() function, no data generation needed
+                    echo "SMJ benchmark does not require data generation"
+                    ;;
+                compile_profile)
+                    data_tpch "1" "parquet"
+                    ;;
+                clickbench_sorted)
+                    clickbench_sorted
                     ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
@@ -235,6 +347,18 @@ main() {
         run)
             # Parse positional parameters
             BENCHMARK=${ARG2:-"${BENCHMARK}"}
+            EXTRA_ARGS=("${POSITIONAL_ARGS[@]:2}")
+            PROFILE_ARGS=()
+            QUERY=""
+            QUERY_ARG=""
+            if [ "$BENCHMARK" = "compile_profile" ]; then
+                PROFILE_ARGS=("${EXTRA_ARGS[@]}")
+            else
+                QUERY=${EXTRA_ARGS[0]}
+                if [ -n "$QUERY" ]; then
+                    QUERY_ARG="--query ${QUERY}"
+                fi
+            fi
             BRANCH_NAME=$(cd "${DATAFUSION_DIR}" && git rev-parse --abbrev-ref HEAD)
             BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _
             RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
@@ -244,12 +368,18 @@ main() {
             echo "DataFusion Benchmark Script"
             echo "COMMAND: ${COMMAND}"
             echo "BENCHMARK: ${BENCHMARK}"
+            if [ "$BENCHMARK" = "compile_profile" ]; then
+                echo "PROFILES: ${PROFILE_ARGS[*]:-All}"
+            else
+                echo "QUERY: ${QUERY:-All}"
+            fi
             echo "DATAFUSION_DIR: ${DATAFUSION_DIR}"
             echo "BRANCH_NAME: ${BRANCH_NAME}"
             echo "DATA_DIR: ${DATA_DIR}"
             echo "RESULTS_DIR: ${RESULTS_DIR}"
             echo "CARGO_COMMAND: ${CARGO_COMMAND}"
             echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
+            echo "SIMULATE_LATENCY: ${SIMULATE_LATENCY}"
             echo "***************************"
 
             # navigate to the appropriate directory
@@ -258,15 +388,16 @@ main() {
             mkdir -p "${DATA_DIR}"
             case "$BENCHMARK" in
                 all)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    run_tpch "1" "csv"
                     run_tpch_mem "1"
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    run_tpch "10" "csv"
                     run_tpch_mem "10"
                     run_cancellation
-                    run_parquet
-                    run_sort
                     run_clickbench_1
                     run_clickbench_partitioned
+                    run_clickbench_pushdown
                     run_clickbench_extended
                     run_h2o "SMALL" "PARQUET" "groupby"
                     run_h2o "MEDIUM" "PARQUET" "groupby"
@@ -276,34 +407,44 @@ main() {
                     run_h2o_join "BIG" "PARQUET" "join"
                     run_imdb
                     run_external_aggr
+                    run_nlj
+                    run_hj
+                    run_tpcds
+                    run_smj
                     ;;
                 tpch)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    ;;
+                tpch_csv)
+                    run_tpch "1" "csv"
                     ;;
                 tpch_mem)
                     run_tpch_mem "1"
                     ;;
                 tpch10)
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    ;;
+                tpch_csv10)
+                    run_tpch "10" "csv"
                     ;;
                 tpch_mem10)
                     run_tpch_mem "10"
                     ;;
+                tpcds)
+                    run_tpcds
+                    ;;
                 cancellation)
                     run_cancellation
                     ;;
-                parquet)
-                    run_parquet
-                    ;;
-                sort)
-                    run_sort
-                    ;;
                 clickbench_1)
                     run_clickbench_1
                     ;;
                 clickbench_partitioned)
                     run_clickbench_partitioned
                     ;;
+                clickbench_pushdown)
+                    run_clickbench_pushdown
+                    ;;
                 clickbench_extended)
                     run_clickbench_extended
                     ;;
@@ -334,14 +475,63 @@ main() {
                 h2o_medium_window)
                     run_h2o_window "MEDIUM" "CSV" "window"
                     ;;
-                h2o_big_window) 
+                h2o_big_window)
                     run_h2o_window "BIG" "CSV" "window"
                     ;;
+                h2o_small_parquet)
+                    run_h2o "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_parquet)
+                    run_h2o "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_parquet)
+                    run_h2o "BIG" "PARQUET"
+                    ;;
+                h2o_small_join_parquet)
+                    run_h2o_join "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_join_parquet)
+                    run_h2o_join "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_join_parquet)
+                    run_h2o_join "BIG" "PARQUET"
+                    ;;
+                # h2o window benchmark uses the same data as the h2o join
+                h2o_small_window_parquet)
+                    run_h2o_window "SMALL" "PARQUET"
+                    ;;
+                h2o_medium_window_parquet)
+                    run_h2o_window "MEDIUM" "PARQUET"
+                    ;;
+                h2o_big_window_parquet)
+                    run_h2o_window "BIG" "PARQUET"
+                    ;;
                 external_aggr)
                     run_external_aggr
                     ;;
                 sort_tpch)
-                    run_sort_tpch
+                    run_sort_tpch "1"
+                    ;;
+                sort_tpch10)
+                    run_sort_tpch "10"
+                    ;;
+                topk_tpch)
+                    run_topk_tpch
+                    ;;
+                nlj)
+                    run_nlj
+                    ;;
+                hj)
+                    run_hj
+                    ;;
+                smj)
+                    run_smj
+                    ;;
+                compile_profile)
+                    run_compile_profile "${PROFILE_ARGS[@]}"
+                    ;;
+                clickbench_sorted)
+                    run_clickbench_sorted
                     ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
@@ -354,8 +544,8 @@ main() {
         compare)
             compare_benchmarks "$ARG2" "$ARG3"
             ;;
-        venv)
-            setup_venv
+        compare_detail)
+            compare_benchmarks "$ARG2" "$ARG3" "--detailed"
             ;;
         "")
             usage
@@ -372,7 +562,7 @@ main() {
 # Creates TPCH data at a certain scale factor, if it doesn't already
 # exist
 #
-# call like: data_tpch($scale_factor)
+# call like: data_tpch($scale_factor, format)
 #
 # Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
 # Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -383,20 +573,23 @@ data_tpch() {
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
+    FORMAT=$2
+    if [ -z "$FORMAT" ] ; then
+        echo "Internal error: Format not specified"
+        exit 1
+    fi
 
     TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
-    echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
+    echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
 
     # Ensure the target data directory exists
     mkdir -p "${TPCH_DIR}"
 
-    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
-    FILE="${TPCH_DIR}/supplier.tbl"
-    if test -f "${FILE}"; then
-        echo " tbl files exist ($FILE exists)."
-    else
-        echo " creating tbl files with tpch_dbgen..."
-        docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
+    # check if tpchgen-cli is installed
+    if ! command -v tpchgen-cli &> /dev/null
+    then
+        echo "tpchgen-cli could not be found, please install it via 'cargo install tpchgen-cli'"
+        exit 1
     fi
 
     # Copy expected answers into the ./data/answers directory if it does not already exist
@@ -409,16 +602,52 @@ data_tpch() {
         docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main  -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
     fi
 
-    # Create 'parquet' files from tbl
-    FILE="${TPCH_DIR}/supplier"
-    if test -d "${FILE}"; then
-        echo " parquet files exist ($FILE exists)."
-    else
-        echo " creating parquet files using benchmark binary ..."
-        pushd "${SCRIPT_DIR}" > /dev/null
-        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
-        popd > /dev/null
+    if [ "$FORMAT" = "parquet" ]; then
+      # Create 'parquet' files, one directory per file
+      FILE="${TPCH_DIR}/supplier"
+      if test -d "${FILE}"; then
+          echo " parquet files exist ($FILE exists)."
+      else
+          echo " creating parquet files using tpchgen-cli ..."
+          tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
+      fi
+      return
+    fi
+
+    # Create 'csv' files, one directory per file
+    if [ "$FORMAT" = "csv" ]; then
+      FILE="${TPCH_DIR}/csv/supplier"
+      if test -d "${FILE}"; then
+          echo " csv files exist ($FILE exists)."
+      else
+          echo " creating csv files using tpchgen-cli binary ..."
+          tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
+      fi
+      return
+    fi
+
+    echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'"
+    exit 1
+}
+
+# Downloads TPC-DS data
+data_tpcds() {
+    TPCDS_DIR="${DATA_DIR}/tpcds_sf1"
+
+    # Check if `web_site.parquet` exists in the TPCDS data directory to verify data presence
+    echo "Checking TPC-DS data directory: ${TPCDS_DIR}"
+    if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then
+        mkdir -p "${TPCDS_DIR}"
+        # Download the DataFusion benchmarks repository zip if it is not already downloaded
+        if [ ! -f "${DATA_DIR}/datafusion-benchmarks.zip" ]; then
+          echo "Downloading DataFusion benchmarks repository zip to: ${DATA_DIR}/datafusion-benchmarks.zip"
+          wget --timeout=30 --tries=3 -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip
+        fi
+        echo "Extracting TPC-DS parquet data to ${TPCDS_DIR}..."
+        unzip -o -j -d "${TPCDS_DIR}" "${DATA_DIR}/datafusion-benchmarks.zip" datafusion-benchmarks-main/tpcds/data/sf1/*
+        echo "TPC-DS data extracted."
     fi
+    echo "Done."
 }
 
 # Runs the tpch benchmark
@@ -433,15 +662,12 @@ run_tpch() {
     RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch benchmark..."
-    # Optional query filter to run specific query
-    QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
-    # debug the target command
-    set -x
-    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
-    set +x
+
+    FORMAT=$2
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
-# Runs the tpch in memory
+# Runs the tpch in memory (needs tpch parquet data)
 run_tpch_mem() {
     SCALE_FACTOR=$1
     if [ -z "$SCALE_FACTOR" ] ; then
@@ -453,37 +679,50 @@ run_tpch_mem() {
     RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running tpch_mem benchmark..."
-    # Optional query filter to run specific query
-    QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
-    # debug the target command
-    set -x
     # -m means in memory
-    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" $QUERY
-    set +x
+    debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
-# Runs the cancellation benchmark
-run_cancellation() {
-    RESULTS_FILE="${RESULTS_DIR}/cancellation.json"
+# Runs the tpcds benchmark
+run_tpcds() {
+    TPCDS_DIR="${DATA_DIR}/tpcds_sf1"
+
+    # Check if TPCDS data directory and representative file exists
+    if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then
+        echo "" >&2
+        echo "Please prepare TPC-DS data first by following instructions:" >&2
+        echo "  ./bench.sh data tpcds" >&2
+        echo "" >&2
+        exit 1
+    fi
+
+    RESULTS_FILE="${RESULTS_DIR}/tpcds_sf1.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
-    echo "Running cancellation benchmark..."
-    $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}"
+    echo "Running tpcds benchmark..."
+
+    debug_run $CARGO_COMMAND --bin dfbench -- tpcds --iterations 5 --path "${TPCDS_DIR}" --query_path "../datafusion/core/tests/tpc-ds" --prefer_hash_join "${PREFER_HASH_JOIN}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
-# Runs the parquet filter benchmark
-run_parquet() {
-    RESULTS_FILE="${RESULTS_DIR}/parquet.json"
-    echo "RESULTS_FILE: ${RESULTS_FILE}"
-    echo "Running parquet filter benchmark..."
-    $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+# Runs the compile profile benchmark helper
+run_compile_profile() {
+    local profiles=("$@")
+    local runner="${SCRIPT_DIR}/compile_profile.py"
+    local data_path="${DATA_DIR}/tpch_sf1"
+
+    echo "Running compile profile benchmark..."
+    local cmd=(uv run python3 "${runner}" --data "${data_path}")
+    if [ ${#profiles[@]} -gt 0 ]; then
+        cmd+=(--profiles "${profiles[@]}")
+    fi
+    debug_run "${cmd[@]}"
 }
 
-# Runs the sort benchmark
-run_sort() {
-    RESULTS_FILE="${RESULTS_DIR}/sort.json"
+# Runs the cancellation benchmark
+run_cancellation() {
+    RESULTS_FILE="${RESULTS_DIR}/cancellation.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
-    echo "Running sort benchmark..."
-    $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+    echo "Running cancellation benchmark..."
+    debug_run $CARGO_COMMAND --bin dfbench -- cancellation --iterations 5 --path "${DATA_DIR}/cancellation" -o "${RESULTS_FILE}" ${LATENCY_ARG}
 }
 
 
@@ -537,23 +776,33 @@ run_clickbench_1() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) benchmark..."
-    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet"  --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet"  --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
- # Runs the clickbench benchmark with the partitioned parquet files
+ # Runs the clickbench benchmark with the partitioned parquet dataset (100 files)
 run_clickbench_partitioned() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (partitioned, 100 files) benchmark..."
-    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+
+ # Runs the clickbench benchmark with the partitioned parquet files and filter_pushdown enabled
+run_clickbench_pushdown() {
+    RESULTS_FILE="${RESULTS_DIR}/clickbench_pushdown.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true, reorder_filters=true..."
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench --pushdown --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
+
 # Runs the clickbench "extended" benchmark with a single large parquet file
 run_clickbench_extended() {
     RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running clickbench (1 file) extended benchmark..."
-    $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench  --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
@@ -668,7 +917,7 @@ run_imdb() {
     RESULTS_FILE="${RESULTS_DIR}/imdb.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running imdb benchmark..."
-    $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 data_h2o() {
@@ -676,75 +925,13 @@ data_h2o() {
     SIZE=${1:-"SMALL"}
     DATA_FORMAT=${2:-"CSV"}
 
-    # Function to compare Python versions
-    version_ge() {
-        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
-    }
-
-    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
-
-    # Find the highest available Python version (3.10 or higher)
-    REQUIRED_VERSION="3.10"
-    PYTHON_CMD=$(command -v python3 || true)
-
-    if [ -n "$PYTHON_CMD" ]; then
-        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-            echo "Found Python version $PYTHON_VERSION, which is suitable."
-        else
-            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
-            PYTHON_CMD=""
-        fi
-    fi
-
-   # Search for suitable Python versions if the default is unsuitable
-   if [ -z "$PYTHON_CMD" ]; then
-       # Loop through all available Python3 commands on the system
-       for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
-           if command -v "$CMD" &> /dev/null; then
-               PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-               if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-                   PYTHON_CMD="$CMD"
-                   echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
-                   break
-               fi
-           fi
-       done
-   fi
-
-    # If no suitable Python version found, exit with an error
-    if [ -z "$PYTHON_CMD" ]; then
-        echo "Python 3.10 or higher is required. Please install it."
-        return 1
-    fi
-
-    echo "Using Python command: $PYTHON_CMD"
-
-    # Install falsa and other dependencies
-    echo "Installing falsa..."
-
-    # Set virtual environment directory
-    VIRTUAL_ENV="${PWD}/venv"
-
-    # Create a virtual environment using the detected Python command
-    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
-
-    # Activate the virtual environment and install dependencies
-    source "$VIRTUAL_ENV/bin/activate"
-
-    # Ensure 'falsa' is installed (avoid unnecessary reinstall)
-    pip install --quiet --upgrade falsa
-
     # Create directory if it doesn't exist
     H2O_DIR="${DATA_DIR}/h2o"
     mkdir -p "${H2O_DIR}"
 
     # Generate h2o test data
     echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
-    falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
-
-    # Deactivate virtual environment after completion
-    deactivate
+    uv run falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
 }
 
 data_h2o_join() {
@@ -752,75 +939,13 @@ data_h2o_join() {
     SIZE=${1:-"SMALL"}
     DATA_FORMAT=${2:-"CSV"}
 
-    # Function to compare Python versions
-    version_ge() {
-        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
-    }
-
-    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
-
-    # Find the highest available Python version (3.10 or higher)
-    REQUIRED_VERSION="3.10"
-    PYTHON_CMD=$(command -v python3 || true)
-
-    if [ -n "$PYTHON_CMD" ]; then
-        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-            echo "Found Python version $PYTHON_VERSION, which is suitable."
-        else
-            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
-            PYTHON_CMD=""
-        fi
-    fi
-
-   # Search for suitable Python versions if the default is unsuitable
-   if [ -z "$PYTHON_CMD" ]; then
-       # Loop through all available Python3 commands on the system
-       for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
-           if command -v "$CMD" &> /dev/null; then
-               PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-               if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-                   PYTHON_CMD="$CMD"
-                   echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
-                   break
-               fi
-           fi
-       done
-   fi
-
-    # If no suitable Python version found, exit with an error
-    if [ -z "$PYTHON_CMD" ]; then
-        echo "Python 3.10 or higher is required. Please install it."
-        return 1
-    fi
-
-    echo "Using Python command: $PYTHON_CMD"
-
-    # Install falsa and other dependencies
-    echo "Installing falsa..."
-
-    # Set virtual environment directory
-    VIRTUAL_ENV="${PWD}/venv"
-
-    # Create a virtual environment using the detected Python command
-    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
-
-    # Activate the virtual environment and install dependencies
-    source "$VIRTUAL_ENV/bin/activate"
-
-    # Ensure 'falsa' is installed (avoid unnecessary reinstall)
-    pip install --quiet --upgrade falsa
-
     # Create directory if it doesn't exist
     H2O_DIR="${DATA_DIR}/h2o"
     mkdir -p "${H2O_DIR}"
 
     # Generate h2o test data
     echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
-    falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
-
-    # Deactivate virtual environment after completion
-    deactivate
+    uv run falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
 }
 
 # Runner for h2o groupby benchmark
@@ -859,11 +984,12 @@ run_h2o() {
     QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_Type}.sql"
 
     # Run the benchmark using the dynamically constructed file path and query file
-    $CARGO_COMMAND --bin dfbench -- h2o \
+    debug_run $CARGO_COMMAND --bin dfbench -- h2o \
         --iterations 3 \
         --path "${H2O_DIR}/${FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
-        -o "${RESULTS_FILE}"
+        -o "${RESULTS_FILE}" \
+         ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Utility function to run h2o join/window benchmark
@@ -910,11 +1036,12 @@ h2o_runner() {
     # Set the query file name based on the RUN_Type
     QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_Type}.sql"
 
-    $CARGO_COMMAND --bin dfbench -- h2o \
+    debug_run $CARGO_COMMAND --bin dfbench -- h2o \
         --iterations 3 \
         --join-paths "${H2O_DIR}/${X_TABLE_FILE_NAME},${H2O_DIR}/${SMALL_TABLE_FILE_NAME},${H2O_DIR}/${MEDIUM_TABLE_FILE_NAME},${H2O_DIR}/${LARGE_TABLE_FILE_NAME}" \
         --queries-path "${QUERY_FILE}" \
-        -o "${RESULTS_FILE}"
+        -o "${RESULTS_FILE}" \
+         ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runners for h2o join benchmark
@@ -940,17 +1067,57 @@ run_external_aggr() {
     # number-of-partitions), and by default `--partitions` is set to number of
     # CPU cores, we set a constant number of partitions to prevent this
     # benchmark to fail on some machines.
-    $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
 }
 
 # Runs the sort integration benchmark
 run_sort_tpch() {
-    TPCH_DIR="${DATA_DIR}/tpch_sf1"
-    RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
+    SCALE_FACTOR=$1
+    if [ -z "$SCALE_FACTOR" ] ; then
+        echo "Internal error: Scale factor not specified"
+        exit 1
+    fi
+    TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
+    RESULTS_FILE="${RESULTS_DIR}/sort_tpch${SCALE_FACTOR}.json"
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running sort tpch benchmark..."
 
-    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+# Runs the sort tpch integration benchmark with limit 100 (topk)
+run_topk_tpch() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/run_topk_tpch.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running topk tpch benchmark..."
+
+    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" --limit 100 ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+# Runs the nlj benchmark
+run_nlj() {
+    RESULTS_FILE="${RESULTS_DIR}/nlj.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running nlj benchmark..."
+    debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+# Runs the hj benchmark
+run_hj() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf10"
+    RESULTS_FILE="${RESULTS_DIR}/hj.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running hj benchmark..."
+    debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+# Runs the smj benchmark
+run_smj() {
+    RESULTS_FILE="${RESULTS_DIR}/smj.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running smj benchmark..."
+    debug_run $CARGO_COMMAND --bin dfbench -- smj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 
@@ -958,6 +1125,8 @@ compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="$1"
     BRANCH2="$2"
+    OPTS="$3"
+
     if [ -z "$BRANCH1" ] ; then
         echo "<branch1> not specified. Available branches:"
         ls -1 "${BASE_RESULTS_DIR}"
@@ -978,7 +1147,7 @@ compare_benchmarks() {
             echo "--------------------"
             echo "Benchmark ${BENCH}"
             echo "--------------------"
-            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
+            uv run python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
         else
             echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
         fi
@@ -986,10 +1155,113 @@ compare_benchmarks() {
 
 }
 
-setup_venv() {
-    python3 -m venv "$VIRTUAL_ENV"
-    PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
+# Creates sorted ClickBench data from hits.parquet (full dataset)
+# The data is sorted by EventTime in ascending order
+# Uses datafusion-cli to reduce dependencies
+clickbench_sorted() {
+    SORTED_FILE="${DATA_DIR}/hits_sorted.parquet"
+    ORIGINAL_FILE="${DATA_DIR}/hits.parquet"
+
+    # Default memory limit is 12GB, can be overridden with DATAFUSION_MEMORY_GB env var
+    MEMORY_LIMIT_GB=${DATAFUSION_MEMORY_GB:-12}
+
+    echo "Creating sorted ClickBench dataset from hits.parquet..."
+    echo "Configuration:"
+    echo "  Memory limit: ${MEMORY_LIMIT_GB}G"
+    echo "  Row group size: 64K rows"
+    echo "  Compression: uncompressed"
+
+    if [ ! -f "${ORIGINAL_FILE}" ]; then
+        echo "hits.parquet not found. Running data_clickbench_1 first..."
+        data_clickbench_1
+    fi
+
+    if [ -f "${SORTED_FILE}" ]; then
+        echo "Sorted hits.parquet already exists at ${SORTED_FILE}"
+        return 0
+    fi
+
+    echo "Sorting hits.parquet by EventTime (this may take several minutes)..."
+
+    pushd "${DATAFUSION_DIR}" > /dev/null
+    echo "Building datafusion-cli..."
+    cargo build --release --bin datafusion-cli
+    DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli"
+    popd > /dev/null
+
+
+    START_TIME=$(date +%s)
+    echo "Start time: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "Using datafusion-cli to create sorted parquet file..."
+    "${DATAFUSION_CLI}" << EOF
+-- Memory and performance configuration
+SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G';
+SET datafusion.execution.spill_compression = 'uncompressed';
+SET datafusion.execution.sort_spill_reservation_bytes = 10485760; -- 10MB
+SET datafusion.execution.batch_size = 8192;
+SET datafusion.execution.target_partitions = 1;
+
+-- Parquet output configuration
+SET datafusion.execution.parquet.max_row_group_size = 65536;
+SET datafusion.execution.parquet.compression = 'uncompressed';
+
+-- Execute sort and write
+COPY (SELECT * FROM '${ORIGINAL_FILE}' ORDER BY "EventTime")
+TO '${SORTED_FILE}'
+STORED AS PARQUET;
+EOF
+
+    local result=$?
+
+    END_TIME=$(date +%s)
+    DURATION=$((END_TIME - START_TIME))
+    echo "End time: $(date '+%Y-%m-%d %H:%M:%S')"
+
+    if [ $result -eq 0 ]; then
+        echo "✓ Successfully created sorted ClickBench dataset"
+
+        INPUT_SIZE=$(stat -f%z "${ORIGINAL_FILE}" 2>/dev/null || stat -c%s "${ORIGINAL_FILE}" 2>/dev/null)
+        OUTPUT_SIZE=$(stat -f%z "${SORTED_FILE}" 2>/dev/null || stat -c%s "${SORTED_FILE}" 2>/dev/null)
+        INPUT_MB=$((INPUT_SIZE / 1024 / 1024))
+        OUTPUT_MB=$((OUTPUT_SIZE / 1024 / 1024))
+
+        echo "  Input:  ${INPUT_MB} MB"
+        echo "  Output: ${OUTPUT_MB} MB"
+
+        echo ""
+        echo "Time Statistics:"
+        echo "  Total duration: ${DURATION} seconds ($(printf '%02d:%02d:%02d' $((DURATION/3600)) $((DURATION%3600/60)) $((DURATION%60))))"
+        echo "  Throughput: $((INPUT_MB / DURATION)) MB/s"
+
+        return 0
+    else
+        echo "✗ Error: Failed to create sorted dataset"
+        echo "💡 Tip: Try increasing memory with: DATAFUSION_MEMORY_GB=16 ./bench.sh data clickbench_sorted"
+        return 1
+    fi
+}
+
+# Runs the sorted data benchmark with prefer_existing_sort configuration
+run_clickbench_sorted() {
+    RESULTS_FILE="${RESULTS_DIR}/clickbench_sorted.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running sorted data benchmark with prefer_existing_sort optimization..."
+
+    # Ensure sorted data exists
+    clickbench_sorted
+
+    # Run benchmark with prefer_existing_sort configuration
+    # This allows DataFusion to optimize away redundant sorts while maintaining parallelism
+    debug_run $CARGO_COMMAND --bin dfbench -- clickbench \
+        --iterations 5 \
+        --path "${DATA_DIR}/hits_sorted.parquet" \
+        --queries-path "${SCRIPT_DIR}/queries/clickbench/queries/sorted_data" \
+        --sorted-by "EventTime" \
+        -c datafusion.optimizer.prefer_existing_sort=true \
+        -o "${RESULTS_FILE}" \
+        ${QUERY_ARG} ${LATENCY_ARG}
 }
 
+
 # And start the process up
 main
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 4b609c744d503..9ad1de980abe8 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -18,7 +18,9 @@
 
 from __future__ import annotations
 
+import argparse
 import json
+import math
 from dataclasses import dataclass
 from typing import Dict, List, Any
 from pathlib import Path
@@ -47,6 +49,7 @@ class QueryRun:
     query: int
     iterations: List[QueryResult]
     start_time: int
+    success: bool = True
 
     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> QueryRun:
@@ -54,17 +57,57 @@ def load_from(cls, data: Dict[str, Any]) -> QueryRun:
             query=data["query"],
             iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
             start_time=data["start_time"],
+            success=data.get("success", True),
         )
 
     @property
-    def execution_time(self) -> float:
+    def min_execution_time(self) -> float:
         assert len(self.iterations) >= 1
 
-        # Use minimum execution time to account for variations / other
-        # things the system was doing
         return min(iteration.elapsed for iteration in self.iterations)
 
 
+    @property
+    def max_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        return max(iteration.elapsed for iteration in self.iterations)
+
+
+    @property
+    def mean_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        total = sum(iteration.elapsed for iteration in self.iterations)
+        return total / len(self.iterations)
+
+
+    @property
+    def stddev_execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        mean = self.mean_execution_time
+        squared_diffs = [(iteration.elapsed - mean) ** 2 for iteration in self.iterations]
+        variance = sum(squared_diffs) / len(self.iterations)
+        return math.sqrt(variance)
+
+    def execution_time_report(self, detailed = False) -> tuple[float, str]:
+        if detailed:
+            mean_execution_time = self.mean_execution_time
+            return (
+                mean_execution_time,
+                f"{self.min_execution_time:.2f} / {mean_execution_time :.2f} ±{self.stddev_execution_time:.2f} / {self.max_execution_time:.2f} ms"
+            )
+        else:
+            # Use minimum execution time to account for variations / other
+            # things the system was doing
+            min_execution_time = self.min_execution_time
+            return (
+                min_execution_time,
+                f"{min_execution_time :.2f} ms"
+            )
+
+
 @dataclass
 class Context:
     benchmark_version: str
@@ -106,35 +149,54 @@ def compare(
     baseline_path: Path,
     comparison_path: Path,
     noise_threshold: float,
+    detailed: bool,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline_path)
     comparison = BenchmarkRun.load_from_file(comparison_path)
 
-    console = Console()
+    console = Console(width=200)
 
     # use basename as the column names
-    baseline_header = baseline_path.parent.stem
-    comparison_header = comparison_path.parent.stem
+    baseline_header = baseline_path.parent.name
+    comparison_header = comparison_path.parent.name
 
     table = Table(show_header=True, header_style="bold magenta")
-    table.add_column("Query", style="dim", width=12)
-    table.add_column(baseline_header, justify="right", style="dim")
-    table.add_column(comparison_header, justify="right", style="dim")
-    table.add_column("Change", justify="right", style="dim")
+    table.add_column("Query", style="dim", no_wrap=True)
+    table.add_column(baseline_header, justify="right", style="dim", no_wrap=True)
+    table.add_column(comparison_header, justify="right", style="dim", no_wrap=True)
+    table.add_column("Change", justify="right", style="dim", no_wrap=True)
 
     faster_count = 0
     slower_count = 0
     no_change_count = 0
+    failure_count = 0
     total_baseline_time = 0
     total_comparison_time = 0
 
     for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
         assert baseline_result.query == comparison_result.query
 
-        total_baseline_time += baseline_result.execution_time
-        total_comparison_time += comparison_result.execution_time
+        base_failed = not baseline_result.success
+        comp_failed = not comparison_result.success
+        # If a query fails, its execution time is excluded from the performance comparison
+        if base_failed or comp_failed:
+            change_text = "incomparable"
+            failure_count += 1
+            table.add_row(
+                f"Q{baseline_result.query}",
+                "FAIL" if base_failed else baseline_result.execution_time_report(detailed)[1],
+                "FAIL" if comp_failed else comparison_result.execution_time_report(detailed)[1],
+                change_text,
+            )
+            continue
+
+        baseline_value, baseline_text = baseline_result.execution_time_report(detailed)
+        comparison_value, comparison_text = comparison_result.execution_time_report(detailed)
+
+        total_baseline_time += baseline_value
+        total_comparison_time += comparison_value
 
-        change = comparison_result.execution_time / baseline_result.execution_time
+        change = comparison_value / baseline_value
 
         if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
             change_text = "no change"
@@ -148,16 +210,20 @@ def compare(
 
         table.add_row(
             f"Q{baseline_result.query}",
-            f"{baseline_result.execution_time:.2f}ms",
-            f"{comparison_result.execution_time:.2f}ms",
+            baseline_text,
+            comparison_text,
             change_text,
         )
 
     console.print(table)
 
     # Calculate averages
-    avg_baseline_time = total_baseline_time / len(baseline.queries)
-    avg_comparison_time = total_comparison_time / len(comparison.queries)
+    avg_baseline_time = 0.0
+    avg_comparison_time = 0.0
+    if len(baseline.queries) - failure_count > 0:
+        avg_baseline_time = total_baseline_time / (len(baseline.queries) - failure_count)
+    if len(comparison.queries) - failure_count > 0:
+        avg_comparison_time = total_comparison_time / (len(comparison.queries) - failure_count)
 
     # Summary table
     summary_table = Table(show_header=True, header_style="bold magenta")
@@ -171,6 +237,7 @@ def compare(
     summary_table.add_row("Queries Faster", str(faster_count))
     summary_table.add_row("Queries Slower", str(slower_count))
     summary_table.add_row("Queries with No Change", str(no_change_count))
+    summary_table.add_row("Queries with Failure", str(failure_count))
 
     console.print(summary_table)
 
@@ -193,10 +260,16 @@ def main() -> None:
         default=0.05,
         help="The threshold for statistically insignificant results (+/- %5).",
     )
+    compare_parser.add_argument(
+        "--detailed",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Show detailed result comparison instead of minimum runtime.",
+    )
 
     options = parser.parse_args()
 
-    compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+    compare(options.baseline_path, options.comparison_path, options.noise_threshold, options.detailed)
 
 
 
diff --git a/benchmarks/compare_tpcds.sh b/benchmarks/compare_tpcds.sh
new file mode 100755
index 0000000000000..48331a7c7510e
--- /dev/null
+++ b/benchmarks/compare_tpcds.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compare TPC-DS benchmarks between two branches
+
+set -e
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+usage() {
+    echo "Usage: $0 <branch1> <branch2>"
+    echo ""
+    echo "Example: $0 main dev2"
+    echo ""
+    echo "Note: TPC-DS benchmarks are not currently implemented in bench.sh"
+    exit 1
+}
+
+BRANCH1=${1:-""}
+BRANCH2=${2:-""}
+
+if [ -z "$BRANCH1" ] || [ -z "$BRANCH2" ]; then
+    usage
+fi
+
+# Store current branch
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+
+echo "Comparing TPC-DS benchmarks: ${BRANCH1} vs ${BRANCH2}"
+
+# Run benchmark on first branch
+git checkout "$BRANCH1"
+./benchmarks/bench.sh run tpcds
+
+# Run benchmark on second branch
+git checkout "$BRANCH2"
+./benchmarks/bench.sh run tpcds
+
+# Compare results
+./benchmarks/bench.sh compare "$BRANCH1" "$BRANCH2"
+
+# Return to original branch
+git checkout "$CURRENT_BRANCH"
\ No newline at end of file
diff --git a/benchmarks/compare_tpch.sh b/benchmarks/compare_tpch.sh
new file mode 100755
index 0000000000000..85e8da29ce41d
--- /dev/null
+++ b/benchmarks/compare_tpch.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compare TPC-H benchmarks between two branches
+
+set -e
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+usage() {
+    echo "Usage: $0 <branch1> <branch2>"
+    echo ""
+    echo "Example: $0 main dev2"
+    exit 1
+}
+
+BRANCH1=${1:-""}
+BRANCH2=${2:-""}
+
+if [ -z "$BRANCH1" ] || [ -z "$BRANCH2" ]; then
+    usage
+fi
+
+# Store current branch
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+
+echo "Comparing TPC-H benchmarks: ${BRANCH1} vs ${BRANCH2}"
+
+# Run benchmark on first branch
+git checkout "$BRANCH1"
+./benchmarks/bench.sh run tpch
+
+# Run benchmark on second branch
+git checkout "$BRANCH2"
+./benchmarks/bench.sh run tpch
+
+# Compare results
+./benchmarks/bench.sh compare "$BRANCH1" "$BRANCH2"
+
+# Return to original branch
+git checkout "$CURRENT_BRANCH"
\ No newline at end of file
diff --git a/benchmarks/compile_profile.py b/benchmarks/compile_profile.py
new file mode 100644
index 0000000000000..ae51de94937bf
--- /dev/null
+++ b/benchmarks/compile_profile.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Compile profile benchmark runner for DataFusion.
+
+Builds the `tpch` benchmark binary with several Cargo profiles (e.g. `--release` or `--profile ci`), runs the full TPC-H suite against the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile time, execution time, and resulting 
+binary size.
+
+See `benchmarks/README.md` for usages.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Iterable, NamedTuple
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+DEFAULT_DATA_DIR = REPO_ROOT / "benchmarks" / "data" / "tpch_sf1"
+DEFAULT_ITERATIONS = 1
+DEFAULT_FORMAT = "parquet"
+DEFAULT_PARTITIONS: int | None = None
+TPCH_BINARY = "tpch.exe" if os.name == "nt" else "tpch"
+PROFILE_TARGET_DIR = {
+    "dev": "debug",
+    "release": "release",
+    "ci": "ci",
+    "release-nonlto": "release-nonlto",
+}
+
+
+class ProfileResult(NamedTuple):
+    profile: str
+    compile_seconds: float
+    run_seconds: float
+    binary_bytes: int
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--profiles",
+        nargs="+",
+        default=list(PROFILE_TARGET_DIR.keys()),
+        help="Cargo profiles to test (default: dev release ci release-nonlto)",
+    )
+    parser.add_argument(
+        "--data",
+        type=Path,
+        default=DEFAULT_DATA_DIR,
+        help="Path to TPCH dataset (default: benchmarks/data/tpch_sf1)",
+    )
+    return parser.parse_args()
+
+
+def timed_run(command: Iterable[str]) -> float:
+    start = time.perf_counter()
+    try:
+        subprocess.run(command, cwd=REPO_ROOT, check=True)
+    except subprocess.CalledProcessError as exc:
+        raise RuntimeError(f"command failed: {' '.join(command)}") from exc
+    return time.perf_counter() - start
+
+
+def cargo_build(profile: str) -> float:
+    if profile == "dev":
+        command = ["cargo", "build", "--bin", "tpch"]
+    else:
+        command = ["cargo", "build", "--profile", profile, "--bin", "tpch"]
+    return timed_run(command)
+
+
+def cargo_clean(profile: str) -> None:
+    command = ["cargo", "clean", "--profile", profile]
+    try:
+        subprocess.run(command, cwd=REPO_ROOT, check=True)
+    except subprocess.CalledProcessError as exc:
+        raise RuntimeError(f"failed to clean cargo artifacts for profile '{profile}'") from exc
+
+
+def run_benchmark(profile: str, data_path: Path) -> float:
+    binary_dir = PROFILE_TARGET_DIR.get(profile)
+    if not binary_dir:
+        raise ValueError(f"unknown profile '{profile}'")
+    binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY
+    if not binary_path.exists():
+        raise FileNotFoundError(f"compiled binary not found at {binary_path}")
+
+    command = [
+        str(binary_path),
+        "benchmark",
+        "datafusion",
+        "--iterations",
+        str(DEFAULT_ITERATIONS),
+        "--path",
+        str(data_path),
+        "--format",
+        DEFAULT_FORMAT,
+    ]
+    if DEFAULT_PARTITIONS is not None:
+        command.extend(["--partitions", str(DEFAULT_PARTITIONS)])
+    env = os.environ.copy()
+    env.setdefault("RUST_LOG", "warn")
+
+    start = time.perf_counter()
+    try:
+        subprocess.run(command, cwd=REPO_ROOT, env=env, check=True)
+    except subprocess.CalledProcessError as exc:
+        raise RuntimeError(f"benchmark failed for profile '{profile}'") from exc
+    return time.perf_counter() - start
+
+
+def binary_size(profile: str) -> int:
+    binary_dir = PROFILE_TARGET_DIR[profile]
+    binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY
+    return binary_path.stat().st_size
+
+
+def human_time(seconds: float) -> str:
+    return f"{seconds:6.2f}s"
+
+
+def human_size(size: int) -> str:
+    value = float(size)
+    for unit in ("B", "KB", "MB", "GB", "TB"):
+        if value < 1024 or unit == "TB":
+            return f"{value:6.1f}{unit}"
+        value /= 1024
+    return f"{value:6.1f}TB"
+
+
+def main() -> None:
+    args = parse_args()
+    data_path = args.data.resolve()
+    if not data_path.exists():
+        print(f"Data directory not found: {data_path}", file=sys.stderr)
+        sys.exit(1)
+
+    results: list[ProfileResult] = []
+    for profile in args.profiles:
+        print(f"\n=== Profile: {profile} ===")
+        print("Cleaning previous build artifacts...")
+        cargo_clean(profile)
+        compile_seconds = cargo_build(profile)
+        run_seconds = run_benchmark(profile, data_path)
+        size_bytes = binary_size(profile)
+        results.append(ProfileResult(profile, compile_seconds, run_seconds, size_bytes))
+
+    print("\nSummary")
+    header = f"{'Profile':<15}{'Compile':>12}{'Run':>12}{'Size':>12}"
+    print(header)
+    print("-" * len(header))
+    for result in results:
+        print(
+            f"{result.profile:<15}{human_time(result.compile_seconds):>12}"
+            f"{human_time(result.run_seconds):>12}{human_size(result.binary_bytes):>12}"
+        )
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
new file mode 100644
index 0000000000000..e6a60582148ce
--- /dev/null
+++ b/benchmarks/pyproject.toml
@@ -0,0 +1,6 @@
+[project]
+name = "datafusion-benchmarks"
+version = "0.1.0"
+requires-python = ">=3.11"
+# typing_extensions is an undeclared dependency of falsa
+dependencies = ["rich", "falsa", "typing_extensions"]
diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md
index e5acd8f348a47..877ea0e0c3192 100644
--- a/benchmarks/queries/clickbench/README.md
+++ b/benchmarks/queries/clickbench/README.md
@@ -6,8 +6,8 @@ ClickBench is focused on aggregation and filtering performance (though it has no
 
 ## Files:
 
-- `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
-- `extended.sql` - "Extended" DataFusion specific queries.
+- `queries/*.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository](https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql) and split by the `update_queries.sh` script.
+- `extended/*.sql` - "Extended" DataFusion specific queries.
 
 [clickbench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql
 
@@ -15,8 +15,8 @@ ClickBench is focused on aggregation and filtering performance (though it has no
 
 The "extended" queries are not part of the official ClickBench benchmark.
 Instead they are used to test other DataFusion features that are not covered by
-the standard benchmark. Each description below is for the corresponding line in
-`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.)
+the standard benchmark. Each description below is for the corresponding file in
+`extended`
 
 ### Q0: Data Exploration
 
diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql
deleted file mode 100644
index 93c39efe4f8e3..0000000000000
--- a/benchmarks/queries/clickbench/extended.sql
+++ /dev/null
@@ -1,9 +0,0 @@
-SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
-SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage")  FROM hits;
-SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
-SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice")  FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
-SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0  GROUP BY  "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
-SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY  "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
-SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;
-SELECT "WatchID", MIN("ResolutionWidth") as wmin, MAX("ResolutionWidth") as wmax, SUM("IsRefresh") as srefresh FROM hits GROUP BY "WatchID" ORDER BY "WatchID" DESC LIMIT 10;
-SELECT "RegionID", "UserAgent", "OS", AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ResponseStartTiming")) as avg_response_time, AVG(to_timestamp("ResponseEndTiming")-to_timestamp("ConnectTiming")) as avg_latency FROM hits GROUP BY "RegionID", "UserAgent", "OS" ORDER BY avg_latency DESC limit 10;
\ No newline at end of file
diff --git a/benchmarks/queries/clickbench/extended/q0.sql b/benchmarks/queries/clickbench/extended/q0.sql
new file mode 100644
index 0000000000000..cb826e5f947e9
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q0.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
diff --git a/benchmarks/queries/clickbench/extended/q1.sql b/benchmarks/queries/clickbench/extended/q1.sql
new file mode 100644
index 0000000000000..7862423787d85
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q1.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage")  FROM hits;
diff --git a/benchmarks/queries/clickbench/extended/q2.sql b/benchmarks/queries/clickbench/extended/q2.sql
new file mode 100644
index 0000000000000..de2be79885792
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q2.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/extended/q3.sql b/benchmarks/queries/clickbench/extended/q3.sql
new file mode 100644
index 0000000000000..f52990b9843a5
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q3.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice")  FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/extended/q4.sql b/benchmarks/queries/clickbench/extended/q4.sql
new file mode 100644
index 0000000000000..5865129db6425
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q4.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0  GROUP BY  "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/extended/q5.sql b/benchmarks/queries/clickbench/extended/q5.sql
new file mode 100644
index 0000000000000..18d3e01c82c4b
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q5.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY  "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/extended/q6.sql b/benchmarks/queries/clickbench/extended/q6.sql
new file mode 100644
index 0000000000000..0a6467b8898aa
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q6.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;
diff --git a/benchmarks/queries/clickbench/extended/q7.sql b/benchmarks/queries/clickbench/extended/q7.sql
new file mode 100644
index 0000000000000..ddaff7f8804f5
--- /dev/null
+++ b/benchmarks/queries/clickbench/extended/q7.sql
@@ -0,0 +1 @@
+SELECT "WatchID", MIN("ResolutionWidth") as wmin, MAX("ResolutionWidth") as wmax, SUM("IsRefresh") as srefresh FROM hits GROUP BY "WatchID" ORDER BY "WatchID" DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries.sql b/benchmarks/queries/clickbench/queries.sql
deleted file mode 100644
index 9a183cd6e259c..0000000000000
--- a/benchmarks/queries/clickbench/queries.sql
+++ /dev/null
@@ -1,43 +0,0 @@
-SELECT COUNT(*) FROM hits;
-SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
-SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
-SELECT AVG("UserID") FROM hits;
-SELECT COUNT(DISTINCT "UserID") FROM hits;
-SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
-SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
-SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
-SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
-SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
-SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
-SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
-SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
-SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
-SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
-SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
-SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
-SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
-SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
-SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
-SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
-SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
-SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
-SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
-SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
-SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
-SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
-SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
-SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
-SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
-SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
-SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
-SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
-SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
-SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
-SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
-SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
-SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
-SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
-SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
-SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
diff --git a/benchmarks/queries/clickbench/queries/q0.sql b/benchmarks/queries/clickbench/queries/q0.sql
new file mode 100644
index 0000000000000..35f2b32ed4863
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q0.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+
+-- set datafusion.execution.parquet.binary_as_string = true
+SELECT COUNT(*) FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q1.sql b/benchmarks/queries/clickbench/queries/q1.sql
new file mode 100644
index 0000000000000..0bee959ec3c7d
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q1.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
diff --git a/benchmarks/queries/clickbench/queries/q10.sql b/benchmarks/queries/clickbench/queries/q10.sql
new file mode 100644
index 0000000000000..0f9114803fecf
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q10.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q11.sql b/benchmarks/queries/clickbench/queries/q11.sql
new file mode 100644
index 0000000000000..bed8bb210e130
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q11.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q12.sql b/benchmarks/queries/clickbench/queries/q12.sql
new file mode 100644
index 0000000000000..8cf09c0049f3d
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q12.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q13.sql b/benchmarks/queries/clickbench/queries/q13.sql
new file mode 100644
index 0000000000000..ef6583c8d1886
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q13.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q14.sql b/benchmarks/queries/clickbench/queries/q14.sql
new file mode 100644
index 0000000000000..dd267146edec5
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q14.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q15.sql b/benchmarks/queries/clickbench/queries/q15.sql
new file mode 100644
index 0000000000000..721d924cb9b95
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q15.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q16.sql b/benchmarks/queries/clickbench/queries/q16.sql
new file mode 100644
index 0000000000000..389725d58d7a3
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q16.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q17.sql b/benchmarks/queries/clickbench/queries/q17.sql
new file mode 100644
index 0000000000000..be9976a01d7a4
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q17.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q18.sql b/benchmarks/queries/clickbench/queries/q18.sql
new file mode 100644
index 0000000000000..d649f1edfe2a4
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q18.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q19.sql b/benchmarks/queries/clickbench/queries/q19.sql
new file mode 100644
index 0000000000000..8212a765730a3
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q19.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
diff --git a/benchmarks/queries/clickbench/queries/q2.sql b/benchmarks/queries/clickbench/queries/q2.sql
new file mode 100644
index 0000000000000..bcdfad84ec10f
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q2.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q20.sql b/benchmarks/queries/clickbench/queries/q20.sql
new file mode 100644
index 0000000000000..a7e488c2abcd8
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q20.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
diff --git a/benchmarks/queries/clickbench/queries/q21.sql b/benchmarks/queries/clickbench/queries/q21.sql
new file mode 100644
index 0000000000000..3551689728ede
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q21.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q22.sql b/benchmarks/queries/clickbench/queries/q22.sql
new file mode 100644
index 0000000000000..d5f696e75a8c8
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q22.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q23.sql b/benchmarks/queries/clickbench/queries/q23.sql
new file mode 100644
index 0000000000000..ff399ded6ed8c
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q23.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q24.sql b/benchmarks/queries/clickbench/queries/q24.sql
new file mode 100644
index 0000000000000..bc7a364151e23
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q24.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q25.sql b/benchmarks/queries/clickbench/queries/q25.sql
new file mode 100644
index 0000000000000..5332e3451aeaf
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q25.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q26.sql b/benchmarks/queries/clickbench/queries/q26.sql
new file mode 100644
index 0000000000000..bc1108aea1255
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q26.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q27.sql b/benchmarks/queries/clickbench/queries/q27.sql
new file mode 100644
index 0000000000000..ba234d34f8877
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q27.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
diff --git a/benchmarks/queries/clickbench/queries/q28.sql b/benchmarks/queries/clickbench/queries/q28.sql
new file mode 100644
index 0000000000000..6a3bd037bece7
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q28.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
diff --git a/benchmarks/queries/clickbench/queries/q29.sql b/benchmarks/queries/clickbench/queries/q29.sql
new file mode 100644
index 0000000000000..bca1eb7bbe54b
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q29.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q3.sql b/benchmarks/queries/clickbench/queries/q3.sql
new file mode 100644
index 0000000000000..09cdaca713047
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q3.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT AVG("UserID") FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q30.sql b/benchmarks/queries/clickbench/queries/q30.sql
new file mode 100644
index 0000000000000..c0d657927478e
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q30.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q31.sql b/benchmarks/queries/clickbench/queries/q31.sql
new file mode 100644
index 0000000000000..76ab3622ffb57
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q31.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q32.sql b/benchmarks/queries/clickbench/queries/q32.sql
new file mode 100644
index 0000000000000..88f1e4ce42d23
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q32.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q33.sql b/benchmarks/queries/clickbench/queries/q33.sql
new file mode 100644
index 0000000000000..3740503bbc0e9
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q33.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q34.sql b/benchmarks/queries/clickbench/queries/q34.sql
new file mode 100644
index 0000000000000..fdb7edbb656ac
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q34.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q35.sql b/benchmarks/queries/clickbench/queries/q35.sql
new file mode 100644
index 0000000000000..de7e2256eb551
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q35.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q36.sql b/benchmarks/queries/clickbench/queries/q36.sql
new file mode 100644
index 0000000000000..81b1199b0381e
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q36.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q37.sql b/benchmarks/queries/clickbench/queries/q37.sql
new file mode 100644
index 0000000000000..fa4b85ffbd9cb
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q37.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q38.sql b/benchmarks/queries/clickbench/queries/q38.sql
new file mode 100644
index 0000000000000..18fafab6c888f
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q38.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
diff --git a/benchmarks/queries/clickbench/queries/q39.sql b/benchmarks/queries/clickbench/queries/q39.sql
new file mode 100644
index 0000000000000..306f0caacff64
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q39.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
diff --git a/benchmarks/queries/clickbench/queries/q4.sql b/benchmarks/queries/clickbench/queries/q4.sql
new file mode 100644
index 0000000000000..d89ca78c2fb6f
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q4.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(DISTINCT "UserID") FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q40.sql b/benchmarks/queries/clickbench/queries/q40.sql
new file mode 100644
index 0000000000000..e9d27f5985fa9
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q40.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
diff --git a/benchmarks/queries/clickbench/queries/q41.sql b/benchmarks/queries/clickbench/queries/q41.sql
new file mode 100644
index 0000000000000..0e067e2dfc9da
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q41.sql
@@ -0,0 +1,3 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
diff --git a/benchmarks/queries/clickbench/queries/q42.sql b/benchmarks/queries/clickbench/queries/q42.sql
new file mode 100644
index 0000000000000..111cc1d3c4a9d
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q42.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
diff --git a/benchmarks/queries/clickbench/queries/q5.sql b/benchmarks/queries/clickbench/queries/q5.sql
new file mode 100644
index 0000000000000..d371cfb6b3557
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q5.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q6.sql b/benchmarks/queries/clickbench/queries/q6.sql
new file mode 100644
index 0000000000000..5b4e896a1df26
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q6.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
diff --git a/benchmarks/queries/clickbench/queries/q7.sql b/benchmarks/queries/clickbench/queries/q7.sql
new file mode 100644
index 0000000000000..afffcb1306d54
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q7.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
diff --git a/benchmarks/queries/clickbench/queries/q8.sql b/benchmarks/queries/clickbench/queries/q8.sql
new file mode 100644
index 0000000000000..097880a9da5ed
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q8.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/q9.sql b/benchmarks/queries/clickbench/queries/q9.sql
new file mode 100644
index 0000000000000..cb1b79bf5bdc1
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/q9.sql
@@ -0,0 +1,4 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+
+SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
diff --git a/benchmarks/queries/clickbench/queries/sorted_data/q0.sql b/benchmarks/queries/clickbench/queries/sorted_data/q0.sql
new file mode 100644
index 0000000000000..1170a383bcb22
--- /dev/null
+++ b/benchmarks/queries/clickbench/queries/sorted_data/q0.sql
@@ -0,0 +1,3 @@
+-- Must set for ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
+-- set datafusion.execution.parquet.binary_as_string = true
+SELECT * FROM hits ORDER BY "EventTime" DESC limit 10;
diff --git a/benchmarks/queries/clickbench/update_queries.sh b/benchmarks/queries/clickbench/update_queries.sh
new file mode 100755
index 0000000000000..d7db7359aa394
--- /dev/null
+++ b/benchmarks/queries/clickbench/update_queries.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is meant for developers of DataFusion -- it is runnable
+# from the standard DataFusion development environment and uses cargo,
+# etc and orchestrates gathering data and run the benchmark binary in
+# different configurations.
+
+# Script to download ClickBench queries and split them into individual files
+
+set -e  # Exit on any error
+
+# URL for the raw file (not the GitHub page)
+URL="https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql"
+
+# Temporary file to store downloaded content
+TEMP_FILE="queries.sql"
+
+TARGET_DIR="queries"
+
+# Download the file
+echo "Downloading queries from $URL..."
+if command -v curl &> /dev/null; then
+    curl -s -o "$TEMP_FILE" "$URL"
+elif command -v wget &> /dev/null; then
+    wget -q -O "$TEMP_FILE" "$URL"
+else
+    echo "Error: Neither curl nor wget is available. Please install one of them."
+    exit 1
+fi
+
+# Check if download was successful
+if [ ! -f "$TEMP_FILE" ] || [ ! -s "$TEMP_FILE" ]; then
+    echo "Error: Failed to download or file is empty"
+    exit 1
+fi
+
+# Initialize counter
+counter=0
+
+# Ensure the target directory exists
+if [ ! -d ${TARGET_DIR} ]; then
+  mkdir -p ${TARGET_DIR}
+fi
+
+# Read the file line by line and create individual query files
+mapfile -t lines < $TEMP_FILE
+for line in "${lines[@]}"; do
+    # Skip empty lines
+    if [ -n "$line" ]; then
+        # Create filename with zero-padded counter
+        filename="q${counter}.sql"
+
+        # Write the line to the individual file
+        echo "$line" > "${TARGET_DIR}/$filename"
+
+        echo "Created ${TARGET_DIR}/$filename"
+
+        # Increment counter
+        (( counter += 1 ))
+    fi
+done
+
+# Clean up temporary file
+rm "$TEMP_FILE"
\ No newline at end of file
diff --git a/benchmarks/queries/q10.sql b/benchmarks/queries/q10.sql
index cf45e43485fb5..8613fd4962837 100644
--- a/benchmarks/queries/q10.sql
+++ b/benchmarks/queries/q10.sql
@@ -28,4 +28,5 @@ group by
     c_address,
     c_comment
 order by
-    revenue desc;
\ No newline at end of file
+    revenue desc
+limit 20;
diff --git a/benchmarks/queries/q18.sql b/benchmarks/queries/q18.sql
index 835de28a57be2..ba7ee7f716cf1 100644
--- a/benchmarks/queries/q18.sql
+++ b/benchmarks/queries/q18.sql
@@ -29,4 +29,5 @@ group by
     o_totalprice
 order by
     o_totalprice desc,
-    o_orderdate;
\ No newline at end of file
+    o_orderdate
+limit 100;
diff --git a/benchmarks/queries/q2.sql b/benchmarks/queries/q2.sql
index f66af210205e9..68e478f65d3f9 100644
--- a/benchmarks/queries/q2.sql
+++ b/benchmarks/queries/q2.sql
@@ -40,4 +40,5 @@ order by
     s_acctbal desc,
     n_name,
     s_name,
-    p_partkey;
\ No newline at end of file
+    p_partkey
+limit 100;
diff --git a/benchmarks/queries/q21.sql b/benchmarks/queries/q21.sql
index 9d2fe32cee228..b95e7b0dfca02 100644
--- a/benchmarks/queries/q21.sql
+++ b/benchmarks/queries/q21.sql
@@ -36,4 +36,5 @@ group by
     s_name
 order by
     numwait desc,
-    s_name;
\ No newline at end of file
+    s_name
+limit 100;
diff --git a/benchmarks/queries/q3.sql b/benchmarks/queries/q3.sql
index 7dbc6d9ef6783..e5fa9e38664c3 100644
--- a/benchmarks/queries/q3.sql
+++ b/benchmarks/queries/q3.sql
@@ -19,4 +19,5 @@ group by
     o_shippriority
 order by
     revenue desc,
-    o_orderdate;
\ No newline at end of file
+    o_orderdate
+limit 10;
diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs
index 06337cb758885..7e21890519fd1 100644
--- a/benchmarks/src/bin/dfbench.rs
+++ b/benchmarks/src/bin/dfbench.rs
@@ -18,7 +18,7 @@
 //! DataFusion benchmark runner
 use datafusion::error::Result;
 
-use structopt::StructOpt;
+use clap::{Parser, Subcommand};
 
 #[cfg(all(feature = "snmalloc", feature = "mimalloc"))]
 compile_error!(
@@ -34,21 +34,28 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
 use datafusion_benchmarks::{
-    cancellation, clickbench, h2o, imdb, parquet_filter, sort, sort_tpch, tpch,
+    cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch,
 };
 
-#[derive(Debug, StructOpt)]
-#[structopt(about = "benchmark command")]
+#[derive(Debug, Parser)]
+#[command(about = "benchmark command")]
+struct Cli {
+    #[command(subcommand)]
+    command: Options,
+}
+
+#[derive(Debug, Subcommand)]
 enum Options {
     Cancellation(cancellation::RunOpt),
     Clickbench(clickbench::RunOpt),
     H2o(h2o::RunOpt),
+    HJ(hj::RunOpt),
     Imdb(imdb::RunOpt),
-    ParquetFilter(parquet_filter::RunOpt),
-    Sort(sort::RunOpt),
+    Nlj(nlj::RunOpt),
+    Smj(smj::RunOpt),
     SortTpch(sort_tpch::RunOpt),
     Tpch(tpch::RunOpt),
-    TpchConvert(tpch::ConvertOpt),
+    Tpcds(tpcds::RunOpt),
 }
 
 // Main benchmark runner entrypoint
@@ -56,15 +63,17 @@ enum Options {
 pub async fn main() -> Result<()> {
     env_logger::init();
 
-    match Options::from_args() {
+    let cli = Cli::parse();
+    match cli.command {
         Options::Cancellation(opt) => opt.run().await,
         Options::Clickbench(opt) => opt.run().await,
         Options::H2o(opt) => opt.run().await,
-        Options::Imdb(opt) => opt.run().await,
-        Options::ParquetFilter(opt) => opt.run().await,
-        Options::Sort(opt) => opt.run().await,
+        Options::HJ(opt) => opt.run().await,
+        Options::Imdb(opt) => Box::pin(opt.run()).await,
+        Options::Nlj(opt) => opt.run().await,
+        Options::Smj(opt) => opt.run().await,
         Options::SortTpch(opt) => opt.run().await,
-        Options::Tpch(opt) => opt.run().await,
-        Options::TpchConvert(opt) => opt.run().await,
+        Options::Tpch(opt) => Box::pin(opt.run()).await,
+        Options::Tpcds(opt) => Box::pin(opt.run()).await,
     }
 }
diff --git a/benchmarks/src/bin/external_aggr.rs b/benchmarks/src/bin/external_aggr.rs
index 36cd64222cc6b..ee604ec7365a1 100644
--- a/benchmarks/src/bin/external_aggr.rs
+++ b/benchmarks/src/bin/external_aggr.rs
@@ -17,13 +17,13 @@
 
 //! external_aggr binary entrypoint
 
+use clap::{Args, Parser, Subcommand};
 use datafusion::execution::memory_pool::GreedyMemoryPool;
 use datafusion::execution::memory_pool::MemoryPool;
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::sync::LazyLock;
-use structopt::StructOpt;
 
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty;
@@ -33,55 +33,56 @@ use datafusion::datasource::listing::{
 };
 use datafusion::datasource::{MemTable, TableProvider};
 use datafusion::error::Result;
+use datafusion::execution::SessionStateBuilder;
 use datafusion::execution::memory_pool::FairSpillPool;
-use datafusion::execution::memory_pool::{human_readable_size, units};
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
-use datafusion::execution::SessionStateBuilder;
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::{collect, displayable};
 use datafusion::prelude::*;
-use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
+use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult};
 use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
-use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
+use datafusion_common::{DEFAULT_PARQUET_EXTENSION, exec_err};
+use datafusion_common::{human_readable_size, units};
 
-#[derive(Debug, StructOpt)]
-#[structopt(
+#[derive(Debug, Parser)]
+#[command(
     name = "datafusion-external-aggregation",
     about = "DataFusion external aggregation benchmark"
 )]
+struct Cli {
+    #[command(subcommand)]
+    command: ExternalAggrOpt,
+}
+
+#[derive(Debug, Subcommand)]
 enum ExternalAggrOpt {
     Benchmark(ExternalAggrConfig),
 }
 
-#[derive(Debug, StructOpt)]
+#[derive(Debug, Args)]
 struct ExternalAggrConfig {
     /// Query number. If not specified, runs all queries
-    #[structopt(short, long)]
+    #[arg(short, long)]
     query: Option<usize>,
 
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to data files (lineitem). Only parquet format is supported
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// Load the data into a MemTable before executing the query
-    #[structopt(short = "m", long = "mem-table")]
+    #[arg(short = 'm', long = "mem-table")]
     mem_table: bool,
 
     /// Path to JSON benchmark result to be compare using `compare.py`
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 }
 
-struct QueryResult {
-    elapsed: std::time::Duration,
-    row_count: usize,
-}
-
 /// Query Memory Limits
 /// Map query id to predefined memory limits
 ///
@@ -118,7 +119,7 @@ impl ExternalAggrConfig {
         "#,
     ];
 
-    /// If `--query` and `--memory-limit` is not speicified, run all queries
+    /// If `--query` and `--memory-limit` is not specified, run all queries
     /// with pre-configured memory limits
     /// If only `--query` is specified, run the query with all memory limits
     /// for this query
@@ -343,7 +344,8 @@ impl ExternalAggrConfig {
 pub async fn main() -> Result<()> {
     env_logger::init();
 
-    match ExternalAggrOpt::from_args() {
+    let cli = Cli::parse();
+    match cli.command {
         ExternalAggrOpt::Benchmark(opt) => opt.run().await?,
     }
 
diff --git a/benchmarks/src/bin/imdb.rs b/benchmarks/src/bin/imdb.rs
index 13421f8a89a9b..e86735f87b8f1 100644
--- a/benchmarks/src/bin/imdb.rs
+++ b/benchmarks/src/bin/imdb.rs
@@ -17,9 +17,9 @@
 
 //! IMDB binary entrypoint
 
+use clap::{Parser, Subcommand};
 use datafusion::error::Result;
 use datafusion_benchmarks::imdb;
-use structopt::StructOpt;
 
 #[cfg(all(feature = "snmalloc", feature = "mimalloc"))]
 compile_error!(
@@ -34,26 +34,32 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
 #[global_allocator]
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
-#[derive(Debug, StructOpt)]
-#[structopt(about = "benchmark command")]
-enum BenchmarkSubCommandOpt {
-    #[structopt(name = "datafusion")]
-    DataFusionBenchmark(imdb::RunOpt),
+#[derive(Debug, Parser)]
+#[command(name = "IMDB", about = "IMDB Dataset Processing.")]
+struct Cli {
+    #[command(subcommand)]
+    command: ImdbOpt,
 }
 
-#[derive(Debug, StructOpt)]
-#[structopt(name = "IMDB", about = "IMDB Dataset Processing.")]
+#[derive(Debug, Subcommand)]
 enum ImdbOpt {
+    #[command(subcommand)]
     Benchmark(BenchmarkSubCommandOpt),
     Convert(imdb::ConvertOpt),
 }
 
+#[derive(Debug, Subcommand)]
+enum BenchmarkSubCommandOpt {
+    #[command(name = "datafusion")]
+    DataFusionBenchmark(imdb::RunOpt),
+}
+
 #[tokio::main]
 pub async fn main() -> Result<()> {
     env_logger::init();
-    match ImdbOpt::from_args() {
+    match Cli::parse().command {
         ImdbOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => {
-            opt.run().await
+            Box::pin(opt.run()).await
         }
         ImdbOpt::Convert(opt) => opt.run().await,
     }
diff --git a/benchmarks/src/bin/mem_profile.rs b/benchmarks/src/bin/mem_profile.rs
new file mode 100644
index 0000000000000..41a0baecbba86
--- /dev/null
+++ b/benchmarks/src/bin/mem_profile.rs
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! mem_profile binary entrypoint
+use clap::{Parser, Subcommand};
+use datafusion::error::Result;
+use std::{
+    env,
+    io::{BufRead, BufReader},
+    path::Path,
+    process::{Command, Stdio},
+};
+
+use datafusion_benchmarks::{
+    clickbench,
+    h2o::{self, AllQueries},
+    imdb, sort_tpch, tpch,
+};
+
+#[derive(Debug, Parser)]
+#[command(name = "Memory Profiling Utility")]
+struct Cli {
+    /// Cargo profile to use in dfbench (e.g. release, release-nonlto)
+    #[arg(long, default_value = "release")]
+    bench_profile: String,
+
+    #[command(subcommand)]
+    command: Options,
+}
+
+#[derive(Debug, Subcommand)]
+#[command(about = "Benchmark command")]
+enum Options {
+    Clickbench(clickbench::RunOpt),
+    H2o(h2o::RunOpt),
+    Imdb(imdb::RunOpt),
+    SortTpch(sort_tpch::RunOpt),
+    Tpch(tpch::RunOpt),
+}
+
+#[tokio::main]
+pub async fn main() -> Result<()> {
+    // 1. Parse args and check which benchmarks should be run
+    let cli = Cli::parse();
+    let profile = cli.bench_profile;
+    let query_range = match cli.command {
+        Options::Clickbench(opt) => {
+            let entries = std::fs::read_dir(&opt.queries_path)?
+                .filter_map(Result::ok)
+                .filter(|e| {
+                    let path = e.path();
+                    path.extension().map(|ext| ext == "sql").unwrap_or(false)
+                })
+                .collect::<Vec<_>>();
+
+            let max_query_id = entries.len().saturating_sub(1);
+            match opt.query {
+                Some(query_id) => query_id..=query_id,
+                None => 0..=max_query_id,
+            }
+        }
+        Options::H2o(opt) => {
+            let queries = AllQueries::try_new(&opt.queries_path)?;
+            match opt.query {
+                Some(query_id) => query_id..=query_id,
+                None => queries.min_query_id()..=queries.max_query_id(),
+            }
+        }
+        Options::Imdb(opt) => match opt.query {
+            Some(query_id) => query_id..=query_id,
+            None => imdb::IMDB_QUERY_START_ID..=imdb::IMDB_QUERY_END_ID,
+        },
+        Options::SortTpch(opt) => match opt.query {
+            Some(query_id) => query_id..=query_id,
+            None => {
+                sort_tpch::SORT_TPCH_QUERY_START_ID..=sort_tpch::SORT_TPCH_QUERY_END_ID
+            }
+        },
+        Options::Tpch(opt) => match opt.query {
+            Some(query_id) => query_id..=query_id,
+            None => tpch::TPCH_QUERY_START_ID..=tpch::TPCH_QUERY_END_ID,
+        },
+    };
+
+    // 2. Prebuild dfbench binary so that memory does not blow up due to build process
+    println!("Pre-building benchmark binary...");
+    let status = Command::new("cargo")
+        .args([
+            "build",
+            "--profile",
+            &profile,
+            "--features",
+            "mimalloc_extended",
+            "--bin",
+            "dfbench",
+        ])
+        .status()
+        .expect("Failed to build dfbench");
+    assert!(status.success());
+    println!("Benchmark binary built successfully.");
+
+    // 3. Create a new process per each benchmark query and print summary
+    // Find position of subcommand to collect args for dfbench
+    let args: Vec<_> = env::args().collect();
+    let subcommands = ["tpch", "clickbench", "h2o", "imdb", "sort-tpch"];
+    let sub_pos = args
+        .iter()
+        .position(|s| subcommands.iter().any(|&cmd| s == cmd))
+        .expect("No benchmark subcommand found");
+
+    // Args starting from subcommand become dfbench args
+    let mut dfbench_args: Vec<String> =
+        args[sub_pos..].iter().map(|s| s.to_string()).collect();
+
+    run_benchmark_as_child_process(&profile, query_range, &mut dfbench_args)?;
+
+    Ok(())
+}
+
+fn run_benchmark_as_child_process(
+    profile: &str,
+    query_range: std::ops::RangeInclusive<usize>,
+    args: &mut Vec<String>,
+) -> Result<()> {
+    let mut query_strings: Vec<String> = Vec::new();
+    for i in query_range {
+        query_strings.push(i.to_string());
+    }
+
+    let target_dir =
+        env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".to_string());
+    let command = format!("{target_dir}/{profile}/dfbench");
+    // Check whether benchmark binary exists
+    if !Path::new(&command).exists() {
+        panic!(
+            "Benchmark binary not found: `{command}`\nRun this command from the top-level `datafusion/` directory so `target/{profile}/dfbench` can be found.",
+        );
+    }
+    args.insert(0, command);
+    let mut results = vec![];
+
+    // Run Single Query (args already contain --query num)
+    if args.contains(&"--query".to_string()) {
+        let _ = run_query(args, &mut results);
+        print_summary_table(&results);
+        return Ok(());
+    }
+
+    // Run All Queries
+    args.push("--query".to_string());
+    for query_str in query_strings {
+        args.push(query_str);
+        let _ = run_query(args, &mut results);
+        args.pop();
+    }
+
+    print_summary_table(&results);
+    Ok(())
+}
+
+fn run_query(args: &[String], results: &mut Vec<QueryResult>) -> Result<()> {
+    let exec_path = &args[0];
+    let exec_args = &args[1..];
+
+    let mut child = Command::new(exec_path)
+        .args(exec_args)
+        .stdout(Stdio::piped())
+        .spawn()
+        .expect("Failed to start benchmark");
+
+    let stdout = child.stdout.take().unwrap();
+    let reader = BufReader::new(stdout);
+
+    // Buffer child's stdout
+    let lines: Result<Vec<String>, std::io::Error> =
+        reader.lines().collect::<Result<_, _>>();
+
+    child
+        .wait()
+        .expect("Benchmark process exited with an error");
+
+    // Parse after child process terminates
+    let lines = lines?;
+    let mut iter = lines.iter().peekable();
+
+    // Look for lines that contain execution time / memory stats
+    while let Some(line) = iter.next() {
+        if let Some((query, duration_ms)) = parse_query_time(line)
+            && let Some(next_line) = iter.peek()
+            && let Some((peak_rss, peak_commit, page_faults)) = parse_vm_line(next_line)
+        {
+            results.push(QueryResult {
+                query,
+                duration_ms,
+                peak_rss,
+                peak_commit,
+                page_faults,
+            });
+            break;
+        }
+    }
+
+    Ok(())
+}
+
+#[derive(Debug)]
+struct QueryResult {
+    query: usize,
+    duration_ms: f64,
+    peak_rss: String,
+    peak_commit: String,
+    page_faults: String,
+}
+
+fn parse_query_time(line: &str) -> Option<(usize, f64)> {
+    let re = regex::Regex::new(r"Query (\d+) avg time: ([\d.]+) ms").unwrap();
+    if let Some(caps) = re.captures(line) {
+        let query_id = caps[1].parse::<usize>().ok()?;
+        let avg_time = caps[2].parse::<f64>().ok()?;
+        Some((query_id, avg_time))
+    } else {
+        None
+    }
+}
+
+fn parse_vm_line(line: &str) -> Option<(String, String, String)> {
+    let re = regex::Regex::new(
+        r"Peak RSS:\s*([\d.]+\s*[A-Z]+),\s*Peak Commit:\s*([\d.]+\s*[A-Z]+),\s*Page Faults:\s*([\d.]+)"
+    ).ok()?;
+    let caps = re.captures(line)?;
+    let peak_rss = caps.get(1)?.as_str().to_string();
+    let peak_commit = caps.get(2)?.as_str().to_string();
+    let page_faults = caps.get(3)?.as_str().to_string();
+    Some((peak_rss, peak_commit, page_faults))
+}
+
+// Print as simple aligned table
+fn print_summary_table(results: &[QueryResult]) {
+    println!(
+        "\n{:<8} {:>10} {:>12} {:>12} {:>18}",
+        "Query", "Time (ms)", "Peak RSS", "Peak Commit", "Major Page Faults"
+    );
+    println!("{}", "-".repeat(64));
+
+    for r in results {
+        println!(
+            "{:<8} {:>10.2} {:>12} {:>12} {:>18}",
+            r.query, r.duration_ms, r.peak_rss, r.peak_commit, r.page_faults
+        );
+    }
+}
+
+#[cfg(test)]
+// Only run with "ci" mode when we have the data
+#[cfg(feature = "ci")]
+mod tests {
+    use datafusion::common::exec_err;
+    use datafusion::error::Result;
+    use std::path::{Path, PathBuf};
+    use std::process::Command;
+
+    fn get_tpch_data_path() -> Result<String> {
+        let path =
+            std::env::var("TPCH_DATA").unwrap_or_else(|_| "benchmarks/data".to_string());
+        if !Path::new(&path).exists() {
+            return exec_err!(
+                "Benchmark data not found (set TPCH_DATA env var to override): {}",
+                path
+            );
+        }
+        Ok(path)
+    }
+
+    // Try to find target/ dir upward
+    fn find_target_dir(start: &Path) -> Option<PathBuf> {
+        let mut dir = start;
+
+        while let Some(current) = Some(dir) {
+            if current.join("target").is_dir() {
+                return Some(current.join("target"));
+            }
+
+            dir = match current.parent() {
+                Some(parent) => parent,
+                None => break,
+            };
+        }
+
+        None
+    }
+
+    #[test]
+    // This test checks whether `mem_profile` runs successfully and produces expected output
+    // using TPC-H query 6 (which runs quickly).
+    fn mem_profile_e2e_tpch_q6() -> Result<()> {
+        let profile = "ci";
+        let tpch_data = get_tpch_data_path()?;
+
+        // The current working directory may not be the top-level datafusion/ directory,
+        // so we manually walkdir upward, locate the target directory
+        // and set it explicitly via CARGO_TARGET_DIR for the mem_profile command.
+        let target_dir = find_target_dir(&std::env::current_dir()?);
+        let output = Command::new("cargo")
+            .env("CARGO_TARGET_DIR", target_dir.unwrap())
+            .args([
+                "run",
+                "--profile",
+                profile,
+                "--bin",
+                "mem_profile",
+                "--",
+                "--bench-profile",
+                profile,
+                "tpch",
+                "--query",
+                "6",
+                "--path",
+                &tpch_data,
+                "--format",
+                "tbl",
+            ])
+            .output()
+            .expect("Failed to run mem_profile");
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let stderr = String::from_utf8_lossy(&output.stderr);
+
+        if !output.status.success() {
+            panic!(
+                "mem_profile failed\nstdout:\n{stdout}\nstderr:\n{stderr}---------------------",
+            );
+        }
+
+        assert!(
+            stdout.contains("Peak RSS")
+                && stdout.contains("Query")
+                && stdout.contains("Time"),
+            "Unexpected output:\n{stdout}",
+        );
+
+        Ok(())
+    }
+}
diff --git a/benchmarks/src/bin/parquet.rs b/benchmarks/src/bin/parquet.rs
deleted file mode 100644
index 6351a71a7bd3f..0000000000000
--- a/benchmarks/src/bin/parquet.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::common::Result;
-
-use datafusion_benchmarks::{parquet_filter, sort};
-use structopt::StructOpt;
-
-#[cfg(feature = "snmalloc")]
-#[global_allocator]
-static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
-
-#[derive(Debug, Clone, StructOpt)]
-#[structopt(name = "Benchmarks", about = "Apache DataFusion Rust Benchmarks.")]
-enum ParquetBenchCmd {
-    /// Benchmark sorting parquet files
-    Sort(sort::RunOpt),
-    /// Benchmark parquet filter pushdown
-    Filter(parquet_filter::RunOpt),
-}
-
-#[tokio::main]
-async fn main() -> Result<()> {
-    let cmd = ParquetBenchCmd::from_args();
-    match cmd {
-        ParquetBenchCmd::Filter(opt) => {
-            println!("running filter benchmarks");
-            opt.run().await
-        }
-        ParquetBenchCmd::Sort(opt) => {
-            println!("running sort benchmarks");
-            opt.run().await
-        }
-    }
-}
diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs
deleted file mode 100644
index 3270b082cfb43..0000000000000
--- a/benchmarks/src/bin/tpch.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! tpch binary only entrypoint
-
-use datafusion::error::Result;
-use datafusion_benchmarks::tpch;
-use structopt::StructOpt;
-
-#[cfg(all(feature = "snmalloc", feature = "mimalloc"))]
-compile_error!(
-    "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time"
-);
-
-#[cfg(feature = "snmalloc")]
-#[global_allocator]
-static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
-
-#[cfg(feature = "mimalloc")]
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-#[derive(Debug, StructOpt)]
-#[structopt(about = "benchmark command")]
-enum BenchmarkSubCommandOpt {
-    #[structopt(name = "datafusion")]
-    DataFusionBenchmark(tpch::RunOpt),
-}
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")]
-enum TpchOpt {
-    Benchmark(BenchmarkSubCommandOpt),
-    Convert(tpch::ConvertOpt),
-}
-
-/// 'tpch' entry point, with tortured command line arguments.  Please
-/// use `dbbench` instead.
-///
-/// Note: this is kept to be backwards compatible with the benchmark names prior to
-/// <https://github.com/apache/datafusion/issues/6994>
-#[tokio::main]
-async fn main() -> Result<()> {
-    env_logger::init();
-    match TpchOpt::from_args() {
-        TpchOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => {
-            opt.run().await
-        }
-        TpchOpt::Convert(opt) => opt.run().await,
-    }
-}
diff --git a/benchmarks/src/cancellation.rs b/benchmarks/src/cancellation.rs
index fcf03fbc54550..d3da1b0e83623 100644
--- a/benchmarks/src/cancellation.rs
+++ b/benchmarks/src/cancellation.rs
@@ -24,24 +24,24 @@ use crate::util::{BenchmarkRun, CommonOpt};
 use arrow::array::Array;
 use arrow::datatypes::DataType;
 use arrow::record_batch::RecordBatch;
+use clap::Args;
 use datafusion::common::{Result, ScalarValue};
-use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::file_format::FileFormat;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::{ListingOptions, ListingTableUrl};
-use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::execution::TaskContext;
-use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion::prelude::*;
 use datafusion_common::instant::Instant;
 use futures::TryStreamExt;
 use object_store::ObjectStore;
-use parquet::arrow::async_writer::ParquetObjectWriter;
 use parquet::arrow::AsyncArrowWriter;
+use parquet::arrow::async_writer::ParquetObjectWriter;
+use rand::Rng;
 use rand::distr::Alphanumeric;
 use rand::rngs::ThreadRng;
-use rand::Rng;
-use structopt::StructOpt;
 use tokio::runtime::Runtime;
 use tokio_util::sync::CancellationToken;
 
@@ -57,31 +57,31 @@ use tokio_util::sync::CancellationToken;
 /// The query is an anonymized version of a real-world query, and the
 /// test starts the query then cancels it and reports how long it takes
 /// for the runtime to fully exit.
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
 pub struct RunOpt {
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to folder where data will be generated
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// Path to machine readable output file
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 
     /// Number of files to generate
-    #[structopt(long = "num-files", default_value = "7")]
+    #[arg(long = "num-files", default_value = "7")]
     num_files: usize,
 
     /// Number of rows per file to generate
-    #[structopt(long = "num-rows-per-file", default_value = "5000000")]
+    #[arg(long = "num-rows-per-file", default_value = "5000000")]
     num_rows_per_file: usize,
 
     /// How long to wait, in milliseconds, before attempting to cancel
-    #[structopt(long = "wait-time", default_value = "100")]
+    #[arg(long = "wait-time", default_value = "100")]
     wait_time: u64,
 }
 
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
index 2e934346748e1..70aaeb7d2d192 100644
--- a/benchmarks/src/clickbench.rs
+++ b/benchmarks/src/clickbench.rs
@@ -15,19 +15,31 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::path::Path;
-use std::path::PathBuf;
+use std::fs;
+use std::io::ErrorKind;
+use std::path::{Path, PathBuf};
 
-use crate::util::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
+use clap::Args;
+use datafusion::logical_expr::{ExplainFormat, ExplainOption};
 use datafusion::{
     error::{DataFusionError, Result},
     prelude::SessionContext,
 };
 use datafusion_common::exec_datafusion_err;
 use datafusion_common::instant::Instant;
-use structopt::StructOpt;
 
-/// Run the clickbench benchmark
+/// SQL to create the hits view with proper EventDate casting.
+///
+/// ClickBench stores EventDate as UInt16 (days since 1970-01-01) for
+/// storage efficiency (2 bytes vs 4-8 bytes for date types).
+/// This view transforms it to SQL DATE type for query compatibility.
+const HITS_VIEW_DDL: &str = r#"CREATE VIEW hits AS
+SELECT * EXCEPT ("EventDate"),
+       CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
+FROM hits_raw"#;
+
+/// Driver program to run the ClickBench benchmark
 ///
 /// The ClickBench[1] benchmarks are widely cited in the industry and
 /// focus on grouping / aggregation / filtering. This runner uses the
@@ -35,140 +47,308 @@ use structopt::StructOpt;
 ///
 /// [1]: https://github.com/ClickHouse/ClickBench
 /// [2]: https://github.com/ClickHouse/ClickBench/tree/main/datafusion
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
 pub struct RunOpt {
     /// Query number (between 0 and 42). If not specified, runs all queries
-    #[structopt(short, long)]
-    query: Option<usize>,
+    #[arg(short, long)]
+    pub query: Option<usize>,
+
+    /// If specified, enables Parquet Filter Pushdown.
+    ///
+    /// Specifically, it enables:
+    /// * `pushdown_filters = true`
+    /// * `reorder_filters = true`
+    #[arg(long = "pushdown")]
+    pushdown: bool,
 
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to hits.parquet (single file) or `hits_partitioned`
     /// (partitioned, 100 files)
-    #[structopt(
-        parse(from_os_str),
-        short = "p",
+    #[arg(
+        short = 'p',
         long = "path",
         default_value = "benchmarks/data/hits.parquet"
     )]
     path: PathBuf,
 
-    /// Path to queries.sql (single file)
-    #[structopt(
-        parse(from_os_str),
-        short = "r",
+    /// Path to queries directory
+    #[arg(
+        short = 'r',
         long = "queries-path",
-        default_value = "benchmarks/queries/clickbench/queries.sql"
+        default_value = "benchmarks/queries/clickbench/queries"
     )]
-    queries_path: PathBuf,
+    pub queries_path: PathBuf,
 
     /// If present, write results json here
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
-}
 
-struct AllQueries {
-    queries: Vec<String>,
-}
+    /// Column name that the data is sorted by (e.g., "EventTime")
+    /// If specified, DataFusion will be informed that the data has this sort order
+    /// using CREATE EXTERNAL TABLE with WITH ORDER clause.
+    ///
+    /// Recommended to use with: -c datafusion.optimizer.prefer_existing_sort=true
+    /// This allows DataFusion to optimize away redundant sorts while maintaining
+    /// multi-core parallelism for other operations.
+    #[arg(long = "sorted-by")]
+    sorted_by: Option<String>,
 
-impl AllQueries {
-    fn try_new(path: &Path) -> Result<Self> {
-        // ClickBench has all queries in a single file identified by line number
-        let all_queries = std::fs::read_to_string(path)
-            .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
-        Ok(Self {
-            queries: all_queries.lines().map(|s| s.to_string()).collect(),
-        })
-    }
+    /// Sort order: ASC or DESC (default: ASC)
+    #[arg(long = "sort-order", default_value = "ASC")]
+    sort_order: String,
 
-    /// Returns the text of query `query_id`
-    fn get_query(&self, query_id: usize) -> Result<&str> {
-        self.queries
-            .get(query_id)
-            .ok_or_else(|| {
-                let min_id = self.min_query_id();
-                let max_id = self.max_query_id();
-                exec_datafusion_err!(
-                    "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
-                )
-            })
-            .map(|s| s.as_str())
-    }
+    /// Configuration options in the format key=value
+    /// Can be specified multiple times.
+    ///
+    /// Example: -c datafusion.optimizer.prefer_existing_sort=true
+    #[arg(short = 'c', long = "config")]
+    config_options: Vec<String>,
+}
 
-    fn min_query_id(&self) -> usize {
-        0
-    }
+/// Get the SQL file path
+pub fn get_query_path(query_dir: &Path, query: usize) -> PathBuf {
+    let mut query_path = query_dir.to_path_buf();
+    query_path.push(format!("q{query}.sql"));
+    query_path
+}
 
-    fn max_query_id(&self) -> usize {
-        self.queries.len() - 1
+/// Get the SQL statement from the specified query file
+pub fn get_query_sql(query_path: &Path) -> Result<Option<String>> {
+    if fs::exists(query_path)? {
+        Ok(Some(fs::read_to_string(query_path)?))
+    } else {
+        Ok(None)
     }
 }
+
 impl RunOpt {
     pub async fn run(self) -> Result<()> {
         println!("Running benchmarks with the following options: {self:?}");
-        let queries = AllQueries::try_new(self.queries_path.as_path())?;
+
+        let query_dir_metadata = fs::metadata(&self.queries_path).map_err(|e| {
+            if e.kind() == ErrorKind::NotFound {
+                exec_datafusion_err!(
+                    "Query path '{}' does not exist.",
+                    &self.queries_path.to_str().unwrap()
+                )
+            } else {
+                DataFusionError::External(Box::new(e))
+            }
+        })?;
+
+        if !query_dir_metadata.is_dir() {
+            return Err(exec_datafusion_err!(
+                "Query path '{}' is not a directory.",
+                &self.queries_path.to_str().unwrap()
+            ));
+        }
+
         let query_range = match self.query {
             Some(query_id) => query_id..=query_id,
-            None => queries.min_query_id()..=queries.max_query_id(),
+            None => 0..=usize::MAX,
         };
 
         // configure parquet options
         let mut config = self.common.config()?;
+
+        if self.sorted_by.is_some() {
+            println!("ℹ️  Data is registered with sort order");
+
+            let has_prefer_sort = self
+                .config_options
+                .iter()
+                .any(|opt| opt.contains("prefer_existing_sort=true"));
+
+            if !has_prefer_sort {
+                println!(
+                    "ℹ️  Consider using -c datafusion.optimizer.prefer_existing_sort=true"
+                );
+                println!("ℹ️  to optimize queries while maintaining parallelism");
+            }
+        }
+
+        // Apply user-provided configuration options
+        for config_opt in &self.config_options {
+            let parts: Vec<&str> = config_opt.splitn(2, '=').collect();
+            if parts.len() != 2 {
+                return Err(exec_datafusion_err!(
+                    "Invalid config option format: '{}'. Expected 'key=value'",
+                    config_opt
+                ));
+            }
+            let key = parts[0];
+            let value = parts[1];
+
+            println!("Setting config: {key} = {value}");
+            config = config.set_str(key, value);
+        }
+
         {
             let parquet_options = &mut config.options_mut().execution.parquet;
             // The hits_partitioned dataset specifies string columns
             // as binary due to how it was written. Force it to strings
             parquet_options.binary_as_string = true;
+
+            // Turn on Parquet filter pushdown if requested
+            if self.pushdown {
+                parquet_options.pushdown_filters = true;
+                parquet_options.reorder_filters = true;
+            }
+
+            if self.sorted_by.is_some() {
+                // We should compare the dynamic topk optimization when data is sorted, so we make the
+                // assumption that filter pushdown is also enabled in this case.
+                parquet_options.pushdown_filters = true;
+                parquet_options.reorder_filters = true;
+            }
         }
 
-        let rt_builder = self.common.runtime_env_builder()?;
-        let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+
         self.register_hits(&ctx).await?;
 
-        let iterations = self.common.iterations;
         let mut benchmark_run = BenchmarkRun::new();
         for query_id in query_range {
-            let mut millis = Vec::with_capacity(iterations);
+            let query_path = get_query_path(&self.queries_path, query_id);
+            let Some(sql) = get_query_sql(&query_path)? else {
+                if self.query.is_some() {
+                    return Err(exec_datafusion_err!(
+                        "Could not load query file '{}'.",
+                        &query_path.to_str().unwrap()
+                    ));
+                }
+                break;
+            };
             benchmark_run.start_new_case(&format!("Query {query_id}"));
-            let sql = queries.get_query(query_id)?;
-            println!("Q{query_id}: {sql}");
-
-            for i in 0..iterations {
-                let start = Instant::now();
-                let results = ctx.sql(sql).await?.collect().await?;
-                let elapsed = start.elapsed();
-                let ms = elapsed.as_secs_f64() * 1000.0;
-                millis.push(ms);
-                let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
-                println!(
-                    "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
-                );
-                benchmark_run.write_iter(elapsed, row_count);
-            }
-            if self.common.debug {
-                ctx.sql(sql).await?.explain(false, false)?.show().await?;
+            let query_run = self.benchmark_query(&sql, query_id, &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    benchmark_run.mark_failed();
+                    eprintln!("Query {query_id} failed: {e}");
+                }
             }
-            let avg = millis.iter().sum::<f64>() / millis.len() as f64;
-            println!("Query {query_id} avg time: {avg:.2} ms");
         }
         benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        benchmark_run.maybe_print_failures();
         Ok(())
     }
 
+    async fn benchmark_query(
+        &self,
+        sql: &str,
+        query_id: usize,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        println!("Q{query_id}: {sql}");
+
+        let mut millis = Vec::with_capacity(self.iterations());
+        let mut query_results = vec![];
+        for i in 0..self.iterations() {
+            let start = Instant::now();
+            let results = ctx.sql(sql).await?.collect().await?;
+            let elapsed = start.elapsed();
+            let ms = elapsed.as_secs_f64() * 1000.0;
+            millis.push(ms);
+            let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
+            println!(
+                "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+            );
+            query_results.push(QueryResult { elapsed, row_count })
+        }
+        if self.common.debug {
+            ctx.sql(sql)
+                .await?
+                .explain_with_options(
+                    ExplainOption::default().with_format(ExplainFormat::Tree),
+                )?
+                .show()
+                .await?;
+        }
+        let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+        println!("Query {query_id} avg time: {avg:.2} ms");
+
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
+        Ok(query_results)
+    }
+
     /// Registers the `hits.parquet` as a table named `hits`
+    /// If sorted_by is specified, uses CREATE EXTERNAL TABLE with WITH ORDER
     async fn register_hits(&self, ctx: &SessionContext) -> Result<()> {
-        let options = Default::default();
         let path = self.path.as_os_str().to_str().unwrap();
-        ctx.register_parquet("hits", path, options)
-            .await
-            .map_err(|e| {
-                DataFusionError::Context(
-                    format!("Registering 'hits' as {path}"),
-                    Box::new(e),
-                )
-            })
+
+        // If sorted_by is specified, use CREATE EXTERNAL TABLE with WITH ORDER
+        if let Some(ref sort_column) = self.sorted_by {
+            println!(
+                "Registering table with sort order: {} {}",
+                sort_column, self.sort_order
+            );
+
+            // Escape column name with double quotes
+            let escaped_column = if sort_column.contains('"') {
+                sort_column.clone()
+            } else {
+                format!("\"{sort_column}\"")
+            };
+
+            // Build CREATE EXTERNAL TABLE DDL with WITH ORDER clause
+            // Schema will be automatically inferred from the Parquet file
+            let create_table_sql = format!(
+                "CREATE EXTERNAL TABLE hits_raw \
+                 STORED AS PARQUET \
+                 LOCATION '{}' \
+                 WITH ORDER ({} {})",
+                path,
+                escaped_column,
+                self.sort_order.to_uppercase()
+            );
+
+            println!("Executing: {create_table_sql}");
+
+            // Execute the CREATE EXTERNAL TABLE statement
+            ctx.sql(&create_table_sql).await?.collect().await?;
+        } else {
+            // Original registration without sort order
+            let options = Default::default();
+            ctx.register_parquet("hits_raw", path, options)
+                .await
+                .map_err(|e| {
+                    DataFusionError::Context(
+                        format!("Registering 'hits_raw' as {path}"),
+                        Box::new(e),
+                    )
+                })?;
+        }
+
+        // Create the hits view with EventDate transformation
+        Self::create_hits_view(ctx).await
+    }
+
+    /// Creates the hits view with EventDate transformation from UInt16 to DATE.
+    ///
+    /// ClickBench encodes EventDate as UInt16 days since epoch (1970-01-01).
+    async fn create_hits_view(ctx: &SessionContext) -> Result<()> {
+        ctx.sql(HITS_VIEW_DDL).await?.collect().await.map_err(|e| {
+            DataFusionError::Context(
+                "Creating 'hits' view with EventDate transformation".to_string(),
+                Box::new(e),
+            )
+        })?;
+        Ok(())
+    }
+
+    fn iterations(&self) -> usize {
+        self.common.iterations
     }
 }
diff --git a/benchmarks/src/h2o.rs b/benchmarks/src/h2o.rs
index 23dba07f426da..8b6e04932cb39 100644
--- a/benchmarks/src/h2o.rs
+++ b/benchmarks/src/h2o.rs
@@ -20,41 +20,40 @@
 //! - [H2O AI Benchmark](https://duckdb.org/2023/04/14/h2oai.html)
 //! - [Extended window function benchmark](https://duckdb.org/2024/06/26/benchmarks-over-time.html#window-functions-benchmark)
 
-use crate::util::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt, print_memory_stats};
+use clap::Args;
+use datafusion::logical_expr::{ExplainFormat, ExplainOption};
 use datafusion::{error::Result, prelude::SessionContext};
 use datafusion_common::{
-    exec_datafusion_err, instant::Instant, internal_err, DataFusionError,
+    DataFusionError, TableReference, exec_datafusion_err, instant::Instant, internal_err,
 };
 use std::path::{Path, PathBuf};
-use structopt::StructOpt;
 
 /// Run the H2O benchmark
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
 pub struct RunOpt {
-    #[structopt(short, long)]
-    query: Option<usize>,
+    #[arg(short, long)]
+    pub query: Option<usize>,
 
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to queries.sql (single file)
     /// default value is the groupby.sql file in the h2o benchmark
-    #[structopt(
-        parse(from_os_str),
-        short = "r",
+    #[arg(
+        short = 'r',
         long = "queries-path",
         default_value = "benchmarks/queries/h2o/groupby.sql"
     )]
-    queries_path: PathBuf,
+    pub queries_path: PathBuf,
 
     /// Path to data file (parquet or csv)
     /// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark
     /// This is the small csv file with 10^7 rows
-    #[structopt(
-        parse(from_os_str),
-        short = "p",
+    #[arg(
+        short = 'p',
         long = "path",
         default_value = "benchmarks/data/h2o/G1_1e7_1e7_100_0.csv"
     )]
@@ -63,15 +62,15 @@ pub struct RunOpt {
     /// Path to data files (parquet or csv), using , to separate the paths
     /// Default value is the small files for join x table, small table, medium table, big table files in the h2o benchmark
     /// This is the small csv file case
-    #[structopt(
-        short = "join-paths",
+    #[arg(
+        short = 'j',
         long = "join-paths",
         default_value = "benchmarks/data/h2o/J1_1e7_NA_0.csv,benchmarks/data/h2o/J1_1e7_1e1_0.csv,benchmarks/data/h2o/J1_1e7_1e4_0.csv,benchmarks/data/h2o/J1_1e7_1e7_NA.csv"
     )]
     join_paths: String,
 
     /// If present, write results json here
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 }
 
@@ -85,24 +84,24 @@ impl RunOpt {
         };
 
         let config = self.common.config()?;
-        let rt_builder = self.common.runtime_env_builder()?;
-        let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
 
         // Register tables depending on which h2o benchmark is being run
         // (groupby/join/window)
         if self.queries_path.to_str().unwrap().ends_with("groupby.sql") {
-            self.register_data(&ctx).await?;
+            self.register_data("x", self.path.as_os_str().to_str().unwrap(), &ctx)
+                .await?;
         } else if self.queries_path.to_str().unwrap().ends_with("join.sql") {
             let join_paths: Vec<&str> = self.join_paths.split(',').collect();
             let table_name: Vec<&str> = vec!["x", "small", "medium", "large"];
             for (i, path) in join_paths.iter().enumerate() {
-                ctx.register_csv(table_name[i], path, Default::default())
-                    .await?;
+                self.register_data(table_name[i], path, &ctx).await?;
             }
         } else if self.queries_path.to_str().unwrap().ends_with("window.sql") {
             // Only register the 'large' table in h2o-join dataset
             let h2o_join_large_path = self.join_paths.split(',').nth(3).unwrap();
-            ctx.register_csv("large", h2o_join_large_path, Default::default())
+            self.register_data("large", h2o_join_large_path, &ctx)
                 .await?;
         } else {
             return internal_err!("Invalid query file path");
@@ -131,8 +130,17 @@ impl RunOpt {
             let avg = millis.iter().sum::<f64>() / millis.len() as f64;
             println!("Query {query_id} avg time: {avg:.2} ms");
 
+            // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+            print_memory_stats();
+
             if self.common.debug {
-                ctx.sql(sql).await?.explain(false, false)?.show().await?;
+                ctx.sql(sql)
+                    .await?
+                    .explain_with_options(
+                        ExplainOption::default().with_format(ExplainFormat::Tree),
+                    )?
+                    .show()
+                    .await?;
             }
             benchmark_run.maybe_write_json(self.output_path.as_ref())?;
         }
@@ -140,49 +148,62 @@ impl RunOpt {
         Ok(())
     }
 
-    async fn register_data(&self, ctx: &SessionContext) -> Result<()> {
+    async fn register_data(
+        &self,
+        table_ref: impl Into<TableReference>,
+        table_path: impl AsRef<str>,
+        ctx: &SessionContext,
+    ) -> Result<()> {
         let csv_options = Default::default();
         let parquet_options = Default::default();
-        let path = self.path.as_os_str().to_str().unwrap();
-
-        if self.path.extension().map(|s| s == "csv").unwrap_or(false) {
-            ctx.register_csv("x", path, csv_options)
-                .await
-                .map_err(|e| {
-                    DataFusionError::Context(
-                        format!("Registering 'table' as {path}"),
-                        Box::new(e),
-                    )
-                })
-                .expect("error registering csv");
-        }
 
-        if self
-            .path
+        let table_path_str = table_path.as_ref();
+
+        let extension = Path::new(table_path_str)
             .extension()
-            .map(|s| s == "parquet")
-            .unwrap_or(false)
-        {
-            ctx.register_parquet("x", path, parquet_options)
-                .await
-                .map_err(|e| {
-                    DataFusionError::Context(
-                        format!("Registering 'table' as {path}"),
-                        Box::new(e),
-                    )
-                })
-                .expect("error registering parquet");
+            .and_then(|s| s.to_str())
+            .unwrap_or("");
+
+        match extension {
+            "csv" => {
+                ctx.register_csv(table_ref, table_path_str, csv_options)
+                    .await
+                    .map_err(|e| {
+                        DataFusionError::Context(
+                            format!("Registering 'table' as {table_path_str}"),
+                            Box::new(e),
+                        )
+                    })
+                    .expect("error registering csv");
+            }
+            "parquet" => {
+                ctx.register_parquet(table_ref, table_path_str, parquet_options)
+                    .await
+                    .map_err(|e| {
+                        DataFusionError::Context(
+                            format!("Registering 'table' as {table_path_str}"),
+                            Box::new(e),
+                        )
+                    })
+                    .expect("error registering parquet");
+            }
+            _ => {
+                return Err(DataFusionError::Plan(format!(
+                    "Unsupported file extension: {extension}",
+                )));
+            }
         }
+
         Ok(())
     }
 }
 
-struct AllQueries {
+pub struct AllQueries {
     queries: Vec<String>,
 }
 
 impl AllQueries {
-    fn try_new(path: &Path) -> Result<Self> {
+    pub fn try_new(path: &Path) -> Result<Self> {
         let all_queries = std::fs::read_to_string(path)
             .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
 
@@ -192,7 +213,7 @@ impl AllQueries {
     }
 
     /// Returns the text of query `query_id`
-    fn get_query(&self, query_id: usize) -> Result<&str> {
+    pub fn get_query(&self, query_id: usize) -> Result<&str> {
         self.queries
             .get(query_id - 1)
             .ok_or_else(|| {
@@ -205,11 +226,11 @@ impl AllQueries {
             .map(|s| s.as_str())
     }
 
-    fn min_query_id(&self) -> usize {
+    pub fn min_query_id(&self) -> usize {
         1
     }
 
-    fn max_query_id(&self) -> usize {
+    pub fn max_query_id(&self) -> usize {
         self.queries.len()
     }
 }
diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs
new file mode 100644
index 0000000000000..301fe0d599cd6
--- /dev/null
+++ b/benchmarks/src/hj.rs
@@ -0,0 +1,441 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use clap::Args;
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+use std::path::PathBuf;
+
+use futures::StreamExt;
+
+// TODO: Add existence joins
+
+/// Run the Hash Join benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of Hash Joins.
+/// It uses simple equality predicates to ensure a hash join is selected.
+/// Where we vary selectivity, we do so with additional cheap predicates that
+/// do not change the join key (so the physical operator remains HashJoin).
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number. If not specified, runs all queries
+    #[arg(short, long)]
+    query: Option<usize>,
+
+    /// Common options (iterations, batch size, target_partitions, etc.)
+    #[command(flatten)]
+    common: CommonOpt,
+
+    /// Path to TPC-H SF10 data
+    #[arg(short = 'p', long = "path")]
+    path: Option<PathBuf>,
+
+    /// If present, write results json here
+    #[arg(short = 'o', long = "output")]
+    output_path: Option<PathBuf>,
+}
+
+struct HashJoinQuery {
+    sql: &'static str,
+    density: f64,
+    prob_hit: f64,
+    build_size: &'static str,
+    probe_size: &'static str,
+}
+
+/// Inline SQL queries for Hash Join benchmarks
+const HASH_QUERIES: &[HashJoinQuery] = &[
+    // Q1: Very Small Build Side (Dense)
+    // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows)
+    HashJoinQuery {
+        sql: r###"SELECT n_nationkey FROM nation JOIN customer ON c_nationkey = n_nationkey"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "25",
+        probe_size: "1.5M",
+    },
+    // Q2: Very Small Build Side (Sparse, range < 1024)
+    // Build Side: nation (25 rows, range 961) | Probe Side: customer (1.5M rows)
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+    FROM (
+      SELECT c_nationkey * 40 as k
+      FROM customer
+    ) l
+    JOIN (
+      SELECT n_nationkey * 40 as k FROM nation
+    ) s ON l.k = s.k"###,
+        density: 0.026,
+        prob_hit: 1.0,
+        build_size: "25",
+        probe_size: "1.5M",
+    },
+    // Q3: 100% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT s_suppkey FROM supplier JOIN lineitem ON s_suppkey = l_suppkey"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q4: 100% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 1.0,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q5: 75% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT l_suppkey * 4 / 3 as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 4 / 3 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.75,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q6: 75% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN l_suppkey * 4 / 3
+                      WHEN l_suppkey % 10 < 9 THEN (l_suppkey * 4 / 3 / 4) * 4 + 3
+                      ELSE l_suppkey * 4 / 3 + 1000000         
+                 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 4 / 3 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.75,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q7: 50% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT l_suppkey * 2 as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 2 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.5,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q8: 50% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2
+                      WHEN l_suppkey % 10 < 9 THEN l_suppkey * 2 + 1
+                      ELSE l_suppkey * 2 + 1000000
+                 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 2 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.5,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q9: 20% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT l_suppkey * 5 as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 5 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.2,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q10: 20% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN l_suppkey * 5
+                      WHEN l_suppkey % 10 < 9 THEN l_suppkey * 5 + 1
+                      ELSE l_suppkey * 5 + 1000000
+                 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 5 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.2,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q11: 10% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT l_suppkey * 10 as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 10 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.1,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q12: 10% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10
+                      WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1
+                      ELSE l_suppkey * 10 + 1000000
+                 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 10 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.1,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q13: 1% Density, 100% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT l_suppkey * 100 as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT s_suppkey * 100 as k FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.01,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q14: 1% Density, 10% Hit rate
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN l_suppkey * 100
+                      WHEN l_suppkey % 10 < 9 THEN l_suppkey * 100 + 1
+                      ELSE l_suppkey * 100 + 11000000                  -- oob
+                 END as k
+          FROM lineitem
+        ) l
+            JOIN (
+              SELECT s_suppkey * 100 as k FROM supplier
+            ) s ON l.k = s.k"###,
+        density: 0.01,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M",
+    },
+    // Q15: 20% Density, 10% Hit rate, 20% Duplicates in Build Side
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (
+          SELECT CASE 
+                      WHEN l_suppkey % 10 = 0 THEN ((l_suppkey % 80000) + 1) * 25 / 4
+                      ELSE ((l_suppkey % 80000) + 1) * 25 / 4 + 1
+                 END as k
+          FROM lineitem
+        ) l
+        JOIN (
+          SELECT CASE 
+                      WHEN s_suppkey <= 80000 THEN (s_suppkey * 25) / 4 
+                      ELSE ((s_suppkey - 80000) * 25) / 4 
+                 END as k 
+          FROM supplier
+        ) s ON l.k = s.k"###,
+        density: 0.2,
+        prob_hit: 0.1,
+        build_size: "100K_(20%_dups)",
+        probe_size: "60M",
+    },
+];
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        println!("Running Hash Join benchmarks with the following options: {self:#?}\n");
+
+        let query_range = match self.query {
+            Some(query_id) => {
+                if query_id >= 1 && query_id <= HASH_QUERIES.len() {
+                    query_id..=query_id
+                } else {
+                    return exec_err!(
+                        "Query {query_id} not found. Available queries: 1 to {}",
+                        HASH_QUERIES.len()
+                    );
+                }
+            }
+            None => 1..=HASH_QUERIES.len(),
+        };
+
+        let config = self.common.config()?;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+
+        if let Some(path) = &self.path {
+            for table in &["lineitem", "supplier", "nation", "customer"] {
+                let table_path = path.join(table);
+                if !table_path.exists() {
+                    return exec_err!(
+                        "TPC-H table {} not found at {:?}",
+                        table,
+                        table_path
+                    );
+                }
+                ctx.register_parquet(
+                    *table,
+                    table_path.to_str().unwrap(),
+                    Default::default(),
+                )
+                .await?;
+            }
+        }
+
+        let mut benchmark_run = BenchmarkRun::new();
+
+        for query_id in query_range {
+            let query_index = query_id - 1;
+            let query = &HASH_QUERIES[query_index];
+
+            let case_name = format!(
+                "Query {}_density={}_prob_hit={}_{}*{}",
+                query_id,
+                query.density,
+                query.prob_hit,
+                query.build_size,
+                query.probe_size
+            );
+            benchmark_run.start_new_case(&case_name);
+
+            let query_run = self
+                .benchmark_query(query.sql, &query_id.to_string(), &ctx)
+                .await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    return Err(DataFusionError::Context(
+                        format!("Hash Join benchmark Q{query_id} failed with error:"),
+                        Box::new(e),
+                    ));
+                }
+            }
+        }
+
+        benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        Ok(())
+    }
+
+    /// Validates that the physical plan uses a HashJoin, then executes.
+    async fn benchmark_query(
+        &self,
+        sql: &str,
+        query_name: &str,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        let mut query_results = vec![];
+
+        // Build/validate plan
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let plan_string = format!("{physical_plan:#?}");
+
+        if !plan_string.contains("HashJoinExec") {
+            return Err(exec_datafusion_err!(
+                "Query {query_name} does not use Hash Join. Physical plan: {plan_string}"
+            ));
+        }
+
+        // Execute without buffering
+        for i in 0..self.common.iterations {
+            let start = Instant::now();
+            let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?;
+            let elapsed = start.elapsed();
+
+            println!(
+                "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}"
+            );
+
+            query_results.push(QueryResult { elapsed, row_count });
+        }
+
+        Ok(query_results)
+    }
+
+    /// Executes the SQL query and drops each batch to avoid result buffering.
+    async fn execute_sql_without_result_buffering(
+        sql: &str,
+        ctx: &SessionContext,
+    ) -> Result<usize> {
+        let mut row_count = 0;
+
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let mut stream = execute_stream(physical_plan, ctx.task_ctx())?;
+
+        while let Some(batch) = stream.next().await {
+            row_count += batch?.num_rows();
+            // Drop batches immediately to minimize memory pressure
+        }
+
+        Ok(row_count)
+    }
+}
diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs
index e7949aa715c23..aaed186da4905 100644
--- a/benchmarks/src/imdb/convert.rs
+++ b/benchmarks/src/imdb/convert.rs
@@ -20,31 +20,31 @@ use datafusion::logical_expr::select_expr::SelectExpr;
 use datafusion_common::instant::Instant;
 use std::path::PathBuf;
 
+use clap::Args;
 use datafusion::error::Result;
 use datafusion::prelude::*;
-use structopt::StructOpt;
 
 use datafusion::common::not_impl_err;
 
-use super::get_imdb_table_schema;
 use super::IMDB_TABLES;
+use super::get_imdb_table_schema;
 
-#[derive(Debug, StructOpt)]
+#[derive(Debug, Args)]
 pub struct ConvertOpt {
     /// Path to csv files
-    #[structopt(parse(from_os_str), required = true, short = "i", long = "input")]
+    #[arg(required = true, short = 'i', long = "input")]
     input_path: PathBuf,
 
     /// Output path
-    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
+    #[arg(required = true, short = 'o', long = "output")]
     output_path: PathBuf,
 
     /// Output file format: `csv` or `parquet`
-    #[structopt(short = "f", long = "format")]
+    #[arg(short = 'f', long = "format")]
     file_format: String,
 
     /// Batch size when reading CSV or Parquet files
-    #[structopt(short = "s", long = "batch-size", default_value = "8192")]
+    #[arg(short = 's', long = "batch-size", default_value = "8192")]
     batch_size: usize,
 }
 
diff --git a/benchmarks/src/imdb/mod.rs b/benchmarks/src/imdb/mod.rs
index 6a45242e6ff4b..87462bc3e81ba 100644
--- a/benchmarks/src/imdb/mod.rs
+++ b/benchmarks/src/imdb/mod.rs
@@ -54,6 +54,9 @@ pub const IMDB_TABLES: &[&str] = &[
     "person_info",
 ];
 
+pub const IMDB_QUERY_START_ID: usize = 1;
+pub const IMDB_QUERY_END_ID: usize = 113;
+
 /// Get the schema for the IMDB dataset tables
 /// see benchmarks/data/imdb/schematext.sql
 pub fn get_imdb_table_schema(table: &str) -> Schema {
diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs
index 61dcc07ebd639..ca9710a920517 100644
--- a/benchmarks/src/imdb/run.rs
+++ b/benchmarks/src/imdb/run.rs
@@ -18,14 +18,17 @@
 use std::path::PathBuf;
 use std::sync::Arc;
 
-use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
-use crate::util::{BenchmarkRun, CommonOpt};
+use super::{
+    IMDB_QUERY_END_ID, IMDB_QUERY_START_ID, IMDB_TABLES, get_imdb_table_schema,
+    get_query_sql,
+};
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
 
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::{self, pretty_format_batches};
+use datafusion::datasource::file_format::FileFormat;
 use datafusion::datasource::file_format::csv::CsvFormat;
 use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::file_format::FileFormat;
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
 };
@@ -38,8 +41,8 @@ use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
 use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
 
+use clap::Args;
 use log::info;
-use structopt::StructOpt;
 
 // hack to avoid `default_value is meaningless for bool` errors
 type BoolDefaultTrue = bool;
@@ -51,48 +54,49 @@ type BoolDefaultTrue = bool;
 /// [2] and [3].
 ///
 /// [1]: https://www.vldb.org/pvldb/vol9/p204-leis.pdf
-/// [2]: http://homepages.cwi.nl/~boncz/job/imdb.tgz
+/// [2]: https://event.cwi.nl/da/job/imdb.tgz
 /// [3]: https://db.in.tum.de/~leis/qo/job.tgz
 
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
 pub struct RunOpt {
     /// Query number. If not specified, runs all queries
-    #[structopt(short, long)]
-    query: Option<usize>,
+    #[arg(short, long)]
+    pub query: Option<usize>,
 
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to data files
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// File format: `csv` or `parquet`
-    #[structopt(short = "f", long = "format", default_value = "csv")]
+    #[arg(short = 'f', long = "format", default_value = "csv")]
     file_format: String,
 
     /// Load the data into a MemTable before executing the query
-    #[structopt(short = "m", long = "mem-table")]
+    #[arg(short = 'm', long = "mem-table")]
     mem_table: bool,
 
     /// Path to machine readable output file
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 
     /// Whether to disable collection of statistics (and cost based optimizations) or not.
-    #[structopt(short = "S", long = "disable-statistics")]
+    #[arg(short = 'S', long = "disable-statistics")]
     disable_statistics: bool,
 
     /// If true then hash join used, if false then sort merge join
     /// True by default.
-    #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
+    #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")]
     prefer_hash_join: BoolDefaultTrue,
-}
 
-const IMDB_QUERY_START_ID: usize = 1;
-const IMDB_QUERY_END_ID: usize = 113;
+    /// How many bytes to buffer on the probe side of hash joins.
+    #[arg(long, default_value = "0")]
+    hash_join_buffering_capacity: usize,
+}
 
 fn map_query_id_to_str(query_id: usize) -> &'static str {
     match query_id {
@@ -306,8 +310,10 @@ impl RunOpt {
             .config()?
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
-        let rt_builder = self.common.runtime_env_builder()?;
-        let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
+        config.options_mut().execution.hash_join_buffering_capacity =
+            self.hash_join_buffering_capacity;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
 
         // register tables
         self.register_tables(&ctx).await?;
@@ -341,6 +347,9 @@ impl RunOpt {
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
         println!("Query {query_id} avg time: {avg:.2} ms");
 
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
         Ok(query_results)
     }
 
@@ -475,11 +484,6 @@ impl RunOpt {
     }
 }
 
-struct QueryResult {
-    elapsed: std::time::Duration,
-    row_count: usize,
-}
-
 #[cfg(test)]
 // Only run with "ci" mode when we have the data
 #[cfg(feature = "ci")]
@@ -519,6 +523,7 @@ mod tests {
             memory_limit: None,
             sort_spill_reservation_bytes: None,
             debug: false,
+            simulate_latency: false,
         };
         let opt = RunOpt {
             query: Some(query),
@@ -529,6 +534,7 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            hash_join_buffering_capacity: 0,
         };
         opt.register_tables(&ctx).await?;
         let queries = get_query_sql(map_query_id_to_str(query))?;
@@ -536,7 +542,7 @@ mod tests {
             let plan = ctx.sql(&query).await?;
             let plan = plan.into_optimized_plan()?;
             let bytes = logical_plan_to_bytes(&plan)?;
-            let plan2 = logical_plan_from_bytes(&bytes, &ctx)?;
+            let plan2 = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
             let plan_formatted = format!("{}", plan.display_indent());
             let plan2_formatted = format!("{}", plan2.display_indent());
             assert_eq!(plan_formatted, plan2_formatted);
@@ -555,6 +561,7 @@ mod tests {
             memory_limit: None,
             sort_spill_reservation_bytes: None,
             debug: false,
+            simulate_latency: false,
         };
         let opt = RunOpt {
             query: Some(query),
@@ -565,6 +572,7 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            hash_join_buffering_capacity: 0,
         };
         opt.register_tables(&ctx).await?;
         let queries = get_query_sql(map_query_id_to_str(query))?;
@@ -572,7 +580,7 @@ mod tests {
             let plan = ctx.sql(&query).await?;
             let plan = plan.create_physical_plan().await?;
             let bytes = physical_plan_to_bytes(plan.clone())?;
-            let plan2 = physical_plan_from_bytes(&bytes, &ctx)?;
+            let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
             let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false));
             let plan2_formatted =
                 format!("{}", displayable(plan2.as_ref()).indent(false));
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
index a402fc1b8ce04..a3bc221840ada 100644
--- a/benchmarks/src/lib.rs
+++ b/benchmarks/src/lib.rs
@@ -19,9 +19,11 @@
 pub mod cancellation;
 pub mod clickbench;
 pub mod h2o;
+pub mod hj;
 pub mod imdb;
-pub mod parquet_filter;
-pub mod sort;
+pub mod nlj;
+pub mod smj;
 pub mod sort_tpch;
+pub mod tpcds;
 pub mod tpch;
 pub mod util;
diff --git a/benchmarks/src/nlj.rs b/benchmarks/src/nlj.rs
new file mode 100644
index 0000000000000..361cc35ec200c
--- /dev/null
+++ b/benchmarks/src/nlj.rs
@@ -0,0 +1,303 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use clap::Args;
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+
+use futures::StreamExt;
+
+/// Run the Nested Loop Join (NLJ) benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of NLJs.
+///
+/// It always tries to use fast scanners (without decoding overhead) and
+/// efficient predicate expressions to ensure it can reflect the performance
+/// of the NLJ operator itself.
+///
+/// In this micro-benchmark, the following workload characteristics will be
+/// varied:
+/// - Join type: Inner/Left/Right/Full (all for the NestedLoopJoin physical
+///   operator)
+///   TODO: Include special join types (Semi/Anti/Mark joins)
+/// - Input size: Different combinations of left (build) side and right (probe)
+///   side sizes
+/// - Selectivity of join filters
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 10). If not specified, runs all queries
+    #[arg(short, long)]
+    query: Option<usize>,
+
+    /// Common options
+    #[command(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[arg(short = 'o', long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for NLJ benchmarks
+///
+/// Each query's comment includes:
+///   - Left (build) side row count × Right (probe) side row count
+///   - Join predicate selectivity (1% means the output size is 1% * input size)
+const NLJ_QUERIES: &[&str] = &[
+    // Q1: INNER 10K x 10K | LOW 0.1%
+    r#"
+        SELECT *
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q2: INNER 10K x 10K | Medium 20%
+    r#"
+        SELECT *
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+        ON (t1.value + t2.value) % 5 = 0;
+    "#,
+    // Q3: INNER 10K x 10K | High 90%
+    r#"
+        SELECT *
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+        ON (t1.value + t2.value) % 10 <> 0;
+    "#,
+    // Q4: INNER 30K x 30K | Medium 20%
+    r#"
+        SELECT *
+        FROM range(30000) AS t1
+        JOIN range(30000) AS t2
+        ON (t1.value + t2.value) % 5 = 0;
+    "#,
+    // Q5: INNER 10K x 200K | LOW 0.1% (small to large)
+    r#"
+        SELECT *
+        FROM range(10000) AS t1
+        JOIN range(200000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q6: INNER 200K x 10K | LOW 0.1% (large to small)
+    r#"
+        SELECT *
+        FROM range(200000) AS t1
+        JOIN range(10000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q7: RIGHT OUTER 10K x 200K | LOW 0.1%
+    r#"
+        SELECT *
+        FROM range(10000) AS t1
+        RIGHT JOIN range(200000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q8: LEFT OUTER 200K x 10K | LOW 0.1%
+    r#"
+        SELECT *
+        FROM range(200000) AS t1
+        LEFT JOIN range(10000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q9: FULL OUTER 30K x 30K | LOW 0.1%
+    r#"
+        SELECT *
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+        ON (t1.value + t2.value) % 1000 = 0;
+    "#,
+    // Q10: FULL OUTER 30K x 30K | High 90%
+    r#"
+        SELECT *
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+        ON (t1.value + t2.value) % 10 <> 0;
+    "#,
+    // Q11: INNER 30K x 30K | MEDIUM 50% | cheap predicate
+    r#"
+        SELECT *
+        FROM range(30000) AS t1
+        INNER JOIN range(30000) AS t2
+        ON (t1.value > t2.value);
+    "#,
+    // Q12: FULL OUTER 30K x 30K | MEDIUM 50% | cheap predicate
+    r#"
+        SELECT *
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+        ON (t1.value > t2.value);
+    "#,
+    // Q13: LEFT SEMI 30K x 30K | HIGH 99.9%
+    r#"
+        SELECT t1.*
+        FROM range(30000) AS t1
+        LEFT SEMI JOIN range(30000) AS t2
+        ON t1.value < t2.value;
+    "#,
+    // Q14: LEFT ANTI 30K x 30K | LOW 0.003%
+    r#"
+        SELECT t1.*
+        FROM range(30000) AS t1
+        LEFT ANTI JOIN range(30000) AS t2
+        ON t1.value < t2.value;
+    "#,
+    // Q15: RIGHT SEMI 30K x 30K | HIGH 99.9%
+    r#"
+        SELECT t1.*
+        FROM range(30000) AS t2
+        RIGHT SEMI JOIN range(30000) AS t1
+        ON t2.value < t1.value;
+    "#,
+    // Q16: RIGHT ANTI 30K x 30K | LOW 0.003%
+    r#"
+        SELECT t1.*
+        FROM range(30000) AS t2
+        RIGHT ANTI JOIN range(30000) AS t1
+        ON t2.value < t1.value;
+    "#,
+    // Q17: LEFT MARK | HIGH 99.9%
+    r#"
+        SELECT *
+        FROM range(30000) AS t2(k2)
+        WHERE k2 > 0
+        OR EXISTS (
+            SELECT 1
+            FROM range(30000) AS t1(k1)
+            WHERE t2.k2 > t1.k1
+        );
+    "#,
+];
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        println!("Running NLJ benchmarks with the following options: {self:#?}\n");
+
+        // Define query range
+        let query_range = match self.query {
+            Some(query_id) => {
+                if query_id >= 1 && query_id <= NLJ_QUERIES.len() {
+                    query_id..=query_id
+                } else {
+                    return exec_err!(
+                        "Query {query_id} not found. Available queries: 1 to {}",
+                        NLJ_QUERIES.len()
+                    );
+                }
+            }
+            None => 1..=NLJ_QUERIES.len(),
+        };
+
+        let config = self.common.config()?;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+
+        let mut benchmark_run = BenchmarkRun::new();
+        for query_id in query_range {
+            let query_index = query_id - 1; // Convert 1-based to 0-based index
+
+            let sql = NLJ_QUERIES[query_index];
+            benchmark_run.start_new_case(&format!("Query {query_id}"));
+            let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    return Err(DataFusionError::Context(
+                        "NLJ benchmark Q{query_id} failed with error:".to_string(),
+                        Box::new(e),
+                    ));
+                }
+            }
+        }
+
+        benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        Ok(())
+    }
+
+    /// Validates that the query's physical plan uses a NestedLoopJoin (NLJ),
+    /// then executes the query and collects execution times.
+    ///
+    /// TODO: ensure the optimizer won't change the join order (it's not at
+    /// v48.0.0).
+    async fn benchmark_query(
+        &self,
+        sql: &str,
+        query_name: &str,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        let mut query_results = vec![];
+
+        // Validate that the query plan includes a Nested Loop Join
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let plan_string = format!("{physical_plan:#?}");
+
+        if !plan_string.contains("NestedLoopJoinExec") {
+            return Err(exec_datafusion_err!(
+                "Query {query_name} does not use Nested Loop Join. Physical plan: {plan_string}"
+            ));
+        }
+
+        for i in 0..self.common.iterations {
+            let start = Instant::now();
+
+            let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?;
+
+            let elapsed = start.elapsed();
+
+            println!(
+                "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}"
+            );
+
+            query_results.push(QueryResult { elapsed, row_count });
+        }
+
+        Ok(query_results)
+    }
+
+    /// Executes the SQL query and drops each result batch after evaluation, to
+    /// minimizes memory usage by not buffering results.
+    ///
+    /// Returns the total result row count
+    async fn execute_sql_without_result_buffering(
+        sql: &str,
+        ctx: &SessionContext,
+    ) -> Result<usize> {
+        let mut row_count = 0;
+
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let mut stream = execute_stream(physical_plan, ctx.task_ctx())?;
+
+        while let Some(batch) = stream.next().await {
+            row_count += batch?.num_rows();
+
+            // Evaluate the result and do nothing, the result will be dropped
+            // to reduce memory pressure
+        }
+
+        Ok(row_count)
+    }
+}
diff --git a/benchmarks/src/parquet_filter.rs b/benchmarks/src/parquet_filter.rs
deleted file mode 100644
index 34103af0ffd21..0000000000000
--- a/benchmarks/src/parquet_filter.rs
+++ /dev/null
@@ -1,194 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::path::PathBuf;
-
-use crate::util::{AccessLogOpt, BenchmarkRun, CommonOpt};
-
-use arrow::util::pretty;
-use datafusion::common::Result;
-use datafusion::logical_expr::utils::disjunction;
-use datafusion::logical_expr::{lit, or, Expr};
-use datafusion::physical_plan::collect;
-use datafusion::prelude::{col, SessionContext};
-use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile};
-use datafusion_common::instant::Instant;
-
-use structopt::StructOpt;
-
-/// Test performance of parquet filter pushdown
-///
-/// The queries are executed on a synthetic dataset generated during
-/// the benchmark execution and designed to simulate web server access
-/// logs.
-///
-/// Example
-///
-/// dfbench parquet-filter  --path ./data --scale-factor 1.0
-///
-/// generates the synthetic dataset at `./data/logs.parquet`. The size
-/// of the dataset can be controlled through the `size_factor`
-/// (with the default value of `1.0` generating a ~1GB parquet file).
-///
-/// For each filter we will run the query using different
-/// `ParquetScanOption` settings.
-///
-/// Example output:
-///
-/// Running benchmarks with the following options: Opt { debug: false, iterations: 3, partitions: 2, path: "./data", batch_size: 8192, scale_factor: 1.0 }
-/// Generated test dataset with 10699521 rows
-/// Executing with filter 'request_method = Utf8("GET")'
-/// Using scan options ParquetScanOptions { pushdown_filters: false, reorder_predicates: false, enable_page_index: false }
-/// Iteration 0 returned 10699521 rows in 1303 ms
-/// Iteration 1 returned 10699521 rows in 1288 ms
-/// Iteration 2 returned 10699521 rows in 1266 ms
-/// Using scan options ParquetScanOptions { pushdown_filters: true, reorder_predicates: true, enable_page_index: true }
-/// Iteration 0 returned 1781686 rows in 1970 ms
-/// Iteration 1 returned 1781686 rows in 2002 ms
-/// Iteration 2 returned 1781686 rows in 1988 ms
-/// Using scan options ParquetScanOptions { pushdown_filters: true, reorder_predicates: false, enable_page_index: true }
-/// Iteration 0 returned 1781686 rows in 1940 ms
-/// Iteration 1 returned 1781686 rows in 1986 ms
-/// Iteration 2 returned 1781686 rows in 1947 ms
-/// ...
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
-pub struct RunOpt {
-    /// Common options
-    #[structopt(flatten)]
-    common: CommonOpt,
-
-    /// Create data files
-    #[structopt(flatten)]
-    access_log: AccessLogOpt,
-
-    /// Path to machine readable output file
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
-    output_path: Option<PathBuf>,
-}
-
-impl RunOpt {
-    pub async fn run(self) -> Result<()> {
-        let test_file = self.access_log.build()?;
-
-        let mut rundata = BenchmarkRun::new();
-        let scan_options_matrix = vec![
-            ParquetScanOptions {
-                pushdown_filters: false,
-                reorder_filters: false,
-                enable_page_index: false,
-            },
-            ParquetScanOptions {
-                pushdown_filters: true,
-                reorder_filters: true,
-                enable_page_index: true,
-            },
-            ParquetScanOptions {
-                pushdown_filters: true,
-                reorder_filters: true,
-                enable_page_index: false,
-            },
-        ];
-
-        let filter_matrix = vec![
-            ("Selective-ish filter", col("request_method").eq(lit("GET"))),
-            (
-                "Non-selective filter",
-                col("request_method").not_eq(lit("GET")),
-            ),
-            (
-                "Basic conjunction",
-                col("request_method")
-                    .eq(lit("POST"))
-                    .and(col("response_status").eq(lit(503_u16))),
-            ),
-            (
-                "Nested filters",
-                col("request_method").eq(lit("POST")).and(or(
-                    col("response_status").eq(lit(503_u16)),
-                    col("response_status").eq(lit(403_u16)),
-                )),
-            ),
-            (
-                "Many filters",
-                disjunction([
-                    col("request_method").not_eq(lit("GET")),
-                    col("response_status").eq(lit(400_u16)),
-                    col("service").eq(lit("backend")),
-                ])
-                .unwrap(),
-            ),
-            ("Filter everything", col("response_status").eq(lit(429_u16))),
-            ("Filter nothing", col("response_status").gt(lit(0_u16))),
-        ];
-
-        for (name, filter_expr) in &filter_matrix {
-            println!("Executing '{name}' (filter: {filter_expr})");
-            for scan_options in &scan_options_matrix {
-                println!("Using scan options {scan_options:?}");
-                rundata.start_new_case(&format!(
-                    "{name}: {}",
-                    parquet_scan_disp(scan_options)
-                ));
-                for i in 0..self.common.iterations {
-                    let config = self.common.update_config(scan_options.config());
-                    let ctx = SessionContext::new_with_config(config);
-
-                    let (rows, elapsed) = exec_scan(
-                        &ctx,
-                        &test_file,
-                        filter_expr.clone(),
-                        self.common.debug,
-                    )
-                    .await?;
-                    let ms = elapsed.as_secs_f64() * 1000.0;
-                    println!("Iteration {i} returned {rows} rows in {ms} ms");
-                    rundata.write_iter(elapsed, rows);
-                }
-            }
-            println!("\n");
-        }
-        rundata.maybe_write_json(self.output_path.as_ref())?;
-        Ok(())
-    }
-}
-
-fn parquet_scan_disp(opts: &ParquetScanOptions) -> String {
-    format!(
-        "pushdown_filters={}, reorder_filters={}, page_index={}",
-        opts.pushdown_filters, opts.reorder_filters, opts.enable_page_index
-    )
-}
-
-async fn exec_scan(
-    ctx: &SessionContext,
-    test_file: &TestParquetFile,
-    filter: Expr,
-    debug: bool,
-) -> Result<(usize, std::time::Duration)> {
-    let start = Instant::now();
-    let exec = test_file.create_scan(ctx, Some(filter)).await?;
-
-    let task_ctx = ctx.task_ctx();
-    let result = collect(exec, task_ctx).await?;
-    let elapsed = start.elapsed();
-    if debug {
-        pretty::print_batches(&result)?;
-    }
-    let rows = result.iter().map(|b| b.num_rows()).sum();
-    Ok((rows, elapsed))
-}
diff --git a/benchmarks/src/smj.rs b/benchmarks/src/smj.rs
new file mode 100644
index 0000000000000..5056fd5096156
--- /dev/null
+++ b/benchmarks/src/smj.rs
@@ -0,0 +1,524 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use clap::Args;
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+
+use futures::StreamExt;
+
+/// Run the Sort Merge Join (SMJ) benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of SMJs.
+///
+/// It uses equality join predicates (to ensure SMJ is selected) and varies:
+/// - Join type: Inner/Left/Right/Full/LeftSemi/LeftAnti/RightSemi/RightAnti
+/// - Key cardinality: 1:1, 1:N, N:M relationships
+/// - Filter selectivity: Low (1%), Medium (10%), High (50%)
+/// - Input sizes: Small to large, balanced and skewed
+///
+/// All inputs are pre-sorted in CTEs before the join to isolate join
+/// performance from sort overhead.
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 20). If not specified, runs all queries
+    #[arg(short, long)]
+    query: Option<usize>,
+
+    /// Common options
+    #[command(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[arg(short = 'o', long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for SMJ benchmarks
+///
+/// Each query's comment includes:
+///   - Join type
+///   - Left row count × Right row count
+///   - Key cardinality (rows per key)
+///   - Filter selectivity (if applicable)
+const SMJ_QUERIES: &[&str] = &[
+    // Q1: INNER 100K x 100K | 1:1
+    r#"
+        WITH t1_sorted AS (
+            SELECT value as key FROM range(100000) ORDER BY value
+        ),
+        t2_sorted AS (
+            SELECT value as key FROM range(100000) ORDER BY value
+        )
+        SELECT t1_sorted.key as k1, t2_sorted.key as k2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+    "#,
+    // Q2: INNER 100K x 1M | 1:10
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+    "#,
+    // Q3: INNER 1M x 1M | 1:100
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+    "#,
+    // Q4: INNER 100K x 1M | 1:10 | 1%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        WHERE t2_sorted.data % 100 = 0
+    "#,
+    // Q5: INNER 1M x 1M | 1:100 | 10%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        WHERE t1_sorted.data <> t2_sorted.data AND t2_sorted.data % 10 = 0
+    "#,
+    // Q6: LEFT 100K x 1M | 1:10
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10500 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+    "#,
+    // Q7: LEFT 100K x 1M | 1:10 | 50%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        WHERE t2_sorted.data IS NULL OR t2_sorted.data % 2 = 0
+    "#,
+    // Q8: FULL 100K x 100K | 1:10
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 12500 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key as k1, t1_sorted.data as d1,
+               t2_sorted.key as k2, t2_sorted.data as d2
+        FROM t1_sorted FULL JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+    "#,
+    // Q9: FULL 100K x 1M | 1:10 | 10%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key as k1, t1_sorted.data as d1,
+               t2_sorted.key as k2, t2_sorted.data as d2
+        FROM t1_sorted FULL JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        WHERE (t1_sorted.data IS NULL OR t2_sorted.data IS NULL
+               OR t1_sorted.data <> t2_sorted.data)
+          AND (t1_sorted.data IS NULL OR t1_sorted.data % 10 = 0)
+    "#,
+    // Q10: LEFT SEMI 100K x 1M | 1:10
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key
+            FROM range(1000000)
+            ORDER BY key
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+        )
+    "#,
+    // Q11: LEFT SEMI 100K x 1M | 1:10 | 1%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+              AND t2_sorted.data <> t1_sorted.data
+              AND t2_sorted.data % 100 = 0
+        )
+    "#,
+    // Q12: LEFT SEMI 100K x 1M | 1:10 | 50%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+              AND t2_sorted.data <> t1_sorted.data
+              AND t2_sorted.data % 2 = 0
+        )
+    "#,
+    // Q13: LEFT SEMI 100K x 1M | 1:10 | 90%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+              AND t2_sorted.data % 10 <> 0
+        )
+    "#,
+    // Q14: LEFT ANTI 100K x 1M | 1:10
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10500 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key
+            FROM range(1000000)
+            ORDER BY key
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE NOT EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+        )
+    "#,
+    // Q15: LEFT ANTI 100K x 1M | 1:10 | partial match
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 12000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key
+            FROM range(1000000)
+            ORDER BY key
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE NOT EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+        )
+    "#,
+    // Q16: LEFT ANTI 100K x 100K | 1:1 | stress
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 11000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key
+            FROM range(100000)
+            ORDER BY key
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE NOT EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+        )
+    "#,
+    // Q17: INNER 100K x 5M | 1:50 | 5%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(5000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        WHERE t2_sorted.data <> t1_sorted.data AND t2_sorted.data % 20 = 0
+    "#,
+    // Q18: LEFT SEMI 100K x 5M | 1:50 | 2%
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(5000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+              AND t2_sorted.data <> t1_sorted.data
+              AND t2_sorted.data % 50 = 0
+        )
+    "#,
+    // Q19: LEFT ANTI 100K x 5M | 1:50 | partial match
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 15000 as key, value as data
+            FROM range(100000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key
+            FROM range(5000000)
+            ORDER BY key
+        )
+        SELECT t1_sorted.key, t1_sorted.data
+        FROM t1_sorted
+        WHERE NOT EXISTS (
+            SELECT 1 FROM t2_sorted
+            WHERE t2_sorted.key = t1_sorted.key
+        )
+    "#,
+    // Q20: INNER 1M x 10M | 1:100 + GROUP BY
+    r#"
+        WITH t1_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(1000000)
+            ORDER BY key, data
+        ),
+        t2_sorted AS (
+            SELECT value % 10000 as key, value as data
+            FROM range(10000000)
+            ORDER BY key, data
+        )
+        SELECT t1_sorted.key, count(*) as cnt
+        FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key
+        GROUP BY t1_sorted.key
+    "#,
+];
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        println!("Running SMJ benchmarks with the following options: {self:#?}\n");
+
+        // Define query range
+        let query_range = match self.query {
+            Some(query_id) => {
+                if query_id >= 1 && query_id <= SMJ_QUERIES.len() {
+                    query_id..=query_id
+                } else {
+                    return exec_err!(
+                        "Query {query_id} not found. Available queries: 1 to {}",
+                        SMJ_QUERIES.len()
+                    );
+                }
+            }
+            None => 1..=SMJ_QUERIES.len(),
+        };
+
+        let mut config = self.common.config()?;
+        // Disable hash joins to force SMJ
+        config = config.set_bool("datafusion.optimizer.prefer_hash_join", false);
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+
+        let mut benchmark_run = BenchmarkRun::new();
+        for query_id in query_range {
+            let query_index = query_id - 1; // Convert 1-based to 0-based index
+
+            let sql = SMJ_QUERIES[query_index];
+            benchmark_run.start_new_case(&format!("Query {query_id}"));
+            let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    return Err(DataFusionError::Context(
+                        format!("SMJ benchmark Q{query_id} failed with error:"),
+                        Box::new(e),
+                    ));
+                }
+            }
+        }
+
+        benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        Ok(())
+    }
+
+    async fn benchmark_query(
+        &self,
+        sql: &str,
+        query_name: &str,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        let mut query_results = vec![];
+
+        // Validate that the query plan includes a Sort Merge Join
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let plan_string = format!("{physical_plan:#?}");
+
+        if !plan_string.contains("SortMergeJoinExec") {
+            return Err(exec_datafusion_err!(
+                "Query {query_name} does not use Sort Merge Join. Physical plan: {plan_string}"
+            ));
+        }
+
+        for i in 0..self.common.iterations {
+            let start = Instant::now();
+
+            let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?;
+
+            let elapsed = start.elapsed();
+
+            println!(
+                "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}"
+            );
+
+            query_results.push(QueryResult { elapsed, row_count });
+        }
+
+        Ok(query_results)
+    }
+
+    /// Executes the SQL query and drops each result batch after evaluation, to
+    /// minimizes memory usage by not buffering results.
+    ///
+    /// Returns the total result row count
+    async fn execute_sql_without_result_buffering(
+        sql: &str,
+        ctx: &SessionContext,
+    ) -> Result<usize> {
+        let mut row_count = 0;
+
+        let df = ctx.sql(sql).await?;
+        let physical_plan = df.create_physical_plan().await?;
+        let mut stream = execute_stream(physical_plan, ctx.task_ctx())?;
+
+        while let Some(batch) = stream.next().await {
+            row_count += batch?.num_rows();
+
+            // Evaluate the result and do nothing, the result will be dropped
+            // to reduce memory pressure
+        }
+
+        Ok(row_count)
+    }
+}
diff --git a/benchmarks/src/sort.rs b/benchmarks/src/sort.rs
deleted file mode 100644
index 8b2b02670449e..0000000000000
--- a/benchmarks/src/sort.rs
+++ /dev/null
@@ -1,187 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use crate::util::{AccessLogOpt, BenchmarkRun, CommonOpt};
-
-use arrow::util::pretty;
-use datafusion::common::Result;
-use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr};
-use datafusion::physical_plan::collect;
-use datafusion::physical_plan::sorts::sort::SortExec;
-use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion::test_util::parquet::TestParquetFile;
-use datafusion_common::instant::Instant;
-use datafusion_common::utils::get_available_parallelism;
-use structopt::StructOpt;
-
-/// Test performance of sorting large datasets
-///
-/// This test sorts a a synthetic dataset generated during the
-/// benchmark execution, designed to simulate sorting web server
-/// access logs. Such sorting is often done during data transformation
-/// steps.
-///
-/// The tests sort the entire dataset using several different sort
-/// orders.
-///
-/// Example:
-///
-/// dfbench sort --path ./data --scale-factor 1.0
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
-pub struct RunOpt {
-    /// Common options
-    #[structopt(flatten)]
-    common: CommonOpt,
-
-    /// Create data files
-    #[structopt(flatten)]
-    access_log: AccessLogOpt,
-
-    /// Path to machine readable output file
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
-    output_path: Option<PathBuf>,
-}
-
-impl RunOpt {
-    pub async fn run(self) -> Result<()> {
-        let test_file = self.access_log.build()?;
-
-        use datafusion::physical_expr::expressions::col;
-        let mut rundata = BenchmarkRun::new();
-        let schema = test_file.schema();
-        let sort_cases = vec![
-            (
-                "sort utf8",
-                LexOrdering::new(vec![PhysicalSortExpr {
-                    expr: col("request_method", &schema)?,
-                    options: Default::default(),
-                }]),
-            ),
-            (
-                "sort int",
-                LexOrdering::new(vec![PhysicalSortExpr {
-                    expr: col("response_bytes", &schema)?,
-                    options: Default::default(),
-                }]),
-            ),
-            (
-                "sort decimal",
-                LexOrdering::new(vec![PhysicalSortExpr {
-                    expr: col("decimal_price", &schema)?,
-                    options: Default::default(),
-                }]),
-            ),
-            (
-                "sort integer tuple",
-                LexOrdering::new(vec![
-                    PhysicalSortExpr {
-                        expr: col("request_bytes", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("response_bytes", &schema)?,
-                        options: Default::default(),
-                    },
-                ]),
-            ),
-            (
-                "sort utf8 tuple",
-                LexOrdering::new(vec![
-                    // sort utf8 tuple
-                    PhysicalSortExpr {
-                        expr: col("service", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("host", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("pod", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("image", &schema)?,
-                        options: Default::default(),
-                    },
-                ]),
-            ),
-            (
-                "sort mixed tuple",
-                LexOrdering::new(vec![
-                    PhysicalSortExpr {
-                        expr: col("service", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("request_bytes", &schema)?,
-                        options: Default::default(),
-                    },
-                    PhysicalSortExpr {
-                        expr: col("decimal_price", &schema)?,
-                        options: Default::default(),
-                    },
-                ]),
-            ),
-        ];
-        for (title, expr) in sort_cases {
-            println!("Executing '{title}' (sorting by: {expr:?})");
-            rundata.start_new_case(title);
-            for i in 0..self.common.iterations {
-                let config = SessionConfig::new().with_target_partitions(
-                    self.common
-                        .partitions
-                        .unwrap_or_else(get_available_parallelism),
-                );
-                let ctx = SessionContext::new_with_config(config);
-                let (rows, elapsed) =
-                    exec_sort(&ctx, &expr, &test_file, self.common.debug).await?;
-                let ms = elapsed.as_secs_f64() * 1000.0;
-                println!("Iteration {i} finished in {ms} ms");
-                rundata.write_iter(elapsed, rows);
-            }
-            println!("\n");
-        }
-        if let Some(path) = &self.output_path {
-            std::fs::write(path, rundata.to_json())?;
-        }
-        Ok(())
-    }
-}
-
-async fn exec_sort(
-    ctx: &SessionContext,
-    expr: &LexOrdering,
-    test_file: &TestParquetFile,
-    debug: bool,
-) -> Result<(usize, std::time::Duration)> {
-    let start = Instant::now();
-    let scan = test_file.create_scan(ctx, None).await?;
-    let exec = Arc::new(SortExec::new(expr.clone(), scan));
-    let task_ctx = ctx.task_ctx();
-    let result = collect(exec, task_ctx).await?;
-    let elapsed = start.elapsed();
-    if debug {
-        pretty::print_batches(&result)?;
-    }
-    let rows = result.iter().map(|b| b.num_rows()).sum();
-    Ok((rows, elapsed))
-}
diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs
index ba03529a930e7..95c90d826de20 100644
--- a/benchmarks/src/sort_tpch.rs
+++ b/benchmarks/src/sort_tpch.rs
@@ -21,10 +21,10 @@
 //! Another `Sort` benchmark focus on single core execution. This benchmark
 //! runs end-to-end sort queries and test the performance on multiple CPU cores.
 
+use clap::Args;
 use futures::StreamExt;
 use std::path::PathBuf;
 use std::sync::Arc;
-use structopt::StructOpt;
 
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::{
@@ -36,48 +36,46 @@ use datafusion::execution::SessionStateBuilder;
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::{displayable, execute_stream};
 use datafusion::prelude::*;
+use datafusion_common::DEFAULT_PARQUET_EXTENSION;
 use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
-use datafusion_common::DEFAULT_PARQUET_EXTENSION;
 
-use crate::util::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
 
-#[derive(Debug, StructOpt)]
+#[derive(Debug, Args)]
 pub struct RunOpt {
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Sort query number. If not specified, runs all queries
-    #[structopt(short, long)]
-    query: Option<usize>,
+    #[arg(short, long)]
+    pub query: Option<usize>,
 
     /// Path to data files (lineitem). Only parquet format is supported
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// Path to JSON benchmark result to be compare using `compare.py`
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 
     /// Load the data into a MemTable before executing the query
-    #[structopt(short = "m", long = "mem-table")]
+    #[arg(short = 'm', long = "mem-table")]
     mem_table: bool,
 
     /// Mark the first column of each table as sorted in ascending order.
     /// The tables should have been created with the `--sort` option for this to have any effect.
-    #[structopt(short = "t", long = "sorted")]
+    #[arg(short = 't', long = "sorted")]
     sorted: bool,
 
     /// Append a `LIMIT n` clause to the query
-    #[structopt(short = "l", long = "limit")]
+    #[arg(short = 'l', long = "limit")]
     limit: Option<usize>,
 }
 
-struct QueryResult {
-    elapsed: std::time::Duration,
-    row_count: usize,
-}
+pub const SORT_TPCH_QUERY_START_ID: usize = 1;
+pub const SORT_TPCH_QUERY_END_ID: usize = 11;
 
 impl RunOpt {
     const SORT_TABLES: [&'static str; 1] = ["lineitem"];
@@ -179,34 +177,42 @@ impl RunOpt {
     /// If query is specified from command line, run only that query.
     /// Otherwise, run all queries.
     pub async fn run(&self) -> Result<()> {
-        let mut benchmark_run = BenchmarkRun::new();
+        let mut benchmark_run: BenchmarkRun = BenchmarkRun::new();
 
         let query_range = match self.query {
             Some(query_id) => query_id..=query_id,
-            None => 1..=Self::SORT_QUERIES.len(),
+            None => SORT_TPCH_QUERY_START_ID..=SORT_TPCH_QUERY_END_ID,
         };
 
         for query_id in query_range {
             benchmark_run.start_new_case(&format!("{query_id}"));
 
-            let query_results = self.benchmark_query(query_id).await?;
-            for iter in query_results {
-                benchmark_run.write_iter(iter.elapsed, iter.row_count);
+            let query_results = self.benchmark_query(query_id).await;
+            match query_results {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    benchmark_run.mark_failed();
+                    eprintln!("Query {query_id} failed: {e}");
+                }
             }
         }
 
         benchmark_run.maybe_write_json(self.output_path.as_ref())?;
-
+        benchmark_run.maybe_print_failures();
         Ok(())
     }
 
     /// Benchmark query `query_id` in `SORT_QUERIES`
     async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
         let config = self.common.config()?;
-        let rt_builder = self.common.runtime_env_builder()?;
+        let rt = self.common.build_runtime()?;
         let state = SessionStateBuilder::new()
             .with_config(config)
-            .with_runtime_env(rt_builder.build_arc()?)
+            .with_runtime_env(rt)
             .with_default_features()
             .build();
         let ctx = SessionContext::from(state);
@@ -235,13 +241,16 @@ impl RunOpt {
             millis.push(ms);
 
             println!(
-                "Q{query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+                "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
             );
             query_results.push(QueryResult { elapsed, row_count });
         }
 
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
-        println!("Q{query_id} avg time: {avg:.2} ms");
+        println!("Query {query_id} avg time: {avg:.2} ms");
+
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
 
         Ok(query_results)
     }
@@ -294,7 +303,7 @@ impl RunOpt {
 
         let mut stream = execute_stream(physical_plan.clone(), state.task_ctx())?;
         while let Some(batch) = stream.next().await {
-            row_count += batch.unwrap().num_rows();
+            row_count += batch?.num_rows();
         }
 
         if debug {
diff --git a/benchmarks/src/tpcds/mod.rs b/benchmarks/src/tpcds/mod.rs
new file mode 100644
index 0000000000000..4829eb9fd348a
--- /dev/null
+++ b/benchmarks/src/tpcds/mod.rs
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod run;
+pub use run::RunOpt;
diff --git a/benchmarks/src/tpcds/run.rs b/benchmarks/src/tpcds/run.rs
new file mode 100644
index 0000000000000..f7ef6991515da
--- /dev/null
+++ b/benchmarks/src/tpcds/run.rs
@@ -0,0 +1,362 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fs;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
+
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty::{self, pretty_format_batches};
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{
+    ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+};
+use datafusion::datasource::{MemTable, TableProvider};
+use datafusion::error::Result;
+use datafusion::physical_plan::display::DisplayableExecutionPlan;
+use datafusion::physical_plan::{collect, displayable};
+use datafusion::prelude::*;
+use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
+use datafusion_common::{DEFAULT_PARQUET_EXTENSION, plan_err};
+
+use clap::Args;
+use log::info;
+
+// hack to avoid `default_value is meaningless for bool` errors
+type BoolDefaultTrue = bool;
+pub const TPCDS_QUERY_START_ID: usize = 1;
+pub const TPCDS_QUERY_END_ID: usize = 99;
+
+pub const TPCDS_TABLES: &[&str] = &[
+    "call_center",
+    "customer_address",
+    "household_demographics",
+    "promotion",
+    "store_sales",
+    "web_page",
+    "catalog_page",
+    "customer_demographics",
+    "income_band",
+    "reason",
+    "store",
+    "web_returns",
+    "catalog_returns",
+    "customer",
+    "inventory",
+    "ship_mode",
+    "time_dim",
+    "web_sales",
+    "catalog_sales",
+    "date_dim",
+    "item",
+    "store_returns",
+    "warehouse",
+    "web_site",
+];
+
+/// Get the SQL statements from the specified query file
+pub fn get_query_sql(base_query_path: &str, query: usize) -> Result<Vec<String>> {
+    if query > 0 && query < 100 {
+        let filename = format!("{base_query_path}/{query}.sql");
+        let mut errors = vec![];
+        match fs::read_to_string(&filename) {
+            Ok(contents) => {
+                return Ok(contents
+                    .split(';')
+                    .map(|s| s.trim())
+                    .filter(|s| !s.is_empty())
+                    .map(|s| s.to_string())
+                    .collect());
+            }
+            Err(e) => errors.push(format!("{filename}: {e}")),
+        };
+
+        plan_err!("invalid query. Could not find query: {:?}", errors)
+    } else {
+        plan_err!("invalid query. Expected value between 1 and 99")
+    }
+}
+
+/// Run the tpcds benchmark.
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number. If not specified, runs all queries
+    #[arg(short, long)]
+    pub query: Option<usize>,
+
+    /// Common options
+    #[command(flatten)]
+    common: CommonOpt,
+
+    /// Path to data files
+    #[arg(required = true, short = 'p', long = "path")]
+    path: PathBuf,
+
+    /// Path to query files
+    #[arg(required = true, short = 'Q', long = "query_path")]
+    query_path: PathBuf,
+
+    /// Load the data into a MemTable before executing the query
+    #[arg(short = 'm', long = "mem-table")]
+    mem_table: bool,
+
+    /// Path to machine readable output file
+    #[arg(short = 'o', long = "output")]
+    output_path: Option<PathBuf>,
+
+    /// Whether to disable collection of statistics (and cost based optimizations) or not.
+    #[arg(short = 'S', long = "disable-statistics")]
+    disable_statistics: bool,
+
+    /// If true then hash join used, if false then sort merge join
+    /// True by default.
+    #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")]
+    prefer_hash_join: BoolDefaultTrue,
+
+    /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join
+    /// False by default.
+    #[arg(
+        short = 'w',
+        long = "enable_piecewise_merge_join",
+        default_value = "false"
+    )]
+    enable_piecewise_merge_join: BoolDefaultTrue,
+
+    /// Mark the first column of each table as sorted in ascending order.
+    /// The tables should have been created with the `--sort` option for this to have any effect.
+    #[arg(short = 't', long = "sorted")]
+    sorted: bool,
+
+    /// How many bytes to buffer on the probe side of hash joins.
+    #[arg(long, default_value = "0")]
+    hash_join_buffering_capacity: usize,
+}
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        println!("Running benchmarks with the following options: {self:?}");
+        let query_range = match self.query {
+            Some(query_id) => query_id..=query_id,
+            None => TPCDS_QUERY_START_ID..=TPCDS_QUERY_END_ID,
+        };
+
+        let mut benchmark_run = BenchmarkRun::new();
+        let mut config = self
+            .common
+            .config()?
+            .with_collect_statistics(!self.disable_statistics);
+        config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
+        config.options_mut().optimizer.enable_piecewise_merge_join =
+            self.enable_piecewise_merge_join;
+        config.options_mut().execution.hash_join_buffering_capacity =
+            self.hash_join_buffering_capacity;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
+        // register tables
+        self.register_tables(&ctx).await?;
+
+        for query_id in query_range {
+            benchmark_run.start_new_case(&format!("Query {query_id}"));
+            let query_run = self.benchmark_query(query_id, &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    benchmark_run.mark_failed();
+                    eprintln!("Query {query_id} failed: {e}");
+                }
+            }
+        }
+        benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        benchmark_run.maybe_print_failures();
+        Ok(())
+    }
+
+    async fn benchmark_query(
+        &self,
+        query_id: usize,
+        ctx: &SessionContext,
+    ) -> Result<Vec<QueryResult>> {
+        let mut millis = vec![];
+        // run benchmark
+        let mut query_results = vec![];
+
+        let sql = &get_query_sql(self.query_path.to_str().unwrap(), query_id)?;
+
+        if self.common.debug {
+            println!("=== SQL for query {query_id} ===\n{}\n", sql.join(";\n"));
+        }
+
+        for i in 0..self.iterations() {
+            let start = Instant::now();
+
+            // query 15 is special, with 3 statements. the second statement is the one from which we
+            // want to capture the results
+            let mut result = vec![];
+
+            for query in sql {
+                result = self.execute_query(ctx, query).await?;
+            }
+
+            let elapsed = start.elapsed();
+            let ms = elapsed.as_secs_f64() * 1000.0;
+            millis.push(ms);
+            info!("output:\n\n{}\n\n", pretty_format_batches(&result)?);
+            let row_count = result.iter().map(|b| b.num_rows()).sum();
+            println!(
+                "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+            );
+            query_results.push(QueryResult { elapsed, row_count });
+        }
+
+        let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+        println!("Query {query_id} avg time: {avg:.2} ms");
+
+        // Print memory stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
+        Ok(query_results)
+    }
+
+    async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
+        for table in TPCDS_TABLES {
+            let table_provider = { self.get_table(ctx, table).await? };
+
+            if self.mem_table {
+                println!("Loading table '{table}' into memory");
+                let start = Instant::now();
+                let memtable =
+                    MemTable::load(table_provider, Some(self.partitions()), &ctx.state())
+                        .await?;
+                println!(
+                    "Loaded table '{}' into memory in {} ms",
+                    table,
+                    start.elapsed().as_millis()
+                );
+                ctx.register_table(*table, Arc::new(memtable))?;
+            } else {
+                ctx.register_table(*table, table_provider)?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn execute_query(
+        &self,
+        ctx: &SessionContext,
+        sql: &str,
+    ) -> Result<Vec<RecordBatch>> {
+        let debug = self.common.debug;
+        let plan = ctx.sql(sql).await?;
+        let (state, plan) = plan.into_parts();
+
+        if debug {
+            println!("=== Logical plan ===\n{plan}\n");
+        }
+
+        let plan = state.optimize(&plan)?;
+        if debug {
+            println!("=== Optimized logical plan ===\n{plan}\n");
+        }
+        let physical_plan = state.create_physical_plan(&plan).await?;
+        if debug {
+            println!(
+                "=== Physical plan ===\n{}\n",
+                displayable(physical_plan.as_ref()).indent(true)
+            );
+        }
+        let result = collect(physical_plan.clone(), state.task_ctx()).await?;
+        if debug {
+            println!(
+                "=== Physical plan with metrics ===\n{}\n",
+                DisplayableExecutionPlan::with_metrics(physical_plan.as_ref())
+                    .indent(true)
+            );
+            if !result.is_empty() {
+                // do not call print_batches if there are no batches as the result is confusing
+                // and makes it look like there is a batch with no columns
+                pretty::print_batches(&result)?;
+            }
+        }
+        Ok(result)
+    }
+
+    async fn get_table(
+        &self,
+        ctx: &SessionContext,
+        table: &str,
+    ) -> Result<Arc<dyn TableProvider>> {
+        let path = self.path.to_str().unwrap();
+        let target_partitions = self.partitions();
+
+        // Obtain a snapshot of the SessionState
+        let state = ctx.state();
+        let path = format!("{path}/{table}.parquet");
+
+        // Check if the file exists
+        if !std::path::Path::new(&path).exists() {
+            eprintln!("Warning registering {table}: Table file does not exist: {path}");
+        }
+
+        let format = ParquetFormat::default()
+            .with_options(ctx.state().table_options().parquet.clone());
+
+        let table_path = ListingTableUrl::parse(path)?;
+        let options = ListingOptions::new(Arc::new(format))
+            .with_file_extension(DEFAULT_PARQUET_EXTENSION)
+            .with_target_partitions(target_partitions)
+            .with_collect_stat(state.config().collect_statistics());
+        let schema = options.infer_schema(&state, &table_path).await?;
+
+        if self.common.debug {
+            println!(
+                "Inferred schema from {table_path} for table '{table}':\n{schema:#?}\n"
+            );
+        }
+
+        let options = if self.sorted {
+            let key_column_name = schema.fields()[0].name();
+            options
+                .with_file_sort_order(vec![vec![col(key_column_name).sort(true, false)]])
+        } else {
+            options
+        };
+
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(options)
+            .with_schema(schema);
+
+        Ok(Arc::new(ListingTable::try_new(config)?))
+    }
+
+    fn iterations(&self) -> usize {
+        self.common.iterations
+    }
+
+    fn partitions(&self) -> usize {
+        self.common
+            .partitions
+            .unwrap_or_else(get_available_parallelism)
+    }
+}
diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs
deleted file mode 100644
index 5219e09cd3052..0000000000000
--- a/benchmarks/src/tpch/convert.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::logical_expr::select_expr::SelectExpr;
-use datafusion_common::instant::Instant;
-use std::fs;
-use std::path::{Path, PathBuf};
-
-use datafusion::common::not_impl_err;
-
-use super::get_tbl_tpch_table_schema;
-use super::TPCH_TABLES;
-use datafusion::error::Result;
-use datafusion::prelude::*;
-use parquet::basic::Compression;
-use parquet::file::properties::WriterProperties;
-use structopt::StructOpt;
-
-/// Convert tpch .slt files to .parquet or .csv files
-#[derive(Debug, StructOpt)]
-pub struct ConvertOpt {
-    /// Path to csv files
-    #[structopt(parse(from_os_str), required = true, short = "i", long = "input")]
-    input_path: PathBuf,
-
-    /// Output path
-    #[structopt(parse(from_os_str), required = true, short = "o", long = "output")]
-    output_path: PathBuf,
-
-    /// Output file format: `csv` or `parquet`
-    #[structopt(short = "f", long = "format")]
-    file_format: String,
-
-    /// Compression to use when writing Parquet files
-    #[structopt(short = "c", long = "compression", default_value = "zstd")]
-    compression: String,
-
-    /// Number of partitions to produce
-    #[structopt(short = "n", long = "partitions", default_value = "1")]
-    partitions: usize,
-
-    /// Batch size when reading CSV or Parquet files
-    #[structopt(short = "s", long = "batch-size", default_value = "8192")]
-    batch_size: usize,
-
-    /// Sort each table by its first column in ascending order.
-    #[structopt(short = "t", long = "sort")]
-    sort: bool,
-}
-
-impl ConvertOpt {
-    pub async fn run(self) -> Result<()> {
-        let compression = self.compression()?;
-
-        let input_path = self.input_path.to_str().unwrap();
-        let output_path = self.output_path.to_str().unwrap();
-
-        let output_root_path = Path::new(output_path);
-        for table in TPCH_TABLES {
-            let start = Instant::now();
-            let schema = get_tbl_tpch_table_schema(table);
-            let key_column_name = schema.fields()[0].name();
-
-            let input_path = format!("{input_path}/{table}.tbl");
-            let options = CsvReadOptions::new()
-                .schema(&schema)
-                .has_header(false)
-                .delimiter(b'|')
-                .file_extension(".tbl");
-            let options = if self.sort {
-                // indicated that the file is already sorted by its first column to speed up the conversion
-                options
-                    .file_sort_order(vec![vec![col(key_column_name).sort(true, false)]])
-            } else {
-                options
-            };
-
-            let config = SessionConfig::new().with_batch_size(self.batch_size);
-            let ctx = SessionContext::new_with_config(config);
-
-            // build plan to read the TBL file
-            let mut csv = ctx.read_csv(&input_path, options).await?;
-
-            // Select all apart from the padding column
-            let selection = csv
-                .schema()
-                .iter()
-                .take(schema.fields.len() - 1)
-                .map(Expr::from)
-                .map(SelectExpr::from)
-                .collect::<Vec<_>>();
-
-            csv = csv.select(selection)?;
-            // optionally, repartition the file
-            let partitions = self.partitions;
-            if partitions > 1 {
-                csv = csv.repartition(Partitioning::RoundRobinBatch(partitions))?
-            }
-            let csv = if self.sort {
-                csv.sort_by(vec![col(key_column_name)])?
-            } else {
-                csv
-            };
-
-            // create the physical plan
-            let csv = csv.create_physical_plan().await?;
-
-            let output_path = output_root_path.join(table);
-            let output_path = output_path.to_str().unwrap().to_owned();
-            fs::create_dir_all(&output_path)?;
-            println!(
-                "Converting '{}' to {} files in directory '{}'",
-                &input_path, self.file_format, &output_path
-            );
-            match self.file_format.as_str() {
-                "csv" => ctx.write_csv(csv, output_path).await?,
-                "parquet" => {
-                    let props = WriterProperties::builder()
-                        .set_compression(compression)
-                        .build();
-                    ctx.write_parquet(csv, output_path, Some(props)).await?
-                }
-                other => {
-                    return not_impl_err!("Invalid output format: {other}");
-                }
-            }
-            println!("Conversion completed in {} ms", start.elapsed().as_millis());
-        }
-
-        Ok(())
-    }
-
-    /// return the compression method to use when writing parquet
-    fn compression(&self) -> Result<Compression> {
-        Ok(match self.compression.as_str() {
-            "none" => Compression::UNCOMPRESSED,
-            "snappy" => Compression::SNAPPY,
-            "brotli" => Compression::BROTLI(Default::default()),
-            "gzip" => Compression::GZIP(Default::default()),
-            "lz4" => Compression::LZ4,
-            "lz0" => Compression::LZO,
-            "zstd" => Compression::ZSTD(Default::default()),
-            other => {
-                return not_impl_err!("Invalid compression format: {other}");
-            }
-        })
-    }
-}
diff --git a/benchmarks/src/tpch/mod.rs b/benchmarks/src/tpch/mod.rs
index 23d0681f560c8..681aa0a403ee1 100644
--- a/benchmarks/src/tpch/mod.rs
+++ b/benchmarks/src/tpch/mod.rs
@@ -27,13 +27,13 @@ use std::fs;
 mod run;
 pub use run::RunOpt;
 
-mod convert;
-pub use convert::ConvertOpt;
-
 pub const TPCH_TABLES: &[&str] = &[
     "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region",
 ];
 
+pub const TPCH_QUERY_START_ID: usize = 1;
+pub const TPCH_QUERY_END_ID: usize = 22;
+
 /// The `.tbl` file contains a trailing column
 pub fn get_tbl_tpch_table_schema(table: &str) -> Schema {
     let mut schema = SchemaBuilder::from(get_tpch_table_schema(table).fields);
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
index caef823aaf31d..0d1268013c168 100644
--- a/benchmarks/src/tpch/run.rs
+++ b/benchmarks/src/tpch/run.rs
@@ -19,15 +19,16 @@ use std::path::PathBuf;
 use std::sync::Arc;
 
 use super::{
-    get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
+    TPCH_QUERY_END_ID, TPCH_QUERY_START_ID, TPCH_TABLES, get_query_sql,
+    get_tbl_tpch_table_schema, get_tpch_table_schema,
 };
-use crate::util::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats};
 
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::{self, pretty_format_batches};
+use datafusion::datasource::file_format::FileFormat;
 use datafusion::datasource::file_format::csv::CsvFormat;
 use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::file_format::FileFormat;
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
 };
@@ -40,8 +41,8 @@ use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
 use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
 
+use clap::Args;
 use log::info;
-use structopt::StructOpt;
 
 // hack to avoid `default_value is meaningless for bool` errors
 type BoolDefaultTrue = bool;
@@ -53,52 +54,62 @@ type BoolDefaultTrue = bool;
 /// [2].
 ///
 /// [1]: http://www.tpc.org/tpch/
-/// [2]: https://github.com/databricks/tpch-dbgen.git,
+/// [2]: https://github.com/databricks/tpch-dbgen.git
 /// [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
-#[derive(Debug, StructOpt, Clone)]
-#[structopt(verbatim_doc_comment)]
+#[derive(Debug, Args, Clone)]
+#[command(verbatim_doc_comment)]
 pub struct RunOpt {
     /// Query number. If not specified, runs all queries
-    #[structopt(short, long)]
-    query: Option<usize>,
+    #[arg(short, long)]
+    pub query: Option<usize>,
 
     /// Common options
-    #[structopt(flatten)]
+    #[command(flatten)]
     common: CommonOpt,
 
     /// Path to data files
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+    #[arg(required = true, short = 'p', long = "path")]
     path: PathBuf,
 
     /// File format: `csv` or `parquet`
-    #[structopt(short = "f", long = "format", default_value = "csv")]
+    #[arg(short = 'f', long = "format", default_value = "csv")]
     file_format: String,
 
     /// Load the data into a MemTable before executing the query
-    #[structopt(short = "m", long = "mem-table")]
+    #[arg(short = 'm', long = "mem-table")]
     mem_table: bool,
 
     /// Path to machine readable output file
-    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    #[arg(short = 'o', long = "output")]
     output_path: Option<PathBuf>,
 
     /// Whether to disable collection of statistics (and cost based optimizations) or not.
-    #[structopt(short = "S", long = "disable-statistics")]
+    #[arg(short = 'S', long = "disable-statistics")]
     disable_statistics: bool,
 
     /// If true then hash join used, if false then sort merge join
     /// True by default.
-    #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
+    #[arg(short = 'j', long = "prefer_hash_join", default_value = "true")]
     prefer_hash_join: BoolDefaultTrue,
 
+    /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join
+    /// False by default.
+    #[arg(
+        short = 'w',
+        long = "enable_piecewise_merge_join",
+        default_value = "false"
+    )]
+    enable_piecewise_merge_join: BoolDefaultTrue,
+
     /// Mark the first column of each table as sorted in ascending order.
     /// The tables should have been created with the `--sort` option for this to have any effect.
-    #[structopt(short = "t", long = "sorted")]
+    #[arg(short = 't', long = "sorted")]
     sorted: bool,
-}
 
-const TPCH_QUERY_START_ID: usize = 1;
-const TPCH_QUERY_END_ID: usize = 22;
+    /// How many bytes to buffer on the probe side of hash joins.
+    #[arg(long, default_value = "0")]
+    hash_join_buffering_capacity: usize,
+}
 
 impl RunOpt {
     pub async fn run(self) -> Result<()> {
@@ -114,19 +125,32 @@ impl RunOpt {
             .config()?
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
-        let rt_builder = self.common.runtime_env_builder()?;
-        let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
+        config.options_mut().optimizer.enable_piecewise_merge_join =
+            self.enable_piecewise_merge_join;
+        config.options_mut().execution.hash_join_buffering_capacity =
+            self.hash_join_buffering_capacity;
+        let rt = self.common.build_runtime()?;
+        let ctx = SessionContext::new_with_config_rt(config, rt);
         // register tables
         self.register_tables(&ctx).await?;
 
         for query_id in query_range {
             benchmark_run.start_new_case(&format!("Query {query_id}"));
-            let query_run = self.benchmark_query(query_id, &ctx).await?;
-            for iter in query_run {
-                benchmark_run.write_iter(iter.elapsed, iter.row_count);
+            let query_run = self.benchmark_query(query_id, &ctx).await;
+            match query_run {
+                Ok(query_results) => {
+                    for iter in query_results {
+                        benchmark_run.write_iter(iter.elapsed, iter.row_count);
+                    }
+                }
+                Err(e) => {
+                    benchmark_run.mark_failed();
+                    eprintln!("Query {query_id} failed: {e}");
+                }
             }
         }
         benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+        benchmark_run.maybe_print_failures();
         Ok(())
     }
 
@@ -138,11 +162,12 @@ impl RunOpt {
         let mut millis = vec![];
         // run benchmark
         let mut query_results = vec![];
+
+        let sql = &get_query_sql(query_id)?;
+
         for i in 0..self.iterations() {
             let start = Instant::now();
 
-            let sql = &get_query_sql(query_id)?;
-
             // query 15 is special, with 3 statements. the second statement is the one from which we
             // want to capture the results
             let mut result = vec![];
@@ -160,7 +185,7 @@ impl RunOpt {
                 }
             }
 
-            let elapsed = start.elapsed(); //.as_secs_f64() * 1000.0;
+            let elapsed = start.elapsed();
             let ms = elapsed.as_secs_f64() * 1000.0;
             millis.push(ms);
             info!("output:\n\n{}\n\n", pretty_format_batches(&result)?);
@@ -174,6 +199,9 @@ impl RunOpt {
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
         println!("Query {query_id} avg time: {avg:.2} ms");
 
+        // Print memory stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
         Ok(query_results)
     }
 
@@ -264,7 +292,7 @@ impl RunOpt {
                     (Arc::new(format), path, ".tbl")
                 }
                 "csv" => {
-                    let path = format!("{path}/{table}");
+                    let path = format!("{path}/csv/{table}");
                     let format = CsvFormat::default()
                         .with_delimiter(b',')
                         .with_has_header(true);
@@ -320,11 +348,6 @@ impl RunOpt {
     }
 }
 
-struct QueryResult {
-    elapsed: std::time::Duration,
-    row_count: usize,
-}
-
 #[cfg(test)]
 // Only run with "ci" mode when we have the data
 #[cfg(feature = "ci")]
@@ -363,6 +386,7 @@ mod tests {
             memory_limit: None,
             sort_spill_reservation_bytes: None,
             debug: false,
+            simulate_latency: false,
         };
         let opt = RunOpt {
             query: Some(query),
@@ -373,7 +397,9 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            enable_piecewise_merge_join: false,
             sorted: false,
+            hash_join_buffering_capacity: 0,
         };
         opt.register_tables(&ctx).await?;
         let queries = get_query_sql(query)?;
@@ -381,7 +407,7 @@ mod tests {
             let plan = ctx.sql(&query).await?;
             let plan = plan.into_optimized_plan()?;
             let bytes = logical_plan_to_bytes(&plan)?;
-            let plan2 = logical_plan_from_bytes(&bytes, &ctx)?;
+            let plan2 = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
             let plan_formatted = format!("{}", plan.display_indent());
             let plan2_formatted = format!("{}", plan2.display_indent());
             assert_eq!(plan_formatted, plan2_formatted);
@@ -400,6 +426,7 @@ mod tests {
             memory_limit: None,
             sort_spill_reservation_bytes: None,
             debug: false,
+            simulate_latency: false,
         };
         let opt = RunOpt {
             query: Some(query),
@@ -410,7 +437,9 @@ mod tests {
             output_path: None,
             disable_statistics: false,
             prefer_hash_join: true,
+            enable_piecewise_merge_join: false,
             sorted: false,
+            hash_join_buffering_capacity: 0,
         };
         opt.register_tables(&ctx).await?;
         let queries = get_query_sql(query)?;
@@ -418,7 +447,7 @@ mod tests {
             let plan = ctx.sql(&query).await?;
             let plan = plan.create_physical_plan().await?;
             let bytes = physical_plan_to_bytes(plan.clone())?;
-            let plan2 = physical_plan_from_bytes(&bytes, &ctx)?;
+            let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
             let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false));
             let plan2_formatted =
                 format!("{}", displayable(plan2.as_ref()).indent(false));
diff --git a/benchmarks/src/util/access_log.rs b/benchmarks/src/util/access_log.rs
deleted file mode 100644
index 2b29465ee20e3..0000000000000
--- a/benchmarks/src/util/access_log.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Benchmark data generation
-
-use datafusion::common::Result;
-use datafusion::test_util::parquet::TestParquetFile;
-use parquet::file::properties::WriterProperties;
-use std::path::PathBuf;
-use structopt::StructOpt;
-use test_utils::AccessLogGenerator;
-
-// Options and builder for making an access log test file
-// Note don't use docstring or else it ends up in help
-#[derive(Debug, StructOpt, Clone)]
-pub struct AccessLogOpt {
-    /// Path to folder where access log file will be generated
-    #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
-    path: PathBuf,
-
-    /// Data page size of the generated parquet file
-    #[structopt(long = "page-size")]
-    page_size: Option<usize>,
-
-    /// Data page size of the generated parquet file
-    #[structopt(long = "row-group-size")]
-    row_group_size: Option<usize>,
-
-    /// Total size of generated dataset. The default scale factor of 1.0 will generate a roughly 1GB parquet file
-    #[structopt(long = "scale-factor", default_value = "1.0")]
-    scale_factor: f32,
-}
-
-impl AccessLogOpt {
-    /// Create the access log and return the file.
-    ///
-    /// See [`TestParquetFile`] for more details
-    pub fn build(self) -> Result<TestParquetFile> {
-        let path = self.path.join("logs.parquet");
-
-        let mut props_builder = WriterProperties::builder();
-
-        if let Some(s) = self.page_size {
-            props_builder = props_builder
-                .set_data_page_size_limit(s)
-                .set_write_batch_size(s);
-        }
-
-        if let Some(s) = self.row_group_size {
-            props_builder = props_builder.set_max_row_group_size(s);
-        }
-        let props = props_builder.build();
-
-        let generator = AccessLogGenerator::new();
-
-        let num_batches = 100_f32 * self.scale_factor;
-
-        TestParquetFile::try_new(path, props, generator.take(num_batches as usize))
-    }
-}
diff --git a/benchmarks/src/util/latency_object_store.rs b/benchmarks/src/util/latency_object_store.rs
new file mode 100644
index 0000000000000..9ef8d1b78b751
--- /dev/null
+++ b/benchmarks/src/util/latency_object_store.rs
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! An ObjectStore wrapper that adds simulated S3-like latency to get and list operations.
+//!
+//! Cycles through a fixed latency distribution inspired by real S3 performance:
+//! - P50: ~30ms
+//! - P75-P90: ~100-120ms
+//! - P99: ~150-200ms
+
+use std::fmt;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
+
+use async_trait::async_trait;
+use futures::StreamExt;
+use futures::stream::BoxStream;
+use object_store::path::Path;
+use object_store::{
+    CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
+    ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
+};
+
+/// GET latency distribution, inspired by S3 latencies.
+/// Deterministic but shuffled to avoid artificial patterns.
+/// 20 values: 11x P50 (~25-35ms), 5x P75-P90 (~70-110ms), 2x P95 (~120-150ms), 2x P99 (~180-200ms)
+/// Sorted: 25,25,28,28,30,30,30,30,32,32,35, 70,85,100,100,110, 130,150, 180,200
+/// P50≈32ms, P90≈110ms, P99≈200ms
+const GET_LATENCIES_MS: &[u64] = &[
+    30, 100, 25, 85, 32, 200, 28, 130, 35, 70, 30, 150, 30, 110, 28, 180, 32, 25, 100, 30,
+];
+
+/// LIST latency distribution, generally higher than GET.
+/// 20 values: 11x P50 (~40-70ms), 5x P75-P90 (~120-180ms), 2x P95 (~200-250ms), 2x P99 (~300-400ms)
+/// Sorted: 40,40,50,50,55,55,60,60,65,65,70, 120,140,160,160,180, 210,250, 300,400
+/// P50≈65ms, P90≈180ms, P99≈400ms
+const LIST_LATENCIES_MS: &[u64] = &[
+    55, 160, 40, 140, 65, 400, 50, 210, 70, 120, 60, 250, 55, 180, 50, 300, 65, 40, 160,
+    60,
+];
+
+/// An ObjectStore wrapper that injects simulated latency on get and list calls.
+#[derive(Debug)]
+pub struct LatencyObjectStore<T: ObjectStore> {
+    inner: T,
+    get_counter: AtomicUsize,
+    list_counter: AtomicUsize,
+}
+
+impl<T: ObjectStore> LatencyObjectStore<T> {
+    pub fn new(inner: T) -> Self {
+        Self {
+            inner,
+            get_counter: AtomicUsize::new(0),
+            list_counter: AtomicUsize::new(0),
+        }
+    }
+
+    fn next_get_latency(&self) -> Duration {
+        let idx =
+            self.get_counter.fetch_add(1, Ordering::Relaxed) % GET_LATENCIES_MS.len();
+        Duration::from_millis(GET_LATENCIES_MS[idx])
+    }
+
+    fn next_list_latency(&self) -> Duration {
+        let idx =
+            self.list_counter.fetch_add(1, Ordering::Relaxed) % LIST_LATENCIES_MS.len();
+        Duration::from_millis(LIST_LATENCIES_MS[idx])
+    }
+}
+
+impl<T: ObjectStore> fmt::Display for LatencyObjectStore<T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "LatencyObjectStore({})", self.inner)
+    }
+}
+
+#[async_trait]
+impl<T: ObjectStore> ObjectStore for LatencyObjectStore<T> {
+    async fn put_opts(
+        &self,
+        location: &Path,
+        payload: PutPayload,
+        opts: PutOptions,
+    ) -> Result<PutResult> {
+        self.inner.put_opts(location, payload, opts).await
+    }
+
+    async fn put_multipart_opts(
+        &self,
+        location: &Path,
+        opts: PutMultipartOptions,
+    ) -> Result<Box<dyn MultipartUpload>> {
+        self.inner.put_multipart_opts(location, opts).await
+    }
+
+    async fn get_opts(&self, location: &Path, options: GetOptions) -> Result<GetResult> {
+        tokio::time::sleep(self.next_get_latency()).await;
+        self.inner.get_opts(location, options).await
+    }
+
+    async fn get_ranges(
+        &self,
+        location: &Path,
+        ranges: &[std::ops::Range<u64>],
+    ) -> Result<Vec<bytes::Bytes>> {
+        tokio::time::sleep(self.next_get_latency()).await;
+        self.inner.get_ranges(location, ranges).await
+    }
+
+    fn delete_stream(
+        &self,
+        locations: BoxStream<'static, Result<Path>>,
+    ) -> BoxStream<'static, Result<Path>> {
+        self.inner.delete_stream(locations)
+    }
+
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
+        let latency = self.next_list_latency();
+        let stream = self.inner.list(prefix);
+        futures::stream::once(async move {
+            tokio::time::sleep(latency).await;
+            futures::stream::empty()
+        })
+        .flatten()
+        .chain(stream)
+        .boxed()
+    }
+
+    async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> {
+        tokio::time::sleep(self.next_list_latency()).await;
+        self.inner.list_with_delimiter(prefix).await
+    }
+
+    async fn copy_opts(
+        &self,
+        from: &Path,
+        to: &Path,
+        options: CopyOptions,
+    ) -> Result<()> {
+        self.inner.copy_opts(from, to, options).await
+    }
+}
diff --git a/benchmarks/src/util/memory.rs b/benchmarks/src/util/memory.rs
new file mode 100644
index 0000000000000..11b96ef227756
--- /dev/null
+++ b/benchmarks/src/util/memory.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Print Peak RSS, Peak Commit, Page Faults based on mimalloc api
+pub fn print_memory_stats() {
+    #[cfg(all(feature = "mimalloc", feature = "mimalloc_extended"))]
+    {
+        use datafusion_common::human_readable_size;
+        let mut peak_rss = 0;
+        let mut peak_commit = 0;
+        let mut page_faults = 0;
+        unsafe {
+            libmimalloc_sys::mi_process_info(
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+                &mut peak_rss,
+                std::ptr::null_mut(),
+                &mut peak_commit,
+                &mut page_faults,
+            );
+        }
+
+        // When modifying this output format, make sure to update the corresponding
+        // parsers in `mem_profile.rs`, specifically `parse_vm_line` and `parse_query_time`,
+        // to keep the log output and parser logic in sync.
+        println!(
+            "Peak RSS: {}, Peak Commit: {}, Page Faults: {}",
+            if peak_rss == 0 {
+                "N/A".to_string()
+            } else {
+                human_readable_size(peak_rss)
+            },
+            if peak_commit == 0 {
+                "N/A".to_string()
+            } else {
+                human_readable_size(peak_commit)
+            },
+            page_faults
+        );
+    }
+}
diff --git a/benchmarks/src/util/mod.rs b/benchmarks/src/util/mod.rs
index 95c6e5f53d0f0..6dc11c0f425bd 100644
--- a/benchmarks/src/util/mod.rs
+++ b/benchmarks/src/util/mod.rs
@@ -16,10 +16,11 @@
 // under the License.
 
 //! Shared benchmark utilities
-mod access_log;
+pub mod latency_object_store;
+mod memory;
 mod options;
 mod run;
 
-pub use access_log::AccessLogOpt;
+pub use memory::print_memory_stats;
 pub use options::CommonOpt;
-pub use run::{BenchQuery, BenchmarkRun};
+pub use run::{BenchQuery, BenchmarkRun, QueryResult};
diff --git a/benchmarks/src/util/options.rs b/benchmarks/src/util/options.rs
index 6627a287dfcd4..a50a5268c0bfe 100644
--- a/benchmarks/src/util/options.rs
+++ b/benchmarks/src/util/options.rs
@@ -17,50 +17,59 @@
 
 use std::{num::NonZeroUsize, sync::Arc};
 
+use clap::Args;
 use datafusion::{
     execution::{
         disk_manager::DiskManagerBuilder,
         memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool},
-        runtime_env::RuntimeEnvBuilder,
+        object_store::ObjectStoreUrl,
+        runtime_env::{RuntimeEnv, RuntimeEnvBuilder},
     },
     prelude::SessionConfig,
 };
 use datafusion_common::{DataFusionError, Result};
-use structopt::StructOpt;
+use object_store::local::LocalFileSystem;
+
+use super::latency_object_store::LatencyObjectStore;
 
 // Common benchmark options (don't use doc comments otherwise this doc
 // shows up in help files)
-#[derive(Debug, StructOpt, Clone)]
+#[derive(Debug, Args, Clone)]
 pub struct CommonOpt {
     /// Number of iterations of each test run
-    #[structopt(short = "i", long = "iterations", default_value = "3")]
+    #[arg(short = 'i', long = "iterations", default_value = "3")]
     pub iterations: usize,
 
     /// Number of partitions to process in parallel. Defaults to number of available cores.
-    #[structopt(short = "n", long = "partitions")]
+    #[arg(short = 'n', long = "partitions")]
     pub partitions: Option<usize>,
 
     /// Batch size when reading CSV or Parquet files
-    #[structopt(short = "s", long = "batch-size")]
+    #[arg(short = 's', long = "batch-size")]
     pub batch_size: Option<usize>,
 
     /// The memory pool type to use, should be one of "fair" or "greedy"
-    #[structopt(long = "mem-pool-type", default_value = "fair")]
+    #[arg(long = "mem-pool-type", default_value = "fair")]
     pub mem_pool_type: String,
 
     /// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query
     /// if there's any, otherwise run with no memory limit.
-    #[structopt(long = "memory-limit", parse(try_from_str = parse_memory_limit))]
+    #[arg(long = "memory-limit", value_parser = parse_capacity_limit)]
     pub memory_limit: Option<usize>,
 
     /// The amount of memory to reserve for sort spill operations. DataFusion's default value will be used
     /// if not specified.
-    #[structopt(long = "sort-spill-reservation-bytes", parse(try_from_str = parse_memory_limit))]
+    #[arg(long = "sort-spill-reservation-bytes", value_parser = parse_capacity_limit)]
     pub sort_spill_reservation_bytes: Option<usize>,
 
     /// Activate debug mode to see more details
-    #[structopt(short, long)]
+    #[arg(short, long)]
     pub debug: bool,
+
+    /// Simulate object store latency to mimic remote storage (e.g. S3).
+    /// Adds random latency in the range 20-200ms to each object store operation.
+    #[arg(long = "simulate-latency")]
+    pub simulate_latency: bool,
 }
 
 impl CommonOpt {
@@ -91,7 +100,15 @@ impl CommonOpt {
     pub fn runtime_env_builder(&self) -> Result<RuntimeEnvBuilder> {
         let mut rt_builder = RuntimeEnvBuilder::new();
         const NUM_TRACKED_CONSUMERS: usize = 5;
-        if let Some(memory_limit) = self.memory_limit {
+        // Use CLI --memory-limit if provided, otherwise fall back to
+        // DATAFUSION_RUNTIME_MEMORY_LIMIT env var
+        let memory_limit = self.memory_limit.or_else(|| {
+            std::env::var("DATAFUSION_RUNTIME_MEMORY_LIMIT")
+                .ok()
+                .and_then(|val| parse_capacity_limit(&val).ok())
+        });
+
+        if let Some(memory_limit) = memory_limit {
             let pool: Arc<dyn MemoryPool> = match self.mem_pool_type.as_str() {
                 "fair" => Arc::new(TrackConsumersPool::new(
                     FairSpillPool::new(memory_limit),
@@ -105,7 +122,7 @@ impl CommonOpt {
                     return Err(DataFusionError::Configuration(format!(
                         "Invalid memory pool type: {}",
                         self.mem_pool_type
-                    )))
+                    )));
                 }
             };
             rt_builder = rt_builder
@@ -114,22 +131,44 @@ impl CommonOpt {
         }
         Ok(rt_builder)
     }
+
+    /// Build the runtime environment, optionally wrapping the local filesystem
+    /// with a throttled object store to simulate remote storage latency.
+    pub fn build_runtime(&self) -> Result<Arc<RuntimeEnv>> {
+        let rt = self.runtime_env_builder()?.build_arc()?;
+        if self.simulate_latency {
+            let store: Arc<dyn object_store::ObjectStore> =
+                Arc::new(LatencyObjectStore::new(LocalFileSystem::new()));
+            let url = ObjectStoreUrl::parse("file:///")?;
+            rt.register_object_store(url.as_ref(), store);
+            println!(
+                "Simulating S3-like object store latency (get: 25-200ms, list: 40-400ms)"
+            );
+        }
+        Ok(rt)
+    }
 }
 
-/// Parse memory limit from string to number of bytes
-/// e.g. '1.5G', '100M' -> 1572864
-fn parse_memory_limit(limit: &str) -> Result<usize, String> {
+/// Parse capacity limit from string to number of bytes by allowing units: K, M and G.
+/// Supports formats like '1.5G' -> 1610612736, '100M' -> 104857600
+fn parse_capacity_limit(limit: &str) -> Result<usize, String> {
+    if limit.trim().is_empty() {
+        return Err("Capacity limit cannot be empty".to_string());
+    }
     let (number, unit) = limit.split_at(limit.len() - 1);
     let number: f64 = number
         .parse()
-        .map_err(|_| format!("Failed to parse number from memory limit '{limit}'"))?;
+        .map_err(|_| format!("Failed to parse number from capacity limit '{limit}'"))?;
+    if number.is_sign_negative() || number.is_infinite() {
+        return Err("Limit value should be positive finite number".to_string());
+    }
 
     match unit {
         "K" => Ok((number * 1024.0) as usize),
         "M" => Ok((number * 1024.0 * 1024.0) as usize),
         "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as usize),
         _ => Err(format!(
-            "Unsupported unit '{unit}' in memory limit '{limit}'"
+            "Unsupported unit '{unit}' in capacity limit '{limit}'. Unit must be one of: 'K', 'M', 'G'"
         )),
     }
 }
@@ -139,16 +178,59 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_parse_memory_limit_all() {
+    fn test_runtime_env_builder_reads_env_var() {
+        // Set the env var and verify runtime_env_builder picks it up
+        // when no CLI --memory-limit is provided
+        let opt = CommonOpt {
+            iterations: 3,
+            partitions: None,
+            batch_size: None,
+            mem_pool_type: "fair".to_string(),
+            memory_limit: None,
+            sort_spill_reservation_bytes: None,
+            debug: false,
+            simulate_latency: false,
+        };
+
+        // With env var set, builder should succeed and have a memory pool
+        // SAFETY: This test is single-threaded and the env var is restored after use
+        unsafe {
+            std::env::set_var("DATAFUSION_RUNTIME_MEMORY_LIMIT", "2G");
+        }
+        let builder = opt.runtime_env_builder().unwrap();
+        let runtime = builder.build().unwrap();
+        unsafe {
+            std::env::remove_var("DATAFUSION_RUNTIME_MEMORY_LIMIT");
+        }
+        // A 2G memory pool should be present — verify it reports the correct limit
+        match runtime.memory_pool.memory_limit() {
+            datafusion::execution::memory_pool::MemoryLimit::Finite(limit) => {
+                assert_eq!(limit, 2 * 1024 * 1024 * 1024);
+            }
+            _ => panic!("Expected Finite memory limit"),
+        }
+    }
+
+    #[test]
+    fn test_parse_capacity_limit_all() {
         // Test valid inputs
-        assert_eq!(parse_memory_limit("100K").unwrap(), 102400);
-        assert_eq!(parse_memory_limit("1.5M").unwrap(), 1572864);
-        assert_eq!(parse_memory_limit("2G").unwrap(), 2147483648);
+        assert_eq!(parse_capacity_limit("100K").unwrap(), 102400);
+        assert_eq!(parse_capacity_limit("1.5M").unwrap(), 1572864);
+        assert_eq!(parse_capacity_limit("2G").unwrap(), 2147483648);
 
         // Test invalid unit
-        assert!(parse_memory_limit("500X").is_err());
+        assert!(parse_capacity_limit("500X").is_err());
 
         // Test invalid number
-        assert!(parse_memory_limit("abcM").is_err());
+        assert!(parse_capacity_limit("abcM").is_err());
+
+        // Test negative number
+        assert!(parse_capacity_limit("-1M").is_err());
+
+        // Test infinite number
+        assert!(parse_capacity_limit("infM").is_err());
+
+        // Test negative infinite number
+        assert!(parse_capacity_limit("-infM").is_err());
     }
 }
diff --git a/benchmarks/src/util/run.rs b/benchmarks/src/util/run.rs
index 13969f4d39497..df17674e62961 100644
--- a/benchmarks/src/util/run.rs
+++ b/benchmarks/src/util/run.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::{error::Result, DATAFUSION_VERSION};
+use datafusion::{DATAFUSION_VERSION, error::Result};
 use datafusion_common::utils::get_available_parallelism;
 use serde::{Serialize, Serializer};
 use serde_json::Value;
@@ -90,8 +90,13 @@ pub struct BenchQuery {
     iterations: Vec<QueryIter>,
     #[serde(serialize_with = "serialize_start_time")]
     start_time: SystemTime,
+    success: bool,
+}
+/// Internal representation of a single benchmark query iteration result.
+pub struct QueryResult {
+    pub elapsed: Duration,
+    pub row_count: usize,
 }
-
 /// collects benchmark run data and then serializes it at the end
 pub struct BenchmarkRun {
     context: RunContext,
@@ -120,6 +125,7 @@ impl BenchmarkRun {
             query: id.to_owned(),
             iterations: vec![],
             start_time: SystemTime::now(),
+            success: true,
         });
         if let Some(c) = self.current_case.as_mut() {
             *c += 1;
@@ -138,6 +144,28 @@ impl BenchmarkRun {
         }
     }
 
+    /// Print the names of failed queries, if any
+    pub fn maybe_print_failures(&self) {
+        let failed_queries: Vec<&str> = self
+            .queries
+            .iter()
+            .filter_map(|q| (!q.success).then_some(q.query.as_str()))
+            .collect();
+
+        if !failed_queries.is_empty() {
+            println!("Failed Queries: {}", failed_queries.join(", "));
+        }
+    }
+
+    /// Mark current query
+    pub fn mark_failed(&mut self) {
+        if let Some(idx) = self.current_case {
+            self.queries[idx].success = false;
+        } else {
+            unreachable!("Cannot mark failure: no current case");
+        }
+    }
+
     /// Stringify data into formatted json
     pub fn to_json(&self) -> String {
         let mut output = HashMap::<&str, Value>::new();
diff --git a/ci/scripts/check_examples_docs.sh b/ci/scripts/check_examples_docs.sh
new file mode 100755
index 0000000000000..62308b323b535
--- /dev/null
+++ b/ci/scripts/check_examples_docs.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Generates documentation for DataFusion examples using the Rust-based
+# documentation generator and verifies that the committed README.md
+# is up to date.
+#
+# The README is generated from documentation comments in:
+#   datafusion-examples/examples/<group>/main.rs
+#
+# This script is intended to be run in CI to ensure that example
+# documentation stays in sync with the code.
+#
+# To update the README locally, run this script and replace README.md
+# with the generated output.
+
+set -euo pipefail
+
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+
+# Load centralized tool versions
+source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh"
+
+EXAMPLES_DIR="$ROOT_DIR/datafusion-examples"
+README="$EXAMPLES_DIR/README.md"
+README_NEW="$EXAMPLES_DIR/README-NEW.md"
+
+echo "▶ Generating examples README (Rust generator)…"
+cargo run --quiet \
+  --manifest-path "$EXAMPLES_DIR/Cargo.toml" \
+  --bin examples-docs \
+  > "$README_NEW"
+
+echo "▶ Formatting generated README with prettier ${PRETTIER_VERSION}…"
+npx "prettier@${PRETTIER_VERSION}" \
+  --parser markdown \
+  --write "$README_NEW"
+
+echo "▶ Comparing generated README with committed version…"
+
+if ! diff -u "$README" "$README_NEW" > /tmp/examples-readme.diff; then
+  echo ""
+  echo "❌ Examples README is out of date."
+  echo ""
+  echo "The examples documentation is generated automatically from:"
+  echo "  - datafusion-examples/examples/<group>/main.rs"
+  echo ""
+  echo "To update the README locally, run:"
+  echo ""
+  echo "  cargo run --bin examples-docs \\"
+  echo "    | npx prettier@${PRETTIER_VERSION} --parser markdown --write \\"
+  echo "    > datafusion-examples/README.md"
+  echo ""
+  echo "Diff:"
+  echo "------------------------------------------------------------"
+  cat /tmp/examples-readme.diff
+  echo "------------------------------------------------------------"
+  exit 1
+fi
+
+echo "✅ Examples README is up-to-date."
diff --git a/ci/scripts/doc_prettier_check.sh b/ci/scripts/doc_prettier_check.sh
new file mode 100755
index 0000000000000..95332eb65aaf2
--- /dev/null
+++ b/ci/scripts/doc_prettier_check.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+
+# Load shared utilities and tool versions
+source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh"
+source "${ROOT_DIR}/ci/scripts/utils/git.sh"
+
+PRETTIER_TARGETS=(
+  '{datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md'
+  '!datafusion/CHANGELOG.md'
+  README.md
+  CONTRIBUTING.md
+)
+
+MODE="check"
+ALLOW_DIRTY=0
+
+usage() {
+  cat >&2 <<EOF
+Usage: $SCRIPT_NAME [--write] [--allow-dirty]
+
+Runs prettier@${PRETTIER_VERSION} over markdown docs.
+--write         Run with \`--write\` to format files (requires a clean git worktree, no uncommitted changes).
+--allow-dirty   Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+echo "[${SCRIPT_NAME}] prettier@${PRETTIER_VERSION} ${MODE}"
+
+# Ensure `npx` is available
+if ! command -v npx >/dev/null 2>&1; then
+  echo "npx is required to run the prettier check. Install Node.js (e.g., brew install node) and re-run." >&2
+  exit 1
+fi
+
+PRETTIER_MODE=(--check)
+if [[ "$MODE" == "write" ]]; then
+  PRETTIER_MODE=(--write)
+fi
+
+# Ignore subproject CHANGELOG.md because it is machine generated
+npx "prettier@${PRETTIER_VERSION}" "${PRETTIER_MODE[@]}" "${PRETTIER_TARGETS[@]}"
diff --git a/ci/scripts/license_header.sh b/ci/scripts/license_header.sh
new file mode 100755
index 0000000000000..7ab8c9637598b
--- /dev/null
+++ b/ci/scripts/license_header.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+
+source "${SCRIPT_DIR}/utils/git.sh"
+
+MODE="check"
+ALLOW_DIRTY=0
+HAWKEYE_CONFIG="licenserc.toml"
+
+usage() {
+  cat >&2 <<EOF
+Usage: $SCRIPT_NAME [--write] [--allow-dirty]
+
+Checks Apache license headers with \`hawkeye check --config $HAWKEYE_CONFIG\`.
+--write         Run \`hawkeye format --config $HAWKEYE_CONFIG\` to auto-add/fix headers (requires a clean git worktree, no uncommitted changes).
+--allow-dirty   Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+if [[ "$MODE" == "write" ]]; then
+  echo "[${SCRIPT_NAME}] \`hawkeye format --config ${HAWKEYE_CONFIG}\`"
+  if ! hawkeye format --config "${HAWKEYE_CONFIG}"; then
+    status=$?
+    # hawkeye returns exit code 1 when it applies fixes; treat that as success.
+    if [[ $status -eq 1 ]]; then
+      echo "[${SCRIPT_NAME}] hawkeye format applied fixes (exit 1 treated as success)"
+    else
+      exit $status
+    fi
+  fi
+else
+  echo "[${SCRIPT_NAME}] \`hawkeye check --config ${HAWKEYE_CONFIG}\`"
+  hawkeye check --config "${HAWKEYE_CONFIG}"
+fi
diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh
index 8118ecc577007..f8b5c0852fa30 100755
--- a/ci/scripts/rust_clippy.sh
+++ b/ci/scripts/rust_clippy.sh
@@ -17,5 +17,60 @@
 # specific language governing permissions and limitations
 # under the License.
 
-set -ex
-cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests -- -D warnings
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+CLIPPY_FEATURES="avro,integration-tests,extended_tests"
+CLIPPY_ARGS=(--all-targets --workspace --features "$CLIPPY_FEATURES")
+CLIPPY_LINT_ARGS=(-- -D warnings)
+
+source "${SCRIPT_DIR}/utils/git.sh"
+
+MODE="check"
+ALLOW_DIRTY=0
+
+usage() {
+  cat >&2 <<EOF
+Usage: $SCRIPT_NAME [--write] [--allow-dirty]
+
+Runs \`cargo clippy\` to lint.
+--write         Run \`cargo clippy --fix\` to apply fixes for clippy lints (requires a clean git worktree, no uncommitted changes).
+--allow-dirty   Allow \`--write\` to run even when the git worktree has uncommitted or staged changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+CLIPPY_CMD=(cargo clippy)
+if [[ "$MODE" == "write" ]]; then
+  CLIPPY_CMD+=(--fix)
+  if [[ $ALLOW_DIRTY -eq 1 ]]; then
+    CLIPPY_CMD+=(--allow-dirty --allow-staged)
+  fi
+fi
+CLIPPY_CMD+=("${CLIPPY_ARGS[@]}" "${CLIPPY_LINT_ARGS[@]}")
+
+echo "[${SCRIPT_NAME}] \`${CLIPPY_CMD[*]}\`"
+"${CLIPPY_CMD[@]}"
diff --git a/ci/scripts/rust_docs.sh b/ci/scripts/rust_docs.sh
index e90bfdf8bc277..91cc305f513ea 100755
--- a/ci/scripts/rust_docs.sh
+++ b/ci/scripts/rust_docs.sh
@@ -17,6 +17,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Note: cargo doc does not support an auto-fix mode; this script runs the check-only build.
 set -ex
 export RUSTDOCFLAGS="-D warnings"
 cargo doc --document-private-items --no-deps --workspace
diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh
index c3efcf2cf2e92..7a5f7825b4e6d 100755
--- a/ci/scripts/rust_example.sh
+++ b/ci/scripts/rust_example.sh
@@ -25,12 +25,26 @@ export CARGO_PROFILE_CI_STRIP=true
 cd datafusion-examples/examples/
 cargo build --profile ci --examples
 
-files=$(ls .)
-for filename in $files
-do
-  example_name=`basename $filename ".rs"`
-  # Skip tests that rely on external storage and flight
-  if [ ! -d $filename ]; then
-    cargo run --profile ci --example $example_name
-  fi
+SKIP_LIST=("external_dependency" "flight" "ffi")
+
+skip_example() {
+    local name="$1"
+    for skip in "${SKIP_LIST[@]}"; do
+        if [ "$name" = "$skip" ]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+for dir in */; do
+    example_name=$(basename "$dir")
+
+    if skip_example "$example_name"; then
+        echo "Skipping $example_name"
+        continue
+    fi
+
+    echo "Running example group: $example_name"
+    cargo run --profile ci --example "$example_name" -- all
 done
diff --git a/ci/scripts/rust_fmt.sh b/ci/scripts/rust_fmt.sh
index 9d8325877aad5..16c87cea5e0fa 100755
--- a/ci/scripts/rust_fmt.sh
+++ b/ci/scripts/rust_fmt.sh
@@ -17,5 +17,52 @@
 # specific language governing permissions and limitations
 # under the License.
 
-set -ex
-cargo fmt --all -- --check
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+source "${SCRIPT_DIR}/utils/git.sh"
+
+MODE="check"
+ALLOW_DIRTY=0
+
+usage() {
+  cat >&2 <<EOF
+Usage: $0 [--write] [--allow-dirty]
+
+Runs \`cargo fmt --all -- --check\` by default to verify Rust formatting.
+--write        Run \`cargo fmt --all\` to auto-fix formatting (requires a clean git worktree, no uncommitted changes).
+--allow-dirty  Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+if [[ "$MODE" == "write" ]]; then
+  echo "[${SCRIPT_NAME}] \`cargo fmt --all\`"
+  cargo fmt --all
+else
+  echo "[${SCRIPT_NAME}] \`cargo fmt --all -- --check\`"
+  cargo fmt --all -- --check
+fi
diff --git a/ci/scripts/rust_toml_fmt.sh b/ci/scripts/rust_toml_fmt.sh
index 393ad55f41684..b1f9373fbf9ee 100755
--- a/ci/scripts/rust_toml_fmt.sh
+++ b/ci/scripts/rust_toml_fmt.sh
@@ -17,8 +17,53 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Run `taplo format` with flag `--check` in dry run to check formatting
-# without overwritng the file. If any error occur, you may want to
-# rerun `taplo format` to fix the formatting automatically.
-set -ex
-taplo format --check
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+
+source "${SCRIPT_DIR}/utils/git.sh"
+
+MODE="check"
+ALLOW_DIRTY=0
+
+usage() {
+  cat >&2 <<EOF
+Usage: $0 [--write] [--allow-dirty]
+
+Runs \`taplo format --check\` by default to verify TOML formatting.
+--write        Run \`taplo format\` to auto-fix formatting (best-effort; requires a clean git worktree, no uncommitted changes).
+--allow-dirty  Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+if [[ "$MODE" == "write" ]]; then
+  echo "[${SCRIPT_NAME}] \`taplo format\`"
+  taplo format
+else
+  echo "[${SCRIPT_NAME}] \`taplo format --check\`"
+  taplo format --check
+fi
diff --git a/ci/scripts/typos_check.sh b/ci/scripts/typos_check.sh
new file mode 100755
index 0000000000000..a567c7b44e609
--- /dev/null
+++ b/ci/scripts/typos_check.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+TYPOS_CONFIG="typos.toml"
+
+source "${SCRIPT_DIR}/utils/git.sh"
+
+MODE="check"
+ALLOW_DIRTY=0
+
+usage() {
+  cat >&2 <<EOF
+Usage: $SCRIPT_NAME [--write] [--allow-dirty]
+
+Runs \`typos --config ${TYPOS_CONFIG}\` by default to check spelling.
+--write         Run \`typos --write-changes --config ${TYPOS_CONFIG}\` to auto-fix spelling issues (requires a clean git worktree, no uncommitted changes).
+--allow-dirty   Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+if [[ "$MODE" == "write" && $ALLOW_DIRTY -eq 0 ]]; then
+  require_clean_work_tree "$SCRIPT_NAME" || exit 1
+fi
+
+if [[ "$MODE" == "write" ]]; then
+  echo "[${SCRIPT_NAME}] \`typos --write-changes --config ${TYPOS_CONFIG}\`"
+  typos --write-changes --config "${TYPOS_CONFIG}"
+else
+  echo "[${SCRIPT_NAME}] \`typos --config ${TYPOS_CONFIG}\`"
+  typos --config "${TYPOS_CONFIG}"
+fi
diff --git a/ci/scripts/utils/git.sh b/ci/scripts/utils/git.sh
new file mode 100644
index 0000000000000..b5baecda758ce
--- /dev/null
+++ b/ci/scripts/utils/git.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Ensure the repository is clean before auto-fixing files.
+require_clean_work_tree() {
+  local caller="${1:-script}"
+  if [[ -n "$(git status --porcelain)" ]]; then
+    echo "[$caller] Uncommitted changes detected. Commit or stash them, or re-run with --allow-dirty." >&2
+    return 1
+  fi
+}
diff --git a/ci/scripts/utils/tool_versions.sh b/ci/scripts/utils/tool_versions.sh
new file mode 100644
index 0000000000000..ac731ed0d5341
--- /dev/null
+++ b/ci/scripts/utils/tool_versions.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file defines centralized tool versions used by CI and development scripts.
+# It is intended to be sourced by other scripts and should not be executed directly.
+
+PRETTIER_VERSION="2.7.1"
diff --git a/clippy.toml b/clippy.toml
index 114e3bfceb272..ea3609b574c06 100644
--- a/clippy.toml
+++ b/clippy.toml
@@ -9,4 +9,14 @@ disallowed-types = [
 
 # Lowering the threshold to help prevent stack overflows (default is 16384)
 # See: https://rust-lang.github.io/rust-clippy/master/index.html#/large_futures
-future-size-threshold = 10000
\ No newline at end of file
+future-size-threshold = 10000
+
+# Be more aware of large error variants which can impact the "happy path" due
+# to large stack footprint when considering async state machines (default is 128).
+#
+# Value of 70 picked arbitrarily as something less than 100.
+#
+# See:
+# - https://github.com/apache/datafusion/issues/16652
+# - https://rust-lang.github.io/rust-clippy/master/index.html#result_large_err
+large-error-threshold = 70
diff --git a/datafusion-cli/CONTRIBUTING.md b/datafusion-cli/CONTRIBUTING.md
index 4b464dffc57ce..8be656ec4ee34 100644
--- a/datafusion-cli/CONTRIBUTING.md
+++ b/datafusion-cli/CONTRIBUTING.md
@@ -21,55 +21,40 @@
 
 ## Running Tests
 
-Tests can be run using `cargo`
+First check out test files with
 
 ```shell
-cargo test
+git submodule update --init
 ```
 
-## Running Storage Integration Tests
-
-By default, storage integration tests are not run. To run them you will need to set `TEST_STORAGE_INTEGRATION=1` and
-then provide the necessary configuration for that object store.
+Then run all the tests with
 
-For some of the tests, [snapshots](https://datafusion.apache.org/contributor-guide/testing.html#snapshot-testing) are used.
+```shell
+cargo test --all-targets
+```
 
-### AWS
+## Running Storage Integration Tests
 
-To test the S3 integration against [Minio](https://github.com/minio/minio)
+By default, storage integration tests are not run. These tests use the `testcontainers` crate to start up a local MinIO server using Docker on port 9000.
 
-First start up a container with Minio and load test files.
+To run them you will need to set `TEST_STORAGE_INTEGRATION`:
 
 ```shell
-docker run -d \
-  --name datafusion-test-minio \
-  -p 9000:9000 \
-  -e MINIO_ROOT_USER=TEST-DataFusionLogin \
-  -e MINIO_ROOT_PASSWORD=TEST-DataFusionPassword \
-  -v $(pwd)/../datafusion/core/tests/data:/source \
-  quay.io/minio/minio server /data
-
-docker exec datafusion-test-minio /bin/sh -c "\
-  mc ready local
-  mc alias set localminio http://localhost:9000 TEST-DataFusionLogin TEST-DataFusionPassword && \
-  mc mb localminio/data && \
-  mc cp -r /source/* localminio/data"
+TEST_STORAGE_INTEGRATION=1 cargo test
 ```
 
-Setup environment
+For some of the tests, [snapshots](https://datafusion.apache.org/contributor-guide/testing.html#snapshot-testing) are used.
 
-```shell
-export TEST_STORAGE_INTEGRATION=1
-export AWS_ACCESS_KEY_ID=TEST-DataFusionLogin
-export AWS_SECRET_ACCESS_KEY=TEST-DataFusionPassword
-export AWS_ENDPOINT=http://127.0.0.1:9000
-export AWS_ALLOW_HTTP=true
-```
+### AWS
 
-Note that `AWS_ENDPOINT` is set without slash at the end.
+S3 integration is tested against [Minio](https://github.com/minio/minio) with [TestContainers](https://github.com/testcontainers/testcontainers-rs)
+This requires Docker to be running on your machine and port 9000 to be free.
 
-Run tests
+If you see an error mentioning "failed to load IMDS session token" such as
 
-```shell
-cargo test
-```
+> ---- object_storage::tests::s3_object_store_builder_resolves_region_when_none_provided stdout ----
+> Error: ObjectStore(Generic { store: "S3", source: "Error getting credentials from provider: an error occurred while loading credentials: failed to load IMDS session token" })
+
+You may need to disable trying to fetch S3 credentials from the environment using the `AWS_EC2_METADATA_DISABLED`, for example:
+
+> $ AWS_EC2_METADATA_DISABLED=true TEST_STORAGE_INTEGRATION=1 cargo test
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 2eec93628b520..3fe6be964c3f6 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -37,37 +37,44 @@ backtrace = ["datafusion/backtrace"]
 [dependencies]
 arrow = { workspace = true }
 async-trait = { workspace = true }
-aws-config = "1.6.2"
-aws-credential-types = "1.2.0"
-clap = { version = "4.5.39", features = ["derive", "cargo"] }
+aws-config = "1.8.14"
+aws-credential-types = "1.2.13"
+chrono = { workspace = true }
+clap = { version = "4.5.60", features = ["cargo", "derive"] }
 datafusion = { workspace = true, features = [
     "avro",
+    "compression",
     "crypto_expressions",
     "datetime_expressions",
     "encoding_expressions",
     "nested_expressions",
     "parquet",
+    "parquet_encryption",
     "recursive_protection",
     "regex_expressions",
+    "sql",
     "unicode_expressions",
-    "compression",
 ] }
+datafusion-common = { workspace = true }
 dirs = "6.0.0"
 env_logger = { workspace = true }
 futures = { workspace = true }
+log = { workspace = true }
 mimalloc = { version = "0.1", default-features = false }
 object_store = { workspace = true, features = ["aws", "gcp", "http"] }
 parking_lot = { workspace = true }
 parquet = { workspace = true, default-features = false }
 regex = { workspace = true }
-rustyline = "16.0"
-tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] }
+rustyline = "17.0"
+tokio = { workspace = true, features = ["macros", "parking_lot", "rt", "rt-multi-thread", "signal", "sync"] }
 url = { workspace = true }
 
+[lints]
+workspace = true
+
 [dev-dependencies]
-assert_cmd = "2.0"
 ctor = { workspace = true }
 insta = { workspace = true }
 insta-cmd = "0.6.0"
-predicates = "3.0"
 rstest = { workspace = true }
+testcontainers-modules = { workspace = true, features = ["minio"] }
diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md
index ca796b525fa15..b34aa770374da 100644
--- a/datafusion-cli/README.md
+++ b/datafusion-cli/README.md
@@ -19,12 +19,15 @@
 
 <!-- Note this file is included in the crates.io page as well https://crates.io/crates/datafusion-cli -->
 
-# DataFusion Command-line Interface
+# Apache DataFusion Command-line Interface
 
-[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine.
 
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+
 # Frequently Asked Questions
 
 ## Where can I find more information?
diff --git a/datafusion-cli/examples/cli-session-context.rs b/datafusion-cli/examples/cli-session-context.rs
index 1a8f15c8731b2..6095072163870 100644
--- a/datafusion-cli/examples/cli-session-context.rs
+++ b/datafusion-cli/examples/cli-session-context.rs
@@ -23,12 +23,14 @@ use std::sync::Arc;
 use datafusion::{
     dataframe::DataFrame,
     error::DataFusionError,
-    execution::{context::SessionState, TaskContext},
+    execution::{TaskContext, context::SessionState},
     logical_expr::{LogicalPlan, LogicalPlanBuilder},
     prelude::SessionContext,
 };
 use datafusion_cli::{
-    cli_context::CliSessionContext, exec::exec_from_repl, print_options::PrintOptions,
+    cli_context::CliSessionContext, exec::exec_from_repl,
+    object_storage::instrumented::InstrumentedObjectStoreRegistry,
+    print_options::PrintOptions,
 };
 use object_store::ObjectStore;
 
@@ -89,6 +91,7 @@ pub async fn main() {
         quiet: false,
         maxrows: datafusion_cli::print_options::MaxRows::Unlimited,
         color: true,
+        instrumented_registry: Arc::new(InstrumentedObjectStoreRegistry::new()),
     };
 
     exec_from_repl(&my_ctx, &mut print_options).await.unwrap();
diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs
index 3298b7deaeba2..63b055388fdbe 100644
--- a/datafusion-cli/src/catalog.rs
+++ b/datafusion-cli/src/catalog.rs
@@ -18,13 +18,13 @@
 use std::any::Any;
 use std::sync::{Arc, Weak};
 
-use crate::object_storage::{get_object_store, AwsOptions, GcpOptions};
+use crate::object_storage::{AwsOptions, GcpOptions, get_object_store};
 
 use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
 
 use datafusion::common::plan_datafusion_err;
-use datafusion::datasource::listing::ListingTableUrl;
 use datafusion::datasource::TableProvider;
+use datafusion::datasource::listing::ListingTableUrl;
 use datafusion::error::Result;
 use datafusion::execution::context::SessionState;
 use datafusion::execution::session_state::SessionStateBuilder;
@@ -152,10 +152,10 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider {
 
     async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         let inner_table = self.inner.table(name).await;
-        if inner_table.is_ok() {
-            if let Some(inner_table) = inner_table? {
-                return Ok(Some(inner_table));
-            }
+        if inner_table.is_ok()
+            && let Some(inner_table) = inner_table?
+        {
+            return Ok(Some(inner_table));
         }
 
         // if the inner schema provider didn't have a table by
@@ -200,6 +200,7 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider {
                     table_url.scheme(),
                     url,
                     &state.default_table_options(),
+                    false,
                 )
                 .await?;
                 state.runtime_env().register_object_store(url, store);
@@ -218,17 +219,18 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider {
 }
 
 pub fn substitute_tilde(cur: String) -> String {
-    if let Some(usr_dir_path) = home_dir() {
-        if let Some(usr_dir) = usr_dir_path.to_str() {
-            if cur.starts_with('~') && !usr_dir.is_empty() {
-                return cur.replacen('~', usr_dir, 1);
-            }
-        }
+    if let Some(usr_dir_path) = home_dir()
+        && let Some(usr_dir) = usr_dir_path.to_str()
+        && cur.starts_with('~')
+        && !usr_dir.is_empty()
+    {
+        return cur.replacen('~', usr_dir, 1);
     }
     cur
 }
 #[cfg(test)]
 mod tests {
+    use std::{env, vec};
 
     use super::*;
 
@@ -284,6 +286,19 @@ mod tests {
 
     #[tokio::test]
     async fn query_s3_location_test() -> Result<()> {
+        let aws_envs = vec![
+            "AWS_ENDPOINT",
+            "AWS_ACCESS_KEY_ID",
+            "AWS_SECRET_ACCESS_KEY",
+            "AWS_ALLOW_HTTP",
+        ];
+        for aws_env in aws_envs {
+            if env::var(aws_env).is_err() {
+                eprint!("aws envs not set, skipping s3 test");
+                return Ok(());
+            }
+        }
+
         let bucket = "examples3bucket";
         let location = format!("s3://{bucket}/file.parquet");
 
@@ -344,10 +359,12 @@ mod tests {
         } else {
             "/home/user"
         };
-        env::set_var(
-            if cfg!(windows) { "USERPROFILE" } else { "HOME" },
-            test_home_path,
-        );
+        unsafe {
+            env::set_var(
+                if cfg!(windows) { "USERPROFILE" } else { "HOME" },
+                test_home_path,
+            );
+        }
         let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet";
         let expected = PathBuf::from(test_home_path)
             .join("Code")
@@ -361,12 +378,16 @@ mod tests {
             .to_string();
         let actual = substitute_tilde(input.to_string());
         assert_eq!(actual, expected);
-        match original_home {
-            Some(home_path) => env::set_var(
-                if cfg!(windows) { "USERPROFILE" } else { "HOME" },
-                home_path.to_str().unwrap(),
-            ),
-            None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }),
+        unsafe {
+            match original_home {
+                Some(home_path) => env::set_var(
+                    if cfg!(windows) { "USERPROFILE" } else { "HOME" },
+                    home_path.to_str().unwrap(),
+                ),
+                None => {
+                    env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" })
+                }
+            }
         }
     }
 }
diff --git a/datafusion-cli/src/cli_context.rs b/datafusion-cli/src/cli_context.rs
index 516929ebacf19..a6320f03fe4de 100644
--- a/datafusion-cli/src/cli_context.rs
+++ b/datafusion-cli/src/cli_context.rs
@@ -20,7 +20,7 @@ use std::sync::Arc;
 use datafusion::{
     dataframe::DataFrame,
     error::DataFusionError,
-    execution::{context::SessionState, TaskContext},
+    execution::{TaskContext, context::SessionState},
     logical_expr::LogicalPlan,
     prelude::SessionContext,
 };
diff --git a/datafusion-cli/src/command.rs b/datafusion-cli/src/command.rs
index 77bc8d3d20003..8aaa8025d1c3a 100644
--- a/datafusion-cli/src/command.rs
+++ b/datafusion-cli/src/command.rs
@@ -19,16 +19,16 @@
 
 use crate::cli_context::CliSessionContext;
 use crate::exec::{exec_and_print, exec_from_lines};
-use crate::functions::{display_all_functions, Function};
+use crate::functions::{Function, display_all_functions};
 use crate::print_format::PrintFormat;
 use crate::print_options::PrintOptions;
 use clap::ValueEnum;
 use datafusion::arrow::array::{ArrayRef, StringArray};
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::common::exec_err;
 use datafusion::common::instant::Instant;
-use datafusion::error::{DataFusionError, Result};
+use datafusion::common::{exec_datafusion_err, exec_err};
+use datafusion::error::Result;
 use std::fs::File;
 use std::io::BufReader;
 use std::str::FromStr;
@@ -46,6 +46,7 @@ pub enum Command {
     SearchFunctions(String),
     QuietMode(Option<bool>),
     OutputFormat(Option<String>),
+    ObjectStoreProfileMode(Option<String>),
 }
 
 pub enum OutputFormat {
@@ -84,9 +85,7 @@ impl Command {
             Self::Include(filename) => {
                 if let Some(filename) = filename {
                     let file = File::open(filename).map_err(|e| {
-                        DataFusionError::Execution(format!(
-                            "Error opening {filename:?} {e}"
-                        ))
+                        exec_datafusion_err!("Error opening {filename:?} {e}")
                     })?;
                     exec_from_lines(ctx, &mut BufReader::new(file), print_options)
                         .await?;
@@ -124,6 +123,29 @@ impl Command {
             Self::OutputFormat(_) => exec_err!(
                 "Unexpected change output format, this should be handled outside"
             ),
+            Self::ObjectStoreProfileMode(mode) => {
+                if let Some(mode) = mode {
+                    let profile_mode = mode
+                        .parse()
+                        .map_err(|_|
+                            exec_datafusion_err!("Failed to parse input: {mode}. Valid options are disabled, summary, trace")
+                        )?;
+                    print_options
+                        .instrumented_registry
+                        .set_instrument_mode(profile_mode);
+                    println!(
+                        "ObjectStore Profile mode set to {}",
+                        print_options.instrumented_registry.instrument_mode()
+                    );
+                } else {
+                    println!(
+                        "ObjectStore Profile mode is {}",
+                        print_options.instrumented_registry.instrument_mode()
+                    );
+                }
+
+                Ok(())
+            }
         }
     }
 
@@ -142,11 +164,15 @@ impl Command {
             Self::OutputFormat(_) => {
                 ("\\pset [NAME [VALUE]]", "set table output option\n(format)")
             }
+            Self::ObjectStoreProfileMode(_) => (
+                "\\object_store_profiling (disabled|summary|trace)",
+                "print or set object store profile mode",
+            ),
         }
     }
 }
 
-const ALL_COMMANDS: [Command; 9] = [
+const ALL_COMMANDS: [Command; 10] = [
     Command::ListTables,
     Command::DescribeTableStmt(String::new()),
     Command::Quit,
@@ -156,6 +182,7 @@ const ALL_COMMANDS: [Command; 9] = [
     Command::SearchFunctions(String::new()),
     Command::QuietMode(None),
     Command::OutputFormat(None),
+    Command::ObjectStoreProfileMode(None),
 ];
 
 fn all_commands_info() -> RecordBatch {
@@ -206,6 +233,10 @@ impl FromStr for Command {
                 Self::OutputFormat(Some(subcommand.to_string()))
             }
             ("pset", None) => Self::OutputFormat(None),
+            ("object_store_profiling", Some(mode)) => {
+                Self::ObjectStoreProfileMode(Some(mode.to_string()))
+            }
+            ("object_store_profiling", None) => Self::ObjectStoreProfileMode(None),
             _ => return Err(()),
         })
     }
@@ -246,3 +277,62 @@ impl OutputFormat {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use datafusion::prelude::SessionContext;
+
+    use crate::{
+        object_storage::instrumented::{
+            InstrumentedObjectStoreMode, InstrumentedObjectStoreRegistry,
+        },
+        print_options::MaxRows,
+    };
+
+    use super::*;
+
+    #[tokio::test]
+    async fn command_execute_profile_mode() {
+        let ctx = SessionContext::new();
+
+        let mut print_options = PrintOptions {
+            format: PrintFormat::Automatic,
+            quiet: false,
+            maxrows: MaxRows::Unlimited,
+            color: true,
+            instrumented_registry: Arc::new(InstrumentedObjectStoreRegistry::new()),
+        };
+
+        let mut cmd: Command = "object_store_profiling"
+            .parse()
+            .expect("expected parse to succeed");
+        assert!(cmd.execute(&ctx, &mut print_options).await.is_ok());
+        assert_eq!(
+            print_options.instrumented_registry.instrument_mode(),
+            InstrumentedObjectStoreMode::default()
+        );
+
+        cmd = "object_store_profiling summary"
+            .parse()
+            .expect("expected parse to succeed");
+        assert!(cmd.execute(&ctx, &mut print_options).await.is_ok());
+        assert_eq!(
+            print_options.instrumented_registry.instrument_mode(),
+            InstrumentedObjectStoreMode::Summary
+        );
+
+        cmd = "object_store_profiling trace"
+            .parse()
+            .expect("expected parse to succeed");
+        assert!(cmd.execute(&ctx, &mut print_options).await.is_ok());
+        assert_eq!(
+            print_options.instrumented_registry.instrument_mode(),
+            InstrumentedObjectStoreMode::Trace
+        );
+
+        cmd = "object_store_profiling does_not_exist"
+            .parse()
+            .expect("expected parse to succeed");
+        assert!(cmd.execute(&ctx, &mut print_options).await.is_err());
+    }
+}
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index 3c2a6e68bbe1b..09347d6d7dc2c 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -26,28 +26,28 @@ use crate::{
     object_storage::get_object_store,
     print_options::{MaxRows, PrintOptions},
 };
-use futures::StreamExt;
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::prelude::*;
-use std::io::BufReader;
-
 use datafusion::common::instant::Instant;
 use datafusion::common::{plan_datafusion_err, plan_err};
 use datafusion::config::ConfigFileType;
 use datafusion::datasource::listing::ListingTableUrl;
 use datafusion::error::{DataFusionError, Result};
+use datafusion::execution::memory_pool::MemoryConsumer;
 use datafusion::logical_expr::{DdlStatement, LogicalPlan};
 use datafusion::physical_plan::execution_plan::EmissionType;
-use datafusion::physical_plan::{execute_stream, ExecutionPlanProperties};
-use datafusion::sql::parser::{DFParser, Statement};
-use datafusion::sql::sqlparser::dialect::dialect_from_str;
-
-use datafusion::execution::memory_pool::MemoryConsumer;
 use datafusion::physical_plan::spill::get_record_batch_memory_size;
+use datafusion::physical_plan::{ExecutionPlanProperties, execute_stream};
+use datafusion::sql::parser::{DFParser, Statement};
 use datafusion::sql::sqlparser;
-use rustyline::error::ReadlineError;
+use datafusion::sql::sqlparser::dialect::dialect_from_str;
+use futures::StreamExt;
+use log::warn;
+use object_store::Error::Generic;
 use rustyline::Editor;
+use rustyline::error::ReadlineError;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::BufReader;
+use std::io::prelude::*;
 use tokio::signal;
 
 /// run and execute SQL statements and commands, against a context with the given print options
@@ -153,7 +153,7 @@ pub async fn exec_from_repl(
                                     }
                                 } else {
                                     eprintln!(
-                                        "'\\{}' is not a valid command",
+                                        "'\\{}' is not a valid command, you can use '\\?' to see all commands",
                                         &line[1..]
                                     );
                                 }
@@ -168,7 +168,10 @@ pub async fn exec_from_repl(
                         }
                     }
                 } else {
-                    eprintln!("'\\{}' is not a valid command", &line[1..]);
+                    eprintln!(
+                        "'\\{}' is not a valid command, you can use '\\?' to see all commands",
+                        &line[1..]
+                    );
                 }
             }
             Ok(line) => {
@@ -193,6 +196,7 @@ pub async fn exec_from_repl(
             }
             Err(ReadlineError::Interrupted) => {
                 println!("^C");
+                rl.helper().unwrap().reset_hint();
                 continue;
             }
             Err(ReadlineError::Eof) => {
@@ -214,7 +218,6 @@ pub(super) async fn exec_and_print(
     print_options: &PrintOptions,
     sql: String,
 ) -> Result<()> {
-    let now = Instant::now();
     let task_ctx = ctx.task_ctx();
     let options = task_ctx.session_config().options();
     let dialect = &options.sql_parser.dialect;
@@ -228,17 +231,46 @@ pub(super) async fn exec_and_print(
 
     let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?;
     for statement in statements {
-        let adjusted =
-            AdjustedPrintOptions::new(print_options.clone()).with_statement(&statement);
+        StatementExecutor::new(statement)
+            .execute(ctx, print_options)
+            .await?;
+    }
 
-        let plan = create_plan(ctx, statement).await?;
-        let adjusted = adjusted.with_plan(&plan);
+    Ok(())
+}
 
-        let df = ctx.execute_logical_plan(plan).await?;
+/// Executor for SQL statements, including special handling for S3 region detection retry logic
+struct StatementExecutor {
+    statement: Statement,
+    statement_for_retry: Option<Statement>,
+}
+
+impl StatementExecutor {
+    fn new(statement: Statement) -> Self {
+        let statement_for_retry = matches!(statement, Statement::CreateExternalTable(_))
+            .then(|| statement.clone());
+
+        Self {
+            statement,
+            statement_for_retry,
+        }
+    }
+
+    async fn execute(
+        self,
+        ctx: &dyn CliSessionContext,
+        print_options: &PrintOptions,
+    ) -> Result<()> {
+        let now = Instant::now();
+        let (df, adjusted) = self
+            .create_and_execute_logical_plan(ctx, print_options)
+            .await?;
         let physical_plan = df.create_physical_plan().await?;
+        let task_ctx = ctx.task_ctx();
+        let options = task_ctx.session_config().options();
 
         // Track memory usage for the query result if it's bounded
-        let mut reservation =
+        let reservation =
             MemoryConsumer::new("DataFusion-Cli").register(task_ctx.memory_pool());
 
         if physical_plan.boundedness().is_unbounded() {
@@ -269,7 +301,7 @@ pub(super) async fn exec_and_print(
                 let curr_num_rows = batch.num_rows();
                 // Stop collecting results if the number of rows exceeds the limit
                 // results batch should include the last batch that exceeds the limit
-                if row_count < max_rows + curr_num_rows {
+                if row_count < max_rows.saturating_add(curr_num_rows) {
                     // Try to grow the reservation to accommodate the batch in memory
                     reservation.try_grow(get_record_batch_memory_size(&batch))?;
                     results.push(batch);
@@ -285,9 +317,40 @@ pub(super) async fn exec_and_print(
             )?;
             reservation.free();
         }
+
+        Ok(())
     }
 
-    Ok(())
+    async fn create_and_execute_logical_plan(
+        mut self,
+        ctx: &dyn CliSessionContext,
+        print_options: &PrintOptions,
+    ) -> Result<(datafusion::dataframe::DataFrame, AdjustedPrintOptions)> {
+        let adjusted = AdjustedPrintOptions::new(print_options.clone())
+            .with_statement(&self.statement);
+
+        let plan = create_plan(ctx, self.statement, false).await?;
+        let adjusted = adjusted.with_plan(&plan);
+
+        let df = match ctx.execute_logical_plan(plan).await {
+            Ok(df) => Ok(df),
+            Err(DataFusionError::ObjectStore(err))
+                if matches!(err.as_ref(), Generic { store, source: _ } if "S3".eq_ignore_ascii_case(store))
+                    && self.statement_for_retry.is_some() =>
+            {
+                warn!(
+                    "S3 region is incorrect, auto-detecting the correct region (this may be slow). Consider updating your region configuration."
+                );
+                let plan =
+                    create_plan(ctx, self.statement_for_retry.take().unwrap(), true)
+                        .await?;
+                ctx.execute_logical_plan(plan).await
+            }
+            Err(e) => Err(e),
+        }?;
+
+        Ok((df, adjusted))
+    }
 }
 
 /// Track adjustments to the print options based on the plan / statement being executed
@@ -348,6 +411,7 @@ fn config_file_type_from_str(ext: &str) -> Option<ConfigFileType> {
 async fn create_plan(
     ctx: &dyn CliSessionContext,
     statement: Statement,
+    resolve_region: bool,
 ) -> Result<LogicalPlan, DataFusionError> {
     let mut plan = ctx.session_state().statement_to_plan(statement).await?;
 
@@ -362,6 +426,7 @@ async fn create_plan(
             &cmd.location,
             &cmd.options,
             format,
+            resolve_region,
         )
         .await?;
     }
@@ -374,6 +439,7 @@ async fn create_plan(
             &copy_to.output_url,
             &copy_to.options,
             format,
+            false,
         )
         .await?;
     }
@@ -412,6 +478,7 @@ pub(crate) async fn register_object_store_and_config_extensions(
     location: &String,
     options: &HashMap<String, String>,
     format: Option<ConfigFileType>,
+    resolve_region: bool,
 ) -> Result<()> {
     // Parse the location URL to extract the scheme and other components
     let table_path = ListingTableUrl::parse(location)?;
@@ -433,8 +500,14 @@ pub(crate) async fn register_object_store_and_config_extensions(
     table_options.alter_with_string_hash_map(options)?;
 
     // Retrieve the appropriate object store based on the scheme, URL, and modified table options
-    let store =
-        get_object_store(&ctx.session_state(), scheme, url, &table_options).await?;
+    let store = get_object_store(
+        &ctx.session_state(),
+        scheme,
+        url,
+        &table_options,
+        resolve_region,
+    )
+    .await?;
 
     // Register the retrieved object store in the session context's runtime environment
     ctx.register_object_store(url, store);
@@ -449,6 +522,7 @@ mod tests {
     use datafusion::common::plan_err;
 
     use datafusion::prelude::SessionContext;
+    use datafusion_common::assert_contains;
     use url::Url;
 
     async fn create_external_table_test(location: &str, sql: &str) -> Result<()> {
@@ -462,6 +536,7 @@ mod tests {
                 &cmd.location,
                 &cmd.options,
                 format,
+                false,
             )
             .await?;
         } else {
@@ -488,6 +563,7 @@ mod tests {
                 &cmd.output_url,
                 &cmd.options,
                 format,
+                false,
             )
             .await?;
         } else {
@@ -513,6 +589,19 @@ mod tests {
     }
     #[tokio::test]
     async fn copy_to_external_object_store_test() -> Result<()> {
+        let aws_envs = vec![
+            "AWS_ENDPOINT",
+            "AWS_ACCESS_KEY_ID",
+            "AWS_SECRET_ACCESS_KEY",
+            "AWS_ALLOW_HTTP",
+        ];
+        for aws_env in aws_envs {
+            if std::env::var(aws_env).is_err() {
+                eprint!("aws envs not set, skipping s3 test");
+                return Ok(());
+            }
+        }
+
         let locations = vec![
             "s3://bucket/path/file.parquet",
             "oss://bucket/path/file.parquet",
@@ -534,7 +623,7 @@ mod tests {
             let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?;
             for statement in statements {
                 //Should not fail
-                let mut plan = create_plan(&ctx, statement).await?;
+                let mut plan = create_plan(&ctx, statement, false).await?;
                 if let LogicalPlan::Copy(copy_to) = &mut plan {
                     assert_eq!(copy_to.output_url, location);
                     assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string());
@@ -617,8 +706,7 @@ mod tests {
     #[tokio::test]
     async fn create_object_store_table_gcs() -> Result<()> {
         let service_account_path = "fake_service_account_path";
-        let service_account_key =
-            "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}";
+        let service_account_key = "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}";
         let application_credentials_path = "fake_application_credentials_path";
         let location = "gcs://bucket/path/file.parquet";
 
@@ -628,15 +716,16 @@ mod tests {
         let err = create_external_table_test(location, &sql)
             .await
             .unwrap_err();
-        assert!(err.to_string().contains("os error 2"));
+        assert_contains!(err.to_string(), "os error 2");
 
         // for service_account_key
-        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'");
+        let sql = format!(
+            "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"
+        );
         let err = create_external_table_test(location, &sql)
             .await
-            .unwrap_err()
-            .to_string();
-        assert!(err.contains("No RSA key found in pem file"), "{err}");
+            .unwrap_err();
+        assert_contains!(err.to_string(), "Error reading pem file: no items found");
 
         // for application_credentials_path
         let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
@@ -644,7 +733,7 @@ mod tests {
         let err = create_external_table_test(location, &sql)
             .await
             .unwrap_err();
-        assert!(err.to_string().contains("os error 2"));
+        assert_contains!(err.to_string(), "os error 2");
 
         Ok(())
     }
@@ -666,8 +755,9 @@ mod tests {
         let location = "path/to/file.cvs";
 
         // Test with format options
-        let sql =
-            format!("CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')");
+        let sql = format!(
+            "CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')"
+        );
         create_external_table_test(location, &sql).await.unwrap();
 
         Ok(())
diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs
index 911bbf34b06f4..67f3dc28269ef 100644
--- a/datafusion-cli/src/functions.rs
+++ b/datafusion-cli/src/functions.rs
@@ -17,20 +17,26 @@
 
 //! Functions that are query-able and searchable via the `\h` command
 
+use datafusion_common::instant::Instant;
 use std::fmt;
 use std::fs::File;
 use std::str::FromStr;
 use std::sync::Arc;
 
-use arrow::array::{Int64Array, StringArray};
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::array::{
+    DurationMillisecondArray, GenericListArray, Int64Array, StringArray, StructArray,
+    TimestampMillisecondArray, UInt64Array,
+};
+use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
 use datafusion::catalog::{Session, TableFunctionImpl};
-use datafusion::common::{plan_err, Column};
-use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::common::{Column, plan_err};
 use datafusion::datasource::TableProvider;
+use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::error::Result;
+use datafusion::execution::cache::cache_manager::CacheManager;
 use datafusion::logical_expr::Expr;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::scalar::ScalarValue;
@@ -227,7 +233,7 @@ impl TableProvider for ParquetMetadataTable {
         self
     }
 
-    fn schema(&self) -> arrow::datatypes::SchemaRef {
+    fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
 
@@ -322,7 +328,7 @@ pub struct ParquetMetadataFunc {}
 impl TableFunctionImpl for ParquetMetadataFunc {
     fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
         let filename = match exprs.first() {
-            Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet')
+            Some(Expr::Literal(ScalarValue::Utf8(Some(s)), _)) => s, // single quote: parquet_metadata('x.parquet')
             Some(Expr::Column(Column { name, .. })) => name, // double quote: parquet_metadata("x.parquet")
             _ => {
                 return plan_err!(
@@ -418,7 +424,9 @@ impl TableFunctionImpl for ParquetMetadataFunc {
                     stats_max_value_arr.push(None);
                 };
                 compression_arr.push(format!("{:?}", column.compression()));
-                encodings_arr.push(format!("{:?}", column.encodings()));
+                // need to collect into Vec to format
+                let encodings: Vec<_> = column.encodings().collect();
+                encodings_arr.push(format!("{encodings:?}"));
                 index_page_offset_arr.push(column.index_page_offset());
                 dictionary_page_offset_arr.push(column.dictionary_page_offset());
                 data_page_offset_arr.push(column.data_page_offset());
@@ -460,3 +468,416 @@ impl TableFunctionImpl for ParquetMetadataFunc {
         Ok(Arc::new(parquet_metadata))
     }
 }
+
+/// METADATA_CACHE table function
+#[derive(Debug)]
+struct MetadataCacheTable {
+    schema: SchemaRef,
+    batch: RecordBatch,
+}
+
+#[async_trait]
+impl TableProvider for MetadataCacheTable {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn table_type(&self) -> datafusion::logical_expr::TableType {
+        datafusion::logical_expr::TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(MemorySourceConfig::try_new_exec(
+            &[vec![self.batch.clone()]],
+            TableProvider::schema(self),
+            projection.cloned(),
+        )?)
+    }
+}
+
+#[derive(Debug)]
+pub struct MetadataCacheFunc {
+    cache_manager: Arc<CacheManager>,
+}
+
+impl MetadataCacheFunc {
+    pub fn new(cache_manager: Arc<CacheManager>) -> Self {
+        Self { cache_manager }
+    }
+}
+
+impl TableFunctionImpl for MetadataCacheFunc {
+    fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+        if !exprs.is_empty() {
+            return plan_err!("metadata_cache should have no arguments");
+        }
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("path", DataType::Utf8, false),
+            Field::new(
+                "file_modified",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new("file_size_bytes", DataType::UInt64, false),
+            Field::new("e_tag", DataType::Utf8, true),
+            Field::new("version", DataType::Utf8, true),
+            Field::new("metadata_size_bytes", DataType::UInt64, false),
+            Field::new("hits", DataType::UInt64, false),
+            Field::new("extra", DataType::Utf8, true),
+        ]));
+
+        // construct record batch from metadata
+        let mut path_arr = vec![];
+        let mut file_modified_arr = vec![];
+        let mut file_size_bytes_arr = vec![];
+        let mut e_tag_arr = vec![];
+        let mut version_arr = vec![];
+        let mut metadata_size_bytes = vec![];
+        let mut hits_arr = vec![];
+        let mut extra_arr = vec![];
+
+        let cached_entries = self.cache_manager.get_file_metadata_cache().list_entries();
+
+        for (path, entry) in cached_entries {
+            path_arr.push(path.to_string());
+            file_modified_arr
+                .push(Some(entry.object_meta.last_modified.timestamp_millis()));
+            file_size_bytes_arr.push(entry.object_meta.size);
+            e_tag_arr.push(entry.object_meta.e_tag);
+            version_arr.push(entry.object_meta.version);
+            metadata_size_bytes.push(entry.size_bytes as u64);
+            hits_arr.push(entry.hits as u64);
+
+            let mut extra = entry
+                .extra
+                .iter()
+                .map(|(k, v)| format!("{k}={v}"))
+                .collect::<Vec<_>>();
+            extra.sort();
+            extra_arr.push(extra.join(" "));
+        }
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(StringArray::from(path_arr)),
+                Arc::new(TimestampMillisecondArray::from(file_modified_arr)),
+                Arc::new(UInt64Array::from(file_size_bytes_arr)),
+                Arc::new(StringArray::from(e_tag_arr)),
+                Arc::new(StringArray::from(version_arr)),
+                Arc::new(UInt64Array::from(metadata_size_bytes)),
+                Arc::new(UInt64Array::from(hits_arr)),
+                Arc::new(StringArray::from(extra_arr)),
+            ],
+        )?;
+
+        let metadata_cache = MetadataCacheTable { schema, batch };
+        Ok(Arc::new(metadata_cache))
+    }
+}
+
+/// STATISTICS_CACHE table function
+#[derive(Debug)]
+struct StatisticsCacheTable {
+    schema: SchemaRef,
+    batch: RecordBatch,
+}
+
+#[async_trait]
+impl TableProvider for StatisticsCacheTable {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn table_type(&self) -> datafusion::logical_expr::TableType {
+        datafusion::logical_expr::TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(MemorySourceConfig::try_new_exec(
+            &[vec![self.batch.clone()]],
+            TableProvider::schema(self),
+            projection.cloned(),
+        )?)
+    }
+}
+
+#[derive(Debug)]
+pub struct StatisticsCacheFunc {
+    cache_manager: Arc<CacheManager>,
+}
+
+impl StatisticsCacheFunc {
+    pub fn new(cache_manager: Arc<CacheManager>) -> Self {
+        Self { cache_manager }
+    }
+}
+
+impl TableFunctionImpl for StatisticsCacheFunc {
+    fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+        if !exprs.is_empty() {
+            return plan_err!("statistics_cache should have no arguments");
+        }
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("path", DataType::Utf8, false),
+            Field::new(
+                "file_modified",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new("file_size_bytes", DataType::UInt64, false),
+            Field::new("e_tag", DataType::Utf8, true),
+            Field::new("version", DataType::Utf8, true),
+            Field::new("num_rows", DataType::Utf8, false),
+            Field::new("num_columns", DataType::UInt64, false),
+            Field::new("table_size_bytes", DataType::Utf8, false),
+            Field::new("statistics_size_bytes", DataType::UInt64, false),
+        ]));
+
+        // construct record batch from metadata
+        let mut path_arr = vec![];
+        let mut file_modified_arr = vec![];
+        let mut file_size_bytes_arr = vec![];
+        let mut e_tag_arr = vec![];
+        let mut version_arr = vec![];
+        let mut num_rows_arr = vec![];
+        let mut num_columns_arr = vec![];
+        let mut table_size_bytes_arr = vec![];
+        let mut statistics_size_bytes_arr = vec![];
+
+        if let Some(file_statistics_cache) = self.cache_manager.get_file_statistic_cache()
+        {
+            for (path, entry) in file_statistics_cache.list_entries() {
+                path_arr.push(path.to_string());
+                file_modified_arr
+                    .push(Some(entry.object_meta.last_modified.timestamp_millis()));
+                file_size_bytes_arr.push(entry.object_meta.size);
+                e_tag_arr.push(entry.object_meta.e_tag);
+                version_arr.push(entry.object_meta.version);
+                num_rows_arr.push(entry.num_rows.to_string());
+                num_columns_arr.push(entry.num_columns as u64);
+                table_size_bytes_arr.push(entry.table_size_bytes.to_string());
+                statistics_size_bytes_arr.push(entry.statistics_size_bytes as u64);
+            }
+        }
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(StringArray::from(path_arr)),
+                Arc::new(TimestampMillisecondArray::from(file_modified_arr)),
+                Arc::new(UInt64Array::from(file_size_bytes_arr)),
+                Arc::new(StringArray::from(e_tag_arr)),
+                Arc::new(StringArray::from(version_arr)),
+                Arc::new(StringArray::from(num_rows_arr)),
+                Arc::new(UInt64Array::from(num_columns_arr)),
+                Arc::new(StringArray::from(table_size_bytes_arr)),
+                Arc::new(UInt64Array::from(statistics_size_bytes_arr)),
+            ],
+        )?;
+
+        let statistics_cache = StatisticsCacheTable { schema, batch };
+        Ok(Arc::new(statistics_cache))
+    }
+}
+
+/// Implementation of the `list_files_cache` table function in datafusion-cli.
+///
+/// This function returns the cached results of running a LIST command on a
+/// particular object store path for a table. The object metadata is returned as
+/// a List of Structs, with one Struct for each object. DataFusion uses these
+/// cached results to plan queries against external tables.
+///
+/// # Schema
+/// ```sql
+/// > describe select * from list_files_cache();
+/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
+/// | column_name         | data_type                                                                                                                                                                | is_nullable |
+/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
+/// | table               | Utf8                                                                                                                                                                     | NO          |
+/// | path                | Utf8                                                                                                                                                                     | NO          |
+/// | metadata_size_bytes | UInt64                                                                                                                                                                   | NO          |
+/// | expires_in          | Duration(ms)                                                                                                                                                             | YES         |
+/// | metadata_list       | List(Struct("file_path": non-null Utf8, "file_modified": non-null Timestamp(ms), "file_size_bytes": non-null UInt64, "e_tag": Utf8, "version": Utf8), field: 'metadata') | YES         |
+/// +---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
+/// ```
+#[derive(Debug)]
+struct ListFilesCacheTable {
+    schema: SchemaRef,
+    batch: RecordBatch,
+}
+
+#[async_trait]
+impl TableProvider for ListFilesCacheTable {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn table_type(&self) -> datafusion::logical_expr::TableType {
+        datafusion::logical_expr::TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(MemorySourceConfig::try_new_exec(
+            &[vec![self.batch.clone()]],
+            TableProvider::schema(self),
+            projection.cloned(),
+        )?)
+    }
+}
+
+#[derive(Debug)]
+pub struct ListFilesCacheFunc {
+    cache_manager: Arc<CacheManager>,
+}
+
+impl ListFilesCacheFunc {
+    pub fn new(cache_manager: Arc<CacheManager>) -> Self {
+        Self { cache_manager }
+    }
+}
+
+impl TableFunctionImpl for ListFilesCacheFunc {
+    fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+        if !exprs.is_empty() {
+            return plan_err!("list_files_cache should have no arguments");
+        }
+
+        let nested_fields = Fields::from(vec![
+            Field::new("file_path", DataType::Utf8, false),
+            Field::new(
+                "file_modified",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new("file_size_bytes", DataType::UInt64, false),
+            Field::new("e_tag", DataType::Utf8, true),
+            Field::new("version", DataType::Utf8, true),
+        ]);
+
+        let metadata_field =
+            Field::new("metadata", DataType::Struct(nested_fields.clone()), true);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("table", DataType::Utf8, true),
+            Field::new("path", DataType::Utf8, false),
+            Field::new("metadata_size_bytes", DataType::UInt64, false),
+            // expires field in ListFilesEntry has type Instant when set, from which we cannot get "the number of seconds", hence using Duration instead of Timestamp as data type.
+            Field::new(
+                "expires_in",
+                DataType::Duration(TimeUnit::Millisecond),
+                true,
+            ),
+            Field::new(
+                "metadata_list",
+                DataType::List(Arc::new(metadata_field.clone())),
+                true,
+            ),
+        ]));
+
+        let mut table_arr = vec![];
+        let mut path_arr = vec![];
+        let mut metadata_size_bytes_arr = vec![];
+        let mut expires_arr = vec![];
+
+        let mut file_path_arr = vec![];
+        let mut file_modified_arr = vec![];
+        let mut file_size_bytes_arr = vec![];
+        let mut etag_arr = vec![];
+        let mut version_arr = vec![];
+        let mut offsets: Vec<i32> = vec![0];
+
+        if let Some(list_files_cache) = self.cache_manager.get_list_files_cache() {
+            let now = Instant::now();
+            let mut current_offset: i32 = 0;
+
+            for (path, entry) in list_files_cache.list_entries() {
+                table_arr.push(path.table.map(|t| t.to_string()));
+                path_arr.push(path.path.to_string());
+                metadata_size_bytes_arr.push(entry.size_bytes as u64);
+                // calculates time left before entry expires
+                expires_arr.push(
+                    entry
+                        .expires
+                        .map(|t| t.duration_since(now).as_millis() as i64),
+                );
+
+                for meta in entry.metas.files.iter() {
+                    file_path_arr.push(meta.location.to_string());
+                    file_modified_arr.push(meta.last_modified.timestamp_millis());
+                    file_size_bytes_arr.push(meta.size);
+                    etag_arr.push(meta.e_tag.clone());
+                    version_arr.push(meta.version.clone());
+                }
+                current_offset += entry.metas.files.len() as i32;
+                offsets.push(current_offset);
+            }
+        }
+
+        let struct_arr = StructArray::new(
+            nested_fields,
+            vec![
+                Arc::new(StringArray::from(file_path_arr)),
+                Arc::new(TimestampMillisecondArray::from(file_modified_arr)),
+                Arc::new(UInt64Array::from(file_size_bytes_arr)),
+                Arc::new(StringArray::from(etag_arr)),
+                Arc::new(StringArray::from(version_arr)),
+            ],
+            None,
+        );
+
+        let offsets_buffer: OffsetBuffer<i32> =
+            OffsetBuffer::new(ScalarBuffer::from(Buffer::from_vec(offsets)));
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(StringArray::from(table_arr)),
+                Arc::new(StringArray::from(path_arr)),
+                Arc::new(UInt64Array::from(metadata_size_bytes_arr)),
+                Arc::new(DurationMillisecondArray::from(expires_arr)),
+                Arc::new(GenericListArray::new(
+                    Arc::new(metadata_field),
+                    offsets_buffer,
+                    Arc::new(struct_arr),
+                    None,
+                )),
+            ],
+        )?;
+
+        let list_files_cache = ListFilesCacheTable { schema, batch };
+        Ok(Arc::new(list_files_cache))
+    }
+}
diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs
index 64c34c4737369..f01d0891b964c 100644
--- a/datafusion-cli/src/helper.rs
+++ b/datafusion-cli/src/helper.rs
@@ -19,11 +19,13 @@
 //! and auto-completion for file name during creating external table.
 
 use std::borrow::Cow;
+use std::cell::Cell;
 
-use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter};
+use crate::highlighter::{Color, NoSyntaxHighlighter, SyntaxHighlighter};
 
 use datafusion::sql::parser::{DFParser, Statement};
 use datafusion::sql::sqlparser::dialect::dialect_from_str;
+use datafusion_common::config::Dialect;
 
 use rustyline::completion::{Completer, FilenameCompleter, Pair};
 use rustyline::error::ReadlineError;
@@ -32,14 +34,21 @@ use rustyline::hint::Hinter;
 use rustyline::validate::{ValidationContext, ValidationResult, Validator};
 use rustyline::{Context, Helper, Result};
 
+/// Default suggestion shown when the input line is empty.
+const DEFAULT_HINT_SUGGESTION: &str = " \\? for help, \\q to quit";
+
 pub struct CliHelper {
     completer: FilenameCompleter,
-    dialect: String,
+    dialect: Dialect,
     highlighter: Box<dyn Highlighter>,
+    /// Tracks whether to show the default hint. Set to `false` once the user
+    /// types anything, so the hint doesn't reappear after deleting back to
+    /// an empty line. Reset to `true` when the line is submitted.
+    show_hint: Cell<bool>,
 }
 
 impl CliHelper {
-    pub fn new(dialect: &str, color: bool) -> Self {
+    pub fn new(dialect: &Dialect, color: bool) -> Self {
         let highlighter: Box<dyn Highlighter> = if !color {
             Box::new(NoSyntaxHighlighter {})
         } else {
@@ -47,26 +56,32 @@ impl CliHelper {
         };
         Self {
             completer: FilenameCompleter::new(),
-            dialect: dialect.into(),
+            dialect: *dialect,
             highlighter,
+            show_hint: Cell::new(true),
         }
     }
 
-    pub fn set_dialect(&mut self, dialect: &str) {
-        if dialect != self.dialect {
-            self.dialect = dialect.to_string();
+    pub fn set_dialect(&mut self, dialect: &Dialect) {
+        if *dialect != self.dialect {
+            self.dialect = *dialect;
         }
     }
 
+    /// Re-enable the default hint for the next prompt.
+    pub fn reset_hint(&self) {
+        self.show_hint.set(true);
+    }
+
     fn validate_input(&self, input: &str) -> Result<ValidationResult> {
         if let Some(sql) = input.strip_suffix(';') {
-            let dialect = match dialect_from_str(&self.dialect) {
+            let dialect = match dialect_from_str(self.dialect) {
                 Some(dialect) => dialect,
                 None => {
                     return Ok(ValidationResult::Invalid(Some(format!(
                         "  🤔 Invalid dialect: {}",
                         self.dialect
-                    ))))
+                    ))));
                 }
             };
             let lines = split_from_semicolon(sql);
@@ -97,7 +112,7 @@ impl CliHelper {
 
 impl Default for CliHelper {
     fn default() -> Self {
-        Self::new("generic", false)
+        Self::new(&Dialect::Generic, false)
     }
 }
 
@@ -113,6 +128,14 @@ impl Highlighter for CliHelper {
 
 impl Hinter for CliHelper {
     type Hint = String;
+
+    fn hint(&self, line: &str, _pos: usize, _ctx: &Context<'_>) -> Option<String> {
+        if !line.is_empty() {
+            self.show_hint.set(false);
+        }
+        (self.show_hint.get() && line.trim().is_empty())
+            .then(|| Color::gray(DEFAULT_HINT_SUGGESTION))
+    }
 }
 
 /// returns true if the current position is after the open quote for
@@ -120,12 +143,9 @@ impl Hinter for CliHelper {
 fn is_open_quote_for_location(line: &str, pos: usize) -> bool {
     let mut sql = line[..pos].to_string();
     sql.push('\'');
-    if let Ok(stmts) = DFParser::parse_sql(&sql) {
-        if let Some(Statement::CreateExternalTable(_)) = stmts.back() {
-            return true;
-        }
-    }
-    false
+    DFParser::parse_sql(&sql).is_ok_and(|stmts| {
+        matches!(stmts.back(), Some(Statement::CreateExternalTable(_)))
+    })
 }
 
 impl Completer for CliHelper {
@@ -148,7 +168,9 @@ impl Completer for CliHelper {
 impl Validator for CliHelper {
     fn validate(&self, ctx: &mut ValidationContext<'_>) -> Result<ValidationResult> {
         let input = ctx.input().trim_end();
-        self.validate_input(input)
+        let result = self.validate_input(input);
+        self.reset_hint();
+        result
     }
 }
 
@@ -289,7 +311,7 @@ mod tests {
         );
 
         // valid in postgresql dialect
-        validator.set_dialect("postgresql");
+        validator.set_dialect(&Dialect::PostgreSQL);
         let result =
             readline_direct(Cursor::new(r"select 1 # 2;".as_bytes()), &validator)?;
         assert!(matches!(result, ValidationResult::Valid(None)));
diff --git a/datafusion-cli/src/highlighter.rs b/datafusion-cli/src/highlighter.rs
index 7a886b94740bd..adcb135bb401f 100644
--- a/datafusion-cli/src/highlighter.rs
+++ b/datafusion-cli/src/highlighter.rs
@@ -23,10 +23,11 @@ use std::{
 };
 
 use datafusion::sql::sqlparser::{
-    dialect::{dialect_from_str, Dialect, GenericDialect},
+    dialect::{Dialect, GenericDialect, dialect_from_str},
     keywords::Keyword,
     tokenizer::{Token, Tokenizer},
 };
+use datafusion_common::config;
 use rustyline::highlight::{CmdKind, Highlighter};
 
 /// The syntax highlighter.
@@ -36,8 +37,9 @@ pub struct SyntaxHighlighter {
 }
 
 impl SyntaxHighlighter {
-    pub fn new(dialect: &str) -> Self {
-        let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {}));
+    pub fn new(dialect: &config::Dialect) -> Self {
+        let dialect =
+            dialect_from_str(dialect).unwrap_or_else(|| Box::new(GenericDialect {}));
         Self { dialect }
     }
 }
@@ -79,27 +81,32 @@ impl Highlighter for SyntaxHighlighter {
 }
 
 /// Convenient utility to return strings with [ANSI color](https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124).
-struct Color {}
+pub(crate) struct Color {}
 
 impl Color {
-    fn green(s: impl Display) -> String {
+    pub(crate) fn green(s: impl Display) -> String {
         format!("\x1b[92m{s}\x1b[0m")
     }
 
-    fn red(s: impl Display) -> String {
+    pub(crate) fn red(s: impl Display) -> String {
         format!("\x1b[91m{s}\x1b[0m")
     }
+
+    pub(crate) fn gray(s: impl Display) -> String {
+        format!("\x1b[90m{s}\x1b[0m")
+    }
 }
 
 #[cfg(test)]
 mod tests {
     use super::SyntaxHighlighter;
+    use super::config::Dialect;
     use rustyline::highlight::Highlighter;
 
     #[test]
     fn highlighter_valid() {
         let s = "SElect col_a from tab_1;";
-        let highlighter = SyntaxHighlighter::new("generic");
+        let highlighter = SyntaxHighlighter::new(&Dialect::Generic);
         let out = highlighter.highlight(s, s.len());
         assert_eq!(
             "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1;",
@@ -110,7 +117,7 @@ mod tests {
     #[test]
     fn highlighter_valid_with_new_line() {
         let s = "SElect col_a from tab_1\n WHERE col_b = 'なにか';";
-        let highlighter = SyntaxHighlighter::new("generic");
+        let highlighter = SyntaxHighlighter::new(&Dialect::Generic);
         let out = highlighter.highlight(s, s.len());
         assert_eq!(
             "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1\n \u{1b}[91mWHERE\u{1b}[0m col_b = \u{1b}[92m'なにか'\u{1b}[0m;",
@@ -121,7 +128,7 @@ mod tests {
     #[test]
     fn highlighter_invalid() {
         let s = "SElect col_a from tab_1 WHERE col_b = ';";
-        let highlighter = SyntaxHighlighter::new("generic");
+        let highlighter = SyntaxHighlighter::new(&Dialect::Generic);
         let out = highlighter.highlight(s, s.len());
         assert_eq!("SElect col_a from tab_1 WHERE col_b = ';", out);
     }
diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs
index 34fba6f79304b..f0b0bc23fd73d 100644
--- a/datafusion-cli/src/lib.rs
+++ b/datafusion-cli/src/lib.rs
@@ -19,7 +19,7 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 #![doc = include_str!("../README.md")]
 pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION");
 
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index fdecb185e33e4..6bfe1160ecdd6 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -28,15 +28,20 @@ use datafusion::execution::memory_pool::{
     FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool,
 };
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::logical_expr::ExplainFormat;
 use datafusion::prelude::SessionContext;
 use datafusion_cli::catalog::DynamicObjectStoreCatalog;
-use datafusion_cli::functions::ParquetMetadataFunc;
+use datafusion_cli::functions::{
+    ListFilesCacheFunc, MetadataCacheFunc, ParquetMetadataFunc, StatisticsCacheFunc,
+};
+use datafusion_cli::object_storage::instrumented::{
+    InstrumentedObjectStoreMode, InstrumentedObjectStoreRegistry,
+};
 use datafusion_cli::{
-    exec,
+    DATAFUSION_CLI_VERSION, exec,
     pool_type::PoolType,
     print_format::PrintFormat,
     print_options::{MaxRows, PrintOptions},
-    DATAFUSION_CLI_VERSION,
 };
 
 use clap::Parser;
@@ -144,6 +149,13 @@ struct Args {
         value_parser(extract_disk_limit)
     )]
     disk_limit: Option<usize>,
+
+    #[clap(
+        long,
+        help = "Specify the default object_store_profiling mode, defaults to 'disabled'.\n[possible values: disabled, summary, trace]",
+        default_value_t = InstrumentedObjectStoreMode::Disabled
+    )]
+    object_store_profiling: InstrumentedObjectStoreMode,
 }
 
 #[tokio::main]
@@ -205,6 +217,12 @@ async fn main_inner() -> Result<()> {
         rt_builder = rt_builder.with_disk_manager_builder(builder);
     }
 
+    let instrumented_registry = Arc::new(
+        InstrumentedObjectStoreRegistry::new()
+            .with_profile_mode(args.object_store_profiling),
+    );
+    rt_builder = rt_builder.with_object_store_registry(instrumented_registry.clone());
+
     let runtime_env = rt_builder.build_arc()?;
 
     // enable dynamic file query
@@ -219,11 +237,35 @@ async fn main_inner() -> Result<()> {
     // register `parquet_metadata` table function to get metadata from parquet files
     ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}));
 
+    // register `metadata_cache` table function to get the contents of the file metadata cache
+    ctx.register_udtf(
+        "metadata_cache",
+        Arc::new(MetadataCacheFunc::new(
+            ctx.task_ctx().runtime_env().cache_manager.clone(),
+        )),
+    );
+
+    // register `statistics_cache` table function to get the contents of the file statistics cache
+    ctx.register_udtf(
+        "statistics_cache",
+        Arc::new(StatisticsCacheFunc::new(
+            ctx.task_ctx().runtime_env().cache_manager.clone(),
+        )),
+    );
+
+    ctx.register_udtf(
+        "list_files_cache",
+        Arc::new(ListFilesCacheFunc::new(
+            ctx.task_ctx().runtime_env().cache_manager.clone(),
+        )),
+    );
+
     let mut print_options = PrintOptions {
         format: args.format,
         quiet: args.quiet,
         maxrows: args.maxrows,
         color: args.color,
+        instrumented_registry: Arc::clone(&instrumented_registry),
     };
 
     let commands = args.command;
@@ -280,7 +322,7 @@ fn get_session_config(args: &Args) -> Result<SessionConfig> {
     // use easier to understand "tree" mode by default
     // if the user hasn't specified an explain format in the environment
     if env::var_os("DATAFUSION_EXPLAIN_FORMAT").is_none() {
-        config_options.explain.format = String::from("tree");
+        config_options.explain.format = ExplainFormat::Tree;
     }
 
     // in the CLI, we want to show NULL values rather the empty strings
@@ -396,9 +438,20 @@ pub fn extract_disk_limit(size: &str) -> Result<usize, String> {
 
 #[cfg(test)]
 mod tests {
+    use std::time::Duration;
+
     use super::*;
-    use datafusion::common::test_util::batches_to_string;
+    use datafusion::{
+        common::test_util::batches_to_string,
+        execution::cache::{
+            DefaultListFilesCache, cache_manager::CacheManagerConfig,
+            cache_unit::DefaultFileStatisticsCache,
+        },
+        prelude::{ParquetReadOptions, col, lit, split_part},
+    };
     use insta::assert_snapshot;
+    use object_store::memory::InMemory;
+    use url::Url;
 
     fn assert_conversion(input: &str, expected: Result<usize, String>) {
         let result = extract_memory_pool_size(input);
@@ -462,8 +515,7 @@ mod tests {
         ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}));
 
         // input with single quote
-        let sql =
-            "SELECT * FROM parquet_metadata('../datafusion/core/tests/data/fixed_size_list_array.parquet')";
+        let sql = "SELECT * FROM parquet_metadata('../datafusion/core/tests/data/fixed_size_list_array.parquet')";
         let df = ctx.sql(sql).await?;
         let rbs = df.collect().await?;
 
@@ -471,20 +523,19 @@ mod tests {
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         | filename                                                    | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type  | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings                    | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
-        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0            | 2                  | 1                     | 123             | 0         | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 0                |                      | 1               | 4               | SNAPPY      | [RLE_DICTIONARY, PLAIN, RLE] |                   | 4                      | 46               | 121                   | 123                     |
+        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0            | 2                  | 1                     | 123             | 0         | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 0                |                      | 1               | 4               | SNAPPY      | [PLAIN, RLE, RLE_DICTIONARY] |                   | 4                      | 46               | 121                   | 123                     |
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         "#);
 
         // input with double quote
-        let sql =
-            "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")";
+        let sql = "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")";
         let df = ctx.sql(sql).await?;
         let rbs = df.collect().await?;
         assert_snapshot!(batches_to_string(&rbs), @r#"
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         | filename                                                    | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type  | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings                    | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
-        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0            | 2                  | 1                     | 123             | 0         | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 0                |                      | 1               | 4               | SNAPPY      | [RLE_DICTIONARY, PLAIN, RLE] |                   | 4                      | 46               | 121                   | 123                     |
+        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0            | 2                  | 1                     | 123             | 0         | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 0                |                      | 1               | 4               | SNAPPY      | [PLAIN, RLE, RLE_DICTIONARY] |                   | 4                      | 46               | 121                   | 123                     |
         +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         "#);
 
@@ -497,8 +548,7 @@ mod tests {
         ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}));
 
         // input with string columns
-        let sql =
-            "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')";
+        let sql = "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')";
         let df = ctx.sql(sql).await?;
         let rbs = df.collect().await?;
 
@@ -506,10 +556,296 @@ mod tests {
         +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         | filename                                                        | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type       | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression        | encodings                | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
         +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
-        | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0            | 14                 | 1                     | 163             | 0         | 4           | 14         | "String"       | BYTE_ARRAY | Hello     | today     | 0                |                      | Hello           | today           | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] |                   |                        | 4                | 152                   | 163                     |
+        | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0            | 14                 | 1                     | 163             | 0         | 4           | 14         | "String"       | BYTE_ARRAY | Hello     | today     | 0                |                      | Hello           | today           | GZIP(GzipLevel(6)) | [PLAIN, RLE, BIT_PACKED] |                   |                        | 4                | 152                   | 163                     |
         +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         "#);
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_metadata_cache() -> Result<(), DataFusionError> {
+        let ctx = SessionContext::new();
+        ctx.register_udtf(
+            "metadata_cache",
+            Arc::new(MetadataCacheFunc::new(
+                ctx.task_ctx().runtime_env().cache_manager.clone(),
+            )),
+        );
+
+        ctx.register_parquet(
+            "alltypes_plain",
+            "../parquet-testing/data/alltypes_plain.parquet",
+            ParquetReadOptions::new(),
+        )
+        .await?;
+
+        ctx.register_parquet(
+            "alltypes_tiny_pages",
+            "../parquet-testing/data/alltypes_tiny_pages.parquet",
+            ParquetReadOptions::new(),
+        )
+        .await?;
+
+        ctx.register_parquet(
+            "lz4_raw_compressed_larger",
+            "../parquet-testing/data/lz4_raw_compressed_larger.parquet",
+            ParquetReadOptions::new(),
+        )
+        .await?;
+
+        ctx.sql("select * from alltypes_plain")
+            .await?
+            .collect()
+            .await?;
+        ctx.sql("select * from alltypes_tiny_pages")
+            .await?
+            .collect()
+            .await?;
+        ctx.sql("select * from lz4_raw_compressed_larger")
+            .await?
+            .collect()
+            .await?;
+
+        // initial state
+        let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, metadata_size_bytes, hits, extra from metadata_cache() order by filename";
+        let df = ctx.sql(sql).await?;
+        let rbs = df.collect().await?;
+
+        assert_snapshot!(batches_to_string(&rbs),@r"
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        | filename                          | file_size_bytes | metadata_size_bytes | hits | extra            |
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        | alltypes_plain.parquet            | 1851            | 8882                | 2    | page_index=false |
+        | alltypes_tiny_pages.parquet       | 454233          | 269074              | 2    | page_index=true  |
+        | lz4_raw_compressed_larger.parquet | 380836          | 1339                | 2    | page_index=false |
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        ");
+
+        // increase the number of hits
+        ctx.sql("select * from alltypes_plain")
+            .await?
+            .collect()
+            .await?;
+        ctx.sql("select * from alltypes_plain")
+            .await?
+            .collect()
+            .await?;
+        ctx.sql("select * from alltypes_plain")
+            .await?
+            .collect()
+            .await?;
+        ctx.sql("select * from lz4_raw_compressed_larger")
+            .await?
+            .collect()
+            .await?;
+        let sql = "select split_part(path, '/', -1) as filename, file_size_bytes, metadata_size_bytes, hits, extra from metadata_cache() order by filename";
+        let df = ctx.sql(sql).await?;
+        let rbs = df.collect().await?;
+
+        assert_snapshot!(batches_to_string(&rbs),@r"
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        | filename                          | file_size_bytes | metadata_size_bytes | hits | extra            |
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        | alltypes_plain.parquet            | 1851            | 8882                | 5    | page_index=false |
+        | alltypes_tiny_pages.parquet       | 454233          | 269074              | 2    | page_index=true  |
+        | lz4_raw_compressed_larger.parquet | 380836          | 1339                | 3    | page_index=false |
+        +-----------------------------------+-----------------+---------------------+------+------------------+
+        ");
+
+        Ok(())
+    }
+
+    /// Shows that the statistics cache is not enabled by default yet
+    /// See https://github.com/apache/datafusion/issues/19217
+    #[tokio::test]
+    async fn test_statistics_cache_default() -> Result<(), DataFusionError> {
+        let ctx = SessionContext::new();
+
+        ctx.register_udtf(
+            "statistics_cache",
+            Arc::new(StatisticsCacheFunc::new(
+                ctx.task_ctx().runtime_env().cache_manager.clone(),
+            )),
+        );
+
+        for filename in [
+            "alltypes_plain",
+            "alltypes_tiny_pages",
+            "lz4_raw_compressed_larger",
+        ] {
+            ctx.sql(
+                format!(
+                    "create external table {filename}
+                    stored as parquet
+                    location '../parquet-testing/data/{filename}.parquet'",
+                )
+                .as_str(),
+            )
+            .await?
+            .collect()
+            .await?;
+        }
+
+        // When the cache manager creates a StatisticsCache by default,
+        // the contents will show up here
+        let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
+        let df = ctx.sql(sql).await?;
+        let rbs = df.collect().await?;
+        assert_snapshot!(batches_to_string(&rbs),@r"
+        ++
+        ++
+        ");
+
+        Ok(())
+    }
+
+    // Can be removed when https://github.com/apache/datafusion/issues/19217 is resolved
+    #[tokio::test]
+    async fn test_statistics_cache_override() -> Result<(), DataFusionError> {
+        // Install a specific StatisticsCache implementation
+        let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
+        let cache_config = CacheManagerConfig::default()
+            .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+        let runtime = RuntimeEnvBuilder::new()
+            .with_cache_manager(cache_config)
+            .build()?;
+        let config = SessionConfig::new().with_collect_statistics(true);
+        let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime));
+
+        ctx.register_udtf(
+            "statistics_cache",
+            Arc::new(StatisticsCacheFunc::new(
+                ctx.task_ctx().runtime_env().cache_manager.clone(),
+            )),
+        );
+
+        for filename in [
+            "alltypes_plain",
+            "alltypes_tiny_pages",
+            "lz4_raw_compressed_larger",
+        ] {
+            ctx.sql(
+                format!(
+                    "create external table {filename}
+                    stored as parquet
+                    location '../parquet-testing/data/{filename}.parquet'",
+                )
+                .as_str(),
+            )
+            .await?
+            .collect()
+            .await?;
+        }
+
+        let sql = "SELECT split_part(path, '/', -1) as filename, file_size_bytes, num_rows, num_columns, table_size_bytes from statistics_cache() order by filename";
+        let df = ctx.sql(sql).await?;
+        let rbs = df.collect().await?;
+        assert_snapshot!(batches_to_string(&rbs),@r"
+        +-----------------------------------+-----------------+--------------+-------------+------------------+
+        | filename                          | file_size_bytes | num_rows     | num_columns | table_size_bytes |
+        +-----------------------------------+-----------------+--------------+-------------+------------------+
+        | alltypes_plain.parquet            | 1851            | Exact(8)     | 11          | Absent           |
+        | alltypes_tiny_pages.parquet       | 454233          | Exact(7300)  | 13          | Absent           |
+        | lz4_raw_compressed_larger.parquet | 380836          | Exact(10000) | 1           | Absent           |
+        +-----------------------------------+-----------------+--------------+-------------+------------------+
+        ");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_list_files_cache() -> Result<(), DataFusionError> {
+        let list_files_cache = Arc::new(DefaultListFilesCache::new(
+            1024,
+            Some(Duration::from_secs(1)),
+        ));
+
+        let rt = RuntimeEnvBuilder::new()
+            .with_cache_manager(
+                CacheManagerConfig::default()
+                    .with_list_files_cache(Some(list_files_cache)),
+            )
+            .build_arc()
+            .unwrap();
+
+        let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt);
+
+        ctx.register_object_store(
+            &Url::parse("mem://test_table").unwrap(),
+            Arc::new(InMemory::new()),
+        );
+
+        ctx.register_udtf(
+            "list_files_cache",
+            Arc::new(ListFilesCacheFunc::new(
+                ctx.task_ctx().runtime_env().cache_manager.clone(),
+            )),
+        );
+
+        ctx.sql(
+            "CREATE EXTERNAL TABLE src_table
+            STORED AS PARQUET
+            LOCATION '../parquet-testing/data/alltypes_plain.parquet'",
+        )
+        .await?
+        .collect()
+        .await?;
+
+        ctx.sql("COPY (SELECT * FROM src_table) TO 'mem://test_table/0.parquet' STORED AS PARQUET").await?.collect().await?;
+
+        ctx.sql("COPY (SELECT * FROM src_table) TO 'mem://test_table/1.parquet' STORED AS PARQUET").await?.collect().await?;
+
+        ctx.sql(
+            "CREATE EXTERNAL TABLE test_table 
+            STORED AS PARQUET
+            LOCATION 'mem://test_table/'
+        ",
+        )
+        .await?
+        .collect()
+        .await?;
+
+        let sql = "SELECT metadata_size_bytes, expires_in, metadata_list FROM list_files_cache()";
+        let df = ctx
+            .sql(sql)
+            .await?
+            .unnest_columns(&["metadata_list"])?
+            .with_column_renamed("metadata_list", "metadata")?
+            .unnest_columns(&["metadata"])?;
+
+        assert_eq!(
+            2,
+            df.clone()
+                .filter(col("expires_in").is_not_null())?
+                .count()
+                .await?
+        );
+
+        let df = df
+            .with_column_renamed(r#""metadata.file_size_bytes""#, "file_size_bytes")?
+            .with_column_renamed(r#""metadata.e_tag""#, "etag")?
+            .with_column(
+                "filename",
+                split_part(col(r#""metadata.file_path""#), lit("/"), lit(-1)),
+            )?
+            .select_columns(&[
+                "metadata_size_bytes",
+                "filename",
+                "file_size_bytes",
+                "etag",
+            ])?
+            .sort(vec![col("filename").sort(true, false)])?;
+        let rbs = df.collect().await?;
+        assert_snapshot!(batches_to_string(&rbs),@r"
+        +---------------------+-----------+-----------------+------+
+        | metadata_size_bytes | filename  | file_size_bytes | etag |
+        +---------------------+-----------+-----------------+------+
+        | 212                 | 0.parquet | 3642            | 0    |
+        | 212                 | 1.parquet | 3642            | 1    |
+        +---------------------+-----------+-----------------+------+
+        ");
+
+        Ok(())
+    }
 }
diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs
index c31310093ac6b..34787838929f1 100644
--- a/datafusion-cli/src/object_storage.rs
+++ b/datafusion-cli/src/object_storage.rs
@@ -15,29 +15,70 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::any::Any;
-use std::fmt::{Debug, Display};
-use std::sync::Arc;
-
-use datafusion::common::config::{
-    ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, TableOptions, Visit,
-};
-use datafusion::common::{config_err, exec_datafusion_err, exec_err};
-use datafusion::error::{DataFusionError, Result};
-use datafusion::execution::context::SessionState;
+pub mod instrumented;
 
 use async_trait::async_trait;
 use aws_config::BehaviorVersion;
-use aws_credential_types::provider::ProvideCredentials;
-use object_store::aws::{AmazonS3Builder, AwsCredential};
-use object_store::gcp::GoogleCloudStorageBuilder;
-use object_store::http::HttpBuilder;
-use object_store::{ClientOptions, CredentialProvider, ObjectStore};
+use aws_credential_types::provider::{
+    ProvideCredentials, SharedCredentialsProvider, error::CredentialsError,
+};
+use datafusion::{
+    common::{
+        config::ConfigEntry, config::ConfigExtension, config::ConfigField,
+        config::ExtensionOptions, config::TableOptions, config::Visit, config_err,
+        exec_datafusion_err, exec_err,
+    },
+    error::{DataFusionError, Result},
+    execution::context::SessionState,
+};
+use log::debug;
+use object_store::{
+    ClientOptions, CredentialProvider,
+    Error::Generic,
+    ObjectStore,
+    aws::{AmazonS3Builder, AmazonS3ConfigKey, AwsCredential},
+    gcp::GoogleCloudStorageBuilder,
+    http::HttpBuilder,
+};
+use std::{
+    any::Any,
+    error::Error,
+    fmt::{Debug, Display},
+    sync::Arc,
+};
 use url::Url;
 
+#[cfg(not(test))]
+use object_store::aws::resolve_bucket_region;
+
+// Provide a local mock when running tests so we don't make network calls
+#[cfg(test)]
+async fn resolve_bucket_region(
+    _bucket: &str,
+    _client_options: &ClientOptions,
+) -> object_store::Result<String> {
+    Ok("eu-central-1".to_string())
+}
+
 pub async fn get_s3_object_store_builder(
     url: &Url,
     aws_options: &AwsOptions,
+    resolve_region: bool,
+) -> Result<AmazonS3Builder> {
+    // Box the inner future to reduce the future size of this async function,
+    // which is deeply nested in the CLI's async call chain.
+    Box::pin(get_s3_object_store_builder_inner(
+        url,
+        aws_options,
+        resolve_region,
+    ))
+    .await
+}
+
+async fn get_s3_object_store_builder_inner(
+    url: &Url,
+    aws_options: &AwsOptions,
+    resolve_region: bool,
 ) -> Result<AmazonS3Builder> {
     let AwsOptions {
         access_key_id,
@@ -46,6 +87,7 @@ pub async fn get_s3_object_store_builder(
         region,
         endpoint,
         allow_http,
+        skip_signature,
     } = aws_options;
 
     let bucket_name = get_bucket_name(url)?;
@@ -54,6 +96,7 @@ pub async fn get_s3_object_store_builder(
     if let (Some(access_key_id), Some(secret_access_key)) =
         (access_key_id, secret_access_key)
     {
+        debug!("Using explicitly provided S3 access_key_id and secret_access_key");
         builder = builder
             .with_access_key_id(access_key_id)
             .with_secret_access_key(secret_access_key);
@@ -62,40 +105,49 @@ pub async fn get_s3_object_store_builder(
             builder = builder.with_token(session_token);
         }
     } else {
-        let config = aws_config::defaults(BehaviorVersion::latest()).load().await;
-        if let Some(region) = config.region() {
-            builder = builder.with_region(region.to_string());
+        debug!("Using AWS S3 SDK to determine credentials");
+        let CredentialsFromConfig {
+            region,
+            credentials,
+        } = CredentialsFromConfig::try_new().await?;
+        if let Some(region) = region {
+            builder = builder.with_region(region);
+        }
+        if let Some(credentials) = credentials {
+            let credentials = Arc::new(S3CredentialProvider { credentials });
+            builder = builder.with_credentials(credentials);
+        } else {
+            debug!("No credentials found, defaulting to skip signature ");
+            builder = builder.with_skip_signature(true);
         }
-
-        let credentials = config
-            .credentials_provider()
-            .ok_or_else(|| {
-                DataFusionError::ObjectStore(object_store::Error::Generic {
-                    store: "S3",
-                    source: "Failed to get S3 credentials from the environment".into(),
-                })
-            })?
-            .clone();
-
-        let credentials = Arc::new(S3CredentialProvider { credentials });
-        builder = builder.with_credentials(credentials);
     }
 
     if let Some(region) = region {
         builder = builder.with_region(region);
     }
 
+    // If the region is not set or auto_detect_region is true, resolve the region.
+    if builder
+        .get_config_value(&AmazonS3ConfigKey::Region)
+        .is_none()
+        || resolve_region
+    {
+        let region = resolve_bucket_region(bucket_name, &ClientOptions::new()).await?;
+        builder = builder.with_region(region);
+    }
+
     if let Some(endpoint) = endpoint {
         // Make a nicer error if the user hasn't allowed http and the endpoint
         // is http as the default message is "URL scheme is not allowed"
-        if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) {
-            if !matches!(allow_http, Some(true)) && endpoint_url.scheme() == "http" {
-                return config_err!(
-                    "Invalid endpoint: {endpoint}. \
+        if let Ok(endpoint_url) = Url::try_from(endpoint.as_str())
+            && !matches!(allow_http, Some(true))
+            && endpoint_url.scheme() == "http"
+        {
+            return config_err!(
+                "Invalid endpoint: {endpoint}. \
                 HTTP is not allowed for S3 endpoints. \
                 To allow HTTP, set 'aws.allow_http' to true"
-                );
-            }
+            );
         }
 
         builder = builder.with_endpoint(endpoint);
@@ -105,12 +157,74 @@ pub async fn get_s3_object_store_builder(
         builder = builder.with_allow_http(*allow_http);
     }
 
+    if let Some(skip_signature) = skip_signature {
+        builder = builder.with_skip_signature(*skip_signature);
+    }
+
     Ok(builder)
 }
 
+/// Credentials from the AWS SDK
+struct CredentialsFromConfig {
+    region: Option<String>,
+    credentials: Option<SharedCredentialsProvider>,
+}
+
+impl CredentialsFromConfig {
+    /// Attempt find AWS S3 credentials via the AWS SDK
+    pub async fn try_new() -> Result<Self> {
+        let config = aws_config::defaults(BehaviorVersion::latest()).load().await;
+        let region = config.region().map(|r| r.to_string());
+
+        let credentials = config
+            .credentials_provider()
+            .ok_or_else(|| {
+                DataFusionError::ObjectStore(Box::new(Generic {
+                    store: "S3",
+                    source: "Failed to get S3 credentials aws_config".into(),
+                }))
+            })?
+            .clone();
+
+        // The credential provider is lazy, so it does not fetch credentials
+        // until they are needed. To ensure that the credentials are valid,
+        // we can call `provide_credentials` here.
+        let credentials = match credentials.provide_credentials().await {
+            Ok(_) => Some(credentials),
+            Err(CredentialsError::CredentialsNotLoaded(_)) => {
+                debug!("Could not use AWS SDK to get credentials");
+                None
+            }
+            // other errors like `CredentialsError::InvalidConfiguration`
+            // should be returned to the user so they can be fixed
+            Err(e) => {
+                // Pass back underlying error to the user, including underlying source
+                let source_message = if let Some(source) = e.source() {
+                    format!(": {source}")
+                } else {
+                    String::new()
+                };
+
+                let message = format!(
+                    "Error getting credentials from provider: {e}{source_message}",
+                );
+
+                return Err(DataFusionError::ObjectStore(Box::new(Generic {
+                    store: "S3",
+                    source: message.into(),
+                })));
+            }
+        };
+        Ok(Self {
+            region,
+            credentials,
+        })
+    }
+}
+
 #[derive(Debug)]
 struct S3CredentialProvider {
-    credentials: aws_credential_types::provider::SharedCredentialsProvider,
+    credentials: SharedCredentialsProvider,
 }
 
 #[async_trait]
@@ -118,12 +232,14 @@ impl CredentialProvider for S3CredentialProvider {
     type Credential = AwsCredential;
 
     async fn get_credential(&self) -> object_store::Result<Arc<Self::Credential>> {
-        let creds = self.credentials.provide_credentials().await.map_err(|e| {
-            object_store::Error::Generic {
-                store: "S3",
-                source: Box::new(e),
-            }
-        })?;
+        let creds =
+            self.credentials
+                .provide_credentials()
+                .await
+                .map_err(|e| Generic {
+                    store: "S3",
+                    source: Box::new(e),
+                })?;
         Ok(Arc::new(AwsCredential {
             key_id: creds.access_key_id().to_string(),
             secret_key: creds.secret_access_key().to_string(),
@@ -197,10 +313,7 @@ pub fn get_gcs_object_store_builder(
 
 fn get_bucket_name(url: &Url) -> Result<&str> {
     url.host_str().ok_or_else(|| {
-        DataFusionError::Execution(format!(
-            "Not able to parse bucket name from url: {}",
-            url.as_str()
-        ))
+        exec_datafusion_err!("Not able to parse bucket name from url: {}", url.as_str())
     })
 }
 
@@ -219,6 +332,11 @@ pub struct AwsOptions {
     pub endpoint: Option<String>,
     /// Allow HTTP (otherwise will always use https)
     pub allow_http: Option<bool>,
+    /// Do not fetch credentials and do not sign requests
+    ///
+    /// This can be useful when interacting with public S3 buckets that deny
+    /// authorized requests
+    pub skip_signature: Option<bool>,
 }
 
 impl ExtensionOptions for AwsOptions {
@@ -256,6 +374,9 @@ impl ExtensionOptions for AwsOptions {
             "allow_http" => {
                 self.allow_http.set(rem, value)?;
             }
+            "skip_signature" | "nosign" => {
+                self.skip_signature.set(rem, value)?;
+            }
             _ => {
                 return config_err!("Config value \"{}\" not found on AwsOptions", rem);
             }
@@ -397,6 +518,7 @@ pub(crate) async fn get_object_store(
     scheme: &str,
     url: &Url,
     table_options: &TableOptions,
+    resolve_region: bool,
 ) -> Result<Arc<dyn ObjectStore>, DataFusionError> {
     let store: Arc<dyn ObjectStore> = match scheme {
         "s3" => {
@@ -405,7 +527,8 @@ pub(crate) async fn get_object_store(
                     "Given table options incompatible with the 's3' scheme"
                 );
             };
-            let builder = get_s3_object_store_builder(url, options).await?;
+            let builder =
+                get_s3_object_store_builder(url, options, resolve_region).await?;
             Arc::new(builder.build()?)
         }
         "oss" => {
@@ -461,7 +584,6 @@ mod tests {
 
     use super::*;
 
-    use datafusion::common::plan_err;
     use datafusion::{
         datasource::listing::ListingTableUrl,
         logical_expr::{DdlStatement, LogicalPlan},
@@ -470,6 +592,74 @@ mod tests {
 
     use object_store::{aws::AmazonS3ConfigKey, gcp::GoogleConfigKey};
 
+    #[tokio::test]
+    async fn s3_object_store_builder_default() -> Result<()> {
+        if let Err(DataFusionError::Execution(e)) = check_aws_envs().await {
+            // Skip test if AWS envs are not set
+            eprintln!("{e}");
+            return Ok(());
+        }
+
+        let location = "s3://bucket/path/FAKE/file.parquet";
+        // Set it to a non-existent file to avoid reading the default configuration file
+        unsafe {
+            std::env::set_var("AWS_CONFIG_FILE", "data/aws.config");
+            std::env::set_var("AWS_SHARED_CREDENTIALS_FILE", "data/aws.credentials");
+        }
+
+        // No options
+        let table_url = ListingTableUrl::parse(location)?;
+        let scheme = table_url.scheme();
+        let sql =
+            format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'");
+
+        let ctx = SessionContext::new();
+        ctx.register_table_options_extension_from_scheme(scheme);
+        let table_options = get_table_options(&ctx, &sql).await;
+        let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
+        let builder =
+            get_s3_object_store_builder(table_url.as_ref(), aws_options, false).await?;
+
+        // If the environment variables are set (as they are in CI) use them
+        let expected_access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
+        let expected_secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
+        let expected_region = Some(
+            std::env::var("AWS_REGION").unwrap_or_else(|_| "eu-central-1".to_string()),
+        );
+        let expected_endpoint = std::env::var("AWS_ENDPOINT").ok();
+
+        // get the actual configuration information, then assert_eq!
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::AccessKeyId),
+            expected_access_key_id
+        );
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::SecretAccessKey),
+            expected_secret_access_key
+        );
+        // Default is to skip signature when no credentials are provided
+        let expected_skip_signature =
+            if expected_access_key_id.is_none() && expected_secret_access_key.is_none() {
+                Some(String::from("true"))
+            } else {
+                Some(String::from("false"))
+            };
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::Region),
+            expected_region
+        );
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::Endpoint),
+            expected_endpoint
+        );
+        assert_eq!(builder.get_config_value(&AmazonS3ConfigKey::Token), None);
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::SkipSignature),
+            expected_skip_signature
+        );
+        Ok(())
+    }
+
     #[tokio::test]
     async fn s3_object_store_builder() -> Result<()> {
         // "fake" is uppercase to ensure the values are not lowercased when parsed
@@ -493,29 +683,27 @@ mod tests {
         );
 
         let ctx = SessionContext::new();
-        let mut plan = ctx.state().create_logical_plan(&sql).await?;
-
-        if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
-            ctx.register_table_options_extension_from_scheme(scheme);
-            let mut table_options = ctx.state().default_table_options();
-            table_options.alter_with_string_hash_map(&cmd.options)?;
-            let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
-            let builder =
-                get_s3_object_store_builder(table_url.as_ref(), aws_options).await?;
-            // get the actual configuration information, then assert_eq!
-            let config = [
-                (AmazonS3ConfigKey::AccessKeyId, access_key_id),
-                (AmazonS3ConfigKey::SecretAccessKey, secret_access_key),
-                (AmazonS3ConfigKey::Region, region),
-                (AmazonS3ConfigKey::Endpoint, endpoint),
-                (AmazonS3ConfigKey::Token, session_token),
-            ];
-            for (key, value) in config {
-                assert_eq!(value, builder.get_config_value(&key).unwrap());
-            }
-        } else {
-            return plan_err!("LogicalPlan is not a CreateExternalTable");
+        ctx.register_table_options_extension_from_scheme(scheme);
+        let table_options = get_table_options(&ctx, &sql).await;
+        let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
+        let builder =
+            get_s3_object_store_builder(table_url.as_ref(), aws_options, false).await?;
+        // get the actual configuration information, then assert_eq!
+        let config = [
+            (AmazonS3ConfigKey::AccessKeyId, access_key_id),
+            (AmazonS3ConfigKey::SecretAccessKey, secret_access_key),
+            (AmazonS3ConfigKey::Region, region),
+            (AmazonS3ConfigKey::Endpoint, endpoint),
+            (AmazonS3ConfigKey::Token, session_token),
+        ];
+        for (key, value) in config {
+            assert_eq!(value, builder.get_config_value(&key).unwrap());
         }
+        // Should not skip signature when credentials are provided
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::SkipSignature),
+            Some("false".into())
+        );
 
         Ok(())
     }
@@ -538,21 +726,18 @@ mod tests {
         );
 
         let ctx = SessionContext::new();
-        let mut plan = ctx.state().create_logical_plan(&sql).await?;
-
-        if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
-            ctx.register_table_options_extension_from_scheme(scheme);
-            let mut table_options = ctx.state().default_table_options();
-            table_options.alter_with_string_hash_map(&cmd.options)?;
-            let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
-            let err = get_s3_object_store_builder(table_url.as_ref(), aws_options)
-                .await
-                .unwrap_err();
+        ctx.register_table_options_extension_from_scheme(scheme);
 
-            assert_eq!(err.to_string().lines().next().unwrap_or_default(), "Invalid or Unsupported Configuration: Invalid endpoint: http://endpoint33. HTTP is not allowed for S3 endpoints. To allow HTTP, set 'aws.allow_http' to true");
-        } else {
-            return plan_err!("LogicalPlan is not a CreateExternalTable");
-        }
+        let table_options = get_table_options(&ctx, &sql).await;
+        let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
+        let err = get_s3_object_store_builder(table_url.as_ref(), aws_options, false)
+            .await
+            .unwrap_err();
+
+        assert_eq!(
+            err.to_string().lines().next().unwrap_or_default(),
+            "Invalid or Unsupported Configuration: Invalid endpoint: http://endpoint33. HTTP is not allowed for S3 endpoints. To allow HTTP, set 'aws.allow_http' to true"
+        );
 
         // Now add `allow_http` to the options and check if it works
         let sql = format!(
@@ -563,20 +748,75 @@ mod tests {
             'aws.allow_http' 'true'\
             ) LOCATION '{location}'"
         );
+        let table_options = get_table_options(&ctx, &sql).await;
 
-        let mut plan = ctx.state().create_logical_plan(&sql).await?;
+        let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
+        // ensure this isn't an error
+        get_s3_object_store_builder(table_url.as_ref(), aws_options, false).await?;
 
-        if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
-            ctx.register_table_options_extension_from_scheme(scheme);
-            let mut table_options = ctx.state().default_table_options();
-            table_options.alter_with_string_hash_map(&cmd.options)?;
-            let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
-            // ensure this isn't an error
-            get_s3_object_store_builder(table_url.as_ref(), aws_options).await?;
-        } else {
-            return plan_err!("LogicalPlan is not a CreateExternalTable");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn s3_object_store_builder_resolves_region_when_none_provided() -> Result<()> {
+        if let Err(DataFusionError::Execution(e)) = check_aws_envs().await {
+            // Skip test if AWS envs are not set
+            eprintln!("{e}");
+            return Ok(());
+        }
+        let location = "s3://test-bucket/path/file.parquet";
+        // Set it to a non-existent file to avoid reading the default configuration file
+        unsafe {
+            std::env::set_var("AWS_CONFIG_FILE", "data/aws.config");
+        }
+
+        let table_url = ListingTableUrl::parse(location)?;
+        let aws_options = AwsOptions {
+            region: None, // No region specified - should auto-detect
+            ..Default::default()
+        };
+
+        let builder =
+            get_s3_object_store_builder(table_url.as_ref(), &aws_options, false).await?;
+
+        // Verify that the region was auto-detected in test environment
+        assert!(
+            builder
+                .get_config_value(&AmazonS3ConfigKey::Region)
+                .is_some()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn s3_object_store_builder_overrides_region_when_resolve_region_enabled()
+    -> Result<()> {
+        if let Err(DataFusionError::Execution(e)) = check_aws_envs().await {
+            // Skip test if AWS envs are not set
+            eprintln!("{e}");
+            return Ok(());
         }
 
+        let original_region = "us-east-1";
+        let expected_region = "eu-central-1"; // This should be the auto-detected region
+        let location = "s3://test-bucket/path/file.parquet";
+
+        let table_url = ListingTableUrl::parse(location)?;
+        let aws_options = AwsOptions {
+            region: Some(original_region.to_string()), // Explicit region provided
+            ..Default::default()
+        };
+
+        let builder =
+            get_s3_object_store_builder(table_url.as_ref(), &aws_options, true).await?;
+
+        // Verify that the region was overridden by auto-detection
+        assert_eq!(
+            builder.get_config_value(&AmazonS3ConfigKey::Region),
+            Some(expected_region.to_string())
+        );
+
         Ok(())
     }
 
@@ -589,28 +829,24 @@ mod tests {
 
         let table_url = ListingTableUrl::parse(location)?;
         let scheme = table_url.scheme();
-        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'");
+        let sql = format!(
+            "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"
+        );
 
         let ctx = SessionContext::new();
-        let mut plan = ctx.state().create_logical_plan(&sql).await?;
-
-        if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
-            ctx.register_table_options_extension_from_scheme(scheme);
-            let mut table_options = ctx.state().default_table_options();
-            table_options.alter_with_string_hash_map(&cmd.options)?;
-            let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
-            let builder = get_oss_object_store_builder(table_url.as_ref(), aws_options)?;
-            // get the actual configuration information, then assert_eq!
-            let config = [
-                (AmazonS3ConfigKey::AccessKeyId, access_key_id),
-                (AmazonS3ConfigKey::SecretAccessKey, secret_access_key),
-                (AmazonS3ConfigKey::Endpoint, endpoint),
-            ];
-            for (key, value) in config {
-                assert_eq!(value, builder.get_config_value(&key).unwrap());
-            }
-        } else {
-            return plan_err!("LogicalPlan is not a CreateExternalTable");
+        ctx.register_table_options_extension_from_scheme(scheme);
+        let table_options = get_table_options(&ctx, &sql).await;
+
+        let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
+        let builder = get_oss_object_store_builder(table_url.as_ref(), aws_options)?;
+        // get the actual configuration information, then assert_eq!
+        let config = [
+            (AmazonS3ConfigKey::AccessKeyId, access_key_id),
+            (AmazonS3ConfigKey::SecretAccessKey, secret_access_key),
+            (AmazonS3ConfigKey::Endpoint, endpoint),
+        ];
+        for (key, value) in config {
+            assert_eq!(value, builder.get_config_value(&key).unwrap());
         }
 
         Ok(())
@@ -619,40 +855,66 @@ mod tests {
     #[tokio::test]
     async fn gcs_object_store_builder() -> Result<()> {
         let service_account_path = "fake_service_account_path";
-        let service_account_key =
-            "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}";
+        let service_account_key = "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}";
         let application_credentials_path = "fake_application_credentials_path";
         let location = "gcs://bucket/path/file.parquet";
 
         let table_url = ListingTableUrl::parse(location)?;
         let scheme = table_url.scheme();
-        let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'");
+        let sql = format!(
+            "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"
+        );
 
         let ctx = SessionContext::new();
-        let mut plan = ctx.state().create_logical_plan(&sql).await?;
-
-        if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
-            ctx.register_table_options_extension_from_scheme(scheme);
-            let mut table_options = ctx.state().default_table_options();
-            table_options.alter_with_string_hash_map(&cmd.options)?;
-            let gcp_options = table_options.extensions.get::<GcpOptions>().unwrap();
-            let builder = get_gcs_object_store_builder(table_url.as_ref(), gcp_options)?;
-            // get the actual configuration information, then assert_eq!
-            let config = [
-                (GoogleConfigKey::ServiceAccount, service_account_path),
-                (GoogleConfigKey::ServiceAccountKey, service_account_key),
-                (
-                    GoogleConfigKey::ApplicationCredentials,
-                    application_credentials_path,
-                ),
-            ];
-            for (key, value) in config {
-                assert_eq!(value, builder.get_config_value(&key).unwrap());
-            }
-        } else {
-            return plan_err!("LogicalPlan is not a CreateExternalTable");
+        ctx.register_table_options_extension_from_scheme(scheme);
+        let table_options = get_table_options(&ctx, &sql).await;
+
+        let gcp_options = table_options.extensions.get::<GcpOptions>().unwrap();
+        let builder = get_gcs_object_store_builder(table_url.as_ref(), gcp_options)?;
+        // get the actual configuration information, then assert_eq!
+        let config = [
+            (GoogleConfigKey::ServiceAccount, service_account_path),
+            (GoogleConfigKey::ServiceAccountKey, service_account_key),
+            (
+                GoogleConfigKey::ApplicationCredentials,
+                application_credentials_path,
+            ),
+        ];
+        for (key, value) in config {
+            assert_eq!(value, builder.get_config_value(&key).unwrap());
         }
 
         Ok(())
     }
+
+    /// Plans the `CREATE EXTERNAL TABLE` SQL statement and returns the
+    /// resulting resolved `CreateExternalTable` command.
+    async fn get_table_options(ctx: &SessionContext, sql: &str) -> TableOptions {
+        let mut plan = ctx.state().create_logical_plan(sql).await.unwrap();
+
+        let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan else {
+            panic!("plan is not a CreateExternalTable");
+        };
+
+        let mut table_options = ctx.state().default_table_options();
+        table_options
+            .alter_with_string_hash_map(&cmd.options)
+            .unwrap();
+        table_options
+    }
+
+    async fn check_aws_envs() -> Result<()> {
+        let aws_envs = [
+            "AWS_ACCESS_KEY_ID",
+            "AWS_SECRET_ACCESS_KEY",
+            "AWS_REGION",
+            "AWS_ALLOW_HTTP",
+        ];
+        for aws_env in aws_envs {
+            std::env::var(aws_env).map_err(|_| {
+                exec_datafusion_err!("aws envs not set, skipping s3 tests")
+            })?;
+        }
+        Ok(())
+    }
 }
diff --git a/datafusion-cli/src/object_storage/instrumented.rs b/datafusion-cli/src/object_storage/instrumented.rs
new file mode 100644
index 0000000000000..a0321cacb374b
--- /dev/null
+++ b/datafusion-cli/src/object_storage/instrumented.rs
@@ -0,0 +1,1388 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{
+    cmp, fmt,
+    ops::AddAssign,
+    str::FromStr,
+    sync::{
+        Arc,
+        atomic::{AtomicU8, AtomicU64, Ordering},
+    },
+    time::Duration,
+};
+
+use arrow::array::{ArrayRef, RecordBatch, StringArray};
+use arrow::util::pretty::pretty_format_batches;
+use async_trait::async_trait;
+use chrono::Utc;
+use datafusion::{
+    common::{HashMap, instant::Instant},
+    error::DataFusionError,
+    execution::object_store::{DefaultObjectStoreRegistry, ObjectStoreRegistry},
+};
+use futures::stream::{BoxStream, Stream};
+use futures::{StreamExt, TryStreamExt};
+use object_store::{
+    CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload,
+    ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload,
+    PutResult, Result, path::Path,
+};
+use parking_lot::{Mutex, RwLock};
+use url::Url;
+
+/// A stream wrapper that measures the time until the first response(item or end of stream) is yielded.
+///
+/// The timer starts on the first `poll_next` call (not at stream creation) to avoid
+/// measuring unrelated work between stream creation and first poll.
+/// Duration is stored as nanoseconds in an `AtomicU64` (0 = not yet set).
+struct TimeToFirstItemStream<S> {
+    inner: S,
+    start: Option<Instant>,
+    request_duration: Arc<AtomicU64>,
+    duration_recorded: bool,
+}
+
+impl<S> TimeToFirstItemStream<S> {
+    fn new(inner: S, request_duration: Arc<AtomicU64>) -> Self {
+        Self {
+            inner,
+            start: None,
+            request_duration,
+            duration_recorded: false,
+        }
+    }
+}
+
+impl<S> Stream for TimeToFirstItemStream<S>
+where
+    S: Stream<Item = Result<ObjectMeta>> + Unpin,
+{
+    type Item = Result<ObjectMeta>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        let start = *self.start.get_or_insert_with(Instant::now);
+
+        let poll_result = std::pin::Pin::new(&mut self.inner).poll_next(cx);
+
+        if !self.duration_recorded && poll_result.is_ready() {
+            self.duration_recorded = true;
+            let nanos = start.elapsed().as_nanos() as u64;
+            self.request_duration.store(nanos, Ordering::Release);
+        }
+
+        poll_result
+    }
+}
+
+/// The profiling mode to use for an [`InstrumentedObjectStore`] instance. Collecting profiling
+/// data will have a small negative impact on both CPU and memory usage. Default is `Disabled`
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+pub enum InstrumentedObjectStoreMode {
+    /// Disable collection of profiling data
+    #[default]
+    Disabled,
+    /// Enable collection of profiling data and output a summary
+    Summary,
+    /// Enable collection of profiling data and output a summary and all details
+    Trace,
+}
+
+impl fmt::Display for InstrumentedObjectStoreMode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl FromStr for InstrumentedObjectStoreMode {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "disabled" => Ok(Self::Disabled),
+            "summary" => Ok(Self::Summary),
+            "trace" => Ok(Self::Trace),
+            _ => Err(DataFusionError::Execution(format!("Unrecognized mode {s}"))),
+        }
+    }
+}
+
+impl From<u8> for InstrumentedObjectStoreMode {
+    fn from(value: u8) -> Self {
+        match value {
+            1 => InstrumentedObjectStoreMode::Summary,
+            2 => InstrumentedObjectStoreMode::Trace,
+            _ => InstrumentedObjectStoreMode::Disabled,
+        }
+    }
+}
+
+/// Wrapped [`ObjectStore`] instances that record information for reporting on the usage of the
+/// inner [`ObjectStore`]
+#[derive(Debug)]
+pub struct InstrumentedObjectStore {
+    inner: Arc<dyn ObjectStore>,
+    instrument_mode: AtomicU8,
+    requests: Arc<Mutex<Vec<RequestDetails>>>,
+}
+
+impl InstrumentedObjectStore {
+    /// Returns a new [`InstrumentedObjectStore`] that wraps the provided [`ObjectStore`]
+    fn new(object_store: Arc<dyn ObjectStore>, instrument_mode: AtomicU8) -> Self {
+        Self {
+            inner: object_store,
+            instrument_mode,
+            requests: Arc::new(Mutex::new(Vec::new())),
+        }
+    }
+
+    fn set_instrument_mode(&self, mode: InstrumentedObjectStoreMode) {
+        self.instrument_mode.store(mode as u8, Ordering::Relaxed)
+    }
+
+    /// Returns all [`RequestDetails`] accumulated in this [`InstrumentedObjectStore`] and clears
+    /// the stored requests
+    pub fn take_requests(&self) -> Vec<RequestDetails> {
+        let mut req = self.requests.lock();
+
+        req.drain(..).collect()
+    }
+
+    fn enabled(&self) -> bool {
+        self.instrument_mode.load(Ordering::Relaxed)
+            != InstrumentedObjectStoreMode::Disabled as u8
+    }
+
+    async fn instrumented_put_opts(
+        &self,
+        location: &Path,
+        payload: PutPayload,
+        opts: PutOptions,
+    ) -> Result<PutResult> {
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        let size = payload.content_length();
+        let ret = self.inner.put_opts(location, payload, opts).await?;
+        let elapsed = start.elapsed();
+
+        self.requests.lock().push(RequestDetails {
+            op: Operation::Put,
+            path: location.clone(),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size: Some(size),
+            range: None,
+            extra_display: None,
+        });
+
+        Ok(ret)
+    }
+
+    async fn instrumented_put_multipart(
+        &self,
+        location: &Path,
+        opts: PutMultipartOptions,
+    ) -> Result<Box<dyn MultipartUpload>> {
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        let ret = self.inner.put_multipart_opts(location, opts).await?;
+        let elapsed = start.elapsed();
+
+        self.requests.lock().push(RequestDetails {
+            op: Operation::Put,
+            path: location.clone(),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size: None,
+            range: None,
+            extra_display: None,
+        });
+
+        Ok(ret)
+    }
+
+    async fn instrumented_get_opts(
+        &self,
+        location: &Path,
+        options: GetOptions,
+    ) -> Result<GetResult> {
+        let timestamp = Utc::now();
+        let range = options.range.clone();
+
+        let head = options.head;
+        let start = Instant::now();
+        let ret = self.inner.get_opts(location, options).await?;
+        let elapsed = start.elapsed();
+
+        let (op, size) = if head {
+            (Operation::Head, None)
+        } else {
+            (
+                Operation::Get,
+                Some((ret.range.end - ret.range.start) as usize),
+            )
+        };
+
+        self.requests.lock().push(RequestDetails {
+            op,
+            path: location.clone(),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size,
+            range,
+            extra_display: None,
+        });
+
+        Ok(ret)
+    }
+
+    fn instrumented_delete_stream(
+        &self,
+        locations: BoxStream<'static, Result<Path>>,
+    ) -> BoxStream<'static, Result<Path>> {
+        let requests_captured = Arc::clone(&self.requests);
+
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        self.inner
+            .delete_stream(locations)
+            .and_then(move |location| {
+                let elapsed = start.elapsed();
+                requests_captured.lock().push(RequestDetails {
+                    op: Operation::Delete,
+                    path: location.clone(),
+                    timestamp,
+                    duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+                    size: None,
+                    range: None,
+                    extra_display: None,
+                });
+                futures::future::ok(location)
+            })
+            .boxed()
+    }
+
+    fn instrumented_list(
+        &self,
+        prefix: Option<&Path>,
+    ) -> BoxStream<'static, Result<ObjectMeta>> {
+        let timestamp = Utc::now();
+        let inner_stream = self.inner.list(prefix);
+
+        let duration_nanos = Arc::new(AtomicU64::new(0));
+        self.requests.lock().push(RequestDetails {
+            op: Operation::List,
+            path: prefix.cloned().unwrap_or_else(|| Path::from("")),
+            timestamp,
+            duration_nanos: Arc::clone(&duration_nanos),
+            size: None,
+            range: None,
+            extra_display: None,
+        });
+
+        Box::pin(TimeToFirstItemStream::new(inner_stream, duration_nanos))
+    }
+
+    async fn instrumented_list_with_delimiter(
+        &self,
+        prefix: Option<&Path>,
+    ) -> Result<ListResult> {
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        let ret = self.inner.list_with_delimiter(prefix).await?;
+        let elapsed = start.elapsed();
+
+        self.requests.lock().push(RequestDetails {
+            op: Operation::List,
+            path: prefix.cloned().unwrap_or_else(|| Path::from("")),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size: None,
+            range: None,
+            extra_display: None,
+        });
+
+        Ok(ret)
+    }
+
+    async fn instrumented_copy(&self, from: &Path, to: &Path) -> Result<()> {
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        self.inner.copy(from, to).await?;
+        let elapsed = start.elapsed();
+
+        self.requests.lock().push(RequestDetails {
+            op: Operation::Copy,
+            path: from.clone(),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size: None,
+            range: None,
+            extra_display: Some(format!("copy_to: {to}")),
+        });
+
+        Ok(())
+    }
+
+    async fn instrumented_copy_if_not_exists(
+        &self,
+        from: &Path,
+        to: &Path,
+    ) -> Result<()> {
+        let timestamp = Utc::now();
+        let start = Instant::now();
+        self.inner.copy_if_not_exists(from, to).await?;
+        let elapsed = start.elapsed();
+
+        self.requests.lock().push(RequestDetails {
+            op: Operation::Copy,
+            path: from.clone(),
+            timestamp,
+            duration_nanos: Arc::new(AtomicU64::new(elapsed.as_nanos() as u64)),
+            size: None,
+            range: None,
+            extra_display: Some(format!("copy_to: {to}")),
+        });
+
+        Ok(())
+    }
+}
+
+impl fmt::Display for InstrumentedObjectStore {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mode: InstrumentedObjectStoreMode =
+            self.instrument_mode.load(Ordering::Relaxed).into();
+        write!(
+            f,
+            "Instrumented Object Store: instrument_mode: {mode}, inner: {}",
+            self.inner
+        )
+    }
+}
+
+#[async_trait]
+impl ObjectStore for InstrumentedObjectStore {
+    async fn put_opts(
+        &self,
+        location: &Path,
+        payload: PutPayload,
+        opts: PutOptions,
+    ) -> Result<PutResult> {
+        if self.enabled() {
+            return self.instrumented_put_opts(location, payload, opts).await;
+        }
+
+        self.inner.put_opts(location, payload, opts).await
+    }
+
+    async fn put_multipart_opts(
+        &self,
+        location: &Path,
+        opts: PutMultipartOptions,
+    ) -> Result<Box<dyn MultipartUpload>> {
+        if self.enabled() {
+            return self.instrumented_put_multipart(location, opts).await;
+        }
+
+        self.inner.put_multipart_opts(location, opts).await
+    }
+
+    async fn get_opts(&self, location: &Path, options: GetOptions) -> Result<GetResult> {
+        if self.enabled() {
+            return self.instrumented_get_opts(location, options).await;
+        }
+
+        self.inner.get_opts(location, options).await
+    }
+
+    fn delete_stream(
+        &self,
+        locations: BoxStream<'static, Result<Path>>,
+    ) -> BoxStream<'static, Result<Path>> {
+        if self.enabled() {
+            return self.instrumented_delete_stream(locations);
+        }
+
+        self.inner.delete_stream(locations)
+    }
+
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
+        if self.enabled() {
+            return self.instrumented_list(prefix);
+        }
+
+        self.inner.list(prefix)
+    }
+
+    async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> {
+        if self.enabled() {
+            return self.instrumented_list_with_delimiter(prefix).await;
+        }
+
+        self.inner.list_with_delimiter(prefix).await
+    }
+
+    async fn copy_opts(
+        &self,
+        from: &Path,
+        to: &Path,
+        options: CopyOptions,
+    ) -> Result<()> {
+        if self.enabled() {
+            return match options.mode {
+                object_store::CopyMode::Create => {
+                    self.instrumented_copy_if_not_exists(from, to).await
+                }
+                object_store::CopyMode::Overwrite => {
+                    self.instrumented_copy(from, to).await
+                }
+            };
+        }
+
+        self.inner.copy_opts(from, to, options).await
+    }
+}
+
+/// Object store operation types tracked by [`InstrumentedObjectStore`]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum Operation {
+    Copy,
+    Delete,
+    Get,
+    Head,
+    List,
+    Put,
+}
+
+impl fmt::Display for Operation {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+/// Holds profiling details about individual requests made through an [`InstrumentedObjectStore`]
+pub struct RequestDetails {
+    op: Operation,
+    path: Path,
+    timestamp: chrono::DateTime<Utc>,
+    /// Duration stored as nanoseconds in an AtomicU64. 0 means not yet set.
+    duration_nanos: Arc<AtomicU64>,
+    size: Option<usize>,
+    range: Option<GetRange>,
+    extra_display: Option<String>,
+}
+
+impl fmt::Debug for RequestDetails {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("RequestDetails")
+            .field("op", &self.op)
+            .field("path", &self.path)
+            .field("timestamp", &self.timestamp)
+            .field("duration", &self.duration())
+            .field("size", &self.size)
+            .field("range", &self.range)
+            .field("extra_display", &self.extra_display)
+            .finish()
+    }
+}
+
+impl RequestDetails {
+    fn duration(&self) -> Option<Duration> {
+        let nanos = self.duration_nanos.load(Ordering::Acquire);
+        if nanos == 0 {
+            None
+        } else {
+            Some(Duration::from_nanos(nanos))
+        }
+    }
+}
+
+impl fmt::Display for RequestDetails {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut output_parts = vec![format!(
+            "{} operation={:?}",
+            self.timestamp.to_rfc3339(),
+            self.op
+        )];
+
+        if let Some(d) = self.duration() {
+            output_parts.push(format!("duration={:.6}s", d.as_secs_f32()));
+        }
+        if let Some(s) = self.size {
+            output_parts.push(format!("size={s}"));
+        }
+        if let Some(r) = &self.range {
+            output_parts.push(format!("range: {r}"));
+        }
+        output_parts.push(format!("path={}", self.path));
+
+        if let Some(ed) = &self.extra_display {
+            output_parts.push(ed.clone());
+        }
+
+        write!(f, "{}", output_parts.join(" "))
+    }
+}
+
+/// Summary statistics for all requests recorded in an [`InstrumentedObjectStore`]
+#[derive(Default)]
+pub struct RequestSummaries {
+    summaries: Vec<RequestSummary>,
+}
+
+/// Display the summary as a table
+impl fmt::Display for RequestSummaries {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Don't expect an error, but avoid panicking if it happens
+        match pretty_format_batches(&[self.to_batch()]) {
+            Err(e) => {
+                write!(f, "Error formatting summary: {e}")
+            }
+            Ok(displayable) => {
+                write!(f, "{displayable}")
+            }
+        }
+    }
+}
+
+impl RequestSummaries {
+    /// Summarizes input [`RequestDetails`]
+    pub fn new(requests: &[RequestDetails]) -> Self {
+        let mut summaries: HashMap<Operation, RequestSummary> = HashMap::new();
+        for rd in requests {
+            match summaries.get_mut(&rd.op) {
+                Some(rs) => rs.push(rd),
+                None => {
+                    let mut rs = RequestSummary::new(rd.op);
+                    rs.push(rd);
+                    summaries.insert(rd.op, rs);
+                }
+            }
+        }
+        // Convert to a Vec with consistent ordering
+        let mut summaries: Vec<RequestSummary> = summaries.into_values().collect();
+        summaries.sort_by_key(|s| s.operation);
+        Self { summaries }
+    }
+
+    /// Convert the summaries into a `RecordBatch` for display
+    ///
+    /// Results in a table like:
+    /// ```text
+    /// +-----------+----------+-----------+-----------+-----------+-----------+-----------+
+    /// | Operation | Metric   | min       | max       | avg       | sum       | count     |
+    /// +-----------+----------+-----------+-----------+-----------+-----------+-----------+
+    /// | Get       | duration | 5.000000s | 5.000000s | 5.000000s |           | 1         |
+    /// | Get       | size     | 100 B     | 100 B     | 100 B     | 100 B     | 1         |
+    /// +-----------+----------+-----------+-----------+-----------+-----------+-----------+
+    /// ```
+    pub fn to_batch(&self) -> RecordBatch {
+        let operations: StringArray = self
+            .iter()
+            .flat_map(|s| std::iter::repeat_n(Some(s.operation.to_string()), 2))
+            .collect();
+        let metrics: StringArray = self
+            .iter()
+            .flat_map(|_s| [Some("duration"), Some("size")])
+            .collect();
+        let mins: StringArray = self
+            .stats_iter()
+            .flat_map(|(duration_stats, size_stats)| {
+                let dur_min =
+                    duration_stats.map(|d| format!("{:.6}s", d.min.as_secs_f32()));
+                let size_min = size_stats.map(|s| format!("{} B", s.min));
+                [dur_min, size_min]
+            })
+            .collect();
+        let maxs: StringArray = self
+            .stats_iter()
+            .flat_map(|(duration_stats, size_stats)| {
+                let dur_max =
+                    duration_stats.map(|d| format!("{:.6}s", d.max.as_secs_f32()));
+                let size_max = size_stats.map(|s| format!("{} B", s.max));
+                [dur_max, size_max]
+            })
+            .collect();
+        let avgs: StringArray = self
+            .iter()
+            .flat_map(|s| {
+                let count = s.count as f32;
+                let duration_stats = s.duration_stats.as_ref();
+                let size_stats = s.size_stats.as_ref();
+                let dur_avg = duration_stats.map(|d| {
+                    let avg = d.sum.as_secs_f32() / count;
+                    format!("{avg:.6}s")
+                });
+                let size_avg = size_stats.map(|s| {
+                    let avg = s.sum as f32 / count;
+                    format!("{avg} B")
+                });
+                [dur_avg, size_avg]
+            })
+            .collect();
+        let sums: StringArray = self
+            .stats_iter()
+            .flat_map(|(duration_stats, size_stats)| {
+                // Omit a sum stat for duration in the initial
+                // implementation because it can be a bit misleading (at least
+                // at first glance). For example, particularly large queries the
+                // sum of the durations was often larger than the total time of
+                // the query itself, can be confusing without additional
+                // explanation (e.g. that the sum is of individual requests,
+                // which may be concurrent).
+                let dur_sum =
+                    duration_stats.map(|d| format!("{:.6}s", d.sum.as_secs_f32()));
+                let size_sum = size_stats.map(|s| format!("{} B", s.sum));
+                [dur_sum, size_sum]
+            })
+            .collect();
+        let counts: StringArray = self
+            .iter()
+            .flat_map(|s| {
+                let count = s.count.to_string();
+                [Some(count.clone()), Some(count)]
+            })
+            .collect();
+
+        RecordBatch::try_from_iter(vec![
+            ("Operation", Arc::new(operations) as ArrayRef),
+            ("Metric", Arc::new(metrics) as ArrayRef),
+            ("min", Arc::new(mins) as ArrayRef),
+            ("max", Arc::new(maxs) as ArrayRef),
+            ("avg", Arc::new(avgs) as ArrayRef),
+            ("sum", Arc::new(sums) as ArrayRef),
+            ("count", Arc::new(counts) as ArrayRef),
+        ])
+        .expect("Created the batch correctly")
+    }
+
+    /// Return an iterator over the summaries
+    fn iter(&self) -> impl Iterator<Item = &RequestSummary> {
+        self.summaries.iter()
+    }
+
+    /// Return an iterator over (duration_stats, size_stats) tuples
+    /// for each summary
+    fn stats_iter(
+        &self,
+    ) -> impl Iterator<Item = (Option<&Stats<Duration>>, Option<&Stats<usize>>)> {
+        self.summaries
+            .iter()
+            .map(|s| (s.duration_stats.as_ref(), s.size_stats.as_ref()))
+    }
+}
+
+/// Summary statistics for a particular type of [`Operation`] (e.g. `GET` or `PUT`)
+/// in an [`InstrumentedObjectStore`]'s [`RequestDetails`]
+pub struct RequestSummary {
+    operation: Operation,
+    count: usize,
+    duration_stats: Option<Stats<Duration>>,
+    size_stats: Option<Stats<usize>>,
+}
+
+impl RequestSummary {
+    fn new(operation: Operation) -> Self {
+        Self {
+            operation,
+            count: 0,
+            duration_stats: None,
+            size_stats: None,
+        }
+    }
+    fn push(&mut self, request: &RequestDetails) {
+        self.count += 1;
+        if let Some(dur) = request.duration() {
+            self.duration_stats.get_or_insert_default().push(dur)
+        }
+        if let Some(size) = request.size {
+            self.size_stats.get_or_insert_default().push(size)
+        }
+    }
+}
+
+struct Stats<T: Copy + Ord + AddAssign<T>> {
+    min: T,
+    max: T,
+    sum: T,
+}
+
+impl<T: Copy + Ord + AddAssign<T>> Stats<T> {
+    fn push(&mut self, val: T) {
+        self.min = cmp::min(val, self.min);
+        self.max = cmp::max(val, self.max);
+        self.sum += val;
+    }
+}
+
+impl Default for Stats<Duration> {
+    fn default() -> Self {
+        Self {
+            min: Duration::MAX,
+            max: Duration::ZERO,
+            sum: Duration::ZERO,
+        }
+    }
+}
+
+impl Default for Stats<usize> {
+    fn default() -> Self {
+        Self {
+            min: usize::MAX,
+            max: usize::MIN,
+            sum: 0,
+        }
+    }
+}
+
+/// Provides access to [`InstrumentedObjectStore`] instances that record requests for reporting
+#[derive(Debug)]
+pub struct InstrumentedObjectStoreRegistry {
+    inner: Arc<dyn ObjectStoreRegistry>,
+    instrument_mode: AtomicU8,
+    stores: RwLock<Vec<Arc<InstrumentedObjectStore>>>,
+}
+
+impl Default for InstrumentedObjectStoreRegistry {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl InstrumentedObjectStoreRegistry {
+    /// Returns a new [`InstrumentedObjectStoreRegistry`] that wraps the provided
+    /// [`ObjectStoreRegistry`]
+    pub fn new() -> Self {
+        Self {
+            inner: Arc::new(DefaultObjectStoreRegistry::new()),
+            instrument_mode: AtomicU8::new(InstrumentedObjectStoreMode::default() as u8),
+            stores: RwLock::new(Vec::new()),
+        }
+    }
+
+    pub fn with_profile_mode(self, mode: InstrumentedObjectStoreMode) -> Self {
+        self.instrument_mode.store(mode as u8, Ordering::Relaxed);
+        self
+    }
+
+    /// Provides access to all of the [`InstrumentedObjectStore`]s managed by this
+    /// [`InstrumentedObjectStoreRegistry`]
+    pub fn stores(&self) -> Vec<Arc<InstrumentedObjectStore>> {
+        self.stores.read().clone()
+    }
+
+    /// Returns the current [`InstrumentedObjectStoreMode`] for this
+    /// [`InstrumentedObjectStoreRegistry`]
+    pub fn instrument_mode(&self) -> InstrumentedObjectStoreMode {
+        self.instrument_mode.load(Ordering::Relaxed).into()
+    }
+
+    /// Sets the [`InstrumentedObjectStoreMode`] for this [`InstrumentedObjectStoreRegistry`]
+    pub fn set_instrument_mode(&self, mode: InstrumentedObjectStoreMode) {
+        self.instrument_mode.store(mode as u8, Ordering::Relaxed);
+        for s in self.stores.read().iter() {
+            s.set_instrument_mode(mode)
+        }
+    }
+}
+
+impl ObjectStoreRegistry for InstrumentedObjectStoreRegistry {
+    fn register_store(
+        &self,
+        url: &Url,
+        store: Arc<dyn ObjectStore>,
+    ) -> Option<Arc<dyn ObjectStore>> {
+        let mode = self.instrument_mode.load(Ordering::Relaxed);
+        let instrumented =
+            Arc::new(InstrumentedObjectStore::new(store, AtomicU8::new(mode)));
+        self.stores.write().push(Arc::clone(&instrumented));
+        self.inner.register_store(url, instrumented)
+    }
+
+    fn deregister_store(
+        &self,
+        url: &Url,
+    ) -> datafusion::common::Result<Arc<dyn ObjectStore>> {
+        self.inner.deregister_store(url)
+    }
+
+    fn get_store(&self, url: &Url) -> datafusion::common::Result<Arc<dyn ObjectStore>> {
+        self.inner.get_store(url)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use futures::StreamExt;
+    use object_store::WriteMultipart;
+
+    use super::*;
+    use insta::assert_snapshot;
+
+    #[test]
+    fn instrumented_mode() {
+        assert!(matches!(
+            InstrumentedObjectStoreMode::default(),
+            InstrumentedObjectStoreMode::Disabled
+        ));
+
+        assert!(matches!(
+            "dIsABleD".parse().unwrap(),
+            InstrumentedObjectStoreMode::Disabled
+        ));
+        assert!(matches!(
+            "SUmMaRy".parse().unwrap(),
+            InstrumentedObjectStoreMode::Summary
+        ));
+        assert!(matches!(
+            "TRaCe".parse().unwrap(),
+            InstrumentedObjectStoreMode::Trace
+        ));
+        assert!(
+            "does_not_exist"
+                .parse::<InstrumentedObjectStoreMode>()
+                .is_err()
+        );
+
+        assert!(matches!(0.into(), InstrumentedObjectStoreMode::Disabled));
+        assert!(matches!(1.into(), InstrumentedObjectStoreMode::Summary));
+        assert!(matches!(2.into(), InstrumentedObjectStoreMode::Trace));
+        assert!(matches!(3.into(), InstrumentedObjectStoreMode::Disabled));
+    }
+
+    #[test]
+    fn instrumented_registry() {
+        let mut reg = InstrumentedObjectStoreRegistry::new();
+        assert!(reg.stores().is_empty());
+        assert_eq!(
+            reg.instrument_mode(),
+            InstrumentedObjectStoreMode::default()
+        );
+
+        reg = reg.with_profile_mode(InstrumentedObjectStoreMode::Trace);
+        assert_eq!(reg.instrument_mode(), InstrumentedObjectStoreMode::Trace);
+
+        let store = object_store::memory::InMemory::new();
+        let url = "mem://test".parse().unwrap();
+        let registered = reg.register_store(&url, Arc::new(store));
+        assert!(registered.is_none());
+
+        let fetched = reg.get_store(&url);
+        assert!(fetched.is_ok());
+        assert_eq!(reg.stores().len(), 1);
+    }
+
+    // Returns an `InstrumentedObjectStore` with some data loaded for testing and the path to
+    // access the data
+    async fn setup_test_store() -> (InstrumentedObjectStore, Path) {
+        let store = Arc::new(object_store::memory::InMemory::new());
+        let mode = AtomicU8::new(InstrumentedObjectStoreMode::default() as u8);
+        let instrumented = InstrumentedObjectStore::new(store, mode);
+
+        // Load the test store with some data we can read
+        let path = Path::from("test/data");
+        let payload = PutPayload::from_static(b"test_data");
+        instrumented.put(&path, payload).await.unwrap();
+
+        (instrumented, path)
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_get() {
+        let (instrumented, path) = setup_test_store().await;
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.get(&path).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.get(&path).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let mut requests = instrumented.take_requests();
+        assert_eq!(requests.len(), 1);
+        assert!(instrumented.requests.lock().is_empty());
+
+        let request = requests.pop().unwrap();
+        assert_eq!(request.op, Operation::Get);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert_eq!(request.size, Some(9));
+        assert_eq!(request.range, None);
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_delete() {
+        let (instrumented, path) = setup_test_store().await;
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.delete(&path).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        // We need a new store so we have data to delete again
+        let (instrumented, path) = setup_test_store().await;
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.delete(&path).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let mut requests = instrumented.take_requests();
+        assert_eq!(requests.len(), 1);
+        assert!(instrumented.requests.lock().is_empty());
+
+        let request = requests.pop().unwrap();
+        assert_eq!(request.op, Operation::Delete);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_list() {
+        let (instrumented, path) = setup_test_store().await;
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.list(Some(&path));
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        let mut stream = instrumented.list(Some(&path));
+        // Sleep between stream creation and first poll to verify the timer
+        // starts on first poll, not at stream creation.
+        let delay = Duration::from_millis(50);
+        tokio::time::sleep(delay).await;
+        let _ = stream.next().await;
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let request = instrumented.take_requests().pop().unwrap();
+        assert_eq!(request.op, Operation::List);
+        assert_eq!(request.path, path);
+        let duration = request
+            .duration()
+            .expect("duration should be set after consuming stream");
+        assert!(
+            duration < delay,
+            "duration {duration:?} should exclude the {delay:?} sleep before first poll"
+        );
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn time_to_first_item_stream_captures_inner_latency() {
+        let inner_delay = Duration::from_millis(50);
+        let inner_stream = futures::stream::once(async move {
+            tokio::time::sleep(inner_delay).await;
+            Ok(ObjectMeta {
+                location: Path::from("test"),
+                last_modified: Utc::now(),
+                size: 0,
+                e_tag: None,
+                version: None,
+            })
+        })
+        .boxed();
+
+        let duration_nanos = Arc::new(AtomicU64::new(0));
+        let mut stream = Box::pin(TimeToFirstItemStream::new(
+            inner_stream,
+            Arc::clone(&duration_nanos),
+        ));
+        let _ = stream.next().await;
+
+        let recorded = Duration::from_nanos(duration_nanos.load(Ordering::Acquire));
+        assert!(
+            recorded >= inner_delay,
+            "recorded duration {recorded:?} should be >= inner stream delay {inner_delay:?}"
+        );
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_list_with_delimiter() {
+        let (instrumented, path) = setup_test_store().await;
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.list_with_delimiter(Some(&path)).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.list_with_delimiter(Some(&path)).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let request = instrumented.take_requests().pop().unwrap();
+        assert_eq!(request.op, Operation::List);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_put_opts() {
+        // The `setup_test_store()` method comes with data already `put` into it, so we'll setup
+        // manually for this test
+        let store = Arc::new(object_store::memory::InMemory::new());
+        let mode = AtomicU8::new(InstrumentedObjectStoreMode::default() as u8);
+        let instrumented = InstrumentedObjectStore::new(store, mode);
+
+        let path = Path::from("test/data");
+        let payload = PutPayload::from_static(b"test_data");
+        let size = payload.content_length();
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.put(&path, payload.clone()).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.put(&path, payload).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let request = instrumented.take_requests().pop().unwrap();
+        assert_eq!(request.op, Operation::Put);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert_eq!(request.size.unwrap(), size);
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_put_multipart() {
+        // The `setup_test_store()` method comes with data already `put` into it, so we'll setup
+        // manually for this test
+        let store = Arc::new(object_store::memory::InMemory::new());
+        let mode = AtomicU8::new(InstrumentedObjectStoreMode::default() as u8);
+        let instrumented = InstrumentedObjectStore::new(store, mode);
+
+        let path = Path::from("test/data");
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        let mp = instrumented.put_multipart(&path).await.unwrap();
+        let mut write = WriteMultipart::new(mp);
+        write.write(b"test_data");
+        write.finish().await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        let mp = instrumented.put_multipart(&path).await.unwrap();
+        let mut write = WriteMultipart::new(mp);
+        write.write(b"test_data");
+        write.finish().await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let request = instrumented.take_requests().pop().unwrap();
+        assert_eq!(request.op, Operation::Put);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_copy() {
+        let (instrumented, path) = setup_test_store().await;
+        let copy_to = Path::from("test/copied");
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.copy(&path, &copy_to).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented.copy(&path, &copy_to).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let mut requests = instrumented.take_requests();
+        assert_eq!(requests.len(), 1);
+        assert!(instrumented.requests.lock().is_empty());
+
+        let request = requests.pop().unwrap();
+        assert_eq!(request.op, Operation::Copy);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert_eq!(
+            request.extra_display.unwrap(),
+            format!("copy_to: {copy_to}")
+        );
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_copy_if_not_exists() {
+        let (instrumented, path) = setup_test_store().await;
+        let mut copy_to = Path::from("test/copied");
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented
+            .copy_if_not_exists(&path, &copy_to)
+            .await
+            .unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        // Use a new destination since the previous one already exists
+        copy_to = Path::from("test/copied_again");
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        instrumented
+            .copy_if_not_exists(&path, &copy_to)
+            .await
+            .unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let mut requests = instrumented.take_requests();
+        assert_eq!(requests.len(), 1);
+        assert!(instrumented.requests.lock().is_empty());
+
+        let request = requests.pop().unwrap();
+        assert_eq!(request.op, Operation::Copy);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert_eq!(
+            request.extra_display.unwrap(),
+            format!("copy_to: {copy_to}")
+        );
+    }
+
+    #[tokio::test]
+    async fn instrumented_store_head() {
+        let (instrumented, path) = setup_test_store().await;
+
+        // By default no requests should be instrumented/stored
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.head(&path).await.unwrap();
+        assert!(instrumented.requests.lock().is_empty());
+
+        instrumented.set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        assert!(instrumented.requests.lock().is_empty());
+        let _ = instrumented.head(&path).await.unwrap();
+        assert_eq!(instrumented.requests.lock().len(), 1);
+
+        let mut requests = instrumented.take_requests();
+        assert_eq!(requests.len(), 1);
+        assert!(instrumented.requests.lock().is_empty());
+
+        let request = requests.pop().unwrap();
+        assert_eq!(request.op, Operation::Head);
+        assert_eq!(request.path, path);
+        assert!(request.duration().is_some());
+        assert!(request.size.is_none());
+        assert!(request.range.is_none());
+        assert!(request.extra_display.is_none());
+    }
+
+    #[test]
+    fn request_details() {
+        let rd = RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test"),
+            timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::new(5, 0).as_nanos() as u64
+            )),
+            size: Some(10),
+            range: Some((..10).into()),
+            extra_display: Some(String::from("extra info")),
+        };
+
+        assert_eq!(
+            format!("{rd}"),
+            "1970-01-01T00:00:00+00:00 operation=Get duration=5.000000s size=10 range: bytes=0-9 path=test extra info"
+        );
+    }
+
+    #[test]
+    fn request_summary() {
+        // Test empty request list
+        let mut requests = Vec::new();
+        assert_snapshot!(RequestSummaries::new(&requests), @r"
+        +-----------+--------+-----+-----+-----+-----+-------+
+        | Operation | Metric | min | max | avg | sum | count |
+        +-----------+--------+-----+-----+-----+-----+-------+
+        +-----------+--------+-----+-----+-----+-----+-------+
+        ");
+
+        requests.push(RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test1"),
+            timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::from_secs(5).as_nanos() as u64
+            )),
+            size: Some(100),
+            range: None,
+            extra_display: None,
+        });
+
+        assert_snapshot!(RequestSummaries::new(&requests), @r"
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        | Operation | Metric   | min       | max       | avg       | sum       | count |
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        | Get       | duration | 5.000000s | 5.000000s | 5.000000s | 5.000000s | 1     |
+        | Get       | size     | 100 B     | 100 B     | 100 B     | 100 B     | 1     |
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        ");
+
+        // Add more Get requests to test aggregation
+        requests.push(RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test2"),
+            timestamp: chrono::DateTime::from_timestamp(1, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::from_secs(8).as_nanos() as u64
+            )),
+            size: Some(150),
+            range: None,
+            extra_display: None,
+        });
+        requests.push(RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test3"),
+            timestamp: chrono::DateTime::from_timestamp(2, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::from_secs(2).as_nanos() as u64
+            )),
+            size: Some(50),
+            range: None,
+            extra_display: None,
+        });
+        assert_snapshot!(RequestSummaries::new(&requests), @r"
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        | Operation | Metric   | min       | max       | avg       | sum        | count |
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        | Get       | duration | 2.000000s | 8.000000s | 5.000000s | 15.000000s | 3     |
+        | Get       | size     | 50 B      | 150 B     | 100 B     | 300 B      | 3     |
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        ");
+
+        // Add Put requests to test grouping
+        requests.push(RequestDetails {
+            op: Operation::Put,
+            path: Path::from("test4"),
+            timestamp: chrono::DateTime::from_timestamp(3, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::from_millis(200).as_nanos() as u64,
+            )),
+            size: Some(75),
+            range: None,
+            extra_display: None,
+        });
+
+        assert_snapshot!(RequestSummaries::new(&requests), @r"
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        | Operation | Metric   | min       | max       | avg       | sum        | count |
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        | Get       | duration | 2.000000s | 8.000000s | 5.000000s | 15.000000s | 3     |
+        | Get       | size     | 50 B      | 150 B     | 100 B     | 300 B      | 3     |
+        | Put       | duration | 0.200000s | 0.200000s | 0.200000s | 0.200000s  | 1     |
+        | Put       | size     | 75 B      | 75 B      | 75 B      | 75 B       | 1     |
+        +-----------+----------+-----------+-----------+-----------+------------+-------+
+        ");
+    }
+
+    #[test]
+    fn request_summary_only_duration() {
+        // Test request with only duration (no size)
+        let only_duration = vec![RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test1"),
+            timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(
+                Duration::from_secs(3).as_nanos() as u64
+            )),
+            size: None,
+            range: None,
+            extra_display: None,
+        }];
+        assert_snapshot!(RequestSummaries::new(&only_duration), @r"
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        | Operation | Metric   | min       | max       | avg       | sum       | count |
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        | Get       | duration | 3.000000s | 3.000000s | 3.000000s | 3.000000s | 1     |
+        | Get       | size     |           |           |           |           | 1     |
+        +-----------+----------+-----------+-----------+-----------+-----------+-------+
+        ");
+    }
+
+    #[test]
+    fn request_summary_only_size() {
+        // Test request with only size (no duration)
+        let only_size = vec![RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test1"),
+            timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(0)),
+            size: Some(200),
+            range: None,
+            extra_display: None,
+        }];
+        assert_snapshot!(RequestSummaries::new(&only_size), @r"
+        +-----------+----------+-------+-------+-------+-------+-------+
+        | Operation | Metric   | min   | max   | avg   | sum   | count |
+        +-----------+----------+-------+-------+-------+-------+-------+
+        | Get       | duration |       |       |       |       | 1     |
+        | Get       | size     | 200 B | 200 B | 200 B | 200 B | 1     |
+        +-----------+----------+-------+-------+-------+-------+-------+
+        ");
+    }
+
+    #[test]
+    fn request_summary_neither_duration_or_size() {
+        // Test request with neither duration nor size
+        let no_stats = vec![RequestDetails {
+            op: Operation::Get,
+            path: Path::from("test1"),
+            timestamp: chrono::DateTime::from_timestamp(0, 0).unwrap(),
+            duration_nanos: Arc::new(AtomicU64::new(0)),
+            size: None,
+            range: None,
+            extra_display: None,
+        }];
+        assert_snapshot!(RequestSummaries::new(&no_stats), @r"
+        +-----------+----------+-----+-----+-----+-----+-------+
+        | Operation | Metric   | min | max | avg | sum | count |
+        +-----------+----------+-----+-----+-----+-----+-------+
+        | Get       | duration |     |     |     |     | 1     |
+        | Get       | size     |     |     |     |     | 1     |
+        +-----------+----------+-----+-----+-----+-----+-------+
+        ");
+    }
+}
diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs
index 1d6a8396aee74..6a6a0370b08ac 100644
--- a/datafusion-cli/src/print_format.rs
+++ b/datafusion-cli/src/print_format.rs
@@ -221,6 +221,7 @@ mod tests {
 
     use arrow::array::Int32Array;
     use arrow::datatypes::{DataType, Field, Schema};
+    use insta::{allow_duplicates, assert_snapshot};
 
     #[test]
     fn print_empty() {
@@ -232,249 +233,202 @@ mod tests {
             PrintFormat::Automatic,
         ] {
             // no output for empty batches, even with header set
-            PrintBatchesTest::new()
+            let output = PrintBatchesTest::new()
                 .with_format(format)
                 .with_schema(three_column_schema())
                 .with_batches(vec![])
-                .with_expected(&[""])
                 .run();
+            assert_eq!(output, "")
         }
 
         // output column headers for empty batches when format is Table
-        #[rustfmt::skip]
-        let expected = &[
-            "+---+---+---+",
-            "| a | b | c |",
-            "+---+---+---+",
-            "+---+---+---+",
-        ];
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_schema(three_column_schema())
             .with_batches(vec![])
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+---+---+
+        | a | b | c |
+        +---+---+---+
+        +---+---+---+
+        ");
     }
 
     #[test]
     fn print_csv_no_header() {
-        #[rustfmt::skip]
-        let expected = &[
-            "1,4,7",
-            "2,5,8",
-            "3,6,9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Csv)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::No)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        1,4,7
+        2,5,8
+        3,6,9
+        ");
     }
 
     #[test]
     fn print_csv_with_header() {
-        #[rustfmt::skip]
-        let expected = &[
-            "a,b,c",
-            "1,4,7",
-            "2,5,8",
-            "3,6,9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Csv)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Yes)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        a,b,c
+        1,4,7
+        2,5,8
+        3,6,9
+        ");
     }
 
     #[test]
     fn print_tsv_no_header() {
-        #[rustfmt::skip]
-        let expected = &[
-            "1\t4\t7",
-            "2\t5\t8",
-            "3\t6\t9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Tsv)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::No)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        1	4	7
+        2	5	8
+        3	6	9
+        ")
     }
 
     #[test]
     fn print_tsv_with_header() {
-        #[rustfmt::skip]
-        let expected = &[
-            "a\tb\tc",
-            "1\t4\t7",
-            "2\t5\t8",
-            "3\t6\t9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Tsv)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Yes)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        a	b	c
+        1	4	7
+        2	5	8
+        3	6	9
+        ");
     }
 
     #[test]
     fn print_table() {
-        let expected = &[
-            "+---+---+---+",
-            "| a | b | c |",
-            "+---+---+---+",
-            "| 1 | 4 | 7 |",
-            "| 2 | 5 | 8 |",
-            "| 3 | 6 | 9 |",
-            "+---+---+---+",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Ignored)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+---+---+
+        | a | b | c |
+        +---+---+---+
+        | 1 | 4 | 7 |
+        | 2 | 5 | 8 |
+        | 3 | 6 | 9 |
+        +---+---+---+
+        ");
     }
     #[test]
     fn print_json() {
-        let expected =
-            &[r#"[{"a":1,"b":4,"c":7},{"a":2,"b":5,"c":8},{"a":3,"b":6,"c":9}]"#];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Json)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Ignored)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r#"[{"a":1,"b":4,"c":7},{"a":2,"b":5,"c":8},{"a":3,"b":6,"c":9}]"#);
     }
 
     #[test]
     fn print_ndjson() {
-        let expected = &[
-            r#"{"a":1,"b":4,"c":7}"#,
-            r#"{"a":2,"b":5,"c":8}"#,
-            r#"{"a":3,"b":6,"c":9}"#,
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::NdJson)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Ignored)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r#"
+        {"a":1,"b":4,"c":7}
+        {"a":2,"b":5,"c":8}
+        {"a":3,"b":6,"c":9}
+        "#);
     }
 
     #[test]
     fn print_automatic_no_header() {
-        #[rustfmt::skip]
-            let expected = &[
-            "1,4,7",
-            "2,5,8",
-            "3,6,9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Automatic)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::No)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        1,4,7
+        2,5,8
+        3,6,9
+        ");
     }
     #[test]
     fn print_automatic_with_header() {
-        #[rustfmt::skip]
-            let expected = &[
-            "a,b,c",
-            "1,4,7",
-            "2,5,8",
-            "3,6,9",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Automatic)
-            .with_batches(split_batch(three_column_batch()))
+            .with_batches(split_batch(&three_column_batch()))
             .with_header(WithHeader::Yes)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        a,b,c
+        1,4,7
+        2,5,8
+        3,6,9
+        ");
     }
 
     #[test]
     fn print_maxrows_unlimited() {
-        #[rustfmt::skip]
-            let expected = &[
-            "+---+",
-            "| a |",
-            "+---+",
-            "| 1 |",
-            "| 2 |",
-            "| 3 |",
-            "+---+",
-        ];
-
         // should print out entire output with no truncation if unlimited or
         // limit greater than number of batches or equal to the number of batches
         for max_rows in [MaxRows::Unlimited, MaxRows::Limited(5), MaxRows::Limited(3)] {
-            PrintBatchesTest::new()
+            let output = PrintBatchesTest::new()
                 .with_format(PrintFormat::Table)
                 .with_schema(one_column_schema())
                 .with_batches(vec![one_column_batch()])
                 .with_maxrows(max_rows)
-                .with_expected(expected)
                 .run();
+            allow_duplicates! {
+                assert_snapshot!(output, @r"
+                +---+
+                | a |
+                +---+
+                | 1 |
+                | 2 |
+                | 3 |
+                +---+
+                ");
+            }
         }
     }
 
     #[test]
     fn print_maxrows_limited_one_batch() {
-        #[rustfmt::skip]
-            let expected = &[
-            "+---+",
-            "| a |",
-            "+---+",
-            "| 1 |",
-            "| . |",
-            "| . |",
-            "| . |",
-            "+---+",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_batches(vec![one_column_batch()])
             .with_maxrows(MaxRows::Limited(1))
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+
+        | a |
+        +---+
+        | 1 |
+        | . |
+        | . |
+        | . |
+        +---+
+        ");
     }
 
     #[test]
     fn print_maxrows_limited_multi_batched() {
-        #[rustfmt::skip]
-            let expected = &[
-            "+---+",
-            "| a |",
-            "+---+",
-            "| 1 |",
-            "| 2 |",
-            "| 3 |",
-            "| 1 |",
-            "| 2 |",
-            "| . |",
-            "| . |",
-            "| . |",
-            "+---+",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_batches(vec![
                 one_column_batch(),
@@ -482,8 +436,21 @@ mod tests {
                 one_column_batch(),
             ])
             .with_maxrows(MaxRows::Limited(5))
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+
+        | a |
+        +---+
+        | 1 |
+        | 2 |
+        | 3 |
+        | 1 |
+        | 2 |
+        | . |
+        | . |
+        | . |
+        +---+
+        ");
     }
 
     #[test]
@@ -491,22 +458,19 @@ mod tests {
         let batch = one_column_batch();
         let empty_batch = RecordBatch::new_empty(batch.schema());
 
-        #[rustfmt::skip]
-        let expected =&[
-            "+---+",
-            "| a |",
-            "+---+",
-            "| 1 |",
-            "| 2 |",
-            "| 3 |",
-            "+---+",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_batches(vec![empty_batch.clone(), batch, empty_batch])
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+
+        | a |
+        +---+
+        | 1 |
+        | 2 |
+        | 3 |
+        +---+
+        ");
     }
 
     #[test]
@@ -514,32 +478,28 @@ mod tests {
         let empty_batch = RecordBatch::new_empty(one_column_batch().schema());
 
         // Print column headers for empty batch when format is Table
-        #[rustfmt::skip]
-        let expected =&[
-            "+---+",
-            "| a |",
-            "+---+",
-            "+---+",
-        ];
-
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_schema(one_column_schema())
             .with_batches(vec![empty_batch])
             .with_header(WithHeader::Yes)
-            .with_expected(expected)
             .run();
+        assert_snapshot!(output, @r"
+        +---+
+        | a |
+        +---+
+        +---+
+        ");
 
         // No output for empty batch when schema contains no columns
         let empty_batch = RecordBatch::new_empty(Arc::new(Schema::empty()));
-        let expected = &[""];
-        PrintBatchesTest::new()
+        let output = PrintBatchesTest::new()
             .with_format(PrintFormat::Table)
             .with_schema(Arc::new(Schema::empty()))
             .with_batches(vec![empty_batch])
             .with_header(WithHeader::Yes)
-            .with_expected(expected)
             .run();
+        assert_eq!(output, "")
     }
 
     #[derive(Debug)]
@@ -549,7 +509,6 @@ mod tests {
         batches: Vec<RecordBatch>,
         maxrows: MaxRows,
         with_header: WithHeader,
-        expected: Vec<&'static str>,
     }
 
     /// How to test with_header
@@ -569,7 +528,6 @@ mod tests {
                 batches: vec![],
                 maxrows: MaxRows::Unlimited,
                 with_header: WithHeader::Ignored,
-                expected: vec![],
             }
         }
 
@@ -603,25 +561,9 @@ mod tests {
             self
         }
 
-        /// set expected output
-        fn with_expected(mut self, expected: &[&'static str]) -> Self {
-            self.expected = expected.to_vec();
-            self
-        }
-
         /// run the test
-        fn run(self) {
-            let actual = self.output();
-            let actual: Vec<_> = actual.trim_end().split('\n').collect();
-            let expected = self.expected;
-            assert_eq!(
-                actual, expected,
-                "\n\nactual:\n{actual:#?}\n\nexpected:\n{expected:#?}"
-            );
-        }
-
         /// formats batches using parameters and returns the resulting output
-        fn output(&self) -> String {
+        fn run(self) -> String {
             match self.with_header {
                 WithHeader::Yes => self.output_with_header(true),
                 WithHeader::No => self.output_with_header(false),
@@ -691,7 +633,7 @@ mod tests {
     }
 
     /// Slice the record batch into 2 batches
-    fn split_batch(batch: RecordBatch) -> Vec<RecordBatch> {
+    fn split_batch(batch: &RecordBatch) -> Vec<RecordBatch> {
         assert!(batch.num_rows() > 1);
         let split = batch.num_rows() / 2;
         vec![
diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs
index 56d787b0fe087..d0810cb034df1 100644
--- a/datafusion-cli/src/print_options.rs
+++ b/datafusion-cli/src/print_options.rs
@@ -16,16 +16,20 @@
 // under the License.
 
 use std::fmt::{Display, Formatter};
-use std::io::Write;
+use std::io;
 use std::pin::Pin;
 use std::str::FromStr;
+use std::sync::Arc;
 
+use crate::object_storage::instrumented::{
+    InstrumentedObjectStoreMode, InstrumentedObjectStoreRegistry, RequestSummaries,
+};
 use crate::print_format::PrintFormat;
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion::common::instant::Instant;
 use datafusion::common::DataFusionError;
+use datafusion::common::instant::Instant;
 use datafusion::error::Result;
 use datafusion::physical_plan::RecordBatchStream;
 
@@ -51,8 +55,10 @@ impl FromStr for MaxRows {
             Ok(Self::Unlimited)
         } else {
             match maxrows.parse::<usize>() {
-                Ok(nrows)  => Ok(Self::Limited(nrows)),
-                _ => Err(format!("Invalid maxrows {maxrows}. Valid inputs are natural numbers or \'none\', \'inf\', or \'infinite\' for no limit.")),
+                Ok(nrows) => Ok(Self::Limited(nrows)),
+                _ => Err(format!(
+                    "Invalid maxrows {maxrows}. Valid inputs are natural numbers or \'none\', \'inf\', or \'infinite\' for no limit."
+                )),
             }
         }
     }
@@ -67,12 +73,15 @@ impl Display for MaxRows {
     }
 }
 
+const OBJECT_STORE_PROFILING_HEADER: &str = "Object Store Profiling";
+
 #[derive(Debug, Clone)]
 pub struct PrintOptions {
     pub format: PrintFormat,
     pub quiet: bool,
     pub maxrows: MaxRows,
     pub color: bool,
+    pub instrumented_registry: Arc<InstrumentedObjectStoreRegistry>,
 }
 
 // Returns the query execution details formatted
@@ -106,7 +115,7 @@ impl PrintOptions {
         row_count: usize,
         format_options: &FormatOptions,
     ) -> Result<()> {
-        let stdout = std::io::stdout();
+        let stdout = io::stdout();
         let mut writer = stdout.lock();
 
         self.format.print_batches(
@@ -128,11 +137,7 @@ impl PrintOptions {
             query_start_time,
         );
 
-        if !self.quiet {
-            writeln!(writer, "{formatted_exec_details}")?;
-        }
-
-        Ok(())
+        self.write_output(&mut writer, &formatted_exec_details)
     }
 
     /// Print the stream to stdout using the specified format
@@ -148,7 +153,7 @@ impl PrintOptions {
             ));
         };
 
-        let stdout = std::io::stdout();
+        let stdout = io::stdout();
         let mut writer = stdout.lock();
 
         let mut row_count = 0_usize;
@@ -174,10 +179,88 @@ impl PrintOptions {
             query_start_time,
         );
 
+        self.write_output(&mut writer, &formatted_exec_details)
+    }
+
+    fn write_output<W: io::Write>(
+        &self,
+        writer: &mut W,
+        formatted_exec_details: &str,
+    ) -> Result<()> {
         if !self.quiet {
             writeln!(writer, "{formatted_exec_details}")?;
+
+            let instrument_mode = self.instrumented_registry.instrument_mode();
+            if instrument_mode != InstrumentedObjectStoreMode::Disabled {
+                writeln!(writer, "{OBJECT_STORE_PROFILING_HEADER}")?;
+                for store in self.instrumented_registry.stores() {
+                    let requests = store.take_requests();
+
+                    if !requests.is_empty() {
+                        writeln!(writer, "{store}")?;
+                        if instrument_mode == InstrumentedObjectStoreMode::Trace {
+                            for req in requests.iter() {
+                                writeln!(writer, "{req}")?;
+                            }
+                            // Add an extra blank line to help visually organize the output
+                            writeln!(writer)?;
+                        }
+
+                        writeln!(writer, "Summaries:")?;
+                        let summaries = RequestSummaries::new(&requests);
+                        writeln!(writer, "{summaries}")?;
+                    }
+                }
+            }
         }
 
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use datafusion::error::Result;
+
+    use super::*;
+
+    #[test]
+    fn write_output() -> Result<()> {
+        let instrumented_registry = Arc::new(InstrumentedObjectStoreRegistry::new());
+        let mut print_options = PrintOptions {
+            format: PrintFormat::Automatic,
+            quiet: true,
+            maxrows: MaxRows::Unlimited,
+            color: true,
+            instrumented_registry: Arc::clone(&instrumented_registry),
+        };
+
+        let mut print_output: Vec<u8> = Vec::new();
+        let exec_out = String::from("Formatted Exec Output");
+        print_options.write_output(&mut print_output, &exec_out)?;
+        assert!(print_output.is_empty());
+
+        print_options.quiet = false;
+        print_options.write_output(&mut print_output, &exec_out)?;
+        let out_str: String = print_output
+            .clone()
+            .try_into()
+            .expect("Expected successful String conversion");
+        assert!(out_str.contains(&exec_out));
+
+        // clear the previous data from the output so it doesn't pollute the next test
+        print_output.clear();
+        print_options
+            .instrumented_registry
+            .set_instrument_mode(InstrumentedObjectStoreMode::Trace);
+        print_options.write_output(&mut print_output, &exec_out)?;
+        let out_str: String = print_output
+            .clone()
+            .try_into()
+            .expect("Expected successful String conversion");
+        assert!(out_str.contains(&exec_out));
+        assert!(out_str.contains(OBJECT_STORE_PROFILING_HEADER));
+
+        Ok(())
+    }
+}
diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs
index fb2f08157f674..7bc45693a8b0d 100644
--- a/datafusion-cli/tests/cli_integration.rs
+++ b/datafusion-cli/tests/cli_integration.rs
@@ -19,9 +19,18 @@ use std::process::Command;
 
 use rstest::rstest;
 
-use insta::{glob, Settings};
+use async_trait::async_trait;
+use insta::internals::SettingsBindDropGuard;
+use insta::{Settings, glob};
 use insta_cmd::{assert_cmd_snapshot, get_cargo_bin};
+use std::path::PathBuf;
 use std::{env, fs};
+use testcontainers_modules::minio;
+use testcontainers_modules::testcontainers::core::{CmdWaitFor, ExecCommand, Mount};
+use testcontainers_modules::testcontainers::runners::AsyncRunner;
+use testcontainers_modules::testcontainers::{
+    ContainerAsync, ImageExt, TestcontainersError,
+};
 
 fn cli() -> Command {
     Command::new(get_cargo_bin("datafusion-cli"))
@@ -32,9 +41,85 @@ fn make_settings() -> Settings {
     settings.set_prepend_module_to_snapshot(false);
     settings.add_filter(r"Elapsed .* seconds\.", "[ELAPSED]");
     settings.add_filter(r"DataFusion CLI v.*", "[CLI_VERSION]");
+    settings.add_filter(r"(?s)backtrace:.*?\n\n\n", "");
     settings
 }
 
+async fn setup_minio_container() -> Result<ContainerAsync<minio::MinIO>, String> {
+    const MINIO_ROOT_USER: &str = "TEST-DataFusionLogin";
+    const MINIO_ROOT_PASSWORD: &str = "TEST-DataFusionPassword";
+
+    let data_path =
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../datafusion/core/tests/data");
+
+    let absolute_data_path = data_path
+        .canonicalize()
+        .expect("Failed to get absolute path for test data");
+
+    let container = minio::MinIO::default()
+        .with_env_var("MINIO_ROOT_USER", MINIO_ROOT_USER)
+        .with_env_var("MINIO_ROOT_PASSWORD", MINIO_ROOT_PASSWORD)
+        .with_mount(Mount::bind_mount(
+            absolute_data_path.to_str().unwrap(),
+            "/source",
+        ))
+        .start()
+        .await;
+
+    match container {
+        Ok(container) => {
+            // We wait for MinIO to be healthy and prepare test files. We do it via CLI to avoid s3 dependency
+            let commands = [
+                ExecCommand::new(["/usr/bin/mc", "ready", "local"]),
+                ExecCommand::new([
+                    "/usr/bin/mc",
+                    "alias",
+                    "set",
+                    "localminio",
+                    "http://localhost:9000",
+                    MINIO_ROOT_USER,
+                    MINIO_ROOT_PASSWORD,
+                ]),
+                ExecCommand::new(["/usr/bin/mc", "mb", "localminio/data"]),
+                ExecCommand::new([
+                    "/usr/bin/mc",
+                    "cp",
+                    "-r",
+                    "/source/",
+                    "localminio/data/",
+                ]),
+            ];
+
+            for command in commands {
+                let command =
+                    command.with_cmd_ready_condition(CmdWaitFor::Exit { code: Some(0) });
+
+                let cmd_ref = format!("{command:?}");
+
+                if let Err(e) = container.exec(command).await {
+                    let stdout = container.stdout_to_vec().await.unwrap_or_default();
+                    let stderr = container.stderr_to_vec().await.unwrap_or_default();
+
+                    return Err(format!(
+                        "Failed to execute command: {}\nError: {}\nStdout: {:?}\nStderr: {:?}",
+                        cmd_ref,
+                        e,
+                        String::from_utf8_lossy(&stdout),
+                        String::from_utf8_lossy(&stderr)
+                    ));
+                }
+            }
+
+            Ok(container)
+        }
+
+        Err(TestcontainersError::Client(e)) => Err(format!(
+            "Failed to start MinIO container. Ensure Docker is running and accessible: {e}"
+        )),
+        Err(e) => Err(format!("Failed to start MinIO container: {e}")),
+    }
+}
+
 #[cfg(test)]
 #[ctor::ctor]
 fn init() {
@@ -131,13 +216,49 @@ fn test_cli_top_memory_consumers<'a>(
     #[case] snapshot_name: &str,
     #[case] top_memory_consumers: impl IntoIterator<Item = &'a str>,
 ) {
+    let _bound = bind_to_settings(snapshot_name);
+
+    let mut cmd = cli();
+    let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;";
+    cmd.args(["--memory-limit", "10M", "--command", sql]);
+    cmd.args(top_memory_consumers);
+
+    assert_cmd_snapshot!(cmd);
+}
+
+#[rstest]
+#[case("no_track", ["--top-memory-consumers", "0"])]
+#[case("top2", ["--top-memory-consumers", "2"])]
+#[test]
+fn test_cli_top_memory_consumers_with_mem_pool_type<'a>(
+    #[case] snapshot_name: &str,
+    #[case] top_memory_consumers: impl IntoIterator<Item = &'a str>,
+) {
+    let _bound = bind_to_settings(snapshot_name);
+
+    let mut cmd = cli();
+    let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;";
+    cmd.args([
+        "--memory-limit",
+        "10M",
+        "--mem-pool-type",
+        "fair",
+        "--command",
+        sql,
+    ]);
+    cmd.args(top_memory_consumers);
+
+    assert_cmd_snapshot!(cmd);
+}
+
+fn bind_to_settings(snapshot_name: &str) -> SettingsBindDropGuard {
     let mut settings = make_settings();
 
     settings.set_snapshot_suffix(snapshot_name);
 
     settings.add_filter(
-        r"[^\s]+\#\d+\(can spill: (true|false)\) consumed .*?B",
-        "Consumer(can spill: bool) consumed XB",
+        r"[^\s]+\#\d+\(can spill: (true|false)\) consumed .*?B, peak .*?B",
+        "Consumer(can spill: bool) consumed XB, peak XB",
     );
     settings.add_filter(
         r"Error: Failed to allocate additional .*? for .*? with .*? already allocated for this reservation - .*? remain available for the total pool",
@@ -148,12 +269,20 @@ fn test_cli_top_memory_consumers<'a>(
         "Resources exhausted: Failed to allocate",
     );
 
+    settings.bind_to_scope()
+}
+
+#[test]
+fn test_cli_with_unbounded_memory_pool() {
+    let mut settings = make_settings();
+
+    settings.set_snapshot_suffix("default");
+
     let _bound = settings.bind_to_scope();
 
     let mut cmd = cli();
     let sql = "select * from generate_series(1,500000) as t1(v1) order by v1;";
-    cmd.args(["--memory-limit", "10M", "--command", sql]);
-    cmd.args(top_memory_consumers);
+    cmd.args(["--maxrows", "10", "--command", sql]);
 
     assert_cmd_snapshot!(cmd);
 }
@@ -165,12 +294,31 @@ async fn test_cli() {
         return;
     }
 
+    let container = match setup_minio_container().await {
+        Ok(c) => c,
+        Err(e) if e.contains("toomanyrequests") => {
+            eprintln!("Skipping test: Docker pull rate limit reached: {e}");
+            return;
+        }
+        e @ Err(_) => e.unwrap(),
+    };
+
     let settings = make_settings();
     let _bound = settings.bind_to_scope();
 
+    let port = container.get_host_port_ipv4(9000).await.unwrap();
+
     glob!("sql/integration/*.sql", |path| {
         let input = fs::read_to_string(path).unwrap();
-        assert_cmd_snapshot!(cli().pass_stdin(input))
+        assert_cmd_snapshot!(
+            cli()
+                .env_clear()
+                .env("AWS_ACCESS_KEY_ID", "TEST-DataFusionLogin")
+                .env("AWS_SECRET_ACCESS_KEY", "TEST-DataFusionPassword")
+                .env("AWS_ENDPOINT", format!("http://localhost:{port}"))
+                .env("AWS_ALLOW_HTTP", "true")
+                .pass_stdin(input)
+        )
     });
 }
 
@@ -186,20 +334,24 @@ async fn test_aws_options() {
     let settings = make_settings();
     let _bound = settings.bind_to_scope();
 
-    let access_key_id =
-        env::var("AWS_ACCESS_KEY_ID").expect("AWS_ACCESS_KEY_ID is not set");
-    let secret_access_key =
-        env::var("AWS_SECRET_ACCESS_KEY").expect("AWS_SECRET_ACCESS_KEY is not set");
-    let endpoint_url = env::var("AWS_ENDPOINT").expect("AWS_ENDPOINT is not set");
+    let container = match setup_minio_container().await {
+        Ok(c) => c,
+        Err(e) if e.contains("toomanyrequests") => {
+            eprintln!("Skipping test: Docker pull rate limit reached: {e}");
+            return;
+        }
+        e @ Err(_) => e.unwrap(),
+    };
+    let port = container.get_host_port_ipv4(9000).await.unwrap();
 
     let input = format!(
         r#"CREATE EXTERNAL TABLE CARS
 STORED AS CSV
 LOCATION 's3://data/cars.csv'
 OPTIONS(
-    'aws.access_key_id' '{access_key_id}',
-    'aws.secret_access_key' '{secret_access_key}',
-    'aws.endpoint' '{endpoint_url}',
+    'aws.access_key_id' 'TEST-DataFusionLogin',
+    'aws.secret_access_key' 'TEST-DataFusionPassword',
+    'aws.endpoint' 'http://localhost:{port}',
     'aws.allow_http' 'true'
 );
 
@@ -209,3 +361,186 @@ SELECT * FROM CARS limit 1;
 
     assert_cmd_snapshot!(cli().env_clear().pass_stdin(input));
 }
+
+#[tokio::test]
+async fn test_aws_region_auto_resolution() {
+    if env::var("TEST_STORAGE_INTEGRATION").is_err() {
+        eprintln!("Skipping external storages integration tests");
+        return;
+    }
+
+    let mut settings = make_settings();
+    settings.add_filter(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z", "[TIME]");
+    let _bound = settings.bind_to_scope();
+
+    let bucket = "s3://clickhouse-public-datasets/hits_compatible/athena_partitioned/hits_1.parquet";
+    let region = "us-east-1";
+
+    let input = format!(
+        r#"CREATE EXTERNAL TABLE hits
+STORED AS PARQUET
+LOCATION '{bucket}'
+OPTIONS(
+    'aws.region' '{region}',
+    'aws.skip_signature' true
+);
+
+SELECT COUNT(*) FROM hits;
+"#
+    );
+
+    assert_cmd_snapshot!(
+        cli()
+            .env("RUST_LOG", "warn")
+            .env_remove("AWS_ENDPOINT")
+            .pass_stdin(input)
+    );
+}
+
+/// Ensure backtrace will be printed, if executing `datafusion-cli` with a query
+/// that triggers error.
+/// Example:
+///     RUST_BACKTRACE=1 cargo run --features backtrace -- -c 'select pow(1,'foo');'
+#[rstest]
+#[case("SELECT pow(1,'foo')")]
+#[case("SELECT CAST('not_a_number' AS INTEGER);")]
+#[cfg(feature = "backtrace")]
+fn test_backtrace_output(#[case] query: &str) {
+    let mut cmd = cli();
+    // Use a command that will cause an error and trigger backtrace
+    cmd.args(["--command", query, "-q"])
+        .env("RUST_BACKTRACE", "1"); // Enable backtrace
+
+    let output = cmd.output().expect("Failed to execute command");
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    let combined_output = format!("{}{}", stdout, stderr);
+
+    // Assert that the output includes literal 'backtrace'
+    assert!(
+        combined_output.to_lowercase().contains("backtrace"),
+        "Expected output to contain 'backtrace', but got stdout: '{}' stderr: '{}'",
+        stdout,
+        stderr
+    );
+}
+
+#[tokio::test]
+async fn test_s3_url_fallback() {
+    if env::var("TEST_STORAGE_INTEGRATION").is_err() {
+        eprintln!("Skipping external storages integration tests");
+        return;
+    }
+
+    let container = match setup_minio_container().await {
+        Ok(c) => c,
+        Err(e) if e.contains("toomanyrequests") => {
+            eprintln!("Skipping test: Docker pull rate limit reached: {e}");
+            return;
+        }
+        e @ Err(_) => e.unwrap(),
+    };
+
+    let mut settings = make_settings();
+    settings.set_snapshot_suffix("s3_url_fallback");
+    let _bound = settings.bind_to_scope();
+
+    // Create a table using a prefix path (without trailing slash)
+    // This should trigger the fallback logic where head() fails on the prefix
+    // and list() is used to discover the actual files
+    let input = r#"CREATE EXTERNAL TABLE partitioned_data
+STORED AS CSV
+LOCATION 's3://data/partitioned_csv'
+OPTIONS (
+    'format.has_header' 'false'
+);
+
+SELECT * FROM partitioned_data ORDER BY column_1, column_2 LIMIT 5;
+"#;
+
+    assert_cmd_snapshot!(cli().with_minio(&container).await.pass_stdin(input));
+}
+
+/// Validate object store profiling output
+#[tokio::test]
+async fn test_object_store_profiling() {
+    if env::var("TEST_STORAGE_INTEGRATION").is_err() {
+        eprintln!("Skipping external storages integration tests");
+        return;
+    }
+
+    let container = match setup_minio_container().await {
+        Ok(c) => c,
+        Err(e) if e.contains("toomanyrequests") => {
+            eprintln!("Skipping test: Docker pull rate limit reached: {e}");
+            return;
+        }
+        e @ Err(_) => e.unwrap(),
+    };
+    let mut settings = make_settings();
+
+    // as the object store profiling contains timestamps and durations, we must
+    // filter them out to have stable snapshots
+    //
+    // Example line to filter:
+    // 2025-10-11T12:02:59.722646+00:00 operation=Get duration=0.001495s size=1006 path=cars.csv
+    // Output:
+    // <TIMESTAMP> operation=Get duration=[DURATION] size=1006 path=cars.csv
+    settings.add_filter(
+        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?[+-]\d{2}:\d{2} operation=(Get|Put|Delete|List|Head) duration=\d+\.\d{6}s (size=\d+\s+)?path=(.*)",
+        "<TIMESTAMP> operation=$1 duration=[DURATION] ${2}path=$3",
+    );
+
+    // We also need to filter out the summary statistics (anything with an 's' at the end)
+    // Example line(s) to filter:
+    // | Get       | duration | 5.000000s | 5.000000s | 5.000000s |           | 1         |
+    settings.add_filter(
+        r"\| (Get|Put|Delete|List|Head)( +)\| duration \| .*? \| .*? \| .*? \| .*? \| (.*?) \|",
+        "| $1$2 | duration | ...NORMALIZED...| $3 |",
+    );
+
+    let _bound = settings.bind_to_scope();
+
+    let input = r#"
+    CREATE EXTERNAL TABLE CARS
+STORED AS CSV
+LOCATION 's3://data/cars.csv';
+
+-- Initial query should not show any profiling as the object store is not instrumented yet
+SELECT * from CARS LIMIT 1;
+\object_store_profiling trace
+-- Query again to see the full profiling output
+SELECT * from CARS LIMIT 1;
+\object_store_profiling summary
+-- Query again to see the summarized profiling output
+SELECT * from CARS LIMIT 1;
+\object_store_profiling disabled
+-- Final query should not show any profiling as we disabled it again
+SELECT * from CARS LIMIT 1;
+"#;
+
+    assert_cmd_snapshot!(cli().with_minio(&container).await.pass_stdin(input));
+}
+
+/// Extension trait to Add the minio connection information to a Command
+#[async_trait]
+trait MinioCommandExt {
+    async fn with_minio(&mut self, container: &ContainerAsync<minio::MinIO>)
+    -> &mut Self;
+}
+
+#[async_trait]
+impl MinioCommandExt for Command {
+    async fn with_minio(
+        &mut self,
+        container: &ContainerAsync<minio::MinIO>,
+    ) -> &mut Self {
+        let port = container.get_host_port_ipv4(9000).await.unwrap();
+
+        self.env_clear()
+            .env("AWS_ACCESS_KEY_ID", "TEST-DataFusionLogin")
+            .env("AWS_SECRET_ACCESS_KEY", "TEST-DataFusionPassword")
+            .env("AWS_ENDPOINT", format!("http://localhost:{port}"))
+            .env("AWS_ALLOW_HTTP", "true")
+    }
+}
diff --git a/datafusion-cli/tests/snapshots/aws_region_auto_resolution.snap b/datafusion-cli/tests/snapshots/aws_region_auto_resolution.snap
new file mode 100644
index 0000000000000..cd6d918b78d99
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/aws_region_auto_resolution.snap
@@ -0,0 +1,29 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args: []
+  env:
+    AWS_ENDPOINT: ""
+    RUST_LOG: warn
+  stdin: "CREATE EXTERNAL TABLE hits\nSTORED AS PARQUET\nLOCATION 's3://clickhouse-public-datasets/hits_compatible/athena_partitioned/hits_1.parquet'\nOPTIONS(\n    'aws.region' 'us-east-1',\n    'aws.skip_signature' true\n);\n\nSELECT COUNT(*) FROM hits;\n"
+---
+success: true
+exit_code: 0
+----- stdout -----
+[CLI_VERSION]
+0 row(s) fetched. 
+[ELAPSED]
+
++----------+
+| count(*) |
++----------+
+| 1000000  |
++----------+
+1 row(s) fetched. 
+[ELAPSED]
+
+\q
+
+----- stderr -----
+[[TIME] WARN  datafusion_cli::exec] S3 region is incorrect, auto-detecting the correct region (this may be slow). Consider updating your region configuration.
diff --git a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap
index 6b3a247dd7b82..1359cefbe71c7 100644
--- a/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap
+++ b/datafusion-cli/tests/snapshots/cli_explain_environment_overrides@explain_plan_environment_overrides.snap
@@ -7,7 +7,6 @@ info:
     - EXPLAIN SELECT 123
   env:
     DATAFUSION_EXPLAIN_FORMAT: pgjson
-snapshot_kind: text
 ---
 success: true
 exit_code: 0
diff --git a/datafusion-cli/tests/snapshots/cli_format@automatic.snap b/datafusion-cli/tests/snapshots/cli_format@automatic.snap
index 2591f493e90a8..76b14d9a3a924 100644
--- a/datafusion-cli/tests/snapshots/cli_format@automatic.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@automatic.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_format@csv.snap b/datafusion-cli/tests/snapshots/cli_format@csv.snap
index c41b042298eb0..2c969bd91d121 100644
--- a/datafusion-cli/tests/snapshots/cli_format@csv.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@csv.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_format@json.snap b/datafusion-cli/tests/snapshots/cli_format@json.snap
index 8f804a337cce5..22a9cc4657a91 100644
--- a/datafusion-cli/tests/snapshots/cli_format@json.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@json.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_format@nd-json.snap b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap
index 7b4ce1e2530cf..513bcb7372ca6 100644
--- a/datafusion-cli/tests/snapshots/cli_format@nd-json.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@nd-json.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_format@table.snap b/datafusion-cli/tests/snapshots/cli_format@table.snap
index 99914182462aa..8677847588385 100644
--- a/datafusion-cli/tests/snapshots/cli_format@table.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@table.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_format@tsv.snap b/datafusion-cli/tests/snapshots/cli_format@tsv.snap
index 968268c31dd55..c56e60fcab155 100644
--- a/datafusion-cli/tests/snapshots/cli_format@tsv.snap
+++ b/datafusion-cli/tests/snapshots/cli_format@tsv.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap
index c27d527df0b6a..9fd07fa6f4e1b 100644
--- a/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap
+++ b/datafusion-cli/tests/snapshots/cli_quick_test@batch_size.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@can_see_indent_format.snap b/datafusion-cli/tests/snapshots/cli_quick_test@can_see_indent_format.snap
index b2fb64709974e..8275041acaecc 100644
--- a/datafusion-cli/tests/snapshots/cli_quick_test@can_see_indent_format.snap
+++ b/datafusion-cli/tests/snapshots/cli_quick_test@can_see_indent_format.snap
@@ -5,7 +5,6 @@ info:
   args:
     - "--command"
     - EXPLAIN FORMAT indent SELECT 123
-snapshot_kind: text
 ---
 success: true
 exit_code: 0
@@ -15,7 +14,7 @@ exit_code: 0
 | plan_type     | plan                                     |
 +---------------+------------------------------------------+
 | logical_plan  | Projection: Int64(123)                   |
-|               |   EmptyRelation                          |
+|               |   EmptyRelation: rows=1                  |
 | physical_plan | ProjectionExec: expr=[123 as Int64(123)] |
 |               |   PlaceholderRowExec                     |
 |               |                                          |
diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap b/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap
index 46ee6be64f624..8620f6da84488 100644
--- a/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap
+++ b/datafusion-cli/tests/snapshots/cli_quick_test@default_explain_plan.snap
@@ -5,7 +5,6 @@ info:
   args:
     - "--command"
     - EXPLAIN SELECT 123
-snapshot_kind: text
 ---
 success: true
 exit_code: 0
diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@files.snap b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap
index 7c44e41729a17..df3a10b6bb54b 100644
--- a/datafusion-cli/tests/snapshots/cli_quick_test@files.snap
+++ b/datafusion-cli/tests/snapshots/cli_quick_test@files.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap
index 3b975bb6a927d..a394458768d1b 100644
--- a/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap
+++ b/datafusion-cli/tests/snapshots/cli_quick_test@statements.snap
@@ -1,5 +1,5 @@
 ---
-source: tests/cli_integration.rs
+source: datafusion-cli/tests/cli_integration.rs
 info:
   program: datafusion-cli
   args:
diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap
index 89b646a531f8b..fe454595eb4bc 100644
--- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap
+++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@no_track.snap
@@ -14,7 +14,7 @@ success: false
 exit_code: 1
 ----- stdout -----
 [CLI_VERSION]
-Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes
+Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
 caused by
 Resources exhausted: Failed to allocate
 
diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap
index ed925a6f64613..bb30e387166bc 100644
--- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap
+++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top2.snap
@@ -14,11 +14,11 @@ success: false
 exit_code: 1
 ----- stdout -----
 [CLI_VERSION]
-Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes
+Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
 caused by
-Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-  Consumer(can spill: bool) consumed XB,
-  Consumer(can spill: bool) consumed XB.
+Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as:
+  Consumer(can spill: bool) consumed XB, peak XB,
+  Consumer(can spill: bool) consumed XB, peak XB.
 Error: Failed to allocate 
 
 ----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap
index f35e3b117178f..891d72e3cc639 100644
--- a/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap
+++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers@top3_default.snap
@@ -12,12 +12,12 @@ success: false
 exit_code: 1
 ----- stdout -----
 [CLI_VERSION]
-Error: Not enough memory to continue external sort. Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes
+Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
 caused by
-Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-  Consumer(can spill: bool) consumed XB,
-  Consumer(can spill: bool) consumed XB,
-  Consumer(can spill: bool) consumed XB.
+Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as:
+  Consumer(can spill: bool) consumed XB, peak XB,
+  Consumer(can spill: bool) consumed XB, peak XB,
+  Consumer(can spill: bool) consumed XB, peak XB.
 Error: Failed to allocate 
 
 ----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap
new file mode 100644
index 0000000000000..25267ea1617e5
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@no_track.snap
@@ -0,0 +1,23 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args:
+    - "--memory-limit"
+    - 10M
+    - "--mem-pool-type"
+    - fair
+    - "--command"
+    - "select * from generate_series(1,500000) as t1(v1) order by v1;"
+    - "--top-memory-consumers"
+    - "0"
+---
+success: false
+exit_code: 1
+----- stdout -----
+[CLI_VERSION]
+Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
+caused by
+Resources exhausted: Failed to allocate
+
+----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap
new file mode 100644
index 0000000000000..6515050047107
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/cli_top_memory_consumers_with_mem_pool_type@top2.snap
@@ -0,0 +1,26 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args:
+    - "--memory-limit"
+    - 10M
+    - "--mem-pool-type"
+    - fair
+    - "--command"
+    - "select * from generate_series(1,500000) as t1(v1) order by v1;"
+    - "--top-memory-consumers"
+    - "2"
+---
+success: false
+exit_code: 1
+----- stdout -----
+[CLI_VERSION]
+Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
+caused by
+Resources exhausted: Additional allocation failed for ExternalSorter[0] with top memory consumers (across reservations) as:
+  Consumer(can spill: bool) consumed XB, peak XB,
+  Consumer(can spill: bool) consumed XB, peak XB.
+Error: Failed to allocate 
+
+----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap
new file mode 100644
index 0000000000000..7bdcd63dc7be6
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/cli_with_unbounded_memory_pool@default.snap
@@ -0,0 +1,36 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args:
+    - "--maxrows"
+    - "10"
+    - "--command"
+    - "select * from generate_series(1,500000) as t1(v1) order by v1;"
+---
+success: true
+exit_code: 0
+----- stdout -----
+[CLI_VERSION]
++----+
+| v1 |
++----+
+| 1  |
+| 2  |
+| 3  |
+| 4  |
+| 5  |
+| 6  |
+| 7  |
+| 8  |
+| 9  |
+| 10 |
+| .  |
+| .  |
+| .  |
++----+
+500000 row(s) fetched. (First 10 displayed. Use --maxrows to adjust)
+[ELAPSED]
+
+
+----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/object_store_profiling.snap b/datafusion-cli/tests/snapshots/object_store_profiling.snap
new file mode 100644
index 0000000000000..029b07c324f5d
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/object_store_profiling.snap
@@ -0,0 +1,83 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args: []
+  env:
+    AWS_ACCESS_KEY_ID: TEST-DataFusionLogin
+    AWS_ALLOW_HTTP: "true"
+    AWS_ENDPOINT: "http://localhost:55057"
+    AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword
+  stdin: "\n    CREATE EXTERNAL TABLE CARS\nSTORED AS CSV\nLOCATION 's3://data/cars.csv';\n\n-- Initial query should not show any profiling as the object store is not instrumented yet\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling trace\n-- Query again to see the full profiling output\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling summary\n-- Query again to see the summarized profiling output\nSELECT * from CARS LIMIT 1;\n\\object_store_profiling disabled\n-- Final query should not show any profiling as we disabled it again\nSELECT * from CARS LIMIT 1;\n"
+snapshot_kind: text
+---
+success: true
+exit_code: 0
+----- stdout -----
+[CLI_VERSION]
+0 row(s) fetched. 
+[ELAPSED]
+
++-----+-------+---------------------+
+| car | speed | time                |
++-----+-------+---------------------+
+| red | 20.0  | 1996-04-12T12:05:03 |
++-----+-------+---------------------+
+1 row(s) fetched. 
+[ELAPSED]
+
+ObjectStore Profile mode set to Trace
++-----+-------+---------------------+
+| car | speed | time                |
++-----+-------+---------------------+
+| red | 20.0  | 1996-04-12T12:05:03 |
++-----+-------+---------------------+
+1 row(s) fetched. 
+[ELAPSED]
+
+Object Store Profiling
+Instrumented Object Store: instrument_mode: Trace, inner: AmazonS3(data)
+<TIMESTAMP> operation=Head duration=[DURATION] path=cars.csv
+<TIMESTAMP> operation=Get duration=[DURATION] size=1006 path=cars.csv
+
+Summaries:
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Operation | Metric   | min       | max       | avg       | sum       | count |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Get        | duration | ...NORMALIZED...| 1     |
+| Get       | size     | 1006 B    | 1006 B    | 1006 B    | 1006 B    | 1     |
+| Head       | duration | ...NORMALIZED...| 1     |
+| Head      | size     |           |           |           |           | 1     |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+ObjectStore Profile mode set to Summary
++-----+-------+---------------------+
+| car | speed | time                |
++-----+-------+---------------------+
+| red | 20.0  | 1996-04-12T12:05:03 |
++-----+-------+---------------------+
+1 row(s) fetched. 
+[ELAPSED]
+
+Object Store Profiling
+Instrumented Object Store: instrument_mode: Summary, inner: AmazonS3(data)
+Summaries:
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Operation | Metric   | min       | max       | avg       | sum       | count |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Get        | duration | ...NORMALIZED...| 1     |
+| Get       | size     | 1006 B    | 1006 B    | 1006 B    | 1006 B    | 1     |
+| Head       | duration | ...NORMALIZED...| 1     |
+| Head      | size     |           |           |           |           | 1     |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+ObjectStore Profile mode set to Disabled
++-----+-------+---------------------+
+| car | speed | time                |
++-----+-------+---------------------+
+| red | 20.0  | 1996-04-12T12:05:03 |
++-----+-------+---------------------+
+1 row(s) fetched. 
+[ELAPSED]
+
+\q
+
+----- stderr -----
diff --git a/datafusion-cli/tests/snapshots/s3_url_fallback@s3_url_fallback.snap b/datafusion-cli/tests/snapshots/s3_url_fallback@s3_url_fallback.snap
new file mode 100644
index 0000000000000..07036d041b42c
--- /dev/null
+++ b/datafusion-cli/tests/snapshots/s3_url_fallback@s3_url_fallback.snap
@@ -0,0 +1,34 @@
+---
+source: datafusion-cli/tests/cli_integration.rs
+info:
+  program: datafusion-cli
+  args: []
+  env:
+    AWS_ACCESS_KEY_ID: TEST-DataFusionLogin
+    AWS_ALLOW_HTTP: "true"
+    AWS_ENDPOINT: "http://localhost:32771"
+    AWS_SECRET_ACCESS_KEY: TEST-DataFusionPassword
+  stdin: "CREATE EXTERNAL TABLE partitioned_data\nSTORED AS CSV\nLOCATION 's3://data/partitioned_csv'\nOPTIONS (\n    'format.has_header' 'false'\n);\n\nSELECT * FROM partitioned_data ORDER BY column_1, column_2 LIMIT 5;\n"
+---
+success: true
+exit_code: 0
+----- stdout -----
+[CLI_VERSION]
+0 row(s) fetched. 
+[ELAPSED]
+
++----------+----------+----------+
+| column_1 | column_2 | column_3 |
++----------+----------+----------+
+| 0        | 0        | true     |
+| 0        | 1        | false    |
+| 0        | 2        | true     |
+| 0        | 3        | false    |
+| 0        | 4        | true     |
++----------+----------+----------+
+5 row(s) fetched. 
+[ELAPSED]
+
+\q
+
+----- stderr -----
diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml
index b31708a5c1cc7..e56f5ad6b8ca7 100644
--- a/datafusion-examples/Cargo.toml
+++ b/datafusion-examples/Cargo.toml
@@ -29,55 +29,50 @@ license = { workspace = true }
 authors = { workspace = true }
 rust-version = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
-[[example]]
-name = "flight_sql_server"
-path = "examples/flight/flight_sql_server.rs"
-
-[[example]]
-name = "flight_server"
-path = "examples/flight/flight_server.rs"
-
-[[example]]
-name = "flight_client"
-path = "examples/flight/flight_client.rs"
-
-[[example]]
-name = "dataframe_to_s3"
-path = "examples/external_dependency/dataframe-to-s3.rs"
-
-[[example]]
-name = "query_aws_s3"
-path = "examples/external_dependency/query-aws-s3.rs"
+[dependencies]
+arrow = { workspace = true }
+arrow-schema = { workspace = true }
+datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] }
+datafusion-common = { workspace = true }
+nom = "8.0.0"
+tempfile = { workspace = true }
+tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] }
 
 [dev-dependencies]
-arrow = { workspace = true }
-# arrow_schema is required for record_batch! macro :sad:
 arrow-flight = { workspace = true }
-arrow-schema = { workspace = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
 dashmap = { workspace = true }
 # note only use main datafusion crate for examples
-datafusion = { workspace = true, default-features = true }
-datafusion-ffi = { workspace = true }
+base64 = "0.22.1"
+datafusion-expr = { workspace = true }
+datafusion-physical-expr-adapter = { workspace = true }
 datafusion-proto = { workspace = true }
+datafusion-sql = { workspace = true }
 env_logger = { workspace = true }
 futures = { workspace = true }
+insta = { workspace = true }
 log = { workspace = true }
 mimalloc = { version = "0.1", default-features = false }
 object_store = { workspace = true, features = ["aws", "http"] }
 prost = { workspace = true }
-tempfile = { workspace = true }
+rand = { workspace = true }
+serde = { version = "1", features = ["derive"] }
+serde_json = { workspace = true }
+strum = { workspace = true }
+strum_macros = { workspace = true }
 test-utils = { path = "../test-utils" }
-tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
-tonic = "0.12.1"
+tonic = "0.14"
 tracing = { version = "0.1" }
 tracing-subscriber = { version = "0.3" }
 url = { workspace = true }
-uuid = "1.17"
+uuid = { workspace = true }
 
 [target.'cfg(not(target_os = "windows"))'.dev-dependencies]
-nix = { version = "0.30.1", features = ["fs"] }
+nix = { version = "0.31.1", features = ["fs"] }
diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index 3ba4c77cd84c3..2cf0ec52409f8 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -39,51 +39,181 @@ git submodule update --init
 # Change to the examples directory
 cd datafusion-examples/examples
 
-# Run the `dataframe` example:
-# ... use the equivalent for other examples
-cargo run --example dataframe
+# Run all examples in a group
+cargo run --example <group> -- all
+
+# Run a specific example within a group
+cargo run --example <group> -- <subcommand>
+
+# Run all examples in the `dataframe` group
+cargo run --example dataframe -- all
+
+# Run a single example from the `dataframe` group
+# (apply the same pattern for any other group)
+cargo run --example dataframe -- dataframe
 ```
 
-## Single Process
-
-- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
-- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
-- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)
-- [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files
-- [`analyzer_rule.rs`](examples/analyzer_rule.rs): Use a custom AnalyzerRule to change a query's semantics (row level access control)
-- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
-- [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization
-- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file
-- [`csv_json_opener.rs`](examples/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es
-- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider)
-- [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format
-- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3
-- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file.
-- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs
-- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s
-- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks.
-- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
-- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros
-- [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates
-- [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index over several parquet files and use it to speed up queries
-- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution
-- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`.
-- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan`
-- [`planner_api.rs`](examples/planner_api.rs) APIs to manipulate logical and physical plans
-- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics
-- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
-- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP
-- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
-- [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. over a network)
-- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
-- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
-- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
-- [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures
-- [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings
-- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser`
-- [`sql_query.rs`](examples/memtable.rs): Query data using SQL (in memory `RecordBatches`, local Parquet files)
-- [`date_time_function.rs`](examples/date_time_function.rs): Examples of date-time related functions and queries.
-
-## Distributed
-
-- [`flight_client.rs`](examples/flight/flight_client.rs) and [`flight_server.rs`](examples/flight/flight_server.rs): Run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol.
+## Builtin Functions Examples
+
+### Group: `builtin_functions`
+
+#### Category: Single Process
+
+| Subcommand       | File Path                                                                                 | Description                                                |
+| ---------------- | ----------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
+| date_time        | [`builtin_functions/date_time.rs`](examples/builtin_functions/date_time.rs)               | Examples of date-time related functions and queries        |
+| function_factory | [`builtin_functions/function_factory.rs`](examples/builtin_functions/function_factory.rs) | Register `CREATE FUNCTION` handler to implement SQL macros |
+| regexp           | [`builtin_functions/regexp.rs`](examples/builtin_functions/regexp.rs)                     | Examples of using regular expression functions             |
+
+## Custom Data Source Examples
+
+### Group: `custom_data_source`
+
+#### Category: Single Process
+
+| Subcommand            | File Path                                                                                             | Description                                                                                                         |
+| --------------------- | ----------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| adapter_serialization | [`custom_data_source/adapter_serialization.rs`](examples/custom_data_source/adapter_serialization.rs) | Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception |
+| csv_json_opener       | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs)             | Use low-level FileOpener APIs for CSV/JSON                                                                          |
+| csv_sql_streaming     | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs)         | Run a streaming SQL query against CSV data                                                                          |
+| custom_datasource     | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs)         | Query a custom TableProvider                                                                                        |
+| custom_file_casts     | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs)         | Implement custom casting rules                                                                                      |
+| custom_file_format    | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs)       | Write to a custom file format                                                                                       |
+| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs) | Custom default values using metadata                                                                                |
+| file_stream_provider  | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs)   | Read/write via FileStreamProvider for streams                                                                       |
+
+## Data IO Examples
+
+### Group: `data_io`
+
+#### Category: Single Process
+
+| Subcommand           | File Path                                                                                 | Description                                            |
+| -------------------- | ----------------------------------------------------------------------------------------- | ------------------------------------------------------ |
+| catalog              | [`data_io/catalog.rs`](examples/data_io/catalog.rs)                                       | Register tables into a custom catalog                  |
+| json_shredding       | [`data_io/json_shredding.rs`](examples/data_io/json_shredding.rs)                         | Implement filter rewriting for JSON shredding          |
+| parquet_adv_idx      | [`data_io/parquet_advanced_index.rs`](examples/data_io/parquet_advanced_index.rs)         | Create a secondary index across multiple parquet files |
+| parquet_emb_idx      | [`data_io/parquet_embedded_index.rs`](examples/data_io/parquet_embedded_index.rs)         | Store a custom index inside Parquet files              |
+| parquet_enc          | [`data_io/parquet_encrypted.rs`](examples/data_io/parquet_encrypted.rs)                   | Read & write encrypted Parquet files                   |
+| parquet_enc_with_kms | [`data_io/parquet_encrypted_with_kms.rs`](examples/data_io/parquet_encrypted_with_kms.rs) | Encrypted Parquet I/O using a KMS-backed factory       |
+| parquet_exec_visitor | [`data_io/parquet_exec_visitor.rs`](examples/data_io/parquet_exec_visitor.rs)             | Extract statistics by visiting an ExecutionPlan        |
+| parquet_idx          | [`data_io/parquet_index.rs`](examples/data_io/parquet_index.rs)                           | Create a secondary index                               |
+| query_http_csv       | [`data_io/query_http_csv.rs`](examples/data_io/query_http_csv.rs)                         | Query CSV files via HTTP                               |
+| remote_catalog       | [`data_io/remote_catalog.rs`](examples/data_io/remote_catalog.rs)                         | Interact with a remote catalog                         |
+
+## DataFrame Examples
+
+### Group: `dataframe`
+
+#### Category: Single Process
+
+| Subcommand            | File Path                                                                           | Description                                             |
+| --------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| cache_factory         | [`dataframe/cache_factory.rs`](examples/dataframe/cache_factory.rs)                 | Custom lazy caching for DataFrames using `CacheFactory` |
+| dataframe             | [`dataframe/dataframe.rs`](examples/dataframe/dataframe.rs)                         | Query DataFrames from various sources and write output  |
+| deserialize_to_struct | [`dataframe/deserialize_to_struct.rs`](examples/dataframe/deserialize_to_struct.rs) | Convert Arrow arrays into Rust structs                  |
+
+## Execution Monitoring Examples
+
+### Group: `execution_monitoring`
+
+#### Category: Single Process
+
+| Subcommand         | File Path                                                                                                           | Description                              |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------- | ---------------------------------------- |
+| mem_pool_exec_plan | [`execution_monitoring/memory_pool_execution_plan.rs`](examples/execution_monitoring/memory_pool_execution_plan.rs) | Memory-aware ExecutionPlan with spilling |
+| mem_pool_tracking  | [`execution_monitoring/memory_pool_tracking.rs`](examples/execution_monitoring/memory_pool_tracking.rs)             | Demonstrates memory tracking             |
+| tracing            | [`execution_monitoring/tracing.rs`](examples/execution_monitoring/tracing.rs)                                       | Demonstrates tracing integration         |
+
+## External Dependency Examples
+
+### Group: `external_dependency`
+
+#### Category: Single Process
+
+| Subcommand      | File Path                                                                                   | Description                              |
+| --------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------- |
+| dataframe_to_s3 | [`external_dependency/dataframe_to_s3.rs`](examples/external_dependency/dataframe_to_s3.rs) | Query DataFrames and write results to S3 |
+| query_aws_s3    | [`external_dependency/query_aws_s3.rs`](examples/external_dependency/query_aws_s3.rs)       | Query S3-backed data using object_store  |
+
+## Flight Examples
+
+### Group: `flight`
+
+#### Category: Distributed
+
+| Subcommand | File Path                                               | Description                                            |
+| ---------- | ------------------------------------------------------- | ------------------------------------------------------ |
+| client     | [`flight/client.rs`](examples/flight/client.rs)         | Execute SQL queries via Arrow Flight protocol          |
+| server     | [`flight/server.rs`](examples/flight/server.rs)         | Run DataFusion server accepting FlightSQL/JDBC queries |
+| sql_server | [`flight/sql_server.rs`](examples/flight/sql_server.rs) | Standalone SQL server for JDBC clients                 |
+
+## Proto Examples
+
+### Group: `proto`
+
+#### Category: Single Process
+
+| Subcommand               | File Path                                                                         | Description                                                                   |
+| ------------------------ | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization               |
+| expression_deduplication | [`proto/expression_deduplication.rs`](examples/proto/expression_deduplication.rs) | Example of expression caching/deduplication using the codec decorator pattern |
+
+## Query Planning Examples
+
+### Group: `query_planning`
+
+#### Category: Single Process
+
+| Subcommand     | File Path                                                                       | Description                                            |
+| -------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------ |
+| analyzer_rule  | [`query_planning/analyzer_rule.rs`](examples/query_planning/analyzer_rule.rs)   | Custom AnalyzerRule to change query semantics          |
+| expr_api       | [`query_planning/expr_api.rs`](examples/query_planning/expr_api.rs)             | Create, execute, analyze, and coerce Exprs             |
+| optimizer_rule | [`query_planning/optimizer_rule.rs`](examples/query_planning/optimizer_rule.rs) | Replace predicates via a custom OptimizerRule          |
+| parse_sql_expr | [`query_planning/parse_sql_expr.rs`](examples/query_planning/parse_sql_expr.rs) | Parse SQL into DataFusion Expr                         |
+| plan_to_sql    | [`query_planning/plan_to_sql.rs`](examples/query_planning/plan_to_sql.rs)       | Generate SQL from expressions or plans                 |
+| planner_api    | [`query_planning/planner_api.rs`](examples/query_planning/planner_api.rs)       | APIs for logical and physical plan manipulation        |
+| pruning        | [`query_planning/pruning.rs`](examples/query_planning/pruning.rs)               | Use pruning to skip irrelevant files                   |
+| thread_pools   | [`query_planning/thread_pools.rs`](examples/query_planning/thread_pools.rs)     | Configure custom thread pools for DataFusion execution |
+
+## Relation Planner Examples
+
+### Group: `relation_planner`
+
+#### Category: Single Process
+
+| Subcommand      | File Path                                                                             | Description                                |
+| --------------- | ------------------------------------------------------------------------------------- | ------------------------------------------ |
+| match_recognize | [`relation_planner/match_recognize.rs`](examples/relation_planner/match_recognize.rs) | Implement MATCH_RECOGNIZE pattern matching |
+| pivot_unpivot   | [`relation_planner/pivot_unpivot.rs`](examples/relation_planner/pivot_unpivot.rs)     | Implement PIVOT / UNPIVOT                  |
+| table_sample    | [`relation_planner/table_sample.rs`](examples/relation_planner/table_sample.rs)       | Implement TABLESAMPLE                      |
+
+## SQL Ops Examples
+
+### Group: `sql_ops`
+
+#### Category: Single Process
+
+| Subcommand        | File Path                                                               | Description                                        |
+| ----------------- | ----------------------------------------------------------------------- | -------------------------------------------------- |
+| analysis          | [`sql_ops/analysis.rs`](examples/sql_ops/analysis.rs)                   | Analyze SQL queries                                |
+| custom_sql_parser | [`sql_ops/custom_sql_parser.rs`](examples/sql_ops/custom_sql_parser.rs) | Implement a custom SQL parser to extend DataFusion |
+| frontend          | [`sql_ops/frontend.rs`](examples/sql_ops/frontend.rs)                   | Build LogicalPlans from SQL                        |
+| query             | [`sql_ops/query.rs`](examples/sql_ops/query.rs)                         | Query data using SQL                               |
+
+## UDF Examples
+
+### Group: `udf`
+
+#### Category: Single Process
+
+| Subcommand | File Path                                               | Description                                     |
+| ---------- | ------------------------------------------------------- | ----------------------------------------------- |
+| adv_udaf   | [`udf/advanced_udaf.rs`](examples/udf/advanced_udaf.rs) | Advanced User Defined Aggregate Function (UDAF) |
+| adv_udf    | [`udf/advanced_udf.rs`](examples/udf/advanced_udf.rs)   | Advanced User Defined Scalar Function (UDF)     |
+| adv_udwf   | [`udf/advanced_udwf.rs`](examples/udf/advanced_udwf.rs) | Advanced User Defined Window Function (UDWF)    |
+| async_udf  | [`udf/async_udf.rs`](examples/udf/async_udf.rs)         | Asynchronous User Defined Scalar Function       |
+| udaf       | [`udf/simple_udaf.rs`](examples/udf/simple_udaf.rs)     | Simple UDAF example                             |
+| udf        | [`udf/simple_udf.rs`](examples/udf/simple_udf.rs)       | Simple UDF example                              |
+| udtf       | [`udf/simple_udtf.rs`](examples/udf/simple_udtf.rs)     | Simple UDTF example                             |
+| udwf       | [`udf/simple_udwf.rs`](examples/udf/simple_udwf.rs)     | Simple UDWF example                             |
diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md
new file mode 100644
index 0000000000000..e8296a8856e60
--- /dev/null
+++ b/datafusion-examples/data/README.md
@@ -0,0 +1,25 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+## Example datasets
+
+| Filename    | Path                                    | Description                                                                                                                                                                          |
+| ----------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `cars.csv`  | [`data/csv/cars.csv`](./csv/cars.csv)   | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames).             |
+| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. |
diff --git a/datafusion-examples/data/csv/cars.csv b/datafusion-examples/data/csv/cars.csv
new file mode 100644
index 0000000000000..bc40f3b01e7a5
--- /dev/null
+++ b/datafusion-examples/data/csv/cars.csv
@@ -0,0 +1,26 @@
+car,speed,time
+red,20.0,1996-04-12T12:05:03.000000000
+red,20.3,1996-04-12T12:05:04.000000000
+red,21.4,1996-04-12T12:05:05.000000000
+red,21.5,1996-04-12T12:05:06.000000000
+red,19.0,1996-04-12T12:05:07.000000000
+red,18.0,1996-04-12T12:05:08.000000000
+red,17.0,1996-04-12T12:05:09.000000000
+red,7.0,1996-04-12T12:05:10.000000000
+red,7.1,1996-04-12T12:05:11.000000000
+red,7.2,1996-04-12T12:05:12.000000000
+red,3.0,1996-04-12T12:05:13.000000000
+red,1.0,1996-04-12T12:05:14.000000000
+red,0.0,1996-04-12T12:05:15.000000000
+green,10.0,1996-04-12T12:05:03.000000000
+green,10.3,1996-04-12T12:05:04.000000000
+green,10.4,1996-04-12T12:05:05.000000000
+green,10.5,1996-04-12T12:05:06.000000000
+green,11.0,1996-04-12T12:05:07.000000000
+green,12.0,1996-04-12T12:05:08.000000000
+green,14.0,1996-04-12T12:05:09.000000000
+green,15.0,1996-04-12T12:05:10.000000000
+green,15.1,1996-04-12T12:05:11.000000000
+green,15.2,1996-04-12T12:05:12.000000000
+green,8.0,1996-04-12T12:05:13.000000000
+green,2.0,1996-04-12T12:05:14.000000000
diff --git a/datafusion-examples/data/csv/regex.csv b/datafusion-examples/data/csv/regex.csv
new file mode 100644
index 0000000000000..b249c39522b60
--- /dev/null
+++ b/datafusion-examples/data/csv/regex.csv
@@ -0,0 +1,12 @@
+values,patterns,replacement,flags
+abc,^(a),bb\1bb,i
+ABC,^(A).*,B,i
+aBc,(b|d),e,i
+AbC,(B|D),e,
+aBC,^(b|c),d,
+4000,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz,
+4010,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz,
+Düsseldorf,[\p{Letter}-]+,München,
+Москва,[\p{L}-]+,Moscow,
+Köln,[a-zA-Z]ö[a-zA-Z]{2},Koln,
+اليوم,^\p{Arabic}+$,Today,
\ No newline at end of file
diff --git a/datafusion-examples/examples/date_time_functions.rs b/datafusion-examples/examples/builtin_functions/date_time.rs
similarity index 94%
rename from datafusion-examples/examples/date_time_functions.rs
rename to datafusion-examples/examples/builtin_functions/date_time.rs
index dbe9970439df7..08d4bc6e29978 100644
--- a/datafusion-examples/examples/date_time_functions.rs
+++ b/datafusion-examples/examples/builtin_functions/date_time.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
 use arrow::array::{Date32Array, Int32Array};
@@ -26,8 +28,20 @@ use datafusion::common::assert_contains;
 use datafusion::error::Result;
 use datafusion::prelude::*;
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Example: Working with Date and Time Functions
+///
+/// This example demonstrates how to work with various date and time
+/// functions in DataFusion using both the DataFrame API and SQL queries.
+///
+/// It includes:
+/// - `make_date`: building `DATE` values from year, month, and day columns
+/// - `to_date`: converting string expressions into `DATE` values
+/// - `to_timestamp`: parsing strings or numeric values into `TIMESTAMP`s
+/// - `to_char`: formatting dates, timestamps, and durations as strings
+///
+/// Together, these examples show how to create, convert, and format temporal
+/// data using DataFusion’s built-in functions.
+pub async fn date_time() -> Result<()> {
     query_make_date().await?;
     query_to_date().await?;
     query_to_timestamp().await?;
@@ -167,12 +181,13 @@ async fn query_make_date() -> Result<()> {
 
     // invalid column values will result in an error
     let result = ctx
-        .sql("select make_date(2024, null, 23)")
+        .sql("select make_date(2024, '', 23)")
         .await?
         .collect()
         .await;
 
-    let expected = "Execution error: Unable to parse date from null/empty value";
+    let expected =
+        "Arrow error: Cast error: Cannot cast string '' to value of Int32 type";
     assert_contains!(result.unwrap_err().to_string(), expected);
 
     // invalid date values will also result in an error
@@ -182,7 +197,7 @@ async fn query_make_date() -> Result<()> {
         .collect()
         .await;
 
-    let expected = "Execution error: Unable to parse date from 2024, 1, 32";
+    let expected = "Execution error: Day value '32' is out of range";
     assert_contains!(result.unwrap_err().to_string(), expected);
 
     Ok(())
@@ -492,14 +507,14 @@ async fn query_to_char() -> Result<()> {
 
     assert_batches_eq!(
         &[
-            "+------------------------------+",
-            "| to_char(t.values,t.patterns) |",
-            "+------------------------------+",
-            "| 2020-09-01                   |",
-            "| 2020:09:02                   |",
-            "| 20200903                     |",
-            "| 04-09-2020                   |",
-            "+------------------------------+",
+            "+----------------------------------+",
+            "| date_format(t.values,t.patterns) |",
+            "+----------------------------------+",
+            "| 2020-09-01                       |",
+            "| 2020:09:02                       |",
+            "| 20200903                         |",
+            "| 04-09-2020                       |",
+            "+----------------------------------+",
         ],
         &result
     );
diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/builtin_functions/function_factory.rs
similarity index 95%
rename from datafusion-examples/examples/function_factory.rs
rename to datafusion-examples/examples/builtin_functions/function_factory.rs
index e712f4ea8eaa4..106c53cdf7f12 100644
--- a/datafusion-examples/examples/function_factory.rs
+++ b/datafusion-examples/examples/builtin_functions/function_factory.rs
@@ -15,19 +15,22 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::datatypes::DataType;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::{exec_err, internal_err, DataFusionError};
+use datafusion::common::{DataFusionError, exec_datafusion_err, exec_err, internal_err};
 use datafusion::error::Result;
 use datafusion::execution::context::{
     FunctionFactory, RegisterFunction, SessionContext, SessionState,
 };
-use datafusion::logical_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion::logical_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion::logical_expr::{
     ColumnarValue, CreateFunction, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
     Signature, Volatility,
 };
+use std::hash::Hash;
 use std::result::Result as RResult;
 use std::sync::Arc;
 
@@ -41,8 +44,7 @@ use std::sync::Arc;
 ///
 /// This example is rather simple and does not cover all cases required for a
 /// real implementation.
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn function_factory() -> Result<()> {
     // First we must configure the SessionContext with our function factory
     let ctx = SessionContext::new()
         // register custom function factory
@@ -106,7 +108,7 @@ impl FunctionFactory for CustomFunctionFactory {
 }
 
 /// this function represents the newly created execution engine.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct ScalarFunctionWrapper {
     /// The text of the function body, `$1 + f1($2)` in our example
     name: String,
@@ -143,17 +145,13 @@ impl ScalarUDFImpl for ScalarFunctionWrapper {
     fn simplify(
         &self,
         args: Vec<Expr>,
-        _info: &dyn SimplifyInfo,
+        _info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         let replacement = Self::replacement(&self.expr, &args)?;
 
         Ok(ExprSimplifyResult::Simplified(replacement))
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn output_ordering(&self, _input: &[ExprProperties]) -> Result<SortProperties> {
         Ok(SortProperties::Unordered)
     }
@@ -188,9 +186,7 @@ impl ScalarFunctionWrapper {
     fn parse_placeholder_identifier(placeholder: &str) -> Result<usize> {
         if let Some(value) = placeholder.strip_prefix('$') {
             Ok(value.parse().map(|v: usize| v - 1).map_err(|e| {
-                DataFusionError::Execution(format!(
-                    "Placeholder `{placeholder}` parsing error: {e}!"
-                ))
+                exec_datafusion_err!("Placeholder `{placeholder}` parsing error: {e}!")
             })?)
         } else {
             exec_err!("Placeholder should start with `$`!")
diff --git a/datafusion-examples/examples/builtin_functions/main.rs b/datafusion-examples/examples/builtin_functions/main.rs
new file mode 100644
index 0000000000000..42ca15f91935d
--- /dev/null
+++ b/datafusion-examples/examples/builtin_functions/main.rs
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These are miscellaneous function-related examples
+//!
+//! These examples demonstrate miscellaneous function-related features.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example builtin_functions -- [all|date_time|function_factory|regexp]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `date_time`
+//!   (file: date_time.rs, desc: Examples of date-time related functions and queries)
+//!
+//! - `function_factory`  
+//!   (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros)
+//!
+//! - `regexp`
+//!   (file: regexp.rs, desc: Examples of using regular expression functions)
+
+mod date_time;
+mod function_factory;
+mod regexp;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    DateTime,
+    FunctionFactory,
+    Regexp,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "builtin_functions";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::DateTime => date_time::date_time().await?,
+            ExampleKind::FunctionFactory => function_factory::function_factory().await?,
+            ExampleKind::Regexp => regexp::regexp().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs
similarity index 74%
rename from datafusion-examples/examples/regexp.rs
rename to datafusion-examples/examples/builtin_functions/regexp.rs
index 12d115b9b502c..97dc71b94e934 100644
--- a/datafusion-examples/examples/regexp.rs
+++ b/datafusion-examples/examples/builtin_functions/regexp.rs
@@ -1,5 +1,4 @@
 // Licensed to the Apache Software Foundation (ASF) under one
-// Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
@@ -16,9 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use datafusion::common::{assert_batches_eq, assert_contains};
 use datafusion::error::Result;
 use datafusion::prelude::*;
+use datafusion_examples::utils::datasets::ExampleDataset;
 
 /// This example demonstrates how to use the regexp_* functions
 ///
@@ -28,15 +30,12 @@ use datafusion::prelude::*;
 ///
 /// Supported flags can be found at
 /// https://docs.rs/regex/latest/regex/#grouping-and-flags
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn regexp() -> Result<()> {
     let ctx = SessionContext::new();
-    ctx.register_csv(
-        "examples",
-        "../../datafusion/physical-expr/tests/data/regex.csv",
-        CsvReadOptions::new(),
-    )
-    .await?;
+    let dataset = ExampleDataset::Regex;
+
+    ctx.register_csv("examples", dataset.path_str()?, CsvReadOptions::new())
+        .await?;
 
     //
     //
@@ -112,11 +111,11 @@ async fn main() -> Result<()> {
 
     assert_batches_eq!(
         &[
-    "+---------------------------------------------------+----------------------------------------------------+",
-    "| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
-    "+---------------------------------------------------+----------------------------------------------------+",
-    "| true                                              | true                                               |",
-    "+---------------------------------------------------+----------------------------------------------------+",
+            "+---------------------------------------------------+----------------------------------------------------+",
+            "| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
+            "+---------------------------------------------------+----------------------------------------------------+",
+            "| true                                              | true                                               |",
+            "+---------------------------------------------------+----------------------------------------------------+",
         ],
         &result
     );
@@ -242,11 +241,11 @@ async fn main() -> Result<()> {
 
     assert_batches_eq!(
         &[
-    "+----------------------------------------------------+-----------------------------------------------------+",
-    "| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
-    "+----------------------------------------------------+-----------------------------------------------------+",
-    "| [John Smith]                                       | [Smith Jones]                                       |",
-    "+----------------------------------------------------+-----------------------------------------------------+",
+            "+----------------------------------------------------+-----------------------------------------------------+",
+            "| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
+            "+----------------------------------------------------+-----------------------------------------------------+",
+            "| [John Smith]                                       | [Smith Jones]                                       |",
+            "+----------------------------------------------------+-----------------------------------------------------+",
         ],
         &result
     );
@@ -268,21 +267,21 @@ async fn main() -> Result<()> {
 
     assert_batches_eq!(
         &[
-    "+---------------------------------------------------------------------------------------------------------+",
-    "| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |",
-    "+---------------------------------------------------------------------------------------------------------+",
-    "| bbabbbc                                                                                                 |",
-    "| B                                                                                                       |",
-    "| aec                                                                                                     |",
-    "| AbC                                                                                                     |",
-    "| aBC                                                                                                     |",
-    "| 4000                                                                                                    |",
-    "| xyz                                                                                                     |",
-    "| München                                                                                                 |",
-    "| Moscow                                                                                                  |",
-    "| Koln                                                                                                    |",
-    "| Today                                                                                                   |",
-    "+---------------------------------------------------------------------------------------------------------+",
+            "+---------------------------------------------------------------------------------------------------------+",
+            "| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |",
+            "+---------------------------------------------------------------------------------------------------------+",
+            "| bbabbbc                                                                                                 |",
+            "| B                                                                                                       |",
+            "| aec                                                                                                     |",
+            "| AbC                                                                                                     |",
+            "| aBC                                                                                                     |",
+            "| 4000                                                                                                    |",
+            "| xyz                                                                                                     |",
+            "| München                                                                                                 |",
+            "| Moscow                                                                                                  |",
+            "| Koln                                                                                                    |",
+            "| Today                                                                                                   |",
+            "+---------------------------------------------------------------------------------------------------------+",
         ],
         &result
     );
@@ -296,11 +295,11 @@ async fn main() -> Result<()> {
 
     assert_batches_eq!(
         &[
-    "+------------------------------------------------------------------------+",
-    "| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
-    "+------------------------------------------------------------------------+",
-    "| fooXarYXazY                                                            |",
-    "+------------------------------------------------------------------------+",
+            "+------------------------------------------------------------------------+",
+            "| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
+            "+------------------------------------------------------------------------+",
+            "| fooXarYXazY                                                            |",
+            "+------------------------------------------------------------------------+",
         ],
         &result
     );
diff --git a/datafusion-examples/examples/custom_data_source/adapter_serialization.rs b/datafusion-examples/examples/custom_data_source/adapter_serialization.rs
new file mode 100644
index 0000000000000..a2cd187fee067
--- /dev/null
+++ b/datafusion-examples/examples/custom_data_source/adapter_serialization.rs
@@ -0,0 +1,519 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example demonstrates how to use the `PhysicalExtensionCodec` trait's
+//! interception methods (`serialize_physical_plan` and `deserialize_physical_plan`)
+//! to implement custom serialization logic.
+//!
+//! The key insight is that `FileScanConfig::expr_adapter_factory` is NOT serialized by
+//! default. This example shows how to:
+//! 1. Detect plans with custom adapters during serialization
+//! 2. Wrap them as Extension nodes with JSON-serialized adapter metadata
+//! 3. Store the inner DataSourceExec (without adapter) as a child in the extension's inputs field
+//! 4. Unwrap and restore the adapter during deserialization
+//!
+//! This demonstrates nested serialization (protobuf outer, JSON inner) and the power
+//! of the `PhysicalExtensionCodec` interception pattern. Both plan and expression
+//! serialization route through the codec, enabling interception at every node in the tree.
+
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use arrow::array::record_batch;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::assert_batches_eq;
+use datafusion::common::{Result, not_impl_err};
+use datafusion::datasource::listing::{
+    ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
+};
+use datafusion::datasource::physical_plan::{FileScanConfig, FileScanConfigBuilder};
+use datafusion::datasource::source::DataSourceExec;
+use datafusion::execution::TaskContext;
+use datafusion::execution::context::SessionContext;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionConfig;
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+};
+use datafusion_proto::bytes::{
+    physical_plan_from_bytes_with_proto_converter,
+    physical_plan_to_bytes_with_proto_converter,
+};
+use datafusion_proto::physical_plan::from_proto::parse_physical_expr_with_converter;
+use datafusion_proto::physical_plan::to_proto::serialize_physical_expr_with_converter;
+use datafusion_proto::physical_plan::{
+    PhysicalExtensionCodec, PhysicalProtoConverterExtension,
+};
+use datafusion_proto::protobuf::physical_plan_node::PhysicalPlanType;
+use datafusion_proto::protobuf::{
+    PhysicalExprNode, PhysicalExtensionNode, PhysicalPlanNode,
+};
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+use serde::{Deserialize, Serialize};
+
+/// Example showing how to preserve custom adapter information during plan serialization.
+///
+/// This demonstrates:
+/// 1. Creating a custom PhysicalExprAdapter with metadata
+/// 2. Using PhysicalExtensionCodec to intercept serialization
+/// 3. Wrapping adapter info as Extension nodes
+/// 4. Restoring adapters during deserialization
+pub async fn adapter_serialization() -> Result<()> {
+    println!("=== PhysicalExprAdapter Serialization Example ===\n");
+
+    // Step 1: Create sample Parquet data in memory
+    println!("Step 1: Creating sample Parquet data...");
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let batch = record_batch!(("id", Int32, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))?;
+    let path = Path::from("data.parquet");
+    write_parquet(&store, &path, &batch).await?;
+
+    // Step 2: Set up session with custom adapter
+    println!("Step 2: Setting up session with custom adapter...");
+    let logical_schema =
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+
+    let mut cfg = SessionConfig::new();
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.runtime_env().register_object_store(
+        ObjectStoreUrl::parse("memory://")?.as_ref(),
+        Arc::clone(&store),
+    );
+
+    // Create a table with our custom MetadataAdapterFactory
+    let adapter_factory = Arc::new(MetadataAdapterFactory::new("v1"));
+    let listing_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///data.parquet")?)
+            .infer_options(&ctx.state())
+            .await?
+            .with_schema(logical_schema)
+            .with_expr_adapter_factory(
+                Arc::clone(&adapter_factory) as Arc<dyn PhysicalExprAdapterFactory>
+            );
+    let table = ListingTable::try_new(listing_config)?;
+    ctx.register_table("my_table", Arc::new(table))?;
+
+    // Step 3: Create physical plan with filter
+    println!("Step 3: Creating physical plan with filter...");
+    let df = ctx.sql("SELECT * FROM my_table WHERE id > 5").await?;
+    let original_plan = df.create_physical_plan().await?;
+
+    // Verify adapter is present in original plan
+    let has_adapter_before = verify_adapter_in_plan(&original_plan, "original");
+    println!("  Original plan has adapter: {has_adapter_before}");
+
+    // Step 4: Serialize with our custom codec
+    println!("\nStep 4: Serializing plan with AdapterPreservingCodec...");
+    let codec = AdapterPreservingCodec;
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&original_plan),
+        &codec,
+        &codec,
+    )?;
+    println!("  Serialized {} bytes", bytes.len());
+    println!("  (DataSourceExec with adapter was wrapped as PhysicalExtensionNode)");
+
+    // Step 5: Deserialize with our custom codec
+    println!("\nStep 5: Deserializing plan with AdapterPreservingCodec...");
+    let task_ctx = ctx.task_ctx();
+    let restored_plan =
+        physical_plan_from_bytes_with_proto_converter(&bytes, &task_ctx, &codec, &codec)?;
+
+    // Verify adapter is restored
+    let has_adapter_after = verify_adapter_in_plan(&restored_plan, "restored");
+    println!("  Restored plan has adapter: {has_adapter_after}");
+
+    // Step 6: Execute and compare results
+    println!("\nStep 6: Executing plans and comparing results...");
+    let original_results =
+        datafusion::physical_plan::collect(Arc::clone(&original_plan), task_ctx.clone())
+            .await?;
+    let restored_results =
+        datafusion::physical_plan::collect(restored_plan, task_ctx).await?;
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| id |",
+        "+----+",
+        "| 6  |",
+        "| 7  |",
+        "| 8  |",
+        "| 9  |",
+        "| 10 |",
+        "+----+",
+    ];
+
+    println!("\n  Original plan results:");
+    arrow::util::pretty::print_batches(&original_results)?;
+    assert_batches_eq!(expected, &original_results);
+
+    println!("\n  Restored plan results:");
+    arrow::util::pretty::print_batches(&restored_results)?;
+    assert_batches_eq!(expected, &restored_results);
+
+    println!("\n=== Example Complete! ===");
+    println!("Key takeaways:");
+    println!(
+        "  1. PhysicalExtensionCodec provides serialize_physical_plan/deserialize_physical_plan hooks"
+    );
+    println!("  2. Custom metadata can be wrapped as PhysicalExtensionNode");
+    println!("  3. Nested serialization (protobuf + JSON) works seamlessly");
+    println!(
+        "  4. Both plans produce identical results despite serialization round-trip"
+    );
+    println!("  5. Adapters are fully preserved through the serialization round-trip");
+
+    Ok(())
+}
+
+// ============================================================================
+// MetadataAdapter - A simple custom adapter with a tag
+// ============================================================================
+
+/// A custom PhysicalExprAdapter that wraps another adapter.
+/// The tag metadata is stored in the factory, not the adapter itself.
+#[derive(Debug)]
+struct MetadataAdapter {
+    inner: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for MetadataAdapter {
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        // Simply delegate to inner adapter
+        self.inner.rewrite(expr)
+    }
+}
+
+// ============================================================================
+// MetadataAdapterFactory - Factory for creating MetadataAdapter instances
+// ============================================================================
+
+/// Factory for creating MetadataAdapter instances.
+/// The tag is stored in the factory and extracted via Debug formatting in `extract_adapter_tag`.
+#[derive(Debug)]
+struct MetadataAdapterFactory {
+    // Note: This field is read via Debug formatting in `extract_adapter_tag`.
+    // Rust's dead code analysis doesn't recognize Debug-based field access.
+    // In PR #19234, this field is used by `with_partition_values`, but that method
+    // doesn't exist in upstream DataFusion's PhysicalExprAdapter trait.
+    #[expect(dead_code)]
+    tag: String,
+}
+
+impl MetadataAdapterFactory {
+    fn new(tag: impl Into<String>) -> Self {
+        Self { tag: tag.into() }
+    }
+}
+
+impl PhysicalExprAdapterFactory for MetadataAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        let inner = DefaultPhysicalExprAdapterFactory
+            .create(logical_file_schema, physical_file_schema)?;
+        Ok(Arc::new(MetadataAdapter { inner }))
+    }
+}
+
+// ============================================================================
+// AdapterPreservingCodec - Custom codec that preserves adapters
+// ============================================================================
+
+/// Extension payload structure for serializing adapter info
+#[derive(Serialize, Deserialize)]
+struct ExtensionPayload {
+    /// Marker to identify this is our custom extension
+    marker: String,
+    /// JSON-serialized adapter metadata
+    adapter_metadata: AdapterMetadata,
+}
+
+/// Metadata about the adapter to recreate it during deserialization
+#[derive(Serialize, Deserialize)]
+struct AdapterMetadata {
+    /// The adapter tag (e.g., "v1")
+    tag: String,
+}
+
+const EXTENSION_MARKER: &str = "adapter_preserving_extension_v1";
+
+/// A codec that intercepts serialization to preserve adapter information.
+#[derive(Debug)]
+struct AdapterPreservingCodec;
+
+impl PhysicalExtensionCodec for AdapterPreservingCodec {
+    // Required method: decode custom extension nodes
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Try to parse as our extension payload
+        if let Ok(payload) = serde_json::from_slice::<ExtensionPayload>(buf)
+            && payload.marker == EXTENSION_MARKER
+        {
+            if inputs.len() != 1 {
+                return Err(datafusion::error::DataFusionError::Plan(format!(
+                    "Extension node expected exactly 1 child, got {}",
+                    inputs.len()
+                )));
+            }
+            let inner_plan = inputs[0].clone();
+
+            // Recreate the adapter factory
+            let adapter_factory = create_adapter_factory(&payload.adapter_metadata.tag);
+
+            // Inject adapter into the plan
+            return inject_adapter_into_plan(inner_plan, adapter_factory);
+        }
+
+        not_impl_err!("Unknown extension type")
+    }
+
+    // Required method: encode custom execution plans
+    fn try_encode(
+        &self,
+        _node: Arc<dyn ExecutionPlan>,
+        _buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        // We don't need this for the example - we use serialize_physical_plan instead
+        not_impl_err!(
+            "try_encode not used - adapter wrapping happens in serialize_physical_plan"
+        )
+    }
+}
+
+impl PhysicalProtoConverterExtension for AdapterPreservingCodec {
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        extension_codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalPlanNode> {
+        // Check if this is a DataSourceExec with adapter
+        if let Some(exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+            && let Some(config) =
+                exec.data_source().as_any().downcast_ref::<FileScanConfig>()
+            && let Some(adapter_factory) = &config.expr_adapter_factory
+            && let Some(tag) = extract_adapter_tag(adapter_factory.as_ref())
+        {
+            // Try to extract our MetadataAdapterFactory's tag
+            println!("    [Serialize] Found DataSourceExec with adapter tag: {tag}");
+
+            // 1. Create adapter metadata
+            let adapter_metadata = AdapterMetadata { tag };
+
+            // 2. Serialize the inner plan to protobuf
+            //    Note that this will drop the custom adapter since the default serialization cannot handle it
+            let inner_proto = PhysicalPlanNode::try_from_physical_plan_with_converter(
+                Arc::clone(plan),
+                extension_codec,
+                self,
+            )?;
+
+            // 3. Create extension payload to wrap the plan
+            //    so that the custom adapter gets re-attached during deserialization
+            //    The choice of JSON is arbitrary; other formats could be used.
+            let payload = ExtensionPayload {
+                marker: EXTENSION_MARKER.to_string(),
+                adapter_metadata,
+            };
+            let payload_bytes = serde_json::to_vec(&payload).map_err(|e| {
+                datafusion::error::DataFusionError::Plan(format!(
+                    "Failed to serialize payload: {e}"
+                ))
+            })?;
+
+            // 4. Return as PhysicalExtensionNode with child plan in inputs
+            return Ok(PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::Extension(
+                    PhysicalExtensionNode {
+                        node: payload_bytes,
+                        inputs: vec![inner_proto],
+                    },
+                )),
+            });
+        }
+
+        // No adapter found, not a DataSourceExec, etc. - use default serialization
+        PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            extension_codec,
+            self,
+        )
+    }
+
+    // Interception point: override deserialization to unwrap adapters
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        extension_codec: &dyn PhysicalExtensionCodec,
+        proto: &PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Check if this is our custom extension wrapper
+        if let Some(PhysicalPlanType::Extension(extension)) = &proto.physical_plan_type
+            && let Ok(payload) =
+                serde_json::from_slice::<ExtensionPayload>(&extension.node)
+            && payload.marker == EXTENSION_MARKER
+        {
+            println!(
+                "    [Deserialize] Found adapter extension with tag: {}",
+                payload.adapter_metadata.tag
+            );
+
+            // Get the inner plan proto from inputs field
+            if extension.inputs.is_empty() {
+                return Err(datafusion::error::DataFusionError::Plan(
+                    "Extension node missing child plan in inputs".to_string(),
+                ));
+            }
+            let inner_proto = &extension.inputs[0];
+
+            // Deserialize the inner plan
+            let inner_plan = inner_proto.try_into_physical_plan_with_converter(
+                ctx,
+                extension_codec,
+                self,
+            )?;
+
+            // Recreate the adapter factory
+            let adapter_factory = create_adapter_factory(&payload.adapter_metadata.tag);
+
+            // Inject adapter into the plan
+            return inject_adapter_into_plan(inner_plan, adapter_factory);
+        }
+
+        // Not our extension - use default deserialization
+        proto.try_into_physical_plan_with_converter(ctx, extension_codec, self)
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        parse_physical_expr_with_converter(proto, ctx, input_schema, codec, self)
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalExprNode> {
+        serialize_physical_expr_with_converter(expr, codec, self)
+    }
+}
+
+// ============================================================================
+// Helper functions
+// ============================================================================
+
+/// Write a RecordBatch to Parquet in the object store
+async fn write_parquet(
+    store: &dyn ObjectStore,
+    path: &Path,
+    batch: &arrow::record_batch::RecordBatch,
+) -> Result<()> {
+    let mut buf = vec![];
+    let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None)?;
+    writer.write(batch)?;
+    writer.close()?;
+
+    let payload = PutPayload::from_bytes(buf.into());
+    store.put(path, payload).await?;
+    Ok(())
+}
+
+/// Extract the tag from a MetadataAdapterFactory.
+///
+/// Note: Since `PhysicalExprAdapterFactory` doesn't provide `as_any()` for downcasting,
+/// we parse the Debug output. In a production system, you might add a dedicated trait
+/// method for metadata extraction.
+fn extract_adapter_tag(factory: &dyn PhysicalExprAdapterFactory) -> Option<String> {
+    let debug_str = format!("{factory:?}");
+    if debug_str.contains("MetadataAdapterFactory") {
+        // Extract tag from debug output: MetadataAdapterFactory { tag: "v1" }
+        if let Some(start) = debug_str.find("tag: \"") {
+            let after_tag = &debug_str[start + 6..];
+            if let Some(end) = after_tag.find('"') {
+                return Some(after_tag[..end].to_string());
+            }
+        }
+    }
+    None
+}
+
+/// Create an adapter factory from a tag
+fn create_adapter_factory(tag: &str) -> Arc<dyn PhysicalExprAdapterFactory> {
+    Arc::new(MetadataAdapterFactory::new(tag))
+}
+
+/// Inject an adapter into a plan (assumes plan is a DataSourceExec with FileScanConfig)
+fn inject_adapter_into_plan(
+    plan: Arc<dyn ExecutionPlan>,
+    adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    if let Some(exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+        && let Some(config) = exec.data_source().as_any().downcast_ref::<FileScanConfig>()
+    {
+        let new_config = FileScanConfigBuilder::from(config.clone())
+            .with_expr_adapter(Some(adapter_factory))
+            .build();
+        return Ok(DataSourceExec::from_data_source(new_config));
+    }
+    // If not a DataSourceExec with FileScanConfig, return as-is
+    Ok(plan)
+}
+
+/// Helper to verify if a plan has an adapter (for testing/validation)
+fn verify_adapter_in_plan(plan: &Arc<dyn ExecutionPlan>, label: &str) -> bool {
+    // Walk the plan tree to find DataSourceExec with adapter
+    fn check_plan(plan: &dyn ExecutionPlan) -> bool {
+        if let Some(exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+            && let Some(config) =
+                exec.data_source().as_any().downcast_ref::<FileScanConfig>()
+            && config.expr_adapter_factory.is_some()
+        {
+            return true;
+        }
+        // Check children
+        for child in plan.children() {
+            if check_plan(child.as_ref()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    let has_adapter = check_plan(plan.as_ref());
+    println!("    [Verify] {label} plan adapter check: {has_adapter}");
+    has_adapter
+}
diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs
similarity index 66%
rename from datafusion-examples/examples/csv_json_opener.rs
rename to datafusion-examples/examples/custom_data_source/csv_json_opener.rs
index 1a2c2cbff4183..4804586382dc2 100644
--- a/datafusion-examples/examples/csv_json_opener.rs
+++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs
@@ -15,32 +15,36 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
 use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::common::config::CsvOptions;
 use datafusion::{
     assert_batches_eq,
     datasource::{
         file_format::file_compression_type::FileCompressionType,
         listing::PartitionedFile,
         object_store::ObjectStoreUrl,
-        physical_plan::{CsvSource, FileSource, FileStream, JsonOpener, JsonSource},
+        physical_plan::{
+            CsvSource, FileSource, FileStreamBuilder, JsonOpener, JsonSource,
+        },
     },
     error::Result,
     physical_plan::metrics::ExecutionPlanMetricsSet,
-    test_util::aggr_test_schema,
 };
 
 use datafusion::datasource::physical_plan::FileScanConfigBuilder;
+use datafusion_examples::utils::datasets::ExampleDataset;
 use futures::StreamExt;
-use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore};
+use object_store::{ObjectStoreExt, local::LocalFileSystem, memory::InMemory};
 
 /// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly
 /// read data from (CSV/JSON) into Arrow RecordBatches.
 ///
 /// If you want to query data in CSV or JSON files, see the [`dataframe.rs`] and [`sql_query.rs`] examples
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn csv_json_opener() -> Result<()> {
     csv_opener().await?;
     json_opener().await?;
     Ok(())
@@ -48,48 +52,53 @@ async fn main() -> Result<()> {
 
 async fn csv_opener() -> Result<()> {
     let object_store = Arc::new(LocalFileSystem::new());
-    let schema = aggr_test_schema();
 
-    let testdata = datafusion::test_util::arrow_test_data();
-    let path = format!("{testdata}/csv/aggregate_test_100.csv");
+    let dataset = ExampleDataset::Cars;
+    let csv_path = dataset.path();
+    let schema = dataset.schema();
 
-    let path = std::path::Path::new(&path).canonicalize()?;
+    let options = CsvOptions {
+        has_header: Some(true),
+        delimiter: b',',
+        quote: b'"',
+        ..Default::default()
+    };
 
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        Arc::clone(&schema),
-        Arc::new(CsvSource::default()),
-    )
-    .with_projection(Some(vec![12, 0]))
-    .with_limit(Some(5))
-    .with_file(PartitionedFile::new(path.display().to_string(), 10))
-    .build();
-
-    let config = CsvSource::new(true, b',', b'"')
+    let source = CsvSource::new(Arc::clone(&schema))
+        .with_csv_options(options)
         .with_comment(Some(b'#'))
-        .with_schema(schema)
-        .with_batch_size(8192)
-        .with_projection(&scan_config);
+        .with_batch_size(8192);
 
-    let opener = config.create_file_opener(object_store, &scan_config, 0);
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+            .with_projection_indices(Some(vec![0, 1]))?
+            .with_limit(Some(5))
+            .with_file(PartitionedFile::new(csv_path.display().to_string(), 10))
+            .build();
+
+    let opener =
+        scan_config
+            .file_source()
+            .create_file_opener(object_store, &scan_config, 0)?;
 
     let mut result = vec![];
     let mut stream =
-        FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?;
+        FileStreamBuilder::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())
+            .build()?;
     while let Some(batch) = stream.next().await.transpose()? {
         result.push(batch);
     }
     assert_batches_eq!(
         &[
-            "+--------------------------------+----+",
-            "| c13                            | c1 |",
-            "+--------------------------------+----+",
-            "| 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW | c  |",
-            "| C2GT5KVyOPZpgKVl110TyZO0NcJ434 | d  |",
-            "| AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz | b  |",
-            "| 0keZ5G8BffGwgF2RwQD59TFzMStxCB | a  |",
-            "| Ig1QcuKsjHXkproePdERo2w0mYzIqd | b  |",
-            "+--------------------------------+----+",
+            "+-----+-------+",
+            "| car | speed |",
+            "+-----+-------+",
+            "| red | 20.0  |",
+            "| red | 20.3  |",
+            "| red | 21.4  |",
+            "| red | 21.5  |",
+            "| red | 19.0  |",
+            "+-----+-------+",
         ],
         &result
     );
@@ -119,24 +128,25 @@ async fn json_opener() -> Result<()> {
         projected,
         FileCompressionType::UNCOMPRESSED,
         Arc::new(object_store),
+        true,
     );
 
     let scan_config = FileScanConfigBuilder::new(
         ObjectStoreUrl::local_filesystem(),
-        schema,
-        Arc::new(JsonSource::default()),
+        Arc::new(JsonSource::new(schema)),
     )
-    .with_projection(Some(vec![1, 0]))
+    .with_projection_indices(Some(vec![1, 0]))?
     .with_limit(Some(5))
     .with_file(PartitionedFile::new(path.to_string(), 10))
     .build();
 
-    let mut stream = FileStream::new(
+    let mut stream = FileStreamBuilder::new(
         &scan_config,
         0,
         Arc::new(opener),
         &ExecutionPlanMetricsSet::new(),
-    )?;
+    )
+    .build()?;
     let mut result = vec![];
     while let Some(batch) = stream.next().await.transpose()? {
         result.push(batch);
diff --git a/datafusion-examples/examples/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs
similarity index 82%
rename from datafusion-examples/examples/csv_sql_streaming.rs
rename to datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs
index 99264bbcb486d..4692086a10b26 100644
--- a/datafusion-examples/examples/csv_sql_streaming.rs
+++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs
@@ -15,44 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::common::test_util::datafusion_test_data;
+//! See `main.rs` for how to run it.
+
 use datafusion::error::Result;
 use datafusion::prelude::*;
+use datafusion_examples::utils::datasets::ExampleDataset;
 
 /// This example demonstrates executing a simple query against an Arrow data source (CSV) and
 /// fetching results with streaming aggregation and streaming window
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn csv_sql_streaming() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
-    let testdata = datafusion_test_data();
+    let dataset = ExampleDataset::Cars;
+    let csv_path = dataset.path();
 
-    // Register a table source and tell DataFusion the file is ordered by `ts ASC`.
+    // Register a table source and tell DataFusion the file is ordered by `car ASC`.
     // Note it is the responsibility of the user to make sure
     // that file indeed satisfies this condition or else incorrect answers may be produced.
     let asc = true;
     let nulls_first = true;
-    let sort_expr = vec![col("ts").sort(asc, nulls_first)];
+    let sort_expr = vec![col("car").sort(asc, nulls_first)];
     // register csv file with the execution context
     ctx.register_csv(
         "ordered_table",
-        &format!("{testdata}/window_1.csv"),
+        csv_path.to_str().unwrap(),
         CsvReadOptions::new().file_sort_order(vec![sort_expr]),
     )
     .await?;
 
     // execute the query
-    // Following query can be executed with unbounded sources because group by expressions (e.g ts) is
+    // Following query can be executed with unbounded sources because group by expressions (e.g car) is
     // already ordered at the source.
     //
     // Unbounded sources means that if the input came from a "never ending" source (such as a FIFO
     // file on unix) the query could produce results incrementally as data was read.
     let df = ctx
         .sql(
-            "SELECT ts, MIN(inc_col), MAX(inc_col) \
+            "SELECT car, MIN(speed), MAX(speed) \
         FROM ordered_table \
-        GROUP BY ts",
+        GROUP BY car",
         )
         .await?;
 
@@ -63,7 +65,7 @@ async fn main() -> Result<()> {
     // its result in streaming fashion, because its required ordering is already satisfied at the source.
     let df = ctx
         .sql(
-            "SELECT ts, SUM(inc_col) OVER(ORDER BY ts ASC) \
+            "SELECT car, SUM(speed) OVER(ORDER BY car ASC) \
         FROM ordered_table",
         )
         .await?;
diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_data_source/custom_datasource.rs
similarity index 87%
rename from datafusion-examples/examples/custom_datasource.rs
rename to datafusion-examples/examples/custom_data_source/custom_datasource.rs
index bc865fac5a338..71e589dcf6e88 100644
--- a/datafusion-examples/examples/custom_datasource.rs
+++ b/datafusion-examples/examples/custom_data_source/custom_datasource.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::any::Any;
 use std::collections::{BTreeMap, HashMap};
 use std::fmt::{self, Debug, Formatter};
@@ -22,10 +24,11 @@ use std::sync::{Arc, Mutex};
 use std::time::Duration;
 
 use async_trait::async_trait;
-use datafusion::arrow::array::{UInt64Builder, UInt8Builder};
+use datafusion::arrow::array::{UInt8Builder, UInt64Builder};
 use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::datasource::{provider_as_source, TableProvider, TableType};
+use datafusion::common::tree_node::TreeNodeRecursion;
+use datafusion::datasource::{TableProvider, TableType, provider_as_source};
 use datafusion::error::Result;
 use datafusion::execution::context::TaskContext;
 use datafusion::logical_expr::LogicalPlanBuilder;
@@ -33,8 +36,8 @@ use datafusion::physical_expr::EquivalenceProperties;
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::memory::MemoryStream;
 use datafusion::physical_plan::{
-    project_schema, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    PlanProperties, SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    SendableRecordBatchStream, project_schema,
 };
 use datafusion::prelude::*;
 
@@ -42,8 +45,7 @@ use datafusion::catalog::Session;
 use tokio::time::timeout;
 
 /// This example demonstrates executing a simple query against a custom datasource
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn custom_datasource() -> Result<()> {
     // create our custom datasource and adding some users
     let db = CustomDataSource::default();
     db.populate_users();
@@ -191,10 +193,11 @@ impl TableProvider for CustomDataSource {
 struct CustomExec {
     db: CustomDataSource,
     projected_schema: SchemaRef,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomExec {
+    #[expect(clippy::needless_pass_by_value)]
     fn new(
         projections: Option<&Vec<usize>>,
         schema: SchemaRef,
@@ -205,7 +208,7 @@ impl CustomExec {
         Self {
             db,
             projected_schema,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -236,7 +239,7 @@ impl ExecutionPlan for CustomExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -281,4 +284,20 @@ impl ExecutionPlan for CustomExec {
             None,
         )?))
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
diff --git a/datafusion-examples/examples/custom_data_source/custom_file_casts.rs b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs
new file mode 100644
index 0000000000000..6b37db653e35d
--- /dev/null
+++ b/datafusion-examples/examples/custom_data_source/custom_file_casts.rs
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::sync::Arc;
+
+use arrow::array::{RecordBatch, record_batch};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+
+use datafusion::assert_batches_eq;
+use datafusion::common::Result;
+use datafusion::common::not_impl_err;
+use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion::datasource::listing::{
+    ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
+};
+use datafusion::execution::context::SessionContext;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_expr::expressions::{CastColumnExpr, CastExpr};
+use datafusion::prelude::SessionConfig;
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+};
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+
+// Example showing how to implement custom casting rules to adapt file schemas.
+// This example enforces that casts must be strictly widening: if the file type is Int64 and the table type is Int32, it will error
+// before even reading the data.
+// Without this custom cast rule DataFusion would happily do the narrowing cast, potentially erroring only if it found a row with data it could not cast.
+pub async fn custom_file_casts() -> Result<()> {
+    println!("=== Creating example data ===");
+
+    // Create a logical / table schema with an Int32 column (nullable)
+    let logical_schema =
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)]));
+
+    // Create some data that can be cast (Int16 -> Int32 is widening) and some that cannot (Int64 -> Int32 is narrowing)
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let path = Path::from("good.parquet");
+    let batch = record_batch!(("id", Int16, [1, 2, 3]))?;
+    write_data(&store, &path, &batch).await?;
+    let path = Path::from("bad.parquet");
+    let batch = record_batch!(("id", Int64, [1, 2, 3]))?;
+    write_data(&store, &path, &batch).await?;
+
+    // Set up query execution
+    let mut cfg = SessionConfig::new();
+    // Turn on filter pushdown so that the PhysicalExprAdapter is used
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.runtime_env()
+        .register_object_store(ObjectStoreUrl::parse("memory://")?.as_ref(), store);
+
+    // Register our good and bad files via ListingTable
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///good.parquet")?)
+            .infer_options(&ctx.state())
+            .await?
+            .with_schema(Arc::clone(&logical_schema))
+            .with_expr_adapter_factory(Arc::new(
+                CustomCastPhysicalExprAdapterFactory::new(Arc::new(
+                    DefaultPhysicalExprAdapterFactory,
+                )),
+            ));
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.register_table("good_table", Arc::new(table))?;
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///bad.parquet")?)
+            .infer_options(&ctx.state())
+            .await?
+            .with_schema(Arc::clone(&logical_schema))
+            .with_expr_adapter_factory(Arc::new(
+                CustomCastPhysicalExprAdapterFactory::new(Arc::new(
+                    DefaultPhysicalExprAdapterFactory,
+                )),
+            ));
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.register_table("bad_table", Arc::new(table))?;
+
+    println!("\n=== File with narrower schema is cast ===");
+    let query = "SELECT id FROM good_table WHERE id > 1";
+    println!("Query: {query}");
+    let batches = ctx.sql(query).await?.collect().await?;
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| id |",
+        "+----+",
+        "| 2  |",
+        "| 3  |",
+        "+----+",
+    ];
+    arrow::util::pretty::print_batches(&batches)?;
+    assert_batches_eq!(expected, &batches);
+
+    println!("\n=== File with wider schema errors ===");
+    let query = "SELECT id FROM bad_table WHERE id > 1";
+    println!("Query: {query}");
+    match ctx.sql(query).await?.collect().await {
+        Ok(_) => panic!("Expected error for narrowing cast, but query succeeded"),
+        Err(e) => {
+            println!("Caught expected error: {e}");
+        }
+    }
+    Ok(())
+}
+
+async fn write_data(
+    store: &dyn ObjectStore,
+    path: &Path,
+    batch: &RecordBatch,
+) -> Result<()> {
+    let mut buf = vec![];
+    let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None)?;
+    writer.write(batch)?;
+    writer.close()?;
+
+    let payload = PutPayload::from_bytes(buf.into());
+    store.put(path, payload).await?;
+    Ok(())
+}
+
+/// Factory for creating DefaultValuePhysicalExprAdapter instances
+#[derive(Debug)]
+struct CustomCastPhysicalExprAdapterFactory {
+    inner: Arc<dyn PhysicalExprAdapterFactory>,
+}
+
+impl CustomCastPhysicalExprAdapterFactory {
+    fn new(inner: Arc<dyn PhysicalExprAdapterFactory>) -> Self {
+        Self { inner }
+    }
+}
+
+impl PhysicalExprAdapterFactory for CustomCastPhysicalExprAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        let inner = self
+            .inner
+            .create(logical_file_schema, Arc::clone(&physical_file_schema))?;
+        Ok(Arc::new(CustomCastsPhysicalExprAdapter {
+            physical_file_schema,
+            inner,
+        }))
+    }
+}
+
+/// Custom PhysicalExprAdapter that handles missing columns with default values from metadata
+/// and wraps DefaultPhysicalExprAdapter for standard schema adaptation
+#[derive(Debug, Clone)]
+struct CustomCastsPhysicalExprAdapter {
+    physical_file_schema: SchemaRef,
+    inner: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for CustomCastsPhysicalExprAdapter {
+    fn rewrite(&self, mut expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        // First delegate to the inner adapter to handle missing columns and discover any necessary casts
+        expr = self.inner.rewrite(expr)?;
+        // Now we can apply custom casting rules or even swap out all CastExprs for a custom cast kernel / expression
+        // For example, [DataFusion Comet](https://github.com/apache/datafusion-comet) has a [custom cast kernel](https://github.com/apache/datafusion-comet/blob/b4ac876ab420ed403ac7fc8e1b29f42f1f442566/native/spark-expr/src/conversion_funcs/cast.rs#L133-L138).
+        expr.transform(|expr| {
+            if let Some(cast) = expr.as_any().downcast_ref::<CastExpr>() {
+                let input_data_type =
+                    cast.expr().data_type(&self.physical_file_schema)?;
+                let output_data_type = cast.data_type(&self.physical_file_schema)?;
+                if !cast.is_bigger_cast(&input_data_type) {
+                    return not_impl_err!(
+                        "Unsupported CAST from {input_data_type} to {output_data_type}"
+                    );
+                }
+            }
+            if let Some(cast) = expr.as_any().downcast_ref::<CastColumnExpr>() {
+                let input_data_type =
+                    cast.expr().data_type(&self.physical_file_schema)?;
+                let output_data_type = cast.data_type(&self.physical_file_schema)?;
+                if !CastExpr::check_bigger_cast(
+                    cast.target_field().data_type(),
+                    &input_data_type,
+                ) {
+                    return not_impl_err!(
+                        "Unsupported CAST from {input_data_type} to {output_data_type}"
+                    );
+                }
+            }
+            Ok(Transformed::no(expr))
+        })
+        .data()
+    }
+}
diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_data_source/custom_file_format.rs
similarity index 89%
rename from datafusion-examples/examples/custom_file_format.rs
rename to datafusion-examples/examples/custom_data_source/custom_file_format.rs
index ac1e643517685..6817beec41188 100644
--- a/datafusion-examples/examples/custom_file_format.rs
+++ b/datafusion-examples/examples/custom_data_source/custom_file_format.rs
@@ -15,33 +15,33 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::{any::Any, sync::Arc};
 
 use arrow::{
     array::{AsArray, RecordBatch, StringArray, UInt8Array},
     datatypes::{DataType, Field, Schema, SchemaRef, UInt64Type},
 };
-use datafusion::physical_expr::LexRequirement;
 use datafusion::{
     catalog::Session,
     common::{GetExt, Statistics},
-};
-use datafusion::{
-    datasource::physical_plan::FileSource, execution::session_state::SessionStateBuilder,
-};
-use datafusion::{
     datasource::{
+        MemTable,
         file_format::{
-            csv::CsvFormatFactory, file_compression_type::FileCompressionType,
-            FileFormat, FileFormatFactory,
+            FileFormat, FileFormatFactory, csv::CsvFormatFactory,
+            file_compression_type::FileCompressionType,
         },
-        physical_plan::{FileScanConfig, FileSinkConfig},
-        MemTable,
+        physical_plan::{FileScanConfig, FileSinkConfig, FileSource},
+        table_schema::TableSchema,
     },
     error::Result,
+    execution::session_state::SessionStateBuilder,
+    physical_expr_common::sort_expr::LexRequirement,
     physical_plan::ExecutionPlan,
     prelude::SessionContext,
 };
+
 use object_store::{ObjectMeta, ObjectStore};
 use tempfile::tempdir;
 
@@ -50,6 +50,42 @@ use tempfile::tempdir;
 /// TSVFileFormatFactory is responsible for creating instances of TSVFileFormat.
 /// The former, once registered with the SessionState, will then be used
 /// to facilitate SQL operations on TSV files, such as `COPY TO` shown here.
+pub async fn custom_file_format() -> Result<()> {
+    // Create a new context with the default configuration
+    let mut state = SessionStateBuilder::new().with_default_features().build();
+
+    // Register the custom file format
+    let file_format = Arc::new(TSVFileFactory::new());
+    state.register_file_format(file_format, true)?;
+
+    // Create a new context with the custom file format
+    let ctx = SessionContext::new_with_state(state);
+
+    let mem_table = create_mem_table();
+    ctx.register_table("mem_table", mem_table)?;
+
+    let temp_dir = tempdir().unwrap();
+    let table_save_path = temp_dir.path().join("mem_table.tsv");
+
+    let d = ctx
+        .sql(&format!(
+            "COPY mem_table TO '{}' STORED AS TSV;",
+            table_save_path.display(),
+        ))
+        .await?;
+
+    let results = d.collect().await?;
+    println!(
+        "Number of inserted rows: {:?}",
+        (results[0]
+            .column_by_name("count")
+            .unwrap()
+            .as_primitive::<UInt64Type>()
+            .value(0))
+    );
+
+    Ok(())
+}
 
 #[derive(Debug)]
 /// Custom file format that reads and writes TSV files
@@ -84,6 +120,10 @@ impl FileFormat for TSVFileFormat {
         }
     }
 
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        None
+    }
+
     async fn infer_schema(
         &self,
         state: &dyn Session,
@@ -127,8 +167,8 @@ impl FileFormat for TSVFileFormat {
             .await
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        self.csv_file_format.file_source()
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        self.csv_file_format.file_source(table_schema)
     }
 }
 
@@ -179,44 +219,6 @@ impl GetExt for TSVFileFactory {
     }
 }
 
-#[tokio::main]
-async fn main() -> Result<()> {
-    // Create a new context with the default configuration
-    let mut state = SessionStateBuilder::new().with_default_features().build();
-
-    // Register the custom file format
-    let file_format = Arc::new(TSVFileFactory::new());
-    state.register_file_format(file_format, true).unwrap();
-
-    // Create a new context with the custom file format
-    let ctx = SessionContext::new_with_state(state);
-
-    let mem_table = create_mem_table();
-    ctx.register_table("mem_table", mem_table).unwrap();
-
-    let temp_dir = tempdir().unwrap();
-    let table_save_path = temp_dir.path().join("mem_table.tsv");
-
-    let d = ctx
-        .sql(&format!(
-            "COPY mem_table TO '{}' STORED AS TSV;",
-            table_save_path.display(),
-        ))
-        .await?;
-
-    let results = d.collect().await?;
-    println!(
-        "Number of inserted rows: {:?}",
-        (results[0]
-            .column_by_name("count")
-            .unwrap()
-            .as_primitive::<UInt64Type>()
-            .value(0))
-    );
-
-    Ok(())
-}
-
 // create a simple mem table
 fn create_mem_table() -> Arc<MemTable> {
     let fields = vec![
diff --git a/datafusion-examples/examples/custom_data_source/default_column_values.rs b/datafusion-examples/examples/custom_data_source/default_column_values.rs
new file mode 100644
index 0000000000000..40c8836c1f822
--- /dev/null
+++ b/datafusion-examples/examples/custom_data_source/default_column_values.rs
@@ -0,0 +1,335 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::any::Any;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use arrow::array::RecordBatch;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use async_trait::async_trait;
+
+use datafusion::assert_batches_eq;
+use datafusion::catalog::memory::DataSourceExec;
+use datafusion::catalog::{Session, TableProvider};
+use datafusion::common::DFSchema;
+use datafusion::common::{Result, ScalarValue};
+use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
+use datafusion::execution::context::SessionContext;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::logical_expr::utils::conjunction;
+use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType};
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::parquet::file::properties::WriterProperties;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::{SessionConfig, lit};
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+    replace_columns_with_literals,
+};
+use futures::StreamExt;
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+
+// Metadata key for storing default values in field metadata
+const DEFAULT_VALUE_METADATA_KEY: &str = "example.default_value";
+
+/// Example showing how to implement custom default value handling for missing columns
+/// using field metadata and PhysicalExprAdapter.
+///
+/// This example demonstrates how to:
+/// 1. Store default values in field metadata using a constant key
+/// 2. Create a custom PhysicalExprAdapter that reads these defaults
+/// 3. Inject default values for missing columns in filter predicates using `replace_columns_with_literals`
+/// 4. Use the DefaultPhysicalExprAdapter as a fallback for standard schema adaptation
+/// 5. Convert string default values to proper types using `ScalarValue::cast_to()` at planning time
+///
+/// Important: PhysicalExprAdapter handles rewriting both filter predicates and projection
+/// expressions for file scans, including handling missing columns.
+///
+/// The metadata-based approach provides a flexible way to store default values as strings
+/// and cast them to the appropriate types at planning time, avoiding runtime overhead.
+pub async fn default_column_values() -> Result<()> {
+    println!("=== Creating example data with missing columns and default values ===");
+
+    // Create sample data where the logical schema has more columns than the physical schema
+    let (logical_schema, physical_schema, batch) = create_sample_data_with_defaults();
+
+    let store = InMemory::new();
+    let buf = {
+        let mut buf = vec![];
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(2))
+            .build();
+
+        let mut writer =
+            ArrowWriter::try_new(&mut buf, physical_schema.clone(), Some(props))?;
+
+        writer.write(&batch)?;
+        writer.close()?;
+        buf
+    };
+    let path = Path::from("example.parquet");
+    let payload = PutPayload::from_bytes(buf.into());
+    store.put(&path, payload).await?;
+
+    // Create a custom table provider that handles missing columns with defaults
+    let table_provider = Arc::new(DefaultValueTableProvider::new(logical_schema));
+
+    // Set up query execution
+    let mut cfg = SessionConfig::new();
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+
+    // Register our table
+    ctx.register_table("example_table", table_provider)?;
+
+    ctx.runtime_env().register_object_store(
+        ObjectStoreUrl::parse("memory://")?.as_ref(),
+        Arc::new(store),
+    );
+
+    println!("\n=== Demonstrating default value injection in filter predicates ===");
+    let query = "SELECT id, name FROM example_table WHERE status = 'active' ORDER BY id";
+    println!("Query: {query}");
+    println!("Note: The 'status' column doesn't exist in the physical schema,");
+    println!(
+        "but our adapter injects the default value 'active' for the filter predicate."
+    );
+
+    let batches = ctx.sql(query).await?.collect().await?;
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+-------+",
+        "| id | name  |",
+        "+----+-------+",
+        "| 1  | Alice |",
+        "| 2  | Bob   |",
+        "| 3  | Carol |",
+        "+----+-------+",
+    ];
+    arrow::util::pretty::print_batches(&batches)?;
+    assert_batches_eq!(expected, &batches);
+
+    println!("\n=== Key Insight ===");
+    println!("This example demonstrates how PhysicalExprAdapter works:");
+    println!("1. Physical schema only has 'id' and 'name' columns");
+    println!(
+        "2. Logical schema has 'id', 'name', 'status', and 'priority' columns with defaults"
+    );
+    println!(
+        "3. Our custom adapter uses replace_columns_with_literals to inject default values"
+    );
+    println!("4. Default values from metadata are cast to proper types at planning time");
+    println!("5. The DefaultPhysicalExprAdapter handles other schema adaptations");
+
+    Ok(())
+}
+
+/// Create sample data with a logical schema that has default values in metadata
+/// and a physical schema that's missing some columns
+fn create_sample_data_with_defaults() -> (SchemaRef, SchemaRef, RecordBatch) {
+    // Create metadata for default values
+    let mut status_metadata = HashMap::new();
+    status_metadata.insert(DEFAULT_VALUE_METADATA_KEY.to_string(), "active".to_string());
+
+    let mut priority_metadata = HashMap::new();
+    priority_metadata.insert(DEFAULT_VALUE_METADATA_KEY.to_string(), "1".to_string());
+
+    // The logical schema includes all columns with their default values in metadata
+    // Note: We make the columns with defaults nullable to allow the default adapter to handle them
+    let logical_schema = Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, false),
+        Field::new("status", DataType::Utf8, true).with_metadata(status_metadata),
+        Field::new("priority", DataType::Int32, true).with_metadata(priority_metadata),
+    ]);
+
+    // The physical schema only has some columns (simulating missing columns in storage)
+    let physical_schema = Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, false),
+    ]);
+
+    // Create sample data for the physical schema
+    let batch = RecordBatch::try_new(
+        Arc::new(physical_schema.clone()),
+        vec![
+            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
+            Arc::new(arrow::array::StringArray::from(vec![
+                "Alice", "Bob", "Carol",
+            ])),
+        ],
+    )
+    .unwrap();
+
+    (Arc::new(logical_schema), Arc::new(physical_schema), batch)
+}
+
+/// Custom TableProvider that uses DefaultValuePhysicalExprAdapter
+#[derive(Debug)]
+struct DefaultValueTableProvider {
+    schema: SchemaRef,
+}
+
+impl DefaultValueTableProvider {
+    fn new(schema: SchemaRef) -> Self {
+        Self { schema }
+    }
+}
+
+#[async_trait]
+impl TableProvider for DefaultValueTableProvider {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()])
+    }
+
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let schema = Arc::clone(&self.schema);
+        let df_schema = DFSchema::try_from(schema.clone())?;
+        let filter = state.create_physical_expr(
+            conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true)),
+            &df_schema,
+        )?;
+
+        let parquet_source = ParquetSource::new(schema.clone())
+            .with_predicate(filter)
+            .with_pushdown_filters(true);
+
+        let object_store_url = ObjectStoreUrl::parse("memory://")?;
+        let store = state.runtime_env().object_store(object_store_url)?;
+
+        let mut files = vec![];
+        let mut listing = store.list(None);
+        while let Some(file) = listing.next().await {
+            if let Ok(file) = file {
+                files.push(file);
+            }
+        }
+
+        let file_group = files
+            .iter()
+            .map(|file| PartitionedFile::new(file.location.clone(), file.size))
+            .collect();
+
+        let file_scan_config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::parse("memory://")?,
+            Arc::new(parquet_source),
+        )
+        .with_projection_indices(projection.cloned())?
+        .with_limit(limit)
+        .with_file_group(file_group)
+        .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _));
+
+        Ok(Arc::new(DataSourceExec::new(Arc::new(
+            file_scan_config.build(),
+        ))))
+    }
+}
+
+/// Factory for creating DefaultValuePhysicalExprAdapter instances
+#[derive(Debug)]
+struct DefaultValuePhysicalExprAdapterFactory;
+
+impl PhysicalExprAdapterFactory for DefaultValuePhysicalExprAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        let default_factory = DefaultPhysicalExprAdapterFactory;
+        let default_adapter = default_factory.create(
+            Arc::clone(&logical_file_schema),
+            Arc::clone(&physical_file_schema),
+        )?;
+
+        Ok(Arc::new(DefaultValuePhysicalExprAdapter {
+            logical_file_schema,
+            physical_file_schema,
+            default_adapter,
+        }))
+    }
+}
+
+/// Custom PhysicalExprAdapter that handles missing columns with default values from metadata
+/// and wraps DefaultPhysicalExprAdapter for standard schema adaptation
+#[derive(Debug)]
+struct DefaultValuePhysicalExprAdapter {
+    logical_file_schema: SchemaRef,
+    physical_file_schema: SchemaRef,
+    default_adapter: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for DefaultValuePhysicalExprAdapter {
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        // Pre-compute replacements for missing columns with default values
+        let mut replacements = HashMap::new();
+        for field in self.logical_file_schema.fields() {
+            // Skip columns that exist in physical schema
+            if self.physical_file_schema.index_of(field.name()).is_ok() {
+                continue;
+            }
+
+            // Check if this missing column has a default value in metadata
+            if let Some(default_str) = field.metadata().get(DEFAULT_VALUE_METADATA_KEY) {
+                // Create a Utf8 ScalarValue from the string and cast it to the target type
+                let string_value = ScalarValue::Utf8(Some(default_str.to_string()));
+                let typed_value = string_value.cast_to(field.data_type())?;
+                replacements.insert(field.name().as_str(), typed_value);
+            }
+        }
+
+        // Replace columns with their default literals if any
+        let rewritten = if !replacements.is_empty() {
+            let refs: HashMap<_, _> = replacements.iter().map(|(k, v)| (*k, v)).collect();
+            replace_columns_with_literals(expr, &refs)?
+        } else {
+            expr
+        };
+
+        // Apply the default adapter as a fallback for other schema adaptations
+        self.default_adapter.rewrite(rewritten)
+    }
+}
diff --git a/datafusion-examples/examples/file_stream_provider.rs b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs
similarity index 90%
rename from datafusion-examples/examples/file_stream_provider.rs
rename to datafusion-examples/examples/custom_data_source/file_stream_provider.rs
index e6c59d57e98de..5b43072d43f80 100644
--- a/datafusion-examples/examples/file_stream_provider.rs
+++ b/datafusion-examples/examples/custom_data_source/file_stream_provider.rs
@@ -15,6 +15,31 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+/// Demonstrates how to use [`FileStreamProvider`] and [`StreamTable`] to stream data
+/// from a file-like source (FIFO) into DataFusion for continuous querying.
+///
+/// On non-Windows systems, this example creates a named pipe (FIFO) and
+/// writes rows into it asynchronously while DataFusion reads the data
+/// through a `FileStreamProvider`.
+///
+/// This illustrates how to integrate dynamically updated data sources
+/// with DataFusion without needing to reload the entire dataset each time.
+///
+/// This example does not work on Windows.
+pub async fn file_stream_provider() -> datafusion::error::Result<()> {
+    #[cfg(target_os = "windows")]
+    {
+        println!("file_stream_provider example does not work on windows");
+        Ok(())
+    }
+    #[cfg(not(target_os = "windows"))]
+    {
+        non_windows::main().await
+    }
+}
+
 #[cfg(not(target_os = "windows"))]
 mod non_windows {
     use datafusion::assert_batches_eq;
@@ -22,8 +47,8 @@ mod non_windows {
     use std::fs::{File, OpenOptions};
     use std::io::Write;
     use std::path::PathBuf;
-    use std::sync::atomic::{AtomicBool, Ordering};
     use std::sync::Arc;
+    use std::sync::atomic::{AtomicBool, Ordering};
     use std::thread;
     use std::time::Duration;
 
@@ -34,9 +59,9 @@ mod non_windows {
     use tempfile::TempDir;
     use tokio::task::JoinSet;
 
-    use datafusion::common::{exec_err, Result};
-    use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
+    use datafusion::common::{Result, exec_err};
     use datafusion::datasource::TableProvider;
+    use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
     use datafusion::logical_expr::SortExpr;
     use datafusion::prelude::{SessionConfig, SessionContext};
 
@@ -101,7 +126,6 @@ mod non_windows {
         let broken_pipe_timeout = Duration::from_secs(10);
         let sa = file_path;
         // Spawn a new thread to write to the FIFO file
-        #[allow(clippy::disallowed_methods)] // spawn allowed only in tests
         tasks.spawn_blocking(move || {
             let file = OpenOptions::new().write(true).open(sa).unwrap();
             // Reference time to use when deciding to fail the test
@@ -186,16 +210,3 @@ mod non_windows {
         Ok(())
     }
 }
-
-#[tokio::main]
-async fn main() -> datafusion::error::Result<()> {
-    #[cfg(target_os = "windows")]
-    {
-        println!("file_stream_provider example does not work on windows");
-        Ok(())
-    }
-    #[cfg(not(target_os = "windows"))]
-    {
-        non_windows::main().await
-    }
-}
diff --git a/datafusion-examples/examples/custom_data_source/main.rs b/datafusion-examples/examples/custom_data_source/main.rs
new file mode 100644
index 0000000000000..0d21a62591129
--- /dev/null
+++ b/datafusion-examples/examples/custom_data_source/main.rs
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These examples are all related to extending or defining how DataFusion reads data
+//!
+//! These examples demonstrate how DataFusion reads data.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example custom_data_source -- [all|csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|default_column_values|file_stream_provider]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `adapter_serialization`  
+//!   (file: adapter_serialization.rs, desc: Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception)
+//!
+//! - `csv_json_opener`  
+//!   (file: csv_json_opener.rs, desc: Use low-level FileOpener APIs for CSV/JSON)
+//!
+//! - `csv_sql_streaming`
+//!   (file: csv_sql_streaming.rs, desc: Run a streaming SQL query against CSV data)
+//!
+//! - `custom_datasource`  
+//!   (file: custom_datasource.rs, desc: Query a custom TableProvider)
+//!
+//! - `custom_file_casts`
+//!   (file: custom_file_casts.rs, desc: Implement custom casting rules)
+//!
+//! - `custom_file_format`
+//!   (file: custom_file_format.rs, desc: Write to a custom file format)
+//!
+//! - `default_column_values`
+//!   (file: default_column_values.rs, desc: Custom default values using metadata)
+//!
+//! - `file_stream_provider`
+//!   (file: file_stream_provider.rs, desc: Read/write via FileStreamProvider for streams)
+
+mod adapter_serialization;
+mod csv_json_opener;
+mod csv_sql_streaming;
+mod custom_datasource;
+mod custom_file_casts;
+mod custom_file_format;
+mod default_column_values;
+mod file_stream_provider;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    AdapterSerialization,
+    CsvJsonOpener,
+    CsvSqlStreaming,
+    CustomDatasource,
+    CustomFileCasts,
+    CustomFileFormat,
+    DefaultColumnValues,
+    FileStreamProvider,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "custom_data_source";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::AdapterSerialization => {
+                adapter_serialization::adapter_serialization().await?
+            }
+            ExampleKind::CsvJsonOpener => csv_json_opener::csv_json_opener().await?,
+            ExampleKind::CsvSqlStreaming => {
+                csv_sql_streaming::csv_sql_streaming().await?
+            }
+            ExampleKind::CustomDatasource => {
+                custom_datasource::custom_datasource().await?
+            }
+            ExampleKind::CustomFileCasts => {
+                custom_file_casts::custom_file_casts().await?
+            }
+            ExampleKind::CustomFileFormat => {
+                custom_file_format::custom_file_format().await?
+            }
+            ExampleKind::DefaultColumnValues => {
+                default_column_values::default_column_values().await?
+            }
+            ExampleKind::FileStreamProvider => {
+                file_stream_provider::file_stream_provider().await?
+            }
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/catalog.rs b/datafusion-examples/examples/data_io/catalog.rs
similarity index 97%
rename from datafusion-examples/examples/catalog.rs
rename to datafusion-examples/examples/data_io/catalog.rs
index 229867cdfc5bb..9781a93374ea6 100644
--- a/datafusion-examples/examples/catalog.rs
+++ b/datafusion-examples/examples/data_io/catalog.rs
@@ -15,15 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! Simple example of a catalog/schema implementation.
 use async_trait::async_trait;
 use datafusion::{
     arrow::util::pretty,
     catalog::{CatalogProvider, CatalogProviderList, SchemaProvider},
     datasource::{
-        file_format::{csv::CsvFormat, FileFormat},
-        listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
         TableProvider,
+        file_format::{FileFormat, csv::CsvFormat},
+        listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
     },
     error::Result,
     execution::context::SessionState,
@@ -34,8 +36,8 @@ use std::{any::Any, collections::HashMap, path::Path, sync::Arc};
 use std::{fs::File, io::Write};
 use tempfile::TempDir;
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Register the table into a custom catalog
+pub async fn catalog() -> Result<()> {
     env_logger::builder()
         .filter_level(log::LevelFilter::Info)
         .init();
@@ -134,12 +136,13 @@ struct DirSchemaOpts<'a> {
     dir: &'a Path,
     format: Arc<dyn FileFormat>,
 }
+
 /// Schema where every file with extension `ext` in a given `dir` is a table.
 #[derive(Debug)]
 struct DirSchema {
-    ext: String,
     tables: RwLock<HashMap<String, Arc<dyn TableProvider>>>,
 }
+
 impl DirSchema {
     async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result<Arc<Self>> {
         let DirSchemaOpts { ext, dir, format } = opts;
@@ -169,13 +172,8 @@ impl DirSchema {
         }
         Ok(Arc::new(Self {
             tables: RwLock::new(tables),
-            ext: ext.to_string(),
         }))
     }
-    #[allow(unused)]
-    fn name(&self) -> &str {
-        &self.ext
-    }
 }
 
 #[async_trait]
@@ -198,6 +196,7 @@ impl SchemaProvider for DirSchema {
         let tables = self.tables.read().unwrap();
         tables.contains_key(name)
     }
+
     fn register_table(
         &self,
         name: String,
@@ -211,7 +210,6 @@ impl SchemaProvider for DirSchema {
 
     /// If supported by the implementation, removes an existing table from this schema and returns it.
     /// If no table of that name exists, returns Ok(None).
-    #[allow(unused_variables)]
     fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         let mut tables = self.tables.write().unwrap();
         log::info!("dropping table {name}");
@@ -223,6 +221,7 @@ impl SchemaProvider for DirSchema {
 struct DirCatalog {
     schemas: RwLock<HashMap<String, Arc<dyn SchemaProvider>>>,
 }
+
 impl DirCatalog {
     fn new() -> Self {
         Self {
@@ -230,10 +229,12 @@ impl DirCatalog {
         }
     }
 }
+
 impl CatalogProvider for DirCatalog {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn register_schema(
         &self,
         name: &str,
@@ -260,11 +261,13 @@ impl CatalogProvider for DirCatalog {
         }
     }
 }
+
 /// Catalog lists holds multiple catalog providers. Each context has a single catalog list.
 #[derive(Debug)]
 struct CustomCatalogProviderList {
     catalogs: RwLock<HashMap<String, Arc<dyn CatalogProvider>>>,
 }
+
 impl CustomCatalogProviderList {
     fn new() -> Self {
         Self {
@@ -272,10 +275,12 @@ impl CustomCatalogProviderList {
         }
     }
 }
+
 impl CatalogProviderList for CustomCatalogProviderList {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn register_catalog(
         &self,
         name: String,
diff --git a/datafusion-examples/examples/data_io/json_shredding.rs b/datafusion-examples/examples/data_io/json_shredding.rs
new file mode 100644
index 0000000000000..ca1513f626245
--- /dev/null
+++ b/datafusion-examples/examples/data_io/json_shredding.rs
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+
+use datafusion::assert_batches_eq;
+use datafusion::common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
+use datafusion::common::{Result, assert_contains, exec_datafusion_err};
+use datafusion::datasource::listing::{
+    ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
+};
+use datafusion::execution::context::SessionContext;
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::parquet::file::properties::WriterProperties;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_expr::{ScalarFunctionExpr, expressions};
+use datafusion::prelude::SessionConfig;
+use datafusion::scalar::ScalarValue;
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+};
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStoreExt, PutPayload};
+
+// Example showing how to implement custom filter rewriting for JSON shredding.
+//
+// JSON shredding is a technique for optimizing queries on semi-structured data
+// by materializing commonly accessed fields into separate columns for better
+// columnar storage performance.
+//
+// In this example, we have a table with both:
+// - Original JSON data: data: '{"age": 30}'
+// - Shredded flat columns: _data.name: "Alice" (extracted from JSON)
+//
+// Our custom TableProvider uses a PhysicalExprAdapter to rewrite
+// expressions like `json_get_str('name', data)` to use the pre-computed
+// flat column `_data.name` when available. This allows the query engine to:
+// 1. Push down predicates for better filtering
+// 2. Avoid expensive JSON parsing at query time
+// 3. Leverage columnar storage benefits for the materialized fields
+pub async fn json_shredding() -> Result<()> {
+    println!("=== Creating example data with flat columns and underscore prefixes ===");
+
+    // Create sample data with flat columns using underscore prefixes
+    let (table_schema, batch) = create_sample_data();
+
+    let store = InMemory::new();
+    let buf = {
+        let mut buf = vec![];
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(2))
+            .build();
+
+        let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props))
+            .expect("creating writer");
+
+        writer.write(&batch).expect("Writing batch");
+        writer.close().unwrap();
+        buf
+    };
+    let path = Path::from("example.parquet");
+    let payload = PutPayload::from_bytes(buf.into());
+    store.put(&path, payload).await?;
+
+    // Set up query execution
+    let mut cfg = SessionConfig::new();
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.runtime_env().register_object_store(
+        ObjectStoreUrl::parse("memory://")?.as_ref(),
+        Arc::new(store),
+    );
+
+    // Create a custom table provider that rewrites struct field access
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///example.parquet")?)
+            .infer_options(&ctx.state())
+            .await?
+            .with_schema(table_schema)
+            .with_expr_adapter_factory(Arc::new(ShreddedJsonRewriterFactory));
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    let table_provider = Arc::new(table);
+
+    // Register our table
+    ctx.register_table("structs", table_provider)?;
+    ctx.register_udf(ScalarUDF::new_from_impl(JsonGetStr::default()));
+
+    println!("\n=== Showing all data ===");
+    let batches = ctx.sql("SELECT * FROM structs").await?.collect().await?;
+    arrow::util::pretty::print_batches(&batches)?;
+
+    println!("\n=== Running query with flat column access and filter ===");
+    let query = "SELECT json_get_str('age', data) as age FROM structs WHERE json_get_str('name', data) = 'Bob'";
+    println!("Query: {query}");
+
+    let batches = ctx.sql(query).await?.collect().await?;
+
+    #[rustfmt::skip]
+    let expected = [
+        "+-----+",
+        "| age |",
+        "+-----+",
+        "| 25  |",
+        "+-----+",
+    ];
+    arrow::util::pretty::print_batches(&batches)?;
+    assert_batches_eq!(expected, &batches);
+
+    println!("\n=== Running explain analyze to confirm row group pruning ===");
+
+    let batches = ctx
+        .sql(&format!("EXPLAIN ANALYZE {query}"))
+        .await?
+        .collect()
+        .await?;
+    let plan = format!("{}", arrow::util::pretty::pretty_format_batches(&batches)?);
+    println!("{plan}");
+    assert_contains!(&plan, "row_groups_pruned_statistics=2 total → 1 matched");
+    assert_contains!(&plan, "pushdown_rows_pruned=1");
+
+    Ok(())
+}
+
+/// Create the example data with flat columns using underscore prefixes.
+///
+/// This demonstrates the logical data structure:
+/// - Table schema: What users see (just the 'data' JSON column)
+/// - File schema: What's physically stored (both 'data' and materialized '_data.name')
+///
+/// The naming convention uses underscore prefixes to indicate shredded columns:
+/// - `data` -> original JSON column
+/// - `_data.name` -> materialized field from JSON data.name
+fn create_sample_data() -> (SchemaRef, RecordBatch) {
+    // The table schema only has the main data column - this is what users query against
+    let table_schema = Schema::new(vec![Field::new("data", DataType::Utf8, false)]);
+
+    // The file schema has both the main column and the shredded flat column with underscore prefix
+    // This represents the actual physical storage with pre-computed columns
+    let file_schema = Schema::new(vec![
+        Field::new("data", DataType::Utf8, false), // Original JSON data
+        Field::new("_data.name", DataType::Utf8, false), // Materialized name field
+    ]);
+
+    let batch = create_sample_record_batch(&file_schema);
+
+    (Arc::new(table_schema), batch)
+}
+
+/// Create the actual RecordBatch with sample data
+fn create_sample_record_batch(file_schema: &Schema) -> RecordBatch {
+    // Build a RecordBatch with flat columns
+    let data_array = StringArray::from(vec![
+        r#"{"age": 30}"#,
+        r#"{"age": 25}"#,
+        r#"{"age": 35}"#,
+        r#"{"age": 22}"#,
+    ]);
+    let names_array = StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"]);
+
+    RecordBatch::try_new(
+        Arc::new(file_schema.clone()),
+        vec![Arc::new(data_array), Arc::new(names_array)],
+    )
+    .unwrap()
+}
+
+/// Scalar UDF that uses serde_json to access json fields
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct JsonGetStr {
+    signature: Signature,
+}
+
+impl Default for JsonGetStr {
+    fn default() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for JsonGetStr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "json_get_str"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        assert!(
+            args.args.len() == 2,
+            "json_get_str requires exactly 2 arguments"
+        );
+        let key = match &args.args[0] {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(key))) => key,
+            _ => {
+                return Err(exec_datafusion_err!(
+                    "json_get_str first argument must be a string"
+                ));
+            }
+        };
+        // We expect a string array that contains JSON strings
+        let json_array = match &args.args[1] {
+            ColumnarValue::Array(array) => array
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .ok_or_else(|| {
+                exec_datafusion_err!(
+                    "json_get_str second argument must be a string array"
+                )
+            })?,
+            _ => {
+                return Err(exec_datafusion_err!(
+                    "json_get_str second argument must be a string array"
+                ));
+            }
+        };
+        let values = json_array
+            .iter()
+            .map(|value| {
+                value.and_then(|v| {
+                    let json_value: serde_json::Value =
+                        serde_json::from_str(v).unwrap_or_default();
+                    json_value.get(key).map(|v| v.to_string())
+                })
+            })
+            .collect::<StringArray>();
+        Ok(ColumnarValue::Array(Arc::new(values)))
+    }
+}
+
+/// Factory for creating ShreddedJsonRewriter instances
+#[derive(Debug)]
+struct ShreddedJsonRewriterFactory;
+
+impl PhysicalExprAdapterFactory for ShreddedJsonRewriterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        let default_factory = DefaultPhysicalExprAdapterFactory;
+        let default_adapter = default_factory.create(
+            Arc::clone(&logical_file_schema),
+            Arc::clone(&physical_file_schema),
+        )?;
+
+        Ok(Arc::new(ShreddedJsonRewriter {
+            physical_file_schema,
+            default_adapter,
+        }))
+    }
+}
+
+/// Rewriter that converts json_get_str calls to direct flat column references
+/// and wraps DefaultPhysicalExprAdapter for standard schema adaptation
+#[derive(Debug)]
+struct ShreddedJsonRewriter {
+    physical_file_schema: SchemaRef,
+    default_adapter: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for ShreddedJsonRewriter {
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        // First try our custom JSON shredding rewrite
+        let rewritten = expr
+            .transform(|expr| self.rewrite_impl(expr, &self.physical_file_schema))
+            .data()?;
+
+        // Then apply the default adapter as a fallback to handle standard schema differences
+        // like type casting and missing columns
+        self.default_adapter.rewrite(rewritten)
+    }
+}
+
+impl ShreddedJsonRewriter {
+    fn rewrite_impl(
+        &self,
+        expr: Arc<dyn PhysicalExpr>,
+        physical_file_schema: &Schema,
+    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        if let Some(func) = expr.as_any().downcast_ref::<ScalarFunctionExpr>()
+            && func.name() == "json_get_str"
+            && func.args().len() == 2
+        {
+            // Get the key from the first argument
+            if let Some(literal) = func.args()[0]
+                .as_any()
+                .downcast_ref::<expressions::Literal>()
+                && let ScalarValue::Utf8(Some(field_name)) = literal.value()
+            {
+                // Get the column from the second argument
+                if let Some(column) = func.args()[1]
+                    .as_any()
+                    .downcast_ref::<expressions::Column>()
+                {
+                    let column_name = column.name();
+                    // Check if there's a flat column with underscore prefix
+                    let flat_column_name = format!("_{column_name}.{field_name}");
+
+                    if let Ok(flat_field_index) =
+                        physical_file_schema.index_of(&flat_column_name)
+                    {
+                        let flat_field = physical_file_schema.field(flat_field_index);
+
+                        if flat_field.data_type() == &DataType::Utf8 {
+                            // Replace the whole expression with a direct column reference
+                            let new_expr = Arc::new(expressions::Column::new(
+                                &flat_column_name,
+                                flat_field_index,
+                            ))
+                                as Arc<dyn PhysicalExpr>;
+
+                            return Ok(Transformed {
+                                data: new_expr,
+                                tnr: TreeNodeRecursion::Stop,
+                                transformed: true,
+                            });
+                        }
+                    }
+                }
+            }
+        }
+        Ok(Transformed::no(expr))
+    }
+}
diff --git a/datafusion-examples/examples/data_io/main.rs b/datafusion-examples/examples/data_io/main.rs
new file mode 100644
index 0000000000000..0039585d15b60
--- /dev/null
+++ b/datafusion-examples/examples/data_io/main.rs
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These examples of data formats and I/O
+//!
+//! These examples demonstrate data formats and I/O.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example data_io -- [all|catalog|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `catalog`
+//!   (file: catalog.rs, desc: Register tables into a custom catalog)
+//!
+//! - `json_shredding`
+//!   (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
+//!
+//! - `parquet_adv_idx`
+//!   (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files)
+//!
+//! - `parquet_emb_idx`
+//!   (file: parquet_embedded_index.rs, desc: Store a custom index inside Parquet files)
+//!
+//! - `parquet_enc`  
+//!   (file: parquet_encrypted.rs, desc: Read & write encrypted Parquet files)
+//!
+//! - `parquet_enc_with_kms`
+//!   (file: parquet_encrypted_with_kms.rs, desc: Encrypted Parquet I/O using a KMS-backed factory)
+//!
+//! - `parquet_exec_visitor`
+//!   (file: parquet_exec_visitor.rs, desc: Extract statistics by visiting an ExecutionPlan)
+//!
+//! - `parquet_idx`
+//!   (file: parquet_index.rs, desc: Create a secondary index)
+//!
+//! - `query_http_csv`
+//!   (file: query_http_csv.rs, desc: Query CSV files via HTTP)
+//!
+//! - `remote_catalog`
+//!   (file: remote_catalog.rs, desc: Interact with a remote catalog)
+
+mod catalog;
+mod json_shredding;
+mod parquet_advanced_index;
+mod parquet_embedded_index;
+mod parquet_encrypted;
+mod parquet_encrypted_with_kms;
+mod parquet_exec_visitor;
+mod parquet_index;
+mod query_http_csv;
+mod remote_catalog;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Catalog,
+    JsonShredding,
+    ParquetAdvIdx,
+    ParquetEmbIdx,
+    ParquetEnc,
+    ParquetEncWithKms,
+    ParquetExecVisitor,
+    ParquetIdx,
+    QueryHttpCsv,
+    RemoteCatalog,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "data_io";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Catalog => catalog::catalog().await?,
+            ExampleKind::JsonShredding => json_shredding::json_shredding().await?,
+            ExampleKind::ParquetAdvIdx => {
+                parquet_advanced_index::parquet_advanced_index().await?
+            }
+            ExampleKind::ParquetEmbIdx => {
+                parquet_embedded_index::parquet_embedded_index().await?
+            }
+            ExampleKind::ParquetEncWithKms => {
+                parquet_encrypted_with_kms::parquet_encrypted_with_kms().await?
+            }
+            ExampleKind::ParquetEnc => parquet_encrypted::parquet_encrypted().await?,
+            ExampleKind::ParquetExecVisitor => {
+                parquet_exec_visitor::parquet_exec_visitor().await?
+            }
+            ExampleKind::ParquetIdx => parquet_index::parquet_index().await?,
+            ExampleKind::QueryHttpCsv => query_http_csv::query_http_csv().await?,
+            ExampleKind::RemoteCatalog => remote_catalog::remote_catalog().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/data_io/parquet_advanced_index.rs
similarity index 96%
rename from datafusion-examples/examples/advanced_parquet_index.rs
rename to datafusion-examples/examples/data_io/parquet_advanced_index.rs
index efaee23366a1c..f02b01354b784 100644
--- a/datafusion-examples/examples/advanced_parquet_index.rs
+++ b/datafusion-examples/examples/data_io/parquet_advanced_index.rs
@@ -15,40 +15,42 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::any::Any;
 use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::ops::Range;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
 
 use datafusion::catalog::Session;
 use datafusion::common::{
-    internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
+    DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err,
 };
+use datafusion::datasource::TableProvider;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan;
 use datafusion::datasource::physical_plan::{
-    FileMeta, FileScanConfigBuilder, ParquetFileReaderFactory, ParquetSource,
+    FileScanConfigBuilder, ParquetFileReaderFactory, ParquetSource,
 };
-use datafusion::datasource::TableProvider;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::logical_expr::utils::conjunction;
 use datafusion::logical_expr::{TableProviderFilterPushDown, TableType};
+use datafusion::parquet::arrow::ArrowWriter;
 use datafusion::parquet::arrow::arrow_reader::{
     ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector,
 };
 use datafusion::parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader};
-use datafusion::parquet::arrow::ArrowWriter;
-use datafusion::parquet::file::metadata::ParquetMetaData;
+use datafusion::parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
 use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
 use datafusion::parquet::schema::types::ColumnPath;
-use datafusion::physical_expr::utils::{Guarantee, LiteralGuarantee};
 use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_expr::utils::{Guarantee, LiteralGuarantee};
 use datafusion::physical_optimizer::pruning::PruningPredicate;
-use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
 use datafusion::prelude::*;
 
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
@@ -56,8 +58,8 @@ use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
 use bytes::Bytes;
 use datafusion::datasource::memory::DataSourceExec;
-use futures::future::BoxFuture;
 use futures::FutureExt;
+use futures::future::BoxFuture;
 use object_store::ObjectStore;
 use tempfile::TempDir;
 use url::Url;
@@ -121,7 +123,6 @@ use url::Url;
 ///         │ ╚═══════════════════╝ │      1. With cached ParquetMetadata, so
 ///         └───────────────────────┘      the ParquetSource does not re-read /
 ///          Parquet File                  decode the thrift footer
-///
 /// ```
 ///
 /// Within a Row Group, Column Chunks store data in DataPages. This example also
@@ -156,8 +157,7 @@ use url::Url;
 ///
 /// [`ListingTable`]: datafusion::datasource::listing::ListingTable
 /// [Page Index](https://github.com/apache/parquet-format/blob/master/PageIndex.md)
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn parquet_advanced_index() -> Result<()> {
     // the object store is used to read the parquet files (in this case, it is
     // a local file system, but in a real system it could be S3, GCS, etc)
     let object_store: Arc<dyn ObjectStore> =
@@ -240,6 +240,7 @@ pub struct IndexTableProvider {
     /// if true, use row selections in addition to row group selections
     use_row_selections: AtomicBool,
 }
+
 impl IndexTableProvider {
     /// Create a new IndexTableProvider
     /// * `object_store` - the object store implementation to use for reading files
@@ -409,7 +410,7 @@ impl IndexedFile {
         let options = ArrowReaderOptions::new()
             // Load the page index when reading metadata to cache
             // so it is available to interpret row selections
-            .with_page_index(true);
+            .with_page_index_policy(PageIndexPolicy::Required);
         let reader =
             ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?;
         let metadata = reader.metadata().clone();
@@ -492,19 +493,18 @@ impl TableProvider for IndexTableProvider {
                 .with_file(indexed_file);
 
         let file_source = Arc::new(
-            ParquetSource::default()
+            ParquetSource::new(schema.clone())
                 // provide the predicate so the DataSourceExec can try and prune
                 // row groups internally
                 .with_predicate(predicate)
                 // provide the factory to create parquet reader without re-reading metadata
                 .with_parquet_file_reader_factory(Arc::new(reader_factory)),
         );
-        let file_scan_config =
-            FileScanConfigBuilder::new(object_store_url, schema, file_source)
-                .with_limit(limit)
-                .with_projection(projection.cloned())
-                .with_file(partitioned_file)
-                .build();
+        let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source)
+            .with_limit(limit)
+            .with_projection_indices(projection.cloned())?
+            .with_file(partitioned_file)
+            .build();
 
         // Finally, put it all together into a DataSourceExec
         Ok(DataSourceExec::from_data_source(file_scan_config))
@@ -541,6 +541,7 @@ impl CachedParquetFileReaderFactory {
             metadata: HashMap::new(),
         }
     }
+
     /// Add the pre-parsed information about the file to the factor
     fn with_file(mut self, indexed_file: &IndexedFile) -> Self {
         self.metadata.insert(
@@ -555,25 +556,26 @@ impl ParquetFileReaderFactory for CachedParquetFileReaderFactory {
     fn create_reader(
         &self,
         _partition_index: usize,
-        file_meta: FileMeta,
+        partitioned_file: PartitionedFile,
         metadata_size_hint: Option<usize>,
         _metrics: &ExecutionPlanMetricsSet,
     ) -> Result<Box<dyn AsyncFileReader + Send>> {
         // for this example we ignore the partition index and metrics
         // but in a real system you would likely use them to report details on
         // the performance of the reader.
-        let filename = file_meta
-            .location()
+        let filename = partitioned_file
+            .object_meta
+            .location
             .parts()
-            .last()
+            .next_back()
             .expect("No path in location")
             .as_ref()
             .to_string();
 
         let object_store = Arc::clone(&self.object_store);
         let mut inner =
-            ParquetObjectReader::new(object_store, file_meta.object_meta.location)
-                .with_file_size(file_meta.object_meta.size);
+            ParquetObjectReader::new(object_store, partitioned_file.object_meta.location)
+                .with_file_size(partitioned_file.object_meta.size);
 
         if let Some(hint) = metadata_size_hint {
             inner = inner.with_footer_size_hint(hint)
@@ -657,7 +659,7 @@ fn make_demo_file(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<()>
     // enable page statistics for the tag column,
     // for everything else.
     let props = WriterProperties::builder()
-        .set_max_row_group_size(100)
+        .set_max_row_group_row_count(Some(100))
         // compute column chunk (per row group) statistics by default
         .set_statistics_enabled(EnabledStatistics::Chunk)
         // compute column page statistics for the tag column
diff --git a/datafusion-examples/examples/data_io/parquet_embedded_index.rs b/datafusion-examples/examples/data_io/parquet_embedded_index.rs
new file mode 100644
index 0000000000000..bcaca2ed5c85b
--- /dev/null
+++ b/datafusion-examples/examples/data_io/parquet_embedded_index.rs
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! Embedding and using a custom index in Parquet files
+//!
+//! # Background
+//!
+//! This example shows how to add an application‑specific index to an Apache
+//! Parquet file without modifying the Parquet format itself. The resulting
+//! files can be read by any standard Parquet reader, which will simply
+//! ignore the extra index data.
+//!
+//! A “distinct value” index, similar to a  ["set" Skip Index in ClickHouse],
+//! is stored in a custom binary format within the parquet file. Only the
+//! location of index is stored in Parquet footer key/value metadata.
+//! This approach is more efficient than storing the index itself in the footer
+//! metadata because the footer must be read and parsed by all readers,
+//! even those that do not use the index.
+//!
+//! This example uses a file level index for skipping entire files, but any
+//! index can be stored using the same techniques and used skip row groups,
+//! data pages, or rows using the APIs on [`TableProvider`] and [`ParquetSource`].
+//!
+//! The resulting Parquet file layout is as follows:
+//!
+//! ```text
+//!                   ┌──────────────────────┐
+//!                   │┌───────────────────┐ │
+//!                   ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!  Standard Parquet │┌───────────────────┐ │
+//!  Data Pages       ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!                   │        ...           │
+//!                   │┌───────────────────┐ │
+//!                   ││     DataPage      │ │
+//!                   │└───────────────────┘ │
+//!                   │┏━━━━━━━━━━━━━━━━━━━┓ │
+//! Non standard      │┃                   ┃ │
+//! index (ignored by │┃Custom Binary Index┃ │
+//! other Parquet     │┃ (Distinct Values) ┃◀│─ ─ ─
+//! readers)          │┃                   ┃ │     │
+//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │
+//! Standard Parquet  │┏━━━━━━━━━━━━━━━━━━━┓ │     │  key/value metadata
+//! Page Index        │┃    Page Index     ┃ │        contains location
+//!                   │┗━━━━━━━━━━━━━━━━━━━┛ │     │  of special index
+//!                   │╔═══════════════════╗ │
+//!                   │║ Parquet Footer w/ ║ │     │
+//!                   │║     Metadata      ║ ┼ ─ ─
+//!                   │║ (Thrift Encoded)  ║ │
+//!                   │╚═══════════════════╝ │
+//!                   └──────────────────────┘
+//!
+//!                         Parquet File
+//!
+//! # High Level Flow
+//!
+//! To create a custom Parquet index:
+//!
+//! 1. Compute the index and serialize it to a binary format.
+//!
+//! 2. Write the Parquet file with:
+//!    - regular data pages
+//!    - the serialized index inline
+//!    - footer key/value metadata entry to locate the index
+//!
+//! To read and use the index are:
+//!
+//! 1. Read and deserialize the file’s footer to locate the index.
+//!
+//! 2. Read and deserialize the index.
+//!
+//! 3. Create a `TableProvider` that knows how to use the index to quickly find
+//!   the relevant files, row groups, data pages or rows based on on pushed down
+//!   filters.
+//!
+//! # FAQ: Why do other Parquet readers skip over the custom index?
+//!
+//! The flow for reading a parquet file is:
+//!
+//! 1. Seek to the end of the file and read the last 8 bytes (a 4‑byte
+//!    little‑endian footer length followed by the `PAR1` magic bytes).
+//!
+//! 2. Seek backwards by that length to parse the Thrift‑encoded footer
+//!    metadata (including key/value pairs).
+//!
+//! 3. Read data required for decoding such as data pages based on the offsets
+//!    encoded in the metadata.
+//!
+//! Since parquet readers do not scan from the start of the file they will read
+//! data in the file unless it is explicitly referenced in the footer metadata.
+//!
+//! Thus other readers will encounter and ignore an unknown key
+//! (`distinct_index_offset`) in the footer key/value metadata. Unless they
+//! know how to use that information, they will not attempt to read or
+//! the bytes that make up the index.
+//!
+//! ["set" Skip Index in ClickHouse]: https://clickhouse.com/docs/optimize/skipping-indexes#set
+
+use arrow::array::{ArrayRef, StringArray};
+use arrow::record_batch::RecordBatch;
+use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use async_trait::async_trait;
+use datafusion::catalog::{Session, TableProvider};
+use datafusion::common::{HashMap, HashSet, Result, exec_err};
+use datafusion::datasource::TableType;
+use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::memory::DataSourceExec;
+use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
+use datafusion::execution::object_store::ObjectStoreUrl;
+use datafusion::logical_expr::{Operator, TableProviderFilterPushDown};
+use datafusion::parquet::arrow::ArrowWriter;
+use datafusion::parquet::errors::ParquetError;
+use datafusion::parquet::file::metadata::{FileMetaData, KeyValue};
+use datafusion::parquet::file::reader::{FileReader, SerializedFileReader};
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::*;
+use datafusion::scalar::ScalarValue;
+use std::fs::{File, read_dir};
+use std::io::{Read, Seek, SeekFrom, Write};
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use tempfile::TempDir;
+
+/// Store a custom index inside a Parquet file and use it to speed up queries
+pub async fn parquet_embedded_index() -> Result<()> {
+    // 1. Create temp dir and write 3 Parquet files with different category sets
+    let tmp = TempDir::new()?;
+    let dir = tmp.path();
+    write_file_with_index(&dir.join("a.parquet"), &["foo", "bar", "foo"])?;
+    write_file_with_index(&dir.join("b.parquet"), &["baz", "qux"])?;
+    write_file_with_index(&dir.join("c.parquet"), &["foo", "quux", "quux"])?;
+
+    // 2. Register our custom TableProvider
+    let field = Field::new("category", DataType::Utf8, false);
+    let schema_ref = Arc::new(Schema::new(vec![field]));
+    let provider = Arc::new(DistinctIndexTable::try_new(dir, schema_ref.clone())?);
+
+    let ctx = SessionContext::new();
+    ctx.register_table("t", provider)?;
+
+    // 3. Run a query: only files containing 'foo' get scanned. The rest are pruned.
+    // based on the distinct index.
+    let df = ctx.sql("SELECT * FROM t WHERE category = 'foo'").await?;
+    df.show().await?;
+
+    Ok(())
+}
+
+/// An index of distinct values for a single column
+///
+/// In this example the index is a simple set of strings, but in a real
+/// application it could be any arbitrary data structure.
+///
+/// Also, this example indexes the distinct values for an entire file
+/// but a real application could create multiple indexes for multiple
+/// row groups and/or columns, depending on the use case.
+#[derive(Debug, Clone)]
+struct DistinctIndex {
+    inner: HashSet<String>,
+}
+
+impl DistinctIndex {
+    /// Create a DistinctIndex from an iterator of strings
+    pub fn new<I: IntoIterator<Item = String>>(iter: I) -> Self {
+        Self {
+            inner: iter.into_iter().collect(),
+        }
+    }
+
+    /// Returns true if the index contains the given value
+    pub fn contains(&self, value: &str) -> bool {
+        self.inner.contains(value)
+    }
+
+    /// Serialize the distinct index to a writer as bytes
+    ///
+    /// In this example, we use a simple newline-separated format,
+    /// but a real application can use any arbitrary binary format.
+    ///
+    /// Note that we must use the ArrowWriter to write the index so that its
+    /// internal accounting of offsets can correctly track the actual size of
+    /// the file. If we wrote directly to the underlying writer, the PageIndex
+    /// written right before the would be incorrect as they would not account
+    /// for the extra bytes written.
+    fn serialize<W: Write + Send>(
+        &self,
+        arrow_writer: &mut ArrowWriter<W>,
+    ) -> Result<()> {
+        let serialized = self
+            .inner
+            .iter()
+            .map(|s| s.as_str())
+            .collect::<Vec<_>>()
+            .join("\n");
+        let index_bytes = serialized.into_bytes();
+
+        // Set the offset for the index
+        let offset = arrow_writer.bytes_written();
+        let index_len = index_bytes.len() as u64;
+
+        println!("Writing custom index at offset: {offset}, length: {index_len}");
+        // Write the index magic and length to the file
+        arrow_writer.write_all(INDEX_MAGIC)?;
+        arrow_writer.write_all(&index_len.to_le_bytes())?;
+
+        // Write the index bytes
+        arrow_writer.write_all(&index_bytes)?;
+
+        // Append metadata about the index to the Parquet file footer
+        arrow_writer.append_key_value_metadata(KeyValue::new(
+            "distinct_index_offset".to_string(),
+            offset.to_string(),
+        ));
+        Ok(())
+    }
+
+    /// Read the distinct values index from a reader at the given offset and length
+    pub fn new_from_reader<R: Read + Seek>(mut reader: R, offset: u64) -> Result<Self> {
+        reader.seek(SeekFrom::Start(offset))?;
+
+        let mut magic_buf = [0u8; 4];
+        reader.read_exact(&mut magic_buf)?;
+        if magic_buf != INDEX_MAGIC {
+            return exec_err!("Invalid index magic number at offset {offset}");
+        }
+
+        let mut len_buf = [0u8; 8];
+        reader.read_exact(&mut len_buf)?;
+        let stored_len = u64::from_le_bytes(len_buf) as usize;
+
+        let mut index_buf = vec![0u8; stored_len];
+        reader.read_exact(&mut index_buf)?;
+
+        let Ok(s) = String::from_utf8(index_buf) else {
+            return exec_err!("Invalid UTF-8 in index data");
+        };
+
+        Ok(Self {
+            inner: s.lines().map(|s| s.to_string()).collect(),
+        })
+    }
+}
+
+/// DataFusion [`TableProvider]` that reads Parquet files and uses a
+/// `DistinctIndex` to prune files based on pushed down filters.
+#[derive(Debug)]
+struct DistinctIndexTable {
+    /// The schema of the table
+    schema: SchemaRef,
+    /// Key is file name, value is DistinctIndex for that file
+    files_and_index: HashMap<String, DistinctIndex>,
+    /// Directory containing the Parquet files
+    dir: PathBuf,
+}
+
+impl DistinctIndexTable {
+    /// Create a new DistinctIndexTable for files in the given directory
+    ///
+    /// Scans the directory, reading the `DistinctIndex` from each file
+    fn try_new(dir: impl Into<PathBuf>, schema: SchemaRef) -> Result<Self> {
+        let dir = dir.into();
+        let mut index = HashMap::new();
+
+        for entry in read_dir(&dir)? {
+            let path = entry?.path();
+            if path.extension().and_then(|s| s.to_str()) != Some("parquet") {
+                continue;
+            }
+            let file_name = path.file_name().unwrap().to_string_lossy().to_string();
+
+            let distinct_set = read_distinct_index(&path)?;
+
+            println!("Read distinct index for {file_name}: {file_name:?}");
+            index.insert(file_name, distinct_set);
+        }
+
+        Ok(Self {
+            schema,
+            files_and_index: index,
+            dir,
+        })
+    }
+}
+
+/// Wrapper around ArrowWriter to write Parquet files with an embedded index
+struct IndexedParquetWriter<W: Write + Seek> {
+    writer: ArrowWriter<W>,
+}
+
+/// Magic bytes to identify our custom index format
+const INDEX_MAGIC: &[u8] = b"IDX1";
+
+impl<W: Write + Seek + Send> IndexedParquetWriter<W> {
+    pub fn try_new(sink: W, schema: Arc<Schema>) -> Result<Self> {
+        let writer = ArrowWriter::try_new(sink, schema, None)?;
+        Ok(Self { writer })
+    }
+
+    /// Write a RecordBatch to the Parquet file
+    pub fn write(&mut self, batch: &RecordBatch) -> Result<()> {
+        self.writer.write(batch)?;
+        Ok(())
+    }
+
+    /// Flush the current row group
+    pub fn flush(&mut self) -> Result<()> {
+        self.writer.flush()?;
+        Ok(())
+    }
+
+    /// Close the Parquet file, flushing any remaining data
+    pub fn close(self) -> Result<()> {
+        self.writer.close()?;
+        Ok(())
+    }
+
+    /// write the DistinctIndex to the Parquet file
+    pub fn write_index(&mut self, index: &DistinctIndex) -> Result<()> {
+        index.serialize(&mut self.writer)
+    }
+}
+
+/// Write a Parquet file with a single column "category" containing the
+/// strings in `values` and a DistinctIndex for that column.
+fn write_file_with_index(path: &Path, values: &[&str]) -> Result<()> {
+    // form an input RecordBatch with the string values
+    let field = Field::new("category", DataType::Utf8, false);
+    let schema = Arc::new(Schema::new(vec![field.clone()]));
+    let arr: ArrayRef = Arc::new(StringArray::from(values.to_vec()));
+    let batch = RecordBatch::try_new(schema.clone(), vec![arr])?;
+
+    // compute the distinct index
+    let distinct_index: DistinctIndex =
+        DistinctIndex::new(values.iter().map(|s| (*s).to_string()));
+
+    let file = File::create(path)?;
+
+    let mut writer = IndexedParquetWriter::try_new(file, schema.clone())?;
+    writer.write(&batch)?;
+    writer.flush()?;
+    writer.write_index(&distinct_index)?;
+    writer.close()?;
+
+    println!("Finished writing file to {}", path.display());
+    Ok(())
+}
+
+/// Read a `DistinctIndex` from a Parquet file
+fn read_distinct_index(path: &Path) -> Result<DistinctIndex> {
+    let file = File::open(path)?;
+
+    let file_size = file.metadata()?.len();
+    println!("Reading index from {} (size: {file_size})", path.display(),);
+
+    let reader = SerializedFileReader::new(file.try_clone()?)?;
+    let meta = reader.metadata().file_metadata();
+
+    let offset = get_key_value(meta, "distinct_index_offset")
+        .ok_or_else(|| ParquetError::General("Missing index offset".into()))?
+        .parse::<u64>()
+        .map_err(|e| ParquetError::General(e.to_string()))?;
+
+    println!("Reading index at offset: {offset}, length");
+    DistinctIndex::new_from_reader(file, offset)
+}
+
+/// Returns the value of a named key from the Parquet file metadata
+///
+/// Returns None if the key is not found
+fn get_key_value<'a>(file_meta_data: &'a FileMetaData, key: &'_ str) -> Option<&'a str> {
+    let kvs = file_meta_data.key_value_metadata()?;
+    let kv = kvs.iter().find(|kv| kv.key == key)?;
+    kv.value.as_deref()
+}
+
+/// Implement TableProvider for DistinctIndexTable, using the distinct index to prune files
+#[async_trait]
+impl TableProvider for DistinctIndexTable {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    /// Prune files before reading: only keep files whose distinct set
+    /// contains the filter value
+    async fn scan(
+        &self,
+        _ctx: &dyn Session,
+        _proj: Option<&Vec<usize>>,
+        filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // This example only handles filters of the form
+        // `category = 'X'` where X is a string literal
+        //
+        // You can use `PruningPredicate` for much more general range and
+        // equality analysis or write your own custom logic.
+        let mut target: Option<&str> = None;
+
+        if filters.len() == 1
+            && let Expr::BinaryExpr(expr) = &filters[0]
+            && expr.op == Operator::Eq
+            && let (Expr::Column(c), Expr::Literal(ScalarValue::Utf8(Some(v)), _)) =
+                (&*expr.left, &*expr.right)
+            && c.name == "category"
+        {
+            println!("Filtering for category: {v}");
+            target = Some(v);
+        }
+        // Determine which files to scan
+        let files_to_scan: Vec<_> = self
+            .files_and_index
+            .iter()
+            .filter_map(|(f, distinct_index)| {
+                // keep file if no target or target is in the distinct set
+                if target.is_none() || distinct_index.contains(target?) {
+                    Some(f)
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        println!("Scanning only files: {files_to_scan:?}");
+
+        // Build ParquetSource to actually read the files
+        let url = ObjectStoreUrl::parse("file://")?;
+        let source = Arc::new(
+            ParquetSource::new(self.schema.clone()).with_enable_page_index(true),
+        );
+        let mut builder = FileScanConfigBuilder::new(url, source);
+        for file in files_to_scan {
+            let path = self.dir.join(file);
+            let len = std::fs::metadata(&path)?.len();
+            // If the index contained information about row groups or pages,
+            // you could also pass that information here to further prune
+            // the data read from the file.
+            let partitioned_file =
+                PartitionedFile::new(path.to_str().unwrap().to_string(), len);
+            builder = builder.with_file(partitioned_file);
+        }
+        Ok(DataSourceExec::from_data_source(builder.build()))
+    }
+
+    /// Tell DataFusion that we can handle filters on the "category" column
+    fn supports_filters_pushdown(
+        &self,
+        fs: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        // Mark as inexact since pruning is file‑granular
+        Ok(vec![TableProviderFilterPushDown::Inexact; fs.len()])
+    }
+}
diff --git a/datafusion-examples/examples/data_io/parquet_encrypted.rs b/datafusion-examples/examples/data_io/parquet_encrypted.rs
new file mode 100644
index 0000000000000..26361e9b52be0
--- /dev/null
+++ b/datafusion-examples/examples/data_io/parquet_encrypted.rs
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::sync::Arc;
+
+use datafusion::common::DataFusionError;
+use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions};
+use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
+use datafusion::logical_expr::{col, lit};
+use datafusion::parquet::encryption::decrypt::FileDecryptionProperties;
+use datafusion::parquet::encryption::encrypt::FileEncryptionProperties;
+use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use tempfile::TempDir;
+
+/// Read and write encrypted Parquet files using DataFusion
+pub async fn parquet_encrypted() -> datafusion::common::Result<()> {
+    // The SessionContext is the main high level API for interacting with DataFusion
+    let ctx = SessionContext::new();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    // Read the sample parquet file
+    let parquet_df = ctx
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
+        .await?;
+
+    // Show information from the dataframe
+    println!(
+        "==============================================================================="
+    );
+    println!("Original Parquet DataFrame:");
+    query_dataframe(&parquet_df).await?;
+
+    // Setup encryption and decryption properties
+    let (encrypt, decrypt) = setup_encryption(&parquet_df)?;
+
+    // Create a temporary file location for the encrypted parquet file
+    let tmp_source = TempDir::new()?;
+    let tempfile = tmp_source.path().join("cars_encrypted.parquet");
+
+    // Write encrypted parquet
+    let mut options = TableParquetOptions::default();
+    options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt));
+    parquet_df
+        .write_parquet(
+            tempfile.to_str().unwrap(),
+            DataFrameWriteOptions::new().with_single_file_output(true),
+            Some(options),
+        )
+        .await?;
+
+    // Read encrypted parquet back as a DataFrame using matching decryption config
+    let ctx: SessionContext = SessionContext::new();
+    let read_options =
+        ParquetReadOptions::default().file_decryption_properties((&decrypt).into());
+
+    let encrypted_parquet_df = ctx
+        .read_parquet(tempfile.to_str().unwrap(), read_options)
+        .await?;
+
+    // Show information from the dataframe
+    println!(
+        "\n\n==============================================================================="
+    );
+    println!("Encrypted Parquet DataFrame:");
+    query_dataframe(&encrypted_parquet_df).await?;
+
+    Ok(())
+}
+
+// Show information from the dataframe
+async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> {
+    // show its schema using 'describe'
+    println!("Schema:");
+    df.clone().describe().await?.show().await?;
+
+    // Select three columns and filter the results
+    // so that only rows where speed > 5 are returned
+    // select car, speed, time from t where speed > 5
+    println!("\nSelected rows and columns:");
+    df.clone()
+        .select_columns(&["car", "speed", "time"])?
+        .filter(col("speed").gt(lit(5)))?
+        .show()
+        .await?;
+
+    Ok(())
+}
+
+// Setup encryption and decryption properties
+fn setup_encryption(
+    parquet_df: &DataFrame,
+) -> Result<(Arc<FileEncryptionProperties>, Arc<FileDecryptionProperties>), DataFusionError>
+{
+    let schema = parquet_df.schema();
+    let footer_key = b"0123456789012345".to_vec(); // 128bit/16
+    let column_key = b"1234567890123450".to_vec(); // 128bit/16
+
+    let mut encrypt = FileEncryptionProperties::builder(footer_key.clone());
+    let mut decrypt = FileDecryptionProperties::builder(footer_key.clone());
+
+    for field in schema.fields().iter() {
+        encrypt = encrypt.with_column_key(field.name().as_str(), column_key.clone());
+        decrypt = decrypt.with_column_key(field.name().as_str(), column_key.clone());
+    }
+
+    let encrypt = encrypt.build()?;
+    let decrypt = decrypt.build()?;
+    Ok((encrypt, decrypt))
+}
diff --git a/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs b/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs
new file mode 100644
index 0000000000000..1a9bf56c09b35
--- /dev/null
+++ b/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs
@@ -0,0 +1,304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
+use arrow_schema::SchemaRef;
+use async_trait::async_trait;
+use base64::Engine;
+use datafusion::common::extensions_options;
+use datafusion::config::{EncryptionFactoryOptions, TableParquetOptions};
+use datafusion::dataframe::DataFrameWriteOptions;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::error::Result;
+use datafusion::execution::parquet_encryption::EncryptionFactory;
+use datafusion::parquet::encryption::decrypt::KeyRetriever;
+use datafusion::parquet::encryption::{
+    decrypt::FileDecryptionProperties, encrypt::FileEncryptionProperties,
+};
+use datafusion::prelude::SessionContext;
+use futures::StreamExt;
+use object_store::path::Path;
+use rand::rand_core::{OsRng, TryRngCore};
+use std::collections::HashSet;
+use std::sync::Arc;
+use tempfile::TempDir;
+
+const ENCRYPTION_FACTORY_ID: &str = "example.mock_kms_encryption";
+
+/// This example demonstrates reading and writing Parquet files that
+/// are encrypted using Parquet Modular Encryption.
+///
+/// Compared to the `parquet_encrypted` example, where AES keys
+/// are specified directly, this example implements an `EncryptionFactory` that
+/// generates encryption keys dynamically per file.
+/// Encryption key metadata is stored inline in the Parquet files and is used to determine
+/// the decryption keys when reading the files.
+///
+/// In this example, encryption keys are simply stored base64 encoded in the Parquet metadata,
+/// which is not a secure way to store encryption keys.
+/// For production use, it is recommended to use a key-management service (KMS) to encrypt
+/// data encryption keys.
+pub async fn parquet_encrypted_with_kms() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // Register an `EncryptionFactory` implementation to be used for Parquet encryption
+    // in the runtime environment.
+    // `EncryptionFactory` instances are registered with a name to identify them so
+    // they can be later referenced in configuration options, and it's possible to register
+    // multiple different factories to handle different ways of encrypting Parquet.
+    let encryption_factory = TestEncryptionFactory::default();
+    ctx.runtime_env().register_parquet_encryption_factory(
+        ENCRYPTION_FACTORY_ID,
+        Arc::new(encryption_factory),
+    );
+
+    // Register some simple test data
+    let a: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d"]));
+    let b: ArrayRef = Arc::new(Int32Array::from(vec![1, 10, 10, 100]));
+    let c: ArrayRef = Arc::new(Int32Array::from(vec![2, 20, 20, 200]));
+    let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)])?;
+    ctx.register_batch("test_data", batch)?;
+
+    {
+        // Write and read encrypted Parquet with the programmatic API
+        let tmpdir = TempDir::new()?;
+        let table_path = format!("{}/", tmpdir.path().to_str().unwrap());
+        write_encrypted(&ctx, &table_path).await?;
+        read_encrypted(&ctx, &table_path).await?;
+    }
+
+    {
+        // Write and read encrypted Parquet with the SQL API
+        let tmpdir = TempDir::new()?;
+        let table_path = format!("{}/", tmpdir.path().to_str().unwrap());
+        write_encrypted_with_sql(&ctx, &table_path).await?;
+        read_encrypted_with_sql(&ctx, &table_path).await?;
+    }
+
+    Ok(())
+}
+
+/// Write an encrypted Parquet file
+async fn write_encrypted(ctx: &SessionContext, table_path: &str) -> Result<()> {
+    let df = ctx.table("test_data").await?;
+
+    let mut parquet_options = TableParquetOptions::new();
+    // We specify that we want to use Parquet encryption by setting the identifier of the
+    // encryption factory to use and providing the factory-specific configuration.
+    // Our encryption factory only requires specifying the columns to encrypt.
+    let encryption_config = EncryptionConfig {
+        encrypted_columns: "b,c".to_owned(),
+    };
+    parquet_options
+        .crypto
+        .configure_factory(ENCRYPTION_FACTORY_ID, &encryption_config);
+
+    df.write_parquet(
+        table_path,
+        DataFrameWriteOptions::new(),
+        Some(parquet_options),
+    )
+    .await?;
+
+    println!("Encrypted Parquet written to {table_path}");
+    Ok(())
+}
+
+/// Read from an encrypted Parquet file
+async fn read_encrypted(ctx: &SessionContext, table_path: &str) -> Result<()> {
+    let mut parquet_options = TableParquetOptions::new();
+    // Specify the encryption factory to use for decrypting Parquet.
+    // In this example, we don't require any additional configuration options when reading
+    // as we only need the key metadata from the Parquet files to determine the decryption keys.
+    parquet_options
+        .crypto
+        .configure_factory(ENCRYPTION_FACTORY_ID, &EncryptionConfig::default());
+
+    let file_format = ParquetFormat::default().with_options(parquet_options);
+    let listing_options = ListingOptions::new(Arc::new(file_format));
+
+    ctx.register_listing_table(
+        "encrypted_parquet_table",
+        &table_path,
+        listing_options.clone(),
+        None,
+        None,
+    )
+    .await?;
+
+    let mut batch_stream = ctx
+        .table("encrypted_parquet_table")
+        .await?
+        .execute_stream()
+        .await?;
+    println!("Reading encrypted Parquet as a RecordBatch stream");
+    while let Some(batch) = batch_stream.next().await {
+        let batch = batch?;
+        println!("Read batch with {} rows", batch.num_rows());
+    }
+
+    println!("Finished reading");
+    Ok(())
+}
+
+/// Write an encrypted Parquet file using only SQL syntax with string configuration
+async fn write_encrypted_with_sql(ctx: &SessionContext, table_path: &str) -> Result<()> {
+    let query = format!(
+        "COPY test_data \
+        TO '{table_path}' \
+        STORED AS parquet
+        OPTIONS (\
+            'format.crypto.factory_id' '{ENCRYPTION_FACTORY_ID}', \
+            'format.crypto.factory_options.encrypted_columns' 'b,c' \
+        )"
+    );
+    let _ = ctx.sql(&query).await?.collect().await?;
+
+    println!("Encrypted Parquet written to {table_path}");
+    Ok(())
+}
+
+/// Read from an encrypted Parquet file using only the SQL API and string-based configuration
+async fn read_encrypted_with_sql(ctx: &SessionContext, table_path: &str) -> Result<()> {
+    let ddl = format!(
+        "CREATE EXTERNAL TABLE encrypted_parquet_table_2 \
+        STORED AS PARQUET LOCATION '{table_path}' OPTIONS (\
+        'format.crypto.factory_id' '{ENCRYPTION_FACTORY_ID}' \
+        )"
+    );
+    ctx.sql(&ddl).await?;
+    let df = ctx.sql("SELECT * FROM encrypted_parquet_table_2").await?;
+    let mut batch_stream = df.execute_stream().await?;
+
+    println!("Reading encrypted Parquet as a RecordBatch stream");
+    while let Some(batch) = batch_stream.next().await {
+        let batch = batch?;
+        println!("Read batch with {} rows", batch.num_rows());
+    }
+    println!("Finished reading");
+    Ok(())
+}
+
+// Options used to configure our example encryption factory
+extensions_options! {
+    struct EncryptionConfig {
+        /// Comma-separated list of columns to encrypt
+        pub encrypted_columns: String, default = "".to_owned()
+    }
+}
+
+/// Mock implementation of an `EncryptionFactory` that stores encryption keys
+/// base64 encoded in the Parquet encryption metadata.
+/// For production use, integrating with a key-management service to encrypt
+/// data encryption keys is recommended.
+#[derive(Default, Debug)]
+struct TestEncryptionFactory {}
+
+/// `EncryptionFactory` is a DataFusion trait for types that generate
+/// file encryption and decryption properties.
+#[async_trait]
+impl EncryptionFactory for TestEncryptionFactory {
+    /// Generate file encryption properties to use when writing a Parquet file.
+    /// The `schema` is provided so that it may be used to dynamically configure
+    /// per-column encryption keys.
+    /// The file path is also available. We don't use the path in this example,
+    /// but other implementations may want to use this to compute an
+    /// AAD prefix for the file, or to allow use of external key material
+    /// (where key metadata is stored in a JSON file alongside Parquet files).
+    async fn get_file_encryption_properties(
+        &self,
+        options: &EncryptionFactoryOptions,
+        schema: &SchemaRef,
+        _file_path: &Path,
+    ) -> Result<Option<Arc<FileEncryptionProperties>>> {
+        let config: EncryptionConfig = options.to_extension_options()?;
+
+        // Generate a random encryption key for this file.
+        let mut key = vec![0u8; 16];
+        OsRng.try_fill_bytes(&mut key).unwrap();
+
+        // Generate the key metadata that allows retrieving the key when reading the file.
+        let key_metadata = wrap_key(&key);
+
+        let mut builder = FileEncryptionProperties::builder(key.to_vec())
+            .with_footer_key_metadata(key_metadata.clone());
+
+        let encrypted_columns: HashSet<&str> =
+            config.encrypted_columns.split(",").collect();
+        if !encrypted_columns.is_empty() {
+            // Set up per-column encryption.
+            for field in schema.fields().iter() {
+                if encrypted_columns.contains(field.name().as_str()) {
+                    // Here we re-use the same key for all encrypted columns,
+                    // but new keys could also be generated per column.
+                    builder = builder.with_column_key_and_metadata(
+                        field.name().as_str(),
+                        key.clone(),
+                        key_metadata.clone(),
+                    );
+                }
+            }
+        }
+
+        let encryption_properties = builder.build()?;
+
+        Ok(Some(encryption_properties))
+    }
+
+    /// Generate file decryption properties to use when reading a Parquet file.
+    /// Rather than provide the AES keys directly for decryption, we set a `KeyRetriever`
+    /// that can determine the keys using the encryption metadata.
+    async fn get_file_decryption_properties(
+        &self,
+        _options: &EncryptionFactoryOptions,
+        _file_path: &Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>> {
+        let decryption_properties =
+            FileDecryptionProperties::with_key_retriever(Arc::new(TestKeyRetriever {}))
+                .build()?;
+        Ok(Some(decryption_properties))
+    }
+}
+
+/// Mock implementation of encrypting a key that simply base64 encodes the key.
+/// Note that this is not a secure way to store encryption keys,
+/// and for production use keys should be encrypted with a KMS.
+fn wrap_key(key: &[u8]) -> Vec<u8> {
+    base64::prelude::BASE64_STANDARD
+        .encode(key)
+        .as_bytes()
+        .to_vec()
+}
+
+struct TestKeyRetriever {}
+
+impl KeyRetriever for TestKeyRetriever {
+    /// Get a data encryption key using the metadata stored in the Parquet file.
+    fn retrieve_key(
+        &self,
+        key_metadata: &[u8],
+    ) -> datafusion::parquet::errors::Result<Vec<u8>> {
+        let key_metadata = std::str::from_utf8(key_metadata)?;
+        let key = base64::prelude::BASE64_STANDARD
+            .decode(key_metadata)
+            .unwrap();
+        Ok(key)
+    }
+}
diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs
similarity index 73%
rename from datafusion-examples/examples/parquet_exec_visitor.rs
rename to datafusion-examples/examples/data_io/parquet_exec_visitor.rs
index 84f92d4f450e1..47caf9480df93 100644
--- a/datafusion-examples/examples/parquet_exec_visitor.rs
+++ b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
 use datafusion::datasource::file_format::parquet::ParquetFormat;
@@ -25,34 +27,37 @@ use datafusion::error::DataFusionError;
 use datafusion::execution::context::SessionContext;
 use datafusion::physical_plan::metrics::MetricValue;
 use datafusion::physical_plan::{
-    execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor,
+    ExecutionPlan, ExecutionPlanVisitor, execute_stream, visit_execution_plan,
 };
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use futures::StreamExt;
 
 /// Example of collecting metrics after execution by visiting the `ExecutionPlan`
-#[tokio::main]
-async fn main() {
+pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> {
     let ctx = SessionContext::new();
 
-    let test_data = datafusion::test_util::parquet_test_data();
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
 
     // Configure listing options
     let file_format = ParquetFormat::default().with_enable_pruning(true);
     let listing_options = ListingOptions::new(Arc::new(file_format));
 
+    let table_path = parquet_temp.file_uri()?;
+
     // First example were we use an absolute path, which requires no additional setup.
-    let _ = ctx
-        .register_listing_table(
-            "my_table",
-            &format!("file://{test_data}/alltypes_plain.parquet"),
-            listing_options.clone(),
-            None,
-            None,
-        )
-        .await;
-
-    let df = ctx.sql("SELECT * FROM my_table").await.unwrap();
-    let plan = df.create_physical_plan().await.unwrap();
+    ctx.register_listing_table(
+        "my_table",
+        &table_path,
+        listing_options.clone(),
+        None,
+        None,
+    )
+    .await?;
+
+    let df = ctx.sql("SELECT * FROM my_table").await?;
+    let plan = df.create_physical_plan().await?;
 
     // Create empty visitor
     let mut visitor = ParquetExecVisitor {
@@ -63,12 +68,12 @@ async fn main() {
     // Make sure you execute the plan to collect actual execution statistics.
     // For example, in this example the `file_scan_config` is known without executing
     // but the `bytes_scanned` would be None if we did not execute.
-    let mut batch_stream = execute_stream(plan.clone(), ctx.task_ctx()).unwrap();
+    let mut batch_stream = execute_stream(plan.clone(), ctx.task_ctx())?;
     while let Some(batch) = batch_stream.next().await {
         println!("Batch rows: {}", batch.unwrap().num_rows());
     }
 
-    visit_execution_plan(plan.as_ref(), &mut visitor).unwrap();
+    visit_execution_plan(plan.as_ref(), &mut visitor)?;
 
     println!(
         "ParquetExecVisitor bytes_scanned: {:?}",
@@ -78,6 +83,8 @@ async fn main() {
         "ParquetExecVisitor file_groups: {:?}",
         visitor.file_groups.unwrap()
     );
+
+    Ok(())
 }
 
 /// Define a struct with fields to hold the execution information you want to
@@ -97,18 +104,17 @@ impl ExecutionPlanVisitor for ParquetExecVisitor {
     /// or `post_visit` (visit each node after its children/inputs)
     fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool, Self::Error> {
         // If needed match on a specific `ExecutionPlan` node type
-        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if let Some((file_config, _)) =
+        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+            && let Some((file_config, _)) =
                 data_source_exec.downcast_to_file_source::<ParquetSource>()
-            {
-                self.file_groups = Some(file_config.file_groups.clone());
-
-                let metrics = match data_source_exec.metrics() {
-                    None => return Ok(true),
-                    Some(metrics) => metrics,
-                };
-                self.bytes_scanned = metrics.sum_by_name("bytes_scanned");
-            }
+        {
+            self.file_groups = Some(file_config.file_groups.clone());
+
+            let metrics = match data_source_exec.metrics() {
+                None => return Ok(true),
+                Some(metrics) => metrics,
+            };
+            self.bytes_scanned = metrics.sum_by_name("bytes_scanned");
         }
         Ok(true)
     }
diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/data_io/parquet_index.rs
similarity index 97%
rename from datafusion-examples/examples/parquet_index.rs
rename to datafusion-examples/examples/data_io/parquet_index.rs
index e5ae3cc86bfe5..e11a303f442a4 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/data_io/parquet_index.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{
     Array, ArrayRef, AsArray, BooleanArray, Int32Array, RecordBatch, StringArray,
     UInt64Array,
@@ -25,19 +27,19 @@ use async_trait::async_trait;
 use datafusion::catalog::Session;
 use datafusion::common::pruning::PruningStatistics;
 use datafusion::common::{
-    internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
+    DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err,
 };
+use datafusion::datasource::TableProvider;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::memory::DataSourceExec;
 use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource};
-use datafusion::datasource::TableProvider;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::logical_expr::{
-    utils::conjunction, TableProviderFilterPushDown, TableType,
+    TableProviderFilterPushDown, TableType, utils::conjunction,
 };
 use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use datafusion::parquet::arrow::{
-    arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter,
+    ArrowWriter, arrow_reader::ParquetRecordBatchReaderBuilder,
 };
 use datafusion::physical_expr::PhysicalExpr;
 use datafusion::physical_optimizer::pruning::PruningPredicate;
@@ -50,8 +52,8 @@ use std::fs;
 use std::fs::{DirEntry, File};
 use std::ops::Range;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use tempfile::TempDir;
 use url::Url;
 
@@ -71,7 +73,7 @@ use url::Url;
 /// (using the same underlying APIs)
 ///
 /// For a more advanced example of using an index to prune row groups within a
-/// file, see the (forthcoming) `advanced_parquet_index` example.
+/// file, see the `advanced_parquet_index` example.
 ///
 /// # Diagram
 ///
@@ -99,12 +101,10 @@ use url::Url;
 ///                   Thus some parquet files are      │             │
 ///                   "pruned" and thus are not        └─────────────┘
 ///                   scanned at all                   Parquet Files
-///
 /// ```
 ///
 /// [`ListingTable`]: datafusion::datasource::listing::ListingTable
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn parquet_index() -> Result<()> {
     // Demo data has three files, each with schema
     // * file_name (string)
     // * value (int32)
@@ -243,10 +243,11 @@ impl TableProvider for IndexTableProvider {
         let files = self.index.get_files(predicate.clone())?;
 
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let source = Arc::new(ParquetSource::default().with_predicate(predicate));
+        let source =
+            Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate));
         let mut file_scan_config_builder =
-            FileScanConfigBuilder::new(object_store_url, self.schema(), source)
-                .with_projection(projection.cloned())
+            FileScanConfigBuilder::new(object_store_url, source)
+                .with_projection_indices(projection.cloned())?
                 .with_limit(limit);
 
         // Transform to the format needed to pass to DataSourceExec
@@ -313,7 +314,7 @@ impl Display for ParquetMetadataIndex {
             "ParquetMetadataIndex(last_num_pruned: {})",
             self.last_num_pruned()
         )?;
-        let batches = pretty_format_batches(&[self.index.clone()]).unwrap();
+        let batches = pretty_format_batches(std::slice::from_ref(&self.index)).unwrap();
         write!(f, "{batches}",)
     }
 }
@@ -510,7 +511,7 @@ impl ParquetMetadataIndexBuilder {
 
         // Get the schema of the file. A real system might have to handle the
         // case where the schema of the file is not the same as the schema of
-        // the other files e.g. using SchemaAdapter.
+        // the other files e.g. using PhysicalExprAdapterFactory.
         if self.file_schema.is_none() {
             self.file_schema = Some(reader.schema().clone());
         }
diff --git a/datafusion-examples/examples/query-http-csv.rs b/datafusion-examples/examples/data_io/query_http_csv.rs
similarity index 91%
rename from datafusion-examples/examples/query-http-csv.rs
rename to datafusion-examples/examples/data_io/query_http_csv.rs
index fa3fd2ac068df..71421e6270ccb 100644
--- a/datafusion-examples/examples/query-http-csv.rs
+++ b/datafusion-examples/examples/data_io/query_http_csv.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use datafusion::error::Result;
 use datafusion::prelude::*;
 use object_store::http::HttpBuilder;
 use std::sync::Arc;
 use url::Url;
 
-/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
-/// fetching results
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Configure `object_store` and run a query against files via HTTP
+pub async fn query_http_csv() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
diff --git a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/data_io/remote_catalog.rs
similarity index 97%
rename from datafusion-examples/examples/remote_catalog.rs
rename to datafusion-examples/examples/data_io/remote_catalog.rs
index 70c0963545e08..10ec26b1d5c05 100644
--- a/datafusion-examples/examples/remote_catalog.rs
+++ b/datafusion-examples/examples/data_io/remote_catalog.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 /// This example shows how to implement the DataFusion [`CatalogProvider`] API
 /// for catalogs that are remote (require network access) and/or offer only
 /// asynchronous APIs such as [Polaris], [Unity], and [Hive].
@@ -39,15 +41,15 @@ use datafusion::common::{assert_batches_eq, internal_datafusion_err, plan_err};
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::execution::SendableRecordBatchStream;
 use datafusion::logical_expr::{Expr, TableType};
-use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::prelude::{DataFrame, SessionContext};
 use futures::TryStreamExt;
 use std::any::Any;
 use std::sync::Arc;
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Interfacing with a remote catalog (e.g. over a network)
+pub async fn remote_catalog() -> Result<()> {
     // As always, we create a session context to interact with DataFusion
     let ctx = SessionContext::new();
 
@@ -75,8 +77,8 @@ async fn main() -> Result<()> {
     let state = ctx.state();
 
     // First, parse the SQL (but don't plan it / resolve any table references)
-    let dialect = state.config().options().sql_parser.dialect.as_str();
-    let statement = state.sql_to_statement(sql, dialect)?;
+    let dialect = state.config().options().sql_parser.dialect;
+    let statement = state.sql_to_statement(sql, &dialect)?;
 
     // Find all `TableReferences` in the parsed queries. These correspond to the
     // tables referred to by the query (in this case
diff --git a/datafusion-examples/examples/dataframe/cache_factory.rs b/datafusion-examples/examples/dataframe/cache_factory.rs
new file mode 100644
index 0000000000000..a92c3dc4ce26a
--- /dev/null
+++ b/datafusion-examples/examples/dataframe/cache_factory.rs
@@ -0,0 +1,229 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::sync::{Arc, RwLock};
+
+use arrow::array::RecordBatch;
+use async_trait::async_trait;
+use datafusion::catalog::memory::MemorySourceConfig;
+use datafusion::common::DFSchemaRef;
+use datafusion::error::Result;
+use datafusion::execution::context::QueryPlanner;
+use datafusion::execution::session_state::CacheFactory;
+use datafusion::execution::{SessionState, SessionStateBuilder};
+use datafusion::logical_expr::{
+    Extension, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore,
+};
+use datafusion::physical_plan::{ExecutionPlan, collect_partitioned};
+use datafusion::physical_planner::{
+    DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner,
+};
+use datafusion::prelude::*;
+use datafusion_common::HashMap;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+
+/// This example demonstrates how to leverage [CacheFactory] to implement custom caching strategies for dataframes in DataFusion.
+/// By default, [DataFrame::cache] in Datafusion is eager and creates an in-memory table. This example shows a basic alternative implementation for lazy caching.
+/// Specifically, it implements:
+/// - A [CustomCacheFactory] that creates a logical node [CacheNode] representing the cache operation.
+/// - A [CacheNodePlanner] (an [ExtensionPlanner]) that understands [CacheNode] and performs caching.
+/// - A [CacheNodeQueryPlanner] that installs [CacheNodePlanner].
+/// - A simple in-memory [CacheManager] that stores cached [RecordBatch]es. Note that the implementation for this example is very naive and only implements put, but for real production use cases cache eviction and drop should also be implemented.
+pub async fn cache_dataframe_with_custom_logic() -> Result<()> {
+    let session_state = SessionStateBuilder::new()
+        .with_cache_factory(Some(Arc::new(CustomCacheFactory {})))
+        .with_query_planner(Arc::new(CacheNodeQueryPlanner::default()))
+        .build();
+    let ctx = SessionContext::new_with_state(session_state);
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    // Read the parquet files and show its schema using 'describe'
+    let parquet_df = ctx
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
+        .await?;
+
+    let df_cached = parquet_df
+        .select_columns(&["car", "speed", "time"])?
+        .filter(col("speed").gt(lit(1.0)))?
+        .cache()
+        .await?;
+
+    let df1 = df_cached.clone().filter(col("car").eq(lit("red")))?;
+    let df2 = df1.clone().sort(vec![col("car").sort(true, false)])?;
+
+    // should see log for caching only once
+    df_cached.show().await?;
+    df1.show().await?;
+    df2.show().await?;
+
+    Ok(())
+}
+
+#[derive(Debug)]
+struct CustomCacheFactory {}
+
+impl CacheFactory for CustomCacheFactory {
+    fn create(
+        &self,
+        plan: LogicalPlan,
+        _session_state: &SessionState,
+    ) -> Result<LogicalPlan> {
+        Ok(LogicalPlan::Extension(Extension {
+            node: Arc::new(CacheNode { input: plan }),
+        }))
+    }
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Hash, Debug)]
+struct CacheNode {
+    input: LogicalPlan,
+}
+
+impl UserDefinedLogicalNodeCore for CacheNode {
+    fn name(&self) -> &str {
+        "CacheNode"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        self.input.schema()
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+
+    fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "CacheNode")
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        mut inputs: Vec<LogicalPlan>,
+    ) -> Result<Self> {
+        assert_eq!(inputs.len(), 1, "input size must be one");
+        Ok(Self {
+            input: inputs.swap_remove(0),
+        })
+    }
+}
+
+struct CacheNodePlanner {
+    cache_manager: Arc<RwLock<CacheManager>>,
+}
+
+#[async_trait]
+impl ExtensionPlanner for CacheNodePlanner {
+    async fn plan_extension(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        node: &dyn UserDefinedLogicalNode,
+        logical_inputs: &[&LogicalPlan],
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+        session_state: &SessionState,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        if let Some(cache_node) = node.as_any().downcast_ref::<CacheNode>() {
+            assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs");
+            assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs");
+            if self
+                .cache_manager
+                .read()
+                .unwrap()
+                .get(&cache_node.input)
+                .is_none()
+            {
+                let ctx = session_state.task_ctx();
+                println!("caching in memory");
+                let batches =
+                    collect_partitioned(physical_inputs[0].clone(), ctx).await?;
+                self.cache_manager
+                    .write()
+                    .unwrap()
+                    .put(cache_node.input.clone(), batches);
+            } else {
+                println!("fetching directly from cache manager");
+            }
+            Ok(self
+                .cache_manager
+                .read()
+                .unwrap()
+                .get(&cache_node.input)
+                .map(|batches| {
+                    let exec: Arc<dyn ExecutionPlan> = MemorySourceConfig::try_new_exec(
+                        batches,
+                        physical_inputs[0].schema(),
+                        None,
+                    )
+                    .unwrap();
+                    exec
+                }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+struct CacheNodeQueryPlanner {
+    cache_manager: Arc<RwLock<CacheManager>>,
+}
+
+#[async_trait]
+impl QueryPlanner for CacheNodeQueryPlanner {
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session_state: &SessionState,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let physical_planner =
+            DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(
+                CacheNodePlanner {
+                    cache_manager: Arc::clone(&self.cache_manager),
+                },
+            )]);
+        physical_planner
+            .create_physical_plan(logical_plan, session_state)
+            .await
+    }
+}
+
+// This naive implementation only includes put, but for real production use cases cache eviction and drop should also be implemented.
+#[derive(Debug, Default)]
+struct CacheManager {
+    cache: HashMap<LogicalPlan, Vec<Vec<RecordBatch>>>,
+}
+
+impl CacheManager {
+    pub fn put(&mut self, k: LogicalPlan, v: Vec<Vec<RecordBatch>>) {
+        self.cache.insert(k, v);
+    }
+
+    pub fn get(&self, k: &LogicalPlan) -> Option<&Vec<Vec<RecordBatch>>> {
+        self.cache.get(k)
+    }
+}
diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs
similarity index 73%
rename from datafusion-examples/examples/dataframe.rs
rename to datafusion-examples/examples/dataframe/dataframe.rs
index 57a28aeca0de2..dde19cb476f14 100644
--- a/datafusion-examples/examples/dataframe.rs
+++ b/datafusion-examples/examples/dataframe/dataframe.rs
@@ -15,22 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+use std::fs::File;
+use std::io::Write;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray, StringViewArray};
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::catalog::MemTable;
+use datafusion::common::ScalarValue;
 use datafusion::common::config::CsvOptions;
 use datafusion::common::parsers::CompressionTypeVariant;
-use datafusion::common::DataFusionError;
-use datafusion::common::ScalarValue;
 use datafusion::dataframe::DataFrameWriteOptions;
 use datafusion::error::Result;
 use datafusion::functions_aggregate::average::avg;
 use datafusion::functions_aggregate::min_max::max;
 use datafusion::prelude::*;
-use std::fs::File;
-use std::io::Write;
-use std::sync::Arc;
-use tempfile::tempdir;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use tempfile::{TempDir, tempdir};
+use tokio::fs::create_dir_all;
 
 /// This example demonstrates using DataFusion's DataFrame API
 ///
@@ -39,6 +43,7 @@ use tempfile::tempdir;
 /// * [read_parquet]: execute queries against parquet files
 /// * [read_csv]: execute queries against csv files
 /// * [read_memory]: execute queries against in-memory arrow data
+/// * [read_memory_macro]: execute queries against in-memory arrow data using macro
 ///
 /// # Writing out to local storage
 ///
@@ -53,12 +58,8 @@ use tempfile::tempdir;
 /// * [where_scalar_subquery]: execute a scalar subquery
 /// * [where_in_subquery]: execute a subquery with an IN clause
 /// * [where_exist_subquery]: execute a subquery with an EXISTS clause
-///
-/// # Querying data
-///
-/// * [query_to_date]: execute queries against parquet files
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn dataframe_example() -> Result<()> {
+    env_logger::init();
     // The SessionContext is the main high level API for interacting with DataFusion
     let ctx = SessionContext::new();
     read_parquet(&ctx).await?;
@@ -66,8 +67,8 @@ async fn main() -> Result<()> {
     read_memory(&ctx).await?;
     read_memory_macro().await?;
     write_out(&ctx).await?;
-    register_aggregate_test_data("t1", &ctx).await?;
-    register_aggregate_test_data("t2", &ctx).await?;
+    register_cars_test_data("t1", &ctx).await?;
+    register_cars_test_data("t2", &ctx).await?;
     where_scalar_subquery(&ctx).await?;
     where_in_subquery(&ctx).await?;
     where_exist_subquery(&ctx).await?;
@@ -79,23 +80,24 @@ async fn main() -> Result<()> {
 /// 2. Show the schema
 /// 3. Select columns and rows
 async fn read_parquet(ctx: &SessionContext) -> Result<()> {
-    // Find the local path of "alltypes_plain.parquet"
-    let testdata = datafusion::test_util::parquet_test_data();
-    let filename = &format!("{testdata}/alltypes_plain.parquet");
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(ctx, &dataset.path()).await?;
 
     // Read the parquet files and show its schema using 'describe'
     let parquet_df = ctx
-        .read_parquet(filename, ParquetReadOptions::default())
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?;
 
     // show its schema using 'describe'
     parquet_df.clone().describe().await?.show().await?;
 
     // Select three columns and filter the results
-    // so that only rows where id > 1 are returned
+    // so that only rows where speed > 1 are returned
+    // select car, speed, time from t where speed > 1
     parquet_df
-        .select_columns(&["id", "bool_col", "timestamp_col"])?
-        .filter(col("id").gt(lit(1)))?
+        .select_columns(&["car", "speed", "time"])?
+        .filter(col("speed").gt(lit(1)))?
         .show()
         .await?;
 
@@ -198,7 +200,7 @@ async fn read_memory_macro() -> Result<()> {
 /// 2. Write out a DataFrame to a parquet file
 /// 3. Write out a DataFrame to a csv file
 /// 4. Write out a DataFrame to a json file
-async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionError> {
+async fn write_out(ctx: &SessionContext) -> Result<()> {
     let array = StringViewArray::from(vec!["a", "b", "c"]);
     let schema = Arc::new(Schema::new(vec![Field::new(
         "tablecol1",
@@ -210,15 +212,26 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr
     ctx.register_table("initial_data", Arc::new(mem_table))?;
     let df = ctx.table("initial_data").await?;
 
-    ctx.sql(
-        "create external table
-    test(tablecol1 varchar)
-    stored as parquet
-    location './datafusion-examples/test_table/'",
-    )
-    .await?
-    .collect()
-    .await?;
+    // Create a single temp root with subdirectories
+    let tmp_root = TempDir::new()?;
+    let examples_root = tmp_root.path().join("datafusion-examples");
+    create_dir_all(&examples_root).await?;
+    let table_dir = examples_root.join("test_table");
+    let parquet_dir = examples_root.join("test_parquet");
+    let csv_dir = examples_root.join("test_csv");
+    let json_dir = examples_root.join("test_json");
+    create_dir_all(&table_dir).await?;
+    create_dir_all(&parquet_dir).await?;
+    create_dir_all(&csv_dir).await?;
+    create_dir_all(&json_dir).await?;
+
+    let create_sql = format!(
+        "CREATE EXTERNAL TABLE test(tablecol1 varchar)
+         STORED AS parquet
+         LOCATION '{}'",
+        table_dir.display()
+    );
+    ctx.sql(&create_sql).await?.collect().await?;
 
     // This is equivalent to INSERT INTO test VALUES ('a'), ('b'), ('c').
     // The behavior of write_table depends on the TableProvider's implementation
@@ -229,7 +242,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr
 
     df.clone()
         .write_parquet(
-            "./datafusion-examples/test_parquet/",
+            parquet_dir.to_str().unwrap(),
             DataFrameWriteOptions::new(),
             None,
         )
@@ -237,7 +250,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr
 
     df.clone()
         .write_csv(
-            "./datafusion-examples/test_csv/",
+            csv_dir.to_str().unwrap(),
             // DataFrameWriteOptions contains options which control how data is written
             // such as compression codec
             DataFrameWriteOptions::new(),
@@ -247,7 +260,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr
 
     df.clone()
         .write_json(
-            "./datafusion-examples/test_json/",
+            json_dir.to_str().unwrap(),
             DataFrameWriteOptions::new(),
             None,
         )
@@ -257,7 +270,7 @@ async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionEr
 }
 
 /// Use the DataFrame API to execute the following subquery:
-/// select c1,c2 from t1 where (select avg(t2.c2) from t2 where t1.c1 = t2.c1)>0 limit 3;
+/// select car, speed from t1 where (select avg(t2.speed) from t2 where t1.car = t2.car) > 0 limit 3;
 async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> {
     ctx.table("t1")
         .await?
@@ -265,14 +278,14 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> {
             scalar_subquery(Arc::new(
                 ctx.table("t2")
                     .await?
-                    .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
-                    .aggregate(vec![], vec![avg(col("t2.c2"))])?
-                    .select(vec![avg(col("t2.c2"))])?
+                    .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))?
+                    .aggregate(vec![], vec![avg(col("t2.speed"))])?
+                    .select(vec![avg(col("t2.speed"))])?
                     .into_unoptimized_plan(),
             ))
-            .gt(lit(0u8)),
+            .gt(lit(0.0)),
         )?
-        .select(vec![col("t1.c1"), col("t1.c2")])?
+        .select(vec![col("t1.car"), col("t1.speed")])?
         .limit(0, Some(3))?
         .show()
         .await?;
@@ -280,22 +293,24 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> {
 }
 
 /// Use the DataFrame API to execute the following subquery:
-/// select t1.c1, t1.c2 from t1 where t1.c2 in (select max(t2.c2) from t2 where t2.c1 > 0 ) limit 3;
+/// select t1.car, t1.speed from t1 where t1.speed in (select max(t2.speed) from t2 where t2.car = 'red') limit 3;
 async fn where_in_subquery(ctx: &SessionContext) -> Result<()> {
     ctx.table("t1")
         .await?
         .filter(in_subquery(
-            col("t1.c2"),
+            col("t1.speed"),
             Arc::new(
                 ctx.table("t2")
                     .await?
-                    .filter(col("t2.c1").gt(lit(ScalarValue::UInt8(Some(0)))))?
-                    .aggregate(vec![], vec![max(col("t2.c2"))])?
-                    .select(vec![max(col("t2.c2"))])?
+                    .filter(
+                        col("t2.car").eq(lit(ScalarValue::Utf8(Some("red".to_string())))),
+                    )?
+                    .aggregate(vec![], vec![max(col("t2.speed"))])?
+                    .select(vec![max(col("t2.speed"))])?
                     .into_unoptimized_plan(),
             ),
         ))?
-        .select(vec![col("t1.c1"), col("t1.c2")])?
+        .select(vec![col("t1.car"), col("t1.speed")])?
         .limit(0, Some(3))?
         .show()
         .await?;
@@ -303,31 +318,27 @@ async fn where_in_subquery(ctx: &SessionContext) -> Result<()> {
 }
 
 /// Use the DataFrame API to execute the following subquery:
-/// select t1.c1, t1.c2 from t1 where exists (select t2.c2 from t2 where t1.c1 = t2.c1) limit 3;
+/// select t1.car, t1.speed from t1 where exists (select t2.speed from t2 where t1.car = t2.car) limit 3;
 async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> {
     ctx.table("t1")
         .await?
         .filter(exists(Arc::new(
             ctx.table("t2")
                 .await?
-                .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
-                .select(vec![col("t2.c2")])?
+                .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))?
+                .select(vec![col("t2.speed")])?
                 .into_unoptimized_plan(),
         )))?
-        .select(vec![col("t1.c1"), col("t1.c2")])?
+        .select(vec![col("t1.car"), col("t1.speed")])?
         .limit(0, Some(3))?
         .show()
         .await?;
     Ok(())
 }
 
-async fn register_aggregate_test_data(name: &str, ctx: &SessionContext) -> Result<()> {
-    let testdata = datafusion::test_util::arrow_test_data();
-    ctx.register_csv(
-        name,
-        &format!("{testdata}/csv/aggregate_test_100.csv"),
-        CsvReadOptions::default(),
-    )
-    .await?;
+async fn register_cars_test_data(name: &str, ctx: &SessionContext) -> Result<()> {
+    let dataset = ExampleDataset::Cars;
+    ctx.register_csv(name, dataset.path_str()?, CsvReadOptions::default())
+        .await?;
     Ok(())
 }
diff --git a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs
new file mode 100644
index 0000000000000..b031225dc9b69
--- /dev/null
+++ b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs
@@ -0,0 +1,366 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+
+use arrow::array::{Array, Float64Array, StringViewArray};
+use datafusion::common::assert_batches_eq;
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use futures::StreamExt;
+
+/// This example shows how to convert query results into Rust structs by using
+/// the Arrow APIs to convert the results into Rust native types.
+///
+/// This is a bit tricky initially as the results are returned as columns stored
+/// as [ArrayRef]
+///
+/// [ArrayRef]: arrow::array::ArrayRef
+pub async fn deserialize_to_struct() -> Result<()> {
+    // Run a query that returns two columns of data
+    let ctx = SessionContext::new();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    ctx.register_parquet(
+        "cars",
+        parquet_temp.path_str()?,
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    let df = ctx
+        .sql("SELECT car, speed FROM cars ORDER BY speed LIMIT 50")
+        .await?;
+
+    // print out the results showing we have car and speed columns and a deterministic ordering
+    let results = df.clone().collect().await?;
+    assert_batches_eq!(
+        [
+            "+-------+-------+",
+            "| car   | speed |",
+            "+-------+-------+",
+            "| red   | 0.0   |",
+            "| red   | 1.0   |",
+            "| green | 2.0   |",
+            "| red   | 3.0   |",
+            "| red   | 7.0   |",
+            "| red   | 7.1   |",
+            "| red   | 7.2   |",
+            "| green | 8.0   |",
+            "| green | 10.0  |",
+            "| green | 10.3  |",
+            "| green | 10.4  |",
+            "| green | 10.5  |",
+            "| green | 11.0  |",
+            "| green | 12.0  |",
+            "| green | 14.0  |",
+            "| green | 15.0  |",
+            "| green | 15.1  |",
+            "| green | 15.2  |",
+            "| red   | 17.0  |",
+            "| red   | 18.0  |",
+            "| red   | 19.0  |",
+            "| red   | 20.0  |",
+            "| red   | 20.3  |",
+            "| red   | 21.4  |",
+            "| red   | 21.5  |",
+            "+-------+-------+",
+        ],
+        &results
+    );
+
+    // We will now convert the query results into a Rust struct
+    let mut stream = df.execute_stream().await?;
+    let mut list: Vec<Data> = vec![];
+
+    // DataFusion produces data in chunks called `RecordBatch`es which are
+    // typically 8000 rows each. This loop processes each `RecordBatch` as it is
+    // produced by the query plan and adds it to the list
+    while let Some(batch) = stream.next().await.transpose()? {
+        // Each `RecordBatch` has one or more columns. Each column is stored as
+        // an `ArrayRef`. To interact with data using Rust native types we need to
+        // convert these `ArrayRef`s into concrete array types using APIs from
+        // the arrow crate.
+
+        // In this case, we know that each batch has two columns of the  Arrow
+        // types StringView and Float64, so first we cast the two columns to the
+        // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast).:
+        let car_col = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringViewArray>()
+            .expect("car column must be Utf8View");
+
+        let speed_col = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .expect("speed column must be Float64");
+
+        // With PrimitiveArrays, we can access to the values as native Rust
+        // types String and f64, and forming the desired `Data` structs
+        for i in 0..batch.num_rows() {
+            let car = if car_col.is_null(i) {
+                None
+            } else {
+                Some(car_col.value(i).to_string())
+            };
+
+            let speed = if speed_col.is_null(i) {
+                None
+            } else {
+                Some(speed_col.value(i))
+            };
+
+            list.push(Data { car, speed });
+        }
+    }
+
+    // Finally, we have the results in the list of Rust structs
+    let res = format!("{list:#?}");
+    assert_eq!(
+        res,
+        r#"[
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            0.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            1.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            2.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            3.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            7.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            7.1,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            7.2,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            8.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            10.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            10.3,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            10.4,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            10.5,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            11.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            12.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            14.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            15.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            15.1,
+        ),
+    },
+    Data {
+        car: Some(
+            "green",
+        ),
+        speed: Some(
+            15.2,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            17.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            18.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            19.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            20.0,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            20.3,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            21.4,
+        ),
+    },
+    Data {
+        car: Some(
+            "red",
+        ),
+        speed: Some(
+            21.5,
+        ),
+    },
+]"#
+    );
+
+    let speed_green_sum: f64 = list
+        .iter()
+        .filter(|data| data.car.as_deref() == Some("green"))
+        .filter_map(|data| data.speed)
+        .sum();
+    let speed_red_sum: f64 = list
+        .iter()
+        .filter(|data| data.car.as_deref() == Some("red"))
+        .filter_map(|data| data.speed)
+        .sum();
+    assert_eq!(speed_green_sum, 133.5);
+    assert_eq!(speed_red_sum, 162.5);
+
+    Ok(())
+}
+
+/// This is target struct where we want the query results.
+#[derive(Debug)]
+struct Data {
+    car: Option<String>,
+    speed: Option<f64>,
+}
diff --git a/datafusion-examples/examples/dataframe/main.rs b/datafusion-examples/examples/dataframe/main.rs
new file mode 100644
index 0000000000000..25b5377d38239
--- /dev/null
+++ b/datafusion-examples/examples/dataframe/main.rs
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These are core DataFrame API usage
+//!
+//! These examples demonstrate core DataFrame API usage.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example dataframe -- [all|dataframe|deserialize_to_struct|cache_factory]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `cache_factory`  
+//!   (file: cache_factory.rs, desc: Custom lazy caching for DataFrames using `CacheFactory`)
+//
+//! - `dataframe`
+//!   (file: dataframe.rs, desc: Query DataFrames from various sources and write output)
+//!
+//! - `deserialize_to_struct`
+//!   (file: deserialize_to_struct.rs, desc: Convert Arrow arrays into Rust structs)
+
+mod cache_factory;
+mod dataframe;
+mod deserialize_to_struct;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Dataframe,
+    DeserializeToStruct,
+    CacheFactory,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "dataframe";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Dataframe => {
+                dataframe::dataframe_example().await?;
+            }
+            ExampleKind::DeserializeToStruct => {
+                deserialize_to_struct::deserialize_to_struct().await?;
+            }
+            ExampleKind::CacheFactory => {
+                cache_factory::cache_dataframe_with_custom_logic().await?;
+            }
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/deserialize_to_struct.rs b/datafusion-examples/examples/deserialize_to_struct.rs
deleted file mode 100644
index d6655b3b654f9..0000000000000
--- a/datafusion-examples/examples/deserialize_to_struct.rs
+++ /dev/null
@@ -1,150 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow::array::{AsArray, PrimitiveArray};
-use arrow::datatypes::{Float64Type, Int32Type};
-use datafusion::common::assert_batches_eq;
-use datafusion::error::Result;
-use datafusion::prelude::*;
-use futures::StreamExt;
-
-/// This example shows how to convert query results into Rust structs by using
-/// the Arrow APIs to convert the results into Rust native types.
-///
-/// This is a bit tricky initially as the results are returned as columns stored
-/// as [ArrayRef]
-///
-/// [ArrayRef]: arrow::array::ArrayRef
-#[tokio::main]
-async fn main() -> Result<()> {
-    // Run a query that returns two columns of data
-    let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
-    ctx.register_parquet(
-        "alltypes_plain",
-        &format!("{testdata}/alltypes_plain.parquet"),
-        ParquetReadOptions::default(),
-    )
-    .await?;
-    let df = ctx
-        .sql("SELECT int_col, double_col FROM alltypes_plain")
-        .await?;
-
-    // print out the results showing we have an int32 and a float64 column
-    let results = df.clone().collect().await?;
-    assert_batches_eq!(
-        [
-            "+---------+------------+",
-            "| int_col | double_col |",
-            "+---------+------------+",
-            "| 0       | 0.0        |",
-            "| 1       | 10.1       |",
-            "| 0       | 0.0        |",
-            "| 1       | 10.1       |",
-            "| 0       | 0.0        |",
-            "| 1       | 10.1       |",
-            "| 0       | 0.0        |",
-            "| 1       | 10.1       |",
-            "+---------+------------+",
-        ],
-        &results
-    );
-
-    // We will now convert the query results into a Rust struct
-    let mut stream = df.execute_stream().await?;
-    let mut list = vec![];
-
-    // DataFusion produces data in chunks called `RecordBatch`es which are
-    // typically 8000 rows each. This loop processes each `RecordBatch` as it is
-    // produced by the query plan and adds it to the list
-    while let Some(b) = stream.next().await.transpose()? {
-        // Each `RecordBatch` has one or more columns. Each column is stored as
-        // an `ArrayRef`. To interact with data using Rust native types we need to
-        // convert these `ArrayRef`s into concrete array types using APIs from
-        // the arrow crate.
-
-        // In this case, we know that each batch has two columns of the  Arrow
-        // types Int32 and Float64, so first we cast the two columns to the
-        // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast).:
-        let int_col: &PrimitiveArray<Int32Type> = b.column(0).as_primitive();
-        let float_col: &PrimitiveArray<Float64Type> = b.column(1).as_primitive();
-
-        // With PrimitiveArrays, we can access to the values as native Rust
-        // types i32 and f64, and forming the desired `Data` structs
-        for (i, f) in int_col.values().iter().zip(float_col.values()) {
-            list.push(Data {
-                int_col: *i,
-                double_col: *f,
-            })
-        }
-    }
-
-    // Finally, we have the results in the list of Rust structs
-    let res = format!("{list:#?}");
-    assert_eq!(
-        res,
-        r#"[
-    Data {
-        int_col: 0,
-        double_col: 0.0,
-    },
-    Data {
-        int_col: 1,
-        double_col: 10.1,
-    },
-    Data {
-        int_col: 0,
-        double_col: 0.0,
-    },
-    Data {
-        int_col: 1,
-        double_col: 10.1,
-    },
-    Data {
-        int_col: 0,
-        double_col: 0.0,
-    },
-    Data {
-        int_col: 1,
-        double_col: 10.1,
-    },
-    Data {
-        int_col: 0,
-        double_col: 0.0,
-    },
-    Data {
-        int_col: 1,
-        double_col: 10.1,
-    },
-]"#
-    );
-
-    // Use the fields in the struct to avoid clippy complaints
-    let int_sum = list.iter().fold(0, |acc, x| acc + x.int_col);
-    let double_sum = list.iter().fold(0.0, |acc, x| acc + x.double_col);
-    assert_eq!(int_sum, 4);
-    assert_eq!(double_sum, 40.4);
-
-    Ok(())
-}
-
-/// This is target struct where we want the query results.
-#[derive(Debug)]
-struct Data {
-    int_col: i32,
-    double_col: f64,
-}
diff --git a/datafusion-examples/examples/execution_monitoring/main.rs b/datafusion-examples/examples/execution_monitoring/main.rs
new file mode 100644
index 0000000000000..8f80c36929ca2
--- /dev/null
+++ b/datafusion-examples/examples/execution_monitoring/main.rs
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These examples of memory and performance management
+//!
+//! These examples demonstrate memory and performance management.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example execution_monitoring -- [all|mem_pool_exec_plan|mem_pool_tracking|tracing]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `mem_pool_exec_plan`
+//!   (file: memory_pool_execution_plan.rs, desc: Memory-aware ExecutionPlan with spilling)
+//!
+//! - `mem_pool_tracking`
+//!   (file: memory_pool_tracking.rs, desc: Demonstrates memory tracking)
+//!
+//! - `tracing`
+//!   (file: tracing.rs, desc: Demonstrates tracing integration)
+
+mod memory_pool_execution_plan;
+mod memory_pool_tracking;
+mod tracing;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    MemPoolExecPlan,
+    MemPoolTracking,
+    Tracing,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "execution_monitoring";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::MemPoolExecPlan => {
+                memory_pool_execution_plan::memory_pool_execution_plan().await?
+            }
+            ExampleKind::MemPoolTracking => {
+                memory_pool_tracking::mem_pool_tracking().await?
+            }
+            ExampleKind::Tracing => tracing::tracing().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
new file mode 100644
index 0000000000000..1440347d4413d
--- /dev/null
+++ b/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example demonstrates how to implement custom ExecutionPlans that properly
+//! use memory tracking through TrackConsumersPool.
+//!
+//! This shows the pattern for implementing memory-aware operators that:
+//! - Register memory consumers with the pool
+//! - Reserve memory before allocating
+//! - Handle memory pressure by spilling to disk
+//! - Release memory when done
+
+use arrow::record_batch::RecordBatch;
+use arrow_schema::SchemaRef;
+use datafusion::common::record_batch;
+use datafusion::common::tree_node::TreeNodeRecursion;
+use datafusion::common::{exec_datafusion_err, internal_err};
+use datafusion::datasource::{DefaultTableSource, memory::MemTable};
+use datafusion::error::Result;
+use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::execution::{SendableRecordBatchStream, TaskContext};
+use datafusion::logical_expr::LogicalPlanBuilder;
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+};
+use datafusion::prelude::*;
+use futures::stream::{StreamExt, TryStreamExt};
+use std::any::Any;
+use std::fmt;
+use std::sync::Arc;
+
+/// Shows how to implement memory-aware ExecutionPlan with memory reservation and spilling
+pub async fn memory_pool_execution_plan() -> Result<()> {
+    println!("=== DataFusion ExecutionPlan Memory Tracking Example ===\n");
+
+    // Set up a runtime with memory tracking
+    // Set a low memory limit to trigger spilling on the second batch
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(15000, 1.0) // Allow only enough for 1 batch at once
+        .build_arc()?;
+
+    let config = SessionConfig::new().with_coalesce_batches(false);
+    let ctx = SessionContext::new_with_config_rt(config, runtime.clone());
+
+    // Create smaller batches to ensure we get multiple RecordBatches from the scan
+    // Make each batch smaller than the memory limit to force multiple batches
+    let batch1 = record_batch!(
+        ("id", Int32, vec![1; 800]),
+        ("name", Utf8, vec!["Alice"; 800])
+    )?;
+
+    let batch2 = record_batch!(
+        ("id", Int32, vec![2; 800]),
+        ("name", Utf8, vec!["Bob"; 800])
+    )?;
+
+    let batch3 = record_batch!(
+        ("id", Int32, vec![3; 800]),
+        ("name", Utf8, vec!["Charlie"; 800])
+    )?;
+
+    let batch4 = record_batch!(
+        ("id", Int32, vec![4; 800]),
+        ("name", Utf8, vec!["David"; 800])
+    )?;
+
+    let schema = batch1.schema();
+
+    // Create a single MemTable with all batches in one partition to preserve order but ensure streaming
+    let mem_table = Arc::new(MemTable::try_new(
+        Arc::clone(&schema),
+        vec![vec![batch1, batch2, batch3, batch4]], // Single partition with multiple batches
+    )?);
+
+    // Build logical plan with a single scan that will yield multiple batches
+    let table_source = Arc::new(DefaultTableSource::new(mem_table));
+    let logical_plan =
+        LogicalPlanBuilder::scan("multi_batch_table", table_source, None)?.build()?;
+
+    // Convert to physical plan
+    let physical_plan = ctx.state().create_physical_plan(&logical_plan).await?;
+
+    println!("Example: Custom Memory-Aware BufferingExecutionPlan");
+    println!("---------------------------------------------------");
+
+    // Wrap our input plan with our custom BufferingExecutionPlan
+    let buffering_plan = Arc::new(BufferingExecutionPlan::new(schema, physical_plan));
+
+    // Create a task context from our runtime
+    let task_ctx = Arc::new(TaskContext::default().with_runtime(runtime));
+
+    // Execute the plan directly to demonstrate memory tracking
+    println!("Executing BufferingExecutionPlan with memory tracking...");
+    println!("Memory limit: 15000 bytes - should trigger spill on later batches\n");
+
+    let stream = buffering_plan.execute(0, task_ctx.clone())?;
+    let _results: Vec<RecordBatch> = stream.try_collect().await?;
+
+    println!("\nSuccessfully executed BufferingExecutionPlan!");
+
+    println!("\nThe BufferingExecutionPlan processed 4 input batches and");
+    println!("demonstrated memory tracking with spilling behavior when the");
+    println!("memory limit was exceeded by later batches.");
+    println!("Check the console output above to see the spill messages.");
+
+    Ok(())
+}
+
+/// Example of an external batch bufferer that uses memory reservation.
+///
+/// It's a simple example which spills all existing data to disk
+/// whenever the memory limit is reached.
+struct ExternalBatchBufferer {
+    buffer: Vec<u8>,
+    reservation: MemoryReservation,
+    spill_count: usize,
+}
+
+impl ExternalBatchBufferer {
+    fn new(reservation: MemoryReservation) -> Self {
+        Self {
+            buffer: Vec::new(),
+            reservation,
+            spill_count: 0,
+        }
+    }
+
+    #[expect(clippy::needless_pass_by_value)]
+    fn add_batch(&mut self, batch_data: Vec<u8>) -> Result<()> {
+        let additional_memory = batch_data.len();
+
+        // Try to reserve memory before allocating
+        if self.reservation.try_grow(additional_memory).is_err() {
+            // Memory limit reached - handle by spilling
+            println!(
+                "Memory limit reached, spilling {} bytes to disk",
+                self.buffer.len()
+            );
+            self.spill_to_disk()?;
+
+            // Try again after spilling
+            self.reservation.try_grow(additional_memory)?;
+        }
+
+        self.buffer.extend_from_slice(&batch_data);
+        println!(
+            "Added batch of {} bytes, total buffered: {} bytes",
+            additional_memory,
+            self.buffer.len()
+        );
+        Ok(())
+    }
+
+    fn spill_to_disk(&mut self) -> Result<()> {
+        // Simulate writing buffer to disk
+        self.spill_count += 1;
+        println!(
+            "Spill #{}: Writing {} bytes to disk",
+            self.spill_count,
+            self.buffer.len()
+        );
+
+        // Free the memory after spilling
+        let freed_bytes = self.buffer.len();
+        self.buffer.clear();
+        self.reservation.shrink(freed_bytes);
+
+        Ok(())
+    }
+
+    fn finish(&mut self) -> Vec<u8> {
+        let result = std::mem::take(&mut self.buffer);
+        // Free the memory when done
+        self.reservation.free();
+        println!("Finished processing, released {} bytes", result.len());
+        result
+    }
+}
+
+/// Example of an ExecutionPlan that uses the ExternalBatchBufferer.
+#[derive(Debug)]
+struct BufferingExecutionPlan {
+    schema: SchemaRef,
+    input: Arc<dyn ExecutionPlan>,
+    properties: Arc<PlanProperties>,
+}
+
+impl BufferingExecutionPlan {
+    fn new(schema: SchemaRef, input: Arc<dyn ExecutionPlan>) -> Self {
+        let properties = input.properties().clone();
+
+        Self {
+            schema,
+            input,
+            properties,
+        }
+    }
+}
+
+impl DisplayAs for BufferingExecutionPlan {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "BufferingExecutionPlan")
+    }
+}
+
+impl ExecutionPlan for BufferingExecutionPlan {
+    fn name(&self) -> &'static str {
+        "BufferingExecutionPlan"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if children.len() == 1 {
+            Ok(Arc::new(BufferingExecutionPlan::new(
+                self.schema.clone(),
+                children[0].clone(),
+            )))
+        } else {
+            internal_err!("BufferingExecutionPlan must have exactly one child")
+        }
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Register memory consumer with the context's memory pool
+        let reservation = MemoryConsumer::new("MyExternalBatchBufferer")
+            .with_can_spill(true)
+            .register(context.memory_pool());
+
+        // Incoming stream of batches
+        let mut input_stream = self.input.execute(partition, context)?;
+
+        // Process the stream and collect all batches
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            self.schema(),
+            futures::stream::once(async move {
+                let mut operator = ExternalBatchBufferer::new(reservation);
+
+                while let Some(batch) = input_stream.next().await {
+                    let batch = batch?;
+
+                    // Convert RecordBatch to bytes for this example
+                    let batch_data = vec![1u8; batch.get_array_memory_size()];
+
+                    operator.add_batch(batch_data)?;
+                }
+
+                // Finish processing and get results
+                let _final_result = operator.finish();
+                // In a real implementation, you would convert final_result back to RecordBatches
+
+                // Since this is a simplified example, return an empty batch
+                // In a real implementation, you would create a batch stream from the processed results
+                record_batch!(("id", Int32, vec![5]), ("name", Utf8, vec!["Eve"]))
+                    .map_err(|e| {
+                        exec_datafusion_err!("Failed to create final RecordBatch: {e}")
+                    })
+            }),
+        )))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.properties.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+}
diff --git a/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs
new file mode 100644
index 0000000000000..af3031c690fa3
--- /dev/null
+++ b/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example demonstrates how to use TrackConsumersPool for memory tracking and debugging.
+//!
+//! The TrackConsumersPool provides enhanced error messages that show the top memory consumers
+//! when memory allocation fails, making it easier to debug memory issues in DataFusion queries.
+//!
+//! # Examples
+//!
+//! * [`automatic_usage_example`]: Shows how to use RuntimeEnvBuilder to automatically enable memory tracking
+
+use datafusion::error::Result;
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::prelude::*;
+
+/// Demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages
+pub async fn mem_pool_tracking() -> Result<()> {
+    println!("=== DataFusion Memory Pool Tracking Example ===\n");
+
+    // Example 1: Automatic Usage with RuntimeEnvBuilder
+    automatic_usage_example().await?;
+
+    Ok(())
+}
+
+/// Example 1: Automatic Usage with RuntimeEnvBuilder
+///
+/// This shows the recommended way to use TrackConsumersPool through RuntimeEnvBuilder,
+/// which automatically creates a TrackConsumersPool with sensible defaults.
+async fn automatic_usage_example() -> Result<()> {
+    println!("Example 1: Automatic Usage with RuntimeEnvBuilder");
+    println!("------------------------------------------------");
+
+    // Success case: Create a runtime with reasonable memory limit
+    println!("Success case: Normal operation with sufficient memory");
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(5_000_000, 1.0) // 5MB, 100% utilization
+        .build_arc()?;
+
+    let config = SessionConfig::new();
+    let ctx = SessionContext::new_with_config_rt(config, runtime);
+
+    // Create a simple table for demonstration
+    ctx.sql("CREATE TABLE test AS VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+        .await?
+        .collect()
+        .await?;
+
+    println!("✓ Created table with memory tracking enabled");
+
+    // Run a simple query to show it works
+    let results = ctx.sql("SELECT * FROM test").await?.collect().await?;
+    println!(
+        "✓ Query executed successfully. Found {} rows",
+        results.len()
+    );
+
+    println!("\n{}", "-".repeat(50));
+
+    // Error case: Create a runtime with low memory limit to trigger errors
+    println!("Error case: Triggering memory limit error with detailed error messages");
+
+    // Use a WITH query that generates data and then processes it to trigger memory usage
+    match ctx.sql("
+        WITH large_dataset AS (
+            SELECT
+                column1 as id,
+                column1 * 2 as doubled,
+                repeat('data_', 20) || column1 as text_field,
+                column1 * column1 as squared
+            FROM generate_series(1, 2000) as t(column1)
+        ),
+        aggregated AS (
+            SELECT
+                id,
+                doubled,
+                text_field,
+                squared,
+                sum(doubled) OVER (ORDER BY id ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) as running_sum
+            FROM large_dataset
+        )
+        SELECT
+            a1.id,
+            a1.text_field,
+            a2.text_field as text_field2,
+            a1.running_sum + a2.running_sum as combined_sum
+        FROM aggregated a1
+        JOIN aggregated a2 ON a1.id = a2.id - 1
+        ORDER BY a1.id
+    ").await?.collect().await {
+        Ok(results) => panic!("Should not succeed! Yet got {} batches", results.len()),
+        Err(e) => {
+            println!("✓ Expected memory limit error during data processing:");
+            println!("Error: {e}");
+            /* Example error message:
+                Error: Not enough memory to continue external sort. Consider increasing the memory limit config: 'datafusion.runtime.memory_limit',
+                or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'.
+                caused by
+                    Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
+                    ExternalSorterMerge[3]#112(can spill: false) consumed 10.0 MB, peak 10.0 MB,
+                    ExternalSorterMerge[10]#147(can spill: false) consumed 10.0 MB, peak 10.0 MB,
+                    ExternalSorter[1]#93(can spill: true) consumed 69.0 KB, peak 69.0 KB,
+                    ExternalSorter[13]#155(can spill: true) consumed 67.6 KB, peak 67.6 KB,
+                    ExternalSorter[8]#140(can spill: true) consumed 67.2 KB, peak 67.2 KB.
+                Error: Failed to allocate additional 10.0 MB for ExternalSorterMerge[0] with 0.0 B already allocated for this reservation - 7.1 MB remain available for the total pool
+             */
+        }
+    }
+
+    println!("\nNote: The error message above shows which memory consumers");
+    println!("were using the most memory when the limit was exceeded.");
+
+    Ok(())
+}
diff --git a/datafusion-examples/examples/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs
similarity index 82%
rename from datafusion-examples/examples/tracing.rs
rename to datafusion-examples/examples/execution_monitoring/tracing.rs
index 334ee0f4e5686..172c1ca83b3bd 100644
--- a/datafusion-examples/examples/tracing.rs
+++ b/datafusion-examples/examples/execution_monitoring/tracing.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! This example demonstrates the tracing injection feature for the DataFusion runtime.
 //! Tasks spawned on new threads behave differently depending on whether a tracer is injected.
 //! The log output clearly distinguishes the two cases.
@@ -49,20 +51,21 @@
 //! 10:29:40.809  INFO                 main ThreadId(01) tracing: ***** WITH tracer: Non-main tasks DID inherit the `run_instrumented_query` span *****
 //! ```
 
-use datafusion::common::runtime::{set_join_set_tracer, JoinSetTracer};
+use std::any::Any;
+use std::sync::Arc;
+
+use datafusion::common::runtime::{JoinSetTracer, set_join_set_tracer};
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
 use datafusion::error::Result;
 use datafusion::prelude::*;
-use datafusion::test_util::parquet_test_data;
-use futures::future::BoxFuture;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use futures::FutureExt;
-use std::any::Any;
-use std::sync::Arc;
-use tracing::{info, instrument, Instrument, Level, Span};
+use futures::future::BoxFuture;
+use tracing::{Instrument, Level, Span, info, instrument};
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Demonstrates the tracing injection feature for the DataFusion runtime
+pub async fn tracing() -> Result<()> {
     // Initialize tracing subscriber with thread info.
     tracing_subscriber::fmt()
         .with_thread_ids(true)
@@ -73,7 +76,9 @@ async fn main() -> Result<()> {
     // Run query WITHOUT tracer injection.
     info!("***** RUNNING WITHOUT INJECTED TRACER *****");
     run_instrumented_query().await?;
-    info!("***** WITHOUT tracer: `tokio-runtime-worker` tasks did NOT inherit the `run_instrumented_query` span *****");
+    info!(
+        "***** WITHOUT tracer: `tokio-runtime-worker` tasks did NOT inherit the `run_instrumented_query` span *****"
+    );
 
     // Inject custom tracer so tasks run in the current span.
     info!("Injecting custom tracer...");
@@ -82,7 +87,9 @@ async fn main() -> Result<()> {
     // Run query WITH tracer injection.
     info!("***** RUNNING WITH INJECTED TRACER *****");
     run_instrumented_query().await?;
-    info!("***** WITH tracer: `tokio-runtime-worker` tasks DID inherit the `run_instrumented_query` span *****");
+    info!(
+        "***** WITH tracer: `tokio-runtime-worker` tasks DID inherit the `run_instrumented_query` span *****"
+    );
 
     Ok(())
 }
@@ -120,18 +127,27 @@ async fn run_instrumented_query() -> Result<()> {
     info!("Starting query execution");
 
     let ctx = SessionContext::new();
-    let test_data = parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let file_format = ParquetFormat::default().with_enable_pruning(true);
-    let listing_options = ListingOptions::new(Arc::new(file_format))
-        .with_file_extension("alltypes_tiny_pages_plain.parquet");
+    let listing_options =
+        ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet");
 
-    let table_path = format!("file://{test_data}/");
-    info!("Registering table 'alltypes' from {}", table_path);
-    ctx.register_listing_table("alltypes", &table_path, listing_options, None, None)
-        .await
-        .expect("Failed to register table");
+    info!("Registering table 'cars' from {}", parquet_temp.path_str()?);
+    ctx.register_listing_table(
+        "cars",
+        parquet_temp.path_str()?,
+        listing_options,
+        None,
+        None,
+    )
+    .await
+    .expect("Failed to register table");
 
-    let sql = "SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col";
+    let sql = "SELECT COUNT(*), car, sum(speed) FROM cars GROUP BY car";
     info!(sql, "Executing SQL query");
     let result = ctx.sql(sql).await?.collect().await?;
     info!("Query complete: {} batches returned", result.len());
diff --git a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs
similarity index 87%
rename from datafusion-examples/examples/external_dependency/dataframe-to-s3.rs
rename to datafusion-examples/examples/external_dependency/dataframe_to_s3.rs
index e75ba5dd5328a..fdb8a3c9c051a 100644
--- a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs
+++ b/datafusion-examples/examples/external_dependency/dataframe_to_s3.rs
@@ -15,12 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::env;
 use std::sync::Arc;
 
 use datafusion::dataframe::DataFrameWriteOptions;
-use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::file_format::FileFormat;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
 use datafusion::error::Result;
 use datafusion::prelude::*;
@@ -28,14 +30,18 @@ use datafusion::prelude::*;
 use object_store::aws::AmazonS3Builder;
 use url::Url;
 
-/// This example demonstrates querying data from AmazonS3 and writing
-/// the result of a query back to AmazonS3
-#[tokio::main]
-async fn main() -> Result<()> {
+/// This example demonstrates querying data from Amazon S3 and writing
+/// the result of a query back to Amazon S3.
+///
+/// The following environment variables must be defined:
+///
+/// - AWS_ACCESS_KEY_ID
+/// - AWS_SECRET_ACCESS_KEY
+pub async fn dataframe_to_s3() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
-    //enter region and bucket to which your credentials have GET and PUT access
+    // enter region and bucket to which your credentials have GET and PUT access
     let region = "<bucket-region-here>";
     let bucket_name = "<bucket-name-here>";
 
@@ -66,13 +72,13 @@ async fn main() -> Result<()> {
         .write_parquet(&out_path, DataFrameWriteOptions::new(), None)
         .await?;
 
-    //write as JSON to s3
+    // write as JSON to s3
     let json_out = format!("s3://{bucket_name}/json_out");
     df.clone()
         .write_json(&json_out, DataFrameWriteOptions::new(), None)
         .await?;
 
-    //write as csv to s3
+    // write as csv to s3
     let csv_out = format!("s3://{bucket_name}/csv_out");
     df.write_csv(&csv_out, DataFrameWriteOptions::new(), None)
         .await?;
diff --git a/datafusion-examples/examples/external_dependency/main.rs b/datafusion-examples/examples/external_dependency/main.rs
new file mode 100644
index 0000000000000..447e7d38bdd5b
--- /dev/null
+++ b/datafusion-examples/examples/external_dependency/main.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These are using data from Amazon S3 examples
+//!
+//! These examples demonstrate how to work with data from Amazon S3.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example external_dependency -- [all|dataframe_to_s3|query_aws_s3]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `dataframe_to_s3`
+//!   (file: dataframe_to_s3.rs, desc: Query DataFrames and write results to S3)
+//!
+//! - `query_aws_s3`
+//!   (file: query_aws_s3.rs, desc: Query S3-backed data using object_store)
+
+mod dataframe_to_s3;
+mod query_aws_s3;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    DataframeToS3,
+    QueryAwsS3,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "external_dependency";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::DataframeToS3 => dataframe_to_s3::dataframe_to_s3().await?,
+            ExampleKind::QueryAwsS3 => query_aws_s3::query_aws_s3().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query_aws_s3.rs
similarity index 90%
rename from datafusion-examples/examples/external_dependency/query-aws-s3.rs
rename to datafusion-examples/examples/external_dependency/query_aws_s3.rs
index da2d7e4879f99..63507bb3eed11 100644
--- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs
+++ b/datafusion-examples/examples/external_dependency/query_aws_s3.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use datafusion::error::Result;
 use datafusion::prelude::*;
 use object_store::aws::AmazonS3Builder;
@@ -22,15 +24,13 @@ use std::env;
 use std::sync::Arc;
 use url::Url;
 
-/// This example demonstrates querying data in an S3 bucket.
+/// This example demonstrates querying data in a public S3 bucket
+/// (the NYC TLC open dataset: `s3://nyc-tlc`).
 ///
 /// The following environment variables must be defined:
-///
-/// - AWS_ACCESS_KEY_ID
-/// - AWS_SECRET_ACCESS_KEY
-///
-#[tokio::main]
-async fn main() -> Result<()> {
+/// - `AWS_ACCESS_KEY_ID`
+/// - `AWS_SECRET_ACCESS_KEY`
+pub async fn query_aws_s3() -> Result<()> {
     let ctx = SessionContext::new();
 
     // the region must be set to the region where the bucket exists until the following
diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml
index e9c0c5b43d682..e2d0e3fa6744d 100644
--- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml
+++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml
@@ -28,6 +28,9 @@ datafusion = { workspace = true }
 datafusion-ffi = { workspace = true }
 ffi_module_interface = { path = "../ffi_module_interface" }
 
+[lints]
+workspace = true
+
 [lib]
 name = "ffi_example_table_provider"
 crate-type = ["cdylib", 'rlib']
diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs
index a83f15926f054..eb217ef9e4832 100644
--- a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs
+++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs
@@ -21,6 +21,7 @@ use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait};
 use arrow::array::RecordBatch;
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::{common::record_batch, datasource::MemTable};
+use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
 use datafusion_ffi::table_provider::FFI_TableProvider;
 use ffi_module_interface::{TableProviderModule, TableProviderModuleRef};
 
@@ -34,7 +35,9 @@ fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch {
 
 /// Here we only wish to create a simple table provider as an example.
 /// We create an in-memory table and convert it to it's FFI counterpart.
-extern "C" fn construct_simple_table_provider() -> FFI_TableProvider {
+extern "C" fn construct_simple_table_provider(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableProvider {
     let schema = Arc::new(Schema::new(vec![
         Field::new("a", DataType::Int32, true),
         Field::new("b", DataType::Float64, true),
@@ -50,7 +53,7 @@ extern "C" fn construct_simple_table_provider() -> FFI_TableProvider {
 
     let table_provider = MemTable::try_new(schema, vec![batches]).unwrap();
 
-    FFI_TableProvider::new(Arc::new(table_provider), true, None)
+    FFI_TableProvider::new_with_ffi_codec(Arc::new(table_provider), true, None, codec)
 }
 
 #[export_root_module]
diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml
index 612a219324763..fe4902711241e 100644
--- a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml
+++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml
@@ -18,9 +18,12 @@
 [package]
 name = "ffi_module_interface"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 publish = false
 
+[lints]
+workspace = true
+
 [dependencies]
 abi_stable = "0.11.3"
 datafusion-ffi = { workspace = true }
diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs
index 88690e9297135..3b2b9e1871dae 100644
--- a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs
+++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use abi_stable::{
-    declare_root_module_statics,
+    StableAbi, declare_root_module_statics,
     library::{LibraryError, RootModule},
     package_version_strings,
     sabi_types::VersionStrings,
-    StableAbi,
 };
+use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
 use datafusion_ffi::table_provider::FFI_TableProvider;
 
 #[repr(C)]
@@ -34,7 +34,8 @@ use datafusion_ffi::table_provider::FFI_TableProvider;
 /// how a user may wish to separate these concerns.
 pub struct TableProviderModule {
     /// Constructs the table provider
-    pub create_table: extern "C" fn() -> FFI_TableProvider,
+    pub create_table:
+        extern "C" fn(codec: FFI_LogicalExtensionCodec) -> FFI_TableProvider,
 }
 
 impl RootModule for TableProviderModuleRef {
diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml
index 028a366aab1c0..8d7434dca211b 100644
--- a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml
+++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml
@@ -18,9 +18,12 @@
 [package]
 name = "ffi_module_loader"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 publish = false
 
+[lints]
+workspace = true
+
 [dependencies]
 abi_stable = "0.11.3"
 datafusion = { workspace = true }
diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs
index 6e376ca866e8f..8ce5b156df3b1 100644
--- a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs
+++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs
@@ -22,8 +22,10 @@ use datafusion::{
     prelude::SessionContext,
 };
 
-use abi_stable::library::{development_utils::compute_library_path, RootModule};
-use datafusion_ffi::table_provider::ForeignTableProvider;
+use abi_stable::library::{RootModule, development_utils::compute_library_path};
+use datafusion::datasource::TableProvider;
+use datafusion::execution::TaskContextProvider;
+use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
 use ffi_module_interface::TableProviderModuleRef;
 
 #[tokio::main]
@@ -39,6 +41,11 @@ async fn main() -> Result<()> {
         TableProviderModuleRef::load_from_directory(&library_path)
             .map_err(|e| DataFusionError::External(Box::new(e)))?;
 
+    let ctx = Arc::new(SessionContext::new());
+    let codec = FFI_LogicalExtensionCodec::new_default(
+        &(Arc::clone(&ctx) as Arc<dyn TaskContextProvider>),
+    );
+
     // By calling the code below, the table provided will be created within
     // the module's code.
     let ffi_table_provider =
@@ -46,16 +53,14 @@ async fn main() -> Result<()> {
             .create_table()
             .ok_or(DataFusionError::NotImplemented(
                 "External table provider failed to implement create_table".to_string(),
-            ))?();
+            ))?(codec);
 
     // In order to access the table provider within this executable, we need to
-    // turn it into a `ForeignTableProvider`.
-    let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into();
-
-    let ctx = SessionContext::new();
+    // turn it into a `TableProvider`.
+    let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_table_provider).into();
 
     // Display the data to show the full cycle works.
-    ctx.register_table("external_table", Arc::new(foreign_table_provider))?;
+    ctx.register_table("external_table", foreign_table_provider)?;
     let df = ctx.table("external_table").await?;
     df.show().await?;
 
diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/client.rs
similarity index 78%
rename from datafusion-examples/examples/flight/flight_client.rs
rename to datafusion-examples/examples/flight/client.rs
index e3237284b4307..8f6856a4e4849 100644
--- a/datafusion-examples/examples/flight/flight_client.rs
+++ b/datafusion-examples/examples/flight/client.rs
@@ -15,32 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use datafusion::arrow::datatypes::Schema;
-
 use arrow_flight::flight_descriptor;
 use arrow_flight::flight_service_client::FlightServiceClient;
 use arrow_flight::utils::flight_data_to_arrow_batch;
 use arrow_flight::{FlightDescriptor, Ticket};
+use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::util::pretty;
+use datafusion::prelude::SessionContext;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use tonic::transport::Endpoint;
 
 /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for
 /// Parquet files and executing SQL queries against them on a remote server.
 /// This example is run along-side the example `flight_server`.
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let testdata = datafusion::test_util::parquet_test_data();
+pub async fn client() -> Result<(), Box<dyn std::error::Error>> {
+    let ctx = SessionContext::new();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
 
     // Create Flight client
-    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;
+    let endpoint = Endpoint::new("http://localhost:50051")?;
+    let channel = endpoint.connect().await?;
+    let mut client = FlightServiceClient::new(channel);
 
     // Call get_schema to get the schema of a Parquet file
     let request = tonic::Request::new(FlightDescriptor {
         r#type: flight_descriptor::DescriptorType::Path as i32,
         cmd: Default::default(),
-        path: vec![format!("{testdata}/alltypes_plain.parquet")],
+        path: vec![format!("{}", parquet_temp.path_str()?)],
     });
 
     let schema_result = client.get_schema(request).await?.into_inner();
@@ -49,7 +58,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // Call do_get to execute a SQL query and receive results
     let request = tonic::Request::new(Ticket {
-        ticket: "SELECT id FROM alltypes_plain".into(),
+        ticket: "SELECT car FROM cars".into(),
     });
 
     let mut stream = client.do_get(request).await?.into_inner();
diff --git a/datafusion-examples/examples/flight/main.rs b/datafusion-examples/examples/flight/main.rs
new file mode 100644
index 0000000000000..426e806486f70
--- /dev/null
+++ b/datafusion-examples/examples/flight/main.rs
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Arrow Flight Examples
+//!
+//! These examples demonstrate Arrow Flight usage.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example flight -- [all|client|server|sql_server]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!   Note: The Flight server must be started in a separate process
+//!   before running the `client` example. Therefore, running `all` will
+//!   not produce a full server+client workflow automatically.
+//!
+//! - `client`
+//!   (file: client.rs, desc: Execute SQL queries via Arrow Flight protocol)
+//!
+//! - `server`
+//!   (file: server.rs, desc: Run DataFusion server accepting FlightSQL/JDBC queries)
+//!
+//! - `sql_server`
+//!   (file: sql_server.rs, desc: Standalone SQL server for JDBC clients)
+
+mod client;
+mod server;
+mod sql_server;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+/// The `all` option cannot run all examples end-to-end because the
+/// `server` example must run in a separate process before the `client`
+/// example can connect.  
+/// Therefore, `all` only iterates over individually runnable examples.
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Client,
+    Server,
+    SqlServer,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "flight";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<(), Box<dyn std::error::Error>> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Client => client::client().await?,
+            ExampleKind::Server => server::server().await?,
+            ExampleKind::SqlServer => sql_server::sql_server().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/server.rs
similarity index 84%
rename from datafusion-examples/examples/flight/flight_server.rs
rename to datafusion-examples/examples/flight/server.rs
index cc5f43746ddfb..b73c81dd7d2c3 100644
--- a/datafusion-examples/examples/flight/flight_server.rs
+++ b/datafusion-examples/examples/flight/server.rs
@@ -15,25 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator};
+//! See `main.rs` for how to run it.
+
 use std::sync::Arc;
 
+use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator};
+use arrow_flight::{
+    Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo,
+    HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket,
+    flight_service_server::FlightService, flight_service_server::FlightServiceServer,
+};
 use arrow_flight::{PollInfo, SchemaAsIpc};
 use datafusion::arrow::error::ArrowError;
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::{ListingOptions, ListingTableUrl};
+use datafusion::prelude::*;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use futures::stream::BoxStream;
 use tonic::transport::Server;
 use tonic::{Request, Response, Status, Streaming};
 
-use datafusion::prelude::*;
-
-use arrow_flight::{
-    flight_service_server::FlightService, flight_service_server::FlightServiceServer,
-    Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo,
-    HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket,
-};
-
 #[derive(Clone)]
 pub struct FlightServiceImpl {}
 
@@ -83,22 +84,27 @@ impl FlightService for FlightServiceImpl {
                 // create local execution context
                 let ctx = SessionContext::new();
 
-                let testdata = datafusion::test_util::parquet_test_data();
+                // Convert the CSV input into a temporary Parquet directory for querying
+                let dataset = ExampleDataset::Cars;
+                let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path())
+                    .await
+                    .map_err(|e| {
+                        Status::internal(format!("Error writing csv to parquet: {e}"))
+                    })?;
+                let parquet_path = parquet_temp.path_str().map_err(|e| {
+                    Status::internal(format!("Error getting parquet path: {e}"))
+                })?;
 
                 // register parquet file with the execution context
-                ctx.register_parquet(
-                    "alltypes_plain",
-                    &format!("{testdata}/alltypes_plain.parquet"),
-                    ParquetReadOptions::default(),
-                )
-                .await
-                .map_err(to_tonic_err)?;
+                ctx.register_parquet("cars", parquet_path, ParquetReadOptions::default())
+                    .await
+                    .map_err(to_tonic_err)?;
 
                 // create the DataFrame
                 let df = ctx.sql(sql).await.map_err(to_tonic_err)?;
 
                 // execute the query
-                let schema = df.schema().clone().into();
+                let schema = Arc::clone(df.schema().inner());
                 let results = df.collect().await.map_err(to_tonic_err)?;
                 if results.is_empty() {
                     return Err(Status::internal("There were no results from ticket"));
@@ -106,6 +112,7 @@ impl FlightService for FlightServiceImpl {
 
                 // add an initial FlightData message that sends schema
                 let options = arrow::ipc::writer::IpcWriteOptions::default();
+                let mut compression_context = CompressionContext::default();
                 let schema_flight_data = SchemaAsIpc::new(&schema, &options);
 
                 let mut flights = vec![FlightData::from(schema_flight_data)];
@@ -115,7 +122,7 @@ impl FlightService for FlightServiceImpl {
 
                 for batch in &results {
                     let (flight_dictionaries, flight_batch) = encoder
-                        .encoded_batch(batch, &mut tracker, &options)
+                        .encode(batch, &mut tracker, &options, &mut compression_context)
                         .map_err(|e: ArrowError| Status::internal(e.to_string()))?;
 
                     flights.extend(flight_dictionaries.into_iter().map(Into::into));
@@ -186,6 +193,7 @@ impl FlightService for FlightServiceImpl {
     }
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status {
     Status::internal(format!("{e:?}"))
 }
@@ -193,8 +201,7 @@ fn to_tonic_err(e: datafusion::error::DataFusionError) -> Status {
 /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for
 /// Parquet files and executing SQL queries against them on a remote server.
 /// This example is run along-side the example `flight_client`.
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
+pub async fn server() -> Result<(), Box<dyn std::error::Error>> {
     let addr = "0.0.0.0:50051".parse()?;
     let service = FlightServiceImpl {};
 
diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs
similarity index 93%
rename from datafusion-examples/examples/flight/flight_sql_server.rs
rename to datafusion-examples/examples/flight/sql_server.rs
index 5a573ed52320d..e55aaa7250ea7 100644
--- a/datafusion-examples/examples/flight/flight_sql_server.rs
+++ b/datafusion-examples/examples/flight/sql_server.rs
@@ -15,6 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+use std::pin::Pin;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, StringArray};
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::ipc::writer::IpcWriteOptions;
@@ -36,12 +41,11 @@ use arrow_flight::{
 use dashmap::DashMap;
 use datafusion::logical_expr::LogicalPlan;
 use datafusion::prelude::{DataFrame, ParquetReadOptions, SessionConfig, SessionContext};
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use futures::{Stream, StreamExt, TryStreamExt};
 use log::info;
 use mimalloc::MiMalloc;
 use prost::Message;
-use std::pin::Pin;
-use std::sync::Arc;
 use tonic::metadata::MetadataValue;
 use tonic::transport::Server;
 use tonic::{Request, Response, Status, Streaming};
@@ -68,9 +72,7 @@ macro_rules! status {
 ///
 /// Based heavily on Ballista's implementation: https://github.com/apache/datafusion-ballista/blob/main/ballista/scheduler/src/flight_sql.rs
 /// and the example in arrow-rs: https://github.com/apache/arrow-rs/blob/master/arrow-flight/examples/flight_sql_server.rs
-///
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
+pub async fn sql_server() -> Result<(), Box<dyn std::error::Error>> {
     env_logger::init();
     let addr = "0.0.0.0:50051".parse()?;
     let service = FlightSqlServiceImpl {
@@ -100,22 +102,24 @@ impl FlightSqlServiceImpl {
             .with_information_schema(true);
         let ctx = Arc::new(SessionContext::new_with_config(session_config));
 
-        let testdata = datafusion::test_util::parquet_test_data();
+        // Convert the CSV input into a temporary Parquet directory for querying
+        let dataset = ExampleDataset::Cars;
+        let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path())
+            .await
+            .map_err(|e| status!("Error writing csv to parquet", e))?;
+        let parquet_path = parquet_temp
+            .path_str()
+            .map_err(|e| status!("Error getting parquet path", e))?;
 
         // register parquet file with the execution context
-        ctx.register_parquet(
-            "alltypes_plain",
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
-        .await
-        .map_err(|e| status!("Error registering table", e))?;
+        ctx.register_parquet("cars", parquet_path, ParquetReadOptions::default())
+            .await
+            .map_err(|e| status!("Error registering table", e))?;
 
         self.contexts.insert(uuid.clone(), ctx);
         Ok(uuid)
     }
 
-    #[allow(clippy::result_large_err)]
     fn get_ctx<T>(&self, req: &Request<T>) -> Result<Arc<SessionContext>, Status> {
         // get the token from the authorization header on Request
         let auth = req
@@ -141,7 +145,6 @@ impl FlightSqlServiceImpl {
         }
     }
 
-    #[allow(clippy::result_large_err)]
     fn get_plan(&self, handle: &str) -> Result<LogicalPlan, Status> {
         if let Some(plan) = self.statements.get(handle) {
             Ok(plan.clone())
@@ -150,7 +153,6 @@ impl FlightSqlServiceImpl {
         }
     }
 
-    #[allow(clippy::result_large_err)]
     fn get_result(&self, handle: &str) -> Result<Vec<RecordBatch>, Status> {
         if let Some(result) = self.results.get(handle) {
             Ok(result.clone())
@@ -198,13 +200,11 @@ impl FlightSqlServiceImpl {
         .unwrap()
     }
 
-    #[allow(clippy::result_large_err)]
     fn remove_plan(&self, handle: &str) -> Result<(), Status> {
         self.statements.remove(&handle.to_string());
         Ok(())
     }
 
-    #[allow(clippy::result_large_err)]
     fn remove_result(&self, handle: &str) -> Result<(), Status> {
         self.results.remove(&handle.to_string());
         Ok(())
@@ -395,10 +395,8 @@ impl FlightSqlService for FlightSqlServiceImpl {
         let plan_uuid = Uuid::new_v4().hyphenated().to_string();
         self.statements.insert(plan_uuid.clone(), plan.clone());
 
-        let plan_schema = plan.schema();
-
-        let arrow_schema = (&**plan_schema).into();
-        let message = SchemaAsIpc::new(&arrow_schema, &IpcWriteOptions::default())
+        let arrow_schema = plan.schema().as_arrow();
+        let message = SchemaAsIpc::new(arrow_schema, &IpcWriteOptions::default())
             .try_into()
             .map_err(|e| status!("Unable to serialize schema", e))?;
         let IpcMessage(schema_bytes) = message;
@@ -418,7 +416,9 @@ impl FlightSqlService for FlightSqlServiceImpl {
     ) -> Result<(), Status> {
         let handle = std::str::from_utf8(&handle.prepared_statement_handle);
         if let Ok(handle) = handle {
-            info!("do_action_close_prepared_statement: removing plan and results for {handle}");
+            info!(
+                "do_action_close_prepared_statement: removing plan and results for {handle}"
+            );
             let _ = self.remove_plan(handle);
             let _ = self.remove_result(handle);
         }
diff --git a/datafusion-examples/examples/composed_extension_codec.rs b/datafusion-examples/examples/proto/composed_extension_codec.rs
similarity index 66%
rename from datafusion-examples/examples/composed_extension_codec.rs
rename to datafusion-examples/examples/proto/composed_extension_codec.rs
index 4baefcae507f6..df3d58b7bfb81 100644
--- a/datafusion-examples/examples/composed_extension_codec.rs
+++ b/datafusion-examples/examples/proto/composed_extension_codec.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! This example demonstrates how to compose multiple PhysicalExtensionCodecs
 //!
 //! This can be helpful when an Execution plan tree has different nodes from different crates
@@ -32,20 +34,21 @@
 
 use std::any::Any;
 use std::fmt::Debug;
-use std::ops::Deref;
 use std::sync::Arc;
 
 use datafusion::common::Result;
-use datafusion::common::{internal_err, DataFusionError};
-use datafusion::logical_expr::registry::FunctionRegistry;
-use datafusion::logical_expr::{AggregateUDF, ScalarUDF};
+use datafusion::common::internal_err;
+use datafusion::common::tree_node::TreeNodeRecursion;
+use datafusion::execution::TaskContext;
 use datafusion::physical_plan::{DisplayAs, ExecutionPlan};
 use datafusion::prelude::SessionContext;
-use datafusion_proto::physical_plan::{AsExecutionPlan, PhysicalExtensionCodec};
+use datafusion_proto::physical_plan::{
+    AsExecutionPlan, ComposedPhysicalExtensionCodec, PhysicalExtensionCodec,
+};
 use datafusion_proto::protobuf;
 
-#[tokio::main]
-async fn main() {
+/// Example of using multiple extension codecs for serialization / deserialization
+pub async fn composed_extension_codec() -> Result<()> {
     // build execution plan that has both types of nodes
     //
     // Note each node requires a different `PhysicalExtensionCodec` to decode
@@ -54,29 +57,28 @@ async fn main() {
     });
     let ctx = SessionContext::new();
 
-    let composed_codec = ComposedPhysicalExtensionCodec {
-        codecs: vec![
-            Arc::new(ParentPhysicalExtensionCodec {}),
-            Arc::new(ChildPhysicalExtensionCodec {}),
-        ],
-    };
+    // Position in this list is important as it will be used for decoding.
+    // If new codec is added it should go to last position.
+    let composed_codec = ComposedPhysicalExtensionCodec::new(vec![
+        Arc::new(ParentPhysicalExtensionCodec {}),
+        Arc::new(ChildPhysicalExtensionCodec {}),
+    ]);
 
     // serialize execution plan to proto
     let proto: protobuf::PhysicalPlanNode =
         protobuf::PhysicalPlanNode::try_from_physical_plan(
             exec_plan.clone(),
             &composed_codec,
-        )
-        .expect("to proto");
+        )?;
 
     // deserialize proto back to execution plan
-    let runtime = ctx.runtime_env();
-    let result_exec_plan: Arc<dyn ExecutionPlan> = proto
-        .try_into_physical_plan(&ctx, runtime.deref(), &composed_codec)
-        .expect("from proto");
+    let result_exec_plan: Arc<dyn ExecutionPlan> =
+        proto.try_into_physical_plan(&ctx.task_ctx(), &composed_codec)?;
 
     // assert that the original and deserialized execution plans are equal
     assert_eq!(format!("{exec_plan:?}"), format!("{result_exec_plan:?}"));
+
+    Ok(())
 }
 
 /// This example has two types of nodes: `ParentExec` and `ChildExec` which can only
@@ -105,7 +107,7 @@ impl ExecutionPlan for ParentExec {
         self
     }
 
-    fn properties(&self) -> &datafusion::physical_plan::PlanProperties {
+    fn properties(&self) -> &Arc<datafusion::physical_plan::PlanProperties> {
         unreachable!()
     }
 
@@ -123,10 +125,19 @@ impl ExecutionPlan for ParentExec {
     fn execute(
         &self,
         _partition: usize,
-        _context: Arc<datafusion::execution::TaskContext>,
+        _context: Arc<TaskContext>,
     ) -> Result<datafusion::physical_plan::SendableRecordBatchStream> {
         unreachable!()
     }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
 
 /// A PhysicalExtensionCodec that can serialize and deserialize ParentExec
@@ -138,7 +149,7 @@ impl PhysicalExtensionCodec for ParentPhysicalExtensionCodec {
         &self,
         buf: &[u8],
         inputs: &[Arc<dyn ExecutionPlan>],
-        _registry: &dyn FunctionRegistry,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         if buf == "ParentExec".as_bytes() {
             Ok(Arc::new(ParentExec {
@@ -181,7 +192,7 @@ impl ExecutionPlan for ChildExec {
         self
     }
 
-    fn properties(&self) -> &datafusion::physical_plan::PlanProperties {
+    fn properties(&self) -> &Arc<datafusion::physical_plan::PlanProperties> {
         unreachable!()
     }
 
@@ -199,10 +210,19 @@ impl ExecutionPlan for ChildExec {
     fn execute(
         &self,
         _partition: usize,
-        _context: Arc<datafusion::execution::TaskContext>,
+        _context: Arc<TaskContext>,
     ) -> Result<datafusion::physical_plan::SendableRecordBatchStream> {
         unreachable!()
     }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
 
 /// A PhysicalExtensionCodec that can serialize and deserialize ChildExec
@@ -214,7 +234,7 @@ impl PhysicalExtensionCodec for ChildPhysicalExtensionCodec {
         &self,
         buf: &[u8],
         _inputs: &[Arc<dyn ExecutionPlan>],
-        _registry: &dyn FunctionRegistry,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         if buf == "ChildExec".as_bytes() {
             Ok(Arc::new(ChildExec {}))
@@ -232,60 +252,3 @@ impl PhysicalExtensionCodec for ChildPhysicalExtensionCodec {
         }
     }
 }
-
-/// A PhysicalExtensionCodec that tries one of multiple inner codecs
-/// until one works
-#[derive(Debug)]
-struct ComposedPhysicalExtensionCodec {
-    codecs: Vec<Arc<dyn PhysicalExtensionCodec>>,
-}
-
-impl ComposedPhysicalExtensionCodec {
-    fn try_any<T>(
-        &self,
-        mut f: impl FnMut(&dyn PhysicalExtensionCodec) -> Result<T>,
-    ) -> Result<T> {
-        let mut last_err = None;
-        for codec in &self.codecs {
-            match f(codec.as_ref()) {
-                Ok(node) => return Ok(node),
-                Err(err) => last_err = Some(err),
-            }
-        }
-
-        Err(last_err.unwrap_or_else(|| {
-            DataFusionError::NotImplemented("Empty list of composed codecs".to_owned())
-        }))
-    }
-}
-
-impl PhysicalExtensionCodec for ComposedPhysicalExtensionCodec {
-    fn try_decode(
-        &self,
-        buf: &[u8],
-        inputs: &[Arc<dyn ExecutionPlan>],
-        registry: &dyn FunctionRegistry,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        self.try_any(|codec| codec.try_decode(buf, inputs, registry))
-    }
-
-    fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> {
-        self.try_any(|codec| codec.try_encode(node.clone(), buf))
-    }
-
-    fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
-        self.try_any(|codec| codec.try_decode_udf(name, buf))
-    }
-
-    fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
-        self.try_any(|codec| codec.try_encode_udf(node, buf))
-    }
-
-    fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
-        self.try_any(|codec| codec.try_decode_udaf(name, buf))
-    }
-
-    fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
-        self.try_any(|codec| codec.try_encode_udaf(node, buf))
-    }
-}
diff --git a/datafusion-examples/examples/proto/expression_deduplication.rs b/datafusion-examples/examples/proto/expression_deduplication.rs
new file mode 100644
index 0000000000000..0dec807f8043a
--- /dev/null
+++ b/datafusion-examples/examples/proto/expression_deduplication.rs
@@ -0,0 +1,275 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example demonstrates how to use the `PhysicalExtensionCodec` trait's
+//! interception methods to implement expression deduplication during deserialization.
+//!
+//! This pattern is inspired by PR #18192, which introduces expression caching
+//! to reduce memory usage when deserializing plans with duplicate expressions.
+//!
+//! The key insight is that identical expressions serialize to identical protobuf bytes.
+//! By caching deserialized expressions keyed by their protobuf bytes, we can:
+//! 1. Return the same Arc for duplicate expressions
+//! 2. Reduce memory allocation during deserialization
+//! 3. Enable downstream optimizations that rely on Arc pointer equality
+//!
+//! This demonstrates the decorator pattern enabled by the `PhysicalExtensionCodec` trait,
+//! where all expression serialization/deserialization routes through the codec methods.
+
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::sync::{Arc, RwLock};
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::common::Result;
+use datafusion::execution::TaskContext;
+use datafusion::logical_expr::Operator;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::expressions::{BinaryExpr, col};
+use datafusion::physical_plan::filter::FilterExec;
+use datafusion::physical_plan::placeholder_row::PlaceholderRowExec;
+use datafusion::prelude::SessionContext;
+use datafusion_proto::physical_plan::from_proto::parse_physical_expr_with_converter;
+use datafusion_proto::physical_plan::to_proto::serialize_physical_expr_with_converter;
+use datafusion_proto::physical_plan::{
+    DefaultPhysicalExtensionCodec, PhysicalExtensionCodec,
+    PhysicalProtoConverterExtension,
+};
+use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode};
+use prost::Message;
+
+/// Example showing how to implement expression deduplication using the codec decorator pattern.
+///
+/// This demonstrates:
+/// 1. Creating a CachingCodec that caches expressions by their protobuf bytes
+/// 2. Intercepting deserialization to return cached Arcs for duplicate expressions
+/// 3. Verifying that duplicate expressions share the same Arc after deserialization
+///
+/// Deduplication is keyed by the protobuf bytes representing the expression,
+/// in reality deduplication could be done based on e.g. the pointer address of the
+/// serialized expression in memory, but this is simpler to demonstrate.
+///
+/// In this case our expression is trivial and just for demonstration purposes.
+/// In real scenarios, expressions can be much more complex, e.g. a large InList
+/// expression could be megabytes in size, so deduplication can save significant memory
+/// in addition to more correctly representing the original plan structure.
+pub async fn expression_deduplication() -> Result<()> {
+    println!("=== Expression Deduplication Example ===\n");
+
+    // Create a schema for our test expressions
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
+
+    // Step 1: Create expressions with duplicates
+    println!("Step 1: Creating expressions with duplicates...");
+
+    // Create expression: col("a")
+    let a = col("a", &schema)?;
+
+    // Create a clone to show duplicates
+    let a_clone = Arc::clone(&a);
+
+    // Combine: a OR a_clone
+    let combined_expr =
+        Arc::new(BinaryExpr::new(a, Operator::Or, a_clone)) as Arc<dyn PhysicalExpr>;
+    println!("  Created expression: a OR a with duplicates");
+    println!("  Note: a appears twice in the expression tree\n");
+    // Step 2: Create a filter plan with this expression
+    println!("Step 2: Creating physical plan with the expression...");
+
+    let input = Arc::new(PlaceholderRowExec::new(Arc::clone(&schema)));
+    let filter_plan: Arc<dyn ExecutionPlan> =
+        Arc::new(FilterExec::try_new(combined_expr, input)?);
+
+    println!("  Created FilterExec with duplicate sub-expressions\n");
+
+    // Step 3: Serialize with the caching codec
+    println!("Step 3: Serializing plan...");
+
+    let extension_codec = DefaultPhysicalExtensionCodec {};
+    let caching_converter = CachingCodec::new();
+    let proto =
+        caching_converter.execution_plan_to_proto(&filter_plan, &extension_codec)?;
+
+    // Serialize to bytes
+    let mut bytes = Vec::new();
+    proto.encode(&mut bytes).unwrap();
+    println!("  Serialized plan to {} bytes\n", bytes.len());
+
+    // Step 4: Deserialize with the caching codec
+    println!("Step 4: Deserializing plan with CachingCodec...");
+
+    let ctx = SessionContext::new();
+    let deserialized_plan = proto.try_into_physical_plan_with_converter(
+        &ctx.task_ctx(),
+        &extension_codec,
+        &caching_converter,
+    )?;
+
+    // Step 5: check that we deduplicated expressions
+    println!("Step 5: Checking for deduplicated expressions...");
+    let Some(filter_exec) = deserialized_plan.as_any().downcast_ref::<FilterExec>()
+    else {
+        panic!("Deserialized plan is not a FilterExec");
+    };
+    let predicate = Arc::clone(filter_exec.predicate());
+    let binary_expr = predicate
+        .as_any()
+        .downcast_ref::<BinaryExpr>()
+        .expect("Predicate is not a BinaryExpr");
+    let left = &binary_expr.left();
+    let right = &binary_expr.right();
+    // Check if left and right point to the same Arc
+    let deduplicated = Arc::ptr_eq(left, right);
+    if deduplicated {
+        println!("  Success: Duplicate expressions were deduplicated!");
+        println!(
+            "  Cache Stats: hits={}, misses={}",
+            caching_converter.stats.read().unwrap().cache_hits,
+            caching_converter.stats.read().unwrap().cache_misses,
+        );
+    } else {
+        println!("  Failure: Duplicate expressions were NOT deduplicated.");
+    }
+
+    Ok(())
+}
+
+// ============================================================================
+// CachingCodec - Implements expression deduplication
+// ============================================================================
+
+/// Statistics for cache performance monitoring
+#[derive(Debug, Default)]
+struct CacheStats {
+    cache_hits: usize,
+    cache_misses: usize,
+}
+
+/// A codec that caches deserialized expressions to enable deduplication.
+///
+/// When deserializing, if we've already seen the same protobuf bytes,
+/// we return the cached Arc instead of creating a new allocation.
+#[derive(Debug, Default)]
+struct CachingCodec {
+    /// Cache mapping protobuf bytes -> deserialized expression
+    expr_cache: RwLock<HashMap<Vec<u8>, Arc<dyn PhysicalExpr>>>,
+    /// Statistics for demonstration
+    stats: RwLock<CacheStats>,
+}
+
+impl CachingCodec {
+    fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl PhysicalExtensionCodec for CachingCodec {
+    // Required: decode custom extension nodes
+    fn try_decode(
+        &self,
+        _buf: &[u8],
+        _inputs: &[Arc<dyn ExecutionPlan>],
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        datafusion::common::not_impl_err!("No custom extension nodes")
+    }
+
+    // Required: encode custom execution plans
+    fn try_encode(
+        &self,
+        _node: Arc<dyn ExecutionPlan>,
+        _buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        datafusion::common::not_impl_err!("No custom extension nodes")
+    }
+}
+
+impl PhysicalProtoConverterExtension for CachingCodec {
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        extension_codec: &dyn PhysicalExtensionCodec,
+        proto: &PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        proto.try_into_physical_plan_with_converter(ctx, extension_codec, self)
+    }
+
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        extension_codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalPlanNode> {
+        PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            extension_codec,
+            self,
+        )
+    }
+
+    // CACHING IMPLEMENTATION: Intercept expression deserialization
+    fn proto_to_physical_expr(
+        &self,
+        proto: &PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        // Create cache key from protobuf bytes
+        let mut key = Vec::new();
+        proto.encode(&mut key).map_err(|e| {
+            datafusion::error::DataFusionError::Internal(format!(
+                "Failed to encode proto for cache key: {e}"
+            ))
+        })?;
+
+        // Check cache first
+        {
+            let cache = self.expr_cache.read().unwrap();
+            if let Some(cached) = cache.get(&key) {
+                // Cache hit! Update stats and return cached Arc
+                let mut stats = self.stats.write().unwrap();
+                stats.cache_hits += 1;
+                return Ok(Arc::clone(cached));
+            }
+        }
+
+        // Cache miss - deserialize and store
+        let expr =
+            parse_physical_expr_with_converter(proto, ctx, input_schema, codec, self)?;
+
+        // Store in cache
+        {
+            let mut cache = self.expr_cache.write().unwrap();
+            cache.insert(key, Arc::clone(&expr));
+            let mut stats = self.stats.write().unwrap();
+            stats.cache_misses += 1;
+        }
+
+        Ok(expr)
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<PhysicalExprNode> {
+        serialize_physical_expr_with_converter(expr, codec, self)
+    }
+}
diff --git a/datafusion-examples/examples/proto/main.rs b/datafusion-examples/examples/proto/main.rs
new file mode 100644
index 0000000000000..3f525b5d46afa
--- /dev/null
+++ b/datafusion-examples/examples/proto/main.rs
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Examples demonstrating DataFusion's plan serialization via the `datafusion-proto` crate
+//!
+//! These examples show how to use multiple extension codecs for serialization / deserialization.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example proto -- [all|composed_extension_codec|expression_deduplication]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `composed_extension_codec`
+//!   (file: composed_extension_codec.rs, desc: Use multiple extension codecs for serialization/deserialization)
+//!
+//! - `expression_deduplication`
+//!   (file: expression_deduplication.rs, desc: Example of expression caching/deduplication using the codec decorator pattern)
+
+mod composed_extension_codec;
+mod expression_deduplication;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    ComposedExtensionCodec,
+    ExpressionDeduplication,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "proto";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::ComposedExtensionCodec => {
+                composed_extension_codec::composed_extension_codec().await?
+            }
+            ExampleKind::ExpressionDeduplication => {
+                expression_deduplication::expression_deduplication().await?
+            }
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/analyzer_rule.rs b/datafusion-examples/examples/query_planning/analyzer_rule.rs
similarity index 97%
rename from datafusion-examples/examples/analyzer_rule.rs
rename to datafusion-examples/examples/query_planning/analyzer_rule.rs
index cb81cd167a88b..a86f5cdd2a5e3 100644
--- a/datafusion-examples/examples/analyzer_rule.rs
+++ b/datafusion-examples/examples/query_planning/analyzer_rule.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
+use datafusion::common::Result;
 use datafusion::common::config::ConfigOptions;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::Result;
-use datafusion::logical_expr::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder};
+use datafusion::logical_expr::{Expr, LogicalPlan, LogicalPlanBuilder, col, lit};
 use datafusion::optimizer::analyzer::AnalyzerRule;
 use datafusion::prelude::SessionContext;
 use std::sync::{Arc, Mutex};
@@ -35,8 +37,7 @@ use std::sync::{Arc, Mutex};
 /// level access control scheme by introducing a filter to the query.
 ///
 /// See [optimizer_rule.rs] for an example of a optimizer rule
-#[tokio::main]
-pub async fn main() -> Result<()> {
+pub async fn analyzer_rule() -> Result<()> {
     // AnalyzerRules run before OptimizerRules.
     //
     // DataFusion includes several built in AnalyzerRules for tasks such as type
diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/query_planning/expr_api.rs
similarity index 94%
rename from datafusion-examples/examples/expr_api.rs
rename to datafusion-examples/examples/query_planning/expr_api.rs
index 089b8db6a5a06..386273c72817b 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/query_planning/expr_api.rs
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use arrow::array::{BooleanArray, Int32Array, Int8Array};
+use arrow::array::{BooleanArray, Int8Array, Int32Array};
 use arrow::record_batch::RecordBatch;
 
 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
@@ -35,7 +37,7 @@ use datafusion::logical_expr::simplify::SimplifyContext;
 use datafusion::logical_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator};
 use datafusion::optimizer::analyzer::type_coercion::TypeCoercionRewriter;
 use datafusion::optimizer::simplify_expressions::ExprSimplifier;
-use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
+use datafusion::physical_expr::{AnalysisContext, ExprBoundaries, analyze};
 use datafusion::prelude::*;
 
 /// This example demonstrates the DataFusion [`Expr`] API.
@@ -55,8 +57,7 @@ use datafusion::prelude::*;
 /// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`]
 /// 6. Get the types of the expressions: [`expression_type_demo`]
 /// 7. Apply type coercion to expressions: [`type_coercion_demo`]
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn expr_api() -> Result<()> {
     // The easiest way to do create expressions is to use the
     // "fluent"-style API:
     let expr = col("a") + lit(5);
@@ -65,7 +66,7 @@ async fn main() -> Result<()> {
     let expr2 = Expr::BinaryExpr(BinaryExpr::new(
         Box::new(col("a")),
         Operator::Plus,
-        Box::new(Expr::Literal(ScalarValue::Int32(Some(5)))),
+        Box::new(Expr::Literal(ScalarValue::Int32(Some(5)), None)),
     ));
     assert_eq!(expr, expr2);
 
@@ -85,7 +86,7 @@ async fn main() -> Result<()> {
     boundary_analysis_and_selectivity_demo()?;
 
     // See how boundary analysis works for `AND` & `OR` conjunctions.
-    boundary_analysis_in_conjuctions_demo()?;
+    boundary_analysis_in_conjunctions_demo()?;
 
     // See how to determine the data types of expressions
     expression_type_demo()?;
@@ -174,8 +175,9 @@ fn simplify_demo() -> Result<()> {
     // the ExecutionProps carries information needed to simplify
     // expressions, such as the current time (to evaluate `now()`
     // correctly)
-    let props = ExecutionProps::new();
-    let context = SimplifyContext::new(&props).with_schema(schema);
+    let context = SimplifyContext::default()
+        .with_schema(schema)
+        .with_current_time();
     let simplifier = ExprSimplifier::new(context);
 
     // And then call the simplify_expr function:
@@ -190,7 +192,9 @@ fn simplify_demo() -> Result<()> {
 
     // here are some other examples of what DataFusion is capable of
     let schema = Schema::new(vec![make_field("i", DataType::Int64)]).to_dfschema_ref()?;
-    let context = SimplifyContext::new(&props).with_schema(schema.clone());
+    let context = SimplifyContext::default()
+        .with_schema(Arc::clone(&schema))
+        .with_current_time();
     let simplifier = ExprSimplifier::new(context);
 
     // basic arithmetic simplification
@@ -302,6 +306,7 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
         min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
         sum_value: Precision::Absent,
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     };
 
     // We can then build our expression boundaries from the column statistics
@@ -342,16 +347,18 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> {
     //
     // (a' - b' + 1) / (a - b)
     // (10000 - 5000 + 1) / (10000 - 1)
-    assert!(analysis
-        .selectivity
-        .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
+    assert!(
+        analysis
+            .selectivity
+            .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity))
+    );
 
     Ok(())
 }
 
 /// This function shows how to think about and leverage the analysis API
 /// to infer boundaries in `AND` & `OR` conjunctions.
-fn boundary_analysis_in_conjuctions_demo() -> Result<()> {
+fn boundary_analysis_in_conjunctions_demo() -> Result<()> {
     // Let us consider the more common case of AND & OR conjunctions.
     //
     // age > 18 AND age <= 25
@@ -369,6 +376,7 @@ fn boundary_analysis_in_conjuctions_demo() -> Result<()> {
         min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
         sum_value: Precision::Absent,
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     };
 
     let initial_boundaries =
@@ -414,9 +422,11 @@ fn boundary_analysis_in_conjuctions_demo() -> Result<()> {
     //
     // Granted a column such as age will more likely follow a Normal distribution
     // as such our selectivity estimation will not be as good as it can.
-    assert!(analysis
-        .selectivity
-        .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity)));
+    assert!(
+        analysis
+            .selectivity
+            .is_some_and(|selectivity| (0.1..=0.2).contains(&selectivity))
+    );
 
     // The above example was a good way to look at how we can derive better
     // interval and get a lower selectivity during boundary analysis.
@@ -519,7 +529,7 @@ fn type_coercion_demo() -> Result<()> {
     )?;
     let i8_array = Int8Array::from_iter_values(vec![0, 1, 2]);
     let batch = RecordBatch::try_new(
-        Arc::new(df_schema.as_arrow().to_owned()),
+        Arc::clone(df_schema.inner()),
         vec![Arc::new(i8_array) as _],
     )?;
 
@@ -532,10 +542,11 @@ fn type_coercion_demo() -> Result<()> {
     let physical_expr =
         datafusion::physical_expr::create_physical_expr(&expr, &df_schema, &props)?;
     let e = physical_expr.evaluate(&batch).unwrap_err();
-    assert!(e
-        .find_root()
-        .to_string()
-        .contains("Invalid comparison operation: Int8 > Int32"));
+    assert!(
+        e.find_root()
+            .to_string()
+            .contains("Invalid comparison operation: Int8 > Int32")
+    );
 
     // 1. Type coercion with `SessionContext::create_physical_expr` which implicitly applies type coercion before constructing the physical expr.
     let physical_expr =
@@ -543,7 +554,9 @@ fn type_coercion_demo() -> Result<()> {
     assert!(physical_expr.evaluate(&batch).is_ok());
 
     // 2. Type coercion with `ExprSimplifier::coerce`.
-    let context = SimplifyContext::new(&props).with_schema(Arc::new(df_schema.clone()));
+    let context = SimplifyContext::default()
+        .with_schema(Arc::new(df_schema.clone()))
+        .with_current_time();
     let simplifier = ExprSimplifier::new(context);
     let coerced_expr = simplifier.coerce(expr.clone(), &df_schema)?;
     let physical_expr = datafusion::physical_expr::create_physical_expr(
diff --git a/datafusion-examples/examples/query_planning/main.rs b/datafusion-examples/examples/query_planning/main.rs
new file mode 100644
index 0000000000000..d3f99aedceb3d
--- /dev/null
+++ b/datafusion-examples/examples/query_planning/main.rs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # These are all internal mechanics of the query planning and optimization layers
+//!
+//! These examples demonstrate internal mechanics of the query planning and optimization layers.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example query_planning -- [all|analyzer_rule|expr_api|optimizer_rule|parse_sql_expr|plan_to_sql|planner_api|pruning|thread_pools]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `analyzer_rule`
+//!   (file: analyzer_rule.rs, desc: Custom AnalyzerRule to change query semantics)
+//!
+//! - `expr_api`
+//!   (file: expr_api.rs, desc: Create, execute, analyze, and coerce Exprs)
+//!
+//! - `optimizer_rule`
+//!   (file: optimizer_rule.rs, desc: Replace predicates via a custom OptimizerRule)
+//!
+//! - `parse_sql_expr`
+//!   (file: parse_sql_expr.rs, desc: Parse SQL into DataFusion Expr)
+//!
+//! - `plan_to_sql`
+//!   (file: plan_to_sql.rs, desc: Generate SQL from expressions or plans)
+//!
+//! - `planner_api`
+//!   (file: planner_api.rs, desc: APIs for logical and physical plan manipulation)
+//!
+//! - `pruning`
+//!   (file: pruning.rs, desc: Use pruning to skip irrelevant files)
+//!
+//! - `thread_pools`
+//!   (file: thread_pools.rs, desc: Configure custom thread pools for DataFusion execution)
+
+mod analyzer_rule;
+mod expr_api;
+mod optimizer_rule;
+mod parse_sql_expr;
+mod plan_to_sql;
+mod planner_api;
+mod pruning;
+mod thread_pools;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    AnalyzerRule,
+    ExprApi,
+    OptimizerRule,
+    ParseSqlExpr,
+    PlanToSql,
+    PlannerApi,
+    Pruning,
+    ThreadPools,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "query_planning";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::AnalyzerRule => analyzer_rule::analyzer_rule().await?,
+            ExampleKind::ExprApi => expr_api::expr_api().await?,
+            ExampleKind::OptimizerRule => optimizer_rule::optimizer_rule().await?,
+            ExampleKind::ParseSqlExpr => parse_sql_expr::parse_sql_expr().await?,
+            ExampleKind::PlanToSql => plan_to_sql::plan_to_sql_examples().await?,
+            ExampleKind::PlannerApi => planner_api::planner_api().await?,
+            ExampleKind::Pruning => pruning::pruning().await?,
+            ExampleKind::ThreadPools => thread_pools::thread_pools().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/optimizer_rule.rs b/datafusion-examples/examples/query_planning/optimizer_rule.rs
similarity index 96%
rename from datafusion-examples/examples/optimizer_rule.rs
rename to datafusion-examples/examples/query_planning/optimizer_rule.rs
index 63f17484809e2..de9de7737a6a0 100644
--- a/datafusion-examples/examples/optimizer_rule.rs
+++ b/datafusion-examples/examples/query_planning/optimizer_rule.rs
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
 use arrow::datatypes::DataType;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::{assert_batches_eq, Result, ScalarValue};
+use datafusion::common::{Result, ScalarValue, assert_batches_eq};
 use datafusion::logical_expr::{
     BinaryExpr, ColumnarValue, Expr, LogicalPlan, Operator, ScalarFunctionArgs,
     ScalarUDF, ScalarUDFImpl, Signature, Volatility,
@@ -37,8 +39,7 @@ use std::sync::Arc;
 ///
 /// See [analyzer_rule.rs] for an example of AnalyzerRules, which are for
 /// changing plan semantics.
-#[tokio::main]
-pub async fn main() -> Result<()> {
+pub async fn optimizer_rule() -> Result<()> {
     // DataFusion includes many built in OptimizerRules for tasks such as outer
     // to inner join conversion and constant folding.
     //
@@ -171,11 +172,11 @@ fn is_binary_eq(binary_expr: &BinaryExpr) -> bool {
 
 /// Return true if the expression is a literal or column reference
 fn is_lit_or_col(expr: &Expr) -> bool {
-    matches!(expr, Expr::Column(_) | Expr::Literal(_))
+    matches!(expr, Expr::Column(_) | Expr::Literal(_, _))
 }
 
 /// A simple user defined filter function
-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MyEq {
     signature: Signature,
 }
diff --git a/datafusion-examples/examples/parse_sql_expr.rs b/datafusion-examples/examples/query_planning/parse_sql_expr.rs
similarity index 68%
rename from datafusion-examples/examples/parse_sql_expr.rs
rename to datafusion-examples/examples/query_planning/parse_sql_expr.rs
index 5387e7c4a05dc..74072b8480f99 100644
--- a/datafusion-examples/examples/parse_sql_expr.rs
+++ b/datafusion-examples/examples/query_planning/parse_sql_expr.rs
@@ -15,8 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::common::DFSchema;
+use datafusion::common::ScalarValue;
 use datafusion::logical_expr::{col, lit};
 use datafusion::sql::unparser::Unparser;
 use datafusion::{
@@ -24,6 +27,7 @@ use datafusion::{
     error::Result,
     prelude::{ParquetReadOptions, SessionContext},
 };
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 
 /// This example demonstrates the programmatic parsing of SQL expressions using
 /// the DataFusion [`SessionContext::parse_sql_expr`] API or the [`DataFrame::parse_sql_expr`] API.
@@ -32,17 +36,15 @@ use datafusion::{
 /// The code in this example shows how to:
 ///
 /// 1. [`simple_session_context_parse_sql_expr_demo`]: Parse a simple SQL text into a logical
-/// expression using a schema at [`SessionContext`].
+///    expression using a schema at [`SessionContext`].
 ///
 /// 2. [`simple_dataframe_parse_sql_expr_demo`]: Parse a simple SQL text into a logical expression
-/// using a schema at [`DataFrame`].
+///    using a schema at [`DataFrame`].
 ///
 /// 3. [`query_parquet_demo`]: Query a parquet file using the parsed_sql_expr from a DataFrame.
 ///
 /// 4. [`round_trip_parse_sql_expr_demo`]: Parse a SQL text and convert it back to SQL using [`Unparser`].
-
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn parse_sql_expr() -> Result<()> {
     // See how to evaluate expressions
     simple_session_context_parse_sql_expr_demo()?;
     simple_dataframe_parse_sql_expr_demo().await?;
@@ -70,18 +72,19 @@ fn simple_session_context_parse_sql_expr_demo() -> Result<()> {
 
 /// DataFusion can parse a SQL text to an logical expression using schema at [`DataFrame`].
 async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> {
-    let sql = "int_col < 5 OR double_col = 8.0";
-    let expr = col("int_col")
-        .lt(lit(5_i64))
-        .or(col("double_col").eq(lit(8.0_f64)));
+    let sql = "car = 'red' OR speed > 1.0";
+    let expr = col("car")
+        .eq(lit(ScalarValue::Utf8(Some("red".to_string()))))
+        .or(col("speed").gt(lit(1.0_f64)));
 
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let df = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?;
 
     let parsed_expr = df.parse_sql_expr(sql)?;
@@ -93,39 +96,37 @@ async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> {
 
 async fn query_parquet_demo() -> Result<()> {
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let df = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?;
 
     let df = df
         .clone()
-        .select(vec![
-            df.parse_sql_expr("int_col")?,
-            df.parse_sql_expr("double_col")?,
-        ])?
-        .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)?
+        .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])?
+        .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)?
         .aggregate(
-            vec![df.parse_sql_expr("double_col")?],
-            vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?],
+            vec![df.parse_sql_expr("car")?],
+            vec![df.parse_sql_expr("SUM(speed) as sum_speed")?],
         )?
         // Directly parsing the SQL text into a sort expression is not supported yet, so
         // construct it programmatically
-        .sort(vec![col("double_col").sort(false, false)])?
+        .sort(vec![col("car").sort(false, false)])?
         .limit(0, Some(1))?;
 
     let result = df.collect().await?;
 
     assert_batches_eq!(
         &[
-            "+------------+-------------+",
-            "| double_col | sum_int_col |",
-            "+------------+-------------+",
-            "| 10.1       | 4           |",
-            "+------------+-------------+",
+            "+-----+--------------------+",
+            "| car | sum_speed          |",
+            "+-----+--------------------+",
+            "| red | 162.49999999999997 |",
+            "+-----+--------------------+"
         ],
         &result
     );
@@ -135,15 +136,16 @@ async fn query_parquet_demo() -> Result<()> {
 
 /// DataFusion can parse a SQL text and convert it back to SQL using [`Unparser`].
 async fn round_trip_parse_sql_expr_demo() -> Result<()> {
-    let sql = "((int_col < 5) OR (double_col = 8))";
+    let sql = "((car = 'red') OR (speed > 1.0))";
 
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let df = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?;
 
     let parsed_expr = df.parse_sql_expr(sql)?;
@@ -158,7 +160,7 @@ async fn round_trip_parse_sql_expr_demo() -> Result<()> {
     // difference in precedence rules between DataFusion and target engines.
     let unparser = Unparser::default().with_pretty(true);
 
-    let pretty = "int_col < 5 OR double_col = 8";
+    let pretty = "car = 'red' OR speed > 1.0";
     let pretty_round_trip_sql = unparser.expr_to_sql(&parsed_expr)?.to_string();
     assert_eq!(pretty, pretty_round_trip_sql);
 
diff --git a/datafusion-examples/examples/plan_to_sql.rs b/datafusion-examples/examples/query_planning/plan_to_sql.rs
similarity index 77%
rename from datafusion-examples/examples/plan_to_sql.rs
rename to datafusion-examples/examples/query_planning/plan_to_sql.rs
index 54483b143a169..86aebbc0b2c33 100644
--- a/datafusion-examples/examples/plan_to_sql.rs
+++ b/datafusion-examples/examples/query_planning/plan_to_sql.rs
@@ -15,7 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
+use std::fmt;
+use std::sync::Arc;
+
 use datafusion::common::DFSchemaRef;
+use datafusion::common::ScalarValue;
 use datafusion::error::Result;
 use datafusion::logical_expr::sqlparser::ast::Statement;
 use datafusion::logical_expr::{
@@ -32,9 +38,8 @@ use datafusion::sql::unparser::extension_unparser::UserDefinedLogicalNodeUnparse
 use datafusion::sql::unparser::extension_unparser::{
     UnparseToStatementResult, UnparseWithinStatementResult,
 };
-use datafusion::sql::unparser::{plan_to_sql, Unparser};
-use std::fmt;
-use std::sync::Arc;
+use datafusion::sql::unparser::{Unparser, plan_to_sql};
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 
 /// This example demonstrates the programmatic construction of SQL strings using
 /// the DataFusion Expr [`Expr`] and LogicalPlan [`LogicalPlan`] API.
@@ -43,28 +48,26 @@ use std::sync::Arc;
 /// The code in this example shows how to:
 ///
 /// 1. [`simple_expr_to_sql_demo`]: Create a simple expression [`Exprs`] with
-/// fluent API and convert to sql suitable for passing to another database
+///    fluent API and convert to sql suitable for passing to another database
 ///
 /// 2. [`simple_expr_to_pretty_sql_demo`] Create a simple expression
-/// [`Exprs`] with fluent API and convert to sql without extra parentheses,
-/// suitable for displaying to humans
+///    [`Exprs`] with fluent API and convert to sql without extra parentheses,
+///    suitable for displaying to humans
 ///
 /// 3. [`simple_expr_to_sql_demo_escape_mysql_style`]" Create a simple
-/// expression [`Exprs`] with fluent API and convert to sql escaping column
-/// names in MySQL style.
+///    expression [`Exprs`] with fluent API and convert to sql escaping column
+///    names in MySQL style.
 ///
 /// 4. [`simple_plan_to_sql_demo`]: Create a simple logical plan using the
-/// DataFrames API and convert to sql string.
+///    DataFrames API and convert to sql string.
 ///
 /// 5. [`round_trip_plan_to_sql_demo`]: Create a logical plan from a SQL string, modify it using the
-/// DataFrames API and convert it back to a  sql string.
+///    DataFrames API and convert it back to a  sql string.
 ///
 /// 6. [`unparse_my_logical_plan_as_statement`]: Create a custom logical plan and unparse it as a statement.
 ///
 /// 7. [`unparse_my_logical_plan_as_subquery`]: Create a custom logical plan and unparse it as a subquery.
-
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn plan_to_sql_examples() -> Result<()> {
     // See how to evaluate expressions
     simple_expr_to_sql_demo()?;
     simple_expr_to_pretty_sql_demo()?;
@@ -114,21 +117,21 @@ fn simple_expr_to_sql_demo_escape_mysql_style() -> Result<()> {
 async fn simple_plan_to_sql_demo() -> Result<()> {
     let ctx = SessionContext::new();
 
-    let testdata = datafusion::test_util::parquet_test_data();
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let df = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?
-        .select_columns(&["id", "int_col", "double_col", "date_string_col"])?;
+        .select_columns(&["car", "speed", "time"])?;
 
     // Convert the data frame to a SQL string
     let sql = plan_to_sql(df.logical_plan())?.to_string();
 
     assert_eq!(
         sql,
-        r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""#
+        r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""#
     );
 
     Ok(())
@@ -139,35 +142,35 @@ async fn simple_plan_to_sql_demo() -> Result<()> {
 async fn round_trip_plan_to_sql_demo() -> Result<()> {
     let ctx = SessionContext::new();
 
-    let testdata = datafusion::test_util::parquet_test_data();
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
 
     // register parquet file with the execution context
     ctx.register_parquet(
-        "alltypes_plain",
-        &format!("{testdata}/alltypes_plain.parquet"),
+        "cars",
+        parquet_temp.path_str()?,
         ParquetReadOptions::default(),
     )
     .await?;
 
     // create a logical plan from a SQL string and then programmatically add new filters
+    // select car, speed, time from cars where speed > 1 and car = 'red'
     let df = ctx
         // Use SQL to read some data from the parquet file
-        .sql(
-            "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
-        FROM alltypes_plain",
-        )
+        .sql("SELECT car, speed, time FROM cars")
         .await?
-        // Add id > 1 and tinyint_col < double_col filter
+        // Add speed > 1 and car = 'red' filter
         .filter(
-            col("id")
+            col("speed")
                 .gt(lit(1))
-                .and(col("tinyint_col").lt(col("double_col"))),
+                .and(col("car").eq(lit(ScalarValue::Utf8(Some("red".to_string()))))),
         )?;
 
     let sql = plan_to_sql(df.logical_plan())?.to_string();
     assert_eq!(
         sql,
-        r#"SELECT alltypes_plain.int_col, alltypes_plain.double_col, CAST(alltypes_plain.date_string_col AS VARCHAR) FROM alltypes_plain WHERE ((alltypes_plain.id > 1) AND (alltypes_plain.tinyint_col < alltypes_plain.double_col))"#
+        r#"SELECT cars.car, cars.speed, cars."time" FROM cars WHERE ((cars.speed > 1) AND (cars.car = 'red'))"#
     );
 
     Ok(())
@@ -211,6 +214,7 @@ impl UserDefinedLogicalNodeCore for MyLogicalPlan {
 }
 
 struct PlanToStatement {}
+
 impl UserDefinedLogicalNodeUnparser for PlanToStatement {
     fn unparse_to_statement(
         &self,
@@ -231,14 +235,15 @@ impl UserDefinedLogicalNodeUnparser for PlanToStatement {
 /// It can be unparse as a statement that reads from the same parquet file.
 async fn unparse_my_logical_plan_as_statement() -> Result<()> {
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let inner_plan = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?
-        .select_columns(&["id", "int_col", "double_col", "date_string_col"])?
+        .select_columns(&["car", "speed", "time"])?
         .into_unoptimized_plan();
 
     let node = Arc::new(MyLogicalPlan { input: inner_plan });
@@ -249,7 +254,7 @@ async fn unparse_my_logical_plan_as_statement() -> Result<()> {
     let sql = unparser.plan_to_sql(&my_plan)?.to_string();
     assert_eq!(
         sql,
-        r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""#
+        r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""#
     );
     Ok(())
 }
@@ -284,14 +289,15 @@ impl UserDefinedLogicalNodeUnparser for PlanToSubquery {
 /// It can be unparse as a subquery that reads from the same parquet file, with some columns projected.
 async fn unparse_my_logical_plan_as_subquery() -> Result<()> {
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let inner_plan = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?
-        .select_columns(&["id", "int_col", "double_col", "date_string_col"])?
+        .select_columns(&["car", "speed", "time"])?
         .into_unoptimized_plan();
 
     let node = Arc::new(MyLogicalPlan { input: inner_plan });
@@ -299,8 +305,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> {
     let my_plan = LogicalPlan::Extension(Extension { node });
     let plan = LogicalPlanBuilder::from(my_plan)
         .project(vec![
-            col("id").alias("my_id"),
-            col("int_col").alias("my_int"),
+            col("car").alias("my_car"),
+            col("speed").alias("my_speed"),
         ])?
         .build()?;
     let unparser =
@@ -308,8 +314,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> {
     let sql = unparser.plan_to_sql(&plan)?.to_string();
     assert_eq!(
         sql,
-        "SELECT \"?table?\".id AS my_id, \"?table?\".int_col AS my_int FROM \
-        (SELECT \"?table?\".id, \"?table?\".int_col, \"?table?\".double_col, \"?table?\".date_string_col FROM \"?table?\")",
+        "SELECT \"?table?\".car AS my_car, \"?table?\".speed AS my_speed FROM \
+        (SELECT \"?table?\".car, \"?table?\".speed, \"?table?\".\"time\" FROM \"?table?\")",
     );
     Ok(())
 }
diff --git a/datafusion-examples/examples/planner_api.rs b/datafusion-examples/examples/query_planning/planner_api.rs
similarity index 86%
rename from datafusion-examples/examples/planner_api.rs
rename to datafusion-examples/examples/query_planning/planner_api.rs
index 55aec7b0108a4..8b2c09f4aecba 100644
--- a/datafusion-examples/examples/planner_api.rs
+++ b/datafusion-examples/examples/query_planning/planner_api.rs
@@ -15,11 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use datafusion::error::Result;
 use datafusion::logical_expr::LogicalPlan;
 use datafusion::physical_plan::displayable;
 use datafusion::physical_planner::DefaultPhysicalPlanner;
 use datafusion::prelude::*;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 
 /// This example demonstrates the process of converting logical plan
 /// into physical execution plans using DataFusion.
@@ -32,29 +35,26 @@ use datafusion::prelude::*;
 /// physical plan:
 /// - Via the combined `create_physical_plan` API.
 /// - Utilizing the analyzer, optimizer, and query planner APIs separately.
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn planner_api() -> Result<()> {
     // Set up a DataFusion context and load a Parquet file
     let ctx = SessionContext::new();
-    let testdata = datafusion::test_util::parquet_test_data();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
     let df = ctx
-        .read_parquet(
-            &format!("{testdata}/alltypes_plain.parquet"),
-            ParquetReadOptions::default(),
-        )
+        .read_parquet(parquet_temp.path_str()?, ParquetReadOptions::default())
         .await?;
 
     // Construct the input logical plan using DataFrame API
     let df = df
         .clone()
-        .select(vec![
-            df.parse_sql_expr("int_col")?,
-            df.parse_sql_expr("double_col")?,
-        ])?
-        .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)?
+        .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])?
+        .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)?
         .aggregate(
-            vec![df.parse_sql_expr("double_col")?],
-            vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?],
+            vec![df.parse_sql_expr("car")?],
+            vec![df.parse_sql_expr("SUM(speed) as sum_speed")?],
         )?
         .limit(0, Some(1))?;
     let logical_plan = df.logical_plan().clone();
diff --git a/datafusion-examples/examples/pruning.rs b/datafusion-examples/examples/query_planning/pruning.rs
similarity index 95%
rename from datafusion-examples/examples/pruning.rs
rename to datafusion-examples/examples/query_planning/pruning.rs
index b2d2fa13b7ed2..33f3f8428a77f 100644
--- a/datafusion-examples/examples/pruning.rs
+++ b/datafusion-examples/examples/query_planning/pruning.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::collections::HashSet;
 use std::sync::Arc;
 
@@ -22,6 +24,7 @@ use arrow::array::{ArrayRef, BooleanArray, Int32Array};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::common::pruning::PruningStatistics;
 use datafusion::common::{DFSchema, ScalarValue};
+use datafusion::error::Result;
 use datafusion::execution::context::ExecutionProps;
 use datafusion::physical_expr::create_physical_expr;
 use datafusion::physical_optimizer::pruning::PruningPredicate;
@@ -40,8 +43,7 @@ use datafusion::prelude::*;
 /// one might do as part of a higher level storage engine. See
 /// `parquet_index.rs` for an example that uses pruning in the context of an
 /// individual query.
-#[tokio::main]
-async fn main() {
+pub async fn pruning() -> Result<()> {
     // In this example, we'll use the PruningPredicate to determine if
     // the expression `x = 5 AND y = 10` can never be true based on statistics
 
@@ -69,7 +71,7 @@ async fn main() {
     let predicate = create_pruning_predicate(expr, &my_catalog.schema);
 
     // Evaluate the predicate for the three files in the catalog
-    let prune_results = predicate.prune(&my_catalog).unwrap();
+    let prune_results = predicate.prune(&my_catalog)?;
     println!("Pruning results: {prune_results:?}");
 
     // The result is a `Vec` of bool values, one for each file in the catalog
@@ -93,6 +95,8 @@ async fn main() {
             false
         ]
     );
+
+    Ok(())
 }
 
 /// A simple model catalog that has information about the three files that store
@@ -186,11 +190,12 @@ impl PruningStatistics for MyCatalog {
     }
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn create_pruning_predicate(expr: Expr, schema: &SchemaRef) -> PruningPredicate {
-    let df_schema = DFSchema::try_from(schema.as_ref().clone()).unwrap();
+    let df_schema = DFSchema::try_from(Arc::clone(schema)).unwrap();
     let props = ExecutionProps::new();
     let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
-    PruningPredicate::try_new(physical_expr, schema.clone()).unwrap()
+    PruningPredicate::try_new(physical_expr, Arc::clone(schema)).unwrap()
 }
 
 fn i32_array<'a>(values: impl Iterator<Item = &'a Option<i32>>) -> ArrayRef {
diff --git a/datafusion-examples/examples/query_planning/thread_pools.rs b/datafusion-examples/examples/query_planning/thread_pools.rs
new file mode 100644
index 0000000000000..2ff73a77c4024
--- /dev/null
+++ b/datafusion-examples/examples/query_planning/thread_pools.rs
@@ -0,0 +1,355 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example shows how to use separate thread pools (tokio [`Runtime`]))s to
+//! run the IO and CPU intensive parts of DataFusion plans.
+//!
+//! # Background
+//!
+//! DataFusion, by default, plans and executes all operations (both CPU and IO)
+//! on the same thread pool. This makes it fast and easy to get started, but
+//! can cause issues when running at scale, especially when fetching and operating
+//! on data directly from remote sources.
+//!
+//! Specifically, without configuration such as in this example, DataFusion
+//! plans and executes everything the same thread pool (Tokio Runtime), including
+//! any I/O, such as reading Parquet files from remote object storage
+//! (e.g. AWS S3), catalog access, and CPU intensive work. Running this diverse
+//! workload can lead to issues described in the [Architecture section] such as
+//! throttled network bandwidth (due to congestion control) and increased
+//! latencies or timeouts while processing network messages.
+//!
+//! [Architecture section]: https://docs.rs/datafusion/latest/datafusion/index.html#thread-scheduling-cpu--io-thread-pools-and-tokio-runtimes
+
+use std::sync::Arc;
+
+use arrow::util::pretty::pretty_format_batches;
+use datafusion::common::runtime::JoinSet;
+use datafusion::error::Result;
+use datafusion::execution::SendableRecordBatchStream;
+use datafusion::prelude::*;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
+use futures::stream::StreamExt;
+use object_store::client::SpawnedReqwestConnector;
+use object_store::http::HttpBuilder;
+use tokio::runtime::Handle;
+use tokio::sync::Notify;
+use url::Url;
+
+/// Normally, you don't need to worry about the details of the tokio
+/// [`Runtime`], but for this example it is important to understand how the
+/// [`Runtime`]s work.
+///
+/// Each thread has "current" runtime that is installed in a thread local
+/// variable which is used by the `tokio::spawn` function.
+///
+/// The `#[tokio::main]` macro  creates a [`Runtime`] and installs it as
+/// as the "current" runtime in a thread local variable, on which any `async`
+/// [`Future`], [`Stream]`s and [`Task]`s are run.
+///
+/// This example uses the runtime created by [`tokio::main`] to do I/O and spawn
+/// CPU intensive tasks on a separate [`Runtime`], mirroring the common pattern
+/// when using Rust libraries such as `tonic`. Using a separate `Runtime` for
+/// CPU bound tasks will often be simpler in larger applications, even though it
+/// makes this example slightly more complex.
+pub async fn thread_pools() -> Result<()> {
+    // The first two examples read local files. Enabling the URL table feature
+    // lets us treat filenames as tables in SQL.
+    let ctx = SessionContext::new().enable_url_table();
+
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
+
+    let sql = format!("SELECT * FROM '{}'", parquet_temp.path_str()?);
+
+    // Run a query on the current runtime. Calling `await` means the future
+    // (in this case the `async` function and all spawned work in DataFusion
+    // plans) on the current runtime.
+    same_runtime(&ctx, &sql).await?;
+
+    // Run the same query but this time on a different runtime.
+    //
+    // Since we call `await` here, the `async` function itself runs on the
+    // current runtime, but internally `different_runtime_basic` executes the
+    // DataFusion plan on a different Runtime.
+    different_runtime_basic(ctx, sql).await?;
+
+    // Run the same query on a different runtime, including remote IO.
+    //
+    // NOTE: This is best practice for production systems
+    different_runtime_advanced().await?;
+
+    Ok(())
+}
+
+/// Run queries directly on the current tokio `Runtime`
+///
+/// This is how most examples in DataFusion are written and works well for
+/// development, local query processing, and non latency sensitive workloads.
+async fn same_runtime(ctx: &SessionContext, sql: &str) -> Result<()> {
+    // Calling .sql is an async function as it may also do network
+    // I/O, for example to contact a remote catalog or do an object store LIST
+    let df = ctx.sql(sql).await?;
+
+    // While many examples call `collect` or `show()`, those methods buffers the
+    // results. Internally DataFusion generates output a RecordBatch at a time
+
+    // Calling `execute_stream` return a `SendableRecordBatchStream`. Depending
+    // on the plan, this may also do network I/O, for example to begin reading a
+    // parquet file from a remote object store.
+    let mut stream: SendableRecordBatchStream = df.execute_stream().await?;
+
+    // `next()` drives the plan, incrementally producing new `RecordBatch`es
+    // using the current runtime.
+    //
+    // Perhaps somewhat non obviously, calling `next()` can also result in other
+    // tasks being spawned on the current runtime (e.g. for `RepartitionExec` to
+    // read data from each of its input partitions in parallel).
+    //
+    // Executing the plan using this pattern intermixes any IO and CPU intensive
+    // work on same Runtime
+    while let Some(batch) = stream.next().await {
+        println!("{}", pretty_format_batches(&[batch?])?);
+    }
+    Ok(())
+}
+
+/// Run queries on a **different** Runtime dedicated for CPU bound work
+///
+/// This example is suitable for running DataFusion plans against local data
+/// sources (e.g. files) and returning results to an async destination, as might
+/// be done to return query results to a remote client.
+///
+/// Production systems which also read data locally or require very low latency
+/// should follow the recommendations on [`different_runtime_advanced`] when
+/// processing data from a remote source such as object storage.
+async fn different_runtime_basic(ctx: SessionContext, sql: String) -> Result<()> {
+    // Since we are already in the context of runtime (installed by
+    // #[tokio::main]), we need a new Runtime (threadpool) for CPU bound tasks
+    let cpu_runtime = CpuRuntime::try_new()?;
+
+    // Prepare a task that runs the plan on cpu_runtime and sends
+    // the results back to the original runtime via a channel.
+    let (tx, mut rx) = tokio::sync::mpsc::channel(2);
+    let driver_task = async move {
+        // Plan the query (which might require CPU work to evaluate statistics)
+        let df = ctx.sql(&sql).await?;
+        let mut stream: SendableRecordBatchStream = df.execute_stream().await?;
+
+        // Calling `next()` to drive the plan in this task drives the
+        // execution from the cpu runtime the other thread pool
+        //
+        // NOTE any IO run by this plan (for example, reading from an
+        // `ObjectStore`) will be done on this new thread pool as well.
+        while let Some(batch) = stream.next().await {
+            if tx.send(batch).await.is_err() {
+                // error means dropped receiver, so nothing will get results anymore
+                return Ok(());
+            }
+        }
+        Ok(()) as Result<()>
+    };
+
+    // Run the driver task on the cpu runtime. Use a JoinSet to
+    // ensure the spawned task is canceled on error/drop
+    let mut join_set = JoinSet::new();
+    join_set.spawn_on(driver_task, cpu_runtime.handle());
+
+    // Retrieve the results in the original (IO) runtime. This requires only
+    // minimal work (pass pointers around).
+    while let Some(batch) = rx.recv().await {
+        println!("{}", pretty_format_batches(&[batch?])?);
+    }
+
+    // wait for completion of the driver task
+    drain_join_set(join_set).await;
+
+    Ok(())
+}
+
+/// Run CPU intensive work on a different runtime but do IO operations (object
+/// store access) on the current runtime.
+async fn different_runtime_advanced() -> Result<()> {
+    // In this example, we will query a file via https, reading
+    // the data directly from the plan
+
+    // The current runtime (created by tokio::main) is used for IO
+    //
+    // Note this handle should be used for *ALL* remote IO operations in your
+    // systems, including remote catalog access, which is not included in this
+    // example.
+    let cpu_runtime = CpuRuntime::try_new()?;
+    let io_handle = Handle::current();
+
+    let ctx = SessionContext::new();
+
+    // By default, the HttpStore use the same runtime that calls `await` for IO
+    // operations. This means that if the DataFusion plan is called from the
+    // cpu_runtime,  the HttpStore IO operations will *also* run on the CPU
+    // runtime, which will error.
+    //
+    // To avoid this, we use a `SpawnedReqwestConnector` to configure the
+    // `ObjectStore` to run the HTTP requests on the IO runtime.
+    let base_url = Url::parse("https://github.com").unwrap();
+    let http_store = HttpBuilder::new()
+        .with_url(base_url.clone())
+        // Use the io_runtime to run the HTTP requests. Without this line,
+        // you will see an error such as:
+        // A Tokio 1.x context was found, but IO is disabled.
+        .with_http_connector(SpawnedReqwestConnector::new(io_handle))
+        .build()?;
+
+    // Tell DataFusion to process `http://` urls with this wrapped object store
+    ctx.register_object_store(&base_url, Arc::new(http_store));
+
+    // As above, plan and execute the query on the cpu runtime.
+    let (tx, mut rx) = tokio::sync::mpsc::channel(2);
+    let driver_task = async move {
+        // Plan / execute the query
+        let url = "https://github.com/apache/arrow-testing/raw/master/data/csv/aggregate_test_100.csv";
+        let df = ctx
+            .sql(&format!("SELECT c1,c2,c3 FROM '{url}' LIMIT 5"))
+            .await?;
+
+        let mut stream: SendableRecordBatchStream = df.execute_stream().await?;
+
+        // Note you can do other non trivial CPU work on the results of the
+        // stream before sending it back to the original runtime. For example,
+        // calling a FlightDataEncoder to convert the results to flight messages
+        // to send over the network
+
+        // send results, as above
+        while let Some(batch) = stream.next().await {
+            if tx.send(batch).await.is_err() {
+                return Ok(());
+            }
+        }
+        Ok(()) as Result<()>
+    };
+
+    let mut join_set = JoinSet::new();
+    join_set.spawn_on(driver_task, cpu_runtime.handle());
+    while let Some(batch) = rx.recv().await {
+        println!("{}", pretty_format_batches(&[batch?])?);
+    }
+
+    Ok(())
+}
+
+/// Waits for all tasks in the JoinSet to complete and reports any errors that
+/// occurred.
+///
+/// If we don't do this, any errors that occur in the task (such as IO errors)
+/// are not reported.
+async fn drain_join_set(mut join_set: JoinSet<Result<()>>) {
+    // retrieve any errors from the tasks
+    while let Some(result) = join_set.join_next().await {
+        match result {
+            Ok(Ok(())) => {}                             // task completed successfully
+            Ok(Err(e)) => eprintln!("Task failed: {e}"), // task failed
+            Err(e) => eprintln!("JoinSet error: {e}"),   // JoinSet error
+        }
+    }
+}
+
+/// Creates a Tokio [`Runtime`] for use with CPU bound tasks
+///
+/// Tokio forbids dropping `Runtime`s in async contexts, so creating a separate
+/// `Runtime` correctly is somewhat tricky. This structure manages the creation
+/// and shutdown of a separate thread.
+///
+/// # Notes
+/// On drop, the thread will wait for all remaining tasks to complete.
+///
+/// Depending on your application, more sophisticated shutdown logic may be
+/// required, such as ensuring that no new tasks are added to the runtime.
+///
+/// # Credits
+/// This code is derived from code originally written for [InfluxDB 3.0]
+///
+/// [InfluxDB 3.0]: https://github.com/influxdata/influxdb3_core/tree/6fcbb004232738d55655f32f4ad2385523d10696/executor
+struct CpuRuntime {
+    /// Handle is the tokio structure for interacting with a Runtime.
+    handle: Handle,
+    /// Signal to start shutting down
+    notify_shutdown: Arc<Notify>,
+    /// When thread is active, is Some
+    thread_join_handle: Option<std::thread::JoinHandle<()>>,
+}
+
+impl Drop for CpuRuntime {
+    fn drop(&mut self) {
+        // Notify the thread to shutdown.
+        self.notify_shutdown.notify_one();
+        // In a production system you also need to ensure your code stops adding
+        // new tasks to the underlying runtime after this point to allow the
+        // thread to complete its work and exit cleanly.
+        if let Some(thread_join_handle) = self.thread_join_handle.take() {
+            // If the thread is still running, we wait for it to finish
+            print!("Shutting down CPU runtime thread...");
+            if let Err(e) = thread_join_handle.join() {
+                eprintln!("Error joining CPU runtime thread: {e:?}",);
+            } else {
+                println!("CPU runtime thread shutdown successfully.");
+            }
+        }
+    }
+}
+
+impl CpuRuntime {
+    /// Create a new Tokio Runtime for CPU bound tasks
+    pub fn try_new() -> Result<Self> {
+        let cpu_runtime = tokio::runtime::Builder::new_multi_thread()
+            .enable_time()
+            .build()?;
+        let handle = cpu_runtime.handle().clone();
+        let notify_shutdown = Arc::new(Notify::new());
+        let notify_shutdown_captured = Arc::clone(&notify_shutdown);
+
+        // The cpu_runtime runs and is dropped on a separate thread
+        let thread_join_handle = std::thread::spawn(move || {
+            cpu_runtime.block_on(async move {
+                notify_shutdown_captured.notified().await;
+            });
+            // Note: cpu_runtime is dropped here, which will wait for all tasks
+            // to complete
+        });
+
+        Ok(Self {
+            handle,
+            notify_shutdown,
+            thread_join_handle: Some(thread_join_handle),
+        })
+    }
+
+    /// Return a handle suitable for spawning CPU bound tasks
+    ///
+    /// # Notes
+    ///
+    /// If a task spawned on this handle attempts to do IO, it will error with a
+    /// message such as:
+    ///
+    /// ```text
+    /// A Tokio 1.x context was found, but IO is disabled.
+    /// ```
+    pub fn handle(&self) -> &Handle {
+        &self.handle
+    }
+}
diff --git a/datafusion-examples/examples/relation_planner/main.rs b/datafusion-examples/examples/relation_planner/main.rs
new file mode 100644
index 0000000000000..babc0d3714f72
--- /dev/null
+++ b/datafusion-examples/examples/relation_planner/main.rs
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # Relation Planner Examples
+//!
+//! These examples demonstrate how to use custom relation planners to extend
+//! DataFusion's SQL syntax with custom table operators.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example relation_planner -- [all|match_recognize|pivot_unpivot|table_sample]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `match_recognize`
+//!   (file: match_recognize.rs, desc: Implement MATCH_RECOGNIZE pattern matching)
+//!
+//! - `pivot_unpivot`
+//!   (file: pivot_unpivot.rs, desc: Implement PIVOT / UNPIVOT)
+//!
+//! - `table_sample`
+//!   (file: table_sample.rs, desc: Implement TABLESAMPLE)
+//!
+//! ## Snapshot Testing
+//!
+//! These examples use [insta](https://insta.rs) for inline snapshot assertions.
+//! If query output changes, regenerate the snapshots with:
+//! ```bash
+//! cargo insta test --example relation_planner --accept
+//! ```
+
+mod match_recognize;
+mod pivot_unpivot;
+mod table_sample;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    MatchRecognize,
+    PivotUnpivot,
+    TableSample,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "relation_planner";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::MatchRecognize => match_recognize::match_recognize().await?,
+            ExampleKind::PivotUnpivot => pivot_unpivot::pivot_unpivot().await?,
+            ExampleKind::TableSample => table_sample::table_sample().await?,
+        }
+
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
+
+/// Test wrappers that enable `cargo insta test --example relation_planner --accept`
+/// to regenerate inline snapshots. Without these, insta cannot run the examples
+/// in test mode since they only have `main()` functions.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_match_recognize() {
+        match_recognize::match_recognize().await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_pivot_unpivot() {
+        pivot_unpivot::pivot_unpivot().await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_table_sample() {
+        table_sample::table_sample().await.unwrap();
+    }
+}
diff --git a/datafusion-examples/examples/relation_planner/match_recognize.rs b/datafusion-examples/examples/relation_planner/match_recognize.rs
new file mode 100644
index 0000000000000..c4b3d522efc17
--- /dev/null
+++ b/datafusion-examples/examples/relation_planner/match_recognize.rs
@@ -0,0 +1,408 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # MATCH_RECOGNIZE Example
+//!
+//! This example demonstrates implementing SQL `MATCH_RECOGNIZE` pattern matching
+//! using a custom [`RelationPlanner`]. Unlike the [`pivot_unpivot`] example that
+//! rewrites SQL to standard operations, this example creates a **custom logical
+//! plan node** (`MiniMatchRecognizeNode`) to represent the operation.
+//!
+//! ## Supported Syntax
+//!
+//! ```sql
+//! SELECT * FROM events
+//!   MATCH_RECOGNIZE (
+//!     PARTITION BY region
+//!     MEASURES SUM(price) AS total, AVG(price) AS average
+//!     PATTERN (A B+ C)
+//!     DEFINE
+//!       A AS price < 100,
+//!       B AS price BETWEEN 100 AND 200,
+//!       C AS price > 200
+//!   ) AS matches
+//! ```
+//!
+//! ## Architecture
+//!
+//! This example demonstrates **logical planning only**. Physical execution would
+//! require implementing an [`ExecutionPlan`] (see the [`table_sample`] example
+//! for a complete implementation with physical planning).
+//!
+//! ```text
+//! SQL Query
+//!     │
+//!     ▼
+//! ┌─────────────────────────────────────┐
+//! │ MatchRecognizePlanner               │
+//! │ (RelationPlanner trait)             │
+//! │                                     │
+//! │ • Parses MATCH_RECOGNIZE syntax     │
+//! │ • Creates MiniMatchRecognizeNode    │
+//! │ • Converts SQL exprs to DataFusion  │
+//! └─────────────────────────────────────┘
+//!     │
+//!     ▼
+//! ┌─────────────────────────────────────┐
+//! │ MiniMatchRecognizeNode              │
+//! │ (UserDefinedLogicalNode)            │
+//! │                                     │
+//! │ • measures: [(alias, expr), ...]    │
+//! │ • definitions: [(symbol, expr), ...]│
+//! └─────────────────────────────────────┘
+//! ```
+//!
+//! [`pivot_unpivot`]: super::pivot_unpivot
+//! [`table_sample`]: super::table_sample
+//! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan
+
+use std::{any::Any, cmp::Ordering, hash::Hasher, sync::Arc};
+
+use arrow::array::{ArrayRef, Float64Array, Int32Array, StringArray};
+use arrow::record_batch::RecordBatch;
+use datafusion::prelude::*;
+use datafusion_common::{DFSchemaRef, Result};
+use datafusion_expr::{
+    Expr, UserDefinedLogicalNode,
+    logical_plan::{Extension, InvariantLevel, LogicalPlan},
+    planner::{
+        PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+    },
+};
+use datafusion_sql::sqlparser::ast::TableFactor;
+use insta::assert_snapshot;
+
+// ============================================================================
+// Example Entry Point
+// ============================================================================
+
+/// Runs the MATCH_RECOGNIZE examples demonstrating pattern matching on event streams.
+///
+/// Note: This example demonstrates **logical planning only**. Physical execution
+/// would require additional implementation of an [`ExecutionPlan`].
+pub async fn match_recognize() -> Result<()> {
+    let ctx = SessionContext::new();
+    ctx.register_relation_planner(Arc::new(MatchRecognizePlanner))?;
+    register_sample_data(&ctx)?;
+
+    println!("MATCH_RECOGNIZE Example (Logical Planning Only)");
+    println!("================================================\n");
+
+    run_examples(&ctx).await
+}
+
+async fn run_examples(ctx: &SessionContext) -> Result<()> {
+    // Example 1: Basic MATCH_RECOGNIZE with MEASURES and DEFINE
+    // Demonstrates: Aggregate measures over matched rows
+    let plan = run_example(
+        ctx,
+        "Example 1: MATCH_RECOGNIZE with aggregations",
+        r#"SELECT * FROM events
+           MATCH_RECOGNIZE (
+             PARTITION BY 1
+             MEASURES SUM(price) AS total_price, AVG(price) AS avg_price
+             PATTERN (A)
+             DEFINE A AS price > 10
+           ) AS matches"#,
+    )
+    .await?;
+    assert_snapshot!(plan, @r"
+    Projection: matches.price
+      SubqueryAlias: matches
+        MiniMatchRecognize measures=[total_price := sum(events.price), avg_price := avg(events.price)] define=[a := events.price > Int64(10)]
+          TableScan: events
+    ");
+
+    // Example 2: Stock price pattern detection
+    // Demonstrates: Real-world use case finding prices above threshold
+    let plan = run_example(
+        ctx,
+        "Example 2: Detect high stock prices",
+        r#"SELECT * FROM stock_prices
+           MATCH_RECOGNIZE (
+             MEASURES
+               MIN(price) AS min_price,
+               MAX(price) AS max_price,
+               AVG(price) AS avg_price
+             PATTERN (HIGH)
+             DEFINE HIGH AS price > 151.0
+           ) AS trends"#,
+    )
+    .await?;
+    assert_snapshot!(plan, @r"
+    Projection: trends.symbol, trends.price
+      SubqueryAlias: trends
+        MiniMatchRecognize measures=[min_price := min(stock_prices.price), max_price := max(stock_prices.price), avg_price := avg(stock_prices.price)] define=[high := stock_prices.price > Float64(151)]
+          TableScan: stock_prices
+    ");
+
+    Ok(())
+}
+
+/// Helper to run a single example query and display the logical plan.
+async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> {
+    println!("{title}:\n{sql}\n");
+    let plan = ctx.sql(sql).await?.into_unoptimized_plan();
+    let plan_str = plan.display_indent().to_string();
+    println!("{plan_str}\n");
+    Ok(plan_str)
+}
+
+/// Register test data tables.
+fn register_sample_data(ctx: &SessionContext) -> Result<()> {
+    // events: simple price series
+    ctx.register_batch(
+        "events",
+        RecordBatch::try_from_iter(vec![(
+            "price",
+            Arc::new(Int32Array::from(vec![5, 12, 8, 15, 20])) as ArrayRef,
+        )])?,
+    )?;
+
+    // stock_prices: realistic stock data
+    ctx.register_batch(
+        "stock_prices",
+        RecordBatch::try_from_iter(vec![
+            (
+                "symbol",
+                Arc::new(StringArray::from(vec!["DDOG", "DDOG", "DDOG", "DDOG"]))
+                    as ArrayRef,
+            ),
+            (
+                "price",
+                Arc::new(Float64Array::from(vec![150.0, 155.0, 152.0, 158.0])),
+            ),
+        ])?,
+    )?;
+
+    Ok(())
+}
+
+// ============================================================================
+// Logical Plan Node: MiniMatchRecognizeNode
+// ============================================================================
+
+/// A custom logical plan node representing MATCH_RECOGNIZE operations.
+///
+/// This is a simplified implementation that captures the essential structure:
+/// - `measures`: Aggregate expressions computed over matched rows
+/// - `definitions`: Symbol definitions (predicate expressions)
+///
+/// A production implementation would also include:
+/// - Pattern specification (regex-like pattern)
+/// - Partition and order by clauses
+/// - Output mode (ONE ROW PER MATCH, ALL ROWS PER MATCH)
+/// - After match skip strategy
+#[derive(Debug)]
+struct MiniMatchRecognizeNode {
+    input: Arc<LogicalPlan>,
+    schema: DFSchemaRef,
+    /// Measures: (alias, aggregate_expr)
+    measures: Vec<(String, Expr)>,
+    /// Symbol definitions: (symbol_name, predicate_expr)
+    definitions: Vec<(String, Expr)>,
+}
+
+impl UserDefinedLogicalNode for MiniMatchRecognizeNode {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "MiniMatchRecognize"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        &self.schema
+    }
+
+    fn check_invariants(&self, _check: InvariantLevel) -> Result<()> {
+        Ok(())
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        self.measures
+            .iter()
+            .chain(&self.definitions)
+            .map(|(_, expr)| expr.clone())
+            .collect()
+    }
+
+    fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "MiniMatchRecognize")?;
+
+        if !self.measures.is_empty() {
+            write!(f, " measures=[")?;
+            for (i, (alias, expr)) in self.measures.iter().enumerate() {
+                if i > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{alias} := {expr}")?;
+            }
+            write!(f, "]")?;
+        }
+
+        if !self.definitions.is_empty() {
+            write!(f, " define=[")?;
+            for (i, (symbol, expr)) in self.definitions.iter().enumerate() {
+                if i > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{symbol} := {expr}")?;
+            }
+            write!(f, "]")?;
+        }
+
+        Ok(())
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        exprs: Vec<Expr>,
+        inputs: Vec<LogicalPlan>,
+    ) -> Result<Arc<dyn UserDefinedLogicalNode>> {
+        let expected_len = self.measures.len() + self.definitions.len();
+        if exprs.len() != expected_len {
+            return Err(datafusion_common::plan_datafusion_err!(
+                "MiniMatchRecognize: expected {expected_len} expressions, got {}",
+                exprs.len()
+            ));
+        }
+
+        let input = inputs.into_iter().next().ok_or_else(|| {
+            datafusion_common::plan_datafusion_err!(
+                "MiniMatchRecognize requires exactly one input"
+            )
+        })?;
+
+        let (measure_exprs, definition_exprs) = exprs.split_at(self.measures.len());
+
+        let measures = self
+            .measures
+            .iter()
+            .zip(measure_exprs)
+            .map(|((alias, _), expr)| (alias.clone(), expr.clone()))
+            .collect();
+
+        let definitions = self
+            .definitions
+            .iter()
+            .zip(definition_exprs)
+            .map(|((symbol, _), expr)| (symbol.clone(), expr.clone()))
+            .collect();
+
+        Ok(Arc::new(Self {
+            input: Arc::new(input),
+            schema: Arc::clone(&self.schema),
+            measures,
+            definitions,
+        }))
+    }
+
+    fn dyn_hash(&self, state: &mut dyn Hasher) {
+        state.write_usize(Arc::as_ptr(&self.input) as usize);
+        state.write_usize(self.measures.len());
+        state.write_usize(self.definitions.len());
+    }
+
+    fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool {
+        other.as_any().downcast_ref::<Self>().is_some_and(|o| {
+            Arc::ptr_eq(&self.input, &o.input)
+                && self.measures == o.measures
+                && self.definitions == o.definitions
+        })
+    }
+
+    fn dyn_ord(&self, other: &dyn UserDefinedLogicalNode) -> Option<Ordering> {
+        if self.dyn_eq(other) {
+            Some(Ordering::Equal)
+        } else {
+            None
+        }
+    }
+}
+
+// ============================================================================
+// Relation Planner: MatchRecognizePlanner
+// ============================================================================
+
+/// Relation planner that creates `MiniMatchRecognizeNode` for MATCH_RECOGNIZE queries.
+#[derive(Debug)]
+struct MatchRecognizePlanner;
+
+impl RelationPlanner for MatchRecognizePlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        ctx: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        let TableFactor::MatchRecognize {
+            table,
+            measures,
+            symbols,
+            alias,
+            ..
+        } = relation
+        else {
+            return Ok(RelationPlanning::Original(Box::new(relation)));
+        };
+
+        // Plan the input table
+        let input = ctx.plan(*table)?;
+        let schema = input.schema().clone();
+
+        // Convert MEASURES: SQL expressions → DataFusion expressions
+        let planned_measures: Vec<(String, Expr)> = measures
+            .iter()
+            .map(|m| {
+                let alias = ctx.normalize_ident(m.alias.clone());
+                let expr = ctx.sql_to_expr(m.expr.clone(), schema.as_ref())?;
+                Ok((alias, expr))
+            })
+            .collect::<Result<_>>()?;
+
+        // Convert DEFINE: symbol definitions → DataFusion expressions
+        let planned_definitions: Vec<(String, Expr)> = symbols
+            .iter()
+            .map(|s| {
+                let name = ctx.normalize_ident(s.symbol.clone());
+                let expr = ctx.sql_to_expr(s.definition.clone(), schema.as_ref())?;
+                Ok((name, expr))
+            })
+            .collect::<Result<_>>()?;
+
+        // Create the custom node
+        let node = MiniMatchRecognizeNode {
+            input: Arc::new(input),
+            schema,
+            measures: planned_measures,
+            definitions: planned_definitions,
+        };
+
+        let plan = LogicalPlan::Extension(Extension {
+            node: Arc::new(node),
+        });
+
+        Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+            plan, alias,
+        ))))
+    }
+}
diff --git a/datafusion-examples/examples/relation_planner/pivot_unpivot.rs b/datafusion-examples/examples/relation_planner/pivot_unpivot.rs
new file mode 100644
index 0000000000000..2e1696956bf62
--- /dev/null
+++ b/datafusion-examples/examples/relation_planner/pivot_unpivot.rs
@@ -0,0 +1,571 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # PIVOT and UNPIVOT Example
+//!
+//! This example demonstrates implementing SQL `PIVOT` and `UNPIVOT` operations
+//! using a custom [`RelationPlanner`]. Unlike the other examples that create
+//! custom logical/physical nodes, this example shows how to **rewrite** SQL
+//! constructs into equivalent standard SQL operations:
+//!
+//! ## Supported Syntax
+//!
+//! ```sql
+//! -- PIVOT: Transform rows into columns
+//! SELECT * FROM sales
+//!   PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2', 'Q3', 'Q4'))
+//!
+//! -- UNPIVOT: Transform columns into rows
+//! SELECT * FROM wide_table
+//!   UNPIVOT (value FOR name IN (col1, col2, col3))
+//! ```
+//!
+//! ## Rewrite Strategy
+//!
+//! **PIVOT** is rewritten to `GROUP BY` with `CASE` expressions:
+//! ```sql
+//! -- Original:
+//! SELECT * FROM sales PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2'))
+//!
+//! -- Rewritten to:
+//! SELECT region,
+//!        SUM(CASE quarter WHEN 'Q1' THEN amount END) AS Q1,
+//!        SUM(CASE quarter WHEN 'Q2' THEN amount END) AS Q2
+//! FROM sales
+//! GROUP BY region
+//! ```
+//!
+//! **UNPIVOT** is rewritten to `UNION ALL` of projections:
+//! ```sql
+//! -- Original:
+//! SELECT * FROM wide UNPIVOT (sales FOR quarter IN (q1, q2))
+//!
+//! -- Rewritten to:
+//! SELECT region, 'q1' AS quarter, q1 AS sales FROM wide
+//! UNION ALL
+//! SELECT region, 'q2' AS quarter, q2 AS sales FROM wide
+//! ```
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array, StringArray};
+use arrow::record_batch::RecordBatch;
+use datafusion::prelude::*;
+use datafusion_common::{Result, ScalarValue, plan_datafusion_err};
+use datafusion_expr::{
+    Expr, case, col, lit,
+    logical_plan::builder::LogicalPlanBuilder,
+    planner::{
+        PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+    },
+};
+use datafusion_sql::sqlparser::ast::{NullInclusion, PivotValueSource, TableFactor};
+use insta::assert_snapshot;
+
+// ============================================================================
+// Example Entry Point
+// ============================================================================
+
+/// Runs the PIVOT/UNPIVOT examples demonstrating data reshaping operations.
+pub async fn pivot_unpivot() -> Result<()> {
+    let ctx = SessionContext::new();
+    ctx.register_relation_planner(Arc::new(PivotUnpivotPlanner))?;
+    register_sample_data(&ctx)?;
+
+    println!("PIVOT and UNPIVOT Example");
+    println!("=========================\n");
+
+    run_examples(&ctx).await
+}
+
+async fn run_examples(ctx: &SessionContext) -> Result<()> {
+    // ----- PIVOT Examples -----
+
+    // Example 1: Basic PIVOT
+    // Transforms: (region, quarter, amount) → (region, Q1, Q2)
+    let results = run_example(
+        ctx,
+        "Example 1: Basic PIVOT",
+        r#"SELECT * FROM quarterly_sales
+           PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p
+           ORDER BY region"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+------+------+
+    | region | Q1   | Q2   |
+    +--------+------+------+
+    | North  | 1000 | 1500 |
+    | South  | 1200 | 1300 |
+    +--------+------+------+
+    ");
+
+    // Example 2: PIVOT with multiple aggregates
+    // Creates columns for each (aggregate, value) combination
+    let results = run_example(
+        ctx,
+        "Example 2: PIVOT with multiple aggregates",
+        r#"SELECT * FROM quarterly_sales
+           PIVOT (SUM(amount), AVG(amount) FOR quarter IN ('Q1', 'Q2')) AS p
+           ORDER BY region"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+--------+--------+--------+--------+
+    | region | sum_Q1 | sum_Q2 | avg_Q1 | avg_Q2 |
+    +--------+--------+--------+--------+--------+
+    | North  | 1000   | 1500   | 1000.0 | 1500.0 |
+    | South  | 1200   | 1300   | 1200.0 | 1300.0 |
+    +--------+--------+--------+--------+--------+
+    ");
+
+    // Example 3: PIVOT with multiple grouping columns
+    // Non-pivot, non-aggregate columns become GROUP BY columns
+    let results = run_example(
+        ctx,
+        "Example 3: PIVOT with multiple grouping columns",
+        r#"SELECT * FROM product_sales
+           PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p
+           ORDER BY region, product"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+----------+-----+-----+
+    | region | product  | Q1  | Q2  |
+    +--------+----------+-----+-----+
+    | North  | ProductA | 500 |     |
+    | North  | ProductB | 500 |     |
+    | South  | ProductA |     | 650 |
+    +--------+----------+-----+-----+
+    ");
+
+    // ----- UNPIVOT Examples -----
+
+    // Example 4: Basic UNPIVOT
+    // Transforms: (region, q1, q2) → (region, quarter, sales)
+    let results = run_example(
+        ctx,
+        "Example 4: Basic UNPIVOT",
+        r#"SELECT * FROM wide_sales
+           UNPIVOT (sales FOR quarter IN (q1 AS 'Q1', q2 AS 'Q2')) AS u
+           ORDER BY quarter, region"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+---------+-------+
+    | region | quarter | sales |
+    +--------+---------+-------+
+    | North  | Q1      | 1000  |
+    | South  | Q1      | 1200  |
+    | North  | Q2      | 1500  |
+    | South  | Q2      | 1300  |
+    +--------+---------+-------+
+    ");
+
+    // Example 5: UNPIVOT with INCLUDE NULLS
+    // By default, UNPIVOT excludes rows where the value column is NULL.
+    // INCLUDE NULLS keeps them (same result here since no NULLs in data).
+    let results = run_example(
+        ctx,
+        "Example 5: UNPIVOT INCLUDE NULLS",
+        r#"SELECT * FROM wide_sales
+           UNPIVOT INCLUDE NULLS (sales FOR quarter IN (q1 AS 'Q1', q2 AS 'Q2')) AS u
+           ORDER BY quarter, region"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+---------+-------+
+    | region | quarter | sales |
+    +--------+---------+-------+
+    | North  | Q1      | 1000  |
+    | South  | Q1      | 1200  |
+    | North  | Q2      | 1500  |
+    | South  | Q2      | 1300  |
+    +--------+---------+-------+
+    ");
+
+    // Example 6: PIVOT with column projection
+    // Standard SQL operations work seamlessly after PIVOT
+    let results = run_example(
+        ctx,
+        "Example 6: PIVOT with projection",
+        r#"SELECT region FROM quarterly_sales
+           PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2')) AS p
+           ORDER BY region"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +--------+
+    | region |
+    +--------+
+    | North  |
+    | South  |
+    +--------+
+    ");
+
+    Ok(())
+}
+
+/// Helper to run a single example query and capture results.
+async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> {
+    println!("{title}:\n{sql}\n");
+    let df = ctx.sql(sql).await?;
+    println!("{}\n", df.logical_plan().display_indent());
+
+    let batches = df.collect().await?;
+    let results = arrow::util::pretty::pretty_format_batches(&batches)?.to_string();
+    println!("{results}\n");
+
+    Ok(results)
+}
+
+/// Register test data tables.
+fn register_sample_data(ctx: &SessionContext) -> Result<()> {
+    // quarterly_sales: normalized sales data (region, quarter, amount)
+    ctx.register_batch(
+        "quarterly_sales",
+        RecordBatch::try_from_iter(vec![
+            (
+                "region",
+                Arc::new(StringArray::from(vec!["North", "North", "South", "South"]))
+                    as ArrayRef,
+            ),
+            (
+                "quarter",
+                Arc::new(StringArray::from(vec!["Q1", "Q2", "Q1", "Q2"])),
+            ),
+            (
+                "amount",
+                Arc::new(Int64Array::from(vec![1000, 1500, 1200, 1300])),
+            ),
+        ])?,
+    )?;
+
+    // product_sales: sales with additional grouping dimension
+    ctx.register_batch(
+        "product_sales",
+        RecordBatch::try_from_iter(vec![
+            (
+                "region",
+                Arc::new(StringArray::from(vec!["North", "North", "South"])) as ArrayRef,
+            ),
+            (
+                "quarter",
+                Arc::new(StringArray::from(vec!["Q1", "Q1", "Q2"])),
+            ),
+            (
+                "product",
+                Arc::new(StringArray::from(vec!["ProductA", "ProductB", "ProductA"])),
+            ),
+            ("amount", Arc::new(Int64Array::from(vec![500, 500, 650]))),
+        ])?,
+    )?;
+
+    // wide_sales: denormalized/wide format (for UNPIVOT)
+    ctx.register_batch(
+        "wide_sales",
+        RecordBatch::try_from_iter(vec![
+            (
+                "region",
+                Arc::new(StringArray::from(vec!["North", "South"])) as ArrayRef,
+            ),
+            ("q1", Arc::new(Int64Array::from(vec![1000, 1200]))),
+            ("q2", Arc::new(Int64Array::from(vec![1500, 1300]))),
+        ])?,
+    )?;
+
+    Ok(())
+}
+
+// ============================================================================
+// Relation Planner: PivotUnpivotPlanner
+// ============================================================================
+
+/// Relation planner that rewrites PIVOT and UNPIVOT into standard SQL.
+#[derive(Debug)]
+struct PivotUnpivotPlanner;
+
+impl RelationPlanner for PivotUnpivotPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        ctx: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        match relation {
+            TableFactor::Pivot {
+                table,
+                aggregate_functions,
+                value_column,
+                value_source,
+                alias,
+                ..
+            } => plan_pivot(
+                ctx,
+                *table,
+                &aggregate_functions,
+                &value_column,
+                value_source,
+                alias,
+            ),
+
+            TableFactor::Unpivot {
+                table,
+                value,
+                name,
+                columns,
+                null_inclusion,
+                alias,
+            } => plan_unpivot(
+                ctx,
+                *table,
+                &value,
+                name,
+                &columns,
+                null_inclusion.as_ref(),
+                alias,
+            ),
+
+            other => Ok(RelationPlanning::Original(Box::new(other))),
+        }
+    }
+}
+
+// ============================================================================
+// PIVOT Implementation
+// ============================================================================
+
+/// Rewrite PIVOT to GROUP BY with CASE expressions.
+fn plan_pivot(
+    ctx: &mut dyn RelationPlannerContext,
+    table: TableFactor,
+    aggregate_functions: &[datafusion_sql::sqlparser::ast::ExprWithAlias],
+    value_column: &[datafusion_sql::sqlparser::ast::Expr],
+    value_source: PivotValueSource,
+    alias: Option<datafusion_sql::sqlparser::ast::TableAlias>,
+) -> Result<RelationPlanning> {
+    // Plan the input table
+    let input = ctx.plan(table)?;
+    let schema = input.schema();
+
+    // Parse aggregate functions
+    let aggregates: Vec<Expr> = aggregate_functions
+        .iter()
+        .map(|agg| ctx.sql_to_expr(agg.expr.clone(), schema.as_ref()))
+        .collect::<Result<_>>()?;
+
+    // Get the pivot column (only single-column pivot supported)
+    if value_column.len() != 1 {
+        return Err(plan_datafusion_err!(
+            "Only single-column PIVOT is supported"
+        ));
+    }
+    let pivot_col = ctx.sql_to_expr(value_column[0].clone(), schema.as_ref())?;
+    let pivot_col_name = extract_column_name(&pivot_col)?;
+
+    // Parse pivot values
+    let pivot_values = match value_source {
+        PivotValueSource::List(list) => list
+            .iter()
+            .map(|item| {
+                let alias = item
+                    .alias
+                    .as_ref()
+                    .map(|id| ctx.normalize_ident(id.clone()));
+                let expr = ctx.sql_to_expr(item.expr.clone(), schema.as_ref())?;
+                Ok((alias, expr))
+            })
+            .collect::<Result<Vec<_>>>()?,
+        _ => {
+            return Err(plan_datafusion_err!(
+                "Dynamic PIVOT (ANY/Subquery) is not supported"
+            ));
+        }
+    };
+
+    // Determine GROUP BY columns (non-pivot, non-aggregate columns)
+    let agg_input_cols: Vec<&str> = aggregates
+        .iter()
+        .filter_map(|agg| {
+            if let Expr::AggregateFunction(f) = agg {
+                f.params.args.first().and_then(|e| {
+                    if let Expr::Column(c) = e {
+                        Some(c.name.as_str())
+                    } else {
+                        None
+                    }
+                })
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    let group_by_cols: Vec<Expr> = schema
+        .fields()
+        .iter()
+        .map(|f| f.name().as_str())
+        .filter(|name| *name != pivot_col_name.as_str() && !agg_input_cols.contains(name))
+        .map(col)
+        .collect();
+
+    // Build CASE expressions for each (aggregate, pivot_value) pair
+    let mut pivot_exprs = Vec::new();
+    for agg in &aggregates {
+        let Expr::AggregateFunction(agg_fn) = agg else {
+            continue;
+        };
+        let Some(agg_input) = agg_fn.params.args.first().cloned() else {
+            continue;
+        };
+
+        for (value_alias, pivot_value) in &pivot_values {
+            // CASE pivot_col WHEN pivot_value THEN agg_input END
+            let case_expr = case(col(&pivot_col_name))
+                .when(pivot_value.clone(), agg_input.clone())
+                .end()?;
+
+            // Wrap in aggregate function
+            let pivoted = agg_fn.func.call(vec![case_expr]);
+
+            // Determine column alias
+            let value_str = value_alias
+                .clone()
+                .unwrap_or_else(|| expr_to_string(pivot_value));
+            let col_alias = if aggregates.len() > 1 {
+                format!("{}_{}", agg_fn.func.name(), value_str)
+            } else {
+                value_str
+            };
+
+            pivot_exprs.push(pivoted.alias(col_alias));
+        }
+    }
+
+    let plan = LogicalPlanBuilder::from(input)
+        .aggregate(group_by_cols, pivot_exprs)?
+        .build()?;
+
+    Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+        plan, alias,
+    ))))
+}
+
+// ============================================================================
+// UNPIVOT Implementation
+// ============================================================================
+
+/// Rewrite UNPIVOT to UNION ALL of projections.
+fn plan_unpivot(
+    ctx: &mut dyn RelationPlannerContext,
+    table: TableFactor,
+    value: &datafusion_sql::sqlparser::ast::Expr,
+    name: datafusion_sql::sqlparser::ast::Ident,
+    columns: &[datafusion_sql::sqlparser::ast::ExprWithAlias],
+    null_inclusion: Option<&NullInclusion>,
+    alias: Option<datafusion_sql::sqlparser::ast::TableAlias>,
+) -> Result<RelationPlanning> {
+    // Plan the input table
+    let input = ctx.plan(table)?;
+    let schema = input.schema();
+
+    // Output column names
+    let value_col_name = value.to_string();
+    let name_col_name = ctx.normalize_ident(name);
+
+    // Parse columns to unpivot: (source_column, label)
+    let unpivot_cols: Vec<(String, String)> = columns
+        .iter()
+        .map(|c| {
+            let label = c
+                .alias
+                .as_ref()
+                .map(|id| ctx.normalize_ident(id.clone()))
+                .unwrap_or_else(|| c.expr.to_string());
+            let expr = ctx.sql_to_expr(c.expr.clone(), schema.as_ref())?;
+            let col_name = extract_column_name(&expr)?;
+            Ok((col_name.to_string(), label))
+        })
+        .collect::<Result<_>>()?;
+
+    // Columns to preserve (not being unpivoted)
+    let keep_cols: Vec<&str> = schema
+        .fields()
+        .iter()
+        .map(|f| f.name().as_str())
+        .filter(|name| !unpivot_cols.iter().any(|(c, _)| c == *name))
+        .collect();
+
+    // Build UNION ALL: one SELECT per unpivot column
+    if unpivot_cols.is_empty() {
+        return Err(plan_datafusion_err!("UNPIVOT requires at least one column"));
+    }
+
+    let mut union_inputs: Vec<_> = unpivot_cols
+        .iter()
+        .map(|(col_name, label)| {
+            let mut projection: Vec<Expr> = keep_cols.iter().map(|c| col(*c)).collect();
+            projection.push(lit(label.clone()).alias(&name_col_name));
+            projection.push(col(col_name).alias(&value_col_name));
+
+            LogicalPlanBuilder::from(input.clone())
+                .project(projection)?
+                .build()
+        })
+        .collect::<Result<_>>()?;
+
+    // Combine with UNION ALL
+    let mut plan = union_inputs.remove(0);
+    for branch in union_inputs {
+        plan = LogicalPlanBuilder::from(plan).union(branch)?.build()?;
+    }
+
+    // Apply EXCLUDE NULLS filter (default behavior)
+    let exclude_nulls = null_inclusion.is_none()
+        || matches!(null_inclusion, Some(&NullInclusion::ExcludeNulls));
+    if exclude_nulls {
+        plan = LogicalPlanBuilder::from(plan)
+            .filter(col(&value_col_name).is_not_null())?
+            .build()?;
+    }
+
+    Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+        plan, alias,
+    ))))
+}
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+/// Extract column name from an expression.
+fn extract_column_name(expr: &Expr) -> Result<String> {
+    match expr {
+        Expr::Column(c) => Ok(c.name.clone()),
+        _ => Err(plan_datafusion_err!(
+            "Expected column reference, got {expr}"
+        )),
+    }
+}
+
+/// Convert an expression to a string for use as column alias.
+fn expr_to_string(expr: &Expr) -> String {
+    match expr {
+        Expr::Literal(ScalarValue::Utf8(Some(s)), _) => s.clone(),
+        Expr::Literal(v, _) => v.to_string(),
+        other => other.to_string(),
+    }
+}
diff --git a/datafusion-examples/examples/relation_planner/table_sample.rs b/datafusion-examples/examples/relation_planner/table_sample.rs
new file mode 100644
index 0000000000000..04e5efd9706a6
--- /dev/null
+++ b/datafusion-examples/examples/relation_planner/table_sample.rs
@@ -0,0 +1,836 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # TABLESAMPLE Example
+//!
+//! This example demonstrates implementing SQL `TABLESAMPLE` support using
+//! DataFusion's extensibility APIs.
+//!
+//! This is a working `TABLESAMPLE` implementation that can serve as a starting
+//! point for your own projects. It also works as a template for adding other
+//! custom SQL operators, covering the full pipeline from parsing to execution.
+//!
+//! It shows how to:
+//!
+//! 1. **Parse** TABLESAMPLE syntax via a custom [`RelationPlanner`]
+//! 2. **Plan** sampling as a custom logical node ([`TableSamplePlanNode`])
+//! 3. **Execute** sampling via a custom physical operator ([`SampleExec`])
+//!
+//! ## Supported Syntax
+//!
+//! ```sql
+//! -- Bernoulli sampling (each row has N% chance of selection)
+//! SELECT * FROM table TABLESAMPLE BERNOULLI(10 PERCENT)
+//!
+//! -- Fractional sampling (0.0 to 1.0)
+//! SELECT * FROM table TABLESAMPLE (0.1)
+//!
+//! -- Row count limit
+//! SELECT * FROM table TABLESAMPLE (100 ROWS)
+//!
+//! -- Reproducible sampling with a seed
+//! SELECT * FROM table TABLESAMPLE (10 PERCENT) REPEATABLE(42)
+//! ```
+//!
+//! ## Architecture
+//!
+//! ```text
+//! ┌─────────────────────────────────────────────────────────────────┐
+//! │                         SQL Query                               │
+//! │  SELECT * FROM t TABLESAMPLE BERNOULLI(10 PERCENT) REPEATABLE(1)│
+//! └─────────────────────────────────────────────────────────────────┘
+//!                                │
+//!                                ▼
+//! ┌─────────────────────────────────────────────────────────────────┐
+//! │                    TableSamplePlanner                           │
+//! │    (RelationPlanner: parses TABLESAMPLE, creates logical node)  │
+//! └─────────────────────────────────────────────────────────────────┘
+//!                                │
+//!                                ▼
+//! ┌─────────────────────────────────────────────────────────────────┐
+//! │                   TableSamplePlanNode                           │
+//! │         (UserDefinedLogicalNode: stores sampling params)        │
+//! └─────────────────────────────────────────────────────────────────┘
+//!                                │
+//!                                ▼
+//! ┌─────────────────────────────────────────────────────────────────┐
+//! │                TableSampleExtensionPlanner                      │
+//! │       (ExtensionPlanner: creates physical execution plan)       │
+//! └─────────────────────────────────────────────────────────────────┘
+//!                                │
+//!                                ▼
+//! ┌─────────────────────────────────────────────────────────────────┐
+//! │                        SampleExec                               │
+//! │    (ExecutionPlan: performs actual row sampling at runtime)     │
+//! └─────────────────────────────────────────────────────────────────┘
+//! ```
+
+use std::{
+    any::Any,
+    fmt::{self, Debug, Formatter},
+    hash::{Hash, Hasher},
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};
+
+use arrow::datatypes::{Float64Type, Int64Type};
+use arrow::{
+    array::{ArrayRef, Int32Array, RecordBatch, StringArray, UInt32Array},
+    compute,
+};
+use arrow_schema::SchemaRef;
+use futures::{
+    ready,
+    stream::{Stream, StreamExt},
+};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+use tonic::async_trait;
+
+use datafusion::optimizer::simplify_expressions::simplify_literal::parse_literal;
+use datafusion::{
+    execution::{
+        RecordBatchStream, SendableRecordBatchStream, SessionState, SessionStateBuilder,
+        TaskContext, context::QueryPlanner,
+    },
+    physical_expr::EquivalenceProperties,
+    physical_plan::{
+        DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+        metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput},
+    },
+    physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
+    prelude::*,
+};
+use datafusion_common::{
+    DFSchemaRef, DataFusionError, Result, Statistics, internal_err, not_impl_err,
+    plan_datafusion_err, plan_err, tree_node::TreeNodeRecursion,
+};
+use datafusion_expr::{
+    UserDefinedLogicalNode, UserDefinedLogicalNodeCore,
+    logical_plan::{Extension, LogicalPlan, LogicalPlanBuilder},
+    planner::{
+        PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+    },
+};
+use datafusion_sql::sqlparser::ast::{
+    self, TableFactor, TableSampleMethod, TableSampleUnit,
+};
+use insta::assert_snapshot;
+
+// ============================================================================
+// Example Entry Point
+// ============================================================================
+
+/// Runs the TABLESAMPLE examples demonstrating various sampling techniques.
+pub async fn table_sample() -> Result<()> {
+    // Build session with custom query planner for physical planning
+    let state = SessionStateBuilder::new()
+        .with_default_features()
+        .with_query_planner(Arc::new(TableSampleQueryPlanner))
+        .build();
+
+    let ctx = SessionContext::new_with_state(state);
+
+    // Register custom relation planner for logical planning
+    ctx.register_relation_planner(Arc::new(TableSamplePlanner))?;
+    register_sample_data(&ctx)?;
+
+    println!("TABLESAMPLE Example");
+    println!("===================\n");
+
+    run_examples(&ctx).await
+}
+
+async fn run_examples(ctx: &SessionContext) -> Result<()> {
+    // Example 1: Baseline - full table scan
+    let results = run_example(
+        ctx,
+        "Example 1: Full table (baseline)",
+        "SELECT * FROM sample_data",
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+
+    | column1 | column2 |
+    +---------+---------+
+    | 1       | row_1   |
+    | 2       | row_2   |
+    | 3       | row_3   |
+    | 4       | row_4   |
+    | 5       | row_5   |
+    | 6       | row_6   |
+    | 7       | row_7   |
+    | 8       | row_8   |
+    | 9       | row_9   |
+    | 10      | row_10  |
+    +---------+---------+
+    ");
+
+    // Example 2: Percentage-based Bernoulli sampling
+    // REPEATABLE(seed) ensures deterministic results for snapshot testing
+    let results = run_example(
+        ctx,
+        "Example 2: BERNOULLI percentage sampling",
+        "SELECT * FROM sample_data TABLESAMPLE BERNOULLI(30 PERCENT) REPEATABLE(123)",
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+
+    | column1 | column2 |
+    +---------+---------+
+    | 1       | row_1   |
+    | 2       | row_2   |
+    | 7       | row_7   |
+    | 8       | row_8   |
+    +---------+---------+
+    ");
+
+    // Example 3: Fractional sampling (0.0 to 1.0)
+    // REPEATABLE(seed) ensures deterministic results for snapshot testing
+    let results = run_example(
+        ctx,
+        "Example 3: Fractional sampling",
+        "SELECT * FROM sample_data TABLESAMPLE (0.5) REPEATABLE(456)",
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+
+    | column1 | column2 |
+    +---------+---------+
+    | 2       | row_2   |
+    | 4       | row_4   |
+    | 8       | row_8   |
+    +---------+---------+
+    ");
+
+    // Example 4: Row count limit (deterministic, no seed needed)
+    let results = run_example(
+        ctx,
+        "Example 4: Row count limit",
+        "SELECT * FROM sample_data TABLESAMPLE (3 ROWS)",
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+
+    | column1 | column2 |
+    +---------+---------+
+    | 1       | row_1   |
+    | 2       | row_2   |
+    | 3       | row_3   |
+    +---------+---------+
+    ");
+
+    // Example 5: Sampling combined with filtering
+    let results = run_example(
+        ctx,
+        "Example 5: Sampling with WHERE clause",
+        "SELECT * FROM sample_data TABLESAMPLE (5 ROWS) WHERE column1 > 2",
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+
+    | column1 | column2 |
+    +---------+---------+
+    | 3       | row_3   |
+    | 4       | row_4   |
+    | 5       | row_5   |
+    +---------+---------+
+    ");
+
+    // Example 6: Sampling in JOIN queries
+    // REPEATABLE(seed) ensures deterministic results for snapshot testing
+    let results = run_example(
+        ctx,
+        "Example 6: Sampling in JOINs",
+        r#"SELECT t1.column1, t2.column1, t1.column2, t2.column2
+           FROM sample_data t1 TABLESAMPLE (0.7) REPEATABLE(789)
+           JOIN sample_data t2 TABLESAMPLE (0.7) REPEATABLE(123)
+           ON t1.column1 = t2.column1"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +---------+---------+---------+---------+
+    | column1 | column1 | column2 | column2 |
+    +---------+---------+---------+---------+
+    | 2       | 2       | row_2   | row_2   |
+    | 5       | 5       | row_5   | row_5   |
+    | 7       | 7       | row_7   | row_7   |
+    | 8       | 8       | row_8   | row_8   |
+    | 10      | 10      | row_10  | row_10  |
+    +---------+---------+---------+---------+
+    ");
+
+    Ok(())
+}
+
+/// Helper to run a single example query and capture results.
+async fn run_example(ctx: &SessionContext, title: &str, sql: &str) -> Result<String> {
+    println!("{title}:\n{sql}\n");
+    let df = ctx.sql(sql).await?;
+    println!("{}\n", df.logical_plan().display_indent());
+
+    let batches = df.collect().await?;
+    let results = arrow::util::pretty::pretty_format_batches(&batches)?.to_string();
+    println!("{results}\n");
+
+    Ok(results)
+}
+
+/// Register test data: 10 rows with column1=1..10 and column2="row_1".."row_10"
+fn register_sample_data(ctx: &SessionContext) -> Result<()> {
+    let column1: ArrayRef = Arc::new(Int32Array::from((1..=10).collect::<Vec<i32>>()));
+    let column2: ArrayRef = Arc::new(StringArray::from(
+        (1..=10).map(|i| format!("row_{i}")).collect::<Vec<_>>(),
+    ));
+    let batch =
+        RecordBatch::try_from_iter(vec![("column1", column1), ("column2", column2)])?;
+    ctx.register_batch("sample_data", batch)?;
+    Ok(())
+}
+
+// ============================================================================
+// Logical Planning: TableSamplePlanner + TableSamplePlanNode
+// ============================================================================
+
+/// Relation planner that intercepts `TABLESAMPLE` clauses in SQL and creates
+/// [`TableSamplePlanNode`] logical nodes.
+#[derive(Debug)]
+struct TableSamplePlanner;
+
+impl RelationPlanner for TableSamplePlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        // Only handle Table relations with TABLESAMPLE clause
+        let TableFactor::Table {
+            sample: Some(sample),
+            alias,
+            name,
+            args,
+            with_hints,
+            version,
+            with_ordinality,
+            partitions,
+            json_path,
+            index_hints,
+        } = relation
+        else {
+            return Ok(RelationPlanning::Original(Box::new(relation)));
+        };
+
+        // Extract sample spec (handles both before/after alias positions)
+        let sample = match sample {
+            ast::TableSampleKind::BeforeTableAlias(s)
+            | ast::TableSampleKind::AfterTableAlias(s) => s,
+        };
+
+        // Validate sampling method
+        if let Some(method) = &sample.name
+            && *method != TableSampleMethod::Bernoulli
+            && *method != TableSampleMethod::Row
+        {
+            return not_impl_err!(
+                "Sampling method {} is not supported (only BERNOULLI and ROW)",
+                method
+            );
+        }
+
+        // Offset sampling (ClickHouse-style) not supported
+        if sample.offset.is_some() {
+            return not_impl_err!(
+                "TABLESAMPLE with OFFSET is not supported (requires total row count)"
+            );
+        }
+
+        // Parse optional REPEATABLE seed
+        let seed = sample
+            .seed
+            .map(|s| {
+                s.value.to_string().parse::<u64>().map_err(|_| {
+                    plan_datafusion_err!("REPEATABLE seed must be an integer")
+                })
+            })
+            .transpose()?;
+
+        // Plan the underlying table without the sample clause
+        let base_relation = TableFactor::Table {
+            sample: None,
+            alias: alias.clone(),
+            name,
+            args,
+            with_hints,
+            version,
+            with_ordinality,
+            partitions,
+            json_path,
+            index_hints,
+        };
+        let input = context.plan(base_relation)?;
+
+        // Handle bucket sampling (Hive-style: TABLESAMPLE(BUCKET x OUT OF y))
+        if let Some(bucket) = sample.bucket {
+            if bucket.on.is_some() {
+                return not_impl_err!(
+                    "TABLESAMPLE BUCKET with ON clause requires CLUSTERED BY table"
+                );
+            }
+            let bucket_num: u64 =
+                bucket.bucket.to_string().parse().map_err(|_| {
+                    plan_datafusion_err!("bucket number must be an integer")
+                })?;
+            let total: u64 =
+                bucket.total.to_string().parse().map_err(|_| {
+                    plan_datafusion_err!("bucket total must be an integer")
+                })?;
+
+            let fraction = bucket_num as f64 / total as f64;
+            let plan = TableSamplePlanNode::new(input, fraction, seed).into_plan();
+            return Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                plan, alias,
+            ))));
+        }
+
+        // Handle quantity-based sampling
+        let Some(quantity) = sample.quantity else {
+            return plan_err!(
+                "TABLESAMPLE requires a quantity (percentage, fraction, or row count)"
+            );
+        };
+        let quantity_value_expr = context.sql_to_expr(quantity.value, input.schema())?;
+
+        match quantity.unit {
+            // TABLESAMPLE (N ROWS) - exact row limit
+            Some(TableSampleUnit::Rows) => {
+                let rows: i64 = parse_literal::<Int64Type>(&quantity_value_expr)?;
+                if rows < 0 {
+                    return plan_err!("row count must be non-negative, got {}", rows);
+                }
+                let plan = LogicalPlanBuilder::from(input)
+                    .limit(0, Some(rows as usize))?
+                    .build()?;
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+
+            // TABLESAMPLE (N PERCENT) - percentage sampling
+            Some(TableSampleUnit::Percent) => {
+                let percent: f64 = parse_literal::<Float64Type>(&quantity_value_expr)?;
+                let fraction = percent / 100.0;
+                let plan = TableSamplePlanNode::new(input, fraction, seed).into_plan();
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+
+            // TABLESAMPLE (N) - fraction if <1.0, row limit if >=1.0
+            None => {
+                let value = parse_literal::<Float64Type>(&quantity_value_expr)?;
+                if value < 0.0 {
+                    return plan_err!("sample value must be non-negative, got {}", value);
+                }
+                let plan = if value >= 1.0 {
+                    // Interpret as row limit
+                    LogicalPlanBuilder::from(input)
+                        .limit(0, Some(value as usize))?
+                        .build()?
+                } else {
+                    // Interpret as fraction
+                    TableSamplePlanNode::new(input, value, seed).into_plan()
+                };
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+        }
+    }
+}
+
+/// Custom logical plan node representing a TABLESAMPLE operation.
+///
+/// Stores sampling parameters (bounds, seed) and wraps the input plan.
+/// Gets converted to [`SampleExec`] during physical planning.
+#[derive(Debug, Clone, Hash, Eq, PartialEq, PartialOrd)]
+struct TableSamplePlanNode {
+    input: LogicalPlan,
+    lower_bound: HashableF64,
+    upper_bound: HashableF64,
+    seed: u64,
+}
+
+impl TableSamplePlanNode {
+    /// Create a new sampling node with the given fraction (0.0 to 1.0).
+    fn new(input: LogicalPlan, fraction: f64, seed: Option<u64>) -> Self {
+        Self {
+            input,
+            lower_bound: HashableF64(0.0),
+            upper_bound: HashableF64(fraction),
+            seed: seed.unwrap_or_else(rand::random),
+        }
+    }
+
+    /// Wrap this node in a LogicalPlan::Extension.
+    fn into_plan(self) -> LogicalPlan {
+        LogicalPlan::Extension(Extension {
+            node: Arc::new(self),
+        })
+    }
+}
+
+impl UserDefinedLogicalNodeCore for TableSamplePlanNode {
+    fn name(&self) -> &str {
+        "TableSample"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        self.input.schema()
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+
+    fn fmt_for_explain(&self, f: &mut Formatter) -> fmt::Result {
+        write!(
+            f,
+            "Sample: bounds=[{}, {}], seed={}",
+            self.lower_bound.0, self.upper_bound.0, self.seed
+        )
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        mut inputs: Vec<LogicalPlan>,
+    ) -> Result<Self> {
+        Ok(Self {
+            input: inputs.swap_remove(0),
+            lower_bound: self.lower_bound,
+            upper_bound: self.upper_bound,
+            seed: self.seed,
+        })
+    }
+}
+
+/// Wrapper for f64 that implements Hash and Eq (required for LogicalPlan).
+#[derive(Debug, Clone, Copy, PartialOrd)]
+struct HashableF64(f64);
+
+impl PartialEq for HashableF64 {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.to_bits() == other.0.to_bits()
+    }
+}
+
+impl Eq for HashableF64 {}
+
+impl Hash for HashableF64 {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.to_bits().hash(state);
+    }
+}
+
+// ============================================================================
+// Physical Planning: TableSampleQueryPlanner + TableSampleExtensionPlanner
+// ============================================================================
+
+/// Custom query planner that registers [`TableSampleExtensionPlanner`] to
+/// convert [`TableSamplePlanNode`] into [`SampleExec`].
+#[derive(Debug)]
+struct TableSampleQueryPlanner;
+
+#[async_trait]
+impl QueryPlanner for TableSampleQueryPlanner {
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session_state: &SessionState,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let planner = DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(
+            TableSampleExtensionPlanner,
+        )]);
+        planner
+            .create_physical_plan(logical_plan, session_state)
+            .await
+    }
+}
+
+/// Extension planner that converts [`TableSamplePlanNode`] to [`SampleExec`].
+struct TableSampleExtensionPlanner;
+
+#[async_trait]
+impl ExtensionPlanner for TableSampleExtensionPlanner {
+    async fn plan_extension(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        node: &dyn UserDefinedLogicalNode,
+        _logical_inputs: &[&LogicalPlan],
+        physical_inputs: &[Arc<dyn ExecutionPlan>],
+        _session_state: &SessionState,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        let Some(sample_node) = node.as_any().downcast_ref::<TableSamplePlanNode>()
+        else {
+            return Ok(None);
+        };
+
+        let exec = SampleExec::try_new(
+            Arc::clone(&physical_inputs[0]),
+            sample_node.lower_bound.0,
+            sample_node.upper_bound.0,
+            sample_node.seed,
+        )?;
+        Ok(Some(Arc::new(exec)))
+    }
+}
+
+// ============================================================================
+// Physical Execution: SampleExec + BernoulliSampler
+// ============================================================================
+
+/// Physical execution plan that samples rows from its input using Bernoulli sampling.
+///
+/// Each row is independently selected with probability `(upper_bound - lower_bound)`
+/// and appears at most once.
+#[derive(Debug, Clone)]
+pub struct SampleExec {
+    input: Arc<dyn ExecutionPlan>,
+    lower_bound: f64,
+    upper_bound: f64,
+    seed: u64,
+    metrics: ExecutionPlanMetricsSet,
+    cache: Arc<PlanProperties>,
+}
+
+impl SampleExec {
+    /// Create a new SampleExec with Bernoulli sampling (without replacement).
+    ///
+    /// # Arguments
+    /// * `input` - The input execution plan
+    /// * `lower_bound` - Lower bound of sampling range (typically 0.0)
+    /// * `upper_bound` - Upper bound of sampling range (0.0 to 1.0)
+    /// * `seed` - Random seed for reproducible sampling
+    pub fn try_new(
+        input: Arc<dyn ExecutionPlan>,
+        lower_bound: f64,
+        upper_bound: f64,
+        seed: u64,
+    ) -> Result<Self> {
+        if lower_bound < 0.0 || upper_bound > 1.0 || lower_bound > upper_bound {
+            return internal_err!(
+                "Sampling bounds must satisfy 0.0 <= lower <= upper <= 1.0, got [{}, {}]",
+                lower_bound,
+                upper_bound
+            );
+        }
+
+        let cache = PlanProperties::new(
+            EquivalenceProperties::new(input.schema()),
+            input.properties().partitioning.clone(),
+            input.properties().emission_type,
+            input.properties().boundedness,
+        );
+
+        Ok(Self {
+            input,
+            lower_bound,
+            upper_bound,
+            seed,
+            metrics: ExecutionPlanMetricsSet::new(),
+            cache: Arc::new(cache),
+        })
+    }
+
+    /// Create a sampler for the given partition.
+    fn create_sampler(&self, partition: usize) -> BernoulliSampler {
+        let seed = self.seed.wrapping_add(partition as u64);
+        BernoulliSampler::new(self.lower_bound, self.upper_bound, seed)
+    }
+}
+
+impl DisplayAs for SampleExec {
+    fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
+        write!(
+            f,
+            "SampleExec: bounds=[{}, {}], seed={}",
+            self.lower_bound, self.upper_bound, self.seed
+        )
+    }
+}
+
+impl ExecutionPlan for SampleExec {
+    fn name(&self) -> &'static str {
+        "SampleExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        // Sampling preserves row order (rows are filtered, not reordered)
+        vec![true]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self::try_new(
+            children.swap_remove(0),
+            self.lower_bound,
+            self.upper_bound,
+            self.seed,
+        )?))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        Ok(Box::pin(SampleStream {
+            input: self.input.execute(partition, context)?,
+            sampler: self.create_sampler(partition),
+            metrics: BaselineMetrics::new(&self.metrics, partition),
+        }))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let mut stats = Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        let ratio = self.upper_bound - self.lower_bound;
+
+        // Scale statistics by sampling ratio (inexact due to randomness)
+        stats.num_rows = stats
+            .num_rows
+            .map(|n| (n as f64 * ratio) as usize)
+            .to_inexact();
+        stats.total_byte_size = stats
+            .total_byte_size
+            .map(|n| (n as f64 * ratio) as usize)
+            .to_inexact();
+
+        Ok(Arc::new(stats))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+}
+
+/// Bernoulli sampler: includes each row with probability `(upper - lower)`.
+/// This is sampling **without replacement** - each row appears at most once.
+struct BernoulliSampler {
+    lower_bound: f64,
+    upper_bound: f64,
+    rng: StdRng,
+}
+
+impl BernoulliSampler {
+    fn new(lower_bound: f64, upper_bound: f64, seed: u64) -> Self {
+        Self {
+            lower_bound,
+            upper_bound,
+            rng: StdRng::seed_from_u64(seed),
+        }
+    }
+
+    fn sample(&mut self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let range = self.upper_bound - self.lower_bound;
+        if range <= 0.0 {
+            return Ok(RecordBatch::new_empty(batch.schema()));
+        }
+
+        // Select rows where random value falls in [lower, upper)
+        let indices: Vec<u32> = (0..batch.num_rows())
+            .filter(|_| {
+                let r: f64 = self.rng.random();
+                r >= self.lower_bound && r < self.upper_bound
+            })
+            .map(|i| i as u32)
+            .collect();
+
+        if indices.is_empty() {
+            return Ok(RecordBatch::new_empty(batch.schema()));
+        }
+
+        compute::take_record_batch(batch, &UInt32Array::from(indices))
+            .map_err(DataFusionError::from)
+    }
+}
+
+/// Stream adapter that applies sampling to each batch.
+struct SampleStream {
+    input: SendableRecordBatchStream,
+    sampler: BernoulliSampler,
+    metrics: BaselineMetrics,
+}
+
+impl Stream for SampleStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        match ready!(self.input.poll_next_unpin(cx)) {
+            Some(Ok(batch)) => {
+                let elapsed = self.metrics.elapsed_compute().clone();
+                let _timer = elapsed.timer();
+                let result = self.sampler.sample(&batch);
+                Poll::Ready(Some(result.record_output(&self.metrics)))
+            }
+            Some(Err(e)) => Poll::Ready(Some(Err(e))),
+            None => Poll::Ready(None),
+        }
+    }
+}
+
+impl RecordBatchStream for SampleStream {
+    fn schema(&self) -> SchemaRef {
+        self.input.schema()
+    }
+}
diff --git a/datafusion-examples/examples/sql_dialect.rs b/datafusion-examples/examples/sql_dialect.rs
deleted file mode 100644
index 20b515506f3b4..0000000000000
--- a/datafusion-examples/examples/sql_dialect.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::fmt::Display;
-
-use datafusion::error::{DataFusionError, Result};
-use datafusion::sql::{
-    parser::{CopyToSource, CopyToStatement, DFParser, DFParserBuilder, Statement},
-    sqlparser::{keywords::Keyword, tokenizer::Token},
-};
-
-/// This example demonstrates how to use the DFParser to parse a statement in a custom way
-///
-/// This technique can be used to implement a custom SQL dialect, for example.
-#[tokio::main]
-async fn main() -> Result<()> {
-    let mut my_parser =
-        MyParser::new("COPY source_table TO 'file.fasta' STORED AS FASTA")?;
-
-    let my_statement = my_parser.parse_statement()?;
-
-    match my_statement {
-        MyStatement::DFStatement(s) => println!("df: {s}"),
-        MyStatement::MyCopyTo(s) => println!("my_copy: {s}"),
-    }
-
-    Ok(())
-}
-
-/// Here we define a Parser for our new SQL dialect that wraps the existing `DFParser`
-struct MyParser<'a> {
-    df_parser: DFParser<'a>,
-}
-
-impl<'a> MyParser<'a> {
-    fn new(sql: &'a str) -> Result<Self> {
-        let df_parser = DFParserBuilder::new(sql).build()?;
-        Ok(Self { df_parser })
-    }
-
-    /// Returns true if the next token is `COPY` keyword, false otherwise
-    fn is_copy(&self) -> bool {
-        matches!(
-            self.df_parser.parser.peek_token().token,
-            Token::Word(w) if w.keyword == Keyword::COPY
-        )
-    }
-
-    /// This is the entry point to our parser -- it handles `COPY` statements specially
-    /// but otherwise delegates to the existing DataFusion parser.
-    pub fn parse_statement(&mut self) -> Result<MyStatement, DataFusionError> {
-        if self.is_copy() {
-            self.df_parser.parser.next_token(); // COPY
-            let df_statement = self.df_parser.parse_copy()?;
-
-            if let Statement::CopyTo(s) = df_statement {
-                Ok(MyStatement::from(s))
-            } else {
-                Ok(MyStatement::DFStatement(Box::from(df_statement)))
-            }
-        } else {
-            let df_statement = self.df_parser.parse_statement()?;
-            Ok(MyStatement::from(df_statement))
-        }
-    }
-}
-
-enum MyStatement {
-    DFStatement(Box<Statement>),
-    MyCopyTo(MyCopyToStatement),
-}
-
-impl Display for MyStatement {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            MyStatement::DFStatement(s) => write!(f, "{s}"),
-            MyStatement::MyCopyTo(s) => write!(f, "{s}"),
-        }
-    }
-}
-
-impl From<Statement> for MyStatement {
-    fn from(s: Statement) -> Self {
-        Self::DFStatement(Box::from(s))
-    }
-}
-
-impl From<CopyToStatement> for MyStatement {
-    fn from(s: CopyToStatement) -> Self {
-        if s.stored_as == Some("FASTA".to_string()) {
-            Self::MyCopyTo(MyCopyToStatement::from(s))
-        } else {
-            Self::DFStatement(Box::from(Statement::CopyTo(s)))
-        }
-    }
-}
-
-struct MyCopyToStatement {
-    pub source: CopyToSource,
-    pub target: String,
-}
-
-impl From<CopyToStatement> for MyCopyToStatement {
-    fn from(s: CopyToStatement) -> Self {
-        Self {
-            source: s.source,
-            target: s.target,
-        }
-    }
-}
-
-impl Display for MyCopyToStatement {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "COPY {} TO '{}' STORED AS FASTA",
-            self.source, self.target
-        )
-    }
-}
diff --git a/datafusion-examples/examples/sql_analysis.rs b/datafusion-examples/examples/sql_ops/analysis.rs
similarity index 96%
rename from datafusion-examples/examples/sql_analysis.rs
rename to datafusion-examples/examples/sql_ops/analysis.rs
index d3826026a9725..4243a2927865b 100644
--- a/datafusion-examples/examples/sql_analysis.rs
+++ b/datafusion-examples/examples/sql_ops/analysis.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 //! This example shows how to use the structures that DataFusion provides to perform
 //! Analysis on SQL queries and their plans.
 //!
@@ -23,8 +25,8 @@
 
 use std::sync::Arc;
 
-use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion::common::Result;
+use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion::logical_expr::LogicalPlan;
 use datafusion::{
     datasource::MemTable,
@@ -32,141 +34,9 @@ use datafusion::{
 };
 use test_utils::tpcds::tpcds_schemas;
 
-/// Counts the total number of joins in a plan
-fn total_join_count(plan: &LogicalPlan) -> usize {
-    let mut total = 0;
-
-    // We can use the TreeNode API to walk over a LogicalPlan.
-    plan.apply(|node| {
-        // if we encounter a join we update the running count
-        if matches!(node, LogicalPlan::Join(_)) {
-            total += 1;
-        }
-        Ok(TreeNodeRecursion::Continue)
-    })
-    .unwrap();
-
-    total
-}
-
-/// Counts the total number of joins in a plan and collects every join tree in
-/// the plan with their respective join count.
-///
-/// Join Tree Definition: the largest subtree consisting entirely of joins
-///
-/// For example, this plan:
-///
-/// ```text
-///         JOIN
-///         /  \
-///       A   JOIN
-///            /  \
-///           B    C
-/// ```
-///
-/// has a single join tree `(A-B-C)` which will result in `(2, [2])`
-///
-/// This plan:
-///
-/// ```text
-///         JOIN
-///         /  \
-///       A   GROUP
-///              |
-///             JOIN
-///             /  \
-///            B    C
-/// ```
-///
-/// Has two join trees `(A-, B-C)` which will result in `(2, [1, 1])`
-fn count_trees(plan: &LogicalPlan) -> (usize, Vec<usize>) {
-    // this works the same way as `total_count`, but now when we encounter a Join
-    // we try to collect it's entire tree
-    let mut to_visit = vec![plan];
-    let mut total = 0;
-    let mut groups = vec![];
-
-    while let Some(node) = to_visit.pop() {
-        // if we encounter a join, we know were at the root of the tree
-        // count this tree and recurse on it's inputs
-        if matches!(node, LogicalPlan::Join(_)) {
-            let (group_count, inputs) = count_tree(node);
-            total += group_count;
-            groups.push(group_count);
-            to_visit.extend(inputs);
-        } else {
-            to_visit.extend(node.inputs());
-        }
-    }
-
-    (total, groups)
-}
-
-/// Count the entire join tree and return its inputs using TreeNode API
-///
-/// For example, if this function receives following plan:
-///
-/// ```text
-///         JOIN
-///         /  \
-///       A   GROUP
-///              |
-///             JOIN
-///             /  \
-///            B    C
-/// ```
-///
-/// It will return `(1, [A, GROUP])`
-fn count_tree(join: &LogicalPlan) -> (usize, Vec<&LogicalPlan>) {
-    let mut inputs = Vec::new();
-    let mut total = 0;
-
-    join.apply(|node| {
-        // Some extra knowledge:
-        //
-        // optimized plans have their projections pushed down as far as
-        // possible, which sometimes results in a projection going in between 2
-        // subsequent joins giving the illusion these joins are not "related",
-        // when in fact they are.
-        //
-        // This plan:
-        //   JOIN
-        //   /  \
-        // A   PROJECTION
-        //        |
-        //       JOIN
-        //       /  \
-        //      B    C
-        //
-        // is the same as:
-        //
-        //   JOIN
-        //   /  \
-        // A   JOIN
-        //     /  \
-        //    B    C
-        // we can continue the recursion in this case
-        if let LogicalPlan::Projection(_) = node {
-            return Ok(TreeNodeRecursion::Continue);
-        }
-
-        // any join we count
-        if matches!(node, LogicalPlan::Join(_)) {
-            total += 1;
-            Ok(TreeNodeRecursion::Continue)
-        } else {
-            inputs.push(node);
-            // skip children of input node
-            Ok(TreeNodeRecursion::Jump)
-        }
-    })
-    .unwrap();
-
-    (total, inputs)
-}
-
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Demonstrates how to analyze a SQL query by counting JOINs and identifying
+/// join-trees using DataFusion’s `LogicalPlan` and `TreeNode` API.
+pub async fn analysis() -> Result<()> {
     // To show how we can count the joins in a sql query we'll be using query 88
     // from the TPC-DS benchmark.
     //
@@ -274,7 +144,10 @@ from
     for table in tables {
         ctx.register_table(
             table.name,
-            Arc::new(MemTable::try_new(Arc::new(table.schema.clone()), vec![])?),
+            Arc::new(MemTable::try_new(
+                Arc::new(table.schema.clone()),
+                vec![vec![]],
+            )?),
         )?;
     }
     // We can create a LogicalPlan from a SQL query like this
@@ -307,3 +180,136 @@ from
 
     Ok(())
 }
+
+/// Counts the total number of joins in a plan
+fn total_join_count(plan: &LogicalPlan) -> usize {
+    let mut total = 0;
+
+    // We can use the TreeNode API to walk over a LogicalPlan.
+    plan.apply(|node| {
+        // if we encounter a join we update the running count
+        if matches!(node, LogicalPlan::Join(_)) {
+            total += 1;
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .unwrap();
+
+    total
+}
+
+/// Counts the total number of joins in a plan and collects every join tree in
+/// the plan with their respective join count.
+///
+/// Join Tree Definition: the largest subtree consisting entirely of joins
+///
+/// For example, this plan:
+///
+/// ```text
+///         JOIN
+///         /  \
+///       A   JOIN
+///            /  \
+///           B    C
+/// ```
+///
+/// has a single join tree `(A-B-C)` which will result in `(2, [2])`
+///
+/// This plan:
+///
+/// ```text
+///         JOIN
+///         /  \
+///       A   GROUP
+///              |
+///             JOIN
+///             /  \
+///            B    C
+/// ```
+///
+/// Has two join trees `(A-, B-C)` which will result in `(2, [1, 1])`
+fn count_trees(plan: &LogicalPlan) -> (usize, Vec<usize>) {
+    // this works the same way as `total_count`, but now when we encounter a Join
+    // we try to collect it's entire tree
+    let mut to_visit = vec![plan];
+    let mut total = 0;
+    let mut groups = vec![];
+
+    while let Some(node) = to_visit.pop() {
+        // if we encounter a join, we know were at the root of the tree
+        // count this tree and recurse on it's inputs
+        if matches!(node, LogicalPlan::Join(_)) {
+            let (group_count, inputs) = count_tree(node);
+            total += group_count;
+            groups.push(group_count);
+            to_visit.extend(inputs);
+        } else {
+            to_visit.extend(node.inputs());
+        }
+    }
+
+    (total, groups)
+}
+
+/// Count the entire join tree and return its inputs using TreeNode API
+///
+/// For example, if this function receives following plan:
+///
+/// ```text
+///         JOIN
+///         /  \
+///       A   GROUP
+///              |
+///             JOIN
+///             /  \
+///            B    C
+/// ```
+///
+/// It will return `(1, [A, GROUP])`
+fn count_tree(join: &LogicalPlan) -> (usize, Vec<&LogicalPlan>) {
+    let mut inputs = Vec::new();
+    let mut total = 0;
+
+    join.apply(|node| {
+        // Some extra knowledge:
+        //
+        // optimized plans have their projections pushed down as far as
+        // possible, which sometimes results in a projection going in between 2
+        // subsequent joins giving the illusion these joins are not "related",
+        // when in fact they are.
+        //
+        // This plan:
+        //   JOIN
+        //   /  \
+        // A   PROJECTION
+        //        |
+        //       JOIN
+        //       /  \
+        //      B    C
+        //
+        // is the same as:
+        //
+        //   JOIN
+        //   /  \
+        // A   JOIN
+        //     /  \
+        //    B    C
+        // we can continue the recursion in this case
+        if let LogicalPlan::Projection(_) = node {
+            return Ok(TreeNodeRecursion::Continue);
+        }
+
+        // any join we count
+        if matches!(node, LogicalPlan::Join(_)) {
+            total += 1;
+            Ok(TreeNodeRecursion::Continue)
+        } else {
+            inputs.push(node);
+            // skip children of input node
+            Ok(TreeNodeRecursion::Jump)
+        }
+    })
+    .unwrap();
+
+    (total, inputs)
+}
diff --git a/datafusion-examples/examples/sql_ops/custom_sql_parser.rs b/datafusion-examples/examples/sql_ops/custom_sql_parser.rs
new file mode 100644
index 0000000000000..308a0de62a242
--- /dev/null
+++ b/datafusion-examples/examples/sql_ops/custom_sql_parser.rs
@@ -0,0 +1,420 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This example demonstrates extending the DataFusion SQL parser to support
+//! custom DDL statements, specifically `CREATE EXTERNAL CATALOG`.
+//!
+//! ### Custom Syntax
+//! ```sql
+//! CREATE EXTERNAL CATALOG my_catalog
+//! STORED AS ICEBERG
+//! LOCATION 's3://my-bucket/warehouse/'
+//! OPTIONS (
+//!   'region' = 'us-west-2'
+//! );
+//! ```
+//!
+//! Note: For the purpose of this example, we use `local://workspace/` to
+//! automatically discover and register files from the project's test data.
+
+use std::collections::HashMap;
+use std::fmt::Display;
+use std::sync::Arc;
+
+use datafusion::catalog::{
+    CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider,
+    TableProviderFactory,
+};
+use datafusion::datasource::listing_table_factory::ListingTableFactory;
+use datafusion::error::{DataFusionError, Result};
+use datafusion::prelude::SessionContext;
+use datafusion::sql::{
+    parser::{DFParser, DFParserBuilder, Statement},
+    sqlparser::{
+        ast::{ObjectName, Value},
+        keywords::Keyword,
+        tokenizer::Token,
+    },
+};
+use datafusion_common::{DFSchema, TableReference, plan_datafusion_err, plan_err};
+use datafusion_expr::CreateExternalTable;
+use futures::StreamExt;
+use insta::assert_snapshot;
+use object_store::ObjectStore;
+use object_store::local::LocalFileSystem;
+
+/// Entry point for the example.
+pub async fn custom_sql_parser() -> Result<()> {
+    // Use standard Parquet testing data as our "external" source.
+    let base_path = datafusion::common::test_util::parquet_test_data();
+    let base_path = std::path::Path::new(&base_path).canonicalize()?;
+
+    // Make the path relative to the workspace root
+    let workspace_root = workspace_root();
+    let location = base_path
+        .strip_prefix(&workspace_root)
+        .map(|p| p.to_string_lossy().to_string())
+        .unwrap_or_else(|_| base_path.to_string_lossy().to_string());
+
+    let create_catalog_sql = format!(
+        "CREATE EXTERNAL CATALOG parquet_testing
+         STORED AS parquet
+         LOCATION 'local://workspace/{location}'
+         OPTIONS (
+           'schema_name' = 'staged_data',
+           'format.pruning' = 'true'
+         )"
+    );
+
+    // =========================================================================
+    // Part 1: Standard DataFusion parser rejects the custom DDL
+    // =========================================================================
+    println!("=== Part 1: Standard DataFusion Parser ===\n");
+    println!("Parsing: {}\n", create_catalog_sql.trim());
+
+    let ctx_standard = SessionContext::new();
+    let err = ctx_standard
+        .sql(&create_catalog_sql)
+        .await
+        .expect_err("Expected the standard parser to reject CREATE EXTERNAL CATALOG (custom DDL syntax)");
+
+    println!("Error: {err}\n");
+    assert_snapshot!(err.to_string(), @r#"SQL error: ParserError("Expected: TABLE, found: CATALOG at Line: 1, Column: 17")"#);
+
+    // =========================================================================
+    // Part 2: Custom parser handles the statement
+    // =========================================================================
+    println!("=== Part 2: Custom Parser ===\n");
+    println!("Parsing: {}\n", create_catalog_sql.trim());
+
+    let ctx = SessionContext::new();
+
+    let mut parser = CustomParser::new(&create_catalog_sql)?;
+    let statement = parser.parse_statement()?;
+    match statement {
+        CustomStatement::CreateExternalCatalog(stmt) => {
+            handle_create_external_catalog(&ctx, stmt).await?;
+        }
+        CustomStatement::DFStatement(_) => {
+            panic!("Expected CreateExternalCatalog statement");
+        }
+    }
+
+    // Query a table from the registered catalog
+    let query_sql = "SELECT id, bool_col, tinyint_col FROM parquet_testing.staged_data.alltypes_plain LIMIT 5";
+    println!("Executing: {query_sql}\n");
+
+    let results = execute_sql(&ctx, query_sql).await?;
+    println!("{results}");
+    assert_snapshot!(results, @r"
+    +----+----------+-------------+
+    | id | bool_col | tinyint_col |
+    +----+----------+-------------+
+    | 4  | true     | 0           |
+    | 5  | false    | 1           |
+    | 6  | true     | 0           |
+    | 7  | false    | 1           |
+    | 2  | true     | 0           |
+    +----+----------+-------------+
+    ");
+
+    Ok(())
+}
+
+/// Execute SQL and return formatted results.
+async fn execute_sql(ctx: &SessionContext, sql: &str) -> Result<String> {
+    let batches = ctx.sql(sql).await?.collect().await?;
+    Ok(arrow::util::pretty::pretty_format_batches(&batches)?.to_string())
+}
+
+/// Custom handler for the `CREATE EXTERNAL CATALOG` statement.
+async fn handle_create_external_catalog(
+    ctx: &SessionContext,
+    stmt: CreateExternalCatalog,
+) -> Result<()> {
+    let factory = ListingTableFactory::new();
+    let catalog = Arc::new(MemoryCatalogProvider::new());
+    let schema = Arc::new(MemorySchemaProvider::new());
+
+    // Extract options
+    let mut schema_name = "public".to_string();
+    let mut table_options = HashMap::new();
+
+    for (k, v) in stmt.options {
+        let val_str = match v {
+            Value::SingleQuotedString(ref s) | Value::DoubleQuotedString(ref s) => {
+                s.to_string()
+            }
+            Value::Number(ref n, _) => n.to_string(),
+            Value::Boolean(b) => b.to_string(),
+            _ => v.to_string(),
+        };
+
+        if k == "schema_name" {
+            schema_name = val_str;
+        } else {
+            table_options.insert(k, val_str);
+        }
+    }
+
+    println!("  Target Catalog: {}", stmt.name);
+    println!("  Data Location: {}", stmt.location);
+    println!("  Resolved Schema: {schema_name}");
+
+    // Register a local object store rooted at the workspace root.
+    // We use a specific authority 'workspace' to ensure consistent resolution.
+    let store = Arc::new(LocalFileSystem::new_with_prefix(workspace_root())?);
+    let store_url = url::Url::parse("local://workspace").unwrap();
+    ctx.register_object_store(&store_url, Arc::clone(&store) as _);
+
+    let target_ext = format!(".{}", stmt.catalog_type.to_lowercase());
+
+    // For 'local://workspace/parquet-testing/data', the path is 'parquet-testing/data'.
+    let path_str = stmt
+        .location
+        .strip_prefix("local://workspace/")
+        .unwrap_or(&stmt.location);
+    let prefix = object_store::path::Path::from(path_str);
+
+    // Discover data files using the ObjectStore API
+    let mut table_count = 0;
+    let mut list_stream = store.list(Some(&prefix));
+
+    while let Some(meta) = list_stream.next().await {
+        let meta = meta?;
+        let path = &meta.location;
+
+        if path.as_ref().ends_with(&target_ext) {
+            let name = std::path::Path::new(path.as_ref())
+                .file_stem()
+                .unwrap()
+                .to_string_lossy()
+                .to_string();
+
+            let table_url = format!("local://workspace/{path}");
+
+            let cmd = CreateExternalTable::builder(
+                TableReference::bare(name.clone()),
+                table_url,
+                stmt.catalog_type.clone(),
+                Arc::new(DFSchema::empty()),
+            )
+            .with_options(table_options.clone())
+            .build();
+
+            match factory.create(&ctx.state(), &cmd).await {
+                Ok(table) => {
+                    schema.register_table(name, table)?;
+                    table_count += 1;
+                }
+                Err(e) => {
+                    eprintln!("Failed to create table {name}: {e}");
+                }
+            }
+        }
+    }
+    println!("  Registered {table_count} tables into schema: {schema_name}");
+
+    catalog.register_schema(&schema_name, schema)?;
+    ctx.register_catalog(stmt.name.to_string(), catalog);
+
+    Ok(())
+}
+
+/// Possible statements returned by our custom parser.
+#[derive(Debug, Clone)]
+pub enum CustomStatement {
+    /// Standard DataFusion statement
+    DFStatement(Box<Statement>),
+    /// Custom `CREATE EXTERNAL CATALOG` statement
+    CreateExternalCatalog(CreateExternalCatalog),
+}
+
+/// Data structure for `CREATE EXTERNAL CATALOG`.
+#[derive(Debug, Clone)]
+pub struct CreateExternalCatalog {
+    pub name: ObjectName,
+    pub catalog_type: String,
+    pub location: String,
+    pub options: Vec<(String, Value)>,
+}
+
+impl Display for CustomStatement {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::DFStatement(s) => write!(f, "{s}"),
+            Self::CreateExternalCatalog(s) => write!(f, "{s}"),
+        }
+    }
+}
+
+impl Display for CreateExternalCatalog {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "CREATE EXTERNAL CATALOG {} STORED AS {} LOCATION '{}'",
+            self.name, self.catalog_type, self.location
+        )?;
+        if !self.options.is_empty() {
+            write!(f, " OPTIONS (")?;
+            for (i, (k, v)) in self.options.iter().enumerate() {
+                if i > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "'{k}' = '{v}'")?;
+            }
+            write!(f, ")")?;
+        }
+        Ok(())
+    }
+}
+
+/// A parser that extends `DFParser` with custom syntax.
+struct CustomParser<'a> {
+    df_parser: DFParser<'a>,
+}
+
+impl<'a> CustomParser<'a> {
+    fn new(sql: &'a str) -> Result<Self> {
+        Ok(Self {
+            df_parser: DFParserBuilder::new(sql).build()?,
+        })
+    }
+
+    pub fn parse_statement(&mut self) -> Result<CustomStatement> {
+        if self.is_create_external_catalog() {
+            return self.parse_create_external_catalog();
+        }
+        Ok(CustomStatement::DFStatement(Box::new(
+            self.df_parser.parse_statement()?,
+        )))
+    }
+
+    fn is_create_external_catalog(&self) -> bool {
+        let t1 = &self.df_parser.parser.peek_nth_token(0).token;
+        let t2 = &self.df_parser.parser.peek_nth_token(1).token;
+        let t3 = &self.df_parser.parser.peek_nth_token(2).token;
+
+        matches!(t1, Token::Word(w) if w.keyword == Keyword::CREATE)
+            && matches!(t2, Token::Word(w) if w.keyword == Keyword::EXTERNAL)
+            && matches!(t3, Token::Word(w) if w.value.to_uppercase() == "CATALOG")
+    }
+
+    fn parse_create_external_catalog(&mut self) -> Result<CustomStatement> {
+        // Consume prefix tokens: CREATE EXTERNAL CATALOG
+        for _ in 0..3 {
+            self.df_parser.parser.next_token();
+        }
+
+        let name = self
+            .df_parser
+            .parser
+            .parse_object_name(false)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+        let mut catalog_type = None;
+        let mut location = None;
+        let mut options = vec![];
+
+        while let Some(keyword) = self.df_parser.parser.parse_one_of_keywords(&[
+            Keyword::STORED,
+            Keyword::LOCATION,
+            Keyword::OPTIONS,
+        ]) {
+            match keyword {
+                Keyword::STORED => {
+                    if catalog_type.is_some() {
+                        return plan_err!("Duplicate STORED AS");
+                    }
+                    self.df_parser
+                        .parser
+                        .expect_keyword(Keyword::AS)
+                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+                    catalog_type = Some(
+                        self.df_parser
+                            .parser
+                            .parse_identifier()
+                            .map_err(|e| DataFusionError::External(Box::new(e)))?
+                            .value,
+                    );
+                }
+                Keyword::LOCATION => {
+                    if location.is_some() {
+                        return plan_err!("Duplicate LOCATION");
+                    }
+                    location = Some(
+                        self.df_parser
+                            .parser
+                            .parse_literal_string()
+                            .map_err(|e| DataFusionError::External(Box::new(e)))?,
+                    );
+                }
+                Keyword::OPTIONS => {
+                    if !options.is_empty() {
+                        return plan_err!("Duplicate OPTIONS");
+                    }
+                    options = self.parse_value_options()?;
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        Ok(CustomStatement::CreateExternalCatalog(
+            CreateExternalCatalog {
+                name,
+                catalog_type: catalog_type
+                    .ok_or_else(|| plan_datafusion_err!("Missing STORED AS"))?,
+                location: location
+                    .ok_or_else(|| plan_datafusion_err!("Missing LOCATION"))?,
+                options,
+            },
+        ))
+    }
+
+    /// Parse options in the form: (key [=] value, key [=] value, ...)
+    fn parse_value_options(&mut self) -> Result<Vec<(String, Value)>> {
+        let mut options = vec![];
+        self.df_parser
+            .parser
+            .expect_token(&Token::LParen)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+        loop {
+            let key = self.df_parser.parse_option_key()?;
+            // Support optional '=' between key and value
+            let _ = self.df_parser.parser.consume_token(&Token::Eq);
+            let value = self.df_parser.parse_option_value()?;
+            options.push((key, value));
+
+            let comma = self.df_parser.parser.consume_token(&Token::Comma);
+            if self.df_parser.parser.consume_token(&Token::RParen) {
+                break;
+            } else if !comma {
+                return plan_err!("Expected ',' or ')' in OPTIONS");
+            }
+        }
+        Ok(options)
+    }
+}
+
+/// Returns the workspace root directory (parent of datafusion-examples).
+fn workspace_root() -> std::path::PathBuf {
+    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("CARGO_MANIFEST_DIR should have a parent")
+        .to_path_buf()
+}
diff --git a/datafusion-examples/examples/sql_frontend.rs b/datafusion-examples/examples/sql_ops/frontend.rs
similarity index 98%
rename from datafusion-examples/examples/sql_frontend.rs
rename to datafusion-examples/examples/sql_ops/frontend.rs
index 3955d5038cfb0..025fe47e75b07 100644
--- a/datafusion-examples/examples/sql_frontend.rs
+++ b/datafusion-examples/examples/sql_ops/frontend.rs
@@ -15,8 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::common::{plan_err, TableReference};
+use datafusion::common::{TableReference, plan_err};
 use datafusion::config::ConfigOptions;
 use datafusion::error::Result;
 use datafusion::logical_expr::{
@@ -44,7 +46,7 @@ use std::sync::Arc;
 ///
 /// In this example, we demonstrate how to use the lower level APIs directly,
 /// which only requires the `datafusion-sql` dependency.
-pub fn main() -> Result<()> {
+pub fn frontend() -> Result<()> {
     // First, we parse the SQL string. Note that we use the DataFusion
     // Parser, which wraps the `sqlparser-rs` SQL parser and adds DataFusion
     // specific syntax such as `CREATE EXTERNAL TABLE`
@@ -83,7 +85,7 @@ pub fn main() -> Result<()> {
     let config = OptimizerContext::default().with_skip_failing_rules(false);
     let analyzed_plan = Analyzer::new().execute_and_check(
         logical_plan,
-        config.options(),
+        &config.options(),
         observe_analyzer,
     )?;
     // Note that the Analyzer has added a CAST to the plan to align the types
diff --git a/datafusion-examples/examples/sql_ops/main.rs b/datafusion-examples/examples/sql_ops/main.rs
new file mode 100644
index 0000000000000..ce7be8fa2bada
--- /dev/null
+++ b/datafusion-examples/examples/sql_ops/main.rs
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # SQL Examples
+//!
+//! These examples demonstrate SQL operations in DataFusion.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example sql_ops -- [all|analysis|custom_sql_parser|frontend|query]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `analysis`
+//!   (file: analysis.rs, desc: Analyze SQL queries)
+//!
+//! - `custom_sql_parser`
+//!   (file: custom_sql_parser.rs, desc: Implement a custom SQL parser to extend DataFusion)
+//!
+//! - `frontend`
+//!   (file: frontend.rs, desc: Build LogicalPlans from SQL)
+//!
+//! - `query`  
+//!   (file: query.rs, desc: Query data using SQL)
+
+mod analysis;
+mod custom_sql_parser;
+mod frontend;
+mod query;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    Analysis,
+    CustomSqlParser,
+    Frontend,
+    Query,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "sql_ops";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::Analysis => analysis::analysis().await?,
+            ExampleKind::CustomSqlParser => {
+                custom_sql_parser::custom_sql_parser().await?
+            }
+            ExampleKind::Frontend => frontend::frontend()?,
+            ExampleKind::Query => query::query().await?,
+        }
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/sql_query.rs b/datafusion-examples/examples/sql_ops/query.rs
similarity index 66%
rename from datafusion-examples/examples/sql_query.rs
rename to datafusion-examples/examples/sql_ops/query.rs
index 0ac203cfb7e74..60b47c36b9ae2 100644
--- a/datafusion-examples/examples/sql_query.rs
+++ b/datafusion-examples/examples/sql_ops/query.rs
@@ -15,26 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::arrow::array::{UInt64Array, UInt8Array};
+//! See `main.rs` for how to run it.
+
+use std::sync::Arc;
+
+use datafusion::arrow::array::{UInt8Array, UInt64Array};
 use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::catalog::MemTable;
 use datafusion::common::{assert_batches_eq, exec_datafusion_err};
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
-use datafusion::datasource::MemTable;
 use datafusion::error::{DataFusionError, Result};
 use datafusion::prelude::*;
+use datafusion_examples::utils::{datasets::ExampleDataset, write_csv_to_parquet};
 use object_store::local::LocalFileSystem;
-use std::path::Path;
-use std::sync::Arc;
 
 /// Examples of various ways to execute queries using SQL
 ///
 /// [`query_memtable`]: a simple query against a [`MemTable`]
 /// [`query_parquet`]: a simple query against a directory with multiple Parquet files
-///
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn query() -> Result<()> {
     query_memtable().await?;
     query_parquet().await?;
     Ok(())
@@ -113,32 +114,33 @@ async fn query_parquet() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
-    let test_data = datafusion::test_util::parquet_test_data();
+    // Convert the CSV input into a temporary Parquet directory for querying
+    let dataset = ExampleDataset::Cars;
+    let parquet_temp = write_csv_to_parquet(&ctx, &dataset.path()).await?;
 
     // Configure listing options
     let file_format = ParquetFormat::default().with_enable_pruning(true);
-    let listing_options = ListingOptions::new(Arc::new(file_format))
-        // This is a workaround for this example since `test_data` contains
-        // many different parquet different files,
-        // in practice use FileType::PARQUET.get_ext().
-        .with_file_extension("alltypes_plain.parquet");
+    let listing_options =
+        ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet");
+
+    let table_path = parquet_temp.file_uri()?;
 
     // First example were we use an absolute path, which requires no additional setup.
     ctx.register_listing_table(
         "my_table",
-        &format!("file://{test_data}/"),
+        &table_path,
         listing_options.clone(),
         None,
         None,
     )
-    .await
-    .unwrap();
+    .await?;
 
     // execute the query
     let df = ctx
         .sql(
             "SELECT * \
         FROM my_table \
+        ORDER BY speed \
         LIMIT 1",
         )
         .await?;
@@ -147,20 +149,22 @@ async fn query_parquet() -> Result<()> {
     let results = df.collect().await?;
     assert_batches_eq!(
         [
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
-            "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |",
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
-            "| 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |",
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+            "+-----+-------+---------------------+",
+            "| car | speed | time                |",
+            "+-----+-------+---------------------+",
+            "| red | 0.0   | 1996-04-12T12:05:15 |",
+            "+-----+-------+---------------------+",
         ],
-        &results);
+        &results
+    );
 
-    // Second example were we temporarily move into the test data's parent directory and
-    // simulate a relative path, this requires registering an ObjectStore.
+    // Second example where we change the current working directory and explicitly
+    // register a local filesystem object store. This demonstrates how listing tables
+    // resolve paths via an ObjectStore, even when using filesystem-backed data.
     let cur_dir = std::env::current_dir()?;
-
-    let test_data_path = Path::new(&test_data);
-    let test_data_path_parent = test_data_path
+    let test_data_path_parent = parquet_temp
+        .tmp_dir
+        .path()
         .parent()
         .ok_or(exec_datafusion_err!("test_data path needs a parent"))?;
 
@@ -168,15 +172,15 @@ async fn query_parquet() -> Result<()> {
 
     let local_fs = Arc::new(LocalFileSystem::default());
 
-    let u = url::Url::parse("file://./")
+    let url = url::Url::parse("file://./")
         .map_err(|e| DataFusionError::External(Box::new(e)))?;
-    ctx.register_object_store(&u, local_fs);
+    ctx.register_object_store(&url, local_fs);
 
     // Register a listing table - this will use all files in the directory as data sources
     // for the query
     ctx.register_listing_table(
         "relative_table",
-        "./data",
+        parquet_temp.path_str()?,
         listing_options.clone(),
         None,
         None,
@@ -188,6 +192,7 @@ async fn query_parquet() -> Result<()> {
         .sql(
             "SELECT * \
         FROM relative_table \
+        ORDER BY speed \
         LIMIT 1",
         )
         .await?;
@@ -196,13 +201,14 @@ async fn query_parquet() -> Result<()> {
     let results = df.collect().await?;
     assert_batches_eq!(
         [
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
-            "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |",
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
-            "| 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |",
-            "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+            "+-----+-------+---------------------+",
+            "| car | speed | time                |",
+            "+-----+-------+---------------------+",
+            "| red | 0.0   | 1996-04-12T12:05:15 |",
+            "+-----+-------+---------------------+",
         ],
-        &results);
+        &results
+    );
 
     // Reset the current directory
     std::env::set_current_dir(cur_dir)?;
diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/udf/advanced_udaf.rs
similarity index 95%
rename from datafusion-examples/examples/advanced_udaf.rs
rename to datafusion-examples/examples/udf/advanced_udaf.rs
index 7b1d3e94b2efe..89f621d30e18d 100644
--- a/datafusion-examples/examples/advanced_udaf.rs
+++ b/datafusion-examples/examples/udf/advanced_udaf.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::datatypes::{Field, Schema};
 use datafusion::physical_expr::NullState;
 use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility};
@@ -26,13 +28,13 @@ use arrow::array::{
 use arrow::datatypes::{ArrowNativeTypeOp, ArrowPrimitiveType, Float64Type, UInt32Type};
 use arrow::record_batch::RecordBatch;
 use arrow_schema::FieldRef;
-use datafusion::common::{cast::as_float64_array, ScalarValue};
+use datafusion::common::{ScalarValue, cast::as_float64_array};
 use datafusion::error::Result;
 use datafusion::logical_expr::{
+    Accumulator, AggregateUDF, AggregateUDFImpl, EmitTo, GroupsAccumulator, Signature,
     expr::AggregateFunction,
     function::{AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs},
-    simplify::SimplifyInfo,
-    Accumulator, AggregateUDF, AggregateUDFImpl, EmitTo, GroupsAccumulator, Signature,
+    simplify::SimplifyContext,
 };
 use datafusion::prelude::*;
 
@@ -41,7 +43,7 @@ use datafusion::prelude::*;
 /// a function `accumulator` that returns the `Accumulator` instance.
 ///
 /// To do so, we must implement the `AggregateUDFImpl` trait.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 struct GeoMeanUdaf {
     signature: Signature,
 }
@@ -312,12 +314,16 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
         let prods = emit_to.take_needed(&mut self.prods);
         let nulls = self.null_state.build(emit_to);
 
-        assert_eq!(nulls.len(), prods.len());
+        if let Some(nulls) = &nulls {
+            assert_eq!(nulls.len(), counts.len());
+        }
         assert_eq!(counts.len(), prods.len());
 
         // don't evaluate geometric mean with null inputs to avoid errors on null values
 
-        let array: PrimitiveArray<Float64Type> = if nulls.null_count() > 0 {
+        let array: PrimitiveArray<Float64Type> = if let Some(nulls) = &nulls
+            && nulls.null_count() > 0
+        {
             let mut builder = PrimitiveBuilder::<Float64Type>::with_capacity(nulls.len());
             let iter = prods.into_iter().zip(counts).zip(nulls.iter());
 
@@ -335,7 +341,7 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
                 .zip(counts)
                 .map(|(prod, count)| prod.powf(1.0 / count as f64))
                 .collect::<Vec<_>>();
-            PrimitiveArray::new(geo_mean.into(), Some(nulls)) // no copy
+            PrimitiveArray::new(geo_mean.into(), nulls) // no copy
                 .with_data_type(self.return_data_type.clone())
         };
 
@@ -345,7 +351,6 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
     // return arrays for counts and prods
     fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
         let nulls = self.null_state.build(emit_to);
-        let nulls = Some(nulls);
 
         let counts = emit_to.take_needed(&mut self.counts);
         let counts = UInt32Array::new(counts.into(), nulls.clone()); // zero copy
@@ -368,7 +373,7 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
 
 /// This example shows how to use the AggregateUDFImpl::simplify API to simplify/replace user
 /// defined aggregate function with a different expression which is defined in the `simplify` method.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 struct SimplifiedGeoMeanUdaf {
     signature: Signature,
 }
@@ -419,7 +424,7 @@ impl AggregateUDFImpl for SimplifiedGeoMeanUdaf {
 
     /// Optionally replaces a UDAF with another expression during query optimization.
     fn simplify(&self) -> Option<AggregateFunctionSimplification> {
-        let simplify = |aggregate_function: AggregateFunction, _: &dyn SimplifyInfo| {
+        let simplify = |aggregate_function: AggregateFunction, _: &SimplifyContext| {
             // Replaces the UDAF with `GeoMeanUdaf` as a placeholder example to demonstrate the `simplify` method.
             // In real-world scenarios, you might create UDFs from built-in expressions.
             Ok(Expr::AggregateFunction(AggregateFunction::new_udf(
@@ -469,8 +474,9 @@ fn create_context() -> Result<SessionContext> {
     Ok(ctx)
 }
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// In this example we register `GeoMeanUdaf` and `SimplifiedGeoMeanUdaf`
+/// as user defined aggregate functions and invoke them via the DataFrame API and SQL
+pub async fn advanced_udaf() -> Result<()> {
     let ctx = create_context()?;
 
     let geo_mean_udf = AggregateUDF::from(GeoMeanUdaf::new());
diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/udf/advanced_udf.rs
similarity index 98%
rename from datafusion-examples/examples/advanced_udf.rs
rename to datafusion-examples/examples/udf/advanced_udf.rs
index 290d1c53334b7..a00a7e7df434f 100644
--- a/datafusion-examples/examples/advanced_udf.rs
+++ b/datafusion-examples/examples/udf/advanced_udf.rs
@@ -15,19 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    new_null_array, Array, ArrayRef, AsArray, Float32Array, Float64Array,
+    Array, ArrayRef, AsArray, Float32Array, Float64Array, new_null_array,
 };
 use arrow::compute;
 use arrow::datatypes::{DataType, Float64Type};
 use arrow::record_batch::RecordBatch;
-use datafusion::common::{exec_err, internal_err, ScalarValue};
+use datafusion::common::{ScalarValue, exec_err, internal_err};
 use datafusion::error::Result;
-use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion::logical_expr::Volatility;
+use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion::logical_expr::{
     ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
 };
@@ -39,7 +41,7 @@ use datafusion::prelude::*;
 /// the power of the second argument `a^b`.
 ///
 /// To do so, we must implement the `ScalarUDFImpl` trait.
-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct PowUdf {
     signature: Signature,
     aliases: Vec<String>,
@@ -245,10 +247,35 @@ fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result<ArrayRef> {
     }
 }
 
+/// create local execution context with an in-memory table:
+///
+/// ```text
+/// +-----+-----+
+/// | a   | b   |
+/// +-----+-----+
+/// | 2.1 | 1.0 |
+/// | 3.1 | 2.0 |
+/// | 4.1 | 3.0 |
+/// | 5.1 | 4.0 |
+/// +-----+-----+
+/// ```
+fn create_context() -> Result<SessionContext> {
+    // define data.
+    let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1]));
+    let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
+    let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;
+
+    // declare a new context. In Spark API, this corresponds to a new SparkSession
+    let ctx = SessionContext::new();
+
+    // declare a table in memory. In Spark API, this corresponds to createDataFrame(...).
+    ctx.register_batch("t", batch)?;
+    Ok(ctx)
+}
+
 /// In this example we register `PowUdf` as a user defined function
 /// and invoke it via the DataFrame API and SQL
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn advanced_udf() -> Result<()> {
     let ctx = create_context()?;
 
     // create the UDF
@@ -295,29 +322,3 @@ async fn main() -> Result<()> {
 
     Ok(())
 }
-
-/// create local execution context with an in-memory table:
-///
-/// ```text
-/// +-----+-----+
-/// | a   | b   |
-/// +-----+-----+
-/// | 2.1 | 1.0 |
-/// | 3.1 | 2.0 |
-/// | 4.1 | 3.0 |
-/// | 5.1 | 4.0 |
-/// +-----+-----+
-/// ```
-fn create_context() -> Result<SessionContext> {
-    // define data.
-    let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1]));
-    let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
-    let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;
-
-    // declare a new context. In Spark API, this corresponds to a new SparkSession
-    let ctx = SessionContext::new();
-
-    // declare a table in memory. In Spark API, this corresponds to createDataFrame(...).
-    ctx.register_batch("t", batch)?;
-    Ok(ctx)
-}
diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs
similarity index 89%
rename from datafusion-examples/examples/advanced_udwf.rs
rename to datafusion-examples/examples/udf/advanced_udwf.rs
index 4f00e04e7e993..615d099c2854d 100644
--- a/datafusion-examples/examples/advanced_udwf.rs
+++ b/datafusion-examples/examples/udf/advanced_udwf.rs
@@ -15,8 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility};
-use std::any::Any;
+//! See `main.rs` for how to run it.
+
+use std::{any::Any, sync::Arc};
 
 use arrow::datatypes::Field;
 use arrow::{
@@ -31,19 +32,22 @@ use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams};
 use datafusion::logical_expr::function::{
     PartitionEvaluatorArgs, WindowFunctionSimplification, WindowUDFFieldArgs,
 };
-use datafusion::logical_expr::simplify::SimplifyInfo;
+use datafusion::logical_expr::simplify::SimplifyContext;
 use datafusion::logical_expr::{
-    Expr, PartitionEvaluator, Signature, WindowFrame, WindowFunctionDefinition,
-    WindowUDF, WindowUDFImpl,
+    Expr, LimitEffect, PartitionEvaluator, Signature, WindowFrame,
+    WindowFunctionDefinition, WindowUDF, WindowUDFImpl,
 };
+use datafusion::physical_expr::PhysicalExpr;
 use datafusion::prelude::*;
+use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility};
+use datafusion_examples::utils::datasets::ExampleDataset;
 
 /// This example shows how to use the full WindowUDFImpl API to implement a user
 /// defined window function. As in the `simple_udwf.rs` example, this struct implements
 /// a function `partition_evaluator` that returns the `MyPartitionEvaluator` instance.
 ///
 /// To do so, we must implement the `WindowUDFImpl` trait.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 struct SmoothItUdf {
     signature: Signature,
 }
@@ -91,6 +95,10 @@ impl WindowUDFImpl for SmoothItUdf {
     fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
         Ok(Field::new(field_args.name(), DataType::Float64, true).into())
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 /// This implements the lowest level evaluation for a window function
@@ -149,7 +157,7 @@ impl PartitionEvaluator for MyPartitionEvaluator {
 }
 
 /// This UDWF will show how to use the WindowUDFImpl::simplify() API
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 struct SimplifySmoothItUdf {
     signature: Signature,
 }
@@ -190,8 +198,8 @@ impl WindowUDFImpl for SimplifySmoothItUdf {
     /// this function will simplify `SimplifySmoothItUdf` to `AggregateUDF` for `Avg`
     /// default implementation will not be called (left as `todo!()`)
     fn simplify(&self) -> Option<WindowFunctionSimplification> {
-        let simplify = |window_function: WindowFunction, _: &dyn SimplifyInfo| {
-            Ok(Expr::WindowFunction(WindowFunction {
+        let simplify = |window_function: WindowFunction, _: &SimplifyContext| {
+            Ok(Expr::from(WindowFunction {
                 fun: WindowFunctionDefinition::AggregateUDF(avg_udaf()),
                 params: WindowFunctionParams {
                     args: window_function.params.args,
@@ -199,6 +207,8 @@ impl WindowUDFImpl for SimplifySmoothItUdf {
                     order_by: window_function.params.order_by,
                     window_frame: window_function.params.window_frame,
                     null_treatment: window_function.params.null_treatment,
+                    distinct: window_function.params.distinct,
+                    filter: window_function.params.filter,
                 },
             }))
         };
@@ -209,6 +219,10 @@ impl WindowUDFImpl for SimplifySmoothItUdf {
     fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
         Ok(Field::new(field_args.name(), DataType::Float64, true).into())
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 // create local execution context with `cars.csv` registered as a table named `cars`
@@ -216,17 +230,17 @@ async fn create_context() -> Result<SessionContext> {
     // declare a new context. In spark API, this corresponds to a new spark SQL session
     let ctx = SessionContext::new();
 
-    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
-    println!("pwd: {}", std::env::current_dir().unwrap().display());
-    let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string();
-    let read_options = CsvReadOptions::default().has_header(true);
+    let dataset = ExampleDataset::Cars;
+
+    ctx.register_csv("cars", dataset.path_str()?, CsvReadOptions::new())
+        .await?;
 
-    ctx.register_csv("cars", &csv_path, read_options).await?;
     Ok(ctx)
 }
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// In this example we register `SmoothItUdf` as user defined window function
+/// and invoke it via the DataFrame API and SQL
+pub async fn advanced_udwf() -> Result<()> {
     let ctx = create_context().await?;
     let smooth_it = WindowUDF::from(SmoothItUdf::new());
     ctx.register_udwf(smooth_it.clone());
diff --git a/datafusion-examples/examples/udf/async_udf.rs b/datafusion-examples/examples/udf/async_udf.rs
new file mode 100644
index 0000000000000..3d8faf623d439
--- /dev/null
+++ b/datafusion-examples/examples/udf/async_udf.rs
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! This example shows how to create and use "Async UDFs" in DataFusion.
+//!
+//! Async UDFs allow you to perform asynchronous operations, such as
+//! making network requests. This can be used for tasks like fetching
+//! data from an external API such as a LLM service or an external database.
+
+use std::{any::Any, sync::Arc};
+
+use arrow::array::{ArrayRef, BooleanArray, Int64Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use async_trait::async_trait;
+use datafusion::assert_batches_eq;
+use datafusion::common::cast::as_string_view_array;
+use datafusion::common::error::Result;
+use datafusion::common::not_impl_err;
+use datafusion::common::utils::take_function_args;
+use datafusion::execution::SessionStateBuilder;
+use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion::prelude::{SessionConfig, SessionContext};
+
+/// In this example we register `AskLLM` as an asynchronous user defined function
+/// and invoke it via the DataFrame API and SQL
+pub async fn async_udf() -> Result<()> {
+    // Use a hard coded parallelism level of 4 so the explain plan
+    // is consistent across machines.
+    let config = SessionConfig::new().with_target_partitions(4);
+    let ctx =
+        SessionContext::from(SessionStateBuilder::new().with_config(config).build());
+
+    // Similarly to regular UDFs, you create an AsyncScalarUDF by implementing
+    // `AsyncScalarUDFImpl` and creating an instance of `AsyncScalarUDF`.
+    let async_equal = AskLLM::new();
+    let udf = AsyncScalarUDF::new(Arc::new(async_equal));
+
+    // Async UDFs are registered with the SessionContext, using the same
+    // `register_udf` method as regular UDFs.
+    ctx.register_udf(udf.into_scalar_udf());
+
+    // Create a table named 'animal' with some sample data
+    ctx.register_batch("animal", animal()?)?;
+
+    // You can use the async UDF as normal in SQL queries
+    //
+    // Note: Async UDFs can currently be used in the select list and filter conditions.
+    let results = ctx
+        .sql("select * from animal a where ask_llm(a.name, 'Is this animal furry?')")
+        .await?
+        .collect()
+        .await?;
+
+    assert_batches_eq!(
+        [
+            "+----+------+",
+            "| id | name |",
+            "+----+------+",
+            "| 1  | cat  |",
+            "| 2  | dog  |",
+            "+----+------+",
+        ],
+        &results
+    );
+
+    // While the interface is the same for both normal and async UDFs, you can
+    // use `EXPLAIN` output to see that the async UDF uses a special
+    // `AsyncFuncExec` node in the physical plan:
+    let results = ctx
+        .sql("explain select * from animal a where ask_llm(a.name, 'Is this animal furry?')")
+        .await?
+        .collect()
+        .await?;
+
+    assert_batches_eq!(
+        [
+            "+---------------+------------------------------------------------------------------------------------------------------------------------------+",
+            "| plan_type     | plan                                                                                                                         |",
+            "+---------------+------------------------------------------------------------------------------------------------------------------------------+",
+            "| logical_plan  | SubqueryAlias: a                                                                                                             |",
+            "|               |   Filter: ask_llm(CAST(animal.name AS Utf8View), Utf8View(\"Is this animal furry?\"))                                          |",
+            "|               |     TableScan: animal projection=[id, name]                                                                                  |",
+            "| physical_plan | FilterExec: __async_fn_0@2, projection=[id@0, name@1]                                                                        |",
+            "|               |   RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                                       |",
+            "|               |     AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=ask_llm(CAST(name@1 AS Utf8View), Is this animal furry?))] |",
+            "|               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                      |",
+            "|               |                                                                                                                              |",
+            "+---------------+------------------------------------------------------------------------------------------------------------------------------+",
+        ],
+        &results
+    );
+
+    Ok(())
+}
+
+/// Returns a sample `RecordBatch` representing an "animal" table with two columns:
+fn animal() -> Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("name", DataType::Utf8, false),
+    ]));
+
+    let id_array = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5]));
+    let name_array = Arc::new(StringArray::from(vec![
+        "cat", "dog", "fish", "bird", "snake",
+    ]));
+
+    Ok(RecordBatch::try_new(schema, vec![id_array, name_array])?)
+}
+
+/// An async UDF that simulates asking a large language model (LLM) service a
+/// question based on the content of two columns. The UDF will return a boolean
+/// indicating whether the LLM thinks the first argument matches the question in
+/// the second argument.
+///
+/// Since this is a simplified example, it does not call an LLM service, but
+/// could be extended to do so in a real-world scenario.
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct AskLLM {
+    signature: Signature,
+}
+
+impl Default for AskLLM {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AskLLM {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(
+                vec![DataType::Utf8View, DataType::Utf8View],
+                Volatility::Volatile,
+            ),
+        }
+    }
+}
+
+/// All async UDFs implement the `ScalarUDFImpl` trait, which provides the basic
+/// information for the function, such as its name, signature, and return type.
+/// [async_trait]
+impl ScalarUDFImpl for AskLLM {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "ask_llm"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Boolean)
+    }
+
+    /// Since this is an async UDF, the `invoke_with_args` method will not be
+    /// called directly.
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        not_impl_err!("AskLLM can only be called from async contexts")
+    }
+}
+
+/// In addition to [`ScalarUDFImpl`], we also need to implement the
+/// [`AsyncScalarUDFImpl`] trait.
+#[async_trait]
+impl AsyncScalarUDFImpl for AskLLM {
+    /// The `invoke_async_with_args` method is similar to `invoke_with_args`,
+    /// but it returns a `Future` that resolves to the result.
+    ///
+    /// Since this signature is `async`, it can do any `async` operations, such
+    /// as network requests. This method is run on the same tokio `Runtime` that
+    /// is processing the query, so you may wish to make actual network requests
+    /// on a different `Runtime`, as explained in the `thread_pools.rs` example
+    /// in this directory.
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        // in a real UDF you would likely want to special case constant
+        // arguments to improve performance, but this example converts the
+        // arguments to arrays for simplicity.
+        let args = ColumnarValue::values_to_arrays(&args.args)?;
+        let [content_column, question_column] = take_function_args(self.name(), args)?;
+
+        // In a real function, you would use a library such as `reqwest` here to
+        // make an async HTTP request. Credentials and other configurations can
+        // be supplied via the `ConfigOptions` parameter.
+
+        // In this example, we will simulate the LLM response by comparing the two
+        // input arguments using some static strings
+        let content_column = as_string_view_array(&content_column)?;
+        let question_column = as_string_view_array(&question_column)?;
+
+        let result_array: BooleanArray = content_column
+            .iter()
+            .zip(question_column.iter())
+            .map(|(a, b)| {
+                // If either value is null, return None
+                let a = a?;
+                let b = b?;
+                // Simulate an LLM response by checking the arguments to some
+                // hardcoded conditions.
+                if a.contains("cat") && b.contains("furry")
+                    || a.contains("dog") && b.contains("furry")
+                {
+                    Some(true)
+                } else {
+                    Some(false)
+                }
+            })
+            .collect();
+
+        Ok(ColumnarValue::from(Arc::new(result_array) as ArrayRef))
+    }
+}
diff --git a/datafusion-examples/examples/udf/main.rs b/datafusion-examples/examples/udf/main.rs
new file mode 100644
index 0000000000000..e024e466ab07e
--- /dev/null
+++ b/datafusion-examples/examples/udf/main.rs
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! # User-Defined Functions Examples
+//!
+//! These examples demonstrate user-defined functions in DataFusion.
+//!
+//! ## Usage
+//! ```bash
+//! cargo run --example udf -- [all|adv_udaf|adv_udf|adv_udwf|async_udf|udaf|udf|udtf|udwf]
+//! ```
+//!
+//! Each subcommand runs a corresponding example:
+//! - `all` — run all examples included in this module
+//!
+//! - `adv_udaf`
+//!   (file: advanced_udaf.rs, desc: Advanced User Defined Aggregate Function (UDAF))
+//!
+//! - `adv_udf`
+//!   (file: advanced_udf.rs, desc: Advanced User Defined Scalar Function (UDF))
+//!
+//! - `adv_udwf`
+//!   (file: advanced_udwf.rs, desc: Advanced User Defined Window Function (UDWF))
+//!
+//! - `async_udf`
+//!   (file: async_udf.rs, desc: Asynchronous User Defined Scalar Function)
+//!
+//! - `udaf`
+//!   (file: simple_udaf.rs, desc: Simple UDAF example)
+//!
+//! - `udf`
+//!   (file: simple_udf.rs, desc: Simple UDF example)
+//!
+//! - `udtf`
+//!   (file: simple_udtf.rs, desc: Simple UDTF example)
+//!
+//! - `udwf`
+//!   (file: simple_udwf.rs, desc: Simple UDWF example)
+
+mod advanced_udaf;
+mod advanced_udf;
+mod advanced_udwf;
+mod async_udf;
+mod simple_udaf;
+mod simple_udf;
+mod simple_udtf;
+mod simple_udwf;
+
+use datafusion::error::{DataFusionError, Result};
+use strum::{IntoEnumIterator, VariantNames};
+use strum_macros::{Display, EnumIter, EnumString, VariantNames};
+
+#[derive(EnumIter, EnumString, Display, VariantNames)]
+#[strum(serialize_all = "snake_case")]
+enum ExampleKind {
+    All,
+    AdvUdaf,
+    AdvUdf,
+    AdvUdwf,
+    AsyncUdf,
+    Udf,
+    Udaf,
+    Udwf,
+    Udtf,
+}
+
+impl ExampleKind {
+    const EXAMPLE_NAME: &str = "udf";
+
+    fn runnable() -> impl Iterator<Item = ExampleKind> {
+        ExampleKind::iter().filter(|v| !matches!(v, ExampleKind::All))
+    }
+
+    async fn run(&self) -> Result<()> {
+        match self {
+            ExampleKind::All => {
+                for example in ExampleKind::runnable() {
+                    println!("Running example: {example}");
+                    Box::pin(example.run()).await?;
+                }
+            }
+            ExampleKind::AdvUdaf => advanced_udaf::advanced_udaf().await?,
+            ExampleKind::AdvUdf => advanced_udf::advanced_udf().await?,
+            ExampleKind::AdvUdwf => advanced_udwf::advanced_udwf().await?,
+            ExampleKind::AsyncUdf => async_udf::async_udf().await?,
+            ExampleKind::Udaf => simple_udaf::simple_udaf().await?,
+            ExampleKind::Udf => simple_udf::simple_udf().await?,
+            ExampleKind::Udtf => simple_udtf::simple_udtf().await?,
+            ExampleKind::Udwf => simple_udwf::simple_udwf().await?,
+        }
+
+        Ok(())
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let usage = format!(
+        "Usage: cargo run --example {} -- [{}]",
+        ExampleKind::EXAMPLE_NAME,
+        ExampleKind::VARIANTS.join("|")
+    );
+
+    let example: ExampleKind = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| ExampleKind::All.to_string())
+        .parse()
+        .map_err(|_| DataFusionError::Execution(format!("Unknown example. {usage}")))?;
+
+    example.run().await
+}
diff --git a/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/udf/simple_udaf.rs
similarity index 96%
rename from datafusion-examples/examples/simple_udaf.rs
rename to datafusion-examples/examples/udf/simple_udaf.rs
index 82bde7c034a57..42ea0054b759f 100644
--- a/datafusion-examples/examples/simple_udaf.rs
+++ b/datafusion-examples/examples/udf/simple_udaf.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+//!
 /// In this example we will declare a single-type, single return type UDAF that computes the geometric mean.
 /// The geometric mean is described here: https://en.wikipedia.org/wiki/Geometric_mean
 use datafusion::arrow::{
@@ -135,8 +137,9 @@ impl Accumulator for GeometricMean {
     }
 }
 
-#[tokio::main]
-async fn main() -> Result<()> {
+/// In this example we register `GeometricMean`
+/// as user defined aggregate function and invoke it via the DataFrame API and SQL
+pub async fn simple_udaf() -> Result<()> {
     let ctx = create_context()?;
 
     // here is where we define the UDAF. We also declare its signature:
diff --git a/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/udf/simple_udf.rs
similarity index 98%
rename from datafusion-examples/examples/simple_udf.rs
rename to datafusion-examples/examples/udf/simple_udf.rs
index 5612e0939f709..e8d6c9c8173ac 100644
--- a/datafusion-examples/examples/simple_udf.rs
+++ b/datafusion-examples/examples/udf/simple_udf.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use datafusion::{
     arrow::{
         array::{ArrayRef, Float32Array, Float64Array},
@@ -57,8 +59,7 @@ fn create_context() -> Result<SessionContext> {
 }
 
 /// In this example we will declare a single-type, single return type UDF that exponentiates f64, a^b
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn simple_udf() -> Result<()> {
     let ctx = create_context()?;
 
     // First, declare the actual implementation of the calculation
diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs
similarity index 87%
rename from datafusion-examples/examples/simple_udtf.rs
rename to datafusion-examples/examples/udf/simple_udtf.rs
index d2b2d1bf96551..ee2615c4a5ac1 100644
--- a/datafusion-examples/examples/simple_udtf.rs
+++ b/datafusion-examples/examples/udf/simple_udtf.rs
@@ -15,53 +15,56 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::csv::reader::Format;
+//! See `main.rs` for how to run it.
+
+use std::fs::File;
+use std::io::Seek;
+use std::path::Path;
+use std::sync::Arc;
+
 use arrow::csv::ReaderBuilder;
+use arrow::csv::reader::Format;
 use async_trait::async_trait;
 use datafusion::arrow::datatypes::SchemaRef;
 use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::catalog::Session;
-use datafusion::catalog::TableFunctionImpl;
-use datafusion::common::{plan_err, ScalarValue};
-use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::catalog::{Session, TableFunctionImpl};
+use datafusion::common::{ScalarValue, plan_err};
 use datafusion::datasource::TableProvider;
+use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::error::Result;
-use datafusion::execution::context::ExecutionProps;
 use datafusion::logical_expr::simplify::SimplifyContext;
 use datafusion::logical_expr::{Expr, TableType};
 use datafusion::optimizer::simplify_expressions::ExprSimplifier;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
-use std::fs::File;
-use std::io::Seek;
-use std::path::Path;
-use std::sync::Arc;
+use datafusion_examples::utils::datasets::ExampleDataset;
+
 // To define your own table function, you only need to do the following 3 things:
 // 1. Implement your own [`TableProvider`]
 // 2. Implement your own [`TableFunctionImpl`] and return your [`TableProvider`]
 // 3. Register the function using [`SessionContext::register_udtf`]
 
 /// This example demonstrates how to register a TableFunction
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn simple_udtf() -> Result<()> {
     // create local execution context
     let ctx = SessionContext::new();
 
     // register the table function that will be called in SQL statements by `read_csv`
     ctx.register_udtf("read_csv", Arc::new(LocalCsvTableFunc {}));
 
-    let testdata = datafusion::test_util::arrow_test_data();
-    let csv_file = format!("{testdata}/csv/aggregate_test_100.csv");
+    let dataset = ExampleDataset::Cars;
 
     // Pass 2 arguments, read csv with at most 2 rows (simplify logic makes 1+1 --> 2)
     let df = ctx
-        .sql(format!("SELECT * FROM read_csv('{csv_file}', 1 + 1);").as_str())
+        .sql(
+            format!("SELECT * FROM read_csv('{}', 1 + 1);", dataset.path_str()?).as_str(),
+        )
         .await?;
     df.show().await?;
 
     // just run, return all rows
     let df = ctx
-        .sql(format!("SELECT * FROM read_csv('{csv_file}');").as_str())
+        .sql(format!("SELECT * FROM read_csv('{}');", dataset.path_str()?).as_str())
         .await?;
     df.show().await?;
 
@@ -133,7 +136,7 @@ struct LocalCsvTableFunc {}
 
 impl TableFunctionImpl for LocalCsvTableFunc {
     fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
-        let Some(Expr::Literal(ScalarValue::Utf8(Some(ref path)))) = exprs.first() else {
+        let Some(Expr::Literal(ScalarValue::Utf8(Some(path)), _)) = exprs.first() else {
             return plan_err!("read_csv requires at least one string argument");
         };
 
@@ -141,11 +144,10 @@ impl TableFunctionImpl for LocalCsvTableFunc {
             .get(1)
             .map(|expr| {
                 // try to simplify the expression, so 1+2 becomes 3, for example
-                let execution_props = ExecutionProps::new();
-                let info = SimplifyContext::new(&execution_props);
+                let info = SimplifyContext::default();
                 let expr = ExprSimplifier::new(info).simplify(expr.clone())?;
 
-                if let Expr::Literal(ScalarValue::Int64(Some(limit))) = expr {
+                if let Expr::Literal(ScalarValue::Int64(Some(limit)), _) = expr {
                     Ok(limit as usize)
                 } else {
                     plan_err!("Limit must be an integer")
diff --git a/datafusion-examples/examples/simple_udwf.rs b/datafusion-examples/examples/udf/simple_udwf.rs
similarity index 79%
rename from datafusion-examples/examples/simple_udwf.rs
rename to datafusion-examples/examples/udf/simple_udwf.rs
index 1736ff00bd700..1842d88b9ba29 100644
--- a/datafusion-examples/examples/simple_udwf.rs
+++ b/datafusion-examples/examples/udf/simple_udwf.rs
@@ -15,35 +15,70 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::sync::Arc;
+//! See `main.rs` for how to run it.
+
+use std::{fs::File, io::Write, sync::Arc};
 
 use arrow::{
     array::{ArrayRef, AsArray, Float64Array},
     datatypes::{DataType, Float64Type},
 };
-
 use datafusion::common::ScalarValue;
 use datafusion::error::Result;
 use datafusion::logical_expr::{PartitionEvaluator, Volatility, WindowFrame};
 use datafusion::prelude::*;
+use tempfile::tempdir;
 
 // create local execution context with `cars.csv` registered as a table named `cars`
 async fn create_context() -> Result<SessionContext> {
     // declare a new context. In spark API, this corresponds to a new spark SQL session
     let ctx = SessionContext::new();
 
-    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
-    println!("pwd: {}", std::env::current_dir().unwrap().display());
-    let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string();
-    let read_options = CsvReadOptions::default().has_header(true);
+    // content from file 'datafusion/core/tests/data/cars.csv'
+    let csv_data = r#"car,speed,time
+red,20.0,1996-04-12T12:05:03.000000000
+red,20.3,1996-04-12T12:05:04.000000000
+red,21.4,1996-04-12T12:05:05.000000000
+red,21.5,1996-04-12T12:05:06.000000000
+red,19.0,1996-04-12T12:05:07.000000000
+red,18.0,1996-04-12T12:05:08.000000000
+red,17.0,1996-04-12T12:05:09.000000000
+red,7.0,1996-04-12T12:05:10.000000000
+red,7.1,1996-04-12T12:05:11.000000000
+red,7.2,1996-04-12T12:05:12.000000000
+red,3.0,1996-04-12T12:05:13.000000000
+red,1.0,1996-04-12T12:05:14.000000000
+red,0.0,1996-04-12T12:05:15.000000000
+green,10.0,1996-04-12T12:05:03.000000000
+green,10.3,1996-04-12T12:05:04.000000000
+green,10.4,1996-04-12T12:05:05.000000000
+green,10.5,1996-04-12T12:05:06.000000000
+green,11.0,1996-04-12T12:05:07.000000000
+green,12.0,1996-04-12T12:05:08.000000000
+green,14.0,1996-04-12T12:05:09.000000000
+green,15.0,1996-04-12T12:05:10.000000000
+green,15.1,1996-04-12T12:05:11.000000000
+green,15.2,1996-04-12T12:05:12.000000000
+green,8.0,1996-04-12T12:05:13.000000000
+green,2.0,1996-04-12T12:05:14.000000000
+"#;
+    let dir = tempdir()?;
+    let file_path = dir.path().join("cars.csv");
+    {
+        let mut file = File::create(&file_path)?;
+        // write CSV data
+        file.write_all(csv_data.as_bytes())?;
+    } // scope closes the file
+    let file_path = file_path.to_str().unwrap();
+
+    ctx.register_csv("cars", file_path, CsvReadOptions::new())
+        .await?;
 
-    ctx.register_csv("cars", &csv_path, read_options).await?;
     Ok(ctx)
 }
 
 /// In this example we will declare a user defined window function that computes a moving average and then run it using SQL
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn simple_udwf() -> Result<()> {
     let ctx = create_context().await?;
 
     // here is where we define the UDWF. We also declare its signature:
diff --git a/datafusion-examples/src/bin/examples-docs.rs b/datafusion-examples/src/bin/examples-docs.rs
new file mode 100644
index 0000000000000..7efcf4da15d20
--- /dev/null
+++ b/datafusion-examples/src/bin/examples-docs.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Generates Markdown documentation for DataFusion example groups.
+//!
+//! This binary scans `datafusion-examples/examples`, extracts structured
+//! documentation from each group's `main.rs` file, and renders a README-style
+//! Markdown document.
+//!
+//! By default, documentation is generated for all example groups. If a group
+//! name is provided as the first CLI argument, only that group is rendered.
+//!
+//! ## Usage
+//!
+//! ```bash
+//! # Generate docs for all example groups
+//! cargo run --bin examples-docs
+//!
+//! # Generate docs for a single group
+//! cargo run --bin examples-docs -- dataframe
+//! ```
+
+use datafusion_examples::utils::example_metadata::{
+    RepoLayout, generate_examples_readme,
+};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let layout = RepoLayout::detect()?;
+    let group = std::env::args().nth(1);
+    let markdown = generate_examples_readme(&layout, group.as_deref())?;
+    print!("{markdown}");
+    Ok(())
+}
diff --git a/datafusion-examples/src/lib.rs b/datafusion-examples/src/lib.rs
new file mode 100644
index 0000000000000..7f334aedaafe2
--- /dev/null
+++ b/datafusion-examples/src/lib.rs
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Internal utilities shared by the DataFusion examples.
+
+pub mod utils;
diff --git a/datafusion-examples/src/utils/csv_to_parquet.rs b/datafusion-examples/src/utils/csv_to_parquet.rs
new file mode 100644
index 0000000000000..1fbf2930e9043
--- /dev/null
+++ b/datafusion-examples/src/utils/csv_to_parquet.rs
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::path::{Path, PathBuf};
+
+use datafusion::dataframe::DataFrameWriteOptions;
+use datafusion::error::{DataFusionError, Result};
+use datafusion::prelude::{CsvReadOptions, SessionContext};
+use tempfile::TempDir;
+use tokio::fs::create_dir_all;
+
+/// Temporary Parquet directory that is deleted when dropped.
+#[derive(Debug)]
+pub struct ParquetTemp {
+    pub tmp_dir: TempDir,
+    pub parquet_dir: PathBuf,
+}
+
+impl ParquetTemp {
+    pub fn path(&self) -> &Path {
+        &self.parquet_dir
+    }
+
+    pub fn path_str(&self) -> Result<&str> {
+        self.parquet_dir.to_str().ok_or_else(|| {
+            DataFusionError::Execution(format!(
+                "Parquet directory path is not valid UTF-8: {}",
+                self.parquet_dir.display()
+            ))
+        })
+    }
+
+    pub fn file_uri(&self) -> Result<String> {
+        Ok(format!("file://{}", self.path_str()?))
+    }
+}
+
+/// Helper for examples: load a CSV file and materialize it as Parquet
+/// in a temporary directory.
+///
+/// # Example
+/// ```
+/// use std::path::PathBuf;
+/// use datafusion::prelude::*;
+/// use datafusion_examples::utils::write_csv_to_parquet;
+/// # use datafusion::assert_batches_eq;
+/// # use datafusion::error::Result;
+/// # #[tokio::main]
+/// # async fn main() -> Result<()> {
+/// let ctx = SessionContext::new();
+/// let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+///     .join("data")
+///     .join("csv")
+///     .join("cars.csv");
+/// let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?;
+/// let df = ctx.read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default()).await?;
+/// let rows = df
+///    .sort(vec![col("speed").sort(true, true)])?
+///    .limit(0, Some(5))?;
+/// assert_batches_eq!(
+///     &[
+///        "+-------+-------+---------------------+",
+///        "| car   | speed | time                |",
+///        "+-------+-------+---------------------+",
+///        "| red   | 0.0   | 1996-04-12T12:05:15 |",
+///        "| red   | 1.0   | 1996-04-12T12:05:14 |",
+///        "| green | 2.0   | 1996-04-12T12:05:14 |",
+///        "| red   | 3.0   | 1996-04-12T12:05:13 |",
+///        "| red   | 7.0   | 1996-04-12T12:05:10 |",
+///        "+-------+-------+---------------------+",
+///      ],
+///        &rows.collect().await?
+/// );
+/// # Ok(())
+/// # }
+/// ```
+pub async fn write_csv_to_parquet(
+    ctx: &SessionContext,
+    csv_path: &Path,
+) -> Result<ParquetTemp> {
+    if !csv_path.is_file() {
+        return Err(DataFusionError::Execution(format!(
+            "CSV file does not exist: {}",
+            csv_path.display()
+        )));
+    }
+
+    let csv_path = csv_path.to_str().ok_or_else(|| {
+        DataFusionError::Execution("CSV path is not valid UTF-8".to_string())
+    })?;
+
+    let csv_df = ctx.read_csv(csv_path, CsvReadOptions::default()).await?;
+
+    let tmp_dir = TempDir::new()?;
+    let parquet_dir = tmp_dir.path().join("parquet_source");
+    create_dir_all(&parquet_dir).await?;
+
+    let path = parquet_dir.to_str().ok_or_else(|| {
+        DataFusionError::Execution("Failed processing tmp directory path".to_string())
+    })?;
+
+    csv_df
+        .write_parquet(path, DataFrameWriteOptions::default(), None)
+        .await?;
+
+    Ok(ParquetTemp {
+        tmp_dir,
+        parquet_dir,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::path::PathBuf;
+
+    use datafusion::assert_batches_eq;
+    use datafusion::prelude::*;
+
+    #[tokio::test]
+    async fn test_write_csv_to_parquet_with_cars_data() -> Result<()> {
+        let ctx = SessionContext::new();
+        let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("data")
+            .join("csv")
+            .join("cars.csv");
+
+        let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?;
+        let df = ctx
+            .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default())
+            .await?;
+
+        let rows = df.sort(vec![col("speed").sort(true, true)])?;
+        assert_batches_eq!(
+            &[
+                "+-------+-------+---------------------+",
+                "| car   | speed | time                |",
+                "+-------+-------+---------------------+",
+                "| red   | 0.0   | 1996-04-12T12:05:15 |",
+                "| red   | 1.0   | 1996-04-12T12:05:14 |",
+                "| green | 2.0   | 1996-04-12T12:05:14 |",
+                "| red   | 3.0   | 1996-04-12T12:05:13 |",
+                "| red   | 7.0   | 1996-04-12T12:05:10 |",
+                "| red   | 7.1   | 1996-04-12T12:05:11 |",
+                "| red   | 7.2   | 1996-04-12T12:05:12 |",
+                "| green | 8.0   | 1996-04-12T12:05:13 |",
+                "| green | 10.0  | 1996-04-12T12:05:03 |",
+                "| green | 10.3  | 1996-04-12T12:05:04 |",
+                "| green | 10.4  | 1996-04-12T12:05:05 |",
+                "| green | 10.5  | 1996-04-12T12:05:06 |",
+                "| green | 11.0  | 1996-04-12T12:05:07 |",
+                "| green | 12.0  | 1996-04-12T12:05:08 |",
+                "| green | 14.0  | 1996-04-12T12:05:09 |",
+                "| green | 15.0  | 1996-04-12T12:05:10 |",
+                "| green | 15.1  | 1996-04-12T12:05:11 |",
+                "| green | 15.2  | 1996-04-12T12:05:12 |",
+                "| red   | 17.0  | 1996-04-12T12:05:09 |",
+                "| red   | 18.0  | 1996-04-12T12:05:08 |",
+                "| red   | 19.0  | 1996-04-12T12:05:07 |",
+                "| red   | 20.0  | 1996-04-12T12:05:03 |",
+                "| red   | 20.3  | 1996-04-12T12:05:04 |",
+                "| red   | 21.4  | 1996-04-12T12:05:05 |",
+                "| red   | 21.5  | 1996-04-12T12:05:06 |",
+                "+-------+-------+---------------------+",
+            ],
+            &rows.collect().await?
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_csv_to_parquet_with_regex_data() -> Result<()> {
+        let ctx = SessionContext::new();
+        let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("data")
+            .join("csv")
+            .join("regex.csv");
+
+        let parquet_dir = write_csv_to_parquet(&ctx, &csv_path).await?;
+        let df = ctx
+            .read_parquet(parquet_dir.path_str()?, ParquetReadOptions::default())
+            .await?;
+
+        let rows = df.sort(vec![col("values").sort(true, true)])?;
+        assert_batches_eq!(
+            &[
+                "+------------+--------------------------------------+-------------+-------+",
+                "| values     | patterns                             | replacement | flags |",
+                "+------------+--------------------------------------+-------------+-------+",
+                "| 4000       | \\b4([1-9]\\d\\d|\\d[1-9]\\d|\\d\\d[1-9])\\b | xyz         |       |",
+                "| 4010       | \\b4([1-9]\\d\\d|\\d[1-9]\\d|\\d\\d[1-9])\\b | xyz         |       |",
+                "| ABC        | ^(A).*                               | B           | i     |",
+                "| AbC        | (B|D)                                | e           |       |",
+                "| Düsseldorf | [\\p{Letter}-]+                       | München     |       |",
+                "| Köln       | [a-zA-Z]ö[a-zA-Z]{2}                 | Koln        |       |",
+                "| aBC        | ^(b|c)                               | d           |       |",
+                "| aBc        | (b|d)                                | e           | i     |",
+                "| abc        | ^(a)                                 | bb\\1bb      | i     |",
+                "| Москва     | [\\p{L}-]+                            | Moscow      |       |",
+                "| اليوم      | ^\\p{Arabic}+$                        | Today       |       |",
+                "+------------+--------------------------------------+-------------+-------+",
+            ],
+            &rows.collect().await?
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_csv_to_parquet_error() {
+        let ctx = SessionContext::new();
+        let csv_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("data")
+            .join("csv")
+            .join("file-does-not-exist.csv");
+
+        let err = write_csv_to_parquet(&ctx, &csv_path).await.unwrap_err();
+        match err {
+            DataFusionError::Execution(msg) => {
+                assert!(
+                    msg.contains("CSV file does not exist"),
+                    "unexpected error message: {msg}"
+                );
+            }
+            other => panic!("unexpected error variant: {other:?}"),
+        }
+    }
+}
diff --git a/datafusion-examples/src/utils/datasets/cars.rs b/datafusion-examples/src/utils/datasets/cars.rs
new file mode 100644
index 0000000000000..2d8547c16d686
--- /dev/null
+++ b/datafusion-examples/src/utils/datasets/cars.rs
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+
+/// Schema for the `data/csv/cars.csv` example dataset.
+pub fn schema() -> Arc<Schema> {
+    Arc::new(Schema::new(vec![
+        Field::new("car", DataType::Utf8, false),
+        Field::new("speed", DataType::Float64, false),
+        Field::new(
+            "time",
+            DataType::Timestamp(TimeUnit::Nanosecond, None),
+            false,
+        ),
+    ]))
+}
diff --git a/datafusion-examples/src/utils/datasets/mod.rs b/datafusion-examples/src/utils/datasets/mod.rs
new file mode 100644
index 0000000000000..1857e6af9b559
--- /dev/null
+++ b/datafusion-examples/src/utils/datasets/mod.rs
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::path::PathBuf;
+
+use arrow_schema::SchemaRef;
+use datafusion::error::{DataFusionError, Result};
+
+pub mod cars;
+pub mod regex;
+
+/// Describes example datasets used across DataFusion examples.
+///
+/// This enum provides a single, discoverable place to define
+/// dataset-specific metadata such as file paths and schemas.
+#[derive(Debug)]
+pub enum ExampleDataset {
+    Cars,
+    Regex,
+}
+
+impl ExampleDataset {
+    pub fn file_stem(&self) -> &'static str {
+        match self {
+            Self::Cars => "cars",
+            Self::Regex => "regex",
+        }
+    }
+
+    pub fn path(&self) -> PathBuf {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("data")
+            .join("csv")
+            .join(format!("{}.csv", self.file_stem()))
+    }
+
+    pub fn path_str(&self) -> Result<String> {
+        let path = self.path();
+        path.to_str().map(String::from).ok_or_else(|| {
+            DataFusionError::Execution(format!(
+                "CSV directory path is not valid UTF-8: {}",
+                path.display()
+            ))
+        })
+    }
+
+    pub fn schema(&self) -> SchemaRef {
+        match self {
+            Self::Cars => cars::schema(),
+            Self::Regex => regex::schema(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use arrow::datatypes::{DataType, TimeUnit};
+
+    #[test]
+    fn example_dataset_file_stem() {
+        assert_eq!(ExampleDataset::Cars.file_stem(), "cars");
+        assert_eq!(ExampleDataset::Regex.file_stem(), "regex");
+    }
+
+    #[test]
+    fn example_dataset_path_points_to_csv() {
+        let path = ExampleDataset::Cars.path();
+        assert!(path.ends_with("data/csv/cars.csv"));
+
+        let path = ExampleDataset::Regex.path();
+        assert!(path.ends_with("data/csv/regex.csv"));
+    }
+
+    #[test]
+    fn example_dataset_path_str_is_valid_utf8() {
+        let path = ExampleDataset::Cars.path_str().unwrap();
+        assert!(path.ends_with("cars.csv"));
+
+        let path = ExampleDataset::Regex.path_str().unwrap();
+        assert!(path.ends_with("regex.csv"));
+    }
+
+    #[test]
+    fn cars_schema_is_stable() {
+        let schema = ExampleDataset::Cars.schema();
+
+        let fields: Vec<_> = schema
+            .fields()
+            .iter()
+            .map(|f| (f.name().as_str(), f.data_type().clone()))
+            .collect();
+
+        assert_eq!(
+            fields,
+            vec![
+                ("car", DataType::Utf8),
+                ("speed", DataType::Float64),
+                ("time", DataType::Timestamp(TimeUnit::Nanosecond, None)),
+            ]
+        );
+    }
+
+    #[test]
+    fn regex_schema_is_stable() {
+        let schema = ExampleDataset::Regex.schema();
+
+        let fields: Vec<_> = schema
+            .fields()
+            .iter()
+            .map(|f| (f.name().as_str(), f.data_type().clone()))
+            .collect();
+
+        assert_eq!(
+            fields,
+            vec![
+                ("values", DataType::Utf8),
+                ("patterns", DataType::Utf8),
+                ("replacement", DataType::Utf8),
+                ("flags", DataType::Utf8),
+            ]
+        );
+    }
+}
diff --git a/datafusion-examples/src/utils/datasets/regex.rs b/datafusion-examples/src/utils/datasets/regex.rs
new file mode 100644
index 0000000000000..d44582126a053
--- /dev/null
+++ b/datafusion-examples/src/utils/datasets/regex.rs
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema};
+
+/// Schema for the `data/csv/regex.csv` example dataset.
+pub fn schema() -> Arc<Schema> {
+    Arc::new(Schema::new(vec![
+        Field::new("values", DataType::Utf8, false),
+        Field::new("patterns", DataType::Utf8, false),
+        Field::new("replacement", DataType::Utf8, false),
+        Field::new("flags", DataType::Utf8, true),
+    ]))
+}
diff --git a/datafusion-examples/src/utils/example_metadata/discover.rs b/datafusion-examples/src/utils/example_metadata/discover.rs
new file mode 100644
index 0000000000000..1ba5f6d29a14e
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/discover.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for discovering example groups in the repository filesystem.
+//!
+//! An example group is defined as a directory containing a `main.rs` file
+//! under the examples root. This module is intentionally filesystem-focused
+//! and does not perform any parsing or rendering.
+//! Discovery fails if no valid example groups are found.
+
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use datafusion::common::exec_err;
+use datafusion::error::Result;
+
+/// Discovers all example group directories under the given root.
+///
+/// A directory is considered an example group if it contains a `main.rs` file.
+pub fn discover_example_groups(root: &Path) -> Result<Vec<PathBuf>> {
+    let mut groups = Vec::new();
+    for entry in fs::read_dir(root)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() && path.join("main.rs").is_file() {
+            groups.push(path);
+        }
+    }
+
+    if groups.is_empty() {
+        return exec_err!("No example groups found under: {}", root.display());
+    }
+
+    groups.sort();
+    Ok(groups)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::utils::example_metadata::test_utils::assert_exec_err_contains;
+
+    use std::fs::{self, File};
+
+    use tempfile::TempDir;
+
+    #[test]
+    fn discover_example_groups_finds_dirs_with_main_rs() -> Result<()> {
+        let tmp = TempDir::new()?;
+        let root = tmp.path();
+
+        // valid example group
+        let group1 = root.join("group1");
+        fs::create_dir(&group1)?;
+        File::create(group1.join("main.rs"))?;
+
+        // not an example group
+        let group2 = root.join("group2");
+        fs::create_dir(&group2)?;
+
+        let groups = discover_example_groups(root)?;
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0], group1);
+        Ok(())
+    }
+
+    #[test]
+    fn discover_example_groups_errors_if_main_rs_is_a_directory() -> Result<()> {
+        let tmp = TempDir::new()?;
+        let root = tmp.path();
+        let group = root.join("group");
+        fs::create_dir(&group)?;
+        fs::create_dir(group.join("main.rs"))?;
+
+        let err = discover_example_groups(root).unwrap_err();
+        assert_exec_err_contains(err, "No example groups found");
+        Ok(())
+    }
+
+    #[test]
+    fn discover_example_groups_errors_if_none_found() -> Result<()> {
+        let tmp = TempDir::new()?;
+        let err = discover_example_groups(tmp.path()).unwrap_err();
+        assert_exec_err_contains(err, "No example groups found");
+        Ok(())
+    }
+}
diff --git a/datafusion-examples/src/utils/example_metadata/layout.rs b/datafusion-examples/src/utils/example_metadata/layout.rs
new file mode 100644
index 0000000000000..ee6fad89855f9
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/layout.rs
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Repository layout utilities.
+//!
+//! This module provides a small helper (`RepoLayout`) that encapsulates
+//! knowledge about the DataFusion repository structure, in particular
+//! where example groups are located relative to the repository root.
+
+use std::path::{Path, PathBuf};
+
+use datafusion::error::{DataFusionError, Result};
+
+/// Describes the layout of a DataFusion repository.
+///
+/// This type centralizes knowledge about where example-related
+/// directories live relative to the repository root.
+#[derive(Debug, Clone)]
+pub struct RepoLayout {
+    root: PathBuf,
+}
+
+impl From<&Path> for RepoLayout {
+    fn from(path: &Path) -> Self {
+        Self {
+            root: path.to_path_buf(),
+        }
+    }
+}
+
+impl RepoLayout {
+    /// Creates a layout from an explicit repository root.
+    pub fn from_root(root: PathBuf) -> Self {
+        Self { root }
+    }
+
+    /// Detects the repository root based on `CARGO_MANIFEST_DIR`.
+    ///
+    /// This is intended for use from binaries inside the workspace.
+    pub fn detect() -> Result<Self> {
+        let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+
+        let root = manifest_dir.parent().ok_or_else(|| {
+            DataFusionError::Execution(
+                "CARGO_MANIFEST_DIR does not have a parent".to_string(),
+            )
+        })?;
+
+        Ok(Self {
+            root: root.to_path_buf(),
+        })
+    }
+
+    /// Returns the repository root directory.
+    pub fn root(&self) -> &Path {
+        &self.root
+    }
+
+    /// Returns the `datafusion-examples/examples` directory.
+    pub fn examples_root(&self) -> PathBuf {
+        self.root.join("datafusion-examples").join("examples")
+    }
+
+    /// Returns the directory for a single example group.
+    ///
+    /// Example: `examples/udf`
+    pub fn example_group_dir(&self, group: &str) -> PathBuf {
+        self.examples_root().join(group)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn detect_sets_non_empty_root() -> Result<()> {
+        let layout = RepoLayout::detect()?;
+        assert!(!layout.root().as_os_str().is_empty());
+        Ok(())
+    }
+
+    #[test]
+    fn examples_root_is_under_repo_root() -> Result<()> {
+        let layout = RepoLayout::detect()?;
+        let examples_root = layout.examples_root();
+        assert!(examples_root.starts_with(layout.root()));
+        assert!(examples_root.ends_with("datafusion-examples/examples"));
+        Ok(())
+    }
+
+    #[test]
+    fn example_group_dir_appends_group_name() -> Result<()> {
+        let layout = RepoLayout::detect()?;
+        let group_dir = layout.example_group_dir("foo");
+        assert!(group_dir.ends_with("datafusion-examples/examples/foo"));
+        Ok(())
+    }
+}
diff --git a/datafusion-examples/src/utils/example_metadata/mod.rs b/datafusion-examples/src/utils/example_metadata/mod.rs
new file mode 100644
index 0000000000000..ab4c8e4a8e4c2
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/mod.rs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Documentation generator for DataFusion examples.
+//!
+//! # Design goals
+//!
+//! - Keep README.md in sync with runnable examples
+//! - Fail fast on malformed documentation
+//!
+//! # Overview
+//!
+//! Each example group corresponds to a directory under
+//! `datafusion-examples/examples/<group>` containing a `main.rs` file.
+//! Documentation is extracted from structured `//!` comments in that file.
+//!
+//! For each example group, the generator produces:
+//!
+//! ```text
+//! ## <Group Name> Examples
+//! ### Group: `<group>`
+//! #### Category: Single Process | Distributed
+//!
+//! | Subcommand | File Path | Description |
+//! ```
+//!
+//! # Usage
+//!
+//! Generate documentation for a single group only:
+//!
+//! ```bash
+//! cargo run --bin examples-docs -- dataframe
+//! ```
+//!
+//! Generate documentation for all examples:
+//!
+//! ```bash
+//! cargo run --bin examples-docs  
+//! ```
+
+pub mod discover;
+pub mod layout;
+pub mod model;
+pub mod parser;
+pub mod render;
+
+#[cfg(test)]
+pub mod test_utils;
+
+pub use layout::RepoLayout;
+pub use model::{Category, ExampleEntry, ExampleGroup, GroupName};
+pub use parser::parse_main_rs_docs;
+pub use render::generate_examples_readme;
diff --git a/datafusion-examples/src/utils/example_metadata/model.rs b/datafusion-examples/src/utils/example_metadata/model.rs
new file mode 100644
index 0000000000000..11416d141eb74
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/model.rs
@@ -0,0 +1,418 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Domain model for DataFusion example documentation.
+//!
+//! This module defines the core data structures used to represent
+//! example groups, individual examples, and their categorization
+//! as parsed from `main.rs` documentation comments.
+
+use std::path::Path;
+
+use datafusion::error::{DataFusionError, Result};
+
+use crate::utils::example_metadata::parse_main_rs_docs;
+
+/// Well-known abbreviations used to preserve correct capitalization
+/// when generating human-readable documentation titles.
+const ABBREVIATIONS: &[(&str, &str)] = &[
+    ("dataframe", "DataFrame"),
+    ("io", "IO"),
+    ("sql", "SQL"),
+    ("udf", "UDF"),
+];
+
+/// A group of related examples (e.g. `builtin_functions`, `udf`).
+///
+/// Each group corresponds to a directory containing a `main.rs` file
+/// with structured documentation comments.
+#[derive(Debug)]
+pub struct ExampleGroup {
+    pub name: GroupName,
+    pub examples: Vec<ExampleEntry>,
+    pub category: Category,
+}
+
+impl ExampleGroup {
+    /// Parses an example group from its directory.
+    ///
+    /// The group name is derived from the directory name, and example
+    /// entries are extracted from `main.rs`.
+    pub fn from_dir(dir: &Path, category: Category) -> Result<Self> {
+        let raw_name = dir
+            .file_name()
+            .and_then(|s| s.to_str())
+            .ok_or_else(|| {
+                DataFusionError::Execution("Invalid example group dir".to_string())
+            })?
+            .to_string();
+
+        let name = GroupName::from_dir_name(raw_name);
+        let main_rs = dir.join("main.rs");
+        let examples = parse_main_rs_docs(&main_rs)?;
+
+        Ok(Self {
+            name,
+            examples,
+            category,
+        })
+    }
+}
+
+/// Represents an example group name in both raw and human-readable forms.
+///
+/// For example:
+/// - raw: `builtin_functions`
+/// - title: `Builtin Functions`
+#[derive(Debug)]
+pub struct GroupName {
+    raw: String,
+    title: String,
+}
+
+impl GroupName {
+    /// Creates a group name from a directory name.
+    pub fn from_dir_name(raw: String) -> Self {
+        let title = raw
+            .split('_')
+            .map(format_part)
+            .collect::<Vec<_>>()
+            .join(" ");
+
+        Self { raw, title }
+    }
+
+    /// Returns the raw group name (directory name).
+    pub fn raw(&self) -> &str {
+        &self.raw
+    }
+
+    /// Returns a title-cased name for documentation.
+    pub fn title(&self) -> &str {
+        &self.title
+    }
+}
+
+/// A single runnable example within a group.
+///
+/// Each entry corresponds to a subcommand documented in `main.rs`.
+#[derive(Debug)]
+pub struct ExampleEntry {
+    /// CLI subcommand name.
+    pub subcommand: String,
+    /// Rust source file name.
+    pub file: String,
+    /// Human-readable description.
+    pub desc: String,
+}
+
+/// Execution category of an example group.
+#[derive(Debug, Default)]
+pub enum Category {
+    /// Runs in a single process.
+    #[default]
+    SingleProcess,
+    /// Requires a distributed setup.
+    Distributed,
+}
+
+impl Category {
+    /// Returns the display name used in documentation.
+    pub fn name(&self) -> &str {
+        match self {
+            Self::SingleProcess => "Single Process",
+            Self::Distributed => "Distributed",
+        }
+    }
+
+    /// Determines the category for a group by name.
+    pub fn for_group(name: &str) -> Self {
+        match name {
+            "flight" => Category::Distributed,
+            _ => Category::SingleProcess,
+        }
+    }
+}
+
+/// Formats a single group-name segment for display.
+///
+/// This function applies DataFusion-specific capitalization rules:
+/// - Known abbreviations (e.g. `sql`, `io`, `udf`) are rendered in all caps
+/// - All other segments fall back to standard Title Case
+fn format_part(part: &str) -> String {
+    let lower = part.to_ascii_lowercase();
+
+    if let Some((_, replacement)) = ABBREVIATIONS.iter().find(|(k, _)| *k == lower) {
+        return replacement.to_string();
+    }
+
+    let mut chars = part.chars();
+    match chars.next() {
+        Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
+        None => String::new(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::utils::example_metadata::test_utils::{
+        assert_exec_err_contains, example_group_from_docs,
+    };
+
+    use std::fs;
+
+    use tempfile::TempDir;
+
+    #[test]
+    fn category_for_group_works() {
+        assert!(matches!(
+            Category::for_group("flight"),
+            Category::Distributed
+        ));
+        assert!(matches!(
+            Category::for_group("anything_else"),
+            Category::SingleProcess
+        ));
+    }
+
+    #[test]
+    fn all_subcommand_is_ignored() -> Result<()> {
+        let group = example_group_from_docs(
+            r#"
+        //! - `all` — run all examples included in this module
+        //!
+        //! - `foo`
+        //!   (file: foo.rs, desc: foo example)
+        "#,
+        )?;
+        assert_eq!(group.examples.len(), 1);
+        assert_eq!(group.examples[0].subcommand, "foo");
+        Ok(())
+    }
+
+    #[test]
+    fn metadata_without_subcommand_fails() {
+        let err = example_group_from_docs("//! (file: foo.rs, desc: missing subcommand)")
+            .unwrap_err();
+        assert_exec_err_contains(err, "Metadata without preceding subcommand");
+    }
+
+    #[test]
+    fn group_name_handles_abbreviations() {
+        assert_eq!(
+            GroupName::from_dir_name("dataframe".to_string()).title(),
+            "DataFrame"
+        );
+        assert_eq!(
+            GroupName::from_dir_name("data_io".to_string()).title(),
+            "Data IO"
+        );
+        assert_eq!(
+            GroupName::from_dir_name("sql_ops".to_string()).title(),
+            "SQL Ops"
+        );
+        assert_eq!(GroupName::from_dir_name("udf".to_string()).title(), "UDF");
+    }
+
+    #[test]
+    fn group_name_title_cases() {
+        let cases = [
+            ("very_long_group_name", "Very Long Group Name"),
+            ("foo", "Foo"),
+            ("dataframe", "DataFrame"),
+            ("data_io", "Data IO"),
+            ("sql_ops", "SQL Ops"),
+            ("udf", "UDF"),
+        ];
+        for (input, expected) in cases {
+            let name = GroupName::from_dir_name(input.to_string());
+            assert_eq!(name.title(), expected);
+        }
+    }
+
+    #[test]
+    fn parse_group_example_works() -> Result<()> {
+        let tmp = TempDir::new().unwrap();
+
+        // Simulate: examples/builtin_functions/
+        let group_dir = tmp.path().join("builtin_functions");
+        fs::create_dir(&group_dir)?;
+
+        // Write a fake main.rs with docs
+        let main_rs = group_dir.join("main.rs");
+        fs::write(
+            &main_rs,
+            r#"
+    // Licensed to the Apache Software Foundation (ASF) under one
+    // or more contributor license agreements.  See the NOTICE file
+    // distributed with this work for additional information
+    // regarding copyright ownership.  The ASF licenses this file
+    // to you under the Apache License, Version 2.0 (the
+    // "License"); you may not use this file except in compliance
+    // with the License.  You may obtain a copy of the License at
+    //
+    //   http://www.apache.org/licenses/LICENSE-2.0
+    //
+    // Unless required by applicable law or agreed to in writing,
+    // software distributed under the License is distributed on an
+    // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    // KIND, either express or implied.  See the License for the
+    // specific language governing permissions and limitations
+    // under the License.
+    //
+    //! # These are miscellaneous function-related examples
+    //!
+    //! These examples demonstrate miscellaneous function-related features.
+    //!
+    //! ## Usage
+    //! ```bash
+    //! cargo run --example builtin_functions -- [all|date_time|function_factory|regexp]
+    //! ```
+    //!
+    //! Each subcommand runs a corresponding example:
+    //! - `all` — run all examples included in this module
+    //!
+    //! - `date_time`
+    //!   (file: date_time.rs, desc: Examples of date-time related functions and queries)
+    //!
+    //! - `function_factory`
+    //!   (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros)
+    //!
+    //! - `regexp`
+    //!   (file: regexp.rs, desc: Examples of using regular expression functions)
+    "#,
+        )?;
+
+        let group = ExampleGroup::from_dir(&group_dir, Category::SingleProcess)?;
+
+        // Assert group-level data
+        assert_eq!(group.name.title(), "Builtin Functions");
+        assert_eq!(group.examples.len(), 3);
+
+        // Assert 1 example
+        assert_eq!(group.examples[0].subcommand, "date_time");
+        assert_eq!(group.examples[0].file, "date_time.rs");
+        assert_eq!(
+            group.examples[0].desc,
+            "Examples of date-time related functions and queries"
+        );
+
+        // Assert 2 example
+        assert_eq!(group.examples[1].subcommand, "function_factory");
+        assert_eq!(group.examples[1].file, "function_factory.rs");
+        assert_eq!(
+            group.examples[1].desc,
+            "Register `CREATE FUNCTION` handler to implement SQL macros"
+        );
+
+        // Assert 3 example
+        assert_eq!(group.examples[2].subcommand, "regexp");
+        assert_eq!(group.examples[2].file, "regexp.rs");
+        assert_eq!(
+            group.examples[2].desc,
+            "Examples of using regular expression functions"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn duplicate_metadata_without_repeating_subcommand_fails() {
+        let err = example_group_from_docs(
+            r#"
+        //! - `foo`
+        //! (file: a.rs, desc: first)
+        //! (file: b.rs, desc: second)
+        "#,
+        )
+        .unwrap_err();
+        assert_exec_err_contains(err, "Metadata without preceding subcommand");
+    }
+
+    #[test]
+    fn duplicate_metadata_for_same_subcommand_fails() {
+        let err = example_group_from_docs(
+            r#"
+        //! - `foo`
+        //! (file: a.rs, desc: first)
+        //!
+        //! - `foo`
+        //! (file: b.rs, desc: second)
+        "#,
+        )
+        .unwrap_err();
+        assert_exec_err_contains(err, "Duplicate metadata for subcommand `foo`");
+    }
+
+    #[test]
+    fn metadata_must_follow_subcommand() {
+        let err = example_group_from_docs(
+            r#"
+        //! - `foo`
+        //! some unrelated comment
+        //! (file: foo.rs, desc: test)
+        "#,
+        )
+        .unwrap_err();
+        assert_exec_err_contains(err, "Metadata without preceding subcommand");
+    }
+
+    #[test]
+    fn preserves_example_order_from_main_rs() -> Result<()> {
+        let group = example_group_from_docs(
+            r#"
+        //! - `second`
+        //! (file: second.rs, desc: second example)
+        //!
+        //! - `first`
+        //! (file: first.rs, desc: first example)
+        //!
+        //! - `third`
+        //! (file: third.rs, desc: third example)
+        "#,
+        )?;
+
+        let subcommands: Vec<&str> = group
+            .examples
+            .iter()
+            .map(|e| e.subcommand.as_str())
+            .collect();
+
+        assert_eq!(
+            subcommands,
+            vec!["second", "first", "third"],
+            "examples must preserve the order defined in main.rs"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn metadata_can_follow_blank_doc_line() -> Result<()> {
+        let group = example_group_from_docs(
+            r#"
+        //! - `foo`
+        //!
+        //! (file: foo.rs, desc: test)
+        "#,
+        )?;
+        assert_eq!(group.examples.len(), 1);
+        Ok(())
+    }
+}
diff --git a/datafusion-examples/src/utils/example_metadata/parser.rs b/datafusion-examples/src/utils/example_metadata/parser.rs
new file mode 100644
index 0000000000000..4ead3e5a2ae9f
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/parser.rs
@@ -0,0 +1,267 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Parser for example metadata embedded in `main.rs` documentation comments.
+//!
+//! This module scans `//!` doc comments to extract example subcommands
+//! and their associated metadata (file name and description), enforcing
+//! a strict ordering and structure to avoid ambiguous documentation.
+
+use std::{collections::HashSet, fs, path::Path};
+
+use datafusion::common::exec_err;
+use datafusion::error::Result;
+use nom::{
+    Err, IResult, Parser,
+    bytes::complete::{tag, take_until, take_while},
+    character::complete::multispace0,
+    combinator::all_consuming,
+    error::{Error, ErrorKind},
+    sequence::{delimited, preceded},
+};
+
+use crate::utils::example_metadata::ExampleEntry;
+
+/// Parsing state machine used while scanning `main.rs` docs.
+///
+/// This makes the "subcommand - metadata" relationship explicit:
+/// metadata is only valid immediately after a subcommand has been seen.
+enum ParserState<'a> {
+    /// Not currently expecting metadata.
+    Idle,
+    /// A subcommand was just parsed; the next valid metadata (if any)
+    /// must belong to this subcommand.
+    SeenSubcommand(&'a str),
+}
+
+/// Parses a subcommand declaration line from `main.rs` docs.
+///
+/// Expected format:
+/// ```text
+/// //! - `<subcommand>`
+/// ```
+fn parse_subcommand_line(input: &str) -> IResult<&str, &str> {
+    let parser = preceded(
+        multispace0,
+        delimited(tag("//! - `"), take_until("`"), tag("`")),
+    );
+    all_consuming(parser).parse(input)
+}
+
+/// Parses example metadata (file name and description) from `main.rs` docs.
+///
+/// Expected format:
+/// ```text
+/// //! (file: <file>.rs, desc: <description>)
+/// ```
+fn parse_metadata_line(input: &str) -> IResult<&str, (&str, &str)> {
+    let parser = preceded(
+        multispace0,
+        preceded(tag("//!"), preceded(multispace0, take_while(|_| true))),
+    );
+    let (rest, payload) = all_consuming(parser).parse(input)?;
+
+    let content = payload
+        .strip_prefix("(")
+        .and_then(|s| s.strip_suffix(")"))
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?;
+
+    let (file, desc) = content
+        .strip_prefix("file:")
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?
+        .split_once(", desc:")
+        .ok_or_else(|| Err::Error(Error::new(payload, ErrorKind::Tag)))?;
+
+    Ok((rest, (file.trim(), desc.trim())))
+}
+
+/// Parses example entries from a group's `main.rs` file.
+pub fn parse_main_rs_docs(path: &Path) -> Result<Vec<ExampleEntry>> {
+    let content = fs::read_to_string(path)?;
+    let mut entries = vec![];
+    let mut state = ParserState::Idle;
+    let mut seen_subcommands = HashSet::new();
+
+    for (line_no, raw_line) in content.lines().enumerate() {
+        let line = raw_line.trim();
+
+        // Try parsing subcommand, excluding `all` because it's not used in README
+        if let Ok((_, sub)) = parse_subcommand_line(line) {
+            state = if sub == "all" {
+                ParserState::Idle
+            } else {
+                ParserState::SeenSubcommand(sub)
+            };
+            continue;
+        }
+
+        // Try parsing metadata
+        if let Ok((_, (file, desc))) = parse_metadata_line(line) {
+            let subcommand = match state {
+                ParserState::SeenSubcommand(s) => s,
+                ParserState::Idle => {
+                    return exec_err!(
+                        "Metadata without preceding subcommand at {}:{}",
+                        path.display(),
+                        line_no + 1
+                    );
+                }
+            };
+
+            if !seen_subcommands.insert(subcommand) {
+                return exec_err!("Duplicate metadata for subcommand `{subcommand}`");
+            }
+
+            entries.push(ExampleEntry {
+                subcommand: subcommand.to_string(),
+                file: file.to_string(),
+                desc: desc.to_string(),
+            });
+
+            state = ParserState::Idle;
+            continue;
+        }
+
+        // If a non-blank doc line interrupts a pending subcommand, reset the state
+        if let ParserState::SeenSubcommand(_) = state
+            && is_non_blank_doc_line(line)
+        {
+            state = ParserState::Idle;
+        }
+    }
+
+    Ok(entries)
+}
+
+/// Returns `true` for non-blank Rust doc comment lines (`//!`).
+///
+/// Used to detect when a subcommand is interrupted by unrelated documentation,
+/// so metadata is only accepted immediately after a subcommand (blank doc lines
+/// are allowed in between).
+fn is_non_blank_doc_line(line: &str) -> bool {
+    line.starts_with("//!") && !line.trim_start_matches("//!").trim().is_empty()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use tempfile::TempDir;
+
+    #[test]
+    fn parse_subcommand_line_accepts_valid_input() {
+        let line = "//! - `date_time`";
+        let sub = parse_subcommand_line(line);
+        assert_eq!(sub, Ok(("", "date_time")));
+    }
+
+    #[test]
+    fn parse_subcommand_line_invalid_inputs() {
+        let err_lines = [
+            "//! - ",
+            "//! - foo",
+            "//! - `foo` bar",
+            "//! --",
+            "//!-",
+            "//!--",
+            "//!",
+            "//",
+            "/",
+            "",
+        ];
+        for line in err_lines {
+            assert!(
+                parse_subcommand_line(line).is_err(),
+                "expected error for input: {line}"
+            );
+        }
+    }
+
+    #[test]
+    fn parse_metadata_line_accepts_valid_input() {
+        let line =
+            "//! (file: date_time.rs, desc: Examples of date-time related functions)";
+        let res = parse_metadata_line(line);
+        assert_eq!(
+            res,
+            Ok((
+                "",
+                ("date_time.rs", "Examples of date-time related functions")
+            ))
+        );
+
+        let line = "//! (file: foo.rs, desc: Foo, bar, baz)";
+        let res = parse_metadata_line(line);
+        assert_eq!(res, Ok(("", ("foo.rs", "Foo, bar, baz"))));
+
+        let line = "//! (file: foo.rs, desc: Foo(FOO))";
+        let res = parse_metadata_line(line);
+        assert_eq!(res, Ok(("", ("foo.rs", "Foo(FOO)"))));
+    }
+
+    #[test]
+    fn parse_metadata_line_invalid_inputs() {
+        let bad_lines = [
+            "//! (file: foo.rs)",
+            "//! (desc: missing file)",
+            "//! file: foo.rs, desc: test",
+            "//! file: foo.rs,desc: test",
+            "//! (file: foo.rs desc: test)",
+            "//! (file: foo.rs,desc: test)",
+            "//! (desc: test, file: foo.rs)",
+            "//! ()",
+            "//! (file: foo.rs, desc: test) extra",
+            "",
+        ];
+        for line in bad_lines {
+            assert!(
+                parse_metadata_line(line).is_err(),
+                "expected error for input: {line}"
+            );
+        }
+    }
+
+    #[test]
+    fn parse_main_rs_docs_extracts_entries() -> Result<()> {
+        let tmp = TempDir::new().unwrap();
+        let main_rs = tmp.path().join("main.rs");
+
+        fs::write(
+            &main_rs,
+            r#"
+        //! - `foo`
+        //! (file: foo.rs, desc: first example)
+        //!
+        //! - `bar`
+        //! (file: bar.rs, desc: second example)
+        "#,
+        )?;
+
+        let entries = parse_main_rs_docs(&main_rs)?;
+
+        assert_eq!(entries.len(), 2);
+
+        assert_eq!(entries[0].subcommand, "foo");
+        assert_eq!(entries[0].file, "foo.rs");
+        assert_eq!(entries[0].desc, "first example");
+
+        assert_eq!(entries[1].subcommand, "bar");
+        assert_eq!(entries[1].file, "bar.rs");
+        assert_eq!(entries[1].desc, "second example");
+        Ok(())
+    }
+}
diff --git a/datafusion-examples/src/utils/example_metadata/render.rs b/datafusion-examples/src/utils/example_metadata/render.rs
new file mode 100644
index 0000000000000..a4ea620e78352
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/render.rs
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Markdown renderer for DataFusion example documentation.
+//!
+//! This module takes parsed example metadata and generates the
+//! `README.md` content for `datafusion-examples`, including group
+//! sections and example tables.
+
+use std::path::PathBuf;
+
+use datafusion::error::{DataFusionError, Result};
+
+use crate::utils::example_metadata::discover::discover_example_groups;
+use crate::utils::example_metadata::model::ExampleGroup;
+use crate::utils::example_metadata::{Category, RepoLayout};
+
+const STATIC_HEADER: &str = r#"<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFusion Examples
+
+This crate includes end to end, highly commented examples of how to use
+various DataFusion APIs to help you get started.
+
+## Prerequisites
+
+Run `git submodule update --init` to init test files.
+
+## Running Examples
+
+To run an example, use the `cargo run` command, such as:
+
+```bash
+git clone https://github.com/apache/datafusion
+cd datafusion
+# Download test data
+git submodule update --init
+
+# Change to the examples directory
+cd datafusion-examples/examples
+
+# Run all examples in a group
+cargo run --example <group> -- all
+
+# Run a specific example within a group
+cargo run --example <group> -- <subcommand>
+
+# Run all examples in the `dataframe` group
+cargo run --example dataframe -- all
+
+# Run a single example from the `dataframe` group
+# (apply the same pattern for any other group)
+cargo run --example dataframe -- dataframe
+```
+"#;
+
+/// Generates Markdown documentation for DataFusion examples.
+///
+/// If `group` is `None`, documentation is generated for all example groups.
+/// If `group` is `Some`, only that group is rendered.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - the requested group does not exist
+/// - a `main.rs` file is missing
+/// - documentation comments are malformed
+pub fn generate_examples_readme(
+    layout: &RepoLayout,
+    group: Option<&str>,
+) -> Result<String> {
+    let examples_root = layout.examples_root();
+
+    let mut out = String::new();
+    out.push_str(STATIC_HEADER);
+
+    let group_dirs: Vec<PathBuf> = match group {
+        Some(name) => {
+            let dir = examples_root.join(name);
+            if !dir.is_dir() {
+                return Err(DataFusionError::Execution(format!(
+                    "Example group `{name}` does not exist"
+                )));
+            }
+            vec![dir]
+        }
+        None => discover_example_groups(&examples_root)?,
+    };
+
+    for group_dir in group_dirs {
+        let raw_name =
+            group_dir
+                .file_name()
+                .and_then(|s| s.to_str())
+                .ok_or_else(|| {
+                    DataFusionError::Execution("Invalid example group dir".to_string())
+                })?;
+
+        let category = Category::for_group(raw_name);
+        let group = ExampleGroup::from_dir(&group_dir, category)?;
+
+        out.push_str(&group.render_markdown());
+    }
+
+    Ok(out)
+}
+
+impl ExampleGroup {
+    /// Renders this example group as a Markdown section for the README.
+    pub fn render_markdown(&self) -> String {
+        let mut out = String::new();
+        out.push_str(&format!("\n## {} Examples\n\n", self.name.title()));
+        out.push_str(&format!("### Group: `{}`\n\n", self.name.raw()));
+        out.push_str(&format!("#### Category: {}\n\n", self.category.name()));
+        out.push_str("| Subcommand | File Path | Description |\n");
+        out.push_str("| --- | --- | --- |\n");
+
+        for example in &self.examples {
+            out.push_str(&format!(
+                "| {} | [`{}/{}`](examples/{}/{}) | {} |\n",
+                example.subcommand,
+                self.name.raw(),
+                example.file,
+                self.name.raw(),
+                example.file,
+                example.desc
+            ));
+        }
+
+        out
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::utils::example_metadata::test_utils::assert_exec_err_contains;
+
+    use std::fs;
+
+    use tempfile::TempDir;
+
+    #[test]
+    fn single_group_generation_works() {
+        let tmp = TempDir::new().unwrap();
+        // Fake repo root
+        let layout = RepoLayout::from_root(tmp.path().to_path_buf());
+
+        // Create: datafusion-examples/examples/builtin_functions
+        let examples_dir = layout.example_group_dir("builtin_functions");
+        fs::create_dir_all(&examples_dir).unwrap();
+
+        fs::write(
+            examples_dir.join("main.rs"),
+            "//! - `x`\n//! (file: foo.rs, desc: test)",
+        )
+        .unwrap();
+
+        let out = generate_examples_readme(&layout, Some("builtin_functions")).unwrap();
+        assert!(out.contains("Builtin Functions"));
+        assert!(out.contains("| x | [`builtin_functions/foo.rs`]"));
+    }
+
+    #[test]
+    fn single_group_generation_fails_if_group_missing() {
+        let tmp = TempDir::new().unwrap();
+        let layout = RepoLayout::from_root(tmp.path().to_path_buf());
+        let err = generate_examples_readme(&layout, Some("missing_group")).unwrap_err();
+        assert_exec_err_contains(err, "Example group `missing_group` does not exist");
+    }
+}
diff --git a/datafusion-examples/src/utils/example_metadata/test_utils.rs b/datafusion-examples/src/utils/example_metadata/test_utils.rs
new file mode 100644
index 0000000000000..d6ab3b06ba06d
--- /dev/null
+++ b/datafusion-examples/src/utils/example_metadata/test_utils.rs
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Test helpers for example metadata parsing and validation.
+//!
+//! This module provides small, focused utilities to reduce duplication
+//! and keep tests readable across the example metadata submodules.
+
+use std::fs;
+
+use datafusion::error::{DataFusionError, Result};
+use tempfile::TempDir;
+
+use crate::utils::example_metadata::{Category, ExampleGroup};
+
+/// Asserts that an `Execution` error contains the expected message fragment.
+///
+/// Keeps tests focused on semantic error causes without coupling them
+/// to full error string formatting.
+pub fn assert_exec_err_contains(err: DataFusionError, needle: &str) {
+    match err {
+        DataFusionError::Execution(msg) => {
+            assert!(
+                msg.contains(needle),
+                "expected '{needle}' in error message, got: {msg}"
+            );
+        }
+        other => panic!("expected Execution error, got: {other:?}"),
+    }
+}
+
+/// Helper for grammar-focused tests.
+///
+/// Creates a minimal temporary example group with a single `main.rs`
+/// containing the provided docs. Intended for testing parsing and
+/// validation rules, not full integration behavior.
+pub fn example_group_from_docs(docs: &str) -> Result<ExampleGroup> {
+    let tmp = TempDir::new().map_err(|e| {
+        DataFusionError::Execution(format!("Failed initializing temp dir: {e}"))
+    })?;
+    let dir = tmp.path().join("group");
+    fs::create_dir(&dir).map_err(|e| {
+        DataFusionError::Execution(format!("Failed creating temp dir: {e}"))
+    })?;
+    fs::write(dir.join("main.rs"), docs).map_err(|e| {
+        DataFusionError::Execution(format!("Failed writing to temp file: {e}"))
+    })?;
+    ExampleGroup::from_dir(&dir, Category::SingleProcess)
+}
diff --git a/datafusion-examples/src/utils/mod.rs b/datafusion-examples/src/utils/mod.rs
new file mode 100644
index 0000000000000..da96724a49cb3
--- /dev/null
+++ b/datafusion-examples/src/utils/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod csv_to_parquet;
+pub mod datasets;
+pub mod example_metadata;
+
+pub use csv_to_parquet::write_csv_to_parquet;
diff --git a/datafusion-testing b/datafusion-testing
index e9f9e22ccf091..eccb0e4a42634 160000
--- a/datafusion-testing
+++ b/datafusion-testing
@@ -1 +1 @@
-Subproject commit e9f9e22ccf09145a7368f80fd6a871f11e2b4481
+Subproject commit eccb0e4a426344ef3faf534cd60e02e9c3afd3ac
diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml
index b88461e7ebcbc..be1374b371485 100644
--- a/datafusion/catalog-listing/Cargo.toml
+++ b/datafusion/catalog-listing/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-catalog-listing"
 description = "datafusion-catalog-listing"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -39,19 +39,26 @@ datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-adapter = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
-datafusion-session = { workspace = true }
 futures = { workspace = true }
+itertools = { workspace = true }
 log = { workspace = true }
 object_store = { workspace = true }
-tokio = { workspace = true }
 
 [dev-dependencies]
+datafusion-datasource-parquet = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [lib]
 name = "datafusion_catalog_listing"
 path = "src/mod.rs"
+
+[package.metadata.cargo-machete]
+ignored = ["datafusion-datasource-parquet"]
diff --git a/datafusion/catalog-listing/README.md b/datafusion/catalog-listing/README.md
index b4760c413d60b..81a7c7b1da3ae 100644
--- a/datafusion/catalog-listing/README.md
+++ b/datafusion/catalog-listing/README.md
@@ -17,14 +17,20 @@
   under the License.
 -->
 
-# DataFusion catalog-listing
+# Apache DataFusion Catalog Listing
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion with [ListingTable], an implementation
 of [TableProvider] based on files in a directory (either locally or on remote
 object storage such as S3).
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
 [listingtable]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
 [tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/catalog-listing/src/config.rs b/datafusion/catalog-listing/src/config.rs
new file mode 100644
index 0000000000000..ca4d2abfcd737
--- /dev/null
+++ b/datafusion/catalog-listing/src/config.rs
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::options::ListingOptions;
+use arrow::datatypes::{DataType, Schema, SchemaRef};
+use datafusion_catalog::Session;
+use datafusion_common::{config_err, internal_err};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_datasource::file_compression_type::FileCompressionType;
+#[expect(deprecated)]
+use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory;
+use std::str::FromStr;
+use std::sync::Arc;
+
+/// Indicates the source of the schema for a [`crate::ListingTable`]
+// PartialEq required for assert_eq! in tests
+#[derive(Debug, Clone, Copy, PartialEq, Default)]
+pub enum SchemaSource {
+    /// Schema is not yet set (initial state)
+    #[default]
+    Unset,
+    /// Schema was inferred from first table_path
+    Inferred,
+    /// Schema was specified explicitly via with_schema
+    Specified,
+}
+
+/// Configuration for creating a [`crate::ListingTable`]
+///
+/// # Schema Evolution Support
+///
+/// This configuration supports schema evolution through the optional
+/// [`PhysicalExprAdapterFactory`]. You might want to override the default factory when you need:
+///
+/// - **Type coercion requirements**: When you need custom logic for converting between
+///   different Arrow data types (e.g., Int32 ↔ Int64, Utf8 ↔ LargeUtf8)
+/// - **Column mapping**: You need to map columns with a legacy name to a new name
+/// - **Custom handling of missing columns**: By default they are filled in with nulls, but you may e.g. want to fill them in with `0` or `""`.
+#[derive(Debug, Clone, Default)]
+pub struct ListingTableConfig {
+    /// Paths on the `ObjectStore` for creating [`crate::ListingTable`].
+    /// They should share the same schema and object store.
+    pub table_paths: Vec<ListingTableUrl>,
+    /// Optional `SchemaRef` for the to be created [`crate::ListingTable`].
+    ///
+    /// See details on [`ListingTableConfig::with_schema`]
+    pub file_schema: Option<SchemaRef>,
+    /// Optional [`ListingOptions`] for the to be created [`crate::ListingTable`].
+    ///
+    /// See details on [`ListingTableConfig::with_listing_options`]
+    pub options: Option<ListingOptions>,
+    /// Tracks the source of the schema information
+    pub(crate) schema_source: SchemaSource,
+    /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters
+    pub(crate) expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
+}
+
+impl ListingTableConfig {
+    /// Creates new [`ListingTableConfig`] for reading the specified URL
+    pub fn new(table_path: ListingTableUrl) -> Self {
+        Self {
+            table_paths: vec![table_path],
+            ..Default::default()
+        }
+    }
+
+    /// Creates new [`ListingTableConfig`] with multiple table paths.
+    ///
+    ///  See `ListingTableConfigExt::infer_options` for details on what happens with multiple paths
+    pub fn new_with_multi_paths(table_paths: Vec<ListingTableUrl>) -> Self {
+        Self {
+            table_paths,
+            ..Default::default()
+        }
+    }
+
+    /// Returns the source of the schema for this configuration
+    pub fn schema_source(&self) -> SchemaSource {
+        self.schema_source
+    }
+    /// Set the `schema` for the overall [`crate::ListingTable`]
+    ///
+    /// [`crate::ListingTable`] will automatically coerce, when possible, the schema
+    /// for individual files to match this schema.
+    ///
+    /// If a schema is not provided, it is inferred using
+    /// [`Self::infer_schema`].
+    ///
+    /// If the schema is provided, it must contain only the fields in the file
+    /// without the table partitioning columns.
+    ///
+    /// # Example: Specifying Table Schema
+    /// ```rust
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions};
+    /// # use datafusion_datasource::ListingTableUrl;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    /// # use arrow::datatypes::{Schema, Field, DataType};
+    /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap();
+    /// # let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()));
+    /// let schema = Arc::new(Schema::new(vec![
+    ///     Field::new("id", DataType::Int64, false),
+    ///     Field::new("name", DataType::Utf8, true),
+    /// ]));
+    ///
+    /// let config = ListingTableConfig::new(table_paths)
+    ///     .with_listing_options(listing_options)  // Set options first
+    ///     .with_schema(schema);                    // Then set schema
+    /// ```
+    pub fn with_schema(self, schema: SchemaRef) -> Self {
+        // Note: We preserve existing options state, but downstream code may expect
+        // options to be set. Consider calling with_listing_options() or infer_options()
+        // before operations that require options to be present.
+        debug_assert!(
+            self.options.is_some() || cfg!(test),
+            "ListingTableConfig::with_schema called without options set. \
+             Consider calling with_listing_options() or infer_options() first to avoid panics in downstream code."
+        );
+
+        Self {
+            file_schema: Some(schema),
+            schema_source: SchemaSource::Specified,
+            ..self
+        }
+    }
+
+    /// Add `listing_options` to [`ListingTableConfig`]
+    ///
+    /// If not provided, format and other options are inferred via
+    /// `ListingTableConfigExt::infer_options`.
+    ///
+    /// # Example: Configuring Parquet Files with Custom Options
+    /// ```rust
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::{ListingTableConfig, ListingOptions};
+    /// # use datafusion_datasource::ListingTableUrl;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    /// # let table_paths = ListingTableUrl::parse("file:///path/to/data").unwrap();
+    /// let options = ListingOptions::new(Arc::new(ParquetFormat::default()))
+    ///     .with_file_extension(".parquet")
+    ///     .with_collect_stat(true);
+    ///
+    /// let config = ListingTableConfig::new(table_paths).with_listing_options(options);
+    /// // Configure file format and options
+    /// ```
+    pub fn with_listing_options(self, listing_options: ListingOptions) -> Self {
+        // Note: This method properly sets options, but be aware that downstream
+        // methods like infer_schema() and try_new() require both schema and options
+        // to be set to function correctly.
+        debug_assert!(
+            !self.table_paths.is_empty() || cfg!(test),
+            "ListingTableConfig::with_listing_options called without table_paths set. \
+             Consider calling new() or new_with_multi_paths() first to establish table paths."
+        );
+
+        Self {
+            options: Some(listing_options),
+            ..self
+        }
+    }
+
+    /// Returns a tuple of `(file_extension, optional compression_extension)`
+    ///
+    /// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))`
+    /// For example a path ending with blah.test.csv returns `("csv", None)`
+    pub fn infer_file_extension_and_compression_type(
+        path: &str,
+    ) -> datafusion_common::Result<(String, Option<String>)> {
+        let mut exts = path.rsplit('.');
+
+        let split = exts.next().unwrap_or("");
+
+        let file_compression_type = FileCompressionType::from_str(split)
+            .unwrap_or(FileCompressionType::UNCOMPRESSED);
+
+        if file_compression_type.is_compressed() {
+            let split2 = exts.next().unwrap_or("");
+            Ok((split2.to_string(), Some(split.to_string())))
+        } else {
+            Ok((split.to_string(), None))
+        }
+    }
+
+    /// Infer the [`SchemaRef`] based on `table_path`s.
+    ///
+    /// This method infers the table schema using the first `table_path`.
+    /// See [`ListingOptions::infer_schema`] for more details
+    ///
+    /// # Errors
+    /// * if `self.options` is not set. See [`Self::with_listing_options`]
+    pub async fn infer_schema(
+        self,
+        state: &dyn Session,
+    ) -> datafusion_common::Result<Self> {
+        match self.options {
+            Some(options) => {
+                let ListingTableConfig {
+                    table_paths,
+                    file_schema,
+                    options: _,
+                    schema_source,
+                    expr_adapter_factory,
+                } = self;
+
+                let (schema, new_schema_source) = match file_schema {
+                    Some(schema) => (schema, schema_source), // Keep existing source if schema exists
+                    None => {
+                        if let Some(url) = table_paths.first() {
+                            (
+                                options.infer_schema(state, url).await?,
+                                SchemaSource::Inferred,
+                            )
+                        } else {
+                            (Arc::new(Schema::empty()), SchemaSource::Inferred)
+                        }
+                    }
+                };
+
+                Ok(Self {
+                    table_paths,
+                    file_schema: Some(schema),
+                    options: Some(options),
+                    schema_source: new_schema_source,
+                    expr_adapter_factory,
+                })
+            }
+            None => internal_err!("No `ListingOptions` set for inferring schema"),
+        }
+    }
+
+    /// Infer the partition columns from `table_paths`.
+    ///
+    /// # Errors
+    /// * if `self.options` is not set. See [`Self::with_listing_options`]
+    pub async fn infer_partitions_from_path(
+        self,
+        state: &dyn Session,
+    ) -> datafusion_common::Result<Self> {
+        match self.options {
+            Some(options) => {
+                let Some(url) = self.table_paths.first() else {
+                    return config_err!("No table path found");
+                };
+                let partitions = options
+                    .infer_partitions(state, url)
+                    .await?
+                    .into_iter()
+                    .map(|col_name| {
+                        (
+                            col_name,
+                            DataType::Dictionary(
+                                Box::new(DataType::UInt16),
+                                Box::new(DataType::Utf8),
+                            ),
+                        )
+                    })
+                    .collect::<Vec<_>>();
+                let options = options.with_table_partition_cols(partitions);
+                Ok(Self {
+                    table_paths: self.table_paths,
+                    file_schema: self.file_schema,
+                    options: Some(options),
+                    schema_source: self.schema_source,
+                    expr_adapter_factory: self.expr_adapter_factory,
+                })
+            }
+            None => config_err!("No `ListingOptions` set for inferring schema"),
+        }
+    }
+
+    /// Set the [`PhysicalExprAdapterFactory`] for the [`crate::ListingTable`]
+    ///
+    /// The expression adapter factory is used to create physical expression adapters that can
+    /// handle schema evolution and type conversions when evaluating expressions
+    /// with different schemas than the table schema.
+    pub fn with_expr_adapter_factory(
+        self,
+        expr_adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+    ) -> Self {
+        Self {
+            expr_adapter_factory: Some(expr_adapter_factory),
+            ..self
+        }
+    }
+
+    /// Deprecated: Set the [`SchemaAdapterFactory`] for the [`crate::ListingTable`]
+    ///
+    /// `SchemaAdapterFactory` has been removed. Use [`Self::with_expr_adapter_factory`]
+    /// and `PhysicalExprAdapterFactory` instead. See `upgrading.md` for more details.
+    ///
+    /// This method is a no-op and returns `self` unchanged.
+    #[deprecated(
+        since = "52.0.0",
+        note = "SchemaAdapterFactory has been removed. Use with_expr_adapter_factory and PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    #[expect(deprecated)]
+    pub fn with_schema_adapter_factory(
+        self,
+        _schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
+    ) -> Self {
+        // No-op - just return self unchanged
+        self
+    }
+}
diff --git a/datafusion/catalog-listing/src/helpers.rs b/datafusion/catalog-listing/src/helpers.rs
index 037c69cebd572..031b2ebfb8109 100644
--- a/datafusion/catalog-listing/src/helpers.rs
+++ b/datafusion/catalog-listing/src/helpers.rs
@@ -21,25 +21,23 @@ use std::mem;
 use std::sync::Arc;
 
 use datafusion_catalog::Session;
-use datafusion_common::internal_err;
-use datafusion_common::{HashMap, Result, ScalarValue};
+use datafusion_common::{HashMap, Result, ScalarValue, assert_or_internal_err};
 use datafusion_datasource::ListingTableUrl;
 use datafusion_datasource::PartitionedFile;
-use datafusion_expr::{BinaryExpr, Operator};
+use datafusion_expr::{BinaryExpr, Operator, lit, utils};
 
 use arrow::{
-    array::{Array, ArrayRef, AsArray, StringBuilder},
-    compute::{and, cast, prep_null_mask_filter},
-    datatypes::{DataType, Field, Fields, Schema},
+    array::AsArray,
+    datatypes::{DataType, Field},
     record_batch::RecordBatch,
 };
 use datafusion_expr::execution_props::ExecutionProps;
 use futures::stream::FuturesUnordered;
-use futures::{stream::BoxStream, StreamExt, TryStreamExt};
+use futures::{StreamExt, TryStreamExt, stream::BoxStream};
 use log::{debug, trace};
 
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
-use datafusion_common::{Column, DFSchema, DataFusionError};
+use datafusion_common::{Column, DFSchema};
 use datafusion_expr::{Expr, Volatility};
 use datafusion_physical_expr::create_physical_expr;
 use object_store::path::Path;
@@ -53,7 +51,7 @@ use object_store::{ObjectMeta, ObjectStore};
 pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
     let mut is_applicable = true;
     expr.apply(|expr| match expr {
-        Expr::Column(Column { ref name, .. }) => {
+        Expr::Column(Column { name, .. }) => {
             is_applicable &= col_names.contains(&name.as_str());
             if is_applicable {
                 Ok(TreeNodeRecursion::Jump)
@@ -61,7 +59,7 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
                 Ok(TreeNodeRecursion::Stop)
             }
         }
-        Expr::Literal(_)
+        Expr::Literal(_, _)
         | Expr::Alias(_)
         | Expr::OuterReferenceColumn(_, _)
         | Expr::ScalarVariable(_, _)
@@ -85,6 +83,7 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
         | Expr::Exists(_)
         | Expr::InSubquery(_)
         | Expr::ScalarSubquery(_)
+        | Expr::SetComparison(_)
         | Expr::GroupingSet(_)
         | Expr::Case(_) => Ok(TreeNodeRecursion::Continue),
 
@@ -156,6 +155,7 @@ pub fn split_files(
     chunks
 }
 
+#[derive(Debug)]
 pub struct Partition {
     /// The path to the partition, including the table prefix
     path: Path,
@@ -238,96 +238,6 @@ pub async fn list_partitions(
     Ok(out)
 }
 
-async fn prune_partitions(
-    table_path: &ListingTableUrl,
-    partitions: Vec<Partition>,
-    filters: &[Expr],
-    partition_cols: &[(String, DataType)],
-) -> Result<Vec<Partition>> {
-    if filters.is_empty() {
-        return Ok(partitions);
-    }
-
-    let mut builders: Vec<_> = (0..partition_cols.len())
-        .map(|_| StringBuilder::with_capacity(partitions.len(), partitions.len() * 10))
-        .collect();
-
-    for partition in &partitions {
-        let cols = partition_cols.iter().map(|x| x.0.as_str());
-        let parsed = parse_partitions_for_path(table_path, &partition.path, cols)
-            .unwrap_or_default();
-
-        let mut builders = builders.iter_mut();
-        for (p, b) in parsed.iter().zip(&mut builders) {
-            b.append_value(p);
-        }
-        builders.for_each(|b| b.append_null());
-    }
-
-    let arrays = partition_cols
-        .iter()
-        .zip(builders)
-        .map(|((_, d), mut builder)| {
-            let array = builder.finish();
-            cast(&array, d)
-        })
-        .collect::<Result<_, _>>()?;
-
-    let fields: Fields = partition_cols
-        .iter()
-        .map(|(n, d)| Field::new(n, d.clone(), true))
-        .collect();
-    let schema = Arc::new(Schema::new(fields));
-
-    let df_schema = DFSchema::from_unqualified_fields(
-        partition_cols
-            .iter()
-            .map(|(n, d)| Field::new(n, d.clone(), true))
-            .collect(),
-        Default::default(),
-    )?;
-
-    let batch = RecordBatch::try_new(schema, arrays)?;
-
-    // TODO: Plumb this down
-    let props = ExecutionProps::new();
-
-    // Applies `filter` to `batch` returning `None` on error
-    let do_filter = |filter| -> Result<ArrayRef> {
-        let expr = create_physical_expr(filter, &df_schema, &props)?;
-        expr.evaluate(&batch)?.into_array(partitions.len())
-    };
-
-    //.Compute the conjunction of the filters
-    let mask = filters
-        .iter()
-        .map(|f| do_filter(f).map(|a| a.as_boolean().clone()))
-        .reduce(|a, b| Ok(and(&a?, &b?)?));
-
-    let mask = match mask {
-        Some(Ok(mask)) => mask,
-        Some(Err(err)) => return Err(err),
-        None => return Ok(partitions),
-    };
-
-    // Don't retain partitions that evaluated to null
-    let prepared = match mask.null_count() {
-        0 => mask,
-        _ => prep_null_mask_filter(&mask),
-    };
-
-    // Sanity check
-    assert_eq!(prepared.len(), partitions.len());
-
-    let filtered = partitions
-        .into_iter()
-        .zip(prepared.values())
-        .filter_map(|(p, f)| f.then_some(p))
-        .collect();
-
-    Ok(filtered)
-}
-
 #[derive(Debug)]
 enum PartitionValue {
     Single(String),
@@ -338,16 +248,11 @@ fn populate_partition_values<'a>(
     partition_values: &mut HashMap<&'a str, PartitionValue>,
     filter: &'a Expr,
 ) {
-    if let Expr::BinaryExpr(BinaryExpr {
-        ref left,
-        op,
-        ref right,
-    }) = filter
-    {
+    if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = filter {
         match op {
             Operator::Eq => match (left.as_ref(), right.as_ref()) {
-                (Expr::Column(Column { ref name, .. }), Expr::Literal(val))
-                | (Expr::Literal(val), Expr::Column(Column { ref name, .. })) => {
+                (Expr::Column(Column { name, .. }), Expr::Literal(val, _))
+                | (Expr::Literal(val, _), Expr::Column(Column { name, .. })) => {
                     if partition_values
                         .insert(name, PartitionValue::Single(val.to_string()))
                         .is_some()
@@ -402,6 +307,62 @@ pub fn evaluate_partition_prefix<'a>(
     }
 }
 
+fn filter_partitions(
+    pf: PartitionedFile,
+    filters: &[Expr],
+    df_schema: &DFSchema,
+) -> Result<Option<PartitionedFile>> {
+    if pf.partition_values.is_empty() && !filters.is_empty() {
+        return Ok(None);
+    } else if filters.is_empty() {
+        return Ok(Some(pf));
+    }
+
+    let arrays = pf
+        .partition_values
+        .iter()
+        .map(|v| v.to_array())
+        .collect::<Result<_, _>>()?;
+
+    let batch = RecordBatch::try_new(Arc::clone(df_schema.inner()), arrays)?;
+
+    let filter = utils::conjunction(filters.iter().cloned()).unwrap_or_else(|| lit(true));
+    let props = ExecutionProps::new();
+    let expr = create_physical_expr(&filter, df_schema, &props)?;
+
+    // Since we're only operating on a single file, our batch and resulting "array" holds only one
+    // value indicating if the input file matches the provided filters
+    let matches = expr.evaluate(&batch)?.into_array(1)?;
+    if matches.as_boolean().value(0) {
+        return Ok(Some(pf));
+    }
+
+    Ok(None)
+}
+
+fn try_into_partitioned_file(
+    object_meta: ObjectMeta,
+    partition_cols: &[(String, DataType)],
+    table_path: &ListingTableUrl,
+) -> Result<PartitionedFile> {
+    let cols = partition_cols.iter().map(|(name, _)| name.as_str());
+    let parsed = parse_partitions_for_path(table_path, &object_meta.location, cols);
+
+    let partition_values = parsed
+        .into_iter()
+        .flatten()
+        .zip(partition_cols)
+        .map(|(parsed, (_, datatype))| {
+            ScalarValue::try_from_string(parsed.to_string(), datatype)
+        })
+        .collect::<Result<Vec<_>>>()?;
+
+    let mut pf: PartitionedFile = object_meta.into();
+    pf.partition_values = partition_values;
+
+    Ok(pf)
+}
+
 /// Discover the partitions on the given path and prune out files
 /// that belong to irrelevant partitions using `filters` expressions.
 /// `filters` should only contain expressions that can be evaluated
@@ -414,79 +375,46 @@ pub async fn pruned_partition_list<'a>(
     file_extension: &'a str,
     partition_cols: &'a [(String, DataType)],
 ) -> Result<BoxStream<'a, Result<PartitionedFile>>> {
-    // if no partition col => simply list all the files
-    if partition_cols.is_empty() {
-        if !filters.is_empty() {
-            return internal_err!(
-                "Got partition filters for unpartitioned table {}",
-                table_path
-            );
-        }
-        return Ok(Box::pin(
-            table_path
-                .list_all_files(ctx, store, file_extension)
-                .await?
-                .try_filter(|object_meta| futures::future::ready(object_meta.size > 0))
-                .map_ok(|object_meta| object_meta.into()),
-        ));
-    }
-
-    let partition_prefix = evaluate_partition_prefix(partition_cols, filters);
-    let partitions =
-        list_partitions(store, table_path, partition_cols.len(), partition_prefix)
-            .await?;
-    debug!("Listed {} partitions", partitions.len());
-
-    let pruned =
-        prune_partitions(table_path, partitions, filters, partition_cols).await?;
+    let prefix = if !partition_cols.is_empty() {
+        evaluate_partition_prefix(partition_cols, filters)
+    } else {
+        None
+    };
 
-    debug!("Pruning yielded {} partitions", pruned.len());
+    let objects = table_path
+        .list_prefixed_files(ctx, store, prefix, file_extension)
+        .await?
+        .try_filter(|object_meta| futures::future::ready(object_meta.size > 0));
 
-    let stream = futures::stream::iter(pruned)
-        .map(move |partition: Partition| async move {
-            let cols = partition_cols.iter().map(|x| x.0.as_str());
-            let parsed = parse_partitions_for_path(table_path, &partition.path, cols);
+    if partition_cols.is_empty() {
+        assert_or_internal_err!(
+            filters.is_empty(),
+            "Got partition filters for unpartitioned table {}",
+            table_path
+        );
 
-            let partition_values = parsed
-                .into_iter()
-                .flatten()
-                .zip(partition_cols)
-                .map(|(parsed, (_, datatype))| {
-                    ScalarValue::try_from_string(parsed.to_string(), datatype)
-                })
-                .collect::<Result<Vec<_>>>()?;
-
-            let files = match partition.files {
-                Some(files) => files,
-                None => {
-                    trace!("Recursively listing partition {}", partition.path);
-                    store.list(Some(&partition.path)).try_collect().await?
-                }
-            };
-            let files = files.into_iter().filter(move |o| {
-                let extension_match = o.location.as_ref().ends_with(file_extension);
-                // here need to scan subdirectories(`listing_table_ignore_subdirectory` = false)
-                let glob_match = table_path.contains(&o.location, false);
-                extension_match && glob_match
-            });
-
-            let stream = futures::stream::iter(files.map(move |object_meta| {
-                Ok(PartitionedFile {
-                    object_meta,
-                    partition_values: partition_values.clone(),
-                    range: None,
-                    statistics: None,
-                    extensions: None,
-                    metadata_size_hint: None,
-                })
-            }));
-
-            Ok::<_, DataFusionError>(stream)
-        })
-        .buffer_unordered(CONCURRENCY_LIMIT)
-        .try_flatten()
-        .boxed();
-    Ok(stream)
+        // if no partition col => simply list all the files
+        Ok(objects.map_ok(|object_meta| object_meta.into()).boxed())
+    } else {
+        let df_schema = DFSchema::from_unqualified_fields(
+            partition_cols
+                .iter()
+                .map(|(n, d)| Field::new(n, d.clone(), true))
+                .collect(),
+            Default::default(),
+        )?;
+
+        Ok(objects
+            .map_ok(|object_meta| {
+                try_into_partitioned_file(object_meta, partition_cols, table_path)
+            })
+            .try_filter_map(move |pf| {
+                futures::future::ready(
+                    pf.and_then(|pf| filter_partitions(pf, filters, &df_schema)),
+                )
+            })
+            .boxed())
+    }
 }
 
 /// Extract the partition values for the given `file_path` (in the given `table_path`)
@@ -502,12 +430,12 @@ where
     let subpath = table_path.strip_prefix(file_path)?;
 
     let mut part_values = vec![];
-    for (part, pn) in subpath.zip(table_partition_cols) {
+    for (part, expected_partition) in subpath.zip(table_partition_cols) {
         match part.split_once('=') {
-            Some((name, val)) if name == pn => part_values.push(val),
+            Some((name, val)) if name == expected_partition => part_values.push(val),
             _ => {
                 debug!(
-                    "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{pn}'",
+                    "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
                 );
                 return None;
             }
@@ -530,22 +458,11 @@ pub fn describe_partition(partition: &Partition) -> (&str, usize, Vec<&str>) {
 
 #[cfg(test)]
 mod tests {
-    use async_trait::async_trait;
-    use datafusion_common::config::TableOptions;
     use datafusion_datasource::file_groups::FileGroup;
-    use datafusion_execution::config::SessionConfig;
-    use datafusion_execution::runtime_env::RuntimeEnv;
-    use futures::FutureExt;
-    use object_store::memory::InMemory;
-    use std::any::Any;
     use std::ops::Not;
 
     use super::*;
-    use datafusion_expr::{
-        case, col, lit, AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF,
-    };
-    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-    use datafusion_physical_plan::ExecutionPlan;
+    use datafusion_expr::{Expr, case, col, lit};
 
     #[test]
     fn test_split_files() {
@@ -588,205 +505,6 @@ mod tests {
         assert_eq!(0, chunks.len());
     }
 
-    #[tokio::test]
-    async fn test_pruned_partition_list_empty() {
-        let (store, state) = make_test_store_and_state(&[
-            ("tablepath/mypartition=val1/notparquetfile", 100),
-            ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
-            ("tablepath/file.parquet", 100),
-        ]);
-        let filter = Expr::eq(col("mypartition"), lit("val1"));
-        let pruned = pruned_partition_list(
-            state.as_ref(),
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            &[filter],
-            ".parquet",
-            &[(String::from("mypartition"), DataType::Utf8)],
-        )
-        .await
-        .expect("partition pruning failed")
-        .collect::<Vec<_>>()
-        .await;
-
-        assert_eq!(pruned.len(), 0);
-    }
-
-    #[tokio::test]
-    async fn test_pruned_partition_list() {
-        let (store, state) = make_test_store_and_state(&[
-            ("tablepath/mypartition=val1/file.parquet", 100),
-            ("tablepath/mypartition=val2/file.parquet", 100),
-            ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
-            ("tablepath/mypartition=val1/other=val3/file.parquet", 100),
-        ]);
-        let filter = Expr::eq(col("mypartition"), lit("val1"));
-        let pruned = pruned_partition_list(
-            state.as_ref(),
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            &[filter],
-            ".parquet",
-            &[(String::from("mypartition"), DataType::Utf8)],
-        )
-        .await
-        .expect("partition pruning failed")
-        .try_collect::<Vec<_>>()
-        .await
-        .unwrap();
-
-        assert_eq!(pruned.len(), 2);
-        let f1 = &pruned[0];
-        assert_eq!(
-            f1.object_meta.location.as_ref(),
-            "tablepath/mypartition=val1/file.parquet"
-        );
-        assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]);
-        let f2 = &pruned[1];
-        assert_eq!(
-            f2.object_meta.location.as_ref(),
-            "tablepath/mypartition=val1/other=val3/file.parquet"
-        );
-        assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]);
-    }
-
-    #[tokio::test]
-    async fn test_pruned_partition_list_multi() {
-        let (store, state) = make_test_store_and_state(&[
-            ("tablepath/part1=p1v1/file.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100),
-            ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100),
-        ]);
-        let filter1 = Expr::eq(col("part1"), lit("p1v2"));
-        let filter2 = Expr::eq(col("part2"), lit("p2v1"));
-        let pruned = pruned_partition_list(
-            state.as_ref(),
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            &[filter1, filter2],
-            ".parquet",
-            &[
-                (String::from("part1"), DataType::Utf8),
-                (String::from("part2"), DataType::Utf8),
-            ],
-        )
-        .await
-        .expect("partition pruning failed")
-        .try_collect::<Vec<_>>()
-        .await
-        .unwrap();
-
-        assert_eq!(pruned.len(), 2);
-        let f1 = &pruned[0];
-        assert_eq!(
-            f1.object_meta.location.as_ref(),
-            "tablepath/part1=p1v2/part2=p2v1/file1.parquet"
-        );
-        assert_eq!(
-            &f1.partition_values,
-            &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),]
-        );
-        let f2 = &pruned[1];
-        assert_eq!(
-            f2.object_meta.location.as_ref(),
-            "tablepath/part1=p1v2/part2=p2v1/file2.parquet"
-        );
-        assert_eq!(
-            &f2.partition_values,
-            &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")]
-        );
-    }
-
-    #[tokio::test]
-    async fn test_list_partition() {
-        let (store, _) = make_test_store_and_state(&[
-            ("tablepath/part1=p1v1/file.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100),
-            ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100),
-            ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0),
-        ]);
-
-        let partitions = list_partitions(
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            0,
-            None,
-        )
-        .await
-        .expect("listing partitions failed");
-
-        assert_eq!(
-            &partitions
-                .iter()
-                .map(describe_partition)
-                .collect::<Vec<_>>(),
-            &vec![
-                ("tablepath", 0, vec![]),
-                ("tablepath/part1=p1v1", 1, vec![]),
-                ("tablepath/part1=p1v2", 1, vec![]),
-                ("tablepath/part1=p1v3", 1, vec![]),
-            ]
-        );
-
-        let partitions = list_partitions(
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            1,
-            None,
-        )
-        .await
-        .expect("listing partitions failed");
-
-        assert_eq!(
-            &partitions
-                .iter()
-                .map(describe_partition)
-                .collect::<Vec<_>>(),
-            &vec![
-                ("tablepath", 0, vec![]),
-                ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
-                ("tablepath/part1=p1v2", 1, vec![]),
-                ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]),
-                ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]),
-                ("tablepath/part1=p1v3", 1, vec![]),
-                ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]),
-            ]
-        );
-
-        let partitions = list_partitions(
-            store.as_ref(),
-            &ListingTableUrl::parse("file:///tablepath/").unwrap(),
-            2,
-            None,
-        )
-        .await
-        .expect("listing partitions failed");
-
-        assert_eq!(
-            &partitions
-                .iter()
-                .map(describe_partition)
-                .collect::<Vec<_>>(),
-            &vec![
-                ("tablepath", 0, vec![]),
-                ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
-                ("tablepath/part1=p1v2", 1, vec![]),
-                ("tablepath/part1=p1v3", 1, vec![]),
-                (
-                    "tablepath/part1=p1v2/part2=p2v1",
-                    2,
-                    vec!["file1.parquet", "file2.parquet"]
-                ),
-                ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]),
-                ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]),
-            ]
-        );
-    }
-
     #[test]
     fn test_parse_partitions_for_path() {
         assert_eq!(
@@ -984,7 +702,7 @@ mod tests {
         assert_eq!(
             evaluate_partition_prefix(
                 partitions,
-                &[col("a").eq(Expr::Literal(ScalarValue::Date32(Some(3))))],
+                &[col("a").eq(Expr::Literal(ScalarValue::Date32(Some(3)), None))],
             ),
             Some(Path::from("a=1970-01-04")),
         );
@@ -993,93 +711,12 @@ mod tests {
         assert_eq!(
             evaluate_partition_prefix(
                 partitions,
-                &[col("a").eq(Expr::Literal(ScalarValue::Date64(Some(
-                    4 * 24 * 60 * 60 * 1000
-                )))),],
+                &[col("a").eq(Expr::Literal(
+                    ScalarValue::Date64(Some(4 * 24 * 60 * 60 * 1000)),
+                    None
+                )),],
             ),
             Some(Path::from("a=1970-01-05")),
         );
     }
-
-    pub fn make_test_store_and_state(
-        files: &[(&str, u64)],
-    ) -> (Arc<InMemory>, Arc<dyn Session>) {
-        let memory = InMemory::new();
-
-        for (name, size) in files {
-            memory
-                .put(&Path::from(*name), vec![0; *size as usize].into())
-                .now_or_never()
-                .unwrap()
-                .unwrap();
-        }
-
-        (Arc::new(memory), Arc::new(MockSession {}))
-    }
-
-    struct MockSession {}
-
-    #[async_trait]
-    impl Session for MockSession {
-        fn session_id(&self) -> &str {
-            unimplemented!()
-        }
-
-        fn config(&self) -> &SessionConfig {
-            unimplemented!()
-        }
-
-        async fn create_physical_plan(
-            &self,
-            _logical_plan: &LogicalPlan,
-        ) -> Result<Arc<dyn ExecutionPlan>> {
-            unimplemented!()
-        }
-
-        fn create_physical_expr(
-            &self,
-            _expr: Expr,
-            _df_schema: &DFSchema,
-        ) -> Result<Arc<dyn PhysicalExpr>> {
-            unimplemented!()
-        }
-
-        fn scalar_functions(&self) -> &std::collections::HashMap<String, Arc<ScalarUDF>> {
-            unimplemented!()
-        }
-
-        fn aggregate_functions(
-            &self,
-        ) -> &std::collections::HashMap<String, Arc<AggregateUDF>> {
-            unimplemented!()
-        }
-
-        fn window_functions(&self) -> &std::collections::HashMap<String, Arc<WindowUDF>> {
-            unimplemented!()
-        }
-
-        fn runtime_env(&self) -> &Arc<RuntimeEnv> {
-            unimplemented!()
-        }
-
-        fn execution_props(&self) -> &ExecutionProps {
-            unimplemented!()
-        }
-
-        fn as_any(&self) -> &dyn Any {
-            unimplemented!()
-        }
-
-        fn table_options(&self) -> &TableOptions {
-            unimplemented!()
-        }
-
-        fn table_options_mut(&mut self) -> &mut TableOptions {
-            unimplemented!()
-        }
-
-        fn task_ctx(&self) -> Arc<datafusion_execution::TaskContext> {
-            unimplemented!()
-        }
-    }
 }
diff --git a/datafusion/catalog-listing/src/mod.rs b/datafusion/catalog-listing/src/mod.rs
index fb0a960f37b6a..9efb5aa96267e 100644
--- a/datafusion/catalog-listing/src/mod.rs
+++ b/datafusion/catalog-listing/src/mod.rs
@@ -15,13 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
 
+mod config;
 pub mod helpers;
+mod options;
+mod table;
+
+pub use config::{ListingTableConfig, SchemaSource};
+pub use options::ListingOptions;
+pub use table::{ListFilesResult, ListingTable};
diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs
new file mode 100644
index 0000000000000..146f98d62335e
--- /dev/null
+++ b/datafusion/catalog-listing/src/options.rs
@@ -0,0 +1,399 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, SchemaRef};
+use datafusion_catalog::Session;
+use datafusion_common::plan_err;
+use datafusion_datasource::ListingTableUrl;
+use datafusion_datasource::file_format::FileFormat;
+use datafusion_execution::config::SessionConfig;
+use datafusion_expr::SortExpr;
+use futures::StreamExt;
+use futures::{TryStreamExt, future};
+use itertools::Itertools;
+use std::sync::Arc;
+
+/// Options for creating a [`crate::ListingTable`]
+#[derive(Clone, Debug)]
+pub struct ListingOptions {
+    /// A suffix on which files should be filtered (leave empty to
+    /// keep all files on the path)
+    pub file_extension: String,
+    /// The file format
+    pub format: Arc<dyn FileFormat>,
+    /// The expected partition column names in the folder structure.
+    /// See [Self::with_table_partition_cols] for details
+    pub table_partition_cols: Vec<(String, DataType)>,
+    /// Set true to try to guess statistics from the files.
+    /// This can add a lot of overhead as it will usually require files
+    /// to be opened and at least partially parsed.
+    pub collect_stat: bool,
+    /// Group files to avoid that the number of partitions exceeds
+    /// this limit
+    pub target_partitions: usize,
+    /// Optional pre-known sort order(s). Must be `SortExpr`s.
+    ///
+    /// DataFusion may take advantage of this ordering to omit sorts
+    /// or use more efficient algorithms. Currently sortedness must be
+    /// provided if it is known by some external mechanism, but may in
+    /// the future be automatically determined, for example using
+    /// parquet metadata.
+    ///
+    /// See <https://github.com/apache/datafusion/issues/4177>
+    ///
+    /// NOTE: This attribute stores all equivalent orderings (the outer `Vec`)
+    ///       where each ordering consists of an individual lexicographic
+    ///       ordering (encapsulated by a `Vec<Expr>`). If there aren't
+    ///       multiple equivalent orderings, the outer `Vec` will have a
+    ///       single element.
+    pub file_sort_order: Vec<Vec<SortExpr>>,
+}
+
+impl ListingOptions {
+    /// Creates an options instance with the given format
+    /// Default values:
+    /// - use default file extension filter
+    /// - no input partition to discover
+    /// - one target partition
+    /// - do not collect statistics
+    pub fn new(format: Arc<dyn FileFormat>) -> Self {
+        Self {
+            file_extension: format.get_ext(),
+            format,
+            table_partition_cols: vec![],
+            collect_stat: false,
+            target_partitions: 1,
+            file_sort_order: vec![],
+        }
+    }
+
+    /// Set options from [`SessionConfig`] and returns self.
+    ///
+    /// Currently this sets `target_partitions` and `collect_stat`
+    /// but if more options are added in the future that need to be coordinated
+    /// they will be synchronized through this method.
+    pub fn with_session_config_options(mut self, config: &SessionConfig) -> Self {
+        self = self.with_target_partitions(config.target_partitions());
+        self = self.with_collect_stat(config.collect_statistics());
+        self
+    }
+
+    /// Set file extension on [`ListingOptions`] and returns self.
+    ///
+    /// # Example
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
+    ///     .with_file_extension(".parquet");
+    ///
+    /// assert_eq!(listing_options.file_extension, ".parquet");
+    /// ```
+    pub fn with_file_extension(mut self, file_extension: impl Into<String>) -> Self {
+        self.file_extension = file_extension.into();
+        self
+    }
+
+    /// Optionally set file extension on [`ListingOptions`] and returns self.
+    ///
+    /// If `file_extension` is `None`, the file extension will not be changed
+    ///
+    /// # Example
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// let extension = Some(".parquet");
+    /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
+    ///     .with_file_extension_opt(extension);
+    ///
+    /// assert_eq!(listing_options.file_extension, ".parquet");
+    /// ```
+    pub fn with_file_extension_opt<S>(mut self, file_extension: Option<S>) -> Self
+    where
+        S: Into<String>,
+    {
+        if let Some(file_extension) = file_extension {
+            self.file_extension = file_extension.into();
+        }
+        self
+    }
+
+    /// Set `table partition columns` on [`ListingOptions`] and returns self.
+    ///
+    /// "partition columns," used to support [Hive Partitioning], are
+    /// columns added to the data that is read, based on the folder
+    /// structure where the data resides.
+    ///
+    /// For example, give the following files in your filesystem:
+    ///
+    /// ```text
+    /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
+    /// ```
+    ///
+    /// A [`crate::ListingTable`] created at `/mnt/nyctaxi/` with partition
+    /// columns "year" and "month" will include new `year` and `month`
+    /// columns while reading the files. The `year` column would have
+    /// value `2022` and the `month` column would have value `01` for
+    /// the rows read from
+    /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
+    ///
+    ///# Notes
+    ///
+    /// - If only one level (e.g. `year` in the example above) is
+    ///   specified, the other levels are ignored but the files are
+    ///   still read.
+    ///
+    /// - Files that don't follow this partitioning scheme will be
+    ///   ignored.
+    ///
+    /// - Since the columns have the same value for all rows read from
+    ///   each individual file (such as dates), they are typically
+    ///   dictionary encoded for efficiency. You may use
+    ///   [`wrap_partition_type_in_dict`] to request a
+    ///   dictionary-encoded type.
+    ///
+    /// - The partition columns are solely extracted from the file path. Especially they are NOT part of the parquet files itself.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::DataType;
+    /// # use datafusion_expr::col;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// // listing options for files with paths such as  `/mnt/data/col_a=x/col_b=y/data.parquet`
+    /// // `col_a` and `col_b` will be included in the data read from those files
+    /// let listing_options = ListingOptions::new(Arc::new(
+    ///     ParquetFormat::default()
+    ///   ))
+    ///   .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8),
+    ///       ("col_b".to_string(), DataType::Utf8)]);
+    ///
+    /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
+    ///     ("col_b".to_string(), DataType::Utf8)]);
+    /// ```
+    ///
+    /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
+    /// [`wrap_partition_type_in_dict`]: datafusion_datasource::file_scan_config::wrap_partition_type_in_dict
+    pub fn with_table_partition_cols(
+        mut self,
+        table_partition_cols: Vec<(String, DataType)>,
+    ) -> Self {
+        self.table_partition_cols = table_partition_cols;
+        self
+    }
+
+    /// Set stat collection on [`ListingOptions`] and returns self.
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// let listing_options =
+    ///     ListingOptions::new(Arc::new(ParquetFormat::default())).with_collect_stat(true);
+    ///
+    /// assert_eq!(listing_options.collect_stat, true);
+    /// ```
+    pub fn with_collect_stat(mut self, collect_stat: bool) -> Self {
+        self.collect_stat = collect_stat;
+        self
+    }
+
+    /// Set number of target partitions on [`ListingOptions`] and returns self.
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// let listing_options =
+    ///     ListingOptions::new(Arc::new(ParquetFormat::default())).with_target_partitions(8);
+    ///
+    /// assert_eq!(listing_options.target_partitions, 8);
+    /// ```
+    pub fn with_target_partitions(mut self, target_partitions: usize) -> Self {
+        self.target_partitions = target_partitions;
+        self
+    }
+
+    /// Set file sort order on [`ListingOptions`] and returns self.
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_expr::col;
+    /// # use datafusion_catalog_listing::ListingOptions;
+    /// # use datafusion_datasource_parquet::file_format::ParquetFormat;
+    ///
+    /// // Tell datafusion that the files are sorted by column "a"
+    /// let file_sort_order = vec![vec![col("a").sort(true, true)]];
+    ///
+    /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
+    ///     .with_file_sort_order(file_sort_order.clone());
+    ///
+    /// assert_eq!(listing_options.file_sort_order, file_sort_order);
+    /// ```
+    pub fn with_file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
+        self.file_sort_order = file_sort_order;
+        self
+    }
+
+    /// Infer the schema of the files at the given path on the provided object store.
+    ///
+    /// If the table_path contains one or more files (i.e. it is a directory /
+    /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`]
+    ///
+    /// Note: The inferred schema does not include any partitioning columns.
+    ///
+    /// This method is called as part of creating a [`crate::ListingTable`].
+    pub async fn infer_schema<'a>(
+        &'a self,
+        state: &dyn Session,
+        table_path: &'a ListingTableUrl,
+    ) -> datafusion_common::Result<SchemaRef> {
+        let store = state.runtime_env().object_store(table_path)?;
+
+        let files: Vec<_> = table_path
+            .list_all_files(state, store.as_ref(), &self.file_extension)
+            .await?
+            // Empty files cannot affect schema but may throw when trying to read for it
+            .try_filter(|object_meta| future::ready(object_meta.size > 0))
+            .try_collect()
+            .await?;
+
+        let schema = self.format.infer_schema(state, &store, &files).await?;
+
+        Ok(schema)
+    }
+
+    /// Infers the partition columns stored in `LOCATION` and compares
+    /// them with the columns provided in `PARTITIONED BY` to help prevent
+    /// accidental corrupts of partitioned tables.
+    ///
+    /// Allows specifying partial partitions.
+    pub async fn validate_partitions(
+        &self,
+        state: &dyn Session,
+        table_path: &ListingTableUrl,
+    ) -> datafusion_common::Result<()> {
+        if self.table_partition_cols.is_empty() {
+            return Ok(());
+        }
+
+        if !table_path.is_collection() {
+            return plan_err!(
+                "Can't create a partitioned table backed by a single file, \
+                perhaps the URL is missing a trailing slash?"
+            );
+        }
+
+        let inferred = self.infer_partitions(state, table_path).await?;
+
+        // no partitioned files found on disk
+        if inferred.is_empty() {
+            return Ok(());
+        }
+
+        let table_partition_names = self
+            .table_partition_cols
+            .iter()
+            .map(|(col_name, _)| col_name.clone())
+            .collect_vec();
+
+        if inferred.len() < table_partition_names.len() {
+            return plan_err!(
+                "Inferred partitions to be {:?}, but got {:?}",
+                inferred,
+                table_partition_names
+            );
+        }
+
+        // match prefix to allow creating tables with partial partitions
+        for (idx, col) in table_partition_names.iter().enumerate() {
+            if &inferred[idx] != col {
+                return plan_err!(
+                    "Inferred partitions to be {:?}, but got {:?}",
+                    inferred,
+                    table_partition_names
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Infer the partitioning at the given path on the provided object store.
+    /// For performance reasons, it doesn't read all the files on disk
+    /// and therefore may fail to detect invalid partitioning.
+    pub async fn infer_partitions(
+        &self,
+        state: &dyn Session,
+        table_path: &ListingTableUrl,
+    ) -> datafusion_common::Result<Vec<String>> {
+        let store = state.runtime_env().object_store(table_path)?;
+
+        // only use 10 files for inference
+        // This can fail to detect inconsistent partition keys
+        // A DFS traversal approach of the store can help here
+        let files: Vec<_> = table_path
+            .list_all_files(state, store.as_ref(), &self.file_extension)
+            .await?
+            .take(10)
+            .try_collect()
+            .await?;
+
+        let stripped_path_parts = files.iter().map(|file| {
+            table_path
+                .strip_prefix(&file.location)
+                .unwrap()
+                .collect_vec()
+        });
+
+        let partition_keys = stripped_path_parts
+            .map(|path_parts| {
+                path_parts
+                    .into_iter()
+                    .rev()
+                    .skip(1) // get parents only; skip the file itself
+                    .rev()
+                    // Partitions are expected to follow the format "column_name=value", so we
+                    // should ignore any path part that cannot be parsed into the expected format
+                    .filter(|s| s.contains('='))
+                    .map(|s| s.split('=').take(1).collect())
+                    .collect_vec()
+            })
+            .collect_vec();
+
+        match partition_keys.into_iter().all_equal_value() {
+            Ok(v) => Ok(v),
+            Err(None) => Ok(vec![]),
+            Err(Some(diff)) => {
+                let mut sorted_diff = [diff.0, diff.1];
+                sorted_diff.sort();
+                plan_err!("Found mixed partition values on disk {:?}", sorted_diff)
+            }
+        }
+    }
+}
diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs
new file mode 100644
index 0000000000000..a5de79b052a4e
--- /dev/null
+++ b/datafusion/catalog-listing/src/table.rs
@@ -0,0 +1,1058 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::config::SchemaSource;
+use crate::helpers::{expr_applicable_for_cols, pruned_partition_list};
+use crate::{ListingOptions, ListingTableConfig};
+use arrow::datatypes::{Field, Schema, SchemaBuilder, SchemaRef};
+use async_trait::async_trait;
+use datafusion_catalog::{ScanArgs, ScanResult, Session, TableProvider};
+use datafusion_common::stats::Precision;
+use datafusion_common::{
+    Constraints, SchemaExt, Statistics, internal_datafusion_err, plan_err, project_schema,
+};
+use datafusion_datasource::file::FileSource;
+use datafusion_datasource::file_groups::FileGroup;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::file_sink_config::{FileOutputMode, FileSinkConfig};
+#[expect(deprecated)]
+use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use datafusion_datasource::{
+    ListingTableUrl, PartitionedFile, TableSchema, compute_all_files_statistics,
+};
+use datafusion_execution::cache::TableScopedPath;
+use datafusion_execution::cache::cache_manager::FileStatisticsCache;
+use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
+use datafusion_expr::dml::InsertOp;
+use datafusion_expr::execution_props::ExecutionProps;
+use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
+use datafusion_physical_expr::create_lex_ordering;
+use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::empty::EmptyExec;
+use futures::{Stream, StreamExt, TryStreamExt, future, stream};
+use object_store::ObjectStore;
+use std::any::Any;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Result of a file listing operation from [`ListingTable::list_files_for_scan`].
+#[derive(Debug)]
+pub struct ListFilesResult {
+    /// File groups organized by the partitioning strategy.
+    pub file_groups: Vec<FileGroup>,
+    /// Aggregated statistics for all files.
+    pub statistics: Statistics,
+    /// Whether files are grouped by partition values (enables Hash partitioning).
+    pub grouped_by_partition: bool,
+}
+
+/// Built in [`TableProvider`] that reads data from one or more files as a single table.
+///
+/// The files are read using an  [`ObjectStore`] instance, for example from
+/// local files or objects from AWS S3.
+///
+/// # Features:
+/// * Reading multiple files as a single table
+/// * Hive style partitioning (e.g., directories named `date=2024-06-01`)
+/// * Merges schemas from files with compatible but not identical schemas (see [`ListingTableConfig::file_schema`])
+/// * `limit`, `filter` and `projection` pushdown for formats that support it (e.g.,
+///   Parquet)
+/// * Statistics collection and pruning based on file metadata
+/// * Pre-existing sort order (see [`ListingOptions::file_sort_order`])
+/// * Metadata caching to speed up repeated queries (see [`FileMetadataCache`])
+/// * Statistics caching (see [`FileStatisticsCache`])
+///
+/// [`FileMetadataCache`]: datafusion_execution::cache::cache_manager::FileMetadataCache
+///
+/// # Reading Directories and Hive Style Partitioning
+///
+/// For example, given the `table1` directory (or object store prefix)
+///
+/// ```text
+/// table1
+///  ├── file1.parquet
+///  └── file2.parquet
+/// ```
+///
+/// A `ListingTable` would read the files `file1.parquet` and `file2.parquet` as
+/// a single table, merging the schemas if the files have compatible but not
+/// identical schemas.
+///
+/// Given the `table2` directory (or object store prefix)
+///
+/// ```text
+/// table2
+///  ├── date=2024-06-01
+///  │    ├── file3.parquet
+///  │    └── file4.parquet
+///  └── date=2024-06-02
+///       └── file5.parquet
+/// ```
+///
+/// A `ListingTable` would read the files `file3.parquet`, `file4.parquet`, and
+/// `file5.parquet` as a single table, again merging schemas if necessary.
+///
+/// Given the hive style partitioning structure (e.g,. directories named
+/// `date=2024-06-01` and `date=2026-06-02`), `ListingTable` also adds a `date`
+/// column when reading the table:
+/// * The files in `table2/date=2024-06-01` will have the value `2024-06-01`
+/// * The files in `table2/date=2024-06-02` will have the value `2024-06-02`.
+///
+/// If the query has a predicate like `WHERE date = '2024-06-01'`
+/// only the corresponding directory will be read.
+///
+/// # See Also
+///
+/// 1. [`ListingTableConfig`]: Configuration options
+/// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable`
+///
+/// [`DataSourceExec`]: datafusion_datasource::source::DataSourceExec
+///
+/// # Caching Metadata
+///
+/// Some formats, such as Parquet, use the `FileMetadataCache` to cache file
+/// metadata that is needed to execute but expensive to read, such as row
+/// groups and statistics. The cache is scoped to the `SessionContext` and can
+/// be configured via the [runtime config options].
+///
+/// [runtime config options]: https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings
+///
+/// # Example: Read a directory of parquet files using a [`ListingTable`]
+///
+/// ```no_run
+/// # use datafusion_common::Result;
+/// # use std::sync::Arc;
+/// # use datafusion_catalog::TableProvider;
+/// # use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig};
+/// # use datafusion_datasource::ListingTableUrl;
+/// # use datafusion_datasource_parquet::file_format::ParquetFormat;/// #
+/// # use datafusion_catalog::Session;
+/// async fn get_listing_table(session: &dyn Session) -> Result<Arc<dyn TableProvider>> {
+/// let table_path = "/path/to/parquet";
+///
+/// // Parse the path
+/// let table_path = ListingTableUrl::parse(table_path)?;
+///
+/// // Create default parquet options
+/// let file_format = ParquetFormat::new();
+/// let listing_options = ListingOptions::new(Arc::new(file_format))
+///   .with_file_extension(".parquet");
+///
+/// // Resolve the schema
+/// let resolved_schema = listing_options
+///    .infer_schema(session, &table_path)
+///    .await?;
+///
+/// let config = ListingTableConfig::new(table_path)
+///   .with_listing_options(listing_options)
+///   .with_schema(resolved_schema);
+///
+/// // Create a new TableProvider
+/// let provider = Arc::new(ListingTable::try_new(config)?);
+///
+/// # Ok(provider)
+/// # }
+/// ```
+#[derive(Debug, Clone)]
+pub struct ListingTable {
+    table_paths: Vec<ListingTableUrl>,
+    /// `file_schema` contains only the columns physically stored in the data files themselves.
+    ///     - Represents the actual fields found in files like Parquet, CSV, etc.
+    ///     - Used when reading the raw data from files
+    file_schema: SchemaRef,
+    /// `table_schema` combines `file_schema` + partition columns
+    ///     - Partition columns are derived from directory paths (not stored in files)
+    ///     - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet`
+    table_schema: SchemaRef,
+    /// Indicates how the schema was derived (inferred or explicitly specified)
+    schema_source: SchemaSource,
+    /// Options used to configure the listing table such as the file format
+    /// and partitioning information
+    options: ListingOptions,
+    /// The SQL definition for this table, if any
+    definition: Option<String>,
+    /// Cache for collected file statistics
+    collected_statistics: Arc<dyn FileStatisticsCache>,
+    /// Constraints applied to this table
+    constraints: Constraints,
+    /// Column default expressions for columns that are not physically present in the data files
+    column_defaults: HashMap<String, Expr>,
+    /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters
+    expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
+}
+
+impl ListingTable {
+    /// Create new [`ListingTable`]
+    ///
+    /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
+    pub fn try_new(config: ListingTableConfig) -> datafusion_common::Result<Self> {
+        // Extract schema_source before moving other parts of the config
+        let schema_source = config.schema_source();
+
+        let file_schema = config
+            .file_schema
+            .ok_or_else(|| internal_datafusion_err!("No schema provided."))?;
+
+        let options = config
+            .options
+            .ok_or_else(|| internal_datafusion_err!("No ListingOptions provided"))?;
+
+        // Add the partition columns to the file schema
+        let mut builder = SchemaBuilder::from(file_schema.as_ref().to_owned());
+        for (part_col_name, part_col_type) in &options.table_partition_cols {
+            builder.push(Field::new(part_col_name, part_col_type.clone(), false));
+        }
+
+        let table_schema = Arc::new(
+            builder
+                .finish()
+                .with_metadata(file_schema.metadata().clone()),
+        );
+
+        let table = Self {
+            table_paths: config.table_paths,
+            file_schema,
+            table_schema,
+            schema_source,
+            options,
+            definition: None,
+            collected_statistics: Arc::new(DefaultFileStatisticsCache::default()),
+            constraints: Constraints::default(),
+            column_defaults: HashMap::new(),
+            expr_adapter_factory: config.expr_adapter_factory,
+        };
+
+        Ok(table)
+    }
+
+    /// Assign constraints
+    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+        self.constraints = constraints;
+        self
+    }
+
+    /// Assign column defaults
+    pub fn with_column_defaults(
+        mut self,
+        column_defaults: HashMap<String, Expr>,
+    ) -> Self {
+        self.column_defaults = column_defaults;
+        self
+    }
+
+    /// Set the [`FileStatisticsCache`] used to cache parquet file statistics.
+    ///
+    /// Setting a statistics cache on the `SessionContext` can avoid refetching statistics
+    /// multiple times in the same session.
+    ///
+    /// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query.
+    pub fn with_cache(mut self, cache: Option<Arc<dyn FileStatisticsCache>>) -> Self {
+        self.collected_statistics =
+            cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default()));
+        self
+    }
+
+    /// Specify the SQL definition for this table, if any
+    pub fn with_definition(mut self, definition: Option<String>) -> Self {
+        self.definition = definition;
+        self
+    }
+
+    /// Get paths ref
+    pub fn table_paths(&self) -> &Vec<ListingTableUrl> {
+        &self.table_paths
+    }
+
+    /// Get options ref
+    pub fn options(&self) -> &ListingOptions {
+        &self.options
+    }
+
+    /// Get the schema source
+    pub fn schema_source(&self) -> SchemaSource {
+        self.schema_source
+    }
+
+    /// Deprecated: Set the [`SchemaAdapterFactory`] for this [`ListingTable`]
+    ///
+    /// `SchemaAdapterFactory` has been removed. Use [`ListingTableConfig::with_expr_adapter_factory`]
+    /// and `PhysicalExprAdapterFactory` instead. See `upgrading.md` for more details.
+    ///
+    /// This method is a no-op and returns `self` unchanged.
+    #[deprecated(
+        since = "52.0.0",
+        note = "SchemaAdapterFactory has been removed. Use ListingTableConfig::with_expr_adapter_factory and PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    #[expect(deprecated)]
+    pub fn with_schema_adapter_factory(
+        self,
+        _schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
+    ) -> Self {
+        // No-op - just return self unchanged
+        self
+    }
+
+    /// Deprecated: Returns the [`SchemaAdapterFactory`] used by this [`ListingTable`].
+    ///
+    /// `SchemaAdapterFactory` has been removed. Use `PhysicalExprAdapterFactory` instead.
+    /// See `upgrading.md` for more details.
+    ///
+    /// Always returns `None`.
+    #[deprecated(
+        since = "52.0.0",
+        note = "SchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    #[expect(deprecated)]
+    pub fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
+        None
+    }
+
+    /// Creates a file source for this table
+    fn create_file_source(&self) -> Arc<dyn FileSource> {
+        let table_schema = TableSchema::new(
+            Arc::clone(&self.file_schema),
+            self.options
+                .table_partition_cols
+                .iter()
+                .map(|(col, field)| Arc::new(Field::new(col, field.clone(), false)))
+                .collect(),
+        );
+
+        self.options.format.file_source(table_schema)
+    }
+
+    /// Creates output ordering from user-specified file_sort_order or derives
+    /// from file orderings when user doesn't specify.
+    ///
+    /// If user specified `file_sort_order`, that takes precedence.
+    /// Otherwise, attempts to derive common ordering from file orderings in
+    /// the provided file groups.
+    pub fn try_create_output_ordering(
+        &self,
+        execution_props: &ExecutionProps,
+        file_groups: &[FileGroup],
+    ) -> datafusion_common::Result<Vec<LexOrdering>> {
+        // If user specified sort order, use that
+        if !self.options.file_sort_order.is_empty() {
+            return create_lex_ordering(
+                &self.table_schema,
+                &self.options.file_sort_order,
+                execution_props,
+            );
+        }
+        if let Some(ordering) = derive_common_ordering_from_files(file_groups) {
+            return Ok(vec![ordering]);
+        }
+        Ok(vec![])
+    }
+}
+
+/// Derives a common ordering from file orderings across all file groups.
+///
+/// Returns the common ordering if all files have compatible orderings,
+/// otherwise returns None.
+///
+/// The function finds the longest common prefix among all file orderings.
+/// For example, if files have orderings `[a, b, c]` and `[a, b]`, the common
+/// ordering is `[a, b]`.
+fn derive_common_ordering_from_files(file_groups: &[FileGroup]) -> Option<LexOrdering> {
+    enum CurrentOrderingState {
+        /// Initial state before processing any files
+        FirstFile,
+        /// Some common ordering found so far
+        SomeOrdering(LexOrdering),
+        /// No files have ordering
+        NoOrdering,
+    }
+    let mut state = CurrentOrderingState::FirstFile;
+
+    // Collect file orderings and track counts
+    for group in file_groups {
+        for file in group.iter() {
+            state = match (&state, &file.ordering) {
+                // If this is the first file with ordering, set it as current
+                (CurrentOrderingState::FirstFile, Some(ordering)) => {
+                    CurrentOrderingState::SomeOrdering(ordering.clone())
+                }
+                (CurrentOrderingState::FirstFile, None) => {
+                    CurrentOrderingState::NoOrdering
+                }
+                // If we have an existing ordering, find common prefix with new ordering
+                (CurrentOrderingState::SomeOrdering(current), Some(ordering)) => {
+                    // Find common prefix between current and new ordering
+                    let prefix_len = current
+                        .as_ref()
+                        .iter()
+                        .zip(ordering.as_ref().iter())
+                        .take_while(|(a, b)| a == b)
+                        .count();
+                    if prefix_len == 0 {
+                        log::trace!(
+                            "Cannot derive common ordering: no common prefix between orderings {current:?} and {ordering:?}"
+                        );
+                        return None;
+                    } else {
+                        let ordering =
+                            LexOrdering::new(current.as_ref()[..prefix_len].to_vec())
+                                .expect("prefix_len > 0, so ordering must be valid");
+                        CurrentOrderingState::SomeOrdering(ordering)
+                    }
+                }
+                // If one file has ordering and another doesn't, no common ordering
+                // Return None and log a trace message explaining why
+                (CurrentOrderingState::SomeOrdering(ordering), None)
+                | (CurrentOrderingState::NoOrdering, Some(ordering)) => {
+                    log::trace!(
+                        "Cannot derive common ordering: some files have ordering {ordering:?}, others don't"
+                    );
+                    return None;
+                }
+                // Both have no ordering, remain in NoOrdering state
+                (CurrentOrderingState::NoOrdering, None) => {
+                    CurrentOrderingState::NoOrdering
+                }
+            };
+        }
+    }
+
+    match state {
+        CurrentOrderingState::SomeOrdering(ordering) => Some(ordering),
+        _ => None,
+    }
+}
+
+// Expressions can be used for partition pruning if they can be evaluated using
+// only the partition columns and there are partition columns.
+fn can_be_evaluated_for_partition_pruning(
+    partition_column_names: &[&str],
+    expr: &Expr,
+) -> bool {
+    !partition_column_names.is_empty()
+        && expr_applicable_for_cols(partition_column_names, expr)
+}
+
+#[async_trait]
+impl TableProvider for ListingTable {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.table_schema)
+    }
+
+    fn constraints(&self) -> Option<&Constraints> {
+        Some(&self.constraints)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        let options = ScanArgs::default()
+            .with_projection(projection.map(|p| p.as_slice()))
+            .with_filters(Some(filters))
+            .with_limit(limit);
+        Ok(self.scan_with_args(state, options).await?.into_inner())
+    }
+
+    async fn scan_with_args<'a>(
+        &self,
+        state: &dyn Session,
+        args: ScanArgs<'a>,
+    ) -> datafusion_common::Result<ScanResult> {
+        let projection = args.projection().map(|p| p.to_vec());
+        let filters = args.filters().map(|f| f.to_vec()).unwrap_or_default();
+        let limit = args.limit();
+
+        // extract types of partition columns
+        let table_partition_cols = self
+            .options
+            .table_partition_cols
+            .iter()
+            .map(|col| Ok(Arc::new(self.table_schema.field_with_name(&col.0)?.clone())))
+            .collect::<datafusion_common::Result<Vec<_>>>()?;
+
+        let table_partition_col_names = table_partition_cols
+            .iter()
+            .map(|field| field.name().as_str())
+            .collect::<Vec<_>>();
+
+        // If the filters can be resolved using only partition cols, there is no need to
+        // pushdown it to TableScan, otherwise, `unhandled` pruning predicates will be generated
+        let (partition_filters, filters): (Vec<_>, Vec<_>) =
+            filters.iter().cloned().partition(|filter| {
+                can_be_evaluated_for_partition_pruning(&table_partition_col_names, filter)
+            });
+
+        // We should not limit the number of partitioned files to scan if there are filters and limit
+        // at the same time. This is because the limit should be applied after the filters are applied.
+        let statistic_file_limit = if filters.is_empty() { limit } else { None };
+
+        let ListFilesResult {
+            file_groups: mut partitioned_file_lists,
+            statistics,
+            grouped_by_partition: partitioned_by_file_group,
+        } = self
+            .list_files_for_scan(state, &partition_filters, statistic_file_limit)
+            .await?;
+
+        // if no files need to be read, return an `EmptyExec`
+        if partitioned_file_lists.is_empty() {
+            let projected_schema = project_schema(&self.schema(), projection.as_ref())?;
+            return Ok(ScanResult::new(Arc::new(EmptyExec::new(projected_schema))));
+        }
+
+        let output_ordering = self.try_create_output_ordering(
+            state.execution_props(),
+            &partitioned_file_lists,
+        )?;
+        match state
+            .config_options()
+            .execution
+            .split_file_groups_by_statistics
+            .then(|| {
+                output_ordering.first().map(|output_ordering| {
+                    FileScanConfig::split_groups_by_statistics_with_target_partitions(
+                        &self.table_schema,
+                        &partitioned_file_lists,
+                        output_ordering,
+                        self.options.target_partitions,
+                    )
+                })
+            })
+            .flatten()
+        {
+            Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"),
+            Some(Ok(new_groups)) => {
+                if new_groups.len() <= self.options.target_partitions {
+                    partitioned_file_lists = new_groups;
+                } else {
+                    log::debug!(
+                        "attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered"
+                    )
+                }
+            }
+            None => {} // no ordering required
+        };
+
+        let Some(object_store_url) =
+            self.table_paths.first().map(ListingTableUrl::object_store)
+        else {
+            return Ok(ScanResult::new(Arc::new(EmptyExec::new(Arc::new(
+                Schema::empty(),
+            )))));
+        };
+
+        let file_source = self.create_file_source();
+
+        // create the execution plan
+        let plan = self
+            .options
+            .format
+            .create_physical_plan(
+                state,
+                FileScanConfigBuilder::new(object_store_url, file_source)
+                    .with_file_groups(partitioned_file_lists)
+                    .with_constraints(self.constraints.clone())
+                    .with_statistics(statistics)
+                    .with_projection_indices(projection)?
+                    .with_limit(limit)
+                    .with_output_ordering(output_ordering)
+                    .with_expr_adapter(self.expr_adapter_factory.clone())
+                    .with_partitioned_by_file_group(partitioned_by_file_group)
+                    .build(),
+            )
+            .await?;
+
+        Ok(ScanResult::new(plan))
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> datafusion_common::Result<Vec<TableProviderFilterPushDown>> {
+        let partition_column_names = self
+            .options
+            .table_partition_cols
+            .iter()
+            .map(|col| col.0.as_str())
+            .collect::<Vec<_>>();
+        filters
+            .iter()
+            .map(|filter| {
+                if can_be_evaluated_for_partition_pruning(&partition_column_names, filter)
+                {
+                    // if filter can be handled by partition pruning, it is exact
+                    return Ok(TableProviderFilterPushDown::Exact);
+                }
+
+                Ok(TableProviderFilterPushDown::Inexact)
+            })
+            .collect()
+    }
+
+    fn get_table_definition(&self) -> Option<&str> {
+        self.definition.as_deref()
+    }
+
+    async fn insert_into(
+        &self,
+        state: &dyn Session,
+        input: Arc<dyn ExecutionPlan>,
+        insert_op: InsertOp,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        // Check that the schema of the plan matches the schema of this table.
+        self.schema()
+            .logically_equivalent_names_and_types(&input.schema())?;
+
+        let table_path = &self.table_paths()[0];
+        if !table_path.is_collection() {
+            return plan_err!(
+                "Inserting into a ListingTable backed by a single file is not supported, URL is possibly missing a trailing `/`. \
+                To append to an existing file use StreamTable, e.g. by using CREATE UNBOUNDED EXTERNAL TABLE"
+            );
+        }
+
+        // Get the object store for the table path.
+        let store = state.runtime_env().object_store(table_path)?;
+
+        let file_list_stream = pruned_partition_list(
+            state,
+            store.as_ref(),
+            table_path,
+            &[],
+            &self.options.file_extension,
+            &self.options.table_partition_cols,
+        )
+        .await?;
+
+        let file_group = file_list_stream.try_collect::<Vec<_>>().await?.into();
+        let keep_partition_by_columns =
+            state.config_options().execution.keep_partition_by_columns;
+
+        // Invalidate cache entries for this table if they exist
+        if let Some(lfc) = state.runtime_env().cache_manager.get_list_files_cache() {
+            let key = TableScopedPath {
+                table: table_path.get_table_ref().clone(),
+                path: table_path.prefix().clone(),
+            };
+            let _ = lfc.remove(&key);
+        }
+
+        // Sink related option, apart from format
+        let config = FileSinkConfig {
+            original_url: String::default(),
+            object_store_url: self.table_paths()[0].object_store(),
+            table_paths: self.table_paths().clone(),
+            file_group,
+            output_schema: self.schema(),
+            table_partition_cols: self.options.table_partition_cols.clone(),
+            insert_op,
+            keep_partition_by_columns,
+            file_extension: self.options().format.get_ext(),
+            file_output_mode: FileOutputMode::Automatic,
+        };
+
+        // For writes, we only use user-specified ordering (no file groups to derive from)
+        let orderings = self.try_create_output_ordering(state.execution_props(), &[])?;
+        // It is sufficient to pass only one of the equivalent orderings:
+        let order_requirements = orderings.into_iter().next().map(Into::into);
+
+        self.options()
+            .format
+            .create_writer_physical_plan(input, state, config, order_requirements)
+            .await
+    }
+
+    fn get_column_default(&self, column: &str) -> Option<&Expr> {
+        self.column_defaults.get(column)
+    }
+}
+
+impl ListingTable {
+    /// Get the list of files for a scan as well as the file level statistics.
+    /// The list is grouped to let the execution plan know how the files should
+    /// be distributed to different threads / executors.
+    pub async fn list_files_for_scan<'a>(
+        &'a self,
+        ctx: &'a dyn Session,
+        filters: &'a [Expr],
+        limit: Option<usize>,
+    ) -> datafusion_common::Result<ListFilesResult> {
+        let store = if let Some(url) = self.table_paths.first() {
+            ctx.runtime_env().object_store(url)?
+        } else {
+            return Ok(ListFilesResult {
+                file_groups: vec![],
+                statistics: Statistics::new_unknown(&self.file_schema),
+                grouped_by_partition: false,
+            });
+        };
+        // list files (with partitions)
+        let file_list = future::try_join_all(self.table_paths.iter().map(|table_path| {
+            pruned_partition_list(
+                ctx,
+                store.as_ref(),
+                table_path,
+                filters,
+                &self.options.file_extension,
+                &self.options.table_partition_cols,
+            )
+        }))
+        .await?;
+        let meta_fetch_concurrency =
+            ctx.config_options().execution.meta_fetch_concurrency;
+        let file_list = stream::iter(file_list).flatten_unordered(meta_fetch_concurrency);
+        // collect the statistics and ordering if required by the config
+        let files = file_list
+            .map(|part_file| async {
+                let part_file = part_file?;
+                let (statistics, ordering) = if self.options.collect_stat {
+                    self.do_collect_statistics_and_ordering(ctx, &store, &part_file)
+                        .await?
+                } else {
+                    (Arc::new(Statistics::new_unknown(&self.file_schema)), None)
+                };
+                Ok(part_file
+                    .with_statistics(statistics)
+                    .with_ordering(ordering))
+            })
+            .boxed()
+            .buffer_unordered(ctx.config_options().execution.meta_fetch_concurrency);
+
+        let (file_group, inexact_stats) =
+            get_files_with_limit(files, limit, self.options.collect_stat).await?;
+
+        // Threshold: 0 = disabled, N > 0 = enabled when distinct_keys >= N
+        //
+        // When enabled, files are grouped by their Hive partition column values, allowing
+        // FileScanConfig to declare Hash partitioning. This enables the optimizer to skip
+        // hash repartitioning for aggregates and joins on partition columns.
+        let threshold = ctx.config_options().optimizer.preserve_file_partitions;
+
+        let (file_groups, grouped_by_partition) = if threshold > 0
+            && !self.options.table_partition_cols.is_empty()
+        {
+            let grouped =
+                file_group.group_by_partition_values(self.options.target_partitions);
+            if grouped.len() >= threshold {
+                (grouped, true)
+            } else {
+                let all_files: Vec<_> =
+                    grouped.into_iter().flat_map(|g| g.into_inner()).collect();
+                (
+                    FileGroup::new(all_files).split_files(self.options.target_partitions),
+                    false,
+                )
+            }
+        } else {
+            (
+                file_group.split_files(self.options.target_partitions),
+                false,
+            )
+        };
+
+        let (file_groups, stats) = compute_all_files_statistics(
+            file_groups,
+            self.schema(),
+            self.options.collect_stat,
+            inexact_stats,
+        )?;
+
+        // Note: Statistics already include both file columns and partition columns.
+        // PartitionedFile::with_statistics automatically appends exact partition column
+        // statistics (min=max=partition_value, null_count=0, distinct_count=1) computed
+        // from partition_values.
+        Ok(ListFilesResult {
+            file_groups,
+            statistics: stats,
+            grouped_by_partition,
+        })
+    }
+
+    /// Collects statistics and ordering for a given partitioned file.
+    ///
+    /// This method checks if statistics are cached. If cached, it returns the
+    /// cached statistics and infers ordering separately. If not cached, it infers
+    /// both statistics and ordering in a single metadata read for efficiency.
+    async fn do_collect_statistics_and_ordering(
+        &self,
+        ctx: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        part_file: &PartitionedFile,
+    ) -> datafusion_common::Result<(Arc<Statistics>, Option<LexOrdering>)> {
+        use datafusion_execution::cache::cache_manager::CachedFileMetadata;
+
+        let path = &part_file.object_meta.location;
+        let meta = &part_file.object_meta;
+
+        // Check cache first - if we have valid cached statistics and ordering
+        if let Some(cached) = self.collected_statistics.get(path)
+            && cached.is_valid_for(meta)
+        {
+            // Return cached statistics and ordering
+            return Ok((Arc::clone(&cached.statistics), cached.ordering.clone()));
+        }
+
+        // Cache miss or invalid: fetch both statistics and ordering in a single metadata read
+        let file_meta = self
+            .options
+            .format
+            .infer_stats_and_ordering(ctx, store, Arc::clone(&self.file_schema), meta)
+            .await?;
+
+        let statistics = Arc::new(file_meta.statistics);
+
+        // Store in cache
+        self.collected_statistics.put(
+            path,
+            CachedFileMetadata::new(
+                meta.clone(),
+                Arc::clone(&statistics),
+                file_meta.ordering.clone(),
+            ),
+        );
+
+        Ok((statistics, file_meta.ordering))
+    }
+}
+
+/// Processes a stream of partitioned files and returns a `FileGroup` containing the files.
+///
+/// This function collects files from the provided stream until either:
+/// 1. The stream is exhausted
+/// 2. The accumulated number of rows exceeds the provided `limit` (if specified)
+///
+/// # Arguments
+/// * `files` - A stream of `Result<PartitionedFile>` items to process
+/// * `limit` - An optional row count limit. If provided, the function will stop collecting files
+///   once the accumulated number of rows exceeds this limit
+/// * `collect_stats` - Whether to collect and accumulate statistics from the files
+///
+/// # Returns
+/// A `Result` containing a `FileGroup` with the collected files
+/// and a boolean indicating whether the statistics are inexact.
+///
+/// # Note
+/// The function will continue processing files if statistics are not available or if the
+/// limit is not provided. If `collect_stats` is false, statistics won't be accumulated
+/// but files will still be collected.
+async fn get_files_with_limit(
+    files: impl Stream<Item = datafusion_common::Result<PartitionedFile>>,
+    limit: Option<usize>,
+    collect_stats: bool,
+) -> datafusion_common::Result<(FileGroup, bool)> {
+    let mut file_group = FileGroup::default();
+    // Fusing the stream allows us to call next safely even once it is finished.
+    let mut all_files = Box::pin(files.fuse());
+    enum ProcessingState {
+        ReadingFiles,
+        ReachedLimit,
+    }
+
+    let mut state = ProcessingState::ReadingFiles;
+    let mut num_rows = Precision::Absent;
+
+    while let Some(file_result) = all_files.next().await {
+        // Early exit if we've already reached our limit
+        if matches!(state, ProcessingState::ReachedLimit) {
+            break;
+        }
+
+        let file = file_result?;
+
+        // Update file statistics regardless of state
+        if collect_stats && let Some(file_stats) = &file.statistics {
+            num_rows = if file_group.is_empty() {
+                // For the first file, just take its row count
+                file_stats.num_rows
+            } else {
+                // For subsequent files, accumulate the counts
+                num_rows.add(&file_stats.num_rows)
+            };
+        }
+
+        // Always add the file to our group
+        file_group.push(file);
+
+        // Check if we've hit the limit (if one was specified)
+        if let Some(limit) = limit
+            && let Precision::Exact(row_count) = num_rows
+            && row_count > limit
+        {
+            state = ProcessingState::ReachedLimit;
+        }
+    }
+    // If we still have files in the stream, it means that the limit kicked
+    // in, and the statistic could have been different had we processed the
+    // files in a different order.
+    let inexact_stats = all_files.next().await.is_some();
+    Ok((file_group, inexact_stats))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::compute::SortOptions;
+    use datafusion_physical_expr::expressions::Column;
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+    use std::sync::Arc;
+
+    /// Helper to create a PhysicalSortExpr
+    fn sort_expr(
+        name: &str,
+        idx: usize,
+        descending: bool,
+        nulls_first: bool,
+    ) -> PhysicalSortExpr {
+        PhysicalSortExpr::new(
+            Arc::new(Column::new(name, idx)),
+            SortOptions {
+                descending,
+                nulls_first,
+            },
+        )
+    }
+
+    /// Helper to create a LexOrdering (unwraps the Option)
+    fn lex_ordering(exprs: Vec<PhysicalSortExpr>) -> LexOrdering {
+        LexOrdering::new(exprs).expect("expected non-empty ordering")
+    }
+
+    /// Helper to create a PartitionedFile with optional ordering
+    fn create_file(name: &str, ordering: Option<LexOrdering>) -> PartitionedFile {
+        PartitionedFile::new(name.to_string(), 1024).with_ordering(ordering)
+    }
+
+    #[test]
+    fn test_derive_common_ordering_all_files_same_ordering() {
+        // All files have the same ordering -> returns that ordering
+        let ordering = lex_ordering(vec![
+            sort_expr("a", 0, false, true),
+            sort_expr("b", 1, true, false),
+        ]);
+
+        let file_groups = vec![
+            FileGroup::new(vec![
+                create_file("f1.parquet", Some(ordering.clone())),
+                create_file("f2.parquet", Some(ordering.clone())),
+            ]),
+            FileGroup::new(vec![create_file("f3.parquet", Some(ordering.clone()))]),
+        ];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, Some(ordering));
+    }
+
+    #[test]
+    fn test_derive_common_ordering_common_prefix() {
+        // Files have different orderings but share a common prefix
+        let ordering_abc = lex_ordering(vec![
+            sort_expr("a", 0, false, true),
+            sort_expr("b", 1, false, true),
+            sort_expr("c", 2, false, true),
+        ]);
+        let ordering_ab = lex_ordering(vec![
+            sort_expr("a", 0, false, true),
+            sort_expr("b", 1, false, true),
+        ]);
+
+        let file_groups = vec![FileGroup::new(vec![
+            create_file("f1.parquet", Some(ordering_abc)),
+            create_file("f2.parquet", Some(ordering_ab.clone())),
+        ])];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, Some(ordering_ab));
+    }
+
+    #[test]
+    fn test_derive_common_ordering_no_common_prefix() {
+        // Files have completely different orderings -> returns None
+        let ordering_a = lex_ordering(vec![sort_expr("a", 0, false, true)]);
+        let ordering_b = lex_ordering(vec![sort_expr("b", 1, false, true)]);
+
+        let file_groups = vec![FileGroup::new(vec![
+            create_file("f1.parquet", Some(ordering_a)),
+            create_file("f2.parquet", Some(ordering_b)),
+        ])];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_derive_common_ordering_mixed_with_none() {
+        // Some files have ordering, some don't -> returns None
+        let ordering = lex_ordering(vec![sort_expr("a", 0, false, true)]);
+
+        let file_groups = vec![FileGroup::new(vec![
+            create_file("f1.parquet", Some(ordering)),
+            create_file("f2.parquet", None),
+        ])];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_derive_common_ordering_all_none() {
+        // No files have ordering -> returns None
+        let file_groups = vec![FileGroup::new(vec![
+            create_file("f1.parquet", None),
+            create_file("f2.parquet", None),
+        ])];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_derive_common_ordering_empty_groups() {
+        // Empty file groups -> returns None
+        let file_groups: Vec<FileGroup> = vec![];
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_derive_common_ordering_single_file() {
+        // Single file with ordering -> returns that ordering
+        let ordering = lex_ordering(vec![
+            sort_expr("a", 0, false, true),
+            sort_expr("b", 1, true, false),
+        ]);
+
+        let file_groups = vec![FileGroup::new(vec![create_file(
+            "f1.parquet",
+            Some(ordering.clone()),
+        )])];
+
+        let result = derive_common_ordering_from_files(&file_groups);
+        assert_eq!(result, Some(ordering));
+    }
+}
diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml
index 7307c4de87a8a..1009e9aee477b 100644
--- a/datafusion/catalog/Cargo.toml
+++ b/datafusion/catalog/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-catalog"
 description = "datafusion-catalog"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -42,7 +42,6 @@ datafusion-expr = { workspace = true }
 datafusion-physical-expr = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
-datafusion-sql = { workspace = true }
 futures = { workspace = true }
 itertools = { workspace = true }
 log = { workspace = true }
@@ -50,5 +49,8 @@ object_store = { workspace = true }
 parking_lot = { workspace = true }
 tokio = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
diff --git a/datafusion/catalog/README.md b/datafusion/catalog/README.md
index 5b201e736fdc4..48c61b43c025b 100644
--- a/datafusion/catalog/README.md
+++ b/datafusion/catalog/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Catalog
+# Apache DataFusion Catalog
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides catalog management functionality, including catalogs, schemas, and tables.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/catalog/src/async.rs b/datafusion/catalog/src/async.rs
index 5d7a51ad71232..1b8039d828fdb 100644
--- a/datafusion/catalog/src/async.rs
+++ b/datafusion/catalog/src/async.rs
@@ -18,7 +18,7 @@
 use std::sync::Arc;
 
 use async_trait::async_trait;
-use datafusion_common::{error::Result, not_impl_err, HashMap, TableReference};
+use datafusion_common::{HashMap, TableReference, error::Result, not_impl_err};
 use datafusion_execution::config::SessionConfig;
 
 use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider};
@@ -60,7 +60,9 @@ impl SchemaProvider for ResolvedSchemaProvider {
     }
 
     fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
-        not_impl_err!("Attempt to deregister table '{name}' with ResolvedSchemaProvider which is not supported")
+        not_impl_err!(
+            "Attempt to deregister table '{name}' with ResolvedSchemaProvider which is not supported"
+        )
     }
 
     fn table_exist(&self, name: &str) -> bool {
@@ -193,7 +195,7 @@ impl CatalogProviderList for ResolvedCatalogProviderList {
 ///
 /// See the [remote_catalog.rs] for an end to end example
 ///
-/// [remote_catalog.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs
+/// [remote_catalog.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/remote_catalog.rs
 #[async_trait]
 pub trait AsyncSchemaProvider: Send + Sync {
     /// Lookup a table in the schema provider
@@ -425,14 +427,14 @@ mod tests {
     use std::{
         any::Any,
         sync::{
-            atomic::{AtomicU32, Ordering},
             Arc,
+            atomic::{AtomicU32, Ordering},
         },
     };
 
     use arrow::datatypes::SchemaRef;
     use async_trait::async_trait;
-    use datafusion_common::{error::Result, Statistics, TableReference};
+    use datafusion_common::{Statistics, TableReference, error::Result};
     use datafusion_execution::config::SessionConfig;
     use datafusion_expr::{Expr, TableType};
     use datafusion_physical_plan::ExecutionPlan;
@@ -737,7 +739,7 @@ mod tests {
         ] {
             let async_provider = MockAsyncCatalogProviderList::default();
             let cached_provider = async_provider
-                .resolve(&[table_ref.clone()], &test_config())
+                .resolve(std::slice::from_ref(table_ref), &test_config())
                 .await
                 .unwrap();
 
diff --git a/datafusion/catalog/src/catalog.rs b/datafusion/catalog/src/catalog.rs
index 71b9eccf9d657..bb9e89eba2fef 100644
--- a/datafusion/catalog/src/catalog.rs
+++ b/datafusion/catalog/src/catalog.rs
@@ -20,8 +20,8 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 pub use crate::schema::SchemaProvider;
-use datafusion_common::not_impl_err;
 use datafusion_common::Result;
+use datafusion_common::not_impl_err;
 
 /// Represents a catalog, comprising a number of named schemas.
 ///
@@ -61,7 +61,7 @@ use datafusion_common::Result;
 /// schemas and tables exist.
 ///
 /// [Delta Lake]: https://delta.io/
-/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/remote_catalog.rs
+/// [`remote_catalog`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/remote_catalog.rs
 ///
 /// The [`CatalogProvider`] can support this use case, but it takes some care.
 /// The planning APIs in DataFusion are not `async` and thus network IO can not
@@ -100,7 +100,7 @@ use datafusion_common::Result;
 ///
 /// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html
 /// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
-/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs
+/// [`catalog.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/catalog.rs
 /// [delta-rs]: https://github.com/delta-io/delta-rs
 /// [`UnityCatalogProvider`]: https://github.com/delta-io/delta-rs/blob/951436ecec476ce65b5ed3b58b50fb0846ca7b91/crates/deltalake-core/src/data_catalog/unity/datafusion.rs#L111-L123
 ///
diff --git a/datafusion/catalog/src/cte_worktable.rs b/datafusion/catalog/src/cte_worktable.rs
index d72a30909c02c..9565dcc60141e 100644
--- a/datafusion/catalog/src/cte_worktable.rs
+++ b/datafusion/catalog/src/cte_worktable.rs
@@ -17,20 +17,18 @@
 
 //! CteWorkTable implementation used for recursive queries
 
+use std::any::Any;
+use std::borrow::Cow;
 use std::sync::Arc;
-use std::{any::Any, borrow::Cow};
 
-use crate::Session;
 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
-use datafusion_physical_plan::work_table::WorkTableExec;
-
-use datafusion_physical_plan::ExecutionPlan;
-
 use datafusion_common::error::Result;
 use datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableType};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::work_table::WorkTableExec;
 
-use crate::TableProvider;
+use crate::{ScanArgs, ScanResult, Session, TableProvider};
 
 /// The temporary working table where the previous iteration of a recursive query is stored
 /// Naming is based on PostgreSQL's implementation.
@@ -71,7 +69,7 @@ impl TableProvider for CteWorkTable {
         self
     }
 
-    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+    fn get_logical_plan(&'_ self) -> Option<Cow<'_, LogicalPlan>> {
         None
     }
 
@@ -85,16 +83,28 @@ impl TableProvider for CteWorkTable {
 
     async fn scan(
         &self,
-        _state: &dyn Session,
-        _projection: Option<&Vec<usize>>,
-        _filters: &[Expr],
-        _limit: Option<usize>,
+        state: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        // TODO: pushdown filters and limits
-        Ok(Arc::new(WorkTableExec::new(
+        let options = ScanArgs::default()
+            .with_projection(projection.map(|p| p.as_slice()))
+            .with_filters(Some(filters))
+            .with_limit(limit);
+        Ok(self.scan_with_args(state, options).await?.into_inner())
+    }
+
+    async fn scan_with_args<'a>(
+        &self,
+        _state: &dyn Session,
+        args: ScanArgs<'a>,
+    ) -> Result<ScanResult> {
+        Ok(ScanResult::new(Arc::new(WorkTableExec::new(
             self.name.clone(),
             Arc::clone(&self.table_schema),
-        )))
+            args.projection().map(|p| p.to_vec()),
+        )?)))
     }
 
     fn supports_filters_pushdown(
diff --git a/datafusion/catalog/src/default_table_source.rs b/datafusion/catalog/src/default_table_source.rs
index 9db8242caa999..fb6531ba0b2ee 100644
--- a/datafusion/catalog/src/default_table_source.rs
+++ b/datafusion/catalog/src/default_table_source.rs
@@ -23,7 +23,7 @@ use std::{any::Any, borrow::Cow};
 use crate::TableProvider;
 
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{internal_err, Constraints};
+use datafusion_common::{Constraints, internal_err};
 use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource, TableType};
 
 /// Implements [`TableSource`] for a [`TableProvider`]
@@ -33,8 +33,6 @@ use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource, TableType}
 ///
 /// It is used so logical plans in the `datafusion_expr` crate do not have a
 /// direct dependency on physical plans, such as [`TableProvider`]s.
-///
-/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html
 pub struct DefaultTableSource {
     /// table provider
     pub table_provider: Arc<dyn TableProvider>,
@@ -78,7 +76,7 @@ impl TableSource for DefaultTableSource {
         self.table_provider.supports_filters_pushdown(filter)
     }
 
-    fn get_logical_plan(&self) -> Option<Cow<datafusion_expr::LogicalPlan>> {
+    fn get_logical_plan(&'_ self) -> Option<Cow<'_, datafusion_expr::LogicalPlan>> {
         self.table_provider.get_logical_plan()
     }
 
diff --git a/datafusion/catalog/src/information_schema.rs b/datafusion/catalog/src/information_schema.rs
index 057d1a8198820..ea93dc21a3f5b 100644
--- a/datafusion/catalog/src/information_schema.rs
+++ b/datafusion/catalog/src/information_schema.rs
@@ -24,20 +24,24 @@ use crate::{CatalogProviderList, SchemaProvider, TableProvider};
 use arrow::array::builder::{BooleanBuilder, UInt8Builder};
 use arrow::{
     array::{StringBuilder, UInt64Builder},
-    datatypes::{DataType, Field, Schema, SchemaRef},
+    datatypes::{DataType, Field, FieldRef, Schema, SchemaRef},
     record_batch::RecordBatch,
 };
 use async_trait::async_trait;
+use datafusion_common::DataFusionError;
 use datafusion_common::config::{ConfigEntry, ConfigOptions};
 use datafusion_common::error::Result;
 use datafusion_common::types::NativeType;
-use datafusion_common::DataFusionError;
 use datafusion_execution::TaskContext;
-use datafusion_expr::{AggregateUDF, ScalarUDF, Signature, TypeSignature, WindowUDF};
+use datafusion_execution::runtime_env::RuntimeEnv;
+use datafusion_expr::function::WindowUDFFieldArgs;
+use datafusion_expr::{
+    AggregateUDF, ReturnFieldArgs, ScalarUDF, Signature, TypeSignature, WindowUDF,
+};
 use datafusion_expr::{TableType, Volatility};
+use datafusion_physical_plan::SendableRecordBatchStream;
 use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion_physical_plan::streaming::PartitionStream;
-use datafusion_physical_plan::SendableRecordBatchStream;
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fmt::Debug;
 use std::{any::Any, sync::Arc};
@@ -103,12 +107,14 @@ impl InformationSchemaConfig {
                     // schema name may not exist in the catalog, so we need to check
                     if let Some(schema) = catalog.schema(&schema_name) {
                         for table_name in schema.table_names() {
-                            if let Some(table) = schema.table(&table_name).await? {
+                            if let Some(table_type) =
+                                schema.table_type(&table_name).await?
+                            {
                                 builder.add_table(
                                     &catalog_name,
                                     &schema_name,
                                     &table_name,
-                                    table.table_type(),
+                                    table_type,
                                 );
                             }
                         }
@@ -135,11 +141,11 @@ impl InformationSchemaConfig {
             let catalog = self.catalog_list.catalog(&catalog_name).unwrap();
 
             for schema_name in catalog.schema_names() {
-                if schema_name != INFORMATION_SCHEMA {
-                    if let Some(schema) = catalog.schema(&schema_name) {
-                        let schema_owner = schema.owner_name();
-                        builder.add_schemata(&catalog_name, &schema_name, schema_owner);
-                    }
+                if schema_name != INFORMATION_SCHEMA
+                    && let Some(schema) = catalog.schema(&schema_name)
+                {
+                    let schema_owner = schema.owner_name();
+                    builder.add_schemata(&catalog_name, &schema_name, schema_owner);
                 }
             }
         }
@@ -213,11 +219,16 @@ impl InformationSchemaConfig {
     fn make_df_settings(
         &self,
         config_options: &ConfigOptions,
+        runtime_env: &Arc<RuntimeEnv>,
         builder: &mut InformationSchemaDfSettingsBuilder,
     ) {
         for entry in config_options.entries() {
             builder.add_setting(entry);
         }
+        // Add runtime configuration entries
+        for entry in runtime_env.config_entries() {
+            builder.add_setting(entry);
+        }
     }
 
     fn make_routines(
@@ -243,7 +254,7 @@ impl InformationSchemaConfig {
                     name,
                     "FUNCTION",
                     Self::is_deterministic(udf.signature()),
-                    return_type,
+                    return_type.as_ref(),
                     "SCALAR",
                     udf.documentation().map(|d| d.description.to_string()),
                     udf.documentation().map(|d| d.syntax_example.to_string()),
@@ -263,7 +274,7 @@ impl InformationSchemaConfig {
                     name,
                     "FUNCTION",
                     Self::is_deterministic(udaf.signature()),
-                    return_type,
+                    return_type.as_ref(),
                     "AGGREGATE",
                     udaf.documentation().map(|d| d.description.to_string()),
                     udaf.documentation().map(|d| d.syntax_example.to_string()),
@@ -283,7 +294,7 @@ impl InformationSchemaConfig {
                     name,
                     "FUNCTION",
                     Self::is_deterministic(udwf.signature()),
-                    return_type,
+                    return_type.as_ref(),
                     "WINDOW",
                     udwf.documentation().map(|d| d.description.to_string()),
                     udwf.documentation().map(|d| d.syntax_example.to_string()),
@@ -413,14 +424,28 @@ fn get_udf_args_and_return_types(
         Ok(arg_types
             .into_iter()
             .map(|arg_types| {
-                // only handle the function which implemented [`ScalarUDFImpl::return_type`] method
+                let arg_fields: Vec<FieldRef> = arg_types
+                    .iter()
+                    .enumerate()
+                    .map(|(i, t)| {
+                        Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
+                    })
+                    .collect();
+                let scalar_arguments = vec![None; arg_fields.len()];
                 let return_type = udf
-                    .return_type(&arg_types)
-                    .map(|t| remove_native_type_prefix(NativeType::from(t)))
+                    .return_field_from_args(ReturnFieldArgs {
+                        arg_fields: &arg_fields,
+                        scalar_arguments: &scalar_arguments,
+                    })
+                    .map(|f| {
+                        remove_native_type_prefix(&NativeType::from(
+                            f.data_type().clone(),
+                        ))
+                    })
                     .ok();
                 let arg_types = arg_types
                     .into_iter()
-                    .map(|t| remove_native_type_prefix(NativeType::from(t)))
+                    .map(|t| remove_native_type_prefix(&NativeType::from(t)))
                     .collect::<Vec<_>>();
                 (arg_types, return_type)
             })
@@ -439,14 +464,24 @@ fn get_udaf_args_and_return_types(
         Ok(arg_types
             .into_iter()
             .map(|arg_types| {
-                // only handle the function which implemented [`ScalarUDFImpl::return_type`] method
+                let arg_fields: Vec<FieldRef> = arg_types
+                    .iter()
+                    .enumerate()
+                    .map(|(i, t)| {
+                        Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
+                    })
+                    .collect();
                 let return_type = udaf
-                    .return_type(&arg_types)
-                    .ok()
-                    .map(|t| remove_native_type_prefix(NativeType::from(t)));
+                    .return_field(&arg_fields)
+                    .map(|f| {
+                        remove_native_type_prefix(&NativeType::from(
+                            f.data_type().clone(),
+                        ))
+                    })
+                    .ok();
                 let arg_types = arg_types
                     .into_iter()
-                    .map(|t| remove_native_type_prefix(NativeType::from(t)))
+                    .map(|t| remove_native_type_prefix(&NativeType::from(t)))
                     .collect::<Vec<_>>();
                 (arg_types, return_type)
             })
@@ -465,20 +500,34 @@ fn get_udwf_args_and_return_types(
         Ok(arg_types
             .into_iter()
             .map(|arg_types| {
-                // only handle the function which implemented [`ScalarUDFImpl::return_type`] method
+                let arg_fields: Vec<FieldRef> = arg_types
+                    .iter()
+                    .enumerate()
+                    .map(|(i, t)| {
+                        Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
+                    })
+                    .collect();
+                let return_type = udwf
+                    .field(WindowUDFFieldArgs::new(&arg_fields, udwf.name()))
+                    .map(|f| {
+                        remove_native_type_prefix(&NativeType::from(
+                            f.data_type().clone(),
+                        ))
+                    })
+                    .ok();
                 let arg_types = arg_types
                     .into_iter()
-                    .map(|t| remove_native_type_prefix(NativeType::from(t)))
+                    .map(|t| remove_native_type_prefix(&NativeType::from(t)))
                     .collect::<Vec<_>>();
-                (arg_types, None)
+                (arg_types, return_type)
             })
             .collect::<BTreeSet<_>>())
     }
 }
 
 #[inline]
-fn remove_native_type_prefix(native_type: NativeType) -> String {
-    format!("{native_type:?}")
+fn remove_native_type_prefix(native_type: &NativeType) -> String {
+    format!("{native_type}")
 }
 
 #[async_trait]
@@ -490,7 +539,7 @@ impl SchemaProvider for InformationSchemaProvider {
     fn table_names(&self) -> Vec<String> {
         INFORMATION_SCHEMA_TABLES
             .iter()
-            .map(|t| t.to_string())
+            .map(|t| (*t).to_string())
             .collect()
     }
 
@@ -677,7 +726,7 @@ impl InformationSchemaViewBuilder {
         catalog_name: impl AsRef<str>,
         schema_name: impl AsRef<str>,
         table_name: impl AsRef<str>,
-        definition: Option<impl AsRef<str>>,
+        definition: Option<&(impl AsRef<str> + ?Sized)>,
     ) {
         // Note: append_value is actually infallible.
         self.catalog_names.append_value(catalog_name.as_ref());
@@ -808,7 +857,7 @@ impl InformationSchemaColumnsBuilder {
     ) {
         use DataType::*;
 
-        // Note: append_value is actually infallable.
+        // Note: append_value is actually infallible.
         self.catalog_names.append_value(catalog_name);
         self.schema_names.append_value(schema_name);
         self.table_names.append_value(table_name);
@@ -825,8 +874,7 @@ impl InformationSchemaColumnsBuilder {
         self.is_nullables.append_value(nullable_str);
 
         // "System supplied type" --> Use debug format of the datatype
-        self.data_types
-            .append_value(format!("{:?}", field.data_type()));
+        self.data_types.append_value(field.data_type().to_string());
 
         // "If data_type identifies a character or bit string type, the
         // declared maximum length; null for all other data types or
@@ -1059,7 +1107,12 @@ impl PartitionStream for InformationSchemaDfSettings {
             // TODO: Stream this
             futures::stream::once(async move {
                 // create a mem table with the names of tables
-                config.make_df_settings(ctx.session_config().options(), &mut builder);
+                let runtime_env = ctx.runtime_env();
+                config.make_df_settings(
+                    ctx.session_config().options(),
+                    &runtime_env,
+                    &mut builder,
+                );
                 Ok(builder.finish())
             }),
         ))
@@ -1155,7 +1208,7 @@ struct InformationSchemaRoutinesBuilder {
 }
 
 impl InformationSchemaRoutinesBuilder {
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     fn add_routine(
         &mut self,
         catalog_name: impl AsRef<str>,
@@ -1163,7 +1216,7 @@ impl InformationSchemaRoutinesBuilder {
         routine_name: impl AsRef<str>,
         routine_type: impl AsRef<str>,
         is_deterministic: bool,
-        data_type: Option<impl AsRef<str>>,
+        data_type: Option<&impl AsRef<str>>,
         function_type: impl AsRef<str>,
         description: Option<impl AsRef<str>>,
         syntax_example: Option<impl AsRef<str>>,
@@ -1289,7 +1342,7 @@ struct InformationSchemaParametersBuilder {
 }
 
 impl InformationSchemaParametersBuilder {
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     fn add_parameter(
         &mut self,
         specific_catalog: impl AsRef<str>,
@@ -1297,7 +1350,7 @@ impl InformationSchemaParametersBuilder {
         specific_name: impl AsRef<str>,
         ordinal_position: u64,
         parameter_mode: impl AsRef<str>,
-        parameter_name: Option<impl AsRef<str>>,
+        parameter_name: Option<&(impl AsRef<str> + ?Sized)>,
         data_type: impl AsRef<str>,
         parameter_default: Option<impl AsRef<str>>,
         is_variadic: bool,
@@ -1359,3 +1412,94 @@ impl PartitionStream for InformationSchemaParameters {
         ))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::CatalogProvider;
+
+    #[tokio::test]
+    async fn make_tables_uses_table_type() {
+        let config = InformationSchemaConfig {
+            catalog_list: Arc::new(Fixture),
+        };
+        let mut builder = InformationSchemaTablesBuilder {
+            catalog_names: StringBuilder::new(),
+            schema_names: StringBuilder::new(),
+            table_names: StringBuilder::new(),
+            table_types: StringBuilder::new(),
+            schema: Arc::new(Schema::empty()),
+        };
+
+        assert!(config.make_tables(&mut builder).await.is_ok());
+
+        assert_eq!("BASE TABLE", builder.table_types.finish().value(0));
+    }
+
+    #[derive(Debug)]
+    struct Fixture;
+
+    #[async_trait]
+    impl SchemaProvider for Fixture {
+        // InformationSchemaConfig::make_tables should use this.
+        async fn table_type(&self, _: &str) -> Result<Option<TableType>> {
+            Ok(Some(TableType::Base))
+        }
+
+        // InformationSchemaConfig::make_tables used this before `table_type`
+        // existed but should not, as it may be expensive.
+        async fn table(&self, _: &str) -> Result<Option<Arc<dyn TableProvider>>> {
+            panic!(
+                "InformationSchemaConfig::make_tables called SchemaProvider::table instead of table_type"
+            )
+        }
+
+        fn as_any(&self) -> &dyn Any {
+            unimplemented!("not required for these tests")
+        }
+
+        fn table_names(&self) -> Vec<String> {
+            vec!["atable".to_string()]
+        }
+
+        fn table_exist(&self, _: &str) -> bool {
+            unimplemented!("not required for these tests")
+        }
+    }
+
+    impl CatalogProviderList for Fixture {
+        fn as_any(&self) -> &dyn Any {
+            unimplemented!("not required for these tests")
+        }
+
+        fn register_catalog(
+            &self,
+            _: String,
+            _: Arc<dyn CatalogProvider>,
+        ) -> Option<Arc<dyn CatalogProvider>> {
+            unimplemented!("not required for these tests")
+        }
+
+        fn catalog_names(&self) -> Vec<String> {
+            vec!["acatalog".to_string()]
+        }
+
+        fn catalog(&self, _: &str) -> Option<Arc<dyn CatalogProvider>> {
+            Some(Arc::new(Self))
+        }
+    }
+
+    impl CatalogProvider for Fixture {
+        fn as_any(&self) -> &dyn Any {
+            unimplemented!("not required for these tests")
+        }
+
+        fn schema_names(&self) -> Vec<String> {
+            vec!["aschema".to_string()]
+        }
+
+        fn schema(&self, _: &str) -> Option<Arc<dyn SchemaProvider>> {
+            Some(Arc::new(Self))
+        }
+    }
+}
diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs
index 0394b05277dac..931941e8fdfad 100644
--- a/datafusion/catalog/src/lib.rs
+++ b/datafusion/catalog/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Interfaces and default implementations of catalogs and schemas.
 //!
@@ -46,13 +47,13 @@ mod dynamic_file;
 mod schema;
 mod table;
 
+pub use r#async::*;
 pub use catalog::*;
 pub use datafusion_session::Session;
 pub use dynamic_file::catalog::*;
 pub use memory::{
     MemTable, MemoryCatalogProvider, MemoryCatalogProviderList, MemorySchemaProvider,
 };
-pub use r#async::*;
 pub use schema::*;
 pub use table::*;
 
diff --git a/datafusion/catalog/src/listing_schema.rs b/datafusion/catalog/src/listing_schema.rs
index cc2c2ee606b3d..77fbea8577089 100644
--- a/datafusion/catalog/src/listing_schema.rs
+++ b/datafusion/catalog/src/listing_schema.rs
@@ -26,7 +26,7 @@ use crate::{SchemaProvider, TableProvider, TableProviderFactory};
 
 use crate::Session;
 use datafusion_common::{
-    Constraints, DFSchema, DataFusionError, HashMap, TableReference,
+    DFSchema, DataFusionError, HashMap, TableReference, internal_datafusion_err,
 };
 use datafusion_expr::CreateExternalTable;
 
@@ -111,17 +111,13 @@ impl ListingSchemaProvider {
             let file_name = table
                 .path
                 .file_name()
-                .ok_or_else(|| {
-                    DataFusionError::Internal("Cannot parse file name!".to_string())
-                })?
+                .ok_or_else(|| internal_datafusion_err!("Cannot parse file name!"))?
                 .to_str()
-                .ok_or_else(|| {
-                    DataFusionError::Internal("Cannot parse file name!".to_string())
-                })?;
+                .ok_or_else(|| internal_datafusion_err!("Cannot parse file name!"))?;
             let table_name = file_name.split('.').collect_vec()[0];
-            let table_path = table.to_string().ok_or_else(|| {
-                DataFusionError::Internal("Cannot parse file name!".to_string())
-            })?;
+            let table_path = table
+                .to_string()
+                .ok_or_else(|| internal_datafusion_err!("Cannot parse file name!"))?;
 
             if !self.table_exist(table_name) {
                 let table_url = format!("{}/{}", self.authority, table_path);
@@ -131,21 +127,13 @@ impl ListingSchemaProvider {
                     .factory
                     .create(
                         state,
-                        &CreateExternalTable {
-                            schema: Arc::new(DFSchema::empty()),
+                        &CreateExternalTable::builder(
                             name,
-                            location: table_url,
-                            file_type: self.format.clone(),
-                            table_partition_cols: vec![],
-                            if_not_exists: false,
-                            temporary: false,
-                            definition: None,
-                            order_exprs: vec![],
-                            unbounded: false,
-                            options: Default::default(),
-                            constraints: Constraints::empty(),
-                            column_defaults: Default::default(),
-                        },
+                            table_url,
+                            self.format.clone(),
+                            Arc::new(DFSchema::empty()),
+                        )
+                        .build(),
                     )
                     .await?;
                 let _ =
diff --git a/datafusion/catalog/src/memory/schema.rs b/datafusion/catalog/src/memory/schema.rs
index f1b3628f7affc..97a579b021617 100644
--- a/datafusion/catalog/src/memory/schema.rs
+++ b/datafusion/catalog/src/memory/schema.rs
@@ -20,7 +20,7 @@
 use crate::{SchemaProvider, TableProvider};
 use async_trait::async_trait;
 use dashmap::DashMap;
-use datafusion_common::{exec_err, DataFusionError};
+use datafusion_common::{DataFusionError, exec_err};
 use std::any::Any;
 use std::sync::Arc;
 
diff --git a/datafusion/catalog/src/memory/table.rs b/datafusion/catalog/src/memory/table.rs
index 81243e2c4889e..9b91062657a07 100644
--- a/datafusion/catalog/src/memory/table.rs
+++ b/datafusion/catalog/src/memory/table.rs
@@ -23,25 +23,32 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use crate::TableProvider;
-use datafusion_common::error::Result;
-use datafusion_expr::Expr;
-use datafusion_expr::TableType;
-use datafusion_physical_expr::create_physical_sort_exprs;
-use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::{
-    common, ExecutionPlan, ExecutionPlanProperties, Partitioning,
-};
 
-use arrow::datatypes::SchemaRef;
+use arrow::array::{
+    Array, ArrayRef, BooleanArray, RecordBatch as ArrowRecordBatch, UInt64Array,
+};
+use arrow::compute::kernels::zip::zip;
+use arrow::compute::{and, filter_record_batch};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt};
+use datafusion_common::error::Result;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Constraints, DFSchema, SchemaExt, not_impl_err, plan_err};
 use datafusion_common_runtime::JoinSet;
-use datafusion_datasource::memory::MemSink;
-use datafusion_datasource::memory::MemorySourceConfig;
+use datafusion_datasource::memory::{MemSink, MemorySourceConfig};
 use datafusion_datasource::sink::DataSinkExec;
 use datafusion_datasource::source::DataSourceExec;
 use datafusion_expr::dml::InsertOp;
-use datafusion_expr::SortExpr;
+use datafusion_expr::{Expr, SortExpr, TableType};
+use datafusion_physical_expr::{
+    LexOrdering, create_physical_expr, create_physical_sort_exprs,
+};
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion_physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning,
+    PhysicalExpr, PlanProperties, common,
+};
 use datafusion_session::Session;
 
 use async_trait::async_trait;
@@ -70,8 +77,16 @@ pub struct MemTable {
 }
 
 impl MemTable {
-    /// Create a new in-memory table from the provided schema and record batches
+    /// Create a new in-memory table from the provided schema and record batches.
+    ///
+    /// Requires at least one partition. To construct an empty `MemTable`, pass
+    /// `vec![vec![]]` as the `partitions` argument, this represents one partition with
+    /// no batches.
     pub fn try_new(schema: SchemaRef, partitions: Vec<Vec<RecordBatch>>) -> Result<Self> {
+        if partitions.is_empty() {
+            return plan_err!("No partitions provided, expected at least one partition");
+        }
+
         for batches in partitions.iter().flatten() {
             let batches_schema = batches.schema();
             if !schema.contains(&batches_schema) {
@@ -89,7 +104,7 @@ impl MemTable {
                 .into_iter()
                 .map(|e| Arc::new(RwLock::new(e)))
                 .collect::<Vec<_>>(),
-            constraints: Constraints::empty(),
+            constraints: Constraints::default(),
             column_defaults: HashMap::new(),
             sort_order: Arc::new(Mutex::new(vec![])),
         })
@@ -237,18 +252,15 @@ impl TableProvider for MemTable {
         // add sort information if present
         let sort_order = self.sort_order.lock();
         if !sort_order.is_empty() {
-            let df_schema = DFSchema::try_from(self.schema.as_ref().clone())?;
-
-            let file_sort_order = sort_order
-                .iter()
-                .map(|sort_exprs| {
-                    create_physical_sort_exprs(
-                        sort_exprs,
-                        &df_schema,
-                        state.execution_props(),
-                    )
-                })
-                .collect::<Result<Vec<_>>>()?;
+            let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
+
+            let eqp = state.execution_props();
+            let mut file_sort_order = vec![];
+            for sort_exprs in sort_order.iter() {
+                let physical_exprs =
+                    create_physical_sort_exprs(sort_exprs, &df_schema, eqp)?;
+                file_sort_order.extend(LexOrdering::new(physical_exprs));
+            }
             source = source.try_with_sort_information(file_sort_order)?;
         }
 
@@ -293,4 +305,342 @@ impl TableProvider for MemTable {
     fn get_column_default(&self, column: &str) -> Option<&Expr> {
         self.column_defaults.get(column)
     }
+
+    async fn delete_from(
+        &self,
+        state: &dyn Session,
+        filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Early exit if table has no partitions
+        if self.batches.is_empty() {
+            return Ok(Arc::new(DmlResultExec::new(0)));
+        }
+
+        *self.sort_order.lock() = vec![];
+
+        let mut total_deleted: u64 = 0;
+        let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
+
+        for partition_data in &self.batches {
+            let mut partition = partition_data.write().await;
+            let mut new_batches = Vec::with_capacity(partition.len());
+
+            for batch in partition.iter() {
+                if batch.num_rows() == 0 {
+                    continue;
+                }
+
+                // Evaluate filters - None means "match all rows"
+                let filter_mask = evaluate_filters_to_mask(
+                    &filters,
+                    batch,
+                    &df_schema,
+                    state.execution_props(),
+                )?;
+
+                let (delete_count, keep_mask) = match filter_mask {
+                    Some(mask) => {
+                        // Count rows where mask is true (will be deleted)
+                        let count = mask.iter().filter(|v| v == &Some(true)).count();
+                        // Keep rows where predicate is false or NULL (SQL three-valued logic)
+                        let keep: BooleanArray =
+                            mask.iter().map(|v| Some(v != Some(true))).collect();
+                        (count, keep)
+                    }
+                    None => {
+                        // No filters = delete all rows
+                        (
+                            batch.num_rows(),
+                            BooleanArray::from(vec![false; batch.num_rows()]),
+                        )
+                    }
+                };
+
+                total_deleted += delete_count as u64;
+
+                let filtered_batch = filter_record_batch(batch, &keep_mask)?;
+                if filtered_batch.num_rows() > 0 {
+                    new_batches.push(filtered_batch);
+                }
+            }
+
+            *partition = new_batches;
+        }
+
+        Ok(Arc::new(DmlResultExec::new(total_deleted)))
+    }
+
+    async fn update(
+        &self,
+        state: &dyn Session,
+        assignments: Vec<(String, Expr)>,
+        filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Early exit if table has no partitions
+        if self.batches.is_empty() {
+            return Ok(Arc::new(DmlResultExec::new(0)));
+        }
+
+        // Validate column names upfront with clear error messages
+        let available_columns: Vec<&str> = self
+            .schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        for (column_name, _) in &assignments {
+            if self.schema.field_with_name(column_name).is_err() {
+                return plan_err!(
+                    "UPDATE failed: column '{}' does not exist. Available columns: {}",
+                    column_name,
+                    available_columns.join(", ")
+                );
+            }
+        }
+
+        let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
+
+        // Create physical expressions for assignments upfront (outside batch loop)
+        let physical_assignments: HashMap<String, Arc<dyn PhysicalExpr>> = assignments
+            .iter()
+            .map(|(name, expr)| {
+                let physical_expr =
+                    create_physical_expr(expr, &df_schema, state.execution_props())?;
+                Ok((name.clone(), physical_expr))
+            })
+            .collect::<Result<_>>()?;
+
+        *self.sort_order.lock() = vec![];
+
+        let mut total_updated: u64 = 0;
+
+        for partition_data in &self.batches {
+            let mut partition = partition_data.write().await;
+            let mut new_batches = Vec::with_capacity(partition.len());
+
+            for batch in partition.iter() {
+                if batch.num_rows() == 0 {
+                    continue;
+                }
+
+                // Evaluate filters - None means "match all rows"
+                let filter_mask = evaluate_filters_to_mask(
+                    &filters,
+                    batch,
+                    &df_schema,
+                    state.execution_props(),
+                )?;
+
+                let (update_count, update_mask) = match filter_mask {
+                    Some(mask) => {
+                        // Count rows where mask is true (will be updated)
+                        let count = mask.iter().filter(|v| v == &Some(true)).count();
+                        // Normalize mask: only true (not NULL) triggers update
+                        let normalized: BooleanArray =
+                            mask.iter().map(|v| Some(v == Some(true))).collect();
+                        (count, normalized)
+                    }
+                    None => {
+                        // No filters = update all rows
+                        (
+                            batch.num_rows(),
+                            BooleanArray::from(vec![true; batch.num_rows()]),
+                        )
+                    }
+                };
+
+                total_updated += update_count as u64;
+
+                if update_count == 0 {
+                    new_batches.push(batch.clone());
+                    continue;
+                }
+
+                let mut new_columns: Vec<ArrayRef> =
+                    Vec::with_capacity(batch.num_columns());
+
+                for field in self.schema.fields() {
+                    let column_name = field.name();
+                    let original_column =
+                        batch.column_by_name(column_name).ok_or_else(|| {
+                            datafusion_common::DataFusionError::Internal(format!(
+                                "Column '{column_name}' not found in batch"
+                            ))
+                        })?;
+
+                    let new_column = if let Some(physical_expr) =
+                        physical_assignments.get(column_name.as_str())
+                    {
+                        // Use evaluate_selection to only evaluate on matching rows.
+                        // This avoids errors (e.g., divide-by-zero) on rows that won't
+                        // be updated. The result is scattered back with nulls for
+                        // non-matching rows, which zip() will replace with originals.
+                        let new_values =
+                            physical_expr.evaluate_selection(batch, &update_mask)?;
+                        let new_array = new_values.into_array(batch.num_rows())?;
+
+                        // Convert to &dyn Array which implements Datum
+                        let new_arr: &dyn Array = new_array.as_ref();
+                        let orig_arr: &dyn Array = original_column.as_ref();
+                        zip(&update_mask, &new_arr, &orig_arr)?
+                    } else {
+                        Arc::clone(original_column)
+                    };
+
+                    new_columns.push(new_column);
+                }
+
+                let updated_batch =
+                    ArrowRecordBatch::try_new(Arc::clone(&self.schema), new_columns)?;
+                new_batches.push(updated_batch);
+            }
+
+            *partition = new_batches;
+        }
+
+        Ok(Arc::new(DmlResultExec::new(total_updated)))
+    }
+}
+
+/// Evaluate filter expressions against a batch and return a combined boolean mask.
+/// Returns None if filters is empty (meaning "match all rows").
+/// The returned mask has true for rows that match the filter predicates.
+fn evaluate_filters_to_mask(
+    filters: &[Expr],
+    batch: &RecordBatch,
+    df_schema: &DFSchema,
+    execution_props: &datafusion_expr::execution_props::ExecutionProps,
+) -> Result<Option<BooleanArray>> {
+    if filters.is_empty() {
+        return Ok(None);
+    }
+
+    let mut combined_mask: Option<BooleanArray> = None;
+
+    for filter_expr in filters {
+        let physical_expr =
+            create_physical_expr(filter_expr, df_schema, execution_props)?;
+
+        let result = physical_expr.evaluate(batch)?;
+        let array = result.into_array(batch.num_rows())?;
+        let bool_array = array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .ok_or_else(|| {
+                datafusion_common::DataFusionError::Internal(
+                    "Filter did not evaluate to boolean".to_string(),
+                )
+            })?
+            .clone();
+
+        combined_mask = Some(match combined_mask {
+            Some(existing) => and(&existing, &bool_array)?,
+            None => bool_array,
+        });
+    }
+
+    Ok(combined_mask)
+}
+
+/// Returns a single row with the count of affected rows.
+#[derive(Debug)]
+struct DmlResultExec {
+    rows_affected: u64,
+    schema: SchemaRef,
+    properties: Arc<PlanProperties>,
+}
+
+impl DmlResultExec {
+    fn new(rows_affected: u64) -> Self {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "count",
+            DataType::UInt64,
+            false,
+        )]));
+
+        let properties = PlanProperties::new(
+            datafusion_physical_expr::EquivalenceProperties::new(Arc::clone(&schema)),
+            Partitioning::UnknownPartitioning(1),
+            datafusion_physical_plan::execution_plan::EmissionType::Final,
+            datafusion_physical_plan::execution_plan::Boundedness::Bounded,
+        );
+
+        Self {
+            rows_affected,
+            schema,
+            properties: Arc::new(properties),
+        }
+    }
+}
+
+impl DisplayAs for DmlResultExec {
+    fn fmt_as(
+        &self,
+        t: DisplayFormatType,
+        f: &mut std::fmt::Formatter,
+    ) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default
+            | DisplayFormatType::Verbose
+            | DisplayFormatType::TreeRender => {
+                write!(f, "DmlResultExec: rows_affected={}", self.rows_affected)
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for DmlResultExec {
+    fn name(&self) -> &str {
+        "DmlResultExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(self)
+    }
+
+    fn execute(
+        &self,
+        _partition: usize,
+        _context: Arc<datafusion_execution::TaskContext>,
+    ) -> Result<datafusion_execution::SendableRecordBatchStream> {
+        // Create a single batch with the count
+        let count_array = UInt64Array::from(vec![self.rows_affected]);
+        let batch = ArrowRecordBatch::try_new(
+            Arc::clone(&self.schema),
+            vec![Arc::new(count_array) as ArrayRef],
+        )?;
+
+        // Create a stream that yields just this one batch
+        let stream = futures::stream::iter(vec![Ok(batch)]);
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            Arc::clone(&self.schema),
+            stream,
+        )))
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
diff --git a/datafusion/catalog/src/schema.rs b/datafusion/catalog/src/schema.rs
index 5b37348fd7427..c6299582813b4 100644
--- a/datafusion/catalog/src/schema.rs
+++ b/datafusion/catalog/src/schema.rs
@@ -19,13 +19,14 @@
 //! representing collections of named tables.
 
 use async_trait::async_trait;
-use datafusion_common::{exec_err, DataFusionError};
+use datafusion_common::{DataFusionError, exec_err};
 use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
 use crate::table::TableProvider;
 use datafusion_common::Result;
+use datafusion_expr::TableType;
 
 /// Represents a schema, comprising a number of named tables.
 ///
@@ -54,12 +55,20 @@ pub trait SchemaProvider: Debug + Sync + Send {
         name: &str,
     ) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError>;
 
+    /// Retrieves the type of a specific table from the schema by name, if it exists, otherwise
+    /// returns `None`.  Implementations for which this operation is cheap but [Self::table] is
+    /// expensive can override this to improve operations that only need the type, e.g.
+    /// `SELECT * FROM information_schema.tables`.
+    async fn table_type(&self, name: &str) -> Result<Option<TableType>> {
+        self.table(name).await.map(|o| o.map(|t| t.table_type()))
+    }
+
     /// If supported by the implementation, adds a new table named `name` to
     /// this schema.
     ///
     /// If a table of the same name was already registered, returns "Table
     /// already exists" error.
-    #[allow(unused_variables)]
+    #[expect(unused_variables)]
     fn register_table(
         &self,
         name: String,
@@ -72,7 +81,7 @@ pub trait SchemaProvider: Debug + Sync + Send {
     /// schema and returns the previously registered [`TableProvider`], if any.
     ///
     /// If no `name` table exists, returns Ok(None).
-    #[allow(unused_variables)]
+    #[expect(unused_variables)]
     fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         exec_err!("schema provider does not support deregistering tables")
     }
diff --git a/datafusion/catalog/src/stream.rs b/datafusion/catalog/src/stream.rs
index fbfab513229e0..bdd72a1b1d70b 100644
--- a/datafusion/catalog/src/stream.rs
+++ b/datafusion/catalog/src/stream.rs
@@ -28,13 +28,13 @@ use std::sync::Arc;
 use crate::{Session, TableProvider, TableProviderFactory};
 use arrow::array::{RecordBatch, RecordBatchReader, RecordBatchWriter};
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result};
+use datafusion_common::{Constraints, DataFusionError, Result, config_err, plan_err};
 use datafusion_common_runtime::SpawnedTask;
 use datafusion_datasource::sink::{DataSink, DataSinkExec};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::dml::InsertOp;
 use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType};
-use datafusion_physical_expr::create_ordering;
+use datafusion_physical_expr::create_lex_ordering;
 use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder;
 use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
 use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
@@ -53,7 +53,7 @@ impl TableProviderFactory for StreamTableFactory {
         state: &dyn Session,
         cmd: &CreateExternalTable,
     ) -> Result<Arc<dyn TableProvider>> {
-        let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into());
+        let schema: SchemaRef = Arc::clone(cmd.schema.inner());
         let location = cmd.location.clone();
         let encoding = cmd.file_type.parse()?;
         let header = if let Ok(opt) = cmd
@@ -256,7 +256,7 @@ impl StreamConfig {
         Self {
             source,
             order: vec![],
-            constraints: Constraints::empty(),
+            constraints: Constraints::default(),
         }
     }
 
@@ -321,17 +321,21 @@ impl TableProvider for StreamTable {
 
     async fn scan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         projection: Option<&Vec<usize>>,
         _filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let projected_schema = match projection {
             Some(p) => {
-                let projected = self.0.source.schema().project(p)?;
-                create_ordering(&projected, &self.0.order)?
+                let projected = Arc::new(self.0.source.schema().project(p)?);
+                create_lex_ordering(&projected, &self.0.order, state.execution_props())?
             }
-            None => create_ordering(self.0.source.schema(), &self.0.order)?,
+            None => create_lex_ordering(
+                self.0.source.schema(),
+                &self.0.order,
+                state.execution_props(),
+            )?,
         };
 
         Ok(Arc::new(StreamingTableExec::try_new(
@@ -350,15 +354,11 @@ impl TableProvider for StreamTable {
         input: Arc<dyn ExecutionPlan>,
         _insert_op: InsertOp,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let ordering = match self.0.order.first() {
-            Some(x) => {
-                let schema = self.0.source.schema();
-                let orders = create_ordering(schema, std::slice::from_ref(x))?;
-                let ordering = orders.into_iter().next().unwrap();
-                Some(ordering.into_iter().map(Into::into).collect())
-            }
-            None => None,
-        };
+        let schema = self.0.source.schema();
+        let orders =
+            create_lex_ordering(schema, &self.0.order, _state.execution_props())?;
+        // It is sufficient to pass only one of the equivalent orderings:
+        let ordering = orders.into_iter().next().map(Into::into);
 
         Ok(Arc::new(DataSinkExec::new(
             input,
@@ -440,6 +440,6 @@ impl DataSink for StreamWrite {
         write_task
             .join_unwind()
             .await
-            .map_err(DataFusionError::ExecutionJoin)?
+            .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))?
     }
 }
diff --git a/datafusion/catalog/src/streaming.rs b/datafusion/catalog/src/streaming.rs
index 654e6755d7d4c..db9596b420b7b 100644
--- a/datafusion/catalog/src/streaming.rs
+++ b/datafusion/catalog/src/streaming.rs
@@ -22,21 +22,23 @@ use std::sync::Arc;
 
 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
-
-use crate::Session;
-use crate::TableProvider;
-use datafusion_common::{plan_err, Result};
-use datafusion_expr::{Expr, TableType};
-use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
+use datafusion_common::{DFSchema, Result, plan_err};
+use datafusion_expr::{Expr, SortExpr, TableType};
+use datafusion_physical_expr::equivalence::project_ordering;
+use datafusion_physical_expr::{LexOrdering, create_physical_sort_exprs};
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
 use log::debug;
 
+use crate::{Session, TableProvider};
+
 /// A [`TableProvider`] that streams a set of [`PartitionStream`]
 #[derive(Debug)]
 pub struct StreamingTable {
     schema: SchemaRef,
     partitions: Vec<Arc<dyn PartitionStream>>,
     infinite: bool,
+    sort_order: Vec<SortExpr>,
 }
 
 impl StreamingTable {
@@ -60,13 +62,21 @@ impl StreamingTable {
             schema,
             partitions,
             infinite: false,
+            sort_order: vec![],
         })
     }
+
     /// Sets streaming table can be infinite.
     pub fn with_infinite_table(mut self, infinite: bool) -> Self {
         self.infinite = infinite;
         self
     }
+
+    /// Sets the existing ordering of streaming table.
+    pub fn with_sort_order(mut self, sort_order: Vec<SortExpr>) -> Self {
+        self.sort_order = sort_order;
+        self
+    }
 }
 
 #[async_trait]
@@ -85,16 +95,40 @@ impl TableProvider for StreamingTable {
 
     async fn scan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         projection: Option<&Vec<usize>>,
         _filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        let physical_sort = if !self.sort_order.is_empty() {
+            let df_schema = DFSchema::try_from(Arc::clone(&self.schema))?;
+            let eqp = state.execution_props();
+
+            let original_sort_exprs =
+                create_physical_sort_exprs(&self.sort_order, &df_schema, eqp)?;
+
+            if let Some(p) = projection {
+                // When performing a projection, the output columns will not match
+                // the original physical sort expression indices. Also the sort columns
+                // may not be in the output projection. To correct for these issues
+                // we need to project the ordering based on the output schema.
+                let schema = Arc::new(self.schema.project(p)?);
+                LexOrdering::new(original_sort_exprs)
+                    .and_then(|lex_ordering| project_ordering(&lex_ordering, &schema))
+                    .map(|lex_ordering| lex_ordering.to_vec())
+                    .unwrap_or_default()
+            } else {
+                original_sort_exprs
+            }
+        } else {
+            vec![]
+        };
+
         Ok(Arc::new(StreamingTableExec::try_new(
             Arc::clone(&self.schema),
             self.partitions.clone(),
             projection,
-            None,
+            LexOrdering::new(physical_sort),
             self.infinite,
             limit,
         )?))
diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs
index 207abb9c66703..c9b4e974c8994 100644
--- a/datafusion/catalog/src/table.rs
+++ b/datafusion/catalog/src/table.rs
@@ -24,7 +24,7 @@ use crate::session::Session;
 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
 use datafusion_common::Result;
-use datafusion_common::{not_impl_err, Constraints, Statistics};
+use datafusion_common::{Constraints, Statistics, not_impl_err};
 use datafusion_expr::Expr;
 
 use datafusion_expr::dml::InsertOp;
@@ -49,7 +49,7 @@ use datafusion_physical_plan::ExecutionPlan;
 /// [`CatalogProvider`]: super::CatalogProvider
 #[async_trait]
 pub trait TableProvider: Debug + Sync + Send {
-    /// Returns the table provider as [`Any`](std::any::Any) so that it can be
+    /// Returns the table provider as [`Any`] so that it can be
     /// downcast to a specific implementation.
     fn as_any(&self) -> &dyn Any;
 
@@ -75,7 +75,7 @@ pub trait TableProvider: Debug + Sync + Send {
     }
 
     /// Get the [`LogicalPlan`] of this table, if available.
-    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+    fn get_logical_plan(&'_ self) -> Option<Cow<'_, LogicalPlan>> {
         None
     }
 
@@ -171,6 +171,37 @@ pub trait TableProvider: Debug + Sync + Send {
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 
+    /// Create an [`ExecutionPlan`] for scanning the table using structured arguments.
+    ///
+    /// This method uses [`ScanArgs`] to pass scan parameters in a structured way
+    /// and returns a [`ScanResult`] containing the execution plan.
+    ///
+    /// Table providers can override this method to take advantage of additional
+    /// parameters like the upcoming `preferred_ordering` that may not be available through
+    /// other scan methods.
+    ///
+    /// # Arguments
+    /// * `state` - The session state containing configuration and context
+    /// * `args` - Structured scan arguments including projection, filters, limit, and ordering preferences
+    ///
+    /// # Returns
+    /// A [`ScanResult`] containing the [`ExecutionPlan`] for scanning the table
+    ///
+    /// See [`Self::scan`] for detailed documentation about projection, filters, and limits.
+    async fn scan_with_args<'a>(
+        &self,
+        state: &dyn Session,
+        args: ScanArgs<'a>,
+    ) -> Result<ScanResult> {
+        let filters = args.filters().unwrap_or(&[]);
+        let projection = args.projection().map(|p| p.to_vec());
+        let limit = args.limit();
+        let plan = self
+            .scan(state, projection.as_ref(), filters, limit)
+            .await?;
+        Ok(plan.into())
+    }
+
     /// Specify if DataFusion should provide filter expressions to the
     /// TableProvider to apply *during* the scan.
     ///
@@ -297,6 +328,147 @@ pub trait TableProvider: Debug + Sync + Send {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         not_impl_err!("Insert into not implemented for this table")
     }
+
+    /// Delete rows matching the filter predicates.
+    ///
+    /// Returns an [`ExecutionPlan`] producing a single row with `count` (UInt64).
+    /// Empty `filters` deletes all rows.
+    async fn delete_from(
+        &self,
+        _state: &dyn Session,
+        _filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("DELETE not supported for {} table", self.table_type())
+    }
+
+    /// Update rows matching the filter predicates.
+    ///
+    /// Returns an [`ExecutionPlan`] producing a single row with `count` (UInt64).
+    /// Empty `filters` updates all rows.
+    async fn update(
+        &self,
+        _state: &dyn Session,
+        _assignments: Vec<(String, Expr)>,
+        _filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("UPDATE not supported for {} table", self.table_type())
+    }
+
+    /// Remove all rows from the table.
+    ///
+    /// Should return an [ExecutionPlan] producing a single row with count (UInt64),
+    /// representing the number of rows removed.
+    async fn truncate(&self, _state: &dyn Session) -> Result<Arc<dyn ExecutionPlan>> {
+        not_impl_err!("TRUNCATE not supported for {} table", self.table_type())
+    }
+}
+
+/// Arguments for scanning a table with [`TableProvider::scan_with_args`].
+#[derive(Debug, Clone, Default)]
+pub struct ScanArgs<'a> {
+    filters: Option<&'a [Expr]>,
+    projection: Option<&'a [usize]>,
+    limit: Option<usize>,
+}
+
+impl<'a> ScanArgs<'a> {
+    /// Set the column projection for the scan.
+    ///
+    /// The projection is a list of column indices from [`TableProvider::schema`]
+    /// that should be included in the scan results. If `None`, all columns are included.
+    ///
+    /// # Arguments
+    /// * `projection` - Optional slice of column indices to project
+    pub fn with_projection(mut self, projection: Option<&'a [usize]>) -> Self {
+        self.projection = projection;
+        self
+    }
+
+    /// Get the column projection for the scan.
+    ///
+    /// Returns a reference to the projection column indices, or `None` if
+    /// no projection was specified (meaning all columns should be included).
+    pub fn projection(&self) -> Option<&'a [usize]> {
+        self.projection
+    }
+
+    /// Set the filter expressions for the scan.
+    ///
+    /// Filters are boolean expressions that should be evaluated during the scan
+    /// to reduce the number of rows returned. All expressions are combined with AND logic.
+    /// Whether filters are actually pushed down depends on [`TableProvider::supports_filters_pushdown`].
+    ///
+    /// # Arguments
+    /// * `filters` - Optional slice of filter expressions
+    pub fn with_filters(mut self, filters: Option<&'a [Expr]>) -> Self {
+        self.filters = filters;
+        self
+    }
+
+    /// Get the filter expressions for the scan.
+    ///
+    /// Returns a reference to the filter expressions, or `None` if no filters were specified.
+    pub fn filters(&self) -> Option<&'a [Expr]> {
+        self.filters
+    }
+
+    /// Set the maximum number of rows to return from the scan.
+    ///
+    /// If specified, the scan should return at most this many rows. This is typically
+    /// used to optimize queries with `LIMIT` clauses.
+    ///
+    /// # Arguments
+    /// * `limit` - Optional maximum number of rows to return
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.limit = limit;
+        self
+    }
+
+    /// Get the maximum number of rows to return from the scan.
+    ///
+    /// Returns the row limit, or `None` if no limit was specified.
+    pub fn limit(&self) -> Option<usize> {
+        self.limit
+    }
+}
+
+/// Result of a table scan operation from [`TableProvider::scan_with_args`].
+#[derive(Debug, Clone)]
+pub struct ScanResult {
+    /// The ExecutionPlan to run.
+    plan: Arc<dyn ExecutionPlan>,
+}
+
+impl ScanResult {
+    /// Create a new `ScanResult` with the given execution plan.
+    ///
+    /// # Arguments
+    /// * `plan` - The execution plan that will perform the table scan
+    pub fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
+        Self { plan }
+    }
+
+    /// Get a reference to the execution plan for this scan result.
+    ///
+    /// Returns a reference to the [`ExecutionPlan`] that will perform
+    /// the actual table scanning and data retrieval.
+    pub fn plan(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.plan
+    }
+
+    /// Consume this ScanResult and return the execution plan.
+    ///
+    /// Returns the owned [`ExecutionPlan`] that will perform
+    /// the actual table scanning and data retrieval.
+    pub fn into_inner(self) -> Arc<dyn ExecutionPlan> {
+        self.plan
+    }
+}
+
+impl From<Arc<dyn ExecutionPlan>> for ScanResult {
+    fn from(plan: Arc<dyn ExecutionPlan>) -> Self {
+        Self::new(plan)
+    }
 }
 
 /// A factory which creates [`TableProvider`]s at runtime given a URL.
@@ -314,13 +486,13 @@ pub trait TableProviderFactory: Debug + Sync + Send {
 }
 
 /// A trait for table function implementations
-pub trait TableFunctionImpl: Debug + Sync + Send {
+pub trait TableFunctionImpl: Debug + Sync + Send + Any {
     /// Create a table provider
     fn call(&self, args: &[Expr]) -> Result<Arc<dyn TableProvider>>;
 }
 
 /// A table that uses a function to generate data
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct TableFunction {
     /// Name of the table function
     name: String,
diff --git a/datafusion/catalog/src/view.rs b/datafusion/catalog/src/view.rs
index 8dfb79718c9bb..54c54431a5913 100644
--- a/datafusion/catalog/src/view.rs
+++ b/datafusion/catalog/src/view.rs
@@ -24,8 +24,8 @@ use crate::TableProvider;
 
 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
-use datafusion_common::error::Result;
 use datafusion_common::Column;
+use datafusion_common::error::Result;
 use datafusion_expr::TableType;
 use datafusion_expr::{Expr, LogicalPlan};
 use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown};
@@ -51,7 +51,7 @@ impl ViewTable {
     /// Notes: the `LogicalPlan` is not validated or type coerced. If this is
     /// needed it should be done after calling this function.
     pub fn new(logical_plan: LogicalPlan, definition: Option<String>) -> Self {
-        let table_schema = logical_plan.schema().as_ref().to_owned().into();
+        let table_schema = Arc::clone(logical_plan.schema().inner());
         Self {
             logical_plan,
             table_schema,
@@ -87,7 +87,7 @@ impl TableProvider for ViewTable {
         self
     }
 
-    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+    fn get_logical_plan(&'_ self) -> Option<Cow<'_, LogicalPlan>> {
         Some(Cow::Borrowed(&self.logical_plan))
     }
 
diff --git a/datafusion/common-runtime/Cargo.toml b/datafusion/common-runtime/Cargo.toml
index 7ddc021e640c9..fd9a818bcb1d0 100644
--- a/datafusion/common-runtime/Cargo.toml
+++ b/datafusion/common-runtime/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -43,4 +46,4 @@ log = { workspace = true }
 tokio = { workspace = true }
 
 [dev-dependencies]
-tokio = { version = "1.45", features = ["rt", "rt-multi-thread", "time"] }
+tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
diff --git a/datafusion/common-runtime/README.md b/datafusion/common-runtime/README.md
index 77100e52603c9..ff44e6c3e209e 100644
--- a/datafusion/common-runtime/README.md
+++ b/datafusion/common-runtime/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Common Runtime
+# Apache DataFusion Common Runtime
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides common utilities.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/common-runtime/src/common.rs b/datafusion/common-runtime/src/common.rs
index e7aba1d455ee6..ca618b19ed2f1 100644
--- a/datafusion/common-runtime/src/common.rs
+++ b/datafusion/common-runtime/src/common.rs
@@ -44,7 +44,7 @@ impl<R: 'static> SpawnedTask<R> {
         R: Send,
     {
         // Ok to use spawn here as SpawnedTask handles aborting/cancelling the task on Drop
-        #[allow(clippy::disallowed_methods)]
+        #[expect(clippy::disallowed_methods)]
         let inner = tokio::task::spawn(trace_future(task));
         Self { inner }
     }
@@ -56,7 +56,7 @@ impl<R: 'static> SpawnedTask<R> {
         R: Send,
     {
         // Ok to use spawn_blocking here as SpawnedTask handles aborting/cancelling the task on Drop
-        #[allow(clippy::disallowed_methods)]
+        #[expect(clippy::disallowed_methods)]
         let inner = tokio::task::spawn_blocking(trace_block(task));
         Self { inner }
     }
@@ -68,15 +68,28 @@ impl<R: 'static> SpawnedTask<R> {
     }
 
     /// Joins the task and unwinds the panic if it happens.
-    pub async fn join_unwind(self) -> Result<R, JoinError> {
+    pub async fn join_unwind(mut self) -> Result<R, JoinError> {
+        self.join_unwind_mut().await
+    }
+
+    /// Joins the task using a mutable reference and unwinds the panic if it happens.
+    ///
+    /// This method is similar to [`join_unwind`](Self::join_unwind), but takes a mutable
+    /// reference instead of consuming `self`. This allows the `SpawnedTask` to remain
+    /// usable after the call.
+    ///
+    /// If called multiple times on the same task:
+    /// - If the task is still running, it will continue waiting for completion
+    /// - If the task has already completed successfully, subsequent calls will
+    ///   continue to return the same `JoinError` indicating the task is finished
+    /// - If the task panicked, the first call will resume the panic, and the
+    ///   program will not reach subsequent calls
+    pub async fn join_unwind_mut(&mut self) -> Result<R, JoinError> {
         self.await.map_err(|e| {
             // `JoinError` can be caused either by panic or cancellation. We have to handle panics:
             if e.is_panic() {
                 std::panic::resume_unwind(e.into_panic());
             } else {
-                // Cancellation may be caused by two reasons:
-                // 1. Abort is called, but since we consumed `self`, it's not our case (`JoinHandle` not accessible outside).
-                // 2. The runtime is shutting down.
                 log::warn!("SpawnedTask was polled during shutdown");
                 e
             }
@@ -102,14 +115,14 @@ impl<R> Drop for SpawnedTask<R> {
 mod tests {
     use super::*;
 
-    use std::future::{pending, Pending};
+    use std::future::{Pending, pending};
 
     use tokio::{runtime::Runtime, sync::oneshot};
 
     #[tokio::test]
     async fn runtime_shutdown() {
         let rt = Runtime::new().unwrap();
-        #[allow(clippy::async_yields_async)]
+        #[expect(clippy::async_yields_async)]
         let task = rt
             .spawn(async {
                 SpawnedTask::spawn(async {
diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs
index ec8db0bdcd911..cf45ccf3ef63a 100644
--- a/datafusion/common-runtime/src/lib.rs
+++ b/datafusion/common-runtime/src/lib.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
@@ -30,4 +31,6 @@ mod trace_utils;
 
 pub use common::SpawnedTask;
 pub use join_set::JoinSet;
-pub use trace_utils::{set_join_set_tracer, JoinSetTracer};
+pub use trace_utils::{
+    JoinSetTracer, JoinSetTracerError, set_join_set_tracer, trace_block, trace_future,
+};
diff --git a/datafusion/common-runtime/src/trace_utils.rs b/datafusion/common-runtime/src/trace_utils.rs
index c3a39c355fc88..f8adbe8825bc1 100644
--- a/datafusion/common-runtime/src/trace_utils.rs
+++ b/datafusion/common-runtime/src/trace_utils.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use futures::future::BoxFuture;
 use futures::FutureExt;
+use futures::future::BoxFuture;
 use std::any::Any;
 use std::error::Error;
 use std::fmt::{Display, Formatter, Result as FmtResult};
diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml
index d471e48be4e75..92dd76aa97d47 100644
--- a/datafusion/common/Cargo.toml
+++ b/datafusion/common/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -40,13 +43,30 @@ name = "datafusion_common"
 [features]
 avro = ["apache-avro"]
 backtrace = []
-pyarrow = ["pyo3", "arrow/pyarrow", "parquet"]
+parquet_encryption = [
+    "parquet",
+    "parquet/encryption",
+    "dep:hex",
+]
 force_hash_collisions = []
 recursive_protection = ["dep:recursive"]
+parquet = ["dep:parquet"]
+sql = ["sqlparser"]
+
+[[bench]]
+harness = false
+name = "with_hashes"
+
+[[bench]]
+harness = false
+name = "scalar_to_array"
+
+[[bench]]
+harness = false
+name = "stats_merge"
 
 [dependencies]
-ahash = { workspace = true }
-apache-avro = { version = "0.17", default-features = false, features = [
+apache-avro = { workspace = true, features = [
     "bzip",
     "snappy",
     "xz",
@@ -54,18 +74,19 @@ apache-avro = { version = "0.17", default-features = false, features = [
 ], optional = true }
 arrow = { workspace = true }
 arrow-ipc = { workspace = true }
-base64 = "0.22.1"
+chrono = { workspace = true }
+foldhash = "0.2"
 half = { workspace = true }
 hashbrown = { workspace = true }
+hex = { workspace = true, optional = true }
 indexmap = { workspace = true }
-libc = "0.2.172"
+itertools = { workspace = true }
+libc = "0.2.180"
 log = { workspace = true }
 object_store = { workspace = true, optional = true }
 parquet = { workspace = true, optional = true, default-features = true }
-paste = "1.0.15"
-pyo3 = { version = "0.24.2", optional = true }
 recursive = { workspace = true, optional = true }
-sqlparser = { workspace = true }
+sqlparser = { workspace = true, optional = true }
 tokio = { workspace = true }
 
 [target.'cfg(target_family = "wasm")'.dependencies]
@@ -73,5 +94,7 @@ web-time = "1.1.0"
 
 [dev-dependencies]
 chrono = { workspace = true }
+criterion = { workspace = true }
 insta = { workspace = true }
 rand = { workspace = true }
+sqlparser = { workspace = true }
diff --git a/datafusion/common/README.md b/datafusion/common/README.md
index 524ab4420d2a8..4948c8c581be9 100644
--- a/datafusion/common/README.md
+++ b/datafusion/common/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Common
+# Apache DataFusion Common
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides common data types and utilities.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/common/benches/scalar_to_array.rs b/datafusion/common/benches/scalar_to_array.rs
new file mode 100644
index 0000000000000..90a152e515fe5
--- /dev/null
+++ b/datafusion/common/benches/scalar_to_array.rs
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for `ScalarValue::to_array_of_size`, focusing on List
+//! scalars.
+
+use arrow::array::{Array, ArrayRef, AsArray, StringViewBuilder};
+use arrow::datatypes::{DataType, Field};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::utils::SingleRowListArrayBuilder;
+use std::sync::Arc;
+
+/// Build a `ScalarValue::List` of `num_elements` Utf8View strings whose
+/// inner StringViewArray has `num_buffers` data buffers.
+fn make_list_scalar(num_elements: usize, num_buffers: usize) -> ScalarValue {
+    let elements_per_buffer = num_elements.div_ceil(num_buffers);
+
+    let mut small_arrays: Vec<ArrayRef> = Vec::new();
+    let mut remaining = num_elements;
+    for buf_idx in 0..num_buffers {
+        let count = remaining.min(elements_per_buffer);
+        if count == 0 {
+            break;
+        }
+        let start = buf_idx * elements_per_buffer;
+        let mut builder = StringViewBuilder::with_capacity(count);
+        for i in start..start + count {
+            builder.append_value(format!("{i:024x}"));
+        }
+        small_arrays.push(Arc::new(builder.finish()) as ArrayRef);
+        remaining -= count;
+    }
+
+    let refs: Vec<&dyn Array> = small_arrays.iter().map(|a| a.as_ref()).collect();
+    let concated = arrow::compute::concat(&refs).unwrap();
+
+    let list_array = SingleRowListArrayBuilder::new(concated)
+        .with_field(&Field::new_list_field(DataType::Utf8View, true))
+        .build_list_array();
+    ScalarValue::List(Arc::new(list_array))
+}
+
+/// We want to measure the cost of doing the conversion and then also accessing
+/// the results, to model what would happen during query evaluation.
+fn consume_list_array(arr: &ArrayRef) {
+    let list_arr = arr.as_list::<i32>();
+    let mut total_len: usize = 0;
+    for i in 0..list_arr.len() {
+        let inner = list_arr.value(i);
+        let sv = inner.as_string_view();
+        for j in 0..sv.len() {
+            total_len += sv.value(j).len();
+        }
+    }
+    std::hint::black_box(total_len);
+}
+
+fn bench_list_to_array_of_size(c: &mut Criterion) {
+    let mut group = c.benchmark_group("list_to_array_of_size");
+
+    let num_elements = 1245;
+    let scalar_1buf = make_list_scalar(num_elements, 1);
+    let scalar_50buf = make_list_scalar(num_elements, 50);
+
+    for batch_size in [256, 1024] {
+        group.bench_with_input(
+            BenchmarkId::new("1_buffer", batch_size),
+            &batch_size,
+            |b, &sz| {
+                b.iter(|| {
+                    let arr = scalar_1buf.to_array_of_size(sz).unwrap();
+                    consume_list_array(&arr);
+                });
+            },
+        );
+        group.bench_with_input(
+            BenchmarkId::new("50_buffers", batch_size),
+            &batch_size,
+            |b, &sz| {
+                b.iter(|| {
+                    let arr = scalar_50buf.to_array_of_size(sz).unwrap();
+                    consume_list_array(&arr);
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_list_to_array_of_size);
+criterion_main!(benches);
diff --git a/datafusion/common/benches/stats_merge.rs b/datafusion/common/benches/stats_merge.rs
new file mode 100644
index 0000000000000..73229b6379360
--- /dev/null
+++ b/datafusion/common/benches/stats_merge.rs
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for `Statistics::try_merge_iter`.
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
+
+/// Build a vector of `n` with `num_cols` columns
+fn make_stats(n: usize, num_cols: usize) -> Vec<Statistics> {
+    (0..n)
+        .map(|i| {
+            let mut stats = Statistics::default()
+                .with_num_rows(Precision::Exact(100 + i))
+                .with_total_byte_size(Precision::Exact(8000 + i * 80));
+            for c in 0..num_cols {
+                let base = (i * num_cols + c) as i64;
+                stats = stats.add_column_statistics(
+                    ColumnStatistics::new_unknown()
+                        .with_null_count(Precision::Exact(i))
+                        .with_min_value(Precision::Exact(ScalarValue::Int64(Some(base))))
+                        .with_max_value(Precision::Exact(ScalarValue::Int64(Some(
+                            base + 1000,
+                        ))))
+                        .with_sum_value(Precision::Exact(ScalarValue::Int64(Some(
+                            base * 100,
+                        )))),
+                );
+            }
+            stats
+        })
+        .collect()
+}
+
+fn bench_stats_merge(c: &mut Criterion) {
+    let mut group = c.benchmark_group("stats_merge");
+
+    for &num_partitions in &[10, 100, 500] {
+        for &num_cols in &[1, 5, 20] {
+            let items = make_stats(num_partitions, num_cols);
+            let schema = Arc::new(Schema::new(
+                (0..num_cols)
+                    .map(|i| Field::new(format!("col{i}"), DataType::Int64, true))
+                    .collect::<Vec<_>>(),
+            ));
+
+            let param = format!("{num_partitions}parts_{num_cols}cols");
+
+            group.bench_with_input(
+                BenchmarkId::new("try_merge_iter", &param),
+                &(&items, &schema),
+                |b, (items, schema)| {
+                    b.iter(|| {
+                        std::hint::black_box(
+                            Statistics::try_merge_iter(*items, schema).unwrap(),
+                        );
+                    });
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_stats_merge);
+criterion_main!(benches);
diff --git a/datafusion/common/benches/with_hashes.rs b/datafusion/common/benches/with_hashes.rs
new file mode 100644
index 0000000000000..0e9c53c896a5e
--- /dev/null
+++ b/datafusion/common/benches/with_hashes.rs
@@ -0,0 +1,569 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for `with_hashes` function
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray, Int32Array,
+    Int64Array, ListArray, MapArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveArray,
+    RunArray, StringViewArray, StructArray, UnionArray, make_array,
+};
+use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type, UnionFields,
+};
+use criterion::{Bencher, Criterion, criterion_group, criterion_main};
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::hash_utils::with_hashes;
+use rand::Rng;
+use rand::SeedableRng;
+use rand::distr::{Alphanumeric, Distribution, StandardUniform};
+use rand::prelude::StdRng;
+use std::sync::Arc;
+
+const BATCH_SIZE: usize = 8192;
+
+struct BenchData {
+    name: &'static str,
+    array: ArrayRef,
+    /// Union arrays can't have null bitmasks added
+    supports_nulls: bool,
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let pool = StringPool::new(100, 64);
+    // poll with small strings for string view tests (<=12 bytes are inlined)
+    let small_pool = StringPool::new(100, 5);
+    let cases = [
+        BenchData {
+            name: "int64",
+            array: primitive_array::<Int64Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8",
+            array: pool.string_array::<i32>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "large_utf8",
+            array: pool.string_array::<i64>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8_view",
+            array: pool.string_view_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "utf8_view (small)",
+            array: small_pool.string_view_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "dictionary_utf8_int32",
+            array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "list_array",
+            array: list_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "map_array",
+            array: map_array(BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "sparse_union",
+            array: sparse_union_array(BATCH_SIZE),
+            supports_nulls: false,
+        },
+        BenchData {
+            name: "dense_union",
+            array: dense_union_array(BATCH_SIZE),
+            supports_nulls: false,
+        },
+        BenchData {
+            name: "struct_array",
+            array: create_struct_array(&pool, BATCH_SIZE),
+            supports_nulls: true,
+        },
+        BenchData {
+            name: "run_array_int32",
+            array: create_run_array::<Int32Type>(BATCH_SIZE),
+            supports_nulls: true,
+        },
+    ];
+
+    for BenchData {
+        name,
+        array,
+        supports_nulls,
+    } in cases
+    {
+        c.bench_function(&format!("{name}: single, no nulls"), |b| {
+            do_hash_test(b, std::slice::from_ref(&array));
+        });
+        c.bench_function(&format!("{name}: multiple, no nulls"), |b| {
+            let arrays = vec![array.clone(), array.clone(), array.clone()];
+            do_hash_test(b, &arrays);
+        });
+        // Union arrays can't have null bitmasks
+        if supports_nulls {
+            let nullable_array = add_nulls(&array);
+            c.bench_function(&format!("{name}: single, nulls"), |b| {
+                do_hash_test(b, std::slice::from_ref(&nullable_array));
+            });
+            c.bench_function(&format!("{name}: multiple, nulls"), |b| {
+                let arrays = vec![
+                    nullable_array.clone(),
+                    nullable_array.clone(),
+                    nullable_array.clone(),
+                ];
+                do_hash_test(b, &arrays);
+            });
+        }
+    }
+}
+
+fn do_hash_test(b: &mut Bencher, arrays: &[ArrayRef]) {
+    let state = RandomState::default();
+    b.iter(|| {
+        with_hashes(arrays, &state, |hashes| {
+            assert_eq!(hashes.len(), BATCH_SIZE); // make sure the result is used
+            Ok(())
+        })
+        .unwrap();
+    });
+}
+
+fn create_null_mask(len: usize) -> NullBuffer
+where
+    StandardUniform: Distribution<bool>,
+{
+    let mut rng = make_rng();
+    let null_density = 0.03;
+    let mut builder = NullBufferBuilder::new(len);
+    for _ in 0..len {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            builder.append_non_null();
+        }
+    }
+    builder.finish().expect("should be nulls in buffer")
+}
+
+// Returns a new array that is the same as array, but with nulls
+// Handles the special case of RunArray where nulls must be in the values array
+fn add_nulls(array: &ArrayRef) -> ArrayRef {
+    use arrow::datatypes::DataType;
+
+    match array.data_type() {
+        DataType::RunEndEncoded(_, _) => {
+            // RunArray can't have top-level nulls, so apply nulls to the values array
+            let run_array = array
+                .as_any()
+                .downcast_ref::<RunArray<Int32Type>>()
+                .expect("Expected RunArray");
+
+            let run_ends_buffer = run_array.run_ends().inner().clone();
+            let run_ends_array = PrimitiveArray::<Int32Type>::new(run_ends_buffer, None);
+            let values = run_array.values().clone();
+
+            // Add nulls to the values array
+            let values_with_nulls = {
+                let array_data = values
+                    .clone()
+                    .into_data()
+                    .into_builder()
+                    .nulls(Some(create_null_mask(values.len())))
+                    .build()
+                    .unwrap();
+                make_array(array_data)
+            };
+
+            Arc::new(
+                RunArray::try_new(&run_ends_array, values_with_nulls.as_ref())
+                    .expect("Failed to create RunArray with null values"),
+            )
+        }
+        _ => {
+            let array_data = array
+                .clone()
+                .into_data()
+                .into_builder()
+                .nulls(Some(create_null_mask(array.len())))
+                .build()
+                .unwrap();
+            make_array(array_data)
+        }
+    }
+}
+
+pub fn make_rng() -> StdRng {
+    StdRng::seed_from_u64(42)
+}
+
+/// String pool for generating low cardinality data (for dictionaries and string views)
+struct StringPool {
+    strings: Vec<String>,
+}
+
+impl StringPool {
+    /// Create a new string pool with the given number of random strings
+    /// each having between 1 and max_length characters.
+    fn new(pool_size: usize, max_length: usize) -> Self {
+        let mut rng = make_rng();
+        let mut strings = Vec::with_capacity(pool_size);
+        for _ in 0..pool_size {
+            let len = rng.random_range(1..=max_length);
+            let value: Vec<u8> =
+                rng.clone().sample_iter(&Alphanumeric).take(len).collect();
+            strings.push(String::from_utf8(value).unwrap());
+        }
+        Self { strings }
+    }
+
+    /// Return an iterator over &str of the given length with values randomly chosen from the pool
+    fn iter_strings(&self, len: usize) -> impl Iterator<Item = &str> {
+        let mut rng = make_rng();
+        (0..len).map(move |_| {
+            let idx = rng.random_range(0..self.strings.len());
+            self.strings[idx].as_str()
+        })
+    }
+
+    /// Return a StringArray of the given length with values randomly chosen from the pool
+    fn string_array<O: OffsetSizeTrait>(&self, array_length: usize) -> ArrayRef {
+        Arc::new(GenericStringArray::<O>::from_iter_values(
+            self.iter_strings(array_length),
+        ))
+    }
+
+    /// Return a StringViewArray of the given length with values randomly chosen from the pool
+    fn string_view_array(&self, array_length: usize) -> ArrayRef {
+        Arc::new(StringViewArray::from_iter_values(
+            self.iter_strings(array_length),
+        ))
+    }
+
+    /// Return a DictionaryArray of the given length with values randomly chosen from the pool
+    fn dictionary_array<T: ArrowDictionaryKeyType>(
+        &self,
+        array_length: usize,
+    ) -> ArrayRef {
+        Arc::new(DictionaryArray::<T>::from_iter(
+            self.iter_strings(array_length),
+        ))
+    }
+}
+
+pub fn primitive_array<T>(array_len: usize) -> ArrayRef
+where
+    T: ArrowPrimitiveType,
+    StandardUniform: Distribution<T::Native>,
+{
+    let mut rng = make_rng();
+
+    let array: PrimitiveArray<T> = (0..array_len)
+        .map(|_| Some(rng.random::<T::Native>()))
+        .collect();
+    Arc::new(array)
+}
+
+/// Benchmark sliced arrays to demonstrate the optimization for when an array is
+/// sliced, the underlying buffer may be much larger than what's referenced by
+/// the slice. The optimization avoids hashing unreferenced elements.
+fn sliced_array_benchmark(c: &mut Criterion) {
+    // Test with different slice ratios: slice_size / total_size
+    // Smaller ratio = more potential savings from the optimization
+    let slice_ratios = [10, 5, 2]; // 1/10, 1/5, 1/2 of total
+
+    for ratio in slice_ratios {
+        let total_rows = BATCH_SIZE * ratio;
+        let slice_offset = BATCH_SIZE * (ratio / 2); // Take from middle
+        let slice_len = BATCH_SIZE;
+
+        // Sliced ListArray
+        {
+            let full_array = list_array(total_rows);
+            let sliced: ArrayRef = Arc::new(
+                full_array
+                    .as_any()
+                    .downcast_ref::<ListArray>()
+                    .unwrap()
+                    .slice(slice_offset, slice_len),
+            );
+            c.bench_function(
+                &format!("list_array_sliced: 1/{ratio} of {total_rows} rows"),
+                |b| {
+                    do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
+                },
+            );
+        }
+
+        // Sliced MapArray
+        {
+            let full_array = map_array(total_rows);
+            let sliced: ArrayRef = Arc::new(
+                full_array
+                    .as_any()
+                    .downcast_ref::<MapArray>()
+                    .unwrap()
+                    .slice(slice_offset, slice_len),
+            );
+            c.bench_function(
+                &format!("map_array_sliced: 1/{ratio} of {total_rows} rows"),
+                |b| {
+                    do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
+                },
+            );
+        }
+
+        // Sliced Sparse UnionArray
+        {
+            let full_array = sparse_union_array(total_rows);
+            let sliced: ArrayRef = Arc::new(
+                full_array
+                    .as_any()
+                    .downcast_ref::<UnionArray>()
+                    .unwrap()
+                    .slice(slice_offset, slice_len),
+            );
+            c.bench_function(
+                &format!("sparse_union_sliced: 1/{ratio} of {total_rows} rows"),
+                |b| {
+                    do_hash_test_with_len(b, std::slice::from_ref(&sliced), slice_len);
+                },
+            );
+        }
+    }
+}
+
+fn do_hash_test_with_len(b: &mut Bencher, arrays: &[ArrayRef], expected_len: usize) {
+    let state = RandomState::default();
+    b.iter(|| {
+        with_hashes(arrays, &state, |hashes| {
+            assert_eq!(hashes.len(), expected_len);
+            Ok(())
+        })
+        .unwrap();
+    });
+}
+
+fn list_array(num_rows: usize) -> ArrayRef {
+    let mut rng = make_rng();
+    let elements_per_row = 5;
+    let total_elements = num_rows * elements_per_row;
+
+    let values: Int64Array = (0..total_elements)
+        .map(|_| Some(rng.random::<i64>()))
+        .collect();
+    let offsets: Vec<i32> = (0..=num_rows)
+        .map(|i| (i * elements_per_row) as i32)
+        .collect();
+
+    Arc::new(ListArray::new(
+        Arc::new(Field::new("item", DataType::Int64, true)),
+        OffsetBuffer::new(ScalarBuffer::from(offsets)),
+        Arc::new(values),
+        None,
+    ))
+}
+
+fn map_array(num_rows: usize) -> ArrayRef {
+    let mut rng = make_rng();
+    let entries_per_row = 5;
+    let total_entries = num_rows * entries_per_row;
+
+    let keys: Int32Array = (0..total_entries)
+        .map(|_| Some(rng.random::<i32>()))
+        .collect();
+    let values: Int64Array = (0..total_entries)
+        .map(|_| Some(rng.random::<i64>()))
+        .collect();
+    let offsets: Vec<i32> = (0..=num_rows)
+        .map(|i| (i * entries_per_row) as i32)
+        .collect();
+
+    let entries = StructArray::try_new(
+        Fields::from(vec![
+            Field::new("keys", DataType::Int32, false),
+            Field::new("values", DataType::Int64, true),
+        ]),
+        vec![Arc::new(keys), Arc::new(values)],
+        None,
+    )
+    .unwrap();
+
+    Arc::new(MapArray::new(
+        Arc::new(Field::new(
+            "entries",
+            DataType::Struct(Fields::from(vec![
+                Field::new("keys", DataType::Int32, false),
+                Field::new("values", DataType::Int64, true),
+            ])),
+            false,
+        )),
+        OffsetBuffer::new(ScalarBuffer::from(offsets)),
+        entries,
+        None,
+        false,
+    ))
+}
+
+fn sparse_union_array(num_rows: usize) -> ArrayRef {
+    let mut rng = make_rng();
+    let num_types = 5;
+
+    let type_ids: Vec<i8> = (0..num_rows)
+        .map(|_| rng.random_range(0..num_types) as i8)
+        .collect();
+    let (fields, children): (Vec<_>, Vec<_>) = (0..num_types)
+        .map(|i| {
+            (
+                (
+                    i as i8,
+                    Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)),
+                ),
+                primitive_array::<Int64Type>(num_rows),
+            )
+        })
+        .unzip();
+
+    Arc::new(
+        UnionArray::try_new(
+            UnionFields::from_iter(fields),
+            ScalarBuffer::from(type_ids),
+            None,
+            children,
+        )
+        .unwrap(),
+    )
+}
+
+fn dense_union_array(num_rows: usize) -> ArrayRef {
+    let mut rng = make_rng();
+    let num_types = 5;
+    let type_ids: Vec<i8> = (0..num_rows)
+        .map(|_| rng.random_range(0..num_types) as i8)
+        .collect();
+
+    let mut type_counts = vec![0i32; num_types];
+    for &tid in &type_ids {
+        type_counts[tid as usize] += 1;
+    }
+
+    let mut current_offsets = vec![0i32; num_types];
+    let offsets: Vec<i32> = type_ids
+        .iter()
+        .map(|&tid| {
+            let offset = current_offsets[tid as usize];
+            current_offsets[tid as usize] += 1;
+            offset
+        })
+        .collect();
+
+    let (fields, children): (Vec<_>, Vec<_>) = (0..num_types)
+        .map(|i| {
+            (
+                (
+                    i as i8,
+                    Arc::new(Field::new(format!("f{i}"), DataType::Int64, true)),
+                ),
+                primitive_array::<Int64Type>(type_counts[i] as usize),
+            )
+        })
+        .unzip();
+
+    Arc::new(
+        UnionArray::try_new(
+            UnionFields::from_iter(fields),
+            ScalarBuffer::from(type_ids),
+            Some(ScalarBuffer::from(offsets)),
+            children,
+        )
+        .unwrap(),
+    )
+}
+
+fn boolean_array(array_len: usize) -> ArrayRef {
+    let mut rng = make_rng();
+    Arc::new(
+        (0..array_len)
+            .map(|_| Some(rng.random::<bool>()))
+            .collect::<arrow::array::BooleanArray>(),
+    )
+}
+
+/// Create a StructArray with multiple columns
+fn create_struct_array(pool: &StringPool, array_len: usize) -> ArrayRef {
+    let bool_array = boolean_array(array_len);
+    let int32_array = primitive_array::<Int32Type>(array_len);
+    let int64_array = primitive_array::<Int64Type>(array_len);
+    let str_array = pool.string_array::<i32>(array_len);
+
+    let fields = Fields::from(vec![
+        Field::new("bool_col", DataType::Boolean, false),
+        Field::new("int32_col", DataType::Int32, false),
+        Field::new("int64_col", DataType::Int64, false),
+        Field::new("string_col", DataType::Utf8, false),
+    ]);
+
+    Arc::new(StructArray::new(
+        fields,
+        vec![bool_array, int32_array, int64_array, str_array],
+        None,
+    ))
+}
+
+/// Create a RunArray to test run array hashing.
+fn create_run_array<T>(array_len: usize) -> ArrayRef
+where
+    T: ArrowPrimitiveType,
+    StandardUniform: Distribution<T::Native>,
+{
+    let mut rng = make_rng();
+
+    // Create runs of varying lengths
+    let mut run_ends = Vec::new();
+    let mut values = Vec::new();
+    let mut current_end = 0;
+
+    while current_end < array_len {
+        // Random run length between 1 and 50
+        let run_length = rng.random_range(1..=50).min(array_len - current_end);
+        current_end += run_length;
+        run_ends.push(current_end as i32);
+        values.push(Some(rng.random::<T::Native>()));
+    }
+
+    let run_ends_array = Arc::new(PrimitiveArray::<Int32Type>::from(run_ends));
+    let values_array: Arc<dyn Array> =
+        Arc::new(values.into_iter().collect::<PrimitiveArray<T>>());
+
+    Arc::new(
+        RunArray::try_new(&run_ends_array, values_array.as_ref())
+            .expect("Failed to create RunArray"),
+    )
+}
+
+criterion_group!(benches, criterion_benchmark, sliced_array_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs
index 2ee2cb4dc7add..99f6447a6acd8 100644
--- a/datafusion/common/src/alias.rs
+++ b/datafusion/common/src/alias.rs
@@ -37,6 +37,16 @@ impl AliasGenerator {
         Self::default()
     }
 
+    /// Advance the counter to at least `min_id`, ensuring future aliases
+    /// won't collide with already-existing ones.
+    ///
+    /// For example, if the query already contains an alias `alias_42`, then calling
+    /// `update_min_id(42)` will ensure that future aliases generated by this
+    /// [`AliasGenerator`] will start from `alias_43`.
+    pub fn update_min_id(&self, min_id: usize) {
+        self.next_id.fetch_max(min_id + 1, Ordering::Relaxed);
+    }
+
     /// Return a unique alias with the provided prefix
     pub fn next(&self, prefix: &str) -> String {
         let id = self.next_id.fetch_add(1, Ordering::Relaxed);
diff --git a/datafusion/common/src/cast.rs b/datafusion/common/src/cast.rs
index 28202c6684b50..bc4313ed95665 100644
--- a/datafusion/common/src/cast.rs
+++ b/datafusion/common/src/cast.rs
@@ -20,11 +20,14 @@
 //! but provide an error message rather than a panic, as the corresponding
 //! kernels in arrow-rs such as `as_boolean_array` do.
 
-use crate::{downcast_value, Result};
+use crate::{Result, downcast_value};
 use arrow::array::{
-    BinaryViewArray, Float16Array, Int16Array, Int8Array, LargeBinaryArray,
-    LargeStringArray, StringViewArray, UInt16Array,
+    BinaryViewArray, Decimal32Array, Decimal64Array, DurationMicrosecondArray,
+    DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array,
+    Int8Array, Int16Array, LargeBinaryArray, LargeListViewArray, LargeStringArray,
+    ListViewArray, RunArray, StringViewArray, UInt16Array,
 };
+use arrow::datatypes::RunEndIndexType;
 use arrow::{
     array::{
         Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
@@ -35,254 +38,305 @@ use arrow::{
         MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray,
         Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
         Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray,
-        TimestampNanosecondArray, TimestampSecondArray, UInt32Array, UInt64Array,
-        UInt8Array, UnionArray,
+        TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt32Array,
+        UInt64Array, UnionArray,
     },
     datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType},
 };
 
-// Downcast ArrayRef to Date32Array
+// Downcast Array to Date32Array
 pub fn as_date32_array(array: &dyn Array) -> Result<&Date32Array> {
     Ok(downcast_value!(array, Date32Array))
 }
 
-// Downcast ArrayRef to Date64Array
+// Downcast Array to Date64Array
 pub fn as_date64_array(array: &dyn Array) -> Result<&Date64Array> {
     Ok(downcast_value!(array, Date64Array))
 }
 
-// Downcast ArrayRef to StructArray
+// Downcast Array to StructArray
 pub fn as_struct_array(array: &dyn Array) -> Result<&StructArray> {
     Ok(downcast_value!(array, StructArray))
 }
 
-// Downcast ArrayRef to Int8Array
+// Downcast Array to Int8Array
 pub fn as_int8_array(array: &dyn Array) -> Result<&Int8Array> {
     Ok(downcast_value!(array, Int8Array))
 }
 
-// Downcast ArrayRef to UInt8Array
+// Downcast Array to UInt8Array
 pub fn as_uint8_array(array: &dyn Array) -> Result<&UInt8Array> {
     Ok(downcast_value!(array, UInt8Array))
 }
 
-// Downcast ArrayRef to Int16Array
+// Downcast Array to Int16Array
 pub fn as_int16_array(array: &dyn Array) -> Result<&Int16Array> {
     Ok(downcast_value!(array, Int16Array))
 }
 
-// Downcast ArrayRef to UInt16Array
+// Downcast Array to UInt16Array
 pub fn as_uint16_array(array: &dyn Array) -> Result<&UInt16Array> {
     Ok(downcast_value!(array, UInt16Array))
 }
 
-// Downcast ArrayRef to Int32Array
+// Downcast Array to Int32Array
 pub fn as_int32_array(array: &dyn Array) -> Result<&Int32Array> {
     Ok(downcast_value!(array, Int32Array))
 }
 
-// Downcast ArrayRef to UInt32Array
+// Downcast Array to UInt32Array
 pub fn as_uint32_array(array: &dyn Array) -> Result<&UInt32Array> {
     Ok(downcast_value!(array, UInt32Array))
 }
 
-// Downcast ArrayRef to Int64Array
+// Downcast Array to Int64Array
 pub fn as_int64_array(array: &dyn Array) -> Result<&Int64Array> {
     Ok(downcast_value!(array, Int64Array))
 }
 
-// Downcast ArrayRef to UInt64Array
+// Downcast Array to UInt64Array
 pub fn as_uint64_array(array: &dyn Array) -> Result<&UInt64Array> {
     Ok(downcast_value!(array, UInt64Array))
 }
 
-// Downcast ArrayRef to Decimal128Array
+// Downcast Array to Decimal32Array
+pub fn as_decimal32_array(array: &dyn Array) -> Result<&Decimal32Array> {
+    Ok(downcast_value!(array, Decimal32Array))
+}
+
+// Downcast Array to Decimal64Array
+pub fn as_decimal64_array(array: &dyn Array) -> Result<&Decimal64Array> {
+    Ok(downcast_value!(array, Decimal64Array))
+}
+
+// Downcast Array to Decimal128Array
 pub fn as_decimal128_array(array: &dyn Array) -> Result<&Decimal128Array> {
     Ok(downcast_value!(array, Decimal128Array))
 }
 
-// Downcast ArrayRef to Decimal256Array
+// Downcast Array to Decimal256Array
 pub fn as_decimal256_array(array: &dyn Array) -> Result<&Decimal256Array> {
     Ok(downcast_value!(array, Decimal256Array))
 }
 
-// Downcast ArrayRef to Float16Array
+// Downcast Array to Float16Array
 pub fn as_float16_array(array: &dyn Array) -> Result<&Float16Array> {
     Ok(downcast_value!(array, Float16Array))
 }
 
-// Downcast ArrayRef to Float32Array
+// Downcast Array to Float32Array
 pub fn as_float32_array(array: &dyn Array) -> Result<&Float32Array> {
     Ok(downcast_value!(array, Float32Array))
 }
 
-// Downcast ArrayRef to Float64Array
+// Downcast Array to Float64Array
 pub fn as_float64_array(array: &dyn Array) -> Result<&Float64Array> {
     Ok(downcast_value!(array, Float64Array))
 }
 
-// Downcast ArrayRef to StringArray
+// Downcast Array to StringArray
 pub fn as_string_array(array: &dyn Array) -> Result<&StringArray> {
     Ok(downcast_value!(array, StringArray))
 }
 
-// Downcast ArrayRef to StringViewArray
+// Downcast Array to StringViewArray
 pub fn as_string_view_array(array: &dyn Array) -> Result<&StringViewArray> {
     Ok(downcast_value!(array, StringViewArray))
 }
 
-// Downcast ArrayRef to LargeStringArray
+// Downcast Array to LargeStringArray
 pub fn as_large_string_array(array: &dyn Array) -> Result<&LargeStringArray> {
     Ok(downcast_value!(array, LargeStringArray))
 }
 
-// Downcast ArrayRef to BooleanArray
+// Downcast Array to BooleanArray
 pub fn as_boolean_array(array: &dyn Array) -> Result<&BooleanArray> {
     Ok(downcast_value!(array, BooleanArray))
 }
 
-// Downcast ArrayRef to ListArray
+// Downcast Array to ListArray
 pub fn as_list_array(array: &dyn Array) -> Result<&ListArray> {
     Ok(downcast_value!(array, ListArray))
 }
 
-// Downcast ArrayRef to DictionaryArray
+// Downcast Array to DictionaryArray
 pub fn as_dictionary_array<T: ArrowDictionaryKeyType>(
     array: &dyn Array,
 ) -> Result<&DictionaryArray<T>> {
     Ok(downcast_value!(array, DictionaryArray, T))
 }
 
-// Downcast ArrayRef to GenericBinaryArray
+// Downcast Array to GenericBinaryArray
 pub fn as_generic_binary_array<T: OffsetSizeTrait>(
     array: &dyn Array,
 ) -> Result<&GenericBinaryArray<T>> {
     Ok(downcast_value!(array, GenericBinaryArray, T))
 }
 
-// Downcast ArrayRef to GenericListArray
+// Downcast Array to GenericListArray
 pub fn as_generic_list_array<T: OffsetSizeTrait>(
     array: &dyn Array,
 ) -> Result<&GenericListArray<T>> {
     Ok(downcast_value!(array, GenericListArray, T))
 }
 
-// Downcast ArrayRef to LargeListArray
+// Downcast Array to LargeListArray
 pub fn as_large_list_array(array: &dyn Array) -> Result<&LargeListArray> {
     Ok(downcast_value!(array, LargeListArray))
 }
 
-// Downcast ArrayRef to PrimitiveArray
+// Downcast Array to PrimitiveArray
 pub fn as_primitive_array<T: ArrowPrimitiveType>(
     array: &dyn Array,
 ) -> Result<&PrimitiveArray<T>> {
     Ok(downcast_value!(array, PrimitiveArray, T))
 }
 
-// Downcast ArrayRef to MapArray
+// Downcast Array to MapArray
 pub fn as_map_array(array: &dyn Array) -> Result<&MapArray> {
     Ok(downcast_value!(array, MapArray))
 }
 
-// Downcast ArrayRef to NullArray
+// Downcast Array to NullArray
 pub fn as_null_array(array: &dyn Array) -> Result<&NullArray> {
     Ok(downcast_value!(array, NullArray))
 }
 
-// Downcast ArrayRef to NullArray
+// Downcast Array to NullArray
 pub fn as_union_array(array: &dyn Array) -> Result<&UnionArray> {
     Ok(downcast_value!(array, UnionArray))
 }
 
-// Downcast ArrayRef to Time32SecondArray
+// Downcast Array to Time32SecondArray
 pub fn as_time32_second_array(array: &dyn Array) -> Result<&Time32SecondArray> {
     Ok(downcast_value!(array, Time32SecondArray))
 }
 
-// Downcast ArrayRef to Time32MillisecondArray
+// Downcast Array to Time32MillisecondArray
 pub fn as_time32_millisecond_array(array: &dyn Array) -> Result<&Time32MillisecondArray> {
     Ok(downcast_value!(array, Time32MillisecondArray))
 }
 
-// Downcast ArrayRef to Time64MicrosecondArray
+// Downcast Array to Time64MicrosecondArray
 pub fn as_time64_microsecond_array(array: &dyn Array) -> Result<&Time64MicrosecondArray> {
     Ok(downcast_value!(array, Time64MicrosecondArray))
 }
 
-// Downcast ArrayRef to Time64NanosecondArray
+// Downcast Array to Time64NanosecondArray
 pub fn as_time64_nanosecond_array(array: &dyn Array) -> Result<&Time64NanosecondArray> {
     Ok(downcast_value!(array, Time64NanosecondArray))
 }
 
-// Downcast ArrayRef to TimestampNanosecondArray
+// Downcast Array to TimestampNanosecondArray
 pub fn as_timestamp_nanosecond_array(
     array: &dyn Array,
 ) -> Result<&TimestampNanosecondArray> {
     Ok(downcast_value!(array, TimestampNanosecondArray))
 }
 
-// Downcast ArrayRef to TimestampMillisecondArray
+// Downcast Array to TimestampMillisecondArray
 pub fn as_timestamp_millisecond_array(
     array: &dyn Array,
 ) -> Result<&TimestampMillisecondArray> {
     Ok(downcast_value!(array, TimestampMillisecondArray))
 }
 
-// Downcast ArrayRef to TimestampMicrosecondArray
+// Downcast Array to TimestampMicrosecondArray
 pub fn as_timestamp_microsecond_array(
     array: &dyn Array,
 ) -> Result<&TimestampMicrosecondArray> {
     Ok(downcast_value!(array, TimestampMicrosecondArray))
 }
 
-// Downcast ArrayRef to TimestampSecondArray
+// Downcast Array to TimestampSecondArray
 pub fn as_timestamp_second_array(array: &dyn Array) -> Result<&TimestampSecondArray> {
     Ok(downcast_value!(array, TimestampSecondArray))
 }
 
-// Downcast ArrayRef to IntervalYearMonthArray
+// Downcast Array to IntervalYearMonthArray
 pub fn as_interval_ym_array(array: &dyn Array) -> Result<&IntervalYearMonthArray> {
     Ok(downcast_value!(array, IntervalYearMonthArray))
 }
 
-// Downcast ArrayRef to IntervalDayTimeArray
+// Downcast Array to IntervalDayTimeArray
 pub fn as_interval_dt_array(array: &dyn Array) -> Result<&IntervalDayTimeArray> {
     Ok(downcast_value!(array, IntervalDayTimeArray))
 }
 
-// Downcast ArrayRef to IntervalMonthDayNanoArray
+// Downcast Array to IntervalMonthDayNanoArray
 pub fn as_interval_mdn_array(array: &dyn Array) -> Result<&IntervalMonthDayNanoArray> {
     Ok(downcast_value!(array, IntervalMonthDayNanoArray))
 }
 
-// Downcast ArrayRef to BinaryArray
+// Downcast Array to DurationSecondArray
+pub fn as_duration_second_array(array: &dyn Array) -> Result<&DurationSecondArray> {
+    Ok(downcast_value!(array, DurationSecondArray))
+}
+
+// Downcast Array to DurationMillisecondArray
+pub fn as_duration_millisecond_array(
+    array: &dyn Array,
+) -> Result<&DurationMillisecondArray> {
+    Ok(downcast_value!(array, DurationMillisecondArray))
+}
+
+// Downcast Array to DurationMicrosecondArray
+pub fn as_duration_microsecond_array(
+    array: &dyn Array,
+) -> Result<&DurationMicrosecondArray> {
+    Ok(downcast_value!(array, DurationMicrosecondArray))
+}
+
+// Downcast Array to DurationNanosecondArray
+pub fn as_duration_nanosecond_array(
+    array: &dyn Array,
+) -> Result<&DurationNanosecondArray> {
+    Ok(downcast_value!(array, DurationNanosecondArray))
+}
+
+// Downcast Array to BinaryArray
 pub fn as_binary_array(array: &dyn Array) -> Result<&BinaryArray> {
     Ok(downcast_value!(array, BinaryArray))
 }
 
-// Downcast ArrayRef to BinaryViewArray
+// Downcast Array to BinaryViewArray
 pub fn as_binary_view_array(array: &dyn Array) -> Result<&BinaryViewArray> {
     Ok(downcast_value!(array, BinaryViewArray))
 }
 
-// Downcast ArrayRef to LargeBinaryArray
+// Downcast Array to LargeBinaryArray
 pub fn as_large_binary_array(array: &dyn Array) -> Result<&LargeBinaryArray> {
     Ok(downcast_value!(array, LargeBinaryArray))
 }
 
-// Downcast ArrayRef to FixedSizeListArray
+// Downcast Array to FixedSizeListArray
 pub fn as_fixed_size_list_array(array: &dyn Array) -> Result<&FixedSizeListArray> {
     Ok(downcast_value!(array, FixedSizeListArray))
 }
 
-// Downcast ArrayRef to FixedSizeListArray
+// Downcast Array to FixedSizeBinaryArray
 pub fn as_fixed_size_binary_array(array: &dyn Array) -> Result<&FixedSizeBinaryArray> {
     Ok(downcast_value!(array, FixedSizeBinaryArray))
 }
 
-// Downcast ArrayRef to GenericBinaryArray
+// Downcast Array to GenericBinaryArray
 pub fn as_generic_string_array<T: OffsetSizeTrait>(
     array: &dyn Array,
 ) -> Result<&GenericStringArray<T>> {
     Ok(downcast_value!(array, GenericStringArray, T))
 }
+
+// Downcast Array to ListViewArray
+pub fn as_list_view_array(array: &dyn Array) -> Result<&ListViewArray> {
+    Ok(downcast_value!(array, ListViewArray))
+}
+
+// Downcast Array to LargeListViewArray
+pub fn as_large_list_view_array(array: &dyn Array) -> Result<&LargeListViewArray> {
+    Ok(downcast_value!(array, LargeListViewArray))
+}
+
+// Downcast Array to RunArray
+pub fn as_run_array<T: RunEndIndexType>(array: &dyn Array) -> Result<&RunArray<T>> {
+    Ok(downcast_value!(array, RunArray, T))
+}
diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index b3acaeee5a54c..c7f0b5a4f4881 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -18,13 +18,12 @@
 //! Column
 
 use crate::error::{_schema_err, add_possible_columns_to_diag};
-use crate::utils::{parse_identifiers_normalized, quote_identifier};
+use crate::utils::parse_identifiers_normalized;
+use crate::utils::quote_identifier;
 use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference};
 use arrow::datatypes::{Field, FieldRef};
 use std::collections::HashSet;
-use std::convert::Infallible;
 use std::fmt;
-use std::str::FromStr;
 
 /// A named reference to a qualified field in a schema.
 #[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
@@ -140,6 +139,7 @@ impl Column {
     }
 
     /// Deserialize a fully qualified name string into a column preserving column text case
+    #[cfg(feature = "sql")]
     pub fn from_qualified_name_ignore_case(flat_name: impl Into<String>) -> Self {
         let flat_name = flat_name.into();
         Self::from_idents(parse_identifiers_normalized(&flat_name, true)).unwrap_or_else(
@@ -151,6 +151,11 @@ impl Column {
         )
     }
 
+    #[cfg(not(feature = "sql"))]
+    pub fn from_qualified_name_ignore_case(flat_name: impl Into<String>) -> Self {
+        Self::from_qualified_name(flat_name)
+    }
+
     /// return the column's name.
     ///
     /// Note: This ignores the relation and returns the column name only.
@@ -262,7 +267,7 @@ impl Column {
 
                     // If not due to USING columns then due to ambiguous column name
                     return _schema_err!(SchemaError::AmbiguousReference {
-                        field: Column::new_unqualified(&self.name),
+                        field: Box::new(Column::new_unqualified(&self.name)),
                     })
                     .map_err(|err| {
                         let mut diagnostic = Diagnostic::new_error(
@@ -356,8 +361,9 @@ impl From<(Option<&TableReference>, &FieldRef)> for Column {
     }
 }
 
-impl FromStr for Column {
-    type Err = Infallible;
+#[cfg(feature = "sql")]
+impl std::str::FromStr for Column {
+    type Err = std::convert::Infallible;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         Ok(s.into())
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 726015d171496..9b6e6aa5dac37 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -17,15 +17,25 @@
 
 //! Runtime configuration, via [`ConfigOptions`]
 
+use arrow_ipc::CompressionType;
+
+#[cfg(feature = "parquet_encryption")]
+use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties};
 use crate::error::_config_err;
+use crate::format::{ExplainAnalyzeLevel, ExplainFormat};
+use crate::parquet_config::DFParquetWriterVersion;
 use crate::parsers::CompressionTypeVariant;
 use crate::utils::get_available_parallelism;
 use crate::{DataFusionError, Result};
+#[cfg(feature = "parquet_encryption")]
+use hex;
 use std::any::Any;
 use std::collections::{BTreeMap, HashMap};
 use std::error::Error;
 use std::fmt::{self, Display};
 use std::str::FromStr;
+#[cfg(feature = "parquet_encryption")]
+use std::sync::Arc;
 
 /// A macro that wraps a configuration struct and automatically derives
 /// [`Default`] and [`ConfigField`] for it, allowing it to be used
@@ -48,7 +58,7 @@ use std::str::FromStr;
 ///        /// Field 3 doc
 ///        field3: Option<usize>, default = None
 ///    }
-///}
+/// }
 /// ```
 ///
 /// Will generate
@@ -148,12 +158,10 @@ macro_rules! config_namespace {
                             // $(#[allow(deprecated)])?
                             {
                                 $(let value = $transform(value);)? // Apply transformation if specified
-                                #[allow(deprecated)]
                                 let ret = self.$field_name.set(rem, value.as_ref());
 
                                 $(if !$warn.is_empty() {
                                     let default: $field_type = $default;
-                                    #[allow(deprecated)]
                                     if default != self.$field_name {
                                         log::warn!($warn);
                                     }
@@ -172,14 +180,36 @@ macro_rules! config_namespace {
                 $(
                     let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                     let desc = concat!($($d),*).trim();
-                    #[allow(deprecated)]
                     self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
+
+            fn reset(&mut self, key: &str) -> $crate::error::Result<()> {
+                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+                match key {
+                    $(
+                        stringify!($field_name) => {
+                                    {
+                                if rem.is_empty() {
+                                    let default_value: $field_type = $default;
+                                    self.$field_name = default_value;
+                                    Ok(())
+                                } else {
+                                    self.$field_name.reset(rem)
+                                }
+                            }
+                        },
+                    )*
+                    _ => $crate::error::_config_err!(
+                        "Config value \"{}\" not found on {}",
+                        key,
+                        stringify!($struct_name)
+                    ),
+                }
+            }
         }
         impl Default for $struct_name {
             fn default() -> Self {
-                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -250,7 +280,7 @@ config_namespace! {
 
         /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
         /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
-        pub dialect: String, default = "generic".to_string()
+        pub dialect: Dialect, default = Dialect::Generic
         // no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive
 
         /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
@@ -259,10 +289,10 @@ config_namespace! {
         /// string length and thus DataFusion can not enforce such limits.
         pub support_varchar_with_length: bool, default = true
 
-       /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning.
-       /// If false, `VARCHAR` is mapped to `Utf8`  during SQL planning.
-       /// Default is false.
-        pub map_varchar_to_utf8view: bool, default = true
+        /// If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
+        /// If false, they are mapped to `Utf8`.
+        /// Default is true.
+        pub map_string_types_to_utf8view: bool, default = true
 
         /// When set to true, the source locations relative to the original SQL
         /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected
@@ -271,6 +301,159 @@ config_namespace! {
 
         /// Specifies the recursion depth limit when parsing complex SQL Queries
         pub recursion_limit: usize, default = 50
+
+        /// Specifies the default null ordering for query results. There are 4 options:
+        /// - `nulls_max`: Nulls appear last in ascending order.
+        /// - `nulls_min`: Nulls appear first in ascending order.
+        /// - `nulls_first`: Nulls always be first in any order.
+        /// - `nulls_last`: Nulls always be last in any order.
+        ///
+        /// By default, `nulls_max` is used to follow Postgres's behavior.
+        /// postgres rule: <https://www.postgresql.org/docs/current/queries-order.html>
+        pub default_null_ordering: String, default = "nulls_max".to_string()
+    }
+}
+
+/// This is the SQL dialect used by DataFusion's parser.
+/// This mirrors [sqlparser::dialect::Dialect](https://docs.rs/sqlparser/latest/sqlparser/dialect/trait.Dialect.html)
+/// trait in order to offer an easier API and avoid adding the `sqlparser` dependency
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub enum Dialect {
+    #[default]
+    Generic,
+    MySQL,
+    PostgreSQL,
+    Hive,
+    SQLite,
+    Snowflake,
+    Redshift,
+    MsSQL,
+    ClickHouse,
+    BigQuery,
+    Ansi,
+    DuckDB,
+    Databricks,
+}
+
+impl AsRef<str> for Dialect {
+    fn as_ref(&self) -> &str {
+        match self {
+            Self::Generic => "generic",
+            Self::MySQL => "mysql",
+            Self::PostgreSQL => "postgresql",
+            Self::Hive => "hive",
+            Self::SQLite => "sqlite",
+            Self::Snowflake => "snowflake",
+            Self::Redshift => "redshift",
+            Self::MsSQL => "mssql",
+            Self::ClickHouse => "clickhouse",
+            Self::BigQuery => "bigquery",
+            Self::Ansi => "ansi",
+            Self::DuckDB => "duckdb",
+            Self::Databricks => "databricks",
+        }
+    }
+}
+
+impl FromStr for Dialect {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let value = match s.to_ascii_lowercase().as_str() {
+            "generic" => Self::Generic,
+            "mysql" => Self::MySQL,
+            "postgresql" | "postgres" => Self::PostgreSQL,
+            "hive" => Self::Hive,
+            "sqlite" => Self::SQLite,
+            "snowflake" => Self::Snowflake,
+            "redshift" => Self::Redshift,
+            "mssql" => Self::MsSQL,
+            "clickhouse" => Self::ClickHouse,
+            "bigquery" => Self::BigQuery,
+            "ansi" => Self::Ansi,
+            "duckdb" => Self::DuckDB,
+            "databricks" => Self::Databricks,
+            other => {
+                let error_message = format!(
+                    "Invalid Dialect: {other}. Expected one of: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB, Databricks"
+                );
+                return Err(DataFusionError::Configuration(error_message));
+            }
+        };
+        Ok(value)
+    }
+}
+
+impl ConfigField for Dialect {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = Self::from_str(value)?;
+        Ok(())
+    }
+}
+
+impl Display for Dialect {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let str = self.as_ref();
+        write!(f, "{str}")
+    }
+}
+
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub enum SpillCompression {
+    Zstd,
+    Lz4Frame,
+    #[default]
+    Uncompressed,
+}
+
+impl FromStr for SpillCompression {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_ascii_lowercase().as_str() {
+            "zstd" => Ok(Self::Zstd),
+            "lz4_frame" => Ok(Self::Lz4Frame),
+            "uncompressed" | "" => Ok(Self::Uncompressed),
+            other => Err(DataFusionError::Configuration(format!(
+                "Invalid Spill file compression type: {other}. Expected one of: zstd, lz4_frame, uncompressed"
+            ))),
+        }
+    }
+}
+
+impl ConfigField for SpillCompression {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = SpillCompression::from_str(value)?;
+        Ok(())
+    }
+}
+
+impl Display for SpillCompression {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let str = match self {
+            Self::Zstd => "zstd",
+            Self::Lz4Frame => "lz4_frame",
+            Self::Uncompressed => "uncompressed",
+        };
+        write!(f, "{str}")
+    }
+}
+
+impl From<SpillCompression> for Option<CompressionType> {
+    fn from(c: SpillCompression) -> Self {
+        match c {
+            SpillCompression::Zstd => Some(CompressionType::ZSTD),
+            SpillCompression::Lz4Frame => Some(CompressionType::LZ4_FRAME),
+            SpillCompression::Uncompressed => None,
+        }
     }
 }
 
@@ -286,6 +469,25 @@ config_namespace! {
         /// metadata memory consumption
         pub batch_size: usize, default = 8192
 
+        /// A perfect hash join (see `HashJoinExec` for more details) will be considered
+        /// if the range of keys (max - min) on the build side is < this threshold.
+        /// This provides a fast path for joins with very small key ranges,
+        /// bypassing the density check.
+        ///
+        /// Currently only supports cases where build_side.num_rows() < u32::MAX.
+        /// Support for build_side.num_rows() >= u32::MAX will be added in the future.
+        pub perfect_hash_join_small_build_threshold: usize, default = 1024
+
+        /// The minimum required density of join keys on the build side to consider a
+        /// perfect hash join (see `HashJoinExec` for more details). Density is calculated as:
+        /// `(number of rows) / (max_key - min_key + 1)`.
+        /// A perfect hash join may be used if the actual key density > this
+        /// value.
+        ///
+        /// Currently only supports cases where build_side.num_rows() < u32::MAX.
+        /// Support for build_side.num_rows() >= u32::MAX will be added in the future.
+        pub perfect_hash_join_min_key_density: f64, default = 0.15
+
         /// When set to true, record batches will be examined between each operator and
         /// small batches will be coalesced into larger batches. This is helpful when there
         /// are highly selective filters or joins that could produce tiny output batches. The
@@ -294,8 +496,8 @@ config_namespace! {
 
         /// Should DataFusion collect statistics when first creating a table.
         /// Has no effect after the table is created. Applies to the default
-        /// `ListingTableProvider` in DataFusion. Defaults to false.
-        pub collect_statistics: bool, default = false
+        /// `ListingTableProvider` in DataFusion. Defaults to true.
+        pub collect_statistics: bool, default = true
 
         /// Number of partitions for query execution. Increasing partitions can increase
         /// concurrency.
@@ -305,9 +507,8 @@ config_namespace! {
 
         /// The default time zone
         ///
-        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
-        /// according to this time zone, and then extract the hour
-        pub time_zone: Option<String>, default = Some("+00:00".into())
+        /// Some functions, e.g. `now` return timestamps in this time zone
+        pub time_zone: Option<String>, default = None
 
         /// Parquet options
         pub parquet: ParquetOptions, default = Default::default()
@@ -330,6 +531,16 @@ config_namespace! {
         /// the new schema verification step.
         pub skip_physical_aggregate_schema_check: bool, default = false
 
+        /// Sets the compression codec used when spilling data to disk.
+        ///
+        /// Since datafusion writes spill files using the Arrow IPC Stream format,
+        /// only codecs supported by the Arrow IPC Stream Writer are allowed.
+        /// Valid values are: uncompressed, lz4_frame, zstd.
+        /// Note: lz4_frame offers faster (de)compression, but typically results in
+        /// larger spill files. In contrast, zstd achieves
+        /// higher compression ratios at the cost of slower (de)compression speed.
+        pub spill_compression: SpillCompression, default = SpillCompression::Uncompressed
+
         /// Specifies the reserved memory for each spillable sort operation to
         /// facilitate an in-memory merge.
         ///
@@ -346,6 +557,23 @@ config_namespace! {
         /// batches and merged.
         pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024
 
+        /// Maximum size in bytes for individual spill files before rotating to a new file.
+        ///
+        /// When operators spill data to disk (e.g., RepartitionExec), they write
+        /// multiple batches to the same file until this size limit is reached, then rotate
+        /// to a new file. This reduces syscall overhead compared to one-file-per-batch
+        /// while preventing files from growing too large.
+        ///
+        /// A larger value reduces file creation overhead but may hold more disk space.
+        /// A smaller value creates more files but allows finer-grained space reclamation
+        /// as files can be deleted once fully consumed.
+        ///
+        /// Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators
+        /// may create spill files larger than the limit.
+        ///
+        /// Default: 128 MB
+        pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024
+
         /// Number of files to read in parallel when inferring schema and statistics
         pub meta_fetch_concurrency: usize, default = 32
 
@@ -373,6 +601,11 @@ config_namespace! {
         /// tables (e.g. `/table/year=2021/month=01/data.parquet`).
         pub listing_table_ignore_subdirectory: bool, default = true
 
+        /// Should a `ListingTable` created through the `ListingTableFactory` infer table
+        /// partitions from Hive compliant directories. Defaults to true (partition columns are
+        /// inferred and will be represented in the table schema).
+        pub listing_table_factory_infer_partitions: bool, default = true
+
         /// Should DataFusion support recursive CTEs
         pub enable_recursive_ctes: bool, default = true
 
@@ -413,6 +646,44 @@ config_namespace! {
         /// written, it may be necessary to increase this size to avoid errors from
         /// the remote end point.
         pub objectstore_writer_buffer_size: usize, default = 10 * 1024 * 1024
+
+        /// Whether to enable ANSI SQL mode.
+        ///
+        /// The flag is experimental and relevant only for DataFusion Spark built-in functions
+        ///
+        /// When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL
+        /// semantics for expressions, casting, and error handling. This means:
+        /// - **Strict type coercion rules:** implicit casts between incompatible types are disallowed.
+        /// - **Standard SQL arithmetic behavior:** operations such as division by zero,
+        ///   numeric overflow, or invalid casts raise runtime errors rather than returning
+        ///   `NULL` or adjusted values.
+        /// - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling.
+        ///
+        /// When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive,
+        /// non-ANSI mode designed for user convenience and backward compatibility. In this mode:
+        /// - Implicit casts between types are allowed (e.g., string to integer when possible).
+        /// - Arithmetic operations are more lenient — for example, `abs()` on the minimum
+        ///   representable integer value returns the input value instead of raising overflow.
+        /// - Division by zero or invalid casts may return `NULL` instead of failing.
+        ///
+        /// # Default
+        /// `false` — ANSI SQL mode is disabled by default.
+        pub enable_ansi_mode: bool, default = false
+
+        /// How many bytes to buffer in the probe side of hash joins while the build side is
+        /// concurrently being built.
+        ///
+        /// Without this, hash joins will wait until the full materialization of the build side
+        /// before polling the probe side. This is useful in scenarios where the query is not
+        /// completely CPU bounded, allowing to do some early work concurrently and reducing the
+        /// latency of the query.
+        ///
+        /// Note that when hash join buffering is enabled, the probe side will start eagerly
+        /// polling data, not giving time for the producer side of dynamic filters to produce any
+        /// meaningful predicate. Queries with dynamic filters might see performance degradation.
+        ///
+        /// Disabled by default, set to a number greater than 0 for enabling it.
+        pub hash_join_buffering_capacity: usize, default = 0
     }
 }
 
@@ -444,7 +715,10 @@ config_namespace! {
         /// bytes of the parquet file optimistically. If not specified, two reads are required:
         /// One read to fetch the 8-byte parquet footer and
         /// another to fetch the metadata length encoded in the footer
-        pub metadata_size_hint: Option<usize>, default = None
+        /// Default setting to 512 KiB, which should be sufficient for most parquet files,
+        /// it can reduce one I/O operation per parquet file. If the metadata is larger than
+        /// the hint, two reads will still be performed.
+        pub metadata_size_hint: Option<usize>, default = Some(512 * 1024)
 
         /// (reading) If true, filter expressions are be applied during the parquet decoding operation to
         /// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
@@ -455,6 +729,12 @@ config_namespace! {
         /// the filters are applied in the same order as written in the query
         pub reorder_filters: bool, default = false
 
+        /// (reading) Force the use of RowSelections for filter results, when
+        /// pushdown_filters is enabled. If false, the reader will automatically
+        /// choose between a RowSelection and a Bitmap based on the number and
+        /// pattern of selected rows.
+        pub force_filter_selections: bool, default = false
+
         /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
         /// and `Binary/BinaryLarge` with `BinaryView`.
         pub schema_force_view_types: bool, default = true
@@ -478,18 +758,26 @@ config_namespace! {
         /// (reading) Use any available bloom filters when reading parquet files
         pub bloom_filter_on_read: bool, default = true
 
+        /// (reading) The maximum predicate cache size, in bytes. When
+        /// `pushdown_filters` is enabled, sets the maximum memory used to cache
+        /// the results of predicate evaluation between filter evaluation and
+        /// output generation. Decreasing this value will reduce memory usage,
+        /// but may increase IO and CPU usage. None means use the default
+        /// parquet reader setting. 0 means no caching.
+        pub max_predicate_cache_size: Option<usize>, default = None
+
         // The following options affect writing to parquet files
         // and map to parquet::file::properties::WriterProperties
 
         /// (writing) Sets best effort maximum size of data page in bytes
         pub data_pagesize_limit: usize, default = 1024 * 1024
 
-        /// (writing) Sets write_batch_size in bytes
+        /// (writing) Sets write_batch_size in rows
         pub write_batch_size: usize, default = 1024
 
         /// (writing) Sets parquet writer version
         /// valid values are "1.0" and "2.0"
-        pub writer_version: String, default = "1.0".to_string()
+        pub writer_version: DFParquetWriterVersion, default = DFParquetWriterVersion::default()
 
         /// (writing) Skip encoding the embedded arrow metadata in the KV_meta
         ///
@@ -499,7 +787,7 @@ config_namespace! {
 
         /// (writing) Sets default parquet compression codec.
         /// Valid values are: uncompressed, snappy, gzip(level),
-        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
+        /// brotli(level), lz4, zstd(level), and lz4_raw.
         /// These values are not case sensitive. If NULL, uses
         /// default parquet writer setting
         ///
@@ -520,13 +808,6 @@ config_namespace! {
         /// default parquet writer setting
         pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
 
-        /// (writing) Sets max statistics size for any column. If NULL, uses
-        /// default parquet writer setting
-        /// max_statistics_size is deprecated, currently it is not being used
-        // TODO: remove once deprecated
-        #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
-        pub max_statistics_size: Option<usize>, default = Some(4096)
-
         /// (writing) Target maximum number of rows in each row group (defaults to 1M
         /// rows). Writing larger row groups requires more memory to write, but
         /// can get better compression and be faster to read.
@@ -538,9 +819,9 @@ config_namespace! {
         /// (writing) Sets column index truncate length
         pub column_index_truncate_length: Option<usize>, default = Some(64)
 
-        /// (writing) Sets statictics truncate length. If NULL, uses
+        /// (writing) Sets statistics truncate length. If NULL, uses
         /// default parquet writer setting
-        pub statistics_truncate_length: Option<usize>, default = None
+        pub statistics_truncate_length: Option<usize>, default = Some(64)
 
         /// (writing) Sets best effort maximum number of rows in data page
         pub data_page_row_count_limit: usize, default = 20_000
@@ -594,6 +875,44 @@ config_namespace! {
     }
 }
 
+config_namespace! {
+    /// Options for configuring Parquet Modular Encryption
+    ///
+    /// To use Parquet encryption, you must enable the `parquet_encryption` feature flag, as it is not activated by default.
+    pub struct ParquetEncryptionOptions {
+        /// Optional file decryption properties
+        pub file_decryption: Option<ConfigFileDecryptionProperties>, default = None
+
+        /// Optional file encryption properties
+        pub file_encryption: Option<ConfigFileEncryptionProperties>, default = None
+
+        /// Identifier for the encryption factory to use to create file encryption and decryption properties.
+        /// Encryption factories can be registered in the runtime environment with
+        /// `RuntimeEnv::register_parquet_encryption_factory`.
+        pub factory_id: Option<String>, default = None
+
+        /// Any encryption factory specific options
+        pub factory_options: EncryptionFactoryOptions, default = EncryptionFactoryOptions::default()
+    }
+}
+
+impl ParquetEncryptionOptions {
+    /// Specify the encryption factory to use for Parquet modular encryption, along with its configuration
+    pub fn configure_factory(
+        &mut self,
+        factory_id: &str,
+        config: &impl ExtensionOptions,
+    ) {
+        self.factory_id = Some(factory_id.to_owned());
+        self.factory_options.options.clear();
+        for entry in config.entries() {
+            if let Some(value) = entry.value {
+                self.factory_options.options.insert(entry.key, value);
+            }
+        }
+    }
+}
+
 config_namespace! {
     /// Options related to query optimization
     ///
@@ -614,6 +933,36 @@ config_namespace! {
         /// during aggregations, if possible
         pub enable_topk_aggregation: bool, default = true
 
+        /// When set to true, the optimizer will attempt to push limit operations
+        /// past window functions, if possible
+        pub enable_window_limits: bool, default = true
+
+        /// When set to true, the optimizer will push TopK (Sort with fetch)
+        /// below hash repartition when the partition key is a prefix of the
+        /// sort key, reducing data volume before the shuffle.
+        pub enable_topk_repartition: bool, default = true
+
+        /// When set to true, the optimizer will attempt to push down TopK dynamic filters
+        /// into the file scan phase.
+        pub enable_topk_dynamic_filter_pushdown: bool, default = true
+
+        /// When set to true, the optimizer will attempt to push down Join dynamic filters
+        /// into the file scan phase.
+        pub enable_join_dynamic_filter_pushdown: bool, default = true
+
+        /// When set to true, the optimizer will attempt to push down Aggregate dynamic filters
+        /// into the file scan phase.
+        pub enable_aggregate_dynamic_filter_pushdown: bool, default = true
+
+        /// When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase.
+        /// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer
+        /// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans.
+        /// This means that if we already have 10 timestamps in the year 2025
+        /// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan.
+        /// The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown`
+        /// So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.
+        pub enable_dynamic_filter_pushdown: bool, default = true
+
         /// When set to true, the optimizer will insert filters before a join between
         /// a nullable and non-nullable column to filter out nulls on the nullable side. This
         /// filter can add additional overhead when the file format does not fully support
@@ -656,6 +1005,19 @@ config_namespace! {
         /// record tables provided to the MemTable on creation.
         pub repartition_file_scans: bool, default = true
 
+        /// Minimum number of distinct partition values required to group files by their
+        /// Hive partition column values (enabling Hash partitioning declaration).
+        ///
+        /// How the option is used:
+        ///     - preserve_file_partitions=0: Disable it.
+        ///     - preserve_file_partitions=1: Always enable it.
+        ///     - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N.
+        ///     This threshold preserves I/O parallelism when file partitioning is below it.
+        ///
+        /// Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct
+        /// partitions is less than the target_partitions.
+        pub preserve_file_partitions: usize, default = 0
+
         /// Should DataFusion repartition data using the partitions keys to execute window
         /// functions in parallel using the provided `target_partitions` level
         pub repartition_windows: bool, default = true
@@ -678,6 +1040,34 @@ config_namespace! {
         /// ```
         pub repartition_sorts: bool, default = true
 
+        /// Partition count threshold for subset satisfaction optimization.
+        ///
+        /// When the current partition count is >= this threshold, DataFusion will
+        /// skip repartitioning if the required partitioning expression is a subset
+        /// of the current partition expression such as Hash(a) satisfies Hash(a, b).
+        ///
+        /// When the current partition count is < this threshold, DataFusion will
+        /// repartition to increase parallelism even when subset satisfaction applies.
+        ///
+        /// Set to 0 to always repartition (disable subset satisfaction optimization).
+        /// Set to a high value to always use subset satisfaction.
+        ///
+        /// Example (subset_repartition_threshold = 4):
+        /// ```text
+        ///     Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a])
+        ///
+        ///     If current partitions (3) < threshold (4), repartition:
+        ///     AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)]
+        ///       RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3
+        ///         AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)]
+        ///           DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3)
+        ///
+        ///     If current partitions (8) >= threshold (4), use subset satisfaction:
+        ///     AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)]
+        ///       DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8)
+        /// ```
+        pub subset_repartition_threshold: usize, default = 4
+
         /// When true, DataFusion will opportunistically remove sorts when the data is already sorted,
         /// (i.e. setting `preserve_order` to true on `RepartitionExec`  and
         /// using `SortPreservingMergeExec`)
@@ -702,6 +1092,11 @@ config_namespace! {
         /// HashJoin can work more efficiently than SortMergeJoin but consumes more memory
         pub prefer_hash_join: bool, default = true
 
+        /// When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently
+        /// experimental. Physical planner will opt for PiecewiseMergeJoin when there is only
+        /// one range filter.
+        pub enable_piecewise_merge_join: bool, default = false
+
         /// The maximum estimated size in bytes for one input side of a HashJoin
         /// will be collected into a single partition
         pub hash_join_single_partition_threshold: usize, default = 1024 * 1024
@@ -710,6 +1105,36 @@ config_namespace! {
         /// will be collected into a single partition
         pub hash_join_single_partition_threshold_rows: usize, default = 1024 * 128
 
+        /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+        /// Build sides larger than this will use hash table lookups instead.
+        /// Set to 0 to always use hash table lookups.
+        ///
+        /// InList pushdown can be more efficient for small build sides because it can result in better
+        /// statistics pruning as well as use any bloom filters present on the scan side.
+        /// InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion.
+        /// On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory.
+        ///
+        /// This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory.
+        ///
+        /// The default is 128kB per partition.
+        /// This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases
+        /// but avoids excessive memory usage or overhead for larger joins.
+        pub hash_join_inlist_pushdown_max_size: usize, default = 128 * 1024
+
+        /// Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
+        /// Build sides with more rows than this will use hash table lookups instead.
+        /// Set to 0 to always use hash table lookups.
+        ///
+        /// This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent
+        /// very large IN lists that might not provide much benefit over hash table lookups.
+        ///
+        /// This uses the deduplicated row count once the build side has been evaluated.
+        ///
+        /// The default is 150 values per partition.
+        /// This is inspired by Trino's `max-filter-keys-per-column` setting.
+        /// See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds>
+        pub hash_join_inlist_pushdown_max_distinct_values: usize, default = 150
+
         /// The default filter selectivity used by Filter Statistics
         /// when an exact selectivity cannot be determined. Valid values are
         /// between 0 (no selectivity) and 100 (all rows are selected).
@@ -722,6 +1147,27 @@ config_namespace! {
         /// then the output will be coerced to a non-view.
         /// Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
         pub expand_views_at_output: bool, default = false
+
+        /// Enable sort pushdown optimization.
+        /// When enabled, attempts to push sort requirements down to data sources
+        /// that can natively handle them (e.g., by reversing file/row group read order).
+        ///
+        /// Returns **inexact ordering**: Sort operator is kept for correctness,
+        /// but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N),
+        /// providing significant speedup.
+        ///
+        /// Memory: No additional overhead (only changes read order).
+        ///
+        /// Future: Will add option to detect perfectly sorted data and eliminate Sort completely.
+        ///
+        /// Default: true
+        pub enable_sort_pushdown: bool, default = true
+
+        /// When set to true, the optimizer will extract leaf expressions
+        /// (such as `get_field`) from filter/sort/join nodes into projections
+        /// closer to the leaf table scans, and push those projections down
+        /// towards the leaf nodes.
+        pub enable_leaf_expression_pushdown: bool, default = true
     }
 }
 
@@ -750,7 +1196,16 @@ config_namespace! {
 
         /// Display format of explain. Default is "indent".
         /// When set to "tree", it will print the plan in a tree-rendered format.
-        pub format: String, default = "indent".to_string()
+        pub format: ExplainFormat, default = ExplainFormat::Indent
+
+        /// (format=tree only) Maximum total width of the rendered tree.
+        /// When set to 0, the tree will have no width limit.
+        pub tree_maximum_render_width: usize, default = 240
+
+        /// Verbosity level for "EXPLAIN ANALYZE". Default is "dev"
+        /// "summary" shows common metrics for high-level insights.
+        /// "dev" provides deep operator-level introspection for developers.
+        pub analyze_level: ExplainAnalyzeLevel, default = ExplainAnalyzeLevel::Dev
     }
 }
 
@@ -803,7 +1258,7 @@ impl<'a> TryInto<arrow::util::display::FormatOptions<'a>> for &'a FormatOptions
                 return _config_err!(
                     "Invalid duration format: {}. Valid values are pretty or iso8601",
                     self.duration_format
-                )
+                );
             }
         };
 
@@ -821,7 +1276,7 @@ impl<'a> TryInto<arrow::util::display::FormatOptions<'a>> for &'a FormatOptions
 }
 
 /// A key value pair, with a corresponding description
-#[derive(Debug)]
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
 pub struct ConfigEntry {
     /// A unique string to identify this config value
     pub key: String,
@@ -854,6 +1309,15 @@ pub struct ConfigOptions {
 }
 
 impl ConfigField for ConfigOptions {
+    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
+        self.catalog.visit(v, "datafusion.catalog", "");
+        self.execution.visit(v, "datafusion.execution", "");
+        self.optimizer.visit(v, "datafusion.optimizer", "");
+        self.explain.visit(v, "datafusion.explain", "");
+        self.sql_parser.visit(v, "datafusion.sql_parser", "");
+        self.format.visit(v, "datafusion.format", "");
+    }
+
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
         // Extensions are handled in the public `ConfigOptions::set`
         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
@@ -868,16 +1332,50 @@ impl ConfigField for ConfigOptions {
         }
     }
 
-    fn visit<V: Visit>(&self, v: &mut V, _key_prefix: &str, _description: &'static str) {
-        self.catalog.visit(v, "datafusion.catalog", "");
-        self.execution.visit(v, "datafusion.execution", "");
-        self.optimizer.visit(v, "datafusion.optimizer", "");
-        self.explain.visit(v, "datafusion.explain", "");
-        self.sql_parser.visit(v, "datafusion.sql_parser", "");
-        self.format.visit(v, "datafusion.format", "");
+    /// Reset a configuration option back to its default value
+    fn reset(&mut self, key: &str) -> Result<()> {
+        let Some((prefix, rest)) = key.split_once('.') else {
+            return _config_err!("could not find config namespace for key \"{key}\"");
+        };
+
+        if prefix != "datafusion" {
+            return _config_err!("Could not find config namespace \"{prefix}\"");
+        }
+
+        let (section, rem) = rest.split_once('.').unwrap_or((rest, ""));
+        if rem.is_empty() {
+            return _config_err!("could not find config field for key \"{key}\"");
+        }
+
+        match section {
+            "catalog" => self.catalog.reset(rem),
+            "execution" => self.execution.reset(rem),
+            "optimizer" => {
+                if rem == "enable_dynamic_filter_pushdown" {
+                    let defaults = OptimizerOptions::default();
+                    self.optimizer.enable_dynamic_filter_pushdown =
+                        defaults.enable_dynamic_filter_pushdown;
+                    self.optimizer.enable_topk_dynamic_filter_pushdown =
+                        defaults.enable_topk_dynamic_filter_pushdown;
+                    self.optimizer.enable_join_dynamic_filter_pushdown =
+                        defaults.enable_join_dynamic_filter_pushdown;
+                    Ok(())
+                } else {
+                    self.optimizer.reset(rem)
+                }
+            }
+            "explain" => self.explain.reset(rem),
+            "sql_parser" => self.sql_parser.reset(rem),
+            "format" => self.format.reset(rem),
+            other => _config_err!("Config value \"{other}\" not found on ConfigOptions"),
+        }
     }
 }
 
+/// This namespace is reserved for interacting with Foreign Function Interface
+/// (FFI) based configuration extensions.
+pub const DATAFUSION_FFI_CONFIG_NAMESPACE: &str = "datafusion_ffi";
+
 impl ConfigOptions {
     /// Creates a new [`ConfigOptions`] with default values
     pub fn new() -> Self {
@@ -892,25 +1390,62 @@ impl ConfigOptions {
 
     /// Set a configuration option
     pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        let Some((prefix, key)) = key.split_once('.') else {
+        let Some((mut prefix, mut inner_key)) = key.split_once('.') else {
             return _config_err!("could not find config namespace for key \"{key}\"");
         };
 
         if prefix == "datafusion" {
-            return ConfigField::set(self, key, value);
+            if inner_key == "optimizer.enable_dynamic_filter_pushdown" {
+                let bool_value = value.parse::<bool>().map_err(|e| {
+                    DataFusionError::Configuration(format!(
+                        "Failed to parse '{value}' as bool: {e}",
+                    ))
+                })?;
+
+                {
+                    self.optimizer.enable_dynamic_filter_pushdown = bool_value;
+                    self.optimizer.enable_topk_dynamic_filter_pushdown = bool_value;
+                    self.optimizer.enable_join_dynamic_filter_pushdown = bool_value;
+                    self.optimizer.enable_aggregate_dynamic_filter_pushdown = bool_value;
+                }
+                return Ok(());
+            }
+            return ConfigField::set(self, inner_key, value);
+        }
+
+        if !self.extensions.0.contains_key(prefix)
+            && self
+                .extensions
+                .0
+                .contains_key(DATAFUSION_FFI_CONFIG_NAMESPACE)
+        {
+            inner_key = key;
+            prefix = DATAFUSION_FFI_CONFIG_NAMESPACE;
         }
 
         let Some(e) = self.extensions.0.get_mut(prefix) else {
             return _config_err!("Could not find config namespace \"{prefix}\"");
         };
-        e.0.set(key, value)
+        e.0.set(inner_key, value)
     }
 
-    /// Create new ConfigOptions struct, taking values from
-    /// environment variables where possible.
+    /// Create new [`ConfigOptions`], taking values from environment variables
+    /// where possible.
+    ///
+    /// For example, to configure `datafusion.execution.batch_size`
+    /// ([`ExecutionOptions::batch_size`]) you would set the
+    /// `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
     ///
-    /// For example, setting `DATAFUSION_EXECUTION_BATCH_SIZE` will
-    /// control `datafusion.execution.batch_size`.
+    /// The name of the environment variable is the option's key, transformed to
+    /// uppercase and with periods replaced with underscores.
+    ///
+    /// Values are parsed according to the [same rules used in casts from
+    /// Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
+    ///
+    /// If the value in the environment variable cannot be cast to the type of
+    /// the configuration option, the default value will be used instead and a
+    /// warning emitted. Environment variables are read when this method is
+    /// called, and are not re-read later.
     pub fn from_env() -> Result<Self> {
         struct Visitor(Vec<String>);
 
@@ -1046,36 +1581,35 @@ impl ConfigOptions {
 /// # Example
 /// ```
 /// use datafusion_common::{
-///     config::ConfigExtension, extensions_options,
-///     config::ConfigOptions,
+///     config::ConfigExtension, config::ConfigOptions, extensions_options,
 /// };
-///  // Define a new configuration struct using the `extensions_options` macro
-///  extensions_options! {
-///     /// My own config options.
-///     pub struct MyConfig {
-///         /// Should "foo" be replaced by "bar"?
-///         pub foo_to_bar: bool, default = true
+/// // Define a new configuration struct using the `extensions_options` macro
+/// extensions_options! {
+///    /// My own config options.
+///    pub struct MyConfig {
+///        /// Should "foo" be replaced by "bar"?
+///        pub foo_to_bar: bool, default = true
 ///
-///         /// How many "baz" should be created?
-///         pub baz_count: usize, default = 1337
-///     }
-///  }
+///        /// How many "baz" should be created?
+///        pub baz_count: usize, default = 1337
+///    }
+/// }
 ///
-///  impl ConfigExtension for MyConfig {
+/// impl ConfigExtension for MyConfig {
 ///     const PREFIX: &'static str = "my_config";
-///  }
+/// }
 ///
-///  // set up config struct and register extension
-///  let mut config = ConfigOptions::default();
-///  config.extensions.insert(MyConfig::default());
+/// // set up config struct and register extension
+/// let mut config = ConfigOptions::default();
+/// config.extensions.insert(MyConfig::default());
 ///
-///  // overwrite config default
-///  config.set("my_config.baz_count", "42").unwrap();
+/// // overwrite config default
+/// config.set("my_config.baz_count", "42").unwrap();
 ///
-///  // check config state
-///  let my_config = config.extensions.get::<MyConfig>().unwrap();
-///  assert!(my_config.foo_to_bar,);
-///  assert_eq!(my_config.baz_count, 42,);
+/// // check config state
+/// let my_config = config.extensions.get::<MyConfig>().unwrap();
+/// assert!(my_config.foo_to_bar,);
+/// assert_eq!(my_config.baz_count, 42,);
 /// ```
 ///
 /// # Note:
@@ -1142,6 +1676,14 @@ impl Extensions {
         let e = self.0.get_mut(T::PREFIX)?;
         e.0.as_any_mut().downcast_mut()
     }
+
+    /// Iterates all the config extension entries yielding their prefix and their
+    /// [ExtensionOptions] implementation.
+    pub fn iter(
+        &self,
+    ) -> impl Iterator<Item = (&'static str, &Box<dyn ExtensionOptions>)> {
+        self.0.iter().map(|(k, v)| (*k, &v.0))
+    }
 }
 
 #[derive(Debug)]
@@ -1159,6 +1701,10 @@ pub trait ConfigField {
     fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str);
 
     fn set(&mut self, key: &str, value: &str) -> Result<()>;
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        _config_err!("Reset is not supported for this config field, key: {}", key)
+    }
 }
 
 impl<F: ConfigField + Default> ConfigField for Option<F> {
@@ -1172,9 +1718,21 @@ impl<F: ConfigField + Default> ConfigField for Option<F> {
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
         self.get_or_insert_with(Default::default).set(key, value)
     }
+
+    fn reset(&mut self, key: &str) -> Result<()> {
+        if key.is_empty() {
+            *self = Default::default();
+            Ok(())
+        } else {
+            self.get_or_insert_with(Default::default).reset(key)
+        }
+    }
 }
 
-fn default_transform<T>(input: &str) -> Result<T>
+/// Default transformation to parse a [`ConfigField`] for a string.
+///
+/// This uses [`FromStr`] to parse the data.
+pub fn default_config_transform<T>(input: &str) -> Result<T>
 where
     T: FromStr,
     <T as FromStr>::Err: Sync + Send + Error + 'static,
@@ -1191,31 +1749,71 @@ where
     })
 }
 
+/// Macro that generates [`ConfigField`] for a given type.
+///
+/// # Usage
+/// This always requires [`Display`] to be implemented for the given type.
+///
+/// There are two ways to invoke this macro. The first one uses
+/// [`default_config_transform`]/[`FromStr`] to parse the data:
+///
+/// ```ignore
+/// config_field(MyType);
+/// ```
+///
+/// Note that the parsing error MUST implement [`std::error::Error`]!
+///
+/// Or you can specify how you want to parse an [`str`] into the type:
+///
+/// ```ignore
+/// fn parse_it(s: &str) -> Result<MyType> {
+///     ...
+/// }
+///
+/// config_field(
+///     MyType,
+///     value => parse_it(value)
+/// );
+/// ```
 #[macro_export]
 macro_rules! config_field {
     ($t:ty) => {
-        config_field!($t, value => default_transform(value)?);
+        config_field!($t, value => $crate::config::default_config_transform(value)?);
     };
 
     ($t:ty, $arg:ident => $transform:expr) => {
-        impl ConfigField for $t {
-            fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        impl $crate::config::ConfigField for $t {
+            fn visit<V: $crate::config::Visit>(&self, v: &mut V, key: &str, description: &'static str) {
                 v.some(key, self, description)
             }
 
-            fn set(&mut self, _: &str, $arg: &str) -> Result<()> {
+            fn set(&mut self, _: &str, $arg: &str) -> $crate::error::Result<()> {
                 *self = $transform;
                 Ok(())
             }
+
+            fn reset(&mut self, key: &str) -> $crate::error::Result<()> {
+                if key.is_empty() {
+                    *self = <$t as Default>::default();
+                    Ok(())
+                } else {
+                    $crate::error::_config_err!(
+                        "Config field is a scalar {} and does not have nested field \"{}\"",
+                        stringify!($t),
+                        key
+                    )
+                }
+            }
         }
     };
 }
 
 config_field!(String);
-config_field!(bool, value => default_transform(value.to_lowercase().as_str())?);
+config_field!(bool, value => default_config_transform(value.to_lowercase().as_str())?);
 config_field!(usize);
 config_field!(f64);
 config_field!(u64);
+config_field!(u32);
 
 impl ConfigField for u8 {
     fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
@@ -1406,8 +2004,7 @@ macro_rules! extensions_options {
                             // Safely apply deprecated attribute if present
                             // $(#[allow(deprecated)])?
                             {
-                                #[allow(deprecated)]
-                                self.$field_name.set(rem, value.as_ref())
+                                            self.$field_name.set(rem, value.as_ref())
                             }
                         },
                     )*
@@ -1421,7 +2018,6 @@ macro_rules! extensions_options {
                 $(
                     let key = stringify!($field_name).to_string();
                     let desc = concat!($($d),*).trim();
-                    #[allow(deprecated)]
                     self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
@@ -1595,7 +2191,7 @@ impl TableOptions {
     ///
     /// A result indicating success or failure in setting the configuration option.
     pub fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        let Some((prefix, _)) = key.split_once('.') else {
+        let Some((mut prefix, _)) = key.split_once('.') else {
             return _config_err!("could not find config namespace for key \"{key}\"");
         };
 
@@ -1607,6 +2203,15 @@ impl TableOptions {
             return Ok(());
         }
 
+        if !self.extensions.0.contains_key(prefix)
+            && self
+                .extensions
+                .0
+                .contains_key(DATAFUSION_FFI_CONFIG_NAMESPACE)
+        {
+            prefix = DATAFUSION_FFI_CONFIG_NAMESPACE;
+        }
+
         let Some(e) = self.extensions.0.get_mut(prefix) else {
             return _config_err!("Could not find config namespace \"{prefix}\"");
         };
@@ -1692,7 +2297,7 @@ impl TableOptions {
 /// Options that control how Parquet files are read, including global options
 /// that apply to all columns and optional column-specific overrides
 ///
-/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions).
+/// Closely tied to `ParquetWriterOptions` (see `crate::file_options::parquet_writer::ParquetWriterOptions` when the "parquet" feature is enabled).
 /// Properties not included in [`TableParquetOptions`] may not be configurable at the external API
 /// (e.g. sorting_columns).
 #[derive(Clone, Default, Debug, PartialEq)]
@@ -1716,6 +2321,26 @@ pub struct TableParquetOptions {
     /// )
     /// ```
     pub key_value_metadata: HashMap<String, Option<String>>,
+    /// Options for configuring Parquet modular encryption
+    ///
+    /// To use Parquet encryption, you must enable the `parquet_encryption` feature flag, as it is not activated by default.
+    /// See ConfigFileEncryptionProperties and ConfigFileDecryptionProperties in datafusion/common/src/config.rs
+    /// These can be set via 'format.crypto', for example:
+    /// ```sql
+    /// OPTIONS (
+    ///    'format.crypto.file_encryption.encrypt_footer' 'true',
+    ///    'format.crypto.file_encryption.footer_key_as_hex' '30313233343536373839303132333435',  -- b"0123456789012345" */
+    ///    'format.crypto.file_encryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    ///    'format.crypto.file_encryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+    ///     -- Same for decryption
+    ///    'format.crypto.file_decryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345"
+    ///    'format.crypto.file_decryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    ///    'format.crypto.file_decryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+    /// )
+    /// ```
+    /// See datafusion-cli/tests/sql/encrypted_parquet.sql for a more complete example.
+    /// Note that keys must be provided as in hex format since these are binary strings.
+    pub crypto: ParquetEncryptionOptions,
 }
 
 impl TableParquetOptions {
@@ -1737,13 +2362,52 @@ impl TableParquetOptions {
             ..self
         }
     }
+
+    /// Retrieves all configuration entries from this `TableParquetOptions`.
+    ///
+    /// # Returns
+    ///
+    /// A vector of `ConfigEntry` instances, representing all the configuration options within this
+    pub fn entries(self: &TableParquetOptions) -> Vec<ConfigEntry> {
+        struct Visitor(Vec<ConfigEntry>);
+
+        impl Visit for Visitor {
+            fn some<V: Display>(
+                &mut self,
+                key: &str,
+                value: V,
+                description: &'static str,
+            ) {
+                self.0.push(ConfigEntry {
+                    key: key[1..].to_string(),
+                    value: Some(value.to_string()),
+                    description,
+                })
+            }
+
+            fn none(&mut self, key: &str, description: &'static str) {
+                self.0.push(ConfigEntry {
+                    key: key[1..].to_string(),
+                    value: None,
+                    description,
+                })
+            }
+        }
+
+        let mut v = Visitor(vec![]);
+        self.visit(&mut v, "", "");
+
+        v.0
+    }
 }
 
 impl ConfigField for TableParquetOptions {
     fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, description: &'static str) {
         self.global.visit(v, key_prefix, description);
         self.column_specific_options
-            .visit(v, key_prefix, description)
+            .visit(v, key_prefix, description);
+        self.crypto
+            .visit(v, &format!("{key_prefix}.crypto"), description);
     }
 
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
@@ -1753,17 +2417,19 @@ impl ConfigField for TableParquetOptions {
                 [_meta] | [_meta, ""] => {
                     return _config_err!(
                         "Invalid metadata key provided, missing key in metadata::<key>"
-                    )
+                    );
                 }
                 [_meta, k] => k.into(),
                 _ => {
                     return _config_err!(
                         "Invalid metadata key provided, found too many '::' in \"{key}\""
-                    )
+                    );
                 }
             };
             self.key_value_metadata.insert(k, Some(value.into()));
             Ok(())
+        } else if let Some(crypto_feature) = key.strip_prefix("crypto.") {
+            self.crypto.set(crypto_feature, value)
         } else if key.contains("::") {
             self.column_specific_options.set(key, value)
         } else {
@@ -1803,7 +2469,6 @@ macro_rules! config_namespace_with_hashmap {
                     $(
                        stringify!($field_name) => {
                            // Handle deprecated fields
-                           #[allow(deprecated)] // Allow deprecated fields
                            $(let value = $transform(value);)?
                            self.$field_name.set(rem, value.as_ref())
                        },
@@ -1819,7 +2484,6 @@ macro_rules! config_namespace_with_hashmap {
                 let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                 let desc = concat!($($d),*).trim();
                 // Handle deprecated fields
-                #[allow(deprecated)]
                 self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
@@ -1827,7 +2491,6 @@ macro_rules! config_namespace_with_hashmap {
 
         impl Default for $struct_name {
             fn default() -> Self {
-                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -1855,7 +2518,6 @@ macro_rules! config_namespace_with_hashmap {
                     $(
                     let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
                     let desc = concat!($($d),*).trim();
-                    #[allow(deprecated)]
                     col_options.$field_name.visit(v, key.as_str(), desc);
                     )*
                 }
@@ -1886,7 +2548,7 @@ config_namespace_with_hashmap! {
 
         /// Sets default parquet compression codec for the column path.
         /// Valid values are: uncompressed, snappy, gzip(level),
-        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
+        /// brotli(level), lz4, zstd(level), and lz4_raw.
         /// These values are not case-sensitive. If NULL, uses
         /// default parquet options
         pub compression: Option<String>, transform = str::to_lowercase, default = None
@@ -1904,13 +2566,352 @@ config_namespace_with_hashmap! {
         /// Sets bloom filter number of distinct values. If NULL, uses
         /// default parquet options
         pub bloom_filter_ndv: Option<u64>, default = None
+    }
+}
 
-        /// Sets max statistics size for the column path. If NULL, uses
-        /// default parquet options
-        /// max_statistics_size is deprecated, currently it is not being used
-        // TODO: remove once deprecated
-        #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
-        pub max_statistics_size: Option<usize>, default = None
+#[derive(Clone, Debug, PartialEq)]
+pub struct ConfigFileEncryptionProperties {
+    /// Should the parquet footer be encrypted
+    /// default is true
+    pub encrypt_footer: bool,
+    /// Key to use for the parquet footer encoded in hex format
+    pub footer_key_as_hex: String,
+    /// Metadata information for footer key
+    pub footer_key_metadata_as_hex: String,
+    /// HashMap of column names --> (key in hex format, metadata)
+    pub column_encryption_properties: HashMap<String, ColumnEncryptionProperties>,
+    /// AAD prefix string uniquely identifies the file and prevents file swapping
+    pub aad_prefix_as_hex: String,
+    /// If true, store the AAD prefix in the file
+    /// default is false
+    pub store_aad_prefix: bool,
+}
+
+// Setup to match EncryptionPropertiesBuilder::new()
+impl Default for ConfigFileEncryptionProperties {
+    fn default() -> Self {
+        ConfigFileEncryptionProperties {
+            encrypt_footer: true,
+            footer_key_as_hex: String::new(),
+            footer_key_metadata_as_hex: String::new(),
+            column_encryption_properties: Default::default(),
+            aad_prefix_as_hex: String::new(),
+            store_aad_prefix: false,
+        }
+    }
+}
+
+config_namespace_with_hashmap! {
+    pub struct ColumnEncryptionProperties {
+        /// Per column encryption key
+        pub column_key_as_hex: String, default = "".to_string()
+        /// Per column encryption key metadata
+        pub column_metadata_as_hex: Option<String>, default = None
+    }
+}
+
+impl ConfigField for ConfigFileEncryptionProperties {
+    fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
+        let key = format!("{key_prefix}.encrypt_footer");
+        let desc = "Encrypt the footer";
+        self.encrypt_footer.visit(v, key.as_str(), desc);
+
+        let key = format!("{key_prefix}.footer_key_as_hex");
+        let desc = "Key to use for the parquet footer";
+        self.footer_key_as_hex.visit(v, key.as_str(), desc);
+
+        let key = format!("{key_prefix}.footer_key_metadata_as_hex");
+        let desc = "Metadata to use for the parquet footer";
+        self.footer_key_metadata_as_hex.visit(v, key.as_str(), desc);
+
+        self.column_encryption_properties.visit(v, key_prefix, desc);
+
+        let key = format!("{key_prefix}.aad_prefix_as_hex");
+        let desc = "AAD prefix to use";
+        self.aad_prefix_as_hex.visit(v, key.as_str(), desc);
+
+        let key = format!("{key_prefix}.store_aad_prefix");
+        let desc = "If true, store the AAD prefix";
+        self.store_aad_prefix.visit(v, key.as_str(), desc);
+
+        self.aad_prefix_as_hex.visit(v, key.as_str(), desc);
+    }
+
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        // Any hex encoded values must be pre-encoded using
+        // hex::encode() before calling set.
+
+        if key.contains("::") {
+            // Handle any column specific properties
+            return self.column_encryption_properties.set(key, value);
+        };
+
+        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+        match key {
+            "encrypt_footer" => self.encrypt_footer.set(rem, value.as_ref()),
+            "footer_key_as_hex" => self.footer_key_as_hex.set(rem, value.as_ref()),
+            "footer_key_metadata_as_hex" => {
+                self.footer_key_metadata_as_hex.set(rem, value.as_ref())
+            }
+            "aad_prefix_as_hex" => self.aad_prefix_as_hex.set(rem, value.as_ref()),
+            "store_aad_prefix" => self.store_aad_prefix.set(rem, value.as_ref()),
+            _ => _config_err!(
+                "Config value \"{}\" not found on ConfigFileEncryptionProperties",
+                key
+            ),
+        }
+    }
+}
+
+#[cfg(feature = "parquet_encryption")]
+impl From<ConfigFileEncryptionProperties> for FileEncryptionProperties {
+    fn from(val: ConfigFileEncryptionProperties) -> Self {
+        let mut fep = FileEncryptionProperties::builder(
+            hex::decode(val.footer_key_as_hex).unwrap(),
+        )
+        .with_plaintext_footer(!val.encrypt_footer)
+        .with_aad_prefix_storage(val.store_aad_prefix);
+
+        if !val.footer_key_metadata_as_hex.is_empty() {
+            fep = fep.with_footer_key_metadata(
+                hex::decode(&val.footer_key_metadata_as_hex)
+                    .expect("Invalid footer key metadata"),
+            );
+        }
+
+        for (column_name, encryption_props) in val.column_encryption_properties.iter() {
+            let encryption_key = hex::decode(&encryption_props.column_key_as_hex)
+                .expect("Invalid column encryption key");
+            let key_metadata = encryption_props
+                .column_metadata_as_hex
+                .as_ref()
+                .map(|x| hex::decode(x).expect("Invalid column metadata"));
+            match key_metadata {
+                Some(key_metadata) => {
+                    fep = fep.with_column_key_and_metadata(
+                        column_name,
+                        encryption_key,
+                        key_metadata,
+                    );
+                }
+                None => {
+                    fep = fep.with_column_key(column_name, encryption_key);
+                }
+            }
+        }
+
+        if !val.aad_prefix_as_hex.is_empty() {
+            let aad_prefix: Vec<u8> =
+                hex::decode(&val.aad_prefix_as_hex).expect("Invalid AAD prefix");
+            fep = fep.with_aad_prefix(aad_prefix);
+        }
+        Arc::unwrap_or_clone(fep.build().unwrap())
+    }
+}
+
+#[cfg(feature = "parquet_encryption")]
+impl From<&Arc<FileEncryptionProperties>> for ConfigFileEncryptionProperties {
+    fn from(f: &Arc<FileEncryptionProperties>) -> Self {
+        let (column_names_vec, column_keys_vec, column_metas_vec) = f.column_keys();
+
+        let mut column_encryption_properties: HashMap<
+            String,
+            ColumnEncryptionProperties,
+        > = HashMap::new();
+
+        for (i, column_name) in column_names_vec.iter().enumerate() {
+            let column_key_as_hex = hex::encode(&column_keys_vec[i]);
+            let column_metadata_as_hex: Option<String> =
+                column_metas_vec.get(i).map(hex::encode);
+            column_encryption_properties.insert(
+                column_name.clone(),
+                ColumnEncryptionProperties {
+                    column_key_as_hex,
+                    column_metadata_as_hex,
+                },
+            );
+        }
+        let aad_prefix = f.aad_prefix().cloned().unwrap_or_default();
+        ConfigFileEncryptionProperties {
+            encrypt_footer: f.encrypt_footer(),
+            footer_key_as_hex: hex::encode(f.footer_key()),
+            footer_key_metadata_as_hex: f
+                .footer_key_metadata()
+                .map(hex::encode)
+                .unwrap_or_default(),
+            column_encryption_properties,
+            aad_prefix_as_hex: hex::encode(aad_prefix),
+            store_aad_prefix: f.store_aad_prefix(),
+        }
+    }
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub struct ConfigFileDecryptionProperties {
+    /// Binary string to use for the parquet footer encoded in hex format
+    pub footer_key_as_hex: String,
+    /// HashMap of column names --> key in hex format
+    pub column_decryption_properties: HashMap<String, ColumnDecryptionProperties>,
+    /// AAD prefix string uniquely identifies the file and prevents file swapping
+    pub aad_prefix_as_hex: String,
+    /// If true, then verify signature for files with plaintext footers.
+    /// default = true
+    pub footer_signature_verification: bool,
+}
+
+config_namespace_with_hashmap! {
+    pub struct ColumnDecryptionProperties {
+        /// Per column encryption key
+        pub column_key_as_hex: String, default = "".to_string()
+    }
+}
+
+// Setup to match DecryptionPropertiesBuilder::new()
+impl Default for ConfigFileDecryptionProperties {
+    fn default() -> Self {
+        ConfigFileDecryptionProperties {
+            footer_key_as_hex: String::new(),
+            column_decryption_properties: Default::default(),
+            aad_prefix_as_hex: String::new(),
+            footer_signature_verification: true,
+        }
+    }
+}
+
+impl ConfigField for ConfigFileDecryptionProperties {
+    fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
+        let key = format!("{key_prefix}.footer_key_as_hex");
+        let desc = "Key to use for the parquet footer";
+        self.footer_key_as_hex.visit(v, key.as_str(), desc);
+
+        let key = format!("{key_prefix}.aad_prefix_as_hex");
+        let desc = "AAD prefix to use";
+        self.aad_prefix_as_hex.visit(v, key.as_str(), desc);
+
+        let key = format!("{key_prefix}.footer_signature_verification");
+        let desc = "If true, verify the footer signature";
+        self.footer_signature_verification
+            .visit(v, key.as_str(), desc);
+
+        self.column_decryption_properties.visit(v, key_prefix, desc);
+    }
+
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        // Any hex encoded values must be pre-encoded using
+        // hex::encode() before calling set.
+
+        if key.contains("::") {
+            // Handle any column specific properties
+            return self.column_decryption_properties.set(key, value);
+        };
+
+        let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+        match key {
+            "footer_key_as_hex" => self.footer_key_as_hex.set(rem, value.as_ref()),
+            "aad_prefix_as_hex" => self.aad_prefix_as_hex.set(rem, value.as_ref()),
+            "footer_signature_verification" => {
+                self.footer_signature_verification.set(rem, value.as_ref())
+            }
+            _ => _config_err!(
+                "Config value \"{}\" not found on ConfigFileDecryptionProperties",
+                key
+            ),
+        }
+    }
+}
+
+#[cfg(feature = "parquet_encryption")]
+impl From<ConfigFileDecryptionProperties> for FileDecryptionProperties {
+    fn from(val: ConfigFileDecryptionProperties) -> Self {
+        let mut column_names: Vec<&str> = Vec::new();
+        let mut column_keys: Vec<Vec<u8>> = Vec::new();
+
+        for (col_name, decryption_properties) in val.column_decryption_properties.iter() {
+            column_names.push(col_name.as_str());
+            column_keys.push(
+                hex::decode(&decryption_properties.column_key_as_hex)
+                    .expect("Invalid column decryption key"),
+            );
+        }
+
+        let mut fep = FileDecryptionProperties::builder(
+            hex::decode(val.footer_key_as_hex).expect("Invalid footer key"),
+        )
+        .with_column_keys(column_names, column_keys)
+        .unwrap();
+
+        if !val.footer_signature_verification {
+            fep = fep.disable_footer_signature_verification();
+        }
+
+        if !val.aad_prefix_as_hex.is_empty() {
+            let aad_prefix =
+                hex::decode(&val.aad_prefix_as_hex).expect("Invalid AAD prefix");
+            fep = fep.with_aad_prefix(aad_prefix);
+        }
+
+        Arc::unwrap_or_clone(fep.build().unwrap())
+    }
+}
+
+#[cfg(feature = "parquet_encryption")]
+impl From<&Arc<FileDecryptionProperties>> for ConfigFileDecryptionProperties {
+    fn from(f: &Arc<FileDecryptionProperties>) -> Self {
+        let (column_names_vec, column_keys_vec) = f.column_keys();
+        let mut column_decryption_properties: HashMap<
+            String,
+            ColumnDecryptionProperties,
+        > = HashMap::new();
+        for (i, column_name) in column_names_vec.iter().enumerate() {
+            let props = ColumnDecryptionProperties {
+                column_key_as_hex: hex::encode(column_keys_vec[i].clone()),
+            };
+            column_decryption_properties.insert(column_name.clone(), props);
+        }
+
+        let aad_prefix = f.aad_prefix().cloned().unwrap_or_default();
+        ConfigFileDecryptionProperties {
+            footer_key_as_hex: hex::encode(
+                f.footer_key(None).unwrap_or_default().as_ref(),
+            ),
+            column_decryption_properties,
+            aad_prefix_as_hex: hex::encode(aad_prefix),
+            footer_signature_verification: f.check_plaintext_footer_integrity(),
+        }
+    }
+}
+
+/// Holds implementation-specific options for an encryption factory
+#[derive(Clone, Debug, Default, PartialEq)]
+pub struct EncryptionFactoryOptions {
+    pub options: HashMap<String, String>,
+}
+
+impl ConfigField for EncryptionFactoryOptions {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, _description: &'static str) {
+        for (option_key, option_value) in &self.options {
+            v.some(
+                &format!("{key}.{option_key}"),
+                option_value,
+                "Encryption factory specific option",
+            );
+        }
+    }
+
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        self.options.insert(key.to_owned(), value.to_owned());
+        Ok(())
+    }
+}
+
+impl EncryptionFactoryOptions {
+    /// Convert these encryption factory options to an [`ExtensionOptions`] instance.
+    pub fn to_extension_options<T: ExtensionOptions + Default>(&self) -> Result<T> {
+        let mut options = T::default();
+        for (key, value) in &self.options {
+            options.set(key, value)?;
+        }
+        Ok(options)
     }
 }
 
@@ -1935,6 +2936,14 @@ config_namespace! {
         /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
         pub newlines_in_values: Option<bool>, default = None
         pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
+        /// Compression level for the output file. The valid range depends on the
+        /// compression algorithm:
+        /// - ZSTD: 1 to 22 (default: 3)
+        /// - GZIP: 0 to 9 (default: 6)
+        /// - BZIP2: 0 to 9 (default: 6)
+        /// - XZ: 0 to 9 (default: 6)
+        /// If not specified, the default level for the compression algorithm is used.
+        pub compression_level: Option<u32>, default = None
         pub schema_infer_max_rec: Option<usize>, default = None
         pub date_format: Option<String>, default = None
         pub datetime_format: Option<String>, default = None
@@ -1946,6 +2955,16 @@ config_namespace! {
         // The input regex for Nulls when loading CSVs.
         pub null_regex: Option<String>, default = None
         pub comment: Option<u8>, default = None
+        /// Whether to allow truncated rows when parsing, both within a single file and across files.
+        ///
+        /// When set to false (default), reading a single CSV file which has rows of different lengths will
+        /// error; if reading multiple CSV files with different number of columns, it will also fail.
+        ///
+        /// When set to true, reading a single CSV file with rows of different lengths will pad the truncated
+        /// rows with null values for the missing columns; if reading multiple CSV files with different number
+        /// of columns, it creates a union schema containing all columns found across the files, and will
+        /// pad any files missing columns with null values for their rows.
+        pub truncated_rows: Option<bool>, default = None
     }
 }
 
@@ -2038,6 +3057,23 @@ impl CsvOptions {
         self
     }
 
+    /// Whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths.
+    /// When set to true then it will allow records with less than the expected number of columns and fill the missing columns with nulls.
+    /// If the record’s schema is not nullable, then it will still return an error.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.truncated_rows = Some(allow);
+        self
+    }
+
+    /// Set the compression level for the output file.
+    /// The valid range depends on the compression algorithm.
+    /// If not specified, the default level for the algorithm is used.
+    pub fn with_compression_level(mut self, level: u32) -> Self {
+        self.compression_level = Some(level);
+        self
+    }
+
     /// The delimiter character.
     pub fn delimiter(&self) -> u8 {
         self.delimiter
@@ -2063,14 +3099,38 @@ config_namespace! {
     /// Options controlling JSON format
     pub struct JsonOptions {
         pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
+        /// Compression level for the output file. The valid range depends on the
+        /// compression algorithm:
+        /// - ZSTD: 1 to 22 (default: 3)
+        /// - GZIP: 0 to 9 (default: 6)
+        /// - BZIP2: 0 to 9 (default: 6)
+        /// - XZ: 0 to 9 (default: 6)
+        /// If not specified, the default level for the compression algorithm is used.
+        pub compression_level: Option<u32>, default = None
         pub schema_infer_max_rec: Option<usize>, default = None
+       /// The JSON format to use when reading files.
+       ///
+       /// When `true` (default), expects newline-delimited JSON (NDJSON):
+       /// ```text
+       /// {"key1": 1, "key2": "val"}
+       /// {"key1": 2, "key2": "vals"}
+       /// ```
+       ///
+       /// When `false`, expects JSON array format:
+       /// ```text
+       /// [
+       ///   {"key1": 1, "key2": "val"},
+       ///   {"key1": 2, "key2": "vals"}
+       /// ]
+       /// ```
+       pub newline_delimited: bool, default = true
     }
 }
 
 pub trait OutputFormatExt: Display {}
 
 #[derive(Debug, Clone, PartialEq)]
-#[allow(clippy::large_enum_variant)]
+#[cfg_attr(feature = "parquet", expect(clippy::large_enum_variant))]
 pub enum OutputFormat {
     CSV(CsvOptions),
     JSON(JsonOptions),
@@ -2096,13 +3156,14 @@ impl Display for OutputFormat {
 
 #[cfg(test)]
 mod tests {
-    use std::any::Any;
-    use std::collections::HashMap;
-
+    #[cfg(feature = "parquet")]
+    use crate::config::TableParquetOptions;
     use crate::config::{
         ConfigEntry, ConfigExtension, ConfigField, ConfigFileType, ExtensionOptions,
         Extensions, TableOptions,
     };
+    use std::any::Any;
+    use std::collections::HashMap;
 
     #[derive(Default, Debug, Clone)]
     pub struct TestExtensionConfig {
@@ -2174,6 +3235,16 @@ mod tests {
         );
     }
 
+    #[test]
+    fn iter_test_extension_config() {
+        let mut extension = Extensions::new();
+        extension.insert(TestExtensionConfig::default());
+        let table_config = TableOptions::new().with_extensions(extension);
+        let extensions = table_config.extensions.iter().collect::<Vec<_>>();
+        assert_eq!(extensions.len(), 1);
+        assert_eq!(extensions[0].0, TestExtensionConfig::PREFIX);
+    }
+
     #[test]
     fn csv_u8_table_options() {
         let mut table_config = TableOptions::new();
@@ -2217,6 +3288,19 @@ mod tests {
         assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 1);
     }
 
+    #[test]
+    fn reset_nested_scalar_reports_helpful_error() {
+        let mut value = true;
+        let err = <bool as ConfigField>::reset(&mut value, "nested").unwrap_err();
+        let message = err.to_string();
+        assert!(
+            message.starts_with(
+                "Invalid or Unsupported Configuration: Config field is a scalar bool and does not have nested field \"nested\""
+            ),
+            "unexpected error message: {message}"
+        );
+    }
+
     #[cfg(feature = "parquet")]
     #[test]
     fn parquet_table_options() {
@@ -2231,6 +3315,159 @@ mod tests {
         );
     }
 
+    #[cfg(feature = "parquet_encryption")]
+    #[test]
+    fn parquet_table_encryption() {
+        use crate::config::{
+            ConfigFileDecryptionProperties, ConfigFileEncryptionProperties,
+        };
+        use parquet::encryption::decrypt::FileDecryptionProperties;
+        use parquet::encryption::encrypt::FileEncryptionProperties;
+        use std::sync::Arc;
+
+        let footer_key = b"0123456789012345".to_vec(); // 128bit/16
+        let column_names = vec!["double_field", "float_field"];
+        let column_keys =
+            vec![b"1234567890123450".to_vec(), b"1234567890123451".to_vec()];
+
+        let file_encryption_properties =
+            FileEncryptionProperties::builder(footer_key.clone())
+                .with_column_keys(column_names.clone(), column_keys.clone())
+                .unwrap()
+                .build()
+                .unwrap();
+
+        let decryption_properties = FileDecryptionProperties::builder(footer_key.clone())
+            .with_column_keys(column_names.clone(), column_keys.clone())
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Test round-trip
+        let config_encrypt =
+            ConfigFileEncryptionProperties::from(&file_encryption_properties);
+        let encryption_properties_built =
+            Arc::new(FileEncryptionProperties::from(config_encrypt.clone()));
+        assert_eq!(file_encryption_properties, encryption_properties_built);
+
+        let config_decrypt = ConfigFileDecryptionProperties::from(&decryption_properties);
+        let decryption_properties_built =
+            Arc::new(FileDecryptionProperties::from(config_decrypt.clone()));
+        assert_eq!(decryption_properties, decryption_properties_built);
+
+        ///////////////////////////////////////////////////////////////////////////////////
+        // Test encryption config
+
+        // Display original encryption config
+        // println!("{:#?}", config_encrypt);
+
+        let mut table_config = TableOptions::new();
+        table_config.set_config_format(ConfigFileType::PARQUET);
+        table_config
+            .parquet
+            .set(
+                "crypto.file_encryption.encrypt_footer",
+                config_encrypt.encrypt_footer.to_string().as_str(),
+            )
+            .unwrap();
+        table_config
+            .parquet
+            .set(
+                "crypto.file_encryption.footer_key_as_hex",
+                config_encrypt.footer_key_as_hex.as_str(),
+            )
+            .unwrap();
+
+        for (i, col_name) in column_names.iter().enumerate() {
+            let key = format!("crypto.file_encryption.column_key_as_hex::{col_name}");
+            let value = hex::encode(column_keys[i].clone());
+            table_config
+                .parquet
+                .set(key.as_str(), value.as_str())
+                .unwrap();
+        }
+
+        // Print matching final encryption config
+        // println!("{:#?}", table_config.parquet.crypto.file_encryption);
+
+        assert_eq!(
+            table_config.parquet.crypto.file_encryption,
+            Some(config_encrypt)
+        );
+
+        ///////////////////////////////////////////////////////////////////////////////////
+        // Test decryption config
+
+        // Display original decryption config
+        // println!("{:#?}", config_decrypt);
+
+        let mut table_config = TableOptions::new();
+        table_config.set_config_format(ConfigFileType::PARQUET);
+        table_config
+            .parquet
+            .set(
+                "crypto.file_decryption.footer_key_as_hex",
+                config_decrypt.footer_key_as_hex.as_str(),
+            )
+            .unwrap();
+
+        for (i, col_name) in column_names.iter().enumerate() {
+            let key = format!("crypto.file_decryption.column_key_as_hex::{col_name}");
+            let value = hex::encode(column_keys[i].clone());
+            table_config
+                .parquet
+                .set(key.as_str(), value.as_str())
+                .unwrap();
+        }
+
+        // Print matching final decryption config
+        // println!("{:#?}", table_config.parquet.crypto.file_decryption);
+
+        assert_eq!(
+            table_config.parquet.crypto.file_decryption,
+            Some(config_decrypt.clone())
+        );
+
+        // Set config directly
+        let mut table_config = TableOptions::new();
+        table_config.set_config_format(ConfigFileType::PARQUET);
+        table_config.parquet.crypto.file_decryption = Some(config_decrypt.clone());
+        assert_eq!(
+            table_config.parquet.crypto.file_decryption,
+            Some(config_decrypt.clone())
+        );
+    }
+
+    #[cfg(feature = "parquet_encryption")]
+    #[test]
+    fn parquet_encryption_factory_config() {
+        let mut parquet_options = TableParquetOptions::default();
+
+        assert_eq!(parquet_options.crypto.factory_id, None);
+        assert_eq!(parquet_options.crypto.factory_options.options.len(), 0);
+
+        let mut input_config = TestExtensionConfig::default();
+        input_config
+            .properties
+            .insert("key1".to_string(), "value 1".to_string());
+        input_config
+            .properties
+            .insert("key2".to_string(), "value 2".to_string());
+
+        parquet_options
+            .crypto
+            .configure_factory("example_factory", &input_config);
+
+        assert_eq!(
+            parquet_options.crypto.factory_id,
+            Some("example_factory".to_string())
+        );
+        let factory_options = &parquet_options.crypto.factory_options.options;
+        assert_eq!(factory_options.len(), 2);
+        assert_eq!(factory_options.get("key1"), Some(&"value 1".to_string()));
+        assert_eq!(factory_options.get("key2"), Some(&"value 2".to_string()));
+    }
+
     #[cfg(feature = "parquet")]
     #[test]
     fn parquet_table_options_config_entry() {
@@ -2240,9 +3477,28 @@ mod tests {
             .set("format.bloom_filter_enabled::col1", "true")
             .unwrap();
         let entries = table_config.entries();
-        assert!(entries
-            .iter()
-            .any(|item| item.key == "format.bloom_filter_enabled::col1"))
+        assert!(
+            entries
+                .iter()
+                .any(|item| item.key == "format.bloom_filter_enabled::col1")
+        )
+    }
+
+    #[cfg(feature = "parquet")]
+    #[test]
+    fn parquet_table_parquet_options_config_entry() {
+        let mut table_parquet_options = TableParquetOptions::new();
+        table_parquet_options
+            .set(
+                "crypto.file_encryption.column_key_as_hex::double_field",
+                "31323334353637383930313233343530",
+            )
+            .unwrap();
+        let entries = table_parquet_options.entries();
+        assert!(
+            entries.iter().any(|item| item.key
+                == "crypto.file_encryption.column_key_as_hex::double_field")
+        )
     }
 
     #[cfg(feature = "parquet")]
@@ -2278,4 +3534,37 @@ mod tests {
         let parsed_metadata = table_config.parquet.key_value_metadata;
         assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into())));
     }
+    #[cfg(feature = "parquet")]
+    #[test]
+    fn test_parquet_writer_version_validation() {
+        use crate::{config::ConfigOptions, parquet_config::DFParquetWriterVersion};
+
+        let mut config = ConfigOptions::default();
+
+        // Valid values should work
+        config
+            .set("datafusion.execution.parquet.writer_version", "1.0")
+            .unwrap();
+        assert_eq!(
+            config.execution.parquet.writer_version,
+            DFParquetWriterVersion::V1_0
+        );
+
+        config
+            .set("datafusion.execution.parquet.writer_version", "2.0")
+            .unwrap();
+        assert_eq!(
+            config.execution.parquet.writer_version,
+            DFParquetWriterVersion::V2_0
+        );
+
+        // Invalid value should error immediately at SET time
+        let err = config
+            .set("datafusion.execution.parquet.writer_version", "3.0")
+            .unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid or Unsupported Configuration: Invalid parquet writer version: 3.0. Expected one of: 1.0, 2.0"
+        );
+    }
 }
diff --git a/datafusion/common/src/cse.rs b/datafusion/common/src/cse.rs
index 674d3386171f8..93169d6a02ff1 100644
--- a/datafusion/common/src/cse.rs
+++ b/datafusion/common/src/cse.rs
@@ -19,12 +19,12 @@
 //! a [`CSEController`], that defines how to eliminate common subtrees from a particular
 //! [`TreeNode`] tree.
 
+use crate::Result;
 use crate::hash_utils::combine_hashes;
 use crate::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
     TreeNodeVisitor,
 };
-use crate::Result;
 use indexmap::IndexMap;
 use std::collections::HashMap;
 use std::hash::{BuildHasher, Hash, Hasher, RandomState};
@@ -676,13 +676,13 @@ where
 
 #[cfg(test)]
 mod test {
+    use crate::Result;
     use crate::alias::AliasGenerator;
     use crate::cse::{
-        CSEController, HashNode, IdArray, Identifier, NodeStats, NormalizeEq,
-        Normalizeable, CSE,
+        CSE, CSEController, HashNode, IdArray, Identifier, NodeStats, NormalizeEq,
+        Normalizeable,
     };
     use crate::tree_node::tests::TestTreeNode;
-    use crate::Result;
     use std::collections::HashSet;
     use std::hash::{Hash, Hasher};
 
diff --git a/datafusion/common/src/datatype.rs b/datafusion/common/src/datatype.rs
new file mode 100644
index 0000000000000..19847f8583505
--- /dev/null
+++ b/datafusion/common/src/datatype.rs
@@ -0,0 +1,273 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`DataTypeExt`] and [`FieldExt`] extension trait for working with Arrow [`DataType`] and [`Field`]s
+
+use crate::arrow::datatypes::{DataType, Field, FieldRef};
+use crate::metadata::FieldMetadata;
+use std::sync::Arc;
+
+/// DataFusion extension methods for Arrow [`DataType`]
+pub trait DataTypeExt {
+    /// Convert the type to field with nullable type and "" name
+    ///
+    /// This is used to track the places where we convert a [`DataType`]
+    /// into a nameless field to interact with an API that is
+    /// capable of representing an extension type and/or nullability.
+    ///
+    /// For example, it will convert a `DataType::Int32` into
+    /// `Field::new("", DataType::Int32, true)`.
+    ///
+    /// ```
+    /// # use datafusion_common::datatype::DataTypeExt;
+    /// # use arrow::datatypes::DataType;
+    /// let dt = DataType::Utf8;
+    /// let field = dt.into_nullable_field();
+    /// // result is a nullable Utf8 field with "" name
+    /// assert_eq!(field.name(), "");
+    /// assert_eq!(field.data_type(), &DataType::Utf8);
+    /// assert!(field.is_nullable());
+    /// ```
+    fn into_nullable_field(self) -> Field;
+
+    /// Convert the type to [`FieldRef`] with nullable type and "" name
+    ///
+    /// Concise wrapper around [`DataTypeExt::into_nullable_field`] that
+    /// constructs a [`FieldRef`].
+    fn into_nullable_field_ref(self) -> FieldRef;
+}
+
+impl DataTypeExt for DataType {
+    fn into_nullable_field(self) -> Field {
+        Field::new("", self, true)
+    }
+
+    fn into_nullable_field_ref(self) -> FieldRef {
+        Arc::new(Field::new("", self, true))
+    }
+}
+
+/// DataFusion extension methods for Arrow [`Field`] and [`FieldRef`]
+///
+/// This trait is implemented for both [`Field`] and [`FieldRef`] and
+/// provides convenience methods for efficiently working with both types.
+///
+/// For [`FieldRef`], the methods will attempt to unwrap the `Arc`
+/// to avoid unnecessary cloning when possible.
+pub trait FieldExt {
+    /// Ensure the field is named `new_name`, returning the given field if the
+    /// name matches, and a new field if not.
+    ///
+    /// This method avoids `clone`ing fields and names if the name is the same
+    /// as the field's existing name.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // rename to "your_int"
+    /// let renamed_field = int_field.renamed("your_int");
+    /// assert_eq!(renamed_field.name(), "your_int");
+    /// ```
+    fn renamed(self, new_name: &str) -> Self;
+
+    /// Ensure the field has the given data type
+    ///
+    /// Note this is different than simply calling [`Field::with_data_type`] as
+    /// it avoids copying if the data type is already the same.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // change to Float64
+    /// let retyped_field = int_field.retyped(DataType::Float64);
+    /// assert_eq!(retyped_field.data_type(), &DataType::Float64);
+    /// ```
+    fn retyped(self, new_data_type: DataType) -> Self;
+
+    /// Add field metadata to the Field
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self;
+
+    /// Add optional field metadata,
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self;
+
+    /// Returns a new Field representing a List of this Field's DataType.
+    ///
+    /// For example if input represents an `Int32`, the return value will
+    /// represent a `List<Int32>`.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// // Int32 field
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // convert to a List field
+    /// let list_field = int_field.into_list();
+    /// // List<Int32>
+    /// // Note that the item field name has been renamed to "item"
+    /// assert_eq!(list_field.data_type(), &DataType::List(Arc::new(
+    ///     Field::new("item", DataType::Int32, true)
+    /// )));
+    fn into_list(self) -> Self;
+
+    /// Return a new Field representing this Field as the item type of a
+    /// [`DataType::FixedSizeList`]
+    ///
+    /// For example if input represents an `Int32`, the return value will
+    /// represent a `FixedSizeList<Int32, size>`.
+    ///
+    /// Example:
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::datatype::FieldExt;
+    /// // Int32 field
+    /// let int_field = Field::new("my_int", DataType::Int32, true);
+    /// // convert to a FixedSizeList field of size 3
+    /// let fixed_size_list_field = int_field.into_fixed_size_list(3);
+    /// // FixedSizeList<Int32, 3>
+    /// // Note that the item field name has been renamed to "item"
+    /// assert_eq!(
+    ///   fixed_size_list_field.data_type(),
+    ///   &DataType::FixedSizeList(Arc::new(
+    ///    Field::new("item", DataType::Int32, true)),
+    ///    3
+    /// ));
+    fn into_fixed_size_list(self, list_size: i32) -> Self;
+
+    /// Update the field to have the default list field name ("item")
+    ///
+    /// Lists are allowed to have an arbitrarily named field; however, a name
+    /// other than 'item' will cause it to fail an == check against a more
+    /// idiomatically created list in arrow-rs which causes issues.
+    ///
+    /// For example, if input represents an `Int32` field named "my_int",
+    /// the return value will represent an `Int32` field named "item".
+    ///
+    /// Example:
+    /// ```
+    /// # use arrow::datatypes::Field;
+    /// # use datafusion_common::datatype::FieldExt;
+    /// let my_field = Field::new("my_int", arrow::datatypes::DataType::Int32, true);
+    /// let item_field = my_field.into_list_item();
+    /// assert_eq!(item_field.name(), Field::LIST_FIELD_DEFAULT_NAME);
+    /// assert_eq!(item_field.name(), "item");
+    /// ```
+    fn into_list_item(self) -> Self;
+}
+
+impl FieldExt for Field {
+    fn renamed(self, new_name: &str) -> Self {
+        // check if this is a new name before allocating a new Field / copying
+        // the existing one
+        if self.name() != new_name {
+            self.with_name(new_name)
+        } else {
+            self
+        }
+    }
+
+    fn retyped(self, new_data_type: DataType) -> Self {
+        self.with_data_type(new_data_type)
+    }
+
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self {
+        metadata.add_to_field(self)
+    }
+
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self {
+        if let Some(metadata) = metadata {
+            self.with_field_metadata(metadata)
+        } else {
+            self
+        }
+    }
+
+    fn into_list(self) -> Self {
+        DataType::List(Arc::new(self.into_list_item())).into_nullable_field()
+    }
+
+    fn into_fixed_size_list(self, list_size: i32) -> Self {
+        DataType::FixedSizeList(self.into_list_item().into(), list_size)
+            .into_nullable_field()
+    }
+
+    fn into_list_item(self) -> Self {
+        if self.name() != Field::LIST_FIELD_DEFAULT_NAME {
+            self.with_name(Field::LIST_FIELD_DEFAULT_NAME)
+        } else {
+            self
+        }
+    }
+}
+
+impl FieldExt for Arc<Field> {
+    fn renamed(mut self, new_name: &str) -> Self {
+        if self.name() != new_name {
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_name(new_name);
+        }
+        self
+    }
+
+    fn retyped(mut self, new_data_type: DataType) -> Self {
+        if self.data_type() != &new_data_type {
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_data_type(new_data_type);
+        }
+        self
+    }
+
+    fn with_field_metadata(self, metadata: &FieldMetadata) -> Self {
+        metadata.add_to_field_ref(self)
+    }
+
+    fn with_field_metadata_opt(self, metadata: Option<&FieldMetadata>) -> Self {
+        if let Some(metadata) = metadata {
+            self.with_field_metadata(metadata)
+        } else {
+            self
+        }
+    }
+
+    fn into_list(self) -> Self {
+        DataType::List(self.into_list_item())
+            .into_nullable_field()
+            .into()
+    }
+
+    fn into_fixed_size_list(self, list_size: i32) -> Self {
+        DataType::FixedSizeList(self.into_list_item(), list_size)
+            .into_nullable_field()
+            .into()
+    }
+
+    fn into_list_item(mut self) -> Self {
+        if self.name() != Field::LIST_FIELD_DEFAULT_NAME {
+            // avoid cloning if possible
+            Arc::make_mut(&mut self).set_name(Field::LIST_FIELD_DEFAULT_NAME);
+        }
+        self
+    }
+}
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 804e14bf72fb0..de0aacf9e8bcd 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -23,10 +23,10 @@ use std::fmt::{Display, Formatter};
 use std::hash::Hash;
 use std::sync::Arc;
 
-use crate::error::{DataFusionError, Result, _plan_err, _schema_err};
+use crate::error::{_plan_err, _schema_err, DataFusionError, Result};
 use crate::{
-    field_not_found, unqualified_field_not_found, Column, FunctionalDependencies,
-    SchemaError, TableReference,
+    Column, FunctionalDependencies, SchemaError, TableReference, field_not_found,
+    unqualified_field_not_found,
 };
 
 use arrow::compute::can_cast_types;
@@ -37,7 +37,7 @@ use arrow::datatypes::{
 /// A reference-counted reference to a [DFSchema].
 pub type DFSchemaRef = Arc<DFSchema>;
 
-/// DFSchema wraps an Arrow schema and adds relation names.
+/// DFSchema wraps an Arrow schema and add a relation (table) name.
 ///
 /// The schema may hold the fields across multiple tables. Some fields may be
 /// qualified and some unqualified. A qualified field is a field that has a
@@ -47,8 +47,14 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// have a distinct name from any qualified field names. This allows finding a
 /// qualified field by name to be possible, so long as there aren't multiple
 /// qualified fields with the same name.
+///]
+/// # See Also
+/// * [DFSchemaRef], an alias to `Arc<DFSchema>`
+/// * [DataTypeExt], common methods for working with Arrow [DataType]s
+/// * [FieldExt], extension methods for working with Arrow [Field]s
 ///
-/// There is an alias to `Arc<DFSchema>` named [DFSchemaRef].
+/// [DataTypeExt]: crate::datatype::DataTypeExt
+/// [FieldExt]: crate::datatype::FieldExt
 ///
 /// # Creating qualified schemas
 ///
@@ -56,12 +62,10 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// an Arrow schema.
 ///
 /// ```rust
-/// use datafusion_common::{DFSchema, Column};
 /// use arrow::datatypes::{DataType, Field, Schema};
+/// use datafusion_common::{Column, DFSchema};
 ///
-/// let arrow_schema = Schema::new(vec![
-///    Field::new("c1", DataType::Int32, false),
-/// ]);
+/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
 ///
 /// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap();
 /// let column = Column::from_qualified_name("t1.c1");
@@ -77,12 +81,10 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// Create an unqualified schema using TryFrom:
 ///
 /// ```rust
-/// use datafusion_common::{DFSchema, Column};
 /// use arrow::datatypes::{DataType, Field, Schema};
+/// use datafusion_common::{Column, DFSchema};
 ///
-/// let arrow_schema = Schema::new(vec![
-///    Field::new("c1", DataType::Int32, false),
-/// ]);
+/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
 ///
 /// let df_schema = DFSchema::try_from(arrow_schema).unwrap();
 /// let column = Column::new_unqualified("c1");
@@ -94,14 +96,16 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// Use the `Into` trait to convert `DFSchema` into an Arrow schema:
 ///
 /// ```rust
+/// use arrow::datatypes::{Field, Schema};
 /// use datafusion_common::DFSchema;
-/// use arrow::datatypes::{Schema, Field};
 /// use std::collections::HashMap;
 ///
-/// let df_schema = DFSchema::from_unqualified_fields(vec![
-///    Field::new("c1", arrow::datatypes::DataType::Int32, false),
-/// ].into(),HashMap::new()).unwrap();
-/// let schema = Schema::from(df_schema);
+/// let df_schema = DFSchema::from_unqualified_fields(
+///     vec![Field::new("c1", arrow::datatypes::DataType::Int32, false)].into(),
+///     HashMap::new(),
+/// )
+/// .unwrap();
+/// let schema: &Schema = df_schema.as_arrow();
 /// assert_eq!(schema.fields().len(), 1);
 /// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -206,6 +210,25 @@ impl DFSchema {
         Ok(dfschema)
     }
 
+    /// Return the same schema, where all fields have a given qualifier.
+    pub fn with_field_specific_qualified_schema(
+        &self,
+        qualifiers: Vec<Option<TableReference>>,
+    ) -> Result<Self> {
+        if qualifiers.len() != self.fields().len() {
+            return _plan_err!(
+                "Number of qualifiers must match number of fields. Expected {}, got {}",
+                self.fields().len(),
+                qualifiers.len()
+            );
+        }
+        Ok(DFSchema {
+            inner: Arc::clone(&self.inner),
+            field_qualifiers: qualifiers,
+            functional_dependencies: self.functional_dependencies.clone(),
+        })
+    }
+
     /// Check if the schema have some fields with the same name
     pub fn check_names(&self) -> Result<()> {
         let mut qualified_names = BTreeSet::new();
@@ -229,7 +252,7 @@ impl DFSchema {
         for (qualifier, name) in qualified_names {
             if unqualified_names.contains(name) {
                 return _schema_err!(SchemaError::AmbiguousReference {
-                    field: Column::new(Some(qualifier.clone()), name)
+                    field: Box::new(Column::new(Some(qualifier.clone()), name))
                 });
             }
         }
@@ -278,6 +301,20 @@ impl DFSchema {
 
     /// Modify this schema by appending the fields from the supplied schema, ignoring any
     /// duplicate fields.
+    ///
+    /// ## Merge Precedence
+    ///
+    /// **Schema-level metadata**: Metadata from both schemas is merged.
+    /// If both schemas have the same metadata key, the value from the `other_schema` parameter takes precedence.
+    ///
+    /// **Field-level merging**: Only non-duplicate fields are added. This means that the
+    /// `self` fields will always take precedence over the `other_schema` fields.
+    /// Duplicate field detection is based on:
+    /// - For qualified fields: both qualifier and field name must match
+    /// - For unqualified fields: only field name needs to match
+    ///
+    /// Take note how the precedence for fields & metadata merging differs;
+    /// merging prefers fields from `self` but prefers metadata from `other_schema`.
     pub fn merge(&mut self, other_schema: &DFSchema) {
         if other_schema.inner.fields.is_empty() {
             return;
@@ -315,20 +352,22 @@ impl DFSchema {
         self.field_qualifiers.extend(qualifiers);
     }
 
-    /// Get a list of fields
+    /// Get a list of fields for this schema
     pub fn fields(&self) -> &Fields {
         &self.inner.fields
     }
 
-    /// Returns an immutable reference of a specific `Field` instance selected using an
-    /// offset within the internal `fields` vector
-    pub fn field(&self, i: usize) -> &Field {
+    /// Returns a reference to [`FieldRef`] for a column at specific index
+    /// within the schema.
+    ///
+    /// See also [Self::qualified_field] to get both qualifier and field
+    pub fn field(&self, i: usize) -> &FieldRef {
         &self.inner.fields[i]
     }
 
-    /// Returns an immutable reference of a specific `Field` instance selected using an
-    /// offset within the internal `fields` vector and its qualifier
-    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &Field) {
+    /// Returns the qualifier (if any) and [`FieldRef`] for a column at specific
+    /// index within the schema.
+    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &FieldRef) {
         (self.field_qualifiers[i].as_ref(), self.field(i))
     }
 
@@ -379,12 +418,12 @@ impl DFSchema {
             .is_some()
     }
 
-    /// Find the field with the given name
+    /// Find the [`FieldRef`] with the given name and optional qualifier
     pub fn field_with_name(
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<&Field> {
+    ) -> Result<&FieldRef> {
         if let Some(qualifier) = qualifier {
             self.field_with_qualified_name(qualifier, name)
         } else {
@@ -397,7 +436,7 @@ impl DFSchema {
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &FieldRef)> {
         if let Some(qualifier) = qualifier {
             let idx = self
                 .index_of_column_by_name(Some(qualifier), name)
@@ -409,10 +448,10 @@ impl DFSchema {
     }
 
     /// Find all fields having the given qualifier
-    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> {
+    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&FieldRef> {
         self.iter()
             .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false))
-            .map(|(_, f)| f.as_ref())
+            .map(|(_, f)| f)
             .collect()
     }
 
@@ -428,11 +467,10 @@ impl DFSchema {
     }
 
     /// Find all fields that match the given name
-    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> {
+    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&FieldRef> {
         self.fields()
             .iter()
             .filter(|field| field.name() == name)
-            .map(|f| f.as_ref())
             .collect()
     }
 
@@ -440,10 +478,9 @@ impl DFSchema {
     pub fn qualified_fields_with_unqualified_name(
         &self,
         name: &str,
-    ) -> Vec<(Option<&TableReference>, &Field)> {
+    ) -> Vec<(Option<&TableReference>, &FieldRef)> {
         self.iter()
             .filter(|(_, field)| field.name() == name)
-            .map(|(qualifier, field)| (qualifier, field.as_ref()))
             .collect()
     }
 
@@ -468,7 +505,7 @@ impl DFSchema {
     pub fn qualified_field_with_unqualified_name(
         &self,
         name: &str,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &FieldRef)> {
         let matches = self.qualified_fields_with_unqualified_name(name);
         match matches.len() {
             0 => Err(unqualified_field_not_found(name, self)),
@@ -489,7 +526,7 @@ impl DFSchema {
                     Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1))
                 } else {
                     _schema_err!(SchemaError::AmbiguousReference {
-                        field: Column::new_unqualified(name.to_string(),),
+                        field: Box::new(Column::new_unqualified(name.to_string()))
                     })
                 }
             }
@@ -497,7 +534,7 @@ impl DFSchema {
     }
 
     /// Find the field with the given name
-    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> {
+    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&FieldRef> {
         self.qualified_field_with_unqualified_name(name)
             .map(|(_, field)| field)
     }
@@ -507,7 +544,7 @@ impl DFSchema {
         &self,
         qualifier: &TableReference,
         name: &str,
-    ) -> Result<&Field> {
+    ) -> Result<&FieldRef> {
         let idx = self
             .index_of_column_by_name(Some(qualifier), name)
             .ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?;
@@ -519,7 +556,7 @@ impl DFSchema {
     pub fn qualified_field_from_column(
         &self,
         column: &Column,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &FieldRef)> {
         self.qualified_field_with_name(column.relation.as_ref(), &column.name)
     }
 
@@ -561,7 +598,7 @@ impl DFSchema {
         &self,
         arrow_schema: &Schema,
     ) -> Result<()> {
-        let self_arrow_schema: Schema = self.into();
+        let self_arrow_schema = self.as_arrow();
         self_arrow_schema
             .fields()
             .iter()
@@ -636,8 +673,8 @@ impl DFSchema {
                         ))
                     {
                         _plan_err!(
-                            "Schema mismatch: Expected field '{}' with type {:?}, \
-                            but got '{}' with type {:?}.",
+                            "Schema mismatch: Expected field '{}' with type {}, \
+                            but got '{}' with type {}.",
                             f1.name(),
                             f1.data_type(),
                             f2.name(),
@@ -661,10 +698,12 @@ impl DFSchema {
         // check nested fields
         match (dt1, dt2) {
             (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => {
-                v1.as_ref() == v2.as_ref()
+                Self::datatype_is_logically_equal(v1.as_ref(), v2.as_ref())
+            }
+            (DataType::Dictionary(_, v1), othertype)
+            | (othertype, DataType::Dictionary(_, v1)) => {
+                Self::datatype_is_logically_equal(v1.as_ref(), othertype)
             }
-            (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype,
-            (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype,
             (DataType::List(f1), DataType::List(f2))
             | (DataType::LargeList(f1), DataType::LargeList(f2))
             | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => {
@@ -714,7 +753,8 @@ impl DFSchema {
     }
 
     /// Returns true of two [`DataType`]s are semantically equal (same
-    /// name and type), ignoring both metadata and nullability, and decimal precision/scale.
+    /// name and type), ignoring both metadata and nullability, decimal precision/scale,
+    /// and timezone time units/timezones.
     ///
     /// request to upstream: <https://github.com/apache/arrow-rs/issues/3199>
     pub fn datatype_is_semantically_equal(dt1: &DataType, dt2: &DataType) -> bool {
@@ -765,6 +805,14 @@ impl DFSchema {
                         .zip(iter2)
                         .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_semantically_equal(f1, f2))
             }
+            (
+                DataType::Decimal32(_l_precision, _l_scale),
+                DataType::Decimal32(_r_precision, _r_scale),
+            ) => true,
+            (
+                DataType::Decimal64(_l_precision, _l_scale),
+                DataType::Decimal64(_r_precision, _r_scale),
+            ) => true,
             (
                 DataType::Decimal128(_l_precision, _l_scale),
                 DataType::Decimal128(_r_precision, _r_scale),
@@ -773,6 +821,10 @@ impl DFSchema {
                 DataType::Decimal256(_l_precision, _l_scale),
                 DataType::Decimal256(_r_precision, _r_scale),
             ) => true,
+            (
+                DataType::Timestamp(_l_time_unit, _l_timezone),
+                DataType::Timestamp(_r_time_unit, _r_timezone),
+            ) => true,
             _ => dt1 == dt2,
         }
     }
@@ -830,21 +882,216 @@ impl DFSchema {
             .zip(self.inner.fields().iter())
             .map(|(qualifier, field)| (qualifier.as_ref(), field))
     }
+    /// Returns a tree-like string representation of the schema.
+    ///
+    /// This method formats the schema
+    /// with a tree-like structure showing field names, types, and nullability.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_common::DFSchema;
+    /// use std::collections::HashMap;
+    ///
+    /// let schema = DFSchema::from_unqualified_fields(
+    ///     vec![
+    ///         Field::new("id", DataType::Int32, false),
+    ///         Field::new("name", DataType::Utf8, true),
+    ///     ]
+    ///     .into(),
+    ///     HashMap::new(),
+    /// )
+    /// .unwrap();
+    ///
+    /// assert_eq!(
+    ///     schema.tree_string().to_string(),
+    ///     r#"root
+    ///  |-- id: int32 (nullable = false)
+    ///  |-- name: utf8 (nullable = true)"#
+    /// );
+    /// ```
+    pub fn tree_string(&self) -> impl Display + '_ {
+        let mut result = String::from("root\n");
+
+        for (qualifier, field) in self.iter() {
+            let field_name = match qualifier {
+                Some(q) => format!("{}.{}", q, field.name()),
+                None => field.name().to_string(),
+            };
+
+            format_field_with_indent(
+                &mut result,
+                &field_name,
+                field.data_type(),
+                field.is_nullable(),
+                " ",
+            );
+        }
+
+        // Remove the trailing newline
+        if result.ends_with('\n') {
+            result.pop();
+        }
+
+        result
+    }
 }
 
-impl From<DFSchema> for Schema {
-    /// Convert DFSchema into a Schema
-    fn from(df_schema: DFSchema) -> Self {
-        let fields: Fields = df_schema.inner.fields.clone();
-        Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
+/// Format field with proper nested indentation for complex types
+fn format_field_with_indent(
+    result: &mut String,
+    field_name: &str,
+    data_type: &DataType,
+    nullable: bool,
+    indent: &str,
+) {
+    let nullable_str = nullable.to_string().to_lowercase();
+    let child_indent = format!("{indent}|    ");
+
+    match data_type {
+        DataType::List(field) => {
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: list (nullable = {nullable_str})\n"
+            ));
+            format_field_with_indent(
+                result,
+                field.name(),
+                field.data_type(),
+                field.is_nullable(),
+                &child_indent,
+            );
+        }
+        DataType::LargeList(field) => {
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: large list (nullable = {nullable_str})\n"
+            ));
+            format_field_with_indent(
+                result,
+                field.name(),
+                field.data_type(),
+                field.is_nullable(),
+                &child_indent,
+            );
+        }
+        DataType::FixedSizeList(field, _size) => {
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: fixed size list (nullable = {nullable_str})\n"
+            ));
+            format_field_with_indent(
+                result,
+                field.name(),
+                field.data_type(),
+                field.is_nullable(),
+                &child_indent,
+            );
+        }
+        DataType::Map(field, _) => {
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: map (nullable = {nullable_str})\n"
+            ));
+            if let DataType::Struct(inner_fields) = field.data_type()
+                && inner_fields.len() == 2
+            {
+                format_field_with_indent(
+                    result,
+                    "key",
+                    inner_fields[0].data_type(),
+                    inner_fields[0].is_nullable(),
+                    &child_indent,
+                );
+                let value_contains_null = field.is_nullable().to_string().to_lowercase();
+                // Handle complex value types properly
+                match inner_fields[1].data_type() {
+                    DataType::Struct(_)
+                    | DataType::List(_)
+                    | DataType::LargeList(_)
+                    | DataType::FixedSizeList(_, _)
+                    | DataType::Map(_, _) => {
+                        format_field_with_indent(
+                            result,
+                            "value",
+                            inner_fields[1].data_type(),
+                            inner_fields[1].is_nullable(),
+                            &child_indent,
+                        );
+                    }
+                    _ => {
+                        result.push_str(&format!("{child_indent}|-- value: {} (nullable = {value_contains_null})\n",
+                                format_simple_data_type(inner_fields[1].data_type())));
+                    }
+                }
+            }
+        }
+        DataType::Struct(fields) => {
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: struct (nullable = {nullable_str})\n"
+            ));
+            for struct_field in fields {
+                format_field_with_indent(
+                    result,
+                    struct_field.name(),
+                    struct_field.data_type(),
+                    struct_field.is_nullable(),
+                    &child_indent,
+                );
+            }
+        }
+        _ => {
+            let type_str = format_simple_data_type(data_type);
+            result.push_str(&format!(
+                "{indent}|-- {field_name}: {type_str} (nullable = {nullable_str})\n"
+            ));
+        }
     }
 }
 
-impl From<&DFSchema> for Schema {
-    /// Convert DFSchema reference into a Schema
-    fn from(df_schema: &DFSchema) -> Self {
-        let fields: Fields = df_schema.inner.fields.clone();
-        Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
+/// Format simple DataType in lowercase format (for leaf nodes)
+fn format_simple_data_type(data_type: &DataType) -> String {
+    match data_type {
+        DataType::Boolean => "boolean".to_string(),
+        DataType::Int8 => "int8".to_string(),
+        DataType::Int16 => "int16".to_string(),
+        DataType::Int32 => "int32".to_string(),
+        DataType::Int64 => "int64".to_string(),
+        DataType::UInt8 => "uint8".to_string(),
+        DataType::UInt16 => "uint16".to_string(),
+        DataType::UInt32 => "uint32".to_string(),
+        DataType::UInt64 => "uint64".to_string(),
+        DataType::Float16 => "float16".to_string(),
+        DataType::Float32 => "float32".to_string(),
+        DataType::Float64 => "float64".to_string(),
+        DataType::Utf8 => "utf8".to_string(),
+        DataType::LargeUtf8 => "large_utf8".to_string(),
+        DataType::Binary => "binary".to_string(),
+        DataType::LargeBinary => "large_binary".to_string(),
+        DataType::FixedSizeBinary(_) => "fixed_size_binary".to_string(),
+        DataType::Date32 => "date32".to_string(),
+        DataType::Date64 => "date64".to_string(),
+        DataType::Time32(_) => "time32".to_string(),
+        DataType::Time64(_) => "time64".to_string(),
+        DataType::Timestamp(_, tz) => match tz {
+            Some(tz_str) => format!("timestamp ({tz_str})"),
+            None => "timestamp".to_string(),
+        },
+        DataType::Interval(_) => "interval".to_string(),
+        DataType::Dictionary(_, value_type) => {
+            format_simple_data_type(value_type.as_ref())
+        }
+        DataType::Decimal32(precision, scale) => {
+            format!("decimal32({precision}, {scale})")
+        }
+        DataType::Decimal64(precision, scale) => {
+            format!("decimal64({precision}, {scale})")
+        }
+        DataType::Decimal128(precision, scale) => {
+            format!("decimal128({precision}, {scale})")
+        }
+        DataType::Decimal256(precision, scale) => {
+            format!("decimal256({precision}, {scale})")
+        }
+        DataType::Null => "null".to_string(),
+        _ => format!("{data_type}").to_lowercase(),
     }
 }
 
@@ -880,13 +1127,18 @@ impl TryFrom<SchemaRef> for DFSchema {
             field_qualifiers: vec![None; field_count],
             functional_dependencies: FunctionalDependencies::empty(),
         };
+        // Without checking names, because schema here may have duplicate field names.
+        // For example, Partial AggregateMode will generate duplicate field names from
+        // state_fields.
+        // See <https://github.com/apache/datafusion/issues/17715>
+        // dfschema.check_names()?;
         Ok(dfschema)
     }
 }
 
 impl From<DFSchema> for SchemaRef {
-    fn from(df_schema: DFSchema) -> Self {
-        SchemaRef::new(df_schema.into())
+    fn from(dfschema: DFSchema) -> Self {
+        Arc::clone(&dfschema.inner)
     }
 }
 
@@ -982,7 +1234,7 @@ pub trait ExprSchema: std::fmt::Debug {
     }
 
     // Return the column's field
-    fn field_from_column(&self, col: &Column) -> Result<&Field>;
+    fn field_from_column(&self, col: &Column) -> Result<&FieldRef>;
 }
 
 // Implement `ExprSchema` for `Arc<DFSchema>`
@@ -1003,13 +1255,13 @@ impl<P: AsRef<DFSchema> + std::fmt::Debug> ExprSchema for P {
         self.as_ref().data_type_and_nullable(col)
     }
 
-    fn field_from_column(&self, col: &Column) -> Result<&Field> {
+    fn field_from_column(&self, col: &Column) -> Result<&FieldRef> {
         self.as_ref().field_from_column(col)
     }
 }
 
 impl ExprSchema for DFSchema {
-    fn field_from_column(&self, col: &Column) -> Result<&Field> {
+    fn field_from_column(&self, col: &Column) -> Result<&FieldRef> {
         match &col.relation {
             Some(r) => self.field_with_qualified_name(r, &col.name),
             None => self.field_with_unqualified_name(&col.name),
@@ -1072,8 +1324,8 @@ impl SchemaExt for Schema {
                 .try_for_each(|(f1, f2)| {
                     if f1.name() != f2.name() || (!DFSchema::datatype_is_logically_equal(f1.data_type(), f2.data_type()) && !can_cast_types(f2.data_type(), f1.data_type())) {
                         _plan_err!(
-                            "Inserting query schema mismatch: Expected table field '{}' with type {:?}, \
-                            but got '{}' with type {:?}.",
+                            "Inserting query schema mismatch: Expected table field '{}' with type {}, \
+                            but got '{}' with type {}.",
                             f1.name(),
                             f1.data_type(),
                             f2.name(),
@@ -1179,10 +1431,8 @@ mod tests {
     #[test]
     fn from_qualified_schema_into_arrow_schema() -> Result<()> {
         let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
-        let arrow_schema: Schema = schema.into();
-        let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
-        Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }";
-        assert_eq!(expected, arrow_schema.to_string());
+        let arrow_schema = schema.as_arrow();
+        insta::assert_snapshot!(arrow_schema.to_string(), @r#"Field { "c0": nullable Boolean }, Field { "c1": nullable Boolean }"#);
         Ok(())
     }
 
@@ -1196,12 +1446,14 @@ mod tests {
             join.to_string()
         );
         // test valid access
-        assert!(join
-            .field_with_qualified_name(&TableReference::bare("t1"), "c0")
-            .is_ok());
-        assert!(join
-            .field_with_qualified_name(&TableReference::bare("t2"), "c0")
-            .is_ok());
+        assert!(
+            join.field_with_qualified_name(&TableReference::bare("t1"), "c0")
+                .is_ok()
+        );
+        assert!(
+            join.field_with_qualified_name(&TableReference::bare("t2"), "c0")
+                .is_ok()
+        );
         // test invalid access
         assert!(join.field_with_unqualified_name("c0").is_err());
         assert!(join.field_with_unqualified_name("t1.c0").is_err());
@@ -1243,18 +1495,20 @@ mod tests {
             join.to_string()
         );
         // test valid access
-        assert!(join
-            .field_with_qualified_name(&TableReference::bare("t1"), "c0")
-            .is_ok());
+        assert!(
+            join.field_with_qualified_name(&TableReference::bare("t1"), "c0")
+                .is_ok()
+        );
         assert!(join.field_with_unqualified_name("c0").is_ok());
         assert!(join.field_with_unqualified_name("c100").is_ok());
         assert!(join.field_with_name(None, "c100").is_ok());
         // test invalid access
         assert!(join.field_with_unqualified_name("t1.c0").is_err());
         assert!(join.field_with_unqualified_name("t1.c100").is_err());
-        assert!(join
-            .field_with_qualified_name(&TableReference::bare(""), "c100")
-            .is_err());
+        assert!(
+            join.field_with_qualified_name(&TableReference::bare(""), "c100")
+                .is_err()
+        );
         Ok(())
     }
 
@@ -1263,9 +1517,11 @@ mod tests {
         let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
         let right = DFSchema::try_from(test_schema_1())?;
         let join = left.join(&right);
-        assert_contains!(join.unwrap_err().to_string(),
-                         "Schema error: Schema contains qualified \
-                          field name t1.c0 and unqualified field name c0 which would be ambiguous");
+        assert_contains!(
+            join.unwrap_err().to_string(),
+            "Schema error: Schema contains qualified \
+                          field name t1.c0 and unqualified field name c0 which would be ambiguous"
+        );
         Ok(())
     }
 
@@ -1544,6 +1800,27 @@ mod tests {
             &DataType::Utf8,
             &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
         ));
+
+        // Dictionary is logically equal to the logically equivalent value type
+        assert!(DFSchema::datatype_is_logically_equal(
+            &DataType::Utf8View,
+            &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
+        ));
+
+        assert!(DFSchema::datatype_is_logically_equal(
+            &DataType::Dictionary(
+                Box::new(DataType::Int32),
+                Box::new(DataType::List(
+                    Field::new("element", DataType::Utf8, false).into()
+                ))
+            ),
+            &DataType::Dictionary(
+                Box::new(DataType::Int32),
+                Box::new(DataType::List(
+                    Field::new("element", DataType::Utf8View, false).into()
+                ))
+            )
+        ));
     }
 
     #[test]
@@ -1558,6 +1835,36 @@ mod tests {
             &DataType::Int16
         ));
 
+        // Succeeds if decimal precision and scale are different
+        assert!(DFSchema::datatype_is_semantically_equal(
+            &DataType::Decimal32(1, 2),
+            &DataType::Decimal32(2, 1),
+        ));
+
+        assert!(DFSchema::datatype_is_semantically_equal(
+            &DataType::Decimal64(1, 2),
+            &DataType::Decimal64(2, 1),
+        ));
+
+        assert!(DFSchema::datatype_is_semantically_equal(
+            &DataType::Decimal128(1, 2),
+            &DataType::Decimal128(2, 1),
+        ));
+
+        assert!(DFSchema::datatype_is_semantically_equal(
+            &DataType::Decimal256(1, 2),
+            &DataType::Decimal256(2, 1),
+        ));
+
+        // Any two timestamp types should match
+        assert!(DFSchema::datatype_is_semantically_equal(
+            &DataType::Timestamp(
+                arrow::datatypes::TimeUnit::Microsecond,
+                Some("UTC".into())
+            ),
+            &DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None),
+        ));
+
         // Test lists
 
         // Succeeds if both have the same element type, disregards names and nullability
@@ -1700,4 +2007,488 @@ mod tests {
     fn test_metadata_n(n: usize) -> HashMap<String, String> {
         (0..n).map(|i| (format!("k{i}"), format!("v{i}"))).collect()
     }
+
+    #[test]
+    fn test_print_schema_unqualified() {
+        let schema = DFSchema::from_unqualified_fields(
+            vec![
+                Field::new("id", DataType::Int32, false),
+                Field::new("name", DataType::Utf8, true),
+                Field::new("age", DataType::Int64, true),
+                Field::new("active", DataType::Boolean, false),
+            ]
+            .into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- id: int32 (nullable = false)
+         |-- name: utf8 (nullable = true)
+         |-- age: int64 (nullable = true)
+         |-- active: boolean (nullable = false)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_qualified() {
+        let schema = DFSchema::try_from_qualified_schema(
+            "table1",
+            &Schema::new(vec![
+                Field::new("id", DataType::Int32, false),
+                Field::new("name", DataType::Utf8, true),
+            ]),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- table1.id: int32 (nullable = false)
+         |-- table1.name: utf8 (nullable = true)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_complex_types() {
+        let struct_field = Field::new(
+            "address",
+            DataType::Struct(Fields::from(vec![
+                Field::new("street", DataType::Utf8, true),
+                Field::new("city", DataType::Utf8, true),
+            ])),
+            true,
+        );
+
+        let list_field = Field::new(
+            "tags",
+            DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
+            true,
+        );
+
+        let schema = DFSchema::from_unqualified_fields(
+            vec![
+                Field::new("id", DataType::Int32, false),
+                struct_field,
+                list_field,
+                Field::new("score", DataType::Decimal128(10, 2), true),
+            ]
+            .into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- id: int32 (nullable = false)
+         |-- address: struct (nullable = true)
+         |    |-- street: utf8 (nullable = true)
+         |    |-- city: utf8 (nullable = true)
+         |-- tags: list (nullable = true)
+         |    |-- item: utf8 (nullable = true)
+         |-- score: decimal128(10, 2) (nullable = true)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_empty() {
+        let schema = DFSchema::empty();
+        let output = schema.tree_string();
+        insta::assert_snapshot!(output, @"root");
+    }
+
+    #[test]
+    fn test_print_schema_deeply_nested_types() {
+        // Create a deeply nested structure to test indentation and complex type formatting
+        let inner_struct = Field::new(
+            "inner",
+            DataType::Struct(Fields::from(vec![
+                Field::new("level1", DataType::Utf8, true),
+                Field::new("level2", DataType::Int32, false),
+            ])),
+            true,
+        );
+
+        let nested_list = Field::new(
+            "nested_list",
+            DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Struct(Fields::from(vec![
+                    Field::new("id", DataType::Int64, false),
+                    Field::new("value", DataType::Float64, true),
+                ])),
+                true,
+            ))),
+            true,
+        );
+
+        let map_field = Field::new(
+            "map_data",
+            DataType::Map(
+                Arc::new(Field::new(
+                    "entries",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("key", DataType::Utf8, false),
+                        Field::new(
+                            "value",
+                            DataType::List(Arc::new(Field::new(
+                                "item",
+                                DataType::Int32,
+                                true,
+                            ))),
+                            true,
+                        ),
+                    ])),
+                    false,
+                )),
+                false,
+            ),
+            true,
+        );
+
+        let schema = DFSchema::from_unqualified_fields(
+            vec![
+                Field::new("simple_field", DataType::Utf8, true),
+                inner_struct,
+                nested_list,
+                map_field,
+                Field::new(
+                    "timestamp_field",
+                    DataType::Timestamp(
+                        arrow::datatypes::TimeUnit::Microsecond,
+                        Some("UTC".into()),
+                    ),
+                    false,
+                ),
+            ]
+            .into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- simple_field: utf8 (nullable = true)
+         |-- inner: struct (nullable = true)
+         |    |-- level1: utf8 (nullable = true)
+         |    |-- level2: int32 (nullable = false)
+         |-- nested_list: list (nullable = true)
+         |    |-- item: struct (nullable = true)
+         |    |    |-- id: int64 (nullable = false)
+         |    |    |-- value: float64 (nullable = true)
+         |-- map_data: map (nullable = true)
+         |    |-- key: utf8 (nullable = false)
+         |    |-- value: list (nullable = true)
+         |    |    |-- item: int32 (nullable = true)
+         |-- timestamp_field: timestamp (UTC) (nullable = false)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_mixed_qualified_unqualified() {
+        // Test a schema with mixed qualified and unqualified fields
+        let schema = DFSchema::new_with_metadata(
+            vec![
+                (
+                    Some("table1".into()),
+                    Arc::new(Field::new("id", DataType::Int32, false)),
+                ),
+                (None, Arc::new(Field::new("name", DataType::Utf8, true))),
+                (
+                    Some("table2".into()),
+                    Arc::new(Field::new("score", DataType::Float64, true)),
+                ),
+                (
+                    None,
+                    Arc::new(Field::new("active", DataType::Boolean, false)),
+                ),
+            ],
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- table1.id: int32 (nullable = false)
+         |-- name: utf8 (nullable = true)
+         |-- table2.score: float64 (nullable = true)
+         |-- active: boolean (nullable = false)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_array_of_map() {
+        // Test the specific example from user feedback: array of map
+        let map_field = Field::new(
+            "entries",
+            DataType::Struct(Fields::from(vec![
+                Field::new("key", DataType::Utf8, false),
+                Field::new("value", DataType::Utf8, false),
+            ])),
+            false,
+        );
+
+        let array_of_map_field = Field::new(
+            "array_map_field",
+            DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Map(Arc::new(map_field), false),
+                false,
+            ))),
+            false,
+        );
+
+        let schema = DFSchema::from_unqualified_fields(
+            vec![array_of_map_field].into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- array_map_field: list (nullable = false)
+         |    |-- item: map (nullable = false)
+         |    |    |-- key: utf8 (nullable = false)
+         |    |    |-- value: utf8 (nullable = false)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_complex_type_combinations() {
+        // Test various combinations of list, struct, and map types
+
+        // List of structs
+        let list_of_structs = Field::new(
+            "list_of_structs",
+            DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Struct(Fields::from(vec![
+                    Field::new("id", DataType::Int32, false),
+                    Field::new("name", DataType::Utf8, true),
+                    Field::new("score", DataType::Float64, true),
+                ])),
+                true,
+            ))),
+            true,
+        );
+
+        // Struct containing lists
+        let struct_with_lists = Field::new(
+            "struct_with_lists",
+            DataType::Struct(Fields::from(vec![
+                Field::new(
+                    "tags",
+                    DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
+                    true,
+                ),
+                Field::new(
+                    "scores",
+                    DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
+                    false,
+                ),
+                Field::new("metadata", DataType::Utf8, true),
+            ])),
+            false,
+        );
+
+        // Map with struct values
+        let map_with_struct_values = Field::new(
+            "map_with_struct_values",
+            DataType::Map(
+                Arc::new(Field::new(
+                    "entries",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("key", DataType::Utf8, false),
+                        Field::new(
+                            "value",
+                            DataType::Struct(Fields::from(vec![
+                                Field::new("count", DataType::Int64, false),
+                                Field::new("active", DataType::Boolean, true),
+                            ])),
+                            true,
+                        ),
+                    ])),
+                    false,
+                )),
+                false,
+            ),
+            true,
+        );
+
+        // List of maps
+        let list_of_maps = Field::new(
+            "list_of_maps",
+            DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Map(
+                    Arc::new(Field::new(
+                        "entries",
+                        DataType::Struct(Fields::from(vec![
+                            Field::new("key", DataType::Utf8, false),
+                            Field::new("value", DataType::Int32, true),
+                        ])),
+                        false,
+                    )),
+                    false,
+                ),
+                true,
+            ))),
+            true,
+        );
+
+        // Deeply nested: struct containing list of structs containing maps
+        let deeply_nested = Field::new(
+            "deeply_nested",
+            DataType::Struct(Fields::from(vec![
+                Field::new("level1", DataType::Utf8, true),
+                Field::new(
+                    "level2",
+                    DataType::List(Arc::new(Field::new(
+                        "item",
+                        DataType::Struct(Fields::from(vec![
+                            Field::new("id", DataType::Int32, false),
+                            Field::new(
+                                "properties",
+                                DataType::Map(
+                                    Arc::new(Field::new(
+                                        "entries",
+                                        DataType::Struct(Fields::from(vec![
+                                            Field::new("key", DataType::Utf8, false),
+                                            Field::new("value", DataType::Float64, true),
+                                        ])),
+                                        false,
+                                    )),
+                                    false,
+                                ),
+                                true,
+                            ),
+                        ])),
+                        true,
+                    ))),
+                    false,
+                ),
+            ])),
+            true,
+        );
+
+        let schema = DFSchema::from_unqualified_fields(
+            vec![
+                list_of_structs,
+                struct_with_lists,
+                map_with_struct_values,
+                list_of_maps,
+                deeply_nested,
+            ]
+            .into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- list_of_structs: list (nullable = true)
+         |    |-- item: struct (nullable = true)
+         |    |    |-- id: int32 (nullable = false)
+         |    |    |-- name: utf8 (nullable = true)
+         |    |    |-- score: float64 (nullable = true)
+         |-- struct_with_lists: struct (nullable = false)
+         |    |-- tags: list (nullable = true)
+         |    |    |-- item: utf8 (nullable = true)
+         |    |-- scores: list (nullable = false)
+         |    |    |-- item: int32 (nullable = true)
+         |    |-- metadata: utf8 (nullable = true)
+         |-- map_with_struct_values: map (nullable = true)
+         |    |-- key: utf8 (nullable = false)
+         |    |-- value: struct (nullable = true)
+         |    |    |-- count: int64 (nullable = false)
+         |    |    |-- active: boolean (nullable = true)
+         |-- list_of_maps: list (nullable = true)
+         |    |-- item: map (nullable = true)
+         |    |    |-- key: utf8 (nullable = false)
+         |    |    |-- value: int32 (nullable = false)
+         |-- deeply_nested: struct (nullable = true)
+         |    |-- level1: utf8 (nullable = true)
+         |    |-- level2: list (nullable = false)
+         |    |    |-- item: struct (nullable = true)
+         |    |    |    |-- id: int32 (nullable = false)
+         |    |    |    |-- properties: map (nullable = true)
+         |    |    |    |    |-- key: utf8 (nullable = false)
+         |    |    |    |    |-- value: float64 (nullable = false)
+        ");
+    }
+
+    #[test]
+    fn test_print_schema_edge_case_types() {
+        // Test edge cases and special types
+        let schema = DFSchema::from_unqualified_fields(
+            vec![
+                Field::new("null_field", DataType::Null, true),
+                Field::new("binary_field", DataType::Binary, false),
+                Field::new("large_binary", DataType::LargeBinary, true),
+                Field::new("large_utf8", DataType::LargeUtf8, false),
+                Field::new("fixed_size_binary", DataType::FixedSizeBinary(16), true),
+                Field::new(
+                    "fixed_size_list",
+                    DataType::FixedSizeList(
+                        Arc::new(Field::new("item", DataType::Int32, true)),
+                        5,
+                    ),
+                    false,
+                ),
+                Field::new("decimal32", DataType::Decimal32(9, 4), true),
+                Field::new("decimal64", DataType::Decimal64(9, 4), true),
+                Field::new("decimal128", DataType::Decimal128(18, 4), true),
+                Field::new("decimal256", DataType::Decimal256(38, 10), false),
+                Field::new("date32", DataType::Date32, true),
+                Field::new("date64", DataType::Date64, false),
+                Field::new(
+                    "time32_seconds",
+                    DataType::Time32(arrow::datatypes::TimeUnit::Second),
+                    true,
+                ),
+                Field::new(
+                    "time64_nanoseconds",
+                    DataType::Time64(arrow::datatypes::TimeUnit::Nanosecond),
+                    false,
+                ),
+            ]
+            .into(),
+            HashMap::new(),
+        )
+        .unwrap();
+
+        let output = schema.tree_string();
+
+        insta::assert_snapshot!(output, @r"
+        root
+         |-- null_field: null (nullable = true)
+         |-- binary_field: binary (nullable = false)
+         |-- large_binary: large_binary (nullable = true)
+         |-- large_utf8: large_utf8 (nullable = false)
+         |-- fixed_size_binary: fixed_size_binary (nullable = true)
+         |-- fixed_size_list: fixed size list (nullable = false)
+         |    |-- item: int32 (nullable = true)
+         |-- decimal32: decimal32(9, 4) (nullable = true)
+         |-- decimal64: decimal64(9, 4) (nullable = true)
+         |-- decimal128: decimal128(18, 4) (nullable = true)
+         |-- decimal256: decimal256(38, 10) (nullable = false)
+         |-- date32: date32 (nullable = true)
+         |-- date64: date64 (nullable = false)
+         |-- time32_seconds: time32 (nullable = true)
+         |-- time64_nanoseconds: time64 (nullable = false)
+        ");
+    }
 }
diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs
index 0dce8e6a56eca..b25bf1c12e44a 100644
--- a/datafusion/common/src/diagnostic.rs
+++ b/datafusion/common/src/diagnostic.rs
@@ -30,8 +30,11 @@ use crate::Span;
 /// ```rust
 /// # use datafusion_common::{Location, Span, Diagnostic};
 /// let span = Some(Span {
-///     start: Location{ line: 2, column: 1 },
-///     end: Location{ line: 4, column: 15 }
+///     start: Location { line: 2, column: 1 },
+///     end: Location {
+///         line: 4,
+///         column: 15,
+///     },
 /// });
 /// let diagnostic = Diagnostic::new_error("Something went wrong", span)
 ///     .with_help("Have you tried turning it on and off again?", None);
diff --git a/datafusion/common/src/display/human_readable.rs b/datafusion/common/src/display/human_readable.rs
new file mode 100644
index 0000000000000..0e0d677bd8904
--- /dev/null
+++ b/datafusion/common/src/display/human_readable.rs
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Helpers for rendering sizes, counts, and durations in human readable form.
+
+/// Common data size units
+pub mod units {
+    pub const TB: u64 = 1 << 40;
+    pub const GB: u64 = 1 << 30;
+    pub const MB: u64 = 1 << 20;
+    pub const KB: u64 = 1 << 10;
+}
+
+/// Present size in human-readable form
+pub fn human_readable_size(size: usize) -> String {
+    use units::*;
+
+    let size = size as u64;
+    let (value, unit) = {
+        if size >= 2 * TB {
+            (size as f64 / TB as f64, "TB")
+        } else if size >= 2 * GB {
+            (size as f64 / GB as f64, "GB")
+        } else if size >= 2 * MB {
+            (size as f64 / MB as f64, "MB")
+        } else if size >= 2 * KB {
+            (size as f64 / KB as f64, "KB")
+        } else {
+            (size as f64, "B")
+        }
+    };
+    format!("{value:.1} {unit}")
+}
+
+/// Present count in human-readable form with K, M, B, T suffixes
+pub fn human_readable_count(count: usize) -> String {
+    let count = count as u64;
+    let (value, unit) = {
+        if count >= 1_000_000_000_000 {
+            (count as f64 / 1_000_000_000_000.0, " T")
+        } else if count >= 1_000_000_000 {
+            (count as f64 / 1_000_000_000.0, " B")
+        } else if count >= 1_000_000 {
+            (count as f64 / 1_000_000.0, " M")
+        } else if count >= 1_000 {
+            (count as f64 / 1_000.0, " K")
+        } else {
+            return count.to_string();
+        }
+    };
+
+    // Format with appropriate precision
+    // For values >= 100, show 1 decimal place (e.g., 123.4 K)
+    // For values < 100, show 2 decimal places (e.g., 10.12 K)
+    if value >= 100.0 {
+        format!("{value:.1}{unit}")
+    } else {
+        format!("{value:.2}{unit}")
+    }
+}
+
+/// Present duration in human-readable form with 2 decimal places
+pub fn human_readable_duration(nanos: u64) -> String {
+    const NANOS_PER_SEC: f64 = 1_000_000_000.0;
+    const NANOS_PER_MILLI: f64 = 1_000_000.0;
+    const NANOS_PER_MICRO: f64 = 1_000.0;
+
+    let nanos_f64 = nanos as f64;
+
+    if nanos >= 1_000_000_000 {
+        // >= 1 second: show in seconds
+        format!("{:.2}s", nanos_f64 / NANOS_PER_SEC)
+    } else if nanos >= 1_000_000 {
+        // >= 1 millisecond: show in milliseconds
+        format!("{:.2}ms", nanos_f64 / NANOS_PER_MILLI)
+    } else if nanos >= 1_000 {
+        // >= 1 microsecond: show in microseconds
+        format!("{:.2}µs", nanos_f64 / NANOS_PER_MICRO)
+    } else {
+        // < 1 microsecond: show in nanoseconds
+        format!("{nanos}ns")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_human_readable_count() {
+        assert_eq!(human_readable_count(0), "0");
+        assert_eq!(human_readable_count(1), "1");
+        assert_eq!(human_readable_count(999), "999");
+        assert_eq!(human_readable_count(1_000), "1.00 K");
+        assert_eq!(human_readable_count(10_100), "10.10 K");
+        assert_eq!(human_readable_count(1_532), "1.53 K");
+        assert_eq!(human_readable_count(99_999), "100.00 K");
+        assert_eq!(human_readable_count(1_000_000), "1.00 M");
+        assert_eq!(human_readable_count(1_532_000), "1.53 M");
+        assert_eq!(human_readable_count(99_000_000), "99.00 M");
+        assert_eq!(human_readable_count(123_456_789), "123.5 M");
+        assert_eq!(human_readable_count(1_000_000_000), "1.00 B");
+        assert_eq!(human_readable_count(1_532_000_000), "1.53 B");
+        assert_eq!(human_readable_count(999_999_999_999), "1000.0 B");
+        assert_eq!(human_readable_count(1_000_000_000_000), "1.00 T");
+        assert_eq!(human_readable_count(42_000_000_000_000), "42.00 T");
+    }
+
+    #[test]
+    fn test_human_readable_duration() {
+        assert_eq!(human_readable_duration(0), "0ns");
+        assert_eq!(human_readable_duration(1), "1ns");
+        assert_eq!(human_readable_duration(999), "999ns");
+        assert_eq!(human_readable_duration(1_000), "1.00µs");
+        assert_eq!(human_readable_duration(1_234), "1.23µs");
+        assert_eq!(human_readable_duration(999_999), "1000.00µs");
+        assert_eq!(human_readable_duration(1_000_000), "1.00ms");
+        assert_eq!(human_readable_duration(11_295_377), "11.30ms");
+        assert_eq!(human_readable_duration(1_234_567), "1.23ms");
+        assert_eq!(human_readable_duration(999_999_999), "1000.00ms");
+        assert_eq!(human_readable_duration(1_000_000_000), "1.00s");
+        assert_eq!(human_readable_duration(1_234_567_890), "1.23s");
+        assert_eq!(human_readable_duration(42_000_000_000), "42.00s");
+    }
+}
diff --git a/datafusion/common/src/display/mod.rs b/datafusion/common/src/display/mod.rs
index bad51c45f8ee8..a6a97b243f06a 100644
--- a/datafusion/common/src/display/mod.rs
+++ b/datafusion/common/src/display/mod.rs
@@ -18,6 +18,7 @@
 //! Types for plan display
 
 mod graphviz;
+pub mod human_readable;
 pub use graphviz::*;
 
 use std::{
diff --git a/datafusion/common/src/encryption.rs b/datafusion/common/src/encryption.rs
new file mode 100644
index 0000000000000..2a8cfdbc89966
--- /dev/null
+++ b/datafusion/common/src/encryption.rs
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Support optional features for encryption in Parquet files.
+//! This module provides types and functions related to encryption in Parquet files.
+
+#[cfg(feature = "parquet_encryption")]
+pub use parquet::encryption::decrypt::FileDecryptionProperties;
+#[cfg(feature = "parquet_encryption")]
+pub use parquet::encryption::encrypt::FileEncryptionProperties;
+
+#[cfg(not(feature = "parquet_encryption"))]
+#[derive(Default, Clone, Debug)]
+pub struct FileDecryptionProperties;
+#[cfg(not(feature = "parquet_encryption"))]
+#[derive(Default, Clone, Debug)]
+pub struct FileEncryptionProperties;
+
+pub use crate::config::{ConfigFileDecryptionProperties, ConfigFileEncryptionProperties};
diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs
index b4a537fdce7ee..b7a30f868a02b 100644
--- a/datafusion/common/src/error.rs
+++ b/datafusion/common/src/error.rs
@@ -15,7 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! DataFusion error types
+//! # Error Handling in DataFusion
+//!
+//! In DataFusion, there are two types of errors that can be raised:
+//!
+//! 1. Expected errors – These indicate invalid operations performed by the caller,
+//!    such as attempting to open a non-existent file. Different categories exist to
+//!    distinguish their sources (e.g., [`DataFusionError::ArrowError`],
+//!    [`DataFusionError::IoError`], etc.).
+//!
+//! 2. Unexpected errors – Represented by [`DataFusionError::Internal`], these
+//!    indicate that an internal invariant has been broken, suggesting a potential
+//!    bug in the system.
+//!
+//! There are several convenient macros for throwing errors. For example, use
+//! `exec_err!` for expected errors.
+//! For invariant checks, you can use `assert_or_internal_err!`,
+//! `assert_eq_or_internal_err!`, `assert_ne_or_internal_err!` for easier assertions.
+//! On the performance-critical path, use `debug_assert!` instead to reduce overhead.
+
 #[cfg(feature = "backtrace")]
 use std::backtrace::{Backtrace, BacktraceStatus};
 
@@ -35,6 +53,7 @@ use apache_avro::Error as AvroError;
 use arrow::error::ArrowError;
 #[cfg(feature = "parquet")]
 use parquet::errors::ParquetError;
+#[cfg(feature = "sql")]
 use sqlparser::parser::ParserError;
 use tokio::task::JoinError;
 
@@ -53,22 +72,23 @@ pub enum DataFusionError {
     /// Error returned by arrow.
     ///
     /// 2nd argument is for optional backtrace
-    ArrowError(ArrowError, Option<String>),
+    ArrowError(Box<ArrowError>, Option<String>),
     /// Error when reading / writing Parquet data.
     #[cfg(feature = "parquet")]
-    ParquetError(ParquetError),
+    ParquetError(Box<ParquetError>),
     /// Error when reading Avro data.
     #[cfg(feature = "avro")]
     AvroError(Box<AvroError>),
     /// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile)
     #[cfg(feature = "object_store")]
-    ObjectStore(object_store::Error),
+    ObjectStore(Box<object_store::Error>),
     /// Error when an I/O operation fails
     IoError(io::Error),
     /// Error when SQL is syntactically incorrect.
     ///
     /// 2nd argument is for optional backtrace
-    SQL(ParserError, Option<String>),
+    #[cfg(feature = "sql")]
+    SQL(Box<ParserError>, Option<String>),
     /// Error when a feature is not yet implemented.
     ///
     /// These errors are sometimes returned for features that are still in
@@ -107,7 +127,7 @@ pub enum DataFusionError {
     ///
     /// 2nd argument is for optional backtrace
     /// Boxing the optional backtrace to prevent <https://rust-lang.github.io/rust-clippy/master/index.html#/result_large_err>
-    SchemaError(SchemaError, Box<Option<String>>),
+    SchemaError(Box<SchemaError>, Box<Option<String>>),
     /// Error during execution of the query.
     ///
     /// This error is returned when an error happens during execution due to a
@@ -118,7 +138,7 @@ pub enum DataFusionError {
     /// [`JoinError`] during execution of the query.
     ///
     /// This error can't occur for unjoined tasks, such as execution shutdown.
-    ExecutionJoin(JoinError),
+    ExecutionJoin(Box<JoinError>),
     /// Error when resources (such as memory of scratch disk space) are exhausted.
     ///
     /// This error is thrown when a consumer cannot acquire additional memory
@@ -151,6 +171,10 @@ pub enum DataFusionError {
     /// to multiple receivers. For example, when the source of a repartition
     /// errors and the error is propagated to multiple consumers.
     Shared(Arc<DataFusionError>),
+    /// An error that originated during a foreign function interface call.
+    /// Transferring errors across the FFI boundary is difficult, so the original
+    /// error will be converted to a string.
+    Ffi(String),
 }
 
 #[macro_export]
@@ -164,7 +188,7 @@ macro_rules! context {
 #[derive(Debug)]
 pub enum SchemaError {
     /// Schema contains a (possibly) qualified and unqualified field with same unqualified name
-    AmbiguousReference { field: Column },
+    AmbiguousReference { field: Box<Column> },
     /// Schema contains duplicate qualified field name
     DuplicateQualifiedField {
         qualifier: Box<TableReference>,
@@ -276,14 +300,14 @@ impl From<io::Error> for DataFusionError {
 
 impl From<ArrowError> for DataFusionError {
     fn from(e: ArrowError) -> Self {
-        DataFusionError::ArrowError(e, None)
+        DataFusionError::ArrowError(Box::new(e), Some(DataFusionError::get_back_trace()))
     }
 }
 
 impl From<DataFusionError> for ArrowError {
     fn from(e: DataFusionError) -> Self {
         match e {
-            DataFusionError::ArrowError(e, _) => e,
+            DataFusionError::ArrowError(e, _) => *e,
             DataFusionError::External(e) => ArrowError::ExternalError(e),
             other => ArrowError::ExternalError(Box::new(other)),
         }
@@ -304,7 +328,7 @@ impl From<&Arc<DataFusionError>> for DataFusionError {
 #[cfg(feature = "parquet")]
 impl From<ParquetError> for DataFusionError {
     fn from(e: ParquetError) -> Self {
-        DataFusionError::ParquetError(e)
+        DataFusionError::ParquetError(Box::new(e))
     }
 }
 
@@ -318,20 +342,21 @@ impl From<AvroError> for DataFusionError {
 #[cfg(feature = "object_store")]
 impl From<object_store::Error> for DataFusionError {
     fn from(e: object_store::Error) -> Self {
-        DataFusionError::ObjectStore(e)
+        DataFusionError::ObjectStore(Box::new(e))
     }
 }
 
 #[cfg(feature = "object_store")]
 impl From<object_store::path::Error> for DataFusionError {
     fn from(e: object_store::path::Error) -> Self {
-        DataFusionError::ObjectStore(e.into())
+        DataFusionError::ObjectStore(Box::new(e.into()))
     }
 }
 
+#[cfg(feature = "sql")]
 impl From<ParserError> for DataFusionError {
     fn from(e: ParserError) -> Self {
-        DataFusionError::SQL(e, None)
+        DataFusionError::SQL(Box::new(e), None)
     }
 }
 
@@ -361,22 +386,23 @@ impl Display for DataFusionError {
 impl Error for DataFusionError {
     fn source(&self) -> Option<&(dyn Error + 'static)> {
         match self {
-            DataFusionError::ArrowError(e, _) => Some(e),
+            DataFusionError::ArrowError(e, _) => Some(e.as_ref()),
             #[cfg(feature = "parquet")]
-            DataFusionError::ParquetError(e) => Some(e),
+            DataFusionError::ParquetError(e) => Some(e.as_ref()),
             #[cfg(feature = "avro")]
-            DataFusionError::AvroError(e) => Some(e),
+            DataFusionError::AvroError(e) => Some(e.as_ref()),
             #[cfg(feature = "object_store")]
-            DataFusionError::ObjectStore(e) => Some(e),
+            DataFusionError::ObjectStore(e) => Some(e.as_ref()),
             DataFusionError::IoError(e) => Some(e),
-            DataFusionError::SQL(e, _) => Some(e),
+            #[cfg(feature = "sql")]
+            DataFusionError::SQL(e, _) => Some(e.as_ref()),
             DataFusionError::NotImplemented(_) => None,
             DataFusionError::Internal(_) => None,
             DataFusionError::Configuration(_) => None,
             DataFusionError::Plan(_) => None,
-            DataFusionError::SchemaError(e, _) => Some(e),
+            DataFusionError::SchemaError(e, _) => Some(e.as_ref()),
             DataFusionError::Execution(_) => None,
-            DataFusionError::ExecutionJoin(e) => Some(e),
+            DataFusionError::ExecutionJoin(e) => Some(e.as_ref()),
             DataFusionError::ResourcesExhausted(_) => None,
             DataFusionError::External(e) => Some(e.as_ref()),
             DataFusionError::Context(_, e) => Some(e.as_ref()),
@@ -391,6 +417,7 @@ impl Error for DataFusionError {
             // can't be executed.
             DataFusionError::Collection(errs) => errs.first().map(|e| e as &dyn Error),
             DataFusionError::Shared(e) => Some(e.as_ref()),
+            DataFusionError::Ffi(_) => None,
         }
     }
 }
@@ -451,12 +478,13 @@ impl DataFusionError {
     /// If backtrace enabled then error has a format "message" [`Self::BACK_TRACE_SEP`] "backtrace"
     /// The method strips the backtrace and outputs "message"
     pub fn strip_backtrace(&self) -> String {
-        self.to_string()
+        (*self
+            .to_string()
             .split(Self::BACK_TRACE_SEP)
             .collect::<Vec<&str>>()
             .first()
-            .unwrap_or(&"")
-            .to_string()
+            .unwrap_or(&""))
+        .to_string()
     }
 
     /// To enable optional rust backtrace in DataFusion:
@@ -497,6 +525,7 @@ impl DataFusionError {
             #[cfg(feature = "object_store")]
             DataFusionError::ObjectStore(_) => "Object Store error: ",
             DataFusionError::IoError(_) => "IO error: ",
+            #[cfg(feature = "sql")]
             DataFusionError::SQL(_, _) => "SQL error: ",
             DataFusionError::NotImplemented(_) => {
                 "This feature is not implemented: "
@@ -520,10 +549,11 @@ impl DataFusionError {
                 errs.first().expect("cannot construct DataFusionError::Collection with 0 errors, but got one such case").error_prefix()
             }
             DataFusionError::Shared(_) => "",
+            DataFusionError::Ffi(_) => "FFI error: ",
         }
     }
 
-    pub fn message(&self) -> Cow<str> {
+    pub fn message(&self) -> Cow<'_, str> {
         match *self {
             DataFusionError::ArrowError(ref desc, ref backtrace) => {
                 let backtrace = backtrace.clone().unwrap_or_else(|| "".to_owned());
@@ -534,6 +564,7 @@ impl DataFusionError {
             #[cfg(feature = "avro")]
             DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()),
             DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()),
+            #[cfg(feature = "sql")]
             DataFusionError::SQL(ref desc, ref backtrace) => {
                 let backtrace: String =
                     backtrace.clone().unwrap_or_else(|| "".to_owned());
@@ -542,8 +573,9 @@ impl DataFusionError {
             DataFusionError::Configuration(ref desc) => Cow::Owned(desc.to_string()),
             DataFusionError::NotImplemented(ref desc) => Cow::Owned(desc.to_string()),
             DataFusionError::Internal(ref desc) => Cow::Owned(format!(
-                "{desc}.\nThis was likely caused by a bug in DataFusion's \
-            code and we would welcome that you file an bug report in our issue tracker"
+                "{desc}.\nThis issue was likely caused by a bug in DataFusion's code. \
+                Please help us to resolve this by filing a bug report in our issue tracker: \
+                https://github.com/apache/datafusion/issues"
             )),
             DataFusionError::Plan(ref desc) => Cow::Owned(desc.to_string()),
             DataFusionError::SchemaError(ref desc, ref backtrace) => {
@@ -570,6 +602,7 @@ impl DataFusionError {
                 .expect("cannot construct DataFusionError::Collection with 0 errors")
                 .message(),
             DataFusionError::Shared(ref desc) => Cow::Owned(desc.to_string()),
+            DataFusionError::Ffi(ref desc) => Cow::Owned(desc.to_string()),
         }
     }
 
@@ -676,7 +709,10 @@ impl DataFusionError {
 /// let mut builder = DataFusionError::builder();
 /// builder.add_error(DataFusionError::Internal("foo".to_owned()));
 /// // ok_or returns the value if no errors have been added
-/// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo");
+/// assert_contains!(
+///     builder.error_or(42).unwrap_err().to_string(),
+///     "Internal error: foo"
+/// );
 /// ```
 #[derive(Debug, Default)]
 pub struct DataFusionErrorBuilder(Vec<DataFusionError>);
@@ -694,7 +730,10 @@ impl DataFusionErrorBuilder {
     /// # use datafusion_common::{assert_contains, DataFusionError};
     /// let mut builder = DataFusionError::builder();
     /// builder.add_error(DataFusionError::Internal("foo".to_owned()));
-    /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo");
+    /// assert_contains!(
+    ///     builder.error_or(42).unwrap_err().to_string(),
+    ///     "Internal error: foo"
+    /// );
     /// ```
     pub fn add_error(&mut self, error: DataFusionError) {
         self.0.push(error);
@@ -706,8 +745,11 @@ impl DataFusionErrorBuilder {
     /// ```
     /// # use datafusion_common::{assert_contains, DataFusionError};
     /// let builder = DataFusionError::builder()
-    ///   .with_error(DataFusionError::Internal("foo".to_owned()));
-    /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo");
+    ///     .with_error(DataFusionError::Internal("foo".to_owned()));
+    /// assert_contains!(
+    ///     builder.error_or(42).unwrap_err().to_string(),
+    ///     "Internal error: foo"
+    /// );
     /// ```
     pub fn with_error(mut self, error: DataFusionError) -> Self {
         self.0.push(error);
@@ -733,7 +775,7 @@ impl DataFusionErrorBuilder {
 macro_rules! unwrap_or_internal_err {
     ($Value: ident) => {
         $Value.ok_or_else(|| {
-            DataFusionError::Internal(format!(
+            $crate::DataFusionError::Internal(format!(
                 "{} should not be None",
                 stringify!($Value)
             ))
@@ -741,6 +783,116 @@ macro_rules! unwrap_or_internal_err {
     };
 }
 
+/// Assert a condition, returning `DataFusionError::Internal` on failure.
+///
+/// # Examples
+///
+/// ```text
+/// assert_or_internal_err!(predicate);
+/// assert_or_internal_err!(predicate, "human readable message");
+/// assert_or_internal_err!(predicate, format!("details: {}", value));
+/// ```
+#[macro_export]
+macro_rules! assert_or_internal_err {
+    ($cond:expr) => {
+        if !$cond {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {}",
+                stringify!($cond)
+            )));
+        }
+    };
+    ($cond:expr, $($arg:tt)+) => {
+        if !$cond {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {}: {}",
+                stringify!($cond),
+                format!($($arg)+)
+            )));
+        }
+    };
+}
+
+/// Assert equality, returning `DataFusionError::Internal` on failure.
+///
+/// # Examples
+///
+/// ```text
+/// assert_eq_or_internal_err!(actual, expected);
+/// assert_eq_or_internal_err!(left_expr, right_expr, "values must match");
+/// assert_eq_or_internal_err!(lhs, rhs, "metadata: {}", extra);
+/// ```
+#[macro_export]
+macro_rules! assert_eq_or_internal_err {
+    ($left:expr, $right:expr $(,)?) => {{
+        let left_val = &$left;
+        let right_val = &$right;
+        if left_val != right_val {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {} == {} (left: {:?}, right: {:?})",
+                stringify!($left),
+                stringify!($right),
+                left_val,
+                right_val
+            )));
+        }
+    }};
+    ($left:expr, $right:expr, $($arg:tt)+) => {{
+        let left_val = &$left;
+        let right_val = &$right;
+        if left_val != right_val {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {} == {} (left: {:?}, right: {:?}): {}",
+                stringify!($left),
+                stringify!($right),
+                left_val,
+                right_val,
+                format!($($arg)+)
+            )));
+        }
+    }};
+}
+
+/// Assert inequality, returning `DataFusionError::Internal` on failure.
+///
+/// # Examples
+///
+/// ```text
+/// assert_ne_or_internal_err!(left, right);
+/// assert_ne_or_internal_err!(lhs_expr, rhs_expr, "values must differ");
+/// assert_ne_or_internal_err!(a, b, "context {}", info);
+/// ```
+#[macro_export]
+macro_rules! assert_ne_or_internal_err {
+    ($left:expr, $right:expr $(,)?) => {{
+        let left_val = &$left;
+        let right_val = &$right;
+        if left_val == right_val {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {} != {} (left: {:?}, right: {:?})",
+                stringify!($left),
+                stringify!($right),
+                left_val,
+                right_val
+            )));
+        }
+    }};
+    ($left:expr, $right:expr, $($arg:tt)+) => {{
+        let left_val = &$left;
+        let right_val = &$right;
+        if left_val == right_val {
+            return Err($crate::DataFusionError::Internal(format!(
+                "Assertion failed: {} != {} (left: {:?}, right: {:?}): {}",
+                stringify!($left),
+                stringify!($right),
+                left_val,
+                right_val,
+                format!($($arg)+)
+            )));
+        }
+    }};
+}
+
 /// Add a macros for concise  DataFusionError::* errors declaration
 /// supports placeholders the same way as `format!`
 /// Examples:
@@ -751,84 +903,131 @@ macro_rules! unwrap_or_internal_err {
 ///     plan_err!("Error {val:?}")
 ///
 /// `NAME_ERR` -  macro name for wrapping Err(DataFusionError::*)
+/// `PREFIXED_NAME_ERR` - underscore-prefixed alias for NAME_ERR (e.g., _plan_err)
+/// (Needed to avoid compiler error when using macro in the same crate: `macros from the current crate cannot be referred to by absolute paths`)
 /// `NAME_DF_ERR` -  macro name for wrapping DataFusionError::*. Needed to keep backtrace opportunity
 /// in construction where DataFusionError::* used directly, like `map_err`, `ok_or_else`, etc
+/// `PREFIXED_NAME_DF_ERR` - underscore-prefixed alias for NAME_DF_ERR (e.g., _plan_datafusion_err).
+/// (Needed to avoid compiler error when using macro in the same crate: `macros from the current crate cannot be referred to by absolute paths`)
 macro_rules! make_error {
-    ($NAME_ERR:ident, $NAME_DF_ERR: ident, $ERR:ident) => { make_error!(@inner ($), $NAME_ERR, $NAME_DF_ERR, $ERR); };
-    (@inner ($d:tt), $NAME_ERR:ident, $NAME_DF_ERR:ident, $ERR:ident) => {
-        ::paste::paste!{
-            /// Macro wraps `$ERR` to add backtrace feature
-            #[macro_export]
-            macro_rules! $NAME_DF_ERR {
-                ($d($d args:expr),* $d(; diagnostic=$d DIAG:expr)?) => {{
-                    let err =$crate::DataFusionError::$ERR(
-                        ::std::format!(
-                            "{}{}",
-                            ::std::format!($d($d args),*),
-                            $crate::DataFusionError::get_back_trace(),
-                        ).into()
-                    );
-                    $d (
-                        let err = err.with_diagnostic($d DIAG);
-                    )?
-                    err
-                }
-            }
+    ($NAME_ERR:ident, $PREFIXED_NAME_ERR:ident, $NAME_DF_ERR:ident, $PREFIXED_NAME_DF_ERR:ident, $ERR:ident) => {
+        make_error!(@inner ($), $NAME_ERR, $PREFIXED_NAME_ERR, $NAME_DF_ERR, $PREFIXED_NAME_DF_ERR, $ERR);
+    };
+    (@inner ($d:tt), $NAME_ERR:ident, $PREFIXED_NAME_ERR:ident, $NAME_DF_ERR:ident, $PREFIXED_NAME_DF_ERR:ident, $ERR:ident) => {
+        /// Macro wraps `$ERR` to add backtrace feature
+        #[macro_export]
+        macro_rules! $NAME_DF_ERR {
+            ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{
+                let err = $crate::DataFusionError::$ERR(
+                    ::std::format!(
+                        "{}{}",
+                        ::std::format!($d($d args),*),
+                        $crate::DataFusionError::get_back_trace(),
+                    ).into()
+                );
+                $d (
+                    let err = err.with_diagnostic($d DIAG);
+                )?
+                err
+            }}
         }
 
-            /// Macro wraps Err(`$ERR`) to add backtrace feature
-            #[macro_export]
-            macro_rules! $NAME_ERR {
-                ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{
-                    let err = $crate::[<_ $NAME_DF_ERR>]!($d($d args),*);
-                    $d (
-                        let err = err.with_diagnostic($d DIAG);
-                    )?
-                    Err(err)
-
-                }}
-            }
-
-
-            // Note: Certain macros are used in this  crate, but not all.
-            // This macro generates a use or all of them in case they are needed
-            // so we allow unused code to avoid warnings when they are not used
-            #[doc(hidden)]
-            #[allow(unused)]
-            pub use $NAME_ERR as [<_ $NAME_ERR>];
-            #[doc(hidden)]
-            #[allow(unused)]
-            pub use $NAME_DF_ERR as [<_ $NAME_DF_ERR>];
+        /// Macro wraps Err(`$ERR`) to add backtrace feature
+        #[macro_export]
+        macro_rules! $NAME_ERR {
+            ($d($d args:expr),* $d(; diagnostic = $d DIAG:expr)?) => {{
+                let err = $crate::$PREFIXED_NAME_DF_ERR!($d($d args),*);
+                $d (
+                    let err = err.with_diagnostic($d DIAG);
+                )?
+                Err(err)
+            }}
         }
+
+        #[doc(hidden)]
+        pub use $NAME_ERR as $PREFIXED_NAME_ERR;
+        #[doc(hidden)]
+        pub use $NAME_DF_ERR as $PREFIXED_NAME_DF_ERR;
     };
 }
 
 // Exposes a macro to create `DataFusionError::Plan` with optional backtrace
-make_error!(plan_err, plan_datafusion_err, Plan);
+make_error!(
+    plan_err,
+    _plan_err,
+    plan_datafusion_err,
+    _plan_datafusion_err,
+    Plan
+);
 
 // Exposes a macro to create `DataFusionError::Internal` with optional backtrace
-make_error!(internal_err, internal_datafusion_err, Internal);
+make_error!(
+    internal_err,
+    _internal_err,
+    internal_datafusion_err,
+    _internal_datafusion_err,
+    Internal
+);
 
 // Exposes a macro to create `DataFusionError::NotImplemented` with optional backtrace
-make_error!(not_impl_err, not_impl_datafusion_err, NotImplemented);
+make_error!(
+    not_impl_err,
+    _not_impl_err,
+    not_impl_datafusion_err,
+    _not_impl_datafusion_err,
+    NotImplemented
+);
 
 // Exposes a macro to create `DataFusionError::Execution` with optional backtrace
-make_error!(exec_err, exec_datafusion_err, Execution);
+make_error!(
+    exec_err,
+    _exec_err,
+    exec_datafusion_err,
+    _exec_datafusion_err,
+    Execution
+);
 
 // Exposes a macro to create `DataFusionError::Configuration` with optional backtrace
-make_error!(config_err, config_datafusion_err, Configuration);
+make_error!(
+    config_err,
+    _config_err,
+    config_datafusion_err,
+    _config_datafusion_err,
+    Configuration
+);
 
 // Exposes a macro to create `DataFusionError::Substrait` with optional backtrace
-make_error!(substrait_err, substrait_datafusion_err, Substrait);
+make_error!(
+    substrait_err,
+    _substrait_err,
+    substrait_datafusion_err,
+    _substrait_datafusion_err,
+    Substrait
+);
 
 // Exposes a macro to create `DataFusionError::ResourcesExhausted` with optional backtrace
-make_error!(resources_err, resources_datafusion_err, ResourcesExhausted);
+make_error!(
+    resources_err,
+    _resources_err,
+    resources_datafusion_err,
+    _resources_datafusion_err,
+    ResourcesExhausted
+);
+
+// Exposes a macro to create `DataFusionError::Ffi` with optional backtrace
+make_error!(
+    ffi_err,
+    _ffi_err,
+    ffi_datafusion_err,
+    _ffi_datafusion_err,
+    Ffi
+);
 
 // Exposes a macro to create `DataFusionError::SQL` with optional backtrace
 #[macro_export]
 macro_rules! sql_datafusion_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{
-        let err = DataFusionError::SQL($ERR, Some(DataFusionError::get_back_trace()));
+        let err = $crate::DataFusionError::SQL(Box::new($ERR), Some($crate::DataFusionError::get_back_trace()));
         $(
             let err = err.with_diagnostic($DIAG);
         )?
@@ -840,7 +1039,7 @@ macro_rules! sql_datafusion_err {
 #[macro_export]
 macro_rules! sql_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{
-        let err = datafusion_common::sql_datafusion_err!($ERR);
+        let err = $crate::sql_datafusion_err!($ERR);
         $(
             let err = err.with_diagnostic($DIAG);
         )?
@@ -852,7 +1051,7 @@ macro_rules! sql_err {
 #[macro_export]
 macro_rules! arrow_datafusion_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{
-        let err = DataFusionError::ArrowError($ERR, Some(DataFusionError::get_back_trace()));
+        let err = $crate::DataFusionError::ArrowError(Box::new($ERR), Some($crate::DataFusionError::get_back_trace()));
         $(
             let err = err.with_diagnostic($DIAG);
         )?
@@ -865,7 +1064,7 @@ macro_rules! arrow_datafusion_err {
 macro_rules! arrow_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {
     {
-        let err = datafusion_common::arrow_datafusion_err!($ERR);
+        let err = $crate::arrow_datafusion_err!($ERR);
         $(
             let err = err.with_diagnostic($DIAG);
         )?
@@ -877,9 +1076,9 @@ macro_rules! arrow_err {
 #[macro_export]
 macro_rules! schema_datafusion_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{
-        let err = $crate::error::DataFusionError::SchemaError(
-            $ERR,
-            Box::new(Some($crate::error::DataFusionError::get_back_trace())),
+        let err = $crate::DataFusionError::SchemaError(
+            Box::new($ERR),
+            Box::new(Some($crate::DataFusionError::get_back_trace())),
         );
         $(
             let err = err.with_diagnostic($DIAG);
@@ -892,9 +1091,9 @@ macro_rules! schema_datafusion_err {
 #[macro_export]
 macro_rules! schema_err {
     ($ERR:expr $(; diagnostic = $DIAG:expr)?) => {{
-        let err = $crate::error::DataFusionError::SchemaError(
-            $ERR,
-            Box::new(Some($crate::error::DataFusionError::get_back_trace())),
+        let err = $crate::DataFusionError::SchemaError(
+            Box::new($ERR),
+            Box::new(Some($crate::DataFusionError::get_back_trace())),
         );
         $(
             let err = err.with_diagnostic($DIAG);
@@ -951,17 +1150,137 @@ pub fn add_possible_columns_to_diag(
 
 #[cfg(test)]
 mod test {
+    use super::*;
+
+    use std::mem::size_of;
     use std::sync::Arc;
 
-    use crate::error::{DataFusionError, GenericError};
     use arrow::error::ArrowError;
+    use insta::assert_snapshot;
+
+    fn ok_result() -> Result<()> {
+        Ok(())
+    }
+
+    #[test]
+    fn test_assert_eq_or_internal_err_passes() -> Result<()> {
+        assert_eq_or_internal_err!(1, 1);
+        ok_result()
+    }
+
+    #[test]
+    fn test_assert_eq_or_internal_err_fails() {
+        fn check() -> Result<()> {
+            assert_eq_or_internal_err!(1, 2, "expected equality");
+            ok_result()
+        }
+
+        let err = check().unwrap_err();
+        assert_snapshot!(
+            err.to_string(),
+            @r"
+        Internal error: Assertion failed: 1 == 2 (left: 1, right: 2): expected equality.
+        This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues
+        "
+        );
+    }
+
+    #[test]
+    fn test_assert_ne_or_internal_err_passes() -> Result<()> {
+        assert_ne_or_internal_err!(1, 2);
+        ok_result()
+    }
+
+    #[test]
+    fn test_assert_ne_or_internal_err_fails() {
+        fn check() -> Result<()> {
+            assert_ne_or_internal_err!(3, 3, "values must differ");
+            ok_result()
+        }
+
+        let err = check().unwrap_err();
+        assert_snapshot!(
+            err.to_string(),
+            @r"
+        Internal error: Assertion failed: 3 != 3 (left: 3, right: 3): values must differ.
+        This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues
+        "
+        );
+    }
+
+    #[test]
+    fn test_assert_or_internal_err_passes() -> Result<()> {
+        assert_or_internal_err!(true);
+        assert_or_internal_err!(true, "message");
+        ok_result()
+    }
+
+    #[test]
+    fn test_assert_or_internal_err_fails_default() {
+        fn check() -> Result<()> {
+            assert_or_internal_err!(false);
+            ok_result()
+        }
+
+        let err = check().unwrap_err();
+        assert_snapshot!(
+            err.to_string(),
+            @r"
+        Internal error: Assertion failed: false.
+        This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues
+        "
+        );
+    }
+
+    #[test]
+    fn test_assert_or_internal_err_fails_with_message() {
+        fn check() -> Result<()> {
+            assert_or_internal_err!(false, "custom message");
+            ok_result()
+        }
+
+        let err = check().unwrap_err();
+        assert_snapshot!(
+            err.to_string(),
+            @r"
+        Internal error: Assertion failed: false: custom message.
+        This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues
+        "
+        );
+    }
+
+    #[test]
+    fn test_assert_or_internal_err_with_format_arguments() {
+        fn check() -> Result<()> {
+            assert_or_internal_err!(false, "custom {}", 42);
+            ok_result()
+        }
+
+        let err = check().unwrap_err();
+        assert_snapshot!(
+            err.to_string(),
+            @r"
+        Internal error: Assertion failed: false: custom 42.
+        This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues
+        "
+        );
+    }
+
+    #[test]
+    fn test_error_size() {
+        // Since Errors influence the size of Result which influence the size of the stack
+        // please don't allow this to grow larger
+        assert_eq!(size_of::<SchemaError>(), 40);
+        assert_eq!(size_of::<DataFusionError>(), 40);
+    }
 
     #[test]
     fn datafusion_error_to_arrow() {
         let res = return_arrow_error().unwrap_err();
-        assert!(res
-            .to_string()
-            .starts_with("External error: Error during planning: foo"));
+        assert!(
+            res.to_string()
+                .starts_with("External error: Error during planning: foo")
+        );
     }
 
     #[test]
@@ -973,7 +1292,7 @@ mod test {
     // To pass the test the environment variable RUST_BACKTRACE should be set to 1 to enforce backtrace
     #[cfg(feature = "backtrace")]
     #[test]
-    #[allow(clippy::unnecessary_literal_unwrap)]
+    #[expect(clippy::unnecessary_literal_unwrap)]
     fn test_enabled_backtrace() {
         match std::env::var("RUST_BACKTRACE") {
             Ok(val) if val == "1" => {}
@@ -990,17 +1309,17 @@ mod test {
                 .unwrap(),
             &"Error during planning: Err"
         );
-        assert!(!err
-            .split(DataFusionError::BACK_TRACE_SEP)
-            .collect::<Vec<&str>>()
-            .get(1)
-            .unwrap()
-            .is_empty());
+        assert!(
+            !err.split(DataFusionError::BACK_TRACE_SEP)
+                .collect::<Vec<&str>>()
+                .get(1)
+                .unwrap()
+                .is_empty()
+        );
     }
 
     #[cfg(not(feature = "backtrace"))]
     #[test]
-    #[allow(clippy::unnecessary_literal_unwrap)]
     fn test_disabled_backtrace() {
         let res: Result<(), DataFusionError> = plan_err!("Err");
         let res = res.unwrap_err().to_string();
@@ -1020,8 +1339,8 @@ mod test {
 
         do_root_test(
             DataFusionError::ArrowError(
-                ArrowError::ExternalError(Box::new(DataFusionError::ResourcesExhausted(
-                    "foo".to_string(),
+                Box::new(ArrowError::ExternalError(Box::new(
+                    DataFusionError::ResourcesExhausted("foo".to_string()),
                 ))),
                 None,
             ),
@@ -1044,9 +1363,11 @@ mod test {
 
         do_root_test(
             DataFusionError::ArrowError(
-                ArrowError::ExternalError(Box::new(ArrowError::ExternalError(Box::new(
-                    DataFusionError::ResourcesExhausted("foo".to_string()),
-                )))),
+                Box::new(ArrowError::ExternalError(Box::new(
+                    ArrowError::ExternalError(Box::new(
+                        DataFusionError::ResourcesExhausted("foo".to_string()),
+                    )),
+                ))),
                 None,
             ),
             DataFusionError::ResourcesExhausted("foo".to_string()),
@@ -1068,7 +1389,6 @@ mod test {
     }
 
     #[test]
-    #[allow(clippy::unnecessary_literal_unwrap)]
     fn test_make_error_parse_input() {
         let res: Result<(), DataFusionError> = plan_err!("Err");
         let res = res.unwrap_err();
@@ -1120,7 +1440,7 @@ mod test {
         );
 
         // assert wrapping other Error
-        let generic_error: GenericError = Box::new(std::io::Error::other("io error"));
+        let generic_error: GenericError = Box::new(io::Error::other("io error"));
         let datafusion_error: DataFusionError = generic_error.into();
         println!("{}", datafusion_error.strip_backtrace());
         assert_eq!(
@@ -1131,15 +1451,17 @@ mod test {
 
     #[test]
     fn external_error_no_recursive() {
-        let generic_error_1: GenericError = Box::new(std::io::Error::other("io error"));
+        let generic_error_1: GenericError = Box::new(io::Error::other("io error"));
         let external_error_1: DataFusionError = generic_error_1.into();
         let generic_error_2: GenericError = Box::new(external_error_1);
         let external_error_2: DataFusionError = generic_error_2.into();
 
         println!("{external_error_2}");
-        assert!(external_error_2
-            .to_string()
-            .starts_with("External error: io error"));
+        assert!(
+            external_error_2
+                .to_string()
+                .starts_with("External error: io error")
+        );
     }
 
     /// Model what happens when implementing SendableRecordBatchStream:
@@ -1151,7 +1473,7 @@ mod test {
 
     /// Model what happens when using arrow kernels in DataFusion
     /// code: need to turn an ArrowError into a DataFusionError
-    fn return_datafusion_error() -> crate::error::Result<()> {
+    fn return_datafusion_error() -> Result<()> {
         // Expect the '?' to work
         Err(ArrowError::SchemaError("bar".to_string()).into())
     }
diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs
index 943288af91642..4e6f74a4448af 100644
--- a/datafusion/common/src/file_options/csv_writer.rs
+++ b/datafusion/common/src/file_options/csv_writer.rs
@@ -31,6 +31,8 @@ pub struct CsvWriterOptions {
     /// Compression to apply after ArrowWriter serializes RecordBatches.
     /// This compression is applied by DataFusion not the ArrowWriter itself.
     pub compression: CompressionTypeVariant,
+    /// Compression level for the output file.
+    pub compression_level: Option<u32>,
 }
 
 impl CsvWriterOptions {
@@ -41,6 +43,20 @@ impl CsvWriterOptions {
         Self {
             writer_options,
             compression,
+            compression_level: None,
+        }
+    }
+
+    /// Create a new `CsvWriterOptions` with the specified compression level.
+    pub fn new_with_level(
+        writer_options: WriterBuilder,
+        compression: CompressionTypeVariant,
+        compression_level: u32,
+    ) -> Self {
+        Self {
+            writer_options,
+            compression,
+            compression_level: Some(compression_level),
         }
     }
 }
@@ -81,6 +97,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
         Ok(CsvWriterOptions {
             writer_options: builder,
             compression: value.compression,
+            compression_level: value.compression_level,
         })
     }
 }
diff --git a/datafusion/common/src/file_options/json_writer.rs b/datafusion/common/src/file_options/json_writer.rs
index 750d2972329bb..a537192c8128a 100644
--- a/datafusion/common/src/file_options/json_writer.rs
+++ b/datafusion/common/src/file_options/json_writer.rs
@@ -27,11 +27,26 @@ use crate::{
 #[derive(Clone, Debug)]
 pub struct JsonWriterOptions {
     pub compression: CompressionTypeVariant,
+    pub compression_level: Option<u32>,
 }
 
 impl JsonWriterOptions {
     pub fn new(compression: CompressionTypeVariant) -> Self {
-        Self { compression }
+        Self {
+            compression,
+            compression_level: None,
+        }
+    }
+
+    /// Create a new `JsonWriterOptions` with the specified compression and level.
+    pub fn new_with_level(
+        compression: CompressionTypeVariant,
+        compression_level: u32,
+    ) -> Self {
+        Self {
+            compression,
+            compression_level: Some(compression_level),
+        }
     }
 }
 
@@ -41,6 +56,7 @@ impl TryFrom<&JsonOptions> for JsonWriterOptions {
     fn try_from(value: &JsonOptions) -> Result<Self> {
         Ok(JsonWriterOptions {
             compression: value.compression,
+            compression_level: value.compression_level,
         })
     }
 }
diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs
index 02667e0165717..5d2abd23172ed 100644
--- a/datafusion/common/src/file_options/mod.rs
+++ b/datafusion/common/src/file_options/mod.rs
@@ -31,10 +31,10 @@ mod tests {
     use std::collections::HashMap;
 
     use crate::{
+        Result,
         config::{ConfigFileType, TableOptions},
         file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions},
         parsers::CompressionTypeVariant,
-        Result,
     };
 
     use parquet::{
@@ -84,7 +84,7 @@ mod tests {
         .build();
 
         // Verify the expected options propagated down to parquet crate WriterProperties struct
-        assert_eq!(properties.max_row_group_size(), 123);
+        assert_eq!(properties.max_row_group_row_count(), Some(123));
         assert_eq!(properties.data_page_size_limit(), 123);
         assert_eq!(properties.write_batch_size(), 123);
         assert_eq!(properties.writer_version(), WriterVersion::PARQUET_2_0);
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index 07e763f0ee6f3..a7a1fc6d0bb66 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -17,25 +17,23 @@
 
 //! Options related to how parquet files should be written
 
-use base64::Engine;
 use std::sync::Arc;
 
 use crate::{
+    _internal_datafusion_err, DataFusionError, Result,
     config::{ParquetOptions, TableParquetOptions},
-    DataFusionError, Result, _internal_datafusion_err,
 };
 
 use arrow::datatypes::Schema;
-// TODO: handle once deprecated
-#[allow(deprecated)]
+use parquet::arrow::encode_arrow_schema;
 use parquet::{
     arrow::ARROW_SCHEMA_META_KEY,
     basic::{BrotliLevel, GzipLevel, ZstdLevel},
     file::{
         metadata::KeyValue,
         properties::{
-            EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
-            DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
+            DEFAULT_STATISTICS_ENABLED, EnabledStatistics, WriterProperties,
+            WriterPropertiesBuilder,
         },
     },
     schema::types::ColumnPath,
@@ -89,12 +87,15 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
     /// Convert the session's [`TableParquetOptions`] into a single write action's [`WriterPropertiesBuilder`].
     ///
     /// The returned [`WriterPropertiesBuilder`] includes customizations applicable per column.
+    /// Note that any encryption options are ignored as building the `FileEncryptionProperties`
+    /// might require other inputs besides the [`TableParquetOptions`].
     fn try_from(table_parquet_options: &TableParquetOptions) -> Result<Self> {
         // Table options include kv_metadata and col-specific options
         let TableParquetOptions {
             global,
             column_specific_options,
             key_value_metadata,
+            crypto: _,
         } = table_parquet_options;
 
         let mut builder = global.into_writer_properties_builder()?;
@@ -103,7 +104,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
         if !global.skip_arrow_metadata
             && !key_value_metadata.contains_key(ARROW_SCHEMA_META_KEY)
         {
-            return Err(_internal_datafusion_err!("arrow schema was not added to the kv_metadata, even though it is required by configuration settings"));
+            return Err(_internal_datafusion_err!(
+                "arrow schema was not added to the kv_metadata, even though it is required by configuration settings"
+            ));
         }
 
         // add kv_meta, if any
@@ -157,47 +160,12 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                 builder =
                     builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }
-
-            // max_statistics_size is deprecated, currently it is not being used
-            // TODO: remove once deprecated
-            #[allow(deprecated)]
-            if let Some(max_statistics_size) = options.max_statistics_size {
-                builder = {
-                    #[allow(deprecated)]
-                    builder.set_column_max_statistics_size(path, max_statistics_size)
-                }
-            }
         }
 
         Ok(builder)
     }
 }
 
-/// Encodes the Arrow schema into the IPC format, and base64 encodes it
-///
-/// TODO: use extern parquet's private method, once publicly available.
-/// Refer to <https://github.com/apache/arrow-rs/pull/6916>
-fn encode_arrow_schema(schema: &Arc<Schema>) -> String {
-    let options = arrow_ipc::writer::IpcWriteOptions::default();
-    let mut dictionary_tracker = arrow_ipc::writer::DictionaryTracker::new(true);
-    let data_gen = arrow_ipc::writer::IpcDataGenerator::default();
-    let mut serialized_schema = data_gen.schema_to_bytes_with_dictionary_tracker(
-        schema,
-        &mut dictionary_tracker,
-        &options,
-    );
-
-    // manually prepending the length to the schema as arrow uses the legacy IPC format
-    // TODO: change after addressing ARROW-9777
-    let schema_len = serialized_schema.ipc_message.len();
-    let mut len_prefix_schema = Vec::with_capacity(schema_len + 8);
-    len_prefix_schema.append(&mut vec![255u8, 255, 255, 255]);
-    len_prefix_schema.append((schema_len as u32).to_le_bytes().to_vec().as_mut());
-    len_prefix_schema.append(&mut serialized_schema.ipc_message);
-
-    base64::prelude::BASE64_STANDARD.encode(&len_prefix_schema)
-}
-
 impl ParquetOptions {
     /// Convert the global session options, [`ParquetOptions`], into a single write action's [`WriterPropertiesBuilder`].
     ///
@@ -206,7 +174,6 @@ impl ParquetOptions {
     ///
     /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
     pub fn into_writer_properties_builder(&self) -> Result<WriterPropertiesBuilder> {
-        #[allow(deprecated)]
         let ParquetOptions {
             data_pagesize_limit,
             write_batch_size,
@@ -215,7 +182,6 @@ impl ParquetOptions {
             dictionary_enabled,
             dictionary_page_size_limit,
             statistics_enabled,
-            max_statistics_size,
             max_row_group_size,
             created_by,
             column_index_truncate_length,
@@ -233,6 +199,7 @@ impl ParquetOptions {
             metadata_size_hint: _,
             pushdown_filters: _,
             reorder_filters: _,
+            force_filter_selections: _, // not used for writer props
             allow_single_file_parallelism: _,
             maximum_parallel_row_group_writers: _,
             maximum_buffered_record_batches_per_stream: _,
@@ -241,12 +208,13 @@ impl ParquetOptions {
             binary_as_string: _, // not used for writer props
             coerce_int96: _,     // not used for writer props
             skip_arrow_metadata: _,
+            max_predicate_cache_size: _,
         } = self;
 
         let mut builder = WriterProperties::builder()
             .set_data_page_size_limit(*data_pagesize_limit)
             .set_write_batch_size(*write_batch_size)
-            .set_writer_version(parse_version_string(writer_version.as_str())?)
+            .set_writer_version((*writer_version).into())
             .set_dictionary_page_size_limit(*dictionary_page_size_limit)
             .set_statistics_enabled(
                 statistics_enabled
@@ -254,20 +222,13 @@ impl ParquetOptions {
                     .and_then(|s| parse_statistics_string(s).ok())
                     .unwrap_or(DEFAULT_STATISTICS_ENABLED),
             )
-            .set_max_row_group_size(*max_row_group_size)
+            .set_max_row_group_row_count(Some(*max_row_group_size))
             .set_created_by(created_by.clone())
             .set_column_index_truncate_length(*column_index_truncate_length)
             .set_statistics_truncate_length(*statistics_truncate_length)
             .set_data_page_row_count_limit(*data_page_row_count_limit)
             .set_bloom_filter_enabled(*bloom_filter_on_write);
 
-        builder = {
-            #[allow(deprecated)]
-            builder.set_max_statistics_size(
-                max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
-            )
-        };
-
         if let Some(bloom_filter_fpp) = bloom_filter_fpp {
             builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
         };
@@ -300,7 +261,7 @@ pub(crate) fn parse_encoding_string(
         "plain" => Ok(parquet::basic::Encoding::PLAIN),
         "plain_dictionary" => Ok(parquet::basic::Encoding::PLAIN_DICTIONARY),
         "rle" => Ok(parquet::basic::Encoding::RLE),
-        #[allow(deprecated)]
+        #[expect(deprecated)]
         "bit_packed" => Ok(parquet::basic::Encoding::BIT_PACKED),
         "delta_binary_packed" => Ok(parquet::basic::Encoding::DELTA_BINARY_PACKED),
         "delta_length_byte_array" => {
@@ -380,10 +341,6 @@ pub fn parse_compression_string(
                 level,
             )?))
         }
-        "lzo" => {
-            check_level_is_none(codec, &level)?;
-            Ok(parquet::basic::Compression::LZO)
-        }
         "brotli" => {
             let level = require_level(codec, level)?;
             Ok(parquet::basic::Compression::BROTLI(BrotliLevel::try_new(
@@ -407,19 +364,7 @@ pub fn parse_compression_string(
         _ => Err(DataFusionError::Configuration(format!(
             "Unknown or unsupported parquet compression: \
         {str_setting}. Valid values are: uncompressed, snappy, gzip(level), \
-        lzo, brotli(level), lz4, zstd(level), and lz4_raw."
-        ))),
-    }
-}
-
-pub(crate) fn parse_version_string(str_setting: &str) -> Result<WriterVersion> {
-    let str_setting_lower: &str = &str_setting.to_lowercase();
-    match str_setting_lower {
-        "1.0" => Ok(WriterVersion::PARQUET_1_0),
-        "2.0" => Ok(WriterVersion::PARQUET_2_0),
-        _ => Err(DataFusionError::Configuration(format!(
-            "Unknown or unsupported parquet writer version {str_setting} \
-            valid options are 1.0 and 2.0"
+        brotli(level), lz4, zstd(level), and lz4_raw."
         ))),
     }
 }
@@ -440,31 +385,28 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
 #[cfg(feature = "parquet")]
 #[cfg(test)]
 mod tests {
-    use parquet::{
-        basic::Compression,
-        file::properties::{
-            BloomFilterProperties, EnabledStatistics, DEFAULT_BLOOM_FILTER_FPP,
-            DEFAULT_BLOOM_FILTER_NDV,
-        },
+    use super::*;
+    #[cfg(feature = "parquet_encryption")]
+    use crate::config::ConfigFileEncryptionProperties;
+    use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
+    use crate::parquet_config::DFParquetWriterVersion;
+    use parquet::basic::Compression;
+    use parquet::file::properties::{
+        BloomFilterProperties, DEFAULT_BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_NDV,
+        DEFAULT_MAX_ROW_GROUP_ROW_COUNT, EnabledStatistics,
     };
     use std::collections::HashMap;
 
-    use crate::config::{ParquetColumnOptions, ParquetOptions};
-
-    use super::*;
-
     const COL_NAME: &str = "configured";
 
     /// Take the column defaults provided in [`ParquetOptions`], and generate a non-default col config.
     fn column_options_with_non_defaults(
         src_col_defaults: &ParquetOptions,
     ) -> ParquetColumnOptions {
-        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
             statistics_enabled: Some("none".into()),
-            max_statistics_size: Some(72),
             encoding: Some("RLE".into()),
             bloom_filter_enabled: Some(true),
             bloom_filter_fpp: Some(0.72),
@@ -474,22 +416,21 @@ mod tests {
 
     fn parquet_options_with_non_defaults() -> ParquetOptions {
         let defaults = ParquetOptions::default();
-        let writer_version = if defaults.writer_version.eq("1.0") {
-            "2.0"
+        let writer_version = if defaults.writer_version.eq(&DFParquetWriterVersion::V1_0)
+        {
+            DFParquetWriterVersion::V2_0
         } else {
-            "1.0"
+            DFParquetWriterVersion::V1_0
         };
 
-        #[allow(deprecated)] // max_statistics_size
         ParquetOptions {
             data_pagesize_limit: 42,
             write_batch_size: 42,
-            writer_version: writer_version.into(),
+            writer_version,
             compression: Some("zstd(22)".into()),
             dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
             dictionary_page_size_limit: 42,
             statistics_enabled: Some("chunk".into()),
-            max_statistics_size: Some(42),
             max_row_group_size: 42,
             created_by: "wordy".into(),
             column_index_truncate_length: Some(42),
@@ -507,6 +448,7 @@ mod tests {
             metadata_size_hint: defaults.metadata_size_hint,
             pushdown_filters: defaults.pushdown_filters,
             reorder_filters: defaults.reorder_filters,
+            force_filter_selections: defaults.force_filter_selections,
             allow_single_file_parallelism: defaults.allow_single_file_parallelism,
             maximum_parallel_row_group_writers: defaults
                 .maximum_parallel_row_group_writers,
@@ -517,6 +459,7 @@ mod tests {
             binary_as_string: defaults.binary_as_string,
             skip_arrow_metadata: defaults.skip_arrow_metadata,
             coerce_int96: None,
+            max_predicate_cache_size: defaults.max_predicate_cache_size,
         }
     }
 
@@ -526,7 +469,6 @@ mod tests {
     ) -> ParquetColumnOptions {
         let bloom_filter_default_props = props.bloom_filter_properties(&col);
 
-        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
             encoding: props.encoding(&col).map(|s| s.to_string()),
@@ -547,7 +489,6 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
-            max_statistics_size: Some(props.max_statistics_size(&col)),
         }
     }
 
@@ -580,15 +521,24 @@ mod tests {
             HashMap::from([(COL_NAME.into(), configured_col_props)])
         };
 
-        #[allow(deprecated)] // max_statistics_size
+        #[cfg(feature = "parquet_encryption")]
+        let fep = props
+            .file_encryption_properties()
+            .map(ConfigFileEncryptionProperties::from);
+
+        #[cfg(not(feature = "parquet_encryption"))]
+        let fep = None;
+
         TableParquetOptions {
             global: ParquetOptions {
                 // global options
                 data_pagesize_limit: props.dictionary_page_size_limit(),
                 write_batch_size: props.write_batch_size(),
-                writer_version: format!("{}.0", props.writer_version().as_num()),
+                writer_version: props.writer_version().into(),
                 dictionary_page_size_limit: props.dictionary_page_size_limit(),
-                max_row_group_size: props.max_row_group_size(),
+                max_row_group_size: props
+                    .max_row_group_row_count()
+                    .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
                 created_by: props.created_by().to_string(),
                 column_index_truncate_length: props.column_index_truncate_length(),
                 statistics_truncate_length: props.statistics_truncate_length(),
@@ -599,7 +549,6 @@ mod tests {
                 compression: default_col_props.compression,
                 dictionary_enabled: default_col_props.dictionary_enabled,
                 statistics_enabled: default_col_props.statistics_enabled,
-                max_statistics_size: default_col_props.max_statistics_size,
                 bloom_filter_on_write: default_col_props
                     .bloom_filter_enabled
                     .unwrap_or_default(),
@@ -613,6 +562,7 @@ mod tests {
                 metadata_size_hint: global_options_defaults.metadata_size_hint,
                 pushdown_filters: global_options_defaults.pushdown_filters,
                 reorder_filters: global_options_defaults.reorder_filters,
+                force_filter_selections: global_options_defaults.force_filter_selections,
                 allow_single_file_parallelism: global_options_defaults
                     .allow_single_file_parallelism,
                 maximum_parallel_row_group_writers: global_options_defaults
@@ -620,6 +570,8 @@ mod tests {
                 maximum_buffered_record_batches_per_stream: global_options_defaults
                     .maximum_buffered_record_batches_per_stream,
                 bloom_filter_on_read: global_options_defaults.bloom_filter_on_read,
+                max_predicate_cache_size: global_options_defaults
+                    .max_predicate_cache_size,
                 schema_force_view_types: global_options_defaults.schema_force_view_types,
                 binary_as_string: global_options_defaults.binary_as_string,
                 skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,
@@ -627,6 +579,12 @@ mod tests {
             },
             column_specific_options,
             key_value_metadata,
+            crypto: ParquetEncryptionOptions {
+                file_encryption: fep,
+                file_decryption: None,
+                factory_id: None,
+                factory_options: Default::default(),
+            },
         }
     }
 
@@ -681,6 +639,7 @@ mod tests {
             )]
             .into(),
             key_value_metadata: [(key, value)].into(),
+            crypto: Default::default(),
         };
 
         let writer_props = WriterPropertiesBuilder::try_from(&table_parquet_opts)
@@ -701,8 +660,7 @@ mod tests {
         let mut default_table_writer_opts = TableParquetOptions::default();
         let default_parquet_opts = ParquetOptions::default();
         assert_eq!(
-            default_table_writer_opts.global,
-            default_parquet_opts,
+            default_table_writer_opts.global, default_parquet_opts,
             "should have matching defaults for TableParquetOptions.global and ParquetOptions",
         );
 
@@ -726,7 +684,9 @@ mod tests {
             "should have different created_by sources",
         );
         assert!(
-            default_writer_props.created_by().starts_with("parquet-rs version"),
+            default_writer_props
+                .created_by()
+                .starts_with("parquet-rs version"),
             "should indicate that writer_props defaults came from the extern parquet crate",
         );
         assert!(
@@ -760,8 +720,7 @@ mod tests {
         from_extern_parquet.global.skip_arrow_metadata = true;
 
         assert_eq!(
-            default_table_writer_opts,
-            from_extern_parquet,
+            default_table_writer_opts, from_extern_parquet,
             "the default writer_props should have the same configuration as the session's default TableParquetOptions",
         );
     }
diff --git a/datafusion/common/src/format.rs b/datafusion/common/src/format.rs
index a4ebd17539996..a505bd0e1c74e 100644
--- a/datafusion/common/src/format.rs
+++ b/datafusion/common/src/format.rs
@@ -15,9 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::fmt::{self, Display};
+use std::str::FromStr;
+
 use arrow::compute::CastOptions;
 use arrow::util::display::{DurationFormat, FormatOptions};
 
+use crate::config::{ConfigField, Visit};
+use crate::error::{DataFusionError, Result};
+
 /// The default [`FormatOptions`] to use within DataFusion
 /// Also see [`crate::config::FormatOptions`]
 pub const DEFAULT_FORMAT_OPTIONS: FormatOptions<'static> =
@@ -28,3 +34,219 @@ pub const DEFAULT_CAST_OPTIONS: CastOptions<'static> = CastOptions {
     safe: false,
     format_options: DEFAULT_FORMAT_OPTIONS,
 };
+
+/// Output formats for controlling for Explain plans
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum ExplainFormat {
+    /// Indent mode
+    ///
+    /// Example:
+    /// ```text
+    /// > explain format indent select x from values (1) t(x);
+    /// +---------------+-----------------------------------------------------+
+    /// | plan_type     | plan                                                |
+    /// +---------------+-----------------------------------------------------+
+    /// | logical_plan  | SubqueryAlias: t                                    |
+    /// |               |   Projection: column1 AS x                          |
+    /// |               |     Values: (Int64(1))                              |
+    /// | physical_plan | ProjectionExec: expr=[column1@0 as x]               |
+    /// |               |   DataSourceExec: partitions=1, partition_sizes=[1] |
+    /// |               |                                                     |
+    /// +---------------+-----------------------------------------------------+
+    /// ```
+    Indent,
+    /// Tree mode
+    ///
+    /// Example:
+    /// ```text
+    /// > explain format tree select x from values (1) t(x);
+    /// +---------------+-------------------------------+
+    /// | plan_type     | plan                          |
+    /// +---------------+-------------------------------+
+    /// | physical_plan | ┌───────────────────────────┐ |
+    /// |               | │       ProjectionExec      │ |
+    /// |               | │    --------------------   │ |
+    /// |               | │        x: column1@0       │ |
+    /// |               | └─────────────┬─────────────┘ |
+    /// |               | ┌─────────────┴─────────────┐ |
+    /// |               | │       DataSourceExec      │ |
+    /// |               | │    --------------------   │ |
+    /// |               | │         bytes: 128        │ |
+    /// |               | │       format: memory      │ |
+    /// |               | │          rows: 1          │ |
+    /// |               | └───────────────────────────┘ |
+    /// |               |                               |
+    /// +---------------+-------------------------------+
+    /// ```
+    Tree,
+    /// Postgres Json mode
+    ///
+    /// A displayable structure that produces plan in postgresql JSON format.
+    ///
+    /// Users can use this format to visualize the plan in existing plan
+    /// visualization tools, for example [dalibo](https://explain.dalibo.com/)
+    ///
+    /// Example:
+    /// ```text
+    /// > explain format pgjson select x from values (1) t(x);
+    /// +--------------+--------------------------------------+
+    /// | plan_type    | plan                                 |
+    /// +--------------+--------------------------------------+
+    /// | logical_plan | [                                    |
+    /// |              |   {                                  |
+    /// |              |     "Plan": {                        |
+    /// |              |       "Alias": "t",                  |
+    /// |              |       "Node Type": "Subquery",       |
+    /// |              |       "Output": [                    |
+    /// |              |         "x"                          |
+    /// |              |       ],                             |
+    /// |              |       "Plans": [                     |
+    /// |              |         {                            |
+    /// |              |           "Expressions": [           |
+    /// |              |             "column1 AS x"           |
+    /// |              |           ],                         |
+    /// |              |           "Node Type": "Projection", |
+    /// |              |           "Output": [                |
+    /// |              |             "x"                      |
+    /// |              |           ],                         |
+    /// |              |           "Plans": [                 |
+    /// |              |             {                        |
+    /// |              |               "Node Type": "Values", |
+    /// |              |               "Output": [            |
+    /// |              |                 "column1"            |
+    /// |              |               ],                     |
+    /// |              |               "Plans": [],           |
+    /// |              |               "Values": "(Int64(1))" |
+    /// |              |             }                        |
+    /// |              |           ]                          |
+    /// |              |         }                            |
+    /// |              |       ]                              |
+    /// |              |     }                                |
+    /// |              |   }                                  |
+    /// |              | ]                                    |
+    /// +--------------+--------------------------------------+
+    /// ```
+    PostgresJSON,
+    /// Graphviz mode
+    ///
+    /// Example:
+    /// ```text
+    /// > explain format graphviz select x from values (1) t(x);
+    /// +--------------+------------------------------------------------------------------------+
+    /// | plan_type    | plan                                                                   |
+    /// +--------------+------------------------------------------------------------------------+
+    /// | logical_plan |                                                                        |
+    /// |              | // Begin DataFusion GraphViz Plan,                                     |
+    /// |              | // display it online here: https://dreampuf.github.io/GraphvizOnline   |
+    /// |              |                                                                        |
+    /// |              | digraph {                                                              |
+    /// |              |   subgraph cluster_1                                                   |
+    /// |              |   {                                                                    |
+    /// |              |     graph[label="LogicalPlan"]                                         |
+    /// |              |     2[shape=box label="SubqueryAlias: t"]                              |
+    /// |              |     3[shape=box label="Projection: column1 AS x"]                      |
+    /// |              |     2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]                |
+    /// |              |     4[shape=box label="Values: (Int64(1))"]                            |
+    /// |              |     3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]                |
+    /// |              |   }                                                                    |
+    /// |              |   subgraph cluster_5                                                   |
+    /// |              |   {                                                                    |
+    /// |              |     graph[label="Detailed LogicalPlan"]                                |
+    /// |              |     6[shape=box label="SubqueryAlias: t\nSchema: [x:Int64;N]"]         |
+    /// |              |     7[shape=box label="Projection: column1 AS x\nSchema: [x:Int64;N]"] |
+    /// |              |     6 -> 7 [arrowhead=none, arrowtail=normal, dir=back]                |
+    /// |              |     8[shape=box label="Values: (Int64(1))\nSchema: [column1:Int64;N]"] |
+    /// |              |     7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]                |
+    /// |              |   }                                                                    |
+    /// |              | }                                                                      |
+    /// |              | // End DataFusion GraphViz Plan                                        |
+    /// |              |                                                                        |
+    /// +--------------+------------------------------------------------------------------------+
+    /// ```
+    Graphviz,
+}
+
+/// Implement  parsing strings to `ExplainFormat`
+impl FromStr for ExplainFormat {
+    type Err = DataFusionError;
+
+    fn from_str(format: &str) -> Result<Self, Self::Err> {
+        match format.to_lowercase().as_str() {
+            "indent" => Ok(ExplainFormat::Indent),
+            "tree" => Ok(ExplainFormat::Tree),
+            "pgjson" => Ok(ExplainFormat::PostgresJSON),
+            "graphviz" => Ok(ExplainFormat::Graphviz),
+            _ => Err(DataFusionError::Configuration(format!(
+                "Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. Got '{format}'"
+            ))),
+        }
+    }
+}
+
+impl Display for ExplainFormat {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            ExplainFormat::Indent => "indent",
+            ExplainFormat::Tree => "tree",
+            ExplainFormat::PostgresJSON => "pgjson",
+            ExplainFormat::Graphviz => "graphviz",
+        };
+        write!(f, "{s}")
+    }
+}
+
+impl ConfigField for ExplainFormat {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = ExplainFormat::from_str(value)?;
+        Ok(())
+    }
+}
+
+/// Verbosity levels controlling how `EXPLAIN ANALYZE` renders metrics
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ExplainAnalyzeLevel {
+    /// Show a compact view containing high-level metrics
+    Summary,
+    /// Show a developer-focused view with per-operator details
+    Dev,
+    // When adding new enum, update the error message in `from_str()` accordingly.
+}
+
+impl FromStr for ExplainAnalyzeLevel {
+    type Err = DataFusionError;
+
+    fn from_str(level: &str) -> Result<Self, Self::Err> {
+        match level.to_lowercase().as_str() {
+            "summary" => Ok(ExplainAnalyzeLevel::Summary),
+            "dev" => Ok(ExplainAnalyzeLevel::Dev),
+            other => Err(DataFusionError::Configuration(format!(
+                "Invalid explain analyze level. Expected 'summary' or 'dev'. Got '{other}'"
+            ))),
+        }
+    }
+}
+
+impl Display for ExplainAnalyzeLevel {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            ExplainAnalyzeLevel::Summary => "summary",
+            ExplainAnalyzeLevel::Dev => "dev",
+        };
+        write!(f, "{s}")
+    }
+}
+
+impl ConfigField for ExplainAnalyzeLevel {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = ExplainAnalyzeLevel::from_str(value)?;
+        Ok(())
+    }
+}
diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs
index 77e00d6dcda23..63962998ad18b 100644
--- a/datafusion/common/src/functional_dependencies.rs
+++ b/datafusion/common/src/functional_dependencies.rs
@@ -36,35 +36,31 @@ pub enum Constraint {
 }
 
 /// This object encapsulates a list of functional constraints:
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
+#[derive(Clone, Debug, Default, Eq, Hash, PartialEq, PartialOrd)]
 pub struct Constraints {
     inner: Vec<Constraint>,
 }
 
 impl Constraints {
-    /// Create empty constraints
-    pub fn empty() -> Self {
-        Constraints::new_unverified(vec![])
-    }
-
     /// Create a new [`Constraints`] object from the given `constraints`.
-    /// Users should use the [`Constraints::empty`] or [`SqlToRel::new_constraint_from_table_constraints`] functions
-    /// for constructing [`Constraints`]. This constructor is for internal
-    /// purposes only and does not check whether the argument is valid. The user
-    /// is responsible for supplying a valid vector of [`Constraint`] objects.
+    /// Users should use the [`Constraints::default`] or [`SqlToRel::new_constraint_from_table_constraints`]
+    /// functions for constructing [`Constraints`] instances. This constructor
+    /// is for internal purposes only and does not check whether the argument
+    /// is valid. The user is responsible for supplying a valid vector of
+    /// [`Constraint`] objects.
     ///
     /// [`SqlToRel::new_constraint_from_table_constraints`]: https://docs.rs/datafusion/latest/datafusion/sql/planner/struct.SqlToRel.html#method.new_constraint_from_table_constraints
     pub fn new_unverified(constraints: Vec<Constraint>) -> Self {
         Self { inner: constraints }
     }
 
-    /// Check whether constraints is empty
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
+    /// Extends the current constraints with the given `other` constraints.
+    pub fn extend(&mut self, other: Constraints) {
+        self.inner.extend(other.inner);
     }
 
-    /// Projects constraints using the given projection indices.
-    /// Returns None if any of the constraint columns are not included in the projection.
+    /// Projects constraints using the given projection indices. Returns `None`
+    /// if any of the constraint columns are not included in the projection.
     pub fn project(&self, proj_indices: &[usize]) -> Option<Self> {
         let projected = self
             .inner
@@ -74,14 +70,14 @@ impl Constraints {
                     Constraint::PrimaryKey(indices) => {
                         let new_indices =
                             update_elements_with_matching_indices(indices, proj_indices);
-                        // Only keep constraint if all columns are preserved
+                        // Only keep the constraint if all columns are preserved:
                         (new_indices.len() == indices.len())
                             .then_some(Constraint::PrimaryKey(new_indices))
                     }
                     Constraint::Unique(indices) => {
                         let new_indices =
                             update_elements_with_matching_indices(indices, proj_indices);
-                        // Only keep constraint if all columns are preserved
+                        // Only keep the constraint if all columns are preserved:
                         (new_indices.len() == indices.len())
                             .then_some(Constraint::Unique(new_indices))
                     }
@@ -93,15 +89,9 @@ impl Constraints {
     }
 }
 
-impl Default for Constraints {
-    fn default() -> Self {
-        Constraints::empty()
-    }
-}
-
 impl IntoIterator for Constraints {
     type Item = Constraint;
-    type IntoIter = IntoIter<Constraint>;
+    type IntoIter = IntoIter<Self::Item>;
 
     fn into_iter(self) -> Self::IntoIter {
         self.inner.into_iter()
@@ -374,7 +364,7 @@ impl FunctionalDependencies {
                 // These joins preserve functional dependencies of the left side:
                 left_func_dependencies
             }
-            JoinType::RightSemi | JoinType::RightAnti => {
+            JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
                 // These joins preserve functional dependencies of the right side:
                 right_func_dependencies
             }
diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs
index e78d42257b9cb..255525b92e0c0 100644
--- a/datafusion/common/src/hash_utils.rs
+++ b/datafusion/common/src/hash_utils.rs
@@ -17,25 +17,30 @@
 
 //! Functionality used both on logical and physical plans
 
-#[cfg(not(feature = "force_hash_collisions"))]
-use std::sync::Arc;
-
-use ahash::RandomState;
 use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::array::*;
+use arrow::compute::take;
 use arrow::datatypes::*;
 #[cfg(not(feature = "force_hash_collisions"))]
 use arrow::{downcast_dictionary_array, downcast_primitive_array};
+use foldhash::fast::FixedState;
+use itertools::Itertools;
+use std::collections::HashMap;
+use std::hash::{BuildHasher, Hash, Hasher};
+
+/// The hash random state used throughout DataFusion for hashing.
+pub type RandomState = FixedState;
 
 #[cfg(not(feature = "force_hash_collisions"))]
 use crate::cast::{
     as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
-    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
-    as_string_array, as_string_view_array, as_struct_array,
+    as_generic_binary_array, as_large_list_array, as_large_list_view_array,
+    as_list_array, as_list_view_array, as_map_array, as_string_array,
+    as_string_view_array, as_struct_array, as_union_array,
 };
 use crate::error::Result;
-#[cfg(not(feature = "force_hash_collisions"))]
-use crate::error::_internal_err;
+use crate::error::{_internal_datafusion_err, _internal_err};
+use std::cell::RefCell;
 
 // Combines two hashes into one hash
 #[inline]
@@ -44,6 +49,94 @@ pub fn combine_hashes(l: u64, r: u64) -> u64 {
     hash.wrapping_mul(37).wrapping_add(r)
 }
 
+/// Maximum size for the thread-local hash buffer before truncation (4MB = 524,288 u64 elements).
+/// The goal of this is to avoid unbounded memory growth that would appear as a memory leak.
+/// We allow temporary allocations beyond this size, but after use the buffer is truncated
+/// to this size.
+const MAX_BUFFER_SIZE: usize = 524_288;
+
+thread_local! {
+    /// Thread-local buffer for hash computations to avoid repeated allocations.
+    /// The buffer is reused across calls and truncated if it exceeds MAX_BUFFER_SIZE.
+    /// Defaults to a capacity of 8192 u64 elements which is the default batch size.
+    /// This corresponds to 64KB of memory.
+    static HASH_BUFFER: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
+}
+
+/// Creates hashes for the given arrays using a thread-local buffer, then calls the provided callback
+/// with an immutable reference to the computed hashes.
+///
+/// This function manages a thread-local buffer to avoid repeated allocations. The buffer is automatically
+/// truncated if it exceeds `MAX_BUFFER_SIZE` after use.
+///
+/// # Arguments
+/// * `arrays` - The arrays to hash (must contain at least one array)
+/// * `random_state` - The random state for hashing
+/// * `callback` - A function that receives an immutable reference to the hash slice and returns a result
+///
+/// # Errors
+/// Returns an error if:
+/// - No arrays are provided
+/// - The function is called reentrantly (i.e., the callback invokes `with_hashes` again on the same thread)
+/// - The function is called during or after thread destruction
+///
+/// # Example
+/// ```ignore
+/// use datafusion_common::hash_utils::{with_hashes, RandomState};
+/// use arrow::array::{Int32Array, ArrayRef};
+/// use std::sync::Arc;
+///
+/// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+/// let random_state = RandomState::default();
+///
+/// let result = with_hashes([&array], &random_state, |hashes| {
+///     // Use the hashes here
+///     Ok(hashes.len())
+/// })?;
+/// ```
+pub fn with_hashes<I, T, F, R>(
+    arrays: I,
+    random_state: &RandomState,
+    callback: F,
+) -> Result<R>
+where
+    I: IntoIterator<Item = T>,
+    T: AsDynArray,
+    F: FnOnce(&[u64]) -> Result<R>,
+{
+    // Peek at the first array to determine buffer size without fully collecting
+    let mut iter = arrays.into_iter().peekable();
+
+    // Get the required size from the first array
+    let required_size = match iter.peek() {
+        Some(arr) => arr.as_dyn_array().len(),
+        None => return _internal_err!("with_hashes requires at least one array"),
+    };
+
+    HASH_BUFFER.try_with(|cell| {
+        let mut buffer = cell.try_borrow_mut()
+            .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?;
+
+        // Ensure buffer has sufficient length, clearing old values
+        buffer.clear();
+        buffer.resize(required_size, 0);
+
+        // Create hashes in the buffer - this consumes the iterator
+        create_hashes(iter, random_state, &mut buffer[..required_size])?;
+
+        // Execute the callback with an immutable slice
+        let result = callback(&buffer[..required_size])?;
+
+        // Cleanup: truncate if buffer grew too large
+        if buffer.capacity() > MAX_BUFFER_SIZE {
+            buffer.truncate(MAX_BUFFER_SIZE);
+            buffer.shrink_to_fit();
+        }
+
+        Ok(result)
+    }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))?
+}
+
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
     if mul_col {
@@ -60,12 +153,17 @@ fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col:
 
 pub trait HashValue {
     fn hash_one(&self, state: &RandomState) -> u64;
+    /// Write this value into an existing hasher (same data as `hash_one`).
+    fn hash_write(&self, hasher: &mut impl Hasher);
 }
 
 impl<T: HashValue + ?Sized> HashValue for &T {
     fn hash_one(&self, state: &RandomState) -> u64 {
         T::hash_one(self, state)
     }
+    fn hash_write(&self, hasher: &mut impl Hasher) {
+        T::hash_write(self, hasher)
+    }
 }
 
 macro_rules! hash_value {
@@ -74,10 +172,13 @@ macro_rules! hash_value {
             fn hash_one(&self, state: &RandomState) -> u64 {
                 state.hash_one(self)
             }
+            fn hash_write(&self, hasher: &mut impl Hasher) {
+                Hash::hash(self, hasher)
+            }
         })+
     };
 }
-hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
+hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
 hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
 
 macro_rules! hash_float_value {
@@ -86,14 +187,28 @@ macro_rules! hash_float_value {
             fn hash_one(&self, state: &RandomState) -> u64 {
                 state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
             }
+            fn hash_write(&self, hasher: &mut impl Hasher) {
+                hasher.write(&self.to_ne_bytes())
+            }
         })+
     };
 }
 hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
 
+/// Create a `SeedableRandomState` whose per-hasher seed incorporates `seed`.
+/// This folds the previous hash into the hasher's initial state so only the
+/// new value needs to pass through the hash function — same cost as `hash_one`.
+#[inline]
+fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState {
+    foldhash::fast::SeedableRandomState::with_seed(
+        seed,
+        foldhash::SharedSeed::global_fixed(),
+    )
+}
+
 /// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
-/// If `rehash==true` this combines the previous hash value in the buffer
-/// with the new hash using `combine_hashes`
+/// If `rehash==true` this folds the existing hash into the hasher state
+/// and hashes only the new value (avoiding a separate combine step).
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_array_primitive<T>(
     array: &PrimitiveArray<T>,
@@ -112,7 +227,9 @@ fn hash_array_primitive<T>(
     if array.null_count() == 0 {
         if rehash {
             for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
-                *hash = combine_hashes(value.hash_one(random_state), *hash);
+                let mut hasher = seeded_state(*hash).build_hasher();
+                value.hash_write(&mut hasher);
+                *hash = hasher.finish();
             }
         } else {
             for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
@@ -120,18 +237,16 @@ fn hash_array_primitive<T>(
             }
         }
     } else if rehash {
-        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
-            if !array.is_null(i) {
-                let value = unsafe { array.value_unchecked(i) };
-                *hash = combine_hashes(value.hash_one(random_state), *hash);
-            }
+        for i in array.nulls().unwrap().valid_indices() {
+            let value = unsafe { array.value_unchecked(i) };
+            let mut hasher = seeded_state(hashes_buffer[i]).build_hasher();
+            value.hash_write(&mut hasher);
+            hashes_buffer[i] = hasher.finish();
         }
     } else {
-        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
-            if !array.is_null(i) {
-                let value = unsafe { array.value_unchecked(i) };
-                *hash = value.hash_one(random_state);
-            }
+        for i in array.nulls().unwrap().valid_indices() {
+            let value = unsafe { array.value_unchecked(i) };
+            hashes_buffer[i] = value.hash_one(random_state);
         }
     }
 }
@@ -141,7 +256,7 @@ fn hash_array_primitive<T>(
 /// with the new hash using `combine_hashes`
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_array<T>(
-    array: T,
+    array: &T,
     random_state: &RandomState,
     hashes_buffer: &mut [u64],
     rehash: bool,
@@ -168,54 +283,255 @@ fn hash_array<T>(
             }
         }
     } else if rehash {
-        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
-            if !array.is_null(i) {
-                let value = unsafe { array.value_unchecked(i) };
-                *hash = combine_hashes(value.hash_one(random_state), *hash);
-            }
+        for i in array.nulls().unwrap().valid_indices() {
+            let value = unsafe { array.value_unchecked(i) };
+            hashes_buffer[i] =
+                combine_hashes(value.hash_one(random_state), hashes_buffer[i]);
         }
     } else {
-        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
-            if !array.is_null(i) {
-                let value = unsafe { array.value_unchecked(i) };
-                *hash = value.hash_one(random_state);
+        for i in array.nulls().unwrap().valid_indices() {
+            let value = unsafe { array.value_unchecked(i) };
+            hashes_buffer[i] = value.hash_one(random_state);
+        }
+    }
+}
+
+/// Hash a StringView or BytesView array
+///
+/// Templated to optimize inner loop based on presence of nulls and external buffers.
+///
+/// HAS_NULLS: do we have to check null in the inner loop
+/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes
+/// REHASH: if true, combining with existing hash, otherwise initializing
+#[inline(never)]
+fn hash_string_view_array_inner<
+    T: ByteViewType,
+    const HAS_NULLS: bool,
+    const HAS_BUFFERS: bool,
+    const REHASH: bool,
+>(
+    array: &GenericByteViewArray<T>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) {
+    assert_eq!(
+        hashes_buffer.len(),
+        array.len(),
+        "hashes_buffer and array should be of equal length"
+    );
+
+    let buffers = array.data_buffers();
+    let view_bytes = |view_len: u32, view: u128| {
+        let view = ByteView::from(view);
+        let offset = view.offset as usize;
+        // SAFETY: view is a valid view as it came from the array
+        unsafe {
+            let data = buffers.get_unchecked(view.buffer_index as usize);
+            data.get_unchecked(offset..offset + view_len as usize)
+        }
+    };
+
+    let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter());
+    for (i, (hash, &v)) in hashes_and_views.enumerate() {
+        if HAS_NULLS && array.is_null(i) {
+            continue;
+        }
+        let view_len = v as u32;
+        // all views are inlined, no need to access external buffers
+        if !HAS_BUFFERS || view_len <= 12 {
+            if REHASH {
+                let mut hasher = seeded_state(*hash).build_hasher();
+                v.hash_write(&mut hasher);
+                *hash = hasher.finish();
+            } else {
+                *hash = v.hash_one(random_state);
             }
+            continue;
+        }
+        // view is not inlined, so we need to hash the bytes as well
+        let value = view_bytes(view_len, v);
+        if REHASH {
+            let mut hasher = seeded_state(*hash).build_hasher();
+            value.hash_write(&mut hasher);
+            *hash = hasher.finish();
+        } else {
+            *hash = value.hash_one(random_state);
         }
     }
 }
 
-/// Hash the values in a dictionary array
+/// Builds hash values for array views and writes them into `hashes_buffer`
+/// If `rehash==true` this combines the previous hash value in the buffer
+/// with the new hash using `combine_hashes`
 #[cfg(not(feature = "force_hash_collisions"))]
-fn hash_dictionary<K: ArrowDictionaryKeyType>(
+fn hash_generic_byte_view_array<T: ByteViewType>(
+    array: &GenericByteViewArray<T>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) {
+    // instantiate the correct version based on presence of nulls and external buffers
+    match (
+        array.null_count() != 0,
+        !array.data_buffers().is_empty(),
+        rehash,
+    ) {
+        // no nulls or buffers ==> hash the inlined views directly
+        // don't call the inner function as Rust seems better able to inline this simpler code (2-3% faster)
+        (false, false, false) => {
+            for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
+                *hash = view.hash_one(random_state);
+            }
+        }
+        (false, false, true) => {
+            for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
+                let mut hasher = seeded_state(*hash).build_hasher();
+                view.hash_write(&mut hasher);
+                *hash = hasher.finish();
+            }
+        }
+        (false, true, false) => hash_string_view_array_inner::<T, false, true, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (false, true, true) => hash_string_view_array_inner::<T, false, true, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, false, false) => hash_string_view_array_inner::<T, true, false, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, false, true) => hash_string_view_array_inner::<T, true, false, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, true, false) => hash_string_view_array_inner::<T, true, true, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, true, true) => hash_string_view_array_inner::<T, true, true, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+    }
+}
+
+/// Hash dictionary array with compile-time specialization for null handling.
+///
+/// Uses const generics to eliminate runtim branching in the hot loop:
+/// - `HAS_NULL_KEYS`: Whether to check for null dictionary keys
+/// - `HAS_NULL_VALUES`: Whether to check for null dictionary values
+/// - `MULTI_COL`: Whether to combine with existing hash (true) or initialize (false)
+#[inline(never)]
+fn hash_dictionary_inner<
+    K: ArrowDictionaryKeyType,
+    const HAS_NULL_KEYS: bool,
+    const HAS_NULL_VALUES: bool,
+    const MULTI_COL: bool,
+>(
     array: &DictionaryArray<K>,
     random_state: &RandomState,
     hashes_buffer: &mut [u64],
-    multi_col: bool,
 ) -> Result<()> {
     // Hash each dictionary value once, and then use that computed
     // hash for each key value to avoid a potentially expensive
     // redundant hashing for large dictionary elements (e.g. strings)
-    let values = Arc::clone(array.values());
-    let mut dict_hashes = vec![0; values.len()];
-    create_hashes(&[values], random_state, &mut dict_hashes)?;
+    let dict_values = array.values();
+    let mut dict_hashes = vec![0; dict_values.len()];
+    create_hashes([dict_values], random_state, &mut dict_hashes)?;
 
-    // combine hash for each index in values
-    if multi_col {
+    if HAS_NULL_KEYS {
         for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
             if let Some(key) = key {
-                *hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
-            } // no update for Null, consistent with other hashes
+                let idx = key.as_usize();
+                if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
+                    if MULTI_COL {
+                        *hash = combine_hashes(dict_hashes[idx], *hash);
+                    } else {
+                        *hash = dict_hashes[idx];
+                    }
+                }
+            }
         }
     } else {
-        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
-            if let Some(key) = key {
-                *hash = dict_hashes[key.as_usize()]
-            } // no update for Null, consistent with other hashes
+        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().values()) {
+            let idx = key.as_usize();
+            if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
+                if MULTI_COL {
+                    *hash = combine_hashes(dict_hashes[idx], *hash);
+                } else {
+                    *hash = dict_hashes[idx];
+                }
+            }
         }
     }
     Ok(())
 }
 
+/// Hash the values in a dictionary array
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_dictionary<K: ArrowDictionaryKeyType>(
+    array: &DictionaryArray<K>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    multi_col: bool,
+) -> Result<()> {
+    let has_null_keys = array.keys().null_count() != 0;
+    let has_null_values = array.values().null_count() != 0;
+
+    // Dispatcher based on null presence and multi-column mode
+    // Should reduce branching within hot loops
+    match (has_null_keys, has_null_values, multi_col) {
+        (false, false, false) => hash_dictionary_inner::<K, false, false, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (false, false, true) => hash_dictionary_inner::<K, false, false, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (false, true, false) => hash_dictionary_inner::<K, false, true, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (false, true, true) => hash_dictionary_inner::<K, false, true, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, false, false) => hash_dictionary_inner::<K, true, false, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, false, true) => hash_dictionary_inner::<K, true, false, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, true, false) => hash_dictionary_inner::<K, true, true, false>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+        (true, true, true) => hash_dictionary_inner::<K, true, true, true>(
+            array,
+            random_state,
+            hashes_buffer,
+        ),
+    }
+}
+
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_struct_array(
     array: &StructArray,
@@ -225,19 +541,21 @@ fn hash_struct_array(
     let nulls = array.nulls();
     let row_len = array.len();
 
-    let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
-        nulls.valid_indices().collect()
-    } else {
-        (0..row_len).collect()
-    };
-
     // Create hashes for each row that combines the hashes over all the column at that row.
     let mut values_hashes = vec![0u64; row_len];
     create_hashes(array.columns(), random_state, &mut values_hashes)?;
 
-    for i in valid_row_indices {
-        let hash = &mut hashes_buffer[i];
-        *hash = combine_hashes(*hash, values_hashes[i]);
+    // Separate paths to avoid allocating Vec when there are no nulls
+    if let Some(nulls) = nulls {
+        for i in nulls.valid_indices() {
+            let hash = &mut hashes_buffer[i];
+            *hash = combine_hashes(*hash, values_hashes[i]);
+        }
+    } else {
+        for i in 0..row_len {
+            let hash = &mut hashes_buffer[i];
+            *hash = combine_hashes(*hash, values_hashes[i]);
+        }
     }
 
     Ok(())
@@ -254,15 +572,29 @@ fn hash_map_array(
     let offsets = array.offsets();
 
     // Create hashes for each entry in each row
-    let mut values_hashes = vec![0u64; array.entries().len()];
-    create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
+    let first_offset = offsets.first().copied().unwrap_or_default() as usize;
+    let last_offset = offsets.last().copied().unwrap_or_default() as usize;
+    let entries_len = last_offset - first_offset;
+
+    // Only hash the entries that are actually referenced
+    let mut values_hashes = vec![0u64; entries_len];
+    let entries = array.entries();
+    let sliced_columns: Vec<ArrayRef> = entries
+        .columns()
+        .iter()
+        .map(|col| col.slice(first_offset, entries_len))
+        .collect();
+    create_hashes(&sliced_columns, random_state, &mut values_hashes)?;
 
     // Combine the hashes for entries on each row with each other and previous hash for that row
+    // Adjust indices by first_offset since values_hashes is sliced starting from first_offset
     if let Some(nulls) = nulls {
         for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
             if nulls.is_valid(i) {
                 let hash = &mut hashes_buffer[i];
-                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+                for values_hash in &values_hashes
+                    [start.as_usize() - first_offset..stop.as_usize() - first_offset]
+                {
                     *hash = combine_hashes(*hash, *values_hash);
                 }
             }
@@ -270,7 +602,9 @@ fn hash_map_array(
     } else {
         for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
             let hash = &mut hashes_buffer[i];
-            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+            for values_hash in &values_hashes
+                [start.as_usize() - first_offset..stop.as_usize() - first_offset]
+            {
                 *hash = combine_hashes(*hash, *values_hash);
             }
         }
@@ -288,24 +622,80 @@ fn hash_list_array<OffsetSize>(
 where
     OffsetSize: OffsetSizeTrait,
 {
-    let values = Arc::clone(array.values());
+    // In case values is sliced, hash only the bytes used by the offsets of this ListArray
+    let first_offset = array.value_offsets().first().cloned().unwrap_or_default();
+    let last_offset = array.value_offsets().last().cloned().unwrap_or_default();
+    let value_bytes_len = (last_offset - first_offset).as_usize();
+    let mut values_hashes = vec![0u64; value_bytes_len];
+    create_hashes(
+        [array
+            .values()
+            .slice(first_offset.as_usize(), value_bytes_len)],
+        random_state,
+        &mut values_hashes,
+    )?;
+
+    if array.null_count() > 0 {
+        for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate()
+        {
+            if array.is_valid(i) {
+                let hash = &mut hashes_buffer[i];
+                for values_hash in &values_hashes[(*start - first_offset).as_usize()
+                    ..(*stop - first_offset).as_usize()]
+                {
+                    *hash = combine_hashes(*hash, *values_hash);
+                }
+            }
+        }
+    } else {
+        for ((start, stop), hash) in array
+            .value_offsets()
+            .iter()
+            .tuple_windows()
+            .zip(hashes_buffer.iter_mut())
+        {
+            for values_hash in &values_hashes
+                [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()]
+            {
+                *hash = combine_hashes(*hash, *values_hash);
+            }
+        }
+    }
+    Ok(())
+}
+
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_list_view_array<OffsetSize>(
+    array: &GenericListViewArray<OffsetSize>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()>
+where
+    OffsetSize: OffsetSizeTrait,
+{
+    let values = array.values();
     let offsets = array.value_offsets();
+    let sizes = array.value_sizes();
     let nulls = array.nulls();
     let mut values_hashes = vec![0u64; values.len()];
-    create_hashes(&[values], random_state, &mut values_hashes)?;
+    create_hashes([values], random_state, &mut values_hashes)?;
     if let Some(nulls) = nulls {
-        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
+        for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
             if nulls.is_valid(i) {
                 let hash = &mut hashes_buffer[i];
-                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+                let start = offset.as_usize();
+                let end = start + size.as_usize();
+                for values_hash in &values_hashes[start..end] {
                     *hash = combine_hashes(*hash, *values_hash);
                 }
             }
         }
     } else {
-        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
+        for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
             let hash = &mut hashes_buffer[i];
-            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
+            let start = offset.as_usize();
+            let end = start + size.as_usize();
+            for values_hash in &values_hashes[start..end] {
                 *hash = combine_hashes(*hash, *values_hash);
             }
         }
@@ -313,17 +703,145 @@ where
     Ok(())
 }
 
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_union_array(
+    array: &UnionArray,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    let DataType::Union(union_fields, _mode) = array.data_type() else {
+        unreachable!()
+    };
+
+    if array.is_dense() {
+        // Dense union: children only contain values of their type, so they're already compact.
+        // Use the default hashing approach which is efficient for dense unions.
+        hash_union_array_default(array, union_fields, random_state, hashes_buffer)
+    } else {
+        // Sparse union: each child has the same length as the union array.
+        // Optimization: only hash the elements that are actually referenced by type_ids,
+        // instead of hashing all K*N elements (where K = num types, N = array length).
+        hash_sparse_union_array(array, union_fields, random_state, hashes_buffer)
+    }
+}
+
+/// Default hashing for union arrays - hashes all elements of each child array fully.
+///
+/// This approach works for both dense and sparse union arrays:
+/// - Dense unions: children are compact (each child only contains values of that type)
+/// - Sparse unions: children have the same length as the union array
+///
+/// For sparse unions with 3+ types, the optimized take/scatter approach in
+/// `hash_sparse_union_array` is more efficient, but for 1-2 types or dense unions,
+/// this simpler approach is preferred.
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_union_array_default(
+    array: &UnionArray,
+    union_fields: &UnionFields,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    let mut child_hashes: HashMap<i8, Vec<u64>> =
+        HashMap::with_capacity(union_fields.len());
+
+    // Hash each child array fully
+    for (type_id, _field) in union_fields.iter() {
+        let child = array.child(type_id);
+        let mut child_hash_buffer = vec![0; child.len()];
+        create_hashes([child], random_state, &mut child_hash_buffer)?;
+
+        child_hashes.insert(type_id, child_hash_buffer);
+    }
+
+    // Combine hashes for each row using the appropriate child offset
+    // For dense unions: value_offset points to the actual position in the child
+    // For sparse unions: value_offset equals the row index
+    #[expect(clippy::needless_range_loop)]
+    for i in 0..array.len() {
+        let type_id = array.type_id(i);
+        let child_offset = array.value_offset(i);
+
+        let child_hash = child_hashes.get(&type_id).expect("invalid type_id");
+        hashes_buffer[i] = combine_hashes(hashes_buffer[i], child_hash[child_offset]);
+    }
+
+    Ok(())
+}
+
+/// Hash a sparse union array.
+/// Sparse unions have child arrays with the same length as the union array.
+/// For 3+ types, we optimize by only hashing the N elements that are actually used
+/// (via take/scatter), instead of hashing all K*N elements.
+///
+/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we use
+/// the default approach of hashing all children (same as dense unions).
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_sparse_union_array(
+    array: &UnionArray,
+    union_fields: &UnionFields,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    use std::collections::HashMap;
+
+    // For 1-2 types, the take/scatter overhead isn't worth it.
+    // Fall back to the default approach (same as dense union).
+    if union_fields.len() <= 2 {
+        return hash_union_array_default(
+            array,
+            union_fields,
+            random_state,
+            hashes_buffer,
+        );
+    }
+
+    let type_ids = array.type_ids();
+
+    // Group indices by type_id
+    let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
+    for (i, &type_id) in type_ids.iter().enumerate() {
+        indices_by_type.entry(type_id).or_default().push(i as u32);
+    }
+
+    // For each type, extract only the needed elements, hash them, and scatter back
+    for (type_id, _field) in union_fields.iter() {
+        if let Some(indices) = indices_by_type.get(&type_id) {
+            if indices.is_empty() {
+                continue;
+            }
+
+            let child = array.child(type_id);
+            let indices_array = UInt32Array::from(indices.clone());
+
+            // Extract only the elements we need using take()
+            let filtered = take(child.as_ref(), &indices_array, None)?;
+
+            // Hash the filtered array
+            let mut filtered_hashes = vec![0u64; filtered.len()];
+            create_hashes([&filtered], random_state, &mut filtered_hashes)?;
+
+            // Scatter hashes back to correct positions
+            for (hash, &idx) in filtered_hashes.iter().zip(indices.iter()) {
+                hashes_buffer[idx as usize] =
+                    combine_hashes(hashes_buffer[idx as usize], *hash);
+            }
+        }
+    }
+
+    Ok(())
+}
+
 #[cfg(not(feature = "force_hash_collisions"))]
 fn hash_fixed_list_array(
     array: &FixedSizeListArray,
     random_state: &RandomState,
     hashes_buffer: &mut [u64],
 ) -> Result<()> {
-    let values = Arc::clone(array.values());
+    let values = array.values();
     let value_length = array.value_length() as usize;
     let nulls = array.nulls();
     let mut values_hashes = vec![0u64; values.len()];
-    create_hashes(&[values], random_state, &mut values_hashes)?;
+    create_hashes([values], random_state, &mut values_hashes)?;
     if let Some(nulls) = nulls {
         for i in 0..array.len() {
             if nulls.is_valid(i) {
@@ -346,83 +864,246 @@ fn hash_fixed_list_array(
     Ok(())
 }
 
-/// Test version of `create_hashes` that produces the same value for
-/// all hashes (to test collisions)
-///
-/// See comments on `hashes_buffer` for more details
+/// Inner hash function for RunArray
+#[inline(never)]
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_run_array_inner<
+    R: RunEndIndexType,
+    const HAS_NULL_VALUES: bool,
+    const REHASH: bool,
+>(
+    array: &RunArray<R>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    // We find the relevant runs that cover potentially sliced arrays, so we can only hash those
+    // values. Then we find the runs that refer to the original runs and ensure that we apply
+    // hashes correctly to the sliced, whether sliced at the start, end, or both.
+    let array_offset = array.offset();
+    let array_len = array.len();
+
+    if array_len == 0 {
+        return Ok(());
+    }
+
+    let run_ends = array.run_ends();
+    let run_ends_values = run_ends.values();
+    let values = array.values();
+
+    let start_physical_index = array.get_start_physical_index();
+    // get_end_physical_index returns the inclusive last index, but we need the exclusive range end
+    // for the operations we use below.
+    let end_physical_index = array.get_end_physical_index() + 1;
+
+    let sliced_values = values.slice(
+        start_physical_index,
+        end_physical_index - start_physical_index,
+    );
+    let mut values_hashes = vec![0u64; sliced_values.len()];
+    create_hashes(
+        std::slice::from_ref(&sliced_values),
+        random_state,
+        &mut values_hashes,
+    )?;
+
+    let mut start_in_slice = 0;
+    for (adjusted_physical_index, &absolute_run_end) in run_ends_values
+        [start_physical_index..end_physical_index]
+        .iter()
+        .enumerate()
+    {
+        let absolute_run_end = absolute_run_end.as_usize();
+        let end_in_slice = (absolute_run_end - array_offset).min(array_len);
+
+        if HAS_NULL_VALUES && sliced_values.is_null(adjusted_physical_index) {
+            start_in_slice = end_in_slice;
+            continue;
+        }
+
+        let value_hash = values_hashes[adjusted_physical_index];
+        let run_slice = &mut hashes_buffer[start_in_slice..end_in_slice];
+
+        if REHASH {
+            for hash in run_slice.iter_mut() {
+                *hash = combine_hashes(value_hash, *hash);
+            }
+        } else {
+            run_slice.fill(value_hash);
+        }
+
+        start_in_slice = end_in_slice;
+    }
+
+    Ok(())
+}
+
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_run_array<R: RunEndIndexType>(
+    array: &RunArray<R>,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) -> Result<()> {
+    let has_null_values = array.values().null_count() != 0;
+
+    match (has_null_values, rehash) {
+        (false, false) => {
+            hash_run_array_inner::<R, false, false>(array, random_state, hashes_buffer)
+        }
+        (false, true) => {
+            hash_run_array_inner::<R, false, true>(array, random_state, hashes_buffer)
+        }
+        (true, false) => {
+            hash_run_array_inner::<R, true, false>(array, random_state, hashes_buffer)
+        }
+        (true, true) => {
+            hash_run_array_inner::<R, true, true>(array, random_state, hashes_buffer)
+        }
+    }
+}
+
+/// Internal helper function that hashes a single array and either initializes or combines
+/// the hash values in the buffer.
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_single_array(
+    array: &dyn Array,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) -> Result<()> {
+    downcast_primitive_array! {
+        array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
+        DataType::Null => hash_null(random_state, hashes_buffer, rehash),
+        DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
+        DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
+        DataType::Utf8View => hash_generic_byte_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
+        DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
+        DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
+        DataType::BinaryView => hash_generic_byte_view_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
+        DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
+        DataType::FixedSizeBinary(_) => {
+            let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
+            hash_array(&array, random_state, hashes_buffer, rehash)
+        }
+        DataType::Dictionary(_, _) => downcast_dictionary_array! {
+            array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
+            _ => unreachable!()
+        }
+        DataType::Struct(_) => {
+            let array = as_struct_array(array)?;
+            hash_struct_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::List(_) => {
+            let array = as_list_array(array)?;
+            hash_list_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::LargeList(_) => {
+            let array = as_large_list_array(array)?;
+            hash_list_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::ListView(_) => {
+            let array = as_list_view_array(array)?;
+            hash_list_view_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::LargeListView(_) => {
+            let array = as_large_list_view_array(array)?;
+            hash_list_view_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::Map(_, _) => {
+            let array = as_map_array(array)?;
+            hash_map_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::FixedSizeList(_,_) => {
+            let array = as_fixed_size_list_array(array)?;
+            hash_fixed_list_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::Union(_, _) => {
+            let array = as_union_array(array)?;
+            hash_union_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::RunEndEncoded(_, _) => downcast_run_array! {
+            array => hash_run_array(array, random_state, hashes_buffer, rehash)?,
+            _ => unreachable!()
+        }
+        _ => {
+            // This is internal because we should have caught this before.
+            return _internal_err!(
+                "Unsupported data type in hasher: {}",
+                array.data_type()
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Test version of `hash_single_array` that forces all hashes to collide to zero.
 #[cfg(feature = "force_hash_collisions")]
-pub fn create_hashes<'a>(
-    _arrays: &[ArrayRef],
+fn hash_single_array(
+    _array: &dyn Array,
     _random_state: &RandomState,
-    hashes_buffer: &'a mut Vec<u64>,
-) -> Result<&'a mut Vec<u64>> {
+    hashes_buffer: &mut [u64],
+    _rehash: bool,
+) -> Result<()> {
     for hash in hashes_buffer.iter_mut() {
         *hash = 0
     }
-    Ok(hashes_buffer)
+    Ok(())
 }
 
-/// Creates hash values for every row, based on the values in the
-/// columns.
+/// Something that can be returned as a `&dyn Array`.
+///
+/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`,
+/// and this seems the best way to do so.
+///
+/// We tried having it accept `AsRef<dyn Array>`
+/// but that is not implemented for and cannot be implemented for
+/// `&dyn Array` so callers that have the latter would not be able
+/// to call `create_hashes` directly. This shim trait makes it possible.
+pub trait AsDynArray {
+    fn as_dyn_array(&self) -> &dyn Array;
+}
+
+impl AsDynArray for dyn Array {
+    fn as_dyn_array(&self) -> &dyn Array {
+        self
+    }
+}
+
+impl AsDynArray for &dyn Array {
+    fn as_dyn_array(&self) -> &dyn Array {
+        *self
+    }
+}
+
+impl AsDynArray for ArrayRef {
+    fn as_dyn_array(&self) -> &dyn Array {
+        self.as_ref()
+    }
+}
+
+impl AsDynArray for &ArrayRef {
+    fn as_dyn_array(&self) -> &dyn Array {
+        self.as_ref()
+    }
+}
+
+/// Creates hash values for every row, based on the values in the columns.
 ///
 /// The number of rows to hash is determined by `hashes_buffer.len()`.
-/// `hashes_buffer` should be pre-sized appropriately
-#[cfg(not(feature = "force_hash_collisions"))]
-pub fn create_hashes<'a>(
-    arrays: &[ArrayRef],
+/// `hashes_buffer` should be pre-sized appropriately.
+pub fn create_hashes<'a, I, T>(
+    arrays: I,
     random_state: &RandomState,
-    hashes_buffer: &'a mut Vec<u64>,
-) -> Result<&'a mut Vec<u64>> {
-    for (i, col) in arrays.iter().enumerate() {
-        let array = col.as_ref();
+    hashes_buffer: &'a mut [u64],
+) -> Result<&'a mut [u64]>
+where
+    I: IntoIterator<Item = T>,
+    T: AsDynArray,
+{
+    for (i, array) in arrays.into_iter().enumerate() {
         // combine hashes with `combine_hashes` for all columns besides the first
         let rehash = i >= 1;
-        downcast_primitive_array! {
-            array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
-            DataType::Null => hash_null(random_state, hashes_buffer, rehash),
-            DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
-            DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
-            DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
-            DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
-            DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
-            DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
-            DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
-            DataType::FixedSizeBinary(_) => {
-                let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
-                hash_array(array, random_state, hashes_buffer, rehash)
-            }
-            DataType::Dictionary(_, _) => downcast_dictionary_array! {
-                array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
-                _ => unreachable!()
-            }
-            DataType::Struct(_) => {
-                let array = as_struct_array(array)?;
-                hash_struct_array(array, random_state, hashes_buffer)?;
-            }
-            DataType::List(_) => {
-                let array = as_list_array(array)?;
-                hash_list_array(array, random_state, hashes_buffer)?;
-            }
-            DataType::LargeList(_) => {
-                let array = as_large_list_array(array)?;
-                hash_list_array(array, random_state, hashes_buffer)?;
-            }
-            DataType::Map(_, _) => {
-                let array = as_map_array(array)?;
-                hash_map_array(array, random_state, hashes_buffer)?;
-            }
-            DataType::FixedSizeList(_,_) => {
-                let array = as_fixed_size_list_array(array)?;
-                hash_fixed_list_array(array, random_state, hashes_buffer)?;
-            }
-            _ => {
-                // This is internal because we should have caught this before.
-                return _internal_err!(
-                    "Unsupported data type in hasher: {}",
-                    col.data_type()
-                );
-            }
-        }
+        hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
     }
     Ok(hashes_buffer)
 }
@@ -445,8 +1126,8 @@ mod tests {
             .collect::<Decimal128Array>()
             .with_precision_and_scale(20, 3)
             .unwrap();
-        let array_ref = Arc::new(array);
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let array_ref: ArrayRef = Arc::new(array);
+        let random_state = RandomState::with_seed(0);
         let hashes_buff = &mut vec![0; array_ref.len()];
         let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
         assert_eq!(hashes.len(), 4);
@@ -456,19 +1137,25 @@ mod tests {
     #[test]
     fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
         let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
-        let hashes_buff = &mut vec![0; 0];
-        let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?;
+        let random_state = RandomState::with_seed(0);
+        let hashes_buff = &mut [0; 0];
+        let hashes = create_hashes(
+            &[Arc::new(empty_array) as ArrayRef],
+            &random_state,
+            hashes_buff,
+        )?;
         assert_eq!(hashes, &Vec::<u64>::new());
         Ok(())
     }
 
     #[test]
     fn create_hashes_for_float_arrays() -> Result<()> {
-        let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
-        let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
+        let f32_arr: ArrayRef =
+            Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
+        let f64_arr: ArrayRef =
+            Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let hashes_buff = &mut vec![0; f32_arr.len()];
         let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
         assert_eq!(hashes.len(), 4,);
@@ -494,18 +1181,15 @@ mod tests {
                     Some(b"Longer than 12 bytes string"),
                 ];
 
-                let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
-                let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
+                let binary_array: ArrayRef =
+                    Arc::new(binary.iter().cloned().collect::<$ARRAY>());
 
-                let random_state = RandomState::with_seeds(0, 0, 0, 0);
+                let random_state = RandomState::with_seed(0);
 
                 let mut binary_hashes = vec![0; binary.len()];
                 create_hashes(&[binary_array], &random_state, &mut binary_hashes)
                     .unwrap();
 
-                let mut ref_hashes = vec![0; binary.len()];
-                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
-
                 // Null values result in a zero hash,
                 for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
                     match val {
@@ -514,9 +1198,6 @@ mod tests {
                     }
                 }
 
-                // same logical values should hash to the same hash value
-                assert_eq!(binary_hashes, ref_hashes);
-
                 // Same values should map to same hash values
                 assert_eq!(binary[0], binary[5]);
                 assert_eq!(binary[4], binary[6]);
@@ -528,15 +1209,16 @@ mod tests {
     }
 
     create_hash_binary!(binary_array, BinaryArray);
+    create_hash_binary!(large_binary_array, LargeBinaryArray);
     create_hash_binary!(binary_view_array, BinaryViewArray);
 
     #[test]
     fn create_hashes_fixed_size_binary() -> Result<()> {
         let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
-        let fixed_size_binary_array =
+        let fixed_size_binary_array: ArrayRef =
             Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
         let hashes =
             create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
@@ -560,15 +1242,16 @@ mod tests {
                     Some("Longer than 12 bytes string"),
                 ];
 
-                let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
-                let dict_array = Arc::new(
+                let string_array: ArrayRef =
+                    Arc::new(strings.iter().cloned().collect::<$ARRAY>());
+                let dict_array: ArrayRef = Arc::new(
                     strings
                         .iter()
                         .cloned()
                         .collect::<DictionaryArray<Int8Type>>(),
                 );
 
-                let random_state = RandomState::with_seeds(0, 0, 0, 0);
+                let random_state = RandomState::with_seed(0);
 
                 let mut string_hashes = vec![0; strings.len()];
                 create_hashes(&[string_array], &random_state, &mut string_hashes)
@@ -603,21 +1286,90 @@ mod tests {
     create_hash_string!(string_view_array, StringArray);
     create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
 
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_run_array() -> Result<()> {
+        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
+        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seed(0);
+        let hashes_buff = &mut vec![0; array.len()];
+        let hashes = create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            hashes_buff,
+        )?;
+
+        assert_eq!(hashes.len(), 7);
+        assert_eq!(hashes[0], hashes[1]);
+        assert_eq!(hashes[2], hashes[3]);
+        assert_eq!(hashes[3], hashes[4]);
+        assert_eq!(hashes[5], hashes[6]);
+        assert_ne!(hashes[0], hashes[2]);
+        assert_ne!(hashes[2], hashes[5]);
+        assert_ne!(hashes[0], hashes[5]);
+
+        Ok(())
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_multi_column_hash_with_run_array() -> Result<()> {
+        let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]));
+        let values = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
+        let run_array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seed(0);
+        let mut one_col_hashes = vec![0; int_array.len()];
+        create_hashes(
+            &[Arc::clone(&int_array) as ArrayRef],
+            &random_state,
+            &mut one_col_hashes,
+        )?;
+
+        let mut two_col_hashes = vec![0; int_array.len()];
+        create_hashes(
+            &[
+                Arc::clone(&int_array) as ArrayRef,
+                Arc::clone(&run_array) as ArrayRef,
+            ],
+            &random_state,
+            &mut two_col_hashes,
+        )?;
+
+        assert_eq!(one_col_hashes.len(), 7);
+        assert_eq!(two_col_hashes.len(), 7);
+        assert_ne!(one_col_hashes, two_col_hashes);
+
+        let diff_0_vs_1_one_col = one_col_hashes[0] != one_col_hashes[1];
+        let diff_0_vs_1_two_col = two_col_hashes[0] != two_col_hashes[1];
+        assert_eq!(diff_0_vs_1_one_col, diff_0_vs_1_two_col);
+
+        let diff_2_vs_3_one_col = one_col_hashes[2] != one_col_hashes[3];
+        let diff_2_vs_3_two_col = two_col_hashes[2] != two_col_hashes[3];
+        assert_eq!(diff_2_vs_3_one_col, diff_2_vs_3_two_col);
+
+        Ok(())
+    }
+
     #[test]
     // Tests actual values of hashes, which are different if forcing collisions
     #[cfg(not(feature = "force_hash_collisions"))]
     fn create_hashes_for_dict_arrays() {
         let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
 
-        let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
-        let dict_array = Arc::new(
+        let string_array: ArrayRef =
+            Arc::new(strings.iter().cloned().collect::<StringArray>());
+        let dict_array: ArrayRef = Arc::new(
             strings
                 .iter()
                 .cloned()
                 .collect::<DictionaryArray<Int8Type>>(),
         );
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
 
         let mut string_hashes = vec![0; strings.len()];
         create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
@@ -662,7 +1414,7 @@ mod tests {
         ];
         let list_array =
             Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let mut hashes = vec![0; list_array.len()];
         create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
         assert_eq!(hashes[0], hashes[5]);
@@ -671,6 +1423,130 @@ mod tests {
         assert_eq!(hashes[1], hashes[6]); // null vs empty list
     }
 
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sliced_list_arrays() {
+        let data = vec![
+            Some(vec![Some(0), Some(1), Some(2)]),
+            None,
+            // Slice from here
+            Some(vec![Some(3), None, Some(5)]),
+            Some(vec![Some(3), None, Some(5)]),
+            None,
+            // To here
+            Some(vec![Some(0), Some(1), Some(2)]),
+            Some(vec![]),
+        ];
+        let list_array =
+            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
+        let list_array = list_array.slice(2, 3);
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; list_array.len()];
+        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
+        assert_eq!(hashes[0], hashes[1]);
+        assert_ne!(hashes[1], hashes[2]);
+    }
+
+    #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_list_view_arrays() {
+        use arrow::buffer::{NullBuffer, ScalarBuffer};
+
+        // Create values array: [0, 1, 2, 3, null, 5]
+        let values = Arc::new(Int32Array::from(vec![
+            Some(0),
+            Some(1),
+            Some(2),
+            Some(3),
+            None,
+            Some(5),
+        ])) as ArrayRef;
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+
+        // Create ListView with the following logical structure:
+        // Row 0: [0, 1, 2]        (offset=0, size=3)
+        // Row 1: null             (null bit set)
+        // Row 2: [3, null, 5]     (offset=3, size=3)
+        // Row 3: [3, null, 5]     (offset=3, size=3) - same as row 2
+        // Row 4: null             (null bit set)
+        // Row 5: [0, 1, 2]        (offset=0, size=3) - same as row 0
+        // Row 6: []               (offset=0, size=0) - empty list
+        let offsets = ScalarBuffer::from(vec![0i32, 0, 3, 3, 0, 0, 0]);
+        let sizes = ScalarBuffer::from(vec![3i32, 0, 3, 3, 0, 3, 0]);
+        let nulls = Some(NullBuffer::from(vec![
+            true, false, true, true, false, true, true,
+        ]));
+
+        let list_view_array =
+            Arc::new(ListViewArray::new(field, offsets, sizes, values, nulls))
+                as ArrayRef;
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; list_view_array.len()];
+        create_hashes(&[list_view_array], &random_state, &mut hashes).unwrap();
+
+        assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2]
+        assert_eq!(hashes[1], hashes[4]); // both null
+        assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5]
+        assert_eq!(hashes[1], hashes[6]); // null vs empty list
+
+        // Negative tests: different content should produce different hashes
+        assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5]
+        assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs []
+        assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs []
+    }
+
+    #[test]
+    // Tests actual values of hashes, which are different if forcing collisions
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_large_list_view_arrays() {
+        use arrow::buffer::{NullBuffer, ScalarBuffer};
+
+        // Create values array: [0, 1, 2, 3, null, 5]
+        let values = Arc::new(Int32Array::from(vec![
+            Some(0),
+            Some(1),
+            Some(2),
+            Some(3),
+            None,
+            Some(5),
+        ])) as ArrayRef;
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+
+        // Create LargeListView with the following logical structure:
+        // Row 0: [0, 1, 2]        (offset=0, size=3)
+        // Row 1: null             (null bit set)
+        // Row 2: [3, null, 5]     (offset=3, size=3)
+        // Row 3: [3, null, 5]     (offset=3, size=3) - same as row 2
+        // Row 4: null             (null bit set)
+        // Row 5: [0, 1, 2]        (offset=0, size=3) - same as row 0
+        // Row 6: []               (offset=0, size=0) - empty list
+        let offsets = ScalarBuffer::from(vec![0i64, 0, 3, 3, 0, 0, 0]);
+        let sizes = ScalarBuffer::from(vec![3i64, 0, 3, 3, 0, 3, 0]);
+        let nulls = Some(NullBuffer::from(vec![
+            true, false, true, true, false, true, true,
+        ]));
+
+        let large_list_view_array = Arc::new(LargeListViewArray::new(
+            field, offsets, sizes, values, nulls,
+        )) as ArrayRef;
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; large_list_view_array.len()];
+        create_hashes(&[large_list_view_array], &random_state, &mut hashes).unwrap();
+
+        assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2]
+        assert_eq!(hashes[1], hashes[4]); // both null
+        assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5]
+        assert_eq!(hashes[1], hashes[6]); // null vs empty list
+
+        // Negative tests: different content should produce different hashes
+        assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5]
+        assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs []
+        assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs []
+    }
+
     #[test]
     // Tests actual values of hashes, which are different if forcing collisions
     #[cfg(not(feature = "force_hash_collisions"))]
@@ -687,7 +1563,7 @@ mod tests {
             Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
                 data, 3,
             )) as ArrayRef;
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let mut hashes = vec![0; list_array.len()];
         create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
         assert_eq!(hashes[0], hashes[5]);
@@ -737,7 +1613,7 @@ mod tests {
 
         let array = Arc::new(struct_array) as ArrayRef;
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let mut hashes = vec![0; array.len()];
         create_hashes(&[array], &random_state, &mut hashes).unwrap();
         assert_eq!(hashes[0], hashes[1]);
@@ -774,7 +1650,7 @@ mod tests {
         assert!(struct_array.is_valid(1));
 
         let array = Arc::new(struct_array) as ArrayRef;
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let mut hashes = vec![0; array.len()];
         create_hashes(&[array], &random_state, &mut hashes).unwrap();
         assert_eq!(hashes[0], hashes[1]);
@@ -827,7 +1703,7 @@ mod tests {
 
         let array = Arc::new(builder.finish()) as ArrayRef;
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let mut hashes = vec![0; array.len()];
         create_hashes(&[array], &random_state, &mut hashes).unwrap();
         assert_eq!(hashes[0], hashes[1]); // same value
@@ -845,15 +1721,16 @@ mod tests {
         let strings1 = [Some("foo"), None, Some("bar")];
         let strings2 = [Some("blarg"), Some("blah"), None];
 
-        let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
-        let dict_array = Arc::new(
+        let string_array: ArrayRef =
+            Arc::new(strings1.iter().cloned().collect::<StringArray>());
+        let dict_array: ArrayRef = Arc::new(
             strings2
                 .iter()
                 .cloned()
                 .collect::<DictionaryArray<Int32Type>>(),
         );
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
 
         let mut one_col_hashes = vec![0; strings1.len()];
         create_hashes(
@@ -876,4 +1753,345 @@ mod tests {
 
         assert_ne!(one_col_hashes, two_col_hashes);
     }
+
+    #[test]
+    fn test_create_hashes_from_arrays() {
+        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        let float_array: ArrayRef =
+            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
+
+        let random_state = RandomState::with_seed(0);
+        let hashes_buff = &mut vec![0; int_array.len()];
+        let hashes =
+            create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
+        assert_eq!(hashes.len(), 4,);
+    }
+
+    #[test]
+    fn test_create_hashes_from_dyn_arrays() {
+        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        let float_array: ArrayRef =
+            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
+
+        // Verify that we can call create_hashes with only &dyn Array
+        fn test(arr1: &dyn Array, arr2: &dyn Array) {
+            let random_state = RandomState::with_seed(0);
+            let hashes_buff = &mut vec![0; arr1.len()];
+            let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
+            assert_eq!(hashes.len(), 4,);
+        }
+        test(&*int_array, &*float_array);
+    }
+
+    #[test]
+    fn test_create_hashes_equivalence() {
+        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        let random_state = RandomState::with_seed(0);
+
+        let mut hashes1 = vec![0; array.len()];
+        create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            &mut hashes1,
+        )
+        .unwrap();
+
+        let mut hashes2 = vec![0; array.len()];
+        create_hashes([array], &random_state, &mut hashes2).unwrap();
+
+        assert_eq!(hashes1, hashes2);
+    }
+
+    #[test]
+    fn test_with_hashes() {
+        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        let random_state = RandomState::with_seed(0);
+
+        // Test that with_hashes produces the same results as create_hashes
+        let mut expected_hashes = vec![0; array.len()];
+        create_hashes([&array], &random_state, &mut expected_hashes).unwrap();
+
+        let result = with_hashes([&array], &random_state, |hashes| {
+            assert_eq!(hashes.len(), 4);
+            // Verify hashes match expected values
+            assert_eq!(hashes, &expected_hashes[..]);
+            // Return a copy of the hashes
+            Ok(hashes.to_vec())
+        })
+        .unwrap();
+
+        // Verify callback result is returned correctly
+        assert_eq!(result, expected_hashes);
+    }
+
+    #[test]
+    fn test_with_hashes_multi_column() {
+        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let str_array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+        let random_state = RandomState::with_seed(0);
+
+        // Test multi-column hashing
+        let mut expected_hashes = vec![0; int_array.len()];
+        create_hashes(
+            [&int_array, &str_array],
+            &random_state,
+            &mut expected_hashes,
+        )
+        .unwrap();
+
+        with_hashes([&int_array, &str_array], &random_state, |hashes| {
+            assert_eq!(hashes.len(), 3);
+            assert_eq!(hashes, &expected_hashes[..]);
+            Ok(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_with_hashes_empty_arrays() {
+        let random_state = RandomState::with_seed(0);
+
+        // Test that passing no arrays returns an error
+        let empty: [&ArrayRef; 0] = [];
+        let result = with_hashes(empty, &random_state, |_hashes| Ok(()));
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("requires at least one array")
+        );
+    }
+
+    #[test]
+    fn test_with_hashes_reentrancy() {
+        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6]));
+        let random_state = RandomState::with_seed(0);
+
+        // Test that reentrant calls return an error instead of panicking
+        let result = with_hashes([&array], &random_state, |_hashes| {
+            // Try to call with_hashes again inside the callback
+            with_hashes([&array2], &random_state, |_inner_hashes| Ok(()))
+        });
+
+        assert!(result.is_err());
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("reentrantly") || err_msg.contains("cannot be called"),
+            "Error message should mention reentrancy: {err_msg}",
+        );
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sparse_union_arrays() {
+        // logical array: [int(5), str("foo"), int(10), int(5)]
+        let int_array = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]);
+        let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
+
+        let type_ids = vec![0_i8, 1, 0, 0].into();
+        let children = vec![
+            Arc::new(int_array) as ArrayRef,
+            Arc::new(str_array) as ArrayRef,
+        ];
+
+        let union_fields = [
+            (0, Arc::new(Field::new("a", DataType::Int32, true))),
+            (1, Arc::new(Field::new("b", DataType::Utf8, true))),
+        ]
+        .into_iter()
+        .collect();
+
+        let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
+        let array_ref = Arc::new(array) as ArrayRef;
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; array_ref.len()];
+        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
+
+        // Rows 0 and 3 both have type_id=0 (int) with value 5
+        assert_eq!(hashes[0], hashes[3]);
+        // Row 0 (int 5) vs Row 2 (int 10) - different values
+        assert_ne!(hashes[0], hashes[2]);
+        // Row 0 (int) vs Row 1 (string) - different types
+        assert_ne!(hashes[0], hashes[1]);
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sparse_union_arrays_with_nulls() {
+        // logical array: [int(5), str("foo"), int(null), str(null)]
+        let int_array = Int32Array::from(vec![Some(5), None, None, None]);
+        let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
+
+        let type_ids = vec![0, 1, 0, 1].into();
+        let children = vec![
+            Arc::new(int_array) as ArrayRef,
+            Arc::new(str_array) as ArrayRef,
+        ];
+
+        let union_fields = [
+            (0, Arc::new(Field::new("a", DataType::Int32, true))),
+            (1, Arc::new(Field::new("b", DataType::Utf8, true))),
+        ]
+        .into_iter()
+        .collect();
+
+        let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
+        let array_ref = Arc::new(array) as ArrayRef;
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; array_ref.len()];
+        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
+
+        // row 2 (int null) and row 3 (str null) should have the same hash
+        // because they are both null values
+        assert_eq!(hashes[2], hashes[3]);
+
+        // row 0 (int 5) vs row 2 (int null) - different (value vs null)
+        assert_ne!(hashes[0], hashes[2]);
+
+        // row 1 (str "foo") vs row 3 (str null) - different (value vs null)
+        assert_ne!(hashes[1], hashes[3]);
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_dense_union_arrays() {
+        // creates a dense union array with int and string types
+        // [67, "norm", 100, "macdonald", 67]
+        let int_array = Int32Array::from(vec![67, 100, 67]);
+        let str_array = StringArray::from(vec!["norm", "macdonald"]);
+
+        let type_ids = vec![0, 1, 0, 1, 0].into();
+        let offsets = vec![0, 0, 1, 1, 2].into();
+        let children = vec![
+            Arc::new(int_array) as ArrayRef,
+            Arc::new(str_array) as ArrayRef,
+        ];
+
+        let union_fields = [
+            (0, Arc::new(Field::new("a", DataType::Int32, false))),
+            (1, Arc::new(Field::new("b", DataType::Utf8, false))),
+        ]
+        .into_iter()
+        .collect();
+
+        let array =
+            UnionArray::try_new(union_fields, type_ids, Some(offsets), children).unwrap();
+        let array_ref = Arc::new(array) as ArrayRef;
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; array_ref.len()];
+        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
+
+        // 67 vs "norm"
+        assert_ne!(hashes[0], hashes[1]);
+        // 67 vs 100
+        assert_ne!(hashes[0], hashes[2]);
+        // "norm" vs "macdonald"
+        assert_ne!(hashes[1], hashes[3]);
+        // 100 vs "macdonald"
+        assert_ne!(hashes[2], hashes[3]);
+        // 67 vs 67
+        assert_eq!(hashes[0], hashes[4]);
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn create_hashes_for_sliced_run_array() -> Result<()> {
+        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
+        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seed(0);
+        let mut full_hashes = vec![0; array.len()];
+        create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            &mut full_hashes,
+        )?;
+
+        let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef;
+        let sliced_array = array_ref.slice(2, 3);
+
+        let mut sliced_hashes = vec![0; sliced_array.len()];
+        create_hashes(
+            std::slice::from_ref(&sliced_array),
+            &random_state,
+            &mut sliced_hashes,
+        )?;
+
+        assert_eq!(sliced_hashes.len(), 3);
+        assert_eq!(sliced_hashes[0], sliced_hashes[1]);
+        assert_eq!(sliced_hashes[1], sliced_hashes[2]);
+        assert_eq!(&sliced_hashes, &full_hashes[2..5]);
+
+        Ok(())
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn test_run_array_with_nulls() -> Result<()> {
+        let values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
+        let run_ends = Arc::new(Int32Array::from(vec![2, 4, 6]));
+        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
+
+        let random_state = RandomState::with_seed(0);
+        let mut hashes = vec![0; array.len()];
+        create_hashes(
+            &[Arc::clone(&array) as ArrayRef],
+            &random_state,
+            &mut hashes,
+        )?;
+
+        assert_eq!(hashes[0], hashes[1]);
+        assert_ne!(hashes[0], 0);
+        assert_eq!(hashes[2], hashes[3]);
+        assert_eq!(hashes[2], 0);
+        assert_eq!(hashes[4], hashes[5]);
+        assert_ne!(hashes[4], 0);
+        assert_ne!(hashes[0], hashes[4]);
+
+        Ok(())
+    }
+
+    #[test]
+    #[cfg(not(feature = "force_hash_collisions"))]
+    fn test_run_array_with_nulls_multicolumn() -> Result<()> {
+        let primitive_array = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
+        let run_values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
+        let run_ends = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let run_array =
+            Arc::new(RunArray::try_new(&run_ends, run_values.as_ref()).unwrap());
+        let second_col = Arc::new(Int32Array::from(vec![100, 200, 300]));
+
+        let random_state = RandomState::with_seed(0);
+
+        let mut primitive_hashes = vec![0; 3];
+        create_hashes(
+            &[
+                Arc::clone(&primitive_array) as ArrayRef,
+                Arc::clone(&second_col) as ArrayRef,
+            ],
+            &random_state,
+            &mut primitive_hashes,
+        )?;
+
+        let mut run_hashes = vec![0; 3];
+        create_hashes(
+            &[
+                Arc::clone(&run_array) as ArrayRef,
+                Arc::clone(&second_col) as ArrayRef,
+            ],
+            &random_state,
+            &mut run_hashes,
+        )?;
+
+        assert_eq!(primitive_hashes, run_hashes);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/common/src/instant.rs b/datafusion/common/src/instant.rs
index 42f21c061c0c2..a5dfb28292581 100644
--- a/datafusion/common/src/instant.rs
+++ b/datafusion/common/src/instant.rs
@@ -22,7 +22,7 @@
 /// under `wasm` feature gate. It provides the same API as [`std::time::Instant`].
 pub type Instant = web_time::Instant;
 
-#[allow(clippy::disallowed_types)]
+#[expect(clippy::disallowed_types)]
 #[cfg(not(target_family = "wasm"))]
 /// DataFusion wrapper around [`std::time::Instant`]. This is only a type alias.
 pub type Instant = std::time::Instant;
diff --git a/datafusion/common/src/join_type.rs b/datafusion/common/src/join_type.rs
index ac81d977b7296..8855e993f2bc7 100644
--- a/datafusion/common/src/join_type.rs
+++ b/datafusion/common/src/join_type.rs
@@ -67,6 +67,11 @@ pub enum JoinType {
     ///
     /// [1]: http://btw2017.informatik.uni-stuttgart.de/slidesandpapers/F1-10-37/paper_web.pdf
     LeftMark,
+    /// Right Mark Join
+    ///
+    /// Same logic as the LeftMark Join above, however it returns a record for each record from the
+    /// right input.
+    RightMark,
 }
 
 impl JoinType {
@@ -87,13 +92,41 @@ impl JoinType {
             JoinType::RightSemi => JoinType::LeftSemi,
             JoinType::LeftAnti => JoinType::RightAnti,
             JoinType::RightAnti => JoinType::LeftAnti,
-            JoinType::LeftMark => {
-                unreachable!("LeftMark join type does not support swapping")
-            }
+            JoinType::LeftMark => JoinType::RightMark,
+            JoinType::RightMark => JoinType::LeftMark,
+        }
+    }
+
+    /// Whether each side of the join is preserved for ON-clause filter pushdown.
+    ///
+    /// It is only correct to push ON-clause filters below a join for preserved
+    /// inputs.
+    ///
+    /// # "Preserved" input definition
+    ///
+    /// A join side is preserved if the join returns all or a subset of the rows
+    /// from that side, such that each output row directly maps to an input row.
+    /// If a side is not preserved, the join can produce extra null rows that
+    /// don't map to any input row.
+    ///
+    /// # Return Value
+    ///
+    /// A tuple of booleans - (left_preserved, right_preserved).
+    pub fn on_lr_is_preserved(&self) -> (bool, bool) {
+        match self {
+            JoinType::Inner => (true, true),
+            JoinType::Left => (false, true),
+            JoinType::Right => (true, false),
+            JoinType::Full => (false, false),
+            JoinType::LeftSemi | JoinType::RightSemi => (true, true),
+            JoinType::LeftAnti => (false, true),
+            JoinType::RightAnti => (true, false),
+            JoinType::LeftMark => (false, true),
+            JoinType::RightMark => (true, false),
         }
     }
 
-    /// Does the join type support swapping  inputs?
+    /// Does the join type support swapping inputs?
     pub fn supports_swap(&self) -> bool {
         matches!(
             self,
@@ -105,6 +138,8 @@ impl JoinType {
                 | JoinType::RightSemi
                 | JoinType::LeftAnti
                 | JoinType::RightAnti
+                | JoinType::LeftMark
+                | JoinType::RightMark
         )
     }
 }
@@ -121,6 +156,7 @@ impl Display for JoinType {
             JoinType::LeftAnti => "LeftAnti",
             JoinType::RightAnti => "RightAnti",
             JoinType::LeftMark => "LeftMark",
+            JoinType::RightMark => "RightMark",
         };
         write!(f, "{join_type}")
     }
@@ -141,6 +177,7 @@ impl FromStr for JoinType {
             "LEFTANTI" => Ok(JoinType::LeftAnti),
             "RIGHTANTI" => Ok(JoinType::RightAnti),
             "LEFTMARK" => Ok(JoinType::LeftMark),
+            "RIGHTMARK" => Ok(JoinType::RightMark),
             _ => _not_impl_err!("The join type {s} does not exist or is not implemented"),
         }
     }
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 7b2c86d3975ff..fdd04f752455e 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -19,18 +19,17 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 mod column;
 mod dfschema;
 mod functional_dependencies;
 mod join_type;
 mod param_value;
-#[cfg(feature = "pyarrow")]
-mod pyarrow;
 mod schema_reference;
 mod table_reference;
 mod unnest;
@@ -39,13 +38,19 @@ pub mod alias;
 pub mod cast;
 pub mod config;
 pub mod cse;
+pub mod datatype;
 pub mod diagnostic;
 pub mod display;
+pub mod encryption;
 pub mod error;
 pub mod file_options;
 pub mod format;
 pub mod hash_utils;
 pub mod instant;
+pub mod metadata;
+pub mod nested_struct;
+mod null_equality;
+pub mod parquet_config;
 pub mod parsers;
 pub mod pruning;
 pub mod rounding;
@@ -56,29 +61,33 @@ pub mod test_util;
 pub mod tree_node;
 pub mod types;
 pub mod utils;
-
 /// Reexport arrow crate
 pub use arrow;
 pub use column::Column;
 pub use dfschema::{
-    qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema,
+    DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, qualified_name,
 };
 pub use diagnostic::Diagnostic;
+pub use display::human_readable::{
+    human_readable_count, human_readable_duration, human_readable_size, units,
+};
 pub use error::{
-    field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError,
-    SharedResult,
+    DataFusionError, Result, SchemaError, SharedResult, field_not_found,
+    unqualified_field_not_found,
 };
 pub use file_options::file_type::{
-    GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION,
-    DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION,
+    DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION,
+    DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, GetExt,
 };
 pub use functional_dependencies::{
+    Constraint, Constraints, Dependency, FunctionalDependence, FunctionalDependencies,
     aggregate_functional_dependencies, get_required_group_by_exprs_indices,
-    get_target_functional_dependencies, Constraint, Constraints, Dependency,
-    FunctionalDependence, FunctionalDependencies,
+    get_target_functional_dependencies,
 };
-use hashbrown::hash_map::DefaultHashBuilder;
+use hashbrown::DefaultHashBuilder;
 pub use join_type::{JoinConstraint, JoinSide, JoinType};
+pub use nested_struct::cast_column;
+pub use null_equality::NullEquality;
 pub use param_value::ParamValues;
 pub use scalar::{ScalarType, ScalarValue};
 pub use schema_reference::SchemaReference;
@@ -95,14 +104,20 @@ pub use utils::project_schema;
 // https://github.com/rust-lang/rust/pull/52234#issuecomment-976702997
 #[doc(hidden)]
 pub use error::{
-    _config_datafusion_err, _exec_datafusion_err, _internal_datafusion_err,
-    _not_impl_datafusion_err, _plan_datafusion_err, _resources_datafusion_err,
-    _substrait_datafusion_err,
+    _config_datafusion_err, _exec_datafusion_err, _ffi_datafusion_err,
+    _internal_datafusion_err, _not_impl_datafusion_err, _plan_datafusion_err,
+    _resources_datafusion_err, _substrait_datafusion_err,
 };
 
 // The HashMap and HashSet implementations that should be used as the uniform defaults
 pub type HashMap<K, V, S = DefaultHashBuilder> = hashbrown::HashMap<K, V, S>;
 pub type HashSet<T, S = DefaultHashBuilder> = hashbrown::HashSet<T, S>;
+pub mod hash_map {
+    pub use hashbrown::hash_map::Entry;
+}
+pub mod hash_set {
+    pub use hashbrown::hash_set::Entry;
+}
 
 /// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is
 /// not possible. In normal usage of DataFusion the downcast should always succeed.
@@ -123,10 +138,10 @@ macro_rules! downcast_value {
 // Not public API.
 #[doc(hidden)]
 pub mod __private {
-    use crate::error::_internal_datafusion_err;
     use crate::Result;
+    use crate::error::_internal_datafusion_err;
     use arrow::array::Array;
-    use std::any::{type_name, Any};
+    use std::any::{Any, type_name};
 
     #[doc(hidden)]
     pub trait DowncastArrayHelper {
@@ -136,10 +151,12 @@ pub mod __private {
     impl<T: Array + ?Sized> DowncastArrayHelper for T {
         fn downcast_array_helper<U: Any>(&self) -> Result<&U> {
             self.as_any().downcast_ref().ok_or_else(|| {
+                let actual_type = self.data_type();
+                let desired_type_name = type_name::<U>();
                 _internal_datafusion_err!(
                     "could not cast array of type {} to {}",
-                    self.data_type(),
-                    type_name::<U>()
+                    actual_type,
+                    desired_type_name
                 )
             })
         }
@@ -175,7 +192,7 @@ mod tests {
 
         assert_starts_with(
             error.to_string(),
-            "Internal error: could not cast array of type Int32 to arrow_array::array::primitive_array::PrimitiveArray<arrow_array::types::UInt64Type>"
+            "Internal error: could not cast array of type Int32 to arrow_array::array::primitive_array::PrimitiveArray<arrow_array::types::UInt64Type>",
         );
     }
 
diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs
new file mode 100644
index 0000000000000..d6d8fb7b0ed0c
--- /dev/null
+++ b/datafusion/common/src/metadata.rs
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{collections::BTreeMap, sync::Arc};
+
+use arrow::datatypes::{DataType, Field, FieldRef};
+use hashbrown::HashMap;
+
+use crate::{DataFusionError, ScalarValue, error::_plan_err};
+
+/// A [`ScalarValue`] with optional [`FieldMetadata`]
+#[derive(Debug, Clone)]
+pub struct ScalarAndMetadata {
+    pub value: ScalarValue,
+    pub metadata: Option<FieldMetadata>,
+}
+
+impl ScalarAndMetadata {
+    /// Create a new Literal from a scalar value with optional [`FieldMetadata`]
+    pub fn new(value: ScalarValue, metadata: Option<FieldMetadata>) -> Self {
+        Self { value, metadata }
+    }
+
+    /// Access the underlying [ScalarValue] storage
+    pub fn value(&self) -> &ScalarValue {
+        &self.value
+    }
+
+    /// Access the [FieldMetadata] attached to this value, if any
+    pub fn metadata(&self) -> Option<&FieldMetadata> {
+        self.metadata.as_ref()
+    }
+
+    /// Consume self and return components
+    pub fn into_inner(self) -> (ScalarValue, Option<FieldMetadata>) {
+        (self.value, self.metadata)
+    }
+
+    /// Cast this values's storage type
+    ///
+    /// This operation assumes that if the underlying [ScalarValue] can be casted
+    /// to a given type that any extension type represented by the metadata is also
+    /// valid.
+    pub fn cast_storage_to(
+        &self,
+        target_type: &DataType,
+    ) -> Result<Self, DataFusionError> {
+        let new_value = self.value().cast_to(target_type)?;
+        Ok(Self::new(new_value, self.metadata.clone()))
+    }
+}
+
+/// create a new ScalarAndMetadata from a ScalarValue without
+/// any metadata
+impl From<ScalarValue> for ScalarAndMetadata {
+    fn from(value: ScalarValue) -> Self {
+        Self::new(value, None)
+    }
+}
+
+/// Assert equality of data types where one or both sides may have field metadata
+///
+/// This currently compares absent metadata (e.g., one side was a DataType) and
+/// empty metadata (e.g., one side was a field where the field had no metadata)
+/// as equal and uses byte-for-byte comparison for the keys and values of the
+/// fields, even though this is potentially too strict for some cases (e.g.,
+/// extension types where extension metadata is represented by JSON, or cases
+/// where field metadata is orthogonal to the interpretation of the data type).
+///
+/// Returns a planning error with suitably formatted type representations if
+/// actual and expected do not compare to equal.
+pub fn check_metadata_with_storage_equal(
+    actual: (
+        &DataType,
+        Option<&std::collections::HashMap<String, String>>,
+    ),
+    expected: (
+        &DataType,
+        Option<&std::collections::HashMap<String, String>>,
+    ),
+    what: &str,
+    context: &str,
+) -> Result<(), DataFusionError> {
+    if actual.0 != expected.0 {
+        return _plan_err!(
+            "Expected {what} of type {}, got {}{context}",
+            format_type_and_metadata(expected.0, expected.1),
+            format_type_and_metadata(actual.0, actual.1)
+        );
+    }
+
+    let metadata_equal = match (actual.1, expected.1) {
+        (None, None) => true,
+        (None, Some(expected_metadata)) => expected_metadata.is_empty(),
+        (Some(actual_metadata), None) => actual_metadata.is_empty(),
+        (Some(actual_metadata), Some(expected_metadata)) => {
+            actual_metadata == expected_metadata
+        }
+    };
+
+    if !metadata_equal {
+        return _plan_err!(
+            "Expected {what} of type {}, got {}{context}",
+            format_type_and_metadata(expected.0, expected.1),
+            format_type_and_metadata(actual.0, actual.1)
+        );
+    }
+
+    Ok(())
+}
+
+/// Given a data type represented by storage and optional metadata, generate
+/// a user-facing string
+///
+/// This function exists to reduce the number of Field debug strings that are
+/// used to communicate type information in error messages and plan explain
+/// renderings.
+pub fn format_type_and_metadata(
+    data_type: &DataType,
+    metadata: Option<&std::collections::HashMap<String, String>>,
+) -> String {
+    match metadata {
+        Some(metadata) if !metadata.is_empty() => {
+            format!("{data_type}<{metadata:?}>")
+        }
+        _ => data_type.to_string(),
+    }
+}
+
+/// Literal metadata
+///
+/// Stores metadata associated with a literal expressions
+/// and is designed to be fast to `clone`.
+///
+/// This structure is used to store metadata associated with a literal expression, and it
+/// corresponds to the `metadata` field on [`Field`].
+///
+/// # Example: Create [`FieldMetadata`] from a [`Field`]
+/// ```
+/// # use std::collections::HashMap;
+/// # use datafusion_common::metadata::FieldMetadata;
+/// # use arrow::datatypes::{Field, DataType};
+/// # let field = Field::new("c1", DataType::Int32, true)
+/// #  .with_metadata(HashMap::from([("foo".to_string(), "bar".to_string())]));
+/// // Create a new `FieldMetadata` instance from a `Field`
+/// let metadata = FieldMetadata::new_from_field(&field);
+/// // There is also a `From` impl:
+/// let metadata = FieldMetadata::from(&field);
+/// ```
+///
+/// # Example: Update a [`Field`] with [`FieldMetadata`]
+/// ```
+/// # use datafusion_common::metadata::FieldMetadata;
+/// # use arrow::datatypes::{Field, DataType};
+/// # let field = Field::new("c1", DataType::Int32, true);
+/// # let metadata = FieldMetadata::new_from_field(&field);
+/// // Add any metadata from `FieldMetadata` to `Field`
+/// let updated_field = metadata.add_to_field(field);
+/// ```
+///
+/// For more background, please also see the [Implementing User Defined Types and Custom Metadata in DataFusion blog]
+///
+/// [Implementing User Defined Types and Custom Metadata in DataFusion blog]: https://datafusion.apache.org/blog/2025/09/21/custom-types-using-metadata
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+pub struct FieldMetadata {
+    /// The inner metadata of a literal expression, which is a map of string
+    /// keys to string values.
+    ///
+    /// Note this is not a `HashMap` because `HashMap` does not provide
+    /// implementations for traits like `Debug` and `Hash`.
+    inner: Arc<BTreeMap<String, String>>,
+}
+
+impl Default for FieldMetadata {
+    fn default() -> Self {
+        Self::new_empty()
+    }
+}
+
+impl FieldMetadata {
+    /// Create a new empty metadata instance.
+    pub fn new_empty() -> Self {
+        Self {
+            inner: Arc::new(BTreeMap::new()),
+        }
+    }
+
+    /// Merges two optional `FieldMetadata` instances, overwriting any existing
+    /// keys in `m` with keys from `n` if present.
+    ///
+    /// This function is commonly used in alias operations, particularly for literals
+    /// with metadata. When creating an alias expression, the metadata from the original
+    /// expression (such as a literal) is combined with any metadata specified on the alias.
+    ///
+    /// # Arguments
+    ///
+    /// * `m` - The first metadata (typically from the original expression like a literal)
+    /// * `n` - The second metadata (typically from the alias definition)
+    ///
+    /// # Merge Strategy
+    ///
+    /// - If both metadata instances exist, they are merged with `n` taking precedence
+    /// - Keys from `n` will overwrite keys from `m` if they have the same name
+    /// - If only one metadata instance exists, it is returned unchanged
+    /// - If neither exists, `None` is returned
+    ///
+    /// # Example usage
+    /// ```rust
+    /// use datafusion_common::metadata::FieldMetadata;
+    /// use std::collections::BTreeMap;
+    ///
+    /// // Create metadata for a literal expression
+    /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([
+    ///     ("source".to_string(), "constant".to_string()),
+    ///     ("type".to_string(), "int".to_string()),
+    /// ])));
+    ///
+    /// // Create metadata for an alias
+    /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([
+    ///     ("description".to_string(), "answer".to_string()),
+    ///     ("source".to_string(), "user".to_string()), // This will override literal's "source"
+    /// ])));
+    ///
+    /// // Merge the metadata
+    /// let merged = FieldMetadata::merge_options(
+    ///     literal_metadata.as_ref(),
+    ///     alias_metadata.as_ref(),
+    /// );
+    ///
+    /// // Result contains: {"source": "user", "type": "int", "description": "answer"}
+    /// assert!(merged.is_some());
+    /// ```
+    pub fn merge_options(
+        m: Option<&FieldMetadata>,
+        n: Option<&FieldMetadata>,
+    ) -> Option<FieldMetadata> {
+        match (m, n) {
+            (Some(m), Some(n)) => {
+                let mut merged = m.clone();
+                merged.extend(n.clone());
+                Some(merged)
+            }
+            (Some(m), None) => Some(m.clone()),
+            (None, Some(n)) => Some(n.clone()),
+            (None, None) => None,
+        }
+    }
+
+    /// Create a new metadata instance from a `Field`'s metadata.
+    pub fn new_from_field(field: &Field) -> Self {
+        let inner = field
+            .metadata()
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self {
+            inner: Arc::new(inner),
+        }
+    }
+
+    /// Create a new metadata instance from a map of string keys to string values.
+    pub fn new(inner: BTreeMap<String, String>) -> Self {
+        Self {
+            inner: Arc::new(inner),
+        }
+    }
+
+    /// Get the inner metadata as a reference to a `BTreeMap`.
+    pub fn inner(&self) -> &BTreeMap<String, String> {
+        &self.inner
+    }
+
+    /// Return the inner metadata
+    pub fn into_inner(self) -> Arc<BTreeMap<String, String>> {
+        self.inner
+    }
+
+    /// Adds metadata from `other` into `self`, overwriting any existing keys.
+    pub fn extend(&mut self, other: Self) {
+        if other.is_empty() {
+            return;
+        }
+        let other = Arc::unwrap_or_clone(other.into_inner());
+        Arc::make_mut(&mut self.inner).extend(other);
+    }
+
+    /// Returns true if the metadata is empty.
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    /// Returns the number of key-value pairs in the metadata.
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Convert this `FieldMetadata` into a `HashMap<String, String>`
+    pub fn to_hashmap(&self) -> std::collections::HashMap<String, String> {
+        self.inner
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect()
+    }
+
+    /// Updates the metadata on the Field with this metadata, if it is not empty.
+    pub fn add_to_field(&self, field: Field) -> Field {
+        if self.inner.is_empty() {
+            return field;
+        }
+
+        field.with_metadata(self.to_hashmap())
+    }
+
+    /// Updates the metadata on the FieldRef with this metadata, if it is not empty.
+    pub fn add_to_field_ref(&self, mut field_ref: FieldRef) -> FieldRef {
+        if self.inner.is_empty() {
+            return field_ref;
+        }
+
+        Arc::make_mut(&mut field_ref).set_metadata(self.to_hashmap());
+        field_ref
+    }
+}
+
+impl From<&Field> for FieldMetadata {
+    fn from(field: &Field) -> Self {
+        Self::new_from_field(field)
+    }
+}
+
+impl From<BTreeMap<String, String>> for FieldMetadata {
+    fn from(inner: BTreeMap<String, String>) -> Self {
+        Self::new(inner)
+    }
+}
+
+impl From<std::collections::HashMap<String, String>> for FieldMetadata {
+    fn from(map: std::collections::HashMap<String, String>) -> Self {
+        Self::new(map.into_iter().collect())
+    }
+}
+
+/// From reference
+impl From<&std::collections::HashMap<String, String>> for FieldMetadata {
+    fn from(map: &std::collections::HashMap<String, String>) -> Self {
+        let inner = map
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self::new(inner)
+    }
+}
+
+/// From hashbrown map
+impl From<HashMap<String, String>> for FieldMetadata {
+    fn from(map: HashMap<String, String>) -> Self {
+        let inner = map.into_iter().collect();
+        Self::new(inner)
+    }
+}
+
+impl From<&HashMap<String, String>> for FieldMetadata {
+    fn from(map: &HashMap<String, String>) -> Self {
+        let inner = map
+            .into_iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self::new(inner)
+    }
+}
diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs
new file mode 100644
index 0000000000000..bf2558f313069
--- /dev/null
+++ b/datafusion/common/src/nested_struct.rs
@@ -0,0 +1,1013 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{_plan_err, Result};
+use arrow::{
+    array::{Array, ArrayRef, StructArray, new_null_array},
+    compute::{CastOptions, cast_with_options},
+    datatypes::{DataType, DataType::Struct, Field, FieldRef},
+};
+use std::{collections::HashSet, sync::Arc};
+
+/// Cast a struct column to match target struct fields, handling nested structs recursively.
+///
+/// This function implements struct-to-struct casting with the assumption that **structs should
+/// always be allowed to cast to other structs**. However, the source column must already be
+/// a struct type - non-struct sources will result in an error.
+///
+/// ## Field Matching Strategy
+/// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive)
+/// - **No Positional Mapping**: Structs with no overlapping field names are rejected
+/// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type
+/// - **Missing Fields**: Target fields not present in the source are filled with null values
+/// - **Extra Fields**: Source fields not present in the target are ignored
+///
+/// ## Nested Struct Handling
+/// - Nested structs are handled recursively using the same casting rules
+/// - Each level of nesting follows the same field matching and null-filling strategy
+/// - This allows for complex struct transformations while maintaining data integrity
+///
+/// # Arguments
+/// * `source_col` - The source array to cast (must be a struct array)
+/// * `target_fields` - The target struct field definitions to cast to
+///
+/// # Returns
+/// A `Result<ArrayRef>` containing the cast struct array
+///
+/// # Errors
+/// Returns a `DataFusionError::Plan` if the source column is not a struct type
+fn cast_struct_column(
+    source_col: &ArrayRef,
+    target_fields: &[Arc<Field>],
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    if source_col.data_type() == &DataType::Null
+        || (!source_col.is_empty() && source_col.null_count() == source_col.len())
+    {
+        return Ok(new_null_array(
+            &Struct(target_fields.to_vec().into()),
+            source_col.len(),
+        ));
+    }
+
+    if let Some(source_struct) = source_col.as_any().downcast_ref::<StructArray>() {
+        let source_fields = source_struct.fields();
+        validate_struct_compatibility(source_fields, target_fields)?;
+        let mut fields: Vec<Arc<Field>> = Vec::with_capacity(target_fields.len());
+        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(target_fields.len());
+        let num_rows = source_col.len();
+
+        // Iterate target fields and pick source child by name when present.
+        for target_child_field in target_fields.iter() {
+            fields.push(Arc::clone(target_child_field));
+
+            let source_child_opt =
+                source_struct.column_by_name(target_child_field.name());
+
+            match source_child_opt {
+                Some(source_child_col) => {
+                    let adapted_child =
+                        cast_column(source_child_col, target_child_field, cast_options)
+                            .map_err(|e| {
+                            e.context(format!(
+                                "While casting struct field '{}'",
+                                target_child_field.name()
+                            ))
+                        })?;
+                    arrays.push(adapted_child);
+                }
+                None => {
+                    arrays.push(new_null_array(target_child_field.data_type(), num_rows));
+                }
+            }
+        }
+
+        let struct_array =
+            StructArray::new(fields.into(), arrays, source_struct.nulls().cloned());
+        Ok(Arc::new(struct_array))
+    } else {
+        // Return error if source is not a struct type
+        _plan_err!(
+            "Cannot cast column of type {} to struct type. Source must be a struct to cast to struct.",
+            source_col.data_type()
+        )
+    }
+}
+
+/// Cast a column to match the target field type, with special handling for nested structs.
+///
+/// This function serves as the main entry point for column casting operations. For struct
+/// types, it enforces that **only struct columns can be cast to struct types**.
+///
+/// ## Casting Behavior
+/// - **Struct Types**: Delegates to `cast_struct_column` for struct-to-struct casting only
+/// - **Non-Struct Types**: Uses Arrow's standard `cast` function for primitive type conversions
+///
+/// ## Cast Options
+/// The `cast_options` argument controls how Arrow handles values that cannot be represented
+/// in the target type. When `safe` is `false` (DataFusion's default) the cast will return an
+/// error if such a value is encountered. Setting `safe` to `true` instead produces `NULL`
+/// for out-of-range or otherwise invalid values. The options also allow customizing how
+/// temporal values are formatted when cast to strings.
+///
+/// ```
+/// use arrow::array::{ArrayRef, Int64Array};
+/// use arrow::compute::CastOptions;
+/// use arrow::datatypes::{DataType, Field};
+/// use datafusion_common::nested_struct::cast_column;
+/// use std::sync::Arc;
+///
+/// let source: ArrayRef = Arc::new(Int64Array::from(vec![1, i64::MAX]));
+/// let target = Field::new("ints", DataType::Int32, true);
+/// // Permit lossy conversions by producing NULL on overflow instead of erroring
+/// let options = CastOptions {
+///     safe: true,
+///     ..Default::default()
+/// };
+/// let result = cast_column(&source, &target, &options).unwrap();
+/// assert!(result.is_null(1));
+/// ```
+///
+/// ## Struct Casting Requirements
+/// The struct casting logic requires that the source column must already be a struct type.
+/// This makes the function useful for:
+/// - Schema evolution scenarios where struct layouts change over time
+/// - Data migration between different struct schemas
+/// - Type-safe data processing pipelines that maintain struct type integrity
+///
+/// # Arguments
+/// * `source_col` - The source array to cast
+/// * `target_field` - The target field definition (including type and metadata)
+/// * `cast_options` - Options that govern strictness and formatting of the cast
+///
+/// # Returns
+/// A `Result<ArrayRef>` containing the cast array
+///
+/// # Errors
+/// Returns an error if:
+/// - Attempting to cast a non-struct column to a struct type
+/// - Arrow's cast function fails for non-struct types
+/// - Memory allocation fails during struct construction
+/// - Invalid data type combinations are encountered
+pub fn cast_column(
+    source_col: &ArrayRef,
+    target_field: &Field,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    match target_field.data_type() {
+        Struct(target_fields) => {
+            cast_struct_column(source_col, target_fields, cast_options)
+        }
+        _ => Ok(cast_with_options(
+            source_col,
+            target_field.data_type(),
+            cast_options,
+        )?),
+    }
+}
+
+/// Validates compatibility between source and target struct fields for casting operations.
+///
+/// This function implements comprehensive struct compatibility checking by examining:
+/// - Field name matching between source and target structs
+/// - Type castability for each matching field (including recursive struct validation)
+/// - Proper handling of missing fields (target fields not in source are allowed - filled with nulls)
+/// - Proper handling of extra fields (source fields not in target are allowed - ignored)
+///
+/// # Compatibility Rules
+/// - **Field Matching**: Fields are matched by name (case-sensitive)
+/// - **Missing Target Fields**: Allowed - will be filled with null values during casting
+/// - **Extra Source Fields**: Allowed - will be ignored during casting
+/// - **Type Compatibility**: Each matching field must be castable using Arrow's type system
+/// - **Nested Structs**: Recursively validates nested struct compatibility
+///
+/// # Arguments
+/// * `source_fields` - Fields from the source struct type
+/// * `target_fields` - Fields from the target struct type
+///
+/// # Returns
+/// * `Ok(())` if the structs are compatible for casting
+/// * `Err(DataFusionError)` with detailed error message if incompatible
+///
+/// # Examples
+/// ```text
+/// // Compatible: source has extra field, target has missing field
+/// // Source: {a: i32, b: string, c: f64}
+/// // Target: {a: i64, d: bool}
+/// // Result: Ok(()) - 'a' can cast i32->i64, 'b','c' ignored, 'd' filled with nulls
+///
+/// // Incompatible: matching field has incompatible types
+/// // Source: {a: string}
+/// // Target: {a: binary}
+/// // Result: Err(...) - string cannot cast to binary
+/// ```
+///
+pub fn validate_struct_compatibility(
+    source_fields: &[FieldRef],
+    target_fields: &[FieldRef],
+) -> Result<()> {
+    let has_overlap = has_one_of_more_common_fields(source_fields, target_fields);
+    if !has_overlap {
+        return _plan_err!(
+            "Cannot cast struct with {} fields to {} fields because there is no field name overlap",
+            source_fields.len(),
+            target_fields.len()
+        );
+    }
+
+    // Check compatibility for each target field
+    for target_field in target_fields {
+        // Look for matching field in source by name
+        if let Some(source_field) = source_fields
+            .iter()
+            .find(|f| f.name() == target_field.name())
+        {
+            validate_field_compatibility(source_field, target_field)?;
+        } else {
+            // Target field is missing from source
+            // If it's non-nullable, we cannot fill it with NULL
+            if !target_field.is_nullable() {
+                return _plan_err!(
+                    "Cannot cast struct: target field '{}' is non-nullable but missing from source. \
+                     Cannot fill with NULL.",
+                    target_field.name()
+                );
+            }
+        }
+    }
+
+    // Extra fields in source are OK - they'll be ignored
+    Ok(())
+}
+
+fn validate_field_compatibility(
+    source_field: &Field,
+    target_field: &Field,
+) -> Result<()> {
+    if source_field.data_type() == &DataType::Null {
+        // Validate that target allows nulls before returning early.
+        // It is invalid to cast a NULL source field to a non-nullable target field.
+        if !target_field.is_nullable() {
+            return _plan_err!(
+                "Cannot cast NULL struct field '{}' to non-nullable field '{}'",
+                source_field.name(),
+                target_field.name()
+            );
+        }
+        return Ok(());
+    }
+
+    // Ensure nullability is compatible. It is invalid to cast a nullable
+    // source field to a non-nullable target field as this may discard
+    // null values.
+    if source_field.is_nullable() && !target_field.is_nullable() {
+        return _plan_err!(
+            "Cannot cast nullable struct field '{}' to non-nullable field",
+            target_field.name()
+        );
+    }
+
+    // Check if the matching field types are compatible
+    match (source_field.data_type(), target_field.data_type()) {
+        // Recursively validate nested structs
+        (Struct(source_nested), Struct(target_nested)) => {
+            validate_struct_compatibility(source_nested, target_nested)?;
+        }
+        // For non-struct types, use the existing castability check
+        _ => {
+            if !arrow::compute::can_cast_types(
+                source_field.data_type(),
+                target_field.data_type(),
+            ) {
+                return _plan_err!(
+                    "Cannot cast struct field '{}' from type {} to type {}",
+                    target_field.name(),
+                    source_field.data_type(),
+                    target_field.data_type()
+                );
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Check if two field lists have at least one common field by name.
+///
+/// This is useful for validating struct compatibility when casting between structs,
+/// ensuring that source and target fields have overlapping names.
+pub fn has_one_of_more_common_fields(
+    source_fields: &[FieldRef],
+    target_fields: &[FieldRef],
+) -> bool {
+    let source_names: HashSet<&str> = source_fields
+        .iter()
+        .map(|field| field.name().as_str())
+        .collect();
+    target_fields
+        .iter()
+        .any(|field| source_names.contains(field.name().as_str()))
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::{assert_contains, format::DEFAULT_CAST_OPTIONS};
+    use arrow::{
+        array::{
+            BinaryArray, Int32Array, Int32Builder, Int64Array, ListArray, MapArray,
+            MapBuilder, NullArray, StringArray, StringBuilder,
+        },
+        buffer::NullBuffer,
+        datatypes::{DataType, Field, FieldRef, Int32Type},
+    };
+    /// Macro to extract and downcast a column from a StructArray
+    macro_rules! get_column_as {
+        ($struct_array:expr, $column_name:expr, $array_type:ty) => {
+            $struct_array
+                .column_by_name($column_name)
+                .unwrap()
+                .as_any()
+                .downcast_ref::<$array_type>()
+                .unwrap()
+        };
+    }
+
+    fn field(name: &str, data_type: DataType) -> Field {
+        Field::new(name, data_type, true)
+    }
+
+    fn non_null_field(name: &str, data_type: DataType) -> Field {
+        Field::new(name, data_type, false)
+    }
+
+    fn arc_field(name: &str, data_type: DataType) -> FieldRef {
+        Arc::new(field(name, data_type))
+    }
+
+    fn struct_type(fields: Vec<Field>) -> DataType {
+        Struct(fields.into())
+    }
+
+    fn struct_field(name: &str, fields: Vec<Field>) -> Field {
+        field(name, struct_type(fields))
+    }
+
+    fn arc_struct_field(name: &str, fields: Vec<Field>) -> FieldRef {
+        Arc::new(struct_field(name, fields))
+    }
+
+    #[test]
+    fn test_cast_simple_column() {
+        let source = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
+        let target_field = field("ints", DataType::Int64);
+        let result = cast_column(&source, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
+        assert_eq!(result.len(), 3);
+        assert_eq!(result.value(0), 1);
+        assert_eq!(result.value(1), 2);
+        assert_eq!(result.value(2), 3);
+    }
+
+    #[test]
+    fn test_cast_column_with_options() {
+        let source = Arc::new(Int64Array::from(vec![1, i64::MAX])) as ArrayRef;
+        let target_field = field("ints", DataType::Int32);
+
+        let safe_opts = CastOptions {
+            // safe: false - return Err for failure
+            safe: false,
+            ..DEFAULT_CAST_OPTIONS
+        };
+        assert!(cast_column(&source, &target_field, &safe_opts).is_err());
+
+        let unsafe_opts = CastOptions {
+            // safe: true - return Null for failure
+            safe: true,
+            ..DEFAULT_CAST_OPTIONS
+        };
+        let result = cast_column(&source, &target_field, &unsafe_opts).unwrap();
+        let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(result.value(0), 1);
+        assert!(result.is_null(1));
+    }
+
+    #[test]
+    fn test_cast_struct_with_missing_field() {
+        let a_array = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef;
+        let source_struct = StructArray::from(vec![(
+            arc_field("a", DataType::Int32),
+            Arc::clone(&a_array),
+        )]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field(
+            "s",
+            vec![field("a", DataType::Int32), field("b", DataType::Utf8)],
+        );
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(struct_array.fields().len(), 2);
+        let a_result = get_column_as!(&struct_array, "a", Int32Array);
+        assert_eq!(a_result.value(0), 1);
+        assert_eq!(a_result.value(1), 2);
+
+        let b_result = get_column_as!(&struct_array, "b", StringArray);
+        assert_eq!(b_result.len(), 2);
+        assert!(b_result.is_null(0));
+        assert!(b_result.is_null(1));
+    }
+
+    #[test]
+    fn test_cast_struct_source_not_struct() {
+        let source = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef;
+        let target_field = struct_field("s", vec![field("a", DataType::Int32)]);
+
+        let result = cast_column(&source, &target_field, &DEFAULT_CAST_OPTIONS);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert!(error_msg.contains("Cannot cast column of type"));
+        assert!(error_msg.contains("to struct type"));
+        assert!(error_msg.contains("Source must be a struct"));
+    }
+
+    #[test]
+    fn test_cast_struct_incompatible_child_type() {
+        let a_array = Arc::new(BinaryArray::from(vec![
+            Some(b"a".as_ref()),
+            Some(b"b".as_ref()),
+        ])) as ArrayRef;
+        let source_struct =
+            StructArray::from(vec![(arc_field("a", DataType::Binary), a_array)]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field("s", vec![field("a", DataType::Int32)]);
+
+        let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert!(error_msg.contains("Cannot cast struct field 'a'"));
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_incompatible_types() {
+        // Source struct: {field1: Binary, field2: String}
+        let source_fields = vec![
+            arc_field("field1", DataType::Binary),
+            arc_field("field2", DataType::Utf8),
+        ];
+
+        // Target struct: {field1: Int32}
+        let target_fields = vec![arc_field("field1", DataType::Int32)];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert!(error_msg.contains("Cannot cast struct field 'field1'"));
+        assert!(error_msg.contains("Binary"));
+        assert!(error_msg.contains("Int32"));
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_compatible_types() {
+        // Source struct: {field1: Int32, field2: String}
+        let source_fields = vec![
+            arc_field("field1", DataType::Int32),
+            arc_field("field2", DataType::Utf8),
+        ];
+
+        // Target struct: {field1: Int64} (Int32 can cast to Int64)
+        let target_fields = vec![arc_field("field1", DataType::Int64)];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_missing_field_in_source() {
+        // Source struct: {field1: Int32} (missing field2)
+        let source_fields = vec![arc_field("field1", DataType::Int32)];
+
+        // Target struct: {field1: Int32, field2: Utf8}
+        let target_fields = vec![
+            arc_field("field1", DataType::Int32),
+            arc_field("field2", DataType::Utf8),
+        ];
+
+        // Should be OK - missing fields will be filled with nulls
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_additional_field_in_source() {
+        // Source struct: {field1: Int32, field2: String} (extra field2)
+        let source_fields = vec![
+            arc_field("field1", DataType::Int32),
+            arc_field("field2", DataType::Utf8),
+        ];
+
+        // Target struct: {field1: Int32}
+        let target_fields = vec![arc_field("field1", DataType::Int32)];
+
+        // Should be OK - extra fields in source are ignored
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_no_overlap_mismatch_len() {
+        let source_fields = vec![
+            arc_field("left", DataType::Int32),
+            arc_field("right", DataType::Int32),
+        ];
+        let target_fields = vec![arc_field("alpha", DataType::Int32)];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert_contains!(error_msg, "no field name overlap");
+    }
+
+    #[test]
+    fn test_cast_struct_parent_nulls_retained() {
+        let a_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef;
+        let fields = vec![arc_field("a", DataType::Int32)];
+        let nulls = Some(NullBuffer::from(vec![true, false]));
+        let source_struct = StructArray::new(fields.clone().into(), vec![a_array], nulls);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field("s", vec![field("a", DataType::Int64)]);
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(struct_array.null_count(), 1);
+        assert!(struct_array.is_valid(0));
+        assert!(struct_array.is_null(1));
+
+        let a_result = get_column_as!(&struct_array, "a", Int64Array);
+        assert_eq!(a_result.value(0), 1);
+        assert_eq!(a_result.value(1), 2);
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_nullable_to_non_nullable() {
+        // Source struct: {field1: Int32 nullable}
+        let source_fields = vec![arc_field("field1", DataType::Int32)];
+
+        // Target struct: {field1: Int32 non-nullable}
+        let target_fields = vec![Arc::new(non_null_field("field1", DataType::Int32))];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert!(error_msg.contains("field1"));
+        assert!(error_msg.contains("non-nullable"));
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_non_nullable_to_nullable() {
+        // Source struct: {field1: Int32 non-nullable}
+        let source_fields = vec![Arc::new(non_null_field("field1", DataType::Int32))];
+
+        // Target struct: {field1: Int32 nullable}
+        let target_fields = vec![arc_field("field1", DataType::Int32)];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_nested_nullable_to_non_nullable() {
+        // Source struct: {field1: {nested: Int32 nullable}}
+        let source_fields = vec![Arc::new(non_null_field(
+            "field1",
+            struct_type(vec![field("nested", DataType::Int32)]),
+        ))];
+
+        // Target struct: {field1: {nested: Int32 non-nullable}}
+        let target_fields = vec![Arc::new(non_null_field(
+            "field1",
+            struct_type(vec![non_null_field("nested", DataType::Int32)]),
+        ))];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert!(error_msg.contains("nested"));
+        assert!(error_msg.contains("non-nullable"));
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_by_name() {
+        // Source struct: {field1: Int32, field2: String}
+        let source_fields = vec![
+            arc_field("field1", DataType::Int32),
+            arc_field("field2", DataType::Utf8),
+        ];
+
+        // Target struct: {field2: String, field1: Int64}
+        let target_fields = vec![
+            arc_field("field2", DataType::Utf8),
+            arc_field("field1", DataType::Int64),
+        ];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_by_name_with_type_mismatch() {
+        // Source struct: {field1: Binary}
+        let source_fields = vec![arc_field("field1", DataType::Binary)];
+
+        // Target struct: {field1: Int32} (incompatible type)
+        let target_fields = vec![arc_field("field1", DataType::Int32)];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert_contains!(
+            error_msg,
+            "Cannot cast struct field 'field1' from type Binary to type Int32"
+        );
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_no_overlap_equal_len() {
+        let source_fields = vec![
+            arc_field("left", DataType::Int32),
+            arc_field("right", DataType::Utf8),
+        ];
+
+        let target_fields = vec![
+            arc_field("alpha", DataType::Int32),
+            arc_field("beta", DataType::Utf8),
+        ];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert_contains!(error_msg, "no field name overlap");
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_mixed_name_overlap() {
+        // Source struct: {a: Int32, b: String, extra: Boolean}
+        let source_fields = vec![
+            arc_field("a", DataType::Int32),
+            arc_field("b", DataType::Utf8),
+            arc_field("extra", DataType::Boolean),
+        ];
+
+        // Target struct: {b: String, a: Int64, c: Float32}
+        // Name overlap with a and b, missing c (nullable)
+        let target_fields = vec![
+            arc_field("b", DataType::Utf8),
+            arc_field("a", DataType::Int64),
+            arc_field("c", DataType::Float32),
+        ];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_by_name_missing_required_field() {
+        // Source struct: {field1: Int32} (missing field2)
+        let source_fields = vec![arc_field("field1", DataType::Int32)];
+
+        // Target struct: {field1: Int32, field2: Int32 non-nullable}
+        let target_fields = vec![
+            arc_field("field1", DataType::Int32),
+            Arc::new(non_null_field("field2", DataType::Int32)),
+        ];
+
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert_contains!(
+            error_msg,
+            "Cannot cast struct: target field 'field2' is non-nullable but missing from source. Cannot fill with NULL."
+        );
+    }
+
+    #[test]
+    fn test_validate_struct_compatibility_partial_name_overlap_with_count_mismatch() {
+        // Source struct: {a: Int32} (only one field)
+        let source_fields = vec![arc_field("a", DataType::Int32)];
+
+        // Target struct: {a: Int32, b: String} (two fields, but 'a' overlaps)
+        let target_fields = vec![
+            arc_field("a", DataType::Int32),
+            arc_field("b", DataType::Utf8),
+        ];
+
+        // This should succeed - partial overlap means by-name mapping
+        // and missing field 'b' is nullable
+        let result = validate_struct_compatibility(&source_fields, &target_fields);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_cast_nested_struct_with_extra_and_missing_fields() {
+        // Source inner struct has fields a, b, extra
+        let a = Arc::new(Int32Array::from(vec![Some(1), None])) as ArrayRef;
+        let b = Arc::new(Int32Array::from(vec![Some(2), Some(3)])) as ArrayRef;
+        let extra = Arc::new(Int32Array::from(vec![Some(9), Some(10)])) as ArrayRef;
+
+        let inner = StructArray::from(vec![
+            (arc_field("a", DataType::Int32), a),
+            (arc_field("b", DataType::Int32), b),
+            (arc_field("extra", DataType::Int32), extra),
+        ]);
+
+        let source_struct = StructArray::from(vec![(
+            arc_struct_field(
+                "inner",
+                vec![
+                    field("a", DataType::Int32),
+                    field("b", DataType::Int32),
+                    field("extra", DataType::Int32),
+                ],
+            ),
+            Arc::new(inner) as ArrayRef,
+        )]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        // Target inner struct reorders fields, adds "missing", and drops "extra"
+        let target_field = struct_field(
+            "outer",
+            vec![struct_field(
+                "inner",
+                vec![
+                    field("b", DataType::Int64),
+                    field("a", DataType::Int32),
+                    field("missing", DataType::Int32),
+                ],
+            )],
+        );
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let outer = result.as_any().downcast_ref::<StructArray>().unwrap();
+        let inner = get_column_as!(&outer, "inner", StructArray);
+        assert_eq!(inner.fields().len(), 3);
+
+        let b = get_column_as!(inner, "b", Int64Array);
+        assert_eq!(b.value(0), 2);
+        assert_eq!(b.value(1), 3);
+        assert!(!b.is_null(0));
+        assert!(!b.is_null(1));
+
+        let a = get_column_as!(inner, "a", Int32Array);
+        assert_eq!(a.value(0), 1);
+        assert!(a.is_null(1));
+
+        let missing = get_column_as!(inner, "missing", Int32Array);
+        assert!(missing.is_null(0));
+        assert!(missing.is_null(1));
+    }
+
+    #[test]
+    fn test_cast_null_struct_field_to_nested_struct() {
+        let null_inner = Arc::new(NullArray::new(2)) as ArrayRef;
+        let source_struct = StructArray::from(vec![(
+            arc_field("inner", DataType::Null),
+            Arc::clone(&null_inner),
+        )]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field(
+            "outer",
+            vec![struct_field("inner", vec![field("a", DataType::Int32)])],
+        );
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let outer = result.as_any().downcast_ref::<StructArray>().unwrap();
+        let inner = get_column_as!(&outer, "inner", StructArray);
+        assert_eq!(inner.len(), 2);
+        assert!(inner.is_null(0));
+        assert!(inner.is_null(1));
+
+        let inner_a = get_column_as!(inner, "a", Int32Array);
+        assert!(inner_a.is_null(0));
+        assert!(inner_a.is_null(1));
+    }
+
+    #[test]
+    fn test_cast_struct_with_array_and_map_fields() {
+        // Array field with second row null
+        let arr_array = Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            None,
+        ])) as ArrayRef;
+
+        // Map field with second row null
+        let string_builder = StringBuilder::new();
+        let int_builder = Int32Builder::new();
+        let mut map_builder = MapBuilder::new(None, string_builder, int_builder);
+        map_builder.keys().append_value("a");
+        map_builder.values().append_value(1);
+        map_builder.append(true).unwrap();
+        map_builder.append(false).unwrap();
+        let map_array = Arc::new(map_builder.finish()) as ArrayRef;
+
+        let source_struct = StructArray::from(vec![
+            (
+                arc_field(
+                    "arr",
+                    DataType::List(Arc::new(field("item", DataType::Int32))),
+                ),
+                arr_array,
+            ),
+            (
+                arc_field(
+                    "map",
+                    DataType::Map(
+                        Arc::new(non_null_field(
+                            "entries",
+                            struct_type(vec![
+                                non_null_field("keys", DataType::Utf8),
+                                field("values", DataType::Int32),
+                            ]),
+                        )),
+                        false,
+                    ),
+                ),
+                map_array,
+            ),
+        ]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field(
+            "s",
+            vec![
+                field(
+                    "arr",
+                    DataType::List(Arc::new(field("item", DataType::Int32))),
+                ),
+                field(
+                    "map",
+                    DataType::Map(
+                        Arc::new(non_null_field(
+                            "entries",
+                            struct_type(vec![
+                                non_null_field("keys", DataType::Utf8),
+                                field("values", DataType::Int32),
+                            ]),
+                        )),
+                        false,
+                    ),
+                ),
+            ],
+        );
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+        let arr = get_column_as!(&struct_array, "arr", ListArray);
+        assert!(!arr.is_null(0));
+        assert!(arr.is_null(1));
+        let arr0 = arr.value(0);
+        let values = arr0.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(values.value(0), 1);
+        assert_eq!(values.value(1), 2);
+
+        let map = get_column_as!(&struct_array, "map", MapArray);
+        assert!(!map.is_null(0));
+        assert!(map.is_null(1));
+        let map0 = map.value(0);
+        let entries = map0.as_any().downcast_ref::<StructArray>().unwrap();
+        let keys = get_column_as!(entries, "keys", StringArray);
+        let vals = get_column_as!(entries, "values", Int32Array);
+        assert_eq!(keys.value(0), "a");
+        assert_eq!(vals.value(0), 1);
+    }
+
+    #[test]
+    fn test_cast_struct_field_order_differs() {
+        let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef;
+        let b = Arc::new(Int32Array::from(vec![Some(3), None])) as ArrayRef;
+
+        let source_struct = StructArray::from(vec![
+            (arc_field("a", DataType::Int32), a),
+            (arc_field("b", DataType::Int32), b),
+        ]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field(
+            "s",
+            vec![field("b", DataType::Int64), field("a", DataType::Int32)],
+        );
+
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+        let b_col = get_column_as!(&struct_array, "b", Int64Array);
+        assert_eq!(b_col.value(0), 3);
+        assert!(b_col.is_null(1));
+
+        let a_col = get_column_as!(&struct_array, "a", Int32Array);
+        assert_eq!(a_col.value(0), 1);
+        assert_eq!(a_col.value(1), 2);
+    }
+
+    #[test]
+    fn test_cast_struct_no_overlap_rejected() {
+        let first = Arc::new(Int32Array::from(vec![Some(10), Some(20)])) as ArrayRef;
+        let second =
+            Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")])) as ArrayRef;
+
+        let source_struct = StructArray::from(vec![
+            (arc_field("left", DataType::Int32), first),
+            (arc_field("right", DataType::Utf8), second),
+        ]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        let target_field = struct_field(
+            "s",
+            vec![field("a", DataType::Int64), field("b", DataType::Utf8)],
+        );
+
+        let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS);
+        assert!(result.is_err());
+        let error_msg = result.unwrap_err().to_string();
+        assert_contains!(error_msg, "no field name overlap");
+    }
+
+    #[test]
+    fn test_cast_struct_missing_non_nullable_field_fails() {
+        // Source has only field 'a'
+        let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef;
+        let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        // Target has fields 'a' (nullable) and 'b' (non-nullable)
+        let target_field = struct_field(
+            "s",
+            vec![
+                field("a", DataType::Int32),
+                non_null_field("b", DataType::Int32),
+            ],
+        );
+
+        // Should fail because 'b' is non-nullable but missing from source
+        let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("target field 'b' is non-nullable but missing from source"),
+            "Unexpected error: {err}"
+        );
+    }
+
+    #[test]
+    fn test_cast_struct_missing_nullable_field_succeeds() {
+        // Source has only field 'a'
+        let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef;
+        let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]);
+        let source_col = Arc::new(source_struct) as ArrayRef;
+
+        // Target has fields 'a' and 'b' (both nullable)
+        let target_field = struct_field(
+            "s",
+            vec![field("a", DataType::Int32), field("b", DataType::Int32)],
+        );
+
+        // Should succeed - 'b' is nullable so can be filled with NULL
+        let result =
+            cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
+        let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+        let a_col = get_column_as!(&struct_array, "a", Int32Array);
+        assert_eq!(a_col.value(0), 1);
+        assert_eq!(a_col.value(1), 2);
+
+        let b_col = get_column_as!(&struct_array, "b", Int32Array);
+        assert!(b_col.is_null(0));
+        assert!(b_col.is_null(1));
+    }
+}
diff --git a/datafusion/common/src/null_equality.rs b/datafusion/common/src/null_equality.rs
new file mode 100644
index 0000000000000..847fb0975703e
--- /dev/null
+++ b/datafusion/common/src/null_equality.rs
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Represents the behavior for null values when evaluating equality. Currently, its primary use
+/// case is to define the behavior of joins for null values.
+///
+/// # Examples
+///
+/// The following table shows the expected equality behavior for `NullEquality`.
+///
+/// | A    | B    | NullEqualsNothing | NullEqualsNull |
+/// |------|------|-------------------|----------------|
+/// | NULL | NULL | false             | true           |
+/// | NULL | 'b'  | false             | false          |
+/// | 'a'  | NULL | false             | false          |
+/// | 'a'  | 'b'  | false             | false          |
+///
+/// # Order
+///
+/// The order on this type represents the "restrictiveness" of the behavior. The more restrictive
+/// a behavior is, the fewer elements are considered to be equal to null.
+/// [NullEquality::NullEqualsNothing] represents the most restrictive behavior.
+///
+/// This mirrors the old order with `null_equals_null` booleans, as `false` indicated that
+/// `null != null`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Hash)]
+pub enum NullEquality {
+    /// Null is *not* equal to anything (`null != null`)
+    NullEqualsNothing,
+    /// Null is equal to null (`null == null`)
+    NullEqualsNull,
+}
diff --git a/datafusion/common/src/param_value.rs b/datafusion/common/src/param_value.rs
index d2802c096da1b..0fac6b529eb0f 100644
--- a/datafusion/common/src/param_value.rs
+++ b/datafusion/common/src/param_value.rs
@@ -16,22 +16,37 @@
 // under the License.
 
 use crate::error::{_plan_datafusion_err, _plan_err};
+use crate::metadata::{ScalarAndMetadata, check_metadata_with_storage_equal};
 use crate::{Result, ScalarValue};
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field, FieldRef};
 use std::collections::HashMap;
 
 /// The parameter value corresponding to the placeholder
 #[derive(Debug, Clone)]
 pub enum ParamValues {
     /// For positional query parameters, like `SELECT * FROM test WHERE a > $1 AND b = $2`
-    List(Vec<ScalarValue>),
+    List(Vec<ScalarAndMetadata>),
     /// For named query parameters, like `SELECT * FROM test WHERE a > $foo AND b = $goo`
-    Map(HashMap<String, ScalarValue>),
+    Map(HashMap<String, ScalarAndMetadata>),
 }
 
 impl ParamValues {
-    /// Verify parameter list length and type
+    /// Verify parameter list length and DataType
+    ///
+    /// Use [`ParamValues::verify_fields`] to ensure field metadata is considered when
+    /// computing type equality.
+    #[deprecated(since = "51.0.0", note = "Use verify_fields instead")]
     pub fn verify(&self, expect: &[DataType]) -> Result<()> {
+        // make dummy Fields
+        let expect = expect
+            .iter()
+            .map(|dt| Field::new("", dt.clone(), true).into())
+            .collect::<Vec<_>>();
+        self.verify_fields(&expect)
+    }
+
+    /// Verify parameter list length and type
+    pub fn verify_fields(&self, expect: &[FieldRef]) -> Result<()> {
         match self {
             ParamValues::List(list) => {
                 // Verify if the number of params matches the number of values
@@ -45,15 +60,16 @@ impl ParamValues {
 
                 // Verify if the types of the params matches the types of the values
                 let iter = expect.iter().zip(list.iter());
-                for (i, (param_type, value)) in iter.enumerate() {
-                    if *param_type != value.data_type() {
-                        return _plan_err!(
-                            "Expected parameter of type {:?}, got {:?} at index {}",
-                            param_type,
-                            value.data_type(),
-                            i
-                        );
-                    }
+                for (i, (param_type, lit)) in iter.enumerate() {
+                    check_metadata_with_storage_equal(
+                        (
+                            &lit.value.data_type(),
+                            lit.metadata.as_ref().map(|m| m.to_hashmap()).as_ref(),
+                        ),
+                        (param_type.data_type(), Some(param_type.metadata())),
+                        "parameter",
+                        &format!(" at index {i}"),
+                    )?;
                 }
                 Ok(())
             }
@@ -65,7 +81,7 @@ impl ParamValues {
         }
     }
 
-    pub fn get_placeholders_with_values(&self, id: &str) -> Result<ScalarValue> {
+    pub fn get_placeholders_with_values(&self, id: &str) -> Result<ScalarAndMetadata> {
         match self {
             ParamValues::List(list) => {
                 if id.is_empty() {
@@ -99,7 +115,7 @@ impl ParamValues {
 
 impl From<Vec<ScalarValue>> for ParamValues {
     fn from(value: Vec<ScalarValue>) -> Self {
-        Self::List(value)
+        Self::List(value.into_iter().map(ScalarAndMetadata::from).collect())
     }
 }
 
@@ -108,8 +124,10 @@ where
     K: Into<String>,
 {
     fn from(value: Vec<(K, ScalarValue)>) -> Self {
-        let value: HashMap<String, ScalarValue> =
-            value.into_iter().map(|(k, v)| (k.into(), v)).collect();
+        let value: HashMap<String, ScalarAndMetadata> = value
+            .into_iter()
+            .map(|(k, v)| (k.into(), ScalarAndMetadata::from(v)))
+            .collect();
         Self::Map(value)
     }
 }
@@ -119,8 +137,10 @@ where
     K: Into<String>,
 {
     fn from(value: HashMap<K, ScalarValue>) -> Self {
-        let value: HashMap<String, ScalarValue> =
-            value.into_iter().map(|(k, v)| (k.into(), v)).collect();
+        let value: HashMap<String, ScalarAndMetadata> = value
+            .into_iter()
+            .map(|(k, v)| (k.into(), ScalarAndMetadata::from(v)))
+            .collect();
         Self::Map(value)
     }
 }
diff --git a/datafusion/common/src/parquet_config.rs b/datafusion/common/src/parquet_config.rs
new file mode 100644
index 0000000000000..9d6d7a88566a7
--- /dev/null
+++ b/datafusion/common/src/parquet_config.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::{self, Display};
+use std::str::FromStr;
+
+use crate::config::{ConfigField, Visit};
+use crate::error::{DataFusionError, Result};
+
+/// Parquet writer version options for controlling the Parquet file format version
+///
+/// This enum validates parquet writer version values at configuration time,
+/// ensuring only valid versions ("1.0" or "2.0") can be set via `SET` commands
+/// or proto deserialization.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum DFParquetWriterVersion {
+    /// Parquet format version 1.0
+    #[default]
+    V1_0,
+    /// Parquet format version 2.0
+    V2_0,
+}
+
+/// Implement parsing strings to `DFParquetWriterVersion`
+impl FromStr for DFParquetWriterVersion {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "1.0" => Ok(DFParquetWriterVersion::V1_0),
+            "2.0" => Ok(DFParquetWriterVersion::V2_0),
+            other => Err(DataFusionError::Configuration(format!(
+                "Invalid parquet writer version: {other}. Expected one of: 1.0, 2.0"
+            ))),
+        }
+    }
+}
+
+impl Display for DFParquetWriterVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            DFParquetWriterVersion::V1_0 => "1.0",
+            DFParquetWriterVersion::V2_0 => "2.0",
+        };
+        write!(f, "{s}")
+    }
+}
+
+impl ConfigField for DFParquetWriterVersion {
+    fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
+        v.some(key, self, description)
+    }
+
+    fn set(&mut self, _: &str, value: &str) -> Result<()> {
+        *self = DFParquetWriterVersion::from_str(value)?;
+        Ok(())
+    }
+}
+
+/// Convert `DFParquetWriterVersion` to parquet crate's `WriterVersion`
+///
+/// This conversion is infallible since `DFParquetWriterVersion` only contains
+/// valid values that have been validated at configuration time.
+#[cfg(feature = "parquet")]
+impl From<DFParquetWriterVersion> for parquet::file::properties::WriterVersion {
+    fn from(value: DFParquetWriterVersion) -> Self {
+        match value {
+            DFParquetWriterVersion::V1_0 => {
+                parquet::file::properties::WriterVersion::PARQUET_1_0
+            }
+            DFParquetWriterVersion::V2_0 => {
+                parquet::file::properties::WriterVersion::PARQUET_2_0
+            }
+        }
+    }
+}
+
+/// Convert parquet crate's `WriterVersion` to `DFParquetWriterVersion`
+///
+/// This is used when converting from existing parquet writer properties,
+/// such as when reading from proto or test code.
+#[cfg(feature = "parquet")]
+impl From<parquet::file::properties::WriterVersion> for DFParquetWriterVersion {
+    fn from(version: parquet::file::properties::WriterVersion) -> Self {
+        match version {
+            parquet::file::properties::WriterVersion::PARQUET_1_0 => {
+                DFParquetWriterVersion::V1_0
+            }
+            parquet::file::properties::WriterVersion::PARQUET_2_0 => {
+                DFParquetWriterVersion::V2_0
+            }
+        }
+    }
+}
diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs
index 41571ebb8576c..cd3d607dacd88 100644
--- a/datafusion/common/src/parsers.rs
+++ b/datafusion/common/src/parsers.rs
@@ -20,7 +20,7 @@
 use std::fmt::Display;
 use std::str::FromStr;
 
-use sqlparser::parser::ParserError;
+use crate::DataFusionError;
 
 /// Readable file compression type
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -38,9 +38,9 @@ pub enum CompressionTypeVariant {
 }
 
 impl FromStr for CompressionTypeVariant {
-    type Err = ParserError;
+    type Err = DataFusionError;
 
-    fn from_str(s: &str) -> Result<Self, ParserError> {
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
         let s = s.to_uppercase();
         match s.as_str() {
             "GZIP" | "GZ" => Ok(Self::GZIP),
@@ -48,7 +48,7 @@ impl FromStr for CompressionTypeVariant {
             "XZ" => Ok(Self::XZ),
             "ZST" | "ZSTD" => Ok(Self::ZSTD),
             "" | "UNCOMPRESSED" => Ok(Self::UNCOMPRESSED),
-            _ => Err(ParserError::ParserError(format!(
+            _ => Err(DataFusionError::NotImplemented(format!(
                 "Unsupported file compression type {s}"
             ))),
         }
diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs
index 48750e3c995c4..5a7598ea1f299 100644
--- a/datafusion/common/src/pruning.rs
+++ b/datafusion/common/src/pruning.rs
@@ -135,6 +135,10 @@ pub trait PruningStatistics {
 /// This feeds into [`CompositePruningStatistics`] to allow pruning
 /// with filters that depend both on partition columns and data columns
 /// (e.g. `WHERE partition_col = data_col`).
+#[deprecated(
+    since = "52.0.0",
+    note = "This struct is no longer used internally. Use `replace_columns_with_literals` from `datafusion-physical-expr-adapter` to substitute partition column values before pruning. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first."
+)]
 #[derive(Clone)]
 pub struct PartitionPruningStatistics {
     /// Values for each column for each container.
@@ -156,6 +160,7 @@ pub struct PartitionPruningStatistics {
     partition_schema: SchemaRef,
 }
 
+#[expect(deprecated)]
 impl PartitionPruningStatistics {
     /// Create a new instance of [`PartitionPruningStatistics`].
     ///
@@ -169,6 +174,36 @@ impl PartitionPruningStatistics {
     ///   This must **not** be the schema of the entire file or table:
     ///   instead it must only be the schema of the partition columns,
     ///   in the same order as the values in `partition_values`.
+    ///
+    /// # Example
+    ///
+    /// To create [`PartitionPruningStatistics`] for two partition columns `a` and `b`,
+    /// for three containers like this:
+    ///
+    /// | a | b |
+    /// | - | - |
+    /// | 1 | 2 |
+    /// | 3 | 4 |
+    /// | 5 | 6 |
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_common::ScalarValue;
+    /// # use arrow::datatypes::{DataType, Field};
+    /// # use datafusion_common::pruning::PartitionPruningStatistics;
+    ///
+    /// let partition_values = vec![
+    ///     vec![ScalarValue::from(1i32), ScalarValue::from(2i32)],
+    ///     vec![ScalarValue::from(3i32), ScalarValue::from(4i32)],
+    ///     vec![ScalarValue::from(5i32), ScalarValue::from(6i32)],
+    /// ];
+    /// let partition_fields = vec![
+    ///     Arc::new(Field::new("a", DataType::Int32, false)),
+    ///     Arc::new(Field::new("b", DataType::Int32, false)),
+    /// ];
+    /// let partition_stats =
+    ///     PartitionPruningStatistics::try_new(partition_values, partition_fields).unwrap();
+    /// ```
     pub fn try_new(
         partition_values: Vec<Vec<ScalarValue>>,
         partition_fields: Vec<FieldRef>,
@@ -202,6 +237,7 @@ impl PartitionPruningStatistics {
     }
 }
 
+#[expect(deprecated)]
 impl PruningStatistics for PartitionPruningStatistics {
     fn min_values(&self, column: &Column) -> Option<ArrayRef> {
         let index = self.partition_schema.index_of(column.name()).ok()?;
@@ -245,7 +281,7 @@ impl PruningStatistics for PartitionPruningStatistics {
             match acc {
                 None => Some(Some(eq_result)),
                 Some(acc_array) => {
-                    arrow::compute::kernels::boolean::and(&acc_array, &eq_result)
+                    arrow::compute::kernels::boolean::or_kleene(&acc_array, &eq_result)
                         .map(Some)
                         .ok()
                 }
@@ -409,10 +445,15 @@ impl PruningStatistics for PrunableStatistics {
 /// the first one is returned without any regard for completeness or accuracy.
 /// That is: if the first statistics has information for a column, even if it is incomplete,
 /// that is returned even if a later statistics has more complete information.
+#[deprecated(
+    since = "52.0.0",
+    note = "This struct is no longer used internally. It may be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first. Please open an issue if you have a use case for it."
+)]
 pub struct CompositePruningStatistics {
     pub statistics: Vec<Box<dyn PruningStatistics>>,
 }
 
+#[expect(deprecated)]
 impl CompositePruningStatistics {
     /// Create a new instance of [`CompositePruningStatistics`] from
     /// a vector of [`PruningStatistics`].
@@ -427,6 +468,7 @@ impl CompositePruningStatistics {
     }
 }
 
+#[expect(deprecated)]
 impl PruningStatistics for CompositePruningStatistics {
     fn min_values(&self, column: &Column) -> Option<ArrayRef> {
         for stats in &self.statistics {
@@ -483,18 +525,25 @@ impl PruningStatistics for CompositePruningStatistics {
 }
 
 #[cfg(test)]
+#[expect(deprecated)]
 mod tests {
     use crate::{
-        cast::{as_int32_array, as_uint64_array},
         ColumnStatistics,
+        cast::{as_int32_array, as_uint64_array},
     };
 
     use super::*;
     use arrow::datatypes::{DataType, Field};
     use std::sync::Arc;
 
-    #[test]
-    fn test_partition_pruning_statistics() {
+    /// return a PartitionPruningStatistics for two columns 'a' and 'b'
+    /// and the following stats
+    ///
+    /// | a | b |
+    /// | - | - |
+    /// | 1 | 2 |
+    /// | 3 | 4 |
+    fn partition_pruning_statistics_setup() -> PartitionPruningStatistics {
         let partition_values = vec![
             vec![ScalarValue::from(1i32), ScalarValue::from(2i32)],
             vec![ScalarValue::from(3i32), ScalarValue::from(4i32)],
@@ -503,9 +552,12 @@ mod tests {
             Arc::new(Field::new("a", DataType::Int32, false)),
             Arc::new(Field::new("b", DataType::Int32, false)),
         ];
-        let partition_stats =
-            PartitionPruningStatistics::try_new(partition_values, partition_fields)
-                .unwrap();
+        PartitionPruningStatistics::try_new(partition_values, partition_fields).unwrap()
+    }
+
+    #[test]
+    fn test_partition_pruning_statistics() {
+        let partition_stats = partition_pruning_statistics_setup();
 
         let column_a = Column::new_unqualified("a");
         let column_b = Column::new_unqualified("b");
@@ -560,6 +612,85 @@ mod tests {
         assert_eq!(partition_stats.num_containers(), 2);
     }
 
+    #[test]
+    fn test_partition_pruning_statistics_multiple_positive_values() {
+        let partition_stats = partition_pruning_statistics_setup();
+
+        let column_a = Column::new_unqualified("a");
+
+        // The two containers have `a` values 1 and 3, so they both only contain values from 1 and 3
+        let values = HashSet::from([ScalarValue::from(1i32), ScalarValue::from(3i32)]);
+        let contained_a = partition_stats.contained(&column_a, &values).unwrap();
+        let expected_contained_a = BooleanArray::from(vec![true, true]);
+        assert_eq!(contained_a, expected_contained_a);
+    }
+
+    #[test]
+    fn test_partition_pruning_statistics_multiple_negative_values() {
+        let partition_stats = partition_pruning_statistics_setup();
+
+        let column_a = Column::new_unqualified("a");
+
+        // The two containers have `a` values 1 and 3,
+        // so the first contains ONLY values from 1,2
+        // but the second does not
+        let values = HashSet::from([ScalarValue::from(1i32), ScalarValue::from(2i32)]);
+        let contained_a = partition_stats.contained(&column_a, &values).unwrap();
+        let expected_contained_a = BooleanArray::from(vec![true, false]);
+        assert_eq!(contained_a, expected_contained_a);
+    }
+
+    #[test]
+    fn test_partition_pruning_statistics_null_in_values() {
+        let partition_values = vec![
+            vec![
+                ScalarValue::from(1i32),
+                ScalarValue::from(2i32),
+                ScalarValue::from(3i32),
+            ],
+            vec![
+                ScalarValue::from(4i32),
+                ScalarValue::from(5i32),
+                ScalarValue::from(6i32),
+            ],
+        ];
+        let partition_fields = vec![
+            Arc::new(Field::new("a", DataType::Int32, false)),
+            Arc::new(Field::new("b", DataType::Int32, false)),
+            Arc::new(Field::new("c", DataType::Int32, false)),
+        ];
+        let partition_stats =
+            PartitionPruningStatistics::try_new(partition_values, partition_fields)
+                .unwrap();
+
+        let column_a = Column::new_unqualified("a");
+        let column_b = Column::new_unqualified("b");
+        let column_c = Column::new_unqualified("c");
+
+        let values_a = HashSet::from([ScalarValue::from(1i32), ScalarValue::Int32(None)]);
+        let contained_a = partition_stats.contained(&column_a, &values_a).unwrap();
+        let mut builder = BooleanArray::builder(2);
+        builder.append_value(true);
+        builder.append_null();
+        let expected_contained_a = builder.finish();
+        assert_eq!(contained_a, expected_contained_a);
+
+        // First match creates a NULL boolean array
+        // The accumulator should update the value to true for the second value
+        let values_b = HashSet::from([ScalarValue::Int32(None), ScalarValue::from(5i32)]);
+        let contained_b = partition_stats.contained(&column_b, &values_b).unwrap();
+        let mut builder = BooleanArray::builder(2);
+        builder.append_null();
+        builder.append_value(true);
+        let expected_contained_b = builder.finish();
+        assert_eq!(contained_b, expected_contained_b);
+
+        // All matches are null, contained should return None
+        let values_c = HashSet::from([ScalarValue::Int32(None)]);
+        let contained_c = partition_stats.contained(&column_c, &values_c);
+        assert!(contained_c.is_none());
+    }
+
     #[test]
     fn test_partition_pruning_statistics_empty() {
         let partition_values = vec![];
diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs
deleted file mode 100644
index ff413e08ab076..0000000000000
--- a/datafusion/common/src/pyarrow.rs
+++ /dev/null
@@ -1,171 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Conversions between PyArrow and DataFusion types
-
-use arrow::array::{Array, ArrayData};
-use arrow::pyarrow::{FromPyArrow, ToPyArrow};
-use pyo3::exceptions::PyException;
-use pyo3::prelude::PyErr;
-use pyo3::types::{PyAnyMethods, PyList};
-use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python};
-
-use crate::{DataFusionError, ScalarValue};
-
-impl From<DataFusionError> for PyErr {
-    fn from(err: DataFusionError) -> PyErr {
-        PyException::new_err(err.to_string())
-    }
-}
-
-impl FromPyArrow for ScalarValue {
-    fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let py = value.py();
-        let typ = value.getattr("type")?;
-        let val = value.call_method0("as_py")?;
-
-        // construct pyarrow array from the python value and pyarrow type
-        let factory = py.import("pyarrow")?.getattr("array")?;
-        let args = PyList::new(py, [val])?;
-        let array = factory.call1((args, typ))?;
-
-        // convert the pyarrow array to rust array using C data interface
-        let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?);
-        let scalar = ScalarValue::try_from_array(&array, 0)?;
-
-        Ok(scalar)
-    }
-}
-
-impl ToPyArrow for ScalarValue {
-    fn to_pyarrow(&self, py: Python) -> PyResult<PyObject> {
-        let array = self.to_array()?;
-        // convert to pyarrow array using C data interface
-        let pyarray = array.to_data().to_pyarrow(py)?;
-        let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?;
-
-        Ok(pyscalar)
-    }
-}
-
-impl<'source> FromPyObject<'source> for ScalarValue {
-    fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult<Self> {
-        Self::from_pyarrow_bound(value)
-    }
-}
-
-impl<'source> IntoPyObject<'source> for ScalarValue {
-    type Target = PyAny;
-
-    type Output = Bound<'source, Self::Target>;
-
-    type Error = PyErr;
-
-    fn into_pyobject(self, py: Python<'source>) -> Result<Self::Output, Self::Error> {
-        let array = self.to_array()?;
-        // convert to pyarrow array using C data interface
-        let pyarray = array.to_data().to_pyarrow(py)?;
-        let pyarray_bound = pyarray.bind(py);
-        pyarray_bound.call_method1("__getitem__", (0,))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pyo3::ffi::c_str;
-    use pyo3::prepare_freethreaded_python;
-    use pyo3::py_run;
-    use pyo3::types::PyDict;
-
-    use super::*;
-
-    fn init_python() {
-        prepare_freethreaded_python();
-        Python::with_gil(|py| {
-            if py.run(c_str!("import pyarrow"), None, None).is_err() {
-                let locals = PyDict::new(py);
-                py.run(
-                    c_str!(
-                        "import sys; executable = sys.executable; python_path = sys.path"
-                    ),
-                    None,
-                    Some(&locals),
-                )
-                .expect("Couldn't get python info");
-                let executable = locals.get_item("executable").unwrap();
-                let executable: String = executable.extract().unwrap();
-
-                let python_path = locals.get_item("python_path").unwrap();
-                let python_path: Vec<String> = python_path.extract().unwrap();
-
-                panic!("pyarrow not found\nExecutable: {executable}\nPython path: {python_path:?}\n\
-                         HINT: try `pip install pyarrow`\n\
-                         NOTE: On Mac OS, you must compile against a Framework Python \
-                         (default in python.org installers and brew, but not pyenv)\n\
-                         NOTE: On Mac OS, PYO3 might point to incorrect Python library \
-                         path when using virtual environments. Try \
-                         `export PYTHONPATH=$(python -c \"import sys; print(sys.path[-1])\")`\n")
-            }
-        })
-    }
-
-    #[test]
-    fn test_roundtrip() {
-        init_python();
-
-        let example_scalars = vec![
-            ScalarValue::Boolean(Some(true)),
-            ScalarValue::Int32(Some(23)),
-            ScalarValue::Float64(Some(12.34)),
-            ScalarValue::from("Hello!"),
-            ScalarValue::Date32(Some(1234)),
-        ];
-
-        Python::with_gil(|py| {
-            for scalar in example_scalars.iter() {
-                let result = ScalarValue::from_pyarrow_bound(
-                    scalar.to_pyarrow(py).unwrap().bind(py),
-                )
-                .unwrap();
-                assert_eq!(scalar, &result);
-            }
-        });
-    }
-
-    #[test]
-    fn test_py_scalar() -> PyResult<()> {
-        init_python();
-
-        Python::with_gil(|py| -> PyResult<()> {
-            let scalar_float = ScalarValue::Float64(Some(12.34));
-            let py_float = scalar_float
-                .into_pyobject(py)?
-                .call_method0("as_py")
-                .unwrap();
-            py_run!(py, py_float, "assert py_float == 12.34");
-
-            let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string()));
-            let py_string = scalar_string
-                .into_pyobject(py)?
-                .call_method0("as_py")
-                .unwrap();
-            py_run!(py, py_string, "assert py_string == 'Hello!'");
-
-            Ok(())
-        })
-    }
-}
diff --git a/datafusion/common/src/rounding.rs b/datafusion/common/src/rounding.rs
index 413067ecd61ed..1796143d7cf1a 100644
--- a/datafusion/common/src/rounding.rs
+++ b/datafusion/common/src/rounding.rs
@@ -47,7 +47,7 @@ extern crate libc;
     any(target_arch = "x86_64", target_arch = "aarch64"),
     not(target_os = "windows")
 ))]
-extern "C" {
+unsafe extern "C" {
     fn fesetround(round: i32);
     fn fegetround() -> i32;
 }
@@ -77,6 +77,7 @@ pub trait FloatBits {
 
     /// The integer value 0, used in bitwise operations.
     const ZERO: Self::Item;
+    const NEG_ZERO: Self::Item;
 
     /// Converts the floating-point value to its bitwise representation.
     fn to_bits(self) -> Self::Item;
@@ -101,6 +102,7 @@ impl FloatBits for f32 {
     const CLEAR_SIGN_MASK: u32 = 0x7fff_ffff;
     const ONE: Self::Item = 1;
     const ZERO: Self::Item = 0;
+    const NEG_ZERO: Self::Item = 0x8000_0000;
 
     fn to_bits(self) -> Self::Item {
         self.to_bits()
@@ -130,6 +132,7 @@ impl FloatBits for f64 {
     const CLEAR_SIGN_MASK: u64 = 0x7fff_ffff_ffff_ffff;
     const ONE: Self::Item = 1;
     const ZERO: Self::Item = 0;
+    const NEG_ZERO: Self::Item = 0x8000_0000_0000_0000;
 
     fn to_bits(self) -> Self::Item {
         self.to_bits()
@@ -175,8 +178,10 @@ pub fn next_up<F: FloatBits + Copy>(float: F) -> F {
     }
 
     let abs = bits & F::CLEAR_SIGN_MASK;
-    let next_bits = if abs == F::ZERO {
+    let next_bits = if bits == F::ZERO {
         F::TINY_BITS
+    } else if abs == F::ZERO {
+        F::ZERO
     } else if bits == abs {
         bits + F::ONE
     } else {
@@ -206,8 +211,11 @@ pub fn next_down<F: FloatBits + Copy>(float: F) -> F {
     if float.float_is_nan() || bits == F::neg_infinity().to_bits() {
         return float;
     }
+
     let abs = bits & F::CLEAR_SIGN_MASK;
-    let next_bits = if abs == F::ZERO {
+    let next_bits = if bits == F::ZERO {
+        F::NEG_ZERO
+    } else if abs == F::ZERO {
         F::NEG_TINY_BITS
     } else if bits == abs {
         bits - F::ONE
@@ -396,4 +404,32 @@ mod tests {
         let result = next_down(value);
         assert!(result.is_nan());
     }
+
+    #[test]
+    fn test_next_up_neg_zero_f32() {
+        let value: f32 = -0.0;
+        let result = next_up(value);
+        assert_eq!(result, 0.0);
+    }
+
+    #[test]
+    fn test_next_down_zero_f32() {
+        let value: f32 = 0.0;
+        let result = next_down(value);
+        assert_eq!(result, -0.0);
+    }
+
+    #[test]
+    fn test_next_up_neg_zero_f64() {
+        let value: f64 = -0.0;
+        let result = next_up(value);
+        assert_eq!(result, 0.0);
+    }
+
+    #[test]
+    fn test_next_down_zero_f64() {
+        let value: f64 = 0.0;
+        let result = next_down(value);
+        assert_eq!(result, -0.0);
+    }
 }
diff --git a/datafusion/common/src/scalar/cache.rs b/datafusion/common/src/scalar/cache.rs
new file mode 100644
index 0000000000000..5b1ad4e4ede01
--- /dev/null
+++ b/datafusion/common/src/scalar/cache.rs
@@ -0,0 +1,215 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Array caching utilities for scalar values
+
+use std::iter::repeat_n;
+use std::sync::{Arc, LazyLock, Mutex};
+
+use arrow::array::{Array, ArrayRef, PrimitiveArray, new_null_array};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, DataType, Int8Type, Int16Type, Int32Type, Int64Type,
+    UInt8Type, UInt16Type, UInt32Type, UInt64Type,
+};
+
+/// Maximum number of rows to cache to be conservative on memory usage
+const MAX_CACHE_SIZE: usize = 1024 * 1024;
+
+/// Cache for dictionary key arrays to avoid repeated allocations
+/// when the same size is used frequently.
+///
+/// Similar to PartitionColumnProjector's ZeroBufferGenerators, this cache
+/// stores key arrays for different dictionary key types. The cache is
+/// limited to 1 entry per type (the last size used) to prevent memory leaks
+/// for extremely large array requests.
+#[derive(Debug)]
+struct KeyArrayCache<K: ArrowDictionaryKeyType> {
+    cache: Option<(usize, bool, PrimitiveArray<K>)>, // (num_rows, is_null, key_array)
+}
+
+impl<K: ArrowDictionaryKeyType> Default for KeyArrayCache<K> {
+    fn default() -> Self {
+        Self { cache: None }
+    }
+}
+
+impl<K: ArrowDictionaryKeyType> KeyArrayCache<K> {
+    /// Get or create a cached key array for the given number of rows and null status
+    fn get_or_create(&mut self, num_rows: usize, is_null: bool) -> PrimitiveArray<K> {
+        // Check cache size limit to prevent memory leaks
+        if num_rows > MAX_CACHE_SIZE {
+            // For very large arrays, don't cache them - just create and return
+            return self.create_key_array(num_rows, is_null);
+        }
+
+        match &self.cache {
+            Some((cached_num_rows, cached_is_null, cached_array))
+                if *cached_num_rows == num_rows && *cached_is_null == is_null =>
+            {
+                // Cache hit: reuse existing array if same size and null status
+                cached_array.clone()
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let key_array = self.create_key_array(num_rows, is_null);
+                self.cache = Some((num_rows, is_null, key_array.clone()));
+                key_array
+            }
+        }
+    }
+
+    /// Create a new key array with the specified number of rows and null status
+    fn create_key_array(&self, num_rows: usize, is_null: bool) -> PrimitiveArray<K> {
+        let key_array: PrimitiveArray<K> = repeat_n(
+            if is_null {
+                None
+            } else {
+                Some(K::default_value())
+            },
+            num_rows,
+        )
+        .collect();
+        key_array
+    }
+}
+
+/// Cache for null arrays to avoid repeated allocations
+/// when the same size is used frequently.
+#[derive(Debug, Default)]
+struct NullArrayCache {
+    cache: Option<(usize, ArrayRef)>, // (num_rows, null_array)
+}
+
+impl NullArrayCache {
+    /// Get or create a cached null array for the given number of rows
+    fn get_or_create(&mut self, num_rows: usize) -> ArrayRef {
+        // Check cache size limit to prevent memory leaks
+        if num_rows > MAX_CACHE_SIZE {
+            // For very large arrays, don't cache them - just create and return
+            return new_null_array(&DataType::Null, num_rows);
+        }
+
+        match &self.cache {
+            Some((cached_num_rows, cached_array)) if *cached_num_rows == num_rows => {
+                // Cache hit: reuse existing array if same size
+                Arc::clone(cached_array)
+            }
+            _ => {
+                // Cache miss: create new array and cache it
+                let null_array = new_null_array(&DataType::Null, num_rows);
+                self.cache = Some((num_rows, Arc::clone(&null_array)));
+                null_array
+            }
+        }
+    }
+}
+
+/// Global cache for dictionary key arrays and null arrays
+#[derive(Debug, Default)]
+struct ArrayCaches {
+    cache_i8: KeyArrayCache<Int8Type>,
+    cache_i16: KeyArrayCache<Int16Type>,
+    cache_i32: KeyArrayCache<Int32Type>,
+    cache_i64: KeyArrayCache<Int64Type>,
+    cache_u8: KeyArrayCache<UInt8Type>,
+    cache_u16: KeyArrayCache<UInt16Type>,
+    cache_u32: KeyArrayCache<UInt32Type>,
+    cache_u64: KeyArrayCache<UInt64Type>,
+    null_cache: NullArrayCache,
+}
+
+static ARRAY_CACHES: LazyLock<Mutex<ArrayCaches>> =
+    LazyLock::new(|| Mutex::new(ArrayCaches::default()));
+
+/// Get the global cache for arrays
+fn get_array_caches() -> &'static Mutex<ArrayCaches> {
+    &ARRAY_CACHES
+}
+
+/// Get or create a cached null array for the given number of rows
+pub(crate) fn get_or_create_cached_null_array(num_rows: usize) -> ArrayRef {
+    let cache = get_array_caches();
+    let mut caches = cache.lock().unwrap();
+    caches.null_cache.get_or_create(num_rows)
+}
+
+/// Get or create a cached key array for a specific key type
+pub(crate) fn get_or_create_cached_key_array<K: ArrowDictionaryKeyType>(
+    num_rows: usize,
+    is_null: bool,
+) -> PrimitiveArray<K> {
+    let cache = get_array_caches();
+    let mut caches = cache.lock().unwrap();
+
+    // Use the DATA_TYPE to dispatch to the correct cache, similar to original implementation
+    match K::DATA_TYPE {
+        DataType::Int8 => {
+            let array = caches.cache_i8.get_or_create(num_rows, is_null);
+            // Convert using ArrayData to avoid unsafe transmute
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::Int16 => {
+            let array = caches.cache_i16.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::Int32 => {
+            let array = caches.cache_i32.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::Int64 => {
+            let array = caches.cache_i64.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::UInt8 => {
+            let array = caches.cache_u8.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::UInt16 => {
+            let array = caches.cache_u16.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::UInt32 => {
+            let array = caches.cache_u32.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        DataType::UInt64 => {
+            let array = caches.cache_u64.get_or_create(num_rows, is_null);
+            let array_data = array.to_data();
+            PrimitiveArray::<K>::from(array_data)
+        }
+        _ => {
+            // Fallback for unsupported types - create array directly without caching
+            let key_array: PrimitiveArray<K> = repeat_n(
+                if is_null {
+                    None
+                } else {
+                    Some(K::default_value())
+                },
+                num_rows,
+            )
+            .collect();
+            key_array
+        }
+    }
+}
diff --git a/datafusion/common/src/scalar/consts.rs b/datafusion/common/src/scalar/consts.rs
index efcde651841b0..599c2523cd2c7 100644
--- a/datafusion/common/src/scalar/consts.rs
+++ b/datafusion/common/src/scalar/consts.rs
@@ -17,28 +17,40 @@
 
 // Constants defined for scalar construction.
 
-// PI ~ 3.1415927 in f32
-#[allow(clippy::approx_constant)]
-pub(super) const PI_UPPER_F32: f32 = 3.141593_f32;
+// Next F16 value above π (upper bound)
+pub(super) const PI_UPPER_F16: half::f16 = half::f16::from_bits(0x4249);
 
-// PI ~ 3.141592653589793 in f64
-pub(super) const PI_UPPER_F64: f64 = 3.141592653589794_f64;
+// Next f32 value above π (upper bound)
+pub(super) const PI_UPPER_F32: f32 = std::f32::consts::PI.next_up();
 
-// -PI ~ -3.1415927 in f32
-#[allow(clippy::approx_constant)]
-pub(super) const NEGATIVE_PI_LOWER_F32: f32 = -3.141593_f32;
+// Next f64 value above π (upper bound)
+pub(super) const PI_UPPER_F64: f64 = std::f64::consts::PI.next_up();
 
-// -PI ~ -3.141592653589793 in f64
-pub(super) const NEGATIVE_PI_LOWER_F64: f64 = -3.141592653589794_f64;
+// Next f16 value below -π (lower bound)
+pub(super) const NEGATIVE_PI_LOWER_F16: half::f16 = half::f16::from_bits(0xC249);
 
-// PI / 2 ~ 1.5707964 in f32
-pub(super) const FRAC_PI_2_UPPER_F32: f32 = 1.5707965_f32;
+// Next f32 value below -π (lower bound)
+pub(super) const NEGATIVE_PI_LOWER_F32: f32 = (-std::f32::consts::PI).next_down();
 
-// PI / 2 ~ 1.5707963267948966 in f64
-pub(super) const FRAC_PI_2_UPPER_F64: f64 = 1.5707963267948967_f64;
+// Next f64 value below -π (lower bound)
+pub(super) const NEGATIVE_PI_LOWER_F64: f64 = (-std::f64::consts::PI).next_down();
 
-// -PI / 2 ~ -1.5707964 in f32
-pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F32: f32 = -1.5707965_f32;
+// Next f16 value above π/2 (upper bound)
+pub(super) const FRAC_PI_2_UPPER_F16: half::f16 = half::f16::from_bits(0x3E49);
 
-// -PI / 2 ~ -1.5707963267948966 in f64
-pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F64: f64 = -1.5707963267948967_f64;
+// Next f32 value above π/2 (upper bound)
+pub(super) const FRAC_PI_2_UPPER_F32: f32 = std::f32::consts::FRAC_PI_2.next_up();
+
+// Next f64 value above π/2 (upper bound)
+pub(super) const FRAC_PI_2_UPPER_F64: f64 = std::f64::consts::FRAC_PI_2.next_up();
+
+// Next f32 value below -π/2 (lower bound)
+pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F16: half::f16 = half::f16::from_bits(0xBE49);
+
+// Next f32 value below -π/2 (lower bound)
+pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F32: f32 =
+    (-std::f32::consts::FRAC_PI_2).next_down();
+
+// Next f64 value below -π/2 (lower bound)
+pub(super) const NEGATIVE_FRAC_PI_2_LOWER_F64: f64 =
+    (-std::f64::consts::FRAC_PI_2).next_down();
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index 3d4aa78b6da65..ebed41e9d8587 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -17,6 +17,7 @@
 
 //! [`ScalarValue`]: stores single  values
 
+mod cache;
 mod consts;
 mod struct_builder;
 
@@ -25,6 +26,7 @@ use std::cmp::Ordering;
 use std::collections::{HashSet, VecDeque};
 use std::convert::Infallible;
 use std::fmt;
+use std::fmt::Write;
 use std::hash::Hash;
 use std::hash::Hasher;
 use std::iter::repeat_n;
@@ -32,36 +34,162 @@ use std::mem::{size_of, size_of_val};
 use std::str::FromStr;
 use std::sync::Arc;
 
-use crate::arrow_datafusion_err;
+use crate::assert_or_internal_err;
 use crate::cast::{
-    as_decimal128_array, as_decimal256_array, as_dictionary_array,
-    as_fixed_size_binary_array, as_fixed_size_list_array,
+    as_binary_array, as_binary_view_array, as_boolean_array, as_date32_array,
+    as_date64_array, as_decimal32_array, as_decimal64_array, as_decimal128_array,
+    as_decimal256_array, as_dictionary_array, as_duration_microsecond_array,
+    as_duration_millisecond_array, as_duration_nanosecond_array,
+    as_duration_second_array, as_fixed_size_binary_array, as_fixed_size_list_array,
+    as_float16_array, as_float32_array, as_float64_array, as_int8_array, as_int16_array,
+    as_int32_array, as_int64_array, as_interval_dt_array, as_interval_mdn_array,
+    as_interval_ym_array, as_large_binary_array, as_large_list_array,
+    as_large_string_array, as_run_array, as_string_array, as_string_view_array,
+    as_time32_millisecond_array, as_time32_second_array, as_time64_microsecond_array,
+    as_time64_nanosecond_array, as_timestamp_microsecond_array,
+    as_timestamp_millisecond_array, as_timestamp_nanosecond_array,
+    as_timestamp_second_array, as_uint8_array, as_uint16_array, as_uint32_array,
+    as_uint64_array, as_union_array,
 };
-use crate::error::{DataFusionError, Result, _exec_err, _internal_err, _not_impl_err};
+use crate::error::{_exec_err, _internal_err, _not_impl_err, DataFusionError, Result};
 use crate::format::DEFAULT_CAST_OPTIONS;
 use crate::hash_utils::create_hashes;
 use crate::utils::SingleRowListArrayBuilder;
+use crate::{_internal_datafusion_err, arrow_datafusion_err};
 use arrow::array::{
-    types::{IntervalDayTime, IntervalMonthDayNano},
-    *,
+    Array, ArrayData, ArrayDataBuilder, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType,
+    AsArray, BinaryArray, BinaryViewArray, BinaryViewBuilder, BooleanArray, Date32Array,
+    Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+    DictionaryArray, DurationMicrosecondArray, DurationMillisecondArray,
+    DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray,
+    FixedSizeListArray, Float16Array, Float32Array, Float64Array, GenericListArray,
+    Int8Array, Int16Array, Int32Array, Int64Array, IntervalDayTimeArray,
+    IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeListArray,
+    LargeStringArray, ListArray, MapArray, MutableArrayData, OffsetSizeTrait,
+    PrimitiveArray, RunArray, Scalar, StringArray, StringViewArray, StringViewBuilder,
+    StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
+    Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray,
+    TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array,
+    UInt64Array, UnionArray, downcast_run_array, new_empty_array, new_null_array,
 };
-use arrow::buffer::ScalarBuffer;
-use arrow::compute::kernels::{
-    cast::{cast_with_options, CastOptions},
-    numeric::*,
+use arrow::buffer::{BooleanBuffer, ScalarBuffer};
+use arrow::compute::kernels::cast::{CastOptions, cast_with_options};
+use arrow::compute::kernels::numeric::{
+    add, add_wrapping, div, mul, mul_wrapping, rem, sub, sub_wrapping,
 };
 use arrow::datatypes::{
-    i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType,
-    Date32Type, Date64Type, Field, Float32Type, Int16Type, Int32Type, Int64Type,
-    Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
-    IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
-    TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type,
-    UInt8Type, UnionFields, UnionMode, DECIMAL128_MAX_PRECISION,
+    ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType, Date32Type,
+    Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, DecimalType, Field,
+    FieldRef, Float32Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTime,
+    IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType, IntervalUnit,
+    IntervalYearMonthType, RunEndIndexType, TimeUnit, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt8Type,
+    UInt16Type, UInt32Type, UInt64Type, UnionFields, UnionMode, i256,
+    validate_decimal_precision_and_scale,
 };
-use arrow::util::display::{array_value_to_string, ArrayFormatter, FormatOptions};
+use arrow::util::display::{ArrayFormatter, FormatOptions, array_value_to_string};
+use cache::{get_or_create_cached_key_array, get_or_create_cached_null_array};
+use chrono::{Duration, NaiveDate};
 use half::f16;
 pub use struct_builder::ScalarStructBuilder;
 
+const SECONDS_PER_DAY: i64 = 86_400;
+const MILLIS_PER_DAY: i64 = SECONDS_PER_DAY * 1_000;
+const MICROS_PER_DAY: i64 = MILLIS_PER_DAY * 1_000;
+const NANOS_PER_DAY: i64 = MICROS_PER_DAY * 1_000;
+const MICROS_PER_MILLISECOND: i64 = 1_000;
+const NANOS_PER_MILLISECOND: i64 = 1_000_000;
+
+/// Returns the multiplier that converts the input date representation into the
+/// desired timestamp unit, if the conversion requires a multiplication that can
+/// overflow an `i64`.
+pub fn date_to_timestamp_multiplier(
+    source_type: &DataType,
+    target_type: &DataType,
+) -> Option<i64> {
+    let DataType::Timestamp(target_unit, _) = target_type else {
+        return None;
+    };
+
+    // Only `Timestamp` target types have a time unit; otherwise no
+    // multiplier applies (handled above). The function returns `Some(m)`
+    // when converting the `source_type` to `target_type` requires a
+    // multiplication that could overflow `i64`. It returns `None` when
+    // the conversion is a division or otherwise doesn't require a
+    // multiplication (e.g. Date64 -> Second).
+    match source_type {
+        // Date32 stores days since epoch. Converting to any timestamp
+        // unit requires multiplying by the per-day factor (seconds,
+        // milliseconds, microseconds, nanoseconds).
+        DataType::Date32 => Some(match target_unit {
+            TimeUnit::Second => SECONDS_PER_DAY,
+            TimeUnit::Millisecond => MILLIS_PER_DAY,
+            TimeUnit::Microsecond => MICROS_PER_DAY,
+            TimeUnit::Nanosecond => NANOS_PER_DAY,
+        }),
+
+        // Date64 stores milliseconds since epoch. Converting to
+        // seconds is a division (no multiplication), so return `None`.
+        // Converting to milliseconds is 1:1 (multiplier 1). Converting
+        // to micro/nano requires multiplying by 1_000 / 1_000_000.
+        DataType::Date64 => match target_unit {
+            TimeUnit::Second => None,
+            // Converting Date64 (ms since epoch) to millisecond timestamps
+            // is an identity conversion and does not require multiplication.
+            // Returning `None` indicates no multiplication-based overflow
+            // check is necessary.
+            TimeUnit::Millisecond => None,
+            TimeUnit::Microsecond => Some(MICROS_PER_MILLISECOND),
+            TimeUnit::Nanosecond => Some(NANOS_PER_MILLISECOND),
+        },
+
+        _ => None,
+    }
+}
+
+/// Ensures the provided value can be represented as a timestamp with the given
+/// multiplier. Returns an [`DataFusionError::Execution`] when the converted
+/// value would overflow the timestamp range.
+pub fn ensure_timestamp_in_bounds(
+    value: i64,
+    multiplier: i64,
+    source_type: &DataType,
+    target_type: &DataType,
+) -> Result<()> {
+    if multiplier <= 1 {
+        return Ok(());
+    }
+
+    if value.checked_mul(multiplier).is_none() {
+        let target = format_timestamp_type_for_error(target_type);
+        _exec_err!(
+            "Cannot cast {} value {} to {}: converted value exceeds the representable i64 range",
+            source_type,
+            value,
+            target
+        )
+    } else {
+        Ok(())
+    }
+}
+
+/// Format a `DataType::Timestamp` into a short, stable string used in
+/// user-facing error messages.
+pub(crate) fn format_timestamp_type_for_error(target_type: &DataType) -> String {
+    match target_type {
+        DataType::Timestamp(unit, _) => {
+            let s = match unit {
+                TimeUnit::Second => "s",
+                TimeUnit::Millisecond => "ms",
+                TimeUnit::Microsecond => "us",
+                TimeUnit::Nanosecond => "ns",
+            };
+            format!("Timestamp({s})")
+        }
+        other => format!("{other}"),
+    }
+}
+
 /// A dynamically typed, nullable single value.
 ///
 /// While an arrow  [`Array`]) stores one or more values of the same type, in a
@@ -142,9 +270,9 @@ pub use struct_builder::ScalarStructBuilder;
 /// let field_b = Field::new("b", DataType::Utf8, false);
 ///
 /// let s1 = ScalarStructBuilder::new()
-///    .with_scalar(field_a, ScalarValue::from(1i32))
-///    .with_scalar(field_b, ScalarValue::from("foo"))
-///    .build();
+///     .with_scalar(field_a, ScalarValue::from(1i32))
+///     .with_scalar(field_b, ScalarValue::from("foo"))
+///     .build();
 /// ```
 ///
 /// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`]
@@ -170,13 +298,13 @@ pub use struct_builder::ScalarStructBuilder;
 /// // Build a struct like: {a: 1, b: "foo"}
 /// // Field description
 /// let fields = Fields::from(vec![
-///   Field::new("a", DataType::Int32, false),
-///   Field::new("b", DataType::Utf8, false),
+///     Field::new("a", DataType::Int32, false),
+///     Field::new("b", DataType::Utf8, false),
 /// ]);
 /// // one row arrays for each field
 /// let arrays: Vec<ArrayRef> = vec![
-///   Arc::new(Int32Array::from(vec![1])),
-///   Arc::new(StringArray::from(vec!["foo"])),
+///     Arc::new(Int32Array::from(vec![1])),
+///     Arc::new(StringArray::from(vec!["foo"])),
 /// ];
 /// // no nulls for this array
 /// let nulls = None;
@@ -191,6 +319,8 @@ pub use struct_builder::ScalarStructBuilder;
 /// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) for
 /// details on datatypes and the [format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375)
 /// for the definitive reference.
+///
+/// [`NullArray`]: arrow::array::NullArray
 #[derive(Clone)]
 pub enum ScalarValue {
     /// represents `DataType::Null` (castable to/from any other type)
@@ -203,6 +333,10 @@ pub enum ScalarValue {
     Float32(Option<f32>),
     /// 64bit float
     Float64(Option<f64>),
+    /// 32bit decimal, using the i32 to represent the decimal, precision scale
+    Decimal32(Option<i32>, u8, i8),
+    /// 64bit decimal, using the i64 to represent the decimal, precision scale
+    Decimal64(Option<i64>, u8, i8),
     /// 128bit decimal, using the i128 to represent the decimal, precision scale
     Decimal128(Option<i128>, u8, i8),
     /// 256bit decimal, using the i256 to represent the decimal, precision scale
@@ -296,6 +430,8 @@ pub enum ScalarValue {
     Union(Option<(i8, Box<ScalarValue>)>, UnionFields, UnionMode),
     /// Dictionary type: index type and value
     Dictionary(Box<DataType>, Box<ScalarValue>),
+    /// (run-ends field, value field, value)
+    RunEndEncoded(FieldRef, FieldRef, Box<ScalarValue>),
 }
 
 impl Hash for Fl<f16> {
@@ -312,6 +448,14 @@ impl PartialEq for ScalarValue {
         // any newly added enum variant will require editing this list
         // or else face a compile error
         match (self, other) {
+            (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => {
+                v1.eq(v2) && p1.eq(p2) && s1.eq(s2)
+            }
+            (Decimal32(_, _, _), _) => false,
+            (Decimal64(v1, p1, s1), Decimal64(v2, p2, s2)) => {
+                v1.eq(v2) && p1.eq(p2) && s1.eq(s2)
+            }
+            (Decimal64(_, _, _), _) => false,
             (Decimal128(v1, p1, s1), Decimal128(v2, p2, s2)) => {
                 v1.eq(v2) && p1.eq(p2) && s1.eq(s2)
             }
@@ -417,6 +561,10 @@ impl PartialEq for ScalarValue {
             (Union(_, _, _), _) => false,
             (Dictionary(k1, v1), Dictionary(k2, v2)) => k1.eq(k2) && v1.eq(v2),
             (Dictionary(_, _), _) => false,
+            (RunEndEncoded(rf1, vf1, v1), RunEndEncoded(rf2, vf2, v2)) => {
+                rf1.eq(rf2) && vf1.eq(vf2) && v1.eq(v2)
+            }
+            (RunEndEncoded(_, _, _), _) => false,
             (Null, Null) => true,
             (Null, _) => false,
         }
@@ -431,6 +579,24 @@ impl PartialOrd for ScalarValue {
         // any newly added enum variant will require editing this list
         // or else face a compile error
         match (self, other) {
+            (Decimal32(v1, p1, s1), Decimal32(v2, p2, s2)) => {
+                if p1.eq(p2) && s1.eq(s2) {
+                    v1.partial_cmp(v2)
+                } else {
+                    // Two decimal values can be compared if they have the same precision and scale.
+                    None
+                }
+            }
+            (Decimal32(_, _, _), _) => None,
+            (Decimal64(v1, p1, s1), Decimal64(v2, p2, s2)) => {
+                if p1.eq(p2) && s1.eq(s2) {
+                    v1.partial_cmp(v2)
+                } else {
+                    // Two decimal values can be compared if they have the same precision and scale.
+                    None
+                }
+            }
+            (Decimal64(_, _, _), _) => None,
             (Decimal128(v1, p1, s1), Decimal128(v2, p2, s2)) => {
                 if p1.eq(p2) && s1.eq(s2) {
                     v1.partial_cmp(v2)
@@ -561,13 +727,18 @@ impl PartialOrd for ScalarValue {
             (Union(_, _, _), _) => None,
             (Dictionary(k1, v1), Dictionary(k2, v2)) => {
                 // Don't compare if the key types don't match (it is effectively a different datatype)
-                if k1 == k2 {
+                if k1 == k2 { v1.partial_cmp(v2) } else { None }
+            }
+            (Dictionary(_, _), _) => None,
+            (RunEndEncoded(rf1, vf1, v1), RunEndEncoded(rf2, vf2, v2)) => {
+                // Don't compare if the run ends fields don't match (it is effectively a different datatype)
+                if rf1 == rf2 && vf1 == vf2 {
                     v1.partial_cmp(v2)
                 } else {
                     None
                 }
             }
-            (Dictionary(_, _), _) => None,
+            (RunEndEncoded(_, _, _), _) => None,
             (Null, Null) => Some(Ordering::Equal),
             (Null, _) => None,
         }
@@ -585,7 +756,9 @@ fn first_array_for_list(arr: &dyn Array) -> ArrayRef {
     } else if let Some(arr) = arr.as_fixed_size_list_opt() {
         arr.value(0)
     } else {
-        unreachable!("Since only List / LargeList / FixedSizeList are supported, this should never happen")
+        unreachable!(
+            "Since only List / LargeList / FixedSizeList are supported, this should never happen"
+        )
     }
 }
 
@@ -732,6 +905,16 @@ impl Hash for ScalarValue {
     fn hash<H: Hasher>(&self, state: &mut H) {
         use ScalarValue::*;
         match self {
+            Decimal32(v, p, s) => {
+                v.hash(state);
+                p.hash(state);
+                s.hash(state)
+            }
+            Decimal64(v, p, s) => {
+                v.hash(state);
+                p.hash(state);
+                s.hash(state)
+            }
             Decimal128(v, p, s) => {
                 v.hash(state);
                 p.hash(state);
@@ -799,6 +982,11 @@ impl Hash for ScalarValue {
                 k.hash(state);
                 v.hash(state);
             }
+            RunEndEncoded(rf, vf, v) => {
+                rf.hash(state);
+                vf.hash(state);
+                v.hash(state);
+            }
             // stable hash for Null value
             Null => 1.hash(state),
         }
@@ -806,10 +994,11 @@ impl Hash for ScalarValue {
 }
 
 fn hash_nested_array<H: Hasher>(arr: ArrayRef, state: &mut H) {
-    let arrays = vec![arr.to_owned()];
-    let hashes_buffer = &mut vec![0; arr.len()];
-    let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0);
-    let hashes = create_hashes(&arrays, &random_state, hashes_buffer).unwrap();
+    let len = arr.len();
+    let hashes_buffer = &mut vec![0; len];
+    let random_state = crate::hash_utils::RandomState::with_seed(0);
+    let hashes = create_hashes(&[arr], &random_state, hashes_buffer)
+        .expect("hash_nested_array: failed to create row hashes");
     // Hash back to std::hash::Hasher
     hashes.hash(state);
 }
@@ -839,15 +1028,9 @@ fn dict_from_scalar<K: ArrowDictionaryKeyType>(
     let values_array = value.to_array_of_size(1)?;
 
     // Create a key array with `size` elements, each of 0
-    let key_array: PrimitiveArray<K> = repeat_n(
-        if value.is_null() {
-            None
-        } else {
-            Some(K::default_value())
-        },
-        size,
-    )
-    .collect();
+    // Use cache to avoid repeated allocations for the same size
+    let key_array: PrimitiveArray<K> =
+        get_or_create_cached_key_array::<K>(size, value.is_null());
 
     // create a new DictionaryArray
     //
@@ -859,8 +1042,21 @@ fn dict_from_scalar<K: ArrowDictionaryKeyType>(
     ))
 }
 
-/// Create a dictionary array representing all the values in values
-fn dict_from_values<K: ArrowDictionaryKeyType>(
+/// Create a `DictionaryArray` from the provided values array.
+///
+/// Each element gets a unique key (`0..N-1`), without deduplication.
+/// Useful for wrapping arrays in dictionary form.
+///
+/// # Input
+/// ["alice", "bob", "alice", null, "carol"]
+///
+/// # Output
+/// `DictionaryArray<Int32>`
+/// {
+///   keys:   [0, 1, 2, 3, 4],
+///   values: ["alice", "bob", "alice", null, "carol"]
+/// }
+pub fn dict_from_values<K: ArrowDictionaryKeyType>(
     values_array: ArrayRef,
 ) -> Result<ArrayRef> {
     // Create a key array with `size` elements of 0..array_len for all
@@ -869,11 +1065,10 @@ fn dict_from_values<K: ArrowDictionaryKeyType>(
         .map(|index| {
             if values_array.is_valid(index) {
                 let native_index = K::Native::from_usize(index).ok_or_else(|| {
-                    DataFusionError::Internal(format!(
-                        "Can not create index of type {} from value {}",
-                        K::DATA_TYPE,
-                        index
-                    ))
+                    _internal_datafusion_err!(
+                        "Can not create index of type {} from value {index}",
+                        K::DATA_TYPE
+                    )
                 })?;
                 Ok(Some(native_index))
             } else {
@@ -894,17 +1089,8 @@ fn dict_from_values<K: ArrowDictionaryKeyType>(
 }
 
 macro_rules! typed_cast_tz {
-    ($array:expr, $index:expr, $ARRAYTYPE:ident, $SCALAR:ident, $TZ:expr) => {{
-        use std::any::type_name;
-        let array = $array
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(format!(
-                    "could not cast value to {}",
-                    type_name::<$ARRAYTYPE>()
-                ))
-            })?;
+    ($array:expr, $index:expr, $array_cast:ident, $SCALAR:ident, $TZ:expr) => {{
+        let array = $array_cast($array)?;
         Ok::<ScalarValue, DataFusionError>(ScalarValue::$SCALAR(
             match array.is_null($index) {
                 true => None,
@@ -916,17 +1102,8 @@ macro_rules! typed_cast_tz {
 }
 
 macro_rules! typed_cast {
-    ($array:expr, $index:expr, $ARRAYTYPE:ident, $SCALAR:ident) => {{
-        use std::any::type_name;
-        let array = $array
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(format!(
-                    "could not cast value to {}",
-                    type_name::<$ARRAYTYPE>()
-                ))
-            })?;
+    ($array:expr, $index:expr, $array_cast:ident, $SCALAR:ident) => {{
+        let array = $array_cast($array)?;
         Ok::<ScalarValue, DataFusionError>(ScalarValue::$SCALAR(
             match array.is_null($index) {
                 true => None,
@@ -963,17 +1140,8 @@ macro_rules! build_timestamp_array_from_option {
 }
 
 macro_rules! eq_array_primitive {
-    ($array:expr, $index:expr, $ARRAYTYPE:ident, $VALUE:expr) => {{
-        use std::any::type_name;
-        let array = $array
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(format!(
-                    "could not cast value to {}",
-                    type_name::<$ARRAYTYPE>()
-                ))
-            })?;
+    ($array:expr, $index:expr, $array_cast:ident, $VALUE:expr) => {{
+        let array = $array_cast($array)?;
         let is_valid = array.is_valid($index);
         Ok::<bool, DataFusionError>(match $VALUE {
             Some(val) => is_valid && &array.value($index) == val,
@@ -1004,21 +1172,16 @@ impl ScalarValue {
 
     /// Create a decimal Scalar from value/precision and scale.
     pub fn try_new_decimal128(value: i128, precision: u8, scale: i8) -> Result<Self> {
-        // make sure the precision and scale is valid
-        if precision <= DECIMAL128_MAX_PRECISION && scale.unsigned_abs() <= precision {
-            return Ok(ScalarValue::Decimal128(Some(value), precision, scale));
-        }
-        _internal_err!(
-            "Can not new a decimal type ScalarValue for precision {precision} and scale {scale}"
-        )
+        Self::validate_decimal_or_internal_err::<Decimal128Type>(precision, scale)?;
+        Ok(ScalarValue::Decimal128(Some(value), precision, scale))
     }
 
     /// Create a Null instance of ScalarValue for this datatype
     ///
     /// Example
     /// ```
-    /// use datafusion_common::ScalarValue;
     /// use arrow::datatypes::DataType;
+    /// use datafusion_common::ScalarValue;
     ///
     /// let scalar = ScalarValue::try_new_null(&DataType::Int32).unwrap();
     /// assert_eq!(scalar.is_null(), true);
@@ -1038,6 +1201,12 @@ impl ScalarValue {
             DataType::UInt16 => ScalarValue::UInt16(None),
             DataType::UInt32 => ScalarValue::UInt32(None),
             DataType::UInt64 => ScalarValue::UInt64(None),
+            DataType::Decimal32(precision, scale) => {
+                ScalarValue::Decimal32(None, *precision, *scale)
+            }
+            DataType::Decimal64(precision, scale) => {
+                ScalarValue::Decimal64(None, *precision, *scale)
+            }
             DataType::Decimal128(precision, scale) => {
                 ScalarValue::Decimal128(None, *precision, *scale)
             }
@@ -1096,7 +1265,14 @@ impl ScalarValue {
                 index_type.clone(),
                 Box::new(value_type.as_ref().try_into()?),
             ),
-            // `ScalaValue::List` contains single element `ListArray`.
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                ScalarValue::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                    Box::new(value_field.data_type().try_into()?),
+                )
+            }
+            // `ScalarValue::List` contains single element `ListArray`.
             DataType::List(field_ref) => ScalarValue::List(Arc::new(
                 GenericListArray::new_null(Arc::clone(field_ref), 1),
             )),
@@ -1104,7 +1280,7 @@ impl ScalarValue {
             DataType::LargeList(field_ref) => ScalarValue::LargeList(Arc::new(
                 GenericListArray::new_null(Arc::clone(field_ref), 1),
             )),
-            // `ScalaValue::FixedSizeList` contains single element `FixedSizeList`.
+            // `ScalarValue::FixedSizeList` contains single element `FixedSizeList`.
             DataType::FixedSizeList(field_ref, fixed_length) => {
                 ScalarValue::FixedSizeList(Arc::new(FixedSizeListArray::new_null(
                     Arc::clone(field_ref),
@@ -1130,7 +1306,7 @@ impl ScalarValue {
             DataType::Null => ScalarValue::Null,
             _ => {
                 return _not_impl_err!(
-                    "Can't create a null scalar from data_type \"{data_type:?}\""
+                    "Can't create a null scalar from data_type \"{data_type}\""
                 );
             }
         })
@@ -1184,21 +1360,21 @@ impl ScalarValue {
     /// Returns a [`ScalarValue`] representing PI
     pub fn new_pi(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(f16::PI)),
             DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::PI)),
             DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::PI)),
-            _ => _internal_err!("PI is not supported for data type: {:?}", datatype),
+            _ => _internal_err!("PI is not supported for data type: {}", datatype),
         }
     }
 
     /// Returns a [`ScalarValue`] representing PI's upper bound
     pub fn new_pi_upper(datatype: &DataType) -> Result<ScalarValue> {
-        // TODO: replace the constants with next_up/next_down when
-        // they are stabilized: https://doc.rust-lang.org/std/primitive.f64.html#method.next_up
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::Float16(Some(consts::PI_UPPER_F16))),
             DataType::Float32 => Ok(ScalarValue::from(consts::PI_UPPER_F32)),
             DataType::Float64 => Ok(ScalarValue::from(consts::PI_UPPER_F64)),
             _ => {
-                _internal_err!("PI_UPPER is not supported for data type: {:?}", datatype)
+                _internal_err!("PI_UPPER is not supported for data type: {}", datatype)
             }
         }
     }
@@ -1206,10 +1382,13 @@ impl ScalarValue {
     /// Returns a [`ScalarValue`] representing -PI's lower bound
     pub fn new_negative_pi_lower(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => {
+                Ok(ScalarValue::Float16(Some(consts::NEGATIVE_PI_LOWER_F16)))
+            }
             DataType::Float32 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F32)),
             DataType::Float64 => Ok(ScalarValue::from(consts::NEGATIVE_PI_LOWER_F64)),
             _ => {
-                _internal_err!("-PI_LOWER is not supported for data type: {:?}", datatype)
+                _internal_err!("-PI_LOWER is not supported for data type: {}", datatype)
             }
         }
     }
@@ -1217,13 +1396,13 @@ impl ScalarValue {
     /// Returns a [`ScalarValue`] representing FRAC_PI_2's upper bound
     pub fn new_frac_pi_2_upper(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => {
+                Ok(ScalarValue::Float16(Some(consts::FRAC_PI_2_UPPER_F16)))
+            }
             DataType::Float32 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F32)),
             DataType::Float64 => Ok(ScalarValue::from(consts::FRAC_PI_2_UPPER_F64)),
             _ => {
-                _internal_err!(
-                    "PI_UPPER/2 is not supported for data type: {:?}",
-                    datatype
-                )
+                _internal_err!("PI_UPPER/2 is not supported for data type: {}", datatype)
             }
         }
     }
@@ -1231,6 +1410,9 @@ impl ScalarValue {
     // Returns a [`ScalarValue`] representing FRAC_PI_2's lower bound
     pub fn new_neg_frac_pi_2_lower(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::Float16(Some(
+                consts::NEGATIVE_FRAC_PI_2_LOWER_F16,
+            ))),
             DataType::Float32 => {
                 Ok(ScalarValue::from(consts::NEGATIVE_FRAC_PI_2_LOWER_F32))
             }
@@ -1238,10 +1420,7 @@ impl ScalarValue {
                 Ok(ScalarValue::from(consts::NEGATIVE_FRAC_PI_2_LOWER_F64))
             }
             _ => {
-                _internal_err!(
-                    "-PI/2_LOWER is not supported for data type: {:?}",
-                    datatype
-                )
+                _internal_err!("-PI/2_LOWER is not supported for data type: {}", datatype)
             }
         }
     }
@@ -1249,37 +1428,41 @@ impl ScalarValue {
     /// Returns a [`ScalarValue`] representing -PI
     pub fn new_negative_pi(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(-f16::PI)),
             DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::PI)),
             DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::PI)),
-            _ => _internal_err!("-PI is not supported for data type: {:?}", datatype),
+            _ => _internal_err!("-PI is not supported for data type: {}", datatype),
         }
     }
 
     /// Returns a [`ScalarValue`] representing PI/2
     pub fn new_frac_pi_2(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(f16::FRAC_PI_2)),
             DataType::Float32 => Ok(ScalarValue::from(std::f32::consts::FRAC_PI_2)),
             DataType::Float64 => Ok(ScalarValue::from(std::f64::consts::FRAC_PI_2)),
-            _ => _internal_err!("PI/2 is not supported for data type: {:?}", datatype),
+            _ => _internal_err!("PI/2 is not supported for data type: {}", datatype),
         }
     }
 
     /// Returns a [`ScalarValue`] representing -PI/2
     pub fn new_neg_frac_pi_2(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(-f16::FRAC_PI_2)),
             DataType::Float32 => Ok(ScalarValue::from(-std::f32::consts::FRAC_PI_2)),
             DataType::Float64 => Ok(ScalarValue::from(-std::f64::consts::FRAC_PI_2)),
-            _ => _internal_err!("-PI/2 is not supported for data type: {:?}", datatype),
+            _ => _internal_err!("-PI/2 is not supported for data type: {}", datatype),
         }
     }
 
     /// Returns a [`ScalarValue`] representing infinity
     pub fn new_infinity(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(f16::INFINITY)),
             DataType::Float32 => Ok(ScalarValue::from(f32::INFINITY)),
             DataType::Float64 => Ok(ScalarValue::from(f64::INFINITY)),
             _ => {
-                _internal_err!("Infinity is not supported for data type: {:?}", datatype)
+                _internal_err!("Infinity is not supported for data type: {}", datatype)
             }
         }
     }
@@ -1287,11 +1470,12 @@ impl ScalarValue {
     /// Returns a [`ScalarValue`] representing negative infinity
     pub fn new_neg_infinity(datatype: &DataType) -> Result<ScalarValue> {
         match datatype {
+            DataType::Float16 => Ok(ScalarValue::from(f16::NEG_INFINITY)),
             DataType::Float32 => Ok(ScalarValue::from(f32::NEG_INFINITY)),
             DataType::Float64 => Ok(ScalarValue::from(f64::NEG_INFINITY)),
             _ => {
                 _internal_err!(
-                    "Negative Infinity is not supported for data type: {:?}",
+                    "Negative Infinity is not supported for data type: {}",
                     datatype
                 )
             }
@@ -1310,9 +1494,15 @@ impl ScalarValue {
             DataType::UInt16 => ScalarValue::UInt16(Some(0)),
             DataType::UInt32 => ScalarValue::UInt32(Some(0)),
             DataType::UInt64 => ScalarValue::UInt64(Some(0)),
-            DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(0.0))),
+            DataType::Float16 => ScalarValue::Float16(Some(f16::ZERO)),
             DataType::Float32 => ScalarValue::Float32(Some(0.0)),
             DataType::Float64 => ScalarValue::Float64(Some(0.0)),
+            DataType::Decimal32(precision, scale) => {
+                ScalarValue::Decimal32(Some(0), *precision, *scale)
+            }
+            DataType::Decimal64(precision, scale) => {
+                ScalarValue::Decimal64(Some(0), *precision, *scale)
+            }
             DataType::Decimal128(precision, scale) => {
                 ScalarValue::Decimal128(Some(0), *precision, *scale)
             }
@@ -1364,12 +1554,159 @@ impl ScalarValue {
             DataType::Date64 => ScalarValue::Date64(Some(0)),
             _ => {
                 return _not_impl_err!(
-                    "Can't create a zero scalar from data_type \"{datatype:?}\""
+                    "Can't create a zero scalar from data_type \"{datatype}\""
                 );
             }
         })
     }
 
+    /// Returns a default value for the given `DataType`.
+    ///
+    /// This function is useful when you need to initialize a column with
+    /// non-null values in a DataFrame or when you need a "zero" value
+    /// for a specific data type.
+    ///
+    /// # Default Values
+    ///
+    /// - **Numeric types**: Returns zero (via [`new_zero`])
+    /// - **String types**: Returns empty string (`""`)
+    /// - **Binary types**: Returns empty byte array
+    /// - **Temporal types**: Returns zero/epoch value
+    /// - **List types**: Returns empty list
+    /// - **Struct types**: Returns struct with all fields set to their defaults
+    /// - **Dictionary types**: Returns dictionary with default value
+    /// - **Map types**: Returns empty map
+    /// - **Union types**: Returns first variant with default value
+    ///
+    /// # Errors
+    ///
+    /// Returns an error for data types that don't have a clear default value
+    /// or are not yet supported (e.g., `RunEndEncoded`).
+    ///
+    /// [`new_zero`]: Self::new_zero
+    pub fn new_default(datatype: &DataType) -> Result<ScalarValue> {
+        match datatype {
+            // Null type
+            DataType::Null => Ok(ScalarValue::Null),
+
+            // Numeric types
+            DataType::Boolean
+            | DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float16
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Decimal32(_, _)
+            | DataType::Decimal64(_, _)
+            | DataType::Decimal128(_, _)
+            | DataType::Decimal256(_, _)
+            | DataType::Timestamp(_, _)
+            | DataType::Time32(_)
+            | DataType::Time64(_)
+            | DataType::Interval(_)
+            | DataType::Duration(_)
+            | DataType::Date32
+            | DataType::Date64 => ScalarValue::new_zero(datatype),
+
+            // String types
+            DataType::Utf8 => Ok(ScalarValue::Utf8(Some("".to_string()))),
+            DataType::LargeUtf8 => Ok(ScalarValue::LargeUtf8(Some("".to_string()))),
+            DataType::Utf8View => Ok(ScalarValue::Utf8View(Some("".to_string()))),
+
+            // Binary types
+            DataType::Binary => Ok(ScalarValue::Binary(Some(vec![]))),
+            DataType::LargeBinary => Ok(ScalarValue::LargeBinary(Some(vec![]))),
+            DataType::BinaryView => Ok(ScalarValue::BinaryView(Some(vec![]))),
+
+            // Fixed-size binary
+            DataType::FixedSizeBinary(size) => Ok(ScalarValue::FixedSizeBinary(
+                *size,
+                Some(vec![0; *size as usize]),
+            )),
+
+            // List types
+            DataType::List(field) => {
+                let list =
+                    ScalarValue::new_list(&[], field.data_type(), field.is_nullable());
+                Ok(ScalarValue::List(list))
+            }
+            DataType::FixedSizeList(field, _size) => {
+                let empty_arr = new_empty_array(field.data_type());
+                let values = Arc::new(
+                    SingleRowListArrayBuilder::new(empty_arr)
+                        .with_nullable(field.is_nullable())
+                        .build_fixed_size_list_array(0),
+                );
+                Ok(ScalarValue::FixedSizeList(values))
+            }
+            DataType::LargeList(field) => {
+                let list = ScalarValue::new_large_list(&[], field.data_type());
+                Ok(ScalarValue::LargeList(list))
+            }
+
+            // Struct types
+            DataType::Struct(fields) => {
+                let values = fields
+                    .iter()
+                    .map(|f| ScalarValue::new_default(f.data_type()))
+                    .collect::<Result<Vec<_>>>()?;
+                Ok(ScalarValue::Struct(Arc::new(StructArray::new(
+                    fields.clone(),
+                    values
+                        .into_iter()
+                        .map(|v| v.to_array())
+                        .collect::<Result<_>>()?,
+                    None,
+                ))))
+            }
+
+            // Dictionary types
+            DataType::Dictionary(key_type, value_type) => Ok(ScalarValue::Dictionary(
+                key_type.clone(),
+                Box::new(ScalarValue::new_default(value_type)?),
+            )),
+
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                Ok(ScalarValue::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                    Box::new(ScalarValue::new_default(value_field.data_type())?),
+                ))
+            }
+
+            // Map types
+            DataType::Map(field, _) => Ok(ScalarValue::Map(Arc::new(MapArray::from(
+                ArrayData::new_empty(field.data_type()),
+            )))),
+
+            // Union types - return first variant with default value
+            DataType::Union(fields, mode) => {
+                if let Some((type_id, field)) = fields.iter().next() {
+                    let default_value = ScalarValue::new_default(field.data_type())?;
+                    Ok(ScalarValue::Union(
+                        Some((type_id, Box::new(default_value))),
+                        fields.clone(),
+                        *mode,
+                    ))
+                } else {
+                    _internal_err!("Union type must have at least one field")
+                }
+            }
+
+            DataType::ListView(_) | DataType::LargeListView(_) => {
+                _not_impl_err!(
+                    "Default value for data_type \"{datatype}\" is not implemented yet"
+                )
+            }
+        }
+    }
+
     /// Create an one value in the given type.
     pub fn new_one(datatype: &DataType) -> Result<ScalarValue> {
         Ok(match datatype {
@@ -1381,12 +1718,60 @@ impl ScalarValue {
             DataType::UInt16 => ScalarValue::UInt16(Some(1)),
             DataType::UInt32 => ScalarValue::UInt32(Some(1)),
             DataType::UInt64 => ScalarValue::UInt64(Some(1)),
-            DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(1.0))),
+            DataType::Float16 => ScalarValue::Float16(Some(f16::ONE)),
             DataType::Float32 => ScalarValue::Float32(Some(1.0)),
             DataType::Float64 => ScalarValue::Float64(Some(1.0)),
+            DataType::Decimal32(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal32Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match 10_i32.checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal32(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal64(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal64Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i64::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal64(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal128(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal128Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i128::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal128(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal256(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal256Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i256::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal256(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
             _ => {
                 return _not_impl_err!(
-                    "Can't create an one scalar from data_type \"{datatype:?}\""
+                    "Can't create an one scalar from data_type \"{datatype}\""
                 );
             }
         })
@@ -1399,12 +1784,60 @@ impl ScalarValue {
             DataType::Int16 | DataType::UInt16 => ScalarValue::Int16(Some(-1)),
             DataType::Int32 | DataType::UInt32 => ScalarValue::Int32(Some(-1)),
             DataType::Int64 | DataType::UInt64 => ScalarValue::Int64(Some(-1)),
-            DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(-1.0))),
+            DataType::Float16 => ScalarValue::Float16(Some(f16::NEG_ONE)),
             DataType::Float32 => ScalarValue::Float32(Some(-1.0)),
             DataType::Float64 => ScalarValue::Float64(Some(-1.0)),
+            DataType::Decimal32(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal32Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match 10_i32.checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal32(Some(-value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal64(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal64Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i64::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal64(Some(-value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal128(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal128Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i128::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal128(Some(-value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal256(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal256Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i256::from(10).checked_pow(*scale as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal256(Some(-value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
             _ => {
                 return _not_impl_err!(
-                    "Can't create a negative one scalar from data_type \"{datatype:?}\""
+                    "Can't create a negative one scalar from data_type \"{datatype}\""
                 );
             }
         })
@@ -1423,9 +1856,57 @@ impl ScalarValue {
             DataType::Float16 => ScalarValue::Float16(Some(f16::from_f32(10.0))),
             DataType::Float32 => ScalarValue::Float32(Some(10.0)),
             DataType::Float64 => ScalarValue::Float64(Some(10.0)),
+            DataType::Decimal32(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal32Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match 10_i32.checked_pow((*scale + 1) as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal32(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal64(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal64Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i64::from(10).checked_pow((*scale + 1) as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal64(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal128(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal128Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i128::from(10).checked_pow((*scale + 1) as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal128(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
+            DataType::Decimal256(precision, scale) => {
+                Self::validate_decimal_or_internal_err::<Decimal256Type>(
+                    *precision, *scale,
+                )?;
+                assert_or_internal_err!(*scale >= 0, "Negative scale is not supported");
+                match i256::from(10).checked_pow((*scale + 1) as u32) {
+                    Some(value) => {
+                        ScalarValue::Decimal256(Some(value), *precision, *scale)
+                    }
+                    None => return _internal_err!("Unsupported scale {scale}"),
+                }
+            }
             _ => {
                 return _not_impl_err!(
-                    "Can't create a ten scalar from data_type \"{datatype:?}\""
+                    "Can't create a ten scalar from data_type \"{datatype}\""
                 );
             }
         })
@@ -1443,6 +1924,12 @@ impl ScalarValue {
             ScalarValue::Int16(_) => DataType::Int16,
             ScalarValue::Int32(_) => DataType::Int32,
             ScalarValue::Int64(_) => DataType::Int64,
+            ScalarValue::Decimal32(_, precision, scale) => {
+                DataType::Decimal32(*precision, *scale)
+            }
+            ScalarValue::Decimal64(_, precision, scale) => {
+                DataType::Decimal64(*precision, *scale)
+            }
             ScalarValue::Decimal128(_, precision, scale) => {
                 DataType::Decimal128(*precision, *scale)
             }
@@ -1503,6 +1990,12 @@ impl ScalarValue {
             ScalarValue::Dictionary(k, v) => {
                 DataType::Dictionary(k.clone(), Box::new(v.data_type()))
             }
+            ScalarValue::RunEndEncoded(run_ends_field, value_field, _) => {
+                DataType::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                )
+            }
             ScalarValue::Null => DataType::Null,
         }
     }
@@ -1524,9 +2017,7 @@ impl ScalarValue {
             | ScalarValue::Float16(None)
             | ScalarValue::Float32(None)
             | ScalarValue::Float64(None) => Ok(self.clone()),
-            ScalarValue::Float16(Some(v)) => {
-                Ok(ScalarValue::Float16(Some(f16::from_f32(-v.to_f32()))))
-            }
+            ScalarValue::Float16(Some(v)) => Ok(ScalarValue::Float16(Some(-v))),
             ScalarValue::Float64(Some(v)) => Ok(ScalarValue::Float64(Some(-v))),
             ScalarValue::Float32(Some(v)) => Ok(ScalarValue::Float32(Some(-v))),
             ScalarValue::Int8(Some(v)) => Ok(ScalarValue::Int8(Some(v.neg_checked()?))),
@@ -1565,6 +2056,24 @@ impl ScalarValue {
                 );
                 Ok(ScalarValue::IntervalMonthDayNano(Some(val)))
             }
+            ScalarValue::Decimal32(Some(v), precision, scale) => {
+                Ok(ScalarValue::Decimal32(
+                    Some(neg_checked_with_ctx(*v, || {
+                        format!("In negation of Decimal32({v}, {precision}, {scale})")
+                    })?),
+                    *precision,
+                    *scale,
+                ))
+            }
+            ScalarValue::Decimal64(Some(v), precision, scale) => {
+                Ok(ScalarValue::Decimal64(
+                    Some(neg_checked_with_ctx(*v, || {
+                        format!("In negation of Decimal64({v}, {precision}, {scale})")
+                    })?),
+                    *precision,
+                    *scale,
+                ))
+            }
             ScalarValue::Decimal128(Some(v), precision, scale) => {
                 Ok(ScalarValue::Decimal128(
                     Some(neg_checked_with_ctx(*v, || {
@@ -1629,6 +2138,7 @@ impl ScalarValue {
         let r = add_wrapping(&self.to_scalar()?, &other.borrow().to_scalar()?)?;
         Self::try_from_array(r.as_ref(), 0)
     }
+
     /// Checked addition of `ScalarValue`
     ///
     /// NB: operating on `ScalarValue` directly is not efficient, performance sensitive code
@@ -1716,6 +2226,8 @@ impl ScalarValue {
             ScalarValue::Float16(v) => v.is_none(),
             ScalarValue::Float32(v) => v.is_none(),
             ScalarValue::Float64(v) => v.is_none(),
+            ScalarValue::Decimal32(v, _, _) => v.is_none(),
+            ScalarValue::Decimal64(v, _, _) => v.is_none(),
             ScalarValue::Decimal128(v, _, _) => v.is_none(),
             ScalarValue::Decimal256(v, _, _) => v.is_none(),
             ScalarValue::Int8(v) => v.is_none(),
@@ -1762,6 +2274,7 @@ impl ScalarValue {
                 None => true,
             },
             ScalarValue::Dictionary(_, v) => v.is_null(),
+            ScalarValue::RunEndEncoded(_, _, v) => v.is_null(),
         }
     }
 
@@ -1792,6 +2305,26 @@ impl ScalarValue {
             (Self::Float64(Some(l)), Self::Float64(Some(r))) => {
                 Some((l - r).abs().round() as _)
             }
+            (
+                Self::Decimal128(Some(l), lprecision, lscale),
+                Self::Decimal128(Some(r), rprecision, rscale),
+            ) => {
+                if lprecision == rprecision && lscale == rscale {
+                    l.checked_sub(*r)?.checked_abs()?.to_usize()
+                } else {
+                    None
+                }
+            }
+            (
+                Self::Decimal256(Some(l), lprecision, lscale),
+                Self::Decimal256(Some(r), rprecision, rscale),
+            ) => {
+                if lprecision == rprecision && lscale == rscale {
+                    l.checked_sub(*r)?.checked_abs()?.to_usize()
+                } else {
+                    None
+                }
+            }
             _ => None,
         }
     }
@@ -1816,23 +2349,16 @@ impl ScalarValue {
     ///
     /// # Example
     /// ```
-    /// use datafusion_common::ScalarValue;
     /// use arrow::array::{BooleanArray, Int32Array};
+    /// use datafusion_common::ScalarValue;
     ///
     /// let arr = Int32Array::from(vec![Some(1), None, Some(10)]);
     /// let five = ScalarValue::Int32(Some(5));
     ///
-    /// let result = arrow::compute::kernels::cmp::lt(
-    ///   &arr,
-    ///   &five.to_scalar().unwrap(),
-    /// ).unwrap();
+    /// let result =
+    ///     arrow::compute::kernels::cmp::lt(&arr, &five.to_scalar().unwrap()).unwrap();
     ///
-    /// let expected = BooleanArray::from(vec![
-    ///     Some(true),
-    ///     None,
-    ///     Some(false)
-    ///   ]
-    /// );
+    /// let expected = BooleanArray::from(vec![Some(true), None, Some(false)]);
     ///
     /// assert_eq!(&result, &expected);
     /// ```
@@ -1848,32 +2374,22 @@ impl ScalarValue {
     /// Returns an error if the iterator is empty or if the
     /// [`ScalarValue`]s are not all the same type
     ///
-    /// # Panics
-    ///
-    /// Panics if `self` is a dictionary with invalid key type
-    ///
     /// # Example
     /// ```
-    /// use datafusion_common::ScalarValue;
     /// use arrow::array::{ArrayRef, BooleanArray};
+    /// use datafusion_common::ScalarValue;
     ///
     /// let scalars = vec![
-    ///   ScalarValue::Boolean(Some(true)),
-    ///   ScalarValue::Boolean(None),
-    ///   ScalarValue::Boolean(Some(false)),
+    ///     ScalarValue::Boolean(Some(true)),
+    ///     ScalarValue::Boolean(None),
+    ///     ScalarValue::Boolean(Some(false)),
     /// ];
     ///
     /// // Build an Array from the list of ScalarValues
-    /// let array = ScalarValue::iter_to_array(scalars.into_iter())
-    ///   .unwrap();
+    /// let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap();
     ///
-    /// let expected: ArrayRef = std::sync::Arc::new(
-    ///   BooleanArray::from(vec![
-    ///     Some(true),
-    ///     None,
-    ///     Some(false)
-    ///   ]
-    /// ));
+    /// let expected: ArrayRef =
+    ///     std::sync::Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)]));
     ///
     /// assert_eq!(&array, &expected);
     /// ```
@@ -1895,18 +2411,20 @@ impl ScalarValue {
         macro_rules! build_array_primitive {
             ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{
                 {
-                    let array = scalars.map(|sv| {
-                        if let ScalarValue::$SCALAR_TY(v) = sv {
-                            Ok(v)
-                        } else {
-                            _exec_err!(
-                                "Inconsistent types in ScalarValue::iter_to_array. \
+                    let array = scalars
+                        .map(|sv| {
+                            if let ScalarValue::$SCALAR_TY(v) = sv {
+                                Ok(v)
+                            } else {
+                                _exec_err!(
+                                    "Inconsistent types in ScalarValue::iter_to_array. \
                                     Expected {:?}, got {:?}",
-                                data_type, sv
-                            )
-                        }
-                    })
-                    .collect::<Result<$ARRAY_TY>>()?;
+                                    data_type,
+                                    sv
+                                )
+                            }
+                        })
+                        .collect::<Result<$ARRAY_TY>>()?;
                     Arc::new(array)
                 }
             }};
@@ -1915,18 +2433,20 @@ impl ScalarValue {
         macro_rules! build_array_primitive_tz {
             ($ARRAY_TY:ident, $SCALAR_TY:ident, $TZ:expr) => {{
                 {
-                    let array = scalars.map(|sv| {
-                        if let ScalarValue::$SCALAR_TY(v, _) = sv {
-                            Ok(v)
-                        } else {
-                            _exec_err!(
-                                "Inconsistent types in ScalarValue::iter_to_array. \
+                    let array = scalars
+                        .map(|sv| {
+                            if let ScalarValue::$SCALAR_TY(v, _) = sv {
+                                Ok(v)
+                            } else {
+                                _exec_err!(
+                                    "Inconsistent types in ScalarValue::iter_to_array. \
                                     Expected {:?}, got {:?}",
-                                data_type, sv
-                            )
-                        }
-                    })
-                    .collect::<Result<$ARRAY_TY>>()?;
+                                    data_type,
+                                    sv
+                                )
+                            }
+                        })
+                        .collect::<Result<$ARRAY_TY>>()?;
                     Arc::new(array.with_timezone_opt($TZ.clone()))
                 }
             }};
@@ -1937,36 +2457,48 @@ impl ScalarValue {
         macro_rules! build_array_string {
             ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{
                 {
-                    let array = scalars.map(|sv| {
-                        if let ScalarValue::$SCALAR_TY(v) = sv {
-                            Ok(v)
-                        } else {
-                            _exec_err!(
-                                "Inconsistent types in ScalarValue::iter_to_array. \
+                    let array = scalars
+                        .map(|sv| {
+                            if let ScalarValue::$SCALAR_TY(v) = sv {
+                                Ok(v)
+                            } else {
+                                _exec_err!(
+                                    "Inconsistent types in ScalarValue::iter_to_array. \
                                     Expected {:?}, got {:?}",
-                                data_type, sv
-                            )
-                        }
-                    })
-                    .collect::<Result<$ARRAY_TY>>()?;
+                                    data_type,
+                                    sv
+                                )
+                            }
+                        })
+                        .collect::<Result<$ARRAY_TY>>()?;
                     Arc::new(array)
                 }
             }};
         }
 
         let array: ArrayRef = match &data_type {
-            DataType::Decimal128(precision, scale) => {
+            DataType::Decimal32(precision, scale) => {
                 let decimal_array =
-                    ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;
+                    ScalarValue::iter_to_decimal32_array(scalars, *precision, *scale)?;
                 Arc::new(decimal_array)
             }
-            DataType::Decimal256(precision, scale) => {
+            DataType::Decimal64(precision, scale) => {
                 let decimal_array =
-                    ScalarValue::iter_to_decimal256_array(scalars, *precision, *scale)?;
+                    ScalarValue::iter_to_decimal64_array(scalars, *precision, *scale)?;
                 Arc::new(decimal_array)
             }
-            DataType::Null => ScalarValue::iter_to_null_array(scalars)?,
-            DataType::Boolean => build_array_primitive!(BooleanArray, Boolean),
+            DataType::Decimal128(precision, scale) => {
+                let decimal_array =
+                    ScalarValue::iter_to_decimal128_array(scalars, *precision, *scale)?;
+                Arc::new(decimal_array)
+            }
+            DataType::Decimal256(precision, scale) => {
+                let decimal_array =
+                    ScalarValue::iter_to_decimal256_array(scalars, *precision, *scale)?;
+                Arc::new(decimal_array)
+            }
+            DataType::Null => ScalarValue::iter_to_null_array(scalars)?,
+            DataType::Boolean => build_array_primitive!(BooleanArray, Boolean),
             DataType::Float16 => build_array_primitive!(Float16Array, Float16),
             DataType::Float32 => build_array_primitive!(Float32Array, Float32),
             DataType::Float64 => build_array_primitive!(Float64Array, Float64),
@@ -2107,7 +2639,95 @@ impl ScalarValue {
                     DataType::UInt16 => dict_from_values::<UInt16Type>(values)?,
                     DataType::UInt32 => dict_from_values::<UInt32Type>(values)?,
                     DataType::UInt64 => dict_from_values::<UInt64Type>(values)?,
-                    _ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
+                    _ => unreachable!("Invalid dictionary keys type: {}", key_type),
+                }
+            }
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                fn make_run_array<R: RunEndIndexType>(
+                    scalars: impl IntoIterator<Item = ScalarValue>,
+                    run_ends_field: &FieldRef,
+                    values_field: &FieldRef,
+                ) -> Result<ArrayRef> {
+                    let mut scalars = scalars.into_iter();
+
+                    let mut run_ends = vec![];
+                    let mut value_scalars = vec![];
+
+                    let mut len = R::Native::ONE;
+                    let mut current =
+                        if let Some(ScalarValue::RunEndEncoded(_, _, scalar)) =
+                            scalars.next()
+                        {
+                            *scalar
+                        } else {
+                            // We are guaranteed to have one element of correct
+                            // type because we peeked above
+                            unreachable!()
+                        };
+                    for scalar in scalars {
+                        let scalar = match scalar {
+                            ScalarValue::RunEndEncoded(
+                                inner_run_ends_field,
+                                inner_value_field,
+                                scalar,
+                            ) if &inner_run_ends_field == run_ends_field
+                                && &inner_value_field == values_field =>
+                            {
+                                *scalar
+                            }
+                            _ => {
+                                return _exec_err!(
+                                    "Expected RunEndEncoded scalar with run-ends field {run_ends_field} but got: {scalar:?}"
+                                );
+                            }
+                        };
+
+                        // new run
+                        if scalar != current {
+                            run_ends.push(len);
+                            value_scalars.push(current);
+                            current = scalar;
+                        }
+
+                        len = len.add_checked(R::Native::ONE).map_err(|_| {
+                            DataFusionError::Execution(format!(
+                                "Cannot construct RunArray: Overflows run-ends type {}",
+                                run_ends_field.data_type()
+                            ))
+                        })?;
+                    }
+
+                    run_ends.push(len);
+                    value_scalars.push(current);
+
+                    let run_ends = PrimitiveArray::<R>::from_iter_values(run_ends);
+                    let values = ScalarValue::iter_to_array(value_scalars)?;
+
+                    // Using ArrayDataBuilder so we can maintain the fields
+                    let dt = DataType::RunEndEncoded(
+                        Arc::clone(run_ends_field),
+                        Arc::clone(values_field),
+                    );
+                    let builder = ArrayDataBuilder::new(dt)
+                        .len(RunArray::logical_len(&run_ends))
+                        .add_child_data(run_ends.to_data())
+                        .add_child_data(values.to_data());
+                    let run_array = RunArray::<R>::from(builder.build()?);
+
+                    Ok(Arc::new(run_array))
+                }
+
+                match run_ends_field.data_type() {
+                    DataType::Int16 => {
+                        make_run_array::<Int16Type>(scalars, run_ends_field, value_field)?
+                    }
+                    DataType::Int32 => {
+                        make_run_array::<Int32Type>(scalars, run_ends_field, value_field)?
+                    }
+                    DataType::Int64 => {
+                        make_run_array::<Int64Type>(scalars, run_ends_field, value_field)?
+                    }
+                    dt => unreachable!("Invalid run-ends type: {dt}"),
                 }
             }
             DataType::FixedSizeBinary(size) => {
@@ -2118,7 +2738,7 @@ impl ScalarValue {
                         } else {
                             _exec_err!(
                                 "Inconsistent types in ScalarValue::iter_to_array. \
-                                Expected {data_type:?}, got {sv:?}"
+                                Expected {data_type}, got {sv:?}"
                             )
                         }
                     })
@@ -2130,7 +2750,7 @@ impl ScalarValue {
                 Arc::new(array)
             }
             // explicitly enumerate unsupported types so newly added
-            // types must be aknowledged, Time32 and Time64 types are
+            // types must be acknowledged, Time32 and Time64 types are
             // not supported if the TimeUnit is not valid (Time32 can
             // only be used with Second and Millisecond, Time64 only
             // with Microsecond and Nanosecond)
@@ -2138,7 +2758,6 @@ impl ScalarValue {
             | DataType::Time32(TimeUnit::Nanosecond)
             | DataType::Time64(TimeUnit::Second)
             | DataType::Time64(TimeUnit::Millisecond)
-            | DataType::RunEndEncoded(_, _)
             | DataType::ListView(_)
             | DataType::LargeListView(_) => {
                 return _not_impl_err!(
@@ -2166,71 +2785,78 @@ impl ScalarValue {
         Ok(new_null_array(&DataType::Null, length))
     }
 
-    fn iter_to_decimal_array(
+    fn iter_to_decimal32_array(
         scalars: impl IntoIterator<Item = ScalarValue>,
         precision: u8,
         scale: i8,
-    ) -> Result<Decimal128Array> {
+    ) -> Result<Decimal32Array> {
         let array = scalars
             .into_iter()
             .map(|element: ScalarValue| match element {
-                ScalarValue::Decimal128(v1, _, _) => Ok(v1),
+                ScalarValue::Decimal32(v1, _, _) => Ok(v1),
                 s => {
                     _internal_err!("Expected ScalarValue::Null element. Received {s:?}")
                 }
             })
-            .collect::<Result<Decimal128Array>>()?
+            .collect::<Result<Decimal32Array>>()?
             .with_precision_and_scale(precision, scale)?;
         Ok(array)
     }
 
-    fn iter_to_decimal256_array(
+    fn iter_to_decimal64_array(
         scalars: impl IntoIterator<Item = ScalarValue>,
         precision: u8,
         scale: i8,
-    ) -> Result<Decimal256Array> {
+    ) -> Result<Decimal64Array> {
         let array = scalars
             .into_iter()
             .map(|element: ScalarValue| match element {
-                ScalarValue::Decimal256(v1, _, _) => Ok(v1),
+                ScalarValue::Decimal64(v1, _, _) => Ok(v1),
                 s => {
-                    _internal_err!(
-                        "Expected ScalarValue::Decimal256 element. Received {s:?}"
-                    )
+                    _internal_err!("Expected ScalarValue::Null element. Received {s:?}")
                 }
             })
-            .collect::<Result<Decimal256Array>>()?
+            .collect::<Result<Decimal64Array>>()?
             .with_precision_and_scale(precision, scale)?;
         Ok(array)
     }
 
-    fn build_decimal_array(
-        value: Option<i128>,
+    fn iter_to_decimal128_array(
+        scalars: impl IntoIterator<Item = ScalarValue>,
         precision: u8,
         scale: i8,
-        size: usize,
     ) -> Result<Decimal128Array> {
-        Ok(match value {
-            Some(val) => Decimal128Array::from(vec![val; size])
-                .with_precision_and_scale(precision, scale)?,
-            None => {
-                let mut builder = Decimal128Array::builder(size)
-                    .with_precision_and_scale(precision, scale)?;
-                builder.append_nulls(size);
-                builder.finish()
-            }
-        })
+        let array = scalars
+            .into_iter()
+            .map(|element: ScalarValue| match element {
+                ScalarValue::Decimal128(v1, _, _) => Ok(v1),
+                s => {
+                    _internal_err!("Expected ScalarValue::Null element. Received {s:?}")
+                }
+            })
+            .collect::<Result<Decimal128Array>>()?
+            .with_precision_and_scale(precision, scale)?;
+        Ok(array)
     }
 
-    fn build_decimal256_array(
-        value: Option<i256>,
+    fn iter_to_decimal256_array(
+        scalars: impl IntoIterator<Item = ScalarValue>,
         precision: u8,
         scale: i8,
-        size: usize,
     ) -> Result<Decimal256Array> {
-        Ok(repeat_n(value, size)
-            .collect::<Decimal256Array>()
-            .with_precision_and_scale(precision, scale)?)
+        let array = scalars
+            .into_iter()
+            .map(|element: ScalarValue| match element {
+                ScalarValue::Decimal256(v1, _, _) => Ok(v1),
+                s => {
+                    _internal_err!(
+                        "Expected ScalarValue::Decimal256 element. Received {s:?}"
+                    )
+                }
+            })
+            .collect::<Result<Decimal256Array>>()?
+            .with_precision_and_scale(precision, scale)?;
+        Ok(array)
     }
 
     /// Converts `Vec<ScalarValue>` where each element has type corresponding to
@@ -2238,23 +2864,24 @@ impl ScalarValue {
     ///
     /// Example
     /// ```
-    /// use datafusion_common::ScalarValue;
-    /// use arrow::array::{ListArray, Int32Array};
+    /// use arrow::array::{Int32Array, ListArray};
     /// use arrow::datatypes::{DataType, Int32Type};
     /// use datafusion_common::cast::as_list_array;
+    /// use datafusion_common::ScalarValue;
     ///
     /// let scalars = vec![
-    ///    ScalarValue::Int32(Some(1)),
-    ///    ScalarValue::Int32(None),
-    ///    ScalarValue::Int32(Some(2))
+    ///     ScalarValue::Int32(Some(1)),
+    ///     ScalarValue::Int32(None),
+    ///     ScalarValue::Int32(Some(2)),
     /// ];
     ///
     /// let result = ScalarValue::new_list(&scalars, &DataType::Int32, true);
     ///
-    /// let expected = ListArray::from_iter_primitive::<Int32Type, _, _>(
-    ///     vec![
-    ///        Some(vec![Some(1), None, Some(2)])
-    ///     ]);
+    /// let expected = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+    ///     Some(1),
+    ///     None,
+    ///     Some(2),
+    /// ])]);
     ///
     /// assert_eq!(*result, expected);
     /// ```
@@ -2298,23 +2925,25 @@ impl ScalarValue {
     ///
     /// Example
     /// ```
-    /// use datafusion_common::ScalarValue;
-    /// use arrow::array::{ListArray, Int32Array};
+    /// use arrow::array::{Int32Array, ListArray};
     /// use arrow::datatypes::{DataType, Int32Type};
     /// use datafusion_common::cast::as_list_array;
+    /// use datafusion_common::ScalarValue;
     ///
     /// let scalars = vec![
-    ///    ScalarValue::Int32(Some(1)),
-    ///    ScalarValue::Int32(None),
-    ///    ScalarValue::Int32(Some(2))
+    ///     ScalarValue::Int32(Some(1)),
+    ///     ScalarValue::Int32(None),
+    ///     ScalarValue::Int32(Some(2)),
     /// ];
     ///
-    /// let result = ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true);
+    /// let result =
+    ///     ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true);
     ///
-    /// let expected = ListArray::from_iter_primitive::<Int32Type, _, _>(
-    ///     vec![
-    ///        Some(vec![Some(1), None, Some(2)])
-    ///     ]);
+    /// let expected = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+    ///     Some(1),
+    ///     None,
+    ///     Some(2),
+    /// ])]);
     ///
     /// assert_eq!(*result, expected);
     /// ```
@@ -2340,23 +2969,25 @@ impl ScalarValue {
     ///
     /// Example
     /// ```
-    /// use datafusion_common::ScalarValue;
-    /// use arrow::array::{LargeListArray, Int32Array};
+    /// use arrow::array::{Int32Array, LargeListArray};
     /// use arrow::datatypes::{DataType, Int32Type};
     /// use datafusion_common::cast::as_large_list_array;
+    /// use datafusion_common::ScalarValue;
     ///
     /// let scalars = vec![
-    ///    ScalarValue::Int32(Some(1)),
-    ///    ScalarValue::Int32(None),
-    ///    ScalarValue::Int32(Some(2))
+    ///     ScalarValue::Int32(Some(1)),
+    ///     ScalarValue::Int32(None),
+    ///     ScalarValue::Int32(Some(2)),
     /// ];
     ///
     /// let result = ScalarValue::new_large_list(&scalars, &DataType::Int32);
     ///
-    /// let expected = LargeListArray::from_iter_primitive::<Int32Type, _, _>(
-    ///     vec![
-    ///        Some(vec![Some(1), None, Some(2)])
-    ///     ]);
+    /// let expected =
+    ///     LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+    ///         Some(1),
+    ///         None,
+    ///         Some(2),
+    ///     ])]);
     ///
     /// assert_eq!(*result, expected);
     /// ```
@@ -2378,20 +3009,51 @@ impl ScalarValue {
     ///
     /// Errors if `self` is
     /// - a decimal that fails be converted to a decimal array of size
-    /// - a `FixedsizeList` that fails to be concatenated into an array of size
+    /// - a `FixedSizeList` that fails to be concatenated into an array of size
     /// - a `List` that fails to be concatenated into an array of size
     /// - a `Dictionary` that fails be converted to a dictionary array of size
     pub fn to_array_of_size(&self, size: usize) -> Result<ArrayRef> {
         Ok(match self {
-            ScalarValue::Decimal128(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal32(Some(e), precision, scale) => Arc::new(
+                Decimal32Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
+            ),
+            ScalarValue::Decimal32(None, precision, scale) => {
+                new_null_array(&DataType::Decimal32(*precision, *scale), size)
+            }
+            ScalarValue::Decimal64(Some(e), precision, scale) => Arc::new(
+                Decimal64Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
-            ScalarValue::Decimal256(e, precision, scale) => Arc::new(
-                ScalarValue::build_decimal256_array(*e, *precision, *scale, size)?,
+            ScalarValue::Decimal64(None, precision, scale) => {
+                new_null_array(&DataType::Decimal64(*precision, *scale), size)
+            }
+            ScalarValue::Decimal128(Some(e), precision, scale) => Arc::new(
+                Decimal128Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
+            ),
+            ScalarValue::Decimal128(None, precision, scale) => {
+                new_null_array(&DataType::Decimal128(*precision, *scale), size)
+            }
+            ScalarValue::Decimal256(Some(e), precision, scale) => Arc::new(
+                Decimal256Array::from_value(*e, size)
+                    .with_precision_and_scale(*precision, *scale)?,
             ),
-            ScalarValue::Boolean(e) => {
-                Arc::new(BooleanArray::from(vec![*e; size])) as ArrayRef
+            ScalarValue::Decimal256(None, precision, scale) => {
+                new_null_array(&DataType::Decimal256(*precision, *scale), size)
             }
+
+            ScalarValue::Boolean(e) => match e {
+                None => new_null_array(&DataType::Boolean, size),
+                Some(true) => {
+                    Arc::new(BooleanArray::new(BooleanBuffer::new_set(size), None))
+                        as ArrayRef
+                }
+                Some(false) => {
+                    Arc::new(BooleanArray::new(BooleanBuffer::new_unset(size), None))
+                        as ArrayRef
+                }
+            },
             ScalarValue::Float64(e) => {
                 build_array_from_option!(Float64, Float64Array, e, size)
             }
@@ -2453,36 +3115,36 @@ impl ScalarValue {
                 )
             }
             ScalarValue::Utf8(e) => match e {
-                Some(value) => {
-                    Arc::new(StringArray::from_iter_values(repeat_n(value, size)))
-                }
+                Some(value) => Arc::new(StringArray::new_repeated(value, size)),
                 None => new_null_array(&DataType::Utf8, size),
             },
             ScalarValue::Utf8View(e) => match e {
                 Some(value) => {
-                    Arc::new(StringViewArray::from_iter_values(repeat_n(value, size)))
+                    let mut builder = StringViewBuilder::with_capacity(size);
+                    builder.try_append_value_n(value, size)?;
+                    let array = builder.finish();
+                    Arc::new(array)
                 }
                 None => new_null_array(&DataType::Utf8View, size),
             },
             ScalarValue::LargeUtf8(e) => match e {
-                Some(value) => {
-                    Arc::new(LargeStringArray::from_iter_values(repeat_n(value, size)))
-                }
+                Some(value) => Arc::new(LargeStringArray::new_repeated(value, size)),
                 None => new_null_array(&DataType::LargeUtf8, size),
             },
             ScalarValue::Binary(e) => match e {
-                Some(value) => Arc::new(
-                    repeat_n(Some(value.as_slice()), size).collect::<BinaryArray>(),
-                ),
-                None => Arc::new(repeat_n(None::<&str>, size).collect::<BinaryArray>()),
+                Some(value) => {
+                    Arc::new(BinaryArray::new_repeated(value.as_slice(), size))
+                }
+                None => new_null_array(&DataType::Binary, size),
             },
             ScalarValue::BinaryView(e) => match e {
-                Some(value) => Arc::new(
-                    repeat_n(Some(value.as_slice()), size).collect::<BinaryViewArray>(),
-                ),
-                None => {
-                    Arc::new(repeat_n(None::<&str>, size).collect::<BinaryViewArray>())
+                Some(value) => {
+                    let mut builder = BinaryViewBuilder::with_capacity(size);
+                    builder.try_append_value_n(value, size)?;
+                    let array = builder.finish();
+                    Arc::new(array)
                 }
+                None => new_null_array(&DataType::BinaryView, size),
             },
             ScalarValue::FixedSizeBinary(s, e) => match e {
                 Some(value) => Arc::new(
@@ -2492,35 +3154,42 @@ impl ScalarValue {
                     )
                     .unwrap(),
                 ),
-                None => Arc::new(
-                    FixedSizeBinaryArray::try_from_sparse_iter_with_size(
-                        repeat_n(None::<&[u8]>, size),
-                        *s,
-                    )
-                    .unwrap(),
-                ),
+                None => Arc::new(FixedSizeBinaryArray::new_null(*s, size)),
             },
             ScalarValue::LargeBinary(e) => match e {
-                Some(value) => Arc::new(
-                    repeat_n(Some(value.as_slice()), size).collect::<LargeBinaryArray>(),
-                ),
-                None => {
-                    Arc::new(repeat_n(None::<&str>, size).collect::<LargeBinaryArray>())
+                Some(value) => {
+                    Arc::new(LargeBinaryArray::new_repeated(value.as_slice(), size))
                 }
+                None => new_null_array(&DataType::LargeBinary, size),
             },
             ScalarValue::List(arr) => {
+                if size == 1 {
+                    return Ok(Arc::clone(arr) as Arc<dyn Array>);
+                }
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?
             }
             ScalarValue::LargeList(arr) => {
+                if size == 1 {
+                    return Ok(Arc::clone(arr) as Arc<dyn Array>);
+                }
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?
             }
             ScalarValue::FixedSizeList(arr) => {
+                if size == 1 {
+                    return Ok(Arc::clone(arr) as Arc<dyn Array>);
+                }
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?
             }
             ScalarValue::Struct(arr) => {
+                if size == 1 {
+                    return Ok(Arc::clone(arr) as Arc<dyn Array>);
+                }
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?
             }
             ScalarValue::Map(arr) => {
+                if size == 1 {
+                    return Ok(Arc::clone(arr) as Arc<dyn Array>);
+                }
                 Self::list_to_array_of_size(arr.as_ref() as &dyn Array, size)?
             }
             ScalarValue::Date32(e) => {
@@ -2646,13 +3315,10 @@ impl ScalarValue {
                         value_offsets,
                         child_arrays,
                     )
-                    .map_err(|e| DataFusionError::ArrowError(e, None))?;
+                    .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?;
                     Arc::new(ar)
                 }
-                None => {
-                    let dt = self.data_type();
-                    new_null_array(&dt, size)
-                }
+                None => new_null_array(&DataType::Union(fields.clone(), *mode), size),
             },
             ScalarValue::Dictionary(key_type, v) => {
                 // values array is one element long (the value)
@@ -2665,10 +3331,58 @@ impl ScalarValue {
                     DataType::UInt16 => dict_from_scalar::<UInt16Type>(v, size)?,
                     DataType::UInt32 => dict_from_scalar::<UInt32Type>(v, size)?,
                     DataType::UInt64 => dict_from_scalar::<UInt64Type>(v, size)?,
-                    _ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
+                    _ => unreachable!("Invalid dictionary keys type: {}", key_type),
+                }
+            }
+            ScalarValue::RunEndEncoded(run_ends_field, values_field, value) => {
+                fn make_run_array<R: RunEndIndexType>(
+                    run_ends_field: &Arc<Field>,
+                    values_field: &Arc<Field>,
+                    value: &ScalarValue,
+                    size: usize,
+                ) -> Result<ArrayRef> {
+                    let size_native = R::Native::from_usize(size)
+                        .ok_or_else(|| DataFusionError::Execution(format!("Cannot construct RunArray of size {size}: Overflows run-ends type {}", R::DATA_TYPE)))?;
+                    let values = value.to_array_of_size(1)?;
+                    let run_ends =
+                        PrimitiveArray::<R>::new(vec![size_native].into(), None);
+
+                    // Using ArrayDataBuilder so we can maintain the fields
+                    let dt = DataType::RunEndEncoded(
+                        Arc::clone(run_ends_field),
+                        Arc::clone(values_field),
+                    );
+                    let builder = ArrayDataBuilder::new(dt)
+                        .len(size)
+                        .add_child_data(run_ends.to_data())
+                        .add_child_data(values.to_data());
+                    let run_array = RunArray::<R>::from(builder.build()?);
+
+                    Ok(Arc::new(run_array))
+                }
+                match run_ends_field.data_type() {
+                    DataType::Int16 => make_run_array::<Int16Type>(
+                        run_ends_field,
+                        values_field,
+                        value,
+                        size,
+                    )?,
+                    DataType::Int32 => make_run_array::<Int32Type>(
+                        run_ends_field,
+                        values_field,
+                        value,
+                        size,
+                    )?,
+                    DataType::Int64 => make_run_array::<Int64Type>(
+                        run_ends_field,
+                        values_field,
+                        value,
+                        size,
+                    )?,
+                    dt => unreachable!("Invalid run-ends type: {dt}"),
                 }
             }
-            ScalarValue::Null => new_null_array(&DataType::Null, size),
+            ScalarValue::Null => get_or_create_cached_null_array(size),
         })
     }
 
@@ -2679,6 +3393,24 @@ impl ScalarValue {
         scale: i8,
     ) -> Result<ScalarValue> {
         match array.data_type() {
+            DataType::Decimal32(_, _) => {
+                let array = as_decimal32_array(array)?;
+                if array.is_null(index) {
+                    Ok(ScalarValue::Decimal32(None, precision, scale))
+                } else {
+                    let value = array.value(index);
+                    Ok(ScalarValue::Decimal32(Some(value), precision, scale))
+                }
+            }
+            DataType::Decimal64(_, _) => {
+                let array = as_decimal64_array(array)?;
+                if array.is_null(index) {
+                    Ok(ScalarValue::Decimal64(None, precision, scale))
+                } else {
+                    let value = array.value(index);
+                    Ok(ScalarValue::Decimal64(Some(value), precision, scale))
+                }
+            }
             DataType::Decimal128(_, _) => {
                 let array = as_decimal128_array(array)?;
                 if array.is_null(index) {
@@ -2697,46 +3429,59 @@ impl ScalarValue {
                     Ok(ScalarValue::Decimal256(Some(value), precision, scale))
                 }
             }
-            _ => _internal_err!("Unsupported decimal type"),
+            other => {
+                unreachable!("Invalid type isn't decimal: {other:?}")
+            }
         }
     }
 
+    /// Repeats the rows of `arr` `size` times, producing an array with
+    /// `arr.len() * size` total rows.
     fn list_to_array_of_size(arr: &dyn Array, size: usize) -> Result<ArrayRef> {
-        let arrays = repeat_n(arr, size).collect::<Vec<_>>();
-        let ret = match !arrays.is_empty() {
-            true => arrow::compute::concat(arrays.as_slice())?,
-            false => arr.slice(0, 0),
-        };
-        Ok(ret)
+        if size == 0 {
+            return Ok(arr.slice(0, 0));
+        }
+
+        // Examples: given `arr = [[A, B, C]]` and `size = 3`, `indices = [0, 0, 0]` and
+        // the result is `[[A, B, C], [A, B, C], [A, B, C]]`.
+        //
+        // Given `arr = [[A, B], [C]]` and `size = 2`, `indices = [0, 1, 0, 1]` and the
+        // result is `[[A, B], [C], [A, B], [C]]`. (But in practice, we are always called
+        // with `arr.len() == 1`.)
+        let n = arr.len() as u32;
+        let indices = UInt32Array::from_iter_values((0..size).flat_map(|_| 0..n));
+        Ok(arrow::compute::take(arr, &indices, None)?)
     }
 
     /// Retrieve ScalarValue for each row in `array`
     ///
+    /// Elements in `array` may be NULL, in which case the corresponding element in the returned vector is None.
+    ///
     /// Example 1: Array (ScalarValue::Int32)
     /// ```
-    /// use datafusion_common::ScalarValue;
     /// use arrow::array::ListArray;
     /// use arrow::datatypes::{DataType, Int32Type};
+    /// use datafusion_common::ScalarValue;
     ///
     /// // Equivalent to [[1,2,3], [4,5]]
     /// let list_arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
-    ///    Some(vec![Some(1), Some(2), Some(3)]),
-    ///    Some(vec![Some(4), Some(5)])
+    ///     Some(vec![Some(1), Some(2), Some(3)]),
+    ///     Some(vec![Some(4), Some(5)]),
     /// ]);
     ///
     /// // Convert the array into Scalar Values for each row
     /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap();
     ///
     /// let expected = vec![
-    /// vec![
-    ///     ScalarValue::Int32(Some(1)),
-    ///     ScalarValue::Int32(Some(2)),
-    ///     ScalarValue::Int32(Some(3)),
-    /// ],
-    /// vec![
-    ///    ScalarValue::Int32(Some(4)),
-    ///    ScalarValue::Int32(Some(5)),
-    /// ],
+    ///     Some(vec![
+    ///         ScalarValue::Int32(Some(1)),
+    ///         ScalarValue::Int32(Some(2)),
+    ///         ScalarValue::Int32(Some(3)),
+    ///     ]),
+    ///     Some(vec![
+    ///         ScalarValue::Int32(Some(4)),
+    ///         ScalarValue::Int32(Some(5)),
+    ///     ]),
     /// ];
     ///
     /// assert_eq!(scalar_vec, expected);
@@ -2744,15 +3489,15 @@ impl ScalarValue {
     ///
     /// Example 2: Nested array (ScalarValue::List)
     /// ```
-    /// use datafusion_common::ScalarValue;
     /// use arrow::array::ListArray;
     /// use arrow::datatypes::{DataType, Int32Type};
     /// use datafusion_common::utils::SingleRowListArrayBuilder;
+    /// use datafusion_common::ScalarValue;
     /// use std::sync::Arc;
     ///
     /// let list_arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
-    ///    Some(vec![Some(1), Some(2), Some(3)]),
-    ///    Some(vec![Some(4), Some(5)])
+    ///     Some(vec![Some(1), Some(2), Some(3)]),
+    ///     Some(vec![Some(4), Some(5)]),
     /// ]);
     ///
     /// // Wrap into another layer of list, we got nested array as [ [[1,2,3], [4,5]] ]
@@ -2761,34 +3506,82 @@ impl ScalarValue {
     /// // Convert the array into Scalar Values for each row, we got 1D arrays in this example
     /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap();
     ///
-    /// let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+    /// let l1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+    ///     Some(1),
+    ///     Some(2),
+    ///     Some(3),
+    /// ])]);
+    /// let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+    ///     Some(4),
+    ///     Some(5),
+    /// ])]);
+    ///
+    /// let expected = vec![Some(vec![
+    ///     ScalarValue::List(Arc::new(l1)),
+    ///     ScalarValue::List(Arc::new(l2)),
+    /// ])];
+    ///
+    /// assert_eq!(scalar_vec, expected);
+    /// ```
+    ///
+    /// Example 3: Nullable array
+    /// ```
+    /// use arrow::array::ListArray;
+    /// use arrow::datatypes::{DataType, Int32Type};
+    /// use datafusion_common::ScalarValue;
+    ///
+    /// let list_arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
     ///     Some(vec![Some(1), Some(2), Some(3)]),
-    /// ]);
-    /// let l2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+    ///     None,
     ///     Some(vec![Some(4), Some(5)]),
     /// ]);
     ///
+    /// // Convert the array into Scalar Values for each row
+    /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap();
+    ///
     /// let expected = vec![
-    ///   vec![
-    ///     ScalarValue::List(Arc::new(l1)),
-    ///     ScalarValue::List(Arc::new(l2)),
-    ///   ],
+    ///     Some(vec![
+    ///         ScalarValue::Int32(Some(1)),
+    ///         ScalarValue::Int32(Some(2)),
+    ///         ScalarValue::Int32(Some(3)),
+    ///     ]),
+    ///     None,
+    ///     Some(vec![
+    ///         ScalarValue::Int32(Some(4)),
+    ///         ScalarValue::Int32(Some(5)),
+    ///     ]),
     /// ];
     ///
     /// assert_eq!(scalar_vec, expected);
     /// ```
-    pub fn convert_array_to_scalar_vec(array: &dyn Array) -> Result<Vec<Vec<Self>>> {
-        let mut scalars = Vec::with_capacity(array.len());
-
-        for index in 0..array.len() {
-            let nested_array = array.as_list::<i32>().value(index);
-            let scalar_values = (0..nested_array.len())
-                .map(|i| ScalarValue::try_from_array(&nested_array, i))
-                .collect::<Result<Vec<_>>>()?;
-            scalars.push(scalar_values);
+    pub fn convert_array_to_scalar_vec(
+        array: &dyn Array,
+    ) -> Result<Vec<Option<Vec<Self>>>> {
+        fn generic_collect<OffsetSize: OffsetSizeTrait>(
+            array: &dyn Array,
+        ) -> Result<Vec<Option<Vec<ScalarValue>>>> {
+            array
+                .as_list::<OffsetSize>()
+                .iter()
+                .map(|nested_array| {
+                    nested_array
+                        .map(|array| {
+                            (0..array.len())
+                                .map(|i| ScalarValue::try_from_array(&array, i))
+                                .collect::<Result<Vec<_>>>()
+                        })
+                        .transpose()
+                })
+                .collect()
         }
 
-        Ok(scalars)
+        match array.data_type() {
+            DataType::List(_) => generic_collect::<i32>(array),
+            DataType::LargeList(_) => generic_collect::<i64>(array),
+            _ => _internal_err!(
+                "ScalarValue::convert_array_to_scalar_vec input must be a List/LargeList type"
+            ),
+        }
     }
 
     #[deprecated(
@@ -2805,12 +3598,22 @@ impl ScalarValue {
     /// Converts a value in `array` at `index` into a ScalarValue
     pub fn try_from_array(array: &dyn Array, index: usize) -> Result<Self> {
         // handle NULL value
-        if !array.is_valid(index) {
+        if array.is_null(index) {
             return array.data_type().try_into();
         }
 
         Ok(match array.data_type() {
             DataType::Null => ScalarValue::Null,
+            DataType::Decimal32(precision, scale) => {
+                ScalarValue::get_decimal_value_from_array(
+                    array, index, *precision, *scale,
+                )?
+            }
+            DataType::Decimal64(precision, scale) => {
+                ScalarValue::get_decimal_value_from_array(
+                    array, index, *precision, *scale,
+                )?
+            }
             DataType::Decimal128(precision, scale) => {
                 ScalarValue::get_decimal_value_from_array(
                     array, index, *precision, *scale,
@@ -2821,30 +3624,32 @@ impl ScalarValue {
                     array, index, *precision, *scale,
                 )?
             }
-            DataType::Boolean => typed_cast!(array, index, BooleanArray, Boolean)?,
-            DataType::Float64 => typed_cast!(array, index, Float64Array, Float64)?,
-            DataType::Float32 => typed_cast!(array, index, Float32Array, Float32)?,
-            DataType::Float16 => typed_cast!(array, index, Float16Array, Float16)?,
-            DataType::UInt64 => typed_cast!(array, index, UInt64Array, UInt64)?,
-            DataType::UInt32 => typed_cast!(array, index, UInt32Array, UInt32)?,
-            DataType::UInt16 => typed_cast!(array, index, UInt16Array, UInt16)?,
-            DataType::UInt8 => typed_cast!(array, index, UInt8Array, UInt8)?,
-            DataType::Int64 => typed_cast!(array, index, Int64Array, Int64)?,
-            DataType::Int32 => typed_cast!(array, index, Int32Array, Int32)?,
-            DataType::Int16 => typed_cast!(array, index, Int16Array, Int16)?,
-            DataType::Int8 => typed_cast!(array, index, Int8Array, Int8)?,
-            DataType::Binary => typed_cast!(array, index, BinaryArray, Binary)?,
+            DataType::Boolean => typed_cast!(array, index, as_boolean_array, Boolean)?,
+            DataType::Float64 => typed_cast!(array, index, as_float64_array, Float64)?,
+            DataType::Float32 => typed_cast!(array, index, as_float32_array, Float32)?,
+            DataType::Float16 => typed_cast!(array, index, as_float16_array, Float16)?,
+            DataType::UInt64 => typed_cast!(array, index, as_uint64_array, UInt64)?,
+            DataType::UInt32 => typed_cast!(array, index, as_uint32_array, UInt32)?,
+            DataType::UInt16 => typed_cast!(array, index, as_uint16_array, UInt16)?,
+            DataType::UInt8 => typed_cast!(array, index, as_uint8_array, UInt8)?,
+            DataType::Int64 => typed_cast!(array, index, as_int64_array, Int64)?,
+            DataType::Int32 => typed_cast!(array, index, as_int32_array, Int32)?,
+            DataType::Int16 => typed_cast!(array, index, as_int16_array, Int16)?,
+            DataType::Int8 => typed_cast!(array, index, as_int8_array, Int8)?,
+            DataType::Binary => typed_cast!(array, index, as_binary_array, Binary)?,
             DataType::LargeBinary => {
-                typed_cast!(array, index, LargeBinaryArray, LargeBinary)?
+                typed_cast!(array, index, as_large_binary_array, LargeBinary)?
             }
             DataType::BinaryView => {
-                typed_cast!(array, index, BinaryViewArray, BinaryView)?
+                typed_cast!(array, index, as_binary_view_array, BinaryView)?
             }
-            DataType::Utf8 => typed_cast!(array, index, StringArray, Utf8)?,
+            DataType::Utf8 => typed_cast!(array, index, as_string_array, Utf8)?,
             DataType::LargeUtf8 => {
-                typed_cast!(array, index, LargeStringArray, LargeUtf8)?
+                typed_cast!(array, index, as_large_string_array, LargeUtf8)?
+            }
+            DataType::Utf8View => {
+                typed_cast!(array, index, as_string_view_array, Utf8View)?
             }
-            DataType::Utf8View => typed_cast!(array, index, StringViewArray, Utf8View)?,
             DataType::List(field) => {
                 let list_array = array.as_list::<i32>();
                 let nested_array = list_array.value(index);
@@ -2854,7 +3659,7 @@ impl ScalarValue {
                     .build_list_scalar()
             }
             DataType::LargeList(field) => {
-                let list_array = as_large_list_array(array);
+                let list_array = as_large_list_array(array)?;
                 let nested_array = list_array.value(index);
                 // Produces a single element `LargeListArray` with the value at `index`.
                 SingleRowListArrayBuilder::new(nested_array)
@@ -2871,45 +3676,61 @@ impl ScalarValue {
                     .with_field(field)
                     .build_fixed_size_list_scalar(list_size)
             }
-            DataType::Date32 => typed_cast!(array, index, Date32Array, Date32)?,
-            DataType::Date64 => typed_cast!(array, index, Date64Array, Date64)?,
+            DataType::ListView(field) => {
+                let list_array = array.as_list_view::<i32>();
+                let nested_array = list_array.value(index);
+                // Store as List scalar since ScalarValue has no ListView variant.
+                SingleRowListArrayBuilder::new(nested_array)
+                    .with_field(field)
+                    .build_list_scalar()
+            }
+            DataType::LargeListView(field) => {
+                let list_array = array.as_list_view::<i64>();
+                let nested_array = list_array.value(index);
+                // Store as LargeList scalar since ScalarValue has no LargeListView variant.
+                SingleRowListArrayBuilder::new(nested_array)
+                    .with_field(field)
+                    .build_large_list_scalar()
+            }
+            DataType::Date32 => typed_cast!(array, index, as_date32_array, Date32)?,
+            DataType::Date64 => typed_cast!(array, index, as_date64_array, Date64)?,
             DataType::Time32(TimeUnit::Second) => {
-                typed_cast!(array, index, Time32SecondArray, Time32Second)?
+                typed_cast!(array, index, as_time32_second_array, Time32Second)?
             }
             DataType::Time32(TimeUnit::Millisecond) => {
-                typed_cast!(array, index, Time32MillisecondArray, Time32Millisecond)?
+                typed_cast!(array, index, as_time32_millisecond_array, Time32Millisecond)?
             }
             DataType::Time64(TimeUnit::Microsecond) => {
-                typed_cast!(array, index, Time64MicrosecondArray, Time64Microsecond)?
+                typed_cast!(array, index, as_time64_microsecond_array, Time64Microsecond)?
             }
             DataType::Time64(TimeUnit::Nanosecond) => {
-                typed_cast!(array, index, Time64NanosecondArray, Time64Nanosecond)?
+                typed_cast!(array, index, as_time64_nanosecond_array, Time64Nanosecond)?
             }
             DataType::Timestamp(TimeUnit::Second, tz_opt) => typed_cast_tz!(
                 array,
                 index,
-                TimestampSecondArray,
+                as_timestamp_second_array,
                 TimestampSecond,
                 tz_opt
             )?,
             DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => typed_cast_tz!(
                 array,
                 index,
-                TimestampMillisecondArray,
+                as_timestamp_millisecond_array,
                 TimestampMillisecond,
                 tz_opt
             )?,
             DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => typed_cast_tz!(
                 array,
                 index,
-                TimestampMicrosecondArray,
+                as_timestamp_microsecond_array,
                 TimestampMicrosecond,
                 tz_opt
             )?,
             DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => typed_cast_tz!(
                 array,
                 index,
-                TimestampNanosecondArray,
+                as_timestamp_nanosecond_array,
                 TimestampNanosecond,
                 tz_opt
             )?,
@@ -2923,7 +3744,7 @@ impl ScalarValue {
                     DataType::UInt16 => get_dict_value::<UInt16Type>(array, index)?,
                     DataType::UInt32 => get_dict_value::<UInt32Type>(array, index)?,
                     DataType::UInt64 => get_dict_value::<UInt64Type>(array, index)?,
-                    _ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
+                    _ => unreachable!("Invalid dictionary keys type: {}", key_type),
                 };
                 // look up the index in the values dictionary
                 let value = match values_index {
@@ -2936,6 +3757,28 @@ impl ScalarValue {
 
                 Self::Dictionary(key_type.clone(), Box::new(value))
             }
+            DataType::RunEndEncoded(run_ends_field, value_field) => {
+                // Explicitly check length here since get_physical_index() doesn't
+                // bound check for us
+                if index > array.len() {
+                    return _exec_err!(
+                        "Index {index} out of bounds for array of length {}",
+                        array.len()
+                    );
+                }
+                let scalar = downcast_run_array!(
+                    array => {
+                        let index = array.get_physical_index(index);
+                        ScalarValue::try_from_array(array.values(), index)?
+                    },
+                    dt => unreachable!("Invalid run-ends type: {dt}")
+                );
+                Self::RunEndEncoded(
+                    Arc::clone(run_ends_field),
+                    Arc::clone(value_field),
+                    Box::new(scalar),
+                )
+            }
             DataType::Struct(_) => {
                 let a = array.slice(index, 1);
                 Self::Struct(Arc::new(a.as_struct().to_owned()))
@@ -2955,36 +3798,42 @@ impl ScalarValue {
                 )
             }
             DataType::Interval(IntervalUnit::DayTime) => {
-                typed_cast!(array, index, IntervalDayTimeArray, IntervalDayTime)?
+                typed_cast!(array, index, as_interval_dt_array, IntervalDayTime)?
             }
             DataType::Interval(IntervalUnit::YearMonth) => {
-                typed_cast!(array, index, IntervalYearMonthArray, IntervalYearMonth)?
+                typed_cast!(array, index, as_interval_ym_array, IntervalYearMonth)?
+            }
+            DataType::Interval(IntervalUnit::MonthDayNano) => {
+                typed_cast!(array, index, as_interval_mdn_array, IntervalMonthDayNano)?
             }
-            DataType::Interval(IntervalUnit::MonthDayNano) => typed_cast!(
-                array,
-                index,
-                IntervalMonthDayNanoArray,
-                IntervalMonthDayNano
-            )?,
 
             DataType::Duration(TimeUnit::Second) => {
-                typed_cast!(array, index, DurationSecondArray, DurationSecond)?
-            }
-            DataType::Duration(TimeUnit::Millisecond) => {
-                typed_cast!(array, index, DurationMillisecondArray, DurationMillisecond)?
-            }
-            DataType::Duration(TimeUnit::Microsecond) => {
-                typed_cast!(array, index, DurationMicrosecondArray, DurationMicrosecond)?
-            }
-            DataType::Duration(TimeUnit::Nanosecond) => {
-                typed_cast!(array, index, DurationNanosecondArray, DurationNanosecond)?
+                typed_cast!(array, index, as_duration_second_array, DurationSecond)?
             }
+            DataType::Duration(TimeUnit::Millisecond) => typed_cast!(
+                array,
+                index,
+                as_duration_millisecond_array,
+                DurationMillisecond
+            )?,
+            DataType::Duration(TimeUnit::Microsecond) => typed_cast!(
+                array,
+                index,
+                as_duration_microsecond_array,
+                DurationMicrosecond
+            )?,
+            DataType::Duration(TimeUnit::Nanosecond) => typed_cast!(
+                array,
+                index,
+                as_duration_nanosecond_array,
+                DurationNanosecond
+            )?,
             DataType::Map(_, _) => {
                 let a = array.slice(index, 1);
                 Self::Map(Arc::new(a.as_map().to_owned()))
             }
             DataType::Union(fields, mode) => {
-                let array = as_union_array(array);
+                let array = as_union_array(array)?;
                 let ti = array.type_id(index);
                 let index = array.value_offset(index);
                 let value = ScalarValue::try_from_array(array.child(ti), index)?;
@@ -3042,6 +3891,7 @@ impl ScalarValue {
             ScalarValue::LargeUtf8(v) => v,
             ScalarValue::Utf8View(v) => v,
             ScalarValue::Dictionary(_, v) => return v.try_as_str(),
+            ScalarValue::RunEndEncoded(_, _, v) => return v.try_as_str(),
             _ => return None,
         };
         Some(v.as_ref().map(|v| v.as_str()))
@@ -3058,55 +3908,50 @@ impl ScalarValue {
         target_type: &DataType,
         cast_options: &CastOptions<'static>,
     ) -> Result<Self> {
-        let scalar_array = match (self, target_type) {
-            (
-                ScalarValue::Float64(Some(float_ts)),
-                DataType::Timestamp(TimeUnit::Nanosecond, None),
-            ) => ScalarValue::Int64(Some((float_ts * 1_000_000_000_f64).trunc() as i64))
-                .to_array()?,
-            (
-                ScalarValue::Decimal128(Some(decimal_value), _, scale),
-                DataType::Timestamp(time_unit, None),
-            ) => {
-                let scale_factor = 10_i128.pow(*scale as u32);
-                let seconds = decimal_value / scale_factor;
-                let fraction = decimal_value % scale_factor;
-
-                let timestamp_value = match time_unit {
-                    TimeUnit::Second => ScalarValue::Int64(Some(seconds as i64)),
-                    TimeUnit::Millisecond => {
-                        let millis = seconds * 1_000 + (fraction * 1_000) / scale_factor;
-                        ScalarValue::Int64(Some(millis as i64))
-                    }
-                    TimeUnit::Microsecond => {
-                        let micros =
-                            seconds * 1_000_000 + (fraction * 1_000_000) / scale_factor;
-                        ScalarValue::Int64(Some(micros as i64))
-                    }
-                    TimeUnit::Nanosecond => {
-                        let nanos = seconds * 1_000_000_000
-                            + (fraction * 1_000_000_000) / scale_factor;
-                        ScalarValue::Int64(Some(nanos as i64))
-                    }
-                };
+        let source_type = self.data_type();
+        if let Some(multiplier) = date_to_timestamp_multiplier(&source_type, target_type)
+            && let Some(value) = self.date_scalar_value_as_i64()
+        {
+            ensure_timestamp_in_bounds(value, multiplier, &source_type, target_type)?;
+        }
 
-                timestamp_value.to_array()?
+        let scalar_array = self.to_array()?;
+
+        // For struct types, use name-based casting logic that matches fields by name
+        // and recursively casts nested structs. The field name wrapper is arbitrary
+        // since cast_column only uses the DataType::Struct field definitions inside.
+        let cast_arr = match target_type {
+            DataType::Struct(_) => {
+                // Field name is unused; only the struct's inner field names matter
+                let target_field = Field::new("_", target_type.clone(), true);
+                crate::nested_struct::cast_column(
+                    &scalar_array,
+                    &target_field,
+                    cast_options,
+                )?
             }
-            _ => self.to_array()?,
+            _ => cast_with_options(&scalar_array, target_type, cast_options)?,
         };
 
-        let cast_arr = cast_with_options(&scalar_array, target_type, cast_options)?;
         ScalarValue::try_from_array(&cast_arr, 0)
     }
 
-    fn eq_array_decimal(
+    fn date_scalar_value_as_i64(&self) -> Option<i64> {
+        match self {
+            ScalarValue::Date32(Some(value)) => Some(i64::from(*value)),
+            ScalarValue::Date64(Some(value)) => Some(*value),
+            _ => None,
+        }
+    }
+
+    fn eq_array_decimal32(
         array: &ArrayRef,
         index: usize,
-        value: Option<&i128>,
+        value: Option<&i32>,
         precision: u8,
         scale: i8,
     ) -> Result<bool> {
-        let array = as_decimal128_array(array)?;
+        let array = as_decimal32_array(array)?;
         if array.precision() != precision || array.scale() != scale {
             return Ok(false);
         }
@@ -3118,14 +3963,52 @@ impl ScalarValue {
         }
     }
 
-    fn eq_array_decimal256(
+    fn eq_array_decimal64(
         array: &ArrayRef,
         index: usize,
-        value: Option<&i256>,
+        value: Option<&i64>,
         precision: u8,
         scale: i8,
     ) -> Result<bool> {
-        let array = as_decimal256_array(array)?;
+        let array = as_decimal64_array(array)?;
+        if array.precision() != precision || array.scale() != scale {
+            return Ok(false);
+        }
+        let is_null = array.is_null(index);
+        if let Some(v) = value {
+            Ok(!array.is_null(index) && array.value(index) == *v)
+        } else {
+            Ok(is_null)
+        }
+    }
+
+    fn eq_array_decimal(
+        array: &ArrayRef,
+        index: usize,
+        value: Option<&i128>,
+        precision: u8,
+        scale: i8,
+    ) -> Result<bool> {
+        let array = as_decimal128_array(array)?;
+        if array.precision() != precision || array.scale() != scale {
+            return Ok(false);
+        }
+        let is_null = array.is_null(index);
+        if let Some(v) = value {
+            Ok(!array.is_null(index) && array.value(index) == *v)
+        } else {
+            Ok(is_null)
+        }
+    }
+
+    fn eq_array_decimal256(
+        array: &ArrayRef,
+        index: usize,
+        value: Option<&i256>,
+        precision: u8,
+        scale: i8,
+    ) -> Result<bool> {
+        let array = as_decimal256_array(array)?;
         if array.precision() != precision || array.scale() != scale {
             return Ok(false);
         }
@@ -3166,6 +4049,24 @@ impl ScalarValue {
     #[inline]
     pub fn eq_array(&self, array: &ArrayRef, index: usize) -> Result<bool> {
         Ok(match self {
+            ScalarValue::Decimal32(v, precision, scale) => {
+                ScalarValue::eq_array_decimal32(
+                    array,
+                    index,
+                    v.as_ref(),
+                    *precision,
+                    *scale,
+                )?
+            }
+            ScalarValue::Decimal64(v, precision, scale) => {
+                ScalarValue::eq_array_decimal64(
+                    array,
+                    index,
+                    v.as_ref(),
+                    *precision,
+                    *scale,
+                )?
+            }
             ScalarValue::Decimal128(v, precision, scale) => {
                 ScalarValue::eq_array_decimal(
                     array,
@@ -3185,59 +4086,61 @@ impl ScalarValue {
                 )?
             }
             ScalarValue::Boolean(val) => {
-                eq_array_primitive!(array, index, BooleanArray, val)?
+                eq_array_primitive!(array, index, as_boolean_array, val)?
             }
             ScalarValue::Float16(val) => {
-                eq_array_primitive!(array, index, Float16Array, val)?
+                eq_array_primitive!(array, index, as_float16_array, val)?
             }
             ScalarValue::Float32(val) => {
-                eq_array_primitive!(array, index, Float32Array, val)?
+                eq_array_primitive!(array, index, as_float32_array, val)?
             }
             ScalarValue::Float64(val) => {
-                eq_array_primitive!(array, index, Float64Array, val)?
+                eq_array_primitive!(array, index, as_float64_array, val)?
+            }
+            ScalarValue::Int8(val) => {
+                eq_array_primitive!(array, index, as_int8_array, val)?
             }
-            ScalarValue::Int8(val) => eq_array_primitive!(array, index, Int8Array, val)?,
             ScalarValue::Int16(val) => {
-                eq_array_primitive!(array, index, Int16Array, val)?
+                eq_array_primitive!(array, index, as_int16_array, val)?
             }
             ScalarValue::Int32(val) => {
-                eq_array_primitive!(array, index, Int32Array, val)?
+                eq_array_primitive!(array, index, as_int32_array, val)?
             }
             ScalarValue::Int64(val) => {
-                eq_array_primitive!(array, index, Int64Array, val)?
+                eq_array_primitive!(array, index, as_int64_array, val)?
             }
             ScalarValue::UInt8(val) => {
-                eq_array_primitive!(array, index, UInt8Array, val)?
+                eq_array_primitive!(array, index, as_uint8_array, val)?
             }
             ScalarValue::UInt16(val) => {
-                eq_array_primitive!(array, index, UInt16Array, val)?
+                eq_array_primitive!(array, index, as_uint16_array, val)?
             }
             ScalarValue::UInt32(val) => {
-                eq_array_primitive!(array, index, UInt32Array, val)?
+                eq_array_primitive!(array, index, as_uint32_array, val)?
             }
             ScalarValue::UInt64(val) => {
-                eq_array_primitive!(array, index, UInt64Array, val)?
+                eq_array_primitive!(array, index, as_uint64_array, val)?
             }
             ScalarValue::Utf8(val) => {
-                eq_array_primitive!(array, index, StringArray, val)?
+                eq_array_primitive!(array, index, as_string_array, val)?
             }
             ScalarValue::Utf8View(val) => {
-                eq_array_primitive!(array, index, StringViewArray, val)?
+                eq_array_primitive!(array, index, as_string_view_array, val)?
             }
             ScalarValue::LargeUtf8(val) => {
-                eq_array_primitive!(array, index, LargeStringArray, val)?
+                eq_array_primitive!(array, index, as_large_string_array, val)?
             }
             ScalarValue::Binary(val) => {
-                eq_array_primitive!(array, index, BinaryArray, val)?
+                eq_array_primitive!(array, index, as_binary_array, val)?
             }
             ScalarValue::BinaryView(val) => {
-                eq_array_primitive!(array, index, BinaryViewArray, val)?
+                eq_array_primitive!(array, index, as_binary_view_array, val)?
             }
             ScalarValue::FixedSizeBinary(_, val) => {
-                eq_array_primitive!(array, index, FixedSizeBinaryArray, val)?
+                eq_array_primitive!(array, index, as_fixed_size_binary_array, val)?
             }
             ScalarValue::LargeBinary(val) => {
-                eq_array_primitive!(array, index, LargeBinaryArray, val)?
+                eq_array_primitive!(array, index, as_large_binary_array, val)?
             }
             ScalarValue::List(arr) => {
                 Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
@@ -3255,58 +4158,58 @@ impl ScalarValue {
                 Self::eq_array_list(&(arr.to_owned() as ArrayRef), array, index)
             }
             ScalarValue::Date32(val) => {
-                eq_array_primitive!(array, index, Date32Array, val)?
+                eq_array_primitive!(array, index, as_date32_array, val)?
             }
             ScalarValue::Date64(val) => {
-                eq_array_primitive!(array, index, Date64Array, val)?
+                eq_array_primitive!(array, index, as_date64_array, val)?
             }
             ScalarValue::Time32Second(val) => {
-                eq_array_primitive!(array, index, Time32SecondArray, val)?
+                eq_array_primitive!(array, index, as_time32_second_array, val)?
             }
             ScalarValue::Time32Millisecond(val) => {
-                eq_array_primitive!(array, index, Time32MillisecondArray, val)?
+                eq_array_primitive!(array, index, as_time32_millisecond_array, val)?
             }
             ScalarValue::Time64Microsecond(val) => {
-                eq_array_primitive!(array, index, Time64MicrosecondArray, val)?
+                eq_array_primitive!(array, index, as_time64_microsecond_array, val)?
             }
             ScalarValue::Time64Nanosecond(val) => {
-                eq_array_primitive!(array, index, Time64NanosecondArray, val)?
+                eq_array_primitive!(array, index, as_time64_nanosecond_array, val)?
             }
             ScalarValue::TimestampSecond(val, _) => {
-                eq_array_primitive!(array, index, TimestampSecondArray, val)?
+                eq_array_primitive!(array, index, as_timestamp_second_array, val)?
             }
             ScalarValue::TimestampMillisecond(val, _) => {
-                eq_array_primitive!(array, index, TimestampMillisecondArray, val)?
+                eq_array_primitive!(array, index, as_timestamp_millisecond_array, val)?
             }
             ScalarValue::TimestampMicrosecond(val, _) => {
-                eq_array_primitive!(array, index, TimestampMicrosecondArray, val)?
+                eq_array_primitive!(array, index, as_timestamp_microsecond_array, val)?
             }
             ScalarValue::TimestampNanosecond(val, _) => {
-                eq_array_primitive!(array, index, TimestampNanosecondArray, val)?
+                eq_array_primitive!(array, index, as_timestamp_nanosecond_array, val)?
             }
             ScalarValue::IntervalYearMonth(val) => {
-                eq_array_primitive!(array, index, IntervalYearMonthArray, val)?
+                eq_array_primitive!(array, index, as_interval_ym_array, val)?
             }
             ScalarValue::IntervalDayTime(val) => {
-                eq_array_primitive!(array, index, IntervalDayTimeArray, val)?
+                eq_array_primitive!(array, index, as_interval_dt_array, val)?
             }
             ScalarValue::IntervalMonthDayNano(val) => {
-                eq_array_primitive!(array, index, IntervalMonthDayNanoArray, val)?
+                eq_array_primitive!(array, index, as_interval_mdn_array, val)?
             }
             ScalarValue::DurationSecond(val) => {
-                eq_array_primitive!(array, index, DurationSecondArray, val)?
+                eq_array_primitive!(array, index, as_duration_second_array, val)?
             }
             ScalarValue::DurationMillisecond(val) => {
-                eq_array_primitive!(array, index, DurationMillisecondArray, val)?
+                eq_array_primitive!(array, index, as_duration_millisecond_array, val)?
             }
             ScalarValue::DurationMicrosecond(val) => {
-                eq_array_primitive!(array, index, DurationMicrosecondArray, val)?
+                eq_array_primitive!(array, index, as_duration_microsecond_array, val)?
             }
             ScalarValue::DurationNanosecond(val) => {
-                eq_array_primitive!(array, index, DurationNanosecondArray, val)?
+                eq_array_primitive!(array, index, as_duration_nanosecond_array, val)?
             }
             ScalarValue::Union(value, _, _) => {
-                let array = as_union_array(array);
+                let array = as_union_array(array)?;
                 let ti = array.type_id(index);
                 let index = array.value_offset(index);
                 if let Some((ti_v, value)) = value {
@@ -3325,7 +4228,7 @@ impl ScalarValue {
                     DataType::UInt16 => get_dict_value::<UInt16Type>(array, index)?,
                     DataType::UInt32 => get_dict_value::<UInt32Type>(array, index)?,
                     DataType::UInt64 => get_dict_value::<UInt64Type>(array, index)?,
-                    _ => unreachable!("Invalid dictionary keys type: {:?}", key_type),
+                    _ => unreachable!("Invalid dictionary keys type: {}", key_type),
                 };
                 // was the value in the array non null?
                 match values_index {
@@ -3333,6 +4236,34 @@ impl ScalarValue {
                     None => v.is_null(),
                 }
             }
+            ScalarValue::RunEndEncoded(run_ends_field, _, value) => {
+                // Explicitly check length here since get_physical_index() doesn't
+                // bound check for us
+                if index > array.len() {
+                    return _exec_err!(
+                        "Index {index} out of bounds for array of length {}",
+                        array.len()
+                    );
+                }
+                match run_ends_field.data_type() {
+                    DataType::Int16 => {
+                        let array = as_run_array::<Int16Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    DataType::Int32 => {
+                        let array = as_run_array::<Int32Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    DataType::Int64 => {
+                        let array = as_run_array::<Int64Type>(array)?;
+                        let index = array.get_physical_index(index);
+                        value.eq_array(array.values(), index)?
+                    }
+                    dt => unreachable!("Invalid run-ends type: {dt}"),
+                }
+            }
             ScalarValue::Null => array.is_null(index),
         })
     }
@@ -3342,6 +4273,16 @@ impl ScalarValue {
         arr1 == &right
     }
 
+    /// Compare `self` with `other` and return an `Ordering`.
+    ///
+    /// This is the same as [`PartialOrd`] except that it returns
+    /// `Err` if the values cannot be compared, e.g., they have incompatible data types.
+    pub fn try_cmp(&self, other: &Self) -> Result<Ordering> {
+        self.partial_cmp(other).ok_or_else(|| {
+            _internal_datafusion_err!("Uncomparable values: {self:?}, {other:?}")
+        })
+    }
+
     /// Estimate size if bytes including `Self`. For values with internal containers such as `String`
     /// includes the allocated size (`capacity`) rather than the current length (`len`)
     pub fn size(&self) -> usize {
@@ -3352,6 +4293,8 @@ impl ScalarValue {
                 | ScalarValue::Float16(_)
                 | ScalarValue::Float32(_)
                 | ScalarValue::Float64(_)
+                | ScalarValue::Decimal32(_, _, _)
+                | ScalarValue::Decimal64(_, _, _)
                 | ScalarValue::Decimal128(_, _, _)
                 | ScalarValue::Decimal256(_, _, _)
                 | ScalarValue::Int8(_)
@@ -3410,6 +4353,7 @@ impl ScalarValue {
                     // `dt` and `sv` are boxed, so they are NOT already included in `self`
                     dt.size() + sv.size()
                 }
+                ScalarValue::RunEndEncoded(rf, vf, v) => rf.size() + vf.size() + v.size(),
             }
     }
 
@@ -3461,6 +4405,8 @@ impl ScalarValue {
             | ScalarValue::Float16(_)
             | ScalarValue::Float32(_)
             | ScalarValue::Float64(_)
+            | ScalarValue::Decimal32(_, _, _)
+            | ScalarValue::Decimal64(_, _, _)
             | ScalarValue::Decimal128(_, _, _)
             | ScalarValue::Decimal256(_, _, _)
             | ScalarValue::Int8(_)
@@ -3523,13 +4469,258 @@ impl ScalarValue {
             ScalarValue::Dictionary(_, value) => {
                 value.compact();
             }
+            ScalarValue::RunEndEncoded(_, _, value) => {
+                value.compact();
+            }
+        }
+    }
+
+    /// Compacts ([ScalarValue::compact]) the current [ScalarValue] and returns it.
+    pub fn compacted(mut self) -> Self {
+        self.compact();
+        self
+    }
+
+    /// Returns the minimum value for the given numeric `DataType`.
+    ///
+    /// This function returns the smallest representable value for numeric
+    /// and temporal data types. For non-numeric types, it returns `None`.
+    ///
+    /// # Supported Types
+    ///
+    /// - **Integer types**: `i8::MIN`, `i16::MIN`, etc.
+    /// - **Unsigned types**: Always 0 (`u8::MIN`, `u16::MIN`, etc.)
+    /// - **Float types**: Negative infinity (IEEE 754)
+    /// - **Decimal types**: Smallest value based on precision
+    /// - **Temporal types**: Minimum timestamp/date values
+    /// - **Time types**: 0 (midnight)
+    /// - **Duration types**: `i64::MIN`
+    pub fn min(datatype: &DataType) -> Option<ScalarValue> {
+        match datatype {
+            DataType::Int8 => Some(ScalarValue::Int8(Some(i8::MIN))),
+            DataType::Int16 => Some(ScalarValue::Int16(Some(i16::MIN))),
+            DataType::Int32 => Some(ScalarValue::Int32(Some(i32::MIN))),
+            DataType::Int64 => Some(ScalarValue::Int64(Some(i64::MIN))),
+            DataType::UInt8 => Some(ScalarValue::UInt8(Some(u8::MIN))),
+            DataType::UInt16 => Some(ScalarValue::UInt16(Some(u16::MIN))),
+            DataType::UInt32 => Some(ScalarValue::UInt32(Some(u32::MIN))),
+            DataType::UInt64 => Some(ScalarValue::UInt64(Some(u64::MIN))),
+            DataType::Float16 => Some(ScalarValue::Float16(Some(f16::NEG_INFINITY))),
+            DataType::Float32 => Some(ScalarValue::Float32(Some(f32::NEG_INFINITY))),
+            DataType::Float64 => Some(ScalarValue::Float64(Some(f64::NEG_INFINITY))),
+            DataType::Decimal128(precision, scale) => {
+                // For decimal, min is -10^(precision-scale) + 10^(-scale)
+                // But for simplicity, we use the minimum i128 value that fits the precision
+                let max_digits = 10_i128.pow(*precision as u32) - 1;
+                Some(ScalarValue::Decimal128(
+                    Some(-max_digits),
+                    *precision,
+                    *scale,
+                ))
+            }
+            DataType::Decimal256(precision, scale) => {
+                // Similar to Decimal128 but with i256
+                // For now, use a large negative value
+                let max_digits = i256::from_i128(10_i128)
+                    .checked_pow(*precision as u32)
+                    .and_then(|v| v.checked_sub(i256::from_i128(1)))
+                    .unwrap_or(i256::MAX);
+                Some(ScalarValue::Decimal256(
+                    Some(max_digits.neg_wrapping()),
+                    *precision,
+                    *scale,
+                ))
+            }
+            DataType::Date32 => Some(ScalarValue::Date32(Some(i32::MIN))),
+            DataType::Date64 => Some(ScalarValue::Date64(Some(i64::MIN))),
+            DataType::Time32(TimeUnit::Second) => {
+                Some(ScalarValue::Time32Second(Some(0)))
+            }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                Some(ScalarValue::Time32Millisecond(Some(0)))
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                Some(ScalarValue::Time64Microsecond(Some(0)))
+            }
+            DataType::Time64(TimeUnit::Nanosecond) => {
+                Some(ScalarValue::Time64Nanosecond(Some(0)))
+            }
+            DataType::Timestamp(unit, tz) => match unit {
+                TimeUnit::Second => {
+                    Some(ScalarValue::TimestampSecond(Some(i64::MIN), tz.clone()))
+                }
+                TimeUnit::Millisecond => Some(ScalarValue::TimestampMillisecond(
+                    Some(i64::MIN),
+                    tz.clone(),
+                )),
+                TimeUnit::Microsecond => Some(ScalarValue::TimestampMicrosecond(
+                    Some(i64::MIN),
+                    tz.clone(),
+                )),
+                TimeUnit::Nanosecond => {
+                    Some(ScalarValue::TimestampNanosecond(Some(i64::MIN), tz.clone()))
+                }
+            },
+            DataType::Duration(unit) => match unit {
+                TimeUnit::Second => Some(ScalarValue::DurationSecond(Some(i64::MIN))),
+                TimeUnit::Millisecond => {
+                    Some(ScalarValue::DurationMillisecond(Some(i64::MIN)))
+                }
+                TimeUnit::Microsecond => {
+                    Some(ScalarValue::DurationMicrosecond(Some(i64::MIN)))
+                }
+                TimeUnit::Nanosecond => {
+                    Some(ScalarValue::DurationNanosecond(Some(i64::MIN)))
+                }
+            },
+            _ => None,
+        }
+    }
+
+    /// Returns the maximum value for the given numeric `DataType`.
+    ///
+    /// This function returns the largest representable value for numeric
+    /// and temporal data types. For non-numeric types, it returns `None`.
+    ///
+    /// # Supported Types
+    ///
+    /// - **Integer types**: `i8::MAX`, `i16::MAX`, etc.
+    /// - **Unsigned types**: `u8::MAX`, `u16::MAX`, etc.
+    /// - **Float types**: Positive infinity (IEEE 754)
+    /// - **Decimal types**: Largest value based on precision
+    /// - **Temporal types**: Maximum timestamp/date values
+    /// - **Time types**: Maximum time in the day (1 day - 1 unit)
+    /// - **Duration types**: `i64::MAX`
+    pub fn max(datatype: &DataType) -> Option<ScalarValue> {
+        match datatype {
+            DataType::Int8 => Some(ScalarValue::Int8(Some(i8::MAX))),
+            DataType::Int16 => Some(ScalarValue::Int16(Some(i16::MAX))),
+            DataType::Int32 => Some(ScalarValue::Int32(Some(i32::MAX))),
+            DataType::Int64 => Some(ScalarValue::Int64(Some(i64::MAX))),
+            DataType::UInt8 => Some(ScalarValue::UInt8(Some(u8::MAX))),
+            DataType::UInt16 => Some(ScalarValue::UInt16(Some(u16::MAX))),
+            DataType::UInt32 => Some(ScalarValue::UInt32(Some(u32::MAX))),
+            DataType::UInt64 => Some(ScalarValue::UInt64(Some(u64::MAX))),
+            DataType::Float16 => Some(ScalarValue::Float16(Some(f16::INFINITY))),
+            DataType::Float32 => Some(ScalarValue::Float32(Some(f32::INFINITY))),
+            DataType::Float64 => Some(ScalarValue::Float64(Some(f64::INFINITY))),
+            DataType::Decimal128(precision, scale) => {
+                // For decimal, max is 10^(precision-scale) - 10^(-scale)
+                // But for simplicity, we use the maximum i128 value that fits the precision
+                let max_digits = 10_i128.pow(*precision as u32) - 1;
+                Some(ScalarValue::Decimal128(
+                    Some(max_digits),
+                    *precision,
+                    *scale,
+                ))
+            }
+            DataType::Decimal256(precision, scale) => {
+                // Similar to Decimal128 but with i256
+                let max_digits = i256::from_i128(10_i128)
+                    .checked_pow(*precision as u32)
+                    .and_then(|v| v.checked_sub(i256::from_i128(1)))
+                    .unwrap_or(i256::MAX);
+                Some(ScalarValue::Decimal256(
+                    Some(max_digits),
+                    *precision,
+                    *scale,
+                ))
+            }
+            DataType::Date32 => Some(ScalarValue::Date32(Some(i32::MAX))),
+            DataType::Date64 => Some(ScalarValue::Date64(Some(i64::MAX))),
+            DataType::Time32(TimeUnit::Second) => {
+                // 86399 seconds = 23:59:59
+                Some(ScalarValue::Time32Second(Some(86_399)))
+            }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                // 86_399_999 milliseconds = 23:59:59.999
+                Some(ScalarValue::Time32Millisecond(Some(86_399_999)))
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                // 86_399_999_999 microseconds = 23:59:59.999999
+                Some(ScalarValue::Time64Microsecond(Some(86_399_999_999)))
+            }
+            DataType::Time64(TimeUnit::Nanosecond) => {
+                // 86_399_999_999_999 nanoseconds = 23:59:59.999999999
+                Some(ScalarValue::Time64Nanosecond(Some(86_399_999_999_999)))
+            }
+            DataType::Timestamp(unit, tz) => match unit {
+                TimeUnit::Second => {
+                    Some(ScalarValue::TimestampSecond(Some(i64::MAX), tz.clone()))
+                }
+                TimeUnit::Millisecond => Some(ScalarValue::TimestampMillisecond(
+                    Some(i64::MAX),
+                    tz.clone(),
+                )),
+                TimeUnit::Microsecond => Some(ScalarValue::TimestampMicrosecond(
+                    Some(i64::MAX),
+                    tz.clone(),
+                )),
+                TimeUnit::Nanosecond => {
+                    Some(ScalarValue::TimestampNanosecond(Some(i64::MAX), tz.clone()))
+                }
+            },
+            DataType::Duration(unit) => match unit {
+                TimeUnit::Second => Some(ScalarValue::DurationSecond(Some(i64::MAX))),
+                TimeUnit::Millisecond => {
+                    Some(ScalarValue::DurationMillisecond(Some(i64::MAX)))
+                }
+                TimeUnit::Microsecond => {
+                    Some(ScalarValue::DurationMicrosecond(Some(i64::MAX)))
+                }
+                TimeUnit::Nanosecond => {
+                    Some(ScalarValue::DurationNanosecond(Some(i64::MAX)))
+                }
+            },
+            _ => None,
         }
     }
+
+    /// A thin wrapper on Arrow's validation that throws internal error if validation
+    /// fails.
+    fn validate_decimal_or_internal_err<T: DecimalType>(
+        precision: u8,
+        scale: i8,
+    ) -> Result<()> {
+        validate_decimal_precision_and_scale::<T>(precision, scale).map_err(|err| {
+            _internal_datafusion_err!(
+                "Decimal precision/scale invariant violated \
+                 (precision={precision}, scale={scale}): {err}"
+            )
+        })
+    }
 }
 
-pub fn copy_array_data(data: &ArrayData) -> ArrayData {
-    let mut copy = MutableArrayData::new(vec![&data], true, data.len());
-    copy.extend(0, 0, data.len());
+/// Compacts the data of an `ArrayData` into a new `ArrayData`.
+///
+/// This is useful when you want to minimize the memory footprint of an
+/// `ArrayData`. For example, the value returned by [`Array::slice`] still
+/// points at the same underlying data buffers as the original array, which may
+/// hold many more values. Calling `copy_array_data` on the sliced array will
+/// create a new, smaller, `ArrayData` that only contains the data for the
+/// sliced array.
+///
+/// # Example
+/// ```
+/// # use arrow::array::{make_array, Array, Int32Array};
+/// use datafusion_common::scalar::copy_array_data;
+/// let array = Int32Array::from_iter_values(0..8192);
+/// // Take only the first 2 elements
+/// let sliced_array = array.slice(0, 2);
+/// // The memory footprint of `sliced_array` is close to 8192 * 4 bytes
+/// assert_eq!(32864, sliced_array.get_array_memory_size());
+/// // however, we can copy the data to a new `ArrayData`
+/// let new_array = make_array(copy_array_data(&sliced_array.into_data()));
+/// // The memory footprint of `new_array` is now only 2 * 4 bytes
+/// // and overhead:
+/// assert_eq!(160, new_array.get_array_memory_size());
+/// ```
+///
+/// See also [`ScalarValue::compact`] which applies to `ScalarValue` instances
+/// as necessary.
+pub fn copy_array_data(src_data: &ArrayData) -> ArrayData {
+    let mut copy = MutableArrayData::new(vec![&src_data], true, src_data.len());
+    copy.extend(0, 0, src_data.len());
     copy.freeze()
 }
 
@@ -3551,6 +4742,7 @@ macro_rules! impl_scalar {
 
 impl_scalar!(f64, Float64);
 impl_scalar!(f32, Float32);
+impl_scalar!(f16, Float16);
 impl_scalar!(i8, Int8);
 impl_scalar!(i16, Int16);
 impl_scalar!(i32, Int32);
@@ -3570,7 +4762,7 @@ impl From<&str> for ScalarValue {
 impl From<Option<&str>> for ScalarValue {
     fn from(value: Option<&str>) -> Self {
         let value = value.map(|s| s.to_string());
-        ScalarValue::Utf8(value)
+        value.into()
     }
 }
 
@@ -3597,7 +4789,13 @@ impl FromStr for ScalarValue {
 
 impl From<String> for ScalarValue {
     fn from(value: String) -> Self {
-        ScalarValue::Utf8(Some(value))
+        Some(value).into()
+    }
+}
+
+impl From<Option<String>> for ScalarValue {
+    fn from(value: Option<String>) -> Self {
+        ScalarValue::Utf8(value)
     }
 }
 
@@ -3701,6 +4899,7 @@ impl_try_from!(UInt8, u8);
 impl_try_from!(UInt16, u16);
 impl_try_from!(UInt32, u32);
 impl_try_from!(UInt64, u64);
+impl_try_from!(Float16, f16);
 impl_try_from!(Float32, f32);
 impl_try_from!(Float64, f64);
 impl_try_from!(Boolean, bool);
@@ -3740,6 +4939,12 @@ macro_rules! format_option {
 impl fmt::Display for ScalarValue {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
+            ScalarValue::Decimal32(v, p, s) => {
+                write!(f, "{v:?},{p:?},{s:?}")?;
+            }
+            ScalarValue::Decimal64(v, p, s) => {
+                write!(f, "{v:?},{p:?},{s:?}")?;
+            }
             ScalarValue::Decimal128(v, p, s) => {
                 write!(f, "{v:?},{p:?},{s:?}")?;
             }
@@ -3771,8 +4976,10 @@ impl fmt::Display for ScalarValue {
             | ScalarValue::BinaryView(e) => match e {
                 Some(bytes) => {
                     // print up to first 10 bytes, with trailing ... if needed
+                    const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
                     for b in bytes.iter().take(10) {
-                        write!(f, "{b:02X}")?;
+                        f.write_char(HEX_CHARS_UPPER[(b >> 4) as usize] as char)?;
+                        f.write_char(HEX_CHARS_UPPER[(b & 0x0f) as usize] as char)?;
                     }
                     if bytes.len() > 10 {
                         write!(f, "...")?;
@@ -3780,15 +4987,31 @@ impl fmt::Display for ScalarValue {
                 }
                 None => write!(f, "NULL")?,
             },
-            ScalarValue::List(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?,
-            ScalarValue::LargeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?,
-            ScalarValue::FixedSizeList(arr) => fmt_list(arr.to_owned() as ArrayRef, f)?,
-            ScalarValue::Date32(e) => {
-                format_option!(f, e.map(|v| Date32Type::to_naive_date(v).to_string()))?
-            }
-            ScalarValue::Date64(e) => {
-                format_option!(f, e.map(|v| Date64Type::to_naive_date(v).to_string()))?
-            }
+            ScalarValue::List(arr) => fmt_list(arr.as_ref(), f)?,
+            ScalarValue::LargeList(arr) => fmt_list(arr.as_ref(), f)?,
+            ScalarValue::FixedSizeList(arr) => fmt_list(arr.as_ref(), f)?,
+            ScalarValue::Date32(e) => format_option!(
+                f,
+                e.map(|v| {
+                    let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
+                    match epoch.checked_add_signed(Duration::try_days(v as i64).unwrap())
+                    {
+                        Some(date) => date.to_string(),
+                        None => "".to_string(),
+                    }
+                })
+            )?,
+            ScalarValue::Date64(e) => format_option!(
+                f,
+                e.map(|v| {
+                    let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
+                    match epoch.checked_add_signed(Duration::try_milliseconds(v).unwrap())
+                    {
+                        Some(date) => date.to_string(),
+                        None => "".to_string(),
+                    }
+                })
+            )?,
             ScalarValue::Time32Second(e) => format_option!(f, e)?,
             ScalarValue::Time32Millisecond(e) => format_option!(f, e)?,
             ScalarValue::Time64Microsecond(e) => format_option!(f, e)?,
@@ -3882,18 +5105,18 @@ impl fmt::Display for ScalarValue {
                 None => write!(f, "NULL")?,
             },
             ScalarValue::Dictionary(_k, v) => write!(f, "{v}")?,
+            ScalarValue::RunEndEncoded(_, _, v) => write!(f, "{v}")?,
             ScalarValue::Null => write!(f, "NULL")?,
         };
         Ok(())
     }
 }
 
-fn fmt_list(arr: ArrayRef, f: &mut fmt::Formatter) -> fmt::Result {
+fn fmt_list(arr: &dyn Array, f: &mut fmt::Formatter) -> fmt::Result {
     // ScalarValue List, LargeList, FixedSizeList should always have a single element
     assert_eq!(arr.len(), 1);
     let options = FormatOptions::default().with_display_error(true);
-    let formatter =
-        ArrayFormatter::try_new(arr.as_ref() as &dyn Array, &options).unwrap();
+    let formatter = ArrayFormatter::try_new(arr, &options).unwrap();
     let value_formatter = formatter.value(0);
     write!(f, "{value_formatter}")
 }
@@ -3913,6 +5136,8 @@ fn fmt_binary(data: &[u8], f: &mut fmt::Formatter) -> fmt::Result {
 impl fmt::Debug for ScalarValue {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
+            ScalarValue::Decimal32(_, _, _) => write!(f, "Decimal32({self})"),
+            ScalarValue::Decimal64(_, _, _) => write!(f, "Decimal64({self})"),
             ScalarValue::Decimal128(_, _, _) => write!(f, "Decimal128({self})"),
             ScalarValue::Decimal256(_, _, _) => write!(f, "Decimal256({self})"),
             ScalarValue::Boolean(_) => write!(f, "Boolean({self})"),
@@ -4059,6 +5284,9 @@ impl fmt::Debug for ScalarValue {
                 None => write!(f, "Union(NULL)"),
             },
             ScalarValue::Dictionary(k, v) => write!(f, "Dictionary({k:?}, {v:?})"),
+            ScalarValue::RunEndEncoded(rf, vf, v) => {
+                write!(f, "RunEndEncoded({rf:?}, {vf:?}, {v:?})")
+            }
             ScalarValue::Null => write!(f, "NULL"),
         }
     }
@@ -4108,17 +5336,22 @@ impl ScalarType<i32> for Date32Type {
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
 
     use super::*;
-    use crate::cast::{
-        as_map_array, as_string_array, as_struct_array, as_uint32_array, as_uint64_array,
-    };
-
+    use crate::cast::{as_list_array, as_map_array, as_struct_array};
     use crate::test_util::batches_to_string;
-    use arrow::array::{types::Float64Type, NullBufferBuilder};
-    use arrow::buffer::{Buffer, OffsetBuffer};
+    use arrow::array::{
+        FixedSizeListBuilder, Int32Builder, LargeListBuilder, ListBuilder, MapBuilder,
+        NullArray, NullBufferBuilder, OffsetSizeTrait, PrimitiveBuilder, RecordBatch,
+        StringBuilder, StringDictionaryBuilder, StructBuilder, UnionBuilder,
+    };
+    use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer};
     use arrow::compute::{is_null, kernels};
-    use arrow::datatypes::Fields;
+    use arrow::datatypes::{
+        ArrowNumericType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, Fields,
+        Float64Type, TimeUnit,
+    };
     use arrow::error::ArrowError;
     use arrow::util::pretty::pretty_format_columns;
     use chrono::NaiveDate;
@@ -4150,6 +5383,52 @@ mod tests {
         assert_eq!(actual, &expected);
     }
 
+    #[test]
+    fn test_format_timestamp_type_for_error_and_bounds() {
+        // format helper
+        let ts_ns = format_timestamp_type_for_error(&DataType::Timestamp(
+            TimeUnit::Nanosecond,
+            None,
+        ));
+        assert_eq!(ts_ns, "Timestamp(ns)");
+
+        let ts_us = format_timestamp_type_for_error(&DataType::Timestamp(
+            TimeUnit::Microsecond,
+            None,
+        ));
+        assert_eq!(ts_us, "Timestamp(us)");
+
+        // ensure_timestamp_in_bounds: Date32 non-overflow
+        let ok = ensure_timestamp_in_bounds(
+            1000,
+            NANOS_PER_DAY,
+            &DataType::Date32,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(ok.is_ok());
+
+        // Date32 overflow -- known large day value (9999-12-31 -> 2932896)
+        let err = ensure_timestamp_in_bounds(
+            2932896,
+            NANOS_PER_DAY,
+            &DataType::Date32,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(err.is_err());
+        let msg = err.unwrap_err().to_string();
+        assert!(msg.contains("Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range"));
+
+        // Date64 overflow for ns (millis * 1_000_000)
+        let overflow_millis: i64 = (i64::MAX / NANOS_PER_MILLISECOND) + 1;
+        let err2 = ensure_timestamp_in_bounds(
+            overflow_millis,
+            NANOS_PER_MILLISECOND,
+            &DataType::Date64,
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        );
+        assert!(err2.is_err());
+    }
+
     #[test]
     fn test_scalar_value_from_for_struct() {
         let boolean = Arc::new(BooleanArray::from(vec![false]));
@@ -4281,6 +5560,91 @@ mod tests {
         assert_eq!(empty_array.len(), 0);
     }
 
+    #[test]
+    fn test_to_array_of_size_list_size_one() {
+        // size=1 takes the fast path (Arc::clone)
+        let arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+            Some(10),
+            Some(20),
+        ])]);
+        let sv = ScalarValue::List(Arc::new(arr.clone()));
+        let result = sv.to_array_of_size(1).unwrap();
+        assert_eq!(result.as_list::<i32>(), &arr);
+    }
+
+    #[test]
+    fn test_to_array_of_size_list_empty_inner() {
+        // A list scalar containing an empty list: [[]]
+        let arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![])]);
+        let sv = ScalarValue::List(Arc::new(arr));
+        let result = sv.to_array_of_size(3).unwrap();
+        let result_list = result.as_list::<i32>();
+        assert_eq!(result_list.len(), 3);
+        for i in 0..3 {
+            assert_eq!(result_list.value(i).len(), 0);
+        }
+    }
+
+    #[test]
+    fn test_to_array_of_size_large_list() {
+        let arr =
+            LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+                Some(100),
+                Some(200),
+            ])]);
+        let sv = ScalarValue::LargeList(Arc::new(arr));
+        let result = sv.to_array_of_size(3).unwrap();
+        let expected = LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(100), Some(200)]),
+            Some(vec![Some(100), Some(200)]),
+            Some(vec![Some(100), Some(200)]),
+        ]);
+        assert_eq!(result.as_list::<i64>(), &expected);
+    }
+
+    #[test]
+    fn test_list_to_array_of_size_multi_row() {
+        // Call list_to_array_of_size directly with arr.len() > 1
+        let arr = Int32Array::from(vec![Some(10), None, Some(30)]);
+        let result = ScalarValue::list_to_array_of_size(&arr, 3).unwrap();
+        let result = result.as_primitive::<Int32Type>();
+        assert_eq!(
+            result.iter().collect::<Vec<_>>(),
+            vec![
+                Some(10),
+                None,
+                Some(30),
+                Some(10),
+                None,
+                Some(30),
+                Some(10),
+                None,
+                Some(30),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_to_array_of_size_null_list() {
+        let dt = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
+        let sv = ScalarValue::try_from(&dt).unwrap();
+        let result = sv.to_array_of_size(3).unwrap();
+        assert_eq!(result.len(), 3);
+        assert_eq!(result.null_count(), 3);
+    }
+
+    /// See https://github.com/apache/datafusion/issues/18870
+    #[test]
+    fn test_to_array_of_size_for_none_fsb() {
+        let sv = ScalarValue::FixedSizeBinary(5, None);
+        let result = sv
+            .to_array_of_size(2)
+            .expect("Failed to convert to array of size");
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.null_count(), 2);
+        assert_eq!(result.as_fixed_size_binary().values().len(), 10);
+    }
+
     #[test]
     fn test_list_to_array_string() {
         let scalars = vec![
@@ -4475,7 +5839,7 @@ mod tests {
         ]);
 
         let array = ScalarValue::iter_to_array(scalars).unwrap();
-        let list_array = as_list_array(&array);
+        let list_array = as_list_array(&array).unwrap();
         // List[[1,2,3], null, [4,5]]
         let expected = ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
             Some(vec![Some(1), Some(2), Some(3)]),
@@ -4491,7 +5855,7 @@ mod tests {
         ]);
 
         let array = ScalarValue::iter_to_array(scalars).unwrap();
-        let list_array = as_large_list_array(&array);
+        let list_array = as_large_list_array(&array).unwrap();
         let expected = LargeListArray::from_iter_primitive::<Int64Type, _, _>(vec![
             Some(vec![Some(1), Some(2), Some(3)]),
             None,
@@ -4555,6 +5919,17 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_eq_array_err_message() {
+        assert_starts_with(
+            ScalarValue::Utf8(Some("123".to_string()))
+                .eq_array(&(Arc::new(Int32Array::from(vec![123])) as ArrayRef), 0)
+                .unwrap_err()
+                .message(),
+            "could not cast array of type Int32 to arrow_array::array::byte_array::GenericByteArray<arrow_array::types::GenericStringType<i32>>",
+        );
+    }
+
     #[test]
     fn scalar_add_trait_test() -> Result<()> {
         let float_value = ScalarValue::Float64(Some(123.));
@@ -4625,7 +6000,10 @@ mod tests {
             .sub_checked(&int_value_2)
             .unwrap_err()
             .strip_backtrace();
-        assert_eq!(err, "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808")
+        assert_eq!(
+            err,
+            "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808"
+        )
     }
 
     #[test]
@@ -4711,6 +6089,32 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_try_cmp() {
+        assert_eq!(
+            ScalarValue::try_cmp(
+                &ScalarValue::Int32(Some(1)),
+                &ScalarValue::Int32(Some(2))
+            )
+            .unwrap(),
+            Ordering::Less
+        );
+        assert_eq!(
+            ScalarValue::try_cmp(&ScalarValue::Int32(None), &ScalarValue::Int32(Some(2)))
+                .unwrap(),
+            Ordering::Less
+        );
+        assert_starts_with(
+            ScalarValue::try_cmp(
+                &ScalarValue::Int32(Some(1)),
+                &ScalarValue::Int64(Some(2)),
+            )
+            .unwrap_err()
+            .message(),
+            "Uncomparable values: Int32(1), Int64(2)",
+        );
+    }
+
     #[test]
     fn scalar_decimal_test() -> Result<()> {
         let decimal_value = ScalarValue::Decimal128(Some(123), 10, 1);
@@ -4747,12 +6151,16 @@ mod tests {
         assert_eq!(123i128, array_decimal.value(0));
         assert_eq!(123i128, array_decimal.value(9));
         // test eq array
-        assert!(decimal_value
-            .eq_array(&array, 1)
-            .expect("Failed to compare arrays"));
-        assert!(decimal_value
-            .eq_array(&array, 5)
-            .expect("Failed to compare arrays"));
+        assert!(
+            decimal_value
+                .eq_array(&array, 1)
+                .expect("Failed to compare arrays")
+        );
+        assert!(
+            decimal_value
+                .eq_array(&array, 5)
+                .expect("Failed to compare arrays")
+        );
         // test try from array
         assert_eq!(
             decimal_value,
@@ -4797,18 +6205,24 @@ mod tests {
         assert_eq!(4, array.len());
         assert_eq!(DataType::Decimal128(10, 2), array.data_type().clone());
 
-        assert!(ScalarValue::try_new_decimal128(1, 10, 2)
-            .unwrap()
-            .eq_array(&array, 0)
-            .expect("Failed to compare arrays"));
-        assert!(ScalarValue::try_new_decimal128(2, 10, 2)
-            .unwrap()
-            .eq_array(&array, 1)
-            .expect("Failed to compare arrays"));
-        assert!(ScalarValue::try_new_decimal128(3, 10, 2)
-            .unwrap()
-            .eq_array(&array, 2)
-            .expect("Failed to compare arrays"));
+        assert!(
+            ScalarValue::try_new_decimal128(1, 10, 2)
+                .unwrap()
+                .eq_array(&array, 0)
+                .expect("Failed to compare arrays")
+        );
+        assert!(
+            ScalarValue::try_new_decimal128(2, 10, 2)
+                .unwrap()
+                .eq_array(&array, 1)
+                .expect("Failed to compare arrays")
+        );
+        assert!(
+            ScalarValue::try_new_decimal128(3, 10, 2)
+                .unwrap()
+                .eq_array(&array, 2)
+                .expect("Failed to compare arrays")
+        );
         assert_eq!(
             ScalarValue::Decimal128(None, 10, 2),
             ScalarValue::try_from_array(&array, 3).unwrap()
@@ -4818,12 +6232,120 @@ mod tests {
     }
 
     #[test]
-    fn test_list_partial_cmp() {
-        let a =
-            ScalarValue::List(Arc::new(
-                ListArray::from_iter_primitive::<Int64Type, _, _>(vec![Some(vec![
-                    Some(1),
-                    Some(2),
+    fn test_new_one_decimal128() {
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal128(5, 0)).unwrap(),
+            ScalarValue::Decimal128(Some(1), 5, 0)
+        );
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal128(5, 1)).unwrap(),
+            ScalarValue::Decimal128(Some(10), 5, 1)
+        );
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal128(5, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(100), 5, 2)
+        );
+        // More precision
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal128(7, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(100), 7, 2)
+        );
+        // No negative scale
+        assert!(ScalarValue::new_one(&DataType::Decimal128(5, -1)).is_err());
+        // Invalid combination
+        assert!(ScalarValue::new_one(&DataType::Decimal128(0, 2)).is_err());
+        assert!(ScalarValue::new_one(&DataType::Decimal128(5, 7)).is_err());
+    }
+
+    #[test]
+    fn test_new_one_decimal256() {
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal256(5, 0)).unwrap(),
+            ScalarValue::Decimal256(Some(1.into()), 5, 0)
+        );
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal256(5, 1)).unwrap(),
+            ScalarValue::Decimal256(Some(10.into()), 5, 1)
+        );
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal256(5, 2)).unwrap(),
+            ScalarValue::Decimal256(Some(100.into()), 5, 2)
+        );
+        // More precision
+        assert_eq!(
+            ScalarValue::new_one(&DataType::Decimal256(7, 2)).unwrap(),
+            ScalarValue::Decimal256(Some(100.into()), 7, 2)
+        );
+        // No negative scale
+        assert!(ScalarValue::new_one(&DataType::Decimal256(5, -1)).is_err());
+        // Invalid combination
+        assert!(ScalarValue::new_one(&DataType::Decimal256(0, 2)).is_err());
+        assert!(ScalarValue::new_one(&DataType::Decimal256(5, 7)).is_err());
+    }
+
+    #[test]
+    fn test_new_ten_decimal128() {
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal128(5, 1)).unwrap(),
+            ScalarValue::Decimal128(Some(100), 5, 1)
+        );
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal128(5, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(1000), 5, 2)
+        );
+        // More precision
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal128(7, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(1000), 7, 2)
+        );
+        // No negative scale
+        assert!(ScalarValue::new_ten(&DataType::Decimal128(5, -1)).is_err());
+        // Invalid combination
+        assert!(ScalarValue::new_ten(&DataType::Decimal128(0, 2)).is_err());
+        assert!(ScalarValue::new_ten(&DataType::Decimal128(5, 7)).is_err());
+    }
+
+    #[test]
+    fn test_new_ten_decimal256() {
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal256(5, 1)).unwrap(),
+            ScalarValue::Decimal256(Some(100.into()), 5, 1)
+        );
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal256(5, 2)).unwrap(),
+            ScalarValue::Decimal256(Some(1000.into()), 5, 2)
+        );
+        // More precision
+        assert_eq!(
+            ScalarValue::new_ten(&DataType::Decimal256(7, 2)).unwrap(),
+            ScalarValue::Decimal256(Some(1000.into()), 7, 2)
+        );
+        // No negative scale
+        assert!(ScalarValue::new_ten(&DataType::Decimal256(5, -1)).is_err());
+        // Invalid combination
+        assert!(ScalarValue::new_ten(&DataType::Decimal256(0, 2)).is_err());
+        assert!(ScalarValue::new_ten(&DataType::Decimal256(5, 7)).is_err());
+    }
+
+    #[test]
+    fn test_new_negative_one_decimal128() {
+        assert_eq!(
+            ScalarValue::new_negative_one(&DataType::Decimal128(5, 0)).unwrap(),
+            ScalarValue::Decimal128(Some(-1), 5, 0)
+        );
+        assert_eq!(
+            ScalarValue::new_negative_one(&DataType::Decimal128(5, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(-100), 5, 2)
+        );
+    }
+
+    #[test]
+    fn test_list_partial_cmp() {
+        let a =
+            ScalarValue::List(Arc::new(
+                ListArray::from_iter_primitive::<Int64Type, _, _>(vec![Some(vec![
+                    Some(1),
+                    Some(2),
                     Some(3),
                 ])]),
             ));
@@ -5136,8 +6658,6 @@ mod tests {
     }
 
     #[test]
-    // despite clippy claiming they are useless, the code doesn't compile otherwise.
-    #[allow(clippy::useless_vec)]
     fn scalar_iter_to_array_boolean() {
         check_scalar_iter!(Boolean, BooleanArray, vec![Some(true), None, Some(false)]);
         check_scalar_iter!(Float32, Float32Array, vec![Some(1.9), None, Some(-2.1)]);
@@ -5187,12 +6707,12 @@ mod tests {
         check_scalar_iter_binary!(
             Binary,
             BinaryArray,
-            vec![Some(b"foo"), None, Some(b"bar")]
+            [Some(b"foo"), None, Some(b"bar")]
         );
         check_scalar_iter_binary!(
             LargeBinary,
             LargeBinaryArray,
-            vec![Some(b"foo"), None, Some(b"bar")]
+            [Some(b"foo"), None, Some(b"bar")]
         );
     }
 
@@ -5645,7 +7165,9 @@ mod tests {
                 for other_index in 0..array.len() {
                     if index != other_index {
                         assert!(
-                            !scalar.eq_array(&array, other_index).expect("Failed to compare arrays"),
+                            !scalar
+                                .eq_array(&array, other_index)
+                                .expect("Failed to compare arrays"),
                             "Expected {scalar:?} to be NOT equal to {array:?} at index {other_index}"
                         );
                     }
@@ -6073,6 +7595,31 @@ mod tests {
         }
     }
 
+    #[test]
+    fn roundtrip_run_array() {
+        // Comparison logic in round_trip_through_scalar doesn't work for RunArrays
+        // so we have a custom test for them
+        // TODO: https://github.com/apache/arrow-rs/pull/9213 might fix this ^
+        let run_ends = Int16Array::from(vec![2, 3]);
+        let values = Int64Array::from(vec![Some(1), None]);
+        let run_array = RunArray::try_new(&run_ends, &values).unwrap();
+        let run_array = run_array.downcast::<Int64Array>().unwrap();
+
+        let expected_values = run_array.into_iter().collect::<Vec<_>>();
+
+        for i in 0..run_array.len() {
+            let scalar = ScalarValue::try_from_array(&run_array, i).unwrap();
+            let array = scalar.to_array_of_size(1).unwrap();
+            assert_eq!(array.data_type(), run_array.data_type());
+            let array = array.as_run::<Int16Type>();
+            let array = array.downcast::<Int64Array>().unwrap();
+            assert_eq!(
+                array.into_iter().collect::<Vec<_>>(),
+                expected_values[i..i + 1]
+            );
+        }
+    }
+
     #[test]
     fn test_scalar_union_sparse() {
         let field_a = Arc::new(Field::new("A", DataType::Int32, true));
@@ -6570,7 +8117,6 @@ mod tests {
     }
 
     #[test]
-    #[allow(arithmetic_overflow)] // we want to test them
     fn test_scalar_negative_overflows() -> Result<()> {
         macro_rules! test_overflow_on_value {
             ($($val:expr),* $(,)?) => {$(
@@ -6579,10 +8125,7 @@ mod tests {
                     let err = value.arithmetic_negate().expect_err("Should receive overflow error on negating {value:?}");
                     let root_err = err.find_root();
                     match  root_err{
-                        DataFusionError::ArrowError(
-                            ArrowError::ArithmeticOverflow(_),
-                            _,
-                        ) => {}
+                        DataFusionError::ArrowError(err, _) if matches!(err.as_ref(), ArrowError::ArithmeticOverflow(_)) => {}
                         _ => return Err(err),
                     };
                 }
@@ -6870,6 +8413,26 @@ mod tests {
                 ScalarValue::Float64(Some(-9.9)),
                 5,
             ),
+            (
+                ScalarValue::Decimal128(Some(10), 1, 0),
+                ScalarValue::Decimal128(Some(5), 1, 0),
+                5,
+            ),
+            (
+                ScalarValue::Decimal128(Some(5), 1, 0),
+                ScalarValue::Decimal128(Some(10), 1, 0),
+                5,
+            ),
+            (
+                ScalarValue::Decimal256(Some(10.into()), 1, 0),
+                ScalarValue::Decimal256(Some(5.into()), 1, 0),
+                5,
+            ),
+            (
+                ScalarValue::Decimal256(Some(5.into()), 1, 0),
+                ScalarValue::Decimal256(Some(10.into()), 1, 0),
+                5,
+            ),
         ];
         for (lhs, rhs, expected) in cases.iter() {
             let distance = lhs.distance(rhs).unwrap();
@@ -6877,6 +8440,24 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_distance_none() {
+        let cases = [
+            (
+                ScalarValue::Decimal128(Some(i128::MAX), DECIMAL128_MAX_PRECISION, 0),
+                ScalarValue::Decimal128(Some(-i128::MAX), DECIMAL128_MAX_PRECISION, 0),
+            ),
+            (
+                ScalarValue::Decimal256(Some(i256::MAX), DECIMAL256_MAX_PRECISION, 0),
+                ScalarValue::Decimal256(Some(-i256::MAX), DECIMAL256_MAX_PRECISION, 0),
+            ),
+        ];
+        for (lhs, rhs) in cases.iter() {
+            let distance = lhs.distance(rhs);
+            assert!(distance.is_none(), "{lhs} vs {rhs}");
+        }
+    }
+
     #[test]
     fn test_scalar_distance_invalid() {
         let cases = [
@@ -6918,7 +8499,33 @@ mod tests {
             (ScalarValue::Date64(Some(0)), ScalarValue::Date64(Some(1))),
             (
                 ScalarValue::Decimal128(Some(123), 5, 5),
-                ScalarValue::Decimal128(Some(120), 5, 5),
+                ScalarValue::Decimal128(Some(120), 5, 3),
+            ),
+            (
+                ScalarValue::Decimal128(Some(123), 5, 5),
+                ScalarValue::Decimal128(Some(120), 3, 5),
+            ),
+            (
+                ScalarValue::Decimal256(Some(123.into()), 5, 5),
+                ScalarValue::Decimal256(Some(120.into()), 3, 5),
+            ),
+            // Distance 2 * 2^50 is larger than usize
+            (
+                ScalarValue::Decimal256(
+                    Some(i256::from_parts(0, 2_i64.pow(50).into())),
+                    1,
+                    0,
+                ),
+                ScalarValue::Decimal256(
+                    Some(i256::from_parts(0, (-(2_i64).pow(50)).into())),
+                    1,
+                    0,
+                ),
+            ),
+            // Distance overflow
+            (
+                ScalarValue::Decimal256(Some(i256::from_parts(0, i128::MAX)), 1, 0),
+                ScalarValue::Decimal256(Some(i256::from_parts(0, -i128::MAX)), 1, 0),
             ),
         ];
         for (lhs, rhs) in cases {
@@ -7196,6 +8803,19 @@ mod tests {
         ");
     }
 
+    #[test]
+    fn test_display_date64_large_values() {
+        assert_eq!(
+            format!("{}", ScalarValue::Date64(Some(790179464505))),
+            "1995-01-15"
+        );
+        // This used to panic, see https://github.com/apache/arrow-rs/issues/7728
+        assert_eq!(
+            format!("{}", ScalarValue::Date64(Some(-790179464505600000))),
+            ""
+        );
+    }
+
     #[test]
     fn test_struct_display_null() {
         let fields = vec![Field::new("a", DataType::Int32, false)];
@@ -7512,6 +9132,19 @@ mod tests {
         assert!(dense_scalar.is_null());
     }
 
+    #[test]
+    fn cast_date_to_timestamp_overflow_returns_error() {
+        let scalar = ScalarValue::Date32(Some(i32::MAX));
+        let err = scalar
+            .cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None))
+            .expect_err("expected cast to fail");
+        assert!(
+            err.to_string()
+                .contains("converted value exceeds the representable i64 range"),
+            "unexpected error: {err}"
+        );
+    }
+
     #[test]
     fn null_dictionary_scalar_produces_null_dictionary_array() {
         let dictionary_scalar = ScalarValue::Dictionary(
@@ -7584,7 +9217,7 @@ mod tests {
             ])),
             true,
         ));
-        let scalars = vec![
+        let scalars = [
             ScalarValue::try_new_null(&DataType::List(Arc::clone(&field_ref))).unwrap(),
             ScalarValue::try_new_null(&DataType::LargeList(Arc::clone(&field_ref)))
                 .unwrap(),
@@ -7599,11 +9232,654 @@ mod tests {
             .unwrap(),
             ScalarValue::try_new_null(&DataType::Map(map_field_ref, false)).unwrap(),
             ScalarValue::try_new_null(&DataType::Union(
-                UnionFields::new(vec![42], vec![field_ref]),
+                UnionFields::try_new(vec![42], vec![field_ref]).unwrap(),
                 UnionMode::Dense,
             ))
             .unwrap(),
         ];
         assert!(scalars.iter().all(|s| s.is_null()));
     }
+
+    // `err.to_string()` depends on backtrace being present (may have backtrace appended)
+    // `err.strip_backtrace()` also depends on backtrace being present (may have "This was likely caused by ..." stripped)
+    fn assert_starts_with(actual: impl AsRef<str>, expected_prefix: impl AsRef<str>) {
+        let actual = actual.as_ref();
+        let expected_prefix = expected_prefix.as_ref();
+        assert!(
+            actual.starts_with(expected_prefix),
+            "Expected '{actual}' to start with '{expected_prefix}'"
+        );
+    }
+
+    #[test]
+    fn test_new_default() {
+        // Test numeric types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Int32).unwrap(),
+            ScalarValue::Int32(Some(0))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Float64).unwrap(),
+            ScalarValue::Float64(Some(0.0))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Boolean).unwrap(),
+            ScalarValue::Boolean(Some(false))
+        );
+
+        // Test string types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Utf8).unwrap(),
+            ScalarValue::Utf8(Some("".to_string()))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::LargeUtf8).unwrap(),
+            ScalarValue::LargeUtf8(Some("".to_string()))
+        );
+
+        // Test binary types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Binary).unwrap(),
+            ScalarValue::Binary(Some(vec![]))
+        );
+
+        // Test fixed size binary
+        assert_eq!(
+            ScalarValue::new_default(&DataType::FixedSizeBinary(5)).unwrap(),
+            ScalarValue::FixedSizeBinary(5, Some(vec![0, 0, 0, 0, 0]))
+        );
+
+        // Test temporal types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Date32).unwrap(),
+            ScalarValue::Date32(Some(0))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Time32(TimeUnit::Second)).unwrap(),
+            ScalarValue::Time32Second(Some(0))
+        );
+
+        // Test decimal types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Decimal128(10, 2)).unwrap(),
+            ScalarValue::Decimal128(Some(0), 10, 2)
+        );
+
+        // Test list type
+        let list_field = Field::new_list_field(DataType::Int32, true);
+        let list_result =
+            ScalarValue::new_default(&DataType::List(Arc::new(list_field.clone())))
+                .unwrap();
+        match list_result {
+            ScalarValue::List(arr) => {
+                assert_eq!(arr.len(), 1);
+                assert_eq!(arr.value_length(0), 0); // empty list
+            }
+            _ => panic!("Expected List"),
+        }
+
+        // Test struct type
+        let struct_fields = Fields::from(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false),
+        ]);
+        let struct_result =
+            ScalarValue::new_default(&DataType::Struct(struct_fields.clone())).unwrap();
+        match struct_result {
+            ScalarValue::Struct(arr) => {
+                assert_eq!(arr.len(), 1);
+                assert_eq!(arr.column(0).as_primitive::<Int32Type>().value(0), 0);
+                assert_eq!(arr.column(1).as_string::<i32>().value(0), "");
+            }
+            _ => panic!("Expected Struct"),
+        }
+
+        // Test union type
+        let union_fields = UnionFields::try_new(
+            vec![0, 1],
+            vec![
+                Field::new("i32", DataType::Int32, false),
+                Field::new("f64", DataType::Float64, false),
+            ],
+        )
+        .unwrap();
+        let union_result = ScalarValue::new_default(&DataType::Union(
+            union_fields.clone(),
+            UnionMode::Sparse,
+        ))
+        .unwrap();
+        match union_result {
+            ScalarValue::Union(Some((type_id, value)), _, _) => {
+                assert_eq!(type_id, 0);
+                assert_eq!(*value, ScalarValue::Int32(Some(0)));
+            }
+            _ => panic!("Expected Union"),
+        }
+    }
+
+    #[test]
+    fn test_scalar_min() {
+        // Test integer types
+        assert_eq!(
+            ScalarValue::min(&DataType::Int8),
+            Some(ScalarValue::Int8(Some(i8::MIN)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::Int32),
+            Some(ScalarValue::Int32(Some(i32::MIN)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::UInt8),
+            Some(ScalarValue::UInt8(Some(0)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::UInt64),
+            Some(ScalarValue::UInt64(Some(0)))
+        );
+
+        // Test float types
+        assert_eq!(
+            ScalarValue::min(&DataType::Float32),
+            Some(ScalarValue::Float32(Some(f32::NEG_INFINITY)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::Float64),
+            Some(ScalarValue::Float64(Some(f64::NEG_INFINITY)))
+        );
+
+        // Test decimal types
+        let decimal_min = ScalarValue::min(&DataType::Decimal128(5, 2)).unwrap();
+        match decimal_min {
+            ScalarValue::Decimal128(Some(val), 5, 2) => {
+                assert_eq!(val, -99999); // -999.99 with scale 2
+            }
+            _ => panic!("Expected Decimal128"),
+        }
+
+        // Test temporal types
+        assert_eq!(
+            ScalarValue::min(&DataType::Date32),
+            Some(ScalarValue::Date32(Some(i32::MIN)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::Time32(TimeUnit::Second)),
+            Some(ScalarValue::Time32Second(Some(0)))
+        );
+        assert_eq!(
+            ScalarValue::min(&DataType::Timestamp(TimeUnit::Nanosecond, None)),
+            Some(ScalarValue::TimestampNanosecond(Some(i64::MIN), None))
+        );
+
+        // Test duration types
+        assert_eq!(
+            ScalarValue::min(&DataType::Duration(TimeUnit::Second)),
+            Some(ScalarValue::DurationSecond(Some(i64::MIN)))
+        );
+
+        // Test unsupported types
+        assert_eq!(ScalarValue::min(&DataType::Utf8), None);
+        assert_eq!(ScalarValue::min(&DataType::Binary), None);
+        assert_eq!(
+            ScalarValue::min(&DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Int32,
+                true
+            )))),
+            None
+        );
+    }
+
+    #[test]
+    fn test_scalar_max() {
+        // Test integer types
+        assert_eq!(
+            ScalarValue::max(&DataType::Int8),
+            Some(ScalarValue::Int8(Some(i8::MAX)))
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::Int32),
+            Some(ScalarValue::Int32(Some(i32::MAX)))
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::UInt8),
+            Some(ScalarValue::UInt8(Some(u8::MAX)))
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::UInt64),
+            Some(ScalarValue::UInt64(Some(u64::MAX)))
+        );
+
+        // Test float types
+        assert_eq!(
+            ScalarValue::max(&DataType::Float32),
+            Some(ScalarValue::Float32(Some(f32::INFINITY)))
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::Float64),
+            Some(ScalarValue::Float64(Some(f64::INFINITY)))
+        );
+
+        // Test decimal types
+        let decimal_max = ScalarValue::max(&DataType::Decimal128(5, 2)).unwrap();
+        match decimal_max {
+            ScalarValue::Decimal128(Some(val), 5, 2) => {
+                assert_eq!(val, 99999); // 999.99 with scale 2
+            }
+            _ => panic!("Expected Decimal128"),
+        }
+
+        // Test temporal types
+        assert_eq!(
+            ScalarValue::max(&DataType::Date32),
+            Some(ScalarValue::Date32(Some(i32::MAX)))
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::Time32(TimeUnit::Second)),
+            Some(ScalarValue::Time32Second(Some(86_399))) // 23:59:59
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::Time64(TimeUnit::Microsecond)),
+            Some(ScalarValue::Time64Microsecond(Some(86_399_999_999))) // 23:59:59.999999
+        );
+        assert_eq!(
+            ScalarValue::max(&DataType::Timestamp(TimeUnit::Nanosecond, None)),
+            Some(ScalarValue::TimestampNanosecond(Some(i64::MAX), None))
+        );
+
+        // Test duration types
+        assert_eq!(
+            ScalarValue::max(&DataType::Duration(TimeUnit::Millisecond)),
+            Some(ScalarValue::DurationMillisecond(Some(i64::MAX)))
+        );
+
+        // Test unsupported types
+        assert_eq!(ScalarValue::max(&DataType::Utf8), None);
+        assert_eq!(ScalarValue::max(&DataType::Binary), None);
+        assert_eq!(
+            ScalarValue::max(&DataType::Struct(Fields::from(vec![Field::new(
+                "field",
+                DataType::Int32,
+                true
+            )]))),
+            None
+        );
+    }
+
+    #[test]
+    fn test_min_max_float16() {
+        // Test Float16 min and max
+        let min_f16 = ScalarValue::min(&DataType::Float16).unwrap();
+        match min_f16 {
+            ScalarValue::Float16(Some(val)) => {
+                assert_eq!(val, f16::NEG_INFINITY);
+            }
+            _ => panic!("Expected Float16"),
+        }
+
+        let max_f16 = ScalarValue::max(&DataType::Float16).unwrap();
+        match max_f16 {
+            ScalarValue::Float16(Some(val)) => {
+                assert_eq!(val, f16::INFINITY);
+            }
+            _ => panic!("Expected Float16"),
+        }
+    }
+
+    #[test]
+    fn test_new_default_interval() {
+        // Test all interval types
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Interval(IntervalUnit::YearMonth))
+                .unwrap(),
+            ScalarValue::IntervalYearMonth(Some(0))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Interval(IntervalUnit::DayTime)).unwrap(),
+            ScalarValue::IntervalDayTime(Some(IntervalDayTime::ZERO))
+        );
+        assert_eq!(
+            ScalarValue::new_default(&DataType::Interval(IntervalUnit::MonthDayNano))
+                .unwrap(),
+            ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano::ZERO))
+        );
+    }
+
+    #[test]
+    fn test_min_max_with_timezone() {
+        let tz = Some(Arc::from("UTC"));
+
+        // Test timestamp with timezone
+        let min_ts =
+            ScalarValue::min(&DataType::Timestamp(TimeUnit::Second, tz.clone())).unwrap();
+        match min_ts {
+            ScalarValue::TimestampSecond(Some(val), Some(tz_str)) => {
+                assert_eq!(val, i64::MIN);
+                assert_eq!(tz_str.as_ref(), "UTC");
+            }
+            _ => panic!("Expected TimestampSecond with timezone"),
+        }
+
+        let max_ts =
+            ScalarValue::max(&DataType::Timestamp(TimeUnit::Millisecond, tz.clone()))
+                .unwrap();
+        match max_ts {
+            ScalarValue::TimestampMillisecond(Some(val), Some(tz_str)) => {
+                assert_eq!(val, i64::MAX);
+                assert_eq!(tz_str.as_ref(), "UTC");
+            }
+            _ => panic!("Expected TimestampMillisecond with timezone"),
+        }
+    }
+
+    #[test]
+    fn test_views_minimize_memory() {
+        let value = "this string is longer than 12 bytes".to_string();
+
+        let scalar = ScalarValue::Utf8View(Some(value.clone()));
+        let array = scalar.to_array_of_size(10).unwrap();
+        let array = array.as_string_view();
+        let buffers = array.data_buffers();
+        assert_eq!(1, buffers.len());
+        // Ensure we only have a single copy of the value string
+        assert_eq!(value.len(), buffers[0].len());
+
+        // Same but for BinaryView
+        let scalar = ScalarValue::BinaryView(Some(value.bytes().collect()));
+        let array = scalar.to_array_of_size(10).unwrap();
+        let array = array.as_binary_view();
+        let buffers = array.data_buffers();
+        assert_eq!(1, buffers.len());
+        assert_eq!(value.len(), buffers[0].len());
+    }
+
+    #[test]
+    fn test_to_array_of_size_run_end_encoded() {
+        fn run_test<R: RunEndIndexType>() {
+            let value = Box::new(ScalarValue::Float32(Some(1.0)));
+            let size = 5;
+            let scalar = ScalarValue::RunEndEncoded(
+                Field::new("run_ends", R::DATA_TYPE, false).into(),
+                Field::new("values", DataType::Float32, true).into(),
+                value.clone(),
+            );
+            let array = scalar.to_array_of_size(size).unwrap();
+            let array = array.as_run::<R>();
+            let array = array.downcast::<Float32Array>().unwrap();
+            assert_eq!(vec![Some(1.0); size], array.into_iter().collect::<Vec<_>>());
+            assert_eq!(1, array.values().len());
+        }
+
+        run_test::<Int16Type>();
+        run_test::<Int32Type>();
+        run_test::<Int64Type>();
+
+        let scalar = ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int16, false).into(),
+            Field::new("values", DataType::Float32, true).into(),
+            Box::new(ScalarValue::Float32(Some(1.0))),
+        );
+        let err = scalar.to_array_of_size(i16::MAX as usize + 10).unwrap_err();
+        assert_eq!(
+            "Execution error: Cannot construct RunArray of size 32777: Overflows run-ends type Int16",
+            err.to_string()
+        )
+    }
+
+    #[test]
+    fn test_eq_array_run_end_encoded() {
+        let run_ends = Int16Array::from(vec![1, 3]);
+        let values = Float32Array::from(vec![None, Some(1.0)]);
+        let run_array =
+            Arc::new(RunArray::try_new(&run_ends, &values).unwrap()) as ArrayRef;
+
+        let scalar = ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int16, false).into(),
+            Field::new("values", DataType::Float32, true).into(),
+            Box::new(ScalarValue::Float32(None)),
+        );
+        assert!(scalar.eq_array(&run_array, 0).unwrap());
+
+        let scalar = ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int16, false).into(),
+            Field::new("values", DataType::Float32, true).into(),
+            Box::new(ScalarValue::Float32(Some(1.0))),
+        );
+        assert!(scalar.eq_array(&run_array, 1).unwrap());
+        assert!(scalar.eq_array(&run_array, 2).unwrap());
+
+        // value types must match
+        let scalar = ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int16, false).into(),
+            Field::new("values", DataType::Float64, true).into(),
+            Box::new(ScalarValue::Float64(Some(1.0))),
+        );
+        let err = scalar.eq_array(&run_array, 1).unwrap_err();
+        let expected = "Internal error: could not cast array of type Float32 to arrow_array::array::primitive_array::PrimitiveArray<arrow_array::types::Float64Type>";
+        assert!(err.to_string().starts_with(expected));
+
+        // run ends type must match
+        let scalar = ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int32, false).into(),
+            Field::new("values", DataType::Float32, true).into(),
+            Box::new(ScalarValue::Float32(None)),
+        );
+        let err = scalar.eq_array(&run_array, 0).unwrap_err();
+        let expected = "Internal error: could not cast array of type RunEndEncoded(\"run_ends\": non-null Int16, \"values\": Float32) to arrow_array::array::run_array::RunArray<arrow_array::types::Int32Type>";
+        assert!(err.to_string().starts_with(expected));
+    }
+
+    #[test]
+    fn test_iter_to_array_run_end_encoded() {
+        let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int16, false));
+        let values_field = Arc::new(Field::new("values", DataType::Int64, true));
+        let scalars = vec![
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(None)),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(2))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(2))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(2))),
+            ),
+        ];
+
+        let run_array = ScalarValue::iter_to_array(scalars).unwrap();
+        let expected = RunArray::try_new(
+            &Int16Array::from(vec![2, 3, 6]),
+            &Int64Array::from(vec![Some(1), None, Some(2)]),
+        )
+        .unwrap();
+        assert_eq!(&expected as &dyn Array, run_array.as_ref());
+
+        // inconsistent run-ends type
+        let scalars = vec![
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Field::new("run_ends", DataType::Int32, false).into(),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+        ];
+        let err = ScalarValue::iter_to_array(scalars).unwrap_err();
+        let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: RunEndEncoded(Field { name: \"run_ends\", data_type: Int32 }, Field { name: \"values\", data_type: Int64, nullable: true }, Int64(1))";
+        assert!(err.to_string().starts_with(expected));
+
+        // inconsistent value type
+        let scalars = vec![
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Field::new("values", DataType::Int32, true).into(),
+                Box::new(ScalarValue::Int32(Some(1))),
+            ),
+        ];
+        let err = ScalarValue::iter_to_array(scalars).unwrap_err();
+        let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: RunEndEncoded(Field { name: \"run_ends\", data_type: Int16 }, Field { name: \"values\", data_type: Int32, nullable: true }, Int32(1))";
+        assert!(err.to_string().starts_with(expected));
+
+        // inconsistent scalars type
+        let scalars = vec![
+            ScalarValue::RunEndEncoded(
+                Arc::clone(&run_ends_field),
+                Arc::clone(&values_field),
+                Box::new(ScalarValue::Int64(Some(1))),
+            ),
+            ScalarValue::Int64(Some(1)),
+        ];
+        let err = ScalarValue::iter_to_array(scalars).unwrap_err();
+        let expected = "Execution error: Expected RunEndEncoded scalar with run-ends field Field { \"run_ends\": Int16 } but got: Int64(1)";
+        assert!(err.to_string().starts_with(expected));
+    }
+
+    #[test]
+    fn test_convert_array_to_scalar_vec() {
+        // 1: Regular ListArray
+        let list = ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), None, Some(4)]),
+        ]);
+        let converted = ScalarValue::convert_array_to_scalar_vec(&list).unwrap();
+        assert_eq!(
+            converted,
+            vec![
+                Some(vec![
+                    ScalarValue::Int64(Some(1)),
+                    ScalarValue::Int64(Some(2))
+                ]),
+                None,
+                Some(vec![
+                    ScalarValue::Int64(Some(3)),
+                    ScalarValue::Int64(None),
+                    ScalarValue::Int64(Some(4))
+                ]),
+            ]
+        );
+
+        // 2: Regular LargeListArray
+        let large_list = LargeListArray::from_iter_primitive::<Int64Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), None, Some(4)]),
+        ]);
+        let converted = ScalarValue::convert_array_to_scalar_vec(&large_list).unwrap();
+        assert_eq!(
+            converted,
+            vec![
+                Some(vec![
+                    ScalarValue::Int64(Some(1)),
+                    ScalarValue::Int64(Some(2))
+                ]),
+                None,
+                Some(vec![
+                    ScalarValue::Int64(Some(3)),
+                    ScalarValue::Int64(None),
+                    ScalarValue::Int64(Some(4))
+                ]),
+            ]
+        );
+
+        // 3: Funky (null slot has non-zero list offsets)
+        // Offsets + Values looks like this: [[1, 2], [3, 4], [5]]
+        // But with NullBuffer it's like this: [[1, 2], NULL, [5]]
+        let funky = ListArray::new(
+            Field::new_list_field(DataType::Int64, true).into(),
+            OffsetBuffer::new(vec![0, 2, 4, 5].into()),
+            Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])),
+            Some(NullBuffer::from(vec![true, false, true])),
+        );
+        let converted = ScalarValue::convert_array_to_scalar_vec(&funky).unwrap();
+        assert_eq!(
+            converted,
+            vec![
+                Some(vec![
+                    ScalarValue::Int64(Some(1)),
+                    ScalarValue::Int64(Some(2))
+                ]),
+                None,
+                Some(vec![ScalarValue::Int64(Some(5))]),
+            ]
+        );
+
+        // 4: Offsets + Values looks like this: [[1, 2], [], [5]]
+        // But with NullBuffer it's like this: [[1, 2], NULL, [5]]
+        // The converted result is: [[1, 2], None, [5]]
+        let array4 = ListArray::new(
+            Field::new_list_field(DataType::Int64, true).into(),
+            OffsetBuffer::new(vec![0, 2, 2, 5].into()),
+            Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])),
+            Some(NullBuffer::from(vec![true, false, true])),
+        );
+        let converted = ScalarValue::convert_array_to_scalar_vec(&array4).unwrap();
+        assert_eq!(
+            converted,
+            vec![
+                Some(vec![
+                    ScalarValue::Int64(Some(1)),
+                    ScalarValue::Int64(Some(2))
+                ]),
+                None,
+                Some(vec![
+                    ScalarValue::Int64(Some(3)),
+                    ScalarValue::Int64(Some(4)),
+                    ScalarValue::Int64(Some(5)),
+                ]),
+            ]
+        );
+
+        // 5: Offsets + Values looks like this: [[1, 2], [], [5]]
+        // Same as 4, but the middle array is not null, so after conversion it's empty.
+        let array5 = ListArray::new(
+            Field::new_list_field(DataType::Int64, true).into(),
+            OffsetBuffer::new(vec![0, 2, 2, 5].into()),
+            Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])),
+            Some(NullBuffer::from(vec![true, true, true])),
+        );
+        let converted = ScalarValue::convert_array_to_scalar_vec(&array5).unwrap();
+        assert_eq!(
+            converted,
+            vec![
+                Some(vec![
+                    ScalarValue::Int64(Some(1)),
+                    ScalarValue::Int64(Some(2))
+                ]),
+                Some(vec![]),
+                Some(vec![
+                    ScalarValue::Int64(Some(3)),
+                    ScalarValue::Int64(Some(4)),
+                    ScalarValue::Int64(Some(5)),
+                ]),
+            ]
+        );
+    }
 }
diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs
index fd19dccf89636..045b5778243df 100644
--- a/datafusion/common/src/scalar/struct_builder.rs
+++ b/datafusion/common/src/scalar/struct_builder.rs
@@ -47,13 +47,11 @@ impl ScalarStructBuilder {
     /// ```rust
     /// # use arrow::datatypes::{DataType, Field};
     /// # use datafusion_common::scalar::ScalarStructBuilder;
-    /// let fields = vec![
-    ///    Field::new("a", DataType::Int32, false),
-    /// ];
+    /// let fields = vec![Field::new("a", DataType::Int32, false)];
     /// let sv = ScalarStructBuilder::new_null(fields);
     /// // Note this is `NULL`, not `{a: NULL}`
     /// assert_eq!(format!("{sv}"), "NULL");
-    ///```
+    /// ```
     ///
     /// To create a struct where the *fields* are null, use `Self::new()` and
     /// pass null values for each field:
@@ -65,9 +63,9 @@ impl ScalarStructBuilder {
     /// let field = Field::new("a", DataType::Int32, true);
     /// // add a null value for the "a" field
     /// let sv = ScalarStructBuilder::new()
-    ///   .with_scalar(field, ScalarValue::Int32(None))
-    ///   .build()
-    ///   .unwrap();
+    ///     .with_scalar(field, ScalarValue::Int32(None))
+    ///     .build()
+    ///     .unwrap();
     /// // value is not null, but field is
     /// assert_eq!(format!("{sv}"), "{a:}");
     /// ```
@@ -85,6 +83,7 @@ impl ScalarStructBuilder {
     }
 
     /// Add the specified field and `ScalarValue` to the struct.
+    #[expect(clippy::needless_pass_by_value)] // Skip for public API's compatibility
     pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self {
         // valid scalar value should not fail
         let array = value.to_array().unwrap();
diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs
index 5111e264123ce..c0b52977e14a9 100644
--- a/datafusion/common/src/spans.rs
+++ b/datafusion/common/src/spans.rs
@@ -39,6 +39,7 @@ impl fmt::Debug for Location {
     }
 }
 
+#[cfg(feature = "sql")]
 impl From<sqlparser::tokenizer::Location> for Location {
     fn from(value: sqlparser::tokenizer::Location) -> Self {
         Self {
@@ -70,6 +71,7 @@ impl Span {
     /// Convert a [`Span`](sqlparser::tokenizer::Span) from the parser, into a
     /// DataFusion [`Span`]. If the input span is empty (line 0 column 0, to
     /// line 0 column 0), then [`None`] is returned.
+    #[cfg(feature = "sql")]
     pub fn try_from_sqlparser_span(span: sqlparser::tokenizer::Span) -> Option<Span> {
         if span == sqlparser::tokenizer::Span::empty() {
             None
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index a6d132ef51f6a..f263c905faf6b 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -22,17 +22,40 @@ use std::fmt::{self, Debug, Display};
 use crate::{Result, ScalarValue};
 
 use crate::error::_plan_err;
-use arrow::datatypes::{DataType, Schema, SchemaRef};
+use crate::utils::aggregate::precision_add;
+use arrow::datatypes::{DataType, Schema};
 
 /// Represents a value with a degree of certainty. `Precision` is used to
 /// propagate information the precision of statistical values.
 #[derive(Clone, PartialEq, Eq, Default, Copy)]
 pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
-    /// The exact value is known
+    /// The exact value is known. Used for guaranteeing correctness.
+    ///
+    /// Comes from definitive sources such as:
+    /// - Parquet file metadata (row counts, byte sizes)
+    /// - In-memory RecordBatch data (actual row counts, byte sizes, null counts)
+    /// - and more...
     Exact(T),
-    /// The value is not known exactly, but is likely close to this value
+    /// The value is not known exactly, but is likely close to this value.
+    /// Used for cost-based optimizations.
+    ///
+    /// Some operations that would result in `Inexact(T)` would be:
+    /// - Applying a filter (selectivity is unknown)
+    /// - Mixing exact and inexact values in arithmetic
+    /// - and more...
     Inexact(T),
-    /// Nothing is known about the value
+    /// Nothing is known about the value. This is the default state.
+    ///
+    /// Acts as an absorbing element in arithmetic -> any operation
+    /// involving `Absent` yields `Absent`. [`Precision::to_inexact`]
+    /// on `Absent` returns `Absent`, not `Inexact` — it represents
+    /// a fundamentally different state.
+    ///
+    /// Common sources include:
+    /// - Data sources without statistics
+    /// - Parquet columns missing from file metadata
+    /// - Statistics that cannot be derived for an operation (e.g.,
+    ///   `distinct_count` after a union, `total_byte_size` for joins)
     #[default]
     Absent,
 }
@@ -120,10 +143,15 @@ impl Precision<usize> {
     /// values is [`Precision::Absent`], the result is `Absent` too.
     pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
         match (self, other) {
-            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a + b),
+            (Precision::Exact(a), Precision::Exact(b)) => a.checked_add(*b).map_or_else(
+                || Precision::Inexact(a.saturating_add(*b)),
+                Precision::Exact,
+            ),
             (Precision::Inexact(a), Precision::Exact(b))
             | (Precision::Exact(a), Precision::Inexact(b))
-            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a + b),
+            | (Precision::Inexact(a), Precision::Inexact(b)) => {
+                Precision::Inexact(a.saturating_add(*b))
+            }
             (_, _) => Precision::Absent,
         }
     }
@@ -133,10 +161,15 @@ impl Precision<usize> {
     /// values is [`Precision::Absent`], the result is `Absent` too.
     pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
         match (self, other) {
-            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a - b),
+            (Precision::Exact(a), Precision::Exact(b)) => a.checked_sub(*b).map_or_else(
+                || Precision::Inexact(a.saturating_sub(*b)),
+                Precision::Exact,
+            ),
             (Precision::Inexact(a), Precision::Exact(b))
             | (Precision::Exact(a), Precision::Inexact(b))
-            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a - b),
+            | (Precision::Inexact(a), Precision::Inexact(b)) => {
+                Precision::Inexact(a.saturating_sub(*b))
+            }
             (_, _) => Precision::Absent,
         }
     }
@@ -146,10 +179,15 @@ impl Precision<usize> {
     /// values is [`Precision::Absent`], the result is `Absent` too.
     pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
         match (self, other) {
-            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a * b),
+            (Precision::Exact(a), Precision::Exact(b)) => a.checked_mul(*b).map_or_else(
+                || Precision::Inexact(a.saturating_mul(*b)),
+                Precision::Exact,
+            ),
             (Precision::Inexact(a), Precision::Exact(b))
             | (Precision::Exact(a), Precision::Inexact(b))
-            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a * b),
+            | (Precision::Inexact(a), Precision::Inexact(b)) => {
+                Precision::Inexact(a.saturating_mul(*b))
+            }
             (_, _) => Precision::Absent,
         }
     }
@@ -168,15 +206,22 @@ impl Precision<ScalarValue> {
     /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
     /// conservatively propagating exactness information. If one of the input
     /// values is [`Precision::Absent`], the result is `Absent` too.
+    ///
+    /// Uses [`ScalarValue::add_checked`] so that integer overflow returns
+    /// an error (mapped to `Absent`) instead of silently wrapping.
+    ///
+    /// For performance-sensitive paths prefer `precision_add` which
+    /// avoids the Arrow array round-trip.
     pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
         match (self, other) {
-            (Precision::Exact(a), Precision::Exact(b)) => {
-                a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
-            }
+            (Precision::Exact(a), Precision::Exact(b)) => a
+                .add_checked(b)
+                .map(Precision::Exact)
+                .unwrap_or(Precision::Absent),
             (Precision::Inexact(a), Precision::Exact(b))
             | (Precision::Exact(a), Precision::Inexact(b))
             | (Precision::Inexact(a), Precision::Inexact(b)) => a
-                .add(b)
+                .add_checked(b)
                 .map(Precision::Inexact)
                 .unwrap_or(Precision::Absent),
             (_, _) => Precision::Absent,
@@ -268,9 +313,14 @@ impl From<Precision<usize>> for Precision<ScalarValue> {
 /// and the transformations output are not always predictable.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Statistics {
-    /// The number of table rows.
+    /// The number of rows estimated to be scanned.
     pub num_rows: Precision<usize>,
-    /// Total bytes of the table rows.
+    /// The total bytes of the output data.
+    ///
+    /// Note that this is not the same as the total bytes that may be scanned,
+    /// processed, etc.
+    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
+    /// the node produces may be 2GB; it's this 2GB that is tracked here.
     pub total_byte_size: Precision<usize>,
     /// Statistics on a column level.
     ///
@@ -302,6 +352,31 @@ impl Statistics {
         }
     }
 
+    /// Calculates `total_byte_size` based on the schema and `num_rows`.
+    /// If any of the columns has non-primitive width, `total_byte_size` is set to inexact.
+    pub fn calculate_total_byte_size(&mut self, schema: &Schema) {
+        let mut row_size = Some(0);
+        for field in schema.fields() {
+            match field.data_type().primitive_width() {
+                Some(width) => {
+                    row_size = row_size.map(|s| s + width);
+                }
+                None => {
+                    row_size = None;
+                    break;
+                }
+            }
+        }
+        match row_size {
+            None => {
+                self.total_byte_size = self.total_byte_size.to_inexact();
+            }
+            Some(size) => {
+                self.total_byte_size = self.num_rows.multiply(&Precision::Exact(size));
+            }
+        }
+    }
+
     /// Returns an unbounded `ColumnStatistics` for each field in the schema.
     pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
         schema
@@ -347,12 +422,17 @@ impl Statistics {
     /// For example, if we had statistics for columns `{"a", "b", "c"}`,
     /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
     /// "b"}`.
-    pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
-        let Some(projection) = projection else {
+    pub fn project(self, projection: Option<&impl AsRef<[usize]>>) -> Self {
+        let projection = projection.map(AsRef::as_ref);
+        self.project_impl(projection)
+    }
+
+    fn project_impl(mut self, projection: Option<&[usize]>) -> Self {
+        let Some(projection) = projection.map(AsRef::as_ref) else {
             return self;
         };
 
-        #[allow(clippy::large_enum_variant)]
+        #[expect(clippy::large_enum_variant)]
         enum Slot {
             /// The column is taken and put into the specified statistics location
             Taken(usize),
@@ -366,7 +446,7 @@ impl Statistics {
             .map(Slot::Present)
             .collect();
 
-        for idx in projection {
+        for idx in projection.iter() {
             let next_idx = self.column_statistics.len();
             let slot = std::mem::replace(
                 columns.get_mut(*idx).expect("projection out of bounds"),
@@ -391,13 +471,15 @@ impl Statistics {
     /// parameter to compute global statistics in a multi-partition setting.
     pub fn with_fetch(
         mut self,
-        schema: SchemaRef,
         fetch: Option<usize>,
         skip: usize,
         n_partitions: usize,
     ) -> Result<Self> {
         let fetch_val = fetch.unwrap_or(usize::MAX);
 
+        // Get the ratio of rows after / rows before on a per-partition basis
+        let num_rows_before = self.num_rows;
+
         self.num_rows = match self {
             Statistics {
                 num_rows: Precision::Exact(nr),
@@ -431,8 +513,7 @@ impl Statistics {
                     // At this point we know that we were given a `fetch` value
                     // as the `None` case would go into the branch above. Since
                     // the input has more rows than `fetch + skip`, the number
-                    // of rows will be the `fetch`, but we won't be able to
-                    // predict the other statistics.
+                    // of rows will be the `fetch`, other statistics will have to be downgraded to inexact.
                     check_num_rows(
                         fetch_val.checked_mul(n_partitions),
                         // We know that we have an estimate for the number of rows:
@@ -445,8 +526,55 @@ impl Statistics {
                 ..
             } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
         };
-        self.column_statistics = Statistics::unknown_column(&schema);
-        self.total_byte_size = Precision::Absent;
+        let ratio: f64 = match (num_rows_before, self.num_rows) {
+            (
+                Precision::Exact(nr_before) | Precision::Inexact(nr_before),
+                Precision::Exact(nr_after) | Precision::Inexact(nr_after),
+            ) => {
+                if nr_before == 0 {
+                    0.0
+                } else {
+                    nr_after as f64 / nr_before as f64
+                }
+            }
+            _ => 0.0,
+        };
+        self.column_statistics = self
+            .column_statistics
+            .into_iter()
+            .map(|cs| {
+                let mut cs = cs.to_inexact();
+                // Scale byte_size by the row ratio
+                cs.byte_size = match cs.byte_size {
+                    Precision::Exact(n) | Precision::Inexact(n) => {
+                        Precision::Inexact((n as f64 * ratio) as usize)
+                    }
+                    Precision::Absent => Precision::Absent,
+                };
+                cs
+            })
+            .collect();
+
+        // Compute total_byte_size as sum of column byte_size values if all are present,
+        // otherwise fall back to scaling the original total_byte_size
+        let sum_scan_bytes: Option<usize> = self
+            .column_statistics
+            .iter()
+            .map(|cs| cs.byte_size.get_value().copied())
+            .try_fold(0usize, |acc, val| val.map(|v| acc + v));
+
+        self.total_byte_size = match sum_scan_bytes {
+            Some(sum) => Precision::Inexact(sum),
+            None => {
+                // Fall back to scaling original total_byte_size if not all columns have byte_size
+                match &self.total_byte_size {
+                    Precision::Exact(n) | Precision::Inexact(n) => {
+                        Precision::Inexact((*n as f64 * ratio) as usize)
+                    }
+                    Precision::Absent => Precision::Absent,
+                }
+            }
+        };
         Ok(self)
     }
 
@@ -456,23 +584,6 @@ impl Statistics {
     /// If not, maybe you can call `SchemaMapper::map_column_statistics` to make them consistent.
     ///
     /// Returns an error if the statistics do not match the specified schemas.
-    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
-    where
-        I: IntoIterator<Item = &'a Statistics>,
-    {
-        let mut items = items.into_iter();
-
-        let Some(init) = items.next() else {
-            return Ok(Statistics::new_unknown(schema));
-        };
-        items.try_fold(init.clone(), |acc: Statistics, item_stats: &Statistics| {
-            acc.try_merge(item_stats)
-        })
-    }
-
-    /// Merge this Statistics value with another Statistics value.
-    ///
-    /// Returns an error if the statistics do not match (different schemas).
     ///
     /// # Example
     /// ```
@@ -480,64 +591,113 @@ impl Statistics {
     /// # use arrow::datatypes::{Field, Schema, DataType};
     /// # use datafusion_common::stats::Precision;
     /// let stats1 = Statistics::default()
-    ///   .with_num_rows(Precision::Exact(1))
-    ///   .with_total_byte_size(Precision::Exact(2))
-    ///   .add_column_statistics(ColumnStatistics::new_unknown()
-    ///      .with_null_count(Precision::Exact(3))
-    ///      .with_min_value(Precision::Exact(ScalarValue::from(4)))
-    ///      .with_max_value(Precision::Exact(ScalarValue::from(5)))
-    ///   );
+    ///     .with_num_rows(Precision::Exact(10))
+    ///     .add_column_statistics(
+    ///         ColumnStatistics::new_unknown()
+    ///             .with_min_value(Precision::Exact(ScalarValue::from(1)))
+    ///             .with_max_value(Precision::Exact(ScalarValue::from(100)))
+    ///             .with_sum_value(Precision::Exact(ScalarValue::from(500))),
+    ///     );
     ///
     /// let stats2 = Statistics::default()
-    ///   .with_num_rows(Precision::Exact(10))
-    ///   .with_total_byte_size(Precision::Inexact(20))
-    ///   .add_column_statistics(ColumnStatistics::new_unknown()
-    ///       // absent null count
-    ///      .with_min_value(Precision::Exact(ScalarValue::from(40)))
-    ///      .with_max_value(Precision::Exact(ScalarValue::from(50)))
-    ///   );
+    ///     .with_num_rows(Precision::Exact(20))
+    ///     .add_column_statistics(
+    ///         ColumnStatistics::new_unknown()
+    ///             .with_min_value(Precision::Exact(ScalarValue::from(5)))
+    ///             .with_max_value(Precision::Exact(ScalarValue::from(200)))
+    ///             .with_sum_value(Precision::Exact(ScalarValue::from(1000))),
+    ///     );
     ///
-    /// let merged_stats = stats1.try_merge(&stats2).unwrap();
-    /// let expected_stats = Statistics::default()
-    ///   .with_num_rows(Precision::Exact(11))
-    ///   .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact
-    ///   .add_column_statistics(
-    ///     ColumnStatistics::new_unknown()
-    ///       .with_null_count(Precision::Absent) // missing from stats2 --> absent
-    ///       .with_min_value(Precision::Exact(ScalarValue::from(4)))
-    ///       .with_max_value(Precision::Exact(ScalarValue::from(50)))
-    ///   );
+    /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+    /// let merged = Statistics::try_merge_iter(
+    ///     &[stats1, stats2],
+    ///     &schema,
+    /// ).unwrap();
     ///
-    /// assert_eq!(merged_stats, expected_stats)
+    /// assert_eq!(merged.num_rows, Precision::Exact(30));
+    /// assert_eq!(merged.column_statistics[0].min_value,
+    ///     Precision::Exact(ScalarValue::from(1)));
+    /// assert_eq!(merged.column_statistics[0].max_value,
+    ///     Precision::Exact(ScalarValue::from(200)));
+    /// assert_eq!(merged.column_statistics[0].sum_value,
+    ///     Precision::Exact(ScalarValue::from(1500)));
     /// ```
-    pub fn try_merge(self, other: &Statistics) -> Result<Self> {
-        let Self {
-            mut num_rows,
-            mut total_byte_size,
-            mut column_statistics,
-        } = self;
-
-        // Accumulate statistics for subsequent items
-        num_rows = num_rows.add(&other.num_rows);
-        total_byte_size = total_byte_size.add(&other.total_byte_size);
-
-        if column_statistics.len() != other.column_statistics.len() {
-            return _plan_err!(
-                "Cannot merge statistics with different number of columns: {} vs {}",
-                column_statistics.len(),
-                other.column_statistics.len()
-            );
+    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> Result<Statistics>
+    where
+        I: IntoIterator<Item = &'a Statistics>,
+    {
+        let items: Vec<&Statistics> = items.into_iter().collect();
+
+        if items.is_empty() {
+            return Ok(Statistics::new_unknown(schema));
+        }
+        if items.len() == 1 {
+            return Ok(items[0].clone());
+        }
+
+        let num_cols = items[0].column_statistics.len();
+        // Validate all items have the same number of columns
+        for (i, stat) in items.iter().enumerate().skip(1) {
+            if stat.column_statistics.len() != num_cols {
+                return _plan_err!(
+                    "Cannot merge statistics with different number of columns: {} vs {} (item {})",
+                    num_cols,
+                    stat.column_statistics.len(),
+                    i
+                );
+            }
         }
 
-        for (item_col_stats, col_stats) in other
+        // Aggregate usize fields (cheap arithmetic)
+        let mut num_rows = Precision::Exact(0usize);
+        let mut total_byte_size = Precision::Exact(0usize);
+        for stat in &items {
+            num_rows = num_rows.add(&stat.num_rows);
+            total_byte_size = total_byte_size.add(&stat.total_byte_size);
+        }
+
+        let first = items[0];
+        let mut column_statistics: Vec<ColumnStatistics> = first
             .column_statistics
             .iter()
-            .zip(column_statistics.iter_mut())
-        {
-            col_stats.null_count = col_stats.null_count.add(&item_col_stats.null_count);
-            col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
-            col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
-            col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
+            .map(|cs| ColumnStatistics {
+                null_count: cs.null_count,
+                max_value: cs.max_value.clone(),
+                min_value: cs.min_value.clone(),
+                sum_value: cs.sum_value.clone(),
+                distinct_count: cs.distinct_count,
+                byte_size: cs.byte_size,
+            })
+            .collect();
+
+        // Accumulate all statistics in a single pass.
+        // Uses precision_add for sum (avoids the expensive
+        // ScalarValue::add round-trip through Arrow arrays), and
+        // Precision::min/max which use cheap PartialOrd comparison.
+        for stat in items.iter().skip(1) {
+            for (col_idx, col_stats) in column_statistics.iter_mut().enumerate() {
+                let item_cs = &stat.column_statistics[col_idx];
+
+                col_stats.null_count = col_stats.null_count.add(&item_cs.null_count);
+
+                // NDV must be computed before min/max update (needs pre-merge ranges)
+                col_stats.distinct_count = match (
+                    col_stats.distinct_count.get_value(),
+                    item_cs.distinct_count.get_value(),
+                ) {
+                    (Some(&l), Some(&r)) => Precision::Inexact(
+                        estimate_ndv_with_overlap(col_stats, item_cs, l, r)
+                            .unwrap_or_else(|| usize::max(l, r)),
+                    ),
+                    _ => Precision::Absent,
+                };
+
+                col_stats.min_value = col_stats.min_value.min(&item_cs.min_value);
+                col_stats.max_value = col_stats.max_value.max(&item_cs.max_value);
+                col_stats.sum_value =
+                    precision_add(&col_stats.sum_value, &item_cs.sum_value);
+                col_stats.byte_size = col_stats.byte_size.add(&item_cs.byte_size);
+            }
         }
 
         Ok(Statistics {
@@ -548,6 +708,96 @@ impl Statistics {
     }
 }
 
+/// Estimates the combined number of distinct values (NDV) when merging two
+/// column statistics, using range overlap to avoid double-counting shared values.
+///
+/// Assumes values are distributed uniformly within each input's
+/// `[min, max]` range (the standard assumption when only summary
+/// statistics are available). Under uniformity the fraction of an input's
+/// distinct values that land in a sub-range equals the fraction of
+/// the range that sub-range covers.
+///
+/// The combined value space is split into three disjoint regions:
+///
+/// ```text
+///   |-- only A --|-- overlap --|-- only B --|
+/// ```
+///
+/// * **Only in A/B** - values outside the other input's range
+///   contribute `(1 - overlap_a) * NDV_a` and `(1 - overlap_b) * NDV_b`.
+/// * **Overlap** - both inputs may produce values here. We take
+///   `max(overlap_a * NDV_a, overlap_b * NDV_b)` rather than the
+///   sum because values in the same sub-range are likely shared
+///   (the smaller set is assumed to be a subset of the larger).
+///
+/// The formula ranges between `[max(NDV_a, NDV_b), NDV_a + NDV_b]`,
+/// from full overlap to no overlap.
+///
+/// ```text
+/// NDV = max(overlap_a * NDV_a, overlap_b * NDV_b)   [intersection]
+///     + (1 - overlap_a) * NDV_a                      [only in A]
+///     + (1 - overlap_b) * NDV_b                      [only in B]
+/// ```
+///
+/// Returns `None` when min/max are absent or distance is unsupported
+/// (e.g. strings), in which case the caller should fall back to a simpler
+/// estimate.
+pub fn estimate_ndv_with_overlap(
+    left: &ColumnStatistics,
+    right: &ColumnStatistics,
+    ndv_left: usize,
+    ndv_right: usize,
+) -> Option<usize> {
+    let left_min = left.min_value.get_value()?;
+    let left_max = left.max_value.get_value()?;
+    let right_min = right.min_value.get_value()?;
+    let right_max = right.max_value.get_value()?;
+
+    let range_left = left_max.distance(left_min)?;
+    let range_right = right_max.distance(right_min)?;
+
+    // Constant columns (range == 0) can't use the proportional overlap
+    // formula below, so check interval overlap directly instead.
+    if range_left == 0 || range_right == 0 {
+        let overlaps = left_min <= right_max && right_min <= left_max;
+        return Some(if overlaps {
+            usize::max(ndv_left, ndv_right)
+        } else {
+            ndv_left + ndv_right
+        });
+    }
+
+    let overlap_min = if left_min >= right_min {
+        left_min
+    } else {
+        right_min
+    };
+    let overlap_max = if left_max <= right_max {
+        left_max
+    } else {
+        right_max
+    };
+
+    // Disjoint ranges: no overlap, NDVs are additive
+    if overlap_min > overlap_max {
+        return Some(ndv_left + ndv_right);
+    }
+
+    let overlap_range = overlap_max.distance(overlap_min)? as f64;
+
+    let overlap_left = overlap_range / range_left as f64;
+    let overlap_right = overlap_range / range_right as f64;
+
+    let intersection = f64::max(
+        overlap_left * ndv_left as f64,
+        overlap_right * ndv_right as f64,
+    );
+    let only_left = (1.0 - overlap_left) * ndv_left as f64;
+    let only_right = (1.0 - overlap_right) * ndv_right as f64;
+
+    Some((intersection + only_left + only_right).round() as usize)
+}
+
 /// Creates an estimate of the number of rows in the output using the given
 /// optional value and exactness flag.
 fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
@@ -599,6 +849,11 @@ impl Display for Statistics {
                 } else {
                     s
                 };
+                let s = if cs.byte_size != Precision::Absent {
+                    format!("{} ScanBytes={}", s, cs.byte_size)
+                } else {
+                    s
+                };
 
                 s + ")"
             })
@@ -628,6 +883,21 @@ pub struct ColumnStatistics {
     pub sum_value: Precision<ScalarValue>,
     /// Number of distinct values
     pub distinct_count: Precision<usize>,
+    /// Estimated size of this column's data in bytes for the output.
+    ///
+    /// Note that this is not the same as the total bytes that may be scanned,
+    /// processed, etc.
+    ///
+    /// E.g. we may read 1GB of data from a Parquet file but the Arrow data
+    /// the node produces may be 2GB; it's this 2GB that is tracked here.
+    ///
+    /// Currently this is accurately calculated for primitive types only.
+    /// For complex types (like Utf8, List, Struct, etc), this value may be
+    /// absent or inexact (e.g. estimated from the size of the data in the source Parquet files).
+    ///
+    /// This value is automatically scaled when operations like limits or
+    /// filters reduce the number of rows (see [`Statistics::with_fetch`]).
+    pub byte_size: Precision<usize>,
 }
 
 impl ColumnStatistics {
@@ -650,6 +920,7 @@ impl ColumnStatistics {
             min_value: Precision::Absent,
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Absent,
         }
     }
 
@@ -683,6 +954,13 @@ impl ColumnStatistics {
         self
     }
 
+    /// Set the scan byte size
+    /// This should initially be set to the total size of the column.
+    pub fn with_byte_size(mut self, byte_size: Precision<usize>) -> Self {
+        self.byte_size = byte_size;
+        self
+    }
+
     /// If the exactness of a [`ColumnStatistics`] instance is lost, this
     /// function relaxes the exactness of all information by converting them
     /// [`Precision::Inexact`].
@@ -692,6 +970,7 @@ impl ColumnStatistics {
         self.min_value = self.min_value.to_inexact();
         self.sum_value = self.sum_value.to_inexact();
         self.distinct_count = self.distinct_count.to_inexact();
+        self.byte_size = self.byte_size.to_inexact();
         self
     }
 }
@@ -781,11 +1060,21 @@ mod tests {
         let precision2 = Precision::Inexact(23);
         let precision3 = Precision::Exact(30);
         let absent_precision = Precision::Absent;
+        let precision_max_exact = Precision::Exact(usize::MAX);
+        let precision_max_inexact = Precision::Exact(usize::MAX);
 
         assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
         assert_eq!(precision1.add(&precision3), Precision::Exact(72));
         assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
         assert_eq!(precision1.add(&absent_precision), Precision::Absent);
+        assert_eq!(
+            precision_max_exact.add(&precision1),
+            Precision::Inexact(usize::MAX)
+        );
+        assert_eq!(
+            precision_max_inexact.add(&precision1),
+            Precision::Inexact(usize::MAX)
+        );
     }
 
     #[test]
@@ -817,6 +1106,8 @@ mod tests {
 
         assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
         assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
+        assert_eq!(precision2.sub(&precision1), Precision::Inexact(0));
+        assert_eq!(precision3.sub(&precision1), Precision::Inexact(0));
         assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
     }
 
@@ -845,12 +1136,22 @@ mod tests {
         let precision1 = Precision::Exact(6);
         let precision2 = Precision::Inexact(3);
         let precision3 = Precision::Exact(5);
+        let precision_max_exact = Precision::Exact(usize::MAX);
+        let precision_max_inexact = Precision::Exact(usize::MAX);
         let absent_precision = Precision::Absent;
 
         assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
         assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
         assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
         assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
+        assert_eq!(
+            precision_max_exact.multiply(&precision1),
+            Precision::Inexact(usize::MAX)
+        );
+        assert_eq!(
+            precision_max_inexact.multiply(&precision1),
+            Precision::Inexact(usize::MAX)
+        );
     }
 
     #[test]
@@ -896,9 +1197,11 @@ mod tests {
             Precision::Exact(ScalarValue::Int64(None)),
         );
         // Overflow returns error
-        assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
-            .cast_to(&DataType::Int8)
-            .is_err());
+        assert!(
+            Precision::Exact(ScalarValue::Int32(Some(256)))
+                .cast_to(&DataType::Int8)
+                .is_err()
+        );
     }
 
     #[test]
@@ -911,15 +1214,13 @@ mod tests {
         // Precision<ScalarValue> is not copy (requires .clone())
         let precision: Precision<ScalarValue> =
             Precision::Exact(ScalarValue::Int64(Some(42)));
-        // Clippy would complain about this if it were Copy
-        #[allow(clippy::redundant_clone)]
         let p2 = precision.clone();
         assert_eq!(precision, p2);
     }
 
     #[test]
     fn test_project_none() {
-        let projection = None;
+        let projection: Option<Vec<usize>> = None;
         let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
         assert_eq!(stats, make_stats(vec![10, 20, 30]));
     }
@@ -961,11 +1262,12 @@ mod tests {
             min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
             sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
             distinct_count: Precision::Exact(100),
+            byte_size: Precision::Exact(800),
         }
     }
 
     #[test]
-    fn test_try_merge_basic() {
+    fn test_try_merge() {
         // Create a schema with two columns
         let schema = Arc::new(Schema::new(vec![
             Field::new("col1", DataType::Int32, false),
@@ -983,6 +1285,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(2),
@@ -990,6 +1293,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
                 },
             ],
         };
@@ -1004,6 +1308,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(3),
@@ -1011,6 +1316,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
                     sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
                 },
             ],
         };
@@ -1074,6 +1380,7 @@ mod tests {
                 min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
                 sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(40),
             }],
         };
 
@@ -1086,6 +1393,7 @@ mod tests {
                 min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
                 sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Inexact(60),
             }],
         };
 
@@ -1106,7 +1414,7 @@ mod tests {
             col_stats.min_value,
             Precision::Inexact(ScalarValue::Int32(Some(-10)))
         );
-        assert!(matches!(col_stats.sum_value, Precision::Absent));
+        assert_eq!(col_stats.sum_value, Precision::Absent);
     }
 
     #[test]
@@ -1150,6 +1458,1059 @@ mod tests {
         let items = vec![stats1, stats2];
 
         let e = Statistics::try_merge_iter(&items, &schema).unwrap_err();
-        assert_contains!(e.to_string(), "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1");
+        assert_contains!(
+            e.to_string(),
+            "Error during planning: Cannot merge statistics with different number of columns: 0 vs 1"
+        );
+    }
+
+    #[test]
+    fn test_try_merge_distinct_count_absent() {
+        // Create statistics with known distinct counts
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .with_total_byte_size(Precision::Exact(100))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_null_count(Precision::Exact(0))
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
+                    .with_distinct_count(Precision::Exact(5)),
+            );
+
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(15))
+            .with_total_byte_size(Precision::Exact(150))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_null_count(Precision::Exact(0))
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(20))))
+                    .with_distinct_count(Precision::Exact(7)),
+            );
+
+        // Merge statistics
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged_stats =
+            Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+
+        // Verify the results
+        assert_eq!(merged_stats.num_rows, Precision::Exact(25));
+        assert_eq!(merged_stats.total_byte_size, Precision::Exact(250));
+
+        let col_stats = &merged_stats.column_statistics[0];
+        assert_eq!(col_stats.null_count, Precision::Exact(0));
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Exact(ScalarValue::Int32(Some(1)))
+        );
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Exact(ScalarValue::Int32(Some(20)))
+        );
+        // Overlap-based NDV: ranges [1,10] and [5,20], overlap [5,10]
+        // range_left=9, range_right=15, overlap=5
+        // overlap_left=5*(5/9)=2.78, overlap_right=7*(5/15)=2.33
+        // result = max(2.78, 2.33) + (5-2.78) + (7-2.33) = 9.67 -> 10
+        assert_eq!(col_stats.distinct_count, Precision::Inexact(10));
+    }
+
+    #[test]
+    fn test_try_merge_ndv_disjoint_ranges() {
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
+                    .with_distinct_count(Precision::Exact(5)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(20))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(30))))
+                    .with_distinct_count(Precision::Exact(8)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        // No overlap -> sum of NDVs
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(13)
+        );
+    }
+
+    #[test]
+    fn test_try_merge_ndv_identical_ranges() {
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(100))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
+                    .with_distinct_count(Precision::Exact(50)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(100))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
+                    .with_distinct_count(Precision::Exact(30)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        // Full overlap -> max(50, 30) = 50
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(50)
+        );
+    }
+
+    #[test]
+    fn test_try_merge_ndv_partial_overlap() {
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(100))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
+                    .with_distinct_count(Precision::Exact(80)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(100))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(50))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(150))))
+                    .with_distinct_count(Precision::Exact(60)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        // overlap=[50,100], range_left=100, range_right=100, overlap_range=50
+        // overlap_left=80*(50/100)=40, overlap_right=60*(50/100)=30
+        // result = max(40,30) + (80-40) + (60-30) = 40 + 40 + 30 = 110
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(110)
+        );
+    }
+
+    #[test]
+    fn test_try_merge_ndv_missing_min_max() {
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(5)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown().with_distinct_count(Precision::Exact(8)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        // No min/max -> fallback to max(5, 8)
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(8)
+        );
+    }
+
+    #[test]
+    fn test_try_merge_ndv_non_numeric_types() {
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
+                        "aaa".to_string(),
+                    ))))
+                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
+                        "zzz".to_string(),
+                    ))))
+                    .with_distinct_count(Precision::Exact(5)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Utf8(Some(
+                        "bbb".to_string(),
+                    ))))
+                    .with_max_value(Precision::Exact(ScalarValue::Utf8(Some(
+                        "yyy".to_string(),
+                    ))))
+                    .with_distinct_count(Precision::Exact(8)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        // distance() unsupported for strings -> fallback to max
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(8)
+        );
+    }
+
+    #[test]
+    fn test_try_merge_ndv_constant_columns() {
+        // Same constant: [5,5]+[5,5] -> max
+        let stats1 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_distinct_count(Precision::Exact(1)),
+            );
+        let stats2 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_distinct_count(Precision::Exact(1)),
+            );
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let merged = Statistics::try_merge_iter([&stats1, &stats2], &schema).unwrap();
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(1)
+        );
+
+        // Different constants: [5,5]+[10,10] -> sum
+        let stats3 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(5))))
+                    .with_distinct_count(Precision::Exact(1)),
+            );
+        let stats4 = Statistics::default()
+            .with_num_rows(Precision::Exact(10))
+            .add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(10))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(10))))
+                    .with_distinct_count(Precision::Exact(1)),
+            );
+
+        let merged = Statistics::try_merge_iter([&stats3, &stats4], &schema).unwrap();
+        assert_eq!(
+            merged.column_statistics[0].distinct_count,
+            Precision::Inexact(2)
+        );
+    }
+
+    #[test]
+    fn test_with_fetch_basic_preservation() {
+        // Test that column statistics and byte size are preserved (as inexact) when applying fetch
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(0))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(5050))),
+                    distinct_count: Precision::Exact(50),
+                    byte_size: Precision::Exact(4000),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(20),
+                    max_value: Precision::Exact(ScalarValue::Int64(Some(200))),
+                    min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(10100))),
+                    distinct_count: Precision::Exact(75),
+                    byte_size: Precision::Exact(8000),
+                },
+            ],
+        };
+
+        // Apply fetch of 100 rows (10% of original)
+        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
+
+        // Check num_rows
+        assert_eq!(result.num_rows, Precision::Exact(100));
+
+        // Check total_byte_size is computed as sum of scaled column byte_size values
+        // Column 1: 4000 * 0.1 = 400, Column 2: 8000 * 0.1 = 800, Sum = 1200
+        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
+
+        // Check column statistics are preserved but marked as inexact
+        assert_eq!(result.column_statistics.len(), 2);
+
+        // First column
+        assert_eq!(
+            result.column_statistics[0].null_count,
+            Precision::Inexact(10)
+        );
+        assert_eq!(
+            result.column_statistics[0].max_value,
+            Precision::Inexact(ScalarValue::Int32(Some(100)))
+        );
+        assert_eq!(
+            result.column_statistics[0].min_value,
+            Precision::Inexact(ScalarValue::Int32(Some(0)))
+        );
+        assert_eq!(
+            result.column_statistics[0].sum_value,
+            Precision::Inexact(ScalarValue::Int32(Some(5050)))
+        );
+        assert_eq!(
+            result.column_statistics[0].distinct_count,
+            Precision::Inexact(50)
+        );
+
+        // Second column
+        assert_eq!(
+            result.column_statistics[1].null_count,
+            Precision::Inexact(20)
+        );
+        assert_eq!(
+            result.column_statistics[1].max_value,
+            Precision::Inexact(ScalarValue::Int64(Some(200)))
+        );
+        assert_eq!(
+            result.column_statistics[1].min_value,
+            Precision::Inexact(ScalarValue::Int64(Some(10)))
+        );
+        assert_eq!(
+            result.column_statistics[1].sum_value,
+            Precision::Inexact(ScalarValue::Int64(Some(10100)))
+        );
+        assert_eq!(
+            result.column_statistics[1].distinct_count,
+            Precision::Inexact(75)
+        );
+    }
+
+    #[test]
+    fn test_with_fetch_inexact_input() {
+        // Test that inexact input statistics remain inexact
+        let original_stats = Statistics {
+            num_rows: Precision::Inexact(1000),
+            total_byte_size: Precision::Inexact(8000),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Inexact(10),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(0))),
+                sum_value: Precision::Inexact(ScalarValue::Int32(Some(5050))),
+                distinct_count: Precision::Inexact(50),
+                byte_size: Precision::Inexact(4000),
+            }],
+        };
+
+        let result = original_stats.clone().with_fetch(Some(500), 0, 1).unwrap();
+
+        // Check num_rows is inexact
+        assert_eq!(result.num_rows, Precision::Inexact(500));
+
+        // Check total_byte_size is computed as sum of scaled column byte_size values
+        // Column 1: 4000 * 0.5 = 2000, Sum = 2000
+        assert_eq!(result.total_byte_size, Precision::Inexact(2000));
+
+        // Column stats remain inexact
+        assert_eq!(
+            result.column_statistics[0].null_count,
+            Precision::Inexact(10)
+        );
+    }
+
+    #[test]
+    fn test_with_fetch_skip_all_rows() {
+        // Test when skip >= num_rows (all rows are skipped)
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Exact(800),
+            column_statistics: vec![col_stats_i64(10)],
+        };
+
+        let result = original_stats.clone().with_fetch(Some(50), 100, 1).unwrap();
+
+        assert_eq!(result.num_rows, Precision::Exact(0));
+        // When ratio is 0/100 = 0, byte size should be 0
+        assert_eq!(result.total_byte_size, Precision::Inexact(0));
+    }
+
+    #[test]
+    fn test_with_fetch_no_limit() {
+        // Test when fetch is None and skip is 0 (no limit applied)
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Exact(800),
+            column_statistics: vec![col_stats_i64(10)],
+        };
+
+        let result = original_stats.clone().with_fetch(None, 0, 1).unwrap();
+
+        // Stats should be unchanged when no fetch and no skip
+        assert_eq!(result.num_rows, Precision::Exact(100));
+        assert_eq!(result.total_byte_size, Precision::Exact(800));
+    }
+
+    #[test]
+    fn test_with_fetch_with_skip() {
+        // Test with both skip and fetch
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![col_stats_i64(10)],
+        };
+
+        // Skip 200, fetch 300, so we get rows 200-500
+        let result = original_stats
+            .clone()
+            .with_fetch(Some(300), 200, 1)
+            .unwrap();
+
+        assert_eq!(result.num_rows, Precision::Exact(300));
+        // Column 1: byte_size 800 * (300/500) = 240, Sum = 240
+        assert_eq!(result.total_byte_size, Precision::Inexact(240));
+    }
+
+    #[test]
+    fn test_with_fetch_multi_partition() {
+        // Test with multiple partitions
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000), // per partition
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![col_stats_i64(10)],
+        };
+
+        // Fetch 100 per partition, 4 partitions = 400 total
+        let result = original_stats.clone().with_fetch(Some(100), 0, 4).unwrap();
+
+        assert_eq!(result.num_rows, Precision::Exact(400));
+        // Column 1: byte_size 800 * 0.4 = 320, Sum = 320
+        assert_eq!(result.total_byte_size, Precision::Inexact(320));
+    }
+
+    #[test]
+    fn test_with_fetch_absent_stats() {
+        // Test with absent statistics
+        let original_stats = Statistics {
+            num_rows: Precision::Absent,
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Absent,
+                max_value: Precision::Absent,
+                min_value: Precision::Absent,
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Absent,
+            }],
+        };
+
+        let result = original_stats.clone().with_fetch(Some(100), 0, 1).unwrap();
+
+        // With absent input stats, output should be inexact estimate
+        assert_eq!(result.num_rows, Precision::Inexact(100));
+        assert_eq!(result.total_byte_size, Precision::Absent);
+        // Column stats should remain absent
+        assert_eq!(result.column_statistics[0].null_count, Precision::Absent);
+    }
+
+    #[test]
+    fn test_with_fetch_fetch_exceeds_rows() {
+        // Test when fetch is larger than available rows after skip
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Exact(800),
+            column_statistics: vec![col_stats_i64(10)],
+        };
+
+        // Skip 50, fetch 100, but only 50 rows remain
+        let result = original_stats.clone().with_fetch(Some(100), 50, 1).unwrap();
+
+        assert_eq!(result.num_rows, Precision::Exact(50));
+        // 50/100 = 0.5, so 800 * 0.5 = 400
+        assert_eq!(result.total_byte_size, Precision::Inexact(400));
+    }
+
+    #[test]
+    fn test_with_fetch_preserves_all_column_stats() {
+        // Comprehensive test that all column statistic fields are preserved
+        let original_col_stats = ColumnStatistics {
+            null_count: Precision::Exact(42),
+            max_value: Precision::Exact(ScalarValue::Int32(Some(999))),
+            min_value: Precision::Exact(ScalarValue::Int32(Some(-100))),
+            sum_value: Precision::Exact(ScalarValue::Int32(Some(123456))),
+            distinct_count: Precision::Exact(789),
+            byte_size: Precision::Exact(4000),
+        };
+
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![original_col_stats.clone()],
+        };
+
+        let result = original_stats.with_fetch(Some(250), 0, 1).unwrap();
+
+        let result_col_stats = &result.column_statistics[0];
+
+        // All values should be preserved but marked as inexact
+        assert_eq!(result_col_stats.null_count, Precision::Inexact(42));
+        assert_eq!(
+            result_col_stats.max_value,
+            Precision::Inexact(ScalarValue::Int32(Some(999)))
+        );
+        assert_eq!(
+            result_col_stats.min_value,
+            Precision::Inexact(ScalarValue::Int32(Some(-100)))
+        );
+        assert_eq!(
+            result_col_stats.sum_value,
+            Precision::Inexact(ScalarValue::Int32(Some(123456)))
+        );
+        assert_eq!(result_col_stats.distinct_count, Precision::Inexact(789));
+    }
+
+    #[test]
+    fn test_byte_size_to_inexact() {
+        let col_stats = ColumnStatistics {
+            null_count: Precision::Exact(10),
+            max_value: Precision::Absent,
+            min_value: Precision::Absent,
+            sum_value: Precision::Absent,
+            distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(5000),
+        };
+
+        let inexact = col_stats.to_inexact();
+        assert_eq!(inexact.byte_size, Precision::Inexact(5000));
+    }
+
+    #[test]
+    fn test_with_byte_size_builder() {
+        let col_stats =
+            ColumnStatistics::new_unknown().with_byte_size(Precision::Exact(8192));
+        assert_eq!(col_stats.byte_size, Precision::Exact(8192));
+    }
+
+    #[test]
+    fn test_with_fetch_scales_byte_size() {
+        // Test that byte_size is scaled by the row ratio in with_fetch
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(4000),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(20),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8000),
+                },
+            ],
+        };
+
+        // Apply fetch of 100 rows (10% of original)
+        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
+
+        // byte_size should be scaled: 4000 * 0.1 = 400, 8000 * 0.1 = 800
+        assert_eq!(
+            result.column_statistics[0].byte_size,
+            Precision::Inexact(400)
+        );
+        assert_eq!(
+            result.column_statistics[1].byte_size,
+            Precision::Inexact(800)
+        );
+
+        // total_byte_size should be computed as sum of byte_size values: 400 + 800 = 1200
+        assert_eq!(result.total_byte_size, Precision::Inexact(1200));
+    }
+
+    #[test]
+    fn test_with_fetch_total_byte_size_fallback() {
+        // Test that total_byte_size falls back to scaling when not all columns have byte_size
+        let original_stats = Statistics {
+            num_rows: Precision::Exact(1000),
+            total_byte_size: Precision::Exact(8000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(4000),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(20),
+                    max_value: Precision::Absent,
+                    min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent, // One column has no byte_size
+                },
+            ],
+        };
+
+        // Apply fetch of 100 rows (10% of original)
+        let result = original_stats.with_fetch(Some(100), 0, 1).unwrap();
+
+        // total_byte_size should fall back to scaling: 8000 * 0.1 = 800
+        assert_eq!(result.total_byte_size, Precision::Inexact(800));
+    }
+
+    #[test]
+    fn test_try_merge_iter_basic() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("col1", DataType::Int32, false),
+            Field::new("col2", DataType::Int32, false),
+        ]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(1),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(2),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(40),
+                },
+            ],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(15),
+            total_byte_size: Precision::Exact(150),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(2),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(3),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(60),
+                },
+            ],
+        };
+
+        let items = vec![&stats1, &stats2];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Exact(25));
+        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250));
+
+        let col1_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col1_stats.null_count, Precision::Exact(3));
+        assert_eq!(
+            col1_stats.max_value,
+            Precision::Exact(ScalarValue::Int32(Some(120)))
+        );
+        assert_eq!(
+            col1_stats.min_value,
+            Precision::Exact(ScalarValue::Int32(Some(-10)))
+        );
+        assert_eq!(
+            col1_stats.sum_value,
+            Precision::Exact(ScalarValue::Int32(Some(1100)))
+        );
+
+        let col2_stats = &summary_stats.column_statistics[1];
+        assert_eq!(col2_stats.null_count, Precision::Exact(5));
+        assert_eq!(
+            col2_stats.max_value,
+            Precision::Exact(ScalarValue::Int32(Some(200)))
+        );
+        assert_eq!(
+            col2_stats.min_value,
+            Precision::Exact(ScalarValue::Int32(Some(5)))
+        );
+        assert_eq!(
+            col2_stats.sum_value,
+            Precision::Exact(ScalarValue::Int32(Some(2200)))
+        );
+    }
+
+    #[test]
+    fn test_try_merge_iter_mixed_precision() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Inexact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(1),
+                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(40),
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Inexact(15),
+            total_byte_size: Precision::Exact(150),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Inexact(2),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
+                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Inexact(60),
+            }],
+        };
+
+        let items = vec![&stats1, &stats2];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
+        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col_stats.null_count, Precision::Inexact(3));
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Inexact(ScalarValue::Int32(Some(120)))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Inexact(ScalarValue::Int32(Some(-10)))
+        );
+        // sum_value becomes Absent because stats2 has Absent sum
+        assert_eq!(col_stats.sum_value, Precision::Absent);
+    }
+
+    #[test]
+    fn test_try_merge_iter_empty() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        let items: Vec<&Statistics> = vec![];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Absent);
+        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
+        assert_eq!(summary_stats.column_statistics.len(), 1);
+        assert_eq!(
+            summary_stats.column_statistics[0].null_count,
+            Precision::Absent
+        );
+    }
+
+    #[test]
+    fn test_try_merge_iter_single_item() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        let stats = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(1),
+                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
+                distinct_count: Precision::Exact(10),
+                byte_size: Precision::Exact(40),
+            }],
+        };
+
+        let items = vec![&stats];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats, stats);
+    }
+
+    #[test]
+    fn test_try_merge_iter_mismatched_columns() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        let stats1 = Statistics::default();
+        let stats2 =
+            Statistics::default().add_column_statistics(ColumnStatistics::new_unknown());
+
+        let items = vec![&stats1, &stats2];
+        let e = Statistics::try_merge_iter(items, &schema).unwrap_err();
+        assert_contains!(
+            e.to_string(),
+            "Cannot merge statistics with different number of columns: 0 vs 1"
+        );
+    }
+
+    #[test]
+    fn test_try_merge_iter_three_items() {
+        // Verify that merging three items works correctly
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int64,
+            false,
+        )]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(1),
+                max_value: Precision::Exact(ScalarValue::Int64(Some(100))),
+                min_value: Precision::Exact(ScalarValue::Int64(Some(10))),
+                sum_value: Precision::Exact(ScalarValue::Int64(Some(500))),
+                distinct_count: Precision::Exact(8),
+                byte_size: Precision::Exact(80),
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(20),
+            total_byte_size: Precision::Exact(200),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(2),
+                max_value: Precision::Exact(ScalarValue::Int64(Some(200))),
+                min_value: Precision::Exact(ScalarValue::Int64(Some(5))),
+                sum_value: Precision::Exact(ScalarValue::Int64(Some(1000))),
+                distinct_count: Precision::Exact(15),
+                byte_size: Precision::Exact(160),
+            }],
+        };
+
+        let stats3 = Statistics {
+            num_rows: Precision::Exact(30),
+            total_byte_size: Precision::Exact(300),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(3),
+                max_value: Precision::Exact(ScalarValue::Int64(Some(150))),
+                min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+                sum_value: Precision::Exact(ScalarValue::Int64(Some(2000))),
+                distinct_count: Precision::Exact(25),
+                byte_size: Precision::Exact(240),
+            }],
+        };
+
+        let items = vec![&stats1, &stats2, &stats3];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Exact(60));
+        assert_eq!(summary_stats.total_byte_size, Precision::Exact(600));
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col_stats.null_count, Precision::Exact(6));
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Exact(ScalarValue::Int64(Some(200)))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Exact(ScalarValue::Int64(Some(1)))
+        );
+        assert_eq!(
+            col_stats.sum_value,
+            Precision::Exact(ScalarValue::Int64(Some(3500)))
+        );
+        assert_eq!(col_stats.byte_size, Precision::Exact(480));
+        // Overlap-based NDV merge (pairwise left-to-right):
+        // stats1+stats2: [10,100]+[5,200] -> NDV=16, then +stats3: [5,200]+[1,150] -> NDV=29
+        assert_eq!(col_stats.distinct_count, Precision::Inexact(29));
+    }
+
+    #[test]
+    fn test_try_merge_iter_float_types() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Float64,
+            false,
+        )]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(80),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Float64(Some(99.9))),
+                min_value: Precision::Exact(ScalarValue::Float64(Some(1.1))),
+                sum_value: Precision::Exact(ScalarValue::Float64(Some(500.5))),
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(80),
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(80),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Float64(Some(200.0))),
+                min_value: Precision::Exact(ScalarValue::Float64(Some(0.5))),
+                sum_value: Precision::Exact(ScalarValue::Float64(Some(1000.0))),
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(80),
+            }],
+        };
+
+        let items = vec![&stats1, &stats2];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Exact(ScalarValue::Float64(Some(200.0)))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Exact(ScalarValue::Float64(Some(0.5)))
+        );
+        assert_eq!(
+            col_stats.sum_value,
+            Precision::Exact(ScalarValue::Float64(Some(1500.5)))
+        );
+    }
+
+    #[test]
+    fn test_try_merge_iter_string_types() {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("col1", DataType::Utf8, false)]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Utf8(Some("dog".to_string()))),
+                min_value: Precision::Exact(ScalarValue::Utf8(Some("ant".to_string()))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(100),
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string()))),
+                min_value: Precision::Exact(ScalarValue::Utf8(Some("bat".to_string()))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Exact(100),
+            }],
+        };
+
+        let items = vec![&stats1, &stats2];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Exact(ScalarValue::Utf8(Some("zebra".to_string())))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Exact(ScalarValue::Utf8(Some("ant".to_string())))
+        );
+        assert_eq!(col_stats.sum_value, Precision::Absent);
+    }
+
+    #[test]
+    fn test_try_merge_iter_all_inexact() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        let stats1 = Statistics {
+            num_rows: Precision::Inexact(10),
+            total_byte_size: Precision::Inexact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Inexact(1),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                sum_value: Precision::Inexact(ScalarValue::Int32(Some(500))),
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Inexact(40),
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Inexact(20),
+            total_byte_size: Precision::Inexact(200),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Inexact(2),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(200))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(-5))),
+                sum_value: Precision::Inexact(ScalarValue::Int32(Some(1000))),
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Inexact(60),
+            }],
+        };
+
+        let items = vec![&stats1, &stats2];
+        let summary_stats = Statistics::try_merge_iter(items, &schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Inexact(30));
+        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(300));
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col_stats.null_count, Precision::Inexact(3));
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Inexact(ScalarValue::Int32(Some(200)))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Inexact(ScalarValue::Int32(Some(-5)))
+        );
+        assert_eq!(
+            col_stats.sum_value,
+            Precision::Inexact(ScalarValue::Int32(Some(1500)))
+        );
     }
 }
diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs
index 9b6f9696c00bb..3163a8b16c8dc 100644
--- a/datafusion/common/src/table_reference.rs
+++ b/datafusion/common/src/table_reference.rs
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::utils::{parse_identifiers_normalized, quote_identifier};
+use crate::utils::parse_identifiers_normalized;
+use crate::utils::quote_identifier;
 use std::sync::Arc;
 
 /// A fully resolved path to a table of the form "catalog.schema.table"
@@ -68,8 +69,11 @@ impl std::fmt::Display for ResolvedTableReference {
 ///
 /// // Get a table reference to 'myschema.mytable' (note the capitalization)
 /// let table_reference = TableReference::from("MySchema.MyTable");
-/// assert_eq!(table_reference, TableReference::partial("myschema", "mytable"));
-///```
+/// assert_eq!(
+///     table_reference,
+///     TableReference::partial("myschema", "mytable")
+/// );
+/// ```
 #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub enum TableReference {
     /// An unqualified table reference, e.g. "table"
@@ -246,7 +250,10 @@ impl TableReference {
     /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable");
     ///
     /// let table_reference = TableReference::partial("MySchema", "MyTable");
-    /// assert_eq!(table_reference.to_quoted_string(), r#""MySchema"."MyTable""#);
+    /// assert_eq!(
+    ///     table_reference.to_quoted_string(),
+    ///     r#""MySchema"."MyTable""#
+    /// );
     /// ```
     pub fn to_quoted_string(&self) -> String {
         match self {
@@ -268,24 +275,41 @@ impl TableReference {
     }
 
     /// Forms a [`TableReference`] by parsing `s` as a multipart SQL
-    /// identifier. See docs on [`TableReference`] for more details.
+    /// identifier, normalizing `s` to lowercase.
+    /// See docs on [`TableReference`] for more details.
     pub fn parse_str(s: &str) -> Self {
-        let mut parts = parse_identifiers_normalized(s, false);
+        Self::parse_str_normalized(s, false)
+    }
 
+    /// Forms a [`TableReference`] by parsing `s` as a multipart SQL
+    /// identifier, normalizing `s` to lowercase if `ignore_case` is `false`.
+    /// See docs on [`TableReference`] for more details.
+    pub fn parse_str_normalized(s: &str, ignore_case: bool) -> Self {
+        let table_parts = parse_identifiers_normalized(s, ignore_case);
+
+        Self::from_vec(table_parts).unwrap_or_else(|| Self::Bare { table: s.into() })
+    }
+
+    /// Consume a vector of identifier parts to compose a [`TableReference`]. The input vector
+    /// should contain 1 <= N <= 3 elements in the following sequence:
+    /// ```no_rust
+    /// [<catalog>, <schema>, table]
+    /// ```
+    fn from_vec(mut parts: Vec<String>) -> Option<Self> {
         match parts.len() {
-            1 => Self::Bare {
-                table: parts.remove(0).into(),
-            },
-            2 => Self::Partial {
-                schema: parts.remove(0).into(),
-                table: parts.remove(0).into(),
-            },
-            3 => Self::Full {
-                catalog: parts.remove(0).into(),
-                schema: parts.remove(0).into(),
-                table: parts.remove(0).into(),
-            },
-            _ => Self::Bare { table: s.into() },
+            1 => Some(Self::Bare {
+                table: parts.pop()?.into(),
+            }),
+            2 => Some(Self::Partial {
+                table: parts.pop()?.into(),
+                schema: parts.pop()?.into(),
+            }),
+            3 => Some(Self::Full {
+                table: parts.pop()?.into(),
+                schema: parts.pop()?.into(),
+                catalog: parts.pop()?.into(),
+            }),
+            _ => None,
         }
     }
 
@@ -367,26 +391,32 @@ mod tests {
         let actual = TableReference::from("TABLE");
         assert_eq!(expected, actual);
 
-        // if fail to parse, take entire input string as identifier
-        let expected = TableReference::Bare {
-            table: "TABLE()".into(),
-        };
-        let actual = TableReference::from("TABLE()");
-        assert_eq!(expected, actual);
+        // Disable this test for non-sql features so that we don't need to reproduce
+        // things like table function upper case conventions, since those will not
+        // be used if SQL is not selected.
+        #[cfg(feature = "sql")]
+        {
+            // if fail to parse, take entire input string as identifier
+            let expected = TableReference::Bare {
+                table: "TABLE()".into(),
+            };
+            let actual = TableReference::from("TABLE()");
+            assert_eq!(expected, actual);
+        }
     }
 
     #[test]
     fn test_table_reference_to_vector() {
-        let table_reference = TableReference::parse_str("table");
+        let table_reference = TableReference::from("table");
         assert_eq!(vec!["table".to_string()], table_reference.to_vec());
 
-        let table_reference = TableReference::parse_str("schema.table");
+        let table_reference = TableReference::from("schema.table");
         assert_eq!(
             vec!["schema".to_string(), "table".to_string()],
             table_reference.to_vec()
         );
 
-        let table_reference = TableReference::parse_str("catalog.schema.table");
+        let table_reference = TableReference::from("catalog.schema.table");
         assert_eq!(
             vec![
                 "catalog".to_string(),
diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs
index 820a230bf6e17..f060704944233 100644
--- a/datafusion/common/src/test_util.rs
+++ b/datafusion/common/src/test_util.rs
@@ -55,7 +55,7 @@ pub fn format_batches(results: &[RecordBatch]) -> Result<impl Display, ArrowErro
 /// # use arrow::array::{ArrayRef, Int32Array};
 /// # use datafusion_common::assert_batches_eq;
 /// let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
-///  let batch = RecordBatch::try_from_iter([("column", col)]).unwrap();
+/// let batch = RecordBatch::try_from_iter([("column", col)]).unwrap();
 /// // Expected output is a vec of strings
 /// let expected = vec![
 ///     "+--------+",
@@ -158,7 +158,7 @@ macro_rules! assert_batches_sorted_eq {
 /// Is a macro so test error
 /// messages are on the same line as the failure;
 ///
-/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>)
 #[macro_export]
 macro_rules! assert_contains {
     ($ACTUAL: expr, $EXPECTED: expr) => {
@@ -181,7 +181,7 @@ macro_rules! assert_contains {
 /// Is a macro so test error
 /// messages are on the same line as the failure;
 ///
-/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>)
 #[macro_export]
 macro_rules! assert_not_contains {
     ($ACTUAL: expr, $UNEXPECTED: expr) => {
@@ -255,7 +255,14 @@ pub fn arrow_test_data() -> String {
 #[cfg(feature = "parquet")]
 pub fn parquet_test_data() -> String {
     match get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data") {
-        Ok(pb) => pb.display().to_string(),
+        Ok(pb) => {
+            let mut path = pb.display().to_string();
+            if cfg!(target_os = "windows") {
+                // Replace backslashes (Windows paths; avoids some test issues).
+                path = path.replace("\\", "/");
+            }
+            path
+        }
         Err(err) => panic!("failed to get parquet data dir: {err}"),
     }
 }
@@ -314,43 +321,43 @@ pub fn get_data_dir(
 #[macro_export]
 macro_rules! create_array {
     (Boolean, $values: expr) => {
-        std::sync::Arc::new(arrow::array::BooleanArray::from($values))
+        std::sync::Arc::new($crate::arrow::array::BooleanArray::from($values))
     };
     (Int8, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Int8Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Int8Array::from($values))
     };
     (Int16, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Int16Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Int16Array::from($values))
     };
     (Int32, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Int32Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Int32Array::from($values))
     };
     (Int64, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Int64Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Int64Array::from($values))
     };
     (UInt8, $values: expr) => {
-        std::sync::Arc::new(arrow::array::UInt8Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::UInt8Array::from($values))
     };
     (UInt16, $values: expr) => {
-        std::sync::Arc::new(arrow::array::UInt16Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::UInt16Array::from($values))
     };
     (UInt32, $values: expr) => {
-        std::sync::Arc::new(arrow::array::UInt32Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::UInt32Array::from($values))
     };
     (UInt64, $values: expr) => {
-        std::sync::Arc::new(arrow::array::UInt64Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::UInt64Array::from($values))
     };
     (Float16, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Float16Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Float16Array::from($values))
     };
     (Float32, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Float32Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Float32Array::from($values))
     };
     (Float64, $values: expr) => {
-        std::sync::Arc::new(arrow::array::Float64Array::from($values))
+        std::sync::Arc::new($crate::arrow::array::Float64Array::from($values))
     };
     (Utf8, $values: expr) => {
-        std::sync::Arc::new(arrow::array::StringArray::from($values))
+        std::sync::Arc::new($crate::arrow::array::StringArray::from($values))
     };
 }
 
@@ -359,7 +366,7 @@ macro_rules! create_array {
 ///
 /// Example:
 /// ```
-/// use datafusion_common::{record_batch, create_array};
+/// use datafusion_common::record_batch;
 /// let batch = record_batch!(
 ///     ("a", Int32, vec![1, 2, 3]),
 ///     ("b", Float64, vec![Some(4.0), None, Some(5.0)]),
@@ -370,13 +377,13 @@ macro_rules! create_array {
 macro_rules! record_batch {
     ($(($name: expr, $type: ident, $values: expr)),*) => {
         {
-            let schema = std::sync::Arc::new(arrow::datatypes::Schema::new(vec![
+            let schema = std::sync::Arc::new($crate::arrow::datatypes::Schema::new(vec![
                 $(
-                    arrow::datatypes::Field::new($name, arrow::datatypes::DataType::$type, true),
+                    $crate::arrow::datatypes::Field::new($name, $crate::arrow::datatypes::DataType::$type, true),
                 )*
             ]));
 
-            let batch = arrow::array::RecordBatch::try_new(
+            let batch = $crate::arrow::array::RecordBatch::try_new(
                 schema,
                 vec![$(
                     $crate::create_array!($type, $values),
@@ -728,32 +735,34 @@ mod tests {
         let non_existing = cwd.join("non-existing-dir").display().to_string();
         let non_existing_str = non_existing.as_str();
 
-        env::set_var(udf_env, non_existing_str);
-        let res = get_data_dir(udf_env, existing_str);
-        assert!(res.is_err());
+        unsafe {
+            env::set_var(udf_env, non_existing_str);
+            let res = get_data_dir(udf_env, existing_str);
+            assert!(res.is_err());
 
-        env::set_var(udf_env, "");
-        let res = get_data_dir(udf_env, existing_str);
-        assert!(res.is_ok());
-        assert_eq!(res.unwrap(), existing_pb);
+            env::set_var(udf_env, "");
+            let res = get_data_dir(udf_env, existing_str);
+            assert!(res.is_ok());
+            assert_eq!(res.unwrap(), existing_pb);
 
-        env::set_var(udf_env, " ");
-        let res = get_data_dir(udf_env, existing_str);
-        assert!(res.is_ok());
-        assert_eq!(res.unwrap(), existing_pb);
+            env::set_var(udf_env, " ");
+            let res = get_data_dir(udf_env, existing_str);
+            assert!(res.is_ok());
+            assert_eq!(res.unwrap(), existing_pb);
 
-        env::set_var(udf_env, existing_str);
-        let res = get_data_dir(udf_env, existing_str);
-        assert!(res.is_ok());
-        assert_eq!(res.unwrap(), existing_pb);
+            env::set_var(udf_env, existing_str);
+            let res = get_data_dir(udf_env, existing_str);
+            assert!(res.is_ok());
+            assert_eq!(res.unwrap(), existing_pb);
 
-        env::remove_var(udf_env);
-        let res = get_data_dir(udf_env, non_existing_str);
-        assert!(res.is_err());
+            env::remove_var(udf_env);
+            let res = get_data_dir(udf_env, non_existing_str);
+            assert!(res.is_err());
 
-        let res = get_data_dir(udf_env, existing_str);
-        assert!(res.is_ok());
-        assert_eq!(res.unwrap(), existing_pb);
+            let res = get_data_dir(udf_env, existing_str);
+            assert!(res.is_ok());
+            assert_eq!(res.unwrap(), existing_pb);
+        }
     }
 
     #[test]
diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs
index cf51dadf6b4ad..1e7c02e424256 100644
--- a/datafusion/common/src/tree_node.rs
+++ b/datafusion/common/src/tree_node.rs
@@ -638,12 +638,13 @@ impl TreeNodeRecursion {
 /// # fn make_new_expr(i: i64) -> i64 { 2 }
 /// let expr = orig_expr();
 /// let ret = Transformed::no(expr.clone())
-///   .transform_data(|expr| {
-///    // closure returns a result and potentially transforms the node
-///    // in this example, it does transform the node
-///    let new_expr = make_new_expr(expr);
-///    Ok(Transformed::yes(new_expr))
-///  }).unwrap();
+///     .transform_data(|expr| {
+///         // closure returns a result and potentially transforms the node
+///         // in this example, it does transform the node
+///         let new_expr = make_new_expr(expr);
+///         Ok(Transformed::yes(new_expr))
+///     })
+///     .unwrap();
 /// // transformed flag is the union of the original ans closure's  transformed flag
 /// assert!(ret.transformed);
 /// ```
@@ -680,6 +681,11 @@ impl<T> Transformed<T> {
         Self::new(data, true, TreeNodeRecursion::Continue)
     }
 
+    /// Wrapper for transformed data with [`TreeNodeRecursion::Stop`] statement.
+    pub fn complete(data: T) -> Self {
+        Self::new(data, true, TreeNodeRecursion::Stop)
+    }
+
     /// Wrapper for unchanged data with [`TreeNodeRecursion::Continue`] statement.
     pub fn no(data: T) -> Self {
         Self::new(data, false, TreeNodeRecursion::Continue)
@@ -950,12 +956,12 @@ impl<'a, T: 'a, C0: TreeNodeContainer<'a, T>, C1: TreeNodeContainer<'a, T>>
 }
 
 impl<
-        'a,
-        T: 'a,
-        C0: TreeNodeContainer<'a, T>,
-        C1: TreeNodeContainer<'a, T>,
-        C2: TreeNodeContainer<'a, T>,
-    > TreeNodeContainer<'a, T> for (C0, C1, C2)
+    'a,
+    T: 'a,
+    C0: TreeNodeContainer<'a, T>,
+    C1: TreeNodeContainer<'a, T>,
+    C2: TreeNodeContainer<'a, T>,
+> TreeNodeContainer<'a, T> for (C0, C1, C2)
 {
     fn apply_elements<F: FnMut(&'a T) -> Result<TreeNodeRecursion>>(
         &'a self,
@@ -985,6 +991,48 @@ impl<
     }
 }
 
+impl<
+    'a,
+    T: 'a,
+    C0: TreeNodeContainer<'a, T>,
+    C1: TreeNodeContainer<'a, T>,
+    C2: TreeNodeContainer<'a, T>,
+    C3: TreeNodeContainer<'a, T>,
+> TreeNodeContainer<'a, T> for (C0, C1, C2, C3)
+{
+    fn apply_elements<F: FnMut(&'a T) -> Result<TreeNodeRecursion>>(
+        &'a self,
+        mut f: F,
+    ) -> Result<TreeNodeRecursion> {
+        self.0
+            .apply_elements(&mut f)?
+            .visit_sibling(|| self.1.apply_elements(&mut f))?
+            .visit_sibling(|| self.2.apply_elements(&mut f))?
+            .visit_sibling(|| self.3.apply_elements(&mut f))
+    }
+
+    fn map_elements<F: FnMut(T) -> Result<Transformed<T>>>(
+        self,
+        mut f: F,
+    ) -> Result<Transformed<Self>> {
+        self.0
+            .map_elements(&mut f)?
+            .map_data(|new_c0| Ok((new_c0, self.1, self.2, self.3)))?
+            .transform_sibling(|(new_c0, c1, c2, c3)| {
+                c1.map_elements(&mut f)?
+                    .map_data(|new_c1| Ok((new_c0, new_c1, c2, c3)))
+            })?
+            .transform_sibling(|(new_c0, new_c1, c2, c3)| {
+                c2.map_elements(&mut f)?
+                    .map_data(|new_c2| Ok((new_c0, new_c1, new_c2, c3)))
+            })?
+            .transform_sibling(|(new_c0, new_c1, new_c2, c3)| {
+                c3.map_elements(&mut f)?
+                    .map_data(|new_c3| Ok((new_c0, new_c1, new_c2, new_c3)))
+            })
+    }
+}
+
 /// [`TreeNodeRefContainer`] contains references to elements that a function can be
 /// applied on. The elements of the container are siblings so the continuation rules are
 /// similar to [`TreeNodeRecursion::visit_sibling`].
@@ -1042,12 +1090,12 @@ impl<'a, T: 'a, C0: TreeNodeContainer<'a, T>, C1: TreeNodeContainer<'a, T>>
 }
 
 impl<
-        'a,
-        T: 'a,
-        C0: TreeNodeContainer<'a, T>,
-        C1: TreeNodeContainer<'a, T>,
-        C2: TreeNodeContainer<'a, T>,
-    > TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2)
+    'a,
+    T: 'a,
+    C0: TreeNodeContainer<'a, T>,
+    C1: TreeNodeContainer<'a, T>,
+    C2: TreeNodeContainer<'a, T>,
+> TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2)
 {
     fn apply_ref_elements<F: FnMut(&'a T) -> Result<TreeNodeRecursion>>(
         &self,
@@ -1060,6 +1108,27 @@ impl<
     }
 }
 
+impl<
+    'a,
+    T: 'a,
+    C0: TreeNodeContainer<'a, T>,
+    C1: TreeNodeContainer<'a, T>,
+    C2: TreeNodeContainer<'a, T>,
+    C3: TreeNodeContainer<'a, T>,
+> TreeNodeRefContainer<'a, T> for (&'a C0, &'a C1, &'a C2, &'a C3)
+{
+    fn apply_ref_elements<F: FnMut(&'a T) -> Result<TreeNodeRecursion>>(
+        &self,
+        mut f: F,
+    ) -> Result<TreeNodeRecursion> {
+        self.0
+            .apply_elements(&mut f)?
+            .visit_sibling(|| self.1.apply_elements(&mut f))?
+            .visit_sibling(|| self.2.apply_elements(&mut f))?
+            .visit_sibling(|| self.3.apply_elements(&mut f))
+    }
+}
+
 /// Transformation helper to process a sequence of iterable tree nodes that are siblings.
 pub trait TreeNodeIterator: Iterator {
     /// Apples `f` to each item in this iterator
@@ -1267,11 +1336,11 @@ pub(crate) mod tests {
     use std::collections::HashMap;
     use std::fmt::Display;
 
+    use crate::Result;
     use crate::tree_node::{
         Transformed, TreeNode, TreeNodeContainer, TreeNodeRecursion, TreeNodeRewriter,
         TreeNodeVisitor,
     };
-    use crate::Result;
 
     #[derive(Debug, Eq, Hash, PartialEq, Clone)]
     pub struct TestTreeNode<T> {
diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs
index ec69db7903779..dfd2cc4cf2d8b 100644
--- a/datafusion/common/src/types/builtin.rs
+++ b/datafusion/common/src/types/builtin.rs
@@ -15,9 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow::datatypes::IntervalUnit::*;
+use arrow::datatypes::TimeUnit::*;
+
 use crate::types::{LogicalTypeRef, NativeType};
 use std::sync::{Arc, LazyLock};
 
+/// Create a singleton and accompanying static variable for a [`LogicalTypeRef`]
+/// of a [`NativeType`].
+/// * `name`: name of the static variable, must be unique.
+/// * `getter`: name of the public function that will return the singleton instance
+///   of the static variable.
+/// * `ty`: the [`NativeType`].
 macro_rules! singleton {
     ($name:ident, $getter:ident, $ty:ident) => {
         static $name: LazyLock<LogicalTypeRef> =
@@ -31,6 +40,26 @@ macro_rules! singleton {
     };
 }
 
+/// Similar to [`singleton`], but for native types that have variants, such as
+/// `NativeType::Interval(MonthDayNano)`.
+/// * `name`: name of the static variable, must be unique.
+/// * `getter`: name of the public function that will return the singleton instance
+///   of the static variable.
+/// * `ty`: the [`NativeType`].
+/// * `variant`: specific variant of the `ty`.
+macro_rules! singleton_variant {
+    ($name:ident, $getter:ident, $ty:ident, $variant:ident) => {
+        static $name: LazyLock<LogicalTypeRef> =
+            LazyLock::new(|| Arc::new(NativeType::$ty($variant)));
+
+        #[doc = "Getter for singleton instance of a logical type representing"]
+        #[doc = concat!("[`NativeType::", stringify!($ty), "`] of unit [`", stringify!($variant),"`].`")]
+        pub fn $getter() -> LogicalTypeRef {
+            Arc::clone(&$name)
+        }
+    };
+}
+
 singleton!(LOGICAL_NULL, logical_null, Null);
 singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean);
 singleton!(LOGICAL_INT8, logical_int8, Int8);
@@ -47,3 +76,24 @@ singleton!(LOGICAL_FLOAT64, logical_float64, Float64);
 singleton!(LOGICAL_DATE, logical_date, Date);
 singleton!(LOGICAL_BINARY, logical_binary, Binary);
 singleton!(LOGICAL_STRING, logical_string, String);
+
+singleton_variant!(
+    LOGICAL_INTERVAL_MDN,
+    logical_interval_mdn,
+    Interval,
+    MonthDayNano
+);
+
+singleton_variant!(
+    LOGICAL_INTERVAL_YEAR_MONTH,
+    logical_interval_year_month,
+    Interval,
+    YearMonth
+);
+
+singleton_variant!(
+    LOGICAL_DURATION_MICROSECOND,
+    logical_duration_microsecond,
+    Duration,
+    Microsecond
+);
diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs
index 884ce20fd9e29..0f886252d6452 100644
--- a/datafusion/common/src/types/logical.rs
+++ b/datafusion/common/src/types/logical.rs
@@ -67,12 +67,12 @@ pub type LogicalTypeRef = Arc<dyn LogicalType>;
 ///         &NativeType::String
 ///     }
 ///
-///    fn signature(&self) -> TypeSignature<'_> {
-///        TypeSignature::Extension {
-///            name: "JSON",
-///            parameters: &[],
-///        }
-///    }
+///     fn signature(&self) -> TypeSignature<'_> {
+///         TypeSignature::Extension {
+///             name: "JSON",
+///             parameters: &[],
+///         }
+///     }
 /// }
 /// ```
 pub trait LogicalType: Sync + Send {
@@ -100,12 +100,16 @@ impl fmt::Debug for dyn LogicalType {
 
 impl std::fmt::Display for dyn LogicalType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{self:?}")
+        match self.signature() {
+            TypeSignature::Native(_) => write!(f, "{}", self.native()),
+            TypeSignature::Extension { name, .. } => write!(f, "{name}"),
+        }
     }
 }
 
 impl PartialEq for dyn LogicalType {
     fn eq(&self, other: &Self) -> bool {
+        // Logical types with identical signatures are considered equal.
         self.signature().eq(&other.signature())
     }
 }
@@ -120,15 +124,129 @@ impl PartialOrd for dyn LogicalType {
 
 impl Ord for dyn LogicalType {
     fn cmp(&self, other: &Self) -> Ordering {
-        self.signature()
-            .cmp(&other.signature())
-            .then(self.native().cmp(other.native()))
+        // Logical types with identical signatures are considered equal.
+        self.signature().cmp(&other.signature())
     }
 }
 
 impl Hash for dyn LogicalType {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        // Logical types with identical signatures are considered equal.
         self.signature().hash(state);
-        self.native().hash(state);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{
+        LogicalField, LogicalFields, logical_boolean, logical_date, logical_float32,
+        logical_float64, logical_int32, logical_int64, logical_null, logical_string,
+    };
+    use arrow::datatypes::{DataType, Field, Fields};
+    use insta::assert_snapshot;
+
+    #[test]
+    fn test_logical_type_display_simple() {
+        assert_snapshot!(logical_null(), @"Null");
+        assert_snapshot!(logical_boolean(), @"Boolean");
+        assert_snapshot!(logical_int32(), @"Int32");
+        assert_snapshot!(logical_int64(), @"Int64");
+        assert_snapshot!(logical_float32(), @"Float32");
+        assert_snapshot!(logical_float64(), @"Float64");
+        assert_snapshot!(logical_string(), @"String");
+        assert_snapshot!(logical_date(), @"Date");
+    }
+
+    #[test]
+    fn test_logical_type_display_list() {
+        let list_type: Arc<dyn LogicalType> = Arc::new(NativeType::List(Arc::new(
+            LogicalField::from(&Field::new("item", DataType::Int32, true)),
+        )));
+        assert_snapshot!(list_type, @"List(Int32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_struct() {
+        let struct_type: Arc<dyn LogicalType> = Arc::new(NativeType::Struct(
+            LogicalFields::from(&Fields::from(vec![
+                Field::new("x", DataType::Float64, false),
+                Field::new("y", DataType::Float64, true),
+            ])),
+        ));
+        assert_snapshot!(struct_type, @r#"Struct("x": non-null Float64, "y": Float64)"#);
+    }
+
+    #[test]
+    fn test_logical_type_display_fixed_size_list() {
+        let fsl_type: Arc<dyn LogicalType> = Arc::new(NativeType::FixedSizeList(
+            Arc::new(LogicalField::from(&Field::new(
+                "item",
+                DataType::Float32,
+                false,
+            ))),
+            3,
+        ));
+        assert_snapshot!(fsl_type, @"FixedSizeList(3 x non-null Float32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_map() {
+        let map_type: Arc<dyn LogicalType> = Arc::new(NativeType::Map(Arc::new(
+            LogicalField::from(&Field::new("entries", DataType::Utf8, false)),
+        )));
+        assert_snapshot!(map_type, @"Map(non-null String)");
+    }
+
+    #[test]
+    fn test_logical_type_display_union() {
+        use arrow::datatypes::UnionFields;
+
+        let union_fields = UnionFields::try_new(
+            vec![0, 1],
+            vec![
+                Field::new("int_val", DataType::Int32, false),
+                Field::new("str_val", DataType::Utf8, true),
+            ],
+        )
+        .unwrap();
+        let union_type: Arc<dyn LogicalType> = Arc::new(NativeType::Union(
+            crate::types::LogicalUnionFields::from(&union_fields),
+        ));
+        assert_snapshot!(union_type, @r#"Union(0: ("int_val": non-null Int32), 1: ("str_val": String))"#);
+    }
+
+    #[test]
+    fn test_logical_type_display_nullable_vs_non_nullable() {
+        let nullable_list: Arc<dyn LogicalType> = Arc::new(NativeType::List(Arc::new(
+            LogicalField::from(&Field::new("item", DataType::Int32, true)),
+        )));
+        let non_nullable_list: Arc<dyn LogicalType> =
+            Arc::new(NativeType::List(Arc::new(LogicalField::from(&Field::new(
+                "item",
+                DataType::Int32,
+                false,
+            )))));
+
+        assert_snapshot!(nullable_list, @"List(Int32)");
+        assert_snapshot!(non_nullable_list, @"List(non-null Int32)");
+    }
+
+    #[test]
+    fn test_logical_type_display_extension() {
+        struct JsonType;
+        impl LogicalType for JsonType {
+            fn native(&self) -> &NativeType {
+                &NativeType::String
+            }
+            fn signature(&self) -> TypeSignature<'_> {
+                TypeSignature::Extension {
+                    name: "JSON",
+                    parameters: &[],
+                }
+            }
+        }
+        let json: Arc<dyn LogicalType> = Arc::new(JsonType);
+        assert_snapshot!(json, @"JSON");
     }
 }
diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs
index 39c79b4b99742..a4202db986bbf 100644
--- a/datafusion/common/src/types/native.rs
+++ b/datafusion/common/src/types/native.rs
@@ -19,10 +19,11 @@ use super::{
     LogicalField, LogicalFieldRef, LogicalFields, LogicalType, LogicalUnionFields,
     TypeSignature,
 };
-use crate::error::{Result, _internal_err};
+use crate::error::{_internal_err, Result};
 use arrow::compute::can_cast_types;
 use arrow::datatypes::{
-    DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
+    DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType,
+    Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
 };
 use std::{fmt::Display, sync::Arc};
 
@@ -183,9 +184,82 @@ pub enum NativeType {
     Map(LogicalFieldRef),
 }
 
+/// Format a [`LogicalField`] for display, matching [`arrow::datatypes::DataType`]'s
+/// Display convention of showing a `"non-null "` prefix for non-nullable fields.
+fn format_logical_field(
+    f: &mut std::fmt::Formatter<'_>,
+    field: &LogicalField,
+) -> std::fmt::Result {
+    let non_null = if field.nullable { "" } else { "non-null " };
+    write!(f, "{:?}: {non_null}{}", field.name, field.logical_type)
+}
+
 impl Display for NativeType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "NativeType::{self:?}")
+        // Match the format used by arrow::datatypes::DataType's Display impl
+        match self {
+            Self::Null => write!(f, "Null"),
+            Self::Boolean => write!(f, "Boolean"),
+            Self::Int8 => write!(f, "Int8"),
+            Self::Int16 => write!(f, "Int16"),
+            Self::Int32 => write!(f, "Int32"),
+            Self::Int64 => write!(f, "Int64"),
+            Self::UInt8 => write!(f, "UInt8"),
+            Self::UInt16 => write!(f, "UInt16"),
+            Self::UInt32 => write!(f, "UInt32"),
+            Self::UInt64 => write!(f, "UInt64"),
+            Self::Float16 => write!(f, "Float16"),
+            Self::Float32 => write!(f, "Float32"),
+            Self::Float64 => write!(f, "Float64"),
+            Self::Timestamp(unit, Some(tz)) => write!(f, "Timestamp({unit}, {tz:?})"),
+            Self::Timestamp(unit, None) => write!(f, "Timestamp({unit})"),
+            Self::Date => write!(f, "Date"),
+            Self::Time(unit) => write!(f, "Time({unit})"),
+            Self::Duration(unit) => write!(f, "Duration({unit})"),
+            Self::Interval(unit) => write!(f, "Interval({unit:?})"),
+            Self::Binary => write!(f, "Binary"),
+            Self::FixedSizeBinary(size) => write!(f, "FixedSizeBinary({size})"),
+            Self::String => write!(f, "String"),
+            Self::List(field) => {
+                let non_null = if field.nullable { "" } else { "non-null " };
+                write!(f, "List({non_null}{})", field.logical_type)
+            }
+            Self::FixedSizeList(field, size) => {
+                let non_null = if field.nullable { "" } else { "non-null " };
+                write!(
+                    f,
+                    "FixedSizeList({size} x {non_null}{})",
+                    field.logical_type
+                )
+            }
+            Self::Struct(fields) => {
+                write!(f, "Struct(")?;
+                for (i, field) in fields.iter().enumerate() {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    format_logical_field(f, field)?;
+                }
+                write!(f, ")")
+            }
+            Self::Union(fields) => {
+                write!(f, "Union(")?;
+                for (i, (type_id, field)) in fields.iter().enumerate() {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    write!(f, "{type_id}: (")?;
+                    format_logical_field(f, field)?;
+                    write!(f, ")")?;
+                }
+                write!(f, ")")
+            }
+            Self::Decimal(precision, scale) => write!(f, "Decimal({precision}, {scale})"),
+            Self::Map(field) => {
+                let non_null = if field.nullable { "" } else { "non-null " };
+                write!(f, "Map({non_null}{})", field.logical_type)
+            }
+        }
     }
 }
 
@@ -228,13 +302,19 @@ impl LogicalType for NativeType {
             (Self::Float16, _) => Float16,
             (Self::Float32, _) => Float32,
             (Self::Float64, _) => Float64,
-            (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s),
+            (Self::Decimal(p, s), _) if *p <= DECIMAL32_MAX_PRECISION => {
+                Decimal32(*p, *s)
+            }
+            (Self::Decimal(p, s), _) if *p <= DECIMAL64_MAX_PRECISION => {
+                Decimal64(*p, *s)
+            }
+            (Self::Decimal(p, s), _) if *p <= DECIMAL128_MAX_PRECISION => {
+                Decimal128(*p, *s)
+            }
             (Self::Decimal(p, s), _) => Decimal256(*p, *s),
             (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
             // If given type is Date, return the same type
-            (Self::Date, origin) if matches!(origin, Date32 | Date64) => {
-                origin.to_owned()
-            }
+            (Self::Date, Date32 | Date64) => origin.to_owned(),
             (Self::Date, _) => Date32,
             (Self::Time(tu), _) => match tu {
                 TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
@@ -244,6 +324,8 @@ impl LogicalType for NativeType {
             (Self::Interval(iu), _) => Interval(*iu),
             (Self::Binary, LargeUtf8) => LargeBinary,
             (Self::Binary, Utf8View) => BinaryView,
+            // We don't cast to another kind of binary type if the origin one is already a binary type
+            (Self::Binary, Binary | LargeBinary | BinaryView) => origin.to_owned(),
             (Self::Binary, data_type) if can_cast_types(data_type, &BinaryView) => {
                 BinaryView
             }
@@ -352,10 +434,10 @@ impl LogicalType for NativeType {
             }
             _ => {
                 return _internal_err!(
-                "Unavailable default cast for native type {:?} from physical type {:?}",
-                self,
-                origin
-            )
+                    "Unavailable default cast for native type {} from physical type {}",
+                    self,
+                    origin
+                );
             }
         })
     }
@@ -407,7 +489,10 @@ impl From<DataType> for NativeType {
             DataType::Union(union_fields, _) => {
                 Union(LogicalUnionFields::from(&union_fields))
             }
-            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
+            DataType::Decimal32(p, s)
+            | DataType::Decimal64(p, s)
+            | DataType::Decimal128(p, s)
+            | DataType::Decimal256(p, s) => Decimal(p, s),
             DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
             DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
             DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
@@ -418,22 +503,7 @@ impl From<DataType> for NativeType {
 impl NativeType {
     #[inline]
     pub fn is_numeric(&self) -> bool {
-        use NativeType::*;
-        matches!(
-            self,
-            UInt8
-                | UInt16
-                | UInt32
-                | UInt64
-                | Int8
-                | Int16
-                | Int32
-                | Int64
-                | Float16
-                | Float32
-                | Float64
-                | Decimal(_, _)
-        )
+        self.is_integer() || self.is_float() || self.is_decimal()
     }
 
     #[inline]
@@ -452,7 +522,7 @@ impl NativeType {
 
     #[inline]
     pub fn is_date(&self) -> bool {
-        matches!(self, NativeType::Date)
+        *self == NativeType::Date
     }
 
     #[inline]
@@ -469,4 +539,111 @@ impl NativeType {
     pub fn is_duration(&self) -> bool {
         matches!(self, NativeType::Duration(_))
     }
+
+    #[inline]
+    pub fn is_binary(&self) -> bool {
+        matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_))
+    }
+
+    #[inline]
+    pub fn is_null(&self) -> bool {
+        *self == NativeType::Null
+    }
+
+    #[inline]
+    pub fn is_decimal(&self) -> bool {
+        matches!(self, Self::Decimal(_, _))
+    }
+
+    #[inline]
+    pub fn is_float(&self) -> bool {
+        matches!(self, Self::Float16 | Self::Float32 | Self::Float64)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::LogicalField;
+    use arrow::datatypes::Field;
+    use insta::assert_snapshot;
+
+    #[test]
+    fn test_native_type_display() {
+        assert_snapshot!(NativeType::Null, @"Null");
+        assert_snapshot!(NativeType::Boolean, @"Boolean");
+        assert_snapshot!(NativeType::Int8, @"Int8");
+        assert_snapshot!(NativeType::Int16, @"Int16");
+        assert_snapshot!(NativeType::Int32, @"Int32");
+        assert_snapshot!(NativeType::Int64, @"Int64");
+        assert_snapshot!(NativeType::UInt8, @"UInt8");
+        assert_snapshot!(NativeType::UInt16, @"UInt16");
+        assert_snapshot!(NativeType::UInt32, @"UInt32");
+        assert_snapshot!(NativeType::UInt64, @"UInt64");
+        assert_snapshot!(NativeType::Float16, @"Float16");
+        assert_snapshot!(NativeType::Float32, @"Float32");
+        assert_snapshot!(NativeType::Float64, @"Float64");
+        assert_snapshot!(NativeType::Date, @"Date");
+        assert_snapshot!(NativeType::Binary, @"Binary");
+        assert_snapshot!(NativeType::String, @"String");
+        assert_snapshot!(NativeType::FixedSizeBinary(16), @"FixedSizeBinary(16)");
+        assert_snapshot!(NativeType::Decimal(10, 2), @"Decimal(10, 2)");
+    }
+
+    #[test]
+    fn test_native_type_display_timestamp() {
+        assert_snapshot!(
+            NativeType::Timestamp(TimeUnit::Second, None),
+            @"Timestamp(s)"
+        );
+        assert_snapshot!(
+            NativeType::Timestamp(TimeUnit::Millisecond, None),
+            @"Timestamp(ms)"
+        );
+        assert_snapshot!(
+            NativeType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("UTC"))),
+            @r#"Timestamp(ns, "UTC")"#
+        );
+    }
+
+    #[test]
+    fn test_native_type_display_time_duration_interval() {
+        assert_snapshot!(NativeType::Time(TimeUnit::Microsecond), @"Time(µs)");
+        assert_snapshot!(NativeType::Duration(TimeUnit::Nanosecond), @"Duration(ns)");
+        assert_snapshot!(NativeType::Interval(IntervalUnit::YearMonth), @"Interval(YearMonth)");
+        assert_snapshot!(NativeType::Interval(IntervalUnit::MonthDayNano), @"Interval(MonthDayNano)");
+    }
+
+    #[test]
+    fn test_native_type_display_nested() {
+        let list = NativeType::List(Arc::new(LogicalField::from(&Field::new(
+            "item",
+            DataType::Int32,
+            true,
+        ))));
+        assert_snapshot!(list, @"List(Int32)");
+
+        let fixed_list = NativeType::FixedSizeList(
+            Arc::new(LogicalField::from(&Field::new(
+                "item",
+                DataType::Float64,
+                false,
+            ))),
+            3,
+        );
+        assert_snapshot!(fixed_list, @"FixedSizeList(3 x non-null Float64)");
+
+        let struct_type = NativeType::Struct(LogicalFields::from(&Fields::from(vec![
+            Field::new("name", DataType::Utf8, false),
+            Field::new("age", DataType::Int32, true),
+        ])));
+        assert_snapshot!(struct_type, @r#"Struct("name": non-null String, "age": Int32)"#);
+
+        let map = NativeType::Map(Arc::new(LogicalField::from(&Field::new(
+            "entries",
+            DataType::Utf8,
+            false,
+        ))));
+        assert_snapshot!(map, @"Map(non-null String)");
+    }
 }
diff --git a/datafusion/common/src/utils/aggregate.rs b/datafusion/common/src/utils/aggregate.rs
new file mode 100644
index 0000000000000..43bc0676b2d3c
--- /dev/null
+++ b/datafusion/common/src/utils/aggregate.rs
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Scalar-level aggregation utilities for statistics merging.
+//!
+//! Provides a cheap pairwise [`ScalarValue`] addition that directly
+//! extracts inner primitive values, avoiding the expensive
+//! `ScalarValue::add` path (which round-trips through Arrow arrays).
+use arrow::datatypes::i256;
+
+use crate::stats::Precision;
+use crate::{Result, ScalarValue};
+
+/// Saturating addition for [`i256`] (which lacks a built-in
+/// `saturating_add`).  Returns `i256::MAX` on positive overflow and
+/// `i256::MIN` on negative overflow.
+#[inline]
+fn i256_saturating_add(a: i256, b: i256) -> i256 {
+    match a.checked_add(b) {
+        Some(sum) => sum,
+        None => {
+            // If b is non-negative the overflow is positive, otherwise
+            // negative.
+            if b >= i256::ZERO {
+                i256::MAX
+            } else {
+                i256::MIN
+            }
+        }
+    }
+}
+
+/// Add two [`ScalarValue`]s by directly extracting and adding their
+/// inner primitive values.
+///
+/// This avoids `ScalarValue::add` which converts both operands to
+/// single-element Arrow arrays, runs the `add_wrapping` kernel, and
+/// converts the result back — 3 heap allocations per call.
+///
+/// For non-primitive types, falls back to `ScalarValue::add`.
+pub(crate) fn scalar_add(lhs: &ScalarValue, rhs: &ScalarValue) -> Result<ScalarValue> {
+    macro_rules! add_int {
+        ($lhs:expr, $rhs:expr, $VARIANT:ident) => {
+            match ($lhs, $rhs) {
+                (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => {
+                    Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b))))
+                }
+                (ScalarValue::$VARIANT(None), other)
+                | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()),
+                _ => unreachable!(),
+            }
+        };
+    }
+
+    macro_rules! add_decimal {
+        ($lhs:expr, $rhs:expr, $VARIANT:ident) => {
+            match ($lhs, $rhs) {
+                (
+                    ScalarValue::$VARIANT(Some(a), p, s),
+                    ScalarValue::$VARIANT(Some(b), _, _),
+                ) => Ok(ScalarValue::$VARIANT(Some(a.saturating_add(*b)), *p, *s)),
+                (ScalarValue::$VARIANT(None, _, _), other)
+                | (other, ScalarValue::$VARIANT(None, _, _)) => Ok(other.clone()),
+                _ => unreachable!(),
+            }
+        };
+    }
+
+    macro_rules! add_float {
+        ($lhs:expr, $rhs:expr, $VARIANT:ident) => {
+            match ($lhs, $rhs) {
+                (ScalarValue::$VARIANT(Some(a)), ScalarValue::$VARIANT(Some(b))) => {
+                    Ok(ScalarValue::$VARIANT(Some(*a + *b)))
+                }
+                (ScalarValue::$VARIANT(None), other)
+                | (other, ScalarValue::$VARIANT(None)) => Ok(other.clone()),
+                _ => unreachable!(),
+            }
+        };
+    }
+
+    match lhs {
+        ScalarValue::Int8(_) => add_int!(lhs, rhs, Int8),
+        ScalarValue::Int16(_) => add_int!(lhs, rhs, Int16),
+        ScalarValue::Int32(_) => add_int!(lhs, rhs, Int32),
+        ScalarValue::Int64(_) => add_int!(lhs, rhs, Int64),
+        ScalarValue::UInt8(_) => add_int!(lhs, rhs, UInt8),
+        ScalarValue::UInt16(_) => add_int!(lhs, rhs, UInt16),
+        ScalarValue::UInt32(_) => add_int!(lhs, rhs, UInt32),
+        ScalarValue::UInt64(_) => add_int!(lhs, rhs, UInt64),
+        ScalarValue::Float16(_) => add_float!(lhs, rhs, Float16),
+        ScalarValue::Float32(_) => add_float!(lhs, rhs, Float32),
+        ScalarValue::Float64(_) => add_float!(lhs, rhs, Float64),
+        ScalarValue::Decimal32(_, _, _) => add_decimal!(lhs, rhs, Decimal32),
+        ScalarValue::Decimal64(_, _, _) => add_decimal!(lhs, rhs, Decimal64),
+        ScalarValue::Decimal128(_, _, _) => add_decimal!(lhs, rhs, Decimal128),
+        ScalarValue::Decimal256(_, _, _) => match (lhs, rhs) {
+            (
+                ScalarValue::Decimal256(Some(a), p, s),
+                ScalarValue::Decimal256(Some(b), _, _),
+            ) => Ok(ScalarValue::Decimal256(
+                Some(i256_saturating_add(*a, *b)),
+                *p,
+                *s,
+            )),
+            (ScalarValue::Decimal256(None, _, _), other)
+            | (other, ScalarValue::Decimal256(None, _, _)) => Ok(other.clone()),
+            _ => unreachable!(),
+        },
+        // Fallback: use the existing ScalarValue::add
+        _ => lhs.add(rhs),
+    }
+}
+
+/// [`Precision`]-aware sum of two [`ScalarValue`] precisions using
+/// cheap direct addition via [`scalar_add`].
+///
+/// Mirrors the semantics of `Precision<ScalarValue>::add` but avoids
+/// the expensive `ScalarValue::add` round-trip through Arrow arrays.
+pub(crate) fn precision_add(
+    lhs: &Precision<ScalarValue>,
+    rhs: &Precision<ScalarValue>,
+) -> Precision<ScalarValue> {
+    match (lhs, rhs) {
+        (Precision::Exact(a), Precision::Exact(b)) => scalar_add(a, b)
+            .map(Precision::Exact)
+            .unwrap_or(Precision::Absent),
+        (Precision::Inexact(a), Precision::Exact(b))
+        | (Precision::Exact(a), Precision::Inexact(b))
+        | (Precision::Inexact(a), Precision::Inexact(b)) => scalar_add(a, b)
+            .map(Precision::Inexact)
+            .unwrap_or(Precision::Absent),
+        (_, _) => Precision::Absent,
+    }
+}
diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs
index 7ac081e0beb84..78ec434d2b577 100644
--- a/datafusion/common/src/utils/memory.rs
+++ b/datafusion/common/src/utils/memory.rs
@@ -17,8 +17,11 @@
 
 //! This module provides a function to estimate the memory size of a HashTable prior to allocation
 
-use crate::{DataFusionError, Result};
-use std::mem::size_of;
+use crate::error::_exec_datafusion_err;
+use crate::{HashSet, Result};
+use arrow::array::ArrayData;
+use arrow::record_batch::RecordBatch;
+use std::{mem::size_of, ptr::NonNull};
 
 /// Estimates the memory size required for a hash table prior to allocation.
 ///
@@ -36,7 +39,7 @@ use std::mem::size_of;
 ///     buckets.
 ///   - One byte overhead for each bucket.
 ///   - The fixed size overhead of the collection.
-/// - If the estimation overflows, we return a [`DataFusionError`]
+/// - If the estimation overflows, we return a [`crate::error::DataFusionError`]
 ///
 /// # Examples
 /// ---
@@ -55,8 +58,8 @@ use std::mem::size_of;
 /// impl<T> MyStruct<T> {
 ///     fn size(&self) -> Result<usize> {
 ///         let num_elements = self.values.len();
-///         let fixed_size = std::mem::size_of_val(self) +
-///           std::mem::size_of_val(&self.values);
+///         let fixed_size =
+///             std::mem::size_of_val(self) + std::mem::size_of_val(&self.values);
 ///
 ///         estimate_memory_size::<T>(num_elements, fixed_size)
 ///     }
@@ -72,8 +75,8 @@ use std::mem::size_of;
 /// let num_rows = 100;
 /// let fixed_size = std::mem::size_of::<HashMap<u64, u64>>();
 /// let estimated_hashtable_size =
-///   estimate_memory_size::<(u64, u64)>(num_rows,fixed_size)
-///     .expect("Size estimation failed");
+///     estimate_memory_size::<(u64, u64)>(num_rows, fixed_size)
+///         .expect("Size estimation failed");
 /// ```
 pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result<usize> {
     // For the majority of cases hashbrown overestimates the bucket quantity
@@ -94,12 +97,78 @@ pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result
                 .checked_add(fixed_size)
         })
         .ok_or_else(|| {
-            DataFusionError::Execution(
-                "usize overflow while estimating the number of buckets".to_string(),
-            )
+            _exec_datafusion_err!("usize overflow while estimating the number of buckets")
         })
 }
 
+/// Calculate total used memory of this batch.
+///
+/// This function is used to estimate the physical memory usage of the `RecordBatch`.
+/// It only counts the memory of large data `Buffer`s, and ignores metadata like
+/// types and pointers.
+/// The implementation will add up all unique `Buffer`'s memory
+/// size, due to:
+/// - The data pointer inside `Buffer` are memory regions returned by global memory
+///   allocator, those regions can't have overlap.
+/// - The actual used range of `ArrayRef`s inside `RecordBatch` can have overlap
+///   or reuse the same `Buffer`. For example: taking a slice from `Array`.
+///
+/// Example:
+/// For a `RecordBatch` with two columns: `col1` and `col2`, two columns are pointing
+/// to a sub-region of the same buffer.
+///
+/// {xxxxxxxxxxxxxxxxxxx} <--- buffer
+///       ^    ^  ^    ^
+///       |    |  |    |
+/// col1->{    }  |    |
+/// col2--------->{    }
+///
+/// In the above case, `get_record_batch_memory_size` will return the size of
+/// the buffer, instead of the sum of `col1` and `col2`'s actual memory size.
+///
+/// Note: Current `RecordBatch`.get_array_memory_size()` will double count the
+/// buffer memory size if multiple arrays within the batch are sharing the same
+/// `Buffer`. This method provides temporary fix until the issue is resolved:
+/// <https://github.com/apache/arrow-rs/issues/6439>
+pub fn get_record_batch_memory_size(batch: &RecordBatch) -> usize {
+    // Store pointers to `Buffer`'s start memory address (instead of actual
+    // used data region's pointer represented by current `Array`)
+    let mut counted_buffers: HashSet<NonNull<u8>> = HashSet::new();
+    let mut total_size = 0;
+
+    for array in batch.columns() {
+        let array_data = array.to_data();
+        count_array_data_memory_size(&array_data, &mut counted_buffers, &mut total_size);
+    }
+
+    total_size
+}
+
+/// Count the memory usage of `array_data` and its children recursively.
+fn count_array_data_memory_size(
+    array_data: &ArrayData,
+    counted_buffers: &mut HashSet<NonNull<u8>>,
+    total_size: &mut usize,
+) {
+    // Count memory usage for `array_data`
+    for buffer in array_data.buffers() {
+        if counted_buffers.insert(buffer.data_ptr()) {
+            *total_size += buffer.capacity();
+        } // Otherwise the buffer's memory is already counted
+    }
+
+    if let Some(null_buffer) = array_data.nulls()
+        && counted_buffers.insert(null_buffer.inner().inner().data_ptr())
+    {
+        *total_size += null_buffer.inner().inner().capacity();
+    }
+
+    // Count all children `ArrayData` recursively
+    for child in array_data.child_data() {
+        count_array_data_memory_size(child, counted_buffers, total_size);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::{collections::HashSet, mem::size_of};
@@ -133,3 +202,129 @@ mod tests {
         assert!(estimated.is_err());
     }
 }
+
+#[cfg(test)]
+mod record_batch_tests {
+    use super::*;
+    use arrow::array::{Float64Array, Int32Array, ListArray};
+    use arrow::datatypes::{DataType, Field, Int32Type, Schema};
+    use std::sync::Arc;
+
+    #[test]
+    fn test_get_record_batch_memory_size() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("ints", DataType::Int32, true),
+            Field::new("float64", DataType::Float64, false),
+        ]));
+
+        let int_array =
+            Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
+        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_array), Arc::new(float64_array)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 60);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_with_null() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("ints", DataType::Int32, true),
+            Field::new("float64", DataType::Float64, false),
+        ]));
+
+        let int_array = Int32Array::from(vec![None, Some(2), Some(3)]);
+        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_array), Arc::new(float64_array)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 100);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_empty() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "ints",
+            DataType::Int32,
+            false,
+        )]));
+
+        let int_array: Int32Array = Int32Array::from(vec![] as Vec<i32>);
+        let batch = RecordBatch::try_new(schema, vec![Arc::new(int_array)]).unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 0, "Empty batch should have 0 memory size");
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_shared_buffer() {
+        let original = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        let slice1 = original.slice(0, 3);
+        let slice2 = original.slice(2, 3);
+
+        let schema_origin = Arc::new(Schema::new(vec![Field::new(
+            "origin_col",
+            DataType::Int32,
+            false,
+        )]));
+        let batch_origin =
+            RecordBatch::try_new(schema_origin, vec![Arc::new(original)]).unwrap();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("slice1", DataType::Int32, false),
+            Field::new("slice2", DataType::Int32, false),
+        ]));
+
+        let batch_sliced =
+            RecordBatch::try_new(schema, vec![Arc::new(slice1), Arc::new(slice2)])
+                .unwrap();
+
+        let size_origin = get_record_batch_memory_size(&batch_origin);
+        let size_sliced = get_record_batch_memory_size(&batch_sliced);
+
+        assert_eq!(size_origin, size_sliced);
+    }
+
+    #[test]
+    fn test_get_record_batch_memory_size_nested_array() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "nested_int",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                false,
+            ),
+            Field::new(
+                "nested_int2",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                false,
+            ),
+        ]));
+
+        let int_list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2), Some(3)]),
+        ]);
+
+        let int_list_array2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(4), Some(5), Some(6)]),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(int_list_array), Arc::new(int_list_array2)],
+        )
+        .unwrap();
+
+        let size = get_record_batch_memory_size(&batch);
+        assert_eq!(size, 8208);
+    }
+}
diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs
index 409f248621f7f..075a189c371dc 100644
--- a/datafusion/common/src/utils/mod.rs
+++ b/datafusion/common/src/utils/mod.rs
@@ -17,25 +17,26 @@
 
 //! This module provides the bisect function, which implements binary search.
 
+pub(crate) mod aggregate;
 pub mod expr;
 pub mod memory;
 pub mod proxy;
 pub mod string_utils;
 
-use crate::error::{_exec_datafusion_err, _internal_datafusion_err, _internal_err};
-use crate::{DataFusionError, Result, ScalarValue};
+use crate::assert_or_internal_err;
+use crate::error::{_exec_datafusion_err, _internal_datafusion_err};
+use crate::{Result, ScalarValue};
 use arrow::array::{
-    cast::AsArray, Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray,
-    OffsetSizeTrait,
+    Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray, OffsetSizeTrait,
+    cast::AsArray,
 };
 use arrow::buffer::OffsetBuffer;
-use arrow::compute::{partition, SortColumn, SortOptions};
+use arrow::compute::{SortColumn, SortOptions, partition};
 use arrow::datatypes::{DataType, Field, SchemaRef};
-use sqlparser::ast::Ident;
-use sqlparser::dialect::GenericDialect;
-use sqlparser::parser::Parser;
+#[cfg(feature = "sql")]
+use sqlparser::{ast::Ident, dialect::GenericDialect, parser::Parser};
 use std::borrow::{Borrow, Cow};
-use std::cmp::{min, Ordering};
+use std::cmp::{Ordering, min};
 use std::collections::HashSet;
 use std::num::NonZero;
 use std::ops::Range;
@@ -47,36 +48,33 @@ use std::thread::available_parallelism;
 ///
 /// Example:
 /// ```
-/// use arrow::datatypes::{SchemaRef, Schema, Field, DataType};
+/// use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 /// use datafusion_common::project_schema;
 ///
 /// // Schema with columns 'a', 'b', and 'c'
 /// let schema = SchemaRef::new(Schema::new(vec![
-///   Field::new("a", DataType::Int32, true),
-///   Field::new("b", DataType::Int64, true),
-///   Field::new("c", DataType::Utf8, true),
+///     Field::new("a", DataType::Int32, true),
+///     Field::new("b", DataType::Int64, true),
+///     Field::new("c", DataType::Utf8, true),
 /// ]));
 ///
 /// // Pick columns 'c' and 'b'
-/// let projection = Some(vec![2,1]);
-/// let projected_schema = project_schema(
-///    &schema,
-///    projection.as_ref()
-///  ).unwrap();
+/// let projection = Some(vec![2, 1]);
+/// let projected_schema = project_schema(&schema, projection.as_ref()).unwrap();
 ///
 /// let expected_schema = SchemaRef::new(Schema::new(vec![
-///   Field::new("c", DataType::Utf8, true),
-///   Field::new("b", DataType::Int64, true),
+///     Field::new("c", DataType::Utf8, true),
+///     Field::new("b", DataType::Int64, true),
 /// ]));
 ///
 /// assert_eq!(projected_schema, expected_schema);
 /// ```
 pub fn project_schema(
     schema: &SchemaRef,
-    projection: Option<&Vec<usize>>,
+    projection: Option<&impl AsRef<[usize]>>,
 ) -> Result<SchemaRef> {
     let schema = match projection {
-        Some(columns) => Arc::new(schema.project(columns)?),
+        Some(columns) => Arc::new(schema.project(columns.as_ref())?),
         None => Arc::clone(schema),
     };
     Ok(schema)
@@ -120,14 +118,13 @@ pub fn compare_rows(
         let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first) {
             (true, false, false) | (false, true, true) => Ordering::Greater,
             (true, false, true) | (false, true, false) => Ordering::Less,
-            (false, false, _) => if sort_options.descending {
-                rhs.partial_cmp(lhs)
-            } else {
-                lhs.partial_cmp(rhs)
+            (false, false, _) => {
+                if sort_options.descending {
+                    rhs.try_cmp(lhs)?
+                } else {
+                    lhs.try_cmp(rhs)?
+                }
             }
-            .ok_or_else(|| {
-                _internal_datafusion_err!("Column array shouldn't be empty")
-            })?,
             (true, true, _) => continue,
         };
         if result != Ordering::Equal {
@@ -149,9 +146,7 @@ pub fn bisect<const SIDE: bool>(
     let low: usize = 0;
     let high: usize = item_columns
         .first()
-        .ok_or_else(|| {
-            DataFusionError::Internal("Column array shouldn't be empty".to_string())
-        })?
+        .ok_or_else(|| _internal_datafusion_err!("Column array shouldn't be empty"))?
         .len();
     let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| {
         let cmp = compare_rows(current, target, sort_options)?;
@@ -200,9 +195,7 @@ pub fn linear_search<const SIDE: bool>(
     let low: usize = 0;
     let high: usize = item_columns
         .first()
-        .ok_or_else(|| {
-            DataFusionError::Internal("Column array shouldn't be empty".to_string())
-        })?
+        .ok_or_else(|| _internal_datafusion_err!("Column array shouldn't be empty"))?
         .len();
     let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| {
         let cmp = compare_rows(current, target, sort_options)?;
@@ -261,7 +254,7 @@ pub fn evaluate_partition_ranges(
 /// the identifier by replacing it with two double quotes
 ///
 /// e.g. identifier `tab.le"name` becomes `"tab.le""name"`
-pub fn quote_identifier(s: &str) -> Cow<str> {
+pub fn quote_identifier(s: &str) -> Cow<'_, str> {
     if needs_quotes(s) {
         Cow::Owned(format!("\"{}\"", s.replace('"', "\"\"")))
     } else {
@@ -274,15 +267,16 @@ fn needs_quotes(s: &str) -> bool {
     let mut chars = s.chars();
 
     // first char can not be a number unless escaped
-    if let Some(first_char) = chars.next() {
-        if !(first_char.is_ascii_lowercase() || first_char == '_') {
-            return true;
-        }
+    if let Some(first_char) = chars.next()
+        && !(first_char.is_ascii_lowercase() || first_char == '_')
+    {
+        return true;
     }
 
     !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
 }
 
+#[cfg(feature = "sql")]
 pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<Ident>> {
     let dialect = GenericDialect;
     let mut parser = Parser::new(&dialect).try_with_sql(s)?;
@@ -290,6 +284,10 @@ pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<Ident>> {
     Ok(idents)
 }
 
+/// Parse a string into a vector of identifiers.
+///
+/// Note: If ignore_case is false, the string will be normalized to lowercase.
+#[cfg(feature = "sql")]
 pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec<String> {
     parse_identifiers(s)
         .unwrap_or_default()
@@ -302,6 +300,59 @@ pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec<St
         .collect::<Vec<_>>()
 }
 
+#[cfg(not(feature = "sql"))]
+pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<String>> {
+    let mut result = Vec::new();
+    let mut current = String::new();
+    let mut in_quotes = false;
+
+    for ch in s.chars() {
+        match ch {
+            '"' => {
+                in_quotes = !in_quotes;
+                current.push(ch);
+            }
+            '.' if !in_quotes => {
+                result.push(current.clone());
+                current.clear();
+            }
+            _ => {
+                current.push(ch);
+            }
+        }
+    }
+
+    // Push the last part if it's not empty
+    if !current.is_empty() {
+        result.push(current);
+    }
+
+    Ok(result)
+}
+
+#[cfg(not(feature = "sql"))]
+pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec<String> {
+    parse_identifiers(s)
+        .unwrap_or_default()
+        .into_iter()
+        .map(|id| {
+            let is_double_quoted = if id.len() > 2 {
+                let mut chars = id.chars();
+                chars.next() == Some('"') && chars.last() == Some('"')
+            } else {
+                false
+            };
+            if is_double_quoted {
+                id[1..id.len() - 1].to_string().replace("\"\"", "\"")
+            } else if ignore_case {
+                id
+            } else {
+                id.to_ascii_lowercase()
+            }
+        })
+        .collect::<Vec<_>>()
+}
+
 /// This function "takes" the elements at `indices` from the slice `items`.
 pub fn get_at_indices<T: Clone, I: Borrow<usize>>(
     items: &[T],
@@ -312,9 +363,7 @@ pub fn get_at_indices<T: Clone, I: Borrow<usize>>(
         .map(|idx| items.get(*idx.borrow()).cloned())
         .collect::<Option<Vec<T>>>()
         .ok_or_else(|| {
-            DataFusionError::Execution(
-                "Expects indices to be in the range of searched vector".to_string(),
-            )
+            _exec_datafusion_err!("Expects indices to be in the range of searched vector")
         })
 }
 
@@ -348,9 +397,11 @@ pub fn longest_consecutive_prefix<T: Borrow<usize>>(
 /// # use arrow::array::types::Int64Type;
 /// # use datafusion_common::utils::SingleRowListArrayBuilder;
 /// // Array is [1, 2, 3]
-/// let arr = ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
-///       Some(vec![Some(1), Some(2), Some(3)]),
-/// ]);
+/// let arr = ListArray::from_iter_primitive::<Int64Type, _, _>(vec![Some(vec![
+///     Some(1),
+///     Some(2),
+///     Some(3),
+/// ])]);
 /// // Wrap as a list array: [[1, 2, 3]]
 /// let list_arr = SingleRowListArrayBuilder::new(Arc::new(arr)).build_list_array();
 /// assert_eq!(list_arr.len(), 1);
@@ -445,94 +496,6 @@ impl SingleRowListArrayBuilder {
     }
 }
 
-/// Wrap an array into a single element `ListArray`.
-/// For example `[1, 2, 3]` would be converted into `[[1, 2, 3]]`
-/// The field in the list array is nullable.
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_list_array_nullable(arr: ArrayRef) -> ListArray {
-    SingleRowListArrayBuilder::new(arr)
-        .with_nullable(true)
-        .build_list_array()
-}
-
-/// Wrap an array into a single element `ListArray`.
-/// For example `[1, 2, 3]` would be converted into `[[1, 2, 3]]`
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_list_array(arr: ArrayRef, nullable: bool) -> ListArray {
-    SingleRowListArrayBuilder::new(arr)
-        .with_nullable(nullable)
-        .build_list_array()
-}
-
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_list_array_with_field_name(
-    arr: ArrayRef,
-    nullable: bool,
-    field_name: &str,
-) -> ListArray {
-    SingleRowListArrayBuilder::new(arr)
-        .with_nullable(nullable)
-        .with_field_name(Some(field_name.to_string()))
-        .build_list_array()
-}
-
-/// Wrap an array into a single element `LargeListArray`.
-/// For example `[1, 2, 3]` would be converted into `[[1, 2, 3]]`
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_large_list_array(arr: ArrayRef) -> LargeListArray {
-    SingleRowListArrayBuilder::new(arr).build_large_list_array()
-}
-
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_large_list_array_with_field_name(
-    arr: ArrayRef,
-    field_name: &str,
-) -> LargeListArray {
-    SingleRowListArrayBuilder::new(arr)
-        .with_field_name(Some(field_name.to_string()))
-        .build_large_list_array()
-}
-
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_fixed_size_list_array(
-    arr: ArrayRef,
-    list_size: usize,
-) -> FixedSizeListArray {
-    SingleRowListArrayBuilder::new(arr).build_fixed_size_list_array(list_size)
-}
-
-#[deprecated(
-    since = "44.0.0",
-    note = "please use `SingleRowListArrayBuilder` instead"
-)]
-pub fn array_into_fixed_size_list_array_with_field_name(
-    arr: ArrayRef,
-    list_size: usize,
-    field_name: &str,
-) -> FixedSizeListArray {
-    SingleRowListArrayBuilder::new(arr)
-        .with_field_name(Some(field_name.to_string()))
-        .build_fixed_size_list_array(list_size)
-}
-
 /// Wrap arrays into a single element `ListArray`.
 ///
 /// Example:
@@ -554,13 +517,12 @@ pub fn array_into_fixed_size_list_array_with_field_name(
 /// );
 ///
 /// assert_eq!(list_arr, expected);
+/// ```
 pub fn arrays_into_list_array(
     arr: impl IntoIterator<Item = ArrayRef>,
 ) -> Result<ListArray> {
     let arr = arr.into_iter().collect::<Vec<_>>();
-    if arr.is_empty() {
-        return _internal_err!("Cannot wrap empty array into list array");
-    }
+    assert_or_internal_err!(!arr.is_empty(), "Cannot wrap empty array into list array");
 
     let lens = arr.iter().map(|x| x.len()).collect::<Vec<_>>();
     // Assume data type is consistent
@@ -592,7 +554,8 @@ pub fn fixed_size_list_to_arrays(a: &ArrayRef) -> Vec<ArrayRef> {
 /// use datafusion_common::utils::base_type;
 /// use std::sync::Arc;
 ///
-/// let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
+/// let data_type =
+///     DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
 /// assert_eq!(base_type(&data_type), DataType::Int32);
 ///
 /// let data_type = DataType::Int32;
@@ -626,6 +589,7 @@ pub enum ListCoercion {
 /// let base_type = DataType::Float64;
 /// let coerced_type = coerced_type_with_base_type_only(&data_type, &base_type, None);
 /// assert_eq!(coerced_type, DataType::List(Arc::new(Field::new_list_field(DataType::Float64, true))));
+/// ```
 pub fn coerced_type_with_base_type_only(
     data_type: &DataType,
     base_type: &DataType,
@@ -732,10 +696,14 @@ pub mod datafusion_strsim {
     }
 
     /// Calculates the minimum number of insertions, deletions, and substitutions
-    /// required to change one sequence into the other.
-    fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
+    /// required to change one sequence into the other, using a reusable cache buffer.
+    ///
+    /// This is the generic implementation that works with any iterator types.
+    /// The `cache` buffer will be resized as needed and reused across calls.
+    fn generic_levenshtein_with_buffer<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
         a: &'a Iter1,
         b: &'b Iter2,
+        cache: &mut Vec<usize>,
     ) -> usize
     where
         &'a Iter1: IntoIterator<Item = Elem1>,
@@ -748,7 +716,9 @@ pub mod datafusion_strsim {
             return b_len;
         }
 
-        let mut cache: Vec<usize> = (1..b_len + 1).collect();
+        // Resize cache to fit b_len elements
+        cache.clear();
+        cache.extend(1..=b_len);
 
         let mut result = 0;
 
@@ -768,6 +738,21 @@ pub mod datafusion_strsim {
         result
     }
 
+    /// Calculates the minimum number of insertions, deletions, and substitutions
+    /// required to change one sequence into the other.
+    fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
+        a: &'a Iter1,
+        b: &'b Iter2,
+    ) -> usize
+    where
+        &'a Iter1: IntoIterator<Item = Elem1>,
+        &'b Iter2: IntoIterator<Item = Elem2>,
+        Elem1: PartialEq<Elem2>,
+    {
+        let mut cache = Vec::new();
+        generic_levenshtein_with_buffer(a, b, &mut cache)
+    }
+
     /// Calculates the minimum number of insertions, deletions, and substitutions
     /// required to change one string into the other.
     ///
@@ -780,6 +765,15 @@ pub mod datafusion_strsim {
         generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
     }
 
+    /// Calculates the Levenshtein distance using a reusable cache buffer.
+    /// This avoids allocating a new Vec for each call, improving performance
+    /// when computing many distances.
+    ///
+    /// The `cache` buffer will be resized as needed and reused across calls.
+    pub fn levenshtein_with_buffer(a: &str, b: &str, cache: &mut Vec<usize>) -> usize {
+        generic_levenshtein_with_buffer(&StringWrapper(a), &StringWrapper(b), cache)
+    }
+
     /// Calculates the normalized Levenshtein distance between two strings.
     /// The normalized distance is a value between 0.0 and 1.0, where 1.0 indicates
     /// that the strings are identical and 0.0 indicates no similarity.
@@ -833,21 +827,6 @@ pub fn set_difference<T: Borrow<usize>, S: Borrow<usize>>(
         .collect()
 }
 
-/// Checks whether the given index sequence is monotonically non-decreasing.
-#[deprecated(since = "45.0.0", note = "Use std::Iterator::is_sorted instead")]
-pub fn is_sorted<T: Borrow<usize>>(sequence: impl IntoIterator<Item = T>) -> bool {
-    // TODO: Remove this function when `is_sorted` graduates from Rust nightly.
-    let mut previous = 0;
-    for item in sequence.into_iter() {
-        let current = *item.borrow();
-        if current < previous {
-            return false;
-        }
-        previous = current;
-    }
-    true
-}
-
 /// Find indices of each element in `targets` inside `items`. If one of the
 /// elements is absent in `items`, returns an error.
 pub fn find_indices<T: PartialEq, S: Borrow<T>>(
@@ -858,7 +837,7 @@ pub fn find_indices<T: PartialEq, S: Borrow<T>>(
         .into_iter()
         .map(|target| items.iter().position(|e| target.borrow().eq(e)))
         .collect::<Option<_>>()
-        .ok_or_else(|| DataFusionError::Execution("Target not found".to_string()))
+        .ok_or_else(|| _exec_datafusion_err!("Target not found"))
 }
 
 /// Transposes the given vector of vectors.
@@ -950,7 +929,7 @@ pub fn get_available_parallelism() -> usize {
         .get()
 }
 
-/// Converts a collection of function arguments into an fixed-size array of length N
+/// Converts a collection of function arguments into a fixed-size array of length N
 /// producing a reasonable error message in case of unexpected number of arguments.
 ///
 /// # Example
@@ -959,16 +938,19 @@ pub fn get_available_parallelism() -> usize {
 /// # use datafusion_common::utils::take_function_args;
 /// # use datafusion_common::ScalarValue;
 /// fn my_function(args: &[ScalarValue]) -> Result<()> {
-///   // function expects 2 args, so create a 2-element array
-///   let [arg1, arg2] = take_function_args("my_function", args)?;
-///   // ... do stuff..
-///   Ok(())
+///     // function expects 2 args, so create a 2-element array
+///     let [arg1, arg2] = take_function_args("my_function", args)?;
+///     // ... do stuff..
+///     Ok(())
 /// }
 ///
 /// // Calling the function with 1 argument produces an error:
 /// let args = vec![ScalarValue::Int32(Some(10))];
 /// let err = my_function(&args).unwrap_err();
-/// assert_eq!(err.to_string(), "Execution error: my_function function requires 2 arguments, got 1");
+/// assert_eq!(
+///     err.to_string(),
+///     "Execution error: my_function function requires 2 arguments, got 1"
+/// );
 /// // Calling the function with 2 arguments works great
 /// let args = vec![ScalarValue::Int32(Some(10)), ScalarValue::Int32(Some(20))];
 /// my_function(&args).unwrap();
@@ -994,7 +976,6 @@ mod tests {
     use super::*;
     use crate::ScalarValue::Null;
     use arrow::array::Float64Array;
-    use sqlparser::tokenizer::Span;
 
     #[test]
     fn test_bisect_linear_left_and_right() -> Result<()> {
@@ -1190,6 +1171,7 @@ mod tests {
         Ok(())
     }
 
+    #[cfg(feature = "sql")]
     #[test]
     fn test_quote_identifier() -> Result<()> {
         let cases = vec![
@@ -1222,7 +1204,7 @@ mod tests {
             let expected_parsed = vec![Ident {
                 value: identifier.to_string(),
                 quote_style,
-                span: Span::empty(),
+                span: sqlparser::tokenizer::Span::empty(),
             }];
 
             assert_eq!(
@@ -1275,19 +1257,6 @@ mod tests {
         assert_eq!(set_difference([3, 4, 0], [4, 1, 2]), vec![3, 0]);
     }
 
-    #[test]
-    #[expect(deprecated)]
-    fn test_is_sorted() {
-        assert!(is_sorted::<usize>([]));
-        assert!(is_sorted([0]));
-        assert!(is_sorted([0, 3, 4]));
-        assert!(is_sorted([0, 1, 2]));
-        assert!(is_sorted([0, 1, 4]));
-        assert!(is_sorted([0usize; 0]));
-        assert!(is_sorted([1, 2]));
-        assert!(!is_sorted([3, 2]));
-    }
-
     #[test]
     fn test_find_indices() -> Result<()> {
         assert_eq!(find_indices(&[0, 3, 4], [0, 3, 4])?, vec![0, 1, 2]);
diff --git a/datafusion/common/src/utils/proxy.rs b/datafusion/common/src/utils/proxy.rs
index d940677a5fb3b..846c928515d60 100644
--- a/datafusion/common/src/utils/proxy.rs
+++ b/datafusion/common/src/utils/proxy.rs
@@ -15,12 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`VecAllocExt`] and [`RawTableAllocExt`] to help tracking of memory allocations
+//! [`VecAllocExt`] to help tracking of memory allocations
 
-use hashbrown::{
-    hash_table::HashTable,
-    raw::{Bucket, RawTable},
-};
+use hashbrown::hash_table::HashTable;
 use std::mem::size_of;
 
 /// Extension trait for [`Vec`] to account for allocations.
@@ -47,7 +44,9 @@ pub trait VecAllocExt {
     /// assert_eq!(allocated, 16); // no new allocation needed
     ///
     /// // push more data into the vec
-    /// for _ in 0..10 { vec.push_accounted(1, &mut allocated); }
+    /// for _ in 0..10 {
+    ///     vec.push_accounted(1, &mut allocated);
+    /// }
     /// assert_eq!(allocated, 64); // underlying vec has space for 10 u32s
     /// assert_eq!(vec.allocated_size(), 64);
     /// ```
@@ -82,7 +81,9 @@ pub trait VecAllocExt {
     /// assert_eq!(vec.allocated_size(), 16); // no new allocation needed
     ///
     /// // push more data into the vec
-    /// for _ in 0..10 { vec.push(1); }
+    /// for _ in 0..10 {
+    ///     vec.push(1);
+    /// }
     /// assert_eq!(vec.allocated_size(), 64); // space for 64 now
     /// ```
     fn allocated_size(&self) -> usize;
@@ -110,73 +111,6 @@ impl<T> VecAllocExt for Vec<T> {
     }
 }
 
-/// Extension trait for hash browns [`RawTable`] to account for allocations.
-pub trait RawTableAllocExt {
-    /// Item type.
-    type T;
-
-    /// [Insert](RawTable::insert) new element into table and increase
-    /// `accounting` by any newly allocated bytes.
-    ///
-    /// Returns the bucket where the element was inserted.
-    /// Note that allocation counts capacity, not size.
-    ///
-    /// # Example:
-    /// ```
-    /// # use datafusion_common::utils::proxy::RawTableAllocExt;
-    /// # use hashbrown::raw::RawTable;
-    /// let mut table = RawTable::new();
-    /// let mut allocated = 0;
-    /// let hash_fn = |x: &u32| (*x as u64) % 1000;
-    /// // pretend 0x3117 is the hash value for 1
-    /// table.insert_accounted(1, hash_fn, &mut allocated);
-    /// assert_eq!(allocated, 64);
-    ///
-    /// // insert more values
-    /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); }
-    /// assert_eq!(allocated, 400);
-    /// ```
-    fn insert_accounted(
-        &mut self,
-        x: Self::T,
-        hasher: impl Fn(&Self::T) -> u64,
-        accounting: &mut usize,
-    ) -> Bucket<Self::T>;
-}
-
-impl<T> RawTableAllocExt for RawTable<T> {
-    type T = T;
-
-    fn insert_accounted(
-        &mut self,
-        x: Self::T,
-        hasher: impl Fn(&Self::T) -> u64,
-        accounting: &mut usize,
-    ) -> Bucket<Self::T> {
-        let hash = hasher(&x);
-
-        match self.try_insert_no_grow(hash, x) {
-            Ok(bucket) => bucket,
-            Err(x) => {
-                // need to request more memory
-
-                let bump_elements = self.capacity().max(16);
-                let bump_size = bump_elements * size_of::<T>();
-                *accounting = (*accounting).checked_add(bump_size).expect("overflow");
-
-                self.reserve(bump_elements, hasher);
-
-                // still need to insert the element since first try failed
-                // Note: cannot use `.expect` here because `T` may not implement `Debug`
-                match self.try_insert_no_grow(hash, x) {
-                    Ok(bucket) => bucket,
-                    Err(_) => panic!("just grew the container"),
-                }
-            }
-        }
-    }
-}
-
 /// Extension trait for hash browns [`HashTable`] to account for allocations.
 pub trait HashTableAllocExt {
     /// Item type.
@@ -187,6 +121,8 @@ pub trait HashTableAllocExt {
     ///
     /// Returns the bucket where the element was inserted.
     /// Note that allocation counts capacity, not size.
+    /// Panics:
+    ///     Assumes the element is not already present, and may panic if it does
     ///
     /// # Example:
     /// ```
@@ -200,7 +136,9 @@ pub trait HashTableAllocExt {
     /// assert_eq!(allocated, 64);
     ///
     /// // insert more values
-    /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); }
+    /// for i in 2..100 {
+    ///     table.insert_accounted(i, hash_fn, &mut allocated);
+    /// }
     /// assert_eq!(allocated, 400);
     /// ```
     fn insert_accounted(
@@ -225,22 +163,24 @@ where
     ) {
         let hash = hasher(&x);
 
-        // NOTE: `find_entry` does NOT grow!
-        match self.find_entry(hash, |y| y == &x) {
-            Ok(_occupied) => {}
-            Err(_absent) => {
-                if self.len() == self.capacity() {
-                    // need to request more memory
-                    let bump_elements = self.capacity().max(16);
-                    let bump_size = bump_elements * size_of::<T>();
-                    *accounting = (*accounting).checked_add(bump_size).expect("overflow");
+        if cfg!(debug_assertions) {
+            // In debug mode, check that the element is not already present
+            debug_assert!(
+                self.find_entry(hash, |y| y == &x).is_err(),
+                "attempted to insert duplicate element into HashTableAllocExt::insert_accounted"
+            );
+        }
 
-                    self.reserve(bump_elements, &hasher);
-                }
+        if self.len() == self.capacity() {
+            // need to request more memory
+            let bump_elements = self.capacity().max(16);
+            let bump_size = bump_elements * size_of::<T>();
+            *accounting = (*accounting).checked_add(bump_size).expect("overflow");
 
-                // still need to insert the element since first try failed
-                self.entry(hash, |y| y == &x, hasher).insert(x);
-            }
+            self.reserve(bump_elements, &hasher);
         }
+
+        // We assume the element is not already present
+        self.insert_unique(hash, x, hasher);
     }
 }
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 03a9ec8f3f150..326b791a2f624 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -32,6 +32,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -43,10 +46,11 @@ array_expressions = ["nested_expressions"]
 avro = ["datafusion-common/avro", "datafusion-datasource-avro"]
 backtrace = ["datafusion-common/backtrace"]
 compression = [
-    "xz2",
+    "liblzma",
     "bzip2",
     "flate2",
     "zstd",
+    "datafusion-datasource-arrow/compression",
     "datafusion-datasource/compression",
 ]
 crypto_expressions = ["datafusion-functions/crypto_expressions"]
@@ -62,13 +66,19 @@ default = [
     "compression",
     "parquet",
     "recursive_protection",
+    "sql",
 ]
 encoding_expressions = ["datafusion-functions/encoding_expressions"]
 # Used for testing ONLY: causes all values to hash to the same value (test for collisions)
 force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
 math_expressions = ["datafusion-functions/math_expressions"]
 parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"]
-pyarrow = ["datafusion-common/pyarrow", "parquet"]
+parquet_encryption = [
+    "parquet",
+    "parquet/encryption",
+    "datafusion-common/parquet_encryption",
+    "datafusion-datasource-parquet/parquet_encryption",
+]
 regex_expressions = [
     "datafusion-functions/regex_expressions",
 ]
@@ -77,7 +87,9 @@ recursive_protection = [
     "datafusion-expr/recursive_protection",
     "datafusion-optimizer/recursive_protection",
     "datafusion-physical-optimizer/recursive_protection",
-    "datafusion-sql/recursive_protection",
+    "datafusion-physical-expr/recursive_protection",
+    "datafusion-sql?/recursive_protection",
+    "sqlparser?/recursive-protection",
 ]
 serde = [
     "dep:serde",
@@ -85,62 +97,66 @@ serde = [
     # statements in `arrow-schema` crate
     "arrow-schema/serde",
 ]
+sql = [
+    "datafusion-common/sql",
+    "datafusion-functions-nested?/sql",
+    "datafusion-sql",
+    "sqlparser",
+]
 string_expressions = ["datafusion-functions/string_expressions"]
 unicode_expressions = [
-    "datafusion-sql/unicode_expressions",
+    "datafusion-sql?/unicode_expressions",
     "datafusion-functions/unicode_expressions",
 ]
 extended_tests = []
 
 [dependencies]
 arrow = { workspace = true }
-arrow-ipc = { workspace = true }
 arrow-schema = { workspace = true, features = ["canonical_extension_types"] }
 async-trait = { workspace = true }
-bytes = { workspace = true }
-bzip2 = { version = "0.5.2", optional = true }
+bzip2 = { workspace = true, optional = true }
 chrono = { workspace = true }
 datafusion-catalog = { workspace = true }
 datafusion-catalog-listing = { workspace = true }
 datafusion-common = { workspace = true, features = ["object_store"] }
 datafusion-common-runtime = { workspace = true }
 datafusion-datasource = { workspace = true }
+datafusion-datasource-arrow = { workspace = true }
 datafusion-datasource-avro = { workspace = true, optional = true }
 datafusion-datasource-csv = { workspace = true }
 datafusion-datasource-json = { workspace = true }
 datafusion-datasource-parquet = { workspace = true, optional = true }
 datafusion-execution = { workspace = true }
-datafusion-expr = { workspace = true }
+datafusion-expr = { workspace = true, default-features = false }
 datafusion-expr-common = { workspace = true }
 datafusion-functions = { workspace = true }
 datafusion-functions-aggregate = { workspace = true }
-datafusion-functions-nested = { workspace = true, optional = true }
+datafusion-functions-nested = { workspace = true, default-features = false, optional = true }
 datafusion-functions-table = { workspace = true }
 datafusion-functions-window = { workspace = true }
 datafusion-optimizer = { workspace = true }
 datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-adapter = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-optimizer = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
-datafusion-sql = { workspace = true }
-flate2 = { version = "1.1.1", optional = true }
+datafusion-sql = { workspace = true, optional = true }
+flate2 = { workspace = true, optional = true }
 futures = { workspace = true }
 itertools = { workspace = true }
+liblzma = { workspace = true, optional = true }
 log = { workspace = true }
 object_store = { workspace = true }
 parking_lot = { workspace = true }
 parquet = { workspace = true, optional = true, default-features = true }
-rand = { workspace = true }
-regex = { workspace = true }
 serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
-sqlparser = { workspace = true }
+sqlparser = { workspace = true, optional = true }
 tempfile = { workspace = true }
 tokio = { workspace = true }
 url = { workspace = true }
-uuid = { version = "1.17", features = ["v4", "js"] }
-xz2 = { version = "0.1", optional = true, features = ["static"] }
-zstd = { version = "0.13", optional = true, default-features = false }
+uuid = { workspace = true, features = ["v4", "js"] }
+zstd = { workspace = true, optional = true }
 
 [dev-dependencies]
 async-trait = { workspace = true }
@@ -152,20 +168,26 @@ datafusion-functions-window-common = { workspace = true }
 datafusion-macros = { workspace = true }
 datafusion-physical-optimizer = { workspace = true }
 doc-comment = { workspace = true }
+bytes = { workspace = true }
 env_logger = { workspace = true }
+glob = { workspace = true }
 insta = { workspace = true }
-paste = "^1.0"
+pretty_assertions = "1.0"
 rand = { workspace = true, features = ["small_rng"] }
 rand_distr = "0.5"
+recursive = { workspace = true }
 regex = { workspace = true }
 rstest = { workspace = true }
 serde_json = { workspace = true }
-sysinfo = "0.35.1"
+sysinfo = "0.38.2"
 test-utils = { path = "../../test-utils" }
 tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] }
 
+[package.metadata.cargo-machete]
+ignored = ["datafusion-doc", "datafusion-macros", "dashmap"]
+
 [target.'cfg(not(target_os = "windows"))'.dev-dependencies]
-nix = { version = "0.30.1", features = ["fs"] }
+nix = { version = "0.31.1", features = ["fs"] }
 
 [[bench]]
 harness = false
@@ -203,6 +225,10 @@ name = "struct_query_sql"
 harness = false
 name = "window_query_sql"
 
+[[bench]]
+harness = false
+name = "topk_repartition"
+
 [[bench]]
 harness = false
 name = "scalar"
@@ -216,10 +242,23 @@ harness = false
 name = "parquet_query_sql"
 required-features = ["parquet"]
 
+[[bench]]
+harness = false
+name = "parquet_struct_query"
+required-features = ["parquet"]
+
+[[bench]]
+harness = false
+name = "range_and_generate_series"
+
 [[bench]]
 harness = false
 name = "sql_planner"
 
+[[bench]]
+harness = false
+name = "sql_planner_extended"
+
 [[bench]]
 harness = false
 name = "sql_query_with_io"
@@ -244,3 +283,12 @@ name = "dataframe"
 [[bench]]
 harness = false
 name = "spm"
+
+[[bench]]
+harness = false
+name = "preserve_file_partitioning"
+required-features = ["parquet"]
+
+[[bench]]
+harness = false
+name = "reset_plan_states"
diff --git a/datafusion/core/README.md b/datafusion/core/README.md
index b5501087d2647..859fcb9c0dff9 100644
--- a/datafusion/core/README.md
+++ b/datafusion/core/README.md
@@ -17,15 +17,12 @@
   under the License.
 -->
 
-# DataFusion Core
+<!--
+  Note the main crates.io landing page https://crates.io/crates/datafusion
+  uses the workspace README.md file, not this file
+-->
 
-DataFusion is an extensible query execution framework, written in Rust,
-that uses Apache Arrow as its in-memory format.
+# Apache DataFusion Core
 
 This crate contains the main entry points and high level DataFusion APIs such as
 `SessionContext`, `DataFrame` and `ListingTable`.
-
-For more information, please see:
-
-- [DataFusion Website](https://datafusion.apache.org)
-- [DataFusion API Docs](https://docs.rs/datafusion/latest/datafusion/)
diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
index 057a0e1d1b54c..402ac9c7176b5 100644
--- a/datafusion/core/benches/aggregate_query_sql.rs
+++ b/datafusion/core/benches/aggregate_query_sql.rs
@@ -15,23 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
-use crate::criterion::Criterion;
+
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::create_table_provider;
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use parking_lot::Mutex;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context(
@@ -153,6 +151,38 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
+    c.bench_function(
+        "aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions",
+        |b| {
+            b.iter(|| {
+                query(
+                    ctx.clone(),
+                    &rt,
+                    // Due to the large number of distinct values in u64_wide,
+                    // this query test the actual grouping performance for more than 1 column
+                    "SELECT u64_wide, utf8 \
+                 FROM t GROUP BY u64_wide, utf8",
+                )
+            })
+        },
+    );
+
+    c.bench_function(
+        "aggregate_query_group_by_wide_u64_and_f32_without_aggregate_expressions",
+        |b| {
+            b.iter(|| {
+                query(
+                    ctx.clone(),
+                    &rt,
+                    // Due to the large number of distinct values in u64_wide,
+                    // this query test the actual grouping performance for more than 1 column
+                    "SELECT u64_wide, f32 \
+                 FROM t GROUP BY u64_wide, f32",
+                )
+            })
+        },
+    );
+
     c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
         b.iter(|| {
             query(
@@ -221,6 +251,50 @@ fn criterion_benchmark(c: &mut Criterion) {
             )
         })
     });
+
+    c.bench_function("array_agg_query_group_by_few_groups", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT u64_narrow, array_agg(f64) \
+                 FROM t GROUP BY u64_narrow",
+            )
+        })
+    });
+
+    c.bench_function("array_agg_query_group_by_mid_groups", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT u64_mid, array_agg(f64) \
+                 FROM t GROUP BY u64_mid",
+            )
+        })
+    });
+
+    c.bench_function("array_agg_query_group_by_many_groups", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT u64_wide, array_agg(f64) \
+                 FROM t GROUP BY u64_wide",
+            )
+        })
+    });
+
+    c.bench_function("array_agg_struct_query_group_by_mid_groups", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT u64_mid, array_agg(named_struct('market', dict10, 'price', f64)) \
+                 FROM t GROUP BY u64_mid",
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/core/benches/csv_load.rs b/datafusion/core/benches/csv_load.rs
index 3f984757466d5..13843dadddd0c 100644
--- a/datafusion/core/benches/csv_load.rs
+++ b/datafusion/core/benches/csv_load.rs
@@ -15,23 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
-use crate::criterion::Criterion;
+
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use datafusion::prelude::CsvReadOptions;
 use datafusion::test_util::csv::TestCsvFile;
 use parking_lot::Mutex;
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 use test_utils::AccessLogGenerator;
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn load_csv(
     ctx: Arc<Mutex<SessionContext>>,
     rt: &Runtime,
@@ -39,7 +37,7 @@ fn load_csv(
     options: CsvReadOptions,
 ) {
     let df = rt.block_on(ctx.lock().read_csv(path, options)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context() -> Result<Arc<Mutex<SessionContext>>> {
diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs
index c0477b1306f75..728c6490c72bd 100644
--- a/datafusion/core/benches/data_utils/mod.rs
+++ b/datafusion/core/benches/data_utils/mod.rs
@@ -18,10 +18,11 @@
 //! This module provides the in-memory table for more realistic benchmarking.
 
 use arrow::array::{
-    builder::{Int64Builder, StringBuilder},
     ArrayRef, Float32Array, Float64Array, RecordBatch, StringArray, StringViewBuilder,
     UInt64Array,
+    builder::{Int64Builder, StringBuilder, StringDictionaryBuilder},
 };
+use arrow::datatypes::Int32Type;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
@@ -36,6 +37,7 @@ use std::sync::Arc;
 
 /// create an in-memory table given the partition len, array len, and batch size,
 /// and the result table will be of array_len in total, and then partitioned, and batched.
+#[expect(clippy::allow_attributes)] // some issue where expect(dead_code) doesn't fire properly
 #[allow(dead_code)]
 pub fn create_table_provider(
     partitions_len: usize,
@@ -44,7 +46,7 @@ pub fn create_table_provider(
 ) -> Result<Arc<MemTable>> {
     let schema = Arc::new(create_schema());
     let partitions =
-        create_record_batches(schema.clone(), array_len, partitions_len, batch_size);
+        create_record_batches(&schema, array_len, partitions_len, batch_size);
     // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
     MemTable::try_new(schema, partitions).map(Arc::new)
 }
@@ -55,21 +57,24 @@ pub fn create_schema() -> Schema {
         Field::new("utf8", DataType::Utf8, false),
         Field::new("f32", DataType::Float32, false),
         Field::new("f64", DataType::Float64, true),
-        // This field will contain integers randomly selected from a large
-        // range of values, i.e. [0, u64::MAX], such that there are none (or
-        // very few) repeated values.
-        Field::new("u64_wide", DataType::UInt64, true),
-        // This field will contain integers randomly selected from a narrow
-        // range of values such that there are a few distinct values, but they
-        // are repeated often.
+        // Integers randomly selected from a wide range of values, i.e. [0,
+        // u64::MAX], such that there are ~no repeated values.
+        Field::new("u64_wide", DataType::UInt64, false),
+        // Integers randomly selected from a mid-range of values [0, 1000),
+        // providing ~1000 distinct groups.
+        Field::new("u64_mid", DataType::UInt64, false),
+        // Integers randomly selected from a narrow range of values such that
+        // there are a few distinct values, but they are repeated often.
         Field::new("u64_narrow", DataType::UInt64, false),
+        Field::new(
+            "dict10",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+        ),
     ])
 }
 
-fn create_data(size: usize, null_density: f64) -> Vec<Option<f64>> {
-    // use random numbers to avoid spurious compiler optimizations wrt to branching
-    let mut rng = StdRng::seed_from_u64(42);
-
+fn create_data(rng: &mut StdRng, size: usize, null_density: f64) -> Vec<Option<f64>> {
     (0..size)
         .map(|_| {
             if rng.random::<f64>() > null_density {
@@ -81,56 +86,54 @@ fn create_data(size: usize, null_density: f64) -> Vec<Option<f64>> {
         .collect()
 }
 
-fn create_integer_data(size: usize, value_density: f64) -> Vec<Option<u64>> {
-    // use random numbers to avoid spurious compiler optimizations wrt to branching
-    let mut rng = StdRng::seed_from_u64(42);
-
-    (0..size)
-        .map(|_| {
-            if rng.random::<f64>() > value_density {
-                None
-            } else {
-                Some(rng.random::<u64>())
-            }
-        })
-        .collect()
-}
-
 fn create_record_batch(
     schema: SchemaRef,
     rng: &mut StdRng,
     batch_size: usize,
-    i: usize,
+    batch_index: usize,
 ) -> RecordBatch {
-    // the 4 here is the number of different keys.
-    // a higher number increase sparseness
-    let vs = [0, 1, 2, 3];
-    let keys: Vec<String> = (0..batch_size)
-        .map(
-            // use random numbers to avoid spurious compiler optimizations wrt to branching
-            |_| format!("hi{:?}", vs.choose(rng)),
-        )
-        .collect();
-    let keys: Vec<&str> = keys.iter().map(|e| &**e).collect();
+    // Randomly choose from 4 distinct key values; a higher number increases sparseness.
+    let key_suffixes = [0, 1, 2, 3];
+    let keys = StringArray::from_iter_values(
+        (0..batch_size).map(|_| format!("hi{}", key_suffixes.choose(rng).unwrap())),
+    );
 
-    let values = create_data(batch_size, 0.5);
+    let values = create_data(rng, batch_size, 0.5);
 
     // Integer values between [0, u64::MAX].
-    let integer_values_wide = create_integer_data(batch_size, 9.0);
+    let integer_values_wide = (0..batch_size)
+        .map(|_| rng.random::<u64>())
+        .collect::<Vec<_>>();
+
+    // Integer values between [0, 1000).
+    let integer_values_mid = (0..batch_size)
+        .map(|_| rng.random_range(0..1000))
+        .collect::<Vec<_>>();
 
-    // Integer values between [0, 9].
+    // Integer values between [0, 10).
     let integer_values_narrow = (0..batch_size)
-        .map(|_| rng.random_range(0_u64..10))
+        .map(|_| rng.random_range(0..10))
         .collect::<Vec<_>>();
 
+    let mut dict_builder = StringDictionaryBuilder::<Int32Type>::new();
+    for _ in 0..batch_size {
+        if rng.random::<f64>() > 0.9 {
+            dict_builder.append_null();
+        } else {
+            dict_builder.append_value(format!("market_{}", rng.random_range(0..10)));
+        }
+    }
+
     RecordBatch::try_new(
         schema,
         vec![
-            Arc::new(StringArray::from(keys)),
-            Arc::new(Float32Array::from(vec![i as f32; batch_size])),
+            Arc::new(keys),
+            Arc::new(Float32Array::from(vec![batch_index as f32; batch_size])),
             Arc::new(Float64Array::from(values)),
             Arc::new(UInt64Array::from(integer_values_wide)),
+            Arc::new(UInt64Array::from(integer_values_mid)),
             Arc::new(UInt64Array::from(integer_values_narrow)),
+            Arc::new(dict_builder.finish()),
         ],
     )
     .unwrap()
@@ -139,19 +142,28 @@ fn create_record_batch(
 /// Create record batches of `partitions_len` partitions and `batch_size` for each batch,
 /// with a total number of `array_len` records
 pub fn create_record_batches(
-    schema: SchemaRef,
+    schema: &SchemaRef,
     array_len: usize,
     partitions_len: usize,
     batch_size: usize,
 ) -> Vec<Vec<RecordBatch>> {
     let mut rng = StdRng::seed_from_u64(42);
-    (0..partitions_len)
-        .map(|_| {
-            (0..array_len / batch_size / partitions_len)
-                .map(|i| create_record_batch(schema.clone(), &mut rng, batch_size, i))
-                .collect::<Vec<_>>()
-        })
-        .collect::<Vec<_>>()
+    let mut partitions = Vec::with_capacity(partitions_len);
+    let batches_per_partition = array_len / batch_size / partitions_len;
+
+    for _ in 0..partitions_len {
+        let mut batches = Vec::with_capacity(batches_per_partition);
+        for batch_index in 0..batches_per_partition {
+            batches.push(create_record_batch(
+                schema.clone(),
+                &mut rng,
+                batch_size,
+                batch_index,
+            ));
+        }
+        partitions.push(batches);
+    }
+    partitions
 }
 
 /// An enum that wraps either a regular StringBuilder or a GenericByteViewBuilder
@@ -181,6 +193,7 @@ impl TraceIdBuilder {
 
 /// Create time series data with `partition_cnt` partitions and `sample_cnt` rows per partition
 /// in ascending order, if `asc` is true, otherwise randomly sampled using a Pareto distribution
+#[expect(clippy::allow_attributes)] // some issue where expect(dead_code) doesn't fire properly
 #[allow(dead_code)]
 pub(crate) fn make_data(
     partition_cnt: i32,
diff --git a/datafusion/core/benches/dataframe.rs b/datafusion/core/benches/dataframe.rs
index 12eb34719e4ba..5aeade315cc7b 100644
--- a/datafusion/core/benches/dataframe.rs
+++ b/datafusion/core/benches/dataframe.rs
@@ -15,17 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate arrow;
-#[macro_use]
-extern crate criterion;
-extern crate datafusion;
-
 use arrow_schema::{DataType, Field, Schema};
-use criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::datasource::MemTable;
 use datafusion::prelude::SessionContext;
 use datafusion_expr::col;
 use datafusion_functions::expr_fn::btrim;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
@@ -44,8 +40,9 @@ fn create_context(field_count: u32) -> datafusion_common::Result<Arc<SessionCont
     Ok(Arc::new(ctx))
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn run(column_count: u32, ctx: Arc<SessionContext>, rt: &Runtime) {
-    criterion::black_box(rt.block_on(async {
+    black_box(rt.block_on(async {
         let mut data_frame = ctx.table("t").await.unwrap();
 
         for i in 0..column_count {
diff --git a/datafusion/core/benches/distinct_query_sql.rs b/datafusion/core/benches/distinct_query_sql.rs
index c1ef55992689e..d389b1b3d6a22 100644
--- a/datafusion/core/benches/distinct_query_sql.rs
+++ b/datafusion/core/benches/distinct_query_sql.rs
@@ -15,27 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
-use crate::criterion::Criterion;
+
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::{create_table_provider, make_data};
 use datafusion::execution::context::SessionContext;
-use datafusion::physical_plan::{collect, ExecutionPlan};
+use datafusion::physical_plan::{ExecutionPlan, collect};
 use datafusion::{datasource::MemTable, error::Result};
-use datafusion_execution::config::SessionConfig;
 use datafusion_execution::TaskContext;
+use datafusion_execution::config::SessionConfig;
 
 use parking_lot::Mutex;
+use std::hint::black_box;
 use std::{sync::Arc, time::Duration};
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context(
@@ -123,9 +121,9 @@ async fn distinct_with_limit(
     Ok(())
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn run(rt: &Runtime, plan: Arc<dyn ExecutionPlan>, ctx: Arc<TaskContext>) {
-    criterion::black_box(rt.block_on(distinct_with_limit(plan.clone(), ctx.clone())))
-        .unwrap();
+    black_box(rt.block_on(distinct_with_limit(plan.clone(), ctx.clone()))).unwrap();
 }
 
 pub async fn create_context_sampled_data(
diff --git a/datafusion/core/benches/filter_query_sql.rs b/datafusion/core/benches/filter_query_sql.rs
index c82a1607184dc..3b80518d32dcd 100644
--- a/datafusion/core/benches/filter_query_sql.rs
+++ b/datafusion/core/benches/filter_query_sql.rs
@@ -20,17 +20,18 @@ use arrow::{
     datatypes::{DataType, Field, Schema},
     record_batch::RecordBatch,
 };
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use futures::executor::block_on;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
 async fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) {
     // execute the query
     let df = rt.block_on(ctx.sql(sql)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context(array_len: usize, batch_size: usize) -> Result<SessionContext> {
diff --git a/datafusion/core/benches/map_query_sql.rs b/datafusion/core/benches/map_query_sql.rs
index 97d47fc3b9079..67904197bc257 100644
--- a/datafusion/core/benches/map_query_sql.rs
+++ b/datafusion/core/benches/map_query_sql.rs
@@ -15,13 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashSet;
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array, RecordBatch};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use parking_lot::Mutex;
-use rand::prelude::ThreadRng;
 use rand::Rng;
+use rand::prelude::ThreadRng;
 use tokio::runtime::Runtime;
 
 use datafusion::prelude::SessionContext;
@@ -32,11 +34,12 @@ use datafusion_functions_nested::map::map;
 mod data_utils;
 
 fn build_keys(rng: &mut ThreadRng) -> Vec<String> {
-    let mut keys = vec![];
-    for _ in 0..1000 {
-        keys.push(rng.random_range(0..9999).to_string());
+    let mut keys = HashSet::with_capacity(1000);
+    while keys.len() < 1000 {
+        let key = rng.random_range(0..9999).to_string();
+        keys.insert(key);
     }
-    keys
+    keys.into_iter().collect()
 }
 
 fn build_values(rng: &mut ThreadRng) -> Vec<i32> {
@@ -71,8 +74,11 @@ fn criterion_benchmark(c: &mut Criterion) {
     let mut value_buffer = Vec::new();
 
     for i in 0..1000 {
-        key_buffer.push(Expr::Literal(ScalarValue::Utf8(Some(keys[i].clone()))));
-        value_buffer.push(Expr::Literal(ScalarValue::Int32(Some(values[i]))));
+        key_buffer.push(Expr::Literal(
+            ScalarValue::Utf8(Some(keys[i].clone())),
+            None,
+        ));
+        value_buffer.push(Expr::Literal(ScalarValue::Int32(Some(values[i])), None));
     }
     c.bench_function("map_1000_1", |b| {
         b.iter(|| {
diff --git a/datafusion/core/benches/math_query_sql.rs b/datafusion/core/benches/math_query_sql.rs
index 76824850c114c..f5df56e95a2d8 100644
--- a/datafusion/core/benches/math_query_sql.rs
+++ b/datafusion/core/benches/math_query_sql.rs
@@ -15,18 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 
 use parking_lot::Mutex;
 use std::sync::Arc;
 
 use tokio::runtime::Runtime;
 
-extern crate arrow;
-extern crate datafusion;
-
 use arrow::{
     array::{Float32Array, Float64Array},
     datatypes::{DataType, Field, Schema},
@@ -36,6 +31,7 @@ use datafusion::datasource::MemTable;
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     // execute the query
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs
index 14dcdf15f173b..f099137973592 100644
--- a/datafusion/core/benches/parquet_query_sql.rs
+++ b/datafusion/core/benches/parquet_query_sql.rs
@@ -23,14 +23,14 @@ use arrow::datatypes::{
     SchemaRef,
 };
 use arrow::record_batch::RecordBatch;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::instant::Instant;
 use futures::stream::StreamExt;
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::{WriterProperties, WriterVersion};
-use rand::distr::uniform::SampleUniform;
 use rand::distr::Alphanumeric;
+use rand::distr::uniform::SampleUniform;
 use rand::prelude::*;
 use rand::rng;
 use std::fs::File;
@@ -45,7 +45,7 @@ const NUM_BATCHES: usize = 2048;
 /// The number of rows in each record batch to write
 const WRITE_RECORD_BATCH_SIZE: usize = 1024;
 /// The number of rows in a row group
-const ROW_GROUP_SIZE: usize = 1024 * 1024;
+const ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
 /// The number of row groups expected
 const EXPECTED_ROW_GROUPS: usize = 2;
 
@@ -154,7 +154,7 @@ fn generate_file() -> NamedTempFile {
 
     let properties = WriterProperties::builder()
         .set_writer_version(WriterVersion::PARQUET_2_0)
-        .set_max_row_group_size(ROW_GROUP_SIZE)
+        .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
         .build();
 
     let mut writer =
@@ -166,11 +166,12 @@ fn generate_file() -> NamedTempFile {
     }
 
     let metadata = writer.close().unwrap();
+    let file_metadata = metadata.file_metadata();
     assert_eq!(
-        metadata.num_rows as usize,
+        file_metadata.num_rows() as usize,
         WRITE_RECORD_BATCH_SIZE * NUM_BATCHES
     );
-    assert_eq!(metadata.row_groups.len(), EXPECTED_ROW_GROUPS);
+    assert_eq!(metadata.row_groups().len(), EXPECTED_ROW_GROUPS);
 
     println!(
         "Generated parquet file in {} seconds",
diff --git a/datafusion/core/benches/parquet_struct_query.rs b/datafusion/core/benches/parquet_struct_query.rs
new file mode 100644
index 0000000000000..e7e91f0dd0e1e
--- /dev/null
+++ b/datafusion/core/benches/parquet_struct_query.rs
@@ -0,0 +1,312 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks of SQL queries on struct columns in parquet data
+
+use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray};
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::SessionContext;
+use datafusion_common::instant::Instant;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::{WriterProperties, WriterVersion};
+use rand::distr::Alphanumeric;
+use rand::prelude::*;
+use rand::rng;
+use std::hint::black_box;
+use std::ops::Range;
+use std::path::Path;
+use std::sync::Arc;
+use tempfile::NamedTempFile;
+use tokio::runtime::Runtime;
+
+/// The number of batches to write
+const NUM_BATCHES: usize = 128;
+/// The number of rows in each record batch to write
+const WRITE_RECORD_BATCH_SIZE: usize = 4096;
+/// The number of rows in a row group
+const ROW_GROUP_ROW_COUNT: usize = 65536;
+/// The number of row groups expected
+const EXPECTED_ROW_GROUPS: usize = 8;
+/// The range for random string lengths
+const STRING_LENGTH_RANGE: Range<usize> = 50..200;
+
+fn schema() -> SchemaRef {
+    let struct_fields = Fields::from(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("value", DataType::Utf8, false),
+    ]);
+    let struct_type = DataType::Struct(struct_fields);
+
+    Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("s", struct_type, false),
+    ]))
+}
+
+fn generate_strings(len: usize) -> ArrayRef {
+    let mut rng = rng();
+    Arc::new(StringArray::from_iter((0..len).map(|_| {
+        let string_len = rng.random_range(STRING_LENGTH_RANGE.clone());
+        Some(
+            (0..string_len)
+                .map(|_| char::from(rng.sample(Alphanumeric)))
+                .collect::<String>(),
+        )
+    })))
+}
+
+fn generate_batch(batch_id: usize) -> RecordBatch {
+    let schema = schema();
+    let len = WRITE_RECORD_BATCH_SIZE;
+
+    // Generate sequential IDs based on batch_id for uniqueness
+    let base_id = (batch_id * len) as i32;
+    let id_values: Vec<i32> = (0..len).map(|i| base_id + i as i32).collect();
+    let id_array = Arc::new(Int32Array::from(id_values.clone()));
+
+    // Create struct id array (matching top-level id)
+    let struct_id_array = Arc::new(Int32Array::from(id_values));
+
+    // Generate random strings for struct value field
+    let value_array = generate_strings(len);
+
+    // Construct StructArray
+    let struct_array = StructArray::from(vec![
+        (
+            Arc::new(Field::new("id", DataType::Int32, false)),
+            struct_id_array as ArrayRef,
+        ),
+        (
+            Arc::new(Field::new("value", DataType::Utf8, false)),
+            value_array,
+        ),
+    ]);
+
+    RecordBatch::try_new(schema, vec![id_array, Arc::new(struct_array)]).unwrap()
+}
+
+fn generate_file() -> NamedTempFile {
+    let now = Instant::now();
+    let mut named_file = tempfile::Builder::new()
+        .prefix("parquet_struct_query")
+        .suffix(".parquet")
+        .tempfile()
+        .unwrap();
+
+    println!("Generating parquet file - {}", named_file.path().display());
+    let schema = schema();
+
+    let properties = WriterProperties::builder()
+        .set_writer_version(WriterVersion::PARQUET_2_0)
+        .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
+        .build();
+
+    let mut writer =
+        ArrowWriter::try_new(&mut named_file, schema, Some(properties)).unwrap();
+
+    for batch_id in 0..NUM_BATCHES {
+        let batch = generate_batch(batch_id);
+        writer.write(&batch).unwrap();
+    }
+
+    let metadata = writer.close().unwrap();
+    let file_metadata = metadata.file_metadata();
+    let expected_rows = WRITE_RECORD_BATCH_SIZE * NUM_BATCHES;
+    assert_eq!(
+        file_metadata.num_rows() as usize,
+        expected_rows,
+        "Expected {} rows but got {}",
+        expected_rows,
+        file_metadata.num_rows()
+    );
+    assert_eq!(
+        metadata.row_groups().len(),
+        EXPECTED_ROW_GROUPS,
+        "Expected {} row groups but got {}",
+        EXPECTED_ROW_GROUPS,
+        metadata.row_groups().len()
+    );
+
+    println!(
+        "Generated parquet file with {} rows and {} row groups in {} seconds",
+        file_metadata.num_rows(),
+        metadata.row_groups().len(),
+        now.elapsed().as_secs_f32()
+    );
+
+    named_file
+}
+
+fn create_context(file_path: &str) -> SessionContext {
+    let ctx = SessionContext::new();
+    let rt = Runtime::new().unwrap();
+    rt.block_on(ctx.register_parquet("t", file_path, Default::default()))
+        .unwrap();
+    ctx
+}
+
+fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) {
+    let ctx = ctx.clone();
+    let sql = sql.to_string();
+    let df = rt.block_on(ctx.sql(&sql)).unwrap();
+    black_box(rt.block_on(df.collect()).unwrap());
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let (file_path, temp_file) = match std::env::var("PARQUET_FILE") {
+        Ok(file) => (file, None),
+        Err(_) => {
+            let temp_file = generate_file();
+            (temp_file.path().display().to_string(), Some(temp_file))
+        }
+    };
+
+    assert!(Path::new(&file_path).exists(), "path not found");
+    println!("Using parquet file {file_path}");
+
+    let ctx = create_context(&file_path);
+    let rt = Runtime::new().unwrap();
+
+    // Basic struct access
+    c.bench_function("struct_access", |b| {
+        b.iter(|| query(&ctx, &rt, "select id, s['id'] from t"))
+    });
+
+    // Filter queries
+    c.bench_function("filter_struct_field_eq", |b| {
+        b.iter(|| query(&ctx, &rt, "select id from t where s['id'] = 5"))
+    });
+
+    c.bench_function("filter_struct_field_with_select", |b| {
+        b.iter(|| query(&ctx, &rt, "select id, s['id'] from t where s['id'] = 5"))
+    });
+
+    c.bench_function("filter_top_level_with_struct_select", |b| {
+        b.iter(|| query(&ctx, &rt, "select s['id'] from t where id = 5"))
+    });
+
+    c.bench_function("filter_struct_string_length", |b| {
+        b.iter(|| query(&ctx, &rt, "select id from t where length(s['value']) > 100"))
+    });
+
+    c.bench_function("filter_struct_range", |b| {
+        b.iter(|| {
+            query(
+                &ctx,
+                &rt,
+                "select id from t where s['id'] > 100 and s['id'] < 200",
+            )
+        })
+    });
+
+    // Join queries (limited with WHERE id < 1000 for performance)
+    c.bench_function("join_struct_to_struct", |b| {
+        b.iter(|| query(
+            &ctx,
+            &rt,
+            "select t1.id from t t1 join t t2 on t1.s['id'] = t2.s['id'] where t1.id < 1000"
+        ))
+    });
+
+    c.bench_function("join_struct_to_toplevel", |b| {
+        b.iter(|| query(
+            &ctx,
+            &rt,
+            "select t1.id from t t1 join t t2 on t1.s['id'] = t2.id where t1.id < 1000"
+        ))
+    });
+
+    c.bench_function("join_toplevel_to_struct", |b| {
+        b.iter(|| query(
+            &ctx,
+            &rt,
+            "select t1.id from t t1 join t t2 on t1.id = t2.s['id'] where t1.id < 1000"
+        ))
+    });
+
+    c.bench_function("join_struct_to_struct_with_top_level", |b| {
+        b.iter(|| query(
+            &ctx,
+            &rt,
+            "select t1.id from t t1 join t t2 on t1.s['id'] = t2.s['id'] and t1.id = t2.id where t1.id < 1000"
+        ))
+    });
+
+    c.bench_function("join_struct_and_struct_value", |b| {
+        b.iter(|| query(
+            &ctx,
+            &rt,
+            "select t1.s['id'], t2.s['value'] from t t1 join t t2 on t1.id = t2.id where t1.id < 1000"
+        ))
+    });
+
+    // Group by queries
+    c.bench_function("group_by_struct_field", |b| {
+        b.iter(|| query(&ctx, &rt, "select s['id'] from t group by s['id']"))
+    });
+
+    c.bench_function("group_by_struct_select_toplevel", |b| {
+        b.iter(|| query(&ctx, &rt, "select max(id) from t group by s['id']"))
+    });
+
+    c.bench_function("group_by_toplevel_select_struct", |b| {
+        b.iter(|| query(&ctx, &rt, "select max(s['id']) from t group by id"))
+    });
+
+    c.bench_function("group_by_struct_with_count", |b| {
+        b.iter(|| {
+            query(
+                &ctx,
+                &rt,
+                "select s['id'], count(*) from t group by s['id']",
+            )
+        })
+    });
+
+    c.bench_function("group_by_multiple_with_count", |b| {
+        b.iter(|| {
+            query(
+                &ctx,
+                &rt,
+                "select id, s['id'], count(*) from t group by id, s['id']",
+            )
+        })
+    });
+
+    // Additional queries
+    c.bench_function("order_by_struct_limit", |b| {
+        b.iter(|| {
+            query(
+                &ctx,
+                &rt,
+                "select id, s['id'] from t order by s['id'] limit 1000",
+            )
+        })
+    });
+
+    c.bench_function("distinct_struct_field", |b| {
+        b.iter(|| query(&ctx, &rt, "select distinct s['id'] from t"))
+    });
+
+    // Temporary file must outlive the benchmarks, it is deleted when dropped
+    drop(temp_file);
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/physical_plan.rs b/datafusion/core/benches/physical_plan.rs
index 0a65c52f72def..7b66996b05929 100644
--- a/datafusion/core/benches/physical_plan.rs
+++ b/datafusion/core/benches/physical_plan.rs
@@ -15,11 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-use criterion::{BatchSize, Criterion};
-extern crate arrow;
-extern crate datafusion;
+use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
 
 use std::sync::Arc;
 
@@ -32,7 +28,7 @@ use tokio::runtime::Runtime;
 use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion::physical_plan::{
     collect,
-    expressions::{col, PhysicalSortExpr},
+    expressions::{PhysicalSortExpr, col},
 };
 use datafusion::prelude::SessionContext;
 use datafusion_datasource::memory::MemorySourceConfig;
@@ -40,6 +36,7 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 // Initialize the operator using the provided record batches and the sort key
 // as inputs. All record batches must have the same schema.
+#[expect(clippy::needless_pass_by_value)]
 fn sort_preserving_merge_operator(
     session_ctx: Arc<SessionContext>,
     rt: &Runtime,
@@ -50,11 +47,8 @@ fn sort_preserving_merge_operator(
 
     let sort = sort
         .iter()
-        .map(|name| PhysicalSortExpr {
-            expr: col(name, &schema).unwrap(),
-            options: Default::default(),
-        })
-        .collect::<LexOrdering>();
+        .map(|name| PhysicalSortExpr::new_default(col(name, &schema).unwrap()));
+    let sort = LexOrdering::new(sort).unwrap();
 
     let exec = MemorySourceConfig::try_new_exec(
         &batches.into_iter().map(|rb| vec![rb]).collect::<Vec<_>>(),
diff --git a/datafusion/core/benches/preserve_file_partitioning.rs b/datafusion/core/benches/preserve_file_partitioning.rs
new file mode 100644
index 0000000000000..9b1f59adc6823
--- /dev/null
+++ b/datafusion/core/benches/preserve_file_partitioning.rs
@@ -0,0 +1,838 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for `preserve_file_partitions` optimization.
+//!
+//! When enabled, this optimization declares Hive-partitioned tables as
+//! `Hash([partition_col])` partitioned, allowing the query optimizer to
+//! skip unnecessary repartitioning and sorting operations.
+//!
+//! When This Optimization Helps
+//! - Window functions: PARTITION BY on partition column eliminates RepartitionExec and SortExec
+//! - Aggregates with ORDER BY: GROUP BY partition column and ORDER BY eliminates post aggregate sort
+//!
+//! When This Optimization Does NOT Help
+//! - GROUP BY non-partition columns: Required Hash distribution doesn't match declared partitioning
+//! - When the number of distinct file partitioning groups < the number of CPUs available: Reduces
+//!   parallelization, thus may outweigh the pros of reduced shuffles
+//!
+//! Usage
+//! - BENCH_SIZE=small|medium|large cargo bench -p datafusion --bench preserve_file_partitions
+//! - SAVE_PLANS=1 cargo bench ...  # Save query plans to files
+
+use arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray};
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty::pretty_format_batches;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext, col};
+use datafusion_expr::SortExpr;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::WriterProperties;
+use std::fs::{self, File};
+use std::io::Write;
+use std::path::Path;
+use std::sync::Arc;
+use tempfile::TempDir;
+use tokio::runtime::Runtime;
+
+#[derive(Debug, Clone, Copy)]
+struct BenchConfig {
+    fact_partitions: usize,
+    rows_per_partition: usize,
+    target_partitions: usize,
+    measurement_time_secs: u64,
+}
+
+impl BenchConfig {
+    fn small() -> Self {
+        Self {
+            fact_partitions: 10,
+            rows_per_partition: 1_000_000,
+            target_partitions: 10,
+            measurement_time_secs: 15,
+        }
+    }
+
+    fn medium() -> Self {
+        Self {
+            fact_partitions: 30,
+            rows_per_partition: 3_000_000,
+            target_partitions: 30,
+            measurement_time_secs: 30,
+        }
+    }
+
+    fn large() -> Self {
+        Self {
+            fact_partitions: 50,
+            rows_per_partition: 6_000_000,
+            target_partitions: 50,
+            measurement_time_secs: 90,
+        }
+    }
+
+    fn from_env() -> Self {
+        match std::env::var("BENCH_SIZE").as_deref() {
+            Ok("small") | Ok("SMALL") => Self::small(),
+            Ok("medium") | Ok("MEDIUM") => Self::medium(),
+            Ok("large") | Ok("LARGE") => Self::large(),
+            _ => {
+                println!("Using SMALL dataset (set BENCH_SIZE=small|medium|large)");
+                Self::small()
+            }
+        }
+    }
+
+    fn total_rows(&self) -> usize {
+        self.fact_partitions * self.rows_per_partition
+    }
+
+    fn high_cardinality(base: &Self) -> Self {
+        Self {
+            fact_partitions: (base.fact_partitions as f64 * 2.5) as usize,
+            rows_per_partition: base.rows_per_partition / 2,
+            target_partitions: base.target_partitions,
+            measurement_time_secs: base.measurement_time_secs,
+        }
+    }
+}
+
+fn dkey_names(count: usize) -> Vec<String> {
+    (0..count)
+        .map(|i| {
+            if i < 26 {
+                ((b'A' + i as u8) as char).to_string()
+            } else {
+                format!(
+                    "{}{}",
+                    (b'A' + ((i / 26) - 1) as u8) as char,
+                    (b'A' + (i % 26) as u8) as char
+                )
+            }
+        })
+        .collect()
+}
+
+/// Hive-partitioned fact table, sorted by timestamp within each partition.
+fn generate_fact_table(
+    base_dir: &Path,
+    num_partitions: usize,
+    rows_per_partition: usize,
+) {
+    let fact_dir = base_dir.join("fact");
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(
+            "timestamp",
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            false,
+        ),
+        Field::new("value", DataType::Float64, false),
+    ]));
+
+    let props = WriterProperties::builder()
+        .set_compression(parquet::basic::Compression::SNAPPY)
+        .build();
+
+    let dkeys = dkey_names(num_partitions);
+
+    for dkey in &dkeys {
+        let part_dir = fact_dir.join(format!("f_dkey={dkey}"));
+        fs::create_dir_all(&part_dir).unwrap();
+        let file_path = part_dir.join("data.parquet");
+        let file = File::create(file_path).unwrap();
+
+        let mut writer =
+            ArrowWriter::try_new(file, schema.clone(), Some(props.clone())).unwrap();
+
+        let base_ts = 1672567200000i64; // 2023-01-01T09:00:00
+        let timestamps: Vec<i64> = (0..rows_per_partition)
+            .map(|i| base_ts + (i as i64 * 10000))
+            .collect();
+
+        let values: Vec<f64> = (0..rows_per_partition)
+            .map(|i| 50.0 + (i % 100) as f64 + ((i % 7) as f64 * 10.0))
+            .collect();
+
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(TimestampMillisecondArray::from(timestamps)) as ArrayRef,
+                Arc::new(Float64Array::from(values)),
+            ],
+        )
+        .unwrap();
+
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
+}
+
+/// Single-file dimension table for CollectLeft joins.
+fn generate_dimension_table(base_dir: &Path, num_partitions: usize) {
+    let dim_dir = base_dir.join("dimension");
+    fs::create_dir_all(&dim_dir).unwrap();
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("d_dkey", DataType::Utf8, false),
+        Field::new("env", DataType::Utf8, false),
+        Field::new("service", DataType::Utf8, false),
+        Field::new("host", DataType::Utf8, false),
+    ]));
+
+    let props = WriterProperties::builder()
+        .set_compression(parquet::basic::Compression::SNAPPY)
+        .build();
+
+    let file_path = dim_dir.join("data.parquet");
+    let file = File::create(file_path).unwrap();
+    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props)).unwrap();
+
+    let dkeys = dkey_names(num_partitions);
+    let envs = ["dev", "prod", "staging", "test"];
+    let services = ["log", "trace", "metric"];
+    let hosts = ["ma", "vim", "nano", "emacs"];
+
+    let d_dkey_vals: Vec<String> = dkeys.clone();
+    let env_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| envs[i % envs.len()].to_string())
+        .collect();
+    let service_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| services[i % services.len()].to_string())
+        .collect();
+    let host_vals: Vec<String> = dkeys
+        .iter()
+        .enumerate()
+        .map(|(i, _)| hosts[i % hosts.len()].to_string())
+        .collect();
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(d_dkey_vals)) as ArrayRef,
+            Arc::new(StringArray::from(env_vals)),
+            Arc::new(StringArray::from(service_vals)),
+            Arc::new(StringArray::from(host_vals)),
+        ],
+    )
+    .unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+}
+
+struct BenchVariant {
+    name: &'static str,
+    preserve_file_partitions: usize,
+    prefer_existing_sort: bool,
+}
+
+const BENCH_VARIANTS: [BenchVariant; 3] = [
+    BenchVariant {
+        name: "with_optimization",
+        preserve_file_partitions: 1,
+        prefer_existing_sort: false,
+    },
+    BenchVariant {
+        name: "prefer_existing_sort",
+        preserve_file_partitions: 0,
+        prefer_existing_sort: true,
+    },
+    BenchVariant {
+        name: "without_optimization",
+        preserve_file_partitions: 0,
+        prefer_existing_sort: false,
+    },
+];
+
+async fn save_plans(
+    output_file: &Path,
+    fact_path: &str,
+    dim_path: Option<&str>,
+    target_partitions: usize,
+    query: &str,
+    file_sort_order: Option<Vec<Vec<SortExpr>>>,
+) {
+    let mut file = File::create(output_file).unwrap();
+    writeln!(file, "Query: {query}\n").unwrap();
+
+    for variant in &BENCH_VARIANTS {
+        let session_config = SessionConfig::new()
+            .with_target_partitions(target_partitions)
+            .set_usize(
+                "datafusion.optimizer.preserve_file_partitions",
+                variant.preserve_file_partitions,
+            )
+            .set_bool(
+                "datafusion.optimizer.prefer_existing_sort",
+                variant.prefer_existing_sort,
+            );
+        let ctx = SessionContext::new_with_config(session_config);
+
+        let mut fact_options = ParquetReadOptions {
+            table_partition_cols: vec![("f_dkey".to_string(), DataType::Utf8)],
+            ..Default::default()
+        };
+        if let Some(ref order) = file_sort_order {
+            fact_options.file_sort_order = order.clone();
+        }
+        ctx.register_parquet("fact", fact_path, fact_options)
+            .await
+            .unwrap();
+
+        if let Some(dim) = dim_path {
+            let dim_schema = Arc::new(Schema::new(vec![
+                Field::new("d_dkey", DataType::Utf8, false),
+                Field::new("env", DataType::Utf8, false),
+                Field::new("service", DataType::Utf8, false),
+                Field::new("host", DataType::Utf8, false),
+            ]));
+            let dim_options = ParquetReadOptions {
+                schema: Some(&dim_schema),
+                ..Default::default()
+            };
+            ctx.register_parquet("dimension", dim, dim_options)
+                .await
+                .unwrap();
+        }
+
+        let df = ctx.sql(query).await.unwrap();
+        let plan = df.explain(false, false).unwrap().collect().await.unwrap();
+        writeln!(file, "=== {} ===", variant.name).unwrap();
+        writeln!(file, "{}\n", pretty_format_batches(&plan).unwrap()).unwrap();
+    }
+}
+
+#[expect(clippy::too_many_arguments)]
+fn run_benchmark(
+    c: &mut Criterion,
+    rt: &Runtime,
+    name: &str,
+    fact_path: &str,
+    dim_path: Option<&str>,
+    target_partitions: usize,
+    query: &str,
+    file_sort_order: &Option<Vec<Vec<SortExpr>>>,
+) {
+    if std::env::var("SAVE_PLANS").is_ok() {
+        let output_path = format!("{name}_plans.txt");
+        rt.block_on(save_plans(
+            Path::new(&output_path),
+            fact_path,
+            dim_path,
+            target_partitions,
+            query,
+            file_sort_order.clone(),
+        ));
+        println!("Plans saved to {output_path}");
+    }
+
+    let mut group = c.benchmark_group(name);
+
+    for variant in &BENCH_VARIANTS {
+        let fact_path_owned = fact_path.to_string();
+        let dim_path_owned = dim_path.map(|s| s.to_string());
+        let sort_order = file_sort_order.clone();
+        let query_owned = query.to_string();
+        let preserve_file_partitions = variant.preserve_file_partitions;
+        let prefer_existing_sort = variant.prefer_existing_sort;
+
+        group.bench_function(variant.name, |b| {
+            b.to_async(rt).iter(|| {
+                let fact_path = fact_path_owned.clone();
+                let dim_path = dim_path_owned.clone();
+                let sort_order = sort_order.clone();
+                let query = query_owned.clone();
+                async move {
+                    let session_config = SessionConfig::new()
+                        .with_target_partitions(target_partitions)
+                        .set_usize(
+                            "datafusion.optimizer.preserve_file_partitions",
+                            preserve_file_partitions,
+                        )
+                        .set_bool(
+                            "datafusion.optimizer.prefer_existing_sort",
+                            prefer_existing_sort,
+                        );
+                    let ctx = SessionContext::new_with_config(session_config);
+
+                    let mut fact_options = ParquetReadOptions {
+                        table_partition_cols: vec![(
+                            "f_dkey".to_string(),
+                            DataType::Utf8,
+                        )],
+                        ..Default::default()
+                    };
+                    if let Some(ref order) = sort_order {
+                        fact_options.file_sort_order = order.clone();
+                    }
+                    ctx.register_parquet("fact", &fact_path, fact_options)
+                        .await
+                        .unwrap();
+
+                    if let Some(ref dim) = dim_path {
+                        let dim_schema = Arc::new(Schema::new(vec![
+                            Field::new("d_dkey", DataType::Utf8, false),
+                            Field::new("env", DataType::Utf8, false),
+                            Field::new("service", DataType::Utf8, false),
+                            Field::new("host", DataType::Utf8, false),
+                        ]));
+                        let dim_options = ParquetReadOptions {
+                            schema: Some(&dim_schema),
+                            ..Default::default()
+                        };
+                        ctx.register_parquet("dimension", dim, dim_options)
+                            .await
+                            .unwrap();
+                    }
+
+                    let df = ctx.sql(&query).await.unwrap();
+                    df.collect().await.unwrap()
+                }
+            })
+        });
+    }
+
+    group.finish();
+}
+
+/// Aggregate on high-cardinality partitions which eliminates repartition and sort.
+///
+/// Query: SELECT f_dkey, COUNT(*), SUM(value) FROM fact GROUP BY f_dkey ORDER BY f_dkey
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                          with_optimization                                              │
+/// │                                   (preserve_file_partitions=enabled)                                    │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │ Sort Preserved                                                          │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     AggregateExec         │ No repartitioning needed                                                │
+/// │   │   (SinglePartitioned)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=Hash([f_dkey])                                             │
+/// │   │   file_groups={N groups}  │                                                                         │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                        prefer_existing_sort                                             │
+/// │                         (preserve_file_partitions=disabled, prefer_existing_sort=true)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │ Sort Preserved                                                          │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │    (FinalPartitioned)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │ Hash shuffle with order preservation                                    │
+/// │   │  Hash([f_dkey], N)        │ Uses k-way merge to maintain sort, has overhead                         │
+/// │   │  preserve_order=true      │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │        (Partial)          │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=UnknownPartitioning                                        │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                       without_optimization                                              │
+/// │                        (preserve_file_partitions=disabled, prefer_existing_sort=false)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │ FinalPartitioned                                                        │
+/// │   │    (FinalPartitioned)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │        SortExec           │ Must sort after shuffle                                                 │
+/// │   │    [f_dkey ASC]           │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │ Hash shuffle destroys ordering                                          │
+/// │   │     Hash([f_dkey], N)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │        (Partial)          │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=UnknownPartitioning                                                     │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+fn preserve_order_bench(
+    c: &mut Criterion,
+    rt: &Runtime,
+    hc_fact_path: &str,
+    target_partitions: usize,
+) {
+    let query = "SELECT f_dkey, COUNT(*) as cnt, SUM(value) as total \
+                 FROM fact \
+                 GROUP BY f_dkey \
+                 ORDER BY f_dkey";
+
+    let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]];
+
+    run_benchmark(
+        c,
+        rt,
+        "preserve_order",
+        hc_fact_path,
+        None,
+        target_partitions,
+        query,
+        &Some(file_sort_order),
+    );
+}
+
+/// Join and aggregate on partition column which demonstrates propagation through join.
+///
+/// Query: SELECT f.f_dkey, MAX(d.env), ... FROM fact f JOIN dimension d ON f.f_dkey = d.d_dkey
+///        WHERE d.service = 'log' GROUP BY f.f_dkey ORDER BY f.f_dkey
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                          with_optimization                                              │
+/// │                                   (preserve_file_partitions=enabled)                                    │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │ Hash partitioning propagates through join                               │
+/// │   │    (SinglePartitioned)    │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      HashJoinExec         │ Hash partitioning preserved on probe side                               │
+/// │   │     (CollectLeft)         │                                                                         │
+/// │   └──────────┬────────────────┘                                                                         │
+/// │              │                                                                                          │
+/// │       ┌──────┴──────┐                                                                                   │
+/// │       │             │                                                                                   │
+/// │   ┌───▼───┐    ┌────▼────────────────┐                                                                  │
+/// │   │ Dim   │    │   DataSourceExec    │  partitioning=Hash([f_dkey]), output_ordering=[f_dkey]           │
+/// │   │ Table │    │  (fact, N groups)   │                                                                  │
+/// │   └───────┘    └─────────────────────┘                                                                  │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                        prefer_existing_sort                                             │
+/// │                         (preserve_file_partitions=disabled, prefer_existing_sort=true)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │    (FinalPartitioned)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │  Hash shuffle with order preservation                                   │
+/// │   │     preserve_order=true   │  Uses k-way merge to maintain sort, has overhead                        │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │        (Partial)          │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      HashJoinExec         │                                                                         │
+/// │   │     (CollectLeft)         │                                                                         │
+/// │   └──────────┬────────────────┘                                                                         │
+/// │              │                                                                                          │
+/// │       ┌──────┴──────┐                                                                                   │
+/// │       │             │                                                                                   │
+/// │   ┌───▼───┐    ┌────▼────────────────┐                                                                  │
+/// │   │ Dim   │    │   DataSourceExec    │ partitioning=UnknownPartitioning, output_ordering=[f_dkey]       │
+/// │   │ Table │    │      (fact)         │                                                                  │
+/// │   └───────┘    └─────────────────────┘                                                                  │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                       without_optimization                                              │
+/// │                        (preserve_file_partitions=disabled, prefer_existing_sort=false)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │  SortPreservingMergeExec  │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │    (FinalPartitioned)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │        SortExec           │ Must sort after shuffle                                                 │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │ Hash shuffle destroys ordering                                          │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      AggregateExec        │                                                                         │
+/// │   │        (Partial)          │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │      HashJoinExec         │                                                                         │
+/// │   │     (CollectLeft)         │                                                                         │
+/// │   └──────────┬────────────────┘                                                                         │
+/// │              │                                                                                          │
+/// │       ┌──────┴──────┐                                                                                   │
+/// │       │             │                                                                                   │
+/// │   ┌───▼───┐    ┌────▼────────────────┐                                                                  │
+/// │   │ Dim   │    │   DataSourceExec    │ partitioning=UnknownPartitioning, output_ordering=[f_dkey]       │
+/// │   │ Table │    │      (fact)         │                                                                  │
+/// │   └───────┘    └─────────────────────┘                                                                  │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+fn preserve_order_join_bench(
+    c: &mut Criterion,
+    rt: &Runtime,
+    hc_fact_path: &str,
+    dim_path: &str,
+    target_partitions: usize,
+) {
+    let query = "SELECT f.f_dkey, MAX(d.env), MAX(d.service), COUNT(*), SUM(f.value) \
+                 FROM fact f \
+                 INNER JOIN dimension d ON f.f_dkey = d.d_dkey \
+                 WHERE d.service = 'log' \
+                 GROUP BY f.f_dkey \
+                 ORDER BY f.f_dkey";
+
+    let file_sort_order = vec![vec![col("f_dkey").sort(true, false)]];
+
+    run_benchmark(
+        c,
+        rt,
+        "preserve_order_join",
+        hc_fact_path,
+        Some(dim_path),
+        target_partitions,
+        query,
+        &Some(file_sort_order),
+    );
+}
+
+/// Window function with LIMIT which demonstrates partition and sort elimination.
+///
+/// Query: SELECT f_dkey, timestamp, value,
+///               ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+///        FROM fact LIMIT 1000
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                          with_optimization                                              │
+/// │                                   (preserve_file_partitions=enabled)                                    │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │       GlobalLimitExec     │                                                                         │
+/// │   │        (LIMIT 1000)       │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │   BoundedWindowAggExec    │ No repaartition needed                                                  │
+/// │   │  PARTITION BY f_dkey      │                                                                         │
+/// │   │  ORDER BY timestamp       │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=Hash([f_dkey]), output_ordering=[f_dkey, timestamp]        │
+/// │   │   file_groups={N groups}  │                                                                         │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                        prefer_existing_sort                                             │
+/// │                         (preserve_file_partitions=disabled, prefer_existing_sort=true)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │       GlobalLimitExec     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │   BoundedWindowAggExec    │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │ Hash shuffle with order preservation                                    │
+/// │   │  Hash([f_dkey], N)        │ Uses k-way merge to maintain sort, has overhead                         │
+/// │   │  preserve_order=true      │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=UnknownPartitioning, output_ordering=[f_dkey, timestamp]   │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+///
+/// ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+/// │                                       without_optimization                                              │
+/// │                        (preserve_file_partitions=disabled, prefer_existing_sort=false)                  │
+/// │                                                                                                         │
+/// │   ┌───────────────────────────┐                                                                         │
+/// │   │       GlobalLimitExec     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │   BoundedWindowAggExec    │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │        SortExec           │ Must sort after shuffle                                                 │
+/// │   │  [f_dkey, timestamp]      │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     RepartitionExec       │ Hash shuffle destroys ordering                                          │
+/// │   │     Hash([f_dkey], N)     │                                                                         │
+/// │   └─────────────┬─────────────┘                                                                         │
+/// │                 │                                                                                       │
+/// │   ┌─────────────▼─────────────┐                                                                         │
+/// │   │     DataSourceExec        │ partitioning=UnknownPartitioning, output_ordering=[f_dkey, timestamp]   │
+/// │   └───────────────────────────┘                                                                         │
+/// └─────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+fn preserve_order_window_bench(
+    c: &mut Criterion,
+    rt: &Runtime,
+    fact_path: &str,
+    target_partitions: usize,
+) {
+    let query = "SELECT f_dkey, timestamp, value, \
+                        ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn \
+                 FROM fact \
+                 LIMIT 1000";
+
+    let file_sort_order = vec![vec![
+        col("f_dkey").sort(true, false),
+        col("timestamp").sort(true, false),
+    ]];
+
+    run_benchmark(
+        c,
+        rt,
+        "preserve_order_window",
+        fact_path,
+        None,
+        target_partitions,
+        query,
+        &Some(file_sort_order),
+    );
+}
+
+fn benchmark_main(c: &mut Criterion) {
+    let config = BenchConfig::from_env();
+    let hc_config = BenchConfig::high_cardinality(&config);
+
+    println!("\n=== Preserve File Partitioning Benchmark ===");
+    println!(
+        "Normal config: {} partitions × {} rows = {} total rows",
+        config.fact_partitions,
+        config.rows_per_partition,
+        config.total_rows()
+    );
+    println!(
+        "High-cardinality config: {} partitions × {} rows = {} total rows",
+        hc_config.fact_partitions,
+        hc_config.rows_per_partition,
+        hc_config.total_rows()
+    );
+    println!("Target partitions: {}\n", config.target_partitions);
+
+    let tmp_dir = TempDir::new().unwrap();
+    println!("Generating data...");
+
+    // High-cardinality fact table
+    generate_fact_table(
+        tmp_dir.path(),
+        hc_config.fact_partitions,
+        hc_config.rows_per_partition,
+    );
+    let hc_fact_dir = tmp_dir.path().join("fact_hc");
+    fs::rename(tmp_dir.path().join("fact"), &hc_fact_dir).unwrap();
+    let hc_fact_path = hc_fact_dir.to_str().unwrap().to_string();
+
+    // Normal fact table
+    generate_fact_table(
+        tmp_dir.path(),
+        config.fact_partitions,
+        config.rows_per_partition,
+    );
+    let fact_path = tmp_dir.path().join("fact").to_str().unwrap().to_string();
+
+    // Dimension table (for join)
+    generate_dimension_table(tmp_dir.path(), hc_config.fact_partitions);
+    let dim_path = tmp_dir
+        .path()
+        .join("dimension")
+        .to_str()
+        .unwrap()
+        .to_string();
+
+    println!("Done.\n");
+
+    let rt = Runtime::new().unwrap();
+
+    preserve_order_bench(c, &rt, &hc_fact_path, hc_config.target_partitions);
+    preserve_order_join_bench(
+        c,
+        &rt,
+        &hc_fact_path,
+        &dim_path,
+        hc_config.target_partitions,
+    );
+    preserve_order_window_bench(c, &rt, &fact_path, config.target_partitions);
+}
+
+criterion_group! {
+    name = benches;
+    config = {
+        let config = BenchConfig::from_env();
+        Criterion::default()
+            .measurement_time(std::time::Duration::from_secs(config.measurement_time_secs))
+            .sample_size(10)
+    };
+    targets = benchmark_main
+}
+criterion_main!(benches);
diff --git a/datafusion/core/benches/push_down_filter.rs b/datafusion/core/benches/push_down_filter.rs
index 139fb12c30947..d41085907dbc8 100644
--- a/datafusion/core/benches/push_down_filter.rs
+++ b/datafusion/core/benches/push_down_filter.rs
@@ -18,16 +18,16 @@
 use arrow::array::RecordBatch;
 use arrow::datatypes::{DataType, Field, Schema};
 use bytes::{BufMut, BytesMut};
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::config::ConfigOptions;
 use datafusion::prelude::{ParquetReadOptions, SessionContext};
 use datafusion_execution::object_store::ObjectStoreUrl;
-use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
 use datafusion_physical_plan::ExecutionPlan;
 use object_store::memory::InMemory;
 use object_store::path::Path;
-use object_store::ObjectStore;
+use object_store::{ObjectStore, ObjectStoreExt};
 use parquet::arrow::ArrowWriter;
 use std::sync::Arc;
 
diff --git a/datafusion/core/benches/range_and_generate_series.rs b/datafusion/core/benches/range_and_generate_series.rs
new file mode 100644
index 0000000000000..10d560df0813e
--- /dev/null
+++ b/datafusion/core/benches/range_and_generate_series.rs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod data_utils;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::execution::context::SessionContext;
+use parking_lot::Mutex;
+use std::hint::black_box;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+#[expect(clippy::needless_pass_by_value)]
+fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
+    let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
+    black_box(rt.block_on(df.collect()).unwrap());
+}
+
+fn create_context() -> Arc<Mutex<SessionContext>> {
+    let ctx = SessionContext::new();
+    Arc::new(Mutex::new(ctx))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let ctx = create_context();
+    let rt = Runtime::new().unwrap();
+
+    c.bench_function("range(1000000)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(1000000)"))
+    });
+
+    c.bench_function("generate_series(1000000)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(1000000)",
+            )
+        })
+    });
+
+    c.bench_function("range(0, 1000000, 5)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(0, 1000000, 5)"))
+    });
+
+    c.bench_function("generate_series(0, 1000000, 5)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(0, 1000000, 5)",
+            )
+        })
+    });
+
+    c.bench_function("range(1000000, 0, -5)", |b| {
+        b.iter(|| query(ctx.clone(), &rt, "SELECT value from range(1000000, 0, -5)"))
+    });
+
+    c.bench_function("generate_series(1000000, 0, -5)", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                "SELECT value from generate_series(1000000, 0, -5)",
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/reset_plan_states.rs b/datafusion/core/benches/reset_plan_states.rs
new file mode 100644
index 0000000000000..5afae7f43242d
--- /dev/null
+++ b/datafusion/core/benches/reset_plan_states.rs
@@ -0,0 +1,200 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::{Arc, LazyLock};
+
+use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion::prelude::SessionContext;
+use datafusion_catalog::MemTable;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::displayable;
+use datafusion_physical_plan::execution_plan::reset_plan_states;
+use tokio::runtime::Runtime;
+
+const NUM_FIELDS: usize = 1000;
+const PREDICATE_LEN: usize = 50;
+
+static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+    Arc::new(Schema::new(
+        (0..NUM_FIELDS)
+            .map(|i| Arc::new(Field::new(format!("x_{i}"), DataType::Int64, false)))
+            .collect::<Fields>(),
+    ))
+});
+
+fn col_name(i: usize) -> String {
+    format!("x_{i}")
+}
+
+fn aggr_name(i: usize) -> String {
+    format!("aggr_{i}")
+}
+
+fn physical_plan(
+    ctx: &SessionContext,
+    rt: &Runtime,
+    sql: &str,
+) -> Arc<dyn ExecutionPlan> {
+    rt.block_on(async {
+        ctx.sql(sql)
+            .await
+            .unwrap()
+            .create_physical_plan()
+            .await
+            .unwrap()
+    })
+}
+
+fn predicate(col_name: impl Fn(usize) -> String, len: usize) -> String {
+    let mut predicate = String::new();
+    for i in 0..len {
+        if i > 0 {
+            predicate.push_str(" AND ");
+        }
+        predicate.push_str(&col_name(i));
+        predicate.push_str(" = ");
+        predicate.push_str(&i.to_string());
+    }
+    predicate
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT aggr1(col1) as aggr1, aggr2(col2) as aggr2 FROM t
+/// WHERE p1
+/// HAVING p2
+/// ```
+///
+/// Where `p1` and `p2` some long predicates.
+///
+fn query1() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+    for i in 0..NUM_FIELDS {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        query.push_str("AVG(");
+        query.push_str(&col_name(i));
+        query.push_str(") AS ");
+        query.push_str(&aggr_name(i));
+    }
+    query.push_str(" FROM t WHERE ");
+    query.push_str(&predicate(col_name, PREDICATE_LEN));
+    query.push_str(" HAVING ");
+    query.push_str(&predicate(aggr_name, PREDICATE_LEN));
+    query
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT projection FROM t JOIN v ON t.a = v.a
+/// WHERE p1
+/// ```
+///
+fn query2() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+    for i in (0..NUM_FIELDS).step_by(2) {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        if (i / 2) % 2 == 0 {
+            query.push_str(&format!("t.{}", col_name(i)));
+        } else {
+            query.push_str(&format!("v.{}", col_name(i)));
+        }
+    }
+    query.push_str(" FROM t JOIN v ON t.x_0 = v.x_0 WHERE ");
+
+    fn qualified_name(i: usize) -> String {
+        format!("t.{}", col_name(i))
+    }
+
+    query.push_str(&predicate(qualified_name, PREDICATE_LEN));
+    query
+}
+
+/// Returns a typical plan for the query like:
+///
+/// ```sql
+/// SELECT projection FROM t
+/// WHERE p
+/// ```
+///
+fn query3() -> String {
+    let mut query = String::new();
+    query.push_str("SELECT ");
+
+    // Create non-trivial projection.
+    for i in 0..NUM_FIELDS / 2 {
+        if i > 0 {
+            query.push_str(", ");
+        }
+        query.push_str(&col_name(i * 2));
+        query.push_str(" + ");
+        query.push_str(&col_name(i * 2 + 1));
+    }
+
+    query.push_str(" FROM t WHERE ");
+    query.push_str(&predicate(col_name, PREDICATE_LEN));
+    query
+}
+
+fn run_reset_states(b: &mut criterion::Bencher, plan: &Arc<dyn ExecutionPlan>) {
+    b.iter(|| std::hint::black_box(reset_plan_states(Arc::clone(plan)).unwrap()));
+}
+
+/// Benchmark is intended to measure overhead of actions, required to perform
+/// making an independent instance of the execution plan to re-execute it, avoiding
+/// re-planning stage.
+fn bench_reset_plan_states(c: &mut Criterion) {
+    env_logger::init();
+
+    let rt = Runtime::new().unwrap();
+    let ctx = SessionContext::new();
+    ctx.register_table(
+        "t",
+        Arc::new(MemTable::try_new(Arc::clone(&SCHEMA), vec![vec![], vec![]]).unwrap()),
+    )
+    .unwrap();
+
+    ctx.register_table(
+        "v",
+        Arc::new(MemTable::try_new(Arc::clone(&SCHEMA), vec![vec![], vec![]]).unwrap()),
+    )
+    .unwrap();
+
+    macro_rules! bench_query {
+        ($query_producer: expr) => {{
+            let sql = $query_producer();
+            let plan = physical_plan(&ctx, &rt, &sql);
+            log::debug!("plan:\n{}", displayable(plan.as_ref()).indent(true));
+            move |b| run_reset_states(b, &plan)
+        }};
+    }
+
+    c.bench_function("query1", bench_query!(query1));
+    c.bench_function("query2", bench_query!(query2));
+    c.bench_function("query3", bench_query!(query3));
+}
+
+criterion_group!(benches, bench_reset_plan_states);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/scalar.rs b/datafusion/core/benches/scalar.rs
index 540f7212e96e9..d06ed3f28b743 100644
--- a/datafusion/core/benches/scalar.rs
+++ b/datafusion/core/benches/scalar.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::scalar::ScalarValue;
 
 fn criterion_benchmark(c: &mut Criterion) {
diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs
index e1bc478b36f0a..4ba57a1530e81 100644
--- a/datafusion/core/benches/sort.rs
+++ b/datafusion/core/benches/sort.rs
@@ -71,7 +71,6 @@ use std::sync::Arc;
 use arrow::array::StringViewArray;
 use arrow::{
     array::{DictionaryArray, Float64Array, Int64Array, StringArray},
-    compute::SortOptions,
     datatypes::{Int32Type, Schema},
     record_batch::RecordBatch,
 };
@@ -79,18 +78,18 @@ use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::{
     execution::context::TaskContext,
     physical_plan::{
+        ExecutionPlan, ExecutionPlanProperties,
         coalesce_partitions::CoalescePartitionsExec,
-        sorts::sort_preserving_merge::SortPreservingMergeExec, ExecutionPlan,
-        ExecutionPlanProperties,
+        sorts::sort_preserving_merge::SortPreservingMergeExec,
     },
     prelude::SessionContext,
 };
 use datafusion_datasource::memory::MemorySourceConfig;
-use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
+use datafusion_physical_expr::{PhysicalSortExpr, expressions::col};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 /// Benchmarks for SortPreservingMerge stream
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use futures::StreamExt;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
@@ -272,14 +271,11 @@ impl BenchCase {
 
 /// Make sort exprs for each column in `schema`
 fn make_sort_exprs(schema: &Schema) -> LexOrdering {
-    schema
+    let sort_exprs = schema
         .fields()
         .iter()
-        .map(|f| PhysicalSortExpr {
-            expr: col(f.name(), schema).unwrap(),
-            options: SortOptions::default(),
-        })
-        .collect()
+        .map(|f| PhysicalSortExpr::new_default(col(f.name(), schema).unwrap()));
+    LexOrdering::new(sort_exprs).unwrap()
 }
 
 /// Create streams of int64 (where approximately 1/3 values is repeated)
@@ -359,14 +355,14 @@ fn utf8_high_cardinality_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (utf8_low, utf8_low, utf8_high)
 fn utf8_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
+    let mut data_gen = DataGenerator::new();
 
     // need to sort by the combined key, so combine them together
-    let mut tuples: Vec<_> = gen
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_high_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_high_cardinality_values())
         .collect();
 
     if sorted {
@@ -392,14 +388,14 @@ fn utf8_tuple_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (utf8_view_low, utf8_view_low, utf8_view_high)
 fn utf8_view_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
+    let mut data_gen = DataGenerator::new();
 
     // need to sort by the combined key, so combine them together
-    let mut tuples: Vec<_> = gen
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_high_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_high_cardinality_values())
         .collect();
 
     if sorted {
@@ -425,15 +421,15 @@ fn utf8_view_tuple_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (f64, utf8_low, utf8_low, i64)
 fn mixed_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
+    let mut data_gen = DataGenerator::new();
 
     // need to sort by the combined key, so combine them together
-    let mut tuples: Vec<_> = gen
+    let mut tuples: Vec<_> = data_gen
         .i64_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.i64_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.i64_values())
         .collect();
 
     if sorted {
@@ -463,15 +459,15 @@ fn mixed_tuple_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (f64, utf8_view_low, utf8_view_low, i64)
 fn mixed_tuple_with_utf8_view_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
+    let mut data_gen = DataGenerator::new();
 
     // need to sort by the combined key, so combine them together
-    let mut tuples: Vec<_> = gen
+    let mut tuples: Vec<_> = data_gen
         .i64_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.i64_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.i64_values())
         .collect();
 
     if sorted {
@@ -501,8 +497,8 @@ fn mixed_tuple_with_utf8_view_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (utf8_dict)
 fn dictionary_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
-    let mut values = gen.utf8_low_cardinality_values();
+    let mut data_gen = DataGenerator::new();
+    let mut values = data_gen.utf8_low_cardinality_values();
     if sorted {
         values.sort_unstable();
     }
@@ -516,12 +512,12 @@ fn dictionary_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (utf8_dict, utf8_dict, utf8_dict)
 fn dictionary_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
-    let mut tuples: Vec<_> = gen
+    let mut data_gen = DataGenerator::new();
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
         .collect();
 
     if sorted {
@@ -547,13 +543,13 @@ fn dictionary_tuple_streams(sorted: bool) -> PartitionedBatches {
 
 /// Create a batch of (utf8_dict, utf8_dict, utf8_dict, i64)
 fn mixed_dictionary_tuple_streams(sorted: bool) -> PartitionedBatches {
-    let mut gen = DataGenerator::new();
-    let mut tuples: Vec<_> = gen
+    let mut data_gen = DataGenerator::new();
+    let mut tuples: Vec<_> = data_gen
         .utf8_low_cardinality_values()
         .into_iter()
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.utf8_low_cardinality_values())
-        .zip(gen.i64_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.utf8_low_cardinality_values())
+        .zip(data_gen.i64_values())
         .collect();
 
     if sorted {
diff --git a/datafusion/core/benches/sort_limit_query_sql.rs b/datafusion/core/benches/sort_limit_query_sql.rs
index e535a018161f1..54cd9a0bcd547 100644
--- a/datafusion/core/benches/sort_limit_query_sql.rs
+++ b/datafusion/core/benches/sort_limit_query_sql.rs
@@ -15,9 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::datasource::file_format::csv::CsvFormat;
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
@@ -27,9 +25,6 @@ use datafusion::prelude::SessionConfig;
 use parking_lot::Mutex;
 use std::sync::Arc;
 
-extern crate arrow;
-extern crate datafusion;
-
 use arrow::datatypes::{DataType, Field, Schema};
 
 use datafusion::datasource::MemTable;
@@ -37,6 +32,7 @@ use datafusion::execution::context::SessionContext;
 
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     // execute the query
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
@@ -97,8 +93,7 @@ fn create_context() -> Arc<Mutex<SessionContext>> {
         ctx_holder.lock().push(Arc::new(Mutex::new(ctx)))
     });
 
-    let ctx = ctx_holder.lock().first().unwrap().clone();
-    ctx
+    ctx_holder.lock().first().unwrap().clone()
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
diff --git a/datafusion/core/benches/spm.rs b/datafusion/core/benches/spm.rs
index 8613525cb248d..afd384f7b170e 100644
--- a/datafusion/core/benches/spm.rs
+++ b/datafusion/core/benches/spm.rs
@@ -15,18 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr::expressions::col;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
-use datafusion_physical_plan::{collect, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, collect};
 
 use criterion::async_executor::FuturesExecutor;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_datasource::memory::MemorySourceConfig;
 
 fn generate_spm_for_round_robin_tie_breaker(
@@ -66,11 +66,10 @@ fn generate_spm_for_round_robin_tie_breaker(
         RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap()
     };
 
-    let rbs = (0..batch_count).map(|_| rb.clone()).collect::<Vec<_>>();
-    let partitiones = vec![rbs.clone(); partition_count];
-
     let schema = rb.schema();
-    let sort = LexOrdering::new(vec![
+    let rbs = std::iter::repeat_n(rb, batch_count).collect::<Vec<_>>();
+    let partitions = vec![rbs.clone(); partition_count];
+    let sort = [
         PhysicalSortExpr {
             expr: col("b", &schema).unwrap(),
             options: Default::default(),
@@ -79,9 +78,10 @@ fn generate_spm_for_round_robin_tie_breaker(
             expr: col("c", &schema).unwrap(),
             options: Default::default(),
         },
-    ]);
+    ]
+    .into();
 
-    let exec = MemorySourceConfig::try_new_exec(&partitiones, schema, None).unwrap();
+    let exec = MemorySourceConfig::try_new_exec(&partitions, schema, None).unwrap();
     SortPreservingMergeExec::new(sort, exec)
         .with_round_robin_repartition(enable_round_robin_repartition)
 }
diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs
index 6dc953f56b435..59502da987904 100644
--- a/datafusion/core/benches/sql_planner.rs
+++ b/datafusion/core/benches/sql_planner.rs
@@ -15,29 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate arrow;
-#[macro_use]
-extern crate criterion;
-extern crate datafusion;
-
 mod data_utils;
 
-use crate::criterion::Criterion;
+use arrow::array::PrimitiveArray;
 use arrow::array::{ArrayRef, RecordBatch};
+use arrow::datatypes::ArrowNativeTypeOp;
+use arrow::datatypes::ArrowPrimitiveType;
 use arrow::datatypes::{DataType, Field, Fields, Schema};
 use criterion::Bencher;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::datasource::MemTable;
 use datafusion::execution::context::SessionContext;
-use datafusion_common::ScalarValue;
+use datafusion_common::{ScalarValue, config::Dialect};
 use datafusion_expr::col;
-use itertools::Itertools;
-use std::fs::File;
-use std::io::{BufRead, BufReader};
+use rand_distr::num_traits::NumCast;
+use std::hint::black_box;
 use std::path::PathBuf;
 use std::sync::Arc;
+use test_utils::TableDef;
 use test_utils::tpcds::tpcds_schemas;
 use test_utils::tpch::tpch_schemas;
-use test_utils::TableDef;
 use tokio::runtime::Runtime;
 
 const BENCHMARKS_PATH_1: &str = "../../benchmarks/";
@@ -46,12 +43,12 @@ const CLICKBENCH_DATA_PATH: &str = "data/hits_partitioned/";
 
 /// Create a logical plan from the specified sql
 fn logical_plan(ctx: &SessionContext, rt: &Runtime, sql: &str) {
-    criterion::black_box(rt.block_on(ctx.sql(sql)).unwrap());
+    black_box(rt.block_on(ctx.sql(sql)).unwrap());
 }
 
 /// Create a physical ExecutionPlan (by way of logical plan)
 fn physical_plan(ctx: &SessionContext, rt: &Runtime, sql: &str) {
-    criterion::black_box(rt.block_on(async {
+    black_box(rt.block_on(async {
         ctx.sql(sql)
             .await
             .unwrap()
@@ -76,6 +73,21 @@ fn create_table_provider(column_prefix: &str, num_columns: usize) -> Arc<MemTabl
         .unwrap()
 }
 
+/// Create a table provider with a struct column: `id` (Int32) and `props` (Struct { value: Int32, label: Utf8 })
+fn create_struct_table_provider() -> Arc<MemTable> {
+    let struct_fields = Fields::from(vec![
+        Field::new("value", DataType::Int32, true),
+        Field::new("label", DataType::Utf8, true),
+    ]);
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, true),
+        Field::new("props", DataType::Struct(struct_fields), true),
+    ]));
+    MemTable::try_new(schema, vec![vec![]])
+        .map(Arc::new)
+        .unwrap()
+}
+
 fn create_context() -> SessionContext {
     let ctx = SessionContext::new();
     ctx.register_table("t1", create_table_provider("a", 200))
@@ -86,11 +98,16 @@ fn create_context() -> SessionContext {
         .unwrap();
     ctx.register_table("t1000", create_table_provider("d", 1000))
         .unwrap();
+    ctx.register_table("struct_t1", create_struct_table_provider())
+        .unwrap();
+    ctx.register_table("struct_t2", create_struct_table_provider())
+        .unwrap();
     ctx
 }
 
 /// Register the table definitions as a MemTable with the context and return the
 /// context
+#[expect(clippy::needless_pass_by_value)]
 fn register_defs(ctx: SessionContext, defs: Vec<TableDef>) -> SessionContext {
     defs.iter().for_each(|TableDef { name, schema }| {
         ctx.register_table(
@@ -115,6 +132,11 @@ fn register_clickbench_hits_table(rt: &Runtime) -> SessionContext {
 
     let sql = format!("CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '{path}'");
 
+    // ClickBench partitioned dataset was written by an ancient version of pyarrow that
+    // that wrote strings with the wrong logical type. To read it correctly, we must
+    // automatically convert binary to string.
+    rt.block_on(ctx.sql("SET datafusion.execution.parquet.binary_as_string  = true;"))
+        .unwrap();
     rt.block_on(ctx.sql(&sql)).unwrap();
 
     let count =
@@ -140,12 +162,15 @@ fn benchmark_with_param_values_many_columns(
     }
     // SELECT max(attr0), ..., max(attrN) FROM t1.
     let query = format!("SELECT {aggregates} FROM t1");
-    let statement = ctx.state().sql_to_statement(&query, "Generic").unwrap();
+    let statement = ctx
+        .state()
+        .sql_to_statement(&query, &Dialect::Generic)
+        .unwrap();
     let plan =
         rt.block_on(async { ctx.state().statement_to_plan(statement).await.unwrap() });
     b.iter(|| {
         let plan = plan.clone();
-        criterion::black_box(plan.with_param_values(vec![ScalarValue::from(1)]).unwrap());
+        black_box(plan.with_param_values(vec![ScalarValue::from(1)]).unwrap());
     });
 }
 
@@ -154,18 +179,30 @@ fn benchmark_with_param_values_many_columns(
 /// 0,100...9900
 /// 0,200...19800
 /// 0,300...29700
-fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) {
-    // ("c0", [0, 0, ...])
-    // ("c1": [100, 200, ...])
-    // etc
-    let iter = (0..num_columns).map(|i| i as u64).map(|i| {
-        let array: ArrayRef = Arc::new(arrow::array::UInt64Array::from_iter_values(
-            (0..num_rows)
-                .map(|j| j as u64 * 100 + i)
-                .collect::<Vec<_>>(),
-        ));
+fn register_union_order_table_generic<T>(
+    ctx: &SessionContext,
+    num_columns: usize,
+    num_rows: usize,
+) where
+    T: ArrowPrimitiveType,
+    T::Native: ArrowNativeTypeOp + NumCast,
+{
+    let iter = (0..num_columns).map(|i| {
+        let array_data: Vec<T::Native> = (0..num_rows)
+            .map(|j| {
+                let value = (j as u64) * 100 + (i as u64);
+                <T::Native as NumCast>::from(value).unwrap_or_else(|| {
+                    panic!("Failed to cast numeric value to Native type")
+                })
+            })
+            .collect();
+
+        // Use PrimitiveArray which is generic over the ArrowPrimitiveType T
+        let array: ArrayRef = Arc::new(PrimitiveArray::<T>::from_iter_values(array_data));
+
         (format!("c{i}"), array)
     });
+
     let batch = RecordBatch::try_from_iter(iter).unwrap();
     let schema = batch.schema();
     let partitions = vec![vec![batch]];
@@ -182,14 +219,13 @@ fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows
 
     ctx.register_table("t", Arc::new(table)).unwrap();
 }
-
 /// return a query like
 /// ```sql
-/// select c1, null as c2, ... null as cn from t ORDER BY c1
+/// select c1, 2 as c2, ... n as cn from t ORDER BY c1
 ///   UNION ALL
-/// select null as c1, c2, ... null as cn from t ORDER BY c2
+/// select 1 as c1, c2, ... n as cn from t ORDER BY c2
 /// ...
-/// select null as c1, null as c2, ... cn from t ORDER BY cn
+/// select 1 as c1, 2 as c2, ... cn from t ORDER BY cn
 ///  ORDER BY c1, c2 ... CN
 /// ```
 fn union_orderby_query(n: usize) -> String {
@@ -203,7 +239,7 @@ fn union_orderby_query(n: usize) -> String {
                 if i == j {
                     format!("c{j}")
                 } else {
-                    format!("null as c{j}")
+                    format!("{j} as c{j}")
                 }
             })
             .collect::<Vec<_>>()
@@ -225,8 +261,10 @@ fn criterion_benchmark(c: &mut Criterion) {
     if !PathBuf::from(format!("{BENCHMARKS_PATH_1}{CLICKBENCH_DATA_PATH}")).exists()
         && !PathBuf::from(format!("{BENCHMARKS_PATH_2}{CLICKBENCH_DATA_PATH}")).exists()
     {
-        panic!("benchmarks/data/hits_partitioned/ could not be loaded. Please run \
-         'benchmarks/bench.sh data clickbench_partitioned' prior to running this benchmark")
+        panic!(
+            "benchmarks/data/hits_partitioned/ could not be loaded. Please run \
+         'benchmarks/bench.sh data clickbench_partitioned' prior to running this benchmark"
+        )
     }
 
     let ctx = create_context();
@@ -301,6 +339,34 @@ fn criterion_benchmark(c: &mut Criterion) {
         });
     });
 
+    // It was observed in production that queries with window functions sometimes partition over more than 30 columns
+    for partitioning_columns in [4, 7, 8, 12, 30] {
+        c.bench_function(
+            &format!(
+                "physical_window_function_partition_by_{partitioning_columns}_on_values"
+            ),
+            |b| {
+                let source = format!(
+                    "SELECT 1 AS n{}",
+                    (0..partitioning_columns)
+                        .map(|i| format!(", {i} AS c{i}"))
+                        .collect::<String>()
+                );
+                let window = format!(
+                    "SUM(n) OVER (PARTITION BY {}) AS sum_n",
+                    (0..partitioning_columns)
+                        .map(|i| format!("c{i}"))
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                );
+                let query = format!("SELECT {window} FROM ({source})");
+                b.iter(|| {
+                    physical_plan(&ctx, &rt, &query);
+                });
+            },
+        );
+    }
+
     // Benchmark for Physical Planning Joins
     c.bench_function("physical_join_consider_sort", |b| {
         b.iter(|| {
@@ -372,16 +438,70 @@ fn criterion_benchmark(c: &mut Criterion) {
         });
     });
 
-    // -- Sorted Queries --
-    register_union_order_table(&ctx, 100, 1000);
-
-    // this query has many expressions in its sort order so stresses
-    // order equivalence validation
-    c.bench_function("physical_sorted_union_orderby", |b| {
-        // SELECT ... UNION ALL ...
-        let query = union_orderby_query(20);
-        b.iter(|| physical_plan(&ctx, &rt, &query))
+    let struct_agg_sort_query = "SELECT \
+         struct_t1.props['label'], \
+         SUM(struct_t1.props['value']), \
+         MAX(struct_t2.props['value']), \
+         COUNT(*) \
+     FROM struct_t1 \
+     JOIN struct_t2 ON struct_t1.id = struct_t2.id \
+     WHERE struct_t1.props['value'] > 50 \
+     GROUP BY struct_t1.props['label'] \
+     ORDER BY SUM(struct_t1.props['value']) DESC";
+
+    // -- Struct column benchmarks --
+    c.bench_function("logical_plan_struct_join_agg_sort", |b| {
+        b.iter(|| logical_plan(&ctx, &rt, struct_agg_sort_query))
     });
+    c.bench_function("physical_plan_struct_join_agg_sort", |b| {
+        b.iter(|| physical_plan(&ctx, &rt, struct_agg_sort_query))
+    });
+
+    // -- Sorted Queries --
+    // 100, 200 && 300 is taking too long - https://github.com/apache/datafusion/issues/18366
+    // Logical Plan for datatype Int64 and UInt64 differs, UInt64 Logical Plan's Union are wrapped
+    // up in Projection, and EliminateNestedUnion OptimezerRule is not applied leading to significantly
+    // longer execution time.
+    // https://github.com/apache/datafusion/issues/17261
+
+    for column_count in [10, 50 /* 100, 200, 300 */] {
+        register_union_order_table_generic::<arrow::datatypes::Int64Type>(
+            &ctx,
+            column_count,
+            1000,
+        );
+
+        // this query has many expressions in its sort order so stresses
+        // order equivalence validation
+        c.bench_function(
+            &format!("physical_sorted_union_order_by_{column_count}_int64"),
+            |b| {
+                // SELECT ... UNION ALL ...
+                let query = union_orderby_query(column_count);
+                b.iter(|| physical_plan(&ctx, &rt, &query))
+            },
+        );
+
+        let _ = ctx.deregister_table("t");
+    }
+
+    for column_count in [10, 50 /* 100, 200, 300 */] {
+        register_union_order_table_generic::<arrow::datatypes::UInt64Type>(
+            &ctx,
+            column_count,
+            1000,
+        );
+        c.bench_function(
+            &format!("physical_sorted_union_order_by_{column_count}_uint64"),
+            |b| {
+                // SELECT ... UNION ALL ...
+                let query = union_orderby_query(column_count);
+                b.iter(|| physical_plan(&ctx, &rt, &query))
+            },
+        );
+
+        let _ = ctx.deregister_table("t");
+    }
 
     // --- TPC-H ---
 
@@ -466,17 +586,20 @@ fn criterion_benchmark(c: &mut Criterion) {
     // });
 
     // -- clickbench --
-
-    let queries_file =
-        File::open(format!("{benchmarks_path}queries/clickbench/queries.sql")).unwrap();
-    let extended_file =
-        File::open(format!("{benchmarks_path}queries/clickbench/extended.sql")).unwrap();
-
-    let clickbench_queries: Vec<String> = BufReader::new(queries_file)
-        .lines()
-        .chain(BufReader::new(extended_file).lines())
-        .map(|l| l.expect("Could not parse line"))
-        .collect_vec();
+    let clickbench_queries = (0..=42)
+        .map(|q| {
+            std::fs::read_to_string(format!(
+                "{benchmarks_path}queries/clickbench/queries/q{q}.sql"
+            ))
+            .unwrap()
+        })
+        .chain((0..=7).map(|q| {
+            std::fs::read_to_string(format!(
+                "{benchmarks_path}queries/clickbench/extended/q{q}.sql"
+            ))
+            .unwrap()
+        }))
+        .collect::<Vec<_>>();
 
     let clickbench_ctx = register_clickbench_hits_table(&rt);
 
diff --git a/datafusion/core/benches/sql_planner_extended.rs b/datafusion/core/benches/sql_planner_extended.rs
new file mode 100644
index 0000000000000..d4955313c79c3
--- /dev/null
+++ b/datafusion/core/benches/sql_planner_extended.rs
@@ -0,0 +1,466 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, RecordBatch};
+use arrow_schema::DataType;
+use arrow_schema::TimeUnit::Nanosecond;
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion::prelude::{DataFrame, SessionContext};
+use datafusion_catalog::MemTable;
+use datafusion_common::ScalarValue;
+use datafusion_expr::Expr::Literal;
+use datafusion_expr::{cast, col, lit, not, try_cast, when};
+use datafusion_functions::expr_fn::{
+    btrim, length, regexp_like, regexp_replace, to_timestamp, upper,
+};
+use std::fmt::Write;
+use std::hint::black_box;
+use std::ops::Rem;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+// This benchmark suite is designed to test the performance of
+// logical planning with a large plan containing unions, many columns
+// with a variety of operations in it.
+//
+// Since it is (currently) very slow to execute it has been separated
+// out from the sql_planner benchmark suite to this file.
+//
+// See https://github.com/apache/datafusion/issues/17261 for details.
+
+/// Registers a table like this:
+/// c0,c1,c2...,c99
+/// "0","100"..."9900"
+/// "0","200"..."19800"
+/// "0","300"..."29700"
+fn register_string_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) {
+    // ("c0", ["0", "0", ...])
+    // ("c1": ["100", "200", ...])
+    // etc
+    let iter = (0..num_columns).map(|i| i as u64).map(|i| {
+        let array: ArrayRef = Arc::new(arrow::array::StringViewArray::from_iter_values(
+            (0..num_rows)
+                .map(|j| format!("c{}", j as u64 * 100 + i))
+                .collect::<Vec<_>>(),
+        ));
+        (format!("c{i}"), array)
+    });
+    let batch = RecordBatch::try_from_iter(iter).unwrap();
+    let schema = batch.schema();
+    let partitions = vec![vec![batch]];
+
+    // create the table
+    let table = MemTable::try_new(schema, partitions).unwrap();
+
+    ctx.register_table("t", Arc::new(table)).unwrap();
+}
+
+/// Build a dataframe for testing logical plan optimization
+fn build_test_data_frame(ctx: &SessionContext, rt: &Runtime) -> DataFrame {
+    register_string_table(ctx, 100, 1000);
+
+    rt.block_on(async {
+        let mut df = ctx.table("t").await.unwrap();
+        // add some columns in
+        for i in 100..150 {
+            df = df
+                .with_column(&format!("c{i}"), Literal(ScalarValue::Utf8(None), None))
+                .unwrap();
+        }
+        // add in some columns with string encoded timestamps
+        for i in 150..175 {
+            df = df
+                .with_column(
+                    &format!("c{i}"),
+                    Literal(ScalarValue::Utf8(Some("2025-08-21 09:43:17".into())), None),
+                )
+                .unwrap();
+        }
+        // do a bunch of ops on the columns
+        for i in 0..175 {
+            // trim the columns
+            df = df
+                .with_column(&format!("c{i}"), btrim(vec![col(format!("c{i}"))]))
+                .unwrap();
+        }
+
+        for i in 0..175 {
+            let c_name = format!("c{i}");
+            let c = col(&c_name);
+
+            // random ops
+            if i % 5 == 0 && i < 150 {
+                // the actual ops here are largely unimportant as they are just a sample
+                // of ops that could occur on a dataframe
+                df = df
+                    .with_column(&c_name, cast(c.clone(), DataType::Utf8))
+                    .unwrap()
+                    .with_column(
+                        &c_name,
+                        when(
+                            cast(c.clone(), DataType::Int32).gt(lit(135)),
+                            cast(
+                                cast(c.clone(), DataType::Int32) - lit(i + 3),
+                                DataType::Utf8,
+                            ),
+                        )
+                        .otherwise(c.clone())
+                        .unwrap(),
+                    )
+                    .unwrap()
+                    .with_column(
+                        &c_name,
+                        when(
+                            c.clone().is_not_null().and(
+                                cast(c.clone(), DataType::Int32)
+                                    .between(lit(120), lit(130)),
+                            ),
+                            Literal(ScalarValue::Utf8(None), None),
+                        )
+                        .otherwise(
+                            when(
+                                c.clone().is_not_null().and(regexp_like(
+                                    cast(c.clone(), DataType::Utf8View),
+                                    lit("[0-9]*"),
+                                    None,
+                                )),
+                                upper(c.clone()),
+                            )
+                            .otherwise(c.clone())
+                            .unwrap(),
+                        )
+                        .unwrap(),
+                    )
+                    .unwrap()
+                    .with_column(
+                        &c_name,
+                        when(
+                            c.clone().is_not_null().and(
+                                cast(c.clone(), DataType::Int32)
+                                    .between(lit(90), lit(100)),
+                            ),
+                            cast(c.clone(), DataType::Utf8View),
+                        )
+                        .otherwise(Literal(ScalarValue::Date32(None), None))
+                        .unwrap(),
+                    )
+                    .unwrap()
+                    .with_column(
+                        &c_name,
+                        when(
+                            c.clone().is_not_null().and(
+                                cast(c.clone(), DataType::Int32).rem(lit(10)).gt(lit(7)),
+                            ),
+                            regexp_replace(
+                                cast(c.clone(), DataType::Utf8View),
+                                lit("1"),
+                                lit("a"),
+                                None,
+                            ),
+                        )
+                        .otherwise(Literal(ScalarValue::Date32(None), None))
+                        .unwrap(),
+                    )
+                    .unwrap()
+            }
+            if i >= 150 {
+                df = df
+                    .with_column(
+                        &c_name,
+                        try_cast(
+                            to_timestamp(vec![c.clone(), lit("%Y-%m-%d %H:%M:%S")]),
+                            DataType::Timestamp(Nanosecond, Some("UTC".into())),
+                        ),
+                    )
+                    .unwrap()
+                    .with_column(&c_name, try_cast(c.clone(), DataType::Date32))
+                    .unwrap()
+            }
+
+            // add in a few unions
+            if i % 30 == 0 {
+                let df1 = df
+                    .clone()
+                    .filter(length(c.clone()).gt(lit(2)))
+                    .unwrap()
+                    .with_column(&format!("c{i}_filtered"), lit(true))
+                    .unwrap();
+                let df2 = df
+                    .filter(not(length(c.clone()).gt(lit(2))))
+                    .unwrap()
+                    .with_column(&format!("c{i}_filtered"), lit(false))
+                    .unwrap();
+
+                df = df1.union_by_name(df2).unwrap()
+            }
+        }
+
+        df
+    })
+}
+
+/// Build a CASE-heavy dataframe over a non-inner join to stress
+/// planner-time filter pushdown and nullability/type inference.
+fn build_case_heavy_left_join_df(ctx: &SessionContext, rt: &Runtime) -> DataFrame {
+    register_string_table(ctx, 100, 1000);
+    let query = build_case_heavy_left_join_query(30, 1);
+    rt.block_on(async { ctx.sql(&query).await.unwrap() })
+}
+
+fn build_case_heavy_left_join_query(predicate_count: usize, case_depth: usize) -> String {
+    let mut query = String::from(
+        "SELECT l.c0, r.c0 AS rc0 FROM t l LEFT JOIN t r ON l.c0 = r.c0 WHERE ",
+    );
+
+    if predicate_count == 0 {
+        query.push_str("TRUE");
+        return query;
+    }
+
+    // Keep this deterministic so comparisons between profiles are stable.
+    for i in 0..predicate_count {
+        if i > 0 {
+            query.push_str(" AND ");
+        }
+
+        let mut expr = format!("length(l.c{})", i % 20);
+        for depth in 0..case_depth {
+            let left_col = (i + depth + 1) % 20;
+            let right_col = (i + depth + 2) % 20;
+            expr = format!(
+                "CASE WHEN l.c{left_col} IS NOT NULL THEN {expr} ELSE length(r.c{right_col}) END"
+            );
+        }
+
+        let _ = write!(&mut query, "{expr} > 2");
+    }
+
+    query
+}
+
+fn build_case_heavy_left_join_df_with_push_down_filter(
+    rt: &Runtime,
+    predicate_count: usize,
+    case_depth: usize,
+    push_down_filter_enabled: bool,
+) -> DataFrame {
+    let ctx = SessionContext::new();
+    register_string_table(&ctx, 100, 1000);
+    if !push_down_filter_enabled {
+        let removed = ctx.remove_optimizer_rule("push_down_filter");
+        assert!(
+            removed,
+            "push_down_filter rule should be present in the default optimizer"
+        );
+    }
+
+    let query = build_case_heavy_left_join_query(predicate_count, case_depth);
+    rt.block_on(async { ctx.sql(&query).await.unwrap() })
+}
+
+fn build_non_case_left_join_query(
+    predicate_count: usize,
+    nesting_depth: usize,
+) -> String {
+    let mut query = String::from(
+        "SELECT l.c0, r.c0 AS rc0 FROM t l LEFT JOIN t r ON l.c0 = r.c0 WHERE ",
+    );
+
+    if predicate_count == 0 {
+        query.push_str("TRUE");
+        return query;
+    }
+
+    // Keep this deterministic so comparisons between profiles are stable.
+    for i in 0..predicate_count {
+        if i > 0 {
+            query.push_str(" AND ");
+        }
+
+        let left_col = i % 20;
+        let mut expr = format!("l.c{left_col}");
+        for depth in 0..nesting_depth {
+            let right_col = (i + depth + 1) % 20;
+            expr = format!("coalesce({expr}, r.c{right_col})");
+        }
+
+        let _ = write!(&mut query, "length({expr}) > 2");
+    }
+
+    query
+}
+
+fn build_non_case_left_join_df_with_push_down_filter(
+    rt: &Runtime,
+    predicate_count: usize,
+    nesting_depth: usize,
+    push_down_filter_enabled: bool,
+) -> DataFrame {
+    let ctx = SessionContext::new();
+    register_string_table(&ctx, 100, 1000);
+    if !push_down_filter_enabled {
+        let removed = ctx.remove_optimizer_rule("push_down_filter");
+        assert!(
+            removed,
+            "push_down_filter rule should be present in the default optimizer"
+        );
+    }
+
+    let query = build_non_case_left_join_query(predicate_count, nesting_depth);
+    rt.block_on(async { ctx.sql(&query).await.unwrap() })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let baseline_ctx = SessionContext::new();
+    let case_heavy_ctx = SessionContext::new();
+    let rt = Runtime::new().unwrap();
+
+    // validate logical plan optimize performance
+    // https://github.com/apache/datafusion/issues/17261
+
+    let df = build_test_data_frame(&baseline_ctx, &rt);
+    let case_heavy_left_join_df = build_case_heavy_left_join_df(&case_heavy_ctx, &rt);
+
+    c.bench_function("logical_plan_optimize", |b| {
+        b.iter(|| {
+            let df_clone = df.clone();
+            black_box(rt.block_on(async { df_clone.into_optimized_plan().unwrap() }));
+        })
+    });
+
+    c.bench_function("logical_plan_optimize_hotspot_case_heavy_left_join", |b| {
+        b.iter(|| {
+            let df_clone = case_heavy_left_join_df.clone();
+            black_box(rt.block_on(async { df_clone.into_optimized_plan().unwrap() }));
+        })
+    });
+
+    let predicate_sweep = [10, 20, 30, 40, 60];
+    let case_depth_sweep = [1, 2, 3];
+
+    let mut hotspot_group =
+        c.benchmark_group("push_down_filter_hotspot_case_heavy_left_join_ab");
+    for case_depth in case_depth_sweep {
+        for predicate_count in predicate_sweep {
+            let with_push_down_filter =
+                build_case_heavy_left_join_df_with_push_down_filter(
+                    &rt,
+                    predicate_count,
+                    case_depth,
+                    true,
+                );
+            let without_push_down_filter =
+                build_case_heavy_left_join_df_with_push_down_filter(
+                    &rt,
+                    predicate_count,
+                    case_depth,
+                    false,
+                );
+
+            let input_label =
+                format!("predicates={predicate_count},case_depth={case_depth}");
+            // A/B interpretation:
+            // - with_push_down_filter: default optimizer path (rule enabled)
+            // - without_push_down_filter: control path with the rule removed
+            // Compare both IDs at the same sweep point to isolate rule impact.
+            hotspot_group.bench_with_input(
+                BenchmarkId::new("with_push_down_filter", &input_label),
+                &with_push_down_filter,
+                |b, df| {
+                    b.iter(|| {
+                        let df_clone = df.clone();
+                        black_box(
+                            rt.block_on(async {
+                                df_clone.into_optimized_plan().unwrap()
+                            }),
+                        );
+                    })
+                },
+            );
+            hotspot_group.bench_with_input(
+                BenchmarkId::new("without_push_down_filter", &input_label),
+                &without_push_down_filter,
+                |b, df| {
+                    b.iter(|| {
+                        let df_clone = df.clone();
+                        black_box(
+                            rt.block_on(async {
+                                df_clone.into_optimized_plan().unwrap()
+                            }),
+                        );
+                    })
+                },
+            );
+        }
+    }
+    hotspot_group.finish();
+
+    let mut control_group =
+        c.benchmark_group("push_down_filter_control_non_case_left_join_ab");
+    for nesting_depth in case_depth_sweep {
+        for predicate_count in predicate_sweep {
+            let with_push_down_filter = build_non_case_left_join_df_with_push_down_filter(
+                &rt,
+                predicate_count,
+                nesting_depth,
+                true,
+            );
+            let without_push_down_filter =
+                build_non_case_left_join_df_with_push_down_filter(
+                    &rt,
+                    predicate_count,
+                    nesting_depth,
+                    false,
+                );
+
+            let input_label =
+                format!("predicates={predicate_count},nesting_depth={nesting_depth}");
+            control_group.bench_with_input(
+                BenchmarkId::new("with_push_down_filter", &input_label),
+                &with_push_down_filter,
+                |b, df| {
+                    b.iter(|| {
+                        let df_clone = df.clone();
+                        black_box(
+                            rt.block_on(async {
+                                df_clone.into_optimized_plan().unwrap()
+                            }),
+                        );
+                    })
+                },
+            );
+            control_group.bench_with_input(
+                BenchmarkId::new("without_push_down_filter", &input_label),
+                &without_push_down_filter,
+                |b, df| {
+                    b.iter(|| {
+                        let df_clone = df.clone();
+                        black_box(
+                            rt.block_on(async {
+                                df_clone.into_optimized_plan().unwrap()
+                            }),
+                        );
+                    })
+                },
+            );
+        }
+    }
+    control_group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/sql_query_with_io.rs b/datafusion/core/benches/sql_query_with_io.rs
index 58797dfed6b67..fc8caf31acd11 100644
--- a/datafusion/core/benches/sql_query_with_io.rs
+++ b/datafusion/core/benches/sql_query_with_io.rs
@@ -20,7 +20,7 @@ use std::{fmt::Write, sync::Arc, time::Duration};
 use arrow::array::{Int64Builder, RecordBatch, UInt64Builder};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use bytes::Bytes;
-use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
 use datafusion::{
     datasource::{
         file_format::parquet::ParquetFormat,
@@ -31,13 +31,13 @@ use datafusion::{
 use datafusion_execution::runtime_env::RuntimeEnv;
 use itertools::Itertools;
 use object_store::{
+    ObjectStore, ObjectStoreExt,
     memory::InMemory,
     path::Path,
     throttle::{ThrottleConfig, ThrottledStore},
-    ObjectStore,
 };
 use parquet::arrow::ArrowWriter;
-use rand::{rngs::StdRng, Rng, SeedableRng};
+use rand::{Rng, SeedableRng, rngs::StdRng};
 use tokio::runtime::Runtime;
 use url::Url;
 
diff --git a/datafusion/core/benches/struct_query_sql.rs b/datafusion/core/benches/struct_query_sql.rs
index f9cc43d1ea2c5..96434fc379ea6 100644
--- a/datafusion/core/benches/struct_query_sql.rs
+++ b/datafusion/core/benches/struct_query_sql.rs
@@ -20,17 +20,18 @@ use arrow::{
     datatypes::{DataType, Field, Schema},
     record_batch::RecordBatch,
 };
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use futures::executor::block_on;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
 async fn query(ctx: &SessionContext, rt: &Runtime, sql: &str) {
     // execute the query
     let df = rt.block_on(ctx.sql(sql)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context(array_len: usize, batch_size: usize) -> Result<SessionContext> {
diff --git a/datafusion/core/benches/topk_aggregate.rs b/datafusion/core/benches/topk_aggregate.rs
index cf3c7fa2e26fe..f71cf1087be7d 100644
--- a/datafusion/core/benches/topk_aggregate.rs
+++ b/datafusion/core/benches/topk_aggregate.rs
@@ -16,25 +16,71 @@
 // under the License.
 
 mod data_utils;
+
+use arrow::array::Int64Builder;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::make_data;
-use datafusion::physical_plan::{collect, displayable, ExecutionPlan};
+use datafusion::physical_plan::{collect, displayable};
 use datafusion::prelude::SessionContext;
 use datafusion::{datasource::MemTable, error::Result};
 use datafusion_execution::config::SessionConfig;
-use datafusion_execution::TaskContext;
+use rand::SeedableRng;
+use rand::seq::SliceRandom;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
+const LIMIT: usize = 10;
+
+/// Create deterministic data for DISTINCT benchmarks with predictable trace_ids
+/// This ensures consistent results across benchmark runs
+fn make_distinct_data(
+    partition_cnt: i32,
+    sample_cnt: i32,
+) -> Result<(Arc<Schema>, Vec<Vec<RecordBatch>>)> {
+    let mut rng = rand::rngs::SmallRng::from_seed([42; 32]);
+    let total_samples = partition_cnt as usize * sample_cnt as usize;
+    let mut ids = Vec::new();
+    for i in 0..total_samples {
+        ids.push(i as i64);
+    }
+    ids.shuffle(&mut rng);
+
+    let mut global_idx = 0;
+    let schema = test_distinct_schema();
+    let mut partitions = vec![];
+    for _ in 0..partition_cnt {
+        let mut id_builder = Int64Builder::new();
+
+        for _ in 0..sample_cnt {
+            let id = ids[global_idx];
+            id_builder.append_value(id);
+            global_idx += 1;
+        }
+
+        let id_col = Arc::new(id_builder.finish());
+        let batch = RecordBatch::try_new(schema.clone(), vec![id_col])?;
+        partitions.push(vec![batch]);
+    }
+
+    Ok((schema, partitions))
+}
+
+/// Returns a Schema for distinct benchmarks with i64 trace_id
+fn test_distinct_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]))
+}
+
 async fn create_context(
-    limit: usize,
     partition_cnt: i32,
     sample_cnt: i32,
     asc: bool,
     use_topk: bool,
     use_view: bool,
-) -> Result<(Arc<dyn ExecutionPlan>, Arc<TaskContext>)> {
+) -> Result<SessionContext> {
     let (schema, parts) = make_data(partition_cnt, sample_cnt, asc, use_view).unwrap();
     let mem_table = Arc::new(MemTable::try_new(schema, parts).unwrap());
 
@@ -44,55 +90,196 @@ async fn create_context(
     opts.optimizer.enable_topk_aggregation = use_topk;
     let ctx = SessionContext::new_with_config(cfg);
     let _ = ctx.register_table("traces", mem_table)?;
-    let sql = format!("select trace_id, max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};");
+
+    Ok(ctx)
+}
+
+async fn create_context_distinct(
+    partition_cnt: i32,
+    sample_cnt: i32,
+    use_topk: bool,
+) -> Result<SessionContext> {
+    // Use deterministic data generation for DISTINCT queries to ensure consistent results
+    let (schema, parts) = make_distinct_data(partition_cnt, sample_cnt).unwrap();
+    let mem_table = Arc::new(MemTable::try_new(schema, parts).unwrap());
+
+    // Create the DataFrame
+    let mut cfg = SessionConfig::new();
+    let opts = cfg.options_mut();
+    opts.optimizer.enable_topk_aggregation = use_topk;
+    let ctx = SessionContext::new_with_config(cfg);
+    let _ = ctx.register_table("traces", mem_table)?;
+
+    Ok(ctx)
+}
+
+fn run(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool, asc: bool) {
+    black_box(rt.block_on(async { aggregate(ctx, limit, use_topk, asc).await })).unwrap();
+}
+
+fn run_string(rt: &Runtime, ctx: SessionContext, limit: usize, use_topk: bool) {
+    black_box(rt.block_on(async { aggregate_string(ctx, limit, use_topk).await }))
+        .unwrap();
+}
+
+fn run_distinct(
+    rt: &Runtime,
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) {
+    black_box(rt.block_on(async { aggregate_distinct(ctx, limit, use_topk, asc).await }))
+        .unwrap();
+}
+
+async fn aggregate(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+    asc: bool,
+) -> Result<()> {
+    let sql = format!(
+        "select max(timestamp_ms) from traces group by trace_id order by max(timestamp_ms) desc limit {limit};"
+    );
     let df = ctx.sql(sql.as_str()).await?;
-    let physical_plan = df.create_physical_plan().await?;
-    let actual_phys_plan = displayable(physical_plan.as_ref()).indent(true).to_string();
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
     assert_eq!(
         actual_phys_plan.contains(&format!("lim=[{limit}]")),
         use_topk
     );
 
-    Ok((physical_plan, ctx.task_ctx()))
+    let batches = collect(plan, ctx.task_ctx()).await?;
+    assert_eq!(batches.len(), 1);
+    let batch = batches.first().unwrap();
+    assert_eq!(batch.num_rows(), LIMIT);
+
+    let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase();
+    let expected_asc = r#"
++--------------------------+
+| max(traces.timestamp_ms) |
++--------------------------+
+| 16909009999999           |
+| 16909009999998           |
+| 16909009999997           |
+| 16909009999996           |
+| 16909009999995           |
+| 16909009999994           |
+| 16909009999993           |
+| 16909009999992           |
+| 16909009999991           |
+| 16909009999990           |
++--------------------------+
+        "#
+    .trim();
+    if asc {
+        assert_eq!(actual.trim(), expected_asc);
+    }
+
+    Ok(())
 }
 
-fn run(rt: &Runtime, plan: Arc<dyn ExecutionPlan>, ctx: Arc<TaskContext>, asc: bool) {
-    criterion::black_box(
-        rt.block_on(async { aggregate(plan.clone(), ctx.clone(), asc).await }),
-    )
-    .unwrap();
+/// Benchmark for string aggregate functions with topk optimization.
+/// This tests grouping by a numeric column (timestamp_ms) and aggregating
+/// a string column (trace_id) with Utf8 or Utf8View data types.
+async fn aggregate_string(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
+) -> Result<()> {
+    let sql = format!(
+        "select max(trace_id) from traces group by timestamp_ms order by max(trace_id) desc limit {limit};"
+    );
+    let df = ctx.sql(sql.as_str()).await?;
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
+    assert_eq!(
+        actual_phys_plan.contains(&format!("lim=[{limit}]")),
+        use_topk
+    );
+
+    let batches = collect(plan, ctx.task_ctx()).await?;
+    assert_eq!(batches.len(), 1);
+    let batch = batches.first().unwrap();
+    assert_eq!(batch.num_rows(), LIMIT);
+
+    Ok(())
 }
 
-async fn aggregate(
-    plan: Arc<dyn ExecutionPlan>,
-    ctx: Arc<TaskContext>,
+async fn aggregate_distinct(
+    ctx: SessionContext,
+    limit: usize,
+    use_topk: bool,
     asc: bool,
 ) -> Result<()> {
-    let batches = collect(plan, ctx).await?;
+    let order_direction = if asc { "asc" } else { "desc" };
+    let sql = format!(
+        "select id from traces group by id order by id {order_direction} limit {limit};"
+    );
+    let df = ctx.sql(sql.as_str()).await?;
+    let plan = df.create_physical_plan().await?;
+    let actual_phys_plan = displayable(plan.as_ref()).indent(true).to_string();
+    assert_eq!(
+        actual_phys_plan.contains(&format!("lim=[{limit}]")),
+        use_topk
+    );
+    let batches = collect(plan, ctx.task_ctx()).await?;
     assert_eq!(batches.len(), 1);
     let batch = batches.first().unwrap();
-    assert_eq!(batch.num_rows(), 10);
+    assert_eq!(batch.num_rows(), LIMIT);
 
     let actual = format!("{}", pretty_format_batches(&batches)?).to_lowercase();
+
     let expected_asc = r#"
-+----------------------------------+--------------------------+
-| trace_id                         | max(traces.timestamp_ms) |
-+----------------------------------+--------------------------+
-| 5868861a23ed31355efc5200eb80fe74 | 16909009999999           |
-| 4040e64656804c3d77320d7a0e7eb1f0 | 16909009999998           |
-| 02801bbe533190a9f8713d75222f445d | 16909009999997           |
-| 9e31b3b5a620de32b68fefa5aeea57f1 | 16909009999996           |
-| 2d88a860e9bd1cfaa632d8e7caeaa934 | 16909009999995           |
-| a47edcef8364ab6f191dd9103e51c171 | 16909009999994           |
-| 36a3fa2ccfbf8e00337f0b1254384db6 | 16909009999993           |
-| 0756be84f57369012e10de18b57d8a2f | 16909009999992           |
-| d4d6bf9845fa5897710e3a8db81d5907 | 16909009999991           |
-| 3c2cc1abe728a66b61e14880b53482a0 | 16909009999990           |
-+----------------------------------+--------------------------+
-        "#
++----+
+| id |
++----+
+| 0  |
+| 1  |
+| 2  |
+| 3  |
+| 4  |
+| 5  |
+| 6  |
+| 7  |
+| 8  |
+| 9  |
++----+
+"#
     .trim();
+
+    let expected_desc = r#"
++---------+
+| id      |
++---------+
+| 9999999 |
+| 9999998 |
+| 9999997 |
+| 9999996 |
+| 9999995 |
+| 9999994 |
+| 9999993 |
+| 9999992 |
+| 9999991 |
+| 9999990 |
++---------+
+"#
+    .trim();
+
+    // Verify exact results match expected values
     if asc {
-        assert_eq!(actual.trim(), expected_asc);
+        assert_eq!(
+            actual.trim(),
+            expected_asc,
+            "Ascending DISTINCT results do not match expected values"
+        );
+    } else {
+        assert_eq!(
+            actual.trim(),
+            expected_desc,
+            "Descending DISTINCT results do not match expected values"
+        );
     }
 
     Ok(())
@@ -100,110 +287,154 @@ async fn aggregate(
 
 fn criterion_benchmark(c: &mut Criterion) {
     let rt = Runtime::new().unwrap();
-    let limit = 10;
+    let limit = LIMIT;
     let partitions = 10;
     let samples = 1_000_000;
 
+    let ctx = rt
+        .block_on(create_context(partitions, samples, false, false, false))
+        .unwrap();
     c.bench_function(
         format!("aggregate {} time-series rows", partitions * samples).as_str(),
-        |b| {
-            b.iter(|| {
-                let real = rt.block_on(async {
-                    create_context(limit, partitions, samples, false, false, false)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, real.0.clone(), real.1.clone(), false)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, false, false)),
     );
 
+    let ctx = rt
+        .block_on(create_context(partitions, samples, true, false, false))
+        .unwrap();
     c.bench_function(
         format!("aggregate {} worst-case rows", partitions * samples).as_str(),
-        |b| {
-            b.iter(|| {
-                let asc = rt.block_on(async {
-                    create_context(limit, partitions, samples, true, false, false)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, asc.0.clone(), asc.1.clone(), true)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, false, true)),
     );
 
+    let ctx = rt
+        .block_on(create_context(partitions, samples, false, true, false))
+        .unwrap();
     c.bench_function(
         format!(
             "top k={limit} aggregate {} time-series rows",
             partitions * samples
         )
         .as_str(),
-        |b| {
-            b.iter(|| {
-                let topk_real = rt.block_on(async {
-                    create_context(limit, partitions, samples, false, true, false)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, topk_real.0.clone(), topk_real.1.clone(), false)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, true, false)),
     );
 
+    let ctx = rt
+        .block_on(create_context(partitions, samples, true, true, false))
+        .unwrap();
     c.bench_function(
         format!(
             "top k={limit} aggregate {} worst-case rows",
             partitions * samples
         )
         .as_str(),
-        |b| {
-            b.iter(|| {
-                let topk_asc = rt.block_on(async {
-                    create_context(limit, partitions, samples, true, true, false)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, topk_asc.0.clone(), topk_asc.1.clone(), true)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, true, true)),
     );
 
     // Utf8View schema，time-series rows
+    let ctx = rt
+        .block_on(create_context(partitions, samples, false, true, true))
+        .unwrap();
     c.bench_function(
         format!(
             "top k={limit} aggregate {} time-series rows [Utf8View]",
             partitions * samples
         )
         .as_str(),
-        |b| {
-            b.iter(|| {
-                let topk_real = rt.block_on(async {
-                    create_context(limit, partitions, samples, false, true, true)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, topk_real.0.clone(), topk_real.1.clone(), false)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, true, false)),
     );
 
     // Utf8View schema，worst-case rows
+    let ctx = rt
+        .block_on(create_context(partitions, samples, true, true, true))
+        .unwrap();
     c.bench_function(
         format!(
             "top k={limit} aggregate {} worst-case rows [Utf8View]",
             partitions * samples
         )
         .as_str(),
-        |b| {
-            b.iter(|| {
-                let topk_asc = rt.block_on(async {
-                    create_context(limit, partitions, samples, true, true, true)
-                        .await
-                        .unwrap()
-                });
-                run(&rt, topk_asc.0.clone(), topk_asc.1.clone(), true)
-            })
-        },
+        |b| b.iter(|| run(&rt, ctx.clone(), limit, true, true)),
+    );
+
+    // String aggregate benchmarks - grouping by timestamp, aggregating string column
+    let ctx = rt
+        .block_on(create_context(partitions, samples, false, true, false))
+        .unwrap();
+    c.bench_function(
+        format!(
+            "top k={limit} string aggregate {} time-series rows [Utf8]",
+            partitions * samples
+        )
+        .as_str(),
+        |b| b.iter(|| run_string(&rt, ctx.clone(), limit, true)),
+    );
+
+    let ctx = rt
+        .block_on(create_context(partitions, samples, true, true, false))
+        .unwrap();
+    c.bench_function(
+        format!(
+            "top k={limit} string aggregate {} worst-case rows [Utf8]",
+            partitions * samples
+        )
+        .as_str(),
+        |b| b.iter(|| run_string(&rt, ctx.clone(), limit, true)),
+    );
+
+    let ctx = rt
+        .block_on(create_context(partitions, samples, false, true, true))
+        .unwrap();
+    c.bench_function(
+        format!(
+            "top k={limit} string aggregate {} time-series rows [Utf8View]",
+            partitions * samples
+        )
+        .as_str(),
+        |b| b.iter(|| run_string(&rt, ctx.clone(), limit, true)),
+    );
+
+    let ctx = rt
+        .block_on(create_context(partitions, samples, true, true, true))
+        .unwrap();
+    c.bench_function(
+        format!(
+            "top k={limit} string aggregate {} worst-case rows [Utf8View]",
+            partitions * samples
+        )
+        .as_str(),
+        |b| b.iter(|| run_string(&rt, ctx.clone(), limit, true)),
+    );
+
+    // DISTINCT benchmarks
+    let ctx = rt.block_on(async {
+        create_context_distinct(partitions, samples, false)
+            .await
+            .unwrap()
+    });
+    c.bench_function(
+        format!("distinct {} rows desc [no TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx.clone(), limit, false, false)),
+    );
+
+    c.bench_function(
+        format!("distinct {} rows asc [no TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx.clone(), limit, false, true)),
+    );
+
+    let ctx_topk = rt.block_on(async {
+        create_context_distinct(partitions, samples, true)
+            .await
+            .unwrap()
+    });
+    c.bench_function(
+        format!("distinct {} rows desc [TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx_topk.clone(), limit, true, false)),
+    );
+
+    c.bench_function(
+        format!("distinct {} rows asc [TopK]", partitions * samples).as_str(),
+        |b| b.iter(|| run_distinct(&rt, ctx_topk.clone(), limit, true, true)),
     );
 }
 
diff --git a/datafusion/core/benches/topk_repartition.rs b/datafusion/core/benches/topk_repartition.rs
new file mode 100644
index 0000000000000..e1f14e4aaa633
--- /dev/null
+++ b/datafusion/core/benches/topk_repartition.rs
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for the TopKRepartition optimizer rule.
+//!
+//! Measures the benefit of pushing TopK (Sort with fetch) below hash
+//! repartition when running partitioned window functions with LIMIT.
+
+mod data_utils;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use data_utils::create_table_provider;
+use datafusion::prelude::{SessionConfig, SessionContext};
+use parking_lot::Mutex;
+use std::hint::black_box;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+#[expect(clippy::needless_pass_by_value)]
+fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
+    let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
+    black_box(rt.block_on(df.collect()).unwrap());
+}
+
+fn create_context(
+    partitions_len: usize,
+    target_partitions: usize,
+    enable_topk_repartition: bool,
+) -> Arc<Mutex<SessionContext>> {
+    let array_len = 1024 * 1024;
+    let batch_size = 8 * 1024;
+    let mut config = SessionConfig::new().with_target_partitions(target_partitions);
+    config.options_mut().optimizer.enable_topk_repartition = enable_topk_repartition;
+    let ctx = SessionContext::new_with_config(config);
+    let rt = Runtime::new().unwrap();
+    rt.block_on(async {
+        let provider =
+            create_table_provider(partitions_len, array_len, batch_size).unwrap();
+        ctx.register_table("t", provider).unwrap();
+    });
+    Arc::new(Mutex::new(ctx))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+
+    let limits = [10, 1_000, 10_000, 100_000];
+    let scans = 16;
+    let target_partitions = 4;
+
+    let group = format!("topk_repartition_{scans}_to_{target_partitions}");
+    let mut group = c.benchmark_group(group);
+    for limit in limits {
+        let sql = format!(
+            "SELECT \
+                SUM(f64) OVER (PARTITION BY u64_narrow ORDER BY u64_wide ROWS UNBOUNDED PRECEDING) \
+                FROM t \
+                ORDER BY u64_narrow, u64_wide \
+                LIMIT {limit}"
+        );
+
+        let ctx_disabled = create_context(scans, target_partitions, false);
+        group.bench_function(BenchmarkId::new("disabled", limit), |b| {
+            b.iter(|| query(ctx_disabled.clone(), &rt, &sql))
+        });
+
+        let ctx_enabled = create_context(scans, target_partitions, true);
+        group.bench_function(BenchmarkId::new("enabled", limit), |b| {
+            b.iter(|| query(ctx_enabled.clone(), &rt, &sql))
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/core/benches/window_query_sql.rs b/datafusion/core/benches/window_query_sql.rs
index a55d17a7c5dcf..1657cae913fef 100644
--- a/datafusion/core/benches/window_query_sql.rs
+++ b/datafusion/core/benches/window_query_sql.rs
@@ -15,23 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-extern crate datafusion;
-
 mod data_utils;
-use crate::criterion::Criterion;
+
+use criterion::{Criterion, criterion_group, criterion_main};
 use data_utils::create_table_provider;
 use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use parking_lot::Mutex;
+use std::hint::black_box;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
+#[expect(clippy::needless_pass_by_value)]
 fn query(ctx: Arc<Mutex<SessionContext>>, rt: &Runtime, sql: &str) {
     let df = rt.block_on(ctx.lock().sql(sql)).unwrap();
-    criterion::black_box(rt.block_on(df.collect()).unwrap());
+    black_box(rt.block_on(df.collect()).unwrap());
 }
 
 fn create_context(
diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs
index 1044717aaffb1..2466d42692192 100644
--- a/datafusion/core/src/bin/print_functions_docs.rs
+++ b/datafusion/core/src/bin/print_functions_docs.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use datafusion::execution::SessionStateDefaults;
-use datafusion_common::{not_impl_err, HashSet, Result};
+use datafusion_common::{HashSet, Result, not_impl_err};
 use datafusion_expr::{
-    aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF,
-    DocSection, Documentation, ScalarUDF, WindowUDF,
+    AggregateUDF, DocSection, Documentation, ScalarUDF, WindowUDF,
+    aggregate_doc_sections, scalar_doc_sections, window_doc_sections,
 };
 use itertools::Itertools;
 use std::env::args;
@@ -84,30 +84,7 @@ fn print_window_docs() -> Result<String> {
     print_docs(providers, window_doc_sections::doc_sections())
 }
 
-// Temporary method useful to semi automate
-// the migration of UDF documentation generation from code based
-// to attribute based
-// To be removed
-#[allow(dead_code)]
-fn save_doc_code_text(documentation: &Documentation, name: &str) {
-    let attr_text = documentation.to_doc_attribute();
-
-    let file_path = format!("{name}.txt");
-    if std::path::Path::new(&file_path).exists() {
-        std::fs::remove_file(&file_path).unwrap();
-    }
-
-    // Open the file in append mode, create it if it doesn't exist
-    let mut file = std::fs::OpenOptions::new()
-        .append(true) // Open in append mode
-        .create(true) // Create the file if it doesn't exist
-        .open(file_path)
-        .unwrap();
-
-    use std::io::Write;
-    file.write_all(attr_text.as_bytes()).unwrap();
-}
-
+#[expect(clippy::needless_pass_by_value)]
 fn print_docs(
     providers: Vec<Box<dyn DocProvider>>,
     doc_sections: Vec<DocSection>,
@@ -254,13 +231,15 @@ fn print_docs(
         for f in &providers_with_no_docs {
             eprintln!("  - {f}");
         }
-        not_impl_err!("Some functions do not have documentation. Please implement `documentation` for: {providers_with_no_docs:?}")
+        not_impl_err!(
+            "Some functions do not have documentation. Please implement `documentation` for: {providers_with_no_docs:?}"
+        )
     } else {
         Ok(docs)
     }
 }
 
-/// Trait for accessing name / aliases / documentation for differnet functions
+/// Trait for accessing name / aliases / documentation for different functions
 trait DocProvider {
     fn get_name(&self) -> String;
     fn get_aliases(&self) -> Vec<String>;
@@ -303,8 +282,7 @@ impl DocProvider for WindowUDF {
     }
 }
 
-#[allow(clippy::borrowed_box)]
-#[allow(clippy::ptr_arg)]
+#[expect(clippy::borrowed_box)]
 fn get_names_and_aliases(functions: &Vec<&Box<dyn DocProvider>>) -> Vec<String> {
     functions
         .iter()
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index 69992e57ca7d0..2292f5855bfde 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -26,42 +26,43 @@ use crate::datasource::file_format::csv::CsvFormatFactory;
 use crate::datasource::file_format::format_as_file_type;
 use crate::datasource::file_format::json::JsonFormatFactory;
 use crate::datasource::{
-    provider_as_source, DefaultTableSource, MemTable, TableProvider,
+    DefaultTableSource, MemTable, TableProvider, provider_as_source,
 };
 use crate::error::Result;
-use crate::execution::context::{SessionState, TaskContext};
 use crate::execution::FunctionRegistry;
+use crate::execution::context::{SessionState, TaskContext};
 use crate::logical_expr::utils::find_window_exprs;
 use crate::logical_expr::{
-    col, ident, Expr, JoinType, LogicalPlan, LogicalPlanBuilder,
-    LogicalPlanBuilderOptions, Partitioning, TableType,
+    Expr, JoinType, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions,
+    Partitioning, TableType, col, ident,
 };
 use crate::physical_plan::{
-    collect, collect_partitioned, execute_stream, execute_stream_partitioned,
-    ExecutionPlan, SendableRecordBatchStream,
+    ExecutionPlan, SendableRecordBatchStream, collect, collect_partitioned,
+    execute_stream, execute_stream_partitioned,
 };
 use crate::prelude::SessionContext;
 use std::any::Any;
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
 use arrow::compute::{cast, concat};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow_schema::FieldRef;
 use datafusion_common::config::{CsvOptions, JsonOptions};
 use datafusion_common::{
-    exec_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema,
-    DataFusionError, ParamValues, ScalarValue, SchemaError, UnnestOptions,
+    Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError,
+    TableReference, UnnestOptions, exec_err, internal_datafusion_err, not_impl_err,
+    plan_datafusion_err, plan_err, unqualified_field_not_found,
 };
 use datafusion_expr::select_expr::SelectExpr;
 use datafusion_expr::{
-    case,
+    ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
     dml::InsertOp,
     expr::{Alias, ScalarFunction},
     is_null, lit,
     utils::COUNT_STAR_EXPANSION,
-    SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE,
 };
 use datafusion_functions::core::coalesce;
 use datafusion_functions_aggregate::expr_fn::{
@@ -70,7 +71,6 @@ use datafusion_functions_aggregate::expr_fn::{
 
 use async_trait::async_trait;
 use datafusion_catalog::Session;
-use datafusion_sql::TableReference;
 
 /// Contains options that control how data is
 /// written out from a DataFrame
@@ -78,9 +78,11 @@ pub struct DataFrameWriteOptions {
     /// Controls how new data should be written to the table, determining whether
     /// to append, overwrite, or replace existing data.
     insert_op: InsertOp,
-    /// Controls if all partitions should be coalesced into a single output file
-    /// Generally will have slower performance when set to true.
-    single_file_output: bool,
+    /// Controls if all partitions should be coalesced into a single output file.
+    /// - `None`: Use automatic mode (extension-based heuristic)
+    /// - `Some(true)`: Force single file output at exact path
+    /// - `Some(false)`: Force directory output with generated filenames
+    single_file_output: Option<bool>,
     /// Sets which columns should be used for hive-style partitioned writes by name.
     /// Can be set to empty vec![] for non-partitioned writes.
     partition_by: Vec<String>,
@@ -94,7 +96,7 @@ impl DataFrameWriteOptions {
     pub fn new() -> Self {
         DataFrameWriteOptions {
             insert_op: InsertOp::Append,
-            single_file_output: false,
+            single_file_output: None,
             partition_by: vec![],
             sort_by: vec![],
         }
@@ -107,8 +109,14 @@ impl DataFrameWriteOptions {
     }
 
     /// Set the single_file_output value to true or false
+    ///
+    /// - `true`: Force single file output at the exact path specified
+    /// - `false`: Force directory output with generated filenames
+    ///
+    /// When not called, automatic mode is used (extension-based heuristic).
+    /// When set to true, an output file will always be created even if the DataFrame is empty.
     pub fn with_single_file_output(mut self, single_file_output: bool) -> Self {
-        self.single_file_output = single_file_output;
+        self.single_file_output = Some(single_file_output);
         self
     }
 
@@ -123,6 +131,15 @@ impl DataFrameWriteOptions {
         self.sort_by = sort_by;
         self
     }
+
+    /// Build the options HashMap to pass to CopyTo for sink configuration.
+    fn build_sink_options(&self) -> HashMap<String, String> {
+        let mut options = HashMap::new();
+        if let Some(single_file) = self.single_file_output {
+            options.insert("single_file_output".to_string(), single_file.to_string());
+        }
+        options
+    }
 }
 
 impl Default for DataFrameWriteOptions {
@@ -258,15 +275,19 @@ impl DataFrame {
     /// # async fn main() -> Result<()> {
     /// // datafusion will parse number as i64 first.
     /// let sql = "a > 1 and b in (1, 10)";
-    /// let expected = col("a").gt(lit(1 as i64))
-    ///   .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false));
+    /// let expected = col("a")
+    ///     .gt(lit(1 as i64))
+    ///     .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false));
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let expr = df.parse_sql_expr(sql)?;
     /// assert_eq!(expected, expr);
     /// # Ok(())
     /// # }
     /// ```
+    #[cfg(feature = "sql")]
     pub fn parse_sql_expr(&self, sql: &str) -> Result<Expr> {
         let df_schema = self.schema();
 
@@ -288,14 +309,16 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.select_columns(&["a", "b"])?;
     /// let expected = vec![
     ///     "+---+---+",
     ///     "| a | b |",
     ///     "+---+---+",
     ///     "| 1 | 2 |",
-    ///     "+---+---+"
+    ///     "+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -305,11 +328,20 @@ impl DataFrame {
         let fields = columns
             .iter()
             .map(|name| {
-                self.plan
+                let fields = self
+                    .plan
                     .schema()
-                    .qualified_field_with_unqualified_name(name)
+                    .qualified_fields_with_unqualified_name(name);
+                if fields.is_empty() {
+                    Err(unqualified_field_not_found(name, self.plan.schema()))
+                } else {
+                    Ok(fields)
+                }
             })
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Result<Vec<_>, _>>()?
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
         let expr: Vec<Expr> = fields
             .into_iter()
             .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
@@ -328,11 +360,14 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-    /// let df : DataFrame = df.select_exprs(&["a * b", "c"])?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?;
     /// # Ok(())
     /// # }
     /// ```
+    #[cfg(feature = "sql")]
     pub fn select_exprs(self, exprs: &[&str]) -> Result<DataFrame> {
         let expr_list = exprs
             .iter()
@@ -355,14 +390,16 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.select(vec![col("a"), col("b") * col("c")])?;
     /// let expected = vec![
     ///     "+---+-----------------------+",
     ///     "| a | ?table?.b * ?table?.c |",
     ///     "+---+-----------------------+",
     ///     "| 1 | 6                     |",
-    ///     "+---+-----------------------+"
+    ///     "+---+-----------------------+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -375,15 +412,12 @@ impl DataFrame {
         let expr_list: Vec<SelectExpr> =
             expr_list.into_iter().map(|e| e.into()).collect::<Vec<_>>();
 
-        let expressions = expr_list
-            .iter()
-            .filter_map(|e| match e {
-                SelectExpr::Expression(expr) => Some(expr.clone()),
-                _ => None,
-            })
-            .collect::<Vec<_>>();
+        let expressions = expr_list.iter().filter_map(|e| match e {
+            SelectExpr::Expression(expr) => Some(expr),
+            _ => None,
+        });
 
-        let window_func_exprs = find_window_exprs(&expressions);
+        let window_func_exprs = find_window_exprs(expressions);
         let plan = if window_func_exprs.is_empty() {
             self.plan
         } else {
@@ -408,7 +442,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// // +----+----+----+
     /// // | a  | b  | c  |
     /// // +----+----+----+
@@ -420,22 +456,37 @@ impl DataFrame {
     ///     "| b | c |",
     ///     "+---+---+",
     ///     "| 2 | 3 |",
-    ///     "+---+---+"
+    ///     "+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
     /// # }
     /// ```
-    pub fn drop_columns(self, columns: &[&str]) -> Result<DataFrame> {
+    pub fn drop_columns<T>(self, columns: &[T]) -> Result<DataFrame>
+    where
+        T: Into<Column> + Clone,
+    {
         let fields_to_drop = columns
             .iter()
-            .map(|name| {
-                self.plan
-                    .schema()
-                    .qualified_field_with_unqualified_name(name)
+            .flat_map(|col| {
+                let column: Column = col.clone().into();
+                match column.relation.as_ref() {
+                    Some(_) => {
+                        // qualified_field_from_column returns Result<(Option<&TableReference>, &FieldRef)>
+                        vec![self.plan.schema().qualified_field_from_column(&column)]
+                    }
+                    None => {
+                        // qualified_fields_with_unqualified_name returns Vec<(Option<&TableReference>, &FieldRef)>
+                        self.plan
+                            .schema()
+                            .qualified_fields_with_unqualified_name(&column.name)
+                            .into_iter()
+                            .map(Ok)
+                            .collect::<Vec<_>>()
+                    }
+                }
             })
-            .filter(|r| r.is_ok())
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Result<Vec<_>, _>>()?;
         let expr: Vec<Expr> = self
             .plan
             .schema()
@@ -461,7 +512,7 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_json("tests/data/unnest.json", NdJsonReadOptions::default()).await?;
+    /// let df = ctx.read_json("tests/data/unnest.json", JsonReadOptions::default()).await?;
     /// // expand into multiple columns if it's json array, flatten field name if it's nested structure
     /// let df = df.unnest_columns(&["b","c","d"])?;
     /// let expected = vec![
@@ -519,7 +570,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.filter(col("a").lt_eq(col("b")))?;
     /// // all rows where a <= b are returned
     /// let expected = vec![
@@ -529,7 +582,7 @@ impl DataFrame {
     ///     "| 1 | 2 | 3 |",
     ///     "| 4 | 5 | 6 |",
     ///     "| 7 | 8 | 9 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -558,7 +611,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     ///
     /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a"
     /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?;
@@ -569,7 +624,7 @@ impl DataFrame {
     ///     "| 1 | 2              |",
     ///     "| 4 | 5              |",
     ///     "| 7 | 8              |",
-    ///     "+---+----------------+"
+    ///     "+---+----------------+",
     /// ];
     /// assert_batches_sorted_eq!(expected1, &df1.collect().await?);
     /// // The following use is the equivalent of "SELECT MIN(b)"
@@ -579,7 +634,7 @@ impl DataFrame {
     ///     "| min(?table?.b) |",
     ///     "+----------------+",
     ///     "| 2              |",
-    ///     "+----------------+"
+    ///     "+----------------+",
     /// ];
     /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?);
     /// # Ok(())
@@ -647,7 +702,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.limit(1, Some(2))?;
     /// let expected = vec![
     ///     "+---+---+---+",
@@ -655,7 +712,7 @@ impl DataFrame {
     ///     "+---+---+---+",
     ///     "| 4 | 5 | 6 |",
     ///     "| 7 | 8 | 9 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -684,7 +741,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?   ;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let d2 = df.clone();
     /// let df = df.union(d2)?;
     /// let expected = vec![
@@ -693,7 +752,7 @@ impl DataFrame {
     ///     "+---+---+---+",
     ///     "| 1 | 2 | 3 |",
     ///     "| 1 | 2 | 3 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -724,8 +783,13 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-    /// let d2 = df.clone().select_columns(&["b", "c", "a"])?.with_column("d", lit("77"))?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let d2 = df
+    ///     .clone()
+    ///     .select_columns(&["b", "c", "a"])?
+    ///     .with_column("d", lit("77"))?;
     /// let df = df.union_by_name(d2)?;
     /// let expected = vec![
     ///     "+---+---+---+----+",
@@ -733,7 +797,7 @@ impl DataFrame {
     ///     "+---+---+---+----+",
     ///     "| 1 | 2 | 3 |    |",
     ///     "| 1 | 2 | 3 | 77 |",
-    ///     "+---+---+---+----+"
+    ///     "+---+---+---+----+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -763,7 +827,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let d2 = df.clone();
     /// let df = df.union_distinct(d2)?;
     /// // df2 are duplicate of df
@@ -772,7 +838,7 @@ impl DataFrame {
     ///     "| a | b | c |",
     ///     "+---+---+---+",
     ///     "| 1 | 2 | 3 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -803,7 +869,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let d2 = df.clone().select_columns(&["b", "c", "a"])?;
     /// let df = df.union_by_name_distinct(d2)?;
     /// let expected = vec![
@@ -811,7 +879,7 @@ impl DataFrame {
     ///     "| a | b | c |",
     ///     "+---+---+---+",
     ///     "| 1 | 2 | 3 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -838,14 +906,16 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.distinct()?;
     /// let expected = vec![
     ///     "+---+---+---+",
     ///     "| a | b | c |",
     ///     "+---+---+---+",
     ///     "| 1 | 2 | 3 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -872,15 +942,17 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
-    ///   // Return a single row (a, b) for each distinct value of a
-    ///   .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?
+    ///     // Return a single row (a, b) for each distinct value of a
+    ///     .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?;
     /// let expected = vec![
     ///     "+---+---+",
     ///     "| a | b |",
     ///     "+---+---+",
     ///     "| 1 | 2 |",
-    ///     "+---+---+"
+    ///     "+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -953,7 +1025,7 @@ impl DataFrame {
         }));
 
         //collect recordBatch
-        let describe_record_batch = vec![
+        let describe_record_batch = [
             // count aggregation
             self.clone().aggregate(
                 vec![],
@@ -1126,11 +1198,13 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.sort(vec![
-    ///   col("a").sort(false, true),   // a DESC, nulls first
-    ///   col("b").sort(true, false), // b ASC, nulls last
-    ///  ])?;
+    ///     col("a").sort(false, true), // a DESC, nulls first
+    ///     col("b").sort(true, false), // b ASC, nulls last
+    /// ])?;
     /// let expected = vec![
     ///     "+---+---+---+",
     ///     "| a | b | c |",
@@ -1177,12 +1251,17 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let left = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-    /// let right = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
-    ///   .select(vec![
-    ///     col("a").alias("a2"),
-    ///     col("b").alias("b2"),
-    ///     col("c").alias("c2")])?;
+    /// let left = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let right = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?
+    ///     .select(vec![
+    ///         col("a").alias("a2"),
+    ///         col("b").alias("b2"),
+    ///         col("c").alias("c2"),
+    ///     ])?;
     /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)`
     /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`.
     /// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?;
@@ -1191,13 +1270,12 @@ impl DataFrame {
     ///     "| a | b | c | a2 | b2 | c2 |",
     ///     "+---+---+---+----+----+----+",
     ///     "| 1 | 2 | 3 | 1  | 2  | 3  |",
-    ///     "+---+---+---+----+----+----+"
+    ///     "+---+---+---+----+----+----+",
     /// ];
     /// assert_batches_sorted_eq!(expected, &join.collect().await?);
     /// # Ok(())
     /// # }
     /// ```
-    ///
     pub fn join(
         self,
         right: DataFrame,
@@ -1259,7 +1337,7 @@ impl DataFrame {
     ///     "+---+---+---+----+----+----+",
     ///     "| a | b | c | a2 | b2 | c2 |",
     ///     "+---+---+---+----+----+----+",
-    ///     "+---+---+---+----+----+----+"
+    ///     "+---+---+---+----+----+----+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?);
     /// # Ok(())
@@ -1291,7 +1369,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
     /// let expected = vec![
     ///     "+---+---+---+",
@@ -1300,7 +1380,7 @@ impl DataFrame {
     ///     "| 1 | 2 | 3 |",
     ///     "| 4 | 5 | 6 |",
     ///     "| 7 | 8 | 9 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df1.collect().await?);
     /// # Ok(())
@@ -1329,7 +1409,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let count = df.count().await?; // 1
     /// # assert_eq!(count, 1);
     /// # Ok(())
@@ -1337,7 +1419,10 @@ impl DataFrame {
     /// ```
     pub async fn count(self) -> Result<usize> {
         let rows = self
-            .aggregate(vec![], vec![count(Expr::Literal(COUNT_STAR_EXPANSION))])?
+            .aggregate(
+                vec![],
+                vec![count(Expr::Literal(COUNT_STAR_EXPANSION, None))],
+            )?
             .collect()
             .await?;
         let len = *rows
@@ -1345,9 +1430,9 @@ impl DataFrame {
             .and_then(|r| r.columns().first())
             .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
             .and_then(|a| a.values().first())
-            .ok_or(DataFusionError::Internal(
-                "Unexpected output when collecting for count()".to_string(),
-            ))? as usize;
+            .ok_or_else(|| {
+                internal_datafusion_err!("Unexpected output when collecting for count()")
+            })? as usize;
         Ok(len)
     }
 
@@ -1365,7 +1450,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let batches = df.collect().await?;
     /// # Ok(())
     /// # }
@@ -1385,7 +1472,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// df.show().await?;
     /// # Ok(())
     /// # }
@@ -1444,7 +1533,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// df.show_limit(10).await?;
     /// # Ok(())
     /// # }
@@ -1470,7 +1561,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let stream = df.execute_stream().await?;
     /// # Ok(())
     /// # }
@@ -1496,7 +1589,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let batches = df.collect_partitioned().await?;
     /// # Ok(())
     /// # }
@@ -1516,7 +1611,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let batches = df.execute_stream_partitioned().await?;
     /// # Ok(())
     /// # }
@@ -1545,7 +1642,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let schema = df.schema();
     /// # Ok(())
     /// # }
@@ -1596,12 +1695,26 @@ impl DataFrame {
     /// Note: This discards the [`SessionState`] associated with this
     /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`]
     pub fn into_view(self) -> Arc<dyn TableProvider> {
-        Arc::new(DataFrameTableProvider { plan: self.plan })
+        Arc::new(DataFrameTableProvider {
+            plan: self.plan,
+            table_type: TableType::View,
+        })
+    }
+
+    /// See [`Self::into_view`]. The returned [`TableProvider`] will
+    /// create a transient table.
+    pub fn into_temporary_view(self) -> Arc<dyn TableProvider> {
+        Arc::new(DataFrameTableProvider {
+            plan: self.plan,
+            table_type: TableType::Temporary,
+        })
     }
 
     /// Return a DataFrame with the explanation of its plan so far.
     ///
     /// if `analyze` is specified, runs the plan and reports metrics
+    /// if `verbose` is true, prints out additional details.
+    /// The default format is Indent format.
     ///
     /// ```
     /// # use datafusion::prelude::*;
@@ -1609,17 +1722,60 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-    /// let batches = df.limit(0, Some(100))?.explain(false, false)?.collect().await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let batches = df
+    ///     .limit(0, Some(100))?
+    ///     .explain(false, false)?
+    ///     .collect()
+    ///     .await?;
     /// # Ok(())
     /// # }
     /// ```
     pub fn explain(self, verbose: bool, analyze: bool) -> Result<DataFrame> {
+        // Set the default format to Indent to keep the previous behavior
+        let opts = ExplainOption::default()
+            .with_verbose(verbose)
+            .with_analyze(analyze);
+        self.explain_with_options(opts)
+    }
+
+    /// Return a DataFrame with the explanation of its plan so far.
+    ///
+    /// `opt` is used to specify the options for the explain operation.
+    /// Details of the options can be found in [`ExplainOption`].
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// use datafusion_expr::{Explain, ExplainOption};
+    /// let ctx = SessionContext::new();
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let batches = df
+    ///     .limit(0, Some(100))?
+    ///     .explain_with_options(
+    ///         ExplainOption::default()
+    ///             .with_verbose(false)
+    ///             .with_analyze(false),
+    ///     )?
+    ///     .collect()
+    ///     .await?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn explain_with_options(
+        self,
+        explain_option: ExplainOption,
+    ) -> Result<DataFrame> {
         if matches!(self.plan, LogicalPlan::Explain(_)) {
             return plan_err!("Nested EXPLAINs are not supported");
         }
         let plan = LogicalPlanBuilder::from(self.plan)
-            .explain(verbose, analyze)?
+            .explain_option_format(explain_option)?
             .build()?;
         Ok(DataFrame {
             session_state: self.session_state,
@@ -1637,7 +1793,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let f = df.registry();
     /// // use f.udf("name", vec![...]) to use the udf
     /// # Ok(())
@@ -1656,15 +1814,19 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-    /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let d2 = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.intersect(d2)?;
     /// let expected = vec![
     ///     "+---+---+---+",
     ///     "| a | b | c |",
     ///     "+---+---+---+",
     ///     "| 1 | 2 | 3 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
     /// # Ok(())
@@ -1681,6 +1843,44 @@ impl DataFrame {
         })
     }
 
+    /// Calculate the distinct intersection of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use datafusion_common::assert_batches_sorted_eq;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let d2 = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df = df.intersect_distinct(d2)?;
+    /// let expected = vec![
+    ///     "+---+---+---+",
+    ///     "| a | b | c |",
+    ///     "+---+---+---+",
+    ///     "| 1 | 2 | 3 |",
+    ///     "+---+---+---+",
+    /// ];
+    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn intersect_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
+        let left_plan = self.plan;
+        let right_plan = dataframe.plan;
+        let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, false)?;
+        Ok(DataFrame {
+            session_state: self.session_state,
+            plan,
+            projection_requires_validation: true,
+        })
+    }
+
     /// Calculate the exception of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
     ///
     /// ```
@@ -1690,8 +1890,12 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?;
-    /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let d2 = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let result = df.except(d2)?;
     /// // those columns are not in example.csv, but in example_long.csv
     /// let expected = vec![
@@ -1700,7 +1904,7 @@ impl DataFrame {
     ///     "+---+---+---+",
     ///     "| 4 | 5 | 6 |",
     ///     "| 7 | 8 | 9 |",
-    ///     "+---+---+---+"
+    ///     "+---+---+---+",
     /// ];
     /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
     /// # Ok(())
@@ -1717,6 +1921,46 @@ impl DataFrame {
         })
     }
 
+    /// Calculate the distinct exception of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use datafusion_common::assert_batches_sorted_eq;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx
+    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let d2 = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let result = df.except_distinct(d2)?;
+    /// // those columns are not in example.csv, but in example_long.csv
+    /// let expected = vec![
+    ///     "+---+---+---+",
+    ///     "| a | b | c |",
+    ///     "+---+---+---+",
+    ///     "| 4 | 5 | 6 |",
+    ///     "| 7 | 8 | 9 |",
+    ///     "+---+---+---+",
+    /// ];
+    /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn except_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
+        let left_plan = self.plan;
+        let right_plan = dataframe.plan;
+        let plan = LogicalPlanBuilder::except(left_plan, right_plan, false)?;
+        Ok(DataFrame {
+            session_state: self.session_state,
+            plan,
+            projection_requires_validation: true,
+        })
+    }
+
     /// Execute this `DataFrame` and write the results to `table_name`.
     ///
     /// Returns a single [RecordBatch] containing a single column and
@@ -1777,13 +2021,15 @@ impl DataFrame {
     /// use datafusion::dataframe::DataFrameWriteOptions;
     /// let ctx = SessionContext::new();
     /// // Sort the data by column "b" and write it to a new location
-    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
-    ///   .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
-    ///   .write_csv(
-    ///     "output.csv",
-    ///     DataFrameWriteOptions::new(),
-    ///     None, // can also specify CSV writing options here
-    /// ).await?;
+    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?
+    ///     .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
+    ///     .write_csv(
+    ///         "output.csv",
+    ///         DataFrameWriteOptions::new(),
+    ///         None, // can also specify CSV writing options here
+    ///     )
+    ///     .await?;
     /// # fs::remove_file("output.csv")?;
     /// # Ok(())
     /// # }
@@ -1809,6 +2055,8 @@ impl DataFrame {
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -1821,7 +2069,7 @@ impl DataFrame {
             plan,
             path.into(),
             file_type,
-            HashMap::new(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -1847,13 +2095,11 @@ impl DataFrame {
     /// use datafusion::dataframe::DataFrameWriteOptions;
     /// let ctx = SessionContext::new();
     /// // Sort the data by column "b" and write it to a new location
-    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
-    ///   .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
-    ///   .write_json(
-    ///     "output.json",
-    ///     DataFrameWriteOptions::new(),
-    ///     None
-    /// ).await?;
+    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?
+    ///     .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
+    ///     .write_json("output.json", DataFrameWriteOptions::new(), None)
+    ///     .await?;
     /// # fs::remove_file("output.json")?;
     /// # Ok(())
     /// # }
@@ -1879,6 +2125,8 @@ impl DataFrame {
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -1891,7 +2139,7 @@ impl DataFrame {
             plan,
             path.into(),
             file_type,
-            Default::default(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -1914,39 +2162,48 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.with_column("ab_sum", col("a") + col("b"))?;
     /// # Ok(())
     /// # }
     /// ```
     pub fn with_column(self, name: &str, expr: Expr) -> Result<DataFrame> {
-        let window_func_exprs = find_window_exprs(std::slice::from_ref(&expr));
+        let window_func_exprs = find_window_exprs([&expr]);
+
+        let original_names: HashSet<String> = self
+            .plan
+            .schema()
+            .iter()
+            .map(|(_, f)| f.name().clone())
+            .collect();
 
-        let (window_fn_str, plan) = if window_func_exprs.is_empty() {
-            (None, self.plan)
+        // Maybe build window plan
+        let plan = if window_func_exprs.is_empty() {
+            self.plan
         } else {
-            (
-                Some(window_func_exprs[0].to_string()),
-                LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?,
-            )
+            LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
         };
 
-        let mut col_exists = false;
         let new_column = expr.alias(name);
+        let mut col_exists = false;
+
         let mut fields: Vec<(Expr, bool)> = plan
             .schema()
             .iter()
             .filter_map(|(qualifier, field)| {
+                // Skip new fields introduced by window_plan
+                if !original_names.contains(field.name()) {
+                    return None;
+                }
+
                 if field.name() == name {
                     col_exists = true;
                     Some((new_column.clone(), true))
                 } else {
                     let e = col(Column::from((qualifier, field)));
-                    window_fn_str
-                        .as_ref()
-                        .filter(|s| *s == &e.to_string())
-                        .is_none()
-                        .then_some((e, self.projection_requires_validation))
+                    Some((e, self.projection_requires_validation))
                 }
             })
             .collect();
@@ -1981,7 +2238,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.with_column_renamed("ab_sum", "total")?;
     ///
     /// # Ok(())
@@ -2007,10 +2266,11 @@ impl DataFrame {
             match self.plan.schema().qualified_field_from_column(&old_column) {
                 Ok(qualifier_and_field) => qualifier_and_field,
                 // no-op if field not found
-                Err(DataFusionError::SchemaError(
-                    SchemaError::FieldNotFound { .. },
-                    _,
-                )) => return Ok(self),
+                Err(DataFusionError::SchemaError(e, _))
+                    if matches!(*e, SchemaError::FieldNotFound { .. }) =>
+                {
+                    return Ok(self);
+                }
                 Err(err) => return Err(err),
             };
         let projection = self
@@ -2018,7 +2278,7 @@ impl DataFrame {
             .schema()
             .iter()
             .map(|(qualifier, field)| {
-                if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename {
+                if qualifier.eq(&qualifier_rename) && field == field_rename {
                     (
                         col(Column::from((qualifier, field)))
                             .alias_qualified(qualifier.cloned(), new_name),
@@ -2107,26 +2367,38 @@ impl DataFrame {
 
     /// Cache DataFrame as a memory table.
     ///
+    /// Default behavior could be changed using
+    /// a [`crate::execution::session_state::CacheFactory`]
+    /// configured via [`SessionState`].
+    ///
     /// ```
     /// # use datafusion::prelude::*;
     /// # use datafusion::error::Result;
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// let df = df.cache().await?;
     /// # Ok(())
     /// # }
     /// ```
     pub async fn cache(self) -> Result<DataFrame> {
-        let context = SessionContext::new_with_state((*self.session_state).clone());
-        // The schema is consistent with the output
-        let plan = self.clone().create_physical_plan().await?;
-        let schema = plan.schema();
-        let task_ctx = Arc::new(self.task_ctx());
-        let partitions = collect_partitioned(plan, task_ctx).await?;
-        let mem_table = MemTable::try_new(schema, partitions)?;
-        context.read_table(Arc::new(mem_table))
+        if let Some(cache_factory) = self.session_state.cache_factory() {
+            let new_plan =
+                cache_factory.create(self.plan, self.session_state.as_ref())?;
+            Ok(Self::new(*self.session_state, new_plan))
+        } else {
+            let context = SessionContext::new_with_state((*self.session_state).clone());
+            // The schema is consistent with the output
+            let plan = self.clone().create_physical_plan().await?;
+            let schema = plan.schema();
+            let task_ctx = Arc::new(self.task_ctx());
+            let partitions = collect_partitioned(plan, task_ctx).await?;
+            let mem_table = MemTable::try_new(schema, partitions)?;
+            context.read_table(Arc::new(mem_table))
+        }
     }
 
     /// Apply an alias to the DataFrame.
@@ -2157,7 +2429,9 @@ impl DataFrame {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// // Fill nulls in only columns "a" and "c":
     /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?;
     /// // Fill nulls across all columns:
@@ -2165,6 +2439,7 @@ impl DataFrame {
     /// # Ok(())
     /// # }
     /// ```
+    #[expect(clippy::needless_pass_by_value)]
     pub fn fill_null(
         &self,
         value: ScalarValue,
@@ -2175,7 +2450,7 @@ impl DataFrame {
                 .schema()
                 .fields()
                 .iter()
-                .map(|f| f.as_ref().clone())
+                .map(Arc::clone)
                 .collect()
         } else {
             self.find_columns(&columns)?
@@ -2212,7 +2487,7 @@ impl DataFrame {
     }
 
     // Helper to find columns from names
-    fn find_columns(&self, names: &[String]) -> Result<Vec<Field>> {
+    fn find_columns(&self, names: &[String]) -> Result<Vec<FieldRef>> {
         let schema = self.logical_plan().schema();
         names
             .iter()
@@ -2225,12 +2500,54 @@ impl DataFrame {
             .collect()
     }
 
+    /// Find qualified columns for this dataframe from names
+    ///
+    /// # Arguments
+    /// * `names` - Unqualified names to find.
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use datafusion_common::ScalarValue;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// ctx.register_csv("first_table", "tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df = ctx.table("first_table").await?;
+    /// ctx.register_csv("second_table", "tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
+    /// let df2 = ctx.table("second_table").await?;
+    /// let join_expr = df.find_qualified_columns(&["a"])?.iter()
+    ///     .zip(df2.find_qualified_columns(&["a"])?.iter())
+    ///     .map(|(col1, col2)| col(*col1).eq(col(*col2)))
+    ///     .collect::<Vec<Expr>>();
+    /// let df3 = df.join_on(df2, JoinType::Inner, join_expr)?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn find_qualified_columns(
+        &self,
+        names: &[&str],
+    ) -> Result<Vec<(Option<&TableReference>, &FieldRef)>> {
+        let schema = self.logical_plan().schema();
+        names
+            .iter()
+            .map(|name| {
+                schema
+                    .qualified_field_from_column(&Column::from_name(*name))
+                    .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
+            })
+            .collect()
+    }
+
     /// Helper for creating DataFrame.
     /// # Example
     /// ```
-    /// use std::sync::Arc;
     /// use arrow::array::{ArrayRef, Int32Array, StringArray};
     /// use datafusion::prelude::DataFrame;
+    /// use std::sync::Arc;
     /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
     /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
     /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap();
@@ -2317,6 +2634,7 @@ macro_rules! dataframe {
 #[derive(Debug)]
 struct DataFrameTableProvider {
     plan: LogicalPlan,
+    table_type: TableType,
 }
 
 #[async_trait]
@@ -2325,7 +2643,7 @@ impl TableProvider for DataFrameTableProvider {
         self
     }
 
-    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+    fn get_logical_plan(&self) -> Option<Cow<'_, LogicalPlan>> {
         Some(Cow::Borrowed(&self.plan))
     }
 
@@ -2338,12 +2656,11 @@ impl TableProvider for DataFrameTableProvider {
     }
 
     fn schema(&self) -> SchemaRef {
-        let schema: Schema = self.plan.schema().as_ref().into();
-        Arc::new(schema)
+        Arc::clone(self.plan.schema().inner())
     }
 
     fn table_type(&self) -> TableType {
-        TableType::View
+        self.table_type
     }
 
     async fn scan(
diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs
index 1bb5444ca009f..e9c49a92843d6 100644
--- a/datafusion/core/src/dataframe/parquet.rs
+++ b/datafusion/core/src/dataframe/parquet.rs
@@ -42,13 +42,15 @@ impl DataFrame {
     /// use datafusion::dataframe::DataFrameWriteOptions;
     /// let ctx = SessionContext::new();
     /// // Sort the data by column "b" and write it to a new location
-    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
-    ///   .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
-    ///   .write_parquet(
-    ///     "output.parquet",
-    ///     DataFrameWriteOptions::new(),
-    ///     None, // can also specify parquet writing options here
-    /// ).await?;
+    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?
+    ///     .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
+    ///     .write_parquet(
+    ///         "output.parquet",
+    ///         DataFrameWriteOptions::new(),
+    ///         None, // can also specify parquet writing options here
+    ///     )
+    ///     .await?;
     /// # fs::remove_file("output.parquet")?;
     /// # Ok(())
     /// # }
@@ -74,6 +76,8 @@ impl DataFrame {
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -86,7 +90,7 @@ impl DataFrame {
             plan,
             path.into(),
             file_type,
-            Default::default(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -116,11 +120,26 @@ mod tests {
     use datafusion_execution::config::SessionConfig;
     use datafusion_expr::{col, lit};
 
+    #[cfg(feature = "parquet_encryption")]
+    use datafusion_common::config::ConfigFileEncryptionProperties;
     use object_store::local::LocalFileSystem;
     use parquet::file::reader::FileReader;
     use tempfile::TempDir;
     use url::Url;
 
+    /// Helper to extract a metric value by name from aggregated metrics.
+    fn metric_usize(
+        aggregated: &datafusion_physical_expr_common::metrics::MetricsSet,
+        name: &str,
+    ) -> usize {
+        aggregated
+            .iter()
+            .find(|m| m.value().name() == name)
+            .unwrap_or_else(|| panic!("should have {name} metric"))
+            .value()
+            .as_usize()
+    }
+
     #[tokio::test]
     async fn filter_pushdown_dataframe() -> Result<()> {
         let ctx = SessionContext::new();
@@ -146,7 +165,7 @@ mod tests {
         let plan = df.explain(false, false)?.collect().await?;
         // Filters all the way to Parquet
         let formatted = pretty::pretty_format_batches(&plan)?.to_string();
-        assert!(formatted.contains("FilterExec: id@0 = 1"));
+        assert!(formatted.contains("FilterExec: id@0 = 1"), "{formatted}");
 
         Ok(())
     }
@@ -205,7 +224,7 @@ mod tests {
             &HashMap::from_iter(
                 [("datafusion.execution.batch_size", "10")]
                     .iter()
-                    .map(|(s1, s2)| (s1.to_string(), s2.to_string())),
+                    .map(|(s1, s2)| ((*s1).to_string(), (*s2).to_string())),
             ),
         )?);
         register_aggregate_csv(&ctx, "aggregate_test_100").await?;
@@ -246,4 +265,350 @@ mod tests {
 
         Ok(())
     }
+
+    #[rstest::rstest]
+    #[cfg(feature = "parquet_encryption")]
+    #[tokio::test]
+    async fn roundtrip_parquet_with_encryption(
+        #[values(false, true)] allow_single_file_parallelism: bool,
+    ) -> Result<()> {
+        use parquet::encryption::decrypt::FileDecryptionProperties;
+        use parquet::encryption::encrypt::FileEncryptionProperties;
+
+        let test_df = test_util::test_table().await?;
+
+        let schema = test_df.schema();
+        let footer_key = b"0123456789012345".to_vec(); // 128bit/16
+        let column_key = b"1234567890123450".to_vec(); // 128bit/16
+
+        let mut encrypt = FileEncryptionProperties::builder(footer_key.clone());
+        let mut decrypt = FileDecryptionProperties::builder(footer_key.clone());
+
+        for field in schema.fields().iter() {
+            encrypt = encrypt.with_column_key(field.name().as_str(), column_key.clone());
+            decrypt = decrypt.with_column_key(field.name().as_str(), column_key.clone());
+        }
+
+        let encrypt = encrypt.build()?;
+        let decrypt = decrypt.build()?;
+
+        let df = test_df.clone();
+        let tmp_dir = TempDir::new()?;
+        let tempfile = tmp_dir.path().join("roundtrip.parquet");
+        let tempfile_str = tempfile.into_os_string().into_string().unwrap();
+
+        // Write encrypted parquet using write_parquet
+        let mut options = TableParquetOptions::default();
+        options.crypto.file_encryption =
+            Some(ConfigFileEncryptionProperties::from(&encrypt));
+        options.global.allow_single_file_parallelism = allow_single_file_parallelism;
+
+        df.write_parquet(
+            tempfile_str.as_str(),
+            DataFrameWriteOptions::new().with_single_file_output(true),
+            Some(options),
+        )
+        .await?;
+        let num_rows_written = test_df.count().await?;
+
+        // Read encrypted parquet
+        let ctx: SessionContext = SessionContext::new();
+        let read_options =
+            ParquetReadOptions::default().file_decryption_properties((&decrypt).into());
+
+        ctx.register_parquet("roundtrip_parquet", &tempfile_str, read_options.clone())
+            .await?;
+
+        let df_enc = ctx.sql("SELECT * FROM roundtrip_parquet").await?;
+        let num_rows_read = df_enc.count().await?;
+
+        assert_eq!(num_rows_read, num_rows_written);
+
+        // Read encrypted parquet and subset rows + columns
+        let encrypted_parquet_df = ctx.read_parquet(tempfile_str, read_options).await?;
+
+        // Select three columns and filter the results
+        // Test that the filter works as expected
+        let selected = encrypted_parquet_df
+            .clone()
+            .select_columns(&["c1", "c2", "c3"])?
+            .filter(col("c2").gt(lit(4)))?;
+
+        let num_rows_selected = selected.count().await?;
+        assert_eq!(num_rows_selected, 14);
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::SingleFile - explicitly request single file output
+    /// for paths WITHOUT file extensions. This verifies the fix for the regression
+    /// where extension heuristics ignored the explicit with_single_file_output(true).
+    #[tokio::test]
+    async fn test_file_output_mode_single_file() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        // Path WITHOUT .parquet extension - this is the key scenario
+        let output_path = tmp_dir.path().join("data_no_ext");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let df = ctx.read_batch(RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?)?;
+
+        // Explicitly request single file output
+        df.write_parquet(
+            output_path_str,
+            DataFrameWriteOptions::new().with_single_file_output(true),
+            None,
+        )
+        .await?;
+
+        // Verify: output should be a FILE, not a directory
+        assert!(
+            output_path.is_file(),
+            "Expected single file at {:?}, but got is_file={}, is_dir={}",
+            output_path,
+            output_path.is_file(),
+            output_path.is_dir()
+        );
+
+        // Verify the file is readable as parquet
+        let file = std::fs::File::open(&output_path)?;
+        let reader = parquet::file::reader::SerializedFileReader::new(file)?;
+        let metadata = reader.metadata();
+        assert_eq!(metadata.num_row_groups(), 1);
+        assert_eq!(metadata.file_metadata().num_rows(), 3);
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::Automatic - uses extension heuristic.
+    /// Path WITH extension -> single file; path WITHOUT extension -> directory.
+    #[tokio::test]
+    async fn test_file_output_mode_automatic() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?;
+
+        // Case 1: Path WITH extension -> should create single file (Automatic mode)
+        let output_with_ext = tmp_dir.path().join("data.parquet");
+        let df = ctx.read_batch(batch.clone())?;
+        df.write_parquet(
+            output_with_ext.to_str().unwrap(),
+            DataFrameWriteOptions::new(), // Automatic mode (default)
+            None,
+        )
+        .await?;
+
+        assert!(
+            output_with_ext.is_file(),
+            "Path with extension should be a single file, got is_file={}, is_dir={}",
+            output_with_ext.is_file(),
+            output_with_ext.is_dir()
+        );
+
+        // Case 2: Path WITHOUT extension -> should create directory (Automatic mode)
+        let output_no_ext = tmp_dir.path().join("data_dir");
+        let df = ctx.read_batch(batch)?;
+        df.write_parquet(
+            output_no_ext.to_str().unwrap(),
+            DataFrameWriteOptions::new(), // Automatic mode (default)
+            None,
+        )
+        .await?;
+
+        assert!(
+            output_no_ext.is_dir(),
+            "Path without extension should be a directory, got is_file={}, is_dir={}",
+            output_no_ext.is_file(),
+            output_no_ext.is_dir()
+        );
+
+        Ok(())
+    }
+
+    /// Test that ParquetSink exposes rows_written, bytes_written, and
+    /// elapsed_compute metrics via DataSinkExec.
+    #[tokio::test]
+    async fn test_parquet_sink_metrics() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+        use datafusion_execution::TaskContext;
+
+        use futures::TryStreamExt;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+        let output_path = tmp_dir.path().join("metrics_test.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        // Register a table with 100 rows
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("val", DataType::Int32, false),
+        ]));
+        let ids: Vec<i32> = (0..100).collect();
+        let vals: Vec<i32> = (100..200).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(ids)),
+                Arc::new(Int32Array::from(vals)),
+            ],
+        )?;
+        ctx.register_batch("source", batch)?;
+
+        // Create the physical plan for COPY TO
+        let df = ctx
+            .sql(&format!(
+                "COPY source TO '{output_path_str}' STORED AS PARQUET"
+            ))
+            .await?;
+        let plan = df.create_physical_plan().await?;
+
+        // Execute the plan
+        let task_ctx = Arc::new(TaskContext::from(&ctx.state()));
+        let stream = plan.execute(0, task_ctx)?;
+        let _batches: Vec<_> = stream.try_collect().await?;
+
+        // Check metrics on the DataSinkExec (top-level plan)
+        let metrics = plan
+            .metrics()
+            .expect("DataSinkExec should return metrics from ParquetSink");
+        let aggregated = metrics.aggregate_by_name();
+
+        // rows_written should be 100
+        assert_eq!(
+            metric_usize(&aggregated, "rows_written"),
+            100,
+            "expected 100 rows written"
+        );
+
+        // bytes_written should be > 0
+        let bytes_written = metric_usize(&aggregated, "bytes_written");
+        assert!(
+            bytes_written > 0,
+            "expected bytes_written > 0, got {bytes_written}"
+        );
+
+        // elapsed_compute should be > 0
+        let elapsed = metric_usize(&aggregated, "elapsed_compute");
+        assert!(elapsed > 0, "expected elapsed_compute > 0");
+
+        Ok(())
+    }
+
+    /// Test that ParquetSink metrics work with single_file_parallelism enabled.
+    #[tokio::test]
+    async fn test_parquet_sink_metrics_parallel() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+        use datafusion_execution::TaskContext;
+
+        use futures::TryStreamExt;
+
+        let ctx = SessionContext::new();
+        ctx.sql("SET datafusion.execution.parquet.allow_single_file_parallelism = true")
+            .await?
+            .collect()
+            .await?;
+
+        let tmp_dir = TempDir::new()?;
+        let output_path = tmp_dir.path().join("metrics_parallel.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let ids: Vec<i32> = (0..50).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(ids))],
+        )?;
+        ctx.register_batch("source2", batch)?;
+
+        let df = ctx
+            .sql(&format!(
+                "COPY source2 TO '{output_path_str}' STORED AS PARQUET"
+            ))
+            .await?;
+        let plan = df.create_physical_plan().await?;
+        let task_ctx = Arc::new(TaskContext::from(&ctx.state()));
+        let stream = plan.execute(0, task_ctx)?;
+        let _batches: Vec<_> = stream.try_collect().await?;
+
+        let metrics = plan.metrics().expect("DataSinkExec should return metrics");
+        let aggregated = metrics.aggregate_by_name();
+
+        assert_eq!(metric_usize(&aggregated, "rows_written"), 50);
+        assert!(metric_usize(&aggregated, "bytes_written") > 0);
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::Directory - explicitly request directory output
+    /// even for paths WITH file extensions.
+    #[tokio::test]
+    async fn test_file_output_mode_directory() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        // Path WITH .parquet extension but explicitly requesting directory output
+        let output_path = tmp_dir.path().join("output.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let df = ctx.read_batch(RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?)?;
+
+        // Explicitly request directory output (single_file_output = false)
+        df.write_parquet(
+            output_path_str,
+            DataFrameWriteOptions::new().with_single_file_output(false),
+            None,
+        )
+        .await?;
+
+        // Verify: output should be a DIRECTORY, not a single file
+        assert!(
+            output_path.is_dir(),
+            "Expected directory at {:?}, but got is_file={}, is_dir={}",
+            output_path,
+            output_path.is_file(),
+            output_path.is_dir()
+        );
+
+        // Verify the directory contains parquet file(s)
+        let entries: Vec<_> = std::fs::read_dir(&output_path)?
+            .filter_map(|e| e.ok())
+            .collect();
+        assert!(
+            !entries.is_empty(),
+            "Directory should contain at least one file"
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs
index b30d53e586911..50ee96da3dff0 100644
--- a/datafusion/core/src/datasource/dynamic_file.rs
+++ b/datafusion/core/src/datasource/dynamic_file.rs
@@ -20,8 +20,9 @@
 
 use std::sync::Arc;
 
-use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl};
 use crate::datasource::TableProvider;
+use crate::datasource::listing::ListingTableConfigExt;
+use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl};
 use crate::error::Result;
 use crate::execution::context::SessionState;
 
diff --git a/datafusion/core/src/datasource/empty.rs b/datafusion/core/src/datasource/empty.rs
index 77686c5eb7c27..5aeca92b1626d 100644
--- a/datafusion/core/src/datasource/empty.rs
+++ b/datafusion/core/src/datasource/empty.rs
@@ -28,8 +28,8 @@ use datafusion_common::project_schema;
 use crate::datasource::{TableProvider, TableType};
 use crate::error::Result;
 use crate::logical_expr::Expr;
-use datafusion_physical_plan::empty::EmptyExec;
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::empty::EmptyExec;
 
 /// An empty plan that is useful for testing and generating plans
 /// without mapping them to actual data.
diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs
index b620ff62d9a65..338de76b1353b 100644
--- a/datafusion/core/src/datasource/file_format/arrow.rs
+++ b/datafusion/core/src/datasource/file_format/arrow.rs
@@ -15,504 +15,97 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`ArrowFormat`]: Apache Arrow [`FileFormat`] abstractions
-//!
-//! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format)
+//! Re-exports the [`datafusion_datasource_arrow::file_format`] module, and contains tests for it.
+pub use datafusion_datasource_arrow::file_format::*;
 
-use std::any::Any;
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::fmt::{self, Debug};
-use std::sync::Arc;
-
-use super::file_compression_type::FileCompressionType;
-use super::write::demux::DemuxedStreamReceiver;
-use super::write::SharedBuffer;
-use super::FileFormatFactory;
-use crate::datasource::file_format::write::get_writer_schema;
-use crate::datasource::file_format::FileFormat;
-use crate::datasource::physical_plan::{ArrowSource, FileSink, FileSinkConfig};
-use crate::error::Result;
-use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
-
-use arrow::datatypes::{Schema, SchemaRef};
-use arrow::error::ArrowError;
-use arrow::ipc::convert::fb_to_schema;
-use arrow::ipc::reader::FileReader;
-use arrow::ipc::writer::IpcWriteOptions;
-use arrow::ipc::{root_as_message, CompressionType};
-use datafusion_catalog::Session;
-use datafusion_common::parsers::CompressionTypeVariant;
-use datafusion_common::{
-    not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION,
-};
-use datafusion_common_runtime::{JoinSet, SpawnedTask};
-use datafusion_datasource::display::FileGroupDisplay;
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
-use datafusion_datasource::sink::{DataSink, DataSinkExec};
-use datafusion_datasource::write::ObjectWriterBuilder;
-use datafusion_execution::{SendableRecordBatchStream, TaskContext};
-use datafusion_expr::dml::InsertOp;
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
-
-use async_trait::async_trait;
-use bytes::Bytes;
-use datafusion_datasource::source::DataSourceExec;
-use futures::stream::BoxStream;
-use futures::StreamExt;
-use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
-use tokio::io::AsyncWriteExt;
-
-/// Initial writing buffer size. Note this is just a size hint for efficiency. It
-/// will grow beyond the set value if needed.
-const INITIAL_BUFFER_BYTES: usize = 1048576;
-
-/// If the buffered Arrow data exceeds this size, it is flushed to object store
-const BUFFER_FLUSH_BYTES: usize = 1024000;
-
-#[derive(Default, Debug)]
-/// Factory struct used to create [ArrowFormat]
-pub struct ArrowFormatFactory;
-
-impl ArrowFormatFactory {
-    /// Creates an instance of [ArrowFormatFactory]
-    pub fn new() -> Self {
-        Self {}
-    }
-}
-
-impl FileFormatFactory for ArrowFormatFactory {
-    fn create(
-        &self,
-        _state: &dyn Session,
-        _format_options: &HashMap<String, String>,
-    ) -> Result<Arc<dyn FileFormat>> {
-        Ok(Arc::new(ArrowFormat))
-    }
-
-    fn default(&self) -> Arc<dyn FileFormat> {
-        Arc::new(ArrowFormat)
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
-impl GetExt for ArrowFormatFactory {
-    fn get_ext(&self) -> String {
-        // Removes the dot, i.e. ".parquet" -> "parquet"
-        DEFAULT_ARROW_EXTENSION[1..].to_string()
-    }
-}
-
-/// Arrow `FileFormat` implementation.
-#[derive(Default, Debug)]
-pub struct ArrowFormat;
-
-#[async_trait]
-impl FileFormat for ArrowFormat {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn get_ext(&self) -> String {
-        ArrowFormatFactory::new().get_ext()
-    }
-
-    fn get_ext_with_compression(
-        &self,
-        file_compression_type: &FileCompressionType,
-    ) -> Result<String> {
-        let ext = self.get_ext();
-        match file_compression_type.get_variant() {
-            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
-            _ => Err(DataFusionError::Internal(
-                "Arrow FileFormat does not support compression.".into(),
-            )),
-        }
-    }
-
-    async fn infer_schema(
-        &self,
-        _state: &dyn Session,
-        store: &Arc<dyn ObjectStore>,
-        objects: &[ObjectMeta],
-    ) -> Result<SchemaRef> {
-        let mut schemas = vec![];
-        for object in objects {
-            let r = store.as_ref().get(&object.location).await?;
-            let schema = match r.payload {
-                #[cfg(not(target_arch = "wasm32"))]
-                GetResultPayload::File(mut file, _) => {
-                    let reader = FileReader::try_new(&mut file, None)?;
-                    reader.schema()
-                }
-                GetResultPayload::Stream(stream) => {
-                    infer_schema_from_file_stream(stream).await?
-                }
-            };
-            schemas.push(schema.as_ref().clone());
-        }
-        let merged_schema = Schema::try_merge(schemas)?;
-        Ok(Arc::new(merged_schema))
-    }
-
-    async fn infer_stats(
-        &self,
-        _state: &dyn Session,
-        _store: &Arc<dyn ObjectStore>,
-        table_schema: SchemaRef,
-        _object: &ObjectMeta,
-    ) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&table_schema))
-    }
-
-    async fn create_physical_plan(
-        &self,
-        _state: &dyn Session,
-        conf: FileScanConfig,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        let source = Arc::new(ArrowSource::default());
-        let config = FileScanConfigBuilder::from(conf)
-            .with_source(source)
-            .build();
-
-        Ok(DataSourceExec::from_data_source(config))
-    }
-
-    async fn create_writer_physical_plan(
-        &self,
-        input: Arc<dyn ExecutionPlan>,
-        _state: &dyn Session,
-        conf: FileSinkConfig,
-        order_requirements: Option<LexRequirement>,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        if conf.insert_op != InsertOp::Append {
-            return not_impl_err!("Overwrites are not implemented yet for Arrow format");
-        }
-
-        let sink = Arc::new(ArrowFileSink::new(conf));
-
-        Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
-    }
-
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(ArrowSource::default())
-    }
-}
-
-/// Implements [`FileSink`] for writing to arrow_ipc files
-struct ArrowFileSink {
-    config: FileSinkConfig,
-}
-
-impl ArrowFileSink {
-    fn new(config: FileSinkConfig) -> Self {
-        Self { config }
-    }
-}
-
-#[async_trait]
-impl FileSink for ArrowFileSink {
-    fn config(&self) -> &FileSinkConfig {
-        &self.config
-    }
-
-    async fn spawn_writer_tasks_and_join(
-        &self,
-        context: &Arc<TaskContext>,
-        demux_task: SpawnedTask<Result<()>>,
-        mut file_stream_rx: DemuxedStreamReceiver,
-        object_store: Arc<dyn ObjectStore>,
-    ) -> Result<u64> {
-        let mut file_write_tasks: JoinSet<std::result::Result<usize, DataFusionError>> =
-            JoinSet::new();
-
-        let ipc_options =
-            IpcWriteOptions::try_new(64, false, arrow_ipc::MetadataVersion::V5)?
-                .try_with_compression(Some(CompressionType::LZ4_FRAME))?;
-        while let Some((path, mut rx)) = file_stream_rx.recv().await {
-            let shared_buffer = SharedBuffer::new(INITIAL_BUFFER_BYTES);
-            let mut arrow_writer = arrow_ipc::writer::FileWriter::try_new_with_options(
-                shared_buffer.clone(),
-                &get_writer_schema(&self.config),
-                ipc_options.clone(),
-            )?;
-            let mut object_store_writer = ObjectWriterBuilder::new(
-                FileCompressionType::UNCOMPRESSED,
-                &path,
-                Arc::clone(&object_store),
-            )
-            .with_buffer_size(Some(
-                context
-                    .session_config()
-                    .options()
-                    .execution
-                    .objectstore_writer_buffer_size,
-            ))
-            .build()?;
-            file_write_tasks.spawn(async move {
-                let mut row_count = 0;
-                while let Some(batch) = rx.recv().await {
-                    row_count += batch.num_rows();
-                    arrow_writer.write(&batch)?;
-                    let mut buff_to_flush = shared_buffer.buffer.try_lock().unwrap();
-                    if buff_to_flush.len() > BUFFER_FLUSH_BYTES {
-                        object_store_writer
-                            .write_all(buff_to_flush.as_slice())
-                            .await?;
-                        buff_to_flush.clear();
-                    }
-                }
-                arrow_writer.finish()?;
-                let final_buff = shared_buffer.buffer.try_lock().unwrap();
-
-                object_store_writer.write_all(final_buff.as_slice()).await?;
-                object_store_writer.shutdown().await?;
-                Ok(row_count)
-            });
-        }
-
-        let mut row_count = 0;
-        while let Some(result) = file_write_tasks.join_next().await {
-            match result {
-                Ok(r) => {
-                    row_count += r?;
-                }
-                Err(e) => {
-                    if e.is_panic() {
-                        std::panic::resume_unwind(e.into_panic());
-                    } else {
-                        unreachable!();
-                    }
-                }
-            }
-        }
-
-        demux_task
-            .join_unwind()
-            .await
-            .map_err(DataFusionError::ExecutionJoin)??;
-        Ok(row_count as u64)
-    }
-}
-
-impl Debug for ArrowFileSink {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("ArrowFileSink").finish()
-    }
-}
-
-impl DisplayAs for ArrowFileSink {
-    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match t {
-            DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                write!(f, "ArrowFileSink(file_groups=",)?;
-                FileGroupDisplay(&self.config.file_group).fmt_as(t, f)?;
-                write!(f, ")")
-            }
-            DisplayFormatType::TreeRender => {
-                writeln!(f, "format: arrow")?;
-                write!(f, "file={}", &self.config.original_url)
-            }
-        }
-    }
-}
-
-#[async_trait]
-impl DataSink for ArrowFileSink {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> &SchemaRef {
-        self.config.output_schema()
-    }
-
-    async fn write_all(
-        &self,
-        data: SendableRecordBatchStream,
-        context: &Arc<TaskContext>,
-    ) -> Result<u64> {
-        FileSink::write_all(self, data, context).await
-    }
-}
-
-const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1'];
-const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
-
-/// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
-/// See <https://github.com/apache/arrow-rs/issues/5021>
-async fn infer_schema_from_file_stream(
-    mut stream: BoxStream<'static, object_store::Result<Bytes>>,
-) -> Result<SchemaRef> {
-    // Expected format:
-    // <magic number "ARROW1"> - 6 bytes
-    // <empty padding bytes [to 8 byte boundary]> - 2 bytes
-    // <continuation: 0xFFFFFFFF> - 4 bytes, not present below v0.15.0
-    // <metadata_size: int32> - 4 bytes
-    // <metadata_flatbuffer: bytes>
-    // <rest of file bytes>
-
-    // So in first read we need at least all known sized sections,
-    // which is 6 + 2 + 4 + 4 = 16 bytes.
-    let bytes = collect_at_least_n_bytes(&mut stream, 16, None).await?;
-
-    // Files should start with these magic bytes
-    if bytes[0..6] != ARROW_MAGIC {
-        return Err(ArrowError::ParseError(
-            "Arrow file does not contain correct header".to_string(),
-        ))?;
-    }
-
-    // Since continuation marker bytes added in later versions
-    let (meta_len, rest_of_bytes_start_index) = if bytes[8..12] == CONTINUATION_MARKER {
-        (&bytes[12..16], 16)
-    } else {
-        (&bytes[8..12], 12)
-    };
-
-    let meta_len = [meta_len[0], meta_len[1], meta_len[2], meta_len[3]];
-    let meta_len = i32::from_le_bytes(meta_len);
-
-    // Read bytes for Schema message
-    let block_data = if bytes[rest_of_bytes_start_index..].len() < meta_len as usize {
-        // Need to read more bytes to decode Message
-        let mut block_data = Vec::with_capacity(meta_len as usize);
-        // In case we had some spare bytes in our initial read chunk
-        block_data.extend_from_slice(&bytes[rest_of_bytes_start_index..]);
-        let size_to_read = meta_len as usize - block_data.len();
-        let block_data =
-            collect_at_least_n_bytes(&mut stream, size_to_read, Some(block_data)).await?;
-        Cow::Owned(block_data)
-    } else {
-        // Already have the bytes we need
-        let end_index = meta_len as usize + rest_of_bytes_start_index;
-        let block_data = &bytes[rest_of_bytes_start_index..end_index];
-        Cow::Borrowed(block_data)
-    };
+#[cfg(test)]
+mod tests {
+    use futures::StreamExt;
+    use std::sync::Arc;
 
-    // Decode Schema message
-    let message = root_as_message(&block_data).map_err(|err| {
-        ArrowError::ParseError(format!("Unable to read IPC message as metadata: {err:?}"))
-    })?;
-    let ipc_schema = message.header_as_schema().ok_or_else(|| {
-        ArrowError::IpcError("Unable to read IPC message as schema".to_string())
-    })?;
-    let schema = fb_to_schema(ipc_schema);
+    use arrow::array::{Int64Array, StringArray};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use arrow::record_batch::RecordBatch;
+    use datafusion_common::Result;
 
-    Ok(Arc::new(schema))
-}
+    use crate::execution::options::ArrowReadOptions;
+    use crate::prelude::SessionContext;
 
-async fn collect_at_least_n_bytes(
-    stream: &mut BoxStream<'static, object_store::Result<Bytes>>,
-    n: usize,
-    extend_from: Option<Vec<u8>>,
-) -> Result<Vec<u8>> {
-    let mut buf = extend_from.unwrap_or_else(|| Vec::with_capacity(n));
-    // If extending existing buffer then ensure we read n additional bytes
-    let n = n + buf.len();
-    while let Some(bytes) = stream.next().await.transpose()? {
-        buf.extend_from_slice(&bytes);
-        if buf.len() >= n {
-            break;
-        }
-    }
-    if buf.len() < n {
-        return Err(ArrowError::ParseError(
-            "Unexpected end of byte stream for Arrow IPC file".to_string(),
-        ))?;
-    }
-    Ok(buf)
-}
+    #[tokio::test]
+    async fn test_write_empty_arrow_from_sql() -> Result<()> {
+        let ctx = SessionContext::new();
 
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::execution::context::SessionContext;
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_sql.arrow", tmp_dir.path().to_string_lossy());
 
-    use chrono::DateTime;
-    use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path};
+        ctx.sql(&format!(
+            "COPY (SELECT CAST(1 AS BIGINT) AS id LIMIT 0) TO '{path}' STORED AS ARROW",
+        ))
+        .await?
+        .collect()
+        .await?;
 
-    #[tokio::test]
-    async fn test_infer_schema_stream() -> Result<()> {
-        let mut bytes = std::fs::read("tests/data/example.arrow")?;
-        bytes.truncate(bytes.len() - 20); // mangle end to show we don't need to read whole file
-        let location = Path::parse("example.arrow")?;
-        let in_memory_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
-        in_memory_store.put(&location, bytes.into()).await?;
+        assert!(std::path::Path::new(&path).exists());
 
-        let session_ctx = SessionContext::new();
-        let state = session_ctx.state();
-        let object_meta = ObjectMeta {
-            location,
-            last_modified: DateTime::default(),
-            size: u64::MAX,
-            e_tag: None,
-            version: None,
-        };
+        let read_df = ctx.read_arrow(&path, ArrowReadOptions::default()).await?;
+        let stream = read_df.execute_stream().await?;
 
-        let arrow_format = ArrowFormat {};
-        let expected = vec!["f0: Int64", "f1: Utf8", "f2: Boolean"];
+        assert_eq!(stream.schema().fields().len(), 1);
+        assert_eq!(stream.schema().field(0).name(), "id");
 
-        // Test chunk sizes where too small so we keep having to read more bytes
-        // And when large enough that first read contains all we need
-        for chunk_size in [7, 3000] {
-            let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), chunk_size));
-            let inferred_schema = arrow_format
-                .infer_schema(
-                    &state,
-                    &(store.clone() as Arc<dyn ObjectStore>),
-                    std::slice::from_ref(&object_meta),
-                )
-                .await?;
-            let actual_fields = inferred_schema
-                .fields()
-                .iter()
-                .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
-                .collect::<Vec<_>>();
-            assert_eq!(expected, actual_fields);
-        }
+        let results: Vec<_> = stream.collect().await;
+        let total_rows: usize = results
+            .iter()
+            .filter_map(|r| r.as_ref().ok())
+            .map(|b| b.num_rows())
+            .sum();
+        assert_eq!(total_rows, 0);
 
         Ok(())
     }
 
     #[tokio::test]
-    async fn test_infer_schema_short_stream() -> Result<()> {
-        let mut bytes = std::fs::read("tests/data/example.arrow")?;
-        bytes.truncate(20); // should cause error that file shorter than expected
-        let location = Path::parse("example.arrow")?;
-        let in_memory_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
-        in_memory_store.put(&location, bytes.into()).await?;
-
-        let session_ctx = SessionContext::new();
-        let state = session_ctx.state();
-        let object_meta = ObjectMeta {
-            location,
-            last_modified: DateTime::default(),
-            size: u64::MAX,
-            e_tag: None,
-            version: None,
-        };
-
-        let arrow_format = ArrowFormat {};
-
-        let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), 7));
-        let err = arrow_format
-            .infer_schema(
-                &state,
-                &(store.clone() as Arc<dyn ObjectStore>),
-                std::slice::from_ref(&object_meta),
-            )
-            .await;
-
-        assert!(err.is_err());
-        assert_eq!(
-            "Arrow error: Parser error: Unexpected end of byte stream for Arrow IPC file",
-            err.unwrap_err().to_string()
-        );
+    async fn test_write_empty_arrow_from_record_batch() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+        let empty_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int64Array::from(Vec::<i64>::new())),
+                Arc::new(StringArray::from(Vec::<Option<&str>>::new())),
+            ],
+        )?;
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_batch.arrow", tmp_dir.path().to_string_lossy());
+
+        ctx.register_batch("empty_table", empty_batch)?;
+
+        ctx.sql(&format!("COPY empty_table TO '{path}' STORED AS ARROW"))
+            .await?
+            .collect()
+            .await?;
+
+        assert!(std::path::Path::new(&path).exists());
+
+        let read_df = ctx.read_arrow(&path, ArrowReadOptions::default()).await?;
+        let stream = read_df.execute_stream().await?;
+
+        assert_eq!(stream.schema().fields().len(), 2);
+        assert_eq!(stream.schema().field(0).name(), "id");
+        assert_eq!(stream.schema().field(1).name(), "name");
+
+        let results: Vec<_> = stream.collect().await;
+        let total_rows: usize = results
+            .iter()
+            .filter_map(|r| r.as_ref().ok())
+            .map(|b| b.num_rows())
+            .sum();
+        assert_eq!(total_rows, 0);
 
         Ok(())
     }
diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs
index 3428d08a6ae52..7cf23ee294d86 100644
--- a/datafusion/core/src/datasource/file_format/avro.rs
+++ b/datafusion/core/src/datasource/file_format/avro.rs
@@ -26,20 +26,21 @@ mod tests {
     use crate::{
         datasource::file_format::test_util::scan_format, prelude::SessionContext,
     };
-    use arrow::array::{as_string_array, Array};
+    use arrow::array::{Array, as_string_array};
     use datafusion_catalog::Session;
     use datafusion_common::test_util::batches_to_string;
     use datafusion_common::{
+        Result,
         cast::{
             as_binary_array, as_boolean_array, as_float32_array, as_float64_array,
             as_int32_array, as_timestamp_microsecond_array,
         },
-        test_util, Result,
+        test_util,
     };
 
     use datafusion_datasource_avro::AvroFormat;
     use datafusion_execution::config::SessionConfig;
-    use datafusion_physical_plan::{collect, ExecutionPlan};
+    use datafusion_physical_plan::{ExecutionPlan, collect};
     use futures::StreamExt;
     use insta::assert_snapshot;
 
@@ -94,7 +95,7 @@ mod tests {
             .schema()
             .fields()
             .iter()
-            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+            .map(|f| format!("{}: {}", f.name(), f.data_type()))
             .collect();
         assert_eq!(
             vec![
@@ -108,7 +109,7 @@ mod tests {
                 "double_col: Float64",
                 "date_string_col: Binary",
                 "string_col: Binary",
-                "timestamp_col: Timestamp(Microsecond, None)",
+                "timestamp_col: Timestamp(µs)",
             ],
             x
         );
@@ -116,20 +117,20 @@ mod tests {
         let batches = collect(exec, task_ctx).await?;
         assert_eq!(batches.len(), 1);
 
-        assert_snapshot!(batches_to_string(&batches),@r###"
-            +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
-            | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |
-            +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
-            | 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |
-            | 5  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30332f30312f3039 | 31         | 2009-03-01T00:01:00 |
-            | 6  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30342f30312f3039 | 30         | 2009-04-01T00:00:00 |
-            | 7  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30342f30312f3039 | 31         | 2009-04-01T00:01:00 |
-            | 2  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30322f30312f3039 | 30         | 2009-02-01T00:00:00 |
-            | 3  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30322f30312f3039 | 31         | 2009-02-01T00:01:00 |
-            | 0  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30312f30312f3039 | 30         | 2009-01-01T00:00:00 |
-            | 1  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30312f30312f3039 | 31         | 2009-01-01T00:01:00 |
-            +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
-        "###);
+        assert_snapshot!(batches_to_string(&batches),@r"
+        +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
+        | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |
+        +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
+        | 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |
+        | 5  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30332f30312f3039 | 31         | 2009-03-01T00:01:00 |
+        | 6  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30342f30312f3039 | 30         | 2009-04-01T00:00:00 |
+        | 7  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30342f30312f3039 | 31         | 2009-04-01T00:01:00 |
+        | 2  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30322f30312f3039 | 30         | 2009-02-01T00:00:00 |
+        | 3  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30322f30312f3039 | 31         | 2009-02-01T00:01:00 |
+        | 0  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30312f30312f3039 | 30         | 2009-01-01T00:00:00 |
+        | 1  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30312f30312f3039 | 31         | 2009-01-01T00:01:00 |
+        +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
+        ");
         Ok(())
     }
 
@@ -245,7 +246,10 @@ mod tests {
             values.push(array.value(i));
         }
 
-        assert_eq!("[1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000]", format!("{values:?}"));
+        assert_eq!(
+            "[1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000]",
+            format!("{values:?}")
+        );
 
         Ok(())
     }
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index efec07abbca05..a068b4f5c0413 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -32,11 +32,12 @@ mod tests {
     use crate::prelude::{CsvReadOptions, SessionConfig, SessionContext};
     use arrow_schema::{DataType, Field, Schema, SchemaRef};
     use datafusion_catalog::Session;
+    use datafusion_common::Result;
     use datafusion_common::cast::as_string_array;
+    use datafusion_common::config::CsvOptions;
     use datafusion_common::internal_err;
     use datafusion_common::stats::Precision;
     use datafusion_common::test_util::{arrow_test_data, batches_to_string};
-    use datafusion_common::Result;
     use datafusion_datasource::decoder::{
         BatchDeserializer, DecoderDeserializer, DeserializerOutput,
     };
@@ -44,10 +45,10 @@ mod tests {
     use datafusion_datasource::file_format::FileFormat;
     use datafusion_datasource::write::BatchSerializer;
     use datafusion_expr::{col, lit};
-    use datafusion_physical_plan::{collect, ExecutionPlan};
+    use datafusion_physical_plan::{ExecutionPlan, collect};
 
     use arrow::array::{
-        BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
+        Array, BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
     };
     use arrow::compute::concat_batches;
     use arrow::csv::ReaderBuilder;
@@ -55,14 +56,17 @@ mod tests {
     use async_trait::async_trait;
     use bytes::Bytes;
     use chrono::DateTime;
-    use futures::stream::BoxStream;
+    use datafusion_common::parsers::CompressionTypeVariant;
     use futures::StreamExt;
+    use futures::stream::BoxStream;
     use insta::assert_snapshot;
+    use object_store::chunked::ChunkedStore;
     use object_store::local::LocalFileSystem;
     use object_store::path::Path;
     use object_store::{
         Attributes, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload,
-        ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult,
+        ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions,
+        PutPayload, PutResult,
     };
     use regex::Regex;
     use rstest::*;
@@ -96,16 +100,22 @@ mod tests {
         async fn put_multipart_opts(
             &self,
             _location: &Path,
-            _opts: PutMultipartOpts,
+            _opts: PutMultipartOptions,
         ) -> object_store::Result<Box<dyn MultipartUpload>> {
             unimplemented!()
         }
 
-        async fn get(&self, location: &Path) -> object_store::Result<GetResult> {
+        async fn get_opts(
+            &self,
+            location: &Path,
+            _opts: GetOptions,
+        ) -> object_store::Result<GetResult> {
             let bytes = self.bytes_to_repeat.clone();
             let len = bytes.len() as u64;
             let range = 0..len * self.max_iterations;
             let arc = self.iterations_detected.clone();
+            #[expect(clippy::result_large_err)]
+            // closure only ever returns Ok; Err type is never constructed
             let stream = futures::stream::repeat_with(move || {
                 let arc_inner = arc.clone();
                 *arc_inner.lock().unwrap() += 1;
@@ -128,14 +138,6 @@ mod tests {
             })
         }
 
-        async fn get_opts(
-            &self,
-            _location: &Path,
-            _opts: GetOptions,
-        ) -> object_store::Result<GetResult> {
-            unimplemented!()
-        }
-
         async fn get_ranges(
             &self,
             _location: &Path,
@@ -144,14 +146,6 @@ mod tests {
             unimplemented!()
         }
 
-        async fn head(&self, _location: &Path) -> object_store::Result<ObjectMeta> {
-            unimplemented!()
-        }
-
-        async fn delete(&self, _location: &Path) -> object_store::Result<()> {
-            unimplemented!()
-        }
-
         fn list(
             &self,
             _prefix: Option<&Path>,
@@ -166,17 +160,21 @@ mod tests {
             unimplemented!()
         }
 
-        async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
-            unimplemented!()
-        }
-
-        async fn copy_if_not_exists(
+        async fn copy_opts(
             &self,
             _from: &Path,
             _to: &Path,
+            _options: object_store::CopyOptions,
         ) -> object_store::Result<()> {
             unimplemented!()
         }
+
+        fn delete_stream(
+            &self,
+            _locations: BoxStream<'static, object_store::Result<Path>>,
+        ) -> BoxStream<'static, object_store::Result<Path>> {
+            unimplemented!()
+        }
     }
 
     impl VariableStream {
@@ -468,6 +466,59 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_infer_schema_stream_null_chunks() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // a stream where each line is read as a separate chunk,
+        // data type for each chunk is inferred separately.
+        // +----+-----+----+
+        // | c1 | c2  | c3 |
+        // +----+-----+----+
+        // | 1  | 1.0 |    |  type: Int64, Float64, Null
+        // |    |     |    |  type: Null, Null, Null
+        // +----+-----+----+
+        let chunked_object_store = Arc::new(ChunkedStore::new(
+            Arc::new(VariableStream::new(
+                Bytes::from(
+                    r#"c1,c2,c3
+1,1.0,
+,,
+"#,
+                ),
+                1,
+            )),
+            1,
+        ));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        let csv_format = CsvFormat::default().with_has_header(true);
+        let inferred_schema = csv_format
+            .infer_schema(
+                &state,
+                &(chunked_object_store as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await?;
+
+        let actual_fields: Vec<_> = inferred_schema
+            .fields()
+            .iter()
+            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+            .collect();
+
+        // ensure null chunks don't skew type inference
+        assert_eq!(vec!["c1: Int64", "c2: Float64", "c3: Null"], actual_fields);
+        Ok(())
+    }
+
     #[rstest(
         file_compression_type,
         case(FileCompressionType::UNCOMPRESSED),
@@ -565,15 +616,15 @@ mod tests {
             .collect()
             .await?;
 
-        assert_snapshot!(batches_to_string(&record_batch), @r###"
-            +----+------+
-            | c2 | c3   |
-            +----+------+
-            | 5  | 36   |
-            | 5  | -31  |
-            | 5  | -101 |
-            +----+------+
-        "###);
+        assert_snapshot!(batches_to_string(&record_batch), @r"
+        +----+------+
+        | c2 | c3   |
+        +----+------+
+        | 5  | 36   |
+        | 5  | -31  |
+        | 5  | -101 |
+        +----+------+
+        ");
 
         Ok(())
     }
@@ -650,11 +701,11 @@ mod tests {
 
         let re = Regex::new(r"DataSourceExec: file_groups=\{(\d+) group").unwrap();
 
-        if let Some(captures) = re.captures(&plan) {
-            if let Some(match_) = captures.get(1) {
-                let n_partitions = match_.as_str().parse::<usize>().unwrap();
-                return Ok(n_partitions);
-            }
+        if let Some(captures) = re.captures(&plan)
+            && let Some(match_) = captures.get(1)
+        {
+            let n_partitions = match_.as_str().parse::<usize>().unwrap();
+            return Ok(n_partitions);
         }
 
         internal_err!("query contains no DataSourceExec")
@@ -680,13 +731,13 @@ mod tests {
         let query_result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
         +--------------+
         | sum(aggr.c2) |
         +--------------+
         | 285          |
         +--------------+
-        "###);
+        ");
         }
 
         assert_eq!(n_partitions, actual_partitions);
@@ -719,13 +770,13 @@ mod tests {
         let query_result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
         +--------------+
         | sum(aggr.c3) |
         +--------------+
         | 781          |
         +--------------+
-        "###);
+        ");
         }
 
         assert_eq!(1, actual_partitions); // Compressed csv won't be scanned in parallel
@@ -756,13 +807,13 @@ mod tests {
         let query_result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
         +--------------+
         | sum(aggr.c3) |
         +--------------+
         | 781          |
         +--------------+
-        "###);
+        ");
         }
 
         assert_eq!(1, actual_partitions); // csv won't be scanned in parallel when newlines_in_values is set
@@ -787,10 +838,10 @@ mod tests {
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
 
-        assert_snapshot!(batches_to_string(&query_result),@r###"
-            ++
-            ++
-        "###);
+        assert_snapshot!(batches_to_string(&query_result),@r"
+        ++
+        ++
+        ");
 
         Ok(())
     }
@@ -812,10 +863,136 @@ mod tests {
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
 
-        assert_snapshot!(batches_to_string(&query_result),@r###"
-            ++
-            ++
-        "###);
+        assert_snapshot!(batches_to_string(&query_result),@r"
+        ++
+        ++
+        ");
+
+        Ok(())
+    }
+
+    /// Read multiple csv files (some are empty) with header
+    ///
+    /// some_empty_with_header
+    /// ├── a_empty.csv
+    /// ├── b.csv
+    /// └── c_nulls_column.csv
+    ///
+    /// a_empty.csv:
+    /// c1,c2,c3
+    ///
+    /// b.csv:
+    /// c1,c2,c3
+    /// 1,1,1
+    /// 2,2,2
+    ///
+    /// c_nulls_column.csv:
+    /// c1,c2,c3
+    /// 3,3,
+    #[tokio::test]
+    async fn test_csv_some_empty_with_header() -> Result<()> {
+        let ctx = SessionContext::new();
+        ctx.register_csv(
+            "some_empty_with_header",
+            "tests/data/empty_files/some_empty_with_header",
+            CsvReadOptions::new().has_header(true),
+        )
+        .await?;
+
+        let query = "select sum(c3) from some_empty_with_header;";
+        let query_result = ctx.sql(query).await?.collect().await?;
+
+        assert_snapshot!(batches_to_string(&query_result),@r"
+        +--------------------------------+
+        | sum(some_empty_with_header.c3) |
+        +--------------------------------+
+        | 3                              |
+        +--------------------------------+
+        ");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_csv_extension_compressed() -> Result<()> {
+        // Write compressed CSV files
+        // Expect: under the directory, a file is created with ".csv.gz" extension
+        let ctx = SessionContext::new();
+
+        let df = ctx
+            .read_csv(
+                &format!("{}/csv/aggregate_test_100.csv", arrow_test_data()),
+                CsvReadOptions::default().has_header(true),
+            )
+            .await?;
+
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let path = format!("{}", tmp_dir.path().to_string_lossy());
+
+        let cfg1 = crate::dataframe::DataFrameWriteOptions::new();
+        let cfg2 = CsvOptions::default()
+            .with_has_header(true)
+            .with_compression(CompressionTypeVariant::GZIP);
+
+        df.write_csv(&path, cfg1, Some(cfg2)).await?;
+        assert!(std::path::Path::new(&path).exists());
+
+        let files: Vec<_> = std::fs::read_dir(&path).unwrap().collect();
+        assert_eq!(files.len(), 1);
+        assert!(
+            files
+                .last()
+                .unwrap()
+                .as_ref()
+                .unwrap()
+                .path()
+                .file_name()
+                .unwrap()
+                .to_str()
+                .unwrap()
+                .ends_with(".csv.gz")
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_csv_extension_uncompressed() -> Result<()> {
+        // Write plain uncompressed CSV files
+        // Expect: under the directory, a file is created with ".csv" extension
+        let ctx = SessionContext::new();
+
+        let df = ctx
+            .read_csv(
+                &format!("{}/csv/aggregate_test_100.csv", arrow_test_data()),
+                CsvReadOptions::default().has_header(true),
+            )
+            .await?;
+
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let path = format!("{}", tmp_dir.path().to_string_lossy());
+
+        let cfg1 = crate::dataframe::DataFrameWriteOptions::new();
+        let cfg2 = CsvOptions::default().with_has_header(true);
+
+        df.write_csv(&path, cfg1, Some(cfg2)).await?;
+        assert!(std::path::Path::new(&path).exists());
+
+        let files: Vec<_> = std::fs::read_dir(&path).unwrap().collect();
+        assert_eq!(files.len(), 1);
+        assert!(
+            files
+                .last()
+                .unwrap()
+                .as_ref()
+                .unwrap()
+                .path()
+                .file_name()
+                .unwrap()
+                .to_str()
+                .unwrap()
+                .ends_with(".csv")
+        );
 
         Ok(())
     }
@@ -854,10 +1031,10 @@ mod tests {
         let query = "select * from empty where random() > 0.5;";
         let query_result = ctx.sql(query).await?.collect().await?;
 
-        assert_snapshot!(batches_to_string(&query_result),@r###"
-            ++
-            ++
-        "###);
+        assert_snapshot!(batches_to_string(&query_result),@r"
+        ++
+        ++
+        ");
 
         Ok(())
     }
@@ -906,13 +1083,13 @@ mod tests {
         let query_result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
-            +---------------------+
-            | sum(empty.column_1) |
-            +---------------------+
-            | 10                  |
-            +---------------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
+        +---------------------+
+        | sum(empty.column_1) |
+        +---------------------+
+        | 10                  |
+        +---------------------+
+        ");}
 
         assert_eq!(n_partitions, actual_partitions); // Won't get partitioned if all files are empty
 
@@ -954,13 +1131,13 @@ mod tests {
             file_size
         };
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
         +-----------------------+
         | sum(one_col.column_1) |
         +-----------------------+
         | 50                    |
         +-----------------------+
-        "###);
+        ");
         }
 
         assert_eq!(expected_partitions, actual_partitions);
@@ -993,13 +1170,13 @@ mod tests {
         let query_result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_query_csv_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r###"
-            +---------------+
-            | sum_of_5_cols |
-            +---------------+
-            | 15            |
-            +---------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&query_result),@r"
+        +---------------+
+        | sum_of_5_cols |
+        +---------------+
+        | 15            |
+        +---------------+
+        ");}
 
         assert_eq!(n_partitions, actual_partitions);
 
@@ -1013,7 +1190,9 @@ mod tests {
     ) -> Result<()> {
         let schema = csv_schema();
         let generator = CsvBatchGenerator::new(batch_size, line_count);
-        let mut deserializer = csv_deserializer(batch_size, &schema);
+
+        let schema_clone = Arc::clone(&schema);
+        let mut deserializer = csv_deserializer(batch_size, &schema_clone);
 
         for data in generator {
             deserializer.digest(data);
@@ -1052,7 +1231,8 @@ mod tests {
     ) -> Result<()> {
         let schema = csv_schema();
         let generator = CsvBatchGenerator::new(batch_size, line_count);
-        let mut deserializer = csv_deserializer(batch_size, &schema);
+        let schema_clone = Arc::clone(&schema);
+        let mut deserializer = csv_deserializer(batch_size, &schema_clone);
 
         for data in generator {
             deserializer.digest(data);
@@ -1151,7 +1331,7 @@ mod tests {
     fn csv_values(line_number: usize) -> (i32, f64, bool, String) {
         let int_value = line_number as i32;
         let float_value = line_number as f64;
-        let bool_value = line_number % 2 == 0;
+        let bool_value = line_number.is_multiple_of(2);
         let char_value = format!("{line_number}-string");
         (int_value, float_value, bool_value, char_value)
     }
@@ -1174,4 +1354,271 @@ mod tests {
             .build_decoder();
         DecoderDeserializer::new(CsvDecoder::new(decoder))
     }
+
+    fn csv_deserializer_with_truncated(
+        batch_size: usize,
+        schema: &Arc<Schema>,
+    ) -> impl BatchDeserializer<Bytes> {
+        // using Arrow's ReaderBuilder and enabling truncated_rows
+        let decoder = ReaderBuilder::new(schema.clone())
+            .with_batch_size(batch_size)
+            .with_truncated_rows(true) // <- enable runtime truncated_rows
+            .build_decoder();
+        DecoderDeserializer::new(CsvDecoder::new(decoder))
+    }
+
+    #[tokio::test]
+    async fn infer_schema_with_truncated_rows_true() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 3 columns, but first data row has only 2 columns, second row has 3
+        let csv_data = Bytes::from("a,b,c\n1,2\n3,4,5\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // Construct CsvFormat and enable truncated_rows via CsvOptions
+        let csv_options = CsvOptions::default().with_truncated_rows(true);
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_options(csv_options)
+            .with_schema_infer_max_rec(10);
+
+        let inferred_schema = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await?;
+
+        // header has 3 columns; inferred schema should also have 3
+        assert_eq!(inferred_schema.fields().len(), 3);
+
+        // inferred columns should be nullable
+        for f in inferred_schema.fields() {
+            assert!(f.is_nullable());
+        }
+
+        Ok(())
+    }
+    #[test]
+    fn test_decoder_truncated_rows_runtime() -> Result<()> {
+        // Synchronous test: Decoder API used here is synchronous
+        let schema = csv_schema(); // helper already defined in file
+
+        // Construct a decoder that enables truncated_rows at runtime
+        let mut deserializer = csv_deserializer_with_truncated(10, &schema);
+
+        // Provide two rows: first row complete, second row missing last column
+        let input = Bytes::from("0,0.0,true,0-string\n1,1.0,true\n");
+        deserializer.digest(input);
+
+        // Finish and collect output
+        deserializer.finish();
+
+        let output = deserializer.next()?;
+        match output {
+            DeserializerOutput::RecordBatch(batch) => {
+                // ensure at least two rows present
+                assert!(batch.num_rows() >= 2);
+                // column 4 (index 3) should be a StringArray where second row is NULL
+                let col4 = batch
+                    .column(3)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .expect("column 4 should be StringArray");
+
+                // first row present, second row should be null
+                assert!(!col4.is_null(0));
+                assert!(col4.is_null(1));
+            }
+            other => panic!("expected RecordBatch but got {other:?}"),
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn infer_schema_truncated_rows_false_error() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 4 cols, first data row has 3 cols -> truncated at end
+        let csv_data = Bytes::from("id,a,b,c\n1,foo,bar\n2,foo,bar,baz\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // CsvFormat without enabling truncated_rows (default behavior = false)
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_schema_infer_max_rec(10);
+
+        let res = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await;
+
+        // Expect an error due to unequal lengths / incorrect number of fields
+        assert!(
+            res.is_err(),
+            "expected infer_schema to error on truncated rows when disabled"
+        );
+
+        // Optional: check message contains indicative text (two known possibilities)
+        if let Err(err) = res {
+            let msg = format!("{err}");
+            assert!(
+                msg.contains("Encountered unequal lengths")
+                    || msg.contains("incorrect number of fields"),
+                "unexpected error message: {msg}",
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_read_csv_truncated_rows_via_tempfile() -> Result<()> {
+        use std::io::Write;
+
+        // create a SessionContext
+        let ctx = SessionContext::new();
+
+        // Create a temp file with a .csv suffix so the reader accepts it
+        let mut tmp = tempfile::Builder::new().suffix(".csv").tempfile()?; // ensures path ends with .csv
+        // CSV has header "a,b,c". First data row is truncated (only "1,2"), second row is complete.
+        write!(tmp, "a,b,c\n1,2\n3,4,5\n")?;
+        let path = tmp.path().to_str().unwrap().to_string();
+
+        // Build CsvReadOptions: header present, enable truncated_rows.
+        // (Use the exact builder method your crate exposes: `truncated_rows(true)` here,
+        //  if the method name differs in your codebase use the appropriate one.)
+        let options = CsvReadOptions::default().truncated_rows(true);
+
+        println!("options: {}, path: {path}", options.truncated_rows);
+
+        // Call the API under test
+        let df = ctx.read_csv(&path, options).await?;
+
+        // Collect the results and combine batches so we can inspect columns
+        let batches = df.collect().await?;
+        let combined = concat_batches(&batches[0].schema(), &batches)?;
+
+        // Column 'c' is the 3rd column (index 2). The first data row was truncated -> should be NULL.
+        let col_c = combined.column(2);
+        assert!(
+            col_c.is_null(0),
+            "expected first row column 'c' to be NULL due to truncated row"
+        );
+
+        // Also ensure we read at least one row
+        assert!(combined.num_rows() >= 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_csv_from_sql() -> Result<()> {
+        let ctx = SessionContext::new();
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_sql.csv", tmp_dir.path().to_string_lossy());
+        let df = ctx.sql("SELECT CAST(1 AS BIGINT) AS id LIMIT 0").await?;
+        df.write_csv(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        assert!(std::path::Path::new(&path).exists());
+
+        let read_df = ctx
+            .read_csv(&path, CsvReadOptions::default().has_header(true))
+            .await?;
+        let stream = read_df.execute_stream().await?;
+        assert_eq!(stream.schema().fields().len(), 1);
+        assert_eq!(stream.schema().field(0).name(), "id");
+
+        let results: Vec<_> = stream.collect().await;
+        assert_eq!(results.len(), 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_csv_from_record_batch() -> Result<()> {
+        let ctx = SessionContext::new();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+        let empty_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(arrow::array::Int64Array::from(Vec::<i64>::new())),
+                Arc::new(StringArray::from(Vec::<Option<&str>>::new())),
+            ],
+        )?;
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_batch.csv", tmp_dir.path().to_string_lossy());
+
+        // Write empty RecordBatch
+        let df = ctx.read_batch(empty_batch.clone())?;
+        df.write_csv(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        // Expected the file to exist
+        assert!(std::path::Path::new(&path).exists());
+
+        let read_df = ctx
+            .read_csv(&path, CsvReadOptions::default().has_header(true))
+            .await?;
+        let stream = read_df.execute_stream().await?;
+        assert_eq!(stream.schema().fields().len(), 2);
+        assert_eq!(stream.schema().field(0).name(), "id");
+        assert_eq!(stream.schema().field(1).name(), "name");
+
+        let results: Vec<_> = stream.collect().await;
+        assert_eq!(results.len(), 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_infer_schema_with_zero_max_records() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        let root = format!("{}/csv", arrow_test_data());
+        let format = CsvFormat::default()
+            .with_has_header(true)
+            .with_schema_infer_max_rec(0); // Set to 0 to disable inference
+        let exec = scan_format(
+            &state,
+            &format,
+            None,
+            &root,
+            "aggregate_test_100.csv",
+            None,
+            None,
+        )
+        .await?;
+
+        // related to https://github.com/apache/datafusion/issues/19417
+        for f in exec.schema().fields() {
+            assert_eq!(*f.data_type(), DataType::Utf8);
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs
index 34d3d64f07fb2..5b3e22705620e 100644
--- a/datafusion/core/src/datasource/file_format/json.rs
+++ b/datafusion/core/src/datasource/file_format/json.rs
@@ -25,7 +25,7 @@ mod tests {
     use super::*;
 
     use crate::datasource::file_format::test_util::scan_format;
-    use crate::prelude::{NdJsonReadOptions, SessionConfig, SessionContext};
+    use crate::prelude::{SessionConfig, SessionContext};
     use crate::test::object_store::local_unpartitioned_file;
     use arrow::array::RecordBatch;
     use arrow_schema::Schema;
@@ -36,7 +36,7 @@ mod tests {
         BatchDeserializer, DecoderDeserializer, DeserializerOutput,
     };
     use datafusion_datasource::file_format::FileFormat;
-    use datafusion_physical_plan::{collect, ExecutionPlan};
+    use datafusion_physical_plan::{ExecutionPlan, collect};
 
     use arrow::compute::concat_batches;
     use arrow::datatypes::{DataType, Field};
@@ -46,12 +46,54 @@ mod tests {
     use datafusion_common::internal_err;
     use datafusion_common::stats::Precision;
 
+    use crate::execution::options::JsonReadOptions;
     use datafusion_common::Result;
+    use datafusion_datasource::file_compression_type::FileCompressionType;
     use futures::StreamExt;
     use insta::assert_snapshot;
     use object_store::local::LocalFileSystem;
     use regex::Regex;
     use rstest::rstest;
+    // ==================== Test Helpers ====================
+
+    /// Create a temporary JSON file and return (TempDir, path)
+    fn create_temp_json(content: &str) -> (tempfile::TempDir, String) {
+        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let path = tmp_dir.path().join("test.json");
+        std::fs::write(&path, content).unwrap();
+        (tmp_dir, path.to_string_lossy().to_string())
+    }
+
+    /// Infer schema from JSON array format file
+    async fn infer_json_array_schema(
+        content: &str,
+    ) -> Result<arrow::datatypes::SchemaRef> {
+        let (_tmp_dir, path) = create_temp_json(content);
+        let session = SessionContext::new();
+        let ctx = session.state();
+        let store = Arc::new(LocalFileSystem::new()) as _;
+        let format = JsonFormat::default().with_newline_delimited(false);
+        format
+            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
+            .await
+    }
+
+    /// Register a JSON array table and run a query
+    async fn query_json_array(content: &str, query: &str) -> Result<Vec<RecordBatch>> {
+        let (_tmp_dir, path) = create_temp_json(content);
+        let ctx = SessionContext::new();
+        let options = JsonReadOptions::default().newline_delimited(false);
+        ctx.register_json("test_table", &path, options).await?;
+        ctx.sql(query).await?.collect().await
+    }
+
+    /// Register a JSON array table and run a query, return formatted string
+    async fn query_json_array_str(content: &str, query: &str) -> Result<String> {
+        let result = query_json_array(content, query).await?;
+        Ok(batches_to_string(&result))
+    }
+
+    // ==================== Existing Tests ====================
 
     #[tokio::test]
     async fn read_small_batches() -> Result<()> {
@@ -187,11 +229,11 @@ mod tests {
 
         let re = Regex::new(r"file_groups=\{(\d+) group").unwrap();
 
-        if let Some(captures) = re.captures(&plan) {
-            if let Some(match_) = captures.get(1) {
-                let count = match_.as_str().parse::<usize>().unwrap();
-                return Ok(count);
-            }
+        if let Some(captures) = re.captures(&plan)
+            && let Some(match_) = captures.get(1)
+        {
+            let count = match_.as_str().parse::<usize>().unwrap();
+            return Ok(count);
         }
 
         internal_err!("Query contains no Exec: file_groups")
@@ -208,7 +250,7 @@ mod tests {
         let ctx = SessionContext::new_with_config(config);
 
         let table_path = "tests/data/1.json";
-        let options = NdJsonReadOptions::default();
+        let options = JsonReadOptions::default();
 
         ctx.register_json("json_parallel", table_path, options)
             .await?;
@@ -218,13 +260,13 @@ mod tests {
         let result = ctx.sql(query).await?.collect().await?;
         let actual_partitions = count_num_partitions(&ctx, query).await?;
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&result),@r###"
-            +----------------------+
-            | sum(json_parallel.a) |
-            +----------------------+
-            | -7                   |
-            +----------------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&result),@r"
+        +----------------------+
+        | sum(json_parallel.a) |
+        +----------------------+
+        | -7                   |
+        +----------------------+
+        ");}
 
         assert_eq!(n_partitions, actual_partitions);
 
@@ -240,7 +282,7 @@ mod tests {
         let ctx = SessionContext::new_with_config(config);
 
         let table_path = "tests/data/empty.json";
-        let options = NdJsonReadOptions::default();
+        let options = JsonReadOptions::default();
 
         ctx.register_json("json_parallel_empty", table_path, options)
             .await?;
@@ -249,10 +291,10 @@ mod tests {
 
         let result = ctx.sql(query).await?.collect().await?;
 
-        assert_snapshot!(batches_to_string(&result),@r###"
-            ++
-            ++
-        "###);
+        assert_snapshot!(batches_to_string(&result),@r"
+        ++
+        ++
+        ");
 
         Ok(())
     }
@@ -284,15 +326,15 @@ mod tests {
         }
         assert_eq!(deserializer.next()?, DeserializerOutput::InputExhausted);
 
-        assert_snapshot!(batches_to_string(&[all_batches]),@r###"
-            +----+----+----+----+----+
-            | c1 | c2 | c3 | c4 | c5 |
-            +----+----+----+----+----+
-            | 1  | 2  | 3  | 4  | 5  |
-            | 6  | 7  | 8  | 9  | 10 |
-            | 11 | 12 | 13 | 14 | 15 |
-            +----+----+----+----+----+
-        "###);
+        assert_snapshot!(batches_to_string(&[all_batches]),@r"
+        +----+----+----+----+----+
+        | c1 | c2 | c3 | c4 | c5 |
+        +----+----+----+----+----+
+        | 1  | 2  | 3  | 4  | 5  |
+        | 6  | 7  | 8  | 9  | 10 |
+        | 11 | 12 | 13 | 14 | 15 |
+        +----+----+----+----+----+
+        ");
 
         Ok(())
     }
@@ -314,7 +356,6 @@ mod tests {
             .digest(r#"{ "c1": 11, "c2": 12, "c3": 13, "c4": 14, "c5": 15 }"#.into());
 
         let mut all_batches = RecordBatch::new_empty(schema.clone());
-        // We get RequiresMoreData after 2 batches because of how json::Decoder works
         for _ in 0..2 {
             let output = deserializer.next()?;
             let DeserializerOutput::RecordBatch(batch) = output else {
@@ -324,14 +365,14 @@ mod tests {
         }
         assert_eq!(deserializer.next()?, DeserializerOutput::RequiresMoreData);
 
-        insta::assert_snapshot!(fmt_batches(&[all_batches]),@r###"
-            +----+----+----+----+----+
-            | c1 | c2 | c3 | c4 | c5 |
-            +----+----+----+----+----+
-            | 1  | 2  | 3  | 4  | 5  |
-            | 6  | 7  | 8  | 9  | 10 |
-            +----+----+----+----+----+
-        "###);
+        insta::assert_snapshot!(fmt_batches(&[all_batches]),@r"
+        +----+----+----+----+----+
+        | c1 | c2 | c3 | c4 | c5 |
+        +----+----+----+----+----+
+        | 1  | 2  | 3  | 4  | 5  |
+        | 6  | 7  | 8  | 9  | 10 |
+        +----+----+----+----+----+
+        ");
 
         Ok(())
     }
@@ -349,4 +390,248 @@ mod tests {
     fn fmt_batches(batches: &[RecordBatch]) -> String {
         pretty::pretty_format_batches(batches).unwrap().to_string()
     }
+
+    #[tokio::test]
+    async fn test_write_empty_json_from_sql() -> Result<()> {
+        let ctx = SessionContext::new();
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = tmp_dir.path().join("empty_sql.json");
+        let path = path.to_string_lossy().to_string();
+        let df = ctx.sql("SELECT CAST(1 AS BIGINT) AS id LIMIT 0").await?;
+        df.write_json(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        assert!(std::path::Path::new(&path).exists());
+        let metadata = std::fs::metadata(&path)?;
+        assert_eq!(metadata.len(), 0);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_json_from_record_batch() -> Result<()> {
+        let ctx = SessionContext::new();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+        let empty_batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(arrow::array::Int64Array::from(Vec::<i64>::new())),
+                Arc::new(arrow::array::StringArray::from(Vec::<Option<&str>>::new())),
+            ],
+        )?;
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = tmp_dir.path().join("empty_batch.json");
+        let path = path.to_string_lossy().to_string();
+        let df = ctx.read_batch(empty_batch.clone())?;
+        df.write_json(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        assert!(std::path::Path::new(&path).exists());
+        let metadata = std::fs::metadata(&path)?;
+        assert_eq!(metadata.len(), 0);
+        Ok(())
+    }
+
+    // ==================== JSON Array Format Tests ====================
+
+    #[tokio::test]
+    async fn test_json_array_schema_inference() -> Result<()> {
+        let schema = infer_json_array_schema(
+            r#"[{"a": 1, "b": 2.0, "c": true}, {"a": 2, "b": 3.5, "c": false}]"#,
+        )
+        .await?;
+
+        let fields: Vec<_> = schema
+            .fields()
+            .iter()
+            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+            .collect();
+        assert_eq!(vec!["a: Int64", "b: Float64", "c: Boolean"], fields);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_empty() -> Result<()> {
+        let schema = infer_json_array_schema("[]").await?;
+        assert_eq!(schema.fields().len(), 0);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_nested_struct() -> Result<()> {
+        let schema = infer_json_array_schema(
+            r#"[{"id": 1, "info": {"name": "Alice", "age": 30}}]"#,
+        )
+        .await?;
+
+        let info_field = schema.field_with_name("info").unwrap();
+        assert!(matches!(info_field.data_type(), DataType::Struct(_)));
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_list_type() -> Result<()> {
+        let schema =
+            infer_json_array_schema(r#"[{"id": 1, "tags": ["a", "b", "c"]}]"#).await?;
+
+        let tags_field = schema.field_with_name("tags").unwrap();
+        assert!(matches!(tags_field.data_type(), DataType::List(_)));
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_basic_query() -> Result<()> {
+        let result = query_json_array_str(
+            r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}, {"a": 3, "b": "test"}]"#,
+            "SELECT a, b FROM test_table ORDER BY a",
+        )
+        .await?;
+
+        assert_snapshot!(result, @r"
+        +---+-------+
+        | a | b     |
+        +---+-------+
+        | 1 | hello |
+        | 2 | world |
+        | 3 | test  |
+        +---+-------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_with_nulls() -> Result<()> {
+        let result = query_json_array_str(
+            r#"[{"id": 1, "name": "Alice"}, {"id": 2, "name": null}, {"id": 3, "name": "Charlie"}]"#,
+            "SELECT id, name FROM test_table ORDER BY id",
+        )
+            .await?;
+
+        assert_snapshot!(result, @r"
+        +----+---------+
+        | id | name    |
+        +----+---------+
+        | 1  | Alice   |
+        | 2  |         |
+        | 3  | Charlie |
+        +----+---------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_unnest() -> Result<()> {
+        let result = query_json_array_str(
+            r#"[{"id": 1, "values": [10, 20, 30]}, {"id": 2, "values": [40, 50]}]"#,
+            "SELECT id, unnest(values) as value FROM test_table ORDER BY id, value",
+        )
+        .await?;
+
+        assert_snapshot!(result, @r"
+        +----+-------+
+        | id | value |
+        +----+-------+
+        | 1  | 10    |
+        | 1  | 20    |
+        | 1  | 30    |
+        | 2  | 40    |
+        | 2  | 50    |
+        +----+-------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_unnest_struct() -> Result<()> {
+        let result = query_json_array_str(
+            r#"[{"id": 1, "orders": [{"product": "A", "qty": 2}, {"product": "B", "qty": 3}]}, {"id": 2, "orders": [{"product": "C", "qty": 1}]}]"#,
+            "SELECT id, unnest(orders)['product'] as product, unnest(orders)['qty'] as qty FROM test_table ORDER BY id, product",
+        )
+            .await?;
+
+        assert_snapshot!(result, @r"
+        +----+---------+-----+
+        | id | product | qty |
+        +----+---------+-----+
+        | 1  | A       | 2   |
+        | 1  | B       | 3   |
+        | 2  | C       | 1   |
+        +----+---------+-----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_nested_struct_access() -> Result<()> {
+        let result = query_json_array_str(
+            r#"[{"id": 1, "dept": {"name": "Engineering", "head": "Alice"}}, {"id": 2, "dept": {"name": "Sales", "head": "Bob"}}]"#,
+            "SELECT id, dept['name'] as dept_name, dept['head'] as head FROM test_table ORDER BY id",
+        )
+            .await?;
+
+        assert_snapshot!(result, @r"
+        +----+-------------+-------+
+        | id | dept_name   | head  |
+        +----+-------------+-------+
+        | 1  | Engineering | Alice |
+        | 2  | Sales       | Bob   |
+        +----+-------------+-------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_with_compression() -> Result<()> {
+        use flate2::Compression;
+        use flate2::write::GzEncoder;
+        use std::io::Write;
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = tmp_dir.path().join("array.json.gz");
+        let path = path.to_string_lossy().to_string();
+
+        let file = std::fs::File::create(&path)?;
+        let mut encoder = GzEncoder::new(file, Compression::default());
+        encoder.write_all(
+            r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}]"#.as_bytes(),
+        )?;
+        encoder.finish()?;
+
+        let ctx = SessionContext::new();
+        let options = JsonReadOptions::default()
+            .newline_delimited(false)
+            .file_compression_type(FileCompressionType::GZIP)
+            .file_extension(".json.gz");
+
+        ctx.register_json("test_table", &path, options).await?;
+        let result = ctx
+            .sql("SELECT a, b FROM test_table ORDER BY a")
+            .await?
+            .collect()
+            .await?;
+
+        assert_snapshot!(batches_to_string(&result), @r"
+        +---+-------+
+        | a | b     |
+        +---+-------+
+        | 1 | hello |
+        | 2 | world |
+        +---+-------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_list_of_structs() -> Result<()> {
+        let batches = query_json_array(
+            r#"[{"id": 1, "items": [{"name": "x", "price": 10.5}]}, {"id": 2, "items": []}]"#,
+            "SELECT id, items FROM test_table ORDER BY id",
+        )
+            .await?;
+
+        assert_eq!(1, batches.len());
+        assert_eq!(2, batches[0].num_rows());
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index e165707c2eb0e..b04238ebc9b37 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -39,8 +39,9 @@ pub(crate) mod test_util {
     use arrow_schema::SchemaRef;
     use datafusion_catalog::Session;
     use datafusion_common::Result;
+    use datafusion_datasource::TableSchema;
     use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-    use datafusion_datasource::{file_format::FileFormat, PartitionedFile};
+    use datafusion_datasource::{PartitionedFile, file_format::FileFormat};
     use datafusion_execution::object_store::ObjectStoreUrl;
     use std::sync::Arc;
 
@@ -66,31 +67,24 @@ pub(crate) mod test_util {
                 .await?
         };
 
+        let table_schema = TableSchema::new(file_schema.clone(), vec![]);
+
         let statistics = format
             .infer_stats(state, &store, file_schema.clone(), &meta)
             .await?;
 
-        let file_groups = vec![vec![PartitionedFile {
-            object_meta: meta,
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        }]
-        .into()];
+        let file_groups = vec![vec![PartitionedFile::new_from_meta(meta)].into()];
 
         let exec = format
             .create_physical_plan(
                 state,
                 FileScanConfigBuilder::new(
                     ObjectStoreUrl::local_filesystem(),
-                    file_schema,
-                    format.file_source(),
+                    format.file_source(table_schema),
                 )
                 .with_file_groups(file_groups)
                 .with_statistics(statistics)
-                .with_projection(projection)
+                .with_projection_indices(projection)?
                 .with_limit(limit)
                 .build(),
             )
@@ -131,7 +125,10 @@ mod tests {
             .write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
             .await
             .expect_err("should fail because input file does not match inferred schema");
-        assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'");
+        assert_eq!(
+            e.strip_backtrace(),
+            "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"
+        );
         Ok(())
     }
 }
diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs
index 9aaf1cf598113..bd0ac36087381 100644
--- a/datafusion/core/src/datasource/file_format/options.rs
+++ b/datafusion/core/src/datasource/file_format/options.rs
@@ -25,16 +25,16 @@ use crate::datasource::file_format::avro::AvroFormat;
 #[cfg(feature = "parquet")]
 use crate::datasource::file_format::parquet::ParquetFormat;
 
+use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::datasource::file_format::arrow::ArrowFormat;
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
-use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
 use crate::datasource::listing::ListingTableUrl;
 use crate::datasource::{file_format::csv::CsvFormat, listing::ListingOptions};
 use crate::error::Result;
 use crate::execution::context::{SessionConfig, SessionState};
 
 use arrow::datatypes::{DataType, Schema, SchemaRef};
-use datafusion_common::config::TableOptions;
+use datafusion_common::config::{ConfigFileDecryptionProperties, TableOptions};
 use datafusion_common::{
     DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION,
     DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION,
@@ -91,6 +91,11 @@ pub struct CsvReadOptions<'a> {
     pub file_sort_order: Vec<Vec<SortExpr>>,
     /// Optional regex to match null values
     pub null_regex: Option<String>,
+    /// Whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths.
+    /// When set to true then it will allow records with less than the expected number of columns and fill the missing columns with nulls.
+    /// If the record’s schema is not nullable, then it will still return an error.
+    pub truncated_rows: bool,
 }
 
 impl Default for CsvReadOptions<'_> {
@@ -117,6 +122,7 @@ impl<'a> CsvReadOptions<'a> {
             file_sort_order: vec![],
             comment: None,
             null_regex: None,
+            truncated_rows: false,
         }
     }
 
@@ -223,6 +229,15 @@ impl<'a> CsvReadOptions<'a> {
         self.null_regex = null_regex;
         self
     }
+
+    /// Configure whether to allow truncated rows when parsing.
+    /// By default this is set to false and will error if the CSV rows have different lengths
+    /// When set to true then it will allow records with less than the expected number of columns and fill the missing columns with nulls.
+    /// If the record’s schema is not nullable, then it will still return an error.
+    pub fn truncated_rows(mut self, truncated_rows: bool) -> Self {
+        self.truncated_rows = truncated_rows;
+        self
+    }
 }
 
 /// Options that control the reading of Parquet files.
@@ -252,6 +267,10 @@ pub struct ParquetReadOptions<'a> {
     pub schema: Option<&'a Schema>,
     /// Indicates how the file is sorted
     pub file_sort_order: Vec<Vec<SortExpr>>,
+    /// Properties for decryption of Parquet files that use modular encryption
+    pub file_decryption_properties: Option<ConfigFileDecryptionProperties>,
+    /// Metadata size hint for Parquet files reading (in bytes)
+    pub metadata_size_hint: Option<usize>,
 }
 
 impl Default for ParquetReadOptions<'_> {
@@ -263,6 +282,8 @@ impl Default for ParquetReadOptions<'_> {
             skip_metadata: None,
             schema: None,
             file_sort_order: vec![],
+            file_decryption_properties: None,
+            metadata_size_hint: None,
         }
     }
 }
@@ -313,6 +334,21 @@ impl<'a> ParquetReadOptions<'a> {
         self.file_sort_order = file_sort_order;
         self
     }
+
+    /// Configure file decryption properties for reading encrypted Parquet files
+    pub fn file_decryption_properties(
+        mut self,
+        file_decryption_properties: ConfigFileDecryptionProperties,
+    ) -> Self {
+        self.file_decryption_properties = Some(file_decryption_properties);
+        self
+    }
+
+    /// Configure metadata size hint for Parquet files reading (in bytes)
+    pub fn metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
+        self.metadata_size_hint = size_hint;
+        self
+    }
 }
 
 /// Options that control the reading of ARROW files.
@@ -406,14 +442,23 @@ impl<'a> AvroReadOptions<'a> {
     }
 }
 
-/// Options that control the reading of Line-delimited JSON files (NDJson)
+#[deprecated(
+    since = "53.0.0",
+    note = "Use `JsonReadOptions` instead. This alias will be removed in a future version."
+)]
+#[doc = "Deprecated: Use [`JsonReadOptions`] instead."]
+pub type NdJsonReadOptions<'a> = JsonReadOptions<'a>;
+
+/// Options that control the reading of JSON files.
+///
+/// Supports both newline-delimited JSON (NDJSON) and JSON array formats.
 ///
 /// Note this structure is supplied when a datasource is created and
-/// can not not vary from statement to statement. For settings that
+/// can not vary from statement to statement. For settings that
 /// can vary statement to statement see
 /// [`ConfigOptions`](crate::config::ConfigOptions).
 #[derive(Clone)]
-pub struct NdJsonReadOptions<'a> {
+pub struct JsonReadOptions<'a> {
     /// The data source schema.
     pub schema: Option<&'a Schema>,
     /// Max number of rows to read from JSON files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`.
@@ -429,9 +474,25 @@ pub struct NdJsonReadOptions<'a> {
     pub infinite: bool,
     /// Indicates how the file is sorted
     pub file_sort_order: Vec<Vec<SortExpr>>,
+    /// Whether to read as newline-delimited JSON (default: true).
+    ///
+    /// When `true` (default), expects newline-delimited JSON (NDJSON):
+    /// ```text
+    /// {"key1": 1, "key2": "val"}
+    /// {"key1": 2, "key2": "vals"}
+    /// ```
+    ///
+    /// When `false`, expects JSON array format:
+    /// ```text
+    /// [
+    ///   {"key1": 1, "key2": "val"},
+    ///   {"key1": 2, "key2": "vals"}
+    /// ]
+    /// ```
+    pub newline_delimited: bool,
 }
 
-impl Default for NdJsonReadOptions<'_> {
+impl Default for JsonReadOptions<'_> {
     fn default() -> Self {
         Self {
             schema: None,
@@ -441,11 +502,12 @@ impl Default for NdJsonReadOptions<'_> {
             file_compression_type: FileCompressionType::UNCOMPRESSED,
             infinite: false,
             file_sort_order: vec![],
+            newline_delimited: true,
         }
     }
 }
 
-impl<'a> NdJsonReadOptions<'a> {
+impl<'a> JsonReadOptions<'a> {
     /// Specify table_partition_cols for partition pruning
     pub fn table_partition_cols(
         mut self,
@@ -487,6 +549,32 @@ impl<'a> NdJsonReadOptions<'a> {
         self.file_sort_order = file_sort_order;
         self
     }
+
+    /// Specify how many rows to read for schema inference
+    pub fn schema_infer_max_records(mut self, schema_infer_max_records: usize) -> Self {
+        self.schema_infer_max_records = schema_infer_max_records;
+        self
+    }
+
+    /// Set whether to read as newline-delimited JSON.
+    ///
+    /// When `true` (default), expects newline-delimited JSON (NDJSON):
+    /// ```text
+    /// {"key1": 1, "key2": "val"}
+    /// {"key1": 2, "key2": "vals"}
+    /// ```
+    ///
+    /// When `false`, expects JSON array format:
+    /// ```text
+    /// [
+    ///   {"key1": 1, "key2": "val"},
+    ///   {"key1": 2, "key2": "vals"}
+    /// ]
+    /// ```
+    pub fn newline_delimited(mut self, newline_delimited: bool) -> Self {
+        self.newline_delimited = newline_delimited;
+        self
+    }
 }
 
 #[async_trait]
@@ -546,7 +634,8 @@ impl ReadOptions<'_> for CsvReadOptions<'_> {
             .with_newlines_in_values(self.newlines_in_values)
             .with_schema_infer_max_rec(self.schema_infer_max_records)
             .with_file_compression_type(self.file_compression_type.to_owned())
-            .with_null_regex(self.null_regex.clone());
+            .with_null_regex(self.null_regex.clone())
+            .with_truncated_rows(self.truncated_rows);
 
         ListingOptions::new(Arc::new(file_format))
             .with_file_extension(self.file_extension)
@@ -574,7 +663,16 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> {
         config: &SessionConfig,
         table_options: TableOptions,
     ) -> ListingOptions {
-        let mut file_format = ParquetFormat::new().with_options(table_options.parquet);
+        let mut options = table_options.parquet;
+        if let Some(file_decryption_properties) = &self.file_decryption_properties {
+            options.crypto.file_decryption = Some(file_decryption_properties.clone());
+        }
+        // This can be overridden per-read in ParquetReadOptions, if setting.
+        if let Some(metadata_size_hint) = self.metadata_size_hint {
+            options.global.metadata_size_hint = Some(metadata_size_hint);
+        }
+
+        let mut file_format = ParquetFormat::new().with_options(options);
 
         if let Some(parquet_pruning) = self.parquet_pruning {
             file_format = file_format.with_enable_pruning(parquet_pruning)
@@ -602,7 +700,7 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> {
 }
 
 #[async_trait]
-impl ReadOptions<'_> for NdJsonReadOptions<'_> {
+impl ReadOptions<'_> for JsonReadOptions<'_> {
     fn to_listing_options(
         &self,
         config: &SessionConfig,
@@ -611,7 +709,8 @@ impl ReadOptions<'_> for NdJsonReadOptions<'_> {
         let file_format = JsonFormat::default()
             .with_options(table_options.json)
             .with_schema_infer_max_rec(self.schema_infer_max_records)
-            .with_file_compression_type(self.file_compression_type.to_owned());
+            .with_file_compression_type(self.file_compression_type.to_owned())
+            .with_newline_delimited(self.newline_delimited);
 
         ListingOptions::new(Arc::new(file_format))
             .with_file_extension(self.file_extension)
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 9705225c24c7b..6a8f7ab999757 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -107,10 +107,8 @@ pub(crate) mod test_util {
 mod tests {
 
     use std::fmt::{self, Display, Formatter};
-    use std::pin::Pin;
-    use std::sync::atomic::{AtomicUsize, Ordering};
     use std::sync::Arc;
-    use std::task::{Context, Poll};
+    use std::sync::atomic::{AtomicUsize, Ordering};
     use std::time::Duration;
 
     use crate::datasource::file_format::parquet::test_util::store_parquet;
@@ -120,8 +118,9 @@ mod tests {
     use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
 
     use arrow::array::RecordBatch;
-    use arrow_schema::{Schema, SchemaRef};
+    use arrow_schema::Schema;
     use datafusion_catalog::Session;
+    use datafusion_common::ScalarValue::Utf8;
     use datafusion_common::cast::{
         as_binary_array, as_binary_view_array, as_boolean_array, as_float32_array,
         as_float64_array, as_int32_array, as_timestamp_nanosecond_array,
@@ -129,44 +128,47 @@ mod tests {
     use datafusion_common::config::{ParquetOptions, TableParquetOptions};
     use datafusion_common::stats::Precision;
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::ScalarValue::Utf8;
     use datafusion_common::{Result, ScalarValue};
     use datafusion_datasource::file_format::FileFormat;
-    use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
+    use datafusion_datasource::file_sink_config::{
+        FileOutputMode, FileSink, FileSinkConfig,
+    };
     use datafusion_datasource::{ListingTableUrl, PartitionedFile};
     use datafusion_datasource_parquet::{
-        fetch_parquet_metadata, fetch_statistics, statistics_from_parquet_meta_calc,
         ParquetFormat, ParquetFormatFactory, ParquetSink,
     };
+    use datafusion_execution::TaskContext;
     use datafusion_execution::object_store::ObjectStoreUrl;
     use datafusion_execution::runtime_env::RuntimeEnv;
-    use datafusion_execution::{RecordBatchStream, TaskContext};
     use datafusion_expr::dml::InsertOp;
     use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
-    use datafusion_physical_plan::{collect, ExecutionPlan};
+    use datafusion_physical_plan::{ExecutionPlan, collect};
 
+    use crate::test_util::bounded_stream;
     use arrow::array::{
-        types::Int32Type, Array, ArrayRef, DictionaryArray, Int32Array, Int64Array,
-        StringArray,
+        Array, ArrayRef, DictionaryArray, Int32Array, Int64Array, StringArray,
+        types::Int32Type,
     };
     use arrow::datatypes::{DataType, Field};
     use async_trait::async_trait;
     use datafusion_datasource::file_groups::FileGroup;
+    use datafusion_datasource_parquet::metadata::DFParquetMetadata;
+    use futures::StreamExt;
     use futures::stream::BoxStream;
-    use futures::{Stream, StreamExt};
     use insta::assert_snapshot;
-    use log::error;
     use object_store::local::LocalFileSystem;
-    use object_store::ObjectMeta;
+    use object_store::{CopyOptions, ObjectMeta};
     use object_store::{
-        path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectStore,
-        PutMultipartOpts, PutOptions, PutPayload, PutResult,
+        GetOptions, GetResult, ListResult, MultipartUpload, ObjectStore,
+        PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
     };
-    use parquet::arrow::arrow_reader::ArrowReaderOptions;
     use parquet::arrow::ParquetRecordBatchStreamBuilder;
-    use parquet::file::metadata::{KeyValue, ParquetColumnIndex, ParquetOffsetIndex};
-    use parquet::file::page_index::index::Index;
-    use parquet::format::FileMetaData;
+    use parquet::arrow::arrow_reader::ArrowReaderOptions;
+    use parquet::file::metadata::{
+        KeyValue, PageIndexPolicy, ParquetColumnIndex, ParquetMetaData,
+        ParquetOffsetIndex,
+    };
+    use parquet::file::page_index::column_index::ColumnIndexMetaData;
     use tokio::fs::File;
 
     enum ForceViews {
@@ -180,8 +182,8 @@ mod tests {
 
         let c2: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), Some(2), None]));
 
-        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())]).unwrap();
-        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)]).unwrap();
+        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())])?;
+        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)])?;
 
         let store = Arc::new(LocalFileSystem::new()) as _;
         let (meta, _files) = store_parquet(vec![batch1, batch2], false).await?;
@@ -193,10 +195,14 @@ mod tests {
             ForceViews::No => false,
         };
         let format = ParquetFormat::default().with_force_view_types(force_views);
-        let schema = format.infer_schema(&ctx, &store, &meta).await.unwrap();
+        let schema = format.infer_schema(&ctx, &store, &meta).await?;
 
-        let stats =
-            fetch_statistics(store.as_ref(), schema.clone(), &meta[0], None).await?;
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(&store, &meta[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
@@ -204,7 +210,11 @@ mod tests {
         assert_eq!(c1_stats.null_count, Precision::Exact(1));
         assert_eq!(c2_stats.null_count, Precision::Exact(3));
 
-        let stats = fetch_statistics(store.as_ref(), schema, &meta[1], None).await?;
+        let stats = DFParquetMetadata::new(&store, &meta[1])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
+
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
         let c2_stats = &stats.column_statistics[1];
@@ -238,11 +248,9 @@ mod tests {
         let c2: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), Some(2), None]));
 
         let batch1 =
-            RecordBatch::try_from_iter(vec![("a", c1.clone()), ("b", c1.clone())])
-                .unwrap();
+            RecordBatch::try_from_iter(vec![("a", c1.clone()), ("b", c1.clone())])?;
         let batch2 =
-            RecordBatch::try_from_iter(vec![("c", c2.clone()), ("d", c2.clone())])
-                .unwrap();
+            RecordBatch::try_from_iter(vec![("c", c2.clone()), ("d", c2.clone())])?;
 
         let store = Arc::new(LocalFileSystem::new()) as _;
         let (meta, _files) = store_parquet(vec![batch1, batch2], false).await?;
@@ -250,7 +258,7 @@ mod tests {
         let session = SessionContext::new();
         let ctx = session.state();
         let format = ParquetFormat::default();
-        let schema = format.infer_schema(&ctx, &store, &meta).await.unwrap();
+        let schema = format.infer_schema(&ctx, &store, &meta).await?;
 
         let order: Vec<_> = ["a", "b", "c", "d"]
             .into_iter()
@@ -303,15 +311,15 @@ mod tests {
             _payload: PutPayload,
             _opts: PutOptions,
         ) -> object_store::Result<PutResult> {
-            Err(object_store::Error::NotImplemented)
+            unimplemented!()
         }
 
         async fn put_multipart_opts(
             &self,
             _location: &Path,
-            _opts: PutMultipartOpts,
+            _opts: PutMultipartOptions,
         ) -> object_store::Result<Box<dyn MultipartUpload>> {
-            Err(object_store::Error::NotImplemented)
+            unimplemented!()
         }
 
         async fn get_opts(
@@ -323,40 +331,34 @@ mod tests {
             self.inner.get_opts(location, options).await
         }
 
-        async fn head(&self, _location: &Path) -> object_store::Result<ObjectMeta> {
-            Err(object_store::Error::NotImplemented)
-        }
-
-        async fn delete(&self, _location: &Path) -> object_store::Result<()> {
-            Err(object_store::Error::NotImplemented)
+        fn delete_stream(
+            &self,
+            _locations: BoxStream<'static, object_store::Result<Path>>,
+        ) -> BoxStream<'static, object_store::Result<Path>> {
+            unimplemented!()
         }
 
         fn list(
             &self,
             _prefix: Option<&Path>,
         ) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
-            Box::pin(futures::stream::once(async {
-                Err(object_store::Error::NotImplemented)
-            }))
+            unimplemented!()
         }
 
         async fn list_with_delimiter(
             &self,
             _prefix: Option<&Path>,
         ) -> object_store::Result<ListResult> {
-            Err(object_store::Error::NotImplemented)
+            unimplemented!()
         }
 
-        async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
-            Err(object_store::Error::NotImplemented)
-        }
-
-        async fn copy_if_not_exists(
+        async fn copy_opts(
             &self,
             _from: &Path,
             _to: &Path,
+            _options: CopyOptions,
         ) -> object_store::Result<()> {
-            Err(object_store::Error::NotImplemented)
+            unimplemented!()
         }
     }
 
@@ -366,24 +368,42 @@ mod tests {
 
         let c2: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), Some(2), None]));
 
-        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())]).unwrap();
-        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)]).unwrap();
+        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())])?;
+        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)])?;
 
         let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
             LocalFileSystem::new(),
         )));
         let (meta, _files) = store_parquet(vec![batch1, batch2], false).await?;
 
+        let session = SessionContext::new();
+        let ctx = session.state();
+
         // Use a size hint larger than the parquet footer but smaller than the actual metadata, requiring a second fetch
         // for the remaining metadata
-        fetch_parquet_metadata(store.as_ref() as &dyn ObjectStore, &meta[0], Some(9))
-            .await
-            .expect("error reading metadata with hint");
-
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(9));
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 2);
 
-        let session = SessionContext::new();
-        let ctx = session.state();
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+
+        // Increases by 3 because cache has no entries yet
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 5);
+
+        // No increase because cache has an entry
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 5);
+
+        // Increase by 2  because `get_file_metadata_cache()` is None
+        let df_meta = df_meta.with_file_metadata_cache(None);
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 7);
+
         let force_views = match force_views {
             ForceViews::Yes => true,
             ForceViews::No => false,
@@ -391,14 +411,18 @@ mod tests {
         let format = ParquetFormat::default()
             .with_metadata_size_hint(Some(9))
             .with_force_view_types(force_views);
-        let schema = format
-            .infer_schema(&ctx, &store.upcast(), &meta)
-            .await
-            .unwrap();
-
-        let stats =
-            fetch_statistics(store.upcast().as_ref(), schema.clone(), &meta[0], Some(9))
-                .await?;
+        // Increase by 3, partial cache being used.
+        let _schema = format.infer_schema(&ctx, &store.upcast(), &meta).await?;
+        assert_eq!(store.request_count(), 10);
+        // No increase, full cache being used.
+        let schema = format.infer_schema(&ctx, &store.upcast(), &meta).await?;
+        assert_eq!(store.request_count(), 10);
+
+        // No increase, cache being used
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        let stats = df_meta.fetch_statistics(&schema).await?;
+        assert_eq!(store.request_count(), 10);
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
@@ -412,28 +436,46 @@ mod tests {
 
         // Use the file size as the hint so we can get the full metadata from the first fetch
         let size_hint = meta[0].size as usize;
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(size_hint));
 
-        fetch_parquet_metadata(store.upcast().as_ref(), &meta[0], Some(size_hint))
-            .await
-            .expect("error reading metadata with hint");
-
+        df_meta.fetch_metadata().await?;
         // ensure the requests were coalesced into a single request
         assert_eq!(store.request_count(), 1);
 
+        let session = SessionContext::new();
+        let ctx = session.state();
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        // Increases by 1 because cache has no entries yet and new session context
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 2);
+
+        // No increase because cache has an entry
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 2);
+
+        // Increase by 1  because `get_file_metadata_cache` is None
+        let df_meta = df_meta.with_file_metadata_cache(None);
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 3);
+
         let format = ParquetFormat::default()
             .with_metadata_size_hint(Some(size_hint))
             .with_force_view_types(force_views);
-        let schema = format
-            .infer_schema(&ctx, &store.upcast(), &meta)
-            .await
-            .unwrap();
-        let stats = fetch_statistics(
-            store.upcast().as_ref(),
-            schema.clone(),
-            &meta[0],
-            Some(size_hint),
-        )
-        .await?;
+        // Increase by 1, partial cache being used.
+        let _schema = format.infer_schema(&ctx, &store.upcast(), &meta).await?;
+        assert_eq!(store.request_count(), 4);
+        // No increase, full cache being used.
+        let schema = format.infer_schema(&ctx, &store.upcast(), &meta).await?;
+        assert_eq!(store.request_count(), 4);
+        // No increase, cache being used
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        let stats = df_meta.fetch_statistics(&schema).await?;
+        assert_eq!(store.request_count(), 4);
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
@@ -445,13 +487,18 @@ mod tests {
             LocalFileSystem::new(),
         )));
 
-        // Use the a size hint larger than the file size to make sure we don't panic
+        // Use a size hint larger than the file size to make sure we don't panic
         let size_hint = (meta[0].size + 100) as usize;
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(size_hint));
 
-        fetch_parquet_metadata(store.upcast().as_ref(), &meta[0], Some(size_hint))
-            .await
-            .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
+        assert_eq!(store.request_count(), 1);
 
+        // No increase because cache has an entry
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 1);
 
         Ok(())
@@ -470,25 +517,47 @@ mod tests {
         // Data for column c_dic: ["a", "b", "c", "d"]
         let values = StringArray::from_iter_values(["a", "b", "c", "d"]);
         let keys = Int32Array::from_iter_values([0, 1, 2, 3]);
-        let dic_array =
-            DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap();
+        let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;
         let c_dic: ArrayRef = Arc::new(dic_array);
 
-        let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)]).unwrap();
+        // Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null]
+        let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![
+            Some("a".repeat(128)),
+            None,
+            Some("b".repeat(128)),
+            None,
+        ]));
+
+        let batch1 = RecordBatch::try_from_iter(vec![
+            ("c_dic", c_dic),
+            ("string_truncation", string_truncation),
+        ])?;
 
         // Use store_parquet to write each batch to its own file
         // . batch1 written into first file and includes:
         //    - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available.
-        let store = Arc::new(LocalFileSystem::new()) as _;
+        //    - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact.
+        let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
+            LocalFileSystem::new(),
+        )));
         let (files, _file_names) = store_parquet(vec![batch1], false).await?;
 
         let state = SessionContext::new().state();
-        let format = ParquetFormat::default();
-        let schema = format.infer_schema(&state, &store, &files).await.unwrap();
-
-        // Fetch statistics for first file
-        let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?;
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
+        // Make metadata size hint None to keep original behavior
+        let format = ParquetFormat::default().with_metadata_size_hint(None);
+        let _schema = format.infer_schema(&state, &store.upcast(), &files).await?;
+        assert_eq!(store.request_count(), 3);
+        // No increase, cache being used.
+        let schema = format.infer_schema(&state, &store.upcast(), &files).await?;
+        assert_eq!(store.request_count(), 3);
+
+        // No increase in request count because cache is not empty
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
         assert_eq!(stats.num_rows, Precision::Exact(4));
 
         // column c_dic
@@ -504,6 +573,19 @@ mod tests {
             Precision::Exact(Utf8(Some("a".into())))
         );
 
+        // column string_truncation
+        let string_truncation_stats = &stats.column_statistics[1];
+
+        assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
+        assert_eq!(
+            string_truncation_stats.max_value,
+            Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
+        );
+        assert_eq!(
+            string_truncation_stats.min_value,
+            Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
+        );
+
         Ok(())
     }
 
@@ -513,18 +595,20 @@ mod tests {
         // Data for column c1: ["Foo", null, "bar"]
         let c1: ArrayRef =
             Arc::new(StringArray::from(vec![Some("Foo"), None, Some("bar")]));
-        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())]).unwrap();
+        let batch1 = RecordBatch::try_from_iter(vec![("c1", c1.clone())])?;
 
         // Data for column c2: [1, 2, null]
         let c2: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), Some(2), None]));
-        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)]).unwrap();
+        let batch2 = RecordBatch::try_from_iter(vec![("c2", c2)])?;
 
         // Use store_parquet to write each batch to its own file
         // . batch1 written into first file and includes:
         //    - column c1 that has 3 rows with one null. Stats min and max of string column is missing for this test even the column has values
         // . batch2 written into second file and includes:
         //    - column c2 that has 3 rows with one null. Stats min and max of int are available and 1 and 2 respectively
-        let store = Arc::new(LocalFileSystem::new()) as _;
+        let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
+            LocalFileSystem::new(),
+        )));
         let (files, _file_names) = store_parquet(vec![batch1, batch2], false).await?;
 
         let force_views = match force_views {
@@ -534,8 +618,11 @@ mod tests {
 
         let mut state = SessionContext::new().state();
         state = set_view_state(state, force_views);
-        let format = ParquetFormat::default().with_force_view_types(force_views);
-        let schema = format.infer_schema(&state, &store, &files).await.unwrap();
+        let format = ParquetFormat::default()
+            .with_force_view_types(force_views)
+            .with_metadata_size_hint(None);
+        let schema = format.infer_schema(&state, &store.upcast(), &files).await?;
+        assert_eq!(store.request_count(), 6);
 
         let null_i64 = ScalarValue::Int64(None);
         let null_utf8 = if force_views {
@@ -544,9 +631,14 @@ mod tests {
             Utf8(None)
         };
 
-        // Fetch statistics for first file
-        let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?;
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
+        // No increase in request count because cache is not empty
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
+        assert_eq!(store.request_count(), 6);
         assert_eq!(stats.num_rows, Precision::Exact(3));
         // column c1
         let c1_stats = &stats.column_statistics[0];
@@ -570,9 +662,12 @@ mod tests {
         assert_eq!(c2_stats.max_value, Precision::Exact(null_i64.clone()));
         assert_eq!(c2_stats.min_value, Precision::Exact(null_i64.clone()));
 
-        // Fetch statistics for second file
-        let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[1], None).await?;
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
+        // No increase in request count because cache is not empty
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[1])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
+        assert_eq!(store.request_count(), 6);
         assert_eq!(stats.num_rows, Precision::Exact(3));
         // column c1: missing from the file so the table treats all 3 rows as null
         let c1_stats = &stats.column_statistics[0];
@@ -626,7 +721,7 @@ mod tests {
         // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
         assert_eq!(
             exec.partition_statistics(None)?.total_byte_size,
-            Precision::Exact(671)
+            Precision::Absent,
         );
 
         Ok(())
@@ -672,10 +767,9 @@ mod tests {
             exec.partition_statistics(None)?.num_rows,
             Precision::Exact(8)
         );
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
         assert_eq!(
             exec.partition_statistics(None)?.total_byte_size,
-            Precision::Exact(671)
+            Precision::Absent,
         );
         let batches = collect(exec, task_ctx).await?;
         assert_eq!(1, batches.len());
@@ -718,7 +812,7 @@ mod tests {
             .schema()
             .fields()
             .iter()
-            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+            .map(|f| format!("{}: {}", f.name(), f.data_type()))
             .collect();
         let y = x.join("\n");
         assert_eq!(expected, y);
@@ -744,7 +838,7 @@ mod tests {
              double_col: Float64\n\
              date_string_col: Binary\n\
              string_col: Binary\n\
-             timestamp_col: Timestamp(Nanosecond, None)";
+             timestamp_col: Timestamp(ns)";
         _run_read_alltypes_plain_parquet(ForceViews::No, no_views).await?;
 
         let with_views = "id: Int32\n\
@@ -757,7 +851,7 @@ mod tests {
              double_col: Float64\n\
              date_string_col: BinaryView\n\
              string_col: BinaryView\n\
-             timestamp_col: Timestamp(Nanosecond, None)";
+             timestamp_col: Timestamp(ns)";
         _run_read_alltypes_plain_parquet(ForceViews::Yes, with_views).await?;
 
         Ok(())
@@ -833,7 +927,10 @@ mod tests {
             values.push(array.value(i));
         }
 
-        assert_eq!("[1235865600000000000, 1235865660000000000, 1238544000000000000, 1238544060000000000, 1233446400000000000, 1233446460000000000, 1230768000000000000, 1230768060000000000]", format!("{values:?}"));
+        assert_eq!(
+            "[1235865600000000000, 1235865660000000000, 1238544000000000000, 1238544060000000000, 1233446400000000000, 1233446460000000000, 1230768000000000000, 1230768060000000000]",
+            format!("{values:?}")
+        );
 
         Ok(())
     }
@@ -1002,22 +1099,21 @@ mod tests {
     async fn test_read_parquet_page_index() -> Result<()> {
         let testdata = datafusion_common::test_util::parquet_test_data();
         let path = format!("{testdata}/alltypes_tiny_pages.parquet");
-        let file = File::open(path).await.unwrap();
-        let options = ArrowReaderOptions::new().with_page_index(true);
+        let file = File::open(path).await?;
+        let options =
+            ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
         let builder =
             ParquetRecordBatchStreamBuilder::new_with_options(file, options.clone())
-                .await
-                .unwrap()
+                .await?
                 .metadata()
                 .clone();
         check_page_index_validation(builder.column_index(), builder.offset_index());
 
         let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
-        let file = File::open(path).await.unwrap();
+        let file = File::open(path).await?;
 
         let builder = ParquetRecordBatchStreamBuilder::new_with_options(file, options)
-            .await
-            .unwrap()
+            .await?
             .metadata()
             .clone();
         check_page_index_validation(builder.column_index(), builder.offset_index());
@@ -1051,18 +1147,14 @@ mod tests {
 
         // 325 pages in int_col
         assert_eq!(int_col_offset.len(), 325);
-        match int_col_index {
-            Index::INT32(index) => {
-                assert_eq!(index.indexes.len(), 325);
-                for min_max in index.clone().indexes {
-                    assert!(min_max.min.is_some());
-                    assert!(min_max.max.is_some());
-                    assert!(min_max.null_count.is_some());
-                }
-            }
-            _ => {
-                error!("fail to read page index.")
-            }
+        let ColumnIndexMetaData::INT32(index) = int_col_index else {
+            panic!("fail to read page index.")
+        };
+        assert_eq!(index.min_values().len(), 325);
+        assert_eq!(index.max_values().len(), 325);
+        // all values are non null
+        for idx in 0..325 {
+            assert_eq!(index.null_count(idx), Some(0));
         }
     }
 
@@ -1099,7 +1191,7 @@ mod tests {
     /// Test that 0-byte files don't break while reading
     #[tokio::test]
     async fn test_read_empty_parquet() -> Result<()> {
-        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let tmp_dir = tempfile::TempDir::new()?;
         let path = format!("{}/empty.parquet", tmp_dir.path().to_string_lossy());
         File::create(&path).await?;
 
@@ -1112,10 +1204,10 @@ mod tests {
 
         let result = df.collect().await?;
 
-        assert_snapshot!(batches_to_string(&result), @r###"
-            ++
-            ++
-       "###);
+        assert_snapshot!(batches_to_string(&result), @r"
+        ++
+        ++
+        ");
 
         Ok(())
     }
@@ -1123,12 +1215,10 @@ mod tests {
     /// Test that 0-byte files don't break while reading
     #[tokio::test]
     async fn test_read_partitioned_empty_parquet() -> Result<()> {
-        let tmp_dir = tempfile::TempDir::new().unwrap();
+        let tmp_dir = tempfile::TempDir::new()?;
         let partition_dir = tmp_dir.path().join("col1=a");
-        std::fs::create_dir(&partition_dir).unwrap();
-        File::create(partition_dir.join("empty.parquet"))
-            .await
-            .unwrap();
+        std::fs::create_dir(&partition_dir)?;
+        File::create(partition_dir.join("empty.parquet")).await?;
 
         let ctx = SessionContext::new();
 
@@ -1143,10 +1233,10 @@ mod tests {
 
         let result = df.collect().await?;
 
-        assert_snapshot!(batches_to_string(&result), @r###"
-            ++
-            ++
-       "###);
+        assert_snapshot!(batches_to_string(&result), @r"
+        ++
+        ++
+        ");
 
         Ok(())
     }
@@ -1246,6 +1336,56 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_write_empty_recordbatch_creates_file() -> Result<()> {
+        let empty_record_batch = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(Vec::<i32>::new()))],
+        )
+        .expect("Failed to create empty RecordBatch");
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty2.parquet", tmp_dir.path().to_string_lossy());
+
+        let ctx = SessionContext::new();
+        let df = ctx.read_batch(empty_record_batch.clone())?;
+        df.write_parquet(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        assert!(std::path::Path::new(&path).exists());
+
+        let stream = ctx
+            .read_parquet(&path, ParquetReadOptions::new())
+            .await?
+            .execute_stream()
+            .await?;
+        assert_eq!(stream.schema(), empty_record_batch.schema());
+        let results = stream.collect::<Vec<_>>().await;
+        assert_eq!(results.len(), 0);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_write_empty_parquet_from_sql() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let tmp_dir = tempfile::TempDir::new()?;
+        let path = format!("{}/empty_sql.parquet", tmp_dir.path().to_string_lossy());
+        let df = ctx.sql("SELECT CAST(1 AS INT) AS id LIMIT 0").await?;
+        df.write_parquet(&path, crate::dataframe::DataFrameWriteOptions::new(), None)
+            .await?;
+        // Expected the file to exist
+        assert!(std::path::Path::new(&path).exists());
+        let read_df = ctx.read_parquet(&path, ParquetReadOptions::new()).await?;
+        let stream = read_df.execute_stream().await?;
+        assert_eq!(stream.schema().fields().len(), 1);
+        assert_eq!(stream.schema().field(0).name(), "id");
+
+        let results: Vec<_> = stream.collect().await;
+        assert_eq!(results.len(), 0);
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn parquet_sink_write_insert_schema_into_metadata() -> Result<()> {
         // expected kv metadata without schema
@@ -1405,6 +1545,7 @@ mod tests {
             insert_op: InsertOp::Overwrite,
             keep_partition_by_columns: false,
             file_extension: "parquet".into(),
+            file_output_mode: FileOutputMode::Automatic,
         };
         let parquet_sink = Arc::new(ParquetSink::new(
             file_sink_config,
@@ -1421,7 +1562,7 @@ mod tests {
         // create data
         let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"]));
         let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"]));
-        let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap();
+        let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)])?;
 
         // write stream
         FileSink::write_all(
@@ -1437,7 +1578,7 @@ mod tests {
         Ok(parquet_sink)
     }
 
-    fn get_written(parquet_sink: Arc<ParquetSink>) -> Result<(Path, FileMetaData)> {
+    fn get_written(parquet_sink: Arc<ParquetSink>) -> Result<(Path, ParquetMetaData)> {
         let mut written = parquet_sink.written();
         let written = written.drain();
         assert_eq!(
@@ -1447,28 +1588,33 @@ mod tests {
             written.len()
         );
 
-        let (path, file_metadata) = written.take(1).next().unwrap();
-        Ok((path, file_metadata))
+        let (path, parquet_meta_data) = written.take(1).next().unwrap();
+        Ok((path, parquet_meta_data))
     }
 
-    fn assert_file_metadata(file_metadata: FileMetaData, expected_kv: &Vec<KeyValue>) {
-        let FileMetaData {
-            num_rows,
-            schema,
-            key_value_metadata,
-            ..
-        } = file_metadata;
-        assert_eq!(num_rows, 2, "file metadata to have 2 rows");
+    fn assert_file_metadata(
+        parquet_meta_data: ParquetMetaData,
+        expected_kv: &Vec<KeyValue>,
+    ) {
+        let file_metadata = parquet_meta_data.file_metadata();
+        let schema_descr = file_metadata.schema_descr();
+        assert_eq!(file_metadata.num_rows(), 2, "file metadata to have 2 rows");
         assert!(
-            schema.iter().any(|col_schema| col_schema.name == "a"),
+            schema_descr
+                .columns()
+                .iter()
+                .any(|col_schema| col_schema.name() == "a"),
             "output file metadata should contain col a"
         );
         assert!(
-            schema.iter().any(|col_schema| col_schema.name == "b"),
+            schema_descr
+                .columns()
+                .iter()
+                .any(|col_schema| col_schema.name() == "b"),
             "output file metadata should contain col b"
         );
 
-        let mut key_value_metadata = key_value_metadata.unwrap();
+        let mut key_value_metadata = file_metadata.key_value_metadata().unwrap().clone();
         key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key));
         assert_eq!(&key_value_metadata, expected_kv);
     }
@@ -1491,6 +1637,7 @@ mod tests {
             insert_op: InsertOp::Overwrite,
             keep_partition_by_columns: false,
             file_extension: "parquet".into(),
+            file_output_mode: FileOutputMode::Automatic,
         };
         let parquet_sink = Arc::new(ParquetSink::new(
             file_sink_config,
@@ -1500,7 +1647,7 @@ mod tests {
         // create data with 2 partitions
         let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"]));
         let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"]));
-        let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap();
+        let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)])?;
 
         // write stream
         FileSink::write_all(
@@ -1525,13 +1672,11 @@ mod tests {
 
         // check the file metadata includes partitions
         let mut expected_partitions = std::collections::HashSet::from(["a=foo", "a=bar"]);
-        for (
-            path,
-            FileMetaData {
-                num_rows, schema, ..
-            },
-        ) in written.take(2)
-        {
+        for (path, parquet_metadata) in written.take(2) {
+            let file_metadata = parquet_metadata.file_metadata();
+            let schema = file_metadata.schema_descr();
+            let num_rows = file_metadata.num_rows();
+
             let path_parts = path.parts().collect::<Vec<_>>();
             assert_eq!(path_parts.len(), 2, "should have path prefix");
 
@@ -1544,11 +1689,17 @@ mod tests {
 
             assert_eq!(num_rows, 1, "file metadata to have 1 row");
             assert!(
-                !schema.iter().any(|col_schema| col_schema.name == "a"),
+                !schema
+                    .columns()
+                    .iter()
+                    .any(|col_schema| col_schema.name() == "a"),
                 "output file metadata will not contain partitioned col a"
             );
             assert!(
-                schema.iter().any(|col_schema| col_schema.name == "b"),
+                schema
+                    .columns()
+                    .iter()
+                    .any(|col_schema| col_schema.name() == "b"),
                 "output file metadata should contain col b"
             );
         }
@@ -1577,6 +1728,7 @@ mod tests {
                 insert_op: InsertOp::Overwrite,
                 keep_partition_by_columns: false,
                 file_extension: "parquet".into(),
+                file_output_mode: FileOutputMode::Automatic,
             };
             let parquet_sink = Arc::new(ParquetSink::new(
                 file_sink_config,
@@ -1593,8 +1745,7 @@ mod tests {
             // create data
             let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"]));
             let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"]));
-            let batch =
-                RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap();
+            let batch = RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)])?;
 
             // create task context
             let task_context = build_ctx(object_store_url.as_ref());
@@ -1662,43 +1813,4 @@ mod tests {
 
         Ok(())
     }
-
-    /// Creates an bounded stream for testing purposes.
-    fn bounded_stream(
-        batch: RecordBatch,
-        limit: usize,
-    ) -> datafusion_execution::SendableRecordBatchStream {
-        Box::pin(BoundedStream {
-            count: 0,
-            limit,
-            batch,
-        })
-    }
-
-    struct BoundedStream {
-        limit: usize,
-        count: usize,
-        batch: RecordBatch,
-    }
-
-    impl Stream for BoundedStream {
-        type Item = Result<RecordBatch>;
-
-        fn poll_next(
-            mut self: Pin<&mut Self>,
-            _cx: &mut Context<'_>,
-        ) -> Poll<Option<Self::Item>> {
-            if self.count >= self.limit {
-                return Poll::Ready(None);
-            }
-            self.count += 1;
-            Poll::Ready(Some(Ok(self.batch.clone())))
-        }
-    }
-
-    impl RecordBatchStream for BoundedStream {
-        fn schema(&self) -> SchemaRef {
-            self.batch.schema()
-        }
-    }
 }
diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs
index a58db55bccb61..85dee3f91cffb 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -20,7 +20,9 @@
 
 mod table;
 pub use datafusion_catalog_listing::helpers;
-pub use datafusion_datasource::{
-    FileRange, ListingTableUrl, PartitionedFile, PartitionedFileStream,
-};
-pub use table::{ListingOptions, ListingTable, ListingTableConfig};
+pub use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig};
+// Keep for backwards compatibility until removed
+#[expect(deprecated)]
+pub use datafusion_datasource::PartitionedFileStream;
+pub use datafusion_datasource::{FileRange, ListingTableUrl, PartitionedFile};
+pub use table::ListingTableConfigExt;
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 3c87d3ee2329c..5dd11739c1f57 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -15,144 +15,42 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! The table implementation.
-
-use super::helpers::{expr_applicable_for_cols, pruned_partition_list};
-use super::{ListingTableUrl, PartitionedFile};
+use crate::execution::SessionState;
+use async_trait::async_trait;
+use datafusion_catalog_listing::{ListingOptions, ListingTableConfig};
+use datafusion_common::{config_datafusion_err, internal_datafusion_err};
+use datafusion_session::Session;
+use futures::StreamExt;
 use std::collections::HashMap;
-use std::{any::Any, str::FromStr, sync::Arc};
-
-use crate::datasource::{
-    create_ordering,
-    file_format::{file_compression_type::FileCompressionType, FileFormat},
-    physical_plan::FileSinkConfig,
-};
-use crate::execution::context::SessionState;
-use datafusion_catalog::TableProvider;
-use datafusion_common::{config_err, DataFusionError, Result};
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
-use datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory;
-use datafusion_execution::config::SessionConfig;
-use datafusion_expr::dml::InsertOp;
-use datafusion_expr::{Expr, TableProviderFilterPushDown};
-use datafusion_expr::{SortExpr, TableType};
-use datafusion_physical_plan::empty::EmptyExec;
-use datafusion_physical_plan::{ExecutionPlan, Statistics};
-
-use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef};
-use datafusion_common::{
-    config_datafusion_err, internal_err, plan_err, project_schema, Constraints, SchemaExt,
-};
-use datafusion_execution::cache::{
-    cache_manager::FileStatisticsCache, cache_unit::DefaultFileStatisticsCache,
-};
-use datafusion_physical_expr::{LexOrdering, PhysicalSortRequirement};
 
-use async_trait::async_trait;
-use datafusion_catalog::Session;
-use datafusion_common::stats::Precision;
-use datafusion_datasource::compute_all_files_statistics;
-use datafusion_datasource::file_groups::FileGroup;
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
-use futures::{future, stream, Stream, StreamExt, TryStreamExt};
-use itertools::Itertools;
-use object_store::ObjectStore;
-
-/// Configuration for creating a [`ListingTable`]
-///
+/// Extension trait for [`ListingTableConfig`] that supports inferring schemas
 ///
-#[derive(Debug, Clone)]
-pub struct ListingTableConfig {
-    /// Paths on the `ObjectStore` for creating `ListingTable`.
-    /// They should share the same schema and object store.
-    pub table_paths: Vec<ListingTableUrl>,
-    /// Optional `SchemaRef` for the to be created `ListingTable`.
-    ///
-    /// See details on [`ListingTableConfig::with_schema`]
-    pub file_schema: Option<SchemaRef>,
-    /// Optional [`ListingOptions`] for the to be created [`ListingTable`].
-    ///
-    /// See details on [`ListingTableConfig::with_listing_options`]
-    pub options: Option<ListingOptions>,
-}
-
-impl ListingTableConfig {
-    /// Creates new [`ListingTableConfig`] for reading the specified URL
-    pub fn new(table_path: ListingTableUrl) -> Self {
-        let table_paths = vec![table_path];
-        Self {
-            table_paths,
-            file_schema: None,
-            options: None,
-        }
-    }
-
-    /// Creates new [`ListingTableConfig`] with multiple table paths.
-    ///
-    /// See [`Self::infer_options`] for details on what happens with multiple paths
-    pub fn new_with_multi_paths(table_paths: Vec<ListingTableUrl>) -> Self {
-        Self {
-            table_paths,
-            file_schema: None,
-            options: None,
-        }
-    }
-    /// Set the `schema` for the overall [`ListingTable`]
-    ///
-    /// [`ListingTable`] will automatically coerce, when possible, the schema
-    /// for individual files to match this schema.
-    ///
-    /// If a schema is not provided, it is inferred using
-    /// [`Self::infer_schema`].
-    ///
-    /// If the schema is provided, it must contain only the fields in the file
-    /// without the table partitioning columns.
-    pub fn with_schema(self, schema: SchemaRef) -> Self {
-        Self {
-            table_paths: self.table_paths,
-            file_schema: Some(schema),
-            options: self.options,
-        }
-    }
-
-    /// Add `listing_options` to [`ListingTableConfig`]
-    ///
-    /// If not provided, format and other options are inferred via
-    /// [`Self::infer_options`].
-    pub fn with_listing_options(self, listing_options: ListingOptions) -> Self {
-        Self {
-            table_paths: self.table_paths,
-            file_schema: self.file_schema,
-            options: Some(listing_options),
-        }
-    }
-
-    /// Returns a tuple of `(file_extension, optional compression_extension)`
-    ///
-    /// For example a path ending with blah.test.csv.gz returns `("csv", Some("gz"))`
-    /// For example a path ending with blah.test.csv returns `("csv", None)`
-    fn infer_file_extension_and_compression_type(
-        path: &str,
-    ) -> Result<(String, Option<String>)> {
-        let mut exts = path.rsplit('.');
-
-        let splitted = exts.next().unwrap_or("");
-
-        let file_compression_type = FileCompressionType::from_str(splitted)
-            .unwrap_or(FileCompressionType::UNCOMPRESSED);
-
-        if file_compression_type.is_compressed() {
-            let splitted2 = exts.next().unwrap_or("");
-            Ok((splitted2.to_string(), Some(splitted.to_string())))
-        } else {
-            Ok((splitted.to_string(), None))
-        }
-    }
-
+/// This trait exists because the following inference methods only
+/// work for [`SessionState`] implementations of [`Session`].
+/// See [`ListingTableConfig`] for the remaining inference methods.
+#[async_trait]
+pub trait ListingTableConfigExt {
     /// Infer `ListingOptions` based on `table_path` and file suffix.
     ///
     /// The format is inferred based on the first `table_path`.
-    pub async fn infer_options(self, state: &dyn Session) -> Result<Self> {
+    async fn infer_options(
+        self,
+        state: &dyn Session,
+    ) -> datafusion_common::Result<ListingTableConfig>;
+
+    /// Convenience method to call both [`Self::infer_options`] and [`ListingTableConfig::infer_schema`]
+    async fn infer(
+        self,
+        state: &dyn Session,
+    ) -> datafusion_common::Result<ListingTableConfig>;
+}
+
+#[async_trait]
+impl ListingTableConfigExt for ListingTableConfig {
+    async fn infer_options(
+        self,
+        state: &dyn Session,
+    ) -> datafusion_common::Result<ListingTableConfig> {
         let store = if let Some(url) = self.table_paths.first() {
             state.runtime_env().object_store(url)?
         } else {
@@ -167,7 +65,7 @@ impl ListingTableConfig {
             .await?
             .next()
             .await
-            .ok_or_else(|| DataFusionError::Internal("No files for table".into()))??;
+            .ok_or_else(|| internal_datafusion_err!("No files for table"))??;
 
         let (file_extension, maybe_compression_type) =
             ListingTableConfig::infer_file_extension_and_compression_type(
@@ -199,1099 +97,136 @@ impl ListingTableConfig {
             .with_target_partitions(state.config().target_partitions())
             .with_collect_stat(state.config().collect_statistics());
 
-        Ok(Self {
-            table_paths: self.table_paths,
-            file_schema: self.file_schema,
-            options: Some(listing_options),
-        })
+        Ok(self.with_listing_options(listing_options))
     }
 
-    /// Infer the [`SchemaRef`] based on `table_path`s.
-    ///
-    /// This method infers the table schema using the first `table_path`.
-    /// See [`ListingOptions::infer_schema`] for more details
-    ///
-    /// # Errors
-    /// * if `self.options` is not set. See [`Self::with_listing_options`]
-    pub async fn infer_schema(self, state: &dyn Session) -> Result<Self> {
-        match self.options {
-            Some(options) => {
-                let schema = if let Some(url) = self.table_paths.first() {
-                    options.infer_schema(state, url).await?
-                } else {
-                    Arc::new(Schema::empty())
-                };
-
-                Ok(Self {
-                    table_paths: self.table_paths,
-                    file_schema: Some(schema),
-                    options: Some(options),
-                })
-            }
-            None => internal_err!("No `ListingOptions` set for inferring schema"),
-        }
-    }
-
-    /// Convenience method to call both [`Self::infer_options`] and [`Self::infer_schema`]
-    pub async fn infer(self, state: &dyn Session) -> Result<Self> {
+    async fn infer(self, state: &dyn Session) -> datafusion_common::Result<Self> {
         self.infer_options(state).await?.infer_schema(state).await
     }
-
-    /// Infer the partition columns from `table_paths`.
-    ///
-    /// # Errors
-    /// * if `self.options` is not set. See [`Self::with_listing_options`]
-    pub async fn infer_partitions_from_path(self, state: &dyn Session) -> Result<Self> {
-        match self.options {
-            Some(options) => {
-                let Some(url) = self.table_paths.first() else {
-                    return config_err!("No table path found");
-                };
-                let partitions = options
-                    .infer_partitions(state, url)
-                    .await?
-                    .into_iter()
-                    .map(|col_name| {
-                        (
-                            col_name,
-                            DataType::Dictionary(
-                                Box::new(DataType::UInt16),
-                                Box::new(DataType::Utf8),
-                            ),
-                        )
-                    })
-                    .collect::<Vec<_>>();
-                let options = options.with_table_partition_cols(partitions);
-                Ok(Self {
-                    table_paths: self.table_paths,
-                    file_schema: self.file_schema,
-                    options: Some(options),
-                })
-            }
-            None => config_err!("No `ListingOptions` set for inferring schema"),
-        }
-    }
 }
 
-/// Options for creating a [`ListingTable`]
-#[derive(Clone, Debug)]
-pub struct ListingOptions {
-    /// A suffix on which files should be filtered (leave empty to
-    /// keep all files on the path)
-    pub file_extension: String,
-    /// The file format
-    pub format: Arc<dyn FileFormat>,
-    /// The expected partition column names in the folder structure.
-    /// See [Self::with_table_partition_cols] for details
-    pub table_partition_cols: Vec<(String, DataType)>,
-    /// Set true to try to guess statistics from the files.
-    /// This can add a lot of overhead as it will usually require files
-    /// to be opened and at least partially parsed.
-    pub collect_stat: bool,
-    /// Group files to avoid that the number of partitions exceeds
-    /// this limit
-    pub target_partitions: usize,
-    /// Optional pre-known sort order(s). Must be `SortExpr`s.
-    ///
-    /// DataFusion may take advantage of this ordering to omit sorts
-    /// or use more efficient algorithms. Currently sortedness must be
-    /// provided if it is known by some external mechanism, but may in
-    /// the future be automatically determined, for example using
-    /// parquet metadata.
-    ///
-    /// See <https://github.com/apache/datafusion/issues/4177>
-    ///
-    /// NOTE: This attribute stores all equivalent orderings (the outer `Vec`)
-    ///       where each ordering consists of an individual lexicographic
-    ///       ordering (encapsulated by a `Vec<Expr>`). If there aren't
-    ///       multiple equivalent orderings, the outer `Vec` will have a
-    ///       single element.
-    pub file_sort_order: Vec<Vec<SortExpr>>,
-}
-
-impl ListingOptions {
-    /// Creates an options instance with the given format
-    /// Default values:
-    /// - use default file extension filter
-    /// - no input partition to discover
-    /// - one target partition
-    /// - do not collect statistics
-    pub fn new(format: Arc<dyn FileFormat>) -> Self {
-        Self {
-            file_extension: format.get_ext(),
-            format,
-            table_partition_cols: vec![],
-            collect_stat: false,
-            target_partitions: 1,
-            file_sort_order: vec![],
-        }
-    }
-
-    /// Set options from [`SessionConfig`] and returns self.
-    ///
-    /// Currently this sets `target_partitions` and `collect_stat`
-    /// but if more options are added in the future that need to be coordinated
-    /// they will be synchronized thorugh this method.
-    pub fn with_session_config_options(mut self, config: &SessionConfig) -> Self {
-        self = self.with_target_partitions(config.target_partitions());
-        self = self.with_collect_stat(config.collect_statistics());
-        self
-    }
-
-    /// Set file extension on [`ListingOptions`] and returns self.
-    ///
-    /// # Example
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use datafusion::prelude::SessionContext;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    ///
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_file_extension(".parquet");
-    ///
-    /// assert_eq!(listing_options.file_extension, ".parquet");
-    /// ```
-    pub fn with_file_extension(mut self, file_extension: impl Into<String>) -> Self {
-        self.file_extension = file_extension.into();
-        self
-    }
-
-    /// Optionally set file extension on [`ListingOptions`] and returns self.
-    ///
-    /// If `file_extension` is `None`, the file extension will not be changed
-    ///
-    /// # Example
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use datafusion::prelude::SessionContext;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    /// let extension = Some(".parquet");
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_file_extension_opt(extension);
-    ///
-    /// assert_eq!(listing_options.file_extension, ".parquet");
-    /// ```
-    pub fn with_file_extension_opt<S>(mut self, file_extension: Option<S>) -> Self
-    where
-        S: Into<String>,
-    {
-        if let Some(file_extension) = file_extension {
-            self.file_extension = file_extension.into();
-        }
-        self
-    }
-
-    /// Set `table partition columns` on [`ListingOptions`] and returns self.
-    ///
-    /// "partition columns," used to support [Hive Partitioning], are
-    /// columns added to the data that is read, based on the folder
-    /// structure where the data resides.
-    ///
-    /// For example, give the following files in your filesystem:
-    ///
-    /// ```text
-    /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
-    /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
-    /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
-    /// ```
-    ///
-    /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
-    /// columns "year" and "month" will include new `year` and `month`
-    /// columns while reading the files. The `year` column would have
-    /// value `2022` and the `month` column would have value `01` for
-    /// the rows read from
-    /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
-    ///
-    ///# Notes
-    ///
-    /// - If only one level (e.g. `year` in the example above) is
-    ///   specified, the other levels are ignored but the files are
-    ///   still read.
-    ///
-    /// - Files that don't follow this partitioning scheme will be
-    ///   ignored.
-    ///
-    /// - Since the columns have the same value for all rows read from
-    ///   each individual file (such as dates), they are typically
-    ///   dictionary encoded for efficiency. You may use
-    ///   [`wrap_partition_type_in_dict`] to request a
-    ///   dictionary-encoded type.
-    ///
-    /// - The partition columns are solely extracted from the file path. Especially they are NOT part of the parquet files itself.
-    ///
-    /// # Example
-    ///
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use arrow::datatypes::DataType;
-    /// # use datafusion::prelude::col;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    ///
-    /// // listing options for files with paths such as  `/mnt/data/col_a=x/col_b=y/data.parquet`
-    /// // `col_a` and `col_b` will be included in the data read from those files
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_table_partition_cols(vec![("col_a".to_string(), DataType::Utf8),
-    ///       ("col_b".to_string(), DataType::Utf8)]);
-    ///
-    /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
-    ///     ("col_b".to_string(), DataType::Utf8)]);
-    /// ```
-    ///
-    /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
-    /// [`wrap_partition_type_in_dict`]: crate::datasource::physical_plan::wrap_partition_type_in_dict
-    pub fn with_table_partition_cols(
-        mut self,
-        table_partition_cols: Vec<(String, DataType)>,
-    ) -> Self {
-        self.table_partition_cols = table_partition_cols;
-        self
-    }
-
-    /// Set stat collection on [`ListingOptions`] and returns self.
-    ///
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    ///
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_collect_stat(true);
-    ///
-    /// assert_eq!(listing_options.collect_stat, true);
-    /// ```
-    pub fn with_collect_stat(mut self, collect_stat: bool) -> Self {
-        self.collect_stat = collect_stat;
-        self
-    }
-
-    /// Set number of target partitions on [`ListingOptions`] and returns self.
-    ///
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    ///
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_target_partitions(8);
-    ///
-    /// assert_eq!(listing_options.target_partitions, 8);
-    /// ```
-    pub fn with_target_partitions(mut self, target_partitions: usize) -> Self {
-        self.target_partitions = target_partitions;
-        self
-    }
-
-    /// Set file sort order on [`ListingOptions`] and returns self.
-    ///
-    /// ```
-    /// # use std::sync::Arc;
-    /// # use datafusion::prelude::col;
-    /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
-    ///
-    ///  // Tell datafusion that the files are sorted by column "a"
-    ///  let file_sort_order = vec![vec![
-    ///    col("a").sort(true, true)
-    ///  ]];
-    ///
-    /// let listing_options = ListingOptions::new(Arc::new(
-    ///     ParquetFormat::default()
-    ///   ))
-    ///   .with_file_sort_order(file_sort_order.clone());
-    ///
-    /// assert_eq!(listing_options.file_sort_order, file_sort_order);
-    /// ```
-    pub fn with_file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
-        self.file_sort_order = file_sort_order;
-        self
-    }
-
-    /// Infer the schema of the files at the given path on the provided object store.
-    ///
-    /// If the table_path contains one or more files (i.e. it is a directory /
-    /// prefix of files) their schema is merged by calling [`FileFormat::infer_schema`]
-    ///
-    /// Note: The inferred schema does not include any partitioning columns.
-    ///
-    /// This method is called as part of creating a [`ListingTable`].
-    pub async fn infer_schema<'a>(
-        &'a self,
-        state: &dyn Session,
-        table_path: &'a ListingTableUrl,
-    ) -> Result<SchemaRef> {
-        let store = state.runtime_env().object_store(table_path)?;
-
-        let files: Vec<_> = table_path
-            .list_all_files(state, store.as_ref(), &self.file_extension)
-            .await?
-            // Empty files cannot affect schema but may throw when trying to read for it
-            .try_filter(|object_meta| future::ready(object_meta.size > 0))
-            .try_collect()
-            .await?;
-
-        let schema = self.format.infer_schema(state, &store, &files).await?;
-
-        Ok(schema)
-    }
-
-    /// Infers the partition columns stored in `LOCATION` and compares
-    /// them with the columns provided in `PARTITIONED BY` to help prevent
-    /// accidental corrupts of partitioned tables.
-    ///
-    /// Allows specifying partial partitions.
-    pub async fn validate_partitions(
-        &self,
-        state: &dyn Session,
-        table_path: &ListingTableUrl,
-    ) -> Result<()> {
-        if self.table_partition_cols.is_empty() {
-            return Ok(());
-        }
-
-        if !table_path.is_collection() {
-            return plan_err!(
-                "Can't create a partitioned table backed by a single file, \
-                perhaps the URL is missing a trailing slash?"
-            );
-        }
-
-        let inferred = self.infer_partitions(state, table_path).await?;
-
-        // no partitioned files found on disk
-        if inferred.is_empty() {
-            return Ok(());
-        }
-
-        let table_partition_names = self
-            .table_partition_cols
-            .iter()
-            .map(|(col_name, _)| col_name.clone())
-            .collect_vec();
-
-        if inferred.len() < table_partition_names.len() {
-            return plan_err!(
-                "Inferred partitions to be {:?}, but got {:?}",
-                inferred,
-                table_partition_names
-            );
-        }
-
-        // match prefix to allow creating tables with partial partitions
-        for (idx, col) in table_partition_names.iter().enumerate() {
-            if &inferred[idx] != col {
-                return plan_err!(
-                    "Inferred partitions to be {:?}, but got {:?}",
-                    inferred,
-                    table_partition_names
-                );
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Infer the partitioning at the given path on the provided object store.
-    /// For performance reasons, it doesn't read all the files on disk
-    /// and therefore may fail to detect invalid partitioning.
-    pub(crate) async fn infer_partitions(
-        &self,
-        state: &dyn Session,
-        table_path: &ListingTableUrl,
-    ) -> Result<Vec<String>> {
-        let store = state.runtime_env().object_store(table_path)?;
-
-        // only use 10 files for inference
-        // This can fail to detect inconsistent partition keys
-        // A DFS traversal approach of the store can help here
-        let files: Vec<_> = table_path
-            .list_all_files(state, store.as_ref(), &self.file_extension)
-            .await?
-            .take(10)
-            .try_collect()
-            .await?;
+#[cfg(test)]
+mod tests {
+    #[cfg(feature = "parquet")]
+    use crate::datasource::file_format::parquet::ParquetFormat;
+    use crate::datasource::listing::table::ListingTableConfigExt;
+    use crate::execution::options::JsonReadOptions;
+    use crate::prelude::*;
+    use crate::{
+        datasource::{
+            DefaultTableSource, MemTable, file_format::csv::CsvFormat,
+            file_format::json::JsonFormat, provider_as_source,
+        },
+        execution::options::ArrowReadOptions,
+        test::{
+            columns, object_store::ensure_head_concurrency,
+            object_store::make_test_store_and_state, object_store::register_test_store,
+        },
+    };
+    use arrow::{compute::SortOptions, record_batch::RecordBatch};
+    use arrow_schema::{DataType, Field, Schema, SchemaRef};
+    use datafusion_catalog::TableProvider;
+    use datafusion_catalog_listing::{
+        ListingOptions, ListingTable, ListingTableConfig, SchemaSource,
+    };
+    use datafusion_common::{
+        DataFusionError, Result, ScalarValue, assert_contains,
+        stats::Precision,
+        test_util::{batches_to_string, datafusion_test_data},
+    };
+    use datafusion_datasource::ListingTableUrl;
+    use datafusion_datasource::file_compression_type::FileCompressionType;
+    use datafusion_datasource::file_format::FileFormat;
+    use datafusion_expr::dml::InsertOp;
+    use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator};
+    use datafusion_physical_expr::PhysicalSortExpr;
+    use datafusion_physical_expr::expressions::binary;
+    use datafusion_physical_expr_common::sort_expr::LexOrdering;
+    use datafusion_physical_plan::empty::EmptyExec;
+    use datafusion_physical_plan::{ExecutionPlanProperties, collect};
+    use std::collections::HashMap;
+    use std::io::Write;
+    use std::sync::Arc;
+    use tempfile::TempDir;
+    use url::Url;
 
-        let stripped_path_parts = files.iter().map(|file| {
-            table_path
-                .strip_prefix(&file.location)
-                .unwrap()
-                .collect_vec()
-        });
-
-        let partition_keys = stripped_path_parts
-            .map(|path_parts| {
-                path_parts
-                    .into_iter()
-                    .rev()
-                    .skip(1) // get parents only; skip the file itself
-                    .rev()
-                    .map(|s| s.split('=').take(1).collect())
-                    .collect_vec()
-            })
-            .collect_vec();
-
-        match partition_keys.into_iter().all_equal_value() {
-            Ok(v) => Ok(v),
-            Err(None) => Ok(vec![]),
-            Err(Some(diff)) => {
-                let mut sorted_diff = [diff.0, diff.1];
-                sorted_diff.sort();
-                plan_err!("Found mixed partition values on disk {:?}", sorted_diff)
-            }
-        }
+    /// Creates a test schema with standard field types used in tests
+    fn create_test_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Float32, true),
+            Field::new("c2", DataType::Float64, true),
+            Field::new("c3", DataType::Boolean, true),
+            Field::new("c4", DataType::Utf8, true),
+        ]))
+    }
+
+    /// Helper function to generate test file paths with given prefix, count, and optional start index
+    fn generate_test_files(prefix: &str, count: usize) -> Vec<String> {
+        generate_test_files_with_start(prefix, count, 0)
+    }
+
+    /// Helper function to generate test file paths with given prefix, count, and start index
+    fn generate_test_files_with_start(
+        prefix: &str,
+        count: usize,
+        start_index: usize,
+    ) -> Vec<String> {
+        (start_index..start_index + count)
+            .map(|i| format!("{prefix}/file{i}"))
+            .collect()
     }
-}
-
-/// Reads data from one or more files as a single table.
-///
-/// Implements [`TableProvider`], a DataFusion data source. The files are read
-/// using an  [`ObjectStore`] instance, for example from local files or objects
-/// from AWS S3.
-///
-/// # Reading Directories
-/// For example, given the `table1` directory (or object store prefix)
-///
-/// ```text
-/// table1
-///  ├── file1.parquet
-///  └── file2.parquet
-/// ```
-///
-/// A `ListingTable` would read the files `file1.parquet` and `file2.parquet` as
-/// a single table, merging the schemas if the files have compatible but not
-/// identical schemas.
-///
-/// Given the `table2` directory (or object store prefix)
-///
-/// ```text
-/// table2
-///  ├── date=2024-06-01
-///  │    ├── file3.parquet
-///  │    └── file4.parquet
-///  └── date=2024-06-02
-///       └── file5.parquet
-/// ```
-///
-/// A `ListingTable` would read the files `file3.parquet`, `file4.parquet`, and
-/// `file5.parquet` as a single table, again merging schemas if necessary.
-///
-/// Given the hive style partitioning structure (e.g,. directories named
-/// `date=2024-06-01` and `date=2026-06-02`), `ListingTable` also adds a `date`
-/// column when reading the table:
-/// * The files in `table2/date=2024-06-01` will have the value `2024-06-01`
-/// * The files in `table2/date=2024-06-02` will have the value `2024-06-02`.
-///
-/// If the query has a predicate like `WHERE date = '2024-06-01'`
-/// only the corresponding directory will be read.
-///
-/// `ListingTable` also supports limit, filter and projection pushdown for formats that
-/// support it as such as Parquet.
-///
-/// # See Also
-///
-/// 1. [`ListingTableConfig`]: Configuration options
-/// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable`
-///
-/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
-///
-/// # Example: Read a directory of parquet files using a [`ListingTable`]
-///
-/// ```no_run
-/// # use datafusion::prelude::SessionContext;
-/// # use datafusion::error::Result;
-/// # use std::sync::Arc;
-/// # use datafusion::datasource::{
-/// #   listing::{
-/// #      ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
-/// #   },
-/// #   file_format::parquet::ParquetFormat,
-/// # };
-/// # #[tokio::main]
-/// # async fn main() -> Result<()> {
-/// let ctx = SessionContext::new();
-/// let session_state = ctx.state();
-/// let table_path = "/path/to/parquet";
-///
-/// // Parse the path
-/// let table_path = ListingTableUrl::parse(table_path)?;
-///
-/// // Create default parquet options
-/// let file_format = ParquetFormat::new();
-/// let listing_options = ListingOptions::new(Arc::new(file_format))
-///   .with_file_extension(".parquet");
-///
-/// // Resolve the schema
-/// let resolved_schema = listing_options
-///    .infer_schema(&session_state, &table_path)
-///    .await?;
-///
-/// let config = ListingTableConfig::new(table_path)
-///   .with_listing_options(listing_options)
-///   .with_schema(resolved_schema);
-///
-/// // Create a new TableProvider
-/// let provider = Arc::new(ListingTable::try_new(config)?);
-///
-/// // This provider can now be read as a dataframe:
-/// let df = ctx.read_table(provider.clone());
-///
-/// // or registered as a named table:
-/// ctx.register_table("my_table", provider);
-///
-/// # Ok(())
-/// # }
-/// ```
-#[derive(Debug, Clone)]
-pub struct ListingTable {
-    table_paths: Vec<ListingTableUrl>,
-    /// `file_schema` contains only the columns physically stored in the data files themselves.
-    ///     - Represents the actual fields found in files like Parquet, CSV, etc.
-    ///     - Used when reading the raw data from files
-    file_schema: SchemaRef,
-    /// `table_schema` combines `file_schema` + partition columns
-    ///     - Partition columns are derived from directory paths (not stored in files)
-    ///     - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet`
-    table_schema: SchemaRef,
-    options: ListingOptions,
-    definition: Option<String>,
-    collected_statistics: FileStatisticsCache,
-    constraints: Constraints,
-    column_defaults: HashMap<String, Expr>,
-}
-
-impl ListingTable {
-    /// Create new [`ListingTable`]
-    ///
-    /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
-    pub fn try_new(config: ListingTableConfig) -> Result<Self> {
-        let file_schema = config
-            .file_schema
-            .ok_or_else(|| DataFusionError::Internal("No schema provided.".into()))?;
-
-        let options = config.options.ok_or_else(|| {
-            DataFusionError::Internal("No ListingOptions provided".into())
-        })?;
-
-        // Add the partition columns to the file schema
-        let mut builder = SchemaBuilder::from(file_schema.as_ref().to_owned());
-        for (part_col_name, part_col_type) in &options.table_partition_cols {
-            builder.push(Field::new(part_col_name, part_col_type.clone(), false));
-        }
 
-        let table_schema = Arc::new(
-            builder
-                .finish()
-                .with_metadata(file_schema.metadata().clone()),
+    #[tokio::test]
+    async fn test_schema_source_tracking_comprehensive() -> Result<()> {
+        let ctx = SessionContext::new();
+        let testdata = datafusion_test_data();
+        let filename = format!("{testdata}/aggregate_simple.csv");
+        let table_path = ListingTableUrl::parse(filename)?;
+
+        // Test default schema source
+        let format = CsvFormat::default();
+        let options = ListingOptions::new(Arc::new(format));
+        let config =
+            ListingTableConfig::new(table_path.clone()).with_listing_options(options);
+        assert_eq!(config.schema_source(), SchemaSource::Unset);
+
+        // Test schema source after setting a schema explicitly
+        let provided_schema = create_test_schema();
+        let config_with_schema = config.clone().with_schema(provided_schema.clone());
+        assert_eq!(config_with_schema.schema_source(), SchemaSource::Specified);
+
+        // Test schema source after inferring schema
+        assert_eq!(config.schema_source(), SchemaSource::Unset);
+
+        let config_with_inferred = config.infer_schema(&ctx.state()).await?;
+        assert_eq!(config_with_inferred.schema_source(), SchemaSource::Inferred);
+
+        // Test schema preservation through operations
+        let config_with_schema_and_options = config_with_schema.clone();
+        assert_eq!(
+            config_with_schema_and_options.schema_source(),
+            SchemaSource::Specified
         );
 
-        let table = Self {
-            table_paths: config.table_paths,
-            file_schema,
-            table_schema,
-            options,
-            definition: None,
-            collected_statistics: Arc::new(DefaultFileStatisticsCache::default()),
-            constraints: Constraints::empty(),
-            column_defaults: HashMap::new(),
-        };
-
-        Ok(table)
-    }
-
-    /// Assign constraints
-    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
-        self.constraints = constraints;
-        self
-    }
-
-    /// Assign column defaults
-    pub fn with_column_defaults(
-        mut self,
-        column_defaults: HashMap<String, Expr>,
-    ) -> Self {
-        self.column_defaults = column_defaults;
-        self
-    }
-
-    /// Set the [`FileStatisticsCache`] used to cache parquet file statistics.
-    ///
-    /// Setting a statistics cache on the `SessionContext` can avoid refetching statistics
-    /// multiple times in the same session.
-    ///
-    /// If `None`, creates a new [`DefaultFileStatisticsCache`] scoped to this query.
-    pub fn with_cache(mut self, cache: Option<FileStatisticsCache>) -> Self {
-        self.collected_statistics =
-            cache.unwrap_or_else(|| Arc::new(DefaultFileStatisticsCache::default()));
-        self
-    }
-
-    /// Specify the SQL definition for this table, if any
-    pub fn with_definition(mut self, definition: Option<String>) -> Self {
-        self.definition = definition;
-        self
-    }
-
-    /// Get paths ref
-    pub fn table_paths(&self) -> &Vec<ListingTableUrl> {
-        &self.table_paths
-    }
-
-    /// Get options ref
-    pub fn options(&self) -> &ListingOptions {
-        &self.options
-    }
-
-    /// If file_sort_order is specified, creates the appropriate physical expressions
-    fn try_create_output_ordering(&self) -> Result<Vec<LexOrdering>> {
-        create_ordering(&self.table_schema, &self.options.file_sort_order)
-    }
-}
-
-// Expressions can be used for parttion pruning if they can be evaluated using
-// only the partiton columns and there are partition columns.
-fn can_be_evaluted_for_partition_pruning(
-    partition_column_names: &[&str],
-    expr: &Expr,
-) -> bool {
-    !partition_column_names.is_empty()
-        && expr_applicable_for_cols(partition_column_names, expr)
-}
-
-#[async_trait]
-impl TableProvider for ListingTable {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.table_schema)
-    }
-
-    fn constraints(&self) -> Option<&Constraints> {
-        Some(&self.constraints)
-    }
-
-    fn table_type(&self) -> TableType {
-        TableType::Base
-    }
-
-    async fn scan(
-        &self,
-        state: &dyn Session,
-        projection: Option<&Vec<usize>>,
-        filters: &[Expr],
-        limit: Option<usize>,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        // extract types of partition columns
-        let table_partition_cols = self
-            .options
-            .table_partition_cols
-            .iter()
-            .map(|col| Ok(self.table_schema.field_with_name(&col.0)?.clone()))
-            .collect::<Result<Vec<_>>>()?;
-
-        let table_partition_col_names = table_partition_cols
-            .iter()
-            .map(|field| field.name().as_str())
-            .collect::<Vec<_>>();
-        // If the filters can be resolved using only partition cols, there is no need to
-        // pushdown it to TableScan, otherwise, `unhandled` pruning predicates will be generated
-        let (partition_filters, filters): (Vec<_>, Vec<_>) =
-            filters.iter().cloned().partition(|filter| {
-                can_be_evaluted_for_partition_pruning(&table_partition_col_names, filter)
-            });
-
-        // We should not limit the number of partitioned files to scan if there are filters and limit
-        // at the same time. This is because the limit should be applied after the filters are applied.
-        let statistic_file_limit = if filters.is_empty() { limit } else { None };
-
-        let (mut partitioned_file_lists, statistics) = self
-            .list_files_for_scan(state, &partition_filters, statistic_file_limit)
+        // Make sure inferred schema doesn't override specified schema
+        let config_with_schema_and_infer = config_with_schema_and_options
+            .clone()
+            .infer(&ctx.state())
             .await?;
+        assert_eq!(
+            config_with_schema_and_infer.schema_source(),
+            SchemaSource::Specified
+        );
 
-        // if no files need to be read, return an `EmptyExec`
-        if partitioned_file_lists.is_empty() {
-            let projected_schema = project_schema(&self.schema(), projection)?;
-            return Ok(Arc::new(EmptyExec::new(projected_schema)));
-        }
-
-        let output_ordering = self.try_create_output_ordering()?;
-        match state
-            .config_options()
-            .execution
-            .split_file_groups_by_statistics
-            .then(|| {
-                output_ordering.first().map(|output_ordering| {
-                    FileScanConfig::split_groups_by_statistics_with_target_partitions(
-                        &self.table_schema,
-                        &partitioned_file_lists,
-                        output_ordering,
-                        self.options.target_partitions,
-                    )
-                })
-            })
-            .flatten()
-        {
-            Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"),
-            Some(Ok(new_groups)) => {
-                if new_groups.len() <= self.options.target_partitions {
-                    partitioned_file_lists = new_groups;
-                } else {
-                    log::debug!("attempted to split file groups by statistics, but there were more file groups than target_partitions; falling back to unordered")
-                }
-            }
-            None => {} // no ordering required
-        };
-
-        let Some(object_store_url) =
-            self.table_paths.first().map(ListingTableUrl::object_store)
-        else {
-            return Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty()))));
-        };
-
-        // create the execution plan
-        self.options
-            .format
-            .create_physical_plan(
-                state,
-                FileScanConfigBuilder::new(
-                    object_store_url,
-                    Arc::clone(&self.file_schema),
-                    self.options.format.file_source(),
-                )
-                .with_file_groups(partitioned_file_lists)
-                .with_constraints(self.constraints.clone())
-                .with_statistics(statistics)
-                .with_projection(projection.cloned())
-                .with_limit(limit)
-                .with_output_ordering(output_ordering)
-                .with_table_partition_cols(table_partition_cols)
-                .build(),
-            )
-            .await
-    }
-
-    fn supports_filters_pushdown(
-        &self,
-        filters: &[&Expr],
-    ) -> Result<Vec<TableProviderFilterPushDown>> {
-        let partition_column_names = self
-            .options
-            .table_partition_cols
-            .iter()
-            .map(|col| col.0.as_str())
-            .collect::<Vec<_>>();
-        filters
-            .iter()
-            .map(|filter| {
-                if can_be_evaluted_for_partition_pruning(&partition_column_names, filter)
-                {
-                    // if filter can be handled by partition pruning, it is exact
-                    return Ok(TableProviderFilterPushDown::Exact);
-                }
-
-                Ok(TableProviderFilterPushDown::Inexact)
-            })
-            .collect()
-    }
-
-    fn get_table_definition(&self) -> Option<&str> {
-        self.definition.as_deref()
-    }
-
-    async fn insert_into(
-        &self,
-        state: &dyn Session,
-        input: Arc<dyn ExecutionPlan>,
-        insert_op: InsertOp,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        // Check that the schema of the plan matches the schema of this table.
-        self.schema()
-            .logically_equivalent_names_and_types(&input.schema())?;
-
-        let table_path = &self.table_paths()[0];
-        if !table_path.is_collection() {
-            return plan_err!(
-                "Inserting into a ListingTable backed by a single file is not supported, URL is possibly missing a trailing `/`. \
-                To append to an existing file use StreamTable, e.g. by using CREATE UNBOUNDED EXTERNAL TABLE"
-            );
-        }
-
-        // Get the object store for the table path.
-        let store = state.runtime_env().object_store(table_path)?;
-
-        let file_list_stream = pruned_partition_list(
-            state,
-            store.as_ref(),
-            table_path,
-            &[],
-            &self.options.file_extension,
-            &self.options.table_partition_cols,
-        )
-        .await?;
-
-        let file_group = file_list_stream.try_collect::<Vec<_>>().await?.into();
-        let keep_partition_by_columns =
-            state.config_options().execution.keep_partition_by_columns;
-
-        // Sink related option, apart from format
-        let config = FileSinkConfig {
-            original_url: String::default(),
-            object_store_url: self.table_paths()[0].object_store(),
-            table_paths: self.table_paths().clone(),
-            file_group,
-            output_schema: self.schema(),
-            table_partition_cols: self.options.table_partition_cols.clone(),
-            insert_op,
-            keep_partition_by_columns,
-            file_extension: self.options().format.get_ext(),
-        };
-
-        let order_requirements = if !self.options().file_sort_order.is_empty() {
-            // Multiple sort orders in outer vec are equivalent, so we pass only the first one
-            let orderings = self.try_create_output_ordering()?;
-            let Some(ordering) = orderings.first() else {
-                return internal_err!(
-                    "Expected ListingTable to have a sort order, but none found!"
-                );
-            };
-            // Converts Vec<Vec<SortExpr>> into type required by execution plan to specify its required input ordering
-            Some(LexRequirement::new(
-                ordering
-                    .into_iter()
-                    .cloned()
-                    .map(PhysicalSortRequirement::from)
-                    .collect::<Vec<_>>(),
-            ))
-        } else {
-            None
-        };
-
-        self.options()
-            .format
-            .create_writer_physical_plan(input, state, config, order_requirements)
-            .await
-    }
-
-    fn get_column_default(&self, column: &str) -> Option<&Expr> {
-        self.column_defaults.get(column)
-    }
-}
-
-impl ListingTable {
-    /// Get the list of files for a scan as well as the file level statistics.
-    /// The list is grouped to let the execution plan know how the files should
-    /// be distributed to different threads / executors.
-    async fn list_files_for_scan<'a>(
-        &'a self,
-        ctx: &'a dyn Session,
-        filters: &'a [Expr],
-        limit: Option<usize>,
-    ) -> Result<(Vec<FileGroup>, Statistics)> {
-        let store = if let Some(url) = self.table_paths.first() {
-            ctx.runtime_env().object_store(url)?
-        } else {
-            return Ok((vec![], Statistics::new_unknown(&self.file_schema)));
-        };
-        // list files (with partitions)
-        let file_list = future::try_join_all(self.table_paths.iter().map(|table_path| {
-            pruned_partition_list(
-                ctx,
-                store.as_ref(),
-                table_path,
-                filters,
-                &self.options.file_extension,
-                &self.options.table_partition_cols,
-            )
-        }))
-        .await?;
-        let meta_fetch_concurrency =
-            ctx.config_options().execution.meta_fetch_concurrency;
-        let file_list = stream::iter(file_list).flatten_unordered(meta_fetch_concurrency);
-        // collect the statistics if required by the config
-        let files = file_list
-            .map(|part_file| async {
-                let part_file = part_file?;
-                let statistics = if self.options.collect_stat {
-                    self.do_collect_statistics(ctx, &store, &part_file).await?
-                } else {
-                    Arc::new(Statistics::new_unknown(&self.file_schema))
-                };
-                Ok(part_file.with_statistics(statistics))
-            })
-            .boxed()
-            .buffer_unordered(ctx.config_options().execution.meta_fetch_concurrency);
-
-        let (file_group, inexact_stats) =
-            get_files_with_limit(files, limit, self.options.collect_stat).await?;
-
-        let file_groups = file_group.split_files(self.options.target_partitions);
-        let (mut file_groups, mut stats) = compute_all_files_statistics(
-            file_groups,
-            self.schema(),
-            self.options.collect_stat,
-            inexact_stats,
-        )?;
-        let (schema_mapper, _) = DefaultSchemaAdapterFactory::from_schema(self.schema())
-            .map_schema(self.file_schema.as_ref())?;
-        stats.column_statistics =
-            schema_mapper.map_column_statistics(&stats.column_statistics)?;
-        file_groups.iter_mut().try_for_each(|file_group| {
-            if let Some(stat) = file_group.statistics_mut() {
-                stat.column_statistics =
-                    schema_mapper.map_column_statistics(&stat.column_statistics)?;
-            }
-            Ok::<_, DataFusionError>(())
-        })?;
-        Ok((file_groups, stats))
-    }
-
-    /// Collects statistics for a given partitioned file.
-    ///
-    /// This method first checks if the statistics for the given file are already cached.
-    /// If they are, it returns the cached statistics.
-    /// If they are not, it infers the statistics from the file and stores them in the cache.
-    async fn do_collect_statistics(
-        &self,
-        ctx: &dyn Session,
-        store: &Arc<dyn ObjectStore>,
-        part_file: &PartitionedFile,
-    ) -> Result<Arc<Statistics>> {
-        match self
-            .collected_statistics
-            .get_with_extra(&part_file.object_meta.location, &part_file.object_meta)
-        {
-            Some(statistics) => Ok(statistics),
-            None => {
-                let statistics = self
-                    .options
-                    .format
-                    .infer_stats(
-                        ctx,
-                        store,
-                        Arc::clone(&self.file_schema),
-                        &part_file.object_meta,
-                    )
-                    .await?;
-                let statistics = Arc::new(statistics);
-                self.collected_statistics.put_with_extra(
-                    &part_file.object_meta.location,
-                    Arc::clone(&statistics),
-                    &part_file.object_meta,
-                );
-                Ok(statistics)
-            }
-        }
-    }
-}
-
-/// Processes a stream of partitioned files and returns a `FileGroup` containing the files.
-///
-/// This function collects files from the provided stream until either:
-/// 1. The stream is exhausted
-/// 2. The accumulated number of rows exceeds the provided `limit` (if specified)
-///
-/// # Arguments
-/// * `files` - A stream of `Result<PartitionedFile>` items to process
-/// * `limit` - An optional row count limit. If provided, the function will stop collecting files
-///   once the accumulated number of rows exceeds this limit
-/// * `collect_stats` - Whether to collect and accumulate statistics from the files
-///
-/// # Returns
-/// A `Result` containing a `FileGroup` with the collected files
-/// and a boolean indicating whether the statistics are inexact.
-///
-/// # Note
-/// The function will continue processing files if statistics are not available or if the
-/// limit is not provided. If `collect_stats` is false, statistics won't be accumulated
-/// but files will still be collected.
-async fn get_files_with_limit(
-    files: impl Stream<Item = Result<PartitionedFile>>,
-    limit: Option<usize>,
-    collect_stats: bool,
-) -> Result<(FileGroup, bool)> {
-    let mut file_group = FileGroup::default();
-    // Fusing the stream allows us to call next safely even once it is finished.
-    let mut all_files = Box::pin(files.fuse());
-    enum ProcessingState {
-        ReadingFiles,
-        ReachedLimit,
-    }
-
-    let mut state = ProcessingState::ReadingFiles;
-    let mut num_rows = Precision::Absent;
-
-    while let Some(file_result) = all_files.next().await {
-        // Early exit if we've already reached our limit
-        if matches!(state, ProcessingState::ReachedLimit) {
-            break;
-        }
-
-        let file = file_result?;
-
-        // Update file statistics regardless of state
-        if collect_stats {
-            if let Some(file_stats) = &file.statistics {
-                num_rows = if file_group.is_empty() {
-                    // For the first file, just take its row count
-                    file_stats.num_rows
-                } else {
-                    // For subsequent files, accumulate the counts
-                    num_rows.add(&file_stats.num_rows)
-                };
-            }
-        }
+        // Verify sources in actual ListingTable objects
+        let table_specified = ListingTable::try_new(config_with_schema_and_options)?;
+        assert_eq!(table_specified.schema_source(), SchemaSource::Specified);
 
-        // Always add the file to our group
-        file_group.push(file);
+        let table_inferred = ListingTable::try_new(config_with_inferred)?;
+        assert_eq!(table_inferred.schema_source(), SchemaSource::Inferred);
 
-        // Check if we've hit the limit (if one was specified)
-        if let Some(limit) = limit {
-            if let Precision::Exact(row_count) = num_rows {
-                if row_count > limit {
-                    state = ProcessingState::ReachedLimit;
-                }
-            }
-        }
+        Ok(())
     }
-    // If we still have files in the stream, it means that the limit kicked
-    // in, and the statistic could have been different had we processed the
-    // files in a different order.
-    let inexact_stats = all_files.next().await.is_some();
-    Ok((file_group, inexact_stats))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::datasource::file_format::csv::CsvFormat;
-    use crate::datasource::file_format::json::JsonFormat;
-    #[cfg(feature = "parquet")]
-    use crate::datasource::file_format::parquet::ParquetFormat;
-    use crate::datasource::{provider_as_source, DefaultTableSource, MemTable};
-    use crate::execution::options::ArrowReadOptions;
-    use crate::prelude::*;
-    use crate::test::{columns, object_store::register_test_store};
-
-    use arrow::compute::SortOptions;
-    use arrow::record_batch::RecordBatch;
-    use datafusion_common::stats::Precision;
-    use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{assert_contains, ScalarValue};
-    use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator};
-    use datafusion_physical_expr::PhysicalSortExpr;
-    use datafusion_physical_plan::collect;
-    use datafusion_physical_plan::ExecutionPlanProperties;
-
-    use crate::test::object_store::{ensure_head_concurrency, make_test_store_and_state};
-    use tempfile::TempDir;
-    use url::Url;
 
     #[tokio::test]
     async fn read_single_file() -> Result<()> {
@@ -1316,86 +251,7 @@ mod tests {
         );
         assert_eq!(
             exec.partition_statistics(None)?.total_byte_size,
-            Precision::Exact(671)
-        );
-
-        Ok(())
-    }
-
-    #[cfg(feature = "parquet")]
-    #[tokio::test]
-    async fn do_not_load_table_stats_by_default() -> Result<()> {
-        use crate::datasource::file_format::parquet::ParquetFormat;
-
-        let testdata = crate::test_util::parquet_test_data();
-        let filename = format!("{}/{}", testdata, "alltypes_plain.parquet");
-        let table_path = ListingTableUrl::parse(filename).unwrap();
-
-        let ctx = SessionContext::new();
-        let state = ctx.state();
-
-        let opt = ListingOptions::new(Arc::new(ParquetFormat::default()));
-        let schema = opt.infer_schema(&state, &table_path).await?;
-        let config = ListingTableConfig::new(table_path.clone())
-            .with_listing_options(opt)
-            .with_schema(schema);
-        let table = ListingTable::try_new(config)?;
-
-        let exec = table.scan(&state, None, &[], None).await?;
-        assert_eq!(exec.partition_statistics(None)?.num_rows, Precision::Absent);
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-        assert_eq!(
-            exec.partition_statistics(None)?.total_byte_size,
-            Precision::Absent
-        );
-
-        let opt = ListingOptions::new(Arc::new(ParquetFormat::default()))
-            .with_collect_stat(true);
-        let schema = opt.infer_schema(&state, &table_path).await?;
-        let config = ListingTableConfig::new(table_path)
-            .with_listing_options(opt)
-            .with_schema(schema);
-        let table = ListingTable::try_new(config)?;
-
-        let exec = table.scan(&state, None, &[], None).await?;
-        assert_eq!(
-            exec.partition_statistics(None)?.num_rows,
-            Precision::Exact(8)
-        );
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-        assert_eq!(
-            exec.partition_statistics(None)?.total_byte_size,
-            Precision::Exact(671)
-        );
-
-        Ok(())
-    }
-
-    #[cfg(feature = "parquet")]
-    #[tokio::test]
-    async fn load_table_stats_when_no_stats() -> Result<()> {
-        use crate::datasource::file_format::parquet::ParquetFormat;
-
-        let testdata = crate::test_util::parquet_test_data();
-        let filename = format!("{}/{}", testdata, "alltypes_plain.parquet");
-        let table_path = ListingTableUrl::parse(filename).unwrap();
-
-        let ctx = SessionContext::new();
-        let state = ctx.state();
-
-        let opt = ListingOptions::new(Arc::new(ParquetFormat::default()))
-            .with_collect_stat(false);
-        let schema = opt.infer_schema(&state, &table_path).await?;
-        let config = ListingTableConfig::new(table_path)
-            .with_listing_options(opt)
-            .with_schema(schema);
-        let table = ListingTable::try_new(config)?;
-
-        let exec = table.scan(&state, None, &[], None).await?;
-        assert_eq!(exec.partition_statistics(None)?.num_rows, Precision::Absent);
-        assert_eq!(
-            exec.partition_statistics(None)?.total_byte_size,
-            Precision::Absent
+            Precision::Absent,
         );
 
         Ok(())
@@ -1415,31 +271,48 @@ mod tests {
 
         use crate::datasource::file_format::parquet::ParquetFormat;
         use datafusion_physical_plan::expressions::col as physical_col;
+        use datafusion_physical_plan::expressions::lit as physical_lit;
         use std::ops::Add;
 
         // (file_sort_order, expected_result)
         let cases = vec![
-            (vec![], Ok(vec![])),
+            (
+                vec![],
+                Ok::<Vec<LexOrdering>, DataFusionError>(Vec::<LexOrdering>::new()),
+            ),
             // sort expr, but non column
             (
-                vec![vec![
-                    col("int_col").add(lit(1)).sort(true, true),
-                ]],
-                Err("Expected single column reference in sort_order[0][0], got int_col + Int32(1)"),
+                vec![vec![col("int_col").add(lit(1)).sort(true, true)]],
+                Ok(vec![
+                    [PhysicalSortExpr {
+                        expr: binary(
+                            physical_col("int_col", &schema).unwrap(),
+                            Operator::Plus,
+                            physical_lit(1),
+                            &schema,
+                        )
+                        .unwrap(),
+                        options: SortOptions {
+                            descending: false,
+                            nulls_first: true,
+                        },
+                    }]
+                    .into(),
+                ]),
             ),
             // ok with one column
             (
                 vec![vec![col("string_col").sort(true, false)]],
-                Ok(vec![LexOrdering::new(
-                        vec![PhysicalSortExpr {
-                            expr: physical_col("string_col", &schema).unwrap(),
-                            options: SortOptions {
-                                descending: false,
-                                nulls_first: false,
-                            },
-                        }],
-                )
-                ])
+                Ok(vec![
+                    [PhysicalSortExpr {
+                        expr: physical_col("string_col", &schema).unwrap(),
+                        options: SortOptions {
+                            descending: false,
+                            nulls_first: false,
+                        },
+                    }]
+                    .into(),
+                ]),
             ),
             // ok with two columns, different options
             (
@@ -1447,17 +320,21 @@ mod tests {
                     col("string_col").sort(true, false),
                     col("int_col").sort(false, true),
                 ]],
-                Ok(vec![LexOrdering::new(
-                        vec![
-                            PhysicalSortExpr::new_default(physical_col("string_col", &schema).unwrap())
-                                        .asc()
-                                        .nulls_last(),
-                            PhysicalSortExpr::new_default(physical_col("int_col", &schema).unwrap())
-                                        .desc()
-                                        .nulls_first()
-                        ],
-                )
-                ])
+                Ok(vec![
+                    [
+                        PhysicalSortExpr::new_default(
+                            physical_col("string_col", &schema).unwrap(),
+                        )
+                        .asc()
+                        .nulls_last(),
+                        PhysicalSortExpr::new_default(
+                            physical_col("int_col", &schema).unwrap(),
+                        )
+                        .desc()
+                        .nulls_first(),
+                    ]
+                    .into(),
+                ]),
             ),
         ];
 
@@ -1470,7 +347,8 @@ mod tests {
 
             let table =
                 ListingTable::try_new(config.clone()).expect("Creating the table");
-            let ordering_result = table.try_create_output_ordering();
+            let ordering_result =
+                table.try_create_output_ordering(state.execution_props(), &[]);
 
             match (expected_result, ordering_result) {
                 (Ok(expected), Ok(result)) => {
@@ -1505,290 +383,33 @@ mod tests {
             .with_table_partition_cols(vec![(String::from("p1"), DataType::Utf8)])
             .with_target_partitions(4);
 
-        let table_path = ListingTableUrl::parse("test:///table/").unwrap();
+        let table_path = ListingTableUrl::parse("test:///table/")?;
         let file_schema =
             Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
         let config = ListingTableConfig::new(table_path)
             .with_listing_options(opt)
-            .with_schema(file_schema);
-        let table = ListingTable::try_new(config)?;
-
-        assert_eq!(
-            columns(&table.schema()),
-            vec!["a".to_owned(), "p1".to_owned()]
-        );
-
-        // this will filter out the only file in the store
-        let filter = Expr::not_eq(col("p1"), lit("v1"));
-
-        let scan = table
-            .scan(&ctx.state(), None, &[filter], None)
-            .await
-            .expect("Empty execution plan");
-
-        assert!(scan.as_any().is::<EmptyExec>());
-        assert_eq!(
-            columns(&scan.schema()),
-            vec!["a".to_owned(), "p1".to_owned()]
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_assert_list_files_for_scan_grouping() -> Result<()> {
-        // more expected partitions than files
-        assert_list_files_for_scan_grouping(
-            &[
-                "bucket/key-prefix/file0",
-                "bucket/key-prefix/file1",
-                "bucket/key-prefix/file2",
-                "bucket/key-prefix/file3",
-                "bucket/key-prefix/file4",
-            ],
-            "test:///bucket/key-prefix/",
-            12,
-            5,
-            Some(""),
-        )
-        .await?;
-
-        // as many expected partitions as files
-        assert_list_files_for_scan_grouping(
-            &[
-                "bucket/key-prefix/file0",
-                "bucket/key-prefix/file1",
-                "bucket/key-prefix/file2",
-                "bucket/key-prefix/file3",
-            ],
-            "test:///bucket/key-prefix/",
-            4,
-            4,
-            Some(""),
-        )
-        .await?;
-
-        // more files as expected partitions
-        assert_list_files_for_scan_grouping(
-            &[
-                "bucket/key-prefix/file0",
-                "bucket/key-prefix/file1",
-                "bucket/key-prefix/file2",
-                "bucket/key-prefix/file3",
-                "bucket/key-prefix/file4",
-            ],
-            "test:///bucket/key-prefix/",
-            2,
-            2,
-            Some(""),
-        )
-        .await?;
-
-        // no files => no groups
-        assert_list_files_for_scan_grouping(
-            &[],
-            "test:///bucket/key-prefix/",
-            2,
-            0,
-            Some(""),
-        )
-        .await?;
-
-        // files that don't match the prefix
-        assert_list_files_for_scan_grouping(
-            &[
-                "bucket/key-prefix/file0",
-                "bucket/key-prefix/file1",
-                "bucket/other-prefix/roguefile",
-            ],
-            "test:///bucket/key-prefix/",
-            10,
-            2,
-            Some(""),
-        )
-        .await?;
-
-        // files that don't match the prefix or the default file extention
-        assert_list_files_for_scan_grouping(
-            &[
-                "bucket/key-prefix/file0.json",
-                "bucket/key-prefix/file1.parquet",
-                "bucket/other-prefix/roguefile.json",
-            ],
-            "test:///bucket/key-prefix/",
-            10,
-            1,
-            None,
-        )
-        .await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_assert_list_files_for_multi_path() -> Result<()> {
-        // more expected partitions than files
-        assert_list_files_for_multi_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-                "bucket/key3/file5",
-            ],
-            &["test:///bucket/key1/", "test:///bucket/key2/"],
-            12,
-            5,
-            Some(""),
-        )
-        .await?;
-
-        // as many expected partitions as files
-        assert_list_files_for_multi_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-                "bucket/key3/file5",
-            ],
-            &["test:///bucket/key1/", "test:///bucket/key2/"],
-            5,
-            5,
-            Some(""),
-        )
-        .await?;
-
-        // more files as expected partitions
-        assert_list_files_for_multi_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-                "bucket/key3/file5",
-            ],
-            &["test:///bucket/key1/"],
-            2,
-            2,
-            Some(""),
-        )
-        .await?;
-
-        // no files => no groups
-        assert_list_files_for_multi_paths(&[], &["test:///bucket/key1/"], 2, 0, Some(""))
-            .await?;
-
-        // files that don't match the prefix
-        assert_list_files_for_multi_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-                "bucket/key3/file5",
-            ],
-            &["test:///bucket/key3/"],
-            2,
-            1,
-            Some(""),
-        )
-        .await?;
-
-        // files that don't match the prefix or the default file ext
-        assert_list_files_for_multi_paths(
-            &[
-                "bucket/key1/file0.json",
-                "bucket/key1/file1.csv",
-                "bucket/key1/file2.json",
-                "bucket/key2/file3.csv",
-                "bucket/key2/file4.json",
-                "bucket/key3/file5.csv",
-            ],
-            &["test:///bucket/key1/", "test:///bucket/key3/"],
-            2,
-            2,
-            None,
-        )
-        .await?;
-        Ok(())
-    }
+            .with_schema(file_schema);
+        let table = ListingTable::try_new(config)?;
 
-    #[tokio::test]
-    async fn test_assert_list_files_for_exact_paths() -> Result<()> {
-        // more expected partitions than files
-        assert_list_files_for_exact_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-            ],
-            12,
-            5,
-            Some(""),
-        )
-        .await?;
+        assert_eq!(
+            columns(&table.schema()),
+            vec!["a".to_owned(), "p1".to_owned()]
+        );
 
-        // more files than meta_fetch_concurrency (32)
-        let files: Vec<String> =
-            (0..64).map(|i| format!("bucket/key1/file{i}")).collect();
-        // Collect references to each string
-        let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
-        assert_list_files_for_exact_paths(file_refs.as_slice(), 5, 5, Some("")).await?;
-
-        // as many expected partitions as files
-        assert_list_files_for_exact_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-            ],
-            5,
-            5,
-            Some(""),
-        )
-        .await?;
+        // this will filter out the only file in the store
+        let filter = Expr::not_eq(col("p1"), lit("v1"));
 
-        // more files as expected partitions
-        assert_list_files_for_exact_paths(
-            &[
-                "bucket/key1/file0",
-                "bucket/key1/file1",
-                "bucket/key1/file2",
-                "bucket/key2/file3",
-                "bucket/key2/file4",
-            ],
-            2,
-            2,
-            Some(""),
-        )
-        .await?;
+        let scan = table
+            .scan(&ctx.state(), None, &[filter], None)
+            .await
+            .expect("Empty execution plan");
+
+        assert!(scan.as_any().is::<EmptyExec>());
+        assert_eq!(
+            columns(&scan.schema()),
+            vec!["a".to_owned(), "p1".to_owned()]
+        );
 
-        // no files => no groups
-        assert_list_files_for_exact_paths(&[], 2, 0, Some("")).await?;
-
-        // files that don't match the default file ext
-        assert_list_files_for_exact_paths(
-            &[
-                "bucket/key1/file0.json",
-                "bucket/key1/file1.csv",
-                "bucket/key1/file2.json",
-                "bucket/key2/file3.csv",
-                "bucket/key2/file4.json",
-                "bucket/key3/file5.csv",
-            ],
-            2,
-            2,
-            None,
-        )
-        .await?;
         Ok(())
     }
 
@@ -1798,7 +419,7 @@ mod tests {
     ) -> Result<Arc<dyn TableProvider>> {
         let testdata = crate::test_util::parquet_test_data();
         let filename = format!("{testdata}/{name}");
-        let table_path = ListingTableUrl::parse(filename).unwrap();
+        let table_path = ListingTableUrl::parse(filename)?;
 
         let config = ListingTableConfig::new(table_path)
             .infer(&ctx.state())
@@ -1825,16 +446,16 @@ mod tests {
 
         let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]);
 
-        let table_path = ListingTableUrl::parse(table_prefix).unwrap();
+        let table_path = ListingTableUrl::parse(table_prefix)?;
         let config = ListingTableConfig::new(table_path)
             .with_listing_options(opt)
             .with_schema(Arc::new(schema));
 
         let table = ListingTable::try_new(config)?;
 
-        let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?;
+        let result = table.list_files_for_scan(&ctx.state(), &[], None).await?;
 
-        assert_eq!(file_list.len(), output_partitioning);
+        assert_eq!(result.file_groups.len(), output_partitioning);
 
         Ok(())
     }
@@ -1867,9 +488,9 @@ mod tests {
 
         let table = ListingTable::try_new(config)?;
 
-        let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?;
+        let result = table.list_files_for_scan(&ctx.state(), &[], None).await?;
 
-        assert_eq!(file_list.len(), output_partitioning);
+        assert_eq!(result.file_groups.len(), output_partitioning);
 
         Ok(())
     }
@@ -1894,10 +515,10 @@ mod tests {
             .execution
             .meta_fetch_concurrency;
         let expected_concurrency = files.len().min(meta_fetch_concurrency);
-        let head_blocking_store = ensure_head_concurrency(store, expected_concurrency);
+        let head_concurrency_store = ensure_head_concurrency(store, expected_concurrency);
 
         let url = Url::parse("test://").unwrap();
-        ctx.register_object_store(&url, head_blocking_store.clone());
+        ctx.register_object_store(&url, head_concurrency_store.clone());
 
         let format = JsonFormat::default();
 
@@ -1917,84 +538,10 @@ mod tests {
 
         let table = ListingTable::try_new(config)?;
 
-        let (file_list, _) = table.list_files_for_scan(&ctx.state(), &[], None).await?;
-
-        assert_eq!(file_list.len(), output_partitioning);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_insert_into_append_new_json_files() -> Result<()> {
-        let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
-        config_map.insert(
-            "datafusion.execution.soft_max_rows_per_output_file".into(),
-            "10".into(),
-        );
-        helper_test_append_new_files_to_table(
-            JsonFormat::default().get_ext(),
-            FileCompressionType::UNCOMPRESSED,
-            Some(config_map),
-            2,
-        )
-        .await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_insert_into_append_new_csv_files() -> Result<()> {
-        let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
-        config_map.insert(
-            "datafusion.execution.soft_max_rows_per_output_file".into(),
-            "10".into(),
-        );
-        helper_test_append_new_files_to_table(
-            CsvFormat::default().get_ext(),
-            FileCompressionType::UNCOMPRESSED,
-            Some(config_map),
-            2,
-        )
-        .await?;
-        Ok(())
-    }
+        let result = table.list_files_for_scan(&ctx.state(), &[], None).await?;
 
-    #[cfg(feature = "parquet")]
-    #[tokio::test]
-    async fn test_insert_into_append_2_new_parquet_files_defaults() -> Result<()> {
-        let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
-        config_map.insert(
-            "datafusion.execution.soft_max_rows_per_output_file".into(),
-            "10".into(),
-        );
-        helper_test_append_new_files_to_table(
-            ParquetFormat::default().get_ext(),
-            FileCompressionType::UNCOMPRESSED,
-            Some(config_map),
-            2,
-        )
-        .await?;
-        Ok(())
-    }
+        assert_eq!(result.file_groups.len(), output_partitioning);
 
-    #[cfg(feature = "parquet")]
-    #[tokio::test]
-    async fn test_insert_into_append_1_new_parquet_files_defaults() -> Result<()> {
-        let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "20".into());
-        config_map.insert(
-            "datafusion.execution.soft_max_rows_per_output_file".into(),
-            "20".into(),
-        );
-        helper_test_append_new_files_to_table(
-            ParquetFormat::default().get_ext(),
-            FileCompressionType::UNCOMPRESSED,
-            Some(config_map),
-            1,
-        )
-        .await?;
         Ok(())
     }
 
@@ -2108,7 +655,6 @@ mod tests {
     #[tokio::test]
     async fn test_insert_into_append_new_parquet_files_session_overrides() -> Result<()> {
         let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
         config_map.insert(
             "datafusion.execution.soft_max_rows_per_output_file".into(),
             "10".into(),
@@ -2173,7 +719,7 @@ mod tests {
             "datafusion.execution.parquet.write_batch_size".into(),
             "5".into(),
         );
-        config_map.insert("datafusion.execution.batch_size".into(), "1".into());
+        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
         helper_test_append_new_files_to_table(
             ParquetFormat::default().get_ext(),
             FileCompressionType::UNCOMPRESSED,
@@ -2185,8 +731,8 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_insert_into_append_new_parquet_files_invalid_session_fails(
-    ) -> Result<()> {
+    async fn test_insert_into_append_new_parquet_files_invalid_session_fails()
+    -> Result<()> {
         let mut config_map: HashMap<String, String> = HashMap::new();
         config_map.insert(
             "datafusion.execution.parquet.compression".into(),
@@ -2200,7 +746,10 @@ mod tests {
         )
         .await
         .expect_err("Example should fail!");
-        assert_eq!(e.strip_backtrace(), "Invalid or Unsupported Configuration: zstd compression requires specifying a level such as zstd(4)");
+        assert_eq!(
+            e.strip_backtrace(),
+            "Invalid or Unsupported Configuration: zstd compression requires specifying a level such as zstd(4)"
+        );
 
         Ok(())
     }
@@ -2230,7 +779,7 @@ mod tests {
         let filter_predicate = Expr::BinaryExpr(BinaryExpr::new(
             Box::new(Expr::Column("column1".into())),
             Operator::GtEq,
-            Box::new(Expr::Literal(ScalarValue::Int32(Some(0)))),
+            Box::new(Expr::Literal(ScalarValue::Int32(Some(0)), None)),
         ));
 
         // Create a new batch of data to insert into the table
@@ -2260,7 +809,7 @@ mod tests {
                     .register_json(
                         "t",
                         tmp_dir.path().to_str().unwrap(),
-                        NdJsonReadOptions::default()
+                        JsonReadOptions::default()
                             .schema(schema.as_ref())
                             .file_compression_type(file_compression_type),
                     )
@@ -2327,13 +876,13 @@ mod tests {
         let res = collect(plan, session_ctx.task_ctx()).await?;
         // Insert returns the number of rows written, in our case this would be 6.
 
-        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r###"
-            +-------+
-            | count |
-            +-------+
-            | 20    |
-            +-------+
-        "###);}
+        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r"
+        +-------+
+        | count |
+        +-------+
+        | 20    |
+        +-------+
+        ");}
 
         // Read the records in the table
         let batches = session_ctx
@@ -2342,13 +891,13 @@ mod tests {
             .collect()
             .await?;
 
-        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###"
-            +-------+
-            | count |
-            +-------+
-            | 20    |
-            +-------+
-        "###);}
+        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r"
+        +-------+
+        | count |
+        +-------+
+        | 20    |
+        +-------+
+        ");}
 
         // Assert that `target_partition_number` many files were added to the table.
         let num_files = tmp_dir.path().read_dir()?.count();
@@ -2363,13 +912,13 @@ mod tests {
         // Again, execute the physical plan and collect the results
         let res = collect(plan, session_ctx.task_ctx()).await?;
 
-        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r###"
-            +-------+
-            | count |
-            +-------+
-            | 20    |
-            +-------+
-        "###);}
+        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&res),@r"
+        +-------+
+        | count |
+        +-------+
+        | 20    |
+        +-------+
+        ");}
 
         // Read the contents of the table
         let batches = session_ctx
@@ -2378,13 +927,13 @@ mod tests {
             .collect()
             .await?;
 
-        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###"
-            +-------+
-            | count |
-            +-------+
-            | 40    |
-            +-------+
-        "###);}
+        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r"
+        +-------+
+        | count |
+        +-------+
+        | 40    |
+        +-------+
+        ");}
 
         // Assert that another `target_partition_number` many files were added to the table.
         let num_files = tmp_dir.path().read_dir()?.count();
@@ -2442,15 +991,15 @@ mod tests {
             .collect()
             .await?;
 
-        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###"
-            +-----+-----+---+
-            | a   | b   | c |
-            +-----+-----+---+
-            | foo | bar | 1 |
-            | foo | bar | 2 |
-            | foo | bar | 3 |
-            +-----+-----+---+
-        "###);}
+        insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r"
+        +-----+-----+---+
+        | a   | b   | c |
+        +-----+-----+---+
+        | foo | bar | 1 |
+        | foo | bar | 2 |
+        | foo | bar | 3 |
+        +-----+-----+---+
+        ");}
 
         Ok(())
     }
@@ -2459,7 +1008,7 @@ mod tests {
     async fn test_infer_options_compressed_csv() -> Result<()> {
         let testdata = crate::test_util::arrow_test_data();
         let filename = format!("{testdata}/csv/aggregate_test_100.csv.gz");
-        let table_path = ListingTableUrl::parse(filename).unwrap();
+        let table_path = ListingTableUrl::parse(filename)?;
 
         let ctx = SessionContext::new();
 
@@ -2473,4 +1022,467 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn infer_preserves_provided_schema() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let testdata = datafusion_test_data();
+        let filename = format!("{testdata}/aggregate_simple.csv");
+        let table_path = ListingTableUrl::parse(filename)?;
+
+        let provided_schema = create_test_schema();
+
+        let format = CsvFormat::default();
+        let options = ListingOptions::new(Arc::new(format));
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(options)
+            .with_schema(Arc::clone(&provided_schema));
+
+        let config = config.infer(&ctx.state()).await?;
+
+        assert_eq!(*config.file_schema.unwrap(), *provided_schema);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_listing_table_config_with_multiple_files_comprehensive() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        // Create test files with different schemas
+        let tmp_dir = TempDir::new()?;
+        let file_path1 = tmp_dir.path().join("file1.csv");
+        let file_path2 = tmp_dir.path().join("file2.csv");
+
+        // File 1: c1,c2,c3
+        let mut file1 = std::fs::File::create(&file_path1)?;
+        writeln!(file1, "c1,c2,c3")?;
+        writeln!(file1, "1,2,3")?;
+        writeln!(file1, "4,5,6")?;
+
+        // File 2: c1,c2,c3,c4
+        let mut file2 = std::fs::File::create(&file_path2)?;
+        writeln!(file2, "c1,c2,c3,c4")?;
+        writeln!(file2, "7,8,9,10")?;
+        writeln!(file2, "11,12,13,14")?;
+
+        // Parse paths
+        let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?;
+        let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?;
+
+        // Create format and options
+        let format = CsvFormat::default().with_has_header(true);
+        let options = ListingOptions::new(Arc::new(format));
+
+        // Test case 1: Infer schema using first file's schema
+        let config1 = ListingTableConfig::new_with_multi_paths(vec![
+            table_path1.clone(),
+            table_path2.clone(),
+        ])
+        .with_listing_options(options.clone());
+        let config1 = config1.infer_schema(&ctx.state()).await?;
+        assert_eq!(config1.schema_source(), SchemaSource::Inferred);
+
+        // Verify schema matches first file
+        let schema1 = config1.file_schema.as_ref().unwrap().clone();
+        assert_eq!(schema1.fields().len(), 3);
+        assert_eq!(schema1.field(0).name(), "c1");
+        assert_eq!(schema1.field(1).name(), "c2");
+        assert_eq!(schema1.field(2).name(), "c3");
+
+        // Test case 2: Use specified schema with 3 columns
+        let schema_3cols = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Utf8, true),
+            Field::new("c2", DataType::Utf8, true),
+            Field::new("c3", DataType::Utf8, true),
+        ]));
+
+        let config2 = ListingTableConfig::new_with_multi_paths(vec![
+            table_path1.clone(),
+            table_path2.clone(),
+        ])
+        .with_listing_options(options.clone())
+        .with_schema(schema_3cols);
+        let config2 = config2.infer_schema(&ctx.state()).await?;
+        assert_eq!(config2.schema_source(), SchemaSource::Specified);
+
+        // Verify that the schema is still the one we specified (3 columns)
+        let schema2 = config2.file_schema.as_ref().unwrap().clone();
+        assert_eq!(schema2.fields().len(), 3);
+        assert_eq!(schema2.field(0).name(), "c1");
+        assert_eq!(schema2.field(1).name(), "c2");
+        assert_eq!(schema2.field(2).name(), "c3");
+
+        // Test case 3: Use specified schema with 4 columns
+        let schema_4cols = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Utf8, true),
+            Field::new("c2", DataType::Utf8, true),
+            Field::new("c3", DataType::Utf8, true),
+            Field::new("c4", DataType::Utf8, true),
+        ]));
+
+        let config3 = ListingTableConfig::new_with_multi_paths(vec![
+            table_path1.clone(),
+            table_path2.clone(),
+        ])
+        .with_listing_options(options.clone())
+        .with_schema(schema_4cols);
+        let config3 = config3.infer_schema(&ctx.state()).await?;
+        assert_eq!(config3.schema_source(), SchemaSource::Specified);
+
+        // Verify that the schema is still the one we specified (4 columns)
+        let schema3 = config3.file_schema.as_ref().unwrap().clone();
+        assert_eq!(schema3.fields().len(), 4);
+        assert_eq!(schema3.field(0).name(), "c1");
+        assert_eq!(schema3.field(1).name(), "c2");
+        assert_eq!(schema3.field(2).name(), "c3");
+        assert_eq!(schema3.field(3).name(), "c4");
+
+        // Test case 4: Verify order matters when inferring schema
+        let config4 = ListingTableConfig::new_with_multi_paths(vec![
+            table_path2.clone(),
+            table_path1.clone(),
+        ])
+        .with_listing_options(options);
+        let config4 = config4.infer_schema(&ctx.state()).await?;
+
+        // Should use first file's schema, which now has 4 columns
+        let schema4 = config4.file_schema.as_ref().unwrap().clone();
+        assert_eq!(schema4.fields().len(), 4);
+        assert_eq!(schema4.field(0).name(), "c1");
+        assert_eq!(schema4.field(1).name(), "c2");
+        assert_eq!(schema4.field(2).name(), "c3");
+        assert_eq!(schema4.field(3).name(), "c4");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_list_files_configurations() -> Result<()> {
+        // Define common test cases as (description, files, paths, target_partitions, expected_partitions, file_ext)
+        let test_cases = vec![
+            // Single path cases
+            (
+                "Single path, more partitions than files",
+                generate_test_files("bucket/key-prefix", 5),
+                vec!["test:///bucket/key-prefix/"],
+                12,
+                5,
+                Some(""),
+            ),
+            (
+                "Single path, equal partitions and files",
+                generate_test_files("bucket/key-prefix", 4),
+                vec!["test:///bucket/key-prefix/"],
+                4,
+                4,
+                Some(""),
+            ),
+            (
+                "Single path, more files than partitions",
+                generate_test_files("bucket/key-prefix", 5),
+                vec!["test:///bucket/key-prefix/"],
+                2,
+                2,
+                Some(""),
+            ),
+            // Multi path cases
+            (
+                "Multi path, more partitions than files",
+                {
+                    let mut files = generate_test_files("bucket/key1", 3);
+                    files.extend(generate_test_files_with_start("bucket/key2", 2, 3));
+                    files.extend(generate_test_files_with_start("bucket/key3", 1, 5));
+                    files
+                },
+                vec!["test:///bucket/key1/", "test:///bucket/key2/"],
+                12,
+                5,
+                Some(""),
+            ),
+            // No files case
+            (
+                "No files",
+                vec![],
+                vec!["test:///bucket/key-prefix/"],
+                2,
+                0,
+                Some(""),
+            ),
+            // Exact path cases
+            (
+                "Exact paths test",
+                {
+                    let mut files = generate_test_files("bucket/key1", 3);
+                    files.extend(generate_test_files_with_start("bucket/key2", 2, 3));
+                    files
+                },
+                vec![
+                    "test:///bucket/key1/file0",
+                    "test:///bucket/key1/file1",
+                    "test:///bucket/key1/file2",
+                    "test:///bucket/key2/file3",
+                    "test:///bucket/key2/file4",
+                ],
+                12,
+                5,
+                Some(""),
+            ),
+        ];
+
+        // Run each test case
+        for (test_name, files, paths, target_partitions, expected_partitions, file_ext) in
+            test_cases
+        {
+            println!("Running test: {test_name}");
+
+            if files.is_empty() {
+                // Test empty files case
+                assert_list_files_for_multi_paths(
+                    &[],
+                    &paths,
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else if paths.len() == 1 {
+                // Test using single path API
+                let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
+                assert_list_files_for_scan_grouping(
+                    &file_refs,
+                    paths[0],
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else if paths[0].contains("test:///bucket/key") {
+                // Test using multi path API
+                let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
+                assert_list_files_for_multi_paths(
+                    &file_refs,
+                    &paths,
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else {
+                // Test using exact path API for specific cases
+                let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
+                assert_list_files_for_exact_paths(
+                    &file_refs,
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_listing_table_prunes_extra_files_in_hive() -> Result<()> {
+        let files = [
+            "bucket/test/pid=1/file1",
+            "bucket/test/pid=1/file2",
+            "bucket/test/pid=2/file3",
+            "bucket/test/pid=2/file4",
+            "bucket/test/other/file5",
+        ];
+
+        let ctx = SessionContext::new();
+        register_test_store(&ctx, &files.iter().map(|f| (*f, 10)).collect::<Vec<_>>());
+
+        let opt = ListingOptions::new(Arc::new(JsonFormat::default()))
+            .with_file_extension_opt(Some(""))
+            .with_table_partition_cols(vec![("pid".to_string(), DataType::Int32)]);
+
+        let table_path = ListingTableUrl::parse("test:///bucket/test/").unwrap();
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]);
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(opt)
+            .with_schema(Arc::new(schema));
+
+        let table = ListingTable::try_new(config)?;
+
+        let result = table.list_files_for_scan(&ctx.state(), &[], None).await?;
+        assert_eq!(result.file_groups.len(), 1);
+
+        let files = result.file_groups[0].clone();
+
+        assert_eq!(
+            files
+                .iter()
+                .map(|f| f.path().to_string())
+                .collect::<Vec<_>>(),
+            vec![
+                "bucket/test/pid=1/file1",
+                "bucket/test/pid=1/file2",
+                "bucket/test/pid=2/file3",
+                "bucket/test/pid=2/file4",
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[cfg(feature = "parquet")]
+    #[tokio::test]
+    async fn test_table_stats_behaviors() -> Result<()> {
+        use crate::datasource::file_format::parquet::ParquetFormat;
+
+        let testdata = crate::test_util::parquet_test_data();
+        let filename = format!("{}/{}", testdata, "alltypes_plain.parquet");
+        let table_path = ListingTableUrl::parse(filename)?;
+
+        let ctx = SessionContext::new();
+        let state = ctx.state();
+
+        // Test 1: Default behavior - stats not collected
+        let opt_default = ListingOptions::new(Arc::new(ParquetFormat::default()));
+        let schema_default = opt_default.infer_schema(&state, &table_path).await?;
+        let config_default = ListingTableConfig::new(table_path.clone())
+            .with_listing_options(opt_default)
+            .with_schema(schema_default);
+
+        let table_default = ListingTable::try_new(config_default)?;
+
+        let exec_default = table_default.scan(&state, None, &[], None).await?;
+        assert_eq!(
+            exec_default.partition_statistics(None)?.num_rows,
+            Precision::Absent
+        );
+
+        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
+        assert_eq!(
+            exec_default.partition_statistics(None)?.total_byte_size,
+            Precision::Absent
+        );
+
+        // Test 2: Explicitly disable stats
+        let opt_disabled = ListingOptions::new(Arc::new(ParquetFormat::default()))
+            .with_collect_stat(false);
+        let schema_disabled = opt_disabled.infer_schema(&state, &table_path).await?;
+        let config_disabled = ListingTableConfig::new(table_path.clone())
+            .with_listing_options(opt_disabled)
+            .with_schema(schema_disabled);
+        let table_disabled = ListingTable::try_new(config_disabled)?;
+
+        let exec_disabled = table_disabled.scan(&state, None, &[], None).await?;
+        assert_eq!(
+            exec_disabled.partition_statistics(None)?.num_rows,
+            Precision::Absent
+        );
+        assert_eq!(
+            exec_disabled.partition_statistics(None)?.total_byte_size,
+            Precision::Absent
+        );
+
+        // Test 3: Explicitly enable stats
+        let opt_enabled = ListingOptions::new(Arc::new(ParquetFormat::default()))
+            .with_collect_stat(true);
+        let schema_enabled = opt_enabled.infer_schema(&state, &table_path).await?;
+        let config_enabled = ListingTableConfig::new(table_path)
+            .with_listing_options(opt_enabled)
+            .with_schema(schema_enabled);
+        let table_enabled = ListingTable::try_new(config_enabled)?;
+
+        let exec_enabled = table_enabled.scan(&state, None, &[], None).await?;
+        assert_eq!(
+            exec_enabled.partition_statistics(None)?.num_rows,
+            Precision::Exact(8)
+        );
+        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
+        assert_eq!(
+            exec_enabled.partition_statistics(None)?.total_byte_size,
+            Precision::Absent,
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_insert_into_parameterized() -> Result<()> {
+        let test_cases = vec![
+            // (file_format, batch_size, soft_max_rows, expected_files)
+            ("json", 10, 10, 2),
+            ("csv", 10, 10, 2),
+            #[cfg(feature = "parquet")]
+            ("parquet", 10, 10, 2),
+            #[cfg(feature = "parquet")]
+            ("parquet", 20, 20, 1),
+        ];
+
+        for (format, batch_size, soft_max_rows, expected_files) in test_cases {
+            println!(
+                "Testing insert with format: {format}, batch_size: {batch_size}, expected files: {expected_files}"
+            );
+
+            let mut config_map = HashMap::new();
+            config_map.insert(
+                "datafusion.execution.batch_size".into(),
+                batch_size.to_string(),
+            );
+            config_map.insert(
+                "datafusion.execution.soft_max_rows_per_output_file".into(),
+                soft_max_rows.to_string(),
+            );
+
+            let file_extension = match format {
+                "json" => JsonFormat::default().get_ext(),
+                "csv" => CsvFormat::default().get_ext(),
+                #[cfg(feature = "parquet")]
+                "parquet" => ParquetFormat::default().get_ext(),
+                _ => unreachable!("Unsupported format"),
+            };
+
+            helper_test_append_new_files_to_table(
+                file_extension,
+                FileCompressionType::UNCOMPRESSED,
+                Some(config_map),
+                expected_files,
+            )
+            .await?;
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_basic_table_scan() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        // Test basic table creation and scanning
+        let path = "table/file.json";
+        register_test_store(&ctx, &[(path, 10)]);
+
+        let format = JsonFormat::default();
+        let opt = ListingOptions::new(Arc::new(format)).with_collect_stat(false);
+        let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]);
+        let table_path = ListingTableUrl::parse("test:///table/")?;
+
+        let config = ListingTableConfig::new(table_path)
+            .with_listing_options(opt)
+            .with_schema(Arc::new(schema));
+
+        let table = ListingTable::try_new(config)?;
+
+        // The scan should work correctly
+        let scan_result = table.scan(&ctx.state(), None, &[], None).await;
+        assert!(scan_result.is_ok(), "Scan should succeed");
+
+        // Verify file listing works
+        let result = table.list_files_for_scan(&ctx.state(), &[], None).await?;
+        assert!(
+            !result.file_groups.is_empty(),
+            "Should list files successfully"
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs
index 71686c61a8f76..f85f15a6d8c63 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -27,9 +27,9 @@ use crate::datasource::listing::{
 };
 use crate::execution::context::SessionState;
 
-use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, ToDFSchema};
-use datafusion_common::{config_datafusion_err, Result};
+use arrow::datatypes::DataType;
+use datafusion_common::{Result, config_datafusion_err};
+use datafusion_common::{ToDFSchema, arrow_datafusion_err, plan_err};
 use datafusion_expr::CreateExternalTable;
 
 use async_trait::async_trait;
@@ -54,7 +54,15 @@ impl TableProviderFactory for ListingTableFactory {
         cmd: &CreateExternalTable,
     ) -> Result<Arc<dyn TableProvider>> {
         // TODO (https://github.com/apache/datafusion/issues/11600) remove downcast_ref from here. Should file format factory be an extension to session state?
-        let session_state = state.as_any().downcast_ref::<SessionState>().unwrap();
+        let session_state =
+            state
+                .as_any()
+                .downcast_ref::<SessionState>()
+                .ok_or_else(|| {
+                    datafusion_common::internal_datafusion_err!(
+                        "ListingTableFactory requires SessionState"
+                    )
+                })?;
         let file_format = session_state
             .get_file_format_factory(cmd.file_type.as_str())
             .ok_or(config_datafusion_err!(
@@ -63,16 +71,40 @@ impl TableProviderFactory for ListingTableFactory {
             ))?
             .create(session_state, &cmd.options)?;
 
-        let file_extension = get_extension(cmd.location.as_str());
+        let mut table_path =
+            ListingTableUrl::parse(&cmd.location)?.with_table_ref(cmd.name.clone());
+        let file_extension = match table_path.is_collection() {
+            // Setting the extension to be empty instead of allowing the default extension seems
+            // odd, but was done to ensure existing behavior isn't modified. It seems like this
+            // could be refactored to either use the default extension or set the fully expected
+            // extension when compression is included (e.g. ".csv.gz")
+            true => "",
+            false => &get_extension(cmd.location.as_str()),
+        };
+        let mut options = ListingOptions::new(file_format)
+            .with_session_config_options(session_state.config())
+            .with_file_extension(file_extension);
 
         let (provided_schema, table_partition_cols) = if cmd.schema.fields().is_empty() {
+            let infer_parts = session_state
+                .config_options()
+                .execution
+                .listing_table_factory_infer_partitions;
+            let part_cols = if cmd.table_partition_cols.is_empty() && infer_parts {
+                options
+                    .infer_partitions(session_state, &table_path)
+                    .await?
+                    .into_iter()
+            } else {
+                cmd.table_partition_cols.clone().into_iter()
+            };
+
             (
                 None,
-                cmd.table_partition_cols
-                    .iter()
-                    .map(|x| {
+                part_cols
+                    .map(|p| {
                         (
-                            x.clone(),
+                            p,
                             DataType::Dictionary(
                                 Box::new(DataType::UInt16),
                                 Box::new(DataType::Utf8),
@@ -82,7 +114,7 @@ impl TableProviderFactory for ListingTableFactory {
                     .collect::<Vec<_>>(),
             )
         } else {
-            let schema: SchemaRef = Arc::new(cmd.schema.as_ref().to_owned().into());
+            let schema = Arc::clone(cmd.schema.inner());
             let table_partition_cols = cmd
                 .table_partition_cols
                 .iter()
@@ -108,12 +140,7 @@ impl TableProviderFactory for ListingTableFactory {
             (Some(schema), table_partition_cols)
         };
 
-        let table_path = ListingTableUrl::parse(&cmd.location)?;
-
-        let options = ListingOptions::new(file_format)
-            .with_file_extension(file_extension)
-            .with_session_config_options(session_state.config())
-            .with_table_partition_cols(table_partition_cols);
+        options = options.with_table_partition_cols(table_partition_cols);
 
         options
             .validate_partitions(session_state, &table_path)
@@ -125,6 +152,25 @@ impl TableProviderFactory for ListingTableFactory {
             // specifically for parquet file format.
             // See: https://github.com/apache/datafusion/issues/7317
             None => {
+                // if the folder then rewrite a file path as 'path/*.parquet'
+                // to only read the files the reader can understand
+                if table_path.is_folder() && table_path.get_glob().is_none() {
+                    // Since there are no files yet to infer an actual extension,
+                    // derive the pattern based on compression type.
+                    // So for gzipped CSV the pattern is `*.csv.gz`
+                    let glob = match options.format.compression_type() {
+                        Some(compression) => {
+                            match options.format.get_ext_with_compression(&compression) {
+                                // Use glob based on `FileFormat` extension
+                                Ok(ext) => format!("*.{ext}"),
+                                // Fallback to `file_type`, if not supported by `FileFormat`
+                                Err(_) => format!("*.{}", cmd.file_type.to_lowercase()),
+                            }
+                        }
+                        None => format!("*.{}", cmd.file_type.to_lowercase()),
+                    };
+                    table_path = table_path.with_glob(glob.as_ref())?;
+                }
                 let schema = options.infer_schema(session_state, &table_path).await?;
                 let df_schema = Arc::clone(&schema).to_dfschema()?;
                 let column_refs: HashSet<_> = cmd
@@ -153,6 +199,16 @@ impl TableProviderFactory for ListingTableFactory {
             .with_definition(cmd.definition.clone())
             .with_constraints(cmd.constraints.clone())
             .with_column_defaults(cmd.column_defaults.clone());
+
+        // Pre-warm statistics cache if collect_statistics is enabled
+        if session_state.config().collect_statistics() {
+            let filters = &[];
+            let limit = None;
+            if let Err(e) = table.list_files_for_scan(state, filters, limit).await {
+                log::warn!("Failed to pre-warm statistics cache: {e}");
+            }
+        }
+
         Ok(Arc::new(table))
     }
 }
@@ -168,14 +224,23 @@ fn get_extension(path: &str) -> String {
 
 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
-
     use super::*;
     use crate::{
         datasource::file_format::csv::CsvFormat, execution::context::SessionContext,
+        test_util::parquet_test_data,
     };
+    use datafusion_execution::cache::CacheAccessor;
+    use datafusion_execution::cache::cache_manager::CacheManagerConfig;
+    use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+    use glob::Pattern;
+    use std::collections::HashMap;
+    use std::fs;
+    use std::path::PathBuf;
 
-    use datafusion_common::{Constraints, DFSchema, TableReference};
+    use datafusion_common::parsers::CompressionTypeVariant;
+    use datafusion_common::{DFSchema, TableReference};
 
     #[tokio::test]
     async fn test_create_using_non_std_file_ext() {
@@ -189,21 +254,14 @@ mod tests {
         let context = SessionContext::new();
         let state = context.state();
         let name = TableReference::bare("foo");
-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: csv_file.path().to_str().unwrap().to_string(),
-            file_type: "csv".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options: HashMap::from([("format.has_header".into(), "true".into())]),
-            constraints: Constraints::empty(),
-            column_defaults: HashMap::new(),
-        };
+            csv_file.path().to_str().unwrap().to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .with_options(HashMap::from([("format.has_header".into(), "true".into())]))
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
         let listing_table = table_provider
             .as_any()
@@ -229,21 +287,14 @@ mod tests {
         let mut options = HashMap::new();
         options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned());
         options.insert("format.has_header".into(), "true".into());
-        let cmd = CreateExternalTable {
+        let cmd = CreateExternalTable::builder(
             name,
-            location: csv_file.path().to_str().unwrap().to_string(),
-            file_type: "csv".to_string(),
-            schema: Arc::new(DFSchema::empty()),
-            table_partition_cols: vec![],
-            if_not_exists: false,
-            temporary: false,
-            definition: None,
-            order_exprs: vec![],
-            unbounded: false,
-            options,
-            constraints: Constraints::empty(),
-            column_defaults: HashMap::new(),
-        };
+            csv_file.path().to_str().unwrap().to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .with_options(options)
+        .build();
         let table_provider = factory.create(&state, &cmd).await.unwrap();
         let listing_table = table_provider
             .as_any()
@@ -257,4 +308,349 @@ mod tests {
         let listing_options = listing_table.options();
         assert_eq!(".tbl", listing_options.file_extension);
     }
+
+    /// Validates that CreateExternalTable with compression
+    /// searches for gzipped files in a directory location
+    #[tokio::test]
+    async fn test_create_using_folder_with_compression() {
+        let dir = tempfile::tempdir().unwrap();
+
+        let factory = ListingTableFactory::new();
+        let context = SessionContext::new();
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let mut options = HashMap::new();
+        options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned());
+        options.insert("format.has_header".into(), "true".into());
+        options.insert("format.compression".into(), "gzip".into());
+        let cmd = CreateExternalTable::builder(
+            name,
+            dir.path().to_str().unwrap().to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .with_options(options)
+        .build();
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        // Verify compression is used
+        let format = listing_table.options().format.clone();
+        let csv_format = format.as_any().downcast_ref::<CsvFormat>().unwrap();
+        let csv_options = csv_format.options().clone();
+        assert_eq!(csv_options.compression, CompressionTypeVariant::GZIP);
+
+        let listing_options = listing_table.options();
+        assert_eq!("", listing_options.file_extension);
+        // Glob pattern is set to search for gzipped files
+        let table_path = listing_table.table_paths().first().unwrap();
+        assert_eq!(
+            table_path.get_glob().clone().unwrap(),
+            Pattern::new("*.csv.gz").unwrap()
+        );
+    }
+
+    /// Validates that CreateExternalTable without compression
+    /// searches for normal files in a directory location
+    #[tokio::test]
+    async fn test_create_using_folder_without_compression() {
+        let dir = tempfile::tempdir().unwrap();
+
+        let factory = ListingTableFactory::new();
+        let context = SessionContext::new();
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let mut options = HashMap::new();
+        options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned());
+        options.insert("format.has_header".into(), "true".into());
+        let cmd = CreateExternalTable::builder(
+            name,
+            dir.path().to_str().unwrap().to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .with_options(options)
+        .build();
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        let listing_options = listing_table.options();
+        assert_eq!("", listing_options.file_extension);
+        // Glob pattern is set to search for gzipped files
+        let table_path = listing_table.table_paths().first().unwrap();
+        assert_eq!(
+            table_path.get_glob().clone().unwrap(),
+            Pattern::new("*.csv").unwrap()
+        );
+    }
+
+    #[tokio::test]
+    async fn test_odd_directory_names() {
+        let dir = tempfile::tempdir().unwrap();
+        let mut path = PathBuf::from(dir.path());
+        path.extend(["odd.v1", "odd.v2"]);
+        fs::create_dir_all(&path).unwrap();
+
+        let factory = ListingTableFactory::new();
+        let context = SessionContext::new();
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            String::from(path.to_str().unwrap()),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        let listing_options = listing_table.options();
+        assert_eq!("", listing_options.file_extension);
+    }
+
+    #[tokio::test]
+    async fn test_create_with_hive_partitions() {
+        let dir = tempfile::tempdir().unwrap();
+        let mut path = PathBuf::from(dir.path());
+        path.extend(["key1=value1", "key2=value2"]);
+        fs::create_dir_all(&path).unwrap();
+        path.push("data.parquet");
+        fs::File::create_new(&path).unwrap();
+
+        let factory = ListingTableFactory::new();
+        let context = SessionContext::new();
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            dir.path().to_str().unwrap(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        let listing_options = listing_table.options();
+        let dtype =
+            DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8));
+        let expected_cols = vec![
+            (String::from("key1"), dtype.clone()),
+            (String::from("key2"), dtype.clone()),
+        ];
+        assert_eq!(expected_cols, listing_options.table_partition_cols);
+
+        // Ensure partition detection can be disabled via config
+        let factory = ListingTableFactory::new();
+        let mut cfg = SessionConfig::new();
+        cfg.options_mut()
+            .execution
+            .listing_table_factory_infer_partitions = false;
+        let context = SessionContext::new_with_config(cfg);
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            dir.path().to_str().unwrap().to_string(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        let listing_options = listing_table.options();
+        assert!(listing_options.table_partition_cols.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_statistics_cache_prewarming() {
+        let factory = ListingTableFactory::new();
+
+        let location = PathBuf::from(parquet_test_data())
+            .join("alltypes_tiny_pages_plain.parquet")
+            .to_string_lossy()
+            .to_string();
+
+        // Test with collect_statistics enabled
+        let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
+        let cache_config = CacheManagerConfig::default()
+            .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+        let runtime = RuntimeEnvBuilder::new()
+            .with_cache_manager(cache_config)
+            .build_arc()
+            .unwrap();
+
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.collect_statistics = true;
+        let context = SessionContext::new_with_config_rt(config, runtime);
+        let state = context.state();
+        let name = TableReference::bare("test");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            location.clone(),
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        let _table_provider = factory.create(&state, &cmd).await.unwrap();
+
+        assert!(
+            file_statistics_cache.len() > 0,
+            "Statistics cache should be pre-warmed when collect_statistics is enabled"
+        );
+
+        // Test with collect_statistics disabled
+        let file_statistics_cache = Arc::new(DefaultFileStatisticsCache::default());
+        let cache_config = CacheManagerConfig::default()
+            .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+        let runtime = RuntimeEnvBuilder::new()
+            .with_cache_manager(cache_config)
+            .build_arc()
+            .unwrap();
+
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.collect_statistics = false;
+        let context = SessionContext::new_with_config_rt(config, runtime);
+        let state = context.state();
+        let name = TableReference::bare("test");
+
+        let cmd = CreateExternalTable::builder(
+            name,
+            location,
+            "parquet",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        let _table_provider = factory.create(&state, &cmd).await.unwrap();
+
+        assert_eq!(
+            file_statistics_cache.len(),
+            0,
+            "Statistics cache should not be pre-warmed when collect_statistics is disabled"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_create_with_invalid_session() {
+        use async_trait::async_trait;
+        use datafusion_catalog::Session;
+        use datafusion_common::Result;
+        use datafusion_common::config::TableOptions;
+        use datafusion_execution::TaskContext;
+        use datafusion_execution::config::SessionConfig;
+        use datafusion_physical_expr::PhysicalExpr;
+        use datafusion_physical_plan::ExecutionPlan;
+        use std::any::Any;
+        use std::collections::HashMap;
+        use std::sync::Arc;
+
+        // A mock Session that is NOT SessionState
+        #[derive(Debug)]
+        struct MockSession;
+
+        #[async_trait]
+        impl Session for MockSession {
+            fn session_id(&self) -> &str {
+                "mock_session"
+            }
+            fn config(&self) -> &SessionConfig {
+                unimplemented!()
+            }
+            async fn create_physical_plan(
+                &self,
+                _logical_plan: &datafusion_expr::LogicalPlan,
+            ) -> Result<Arc<dyn ExecutionPlan>> {
+                unimplemented!()
+            }
+            fn create_physical_expr(
+                &self,
+                _expr: datafusion_expr::Expr,
+                _df_schema: &DFSchema,
+            ) -> Result<Arc<dyn PhysicalExpr>> {
+                unimplemented!()
+            }
+            fn scalar_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::ScalarUDF>> {
+                unimplemented!()
+            }
+            fn aggregate_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::AggregateUDF>> {
+                unimplemented!()
+            }
+            fn window_functions(
+                &self,
+            ) -> &HashMap<String, Arc<datafusion_expr::WindowUDF>> {
+                unimplemented!()
+            }
+            fn runtime_env(&self) -> &Arc<datafusion_execution::runtime_env::RuntimeEnv> {
+                unimplemented!()
+            }
+            fn execution_props(
+                &self,
+            ) -> &datafusion_expr::execution_props::ExecutionProps {
+                unimplemented!()
+            }
+            fn as_any(&self) -> &dyn Any {
+                self
+            }
+            fn table_options(&self) -> &TableOptions {
+                unimplemented!()
+            }
+            fn table_options_mut(&mut self) -> &mut TableOptions {
+                unimplemented!()
+            }
+            fn task_ctx(&self) -> Arc<TaskContext> {
+                unimplemented!()
+            }
+        }
+
+        let factory = ListingTableFactory::new();
+        let mock_session = MockSession;
+
+        let name = TableReference::bare("foo");
+        let cmd = CreateExternalTable::builder(
+            name,
+            "foo.csv".to_string(),
+            "csv",
+            Arc::new(DFSchema::empty()),
+        )
+        .build();
+
+        // This should return an error, not panic
+        let result = factory.create(&mock_session, &cmd).await;
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .strip_backtrace()
+                .contains("Internal error: ListingTableFactory requires SessionState")
+        );
+    }
 }
diff --git a/datafusion/core/src/datasource/memory_test.rs b/datafusion/core/src/datasource/memory_test.rs
index 381000ab8ee1e..c7721cafb02ea 100644
--- a/datafusion/core/src/datasource/memory_test.rs
+++ b/datafusion/core/src/datasource/memory_test.rs
@@ -19,7 +19,7 @@
 mod tests {
 
     use crate::datasource::MemTable;
-    use crate::datasource::{provider_as_source, DefaultTableSource};
+    use crate::datasource::{DefaultTableSource, provider_as_source};
     use crate::physical_plan::collect;
     use crate::prelude::SessionContext;
     use arrow::array::{AsArray, Int32Array};
@@ -29,8 +29,8 @@ mod tests {
     use arrow_schema::SchemaRef;
     use datafusion_catalog::TableProvider;
     use datafusion_common::{DataFusionError, Result};
-    use datafusion_expr::dml::InsertOp;
     use datafusion_expr::LogicalPlanBuilder;
+    use datafusion_expr::dml::InsertOp;
     use futures::StreamExt;
     use std::collections::HashMap;
     use std::sync::Arc;
@@ -130,12 +130,15 @@ mod tests {
             .scan(&session_ctx.state(), Some(&projection), &[], None)
             .await
         {
-            Err(DataFusionError::ArrowError(ArrowError::SchemaError(e), _)) => {
-                assert_eq!(
-                    "\"project index 4 out of bounds, max field 3\"",
-                    format!("{e:?}")
-                )
-            }
+            Err(DataFusionError::ArrowError(err, _)) => match err.as_ref() {
+                ArrowError::SchemaError(e) => {
+                    assert_eq!(
+                        "\"project index 4 out of bounds, max field 3\"",
+                        format!("{e:?}")
+                    )
+                }
+                _ => panic!("unexpected error"),
+            },
             res => panic!("Scan should failed on invalid projection, got {res:?}"),
         };
 
@@ -326,12 +329,11 @@ mod tests {
         );
         let col = batch.column(0).as_primitive::<UInt64Type>();
         assert_eq!(col.len(), 1, "expected 1 row, got {}", col.len());
-        let val = col
-            .iter()
+
+        col.iter()
             .next()
             .expect("had value")
-            .expect("expected non null");
-        val
+            .expect("expected non null")
     }
 
     // Test inserting a single batch of data into a single partition
@@ -443,7 +445,7 @@ mod tests {
             .unwrap_err();
         // Ensure that there is a descriptive error message
         assert_eq!(
-            "Error during planning: Cannot insert into MemTable with zero partitions",
+            "Error during planning: No partitions provided, expected at least one partition",
             experiment_result.strip_backtrace()
         );
         Ok(())
diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs
index f0c6771515a7f..c0cb9b5fa0fe6 100644
--- a/datafusion/core/src/datasource/mod.rs
+++ b/datafusion/core/src/datasource/mod.rs
@@ -31,7 +31,7 @@ mod view_test;
 
 // backwards compatibility
 pub use self::default_table_source::{
-    provider_as_source, source_as_provider, DefaultTableSource,
+    DefaultTableSource, provider_as_source, source_as_provider,
 };
 pub use self::memory::MemTable;
 pub use self::view::ViewTable;
@@ -45,40 +45,46 @@ pub use datafusion_catalog::view;
 pub use datafusion_datasource::schema_adapter;
 pub use datafusion_datasource::sink;
 pub use datafusion_datasource::source;
+pub use datafusion_datasource::table_schema;
 pub use datafusion_execution::object_store;
 pub use datafusion_physical_expr::create_ordering;
 
 #[cfg(all(test, feature = "parquet"))]
 mod tests {
 
-    use datafusion_datasource::schema_adapter::{
-        DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
-    };
-
     use crate::prelude::SessionContext;
+    use ::object_store::{ObjectMeta, path::Path};
     use arrow::{
-        array::{Int32Array, StringArray},
+        array::Int32Array,
         datatypes::{DataType, Field, Schema, SchemaRef},
         record_batch::RecordBatch,
     };
-    use datafusion_common::{record_batch, test_util::batches_to_sort_string};
+    use datafusion_common::{
+        Result, ScalarValue,
+        test_util::batches_to_sort_string,
+        tree_node::{Transformed, TransformedResult, TreeNode},
+    };
     use datafusion_datasource::{
-        file::FileSource, file_scan_config::FileScanConfigBuilder,
-        source::DataSourceExec, PartitionedFile,
+        PartitionedFile, file_scan_config::FileScanConfigBuilder, source::DataSourceExec,
     };
     use datafusion_datasource_parquet::source::ParquetSource;
-    use datafusion_execution::object_store::ObjectStoreUrl;
+    use datafusion_physical_expr::expressions::{Column, Literal};
+    use datafusion_physical_expr_adapter::{
+        PhysicalExprAdapter, PhysicalExprAdapterFactory,
+    };
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
     use datafusion_physical_plan::collect;
-    use object_store::{path::Path, ObjectMeta};
     use std::{fs, sync::Arc};
     use tempfile::TempDir;
+    use url::Url;
 
     #[tokio::test]
-    async fn can_override_schema_adapter() {
-        // Test shows that SchemaAdapter can add a column that doesn't existing in the
-        // record batches returned from parquet.  This can be useful for schema evolution
+    async fn can_override_physical_expr_adapter() {
+        // Test shows that PhysicalExprAdapter can add a column that doesn't exist in the
+        // record batches returned from parquet. This can be useful for schema evolution
         // where older files may not have all columns.
 
+        use datafusion_execution::object_store::ObjectStoreUrl;
         let tmp_dir = TempDir::new().unwrap();
         let table_dir = tmp_dir.path().join("parquet_test");
         fs::DirBuilder::new().create(table_dir.as_path()).unwrap();
@@ -98,7 +104,8 @@ mod tests {
         writer.write(&rec_batch).unwrap();
         writer.close().unwrap();
 
-        let location = Path::parse(path.to_str().unwrap()).unwrap();
+        let url = Url::from_file_path(path.canonicalize().unwrap()).unwrap();
+        let location = Path::from_url_path(url.path()).unwrap();
         let metadata = fs::metadata(path.as_path()).expect("Local file metadata");
         let meta = ObjectMeta {
             location,
@@ -108,28 +115,18 @@ mod tests {
             version: None,
         };
 
-        let partitioned_file = PartitionedFile {
-            object_meta: meta,
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        };
+        let partitioned_file = PartitionedFile::new_from_meta(meta);
 
         let f1 = Field::new("id", DataType::Int32, true);
         let f2 = Field::new("extra_column", DataType::Utf8, true);
 
         let schema = Arc::new(Schema::new(vec![f1.clone(), f2.clone()]));
-        let source = ParquetSource::default()
-            .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {}));
-        let base_conf = FileScanConfigBuilder::new(
-            ObjectStoreUrl::local_filesystem(),
-            schema,
-            source,
-        )
-        .with_file(partitioned_file)
-        .build();
+        let source = Arc::new(ParquetSource::new(Arc::clone(&schema)));
+        let base_conf =
+            FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+                .with_file(partitioned_file)
+                .with_expr_adapter(Some(Arc::new(TestPhysicalExprAdapterFactory)))
+                .build();
 
         let parquet_exec = DataSourceExec::from_data_source(base_conf);
 
@@ -137,134 +134,52 @@ mod tests {
         let task_ctx = session_ctx.task_ctx();
         let read = collect(parquet_exec, task_ctx).await.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&read),@r"
         +----+--------------+
         | id | extra_column |
         +----+--------------+
         | 1  | foo          |
         +----+--------------+
-        "###);
-    }
-
-    #[test]
-    fn default_schema_adapter() {
-        let table_schema = Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-        ]);
-
-        // file has a subset of the table schema fields and different type
-        let file_schema = Schema::new(vec![
-            Field::new("c", DataType::Float64, true), // not in table schema
-            Field::new("b", DataType::Float64, true),
-        ]);
-
-        let adapter = DefaultSchemaAdapterFactory::from_schema(Arc::new(table_schema));
-        let (mapper, indices) = adapter.map_schema(&file_schema).unwrap();
-        assert_eq!(indices, vec![1]);
-
-        let file_batch = record_batch!(("b", Float64, vec![1.0, 2.0])).unwrap();
-
-        let mapped_batch = mapper.map_batch(file_batch).unwrap();
-
-        // the mapped batch has the correct schema and the "b" column has been cast to Utf8
-        let expected_batch = record_batch!(
-            ("a", Int32, vec![None, None]), // missing column filled with nulls
-            ("b", Utf8, vec!["1.0", "2.0"])  // b was cast to string and order was changed
-        )
-        .unwrap();
-        assert_eq!(mapped_batch, expected_batch);
-    }
-
-    #[test]
-    fn default_schema_adapter_non_nullable_columns() {
-        let table_schema = Schema::new(vec![
-            Field::new("a", DataType::Int32, false), // "a"" is declared non nullable
-            Field::new("b", DataType::Utf8, true),
-        ]);
-        let file_schema = Schema::new(vec![
-            // since file doesn't have "a" it will be filled with nulls
-            Field::new("b", DataType::Float64, true),
-        ]);
-
-        let adapter = DefaultSchemaAdapterFactory::from_schema(Arc::new(table_schema));
-        let (mapper, indices) = adapter.map_schema(&file_schema).unwrap();
-        assert_eq!(indices, vec![0]);
-
-        let file_batch = record_batch!(("b", Float64, vec![1.0, 2.0])).unwrap();
-
-        // Mapping fails because it tries to fill in a non-nullable column with nulls
-        let err = mapper.map_batch(file_batch).unwrap_err().to_string();
-        assert!(err.contains("Invalid argument error: Column 'a' is declared as non-nullable but contains null values"), "{err}");
+        ");
     }
 
     #[derive(Debug)]
-    struct TestSchemaAdapterFactory;
+    struct TestPhysicalExprAdapterFactory;
 
-    impl SchemaAdapterFactory for TestSchemaAdapterFactory {
+    impl PhysicalExprAdapterFactory for TestPhysicalExprAdapterFactory {
         fn create(
             &self,
-            projected_table_schema: SchemaRef,
-            _table_schema: SchemaRef,
-        ) -> Box<dyn SchemaAdapter> {
-            Box::new(TestSchemaAdapter {
-                table_schema: projected_table_schema,
-            })
+            _logical_file_schema: SchemaRef,
+            physical_file_schema: SchemaRef,
+        ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+            Ok(Arc::new(TestPhysicalExprAdapter {
+                physical_file_schema,
+            }))
         }
     }
 
-    struct TestSchemaAdapter {
-        /// Schema for the table
-        table_schema: SchemaRef,
+    #[derive(Debug)]
+    struct TestPhysicalExprAdapter {
+        physical_file_schema: SchemaRef,
     }
 
-    impl SchemaAdapter for TestSchemaAdapter {
-        fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-            let field = self.table_schema.field(index);
-            Some(file_schema.fields.find(field.name())?.0)
-        }
-
-        fn map_schema(
-            &self,
-            file_schema: &Schema,
-        ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-            let mut projection = Vec::with_capacity(file_schema.fields().len());
-
-            for (file_idx, file_field) in file_schema.fields.iter().enumerate() {
-                if self.table_schema.fields().find(file_field.name()).is_some() {
-                    projection.push(file_idx);
+    impl PhysicalExprAdapter for TestPhysicalExprAdapter {
+        fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+            expr.transform(|e| {
+                if let Some(column) = e.as_any().downcast_ref::<Column>() {
+                    // If column is "extra_column" and missing from physical schema, inject "foo"
+                    if column.name() == "extra_column"
+                        && self.physical_file_schema.index_of("extra_column").is_err()
+                    {
+                        return Ok(Transformed::yes(Arc::new(Literal::new(
+                            ScalarValue::Utf8(Some("foo".to_string())),
+                        ))
+                            as Arc<dyn PhysicalExpr>));
+                    }
                 }
-            }
-
-            Ok((Arc::new(TestSchemaMapping {}), projection))
-        }
-    }
-
-    #[derive(Debug)]
-    struct TestSchemaMapping {}
-
-    impl SchemaMapper for TestSchemaMapping {
-        fn map_batch(
-            &self,
-            batch: RecordBatch,
-        ) -> datafusion_common::Result<RecordBatch> {
-            let f1 = Field::new("id", DataType::Int32, true);
-            let f2 = Field::new("extra_column", DataType::Utf8, true);
-
-            let schema = Arc::new(Schema::new(vec![f1, f2]));
-
-            let extra_column = Arc::new(StringArray::from(vec!["foo"]));
-            let mut new_columns = batch.columns().to_vec();
-            new_columns.push(extra_column);
-
-            Ok(RecordBatch::try_new(schema, new_columns).unwrap())
-        }
-
-        fn map_column_statistics(
-            &self,
-            _file_col_statistics: &[datafusion_common::ColumnStatistics],
-        ) -> datafusion_common::Result<Vec<datafusion_common::ColumnStatistics>> {
-            unimplemented!()
+                Ok(Transformed::no(e))
+            })
+            .data()
         }
     }
 }
diff --git a/datafusion/core/src/datasource/physical_plan/arrow.rs b/datafusion/core/src/datasource/physical_plan/arrow.rs
new file mode 100644
index 0000000000000..392eaa8c4be49
--- /dev/null
+++ b/datafusion/core/src/datasource/physical_plan/arrow.rs
@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Reexports the [`datafusion_datasource_arrow::source`] module, containing [Arrow] based [`FileSource`].
+//!
+//! [Arrow]: https://arrow.apache.org/docs/python/ipc.html
+//! [`FileSource`]: datafusion_datasource::file::FileSource
+
+pub use datafusion_datasource_arrow::source::*;
diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs
deleted file mode 100644
index 6de72aa8ff720..0000000000000
--- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs
+++ /dev/null
@@ -1,239 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::any::Any;
-use std::sync::Arc;
-
-use crate::datasource::physical_plan::{FileMeta, FileOpenFuture, FileOpener};
-use crate::error::Result;
-use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
-use datafusion_datasource::{as_file_source, impl_schema_adapter_methods};
-
-use arrow::buffer::Buffer;
-use arrow::datatypes::SchemaRef;
-use arrow_ipc::reader::FileDecoder;
-use datafusion_common::Statistics;
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::FileScanConfig;
-use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-
-use futures::StreamExt;
-use itertools::Itertools;
-use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore};
-
-/// Arrow configuration struct that is given to DataSourceExec
-/// Does not hold anything special, since [`FileScanConfig`] is sufficient for arrow
-#[derive(Clone, Default)]
-pub struct ArrowSource {
-    metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
-}
-
-impl From<ArrowSource> for Arc<dyn FileSource> {
-    fn from(source: ArrowSource) -> Self {
-        as_file_source(source)
-    }
-}
-
-impl FileSource for ArrowSource {
-    fn create_file_opener(
-        &self,
-        object_store: Arc<dyn ObjectStore>,
-        base_config: &FileScanConfig,
-        _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(ArrowOpener {
-            object_store,
-            projection: base_config.file_column_projection_indices(),
-        })
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-
-    fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
-        Arc::new(conf)
-    }
-
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        &self.metrics
-    }
-
-    fn statistics(&self) -> Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        Ok(statistics
-            .clone()
-            .expect("projected_statistics must be set"))
-    }
-
-    fn file_type(&self) -> &str {
-        "arrow"
-    }
-
-    impl_schema_adapter_methods!();
-}
-
-/// The struct arrow that implements `[FileOpener]` trait
-pub struct ArrowOpener {
-    pub object_store: Arc<dyn ObjectStore>,
-    pub projection: Option<Vec<usize>>,
-}
-
-impl FileOpener for ArrowOpener {
-    fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture> {
-        let object_store = Arc::clone(&self.object_store);
-        let projection = self.projection.clone();
-        Ok(Box::pin(async move {
-            let range = file_meta.range.clone();
-            match range {
-                None => {
-                    let r = object_store.get(file_meta.location()).await?;
-                    match r.payload {
-                        #[cfg(not(target_arch = "wasm32"))]
-                        GetResultPayload::File(file, _) => {
-                            let arrow_reader = arrow::ipc::reader::FileReader::try_new(
-                                file, projection,
-                            )?;
-                            Ok(futures::stream::iter(arrow_reader).boxed())
-                        }
-                        GetResultPayload::Stream(_) => {
-                            let bytes = r.bytes().await?;
-                            let cursor = std::io::Cursor::new(bytes);
-                            let arrow_reader = arrow::ipc::reader::FileReader::try_new(
-                                cursor, projection,
-                            )?;
-                            Ok(futures::stream::iter(arrow_reader).boxed())
-                        }
-                    }
-                }
-                Some(range) => {
-                    // range is not none, the file maybe split into multiple parts to scan in parallel
-                    // get footer_len firstly
-                    let get_option = GetOptions {
-                        range: Some(GetRange::Suffix(10)),
-                        ..Default::default()
-                    };
-                    let get_result = object_store
-                        .get_opts(file_meta.location(), get_option)
-                        .await?;
-                    let footer_len_buf = get_result.bytes().await?;
-                    let footer_len = arrow_ipc::reader::read_footer_length(
-                        footer_len_buf[..].try_into().unwrap(),
-                    )?;
-                    // read footer according to footer_len
-                    let get_option = GetOptions {
-                        range: Some(GetRange::Suffix(10 + (footer_len as u64))),
-                        ..Default::default()
-                    };
-                    let get_result = object_store
-                        .get_opts(file_meta.location(), get_option)
-                        .await?;
-                    let footer_buf = get_result.bytes().await?;
-                    let footer = arrow_ipc::root_as_footer(
-                        footer_buf[..footer_len].try_into().unwrap(),
-                    )
-                    .map_err(|err| {
-                        arrow::error::ArrowError::ParseError(format!(
-                            "Unable to get root as footer: {err:?}"
-                        ))
-                    })?;
-                    // build decoder according to footer & projection
-                    let schema =
-                        arrow_ipc::convert::fb_to_schema(footer.schema().unwrap());
-                    let mut decoder = FileDecoder::new(schema.into(), footer.version());
-                    if let Some(projection) = projection {
-                        decoder = decoder.with_projection(projection);
-                    }
-                    let dict_ranges = footer
-                        .dictionaries()
-                        .iter()
-                        .flatten()
-                        .map(|block| {
-                            let block_len =
-                                block.bodyLength() as u64 + block.metaDataLength() as u64;
-                            let block_offset = block.offset() as u64;
-                            block_offset..block_offset + block_len
-                        })
-                        .collect_vec();
-                    let dict_results = object_store
-                        .get_ranges(file_meta.location(), &dict_ranges)
-                        .await?;
-                    for (dict_block, dict_result) in
-                        footer.dictionaries().iter().flatten().zip(dict_results)
-                    {
-                        decoder
-                            .read_dictionary(dict_block, &Buffer::from(dict_result))?;
-                    }
-
-                    // filter recordbatches according to range
-                    let recordbatches = footer
-                        .recordBatches()
-                        .iter()
-                        .flatten()
-                        .filter(|block| {
-                            let block_offset = block.offset() as u64;
-                            block_offset >= range.start as u64
-                                && block_offset < range.end as u64
-                        })
-                        .copied()
-                        .collect_vec();
-
-                    let recordbatch_ranges = recordbatches
-                        .iter()
-                        .map(|block| {
-                            let block_len =
-                                block.bodyLength() as u64 + block.metaDataLength() as u64;
-                            let block_offset = block.offset() as u64;
-                            block_offset..block_offset + block_len
-                        })
-                        .collect_vec();
-
-                    let recordbatch_results = object_store
-                        .get_ranges(file_meta.location(), &recordbatch_ranges)
-                        .await?;
-
-                    Ok(futures::stream::iter(
-                        recordbatches
-                            .into_iter()
-                            .zip(recordbatch_results)
-                            .filter_map(move |(block, data)| {
-                                decoder
-                                    .read_record_batch(&block, &Buffer::from(data))
-                                    .transpose()
-                            }),
-                    )
-                    .boxed())
-                }
-            }
-        }))
-    }
-}
diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs
index 8a00af959ccc9..2954a47403299 100644
--- a/datafusion/core/src/datasource/physical_plan/avro.rs
+++ b/datafusion/core/src/datasource/physical_plan/avro.rs
@@ -31,21 +31,21 @@ mod tests {
     use crate::test::object_store::local_unpartitioned_file;
     use arrow::datatypes::{DataType, Field, SchemaBuilder};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{test_util, Result, ScalarValue};
+    use datafusion_common::{Result, ScalarValue, test_util};
     use datafusion_datasource::file_format::FileFormat;
     use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-    use datafusion_datasource::PartitionedFile;
-    use datafusion_datasource_avro::source::AvroSource;
+    use datafusion_datasource::{PartitionedFile, TableSchema};
     use datafusion_datasource_avro::AvroFormat;
+    use datafusion_datasource_avro::source::AvroSource;
     use datafusion_execution::object_store::ObjectStoreUrl;
     use datafusion_physical_plan::ExecutionPlan;
 
     use datafusion_datasource::source::DataSourceExec;
     use futures::StreamExt;
     use insta::assert_snapshot;
+    use object_store::ObjectStore;
     use object_store::chunked::ChunkedStore;
     use object_store::local::LocalFileSystem;
-    use object_store::ObjectStore;
     use rstest::*;
     use url::Url;
 
@@ -81,15 +81,11 @@ mod tests {
             .infer_schema(&state, &store, std::slice::from_ref(&meta))
             .await?;
 
-        let source = Arc::new(AvroSource::new());
-        let conf = FileScanConfigBuilder::new(
-            ObjectStoreUrl::local_filesystem(),
-            file_schema,
-            source,
-        )
-        .with_file(meta.into())
-        .with_projection(Some(vec![0, 1, 2]))
-        .build();
+        let source = Arc::new(AvroSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+            .with_file(meta.into())
+            .with_projection_indices(Some(vec![0, 1, 2]))?
+            .build();
 
         let source_exec = DataSourceExec::from_data_source(conf);
         assert_eq!(
@@ -109,20 +105,20 @@ mod tests {
             .expect("plan iterator empty")
             .expect("plan iterator returned an error");
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###"
-            +----+----------+-------------+
-            | id | bool_col | tinyint_col |
-            +----+----------+-------------+
-            | 4  | true     | 0           |
-            | 5  | false    | 1           |
-            | 6  | true     | 0           |
-            | 7  | false    | 1           |
-            | 2  | true     | 0           |
-            | 3  | false    | 1           |
-            | 0  | true     | 0           |
-            | 1  | false    | 1           |
-            +----+----------+-------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r"
+        +----+----------+-------------+
+        | id | bool_col | tinyint_col |
+        +----+----------+-------------+
+        | 4  | true     | 0           |
+        | 5  | false    | 1           |
+        | 6  | true     | 0           |
+        | 7  | false    | 1           |
+        | 2  | true     | 0           |
+        | 3  | false    | 1           |
+        | 0  | true     | 0           |
+        | 1  | false    | 1           |
+        +----+----------+-------------+
+        ");}
 
         let batch = results.next().await;
         assert!(batch.is_none());
@@ -157,10 +153,10 @@ mod tests {
         // Include the missing column in the projection
         let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]);
 
-        let source = Arc::new(AvroSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let source = Arc::new(AvroSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             .with_file(meta.into())
-            .with_projection(projection)
+            .with_projection_indices(projection)?
             .build();
 
         let source_exec = DataSourceExec::from_data_source(conf);
@@ -182,20 +178,20 @@ mod tests {
             .expect("plan iterator empty")
             .expect("plan iterator returned an error");
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###"
-            +----+----------+-------------+-------------+
-            | id | bool_col | tinyint_col | missing_col |
-            +----+----------+-------------+-------------+
-            | 4  | true     | 0           |             |
-            | 5  | false    | 1           |             |
-            | 6  | true     | 0           |             |
-            | 7  | false    | 1           |             |
-            | 2  | true     | 0           |             |
-            | 3  | false    | 1           |             |
-            | 0  | true     | 0           |             |
-            | 1  | false    | 1           |             |
-            +----+----------+-------------+-------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r"
+        +----+----------+-------------+-------------+
+        | id | bool_col | tinyint_col | missing_col |
+        +----+----------+-------------+-------------+
+        | 4  | true     | 0           |             |
+        | 5  | false    | 1           |             |
+        | 6  | true     | 0           |             |
+        | 7  | false    | 1           |             |
+        | 2  | true     | 0           |             |
+        | 3  | false    | 1           |             |
+        | 0  | true     | 0           |             |
+        | 1  | false    | 1           |             |
+        +----+----------+-------------+-------------+
+        ");}
 
         let batch = results.next().await;
         assert!(batch.is_none());
@@ -227,13 +223,16 @@ mod tests {
         partitioned_file.partition_values = vec![ScalarValue::from("2021-10-26")];
 
         let projection = Some(vec![0, 1, file_schema.fields().len(), 2]);
-        let source = Arc::new(AvroSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let table_schema = TableSchema::new(
+            file_schema.clone(),
+            vec![Arc::new(Field::new("date", DataType::Utf8, false))],
+        );
+        let source = Arc::new(AvroSource::new(table_schema.clone()));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             // select specific columns of the files as well as the partitioning
             // column which is supposed to be the last column in the table schema.
-            .with_projection(projection)
+            .with_projection_indices(projection)?
             .with_file(partitioned_file)
-            .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)])
             .build();
 
         let source_exec = DataSourceExec::from_data_source(conf);
@@ -256,20 +255,20 @@ mod tests {
             .expect("plan iterator empty")
             .expect("plan iterator returned an error");
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###"
-            +----+----------+------------+-------------+
-            | id | bool_col | date       | tinyint_col |
-            +----+----------+------------+-------------+
-            | 4  | true     | 2021-10-26 | 0           |
-            | 5  | false    | 2021-10-26 | 1           |
-            | 6  | true     | 2021-10-26 | 0           |
-            | 7  | false    | 2021-10-26 | 1           |
-            | 2  | true     | 2021-10-26 | 0           |
-            | 3  | false    | 2021-10-26 | 1           |
-            | 0  | true     | 2021-10-26 | 0           |
-            | 1  | false    | 2021-10-26 | 1           |
-            +----+----------+------------+-------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r"
+        +----+----------+------------+-------------+
+        | id | bool_col | date       | tinyint_col |
+        +----+----------+------------+-------------+
+        | 4  | true     | 2021-10-26 | 0           |
+        | 5  | false    | 2021-10-26 | 1           |
+        | 6  | true     | 2021-10-26 | 0           |
+        | 7  | false    | 2021-10-26 | 1           |
+        | 2  | true     | 2021-10-26 | 0           |
+        | 3  | false    | 2021-10-26 | 1           |
+        | 0  | true     | 2021-10-26 | 0           |
+        | 1  | false    | 2021-10-26 | 1           |
+        +----+----------+------------+-------------+
+        ");}
 
         let batch = results.next().await;
         assert!(batch.is_none());
diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs
index 3ef4030134520..82c47b6c7281c 100644
--- a/datafusion/core/src/datasource/physical_plan/csv.rs
+++ b/datafusion/core/src/datasource/physical_plan/csv.rs
@@ -29,18 +29,21 @@ mod tests {
     use std::io::Write;
     use std::sync::Arc;
 
+    use datafusion_datasource::TableSchema;
     use datafusion_datasource_csv::CsvFormat;
-    use object_store::ObjectStore;
+    use object_store::{ObjectStore, ObjectStoreExt};
 
+    use crate::datasource::file_format::FileFormat;
     use crate::prelude::CsvReadOptions;
     use crate::prelude::SessionContext;
     use crate::test::partitioned_file_groups;
+    use datafusion_common::config::CsvOptions;
     use datafusion_common::test_util::arrow_test_data;
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{assert_batches_eq, Result};
+    use datafusion_common::{Result, assert_batches_eq};
     use datafusion_execution::config::SessionConfig;
-    use datafusion_physical_plan::metrics::MetricsSet;
     use datafusion_physical_plan::ExecutionPlan;
+    use datafusion_physical_plan::metrics::MetricsSet;
 
     #[cfg(feature = "compression")]
     use datafusion_datasource::file_compression_type::FileCompressionType;
@@ -94,34 +97,41 @@ mod tests {
     async fn csv_exec_with_projection(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        use datafusion_datasource::TableSchema;
+
         let session_ctx = SessionContext::new();
         let task_ctx = session_ctx.task_ctx();
         let file_schema = aggr_test_schema();
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;
 
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_file_compression_type(file_compression_type)
-        .with_newlines_in_values(false)
-        .with_projection(Some(vec![0, 2, 4]))
-        .build();
-
-        assert_eq!(13, config.file_schema.fields().len());
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type)
+                .with_projection_indices(Some(vec![0, 2, 4]))?
+                .build();
+
+        assert_eq!(13, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
 
         assert_eq!(3, csv.schema().fields().len());
@@ -131,17 +141,17 @@ mod tests {
         assert_eq!(3, batch.num_columns());
         assert_eq!(100, batch.num_rows());
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###"
-            +----+-----+------------+
-            | c1 | c3  | c5         |
-            +----+-----+------------+
-            | c  | 1   | 2033001162 |
-            | d  | -40 | 706441268  |
-            | b  | 29  | 994303988  |
-            | a  | -85 | 1171968280 |
-            | b  | -82 | 1824882165 |
-            +----+-----+------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r"
+        +----+-----+------------+
+        | c1 | c3  | c5         |
+        +----+-----+------------+
+        | c  | 1   | 2033001162 |
+        | d  | -40 | 706441268  |
+        | b  | 29  | 994303988  |
+        | a  | -85 | 1171968280 |
+        | b  | -82 | 1824882165 |
+        +----+-----+------------+
+        ");}
         Ok(())
     }
 
@@ -158,6 +168,8 @@ mod tests {
     async fn csv_exec_with_mixed_order_projection(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        use datafusion_datasource::TableSchema;
+
         let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true");
         let session_ctx = SessionContext::new_with_config(cfg);
         let task_ctx = session_ctx.task_ctx();
@@ -165,27 +177,32 @@ mod tests {
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;
 
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_newlines_in_values(false)
-        .with_file_compression_type(file_compression_type.to_owned())
-        .with_projection(Some(vec![4, 0, 2]))
-        .build();
-        assert_eq!(13, config.file_schema.fields().len());
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type.to_owned())
+                .with_projection_indices(Some(vec![4, 0, 2]))?
+                .build();
+        assert_eq!(13, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
         assert_eq!(3, csv.schema().fields().len());
 
@@ -194,17 +211,17 @@ mod tests {
         assert_eq!(3, batch.num_columns());
         assert_eq!(100, batch.num_rows());
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###"
-            +------------+----+-----+
-            | c5         | c1 | c3  |
-            +------------+----+-----+
-            | 2033001162 | c  | 1   |
-            | 706441268  | d  | -40 |
-            | 994303988  | b  | 29  |
-            | 1171968280 | a  | -85 |
-            | 1824882165 | b  | -82 |
-            +------------+----+-----+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r"
+        +------------+----+-----+
+        | c5         | c1 | c3  |
+        +------------+----+-----+
+        | 2033001162 | c  | 1   |
+        | 706441268  | d  | -40 |
+        | 994303988  | b  | 29  |
+        | 1171968280 | a  | -85 |
+        | 1824882165 | b  | -82 |
+        +------------+----+-----+
+        ");}
         Ok(())
     }
 
@@ -221,6 +238,7 @@ mod tests {
     async fn csv_exec_with_limit(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        use datafusion_datasource::TableSchema;
         use futures::StreamExt;
 
         let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true");
@@ -230,27 +248,32 @@ mod tests {
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;
 
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_newlines_in_values(false)
-        .with_file_compression_type(file_compression_type.to_owned())
-        .with_limit(Some(5))
-        .build();
-        assert_eq!(13, config.file_schema.fields().len());
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type.to_owned())
+                .with_limit(Some(5))
+                .build();
+        assert_eq!(13, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
         assert_eq!(13, csv.schema().fields().len());
 
@@ -259,17 +282,17 @@ mod tests {
         assert_eq!(13, batch.num_columns());
         assert_eq!(5, batch.num_rows());
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r###"
-            +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
-            | c1 | c2 | c3  | c4     | c5         | c6                   | c7  | c8    | c9         | c10                  | c11         | c12                 | c13                            |
-            +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
-            | c  | 2  | 1   | 18109  | 2033001162 | -6513304855495910254 | 25  | 43062 | 1491205016 | 5863949479783605708  | 0.110830784 | 0.9294097332465232  | 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW |
-            | d  | 5  | -40 | 22614  | 706441268  | -7542719935673075327 | 155 | 14337 | 3373581039 | 11720144131976083864 | 0.69632107  | 0.3114712539863804  | C2GT5KVyOPZpgKVl110TyZO0NcJ434 |
-            | b  | 1  | 29  | -18218 | 994303988  | 5983957848665088916  | 204 | 9489  | 3275293996 | 14857091259186476033 | 0.53840446  | 0.17909035118828576 | AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz |
-            | a  | 1  | -85 | -15154 | 1171968280 | 1919439543497968449  | 77  | 52286 | 774637006  | 12101411955859039553 | 0.12285209  | 0.6864391962767343  | 0keZ5G8BffGwgF2RwQD59TFzMStxCB |
-            | b  | 5  | -82 | 22080  | 1824882165 | 7373730676428214987  | 208 | 34331 | 3342719438 | 3330177516592499461  | 0.82634634  | 0.40975383525297016 | Ig1QcuKsjHXkproePdERo2w0mYzIqd |
-            +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch]), @r"
+        +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
+        | c1 | c2 | c3  | c4     | c5         | c6                   | c7  | c8    | c9         | c10                  | c11         | c12                 | c13                            |
+        +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
+        | c  | 2  | 1   | 18109  | 2033001162 | -6513304855495910254 | 25  | 43062 | 1491205016 | 5863949479783605708  | 0.110830784 | 0.9294097332465232  | 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW |
+        | d  | 5  | -40 | 22614  | 706441268  | -7542719935673075327 | 155 | 14337 | 3373581039 | 11720144131976083864 | 0.69632107  | 0.3114712539863804  | C2GT5KVyOPZpgKVl110TyZO0NcJ434 |
+        | b  | 1  | 29  | -18218 | 994303988  | 5983957848665088916  | 204 | 9489  | 3275293996 | 14857091259186476033 | 0.53840446  | 0.17909035118828576 | AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz |
+        | a  | 1  | -85 | -15154 | 1171968280 | 1919439543497968449  | 77  | 52286 | 774637006  | 12101411955859039553 | 0.12285209  | 0.6864391962767343  | 0keZ5G8BffGwgF2RwQD59TFzMStxCB |
+        | b  | 5  | -82 | 22080  | 1824882165 | 7373730676428214987  | 208 | 34331 | 3342719438 | 3330177516592499461  | 0.82634634  | 0.40975383525297016 | Ig1QcuKsjHXkproePdERo2w0mYzIqd |
+        +----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+
+        ");}
 
         Ok(())
     }
@@ -287,33 +310,40 @@ mod tests {
     async fn csv_exec_with_missing_column(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        use datafusion_datasource::TableSchema;
+
         let session_ctx = SessionContext::new();
         let task_ctx = session_ctx.task_ctx();
         let file_schema = aggr_test_schema_with_missing_col();
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;
 
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_newlines_in_values(false)
-        .with_file_compression_type(file_compression_type.to_owned())
-        .with_limit(Some(5))
-        .build();
-        assert_eq!(14, config.file_schema.fields().len());
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type.to_owned())
+                .with_limit(Some(5))
+                .build();
+        assert_eq!(14, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
         assert_eq!(14, csv.schema().fields().len());
 
@@ -341,6 +371,7 @@ mod tests {
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
         use datafusion_common::ScalarValue;
+        use datafusion_datasource::TableSchema;
 
         let session_ctx = SessionContext::new();
         let task_ctx = session_ctx.task_ctx();
@@ -348,38 +379,45 @@ mod tests {
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
-        let file_groups = partitioned_file_groups(
+        let mut file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )?;
-
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let mut config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_newlines_in_values(false)
-        .with_file_compression_type(file_compression_type.to_owned())
-        .build();
-
-        // Add partition columns
-        config.table_partition_cols = vec![Field::new("date", DataType::Utf8, false)];
-        config.file_groups[0][0].partition_values = vec![ScalarValue::from("2021-10-26")];
-
-        // We should be able to project on the partition column
-        // Which is supposed to be after the file fields
-        config.projection = Some(vec![0, config.file_schema.fields().len()]);
+        // Add partition columns / values
+        file_groups[0][0].partition_values = vec![ScalarValue::from("2021-10-26")];
+
+        let num_file_schema_fields = file_schema.fields().len();
+
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::new(
+            Arc::clone(&file_schema),
+            vec![Arc::new(Field::new("date", DataType::Utf8, false))],
+        );
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type.to_owned())
+                // We should be able to project on the partition column
+                // Which is supposed to be after the file fields
+                .with_projection_indices(Some(vec![0, num_file_schema_fields]))?
+                .build();
 
         // we don't have `/date=xx/` in the path but that is ok because
         // partitions are resolved during scan anyway
 
-        assert_eq!(13, config.file_schema.fields().len());
+        assert_eq!(13, config.file_schema().fields().len());
         let csv = DataSourceExec::from_data_source(config);
         assert_eq!(2, csv.schema().fields().len());
 
@@ -388,17 +426,17 @@ mod tests {
         assert_eq!(2, batch.num_columns());
         assert_eq!(100, batch.num_rows());
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r###"
-            +----+------------+
-            | c1 | date       |
-            +----+------------+
-            | c  | 2021-10-26 |
-            | d  | 2021-10-26 |
-            | b  | 2021-10-26 |
-            | a  | 2021-10-26 |
-            | b  | 2021-10-26 |
-            +----+------------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&[batch.slice(0, 5)]), @r"
+        +----+------------+
+        | c1 | date       |
+        +----+------------+
+        | c  | 2021-10-26 |
+        | d  | 2021-10-26 |
+        | b  | 2021-10-26 |
+        | a  | 2021-10-26 |
+        | b  | 2021-10-26 |
+        +----+------------+
+        ");}
 
         let metrics = csv.metrics().expect("doesn't found metrics");
         let time_elapsed_processing = get_value(&metrics, "time_elapsed_processing");
@@ -452,26 +490,31 @@ mod tests {
         let path = format!("{}/csv", arrow_test_data());
         let filename = "aggregate_test_100.csv";
         let tmp_dir = TempDir::new()?;
+        let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
 
         let file_groups = partitioned_file_groups(
             path.as_str(),
             filename,
             1,
-            Arc::new(CsvFormat::default()),
+            &csv_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )
         .unwrap();
 
-        let source = Arc::new(CsvSource::new(true, b',', b'"'));
-        let config = FileScanConfigBuilder::from(partitioned_csv_config(
-            file_schema,
-            file_groups,
-            source,
-        ))
-        .with_newlines_in_values(false)
-        .with_file_compression_type(file_compression_type.to_owned())
-        .build();
+        let options = CsvOptions {
+            has_header: Some(true),
+            delimiter: b',',
+            quote: b'"',
+            ..Default::default()
+        };
+        let table_schema = TableSchema::from_file_schema(Arc::clone(&file_schema));
+        let source =
+            Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
+        let config =
+            FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
+                .with_file_compression_type(file_compression_type.to_owned())
+                .build();
         let csv = DataSourceExec::from_data_source(config);
 
         let it = csv.execute(0, task_ctx).unwrap();
@@ -527,14 +570,14 @@ mod tests {
 
         let result = df.collect().await.unwrap();
 
-        assert_snapshot!(batches_to_string(&result), @r###"
-            +---+---+
-            | a | b |
-            +---+---+
-            | 1 | 2 |
-            | 3 | 4 |
-            +---+---+
-        "###);
+        assert_snapshot!(batches_to_string(&result), @r"
+        +---+---+
+        | a | b |
+        +---+---+
+        | 1 | 2 |
+        | 3 | 4 |
+        +---+---+
+        ");
     }
 
     #[tokio::test]
@@ -556,14 +599,14 @@ mod tests {
 
         let result = df.collect().await.unwrap();
 
-        assert_snapshot!(batches_to_string(&result),@r###"
-            +---+---+
-            | a | b |
-            +---+---+
-            | 1 | 2 |
-            | 3 | 4 |
-            +---+---+
-        "###);
+        assert_snapshot!(batches_to_string(&result),@r"
+        +---+---+
+        | a | b |
+        +---+---+
+        | 1 | 2 |
+        | 3 | 4 |
+        +---+---+
+        ");
 
         let e = session_ctx
             .read_csv("memory:///", CsvReadOptions::new().terminator(Some(b'\n')))
@@ -572,7 +615,10 @@ mod tests {
             .collect()
             .await
             .unwrap_err();
-        assert_eq!(e.strip_backtrace(), "Arrow error: Csv error: incorrect number of fields for line 1, expected 2 got more than 2")
+        assert_eq!(
+            e.strip_backtrace(),
+            "Arrow error: Csv error: incorrect number of fields for line 1, expected 2 got more than 2"
+        )
     }
 
     #[tokio::test]
@@ -593,22 +639,22 @@ mod tests {
         .await?;
 
         let df = ctx.sql(r#"select * from t1"#).await?.collect().await?;
-        assert_snapshot!(batches_to_string(&df),@r###"
-            +------+--------+
-            | col1 | col2   |
-            +------+--------+
-            | id0  | value0 |
-            | id1  | value1 |
-            | id2  | value2 |
-            | id3  | value3 |
-            +------+--------+
-        "###);
+        assert_snapshot!(batches_to_string(&df),@r"
+        +------+--------+
+        | col1 | col2   |
+        +------+--------+
+        | id0  | value0 |
+        | id1  | value1 |
+        | id2  | value2 |
+        | id3  | value3 |
+        +------+--------+
+        ");
         Ok(())
     }
 
     #[tokio::test]
-    async fn test_create_external_table_with_terminator_with_newlines_in_values(
-    ) -> Result<()> {
+    async fn test_create_external_table_with_terminator_with_newlines_in_values()
+    -> Result<()> {
         let ctx = SessionContext::new();
         ctx.sql(r#"
             CREATE EXTERNAL TABLE t1 (
@@ -658,7 +704,10 @@ mod tests {
             )
             .await
             .expect_err("should fail because input file does not match inferred schema");
-        assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'");
+        assert_eq!(
+            e.strip_backtrace(),
+            "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"
+        );
         Ok(())
     }
 
diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs
index 0d45711c76fb0..b70791c7b2390 100644
--- a/datafusion/core/src/datasource/physical_plan/json.rs
+++ b/datafusion/core/src/datasource/physical_plan/json.rs
@@ -32,11 +32,11 @@ mod tests {
 
     use crate::dataframe::DataFrameWriteOptions;
     use crate::execution::SessionState;
-    use crate::prelude::{CsvReadOptions, NdJsonReadOptions, SessionContext};
+    use crate::prelude::{CsvReadOptions, JsonReadOptions, SessionContext};
     use crate::test::partitioned_file_groups;
+    use datafusion_common::Result;
     use datafusion_common::cast::{as_int32_array, as_int64_array, as_string_array};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::Result;
     use datafusion_datasource::file_compression_type::FileCompressionType;
     use datafusion_datasource::file_format::FileFormat;
     use datafusion_datasource_json::JsonFormat;
@@ -51,9 +51,9 @@ mod tests {
     use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
     use datafusion_datasource::source::DataSourceExec;
     use insta::assert_snapshot;
+    use object_store::ObjectStore;
     use object_store::chunked::ChunkedStore;
     use object_store::local::LocalFileSystem;
-    use object_store::ObjectStore;
     use rstest::*;
     use tempfile::TempDir;
     use url::Url;
@@ -69,11 +69,13 @@ mod tests {
         let store = state.runtime_env().object_store(&store_url).unwrap();
 
         let filename = "1.json";
+        let json_format: Arc<dyn FileFormat> = Arc::new(JsonFormat::default());
+
         let file_groups = partitioned_file_groups(
             TEST_DATA_BASE,
             filename,
             1,
-            Arc::new(JsonFormat::default()),
+            &json_format,
             file_compression_type.to_owned(),
             work_dir,
         )
@@ -104,11 +106,13 @@ mod tests {
         ctx.register_object_store(&url, store.clone());
         let filename = "1.json";
         let tmp_dir = TempDir::new()?;
+        let json_format: Arc<dyn FileFormat> = Arc::new(JsonFormat::default());
+
         let file_groups = partitioned_file_groups(
             TEST_DATA_BASE,
             filename,
             1,
-            Arc::new(JsonFormat::default()),
+            &json_format,
             file_compression_type.to_owned(),
             tmp_dir.path(),
         )
@@ -132,22 +136,22 @@ mod tests {
             .get_ext_with_compression(&file_compression_type)
             .unwrap();
 
-        let read_options = NdJsonReadOptions::default()
+        let read_options = JsonReadOptions::default()
             .file_extension(ext.as_str())
             .file_compression_type(file_compression_type.to_owned());
         let frame = ctx.read_json(path, read_options).await.unwrap();
         let results = frame.collect().await.unwrap();
 
-        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&results), @r###"
-            +-----+------------------+---------------+------+
-            | a   | b                | c             | d    |
-            +-----+------------------+---------------+------+
-            | 1   | [2.0, 1.3, -6.1] | [false, true] | 4    |
-            | -10 | [2.0, 1.3, -6.1] | [true, true]  | 4    |
-            | 2   | [2.0, , -6.1]    | [false, ]     | text |
-            |     |                  |               |      |
-            +-----+------------------+---------------+------+
-        "###);}
+        insta::allow_duplicates! {assert_snapshot!(batches_to_string(&results), @r"
+        +-----+------------------+---------------+------+
+        | a   | b                | c             | d    |
+        +-----+------------------+---------------+------+
+        | 1   | [2.0, 1.3, -6.1] | [false, true] | 4    |
+        | -10 | [2.0, 1.3, -6.1] | [true, true]  | 4    |
+        | 2   | [2.0, , -6.1]    | [false, ]     | text |
+        |     |                  |               |      |
+        +-----+------------------+---------------+------+
+        ");}
 
         Ok(())
     }
@@ -176,8 +180,8 @@ mod tests {
         let (object_store_url, file_groups, file_schema) =
             prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await;
 
-        let source = Arc::new(JsonSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let source = Arc::new(JsonSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             .with_file_groups(file_groups)
             .with_limit(Some(3))
             .with_file_compression_type(file_compression_type.to_owned())
@@ -251,8 +255,8 @@ mod tests {
         let file_schema = Arc::new(builder.finish());
         let missing_field_idx = file_schema.fields.len() - 1;
 
-        let source = Arc::new(JsonSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let source = Arc::new(JsonSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             .with_file_groups(file_groups)
             .with_limit(Some(3))
             .with_file_compression_type(file_compression_type.to_owned())
@@ -294,10 +298,11 @@ mod tests {
         let (object_store_url, file_groups, file_schema) =
             prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await;
 
-        let source = Arc::new(JsonSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let source = Arc::new(JsonSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             .with_file_groups(file_groups)
-            .with_projection(Some(vec![0, 2]))
+            .with_projection_indices(Some(vec![0, 2]))
+            .unwrap()
             .with_file_compression_type(file_compression_type.to_owned())
             .build();
         let exec = DataSourceExec::from_data_source(conf);
@@ -342,10 +347,10 @@ mod tests {
         let (object_store_url, file_groups, file_schema) =
             prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await;
 
-        let source = Arc::new(JsonSource::new());
-        let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+        let source = Arc::new(JsonSource::new(Arc::clone(&file_schema)));
+        let conf = FileScanConfigBuilder::new(object_store_url, source)
             .with_file_groups(file_groups)
-            .with_projection(Some(vec![3, 0, 2]))
+            .with_projection_indices(Some(vec![3, 0, 2]))?
             .with_file_compression_type(file_compression_type.to_owned())
             .build();
         let exec = DataSourceExec::from_data_source(conf);
@@ -384,7 +389,7 @@ mod tests {
         let path = format!("{TEST_DATA_BASE}/1.json");
 
         // register json file with the execution context
-        ctx.register_json("test", path.as_str(), NdJsonReadOptions::default())
+        ctx.register_json("test", path.as_str(), JsonReadOptions::default())
             .await?;
 
         // register a local file system object store for /tmp directory
@@ -426,7 +431,7 @@ mod tests {
         }
 
         // register each partition as well as the top level dir
-        let json_read_option = NdJsonReadOptions::default();
+        let json_read_option = JsonReadOptions::default();
         ctx.register_json(
             "part0",
             &format!("{out_dir}/{part_0_name}"),
@@ -494,7 +499,10 @@ mod tests {
             .write_json(out_dir_url, DataFrameWriteOptions::new(), None)
             .await
             .expect_err("should fail because input file does not match inferred schema");
-        assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'");
+        assert_eq!(
+            e.strip_backtrace(),
+            "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"
+        );
         Ok(())
     }
 
@@ -503,7 +511,7 @@ mod tests {
         async fn read_test_data(schema_infer_max_records: usize) -> Result<SchemaRef> {
             let ctx = SessionContext::new();
 
-            let options = NdJsonReadOptions {
+            let options = JsonReadOptions {
                 schema_infer_max_records,
                 ..Default::default()
             };
@@ -579,7 +587,7 @@ mod tests {
             .get_ext_with_compression(&file_compression_type)
             .unwrap();
 
-        let read_option = NdJsonReadOptions::default()
+        let read_option = JsonReadOptions::default()
             .file_compression_type(file_compression_type)
             .file_extension(ext.as_str());
 
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index 3f71b253d9695..8e4855afa66bb 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -17,7 +17,7 @@
 
 //! Execution plans that read file formats
 
-mod arrow_file;
+pub mod arrow;
 pub mod csv;
 pub mod json;
 
@@ -35,156 +35,19 @@ pub use datafusion_datasource_parquet::source::ParquetSource;
 #[cfg(feature = "parquet")]
 pub use datafusion_datasource_parquet::{ParquetFileMetrics, ParquetFileReaderFactory};
 
-pub use arrow_file::ArrowSource;
-
 pub use json::{JsonOpener, JsonSource};
 
+pub use arrow::{ArrowOpener, ArrowSource};
 pub use csv::{CsvOpener, CsvSource};
 pub use datafusion_datasource::file::FileSource;
 pub use datafusion_datasource::file_groups::FileGroup;
 pub use datafusion_datasource::file_groups::FileGroupPartitioner;
-pub use datafusion_datasource::file_meta::FileMeta;
 pub use datafusion_datasource::file_scan_config::{
-    wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig,
-    FileScanConfigBuilder,
+    FileScanConfig, FileScanConfigBuilder, wrap_partition_type_in_dict,
+    wrap_partition_value_in_dict,
 };
 pub use datafusion_datasource::file_sink_config::*;
 
 pub use datafusion_datasource::file_stream::{
-    FileOpenFuture, FileOpener, FileStream, OnError,
+    FileOpenFuture, FileOpener, FileStream, FileStreamBuilder, OnError,
 };
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::{
-        cast::AsArray,
-        types::{Float32Type, Float64Type, UInt32Type},
-        BinaryArray, BooleanArray, Float32Array, Int32Array, Int64Array, RecordBatch,
-        StringArray, UInt64Array,
-    };
-    use arrow::datatypes::{DataType, Field, Schema};
-    use arrow_schema::SchemaRef;
-
-    use crate::datasource::schema_adapter::{
-        DefaultSchemaAdapterFactory, SchemaAdapterFactory,
-    };
-
-    #[test]
-    fn schema_mapping_map_batch() {
-        let table_schema = Arc::new(Schema::new(vec![
-            Field::new("c1", DataType::Utf8, true),
-            Field::new("c2", DataType::UInt32, true),
-            Field::new("c3", DataType::Float64, true),
-        ]));
-
-        let adapter = DefaultSchemaAdapterFactory
-            .create(table_schema.clone(), table_schema.clone());
-
-        let file_schema = Schema::new(vec![
-            Field::new("c1", DataType::Utf8, true),
-            Field::new("c2", DataType::UInt64, true),
-            Field::new("c3", DataType::Float32, true),
-        ]);
-
-        let (mapping, _) = adapter.map_schema(&file_schema).expect("map schema failed");
-
-        let c1 = StringArray::from(vec!["hello", "world"]);
-        let c2 = UInt64Array::from(vec![9_u64, 5_u64]);
-        let c3 = Float32Array::from(vec![2.0_f32, 7.0_f32]);
-        let batch = RecordBatch::try_new(
-            Arc::new(file_schema),
-            vec![Arc::new(c1), Arc::new(c2), Arc::new(c3)],
-        )
-        .unwrap();
-
-        let mapped_batch = mapping.map_batch(batch).unwrap();
-
-        assert_eq!(mapped_batch.schema(), table_schema);
-        assert_eq!(mapped_batch.num_columns(), 3);
-        assert_eq!(mapped_batch.num_rows(), 2);
-
-        let c1 = mapped_batch.column(0).as_string::<i32>();
-        let c2 = mapped_batch.column(1).as_primitive::<UInt32Type>();
-        let c3 = mapped_batch.column(2).as_primitive::<Float64Type>();
-
-        assert_eq!(c1.value(0), "hello");
-        assert_eq!(c1.value(1), "world");
-        assert_eq!(c2.value(0), 9_u32);
-        assert_eq!(c2.value(1), 5_u32);
-        assert_eq!(c3.value(0), 2.0_f64);
-        assert_eq!(c3.value(1), 7.0_f64);
-    }
-
-    #[test]
-    fn schema_adapter_map_schema_with_projection() {
-        let table_schema = Arc::new(Schema::new(vec![
-            Field::new("c0", DataType::Utf8, true),
-            Field::new("c1", DataType::Utf8, true),
-            Field::new("c2", DataType::Float64, true),
-            Field::new("c3", DataType::Int32, true),
-            Field::new("c4", DataType::Float32, true),
-        ]));
-
-        let file_schema = Schema::new(vec![
-            Field::new("id", DataType::Int32, true),
-            Field::new("c1", DataType::Boolean, true),
-            Field::new("c2", DataType::Float32, true),
-            Field::new("c3", DataType::Binary, true),
-            Field::new("c4", DataType::Int64, true),
-        ]);
-
-        let indices = vec![1, 2, 4];
-        let schema = SchemaRef::from(table_schema.project(&indices).unwrap());
-        let adapter = DefaultSchemaAdapterFactory.create(schema, table_schema.clone());
-        let (mapping, projection) = adapter.map_schema(&file_schema).unwrap();
-
-        let id = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
-        let c1 = BooleanArray::from(vec![Some(true), Some(false), Some(true)]);
-        let c2 = Float32Array::from(vec![Some(2.0_f32), Some(7.0_f32), Some(3.0_f32)]);
-        let c3 = BinaryArray::from_opt_vec(vec![
-            Some(b"hallo"),
-            Some(b"danke"),
-            Some(b"super"),
-        ]);
-        let c4 = Int64Array::from(vec![1, 2, 3]);
-        let batch = RecordBatch::try_new(
-            Arc::new(file_schema),
-            vec![
-                Arc::new(id),
-                Arc::new(c1),
-                Arc::new(c2),
-                Arc::new(c3),
-                Arc::new(c4),
-            ],
-        )
-        .unwrap();
-        let rows_num = batch.num_rows();
-        let projected = batch.project(&projection).unwrap();
-        let mapped_batch = mapping.map_batch(projected).unwrap();
-
-        assert_eq!(
-            mapped_batch.schema(),
-            Arc::new(table_schema.project(&indices).unwrap())
-        );
-        assert_eq!(mapped_batch.num_columns(), indices.len());
-        assert_eq!(mapped_batch.num_rows(), rows_num);
-
-        let c1 = mapped_batch.column(0).as_string::<i32>();
-        let c2 = mapped_batch.column(1).as_primitive::<Float64Type>();
-        let c4 = mapped_batch.column(2).as_primitive::<Float32Type>();
-
-        assert_eq!(c1.value(0), "true");
-        assert_eq!(c1.value(1), "false");
-        assert_eq!(c1.value(2), "true");
-
-        assert_eq!(c2.value(0), 2.0_f64);
-        assert_eq!(c2.value(1), 7.0_f64);
-        assert_eq!(c2.value(2), 3.0_f64);
-
-        assert_eq!(c4.value(0), 1.0_f32);
-        assert_eq!(c4.value(1), 2.0_f32);
-        assert_eq!(c4.value(2), 3.0_f32);
-    }
-}
diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs
index 8dee79ad61b23..4c6d915d5bcaa 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet.rs
@@ -38,34 +38,35 @@ mod tests {
     use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
     use crate::test::object_store::local_unpartitioned_file;
     use arrow::array::{
-        ArrayRef, AsArray, Date64Array, Int32Array, Int64Array, Int8Array, StringArray,
-        StringViewArray, StructArray,
+        ArrayRef, AsArray, Date64Array, DictionaryArray, Int8Array, Int32Array,
+        Int64Array, StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
     };
-    use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder};
+    use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaBuilder, UInt16Type};
     use arrow::record_batch::RecordBatch;
     use arrow::util::pretty::pretty_format_batches;
     use arrow_schema::{SchemaRef, TimeUnit};
     use bytes::{BufMut, BytesMut};
     use datafusion_common::config::TableParquetOptions;
     use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
-    use datafusion_common::{assert_contains, Result, ScalarValue};
+    use datafusion_common::{Result, ScalarValue, assert_contains};
     use datafusion_datasource::file_format::FileFormat;
-    use datafusion_datasource::file_meta::FileMeta;
     use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
     use datafusion_datasource::source::DataSourceExec;
 
     use datafusion_datasource::file::FileSource;
-    use datafusion_datasource::{FileRange, PartitionedFile};
+    use datafusion_datasource::{PartitionedFile, TableSchema};
     use datafusion_datasource_parquet::source::ParquetSource;
     use datafusion_datasource_parquet::{
         DefaultParquetFileReaderFactory, ParquetFileReaderFactory, ParquetFormat,
     };
     use datafusion_execution::object_store::ObjectStoreUrl;
-    use datafusion_expr::{col, lit, when, Expr};
+    use datafusion_expr::{Expr, col, lit, when};
     use datafusion_physical_expr::planner::logical2physical;
     use datafusion_physical_plan::analyze::AnalyzeExec;
     use datafusion_physical_plan::collect;
-    use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+    use datafusion_physical_plan::metrics::{
+        ExecutionPlanMetricsSet, MetricType, MetricValue, MetricsSet,
+    };
     use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
     use chrono::{TimeZone, Utc};
@@ -160,7 +161,7 @@ mod tests {
                 .as_ref()
                 .map(|p| logical2physical(p, &table_schema));
 
-            let mut source = ParquetSource::default();
+            let mut source = ParquetSource::new(table_schema);
             if let Some(predicate) = predicate {
                 source = source.with_predicate(predicate);
             }
@@ -185,23 +186,20 @@ mod tests {
                 source = source.with_bloom_filter_on_read(false);
             }
 
-            source.with_schema(Arc::clone(&table_schema))
+            Arc::new(source)
         }
 
         fn build_parquet_exec(
             &self,
-            file_schema: SchemaRef,
             file_group: FileGroup,
             source: Arc<dyn FileSource>,
         ) -> Arc<DataSourceExec> {
-            let base_config = FileScanConfigBuilder::new(
-                ObjectStoreUrl::local_filesystem(),
-                file_schema,
-                source,
-            )
-            .with_file_group(file_group)
-            .with_projection(self.projection.clone())
-            .build();
+            let base_config =
+                FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+                    .with_file_group(file_group)
+                    .with_projection_indices(self.projection.clone())
+                    .unwrap()
+                    .build();
             DataSourceExec::from_data_source(base_config)
         }
 
@@ -230,18 +228,15 @@ mod tests {
 
             // build a ParquetExec to return the results
             let parquet_source = self.build_file_source(Arc::clone(table_schema));
-            let parquet_exec = self.build_parquet_exec(
-                Arc::clone(table_schema),
-                file_group.clone(),
-                Arc::clone(&parquet_source),
-            );
+            let parquet_exec =
+                self.build_parquet_exec(file_group.clone(), Arc::clone(&parquet_source));
 
             let analyze_exec = Arc::new(AnalyzeExec::new(
                 false,
                 false,
+                vec![MetricType::SUMMARY, MetricType::DEV],
                 // use a new ParquetSource to avoid sharing execution metrics
                 self.build_parquet_exec(
-                    Arc::clone(table_schema),
                     file_group.clone(),
                     self.build_file_source(Arc::clone(table_schema)),
                 ),
@@ -311,7 +306,7 @@ mod tests {
 
         let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap();
 
-        // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`,
+        // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`,
         // the default behavior is to fill in missing columns with nulls.
         // Thus this predicate will come back as false.
         let filter = col("c2").eq(lit(1_i32));
@@ -332,7 +327,7 @@ mod tests {
         let metric = get_value(&metrics, "pushdown_rows_pruned");
         assert_eq!(metric, 3, "Expected all rows to be pruned");
 
-        // If we excplicitly allow nulls the rest of the predicate should work
+        // If we explicitly allow nulls the rest of the predicate should work
         let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32)));
         let rt = RoundTrip::new()
             .with_table_schema(table_schema.clone())
@@ -342,13 +337,13 @@ mod tests {
             .await;
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+
         | c1 | c2 |
         +----+----+
         | 1  |    |
         +----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -369,7 +364,7 @@ mod tests {
 
         let batch = RecordBatch::try_new(file_schema.clone(), vec![c1]).unwrap();
 
-        // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`,
+        // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`,
         // the default behavior is to fill in missing columns with nulls.
         // Thus this predicate will come back as false.
         let filter = col("c2").eq(lit("abc"));
@@ -390,7 +385,7 @@ mod tests {
         let metric = get_value(&metrics, "pushdown_rows_pruned");
         assert_eq!(metric, 3, "Expected all rows to be pruned");
 
-        // If we excplicitly allow nulls the rest of the predicate should work
+        // If we explicitly allow nulls the rest of the predicate should work
         let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32)));
         let rt = RoundTrip::new()
             .with_table_schema(table_schema.clone())
@@ -400,13 +395,13 @@ mod tests {
             .await;
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+
         | c1 | c2 |
         +----+----+
         | 1  |    |
         +----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -431,7 +426,7 @@ mod tests {
 
         let batch = RecordBatch::try_new(file_schema.clone(), vec![c1, c3]).unwrap();
 
-        // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`,
+        // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`,
         // the default behavior is to fill in missing columns with nulls.
         // Thus this predicate will come back as false.
         let filter = col("c2").eq(lit("abc"));
@@ -452,7 +447,7 @@ mod tests {
         let metric = get_value(&metrics, "pushdown_rows_pruned");
         assert_eq!(metric, 3, "Expected all rows to be pruned");
 
-        // If we excplicitly allow nulls the rest of the predicate should work
+        // If we explicitly allow nulls the rest of the predicate should work
         let filter = col("c2").is_null().and(col("c1").eq(lit(1_i32)));
         let rt = RoundTrip::new()
             .with_table_schema(table_schema.clone())
@@ -462,13 +457,13 @@ mod tests {
             .await;
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+----+
         | c1 | c2 | c3 |
         +----+----+----+
         | 1  |    | 7  |
         +----+----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -493,7 +488,7 @@ mod tests {
         let batch =
             RecordBatch::try_new(file_schema.clone(), vec![c3.clone(), c3]).unwrap();
 
-        // Since c2 is missing from the file and we didn't supply a custom `SchemaAdapterFactory`,
+        // Since c2 is missing from the file and we didn't supply a custom `PhysicalExprAdapterFactory`,
         // the default behavior is to fill in missing columns with nulls.
         // Thus this predicate will come back as false.
         let filter = col("c2").eq(lit("abc"));
@@ -514,7 +509,7 @@ mod tests {
         let metric = get_value(&metrics, "pushdown_rows_pruned");
         assert_eq!(metric, 3, "Expected all rows to be pruned");
 
-        // If we excplicitly allow nulls the rest of the predicate should work
+        // If we explicitly allow nulls the rest of the predicate should work
         let filter = col("c2").is_null().and(col("c3").eq(lit(7_i32)));
         let rt = RoundTrip::new()
             .with_table_schema(table_schema.clone())
@@ -524,13 +519,13 @@ mod tests {
             .await;
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+----+
         | c1 | c2 | c3 |
         +----+----+----+
         |    |    | 7  |
         +----+----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -573,13 +568,13 @@ mod tests {
 
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+----+
         | c1 | c2 | c3 |
         +----+----+----+
         | 1  |    | 10 |
         +----+----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -603,7 +598,7 @@ mod tests {
 
         let batches = rt.batches.unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&batches),@r###"
+        insta::assert_snapshot!(batches_to_sort_string(&batches),@r"
         +----+----+----+
         | c1 | c2 | c3 |
         +----+----+----+
@@ -611,7 +606,7 @@ mod tests {
         | 4  |    | 40 |
         | 5  |    | 50 |
         +----+----+----+
-        "###);
+        ");
 
         let metrics = rt.parquet_exec.metrics().unwrap();
         let metric = get_value(&metrics, "pushdown_rows_pruned");
@@ -640,7 +635,7 @@ mod tests {
             .await
             .unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read), @r###"
+        insta::assert_snapshot!(batches_to_sort_string(&read), @r"
         +-----+----+----+
         | c1  | c2 | c3 |
         +-----+----+----+
@@ -654,7 +649,7 @@ mod tests {
         | bar |    |    |
         | bar |    |    |
         +-----+----+----+
-        "###);
+        ");
     }
 
     #[tokio::test]
@@ -755,18 +750,18 @@ mod tests {
             .await
             .unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read),@r###"
-            +-----+----+----+
-            | c1  | c3 | c2 |
-            +-----+----+----+
-            |     |    |    |
-            |     | 10 | 1  |
-            |     | 20 |    |
-            |     | 20 | 2  |
-            | Foo | 10 |    |
-            | bar |    |    |
-            +-----+----+----+
-        "###);
+        insta::assert_snapshot!(batches_to_sort_string(&read),@r"
+        +-----+----+----+
+        | c1  | c3 | c2 |
+        +-----+----+----+
+        |     |    |    |
+        |     | 10 | 1  |
+        |     | 20 |    |
+        |     | 20 | 2  |
+        | Foo | 10 |    |
+        | bar |    |    |
+        +-----+----+----+
+        ");
     }
 
     #[tokio::test]
@@ -787,14 +782,14 @@ mod tests {
             .round_trip(vec![batch1, batch2])
             .await;
 
-        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###"
+        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r"
         +----+----+----+
         | c1 | c3 | c2 |
         +----+----+----+
         |    | 10 | 1  |
         |    | 20 | 2  |
         +----+----+----+
-        "###);
+        ");
         let metrics = rt.parquet_exec.metrics().unwrap();
         // Note there are were 6 rows in total (across three batches)
         assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 4);
@@ -830,7 +825,7 @@ mod tests {
             .await
             .unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read), @r###"
+        insta::assert_snapshot!(batches_to_sort_string(&read), @r"
         +-----+-----+
         | c1  | c4  |
         +-----+-----+
@@ -841,7 +836,7 @@ mod tests {
         | bar |     |
         | bar |     |
         +-----+-----+
-        "###);
+        ");
     }
 
     #[tokio::test]
@@ -960,6 +955,73 @@ mod tests {
         assert_eq!(read, 2, "Expected 2 rows to match the predicate");
     }
 
+    #[tokio::test]
+    async fn evolved_schema_column_type_filter_timestamp_units() {
+        // The table and filter have a common data type
+        // The table schema is in milliseconds, but the file schema is in nanoseconds
+        let c1: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![
+            Some(1_000_000_000), // 1970-01-01T00:00:01Z
+            Some(2_000_000_000), // 1970-01-01T00:00:02Z
+            Some(3_000_000_000), // 1970-01-01T00:00:03Z
+            Some(4_000_000_000), // 1970-01-01T00:00:04Z
+        ]));
+        let batch = create_batch(vec![("c1", c1.clone())]);
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "c1",
+            DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())),
+            false,
+        )]));
+        // One row should match, 2 pruned via page index, 1 pruned via filter pushdown
+        let filter = col("c1").eq(lit(ScalarValue::TimestampMillisecond(
+            Some(1_000),
+            Some("UTC".into()),
+        )));
+        let rt = RoundTrip::new()
+            .with_predicate(filter)
+            .with_pushdown_predicate()
+            .with_page_index_predicate() // produces pages with 2 rows each (2 pages total for our data)
+            .with_table_schema(table_schema.clone())
+            .round_trip(vec![batch.clone()])
+            .await;
+        // There should be no predicate evaluation errors and we keep 1 row
+        let metrics = rt.parquet_exec.metrics().unwrap();
+        assert_eq!(get_value(&metrics, "predicate_evaluation_errors"), 0);
+        let read = rt
+            .batches
+            .unwrap()
+            .iter()
+            .map(|b| b.num_rows())
+            .sum::<usize>();
+        assert_eq!(read, 1, "Expected 1 rows to match the predicate");
+        assert_eq!(get_value(&metrics, "row_groups_pruned_statistics"), 0);
+        assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 2);
+        assert_eq!(get_value(&metrics, "page_index_pages_pruned"), 1);
+        assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 1);
+        // If we filter with a value that is completely out of the range of the data
+        // we prune at the row group level.
+        let filter = col("c1").eq(lit(ScalarValue::TimestampMillisecond(
+            Some(5_000),
+            Some("UTC".into()),
+        )));
+        let rt = RoundTrip::new()
+            .with_predicate(filter)
+            .with_pushdown_predicate()
+            .with_table_schema(table_schema)
+            .round_trip(vec![batch])
+            .await;
+        // There should be no predicate evaluation errors and we keep 0 rows
+        let metrics = rt.parquet_exec.metrics().unwrap();
+        assert_eq!(get_value(&metrics, "predicate_evaluation_errors"), 0);
+        let read = rt
+            .batches
+            .unwrap()
+            .iter()
+            .map(|b| b.num_rows())
+            .sum::<usize>();
+        assert_eq!(read, 0, "Expected 0 rows to match the predicate");
+        assert_eq!(get_value(&metrics, "row_groups_pruned_statistics"), 1);
+    }
+
     #[tokio::test]
     async fn evolved_schema_disjoint_schema_filter() {
         let c1: ArrayRef =
@@ -988,18 +1050,18 @@ mod tests {
         // In a real query where this predicate was pushed down from a filter stage instead of created directly in the `DataSourceExec`,
         // the filter stage would be preserved as a separate execution plan stage so the actual query results would be as expected.
 
-        insta::assert_snapshot!(batches_to_sort_string(&read),@r###"
-            +-----+----+
-            | c1  | c2 |
-            +-----+----+
-            |     |    |
-            |     |    |
-            |     | 1  |
-            |     | 2  |
-            | Foo |    |
-            | bar |    |
-            +-----+----+
-        "###);
+        insta::assert_snapshot!(batches_to_sort_string(&read),@r"
+        +-----+----+
+        | c1  | c2 |
+        +-----+----+
+        |     |    |
+        |     |    |
+        |     | 1  |
+        |     | 2  |
+        | Foo |    |
+        | bar |    |
+        +-----+----+
+        ");
     }
 
     #[tokio::test]
@@ -1024,13 +1086,13 @@ mod tests {
             .round_trip(vec![batch1, batch2])
             .await;
 
-        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###"
+        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r"
         +----+----+
         | c1 | c2 |
         +----+----+
         |    | 1  |
         +----+----+
-        "###);
+        ");
         let metrics = rt.parquet_exec.metrics().unwrap();
         // Note there are were 6 rows in total (across three batches)
         assert_eq!(get_value(&metrics, "pushdown_rows_pruned"), 5);
@@ -1084,7 +1146,7 @@ mod tests {
             .round_trip(vec![batch1, batch2, batch3, batch4])
             .await;
 
-        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r###"
+        insta::assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()), @r"
         +------+----+
         | c1   | c2 |
         +------+----+
@@ -1101,14 +1163,22 @@ mod tests {
         | Foo2 |    |
         | Foo3 |    |
         +------+----+
-        "###);
+        ");
         let metrics = rt.parquet_exec.metrics().unwrap();
 
         // There are 4 rows pruned in each of batch2, batch3, and
         // batch4 for a total of 12. batch1 had no pruning as c2 was
         // filled in as null
-        assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 12);
-        assert_eq!(get_value(&metrics, "page_index_rows_matched"), 6);
+        let (page_index_rows_pruned, page_index_rows_matched) =
+            get_pruning_metric(&metrics, "page_index_rows_pruned");
+        assert_eq!(page_index_rows_pruned, 12);
+        assert_eq!(page_index_rows_matched, 6);
+
+        // each page has 2 rows, so the num of pages is 1/2 the number of rows
+        let (page_index_pages_pruned, page_index_pages_matched) =
+            get_pruning_metric(&metrics, "page_index_pages_pruned");
+        assert_eq!(page_index_pages_pruned, 6);
+        assert_eq!(page_index_pages_matched, 3);
     }
 
     #[tokio::test]
@@ -1131,14 +1201,14 @@ mod tests {
             .await
             .unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read),@r###"
-            +-----+----+
-            | c1  | c2 |
-            +-----+----+
-            | Foo | 1  |
-            | bar |    |
-            +-----+----+
-        "###);
+        insta::assert_snapshot!(batches_to_sort_string(&read),@r"
+        +-----+----+
+        | c1  | c2 |
+        +-----+----+
+        | Foo | 1  |
+        | bar |    |
+        +-----+----+
+        ");
     }
 
     #[tokio::test]
@@ -1161,15 +1231,15 @@ mod tests {
             .await
             .unwrap();
 
-        insta::assert_snapshot!(batches_to_sort_string(&read),@r###"
-            +-----+----+
-            | c1  | c2 |
-            +-----+----+
-            |     | 2  |
-            | Foo | 1  |
-            | bar |    |
-            +-----+----+
-        "###);
+        insta::assert_snapshot!(batches_to_sort_string(&read),@r"
+        +-----+----+
+        | c1  | c2 |
+        +-----+----+
+        |     | 2  |
+        | Foo | 1  |
+        | bar |    |
+        +-----+----+
+        ");
     }
 
     #[tokio::test]
@@ -1194,7 +1264,7 @@ mod tests {
             ("c3", c3.clone()),
         ]);
 
-        // batch2: c3(int8), c2(int64), c1(string), c4(string)
+        // batch2: c3(date64), c2(int64), c1(string)
         let batch2 = create_batch(vec![("c3", c4), ("c2", c2), ("c1", c1)]);
 
         let table_schema = Schema::new(vec![
@@ -1208,8 +1278,10 @@ mod tests {
             .with_table_schema(Arc::new(table_schema))
             .round_trip_to_batches(vec![batch1, batch2])
             .await;
-        assert_contains!(read.unwrap_err().to_string(),
-            "Cannot cast file schema field c3 of type Date64 to table schema field of type Int8");
+        assert_contains!(
+            read.unwrap_err().to_string(),
+            "Cannot cast column 'c3' from 'Date64' (physical data type) to 'Int8' (logical data type)"
+        );
     }
 
     #[tokio::test]
@@ -1259,7 +1331,7 @@ mod tests {
     async fn parquet_exec_with_int96_from_spark() -> Result<()> {
         // arrow-rs relies on the chrono library to convert between timestamps and strings, so
         // instead compare as Int64. The underlying type should be a PrimitiveArray of Int64
-        // anyway, so this should be a zero-copy non-modifying cast at the SchemaAdapter.
+        // anyway, so this should be a zero-copy non-modifying cast.
 
         let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
         let testdata = datafusion_common::test_util::parquet_test_data();
@@ -1462,14 +1534,7 @@ mod tests {
     #[tokio::test]
     async fn parquet_exec_with_range() -> Result<()> {
         fn file_range(meta: &ObjectMeta, start: i64, end: i64) -> PartitionedFile {
-            PartitionedFile {
-                object_meta: meta.clone(),
-                partition_values: vec![],
-                range: Some(FileRange { start, end }),
-                statistics: None,
-                extensions: None,
-                metadata_size_hint: None,
-            }
+            PartitionedFile::new_from_meta(meta.clone()).with_range(start, end)
         }
 
         async fn assert_parquet_read(
@@ -1480,8 +1545,7 @@ mod tests {
         ) -> Result<()> {
             let config = FileScanConfigBuilder::new(
                 ObjectStoreUrl::local_filesystem(),
-                file_schema,
-                Arc::new(ParquetSource::default()),
+                Arc::new(ParquetSource::new(file_schema)),
             )
             .with_file_groups(file_groups)
             .build();
@@ -1552,21 +1616,15 @@ mod tests {
             .await
             .unwrap();
 
-        let partitioned_file = PartitionedFile {
-            object_meta: meta,
-            partition_values: vec![
+        let partitioned_file = PartitionedFile::new_from_meta(meta)
+            .with_partition_values(vec![
                 ScalarValue::from("2021"),
                 ScalarValue::UInt8(Some(10)),
                 ScalarValue::Dictionary(
                     Box::new(DataType::UInt16),
                     Box::new(ScalarValue::from("26")),
                 ),
-            ],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        };
+            ]);
 
         let expected_schema = Schema::new(vec![
             Field::new("id", DataType::Int32, true),
@@ -1583,23 +1641,27 @@ mod tests {
             ),
         ]);
 
-        let source = Arc::new(ParquetSource::default());
-        let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source)
-            .with_file(partitioned_file)
-            // file has 10 cols so index 12 should be month and 13 should be day
-            .with_projection(Some(vec![0, 1, 2, 12, 13]))
-            .with_table_partition_cols(vec![
-                Field::new("year", DataType::Utf8, false),
-                Field::new("month", DataType::UInt8, false),
-                Field::new(
+        let table_schema = TableSchema::new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Field::new("year", DataType::Utf8, false)),
+                Arc::new(Field::new("month", DataType::UInt8, false)),
+                Arc::new(Field::new(
                     "day",
                     DataType::Dictionary(
                         Box::new(DataType::UInt16),
                         Box::new(DataType::Utf8),
                     ),
                     false,
-                ),
-            ])
+                )),
+            ],
+        );
+        let source = Arc::new(ParquetSource::new(table_schema.clone()));
+        let config = FileScanConfigBuilder::new(object_store_url, source)
+            .with_file(partitioned_file)
+            // file has 10 cols so index 12 should be month and 13 should be day
+            .with_projection_indices(Some(vec![0, 1, 2, 12, 13]))
+            .unwrap()
             .build();
 
         let parquet_exec = DataSourceExec::from_data_source(config);
@@ -1614,20 +1676,20 @@ mod tests {
         let batch = results.next().await.unwrap()?;
         assert_eq!(batch.schema().as_ref(), &expected_schema);
 
-        assert_snapshot!(batches_to_string(&[batch]),@r###"
-            +----+----------+-------------+-------+-----+
-            | id | bool_col | tinyint_col | month | day |
-            +----+----------+-------------+-------+-----+
-            | 4  | true     | 0           | 10    | 26  |
-            | 5  | false    | 1           | 10    | 26  |
-            | 6  | true     | 0           | 10    | 26  |
-            | 7  | false    | 1           | 10    | 26  |
-            | 2  | true     | 0           | 10    | 26  |
-            | 3  | false    | 1           | 10    | 26  |
-            | 0  | true     | 0           | 10    | 26  |
-            | 1  | false    | 1           | 10    | 26  |
-            +----+----------+-------------+-------+-----+
-        "###);
+        assert_snapshot!(batches_to_string(&[batch]),@r"
+        +----+----------+-------------+-------+-----+
+        | id | bool_col | tinyint_col | month | day |
+        +----+----------+-------------+-------+-----+
+        | 4  | true     | 0           | 10    | 26  |
+        | 5  | false    | 1           | 10    | 26  |
+        | 6  | true     | 0           | 10    | 26  |
+        | 7  | false    | 1           | 10    | 26  |
+        | 2  | true     | 0           | 10    | 26  |
+        | 3  | false    | 1           | 10    | 26  |
+        | 0  | true     | 0           | 10    | 26  |
+        | 1  | false    | 1           | 10    | 26  |
+        +----+----------+-------------+-------+-----+
+        ");
 
         let batch = results.next().await;
         assert!(batch.is_none());
@@ -1643,26 +1705,18 @@ mod tests {
             .unwrap()
             .child("invalid.parquet");
 
-        let partitioned_file = PartitionedFile {
-            object_meta: ObjectMeta {
-                location,
-                last_modified: Utc.timestamp_nanos(0),
-                size: 1337,
-                e_tag: None,
-                version: None,
-            },
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        };
+        let partitioned_file = PartitionedFile::new_from_meta(ObjectMeta {
+            location,
+            last_modified: Utc.timestamp_nanos(0),
+            size: 1337,
+            e_tag: None,
+            version: None,
+        });
 
         let file_schema = Arc::new(Schema::empty());
         let config = FileScanConfigBuilder::new(
             ObjectStoreUrl::local_filesystem(),
-            file_schema,
-            Arc::new(ParquetSource::default()),
+            Arc::new(ParquetSource::new(file_schema)),
         )
         .with_file(partitioned_file)
         .build();
@@ -1687,6 +1741,7 @@ mod tests {
             Some(3),
             Some(4),
             Some(5),
+            Some(6), // last page with only one row
         ]));
         let batch1 = create_batch(vec![("int", c1.clone())]);
 
@@ -1695,25 +1750,53 @@ mod tests {
         let rt = RoundTrip::new()
             .with_predicate(filter)
             .with_page_index_predicate()
-            .round_trip(vec![batch1])
+            .round_trip(vec![batch1.clone()])
             .await;
 
         let metrics = rt.parquet_exec.metrics().unwrap();
 
-        assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()),@r###"
-            +-----+
-            | int |
-            +-----+
-            | 4   |
-            | 5   |
-            +-----+
-        "###);
-        assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 4);
-        assert_eq!(get_value(&metrics, "page_index_rows_matched"), 2);
+        assert_snapshot!(batches_to_sort_string(&rt.batches.unwrap()),@r"
+        +-----+
+        | int |
+        +-----+
+        | 4   |
+        | 5   |
+        +-----+
+        ");
+        let (page_index_rows_pruned, page_index_rows_matched) =
+            get_pruning_metric(&metrics, "page_index_rows_pruned");
+        assert_eq!(page_index_rows_pruned, 5);
+        assert_eq!(page_index_rows_matched, 2);
         assert!(
             get_value(&metrics, "page_index_eval_time") > 0,
             "no eval time in metrics: {metrics:#?}"
         );
+
+        // each page has 2 rows, so the num of pages is 1/2 the number of rows
+        let (page_index_pages_pruned, page_index_pages_matched) =
+            get_pruning_metric(&metrics, "page_index_pages_pruned");
+        assert_eq!(page_index_pages_pruned, 3);
+        assert_eq!(page_index_pages_matched, 1);
+
+        // test with a filter that matches the page with one row
+        let filter = col("int").eq(lit(6_i32));
+        let rt = RoundTrip::new()
+            .with_predicate(filter)
+            .with_page_index_predicate()
+            .round_trip(vec![batch1])
+            .await;
+
+        let metrics = rt.parquet_exec.metrics().unwrap();
+
+        let (page_index_rows_pruned, page_index_rows_matched) =
+            get_pruning_metric(&metrics, "page_index_rows_pruned");
+        assert_eq!(page_index_rows_pruned, 6);
+        assert_eq!(page_index_rows_matched, 1);
+
+        let (page_index_pages_pruned, page_index_pages_matched) =
+            get_pruning_metric(&metrics, "page_index_pages_pruned");
+        assert_eq!(page_index_pages_pruned, 3);
+        assert_eq!(page_index_pages_matched, 1);
     }
 
     /// Returns a string array with contents:
@@ -1751,14 +1834,14 @@ mod tests {
         let metrics = rt.parquet_exec.metrics().unwrap();
 
         // assert the batches and some metrics
-        assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r###"
-            +-----+
-            | c1  |
-            +-----+
-            | Foo |
-            | zzz |
-            +-----+
-        "###);
+        assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r"
+        +-----+
+        | c1  |
+        +-----+
+        | Foo |
+        | zzz |
+        +-----+
+        ");
 
         // pushdown predicates have eliminated all 4 bar rows and the
         // null row for 5 rows total
@@ -1798,13 +1881,109 @@ mod tests {
         assert_contains!(&explain, "predicate=c1@0 != bar");
 
         // there's a single row group, but we can check that it matched
-        // if no pruning was done this would be 0 instead of 1
-        assert_contains!(&explain, "row_groups_matched_statistics=1");
+        assert_contains!(
+            &explain,
+            "row_groups_pruned_statistics=1 total \u{2192} 1 matched"
+        );
 
         // check the projection
         assert_contains!(&explain, "projection=[c1]");
     }
 
+    #[tokio::test]
+    async fn parquet_exec_metrics_with_multiple_predicates() {
+        // Test that metrics are correctly calculated when multiple predicates
+        // are pushed down (connected with AND). This ensures we don't double-count
+        // rows when multiple predicates filter the data sequentially.
+
+        // Create a batch with two columns: c1 (string) and c2 (int32)
+        // Total: 10 rows
+        let c1: ArrayRef = Arc::new(StringArray::from(vec![
+            Some("foo"), // 0 - passes c1 filter, fails c2 filter (5 <= 10)
+            Some("bar"), // 1 - fails c1 filter
+            Some("bar"), // 2 - fails c1 filter
+            Some("baz"), // 3 - passes both filters (20 > 10)
+            Some("foo"), // 4 - passes both filters (12 > 10)
+            Some("bar"), // 5 - fails c1 filter
+            Some("baz"), // 6 - passes both filters (25 > 10)
+            Some("foo"), // 7 - passes c1 filter, fails c2 filter (7 <= 10)
+            Some("bar"), // 8 - fails c1 filter
+            Some("qux"), // 9 - passes both filters (30 > 10)
+        ]));
+
+        let c2: ArrayRef = Arc::new(Int32Array::from(vec![
+            Some(5),
+            Some(15),
+            Some(8),
+            Some(20),
+            Some(12),
+            Some(9),
+            Some(25),
+            Some(7),
+            Some(18),
+            Some(30),
+        ]));
+
+        let batch = create_batch(vec![("c1", c1), ("c2", c2)]);
+
+        // Create filter: c1 != 'bar' AND c2 > 10
+        //
+        // First predicate (c1 != 'bar'):
+        //   - Rows passing: 0, 3, 4, 6, 7, 9 (6 rows)
+        //   - Rows pruned: 1, 2, 5, 8 (4 rows)
+        //
+        // Second predicate (c2 > 10) on remaining 6 rows:
+        //   - Rows passing: 3, 4, 6, 9 (4 rows with c2 = 20, 12, 25, 30)
+        //   - Rows pruned: 0, 7 (2 rows with c2 = 5, 7)
+        //
+        // Expected final metrics:
+        //   - pushdown_rows_matched: 4 (final result)
+        //   - pushdown_rows_pruned: 4 + 2 = 6 (cumulative)
+        //   - Total: 4 + 6 = 10
+
+        let filter = col("c1").not_eq(lit("bar")).and(col("c2").gt(lit(10)));
+
+        let rt = RoundTrip::new()
+            .with_predicate(filter)
+            .with_pushdown_predicate()
+            .round_trip(vec![batch])
+            .await;
+
+        let metrics = rt.parquet_exec.metrics().unwrap();
+
+        // Verify the result rows
+        assert_snapshot!(batches_to_string(&rt.batches.unwrap()),@r"
+        +-----+----+
+        | c1  | c2 |
+        +-----+----+
+        | baz | 20 |
+        | foo | 12 |
+        | baz | 25 |
+        | qux | 30 |
+        +-----+----+
+        ");
+
+        // Verify metrics - this is the key test
+        let pushdown_rows_matched = get_value(&metrics, "pushdown_rows_matched");
+        let pushdown_rows_pruned = get_value(&metrics, "pushdown_rows_pruned");
+
+        assert_eq!(
+            pushdown_rows_matched, 4,
+            "Expected 4 rows to pass both predicates"
+        );
+        assert_eq!(
+            pushdown_rows_pruned, 6,
+            "Expected 6 rows to be pruned (4 by first predicate + 2 by second predicate)"
+        );
+
+        // The sum should equal the total number of rows
+        assert_eq!(
+            pushdown_rows_matched + pushdown_rows_pruned,
+            10,
+            "matched + pruned should equal total rows"
+        );
+    }
+
     #[tokio::test]
     async fn parquet_exec_has_no_pruning_predicate_if_can_not_prune() {
         // batch1: c1(string)
@@ -1830,8 +2009,10 @@ mod tests {
 
         // When both matched and pruned are 0, it means that the pruning predicate
         // was not used at all.
-        assert_contains!(&explain, "row_groups_matched_statistics=0");
-        assert_contains!(&explain, "row_groups_pruned_statistics=0");
+        assert_contains!(
+            &explain,
+            "row_groups_pruned_statistics=1 total \u{2192} 1 matched"
+        );
 
         // But pushdown predicate should be present
         assert_contains!(
@@ -1884,7 +2065,12 @@ mod tests {
     /// Panics if no such metric.
     fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
         match metrics.sum_by_name(metric_name) {
-            Some(v) => v.as_usize(),
+            Some(v) => match v {
+                MetricValue::PruningMetrics {
+                    pruning_metrics, ..
+                } => pruning_metrics.pruned(),
+                _ => v.as_usize(),
+            },
             _ => {
                 panic!(
                     "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
@@ -1893,6 +2079,20 @@ mod tests {
         }
     }
 
+    fn get_pruning_metric(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) {
+        match metrics.sum_by_name(metric_name) {
+            Some(MetricValue::PruningMetrics {
+                pruning_metrics, ..
+            }) => (pruning_metrics.pruned(), pruning_metrics.matched()),
+            Some(_) => panic!(
+                "Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}"
+            ),
+            None => panic!(
+                "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+            ),
+        }
+    }
+
     fn populate_csv_partitions(
         tmp_dir: &TempDir,
         partition_count: usize,
@@ -1952,14 +2152,14 @@ mod tests {
         let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out";
         fs::create_dir(&out_dir).unwrap();
         let df = ctx.sql("SELECT c1, c2 FROM test").await?;
-        let schema: Schema = df.schema().into();
+        let schema = Arc::clone(df.schema().inner());
         // Register a listing table - this will use all files in the directory as data sources
         // for the query
         ctx.register_listing_table(
             "my_table",
             &out_dir,
             listing_options,
-            Some(Arc::new(schema)),
+            Some(schema),
             None,
         )
         .await
@@ -2024,13 +2224,13 @@ mod tests {
         let sql = "select * from base_table where name='test02'";
         let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap();
         assert_eq!(batch.len(), 1);
-        insta::assert_snapshot!(batches_to_string(&batch),@r###"
-            +---------------------+----+--------+
-            | struct              | id | name   |
-            +---------------------+----+--------+
-            | {id: 4, name: aaa2} | 2  | test02 |
-            +---------------------+----+--------+
-        "###);
+        insta::assert_snapshot!(batches_to_string(&batch),@r"
+        +---------------------+----+--------+
+        | struct              | id | name   |
+        +---------------------+----+--------+
+        | {id: 4, name: aaa2} | 2  | test02 |
+        +---------------------+----+--------+
+        ");
         Ok(())
     }
 
@@ -2053,13 +2253,55 @@ mod tests {
         let sql = "select * from base_table where name='test02'";
         let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap();
         assert_eq!(batch.len(), 1);
-        insta::assert_snapshot!(batches_to_string(&batch),@r###"
-            +---------------------+----+--------+
-            | struct              | id | name   |
-            +---------------------+----+--------+
-            | {id: 4, name: aaa2} | 2  | test02 |
-            +---------------------+----+--------+
-        "###);
+        insta::assert_snapshot!(batches_to_string(&batch),@r"
+        +---------------------+----+--------+
+        | struct              | id | name   |
+        +---------------------+----+--------+
+        | {id: 4, name: aaa2} | 2  | test02 |
+        +---------------------+----+--------+
+        ");
+        Ok(())
+    }
+
+    /// Tests that constant dictionary columns (where min == max in statistics)
+    /// are correctly handled. This reproduced a bug where the constant value
+    /// from statistics had type Utf8 but the schema expected Dictionary.
+    #[tokio::test]
+    async fn test_constant_dictionary_column_parquet() -> Result<()> {
+        let tmp_dir = TempDir::new()?;
+        let path = tmp_dir.path().to_str().unwrap().to_string() + "/test.parquet";
+
+        // Write parquet with dictionary column where all values are the same
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "status",
+            DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
+            false,
+        )]));
+        let status: DictionaryArray<UInt16Type> =
+            vec!["active", "active"].into_iter().collect();
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(status)])?;
+        let file = File::create(&path)?;
+        let props = WriterProperties::builder()
+            .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page)
+            .build();
+        let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
+        writer.write(&batch)?;
+        writer.close()?;
+
+        // Query the constant dictionary column
+        let ctx = SessionContext::new();
+        ctx.register_parquet("t", &path, ParquetReadOptions::default())
+            .await?;
+        let result = ctx.sql("SELECT status FROM t").await?.collect().await?;
+
+        insta::assert_snapshot!(batches_to_string(&result),@r"
+        +--------+
+        | status |
+        +--------+
+        | active |
+        | active |
+        +--------+
+        ");
         Ok(())
     }
 
@@ -2141,7 +2383,7 @@ mod tests {
         fn create_reader(
             &self,
             partition_index: usize,
-            file_meta: FileMeta,
+            partitioned_file: PartitionedFile,
             metadata_size_hint: Option<usize>,
             metrics: &ExecutionPlanMetricsSet,
         ) -> Result<Box<dyn parquet::arrow::async_reader::AsyncFileReader + Send>>
@@ -2152,7 +2394,7 @@ mod tests {
                 .push(metadata_size_hint);
             self.inner.create_reader(
                 partition_index,
-                file_meta,
+                partitioned_file,
                 metadata_size_hint,
                 metrics,
             )
@@ -2184,42 +2426,28 @@ mod tests {
         let size_hint_calls = reader_factory.metadata_size_hint_calls.clone();
 
         let source = Arc::new(
-            ParquetSource::default()
+            ParquetSource::new(Arc::clone(&schema))
                 .with_parquet_file_reader_factory(reader_factory)
                 .with_metadata_size_hint(456),
         );
-        let config = FileScanConfigBuilder::new(store_url, schema, source)
+        let config = FileScanConfigBuilder::new(store_url, source)
             .with_file(
-                PartitionedFile {
-                    object_meta: ObjectMeta {
-                        location: Path::from(name_1),
-                        last_modified: Utc::now(),
-                        size: total_size_1,
-                        e_tag: None,
-                        version: None,
-                    },
-                    partition_values: vec![],
-                    range: None,
-                    statistics: None,
-                    extensions: None,
-                    metadata_size_hint: None,
-                }
-                .with_metadata_size_hint(123),
-            )
-            .with_file(PartitionedFile {
-                object_meta: ObjectMeta {
-                    location: Path::from(name_2),
+                PartitionedFile::new_from_meta(ObjectMeta {
+                    location: Path::from(name_1),
                     last_modified: Utc::now(),
-                    size: total_size_2,
+                    size: total_size_1,
                     e_tag: None,
                     version: None,
-                },
-                partition_values: vec![],
-                range: None,
-                statistics: None,
-                extensions: None,
-                metadata_size_hint: None,
-            })
+                })
+                .with_metadata_size_hint(123),
+            )
+            .with_file(PartitionedFile::new_from_meta(ObjectMeta {
+                location: Path::from(name_2),
+                last_modified: Utc::now(),
+                size: total_size_2,
+                e_tag: None,
+                version: None,
+            }))
             .build();
 
         let exec = DataSourceExec::from_data_source(config);
diff --git a/datafusion/core/src/datasource/view_test.rs b/datafusion/core/src/datasource/view_test.rs
index 85ad9ff664ade..35418d6dea632 100644
--- a/datafusion/core/src/datasource/view_test.rs
+++ b/datafusion/core/src/datasource/view_test.rs
@@ -46,13 +46,13 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---+
         | b |
         +---+
         | 2 |
         +---+
-        "###);
+        ");
 
         Ok(())
     }
@@ -96,14 +96,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+---------+---------+
         | column1 | column2 | column3 |
         +---------+---------+---------+
         | 1       | 2       | 3       |
         | 4       | 5       | 6       |
         +---------+---------+---------+
-        "###);
+        ");
 
         let view_sql =
             "CREATE VIEW replace_xyz AS SELECT * REPLACE (column1*2 as column1) FROM xyz";
@@ -115,14 +115,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+---------+---------+
         | column1 | column2 | column3 |
         +---------+---------+---------+
         | 2       | 2       | 3       |
         | 8       | 5       | 6       |
         +---------+---------+---------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -146,14 +146,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------------+
         | column1_alias |
         +---------------+
         | 1             |
         | 4             |
         +---------------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -177,14 +177,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------------+---------------+
         | column2_alias | column1_alias |
         +---------------+---------------+
         | 2             | 1             |
         | 5             | 4             |
         +---------------+---------------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -213,14 +213,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+
         | column1 |
         +---------+
         | 1       |
         | 4       |
         +---------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -249,13 +249,13 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+
         | column1 |
         +---------+
         | 4       |
         +---------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -287,14 +287,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+---------+---------+
         | column2 | column1 | column3 |
         +---------+---------+---------+
         | 2       | 1       | 3       |
         | 5       | 4       | 6       |
         +---------+---------+---------+
-        "###);
+        ");
 
         Ok(())
     }
@@ -358,7 +358,10 @@ mod tests {
             .to_string();
         assert!(formatted.contains("DataSourceExec: "));
         assert!(formatted.contains("file_type=parquet"));
-        assert!(formatted.contains("projection=[bool_col, int_col], limit=10"));
+        assert!(
+            formatted.contains("projection=[bool_col, int_col], limit=10"),
+            "{formatted}"
+        );
         Ok(())
     }
 
@@ -442,14 +445,14 @@ mod tests {
             .collect()
             .await?;
 
-        insta::assert_snapshot!(batches_to_string(&results),@r###"
+        insta::assert_snapshot!(batches_to_string(&results),@r"
         +---------+
         | column1 |
         +---------+
         | 1       |
         | 4       |
         +---------+
-        "###);
+        ");
 
         Ok(())
     }
diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs
index 15d6d21f038a0..e6f95886e91d1 100644
--- a/datafusion/core/src/execution/context/csv.rs
+++ b/datafusion/core/src/execution/context/csv.rs
@@ -37,9 +37,16 @@ impl SessionContext {
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
     /// // You can read a single file using `read_csv`
-    /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+    ///     .await?;
     /// // you can also read multiple files:
-    /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?;
+    /// let df = ctx
+    ///     .read_csv(
+    ///         vec!["tests/data/example.csv", "tests/data/example.csv"],
+    ///         CsvReadOptions::new(),
+    ///     )
+    ///     .await?;
     /// # Ok(())
     /// # }
     /// ```
diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs
index e9d799400863d..f7df2ad7a1cd6 100644
--- a/datafusion/core/src/execution/context/json.rs
+++ b/datafusion/core/src/execution/context/json.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use super::super::options::ReadOptions;
+use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext};
+use crate::execution::options::JsonReadOptions;
 use datafusion_common::TableReference;
 use datafusion_datasource_json::source::plan_to_json;
 use std::sync::Arc;
 
-use super::super::options::{NdJsonReadOptions, ReadOptions};
-use super::{DataFilePaths, DataFrame, ExecutionPlan, Result, SessionContext};
-
 impl SessionContext {
     /// Creates a [`DataFrame`] for reading an JSON data source.
     ///
@@ -32,7 +32,7 @@ impl SessionContext {
     pub async fn read_json<P: DataFilePaths>(
         &self,
         table_paths: P,
-        options: NdJsonReadOptions<'_>,
+        options: JsonReadOptions<'_>,
     ) -> Result<DataFrame> {
         self._read_type(table_paths, options).await
     }
@@ -43,7 +43,7 @@ impl SessionContext {
         &self,
         table_ref: impl Into<TableReference>,
         table_path: impl AsRef<str>,
-        options: NdJsonReadOptions<'_>,
+        options: JsonReadOptions<'_>,
     ) -> Result<()> {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index 5ef666b61e547..5dbae61fc534d 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -20,6 +20,7 @@
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::sync::{Arc, Weak};
+use std::time::Duration;
 
 use super::options::ReadOptions;
 use crate::datasource::dynamic_file::DynamicListTableFactory;
@@ -33,20 +34,20 @@ use crate::{
     datasource::listing::{
         ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
     },
-    datasource::{provider_as_source, MemTable, ViewTable},
-    error::{DataFusionError, Result},
+    datasource::{MemTable, ViewTable, provider_as_source},
+    error::Result,
     execution::{
+        FunctionRegistry,
         options::ArrowReadOptions,
         runtime_env::{RuntimeEnv, RuntimeEnvBuilder},
-        FunctionRegistry,
     },
     logical_expr::AggregateUDF,
     logical_expr::ScalarUDF,
     logical_expr::{
         CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction,
         CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable,
-        DropView, Execute, LogicalPlan, LogicalPlanBuilder, Prepare, SetVariable,
-        TableType, UNNAMED_TABLE,
+        DropView, Execute, LogicalPlan, LogicalPlanBuilder, Prepare, ResetVariable,
+        SetVariable, TableType, UNNAMED_TABLE,
     },
     physical_expr::PhysicalExpr,
     physical_plan::ExecutionPlan,
@@ -58,30 +59,43 @@ pub use crate::execution::session_state::SessionState;
 
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
-use datafusion_catalog::memory::MemorySchemaProvider;
 use datafusion_catalog::MemoryCatalogProvider;
+use datafusion_catalog::memory::MemorySchemaProvider;
 use datafusion_catalog::{
     DynamicFileCatalog, TableFunction, TableFunctionImpl, UrlTableFactory,
 };
-use datafusion_common::config::ConfigOptions;
+use datafusion_common::config::{ConfigField, ConfigOptions};
+use datafusion_common::metadata::ScalarAndMetadata;
 use datafusion_common::{
+    DFSchema, DataFusionError, ParamValues, SchemaReference, TableReference,
     config::{ConfigExtension, TableOptions},
-    exec_datafusion_err, exec_err, not_impl_err, plan_datafusion_err, plan_err,
+    exec_datafusion_err, exec_err, internal_datafusion_err, not_impl_err,
+    plan_datafusion_err, plan_err,
     tree_node::{TreeNodeRecursion, TreeNodeVisitor},
-    DFSchema, ParamValues, ScalarValue, SchemaReference, TableReference,
+};
+pub use datafusion_execution::TaskContext;
+use datafusion_execution::cache::cache_manager::{
+    DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL,
+    DEFAULT_METADATA_CACHE_LIMIT,
 };
 pub use datafusion_execution::config::SessionConfig;
+use datafusion_execution::disk_manager::{
+    DEFAULT_MAX_TEMP_DIRECTORY_SIZE, DiskManagerBuilder,
+};
 use datafusion_execution::registry::SerializerRegistry;
-pub use datafusion_execution::TaskContext;
 pub use datafusion_expr::execution_props::ExecutionProps;
+#[cfg(feature = "sql")]
+use datafusion_expr::planner::RelationPlanner;
+use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::{
+    Expr, UserDefinedLogicalNode, WindowUDF,
     expr_rewriter::FunctionRewrite,
     logical_plan::{DdlStatement, Statement},
     planner::ExprPlanner,
-    Expr, UserDefinedLogicalNode, WindowUDF,
 };
 use datafusion_optimizer::analyzer::type_coercion::TypeCoercion;
-use datafusion_optimizer::Analyzer;
+use datafusion_optimizer::simplify_expressions::ExprSimplifier;
+use datafusion_optimizer::{Analyzer, OptimizerContext};
 use datafusion_optimizer::{AnalyzerRule, OptimizerRule};
 use datafusion_session::SessionStore;
 
@@ -164,22 +178,23 @@ where
 /// # #[tokio::main]
 /// # async fn main() -> Result<()> {
 /// let ctx = SessionContext::new();
-/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-/// let df = df.filter(col("a").lt_eq(col("b")))?
-///            .aggregate(vec![col("a")], vec![min(col("b"))])?
-///            .limit(0, Some(100))?;
-/// let results = df
-///   .collect()
-///   .await?;
+/// let df = ctx
+///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+///     .await?;
+/// let df = df
+///     .filter(col("a").lt_eq(col("b")))?
+///     .aggregate(vec![col("a")], vec![min(col("b"))])?
+///     .limit(0, Some(100))?;
+/// let results = df.collect().await?;
 /// assert_batches_eq!(
-///  &[
-///    "+---+----------------+",
-///    "| a | min(?table?.b) |",
-///    "+---+----------------+",
-///    "| 1 | 2              |",
-///    "+---+----------------+",
-///  ],
-///  &results
+///     &[
+///         "+---+----------------+",
+///         "| a | min(?table?.b) |",
+///         "+---+----------------+",
+///         "| 1 | 2              |",
+///         "+---+----------------+",
+///     ],
+///     &results
 /// );
 /// # Ok(())
 /// # }
@@ -195,21 +210,22 @@ where
 /// # #[tokio::main]
 /// # async fn main() -> Result<()> {
 /// let ctx = SessionContext::new();
-/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
+/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new())
+///     .await?;
 /// let results = ctx
-///   .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100")
-///   .await?
-///   .collect()
-///   .await?;
+///     .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100")
+///     .await?
+///     .collect()
+///     .await?;
 /// assert_batches_eq!(
-///  &[
-///    "+---+----------------+",
-///    "| a | min(example.b) |",
-///    "+---+----------------+",
-///    "| 1 | 2              |",
-///    "+---+----------------+",
-///  ],
-///  &results
+///     &[
+///         "+---+----------------+",
+///         "| a | min(example.b) |",
+///         "+---+----------------+",
+///         "| 1 | 2              |",
+///         "+---+----------------+",
+///     ],
+///     &results
 /// );
 /// # Ok(())
 /// # }
@@ -226,21 +242,21 @@ where
 /// # use datafusion::execution::SessionStateBuilder;
 /// # use datafusion_execution::runtime_env::RuntimeEnvBuilder;
 /// // Configure a 4k batch size
-/// let config = SessionConfig::new() .with_batch_size(4 * 1024);
+/// let config = SessionConfig::new().with_batch_size(4 * 1024);
 ///
 /// // configure a memory limit of 1GB with 20%  slop
-///  let runtime_env = RuntimeEnvBuilder::new()
+/// let runtime_env = RuntimeEnvBuilder::new()
 ///     .with_memory_limit(1024 * 1024 * 1024, 0.80)
 ///     .build_arc()
 ///     .unwrap();
 ///
 /// // Create a SessionState using the config and runtime_env
 /// let state = SessionStateBuilder::new()
-///   .with_config(config)
-///   .with_runtime_env(runtime_env)
-///   // include support for built in functions and configurations
-///   .with_default_features()
-///   .build();
+///     .with_config(config)
+///     .with_runtime_env(runtime_env)
+///     // include support for built in functions and configurations
+///     .with_default_features()
+///     .build();
 ///
 /// // Create a SessionContext
 /// let ctx = SessionContext::from(state);
@@ -297,13 +313,13 @@ impl SessionContext {
     pub async fn refresh_catalogs(&self) -> Result<()> {
         let cat_names = self.catalog_names().clone();
         for cat_name in cat_names.iter() {
-            let cat = self.catalog(cat_name.as_str()).ok_or_else(|| {
-                DataFusionError::Internal("Catalog not found!".to_string())
-            })?;
+            let cat = self
+                .catalog(cat_name.as_str())
+                .ok_or_else(|| internal_datafusion_err!("Catalog not found!"))?;
             for schema_name in cat.schema_names() {
-                let schema = cat.schema(schema_name.as_str()).ok_or_else(|| {
-                    DataFusionError::Internal("Schema not found!".to_string())
-                })?;
+                let schema = cat
+                    .schema(schema_name.as_str())
+                    .ok_or_else(|| internal_datafusion_err!("Schema not found!"))?;
                 let lister = schema.as_any().downcast_ref::<ListingSchemaProvider>();
                 if let Some(lister) = lister {
                     lister.refresh(&self.state()).await?;
@@ -426,16 +442,14 @@ impl SessionContext {
     /// # use datafusion::prelude::*;
     /// # use datafusion::execution::SessionStateBuilder;
     /// # use datafusion_optimizer::push_down_filter::PushDownFilter;
-    /// let my_rule = PushDownFilter{}; // pretend it is a new rule
-    /// // Create a new builder with a custom optimizer rule
+    /// let my_rule = PushDownFilter {}; // pretend it is a new rule
+    ///                                  // Create a new builder with a custom optimizer rule
     /// let context: SessionContext = SessionStateBuilder::new()
-    ///   .with_optimizer_rule(Arc::new(my_rule))
-    ///   .build()
-    ///   .into();
+    ///     .with_optimizer_rule(Arc::new(my_rule))
+    ///     .build()
+    ///     .into();
     /// // Enable local file access and convert context back to a builder
-    /// let builder = context
-    ///   .enable_url_table()
-    ///   .into_state_builder();
+    /// let builder = context.enable_url_table().into_state_builder();
     /// ```
     pub fn into_state_builder(self) -> SessionStateBuilder {
         let SessionContext {
@@ -474,6 +488,11 @@ impl SessionContext {
         self.state.write().append_optimizer_rule(optimizer_rule);
     }
 
+    /// Removes an optimizer rule by name, returning `true` if it existed.
+    pub fn remove_optimizer_rule(&self, name: &str) -> bool {
+        self.state.write().remove_optimizer_rule(name)
+    }
+
     /// Adds an analyzer rule to the end of the existing rules.
     ///
     /// See [`SessionState`] for more control of when the rule is applied.
@@ -504,19 +523,21 @@ impl SessionContext {
         self.runtime_env().register_object_store(url, object_store)
     }
 
-    /// Registers the [`RecordBatch`] as the specified table name
+    /// Deregisters an [`ObjectStore`] associated with the specific URL prefix.
+    ///
+    /// See [`RuntimeEnv::deregister_object_store`] for more details.
+    pub fn deregister_object_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
+        self.runtime_env().deregister_object_store(url)
+    }
+
+    /// Registers the given [`RecordBatch`] as the specified table reference.
     pub fn register_batch(
         &self,
-        table_name: &str,
+        table_ref: impl Into<TableReference>,
         batch: RecordBatch,
     ) -> Result<Option<Arc<dyn TableProvider>>> {
         let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?;
-        self.register_table(
-            TableReference::Bare {
-                table: table_name.into(),
-            },
-            Arc::new(table),
-        )
+        self.register_table(table_ref, Arc::new(table))
     }
 
     /// Return the [RuntimeEnv] used to run queries with this `SessionContext`
@@ -576,15 +597,15 @@ impl SessionContext {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// ctx
-    ///   .sql("CREATE TABLE foo (x INTEGER)")
-    ///   .await?
-    ///   .collect()
-    ///   .await?;
+    /// ctx.sql("CREATE TABLE foo (x INTEGER)")
+    ///     .await?
+    ///     .collect()
+    ///     .await?;
     /// assert!(ctx.table_exist("foo").unwrap());
     /// # Ok(())
     /// # }
     /// ```
+    #[cfg(feature = "sql")]
     pub async fn sql(&self, sql: &str) -> Result<DataFrame> {
         self.sql_with_options(sql, SQLOptions::new()).await
     }
@@ -604,17 +625,18 @@ impl SessionContext {
     /// # #[tokio::main]
     /// # async fn main() -> Result<()> {
     /// let ctx = SessionContext::new();
-    /// let options = SQLOptions::new()
-    ///   .with_allow_ddl(false);
-    /// let err = ctx.sql_with_options("CREATE TABLE foo (x INTEGER)", options)
-    ///   .await
-    ///   .unwrap_err();
-    /// assert!(
-    ///   err.to_string().starts_with("Error during planning: DDL not supported: CreateMemoryTable")
-    /// );
+    /// let options = SQLOptions::new().with_allow_ddl(false);
+    /// let err = ctx
+    ///     .sql_with_options("CREATE TABLE foo (x INTEGER)", options)
+    ///     .await
+    ///     .unwrap_err();
+    /// assert!(err
+    ///     .to_string()
+    ///     .starts_with("Error during planning: DDL not supported: CreateMemoryTable"));
     /// # Ok(())
     /// # }
     /// ```
+    #[cfg(feature = "sql")]
     pub async fn sql_with_options(
         &self,
         sql: &str,
@@ -642,12 +664,12 @@ impl SessionContext {
     /// // provide type information that `a` is an Int32
     /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
     /// let df_schema = DFSchema::try_from(schema).unwrap();
-    /// let expr = SessionContext::new()
-    ///  .parse_sql_expr(sql, &df_schema)?;
+    /// let expr = SessionContext::new().parse_sql_expr(sql, &df_schema)?;
     /// assert_eq!(expected, expr);
     /// # Ok(())
     /// # }
     /// ```
+    #[cfg(feature = "sql")]
     pub fn parse_sql_expr(&self, sql: &str, df_schema: &DFSchema) -> Result<Expr> {
         self.state.read().create_logical_expr(sql, df_schema)
     }
@@ -668,7 +690,7 @@ impl SessionContext {
                 match ddl {
                     DdlStatement::CreateExternalTable(cmd) => {
                         (Box::pin(async move { self.create_external_table(&cmd).await })
-                            as std::pin::Pin<Box<dyn futures::Future<Output = _> + Send>>)
+                            as std::pin::Pin<Box<dyn Future<Output = _> + Send>>)
                             .await
                     }
                     DdlStatement::CreateMemoryTable(cmd) => {
@@ -699,30 +721,42 @@ impl SessionContext {
             }
             // TODO what about the other statements (like TransactionStart and TransactionEnd)
             LogicalPlan::Statement(Statement::SetVariable(stmt)) => {
-                self.set_variable(stmt).await
+                self.set_variable(stmt).await?;
+                self.return_empty_dataframe()
+            }
+            LogicalPlan::Statement(Statement::ResetVariable(stmt)) => {
+                self.reset_variable(stmt).await?;
+                self.return_empty_dataframe()
             }
             LogicalPlan::Statement(Statement::Prepare(Prepare {
                 name,
                 input,
-                data_types,
+                fields,
             })) => {
                 // The number of parameters must match the specified data types length.
-                if !data_types.is_empty() {
+                if !fields.is_empty() {
                     let param_names = input.get_parameter_names()?;
-                    if param_names.len() != data_types.len() {
+                    if param_names.len() != fields.len() {
                         return plan_err!(
                             "Prepare specifies {} data types but query has {} parameters",
-                            data_types.len(),
+                            fields.len(),
                             param_names.len()
                         );
                     }
                 }
-                // Store the unoptimized plan into the session state. Although storing the
-                // optimized plan or the physical plan would be more efficient, doing so is
-                // not currently feasible. This is because `now()` would be optimized to a
-                // constant value, causing each EXECUTE to yield the same result, which is
-                // incorrect behavior.
-                self.state.write().store_prepared(name, data_types, input)?;
+                // Optimize the plan without evaluating expressions like now()
+                let optimizer_context = OptimizerContext::new_with_config_options(
+                    Arc::clone(self.state().config().options()),
+                )
+                .without_query_execution_start_time();
+                let plan = self.state().optimizer().optimize(
+                    Arc::unwrap_or_clone(input),
+                    &optimizer_context,
+                    |_1, _2| {},
+                )?;
+                self.state
+                    .write()
+                    .store_prepared(name, fields, Arc::new(plan))?;
                 self.return_empty_dataframe()
             }
             LogicalPlan::Statement(Statement::Execute(execute)) => {
@@ -764,7 +798,7 @@ impl SessionContext {
     /// * [`SessionState::create_physical_expr`] for a lower level API
     ///
     /// [simplified]: datafusion_optimizer::simplify_expressions
-    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
+    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
     pub fn create_physical_expr(
         &self,
         expr: Expr,
@@ -789,19 +823,44 @@ impl SessionContext {
             return not_impl_err!("Temporary tables not supported");
         }
 
-        if exist {
-            match cmd.if_not_exists {
-                true => return self.return_empty_dataframe(),
-                false => {
-                    return exec_err!("Table '{}' already exists", cmd.name);
+        match (cmd.if_not_exists, cmd.or_replace, exist) {
+            (true, false, true) => self.return_empty_dataframe(),
+            (false, true, true) => {
+                let result = self
+                    .find_and_deregister(cmd.name.clone(), TableType::Base)
+                    .await;
+
+                match result {
+                    Ok(true) => {
+                        let table_provider: Arc<dyn TableProvider> =
+                            self.create_custom_table(cmd).await?;
+                        self.register_table(cmd.name.clone(), table_provider)?;
+                        self.return_empty_dataframe()
+                    }
+                    Ok(false) => {
+                        let table_provider: Arc<dyn TableProvider> =
+                            self.create_custom_table(cmd).await?;
+                        self.register_table(cmd.name.clone(), table_provider)?;
+                        self.return_empty_dataframe()
+                    }
+                    Err(e) => {
+                        exec_err!("Errored while deregistering external table: {}", e)
+                    }
                 }
             }
+            (true, true, true) => {
+                exec_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'")
+            }
+            (_, _, false) => {
+                let table_provider: Arc<dyn TableProvider> =
+                    self.create_custom_table(cmd).await?;
+                self.register_table(cmd.name.clone(), table_provider)?;
+                self.return_empty_dataframe()
+            }
+            (false, false, true) => {
+                exec_err!("External table '{}' already exists", cmd.name)
+            }
         }
-
-        let table_provider: Arc<dyn TableProvider> =
-            self.create_custom_table(cmd).await?;
-        self.register_table(cmd.name.clone(), table_provider)?;
-        self.return_empty_dataframe()
     }
 
     async fn create_memory_table(&self, cmd: CreateMemoryTable) -> Result<DataFrame> {
@@ -827,7 +886,7 @@ impl SessionContext {
             (true, false, Ok(_)) => self.return_empty_dataframe(),
             (false, true, Ok(_)) => {
                 self.deregister_table(name.clone())?;
-                let schema = Arc::new(input.schema().as_ref().into());
+                let schema = Arc::clone(input.schema().inner());
                 let physical = DataFrame::new(self.state(), input);
 
                 let batches: Vec<_> = physical.collect_partitioned().await?;
@@ -845,8 +904,7 @@ impl SessionContext {
                 exec_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'")
             }
             (_, _, Err(_)) => {
-                let df_schema = input.schema();
-                let schema = Arc::new(df_schema.as_ref().into());
+                let schema = Arc::clone(input.schema().inner());
                 let physical = DataFrame::new(self.state(), input);
 
                 let batches: Vec<_> = physical.collect_partitioned().await?;
@@ -914,7 +972,7 @@ impl SessionContext {
             ..
         } = cmd;
 
-        // sqlparser doesnt accept database / catalog as parameter to CREATE SCHEMA
+        // sqlparser doesn't accept database / catalog as parameter to CREATE SCHEMA
         // so for now, we default to default catalog
         let tokens: Vec<&str> = schema_name.split('.').collect();
         let (catalog, schema_name) = match tokens.len() {
@@ -922,17 +980,15 @@ impl SessionContext {
                 let state = self.state.read();
                 let name = &state.config().options().catalog.default_catalog;
                 let catalog = state.catalog_list().catalog(name).ok_or_else(|| {
-                    DataFusionError::Execution(format!(
-                        "Missing default catalog '{name}'"
-                    ))
+                    exec_datafusion_err!("Missing default catalog '{name}'")
                 })?;
                 (catalog, tokens[0])
             }
             2 => {
                 let name = &tokens[0];
-                let catalog = self.catalog(name).ok_or_else(|| {
-                    DataFusionError::Execution(format!("Missing catalog '{name}'"))
-                })?;
+                let catalog = self
+                    .catalog(name)
+                    .ok_or_else(|| exec_datafusion_err!("Missing catalog '{name}'"))?;
                 (catalog, tokens[1])
             }
             _ => return exec_err!("Unable to parse catalog from {schema_name}"),
@@ -1020,22 +1076,22 @@ impl SessionContext {
             } else if allow_missing {
                 return self.return_empty_dataframe();
             } else {
-                return self.schema_doesnt_exist_err(name);
+                return self.schema_doesnt_exist_err(&name);
             }
         };
         let dereg = catalog.deregister_schema(name.schema_name(), cascade)?;
         match (dereg, allow_missing) {
             (None, true) => self.return_empty_dataframe(),
-            (None, false) => self.schema_doesnt_exist_err(name),
+            (None, false) => self.schema_doesnt_exist_err(&name),
             (Some(_), _) => self.return_empty_dataframe(),
         }
     }
 
-    fn schema_doesnt_exist_err(&self, schemaref: SchemaReference) -> Result<DataFrame> {
+    fn schema_doesnt_exist_err(&self, schemaref: &SchemaReference) -> Result<DataFrame> {
         exec_err!("Schema '{schemaref}' doesn't exist.")
     }
 
-    async fn set_variable(&self, stmt: SetVariable) -> Result<DataFrame> {
+    async fn set_variable(&self, stmt: SetVariable) -> Result<()> {
         let SetVariable {
             variable, value, ..
         } = stmt;
@@ -1046,33 +1102,132 @@ impl SessionContext {
         } else {
             let mut state = self.state.write();
             state.config_mut().options_mut().set(&variable, &value)?;
-            drop(state);
+
+            // Re-initialize any UDFs that depend on configuration
+            // This allows both built-in and custom functions to respond to configuration changes
+            let config_options = state.config().options();
+
+            // Collect updated UDFs in a separate vector
+            let udfs_to_update: Vec<_> = state
+                .scalar_functions()
+                .values()
+                .filter_map(|udf| {
+                    udf.inner()
+                        .with_updated_config(config_options)
+                        .map(Arc::new)
+                })
+                .collect();
+
+            for udf in udfs_to_update {
+                state.register_udf(udf)?;
+            }
         }
 
-        self.return_empty_dataframe()
+        Ok(())
+    }
+
+    async fn reset_variable(&self, stmt: ResetVariable) -> Result<()> {
+        let variable = stmt.variable;
+        if variable.starts_with("datafusion.runtime.") {
+            return self.reset_runtime_variable(&variable);
+        }
+
+        let mut state = self.state.write();
+        state.config_mut().options_mut().reset(&variable)?;
+
+        // Refresh UDFs to ensure configuration-dependent behavior updates
+        let config_options = state.config().options();
+        let udfs_to_update: Vec<_> = state
+            .scalar_functions()
+            .values()
+            .filter_map(|udf| {
+                udf.inner()
+                    .with_updated_config(config_options)
+                    .map(Arc::new)
+            })
+            .collect();
+
+        for udf in udfs_to_update {
+            state.register_udf(udf)?;
+        }
+
+        Ok(())
     }
 
     fn set_runtime_variable(&self, variable: &str, value: &str) -> Result<()> {
         let key = variable.strip_prefix("datafusion.runtime.").unwrap();
 
+        let mut state = self.state.write();
+
+        let mut builder = RuntimeEnvBuilder::from_runtime_env(state.runtime_env());
+        builder = match key {
+            "memory_limit" => {
+                let memory_limit = Self::parse_capacity_limit(variable, value)?;
+                builder.with_memory_limit(memory_limit, 1.0)
+            }
+            "max_temp_directory_size" => {
+                let directory_size = Self::parse_capacity_limit(variable, value)?;
+                builder.with_max_temp_directory_size(directory_size as u64)
+            }
+            "temp_directory" => builder.with_temp_file_path(value),
+            "metadata_cache_limit" => {
+                let limit = Self::parse_capacity_limit(variable, value)?;
+                builder.with_metadata_cache_limit(limit)
+            }
+            "list_files_cache_limit" => {
+                let limit = Self::parse_capacity_limit(variable, value)?;
+                builder.with_object_list_cache_limit(limit)
+            }
+            "list_files_cache_ttl" => {
+                let duration = Self::parse_duration(variable, value)?;
+                builder.with_object_list_cache_ttl(Some(duration))
+            }
+            _ => return plan_err!("Unknown runtime configuration: {variable}"),
+            // Remember to update `reset_runtime_variable()` when adding new options
+        };
+
+        *state = SessionStateBuilder::from(state.clone())
+            .with_runtime_env(Arc::new(builder.build()?))
+            .build();
+
+        Ok(())
+    }
+
+    fn reset_runtime_variable(&self, variable: &str) -> Result<()> {
+        let key = variable.strip_prefix("datafusion.runtime.").unwrap();
+
+        let mut state = self.state.write();
+
+        let mut builder = RuntimeEnvBuilder::from_runtime_env(state.runtime_env());
         match key {
             "memory_limit" => {
-                let memory_limit = Self::parse_memory_limit(value)?;
-
-                let mut state = self.state.write();
-                let mut builder =
-                    RuntimeEnvBuilder::from_runtime_env(state.runtime_env());
-                builder = builder.with_memory_limit(memory_limit, 1.0);
-                *state = SessionStateBuilder::from(state.clone())
-                    .with_runtime_env(Arc::new(builder.build()?))
-                    .build();
+                builder.memory_pool = None;
             }
-            _ => {
-                return Err(DataFusionError::Plan(format!(
-                    "Unknown runtime configuration: {variable}"
-                )))
+            "max_temp_directory_size" => {
+                builder =
+                    builder.with_max_temp_directory_size(DEFAULT_MAX_TEMP_DIRECTORY_SIZE);
             }
-        }
+            "temp_directory" => {
+                builder.disk_manager_builder = Some(DiskManagerBuilder::default());
+            }
+            "metadata_cache_limit" => {
+                builder = builder.with_metadata_cache_limit(DEFAULT_METADATA_CACHE_LIMIT);
+            }
+            "list_files_cache_limit" => {
+                builder = builder
+                    .with_object_list_cache_limit(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT);
+            }
+            "list_files_cache_ttl" => {
+                builder =
+                    builder.with_object_list_cache_ttl(DEFAULT_LIST_FILES_CACHE_TTL);
+            }
+            _ => return plan_err!("Unknown runtime configuration: {variable}"),
+        };
+
+        *state = SessionStateBuilder::from(state.clone())
+            .with_runtime_env(Arc::new(builder.build()?))
+            .build();
+
         Ok(())
     }
 
@@ -1083,27 +1238,146 @@ impl SessionContext {
     /// ```
     /// use datafusion::execution::context::SessionContext;
     ///
-    /// assert_eq!(SessionContext::parse_memory_limit("1M").unwrap(), 1024 * 1024);
-    /// assert_eq!(SessionContext::parse_memory_limit("1.5G").unwrap(), (1.5 * 1024.0 * 1024.0 * 1024.0) as usize);
+    /// assert_eq!(
+    ///     SessionContext::parse_memory_limit("1M").unwrap(),
+    ///     1024 * 1024
+    /// );
+    /// assert_eq!(
+    ///     SessionContext::parse_memory_limit("1.5G").unwrap(),
+    ///     (1.5 * 1024.0 * 1024.0 * 1024.0) as usize
+    /// );
     /// ```
+    #[deprecated(
+        since = "53.0.0",
+        note = "please use `parse_capacity_limit` function instead."
+    )]
     pub fn parse_memory_limit(limit: &str) -> Result<usize> {
+        if limit.trim().is_empty() {
+            return Err(plan_datafusion_err!("Empty limit value found!"));
+        }
         let (number, unit) = limit.split_at(limit.len() - 1);
         let number: f64 = number.parse().map_err(|_| {
-            DataFusionError::Plan(format!(
-                "Failed to parse number from memory limit '{limit}'"
-            ))
+            plan_datafusion_err!("Failed to parse number from memory limit '{limit}'")
         })?;
+        if number.is_sign_negative() || number.is_infinite() {
+            return Err(plan_datafusion_err!(
+                "Limit value should be positive finite number"
+            ));
+        }
 
         match unit {
             "K" => Ok((number * 1024.0) as usize),
             "M" => Ok((number * 1024.0 * 1024.0) as usize),
             "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as usize),
-            _ => Err(DataFusionError::Plan(format!(
-                "Unsupported unit '{unit}' in memory limit '{limit}'"
-            ))),
+            _ => plan_err!("Unsupported unit '{unit}' in memory limit '{limit}'"),
         }
     }
 
+    /// Parse capacity limit from string to number of bytes by allowing units: K, M and G.
+    /// Supports formats like '1.5G', '100M', '512K'
+    ///
+    /// # Examples
+    /// ```
+    /// use datafusion::execution::context::SessionContext;
+    ///
+    /// assert_eq!(
+    ///     SessionContext::parse_capacity_limit("datafusion.runtime.memory_limit", "1M").unwrap(),
+    ///     1024 * 1024
+    /// );
+    /// assert_eq!(
+    ///     SessionContext::parse_capacity_limit("datafusion.runtime.memory_limit", "1.5G").unwrap(),
+    ///     (1.5 * 1024.0 * 1024.0 * 1024.0) as usize
+    /// );
+    /// ```
+    pub fn parse_capacity_limit(config_name: &str, limit: &str) -> Result<usize> {
+        if limit.trim().is_empty() {
+            return Err(plan_datafusion_err!(
+                "Empty limit value found for '{config_name}'"
+            ));
+        }
+        let (number, unit) = limit.split_at(limit.len() - 1);
+        let number: f64 = number.parse().map_err(|_| {
+            plan_datafusion_err!(
+                "Failed to parse number from '{config_name}', limit '{limit}'"
+            )
+        })?;
+        if number.is_sign_negative() || number.is_infinite() {
+            return Err(plan_datafusion_err!(
+                "Limit value should be positive finite number for '{config_name}'"
+            ));
+        }
+
+        match unit {
+            "K" => Ok((number * 1024.0) as usize),
+            "M" => Ok((number * 1024.0 * 1024.0) as usize),
+            "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as usize),
+            _ => plan_err!(
+                "Unsupported unit '{unit}' in '{config_name}', limit '{limit}'. \
+            Unit must be one of: 'K', 'M', 'G'"
+            ),
+        }
+    }
+
+    fn parse_duration(config_name: &str, duration: &str) -> Result<Duration> {
+        if duration.trim().is_empty() {
+            return Err(plan_datafusion_err!(
+                "Duration should not be empty or blank for '{config_name}'"
+            ));
+        }
+
+        let mut minutes = None;
+        let mut seconds = None;
+
+        for duration in duration.split_inclusive(&['m', 's']) {
+            let (number, unit) = duration.split_at(duration.len() - 1);
+            let number: u64 = number.parse().map_err(|_| {
+                plan_datafusion_err!("Failed to parse number from duration '{duration}' for '{config_name}'")
+            })?;
+
+            match unit {
+                "m" if minutes.is_none() && seconds.is_none() => minutes = Some(number),
+                "s" if seconds.is_none() => seconds = Some(number),
+                other => plan_err!(
+                    "Invalid duration unit: '{other}'. The unit must be either 'm' (minutes), or 's' (seconds), and be in the correct order for '{config_name}'"
+                )?,
+            }
+        }
+
+        let secs = Self::check_overflow(config_name, minutes, 60, seconds)?;
+        let duration = Duration::from_secs(secs);
+
+        if duration.is_zero() {
+            return plan_err!(
+                "Duration must be greater than 0 seconds for '{config_name}'"
+            );
+        }
+
+        Ok(duration)
+    }
+
+    fn check_overflow(
+        config_name: &str,
+        mins: Option<u64>,
+        multiplier: u64,
+        secs: Option<u64>,
+    ) -> Result<u64> {
+        let first_part_of_secs = mins.unwrap_or_default().checked_mul(multiplier);
+        if first_part_of_secs.is_none() {
+            plan_err!(
+                "Duration has overflowed allowed maximum limit due to 'mins * {multiplier}' when setting '{config_name}'"
+            )?
+        }
+        let second_part_of_secs = first_part_of_secs
+            .unwrap()
+            .checked_add(secs.unwrap_or_default());
+        if second_part_of_secs.is_none() {
+            plan_err!(
+                "Duration has overflowed allowed maximum limit due to 'mins * {multiplier} + secs' when setting '{config_name}'"
+            )?
+        }
+        Ok(second_part_of_secs.unwrap())
+    }
+
     async fn create_custom_table(
         &self,
         cmd: &CreateExternalTable,
@@ -1115,10 +1389,7 @@ impl SessionContext {
                 .table_factories()
                 .get(file_type.as_str())
                 .ok_or_else(|| {
-                    DataFusionError::Execution(format!(
-                        "Unable to find factory for {}",
-                        cmd.file_type
-                    ))
+                    exec_datafusion_err!("Unable to find factory for {}", cmd.file_type)
                 })?;
         let table = (*factory).create(&state, cmd).await?;
         Ok(table)
@@ -1133,20 +1404,24 @@ impl SessionContext {
         let table = table_ref.table().to_owned();
         let maybe_schema = {
             let state = self.state.read();
-            let resolved = state.resolve_table_ref(table_ref);
+            let resolved = state.resolve_table_ref(table_ref.clone());
             state
                 .catalog_list()
                 .catalog(&resolved.catalog)
                 .and_then(|c| c.schema(&resolved.schema))
         };
 
-        if let Some(schema) = maybe_schema {
-            if let Some(table_provider) = schema.table(&table).await? {
-                if table_provider.table_type() == table_type {
-                    schema.deregister_table(&table)?;
-                    return Ok(true);
-                }
+        if let Some(schema) = maybe_schema
+            && let Some(table_provider) = schema.table(&table).await?
+            && table_provider.table_type() == table_type
+        {
+            schema.deregister_table(&table)?;
+            if table_type == TableType::Base
+                && let Some(lfc) = self.runtime_env().cache_manager.get_list_files_cache()
+            {
+                lfc.drop_table_entries(&Some(table_ref))?;
             }
+            return Ok(true);
         }
 
         Ok(false)
@@ -1159,9 +1434,11 @@ impl SessionContext {
 
             match function_factory {
                 Some(f) => f.create(&state, stmt).await?,
-                _ => Err(DataFusionError::Configuration(
-                    "Function factory has not been configured".into(),
-                ))?,
+                _ => {
+                    return Err(DataFusionError::Configuration(
+                        "Function factory has not been configured".to_string(),
+                    ));
+                }
             }
         };
 
@@ -1210,29 +1487,40 @@ impl SessionContext {
             exec_datafusion_err!("Prepared statement '{}' does not exist", name)
         })?;
 
+        let state = self.state.read();
+        let context = SimplifyContext::default()
+            .with_schema(Arc::clone(prepared.plan.schema()))
+            .with_config_options(Arc::clone(state.config_options()))
+            .with_query_execution_start_time(
+                state.execution_props().query_execution_start_time,
+            );
+        let simplifier = ExprSimplifier::new(context);
+
         // Only allow literals as parameters for now.
-        let mut params: Vec<ScalarValue> = parameters
+        let mut params: Vec<ScalarAndMetadata> = parameters
             .into_iter()
-            .map(|e| match e {
-                Expr::Literal(scalar) => Ok(scalar),
-                _ => not_impl_err!("Unsupported parameter type: {}", e),
+            .map(|e| match simplifier.simplify(e)? {
+                Expr::Literal(scalar, metadata) => {
+                    Ok(ScalarAndMetadata::new(scalar, metadata))
+                }
+                e => not_impl_err!("Unsupported parameter type: {e}"),
             })
             .collect::<Result<_>>()?;
 
         // If the prepared statement provides data types, cast the params to those types.
-        if !prepared.data_types.is_empty() {
-            if params.len() != prepared.data_types.len() {
+        if !prepared.fields.is_empty() {
+            if params.len() != prepared.fields.len() {
                 return exec_err!(
                     "Prepared statement '{}' expects {} parameters, but {} provided",
                     name,
-                    prepared.data_types.len(),
+                    prepared.fields.len(),
                     params.len()
                 );
             }
             params = params
                 .into_iter()
-                .zip(prepared.data_types.iter())
-                .map(|(e, dt)| e.cast_to(dt))
+                .zip(prepared.fields.iter())
+                .map(|(e, dt)| -> Result<_> { e.cast_storage_to(dt.data_type()) })
                 .collect::<Result<_>>()?;
         }
 
@@ -1298,6 +1586,18 @@ impl SessionContext {
         self.state.write().register_udwf(Arc::new(f)).ok();
     }
 
+    #[cfg(feature = "sql")]
+    /// Registers a [`RelationPlanner`] to customize SQL table-factor planning.
+    ///
+    /// Planners are invoked in reverse registration order, allowing newer
+    /// planners to take precedence over existing ones.
+    pub fn register_relation_planner(
+        &self,
+        planner: Arc<dyn RelationPlanner>,
+    ) -> Result<()> {
+        self.state.write().register_relation_planner(planner)
+    }
+
     /// Deregisters a UDF within this context.
     pub fn deregister_udf(&self, name: &str) {
         self.state.write().deregister_udf(name).ok();
@@ -1483,15 +1783,14 @@ impl SessionContext {
     /// SQL statements executed against this context.
     pub async fn register_arrow(
         &self,
-        name: &str,
-        table_path: &str,
+        table_ref: impl Into<TableReference>,
+        table_path: impl AsRef<str>,
         options: ArrowReadOptions<'_>,
     ) -> Result<()> {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
-
         self.register_listing_table(
-            name,
+            table_ref,
             table_path,
             listing_options,
             options.schema.map(|s| Arc::new(s.to_owned())),
@@ -1640,7 +1939,7 @@ impl SessionContext {
     /// [`ConfigOptions`]: crate::config::ConfigOptions
     pub fn state(&self) -> SessionState {
         let mut state = self.state.read().clone();
-        state.execution_props_mut().start_execution();
+        state.mark_start_execution();
         state
     }
 
@@ -1717,6 +2016,20 @@ impl FunctionRegistry for SessionContext {
     ) -> Result<()> {
         self.state.write().register_expr_planner(expr_planner)
     }
+
+    fn udafs(&self) -> HashSet<String> {
+        self.state.read().udafs()
+    }
+
+    fn udwfs(&self) -> HashSet<String> {
+        self.state.read().udwfs()
+    }
+}
+
+impl datafusion_execution::TaskContextProvider for SessionContext {
+    fn task_ctx(&self) -> Arc<TaskContext> {
+        SessionContext::task_ctx(self)
+    }
 }
 
 /// Create a new task context instance from SessionContext
@@ -1741,7 +2054,7 @@ impl From<SessionContext> for SessionStateBuilder {
 /// A planner used to add extensions to DataFusion logical and physical plans.
 #[async_trait]
 pub trait QueryPlanner: Debug {
-    /// Given a `LogicalPlan`, create an [`ExecutionPlan`] suitable for execution
+    /// Given a [`LogicalPlan`], create an [`ExecutionPlan`] suitable for execution
     async fn create_physical_plan(
         &self,
         logical_plan: &LogicalPlan,
@@ -1749,12 +2062,46 @@ pub trait QueryPlanner: Debug {
     ) -> Result<Arc<dyn ExecutionPlan>>;
 }
 
-/// A pluggable interface to handle `CREATE FUNCTION` statements
-/// and interact with [SessionState] to registers new udf, udaf or udwf.
+/// Interface for handling `CREATE FUNCTION` statements and interacting with
+/// [SessionState] to create and register functions ([`ScalarUDF`],
+/// [`AggregateUDF`], [`WindowUDF`], and [`TableFunctionImpl`]) dynamically.
+///
+/// Implement this trait to create user-defined functions in a custom way, such
+/// as loading from external libraries or defining them programmatically.
+/// DataFusion will parse `CREATE FUNCTION` statements into [`CreateFunction`]
+/// structs and pass them to the [`create`](Self::create) method.
+///
+/// Note there is no default implementation of this trait provided in DataFusion,
+/// because the implementation and requirements vary widely. Please see
+/// [function_factory example] for a reference implementation.
+///
+/// [function_factory example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/function_factory.rs
+///
+/// # Examples of syntax that can be supported
+///
+/// ```sql
+/// CREATE FUNCTION f1(BIGINT)
+///   RETURNS BIGINT
+///   RETURN $1 + 1;
+/// ```
+/// or
+/// ```sql
+/// CREATE FUNCTION to_miles(DOUBLE)
+/// RETURNS DOUBLE
+/// LANGUAGE PYTHON
+/// AS '
+/// import pyarrow.compute as pc
+///
+/// conversation_rate_multiplier = 0.62137119
+///
+/// def to_miles(km_data):
+///     return pc.multiply(km_data, conversation_rate_multiplier)
+/// '
+/// ```
 
 #[async_trait]
 pub trait FunctionFactory: Debug + Sync + Send {
-    /// Handles creation of user defined function specified in [CreateFunction] statement
+    /// Creates a new dynamic function from the SQL in the [CreateFunction] statement
     async fn create(
         &self,
         state: &SessionState,
@@ -1762,7 +2109,8 @@ pub trait FunctionFactory: Debug + Sync + Send {
     ) -> Result<RegisterFunction>;
 }
 
-/// Type of function to create
+/// The result of processing a [`CreateFunction`] statement with [`FunctionFactory`].
+#[derive(Debug, Clone)]
 pub enum RegisterFunction {
     /// Scalar user defined function
     Scalar(Arc<ScalarUDF>),
@@ -1894,6 +2242,9 @@ mod tests {
     use crate::test;
     use crate::test_util::{plan_and_collect, populate_csv_partitions};
     use arrow::datatypes::{DataType, TimeUnit};
+    use arrow_schema::FieldRef;
+    use datafusion_common::DataFusionError;
+    use datafusion_common::datatype::DataTypeExt;
     use std::error::Error;
     use std::path::PathBuf;
 
@@ -1918,7 +2269,7 @@ mod tests {
         // configure with same memory / disk manager
         let memory_pool = ctx1.runtime_env().memory_pool.clone();
 
-        let mut reservation = MemoryConsumer::new("test").register(&memory_pool);
+        let reservation = MemoryConsumer::new("test").register(&memory_pool);
         reservation.grow(100);
 
         let disk_manager = ctx1.runtime_env().disk_manager.clone();
@@ -2410,7 +2761,7 @@ mod tests {
     struct MyTypePlanner {}
 
     impl TypePlanner for MyTypePlanner {
-        fn plan_type(&self, sql_type: &ast::DataType) -> Result<Option<DataType>> {
+        fn plan_type_field(&self, sql_type: &ast::DataType) -> Result<Option<FieldRef>> {
             match sql_type {
                 ast::DataType::Datetime(precision) => {
                     let precision = match precision {
@@ -2420,10 +2771,213 @@ mod tests {
                         None | Some(9) => TimeUnit::Nanosecond,
                         _ => unreachable!(),
                     };
-                    Ok(Some(DataType::Timestamp(precision, None)))
+                    Ok(Some(
+                        DataType::Timestamp(precision, None).into_nullable_field_ref(),
+                    ))
                 }
                 _ => Ok(None),
             }
         }
     }
+
+    #[tokio::test]
+    async fn remove_optimizer_rule() -> Result<()> {
+        let get_optimizer_rules = |ctx: &SessionContext| {
+            ctx.state()
+                .optimizer()
+                .rules
+                .iter()
+                .map(|r| r.name().to_owned())
+                .collect::<HashSet<_>>()
+        };
+
+        let ctx = SessionContext::new();
+        assert!(get_optimizer_rules(&ctx).contains("simplify_expressions"));
+
+        // default plan
+        let plan = ctx
+            .sql("select 1 + 1")
+            .await?
+            .into_optimized_plan()?
+            .to_string();
+        assert_snapshot!(plan, @r"
+        Projection: Int64(2) AS Int64(1) + Int64(1)
+          EmptyRelation: rows=1
+        ");
+
+        assert!(ctx.remove_optimizer_rule("simplify_expressions"));
+        assert!(!get_optimizer_rules(&ctx).contains("simplify_expressions"));
+
+        // plan without the simplify_expressions rule
+        let plan = ctx
+            .sql("select 1 + 1")
+            .await?
+            .into_optimized_plan()?
+            .to_string();
+        assert_snapshot!(plan, @r"
+        Projection: Int64(1) + Int64(1)
+          EmptyRelation: rows=1
+        ");
+
+        // attempting to remove a non-existing rule returns false
+        assert!(!ctx.remove_optimizer_rule("simplify_expressions"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_duration() {
+        const LIST_FILES_CACHE_TTL: &str = "datafusion.runtime.list_files_cache_ttl";
+
+        // Valid durations
+        for (duration, want) in [
+            ("1s", Duration::from_secs(1)),
+            ("1m", Duration::from_secs(60)),
+            ("1m0s", Duration::from_secs(60)),
+            ("1m1s", Duration::from_secs(61)),
+        ] {
+            let have =
+                SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration).unwrap();
+            assert_eq!(want, have);
+        }
+
+        // Invalid durations
+        for duration in [
+            "0s", "0m", "1s0m", "1s1m", "XYZ", "1h", "XYZm2s", "", " ", "-1m", "1m 1s",
+            "1m1s ", " 1m1s",
+        ] {
+            let have = SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration);
+            assert!(have.is_err());
+            assert!(
+                have.unwrap_err()
+                    .message()
+                    .to_string()
+                    .contains(LIST_FILES_CACHE_TTL)
+            );
+        }
+    }
+
+    #[test]
+    fn test_parse_duration_with_overflow_check() {
+        const LIST_FILES_CACHE_TTL: &str = "datafusion.runtime.list_files_cache_ttl";
+
+        // Valid durations which are close to max allowed limit
+        for (duration, want) in [
+            (
+                "18446744073709551615s",
+                Duration::from_secs(18446744073709551615),
+            ),
+            (
+                "307445734561825860m",
+                Duration::from_secs(307445734561825860 * 60),
+            ),
+            (
+                "307445734561825860m10s",
+                Duration::from_secs(307445734561825860 * 60 + 10),
+            ),
+            (
+                "1m18446744073709551555s",
+                Duration::from_secs(60 + 18446744073709551555),
+            ),
+        ] {
+            let have =
+                SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration).unwrap();
+            assert_eq!(want, have);
+        }
+
+        // Invalid durations which overflow max allowed limit
+        for (duration, error_message_prefix) in [
+            (
+                "18446744073709551616s",
+                "Failed to parse number from duration",
+            ),
+            (
+                "307445734561825861m",
+                "Duration has overflowed allowed maximum limit due to",
+            ),
+            (
+                "307445734561825860m60s",
+                "Duration has overflowed allowed maximum limit due to",
+            ),
+            (
+                "1m18446744073709551556s",
+                "Duration has overflowed allowed maximum limit due to",
+            ),
+        ] {
+            let have = SessionContext::parse_duration(LIST_FILES_CACHE_TTL, duration);
+            assert!(have.is_err());
+            let error_message = have.unwrap_err().message().to_string();
+            assert!(
+                error_message.contains(error_message_prefix)
+                    && error_message.contains(LIST_FILES_CACHE_TTL)
+            );
+        }
+    }
+
+    #[test]
+    fn test_parse_memory_limit() {
+        // Valid memory_limit
+        for (limit, want) in [
+            ("1.5K", (1.5 * 1024.0) as usize),
+            ("2M", (2f64 * 1024.0 * 1024.0) as usize),
+            ("1G", (1f64 * 1024.0 * 1024.0 * 1024.0) as usize),
+        ] {
+            #[expect(deprecated)]
+            let have = SessionContext::parse_memory_limit(limit).unwrap();
+            assert_eq!(want, have);
+        }
+
+        // Invalid memory_limit
+        for limit in [
+            "1B",
+            "1T",
+            "",
+            " ",
+            "XYZG",
+            "-1G",
+            "infG",
+            "-infG",
+            "G",
+            "1024B",
+            "invalid_size",
+        ] {
+            #[expect(deprecated)]
+            let have = SessionContext::parse_memory_limit(limit);
+            assert!(have.is_err());
+        }
+    }
+
+    #[test]
+    fn test_parse_capacity_limit() {
+        const MEMORY_LIMIT: &str = "datafusion.runtime.memory_limit";
+
+        // Valid capacity_limit
+        for (limit, want) in [
+            ("1.5K", (1.5 * 1024.0) as usize),
+            ("2M", (2f64 * 1024.0 * 1024.0) as usize),
+            ("1G", (1f64 * 1024.0 * 1024.0 * 1024.0) as usize),
+        ] {
+            let have = SessionContext::parse_capacity_limit(MEMORY_LIMIT, limit).unwrap();
+            assert_eq!(want, have);
+        }
+
+        // Invalid capacity_limit
+        for limit in [
+            "1B",
+            "1T",
+            "",
+            " ",
+            "XYZG",
+            "-1G",
+            "infG",
+            "-infG",
+            "G",
+            "1024B",
+            "invalid_size",
+        ] {
+            let have = SessionContext::parse_capacity_limit(MEMORY_LIMIT, limit);
+            assert!(have.is_err());
+            assert!(have.unwrap_err().to_string().contains(MEMORY_LIMIT));
+        }
+    }
 }
diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs
index eea2b804770a3..823dc946ea732 100644
--- a/datafusion/core/src/execution/context/parquet.rs
+++ b/datafusion/core/src/execution/context/parquet.rs
@@ -34,13 +34,12 @@ impl SessionContext {
     ///
     /// # Note: Statistics
     ///
-    /// NOTE: by default, statistics are not collected when reading the Parquet
-    /// files as this can slow down the initial DataFrame creation. However,
-    /// collecting statistics can greatly accelerate queries with certain
-    /// filters.
+    /// NOTE: by default, statistics are collected when reading the Parquet
+    /// files This can slow down the initial DataFrame creation while
+    /// greatly accelerating queries with certain filters.
     ///
-    /// To enable collect statistics, set the [config option]
-    /// `datafusion.execution.collect_statistics` to `true`. See
+    /// To disable statistics collection, set the [config option]
+    /// `datafusion.execution.collect_statistics` to `false`. See
     /// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more
     /// details.
     ///
@@ -108,11 +107,13 @@ mod tests {
     use crate::test_util::parquet_test_data;
 
     use arrow::util::pretty::pretty_format_batches;
-    use datafusion_common::assert_contains;
     use datafusion_common::config::TableParquetOptions;
+    use datafusion_common::{
+        assert_batches_eq, assert_batches_sorted_eq, assert_contains,
+    };
     use datafusion_execution::config::SessionConfig;
 
-    use tempfile::tempdir;
+    use tempfile::{TempDir, tempdir};
 
     #[tokio::test]
     async fn read_with_glob_path() -> Result<()> {
@@ -171,28 +172,28 @@ mod tests {
 
     #[tokio::test]
     async fn register_parquet_respects_collect_statistics_config() -> Result<()> {
-        // The default is false
+        // The default is true
         let mut config = SessionConfig::new();
         config.options_mut().explain.physical_plan_only = true;
         config.options_mut().explain.show_statistics = true;
         let content = explain_query_all_with_config(config).await?;
-        assert_contains!(content, "statistics=[Rows=Absent,");
+        assert_contains!(content, "statistics=[Rows=Exact(");
 
-        // Explicitly set to false
+        // Explicitly set to true
         let mut config = SessionConfig::new();
         config.options_mut().explain.physical_plan_only = true;
         config.options_mut().explain.show_statistics = true;
-        config.options_mut().execution.collect_statistics = false;
+        config.options_mut().execution.collect_statistics = true;
         let content = explain_query_all_with_config(config).await?;
-        assert_contains!(content, "statistics=[Rows=Absent,");
+        assert_contains!(content, "statistics=[Rows=Exact(");
 
-        // Explicitly set to true
+        // Explicitly set to false
         let mut config = SessionConfig::new();
         config.options_mut().explain.physical_plan_only = true;
         config.options_mut().explain.show_statistics = true;
-        config.options_mut().execution.collect_statistics = true;
+        config.options_mut().execution.collect_statistics = false;
         let content = explain_query_all_with_config(config).await?;
-        assert_contains!(content, "statistics=[Rows=Exact(10),");
+        assert_contains!(content, "statistics=[Rows=Absent,");
 
         Ok(())
     }
@@ -354,7 +355,9 @@ mod tests {
         let expected_path = binding[0].as_str();
         assert_eq!(
             read_df.unwrap_err().strip_backtrace(),
-            format!("Execution error: File path '{expected_path}' does not match the expected extension '.parquet'")
+            format!(
+                "Execution error: File path '{expected_path}' does not match the expected extension '.parquet'"
+            )
         );
 
         // Read the dataframe from 'output3.parquet.snappy.parquet' with the correct file extension.
@@ -401,4 +404,124 @@ mod tests {
         assert_eq!(total_rows, 5);
         Ok(())
     }
+
+    #[tokio::test]
+    async fn read_from_parquet_folder() -> Result<()> {
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+        let test_path = tmp_dir.path().to_str().unwrap().to_string();
+
+        ctx.sql("SELECT 1 a")
+            .await?
+            .write_parquet(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        ctx.sql("SELECT 2 a")
+            .await?
+            .write_parquet(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        // Adding CSV to check it is not read with Parquet reader
+        ctx.sql("SELECT 3 a")
+            .await?
+            .write_csv(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        let actual = ctx
+            .read_parquet(&test_path, ParquetReadOptions::default())
+            .await?
+            .collect()
+            .await?;
+
+        #[cfg_attr(any(), rustfmt::skip)]
+        assert_batches_sorted_eq!(&[
+            "+---+",
+            "| a |",
+            "+---+",
+            "| 2 |",
+            "| 1 |",
+            "+---+",
+        ], &actual);
+
+        let actual = ctx
+            .read_parquet(test_path, ParquetReadOptions::default())
+            .await?
+            .collect()
+            .await?;
+
+        #[cfg_attr(any(), rustfmt::skip)]
+        assert_batches_sorted_eq!(&[
+            "+---+",
+            "| a |",
+            "+---+",
+            "| 2 |",
+            "| 1 |",
+            "+---+",
+        ], &actual);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn read_from_parquet_folder_table() -> Result<()> {
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+        let test_path = tmp_dir.path().to_str().unwrap().to_string();
+
+        ctx.sql("SELECT 1 a")
+            .await?
+            .write_parquet(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        ctx.sql("SELECT 2 a")
+            .await?
+            .write_parquet(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        // Adding CSV to check it is not read with Parquet reader
+        ctx.sql("SELECT 3 a")
+            .await?
+            .write_csv(&test_path, DataFrameWriteOptions::default(), None)
+            .await?;
+
+        ctx.sql(format!("CREATE EXTERNAL TABLE parquet_folder_t1 STORED AS PARQUET LOCATION '{test_path}'").as_ref())
+            .await?;
+
+        let actual = ctx
+            .sql("select * from parquet_folder_t1")
+            .await?
+            .collect()
+            .await?;
+        #[cfg_attr(any(), rustfmt::skip)]
+        assert_batches_sorted_eq!(&[
+            "+---+",
+            "| a |",
+            "+---+",
+            "| 2 |",
+            "| 1 |",
+            "+---+",
+        ], &actual);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn read_dummy_folder() -> Result<()> {
+        let ctx = SessionContext::new();
+        let test_path = "/foo/";
+
+        let actual = ctx
+            .read_parquet(test_path, ParquetReadOptions::default())
+            .await?
+            .collect()
+            .await?;
+
+        #[cfg_attr(any(), rustfmt::skip)]
+        assert_batches_eq!(&[
+            "++",
+            "++",
+        ], &actual);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs
index 8aa812cc5258a..9560616c1b6da 100644
--- a/datafusion/core/src/execution/session_state.rs
+++ b/datafusion/core/src/execution/session_state.rs
@@ -24,61 +24,67 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use crate::catalog::{CatalogProviderList, SchemaProvider, TableProviderFactory};
-use crate::datasource::cte_worktable::CteWorkTable;
-use crate::datasource::file_format::{format_as_file_type, FileFormatFactory};
+use crate::datasource::file_format::FileFormatFactory;
+#[cfg(feature = "sql")]
 use crate::datasource::provider_as_source;
-use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner};
 use crate::execution::SessionStateDefaults;
+use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner};
 use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
+use arrow_schema::{DataType, FieldRef};
+use datafusion_catalog::MemoryCatalogProviderList;
 use datafusion_catalog::information_schema::{
-    InformationSchemaProvider, INFORMATION_SCHEMA,
+    INFORMATION_SCHEMA, InformationSchemaProvider,
 };
-
-use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_catalog::MemoryCatalogProviderList;
 use datafusion_catalog::{TableFunction, TableFunctionImpl};
 use datafusion_common::alias::AliasGenerator;
+#[cfg(feature = "sql")]
+use datafusion_common::config::Dialect;
 use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions};
 use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan};
-use datafusion_common::file_options::file_type::FileType;
 use datafusion_common::tree_node::TreeNode;
 use datafusion_common::{
-    config_err, exec_err, not_impl_err, plan_datafusion_err, DFSchema, DataFusionError,
-    ResolvedTableReference, TableReference,
+    DFSchema, DataFusionError, ResolvedTableReference, TableReference, config_err,
+    exec_err, plan_datafusion_err,
 };
+use datafusion_execution::TaskContext;
 use datafusion_execution::config::SessionConfig;
 use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_execution::TaskContext;
+#[cfg(feature = "sql")]
+use datafusion_expr::TableSource;
 use datafusion_expr::execution_props::ExecutionProps;
 use datafusion_expr::expr_rewriter::FunctionRewrite;
-use datafusion_expr::planner::{ExprPlanner, TypePlanner};
+use datafusion_expr::planner::ExprPlanner;
+#[cfg(feature = "sql")]
+use datafusion_expr::planner::{RelationPlanner, TypePlanner};
 use datafusion_expr::registry::{FunctionRegistry, SerializerRegistry};
-use datafusion_expr::simplify::SimplifyInfo;
-use datafusion_expr::var_provider::{is_system_variables, VarType};
-use datafusion_expr::{
-    AggregateUDF, Explain, Expr, ExprSchemable, LogicalPlan, ScalarUDF, TableSource,
-    WindowUDF,
-};
+use datafusion_expr::simplify::SimplifyContext;
+use datafusion_expr::{AggregateUDF, Explain, Expr, LogicalPlan, ScalarUDF, WindowUDF};
 use datafusion_optimizer::simplify_expressions::ExprSimplifier;
 use datafusion_optimizer::{
     Analyzer, AnalyzerRule, Optimizer, OptimizerConfig, OptimizerRule,
 };
 use datafusion_physical_expr::create_physical_expr;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_optimizer::optimizer::PhysicalOptimizer;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::optimizer::PhysicalOptimizer;
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_session::Session;
-use datafusion_sql::parser::{DFParserBuilder, Statement};
-use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel};
+#[cfg(feature = "sql")]
+use datafusion_sql::{
+    parser::{DFParserBuilder, Statement},
+    planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel},
+};
 
 use async_trait::async_trait;
 use chrono::{DateTime, Utc};
 use itertools::Itertools;
 use log::{debug, info};
 use object_store::ObjectStore;
-use sqlparser::ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias};
-use sqlparser::dialect::dialect_from_str;
+#[cfg(feature = "sql")]
+use sqlparser::{
+    ast::{Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias},
+    dialect::dialect_from_str,
+};
 use url::Url;
 use uuid::Uuid;
 
@@ -108,12 +114,12 @@ use uuid::Uuid;
 /// # use std::sync::Arc;
 /// # #[tokio::main]
 /// # async fn main() -> Result<()> {
-///     let state = SessionStateBuilder::new()
-///         .with_config(SessionConfig::new())  
-///         .with_runtime_env(Arc::new(RuntimeEnv::default()))
-///         .with_default_features()
-///         .build();
-///     Ok(())  
+/// let state = SessionStateBuilder::new()
+///     .with_config(SessionConfig::new())
+///     .with_runtime_env(Arc::new(RuntimeEnv::default()))
+///     .with_default_features()
+///     .build();
+/// Ok(())
 /// # }
 /// ```
 ///
@@ -131,7 +137,10 @@ pub struct SessionState {
     analyzer: Analyzer,
     /// Provides support for customizing the SQL planner, e.g. to add support for custom operators like `->>` or `?`
     expr_planners: Vec<Arc<dyn ExprPlanner>>,
+    #[cfg(feature = "sql")]
+    relation_planners: Vec<Arc<dyn RelationPlanner>>,
     /// Provides support for customizing the SQL type planning
+    #[cfg(feature = "sql")]
     type_planner: Option<Arc<dyn TypePlanner>>,
     /// Responsible for optimizing a logical plan
     optimizer: Optimizer,
@@ -176,6 +185,7 @@ pub struct SessionState {
     /// It will be invoked on `CREATE FUNCTION` statements.
     /// thus, changing dialect o PostgreSql is required
     function_factory: Option<Arc<dyn FunctionFactory>>,
+    cache_factory: Option<Arc<dyn CacheFactory>>,
     /// Cache logical plans of prepared statements for later execution.
     /// Key is the prepared statement name.
     prepared_plans: HashMap<String, Arc<PreparedPlan>>,
@@ -185,7 +195,8 @@ impl Debug for SessionState {
     /// Prefer having short fields at the top and long vector fields near the end
     /// Group fields by
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("SessionState")
+        let mut debug_struct = f.debug_struct("SessionState");
+        let ret = debug_struct
             .field("session_id", &self.session_id)
             .field("config", &self.config)
             .field("runtime_env", &self.runtime_env)
@@ -196,9 +207,16 @@ impl Debug for SessionState {
             .field("table_options", &self.table_options)
             .field("table_factories", &self.table_factories)
             .field("function_factory", &self.function_factory)
-            .field("expr_planners", &self.expr_planners)
-            .field("type_planner", &self.type_planner)
-            .field("query_planners", &self.query_planner)
+            .field("cache_factory", &self.cache_factory)
+            .field("expr_planners", &self.expr_planners);
+
+        #[cfg(feature = "sql")]
+        let ret = ret.field("relation_planners", &self.relation_planners);
+
+        #[cfg(feature = "sql")]
+        let ret = ret.field("type_planner", &self.type_planner);
+
+        ret.field("query_planners", &self.query_planner)
             .field("analyzer", &self.analyzer)
             .field("optimizer", &self.optimizer)
             .field("physical_optimizers", &self.physical_optimizers)
@@ -274,17 +292,6 @@ impl Session for SessionState {
 }
 
 impl SessionState {
-    /// Returns new [`SessionState`] using the provided
-    /// [`SessionConfig`] and [`RuntimeEnv`].
-    #[deprecated(since = "41.0.0", note = "Use SessionStateBuilder")]
-    pub fn new_with_config_rt(config: SessionConfig, runtime: Arc<RuntimeEnv>) -> Self {
-        SessionStateBuilder::new()
-            .with_config(config)
-            .with_runtime_env(runtime)
-            .with_default_features()
-            .build()
-    }
-
     pub(crate) fn resolve_table_ref(
         &self,
         table_ref: impl Into<TableReference>,
@@ -343,6 +350,13 @@ impl SessionState {
         self.optimizer.rules.push(optimizer_rule);
     }
 
+    /// Removes an optimizer rule by name, returning `true` if it existed.
+    pub(crate) fn remove_optimizer_rule(&mut self, name: &str) -> bool {
+        let original_len = self.optimizer.rules.len();
+        self.optimizer.rules.retain(|r| r.name() != name);
+        self.optimizer.rules.len() < original_len
+    }
+
     /// Registers a [`FunctionFactory`] to handle `CREATE FUNCTION` statements
     pub fn set_function_factory(&mut self, function_factory: Arc<dyn FunctionFactory>) {
         self.function_factory = Some(function_factory);
@@ -353,6 +367,16 @@ impl SessionState {
         self.function_factory.as_ref()
     }
 
+    /// Register a [`CacheFactory`] for custom caching strategy
+    pub fn set_cache_factory(&mut self, cache_factory: Arc<dyn CacheFactory>) {
+        self.cache_factory = Some(cache_factory);
+    }
+
+    /// Get the cache factory
+    pub fn cache_factory(&self) -> Option<&Arc<dyn CacheFactory>> {
+        self.cache_factory.as_ref()
+    }
+
     /// Get the table factories
     pub fn table_factories(&self) -> &HashMap<String, Arc<dyn TableProviderFactory>> {
         &self.table_factories
@@ -369,10 +393,11 @@ impl SessionState {
     /// [`Statement`]. See [`SessionContext::sql`] for running queries.
     ///
     /// [`SessionContext::sql`]: crate::execution::context::SessionContext::sql
+    #[cfg(feature = "sql")]
     pub fn sql_to_statement(
         &self,
         sql: &str,
-        dialect: &str,
+        dialect: &Dialect,
     ) -> datafusion_common::Result<Statement> {
         let dialect = dialect_from_str(dialect).ok_or_else(|| {
             plan_datafusion_err!(
@@ -391,7 +416,7 @@ impl SessionState {
             .parse_statements()?;
 
         if statements.len() > 1 {
-            return not_impl_err!(
+            return datafusion_common::not_impl_err!(
                 "The context currently only supports a single SQL statement"
             );
         }
@@ -405,10 +430,11 @@ impl SessionState {
     /// parse a sql string into a sqlparser-rs AST [`SQLExpr`].
     ///
     /// See [`Self::create_logical_expr`] for parsing sql to [`Expr`].
+    #[cfg(feature = "sql")]
     pub fn sql_to_expr(
         &self,
         sql: &str,
-        dialect: &str,
+        dialect: &Dialect,
     ) -> datafusion_common::Result<SQLExpr> {
         self.sql_to_expr_with_alias(sql, dialect).map(|x| x.expr)
     }
@@ -416,10 +442,11 @@ impl SessionState {
     /// parse a sql string into a sqlparser-rs AST [`SQLExprWithAlias`].
     ///
     /// See [`Self::create_logical_expr`] for parsing sql to [`Expr`].
+    #[cfg(feature = "sql")]
     pub fn sql_to_expr_with_alias(
         &self,
         sql: &str,
-        dialect: &str,
+        dialect: &Dialect,
     ) -> datafusion_common::Result<SQLExprWithAlias> {
         let dialect = dialect_from_str(dialect).ok_or_else(|| {
             plan_datafusion_err!(
@@ -434,7 +461,7 @@ impl SessionState {
             .with_dialect(dialect.as_ref())
             .with_recursion_limit(recursion_limit)
             .build()?
-            .parse_expr()?;
+            .parse_into_expr()?;
 
         Ok(expr)
     }
@@ -444,6 +471,7 @@ impl SessionState {
     /// See [`datafusion_sql::resolve::resolve_table_references`] for more information.
     ///
     /// [`datafusion_sql::resolve::resolve_table_references`]: datafusion_sql::resolve::resolve_table_references
+    #[cfg(feature = "sql")]
     pub fn resolve_table_references(
         &self,
         statement: &Statement,
@@ -458,6 +486,7 @@ impl SessionState {
     }
 
     /// Convert an AST Statement into a LogicalPlan
+    #[cfg(feature = "sql")]
     pub async fn statement_to_plan(
         &self,
         statement: Statement,
@@ -473,10 +502,10 @@ impl SessionState {
             let resolved = self.resolve_table_ref(reference);
             if let Entry::Vacant(v) = provider.tables.entry(resolved) {
                 let resolved = v.key();
-                if let Ok(schema) = self.schema_for_ref(resolved.clone()) {
-                    if let Some(table) = schema.table(&resolved.table).await? {
-                        v.insert(provider_as_source(table));
-                    }
+                if let Ok(schema) = self.schema_for_ref(resolved.clone())
+                    && let Some(table) = schema.table(&resolved.table).await?
+                {
+                    v.insert(provider_as_source(table));
                 }
             }
         }
@@ -485,6 +514,7 @@ impl SessionState {
         query.statement_to_plan(statement)
     }
 
+    #[cfg(feature = "sql")]
     fn get_parser_options(&self) -> ParserOptions {
         let sql_parser_options = &self.config.options().sql_parser;
 
@@ -494,8 +524,12 @@ impl SessionState {
             enable_options_value_normalization: sql_parser_options
                 .enable_options_value_normalization,
             support_varchar_with_length: sql_parser_options.support_varchar_with_length,
-            map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view,
+            map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view,
             collect_spans: sql_parser_options.collect_spans,
+            default_null_ordering: sql_parser_options
+                .default_null_ordering
+                .as_str()
+                .into(),
         }
     }
 
@@ -511,12 +545,13 @@ impl SessionState {
     ///
     /// [`SessionContext::sql`]: crate::execution::context::SessionContext::sql
     /// [`SessionContext::sql_with_options`]: crate::execution::context::SessionContext::sql_with_options
+    #[cfg(feature = "sql")]
     pub async fn create_logical_plan(
         &self,
         sql: &str,
     ) -> datafusion_common::Result<LogicalPlan> {
-        let dialect = self.config.options().sql_parser.dialect.as_str();
-        let statement = self.sql_to_statement(sql, dialect)?;
+        let dialect = self.config.options().sql_parser.dialect;
+        let statement = self.sql_to_statement(sql, &dialect)?;
         let plan = self.statement_to_plan(statement).await?;
         Ok(plan)
     }
@@ -524,15 +559,26 @@ impl SessionState {
     /// Creates a datafusion style AST [`Expr`] from a SQL string.
     ///
     /// See example on  [SessionContext::parse_sql_expr](crate::execution::context::SessionContext::parse_sql_expr)
+    #[cfg(feature = "sql")]
     pub fn create_logical_expr(
         &self,
         sql: &str,
         df_schema: &DFSchema,
     ) -> datafusion_common::Result<Expr> {
-        let dialect = self.config.options().sql_parser.dialect.as_str();
+        let dialect = self.config.options().sql_parser.dialect;
 
-        let sql_expr = self.sql_to_expr_with_alias(sql, dialect)?;
+        let sql_expr = self.sql_to_expr_with_alias(sql, &dialect)?;
 
+        self.create_logical_expr_from_sql_expr(sql_expr, df_schema)
+    }
+
+    /// Creates a datafusion style AST [`Expr`] from a SQL expression.
+    #[cfg(feature = "sql")]
+    pub fn create_logical_expr_from_sql_expr(
+        &self,
+        sql_expr: SQLExprWithAlias,
+        df_schema: &DFSchema,
+    ) -> datafusion_common::Result<Expr> {
         let provider = SessionContextProvider {
             state: self,
             tables: HashMap::new(),
@@ -557,6 +603,24 @@ impl SessionState {
         &self.expr_planners
     }
 
+    #[cfg(feature = "sql")]
+    /// Returns the registered relation planners in priority order.
+    pub fn relation_planners(&self) -> &[Arc<dyn RelationPlanner>] {
+        &self.relation_planners
+    }
+
+    #[cfg(feature = "sql")]
+    /// Registers a [`RelationPlanner`] to customize SQL relation planning.
+    ///
+    /// Newly registered planners are given higher priority than existing ones.
+    pub fn register_relation_planner(
+        &mut self,
+        planner: Arc<dyn RelationPlanner>,
+    ) -> datafusion_common::Result<()> {
+        self.relation_planners.insert(0, planner);
+        Ok(())
+    }
+
     /// Returns the [`QueryPlanner`] for this session
     pub fn query_planner(&self) -> &Arc<dyn QueryPlanner + Send + Sync> {
         &self.query_planner
@@ -570,7 +634,7 @@ impl SessionState {
             // analyze & capture output of each rule
             let analyzer_result = self.analyzer.execute_and_check(
                 e.plan.as_ref().clone(),
-                self.options(),
+                &self.options(),
                 |analyzed_plan, analyzer| {
                     let analyzer_name = analyzer.name().to_string();
                     let plan_type = PlanType::AnalyzedLogicalPlan { analyzer_name };
@@ -632,7 +696,7 @@ impl SessionState {
         } else {
             let analyzed_plan = self.analyzer.execute_and_check(
                 plan.clone(),
-                self.options(),
+                &self.options(),
                 |_, _| {},
             )?;
             self.optimizer.optimize(analyzed_plan, self, |_, _| {})
@@ -671,20 +735,25 @@ impl SessionState {
     /// * [`create_physical_expr`] for a lower-level API
     ///
     /// [simplified]: datafusion_optimizer::simplify_expressions
-    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
+    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
     /// [`SessionContext::create_physical_expr`]: crate::execution::context::SessionContext::create_physical_expr
     pub fn create_physical_expr(
         &self,
         expr: Expr,
         df_schema: &DFSchema,
     ) -> datafusion_common::Result<Arc<dyn PhysicalExpr>> {
-        let simplifier =
-            ExprSimplifier::new(SessionSimplifyProvider::new(self, df_schema));
+        let config_options = self.config_options();
+        let simplify_context = SimplifyContext::default()
+            .with_schema(Arc::new(df_schema.clone()))
+            .with_config_options(Arc::clone(config_options))
+            .with_query_execution_start_time(
+                self.execution_props().query_execution_start_time,
+            );
+        let simplifier = ExprSimplifier::new(simplify_context);
         // apply type coercion here to ensure types match
         let mut expr = simplifier.coerce(expr, df_schema)?;
 
         // rewrite Exprs to functions if necessary
-        let config_options = self.config_options();
         for rewrite in self.analyzer.function_rewrites() {
             expr = expr
                 .transform_up(|expr| rewrite.rewrite(expr, df_schema, config_options))?
@@ -734,10 +803,16 @@ impl SessionState {
     }
 
     /// return the configuration options
-    pub fn config_options(&self) -> &ConfigOptions {
+    pub fn config_options(&self) -> &Arc<ConfigOptions> {
         self.config.options()
     }
 
+    /// Mark the start of the execution
+    pub fn mark_start_execution(&mut self) {
+        let config = Arc::clone(self.config.options());
+        self.execution_props.mark_start_execution(config);
+    }
+
     /// Return the table options
     pub fn table_options(&self) -> &TableOptions {
         &self.table_options
@@ -768,10 +843,18 @@ impl SessionState {
         overwrite: bool,
     ) -> Result<(), DataFusionError> {
         let ext = file_format.get_ext().to_lowercase();
-        match (self.file_formats.entry(ext.clone()), overwrite){
-            (Entry::Vacant(e), _) => {e.insert(file_format);},
-            (Entry::Occupied(mut e), true)  => {e.insert(file_format);},
-            (Entry::Occupied(_), false) => return config_err!("File type already registered for extension {ext}. Set overwrite to true to replace this extension."),
+        match (self.file_formats.entry(ext.clone()), overwrite) {
+            (Entry::Vacant(e), _) => {
+                e.insert(file_format);
+            }
+            (Entry::Occupied(mut e), true) => {
+                e.insert(file_format);
+            }
+            (Entry::Occupied(_), false) => {
+                return config_err!(
+                    "File type already registered for extension {ext}. Set overwrite to true to replace this extension."
+                );
+            }
         };
         Ok(())
     }
@@ -795,11 +878,8 @@ impl SessionState {
         &self.catalog_list
     }
 
-    /// set the catalog list
-    pub(crate) fn register_catalog_list(
-        &mut self,
-        catalog_list: Arc<dyn CatalogProviderList>,
-    ) {
+    /// Set the catalog list
+    pub fn register_catalog_list(&mut self, catalog_list: Arc<dyn CatalogProviderList>) {
         self.catalog_list = catalog_list;
     }
 
@@ -854,12 +934,12 @@ impl SessionState {
     pub(crate) fn store_prepared(
         &mut self,
         name: String,
-        data_types: Vec<DataType>,
+        fields: Vec<FieldRef>,
         plan: Arc<LogicalPlan>,
     ) -> datafusion_common::Result<()> {
         match self.prepared_plans.entry(name) {
             Entry::Vacant(e) => {
-                e.insert(Arc::new(PreparedPlan { data_types, plan }));
+                e.insert(Arc::new(PreparedPlan { fields, plan }));
                 Ok(())
             }
             Entry::Occupied(e) => {
@@ -889,10 +969,14 @@ impl SessionState {
 /// be used for all values unless explicitly provided.
 ///
 /// See example on [`SessionState`]
+#[derive(Clone)]
 pub struct SessionStateBuilder {
     session_id: Option<String>,
     analyzer: Option<Analyzer>,
     expr_planners: Option<Vec<Arc<dyn ExprPlanner>>>,
+    #[cfg(feature = "sql")]
+    relation_planners: Option<Vec<Arc<dyn RelationPlanner>>>,
+    #[cfg(feature = "sql")]
     type_planner: Option<Arc<dyn TypePlanner>>,
     optimizer: Option<Optimizer>,
     physical_optimizers: Option<PhysicalOptimizer>,
@@ -910,6 +994,7 @@ pub struct SessionStateBuilder {
     table_factories: Option<HashMap<String, Arc<dyn TableProviderFactory>>>,
     runtime_env: Option<Arc<RuntimeEnv>>,
     function_factory: Option<Arc<dyn FunctionFactory>>,
+    cache_factory: Option<Arc<dyn CacheFactory>>,
     // fields to support convenience functions
     analyzer_rules: Option<Vec<Arc<dyn AnalyzerRule + Send + Sync>>>,
     optimizer_rules: Option<Vec<Arc<dyn OptimizerRule + Send + Sync>>>,
@@ -929,6 +1014,9 @@ impl SessionStateBuilder {
             session_id: None,
             analyzer: None,
             expr_planners: None,
+            #[cfg(feature = "sql")]
+            relation_planners: None,
+            #[cfg(feature = "sql")]
             type_planner: None,
             optimizer: None,
             physical_optimizers: None,
@@ -946,6 +1034,7 @@ impl SessionStateBuilder {
             table_factories: None,
             runtime_env: None,
             function_factory: None,
+            cache_factory: None,
             // fields to support convenience functions
             analyzer_rules: None,
             optimizer_rules: None,
@@ -978,6 +1067,9 @@ impl SessionStateBuilder {
             session_id: None,
             analyzer: Some(existing.analyzer),
             expr_planners: Some(existing.expr_planners),
+            #[cfg(feature = "sql")]
+            relation_planners: Some(existing.relation_planners),
+            #[cfg(feature = "sql")]
             type_planner: existing.type_planner,
             optimizer: Some(existing.optimizer),
             physical_optimizers: Some(existing.physical_optimizers),
@@ -997,7 +1089,7 @@ impl SessionStateBuilder {
             table_factories: Some(existing.table_factories),
             runtime_env: Some(existing.runtime_env),
             function_factory: existing.function_factory,
-
+            cache_factory: existing.cache_factory,
             // fields to support convenience functions
             analyzer_rules: None,
             optimizer_rules: None,
@@ -1118,7 +1210,18 @@ impl SessionStateBuilder {
         self
     }
 
+    #[cfg(feature = "sql")]
+    /// Sets the [`RelationPlanner`]s used to customize SQL relation planning.
+    pub fn with_relation_planners(
+        mut self,
+        relation_planners: Vec<Arc<dyn RelationPlanner>>,
+    ) -> Self {
+        self.relation_planners = Some(relation_planners);
+        self
+    }
+
     /// Set the [`TypePlanner`] used to customize the behavior of the SQL planner.
+    #[cfg(feature = "sql")]
     pub fn with_type_planner(mut self, type_planner: Arc<dyn TypePlanner>) -> Self {
         self.type_planner = Some(type_planner);
         self
@@ -1285,6 +1388,15 @@ impl SessionStateBuilder {
         self
     }
 
+    /// Set a [`CacheFactory`] for custom caching strategy
+    pub fn with_cache_factory(
+        mut self,
+        cache_factory: Option<Arc<dyn CacheFactory>>,
+    ) -> Self {
+        self.cache_factory = cache_factory;
+        self
+    }
+
     /// Register an `ObjectStore` to the [`RuntimeEnv`]. See [`RuntimeEnv::register_object_store`]
     /// for more details.
     ///
@@ -1300,7 +1412,7 @@ impl SessionStateBuilder {
     /// let url = Url::try_from("file://").unwrap();
     /// let object_store = object_store::local::LocalFileSystem::new();
     /// let state = SessionStateBuilder::new()
-    ///     .with_config(SessionConfig::new())  
+    ///     .with_config(SessionConfig::new())
     ///     .with_object_store(&url, Arc::new(object_store))
     ///     .with_default_features()
     ///     .build();
@@ -1330,6 +1442,9 @@ impl SessionStateBuilder {
             session_id,
             analyzer,
             expr_planners,
+            #[cfg(feature = "sql")]
+            relation_planners,
+            #[cfg(feature = "sql")]
             type_planner,
             optimizer,
             physical_optimizers,
@@ -1347,6 +1462,7 @@ impl SessionStateBuilder {
             table_factories,
             runtime_env,
             function_factory,
+            cache_factory,
             analyzer_rules,
             optimizer_rules,
             physical_optimizer_rules,
@@ -1359,6 +1475,9 @@ impl SessionStateBuilder {
             session_id: session_id.unwrap_or_else(|| Uuid::new_v4().to_string()),
             analyzer: analyzer.unwrap_or_default(),
             expr_planners: expr_planners.unwrap_or_default(),
+            #[cfg(feature = "sql")]
+            relation_planners: relation_planners.unwrap_or_default(),
+            #[cfg(feature = "sql")]
             type_planner,
             optimizer: optimizer.unwrap_or_default(),
             physical_optimizers: physical_optimizers.unwrap_or_default(),
@@ -1382,6 +1501,7 @@ impl SessionStateBuilder {
             table_factories: table_factories.unwrap_or_default(),
             runtime_env,
             function_factory,
+            cache_factory,
             prepared_plans: HashMap::new(),
         };
 
@@ -1394,12 +1514,31 @@ impl SessionStateBuilder {
         }
 
         if let Some(scalar_functions) = scalar_functions {
-            scalar_functions.into_iter().for_each(|udf| {
-                let existing_udf = state.register_udf(udf);
-                if let Ok(Some(existing_udf)) = existing_udf {
-                    debug!("Overwrote an existing UDF: {}", existing_udf.name());
+            for udf in scalar_functions {
+                let config_options = state.config().options();
+                match udf.inner().with_updated_config(config_options) {
+                    Some(new_udf) => {
+                        if let Err(err) = state.register_udf(Arc::new(new_udf)) {
+                            debug!(
+                                "Failed to re-register updated UDF '{}': {}",
+                                udf.name(),
+                                err
+                            );
+                        }
+                    }
+                    None => match state.register_udf(Arc::clone(&udf)) {
+                        Ok(Some(existing)) => {
+                            debug!("Overwrote existing UDF '{}'", existing.name());
+                        }
+                        Ok(None) => {
+                            debug!("Registered UDF '{}'", udf.name());
+                        }
+                        Err(err) => {
+                            debug!("Failed to register UDF '{}': {}", udf.name(), err);
+                        }
+                    },
                 }
-            });
+            }
         }
 
         if let Some(aggregate_functions) = aggregate_functions {
@@ -1476,7 +1615,14 @@ impl SessionStateBuilder {
         &mut self.expr_planners
     }
 
+    #[cfg(feature = "sql")]
+    /// Returns a mutable reference to the current [`RelationPlanner`] list.
+    pub fn relation_planners(&mut self) -> &mut Option<Vec<Arc<dyn RelationPlanner>>> {
+        &mut self.relation_planners
+    }
+
     /// Returns the current type_planner value
+    #[cfg(feature = "sql")]
     pub fn type_planner(&mut self) -> &mut Option<Arc<dyn TypePlanner>> {
         &mut self.type_planner
     }
@@ -1565,6 +1711,11 @@ impl SessionStateBuilder {
         &mut self.function_factory
     }
 
+    /// Returns the cache factory
+    pub fn cache_factory(&mut self) -> &mut Option<Arc<dyn CacheFactory>> {
+        &mut self.cache_factory
+    }
+
     /// Returns the current analyzer_rules value
     pub fn analyzer_rules(
         &mut self,
@@ -1591,7 +1742,8 @@ impl Debug for SessionStateBuilder {
     /// Prefer having short fields at the top and long vector fields near the end
     /// Group fields by
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("SessionStateBuilder")
+        let mut debug_struct = f.debug_struct("SessionStateBuilder");
+        let ret = debug_struct
             .field("session_id", &self.session_id)
             .field("config", &self.config)
             .field("runtime_env", &self.runtime_env)
@@ -1602,9 +1754,11 @@ impl Debug for SessionStateBuilder {
             .field("table_options", &self.table_options)
             .field("table_factories", &self.table_factories)
             .field("function_factory", &self.function_factory)
-            .field("expr_planners", &self.expr_planners)
-            .field("type_planner", &self.type_planner)
-            .field("query_planners", &self.query_planner)
+            .field("cache_factory", &self.cache_factory)
+            .field("expr_planners", &self.expr_planners);
+        #[cfg(feature = "sql")]
+        let ret = ret.field("type_planner", &self.type_planner);
+        ret.field("query_planners", &self.query_planner)
             .field("analyzer_rules", &self.analyzer_rules)
             .field("analyzer", &self.analyzer)
             .field("optimizer_rules", &self.optimizer_rules)
@@ -1635,16 +1789,22 @@ impl From<SessionState> for SessionStateBuilder {
 ///
 /// This is used so the SQL planner can access the state of the session without
 /// having a direct dependency on the [`SessionState`] struct (and core crate)
+#[cfg(feature = "sql")]
 struct SessionContextProvider<'a> {
     state: &'a SessionState,
     tables: HashMap<ResolvedTableReference, Arc<dyn TableSource>>,
 }
 
+#[cfg(feature = "sql")]
 impl ContextProvider for SessionContextProvider<'_> {
     fn get_expr_planners(&self) -> &[Arc<dyn ExprPlanner>] {
         self.state.expr_planners()
     }
 
+    fn get_relation_planners(&self) -> &[Arc<dyn RelationPlanner>] {
+        self.state.relation_planners()
+    }
+
     fn get_type_planner(&self) -> Option<Arc<dyn TypePlanner>> {
         if let Some(type_planner) = &self.state.type_planner {
             Some(Arc::clone(type_planner))
@@ -1675,6 +1835,21 @@ impl ContextProvider for SessionContextProvider<'_> {
             .get(name)
             .cloned()
             .ok_or_else(|| plan_datafusion_err!("table function '{name}' not found"))?;
+        let simplify_context = SimplifyContext::default()
+            .with_config_options(Arc::clone(self.state.config_options()))
+            .with_query_execution_start_time(
+                self.state.execution_props().query_execution_start_time,
+            );
+        let simplifier = ExprSimplifier::new(simplify_context);
+        let schema = DFSchema::empty();
+        let args = args
+            .into_iter()
+            .map(|arg| {
+                simplifier
+                    .coerce(arg, &schema)
+                    .and_then(|e| simplifier.simplify(e))
+            })
+            .collect::<datafusion_common::Result<Vec<_>>>()?;
         let provider = tbl_func.create_table_provider(&args)?;
 
         Ok(provider_as_source(provider))
@@ -1686,9 +1861,11 @@ impl ContextProvider for SessionContextProvider<'_> {
     fn create_cte_work_table(
         &self,
         name: &str,
-        schema: SchemaRef,
+        schema: arrow::datatypes::SchemaRef,
     ) -> datafusion_common::Result<Arc<dyn TableSource>> {
-        let table = Arc::new(CteWorkTable::new(name, schema));
+        let table = Arc::new(crate::datasource::cte_worktable::CteWorkTable::new(
+            name, schema,
+        ));
         Ok(provider_as_source(table))
     }
 
@@ -1705,6 +1882,8 @@ impl ContextProvider for SessionContextProvider<'_> {
     }
 
     fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType> {
+        use datafusion_expr::var_provider::{VarType, is_system_variables};
+
         if variable_names.is_empty() {
             return None;
         }
@@ -1738,14 +1917,21 @@ impl ContextProvider for SessionContextProvider<'_> {
         self.state.window_functions().keys().cloned().collect()
     }
 
-    fn get_file_type(&self, ext: &str) -> datafusion_common::Result<Arc<dyn FileType>> {
+    fn get_file_type(
+        &self,
+        ext: &str,
+    ) -> datafusion_common::Result<
+        Arc<dyn datafusion_common::file_options::file_type::FileType>,
+    > {
         self.state
             .file_formats
             .get(&ext.to_lowercase())
             .ok_or(plan_datafusion_err!(
                 "There is no registered file format with ext {ext}"
             ))
-            .map(|file_type| format_as_file_type(Arc::clone(file_type)))
+            .map(|file_type| {
+                crate::datasource::file_format::format_as_file_type(Arc::clone(file_type))
+            })
     }
 }
 
@@ -1869,10 +2055,24 @@ impl FunctionRegistry for SessionState {
         self.expr_planners.push(expr_planner);
         Ok(())
     }
+
+    fn udafs(&self) -> HashSet<String> {
+        self.aggregate_functions.keys().cloned().collect()
+    }
+
+    fn udwfs(&self) -> HashSet<String> {
+        self.window_functions.keys().cloned().collect()
+    }
+}
+
+impl datafusion_execution::TaskContextProvider for SessionState {
+    fn task_ctx(&self) -> Arc<TaskContext> {
+        SessionState::task_ctx(self)
+    }
 }
 
 impl OptimizerConfig for SessionState {
-    fn query_execution_start_time(&self) -> DateTime<Utc> {
+    fn query_execution_start_time(&self) -> Option<DateTime<Utc>> {
         self.execution_props.query_execution_start_time
     }
 
@@ -1880,8 +2080,8 @@ impl OptimizerConfig for SessionState {
         &self.execution_props.alias_generator
     }
 
-    fn options(&self) -> &ConfigOptions {
-        self.config_options()
+    fn options(&self) -> Arc<ConfigOptions> {
+        Arc::clone(self.config.options())
     }
 
     fn function_registry(&self) -> Option<&dyn FunctionRegistry> {
@@ -1924,51 +2124,35 @@ impl QueryPlanner for DefaultQueryPlanner {
     }
 }
 
-struct SessionSimplifyProvider<'a> {
-    state: &'a SessionState,
-    df_schema: &'a DFSchema,
-}
-
-impl<'a> SessionSimplifyProvider<'a> {
-    fn new(state: &'a SessionState, df_schema: &'a DFSchema) -> Self {
-        Self { state, df_schema }
-    }
-}
-
-impl SimplifyInfo for SessionSimplifyProvider<'_> {
-    fn is_boolean_type(&self, expr: &Expr) -> datafusion_common::Result<bool> {
-        Ok(expr.get_type(self.df_schema)? == DataType::Boolean)
-    }
-
-    fn nullable(&self, expr: &Expr) -> datafusion_common::Result<bool> {
-        expr.nullable(self.df_schema)
-    }
-
-    fn execution_props(&self) -> &ExecutionProps {
-        self.state.execution_props()
-    }
-
-    fn get_data_type(&self, expr: &Expr) -> datafusion_common::Result<DataType> {
-        expr.get_type(self.df_schema)
-    }
-}
-
 #[derive(Debug)]
 pub(crate) struct PreparedPlan {
     /// Data types of the parameters
-    pub(crate) data_types: Vec<DataType>,
+    pub(crate) fields: Vec<FieldRef>,
     /// The prepared logical plan
     pub(crate) plan: Arc<LogicalPlan>,
 }
 
+/// A [`CacheFactory`] can be registered via [`SessionState`]
+/// to create a custom logical plan for [`crate::dataframe::DataFrame::cache`].
+/// Additionally, a custom [`crate::physical_planner::ExtensionPlanner`]/[`QueryPlanner`]
+/// may need to be implemented to handle such plans.
+pub trait CacheFactory: Debug + Send + Sync {
+    /// Create a logical plan for caching
+    fn create(
+        &self,
+        plan: LogicalPlan,
+        session_state: &SessionState,
+    ) -> datafusion_common::Result<LogicalPlan>;
+}
+
 #[cfg(test)]
 mod tests {
     use super::{SessionContextProvider, SessionStateBuilder};
     use crate::common::assert_contains;
     use crate::config::ConfigOptions;
+    use crate::datasource::MemTable;
     use crate::datasource::empty::EmptyTable;
     use crate::datasource::provider_as_source;
-    use crate::datasource::MemTable;
     use crate::execution::context::SessionState;
     use crate::logical_expr::planner::ExprPlanner;
     use crate::logical_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF};
@@ -1980,18 +2164,21 @@ mod tests {
     use datafusion_catalog::MemoryCatalogProviderList;
     use datafusion_common::DFSchema;
     use datafusion_common::Result;
+    use datafusion_common::config::Dialect;
     use datafusion_execution::config::SessionConfig;
     use datafusion_expr::Expr;
-    use datafusion_optimizer::optimizer::OptimizerRule;
     use datafusion_optimizer::Optimizer;
+    use datafusion_optimizer::optimizer::OptimizerRule;
     use datafusion_physical_plan::display::DisplayableExecutionPlan;
     use datafusion_sql::planner::{PlannerContext, SqlToRel};
     use std::collections::HashMap;
     use std::sync::Arc;
 
     #[test]
+    #[cfg(feature = "sql")]
     fn test_session_state_with_default_features() {
         // test array planners with and without builtin planners
+        #[cfg(feature = "sql")]
         fn sql_to_expr(state: &SessionState) -> Result<Expr> {
             let provider = SessionContextProvider {
                 state,
@@ -2001,8 +2188,8 @@ mod tests {
             let sql = "[1,2,3]";
             let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
             let df_schema = DFSchema::try_from(schema)?;
-            let dialect = state.config.options().sql_parser.dialect.as_str();
-            let sql_expr = state.sql_to_expr(sql, dialect)?;
+            let dialect = state.config.options().sql_parser.dialect;
+            let sql_expr = state.sql_to_expr(sql, &dialect)?;
 
             let query = SqlToRel::new_with_options(&provider, state.get_parser_options());
             query.sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new())
@@ -2018,6 +2205,36 @@ mod tests {
         assert!(sql_to_expr(&state).is_err())
     }
 
+    #[test]
+    #[cfg(feature = "sql")]
+    fn test_create_logical_expr_from_sql_expr() {
+        let state = SessionStateBuilder::new().with_default_features().build();
+
+        let provider = SessionContextProvider {
+            state: &state,
+            tables: HashMap::new(),
+        };
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+        let dialect = state.config.options().sql_parser.dialect;
+        let query = SqlToRel::new_with_options(&provider, state.get_parser_options());
+
+        for sql in ["[1,2,3]", "a > 10", "SUM(a)"] {
+            let sql_expr = state.sql_to_expr(sql, &dialect).unwrap();
+            let from_str = query
+                .sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new())
+                .unwrap();
+
+            let sql_expr_with_alias =
+                state.sql_to_expr_with_alias(sql, &dialect).unwrap();
+            let from_expr = state
+                .create_logical_expr_from_sql_expr(sql_expr_with_alias, &df_schema)
+                .unwrap();
+            assert_eq!(from_str, from_expr);
+        }
+    }
+
     #[test]
     fn test_from_existing() -> Result<()> {
         fn employee_batch() -> RecordBatch {
@@ -2058,13 +2275,15 @@ mod tests {
             .table_exist("employee");
         assert!(is_exist);
         let new_state = SessionStateBuilder::new_from_existing(session_state).build();
-        assert!(new_state
-            .catalog_list()
-            .catalog(default_catalog.as_str())
-            .unwrap()
-            .schema(default_schema.as_str())
-            .unwrap()
-            .table_exist("employee"));
+        assert!(
+            new_state
+                .catalog_list()
+                .catalog(default_catalog.as_str())
+                .unwrap()
+                .schema(default_schema.as_str())
+                .unwrap()
+                .table_exist("employee")
+        );
 
         // if `with_create_default_catalog_and_schema` is disabled, the new one shouldn't create default catalog and schema
         let disable_create_default =
@@ -2072,10 +2291,12 @@ mod tests {
         let without_default_state = SessionStateBuilder::new()
             .with_config(disable_create_default)
             .build();
-        assert!(without_default_state
-            .catalog_list()
-            .catalog(&default_catalog)
-            .is_none());
+        assert!(
+            without_default_state
+                .catalog_list()
+                .catalog(&default_catalog)
+                .is_none()
+        );
         let new_state =
             SessionStateBuilder::new_from_existing(without_default_state).build();
         assert!(new_state.catalog_list().catalog(&default_catalog).is_none());
@@ -2160,7 +2381,8 @@ mod tests {
             }
 
             let state = &context_provider.state;
-            let statement = state.sql_to_statement("select count(*) from t", "mysql")?;
+            let statement =
+                state.sql_to_statement("select count(*) from t", &Dialect::MySQL)?;
             let plan = SqlToRel::new(&context_provider).statement_to_plan(statement)?;
             state.create_physical_plan(&plan).await
         }
diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs
index a241738bd3a42..721710d4e057e 100644
--- a/datafusion/core/src/execution/session_state_defaults.rs
+++ b/datafusion/core/src/execution/session_state_defaults.rs
@@ -17,6 +17,7 @@
 
 use crate::catalog::listing_schema::ListingSchemaProvider;
 use crate::catalog::{CatalogProvider, TableProviderFactory};
+use crate::datasource::file_format::FileFormatFactory;
 use crate::datasource::file_format::arrow::ArrowFormatFactory;
 #[cfg(feature = "avro")]
 use crate::datasource::file_format::avro::AvroFormatFactory;
@@ -24,7 +25,6 @@ use crate::datasource::file_format::csv::CsvFormatFactory;
 use crate::datasource::file_format::json::JsonFormatFactory;
 #[cfg(feature = "parquet")]
 use crate::datasource::file_format::parquet::ParquetFormatFactory;
-use crate::datasource::file_format::FileFormatFactory;
 use crate::datasource::provider::DefaultTableFactory;
 use crate::execution::context::SessionState;
 #[cfg(feature = "nested_expressions")]
@@ -90,11 +90,10 @@ impl SessionStateDefaults {
             Arc::new(functions_nested::planner::NestedFunctionPlanner),
             #[cfg(feature = "nested_expressions")]
             Arc::new(functions_nested::planner::FieldAccessPlanner),
-            #[cfg(any(
-                feature = "datetime_expressions",
-                feature = "unicode_expressions"
-            ))]
-            Arc::new(functions::planner::UserDefinedFunctionPlanner),
+            #[cfg(feature = "datetime_expressions")]
+            Arc::new(functions::datetime::planner::DatetimeFunctionPlanner),
+            #[cfg(feature = "unicode_expressions")]
+            Arc::new(functions::unicode::planner::UnicodeFunctionPlanner),
             Arc::new(functions_aggregate::planner::AggregateFunctionPlanner),
             Arc::new(functions_window::planner::WindowFunctionPlanner),
         ];
@@ -102,9 +101,9 @@ impl SessionStateDefaults {
         expr_planners
     }
 
-    /// returns the list of default [`ScalarUDF']'s
+    /// returns the list of default [`ScalarUDF`]s
     pub fn default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
-        #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))]
+        #[cfg_attr(not(feature = "nested_expressions"), expect(unused_mut))]
         let mut functions: Vec<Arc<ScalarUDF>> = functions::all_default_functions();
 
         #[cfg(feature = "nested_expressions")]
@@ -113,12 +112,12 @@ impl SessionStateDefaults {
         functions
     }
 
-    /// returns the list of default [`AggregateUDF']'s
+    /// returns the list of default [`AggregateUDF`]s
     pub fn default_aggregate_functions() -> Vec<Arc<AggregateUDF>> {
         functions_aggregate::all_default_aggregate_functions()
     }
 
-    /// returns the list of default [`WindowUDF']'s
+    /// returns the list of default [`WindowUDF`]s
     pub fn default_window_functions() -> Vec<Arc<WindowUDF>> {
         functions_window::all_default_window_functions()
     }
@@ -128,7 +127,7 @@ impl SessionStateDefaults {
         functions_table::all_default_table_functions()
     }
 
-    /// returns the list of default [`FileFormatFactory']'s
+    /// returns the list of default [`FileFormatFactory`]s
     pub fn default_file_formats() -> Vec<Arc<dyn FileFormatFactory>> {
         let file_formats: Vec<Arc<dyn FileFormatFactory>> = vec![
             #[cfg(feature = "parquet")]
@@ -156,7 +155,7 @@ impl SessionStateDefaults {
     }
 
     /// registers all the builtin array functions
-    #[cfg_attr(not(feature = "nested_expressions"), allow(unused_variables))]
+    #[cfg_attr(not(feature = "nested_expressions"), expect(unused_variables))]
     pub fn register_array_functions(state: &mut SessionState) {
         // register crate of array expressions (if enabled)
         #[cfg(feature = "nested_expressions")]
diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index 6956108e2df3f..349eee5592abe 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -19,7 +19,7 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 //
@@ -35,11 +35,15 @@
     )
 )]
 #![warn(missing_docs, clippy::needless_borrow)]
+// Use `allow` instead of `expect` for test configuration to explicitly
+// disable the lint for all test code rather than expecting violations
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! [DataFusion] is an extensible query engine written in Rust that
 //! uses [Apache Arrow] as its in-memory format. DataFusion's target users are
 //! developers building fast and feature rich database and analytic systems,
-//! customized to particular workloads. See [use cases] for examples.
+//! customized to particular workloads. Please see the [DataFusion website] for
+//! additional documentation, [use cases] and examples.
 //!
 //! "Out of the box," DataFusion offers [SQL] and [`Dataframe`] APIs,
 //! excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
@@ -53,6 +57,7 @@
 //! See the [Architecture] section below for more details.
 //!
 //! [DataFusion]: https://datafusion.apache.org/
+//! [DataFusion website]: https://datafusion.apache.org
 //! [Apache Arrow]: https://arrow.apache.org
 //! [use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
 //! [SQL]: https://datafusion.apache.org/user-guide/sql/index.html
@@ -84,26 +89,29 @@
 //! let ctx = SessionContext::new();
 //!
 //! // create the dataframe
-//! let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+//! let df = ctx
+//!     .read_csv("tests/data/example.csv", CsvReadOptions::new())
+//!     .await?;
 //!
 //! // create a plan
-//! let df = df.filter(col("a").lt_eq(col("b")))?
-//!            .aggregate(vec![col("a")], vec![min(col("b"))])?
-//!            .limit(0, Some(100))?;
+//! let df = df
+//!     .filter(col("a").lt_eq(col("b")))?
+//!     .aggregate(vec![col("a")], vec![min(col("b"))])?
+//!     .limit(0, Some(100))?;
 //!
 //! // execute the plan
 //! let results: Vec<RecordBatch> = df.collect().await?;
 //!
 //! // format the results
-//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?
-//!    .to_string();
+//! let pretty_results =
+//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
 //!
 //! let expected = vec![
 //!     "+---+----------------+",
 //!     "| a | min(?table?.b) |",
 //!     "+---+----------------+",
 //!     "| 1 | 2              |",
-//!     "+---+----------------+"
+//!     "+---+----------------+",
 //! ];
 //!
 //! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
@@ -124,24 +132,27 @@
 //! # async fn main() -> Result<()> {
 //! let ctx = SessionContext::new();
 //!
-//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
+//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new())
+//!     .await?;
 //!
 //! // create a plan
-//! let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?;
+//! let df = ctx
+//!     .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100")
+//!     .await?;
 //!
 //! // execute the plan
 //! let results: Vec<RecordBatch> = df.collect().await?;
 //!
 //! // format the results
-//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?
-//!   .to_string();
+//! let pretty_results =
+//!     arrow::util::pretty::pretty_format_batches(&results)?.to_string();
 //!
 //! let expected = vec![
 //!     "+---+----------------+",
 //!     "| a | min(example.b) |",
 //!     "+---+----------------+",
 //!     "| 1 | 2              |",
-//!     "+---+----------------+"
+//!     "+---+----------------+",
 //! ];
 //!
 //! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
@@ -311,17 +322,17 @@
 //! ```
 //!
 //! A [`TableProvider`] provides information for planning and
-//! an [`ExecutionPlan`] for execution. DataFusion includes [`ListingTable`],
-//! a [`TableProvider`] which reads individual files or directories of files
-//! ("partitioned datasets") of the same file format. Users can add
-//! support for new file formats by implementing the [`TableProvider`]
-//! trait.
+//! an [`ExecutionPlan`] for execution. DataFusion includes two built-in
+//! table providers that support common file formats and require no runtime services,
+//! [`ListingTable`] and [`MemTable`]. You can add support for any other data
+//! source and/or file formats by implementing the [`TableProvider`] trait.
 //!
 //! See also:
 //!
 //! 1. [`ListingTable`]: Reads data from one or more Parquet, JSON, CSV, or AVRO
-//!    files supporting HIVE style partitioning, optional compression, directly
-//!    reading from remote object store and more.
+//!    files in one or more local or remote directories. Supports HIVE style
+//!    partitioning, optional compression, directly reading from remote
+//!    object store, file metadata caching, and more.
 //!
 //! 2. [`MemTable`]: Reads data from in memory [`RecordBatch`]es.
 //!
@@ -350,7 +361,7 @@
 //! [`TreeNode`]: datafusion_common::tree_node::TreeNode
 //! [`tree_node module`]: datafusion_expr::logical_plan::tree_node
 //! [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier
-//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
+//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
 //!
 //! ### Physical Plans
 //!
@@ -441,7 +452,30 @@
 //! other operators read a single [`RecordBatch`] from their input to produce a
 //! single [`RecordBatch`] as output.
 //!
-//! For example, given this SQL query:
+//! For example, given this SQL:
+//!
+//! ```sql
+//! SELECT name FROM 'data.parquet' WHERE id > 10
+//! ```
+//!
+//! An simplified DataFusion execution plan is shown below. It first reads
+//! data from the Parquet file, then applies the filter, then the projection,
+//! and finally produces output. Each step processes one [`RecordBatch`] at a
+//! time. Multiple batches are processed concurrently on different CPU cores
+//! for plans with multiple partitions.
+//!
+//! ```text
+//! ┌─────────────┐    ┌──────────────┐    ┌────────────────┐    ┌──────────────────┐    ┌──────────┐
+//! │ Parquet     │───▶│ DataSource   │───▶│ FilterExec     │───▶│ ProjectionExec   │───▶│ Results  │
+//! │ File        │    │              │    │                │    │                  │    │          │
+//! └─────────────┘    └──────────────┘    └────────────────┘    └──────────────────┘    └──────────┘
+//!                    (reads data)        (id > 10)             (keeps "name" col)
+//!                    RecordBatch ───▶    RecordBatch ────▶     RecordBatch ────▶        RecordBatch
+//! ```
+//!
+//! DataFusion uses the classic "pull" based control flow (explained more in the
+//! next section) to implement streaming execution. As an example,
+//! consider the following SQL query:
 //!
 //! ```sql
 //! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30);
@@ -498,10 +532,21 @@
 //! While preparing for execution, DataFusion tries to create this many distinct
 //! `async` [`Stream`]s for each [`ExecutionPlan`].
 //! The [`Stream`]s for certain [`ExecutionPlan`]s, such as [`RepartitionExec`]
-//! and [`CoalescePartitionsExec`], spawn [Tokio] [`task`]s, that are run by
+//! and [`CoalescePartitionsExec`], spawn [Tokio] [`task`]s, that run on
 //! threads managed by the [`Runtime`].
 //! Many DataFusion [`Stream`]s perform CPU intensive processing.
 //!
+//! ### Cooperative Scheduling
+//!
+//! DataFusion uses cooperative scheduling, which means that each [`Stream`]
+//! is responsible for yielding control back to the [`Runtime`] after
+//! some amount of work is done. Please see the [`coop`] module documentation
+//! for more details.
+//!
+//! [`coop`]: datafusion_physical_plan::coop
+//!
+//! ### Network I/O and CPU intensive tasks
+//!
 //! Using `async` for CPU intensive tasks makes it easy for [`TableProvider`]s
 //! to perform network I/O using standard Rust `async` during execution.
 //! However, this design also makes it very easy to mix CPU intensive and latency
@@ -510,17 +555,20 @@
 //! initial development and processing local files, but it can lead to problems
 //! under load and/or when reading from network sources such as AWS S3.
 //!
+//! ### Optimizing Latency: Throttled CPU / IO under Highly Concurrent Load
+//!
 //! If your system does not fully utilize either the CPU or network bandwidth
 //! during execution, or you see significantly higher tail (e.g. p99) latencies
 //! responding to network requests, **it is likely you need to use a different
-//! [`Runtime`] for CPU intensive DataFusion plans**. This effect can be especially
-//! pronounced when running several queries concurrently.
+//! [`Runtime`] for DataFusion plans**. The [thread_pools example]
+//! has  an example of how to do so.
 //!
-//! As shown in the following figure, using the same [`Runtime`] for both CPU
-//! intensive processing and network requests can introduce significant
-//! delays in responding to those network requests. Delays in processing network
-//! requests can and does lead network flow control to throttle the available
-//! bandwidth in response.
+//! As shown below, using the same [`Runtime`] for both CPU intensive processing
+//! and network requests can introduce significant delays in responding to
+//! those network requests. Delays in processing network requests can and does
+//! lead network flow control to throttle the available bandwidth in response.
+//! This effect can be especially pronounced when running multiple queries
+//! concurrently.
 //!
 //! ```text
 //!                                                                          Legend
@@ -591,7 +639,7 @@
 //! └─────────────┘           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
 //!                          ─────────────────────────────────────────────────────────────▶
 //!                                                                                           time
-//!```
+//! ```
 //!
 //! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for
 //! CPU-bounded work, because `spawn_blocking` is designed for blocking **IO**,
@@ -602,6 +650,7 @@
 //!
 //! [Tokio]:  https://tokio.rs
 //! [`Runtime`]: tokio::runtime::Runtime
+//! [thread_pools example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/thread_pools.rs
 //! [`task`]: tokio::task
 //! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/
 //! [`RepartitionExec`]: physical_plan::repartition::RepartitionExec
@@ -717,6 +766,8 @@
 pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");
 
 extern crate core;
+
+#[cfg(feature = "sql")]
 extern crate sqlparser;
 
 pub mod dataframe;
@@ -727,11 +778,16 @@ pub mod physical_planner;
 pub mod prelude;
 pub mod scalar;
 
-// re-export dependencies from arrow-rs to minimize version maintenance for crate users
+// Re-export dependencies that are part of DataFusion public API (e.g. via DataFusionError)
 pub use arrow;
+pub use object_store;
+
 #[cfg(feature = "parquet")]
 pub use parquet;
 
+#[cfg(feature = "avro")]
+pub use datafusion_datasource_avro::apache_avro;
+
 // re-export DataFusion sub-crates at the top level. Use `pub use *`
 // so that the contents of the subcrates appears in rustdocs
 // for details, see https://github.com/apache/datafusion/issues/6648
@@ -786,6 +842,11 @@ pub mod physical_expr {
     pub use datafusion_physical_expr::*;
 }
 
+/// re-export of [`datafusion_physical_expr_adapter`] crate
+pub mod physical_expr_adapter {
+    pub use datafusion_physical_expr_adapter::*;
+}
+
 /// re-export of [`datafusion_physical_plan`] crate
 pub mod physical_plan {
     pub use datafusion_physical_plan::*;
@@ -796,6 +857,7 @@ pub use datafusion_common::assert_batches_eq;
 pub use datafusion_common::assert_batches_sorted_eq;
 
 /// re-export of [`datafusion_sql`] crate
+#[cfg(feature = "sql")]
 pub mod sql {
     pub use datafusion_sql::*;
 }
@@ -811,13 +873,6 @@ pub mod functions_nested {
     pub use datafusion_functions_nested::*;
 }
 
-/// re-export of [`datafusion_functions_nested`] crate as [`functions_array`] for backward compatibility, if "nested_expressions" feature is enabled
-#[deprecated(since = "41.0.0", note = "use datafusion-functions-nested instead")]
-pub mod functions_array {
-    #[cfg(feature = "nested_expressions")]
-    pub use datafusion_functions_nested::*;
-}
-
 /// re-export of [`datafusion_functions_aggregate`] crate
 pub mod functions_aggregate {
     pub use datafusion_functions_aggregate::*;
@@ -876,20 +931,20 @@ doc_comment::doctest!("../../../README.md", readme_example_test);
 //
 #[cfg(doctest)]
 doc_comment::doctest!(
-    "../../../docs/source/user-guide/concepts-readings-events.md",
-    user_guide_concepts_readings_events
+    "../../../docs/source/user-guide/arrow-introduction.md",
+    user_guide_arrow_introduction
 );
 
 #[cfg(doctest)]
 doc_comment::doctest!(
-    "../../../docs/source/user-guide/configs.md",
-    user_guide_configs
+    "../../../docs/source/user-guide/concepts-readings-events.md",
+    user_guide_concepts_readings_events
 );
 
 #[cfg(doctest)]
 doc_comment::doctest!(
-    "../../../docs/source/user-guide/runtime_configs.md",
-    user_guide_runtime_configs
+    "../../../docs/source/user-guide/configs.md",
+    user_guide_configs
 );
 
 #[cfg(doctest)]
@@ -1047,8 +1102,14 @@ doc_comment::doctest!(
 
 #[cfg(doctest)]
 doc_comment::doctest!(
-    "../../../docs/source/library-user-guide/adding-udfs.md",
-    library_user_guide_adding_udfs
+    "../../../docs/source/library-user-guide/functions/adding-udfs.md",
+    library_user_guide_functions_adding_udfs
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/functions/spark.md",
+    library_user_guide_functions_spark
 );
 
 #[cfg(doctest)]
@@ -1119,8 +1180,56 @@ doc_comment::doctest!(
 
 #[cfg(doctest)]
 doc_comment::doctest!(
-    "../../../docs/source/library-user-guide/upgrading.md",
-    library_user_guide_upgrading
+    "../../../docs/source/library-user-guide/upgrading/46.0.0.md",
+    library_user_guide_upgrading_46_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/47.0.0.md",
+    library_user_guide_upgrading_47_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/48.0.0.md",
+    library_user_guide_upgrading_48_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/48.0.1.md",
+    library_user_guide_upgrading_48_0_1
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/49.0.0.md",
+    library_user_guide_upgrading_49_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/50.0.0.md",
+    library_user_guide_upgrading_50_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/51.0.0.md",
+    library_user_guide_upgrading_51_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/52.0.0.md",
+    library_user_guide_upgrading_52_0_0
+);
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/upgrading/53.0.0.md",
+    library_user_guide_upgrading_53_0_0
 );
 
 #[cfg(doctest)]
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index fbb4250fc4dfb..b4fb44f670e8d 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -18,13 +18,13 @@
 //! Planner for [`LogicalPlan`] to [`ExecutionPlan`]
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 use crate::datasource::file_format::file_type_to_format;
 use crate::datasource::listing::ListingTableUrl;
-use crate::datasource::physical_plan::FileSinkConfig;
-use crate::datasource::{source_as_provider, DefaultTableSource};
+use crate::datasource::physical_plan::{FileOutputMode, FileSinkConfig};
+use crate::datasource::{DefaultTableSource, source_as_provider};
 use crate::error::{DataFusionError, Result};
 use crate::execution::context::{ExecutionProps, SessionState};
 use crate::logical_expr::utils::generate_sort_key;
@@ -39,64 +39,76 @@ use crate::physical_expr::{create_physical_expr, create_physical_exprs};
 use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};
 use crate::physical_plan::analyze::AnalyzeExec;
 use crate::physical_plan::explain::ExplainExec;
-use crate::physical_plan::filter::FilterExec;
+use crate::physical_plan::filter::FilterExecBuilder;
 use crate::physical_plan::joins::utils as join_utils;
 use crate::physical_plan::joins::{
     CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
 };
 use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
-use crate::physical_plan::projection::ProjectionExec;
+use crate::physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use crate::physical_plan::repartition::RepartitionExec;
 use crate::physical_plan::sorts::sort::SortExec;
 use crate::physical_plan::union::UnionExec;
 use crate::physical_plan::unnest::UnnestExec;
 use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec};
 use crate::physical_plan::{
-    displayable, windows, ExecutionPlan, ExecutionPlanProperties, InputOrderMode,
-    Partitioning, PhysicalExpr, WindowExpr,
+    ExecutionPlan, ExecutionPlanProperties, InputOrderMode, Partitioning, PhysicalExpr,
+    WindowExpr, displayable, windows,
 };
-use datafusion_physical_plan::empty::EmptyExec;
-use datafusion_physical_plan::recursive_query::RecursiveQueryExec;
+use crate::schema_equivalence::schema_satisfied_by;
 
-use arrow::array::{builder::StringBuilder, RecordBatch};
+use arrow::array::{RecordBatch, builder::StringBuilder};
 use arrow::compute::SortOptions;
-use arrow::datatypes::{Schema, SchemaRef};
+use arrow::datatypes::Schema;
+use arrow_schema::Field;
+use datafusion_catalog::ScanArgs;
+use datafusion_common::Column;
 use datafusion_common::display::ToStringifiedPlan;
+use datafusion_common::format::ExplainAnalyzeLevel;
 use datafusion_common::tree_node::{
-    Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeVisitor,
+    Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor,
+};
+use datafusion_common::{
+    DFSchema, DFSchemaRef, ScalarValue, exec_err, internal_datafusion_err, internal_err,
+    not_impl_err, plan_err,
 };
 use datafusion_common::{
-    exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
-    ScalarValue,
+    TableReference, assert_eq_or_internal_err, assert_or_internal_err,
 };
+use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::memory::MemorySourceConfig;
 use datafusion_expr::dml::{CopyTo, InsertOp};
 use datafusion_expr::expr::{
-    physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet,
-    WindowFunction, WindowFunctionParams,
+    AggregateFunction, AggregateFunctionParams, Alias, GroupingSet, NullTreatment,
+    WindowFunction, WindowFunctionParams, physical_name,
 };
 use datafusion_expr::expr_rewriter::unnormalize_cols;
 use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary;
+use datafusion_expr::utils::{expr_to_columns, split_conjunction};
 use datafusion_expr::{
-    Analyze, DescribeTable, DmlStatement, Explain, ExplainFormat, Extension, FetchType,
-    Filter, JoinType, RecursiveQuery, SkipType, StringifiedPlan, WindowFrame,
-    WindowFrameBound, WriteOp,
+    Analyze, BinaryExpr, DescribeTable, DmlStatement, Explain, ExplainFormat, Extension,
+    FetchType, Filter, JoinType, Operator, RecursiveQuery, SkipType, StringifiedPlan,
+    WindowFrame, WindowFrameBound, WriteOp,
 };
 use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
-use datafusion_physical_expr::expressions::{Column, Literal};
-use datafusion_physical_expr::LexOrdering;
+use datafusion_physical_expr::expressions::Literal;
+use datafusion_physical_expr::{
+    LexOrdering, PhysicalSortExpr, create_physical_sort_exprs,
+};
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_plan::empty::EmptyExec;
 use datafusion_physical_plan::execution_plan::InvariantLevel;
+use datafusion_physical_plan::joins::PiecewiseMergeJoinExec;
+use datafusion_physical_plan::metrics::MetricType;
 use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
+use datafusion_physical_plan::recursive_query::RecursiveQueryExec;
 use datafusion_physical_plan::unnest::ListUnnest;
 
-use crate::schema_equivalence::schema_satisfied_by;
 use async_trait::async_trait;
-use datafusion_datasource::file_groups::FileGroup;
+use datafusion_physical_plan::async_func::{AsyncFuncExec, AsyncMapper};
 use futures::{StreamExt, TryStreamExt};
-use itertools::{multiunzip, Itertools};
-use log::{debug, trace};
-use sqlparser::ast::NullTreatment;
+use itertools::{Itertools, multiunzip};
+use log::debug;
 use tokio::sync::Mutex;
 
 /// Physical query planner that converts a `LogicalPlan` to an
@@ -145,6 +157,80 @@ pub trait ExtensionPlanner {
         physical_inputs: &[Arc<dyn ExecutionPlan>],
         session_state: &SessionState,
     ) -> Result<Option<Arc<dyn ExecutionPlan>>>;
+
+    /// Create a physical plan for a [`LogicalPlan::TableScan`].
+    ///
+    /// This is useful for planning valid [`TableSource`]s that are not [`TableProvider`]s.
+    ///
+    /// Returns:
+    /// * `Ok(Some(plan))` if the planner knows how to plan the `scan`
+    /// * `Ok(None)` if the planner does not know how to plan the `scan` and wants to delegate the planning to another [`ExtensionPlanner`]
+    /// * `Err` if the planner knows how to plan the `scan` but errors while doing so
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// use std::sync::Arc;
+    /// use datafusion::physical_plan::ExecutionPlan;
+    /// use datafusion::logical_expr::TableScan;
+    /// use datafusion::execution::context::SessionState;
+    /// use datafusion::error::Result;
+    /// use datafusion_physical_planner::{ExtensionPlanner, PhysicalPlanner};
+    /// use async_trait::async_trait;
+    ///
+    /// // Your custom table source type
+    /// struct MyCustomTableSource { /* ... */ }
+    ///
+    /// // Your custom execution plan
+    /// struct MyCustomExec { /* ... */ }
+    ///
+    /// struct MyExtensionPlanner;
+    ///
+    /// #[async_trait]
+    /// impl ExtensionPlanner for MyExtensionPlanner {
+    ///     async fn plan_extension(
+    ///         &self,
+    ///         _planner: &dyn PhysicalPlanner,
+    ///         _node: &dyn UserDefinedLogicalNode,
+    ///         _logical_inputs: &[&LogicalPlan],
+    ///         _physical_inputs: &[Arc<dyn ExecutionPlan>],
+    ///         _session_state: &SessionState,
+    ///     ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+    ///         Ok(None)
+    ///     }
+    ///
+    ///     async fn plan_table_scan(
+    ///         &self,
+    ///         _planner: &dyn PhysicalPlanner,
+    ///         scan: &TableScan,
+    ///         _session_state: &SessionState,
+    ///     ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+    ///         // Check if this is your custom table source
+    ///         if scan.source.as_any().is::<MyCustomTableSource>() {
+    ///             // Create a custom execution plan for your table source
+    ///             let exec = MyCustomExec::new(
+    ///                 scan.table_name.clone(),
+    ///                 Arc::clone(scan.projected_schema.inner()),
+    ///             );
+    ///             Ok(Some(Arc::new(exec)))
+    ///         } else {
+    ///             // Return None to let other extension planners handle it
+    ///             Ok(None)
+    ///         }
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// [`TableSource`]: datafusion_expr::TableSource
+    /// [`TableProvider`]: datafusion_catalog::TableProvider
+    async fn plan_table_scan(
+        &self,
+        _planner: &dyn PhysicalPlanner,
+        _scan: &TableScan,
+        _session_state: &SessionState,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        Ok(None)
+    }
 }
 
 /// Default single node physical query planner that converts a
@@ -266,7 +352,8 @@ struct LogicalNode<'a> {
 
 impl DefaultPhysicalPlanner {
     /// Create a physical planner that uses `extension_planners` to
-    /// plan user-defined logical nodes [`LogicalPlan::Extension`].
+    /// plan user-defined logical nodes [`LogicalPlan::Extension`]
+    /// or user-defined table sources in [`LogicalPlan::TableScan`].
     /// The planner uses the first [`ExtensionPlanner`] to return a non-`None`
     /// plan.
     pub fn with_extension_planners(
@@ -275,6 +362,24 @@ impl DefaultPhysicalPlanner {
         Self { extension_planners }
     }
 
+    fn ensure_schema_matches(
+        &self,
+        logical_schema: &DFSchemaRef,
+        physical_plan: &Arc<dyn ExecutionPlan>,
+        context: &str,
+    ) -> Result<()> {
+        if !logical_schema.matches_arrow_schema(&physical_plan.schema()) {
+            return plan_err!(
+                "{} created an ExecutionPlan with mismatched schema. \
+                    LogicalPlan schema: {:?}, ExecutionPlan schema: {:?}",
+                context,
+                logical_schema,
+                physical_plan.schema()
+            );
+        }
+        Ok(())
+    }
+
     /// Create a physical plan from a logical plan
     async fn create_initial_plan(
         &self,
@@ -341,11 +446,11 @@ impl DefaultPhysicalPlanner {
             .flatten()
             .collect::<Vec<_>>();
         // Ideally this never happens if we have a valid LogicalPlan tree
-        if outputs.len() != 1 {
-            return internal_err!(
-                "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected"
-            );
-        }
+        assert_eq_or_internal_err!(
+            outputs.len(),
+            1,
+            "Failed to convert LogicalPlan to ExecutionPlan: More than one root detected"
+        );
         let plan = outputs.pop().unwrap();
         Ok(plan)
     }
@@ -443,24 +548,55 @@ impl DefaultPhysicalPlanner {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let exec_node: Arc<dyn ExecutionPlan> = match node {
             // Leaves (no children)
-            LogicalPlan::TableScan(TableScan {
-                source,
-                projection,
-                filters,
-                fetch,
-                ..
-            }) => {
-                let source = source_as_provider(source)?;
-                // Remove all qualifiers from the scan as the provider
-                // doesn't know (nor should care) how the relation was
-                // referred to in the query
-                let filters = unnormalize_cols(filters.iter().cloned());
-                source
-                    .scan(session_state, projection.as_ref(), &filters, *fetch)
-                    .await?
+            LogicalPlan::TableScan(scan) => {
+                let TableScan {
+                    source,
+                    projection,
+                    filters,
+                    fetch,
+                    projected_schema,
+                    ..
+                } = scan;
+
+                if let Ok(source) = source_as_provider(source) {
+                    // Remove all qualifiers from the scan as the provider
+                    // doesn't know (nor should care) how the relation was
+                    // referred to in the query
+                    let filters = unnormalize_cols(filters.iter().cloned());
+                    let filters_vec = filters.into_iter().collect::<Vec<_>>();
+                    let opts = ScanArgs::default()
+                        .with_projection(projection.as_deref())
+                        .with_filters(Some(&filters_vec))
+                        .with_limit(*fetch);
+                    let res = source.scan_with_args(session_state, opts).await?;
+                    Arc::clone(res.plan())
+                } else {
+                    let mut maybe_plan = None;
+                    for planner in &self.extension_planners {
+                        if maybe_plan.is_some() {
+                            break;
+                        }
+
+                        maybe_plan =
+                            planner.plan_table_scan(self, scan, session_state).await?;
+                    }
+
+                    let plan = match maybe_plan {
+                        Some(plan) => plan,
+                        None => {
+                            return plan_err!(
+                                "No installed planner was able to plan TableScan for custom TableSource: {:?}",
+                                scan.table_name
+                            );
+                        }
+                    };
+                    let context =
+                        format!("Extension planner for table scan {}", scan.table_name);
+                    self.ensure_schema_matches(projected_schema, &plan, &context)?;
+                    plan
+                }
             }
             LogicalPlan::Values(Values { values, schema }) => {
-                let exec_schema = schema.as_ref().to_owned().into();
                 let exprs = values
                     .iter()
                     .map(|row| {
@@ -471,27 +607,23 @@ impl DefaultPhysicalPlanner {
                             .collect::<Result<Vec<Arc<dyn PhysicalExpr>>>>()
                     })
                     .collect::<Result<Vec<_>>>()?;
-                MemorySourceConfig::try_new_as_values(SchemaRef::new(exec_schema), exprs)?
+                MemorySourceConfig::try_new_as_values(Arc::clone(schema.inner()), exprs)?
                     as _
             }
             LogicalPlan::EmptyRelation(EmptyRelation {
                 produce_one_row: false,
                 schema,
-            }) => Arc::new(EmptyExec::new(SchemaRef::new(
-                schema.as_ref().to_owned().into(),
-            ))),
+            }) => Arc::new(EmptyExec::new(Arc::clone(schema.inner()))),
             LogicalPlan::EmptyRelation(EmptyRelation {
                 produce_one_row: true,
                 schema,
-            }) => Arc::new(PlaceholderRowExec::new(SchemaRef::new(
-                schema.as_ref().to_owned().into(),
-            ))),
+            }) => Arc::new(PlaceholderRowExec::new(Arc::clone(schema.inner()))),
             LogicalPlan::DescribeTable(DescribeTable {
                 schema,
                 output_schema,
             }) => {
-                let output_schema: Schema = output_schema.as_ref().into();
-                self.plan_describe(Arc::clone(schema), Arc::new(output_schema))?
+                let output_schema = Arc::clone(output_schema.inner());
+                self.plan_describe(&Arc::clone(schema), output_schema)?
             }
 
             // 1 Child
@@ -501,13 +633,14 @@ impl DefaultPhysicalPlanner {
                 file_type,
                 partition_by,
                 options: source_option_tuples,
+                output_schema: _,
             }) => {
                 let original_url = output_url.clone();
                 let input_exec = children.one()?;
                 let parsed_url = ListingTableUrl::parse(output_url)?;
                 let object_store_url = parsed_url.object_store();
 
-                let schema: Schema = (**input.schema()).clone().into();
+                let schema = Arc::clone(input.schema().inner());
 
                 // Note: the DataType passed here is ignored for the purposes of writing and inferred instead
                 // from the schema of the RecordBatch being written. This allows COPY statements to specify only
@@ -519,16 +652,56 @@ impl DefaultPhysicalPlanner {
 
                 let keep_partition_by_columns = match source_option_tuples
                     .get("execution.keep_partition_by_columns")
-                    .map(|v| v.trim()) {
-                    None => session_state.config().options().execution.keep_partition_by_columns,
+                    .map(|v| v.trim())
+                {
+                    None => {
+                        session_state
+                            .config()
+                            .options()
+                            .execution
+                            .keep_partition_by_columns
+                    }
                     Some("true") => true,
                     Some("false") => false,
-                    Some(value) =>
-                        return Err(DataFusionError::Configuration(format!("provided value for 'execution.keep_partition_by_columns' was not recognized: \"{value}\""))),
+                    Some(value) => {
+                        return Err(DataFusionError::Configuration(format!(
+                            "provided value for 'execution.keep_partition_by_columns' was not recognized: \"{value}\""
+                        )));
+                    }
+                };
+
+                // Parse single_file_output option if explicitly set
+                let file_output_mode = match source_option_tuples
+                    .get("single_file_output")
+                    .map(|v| v.trim())
+                {
+                    None => FileOutputMode::Automatic,
+                    Some("true") => FileOutputMode::SingleFile,
+                    Some("false") => FileOutputMode::Directory,
+                    Some(value) => {
+                        return Err(DataFusionError::Configuration(format!(
+                            "provided value for 'single_file_output' was not recognized: \"{value}\""
+                        )));
+                    }
                 };
 
+                // Filter out sink-related options that are not format options
+                let format_options: HashMap<String, String> = source_option_tuples
+                    .iter()
+                    .filter(|(k, _)| k.as_str() != "single_file_output")
+                    .map(|(k, v)| (k.clone(), v.clone()))
+                    .collect();
+
                 let sink_format = file_type_to_format(file_type)?
-                    .create(session_state, source_option_tuples)?;
+                    .create(session_state, &format_options)?;
+
+                // Determine extension based on format extension and compression
+                let file_extension = match sink_format.compression_type() {
+                    Some(compression_type) => sink_format
+                        .get_ext_with_compression(&compression_type)
+                        .unwrap_or_else(|_| sink_format.get_ext()),
+                    None => sink_format.get_ext(),
+                };
 
                 // Set file sink related options
                 let config = FileSinkConfig {
@@ -536,15 +709,23 @@ impl DefaultPhysicalPlanner {
                     object_store_url,
                     table_paths: vec![parsed_url],
                     file_group: FileGroup::default(),
-                    output_schema: Arc::new(schema),
+                    output_schema: schema,
                     table_partition_cols,
                     insert_op: InsertOp::Append,
                     keep_partition_by_columns,
-                    file_extension: sink_format.get_ext(),
+                    file_extension,
+                    file_output_mode,
                 };
 
+                let ordering = input_exec.properties().output_ordering().cloned();
+
                 sink_format
-                    .create_writer_physical_plan(input_exec, session_state, config, None)
+                    .create_writer_physical_plan(
+                        input_exec,
+                        session_state,
+                        config,
+                        ordering.map(Into::into),
+                    )
                     .await?
             }
             LogicalPlan::Dml(DmlStatement {
@@ -566,35 +747,110 @@ impl DefaultPhysicalPlanner {
                     );
                 }
             }
-            LogicalPlan::Window(Window { window_expr, .. }) => {
-                if window_expr.is_empty() {
-                    return internal_err!("Impossibly got empty window expression");
+            LogicalPlan::Dml(DmlStatement {
+                table_name,
+                target,
+                op: WriteOp::Delete,
+                input,
+                ..
+            }) => {
+                if let Some(provider) =
+                    target.as_any().downcast_ref::<DefaultTableSource>()
+                {
+                    let filters = extract_dml_filters(input, table_name)?;
+                    provider
+                        .table_provider
+                        .delete_from(session_state, filters)
+                        .await
+                        .map_err(|e| {
+                            e.context(format!("DELETE operation on table '{table_name}'"))
+                        })?
+                } else {
+                    return exec_err!(
+                        "Table source can't be downcasted to DefaultTableSource"
+                    );
+                }
+            }
+            LogicalPlan::Dml(DmlStatement {
+                table_name,
+                target,
+                op: WriteOp::Update,
+                input,
+                ..
+            }) => {
+                if let Some(provider) =
+                    target.as_any().downcast_ref::<DefaultTableSource>()
+                {
+                    // For UPDATE, the assignments are encoded in the projection of input
+                    // We pass the filters and let the provider handle the projection
+                    let filters = extract_dml_filters(input, table_name)?;
+                    // Extract assignments from the projection in input plan
+                    let assignments = extract_update_assignments(input)?;
+                    provider
+                        .table_provider
+                        .update(session_state, assignments, filters)
+                        .await
+                        .map_err(|e| {
+                            e.context(format!("UPDATE operation on table '{table_name}'"))
+                        })?
+                } else {
+                    return exec_err!(
+                        "Table source can't be downcasted to DefaultTableSource"
+                    );
+                }
+            }
+            LogicalPlan::Dml(DmlStatement {
+                table_name,
+                target,
+                op: WriteOp::Truncate,
+                ..
+            }) => {
+                if let Some(provider) =
+                    target.as_any().downcast_ref::<DefaultTableSource>()
+                {
+                    provider
+                        .table_provider
+                        .truncate(session_state)
+                        .await
+                        .map_err(|e| {
+                            e.context(format!(
+                                "TRUNCATE operation on table '{table_name}'"
+                            ))
+                        })?
+                } else {
+                    return exec_err!(
+                        "Table source can't be downcasted to DefaultTableSource"
+                    );
                 }
+            }
+            LogicalPlan::Window(Window { window_expr, .. }) => {
+                assert_or_internal_err!(
+                    !window_expr.is_empty(),
+                    "Impossibly got empty window expression"
+                );
 
                 let input_exec = children.one()?;
 
                 let get_sort_keys = |expr: &Expr| match expr {
-                    Expr::WindowFunction(WindowFunction {
-                        params:
-                            WindowFunctionParams {
-                                ref partition_by,
-                                ref order_by,
-                                ..
-                            },
-                        ..
-                    }) => generate_sort_key(partition_by, order_by),
+                    Expr::WindowFunction(window_fun) => {
+                        let WindowFunctionParams {
+                            partition_by,
+                            order_by,
+                            ..
+                        } = &window_fun.as_ref().params;
+                        generate_sort_key(partition_by, order_by)
+                    }
                     Expr::Alias(Alias { expr, .. }) => {
                         // Convert &Box<T> to &T
                         match &**expr {
-                            Expr::WindowFunction(WindowFunction {
-                                params:
-                                    WindowFunctionParams {
-                                        ref partition_by,
-                                        ref order_by,
-                                        ..
-                                    },
-                                ..
-                            }) => generate_sort_key(partition_by, order_by),
+                            Expr::WindowFunction(window_fun) => {
+                                let WindowFunctionParams {
+                                    partition_by,
+                                    order_by,
+                                    ..
+                                } = &window_fun.as_ref().params;
+                                generate_sort_key(partition_by, order_by)
+                            }
                             _ => unreachable!(),
                         }
                     }
@@ -603,11 +859,11 @@ impl DefaultPhysicalPlanner {
                 let sort_keys = get_sort_keys(&window_expr[0])?;
                 if window_expr.len() > 1 {
                     debug_assert!(
-                            window_expr[1..]
-                                .iter()
-                                .all(|expr| get_sort_keys(expr).unwrap() == sort_keys),
-                            "all window expressions shall have the same sort keys, as guaranteed by logical planning"
-                        );
+                        window_expr[1..]
+                            .iter()
+                            .all(|expr| get_sort_keys(expr).unwrap() == sort_keys),
+                        "all window expressions shall have the same sort keys, as guaranteed by logical planning"
+                    );
                 }
 
                 let logical_schema = node.schema();
@@ -664,6 +920,17 @@ impl DefaultPhysicalPlanner {
                     )
                 {
                     let mut differences = Vec::new();
+
+                    if physical_input_schema.metadata()
+                        != physical_input_schema_from_logical.metadata()
+                    {
+                        differences.push(format!(
+                            "schema metadata differs: (physical) {:?} vs (logical) {:?}",
+                            physical_input_schema.metadata(),
+                            physical_input_schema_from_logical.metadata()
+                        ));
+                    }
+
                     if physical_input_schema.fields().len()
                         != physical_input_schema_from_logical.fields().len()
                     {
@@ -693,11 +960,20 @@ impl DefaultPhysicalPlanner {
                         if physical_field.is_nullable() && !logical_field.is_nullable() {
                             differences.push(format!("field nullability at index {} [{}]: (physical) {} vs (logical) {}", i, physical_field.name(), physical_field.is_nullable(), logical_field.is_nullable()));
                         }
+                        if physical_field.metadata() != logical_field.metadata() {
+                            differences.push(format!(
+                                "field metadata at index {} [{}]: (physical) {:?} vs (logical) {:?}",
+                                i,
+                                physical_field.name(),
+                                physical_field.metadata(),
+                                logical_field.metadata()
+                            ));
+                        }
                     }
-                    return internal_err!("Physical input schema should be the same as the one converted from logical input schema. Differences: {}", differences
-                        .iter()
-                        .map(|s| format!("\n\t- {s}"))
-                        .join(""));
+                    return internal_err!(
+                        "Physical input schema should be the same as the one converted from logical input schema. Differences: {}",
+                        differences.iter().map(|s| format!("\n\t- {s}")).join("")
+                    );
                 }
 
                 let groups = self.create_grouping_physical_expr(
@@ -719,9 +995,54 @@ impl DefaultPhysicalPlanner {
                     })
                     .collect::<Result<Vec<_>>>()?;
 
-                let (aggregates, filters, _order_bys): (Vec<_>, Vec<_>, Vec<_>) =
+                let (mut aggregates, filters, _order_bys): (Vec<_>, Vec<_>, Vec<_>) =
                     multiunzip(agg_filter);
 
+                let mut async_exprs = Vec::new();
+                let num_input_columns = physical_input_schema.fields().len();
+
+                for agg_func in &mut aggregates {
+                    match self.try_plan_async_exprs(
+                        num_input_columns,
+                        PlannedExprResult::Expr(agg_func.expressions()),
+                        physical_input_schema.as_ref(),
+                    )? {
+                        PlanAsyncExpr::Async(
+                            async_map,
+                            PlannedExprResult::Expr(physical_exprs),
+                        ) => {
+                            async_exprs.extend(async_map.async_exprs);
+
+                            if let Some(new_agg_func) = agg_func.with_new_expressions(
+                                physical_exprs,
+                                agg_func
+                                    .order_bys()
+                                    .iter()
+                                    .cloned()
+                                    .map(|x| x.expr)
+                                    .collect(),
+                            ) {
+                                *agg_func = Arc::new(new_agg_func);
+                            } else {
+                                return internal_err!("Failed to plan async expression");
+                            }
+                        }
+                        PlanAsyncExpr::Sync(PlannedExprResult::Expr(_)) => {
+                            // Do nothing
+                        }
+                        _ => {
+                            return internal_err!(
+                                "Unexpected result from try_plan_async_exprs"
+                            );
+                        }
+                    }
+                }
+                let input_exec = if !async_exprs.is_empty() {
+                    Arc::new(AsyncFuncExec::try_new(async_exprs, input_exec)?)
+                } else {
+                    input_exec
+                };
+
                 let initial_aggr = Arc::new(AggregateExec::try_new(
                     AggregateMode::Partial,
                     groups.clone(),
@@ -777,12 +1098,53 @@ impl DefaultPhysicalPlanner {
 
                 let runtime_expr =
                     self.create_physical_expr(predicate, input_dfschema, session_state)?;
+
+                let input_schema = input.schema();
+                let filter = match self.try_plan_async_exprs(
+                    input_schema.fields().len(),
+                    PlannedExprResult::Expr(vec![runtime_expr]),
+                    input_schema.as_arrow(),
+                )? {
+                    PlanAsyncExpr::Sync(PlannedExprResult::Expr(runtime_expr)) => {
+                        FilterExecBuilder::new(
+                            Arc::clone(&runtime_expr[0]),
+                            physical_input,
+                        )
+                        .with_batch_size(session_state.config().batch_size())
+                        .build()?
+                    }
+                    PlanAsyncExpr::Async(
+                        async_map,
+                        PlannedExprResult::Expr(runtime_expr),
+                    ) => {
+                        let async_exec = AsyncFuncExec::try_new(
+                            async_map.async_exprs,
+                            physical_input,
+                        )?;
+                        FilterExecBuilder::new(
+                            Arc::clone(&runtime_expr[0]),
+                            Arc::new(async_exec),
+                        )
+                        // project the output columns excluding the async functions
+                        // The async functions are always appended to the end of the schema.
+                        .apply_projection(Some(
+                            (0..input.schema().fields().len()).collect::<Vec<_>>(),
+                        ))?
+                        .with_batch_size(session_state.config().batch_size())
+                        .build()?
+                    }
+                    _ => {
+                        return internal_err!(
+                            "Unexpected result from try_plan_async_exprs"
+                        );
+                    }
+                };
+
                 let selectivity = session_state
                     .config()
                     .options()
                     .optimizer
                     .default_filter_selectivity;
-                let filter = FilterExec::try_new(runtime_expr, physical_input)?;
                 Arc::new(filter.with_default_selectivity(selectivity)?)
             }
             LogicalPlan::Repartition(Repartition {
@@ -824,13 +1186,17 @@ impl DefaultPhysicalPlanner {
             }) => {
                 let physical_input = children.one()?;
                 let input_dfschema = input.as_ref().schema();
-                let sort_expr = create_physical_sort_exprs(
+                let sort_exprs = create_physical_sort_exprs(
                     expr,
                     input_dfschema,
                     session_state.execution_props(),
                 )?;
-                let new_sort =
-                    SortExec::new(sort_expr, physical_input).with_fetch(*fetch);
+                let Some(ordering) = LexOrdering::new(sort_exprs) else {
+                    return internal_err!(
+                        "SortExec requires at least one sort expression"
+                    );
+                };
+                let new_sort = SortExec::new(ordering, physical_input).with_fetch(*fetch);
                 Arc::new(new_sort)
             }
             LogicalPlan::Subquery(_) => todo!(),
@@ -873,7 +1239,7 @@ impl DefaultPhysicalPlanner {
                 ..
             }) => {
                 let input = children.one()?;
-                let schema = SchemaRef::new(schema.as_ref().to_owned().into());
+                let schema = Arc::clone(schema.inner());
                 let list_column_indices = list_type_columns
                     .iter()
                     .map(|(index, unnesting)| ListUnnest {
@@ -887,22 +1253,21 @@ impl DefaultPhysicalPlanner {
                     struct_type_columns.clone(),
                     schema,
                     options.clone(),
-                ))
+                )?)
             }
 
             // 2 Children
             LogicalPlan::Join(Join {
-                left,
-                right,
+                left: original_left,
+                right: original_right,
                 on: keys,
                 filter,
                 join_type,
-                null_equals_null,
+                null_equality,
+                null_aware,
                 schema: join_schema,
                 ..
             }) => {
-                let null_equals_null = *null_equals_null;
-
                 let [physical_left, physical_right] = children.two()?;
 
                 // If join has expression equijoin keys, add physical projection.
@@ -918,23 +1283,25 @@ impl DefaultPhysicalPlanner {
                     let (left, left_col_keys, left_projected) =
                         wrap_projection_for_join_if_necessary(
                             &left_keys,
-                            left.as_ref().clone(),
+                            original_left.as_ref().clone(),
                         )?;
                     let (right, right_col_keys, right_projected) =
                         wrap_projection_for_join_if_necessary(
                             &right_keys,
-                            right.as_ref().clone(),
+                            original_right.as_ref().clone(),
                         )?;
                     let column_on = (left_col_keys, right_col_keys);
 
                     let left = Arc::new(left);
                     let right = Arc::new(right);
-                    let new_join = LogicalPlan::Join(Join::try_new_with_project_input(
+                    let (new_join, requalified) = Join::try_new_with_project_input(
                         node,
                         Arc::clone(&left),
                         Arc::clone(&right),
                         column_on,
-                    )?);
+                    )?;
+
+                    let new_join = LogicalPlan::Join(new_join);
 
                     // If inputs were projected then create ExecutionPlan for these new
                     // LogicalPlan nodes.
@@ -967,8 +1334,24 @@ impl DefaultPhysicalPlanner {
 
                     // Remove temporary projected columns
                     if left_projected || right_projected {
-                        let final_join_result =
-                            join_schema.iter().map(Expr::from).collect::<Vec<_>>();
+                        // Re-qualify the join schema only if the inputs were previously requalified in
+                        // `try_new_with_project_input`. This ensures that when building the Projection
+                        // it can correctly resolve field nullability and data types
+                        // by disambiguating fields from the left and right sides of the join.
+                        let qualified_join_schema = if requalified {
+                            Arc::new(qualify_join_schema_sides(
+                                join_schema,
+                                original_left,
+                                original_right,
+                            )?)
+                        } else {
+                            Arc::clone(join_schema)
+                        };
+
+                        let final_join_result = qualified_join_schema
+                            .iter()
+                            .map(Expr::from)
+                            .collect::<Vec<_>>();
                         let projection = LogicalPlan::Projection(Projection::try_new(
                             final_join_result,
                             Arc::new(new_join),
@@ -1017,8 +1400,42 @@ impl DefaultPhysicalPlanner {
                     })
                     .collect::<Result<join_utils::JoinOn>>()?;
 
+                // TODO: `num_range_filters` can be used later on for ASOF joins (`num_range_filters > 1`)
+                let mut num_range_filters = 0;
+                let mut range_filters: Vec<Expr> = Vec::new();
+                let mut total_filters = 0;
+
                 let join_filter = match filter {
                     Some(expr) => {
+                        let split_expr = split_conjunction(expr);
+                        for expr in split_expr.iter() {
+                            match *expr {
+                                Expr::BinaryExpr(BinaryExpr {
+                                    left: _,
+                                    right: _,
+                                    op,
+                                }) => {
+                                    if matches!(
+                                        op,
+                                        Operator::Lt
+                                            | Operator::LtEq
+                                            | Operator::Gt
+                                            | Operator::GtEq
+                                    ) {
+                                        range_filters.push((**expr).clone());
+                                        num_range_filters += 1;
+                                    }
+                                    total_filters += 1;
+                                }
+                                // TODO: Want to deal with `Expr::Between` for IEJoins, it counts as two range predicates
+                                // which is why it is not dealt with in PWMJ
+                                // Expr::Between(_) => {},
+                                _ => {
+                                    total_filters += 1;
+                                }
+                            }
+                        }
+
                         // Extract columns from filter expression and saved in a HashSet
                         let cols = expr.column_refs();
 
@@ -1055,7 +1472,7 @@ impl DefaultPhysicalPlanner {
                         let filter_df_fields = filter_df_fields
                             .into_iter()
                             .map(|(qualifier, field)| {
-                                (qualifier.cloned(), Arc::new(field.clone()))
+                                (qualifier.cloned(), Arc::clone(field))
                             })
                             .collect();
 
@@ -1074,6 +1491,7 @@ impl DefaultPhysicalPlanner {
                         )?;
                         let filter_schema =
                             Schema::new_with_metadata(filter_fields, metadata);
+
                         let filter_expr = create_physical_expr(
                             expr,
                             &filter_df_schema,
@@ -1096,10 +1514,123 @@ impl DefaultPhysicalPlanner {
                 let prefer_hash_join =
                     session_state.config_options().optimizer.prefer_hash_join;
 
+                // TODO: Allow PWMJ to deal with residual equijoin conditions
                 let join: Arc<dyn ExecutionPlan> = if join_on.is_empty() {
-                    if join_filter.is_none() && matches!(join_type, JoinType::Inner) {
+                    if join_filter.is_none() && *join_type == JoinType::Inner {
                         // cross join if there is no join conditions and no join filter set
                         Arc::new(CrossJoinExec::new(physical_left, physical_right))
+                    } else if num_range_filters == 1
+                        && total_filters == 1
+                        && !matches!(
+                            join_type,
+                            JoinType::LeftSemi
+                                | JoinType::RightSemi
+                                | JoinType::LeftAnti
+                                | JoinType::RightAnti
+                                | JoinType::LeftMark
+                                | JoinType::RightMark
+                        )
+                        && session_state
+                            .config_options()
+                            .optimizer
+                            .enable_piecewise_merge_join
+                    {
+                        let Expr::BinaryExpr(be) = &range_filters[0] else {
+                            return plan_err!(
+                                "Unsupported expression for PWMJ: Expected `Expr::BinaryExpr`"
+                            );
+                        };
+
+                        let mut op = be.op;
+                        if !matches!(
+                            op,
+                            Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq
+                        ) {
+                            return plan_err!(
+                                "Unsupported operator for PWMJ: {:?}. Expected one of <, <=, >, >=",
+                                op
+                            );
+                        }
+
+                        fn reverse_ineq(op: Operator) -> Operator {
+                            match op {
+                                Operator::Lt => Operator::Gt,
+                                Operator::LtEq => Operator::GtEq,
+                                Operator::Gt => Operator::Lt,
+                                Operator::GtEq => Operator::LtEq,
+                                _ => op,
+                            }
+                        }
+
+                        #[derive(Clone, Copy, Debug, PartialEq, Eq)]
+                        enum Side {
+                            Left,
+                            Right,
+                            Both,
+                        }
+
+                        let side_of = |e: &Expr| -> Result<Side> {
+                            let cols = e.column_refs();
+                            let any_left = cols
+                                .iter()
+                                .any(|c| left_df_schema.index_of_column(c).is_ok());
+                            let any_right = cols
+                                .iter()
+                                .any(|c| right_df_schema.index_of_column(c).is_ok());
+
+                            Ok(match (any_left, any_right) {
+                                (true, false) => Side::Left,
+                                (false, true) => Side::Right,
+                                (true, true) => Side::Both,
+                                _ => unreachable!(),
+                            })
+                        };
+
+                        let mut lhs_logical = &be.left;
+                        let mut rhs_logical = &be.right;
+
+                        let left_side = side_of(lhs_logical)?;
+                        let right_side = side_of(rhs_logical)?;
+                        if left_side == Side::Both || right_side == Side::Both {
+                            return Ok(Arc::new(NestedLoopJoinExec::try_new(
+                                physical_left,
+                                physical_right,
+                                join_filter,
+                                join_type,
+                                None,
+                            )?));
+                        }
+
+                        if left_side == Side::Right && right_side == Side::Left {
+                            std::mem::swap(&mut lhs_logical, &mut rhs_logical);
+                            op = reverse_ineq(op);
+                        } else if !(left_side == Side::Left && right_side == Side::Right)
+                        {
+                            return plan_err!(
+                                "Unsupported operator for PWMJ: {:?}. Expected one of <, <=, >, >=",
+                                op
+                            );
+                        }
+
+                        let on_left = create_physical_expr(
+                            lhs_logical,
+                            left_df_schema,
+                            session_state.execution_props(),
+                        )?;
+                        let on_right = create_physical_expr(
+                            rhs_logical,
+                            right_df_schema,
+                            session_state.execution_props(),
+                        )?;
+
+                        Arc::new(PiecewiseMergeJoinExec::try_new(
+                            physical_left,
+                            physical_right,
+                            (on_left, on_right),
+                            op,
+                            *join_type,
+                            session_state.config().target_partitions(),
+                        )?)
                     } else {
                         // there is no equal join condition, use the nested loop join
                         Arc::new(NestedLoopJoinExec::try_new(
@@ -1123,11 +1654,13 @@ impl DefaultPhysicalPlanner {
                         join_filter,
                         *join_type,
                         vec![SortOptions::default(); join_on_len],
-                        null_equals_null,
+                        *null_equality,
                     )?)
                 } else if session_state.config().target_partitions() > 1
                     && session_state.config().repartition_joins()
                     && prefer_hash_join
+                    && !*null_aware
+                // Null-aware joins must use CollectLeft
                 {
                     Arc::new(HashJoinExec::try_new(
                         physical_left,
@@ -1137,7 +1670,8 @@ impl DefaultPhysicalPlanner {
                         join_type,
                         None,
                         PartitionMode::Auto,
-                        null_equals_null,
+                        *null_equality,
+                        *null_aware,
                     )?)
                 } else {
                     Arc::new(HashJoinExec::try_new(
@@ -1148,7 +1682,8 @@ impl DefaultPhysicalPlanner {
                         join_type,
                         None,
                         PartitionMode::CollectLeft,
-                        null_equals_null,
+                        *null_equality,
+                        *null_aware,
                     )?)
                 };
 
@@ -1173,7 +1708,7 @@ impl DefaultPhysicalPlanner {
             }
 
             // N Children
-            LogicalPlan::Union(_) => Arc::new(UnionExec::new(children.vec())),
+            LogicalPlan::Union(_) => UnionExec::try_new(children.vec())?,
             LogicalPlan::Extension(Extension { node }) => {
                 let mut maybe_plan = None;
                 let children = children.vec();
@@ -1195,22 +1730,16 @@ impl DefaultPhysicalPlanner {
                 }
 
                 let plan = match maybe_plan {
-                        Some(v) => Ok(v),
-                        _ => plan_err!("No installed planner was able to convert the custom node to an execution plan: {:?}", node)
-                    }?;
-
-                // Ensure the ExecutionPlan's schema matches the
-                // declared logical schema to catch and warn about
-                // logic errors when creating user defined plans.
-                if !node.schema().matches_arrow_schema(&plan.schema()) {
-                    return plan_err!(
-                            "Extension planner for {:?} created an ExecutionPlan with mismatched schema. \
-                            LogicalPlan schema: {:?}, ExecutionPlan schema: {:?}",
-                            node, node.schema(), plan.schema()
-                        );
-                } else {
-                    plan
-                }
+                    Some(v) => Ok(v),
+                    _ => plan_err!(
+                        "No installed planner was able to convert the custom node to an execution plan: {:?}",
+                        node
+                    ),
+                }?;
+
+                let context = format!("Extension planner for {node:?}");
+                self.ensure_schema_matches(node.schema(), &plan, &context)?;
+                plan
             }
 
             // Other
@@ -1234,17 +1763,17 @@ impl DefaultPhysicalPlanner {
             LogicalPlan::Explain(_) => {
                 return internal_err!(
                     "Unsupported logical plan: Explain must be root of the plan"
-                )
+                );
             }
             LogicalPlan::Distinct(_) => {
                 return internal_err!(
                     "Unsupported logical plan: Distinct should be replaced to Aggregate"
-                )
+                );
             }
             LogicalPlan::Analyze(_) => {
                 return internal_err!(
                     "Unsupported logical plan: Analyze must be root of the plan"
-                )
+                );
             }
         };
         Ok(exec_node)
@@ -1286,6 +1815,10 @@ impl DefaultPhysicalPlanner {
                     physical_name(expr),
                 ))?])),
             }
+        } else if group_expr.is_empty() {
+            // No GROUP BY clause - create empty PhysicalGroupBy
+            // no expressions, no null expressions and no grouping expressions
+            Ok(PhysicalGroupBy::new(vec![], vec![], vec![], false))
         } else {
             Ok(PhysicalGroupBy::new_single(
                 group_expr
@@ -1357,6 +1890,7 @@ fn merge_grouping_set_physical_expr(
         grouping_set_expr,
         null_exprs,
         merged_sets,
+        true,
     ))
 }
 
@@ -1399,7 +1933,7 @@ fn create_cube_physical_expr(
         }
     }
 
-    Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups))
+    Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups, true))
 }
 
 /// Expand and align a ROLLUP expression. This is a special case of GROUPING SETS
@@ -1444,7 +1978,7 @@ fn create_rollup_physical_expr(
         groups.push(group)
     }
 
-    Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups))
+    Ok(PhysicalGroupBy::new(all_exprs, null_exprs, groups, true))
 }
 
 /// For a given logical expr, get a properly typed NULL ScalarValue physical expression
@@ -1465,6 +1999,62 @@ fn get_null_physical_expr_pair(
     Ok((Arc::new(null_value), physical_name))
 }
 
+/// Qualifies the fields in a join schema with "left" and "right" qualifiers
+/// without mutating the original schema. This function should only be used when
+/// the join inputs have already been requalified earlier in `try_new_with_project_input`.
+///
+/// The purpose is to avoid ambiguity errors later in planning (e.g., in nullability or data type resolution)
+/// when converting expressions to fields.
+fn qualify_join_schema_sides(
+    join_schema: &DFSchema,
+    left: &LogicalPlan,
+    right: &LogicalPlan,
+) -> Result<DFSchema> {
+    let left_fields = left.schema().fields();
+    let right_fields = right.schema().fields();
+    let join_fields = join_schema.fields();
+
+    // Validate lengths
+    assert_eq_or_internal_err!(
+        join_fields.len(),
+        left_fields.len() + right_fields.len(),
+        "Join schema field count must match left and right field count."
+    );
+
+    // Validate field names match
+    for (i, (field, expected)) in join_fields
+        .iter()
+        .zip(left_fields.iter().chain(right_fields.iter()))
+        .enumerate()
+    {
+        assert_eq_or_internal_err!(
+            field.name(),
+            expected.name(),
+            "Field name mismatch at index {}",
+            i
+        );
+    }
+
+    // qualify sides
+    let qualifiers = join_fields
+        .iter()
+        .enumerate()
+        .map(|(i, _)| {
+            if i < left_fields.len() {
+                Some(TableReference::Bare {
+                    table: Arc::from("left"),
+                })
+            } else {
+                Some(TableReference::Bare {
+                    table: Arc::from("right"),
+                })
+            }
+        })
+        .collect();
+
+    join_schema.with_field_specific_qualified_schema(qualifiers)
+}
+
 fn get_physical_expr_pair(
     expr: &Expr,
     input_dfschema: &DFSchema,
@@ -1476,47 +2066,276 @@ fn get_physical_expr_pair(
     Ok((physical_expr, physical_name))
 }
 
-/// Check if window bounds are valid after schema information is available, and
-/// window_frame bounds are casted to the corresponding column type.
-/// queries like:
-/// OVER (ORDER BY a RANGES BETWEEN 3 PRECEDING AND 5 PRECEDING)
-/// OVER (ORDER BY a RANGES BETWEEN INTERVAL '3 DAY' PRECEDING AND '5 DAY' PRECEDING)  are rejected
-pub fn is_window_frame_bound_valid(window_frame: &WindowFrame) -> bool {
-    match (&window_frame.start_bound, &window_frame.end_bound) {
-        (WindowFrameBound::Following(_), WindowFrameBound::Preceding(_))
-        | (WindowFrameBound::Following(_), WindowFrameBound::CurrentRow)
-        | (WindowFrameBound::CurrentRow, WindowFrameBound::Preceding(_)) => false,
-        (WindowFrameBound::Preceding(lhs), WindowFrameBound::Preceding(rhs)) => {
-            !rhs.is_null() && (lhs.is_null() || (lhs >= rhs))
+/// Extract filter predicates from a DML input plan (DELETE/UPDATE).
+///
+/// Walks the logical plan tree and collects Filter predicates and any filters
+/// pushed down into TableScan nodes, splitting AND conjunctions into individual expressions.
+///
+/// For UPDATE...FROM queries involving multiple tables, this function only extracts predicates
+/// that reference the target table. Filters from source table scans are excluded to prevent
+/// incorrect filter semantics.
+///
+/// Column qualifiers are stripped so expressions can be evaluated against the TableProvider's
+/// schema. Deduplication is performed because filters may appear in both Filter nodes and
+/// TableScan.filters when the optimizer performs partial (Inexact) filter pushdown.
+///
+/// # Parameters
+/// - `input`: The logical plan tree to extract filters from (typically a DELETE or UPDATE plan)
+/// - `target`: The target table reference to scope filter extraction (prevents multi-table filter leakage)
+///
+/// # Returns
+/// A vector of unqualified filter expressions that can be passed to the TableProvider for execution.
+/// Returns an empty vector if no applicable filters are found.
+///
+fn extract_dml_filters(
+    input: &Arc<LogicalPlan>,
+    target: &TableReference,
+) -> Result<Vec<Expr>> {
+    let mut filters = Vec::new();
+    let mut allowed_refs = vec![target.clone()];
+
+    // First pass: collect any alias references to the target table
+    input.apply(|node| {
+        if let LogicalPlan::SubqueryAlias(alias) = node
+            // Check if this alias points to the target table
+            && let LogicalPlan::TableScan(scan) = alias.input.as_ref()
+            && scan.table_name.resolved_eq(target)
+        {
+            allowed_refs.push(TableReference::bare(alias.alias.to_string()));
         }
-        (WindowFrameBound::Following(lhs), WindowFrameBound::Following(rhs)) => {
-            !lhs.is_null() && (rhs.is_null() || (lhs <= rhs))
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+
+    input.apply(|node| {
+        match node {
+            LogicalPlan::Filter(filter) => {
+                // Split AND predicates into individual expressions
+                for predicate in split_conjunction(&filter.predicate) {
+                    if predicate_is_on_target_multi(predicate, &allowed_refs)? {
+                        filters.push(predicate.clone());
+                    }
+                }
+            }
+            LogicalPlan::TableScan(TableScan {
+                table_name,
+                filters: scan_filters,
+                ..
+            }) => {
+                // Only extract filters from the target table scan.
+                // This prevents incorrect filter extraction in UPDATE...FROM scenarios
+                // where multiple table scans may have filters.
+                if table_name.resolved_eq(target) {
+                    for filter in scan_filters {
+                        filters.extend(split_conjunction(filter).into_iter().cloned());
+                    }
+                }
+            }
+            // Plans without filter information
+            LogicalPlan::EmptyRelation(_)
+            | LogicalPlan::Values(_)
+            | LogicalPlan::DescribeTable(_)
+            | LogicalPlan::Explain(_)
+            | LogicalPlan::Analyze(_)
+            | LogicalPlan::Distinct(_)
+            | LogicalPlan::Extension(_)
+            | LogicalPlan::Statement(_)
+            | LogicalPlan::Dml(_)
+            | LogicalPlan::Ddl(_)
+            | LogicalPlan::Copy(_)
+            | LogicalPlan::Unnest(_)
+            | LogicalPlan::RecursiveQuery(_) => {
+                // No filters to extract from leaf/meta plans
+            }
+            // Plans with inputs (may contain filters in children)
+            LogicalPlan::Projection(_)
+            | LogicalPlan::SubqueryAlias(_)
+            | LogicalPlan::Limit(_)
+            | LogicalPlan::Sort(_)
+            | LogicalPlan::Union(_)
+            | LogicalPlan::Join(_)
+            | LogicalPlan::Repartition(_)
+            | LogicalPlan::Aggregate(_)
+            | LogicalPlan::Window(_)
+            | LogicalPlan::Subquery(_) => {
+                // Filter information may appear in child nodes; continue traversal
+                // to extract filters from Filter/TableScan nodes deeper in the plan
+            }
         }
-        _ => true,
-    }
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+
+    // Strip qualifiers and deduplicate. This ensures:
+    // 1. Only target-table predicates are retained from Filter nodes
+    // 2. Qualifiers stripped for TableProvider compatibility
+    // 3. Duplicates removed (from Filter nodes + TableScan.filters)
+    //
+    // Deduplication is necessary because filters may appear in both Filter nodes
+    // and TableScan.filters when the optimizer performs partial (Inexact) pushdown.
+    let mut seen_filters = HashSet::new();
+    filters
+        .into_iter()
+        .try_fold(Vec::new(), |mut deduped, filter| {
+            let unqualified = strip_column_qualifiers(filter).map_err(|e| {
+                e.context(format!(
+                    "Failed to strip column qualifiers for DML filter on table '{target}'"
+                ))
+            })?;
+            if seen_filters.insert(unqualified.clone()) {
+                deduped.push(unqualified);
+            }
+            Ok(deduped)
+        })
 }
 
-/// Create a window expression with a name from a logical expression
-pub fn create_window_expr_with_name(
-    e: &Expr,
-    name: impl Into<String>,
-    logical_schema: &DFSchema,
+/// Determine whether a predicate references only columns from the target table
+/// or its aliases.
+///
+/// Columns may be qualified with the target table name or any of its aliases.
+/// Unqualified columns are also accepted as they implicitly belong to the target table.
+fn predicate_is_on_target_multi(
+    expr: &Expr,
+    allowed_refs: &[TableReference],
+) -> Result<bool> {
+    let mut columns = HashSet::new();
+    expr_to_columns(expr, &mut columns)?;
+
+    // Short-circuit on first mismatch: returns false if any column references a table not in allowed_refs.
+    // Columns are accepted if:
+    // 1. They are unqualified (no relation specified), OR
+    // 2. Their relation matches one of the allowed table references using resolved equality
+    Ok(!columns.iter().any(|column| {
+        column.relation.as_ref().is_some_and(|relation| {
+            !allowed_refs
+                .iter()
+                .any(|allowed| relation.resolved_eq(allowed))
+        })
+    }))
+}
+
+/// Strip table qualifiers from column references in an expression.
+/// This is needed because DML filter expressions contain qualified column names
+/// (e.g., "table.column") but the TableProvider's schema only has simple names.
+fn strip_column_qualifiers(expr: Expr) -> Result<Expr> {
+    expr.transform(|e| {
+        if let Expr::Column(col) = &e
+            && col.relation.is_some()
+        {
+            // Strip the qualifier
+            return Ok(Transformed::yes(Expr::Column(Column::new_unqualified(
+                col.name.clone(),
+            ))));
+        }
+        Ok(Transformed::no(e))
+    })
+    .map(|t| t.data)
+}
+
+/// Extract column assignments from an UPDATE input plan.
+/// For UPDATE statements, the SQL planner encodes assignments as a projection
+/// over the source table. This function extracts column name and expression pairs
+/// from the projection. Column qualifiers are stripped from the expressions.
+///
+fn extract_update_assignments(input: &Arc<LogicalPlan>) -> Result<Vec<(String, Expr)>> {
+    // The UPDATE input plan structure is:
+    // Projection(updated columns as expressions with aliases)
+    //   Filter(optional WHERE clause)
+    //     TableScan
+    //
+    // Each projected expression has an alias matching the column name
+    let mut assignments = Vec::new();
+
+    // Find the top-level projection
+    if let LogicalPlan::Projection(projection) = input.as_ref() {
+        for expr in &projection.expr {
+            if let Expr::Alias(alias) = expr {
+                // The alias name is the column name being updated
+                // The inner expression is the new value
+                let column_name = alias.name.clone();
+                // Only include if it's not just a column reference to itself
+                // (those are columns that aren't being updated)
+                if !is_identity_assignment(&alias.expr, &column_name) {
+                    // Strip qualifiers from the assignment expression
+                    let stripped_expr = strip_column_qualifiers((*alias.expr).clone())?;
+                    assignments.push((column_name, stripped_expr));
+                }
+            }
+        }
+    } else {
+        // Try to find projection deeper in the plan
+        input.apply(|node| {
+            if let LogicalPlan::Projection(projection) = node {
+                for expr in &projection.expr {
+                    if let Expr::Alias(alias) = expr {
+                        let column_name = alias.name.clone();
+                        if !is_identity_assignment(&alias.expr, &column_name) {
+                            let stripped_expr =
+                                strip_column_qualifiers((*alias.expr).clone())?;
+                            assignments.push((column_name, stripped_expr));
+                        }
+                    }
+                }
+                return Ok(TreeNodeRecursion::Stop);
+            }
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+    }
+
+    Ok(assignments)
+}
+
+/// Check if an assignment is an identity assignment (column = column)
+/// These are columns that are not being modified in the UPDATE
+fn is_identity_assignment(expr: &Expr, column_name: &str) -> bool {
+    match expr {
+        Expr::Column(col) => col.name == column_name,
+        _ => false,
+    }
+}
+
+/// Check if window bounds are valid after schema information is available, and
+/// window_frame bounds are casted to the corresponding column type.
+/// queries like:
+/// OVER (ORDER BY a RANGES BETWEEN 3 PRECEDING AND 5 PRECEDING)
+/// OVER (ORDER BY a RANGES BETWEEN INTERVAL '3 DAY' PRECEDING AND '5 DAY' PRECEDING)  are rejected
+pub fn is_window_frame_bound_valid(window_frame: &WindowFrame) -> bool {
+    match (&window_frame.start_bound, &window_frame.end_bound) {
+        (WindowFrameBound::Following(_), WindowFrameBound::Preceding(_))
+        | (WindowFrameBound::Following(_), WindowFrameBound::CurrentRow)
+        | (WindowFrameBound::CurrentRow, WindowFrameBound::Preceding(_)) => false,
+        (WindowFrameBound::Preceding(lhs), WindowFrameBound::Preceding(rhs)) => {
+            !rhs.is_null() && (lhs.is_null() || (lhs >= rhs))
+        }
+        (WindowFrameBound::Following(lhs), WindowFrameBound::Following(rhs)) => {
+            !lhs.is_null() && (rhs.is_null() || (lhs <= rhs))
+        }
+        _ => true,
+    }
+}
+
+/// Create a window expression with a name from a logical expression
+pub fn create_window_expr_with_name(
+    e: &Expr,
+    name: impl Into<String>,
+    logical_schema: &DFSchema,
     execution_props: &ExecutionProps,
 ) -> Result<Arc<dyn WindowExpr>> {
     let name = name.into();
-    let physical_schema: &Schema = &logical_schema.into();
+    let physical_schema = Arc::clone(logical_schema.inner());
     match e {
-        Expr::WindowFunction(WindowFunction {
-            fun,
-            params:
-                WindowFunctionParams {
-                    args,
-                    partition_by,
-                    order_by,
-                    window_frame,
-                    null_treatment,
-                },
-        }) => {
+        Expr::WindowFunction(window_fun) => {
+            let WindowFunction {
+                fun,
+                params:
+                    WindowFunctionParams {
+                        args,
+                        partition_by,
+                        order_by,
+                        window_frame,
+                        null_treatment,
+                        distinct,
+                        filter,
+                    },
+            } = window_fun.as_ref();
             let physical_args =
                 create_physical_exprs(args, logical_schema, execution_props)?;
             let partition_by =
@@ -1526,23 +2345,31 @@ pub fn create_window_expr_with_name(
 
             if !is_window_frame_bound_valid(window_frame) {
                 return plan_err!(
-                        "Invalid window frame: start bound ({}) cannot be larger than end bound ({})",
-                        window_frame.start_bound, window_frame.end_bound
-                    );
+                    "Invalid window frame: start bound ({}) cannot be larger than end bound ({})",
+                    window_frame.start_bound,
+                    window_frame.end_bound
+                );
             }
 
             let window_frame = Arc::new(window_frame.clone());
             let ignore_nulls = null_treatment.unwrap_or(NullTreatment::RespectNulls)
                 == NullTreatment::IgnoreNulls;
+            let physical_filter = filter
+                .as_ref()
+                .map(|f| create_physical_expr(f, logical_schema, execution_props))
+                .transpose()?;
+
             windows::create_window_expr(
                 fun,
                 name,
                 &physical_args,
                 &partition_by,
-                order_by.as_ref(),
+                &order_by,
                 window_frame,
                 physical_schema,
                 ignore_nulls,
+                *distinct,
+                physical_filter,
             )
         }
         other => plan_err!("Invalid window expression '{other:?}'"),
@@ -1567,8 +2394,8 @@ type AggregateExprWithOptionalArgs = (
     Arc<AggregateFunctionExpr>,
     // The filter clause, if any
     Option<Arc<dyn PhysicalExpr>>,
-    // Ordering requirements, if any
-    Option<LexOrdering>,
+    // Expressions in the ORDER BY clause
+    Vec<PhysicalSortExpr>,
 );
 
 /// Create an aggregate expression with a name from a logical expression
@@ -1612,22 +2439,16 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter(
             let ignore_nulls = null_treatment.unwrap_or(NullTreatment::RespectNulls)
                 == NullTreatment::IgnoreNulls;
 
-            let (agg_expr, filter, order_by) = {
-                let physical_sort_exprs = match order_by {
-                    Some(exprs) => Some(create_physical_sort_exprs(
-                        exprs,
-                        logical_input_schema,
-                        execution_props,
-                    )?),
-                    None => None,
-                };
-
-                let ordering_reqs: LexOrdering =
-                    physical_sort_exprs.clone().unwrap_or_default();
+            let (agg_expr, filter, order_bys) = {
+                let order_bys = create_physical_sort_exprs(
+                    order_by,
+                    logical_input_schema,
+                    execution_props,
+                )?;
 
                 let agg_expr =
                     AggregateExprBuilder::new(func.to_owned(), physical_args.to_vec())
-                        .order_by(ordering_reqs)
+                        .order_by(order_bys.clone())
                         .schema(Arc::new(physical_input_schema.to_owned()))
                         .alias(name)
                         .human_display(human_displan)
@@ -1636,10 +2457,10 @@ pub fn create_aggregate_expr_with_name_and_maybe_filter(
                         .build()
                         .map(Arc::new)?;
 
-                (agg_expr, filter, physical_sort_exprs)
+                (agg_expr, filter, order_bys)
             };
 
-            Ok((agg_expr, filter, order_by))
+            Ok((agg_expr, filter, order_bys))
         }
         other => internal_err!("Invalid aggregate expression '{other:?}'"),
     }
@@ -1652,21 +2473,24 @@ pub fn create_aggregate_expr_and_maybe_filter(
     physical_input_schema: &Schema,
     execution_props: &ExecutionProps,
 ) -> Result<AggregateExprWithOptionalArgs> {
-    // unpack (nested) aliased logical expressions, e.g. "sum(col) as total"
+    // Unpack (potentially nested) aliased logical expressions, e.g. "sum(col) as total"
+    // Some functions like `count_all()` create internal aliases,
+    // Unwrap all alias layers to get to the underlying aggregate function
     let (name, human_display, e) = match e {
-        Expr::Alias(Alias { expr, name, .. }) => {
-            (Some(name.clone()), String::default(), expr.as_ref())
+        Expr::Alias(Alias { name, .. }) => {
+            let unaliased = e.clone().unalias_nested().data;
+            (Some(name.clone()), e.human_display().to_string(), unaliased)
         }
         Expr::AggregateFunction(_) => (
             Some(e.schema_name().to_string()),
             e.human_display().to_string(),
-            e,
+            e.clone(),
         ),
-        _ => (None, String::default(), e),
+        _ => (None, String::default(), e.clone()),
     };
 
     create_aggregate_expr_with_name_and_maybe_filter(
-        e,
+        &e,
         name,
         human_display,
         logical_input_schema,
@@ -1675,14 +2499,6 @@ pub fn create_aggregate_expr_and_maybe_filter(
     )
 }
 
-#[deprecated(
-    since = "47.0.0",
-    note = "use datafusion::{create_physical_sort_expr, create_physical_sort_exprs}"
-)]
-pub use datafusion_physical_expr::{
-    create_physical_sort_expr, create_physical_sort_exprs,
-};
-
 impl DefaultPhysicalPlanner {
     /// Handles capturing the various plans for EXPLAIN queries
     ///
@@ -1739,6 +2555,7 @@ impl DefaultPhysicalPlanner {
                 stringified_plans.push(StringifiedPlan::new(
                     FinalPhysicalPlan,
                     displayable(optimized_plan.as_ref())
+                        .set_tree_maximum_render_width(config.tree_maximum_render_width)
                         .tree_render()
                         .to_string(),
                 ));
@@ -1896,11 +2713,17 @@ impl DefaultPhysicalPlanner {
         session_state: &SessionState,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input = self.create_physical_plan(&a.input, session_state).await?;
-        let schema = SchemaRef::new((*a.schema).clone().into());
+        let schema = Arc::clone(a.schema.inner());
         let show_statistics = session_state.config_options().explain.show_statistics;
+        let analyze_level = session_state.config_options().explain.analyze_level;
+        let metric_types = match analyze_level {
+            ExplainAnalyzeLevel::Summary => vec![MetricType::SUMMARY],
+            ExplainAnalyzeLevel::Dev => vec![MetricType::SUMMARY, MetricType::DEV],
+        };
         Ok(Arc::new(AnalyzeExec::new(
             a.verbose,
             show_statistics,
+            metric_types,
             input,
             schema,
         )))
@@ -1908,6 +2731,7 @@ impl DefaultPhysicalPlanner {
 
     /// Optimize a physical plan by applying each physical optimizer,
     /// calling observer(plan, optimizer after each one)
+    #[expect(clippy::needless_pass_by_value)]
     pub fn optimize_physical_plan<F>(
         &self,
         plan: Arc<dyn ExecutionPlan>,
@@ -1922,7 +2746,7 @@ impl DefaultPhysicalPlanner {
             "Input physical plan:\n{}\n",
             displayable(plan.as_ref()).indent(false)
         );
-        trace!(
+        debug!(
             "Detailed input physical plan:\n{}",
             displayable(plan.as_ref()).indent(true)
         );
@@ -1942,9 +2766,9 @@ impl DefaultPhysicalPlanner {
 
             // This only checks the schema in release build, and performs additional checks in debug mode.
             OptimizationInvariantChecker::new(optimizer)
-                .check(&new_plan, before_schema)?;
+                .check(&new_plan, &before_schema)?;
 
-            trace!(
+            debug!(
                 "Optimized physical plan by {}:\n{}\n",
                 optimizer.name(),
                 displayable(new_plan.as_ref()).indent(false)
@@ -1960,14 +2784,22 @@ impl DefaultPhysicalPlanner {
             "Optimized physical plan:\n{}\n",
             displayable(new_plan.as_ref()).indent(false)
         );
-        trace!("Detailed optimized physical plan:\n{new_plan:?}");
+
+        // Don't print new_plan directly, as that may overflow the stack.
+        // For example:
+        // thread 'tokio-runtime-worker' has overflowed its stack
+        // fatal runtime error: stack overflow, aborting
+        debug!(
+            "Detailed optimized physical plan:\n{}\n",
+            displayable(new_plan.as_ref()).indent(true)
+        );
         Ok(new_plan)
     }
 
     // return an record_batch which describes a table's schema.
     fn plan_describe(
         &self,
-        table_schema: Arc<Schema>,
+        table_schema: &Arc<Schema>,
         output_schema: Arc<Schema>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let mut column_names = StringBuilder::new();
@@ -1978,7 +2810,7 @@ impl DefaultPhysicalPlanner {
 
             // "System supplied type" --> Use debug format of the datatype
             let data_type = field.data_type();
-            data_types.append_value(format!("{data_type:?}"));
+            data_types.append_value(format!("{data_type}"));
 
             // "YES if the column is possibly nullable, NO if it is known not nullable. "
             let nullable_str = if field.is_nullable() { "YES" } else { "NO" };
@@ -2044,21 +2876,105 @@ impl DefaultPhysicalPlanner {
                 let physical_expr =
                     self.create_physical_expr(e, input_logical_schema, session_state);
 
-                // Check for possible column name mismatches
-                let final_physical_expr =
-                    maybe_fix_physical_column_name(physical_expr, &input_physical_schema);
-
-                tuple_err((final_physical_expr, physical_name))
+                tuple_err((physical_expr, physical_name))
             })
             .collect::<Result<Vec<_>>>()?;
 
-        Ok(Arc::new(ProjectionExec::try_new(
-            physical_exprs,
-            input_exec,
-        )?))
+        let num_input_columns = input_exec.schema().fields().len();
+
+        match self.try_plan_async_exprs(
+            num_input_columns,
+            PlannedExprResult::ExprWithName(physical_exprs),
+            input_physical_schema.as_ref(),
+        )? {
+            PlanAsyncExpr::Sync(PlannedExprResult::ExprWithName(physical_exprs)) => {
+                let proj_exprs: Vec<ProjectionExpr> = physical_exprs
+                    .into_iter()
+                    .map(|(expr, alias)| ProjectionExpr { expr, alias })
+                    .collect();
+                Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input_exec)?))
+            }
+            PlanAsyncExpr::Async(
+                async_map,
+                PlannedExprResult::ExprWithName(physical_exprs),
+            ) => {
+                let async_exec =
+                    AsyncFuncExec::try_new(async_map.async_exprs, input_exec)?;
+                let proj_exprs: Vec<ProjectionExpr> = physical_exprs
+                    .into_iter()
+                    .map(|(expr, alias)| ProjectionExpr { expr, alias })
+                    .collect();
+                let new_proj_exec =
+                    ProjectionExec::try_new(proj_exprs, Arc::new(async_exec))?;
+                Ok(Arc::new(new_proj_exec))
+            }
+            _ => internal_err!("Unexpected PlanAsyncExpressions variant"),
+        }
+    }
+
+    fn try_plan_async_exprs(
+        &self,
+        num_input_columns: usize,
+        physical_expr: PlannedExprResult,
+        schema: &Schema,
+    ) -> Result<PlanAsyncExpr> {
+        let mut async_map = AsyncMapper::new(num_input_columns);
+        match &physical_expr {
+            PlannedExprResult::ExprWithName(exprs) => {
+                exprs
+                    .iter()
+                    .try_for_each(|(expr, _)| async_map.find_references(expr, schema))?;
+            }
+            PlannedExprResult::Expr(exprs) => {
+                exprs
+                    .iter()
+                    .try_for_each(|expr| async_map.find_references(expr, schema))?;
+            }
+        }
+
+        if async_map.is_empty() {
+            return Ok(PlanAsyncExpr::Sync(physical_expr));
+        }
+
+        let new_exprs = match physical_expr {
+            PlannedExprResult::ExprWithName(exprs) => PlannedExprResult::ExprWithName(
+                exprs
+                    .iter()
+                    .map(|(expr, column_name)| {
+                        let new_expr = Arc::clone(expr)
+                            .transform_up(|e| Ok(async_map.map_expr(e)))?;
+                        Ok((new_expr.data, column_name.to_string()))
+                    })
+                    .collect::<Result<_>>()?,
+            ),
+            PlannedExprResult::Expr(exprs) => PlannedExprResult::Expr(
+                exprs
+                    .iter()
+                    .map(|expr| {
+                        let new_expr = Arc::clone(expr)
+                            .transform_up(|e| Ok(async_map.map_expr(e)))?;
+                        Ok(new_expr.data)
+                    })
+                    .collect::<Result<_>>()?,
+            ),
+        };
+        // rewrite the projection's expressions in terms of the columns with the result of async evaluation
+        Ok(PlanAsyncExpr::Async(async_map, new_exprs))
     }
 }
 
+#[derive(Debug)]
+enum PlannedExprResult {
+    ExprWithName(Vec<(Arc<dyn PhysicalExpr>, String)>),
+    Expr(Vec<Arc<dyn PhysicalExpr>>),
+}
+
+#[derive(Debug)]
+enum PlanAsyncExpr {
+    Sync(PlannedExprResult),
+    Async(AsyncMapper, PlannedExprResult),
+}
+
 fn tuple_err<T, R>(value: (Result<T>, Result<R>)) -> Result<(T, R)> {
     match value {
         (Ok(e), Ok(e1)) => Ok((e, e1)),
@@ -2068,47 +2984,6 @@ fn tuple_err<T, R>(value: (Result<T>, Result<R>)) -> Result<(T, R)> {
     }
 }
 
-// Handle the case where the name of a physical column expression does not match the corresponding physical input fields names.
-// Physical column names are derived from the physical schema, whereas physical column expressions are derived from the logical column names.
-//
-// This is a special case that applies only to column expressions. Logical plans may slightly modify column names by appending a suffix (e.g., using ':'),
-// to avoid duplicates—since DFSchemas do not allow duplicate names. For example: `count(Int64(1)):1`.
-fn maybe_fix_physical_column_name(
-    expr: Result<Arc<dyn PhysicalExpr>>,
-    input_physical_schema: &SchemaRef,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    let Ok(expr) = expr else { return expr };
-    expr.transform_down(|node| {
-        if let Some(column) = node.as_any().downcast_ref::<Column>() {
-            let idx = column.index();
-            let physical_field = input_physical_schema.field(idx);
-            let expr_col_name = column.name();
-            let physical_name = physical_field.name();
-
-            if expr_col_name != physical_name {
-                // handle edge cases where the physical_name contains ':'.
-                let colon_count = physical_name.matches(':').count();
-                let mut splits = expr_col_name.match_indices(':');
-                let split_pos = splits.nth(colon_count);
-
-                if let Some((i, _)) = split_pos {
-                    let base_name = &expr_col_name[..i];
-                    if base_name == physical_name {
-                        let updated_column = Column::new(physical_name, idx);
-                        return Ok(Transformed::yes(Arc::new(updated_column)));
-                    }
-                }
-            }
-
-            // If names already match or fix is not possible, just leave it as it is
-            Ok(Transformed::no(node))
-        } else {
-            Ok(Transformed::no(node))
-        }
-    })
-    .data()
-}
-
 struct OptimizationInvariantChecker<'a> {
     rule: &'a Arc<dyn PhysicalOptimizerRule + Send + Sync>,
 }
@@ -2127,11 +3002,14 @@ impl<'a> OptimizationInvariantChecker<'a> {
     pub fn check(
         &mut self,
         plan: &Arc<dyn ExecutionPlan>,
-        previous_schema: Arc<Schema>,
+        previous_schema: &Arc<Schema>,
     ) -> Result<()> {
         // if the rule is not permitted to change the schema, confirm that it did not change.
-        if self.rule.schema_check() && plan.schema() != previous_schema {
-            internal_err!("PhysicalOptimizer rule '{}' failed. Schema mismatch. Expected original schema: {:?}, got new schema: {:?}",
+        if self.rule.schema_check()
+            && !is_allowed_schema_change(previous_schema.as_ref(), plan.schema().as_ref())
+        {
+            internal_err!(
+                "PhysicalOptimizer rule '{}' failed. Schema mismatch. Expected original schema: {}, got new schema: {}",
                 self.rule.name(),
                 previous_schema,
                 plan.schema()
@@ -2146,12 +3024,44 @@ impl<'a> OptimizationInvariantChecker<'a> {
     }
 }
 
+/// Checks if the change from `old` schema to `new` is allowed or not.
+///
+/// The current implementation only allows nullability of individual fields to change
+/// from 'nullable' to 'not nullable'. This can happen due to physical expressions knowing
+/// more about their null-ness than their logical counterparts.
+/// This change is allowed because for any field the non-nullable domain `F` is a strict subset
+/// of the nullable domain `F ∪ { NULL }`. A physical schema that guarantees a stricter subset
+/// of values will not violate any assumptions made based on the less strict schema.
+fn is_allowed_schema_change(old: &Schema, new: &Schema) -> bool {
+    if new.metadata != old.metadata {
+        return false;
+    }
+
+    if new.fields.len() != old.fields.len() {
+        return false;
+    }
+
+    let new_fields = new.fields.iter().map(|f| f.as_ref());
+    let old_fields = old.fields.iter().map(|f| f.as_ref());
+    old_fields
+        .zip(new_fields)
+        .all(|(old, new)| is_allowed_field_change(old, new))
+}
+
+fn is_allowed_field_change(old_field: &Field, new_field: &Field) -> bool {
+    new_field.name() == old_field.name()
+        && new_field.data_type() == old_field.data_type()
+        && new_field.metadata() == old_field.metadata()
+        && (new_field.is_nullable() == old_field.is_nullable()
+            || !new_field.is_nullable())
+}
+
 impl<'n> TreeNodeVisitor<'n> for OptimizationInvariantChecker<'_> {
     type Node = Arc<dyn ExecutionPlan>;
 
     fn f_down(&mut self, node: &'n Self::Node) -> Result<TreeNodeRecursion> {
         // Checks for the more permissive `InvariantLevel::Always`.
-        // Plans are not guarenteed to be executable after each physical optimizer run.
+        // Plans are not guaranteed to be executable after each physical optimizer run.
         node.check_invariants(InvariantLevel::Always).map_err(|e|
             e.context(format!("Invariant for ExecutionPlan node '{}' failed for PhysicalOptimizer rule '{}'", node.name(), self.rule.name()))
         )?;
@@ -2194,11 +3104,11 @@ mod tests {
     use std::ops::{BitAnd, Not};
 
     use super::*;
-    use crate::datasource::file_format::options::CsvReadOptions;
     use crate::datasource::MemTable;
+    use crate::datasource::file_format::options::CsvReadOptions;
     use crate::physical_plan::{
-        expressions, DisplayAs, DisplayFormatType, PlanProperties,
-        SendableRecordBatchStream,
+        DisplayAs, DisplayFormatType, PlanProperties, SendableRecordBatchStream,
+        expressions,
     };
     use crate::prelude::{SessionConfig, SessionContext};
     use crate::test_util::{scan_empty, scan_empty_with_partitions};
@@ -2206,17 +3116,19 @@ mod tests {
     use crate::execution::session_state::SessionStateBuilder;
     use arrow::array::{ArrayRef, DictionaryArray, Int32Array};
     use arrow::datatypes::{DataType, Field, Int32Type};
+    use arrow_schema::SchemaRef;
     use datafusion_common::config::ConfigOptions;
     use datafusion_common::{
-        assert_contains, DFSchemaRef, TableReference, ToDFSchema as _,
+        DFSchemaRef, TableReference, ToDFSchema as _, assert_contains,
     };
-    use datafusion_execution::runtime_env::RuntimeEnv;
     use datafusion_execution::TaskContext;
+    use datafusion_execution::runtime_env::RuntimeEnv;
+    use datafusion_expr::builder::subquery_alias;
     use datafusion_expr::{
-        col, lit, LogicalPlanBuilder, Operator, UserDefinedLogicalNodeCore,
+        LogicalPlanBuilder, TableSource, UserDefinedLogicalNodeCore, col, lit,
     };
+    use datafusion_functions_aggregate::count::count_all;
     use datafusion_functions_aggregate::expr_fn::sum;
-    use datafusion_physical_expr::expressions::{BinaryExpr, IsNotNullExpr};
     use datafusion_physical_expr::EquivalenceProperties;
     use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 
@@ -2258,8 +3170,9 @@ mod tests {
         // verify that the plan correctly casts u8 to i64
         // the cast from u8 to i64 for literal will be simplified, and get lit(int64(5))
         // the cast here is implicit so has CastOptions with safe=true
-        let expected = "BinaryExpr { left: Column { name: \"c7\", index: 2 }, op: Lt, right: Literal { value: Int64(5) }, fail_on_overflow: false }";
-        assert!(format!("{exec_plan:?}").contains(expected));
+        let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: false"#;
+
+        assert_contains!(format!("{exec_plan:?}"), expected);
         Ok(())
     }
 
@@ -2283,9 +3196,113 @@ mod tests {
             &session_state,
         );
 
-        let expected = r#"Ok(PhysicalGroupBy { expr: [(Column { name: "c1", index: 0 }, "c1"), (Column { name: "c2", index: 1 }, "c2"), (Column { name: "c3", index: 2 }, "c3")], null_expr: [(Literal { value: Utf8(NULL) }, "c1"), (Literal { value: Int64(NULL) }, "c2"), (Literal { value: Int64(NULL) }, "c3")], groups: [[false, false, false], [true, false, false], [false, true, false], [false, false, true], [true, true, false], [true, false, true], [false, true, true], [true, true, true]] })"#;
-
-        assert_eq!(format!("{cube:?}"), expected);
+        insta::assert_debug_snapshot!(cube, @r#"
+        Ok(
+            PhysicalGroupBy {
+                expr: [
+                    (
+                        Column {
+                            name: "c1",
+                            index: 0,
+                        },
+                        "c1",
+                    ),
+                    (
+                        Column {
+                            name: "c2",
+                            index: 1,
+                        },
+                        "c2",
+                    ),
+                    (
+                        Column {
+                            name: "c3",
+                            index: 2,
+                        },
+                        "c3",
+                    ),
+                ],
+                null_expr: [
+                    (
+                        Literal {
+                            value: Utf8(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Utf8,
+                                nullable: true,
+                            },
+                        },
+                        "c1",
+                    ),
+                    (
+                        Literal {
+                            value: Int64(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Int64,
+                                nullable: true,
+                            },
+                        },
+                        "c2",
+                    ),
+                    (
+                        Literal {
+                            value: Int64(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Int64,
+                                nullable: true,
+                            },
+                        },
+                        "c3",
+                    ),
+                ],
+                groups: [
+                    [
+                        false,
+                        false,
+                        false,
+                    ],
+                    [
+                        true,
+                        false,
+                        false,
+                    ],
+                    [
+                        false,
+                        true,
+                        false,
+                    ],
+                    [
+                        false,
+                        false,
+                        true,
+                    ],
+                    [
+                        true,
+                        true,
+                        false,
+                    ],
+                    [
+                        true,
+                        false,
+                        true,
+                    ],
+                    [
+                        false,
+                        true,
+                        true,
+                    ],
+                    [
+                        true,
+                        true,
+                        true,
+                    ],
+                ],
+                has_grouping_set: true,
+            },
+        )
+        "#);
 
         Ok(())
     }
@@ -2310,9 +3327,93 @@ mod tests {
             &session_state,
         );
 
-        let expected = r#"Ok(PhysicalGroupBy { expr: [(Column { name: "c1", index: 0 }, "c1"), (Column { name: "c2", index: 1 }, "c2"), (Column { name: "c3", index: 2 }, "c3")], null_expr: [(Literal { value: Utf8(NULL) }, "c1"), (Literal { value: Int64(NULL) }, "c2"), (Literal { value: Int64(NULL) }, "c3")], groups: [[true, true, true], [false, true, true], [false, false, true], [false, false, false]] })"#;
-
-        assert_eq!(format!("{rollup:?}"), expected);
+        insta::assert_debug_snapshot!(rollup, @r#"
+        Ok(
+            PhysicalGroupBy {
+                expr: [
+                    (
+                        Column {
+                            name: "c1",
+                            index: 0,
+                        },
+                        "c1",
+                    ),
+                    (
+                        Column {
+                            name: "c2",
+                            index: 1,
+                        },
+                        "c2",
+                    ),
+                    (
+                        Column {
+                            name: "c3",
+                            index: 2,
+                        },
+                        "c3",
+                    ),
+                ],
+                null_expr: [
+                    (
+                        Literal {
+                            value: Utf8(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Utf8,
+                                nullable: true,
+                            },
+                        },
+                        "c1",
+                    ),
+                    (
+                        Literal {
+                            value: Int64(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Int64,
+                                nullable: true,
+                            },
+                        },
+                        "c2",
+                    ),
+                    (
+                        Literal {
+                            value: Int64(NULL),
+                            field: Field {
+                                name: "lit",
+                                data_type: Int64,
+                                nullable: true,
+                            },
+                        },
+                        "c3",
+                    ),
+                ],
+                groups: [
+                    [
+                        true,
+                        true,
+                        true,
+                    ],
+                    [
+                        false,
+                        true,
+                        true,
+                    ],
+                    [
+                        false,
+                        false,
+                        true,
+                    ],
+                    [
+                        false,
+                        false,
+                        false,
+                    ],
+                ],
+                has_grouping_set: true,
+            },
+        )
+        "#);
 
         Ok(())
     }
@@ -2427,8 +3528,7 @@ mod tests {
             .create_physical_plan(&logical_plan, &session_state)
             .await;
 
-        let expected_error =
-            "No installed planner was able to convert the custom node to an execution plan: NoOp";
+        let expected_error = "No installed planner was able to convert the custom node to an execution plan: NoOp";
         match plan {
             Ok(_) => panic!("Expected planning failure"),
             Err(e) => assert!(
@@ -2450,35 +3550,13 @@ mod tests {
         let logical_plan = LogicalPlan::Extension(Extension {
             node: Arc::new(NoOpExtensionNode::default()),
         });
-        let plan = planner
+        let e = planner
             .create_physical_plan(&logical_plan, &session_state)
-            .await;
+            .await
+            .expect_err("planning error")
+            .strip_backtrace();
 
-        let expected_error: &str = "Error during planning: \
-            Extension planner for NoOp created an ExecutionPlan with mismatched schema. \
-            LogicalPlan schema: \
-            DFSchema { inner: Schema { fields: \
-                [Field { name: \"a\", \
-                data_type: Int32, \
-                nullable: false, \
-                dict_id: 0, \
-                dict_is_ordered: false, metadata: {} }], \
-                metadata: {} }, field_qualifiers: [None], \
-                functional_dependencies: FunctionalDependencies { deps: [] } }, \
-            ExecutionPlan schema: Schema { fields: \
-                [Field { name: \"b\", \
-                data_type: Int32, \
-                nullable: false, \
-                dict_id: 0, \
-                dict_is_ordered: false, metadata: {} }], \
-                metadata: {} }";
-        match plan {
-            Ok(_) => panic!("Expected planning failure"),
-            Err(e) => assert!(
-                e.to_string().contains(expected_error),
-                "Error '{e}' did not contain expected error '{expected_error}'"
-            ),
-        }
+        insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#);
     }
 
     #[tokio::test]
@@ -2494,10 +3572,9 @@ mod tests {
         let execution_plan = plan(&logical_plan).await?;
         // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated.
 
-        let expected = "expr: [(BinaryExpr { left: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"a\") }, fail_on_overflow: false }, op: Or, right: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"1\") }, fail_on_overflow: false }, fail_on_overflow: false }";
+        let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#;
 
-        let actual = format!("{execution_plan:?}");
-        assert!(actual.contains(expected), "{}", actual);
+        assert_contains!(format!("{execution_plan:?}"), expected);
 
         Ok(())
     }
@@ -2517,7 +3594,7 @@ mod tests {
 
         assert_contains!(
             &e,
-            r#"Error during planning: Can not find compatible types to compare Boolean with [Struct([Field { name: "foo", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), Utf8]"#
+            r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": non-null Boolean), Utf8]"#
         );
 
         Ok(())
@@ -2674,6 +3751,25 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_aggregate_count_all_with_alias() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Utf8, false),
+            Field::new("c2", DataType::UInt32, false),
+        ]));
+
+        let logical_plan = scan_empty(None, schema.as_ref(), None)?
+            .aggregate(Vec::<Expr>::new(), vec![count_all().alias("total_rows")])?
+            .build()?;
+
+        let physical_plan = plan(&logical_plan).await?;
+        assert_eq!(
+            "total_rows",
+            physical_plan.schema().field(0).name().as_str()
+        );
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_explain() {
         let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
@@ -2689,18 +3785,27 @@ mod tests {
         if let Some(plan) = plan.as_any().downcast_ref::<ExplainExec>() {
             let stringified_plans = plan.stringified_plans();
             assert!(stringified_plans.len() >= 4);
-            assert!(stringified_plans
-                .iter()
-                .any(|p| matches!(p.plan_type, PlanType::FinalLogicalPlan)));
-            assert!(stringified_plans
-                .iter()
-                .any(|p| matches!(p.plan_type, PlanType::InitialPhysicalPlan)));
-            assert!(stringified_plans
-                .iter()
-                .any(|p| matches!(p.plan_type, PlanType::OptimizedPhysicalPlan { .. })));
-            assert!(stringified_plans
-                .iter()
-                .any(|p| matches!(p.plan_type, PlanType::FinalPhysicalPlan)));
+            assert!(
+                stringified_plans
+                    .iter()
+                    .any(|p| p.plan_type == PlanType::FinalLogicalPlan)
+            );
+            assert!(
+                stringified_plans
+                    .iter()
+                    .any(|p| p.plan_type == PlanType::InitialPhysicalPlan)
+            );
+            assert!(
+                stringified_plans.iter().any(|p| matches!(
+                    p.plan_type,
+                    PlanType::OptimizedPhysicalPlan { .. }
+                ))
+            );
+            assert!(
+                stringified_plans
+                    .iter()
+                    .any(|p| p.plan_type == PlanType::FinalPhysicalPlan)
+            );
         } else {
             panic!(
                 "Plan was not an explain plan: {}",
@@ -2757,71 +3862,6 @@ mod tests {
         }
     }
 
-    #[tokio::test]
-    async fn test_maybe_fix_colon_in_physical_name() {
-        // The physical schema has a field name with a colon
-        let schema = Schema::new(vec![Field::new("metric:avg", DataType::Int32, false)]);
-        let schema_ref: SchemaRef = Arc::new(schema);
-
-        // What might happen after deduplication
-        let logical_col_name = "metric:avg:1";
-        let expr_with_suffix =
-            Arc::new(Column::new(logical_col_name, 0)) as Arc<dyn PhysicalExpr>;
-        let expr_result = Ok(expr_with_suffix);
-
-        // Call function under test
-        let fixed_expr =
-            maybe_fix_physical_column_name(expr_result, &schema_ref).unwrap();
-
-        // Downcast back to Column so we can check the name
-        let col = fixed_expr
-            .as_any()
-            .downcast_ref::<Column>()
-            .expect("Column");
-
-        assert_eq!(col.name(), "metric:avg");
-    }
-
-    #[tokio::test]
-    async fn test_maybe_fix_nested_column_name_with_colon() {
-        let schema = Schema::new(vec![Field::new("column", DataType::Int32, false)]);
-        let schema_ref: SchemaRef = Arc::new(schema);
-
-        // Construct the nested expr
-        let col_expr = Arc::new(Column::new("column:1", 0)) as Arc<dyn PhysicalExpr>;
-        let is_not_null_expr = Arc::new(IsNotNullExpr::new(col_expr.clone()));
-
-        // Create a binary expression and put the column inside
-        let binary_expr = Arc::new(BinaryExpr::new(
-            is_not_null_expr.clone(),
-            Operator::Or,
-            is_not_null_expr.clone(),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let fixed_expr =
-            maybe_fix_physical_column_name(Ok(binary_expr), &schema_ref).unwrap();
-
-        let bin = fixed_expr
-            .as_any()
-            .downcast_ref::<BinaryExpr>()
-            .expect("Expected BinaryExpr");
-
-        // Check that both sides where renamed
-        for expr in &[bin.left(), bin.right()] {
-            let is_not_null = expr
-                .as_any()
-                .downcast_ref::<IsNotNullExpr>()
-                .expect("Expected IsNotNull");
-
-            let col = is_not_null
-                .arg()
-                .as_any()
-                .downcast_ref::<Column>()
-                .expect("Expected Column");
-
-            assert_eq!(col.name(), "column");
-        }
-    }
     struct ErrorExtensionPlanner {}
 
     #[async_trait]
@@ -2908,13 +3948,15 @@ mod tests {
 
     #[derive(Debug)]
     struct NoOpExecutionPlan {
-        cache: PlanProperties,
+        cache: Arc<PlanProperties>,
     }
 
     impl NoOpExecutionPlan {
         fn new(schema: SchemaRef) -> Self {
             let cache = Self::compute_properties(schema);
-            Self { cache }
+            Self {
+                cache: Arc::new(cache),
+            }
         }
 
         /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -2952,7 +3994,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             &self.cache
         }
 
@@ -2974,6 +4016,20 @@ mod tests {
         ) -> Result<SendableRecordBatchStream> {
             unimplemented!("NoOpExecutionPlan::execute");
         }
+
+        fn apply_expressions(
+            &self,
+            f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            // Visit expressions in the output ordering from equivalence properties
+            let mut tnr = TreeNodeRecursion::Continue;
+            if let Some(ordering) = self.cache.output_ordering() {
+                for sort_expr in ordering {
+                    tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+                }
+            }
+            Ok(tnr)
+        }
     }
 
     //  Produces an execution plan where the schema is mismatched from
@@ -3106,7 +4162,7 @@ digraph {
         fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
             self.0.iter().collect::<Vec<_>>()
         }
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
         fn execute(
@@ -3116,6 +4172,12 @@ digraph {
         ) -> Result<SendableRecordBatchStream> {
             unimplemented!()
         }
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
     }
     impl DisplayAs for OkExtensionNode {
         fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
@@ -3132,8 +4194,12 @@ digraph {
         }
         fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
             match check {
-                InvariantLevel::Always => plan_err!("extension node failed it's user-defined always-invariant check"),
-                InvariantLevel::Executable => panic!("the OptimizationInvariantChecker should not be checking for executableness"),
+                InvariantLevel::Always => plan_err!(
+                    "extension node failed it's user-defined always-invariant check"
+                ),
+                InvariantLevel::Executable => panic!(
+                    "the OptimizationInvariantChecker should not be checking for executableness"
+                ),
             }
         }
         fn schema(&self) -> SchemaRef {
@@ -3151,7 +4217,7 @@ digraph {
         fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
             unimplemented!()
         }
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
         fn execute(
@@ -3161,6 +4227,12 @@ digraph {
         ) -> Result<SendableRecordBatchStream> {
             unimplemented!()
         }
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
     }
     impl DisplayAs for InvariantFailsExtensionNode {
         fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
@@ -3202,24 +4274,26 @@ digraph {
 
         // Test: check should pass with same schema
         let equal_schema = ok_plan.schema();
-        OptimizationInvariantChecker::new(&rule).check(&ok_plan, equal_schema)?;
+        OptimizationInvariantChecker::new(&rule).check(&ok_plan, &equal_schema)?;
 
         // Test: should fail with schema changed
         let different_schema =
             Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
         let expected_err = OptimizationInvariantChecker::new(&rule)
-            .check(&ok_plan, different_schema)
+            .check(&ok_plan, &different_schema)
             .unwrap_err();
         assert!(expected_err.to_string().contains("PhysicalOptimizer rule 'OptimizerRuleWithSchemaCheck' failed. Schema mismatch. Expected original schema"));
 
         // Test: should fail when extension node fails it's own invariant check
         let failing_node: Arc<dyn ExecutionPlan> = Arc::new(InvariantFailsExtensionNode);
         let expected_err = OptimizationInvariantChecker::new(&rule)
-            .check(&failing_node, ok_plan.schema())
+            .check(&failing_node, &ok_plan.schema())
             .unwrap_err();
-        assert!(expected_err
-            .to_string()
-            .contains("extension node failed it's user-defined always-invariant check"));
+        assert!(
+            expected_err.to_string().contains(
+                "extension node failed it's user-defined always-invariant check"
+            )
+        );
 
         // Test: should fail when descendent extension node fails
         let failing_node: Arc<dyn ExecutionPlan> = Arc::new(InvariantFailsExtensionNode);
@@ -3228,11 +4302,13 @@ digraph {
             Arc::clone(&child),
         ])?;
         let expected_err = OptimizationInvariantChecker::new(&rule)
-            .check(&invalid_plan, ok_plan.schema())
+            .check(&invalid_plan, &ok_plan.schema())
             .unwrap_err();
-        assert!(expected_err
-            .to_string()
-            .contains("extension node failed it's user-defined always-invariant check"));
+        assert!(
+            expected_err.to_string().contains(
+                "extension node failed it's user-defined always-invariant check"
+            )
+        );
 
         Ok(())
     }
@@ -3268,7 +4344,7 @@ digraph {
         fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
             vec![]
         }
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
         fn execute(
@@ -3278,6 +4354,12 @@ digraph {
         ) -> Result<SendableRecordBatchStream> {
             unimplemented!()
         }
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
     }
     impl DisplayAs for ExecutableInvariantFails {
         fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
@@ -3318,4 +4400,358 @@ digraph {
 
         Ok(())
     }
+
+    // Reproducer for DataFusion issue #17405:
+    //
+    // The following SQL is semantically invalid. Notably, the `SELECT left_table.a, right_table.a`
+    // clause is missing from the explicit logical plan:
+    //
+    // SELECT a FROM (
+    //       -- SELECT left_table.a, right_table.a
+    //       FROM left_table
+    //       FULL JOIN right_table ON left_table.a = right_table.a
+    // ) AS alias
+    // GROUP BY a;
+    //
+    // As a result, the variables within `alias` subquery are not properly distinguished, which
+    // leads to a bug for logical and physical planning.
+    //
+    // The fix is to implicitly insert a Projection node to represent the missing SELECT clause to
+    // ensure each field is correctly aliased to a unique name when the SubqueryAlias node is added.
+    #[tokio::test]
+    async fn subquery_alias_confusing_the_optimizer() -> Result<()> {
+        let state = make_session_state();
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let schema = Arc::new(schema);
+
+        let table = MemTable::try_new(schema.clone(), vec![vec![]])?;
+        let table = Arc::new(table);
+
+        let source = DefaultTableSource::new(table);
+        let source = Arc::new(source);
+
+        let left = LogicalPlanBuilder::scan("left", source.clone(), None)?;
+        let right = LogicalPlanBuilder::scan("right", source, None)?.build()?;
+
+        let join_keys = (
+            vec![Column::new(Some("left"), "a")],
+            vec![Column::new(Some("right"), "a")],
+        );
+
+        let join = left.join(right, JoinType::Full, join_keys, None)?.build()?;
+
+        let alias = subquery_alias(join, "alias")?;
+
+        let planner = DefaultPhysicalPlanner::default();
+
+        let logical_plan = LogicalPlanBuilder::new(alias)
+            .aggregate(vec![col("a:1")], Vec::<Expr>::new())?
+            .build()?;
+        let _physical_plan = planner.create_physical_plan(&logical_plan, &state).await?;
+
+        let optimized_logical_plan = state.optimize(&logical_plan)?;
+        let _optimized_physical_plan = planner
+            .create_physical_plan(&optimized_logical_plan, &state)
+            .await?;
+
+        Ok(())
+    }
+
+    // --- Tests for aggregate schema mismatch error messages ---
+
+    use crate::catalog::TableProvider;
+    use datafusion_catalog::Session;
+    use datafusion_expr::TableType;
+
+    /// A TableProvider that returns schemas for logical planning vs physical planning.
+    /// Used to test schema mismatch error messages.
+    #[derive(Debug)]
+    struct MockSchemaTableProvider {
+        logical_schema: SchemaRef,
+        physical_schema: SchemaRef,
+    }
+
+    #[async_trait]
+    impl TableProvider for MockSchemaTableProvider {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn schema(&self) -> SchemaRef {
+            Arc::clone(&self.logical_schema)
+        }
+
+        fn table_type(&self) -> TableType {
+            TableType::Base
+        }
+
+        async fn scan(
+            &self,
+            _state: &dyn Session,
+            _projection: Option<&Vec<usize>>,
+            _filters: &[Expr],
+            _limit: Option<usize>,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            Ok(Arc::new(NoOpExecutionPlan::new(Arc::clone(
+                &self.physical_schema,
+            ))))
+        }
+    }
+
+    /// Attempts to plan a query with potentially mismatched schemas.
+    async fn plan_with_schemas(
+        logical_schema: SchemaRef,
+        physical_schema: SchemaRef,
+        query: &str,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let provider = MockSchemaTableProvider {
+            logical_schema,
+            physical_schema,
+        };
+        let ctx = SessionContext::new();
+        ctx.register_table("test", Arc::new(provider)).unwrap();
+
+        ctx.sql(query).await.unwrap().create_physical_plan().await
+    }
+
+    #[tokio::test]
+    // When schemas match, planning proceeds past the schema_satisfied_by check.
+    // It then panics on unimplemented error in NoOpExecutionPlan.
+    #[should_panic(expected = "NoOpExecutionPlan")]
+    async fn test_aggregate_schema_check_passes() {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+
+        plan_with_schemas(
+            Arc::clone(&schema),
+            schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_metadata() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema = Arc::new(
+            Schema::new(vec![Field::new("c1", DataType::Int32, false)])
+                .with_metadata(HashMap::from([("key".into(), "value".into())])),
+        );
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "schema metadata differs");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_field_count() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, false),
+            Field::new("c2", DataType::Int32, false),
+        ]));
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "Different number of fields");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_field_name() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema = Arc::new(Schema::new(vec![Field::new(
+            "different_name",
+            DataType::Int32,
+            false,
+        )]));
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "field name at index");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_field_type() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int64, false)]));
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "field data type at index");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_field_nullability() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "field nullability at index");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_field_metadata() {
+        let logical_schema =
+            Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+        let physical_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, false)
+                .with_metadata(HashMap::from([("key".into(), "value".into())])),
+        ]));
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        assert_contains!(err.to_string(), "field metadata at index");
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_schema_mismatch_multiple() {
+        let logical_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, false),
+            Field::new("c2", DataType::Utf8, false),
+        ]));
+        let physical_schema = Arc::new(
+            Schema::new(vec![
+                Field::new("c1", DataType::Int64, true)
+                    .with_metadata(HashMap::from([("key".into(), "value".into())])),
+                Field::new("c2", DataType::Utf8, false),
+            ])
+            .with_metadata(HashMap::from([(
+                "schema_key".into(),
+                "schema_value".into(),
+            )])),
+        );
+
+        let err = plan_with_schemas(
+            logical_schema,
+            physical_schema,
+            "SELECT count(*) FROM test GROUP BY c1",
+        )
+        .await
+        .unwrap_err();
+
+        // Verify all applicable error fragments are present
+        let err_str = err.to_string();
+        assert_contains!(&err_str, "schema metadata differs");
+        assert_contains!(&err_str, "field data type at index");
+        assert_contains!(&err_str, "field nullability at index");
+        assert_contains!(&err_str, "field metadata at index");
+    }
+
+    #[derive(Debug)]
+    struct MockTableSource {
+        schema: SchemaRef,
+    }
+
+    impl TableSource for MockTableSource {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn schema(&self) -> SchemaRef {
+            Arc::clone(&self.schema)
+        }
+    }
+
+    struct MockTableScanExtensionPlanner;
+
+    #[async_trait]
+    impl ExtensionPlanner for MockTableScanExtensionPlanner {
+        async fn plan_extension(
+            &self,
+            _planner: &dyn PhysicalPlanner,
+            _node: &dyn UserDefinedLogicalNode,
+            _logical_inputs: &[&LogicalPlan],
+            _physical_inputs: &[Arc<dyn ExecutionPlan>],
+            _session_state: &SessionState,
+        ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+            Ok(None)
+        }
+
+        async fn plan_table_scan(
+            &self,
+            _planner: &dyn PhysicalPlanner,
+            scan: &TableScan,
+            _session_state: &SessionState,
+        ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+            if scan.source.as_any().is::<MockTableSource>() {
+                Ok(Some(Arc::new(EmptyExec::new(Arc::clone(
+                    scan.projected_schema.inner(),
+                )))))
+            } else {
+                Ok(None)
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_table_scan_extension_planner() {
+        let session_state = make_session_state();
+        let planner = Arc::new(MockTableScanExtensionPlanner);
+        let physical_planner =
+            DefaultPhysicalPlanner::with_extension_planners(vec![planner]);
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        let table_source = Arc::new(MockTableSource {
+            schema: Arc::clone(&schema),
+        });
+        let logical_plan = LogicalPlanBuilder::scan("test", table_source, None)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let plan = physical_planner
+            .create_physical_plan(&logical_plan, &session_state)
+            .await
+            .unwrap();
+
+        assert_eq!(plan.schema(), schema);
+        assert!(plan.as_any().is::<EmptyExec>());
+    }
 }
diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs
index d723620d32323..31d9d7eb471f0 100644
--- a/datafusion/core/src/prelude.rs
+++ b/datafusion/core/src/prelude.rs
@@ -29,15 +29,15 @@ pub use crate::dataframe;
 pub use crate::dataframe::DataFrame;
 pub use crate::execution::context::{SQLOptions, SessionConfig, SessionContext};
 pub use crate::execution::options::{
-    AvroReadOptions, CsvReadOptions, NdJsonReadOptions, ParquetReadOptions,
+    AvroReadOptions, CsvReadOptions, JsonReadOptions, ParquetReadOptions,
 };
 
 pub use datafusion_common::Column;
 pub use datafusion_expr::{
+    Expr,
     expr_fn::*,
     lit, lit_timestamp_nano,
     logical_plan::{JoinType, Partitioning},
-    Expr,
 };
 pub use datafusion_functions::expr_fn::*;
 #[cfg(feature = "nested_expressions")]
diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs
index 8719a16f4919f..717182f1d3d5b 100644
--- a/datafusion/core/src/test/mod.rs
+++ b/datafusion/core/src/test/mod.rs
@@ -25,9 +25,9 @@ use std::io::{BufReader, BufWriter};
 use std::path::Path;
 use std::sync::Arc;
 
+use crate::datasource::file_format::FileFormat;
 use crate::datasource::file_format::csv::CsvFormat;
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
-use crate::datasource::file_format::FileFormat;
 
 use crate::datasource::physical_plan::CsvSource;
 use crate::datasource::{MemTable, TableProvider};
@@ -35,27 +35,31 @@ use crate::error::Result;
 use crate::logical_expr::LogicalPlan;
 use crate::test_util::{aggr_test_schema, arrow_test_data};
 
+use datafusion_common::config::CsvOptions;
+
 use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array};
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::record_batch::RecordBatch;
+#[cfg(feature = "compression")]
 use datafusion_common::DataFusionError;
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::source::DataSourceExec;
 
-#[cfg(feature = "compression")]
-use bzip2::write::BzEncoder;
 #[cfg(feature = "compression")]
 use bzip2::Compression as BzCompression;
+#[cfg(feature = "compression")]
+use bzip2::write::BzEncoder;
 use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource_csv::partitioned_csv_config;
 #[cfg(feature = "compression")]
+use flate2::Compression as GzCompression;
+#[cfg(feature = "compression")]
 use flate2::write::GzEncoder;
 #[cfg(feature = "compression")]
-use flate2::Compression as GzCompression;
+use liblzma::write::XzEncoder;
 use object_store::local_unpartitioned_file;
 #[cfg(feature = "compression")]
-use xz2::write::XzEncoder;
-#[cfg(feature = "compression")]
 use zstd::Encoder as ZstdEncoder;
 
 pub fn create_table_dual() -> Arc<dyn TableProvider> {
@@ -83,17 +87,26 @@ pub fn scan_partitioned_csv(
     let schema = aggr_test_schema();
     let filename = "aggregate_test_100.csv";
     let path = format!("{}/csv", arrow_test_data());
+    let csv_format: Arc<dyn FileFormat> = Arc::new(CsvFormat::default());
+
     let file_groups = partitioned_file_groups(
         path.as_str(),
         filename,
         partitions,
-        Arc::new(CsvFormat::default()),
+        &csv_format,
         FileCompressionType::UNCOMPRESSED,
         work_dir,
     )?;
-    let source = Arc::new(CsvSource::new(true, b'"', b'"'));
+    let options = CsvOptions {
+        has_header: Some(true),
+        delimiter: b',',
+        quote: b'"',
+        ..Default::default()
+    };
+    let table_schema = TableSchema::from_file_schema(schema);
+    let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
     let config =
-        FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source))
+        FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
             .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
             .build();
     Ok(DataSourceExec::from_data_source(config))
@@ -104,7 +117,7 @@ pub fn partitioned_file_groups(
     path: &str,
     filename: &str,
     partitions: usize,
-    file_format: Arc<dyn FileFormat>,
+    file_format: &Arc<dyn FileFormat>,
     file_compression_type: FileCompressionType,
     work_dir: &Path,
 ) -> Result<Vec<FileGroup>> {
@@ -188,7 +201,7 @@ pub fn partitioned_file_groups(
         .collect::<Vec<_>>())
 }
 
-pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) {
+pub fn assert_fields_eq(plan: &LogicalPlan, expected: &[&str]) {
     let actual: Vec<String> = plan
         .schema()
         .fields()
diff --git a/datafusion/core/src/test/object_store.rs b/datafusion/core/src/test/object_store.rs
index ed8474bbfc812..62c6699f8fcd1 100644
--- a/datafusion/core/src/test/object_store.rs
+++ b/datafusion/core/src/test/object_store.rs
@@ -17,21 +17,24 @@
 
 //! Object store implementation used for testing
 
-use crate::execution::context::SessionState;
-use crate::execution::session_state::SessionStateBuilder;
-use crate::prelude::SessionContext;
-use futures::stream::BoxStream;
-use futures::FutureExt;
-use object_store::{
-    memory::InMemory, path::Path, Error, GetOptions, GetResult, ListResult,
-    MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload,
-    PutResult,
+use crate::{
+    execution::{context::SessionState, session_state::SessionStateBuilder},
+    object_store::{
+        Error, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
+        ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult,
+        memory::InMemory, path::Path,
+    },
+    prelude::SessionContext,
+};
+use futures::{FutureExt, stream::BoxStream};
+use object_store::{CopyOptions, ObjectStoreExt};
+use std::{
+    fmt::{Debug, Display, Formatter},
+    sync::Arc,
 };
-use std::fmt::{Debug, Display, Formatter};
-use std::sync::Arc;
 use tokio::{
     sync::Barrier,
-    time::{timeout, Duration},
+    time::{Duration, timeout},
 };
 use url::Url;
 
@@ -118,7 +121,7 @@ impl ObjectStore for BlockingObjectStore {
     async fn put_multipart_opts(
         &self,
         location: &Path,
-        opts: PutMultipartOpts,
+        opts: PutMultipartOptions,
     ) -> object_store::Result<Box<dyn MultipartUpload>> {
         self.inner.put_multipart_opts(location, opts).await
     }
@@ -128,39 +131,40 @@ impl ObjectStore for BlockingObjectStore {
         location: &Path,
         options: GetOptions,
     ) -> object_store::Result<GetResult> {
-        self.inner.get_opts(location, options).await
-    }
-
-    async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
-        println!(
-            "{} received head call for {location}",
-            BlockingObjectStore::NAME
-        );
-        // Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests.
-        let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await;
-        match wait_result {
-            Ok(_) => println!(
-                "{} barrier reached for {location}",
+        if options.head {
+            println!(
+                "{} received head call for {location}",
                 BlockingObjectStore::NAME
-            ),
-            Err(_) => {
-                let error_message = format!(
-                    "{} barrier wait timed out for {location}",
+            );
+            // Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests.
+            let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await;
+            match wait_result {
+                Ok(_) => println!(
+                    "{} barrier reached for {location}",
                     BlockingObjectStore::NAME
-                );
-                log::error!("{error_message}");
-                return Err(Error::Generic {
-                    store: BlockingObjectStore::NAME,
-                    source: error_message.into(),
-                });
+                ),
+                Err(_) => {
+                    let error_message = format!(
+                        "{} barrier wait timed out for {location}",
+                        BlockingObjectStore::NAME
+                    );
+                    log::error!("{error_message}");
+                    return Err(Error::Generic {
+                        store: BlockingObjectStore::NAME,
+                        source: error_message.into(),
+                    });
+                }
             }
         }
+
         // Forward the call to the inner object store.
-        self.inner.head(location).await
+        self.inner.get_opts(location, options).await
     }
-
-    async fn delete(&self, location: &Path) -> object_store::Result<()> {
-        self.inner.delete(location).await
+    fn delete_stream(
+        &self,
+        locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
+        self.inner.delete_stream(locations)
     }
 
     fn list(
@@ -177,15 +181,12 @@ impl ObjectStore for BlockingObjectStore {
         self.inner.list_with_delimiter(prefix).await
     }
 
-    async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
-        self.inner.copy(from, to).await
-    }
-
-    async fn copy_if_not_exists(
+    async fn copy_opts(
         &self,
         from: &Path,
         to: &Path,
+        options: CopyOptions,
     ) -> object_store::Result<()> {
-        self.inner.copy_if_not_exists(from, to).await
+        self.inner.copy_opts(from, to, options).await
     }
 }
diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs
index d6865ca3d532a..466ee38a426fd 100644
--- a/datafusion/core/src/test_util/mod.rs
+++ b/datafusion/core/src/test_util/mod.rs
@@ -22,27 +22,36 @@ pub mod parquet;
 
 pub mod csv;
 
+use futures::Stream;
 use std::any::Any;
 use std::collections::HashMap;
+use std::fmt::Formatter;
 use std::fs::File;
 use std::io::Write;
 use std::path::Path;
 use std::sync::Arc;
+use std::task::{Context, Poll};
 
 use crate::catalog::{TableProvider, TableProviderFactory};
 use crate::dataframe::DataFrame;
 use crate::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
 use crate::datasource::{empty::EmptyTable, provider_as_source};
 use crate::error::Result;
+use crate::execution::session_state::CacheFactory;
 use crate::logical_expr::{LogicalPlanBuilder, UNNAMED_TABLE};
 use crate::physical_plan::ExecutionPlan;
 use crate::prelude::{CsvReadOptions, SessionContext};
 
+use crate::execution::{SendableRecordBatchStream, SessionState, SessionStateBuilder};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_catalog::Session;
-use datafusion_common::TableReference;
-use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType};
+use datafusion_common::{DFSchemaRef, TableReference};
+use datafusion_expr::{
+    CreateExternalTable, Expr, LogicalPlan, SortExpr, TableType,
+    UserDefinedLogicalNodeCore,
+};
+use std::pin::Pin;
 
 use async_trait::async_trait;
 
@@ -52,6 +61,8 @@ use tempfile::TempDir;
 pub use datafusion_common::test_util::parquet_test_data;
 pub use datafusion_common::test_util::{arrow_test_data, get_data_dir};
 
+use crate::execution::RecordBatchStream;
+
 /// Scan an empty data source, mainly used in tests
 pub fn scan_empty(
     name: Option<&str>,
@@ -129,6 +140,7 @@ pub async fn test_table() -> Result<DataFrame> {
 }
 
 /// Execute SQL and return results
+#[cfg(feature = "sql")]
 pub async fn plan_and_collect(
     ctx: &SessionContext,
     sql: &str,
@@ -178,7 +190,7 @@ impl TableProviderFactory for TestTableFactory {
     ) -> Result<Arc<dyn TableProvider>> {
         Ok(Arc::new(TestTableProvider {
             url: cmd.location.to_string(),
-            schema: Arc::new(cmd.schema.as_ref().into()),
+            schema: Arc::clone(cmd.schema.inner()),
         }))
     }
 }
@@ -234,3 +246,108 @@ pub fn register_unbounded_file_with_ordering(
     ctx.register_table(table_name, Arc::new(StreamTable::new(Arc::new(config))))?;
     Ok(())
 }
+
+/// Creates a bounded stream that emits the same record batch a specified number of times.
+/// This is useful for testing purposes.
+pub fn bounded_stream(
+    record_batch: RecordBatch,
+    limit: usize,
+) -> SendableRecordBatchStream {
+    Box::pin(BoundedStream {
+        record_batch,
+        count: 0,
+        limit,
+    })
+}
+
+struct BoundedStream {
+    record_batch: RecordBatch,
+    count: usize,
+    limit: usize,
+}
+
+impl Stream for BoundedStream {
+    type Item = Result<RecordBatch, crate::error::DataFusionError>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        _cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        if self.count >= self.limit {
+            Poll::Ready(None)
+        } else {
+            self.count += 1;
+            Poll::Ready(Some(Ok(self.record_batch.clone())))
+        }
+    }
+}
+
+impl RecordBatchStream for BoundedStream {
+    fn schema(&self) -> SchemaRef {
+        self.record_batch.schema()
+    }
+}
+
+#[derive(Hash, Eq, PartialEq, PartialOrd, Debug)]
+struct CacheNode {
+    input: LogicalPlan,
+}
+
+impl UserDefinedLogicalNodeCore for CacheNode {
+    fn name(&self) -> &str {
+        "CacheNode"
+    }
+
+    fn inputs(&self) -> Vec<&LogicalPlan> {
+        vec![&self.input]
+    }
+
+    fn schema(&self) -> &DFSchemaRef {
+        self.input.schema()
+    }
+
+    fn expressions(&self) -> Vec<Expr> {
+        vec![]
+    }
+
+    fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result {
+        write!(f, "CacheNode")
+    }
+
+    fn with_exprs_and_inputs(
+        &self,
+        _exprs: Vec<Expr>,
+        inputs: Vec<LogicalPlan>,
+    ) -> Result<Self> {
+        assert_eq!(inputs.len(), 1, "input size inconsistent");
+        Ok(Self {
+            input: inputs[0].clone(),
+        })
+    }
+}
+
+#[derive(Debug)]
+struct TestCacheFactory {}
+
+impl CacheFactory for TestCacheFactory {
+    fn create(
+        &self,
+        plan: LogicalPlan,
+        _session_state: &SessionState,
+    ) -> Result<LogicalPlan> {
+        Ok(LogicalPlan::Extension(datafusion_expr::Extension {
+            node: Arc::new(CacheNode { input: plan }),
+        }))
+    }
+}
+
+/// Create a test table registered to a session context with an associated cache factory
+pub async fn test_table_with_cache_factory() -> Result<DataFrame> {
+    let session_state = SessionStateBuilder::new()
+        .with_cache_factory(Some(Arc::new(TestCacheFactory {})))
+        .build();
+    let ctx = SessionContext::new_with_state(session_state);
+    let name = "aggregate_test_100";
+    register_aggregate_csv(&ctx, name).await?;
+    ctx.table(name).await
+}
diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs
index eb4c61c025248..dba017f83ba1e 100644
--- a/datafusion/core/src/test_util/parquet.rs
+++ b/datafusion/core/src/test_util/parquet.rs
@@ -32,16 +32,15 @@ use crate::logical_expr::execution_props::ExecutionProps;
 use crate::logical_expr::simplify::SimplifyContext;
 use crate::optimizer::simplify_expressions::ExprSimplifier;
 use crate::physical_expr::create_physical_expr;
+use crate::physical_plan::ExecutionPlan;
 use crate::physical_plan::filter::FilterExec;
 use crate::physical_plan::metrics::MetricsSet;
-use crate::physical_plan::ExecutionPlan;
 use crate::prelude::{Expr, SessionConfig, SessionContext};
 
-use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::source::DataSourceExec;
-use object_store::path::Path;
 use object_store::ObjectMeta;
+use object_store::path::Path;
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::WriterProperties;
 
@@ -156,26 +155,18 @@ impl TestParquetFile {
         maybe_filter: Option<Expr>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let parquet_options = ctx.copied_table_options().parquet;
-        let source = Arc::new(ParquetSource::new(parquet_options.clone()));
-        let scan_config_builder = FileScanConfigBuilder::new(
-            self.object_store_url.clone(),
-            Arc::clone(&self.schema),
-            source,
-        )
-        .with_file(PartitionedFile {
-            object_meta: self.object_meta.clone(),
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        });
+        let source = Arc::new(
+            ParquetSource::new(Arc::clone(&self.schema))
+                .with_table_parquet_options(parquet_options.clone()),
+        );
+        let scan_config_builder =
+            FileScanConfigBuilder::new(self.object_store_url.clone(), source)
+                .with_file(PartitionedFile::new_from_meta(self.object_meta.clone()));
 
         let df_schema = Arc::clone(&self.schema).to_dfschema_ref()?;
 
         // run coercion on the filters to coerce types etc.
-        let props = ExecutionProps::new();
-        let context = SimplifyContext::new(&props).with_schema(Arc::clone(&df_schema));
+        let context = SimplifyContext::default().with_schema(Arc::clone(&df_schema));
         if let Some(filter) = maybe_filter {
             let simplifier = ExprSimplifier::new(context);
             let filter = simplifier.coerce(filter, &df_schema).unwrap();
@@ -183,10 +174,10 @@ impl TestParquetFile {
                 create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?;
 
             let source = Arc::new(
-                ParquetSource::new(parquet_options)
+                ParquetSource::new(Arc::clone(&self.schema))
+                    .with_table_parquet_options(parquet_options)
                     .with_predicate(Arc::clone(&physical_filter_expr)),
-            )
-            .with_schema(Arc::clone(&self.schema));
+            );
             let config = scan_config_builder.with_source(source).build();
             let parquet_exec = DataSourceExec::from_data_source(config);
 
@@ -203,13 +194,12 @@ impl TestParquetFile {
     /// Recursively searches for DataSourceExec and returns the metrics
     /// on the first one it finds
     pub fn parquet_metrics(plan: &Arc<dyn ExecutionPlan>) -> Option<MetricsSet> {
-        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if data_source_exec
+        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+            && data_source_exec
                 .downcast_to_file_source::<ParquetSource>()
                 .is_some()
-            {
-                return data_source_exec.metrics();
-            }
+        {
+            return data_source_exec.metrics();
         }
 
         for child in plan.children() {
diff --git a/datafusion/core/tests/catalog/memory.rs b/datafusion/core/tests/catalog/memory.rs
index b0753eb5c9494..5258f3bf97574 100644
--- a/datafusion/core/tests/catalog/memory.rs
+++ b/datafusion/core/tests/catalog/memory.rs
@@ -19,7 +19,7 @@ use arrow::datatypes::Schema;
 use datafusion::catalog::CatalogProvider;
 use datafusion::datasource::empty::EmptyTable;
 use datafusion::datasource::listing::{
-    ListingTable, ListingTableConfig, ListingTableUrl,
+    ListingTable, ListingTableConfig, ListingTableConfigExt, ListingTableUrl,
 };
 use datafusion::prelude::SessionContext;
 use datafusion_catalog::memory::*;
@@ -47,6 +47,20 @@ fn memory_catalog_dereg_nonempty_schema() {
     assert!(cat.deregister_schema("foo", true).unwrap().is_some());
 }
 
+#[test]
+fn memory_catalog_dereg_nonempty_schema_with_table_removal() {
+    let cat = Arc::new(MemoryCatalogProvider::new()) as Arc<dyn CatalogProvider>;
+
+    let schema = Arc::new(MemorySchemaProvider::new()) as Arc<dyn SchemaProvider>;
+    let test_table =
+        Arc::new(EmptyTable::new(Arc::new(Schema::empty()))) as Arc<dyn TableProvider>;
+    schema.register_table("t".into(), test_table).unwrap();
+
+    cat.register_schema("foo", schema.clone()).unwrap();
+    schema.deregister_table("t").unwrap();
+    assert!(cat.deregister_schema("foo", false).unwrap().is_some());
+}
+
 #[test]
 fn memory_catalog_dereg_empty_schema() {
     let cat = Arc::new(MemoryCatalogProvider::new()) as Arc<dyn CatalogProvider>;
@@ -102,14 +116,16 @@ async fn test_mem_provider() {
     assert!(provider.deregister_table(table_name).unwrap().is_none());
     let test_table = EmptyTable::new(Arc::new(Schema::empty()));
     // register table successfully
-    assert!(provider
-        .register_table(table_name.to_string(), Arc::new(test_table))
-        .unwrap()
-        .is_none());
+    assert!(
+        provider
+            .register_table(table_name.to_string(), Arc::new(test_table))
+            .unwrap()
+            .is_none()
+    );
     assert!(provider.table_exist(table_name));
     let other_table = EmptyTable::new(Arc::new(Schema::empty()));
     let result = provider.register_table(table_name.to_string(), Arc::new(other_table));
-    assert!(result.is_err());
+    assert!(result.is_err(), "The table test_table_exist already exists");
 }
 
 #[tokio::test]
diff --git a/datafusion/core/tests/catalog_listing/mod.rs b/datafusion/core/tests/catalog_listing/mod.rs
new file mode 100644
index 0000000000000..cb6cac4fb0672
--- /dev/null
+++ b/datafusion/core/tests/catalog_listing/mod.rs
@@ -0,0 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod pruned_partition_list;
diff --git a/datafusion/core/tests/catalog_listing/pruned_partition_list.rs b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs
new file mode 100644
index 0000000000000..8f93dc17dbad2
--- /dev/null
+++ b/datafusion/core/tests/catalog_listing/pruned_partition_list.rs
@@ -0,0 +1,251 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_schema::DataType;
+use futures::{FutureExt, StreamExt as _, TryStreamExt as _};
+use object_store::{ObjectStoreExt, memory::InMemory, path::Path};
+
+use datafusion::execution::SessionStateBuilder;
+use datafusion_catalog_listing::helpers::{
+    describe_partition, list_partitions, pruned_partition_list,
+};
+use datafusion_common::ScalarValue;
+use datafusion_datasource::ListingTableUrl;
+use datafusion_expr::{Expr, col, lit};
+use datafusion_session::Session;
+
+#[tokio::test]
+async fn test_pruned_partition_list_empty() {
+    let (store, state) = make_test_store_and_state(&[
+        ("tablepath/mypartition=val1/notparquetfile", 100),
+        ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
+        ("tablepath/file.parquet", 100),
+        ("tablepath/notapartition/file.parquet", 100),
+        ("tablepath/notmypartition=val1/file.parquet", 100),
+    ]);
+    let filter = Expr::eq(col("mypartition"), lit("val1"));
+    let pruned = pruned_partition_list(
+        state.as_ref(),
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        &[filter],
+        ".parquet",
+        &[(String::from("mypartition"), DataType::Utf8)],
+    )
+    .await
+    .expect("partition pruning failed")
+    .collect::<Vec<_>>()
+    .await;
+
+    assert_eq!(pruned.len(), 0);
+}
+
+#[tokio::test]
+async fn test_pruned_partition_list() {
+    let (store, state) = make_test_store_and_state(&[
+        ("tablepath/mypartition=val1/file.parquet", 100),
+        ("tablepath/mypartition=val2/file.parquet", 100),
+        ("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
+        ("tablepath/mypartition=val1/other=val3/file.parquet", 100),
+        ("tablepath/notapartition/file.parquet", 100),
+        ("tablepath/notmypartition=val1/file.parquet", 100),
+    ]);
+    let filter = Expr::eq(col("mypartition"), lit("val1"));
+    let pruned = pruned_partition_list(
+        state.as_ref(),
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        &[filter],
+        ".parquet",
+        &[(String::from("mypartition"), DataType::Utf8)],
+    )
+    .await
+    .expect("partition pruning failed")
+    .try_collect::<Vec<_>>()
+    .await
+    .unwrap();
+
+    assert_eq!(pruned.len(), 2);
+    let f1 = &pruned[0];
+    assert_eq!(
+        f1.object_meta.location.as_ref(),
+        "tablepath/mypartition=val1/file.parquet"
+    );
+    assert_eq!(&f1.partition_values, &[ScalarValue::from("val1")]);
+    let f2 = &pruned[1];
+    assert_eq!(
+        f2.object_meta.location.as_ref(),
+        "tablepath/mypartition=val1/other=val3/file.parquet"
+    );
+    assert_eq!(f2.partition_values, &[ScalarValue::from("val1"),]);
+}
+
+#[tokio::test]
+async fn test_pruned_partition_list_multi() {
+    let (store, state) = make_test_store_and_state(&[
+        ("tablepath/part1=p1v1/file.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100),
+        ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100),
+    ]);
+    let filter1 = Expr::eq(col("part1"), lit("p1v2"));
+    let filter2 = Expr::eq(col("part2"), lit("p2v1"));
+    let pruned = pruned_partition_list(
+        state.as_ref(),
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        &[filter1, filter2],
+        ".parquet",
+        &[
+            (String::from("part1"), DataType::Utf8),
+            (String::from("part2"), DataType::Utf8),
+        ],
+    )
+    .await
+    .expect("partition pruning failed")
+    .try_collect::<Vec<_>>()
+    .await
+    .unwrap();
+
+    assert_eq!(pruned.len(), 2);
+    let f1 = &pruned[0];
+    assert_eq!(
+        f1.object_meta.location.as_ref(),
+        "tablepath/part1=p1v2/part2=p2v1/file1.parquet"
+    );
+    assert_eq!(
+        &f1.partition_values,
+        &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1"),]
+    );
+    let f2 = &pruned[1];
+    assert_eq!(
+        f2.object_meta.location.as_ref(),
+        "tablepath/part1=p1v2/part2=p2v1/file2.parquet"
+    );
+    assert_eq!(
+        &f2.partition_values,
+        &[ScalarValue::from("p1v2"), ScalarValue::from("p2v1")]
+    );
+}
+
+#[tokio::test]
+async fn test_list_partition() {
+    let (store, _) = make_test_store_and_state(&[
+        ("tablepath/part1=p1v1/file.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100),
+        ("tablepath/part1=p1v3/part2=p2v1/file3.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v2/file4.parquet", 100),
+        ("tablepath/part1=p1v2/part2=p2v2/empty.parquet", 0),
+    ]);
+
+    let partitions = list_partitions(
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        0,
+        None,
+    )
+    .await
+    .expect("listing partitions failed");
+
+    assert_eq!(
+        &partitions
+            .iter()
+            .map(describe_partition)
+            .collect::<Vec<_>>(),
+        &vec![
+            ("tablepath", 0, vec![]),
+            ("tablepath/part1=p1v1", 1, vec![]),
+            ("tablepath/part1=p1v2", 1, vec![]),
+            ("tablepath/part1=p1v3", 1, vec![]),
+        ]
+    );
+
+    let partitions = list_partitions(
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        1,
+        None,
+    )
+    .await
+    .expect("listing partitions failed");
+
+    assert_eq!(
+        &partitions
+            .iter()
+            .map(describe_partition)
+            .collect::<Vec<_>>(),
+        &vec![
+            ("tablepath", 0, vec![]),
+            ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
+            ("tablepath/part1=p1v2", 1, vec![]),
+            ("tablepath/part1=p1v2/part2=p2v1", 2, vec![]),
+            ("tablepath/part1=p1v2/part2=p2v2", 2, vec![]),
+            ("tablepath/part1=p1v3", 1, vec![]),
+            ("tablepath/part1=p1v3/part2=p2v1", 2, vec![]),
+        ]
+    );
+
+    let partitions = list_partitions(
+        store.as_ref(),
+        &ListingTableUrl::parse("file:///tablepath/").unwrap(),
+        2,
+        None,
+    )
+    .await
+    .expect("listing partitions failed");
+
+    assert_eq!(
+        &partitions
+            .iter()
+            .map(describe_partition)
+            .collect::<Vec<_>>(),
+        &vec![
+            ("tablepath", 0, vec![]),
+            ("tablepath/part1=p1v1", 1, vec!["file.parquet"]),
+            ("tablepath/part1=p1v2", 1, vec![]),
+            ("tablepath/part1=p1v3", 1, vec![]),
+            (
+                "tablepath/part1=p1v2/part2=p2v1",
+                2,
+                vec!["file1.parquet", "file2.parquet"]
+            ),
+            ("tablepath/part1=p1v2/part2=p2v2", 2, vec!["file4.parquet"]),
+            ("tablepath/part1=p1v3/part2=p2v1", 2, vec!["file3.parquet"]),
+        ]
+    );
+}
+
+pub fn make_test_store_and_state(
+    files: &[(&str, u64)],
+) -> (Arc<InMemory>, Arc<dyn Session>) {
+    let memory = InMemory::new();
+
+    for (name, size) in files {
+        memory
+            .put(&Path::from(*name), vec![0; *size as usize].into())
+            .now_or_never()
+            .unwrap()
+            .unwrap();
+    }
+
+    let state = SessionStateBuilder::new().build();
+    (Arc::new(memory), Arc::new(state))
+}
diff --git a/datafusion/core/tests/config_from_env.rs b/datafusion/core/tests/config_from_env.rs
index 976597c8a9ac5..6375d4e25d8eb 100644
--- a/datafusion/core/tests/config_from_env.rs
+++ b/datafusion/core/tests/config_from_env.rs
@@ -20,35 +20,43 @@ use std::env;
 
 #[test]
 fn from_env() {
-    // Note: these must be a single test to avoid interference from concurrent execution
-    let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS";
-    // valid testing in different cases
-    for bool_option in ["true", "TRUE", "True", "tRUe"] {
-        env::set_var(env_key, bool_option);
-        let config = ConfigOptions::from_env().unwrap();
-        env::remove_var(env_key);
-        assert!(config.optimizer.filter_null_join_keys);
-    }
+    unsafe {
+        // Note: these must be a single test to avoid interference from concurrent execution
+        let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS";
+        // valid testing in different cases
+        for bool_option in ["true", "TRUE", "True", "tRUe"] {
+            env::set_var(env_key, bool_option);
+            let config = ConfigOptions::from_env().unwrap();
+            env::remove_var(env_key);
+            assert!(config.optimizer.filter_null_join_keys);
+        }
 
-    // invalid testing
-    env::set_var(env_key, "ttruee");
-    let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
-    assert_eq!(err, "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`");
-    env::remove_var(env_key);
+        // invalid testing
+        env::set_var(env_key, "ttruee");
+        let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
+        assert_eq!(
+            err,
+            "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`"
+        );
+        env::remove_var(env_key);
 
-    let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";
+        let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";
 
-    // for valid testing
-    env::set_var(env_key, "4096");
-    let config = ConfigOptions::from_env().unwrap();
-    assert_eq!(config.execution.batch_size, 4096);
+        // for valid testing
+        env::set_var(env_key, "4096");
+        let config = ConfigOptions::from_env().unwrap();
+        assert_eq!(config.execution.batch_size, 4096);
 
-    // for invalid testing
-    env::set_var(env_key, "abc");
-    let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
-    assert_eq!(err, "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string");
+        // for invalid testing
+        env::set_var(env_key, "abc");
+        let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
+        assert_eq!(
+            err,
+            "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string"
+        );
 
-    env::remove_var(env_key);
-    let config = ConfigOptions::from_env().unwrap();
-    assert_eq!(config.execution.batch_size, 8192); // set to its default value
+        env::remove_var(env_key);
+        let config = ConfigOptions::from_env().unwrap();
+        assert_eq!(config.execution.batch_size, 8192); // set to its default value
+    }
 }
diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs
index 250538b133703..bdbe72245323d 100644
--- a/datafusion/core/tests/core_integration.rs
+++ b/datafusion/core/tests/core_integration.rs
@@ -21,6 +21,9 @@ mod sql;
 /// Run all tests that are found in the `dataframe` directory
 mod dataframe;
 
+/// Run all tests that are found in the `datasource` directory
+mod datasource;
+
 /// Run all tests that are found in the `macro_hygiene` directory
 mod macro_hygiene;
 
@@ -51,6 +54,9 @@ mod serde;
 /// Run all tests that are found in the `catalog` directory
 mod catalog;
 
+/// Run all tests that are found in the `catalog_listing` directory
+mod catalog_listing;
+
 /// Run all tests that are found in the `tracing` directory
 mod tracing;
 
diff --git a/datafusion/core/tests/custom_sources_cases/dml_planning.rs b/datafusion/core/tests/custom_sources_cases/dml_planning.rs
new file mode 100644
index 0000000000000..8c4bae5e98b36
--- /dev/null
+++ b/datafusion/core/tests/custom_sources_cases/dml_planning.rs
@@ -0,0 +1,819 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for DELETE, UPDATE, and TRUNCATE planning to verify filter and assignment extraction.
+
+use std::any::Any;
+use std::sync::{Arc, Mutex};
+
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use async_trait::async_trait;
+use datafusion::datasource::{TableProvider, TableType};
+use datafusion::error::Result;
+use datafusion::execution::context::{SessionConfig, SessionContext};
+use datafusion::logical_expr::{
+    Expr, LogicalPlan, TableProviderFilterPushDown, TableScan,
+};
+use datafusion_catalog::Session;
+use datafusion_common::ScalarValue;
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::empty::EmptyExec;
+
+/// A TableProvider that captures the filters passed to delete_from().
+struct CaptureDeleteProvider {
+    schema: SchemaRef,
+    received_filters: Arc<Mutex<Option<Vec<Expr>>>>,
+    filter_pushdown: TableProviderFilterPushDown,
+    per_filter_pushdown: Option<Vec<TableProviderFilterPushDown>>,
+}
+
+impl CaptureDeleteProvider {
+    fn new(schema: SchemaRef) -> Self {
+        Self {
+            schema,
+            received_filters: Arc::new(Mutex::new(None)),
+            filter_pushdown: TableProviderFilterPushDown::Unsupported,
+            per_filter_pushdown: None,
+        }
+    }
+
+    fn new_with_filter_pushdown(
+        schema: SchemaRef,
+        filter_pushdown: TableProviderFilterPushDown,
+    ) -> Self {
+        Self {
+            schema,
+            received_filters: Arc::new(Mutex::new(None)),
+            filter_pushdown,
+            per_filter_pushdown: None,
+        }
+    }
+
+    fn new_with_per_filter_pushdown(
+        schema: SchemaRef,
+        per_filter_pushdown: Vec<TableProviderFilterPushDown>,
+    ) -> Self {
+        Self {
+            schema,
+            received_filters: Arc::new(Mutex::new(None)),
+            filter_pushdown: TableProviderFilterPushDown::Unsupported,
+            per_filter_pushdown: Some(per_filter_pushdown),
+        }
+    }
+
+    fn captured_filters(&self) -> Option<Vec<Expr>> {
+        self.received_filters.lock().unwrap().clone()
+    }
+}
+
+impl std::fmt::Debug for CaptureDeleteProvider {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CaptureDeleteProvider")
+            .field("schema", &self.schema)
+            .finish()
+    }
+}
+
+#[async_trait]
+impl TableProvider for CaptureDeleteProvider {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        _projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema))))
+    }
+
+    async fn delete_from(
+        &self,
+        _state: &dyn Session,
+        filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        *self.received_filters.lock().unwrap() = Some(filters);
+        Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![
+            Field::new("count", DataType::UInt64, false),
+        ])))))
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        if let Some(per_filter) = &self.per_filter_pushdown
+            && per_filter.len() == filters.len()
+        {
+            return Ok(per_filter.clone());
+        }
+
+        Ok(vec![self.filter_pushdown.clone(); filters.len()])
+    }
+}
+
+/// A TableProvider that captures filters and assignments passed to update().
+#[expect(clippy::type_complexity)]
+struct CaptureUpdateProvider {
+    schema: SchemaRef,
+    received_filters: Arc<Mutex<Option<Vec<Expr>>>>,
+    received_assignments: Arc<Mutex<Option<Vec<(String, Expr)>>>>,
+    filter_pushdown: TableProviderFilterPushDown,
+    per_filter_pushdown: Option<Vec<TableProviderFilterPushDown>>,
+}
+
+impl CaptureUpdateProvider {
+    fn new(schema: SchemaRef) -> Self {
+        Self {
+            schema,
+            received_filters: Arc::new(Mutex::new(None)),
+            received_assignments: Arc::new(Mutex::new(None)),
+            filter_pushdown: TableProviderFilterPushDown::Unsupported,
+            per_filter_pushdown: None,
+        }
+    }
+
+    fn new_with_filter_pushdown(
+        schema: SchemaRef,
+        filter_pushdown: TableProviderFilterPushDown,
+    ) -> Self {
+        Self {
+            schema,
+            received_filters: Arc::new(Mutex::new(None)),
+            received_assignments: Arc::new(Mutex::new(None)),
+            filter_pushdown,
+            per_filter_pushdown: None,
+        }
+    }
+
+    fn captured_filters(&self) -> Option<Vec<Expr>> {
+        self.received_filters.lock().unwrap().clone()
+    }
+
+    fn captured_assignments(&self) -> Option<Vec<(String, Expr)>> {
+        self.received_assignments.lock().unwrap().clone()
+    }
+}
+
+impl std::fmt::Debug for CaptureUpdateProvider {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CaptureUpdateProvider")
+            .field("schema", &self.schema)
+            .finish()
+    }
+}
+
+#[async_trait]
+impl TableProvider for CaptureUpdateProvider {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        _projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema))))
+    }
+
+    async fn update(
+        &self,
+        _state: &dyn Session,
+        assignments: Vec<(String, Expr)>,
+        filters: Vec<Expr>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        *self.received_filters.lock().unwrap() = Some(filters);
+        *self.received_assignments.lock().unwrap() = Some(assignments);
+        Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![
+            Field::new("count", DataType::UInt64, false),
+        ])))))
+    }
+
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        if let Some(per_filter) = &self.per_filter_pushdown
+            && per_filter.len() == filters.len()
+        {
+            return Ok(per_filter.clone());
+        }
+
+        Ok(vec![self.filter_pushdown.clone(); filters.len()])
+    }
+}
+
+/// A TableProvider that captures whether truncate() was called.
+struct CaptureTruncateProvider {
+    schema: SchemaRef,
+    truncate_called: Arc<Mutex<bool>>,
+}
+
+impl CaptureTruncateProvider {
+    fn new(schema: SchemaRef) -> Self {
+        Self {
+            schema,
+            truncate_called: Arc::new(Mutex::new(false)),
+        }
+    }
+
+    fn was_truncated(&self) -> bool {
+        *self.truncate_called.lock().unwrap()
+    }
+}
+
+impl std::fmt::Debug for CaptureTruncateProvider {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CaptureTruncateProvider")
+            .field("schema", &self.schema)
+            .finish()
+    }
+}
+
+#[async_trait]
+impl TableProvider for CaptureTruncateProvider {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &dyn Session,
+        _projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(EmptyExec::new(Arc::clone(&self.schema))))
+    }
+
+    async fn truncate(&self, _state: &dyn Session) -> Result<Arc<dyn ExecutionPlan>> {
+        *self.truncate_called.lock().unwrap() = true;
+
+        Ok(Arc::new(EmptyExec::new(Arc::new(Schema::new(vec![
+            Field::new("count", DataType::UInt64, false),
+        ])))))
+    }
+}
+
+fn test_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("status", DataType::Utf8, true),
+        Field::new("value", DataType::Int32, true),
+    ]))
+}
+
+#[tokio::test]
+async fn test_delete_single_filter() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new(test_schema()));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("DELETE FROM t WHERE id = 1")
+        .await?
+        .collect()
+        .await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 1);
+    assert!(filters[0].to_string().contains("id"));
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_multiple_filters() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new(test_schema()));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'x'")
+        .await?
+        .collect()
+        .await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(!filters.is_empty());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_no_filters() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new(test_schema()));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("DELETE FROM t").await?.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(
+        filters.is_empty(),
+        "DELETE without WHERE should have empty filters"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_complex_expr() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new(test_schema()));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("DELETE FROM t WHERE id > 5 AND (status = 'a' OR status = 'b')")
+        .await?
+        .collect()
+        .await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(!filters.is_empty());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_filter_pushdown_extracts_table_scan_filters() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    let df = ctx.sql("DELETE FROM t WHERE id = 1").await?;
+    let optimized_plan = df.clone().into_optimized_plan()?;
+
+    let mut scan_filters = Vec::new();
+    optimized_plan.apply(|node| {
+        if let LogicalPlan::TableScan(TableScan { filters, .. }) = node {
+            scan_filters.extend(filters.clone());
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+
+    assert_eq!(scan_filters.len(), 1);
+    assert!(scan_filters[0].to_string().contains("id"));
+
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 1);
+    assert!(filters[0].to_string().contains("id"));
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_compound_filters_with_pushdown() -> Result<()> {
+    let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'active'")
+        .await?
+        .collect()
+        .await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    // Should receive both filters, not deduplicate valid separate predicates
+    assert_eq!(
+        filters.len(),
+        2,
+        "compound filters should not be over-suppressed"
+    );
+
+    let filter_strs: Vec<String> = filters.iter().map(|f| f.to_string()).collect();
+    assert!(
+        filter_strs.iter().any(|s| s.contains("id")),
+        "should contain id filter"
+    );
+    assert!(
+        filter_strs.iter().any(|s| s.contains("status")),
+        "should contain status filter"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_mixed_filter_locations() -> Result<()> {
+    // Test mixed-location filters: some in Filter node, some in TableScan.filters
+    // This happens when provider uses TableProviderFilterPushDown::Inexact,
+    // meaning it can push down some predicates but not others.
+    let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Inexact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    // Execute DELETE with compound WHERE clause
+    ctx.sql("DELETE FROM t WHERE id = 1 AND status = 'active'")
+        .await?
+        .collect()
+        .await?;
+
+    // Verify that both predicates are extracted and passed to delete_from(),
+    // even though they may be split between Filter node and TableScan.filters
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(
+        filters.len(),
+        2,
+        "should extract both predicates (union of Filter and TableScan.filters)"
+    );
+
+    let filter_strs: Vec<String> = filters.iter().map(|f| f.to_string()).collect();
+    assert!(
+        filter_strs.iter().any(|s| s.contains("id")),
+        "should contain id filter"
+    );
+    assert!(
+        filter_strs.iter().any(|s| s.contains("status")),
+        "should contain status filter"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_per_filter_pushdown_mixed_locations() -> Result<()> {
+    // Force per-filter pushdown decisions to exercise mixed locations in one query.
+    // First predicate is pushed down (Exact), second stays as residual (Unsupported).
+    let provider = Arc::new(CaptureDeleteProvider::new_with_per_filter_pushdown(
+        test_schema(),
+        vec![
+            TableProviderFilterPushDown::Exact,
+            TableProviderFilterPushDown::Unsupported,
+        ],
+    ));
+
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    let df = ctx
+        .sql("DELETE FROM t WHERE id = 1 AND status = 'active'")
+        .await?;
+    let optimized_plan = df.clone().into_optimized_plan()?;
+
+    // Only the first predicate should be pushed to TableScan.filters.
+    let mut scan_filters = Vec::new();
+    optimized_plan.apply(|node| {
+        if let LogicalPlan::TableScan(TableScan { filters, .. }) = node {
+            scan_filters.extend(filters.clone());
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+    assert_eq!(scan_filters.len(), 1);
+    assert!(scan_filters[0].to_string().contains("id"));
+
+    // Both predicates should still reach the provider (union + dedup behavior).
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 2);
+
+    let filter_strs: Vec<String> = filters.iter().map(|f| f.to_string()).collect();
+    assert!(
+        filter_strs.iter().any(|s| s.contains("id")),
+        "should contain pushed-down id filter"
+    );
+    assert!(
+        filter_strs.iter().any(|s| s.contains("status")),
+        "should contain residual status filter"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_update_assignments() -> Result<()> {
+    let provider = Arc::new(CaptureUpdateProvider::new(test_schema()));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("UPDATE t SET value = 100, status = 'updated' WHERE id = 5")
+        .await?
+        .collect()
+        .await?;
+
+    let assignments = provider
+        .captured_assignments()
+        .expect("assignments should be captured");
+    assert_eq!(assignments.len(), 2, "should have 2 assignments");
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(!filters.is_empty(), "should have filter for WHERE clause");
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_update_filter_pushdown_extracts_table_scan_filters() -> Result<()> {
+    let provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    let df = ctx.sql("UPDATE t SET value = 100 WHERE id = 1").await?;
+    let optimized_plan = df.clone().into_optimized_plan()?;
+
+    // Verify that the optimizer pushed down the filter into TableScan
+    let mut scan_filters = Vec::new();
+    optimized_plan.apply(|node| {
+        if let LogicalPlan::TableScan(TableScan { filters, .. }) = node {
+            scan_filters.extend(filters.clone());
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+
+    assert_eq!(scan_filters.len(), 1);
+    assert!(scan_filters[0].to_string().contains("id"));
+
+    // Execute the UPDATE and verify filters were extracted and passed to update()
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 1);
+    assert!(filters[0].to_string().contains("id"));
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_update_filter_pushdown_passes_table_scan_filters() -> Result<()> {
+    let provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    let df = ctx
+        .sql("UPDATE t SET value = 42 WHERE status = 'ready'")
+        .await?;
+    let optimized_plan = df.clone().into_optimized_plan()?;
+
+    let mut scan_filters = Vec::new();
+    optimized_plan.apply(|node| {
+        if let LogicalPlan::TableScan(TableScan { filters, .. }) = node {
+            scan_filters.extend(filters.clone());
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })?;
+
+    assert!(
+        !scan_filters.is_empty(),
+        "expected filter pushdown to populate TableScan filters"
+    );
+
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(
+        !filters.is_empty(),
+        "expected filters extracted from TableScan during UPDATE"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_truncate_calls_provider() -> Result<()> {
+    let provider = Arc::new(CaptureTruncateProvider::new(test_schema()));
+    let config = SessionConfig::new().set(
+        "datafusion.optimizer.max_passes",
+        &ScalarValue::UInt64(Some(0)),
+    );
+
+    let ctx = SessionContext::new_with_config(config);
+
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    ctx.sql("TRUNCATE TABLE t").await?.collect().await?;
+
+    assert!(
+        provider.was_truncated(),
+        "truncate() should be called on the TableProvider"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_unsupported_table_delete() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("DELETE FROM empty_t WHERE id = 1").await;
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_unsupported_table_update() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("UPDATE empty_t SET value = 1 WHERE id = 1").await;
+
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_target_table_scoping() -> Result<()> {
+    // Test that DELETE only extracts filters from the target table,
+    // not from other tables (important for DELETE...FROM safety)
+    let target_provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table(
+        "target_t",
+        Arc::clone(&target_provider) as Arc<dyn TableProvider>,
+    )?;
+
+    // For now, we test single-table DELETE
+    // and validate that the scoping logic is correct
+    let df = ctx.sql("DELETE FROM target_t WHERE id > 5").await?;
+    df.collect().await?;
+
+    let filters = target_provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert_eq!(filters.len(), 1);
+    assert!(
+        filters[0].to_string().contains("id"),
+        "Filter should be for id column"
+    );
+    assert!(
+        filters[0].to_string().contains("5"),
+        "Filter should contain the value 5"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_update_from_drops_non_target_predicates() -> Result<()> {
+    // UPDATE ... FROM is currently not working
+    // TODO fix https://github.com/apache/datafusion/issues/19950
+    let target_provider = Arc::new(CaptureUpdateProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t1", Arc::clone(&target_provider) as Arc<dyn TableProvider>)?;
+
+    let source_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("status", DataType::Utf8, true),
+        // t2-only column to avoid false negatives after qualifier stripping
+        Field::new("src_only", DataType::Utf8, true),
+    ]));
+    let source_table = datafusion::datasource::empty::EmptyTable::new(source_schema);
+    ctx.register_table("t2", Arc::new(source_table))?;
+
+    let result = ctx
+        .sql(
+            "UPDATE t1 SET value = 1 FROM t2 \
+             WHERE t1.id = t2.id AND t2.src_only = 'active' AND t1.value > 10",
+        )
+        .await;
+
+    // Verify UPDATE ... FROM is rejected with appropriate error
+    // TODO fix https://github.com/apache/datafusion/issues/19950
+    assert!(result.is_err());
+    let err = result.unwrap_err();
+    assert!(
+        err.to_string().contains("UPDATE ... FROM is not supported"),
+        "Expected 'UPDATE ... FROM is not supported' error, got: {err}"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_delete_qualifier_stripping_and_validation() -> Result<()> {
+    // Test that filter qualifiers are properly stripped and validated
+    // Unqualified predicates should work fine
+    let provider = Arc::new(CaptureDeleteProvider::new_with_filter_pushdown(
+        test_schema(),
+        TableProviderFilterPushDown::Exact,
+    ));
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::clone(&provider) as Arc<dyn TableProvider>)?;
+
+    // Execute DELETE with unqualified column reference
+    // (After parsing, the planner adds qualifiers, but our validation should accept them)
+    let df = ctx.sql("DELETE FROM t WHERE id = 1").await?;
+    df.collect().await?;
+
+    let filters = provider
+        .captured_filters()
+        .expect("filters should be captured");
+    assert!(!filters.is_empty(), "Should have extracted filter");
+
+    // Verify qualifiers are stripped: check that Column expressions have no qualifier
+    let has_qualified_column = filters[0]
+        .exists(|expr| Ok(matches!(expr, Expr::Column(col) if col.relation.is_some())))?;
+    assert!(
+        !has_qualified_column,
+        "Filter should have unqualified columns after stripping"
+    );
+
+    // Also verify the string representation doesn't contain table qualifiers
+    let filter_str = filters[0].to_string();
+    assert!(
+        !filter_str.contains("t.id"),
+        "Filter should not contain qualified column reference, got: {filter_str}"
+    );
+    assert!(
+        filter_str.contains("id") || filter_str.contains("1"),
+        "Filter should reference id column or the value 1, got: {filter_str}"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_unsupported_table_truncate() -> Result<()> {
+    let schema = test_schema();
+    let ctx = SessionContext::new();
+
+    let empty_table = datafusion::datasource::empty::EmptyTable::new(schema);
+    ctx.register_table("empty_t", Arc::new(empty_table))?;
+
+    let result = ctx.sql("TRUNCATE TABLE empty_t").await;
+
+    assert!(result.is_err() || result.unwrap().collect().await.is_err());
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/custom_sources_cases/mod.rs b/datafusion/core/tests/custom_sources_cases/mod.rs
index cbdc4a448ea41..6919d9794b29e 100644
--- a/datafusion/core/tests/custom_sources_cases/mod.rs
+++ b/datafusion/core/tests/custom_sources_cases/mod.rs
@@ -28,25 +28,27 @@ use datafusion::datasource::{TableProvider, TableType};
 use datafusion::error::Result;
 use datafusion::execution::context::{SessionContext, TaskContext};
 use datafusion::logical_expr::{
-    col, Expr, LogicalPlan, LogicalPlanBuilder, TableScan, UNNAMED_TABLE,
+    Expr, LogicalPlan, LogicalPlanBuilder, TableScan, UNNAMED_TABLE, col,
 };
 use datafusion::physical_plan::{
-    collect, ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    RecordBatchStream, SendableRecordBatchStream, Statistics,
+    ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
+    RecordBatchStream, SendableRecordBatchStream, Statistics, collect,
 };
 use datafusion::scalar::ScalarValue;
 use datafusion_catalog::Session;
 use datafusion_common::cast::as_primitive_array;
 use datafusion_common::project_schema;
 use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_plan::PlanProperties;
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
-use datafusion_physical_plan::PlanProperties;
 
 use async_trait::async_trait;
 use futures::stream::Stream;
 
+mod dml_planning;
 mod provider_filter_pushdown;
 mod statistics;
 
@@ -78,7 +80,7 @@ struct CustomTableProvider;
 #[derive(Debug, Clone)]
 struct CustomExecutionPlan {
     projection: Option<Vec<usize>>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomExecutionPlan {
@@ -87,7 +89,10 @@ impl CustomExecutionPlan {
         let schema =
             project_schema(&schema, projection.as_ref()).expect("projected schema");
         let cache = Self::compute_properties(schema);
-        Self { projection, cache }
+        Self {
+            projection,
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -156,7 +161,7 @@ impl ExecutionPlan for CustomExecutionPlan {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -179,16 +184,12 @@ impl ExecutionPlan for CustomExecutionPlan {
         Ok(Box::pin(TestCustomRecordBatchStream { nb_batch: 1 }))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
+            return Ok(Arc::new(Statistics::new_unknown(&self.schema())));
         }
         let batch = TEST_CUSTOM_RECORD_BATCH!().unwrap();
-        Ok(Statistics {
+        Ok(Arc::new(Statistics {
             num_rows: Precision::Exact(batch.num_rows()),
             total_byte_size: Precision::Absent,
             column_statistics: self
@@ -207,7 +208,23 @@ impl ExecutionPlan for CustomExecutionPlan {
                     ..Default::default()
                 })
                 .collect(),
-        })
+        }))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
@@ -316,6 +333,7 @@ async fn optimizers_catch_all_statistics() {
     assert_eq!(format!("{:?}", actual[0]), format!("{expected:?}"));
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn contains_place_holder_exec(plan: Arc<dyn ExecutionPlan>) -> bool {
     if plan.as_any().is::<PlaceholderRowExec>() {
         true
diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
index f68bcfaf15507..8078b0a7ec158 100644
--- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
+++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
@@ -29,13 +29,14 @@ use datafusion::logical_expr::TableProviderFilterPushDown;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
-    SendableRecordBatchStream, Statistics,
+    SendableRecordBatchStream,
 };
 use datafusion::prelude::*;
 use datafusion::scalar::ScalarValue;
 use datafusion_catalog::Session;
 use datafusion_common::cast::as_primitive_array;
-use datafusion_common::{internal_err, not_impl_err};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, internal_err, not_impl_err};
 use datafusion_expr::expr::{BinaryExpr, Cast};
 use datafusion_functions_aggregate::expr_fn::count;
 use datafusion_physical_expr::EquivalenceProperties;
@@ -62,13 +63,16 @@ fn create_batch(value: i32, num_rows: usize) -> Result<RecordBatch> {
 #[derive(Debug)]
 struct CustomPlan {
     batches: Vec<RecordBatch>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CustomPlan {
     fn new(schema: SchemaRef, batches: Vec<RecordBatch>) -> Self {
         let cache = Self::compute_properties(schema);
-        Self { batches, cache }
+        Self {
+            batches,
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -109,7 +113,7 @@ impl ExecutionPlan for CustomPlan {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -134,16 +138,36 @@ impl ExecutionPlan for CustomPlan {
         _partition: usize,
         _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
+        let schema_captured = self.schema().clone();
         Ok(Box::pin(RecordBatchStreamAdapter::new(
             self.schema(),
-            futures::stream::iter(self.batches.clone().into_iter().map(Ok)),
+            futures::stream::iter(self.batches.clone().into_iter().map(move |batch| {
+                let projection: Vec<usize> = schema_captured
+                    .fields()
+                    .iter()
+                    .filter_map(|field| batch.schema().index_of(field.name()).ok())
+                    .collect();
+                batch
+                    .project(&projection)
+                    .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))
+            })),
         )))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        // here we could provide more accurate statistics
-        // but we want to test the filter pushdown not the CBOs
-        Ok(Statistics::new_unknown(&self.schema()))
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
@@ -179,12 +203,12 @@ impl TableProvider for CustomProvider {
         match &filters[0] {
             Expr::BinaryExpr(BinaryExpr { right, .. }) => {
                 let int_value = match &**right {
-                    Expr::Literal(ScalarValue::Int8(Some(i))) => *i as i64,
-                    Expr::Literal(ScalarValue::Int16(Some(i))) => *i as i64,
-                    Expr::Literal(ScalarValue::Int32(Some(i))) => *i as i64,
-                    Expr::Literal(ScalarValue::Int64(Some(i))) => *i,
-                    Expr::Cast(Cast { expr, data_type: _ }) => match expr.deref() {
-                        Expr::Literal(lit_value) => match lit_value {
+                    Expr::Literal(ScalarValue::Int8(Some(i)), _) => *i as i64,
+                    Expr::Literal(ScalarValue::Int16(Some(i)), _) => *i as i64,
+                    Expr::Literal(ScalarValue::Int32(Some(i)), _) => *i as i64,
+                    Expr::Literal(ScalarValue::Int64(Some(i)), _) => *i,
+                    Expr::Cast(Cast { expr, field: _ }) => match expr.deref() {
+                        Expr::Literal(lit_value, _) => match lit_value {
                             ScalarValue::Int8(Some(v)) => *v as i64,
                             ScalarValue::Int16(Some(v)) => *v as i64,
                             ScalarValue::Int32(Some(v)) => *v as i64,
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs
index f9b0db0e808c0..561c6b3b246ff 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -33,6 +33,7 @@ use datafusion::{
     scalar::ScalarValue,
 };
 use datafusion_catalog::Session;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{project_schema, stats::Precision};
 use datafusion_physical_expr::EquivalenceProperties;
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
@@ -45,7 +46,7 @@ use async_trait::async_trait;
 struct StatisticsValidation {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl StatisticsValidation {
@@ -59,7 +60,7 @@ impl StatisticsValidation {
         Self {
             stats,
             schema,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -158,7 +159,7 @@ impl ExecutionPlan for StatisticsValidation {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -181,16 +182,28 @@ impl ExecutionPlan for StatisticsValidation {
         unimplemented!("This plan only serves for testing statistics")
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.stats.clone())
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            Ok(Statistics::new_unknown(&self.schema))
+            Ok(Arc::new(Statistics::new_unknown(&self.schema)))
         } else {
-            Ok(self.stats.clone())
+            Ok(Arc::new(self.stats.clone()))
+        }
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
         }
+        Ok(tnr)
     }
 }
 
@@ -214,6 +227,7 @@ fn fully_defined() -> (Statistics, Schema) {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(13),
@@ -221,6 +235,7 @@ fn fully_defined() -> (Statistics, Schema) {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-6783))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(5),
+                    byte_size: Precision::Absent,
                 },
             ],
         },
@@ -240,7 +255,7 @@ async fn sql_basic() -> Result<()> {
     let physical_plan = df.create_physical_plan().await.unwrap();
 
     // the statistics should be those of the source
-    assert_eq!(stats, physical_plan.partition_statistics(None)?);
+    assert_eq!(stats, *physical_plan.partition_statistics(None)?);
 
     Ok(())
 }
@@ -265,20 +280,22 @@ async fn sql_filter() -> Result<()> {
 #[tokio::test]
 async fn sql_limit() -> Result<()> {
     let (stats, schema) = fully_defined();
-    let col_stats = Statistics::unknown_column(&schema);
     let ctx = init_ctx(stats.clone(), schema)?;
 
     let df = ctx.sql("SELECT * FROM stats_table LIMIT 5").await.unwrap();
     let physical_plan = df.create_physical_plan().await.unwrap();
-    // when the limit is smaller than the original number of lines
-    // we loose all statistics except the for number of rows which becomes the limit
+    // when the limit is smaller than the original number of lines we mark the statistics as inexact
     assert_eq!(
         Statistics {
             num_rows: Precision::Exact(5),
-            column_statistics: col_stats,
+            column_statistics: stats
+                .column_statistics
+                .iter()
+                .map(|c| c.clone().to_inexact())
+                .collect(),
             total_byte_size: Precision::Absent
         },
-        physical_plan.partition_statistics(None)?
+        *physical_plan.partition_statistics(None)?
     );
 
     let df = ctx
@@ -287,7 +304,7 @@ async fn sql_limit() -> Result<()> {
         .unwrap();
     let physical_plan = df.create_physical_plan().await.unwrap();
     // when the limit is larger than the original number of lines, statistics remain unchanged
-    assert_eq!(stats, physical_plan.partition_statistics(None)?);
+    assert_eq!(stats, *physical_plan.partition_statistics(None)?);
 
     Ok(())
 }
@@ -307,7 +324,7 @@ async fn sql_window() -> Result<()> {
     let result = physical_plan.partition_statistics(None)?;
 
     assert_eq!(stats.num_rows, result.num_rows);
-    let col_stats = result.column_statistics;
+    let col_stats = &result.column_statistics;
     assert_eq!(2, col_stats.len());
     assert_eq!(stats.column_statistics[1], col_stats[0]);
 
diff --git a/datafusion/core/tests/data/empty_files/some_empty_with_header/a_empty.csv b/datafusion/core/tests/data/empty_files/some_empty_with_header/a_empty.csv
new file mode 100644
index 0000000000000..f1968a0906d09
--- /dev/null
+++ b/datafusion/core/tests/data/empty_files/some_empty_with_header/a_empty.csv
@@ -0,0 +1 @@
+c1,c2,c3
diff --git a/datafusion/core/tests/data/empty_files/some_empty_with_header/b.csv b/datafusion/core/tests/data/empty_files/some_empty_with_header/b.csv
new file mode 100644
index 0000000000000..ff596071444c3
--- /dev/null
+++ b/datafusion/core/tests/data/empty_files/some_empty_with_header/b.csv
@@ -0,0 +1,3 @@
+c1,c2,c3
+1,1,1
+2,2,2
diff --git a/datafusion/core/tests/data/empty_files/some_empty_with_header/c_nulls_column.csv b/datafusion/core/tests/data/empty_files/some_empty_with_header/c_nulls_column.csv
new file mode 100644
index 0000000000000..bf86844cb0293
--- /dev/null
+++ b/datafusion/core/tests/data/empty_files/some_empty_with_header/c_nulls_column.csv
@@ -0,0 +1,2 @@
+c1,c2,c3
+3,3,
diff --git a/datafusion/core/tests/data/json_array.json b/datafusion/core/tests/data/json_array.json
new file mode 100644
index 0000000000000..1a8716dbf4beb
--- /dev/null
+++ b/datafusion/core/tests/data/json_array.json
@@ -0,0 +1,5 @@
+[
+    {"a": 1, "b": "hello"},
+    {"a": 2, "b": "world"},
+    {"a": 3, "b": "test"}
+]
diff --git a/datafusion/core/tests/data/json_empty_array.json b/datafusion/core/tests/data/json_empty_array.json
new file mode 100644
index 0000000000000..fe51488c7066f
--- /dev/null
+++ b/datafusion/core/tests/data/json_empty_array.json
@@ -0,0 +1 @@
+[]
diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow
new file mode 100644
index 0000000000000..bad9e3de4a57f
Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=123/data.arrow differ
diff --git a/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow
new file mode 100644
index 0000000000000..4a07fbfa47f32
Binary files /dev/null and b/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow differ
diff --git a/datafusion/core/tests/data/recursive_cte/closure.csv b/datafusion/core/tests/data/recursive_cte/closure.csv
new file mode 100644
index 0000000000000..a31e2bfbf36b6
--- /dev/null
+++ b/datafusion/core/tests/data/recursive_cte/closure.csv
@@ -0,0 +1,6 @@
+start,end
+1,2
+2,3
+2,4
+2,4
+4,1
\ No newline at end of file
diff --git a/datafusion/core/tests/data/tpch_customer_small.parquet b/datafusion/core/tests/data/tpch_customer_small.parquet
new file mode 100644
index 0000000000000..3d5f73ef3a066
Binary files /dev/null and b/datafusion/core/tests/data/tpch_customer_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_lineitem_small.parquet b/datafusion/core/tests/data/tpch_lineitem_small.parquet
new file mode 100644
index 0000000000000..5e98706669d3b
Binary files /dev/null and b/datafusion/core/tests/data/tpch_lineitem_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_nation_small.parquet b/datafusion/core/tests/data/tpch_nation_small.parquet
new file mode 100644
index 0000000000000..99da99594cf89
Binary files /dev/null and b/datafusion/core/tests/data/tpch_nation_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_orders_small.parquet b/datafusion/core/tests/data/tpch_orders_small.parquet
new file mode 100644
index 0000000000000..79e043137caf6
Binary files /dev/null and b/datafusion/core/tests/data/tpch_orders_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_part_small.parquet b/datafusion/core/tests/data/tpch_part_small.parquet
new file mode 100644
index 0000000000000..d8e1d7d680aa2
Binary files /dev/null and b/datafusion/core/tests/data/tpch_part_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_partsupp_small.parquet b/datafusion/core/tests/data/tpch_partsupp_small.parquet
new file mode 100644
index 0000000000000..711d58dda7493
Binary files /dev/null and b/datafusion/core/tests/data/tpch_partsupp_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_region_small.parquet b/datafusion/core/tests/data/tpch_region_small.parquet
new file mode 100644
index 0000000000000..5e00a1f6da1d9
Binary files /dev/null and b/datafusion/core/tests/data/tpch_region_small.parquet differ
diff --git a/datafusion/core/tests/data/tpch_supplier_small.parquet b/datafusion/core/tests/data/tpch_supplier_small.parquet
new file mode 100644
index 0000000000000..18323395fcbed
Binary files /dev/null and b/datafusion/core/tests/data/tpch_supplier_small.parquet differ
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 40590d74ad910..014f356cd64cd 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{types::Int32Type, ListArray};
+use arrow::array::{ListArray, types::Int32Type};
 use arrow::datatypes::SchemaRef;
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::{
@@ -31,7 +31,7 @@ use datafusion::prelude::*;
 use datafusion_common::test_util::batches_to_string;
 use datafusion_common::{DFSchema, ScalarValue};
 use datafusion_expr::expr::Alias;
-use datafusion_expr::{table_scan, ExprSchemable, LogicalPlanBuilder};
+use datafusion_expr::{ExprSchemable, LogicalPlanBuilder, table_scan};
 use datafusion_functions_aggregate::expr_fn::{approx_median, approx_percentile_cont};
 use datafusion_functions_nested::map::map;
 use insta::assert_snapshot;
@@ -274,6 +274,33 @@ async fn test_nvl2() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn test_nvl2_short_circuit() -> Result<()> {
+    let expr = nvl2(
+        col("a"),
+        arrow_cast(lit("1"), lit("Int32")),
+        arrow_cast(col("a"), lit("Int32")),
+    );
+
+    let batches = get_batches(expr).await?;
+
+    assert_snapshot!(
+        batches_to_string(&batches),
+        @r#"
+    +-----------------------------------------------------------------------------------+
+    | nvl2(test.a,arrow_cast(Utf8("1"),Utf8("Int32")),arrow_cast(test.a,Utf8("Int32"))) |
+    +-----------------------------------------------------------------------------------+
+    | 1                                                                                 |
+    | 1                                                                                 |
+    | 1                                                                                 |
+    | 1                                                                                 |
+    +-----------------------------------------------------------------------------------+
+    "#
+    );
+
+    Ok(())
+}
 #[tokio::test]
 async fn test_fn_arrow_typeof() -> Result<()> {
     let expr = arrow_typeof(col("l"));
@@ -282,16 +309,16 @@ async fn test_fn_arrow_typeof() -> Result<()> {
 
     assert_snapshot!(
         batches_to_string(&batches),
-        @r#"
-    +------------------------------------------------------------------------------------------------------------------+
-    | arrow_typeof(test.l)                                                                                             |
-    +------------------------------------------------------------------------------------------------------------------+
-    | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |
-    | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |
-    | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |
-    | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) |
-    +------------------------------------------------------------------------------------------------------------------+
-    "#);
+        @r"
+    +----------------------+
+    | arrow_typeof(test.l) |
+    +----------------------+
+    | List(Int32)          |
+    | List(Int32)          |
+    | List(Int32)          |
+    | List(Int32)          |
+    +----------------------+
+    ");
 
     Ok(())
 }
@@ -1215,7 +1242,7 @@ async fn test_fn_decode() -> Result<()> {
     // Note that the decode function returns binary, and the default display of
     // binary is "hexadecimal" and therefore the output looks like decode did
     // nothing. So compare to a constant.
-    let df_schema = DFSchema::try_from(test_schema().as_ref().clone())?;
+    let df_schema = DFSchema::try_from(test_schema())?;
     let expr = decode(encode(col("a"), lit("hex")), lit("hex"))
         // need to cast to utf8 otherwise the default display of binary array is hex
         // so it looks like nothing is done
@@ -1316,3 +1343,28 @@ async fn test_count_wildcard() -> Result<()> {
 
     Ok(())
 }
+
+/// Call count wildcard with alias from dataframe API
+#[tokio::test]
+async fn test_count_wildcard_with_alias() -> Result<()> {
+    let df = create_test_table().await?;
+    let result_df = df.aggregate(vec![], vec![count_all().alias("total_count")])?;
+
+    let schema = result_df.schema();
+    assert_eq!(schema.fields().len(), 1);
+    assert_eq!(schema.field(0).name(), "total_count");
+    assert_eq!(*schema.field(0).data_type(), DataType::Int64);
+
+    let batches = result_df.collect().await?;
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), 1);
+
+    let count_array = batches[0]
+        .column(0)
+        .as_any()
+        .downcast_ref::<arrow::array::Int64Array>()
+        .unwrap();
+    assert_eq!(count_array.value(0), 4);
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs
index 9bd69dfa72b4c..c61fe4fed1615 100644
--- a/datafusion/core/tests/dataframe/describe.rs
+++ b/datafusion/core/tests/dataframe/describe.rs
@@ -17,7 +17,7 @@
 
 use datafusion::prelude::{ParquetReadOptions, SessionContext};
 use datafusion_common::test_util::batches_to_string;
-use datafusion_common::{test_util::parquet_test_data, Result};
+use datafusion_common::{Result, test_util::parquet_test_data};
 use insta::assert_snapshot;
 
 #[tokio::test]
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 089ff8808134d..80bbde1f6ba14 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -20,30 +20,33 @@ mod dataframe_functions;
 mod describe;
 
 use arrow::array::{
-    record_batch, Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray,
-    FixedSizeListBuilder, Float32Array, Float64Array, Int32Array, Int32Builder,
-    Int8Array, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray,
-    StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray,
+    Array, ArrayRef, BooleanArray, DictionaryArray, FixedSizeListArray,
+    FixedSizeListBuilder, Float32Array, Float64Array, Int8Array, Int32Array,
+    Int32Builder, LargeListArray, ListArray, ListBuilder, RecordBatch, StringArray,
+    StringBuilder, StructBuilder, UInt32Array, UInt32Builder, UnionArray, record_batch,
 };
 use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::{
-    DataType, Field, Float32Type, Int32Type, Schema, SchemaRef, UInt64Type, UnionFields,
-    UnionMode,
+    DataType, Field, Float32Type, Int32Type, Schema, UInt64Type, UnionFields, UnionMode,
 };
 use arrow::error::ArrowError;
 use arrow::util::pretty::pretty_format_batches;
+use arrow_schema::{SortOptions, TimeUnit};
 use datafusion::{assert_batches_eq, dataframe};
+use datafusion_common::metadata::FieldMetadata;
 use datafusion_functions_aggregate::count::{count_all, count_all_window};
 use datafusion_functions_aggregate::expr_fn::{
-    array_agg, avg, count, count_distinct, max, median, min, sum,
+    array_agg, avg, avg_distinct, count, count_distinct, max, median, min, sum,
+    sum_distinct,
 };
 use datafusion_functions_nested::make_array::make_array_udf;
-use datafusion_functions_window::expr_fn::{first_value, row_number};
+use datafusion_functions_window::expr_fn::{first_value, lead, row_number};
 use insta::assert_snapshot;
 use object_store::local::LocalFileSystem;
-use sqlparser::ast::NullTreatment;
+use rstest::rstest;
 use std::collections::HashMap;
 use std::fs;
+use std::path::Path;
 use std::sync::Arc;
 use tempfile::TempDir;
 use url::Url;
@@ -54,34 +57,43 @@ use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion::logical_expr::{ColumnarValue, Volatility};
-use datafusion::prelude::{
-    CsvReadOptions, JoinType, NdJsonReadOptions, ParquetReadOptions,
-};
+use datafusion::prelude::{CsvReadOptions, JoinType, ParquetReadOptions};
 use datafusion::test_util::{
     parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table,
-    test_table_with_name,
+    test_table_with_cache_factory, test_table_with_name,
 };
 use datafusion_catalog::TableProvider;
 use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
 use datafusion_common::{
-    assert_contains, Constraint, Constraints, DataFusionError, ParamValues, ScalarValue,
-    TableReference, UnnestOptions,
+    Constraint, Constraints, DFSchema, DataFusionError, ScalarValue, SchemaError,
+    TableReference, UnnestOptions, assert_contains, internal_datafusion_err,
 };
 use datafusion_common_runtime::SpawnedTask;
+use datafusion_datasource::file_format::format_as_file_type;
 use datafusion_execution::config::SessionConfig;
 use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_expr::expr::{GroupingSet, Sort, WindowFunction};
+use datafusion_expr::expr::{GroupingSet, NullTreatment, Sort, WindowFunction};
 use datafusion_expr::var_provider::{VarProvider, VarType};
 use datafusion_expr::{
-    cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder,
-    scalar_subquery, when, wildcard, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan,
-    ScalarFunctionImplementation, WindowFrame, WindowFrameBound, WindowFrameUnits,
-    WindowFunctionDefinition,
+    Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, LogicalPlanBuilder,
+    ScalarFunctionImplementation, SortExpr, TableType, WindowFrame, WindowFrameBound,
+    WindowFrameUnits, WindowFunctionDefinition, cast, col, create_udf, exists,
+    in_subquery, lit, out_ref_col, placeholder, scalar_subquery, when, wildcard,
 };
-use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_plan::{displayable, ExecutionPlanProperties};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use datafusion_physical_plan::aggregates::{
+    AggregateExec, AggregateMode, PhysicalGroupBy,
+};
+use datafusion_physical_plan::empty::EmptyExec;
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable};
+
+use datafusion::error::Result as DataFusionResult;
+use datafusion::execution::options::JsonReadOptions;
+use datafusion_functions_window::expr_fn::lag;
 
 // Get string representation of the plan
 async fn physical_plan_to_string(df: &DataFrame) -> String {
@@ -91,8 +103,8 @@ async fn physical_plan_to_string(df: &DataFrame) -> String {
         .await
         .expect("Error creating physical plan");
 
-    let formated = displayable(physical_plan.as_ref()).indent(true);
-    formated.to_string()
+    let formatted = displayable(physical_plan.as_ref()).indent(true);
+    formatted.to_string()
 }
 
 pub fn table_with_constraints() -> Arc<dyn TableProvider> {
@@ -117,8 +129,7 @@ pub fn table_with_constraints() -> Arc<dyn TableProvider> {
 }
 
 async fn assert_logical_expr_schema_eq_physical_expr_schema(df: DataFrame) -> Result<()> {
-    let logical_expr_dfschema = df.schema();
-    let logical_expr_schema = SchemaRef::from(logical_expr_dfschema.to_owned());
+    let logical_expr_schema = Arc::clone(df.schema().inner());
     let batches = df.collect().await?;
     let physical_expr_schema = batches[0].schema();
     assert_eq!(logical_expr_schema, physical_expr_schema);
@@ -150,6 +161,46 @@ async fn test_array_agg_ord_schema() -> Result<()> {
     Ok(())
 }
 
+type WindowFnCase = (fn() -> Expr, &'static str);
+
+#[tokio::test]
+async fn with_column_window_functions() -> DataFusionResult<()> {
+    let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema.clone()),
+        vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
+    )?;
+
+    let ctx = SessionContext::new();
+
+    let provider = MemTable::try_new(Arc::new(schema), vec![vec![batch]])?;
+    ctx.register_table("t", Arc::new(provider))?;
+
+    // Define test cases: (expr builder, alias name)
+    let test_cases: Vec<WindowFnCase> = vec![
+        (|| lag(col("a"), Some(1), None), "lag_val"),
+        (|| lead(col("a"), Some(1), None), "lead_val"),
+        (row_number, "row_num"),
+    ];
+
+    for (make_expr, alias) in test_cases {
+        let df = ctx.table("t").await?;
+        let expr = make_expr();
+        let df_with = df.with_column(alias, expr)?;
+        let df_schema = df_with.schema().clone();
+
+        assert!(
+            df_schema.has_column_with_unqualified_name(alias),
+            "Schema does not contain expected column {alias}",
+        );
+
+        assert_eq!(2, df_schema.columns().len());
+    }
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_coalesce_schema() -> Result<()> {
     let ctx = SessionContext::new();
@@ -254,6 +305,27 @@ async fn select_columns() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn select_columns_with_nonexistent_columns() -> Result<()> {
+    let t = test_table().await?;
+    let t2 = t.select_columns(&["canada", "c2", "rocks"]);
+
+    match t2 {
+        Err(DataFusionError::SchemaError(boxed_err, _)) => {
+            // Verify it's the first invalid column
+            match boxed_err.as_ref() {
+                SchemaError::FieldNotFound { field, .. } => {
+                    assert_eq!(field.name(), "canada");
+                }
+                _ => panic!("Expected SchemaError::FieldNotFound for 'canada'"),
+            }
+        }
+        _ => panic!("Expected SchemaError"),
+    }
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn select_expr() -> Result<()> {
     // build plan using Table API
@@ -341,16 +413,65 @@ async fn select_with_periods() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +------+
     | f.c1 |
     +------+
     | 1    |
     | 10   |
     +------+
-    "###
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn select_columns_duplicated_names_from_different_qualifiers() -> Result<()> {
+    let t1 = test_table_with_name("t1")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(0, Some(3))?;
+    let t2 = test_table_with_name("t2")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(3, Some(3))?;
+    let t3 = test_table_with_name("t3")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(6, Some(3))?;
+
+    let join_res = t1
+        .join(t2, JoinType::Left, &["t1.c1"], &["t2.c1"], None)?
+        .join(t3, JoinType::Left, &["t1.c1"], &["t3.c1"], None)?;
+    assert_snapshot!(
+        batches_to_sort_string(&join_res.clone().collect().await.unwrap()),
+        @r"
+    +----+----+----+
+    | c1 | c1 | c1 |
+    +----+----+----+
+    | b  | b  |    |
+    | b  | b  |    |
+    | c  |    |    |
+    | d  |    | d  |
+    +----+----+----+
+    "
     );
 
+    let select_res = join_res.select_columns(&["c1"])?;
+    assert_snapshot!(
+        batches_to_sort_string(&select_res.clone().collect().await.unwrap()),
+        @r"
+    +----+----+----+
+    | c1 | c1 | c1 |
+    +----+----+----+
+    | b  | b  |    |
+    | b  | b  |    |
+    | c  |    |    |
+    | d  |    | d  |
+    +----+----+----+
+    "
+    );
     Ok(())
 }
 
@@ -413,7 +534,8 @@ async fn drop_columns_with_nonexistent_columns() -> Result<()> {
 async fn drop_columns_with_empty_array() -> Result<()> {
     // build plan using Table API
     let t = test_table().await?;
-    let t2 = t.drop_columns(&[])?;
+    let drop_columns = vec![] as Vec<&str>;
+    let t2 = t.drop_columns(&drop_columns)?;
     let plan = t2.logical_plan().clone();
 
     // build query using SQL
@@ -428,6 +550,107 @@ async fn drop_columns_with_empty_array() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn drop_columns_qualified() -> Result<()> {
+    // build plan using Table API
+    let mut t = test_table().await?;
+    t = t.select_columns(&["c1", "c2", "c11"])?;
+    let mut t2 = test_table_with_name("another_table").await?;
+    t2 = t2.select_columns(&["c1", "c2", "c11"])?;
+    let mut t3 = t.join_on(
+        t2,
+        JoinType::Inner,
+        [col("aggregate_test_100.c1").eq(col("another_table.c1"))],
+    )?;
+    t3 = t3.drop_columns(&["another_table.c2", "another_table.c11"])?;
+
+    let plan = t3.logical_plan().clone();
+
+    let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1";
+    let ctx = SessionContext::new();
+    register_aggregate_csv(&ctx, "aggregate_test_100").await?;
+    register_aggregate_csv(&ctx, "another_table").await?;
+    let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan();
+
+    // the two plans should be identical
+    assert_same_plan(&plan, &sql_plan);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn drop_columns_qualified_find_qualified() -> Result<()> {
+    // build plan using Table API
+    let mut t = test_table().await?;
+    t = t.select_columns(&["c1", "c2", "c11"])?;
+    let mut t2 = test_table_with_name("another_table").await?;
+    t2 = t2.select_columns(&["c1", "c2", "c11"])?;
+    let mut t3 = t.join_on(
+        t2.clone(),
+        JoinType::Inner,
+        [col("aggregate_test_100.c1").eq(col("another_table.c1"))],
+    )?;
+    t3 = t3.drop_columns(&t2.find_qualified_columns(&["c2", "c11"])?)?;
+
+    let plan = t3.logical_plan().clone();
+
+    let sql = "SELECT aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c11, another_table.c1 FROM (SELECT c1, c2, c11 FROM aggregate_test_100) INNER JOIN (SELECT c1, c2, c11 FROM another_table) ON aggregate_test_100.c1 = another_table.c1";
+    let ctx = SessionContext::new();
+    register_aggregate_csv(&ctx, "aggregate_test_100").await?;
+    register_aggregate_csv(&ctx, "another_table").await?;
+    let sql_plan = ctx.sql(sql).await?.into_unoptimized_plan();
+
+    // the two plans should be identical
+    assert_same_plan(&plan, &sql_plan);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_find_qualified_names() -> Result<()> {
+    let t = test_table().await?;
+    let column_names = ["c1", "c2", "c3"];
+    let columns = t.find_qualified_columns(&column_names)?;
+
+    // Expected results for each column
+    let binding = TableReference::bare("aggregate_test_100");
+    let expected = [
+        (Some(&binding), "c1"),
+        (Some(&binding), "c2"),
+        (Some(&binding), "c3"),
+    ];
+
+    // Verify we got the expected number of results
+    assert_eq!(
+        columns.len(),
+        expected.len(),
+        "Expected {} columns, got {}",
+        expected.len(),
+        columns.len()
+    );
+
+    // Iterate over the results and check each one individually
+    for (i, (actual, expected)) in columns.iter().zip(expected.iter()).enumerate() {
+        let (actual_table_ref, actual_field_ref) = actual;
+        let (expected_table_ref, expected_field_name) = expected;
+
+        // Check table reference
+        assert_eq!(
+            actual_table_ref, expected_table_ref,
+            "Column {i}: expected table reference {expected_table_ref:?}, got {actual_table_ref:?}"
+        );
+
+        // Check field name
+        assert_eq!(
+            actual_field_ref.name(),
+            *expected_field_name,
+            "Column {i}: expected field name '{expected_field_name}', got '{actual_field_ref}'"
+        );
+    }
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn drop_with_quotes() -> Result<()> {
     // define data with a column name that has a "." in it:
@@ -447,14 +670,14 @@ async fn drop_with_quotes() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r#"
     +------+
     | f"c2 |
     +------+
     | 11   |
     | 2    |
     +------+
-    "###
+    "#
     );
 
     Ok(())
@@ -473,20 +696,68 @@ async fn drop_with_periods() -> Result<()> {
     let ctx = SessionContext::new();
     ctx.register_batch("t", batch)?;
 
-    let df = ctx.table("t").await?.drop_columns(&["f.c1"])?;
+    let df = ctx.table("t").await?.drop_columns(&["\"f.c1\""])?;
 
     let df_results = df.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +------+
     | f.c2 |
     +------+
     | 11   |
     | 2    |
     +------+
-    "###
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn drop_columns_duplicated_names_from_different_qualifiers() -> Result<()> {
+    let t1 = test_table_with_name("t1")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(0, Some(3))?;
+    let t2 = test_table_with_name("t2")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(3, Some(3))?;
+    let t3 = test_table_with_name("t3")
+        .await?
+        .select_columns(&["c1"])?
+        .limit(6, Some(3))?;
+
+    let join_res = t1
+        .join(t2, JoinType::LeftMark, &["c1"], &["c1"], None)?
+        .join(t3, JoinType::LeftMark, &["c1"], &["c1"], None)?;
+    assert_snapshot!(
+        batches_to_sort_string(&join_res.clone().collect().await.unwrap()),
+        @r"
+    +----+-------+-------+
+    | c1 | mark  | mark  |
+    +----+-------+-------+
+    | b  | true  | false |
+    | c  | false | false |
+    | d  | false | true  |
+    +----+-------+-------+
+    "
+    );
+
+    let drop_res = join_res.drop_columns(&["mark"])?;
+    assert_snapshot!(
+        batches_to_sort_string(&drop_res.clone().collect().await.unwrap()),
+        @r"
+    +----+
+    | c1 |
+    +----+
+    | b  |
+    | c  |
+    | d  |
+    +----+
+    "
     );
 
     Ok(())
@@ -495,32 +766,35 @@ async fn drop_with_periods() -> Result<()> {
 #[tokio::test]
 async fn aggregate() -> Result<()> {
     // build plan using DataFrame API
-    let df = test_table().await?;
+    // union so some of the distincts have a clearly distinct result
+    let df = test_table().await?.union(test_table().await?)?;
     let group_expr = vec![col("c1")];
     let aggr_expr = vec![
-        min(col("c12")),
-        max(col("c12")),
-        avg(col("c12")),
-        sum(col("c12")),
-        count(col("c12")),
-        count_distinct(col("c12")),
+        min(col("c4")).alias("min(c4)"),
+        max(col("c4")).alias("max(c4)"),
+        avg(col("c4")).alias("avg(c4)"),
+        avg_distinct(col("c4")).alias("avg_distinct(c4)"),
+        sum(col("c4")).alias("sum(c4)"),
+        sum_distinct(col("c4")).alias("sum_distinct(c4)"),
+        count(col("c4")).alias("count(c4)"),
+        count_distinct(col("c4")).alias("count_distinct(c4)"),
     ];
 
     let df: Vec<RecordBatch> = df.aggregate(group_expr, aggr_expr)?.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df),
-        @r###"
-    +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+
-    | c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) |
-    +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+
-    | a  | 0.02182578039211991         | 0.9800193410444061          | 0.48754517466109415         | 10.238448667882977          | 21                            | 21                                     |
-    | b  | 0.04893135681998029         | 0.9185813970744787          | 0.41040709263815384         | 7.797734760124923           | 19                            | 19                                     |
-    | c  | 0.0494924465469434          | 0.991517828651004           | 0.6600456536439784          | 13.860958726523545          | 21                            | 21                                     |
-    | d  | 0.061029375346466685        | 0.9748360509016578          | 0.48855379387549824         | 8.793968289758968           | 18                            | 18                                     |
-    | e  | 0.01479305307777301         | 0.9965400387585364          | 0.48600669271341534         | 10.206140546981722          | 21                            | 21                                     |
-    +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+
-    "###
+        @r"
+    +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+
+    | c1 | min(c4) | max(c4) | avg(c4)             | avg_distinct(c4)    | sum(c4) | sum_distinct(c4) | count(c4) | count_distinct(c4) |
+    +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+
+    | a  | -28462  | 32064   | 306.04761904761904  | 306.04761904761904  | 12854   | 6427             | 42        | 21                 |
+    | b  | -28070  | 25286   | 7732.315789473684   | 7732.315789473684   | 293828  | 146914           | 38        | 19                 |
+    | c  | -30508  | 29106   | -1320.5238095238096 | -1320.5238095238096 | -55462  | -27731           | 42        | 21                 |
+    | d  | -24558  | 31106   | 10890.111111111111  | 10890.111111111111  | 392044  | 196022           | 36        | 18                 |
+    | e  | -31500  | 32514   | -4268.333333333333  | -4268.333333333333  | -179270 | -89635           | 42        | 21                 |
+    +----+---------+---------+---------------------+---------------------+---------+------------------+-----------+--------------------+
+    "
     );
 
     Ok(())
@@ -535,7 +809,9 @@ async fn aggregate_assert_no_empty_batches() -> Result<()> {
         min(col("c12")),
         max(col("c12")),
         avg(col("c12")),
+        avg_distinct(col("c12")),
         sum(col("c12")),
+        sum_distinct(col("c12")),
         count(col("c12")),
         count_distinct(col("c12")),
         median(col("c12")),
@@ -570,23 +846,23 @@ async fn test_aggregate_with_pk() -> Result<()> {
 
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
+        @r"
     AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]
       DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+    "
     );
 
     let df_results = df.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+------+
     | id | name |
     +----+------+
     | 1  | a    |
     +----+------+
-    "###
+    "
     );
 
     Ok(())
@@ -611,12 +887,11 @@ async fn test_aggregate_with_pk2() -> Result<()> {
     let df = df.filter(predicate)?;
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=Sorted
       FilterExec: id@0 = 1 AND name@1 = a
-        AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]
-          DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
     );
 
     // Since id and name are functionally dependant, we can use name among expression
@@ -625,13 +900,13 @@ async fn test_aggregate_with_pk2() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+------+
     | id | name |
     +----+------+
     | 1  | a    |
     +----+------+
-    "###
+    "
     );
 
     Ok(())
@@ -660,12 +935,11 @@ async fn test_aggregate_with_pk3() -> Result<()> {
     let df = df.select(vec![col("id"), col("name")])?;
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=PartiallySorted([0])
       FilterExec: id@0 = 1
-        AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]
-          DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
     );
 
     // Since id and name are functionally dependant, we can use name among expression
@@ -674,13 +948,13 @@ async fn test_aggregate_with_pk3() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+------+
     | id | name |
     +----+------+
     | 1  | a    |
     +----+------+
-    "###
+    "
     );
 
     Ok(())
@@ -711,25 +985,24 @@ async fn test_aggregate_with_pk4() -> Result<()> {
     // columns are not used.
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
       FilterExec: id@0 = 1
-        AggregateExec: mode=Single, gby=[id@0 as id], aggr=[]
-          DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
     );
 
     let df_results = df.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+
     | id |
     +----+
     | 1  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -751,7 +1024,7 @@ async fn test_aggregate_alias() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+
     | c2 |
     +----+
@@ -761,7 +1034,7 @@ async fn test_aggregate_alias() -> Result<()> {
     | 5  |
     | 6  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -798,7 +1071,7 @@ async fn test_aggregate_with_union() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+------------+
     | c1 | sum_result |
     +----+------------+
@@ -808,7 +1081,7 @@ async fn test_aggregate_with_union() -> Result<()> {
     | d  | 126        |
     | e  | 121        |
     +----+------------+
-    "###
+    "
     );
     Ok(())
 }
@@ -834,7 +1107,7 @@ async fn test_aggregate_subexpr() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----------------+------+
     | c2 + Int32(10) | sum  |
     +----------------+------+
@@ -844,7 +1117,7 @@ async fn test_aggregate_subexpr() -> Result<()> {
     | 15             | 95   |
     | 16             | -146 |
     +----------------+------+
-    "###
+    "
     );
 
     Ok(())
@@ -867,7 +1140,7 @@ async fn test_aggregate_name_collision() -> Result<()> {
         // The select expr has the same display_name as the group_expr,
         // but since they are different expressions, it should fail.
         .expect_err("Expected error");
-    assert_snapshot!(df.strip_backtrace(), @r###"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."###);
+    assert_snapshot!(df.strip_backtrace(), @r#"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."#);
 
     Ok(())
 }
@@ -907,7 +1180,7 @@ async fn window_using_aggregates() -> Result<()> {
             vec![col("c3")],
         );
 
-        Expr::WindowFunction(w)
+        Expr::from(w)
             .null_treatment(NullTreatment::IgnoreNulls)
             .order_by(vec![col("c2").sort(true, true), col("c3").sort(true, true)])
             .window_frame(WindowFrame::new_bounds(
@@ -926,33 +1199,110 @@ async fn window_using_aggregates() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df),
-        @r###"
+        @r"
     +-------------+----------+-----------------+---------------+--------+-----+------+----+------+
     | first_value | last_val | approx_distinct | approx_median | median | max | min  | c2 | c3   |
     +-------------+----------+-----------------+---------------+--------+-----+------+----+------+
     |             |          |                 |               |        |     |      | 1  | -85  |
-    | -85         | -101     | 14              | -12           | -101   | 83  | -101 | 4  | -54  |
-    | -85         | -101     | 17              | -25           | -101   | 83  | -101 | 5  | -31  |
-    | -85         | -12      | 10              | -32           | -12    | 83  | -85  | 3  | 13   |
-    | -85         | -25      | 3               | -56           | -25    | -25 | -85  | 1  | -5   |
-    | -85         | -31      | 18              | -29           | -31    | 83  | -101 | 5  | 36   |
-    | -85         | -38      | 16              | -25           | -38    | 83  | -101 | 4  | 65   |
+    | -85         | -101     | 14              | -12           | -12    | 83  | -101 | 4  | -54  |
+    | -85         | -101     | 17              | -25           | -25    | 83  | -101 | 5  | -31  |
+    | -85         | -12      | 10              | -32           | -34    | 83  | -85  | 3  | 13   |
+    | -85         | -25      | 3               | -56           | -56    | -25 | -85  | 1  | -5   |
+    | -85         | -31      | 18              | -29           | -28    | 83  | -101 | 5  | 36   |
+    | -85         | -38      | 16              | -25           | -25    | 83  | -101 | 4  | 65   |
     | -85         | -43      | 7               | -43           | -43    | 83  | -85  | 2  | 45   |
-    | -85         | -48      | 6               | -35           | -48    | 83  | -85  | 2  | -43  |
-    | -85         | -5       | 4               | -37           | -5     | -5  | -85  | 1  | 83   |
-    | -85         | -54      | 15              | -17           | -54    | 83  | -101 | 4  | -38  |
-    | -85         | -56      | 2               | -70           | -56    | -56 | -85  | 1  | -25  |
-    | -85         | -72      | 9               | -43           | -72    | 83  | -85  | 3  | -12  |
+    | -85         | -48      | 6               | -35           | -36    | 83  | -85  | 2  | -43  |
+    | -85         | -5       | 4               | -37           | -40    | -5  | -85  | 1  | 83   |
+    | -85         | -54      | 15              | -17           | -18    | 83  | -101 | 4  | -38  |
+    | -85         | -56      | 2               | -70           | -70    | -56 | -85  | 1  | -25  |
+    | -85         | -72      | 9               | -43           | -43    | 83  | -85  | 3  | -12  |
     | -85         | -85      | 1               | -85           | -85    | -85 | -85  | 1  | -56  |
-    | -85         | 13       | 11              | -17           | 13     | 83  | -85  | 3  | 14   |
-    | -85         | 13       | 11              | -25           | 13     | 83  | -85  | 3  | 13   |
-    | -85         | 14       | 12              | -12           | 14     | 83  | -85  | 3  | 17   |
-    | -85         | 17       | 13              | -11           | 17     | 83  | -85  | 4  | -101 |
-    | -85         | 45       | 8               | -34           | 45     | 83  | -85  | 3  | -72  |
-    | -85         | 65       | 17              | -17           | 65     | 83  | -101 | 5  | -101 |
-    | -85         | 83       | 5               | -25           | 83     | 83  | -85  | 2  | -48  |
+    | -85         | 13       | 11              | -17           | -18    | 83  | -85  | 3  | 14   |
+    | -85         | 13       | 11              | -25           | -25    | 83  | -85  | 3  | 13   |
+    | -85         | 14       | 12              | -12           | -12    | 83  | -85  | 3  | 17   |
+    | -85         | 17       | 13              | -11           | -8     | 83  | -85  | 4  | -101 |
+    | -85         | 45       | 8               | -34           | -34    | 83  | -85  | 3  | -72  |
+    | -85         | 65       | 17              | -17           | -18    | 83  | -101 | 5  | -101 |
+    | -85         | 83       | 5               | -25           | -25    | 83  | -85  | 2  | -48  |
     +-------------+----------+-----------------+---------------+--------+-----+------+----+------+
-    "###
+    "
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn window_aggregates_with_filter() -> Result<()> {
+    // Define a small in-memory table to make expected values clear
+    let ts: Int32Array = [1, 2, 3, 4, 5].into_iter().collect();
+    let val: Int32Array = [-3, -2, 1, 4, -1].into_iter().collect();
+    let batch = RecordBatch::try_from_iter(vec![
+        ("ts", Arc::new(ts) as _),
+        ("val", Arc::new(val) as _),
+    ])?;
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("t", batch)?;
+
+    let df = ctx.table("t").await?;
+
+    // Build filtered window aggregates over ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    let mut exprs = vec![
+        (datafusion_functions_aggregate::sum::sum_udaf(), "sum_pos"),
+        (
+            datafusion_functions_aggregate::average::avg_udaf(),
+            "avg_pos",
+        ),
+        (
+            datafusion_functions_aggregate::min_max::min_udaf(),
+            "min_pos",
+        ),
+        (
+            datafusion_functions_aggregate::min_max::max_udaf(),
+            "max_pos",
+        ),
+        (
+            datafusion_functions_aggregate::count::count_udaf(),
+            "cnt_pos",
+        ),
+    ]
+    .into_iter()
+    .map(|(func, alias)| {
+        let w = WindowFunction::new(
+            WindowFunctionDefinition::AggregateUDF(func),
+            vec![col("val")],
+        );
+
+        Expr::from(w)
+            .order_by(vec![col("ts").sort(true, true)])
+            .window_frame(WindowFrame::new_bounds(
+                WindowFrameUnits::Rows,
+                WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+                WindowFrameBound::CurrentRow,
+            ))
+            .filter(col("val").gt(lit(0)))
+            .build()
+            .unwrap()
+            .alias(alias)
+    })
+    .collect::<Vec<_>>();
+    exprs.extend_from_slice(&[col("ts"), col("val")]);
+
+    let results = df.select(exprs)?.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +---------+---------+---------+---------+---------+----+-----+
+    | sum_pos | avg_pos | min_pos | max_pos | cnt_pos | ts | val |
+    +---------+---------+---------+---------+---------+----+-----+
+    |         |         |         |         | 0       | 1  | -3  |
+    |         |         |         |         | 0       | 2  | -2  |
+    | 1       | 1.0     | 1       | 1       | 1       | 3  | 1   |
+    | 5       | 2.5     | 1       | 4       | 2       | 4  | 4   |
+    | 5       | 2.5     | 1       | 4       | 2       | 5  | -1  |
+    +---------+---------+---------+---------+---------+----+-----+
+    "
     );
 
     Ok(())
@@ -1008,7 +1358,7 @@ async fn test_distinct_sort_by() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+
     | c1 |
     +----+
@@ -1018,7 +1368,7 @@ async fn test_distinct_sort_by() -> Result<()> {
     | d  |
     | e  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -1056,7 +1406,7 @@ async fn test_distinct_on() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+
     | c1 |
     +----+
@@ -1066,7 +1416,7 @@ async fn test_distinct_on() -> Result<()> {
     | d  |
     | e  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -1091,7 +1441,7 @@ async fn test_distinct_on_sort_by() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+
     | c1 |
     +----+
@@ -1101,7 +1451,7 @@ async fn test_distinct_on_sort_by() -> Result<()> {
     | d  |
     | e  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -1165,13 +1515,13 @@ async fn join_coercion_unnamed() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----+------+
     | id | name |
     +----+------+
     | 10 | d    |
     +----+------+
-    "###
+    "
     );
     Ok(())
 }
@@ -1190,13 +1540,13 @@ async fn join_on() -> Result<()> {
         [col("a.c1").not_eq(col("b.c1")), col("a.c2").eq(col("b.c2"))],
     )?;
 
-    assert_snapshot!(join.logical_plan(), @r###"
+    assert_snapshot!(join.logical_plan(), @r"
     Inner Join:  Filter: a.c1 != b.c1 AND a.c2 = b.c2
       Projection: a.c1, a.c2
         TableScan: a
       Projection: b.c1, b.c2
         TableScan: b
-    "###);
+    ");
 
     Ok(())
 }
@@ -1210,16 +1560,20 @@ async fn join_on_filter_datatype() -> Result<()> {
     let join = left.clone().join_on(
         right.clone(),
         JoinType::Inner,
-        Some(Expr::Literal(ScalarValue::Null)),
+        Some(Expr::Literal(ScalarValue::Null, None)),
     )?;
-    assert_snapshot!(join.into_optimized_plan().unwrap(), @"EmptyRelation");
+    assert_snapshot!(join.into_optimized_plan().unwrap(), @"EmptyRelation: rows=0");
 
     // JOIN ON expression must be boolean type
     let join = left.join_on(right, JoinType::Inner, Some(lit("TRUE")))?;
     let err = join.into_optimized_plan().unwrap_err();
     assert_snapshot!(
         err.strip_backtrace(),
-        @"type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8"
+        @r"
+    type_coercion
+    caused by
+    Error during planning: Join condition must be boolean type, but got Utf8
+    "
     );
     Ok(())
 }
@@ -1360,6 +1714,36 @@ async fn except() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn except_distinct() -> Result<()> {
+    let df = test_table().await?.select_columns(&["c1", "c3"])?;
+    let d2 = df.clone();
+    let plan = df.except_distinct(d2)?;
+    let result = plan.logical_plan().clone();
+    let expected = create_plan(
+        "SELECT c1, c3 FROM aggregate_test_100
+            EXCEPT DISTINCT SELECT c1, c3 FROM aggregate_test_100",
+    )
+    .await?;
+    assert_same_plan(&result, &expected);
+    Ok(())
+}
+
+#[tokio::test]
+async fn intersect_distinct() -> Result<()> {
+    let df = test_table().await?.select_columns(&["c1", "c3"])?;
+    let d2 = df.clone();
+    let plan = df.intersect_distinct(d2)?;
+    let result = plan.logical_plan().clone();
+    let expected = create_plan(
+        "SELECT c1, c3 FROM aggregate_test_100
+            INTERSECT DISTINCT SELECT c1, c3 FROM aggregate_test_100",
+    )
+    .await?;
+    assert_same_plan(&result, &expected);
+    Ok(())
+}
+
 #[tokio::test]
 async fn register_table() -> Result<()> {
     let df = test_table().await?.select_columns(&["c1", "c12"])?;
@@ -1367,7 +1751,9 @@ async fn register_table() -> Result<()> {
     let df_impl = DataFrame::new(ctx.state(), df.logical_plan().clone());
 
     // register a dataframe as a table
-    ctx.register_table("test_table", df_impl.clone().into_view())?;
+    let table_provider = df_impl.clone().into_view();
+    assert_eq!(table_provider.table_type(), TableType::View);
+    ctx.register_table("test_table", table_provider)?;
 
     // pull the table out
     let table = ctx.table("test_table").await?;
@@ -1384,7 +1770,7 @@ async fn register_table() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+-----------------------------+
     | c1 | sum(aggregate_test_100.c12) |
     +----+-----------------------------+
@@ -1394,13 +1780,13 @@ async fn register_table() -> Result<()> {
     | d  | 8.793968289758968           |
     | e  | 10.206140546981722          |
     +----+-----------------------------+
-    "###
+    "
     );
 
     // the results are the same as the results from the view, modulo the leaf table name
     assert_snapshot!(
         batches_to_sort_string(table_results),
-        @r###"
+        @r"
     +----+---------------------+
     | c1 | sum(test_table.c12) |
     +----+---------------------+
@@ -1410,11 +1796,28 @@ async fn register_table() -> Result<()> {
     | d  | 8.793968289758968   |
     | e  | 10.206140546981722  |
     +----+---------------------+
-    "###
+    "
     );
     Ok(())
 }
 
+#[tokio::test]
+async fn register_temporary_table() -> Result<()> {
+    let df = test_table().await?.select_columns(&["c1", "c12"])?;
+    let ctx = SessionContext::new();
+    let df_impl = DataFrame::new(ctx.state(), df.logical_plan().clone());
+
+    let df_table_provider = df_impl.clone().into_temporary_view();
+
+    // check that we set the correct table_type
+    assert_eq!(df_table_provider.table_type(), TableType::Temporary);
+
+    // check that we can register a dataframe as a temporary table
+    ctx.register_table("test_table", df_table_provider)?;
+
+    Ok(())
+}
+
 /// Compare the formatted string representation of two plans for equality
 fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) {
     assert_eq!(format!("{plan1:?}"), format!("{plan2:?}"));
@@ -1442,7 +1845,7 @@ async fn with_column() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+-----+-----+
     | c1 | c2 | c3  | sum |
     +----+----+-----+-----+
@@ -1453,7 +1856,7 @@ async fn with_column() -> Result<()> {
     | a  | 3  | 14  | 17  |
     | a  | 3  | 17  | 20  |
     +----+----+-----+-----+
-    "###
+    "
     );
 
     // check that col with the same name overwritten
@@ -1465,7 +1868,7 @@ async fn with_column() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results_overwrite),
-        @r###"
+        @r"
     +-----+----+-----+-----+
     | c1  | c2 | c3  | sum |
     +-----+----+-----+-----+
@@ -1476,7 +1879,7 @@ async fn with_column() -> Result<()> {
     | 17  | 3  | 14  | 17  |
     | 20  | 3  | 17  | 20  |
     +-----+----+-----+-----+
-    "###
+    "
     );
 
     // check that col with the same name overwritten using same name as reference
@@ -1488,7 +1891,7 @@ async fn with_column() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results_overwrite_self),
-        @r###"
+        @r"
     +----+----+-----+-----+
     | c1 | c2 | c3  | sum |
     +----+----+-----+-----+
@@ -1499,7 +1902,7 @@ async fn with_column() -> Result<()> {
     | a  | 4  | 14  | 17  |
     | a  | 4  | 17  | 20  |
     +----+----+-----+-----+
-    "###
+    "
     );
 
     Ok(())
@@ -1527,14 +1930,14 @@ async fn test_window_function_with_column() -> Result<()> {
     let df_results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+-----+-----+---+
     | c1 | c2 | c3  | s   | r |
     +----+----+-----+-----+---+
     | c  | 2  | 1   | 3   | 1 |
     | d  | 5  | -40 | -35 | 2 |
     +----+----+-----+-----+---+
-    "###
+    "
     );
 
     Ok(())
@@ -1569,13 +1972,13 @@ async fn with_column_join_same_columns() -> Result<()> {
     let df_results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+
     | c1 | c1 |
     +----+----+
     | a  | a  |
     +----+----+
-    "###
+    "
     );
 
     let df_with_column = df.clone().with_column("new_column", lit(true))?;
@@ -1598,7 +2001,7 @@ async fn with_column_join_same_columns() -> Result<()> {
 
     assert_snapshot!(
         df_with_column.clone().into_optimized_plan().unwrap(),
-        @r###"
+        @r"
     Projection: t1.c1, t2.c1, Boolean(true) AS new_column
       Sort: t1.c1 ASC NULLS FIRST, fetch=1
         Inner Join: t1.c1 = t2.c1
@@ -1606,20 +2009,20 @@ async fn with_column_join_same_columns() -> Result<()> {
             TableScan: aggregate_test_100 projection=[c1]
           SubqueryAlias: t2
             TableScan: aggregate_test_100 projection=[c1]
-    "###
+    "
     );
 
     let df_results = df_with_column.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+------------+
     | c1 | c1 | new_column |
     +----+----+------------+
     | a  | a  | true       |
     +----+----+------------+
-    "###
+    "
     );
 
     Ok(())
@@ -1669,13 +2072,13 @@ async fn with_column_renamed() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(batches),
-        @r###"
+        @r"
     +-----+-----+-----+-------+
     | one | two | c3  | total |
     +-----+-----+-----+-------+
     | a   | 3   | -72 | -69   |
     +-----+-----+-----+-------+
-    "###
+    "
     );
 
     Ok(())
@@ -1740,13 +2143,13 @@ async fn with_column_renamed_join() -> Result<()> {
     let df_results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+-----+----+----+-----+
     | c1 | c2 | c3  | c1 | c2 | c3  |
     +----+----+-----+----+----+-----+
     | a  | 1  | -85 | a  | 1  | -85 |
     +----+----+-----+----+----+-----+
-    "###
+    "
     );
 
     let df_renamed = df.clone().with_column_renamed("t1.c1", "AAA")?;
@@ -1769,7 +2172,7 @@ async fn with_column_renamed_join() -> Result<()> {
 
     assert_snapshot!(
         df_renamed.clone().into_optimized_plan().unwrap(),
-        @r###"
+        @r"
     Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3
       Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1
         Inner Join: t1.c1 = t2.c1
@@ -1777,20 +2180,20 @@ async fn with_column_renamed_join() -> Result<()> {
             TableScan: aggregate_test_100 projection=[c1, c2, c3]
           SubqueryAlias: t2
             TableScan: aggregate_test_100 projection=[c1, c2, c3]
-    "###
+    "
     );
 
     let df_results = df_renamed.collect().await?;
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +-----+----+-----+----+----+-----+
     | AAA | c2 | c3  | c1 | c2 | c3  |
     +-----+----+-----+----+----+-----+
     | a   | 1  | -85 | a  | 1  | -85 |
     +-----+----+-----+----+----+-----+
-    "###
+    "
     );
 
     Ok(())
@@ -1825,13 +2228,13 @@ async fn with_column_renamed_case_sensitive() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(res),
-        @r###"
+        @r"
     +---------+
     | CoLuMn1 |
     +---------+
     | a       |
     +---------+
-    "###
+    "
     );
 
     let df_renamed = df_renamed
@@ -1841,13 +2244,13 @@ async fn with_column_renamed_case_sensitive() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_renamed),
-        @r###"
+        @r"
     +----+
     | c1 |
     +----+
     | a  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -1885,19 +2288,19 @@ async fn describe_lookup_via_quoted_identifier() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&describe_result.clone().collect().await?),
-        @r###"
-        +------------+--------------+
-        | describe   | CoLu.Mn["1"] |
-        +------------+--------------+
-        | count      | 1            |
-        | max        | a            |
-        | mean       | null         |
-        | median     | null         |
-        | min        | a            |
-        | null_count | 0            |
-        | std        | null         |
-        +------------+--------------+
-    "###
+        @r#"
+    +------------+--------------+
+    | describe   | CoLu.Mn["1"] |
+    +------------+--------------+
+    | count      | 1            |
+    | max        | a            |
+    | mean       | null         |
+    | median     | null         |
+    | min        | a            |
+    | null_count | 0            |
+    | std        | null         |
+    +------------+--------------+
+    "#
     );
 
     Ok(())
@@ -1915,13 +2318,13 @@ async fn cast_expr_test() -> Result<()> {
     df.clone().show().await?;
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +----+----+-----+
     | c2 | c3 | sum |
     +----+----+-----+
     | 2  | 1  | 3   |
     +----+----+-----+
-    "###
+    "
     );
 
     Ok(())
@@ -1937,12 +2340,14 @@ async fn row_writer_resize_test() -> Result<()> {
 
     let data = RecordBatch::try_new(
         schema,
-        vec![
-            Arc::new(StringArray::from(vec![
-                Some("2a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"),
-                Some("3a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800"),
-            ]))
-        ],
+        vec![Arc::new(StringArray::from(vec![
+            Some(
+                "2a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+            ),
+            Some(
+                "3a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800",
+            ),
+        ]))],
     )?;
 
     let ctx = SessionContext::new();
@@ -1981,14 +2386,14 @@ async fn with_column_name() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&df_results),
-        @r###"
+        @r"
     +------+-------+
     | f.c1 | f.c2  |
     +------+-------+
     | 1    | hello |
     | 10   | hello |
     +------+-------+
-    "###
+    "
     );
 
     Ok(())
@@ -2024,13 +2429,13 @@ async fn cache_test() -> Result<()> {
     let cached_df_results = cached_df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&cached_df_results),
-        @r###"
+        @r"
     +----+----+-----+
     | c2 | c3 | sum |
     +----+----+-----+
     | 2  | 1  | 3   |
     +----+----+-----+
-    "###
+    "
     );
 
     assert_eq!(&df_results, &cached_df_results);
@@ -2038,6 +2443,29 @@ async fn cache_test() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn cache_producer_test() -> Result<()> {
+    let df = test_table_with_cache_factory()
+        .await?
+        .select_columns(&["c2", "c3"])?
+        .limit(0, Some(1))?
+        .with_column("sum", cast(col("c2") + col("c3"), DataType::Int64))?;
+
+    let cached_df = df.clone().cache().await?;
+
+    assert_snapshot!(
+        cached_df.clone().into_optimized_plan().unwrap(),
+        @r"
+    CacheNode
+      Projection: aggregate_test_100.c2, aggregate_test_100.c3, CAST(CAST(aggregate_test_100.c2 AS Int64) + CAST(aggregate_test_100.c3 AS Int64) AS Int64) AS sum
+        Projection: aggregate_test_100.c2, aggregate_test_100.c3
+          Limit: skip=0, fetch=1
+            TableScan: aggregate_test_100, fetch=1
+    "
+    );
+    Ok(())
+}
+
 #[tokio::test]
 async fn partition_aware_union() -> Result<()> {
     let left = test_table().await?.select_columns(&["c1", "c2"])?;
@@ -2145,6 +2573,7 @@ async fn verify_join_output_partitioning() -> Result<()> {
         JoinType::LeftAnti,
         JoinType::RightAnti,
         JoinType::LeftMark,
+        JoinType::RightMark,
     ];
 
     let default_partition_count = SessionConfig::new().target_partitions();
@@ -2178,7 +2607,8 @@ async fn verify_join_output_partitioning() -> Result<()> {
             JoinType::Inner
             | JoinType::Right
             | JoinType::RightSemi
-            | JoinType::RightAnti => {
+            | JoinType::RightAnti
+            | JoinType::RightMark => {
                 let right_exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
                     Arc::new(Column::new_with_schema("c2_c1", &join_schema)?),
                     Arc::new(Column::new_with_schema("c2_c2", &join_schema)?),
@@ -2300,18 +2730,18 @@ async fn filtered_aggr_with_param_values() -> Result<()> {
     let df = ctx
         .sql("select count (c2) filter (where c3 > $1) from table1")
         .await?
-        .with_param_values(ParamValues::List(vec![ScalarValue::from(10u64)]));
+        .with_param_values(vec![ScalarValue::from(10u64)]);
 
     let df_results = df?.collect().await?;
     assert_snapshot!(
         batches_to_string(&df_results),
-        @r###"
+        @r"
     +------------------------------------------------+
     | count(table1.c2) FILTER (WHERE table1.c3 > $1) |
     +------------------------------------------------+
     | 54                                             |
     +------------------------------------------------+
-    "###
+    "
     );
 
     Ok(())
@@ -2359,7 +2789,7 @@ async fn write_parquet_with_order() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +---+---+
     | a | b |
     +---+---+
@@ -2369,7 +2799,7 @@ async fn write_parquet_with_order() -> Result<()> {
     | 5 | 3 |
     | 7 | 4 |
     +---+---+
-    "###
+    "
     );
 
     Ok(())
@@ -2417,7 +2847,7 @@ async fn write_csv_with_order() -> Result<()> {
 
     assert_snapshot!(
         batches_to_string(&results),
-        @r###"
+        @r"
     +---+---+
     | a | b |
     +---+---+
@@ -2427,7 +2857,7 @@ async fn write_csv_with_order() -> Result<()> {
     | 5 | 3 |
     | 7 | 4 |
     +---+---+
-    "###
+    "
     );
     Ok(())
 }
@@ -2465,7 +2895,7 @@ async fn write_json_with_order() -> Result<()> {
     ctx.register_json(
         "data",
         test_path.to_str().unwrap(),
-        NdJsonReadOptions::default().schema(&schema),
+        JsonReadOptions::default().schema(&schema),
     )
     .await?;
 
@@ -2474,7 +2904,7 @@ async fn write_json_with_order() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +---+---+
     | a | b |
     +---+---+
@@ -2484,7 +2914,7 @@ async fn write_json_with_order() -> Result<()> {
     | 5 | 3 |
     | 7 | 4 |
     +---+---+
-    "###
+    "
     );
     Ok(())
 }
@@ -2533,7 +2963,7 @@ async fn write_table_with_order() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----------+
     | tablecol1 |
     +-----------+
@@ -2543,7 +2973,7 @@ async fn write_table_with_order() -> Result<()> {
     | x         |
     | z         |
     +-----------+
-    "###
+    "
     );
     Ok(())
 }
@@ -2570,50 +3000,44 @@ async fn test_count_wildcard_on_sort() -> Result<()> {
 
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
-        @r###"
-    +---------------+------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                       |
-    +---------------+------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: t1.b, count(*)                                                                                 |
-    |               |   Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST                                             |
-    |               |     Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1))                                         |
-    |               |       Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]]                                                |
-    |               |         TableScan: t1 projection=[b]                                                                       |
-    | physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)]                                                    |
-    |               |   SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST]                                              |
-    |               |     SortExec: expr=[count(Int64(1))@2 ASC NULLS LAST], preserve_partitioning=[true]                        |
-    |               |       ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] |
-    |               |         AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))]                       |
-    |               |           CoalesceBatchesExec: target_batch_size=8192                                                      |
-    |               |             RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4                               |
-    |               |               RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                         |
-    |               |                 AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))]                        |
-    |               |                   DataSourceExec: partitions=1, partition_sizes=[1]                                        |
-    |               |                                                                                                            |
-    +---------------+------------------------------------------------------------------------------------------------------------+
-    "###
+        @r"
+    +---------------+------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                               |
+    +---------------+------------------------------------------------------------------------------------+
+    | logical_plan  | Sort: count(*) ASC NULLS LAST                                                      |
+    |               |   Projection: t1.b, count(Int64(1)) AS count(*)                                    |
+    |               |     Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]]                          |
+    |               |       TableScan: t1 projection=[b]                                                 |
+    | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST]                               |
+    |               |   SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true]         |
+    |               |     ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*)]                 |
+    |               |       AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |
+    |               |         RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1           |
+    |               |           AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))]      |
+    |               |             DataSourceExec: partitions=1, partition_sizes=[1]                      |
+    |               |                                                                                    |
+    +---------------+------------------------------------------------------------------------------------+
+    "
     );
 
     assert_snapshot!(
         pretty_format_batches(&df_results).unwrap(),
-        @r###"
-    +---------------+--------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                           |
-    +---------------+--------------------------------------------------------------------------------+
-    | logical_plan  | Sort: count(*) ASC NULLS LAST                                                  |
-    |               |   Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]]            |
-    |               |     TableScan: t1 projection=[b]                                               |
-    | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST]                           |
-    |               |   SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true]     |
-    |               |     AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)]      |
-    |               |       CoalesceBatchesExec: target_batch_size=8192                              |
-    |               |         RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4       |
-    |               |           RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
-    |               |             AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)]       |
-    |               |               DataSourceExec: partitions=1, partition_sizes=[1]                |
-    |               |                                                                                |
-    +---------------+--------------------------------------------------------------------------------+
-    "###
+        @r"
+    +---------------+----------------------------------------------------------------------------+
+    | plan_type     | plan                                                                       |
+    +---------------+----------------------------------------------------------------------------+
+    | logical_plan  | Sort: count(*) AS count(*) ASC NULLS LAST                                  |
+    |               |   Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]]        |
+    |               |     TableScan: t1 projection=[b]                                           |
+    | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST]                       |
+    |               |   SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] |
+    |               |     AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)]  |
+    |               |       RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1     |
+    |               |         AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)]       |
+    |               |           DataSourceExec: partitions=1, partition_sizes=[1]                |
+    |               |                                                                            |
+    +---------------+----------------------------------------------------------------------------+
+    "
     );
     Ok(())
 }
@@ -2631,23 +3055,22 @@ async fn test_count_wildcard_on_where_in() -> Result<()> {
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
         @r"
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                   |
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*)                                                        |
-    |               |   TableScan: t1 projection=[a, b]                                                                                      |
-    |               |   SubqueryAlias: __correlated_sq_1                                                                                     |
-    |               |     Projection: count(Int64(1)) AS count(*)                                                                            |
-    |               |       Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]                                                                |
-    |               |         TableScan: t2 projection=[]                                                                                    |
-    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                            |
-    |               |   HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |
-    |               |     ProjectionExec: expr=[4 as count(*)]                                                                               |
-    |               |       PlaceholderRowExec                                                                                               |
-    |               |     ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)]                               |
-    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                |
-    |               |                                                                                                                        |
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                 |
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*)                                                      |
+    |               |   TableScan: t1 projection=[a, b]                                                                                    |
+    |               |   SubqueryAlias: __correlated_sq_1                                                                                   |
+    |               |     Projection: count(Int64(1)) AS count(*)                                                                          |
+    |               |       Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]                                                              |
+    |               |         TableScan: t2 projection=[]                                                                                  |
+    | physical_plan | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |
+    |               |   ProjectionExec: expr=[4 as count(*)]                                                                               |
+    |               |     PlaceholderRowExec                                                                                               |
+    |               |   ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)]                               |
+    |               |     DataSourceExec: partitions=1, partition_sizes=[1]                                                                |
+    |               |                                                                                                                      |
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
     "
     );
 
@@ -2677,22 +3100,21 @@ async fn test_count_wildcard_on_where_in() -> Result<()> {
     assert_snapshot!(
         pretty_format_batches(&df_results).unwrap(),
         @r"
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                   |
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*)                                                        |
-    |               |   TableScan: t1 projection=[a, b]                                                                                      |
-    |               |   SubqueryAlias: __correlated_sq_1                                                                                     |
-    |               |     Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]                                                      |
-    |               |       TableScan: t2 projection=[]                                                                                      |
-    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                            |
-    |               |   HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |
-    |               |     ProjectionExec: expr=[4 as count(*)]                                                                               |
-    |               |       PlaceholderRowExec                                                                                               |
-    |               |     ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)]                               |
-    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                |
-    |               |                                                                                                                        |
-    +---------------+------------------------------------------------------------------------------------------------------------------------+
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                 |
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*)                                                      |
+    |               |   TableScan: t1 projection=[a, b]                                                                                    |
+    |               |   SubqueryAlias: __correlated_sq_1                                                                                   |
+    |               |     Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]                                                    |
+    |               |       TableScan: t2 projection=[]                                                                                    |
+    | physical_plan | HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |
+    |               |   ProjectionExec: expr=[4 as count(*)]                                                                               |
+    |               |     PlaceholderRowExec                                                                                               |
+    |               |   ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)]                               |
+    |               |     DataSourceExec: partitions=1, partition_sizes=[1]                                                                |
+    |               |                                                                                                                      |
+    +---------------+----------------------------------------------------------------------------------------------------------------------+
     "
     );
 
@@ -2711,23 +3133,20 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> {
 
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
-        @r###"
-    +---------------+---------------------------------------------------------+
-    | plan_type     | plan                                                    |
-    +---------------+---------------------------------------------------------+
-    | logical_plan  | LeftSemi Join:                                          |
-    |               |   TableScan: t1 projection=[a, b]                       |
-    |               |   SubqueryAlias: __correlated_sq_1                      |
-    |               |     Projection:                                         |
-    |               |       Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |
-    |               |         TableScan: t2 projection=[]                     |
-    | physical_plan | NestedLoopJoinExec: join_type=RightSemi                 |
-    |               |   ProjectionExec: expr=[]                               |
-    |               |     PlaceholderRowExec                                  |
-    |               |   DataSourceExec: partitions=1, partition_sizes=[1]     |
-    |               |                                                         |
-    +---------------+---------------------------------------------------------+
-    "###
+        @r"
+    +---------------+-----------------------------------------------------+
+    | plan_type     | plan                                                |
+    +---------------+-----------------------------------------------------+
+    | logical_plan  | LeftSemi Join:                                      |
+    |               |   TableScan: t1 projection=[a, b]                   |
+    |               |   SubqueryAlias: __correlated_sq_1                  |
+    |               |     EmptyRelation: rows=1                           |
+    | physical_plan | NestedLoopJoinExec: join_type=RightSemi             |
+    |               |   PlaceholderRowExec                                |
+    |               |   DataSourceExec: partitions=1, partition_sizes=[1] |
+    |               |                                                     |
+    +---------------+-----------------------------------------------------+
+    "
     );
 
     let df_results = ctx
@@ -2750,92 +3169,194 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> {
 
     assert_snapshot!(
         pretty_format_batches(&df_results).unwrap(),
-        @r###"
-    +---------------+---------------------------------------------------------------------+
-    | plan_type     | plan                                                                |
-    +---------------+---------------------------------------------------------------------+
-    | logical_plan  | LeftSemi Join:                                                      |
-    |               |   TableScan: t1 projection=[a, b]                                   |
-    |               |   SubqueryAlias: __correlated_sq_1                                  |
-    |               |     Projection:                                                     |
-    |               |       Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |
-    |               |         TableScan: t2 projection=[]                                 |
-    | physical_plan | NestedLoopJoinExec: join_type=RightSemi                             |
-    |               |   ProjectionExec: expr=[]                                           |
-    |               |     PlaceholderRowExec                                              |
-    |               |   DataSourceExec: partitions=1, partition_sizes=[1]                 |
-    |               |                                                                     |
-    +---------------+---------------------------------------------------------------------+
-    "###
+        @r"
+    +---------------+-----------------------------------------------------+
+    | plan_type     | plan                                                |
+    +---------------+-----------------------------------------------------+
+    | logical_plan  | LeftSemi Join:                                      |
+    |               |   TableScan: t1 projection=[a, b]                   |
+    |               |   SubqueryAlias: __correlated_sq_1                  |
+    |               |     EmptyRelation: rows=1                           |
+    | physical_plan | NestedLoopJoinExec: join_type=RightSemi             |
+    |               |   PlaceholderRowExec                                |
+    |               |   DataSourceExec: partitions=1, partition_sizes=[1] |
+    |               |                                                     |
+    +---------------+-----------------------------------------------------+
+    "
     );
 
     Ok(())
 }
 
-#[tokio::test]
-async fn test_count_wildcard_on_window() -> Result<()> {
-    let ctx = create_join_context()?;
+#[tokio::test]
+async fn test_count_wildcard_on_window() -> Result<()> {
+    let ctx = create_join_context()?;
+
+    let sql_results = ctx
+        .sql("select count(*) OVER(ORDER BY a DESC RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING) from t1")
+        .await?
+        .explain(false, false)?
+        .collect()
+        .await?;
+
+    assert_snapshot!(
+        pretty_format_batches(&sql_results).unwrap(),
+        @r#"
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                                                                                                                                                                                                     |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                            |
+    |               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                  |
+    |               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                         |
+    | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                               |
+    |               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] |
+    |               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                             |
+    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                  |
+    |               |                                                                                                                                                                                                                                                                                                          |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    "#
+    );
+
+    let df_results = ctx
+        .table("t1")
+        .await?
+        .select(vec![
+            count_all_window()
+                .order_by(vec![Sort::new(col("a"), false, true)])
+                .window_frame(WindowFrame::new_bounds(
+                    WindowFrameUnits::Range,
+                    WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))),
+                    WindowFrameBound::Following(ScalarValue::UInt32(Some(2))),
+                ))
+                .build()
+                .unwrap(),
+        ])?
+        .explain(false, false)?
+        .collect()
+        .await?;
+
+    assert_snapshot!(
+        pretty_format_batches(&df_results).unwrap(),
+        @r#"
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                                                                                                                                                                                                     |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                                                                                                                   |
+    |               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                  |
+    |               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                         |
+    | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                        |
+    |               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] |
+    |               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                             |
+    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                  |
+    |               |                                                                                                                                                                                                                                                                                                          |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    "#
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+// Test with `repartition_sorts` disabled, causing a full resort of the data
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false()
+-> Result<()> {
+    assert_snapshot!(
+        union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?,
+        @r"
+    AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
+      SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+        AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
+          UnionExec
+            DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+            SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+// Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true()
+-> Result<()> {
+    assert_snapshot!(
+        union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?,
+        @r"
+    AggregateExec: mode=Final, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
+      SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+        AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
+          UnionExec
+            DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+            SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[{testdata}/alltypes_tiny_pages.parquet]]}, projection=[id], file_type=parquet
+    ");
+
+    Ok(())
+}
+
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(
+    repartition_sorts: bool,
+) -> Result<String> {
+    let config = SessionConfig::default()
+        .with_target_partitions(1)
+        .with_repartition_sorts(repartition_sorts);
+    let ctx = SessionContext::new_with_config(config);
+
+    let testdata = parquet_test_data();
+
+    // Register "sorted" table, that is sorted
+    ctx.register_parquet(
+        "sorted",
+        &format!("{testdata}/alltypes_tiny_pages.parquet"),
+        ParquetReadOptions::default()
+            .file_sort_order(vec![vec![col("id").sort(true, false)]]),
+    )
+    .await?;
+
+    // Register "unsorted" table
+    ctx.register_parquet(
+        "unsorted",
+        &format!("{testdata}/alltypes_tiny_pages.parquet"),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    let source_sorted = ctx
+        .table("sorted")
+        .await
+        .unwrap()
+        .select(vec![col("id")])
+        .unwrap();
+
+    let source_unsorted = ctx
+        .table("unsorted")
+        .await
+        .unwrap()
+        .select(vec![col("id")])
+        .unwrap();
+
+    let source_unsorted_resorted =
+        source_unsorted.sort(vec![col("id").sort(true, false)])?;
+
+    let union = source_sorted.union(source_unsorted_resorted)?;
 
-    let sql_results = ctx
-        .sql("select count(*) OVER(ORDER BY a DESC RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING) from t1")
-        .await?
-        .explain(false, false)?
-        .collect()
-        .await?;
+    let agg = union.aggregate(vec![col("id")], vec![])?;
 
-    assert_snapshot!(
-        pretty_format_batches(&sql_results).unwrap(),
-        @r###"
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                                                                                                                                                                                             |
-    |               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                                                                                                                                                                                   |
-    |               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-    | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                                                                                                                                                                                                |
-    |               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |
-    |               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                                                                                                                                                                                              |
-    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                                                                                                                                                                                   |
-    |               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    "###
-    );
+    let df = agg;
 
-    let df_results = ctx
-        .table("t1")
-        .await?
-        .select(vec![count_all_window()
-            .order_by(vec![Sort::new(col("a"), false, true)])
-            .window_frame(WindowFrame::new_bounds(
-                WindowFrameUnits::Range,
-                WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))),
-                WindowFrameBound::Following(ScalarValue::UInt32(Some(2))),
-            ))
-            .build()
-            .unwrap()])?
-        .explain(false, false)?
-        .collect()
-        .await?;
+    // To be able to remove user specific paths from the plan, for stable assertions
+    let testdata_clean = Path::new(&testdata).canonicalize()?.display().to_string();
+    let testdata_clean = testdata_clean.strip_prefix("/").unwrap_or(&testdata_clean);
 
-    assert_snapshot!(
-        pretty_format_batches(&df_results).unwrap(),
-        @r###"
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                                                                                                                                                                                                                                                                                    |
-    |               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                                                                                                                                                                                   |
-    |               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-    | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                                                                                                                                                                                         |
-    |               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |
-    |               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                                                                                                                                                                                              |
-    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                                                                                                                                                                                   |
-    |               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    "###
-    );
+    // Use displayable() rather than explain().collect() to avoid table formatting issues. We need
+    // to replace machine-specific paths with variable lengths, which breaks table alignment and
+    // causes snapshot mismatches.
+    let physical_plan = df.create_physical_plan().await?;
+    let displayable_plan = displayable(physical_plan.as_ref())
+        .indent(true)
+        .to_string()
+        .replace(testdata_clean, "{testdata}");
 
-    Ok(())
+    Ok(displayable_plan)
 }
 
 #[tokio::test]
@@ -2852,7 +3373,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
 
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
-        @r###"
+        @r"
     +---------------+-----------------------------------------------------+
     | plan_type     | plan                                                |
     +---------------+-----------------------------------------------------+
@@ -2863,7 +3384,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
     |               |   PlaceholderRowExec                                |
     |               |                                                     |
     +---------------+-----------------------------------------------------+
-    "###
+    "
     );
 
     // add `.select(vec![count_wildcard()])?` to make sure we can analyze all node instead of just top node.
@@ -2878,7 +3399,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
 
     assert_snapshot!(
         pretty_format_batches(&df_results).unwrap(),
-        @r###"
+        @r"
     +---------------+---------------------------------------------------------------+
     | plan_type     | plan                                                          |
     +---------------+---------------------------------------------------------------+
@@ -2888,7 +3409,7 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
     |               |   PlaceholderRowExec                                          |
     |               |                                                               |
     +---------------+---------------------------------------------------------------+
-    "###
+    "
     );
 
     Ok(())
@@ -2908,32 +3429,31 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> {
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
         @r"
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                      |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: t1.a, t1.b                                                                                                    |
-    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |
-    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |
-    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |
-    |               |         TableScan: t1 projection=[a, b]                                                                                   |
-    |               |         SubqueryAlias: __scalar_sq_1                                                                                      |
-    |               |           Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true                                   |
-    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]]                                                         |
-    |               |               TableScan: t2 projection=[a]                                                                                |
-    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |
-    |               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |
-    |               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |
-    |               |       HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |
-    |               |         DataSourceExec: partitions=1, partition_sizes=[1]                                                                 |
-    |               |         ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true]                             |
-    |               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]                                    |
-    |               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |
-    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |
-    |               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |
-    |               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]                                     |
-    |               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |
-    |               |                                                                                                                           |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                       |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: t1.a, t1.b                                                                                                     |
+    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)           |
+    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                            |
+    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                    |
+    |               |         TableScan: t1 projection=[a, b]                                                                                    |
+    |               |         SubqueryAlias: __scalar_sq_1                                                                                       |
+    |               |           Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true                                    |
+    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]]                                                          |
+    |               |               TableScan: t2 projection=[a]                                                                                 |
+    | physical_plan | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                        |
+    |               |   RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                                     |
+    |               |     ProjectionExec: expr=[a@2 as a, b@3 as b, count(*)@0 as count(*), __always_true@1 as __always_true]                    |
+    |               |       HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@1, a@0)], projection=[count(*)@0, __always_true@2, a@3, b@4] |
+    |               |         CoalescePartitionsExec                                                                                             |
+    |               |           ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true]                            |
+    |               |             AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]                                   |
+    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                             |
+    |               |                 AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]                                        |
+    |               |                   DataSourceExec: partitions=1, partition_sizes=[1]                                                        |
+    |               |         DataSourceExec: partitions=1, partition_sizes=[1]                                                                  |
+    |               |                                                                                                                            |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
     "
     );
 
@@ -2965,32 +3485,31 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> {
     assert_snapshot!(
         pretty_format_batches(&df_results).unwrap(),
         @r"
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                                      |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: t1.a, t1.b                                                                                                    |
-    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |
-    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |
-    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |
-    |               |         TableScan: t1 projection=[a, b]                                                                                   |
-    |               |         SubqueryAlias: __scalar_sq_1                                                                                      |
-    |               |           Projection: count(*), t2.a, Boolean(true) AS __always_true                                                      |
-    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]]                                             |
-    |               |               TableScan: t2 projection=[a]                                                                                |
-    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |
-    |               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |
-    |               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |
-    |               |       HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |
-    |               |         DataSourceExec: partitions=1, partition_sizes=[1]                                                                 |
-    |               |         ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true]                                    |
-    |               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)]                                           |
-    |               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |
-    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |
-    |               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |
-    |               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)]                                            |
-    |               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |
-    |               |                                                                                                                           |
-    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                       |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: t1.a, t1.b                                                                                                     |
+    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)           |
+    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                            |
+    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                    |
+    |               |         TableScan: t1 projection=[a, b]                                                                                    |
+    |               |         SubqueryAlias: __scalar_sq_1                                                                                       |
+    |               |           Projection: count(*), t2.a, Boolean(true) AS __always_true                                                       |
+    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]]                                              |
+    |               |               TableScan: t2 projection=[a]                                                                                 |
+    | physical_plan | FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                        |
+    |               |   RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                                     |
+    |               |     ProjectionExec: expr=[a@2 as a, b@3 as b, count(*)@0 as count(*), __always_true@1 as __always_true]                    |
+    |               |       HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@1, a@0)], projection=[count(*)@0, __always_true@2, a@3, b@4] |
+    |               |         CoalescePartitionsExec                                                                                             |
+    |               |           ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true]                                   |
+    |               |             AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)]                                          |
+    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                             |
+    |               |                 AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)]                                               |
+    |               |                   DataSourceExec: partitions=1, partition_sizes=[1]                                                        |
+    |               |         DataSourceExec: partitions=1, partition_sizes=[1]                                                                  |
+    |               |                                                                                                                            |
+    +---------------+----------------------------------------------------------------------------------------------------------------------------+
     "
     );
 
@@ -3075,7 +3594,7 @@ async fn sort_on_unprojected_columns() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----+
     | a   |
     +-----+
@@ -3084,7 +3603,7 @@ async fn sort_on_unprojected_columns() -> Result<()> {
     | 10  |
     | 1   |
     +-----+
-    "###
+    "
     );
 
     Ok(())
@@ -3122,7 +3641,7 @@ async fn sort_on_distinct_columns() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----+
     | a   |
     +-----+
@@ -3130,7 +3649,7 @@ async fn sort_on_distinct_columns() -> Result<()> {
     | 10  |
     | 1   |
     +-----+
-    "###
+    "
     );
     Ok(())
 }
@@ -3261,14 +3780,14 @@ async fn filter_with_alias_overwrite() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +------+
     | a    |
     +------+
     | true |
     | true |
     +------+
-    "###
+    "
     );
 
     Ok(())
@@ -3297,7 +3816,7 @@ async fn select_with_alias_overwrite() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-------+
     | a     |
     +-------+
@@ -3306,7 +3825,7 @@ async fn select_with_alias_overwrite() -> Result<()> {
     | true  |
     | false |
     +-------+
-    "###
+    "
     );
 
     Ok(())
@@ -3332,7 +3851,7 @@ async fn test_grouping_sets() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----------+-----+---------------+
     | a         | b   | count(test.a) |
     +-----------+-----+---------------+
@@ -3348,7 +3867,7 @@ async fn test_grouping_sets() -> Result<()> {
     | 123AbcDef |     | 1             |
     | 123AbcDef | 100 | 1             |
     +-----------+-----+---------------+
-    "###
+    "
     );
 
     Ok(())
@@ -3375,7 +3894,7 @@ async fn test_grouping_sets_count() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +----+----+-----------------+
     | c1 | c2 | count(Int32(1)) |
     +----+----+-----------------+
@@ -3390,7 +3909,7 @@ async fn test_grouping_sets_count() -> Result<()> {
     | b  |    | 19              |
     | a  |    | 21              |
     +----+----+-----------------+
-    "###
+    "
     );
 
     Ok(())
@@ -3424,7 +3943,7 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +----+----+--------+---------------------+
     | c1 | c2 | sum_c3 | avg_c3              |
     +----+----+--------+---------------------+
@@ -3464,7 +3983,7 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> {
     | a  | 2  | -46    | -15.333333333333334 |
     | a  | 1  | -88    | -17.6               |
     +----+----+--------+---------------------+
-    "###
+    "
     );
 
     Ok(())
@@ -3501,25 +4020,25 @@ async fn join_with_alias_filter() -> Result<()> {
     let actual = formatted.trim();
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32]
       Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32]
         TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
         TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
-    "###
+    "
     );
 
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----+----+---+----+---+---+
     | a  | a  | b | c  | b | c |
     +----+----+---+----+---+---+
     | 1  | 3  | a | 10 | a | 1 |
     | 11 | 13 | c | 30 | c | 3 |
     +----+----+---+----+---+---+
-    "###
+    "
     );
 
     Ok(())
@@ -3546,27 +4065,27 @@ async fn right_semi_with_alias_filter() -> Result<()> {
     let actual = formatted.trim();
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]
       Projection: t1.a [a:UInt32]
         Filter: t1.c > Int32(1) [a:UInt32, c:Int32]
           TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]
       Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]
         TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
-    "###
+    "
     );
 
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +-----+---+---+
     | a   | b | c |
     +-----+---+---+
     | 10  | b | 2 |
     | 100 | d | 4 |
     +-----+---+---+
-    "###
+    "
     );
 
     Ok(())
@@ -3593,26 +4112,26 @@ async fn right_anti_filter_push_down() -> Result<()> {
     let actual = formatted.trim();
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]
       Projection: t1.a [a:UInt32]
         Filter: t1.c > Int32(1) [a:UInt32, c:Int32]
           TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]
       TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
-    "###
+    "
     );
 
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----+---+---+
     | a  | b | c |
     +----+---+---+
     | 13 | c | 3 |
     | 3  | a | 1 |
     +----+---+---+
-    "###
+    "
     );
 
     Ok(())
@@ -3625,37 +4144,37 @@ async fn unnest_columns() -> Result<()> {
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
-          +----------+---------------------------------+--------------------------+
-          | shape_id | points                          | tags                     |
-          +----------+---------------------------------+--------------------------+
-          | 1        | [{x: 5, y: -8}, {x: -3, y: -4}] | [tag1]                   |
-          | 2        | [{x: 6, y: 2}, {x: -2, y: -8}]  | [tag1]                   |
-          | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | [tag1, tag2, tag3, tag4] |
-          | 4        |                                 | [tag1, tag2, tag3]       |
-          +----------+---------------------------------+--------------------------+
-        "###);
+        @r"
+    +----------+---------------------------------+--------------------------+
+    | shape_id | points                          | tags                     |
+    +----------+---------------------------------+--------------------------+
+    | 1        | [{x: 5, y: -8}, {x: -3, y: -4}] | [tag1]                   |
+    | 2        | [{x: 6, y: 2}, {x: -2, y: -8}]  | [tag1]                   |
+    | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | [tag1, tag2, tag3, tag4] |
+    | 4        |                                 | [tag1, tag2, tag3]       |
+    +----------+---------------------------------+--------------------------+
+    ");
 
     // Unnest tags
     let df = table_with_nested_types(NUM_ROWS).await?;
     let results = df.unnest_columns(&["tags"])?.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
-          +----------+---------------------------------+------+
-          | shape_id | points                          | tags |
-          +----------+---------------------------------+------+
-          | 1        | [{x: 5, y: -8}, {x: -3, y: -4}] | tag1 |
-          | 2        | [{x: 6, y: 2}, {x: -2, y: -8}]  | tag1 |
-          | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag1 |
-          | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag2 |
-          | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag3 |
-          | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag4 |
-          | 4        |                                 | tag1 |
-          | 4        |                                 | tag2 |
-          | 4        |                                 | tag3 |
-          +----------+---------------------------------+------+
-        "###);
+        @r"
+    +----------+---------------------------------+------+
+    | shape_id | points                          | tags |
+    +----------+---------------------------------+------+
+    | 1        | [{x: 5, y: -8}, {x: -3, y: -4}] | tag1 |
+    | 2        | [{x: 6, y: 2}, {x: -2, y: -8}]  | tag1 |
+    | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag1 |
+    | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag2 |
+    | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag3 |
+    | 3        | [{x: -9, y: -7}, {x: -2, y: 5}] | tag4 |
+    | 4        |                                 | tag1 |
+    | 4        |                                 | tag2 |
+    | 4        |                                 | tag3 |
+    +----------+---------------------------------+------+
+    ");
 
     // Test aggregate results for tags.
     let df = table_with_nested_types(NUM_ROWS).await?;
@@ -3667,19 +4186,19 @@ async fn unnest_columns() -> Result<()> {
     let results = df.unnest_columns(&["points"])?.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
-          +----------+----------------+--------------------------+
-          | shape_id | points         | tags                     |
-          +----------+----------------+--------------------------+
-          | 1        | {x: -3, y: -4} | [tag1]                   |
-          | 1        | {x: 5, y: -8}  | [tag1]                   |
-          | 2        | {x: -2, y: -8} | [tag1]                   |
-          | 2        | {x: 6, y: 2}   | [tag1]                   |
-          | 3        | {x: -2, y: 5}  | [tag1, tag2, tag3, tag4] |
-          | 3        | {x: -9, y: -7} | [tag1, tag2, tag3, tag4] |
-          | 4        |                | [tag1, tag2, tag3]       |
-          +----------+----------------+--------------------------+
-        "###);
+        @r"
+    +----------+----------------+--------------------------+
+    | shape_id | points         | tags                     |
+    +----------+----------------+--------------------------+
+    | 1        | {x: -3, y: -4} | [tag1]                   |
+    | 1        | {x: 5, y: -8}  | [tag1]                   |
+    | 2        | {x: -2, y: -8} | [tag1]                   |
+    | 2        | {x: 6, y: 2}   | [tag1]                   |
+    | 3        | {x: -2, y: 5}  | [tag1, tag2, tag3, tag4] |
+    | 3        | {x: -9, y: -7} | [tag1, tag2, tag3, tag4] |
+    | 4        |                | [tag1, tag2, tag3]       |
+    +----------+----------------+--------------------------+
+    ");
 
     // Test aggregate results for points.
     let df = table_with_nested_types(NUM_ROWS).await?;
@@ -3695,27 +4214,27 @@ async fn unnest_columns() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
-          +----------+----------------+------+
-          | shape_id | points         | tags |
-          +----------+----------------+------+
-          | 1        | {x: -3, y: -4} | tag1 |
-          | 1        | {x: 5, y: -8}  | tag1 |
-          | 2        | {x: -2, y: -8} | tag1 |
-          | 2        | {x: 6, y: 2}   | tag1 |
-          | 3        | {x: -2, y: 5}  | tag1 |
-          | 3        | {x: -2, y: 5}  | tag2 |
-          | 3        | {x: -2, y: 5}  | tag3 |
-          | 3        | {x: -2, y: 5}  | tag4 |
-          | 3        | {x: -9, y: -7} | tag1 |
-          | 3        | {x: -9, y: -7} | tag2 |
-          | 3        | {x: -9, y: -7} | tag3 |
-          | 3        | {x: -9, y: -7} | tag4 |
-          | 4        |                | tag1 |
-          | 4        |                | tag2 |
-          | 4        |                | tag3 |
-          +----------+----------------+------+
-    "###);
+        @r"
+    +----------+----------------+------+
+    | shape_id | points         | tags |
+    +----------+----------------+------+
+    | 1        | {x: -3, y: -4} | tag1 |
+    | 1        | {x: 5, y: -8}  | tag1 |
+    | 2        | {x: -2, y: -8} | tag1 |
+    | 2        | {x: 6, y: 2}   | tag1 |
+    | 3        | {x: -2, y: 5}  | tag1 |
+    | 3        | {x: -2, y: 5}  | tag2 |
+    | 3        | {x: -2, y: 5}  | tag3 |
+    | 3        | {x: -2, y: 5}  | tag4 |
+    | 3        | {x: -9, y: -7} | tag1 |
+    | 3        | {x: -9, y: -7} | tag2 |
+    | 3        | {x: -9, y: -7} | tag3 |
+    | 3        | {x: -9, y: -7} | tag4 |
+    | 4        |                | tag1 |
+    | 4        |                | tag2 |
+    | 4        |                | tag3 |
+    +----------+----------------+------+
+    ");
 
     // Test aggregate results for points and tags.
     let df = table_with_nested_types(NUM_ROWS).await?;
@@ -3755,7 +4274,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> {
     let results = df.collect().await.unwrap();
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----------------+---------+
     | make_array_expr | column1 |
     +-----------------+---------+
@@ -3763,7 +4282,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> {
     | y               | y       |
     | z               | z       |
     +-----------------+---------+
-    "###
+    "
     );
 
     // make_array(dict_encoded_string,literal string)
@@ -3783,7 +4302,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> {
     let results = df.collect().await.unwrap();
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----------------+---------+
     | make_array_expr | column1 |
     +-----------------+---------+
@@ -3794,7 +4313,7 @@ async fn unnest_dict_encoded_columns() -> Result<()> {
     | z               | z       |
     | fixed_string    | z       |
     +-----------------+---------+
-    "###
+    "
     );
     Ok(())
 }
@@ -3805,7 +4324,7 @@ async fn unnest_column_nulls() -> Result<()> {
     let results = df.clone().collect().await?;
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +--------+----+
     | list   | id |
     +--------+----+
@@ -3814,7 +4333,7 @@ async fn unnest_column_nulls() -> Result<()> {
     | []     | C  |
     | [3]    | D  |
     +--------+----+
-    "###
+    "
     );
 
     // Unnest, preserving nulls (row with B is preserved)
@@ -3827,7 +4346,7 @@ async fn unnest_column_nulls() -> Result<()> {
         .await?;
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +------+----+
     | list | id |
     +------+----+
@@ -3836,7 +4355,7 @@ async fn unnest_column_nulls() -> Result<()> {
     |      | B  |
     | 3    | D  |
     +------+----+
-    "###
+    "
     );
 
     let options = UnnestOptions::new().with_preserve_nulls(false);
@@ -3846,7 +4365,7 @@ async fn unnest_column_nulls() -> Result<()> {
         .await?;
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +------+----+
     | list | id |
     +------+----+
@@ -3854,7 +4373,7 @@ async fn unnest_column_nulls() -> Result<()> {
     | 2    | A  |
     | 3    | D  |
     +------+----+
-    "###
+    "
     );
 
     Ok(())
@@ -3871,7 +4390,7 @@ async fn unnest_fixed_list() -> Result<()> {
     let results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+----------------+
     | shape_id | tags           |
     +----------+----------------+
@@ -3882,7 +4401,7 @@ async fn unnest_fixed_list() -> Result<()> {
     | 5        | [tag51, tag52] |
     | 6        | [tag61, tag62] |
     +----------+----------------+
-    "###
+    "
     );
 
     let options = UnnestOptions::new().with_preserve_nulls(true);
@@ -3893,7 +4412,7 @@ async fn unnest_fixed_list() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+-------+
     | shape_id | tags  |
     +----------+-------+
@@ -3908,7 +4427,7 @@ async fn unnest_fixed_list() -> Result<()> {
     | 6        | tag61 |
     | 6        | tag62 |
     +----------+-------+
-    "###
+    "
     );
 
     Ok(())
@@ -3925,7 +4444,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> {
     let results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+----------------+
     | shape_id | tags           |
     +----------+----------------+
@@ -3936,7 +4455,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> {
     | 5        | [tag51, tag52] |
     | 6        | [tag61, tag62] |
     +----------+----------------+
-    "###
+    "
     );
 
     let options = UnnestOptions::new().with_preserve_nulls(false);
@@ -3947,7 +4466,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+-------+
     | shape_id | tags  |
     +----------+-------+
@@ -3960,7 +4479,7 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> {
     | 6        | tag61 |
     | 6        | tag62 |
     +----------+-------+
-    "###
+    "
     );
 
     Ok(())
@@ -3996,7 +4515,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> {
     let results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+----------------+
     | shape_id | tags           |
     +----------+----------------+
@@ -4007,7 +4526,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> {
     | 5        | [tag51, tag52] |
     | 6        | [tag61, tag62] |
     +----------+----------------+
-    "###
+    "
     );
 
     let options = UnnestOptions::new().with_preserve_nulls(true);
@@ -4017,7 +4536,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+-------+
     | shape_id | tags  |
     +----------+-------+
@@ -4034,7 +4553,7 @@ async fn unnest_fixed_list_non_null() -> Result<()> {
     | 6        | tag61 |
     | 6        | tag62 |
     +----------+-------+
-    "###
+    "
     );
 
     Ok(())
@@ -4048,17 +4567,17 @@ async fn unnest_aggregate_columns() -> Result<()> {
     let results = df.select_columns(&["tags"])?.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
-        +--------------------------+
-        | tags                     |
-        +--------------------------+
-        | [tag1, tag2, tag3, tag4] |
-        | [tag1, tag2, tag3]       |
-        | [tag1, tag2]             |
-        | [tag1]                   |
-        | [tag1]                   |
-        +--------------------------+
-    "###
+        @r"
+    +--------------------------+
+    | tags                     |
+    +--------------------------+
+    | [tag1, tag2, tag3, tag4] |
+    | [tag1, tag2, tag3]       |
+    | [tag1, tag2]             |
+    | [tag1]                   |
+    | [tag1]                   |
+    +--------------------------+
+    "
     );
 
     let df = table_with_nested_types(NUM_ROWS).await?;
@@ -4069,13 +4588,13 @@ async fn unnest_aggregate_columns() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +-------------+
     | count(tags) |
     +-------------+
     | 11          |
     +-------------+
-    "###
+    "
     );
 
     Ok(())
@@ -4148,7 +4667,7 @@ async fn unnest_array_agg() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+--------+
     | shape_id | tag_id |
     +----------+--------+
@@ -4162,7 +4681,7 @@ async fn unnest_array_agg() -> Result<()> {
     | 3        | 32     |
     | 3        | 33     |
     +----------+--------+
-    "###
+    "
     );
 
     // Doing an `array_agg` by `shape_id` produces:
@@ -4176,7 +4695,7 @@ async fn unnest_array_agg() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+--------------+
     | shape_id | tag_id       |
     +----------+--------------+
@@ -4184,7 +4703,7 @@ async fn unnest_array_agg() -> Result<()> {
     | 2        | [21, 22, 23] |
     | 3        | [31, 32, 33] |
     +----------+--------------+
-    "###
+    "
     );
 
     // Unnesting again should produce the original batch.
@@ -4200,7 +4719,7 @@ async fn unnest_array_agg() -> Result<()> {
         .await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+--------+
     | shape_id | tag_id |
     +----------+--------+
@@ -4214,7 +4733,7 @@ async fn unnest_array_agg() -> Result<()> {
     | 3        | 32     |
     | 3        | 33     |
     +----------+--------+
-    "###
+    "
     );
 
     Ok(())
@@ -4244,7 +4763,7 @@ async fn unnest_with_redundant_columns() -> Result<()> {
     let results = df.clone().collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+--------+
     | shape_id | tag_id |
     +----------+--------+
@@ -4258,7 +4777,7 @@ async fn unnest_with_redundant_columns() -> Result<()> {
     | 3        | 32     |
     | 3        | 33     |
     +----------+--------+
-    "###
+    "
     );
 
     // Doing an `array_agg` by `shape_id` produces:
@@ -4277,18 +4796,18 @@ async fn unnest_with_redundant_columns() -> Result<()> {
     let actual = formatted.trim();
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     Projection: shapes.shape_id [shape_id:UInt32]
       Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N]
-        Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]
+        Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(UInt32);N]
           TableScan: shapes projection=[shape_id] [shape_id:UInt32]
-    "###
+    "
     );
 
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----------+
     | shape_id |
     +----------+
@@ -4302,7 +4821,7 @@ async fn unnest_with_redundant_columns() -> Result<()> {
     | 3        |
     | 3        |
     +----------+
-    "###
+    "
     );
 
     Ok(())
@@ -4343,7 +4862,7 @@ async fn unnest_multiple_columns() -> Result<()> {
     // string:      a, b, c, d
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +------+------------+------------+--------+
     | list | large_list | fixed_list | string |
     +------+------------+------------+--------+
@@ -4357,7 +4876,7 @@ async fn unnest_multiple_columns() -> Result<()> {
     |      |            | 4          | c      |
     |      |            |            | d      |
     +------+------------+------------+--------+
-    "###
+    "
     );
 
     // Test with `preserve_nulls = false``
@@ -4374,7 +4893,7 @@ async fn unnest_multiple_columns() -> Result<()> {
     // string:      a, b, c, d
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +------+------------+------------+--------+
     | list | large_list | fixed_list | string |
     +------+------------+------------+--------+
@@ -4387,7 +4906,7 @@ async fn unnest_multiple_columns() -> Result<()> {
     |      |            | 3          | c      |
     |      |            | 4          | c      |
     +------+------------+------------+--------+
-    "###
+    "
     );
 
     Ok(())
@@ -4416,7 +4935,7 @@ async fn unnest_non_nullable_list() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +----+
     | c1 |
     +----+
@@ -4424,7 +4943,7 @@ async fn unnest_non_nullable_list() -> Result<()> {
     | 2  |
     |    |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -4469,7 +4988,7 @@ async fn test_read_batches() -> Result<()> {
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----+--------+
     | id | number |
     +----+--------+
@@ -4482,7 +5001,7 @@ async fn test_read_batches() -> Result<()> {
     | 5  | 3.33   |
     | 5  | 6.66   |
     +----+--------+
-    "###
+    "
     );
     Ok(())
 }
@@ -4503,10 +5022,10 @@ async fn test_read_batches_empty() -> Result<()> {
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     ++
     ++
-    "###
+    "
     );
     Ok(())
 }
@@ -4527,7 +5046,10 @@ async fn consecutive_projection_same_schema() -> Result<()> {
 
     // Add `t` column full of nulls
     let df = df
-        .with_column("t", cast(Expr::Literal(ScalarValue::Null), DataType::Int32))
+        .with_column(
+            "t",
+            cast(Expr::Literal(ScalarValue::Null, None), DataType::Int32),
+        )
         .unwrap();
     df.clone().show().await.unwrap();
 
@@ -4552,14 +5074,14 @@ async fn consecutive_projection_same_schema() -> Result<()> {
     let results = df.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +----+----+----+
     | id | t  | t2 |
     +----+----+----+
     | 0  |    |    |
     | 1  | 10 | 10 |
     +----+----+----+
-    "###
+    "
     );
 
     Ok(())
@@ -4846,7 +5368,7 @@ async fn use_var_provider() -> Result<()> {
         Field::new("bar", DataType::Int64, false),
     ]));
 
-    let mem_table = Arc::new(MemTable::try_new(schema, vec![])?);
+    let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![]])?);
 
     let config = SessionConfig::new()
         .with_target_partitions(4)
@@ -4873,13 +5395,13 @@ async fn test_array_agg() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-------------------------------------+
     | array_agg(test.a)                   |
     +-------------------------------------+
     | [abcDEF, abc123, CBAdef, 123AbcDef] |
     +-------------------------------------+
-    "###
+    "
     );
 
     Ok(())
@@ -4904,11 +5426,11 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     Filter: a = $0 [a:Int32]
       Projection: Int32(1) AS a [a:Int32]
-        EmptyRelation []
-    "###
+        EmptyRelation: rows=1 []
+    "
     );
 
     // Executing LogicalPlans with placeholders that don't have bound values
@@ -4937,20 +5459,20 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     Filter: a = Int32(3) [a:Int32]
       Projection: Int32(1) AS a [a:Int32]
-        EmptyRelation []
-    "###
+        EmptyRelation: rows=1 []
+    "
     );
 
     // N.B., the test is basically `SELECT 1 as a WHERE a = 3;` which returns no results.
     assert_snapshot!(
        batches_to_string(&df.collect().await.unwrap()),
-        @r###"
+        @r"
     ++
     ++
-    "###
+    "
     );
 
     Ok(())
@@ -4968,10 +5490,10 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
+        @r"
     Projection: $1 [$1:Null;N]
-      EmptyRelation []
-    "###
+      EmptyRelation: rows=1 []
+    "
     );
 
     // Executing LogicalPlans with placeholders that don't have bound values
@@ -4998,21 +5520,21 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
-    Projection: Int32(3) AS $1 [$1:Null;N]
-      EmptyRelation []
-    "###
+        @r"
+    Projection: Int32(3) AS $1 [$1:Int32]
+      EmptyRelation: rows=1 []
+    "
     );
 
     assert_snapshot!(
        batches_to_string(&df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----+
     | $1 |
     +----+
     | 3  |
     +----+
-    "###
+    "
     );
 
     Ok(())
@@ -5037,11 +5559,11 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
+        @r#"
     Filter: a LIKE $1 [a:Utf8]
       Projection: Utf8("foo") AS a [a:Utf8]
-        EmptyRelation []
-    "###
+        EmptyRelation: rows=1 []
+    "#
     );
 
     // Executing LogicalPlans with placeholders that don't have bound values
@@ -5070,51 +5592,54 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> {
 
     assert_snapshot!(
         actual,
-        @r###"
+        @r#"
     Filter: a LIKE Utf8("f%") [a:Utf8]
       Projection: Utf8("foo") AS a [a:Utf8]
-        EmptyRelation []
-    "###
+        EmptyRelation: rows=1 []
+    "#
     );
 
     assert_snapshot!(
        batches_to_string(&df.collect().await.unwrap()),
-        @r###"
+        @r"
     +-----+
     | a   |
     +-----+
     | foo |
     +-----+
-    "###
+    "
     );
 
     Ok(())
 }
 
+#[rstest]
+#[case(DataType::Utf8)]
+#[case(DataType::LargeUtf8)]
+#[case(DataType::Utf8View)]
 #[tokio::test]
-async fn write_partitioned_parquet_results() -> Result<()> {
-    // create partitioned input file and context
-    let tmp_dir = TempDir::new()?;
-
-    let ctx = SessionContext::new();
-
+async fn write_partitioned_parquet_results(#[case] string_type: DataType) -> Result<()> {
     // Create an in memory table with schema C1 and C2, both strings
     let schema = Arc::new(Schema::new(vec![
-        Field::new("c1", DataType::Utf8, false),
-        Field::new("c2", DataType::Utf8, false),
+        Field::new("c1", string_type.clone(), false),
+        Field::new("c2", string_type.clone(), false),
     ]));
 
-    let record_batch = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(StringArray::from(vec!["abc", "def"])),
-            Arc::new(StringArray::from(vec!["123", "456"])),
-        ],
-    )?;
+    let columns = [
+        Arc::new(StringArray::from(vec!["abc", "def"])) as ArrayRef,
+        Arc::new(StringArray::from(vec!["123", "456"])) as ArrayRef,
+    ]
+    .map(|col| arrow::compute::cast(&col, &string_type).unwrap())
+    .to_vec();
+
+    let record_batch = RecordBatch::try_new(schema.clone(), columns)?;
 
     let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?);
 
     // Register the table in the context
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+    let ctx = SessionContext::new();
     ctx.register_table("test", mem_table)?;
 
     let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
@@ -5141,16 +5666,17 @@ async fn write_partitioned_parquet_results() -> Result<()> {
 
     // Check that the c2 column is gone and that c1 is abc.
     let results = filter_df.collect().await?;
+    insta::allow_duplicates! {
     assert_snapshot!(
        batches_to_string(&results),
-        @r###"
+        @r"
     +-----+
     | c1  |
     +-----+
     | abc |
     +-----+
-    "###
-    );
+    "
+    )};
 
     // Read the entire set of parquet files
     let df = ctx
@@ -5163,17 +5689,19 @@ async fn write_partitioned_parquet_results() -> Result<()> {
 
     // Check that the df has the entire set of data
     let results = df.collect().await?;
-    assert_snapshot!(
-        batches_to_sort_string(&results),
-        @r###"
+    insta::allow_duplicates! {
+        assert_snapshot!(
+            batches_to_sort_string(&results),
+            @r"
     +-----+-----+
     | c1  | c2  |
     +-----+-----+
     | abc | 123 |
     | def | 456 |
     +-----+-----+
-    "###
-    );
+    "
+        )
+    };
 
     Ok(())
 }
@@ -5284,11 +5812,11 @@ async fn union_literal_is_null_and_not_null() -> Result<()> {
     for batch in batches {
         // Verify schema is the same for all batches
         if !schema.contains(&batch.schema()) {
-            return Err(DataFusionError::Internal(format!(
+            return Err(internal_datafusion_err!(
                 "Schema mismatch. Previously had\n{:#?}\n\nGot:\n{:#?}",
                 &schema,
                 batch.schema()
-            )));
+            ));
         }
     }
 
@@ -5329,7 +5857,7 @@ async fn sparse_union_is_null() {
     // view_all
     assert_snapshot!(
         batches_to_sort_string(&df.clone().collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5340,14 +5868,14 @@ async fn sparse_union_is_null() {
     | {C=a}    |
     | {C=}     |
     +----------+
-    "###
+    "
     );
 
     // filter where is null
     let result_df = df.clone().filter(col("my_union").is_null()).unwrap();
     assert_snapshot!(
         batches_to_sort_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5355,14 +5883,14 @@ async fn sparse_union_is_null() {
     | {B=}     |
     | {C=}     |
     +----------+
-    "###
+    "
     );
 
     // filter where is not null
     let result_df = df.filter(col("my_union").is_not_null()).unwrap();
     assert_snapshot!(
         batches_to_sort_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5370,7 +5898,7 @@ async fn sparse_union_is_null() {
     | {B=3.2}  |
     | {C=a}    |
     +----------+
-    "###
+    "
     );
 }
 
@@ -5412,7 +5940,7 @@ async fn dense_union_is_null() {
     // view_all
     assert_snapshot!(
         batches_to_sort_string(&df.clone().collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5423,14 +5951,14 @@ async fn dense_union_is_null() {
     | {C=a}    |
     | {C=}     |
     +----------+
-    "###
+    "
     );
 
     // filter where is null
     let result_df = df.clone().filter(col("my_union").is_null()).unwrap();
     assert_snapshot!(
         batches_to_sort_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5438,14 +5966,14 @@ async fn dense_union_is_null() {
     | {B=}     |
     | {C=}     |
     +----------+
-    "###
+    "
     );
 
     // filter where is not null
     let result_df = df.filter(col("my_union").is_not_null()).unwrap();
     assert_snapshot!(
         batches_to_sort_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----------+
     | my_union |
     +----------+
@@ -5453,7 +5981,7 @@ async fn dense_union_is_null() {
     | {B=3.2}  |
     | {C=a}    |
     +----------+
-    "###
+    "
     );
 }
 
@@ -5485,7 +6013,7 @@ async fn boolean_dictionary_as_filter() {
     // view_all
     assert_snapshot!(
        batches_to_string(&df.clone().collect().await.unwrap()),
-        @r###"
+        @r"
     +---------+
     | my_dict |
     +---------+
@@ -5497,14 +6025,14 @@ async fn boolean_dictionary_as_filter() {
     | true    |
     | false   |
     +---------+
-    "###
+    "
     );
 
     let result_df = df.clone().filter(col("my_dict")).unwrap();
 
     assert_snapshot!(
        batches_to_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +---------+
     | my_dict |
     +---------+
@@ -5512,7 +6040,7 @@ async fn boolean_dictionary_as_filter() {
     | true    |
     | true    |
     +---------+
-    "###
+    "
     );
 
     // test nested dictionary
@@ -5543,26 +6071,26 @@ async fn boolean_dictionary_as_filter() {
     // view_all
     assert_snapshot!(
        batches_to_string(&df.clone().collect().await.unwrap()),
-        @r###"
+        @r"
     +----------------+
     | my_nested_dict |
     +----------------+
     | true           |
     | false          |
     +----------------+
-    "###
+    "
     );
 
     let result_df = df.clone().filter(col("my_nested_dict")).unwrap();
     assert_snapshot!(
        batches_to_string(&result_df.collect().await.unwrap()),
-        @r###"
+        @r"
     +----------------+
     | my_nested_dict |
     +----------------+
     | true           |
     +----------------+
-    "###
+    "
     );
 }
 
@@ -5630,7 +6158,7 @@ async fn test_alias() -> Result<()> {
         .await?
         .select(vec![col("a"), col("test.b"), lit(1).alias("one")])?
         .alias("table_alias")?;
-    // All ouput column qualifiers are changed to "table_alias"
+    // All output column qualifiers are changed to "table_alias"
     df.schema().columns().iter().for_each(|c| {
         assert_eq!(c.relation, Some("table_alias".into()));
     });
@@ -5640,11 +6168,11 @@ async fn test_alias() -> Result<()> {
         .into_unoptimized_plan()
         .display_indent_schema()
         .to_string();
-    assert_snapshot!(plan, @r###"
+    assert_snapshot!(plan, @r"
     SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32]
       Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]
         TableScan: test [a:Utf8, b:Int32]
-    "###);
+    ");
 
     // Select over the aliased DataFrame
     let df = df.select(vec![
@@ -5653,7 +6181,7 @@ async fn test_alias() -> Result<()> {
     ])?;
     assert_snapshot!(
         batches_to_sort_string(&df.collect().await.unwrap()),
-        @r###"
+        @r"
     +-----------+---------------------------------+
     | a         | table_alias.b + table_alias.one |
     +-----------+---------------------------------+
@@ -5662,7 +6190,7 @@ async fn test_alias() -> Result<()> {
     | abc123    | 11                              |
     | abcDEF    | 2                               |
     +-----------+---------------------------------+
-    "###
+    "
     );
     Ok(())
 }
@@ -5671,6 +6199,7 @@ async fn test_alias() -> Result<()> {
 async fn test_alias_with_metadata() -> Result<()> {
     let mut metadata = HashMap::new();
     metadata.insert(String::from("k"), String::from("v"));
+    let metadata = FieldMetadata::from(metadata);
     let df = create_test_table("test")
         .await?
         .select(vec![col("a").alias_with_metadata("b", Some(metadata))])?
@@ -5691,7 +6220,7 @@ async fn test_alias_self_join() -> Result<()> {
     let joined = left.join(right, JoinType::Full, &["a"], &["a"], None)?;
     assert_snapshot!(
         batches_to_sort_string(&joined.collect().await.unwrap()),
-        @r###"
+        @r"
     +-----------+-----+-----------+-----+
     | a         | b   | a         | b   |
     +-----------+-----+-----------+-----+
@@ -5700,7 +6229,7 @@ async fn test_alias_self_join() -> Result<()> {
     | abc123    | 10  | abc123    | 10  |
     | abcDEF    | 1   | abcDEF    | 1   |
     +-----------+-----+-----------+-----+
-    "###
+    "
     );
     Ok(())
 }
@@ -5713,14 +6242,14 @@ async fn test_alias_empty() -> Result<()> {
         .into_unoptimized_plan()
         .display_indent_schema()
         .to_string();
-    assert_snapshot!(plan, @r###"
+    assert_snapshot!(plan, @r"
     SubqueryAlias:  [a:Utf8, b:Int32]
       TableScan: test [a:Utf8, b:Int32]
-    "###);
+    ");
 
     assert_snapshot!(
         batches_to_sort_string(&df.select(vec![col("a"), col("b")])?.collect().await.unwrap()),
-        @r###"
+        @r"
     +-----------+-----+
     | a         | b   |
     +-----------+-----+
@@ -5729,7 +6258,7 @@ async fn test_alias_empty() -> Result<()> {
     | abc123    | 10  |
     | abcDEF    | 1   |
     +-----------+-----+
-    "###
+    "
     );
 
     Ok(())
@@ -5748,12 +6277,12 @@ async fn test_alias_nested() -> Result<()> {
         .into_optimized_plan()?
         .display_indent_schema()
         .to_string();
-    assert_snapshot!(plan, @r###"
+    assert_snapshot!(plan, @r"
     SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32]
       SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32]
         Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]
           TableScan: test projection=[a, b] [a:Utf8, b:Int32]
-    "###);
+    ");
 
     // Select over the aliased DataFrame
     let select1 = df
@@ -5762,7 +6291,7 @@ async fn test_alias_nested() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&select1.collect().await.unwrap()),
-        @r###"
+        @r"
     +-----------+-----------------------+
     | a         | alias2.b + alias2.one |
     +-----------+-----------------------+
@@ -5771,7 +6300,7 @@ async fn test_alias_nested() -> Result<()> {
     | abc123    | 11                    |
     | abcDEF    | 2                     |
     +-----------+-----------------------+
-    "###
+    "
     );
 
     // Only the outermost alias is visible
@@ -5790,7 +6319,7 @@ async fn register_non_json_file() {
         .register_json(
             "data",
             "tests/data/test_binary.parquet",
-            NdJsonReadOptions::default(),
+            JsonReadOptions::default(),
         )
         .await;
     assert_contains!(
@@ -5891,7 +6420,10 @@ async fn test_insert_into_checking() -> Result<()> {
         .await
         .unwrap_err();
 
-    assert_contains!(e.to_string(), "Inserting query schema mismatch: Expected table field 'a' with type Int64, but got 'column1' with type Utf8");
+    assert_contains!(
+        e.to_string(),
+        "Inserting query schema mismatch: Expected table field 'a' with type Int64, but got 'column1' with type Utf8"
+    );
 
     Ok(())
 }
@@ -5938,7 +6470,7 @@ async fn test_fill_null() -> Result<()> {
     let results = df_filled.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +---+---------+
     | a | b       |
     +---+---------+
@@ -5946,7 +6478,7 @@ async fn test_fill_null() -> Result<()> {
     | 1 | x       |
     | 3 | z       |
     +---+---------+
-    "###
+    "
     );
 
     Ok(())
@@ -5966,7 +6498,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
 
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +---+---------+
     | a | b       |
     +---+---------+
@@ -5974,7 +6506,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
     | 1 | x       |
     | 3 | z       |
     +---+---------+
-    "###
+    "
     );
 
     // Fill column "a" null values with a value that cannot be cast to Int32.
@@ -5983,7 +6515,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
     let results = df_filled.collect().await?;
     assert_snapshot!(
         batches_to_sort_string(&results),
-        @r###"
+        @r"
     +---+---------+
     | a | b       |
     +---+---------+
@@ -5991,7 +6523,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
     | 1 | x       |
     | 3 | z       |
     +---+---------+
-    "###
+    "
     );
     Ok(())
 }
@@ -6000,7 +6532,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
 async fn test_insert_into_casting_support() -> Result<()> {
     // Testing case1:
     // Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Utf8.
-    // And the cast is not supported from Utf8 to Float16.
+    // And the cast is not supported from Binary to Float16.
 
     // Create a new schema with one field called "a" of type Float16, and setting nullable to false
     let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float16, false)]));
@@ -6011,7 +6543,10 @@ async fn test_insert_into_casting_support() -> Result<()> {
     let initial_table = Arc::new(MemTable::try_new(schema.clone(), vec![vec![]])?);
     session_ctx.register_table("t", initial_table.clone())?;
 
-    let mut write_df = session_ctx.sql("values ('a123'), ('b456')").await.unwrap();
+    let mut write_df = session_ctx
+        .sql("values (x'a123'), (x'b456')")
+        .await
+        .unwrap();
 
     write_df = write_df
         .clone()
@@ -6023,7 +6558,10 @@ async fn test_insert_into_casting_support() -> Result<()> {
         .await
         .unwrap_err();
 
-    assert_contains!(e.to_string(), "Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Utf8.");
+    assert_contains!(
+        e.to_string(),
+        "Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Binary."
+    );
 
     // Testing case2:
     // Inserting query schema mismatch: Expected table field 'a' with type Utf8View, but got 'a' with type Utf8.
@@ -6061,14 +6599,14 @@ async fn test_insert_into_casting_support() -> Result<()> {
 
     assert_snapshot!(
        batches_to_string(&res),
-        @r###"
+        @r"
     +------+
     | a    |
     +------+
     | a123 |
     | b456 |
     +------+
-    "###
+    "
     );
     Ok(())
 }
@@ -6131,3 +6669,188 @@ async fn test_dataframe_macro() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn test_copy_schema() -> Result<()> {
+    let tmp_dir = TempDir::new()?;
+
+    let session_state = SessionStateBuilder::new_with_default_features().build();
+
+    let session_ctx = SessionContext::new_with_state(session_state);
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
+
+    // Create and register the source table with the provided schema and data
+    let source_table = Arc::new(MemTable::try_new(schema.clone(), vec![vec![]])?);
+    session_ctx.register_table("source_table", source_table.clone())?;
+
+    let target_path = tmp_dir.path().join("target.csv");
+
+    let query = format!(
+        "COPY source_table TO '{}' STORED AS csv",
+        target_path.to_str().unwrap()
+    );
+
+    let result = session_ctx.sql(&query).await?;
+    assert_logical_expr_schema_eq_physical_expr_schema(result).await?;
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_copy_to_preserves_order() -> Result<()> {
+    let tmp_dir = TempDir::new()?;
+
+    let session_state = SessionStateBuilder::new_with_default_features().build();
+    let session_ctx = SessionContext::new_with_state(session_state);
+
+    let target_path = tmp_dir.path().join("target_ordered.csv");
+    let csv_file_format = session_ctx
+        .state()
+        .get_file_format_factory("csv")
+        .map(format_as_file_type)
+        .unwrap();
+
+    let ordered_select_plan = LogicalPlanBuilder::values(vec![
+        vec![lit(1u64)],
+        vec![lit(10u64)],
+        vec![lit(20u64)],
+        vec![lit(100u64)],
+    ])?
+    .sort(vec![SortExpr::new(col("column1"), false, true)])?
+    .build()?;
+
+    let copy_to_plan = LogicalPlanBuilder::copy_to(
+        ordered_select_plan,
+        target_path.to_str().unwrap().to_string(),
+        csv_file_format,
+        HashMap::new(),
+        vec![],
+    )?
+    .build()?;
+
+    let union_side_branch = LogicalPlanBuilder::values(vec![vec![lit(1u64)]])?.build()?;
+    let union_plan = LogicalPlanBuilder::from(copy_to_plan)
+        .union(union_side_branch)?
+        .build()?;
+
+    let frame = session_ctx.execute_logical_plan(union_plan).await?;
+    let physical_plan = frame.create_physical_plan().await?;
+
+    let physical_plan_format =
+        displayable(physical_plan.as_ref()).indent(true).to_string();
+
+    // Expect that input to the DataSinkExec is sorted correctly
+    assert_snapshot!(
+        physical_plan_format,
+        @r"
+    UnionExec
+      DataSinkExec: sink=CsvSink(file_groups=[])
+        SortExec: expr=[column1@0 DESC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[1]
+      DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // Simple schema with just the fields we need
+    let file_schema = Arc::new(Schema::new(vec![
+        Field::new(
+            "timestamp",
+            DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
+            true,
+        ),
+        Field::new("ticker", DataType::Utf8, true),
+        Field::new("value", DataType::Float64, true),
+        Field::new("date", DataType::Utf8, false),
+    ]));
+
+    let df_schema = DFSchema::try_from(file_schema.clone())?;
+
+    let timestamp = col("timestamp");
+    let value = col("value");
+    let ticker = col("ticker");
+    let date = col("date");
+
+    let mock_exec = Arc::new(EmptyExec::new(file_schema.clone()));
+
+    // Build first_value aggregate
+    let first_value = Arc::new(
+        AggregateExprBuilder::new(
+            datafusion_functions_aggregate::first_last::first_value_udaf(),
+            vec![ctx.create_physical_expr(value.clone(), &df_schema)?],
+        )
+        .alias("first_value(value)")
+        .order_by(vec![PhysicalSortExpr::new(
+            ctx.create_physical_expr(timestamp.clone(), &df_schema)?,
+            SortOptions::new(false, false),
+        )])
+        .schema(file_schema.clone())
+        .build()
+        .expect("Failed to build first_value"),
+    );
+
+    // Build last_value aggregate
+    let last_value = Arc::new(
+        AggregateExprBuilder::new(
+            datafusion_functions_aggregate::first_last::last_value_udaf(),
+            vec![ctx.create_physical_expr(value.clone(), &df_schema)?],
+        )
+        .alias("last_value(value)")
+        .order_by(vec![PhysicalSortExpr::new(
+            ctx.create_physical_expr(timestamp.clone(), &df_schema)?,
+            SortOptions::new(false, false),
+        )])
+        .schema(file_schema.clone())
+        .build()
+        .expect("Failed to build last_value"),
+    );
+
+    let partial_agg = AggregateExec::try_new(
+        AggregateMode::Partial,
+        PhysicalGroupBy::new_single(vec![
+            (
+                ctx.create_physical_expr(date.clone(), &df_schema)?,
+                "date".to_string(),
+            ),
+            (
+                ctx.create_physical_expr(ticker.clone(), &df_schema)?,
+                "ticker".to_string(),
+            ),
+        ]),
+        vec![first_value, last_value],
+        vec![None, None],
+        mock_exec,
+        file_schema,
+    )
+    .expect("Failed to build partial agg");
+
+    // Assert that the schema field names match the expected names
+    let expected_field_names = vec![
+        "date",
+        "ticker",
+        "first_value(value)[first_value]",
+        "timestamp@0",
+        "first_value(value)[first_value_is_set]",
+        "last_value(value)[last_value]",
+        "timestamp@0",
+        "last_value(value)[last_value_is_set]",
+    ];
+
+    let binding = partial_agg.schema();
+    let actual_field_names: Vec<_> = binding.fields().iter().map(|f| f.name()).collect();
+    assert_eq!(actual_field_names, expected_field_names);
+
+    // Ensure that DFSchema::try_from does not fail
+    let partial_agg_exec_schema = DFSchema::try_from(partial_agg.schema());
+    assert!(
+        partial_agg_exec_schema.is_ok(),
+        "Expected get AggregateExec schema to succeed with duplicate state fields"
+    );
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/datasource/csv.rs b/datafusion/core/tests/datasource/csv.rs
new file mode 100644
index 0000000000000..2e1daa113b096
--- /dev/null
+++ b/datafusion/core/tests/datasource/csv.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Test for CSV schema inference with different column counts (GitHub issue #17516)
+
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use datafusion_common::test_util::batches_to_sort_string;
+use insta::assert_snapshot;
+use std::fs;
+use tempfile::TempDir;
+
+#[tokio::test]
+async fn test_csv_schema_inference_different_column_counts() -> Result<()> {
+    // Create temporary directory for test files
+    let temp_dir = TempDir::new().expect("Failed to create temp dir");
+    let temp_path = temp_dir.path();
+
+    // Create CSV file 1 with 3 columns (simulating older railway services format)
+    let csv1_content = r#"service_id,route_type,agency_id
+1,bus,agency1
+2,rail,agency2
+3,bus,agency3
+"#;
+    fs::write(temp_path.join("services_2024.csv"), csv1_content)?;
+
+    // Create CSV file 2 with 6 columns (simulating newer railway services format)
+    let csv2_content = r#"service_id,route_type,agency_id,stop_platform_change,stop_planned_platform,stop_actual_platform
+4,rail,agency2,true,Platform A,Platform B
+5,bus,agency1,false,Stop 1,Stop 1
+6,rail,agency3,true,Platform C,Platform D
+"#;
+    fs::write(temp_path.join("services_2025.csv"), csv2_content)?;
+
+    // Create DataFusion context
+    let ctx = SessionContext::new();
+
+    // This should now work (previously would have failed with column count mismatch)
+    // Enable truncated_rows to handle files with different column counts
+    let df = ctx
+        .read_csv(
+            temp_path.to_str().unwrap(),
+            CsvReadOptions::new().truncated_rows(true),
+        )
+        .await
+        .expect("Should successfully read CSV directory with different column counts");
+
+    // Verify the schema contains all 6 columns (union of both files)
+    let df_clone = df.clone();
+    let schema = df_clone.schema();
+    assert_eq!(
+        schema.fields().len(),
+        6,
+        "Schema should contain all 6 columns"
+    );
+
+    // Check that we have all expected columns
+    let field_names: Vec<&str> =
+        schema.fields().iter().map(|f| f.name().as_str()).collect();
+    assert!(field_names.contains(&"service_id"));
+    assert!(field_names.contains(&"route_type"));
+    assert!(field_names.contains(&"agency_id"));
+    assert!(field_names.contains(&"stop_platform_change"));
+    assert!(field_names.contains(&"stop_planned_platform"));
+    assert!(field_names.contains(&"stop_actual_platform"));
+
+    // All fields should be nullable since they don't appear in all files
+    for field in schema.fields() {
+        assert!(
+            field.is_nullable(),
+            "Field {} should be nullable",
+            field.name()
+        );
+    }
+
+    // Verify we can actually read the data
+    let results = df.collect().await?;
+
+    // Calculate total rows across all batches
+    let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
+    assert_eq!(total_rows, 6, "Should have 6 total rows across all batches");
+
+    // All batches should have 6 columns (the union schema)
+    for batch in &results {
+        assert_eq!(batch.num_columns(), 6, "All batches should have 6 columns");
+        assert_eq!(
+            batch.schema().fields().len(),
+            6,
+            "Each batch should use the union schema with 6 fields"
+        );
+    }
+
+    // Verify the actual content of the data using snapshot testing
+    assert_snapshot!(batches_to_sort_string(&results), @r"
+    +------------+------------+-----------+----------------------+-----------------------+----------------------+
+    | service_id | route_type | agency_id | stop_platform_change | stop_planned_platform | stop_actual_platform |
+    +------------+------------+-----------+----------------------+-----------------------+----------------------+
+    | 1          | bus        | agency1   |                      |                       |                      |
+    | 2          | rail       | agency2   |                      |                       |                      |
+    | 3          | bus        | agency3   |                      |                       |                      |
+    | 4          | rail       | agency2   | true                 | Platform A            | Platform B           |
+    | 5          | bus        | agency1   | false                | Stop 1                | Stop 1               |
+    | 6          | rail       | agency3   | true                 | Platform C            | Platform D           |
+    +------------+------------+-----------+----------------------+-----------------------+----------------------+
+    ");
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/datasource/mod.rs b/datafusion/core/tests/datasource/mod.rs
new file mode 100644
index 0000000000000..3785aa0766182
--- /dev/null
+++ b/datafusion/core/tests/datasource/mod.rs
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for various DataSources
+//!
+//! Note tests for the Parquet format are in `parquet_integration` binary
+
+// Include tests in csv module
+mod csv;
+mod object_store_access;
diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs
new file mode 100644
index 0000000000000..30654c687f8d2
--- /dev/null
+++ b/datafusion/core/tests/datasource/object_store_access.rs
@@ -0,0 +1,971 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for object store access patterns with [`ListingTable`]\
+//!
+//! These tests setup a `ListingTable` backed by an in-memory object store
+//! that counts the number of requests made against it and then do
+//! various operations (table creation, queries with and without predicates)
+//! to verify the expected object store access patterns.
+//!
+//! [`ListingTable`]: datafusion::datasource::listing::ListingTable
+
+use arrow::array::{ArrayRef, Int32Array, RecordBatch};
+use async_trait::async_trait;
+use bytes::Bytes;
+use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext};
+use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_datasource_csv::CsvFormat;
+use futures::stream::BoxStream;
+use insta::assert_snapshot;
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{
+    CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload,
+    ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload,
+    PutResult,
+};
+use parking_lot::Mutex;
+use std::fmt;
+use std::fmt::{Display, Formatter};
+use std::ops::Range;
+use std::sync::Arc;
+use url::Url;
+
+#[tokio::test]
+async fn create_single_csv_file() {
+    let test = Test::new().with_single_file_csv().await;
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=csv_table.csv head=true
+    - GET  (opts) path=csv_table.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_single_csv_file() {
+    let test = Test::new().with_single_file_csv().await;
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.00001 | 5e-12 | true  |
+    | 0.00002 | 4e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=csv_table.csv head=true
+    - GET  (opts) path=csv_table.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_multi_file_csv_file() {
+    let test = Test::new().with_multi_file_csv().await;
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 4
+    - LIST prefix=data
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn multi_query_multi_file_csv_file() {
+    let test = Test::new().with_multi_file_csv().await;
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.0     | 0.0   | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00001 | 1e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00002 | 2e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
+
+    // Force a cache eviction by removing the data limit for the cache
+    assert_snapshot!(
+        test.query("set datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+        @r"
+    ------- Query Output (0 rows) -------
+    ++
+    ++
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 0
+    "
+    );
+
+    // Then re-enable the cache
+    assert_snapshot!(
+        test.query("set datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+        @r"
+    ------- Query Output (0 rows) -------
+    ++
+    ++
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 0
+    "
+    );
+
+    // this query should list the table since the cache entries were evicted
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.0     | 0.0   | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00001 | 1e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00002 | 2e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 4
+    - LIST prefix=data
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
+
+    // this query should not list the table since the entries were added in the previous query
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.0     | 0.0   | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00001 | 1e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00002 | 2e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_multi_csv_file() {
+    let test = Test::new().with_multi_file_csv().await;
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.0     | 0.0   | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00001 | 1e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00002 | 2e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_partitioned_csv_file() {
+    let test = Test::new().with_partitioned_csv().await;
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00001 | 1e-12 | true  | 1 | 10 | 100 |
+    | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+    | 0.00002 | 2e-12 | true  | 2 | 20 | 200 |
+    | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+    | 0.00003 | 3e-12 | true  | 3 | 30 | 300 |
+    | 0.00003 | 5e-12 | false | 3 | 30 | 300 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=data/a=1/b=10/c=100/file_1.csv
+    - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
+    - GET  (opts) path=data/a=3/b=30/c=300/file_3.csv
+    "
+    );
+
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned WHERE a=2").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00002 | 2e-12 | true  | 2 | 20 | 200 |
+    | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 1
+    - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
+    "
+    );
+
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned WHERE b=20").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00002 | 2e-12 | true  | 2 | 20 | 200 |
+    | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 1
+    - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
+    "
+    );
+
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned WHERE c=200").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00002 | 2e-12 | true  | 2 | 20 | 200 |
+    | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 1
+    - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
+    "
+    );
+
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned WHERE a=2 AND b=20").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00002 | 2e-12 | true  | 2 | 20 | 200 |
+    | 0.00003 | 5e-12 | false | 2 | 20 | 200 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 1
+    - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
+    "
+    );
+
+    assert_snapshot!(
+        test.query("select * from csv_table_partitioned WHERE a<2 AND b=10 AND c=100").await,
+        @r"
+    ------- Query Output (2 rows) -------
+    +---------+-------+-------+---+----+-----+
+    | d1      | d2    | d3    | a | b  | c   |
+    +---------+-------+-------+---+----+-----+
+    | 0.00001 | 1e-12 | true  | 1 | 10 | 100 |
+    | 0.00003 | 5e-12 | false | 1 | 10 | 100 |
+    +---------+-------+-------+---+----+-----+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 1
+    - GET  (opts) path=data/a=1/b=10/c=100/file_1.csv
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_default() {
+    // The default metadata size hint is 512KB
+    // which is enough to fetch the entire footer metadata and PageIndex
+    // in a single GET request.
+    let test = Test::new().with_single_file_parquet().await;
+    // expect 1 get request which reads the footer metadata and page index
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (opts) path=parquet_table.parquet range=0-2994
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_prefetch() {
+    // Explicitly specify a prefetch hint that is adequate for the footer and page index
+    let test = Test::new()
+        .with_parquet_metadata_size_hint(Some(1000))
+        .with_single_file_parquet()
+        .await;
+    // expect 1 1000 byte request which reads the footer metadata and page index
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (opts) path=parquet_table.parquet range=1994-2994
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_too_small_prefetch() {
+    // configure a prefetch size that is too small to fetch the footer
+    // metadata
+    //
+    // Using the ranges from  the test below (with no_prefetch),
+    // pick a number less than 730:
+    // --------
+    // 2286-2294: (8 bytes) footer + length
+    // 2264-2986: (722 bytes) footer metadata
+    let test = Test::new()
+        .with_parquet_metadata_size_hint(Some(500))
+        .with_single_file_parquet()
+        .await;
+    // expect three get requests:
+    // 1. read the footer (500 bytes per hint, not enough for the footer metadata)
+    // 2. Read the footer metadata
+    // 3. reads the PageIndex
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 4
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (opts) path=parquet_table.parquet range=2494-2994
+    - GET  (opts) path=parquet_table.parquet range=2264-2986
+    - GET  (opts) path=parquet_table.parquet range=2124-2264
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_small_prefetch() {
+    // configure a prefetch size that is large enough for the footer
+    // metadata but **not** the PageIndex
+    //
+    // Using the ranges from the test below (with no_prefetch),
+    // the 730 is determined as follows;
+    // --------
+    // 2286-2294: (8 bytes) footer + length
+    // 2264-2986: (722 bytes) footer metadata
+    let test = Test::new()
+        // 740 is enough to get both the footer + length (8 bytes)
+        // but not the entire PageIndex
+        .with_parquet_metadata_size_hint(Some(740))
+        .with_single_file_parquet()
+        .await;
+    // expect two get requests:
+    // 1. read the footer metadata
+    // 2. reads the PageIndex
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (opts) path=parquet_table.parquet range=2254-2994
+    - GET  (opts) path=parquet_table.parquet range=2124-2264
+    "
+    );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_no_prefetch() {
+    let test = Test::new()
+        // force no prefetch by setting size hint to None
+        .with_parquet_metadata_size_hint(None)
+        .with_single_file_parquet()
+        .await;
+    // Without a metadata size hint, the parquet reader
+    // does *three* range requests to read the footer metadata:
+    // 1. The footer length (last 8 bytes)
+    // 2. The footer metadata
+    // 3. The PageIndex metadata
+    assert_snapshot!(
+        test.requests(),
+        @r"
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (opts) path=parquet_table.parquet range=0-2994
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_single_parquet_file() {
+    let test = Test::new().with_single_file_parquet().await;
+    assert_snapshot!(
+        test.query("select count(distinct a), count(b) from parquet_table").await,
+        @r"
+    ------- Query Output (1 rows) -------
+    +---------------------------------+------------------------+
+    | count(DISTINCT parquet_table.a) | count(parquet_table.b) |
+    +---------------------------------+------------------------+
+    | 200                             | 200                    |
+    +---------------------------------+------------------------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (ranges) path=parquet_table.parquet ranges=4-534,534-1064
+    - GET  (ranges) path=parquet_table.parquet ranges=1064-1594,1594-2124
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_single_parquet_file_with_single_predicate() {
+    let test = Test::new().with_single_file_parquet().await;
+    // Note that evaluating predicates requires additional object store requests
+    // (to evaluate predicates)
+    assert_snapshot!(
+        test.query("select min(a), max(b) from parquet_table WHERE a > 150").await,
+        @r"
+    ------- Query Output (1 rows) -------
+    +----------------------+----------------------+
+    | min(parquet_table.a) | max(parquet_table.b) |
+    +----------------------+----------------------+
+    | 151                  | 1199                 |
+    +----------------------+----------------------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 2
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124
+    "
+    );
+}
+
+#[tokio::test]
+async fn query_single_parquet_file_multi_row_groups_multiple_predicates() {
+    let test = Test::new().with_single_file_parquet().await;
+
+    // Note that evaluating predicates requires additional object store requests
+    // (to evaluate predicates)
+    assert_snapshot!(
+        test.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await,
+        @r"
+    ------- Query Output (1 rows) -------
+    +----------------------+----------------------+
+    | min(parquet_table.a) | max(parquet_table.b) |
+    +----------------------+----------------------+
+    | 51                   | 1149                 |
+    +----------------------+----------------------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=parquet_table.parquet head=true
+    - GET  (ranges) path=parquet_table.parquet ranges=4-421,421-534,534-951,951-1064
+    - GET  (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124
+    "
+    );
+}
+
+/// Runs tests with a request counting object store
+struct Test {
+    object_store: Arc<RequestCountingObjectStore>,
+    session_context: SessionContext,
+    /// metadata size hint to use when registering parquet files
+    ///
+    /// * `None`: uses the default (does not set a size_hint)
+    /// * `Some(None)`L: set prefetch hint to None (prefetching)
+    /// * `Some(Some(size))`: set prefetch hint to size
+    parquet_metadata_size_hint: Option<Option<usize>>,
+}
+
+impl Test {
+    fn new() -> Self {
+        let object_store = Arc::new(RequestCountingObjectStore::new());
+        let session_context = SessionContext::new();
+        session_context
+            .runtime_env()
+            .register_object_store(&Url::parse("mem://").unwrap(), object_store.clone());
+        Self {
+            object_store,
+            session_context,
+            parquet_metadata_size_hint: None,
+        }
+    }
+
+    /// Specify the metadata size hint to use when registering parquet files
+    fn with_parquet_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
+        self.parquet_metadata_size_hint = Some(size_hint);
+        self
+    }
+
+    /// Returns a string representation of all recorded requests thus far
+    fn requests(&self) -> String {
+        format!("{}", self.object_store)
+    }
+
+    /// Store the specified bytes at the given path
+    async fn with_bytes(self, path: &str, bytes: impl Into<Bytes>) -> Self {
+        let path = Path::from(path);
+        self.object_store
+            .inner
+            .put(&path, PutPayload::from(bytes.into()))
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a CSV file at the given path
+    async fn register_csv(self, table_name: &str, path: &str) -> Self {
+        let mut options = CsvReadOptions::new();
+        options.has_header = true;
+        let url = format!("mem://{path}");
+        self.session_context
+            .register_csv(table_name, url, options)
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a partitioned CSV table at the given path
+    async fn register_partitioned_csv(self, table_name: &str, path: &str) -> Self {
+        let file_format = Arc::new(CsvFormat::default().with_has_header(true));
+        let options = ListingOptions::new(file_format);
+
+        let url = format!("mem://{path}").parse().unwrap();
+        let table_url = ListingTableUrl::try_new(url, None).unwrap();
+
+        let session_state = self.session_context.state();
+        let mut config = ListingTableConfig::new(table_url).with_listing_options(options);
+        config = config
+            .infer_partitions_from_path(&session_state)
+            .await
+            .unwrap();
+        config = config.infer_schema(&session_state).await.unwrap();
+
+        let table = Arc::new(ListingTable::try_new(config).unwrap());
+        self.session_context
+            .register_table(table_name, table)
+            .unwrap();
+        self
+    }
+
+    /// Register a Parquet file at the given path
+    async fn register_parquet(self, table_name: &str, path: &str) -> Self {
+        let path = format!("mem://{path}");
+        let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new();
+
+        // If a metadata size hint was specified, apply it
+        if let Some(parquet_metadata_size_hint) = self.parquet_metadata_size_hint {
+            options = options.metadata_size_hint(parquet_metadata_size_hint);
+        }
+
+        self.session_context
+            .register_parquet(table_name, path, options)
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a single CSV file with three columns and two row named
+    /// `csv_table`
+    async fn with_single_file_csv(self) -> Test {
+        // upload CSV data to object store
+        let csv_data = r#"c1,c2,c3
+0.00001,5e-12,true
+0.00002,4e-12,false
+"#;
+        self.with_bytes("/csv_table.csv", csv_data)
+            .await
+            .register_csv("csv_table", "/csv_table.csv")
+            .await
+    }
+
+    /// Register three CSV files in a directory, called `csv_table`
+    async fn with_multi_file_csv(mut self) -> Test {
+        // upload CSV data to object store
+        for i in 0..3 {
+            let csv_data1 = format!(
+                r#"c1,c2,c3
+0.0000{i},{i}e-12,true
+0.00003,5e-12,false
+"#
+            );
+            self = self
+                .with_bytes(&format!("/data/file_{i}.csv"), csv_data1)
+                .await;
+        }
+        // register table
+        self.register_csv("csv_table", "/data/").await
+    }
+
+    /// Register three CSV files in a partitioned directory structure, called
+    /// `csv_table_partitioned`
+    async fn with_partitioned_csv(mut self) -> Test {
+        for i in 1..4 {
+            // upload CSV data to object store
+            let csv_data1 = format!(
+                r#"d1,d2,d3
+0.0000{i},{i}e-12,true
+0.00003,5e-12,false
+"#
+            );
+            self = self
+                .with_bytes(
+                    &format!("/data/a={i}/b={}/c={}/file_{i}.csv", i * 10, i * 100,),
+                    csv_data1,
+                )
+                .await;
+        }
+        // register table
+        self.register_partitioned_csv("csv_table_partitioned", "/data/")
+            .await
+    }
+
+    /// Add a single parquet file that has two columns and two row groups named `parquet_table`
+    ///
+    /// Column "a": Int32 with values 0-100] in row group 1
+    /// and [101-200] in row group 2
+    ///
+    /// Column "b": Int32 with values 1000-1100] in row group 1
+    /// and [1101-1200] in row group 2
+    async fn with_single_file_parquet(self) -> Test {
+        // Create parquet bytes
+        let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200));
+        let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200));
+        let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap();
+
+        let mut buffer = vec![];
+        let props = parquet::file::properties::WriterProperties::builder()
+            .set_max_row_group_row_count(Some(100))
+            .build();
+        let mut writer = parquet::arrow::ArrowWriter::try_new(
+            &mut buffer,
+            batch.schema(),
+            Some(props),
+        )
+        .unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        self.with_bytes("/parquet_table.parquet", buffer)
+            .await
+            .register_parquet("parquet_table", "/parquet_table.parquet")
+            .await
+    }
+
+    /// Runs the specified query and returns a string representation of the results
+    /// suitable for comparison with insta snapshots
+    ///
+    /// Clears all recorded requests before running the query
+    async fn query(&self, sql: &str) -> String {
+        self.object_store.clear_requests();
+        let results = self
+            .session_context
+            .sql(sql)
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+
+        let num_rows = results.iter().map(|batch| batch.num_rows()).sum::<usize>();
+        let formatted_result =
+            arrow::util::pretty::pretty_format_batches(&results).unwrap();
+
+        let object_store = &self.object_store;
+
+        format!(
+            r#"------- Query Output ({num_rows} rows) -------
+{formatted_result}
+------- Object Store Request Summary -------
+{object_store}
+"#
+        )
+    }
+}
+
+/// Details of individual requests made through the [`RequestCountingObjectStore`]
+#[derive(Clone, Debug)]
+enum RequestDetails {
+    GetOpts { path: Path, get_options: GetOptions },
+    GetRanges { path: Path, ranges: Vec<Range<u64>> },
+    List { prefix: Option<Path> },
+    ListWithDelimiter { prefix: Option<Path> },
+    ListWithOffset { prefix: Option<Path>, offset: Path },
+}
+
+fn display_range(range: &Range<u64>) -> impl Display + '_ {
+    struct Wrapper<'a>(&'a Range<u64>);
+    impl Display for Wrapper<'_> {
+        fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+            write!(f, "{}-{}", self.0.start, self.0.end)
+        }
+    }
+    Wrapper(range)
+}
+impl Display for RequestDetails {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        match self {
+            RequestDetails::GetOpts { path, get_options } => {
+                write!(f, "GET  (opts) path={path}")?;
+                if let Some(range) = &get_options.range {
+                    match range {
+                        GetRange::Bounded(range) => {
+                            let range = display_range(range);
+                            write!(f, " range={range}")?;
+                        }
+                        GetRange::Offset(offset) => {
+                            write!(f, " range=offset:{offset}")?;
+                        }
+                        GetRange::Suffix(suffix) => {
+                            write!(f, " range=suffix:{suffix}")?;
+                        }
+                    }
+                }
+                if let Some(version) = &get_options.version {
+                    write!(f, " version={version}")?;
+                }
+                if get_options.head {
+                    write!(f, " head=true")?;
+                }
+                Ok(())
+            }
+            RequestDetails::GetRanges { path, ranges } => {
+                write!(f, "GET  (ranges) path={path}")?;
+                if !ranges.is_empty() {
+                    write!(f, " ranges=")?;
+                    for (i, range) in ranges.iter().enumerate() {
+                        if i > 0 {
+                            write!(f, ",")?;
+                        }
+                        write!(f, "{}", display_range(range))?;
+                    }
+                }
+                Ok(())
+            }
+            RequestDetails::List { prefix } => {
+                write!(f, "LIST")?;
+                if let Some(prefix) = prefix {
+                    write!(f, " prefix={prefix}")?;
+                }
+                Ok(())
+            }
+            RequestDetails::ListWithDelimiter { prefix } => {
+                write!(f, "LIST (with delimiter)")?;
+                if let Some(prefix) = prefix {
+                    write!(f, " prefix={prefix}")?;
+                }
+                Ok(())
+            }
+            RequestDetails::ListWithOffset { prefix, offset } => {
+                write!(f, "LIST (with offset) offset={offset}")?;
+                if let Some(prefix) = prefix {
+                    write!(f, " prefix={prefix}")?;
+                }
+                Ok(())
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+struct RequestCountingObjectStore {
+    /// Inner (memory) store
+    inner: Arc<dyn ObjectStore>,
+    requests: Mutex<Vec<RequestDetails>>,
+}
+
+impl Display for RequestCountingObjectStore {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "RequestCountingObjectStore()")?;
+        let requests = self.recorded_requests();
+        write!(f, "\nTotal Requests: {}", requests.len())?;
+        for request in requests {
+            write!(f, "\n- {request}")?;
+        }
+        Ok(())
+    }
+}
+
+impl RequestCountingObjectStore {
+    pub fn new() -> Self {
+        let inner = Arc::new(InMemory::new());
+        Self {
+            inner,
+            requests: Mutex::new(vec![]),
+        }
+    }
+
+    pub fn clear_requests(&self) {
+        self.requests.lock().clear();
+    }
+
+    /// Return a copy of the recorded requests normalized
+    /// by removing the path prefix
+    pub fn recorded_requests(&self) -> Vec<RequestDetails> {
+        self.requests.lock().to_vec()
+    }
+}
+
+#[async_trait]
+impl ObjectStore for RequestCountingObjectStore {
+    async fn put_opts(
+        &self,
+        _location: &Path,
+        _payload: PutPayload,
+        _opts: PutOptions,
+    ) -> object_store::Result<PutResult> {
+        unimplemented!()
+    }
+
+    async fn put_multipart_opts(
+        &self,
+        _location: &Path,
+        _opts: PutMultipartOptions,
+    ) -> object_store::Result<Box<dyn MultipartUpload>> {
+        unimplemented!()
+    }
+
+    async fn get_opts(
+        &self,
+        location: &Path,
+        options: GetOptions,
+    ) -> object_store::Result<GetResult> {
+        let result = self.inner.get_opts(location, options.clone()).await?;
+        self.requests.lock().push(RequestDetails::GetOpts {
+            path: location.to_owned(),
+            get_options: options,
+        });
+        Ok(result)
+    }
+
+    async fn get_ranges(
+        &self,
+        location: &Path,
+        ranges: &[Range<u64>],
+    ) -> object_store::Result<Vec<Bytes>> {
+        let result = self.inner.get_ranges(location, ranges).await?;
+        self.requests.lock().push(RequestDetails::GetRanges {
+            path: location.to_owned(),
+            ranges: ranges.to_vec(),
+        });
+        Ok(result)
+    }
+
+    fn list(
+        &self,
+        prefix: Option<&Path>,
+    ) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
+        self.requests.lock().push(RequestDetails::List {
+            prefix: prefix.map(|p| p.to_owned()),
+        });
+
+        self.inner.list(prefix)
+    }
+
+    fn list_with_offset(
+        &self,
+        prefix: Option<&Path>,
+        offset: &Path,
+    ) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
+        self.requests.lock().push(RequestDetails::ListWithOffset {
+            prefix: prefix.map(|p| p.to_owned()),
+            offset: offset.to_owned(),
+        });
+        self.inner.list_with_offset(prefix, offset)
+    }
+
+    async fn list_with_delimiter(
+        &self,
+        prefix: Option<&Path>,
+    ) -> object_store::Result<ListResult> {
+        self.requests
+            .lock()
+            .push(RequestDetails::ListWithDelimiter {
+                prefix: prefix.map(|p| p.to_owned()),
+            });
+        self.inner.list_with_delimiter(prefix).await
+    }
+
+    fn delete_stream(
+        &self,
+        _locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
+        unimplemented!()
+    }
+
+    async fn copy_opts(
+        &self,
+        _from: &Path,
+        _to: &Path,
+        _options: CopyOptions,
+    ) -> object_store::Result<()> {
+        unimplemented!()
+    }
+}
diff --git a/datafusion/core/tests/execution/coop.rs b/datafusion/core/tests/execution/coop.rs
new file mode 100644
index 0000000000000..e02364a0530cc
--- /dev/null
+++ b/datafusion/core/tests/execution/coop.rs
@@ -0,0 +1,835 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Int64Array, RecordBatch};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow_schema::SortOptions;
+use datafusion::common::NullEquality;
+use datafusion::functions_aggregate::sum;
+use datafusion::physical_expr::aggregate::AggregateExprBuilder;
+use datafusion::physical_plan;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::aggregates::{
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
+};
+use datafusion::physical_plan::execution_plan::Boundedness;
+use datafusion::prelude::SessionContext;
+use datafusion_common::{DataFusionError, JoinType, ScalarValue, exec_datafusion_err};
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_expr_common::operator::Operator;
+use datafusion_expr_common::operator::Operator::{Divide, Eq, Gt, Modulo};
+use datafusion_functions_aggregate::min_max;
+use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::expressions::{
+    BinaryExpr, Column, Literal, binary, col, lit,
+};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::ensure_coop::EnsureCooperative;
+use datafusion_physical_plan::coop::make_cooperative;
+use datafusion_physical_plan::filter::FilterExec;
+use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec};
+use datafusion_physical_plan::memory::{LazyBatchGenerator, LazyMemoryExec};
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion_physical_plan::union::InterleaveExec;
+use futures::StreamExt;
+use parking_lot::RwLock;
+use rstest::rstest;
+use std::any::Any;
+use std::error::Error;
+use std::fmt::Formatter;
+use std::ops::Range;
+use std::sync::Arc;
+use std::task::Poll;
+use std::time::Duration;
+use tokio::runtime::{Handle, Runtime};
+use tokio::select;
+
+#[derive(Debug, Clone)]
+struct RangeBatchGenerator {
+    schema: SchemaRef,
+    value_range: Range<i64>,
+    boundedness: Boundedness,
+    batch_size: usize,
+    poll_count: usize,
+    original_range: Range<i64>,
+}
+
+impl std::fmt::Display for RangeBatchGenerator {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        // Display current counter
+        write!(f, "InfiniteGenerator(counter={})", self.poll_count)
+    }
+}
+
+impl LazyBatchGenerator for RangeBatchGenerator {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn boundedness(&self) -> Boundedness {
+        self.boundedness
+    }
+
+    /// Generate the next RecordBatch.
+    fn generate_next_batch(&mut self) -> datafusion_common::Result<Option<RecordBatch>> {
+        self.poll_count += 1;
+
+        let mut builder = Int64Array::builder(self.batch_size);
+        for _ in 0..self.batch_size {
+            match self.value_range.next() {
+                None => break,
+                Some(v) => builder.append_value(v),
+            }
+        }
+        let array = builder.finish();
+
+        if array.is_empty() {
+            return Ok(None);
+        }
+
+        let batch =
+            RecordBatch::try_new(Arc::clone(&self.schema), vec![Arc::new(array)])?;
+        Ok(Some(batch))
+    }
+
+    fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>> {
+        let mut new = self.clone();
+        new.poll_count = 0;
+        new.value_range = new.original_range.clone();
+        Arc::new(RwLock::new(new))
+    }
+}
+
+fn make_lazy_exec(column_name: &str, pretend_infinite: bool) -> LazyMemoryExec {
+    make_lazy_exec_with_range(column_name, i64::MIN..i64::MAX, pretend_infinite)
+}
+
+fn make_lazy_exec_with_range(
+    column_name: &str,
+    range: Range<i64>,
+    pretend_infinite: bool,
+) -> LazyMemoryExec {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        column_name,
+        DataType::Int64,
+        false,
+    )]));
+
+    let boundedness = if pretend_infinite {
+        Boundedness::Unbounded {
+            requires_infinite_memory: false,
+        }
+    } else {
+        Boundedness::Bounded
+    };
+
+    // Instantiate the generator with the batch and limit
+    let batch_gen = RangeBatchGenerator {
+        schema: Arc::clone(&schema),
+        boundedness,
+        value_range: range.clone(),
+        batch_size: 8192,
+        poll_count: 0,
+        original_range: range,
+    };
+
+    // Wrap the generator in a trait object behind Arc<RwLock<_>>
+    let generator: Arc<RwLock<dyn LazyBatchGenerator>> = Arc::new(RwLock::new(batch_gen));
+
+    // Create a LazyMemoryExec with one partition using our generator
+    let mut exec = LazyMemoryExec::try_new(schema, vec![generator]).unwrap();
+
+    exec.add_ordering(vec![PhysicalSortExpr::new(
+        Arc::new(Column::new(column_name, 0)),
+        SortOptions::new(false, true),
+    )]);
+
+    exec
+}
+
+#[rstest]
+#[tokio::test]
+async fn agg_no_grouping_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up an aggregation without grouping
+    let inf = Arc::new(make_lazy_exec("value", pretend_infinite));
+    let aggr = Arc::new(AggregateExec::try_new(
+        AggregateMode::Single,
+        PhysicalGroupBy::new(vec![], vec![], vec![], false),
+        vec![Arc::new(
+            AggregateExprBuilder::new(
+                sum::sum_udaf(),
+                vec![col("value", &inf.schema())?],
+            )
+            .schema(inf.schema())
+            .alias("sum")
+            .build()?,
+        )],
+        vec![None],
+        inf.clone(),
+        inf.schema(),
+    )?);
+
+    query_yields(aggr, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn agg_grouping_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up an aggregation with grouping
+    let inf = Arc::new(make_lazy_exec("value", pretend_infinite));
+
+    let value_col = col("value", &inf.schema())?;
+    let group = binary(value_col.clone(), Divide, lit(1000000i64), &inf.schema())?;
+
+    let aggr = Arc::new(AggregateExec::try_new(
+        AggregateMode::Single,
+        PhysicalGroupBy::new(vec![(group, "group".to_string())], vec![], vec![], false),
+        vec![Arc::new(
+            AggregateExprBuilder::new(sum::sum_udaf(), vec![value_col.clone()])
+                .schema(inf.schema())
+                .alias("sum")
+                .build()?,
+        )],
+        vec![None],
+        inf.clone(),
+        inf.schema(),
+    )?);
+
+    query_yields(aggr, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn agg_grouped_topk_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+
+    let session_ctx = SessionContext::new();
+
+    // set up a top-k aggregation
+    let inf = Arc::new(make_lazy_exec("value", pretend_infinite));
+
+    let value_col = col("value", &inf.schema())?;
+    let group = binary(value_col.clone(), Divide, lit(1000000i64), &inf.schema())?;
+
+    let aggr = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Single,
+            PhysicalGroupBy::new(
+                vec![(group, "group".to_string())],
+                vec![],
+                vec![vec![false]],
+                false,
+            ),
+            vec![Arc::new(
+                AggregateExprBuilder::new(min_max::max_udaf(), vec![value_col.clone()])
+                    .schema(inf.schema())
+                    .alias("max")
+                    .build()?,
+            )],
+            vec![None],
+            inf.clone(),
+            inf.schema(),
+        )?
+        .with_limit_options(Some(LimitOptions::new(100))),
+    );
+
+    query_yields(aggr, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+// A test that mocks the behavior of `SpillManager::read_spill_as_stream` without file access
+// to verify that a cooperative stream would properly yields in a spill file read scenario
+async fn spill_reader_stream_yield() -> Result<(), Box<dyn Error>> {
+    use datafusion_physical_plan::common::spawn_buffered;
+
+    // A mock stream that always returns `Poll::Ready(Some(...))` immediately
+    let always_ready =
+        make_lazy_exec("value", false).execute(0, SessionContext::new().task_ctx())?;
+
+    // this function makes a consumer stream that resembles how read_stream from spill file is constructed
+    let stream = make_cooperative(always_ready);
+
+    // Set large buffer so that buffer always has free space for the producer/sender
+    let buffer_capacity = 100_000;
+    let mut mock_stream = spawn_buffered(stream, buffer_capacity);
+    let schema = mock_stream.schema();
+
+    let consumer_stream = futures::stream::poll_fn(move |cx| {
+        let mut collected = vec![];
+        // To make sure that inner stream is polled multiple times, loop until the buffer is full
+        // Ideally, the stream will yield before the loop ends
+        for _ in 0..buffer_capacity {
+            match mock_stream.as_mut().poll_next(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    collected.push(batch);
+                }
+                Poll::Ready(Some(Err(e))) => {
+                    return Poll::Ready(Some(Err(e)));
+                }
+                Poll::Ready(None) => {
+                    break;
+                }
+                Poll::Pending => {
+                    // polling inner stream may return Pending only when it reaches budget, since
+                    // we intentionally made ProducerStream always return Ready
+                    return Poll::Pending;
+                }
+            }
+        }
+
+        // This should be unreachable since the stream is canceled
+        unreachable!("Expected the stream to be canceled, but it continued polling");
+    });
+
+    let consumer_record_batch_stream =
+        Box::pin(RecordBatchStreamAdapter::new(schema, consumer_stream));
+
+    stream_yields(consumer_record_batch_stream).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn sort_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up the infinite source
+    let inf = Arc::new(make_lazy_exec("value", pretend_infinite));
+
+    // set up a SortExec that will not be able to finish in time because input is very large
+    let sort_expr = PhysicalSortExpr::new(
+        col("value", &inf.schema())?,
+        SortOptions {
+            descending: true,
+            nulls_first: true,
+        },
+    );
+
+    let lex_ordering = LexOrdering::new(vec![sort_expr]).unwrap();
+    let sort_exec = Arc::new(SortExec::new(lex_ordering, inf.clone()));
+
+    query_yields(sort_exec, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn sort_merge_join_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up the join sources
+    let inf1 = Arc::new(make_lazy_exec_with_range(
+        "value1",
+        i64::MIN..0,
+        pretend_infinite,
+    ));
+    let inf2 = Arc::new(make_lazy_exec_with_range(
+        "value2",
+        0..i64::MAX,
+        pretend_infinite,
+    ));
+
+    // set up a SortMergeJoinExec that will take a long time skipping left side content to find
+    // the first right side match
+    let join = Arc::new(SortMergeJoinExec::try_new(
+        inf1.clone(),
+        inf2.clone(),
+        vec![(
+            col("value1", &inf1.schema())?,
+            col("value2", &inf2.schema())?,
+        )],
+        None,
+        JoinType::Inner,
+        vec![inf1.properties().eq_properties.output_ordering().unwrap()[0].options],
+        NullEquality::NullEqualsNull,
+    )?);
+
+    query_yields(join, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn filter_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up the infinite source
+    let inf = Arc::new(make_lazy_exec("value", pretend_infinite));
+
+    // set up a FilterExec that will filter out entire batches
+    let filter_expr = binary(
+        col("value", &inf.schema())?,
+        Operator::Lt,
+        lit(i64::MIN),
+        &inf.schema(),
+    )?;
+    let filter = Arc::new(FilterExec::try_new(filter_expr, inf.clone())?);
+
+    query_yields(filter, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn filter_reject_all_batches_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Create a Session, Schema, and an 8K-row RecordBatch
+    let session_ctx = SessionContext::new();
+
+    // Wrap this batch in an InfiniteExec
+    let infinite = make_lazy_exec_with_range("value", i64::MIN..0, pretend_infinite);
+
+    // 2b) Construct a FilterExec that is always false: “value > 10000” (no rows pass)
+    let false_predicate = Arc::new(BinaryExpr::new(
+        Arc::new(Column::new("value", 0)),
+        Gt,
+        Arc::new(Literal::new(ScalarValue::Int64(Some(0)))),
+    ));
+    let filtered = Arc::new(FilterExec::try_new(false_predicate, Arc::new(infinite))?);
+
+    query_yields(filtered, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn interleave_then_filter_all_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Build a session and a schema with one i64 column.
+    let session_ctx = SessionContext::new();
+
+    // Create multiple infinite sources, each filtered by a different threshold.
+    // This ensures InterleaveExec has many children.
+    let mut infinite_children = vec![];
+
+    // Use 32 distinct thresholds (each >0 and <8 192) to force 32 infinite inputs
+    for threshold in 1..32 {
+        // One infinite exec:
+        let mut inf = make_lazy_exec_with_range("value", 0..i64::MAX, pretend_infinite);
+
+        // Now repartition so that all children share identical Hash partitioning
+        // on “value” into 1 bucket. This is required for InterleaveExec::try_new.
+        let exprs = vec![Arc::new(Column::new("value", 0)) as _];
+        let partitioning = Partitioning::Hash(exprs, 1);
+        inf.try_set_partitioning(partitioning)?;
+
+        // Apply a FilterExec: “(value / 8192) % threshold == 0”.
+        let filter_expr = binary(
+            binary(
+                binary(
+                    col("value", &inf.schema())?,
+                    Divide,
+                    lit(8192i64),
+                    &inf.schema(),
+                )?,
+                Modulo,
+                lit(threshold as i64),
+                &inf.schema(),
+            )?,
+            Eq,
+            lit(0i64),
+            &inf.schema(),
+        )?;
+        let filtered = Arc::new(FilterExec::try_new(filter_expr, Arc::new(inf))?);
+
+        infinite_children.push(filtered as _);
+    }
+
+    // Build an InterleaveExec over all infinite children.
+    let interleave = Arc::new(InterleaveExec::try_new(infinite_children)?);
+
+    // Wrap the InterleaveExec in a FilterExec that always returns false,
+    // ensuring that no rows are ever emitted.
+    let always_false = Arc::new(Literal::new(ScalarValue::Boolean(Some(false))));
+    let filtered_interleave = Arc::new(FilterExec::try_new(always_false, interleave)?);
+
+    query_yields(filtered_interleave, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn interleave_then_aggregate_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Build session, schema, and a sample batch.
+    let session_ctx = SessionContext::new();
+
+    // Create N infinite sources, each filtered by a different predicate.
+    // That way, the InterleaveExec will have multiple children.
+    let mut infinite_children = vec![];
+
+    // Use 32 distinct thresholds (each >0 and <8 192) to force 32 infinite inputs
+    for threshold in 1..32 {
+        // One infinite exec:
+        let mut inf = make_lazy_exec_with_range("value", 0..i64::MAX, pretend_infinite);
+
+        // Now repartition so that all children share identical Hash partitioning
+        // on “value” into 1 bucket. This is required for InterleaveExec::try_new.
+        let exprs = vec![Arc::new(Column::new("value", 0)) as _];
+        let partitioning = Partitioning::Hash(exprs, 1);
+        inf.try_set_partitioning(partitioning)?;
+
+        // Apply a FilterExec: “(value / 8192) % threshold == 0”.
+        let filter_expr = binary(
+            binary(
+                binary(
+                    col("value", &inf.schema())?,
+                    Divide,
+                    lit(8192i64),
+                    &inf.schema(),
+                )?,
+                Modulo,
+                lit(threshold as i64),
+                &inf.schema(),
+            )?,
+            Eq,
+            lit(0i64),
+            &inf.schema(),
+        )?;
+        let filtered = Arc::new(FilterExec::try_new(filter_expr, Arc::new(inf))?);
+
+        infinite_children.push(filtered as _);
+    }
+
+    // Build an InterleaveExec over all N children.
+    // Since each child now has Partitioning::Hash([col "value"], 1), InterleaveExec::try_new succeeds.
+    let interleave = Arc::new(InterleaveExec::try_new(infinite_children)?);
+    let interleave_schema = interleave.schema();
+
+    // Build a global AggregateExec that sums “value” over all rows.
+    // Because we use `AggregateMode::Single` with no GROUP BY columns, this plan will
+    // only produce one “final” row once all inputs finish. But our inputs never finish,
+    // so we should never get any output.
+    let aggregate_expr = AggregateExprBuilder::new(
+        sum::sum_udaf(),
+        vec![Arc::new(Column::new("value", 0))],
+    )
+    .schema(interleave_schema.clone())
+    .alias("total")
+    .build()?;
+
+    let aggr = Arc::new(AggregateExec::try_new(
+        AggregateMode::Single,
+        PhysicalGroupBy::new(
+            vec![], // no GROUP BY columns
+            vec![], // no GROUP BY expressions
+            vec![], // no GROUP BY physical expressions
+            false,
+        ),
+        vec![Arc::new(aggregate_expr)],
+        vec![None], // no “distinct” flags
+        interleave,
+        interleave_schema,
+    )?);
+
+    query_yields(aggr, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn join_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Session, schema, and a single 8 K‐row batch for each side
+    let session_ctx = SessionContext::new();
+
+    // on the right side, we’ll shift each value by +1 so that not everything joins,
+    // but plenty of matching keys exist (e.g. 0 on left matches 1 on right, etc.)
+    let infinite_left = make_lazy_exec_with_range("value", -10..10, false);
+    let infinite_right =
+        make_lazy_exec_with_range("value", 0..i64::MAX, pretend_infinite);
+
+    // Create Join keys → join on “value” = “value”
+    let left_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))];
+    let right_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))];
+
+    let part_left = Partitioning::Hash(left_keys, 1);
+    let part_right = Partitioning::Hash(right_keys, 1);
+
+    // Wrap each side in Repartition so they are both hashed into 1 partition
+    let hashed_left = Arc::new(RepartitionExec::try_new(
+        Arc::new(infinite_left),
+        part_left,
+    )?);
+    let hashed_right = Arc::new(RepartitionExec::try_new(
+        Arc::new(infinite_right),
+        part_right,
+    )?);
+
+    // Build an Inner HashJoinExec → left.value = right.value
+    let join = Arc::new(HashJoinExec::try_new(
+        hashed_left,
+        hashed_right,
+        vec![(
+            Arc::new(Column::new("value", 0)),
+            Arc::new(Column::new("value", 0)),
+        )],
+        None,
+        &JoinType::Inner,
+        None,
+        PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNull,
+        false,
+    )?);
+
+    query_yields(join, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn join_agg_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Session, schema, and a single 8 K‐row batch for each side
+    let session_ctx = SessionContext::new();
+
+    // on the right side, we’ll shift each value by +1 so that not everything joins,
+    // but plenty of matching keys exist (e.g. 0 on left matches 1 on right, etc.)
+    let infinite_left = make_lazy_exec_with_range("value", -10..10, false);
+    let infinite_right =
+        make_lazy_exec_with_range("value", 0..i64::MAX, pretend_infinite);
+
+    // 2b) Create Join keys → join on “value” = “value”
+    let left_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))];
+    let right_keys: Vec<Arc<dyn PhysicalExpr>> = vec![Arc::new(Column::new("value", 0))];
+
+    let part_left = Partitioning::Hash(left_keys, 1);
+    let part_right = Partitioning::Hash(right_keys, 1);
+
+    // Wrap each side in Repartition so they are both hashed into 1 partition
+    let hashed_left = Arc::new(RepartitionExec::try_new(
+        Arc::new(infinite_left),
+        part_left,
+    )?);
+    let hashed_right = Arc::new(RepartitionExec::try_new(
+        Arc::new(infinite_right),
+        part_right,
+    )?);
+
+    // Build an Inner HashJoinExec → left.value = right.value
+    let join = Arc::new(HashJoinExec::try_new(
+        hashed_left,
+        hashed_right,
+        vec![(
+            Arc::new(Column::new("value", 0)),
+            Arc::new(Column::new("value", 0)),
+        )],
+        None,
+        &JoinType::Inner,
+        None,
+        PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNull,
+        false,
+    )?);
+
+    // Project only one column (“value” from the left side) because we just want to sum that
+    let input_schema = join.schema();
+
+    let proj_expr = vec![ProjectionExpr::new(
+        Arc::new(Column::new_with_schema("value", &input_schema)?) as _,
+        "value",
+    )];
+
+    let projection = Arc::new(ProjectionExec::try_new(proj_expr, join)?);
+    let projection_schema = projection.schema();
+
+    let output_fields = vec![Field::new("total", DataType::Int64, true)];
+    let output_schema = Arc::new(Schema::new(output_fields));
+
+    // 4) Global aggregate (Single) over “value”
+    let aggregate_expr = AggregateExprBuilder::new(
+        sum::sum_udaf(),
+        vec![Arc::new(Column::new_with_schema(
+            "value",
+            &projection.schema(),
+        )?)],
+    )
+    .schema(output_schema)
+    .alias("total")
+    .build()?;
+
+    let aggr = Arc::new(AggregateExec::try_new(
+        AggregateMode::Single,
+        PhysicalGroupBy::new(vec![], vec![], vec![], false),
+        vec![Arc::new(aggregate_expr)],
+        vec![None],
+        projection,
+        projection_schema,
+    )?);
+
+    query_yields(aggr, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn hash_join_yields(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // build session
+    let session_ctx = SessionContext::new();
+
+    // set up the join sources
+    let inf1 = Arc::new(make_lazy_exec("value1", pretend_infinite));
+    let inf2 = Arc::new(make_lazy_exec("value2", pretend_infinite));
+
+    // set up a HashJoinExec that will take a long time in the build phase
+    let join = Arc::new(HashJoinExec::try_new(
+        inf1.clone(),
+        inf2.clone(),
+        vec![(
+            col("value1", &inf1.schema())?,
+            col("value2", &inf2.schema())?,
+        )],
+        None,
+        &JoinType::Left,
+        None,
+        PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNull,
+        false,
+    )?);
+
+    query_yields(join, session_ctx.task_ctx()).await
+}
+
+#[rstest]
+#[tokio::test]
+async fn hash_join_without_repartition_and_no_agg(
+    #[values(false, true)] pretend_infinite: bool,
+) -> Result<(), Box<dyn Error>> {
+    // Create Session, schema, and an 8K-row RecordBatch for each side
+    let session_ctx = SessionContext::new();
+
+    // on the right side, we’ll shift each value by +1 so that not everything joins,
+    // but plenty of matching keys exist (e.g. 0 on left matches 1 on right, etc.)
+    let infinite_left = make_lazy_exec_with_range("value", -10..10, false);
+    let infinite_right =
+        make_lazy_exec_with_range("value", 0..i64::MAX, pretend_infinite);
+
+    // Directly feed `infinite_left` and `infinite_right` into HashJoinExec.
+    // Do not use aggregation or repartition.
+    let join = Arc::new(HashJoinExec::try_new(
+        Arc::new(infinite_left),
+        Arc::new(infinite_right),
+        vec![(
+            Arc::new(Column::new("value", 0)),
+            Arc::new(Column::new("value", 0)),
+        )],
+        /* filter */ None,
+        &JoinType::Inner,
+        /* output64 */ None,
+        // Using CollectLeft is fine—just avoid RepartitionExec's partitioned channels.
+        PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNull,
+        false,
+    )?);
+
+    query_yields(join, session_ctx.task_ctx()).await
+}
+
+#[derive(Debug)]
+enum Yielded {
+    ReadyOrPending,
+    Err(#[expect(dead_code)] DataFusionError),
+    Timeout,
+}
+
+async fn stream_yields(
+    mut stream: SendableRecordBatchStream,
+) -> Result<(), Box<dyn Error>> {
+    // Create an independent executor pool
+    let child_runtime = Runtime::new()?;
+
+    // Spawn a task that tries to poll the stream
+    // The task returns Ready when the stream yielded with either Ready or Pending
+    let join_handle = child_runtime.spawn(std::future::poll_fn(move |cx| {
+        match stream.poll_next_unpin(cx) {
+            Poll::Ready(Some(Ok(_))) => Poll::Ready(Poll::Ready(Ok(()))),
+            Poll::Ready(Some(Err(e))) => Poll::Ready(Poll::Ready(Err(e))),
+            Poll::Ready(None) => Poll::Ready(Poll::Ready(Ok(()))),
+            Poll::Pending => Poll::Ready(Poll::Pending),
+        }
+    }));
+
+    let abort_handle = join_handle.abort_handle();
+
+    // Now select on the join handle of the task running in the child executor with a timeout
+    let yielded = select! {
+        result = join_handle => {
+            match result {
+                Ok(Poll::Pending) => Yielded::ReadyOrPending,
+                Ok(Poll::Ready(Ok(_))) => Yielded::ReadyOrPending,
+                Ok(Poll::Ready(Err(e))) => Yielded::Err(e),
+                Err(_) => Yielded::Err(exec_datafusion_err!("join error")),
+            }
+        },
+        _ = tokio::time::sleep(Duration::from_secs(10)) => {
+            Yielded::Timeout
+        }
+    };
+
+    // Try to abort the poll task and shutdown the child runtime
+    abort_handle.abort();
+    Handle::current().spawn_blocking(move || {
+        child_runtime.shutdown_timeout(Duration::from_secs(5));
+    });
+
+    // Finally, check if poll_next yielded
+    assert!(
+        matches!(yielded, Yielded::ReadyOrPending),
+        "Result is not Ready or Pending: {yielded:?}"
+    );
+    Ok(())
+}
+
+async fn query_yields(
+    plan: Arc<dyn ExecutionPlan>,
+    task_ctx: Arc<TaskContext>,
+) -> Result<(), Box<dyn Error>> {
+    // Run plan through EnsureCooperative
+    let optimized =
+        EnsureCooperative::new().optimize(plan, task_ctx.session_config().options())?;
+
+    // Get the stream
+    let stream = physical_plan::execute_stream(optimized, task_ctx)?;
+
+    // Spawn a task that tries to poll the stream and check whether given stream yields
+    stream_yields(stream).await
+}
diff --git a/datafusion/core/tests/execution/datasource_split.rs b/datafusion/core/tests/execution/datasource_split.rs
new file mode 100644
index 0000000000000..370249cd8044e
--- /dev/null
+++ b/datafusion/core/tests/execution/datasource_split.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+    array::{ArrayRef, Int32Array},
+    datatypes::{DataType, Field, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion_datasource::memory::MemorySourceConfig;
+use datafusion_execution::TaskContext;
+use datafusion_physical_plan::{ExecutionPlan, common::collect};
+use std::sync::Arc;
+
+/// Helper function to create a memory source with the given batch size and collect all batches
+async fn create_and_collect_batches(
+    batch_size: usize,
+) -> datafusion_common::Result<Vec<RecordBatch>> {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let array = Int32Array::from_iter_values(0..batch_size as i32);
+    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array) as ArrayRef])?;
+    let exec = MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?;
+    let ctx = Arc::new(TaskContext::default());
+    let stream = exec.execute(0, ctx)?;
+    collect(stream).await
+}
+
+/// Helper function to create a memory source with multiple batches and collect all results
+async fn create_and_collect_multiple_batches(
+    input_batches: Vec<RecordBatch>,
+) -> datafusion_common::Result<Vec<RecordBatch>> {
+    let schema = input_batches[0].schema();
+    let exec = MemorySourceConfig::try_new_exec(&[input_batches], schema, None)?;
+    let ctx = Arc::new(TaskContext::default());
+    let stream = exec.execute(0, ctx)?;
+    collect(stream).await
+}
+
+#[tokio::test]
+async fn datasource_splits_large_batches() -> datafusion_common::Result<()> {
+    let batch_size = 20000;
+    let batches = create_and_collect_batches(batch_size).await?;
+
+    assert!(batches.len() > 1);
+    let max = batches.iter().map(|b| b.num_rows()).max().unwrap();
+    assert!(
+        max <= datafusion_execution::config::SessionConfig::new()
+            .options()
+            .execution
+            .batch_size
+    );
+    let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(total, batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_exact_batch_size_no_split() -> datafusion_common::Result<()> {
+    let session_config = datafusion_execution::config::SessionConfig::new();
+    let configured_batch_size = session_config.options().execution.batch_size;
+
+    let batches = create_and_collect_batches(configured_batch_size).await?;
+
+    // Should not split when exactly equal to batch_size
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), configured_batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_small_batch_no_split() -> datafusion_common::Result<()> {
+    // Test with batch smaller than the batch size (8192)
+    let small_batch_size = 512; // Less than 8192
+
+    let batches = create_and_collect_batches(small_batch_size).await?;
+
+    // Should not split small batches below the batch size
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), small_batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_empty_batch_clean_termination() -> datafusion_common::Result<()> {
+    let batches = create_and_collect_batches(0).await?;
+
+    // Empty batch should result in one empty batch
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), 0);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_multiple_empty_batches() -> datafusion_common::Result<()> {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let empty_array = Int32Array::from_iter_values(std::iter::empty::<i32>());
+    let empty_batch =
+        RecordBatch::try_new(schema.clone(), vec![Arc::new(empty_array) as ArrayRef])?;
+
+    // Create multiple empty batches
+    let input_batches = vec![empty_batch.clone(), empty_batch.clone(), empty_batch];
+    let batches = create_and_collect_multiple_batches(input_batches).await?;
+
+    // Should preserve empty batches without issues
+    assert_eq!(batches.len(), 3);
+    for batch in &batches {
+        assert_eq!(batch.num_rows(), 0);
+    }
+    Ok(())
+}
diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs
index 97bb2a727bbfe..3eaa3fb2ed5e6 100644
--- a/datafusion/core/tests/execution/logical_plan.rs
+++ b/datafusion/core/tests/execution/logical_plan.rs
@@ -20,7 +20,7 @@
 
 use arrow::array::Int64Array;
 use arrow::datatypes::{DataType, Field, Schema};
-use datafusion::datasource::{provider_as_source, ViewTable};
+use datafusion::datasource::{ViewTable, provider_as_source};
 use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion_common::{Column, DFSchema, DFSchemaRef, Result, ScalarValue, Spans};
 use datafusion_execution::TaskContext;
@@ -47,9 +47,9 @@ async fn count_only_nulls() -> Result<()> {
     let input = Arc::new(LogicalPlan::Values(Values {
         schema: input_schema,
         values: vec![
-            vec![Expr::Literal(ScalarValue::Null)],
-            vec![Expr::Literal(ScalarValue::Null)],
-            vec![Expr::Literal(ScalarValue::Null)],
+            vec![Expr::Literal(ScalarValue::Null, None)],
+            vec![Expr::Literal(ScalarValue::Null, None)],
+            vec![Expr::Literal(ScalarValue::Null, None)],
         ],
     }));
     let input_col_ref = Expr::Column(Column {
@@ -68,7 +68,7 @@ async fn count_only_nulls() -> Result<()> {
                 args: vec![input_col_ref],
                 distinct: false,
                 filter: None,
-                order_by: None,
+                order_by: vec![],
                 null_treatment: None,
             },
         })],
@@ -128,7 +128,7 @@ fn inline_scan_projection_test() -> Result<()> {
         @r"
     SubqueryAlias: ?table?
       Projection: a
-        EmptyRelation
+        EmptyRelation: rows=0
     "
     );
 
diff --git a/datafusion/core/tests/execution/mod.rs b/datafusion/core/tests/execution/mod.rs
index 8169db1a4611e..f33ef87aa3023 100644
--- a/datafusion/core/tests/execution/mod.rs
+++ b/datafusion/core/tests/execution/mod.rs
@@ -15,4 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod coop;
+mod datasource_split;
 mod logical_plan;
+mod register_arrow;
diff --git a/datafusion/core/tests/execution/register_arrow.rs b/datafusion/core/tests/execution/register_arrow.rs
new file mode 100644
index 0000000000000..4ce16dc0906c1
--- /dev/null
+++ b/datafusion/core/tests/execution/register_arrow.rs
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Integration tests for register_arrow API
+
+use datafusion::{execution::options::ArrowReadOptions, prelude::*};
+use datafusion_common::Result;
+
+#[tokio::test]
+async fn test_register_arrow_auto_detects_format() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_arrow(
+        "file_format",
+        "../../datafusion/datasource-arrow/tests/data/example.arrow",
+        ArrowReadOptions::default(),
+    )
+    .await?;
+
+    ctx.register_arrow(
+        "stream_format",
+        "../../datafusion/datasource-arrow/tests/data/example_stream.arrow",
+        ArrowReadOptions::default(),
+    )
+    .await?;
+
+    let file_result = ctx.sql("SELECT * FROM file_format ORDER BY f0").await?;
+    let stream_result = ctx.sql("SELECT * FROM stream_format ORDER BY f0").await?;
+
+    let file_batches = file_result.collect().await?;
+    let stream_batches = stream_result.collect().await?;
+
+    assert_eq!(file_batches.len(), stream_batches.len());
+    assert_eq!(file_batches[0].schema(), stream_batches[0].schema());
+
+    let file_rows: usize = file_batches.iter().map(|b| b.num_rows()).sum();
+    let stream_rows: usize = stream_batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(file_rows, stream_rows);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_register_arrow_join_file_and_stream() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_arrow(
+        "file_table",
+        "../../datafusion/datasource-arrow/tests/data/example.arrow",
+        ArrowReadOptions::default(),
+    )
+    .await?;
+
+    ctx.register_arrow(
+        "stream_table",
+        "../../datafusion/datasource-arrow/tests/data/example_stream.arrow",
+        ArrowReadOptions::default(),
+    )
+    .await?;
+
+    let result = ctx
+        .sql(
+            "SELECT a.f0, a.f1, b.f0, b.f1
+             FROM file_table a
+             JOIN stream_table b ON a.f0 = b.f0
+             WHERE a.f0 <= 2
+             ORDER BY a.f0",
+        )
+        .await?;
+    let batches = result.collect().await?;
+
+    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(total_rows, 2);
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs
index a9cf7f04bb3a2..91dd5de7fcd64 100644
--- a/datafusion/core/tests/expr_api/mod.rs
+++ b/datafusion/core/tests/expr_api/mod.rs
@@ -16,26 +16,26 @@
 // under the License.
 
 use arrow::array::{
-    builder::{ListBuilder, StringBuilder},
     ArrayRef, Int64Array, RecordBatch, StringArray, StructArray,
+    builder::{ListBuilder, StringBuilder},
 };
 use arrow::datatypes::{DataType, Field};
 use arrow::util::pretty::{pretty_format_batches, pretty_format_columns};
 use datafusion::prelude::*;
 use datafusion_common::{DFSchema, ScalarValue};
-use datafusion_expr::execution_props::ExecutionProps;
-use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::ExprFunctionExt;
+use datafusion_expr::expr::NullTreatment;
+use datafusion_expr::simplify::SimplifyContext;
 use datafusion_functions::core::expr_ext::FieldAccessor;
 use datafusion_functions_aggregate::first_last::first_value_udaf;
 use datafusion_functions_aggregate::sum::sum_udaf;
 use datafusion_functions_nested::expr_ext::{IndexAccessor, SliceAccessor};
 use datafusion_optimizer::simplify_expressions::ExprSimplifier;
-use sqlparser::ast::NullTreatment;
 /// Tests of using and evaluating `Expr`s outside the context of a LogicalPlan
 use std::sync::{Arc, LazyLock};
 
 mod parse_sql_expr;
+#[expect(clippy::needless_pass_by_value)]
 mod simplification;
 
 #[test]
@@ -320,6 +320,26 @@ async fn test_create_physical_expr() {
     create_simplified_expr_test(lit(1i32) + lit(2i32), "3");
 }
 
+#[test]
+fn test_create_physical_expr_nvl2() {
+    let batch = &TEST_BATCH;
+    let df_schema = DFSchema::try_from(batch.schema()).unwrap();
+    let ctx = SessionContext::new();
+
+    let expect_err = |expr| {
+        let physical_expr = ctx.create_physical_expr(expr, &df_schema).unwrap();
+        let err = physical_expr.evaluate(batch).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("nvl2 should have been simplified to case"),
+            "unexpected error: {err:?}"
+        );
+    };
+
+    expect_err(nvl2(col("i"), lit(1i64), lit(0i64)));
+    expect_err(nvl2(lit(1i64), col("i"), lit(0i64)));
+}
+
 #[tokio::test]
 async fn test_create_physical_expr_coercion() {
     // create_physical_expr does apply type coercion and unwrapping in cast
@@ -364,6 +384,7 @@ async fn evaluate_agg_test(expr: Expr, expected_lines: Vec<&str>) {
 
 /// Converts the `Expr` to a `PhysicalExpr`, evaluates it against the provided
 /// `RecordBatch` and compares the result to the expected result.
+#[expect(clippy::needless_pass_by_value)]
 fn evaluate_expr_test(expr: Expr, expected_lines: Vec<&str>) {
     let batch = &TEST_BATCH;
     let df_schema = DFSchema::try_from(batch.schema()).unwrap();
@@ -400,9 +421,7 @@ fn create_simplified_expr_test(expr: Expr, expected_expr: &str) {
     let df_schema = DFSchema::try_from(batch.schema()).unwrap();
 
     // Simplify the expression first
-    let props = ExecutionProps::new();
-    let simplify_context =
-        SimplifyContext::new(&props).with_schema(df_schema.clone().into());
+    let simplify_context = SimplifyContext::default().with_schema(Arc::new(df_schema));
     let simplifier = ExprSimplifier::new(simplify_context).with_max_cycles(10);
     let simplified = simplifier.simplify(expr).unwrap();
     create_expr_test(simplified, expected_expr);
diff --git a/datafusion/core/tests/expr_api/parse_sql_expr.rs b/datafusion/core/tests/expr_api/parse_sql_expr.rs
index 92c18204324f7..b0d8b3a349ae2 100644
--- a/datafusion/core/tests/expr_api/parse_sql_expr.rs
+++ b/datafusion/core/tests/expr_api/parse_sql_expr.rs
@@ -19,9 +19,9 @@ use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::prelude::{CsvReadOptions, SessionContext};
 use datafusion_common::DFSchema;
 use datafusion_common::{DFSchemaRef, Result, ToDFSchema};
+use datafusion_expr::Expr;
 use datafusion_expr::col;
 use datafusion_expr::lit;
-use datafusion_expr::Expr;
 use datafusion_sql::unparser::Unparser;
 /// A schema like:
 ///
diff --git a/datafusion/core/tests/expr_api/simplification.rs b/datafusion/core/tests/expr_api/simplification.rs
index 34e0487f312fb..02f2503faf22a 100644
--- a/datafusion/core/tests/expr_api/simplification.rs
+++ b/datafusion/core/tests/expr_api/simplification.rs
@@ -17,20 +17,22 @@
 
 //! This program demonstrates the DataFusion expression simplification API.
 
+use insta::assert_snapshot;
+
 use arrow::array::types::IntervalDayTime;
 use arrow::array::{ArrayRef, Int32Array};
 use arrow::datatypes::{DataType, Field, Schema};
 use chrono::{DateTime, TimeZone, Utc};
-use datafusion::{error::Result, execution::context::ExecutionProps, prelude::*};
-use datafusion_common::cast::as_int32_array;
+use datafusion::{error::Result, prelude::*};
 use datafusion_common::ScalarValue;
+use datafusion_common::cast::as_int32_array;
 use datafusion_common::{DFSchemaRef, ToDFSchema};
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::logical_plan::builder::table_scan_with_filters;
-use datafusion_expr::simplify::SimplifyInfo;
+use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::{
-    table_scan, Cast, ColumnarValue, ExprSchemable, LogicalPlan, LogicalPlanBuilder,
-    ScalarUDF, Volatility,
+    Cast, ColumnarValue, ExprSchemable, LogicalPlan, LogicalPlanBuilder, Projection,
+    ScalarUDF, Volatility, table_scan,
 };
 use datafusion_functions::math;
 use datafusion_optimizer::optimizer::Optimizer;
@@ -38,50 +40,6 @@ use datafusion_optimizer::simplify_expressions::{ExprSimplifier, SimplifyExpress
 use datafusion_optimizer::{OptimizerContext, OptimizerRule};
 use std::sync::Arc;
 
-/// In order to simplify expressions, DataFusion must have information
-/// about the expressions.
-///
-/// You can provide that information using DataFusion [DFSchema]
-/// objects or from some other implementation
-struct MyInfo {
-    /// The input schema
-    schema: DFSchemaRef,
-
-    /// Execution specific details needed for constant evaluation such
-    /// as the current time for `now()` and [VariableProviders]
-    execution_props: ExecutionProps,
-}
-
-impl SimplifyInfo for MyInfo {
-    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
-        Ok(matches!(
-            expr.get_type(self.schema.as_ref())?,
-            DataType::Boolean
-        ))
-    }
-
-    fn nullable(&self, expr: &Expr) -> Result<bool> {
-        expr.nullable(self.schema.as_ref())
-    }
-
-    fn execution_props(&self) -> &ExecutionProps {
-        &self.execution_props
-    }
-
-    fn get_data_type(&self, expr: &Expr) -> Result<DataType> {
-        expr.get_type(self.schema.as_ref())
-    }
-}
-
-impl From<DFSchemaRef> for MyInfo {
-    fn from(schema: DFSchemaRef) -> Self {
-        Self {
-            schema,
-            execution_props: ExecutionProps::new(),
-        }
-    }
-}
-
 /// A schema like:
 ///
 /// a: Int32 (possibly with nulls)
@@ -130,14 +88,10 @@ fn test_evaluate_with_start_time(
     expected_expr: Expr,
     date_time: &DateTime<Utc>,
 ) {
-    let execution_props =
-        ExecutionProps::new().with_query_execution_start_time(*date_time);
-
-    let info: MyInfo = MyInfo {
-        schema: schema(),
-        execution_props,
-    };
-    let simplifier = ExprSimplifier::new(info);
+    let context = SimplifyContext::default()
+        .with_schema(schema())
+        .with_query_execution_start_time(Some(*date_time));
+    let simplifier = ExprSimplifier::new(context);
     let simplified_expr = simplifier
         .simplify(input_expr.clone())
         .expect("successfully evaluated");
@@ -199,7 +153,9 @@ fn to_timestamp_expr(arg: impl Into<String>) -> Expr {
 
 #[test]
 fn basic() {
-    let info: MyInfo = schema().into();
+    let context = SimplifyContext::default()
+        .with_schema(schema())
+        .with_query_execution_start_time(Some(Utc::now()));
 
     // The `Expr` is a core concept in DataFusion, and DataFusion can
     // help simplify it.
@@ -208,21 +164,21 @@ fn basic() {
     // optimize form `a < 5` automatically
     let expr = col("a").lt(lit(2i32) + lit(3i32));
 
-    let simplifier = ExprSimplifier::new(info);
+    let simplifier = ExprSimplifier::new(context);
     let simplified = simplifier.simplify(expr).unwrap();
     assert_eq!(simplified, col("a").lt(lit(5i32)));
 }
 
 #[test]
 fn fold_and_simplify() {
-    let info: MyInfo = schema().into();
+    let context = SimplifyContext::default().with_schema(schema());
 
     // What will it do with the expression `concat('foo', 'bar') == 'foobar')`?
     let expr = concat(vec![lit("foo"), lit("bar")]).eq(lit("foobar"));
 
     // Since datafusion applies both simplification *and* rewriting
     // some expressions can be entirely simplified
-    let simplifier = ExprSimplifier::new(info);
+    let simplifier = ExprSimplifier::new(context);
     let simplified = simplifier.simplify(expr).unwrap();
     assert_eq!(simplified, lit(true))
 }
@@ -237,11 +193,15 @@ fn to_timestamp_expr_folded() -> Result<()> {
         .project(proj)?
         .build()?;
 
-    let expected = "Projection: TimestampNanosecond(1599566400000000000, None) AS to_timestamp(Utf8(\"2020-09-08T12:00:00+00:00\"))\
-            \n  TableScan: test"
-        .to_string();
-    let actual = get_optimized_plan_formatted(plan, &Utc::now());
-    assert_eq!(expected, actual);
+    let formatted = get_optimized_plan_formatted(plan, &Utc::now());
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    Projection: TimestampNanosecond(1599566400000000000, None) AS to_timestamp(Utf8("2020-09-08T12:00:00+00:00"))
+      TableScan: test
+    "#
+    );
     Ok(())
 }
 
@@ -262,11 +222,16 @@ fn now_less_than_timestamp() -> Result<()> {
 
     // Note that constant folder runs and folds the entire
     // expression down to a single constant (true)
-    let expected = "Filter: Boolean(true)\
-                        \n  TableScan: test";
-    let actual = get_optimized_plan_formatted(plan, &time);
-
-    assert_eq!(expected, actual);
+    let formatted = get_optimized_plan_formatted(plan, &time);
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    Filter: Boolean(true)
+      TableScan: test
+    "
+    );
     Ok(())
 }
 
@@ -282,10 +247,13 @@ fn select_date_plus_interval() -> Result<()> {
 
     let date_plus_interval_expr = to_timestamp_expr(ts_string)
         .cast_to(&DataType::Date32, schema)?
-        + Expr::Literal(ScalarValue::IntervalDayTime(Some(IntervalDayTime {
-            days: 123,
-            milliseconds: 0,
-        })));
+        + Expr::Literal(
+            ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+                days: 123,
+                milliseconds: 0,
+            })),
+            None,
+        );
 
     let plan = LogicalPlanBuilder::from(table_scan.clone())
         .project(vec![date_plus_interval_expr])?
@@ -293,11 +261,16 @@ fn select_date_plus_interval() -> Result<()> {
 
     // Note that constant folder runs and folds the entire
     // expression down to a single constant (true)
-    let expected = r#"Projection: Date32("2021-01-09") AS to_timestamp(Utf8("2020-09-08T12:05:00+00:00")) + IntervalDayTime("IntervalDayTime { days: 123, milliseconds: 0 }")
-  TableScan: test"#;
-    let actual = get_optimized_plan_formatted(plan, &time);
-
-    assert_eq!(expected, actual);
+    let formatted = get_optimized_plan_formatted(plan, &time);
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r#"
+    Projection: Date32("2021-01-09") AS to_timestamp(Utf8("2020-09-08T12:05:00+00:00")) + IntervalDayTime("IntervalDayTime { days: 123, milliseconds: 0 }")
+      TableScan: test
+    "#
+    );
     Ok(())
 }
 
@@ -311,10 +284,15 @@ fn simplify_project_scalar_fn() -> Result<()> {
 
     // before simplify: power(t.f, 1.0)
     // after simplify:  t.f as "power(t.f, 1.0)"
-    let expected = "Projection: test.f AS power(test.f,Float64(1))\
-                      \n  TableScan: test";
-    let actual = get_optimized_plan_formatted(plan, &Utc::now());
-    assert_eq!(expected, actual);
+    let formatter = get_optimized_plan_formatted(plan, &Utc::now());
+    let actual = formatter.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Projection: test.f AS power(test.f,Float64(1))
+      TableScan: test
+    "
+    );
     Ok(())
 }
 
@@ -334,9 +312,9 @@ fn simplify_scan_predicate() -> Result<()> {
 
     // before simplify: t.g = power(t.f, 1.0)
     // after simplify:  t.g = t.f"
-    let expected = "TableScan: test, full_filters=[g = f]";
-    let actual = get_optimized_plan_formatted(plan, &Utc::now());
-    assert_eq!(expected, actual);
+    let formatted = get_optimized_plan_formatted(plan, &Utc::now());
+    let actual = formatted.trim();
+    assert_snapshot!(actual, @"TableScan: test, full_filters=[g = f]");
     Ok(())
 }
 
@@ -490,8 +468,7 @@ fn multiple_now() -> Result<()> {
     // expect the same timestamp appears in both exprs
     let actual = get_optimized_plan_formatted(plan, &time);
     let expected = format!(
-        "Projection: TimestampNanosecond({}, Some(\"+00:00\")) AS now(), TimestampNanosecond({}, Some(\"+00:00\")) AS t2\
-            \n  TableScan: test",
+        "Projection: TimestampNanosecond({}, None) AS now(), TimestampNanosecond({}, None) AS t2\n  TableScan: test",
         time.timestamp_nanos_opt().unwrap(),
         time.timestamp_nanos_opt().unwrap()
     );
@@ -500,6 +477,72 @@ fn multiple_now() -> Result<()> {
     Ok(())
 }
 
+/// Unwraps an alias expression to get the inner expression
+fn unrwap_aliases(expr: &Expr) -> &Expr {
+    match expr {
+        Expr::Alias(alias) => unrwap_aliases(&alias.expr),
+        expr => expr,
+    }
+}
+
+/// Test that `now()` is simplified to a literal when execution start time is set,
+/// but remains as an expression when no execution start time is available.
+#[test]
+fn now_simplification_with_and_without_start_time() {
+    let plan = LogicalPlanBuilder::empty(false)
+        .project(vec![now()])
+        .unwrap()
+        .build()
+        .unwrap();
+
+    // Case 1: With execution start time set, now() should be simplified to a literal
+    {
+        let time = DateTime::<Utc>::from_timestamp_nanos(123);
+        let ctx: OptimizerContext =
+            OptimizerContext::new().with_query_execution_start_time(time);
+        let optimizer = SimplifyExpressions {};
+        let simplified = optimizer
+            .rewrite(plan.clone(), &ctx)
+            .expect("rewrite should succeed")
+            .data;
+        let LogicalPlan::Projection(Projection { expr, .. }) = simplified else {
+            panic!("Expected Projection plan");
+        };
+        assert_eq!(expr.len(), 1);
+        let simplified = unrwap_aliases(expr.first().unwrap());
+        // Should be a literal timestamp
+        match simplified {
+            Expr::Literal(ScalarValue::TimestampNanosecond(Some(ts), _), _) => {
+                assert_eq!(*ts, time.timestamp_nanos_opt().unwrap());
+            }
+            other => panic!("Expected timestamp literal, got: {other:?}"),
+        }
+    }
+
+    // Case 2: Without execution start time, now() should remain as a function call
+    {
+        let ctx: OptimizerContext =
+            OptimizerContext::new().without_query_execution_start_time();
+        let optimizer = SimplifyExpressions {};
+        let simplified = optimizer
+            .rewrite(plan, &ctx)
+            .expect("rewrite should succeed")
+            .data;
+        let LogicalPlan::Projection(Projection { expr, .. }) = simplified else {
+            panic!("Expected Projection plan");
+        };
+        assert_eq!(expr.len(), 1);
+        let simplified = unrwap_aliases(expr.first().unwrap());
+        // Should still be a now() function call
+        match simplified {
+            Expr::ScalarFunction(ScalarFunction { func, .. }) => {
+                assert_eq!(func.name(), "now");
+            }
+            other => panic!("Expected now() function call, got: {other:?}"),
+        }
+    }
+}
+
 // ------------------------------
 // --- Simplifier tests -----
 // ------------------------------
@@ -522,11 +565,8 @@ fn expr_test_schema() -> DFSchemaRef {
 }
 
 fn test_simplify(input_expr: Expr, expected_expr: Expr) {
-    let info: MyInfo = MyInfo {
-        schema: expr_test_schema(),
-        execution_props: ExecutionProps::new(),
-    };
-    let simplifier = ExprSimplifier::new(info);
+    let context = SimplifyContext::default().with_schema(expr_test_schema());
+    let simplifier = ExprSimplifier::new(context);
     let simplified_expr = simplifier
         .simplify(input_expr.clone())
         .expect("successfully evaluated");
@@ -541,11 +581,10 @@ fn test_simplify_with_cycle_count(
     expected_expr: Expr,
     expected_count: u32,
 ) {
-    let info: MyInfo = MyInfo {
-        schema: expr_test_schema(),
-        execution_props: ExecutionProps::new(),
-    };
-    let simplifier = ExprSimplifier::new(info);
+    let context = SimplifyContext::default()
+        .with_schema(expr_test_schema())
+        .with_query_execution_start_time(Some(Utc::now()));
+    let simplifier = ExprSimplifier::new(context);
     let (simplified_expr, count) = simplifier
         .simplify_with_cycle_count_transformed(input_expr.clone())
         .expect("successfully evaluated");
diff --git a/datafusion/core/tests/fifo/mod.rs b/datafusion/core/tests/fifo/mod.rs
index 141a3f3b75586..3d99cc72fa590 100644
--- a/datafusion/core/tests/fifo/mod.rs
+++ b/datafusion/core/tests/fifo/mod.rs
@@ -22,21 +22,21 @@
 mod unix_test {
     use std::fs::File;
     use std::path::PathBuf;
-    use std::sync::atomic::{AtomicBool, Ordering};
     use std::sync::Arc;
+    use std::sync::atomic::{AtomicBool, Ordering};
     use std::time::Duration;
 
     use arrow::array::Array;
     use arrow::csv::ReaderBuilder;
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-    use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
     use datafusion::datasource::TableProvider;
+    use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
     use datafusion::{
         prelude::{CsvReadOptions, SessionConfig, SessionContext},
         test_util::{aggr_test_schema, arrow_test_data},
     };
     use datafusion_common::instant::Instant;
-    use datafusion_common::{exec_err, Result};
+    use datafusion_common::{Result, exec_err};
     use datafusion_expr::SortExpr;
 
     use futures::StreamExt;
@@ -44,7 +44,7 @@ mod unix_test {
     use nix::unistd;
     use tempfile::TempDir;
     use tokio::io::AsyncWriteExt;
-    use tokio::task::{spawn_blocking, JoinHandle};
+    use tokio::task::{JoinHandle, spawn_blocking};
 
     /// Makes a TableProvider for a fifo file
     fn fifo_table(
@@ -94,7 +94,6 @@ mod unix_test {
     /// This function creates a writing task for the FIFO file. To verify
     /// incremental processing, it waits for a signal to continue writing after
     /// a certain number of lines are written.
-    #[allow(clippy::disallowed_methods)]
     fn create_writing_task(
         file_path: PathBuf,
         header: String,
@@ -105,6 +104,7 @@ mod unix_test {
         // Timeout for a long period of BrokenPipe error
         let broken_pipe_timeout = Duration::from_secs(10);
         // Spawn a new task to write to the FIFO file
+        #[expect(clippy::disallowed_methods)]
         tokio::spawn(async move {
             let mut file = tokio::fs::OpenOptions::new()
                 .write(true)
@@ -357,7 +357,7 @@ mod unix_test {
             (sink_fifo_path.clone(), sink_fifo_path.display());
 
         // Spawn a new thread to read sink EXTERNAL TABLE.
-        #[allow(clippy::disallowed_methods)] // spawn allowed only in tests
+        #[expect(clippy::disallowed_methods)] // spawn allowed only in tests
         tasks.push(spawn_blocking(move || {
             let file = File::open(sink_fifo_path_thread).unwrap();
             let schema = Arc::new(Schema::new(vec![
diff --git a/datafusion/core/tests/fuzz.rs b/datafusion/core/tests/fuzz.rs
index 92646e8b37636..5e94f12b5805d 100644
--- a/datafusion/core/tests/fuzz.rs
+++ b/datafusion/core/tests/fuzz.rs
@@ -15,7 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-/// Run all tests that are found in the `fuzz_cases` directory
+/// Run all tests that are found in the `fuzz_cases` directory.
+/// Fuzz tests are slow and gated behind the `extended_tests` feature.
+/// Run with: cargo test --features extended_tests
+#[cfg(feature = "extended_tests")]
 mod fuzz_cases;
 
 #[cfg(test)]
diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
index 7e5ad011b5dd8..d64223abdb767 100644
--- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -17,47 +17,44 @@
 
 use std::sync::Arc;
 
+use super::record_batch_generator::get_supported_types_columns;
 use crate::fuzz_cases::aggregation_fuzzer::query_builder::QueryBuilder;
 use crate::fuzz_cases::aggregation_fuzzer::{
     AggregationFuzzerBuilder, DatasetGeneratorConfig,
 };
 
 use arrow::array::{
-    types::Int64Type, Array, ArrayRef, AsArray, Int32Array, Int64Array, RecordBatch,
-    StringArray,
+    Array, ArrayRef, AsArray, Int32Array, Int64Array, RecordBatch, StringArray,
+    types::Int64Type,
 };
-use arrow::compute::{concat_batches, SortOptions};
+use arrow::compute::concat_batches;
 use arrow::datatypes::DataType;
 use arrow::util::pretty::pretty_format_batches;
 use arrow_schema::{Field, Schema, SchemaRef};
-use datafusion::common::Result;
+use datafusion::datasource::MemTable;
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::source::DataSourceExec;
-use datafusion::datasource::MemTable;
-use datafusion::physical_expr::aggregate::AggregateExprBuilder;
-use datafusion::physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
-};
-use datafusion::physical_plan::{collect, displayable, ExecutionPlan};
 use datafusion::prelude::{DataFrame, SessionConfig, SessionContext};
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor};
-use datafusion_common::HashMap;
+use datafusion_common::{HashMap, Result};
 use datafusion_common_runtime::JoinSet;
 use datafusion_functions_aggregate::sum::sum_udaf;
-use datafusion_physical_expr::expressions::{col, lit, Column};
 use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr::expressions::{Column, col, lit};
 use datafusion_physical_plan::InputOrderMode;
-use test_utils::{add_empty_batches, StringBatchGenerator};
+use test_utils::{StringBatchGenerator, add_empty_batches};
 
+use datafusion_execution::TaskContext;
 use datafusion_execution::memory_pool::FairSpillPool;
 use datafusion_execution::runtime_env::RuntimeEnvBuilder;
-use datafusion_execution::TaskContext;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_plan::aggregates::{
+    AggregateExec, AggregateMode, PhysicalGroupBy,
+};
 use datafusion_physical_plan::metrics::MetricValue;
+use datafusion_physical_plan::{ExecutionPlan, collect, displayable};
 use rand::rngs::StdRng;
-use rand::{random, rng, Rng, SeedableRng};
-
-use super::record_batch_generator::get_supported_types_columns;
+use rand::{Rng, SeedableRng, random, rng};
 
 // ========================================================================
 //  The new aggregation fuzz tests based on [`AggregationFuzzer`]
@@ -254,6 +251,12 @@ fn baseline_config() -> DatasetGeneratorConfig {
             // low cardinality to try and get many repeated runs
             vec![String::from("u8_low")],
             vec![String::from("utf8_low"), String::from("u8_low")],
+            vec![String::from("dictionary_utf8_low")],
+            vec![
+                String::from("dictionary_utf8_low"),
+                String::from("utf8_low"),
+                String::from("u8_low"),
+            ],
         ],
     }
 }
@@ -303,13 +306,9 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str
     let schema = input1[0].schema();
     let session_config = SessionConfig::new().with_batch_size(50);
     let ctx = SessionContext::new_with_config(session_config);
-    let mut sort_keys = LexOrdering::default();
-    for ordering_col in ["a", "b", "c"] {
-        sort_keys.push(PhysicalSortExpr {
-            expr: col(ordering_col, &schema).unwrap(),
-            options: SortOptions::default(),
-        })
-    }
+    let sort_keys = ["a", "b", "c"].map(|ordering_col| {
+        PhysicalSortExpr::new_default(col(ordering_col, &schema).unwrap())
+    });
 
     let concat_input_record = concat_batches(&schema, &input1).unwrap();
 
@@ -321,24 +320,23 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str
     .unwrap();
 
     let running_source = DataSourceExec::from_data_source(
-        MemorySourceConfig::try_new(&[input1.clone()], schema.clone(), None)
+        MemorySourceConfig::try_new(std::slice::from_ref(&input1), schema.clone(), None)
             .unwrap()
-            .try_with_sort_information(vec![sort_keys])
+            .try_with_sort_information(vec![sort_keys.into()])
             .unwrap(),
     );
 
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(sum_udaf(), vec![col("d", &schema).unwrap()])
-                .schema(Arc::clone(&schema))
-                .alias("sum1")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(sum_udaf(), vec![col("d", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("sum1")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
     let expr = group_by_columns
         .iter()
-        .map(|elem| (col(elem, &schema).unwrap(), elem.to_string()))
+        .map(|elem| (col(elem, &schema).unwrap(), (*elem).to_string()))
         .collect::<Vec<_>>();
     let group_by = PhysicalGroupBy::new_single(expr);
 
@@ -404,7 +402,7 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str
              Left Plan:\n{}\n\
              Right Plan:\n{}\n\
              schema:\n{schema}\n\
-             Left Ouptut:\n{}\n\
+             Left Output:\n{}\n\
              Right Output:\n{}\n\
              input:\n{}\n\
              ",
@@ -556,7 +554,7 @@ async fn verify_ordered_aggregate(frame: &DataFrame, expected_sort: bool) {
                         InputOrderMode::PartiallySorted(_) | InputOrderMode::Sorted
                     ));
                 } else {
-                    assert!(matches!(exec.input_order_mode(), InputOrderMode::Linear));
+                    assert_eq!(*exec.input_order_mode(), InputOrderMode::Linear);
                 }
             }
             Ok(TreeNodeRecursion::Continue)
@@ -633,8 +631,11 @@ fn extract_result_counts(results: Vec<RecordBatch>) -> HashMap<Option<String>, i
     output
 }
 
-fn assert_spill_count_metric(expect_spill: bool, single_aggregate: Arc<AggregateExec>) {
-    if let Some(metrics_set) = single_aggregate.metrics() {
+pub(crate) fn assert_spill_count_metric(
+    expect_spill: bool,
+    plan_that_spills: Arc<dyn ExecutionPlan>,
+) -> usize {
+    if let Some(metrics_set) = plan_that_spills.metrics() {
         let mut spill_count = 0;
 
         // Inspect metrics for SpillCount
@@ -648,8 +649,12 @@ fn assert_spill_count_metric(expect_spill: bool, single_aggregate: Arc<Aggregate
         if expect_spill && spill_count == 0 {
             panic!("Expected spill but SpillCount metric not found or SpillCount was 0.");
         } else if !expect_spill && spill_count > 0 {
-            panic!("Expected no spill but found SpillCount metric with value greater than 0.");
+            panic!(
+                "Expected no spill but found SpillCount metric with value greater than 0."
+            );
         }
+
+        spill_count
     } else {
         panic!("No metrics returned from the operator; cannot verify spilling.");
     }
@@ -657,7 +662,7 @@ fn assert_spill_count_metric(expect_spill: bool, single_aggregate: Arc<Aggregate
 
 // Fix for https://github.com/apache/datafusion/issues/15530
 #[tokio::test]
-async fn test_single_mode_aggregate_with_spill() -> Result<()> {
+async fn test_single_mode_aggregate_single_mode_aggregate_with_spill() -> Result<()> {
     let scan_schema = Arc::new(Schema::new(vec![
         Field::new("col_0", DataType::Int64, true),
         Field::new("col_1", DataType::Utf8, true),
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs
index 2abfcd8417cbc..fe31098622c58 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs
@@ -25,7 +25,7 @@ use datafusion_catalog::TableProvider;
 use datafusion_common::ScalarValue;
 use datafusion_common::{error::Result, utils::get_available_parallelism};
 use datafusion_expr::col;
-use rand::{rng, Rng};
+use rand::{Rng, rng};
 
 use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset;
 
@@ -44,7 +44,6 @@ use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset;
 ///   - hint `sorted` or not
 ///   - `spilling` or not (TODO, I think a special `MemoryPool` may be needed
 ///     to support this)
-///
 pub struct SessionContextGenerator {
     /// Current testing dataset
     dataset: Arc<Dataset>,
@@ -215,7 +214,7 @@ impl GeneratedSessionContextBuilder {
 
 /// The generated params for [`SessionContext`]
 #[derive(Debug)]
-#[allow(dead_code)]
+#[expect(dead_code)]
 pub struct SessionContextParams {
     batch_size: usize,
     target_partitions: usize,
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
index 82bfe199234ef..e49cffa89b04e 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
@@ -18,7 +18,7 @@
 use arrow::array::RecordBatch;
 use arrow::datatypes::DataType;
 use datafusion_common::Result;
-use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
+use datafusion_physical_expr::{PhysicalSortExpr, expressions::col};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use datafusion_physical_plan::sorts::sort::sort_batch;
 use test_utils::stagger_batch;
@@ -39,7 +39,6 @@ use crate::fuzz_cases::record_batch_generator::{ColumnDescr, RecordBatchGenerato
 ///     will generate one `base dataset` firstly. Then the `base dataset` will be sorted
 ///     based on each `sort_key` respectively. And finally `len(sort_keys) + 1` datasets
 ///     will be returned
-///
 #[derive(Debug, Clone)]
 pub struct DatasetGeneratorConfig {
     /// Descriptions of columns in datasets, it's `required`
@@ -115,7 +114,6 @@ impl DatasetGeneratorConfig {
 ///   
 ///   - Split each batch to multiple batches which each sub-batch in has the randomly `rows num`,
 ///     and this multiple batches will be used to create the `Dataset`.
-///
 pub struct DatasetGenerator {
     batch_generator: RecordBatchGenerator,
     sort_keys_set: Vec<Vec<String>>,
@@ -149,14 +147,14 @@ impl DatasetGenerator {
         for sort_keys in self.sort_keys_set.clone() {
             let sort_exprs = sort_keys
                 .iter()
-                .map(|key| {
-                    let col_expr = col(key, schema)?;
-                    Ok(PhysicalSortExpr::new_default(col_expr))
-                })
-                .collect::<Result<LexOrdering>>()?;
-            let sorted_batch = sort_batch(&base_batch, sort_exprs.as_ref(), None)?;
-
-            let batches = stagger_batch(sorted_batch);
+                .map(|key| col(key, schema).map(PhysicalSortExpr::new_default))
+                .collect::<Result<Vec<_>>>()?;
+            let batch = if let Some(ordering) = LexOrdering::new(sort_exprs) {
+                sort_batch(&base_batch, &ordering, None)?
+            } else {
+                base_batch.clone()
+            };
+            let batches = stagger_batch(batch);
             let dataset = Dataset::new(batches, sort_keys);
             datasets.push(dataset);
         }
@@ -211,8 +209,8 @@ mod test {
             sort_keys_set: vec![vec!["b".to_string()]],
         };
 
-        let mut gen = DatasetGenerator::new(config);
-        let datasets = gen.generate().unwrap();
+        let mut data_gen = DatasetGenerator::new(config);
+        let datasets = data_gen.generate().unwrap();
 
         // Should Generate 2 datasets
         assert_eq!(datasets.len(), 2);
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
index cfb3c1c6a1b98..430762b1c28db 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
@@ -19,9 +19,9 @@ use std::sync::Arc;
 
 use arrow::array::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
-use datafusion_common::{DataFusionError, Result};
+use datafusion_common::{Result, internal_datafusion_err};
 use datafusion_common_runtime::JoinSet;
-use rand::{rng, Rng};
+use rand::{Rng, rng};
 
 use crate::fuzz_cases::aggregation_fuzzer::query_builder::QueryBuilder;
 use crate::fuzz_cases::aggregation_fuzzer::{
@@ -171,7 +171,7 @@ impl AggregationFuzzer {
             let datasets = self
                 .dataset_generator
                 .generate()
-                .expect("should success to generate dataset");
+                .expect("should succeed to generate dataset");
 
             // Then for each of them, we random select a test sql for it
             let query_groups = datasets
@@ -197,7 +197,7 @@ impl AggregationFuzzer {
         while let Some(join_handle) = join_set.join_next().await {
             // propagate errors
             join_handle.map_err(|e| {
-                DataFusionError::Internal(format!("AggregationFuzzer task error: {e:?}"))
+                internal_datafusion_err!("AggregationFuzzer task error: {e:?}")
             })??;
         }
         Ok(())
@@ -216,16 +216,16 @@ impl AggregationFuzzer {
             // Generate the baseline context, and get the baseline result firstly
             let baseline_ctx_with_params = ctx_generator
                 .generate_baseline()
-                .expect("should success to generate baseline session context");
+                .expect("should succeed to generate baseline session context");
             let baseline_result = run_sql(&sql, &baseline_ctx_with_params.ctx)
                 .await
-                .expect("should success to run baseline sql");
+                .expect("should succeed to run baseline sql");
             let baseline_result = Arc::new(baseline_result);
             // Generate test tasks
             for _ in 0..CTX_GEN_ROUNDS {
                 let ctx_with_params = ctx_generator
                     .generate()
-                    .expect("should success to generate session context");
+                    .expect("should succeed to generate session context");
                 let task = AggregationFuzzTestTask {
                     dataset_ref: dataset_ref.clone(),
                     expected_result: baseline_result.clone(),
@@ -253,7 +253,6 @@ impl AggregationFuzzer {
 ///
 ///   - `dataset_ref`, the input dataset, store it for error reported when found
 ///     the inconsistency between the one for `ctx` and `expected results`.
-///
 struct AggregationFuzzTestTask {
     /// Generated session context in current test case
     ctx_with_params: SessionContextWithParams,
@@ -308,7 +307,7 @@ impl AggregationFuzzTestTask {
                 format_batches_with_limit(expected_result),
                 format_batches_with_limit(&self.dataset_ref.batches),
             );
-            DataFusionError::Internal(message)
+            internal_datafusion_err!("{message}")
         })
     }
 
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/mod.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/mod.rs
index 04b764e46a96b..e7ce557d2267d 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/mod.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/mod.rs
@@ -77,8 +77,8 @@ pub(crate) fn check_equality_of_batches(
         if lhs_row != rhs_row {
             return Err(InconsistentResult {
                 row_idx,
-                lhs_row: lhs_row.to_string(),
-                rhs_row: rhs_row.to_string(),
+                lhs_row: (*lhs_row).to_string(),
+                rhs_row: (*rhs_row).to_string(),
             });
         }
     }
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs
index 209278385b7b5..7bb6177c31010 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs
@@ -17,14 +17,14 @@
 
 use std::{collections::HashSet, str::FromStr};
 
-use rand::{rng, seq::SliceRandom, Rng};
+use rand::{Rng, rng, seq::SliceRandom};
 
 /// Random aggregate query builder
 ///
 /// Creates queries like
 /// ```sql
 /// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
-///```
+/// ```
 #[derive(Debug, Default, Clone)]
 pub struct QueryBuilder {
     // ===================================
@@ -95,7 +95,6 @@ pub struct QueryBuilder {
     /// More details can see [`GroupOrdering`].
     ///
     /// [`GroupOrdering`]:  datafusion_physical_plan::aggregates::order::GroupOrdering
-    ///
     dataset_sort_keys: Vec<Vec<String>>,
 
     /// If we will also test the no grouping case like:
@@ -103,7 +102,6 @@ pub struct QueryBuilder {
     /// ```text
     ///   SELECT aggr FROM t;
     /// ```
-    ///
     no_grouping: bool,
 
     // ====================================
@@ -184,13 +182,13 @@ impl QueryBuilder {
 
     /// Add max columns num in group by(default: 3), for example if it is set to 1,
     /// the generated sql will group by at most 1 column
-    #[allow(dead_code)]
+    #[expect(dead_code)]
     pub fn with_max_group_by_columns(mut self, max_group_by_columns: usize) -> Self {
         self.max_group_by_columns = max_group_by_columns;
         self
     }
 
-    #[allow(dead_code)]
+    #[expect(dead_code)]
     pub fn with_min_group_by_columns(mut self, min_group_by_columns: usize) -> Self {
         self.min_group_by_columns = min_group_by_columns;
         self
@@ -204,7 +202,7 @@ impl QueryBuilder {
     }
 
     /// Add if also test the no grouping aggregation case(default: true)
-    #[allow(dead_code)]
+    #[expect(dead_code)]
     pub fn with_no_grouping(mut self, no_grouping: bool) -> Self {
         self.no_grouping = no_grouping;
         self
diff --git a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs
index 3049631d4b3fe..92adda200d1a5 100644
--- a/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs
@@ -19,7 +19,7 @@
 
 use std::sync::Arc;
 
-use arrow::array::{cast::AsArray, Array, OffsetSizeTrait, RecordBatch};
+use arrow::array::{Array, OffsetSizeTrait, RecordBatch, cast::AsArray};
 
 use datafusion::datasource::MemTable;
 use datafusion_common_runtime::JoinSet;
diff --git a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs
index d12d0a130c0c0..a57095066ee12 100644
--- a/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs
+++ b/datafusion/core/tests/fuzz_cases/equivalence/ordering.rs
@@ -16,15 +16,19 @@
 // under the License.
 
 use crate::fuzz_cases::equivalence::utils::{
-    convert_to_orderings, create_random_schema, create_test_params, create_test_schema_2,
+    TestScalarUDF, create_random_schema, create_test_params, create_test_schema_2,
     generate_table_for_eq_properties, generate_table_for_orderings,
-    is_table_same_after_sort, TestScalarUDF,
+    is_table_same_after_sort,
 };
 use arrow::compute::SortOptions;
 use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{Operator, ScalarUDF};
-use datafusion_physical_expr::expressions::{col, BinaryExpr};
 use datafusion_physical_expr::ScalarFunctionExpr;
+use datafusion_physical_expr::equivalence::{
+    convert_to_orderings, convert_to_sort_exprs,
+};
+use datafusion_physical_expr::expressions::{BinaryExpr, col};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
 use itertools::Itertools;
@@ -55,26 +59,25 @@ fn test_ordering_satisfy_with_equivalence_random() -> Result<()> {
             col("f", &test_schema)?,
         ];
 
-        for n_req in 0..=col_exprs.len() {
+        for n_req in 1..=col_exprs.len() {
             for exprs in col_exprs.iter().combinations(n_req) {
-                let requirement = exprs
+                let sort_exprs = exprs
                     .into_iter()
-                    .map(|expr| PhysicalSortExpr {
-                        expr: Arc::clone(expr),
-                        options: SORT_OPTIONS,
-                    })
-                    .collect::<LexOrdering>();
+                    .map(|expr| PhysicalSortExpr::new(Arc::clone(expr), SORT_OPTIONS));
+                let Some(ordering) = LexOrdering::new(sort_exprs) else {
+                    unreachable!("Test should always produce non-degenerate orderings");
+                };
                 let expected = is_table_same_after_sort(
-                    requirement.clone(),
-                    table_data_with_properties.clone(),
+                    ordering.clone(),
+                    &table_data_with_properties,
                 )?;
                 let err_msg = format!(
-                    "Error in test case requirement:{requirement:?}, expected: {expected:?}, eq_properties {eq_properties}"
+                    "Error in test case requirement:{ordering:?}, expected: {expected:?}, eq_properties {eq_properties}"
                 );
                 // Check whether ordering_satisfy API result and
                 // experimental result matches.
                 assert_eq!(
-                    eq_properties.ordering_satisfy(requirement.as_ref()),
+                    eq_properties.ordering_satisfy(ordering)?,
                     expected,
                     "{err_msg}"
                 );
@@ -108,6 +111,7 @@ fn test_ordering_satisfy_with_equivalence_complex_random() -> Result<()> {
             Arc::clone(&test_fun),
             vec![col_a],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?);
         let a_plus_b = Arc::new(BinaryExpr::new(
             col("a", &test_schema)?,
@@ -125,27 +129,26 @@ fn test_ordering_satisfy_with_equivalence_complex_random() -> Result<()> {
             a_plus_b,
         ];
 
-        for n_req in 0..=exprs.len() {
+        for n_req in 1..=exprs.len() {
             for exprs in exprs.iter().combinations(n_req) {
-                let requirement = exprs
+                let sort_exprs = exprs
                     .into_iter()
-                    .map(|expr| PhysicalSortExpr {
-                        expr: Arc::clone(expr),
-                        options: SORT_OPTIONS,
-                    })
-                    .collect::<LexOrdering>();
+                    .map(|expr| PhysicalSortExpr::new(Arc::clone(expr), SORT_OPTIONS));
+                let Some(ordering) = LexOrdering::new(sort_exprs) else {
+                    unreachable!("Test should always produce non-degenerate orderings");
+                };
                 let expected = is_table_same_after_sort(
-                    requirement.clone(),
-                    table_data_with_properties.clone(),
+                    ordering.clone(),
+                    &table_data_with_properties,
                 )?;
                 let err_msg = format!(
-                    "Error in test case requirement:{requirement:?}, expected: {expected:?}, eq_properties: {eq_properties}",
+                    "Error in test case requirement:{ordering:?}, expected: {expected:?}, eq_properties: {eq_properties}",
                 );
                 // Check whether ordering_satisfy API result and
                 // experimental result matches.
 
                 assert_eq!(
-                    eq_properties.ordering_satisfy(requirement.as_ref()),
+                    eq_properties.ordering_satisfy(ordering)?,
                     (expected | false),
                     "{err_msg}"
                 );
@@ -300,25 +303,19 @@ fn test_ordering_satisfy_with_equivalence() -> Result<()> {
     ];
 
     for (cols, expected) in requirements {
-        let err_msg = format!("Error in test case:{cols:?}");
-        let required = cols
-            .into_iter()
-            .map(|(expr, options)| PhysicalSortExpr {
-                expr: Arc::clone(expr),
-                options,
-            })
-            .collect::<LexOrdering>();
+        let err_msg = format!("Error in test case: {cols:?}");
+        let sort_exprs = convert_to_sort_exprs(&cols);
+        let Some(ordering) = LexOrdering::new(sort_exprs) else {
+            unreachable!("Test should always produce non-degenerate orderings");
+        };
 
         // Check expected result with experimental result.
         assert_eq!(
-            is_table_same_after_sort(
-                required.clone(),
-                table_data_with_properties.clone()
-            )?,
+            is_table_same_after_sort(ordering.clone(), &table_data_with_properties)?,
             expected
         );
         assert_eq!(
-            eq_properties.ordering_satisfy(required.as_ref()),
+            eq_properties.ordering_satisfy(ordering)?,
             expected,
             "{err_msg}"
         );
@@ -371,7 +368,7 @@ fn test_ordering_satisfy_on_data() -> Result<()> {
         (col_d, option_asc),
     ];
     let ordering = convert_to_orderings(&[ordering])[0].clone();
-    assert!(!is_table_same_after_sort(ordering, batch.clone())?);
+    assert!(!is_table_same_after_sort(ordering, &batch)?);
 
     // [a ASC, b ASC, d ASC] cannot be deduced
     let ordering = vec![
@@ -380,12 +377,12 @@ fn test_ordering_satisfy_on_data() -> Result<()> {
         (col_d, option_asc),
     ];
     let ordering = convert_to_orderings(&[ordering])[0].clone();
-    assert!(!is_table_same_after_sort(ordering, batch.clone())?);
+    assert!(!is_table_same_after_sort(ordering, &batch)?);
 
     // [a ASC, b ASC] can be deduced
     let ordering = vec![(col_a, option_asc), (col_b, option_asc)];
     let ordering = convert_to_orderings(&[ordering])[0].clone();
-    assert!(is_table_same_after_sort(ordering, batch.clone())?);
+    assert!(is_table_same_after_sort(ordering, &batch)?);
 
     Ok(())
 }
diff --git a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs
index 38e66387a02cd..2f67e211ce915 100644
--- a/datafusion/core/tests/fuzz_cases/equivalence/projection.rs
+++ b/datafusion/core/tests/fuzz_cases/equivalence/projection.rs
@@ -16,14 +16,15 @@
 // under the License.
 
 use crate::fuzz_cases::equivalence::utils::{
-    apply_projection, create_random_schema, generate_table_for_eq_properties,
-    is_table_same_after_sort, TestScalarUDF,
+    TestScalarUDF, apply_projection, create_random_schema,
+    generate_table_for_eq_properties, is_table_same_after_sort,
 };
 use arrow::compute::SortOptions;
 use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{Operator, ScalarUDF};
 use datafusion_physical_expr::equivalence::ProjectionMapping;
-use datafusion_physical_expr::expressions::{col, BinaryExpr};
+use datafusion_physical_expr::expressions::{BinaryExpr, col};
 use datafusion_physical_expr::{PhysicalExprRef, ScalarFunctionExpr};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
@@ -49,6 +50,7 @@ fn project_orderings_random() -> Result<()> {
             Arc::clone(&test_fun),
             vec![col_a],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?);
         // a + b
         let a_plus_b = Arc::new(BinaryExpr::new(
@@ -56,7 +58,7 @@ fn project_orderings_random() -> Result<()> {
             Operator::Plus,
             col("b", &test_schema)?,
         )) as Arc<dyn PhysicalExpr>;
-        let proj_exprs = vec![
+        let proj_exprs = [
             (col("a", &test_schema)?, "a_new"),
             (col("b", &test_schema)?, "b_new"),
             (col("c", &test_schema)?, "c_new"),
@@ -71,7 +73,7 @@ fn project_orderings_random() -> Result<()> {
             for proj_exprs in proj_exprs.iter().combinations(n_req) {
                 let proj_exprs = proj_exprs
                     .into_iter()
-                    .map(|(expr, name)| (Arc::clone(expr), name.to_string()))
+                    .map(|(expr, name)| (Arc::clone(expr), (*name).to_string()))
                     .collect::<Vec<_>>();
                 let (projected_batch, projected_eq) = apply_projection(
                     proj_exprs.clone(),
@@ -87,10 +89,7 @@ fn project_orderings_random() -> Result<()> {
                     // Since ordered section satisfies schema, we expect
                     // that result will be same after sort (e.g sort was unnecessary).
                     assert!(
-                        is_table_same_after_sort(
-                            ordering.clone(),
-                            projected_batch.clone(),
-                        )?,
+                        is_table_same_after_sort(ordering.clone(), &projected_batch)?,
                         "{}",
                         err_msg
                     );
@@ -125,6 +124,7 @@ fn ordering_satisfy_after_projection_random() -> Result<()> {
             Arc::clone(&test_fun),
             vec![col_a],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?) as PhysicalExprRef;
         // a + b
         let a_plus_b = Arc::new(BinaryExpr::new(
@@ -132,7 +132,7 @@ fn ordering_satisfy_after_projection_random() -> Result<()> {
             Operator::Plus,
             col("b", &test_schema)?,
         )) as Arc<dyn PhysicalExpr>;
-        let proj_exprs = vec![
+        let proj_exprs = [
             (col("a", &test_schema)?, "a_new"),
             (col("b", &test_schema)?, "b_new"),
             (col("c", &test_schema)?, "c_new"),
@@ -147,8 +147,7 @@ fn ordering_satisfy_after_projection_random() -> Result<()> {
             for proj_exprs in proj_exprs.iter().combinations(n_req) {
                 let proj_exprs = proj_exprs
                     .into_iter()
-                    .map(|(expr, name)| (Arc::clone(expr), name.to_string()))
-                    .collect::<Vec<_>>();
+                    .map(|(expr, name)| (Arc::clone(expr), (*name).to_string()));
                 let (projected_batch, projected_eq) = apply_projection(
                     proj_exprs.clone(),
                     &table_data_with_properties,
@@ -156,33 +155,34 @@ fn ordering_satisfy_after_projection_random() -> Result<()> {
                 )?;
 
                 let projection_mapping =
-                    ProjectionMapping::try_new(&proj_exprs, &test_schema)?;
+                    ProjectionMapping::try_new(proj_exprs, &test_schema)?;
 
                 let projected_exprs = projection_mapping
                     .iter()
-                    .map(|(_source, target)| Arc::clone(target))
+                    .flat_map(|(_, targets)| {
+                        targets.iter().map(|(target, _)| Arc::clone(target))
+                    })
                     .collect::<Vec<_>>();
 
-                for n_req in 0..=projected_exprs.len() {
+                for n_req in 1..=projected_exprs.len() {
                     for exprs in projected_exprs.iter().combinations(n_req) {
-                        let requirement = exprs
-                            .into_iter()
-                            .map(|expr| PhysicalSortExpr {
-                                expr: Arc::clone(expr),
-                                options: SORT_OPTIONS,
-                            })
-                            .collect::<LexOrdering>();
-                        let expected = is_table_same_after_sort(
-                            requirement.clone(),
-                            projected_batch.clone(),
-                        )?;
+                        let sort_exprs = exprs.into_iter().map(|expr| {
+                            PhysicalSortExpr::new(Arc::clone(expr), SORT_OPTIONS)
+                        });
+                        let Some(ordering) = LexOrdering::new(sort_exprs) else {
+                            unreachable!(
+                                "Test should always produce non-degenerate orderings"
+                            );
+                        };
+                        let expected =
+                            is_table_same_after_sort(ordering.clone(), &projected_batch)?;
                         let err_msg = format!(
-                            "Error in test case requirement:{requirement:?}, expected: {expected:?}, eq_properties: {eq_properties}, projected_eq: {projected_eq}, projection_mapping: {projection_mapping:?}"
+                            "Error in test case requirement:{ordering:?}, expected: {expected:?}, eq_properties: {eq_properties}, projected_eq: {projected_eq}, projection_mapping: {projection_mapping:?}"
                         );
                         // Check whether ordering_satisfy API result and
                         // experimental result matches.
                         assert_eq!(
-                            projected_eq.ordering_satisfy(requirement.as_ref()),
+                            projected_eq.ordering_satisfy(ordering)?,
                             expected,
                             "{err_msg}"
                         );
diff --git a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs
index 9a21464157495..1490eb08a0291 100644
--- a/datafusion/core/tests/fuzz_cases/equivalence/properties.rs
+++ b/datafusion/core/tests/fuzz_cases/equivalence/properties.rs
@@ -15,18 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::sync::Arc;
+
 use crate::fuzz_cases::equivalence::utils::{
-    create_random_schema, generate_table_for_eq_properties, is_table_same_after_sort,
-    TestScalarUDF,
+    TestScalarUDF, create_random_schema, generate_table_for_eq_properties,
+    is_table_same_after_sort,
 };
+
 use datafusion_common::Result;
 use datafusion_expr::{Operator, ScalarUDF};
-use datafusion_physical_expr::expressions::{col, BinaryExpr};
-use datafusion_physical_expr::{PhysicalExprRef, ScalarFunctionExpr};
-use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_expr::expressions::{BinaryExpr, col};
+use datafusion_physical_expr::{LexOrdering, ScalarFunctionExpr};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
+use datafusion_common::config::ConfigOptions;
 use itertools::Itertools;
-use std::sync::Arc;
 
 #[test]
 fn test_find_longest_permutation_random() -> Result<()> {
@@ -47,13 +50,14 @@ fn test_find_longest_permutation_random() -> Result<()> {
             Arc::clone(&test_fun),
             vec![col_a],
             &test_schema,
-        )?) as PhysicalExprRef;
+            Arc::new(ConfigOptions::default()),
+        )?) as _;
 
         let a_plus_b = Arc::new(BinaryExpr::new(
             col("a", &test_schema)?,
             Operator::Plus,
             col("b", &test_schema)?,
-        )) as Arc<dyn PhysicalExpr>;
+        )) as _;
         let exprs = [
             col("a", &test_schema)?,
             col("b", &test_schema)?,
@@ -68,16 +72,16 @@ fn test_find_longest_permutation_random() -> Result<()> {
         for n_req in 0..=exprs.len() {
             for exprs in exprs.iter().combinations(n_req) {
                 let exprs = exprs.into_iter().cloned().collect::<Vec<_>>();
-                let (ordering, indices) = eq_properties.find_longest_permutation(&exprs);
+                let (ordering, indices) =
+                    eq_properties.find_longest_permutation(&exprs)?;
                 // Make sure that find_longest_permutation return values are consistent
                 let ordering2 = indices
                     .iter()
                     .zip(ordering.iter())
-                    .map(|(&idx, sort_expr)| PhysicalSortExpr {
-                        expr: Arc::clone(&exprs[idx]),
-                        options: sort_expr.options,
+                    .map(|(&idx, sort_expr)| {
+                        PhysicalSortExpr::new(Arc::clone(&exprs[idx]), sort_expr.options)
                     })
-                    .collect::<LexOrdering>();
+                    .collect::<Vec<_>>();
                 assert_eq!(
                     ordering, ordering2,
                     "indices and lexicographical ordering do not match"
@@ -89,11 +93,11 @@ fn test_find_longest_permutation_random() -> Result<()> {
                 assert_eq!(ordering.len(), indices.len(), "{err_msg}");
                 // Since ordered section satisfies schema, we expect
                 // that result will be same after sort (e.g sort was unnecessary).
+                let Some(ordering) = LexOrdering::new(ordering) else {
+                    continue;
+                };
                 assert!(
-                    is_table_same_after_sort(
-                        ordering.clone(),
-                        table_data_with_properties.clone(),
-                    )?,
+                    is_table_same_after_sort(ordering, &table_data_with_properties)?,
                     "{}",
                     err_msg
                 );
diff --git a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs
index a906648f872dc..580a226721083 100644
--- a/datafusion/core/tests/fuzz_cases/equivalence/utils.rs
+++ b/datafusion/core/tests/fuzz_cases/equivalence/utils.rs
@@ -15,55 +15,50 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::physical_plan::expressions::col;
-use datafusion::physical_plan::expressions::Column;
-use datafusion_physical_expr::{ConstExpr, EquivalenceProperties, PhysicalSortExpr};
 use std::any::Any;
 use std::cmp::Ordering;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Float32Array, Float64Array, RecordBatch, UInt32Array};
-use arrow::compute::SortOptions;
-use arrow::compute::{lexsort_to_indices, take_record_batch, SortColumn};
+use arrow::compute::{SortColumn, SortOptions, lexsort_to_indices, take_record_batch};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::utils::{compare_rows, get_row_at_idx};
-use datafusion_common::{exec_err, plan_datafusion_err, DataFusionError, Result};
+use datafusion_common::{Result, exec_err, internal_datafusion_err, plan_err};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
     ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
 };
-use datafusion_physical_expr::equivalence::{EquivalenceClass, ProjectionMapping};
+use datafusion_physical_expr::equivalence::{
+    EquivalenceClass, ProjectionMapping, convert_to_orderings,
+};
+use datafusion_physical_expr::{ConstExpr, EquivalenceProperties};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_plan::expressions::{Column, col};
 
 use itertools::izip;
 use rand::prelude::*;
 
+/// Projects the input schema based on the given projection mapping.
 pub fn output_schema(
     mapping: &ProjectionMapping,
     input_schema: &Arc<Schema>,
 ) -> Result<SchemaRef> {
-    // Calculate output schema
-    let fields: Result<Vec<Field>> = mapping
-        .iter()
-        .map(|(source, target)| {
-            let name = target
-                .as_any()
-                .downcast_ref::<Column>()
-                .ok_or_else(|| plan_datafusion_err!("Expects to have column"))?
-                .name();
-            let field = Field::new(
-                name,
-                source.data_type(input_schema)?,
-                source.nullable(input_schema)?,
-            );
-
-            Ok(field)
-        })
-        .collect();
+    // Calculate output schema:
+    let mut fields = vec![];
+    for (source, targets) in mapping.iter() {
+        let data_type = source.data_type(input_schema)?;
+        let nullable = source.nullable(input_schema)?;
+        for (target, _) in targets.iter() {
+            let Some(column) = target.as_any().downcast_ref::<Column>() else {
+                return plan_err!("Expects to have column");
+            };
+            fields.push(Field::new(column.name(), data_type.clone(), nullable));
+        }
+    }
 
     let output_schema = Arc::new(Schema::new_with_metadata(
-        fields?,
+        fields,
         input_schema.metadata().clone(),
     ));
 
@@ -100,9 +95,9 @@ pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperti
 
     let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
     // Define a and f are aliases
-    eq_properties.add_equal_conditions(col_a, col_f)?;
+    eq_properties.add_equal_conditions(Arc::clone(col_a), Arc::clone(col_f))?;
     // Column e has constant value.
-    eq_properties = eq_properties.with_constants([ConstExpr::from(col_e)]);
+    eq_properties.add_constants([ConstExpr::from(Arc::clone(col_e))])?;
 
     // Randomly order columns for sorting
     let mut rng = StdRng::seed_from_u64(seed);
@@ -114,18 +109,18 @@ pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperti
     };
 
     while !remaining_exprs.is_empty() {
-        let n_sort_expr = rng.random_range(0..remaining_exprs.len() + 1);
+        let n_sort_expr = rng.random_range(1..remaining_exprs.len() + 1);
         remaining_exprs.shuffle(&mut rng);
 
-        let ordering = remaining_exprs
-            .drain(0..n_sort_expr)
-            .map(|expr| PhysicalSortExpr {
-                expr: Arc::clone(expr),
-                options: options_asc,
-            })
-            .collect();
+        let ordering =
+            remaining_exprs
+                .drain(0..n_sort_expr)
+                .map(|expr| PhysicalSortExpr {
+                    expr: Arc::clone(expr),
+                    options: options_asc,
+                });
 
-        eq_properties.add_new_orderings([ordering]);
+        eq_properties.add_ordering(ordering);
     }
 
     Ok((test_schema, eq_properties))
@@ -133,12 +128,12 @@ pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperti
 
 // Apply projection to the input_data, return projected equivalence properties and record batch
 pub fn apply_projection(
-    proj_exprs: Vec<(Arc<dyn PhysicalExpr>, String)>,
+    proj_exprs: impl IntoIterator<Item = (Arc<dyn PhysicalExpr>, String)>,
     input_data: &RecordBatch,
     input_eq_properties: &EquivalenceProperties,
 ) -> Result<(RecordBatch, EquivalenceProperties)> {
     let input_schema = input_data.schema();
-    let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;
+    let projection_mapping = ProjectionMapping::try_new(proj_exprs, &input_schema)?;
 
     let output_schema = output_schema(&projection_mapping, &input_schema)?;
     let num_rows = input_data.num_rows();
@@ -168,49 +163,49 @@ fn add_equal_conditions_test() -> Result<()> {
     ]));
 
     let mut eq_properties = EquivalenceProperties::new(schema);
-    let col_a_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
-    let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
-    let col_c_expr = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
-    let col_x_expr = Arc::new(Column::new("x", 3)) as Arc<dyn PhysicalExpr>;
-    let col_y_expr = Arc::new(Column::new("y", 4)) as Arc<dyn PhysicalExpr>;
+    let col_a = Arc::new(Column::new("a", 0)) as _;
+    let col_b = Arc::new(Column::new("b", 1)) as _;
+    let col_c = Arc::new(Column::new("c", 2)) as _;
+    let col_x = Arc::new(Column::new("x", 3)) as _;
+    let col_y = Arc::new(Column::new("y", 4)) as _;
 
     // a and b are aliases
-    eq_properties.add_equal_conditions(&col_a_expr, &col_b_expr)?;
+    eq_properties.add_equal_conditions(Arc::clone(&col_a), Arc::clone(&col_b))?;
     assert_eq!(eq_properties.eq_group().len(), 1);
 
     // This new entry is redundant, size shouldn't increase
-    eq_properties.add_equal_conditions(&col_b_expr, &col_a_expr)?;
+    eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_a))?;
     assert_eq!(eq_properties.eq_group().len(), 1);
     let eq_groups = eq_properties.eq_group().iter().next().unwrap();
     assert_eq!(eq_groups.len(), 2);
-    assert!(eq_groups.contains(&col_a_expr));
-    assert!(eq_groups.contains(&col_b_expr));
+    assert!(eq_groups.contains(&col_a));
+    assert!(eq_groups.contains(&col_b));
 
     // b and c are aliases. Existing equivalence class should expand,
     // however there shouldn't be any new equivalence class
-    eq_properties.add_equal_conditions(&col_b_expr, &col_c_expr)?;
+    eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_c))?;
     assert_eq!(eq_properties.eq_group().len(), 1);
     let eq_groups = eq_properties.eq_group().iter().next().unwrap();
     assert_eq!(eq_groups.len(), 3);
-    assert!(eq_groups.contains(&col_a_expr));
-    assert!(eq_groups.contains(&col_b_expr));
-    assert!(eq_groups.contains(&col_c_expr));
+    assert!(eq_groups.contains(&col_a));
+    assert!(eq_groups.contains(&col_b));
+    assert!(eq_groups.contains(&col_c));
 
     // This is a new set of equality. Hence equivalent class count should be 2.
-    eq_properties.add_equal_conditions(&col_x_expr, &col_y_expr)?;
+    eq_properties.add_equal_conditions(Arc::clone(&col_x), Arc::clone(&col_y))?;
     assert_eq!(eq_properties.eq_group().len(), 2);
 
     // This equality bridges distinct equality sets.
     // Hence equivalent class count should decrease from 2 to 1.
-    eq_properties.add_equal_conditions(&col_x_expr, &col_a_expr)?;
+    eq_properties.add_equal_conditions(Arc::clone(&col_x), Arc::clone(&col_a))?;
     assert_eq!(eq_properties.eq_group().len(), 1);
     let eq_groups = eq_properties.eq_group().iter().next().unwrap();
     assert_eq!(eq_groups.len(), 5);
-    assert!(eq_groups.contains(&col_a_expr));
-    assert!(eq_groups.contains(&col_b_expr));
-    assert!(eq_groups.contains(&col_c_expr));
-    assert!(eq_groups.contains(&col_x_expr));
-    assert!(eq_groups.contains(&col_y_expr));
+    assert!(eq_groups.contains(&col_a));
+    assert!(eq_groups.contains(&col_b));
+    assert!(eq_groups.contains(&col_c));
+    assert!(eq_groups.contains(&col_x));
+    assert!(eq_groups.contains(&col_y));
 
     Ok(())
 }
@@ -226,7 +221,7 @@ fn add_equal_conditions_test() -> Result<()> {
 /// already sorted according to `required_ordering` to begin with.
 pub fn is_table_same_after_sort(
     mut required_ordering: LexOrdering,
-    batch: RecordBatch,
+    batch: &RecordBatch,
 ) -> Result<bool> {
     // Clone the original schema and columns
     let original_schema = batch.schema();
@@ -327,7 +322,7 @@ pub fn create_test_params() -> Result<(SchemaRef, EquivalenceProperties)> {
     let col_f = &col("f", &test_schema)?;
     let col_g = &col("g", &test_schema)?;
     let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
-    eq_properties.add_equal_conditions(col_a, col_c)?;
+    eq_properties.add_equal_conditions(Arc::clone(col_a), Arc::clone(col_c))?;
 
     let option_asc = SortOptions {
         descending: false,
@@ -350,7 +345,7 @@ pub fn create_test_params() -> Result<(SchemaRef, EquivalenceProperties)> {
         ],
     ];
     let orderings = convert_to_orderings(&orderings);
-    eq_properties.add_new_orderings(orderings);
+    eq_properties.add_orderings(orderings);
     Ok((test_schema, eq_properties))
 }
 
@@ -376,7 +371,7 @@ pub fn generate_table_for_eq_properties(
 
     // Fill constant columns
     for constant in eq_properties.constants() {
-        let col = constant.expr().as_any().downcast_ref::<Column>().unwrap();
+        let col = constant.expr.as_any().downcast_ref::<Column>().unwrap();
         let (idx, _field) = schema.column_with_name(col.name()).unwrap();
         let arr =
             Arc::new(Float64Array::from_iter_values(vec![0 as f64; n_elem])) as ArrayRef;
@@ -461,7 +456,7 @@ pub fn generate_table_for_orderings(
     let batch = RecordBatch::try_from_iter(arrays)?;
 
     // Sort batch according to first ordering expression
-    let sort_columns = get_sort_columns(&batch, orderings[0].as_ref())?;
+    let sort_columns = get_sort_columns(&batch, &orderings[0])?;
     let sort_indices = lexsort_to_indices(&sort_columns, None)?;
     let mut batch = take_record_batch(&batch, &sort_indices)?;
 
@@ -494,29 +489,6 @@ pub fn generate_table_for_orderings(
     Ok(batch)
 }
 
-// Convert each tuple to PhysicalSortExpr
-pub fn convert_to_sort_exprs(
-    in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
-) -> LexOrdering {
-    in_data
-        .iter()
-        .map(|(expr, options)| PhysicalSortExpr {
-            expr: Arc::clone(*expr),
-            options: *options,
-        })
-        .collect()
-}
-
-// Convert each inner tuple to PhysicalSortExpr
-pub fn convert_to_orderings(
-    orderings: &[Vec<(&Arc<dyn PhysicalExpr>, SortOptions)>],
-) -> Vec<LexOrdering> {
-    orderings
-        .iter()
-        .map(|sort_exprs| convert_to_sort_exprs(sort_exprs))
-        .collect()
-}
-
 // Utility function to generate random f64 array
 fn generate_random_f64_array(
     n_elems: usize,
@@ -540,7 +512,7 @@ fn get_sort_columns(
         .collect::<Result<Vec<_>>>()
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct TestScalarUDF {
     pub(crate) signature: Signature,
 }
@@ -590,11 +562,11 @@ impl ScalarUDFImpl for TestScalarUDF {
             DataType::Float64 => Arc::new({
                 let arg = &args[0].as_any().downcast_ref::<Float64Array>().ok_or_else(
                     || {
-                        DataFusionError::Internal(format!(
+                        internal_datafusion_err!(
                             "could not cast {} to {}",
                             self.name(),
                             std::any::type_name::<Float64Array>()
-                        ))
+                        )
                     },
                 )?;
 
@@ -605,11 +577,11 @@ impl ScalarUDFImpl for TestScalarUDF {
             DataType::Float32 => Arc::new({
                 let arg = &args[0].as_any().downcast_ref::<Float32Array>().ok_or_else(
                     || {
-                        DataFusionError::Internal(format!(
+                        internal_datafusion_err!(
                             "could not cast {} to {}",
                             self.name(),
                             std::any::type_name::<Float32Array>()
-                        ))
+                        )
                     },
                 )?;
 
diff --git a/datafusion/core/tests/fuzz_cases/join_fuzz.rs b/datafusion/core/tests/fuzz_cases/join_fuzz.rs
index 82ee73b525cb1..669b98e39fec1 100644
--- a/datafusion/core/tests/fuzz_cases/join_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/join_fuzz.rs
@@ -20,7 +20,7 @@ use std::time::SystemTime;
 
 use crate::fuzz_cases::join_fuzz::JoinTestType::{HjSmj, NljHj};
 
-use arrow::array::{ArrayRef, Int32Array};
+use arrow::array::{ArrayRef, BinaryArray, Int32Array};
 use arrow::compute::SortOptions;
 use arrow::datatypes::Schema;
 use arrow::record_batch::RecordBatch;
@@ -37,9 +37,9 @@ use datafusion::physical_plan::joins::{
     HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
 };
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_common::ScalarValue;
-use datafusion_physical_expr::expressions::Literal;
+use datafusion_common::{NullEquality, ScalarValue};
 use datafusion_physical_expr::PhysicalExprRef;
+use datafusion_physical_expr::expressions::Literal;
 
 use itertools::Itertools;
 use rand::Rng;
@@ -91,218 +91,564 @@ fn col_lt_col_filter(schema1: Arc<Schema>, schema2: Arc<Schema>) -> JoinFilter {
 
 #[tokio::test]
 async fn test_inner_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Inner,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Inner,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_inner_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Inner,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Inner,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Left,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Left,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Left,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Left,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Right,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Right,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Right,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Right,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_full_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Full,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Full,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_full_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::Full,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[NljHj, HjSmj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::Full,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[NljHj, HjSmj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_semi_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftSemi,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftSemi,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_semi_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftSemi,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftSemi,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_semi_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::RightSemi,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightSemi,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_semi_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::RightSemi,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightSemi,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_anti_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftAnti,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftAnti,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_anti_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftAnti,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftAnti,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_anti_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::RightAnti,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightAnti,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_right_anti_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::RightAnti,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightAnti,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_mark_join_1k() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftMark,
-        None,
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftMark,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 #[tokio::test]
 async fn test_left_mark_join_1k_filtered() {
-    JoinFuzzTestCase::new(
-        make_staggered_batches(1000),
-        make_staggered_batches(1000),
-        JoinType::LeftMark,
-        Some(Box::new(col_lt_col_filter)),
-    )
-    .run_test(&[HjSmj, NljHj], false)
-    .await
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::LeftMark,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+// todo: add JoinTestType::HjSmj after Right mark SortMergeJoin support
+#[tokio::test]
+async fn test_right_mark_join_1k() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightMark,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_mark_join_1k_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_i32(1000, left_extra),
+            make_staggered_batches_i32(1000, right_extra),
+            JoinType::RightMark,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_inner_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Inner,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_inner_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Inner,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Left,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Left,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Right,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Right,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_full_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Full,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_full_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::Full,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[NljHj, HjSmj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_semi_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftSemi,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_semi_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftSemi,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_semi_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightSemi,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_semi_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightSemi,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_anti_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftAnti,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_anti_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftAnti,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_anti_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightAnti,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_anti_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightAnti,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_mark_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftMark,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_left_mark_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::LeftMark,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+// todo: add JoinTestType::HjSmj after Right mark SortMergeJoin support
+#[tokio::test]
+async fn test_right_mark_join_1k_binary() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightMark,
+            None,
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
+}
+
+#[tokio::test]
+async fn test_right_mark_join_1k_binary_filtered() {
+    for (left_extra, right_extra) in [(true, true), (false, true), (true, false)] {
+        JoinFuzzTestCase::new(
+            make_staggered_batches_binary(1000, left_extra),
+            make_staggered_batches_binary(1000, right_extra),
+            JoinType::RightMark,
+            Some(Box::new(col_lt_col_filter)),
+        )
+        .run_test(&[HjSmj, NljHj], false)
+        .await
+    }
 }
 
 type JoinFilterBuilder = Box<dyn Fn(Arc<Schema>, Arc<Schema>) -> JoinFilter>;
@@ -452,12 +798,18 @@ impl JoinFuzzTestCase {
     fn left_right(&self) -> (Arc<DataSourceExec>, Arc<DataSourceExec>) {
         let schema1 = self.input1[0].schema();
         let schema2 = self.input2[0].schema();
-        let left =
-            MemorySourceConfig::try_new_exec(&[self.input1.clone()], schema1, None)
-                .unwrap();
-        let right =
-            MemorySourceConfig::try_new_exec(&[self.input2.clone()], schema2, None)
-                .unwrap();
+        let left = MemorySourceConfig::try_new_exec(
+            std::slice::from_ref(&self.input1),
+            schema1,
+            None,
+        )
+        .unwrap();
+        let right = MemorySourceConfig::try_new_exec(
+            std::slice::from_ref(&self.input2),
+            schema2,
+            None,
+        )
+        .unwrap();
         (left, right)
     }
 
@@ -479,7 +831,7 @@ impl JoinFuzzTestCase {
                 self.join_filter(),
                 self.join_type,
                 vec![SortOptions::default(); self.on_columns().len()],
-                false,
+                NullEquality::NullEqualsNothing,
             )
             .unwrap(),
         )
@@ -496,6 +848,7 @@ impl JoinFuzzTestCase {
                 &self.join_type,
                 None,
                 PartitionMode::Partitioned,
+                NullEquality::NullEqualsNothing,
                 false,
             )
             .unwrap(),
@@ -569,7 +922,9 @@ impl JoinFuzzTestCase {
                 std::fs::remove_dir_all(fuzz_debug).unwrap_or(());
                 std::fs::create_dir_all(fuzz_debug).unwrap();
                 let out_dir_name = &format!("{fuzz_debug}/batch_size_{batch_size}");
-                println!("Test result data mismatch found. HJ rows {hj_rows}, SMJ rows {smj_rows}, NLJ rows {nlj_rows}");
+                println!(
+                    "Test result data mismatch found. HJ rows {hj_rows}, SMJ rows {smj_rows}, NLJ rows {nlj_rows}"
+                );
                 println!("The debug is ON. Input data will be saved to {out_dir_name}");
 
                 Self::save_partitioned_batches_as_parquet(
@@ -588,7 +943,6 @@ impl JoinFuzzTestCase {
                     hj_formatted_sorted.iter().for_each(|s| println!("{s}"));
                     println!("=============== NestedLoopJoinExec ==================");
                     nlj_formatted_sorted.iter().for_each(|s| println!("{s}"));
-
                     Self::save_partitioned_batches_as_parquet(
                         &nlj_collected,
                         out_dir_name,
@@ -621,10 +975,18 @@ impl JoinFuzzTestCase {
             }
 
             if join_tests.contains(&NljHj) {
-                let err_msg_rowcnt = format!("NestedLoopJoinExec and HashJoinExec produced different row counts, batch_size: {batch_size}");
+                let err_msg_rowcnt = format!(
+                    "NestedLoopJoinExec and HashJoinExec produced different row counts, batch_size: {batch_size}"
+                );
                 assert_eq!(nlj_rows, hj_rows, "{}", err_msg_rowcnt.as_str());
+                if nlj_rows == 0 && hj_rows == 0 {
+                    // both joins returned no rows, skip content comparison
+                    continue;
+                }
 
-                let err_msg_contents = format!("NestedLoopJoinExec and HashJoinExec produced different results, batch_size: {batch_size}");
+                let err_msg_contents = format!(
+                    "NestedLoopJoinExec and HashJoinExec produced different results, batch_size: {batch_size}"
+                );
                 // row level compare if any of joins returns the result
                 // the reason is different formatting when there is no rows
                 for (i, (nlj_line, hj_line)) in nlj_formatted_sorted
@@ -642,10 +1004,16 @@ impl JoinFuzzTestCase {
             }
 
             if join_tests.contains(&HjSmj) {
-                let err_msg_row_cnt = format!("HashJoinExec and SortMergeJoinExec produced different row counts, batch_size: {}", &batch_size);
+                let err_msg_row_cnt = format!(
+                    "HashJoinExec and SortMergeJoinExec produced different row counts, batch_size: {}",
+                    &batch_size
+                );
                 assert_eq!(hj_rows, smj_rows, "{}", err_msg_row_cnt.as_str());
 
-                let err_msg_contents = format!("SortMergeJoinExec and HashJoinExec produced different results, batch_size: {}", &batch_size);
+                let err_msg_contents = format!(
+                    "SortMergeJoinExec and HashJoinExec produced different results, batch_size: {}",
+                    &batch_size
+                );
                 // row level compare if any of joins returns the result
                 // the reason is different formatting when there is no rows
                 if smj_rows > 0 || hj_rows > 0 {
@@ -719,7 +1087,7 @@ impl JoinFuzzTestCase {
     /// Files can be of different sizes
     /// The method can be useful to read partitions have been saved by `save_partitioned_batches_as_parquet`
     /// for test debugging purposes
-    #[allow(dead_code)]
+    #[expect(dead_code)]
     async fn load_partitioned_batches_from_parquet(
         dir: &str,
     ) -> std::io::Result<Vec<RecordBatch>> {
@@ -760,7 +1128,7 @@ impl JoinFuzzTestCase {
 /// Return randomly sized record batches with:
 /// two sorted int32 columns 'a', 'b' ranged from 0..99 as join columns
 /// two random int32 columns 'x', 'y' as other columns
-fn make_staggered_batches(len: usize) -> Vec<RecordBatch> {
+fn make_staggered_batches_i32(len: usize, with_extra_column: bool) -> Vec<RecordBatch> {
     let mut rng = rand::rng();
     let mut input12: Vec<(i32, i32)> = vec![(0, 0); len];
     let mut input3: Vec<i32> = vec![0; len];
@@ -776,15 +1144,66 @@ fn make_staggered_batches(len: usize) -> Vec<RecordBatch> {
     let input3 = Int32Array::from_iter_values(input3);
     let input4 = Int32Array::from_iter_values(input4);
 
-    // split into several record batches
-    let batch = RecordBatch::try_from_iter(vec![
+    let mut columns = vec![
         ("a", Arc::new(input1) as ArrayRef),
         ("b", Arc::new(input2) as ArrayRef),
         ("x", Arc::new(input3) as ArrayRef),
-        ("y", Arc::new(input4) as ArrayRef),
-    ])
-    .unwrap();
+    ];
+
+    if with_extra_column {
+        columns.push(("y", Arc::new(input4) as ArrayRef));
+    }
+
+    // split into several record batches
+    let batch = RecordBatch::try_from_iter(columns).unwrap();
 
     // use a random number generator to pick a random sized output
     stagger_batch_with_seed(batch, 42)
 }
+
+fn rand_bytes<R: Rng>(rng: &mut R, min: usize, max: usize) -> Vec<u8> {
+    let n = rng.random_range(min..=max);
+    let mut v = vec![0u8; n];
+    rng.fill(&mut v[..]);
+    v
+}
+
+/// Return randomly sized record batches with:
+/// two sorted binary columns 'a', 'b' (lexicographically) as join columns
+/// two random binary columns 'x', 'y' as other columns
+fn make_staggered_batches_binary(
+    len: usize,
+    with_extra_column: bool,
+) -> Vec<RecordBatch> {
+    let mut rng = rand::rng();
+
+    // produce (a,b) pairs then sort lexicographically so SMJ has naturally sorted keys
+    let mut input12: Vec<(Vec<u8>, Vec<u8>)> = (0..len)
+        .map(|_| (rand_bytes(&mut rng, 4, 16), rand_bytes(&mut rng, 4, 16)))
+        .collect();
+    input12.sort_unstable(); // lexicographic on Vec<u8>
+
+    // payload cols (also binary so the existing x < x filter is well-typed)
+    let input3: Vec<Vec<u8>> = (0..len).map(|_| rand_bytes(&mut rng, 4, 24)).collect();
+    let input4: Vec<Vec<u8>> = (0..len).map(|_| rand_bytes(&mut rng, 4, 24)).collect();
+
+    let a = BinaryArray::from_iter_values(input12.iter().map(|k| &k.0));
+    let b = BinaryArray::from_iter_values(input12.iter().map(|k| &k.1));
+    let x = BinaryArray::from_iter_values(input3.iter());
+    let y = BinaryArray::from_iter_values(input4.iter());
+
+    let mut columns = vec![
+        ("a", Arc::new(a) as ArrayRef),
+        ("b", Arc::new(b) as ArrayRef),
+        ("x", Arc::new(x) as ArrayRef),
+    ];
+
+    if with_extra_column {
+        columns.push(("y", Arc::new(y) as ArrayRef));
+    }
+
+    let batch = RecordBatch::try_from_iter(columns).unwrap();
+
+    // preserve your existing randomized partitioning
+    stagger_batch_with_seed(batch, 42)
+}
diff --git a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs
index 4c5ebf0402414..1c5741e7a21b3 100644
--- a/datafusion/core/tests/fuzz_cases/limit_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/limit_fuzz.rs
@@ -24,7 +24,7 @@ use arrow::util::pretty::pretty_format_batches;
 use datafusion::datasource::MemTable;
 use datafusion::prelude::SessionContext;
 use datafusion_common::assert_contains;
-use rand::{rng, Rng};
+use rand::{Rng, rng};
 use std::sync::Arc;
 use test_utils::stagger_batch;
 
diff --git a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs
index 92f3755250663..59430a98cc4b4 100644
--- a/datafusion/core/tests/fuzz_cases/merge_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/merge_fuzz.rs
@@ -27,11 +27,10 @@ use arrow::{
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::physical_plan::{
     collect,
-    expressions::{col, PhysicalSortExpr},
+    expressions::{PhysicalSortExpr, col},
     sorts::sort_preserving_merge::SortPreservingMergeExec,
 };
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 use test_utils::{batches_to_vec, partitions_to_sorted_vec, stagger_batch_with_seed};
 
@@ -109,13 +108,14 @@ async fn run_merge_test(input: Vec<Vec<RecordBatch>>) {
             .expect("at least one batch");
         let schema = first_batch.schema();
 
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort = [PhysicalSortExpr {
             expr: col("x", &schema).unwrap(),
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
 
         let exec = MemorySourceConfig::try_new_exec(&input, schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs
index 8ccc2a5bc1310..edb53df382c62 100644
--- a/datafusion/core/tests/fuzz_cases/mod.rs
+++ b/datafusion/core/tests/fuzz_cases/mod.rs
@@ -15,21 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#[expect(clippy::needless_pass_by_value)]
 mod aggregate_fuzz;
 mod distinct_count_string_fuzz;
+#[expect(clippy::needless_pass_by_value)]
 mod join_fuzz;
 mod merge_fuzz;
+#[expect(clippy::needless_pass_by_value)]
 mod sort_fuzz;
+#[expect(clippy::needless_pass_by_value)]
 mod sort_query_fuzz;
+mod topk_filter_pushdown;
 
 mod aggregation_fuzzer;
+#[expect(clippy::needless_pass_by_value)]
 mod equivalence;
 
 mod pruning;
 
 mod limit_fuzz;
+#[expect(clippy::needless_pass_by_value)]
 mod sort_preserving_repartition_fuzz;
 mod window_fuzz;
 
 // Utility modules
+mod once_exec;
 mod record_batch_generator;
+mod spilling_fuzz_in_memory_constrained_env;
diff --git a/datafusion/core/tests/fuzz_cases/once_exec.rs b/datafusion/core/tests/fuzz_cases/once_exec.rs
new file mode 100644
index 0000000000000..eed172f09f994
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/once_exec.rs
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_schema::SchemaRef;
+use datafusion_common::internal_datafusion_err;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion_physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+};
+use std::any::Any;
+use std::fmt::{Debug, Formatter};
+use std::sync::{Arc, Mutex};
+
+/// Execution plan that return the stream on the call to `execute`. further calls to `execute` will
+/// return an error
+pub struct OnceExec {
+    /// the results to send back
+    stream: Mutex<Option<SendableRecordBatchStream>>,
+    cache: Arc<PlanProperties>,
+}
+
+impl Debug for OnceExec {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "OnceExec")
+    }
+}
+
+impl OnceExec {
+    pub fn new(stream: SendableRecordBatchStream) -> Self {
+        let cache = Self::compute_properties(stream.schema());
+        Self {
+            stream: Mutex::new(Some(stream)),
+            cache: Arc::new(cache),
+        }
+    }
+
+    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
+    fn compute_properties(schema: SchemaRef) -> PlanProperties {
+        PlanProperties::new(
+            EquivalenceProperties::new(schema),
+            Partitioning::UnknownPartitioning(1),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        )
+    }
+}
+
+impl DisplayAs for OnceExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "OnceExec:")
+            }
+            DisplayFormatType::TreeRender => {
+                write!(f, "")
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for OnceExec {
+    fn name(&self) -> &'static str {
+        Self::static_name()
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        unimplemented!()
+    }
+
+    /// Returns a stream which yields data
+    fn execute(
+        &self,
+        partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> datafusion_common::Result<SendableRecordBatchStream> {
+        assert_eq!(partition, 0);
+
+        let stream = self.stream.lock().unwrap().take();
+
+        stream.ok_or_else(|| internal_datafusion_err!("Stream already consumed"))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> datafusion_common::Result<TreeNodeRecursion>,
+    ) -> datafusion_common::Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+}
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
index 6e624d458bd93..8ce5207f91190 100644
--- a/datafusion/core/tests/fuzz_cases/pruning.rs
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -29,9 +29,11 @@ use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::source::DataSourceExec;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_physical_expr::PhysicalExpr;
-use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, collect, filter::FilterExec};
 use itertools::Itertools;
-use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use object_store::{
+    ObjectStore, ObjectStoreExt, PutPayload, memory::InMemory, path::Path,
+};
 use parquet::{
     arrow::ArrowWriter,
     file::properties::{EnabledStatistics, WriterProperties},
@@ -201,7 +203,7 @@ impl Utf8Test {
         }
     }
 
-    ///  all combinations of interesting charactes  with lengths ranging from 1 to 4
+    ///  all combinations of interesting characters  with lengths ranging from 1 to 4
     fn values() -> &'static [String] {
         &VALUES
     }
@@ -276,13 +278,12 @@ async fn execute_with_predicate(
     ctx: &SessionContext,
 ) -> Vec<String> {
     let parquet_source = if prune_stats {
-        ParquetSource::default().with_predicate(predicate.clone())
+        ParquetSource::new(schema.clone()).with_predicate(predicate.clone())
     } else {
-        ParquetSource::default()
+        ParquetSource::new(schema.clone())
     };
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("memory://").unwrap(),
-        schema.clone(),
         Arc::new(parquet_source),
     )
     .with_file_group(
@@ -319,14 +320,9 @@ async fn write_parquet_file(
     row_groups: Vec<Vec<String>>,
 ) -> Bytes {
     let mut buf = BytesMut::new().writer();
-    let mut props = WriterProperties::builder();
-    if let Some(truncation_length) = truncation_length {
-        props = {
-            #[allow(deprecated)]
-            props.set_max_statistics_size(truncation_length)
-        }
-    }
-    props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
+    let props = WriterProperties::builder()
+        .set_statistics_enabled(EnabledStatistics::Chunk) // row group level
+        .set_statistics_truncate_length(truncation_length);
     let props = props.build();
     {
         let mut writer =
diff --git a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs
index 7b48eadf77e09..22b145f5095a7 100644
--- a/datafusion/core/tests/fuzz_cases/record_batch_generator.rs
+++ b/datafusion/core/tests/fuzz_cases/record_batch_generator.rs
@@ -17,23 +17,25 @@
 
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, RecordBatch};
+use arrow::array::{ArrayRef, DictionaryArray, PrimitiveArray, RecordBatch};
 use arrow::datatypes::{
-    BooleanType, DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
-    DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
-    DurationSecondType, Field, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
-    Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
-    IntervalYearMonthType, Schema, Time32MillisecondType, Time32SecondType,
-    Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType,
-    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type,
-    UInt32Type, UInt64Type, UInt8Type,
+    ArrowPrimitiveType, BooleanType, DataType, Date32Type, Date64Type, Decimal32Type,
+    Decimal64Type, Decimal128Type, Decimal256Type, DurationMicrosecondType,
+    DurationMillisecondType, DurationNanosecondType, DurationSecondType, Field,
+    Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type,
+    IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType,
+    Schema, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
+    TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type,
+    UInt64Type,
 };
 use arrow_schema::{
-    DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION,
-    DECIMAL256_MAX_SCALE,
+    DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION,
+    DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
+    DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
 };
-use datafusion_common::{arrow_datafusion_err, DataFusionError, Result};
-use rand::{rng, rngs::StdRng, Rng, SeedableRng};
+use datafusion_common::{Result, arrow_datafusion_err};
+use rand::{Rng, SeedableRng, rng, rngs::StdRng};
 use test_utils::array_gen::{
     BinaryArrayGenerator, BooleanArrayGenerator, DecimalArrayGenerator,
     PrimitiveArrayGenerator, StringArrayGenerator,
@@ -103,6 +105,20 @@ pub fn get_supported_types_columns(rng_seed: u64) -> Vec<ColumnDescr> {
             "duration_nanosecond",
             DataType::Duration(TimeUnit::Nanosecond),
         ),
+        ColumnDescr::new("decimal32", {
+            let precision: u8 = rng.random_range(1..=DECIMAL32_MAX_PRECISION);
+            let scale: i8 = rng.random_range(
+                i8::MIN..=std::cmp::min(precision as i8, DECIMAL32_MAX_SCALE),
+            );
+            DataType::Decimal32(precision, scale)
+        }),
+        ColumnDescr::new("decimal64", {
+            let precision: u8 = rng.random_range(1..=DECIMAL64_MAX_PRECISION);
+            let scale: i8 = rng.random_range(
+                i8::MIN..=std::cmp::min(precision as i8, DECIMAL64_MAX_SCALE),
+            );
+            DataType::Decimal64(precision, scale)
+        }),
         ColumnDescr::new("decimal128", {
             let precision: u8 = rng.random_range(1..=DECIMAL128_MAX_PRECISION);
             let scale: i8 = rng.random_range(
@@ -126,6 +142,11 @@ pub fn get_supported_types_columns(rng_seed: u64) -> Vec<ColumnDescr> {
         ColumnDescr::new("binary", DataType::Binary),
         ColumnDescr::new("large_binary", DataType::LargeBinary),
         ColumnDescr::new("binaryview", DataType::BinaryView),
+        ColumnDescr::new(
+            "dictionary_utf8_low",
+            DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
+        )
+        .with_max_num_distinct(10),
     ]
 }
 
@@ -185,17 +206,13 @@ pub struct RecordBatchGenerator {
 }
 
 macro_rules! generate_decimal_array {
-    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT: expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $PRECISION: ident, $SCALE: ident, $ARROW_TYPE: ident) => {{
-        let null_pct_idx =
-            $BATCH_GEN_RNG.random_range(0..$SELF.candidate_null_pcts.len());
-        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
-
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT: expr, $NULL_PCT:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $PRECISION: ident, $SCALE: ident, $ARROW_TYPE: ident) => {{
         let mut generator = DecimalArrayGenerator {
             precision: $PRECISION,
             scale: $SCALE,
             num_decimals: $NUM_ROWS,
             num_distinct_decimals: $MAX_NUM_DISTINCT,
-            null_pct,
+            null_pct: $NULL_PCT,
             rng: $ARRAY_GEN_RNG,
         };
 
@@ -205,18 +222,13 @@ macro_rules! generate_decimal_array {
 
 // Generating `BooleanArray` due to it being a special type in Arrow (bit-packed)
 macro_rules! generate_boolean_array {
-    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
-        // Select a null percentage from the candidate percentages
-        let null_pct_idx =
-            $BATCH_GEN_RNG.random_range(0..$SELF.candidate_null_pcts.len());
-        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
-
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $NULL_PCT:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
         let num_distinct_booleans = if $MAX_NUM_DISTINCT >= 2 { 2 } else { 1 };
 
         let mut generator = BooleanArrayGenerator {
             num_booleans: $NUM_ROWS,
             num_distinct_booleans,
-            null_pct,
+            null_pct: $NULL_PCT,
             rng: $ARRAY_GEN_RNG,
         };
 
@@ -225,15 +237,11 @@ macro_rules! generate_boolean_array {
 }
 
 macro_rules! generate_primitive_array {
-    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{
-        let null_pct_idx =
-            $BATCH_GEN_RNG.random_range(0..$SELF.candidate_null_pcts.len());
-        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
-
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $NULL_PCT:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{
         let mut generator = PrimitiveArrayGenerator {
             num_primitives: $NUM_ROWS,
             num_distinct_primitives: $MAX_NUM_DISTINCT,
-            null_pct,
+            null_pct: $NULL_PCT,
             rng: $ARRAY_GEN_RNG,
         };
 
@@ -241,6 +249,28 @@ macro_rules! generate_primitive_array {
     }};
 }
 
+macro_rules! generate_dict {
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $NULL_PCT:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident, $VALUES: ident) => {{
+        debug_assert_eq!($VALUES.len(), $MAX_NUM_DISTINCT);
+        let keys: PrimitiveArray<$ARROW_TYPE> = (0..$NUM_ROWS)
+            .map(|_| {
+                if $BATCH_GEN_RNG.random::<f64>() < $NULL_PCT {
+                    None
+                } else if $MAX_NUM_DISTINCT > 1 {
+                    let range = 0..($MAX_NUM_DISTINCT
+                        as <$ARROW_TYPE as ArrowPrimitiveType>::Native);
+                    Some($ARRAY_GEN_RNG.random_range(range))
+                } else {
+                    Some(0)
+                }
+            })
+            .collect();
+
+        let dict = DictionaryArray::new(keys, $VALUES);
+        Arc::new(dict) as ArrayRef
+    }};
+}
+
 impl RecordBatchGenerator {
     /// Create a new `RecordBatchGenerator` with a random seed. The generated
     /// batches will be different each time.
@@ -302,6 +332,25 @@ impl RecordBatchGenerator {
         num_rows: usize,
         batch_gen_rng: &mut StdRng,
         array_gen_rng: StdRng,
+    ) -> ArrayRef {
+        let null_pct_idx = batch_gen_rng.random_range(0..self.candidate_null_pcts.len());
+        let null_pct = self.candidate_null_pcts[null_pct_idx];
+
+        Self::generate_array_of_type_inner(
+            col,
+            num_rows,
+            batch_gen_rng,
+            array_gen_rng,
+            null_pct,
+        )
+    }
+
+    fn generate_array_of_type_inner(
+        col: &ColumnDescr,
+        num_rows: usize,
+        batch_gen_rng: &mut StdRng,
+        array_gen_rng: StdRng,
+        null_pct: f64,
     ) -> ArrayRef {
         let num_distinct = if num_rows > 1 {
             batch_gen_rng.random_range(1..num_rows)
@@ -320,6 +369,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Int8Type
@@ -330,6 +380,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Int16Type
@@ -340,6 +391,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Int32Type
@@ -350,6 +402,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Int64Type
@@ -360,6 +413,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     UInt8Type
@@ -370,6 +424,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     UInt16Type
@@ -380,6 +435,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     UInt32Type
@@ -390,6 +446,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     UInt64Type
@@ -400,6 +457,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Float32Type
@@ -410,6 +468,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Float64Type
@@ -420,6 +479,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Date32Type
@@ -430,6 +490,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Date64Type
@@ -440,6 +501,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Time32SecondType
@@ -450,6 +512,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Time32MillisecondType
@@ -460,6 +523,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Time64MicrosecondType
@@ -470,6 +534,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     Time64NanosecondType
@@ -480,6 +545,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     IntervalYearMonthType
@@ -490,6 +556,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     IntervalDayTimeType
@@ -500,6 +567,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     IntervalMonthDayNanoType
@@ -510,6 +578,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     DurationSecondType
@@ -520,6 +589,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     DurationMillisecondType
@@ -530,6 +600,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     DurationMicrosecondType
@@ -540,6 +611,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     DurationNanosecondType
@@ -550,6 +622,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     TimestampSecondType
@@ -560,6 +633,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     TimestampMillisecondType
@@ -570,6 +644,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     TimestampMicrosecondType
@@ -580,15 +655,13 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     TimestampNanosecondType
                 )
             }
             DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
-                let null_pct_idx =
-                    batch_gen_rng.random_range(0..self.candidate_null_pcts.len());
-                let null_pct = self.candidate_null_pcts[null_pct_idx];
                 let max_len = batch_gen_rng.random_range(1..50);
 
                 let mut generator = StringArrayGenerator {
@@ -607,9 +680,6 @@ impl RecordBatchGenerator {
                 }
             }
             DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
-                let null_pct_idx =
-                    batch_gen_rng.random_range(0..self.candidate_null_pcts.len());
-                let null_pct = self.candidate_null_pcts[null_pct_idx];
                 let max_len = batch_gen_rng.random_range(1..100);
 
                 let mut generator = BinaryArrayGenerator {
@@ -627,11 +697,38 @@ impl RecordBatchGenerator {
                     _ => unreachable!(),
                 }
             }
+            DataType::Decimal32(precision, scale) => {
+                generate_decimal_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    null_pct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    precision,
+                    scale,
+                    Decimal32Type
+                )
+            }
+            DataType::Decimal64(precision, scale) => {
+                generate_decimal_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    null_pct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    precision,
+                    scale,
+                    Decimal64Type
+                )
+            }
             DataType::Decimal128(precision, scale) => {
                 generate_decimal_array!(
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     precision,
@@ -644,6 +741,7 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     precision,
@@ -656,11 +754,41 @@ impl RecordBatchGenerator {
                     self,
                     num_rows,
                     max_num_distinct,
+                    null_pct,
                     batch_gen_rng,
                     array_gen_rng,
                     BooleanType
                 }
             }
+            DataType::Dictionary(ref key_type, ref value_type)
+                if key_type.is_dictionary_key_type() =>
+            {
+                // We generate just num_distinct values because they will be reused by different keys
+                let mut array_gen_rng = array_gen_rng;
+                debug_assert!((0.0..=1.0).contains(&null_pct));
+                let values = Self::generate_array_of_type_inner(
+                    &ColumnDescr::new("values", *value_type.clone()),
+                    num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng.clone(),
+                    null_pct, // generate some null values
+                );
+
+                match key_type.as_ref() {
+                    // new key types can be added here
+                    DataType::UInt64 => generate_dict!(
+                        self,
+                        num_rows,
+                        num_distinct,
+                        null_pct,
+                        batch_gen_rng,
+                        array_gen_rng,
+                        UInt64Type,
+                        values
+                    ),
+                    _ => panic!("Invalid dictionary keys type: {key_type}"),
+                }
+            }
             _ => {
                 panic!("Unsupported data generator type: {}", col.column_type)
             }
diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs
index 703b8715821a8..0d8a066d432dd 100644
--- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs
@@ -20,7 +20,7 @@
 use std::sync::Arc;
 
 use arrow::{
-    array::{as_string_array, ArrayRef, Int32Array, StringArray},
+    array::{ArrayRef, Int32Array, StringArray, as_string_array},
     compute::SortOptions,
     record_batch::RecordBatch,
 };
@@ -28,7 +28,7 @@ use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
 use datafusion::physical_plan::expressions::PhysicalSortExpr;
 use datafusion::physical_plan::sorts::sort::SortExec;
-use datafusion::physical_plan::{collect, ExecutionPlan};
+use datafusion::physical_plan::{ExecutionPlan, collect};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::cast::as_int32_array;
 use datafusion_execution::memory_pool::GreedyMemoryPool;
@@ -188,7 +188,7 @@ impl SortTest {
     }
 
     fn with_sort_columns(mut self, sort_columns: Vec<&str>) -> Self {
-        self.sort_columns = sort_columns.iter().map(|s| s.to_string()).collect();
+        self.sort_columns = sort_columns.iter().map(|s| (*s).to_string()).collect();
         self
     }
 
@@ -232,18 +232,15 @@ impl SortTest {
             .expect("at least one batch");
         let schema = first_batch.schema();
 
-        let sort_ordering = LexOrdering::new(
-            self.sort_columns
-                .iter()
-                .map(|c| PhysicalSortExpr {
-                    expr: col(c, &schema).unwrap(),
-                    options: SortOptions {
-                        descending: false,
-                        nulls_first: true,
-                    },
-                })
-                .collect(),
-        );
+        let sort_ordering =
+            LexOrdering::new(self.sort_columns.iter().map(|c| PhysicalSortExpr {
+                expr: col(c, &schema).unwrap(),
+                options: SortOptions {
+                    descending: false,
+                    nulls_first: true,
+                },
+            }))
+            .unwrap();
 
         let exec = MemorySourceConfig::try_new_exec(&input, schema, None).unwrap();
         let sort = Arc::new(SortExec::new(sort_ordering, exec));
diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs
index cf6867758edc7..8f3b8ea05324c 100644
--- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs
@@ -20,35 +20,33 @@ mod sp_repartition_fuzz_tests {
     use std::sync::Arc;
 
     use arrow::array::{ArrayRef, Int64Array, RecordBatch, UInt64Array};
-    use arrow::compute::{concat_batches, lexsort, SortColumn, SortOptions};
+    use arrow::compute::{SortColumn, SortOptions, concat_batches, lexsort};
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 
+    use datafusion::datasource::memory::MemorySourceConfig;
+    use datafusion::datasource::source::DataSourceExec;
     use datafusion::physical_plan::{
-        collect,
+        ExecutionPlan, Partitioning, collect,
         metrics::{BaselineMetrics, ExecutionPlanMetricsSet},
         repartition::RepartitionExec,
         sorts::sort_preserving_merge::SortPreservingMergeExec,
         sorts::streaming_merge::StreamingMergeBuilder,
         stream::RecordBatchStreamAdapter,
-        ExecutionPlan, Partitioning,
     };
     use datafusion::prelude::SessionContext;
     use datafusion_common::Result;
-    use datafusion_execution::{
-        config::SessionConfig, memory_pool::MemoryConsumer, SendableRecordBatchStream,
-    };
-    use datafusion_physical_expr::{
-        equivalence::{EquivalenceClass, EquivalenceProperties},
-        expressions::{col, Column},
-        ConstExpr, PhysicalExpr, PhysicalSortExpr,
+    use datafusion_execution::{config::SessionConfig, memory_pool::MemoryConsumer};
+    use datafusion_physical_expr::ConstExpr;
+    use datafusion_physical_expr::equivalence::{
+        EquivalenceClass, EquivalenceProperties,
     };
+    use datafusion_physical_expr::expressions::{Column, col};
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
     use test_utils::add_empty_batches;
 
-    use datafusion::datasource::memory::MemorySourceConfig;
-    use datafusion::datasource::source::DataSourceExec;
-    use datafusion_physical_expr_common::sort_expr::LexOrdering;
     use itertools::izip;
-    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
+    use rand::{Rng, SeedableRng, rngs::StdRng, seq::SliceRandom};
 
     // Generate a schema which consists of 6 columns (a, b, c, d, e, f)
     fn create_test_schema() -> Result<SchemaRef> {
@@ -80,9 +78,9 @@ mod sp_repartition_fuzz_tests {
 
         let mut eq_properties = EquivalenceProperties::new(test_schema.clone());
         // Define a and f are aliases
-        eq_properties.add_equal_conditions(col_a, col_f)?;
+        eq_properties.add_equal_conditions(Arc::clone(col_a), Arc::clone(col_f))?;
         // Column e has constant value.
-        eq_properties = eq_properties.with_constants([ConstExpr::from(col_e)]);
+        eq_properties.add_constants([ConstExpr::from(Arc::clone(col_e))])?;
 
         // Randomly order columns for sorting
         let mut rng = StdRng::seed_from_u64(seed);
@@ -94,18 +92,18 @@ mod sp_repartition_fuzz_tests {
         };
 
         while !remaining_exprs.is_empty() {
-            let n_sort_expr = rng.random_range(0..remaining_exprs.len() + 1);
+            let n_sort_expr = rng.random_range(1..remaining_exprs.len() + 1);
             remaining_exprs.shuffle(&mut rng);
 
-            let ordering = remaining_exprs
-                .drain(0..n_sort_expr)
-                .map(|expr| PhysicalSortExpr {
-                    expr: expr.clone(),
-                    options: options_asc,
-                })
-                .collect();
+            let ordering =
+                remaining_exprs
+                    .drain(0..n_sort_expr)
+                    .map(|expr| PhysicalSortExpr {
+                        expr: expr.clone(),
+                        options: options_asc,
+                    });
 
-            eq_properties.add_new_orderings([ordering]);
+            eq_properties.add_ordering(ordering);
         }
 
         Ok((test_schema, eq_properties))
@@ -151,7 +149,7 @@ mod sp_repartition_fuzz_tests {
 
         // Fill constant columns
         for constant in eq_properties.constants() {
-            let col = constant.expr().as_any().downcast_ref::<Column>().unwrap();
+            let col = constant.expr.as_any().downcast_ref::<Column>().unwrap();
             let (idx, _field) = schema.column_with_name(col.name()).unwrap();
             let arr =
                 Arc::new(UInt64Array::from_iter_values(vec![0; n_elem])) as ArrayRef;
@@ -227,21 +225,21 @@ mod sp_repartition_fuzz_tests {
             let table_data_with_properties =
                 generate_table_for_eq_properties(&eq_properties, N_ELEM, N_DISTINCT)?;
             let schema = table_data_with_properties.schema();
-            let streams: Vec<SendableRecordBatchStream> = (0..N_PARTITION)
+            let streams = (0..N_PARTITION)
                 .map(|_idx| {
                     let batch = table_data_with_properties.clone();
                     Box::pin(RecordBatchStreamAdapter::new(
                         schema.clone(),
                         futures::stream::once(async { Ok(batch) }),
-                    )) as SendableRecordBatchStream
+                    )) as _
                 })
                 .collect::<Vec<_>>();
 
-            // Returns concatenated version of the all available orderings
-            let exprs = eq_properties
-                .oeq_class()
-                .output_ordering()
-                .unwrap_or_default();
+            // Returns concatenated version of the all available orderings:
+            let Some(exprs) = eq_properties.oeq_class().output_ordering() else {
+                // We always should have an ordering due to the way we generate the schema:
+                unreachable!("No ordering found in eq_properties: {:?}", eq_properties);
+            };
 
             let context = SessionContext::new().task_ctx();
             let mem_reservation =
@@ -303,7 +301,7 @@ mod sp_repartition_fuzz_tests {
                 let mut handles = Vec::new();
 
                 for seed in seed_start..seed_end {
-                    #[allow(clippy::disallowed_methods)] // spawn allowed only in tests
+                    #[expect(clippy::disallowed_methods)] // spawn allowed only in tests
                     let job = tokio::spawn(run_sort_preserving_repartition_test(
                         make_staggered_batches::<true>(n_row, n_distinct, seed as u64),
                         is_first_roundrobin,
@@ -347,20 +345,16 @@ mod sp_repartition_fuzz_tests {
         let schema = input1[0].schema();
         let session_config = SessionConfig::new().with_batch_size(50);
         let ctx = SessionContext::new_with_config(session_config);
-        let mut sort_keys = LexOrdering::default();
-        for ordering_col in ["a", "b", "c"] {
-            sort_keys.push(PhysicalSortExpr {
-                expr: col(ordering_col, &schema).unwrap(),
-                options: SortOptions::default(),
-            })
-        }
+        let sort_keys = ["a", "b", "c"].map(|ordering_col| {
+            PhysicalSortExpr::new_default(col(ordering_col, &schema).unwrap())
+        });
 
         let concat_input_record = concat_batches(&schema, &input1).unwrap();
 
         let running_source = Arc::new(
-            MemorySourceConfig::try_new(&[input1.clone()], schema.clone(), None)
+            MemorySourceConfig::try_new(&[input1], schema.clone(), None)
                 .unwrap()
-                .try_with_sort_information(vec![sort_keys.clone()])
+                .try_with_sort_information(vec![sort_keys.clone().into()])
                 .unwrap(),
         );
         let running_source = Arc::new(DataSourceExec::new(running_source));
@@ -381,7 +375,7 @@ mod sp_repartition_fuzz_tests {
             sort_preserving_repartition_exec_hash(intermediate, hash_exprs.clone())
         };
 
-        let final_plan = sort_preserving_merge_exec(sort_keys.clone(), intermediate);
+        let final_plan = sort_preserving_merge_exec(sort_keys.into(), intermediate);
         let task_ctx = ctx.task_ctx();
 
         let collected_running = collect(final_plan, task_ctx.clone()).await.unwrap();
@@ -428,10 +422,9 @@ mod sp_repartition_fuzz_tests {
     }
 
     fn sort_preserving_merge_exec(
-        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+        sort_exprs: LexOrdering,
         input: Arc<dyn ExecutionPlan>,
     ) -> Arc<dyn ExecutionPlan> {
-        let sort_exprs = sort_exprs.into_iter().collect();
         Arc::new(SortPreservingMergeExec::new(sort_exprs, input))
     }
 
diff --git a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs
index d2d3a5e0c22fa..376306f3e0659 100644
--- a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs
@@ -24,24 +24,22 @@ use arrow::array::RecordBatch;
 use arrow_schema::SchemaRef;
 use datafusion::datasource::MemTable;
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_common::{instant::Instant, Result};
+use datafusion_common::{Result, human_readable_size, instant::Instant};
 use datafusion_execution::disk_manager::DiskManagerBuilder;
-use datafusion_execution::memory_pool::{
-    human_readable_size, MemoryPool, UnboundedMemoryPool,
-};
+use datafusion_execution::memory_pool::{MemoryPool, UnboundedMemoryPool};
 use datafusion_expr::display_schema;
 use datafusion_physical_plan::spill::get_record_batch_memory_size;
 use std::time::Duration;
 
 use datafusion_execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder};
-use rand::prelude::IndexedRandom;
 use rand::Rng;
-use rand::{rngs::StdRng, SeedableRng};
+use rand::prelude::IndexedRandom;
+use rand::{SeedableRng, rngs::StdRng};
 
 use crate::fuzz_cases::aggregation_fuzzer::check_equality_of_batches;
 
 use super::aggregation_fuzzer::ColumnDescr;
-use super::record_batch_generator::{get_supported_types_columns, RecordBatchGenerator};
+use super::record_batch_generator::{RecordBatchGenerator, get_supported_types_columns};
 
 /// Entry point for executing the sort query fuzzer.
 ///
@@ -177,16 +175,16 @@ impl SortQueryFuzzer {
         n_round: usize,
         n_query: usize,
     ) -> bool {
-        if let Some(time_limit) = self.time_limit {
-            if Instant::now().duration_since(start_time) > time_limit {
-                println!(
-                    "[SortQueryFuzzer] Time limit reached: {} queries ({} random configs each) in {} rounds",
-                    n_round * self.queries_per_round + n_query,
-                    self.config_variations_per_query,
-                    n_round
-                );
-                return true;
-            }
+        if let Some(time_limit) = self.time_limit
+            && Instant::now().duration_since(start_time) > time_limit
+        {
+            println!(
+                "[SortQueryFuzzer] Time limit reached: {} queries ({} random configs each) in {} rounds",
+                n_round * self.queries_per_round + n_query,
+                self.config_variations_per_query,
+                n_round
+            );
+            return true;
         }
         false
     }
@@ -220,7 +218,7 @@ impl SortQueryFuzzer {
                         .test_gen
                         .fuzzer_run(init_seed, query_seed, config_seed)
                         .await?;
-                    println!("\n"); // Seperator between tested runs
+                    println!("\n"); // Separator between tested runs
 
                     if expected_results.is_none() {
                         expected_results = Some(results);
@@ -428,7 +426,7 @@ impl SortFuzzerTestGenerator {
             .collect();
 
         let mut order_by_clauses = Vec::new();
-        for col in selected_columns {
+        for col in &selected_columns {
             let mut clause = col.name.clone();
             if rng.random_bool(0.5) {
                 let order = if rng.random_bool(0.5) { "ASC" } else { "DESC" };
@@ -463,7 +461,12 @@ impl SortFuzzerTestGenerator {
         let limit_clause = limit.map_or(String::new(), |l| format!(" LIMIT {l}"));
 
         let query = format!(
-            "SELECT * FROM {} ORDER BY {}{}",
+            "SELECT {} FROM {} ORDER BY {}{}",
+            selected_columns
+                .iter()
+                .map(|col| col.name.clone())
+                .collect::<Vec<_>>()
+                .join(", "),
             self.table_name,
             order_by_clauses.join(", "),
             limit_clause
diff --git a/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs
new file mode 100644
index 0000000000000..d401557e966d6
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs
@@ -0,0 +1,658 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Fuzz Test for different operators in memory constrained environment
+
+use std::pin::Pin;
+use std::sync::Arc;
+
+use crate::fuzz_cases::aggregate_fuzz::assert_spill_count_metric;
+use crate::fuzz_cases::once_exec::OnceExec;
+use arrow::array::UInt64Array;
+use arrow::{array::StringArray, compute::SortOptions, record_batch::RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use datafusion::common::Result;
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::expressions::PhysicalSortExpr;
+use datafusion::physical_plan::sorts::sort::SortExec;
+use datafusion::prelude::SessionConfig;
+use datafusion_common::units::{KB, MB};
+use datafusion_execution::memory_pool::{
+    FairSpillPool, MemoryConsumer, MemoryReservation,
+};
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_functions_aggregate::array_agg::array_agg_udaf;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_expr::expressions::{Column, col};
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_plan::aggregates::{
+    AggregateExec, AggregateMode, PhysicalGroupBy,
+};
+use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
+use futures::StreamExt;
+
+#[tokio::test]
+async fn test_sort_with_limited_memory() -> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(record_batch_size)
+                    .with_sort_spill_reservation_bytes(1),
+            )
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    let record_batch_size = pool_size / 16;
+
+    // Basic test with a lot of groups that cannot all fit in memory and 1 record batch
+    // from each spill file is too much memory
+    let spill_count = run_sort_test_with_limited_memory(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |_| record_batch_size),
+        memory_behavior: Default::default(),
+    })
+    .await?;
+
+    let total_spill_files_size = spill_count * record_batch_size;
+    assert!(
+        total_spill_files_size > pool_size,
+        "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}",
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch() -> Result<()>
+{
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(record_batch_size)
+                    .with_sort_spill_reservation_bytes(1),
+            )
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_sort_test_with_limited_memory(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                16 * KB as usize
+            }
+        }),
+        memory_behavior: Default::default(),
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(record_batch_size)
+                    .with_sort_spill_reservation_bytes(1),
+            )
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_sort_test_with_limited_memory(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                16 * KB as usize
+            }
+        }),
+        memory_behavior: MemoryBehavior::TakeAllMemoryAndReleaseEveryNthBatch(10),
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_sort_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(record_batch_size)
+                    .with_sort_spill_reservation_bytes(1),
+            )
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_sort_test_with_limited_memory(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                16 * KB as usize
+            }
+        }),
+        memory_behavior: MemoryBehavior::TakeAllMemoryAtTheBeginning,
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_sort_with_limited_memory_and_large_record_batch() -> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(record_batch_size)
+                    .with_sort_spill_reservation_bytes(1),
+            )
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    // Test that the merge degree of multi level merge sort cannot be fixed size when there is not enough memory
+    run_sort_test_with_limited_memory(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |_| pool_size / 6),
+        memory_behavior: Default::default(),
+    })
+    .await?;
+
+    Ok(())
+}
+
+struct RunTestWithLimitedMemoryArgs {
+    pool_size: usize,
+    task_ctx: Arc<TaskContext>,
+    number_of_record_batches: usize,
+    get_size_of_record_batch_to_generate:
+        Pin<Box<dyn Fn(usize) -> usize + Send + 'static>>,
+    memory_behavior: MemoryBehavior,
+}
+
+#[derive(Default)]
+enum MemoryBehavior {
+    #[default]
+    AsIs,
+    TakeAllMemoryAtTheBeginning,
+    TakeAllMemoryAndReleaseEveryNthBatch(usize),
+}
+
+async fn run_sort_test_with_limited_memory(
+    mut args: RunTestWithLimitedMemoryArgs,
+) -> Result<usize> {
+    let get_size_of_record_batch_to_generate = std::mem::replace(
+        &mut args.get_size_of_record_batch_to_generate,
+        Box::pin(move |_| unreachable!("should not be called after take")),
+    );
+
+    let scan_schema = Arc::new(Schema::new(vec![
+        Field::new("col_0", DataType::UInt64, true),
+        Field::new("col_1", DataType::Utf8, true),
+    ]));
+
+    let record_batch_size = args.task_ctx.session_config().batch_size() as u64;
+
+    let schema = Arc::clone(&scan_schema);
+    let plan: Arc<dyn ExecutionPlan> =
+        Arc::new(OnceExec::new(Box::pin(RecordBatchStreamAdapter::new(
+            Arc::clone(&schema),
+            futures::stream::iter((0..args.number_of_record_batches as u64).map(
+                move |index| {
+                    let mut record_batch_memory_size =
+                        get_size_of_record_batch_to_generate(index as usize);
+                    record_batch_memory_size = record_batch_memory_size
+                        .saturating_sub(size_of::<u64>() * record_batch_size as usize);
+
+                    let string_item_size =
+                        record_batch_memory_size / record_batch_size as usize;
+                    let string_array =
+                        Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
+                            "a".repeat(string_item_size),
+                            record_batch_size as usize,
+                        )));
+
+                    RecordBatch::try_new(
+                        Arc::clone(&schema),
+                        vec![
+                            Arc::new(UInt64Array::from_iter_values(
+                                (index * record_batch_size)
+                                    ..(index * record_batch_size) + record_batch_size,
+                            )),
+                            string_array,
+                        ],
+                    )
+                    .map_err(|err| err.into())
+                },
+            )),
+        ))));
+    let sort_exec = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr {
+            expr: col("col_0", &scan_schema).unwrap(),
+            options: SortOptions {
+                descending: false,
+                nulls_first: true,
+            },
+        }])
+        .unwrap(),
+        plan,
+    ));
+
+    let result = sort_exec.execute(0, Arc::clone(&args.task_ctx))?;
+
+    run_test(args, sort_exec, result).await
+}
+
+fn grow_memory_as_much_as_possible(
+    memory_step: usize,
+    memory_reservation: &mut MemoryReservation,
+) -> Result<bool> {
+    let mut was_able_to_grow = false;
+    while memory_reservation.try_grow(memory_step).is_ok() {
+        was_able_to_grow = true;
+    }
+
+    Ok(was_able_to_grow)
+}
+
+#[tokio::test]
+async fn test_aggregate_with_high_cardinality_with_limited_memory() -> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(SessionConfig::new().with_batch_size(record_batch_size))
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    let record_batch_size = pool_size / 16;
+
+    // Basic test with a lot of groups that cannot all fit in memory and 1 record batch
+    // from each spill file is too much memory
+    let spill_count =
+        run_test_aggregate_with_high_cardinality(RunTestWithLimitedMemoryArgs {
+            pool_size,
+            task_ctx: Arc::new(task_ctx),
+            number_of_record_batches: 100,
+            get_size_of_record_batch_to_generate: Box::pin(move |_| record_batch_size),
+            memory_behavior: Default::default(),
+        })
+        .await?;
+
+    let total_spill_files_size = spill_count * record_batch_size;
+    assert!(
+        total_spill_files_size > pool_size,
+        "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}",
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(SessionConfig::new().with_batch_size(record_batch_size))
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_test_aggregate_with_high_cardinality(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                (16 * KB) as usize
+            }
+        }),
+        memory_behavior: Default::default(),
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_changing_memory_reservation()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(SessionConfig::new().with_batch_size(record_batch_size))
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_test_aggregate_with_high_cardinality(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                (16 * KB) as usize
+            }
+        }),
+        memory_behavior: MemoryBehavior::TakeAllMemoryAndReleaseEveryNthBatch(10),
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_aggregate_with_high_cardinality_with_limited_memory_and_different_sizes_of_record_batch_and_take_all_memory()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(SessionConfig::new().with_batch_size(record_batch_size))
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    run_test_aggregate_with_high_cardinality(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |i| {
+            if i % 25 == 1 {
+                pool_size / 6
+            } else {
+                (16 * KB) as usize
+            }
+        }),
+        memory_behavior: MemoryBehavior::TakeAllMemoryAtTheBeginning,
+    })
+    .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_aggregate_with_high_cardinality_with_limited_memory_and_large_record_batch()
+-> Result<()> {
+    let record_batch_size = 8192;
+    let pool_size = 2 * MB as usize;
+    let task_ctx = {
+        let memory_pool = Arc::new(FairSpillPool::new(pool_size));
+        TaskContext::default()
+            .with_session_config(SessionConfig::new().with_batch_size(record_batch_size))
+            .with_runtime(Arc::new(
+                RuntimeEnvBuilder::new()
+                    .with_memory_pool(memory_pool)
+                    .build()?,
+            ))
+    };
+
+    // Test that the merge degree of multi level merge sort cannot be fixed size when there is not enough memory
+    run_test_aggregate_with_high_cardinality(RunTestWithLimitedMemoryArgs {
+        pool_size,
+        task_ctx: Arc::new(task_ctx),
+        number_of_record_batches: 100,
+        get_size_of_record_batch_to_generate: Box::pin(move |_| pool_size / 6),
+        memory_behavior: Default::default(),
+    })
+    .await?;
+
+    Ok(())
+}
+
+async fn run_test_aggregate_with_high_cardinality(
+    mut args: RunTestWithLimitedMemoryArgs,
+) -> Result<usize> {
+    let get_size_of_record_batch_to_generate = std::mem::replace(
+        &mut args.get_size_of_record_batch_to_generate,
+        Box::pin(move |_| unreachable!("should not be called after take")),
+    );
+    let scan_schema = Arc::new(Schema::new(vec![
+        Field::new("col_0", DataType::UInt64, true),
+        Field::new("col_1", DataType::Utf8, true),
+    ]));
+
+    let group_by = PhysicalGroupBy::new_single(vec![(
+        Arc::new(Column::new("col_0", 0)),
+        "col_0".to_string(),
+    )]);
+
+    let aggregate_expressions = vec![Arc::new(
+        AggregateExprBuilder::new(
+            array_agg_udaf(),
+            vec![col("col_1", &scan_schema).unwrap()],
+        )
+        .schema(Arc::clone(&scan_schema))
+        .alias("array_agg(col_1)")
+        .build()?,
+    )];
+
+    let record_batch_size = args.task_ctx.session_config().batch_size() as u64;
+
+    let schema = Arc::clone(&scan_schema);
+    let plan: Arc<dyn ExecutionPlan> =
+        Arc::new(OnceExec::new(Box::pin(RecordBatchStreamAdapter::new(
+            Arc::clone(&schema),
+            futures::stream::iter((0..args.number_of_record_batches as u64).map(
+                move |index| {
+                    let mut record_batch_memory_size =
+                        get_size_of_record_batch_to_generate(index as usize);
+                    record_batch_memory_size = record_batch_memory_size
+                        .saturating_sub(size_of::<u64>() * record_batch_size as usize);
+
+                    let string_item_size =
+                        record_batch_memory_size / record_batch_size as usize;
+                    let string_array =
+                        Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
+                            "a".repeat(string_item_size),
+                            record_batch_size as usize,
+                        )));
+
+                    RecordBatch::try_new(
+                        Arc::clone(&schema),
+                        vec![
+                            // Grouping key
+                            Arc::new(UInt64Array::from_iter_values(
+                                (index * record_batch_size)
+                                    ..(index * record_batch_size) + record_batch_size,
+                            )),
+                            // Grouping value
+                            string_array,
+                        ],
+                    )
+                    .map_err(|err| err.into())
+                },
+            )),
+        ))));
+
+    let aggregate_exec = Arc::new(AggregateExec::try_new(
+        AggregateMode::Partial,
+        group_by.clone(),
+        aggregate_expressions.clone(),
+        vec![None; aggregate_expressions.len()],
+        plan,
+        Arc::clone(&scan_schema),
+    )?);
+    let aggregate_final = Arc::new(AggregateExec::try_new(
+        AggregateMode::Final,
+        group_by,
+        aggregate_expressions.clone(),
+        vec![None; aggregate_expressions.len()],
+        aggregate_exec,
+        Arc::clone(&scan_schema),
+    )?);
+
+    let result = aggregate_final.execute(0, Arc::clone(&args.task_ctx))?;
+
+    run_test(args, aggregate_final, result).await
+}
+
+async fn run_test(
+    args: RunTestWithLimitedMemoryArgs,
+    plan: Arc<dyn ExecutionPlan>,
+    result_stream: SendableRecordBatchStream,
+) -> Result<usize> {
+    let number_of_record_batches = args.number_of_record_batches;
+
+    consume_stream_and_simulate_other_running_memory_consumers(args, result_stream)
+        .await?;
+
+    let spill_count = assert_spill_count_metric(true, plan);
+
+    assert!(
+        spill_count > 0,
+        "Expected spill, but did not, number of record batches: {number_of_record_batches}",
+    );
+
+    Ok(spill_count)
+}
+
+/// Consume the stream and change the amount of memory used while consuming it based on the [`MemoryBehavior`] provided
+async fn consume_stream_and_simulate_other_running_memory_consumers(
+    args: RunTestWithLimitedMemoryArgs,
+    mut result_stream: SendableRecordBatchStream,
+) -> Result<()> {
+    let mut number_of_rows = 0;
+    let record_batch_size = args.task_ctx.session_config().batch_size() as u64;
+
+    let memory_pool = args.task_ctx.memory_pool();
+    let memory_consumer = MemoryConsumer::new("mock_memory_consumer");
+    let mut memory_reservation = memory_consumer.register(memory_pool);
+
+    let mut index = 0;
+    let mut memory_took = false;
+
+    while let Some(batch) = result_stream.next().await {
+        match args.memory_behavior {
+            MemoryBehavior::AsIs => {
+                // Do nothing
+            }
+            MemoryBehavior::TakeAllMemoryAtTheBeginning => {
+                if !memory_took {
+                    memory_took = true;
+                    grow_memory_as_much_as_possible(10, &mut memory_reservation)?;
+                }
+            }
+            MemoryBehavior::TakeAllMemoryAndReleaseEveryNthBatch(n) => {
+                if !memory_took {
+                    memory_took = true;
+                    grow_memory_as_much_as_possible(
+                        args.pool_size,
+                        &mut memory_reservation,
+                    )?;
+                } else if index % n == 0 {
+                    // release memory
+                    memory_reservation.free();
+                }
+            }
+        }
+
+        let batch = batch?;
+        number_of_rows += batch.num_rows();
+
+        index += 1;
+    }
+
+    assert_eq!(
+        number_of_rows,
+        args.number_of_record_batches * record_batch_size as usize
+    );
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
new file mode 100644
index 0000000000000..d14afaf1b3267
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs
@@ -0,0 +1,387 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::sync::{Arc, LazyLock};
+
+use arrow::array::{Int32Array, StringArray, StringDictionaryBuilder};
+use arrow::datatypes::Int32Type;
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty::pretty_format_batches;
+use arrow_schema::{DataType, Field, Schema};
+use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig};
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_datasource_parquet::ParquetFormat;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use itertools::Itertools;
+use object_store::memory::InMemory;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+use parquet::arrow::ArrowWriter;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use tokio::sync::Mutex;
+use tokio::task::JoinSet;
+
+#[derive(Clone)]
+struct TestDataSet {
+    store: Arc<dyn ObjectStore>,
+    schema: Arc<Schema>,
+}
+
+/// List of in memory parquet files with UTF8 data
+// Use a mutex rather than LazyLock to allow for async initialization
+static TESTFILES: LazyLock<Mutex<Vec<TestDataSet>>> =
+    LazyLock::new(|| Mutex::new(vec![]));
+
+async fn test_files() -> Vec<TestDataSet> {
+    let files_mutex = &TESTFILES;
+    let mut files = files_mutex.lock().await;
+    if !files.is_empty() {
+        return (*files).clone();
+    }
+
+    let mut rng = StdRng::seed_from_u64(0);
+
+    for nulls_in_ids in [false, true] {
+        for nulls_in_names in [false, true] {
+            for nulls_in_departments in [false, true] {
+                let store = Arc::new(InMemory::new());
+
+                let schema = Arc::new(Schema::new(vec![
+                    Field::new("id", DataType::Int32, nulls_in_ids),
+                    Field::new("name", DataType::Utf8, nulls_in_names),
+                    Field::new(
+                        "department",
+                        DataType::Dictionary(
+                            Box::new(DataType::Int32),
+                            Box::new(DataType::Utf8),
+                        ),
+                        nulls_in_departments,
+                    ),
+                ]));
+
+                let name_choices = if nulls_in_names {
+                    [Some("Alice"), Some("Bob"), None, Some("David"), None]
+                } else {
+                    [
+                        Some("Alice"),
+                        Some("Bob"),
+                        Some("Charlie"),
+                        Some("David"),
+                        Some("Eve"),
+                    ]
+                };
+
+                let department_choices = if nulls_in_departments {
+                    [
+                        Some("Theater"),
+                        Some("Engineering"),
+                        None,
+                        Some("Arts"),
+                        None,
+                    ]
+                } else {
+                    [
+                        Some("Theater"),
+                        Some("Engineering"),
+                        Some("Healthcare"),
+                        Some("Arts"),
+                        Some("Music"),
+                    ]
+                };
+
+                // Generate 5 files, some with overlapping or repeated ids some without
+                for i in 0..5 {
+                    let num_batches = rng.random_range(1..3);
+                    let mut batches = Vec::with_capacity(num_batches);
+                    for _ in 0..num_batches {
+                        let num_rows = 25;
+                        let ids = Int32Array::from_iter((0..num_rows).map(|file| {
+                            if nulls_in_ids {
+                                if rng.random_bool(1.0 / 10.0) {
+                                    None
+                                } else {
+                                    Some(rng.random_range(file..file + 5))
+                                }
+                            } else {
+                                Some(rng.random_range(file..file + 5))
+                            }
+                        }));
+                        let names = StringArray::from_iter((0..num_rows).map(|_| {
+                            // randomly select a name
+                            let idx = rng.random_range(0..name_choices.len());
+                            name_choices[idx].map(|s| s.to_string())
+                        }));
+                        let mut departments = StringDictionaryBuilder::<Int32Type>::new();
+                        for _ in 0..num_rows {
+                            // randomly select a department
+                            let idx = rng.random_range(0..department_choices.len());
+                            departments.append_option(department_choices[idx].as_ref());
+                        }
+                        let batch = RecordBatch::try_new(
+                            schema.clone(),
+                            vec![
+                                Arc::new(ids),
+                                Arc::new(names),
+                                Arc::new(departments.finish()),
+                            ],
+                        )
+                        .unwrap();
+                        batches.push(batch);
+                    }
+                    let mut buf = vec![];
+                    {
+                        let mut writer =
+                            ArrowWriter::try_new(&mut buf, schema.clone(), None).unwrap();
+                        for batch in batches {
+                            writer.write(&batch).unwrap();
+                            writer.flush().unwrap();
+                        }
+                        writer.flush().unwrap();
+                        writer.finish().unwrap();
+                    }
+                    let payload = PutPayload::from(buf);
+                    let path = Path::from(format!("file_{i}.parquet"));
+                    store.put(&path, payload).await.unwrap();
+                }
+                files.push(TestDataSet { store, schema });
+            }
+        }
+    }
+    (*files).clone()
+}
+
+struct RunResult {
+    results: Vec<RecordBatch>,
+    explain_plan: String,
+}
+
+async fn run_query_with_config(
+    query: &str,
+    config: SessionConfig,
+    dataset: TestDataSet,
+) -> RunResult {
+    let store = dataset.store;
+    let schema = dataset.schema;
+    let ctx = SessionContext::new_with_config(config);
+    let url = ObjectStoreUrl::parse("memory://").unwrap();
+    ctx.register_object_store(url.as_ref(), store.clone());
+
+    let format = Arc::new(
+        ParquetFormat::default()
+            .with_options(ctx.state().table_options().parquet.clone()),
+    );
+    let options = ListingOptions::new(format);
+    let table_path = ListingTableUrl::parse("memory:///").unwrap();
+    let config = ListingTableConfig::new(table_path)
+        .with_listing_options(options)
+        .with_schema(schema);
+    let table = Arc::new(ListingTable::try_new(config).unwrap());
+
+    ctx.register_table("test_table", table).unwrap();
+
+    let results = ctx.sql(query).await.unwrap().collect().await.unwrap();
+    let explain_batches = ctx
+        .sql(&format!("EXPLAIN ANALYZE {query}"))
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    let explain_plan = pretty_format_batches(&explain_batches).unwrap().to_string();
+    RunResult {
+        results,
+        explain_plan,
+    }
+}
+
+#[derive(Debug)]
+struct RunQueryResult {
+    query: String,
+    result: Vec<RecordBatch>,
+    expected: Vec<RecordBatch>,
+}
+
+impl RunQueryResult {
+    fn expected_formatted(&self) -> String {
+        format!("{}", pretty_format_batches(&self.expected).unwrap())
+    }
+
+    fn result_formatted(&self) -> String {
+        format!("{}", pretty_format_batches(&self.result).unwrap())
+    }
+
+    fn is_ok(&self) -> bool {
+        self.expected_formatted() == self.result_formatted()
+    }
+}
+
+/// Iterate over each line in the plan and check that one of them has `DataSourceExec` and `DynamicFilter` in the same line.
+fn has_dynamic_filter_expr_pushdown(plan: &str) -> bool {
+    for line in plan.lines() {
+        if line.contains("DataSourceExec") && line.contains("DynamicFilter") {
+            return true;
+        }
+    }
+    false
+}
+
+async fn run_query(
+    query: String,
+    cfg: SessionConfig,
+    dataset: TestDataSet,
+) -> RunQueryResult {
+    let cfg_with_dynamic_filters = cfg
+        .clone()
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let cfg_without_dynamic_filters = cfg
+        .clone()
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", false);
+
+    let expected_result =
+        run_query_with_config(&query, cfg_without_dynamic_filters, dataset.clone()).await;
+    let result =
+        run_query_with_config(&query, cfg_with_dynamic_filters, dataset.clone()).await;
+    // Check that dynamic filters were actually pushed down
+    if !has_dynamic_filter_expr_pushdown(&result.explain_plan) {
+        panic!(
+            "Dynamic filter was not pushed down in query: {query}\n\n{}",
+            result.explain_plan
+        );
+    }
+
+    RunQueryResult {
+        query: query.to_string(),
+        result: result.results,
+        expected: expected_result.results,
+    }
+}
+
+struct TestCase {
+    query: String,
+    cfg: SessionConfig,
+    dataset: TestDataSet,
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn test_fuzz_topk_filter_pushdown() {
+    let order_columns = ["id", "name", "department"];
+    let order_directions = ["ASC", "DESC"];
+    let null_orders = ["NULLS FIRST", "NULLS LAST"];
+
+    let start = datafusion_common::instant::Instant::now();
+    let mut orders: HashMap<String, Vec<String>> = HashMap::new();
+    for order_column in &order_columns {
+        for order_direction in &order_directions {
+            for null_order in &null_orders {
+                // if there is a vec for this column insert the order, otherwise create a new vec
+                let ordering = format!("{order_column} {order_direction} {null_order}");
+                match orders.get_mut(*order_column) {
+                    Some(order_vec) => {
+                        order_vec.push(ordering);
+                    }
+                    None => {
+                        orders.insert((*order_column).to_string(), vec![ordering]);
+                    }
+                }
+            }
+        }
+    }
+
+    let mut queries = vec![];
+
+    for limit in [1, 10] {
+        for num_order_by_columns in [1, 2, 3] {
+            for order_columns in ["id", "name", "department"]
+                .iter()
+                .combinations(num_order_by_columns)
+            {
+                for orderings in order_columns
+                    .iter()
+                    .map(|col| orders.get(**col).unwrap())
+                    .multi_cartesian_product()
+                {
+                    let query = format!(
+                        "SELECT * FROM test_table ORDER BY {} LIMIT {}",
+                        orderings.into_iter().join(", "),
+                        limit
+                    );
+                    queries.push(query);
+                }
+            }
+        }
+    }
+
+    queries.sort_unstable();
+    println!(
+        "Generated {} queries in {:?}",
+        queries.len(),
+        start.elapsed()
+    );
+
+    let start = datafusion_common::instant::Instant::now();
+    let datasets = test_files().await;
+    println!("Generated test files in {:?}", start.elapsed());
+
+    let mut test_cases = vec![];
+    for enable_filter_pushdown in [true, false] {
+        for query in &queries {
+            for dataset in &datasets {
+                let mut cfg = SessionConfig::new();
+                cfg = cfg.set_bool(
+                    "datafusion.optimizer.enable_dynamic_filter_pushdown",
+                    enable_filter_pushdown,
+                );
+                test_cases.push(TestCase {
+                    query: query.to_string(),
+                    cfg,
+                    dataset: dataset.clone(),
+                });
+            }
+        }
+    }
+
+    let start = datafusion_common::instant::Instant::now();
+    let mut join_set = JoinSet::new();
+    for tc in test_cases {
+        join_set.spawn(run_query(tc.query, tc.cfg, tc.dataset));
+    }
+    let mut results = join_set.join_all().await;
+    results.sort_unstable_by(|a, b| a.query.cmp(&b.query));
+    println!("Ran {} test cases in {:?}", results.len(), start.elapsed());
+
+    let failures = results
+        .iter()
+        .filter(|result| !result.is_ok())
+        .collect::<Vec<_>>();
+
+    for failure in &failures {
+        println!("Failure:");
+        println!("Query:\n{}", failure.query);
+        println!("\nExpected:\n{}", failure.expected_formatted());
+        println!("\nResult:\n{}", failure.result_formatted());
+        println!("\n\n");
+    }
+
+    if !failures.is_empty() {
+        panic!("Some test cases failed");
+    } else {
+        println!("All test cases passed");
+    }
+}
diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs
index 5bd2e457b42a5..82b6d0e4e9d89 100644
--- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs
@@ -18,24 +18,24 @@
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array, StringArray};
-use arrow::compute::{concat_batches, SortOptions};
+use arrow::compute::{SortOptions, concat_batches};
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::functions_window::row_number::row_number_udwf;
+use datafusion::physical_plan::InputOrderMode::{Linear, PartiallySorted, Sorted};
 use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::physical_plan::windows::{
-    create_window_expr, schema_add_window_field, BoundedWindowAggExec, WindowAggExec,
+    BoundedWindowAggExec, WindowAggExec, create_window_expr, schema_add_window_field,
 };
-use datafusion::physical_plan::InputOrderMode::{Linear, PartiallySorted, Sorted};
-use datafusion::physical_plan::{collect, InputOrderMode};
+use datafusion::physical_plan::{InputOrderMode, collect};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::HashMap;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_common_runtime::SpawnedTask;
-use datafusion_expr::type_coercion::functions::fields_with_aggregate_udf;
+use datafusion_expr::type_coercion::functions::fields_with_udf;
 use datafusion_expr::{
     WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
 };
@@ -252,7 +252,6 @@ async fn bounded_window_causal_non_causal() -> Result<()> {
     ];
 
     let partitionby_exprs = vec![];
-    let orderby_exprs = LexOrdering::default();
     // Window frame starts with "UNBOUNDED PRECEDING":
     let start_bound = WindowFrameBound::Preceding(ScalarValue::UInt64(None));
 
@@ -285,10 +284,12 @@ async fn bounded_window_causal_non_causal() -> Result<()> {
                     fn_name.to_string(),
                     &args,
                     &partitionby_exprs,
-                    orderby_exprs.as_ref(),
+                    &[],
                     Arc::new(window_frame),
-                    &extended_schema,
+                    extended_schema,
                     false,
+                    false,
+                    None,
                 )?;
                 let running_window_exec = Arc::new(BoundedWindowAggExec::try_new(
                     vec![window_expr],
@@ -444,17 +445,17 @@ fn get_random_function(
     let fn_name = window_fn_map.keys().collect::<Vec<_>>()[rand_fn_idx];
     let (window_fn, args) = window_fn_map.values().collect::<Vec<_>>()[rand_fn_idx];
     let mut args = args.clone();
-    if let WindowFunctionDefinition::AggregateUDF(udf) = window_fn {
-        if !args.is_empty() {
-            // Do type coercion first argument
-            let a = args[0].clone();
-            let dt = a.return_field(schema.as_ref()).unwrap();
-            let coerced = fields_with_aggregate_udf(&[dt], udf).unwrap();
-            args[0] = cast(a, schema, coerced[0].data_type().clone()).unwrap();
-        }
+    if let WindowFunctionDefinition::AggregateUDF(udf) = window_fn
+        && !args.is_empty()
+    {
+        // Do type coercion first argument
+        let a = args[0].clone();
+        let dt = a.return_field(schema.as_ref()).unwrap();
+        let coerced = fields_with_udf(&[dt], udf.as_ref()).unwrap();
+        args[0] = cast(a, schema, coerced[0].data_type().clone()).unwrap();
     }
 
-    (window_fn.clone(), args, fn_name.to_string())
+    (window_fn.clone(), args, (*fn_name).to_string())
 }
 
 fn get_random_window_frame(rng: &mut StdRng, is_linear: bool) -> WindowFrame {
@@ -568,10 +569,11 @@ fn convert_bound_to_current_row_if_applicable(
 ) {
     match bound {
         WindowFrameBound::Preceding(value) | WindowFrameBound::Following(value) => {
-            if let Ok(zero) = ScalarValue::new_zero(&value.data_type()) {
-                if value == &zero && rng.random_range(0..2) == 0 {
-                    *bound = WindowFrameBound::CurrentRow;
-                }
+            if let Ok(zero) = ScalarValue::new_zero(&value.data_type())
+                && value == &zero
+                && rng.random_range(0..2) == 0
+            {
+                *bound = WindowFrameBound::CurrentRow;
             }
         }
         _ => {}
@@ -587,14 +589,14 @@ async fn run_window_test(
     orderby_columns: Vec<&str>,
     search_mode: InputOrderMode,
 ) -> Result<()> {
-    let is_linear = !matches!(search_mode, Sorted);
+    let is_linear = search_mode != Sorted;
     let mut rng = StdRng::seed_from_u64(random_seed);
     let schema = input1[0].schema();
     let session_config = SessionConfig::new().with_batch_size(50);
     let ctx = SessionContext::new_with_config(session_config);
     let (window_fn, args, fn_name) = get_random_function(&schema, &mut rng, is_linear);
     let window_frame = get_random_window_frame(&mut rng, is_linear);
-    let mut orderby_exprs = LexOrdering::default();
+    let mut orderby_exprs = vec![];
     for column in &orderby_columns {
         orderby_exprs.push(PhysicalSortExpr {
             expr: col(column, &schema)?,
@@ -602,13 +604,13 @@ async fn run_window_test(
         })
     }
     if orderby_exprs.len() > 1 && !window_frame.can_accept_multi_orderby() {
-        orderby_exprs = LexOrdering::new(orderby_exprs[0..1].to_vec());
+        orderby_exprs.truncate(1);
     }
     let mut partitionby_exprs = vec![];
     for column in &partition_by_columns {
         partitionby_exprs.push(col(column, &schema)?);
     }
-    let mut sort_keys = LexOrdering::default();
+    let mut sort_keys = vec![];
     for partition_by_expr in &partitionby_exprs {
         sort_keys.push(PhysicalSortExpr {
             expr: partition_by_expr.clone(),
@@ -622,7 +624,7 @@ async fn run_window_test(
     }
 
     let concat_input_record = concat_batches(&schema, &input1)?;
-    let source_sort_keys = LexOrdering::new(vec![
+    let source_sort_keys: LexOrdering = [
         PhysicalSortExpr {
             expr: col("a", &schema)?,
             options: Default::default(),
@@ -635,15 +637,16 @@ async fn run_window_test(
             expr: col("c", &schema)?,
             options: Default::default(),
         },
-    ]);
+    ]
+    .into();
     let mut exec1 = DataSourceExec::from_data_source(
         MemorySourceConfig::try_new(&[vec![concat_input_record]], schema.clone(), None)?
             .try_with_sort_information(vec![source_sort_keys.clone()])?,
     ) as _;
     // Table is ordered according to ORDER BY a, b, c In linear test we use PARTITION BY b, ORDER BY a
     // For WindowAggExec  to produce correct result it need table to be ordered by b,a. Hence add a sort.
-    if is_linear {
-        exec1 = Arc::new(SortExec::new(sort_keys, exec1)) as _;
+    if is_linear && let Some(ordering) = LexOrdering::new(sort_keys) {
+        exec1 = Arc::new(SortExec::new(ordering, exec1)) as _;
     }
 
     let extended_schema = schema_add_window_field(&args, &schema, &window_fn, &fn_name)?;
@@ -654,17 +657,19 @@ async fn run_window_test(
             fn_name.clone(),
             &args,
             &partitionby_exprs,
-            orderby_exprs.as_ref(),
+            &orderby_exprs.clone(),
             Arc::new(window_frame.clone()),
-            &extended_schema,
+            Arc::clone(&extended_schema),
             false,
+            false,
+            None,
         )?],
         exec1,
         false,
     )?) as _;
     let exec2 = DataSourceExec::from_data_source(
-        MemorySourceConfig::try_new(&[input1.clone()], schema.clone(), None)?
-            .try_with_sort_information(vec![source_sort_keys.clone()])?,
+        MemorySourceConfig::try_new(&[input1], schema, None)?
+            .try_with_sort_information(vec![source_sort_keys])?,
     );
     let running_window_exec = Arc::new(BoundedWindowAggExec::try_new(
         vec![create_window_expr(
@@ -672,10 +677,12 @@ async fn run_window_test(
             fn_name,
             &args,
             &partitionby_exprs,
-            orderby_exprs.as_ref(),
+            &orderby_exprs,
             Arc::new(window_frame.clone()),
-            &extended_schema,
+            extended_schema,
             false,
+            false,
+            None,
         )?],
         exec2,
         search_mode.clone(),
@@ -691,7 +698,9 @@ async fn run_window_test(
 
     // BoundedWindowAggExec should produce more chunk than the usual WindowAggExec.
     // Otherwise it means that we cannot generate result in running mode.
-    let err_msg = format!("Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}");
+    let err_msg = format!(
+        "Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}"
+    );
     // Below check makes sure that, streaming execution generates more chunks than the bulk execution.
     // Since algorithms and operators works on sliding windows in the streaming execution.
     // However, in the current test setup for some random generated window frame clauses: It is not guaranteed
@@ -723,8 +732,12 @@ async fn run_window_test(
         .enumerate()
     {
         if !usual_line.eq(running_line) {
-            println!("Inconsistent result for window_frame at line:{i:?}: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, pb_cols:{partition_by_columns:?}, ob_cols:{orderby_columns:?}, search_mode:{search_mode:?}");
-            println!("--------usual_formatted_sorted----------------running_formatted_sorted--------");
+            println!(
+                "Inconsistent result for window_frame at line:{i:?}: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, pb_cols:{partition_by_columns:?}, ob_cols:{orderby_columns:?}, search_mode:{search_mode:?}"
+            );
+            println!(
+                "--------usual_formatted_sorted----------------running_formatted_sorted--------"
+            );
             for (line1, line2) in
                 usual_formatted_sorted.iter().zip(running_formatted_sorted)
             {
diff --git a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs b/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
deleted file mode 100644
index 38c2ee582a616..0000000000000
--- a/datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs
+++ /dev/null
@@ -1,260 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Integration test for schema adapter factory functionality
-
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use arrow::record_batch::RecordBatch;
-use datafusion::datasource::object_store::ObjectStoreUrl;
-use datafusion::datasource::physical_plan::arrow_file::ArrowSource;
-use datafusion::prelude::*;
-use datafusion_common::Result;
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-use datafusion_datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory};
-use datafusion_datasource::source::DataSourceExec;
-use datafusion_datasource::PartitionedFile;
-use std::sync::Arc;
-use tempfile::TempDir;
-
-#[cfg(feature = "parquet")]
-use datafusion_datasource_parquet::ParquetSource;
-#[cfg(feature = "parquet")]
-use parquet::arrow::ArrowWriter;
-#[cfg(feature = "parquet")]
-use parquet::file::properties::WriterProperties;
-
-#[cfg(feature = "csv")]
-use datafusion_datasource_csv::CsvSource;
-
-/// A schema adapter factory that transforms column names to uppercase
-#[derive(Debug)]
-struct UppercaseAdapterFactory {}
-
-impl SchemaAdapterFactory for UppercaseAdapterFactory {
-    fn create(&self, schema: &Schema) -> Result<Box<dyn SchemaAdapter>> {
-        Ok(Box::new(UppercaseAdapter {
-            input_schema: Arc::new(schema.clone()),
-        }))
-    }
-}
-
-/// Schema adapter that transforms column names to uppercase
-#[derive(Debug)]
-struct UppercaseAdapter {
-    input_schema: SchemaRef,
-}
-
-impl SchemaAdapter for UppercaseAdapter {
-    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        // In a real adapter, we might transform the data too
-        // For this test, we're just passing through the batch
-        Ok(record_batch)
-    }
-
-    fn output_schema(&self) -> SchemaRef {
-        let fields = self
-            .input_schema
-            .fields()
-            .iter()
-            .map(|f| {
-                Field::new(
-                    f.name().to_uppercase().as_str(),
-                    f.data_type().clone(),
-                    f.is_nullable(),
-                )
-            })
-            .collect();
-
-        Arc::new(Schema::new(fields))
-    }
-}
-
-#[cfg(feature = "parquet")]
-#[tokio::test]
-async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
-    // Create a temporary directory for our test file
-    let tmp_dir = TempDir::new()?;
-    let file_path = tmp_dir.path().join("test.parquet");
-    let file_path_str = file_path.to_str().unwrap();
-
-    // Create test data
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    let batch = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
-            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
-        ],
-    )?;
-
-    // Write test parquet file
-    let file = std::fs::File::create(file_path_str)?;
-    let props = WriterProperties::builder().build();
-    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
-    writer.write(&batch)?;
-    writer.close()?;
-
-    // Create a session context
-    let ctx = SessionContext::new();
-
-    // Create a ParquetSource with the adapter factory
-    let source = ParquetSource::default()
-        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}));
-
-    // Create a scan config
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse(&format!("file://{}", file_path_str))?,
-        schema.clone(),
-    )
-    .with_source(source)
-    .build();
-
-    // Create a data source executor
-    let exec = DataSourceExec::from_data_source(config);
-
-    // Collect results
-    let task_ctx = ctx.task_ctx();
-    let stream = exec.execute(0, task_ctx)?;
-    let batches = datafusion::physical_plan::common::collect(stream).await?;
-
-    // There should be one batch
-    assert_eq!(batches.len(), 1);
-
-    // Verify the schema has uppercase column names
-    let result_schema = batches[0].schema();
-    assert_eq!(result_schema.field(0).name(), "ID");
-    assert_eq!(result_schema.field(1).name(), "NAME");
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
-    // This test verifies that the same schema adapter factory can be reused
-    // across different file source types. This is important for ensuring that:
-    // 1. The schema adapter factory interface works uniformly across all source types
-    // 2. The factory can be shared and cloned efficiently using Arc
-    // 3. Various data source implementations correctly implement the schema adapter factory pattern
-
-    // Create a test factory
-    let factory = Arc::new(UppercaseAdapterFactory {});
-
-    // Apply the same adapter to different source types
-    let arrow_source =
-        ArrowSource::default().with_schema_adapter_factory(factory.clone());
-
-    #[cfg(feature = "parquet")]
-    let parquet_source =
-        ParquetSource::default().with_schema_adapter_factory(factory.clone());
-
-    #[cfg(feature = "csv")]
-    let csv_source = CsvSource::default().with_schema_adapter_factory(factory.clone());
-
-    // Verify adapters were properly set
-    assert!(arrow_source.schema_adapter_factory().is_some());
-
-    #[cfg(feature = "parquet")]
-    assert!(parquet_source.schema_adapter_factory().is_some());
-
-    #[cfg(feature = "csv")]
-    assert!(csv_source.schema_adapter_factory().is_some());
-
-    Ok(())
-}
-
-// Helper function to test From<T> for Arc<dyn FileSource> implementations
-fn test_from_impl<T: Into<Arc<dyn FileSource>> + Default>(expected_file_type: &str) {
-    let source = T::default();
-    let file_source: Arc<dyn FileSource> = source.into();
-    assert_eq!(file_source.file_type(), expected_file_type);
-}
-
-#[test]
-fn test_from_implementations() {
-    // Test From implementation for various sources
-    test_from_impl::<ArrowSource>("arrow");
-
-    #[cfg(feature = "parquet")]
-    test_from_impl::<ParquetSource>("parquet");
-
-    #[cfg(feature = "csv")]
-    test_from_impl::<CsvSource>("csv");
-
-    #[cfg(feature = "json")]
-    test_from_impl::<datafusion_datasource_json::JsonSource>("json");
-}
-
-/// A simple test schema adapter factory that doesn't modify the schema
-#[derive(Debug)]
-struct TestSchemaAdapterFactory {}
-
-impl SchemaAdapterFactory for TestSchemaAdapterFactory {
-    fn create(&self, schema: &Schema) -> Result<Box<dyn SchemaAdapter>> {
-        Ok(Box::new(TestSchemaAdapter {
-            input_schema: Arc::new(schema.clone()),
-        }))
-    }
-}
-
-/// A test schema adapter that passes through data unmodified
-#[derive(Debug)]
-struct TestSchemaAdapter {
-    input_schema: SchemaRef,
-}
-
-impl SchemaAdapter for TestSchemaAdapter {
-    fn adapt(&self, record_batch: RecordBatch) -> Result<RecordBatch> {
-        // Just pass through the batch unmodified
-        Ok(record_batch)
-    }
-
-    fn output_schema(&self) -> SchemaRef {
-        self.input_schema.clone()
-    }
-}
-
-#[cfg(feature = "parquet")]
-#[test]
-fn test_schema_adapter_preservation() {
-    // Create a test schema
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    // Create source with schema adapter factory
-    let source = ParquetSource::default();
-    let factory = Arc::new(TestSchemaAdapterFactory {});
-    let file_source = source.with_schema_adapter_factory(factory);
-
-    // Create a FileScanConfig with the source
-    let config_builder =
-        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema.clone())
-            .with_source(file_source.clone())
-            // Add a file to make it valid
-            .with_file(PartitionedFile::new("test.parquet", 100));
-
-    let config = config_builder.build();
-
-    // Verify the schema adapter factory is present in the file source
-    assert!(config.source().schema_adapter_factory().is_some());
-}
diff --git a/datafusion/core/tests/macro_hygiene/mod.rs b/datafusion/core/tests/macro_hygiene/mod.rs
index 9196efec972c1..9fd60cd1f06f3 100644
--- a/datafusion/core/tests/macro_hygiene/mod.rs
+++ b/datafusion/core/tests/macro_hygiene/mod.rs
@@ -65,3 +65,43 @@ mod config_namespace {
         }
     }
 }
+
+mod config_field {
+    // NO other imports!
+    use datafusion_common::config_field;
+
+    #[test]
+    fn test_macro() {
+        #[derive(Debug)]
+        #[expect(dead_code)]
+        struct E;
+
+        impl std::fmt::Display for E {
+            fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                unimplemented!()
+            }
+        }
+
+        impl std::error::Error for E {}
+
+        #[expect(dead_code)]
+        #[derive(Default)]
+        struct S;
+
+        impl std::str::FromStr for S {
+            type Err = E;
+
+            fn from_str(_s: &str) -> Result<Self, Self::Err> {
+                unimplemented!()
+            }
+        }
+
+        impl std::fmt::Display for S {
+            fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                unimplemented!()
+            }
+        }
+
+        config_field!(S);
+    }
+}
diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
index 64ab1378340aa..e1d5f1b1ab198 100644
--- a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
+++ b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
@@ -31,7 +31,7 @@ static INIT: Once = Once::new();
 
 // ===========================================================================
 // Test runners:
-// Runners are splitted into multiple tests to run in parallel
+// Runners are split into multiple tests to run in parallel
 // ===========================================================================
 
 #[test]
@@ -98,11 +98,9 @@ fn init_once() {
 fn spawn_test_process(test: &str) {
     init_once();
 
-    let test_path = format!(
-        "memory_limit::memory_limit_validation::sort_mem_validation::{}",
-        test
-    );
-    info!("Running test: {}", test_path);
+    let test_path =
+        format!("memory_limit::memory_limit_validation::sort_mem_validation::{test}");
+    info!("Running test: {test_path}");
 
     // Run the test command
     let output = Command::new("cargo")
@@ -125,7 +123,7 @@ fn spawn_test_process(test: &str) {
     let stdout = str::from_utf8(&output.stdout).unwrap_or("");
     let stderr = str::from_utf8(&output.stderr).unwrap_or("");
 
-    info!("{}", stdout);
+    info!("{stdout}");
 
     assert!(
         output.status.success(),
diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs
index 7b157b707a6de..2c9fae20c8606 100644
--- a/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs
+++ b/datafusion/core/tests/memory_limit/memory_limit_validation/utils.rs
@@ -16,16 +16,14 @@
 // under the License.
 
 use datafusion_common_runtime::SpawnedTask;
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System};
-use tokio::time::{interval, Duration};
+use tokio::time::{Duration, interval};
 
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_execution::{
-    memory_pool::{human_readable_size, FairSpillPool},
-    runtime_env::RuntimeEnvBuilder,
-};
+use datafusion_common::human_readable_size;
+use datafusion_execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder};
 
 /// Measures the maximum RSS (in bytes) during the execution of an async task. RSS
 /// will be sampled every 7ms.
@@ -40,7 +38,7 @@ use datafusion_execution::{
 async fn measure_max_rss<F, Fut, T>(f: F) -> (T, usize)
 where
     F: FnOnce() -> Fut,
-    Fut: std::future::Future<Output = T>,
+    Fut: Future<Output = T>,
 {
     // Initialize system information
     let mut system = System::new_all();
diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs
index 7695cc0969d87..ff8c512cbd22e 100644
--- a/datafusion/core/tests/memory_limit/mod.rs
+++ b/datafusion/core/tests/memory_limit/mod.rs
@@ -23,11 +23,13 @@ use std::sync::{Arc, LazyLock};
 
 #[cfg(feature = "extended_tests")]
 mod memory_limit_validation;
+mod repartition_mem_limit;
 use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringViewArray};
 use arrow::compute::SortOptions;
 use arrow::datatypes::{Int32Type, SchemaRef};
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::assert_batches_eq;
+use datafusion::config::SpillCompression;
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::datasource::{MemTable, TableProvider};
@@ -37,19 +39,19 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::streaming::PartitionStream;
 use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream};
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_catalog::streaming::StreamingTable;
 use datafusion_catalog::Session;
-use datafusion_common::{assert_contains, Result};
+use datafusion_catalog::streaming::StreamingTable;
+use datafusion_common::{Result, assert_contains};
+use datafusion_execution::TaskContext;
 use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode};
 use datafusion_execution::memory_pool::{
     FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool,
 };
 use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_execution::TaskContext;
 use datafusion_expr::{Expr, TableType};
 use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr};
-use datafusion_physical_optimizer::join_selection::JoinSelection;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::join_selection::JoinSelection;
 use datafusion_physical_plan::collect as collect_batches;
 use datafusion_physical_plan::common::collect;
 use datafusion_physical_plan::spill::get_record_batch_memory_size;
@@ -84,7 +86,8 @@ async fn group_by_none() {
     TestCase::new()
         .with_query("select median(request_bytes) from t")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  AggregateStream"
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:\n  AggregateStream",
         ])
         .with_memory_limit(2_000)
         .run()
@@ -96,7 +99,7 @@ async fn group_by_row_hash() {
     TestCase::new()
         .with_query("select count(*) from t GROUP BY response_bytes")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  GroupedHashAggregateStream"
+            "Resources exhausted: Additional allocation failed", "with top memory consumers (across reservations) as:\n  GroupedHashAggregateStream"
         ])
         .with_memory_limit(2_000)
         .run()
@@ -109,7 +112,7 @@ async fn group_by_hash() {
         // group by dict column
         .with_query("select count(*) from t GROUP BY service, host, pod, container")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  GroupedHashAggregateStream"
+            "Resources exhausted: Additional allocation failed", "with top memory consumers (across reservations) as:\n  GroupedHashAggregateStream"
         ])
         .with_memory_limit(1_000)
         .run()
@@ -122,7 +125,8 @@ async fn join_by_key_multiple_partitions() {
     TestCase::new()
         .with_query("select t1.* from t t1 JOIN t t2 ON t1.service = t2.service")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  HashJoinInput",
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:\n  HashJoinInput",
         ])
         .with_memory_limit(1_000)
         .with_config(config)
@@ -136,7 +140,8 @@ async fn join_by_key_single_partition() {
     TestCase::new()
         .with_query("select t1.* from t t1 JOIN t t2 ON t1.service = t2.service")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  HashJoinInput",
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:\n  HashJoinInput",
         ])
         .with_memory_limit(1_000)
         .with_config(config)
@@ -149,7 +154,7 @@ async fn join_by_expression() {
     TestCase::new()
         .with_query("select t1.* from t t1 JOIN t t2 ON t1.service != t2.service")
         .with_expected_errors(vec![
-           "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  NestedLoopJoinLoad[0]",
+           "Resources exhausted: Additional allocation failed", "with top memory consumers (across reservations) as:\n  NestedLoopJoinLoad[0]",
         ])
         .with_memory_limit(1_000)
         .run()
@@ -161,7 +166,8 @@ async fn cross_join() {
     TestCase::new()
         .with_query("select t1.*, t2.* from t t1 CROSS JOIN t t2")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  CrossJoinExec",
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:\n  CrossJoinExec",
         ])
         .with_memory_limit(1_000)
         .run()
@@ -217,7 +223,7 @@ async fn symmetric_hash_join() {
             "select t1.* from t t1 JOIN t t2 ON t1.pod = t2.pod AND t1.time = t2.time",
         )
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  SymmetricHashJoinStream",
+            "Resources exhausted: Additional allocation failed", "with top memory consumers (across reservations) as:\n  SymmetricHashJoinStream",
         ])
         .with_memory_limit(1_000)
         .with_scenario(Scenario::AccessLogStreaming)
@@ -235,7 +241,7 @@ async fn sort_preserving_merge() {
     // so only a merge is needed
         .with_query("select * from t ORDER BY a ASC NULLS LAST, b ASC NULLS LAST LIMIT 10")
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  SortPreservingMergeExec",
+            "Resources exhausted: Additional allocation failed", "with top memory consumers (across reservations) as:\n  SortPreservingMergeExec",
         ])
         // provide insufficient memory to merge
         .with_memory_limit(partition_size / 2)
@@ -314,7 +320,8 @@ async fn sort_spill_reservation() {
 
     test.clone()
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:",
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:",
             "B for ExternalSorterMerge",
         ])
         .with_config(config)
@@ -344,7 +351,8 @@ async fn oom_recursive_cte() {
         SELECT * FROM nodes;",
         )
         .with_expected_errors(vec![
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  RecursiveQuery",
+            "Resources exhausted: Additional allocation failed",
+            "with top memory consumers (across reservations) as:\n  RecursiveQuery",
         ])
         .with_memory_limit(2_000)
         .run()
@@ -396,7 +404,7 @@ async fn oom_with_tracked_consumer_pool() {
         .with_expected_errors(vec![
             "Failed to allocate additional",
             "for ParquetSink(ArrowColumnWriter)",
-            "Additional allocation failed with top memory consumers (across reservations) as:\n  ParquetSink(ArrowColumnWriter)"
+            "Additional allocation failed", "with top memory consumers (across reservations) as:\n  ParquetSink(ArrowColumnWriter)"
         ])
         .with_memory_pool(Arc::new(
             TrackConsumersPool::new(
@@ -545,10 +553,11 @@ async fn test_external_sort_zero_merge_reservation() {
 // Tests for disk limit (`max_temp_directory_size` in `DiskManager`)
 // ------------------------------------------------------------------
 
-// Create a new `SessionContext` with speicified disk limit and memory pool limit
+// Create a new `SessionContext` with specified disk limit, memory pool limit, and spill compression codec
 async fn setup_context(
     disk_limit: u64,
     memory_pool_limit: usize,
+    spill_compression: SpillCompression,
 ) -> Result<SessionContext> {
     let disk_manager = DiskManagerBuilder::default()
         .with_mode(DiskManagerMode::OsTmpDirectory)
@@ -565,11 +574,16 @@ async fn setup_context(
         disk_manager: Arc::new(disk_manager),
         cache_manager: runtime.cache_manager.clone(),
         object_store_registry: runtime.object_store_registry.clone(),
+        #[cfg(feature = "parquet_encryption")]
+        parquet_encryption_factory_registry: runtime
+            .parquet_encryption_factory_registry
+            .clone(),
     });
 
     let config = SessionConfig::new()
         .with_sort_spill_reservation_bytes(64 * 1024) // 256KB
         .with_sort_in_place_threshold_bytes(0)
+        .with_spill_compression(spill_compression)
         .with_batch_size(64) // To reduce test memory usage
         .with_target_partitions(1);
 
@@ -580,18 +594,24 @@ async fn setup_context(
 /// (specified by `max_temp_directory_size` in `DiskManager`)
 #[tokio::test]
 async fn test_disk_spill_limit_reached() -> Result<()> {
-    let ctx = setup_context(1024 * 1024, 1024 * 1024).await?; // 1MB disk limit, 1MB memory limit
+    let spill_compression = SpillCompression::Uncompressed;
+    let ctx = setup_context(1024 * 1024, 1024 * 1024, spill_compression).await?; // 1MB disk limit, 1MB memory limit
 
     let df = ctx
         .sql("select * from generate_series(1, 1000000000000) as t1(v1) order by v1")
         .await
         .unwrap();
 
-    let err = df.collect().await.unwrap_err();
-    assert_contains!(
-    err.to_string(),
-    "The used disk space during the spilling process has exceeded the allowable limit"
-    );
+    let error_message = df.collect().await.unwrap_err().to_string();
+    for expected in [
+        "The used disk space during the spilling process has exceeded the allowable limit",
+        "datafusion.runtime.max_temp_directory_size",
+    ] {
+        assert!(
+            error_message.contains(expected),
+            "'{expected}' is not contained by '{error_message}'"
+        );
+    }
 
     Ok(())
 }
@@ -602,7 +622,8 @@ async fn test_disk_spill_limit_reached() -> Result<()> {
 #[tokio::test]
 async fn test_disk_spill_limit_not_reached() -> Result<()> {
     let disk_spill_limit = 1024 * 1024; // 1MB
-    let ctx = setup_context(disk_spill_limit, 128 * 1024).await?; // 1MB disk limit, 128KB memory limit
+    let spill_compression = SpillCompression::Uncompressed;
+    let ctx = setup_context(disk_spill_limit, 128 * 1024, spill_compression).await?; // 1MB disk limit, 128KB memory limit
 
     let df = ctx
         .sql("select * from generate_series(1, 10000) as t1(v1) order by v1")
@@ -630,6 +651,77 @@ async fn test_disk_spill_limit_not_reached() -> Result<()> {
     Ok(())
 }
 
+/// External query should succeed using zstd as spill compression codec and
+/// and all temporary spill files are properly cleaned up after execution.
+/// Note: This test does not inspect file contents (e.g. magic number),
+/// as spill files are automatically deleted on drop.
+#[tokio::test]
+async fn test_spill_file_compressed_with_zstd() -> Result<()> {
+    let disk_spill_limit = 1024 * 1024; // 1MB
+    let spill_compression = SpillCompression::Zstd;
+    let ctx = setup_context(disk_spill_limit, 128 * 1024, spill_compression).await?; // 1MB disk limit, 128KB memory limit, zstd
+
+    let df = ctx
+        .sql("select * from generate_series(1, 100000) as t1(v1) order by v1")
+        .await
+        .unwrap();
+    let plan = df.create_physical_plan().await.unwrap();
+
+    let task_ctx = ctx.task_ctx();
+    let _ = collect_batches(Arc::clone(&plan), task_ctx)
+        .await
+        .expect("Query execution failed");
+
+    let spill_count = plan.metrics().unwrap().spill_count().unwrap();
+    let spilled_bytes = plan.metrics().unwrap().spilled_bytes().unwrap();
+
+    println!("spill count {spill_count}");
+    assert!(spill_count > 0);
+    assert!((spilled_bytes as u64) < disk_spill_limit);
+
+    // Verify that all temporary files have been properly cleaned up by checking
+    // that the total disk usage tracked by the disk manager is zero
+    let current_disk_usage = ctx.runtime_env().disk_manager.used_disk_space();
+    assert_eq!(current_disk_usage, 0);
+
+    Ok(())
+}
+
+/// External query should succeed using lz4_frame as spill compression codec and
+/// and all temporary spill files are properly cleaned up after execution.
+/// Note: This test does not inspect file contents (e.g. magic number),
+/// as spill files are automatically deleted on drop.
+#[tokio::test]
+async fn test_spill_file_compressed_with_lz4_frame() -> Result<()> {
+    let disk_spill_limit = 1024 * 1024; // 1MB
+    let spill_compression = SpillCompression::Lz4Frame;
+    let ctx = setup_context(disk_spill_limit, 128 * 1024, spill_compression).await?; // 1MB disk limit, 128KB memory limit, lz4_frame
+
+    let df = ctx
+        .sql("select * from generate_series(1, 100000) as t1(v1) order by v1")
+        .await
+        .unwrap();
+    let plan = df.create_physical_plan().await.unwrap();
+
+    let task_ctx = ctx.task_ctx();
+    let _ = collect_batches(Arc::clone(&plan), task_ctx)
+        .await
+        .expect("Query execution failed");
+
+    let spill_count = plan.metrics().unwrap().spill_count().unwrap();
+    let spilled_bytes = plan.metrics().unwrap().spilled_bytes().unwrap();
+
+    println!("spill count {spill_count}");
+    assert!(spill_count > 0);
+    assert!((spilled_bytes as u64) < disk_spill_limit);
+
+    // Verify that all temporary files have been properly cleaned up by checking
+    // that the total disk usage tracked by the disk manager is zero
+    let current_disk_usage = ctx.runtime_env().disk_manager.used_disk_space();
+    assert_eq!(current_disk_usage, 0);
+
+    Ok(())
+}
 /// Run the query with the specified memory limit,
 /// and verifies the expected errors are returned
 #[derive(Clone, Debug)]
@@ -726,7 +818,7 @@ impl TestCase {
 
     /// Specify an expected plan to review
     pub fn with_expected_plan(mut self, expected_plan: &[&str]) -> Self {
-        self.expected_plan = expected_plan.iter().map(|s| s.to_string()).collect();
+        self.expected_plan = expected_plan.iter().map(|s| (*s).to_string()).collect();
         self
     }
 
@@ -890,16 +982,13 @@ impl Scenario {
                     descending: false,
                     nulls_first: false,
                 };
-                let sort_information = vec![LexOrdering::new(vec![
-                    PhysicalSortExpr {
-                        expr: col("a", &schema).unwrap(),
-                        options,
-                    },
-                    PhysicalSortExpr {
-                        expr: col("b", &schema).unwrap(),
-                        options,
-                    },
-                ])];
+                let sort_information = vec![
+                    [
+                        PhysicalSortExpr::new(col("a", &schema).unwrap(), options),
+                        PhysicalSortExpr::new(col("b", &schema).unwrap(), options),
+                    ]
+                    .into(),
+                ];
 
                 let table = SortedTableProvider::new(batches, sort_information);
                 Arc::new(table)
@@ -975,7 +1064,7 @@ fn make_dict_batches() -> Vec<RecordBatch> {
     let batch_size = 50;
 
     let mut i = 0;
-    let gen = std::iter::from_fn(move || {
+    let batch_gen = std::iter::from_fn(move || {
         // create values like
         // 0000000001
         // 0000000002
@@ -998,7 +1087,7 @@ fn make_dict_batches() -> Vec<RecordBatch> {
 
     let num_batches = 5;
 
-    let batches: Vec<_> = gen.take(num_batches).collect();
+    let batches: Vec<_> = batch_gen.take(num_batches).collect();
 
     batches.iter().enumerate().for_each(|(i, batch)| {
         println!("Dict batch[{i}] size is: {}", batch.get_array_memory_size());
@@ -1013,9 +1102,9 @@ fn batches_byte_size(batches: &[RecordBatch]) -> usize {
 }
 
 #[derive(Debug)]
-struct DummyStreamPartition {
-    schema: SchemaRef,
-    batches: Vec<RecordBatch>,
+pub(crate) struct DummyStreamPartition {
+    pub(crate) schema: SchemaRef,
+    pub(crate) batches: Vec<RecordBatch>,
 }
 
 impl PartitionStream for DummyStreamPartition {
diff --git a/datafusion/core/tests/memory_limit/repartition_mem_limit.rs b/datafusion/core/tests/memory_limit/repartition_mem_limit.rs
new file mode 100644
index 0000000000000..b21bffebaf95e
--- /dev/null
+++ b/datafusion/core/tests/memory_limit/repartition_mem_limit.rs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int32Array, RecordBatch};
+use datafusion::{
+    assert_batches_sorted_eq,
+    prelude::{SessionConfig, SessionContext},
+};
+use datafusion_catalog::MemTable;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+use datafusion_physical_plan::{ExecutionPlanProperties, repartition::RepartitionExec};
+use futures::TryStreamExt;
+use itertools::Itertools;
+
+/// End to end test for spilling in RepartitionExec.
+/// The idea is to make a real world query with a relatively low memory limit and
+/// then drive one partition at a time, simulating dissimilar execution speed in partitions.
+/// Just as some examples of real world scenarios where this can happen consider
+/// lopsided groups in a group by especially if one partitions spills and others don't,
+/// or in distributed systems if one upstream node is slower than others.
+#[tokio::test]
+async fn test_repartition_memory_limit() {
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(1024 * 1024, 1.0)
+        .build()
+        .unwrap();
+    let config = SessionConfig::new()
+        .with_batch_size(32)
+        .with_target_partitions(2);
+    let ctx = SessionContext::new_with_config_rt(config, Arc::new(runtime));
+    let batches = vec![
+        RecordBatch::try_from_iter(vec![(
+            "c1",
+            Arc::new(Int32Array::from_iter_values((0..10).cycle().take(100_000)))
+                as ArrayRef,
+        )])
+        .unwrap(),
+    ];
+    let table = Arc::new(MemTable::try_new(batches[0].schema(), vec![batches]).unwrap());
+    ctx.register_table("t", table).unwrap();
+    let plan = ctx
+        .state()
+        .create_logical_plan("SELECT c1, count(*) as c FROM t GROUP BY c1;")
+        .await
+        .unwrap();
+    let plan = ctx.state().create_physical_plan(&plan).await.unwrap();
+    assert_eq!(plan.output_partitioning().partition_count(), 2);
+    // Execute partition 0, this should cause items going into the rest of the partitions to queue up and because
+    // of the low memory limit should spill to disk.
+    let batches0 = Arc::clone(&plan)
+        .execute(0, ctx.task_ctx())
+        .unwrap()
+        .try_collect::<Vec<_>>()
+        .await
+        .unwrap();
+
+    let mut metrics = None;
+    Arc::clone(&plan)
+        .transform_down(|node| {
+            if node.as_any().is::<RepartitionExec>() {
+                metrics = node.metrics();
+            }
+            Ok(Transformed::no(node))
+        })
+        .unwrap();
+
+    let metrics = metrics.unwrap();
+    assert!(metrics.spilled_bytes().unwrap() > 0);
+    assert!(metrics.spilled_rows().unwrap() > 0);
+    assert!(metrics.spill_count().unwrap() > 0);
+
+    // Execute the other partition
+    let batches1 = Arc::clone(&plan)
+        .execute(1, ctx.task_ctx())
+        .unwrap()
+        .try_collect::<Vec<_>>()
+        .await
+        .unwrap();
+
+    let all_batches = batches0
+        .into_iter()
+        .chain(batches1.into_iter())
+        .collect_vec();
+    #[rustfmt::skip]
+    let expected = &[
+    "+----+-------+",
+    "| c1 | c     |",
+    "+----+-------+",
+    "| 0  | 10000 |",
+    "| 1  | 10000 |",
+    "| 2  | 10000 |",
+    "| 3  | 10000 |",
+    "| 4  | 10000 |",
+    "| 5  | 10000 |",
+    "| 6  | 10000 |",
+    "| 7  | 10000 |",
+    "| 8  | 10000 |",
+    "| 9  | 10000 |",
+    "+----+-------+",
+    ];
+    assert_batches_sorted_eq!(expected, &all_batches);
+}
diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs
index 2daed4fe36bbe..6466e9ad96d17 100644
--- a/datafusion/core/tests/optimizer/mod.rs
+++ b/datafusion/core/tests/optimizer/mod.rs
@@ -18,6 +18,7 @@
 //! Tests for the DataFusion SQL query planner that require functions from the
 //! datafusion-functions crate.
 
+use insta::assert_snapshot;
 use std::any::Any;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -26,17 +27,16 @@ use arrow::datatypes::{
     DataType, Field, Fields, Schema, SchemaBuilder, SchemaRef, TimeUnit,
 };
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{TransformedResult, TreeNode};
-use datafusion_common::{plan_err, DFSchema, Result, ScalarValue, TableReference};
+use datafusion_common::tree_node::TransformedResult;
+use datafusion_common::{DFSchema, Result, ScalarValue, TableReference, plan_err};
 use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
 use datafusion_expr::{
-    col, lit, AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator,
-    ScalarUDF, TableSource, WindowUDF,
+    AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator, ScalarUDF,
+    TableSource, WindowUDF, col, lit,
 };
 use datafusion_functions::core::expr_ext::FieldAccessor;
 use datafusion_optimizer::analyzer::Analyzer;
 use datafusion_optimizer::optimizer::Optimizer;
-use datafusion_optimizer::simplify_expressions::GuaranteeRewriter;
 use datafusion_optimizer::{OptimizerConfig, OptimizerContext};
 use datafusion_sql::planner::{ContextProvider, SqlToRel};
 use datafusion_sql::sqlparser::ast::Statement;
@@ -44,6 +44,7 @@ use datafusion_sql::sqlparser::dialect::GenericDialect;
 use datafusion_sql::sqlparser::parser::Parser;
 
 use chrono::DateTime;
+use datafusion_expr::expr_rewriter::rewrite_with_guarantees;
 use datafusion_functions::datetime;
 
 #[cfg(test)]
@@ -56,9 +57,14 @@ fn init() {
 #[test]
 fn select_arrow_cast() {
     let sql = "SELECT arrow_cast(1234, 'Float64') as f64, arrow_cast('foo', 'LargeUtf8') as large";
-    let expected = "Projection: Float64(1234) AS f64, LargeUtf8(\"foo\") AS large\
-        \n  EmptyRelation";
-    quick_test(sql, expected);
+    let plan = test_sql(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: Float64(1234) AS f64, LargeUtf8("foo") AS large
+      EmptyRelation: rows=1
+    "#
+    );
 }
 #[test]
 fn timestamp_nano_ts_none_predicates() -> Result<()> {
@@ -68,11 +74,15 @@ fn timestamp_nano_ts_none_predicates() -> Result<()> {
     // a scan should have the now()... predicate folded to a single
     // constant and compared to the column without a cast so it can be
     // pushed down / pruned
-    let expected =
-        "Projection: test.col_int32\
-         \n  Filter: test.col_ts_nano_none < TimestampNanosecond(1666612093000000000, None)\
-         \n    TableScan: test projection=[col_int32, col_ts_nano_none]";
-    quick_test(sql, expected);
+    let plan = test_sql(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: test.col_int32
+      Filter: test.col_ts_nano_none < TimestampNanosecond(1666612093000000000, None)
+        TableScan: test projection=[col_int32, col_ts_nano_none]
+    "
+    );
     Ok(())
 }
 
@@ -84,10 +94,15 @@ fn timestamp_nano_ts_utc_predicates() {
     // a scan should have the now()... predicate folded to a single
     // constant and compared to the column without a cast so it can be
     // pushed down / pruned
-    let expected =
-        "Projection: test.col_int32\n  Filter: test.col_ts_nano_utc < TimestampNanosecond(1666612093000000000, Some(\"+00:00\"))\
-         \n    TableScan: test projection=[col_int32, col_ts_nano_utc]";
-    quick_test(sql, expected);
+    let plan = test_sql(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: test.col_int32
+      Filter: test.col_ts_nano_utc < TimestampNanosecond(1666612093000000000, Some("+00:00"))
+        TableScan: test projection=[col_int32, col_ts_nano_utc]
+    "#
+    );
 }
 
 #[test]
@@ -95,10 +110,14 @@ fn concat_literals() -> Result<()> {
     let sql = "SELECT concat(true, col_int32, false, null, 'hello', col_utf8, 12, 3.4) \
         AS col
         FROM test";
-    let expected =
-        "Projection: concat(Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"falsehello\"), test.col_utf8, Utf8(\"123.4\")) AS col\
-        \n  TableScan: test projection=[col_int32, col_utf8]";
-    quick_test(sql, expected);
+    let plan = test_sql(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: concat(Utf8("true"), CAST(test.col_int32 AS Utf8), Utf8("falsehello"), test.col_utf8, Utf8("123.4")) AS col
+      TableScan: test projection=[col_int32, col_utf8]
+    "#
+    );
     Ok(())
 }
 
@@ -107,16 +126,15 @@ fn concat_ws_literals() -> Result<()> {
     let sql = "SELECT concat_ws('-', true, col_int32, false, null, 'hello', col_utf8, 12, '', 3.4) \
         AS col
         FROM test";
-    let expected =
-        "Projection: concat_ws(Utf8(\"-\"), Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"false-hello\"), test.col_utf8, Utf8(\"12--3.4\")) AS col\
-        \n  TableScan: test projection=[col_int32, col_utf8]";
-    quick_test(sql, expected);
-    Ok(())
-}
-
-fn quick_test(sql: &str, expected_plan: &str) {
     let plan = test_sql(sql).unwrap();
-    assert_eq!(expected_plan, format!("{plan}"));
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: concat_ws(Utf8("-"), Utf8("true"), CAST(test.col_int32 AS Utf8), Utf8("false-hello"), test.col_utf8, Utf8("12--3.4")) AS col
+      TableScan: test projection=[col_int32, col_utf8]
+    "#
+    );
+    Ok(())
 }
 
 fn test_sql(sql: &str) -> Result<LogicalPlan> {
@@ -126,8 +144,9 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
     let statement = &ast[0];
 
     // create a logical query plan
+    let config = ConfigOptions::default();
     let context_provider = MyContextProvider::default()
-        .with_udf(datetime::now())
+        .with_udf(datetime::now(&config))
         .with_udf(datafusion_functions::core::arrow_cast())
         .with_udf(datafusion_functions::string::concat())
         .with_udf(datafusion_functions::string::concat_ws());
@@ -142,7 +161,7 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
     let analyzer = Analyzer::new();
     let optimizer = Optimizer::new();
     // analyze and optimize the logical plan
-    let plan = analyzer.execute_and_check(plan, config.options(), |_, _| {})?;
+    let plan = analyzer.execute_and_check(plan, &config.options(), |_, _| {})?;
     optimizer.optimize(plan, &config, |_, _| {})
 }
 
@@ -268,7 +287,7 @@ fn test_nested_schema_nullability() {
 
 #[test]
 fn test_inequalities_non_null_bounded() {
-    let guarantees = vec![
+    let guarantees = [
         // x ∈ [1, 3] (not null)
         (
             col("x"),
@@ -285,8 +304,6 @@ fn test_inequalities_non_null_bounded() {
         ),
     ];
 
-    let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
     // (original_expr, expected_simplification)
     let simplified_cases = &[
         (col("x").lt(lit(0)), false),
@@ -318,7 +335,7 @@ fn test_inequalities_non_null_bounded() {
         ),
     ];
 
-    validate_simplified_cases(&mut rewriter, simplified_cases);
+    validate_simplified_cases(&guarantees, simplified_cases);
 
     let unchanged_cases = &[
         col("x").gt(lit(2)),
@@ -329,16 +346,20 @@ fn test_inequalities_non_null_bounded() {
         col("x").not_between(lit(3), lit(10)),
     ];
 
-    validate_unchanged_cases(&mut rewriter, unchanged_cases);
+    validate_unchanged_cases(&guarantees, unchanged_cases);
 }
 
-fn validate_simplified_cases<T>(rewriter: &mut GuaranteeRewriter, cases: &[(Expr, T)])
-where
+fn validate_simplified_cases<T>(
+    guarantees: &[(Expr, NullableInterval)],
+    cases: &[(Expr, T)],
+) where
     ScalarValue: From<T>,
     T: Clone,
 {
     for (expr, expected_value) in cases {
-        let output = expr.clone().rewrite(rewriter).data().unwrap();
+        let output = rewrite_with_guarantees(expr.clone(), guarantees)
+            .data()
+            .unwrap();
         let expected = lit(ScalarValue::from(expected_value.clone()));
         assert_eq!(
             output, expected,
@@ -346,9 +367,11 @@ where
         );
     }
 }
-fn validate_unchanged_cases(rewriter: &mut GuaranteeRewriter, cases: &[Expr]) {
+fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Expr]) {
     for expr in cases {
-        let output = expr.clone().rewrite(rewriter).data().unwrap();
+        let output = rewrite_with_guarantees(expr.clone(), guarantees)
+            .data()
+            .unwrap();
         assert_eq!(
             &output, expr,
             "{expr} was simplified to {output}, but expected it to be unchanged"
diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs
index 761a78a29fd3a..ae11fa9a11334 100644
--- a/datafusion/core/tests/parquet/custom_reader.rs
+++ b/datafusion/core/tests/parquet/custom_reader.rs
@@ -20,33 +20,33 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::time::SystemTime;
 
-use arrow::array::{ArrayRef, Int64Array, Int8Array, StringArray};
+use arrow::array::{ArrayRef, Int8Array, Int64Array, StringArray};
 use arrow::datatypes::{Field, Schema, SchemaBuilder};
 use arrow::record_batch::RecordBatch;
-use datafusion::datasource::file_format::parquet::fetch_parquet_metadata;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::{
-    FileMeta, ParquetFileMetrics, ParquetFileReaderFactory, ParquetSource,
+    ParquetFileMetrics, ParquetFileReaderFactory, ParquetSource,
 };
 use datafusion::physical_plan::collect;
 use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
 use datafusion::prelude::SessionContext;
-use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_common::Result;
+use datafusion_common::test_util::batches_to_sort_string;
 
 use bytes::Bytes;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::source::DataSourceExec;
+use datafusion_datasource_parquet::metadata::DFParquetMetadata;
 use futures::future::BoxFuture;
 use futures::{FutureExt, TryFutureExt};
 use insta::assert_snapshot;
 use object_store::memory::InMemory;
 use object_store::path::Path;
-use object_store::{ObjectMeta, ObjectStore};
+use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
+use parquet::arrow::ArrowWriter;
 use parquet::arrow::arrow_reader::ArrowReaderOptions;
 use parquet::arrow::async_reader::AsyncFileReader;
-use parquet::arrow::ArrowWriter;
 use parquet::errors::ParquetError;
 use parquet::file::metadata::ParquetMetaData;
 
@@ -69,18 +69,14 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() {
         store_parquet_in_memory(vec![batch]).await;
     let file_group = parquet_files_meta
         .into_iter()
-        .map(|meta| PartitionedFile {
-            object_meta: meta,
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: Some(Arc::new(String::from(EXPECTED_USER_DEFINED_METADATA))),
-            metadata_size_hint: None,
+        .map(|meta| {
+            PartitionedFile::new_from_meta(meta)
+                .with_extensions(Arc::new(String::from(EXPECTED_USER_DEFINED_METADATA)))
         })
         .collect();
 
     let source = Arc::new(
-        ParquetSource::default()
+        ParquetSource::new(file_schema.clone())
             // prepare the scan
             .with_parquet_file_reader_factory(Arc::new(
                 InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)),
@@ -89,7 +85,6 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() {
     let base_config = FileScanConfigBuilder::new(
         // just any url that doesn't point to in memory object store
         ObjectStoreUrl::local_filesystem(),
-        file_schema,
         source,
     )
     .with_file_group(file_group)
@@ -119,11 +114,11 @@ impl ParquetFileReaderFactory for InMemoryParquetFileReaderFactory {
     fn create_reader(
         &self,
         partition_index: usize,
-        file_meta: FileMeta,
+        partitioned_file: PartitionedFile,
         metadata_size_hint: Option<usize>,
         metrics: &ExecutionPlanMetricsSet,
     ) -> Result<Box<dyn AsyncFileReader + Send>> {
-        let metadata = file_meta
+        let metadata = partitioned_file
             .extensions
             .as_ref()
             .expect("has user defined metadata");
@@ -135,13 +130,13 @@ impl ParquetFileReaderFactory for InMemoryParquetFileReaderFactory {
 
         let parquet_file_metrics = ParquetFileMetrics::new(
             partition_index,
-            file_meta.location().as_ref(),
+            partitioned_file.object_meta.location.as_ref(),
             metrics,
         );
 
         Ok(Box::new(ParquetFileReader {
             store: Arc::clone(&self.0),
-            meta: file_meta.object_meta,
+            meta: partitioned_file.object_meta,
             metrics: parquet_file_metrics,
             metadata_size_hint,
         }))
@@ -237,18 +232,16 @@ impl AsyncFileReader for ParquetFileReader {
         _options: Option<&ArrowReaderOptions>,
     ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> {
         Box::pin(async move {
-            let metadata = fetch_parquet_metadata(
-                self.store.as_ref(),
-                &self.meta,
-                self.metadata_size_hint,
-            )
-            .await
-            .map_err(|e| {
-                ParquetError::General(format!(
-                    "AsyncChunkReader::get_metadata error: {e}"
-                ))
-            })?;
-            Ok(Arc::new(metadata))
+            let metadata = DFParquetMetadata::new(self.store.as_ref(), &self.meta)
+                .with_metadata_size_hint(self.metadata_size_hint)
+                .fetch_metadata()
+                .await
+                .map_err(|e| {
+                    ParquetError::General(format!(
+                        "AsyncChunkReader::get_metadata error: {e}"
+                    ))
+                })?;
+            Ok(metadata)
         })
     }
 }
diff --git a/datafusion/core/tests/parquet/encryption.rs b/datafusion/core/tests/parquet/encryption.rs
new file mode 100644
index 0000000000000..8b3170e367457
--- /dev/null
+++ b/datafusion/core/tests/parquet/encryption.rs
@@ -0,0 +1,370 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for reading and writing Parquet files that use Parquet modular encryption
+
+use arrow::array::{ArrayRef, Int32Array, StringArray};
+use arrow::record_batch::RecordBatch;
+use arrow_schema::{DataType, SchemaRef};
+use async_trait::async_trait;
+use datafusion::dataframe::DataFrameWriteOptions;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use datafusion_common::config::{EncryptionFactoryOptions, TableParquetOptions};
+use datafusion_common::{DataFusionError, assert_batches_sorted_eq, exec_datafusion_err};
+use datafusion_datasource_parquet::ParquetFormat;
+use datafusion_execution::parquet_encryption::EncryptionFactory;
+use parquet::arrow::ArrowWriter;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
+use parquet::encryption::decrypt::FileDecryptionProperties;
+use parquet::encryption::encrypt::FileEncryptionProperties;
+use parquet::file::column_crypto_metadata::ColumnCryptoMetaData;
+use parquet::file::properties::WriterProperties;
+use std::collections::HashMap;
+use std::fs::File;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicU8, Ordering};
+use std::sync::{Arc, Mutex};
+use tempfile::TempDir;
+
+async fn read_parquet_test_data<'a, T: Into<String>>(
+    path: T,
+    ctx: &SessionContext,
+    options: ParquetReadOptions<'a>,
+) -> Vec<RecordBatch> {
+    ctx.read_parquet(path.into(), options)
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap()
+}
+
+#[expect(clippy::needless_pass_by_value)]
+pub fn write_batches(
+    path: PathBuf,
+    props: WriterProperties,
+    batches: impl IntoIterator<Item = RecordBatch>,
+) -> datafusion_common::Result<usize> {
+    let mut batches = batches.into_iter();
+    let first_batch = batches.next().expect("need at least one record batch");
+    let schema = first_batch.schema();
+
+    let file = File::create(&path)?;
+    let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props))?;
+
+    writer.write(&first_batch)?;
+    let mut num_rows = first_batch.num_rows();
+
+    for batch in batches {
+        writer.write(&batch)?;
+        num_rows += batch.num_rows();
+    }
+    writer.close()?;
+    Ok(num_rows)
+}
+
+#[tokio::test]
+async fn round_trip_encryption() {
+    let ctx: SessionContext = SessionContext::new();
+
+    let options = ParquetReadOptions::default();
+    let batches = read_parquet_test_data(
+        "tests/data/filter_pushdown/single_file.gz.parquet",
+        &ctx,
+        options,
+    )
+    .await;
+
+    let schema = batches[0].schema();
+    let footer_key = b"0123456789012345".to_vec(); // 128bit/16
+    let column_key = b"1234567890123450".to_vec(); // 128bit/16
+
+    let mut encrypt = FileEncryptionProperties::builder(footer_key.clone());
+    let mut decrypt = FileDecryptionProperties::builder(footer_key.clone());
+
+    for field in schema.fields.iter() {
+        encrypt = encrypt.with_column_key(field.name().as_str(), column_key.clone());
+        decrypt = decrypt.with_column_key(field.name().as_str(), column_key.clone());
+    }
+    let encrypt = encrypt.build().unwrap();
+    let decrypt = decrypt.build().unwrap();
+
+    // Write encrypted parquet
+    let props = WriterProperties::builder()
+        .with_file_encryption_properties(encrypt)
+        .build();
+
+    let tempdir = TempDir::new_in(Path::new(".")).unwrap();
+    let tempfile = tempdir.path().join("data.parquet");
+    let num_rows_written = write_batches(tempfile.clone(), props, batches).unwrap();
+
+    // Read encrypted parquet
+    let ctx: SessionContext = SessionContext::new();
+    let options =
+        ParquetReadOptions::default().file_decryption_properties((&decrypt).into());
+
+    let encrypted_batches = read_parquet_test_data(
+        tempfile.into_os_string().into_string().unwrap(),
+        &ctx,
+        options,
+    )
+    .await;
+
+    let num_rows_read = encrypted_batches
+        .iter()
+        .fold(0, |acc, x| acc + x.num_rows());
+
+    assert_eq!(num_rows_written, num_rows_read);
+}
+
+#[tokio::test]
+async fn round_trip_parquet_with_encryption_factory() {
+    let ctx = SessionContext::new();
+    let encryption_factory = Arc::new(MockEncryptionFactory::default());
+    ctx.runtime_env().register_parquet_encryption_factory(
+        "test_encryption_factory",
+        Arc::clone(&encryption_factory) as Arc<dyn EncryptionFactory>,
+    );
+
+    let tmpdir = TempDir::new().unwrap();
+
+    // Register some simple test data
+    let strings: ArrayRef =
+        Arc::new(StringArray::from(vec!["a", "b", "c", "a", "b", "c"]));
+    let x1: ArrayRef = Arc::new(Int32Array::from(vec![1, 10, 11, 100, 101, 111]));
+    let x2: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6]));
+    let batch =
+        RecordBatch::try_from_iter(vec![("string", strings), ("x1", x1), ("x2", x2)])
+            .unwrap();
+    let test_data_schema = batch.schema();
+    ctx.register_batch("test_data", batch).unwrap();
+    let df = ctx.table("test_data").await.unwrap();
+
+    // Write encrypted Parquet, partitioned by string column into separate files
+    let mut parquet_options = TableParquetOptions::new();
+    parquet_options.crypto.factory_id = Some("test_encryption_factory".to_string());
+    parquet_options
+        .crypto
+        .factory_options
+        .options
+        .insert("test_key".to_string(), "test value".to_string());
+
+    let df_write_options =
+        DataFrameWriteOptions::default().with_partition_by(vec!["string".to_string()]);
+    df.write_parquet(
+        tmpdir.path().to_str().unwrap(),
+        df_write_options,
+        Some(parquet_options.clone()),
+    )
+    .await
+    .unwrap();
+
+    // Crypto factory should have generated one key per partition file
+    assert_eq!(encryption_factory.encryption_keys.lock().unwrap().len(), 3);
+
+    verify_table_encrypted(tmpdir.path(), &encryption_factory)
+        .await
+        .unwrap();
+
+    // Registering table without decryption properties should fail
+    let table_path = format!("file://{}/", tmpdir.path().to_str().unwrap());
+    let without_decryption_register = ctx
+        .register_listing_table(
+            "parquet_missing_decryption",
+            &table_path,
+            ListingOptions::new(Arc::new(ParquetFormat::default())),
+            None,
+            None,
+        )
+        .await;
+    assert!(matches!(
+        without_decryption_register.unwrap_err(),
+        DataFusionError::ParquetError(_)
+    ));
+
+    // Registering table succeeds if schema is provided
+    ctx.register_listing_table(
+        "parquet_missing_decryption",
+        &table_path,
+        ListingOptions::new(Arc::new(ParquetFormat::default())),
+        Some(test_data_schema),
+        None,
+    )
+    .await
+    .unwrap();
+
+    // But trying to read from the table should fail
+    let without_decryption_read = ctx
+        .table("parquet_missing_decryption")
+        .await
+        .unwrap()
+        .collect()
+        .await;
+    assert!(matches!(
+        without_decryption_read.unwrap_err(),
+        DataFusionError::ParquetError(_)
+    ));
+
+    // Register table with encryption factory specified
+    let listing_options = ListingOptions::new(Arc::new(
+        ParquetFormat::default().with_options(parquet_options),
+    ))
+    .with_table_partition_cols(vec![("string".to_string(), DataType::Utf8)]);
+    ctx.register_listing_table(
+        "parquet_with_decryption",
+        &table_path,
+        listing_options,
+        None,
+        None,
+    )
+    .await
+    .unwrap();
+
+    // Can read correct data when encryption factory has been specified
+    let table = ctx
+        .table("parquet_with_decryption")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let expected = [
+        "+-----+----+--------+",
+        "| x1  | x2 | string |",
+        "+-----+----+--------+",
+        "| 1   | 1  | a      |",
+        "| 100 | 4  | a      |",
+        "| 10  | 2  | b      |",
+        "| 101 | 5  | b      |",
+        "| 11  | 3  | c      |",
+        "| 111 | 6  | c      |",
+        "+-----+----+--------+",
+    ];
+    assert_batches_sorted_eq!(expected, &table);
+}
+
+async fn verify_table_encrypted(
+    table_path: &Path,
+    encryption_factory: &Arc<MockEncryptionFactory>,
+) -> datafusion_common::Result<()> {
+    let mut directories = vec![table_path.to_path_buf()];
+    let mut files_visited = 0;
+    while let Some(directory) = directories.pop() {
+        for entry in std::fs::read_dir(&directory)? {
+            let path = entry?.path();
+            if path.is_dir() {
+                directories.push(path);
+            } else {
+                verify_file_encrypted(&path, encryption_factory).await?;
+                files_visited += 1;
+            }
+        }
+    }
+    assert!(files_visited > 0);
+    Ok(())
+}
+
+async fn verify_file_encrypted(
+    file_path: &Path,
+    encryption_factory: &Arc<MockEncryptionFactory>,
+) -> datafusion_common::Result<()> {
+    let mut options = EncryptionFactoryOptions::default();
+    options
+        .options
+        .insert("test_key".to_string(), "test value".to_string());
+
+    let file_path_str = if cfg!(target_os = "windows") {
+        // Windows backslashes are eventually converted to slashes when writing the Parquet files,
+        // through `ListingTableUrl::parse`, making `encryption_factory.encryption_keys` store them
+        // it that format. So we also replace backslashes here to ensure they match.
+        file_path.to_str().unwrap().replace("\\", "/")
+    } else {
+        file_path.to_str().unwrap().to_owned()
+    };
+
+    let object_path = object_store::path::Path::from(file_path_str);
+    let decryption_properties = encryption_factory
+        .get_file_decryption_properties(&options, &object_path)
+        .await?
+        .unwrap();
+
+    let reader_options =
+        ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties);
+    let file = File::open(file_path)?;
+    let reader_metadata = ArrowReaderMetadata::load(&file, reader_options)?;
+    let metadata = reader_metadata.metadata();
+    assert!(metadata.num_row_groups() > 0);
+    for row_group in metadata.row_groups() {
+        assert!(row_group.num_columns() > 0);
+        for col in row_group.columns() {
+            assert!(matches!(
+                col.crypto_metadata(),
+                Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY)
+            ));
+        }
+    }
+    Ok(())
+}
+
+/// Encryption factory implementation for use in tests,
+/// which generates encryption keys in a sequence
+#[derive(Debug, Default)]
+struct MockEncryptionFactory {
+    pub encryption_keys: Mutex<HashMap<object_store::path::Path, Vec<u8>>>,
+    pub counter: AtomicU8,
+}
+
+#[async_trait]
+impl EncryptionFactory for MockEncryptionFactory {
+    async fn get_file_encryption_properties(
+        &self,
+        config: &EncryptionFactoryOptions,
+        _schema: &SchemaRef,
+        file_path: &object_store::path::Path,
+    ) -> datafusion_common::Result<Option<Arc<FileEncryptionProperties>>> {
+        assert_eq!(
+            config.options.get("test_key"),
+            Some(&"test value".to_string())
+        );
+        let file_idx = self.counter.fetch_add(1, Ordering::Relaxed);
+        let key = vec![file_idx, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut keys = self.encryption_keys.lock().unwrap();
+        keys.insert(file_path.clone(), key.clone());
+        let encryption_properties = FileEncryptionProperties::builder(key).build()?;
+        Ok(Some(encryption_properties))
+    }
+
+    async fn get_file_decryption_properties(
+        &self,
+        config: &EncryptionFactoryOptions,
+        file_path: &object_store::path::Path,
+    ) -> datafusion_common::Result<Option<Arc<FileDecryptionProperties>>> {
+        assert_eq!(
+            config.options.get("test_key"),
+            Some(&"test value".to_string())
+        );
+        let keys = self.encryption_keys.lock().unwrap();
+        let key = keys
+            .get(file_path)
+            .ok_or_else(|| exec_datafusion_err!("No key for file {file_path:?}"))?;
+        let decryption_properties =
+            FileDecryptionProperties::builder(key.clone()).build()?;
+        Ok(Some(decryption_properties))
+    }
+}
diff --git a/datafusion/core/tests/parquet/expr_adapter.rs b/datafusion/core/tests/parquet/expr_adapter.rs
new file mode 100644
index 0000000000000..f412cdf9bd7a6
--- /dev/null
+++ b/datafusion/core/tests/parquet/expr_adapter.rs
@@ -0,0 +1,608 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, BooleanArray, Int32Array, Int64Array, RecordBatch, StringArray,
+    StructArray, record_batch,
+};
+use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
+use bytes::{BufMut, BytesMut};
+use datafusion::assert_batches_eq;
+use datafusion::common::Result;
+use datafusion::datasource::listing::{
+    ListingTable, ListingTableConfig, ListingTableConfigExt,
+};
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion_common::DataFusionError;
+use datafusion_common::ScalarValue;
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::expressions::{self, Column};
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapter, DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter,
+    PhysicalExprAdapterFactory,
+};
+use object_store::{ObjectStore, ObjectStoreExt, memory::InMemory, path::Path};
+use parquet::arrow::ArrowWriter;
+
+async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
+    let mut out = BytesMut::new().writer();
+    {
+        let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.finish().unwrap();
+    }
+    let data = out.into_inner().freeze();
+    store.put(&Path::from(path), data.into()).await.unwrap();
+}
+
+// Implement a custom PhysicalExprAdapterFactory that fills in missing columns with
+// the default value for the field type:
+// - Int64 columns are filled with `1`
+// - Utf8 columns are filled with `'b'`
+#[derive(Debug)]
+struct CustomPhysicalExprAdapterFactory;
+
+impl PhysicalExprAdapterFactory for CustomPhysicalExprAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        Ok(Arc::new(CustomPhysicalExprAdapter {
+            logical_file_schema: Arc::clone(&logical_file_schema),
+            physical_file_schema: Arc::clone(&physical_file_schema),
+            inner: Arc::new(DefaultPhysicalExprAdapter::new(
+                logical_file_schema,
+                physical_file_schema,
+            )),
+        }))
+    }
+}
+
+#[derive(Debug, Clone)]
+struct CustomPhysicalExprAdapter {
+    logical_file_schema: SchemaRef,
+    physical_file_schema: SchemaRef,
+    inner: Arc<dyn PhysicalExprAdapter>,
+}
+
+impl PhysicalExprAdapter for CustomPhysicalExprAdapter {
+    fn rewrite(&self, mut expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        expr = expr
+            .transform(|expr| {
+                if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                    let field_name = column.name();
+                    if self
+                        .physical_file_schema
+                        .field_with_name(field_name)
+                        .ok()
+                        .is_none()
+                    {
+                        let field = self
+                            .logical_file_schema
+                            .field_with_name(field_name)
+                            .map_err(|_| {
+                                DataFusionError::Plan(format!(
+                                    "Field '{field_name}' not found in logical file schema",
+                                ))
+                            })?;
+                        // If the field does not exist, create a default value expression
+                        // Note that we use slightly different logic here to create a default value so that we can see different behavior in tests
+                        let default_value = match field.data_type() {
+                            DataType::Int64 => ScalarValue::Int64(Some(1)),
+                            DataType::Utf8 => ScalarValue::Utf8(Some("b".to_string())),
+                            _ => unimplemented!(
+                                "Unsupported data type: {}",
+                                field.data_type()
+                            ),
+                        };
+                        return Ok(Transformed::yes(Arc::new(
+                            expressions::Literal::new(default_value),
+                        )));
+                    }
+                }
+
+                Ok(Transformed::no(expr))
+            })
+            .data()?;
+        self.inner.rewrite(expr)
+    }
+}
+
+#[tokio::test]
+async fn test_custom_schema_adapter_and_custom_expression_adapter() {
+    let batch =
+        record_batch!(("extra", Int64, [1, 2, 3]), ("c1", Int32, [1, 2, 3])).unwrap();
+
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+    let path = "test.parquet";
+    write_parquet(batch, store.clone(), path).await;
+
+    let table_schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Int64, false),
+        Field::new("c2", DataType::Utf8, true),
+    ]));
+
+    let mut cfg = SessionConfig::new()
+        // Disable statistics collection for this test otherwise early pruning makes it hard to demonstrate data adaptation
+        .with_collect_statistics(false)
+        .with_parquet_pruning(false)
+        .with_parquet_page_index_pruning(false);
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
+    assert!(
+        !ctx.state()
+            .config_mut()
+            .options_mut()
+            .execution
+            .collect_statistics
+    );
+    assert!(!ctx.state().config().collect_statistics());
+
+    // Test with DefaultPhysicalExprAdapterFactory - missing columns are filled with NULL
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory));
+
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.register_table("t", Arc::new(table)).unwrap();
+
+    let batches = ctx
+        .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 IS NULL")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let expected = [
+        "+----+----+",
+        "| c2 | c1 |",
+        "+----+----+",
+        "|    | 2  |",
+        "+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    // Test with a custom physical expr adapter
+    // PhysicalExprAdapterFactory now handles both predicates AND projections
+    // CustomPhysicalExprAdapterFactory fills missing columns with 'b' for Utf8
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory));
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.deregister_table("t").unwrap();
+    ctx.register_table("t", Arc::new(table)).unwrap();
+    let batches = ctx
+        .sql("SELECT c2, c1 FROM t WHERE c1 = 2 AND c2 = 'b'")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    // With CustomPhysicalExprAdapterFactory, missing column c2 is filled with 'b'
+    // in both the predicate (c2 = 'b' becomes 'b' = 'b' -> true) and the projection
+    let expected = [
+        "+----+----+",
+        "| c2 | c1 |",
+        "+----+----+",
+        "| b  | 2  |",
+        "+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+}
+
+/// Test demonstrating how to implement a custom PhysicalExprAdapterFactory
+/// that fills missing columns with non-null default values.
+///
+/// PhysicalExprAdapterFactory rewrites expressions to use literals for
+/// missing columns, handling schema evolution efficiently at planning time.
+#[tokio::test]
+async fn test_physical_expr_adapter_with_non_null_defaults() {
+    // File only has c1 column
+    let batch = record_batch!(("c1", Int32, [10, 20, 30])).unwrap();
+
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+    write_parquet(batch, store.clone(), "defaults_test.parquet").await;
+
+    // Table schema has additional columns c2 (Utf8) and c3 (Int64) that don't exist in file
+    let table_schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Int64, false), // type differs from file (Int32 vs Int64)
+        Field::new("c2", DataType::Utf8, true),   // missing from file
+        Field::new("c3", DataType::Int64, true),  // missing from file
+    ]));
+
+    let mut cfg = SessionConfig::new()
+        .with_collect_statistics(false)
+        .with_parquet_pruning(false);
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
+
+    // CustomPhysicalExprAdapterFactory fills:
+    // - missing Utf8 columns with 'b'
+    // - missing Int64 columns with 1
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::new(CustomPhysicalExprAdapterFactory));
+
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.register_table("t", Arc::new(table)).unwrap();
+
+    // Query all columns - missing columns should have default values
+    let batches = ctx
+        .sql("SELECT c1, c2, c3 FROM t ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    // c1 is cast from Int32 to Int64, c2 defaults to 'b', c3 defaults to 1
+    let expected = [
+        "+----+----+----+",
+        "| c1 | c2 | c3 |",
+        "+----+----+----+",
+        "| 10 | b  | 1  |",
+        "| 20 | b  | 1  |",
+        "| 30 | b  | 1  |",
+        "+----+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    // Verify predicates work with default values
+    // c3 = 1 should match all rows since default is 1
+    let batches = ctx
+        .sql("SELECT c1 FROM t WHERE c3 = 1 ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| c1 |",
+        "+----+",
+        "| 10 |",
+        "| 20 |",
+        "| 30 |",
+        "+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    // c3 = 999 should match no rows
+    let batches = ctx
+        .sql("SELECT c1 FROM t WHERE c3 = 999")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    #[rustfmt::skip]
+    let expected = [
+        "++",
+        "++",
+    ];
+    assert_batches_eq!(expected, &batches);
+}
+
+#[tokio::test]
+async fn test_struct_schema_evolution_projection_and_filter() -> Result<()> {
+    use std::collections::HashMap;
+
+    // Physical struct: {id: Int32, name: Utf8}
+    let physical_struct_fields: Fields = vec![
+        Arc::new(Field::new("id", DataType::Int32, false)),
+        Arc::new(Field::new("name", DataType::Utf8, true)),
+    ]
+    .into();
+
+    let struct_array = StructArray::new(
+        physical_struct_fields.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
+            Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef,
+        ],
+        None,
+    );
+
+    let physical_schema = Arc::new(Schema::new(vec![Field::new(
+        "s",
+        DataType::Struct(physical_struct_fields),
+        true,
+    )]));
+
+    let batch =
+        RecordBatch::try_new(Arc::clone(&physical_schema), vec![Arc::new(struct_array)])?;
+
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+    write_parquet(batch, store.clone(), "struct_evolution.parquet").await;
+
+    // Logical struct: {id: Int64?, name: Utf8?, extra: Boolean?} + metadata
+    let logical_struct_fields: Fields = vec![
+        Arc::new(Field::new("id", DataType::Int64, true)),
+        Arc::new(Field::new("name", DataType::Utf8, true)),
+        Arc::new(Field::new("extra", DataType::Boolean, true).with_metadata(
+            HashMap::from([("nested_meta".to_string(), "1".to_string())]),
+        )),
+    ]
+    .into();
+
+    let table_schema = Arc::new(Schema::new(vec![
+        Field::new("s", DataType::Struct(logical_struct_fields), false)
+            .with_metadata(HashMap::from([("top_meta".to_string(), "1".to_string())])),
+    ]));
+
+    let mut cfg = SessionConfig::new()
+        .with_collect_statistics(false)
+        .with_parquet_pruning(false)
+        .with_parquet_page_index_pruning(false);
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
+
+    let listing_table_config =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory));
+
+    let table = ListingTable::try_new(listing_table_config).unwrap();
+    ctx.register_table("t", Arc::new(table)).unwrap();
+
+    let batches = ctx
+        .sql("SELECT s FROM t")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    assert_eq!(batches.len(), 1);
+
+    // Verify top-level metadata propagation
+    let output_schema = batches[0].schema();
+    let s_field = output_schema.field_with_name("s").unwrap();
+    assert_eq!(
+        s_field.metadata().get("top_meta").map(String::as_str),
+        Some("1")
+    );
+
+    // Verify nested struct type/field propagation + values
+    let s_array = batches[0]
+        .column(0)
+        .as_any()
+        .downcast_ref::<StructArray>()
+        .expect("expected struct array");
+
+    let id_array = s_array
+        .column_by_name("id")
+        .expect("id column")
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .expect("id should be cast to Int64");
+    assert_eq!(id_array.values(), &[1, 2, 3]);
+
+    let extra_array = s_array.column_by_name("extra").expect("extra column");
+    assert_eq!(extra_array.null_count(), 3);
+
+    // Verify nested field metadata propagation
+    let extra_field = match s_field.data_type() {
+        DataType::Struct(fields) => fields
+            .iter()
+            .find(|f| f.name() == "extra")
+            .expect("extra field"),
+        other => panic!("expected struct type for s, got {other:?}"),
+    };
+    assert_eq!(
+        extra_field
+            .metadata()
+            .get("nested_meta")
+            .map(String::as_str),
+        Some("1")
+    );
+
+    // Smoke test: filtering on a missing nested field evaluates correctly
+    let filtered = ctx
+        .sql("SELECT get_field(s, 'extra') AS extra FROM t WHERE get_field(s, 'extra') IS NULL")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    assert_eq!(filtered.len(), 1);
+    assert_eq!(filtered[0].num_rows(), 3);
+    let extra = filtered[0]
+        .column(0)
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .expect("extra should be a boolean array");
+    assert_eq!(extra.null_count(), 3);
+
+    Ok(())
+}
+
+/// Test demonstrating that a single PhysicalExprAdapterFactory instance can be
+/// reused across multiple ListingTable instances.
+///
+/// This addresses the concern: "This is important for ListingTable. A test for
+/// ListingTable would add assurance that the functionality is retained [i.e. we
+/// can re-use a PhysicalExprAdapterFactory]"
+#[tokio::test]
+async fn test_physical_expr_adapter_factory_reuse_across_tables() {
+    // Create two different parquet files with different schemas
+    // File 1: has column c1 only
+    let batch1 = record_batch!(("c1", Int32, [1, 2, 3])).unwrap();
+    // File 2: has column c1 only but different data
+    let batch2 = record_batch!(("c1", Int32, [10, 20, 30])).unwrap();
+
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let store_url = ObjectStoreUrl::parse("memory://").unwrap();
+
+    // Write files to different paths
+    write_parquet(batch1, store.clone(), "table1/data.parquet").await;
+    write_parquet(batch2, store.clone(), "table2/data.parquet").await;
+
+    // Table schema has additional columns that don't exist in files
+    let table_schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Int64, false),
+        Field::new("c2", DataType::Utf8, true), // missing from files
+    ]));
+
+    let mut cfg = SessionConfig::new()
+        .with_collect_statistics(false)
+        .with_parquet_pruning(false);
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.register_object_store(store_url.as_ref(), Arc::clone(&store));
+
+    // Create ONE factory instance wrapped in Arc - this will be REUSED
+    let factory: Arc<dyn PhysicalExprAdapterFactory> =
+        Arc::new(CustomPhysicalExprAdapterFactory);
+
+    // Create ListingTable 1 using the shared factory
+    let listing_table_config1 =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///table1/").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::clone(&factory)); // Clone the Arc, not create new factory
+
+    let table1 = ListingTable::try_new(listing_table_config1).unwrap();
+    ctx.register_table("t1", Arc::new(table1)).unwrap();
+
+    // Create ListingTable 2 using the SAME factory instance
+    let listing_table_config2 =
+        ListingTableConfig::new(ListingTableUrl::parse("memory:///table2/").unwrap())
+            .infer_options(&ctx.state())
+            .await
+            .unwrap()
+            .with_schema(table_schema.clone())
+            .with_expr_adapter_factory(Arc::clone(&factory)); // Reuse same factory
+
+    let table2 = ListingTable::try_new(listing_table_config2).unwrap();
+    ctx.register_table("t2", Arc::new(table2)).unwrap();
+
+    // Verify table 1 works correctly with the shared factory
+    // CustomPhysicalExprAdapterFactory fills missing Utf8 columns with 'b'
+    let batches = ctx
+        .sql("SELECT c1, c2 FROM t1 ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let expected = [
+        "+----+----+",
+        "| c1 | c2 |",
+        "+----+----+",
+        "| 1  | b  |",
+        "| 2  | b  |",
+        "| 3  | b  |",
+        "+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    // Verify table 2 also works correctly with the SAME shared factory
+    let batches = ctx
+        .sql("SELECT c1, c2 FROM t2 ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let expected = [
+        "+----+----+",
+        "| c1 | c2 |",
+        "+----+----+",
+        "| 10 | b  |",
+        "| 20 | b  |",
+        "| 30 | b  |",
+        "+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    // Verify predicates work on both tables with the shared factory
+    let batches = ctx
+        .sql("SELECT c1 FROM t1 WHERE c2 = 'b' ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| c1 |",
+        "+----+",
+        "| 1  |",
+        "| 2  |",
+        "| 3  |",
+        "+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+
+    let batches = ctx
+        .sql("SELECT c1 FROM t2 WHERE c2 = 'b' ORDER BY c1")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+",
+        "| c1 |",
+        "+----+",
+        "| 10 |",
+        "| 20 |",
+        "| 30 |",
+        "+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+}
diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs
index a5397c5a397ca..9ff8137687c95 100644
--- a/datafusion/core/tests/parquet/external_access_plan.rs
+++ b/datafusion/core/tests/parquet/external_access_plan.rs
@@ -21,7 +21,7 @@ use std::path::Path;
 use std::sync::Arc;
 
 use crate::parquet::utils::MetricsFinder;
-use crate::parquet::{create_data_batch, Scenario};
+use crate::parquet::{Scenario, create_data_batch};
 
 use arrow::datatypes::SchemaRef;
 use arrow::util::pretty::pretty_format_batches;
@@ -29,17 +29,17 @@ use datafusion::common::Result;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::prelude::SessionContext;
-use datafusion_common::{assert_contains, DFSchema};
+use datafusion_common::{DFSchema, assert_contains};
 use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess};
 use datafusion_execution::object_store::ObjectStoreUrl;
-use datafusion_expr::{col, lit, Expr};
-use datafusion_physical_plan::metrics::MetricsSet;
+use datafusion_expr::{Expr, col, lit};
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::metrics::{MetricValue, MetricsSet};
 
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::source::DataSourceExec;
-use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 use parquet::arrow::ArrowWriter;
+use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 use parquet::file::properties::WriterProperties;
 use tempfile::NamedTempFile;
 
@@ -178,12 +178,21 @@ async fn plan_and_filter() {
     .unwrap();
 
     // Verify that row group pruning still happens for just that group
-    let row_groups_pruned_statistics =
-        metric_value(&parquet_metrics, "row_groups_pruned_statistics").unwrap();
-    assert_eq!(
-        row_groups_pruned_statistics, 1,
-        "metrics : {parquet_metrics:#?}",
-    );
+    let row_groups_pruned_statistics = parquet_metrics
+        .sum_by_name("row_groups_pruned_statistics")
+        .unwrap();
+    if let MetricValue::PruningMetrics {
+        pruning_metrics, ..
+    } = row_groups_pruned_statistics
+    {
+        assert_eq!(
+            pruning_metrics.pruned(),
+            1,
+            "metrics : {parquet_metrics:#?}",
+        );
+    } else {
+        unreachable!("metrics `row_groups_pruned_statistics` should exist")
+    }
 }
 
 #[tokio::test]
@@ -248,7 +257,10 @@ async fn bad_selection() {
     .await
     .unwrap_err();
     let err_string = err.to_string();
-    assert_contains!(&err_string, "Internal error: Invalid ParquetAccessPlan Selection. Row group 0 has 5 rows but selection only specifies 4 rows");
+    assert_contains!(
+        &err_string,
+        "Row group 0 has 5 rows but selection only specifies 4 rows."
+    );
 }
 
 /// Return a RowSelection of 1 rows from a row group of 5 rows
@@ -346,11 +358,11 @@ impl TestFull {
         let source = if let Some(predicate) = predicate {
             let df_schema = DFSchema::try_from(schema.clone())?;
             let predicate = ctx.create_physical_expr(predicate, &df_schema)?;
-            Arc::new(ParquetSource::default().with_predicate(predicate))
+            Arc::new(ParquetSource::new(schema.clone()).with_predicate(predicate))
         } else {
-            Arc::new(ParquetSource::default())
+            Arc::new(ParquetSource::new(schema.clone()))
         };
-        let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source)
+        let config = FileScanConfigBuilder::new(object_store_url, source)
             .with_file(partitioned_file)
             .build();
 
@@ -397,7 +409,7 @@ fn get_test_data() -> TestData {
         .expect("tempfile creation");
 
     let props = WriterProperties::builder()
-        .set_max_row_group_size(row_per_group)
+        .set_max_row_group_row_count(Some(row_per_group))
         .build();
 
     let batches = create_data_batch(scenario);
diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs
index a60beaf665e55..fdefdafa00aa4 100644
--- a/datafusion/core/tests/parquet/file_statistics.rs
+++ b/datafusion/core/tests/parquet/file_statistics.rs
@@ -18,30 +18,30 @@
 use std::fs;
 use std::sync::Arc;
 
+use datafusion::datasource::TableProvider;
 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
 };
 use datafusion::datasource::source::DataSourceExec;
-use datafusion::datasource::TableProvider;
 use datafusion::execution::context::SessionState;
 use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion::prelude::SessionContext;
-use datafusion_common::stats::Precision;
 use datafusion_common::DFSchema;
+use datafusion_common::stats::Precision;
+use datafusion_execution::cache::DefaultListFilesCache;
 use datafusion_execution::cache::cache_manager::CacheManagerConfig;
-use datafusion_execution::cache::cache_unit::{
-    DefaultFileStatisticsCache, DefaultListFilesCache,
-};
+use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
 use datafusion_execution::config::SessionConfig;
 use datafusion_execution::runtime_env::RuntimeEnvBuilder;
-use datafusion_expr::{col, lit, Expr};
+use datafusion_expr::{Expr, col, lit};
 
 use datafusion::datasource::physical_plan::FileScanConfig;
-use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
+use datafusion_common::config::ConfigOptions;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
-use datafusion_physical_plan::filter::FilterExec;
+use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::filter::FilterExec;
 use tempfile::tempdir;
 
 #[tokio::test]
@@ -55,7 +55,7 @@ async fn check_stats_precision_with_filter_pushdown() {
     let table = get_listing_table(&table_path, None, &opt).await;
 
     let (_, _, state) = get_cache_runtime_state();
-    let mut options = state.config().options().clone();
+    let mut options: ConfigOptions = state.config().options().as_ref().clone();
     options.execution.parquet.pushdown_filters = true;
 
     // Scan without filter, stats are exact
@@ -71,7 +71,7 @@ async fn check_stats_precision_with_filter_pushdown() {
     // source operator after the appropriate optimizer pass.
     let filter_expr = Expr::gt(col("id"), lit(1));
     let exec_with_filter = table
-        .scan(&state, None, &[filter_expr.clone()], None)
+        .scan(&state, None, std::slice::from_ref(&filter_expr), None)
         .await
         .unwrap();
 
@@ -126,8 +126,9 @@ async fn load_table_stats_with_session_level_cache() {
     );
     assert_eq!(
         exec1.partition_statistics(None).unwrap().total_byte_size,
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-        Precision::Exact(671),
+        // Byte size is absent because we cannot estimate the output size
+        // of the Arrow data since there are variable length columns.
+        Precision::Absent,
     );
     assert_eq!(get_static_cache_size(&state1), 1);
 
@@ -141,8 +142,8 @@ async fn load_table_stats_with_session_level_cache() {
     );
     assert_eq!(
         exec2.partition_statistics(None).unwrap().total_byte_size,
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-        Precision::Exact(671),
+        // Absent because the data contains variable length columns
+        Precision::Absent,
     );
     assert_eq!(get_static_cache_size(&state2), 1);
 
@@ -156,8 +157,8 @@ async fn load_table_stats_with_session_level_cache() {
     );
     assert_eq!(
         exec3.partition_statistics(None).unwrap().total_byte_size,
-        // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-        Precision::Exact(671),
+        // Absent because the data contains variable length columns
+        Precision::Absent,
     );
     // List same file no increase
     assert_eq!(get_static_cache_size(&state1), 1);
diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs
index b8d570916c7c5..e6266b2c088d7 100644
--- a/datafusion/core/tests/parquet/filter_pushdown.rs
+++ b/datafusion/core/tests/parquet/filter_pushdown.rs
@@ -26,18 +26,19 @@
 //! select * from data limit 10;
 //! ```
 
-use std::path::Path;
-
 use arrow::compute::concat_batches;
 use arrow::record_batch::RecordBatch;
 use datafusion::physical_plan::collect;
-use datafusion::physical_plan::metrics::MetricsSet;
+use datafusion::physical_plan::metrics::{MetricValue, MetricsSet};
 use datafusion::prelude::{
-    col, lit, lit_timestamp_nano, Expr, ParquetReadOptions, SessionContext,
+    Expr, ParquetReadOptions, SessionContext, col, lit, lit_timestamp_nano,
 };
 use datafusion::test_util::parquet::{ParquetScanOptions, TestParquetFile};
 use datafusion_expr::utils::{conjunction, disjunction, split_conjunction};
+use std::path::Path;
 
+use datafusion_common::test_util::parquet_test_data;
+use datafusion_execution::config::SessionConfig;
 use itertools::Itertools;
 use parquet::file::properties::WriterProperties;
 use tempfile::TempDir;
@@ -62,7 +63,7 @@ async fn single_file() {
 
     // Set the row group size smaller so can test with fewer rows
     let props = WriterProperties::builder()
-        .set_max_row_group_size(1024)
+        .set_max_row_group_row_count(Some(1024))
         .build();
 
     // Only create the parquet file once as it is fairly large
@@ -219,7 +220,6 @@ async fn single_file() {
 }
 
 #[tokio::test]
-#[allow(dead_code)]
 async fn single_file_small_data_pages() {
     let batches = read_parquet_test_data(
         "tests/data/filter_pushdown/single_file_small_pages.gz.parquet",
@@ -230,7 +230,7 @@ async fn single_file_small_data_pages() {
 
     // Set a low row count limit to improve page filtering
     let props = WriterProperties::builder()
-        .set_max_row_group_size(2048)
+        .set_max_row_group_row_count(Some(2048))
         .set_data_page_row_count_limit(512)
         .set_write_batch_size(512)
         .build();
@@ -562,9 +562,9 @@ impl<'a> TestCase<'a> {
             }
         };
 
-        let page_index_rows_pruned = get_value(&metrics, "page_index_rows_pruned");
+        let (page_index_rows_pruned, page_index_rows_matched) =
+            get_pruning_metrics(&metrics, "page_index_rows_pruned");
         println!(" page_index_rows_pruned: {page_index_rows_pruned}");
-        let page_index_rows_matched = get_value(&metrics, "page_index_rows_matched");
         println!(" page_index_rows_matched: {page_index_rows_matched}");
 
         let page_index_filtering_expected = if scan_options.enable_page_index {
@@ -591,13 +591,158 @@ impl<'a> TestCase<'a> {
     }
 }
 
+fn get_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) {
+    match metrics.sum_by_name(metric_name) {
+        Some(MetricValue::PruningMetrics {
+            pruning_metrics, ..
+        }) => (pruning_metrics.pruned(), pruning_metrics.matched()),
+        Some(_) => {
+            panic!("Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}")
+        }
+        None => panic!(
+            "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+        ),
+    }
+}
+
 fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
     match metrics.sum_by_name(metric_name) {
+        Some(MetricValue::PruningMetrics {
+            pruning_metrics, ..
+        }) => pruning_metrics.pruned(),
         Some(v) => v.as_usize(),
-        _ => {
-            panic!(
-                "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
-            );
-        }
+        None => panic!(
+            "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+        ),
+    }
+}
+
+#[tokio::test]
+async fn predicate_cache_default() -> datafusion_common::Result<()> {
+    let ctx = SessionContext::new();
+    // The cache is on by default, but not used unless filter pushdown is enabled
+    PredicateCacheTest {
+        expected_inner_records: 0,
+        expected_records: 0,
+    }
+    .run(&ctx)
+    .await
+}
+
+#[tokio::test]
+async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> {
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    let ctx = SessionContext::new_with_config(config);
+    // The cache is on by default, and used when filter pushdown is enabled
+    PredicateCacheTest {
+        expected_inner_records: 8,
+        expected_records: 7, // reads more than necessary from the cache as then another bitmap is applied
+    }
+    .run(&ctx)
+    .await
+}
+
+#[tokio::test]
+async fn predicate_cache_stats_issue_19561() -> datafusion_common::Result<()> {
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    // force to get multiple batches to trigger repeated metric compound bug
+    config.options_mut().execution.batch_size = 1;
+    let ctx = SessionContext::new_with_config(config);
+    // The cache is on by default, and used when filter pushdown is enabled
+    PredicateCacheTest {
+        expected_inner_records: 8,
+        expected_records: 4,
+    }
+    .run(&ctx)
+    .await
+}
+
+#[tokio::test]
+async fn predicate_cache_pushdown_default_selections_only()
+-> datafusion_common::Result<()> {
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    // forcing filter selections minimizes the number of rows read from the cache
+    config
+        .options_mut()
+        .execution
+        .parquet
+        .force_filter_selections = true;
+    let ctx = SessionContext::new_with_config(config);
+    // The cache is on by default, and used when filter pushdown is enabled
+    PredicateCacheTest {
+        expected_inner_records: 8,
+        expected_records: 4,
+    }
+    .run(&ctx)
+    .await
+}
+
+#[tokio::test]
+async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> {
+    // Can disable the cache even with filter pushdown by setting the size to 0.
+    // This results in no records read from the cache and no metrics reported
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    config
+        .options_mut()
+        .execution
+        .parquet
+        .max_predicate_cache_size = Some(0);
+    let ctx = SessionContext::new_with_config(config);
+    // Since the cache is disabled, there is no reporting or use of the cache
+    PredicateCacheTest {
+        expected_inner_records: 0,
+        expected_records: 0,
+    }
+    .run(&ctx)
+    .await
+}
+
+/// Runs the query "SELECT * FROM alltypes_plain WHERE double_col != 0.0"
+/// with a given SessionContext and asserts that the predicate cache metrics
+/// are as expected
+#[derive(Debug)]
+struct PredicateCacheTest {
+    /// Expected records read from the underlying reader (to evaluate filters)
+    /// -- this is the total number of records in the file
+    expected_inner_records: usize,
+    /// Expected records to be read from the cache (after filtering)
+    expected_records: usize,
+}
+
+impl PredicateCacheTest {
+    async fn run(self, ctx: &SessionContext) -> datafusion_common::Result<()> {
+        let Self {
+            expected_inner_records,
+            expected_records,
+        } = self;
+        // Create a dataframe that scans the "alltypes_plain.parquet" file with
+        // a filter on `double_col != 0.0`
+        let path = parquet_test_data() + "/alltypes_plain.parquet";
+        let exec = ctx
+            .read_parquet(path, ParquetReadOptions::default())
+            .await?
+            .filter(col("double_col").not_eq(lit(0.0)))?
+            .create_physical_plan()
+            .await?;
+
+        // run the plan to completion
+        let _ = collect(exec.clone(), ctx.task_ctx()).await?; // run plan
+        let metrics =
+            TestParquetFile::parquet_metrics(&exec).expect("found parquet metrics");
+
+        // verify the predicate cache metrics
+        assert_eq!(
+            get_value(&metrics, "predicate_cache_inner_records"),
+            expected_inner_records
+        );
+        assert_eq!(
+            get_value(&metrics, "predicate_cache_records"),
+            expected_records
+        );
+        Ok(())
     }
 }
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index 87a5ed33f127d..0535ddd9247d4 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -19,33 +19,39 @@
 use crate::parquet::utils::MetricsFinder;
 use arrow::{
     array::{
-        make_array, Array, ArrayRef, BinaryArray, Date32Array, Date64Array,
-        Decimal128Array, DictionaryArray, FixedSizeBinaryArray, Float64Array, Int16Array,
-        Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray,
-        StringArray, TimestampMicrosecondArray, TimestampMillisecondArray,
-        TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array,
-        UInt64Array, UInt8Array,
+        Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Decimal128Array,
+        DictionaryArray, FixedSizeBinaryArray, Float64Array, Int8Array, Int16Array,
+        Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, StringArray,
+        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+        TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
+        make_array,
     },
     datatypes::{DataType, Field, Schema},
     record_batch::RecordBatch,
     util::pretty::pretty_format_batches,
 };
+use arrow_schema::SchemaRef;
 use chrono::{Datelike, Duration, TimeDelta};
 use datafusion::{
-    datasource::{provider_as_source, TableProvider},
+    datasource::{TableProvider, provider_as_source},
     physical_plan::metrics::MetricsSet,
     prelude::{ParquetReadOptions, SessionConfig, SessionContext},
 };
 use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
+use datafusion_physical_plan::metrics::MetricValue;
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::{EnabledStatistics, WriterProperties};
 use std::sync::Arc;
 use tempfile::NamedTempFile;
 
 mod custom_reader;
+#[cfg(feature = "parquet_encryption")]
+mod encryption;
+mod expr_adapter;
 mod external_access_plan;
 mod file_statistics;
 mod filter_pushdown;
+mod ordering;
 mod page_pruning;
 mod row_group_pruning;
 mod schema;
@@ -105,13 +111,33 @@ struct ContextWithParquet {
     ctx: SessionContext,
 }
 
+struct PruningMetric {
+    total_pruned: usize,
+    total_matched: usize,
+    total_fully_matched: usize,
+}
+
+impl PruningMetric {
+    pub fn total_pruned(&self) -> usize {
+        self.total_pruned
+    }
+
+    pub fn total_matched(&self) -> usize {
+        self.total_matched
+    }
+
+    pub fn total_fully_matched(&self) -> usize {
+        self.total_fully_matched
+    }
+}
+
 /// The output of running one of the test cases
 struct TestOutput {
-    /// The input string
+    /// The input query SQL
     sql: String,
     /// Execution metrics for the Parquet Scan
     parquet_metrics: MetricsSet,
-    /// number of rows in results
+    /// number of actual rows in results
     result_rows: usize,
     /// the contents of the input, as a string
     pretty_input: String,
@@ -122,9 +148,50 @@ struct TestOutput {
 impl TestOutput {
     /// retrieve the value of the named metric, if any
     fn metric_value(&self, metric_name: &str) -> Option<usize> {
+        if let Some(pm) = self.pruning_metric(metric_name) {
+            return Some(pm.total_pruned());
+        }
+
         self.parquet_metrics
             .sum(|metric| metric.value().name() == metric_name)
-            .map(|v| v.as_usize())
+            .map(|v| match v {
+                MetricValue::PruningMetrics {
+                    pruning_metrics, ..
+                } => pruning_metrics.pruned(),
+                _ => v.as_usize(),
+            })
+    }
+
+    fn pruning_metric(&self, metric_name: &str) -> Option<PruningMetric> {
+        let mut total_pruned = 0;
+        let mut total_matched = 0;
+        let mut total_fully_matched = 0;
+        let mut found = false;
+
+        for metric in self.parquet_metrics.iter() {
+            let metric = metric.as_ref();
+            if metric.value().name() == metric_name
+                && let MetricValue::PruningMetrics {
+                    pruning_metrics, ..
+                } = metric.value()
+            {
+                total_pruned += pruning_metrics.pruned();
+                total_matched += pruning_metrics.matched();
+                total_fully_matched += pruning_metrics.fully_matched();
+
+                found = true;
+            }
+        }
+
+        if found {
+            Some(PruningMetric {
+                total_pruned,
+                total_matched,
+                total_fully_matched,
+            })
+        } else {
+            None
+        }
     }
 
     /// The number of times the pruning predicate evaluation errors
@@ -132,43 +199,63 @@ impl TestOutput {
         self.metric_value("predicate_evaluation_errors")
     }
 
-    /// The number of row_groups matched by bloom filter
-    fn row_groups_matched_bloom_filter(&self) -> Option<usize> {
-        self.metric_value("row_groups_matched_bloom_filter")
-    }
-
-    /// The number of row_groups pruned by bloom filter
-    fn row_groups_pruned_bloom_filter(&self) -> Option<usize> {
-        self.metric_value("row_groups_pruned_bloom_filter")
+    /// The number of row_groups pruned / matched by bloom filter
+    fn row_groups_bloom_filter(&self) -> Option<PruningMetric> {
+        self.pruning_metric("row_groups_pruned_bloom_filter")
     }
 
     /// The number of row_groups matched by statistics
     fn row_groups_matched_statistics(&self) -> Option<usize> {
-        self.metric_value("row_groups_matched_statistics")
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|pm| pm.total_matched())
+    }
+
+    /// The number of row_groups fully matched by statistics
+    fn row_groups_fully_matched_statistics(&self) -> Option<usize> {
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|pm| pm.total_fully_matched())
     }
 
     /// The number of row_groups pruned by statistics
     fn row_groups_pruned_statistics(&self) -> Option<usize> {
-        self.metric_value("row_groups_pruned_statistics")
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|pm| pm.total_pruned())
+    }
+
+    /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count,
+    /// for testing purpose, here it only aggregate the `pruned` count.
+    fn files_ranges_pruned_statistics(&self) -> Option<usize> {
+        self.pruning_metric("files_ranges_pruned_statistics")
+            .map(|pm| pm.total_pruned())
     }
 
     /// The number of row_groups matched by bloom filter or statistics
+    ///
+    /// E.g. starting with 10 row groups, statistics: 10 total -> 7 matched, bloom
+    /// filter: 7 total -> 3 matched, this function returns 3 for the final matched
+    /// count.
     fn row_groups_matched(&self) -> Option<usize> {
-        self.row_groups_matched_bloom_filter()
-            .zip(self.row_groups_matched_statistics())
-            .map(|(a, b)| a + b)
+        self.row_groups_bloom_filter().map(|pm| pm.total_matched())
     }
 
     /// The number of row_groups pruned
     fn row_groups_pruned(&self) -> Option<usize> {
-        self.row_groups_pruned_bloom_filter()
+        self.row_groups_bloom_filter()
+            .map(|pm| pm.total_pruned())
             .zip(self.row_groups_pruned_statistics())
             .map(|(a, b)| a + b)
     }
 
     /// The number of row pages pruned
     fn row_pages_pruned(&self) -> Option<usize> {
-        self.metric_value("page_index_rows_pruned")
+        self.pruning_metric("page_index_rows_pruned")
+            .map(|pm| pm.total_pruned())
+    }
+
+    /// The number of row groups pruned by limit pruning
+    fn limit_pruned_row_groups(&self) -> Option<usize> {
+        self.pruning_metric("limit_pruned_row_groups")
+            .map(|pm| pm.total_pruned())
     }
 
     fn description(&self) -> String {
@@ -184,18 +271,41 @@ impl TestOutput {
 /// and the appropriate scenario
 impl ContextWithParquet {
     async fn new(scenario: Scenario, unit: Unit) -> Self {
-        Self::with_config(scenario, unit, SessionConfig::new()).await
+        Self::with_config(scenario, unit, SessionConfig::new(), None, None).await
+    }
+
+    /// Set custom schema and batches for the test
+    pub async fn with_custom_data(
+        scenario: Scenario,
+        unit: Unit,
+        schema: Arc<Schema>,
+        batches: Vec<RecordBatch>,
+    ) -> Self {
+        Self::with_config(
+            scenario,
+            unit,
+            SessionConfig::new(),
+            Some(schema),
+            Some(batches),
+        )
+        .await
     }
 
     async fn with_config(
         scenario: Scenario,
         unit: Unit,
         mut config: SessionConfig,
+        custom_schema: Option<SchemaRef>,
+        custom_batches: Option<Vec<RecordBatch>>,
     ) -> Self {
+        // Use a single partition for deterministic results no matter how many CPUs the host has
+        config = config.with_target_partitions(1);
         let file = match unit {
             Unit::RowGroup(row_per_group) => {
                 config = config.with_parquet_bloom_filter_pruning(true);
-                make_test_file_rg(scenario, row_per_group).await
+                config.options_mut().execution.parquet.pushdown_filters = true;
+                make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches)
+                    .await
             }
             Unit::Page(row_per_page) => {
                 config = config.with_parquet_page_index_pruning(true);
@@ -466,9 +576,9 @@ fn make_uint_batches(start: u8, end: u8) -> RecordBatch {
         Field::new("u64", DataType::UInt64, true),
     ]));
     let v8: Vec<u8> = (start..end).collect();
-    let v16: Vec<u16> = (start as _..end as _).collect();
-    let v32: Vec<u32> = (start as _..end as _).collect();
-    let v64: Vec<u64> = (start as _..end as _).collect();
+    let v16: Vec<u16> = (start as u16..end as u16).collect();
+    let v32: Vec<u32> = (start as u32..end as u32).collect();
+    let v64: Vec<u64> = (start as u64..end as u64).collect();
     RecordBatch::try_new(
         schema,
         vec![
@@ -602,6 +712,7 @@ fn make_date_batch(offset: Duration) -> RecordBatch {
 /// of the column. It is *not* a table named service.name
 ///
 /// name | service.name
+#[expect(clippy::needless_pass_by_value)]
 fn make_bytearray_batch(
     name: &str,
     string_values: Vec<&str>,
@@ -657,6 +768,7 @@ fn make_bytearray_batch(
 /// of the column. It is *not* a table named service.name
 ///
 /// name | service.name
+#[expect(clippy::needless_pass_by_value)]
 fn make_names_batch(name: &str, service_name_values: Vec<&str>) -> RecordBatch {
     let num_rows = service_name_values.len();
     let name: StringArray = std::iter::repeat_n(Some(name), num_rows).collect();
@@ -741,6 +853,7 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch {
     .unwrap()
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn make_dictionary_batch(strings: Vec<&str>, integers: Vec<i32>) -> RecordBatch {
     let keys = Int32Array::from_iter(0..strings.len() as i32);
     let small_keys = Int16Array::from_iter(0..strings.len() as i16);
@@ -789,6 +902,7 @@ fn make_dictionary_batch(strings: Vec<&str>, integers: Vec<i32>) -> RecordBatch
     .unwrap()
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
     match scenario {
         Scenario::Timestamps => {
@@ -1021,7 +1135,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
 }
 
 /// Create a test parquet file with various data types
-async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
+async fn make_test_file_rg(
+    scenario: Scenario,
+    row_per_group: usize,
+    custom_schema: Option<SchemaRef>,
+    custom_batches: Option<Vec<RecordBatch>>,
+) -> NamedTempFile {
     let mut output_file = tempfile::Builder::new()
         .prefix("parquet_pruning")
         .suffix(".parquet")
@@ -1029,13 +1148,19 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem
         .expect("tempfile creation");
 
     let props = WriterProperties::builder()
-        .set_max_row_group_size(row_per_group)
+        .set_max_row_group_row_count(Some(row_per_group))
         .set_bloom_filter_enabled(true)
         .set_statistics_enabled(EnabledStatistics::Page)
         .build();
 
-    let batches = create_data_batch(scenario);
-    let schema = batches[0].schema();
+    let (batches, schema) =
+        if let (Some(schema), Some(batches)) = (custom_schema, custom_batches) {
+            (batches, schema)
+        } else {
+            let batches = create_data_batch(scenario);
+            let schema = batches[0].schema();
+            (batches, schema)
+        };
 
     let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();
 
diff --git a/datafusion/core/tests/parquet/ordering.rs b/datafusion/core/tests/parquet/ordering.rs
new file mode 100644
index 0000000000000..faecb4ca6a861
--- /dev/null
+++ b/datafusion/core/tests/parquet/ordering.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for ordering in Parquet sorting_columns metadata
+
+use datafusion::prelude::SessionContext;
+use datafusion_common::Result;
+use tempfile::tempdir;
+
+/// Test that CREATE TABLE ... WITH ORDER writes sorting_columns to Parquet metadata
+#[tokio::test]
+async fn test_create_table_with_order_writes_sorting_columns() -> Result<()> {
+    use parquet::file::reader::FileReader;
+    use parquet::file::serialized_reader::SerializedFileReader;
+    use std::fs::File;
+
+    let ctx = SessionContext::new();
+    let tmp_dir = tempdir()?;
+    let table_path = tmp_dir.path().join("sorted_table");
+    std::fs::create_dir_all(&table_path)?;
+
+    // Create external table with ordering
+    let create_table_sql = format!(
+        "CREATE EXTERNAL TABLE sorted_data (a INT, b VARCHAR) \
+         STORED AS PARQUET \
+         LOCATION '{}' \
+         WITH ORDER (a ASC NULLS FIRST, b DESC NULLS LAST)",
+        table_path.display()
+    );
+    ctx.sql(&create_table_sql).await?;
+
+    // Insert sorted data
+    ctx.sql("INSERT INTO sorted_data VALUES (1, 'x'), (2, 'y'), (3, 'z')")
+        .await?
+        .collect()
+        .await?;
+
+    // Find the parquet file that was written
+    let parquet_files: Vec<_> = std::fs::read_dir(&table_path)?
+        .filter_map(|e| e.ok())
+        .filter(|e| e.path().extension().is_some_and(|ext| ext == "parquet"))
+        .collect();
+
+    assert!(
+        !parquet_files.is_empty(),
+        "Expected at least one parquet file in {}",
+        table_path.display()
+    );
+
+    // Read the parquet file and verify sorting_columns metadata
+    let file = File::open(parquet_files[0].path())?;
+    let reader = SerializedFileReader::new(file)?;
+    let metadata = reader.metadata();
+
+    // Check that row group has sorting_columns
+    let row_group = metadata.row_group(0);
+    let sorting_columns = row_group.sorting_columns();
+
+    assert!(
+        sorting_columns.is_some(),
+        "Expected sorting_columns in row group metadata"
+    );
+    let sorting = sorting_columns.unwrap();
+    assert_eq!(sorting.len(), 2, "Expected 2 sorting columns");
+
+    // First column: a ASC NULLS FIRST (column_idx = 0)
+    assert_eq!(sorting[0].column_idx, 0, "First sort column should be 'a'");
+    assert!(
+        !sorting[0].descending,
+        "First column should be ASC (descending=false)"
+    );
+    assert!(
+        sorting[0].nulls_first,
+        "First column should have NULLS FIRST"
+    );
+
+    // Second column: b DESC NULLS LAST (column_idx = 1)
+    assert_eq!(sorting[1].column_idx, 1, "Second sort column should be 'b'");
+    assert!(
+        sorting[1].descending,
+        "Second column should be DESC (descending=true)"
+    );
+    assert!(
+        !sorting[1].nulls_first,
+        "Second column should have NULLS LAST"
+    );
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs
index 9da879a32f6b5..a41803191ad05 100644
--- a/datafusion/core/tests/parquet/page_pruning.rs
+++ b/datafusion/core/tests/parquet/page_pruning.rs
@@ -20,27 +20,35 @@ use std::sync::Arc;
 use crate::parquet::Unit::Page;
 use crate::parquet::{ContextWithParquet, Scenario};
 
-use datafusion::datasource::file_format::parquet::ParquetFormat;
+use arrow::array::{Int32Array, RecordBatch};
+use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::datasource::file_format::FileFormat;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::execution::context::SessionState;
-use datafusion::physical_plan::metrics::MetricValue;
 use datafusion::physical_plan::ExecutionPlan;
-use datafusion::prelude::SessionContext;
+use datafusion::physical_plan::metrics::MetricValue;
+use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_common::{ScalarValue, ToDFSchema};
 use datafusion_expr::execution_props::ExecutionProps;
-use datafusion_expr::{col, lit, Expr};
+use datafusion_expr::{Expr, col, lit};
 use datafusion_physical_expr::create_physical_expr;
 
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use futures::StreamExt;
-use object_store::path::Path;
 use object_store::ObjectMeta;
-
-async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec {
+use object_store::path::Path;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::WriterProperties;
+
+async fn get_parquet_exec(
+    state: &SessionState,
+    filter: Expr,
+    pushdown_filters: bool,
+) -> DataSourceExec {
     let object_store_url = ObjectStoreUrl::local_filesystem();
     let store = state.runtime_env().object_store(&object_store_url).unwrap();
 
@@ -62,63 +70,63 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec
         .await
         .unwrap();
 
-    let partitioned_file = PartitionedFile {
-        object_meta: meta,
-        partition_values: vec![],
-        range: None,
-        statistics: None,
-        extensions: None,
-        metadata_size_hint: None,
-    };
+    let partitioned_file = PartitionedFile::new_from_meta(meta);
 
     let df_schema = schema.clone().to_dfschema().unwrap();
     let execution_props = ExecutionProps::new();
     let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap();
 
     let source = Arc::new(
-        ParquetSource::default()
+        ParquetSource::new(schema.clone())
             .with_predicate(predicate)
-            .with_enable_page_index(true),
+            .with_enable_page_index(true)
+            .with_pushdown_filters(pushdown_filters),
     );
-    let base_config = FileScanConfigBuilder::new(object_store_url, schema, source)
+    let base_config = FileScanConfigBuilder::new(object_store_url, source)
         .with_file(partitioned_file)
         .build();
 
     DataSourceExec::new(Arc::new(base_config))
 }
 
+async fn get_filter_results(
+    state: &SessionState,
+    filter: Expr,
+    pushdown_filters: bool,
+) -> Vec<RecordBatch> {
+    let parquet_exec = get_parquet_exec(state, filter, pushdown_filters).await;
+    let task_ctx = state.task_ctx();
+    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
+    let mut batches = Vec::new();
+    while let Some(Ok(batch)) = results.next().await {
+        batches.push(batch);
+    }
+    batches
+}
+
 #[tokio::test]
 async fn page_index_filter_one_col() {
     let session_ctx = SessionContext::new();
     let state = session_ctx.state();
-    let task_ctx = state.task_ctx();
 
     // 1.create filter month == 1;
     let filter = col("month").eq(lit(1_i32));
 
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-
+    let batches = get_filter_results(&state, filter.clone(), false).await;
     // `month = 1` from the page index should create below RowSelection
     //  vec.push(RowSelector::select(312));
     //  vec.push(RowSelector::skip(3330));
     //  vec.push(RowSelector::select(339));
     //  vec.push(RowSelector::skip(3319));
     // total 651 row
-    assert_eq!(batch.num_rows(), 651);
+    assert_eq!(batches[0].num_rows(), 651);
+
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 620);
 
     // 2. create filter month == 1 or month == 2;
     let filter = col("month").eq(lit(1_i32)).or(col("month").eq(lit(2_i32)));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-
+    let batches = get_filter_results(&state, filter.clone(), false).await;
     // `month = 1` or `month = 2` from the page index should create below RowSelection
     //  vec.push(RowSelector::select(312));
     //  vec.push(RowSelector::skip(900));
@@ -128,95 +136,78 @@ async fn page_index_filter_one_col() {
     //  vec.push(RowSelector::skip(873));
     //  vec.push(RowSelector::select(318));
     //  vec.push(RowSelector::skip(2128));
-    assert_eq!(batch.num_rows(), 1281);
+    assert_eq!(batches[0].num_rows(), 1281);
+
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 1180);
 
     // 3. create filter month == 1 and month == 12;
     let filter = col("month")
         .eq(lit(1_i32))
         .and(col("month").eq(lit(12_i32)));
+    let batches = get_filter_results(&state, filter.clone(), false).await;
+    assert!(batches.is_empty());
 
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await;
-
-    assert!(batch.is_none());
+    let batches = get_filter_results(&state, filter, true).await;
+    assert!(batches.is_empty());
 
     // 4.create filter 0 < month < 2 ;
     let filter = col("month").gt(lit(0_i32)).and(col("month").lt(lit(2_i32)));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-
+    let batches = get_filter_results(&state, filter.clone(), false).await;
     // should same with `month = 1`
-    assert_eq!(batch.num_rows(), 651);
-
-    let session_ctx = SessionContext::new();
-    let task_ctx = session_ctx.task_ctx();
+    assert_eq!(batches[0].num_rows(), 651);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 620);
 
     // 5.create filter date_string_col == "01/01/09"`;
     // Note this test doesn't apply type coercion so the literal must match the actual view type
     let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-    let batch = results.next().await.unwrap().unwrap();
+    let batches = get_filter_results(&state, filter.clone(), false).await;
+    assert_eq!(batches[0].num_rows(), 14);
 
     // there should only two pages match the filter
     //                                  min                                        max
     // page-20                        0  01/01/09                                  01/02/09
     // page-21                        0  01/01/09                                  01/01/09
     // each 7 rows
-    assert_eq!(batch.num_rows(), 14);
+    assert_eq!(batches[0].num_rows(), 14);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 10);
 }
 
 #[tokio::test]
 async fn page_index_filter_multi_col() {
     let session_ctx = SessionContext::new();
     let state = session_ctx.state();
-    let task_ctx = session_ctx.task_ctx();
 
     // create filter month == 1 and year = 2009;
     let filter = col("month").eq(lit(1_i32)).and(col("year").eq(lit(2009)));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-
+    let batches = get_filter_results(&state, filter.clone(), false).await;
     //  `year = 2009` from the page index should create below RowSelection
     //  vec.push(RowSelector::select(3663));
     //  vec.push(RowSelector::skip(3642));
     //  combine with `month = 1` total 333 row
-    assert_eq!(batch.num_rows(), 333);
+    assert_eq!(batches[0].num_rows(), 333);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 310);
 
     // create filter (year = 2009 or id = 1) and month = 1;
     // this should only use `month = 1` to evaluate the page index.
     let filter = col("month")
         .eq(lit(1_i32))
         .and(col("year").eq(lit(2009)).or(col("id").eq(lit(1))));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-    assert_eq!(batch.num_rows(), 651);
+    let batches = get_filter_results(&state, filter.clone(), false).await;
+    assert_eq!(batches[0].num_rows(), 651);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 310);
 
     // create filter (year = 2009 or id = 1)
     // this filter use two columns will not push down
     let filter = col("year").eq(lit(2009)).or(col("id").eq(lit(1)));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-    assert_eq!(batch.num_rows(), 7300);
+    let batches = get_filter_results(&state, filter.clone(), false).await;
+    assert_eq!(batches[0].num_rows(), 7300);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 3650);
 
     // create filter (year = 2009 and id = 1) or (year = 2010)
     // this filter use two columns will not push down
@@ -226,13 +217,10 @@ async fn page_index_filter_multi_col() {
         .eq(lit(2009))
         .and(col("id").eq(lit(1)))
         .or(col("year").eq(lit(2010)));
-
-    let parquet_exec = get_parquet_exec(&state, filter).await;
-
-    let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
-
-    let batch = results.next().await.unwrap().unwrap();
-    assert_eq!(batch.num_rows(), 7300);
+    let batches = get_filter_results(&state, filter.clone(), false).await;
+    assert_eq!(batches[0].num_rows(), 7300);
+    let batches = get_filter_results(&state, filter, true).await;
+    assert_eq!(batches[0].num_rows(), 3651);
 }
 
 async fn test_prune(
@@ -378,281 +366,367 @@ async fn prune_date64() {
 }
 
 macro_rules! int_tests {
-    ($bits:expr) => {
-        paste::item! {
-            #[tokio::test]
-            //                      null count  min                                       max
-            // page-0                         0  -5                                        -1
-            // page-1                         0  -4                                        0
-            // page-2                         0  0                                         4
-            // page-3                         0  5                                         9
-            async fn [<prune_int $bits _lt>]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{} < 1", $bits),
-                    Some(0),
-                    Some(5),
-                    11,
-                    5,
-                )
-                .await;
-                // result of sql "SELECT * FROM t where i < 1" is same as
-                // "SELECT * FROM t where -i > -1"
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where -i{} > -1", $bits),
-                    Some(0),
-                    Some(5),
-                    11,
-                    5,
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _gt >]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{} > 8", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5,
-                )
-                .await;
-
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where -i{} < -8", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5,
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq >]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{} = 1", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-            #[tokio::test]
-            async fn [<prune_int $bits _scalar_fun_and_eq >]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where abs(i{}) = 1  and i{} = 1", $bits, $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _scalar_fun >]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where abs(i{}) = 1", $bits),
-                    Some(0),
-                    Some(0),
-                    3,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _complex_expr>]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{}+1 = 1", $bits),
-                    Some(0),
-                    Some(0),
-                    2,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _complex_expr_subtract >]() {
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where 1-i{} > 1", $bits),
-                    Some(0),
-                    Some(0),
-                    9,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq_in_list >]() {
-                // result of sql "SELECT * FROM t where in (1)"
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{} in (1)", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq_in_list_negated >]() {
-                // result of sql "SELECT * FROM t where not in (1)" prune nothing
-                test_prune(
-                    Scenario::Int,
-                    &format!("SELECT * FROM t where i{} not in (1)", $bits),
-                    Some(0),
-                    Some(0),
-                    19,
-                    5
-                )
-                .await;
-            }
+    ($bits:expr, $fn_lt:ident, $fn_gt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_complex_expr_subtract:ident, $fn_eq_in_list:ident, $fn_eq_in_list_negated:ident) => {
+        #[tokio::test]
+        //                      null count  min                                       max
+        // page-0                         0  -5                                        -1
+        // page-1                         0  -4                                        0
+        // page-2                         0  0                                         4
+        // page-3                         0  5                                         9
+        async fn $fn_lt() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{} < 1", $bits),
+                Some(0),
+                Some(5),
+                11,
+                5,
+            )
+            .await;
+            // result of sql "SELECT * FROM t where i < 1" is same as
+            // "SELECT * FROM t where -i > -1"
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where -i{} > -1", $bits),
+                Some(0),
+                Some(5),
+                11,
+                5,
+            )
+            .await;
         }
-    }
+
+        #[tokio::test]
+        async fn $fn_gt() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{} > 8", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where -i{} < -8", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{} = 1", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+        #[tokio::test]
+        async fn $fn_scalar_fun_and_eq() {
+            test_prune(
+                Scenario::Int,
+                &format!(
+                    "SELECT * FROM t where abs(i{}) = 1  and i{} = 1",
+                    $bits, $bits
+                ),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_scalar_fun() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where abs(i{}) = 1", $bits),
+                Some(0),
+                Some(0),
+                3,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{}+1 = 1", $bits),
+                Some(0),
+                Some(0),
+                2,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr_subtract() {
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where 1-i{} > 1", $bits),
+                Some(0),
+                Some(0),
+                9,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list() {
+            // result of sql "SELECT * FROM t where in (1)"
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{} in (1)", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_negated() {
+            // result of sql "SELECT * FROM t where not in (1)" prune nothing
+            test_prune(
+                Scenario::Int,
+                &format!("SELECT * FROM t where i{} not in (1)", $bits),
+                Some(0),
+                Some(0),
+                19,
+                5,
+            )
+            .await;
+        }
+    };
 }
 
-int_tests!(8);
-int_tests!(16);
-int_tests!(32);
-int_tests!(64);
+int_tests!(
+    8,
+    prune_int8_lt,
+    prune_int8_gt,
+    prune_int8_eq,
+    prune_int8_scalar_fun_and_eq,
+    prune_int8_scalar_fun,
+    prune_int8_complex_expr,
+    prune_int8_complex_expr_subtract,
+    prune_int8_eq_in_list,
+    prune_int8_eq_in_list_negated
+);
+int_tests!(
+    16,
+    prune_int16_lt,
+    prune_int16_gt,
+    prune_int16_eq,
+    prune_int16_scalar_fun_and_eq,
+    prune_int16_scalar_fun,
+    prune_int16_complex_expr,
+    prune_int16_complex_expr_subtract,
+    prune_int16_eq_in_list,
+    prune_int16_eq_in_list_negated
+);
+int_tests!(
+    32,
+    prune_int32_lt,
+    prune_int32_gt,
+    prune_int32_eq,
+    prune_int32_scalar_fun_and_eq,
+    prune_int32_scalar_fun,
+    prune_int32_complex_expr,
+    prune_int32_complex_expr_subtract,
+    prune_int32_eq_in_list,
+    prune_int32_eq_in_list_negated
+);
+int_tests!(
+    64,
+    prune_int64_lt,
+    prune_int64_gt,
+    prune_int64_eq,
+    prune_int64_scalar_fun_and_eq,
+    prune_int64_scalar_fun,
+    prune_int64_complex_expr,
+    prune_int64_complex_expr_subtract,
+    prune_int64_eq_in_list,
+    prune_int64_eq_in_list_negated
+);
 
 macro_rules! uint_tests {
-    ($bits:expr) => {
-        paste::item! {
-            #[tokio::test]
-            //                      null count  min                                       max
-            // page-0                         0  0                                         4
-            // page-1                         0  1                                         5
-            // page-2                         0  5                                         9
-            // page-3                         0  250                                       254
-            async fn [<prune_uint $bits _lt>]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{} < 6", $bits),
-                    Some(0),
-                    Some(5),
-                    11,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _gt >]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{} > 253", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq >]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{} = 6", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _scalar_fun_and_eq >]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _scalar_fun >]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where power(u{}, 2) = 25", $bits),
-                    Some(0),
-                    Some(0),
-                    2,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _complex_expr>]() {
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{}+1 = 6", $bits),
-                    Some(0),
-                    Some(0),
-                    2,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq_in_list >]() {
-                // result of sql "SELECT * FROM t where in (1)"
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{} in (6)", $bits),
-                    Some(0),
-                    Some(15),
-                    1,
-                    5
-                )
-                .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq_in_list_negated >]() {
-                // result of sql "SELECT * FROM t where not in (6)" prune nothing
-                test_prune(
-                    Scenario::UInt,
-                    &format!("SELECT * FROM t where u{} not in (6)", $bits),
-                    Some(0),
-                    Some(0),
-                    19,
-                    5
-                )
-                .await;
-            }
+    ($bits:expr, $fn_lt:ident, $fn_gt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_eq_in_list:ident, $fn_eq_in_list_negated:ident) => {
+        #[tokio::test]
+        //                      null count  min                                       max
+        // page-0                         0  0                                         4
+        // page-1                         0  1                                         5
+        // page-2                         0  5                                         9
+        // page-3                         0  250                                       254
+        async fn $fn_lt() {
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{} < 6", $bits),
+                Some(0),
+                Some(5),
+                11,
+                5,
+            )
+            .await;
         }
-    }
+
+        #[tokio::test]
+        async fn $fn_gt() {
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{} > 253", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq() {
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{} = 6", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_scalar_fun_and_eq() {
+            test_prune(
+                Scenario::UInt,
+                &format!(
+                    "SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6",
+                    $bits, $bits
+                ),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_scalar_fun() {
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where power(u{}, 2) = 25", $bits),
+                Some(0),
+                Some(0),
+                2,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr() {
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{}+1 = 6", $bits),
+                Some(0),
+                Some(0),
+                2,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list() {
+            // result of sql "SELECT * FROM t where in (1)"
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{} in (6)", $bits),
+                Some(0),
+                Some(15),
+                1,
+                5,
+            )
+            .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_negated() {
+            // result of sql "SELECT * FROM t where not in (6)" prune nothing
+            test_prune(
+                Scenario::UInt,
+                &format!("SELECT * FROM t where u{} not in (6)", $bits),
+                Some(0),
+                Some(0),
+                19,
+                5,
+            )
+            .await;
+        }
+    };
 }
 
-uint_tests!(8);
-uint_tests!(16);
-uint_tests!(32);
-uint_tests!(64);
+uint_tests!(
+    8,
+    prune_uint8_lt,
+    prune_uint8_gt,
+    prune_uint8_eq,
+    prune_uint8_scalar_fun_and_eq,
+    prune_uint8_scalar_fun,
+    prune_uint8_complex_expr,
+    prune_uint8_eq_in_list,
+    prune_uint8_eq_in_list_negated
+);
+uint_tests!(
+    16,
+    prune_uint16_lt,
+    prune_uint16_gt,
+    prune_uint16_eq,
+    prune_uint16_scalar_fun_and_eq,
+    prune_uint16_scalar_fun,
+    prune_uint16_complex_expr,
+    prune_uint16_eq_in_list,
+    prune_uint16_eq_in_list_negated
+);
+uint_tests!(
+    32,
+    prune_uint32_lt,
+    prune_uint32_gt,
+    prune_uint32_eq,
+    prune_uint32_scalar_fun_and_eq,
+    prune_uint32_scalar_fun,
+    prune_uint32_complex_expr,
+    prune_uint32_eq_in_list,
+    prune_uint32_eq_in_list_negated
+);
+uint_tests!(
+    64,
+    prune_uint64_lt,
+    prune_uint64_gt,
+    prune_uint64_eq,
+    prune_uint64_scalar_fun_and_eq,
+    prune_uint64_scalar_fun,
+    prune_uint64_complex_expr,
+    prune_uint64_eq_in_list,
+    prune_uint64_eq_in_list_negated
+);
 
 #[tokio::test]
 //                      null count  min                                       max
@@ -911,8 +985,8 @@ async fn without_pushdown_filter() {
     )
     .unwrap();
 
-    // Without filter will not read pageIndex.
-    assert!(bytes_scanned_with_filter > bytes_scanned_without_filter);
+    // Same amount of bytes are scanned when defaulting to cache parquet metadata
+    assert_eq!(bytes_scanned_with_filter, bytes_scanned_without_filter);
 }
 
 #[tokio::test]
@@ -976,3 +1050,56 @@ fn cast_count_metric(metric: MetricValue) -> Option<usize> {
         _ => None,
     }
 }
+
+#[tokio::test]
+async fn test_parquet_opener_without_page_index() {
+    // Defines a simple schema and batch
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+    )
+    .unwrap();
+
+    // Create a temp file
+    let file = tempfile::Builder::new()
+        .suffix(".parquet")
+        .tempfile()
+        .unwrap();
+    let path = file.path().to_str().unwrap().to_string();
+
+    // Write parquet WITHOUT page index
+    // The default WriterProperties does not write page index, but we set it explicitly
+    // to be robust against future changes in defaults as requested by reviewers.
+    let props = WriterProperties::builder()
+        .set_statistics_enabled(parquet::file::properties::EnabledStatistics::None)
+        .build();
+
+    let file_fs = std::fs::File::create(&path).unwrap();
+    let mut writer = ArrowWriter::try_new(file_fs, batch.schema(), Some(props)).unwrap();
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    // Setup SessionContext with PageIndex enabled
+    // This triggers the ParquetOpener to try and load page index if available
+    let config = SessionConfig::new().with_parquet_page_index_pruning(true);
+
+    let ctx = SessionContext::new_with_config(config);
+
+    // Register the table
+    ctx.register_parquet("t", &path, Default::default())
+        .await
+        .unwrap();
+
+    // Query the table
+    // If the bug exists, this might fail because Opener tries to load PageIndex forcefully
+    let df = ctx.sql("SELECT * FROM t").await.unwrap();
+    let batches = df
+        .collect()
+        .await
+        .expect("Failed to read parquet file without page index");
+
+    // We expect this to succeed, but currently it might fail
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), 3);
+}
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index 5a85f47c015a9..3ec3541af977a 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -18,8 +18,12 @@
 //! This file contains an end to end test of parquet pruning. It writes
 //! data into a parquet file and then verifies row groups are pruned as
 //! expected.
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int32Array, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
 use datafusion::prelude::SessionConfig;
-use datafusion_common::ScalarValue;
+use datafusion_common::{DataFusionError, ScalarValue};
 use itertools::Itertools;
 
 use crate::parquet::Unit::RowGroup;
@@ -30,10 +34,13 @@ struct RowGroupPruningTest {
     query: String,
     expected_errors: Option<usize>,
     expected_row_group_matched_by_statistics: Option<usize>,
+    expected_row_group_fully_matched_by_statistics: Option<usize>,
     expected_row_group_pruned_by_statistics: Option<usize>,
+    expected_files_pruned_by_statistics: Option<usize>,
     expected_row_group_matched_by_bloom_filter: Option<usize>,
     expected_row_group_pruned_by_bloom_filter: Option<usize>,
-    expected_results: usize,
+    expected_limit_pruned_row_groups: Option<usize>,
+    expected_rows: usize,
 }
 impl RowGroupPruningTest {
     // Start building the test configuration
@@ -44,9 +51,12 @@ impl RowGroupPruningTest {
             expected_errors: None,
             expected_row_group_matched_by_statistics: None,
             expected_row_group_pruned_by_statistics: None,
+            expected_row_group_fully_matched_by_statistics: None,
+            expected_files_pruned_by_statistics: None,
             expected_row_group_matched_by_bloom_filter: None,
             expected_row_group_pruned_by_bloom_filter: None,
-            expected_results: 0,
+            expected_limit_pruned_row_groups: None,
+            expected_rows: 0,
         }
     }
 
@@ -74,12 +84,26 @@ impl RowGroupPruningTest {
         self
     }
 
+    // Set the expected fully matched row groups by statistics
+    fn with_fully_matched_by_stats(
+        mut self,
+        fully_matched_by_stats: Option<usize>,
+    ) -> Self {
+        self.expected_row_group_fully_matched_by_statistics = fully_matched_by_stats;
+        self
+    }
+
     // Set the expected pruned row groups by statistics
     fn with_pruned_by_stats(mut self, pruned_by_stats: Option<usize>) -> Self {
         self.expected_row_group_pruned_by_statistics = pruned_by_stats;
         self
     }
 
+    fn with_pruned_files(mut self, pruned_files: Option<usize>) -> Self {
+        self.expected_files_pruned_by_statistics = pruned_files;
+        self
+    }
+
     // Set the expected matched row groups by bloom filter
     fn with_matched_by_bloom_filter(mut self, matched_by_bf: Option<usize>) -> Self {
         self.expected_row_group_matched_by_bloom_filter = matched_by_bf;
@@ -92,9 +116,14 @@ impl RowGroupPruningTest {
         self
     }
 
-    // Set the expected rows for the test
+    fn with_limit_pruned_row_groups(mut self, pruned_by_limit: Option<usize>) -> Self {
+        self.expected_limit_pruned_row_groups = pruned_by_limit;
+        self
+    }
+
+    /// Set the number of expected rows from the output of this test
     fn with_expected_rows(mut self, rows: usize) -> Self {
-        self.expected_results = rows;
+        self.expected_rows = rows;
         self
     }
 
@@ -122,19 +151,86 @@ impl RowGroupPruningTest {
             "mismatched row_groups_pruned_statistics",
         );
         assert_eq!(
-            output.row_groups_matched_bloom_filter(),
+            output.files_ranges_pruned_statistics(),
+            self.expected_files_pruned_by_statistics,
+            "mismatched files_ranges_pruned_statistics",
+        );
+        let bloom_filter_metrics = output.row_groups_bloom_filter();
+        assert_eq!(
+            bloom_filter_metrics.as_ref().map(|pm| pm.total_matched()),
             self.expected_row_group_matched_by_bloom_filter,
             "mismatched row_groups_matched_bloom_filter",
         );
         assert_eq!(
-            output.row_groups_pruned_bloom_filter(),
+            bloom_filter_metrics.map(|pm| pm.total_pruned()),
             self.expected_row_group_pruned_by_bloom_filter,
             "mismatched row_groups_pruned_bloom_filter",
         );
+
         assert_eq!(
             output.result_rows,
-            self.expected_results,
-            "mismatched expected rows: {}",
+            self.expected_rows,
+            "Expected {} rows, got {}: {}",
+            output.result_rows,
+            self.expected_rows,
+            output.description(),
+        );
+    }
+
+    // Execute the test with the current configuration
+    async fn test_row_group_prune_with_custom_data(
+        self,
+        schema: Arc<Schema>,
+        batches: Vec<RecordBatch>,
+        max_row_per_group: usize,
+    ) {
+        let output = ContextWithParquet::with_custom_data(
+            self.scenario,
+            RowGroup(max_row_per_group),
+            schema,
+            batches,
+        )
+        .await
+        .query(&self.query)
+        .await;
+
+        println!("{}", output.description());
+        assert_eq!(
+            output.predicate_evaluation_errors(),
+            self.expected_errors,
+            "mismatched predicate_evaluation error"
+        );
+        assert_eq!(
+            output.row_groups_matched_statistics(),
+            self.expected_row_group_matched_by_statistics,
+            "mismatched row_groups_matched_statistics",
+        );
+        assert_eq!(
+            output.row_groups_fully_matched_statistics(),
+            self.expected_row_group_fully_matched_by_statistics,
+            "mismatched row_groups_fully_matched_statistics",
+        );
+        assert_eq!(
+            output.row_groups_pruned_statistics(),
+            self.expected_row_group_pruned_by_statistics,
+            "mismatched row_groups_pruned_statistics",
+        );
+        assert_eq!(
+            output.files_ranges_pruned_statistics(),
+            self.expected_files_pruned_by_statistics,
+            "mismatched files_ranges_pruned_statistics",
+        );
+        assert_eq!(
+            output.limit_pruned_row_groups(),
+            self.expected_limit_pruned_row_groups,
+            "mismatched limit_pruned_row_groups",
+        );
+        assert_eq!(
+            output.result_rows,
+            self.expected_rows,
+            "Expected {} rows, got {}: {}",
+            output.result_rows,
+            self.expected_rows,
             output.description(),
         );
     }
@@ -148,7 +244,8 @@ async fn prune_timestamps_nanos() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(10)
         .test_row_group_prune()
@@ -165,7 +262,8 @@ async fn prune_timestamps_micros() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(10)
         .test_row_group_prune()
@@ -182,7 +280,8 @@ async fn prune_timestamps_millis() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(10)
         .test_row_group_prune()
@@ -199,7 +298,8 @@ async fn prune_timestamps_seconds() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(10)
         .test_row_group_prune()
@@ -214,7 +314,8 @@ async fn prune_date32() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(3))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(1)
         .test_row_group_prune()
@@ -243,8 +344,9 @@ async fn prune_date64() {
     println!("{}", output.description());
     // This should prune out groups  without error
     assert_eq!(output.predicate_evaluation_errors(), Some(0));
-    assert_eq!(output.row_groups_matched(), Some(1));
-    assert_eq!(output.row_groups_pruned(), Some(3));
+    // 'dates' table has 4 row groups, and only the first one is matched by the predicate
+    assert_eq!(output.row_groups_matched_statistics(), Some(1));
+    assert_eq!(output.row_groups_pruned_statistics(), Some(3));
     assert_eq!(output.result_rows, 1, "{}", output.description());
 }
 
@@ -256,7 +358,8 @@ async fn prune_disabled() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(10)
         .test_row_group_prune()
@@ -267,16 +370,21 @@ async fn prune_disabled() {
     let expected_rows = 10;
     let config = SessionConfig::new().with_parquet_pruning(false);
 
-    let output =
-        ContextWithParquet::with_config(Scenario::Timestamps, RowGroup(5), config)
-            .await
-            .query(query)
-            .await;
+    let output = ContextWithParquet::with_config(
+        Scenario::Timestamps,
+        RowGroup(5),
+        config,
+        None,
+        None,
+    )
+    .await
+    .query(query)
+    .await;
     println!("{}", output.description());
 
     // This should not prune any
     assert_eq!(output.predicate_evaluation_errors(), Some(0));
-    assert_eq!(output.row_groups_matched(), Some(0));
+    assert_eq!(output.row_groups_matched(), Some(4));
     assert_eq!(output.row_groups_pruned(), Some(0));
     assert_eq!(
         output.result_rows,
@@ -291,303 +399,365 @@ async fn prune_disabled() {
 // https://github.com/apache/datafusion/issues/9779 bug so that tests pass
 // if and only if Bloom filters on Int8 and Int16 columns are still buggy.
 macro_rules! int_tests {
-    ($bits:expr) => {
-        paste::item! {
-            #[tokio::test]
-            async fn [<prune_int $bits _lt >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{} < 1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(3))
-                    .with_pruned_by_stats(Some(1))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(11)
-                    .test_row_group_prune()
-                    .await;
-
-                // result of sql "SELECT * FROM t where i < 1" is same as
-                // "SELECT * FROM t where -i > -1"
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where -i{} > -1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(3))
-                    .with_pruned_by_stats(Some(1))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(11)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{} = 1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-            #[tokio::test]
-            async fn [<prune_int $bits _scalar_fun_and_eq >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where abs(i{}) = 1 and i{} = 1", $bits, $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _scalar_fun >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(3)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _complex_expr >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(2)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _complex_expr_subtract >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(9)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq_in_list >]() {
-                // result of sql "SELECT * FROM t where in (1)"
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{} in (1)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq_in_list_2 >]() {
-                // result of sql "SELECT * FROM t where in (1000)", prune all
-                // test whether statistics works
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{} in (100)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(4))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(0)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_int $bits _eq_in_list_negated >]() {
-                // result of sql "SELECT * FROM t where not in (1)" prune nothing
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::Int)
-                    .with_query(&format!("SELECT * FROM t where i{} not in (1)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(4))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(4))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(19)
-                    .test_row_group_prune()
-                    .await;
-            }
+    ($bits:expr, $fn_lt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_complex_expr_subtract:ident, $fn_eq_in_list:ident, $fn_eq_in_list_2:ident, $fn_eq_in_list_negated:ident) => {
+        #[tokio::test]
+        async fn $fn_lt() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{} < 1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(3))
+                .with_pruned_by_stats(Some(1))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(3))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(11)
+                .test_row_group_prune()
+                .await;
+
+            // result of sql "SELECT * FROM t where i < 1" is same as
+            // "SELECT * FROM t where -i > -1"
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where -i{} > -1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(3))
+                .with_pruned_by_stats(Some(1))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(3))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(11)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{} = 1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+        #[tokio::test]
+        async fn $fn_scalar_fun_and_eq() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!(
+                    "SELECT * FROM t where abs(i{}) = 1 and i{} = 1",
+                    $bits, $bits
+                ))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_scalar_fun() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(3)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(2)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr_subtract() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(9)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list() {
+            // result of sql "SELECT * FROM t where in (1)"
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{} in (1)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_2() {
+            // result of sql "SELECT * FROM t where in (1000)", prune all
+            // test whether statistics works
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{} in (100)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(0))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(1))
+                .with_matched_by_bloom_filter(Some(0))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(0)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_negated() {
+            // result of sql "SELECT * FROM t where not in (1)" prune nothing
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::Int)
+                .with_query(&format!("SELECT * FROM t where i{} not in (1)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(19)
+                .test_row_group_prune()
+                .await;
         }
     };
 }
 
 // int8/int16 are incorrect: https://github.com/apache/datafusion/issues/9779
-int_tests!(32);
-int_tests!(64);
+int_tests!(
+    32,
+    prune_int32_lt,
+    prune_int32_eq,
+    prune_int32_scalar_fun_and_eq,
+    prune_int32_scalar_fun,
+    prune_int32_complex_expr,
+    prune_int32_complex_expr_subtract,
+    prune_int32_eq_in_list,
+    prune_int32_eq_in_list_2,
+    prune_int32_eq_in_list_negated
+);
+int_tests!(
+    64,
+    prune_int64_lt,
+    prune_int64_eq,
+    prune_int64_scalar_fun_and_eq,
+    prune_int64_scalar_fun,
+    prune_int64_complex_expr,
+    prune_int64_complex_expr_subtract,
+    prune_int64_eq_in_list,
+    prune_int64_eq_in_list_2,
+    prune_int64_eq_in_list_negated
+);
 
 // $bits: number of bits of the integer to test (8, 16, 32, 64)
 // $correct_bloom_filters: if false, replicates the
 // https://github.com/apache/datafusion/issues/9779 bug so that tests pass
 // if and only if Bloom filters on UInt8 and UInt16 columns are still buggy.
 macro_rules! uint_tests {
-    ($bits:expr) => {
-        paste::item! {
-            #[tokio::test]
-            async fn [<prune_uint $bits _lt >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{} < 6", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(3))
-                    .with_pruned_by_stats(Some(1))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(11)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{} = 6", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-            #[tokio::test]
-            async fn [<prune_uint $bits _scalar_fun_and_eq >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _scalar_fun >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(2)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _complex_expr >]() {
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(2)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq_in_list >]() {
-                // result of sql "SELECT * FROM t where in (1)"
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{} in (6)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(1))
-                    .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(1))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(1)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq_in_list_2 >]() {
-                // result of sql "SELECT * FROM t where in (1000)", prune all
-                // test whether statistics works
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{} in (100)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(0))
-                    .with_pruned_by_stats(Some(4))
-                    .with_matched_by_bloom_filter(Some(0))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(0)
-                    .test_row_group_prune()
-                    .await;
-            }
-
-            #[tokio::test]
-            async fn [<prune_uint $bits _eq_in_list_negated >]() {
-                // result of sql "SELECT * FROM t where not in (1)" prune nothing
-                RowGroupPruningTest::new()
-                    .with_scenario(Scenario::UInt)
-                    .with_query(&format!("SELECT * FROM t where u{} not in (6)", $bits))
-                    .with_expected_errors(Some(0))
-                    .with_matched_by_stats(Some(4))
-                    .with_pruned_by_stats(Some(0))
-                    .with_matched_by_bloom_filter(Some(4))
-                    .with_pruned_by_bloom_filter(Some(0))
-                    .with_expected_rows(19)
-                    .test_row_group_prune()
-                    .await;
-            }
+    ($bits:expr, $fn_lt:ident, $fn_eq:ident, $fn_scalar_fun_and_eq:ident, $fn_scalar_fun:ident, $fn_complex_expr:ident, $fn_eq_in_list:ident, $fn_eq_in_list_2:ident, $fn_eq_in_list_negated:ident) => {
+        #[tokio::test]
+        async fn $fn_lt() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{} < 6", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(3))
+                .with_pruned_by_stats(Some(1))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(3))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(11)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{} = 6", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+        #[tokio::test]
+        async fn $fn_scalar_fun_and_eq() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!(
+                    "SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6",
+                    $bits, $bits
+                ))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_scalar_fun() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(2)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_complex_expr() {
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(2)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list() {
+            // result of sql "SELECT * FROM t where in (1)"
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{} in (6)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(1))
+                .with_pruned_by_stats(Some(3))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(1))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(1)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_2() {
+            // result of sql "SELECT * FROM t where in (1000)", prune all
+            // test whether statistics works
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{} in (100)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(0))
+                .with_pruned_by_stats(Some(4))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(0))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(0)
+                .test_row_group_prune()
+                .await;
+        }
+
+        #[tokio::test]
+        async fn $fn_eq_in_list_negated() {
+            // result of sql "SELECT * FROM t where not in (1)" prune nothing
+            RowGroupPruningTest::new()
+                .with_scenario(Scenario::UInt)
+                .with_query(&format!("SELECT * FROM t where u{} not in (6)", $bits))
+                .with_expected_errors(Some(0))
+                .with_matched_by_stats(Some(4))
+                .with_pruned_by_stats(Some(0))
+                .with_pruned_files(Some(0))
+                .with_matched_by_bloom_filter(Some(4))
+                .with_pruned_by_bloom_filter(Some(0))
+                .with_expected_rows(19)
+                .test_row_group_prune()
+                .await;
         }
     };
 }
 
 // uint8/uint16 are incorrect: https://github.com/apache/datafusion/issues/9779
-uint_tests!(32);
-uint_tests!(64);
+uint_tests!(
+    32,
+    prune_uint32_lt,
+    prune_uint32_eq,
+    prune_uint32_scalar_fun_and_eq,
+    prune_uint32_scalar_fun,
+    prune_uint32_complex_expr,
+    prune_uint32_eq_in_list,
+    prune_uint32_eq_in_list_2,
+    prune_uint32_eq_in_list_negated
+);
+uint_tests!(
+    64,
+    prune_uint64_lt,
+    prune_uint64_eq,
+    prune_uint64_scalar_fun_and_eq,
+    prune_uint64_scalar_fun,
+    prune_uint64_complex_expr,
+    prune_uint64_eq_in_list,
+    prune_uint64_eq_in_list_2,
+    prune_uint64_eq_in_list_negated
+);
 
 #[tokio::test]
 async fn prune_int32_eq_large_in_list() {
@@ -604,6 +774,7 @@ async fn prune_int32_eq_large_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(0)
@@ -626,6 +797,7 @@ async fn prune_uint32_eq_large_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(0)
@@ -641,7 +813,8 @@ async fn prune_f64_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(11)
         .test_row_group_prune()
@@ -652,7 +825,8 @@ async fn prune_f64_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(11)
         .test_row_group_prune()
@@ -669,7 +843,8 @@ async fn prune_f64_scalar_fun_and_gt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(2))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(1)
         .test_row_group_prune()
@@ -683,9 +858,10 @@ async fn prune_f64_scalar_fun() {
         .with_scenario(Scenario::Float64)
         .with_query("SELECT * FROM t where abs(f-1) <= 0.000001")
         .with_expected_errors(Some(0))
-        .with_matched_by_stats(Some(0))
+        .with_matched_by_stats(Some(4))
         .with_pruned_by_stats(Some(0))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(4))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(1)
         .test_row_group_prune()
@@ -699,9 +875,10 @@ async fn prune_f64_complex_expr() {
         .with_scenario(Scenario::Float64)
         .with_query("SELECT * FROM t where f+1 > 1.1")
         .with_expected_errors(Some(0))
-        .with_matched_by_stats(Some(0))
+        .with_matched_by_stats(Some(4))
         .with_pruned_by_stats(Some(0))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(4))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(9)
         .test_row_group_prune()
@@ -715,9 +892,10 @@ async fn prune_f64_complex_expr_subtract() {
         .with_scenario(Scenario::Float64)
         .with_query("SELECT * FROM t where 1-f > 1")
         .with_expected_errors(Some(0))
-        .with_matched_by_stats(Some(0))
+        .with_matched_by_stats(Some(4))
         .with_pruned_by_stats(Some(0))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(4))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(9)
         .test_row_group_prune()
@@ -735,7 +913,8 @@ async fn prune_decimal_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(6)
         .test_row_group_prune()
@@ -746,7 +925,8 @@ async fn prune_decimal_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(8)
         .test_row_group_prune()
@@ -757,7 +937,8 @@ async fn prune_decimal_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(6)
         .test_row_group_prune()
@@ -768,7 +949,8 @@ async fn prune_decimal_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(8)
         .test_row_group_prune()
@@ -786,6 +968,7 @@ async fn prune_decimal_eq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
@@ -797,6 +980,7 @@ async fn prune_decimal_eq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
@@ -809,6 +993,7 @@ async fn prune_decimal_eq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
@@ -820,6 +1005,7 @@ async fn prune_decimal_eq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
@@ -839,7 +1025,8 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(5)
         .test_row_group_prune()
@@ -850,7 +1037,8 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(6)
         .test_row_group_prune()
@@ -861,7 +1049,8 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(5)
         .test_row_group_prune()
@@ -872,7 +1061,8 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(6)
         .test_row_group_prune()
@@ -885,6 +1075,7 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(2))
         .with_expected_rows(1)
@@ -898,6 +1089,7 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(2))
         .with_expected_rows(1)
@@ -911,6 +1103,7 @@ async fn prune_decimal_in_list() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(2))
         .with_expected_rows(1)
@@ -929,6 +1122,7 @@ async fn prune_string_eq_match() {
         // false positive on 'all backends' batch: 'backend five' < 'backend one' < 'backend three'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(1)
@@ -947,6 +1141,7 @@ async fn prune_string_eq_no_match() {
         // false positive on 'all backends' batch: 'backend five' < 'backend one' < 'backend three'
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(0)
@@ -963,6 +1158,7 @@ async fn prune_string_eq_no_match() {
         // false positive on 'mixed' batch: 'backend one' < 'frontend nine' < 'frontend six'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(2))
         .with_expected_rows(0)
@@ -980,6 +1176,7 @@ async fn prune_string_neq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(14)
@@ -998,7 +1195,8 @@ async fn prune_string_lt() {
         // matches 'all backends' only
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(3)
         .test_row_group_prune()
@@ -1012,7 +1210,8 @@ async fn prune_string_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         // all backends from 'mixed' and 'all backends'
         .with_expected_rows(8)
@@ -1031,6 +1230,7 @@ async fn prune_binary_eq_match() {
         // false positive on 'all backends' batch: 'backend five' < 'backend one' < 'backend three'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(1)
@@ -1049,6 +1249,7 @@ async fn prune_binary_eq_no_match() {
         // false positive on 'all backends' batch: 'backend five' < 'backend one' < 'backend three'
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(0)
@@ -1065,6 +1266,7 @@ async fn prune_binary_eq_no_match() {
         // false positive on 'mixed' batch: 'backend one' < 'frontend nine' < 'frontend six'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(2))
         .with_expected_rows(0)
@@ -1082,6 +1284,7 @@ async fn prune_binary_neq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(14)
@@ -1100,7 +1303,8 @@ async fn prune_binary_lt() {
         // matches 'all backends' only
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(3)
         .test_row_group_prune()
@@ -1114,7 +1318,8 @@ async fn prune_binary_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         // all backends from 'mixed' and 'all backends'
         .with_expected_rows(8)
@@ -1133,6 +1338,7 @@ async fn prune_fixedsizebinary_eq_match() {
         // false positive on 'all frontends' batch: 'fe1' < 'fe6' < 'fe7'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(1)
@@ -1148,6 +1354,7 @@ async fn prune_fixedsizebinary_eq_match() {
         // false positive on 'all frontends' batch: 'fe1' < 'fe6' < 'fe7'
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(1)
@@ -1166,6 +1373,7 @@ async fn prune_fixedsizebinary_eq_no_match() {
         // false positive on 'mixed' batch: 'be1' < 'be9' < 'fe4'
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(1))
         .with_expected_rows(0)
@@ -1183,6 +1391,7 @@ async fn prune_fixedsizebinary_neq() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(3))
         .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(3))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(14)
@@ -1201,7 +1410,8 @@ async fn prune_fixedsizebinary_lt() {
         // matches 'all backends' only
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
         .test_row_group_prune()
@@ -1215,7 +1425,8 @@ async fn prune_fixedsizebinary_lt() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
-        .with_matched_by_bloom_filter(Some(0))
+        .with_pruned_files(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         // all backends from 'mixed' and 'all backends'
         .with_expected_rows(8)
@@ -1235,6 +1446,7 @@ async fn prune_periods_in_column_names() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(7)
@@ -1246,6 +1458,7 @@ async fn prune_periods_in_column_names() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(5)
@@ -1257,6 +1470,7 @@ async fn prune_periods_in_column_names() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .with_expected_rows(2)
@@ -1277,9 +1491,10 @@ async fn test_row_group_with_null_values() {
         .with_query("SELECT * FROM t WHERE \"i8\" <= 5")
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_pruned_by_stats(Some(2))
         .with_expected_rows(5)
-        .with_matched_by_bloom_filter(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .test_row_group_prune()
         .await;
@@ -1290,9 +1505,10 @@ async fn test_row_group_with_null_values() {
         .with_query("SELECT * FROM t WHERE \"i8\" is Null")
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(2))
+        .with_pruned_files(Some(0))
         .with_pruned_by_stats(Some(1))
         .with_expected_rows(10)
-        .with_matched_by_bloom_filter(Some(0))
+        .with_matched_by_bloom_filter(Some(2))
         .with_pruned_by_bloom_filter(Some(0))
         .test_row_group_prune()
         .await;
@@ -1303,9 +1519,10 @@ async fn test_row_group_with_null_values() {
         .with_query("SELECT * FROM t WHERE \"i16\" is Not Null")
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_pruned_by_stats(Some(2))
         .with_expected_rows(5)
-        .with_matched_by_bloom_filter(Some(0))
+        .with_matched_by_bloom_filter(Some(1))
         .with_pruned_by_bloom_filter(Some(0))
         .test_row_group_prune()
         .await;
@@ -1316,7 +1533,8 @@ async fn test_row_group_with_null_values() {
         .with_query("SELECT * FROM t WHERE \"i32\" > 7")
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(0))
-        .with_pruned_by_stats(Some(3))
+        .with_pruned_by_stats(Some(0))
+        .with_pruned_files(Some(1))
         .with_expected_rows(0)
         .with_matched_by_bloom_filter(Some(0))
         .with_pruned_by_bloom_filter(Some(0))
@@ -1332,6 +1550,7 @@ async fn test_bloom_filter_utf8_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1344,6 +1563,7 @@ async fn test_bloom_filter_utf8_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1356,6 +1576,7 @@ async fn test_bloom_filter_utf8_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1368,6 +1589,7 @@ async fn test_bloom_filter_utf8_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1383,6 +1605,7 @@ async fn test_bloom_filter_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1395,6 +1618,7 @@ async fn test_bloom_filter_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1407,6 +1631,7 @@ async fn test_bloom_filter_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1419,6 +1644,7 @@ async fn test_bloom_filter_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1434,6 +1660,7 @@ async fn test_bloom_filter_unsigned_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1446,6 +1673,7 @@ async fn test_bloom_filter_unsigned_integer_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1461,6 +1689,7 @@ async fn test_bloom_filter_binary_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1473,6 +1702,7 @@ async fn test_bloom_filter_binary_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1485,6 +1715,7 @@ async fn test_bloom_filter_binary_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1499,6 +1730,7 @@ async fn test_bloom_filter_binary_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
@@ -1514,6 +1746,7 @@ async fn test_bloom_filter_decimal_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(1)
         .with_pruned_by_bloom_filter(Some(0))
         .with_matched_by_bloom_filter(Some(1))
@@ -1526,9 +1759,247 @@ async fn test_bloom_filter_decimal_dict() {
         .with_expected_errors(Some(0))
         .with_matched_by_stats(Some(1))
         .with_pruned_by_stats(Some(1))
+        .with_pruned_files(Some(0))
         .with_expected_rows(0)
         .with_pruned_by_bloom_filter(Some(1))
         .with_matched_by_bloom_filter(Some(0))
         .test_row_group_prune()
         .await;
 }
+
+// Helper function to create a batch with a single Int32 column.
+fn make_i32_batch(
+    name: &str,
+    values: Vec<i32>,
+) -> datafusion_common::error::Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, false)]));
+    let array: ArrayRef = Arc::new(Int32Array::from(values));
+    RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from)
+}
+
+// Helper function to create a batch with two Int32 columns
+fn make_two_col_i32_batch(
+    name_a: &str,
+    name_b: &str,
+    values_a: Vec<i32>,
+    values_b: Vec<i32>,
+) -> datafusion_common::error::Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(name_a, DataType::Int32, false),
+        Field::new(name_b, DataType::Int32, false),
+    ]));
+    let array_a: ArrayRef = Arc::new(Int32Array::from(values_a));
+    let array_b: ArrayRef = Arc::new(Int32Array::from(values_b));
+    RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from)
+}
+
+#[tokio::test]
+async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> {
+    // Scenario: Simple integer column, multiple row groups
+    // Query: SELECT c1 FROM  t WHERE c1 = 0 LIMIT 2
+    // We expect 2 rows in total.
+
+    // Row Group 0: c1 = [0, -2] -> Partially matched, 1 row
+    // Row Group 1: c1 = [1, 2] -> Fully matched, 2 rows
+    // Row Group 2: c1 = [3, 4] -> Fully matched, 2 rows
+    // Row Group 3: c1 = [5, 6] -> Fully matched, 2 rows
+    // Row Group 4: c1 = [-1, -2] -> Not matched
+
+    // If limit = 2, and RG1 is fully matched and has 2 rows, we should
+    // only scan RG1 and prune other row groups
+    // RG4 is pruned by statistics. RG2 and RG3 are pruned by limit.
+    // So 2 row groups are effectively pruned due to limit pruning.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
+    let query = "SELECT c1 FROM t WHERE c1 >= 0 LIMIT 2";
+
+    let batches = vec![
+        make_i32_batch("c1", vec![0, -2])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![0, 0])?,
+        make_i32_batch("c1", vec![-1, -2])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int) // Assuming Scenario::Int can handle this data
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(2)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4))
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(1))
+        .with_limit_pruned_row_groups(Some(3))
+        .test_row_group_prune_with_custom_data(schema, batches, 2)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> {
+    // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
+    // Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match (b=2,3)
+    // Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows
+    // Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows
+    // Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows
+    // Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1)
+    // Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4)
+
+    // With LIMIT 5, we need RG1 (3 rows) + RG2 (2 rows from 3) = 5 rows
+    // RG4 and RG5 should be pruned by statistics
+    // RG3 should be pruned by limit
+    // RG0 is partially matched, so it depends on the order
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5";
+
+    let batches = vec![
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(5)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(2)) // RG4,5 are pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG0, RG3 is pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_multiple_fully_matched()
+-> datafusion_common::error::Result<()> {
+    // Test Case 2: Limit requires multiple fully matched row groups
+    // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 4: a=[1,2,3,4] -> Not matched
+
+    // With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows)  8 rows
+    // RG2,3 should be pruned by limit
+    // RG4 should be pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 5 LIMIT 8";
+
+    let batches = vec![
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![1, 2, 3, 4])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(8)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(4))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 4)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Result<()> {
+    // Test Case 3: No fully matched row groups - all are partially matched
+    // Row Group 0: a=[1,2,3] -> Partially matched, 1 row (a=2)
+    // Row Group 1: a=[2,3,4] -> Partially matched, 1 row (a=2)
+    // Row Group 2: a=[2,5,6] -> Partially matched, 1 row (a=2)
+    // Row Group 3: a=[2,7,8] -> Partially matched, 1 row (a=2)
+    // Row Group 4: a=[9,10,11] -> Not matched
+
+    // With LIMIT 3, we need to scan RG0,1,2 to get 3 matching rows
+    // Cannot prune much by limit since all matching RGs are partial
+    // RG4 should be pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 2 LIMIT 3";
+
+    let batches = vec![
+        make_i32_batch("a", vec![1, 2, 3])?,
+        make_i32_batch("a", vec![2, 3, 4])?,
+        make_i32_batch("a", vec![2, 5, 6])?,
+        make_i32_batch("a", vec![2, 7, 8])?,
+        make_i32_batch("a", vec![9, 10, 11])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(3)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(0))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(0)) // RG3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error::Result<()>
+{
+    // Test Case 4: Limit exceeds all fully matched rows, need partially matched
+    // Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10)
+    // Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10)
+    // Row Group 4: a=[20,21,22,22] -> Not matched
+
+    // With LIMIT 10, we need RG1 (4) + RG2 (4) = 8 from fully matched
+    // Still need 2 more, so we need to scan partially matched RG0 and RG3
+    // All matching row groups should be scanned, only RG4 pruned by statistics
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 10 LIMIT 10";
+
+    let batches = vec![
+        make_i32_batch("a", vec![10, 11, 12, 12])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 13, 14, 11])?,
+        make_i32_batch("a", vec![20, 21, 22, 22])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(10) // Total: 1 + 4 + 4 + 1 = 10
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(2))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs
+        .test_row_group_prune_with_custom_data(schema, batches, 4)
+        .await;
+    Ok(())
+}
diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs
index 59cbf4b0872ea..6f7e2e328d0c3 100644
--- a/datafusion/core/tests/parquet/schema_coercion.rs
+++ b/datafusion/core/tests/parquet/schema_coercion.rs
@@ -18,16 +18,16 @@
 use std::sync::Arc;
 
 use arrow::array::{
-    types::Int32Type, ArrayRef, DictionaryArray, Float32Array, Int64Array, RecordBatch,
-    StringArray,
+    ArrayRef, DictionaryArray, Float32Array, Int64Array, RecordBatch, StringArray,
+    types::Int32Type,
 };
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::physical_plan::collect;
 use datafusion::prelude::SessionContext;
 use datafusion::test::object_store::local_unpartitioned_file;
-use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_common::Result;
+use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_execution::object_store::ObjectStoreUrl;
 
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
@@ -62,14 +62,10 @@ async fn multi_parquet_coercion() {
         Field::new("c2", DataType::Int32, true),
         Field::new("c3", DataType::Float64, true),
     ]));
-    let source = Arc::new(ParquetSource::default());
-    let conf = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        source,
-    )
-    .with_file_group(file_group)
-    .build();
+    let source = Arc::new(ParquetSource::new(file_schema.clone()));
+    let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+        .with_file_group(file_group)
+        .build();
 
     let parquet_exec = DataSourceExec::from_data_source(conf);
 
@@ -122,11 +118,11 @@ async fn multi_parquet_coercion_projection() {
     ]));
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(file_schema)),
     )
     .with_file_group(file_group)
-    .with_projection(Some(vec![1, 0, 2]))
+    .with_projection_indices(Some(vec![1, 0, 2]))
+    .unwrap()
     .build();
 
     let parquet_exec = DataSourceExec::from_data_source(config);
diff --git a/datafusion/core/tests/parquet/utils.rs b/datafusion/core/tests/parquet/utils.rs
index 24b6cadc148f8..e5e0026ec1f16 100644
--- a/datafusion/core/tests/parquet/utils.rs
+++ b/datafusion/core/tests/parquet/utils.rs
@@ -20,7 +20,7 @@
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion_physical_plan::metrics::MetricsSet;
-use datafusion_physical_plan::{accept, ExecutionPlan, ExecutionPlanVisitor};
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanVisitor, accept};
 
 /// Find the metrics from the first DataSourceExec encountered in the plan
 #[derive(Debug)]
@@ -47,13 +47,12 @@ impl MetricsFinder {
 impl ExecutionPlanVisitor for MetricsFinder {
     type Error = std::convert::Infallible;
     fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool, Self::Error> {
-        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-            if data_source_exec
+        if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+            && data_source_exec
                 .downcast_to_file_source::<ParquetSource>()
                 .is_some()
-            {
-                self.metrics = data_source_exec.metrics();
-            }
+        {
+            self.metrics = data_source_exec.metrics();
         }
         // stop searching once we have found the metrics
         Ok(self.metrics.is_none())
diff --git a/datafusion/core/tests/parquet_config.rs b/datafusion/core/tests/parquet_integration.rs
similarity index 100%
rename from datafusion/core/tests/parquet_config.rs
rename to datafusion/core/tests/parquet_integration.rs
diff --git a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
index a79d743cb253d..850f9d187780b 100644
--- a/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/aggregate_statistics.rs
@@ -20,26 +20,38 @@ use std::sync::Arc;
 use crate::physical_optimizer::test_utils::TestAggregate;
 
 use arrow::array::Int32Array;
+use arrow::array::{Int64Array, StringArray};
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::record_batch::RecordBatch;
+use datafusion::datasource::listing::PartitionedFile;
+use datafusion::datasource::memory::MemTable;
 use datafusion::datasource::memory::MemorySourceConfig;
+use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
+use datafusion::prelude::{SessionConfig, SessionContext};
+use datafusion_common::assert_batches_eq;
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::Result;
+use datafusion_common::stats::Precision;
+use datafusion_common::{ColumnStatistics, Result, Statistics};
+use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_execution::TaskContext;
+use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_expr::Operator;
+use datafusion_functions_aggregate::count::count_udaf;
+use datafusion_physical_expr::aggregate::AggregateExprBuilder;
 use datafusion_physical_expr::expressions::{self, cast};
-use datafusion_physical_optimizer::aggregate_statistics::AggregateStatistics;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::aggregate_statistics::AggregateStatistics;
+use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::aggregates::AggregateMode;
 use datafusion_physical_plan::aggregates::PhysicalGroupBy;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::common;
+use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::projection::ProjectionExec;
-use datafusion_physical_plan::ExecutionPlan;
 
 /// Mock data using a MemorySourceConfig which has an exact count statistic
 fn mock_data() -> Result<Arc<DataSourceExec>> {
@@ -316,3 +328,228 @@ async fn test_count_with_nulls_inexact_stat() -> Result<()> {
 
     Ok(())
 }
+
+/// Tests that TopK aggregation correctly handles UTF-8 (string) types in both grouping keys and aggregate values.
+///
+/// The TopK optimization is designed to efficiently handle `GROUP BY ... ORDER BY aggregate LIMIT n` queries
+/// by maintaining only the top K groups during aggregation. However, not all type combinations are supported.
+///
+/// This test verifies two scenarios:
+/// 1. **Supported case**: UTF-8 grouping key with numeric aggregate (max/min) - should use TopK optimization
+/// 2. **Unsupported case**: UTF-8 grouping key with UTF-8 aggregate value - must gracefully fall back to
+///    standard aggregation without panicking
+///
+/// The fallback behavior is critical because attempting to use TopK with unsupported types could cause
+/// runtime panics. This test ensures the optimizer correctly detects incompatible types and chooses
+/// the appropriate execution path.
+#[tokio::test]
+async fn utf8_grouping_min_max_limit_fallbacks() -> Result<()> {
+    let mut config = SessionConfig::new();
+    config.options_mut().optimizer.enable_topk_aggregation = true;
+    let ctx = SessionContext::new_with_config(config);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(Schema::new(vec![
+            Field::new("g", DataType::Utf8, false),
+            Field::new("val_str", DataType::Utf8, false),
+            Field::new("val_num", DataType::Int64, false),
+        ])),
+        vec![
+            Arc::new(StringArray::from(vec!["a", "b", "a"])),
+            Arc::new(StringArray::from(vec!["alpha", "bravo", "charlie"])),
+            Arc::new(Int64Array::from(vec![1, 2, 3])),
+        ],
+    )?;
+    let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?;
+    ctx.register_table("t", Arc::new(table))?;
+
+    // Supported path: numeric min/max with UTF-8 grouping should still use TopK aggregation
+    // and return correct results.
+    let supported_df = ctx
+        .sql("SELECT g, max(val_num) AS m FROM t GROUP BY g ORDER BY m DESC LIMIT 1")
+        .await?;
+    let supported_batches = supported_df.collect().await?;
+    assert_batches_eq!(
+        &[
+            "+---+---+",
+            "| g | m |",
+            "+---+---+",
+            "| a | 3 |",
+            "+---+---+"
+        ],
+        &supported_batches
+    );
+
+    // Unsupported TopK value type: string min/max should fall back without panicking.
+    let unsupported_df = ctx
+        .sql("SELECT g, max(val_str) AS s FROM t GROUP BY g ORDER BY s DESC LIMIT 1")
+        .await?;
+    let unsupported_plan = unsupported_df.clone().create_physical_plan().await?;
+    let unsupported_batches = unsupported_df.collect().await?;
+
+    // Ensure the plan avoided the TopK-specific stream implementation.
+    let plan_display = displayable(unsupported_plan.as_ref())
+        .indent(true)
+        .to_string();
+    assert!(
+        !plan_display.contains("GroupedTopKAggregateStream"),
+        "Unsupported UTF-8 aggregate value should not use TopK: {plan_display}"
+    );
+
+    assert_batches_eq!(
+        &[
+            "+---+---------+",
+            "| g | s       |",
+            "+---+---------+",
+            "| a | charlie |",
+            "+---+---------+"
+        ],
+        &unsupported_batches
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_count_distinct_optimization() -> Result<()> {
+    struct TestCase {
+        name: &'static str,
+        distinct_count: Precision<usize>,
+        use_column_expr: bool,
+        expect_optimized: bool,
+        expected_value: Option<i64>,
+    }
+
+    let cases = vec![
+        TestCase {
+            name: "exact statistics",
+            distinct_count: Precision::Exact(42),
+            use_column_expr: true,
+            expect_optimized: true,
+            expected_value: Some(42),
+        },
+        TestCase {
+            name: "absent statistics",
+            distinct_count: Precision::Absent,
+            use_column_expr: true,
+            expect_optimized: false,
+            expected_value: None,
+        },
+        TestCase {
+            name: "inexact statistics",
+            distinct_count: Precision::Inexact(42),
+            use_column_expr: true,
+            expect_optimized: false,
+            expected_value: None,
+        },
+        TestCase {
+            name: "non-column expression with exact statistics",
+            distinct_count: Precision::Exact(42),
+            use_column_expr: false,
+            expect_optimized: false,
+            expected_value: None,
+        },
+    ];
+
+    for case in cases {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]));
+
+        let statistics = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: case.distinct_count,
+                    null_count: Precision::Exact(10),
+                    ..Default::default()
+                },
+                ColumnStatistics::default(),
+            ],
+        };
+
+        let config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::parse("test:///").unwrap(),
+            Arc::new(ParquetSource::new(Arc::clone(&schema))),
+        )
+        .with_file(PartitionedFile::new("x".to_string(), 100))
+        .with_statistics(statistics)
+        .build();
+
+        let source: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
+        let schema = source.schema();
+
+        let (agg_args, alias): (Vec<Arc<dyn datafusion_physical_expr::PhysicalExpr>>, _) =
+            if case.use_column_expr {
+                (vec![expressions::col("a", &schema)?], "COUNT(DISTINCT a)")
+            } else {
+                (
+                    vec![expressions::binary(
+                        expressions::col("a", &schema)?,
+                        Operator::Plus,
+                        expressions::col("b", &schema)?,
+                        &schema,
+                    )?],
+                    "COUNT(DISTINCT a + b)",
+                )
+            };
+
+        let count_distinct_expr = AggregateExprBuilder::new(count_udaf(), agg_args)
+            .schema(Arc::clone(&schema))
+            .alias(alias)
+            .distinct()
+            .build()?;
+
+        let partial_agg = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(count_distinct_expr.clone())],
+            vec![None],
+            source,
+            Arc::clone(&schema),
+        )?;
+
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(count_distinct_expr)],
+            vec![None],
+            Arc::new(partial_agg),
+            Arc::clone(&schema),
+        )?;
+
+        let conf = ConfigOptions::new();
+        let optimized =
+            AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?;
+
+        if case.expect_optimized {
+            assert!(
+                optimized.as_any().is::<ProjectionExec>(),
+                "'{}': expected ProjectionExec",
+                case.name
+            );
+
+            if let Some(expected_val) = case.expected_value {
+                let task_ctx = Arc::new(TaskContext::default());
+                let result = common::collect(optimized.execute(0, task_ctx)?).await?;
+                assert_eq!(result.len(), 1, "'{}': expected 1 batch", case.name);
+                assert_eq!(
+                    as_int64_array(result[0].column(0)).unwrap().values(),
+                    &[expected_val],
+                    "'{}': unexpected value",
+                    case.name
+                );
+            }
+        } else {
+            assert!(
+                optimized.as_any().is::<AggregateExec>(),
+                "'{}': expected AggregateExec (not optimized)",
+                case.name
+            );
+        }
+    }
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
index 568be0d18f245..9e63c341c92d9 100644
--- a/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
+++ b/datafusion/core/tests/physical_optimizer/combine_partial_final_agg.rs
@@ -20,45 +20,40 @@
 //! Note these tests are not in the same module as the optimizer pass because
 //! they rely on `DataSourceExec` which is in the core crate.
 
+use insta::assert_snapshot;
 use std::sync::Arc;
 
-use crate::physical_optimizer::test_utils::{parquet_exec, trim_plan_display};
+use crate::physical_optimizer::test_utils::parquet_exec;
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
 use datafusion_functions_aggregate::count::count_udaf;
 use datafusion_functions_aggregate::sum::sum_udaf;
+use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
 use datafusion_physical_expr::expressions::{col, lit};
-use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate;
+use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
 };
 use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::ExecutionPlan;
 
 /// Runs the CombinePartialFinalAggregate optimizer and asserts the plan against the expected
 macro_rules! assert_optimized {
-    ($EXPECTED_LINES: expr, $PLAN: expr) => {
-        let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect();
-
+    ($PLAN: expr, @ $EXPECTED_LINES: literal $(,)?) => {
         // run optimizer
         let optimizer = CombinePartialFinalAggregate {};
         let config = ConfigOptions::new();
         let optimized = optimizer.optimize($PLAN, &config)?;
         // Now format correctly
         let plan = displayable(optimized.as_ref()).indent(true).to_string();
-        let actual_lines = trim_plan_display(&plan);
+        let actual_lines = plan.trim();
 
-        assert_eq!(
-            &expected_lines, &actual_lines,
-            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
-            expected_lines, actual_lines
-        );
+        assert_snapshot!(actual_lines, @ $EXPECTED_LINES);
     };
 }
 
@@ -136,7 +131,7 @@ fn aggregations_not_combined() -> datafusion_common::Result<()> {
 
     let plan = final_aggregate_exec(
         repartition_exec(partial_aggregate_exec(
-            parquet_exec(&schema),
+            parquet_exec(schema.clone()),
             PhysicalGroupBy::default(),
             aggr_expr.clone(),
         )),
@@ -144,20 +139,22 @@ fn aggregations_not_combined() -> datafusion_common::Result<()> {
         aggr_expr,
     );
     // should not combine the Partial/Final AggregateExecs
-    let expected = &[
-        "AggregateExec: mode=Final, gby=[], aggr=[COUNT(1)]",
-        "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet",
-    ];
-    assert_optimized!(expected, plan);
+    assert_optimized!(
+        plan,
+        @ r"
+    AggregateExec: mode=Final, gby=[], aggr=[COUNT(1)]
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
+    "
+    );
 
     let aggr_expr1 = vec![count_expr(lit(1i8), "COUNT(1)", &schema)];
     let aggr_expr2 = vec![count_expr(lit(1i8), "COUNT(2)", &schema)];
 
     let plan = final_aggregate_exec(
         partial_aggregate_exec(
-            parquet_exec(&schema),
+            parquet_exec(schema),
             PhysicalGroupBy::default(),
             aggr_expr1,
         ),
@@ -165,13 +162,14 @@ fn aggregations_not_combined() -> datafusion_common::Result<()> {
         aggr_expr2,
     );
     // should not combine the Partial/Final AggregateExecs
-    let expected = &[
-        "AggregateExec: mode=Final, gby=[], aggr=[COUNT(2)]",
-        "AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet",
-    ];
-
-    assert_optimized!(expected, plan);
+    assert_optimized!(
+        plan,
+        @ r"
+    AggregateExec: mode=Final, gby=[], aggr=[COUNT(2)]
+      AggregateExec: mode=Partial, gby=[], aggr=[COUNT(1)]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
+    "
+    );
 
     Ok(())
 }
@@ -183,7 +181,7 @@ fn aggregations_combined() -> datafusion_common::Result<()> {
 
     let plan = final_aggregate_exec(
         partial_aggregate_exec(
-            parquet_exec(&schema),
+            parquet_exec(schema),
             PhysicalGroupBy::default(),
             aggr_expr.clone(),
         ),
@@ -191,12 +189,13 @@ fn aggregations_combined() -> datafusion_common::Result<()> {
         aggr_expr,
     );
     // should combine the Partial/Final AggregateExecs to the Single AggregateExec
-    let expected = &[
-        "AggregateExec: mode=Single, gby=[], aggr=[COUNT(1)]",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet",
-    ];
-
-    assert_optimized!(expected, plan);
+    assert_optimized!(
+        plan,
+        @ r"
+    AggregateExec: mode=Single, gby=[], aggr=[COUNT(1)]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
+    "
+    );
     Ok(())
 }
 
@@ -215,11 +214,8 @@ fn aggregations_with_group_combined() -> datafusion_common::Result<()> {
         vec![(col("c", &schema)?, "c".to_string())];
 
     let partial_group_by = PhysicalGroupBy::new_single(groups);
-    let partial_agg = partial_aggregate_exec(
-        parquet_exec(&schema),
-        partial_group_by,
-        aggr_expr.clone(),
-    );
+    let partial_agg =
+        partial_aggregate_exec(parquet_exec(schema), partial_group_by, aggr_expr.clone());
 
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("c", &partial_agg.schema())?, "c".to_string())];
@@ -227,12 +223,13 @@ fn aggregations_with_group_combined() -> datafusion_common::Result<()> {
 
     let plan = final_aggregate_exec(partial_agg, final_group_by, aggr_expr);
     // should combine the Partial/Final AggregateExecs to the Single AggregateExec
-    let expected = &[
-        "AggregateExec: mode=Single, gby=[c@2 as c], aggr=[Sum(b)]",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet",
-    ];
-
-    assert_optimized!(expected, plan);
+    assert_optimized!(
+        plan,
+        @ r"
+    AggregateExec: mode=Single, gby=[c@2 as c], aggr=[Sum(b)]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
+    "
+    );
     Ok(())
 }
 
@@ -245,11 +242,8 @@ fn aggregations_with_limit_combined() -> datafusion_common::Result<()> {
         vec![(col("c", &schema)?, "c".to_string())];
 
     let partial_group_by = PhysicalGroupBy::new_single(groups);
-    let partial_agg = partial_aggregate_exec(
-        parquet_exec(&schema),
-        partial_group_by,
-        aggr_expr.clone(),
-    );
+    let partial_agg =
+        partial_aggregate_exec(parquet_exec(schema), partial_group_by, aggr_expr.clone());
 
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("c", &partial_agg.schema())?, "c".to_string())];
@@ -266,16 +260,17 @@ fn aggregations_with_limit_combined() -> datafusion_common::Result<()> {
             schema,
         )
         .unwrap()
-        .with_limit(Some(5)),
+        .with_limit_options(Some(LimitOptions::new(5))),
     );
     let plan: Arc<dyn ExecutionPlan> = final_agg;
     // should combine the Partial/Final AggregateExecs to a Single AggregateExec
     // with the final limit preserved
-    let expected = &[
-        "AggregateExec: mode=Single, gby=[c@2 as c], aggr=[], lim=[5]",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet",
-    ];
-
-    assert_optimized!(expected, plan);
+    assert_optimized!(
+        plan,
+        @ r"
+    AggregateExec: mode=Single, gby=[c@2 as c], aggr=[], lim=[5]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c], file_type=parquet
+    "
+    );
     Ok(())
 }
diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
index 4034800c30cba..993798ff7539f 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs
@@ -20,59 +20,100 @@ use std::ops::Deref;
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
-    check_integrity, coalesce_partitions_exec, repartition_exec, schema,
-    sort_merge_join_exec, sort_preserving_merge_exec,
-};
-use crate::physical_optimizer::test_utils::{
-    parquet_exec_with_sort, parquet_exec_with_stats,
+    check_integrity, coalesce_partitions_exec, parquet_exec_with_sort,
+    parquet_exec_with_stats, repartition_exec, schema, sort_exec,
+    sort_exec_with_preserve_partitioning, sort_merge_join_exec,
+    sort_preserving_merge_exec, union_exec,
 };
 
-use arrow::array::{RecordBatch, UInt64Array, UInt8Array};
+use arrow::array::{RecordBatch, UInt8Array, UInt64Array};
 use arrow::compute::SortOptions;
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
 use datafusion::config::ConfigOptions;
+use datafusion::datasource::MemTable;
 use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::{CsvSource, ParquetSource};
 use datafusion::datasource::source::DataSourceExec;
-use datafusion::datasource::MemTable;
 use datafusion::prelude::{SessionConfig, SessionContext};
-use datafusion_common::error::Result;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::CsvOptions;
+use datafusion_common::error::Result;
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
 use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_expr::{JoinType, Operator};
-use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal};
-use datafusion_physical_expr::PhysicalExpr;
-use datafusion_physical_expr::{
-    expressions::binary, expressions::lit, LexOrdering, PhysicalSortExpr,
+use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal, binary, lit};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, OrderingRequirements, PhysicalSortExpr,
 };
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_optimizer::enforce_distribution::*;
 use datafusion_physical_optimizer::enforce_sorting::EnforceSorting;
 use datafusion_physical_optimizer::output_requirements::OutputRequirements;
-use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_plan::aggregates::{
     AggregateExec, AggregateMode, PhysicalGroupBy,
 };
-use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
+
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::execution_plan::ExecutionPlan;
 use datafusion_physical_plan::expressions::col;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::JoinOn;
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
-use datafusion_physical_plan::projection::ProjectionExec;
-use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::union::UnionExec;
-use datafusion_physical_plan::ExecutionPlanProperties;
-use datafusion_physical_plan::PlanProperties;
 use datafusion_physical_plan::{
-    get_plan_string, DisplayAs, DisplayFormatType, Statistics,
+    DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, displayable,
 };
+use insta::Settings;
+
+/// Helper function to replace only the first occurrence of a regex pattern in a plan
+/// Returns (captured_group_1, modified_string)
+fn hide_first(
+    plan: &dyn ExecutionPlan,
+    regex: &str,
+    replacement: &str,
+) -> (String, String) {
+    let plan_str = displayable(plan).indent(true).to_string();
+    let pattern = regex::Regex::new(regex).unwrap();
+
+    if let Some(captures) = pattern.captures(&plan_str) {
+        let full_match = captures.get(0).unwrap();
+        let captured_value = captures
+            .get(1)
+            .map(|m| m.as_str().to_string())
+            .unwrap_or_default();
+        let pos = full_match.start();
+        let end_pos = full_match.end();
+        let mut result = String::with_capacity(plan_str.len());
+        result.push_str(&plan_str[..pos]);
+        result.push_str(replacement);
+        result.push_str(&plan_str[end_pos..]);
+        (captured_value, result)
+    } else {
+        (String::new(), plan_str)
+    }
+}
+
+macro_rules! assert_plan {
+    ($plan: expr, @ $expected:literal) => {
+        insta::assert_snapshot!(
+            displayable($plan.as_ref()).indent(true).to_string(),
+            @ $expected
+        )
+    };
+    ($plan: expr, $another_plan: expr) => {
+        let plan1 = displayable($plan.as_ref()).indent(true).to_string();
+        let plan2 = displayable($another_plan.as_ref()).indent(true).to_string();
+        assert_eq!(plan1, plan2);
+    }
+}
 
 /// Models operators like BoundedWindowExec that require an input
 /// ordering but is easy to construct
@@ -80,7 +121,7 @@ use datafusion_physical_plan::{
 struct SortRequiredExec {
     input: Arc<dyn ExecutionPlan>,
     expr: LexOrdering,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl SortRequiredExec {
@@ -92,7 +133,7 @@ impl SortRequiredExec {
         Self {
             input,
             expr: requirement,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -134,7 +175,7 @@ impl ExecutionPlan for SortRequiredExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -147,12 +188,8 @@ impl ExecutionPlan for SortRequiredExec {
     }
 
     // model that it requires the output ordering of its input
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
-        if self.expr.is_empty() {
-            vec![None]
-        } else {
-            vec![Some(LexRequirement::from(self.expr.clone()))]
-        }
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
+        vec![Some(OrderingRequirements::from(self.expr.clone()))]
     }
 
     fn with_new_children(
@@ -167,6 +204,20 @@ impl ExecutionPlan for SortRequiredExec {
         )))
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
     fn execute(
         &self,
         _partition: usize,
@@ -174,14 +225,10 @@ impl ExecutionPlan for SortRequiredExec {
     ) -> Result<datafusion_physical_plan::SendableRecordBatchStream> {
         unreachable!();
     }
-
-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
 }
 
 fn parquet_exec() -> Arc<DataSourceExec> {
-    parquet_exec_with_sort(vec![])
+    parquet_exec_with_sort(schema(), vec![])
 }
 
 fn parquet_exec_multiple() -> Arc<DataSourceExec> {
@@ -194,8 +241,7 @@ fn parquet_exec_multiple_sorted(
 ) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(schema())),
     )
     .with_file_groups(vec![
         FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]),
@@ -212,14 +258,19 @@ fn csv_exec() -> Arc<DataSourceExec> {
 }
 
 fn csv_exec_with_sort(output_ordering: Vec<LexOrdering>) -> Arc<DataSourceExec> {
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(CsvSource::new(false, b',', b'"')),
-    )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_output_ordering(output_ordering)
-    .build();
+    let config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), {
+            let options = CsvOptions {
+                has_header: Some(false),
+                delimiter: b',',
+                quote: b'"',
+                ..Default::default()
+            };
+            Arc::new(CsvSource::new(schema()).with_csv_options(options))
+        })
+        .with_file(PartitionedFile::new("x".to_string(), 100))
+        .with_output_ordering(output_ordering)
+        .build();
 
     DataSourceExec::from_data_source(config)
 }
@@ -230,17 +281,22 @@ fn csv_exec_multiple() -> Arc<DataSourceExec> {
 
 // Created a sorted parquet exec with multiple files
 fn csv_exec_multiple_sorted(output_ordering: Vec<LexOrdering>) -> Arc<DataSourceExec> {
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(CsvSource::new(false, b',', b'"')),
-    )
-    .with_file_groups(vec![
-        FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]),
-        FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]),
-    ])
-    .with_output_ordering(output_ordering)
-    .build();
+    let config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), {
+            let options = CsvOptions {
+                has_header: Some(false),
+                delimiter: b',',
+                quote: b'"',
+                ..Default::default()
+            };
+            Arc::new(CsvSource::new(schema()).with_csv_options(options))
+        })
+        .with_file_groups(vec![
+            FileGroup::new(vec![PartitionedFile::new("x".to_string(), 100)]),
+            FileGroup::new(vec![PartitionedFile::new("y".to_string(), 100)]),
+        ])
+        .with_output_ordering(output_ordering)
+        .build();
 
     DataSourceExec::from_data_source(config)
 }
@@ -251,7 +307,10 @@ fn projection_exec_with_alias(
 ) -> Arc<dyn ExecutionPlan> {
     let mut exprs = vec![];
     for (column, alias) in alias_pairs.iter() {
-        exprs.push((col(column, &input.schema()).unwrap(), alias.to_string()));
+        exprs.push(ProjectionExpr {
+            expr: col(column, &input.schema()).unwrap(),
+            alias: alias.to_string(),
+        });
     }
     Arc::new(ProjectionExec::try_new(exprs, input).unwrap())
 }
@@ -327,16 +386,6 @@ fn filter_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     Arc::new(FilterExec::try_new(predicate, input).unwrap())
 }
 
-fn sort_exec(
-    sort_exprs: LexOrdering,
-    input: Arc<dyn ExecutionPlan>,
-    preserve_partitioning: bool,
-) -> Arc<dyn ExecutionPlan> {
-    let new_sort = SortExec::new(sort_exprs, input)
-        .with_preserve_partitioning(preserve_partitioning);
-    Arc::new(new_sort)
-}
-
 fn limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     Arc::new(GlobalLimitExec::new(
         Arc::new(LocalLimitExec::new(input, 100)),
@@ -345,10 +394,6 @@ fn limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     ))
 }
 
-fn union_exec(input: Vec<Arc<dyn ExecutionPlan>>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(UnionExec::new(input))
-}
-
 fn sort_required_exec_with_req(
     input: Arc<dyn ExecutionPlan>,
     sort_exprs: LexOrdering,
@@ -371,22 +416,6 @@ fn ensure_distribution_helper(
     ensure_distribution(distribution_context, &config).map(|item| item.data.plan)
 }
 
-/// Test whether plan matches with expected plan
-macro_rules! plans_matches_expected {
-    ($EXPECTED_LINES: expr, $PLAN: expr) => {
-        let physical_plan = $PLAN;
-        let actual = get_plan_string(&physical_plan);
-
-        let expected_plan_lines: Vec<&str> = $EXPECTED_LINES
-            .iter().map(|s| *s).collect();
-
-        assert_eq!(
-            expected_plan_lines, actual,
-            "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
-    }
-}
-
 fn test_suite_default_config_options() -> ConfigOptions {
     let mut config = ConfigOptions::new();
 
@@ -463,15 +492,12 @@ impl TestConfig {
 
     /// Perform a series of runs using the current [`TestConfig`],
     /// assert the expected plan result,
-    /// and return the result plan (for potentional subsequent runs).
-    fn run(
+    /// and return the result plan (for potential subsequent runs).
+    fn try_to_plan(
         &self,
-        expected_lines: &[&str],
         plan: Arc<dyn ExecutionPlan>,
         optimizers_to_run: &[Run],
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let expected_lines: Vec<&str> = expected_lines.to_vec();
-
         // Add the ancillary output requirements operator at the start:
         let optimizer = OutputRequirements::new_add_mode();
         let mut optimized = optimizer.optimize(plan.clone(), &self.config)?;
@@ -526,30 +552,16 @@ impl TestConfig {
         let optimizer = OutputRequirements::new_remove_mode();
         let optimized = optimizer.optimize(optimized, &self.config)?;
 
-        // Now format correctly
-        let actual_lines = get_plan_string(&optimized);
-
-        assert_eq!(
-            &expected_lines, &actual_lines,
-            "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n"
-        );
-
         Ok(optimized)
     }
-}
-
-macro_rules! assert_plan_txt {
-    ($EXPECTED_LINES: expr, $PLAN: expr) => {
-        let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect();
-        // Now format correctly
-        let actual_lines = get_plan_string(&$PLAN);
 
-        assert_eq!(
-            &expected_lines, &actual_lines,
-            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
-            expected_lines, actual_lines
-        );
-    };
+    fn to_plan(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        optimizers_to_run: &[Run],
+    ) -> Arc<dyn ExecutionPlan> {
+        self.try_to_plan(plan, optimizers_to_run).unwrap()
+    }
 }
 
 #[test]
@@ -575,6 +587,8 @@ fn multi_hash_joins() -> Result<()> {
         JoinType::RightAnti,
     ];
 
+    let settings = Settings::clone_current();
+
     // Join on (a == b1)
     let join_on = vec![(
         Arc::new(Column::new_with_schema("a", &schema()).unwrap()) as _,
@@ -583,11 +597,17 @@ fn multi_hash_joins() -> Result<()> {
 
     for join_type in join_types {
         let join = hash_join_exec(left.clone(), right.clone(), &join_on, &join_type);
-        let join_plan = |shift| -> String {
-            format!("{}HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, b1@1)]", " ".repeat(shift))
-        };
-        let join_plan_indent2 = join_plan(2);
-        let join_plan_indent4 = join_plan(4);
+
+        let mut settings = settings.clone();
+        settings.add_filter(
+            // join_type={} replace with join_type=... to avoid snapshot name issue
+            format!("join_type={join_type}").as_str(),
+            "join_type=...",
+        );
+
+        insta::allow_duplicates! {
+            settings.bind( || {
+
 
         match join_type {
             JoinType::Inner
@@ -608,57 +628,60 @@ fn multi_hash_joins() -> Result<()> {
                     &top_join_on,
                     &join_type,
                 );
-                let top_join_plan =
-                    format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, c@2)]");
 
-                let expected = match join_type {
+                let test_config = TestConfig::default();
+                let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+                match join_type {
                     // Should include 3 RepartitionExecs
-                    JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => vec![
-                        top_join_plan.as_str(),
-                        &join_plan_indent2,
-                        "    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
+                    JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
+
+                                assert_plan!(plan_distrib, @r"
+                                HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)]
+                                  HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)]
+                                    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                      ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            },
                     // Should include 4 RepartitionExecs
-                    _ => vec![
-                        top_join_plan.as_str(),
-                        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                        &join_plan_indent4,
-                        "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
+                    _ => {
+                                assert_plan!(plan_distrib, @r"
+                                HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)]
+                                  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+                                    HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)]
+                                      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            },
                 };
 
-                let test_config = TestConfig::default();
-                test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-                test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?;
+
+                let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+                assert_plan!(plan_distrib, plan_sort);
             }
-            JoinType::RightSemi | JoinType::RightAnti => {}
+            JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {}
         }
 
+
+
         match join_type {
             JoinType::Inner
             | JoinType::Left
             | JoinType::Right
             | JoinType::Full
             | JoinType::RightSemi
-            | JoinType::RightAnti => {
+            | JoinType::RightAnti
+            | JoinType::RightMark => {
                 // This time we use (b1 == c) for top join
                 // Join on (b1 == c)
                 let top_join_on = vec![(
@@ -668,55 +691,58 @@ fn multi_hash_joins() -> Result<()> {
 
                 let top_join =
                     hash_join_exec(join, parquet_exec(), &top_join_on, &join_type);
-                let top_join_plan = match join_type {
-                    JoinType::RightSemi | JoinType::RightAnti =>
-                        format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@1, c@2)]"),
-                    _ =>
-                        format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@6, c@2)]"),
-                };
 
-                let expected = match join_type {
+                let test_config = TestConfig::default();
+                let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+                match join_type {
                     // Should include 3 RepartitionExecs
-                    JoinType::Inner | JoinType::Right | JoinType::RightSemi | JoinType::RightAnti =>
-                        vec![
-                            top_join_plan.as_str(),
-                            &join_plan_indent2,
-                            "    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                            "    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                            "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                            "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        ],
+                    JoinType::Inner | JoinType::Right => {
+                            assert_plan!(parquet_exec(), @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet");
+                            },
+                    // Should include 3 RepartitionExecs but have a different "on"
+                            JoinType::RightSemi | JoinType::RightAnti => {
+                            assert_plan!(plan_distrib, @r"
+                            HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@1, c@2)]
+                              HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)]
+                                RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                  ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                              RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                            ");
+
+                            }
+
                     // Should include 4 RepartitionExecs
-                    _ =>
-                        vec![
-                            top_join_plan.as_str(),
-                            "  RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10",
-                            &join_plan_indent4,
-                            "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                            "      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                            "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                            "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                            "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                            "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        ],
+                    _ => {
+                            assert_plan!(plan_distrib, @r"
+                            HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@6, c@2)]
+                              RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10
+                                HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)]
+                                  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                    ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                              RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                            ");
+
+                            },
                 };
 
-                let test_config = TestConfig::default();
-                test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-                test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?;
+
+                let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+                        assert_plan!(plan_distrib, plan_sort);
             }
             JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {}
         }
+
+                });
+            }
     }
 
     Ok(())
@@ -755,23 +781,24 @@ fn multi_joins_after_alias() -> Result<()> {
     );
 
     // Output partition need to respect the Alias and should not introduce additional RepartitionExec
-    let expected = &[
-        "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)]",
-        "  ProjectionExec: expr=[a@0 as a1, a@0 as a2]",
-        "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]",
-        "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "      RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)]
+      ProjectionExec: expr=[a@0 as a1, a@0 as a2]
+        HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]
+          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     // Join on (a2 == c)
     let top_join_on = vec![(
@@ -782,23 +809,24 @@ fn multi_joins_after_alias() -> Result<()> {
     let top_join = hash_join_exec(projection, right, &top_join_on, &JoinType::Inner);
 
     // Output partition need to respect the Alias and should not introduce additional RepartitionExec
-    let expected = &[
-        "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)]",
-        "  ProjectionExec: expr=[a@0 as a1, a@0 as a2]",
-        "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]",
-        "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "      RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)]
+      ProjectionExec: expr=[a@0 as a1, a@0 as a2]
+        HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]
+          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -834,26 +862,26 @@ fn multi_joins_after_multi_alias() -> Result<()> {
 
     // The Column 'a' has different meaning now after the two Projections
     // The original Output partition can not satisfy the Join requirements and need to add an additional RepartitionExec
-    let expected = &[
-        "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    ProjectionExec: expr=[c1@0 as a]",
-        "      ProjectionExec: expr=[c@2 as c1]",
-        "        HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]",
-        "          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "          RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10",
-        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        ProjectionExec: expr=[c1@0 as a]
+          ProjectionExec: expr=[c@2 as c1]
+            HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]
+              RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+              RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -879,22 +907,26 @@ fn join_after_agg_alias() -> Result<()> {
     let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner);
 
     // Only two RepartitionExecs added
-    let expected = &[
-        "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)]",
-        "  AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
-        "    RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
-        "      AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]",
-        "    RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10",
-        "      AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[]",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)]
+      AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]
+        RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10
+          AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]
+        RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10
+          AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[]
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -932,23 +964,27 @@ fn hash_join_key_ordering() -> Result<()> {
     let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner);
 
     // Only two RepartitionExecs added
-    let expected = &[
-        "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)]",
-        "  ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]",
-        "    AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]",
-        "      RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10",
-        "        AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]",
-        "    RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10",
-        "      AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)]
+      ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]
+        AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]
+          RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10
+            AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]
+              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]
+        RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10
+          AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1052,30 +1088,31 @@ fn multi_hash_join_key_ordering() -> Result<()> {
         Arc::new(FilterExec::try_new(predicate, top_join)?);
 
     // The bottom joins' join key ordering is adjusted based on the top join. And the top join should not introduce additional RepartitionExec
-    let expected = &[
-        "FilterExec: c@6 > 1",
-        "  HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)]",
-        "    ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]",
-        "      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]",
-        "        RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "        RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]",
-        "      RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "      RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, filter_top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, filter_top_join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib =
+        test_config.to_plan(filter_top_join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(
+        plan_distrib,
+        @r"
+    FilterExec: c@6 > 1
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)]
+        ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]
+          HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]
+            RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+            RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1
+              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+        HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]
+          RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=1
+            ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+    let plan_sort = test_config.to_plan(filter_top_join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1186,34 +1223,30 @@ fn reorder_join_keys_to_left_input() -> Result<()> {
             &top_join_on,
             &join_type,
         );
-        let top_join_plan =
-            format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)]", &join_type);
 
-        let reordered = reorder_join_keys_to_inputs(top_join)?;
+        let reordered = reorder_join_keys_to_inputs(top_join).unwrap();
 
         // The top joins' join key ordering is adjusted based on the children inputs.
-        let expected = &[
-            top_join_plan.as_str(),
-            "  ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)]",
-            "      RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10",
-            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "      RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10",
-            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-            "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "  HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]",
-            "    RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10",
-            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "    RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10",
-            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        ];
-
-        assert_plan_txt!(expected, reordered);
+        let (captured_join_type, modified_plan) =
+            hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=...");
+        assert_eq!(captured_join_type, join_type.to_string());
+
+        insta::allow_duplicates! {insta::assert_snapshot!(modified_plan, @r"
+        HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)]
+          ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]
+            HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)]
+              RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+              RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=1
+                ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]
+            RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+            RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1
+              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+        ");}
     }
 
     Ok(())
@@ -1320,34 +1353,28 @@ fn reorder_join_keys_to_right_input() -> Result<()> {
             &top_join_on,
             &join_type,
         );
-        let top_join_plan =
-            format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)]", &join_type);
 
-        let reordered = reorder_join_keys_to_inputs(top_join)?;
+        let reordered = reorder_join_keys_to_inputs(top_join).unwrap();
 
         // The top joins' join key ordering is adjusted based on the children inputs.
-        let expected = &[
-            top_join_plan.as_str(),
-            "  ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)]",
-            "      RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10",
-            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "      RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10",
-            "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-            "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "  HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]",
-            "    RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10",
-            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            "    RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10",
-            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]",
-            "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        ];
-
-        assert_plan_txt!(expected, reordered);
+        let (_, plan_str) =
+            hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=...");
+        insta::allow_duplicates! {insta::assert_snapshot!(plan_str, @r"
+        HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)]
+          ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]
+            HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)]
+              RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+              RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=1
+                ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]
+            RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+            RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=1
+              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+        ");}
     }
 
     Ok(())
@@ -1387,15 +1414,6 @@ fn multi_smj_joins() -> Result<()> {
     for join_type in join_types {
         let join =
             sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type);
-        let join_plan = |shift| -> String {
-            format!(
-                "{}SortMergeJoin: join_type={join_type}, on=[(a@0, b1@1)]",
-                " ".repeat(shift)
-            )
-        };
-        let join_plan_indent2 = join_plan(2);
-        let join_plan_indent6 = join_plan(6);
-        let join_plan_indent10 = join_plan(10);
 
         // Top join on (a == c)
         let top_join_on = vec![(
@@ -1404,235 +1422,220 @@ fn multi_smj_joins() -> Result<()> {
         )];
         let top_join =
             sort_merge_join_exec(join.clone(), parquet_exec(), &top_join_on, &join_type);
-        let top_join_plan =
-            format!("SortMergeJoin: join_type={join_type}, on=[(a@0, c@2)]");
-
-        let expected = match join_type {
-            // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
-            JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti =>
-                vec![
-                    top_join_plan.as_str(),
-                    &join_plan_indent2,
-                    "    SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-                    "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                    "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "    SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]",
-                    "      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                    "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                    "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]",
-                    "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                    "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                ],
-            // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs
-            // Since ordering of the left child is not preserved after SortMergeJoin
-            // when mode is Right, RightSemi, RightAnti, Full
-            // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases
-            //   when mode is Inner, Left, LeftSemi, LeftAnti
-            // Similarly, since partitioning of the left side is not preserved
-            // when mode is Right, RightSemi, RightAnti, Full
-            // - We need to add one additional Hash Repartition after SortMergeJoin in contrast the test
-            //   cases when mode is Inner, Left, LeftSemi, LeftAnti
-            _ => vec![
-                    top_join_plan.as_str(),
-                    // Below 2 operators are differences introduced, when join mode is changed
-                    "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-                    "    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                    &join_plan_indent6,
-                    "        SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-                    "          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                    "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "        SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]",
-                    "          RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                    "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                    "                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]",
-                    "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                    "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            ],
-        };
-        // TODO(wiedld): show different test result if enforce sorting first.
-        test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-
-        let expected_first_sort_enforcement = match join_type {
-            // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
-            JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti =>
-                vec![
-                    top_join_plan.as_str(),
-                    &join_plan_indent2,
-                    "    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-                    "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-                    "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC",
-                    "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "        SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]",
-                    "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                    "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC",
-                    "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                    "      SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-                    "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                ],
-            // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs
-            // Since ordering of the left child is not preserved after SortMergeJoin
-            // when mode is Right, RightSemi, RightAnti, Full
-            // - We need to add one additional SortExec after SortMergeJoin in contrast the test cases
-            //   when mode is Inner, Left, LeftSemi, LeftAnti
-            // Similarly, since partitioning of the left side is not preserved
-            // when mode is Right, RightSemi, RightAnti, Full
-            // - We need to add one additional Hash Repartition and Roundrobin repartition after
-            //   SortMergeJoin in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti
-            _ => vec![
-                top_join_plan.as_str(),
-                // Below 4 operators are differences introduced, when join mode is changed
-                "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-                "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                "      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-                "        CoalescePartitionsExec",
-                &join_plan_indent10,
-                "            RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-                "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                "                SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-                "                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                "            RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC",
-                "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                "                SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]",
-                "                  ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                "                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC",
-                "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                "      SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-                "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-            ],
-        };
-        // TODO(wiedld): show different test result if enforce distribution first.
-        test_config.run(
-            &expected_first_sort_enforcement,
-            top_join,
-            &SORT_DISTRIB_DISTRIB,
-        )?;
 
-        match join_type {
-            JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
-                // This time we use (b1 == c) for top join
-                // Join on (b1 == c)
-                let top_join_on = vec![(
-                    Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _,
-                    Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _,
-                )];
-                let top_join =
-                    sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type);
-                let top_join_plan =
-                    format!("SortMergeJoin: join_type={join_type}, on=[(b1@6, c@2)]");
-
-                let expected = match join_type {
-                    // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs
-                    JoinType::Inner | JoinType::Right => vec![
-                        top_join_plan.as_str(),
-                        &join_plan_indent2,
-                        "    SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-                        "      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "    SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]",
-                        "      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]",
-                        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
-                    // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs
-                    JoinType::Left | JoinType::Full => vec![
-                        top_join_plan.as_str(),
-                        "  SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]",
-                        "    RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10",
-                        &join_plan_indent6,
-                        "        SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-                        "          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-                        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "        SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]",
-                        "          RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10",
-                        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]",
-                        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
-                    // this match arm cannot be reached
-                    _ => unreachable!()
-                };
-                // TODO(wiedld): show different test result if enforce sorting first.
-                test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?;
-
-                let expected_first_sort_enforcement = match join_type {
-                    // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs
-                    JoinType::Inner | JoinType::Right => vec![
-                        top_join_plan.as_str(),
-                        &join_plan_indent2,
-                        "    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-                        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC",
-                        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "        SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]",
-                        "          ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC",
-                        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "      SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-                        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
-                    // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs
-                    JoinType::Left | JoinType::Full => vec![
-                        top_join_plan.as_str(),
-                        "  RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC",
-                        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "      SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]",
-                        "        CoalescePartitionsExec",
-                        &join_plan_indent10,
-                        "            RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-                        "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "                SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-                        "                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "            RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC",
-                        "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "                SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]",
-                        "                  ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
-                        "                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC",
-                        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-                        "      SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-                        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-                    ],
-                    // this match arm cannot be reached
-                    _ => unreachable!()
-                };
+        let mut settings = Settings::clone_current();
+        settings.add_filter(&format!("join_type={join_type}"), "join_type=...");
+
+        #[rustfmt::skip]
+        insta::allow_duplicates! {
+            settings.bind(|| {
+                let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+                match join_type {
+                    // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
+                    JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => {
+                        assert_plan!(plan_distrib, @r"
+                        SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+                          SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                            SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                              RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                            SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+                              RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                          SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+                            RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                        ");
+                    }
+                    // Should include 7 RepartitionExecs (4 hash, 3 round-robin), 4 SortExecs
+                    // Since ordering of the left child is not preserved after SortMergeJoinExec
+                    // when mode is Right, RightSemi, RightAnti, Full
+                    // - We need to add one additional SortExec after SortMergeJoinExec in contrast the test cases
+                    //   when mode is Inner, Left, LeftSemi, LeftAnti
+                    // Similarly, since partitioning of the left side is not preserved
+                    // when mode is Right, RightSemi, RightAnti, Full
+                    // - We need to add one additional Hash Repartition after SortMergeJoinExec in contrast the test
+                    //   cases when mode is Inner, Left, LeftSemi, LeftAnti
+                    _ => {
+                        assert_plan!(plan_distrib, @r"
+                        SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+                          SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                            RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+                              SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                                  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+                                  RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                    ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                          SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+                            RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                        ");
+                    }
+                }
 
-                // TODO(wiedld): show different test result if enforce distribution first.
-                test_config.run(
-                    &expected_first_sort_enforcement,
-                    top_join,
-                    &SORT_DISTRIB_DISTRIB,
-                )?;
-            }
-            _ => {}
+                let plan_sort = test_config.to_plan(top_join.clone(), &SORT_DISTRIB_DISTRIB);
+
+                match join_type {
+                    // Should include 6 RepartitionExecs (3 hash, 3 round-robin), 3 SortExecs
+                    JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti => {
+                        // TODO(wiedld): show different test result if enforce distribution first.
+                        assert_plan!(plan_sort, @r"
+                        SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+                          SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                            RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+                              SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                            RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+                              SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+                                ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                          RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+                            SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                        ");
+                    }
+                    // Should include 8 RepartitionExecs (4 hash, 8 round-robin), 4 SortExecs
+                    // Since ordering of the left child is not preserved after SortMergeJoinExec
+                    // when mode is Right, RightSemi, RightAnti, Full
+                    // - We need to add one additional SortExec after SortMergeJoinExec in contrast the test cases
+                    //   when mode is Inner, Left, LeftSemi, LeftAnti
+                    // Similarly, since partitioning of the left side is not preserved
+                    // when mode is Right, RightSemi, RightAnti, Full
+                    // - We need to add one additional Hash Repartition and Roundrobin repartition after
+                    //   SortMergeJoinExec in contrast the test cases when mode is Inner, Left, LeftSemi, LeftAnti
+                    _ => {
+                        // TODO(wiedld): show different test result if enforce distribution first.
+                        assert_plan!(plan_sort, @r"
+                        SortMergeJoinExec: join_type=..., on=[(a@0, c@2)]
+                          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+                            SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                              CoalescePartitionsExec
+                                SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+                                    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+                                    SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+                                      ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                          RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+                            SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                        ");
+                    }
+                }
+
+                match join_type {
+                    JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+                        // This time we use (b1 == c) for top join
+                        // Join on (b1 == c)
+                        let top_join_on = vec![(
+                            Arc::new(Column::new_with_schema("b1", &join.schema()).unwrap()) as _,
+                            Arc::new(Column::new_with_schema("c", &schema()).unwrap()) as _,
+                        )];
+                        let top_join = sort_merge_join_exec(join, parquet_exec(), &top_join_on, &join_type);
+
+                        let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT);
+
+                        match join_type {
+                            // Should include 6 RepartitionExecs(3 hash, 3 round-robin) and 3 SortExecs
+                            JoinType::Inner | JoinType::Right => {
+                                // TODO(wiedld): show different test result if enforce sorting first.
+                                assert_plan!(plan_distrib, @r"
+                                SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+                                  SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                    SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                                      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                    SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+                                      RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+                                    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            }
+                            // Should include 7 RepartitionExecs (4 hash, 3 round-robin) and 4 SortExecs
+                            JoinType::Left | JoinType::Full => {
+                                // TODO(wiedld): show different test result if enforce sorting first.
+                                assert_plan!(plan_distrib, @r"
+                                SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+                                  SortExec: expr=[b1@6 ASC], preserve_partitioning=[true]
+                                    RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10
+                                      SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                        SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                                          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+                                            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                        SortExec: expr=[b1@1 ASC], preserve_partitioning=[true]
+                                          RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1
+                                            ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+                                    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            }
+                            // this match arm cannot be reached
+                            _ => unreachable!()
+                        }
+
+                        let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB);
+
+                        match join_type {
+                            // Should include 6 RepartitionExecs (3 of them preserves order) and 3 SortExecs
+                            JoinType::Inner | JoinType::Right => {
+                                // TODO(wiedld): show different test result if enforce distribution first.
+                                assert_plan!(plan_sort, @r"
+                                SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+                                  SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                    RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+                                      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                    RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+                                      SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+                                        ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+                                    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            }
+                            // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs
+                            JoinType::Left | JoinType::Full => {
+                                // TODO(wiedld): show different test result if enforce distribution first.
+                                assert_plan!(plan_sort, @r"
+                                SortMergeJoinExec: join_type=..., on=[(b1@6, c@2)]
+                                  RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=1, maintains_sort_order=true
+                                    SortExec: expr=[b1@6 ASC], preserve_partitioning=[false]
+                                      CoalescePartitionsExec
+                                        SortMergeJoinExec: join_type=..., on=[(a@0, b1@1)]
+                                          RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+                                            SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                                              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                          RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=1, maintains_sort_order=true
+                                            SortExec: expr=[b1@1 ASC], preserve_partitioning=[false]
+                                              ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]
+                                                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=1, maintains_sort_order=true
+                                    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+                                ");
+                            }
+                            // this match arm cannot be reached
+                            _ => unreachable!()
+                        }
+                    }
+                    _ => {}
+                }
+            });
         }
     }
-
     Ok(())
 }
 
@@ -1688,52 +1691,50 @@ fn smj_join_key_ordering() -> Result<()> {
 
     // Test: run EnforceDistribution, then EnforceSort.
     // Only two RepartitionExecs added
-    let expected = &[
-        "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]",
-        "  SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]",
-        "    ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]",
-        "      ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]",
-        "        AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]",
-        "          RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10",
-        "            AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]",
-        "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]",
-        "    ProjectionExec: expr=[a@1 as a2, b@0 as b2]",
-        "      AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]",
-        "        RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10",
-        "          AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]",
-        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?;
+    let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib, @r"
+    SortMergeJoinExec: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]
+      SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]
+        ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]
+          ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]
+            AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]
+              RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10
+                AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]
+                  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]
+        ProjectionExec: expr=[a@1 as a2, b@0 as b2]
+          AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]
+            RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10
+              AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]
+                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]",
-        "  RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false]",
-        "        CoalescePartitionsExec",
-        "          ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]",
-        "            ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]",
-        "              AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]",
-        "                RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10",
-        "                  AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]",
-        "                    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false]",
-        "        CoalescePartitionsExec",
-        "          ProjectionExec: expr=[a@1 as a2, b@0 as b2]",
-        "            AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]",
-        "              RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10",
-        "                AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]",
-        "                  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, join, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort, @r"
+    SortMergeJoinExec: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]
+      RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false]
+          CoalescePartitionsExec
+            ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]
+              ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]
+                AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]
+                  RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10
+                    AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]
+                      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false]
+          CoalescePartitionsExec
+            ProjectionExec: expr=[a@1 as a2, b@0 as b2]
+              AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]
+                RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10
+                  AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]
+                    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -1742,17 +1743,15 @@ fn smj_join_key_ordering() -> Result<()> {
 fn merge_does_not_need_sort() -> Result<()> {
     // see https://github.com/apache/datafusion/issues/4331
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
 
     // Scan some sorted parquet files
     let exec = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
 
-    // CoalesceBatchesExec to mimic behavior after a filter
-    let exec = Arc::new(CoalesceBatchesExec::new(exec, 4096));
-
     // Merge from multiple parquet files and keep the data sorted
     let exec: Arc<dyn ExecutionPlan> =
         Arc::new(SortPreservingMergeExec::new(sort_key, exec));
@@ -1761,13 +1760,13 @@ fn merge_does_not_need_sort() -> Result<()> {
     //
     // The optimizer should not add an additional SortExec as the
     // data is already sorted
-    let expected = &[
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  CoalesceBatchesExec: target_batch_size=4096",
-        "    DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, exec.clone(), &DISTRIB_DISTRIB_SORT)?;
+    let plan_distrib = test_config.to_plan(exec.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                    @r"
+    SortPreservingMergeExec: [a@0 ASC]
+      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
     //
@@ -1775,13 +1774,13 @@ fn merge_does_not_need_sort() -> Result<()> {
     // (according to flag: PREFER_EXISTING_SORT)
     // hence in this case ordering lost during CoalescePartitionsExec and re-introduced with
     // SortExec at the top.
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    CoalesceBatchesExec: target_batch_size=4096",
-        "      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, exec, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(exec, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                    @r"
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -1800,32 +1799,33 @@ fn union_to_interleave() -> Result<()> {
     );
 
     //  Union
-    let plan = Arc::new(UnionExec::new(vec![left, right]));
+    let plan = UnionExec::try_new(vec![left, right])?;
 
     // final agg
     let plan =
         aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]);
 
     // Only two RepartitionExecs added, no final RepartitionExec required
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]",
-        "  AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]",
-        "    InterleaveExec",
-        "      AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
-        "        RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
-        "          AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
-        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "      AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
-        "        RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
-        "          AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
-        "            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]
+      AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]
+        InterleaveExec
+          AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]
+            RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10
+              AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]
+                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+          AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]
+            RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10
+              AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]
+                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1844,35 +1844,36 @@ fn union_not_to_interleave() -> Result<()> {
     );
 
     //  Union
-    let plan = Arc::new(UnionExec::new(vec![left, right]));
+    let plan = UnionExec::try_new(vec![left, right])?;
 
     // final agg
     let plan =
         aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]);
 
     // Only two RepartitionExecs added, no final RepartitionExec required
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20",
-        "    AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]",
-        "      UnionExec",
-        "        AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
-        "          RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
-        "            AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
-        "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "        AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
-        "          RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
-        "            AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
-        "              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     // TestConfig: Prefer existing union.
     let test_config = TestConfig::default().with_prefer_existing_union();
 
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]
+      RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20
+        AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]
+          UnionExec
+            AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]
+              RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10
+                AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]
+                  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+            AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]
+              RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10
+                AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]
+                  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1882,17 +1883,18 @@ fn added_repartition_to_single_partition() -> Result<()> {
     let alias = vec![("a".to_string(), "a".to_string())];
     let plan = aggregate_exec_with_alias(parquet_exec(), alias);
 
-    let expected = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(&expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(&expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1902,18 +1904,19 @@ fn repartition_deepest_node() -> Result<()> {
     let alias = vec![("a".to_string(), "a".to_string())];
     let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias);
 
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      FilterExec: c@2 = 0",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1922,19 +1925,20 @@ fn repartition_deepest_node() -> Result<()> {
 fn repartition_unsorted_limit() -> Result<()> {
     let plan = limit_exec(filter_exec(parquet_exec()));
 
-    let expected = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  CoalescePartitionsExec",
-        "    LocalLimitExec: fetch=100",
-        "      FilterExec: c@2 = 0",
-        // nothing sorts the data, so the local limit doesn't require sorted data either
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      CoalescePartitionsExec
+        LocalLimitExec: fetch=100
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // nothing sorts the data, so the local limit doesn't require sorted data either
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1942,23 +1946,25 @@ fn repartition_unsorted_limit() -> Result<()> {
 #[test]
 fn repartition_sorted_limit() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let plan = limit_exec(sort_exec(sort_key, parquet_exec(), false));
-
-    let expected = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  LocalLimitExec: fetch=100",
-        // data is sorted so can't repartition here
-        "    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
+    }]
+    .into();
+    let plan = limit_exec(sort_exec(sort_key, parquet_exec()));
 
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      LocalLimitExec: fetch=100
+        SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // data is sorted so can't repartition here
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -1966,28 +1972,30 @@ fn repartition_sorted_limit() -> Result<()> {
 #[test]
 fn repartition_sorted_limit_with_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_required_exec_with_req(
-        filter_exec(sort_exec(sort_key.clone(), parquet_exec(), false)),
+        filter_exec(sort_exec(sort_key.clone(), parquet_exec())),
         sort_key,
     );
 
-    let expected = &[
-        "SortRequiredExec: [c@2 ASC]",
-        "  FilterExec: c@2 = 0",
-        // We can use repartition here, ordering requirement by SortRequiredExec
-        // is still satisfied.
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortRequiredExec: [c@2 ASC]
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // We can use repartition here, ordering requirement by SortRequiredExec
+    // is still satisfied.
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2000,26 +2008,28 @@ fn repartition_ignores_limit() -> Result<()> {
         alias,
     );
 
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        GlobalLimitExec: skip=0, fetch=100",
-        "          CoalescePartitionsExec",
-        "            LocalLimitExec: fetch=100",
-        "              FilterExec: c@2 = 0",
-        // repartition should happen prior to the filter to maximize parallelism
-        "                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                  GlobalLimitExec: skip=0, fetch=100",
-        "                    LocalLimitExec: fetch=100",
-        // Expect no repartition to happen for local limit
-        "                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            GlobalLimitExec: skip=0, fetch=100
+              CoalescePartitionsExec
+                LocalLimitExec: fetch=100
+                  FilterExec: c@2 = 0
+                    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                      GlobalLimitExec: skip=0, fetch=100
+                        LocalLimitExec: fetch=100
+                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // repartition should happen prior to the filter to maximize parallelism
+    // Expect no repartition to happen for local limit (DataSourceExec)
+
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2028,19 +2038,20 @@ fn repartition_ignores_limit() -> Result<()> {
 fn repartition_ignores_union() -> Result<()> {
     let plan = union_exec(vec![parquet_exec(); 5]);
 
-    let expected = &[
-        "UnionExec",
-        // Expect no repartition of DataSourceExec
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    UnionExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Expect no repartition of DataSourceExec
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2049,21 +2060,22 @@ fn repartition_ignores_union() -> Result<()> {
 fn repartition_through_sort_preserving_merge() -> Result<()> {
     // sort preserving merge with non-sorted input
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_preserving_merge_exec(sort_key, parquet_exec());
 
-    // need resort as the data was not sorted correctly
-    let expected = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2072,33 +2084,35 @@ fn repartition_through_sort_preserving_merge() -> Result<()> {
 fn repartition_ignores_sort_preserving_merge() -> Result<()> {
     // sort preserving merge already sorted input,
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_preserving_merge_exec(
         sort_key.clone(),
         parquet_exec_multiple_sorted(vec![sort_key]),
     );
 
+    let test_config = TestConfig::default();
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
     // Test: run EnforceDistribution, then EnforceSort
-    //
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [c@2 ASC]
+      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
     // should not sort (as the data was already sorted)
     // should not repartition, since increased parallelism is not beneficial for SortPReservingMerge
-    let expected = &[
-        "SortPreservingMergeExec: [c@2 ASC]",
-        "  DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -2107,34 +2121,40 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> {
 fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> {
     // 2 sorted parquet files unioned (partitions are concatenated, sort is preserved)
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let input = union_exec(vec![parquet_exec_with_sort(vec![sort_key.clone()]); 2]);
+    }]
+    .into();
+    let input = union_exec(vec![
+        parquet_exec_with_sort(schema, vec![sort_key.clone()]);
+        2
+    ]);
     let plan = sort_preserving_merge_exec(sort_key, input);
 
+    let test_config = TestConfig::default();
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
     // Test: run EnforceDistribution, then EnforceSort.
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [c@2 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
     //
     // should not repartition / sort (as the data was already sorted)
-    let expected = &[
-        "SortPreservingMergeExec: [c@2 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
 
     // test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -2145,28 +2165,30 @@ fn repartition_does_not_destroy_sort() -> Result<()> {
     //  SortRequired
     //    Parquet(sorted)
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("d", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("d", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_required_exec_with_req(
-        filter_exec(parquet_exec_with_sort(vec![sort_key.clone()])),
+        filter_exec(parquet_exec_with_sort(schema, vec![sort_key.clone()])),
         sort_key,
     );
 
     // TestConfig: Prefer existing sort.
     let test_config = TestConfig::default().with_prefer_existing_sort();
 
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortRequiredExec: [d@3 ASC]
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet
+    ");
     // during repartitioning ordering is preserved
-    let expected = &[
-        "SortRequiredExec: [d@3 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet",
-    ];
-
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2183,33 +2205,37 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> {
     //    Parquet(unsorted)
 
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input1 = sort_required_exec_with_req(
-        parquet_exec_with_sort(vec![sort_key.clone()]),
+        parquet_exec_with_sort(schema, vec![sort_key.clone()]),
         sort_key,
     );
     let input2 = filter_exec(parquet_exec());
     let plan = union_exec(vec![input1, input2]);
 
+    let test_config = TestConfig::default();
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    UnionExec
+      SortRequiredExec: [c@2 ASC]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // union input 1: no repartitioning
+    // union input 2: should repartition
+    //
     // should not repartition below the SortRequired as that
     // branch doesn't benefit from increased parallelism
-    let expected = &[
-        "UnionExec",
-        // union input 1: no repartitioning
-        "  SortRequiredExec: [c@2 ASC]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-        // union input 2: should repartition
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
 
-    let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2217,44 +2243,45 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> {
 #[test]
 fn repartition_transitively_with_projection() -> Result<()> {
     let schema = schema();
-    let proj_exprs = vec![(
-        Arc::new(BinaryExpr::new(
-            col("a", &schema).unwrap(),
+    let proj_exprs = vec![ProjectionExpr {
+        expr: Arc::new(BinaryExpr::new(
+            col("a", &schema)?,
             Operator::Plus,
-            col("b", &schema).unwrap(),
-        )) as Arc<dyn PhysicalExpr>,
-        "sum".to_string(),
-    )];
+            col("b", &schema)?,
+        )) as _,
+        alias: "sum".to_string(),
+    }];
     // non sorted input
     let proj = Arc::new(ProjectionExec::try_new(proj_exprs, parquet_exec())?);
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("sum", &proj.schema()).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("sum", &proj.schema())?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_preserving_merge_exec(sort_key, proj);
 
-    // Test: run EnforceDistribution, then EnforceSort.
-    let expected = &[
-        "SortPreservingMergeExec: [sum@0 ASC]",
-        "  SortExec: expr=[sum@0 ASC], preserve_partitioning=[true]",
-        // Since this projection is not trivial, increasing parallelism is beneficial
-        "    ProjectionExec: expr=[a@0 + b@1 as sum]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [sum@0 ASC]
+      SortExec: expr=[sum@0 ASC], preserve_partitioning=[true]
+        ProjectionExec: expr=[a@0 + b@1 as sum]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[sum@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        // Since this projection is not trivial, increasing parallelism is beneficial
-        "    ProjectionExec: expr=[a@0 + b@1 as sum]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[sum@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        ProjectionExec: expr=[a@0 + b@1 as sum]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Since this projection is not trivial, increasing parallelism is beneficial
 
     Ok(())
 }
@@ -2262,10 +2289,11 @@ fn repartition_transitively_with_projection() -> Result<()> {
 #[test]
 fn repartition_ignores_transitively_with_projection() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let alias = vec![
         ("a".to_string(), "a".to_string()),
         ("b".to_string(), "b".to_string()),
@@ -2280,16 +2308,18 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> {
         sort_key,
     );
 
-    let expected = &[
-        "SortRequiredExec: [c@2 ASC]",
-        // Since this projection is trivial, increasing parallelism is not beneficial
-        "  ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]",
-        "    DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortRequiredExec: [c@2 ASC]
+      ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]
+        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    // Since this projection is trivial, increasing parallelism is not beneficial
+
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2297,10 +2327,11 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> {
 #[test]
 fn repartition_transitively_past_sort_with_projection() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let alias = vec![
         ("a".to_string(), "a".to_string()),
         ("b".to_string(), "b".to_string()),
@@ -2308,23 +2339,23 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> {
     ];
     let plan = sort_preserving_merge_exec(
         sort_key.clone(),
-        sort_exec(
+        sort_exec_with_preserve_partitioning(
             sort_key,
             projection_exec_with_alias(parquet_exec(), alias),
-            true,
         ),
     );
 
-    let expected = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // Since this projection is trivial, increasing parallelism is not beneficial
-        "  ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Since this projection is trivial, increasing parallelism is not beneficial
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -2332,34 +2363,37 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> {
 #[test]
 fn repartition_transitively_past_sort_with_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let plan = sort_exec(sort_key, filter_exec(parquet_exec()), false);
+    }]
+    .into();
+    let plan = sort_exec(sort_key, filter_exec(parquet_exec()));
 
-    // Test: run EnforceDistribution, then EnforceSort.
-    let expected = &[
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        // Expect repartition on the input to the sort (as it can benefit from additional parallelism)
-        "    FilterExec: c@2 = 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [a@0 ASC]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        FilterExec: c@2 = 0
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+
+    // Expect repartition on the input to the sort (as it can benefit from additional parallelism)
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    FilterExec: c@2 = 0",
-        // Expect repartition on the input of the filter (as it can benefit from additional parallelism)
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        FilterExec: c@2 = 0
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Expect repartition on the input of the filter (as it can benefit from additional parallelism)
 
     Ok(())
 }
@@ -2368,10 +2402,11 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> {
 #[cfg(feature = "parquet")]
 fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan = sort_exec(
         sort_key,
         projection_exec_with_alias(
@@ -2382,33 +2417,34 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()>
                 ("c".to_string(), "c".to_string()),
             ],
         ),
-        false,
     );
 
-    // Test: run EnforceDistribution, then EnforceSort.
-    let expected = &[
-        "SortPreservingMergeExec: [a@0 ASC]",
-        // Expect repartition on the input to the sort (as it can benefit from additional parallelism)
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "    ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]",
-        "      FilterExec: c@2 = 0",
-        // repartition is lowest down
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
+    let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [a@0 ASC]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+
+    // Expect repartition on the input to the sort (as it can benefit from additional parallelism)
+    // repartition is lowest down
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]",
-        "      FilterExec: c@2 = 0",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -2424,28 +2460,29 @@ fn parallelization_single_partition() -> Result<()> {
         .with_query_execution_partitions(2);
 
     // Test: with parquet
-    let expected_parquet = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        &expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+                                                                                        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+                                                                                        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2453,10 +2490,11 @@ fn parallelization_single_partition() -> Result<()> {
 #[test]
 fn parallelization_multiple_files() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
 
     let plan = filter_exec(parquet_exec_multiple_sorted(vec![sort_key.clone()]));
     let plan = sort_required_exec_with_req(plan, sort_key);
@@ -2468,40 +2506,31 @@ fn parallelization_multiple_files() -> Result<()> {
     // The groups must have only contiguous ranges of rows from the same file
     // if any group has rows from multiple files, the data is no longer sorted destroyed
     // https://github.com/apache/datafusion/issues/8451
-    let expected_with_3_target_partitions = [
-        "SortRequiredExec: [a@0 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
     let test_config_concurrency_3 =
         test_config.clone().with_query_execution_partitions(3);
-    test_config_concurrency_3.run(
-        &expected_with_3_target_partitions,
-        plan.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config_concurrency_3.run(
-        &expected_with_3_target_partitions,
-        plan.clone(),
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_3_distrib =
+        test_config_concurrency_3.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_3_distrib,
+                                                                                        @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
+    let plan_3_sort =
+        test_config_concurrency_3.to_plan(plan.clone(), &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_3_distrib, plan_3_sort);
 
-    let expected_with_8_target_partitions = [
-        "SortRequiredExec: [a@0 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
     let test_config_concurrency_8 = test_config.with_query_execution_partitions(8);
-    test_config_concurrency_8.run(
-        &expected_with_8_target_partitions,
-        plan.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config_concurrency_8.run(
-        &expected_with_8_target_partitions,
-        plan,
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_8_distrib =
+        test_config_concurrency_8.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_8_distrib,
+                                                                                        @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
+    let plan_8_sort = test_config_concurrency_8.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_8_distrib, plan_8_sort);
 
     Ok(())
 }
@@ -2518,46 +2547,55 @@ fn parallelization_compressed_csv() -> Result<()> {
         FileCompressionType::UNCOMPRESSED,
     ];
 
-    let expected_not_partitioned = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-
-    let expected_partitioned = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
+    #[rustfmt::skip]
+    insta::allow_duplicates! {
+        for compression_type in compression_types {
+            let plan = aggregate_exec_with_alias(
+                DataSourceExec::from_data_source(
+                    FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), {
+                        let options = CsvOptions {
+                            has_header: Some(false),
+                            delimiter: b',',
+                            quote: b'"',
+                            ..Default::default()
+                        };
+                        Arc::new(CsvSource::new(schema()).with_csv_options(options))
+                    })
+                    .with_file(PartitionedFile::new("x".to_string(), 100))
+                    .with_file_compression_type(compression_type)
+                    .build(),
+                ),
+                vec![("a".to_string(), "a".to_string())],
+            );
+            let test_config = TestConfig::default()
+                .with_query_execution_partitions(2)
+                .with_prefer_repartition_file_scans(10);
+
+            let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT);
+            if compression_type.is_compressed() {
+                // Compressed files cannot be partitioned
+                assert_plan!(plan_distrib,
+                    @r"
+                AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+                  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+                    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+                      RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+                        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+                ");
+            } else {
+                // Uncompressed files can be partitioned
+                assert_plan!(plan_distrib,
+                    @r"
+                AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+                  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+                    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+                      DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+                ");
+            }
 
-    for compression_type in compression_types {
-        let expected = if compression_type.is_compressed() {
-            &expected_not_partitioned[..]
-        } else {
-            &expected_partitioned[..]
-        };
-
-        let plan = aggregate_exec_with_alias(
-            DataSourceExec::from_data_source(
-                FileScanConfigBuilder::new(
-                    ObjectStoreUrl::parse("test:///").unwrap(),
-                    schema(),
-                    Arc::new(CsvSource::new(false, b',', b'"')),
-                )
-                .with_file(PartitionedFile::new("x".to_string(), 100))
-                .with_file_compression_type(compression_type)
-                .build(),
-            ),
-            vec![("a".to_string(), "a".to_string())],
-        );
-        let test_config = TestConfig::default()
-            .with_query_execution_partitions(2)
-            .with_prefer_repartition_file_scans(10);
-        test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-        test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?;
+            let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB);
+            assert_plan!(plan_distrib, plan_sort);
+        }
     }
     Ok(())
 }
@@ -2573,30 +2611,30 @@ fn parallelization_two_partitions() -> Result<()> {
         .with_prefer_repartition_file_scans(10);
 
     // Test: with parquet
-    let expected_parquet = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        // Plan already has two partitions
-        "      DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        &expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+                                                                                    @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Plan already has two partitions
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        // Plan already has two partitions
-        "      DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib, @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    // Plan already has two partitions
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2612,30 +2650,32 @@ fn parallelization_two_partitions_into_four() -> Result<()> {
         .with_prefer_repartition_file_scans(10);
 
     // Test: with parquet
-    let expected_parquet = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        // Multiple source files splitted across partitions
-        "      DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        &expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    // Multiple source files split across partitions
+    assert_plan!(plan_parquet_distrib,
+                                                                                    @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Multiple source files split across partitions
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = [
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        // Multiple source files splitted across partitions
-        "      DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    // Multiple source files split across partitions
+    assert_plan!(plan_csv_distrib, @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    // Multiple source files split across partitions
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2643,42 +2683,43 @@ fn parallelization_two_partitions_into_four() -> Result<()> {
 #[test]
 fn parallelization_sorted_limit() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let plan_parquet = limit_exec(sort_exec(sort_key.clone(), parquet_exec(), false));
-    let plan_csv = limit_exec(sort_exec(sort_key, csv_exec(), false));
+    }]
+    .into();
+    let plan_parquet = limit_exec(sort_exec(sort_key.clone(), parquet_exec()));
+    let plan_csv = limit_exec(sort_exec(sort_key, csv_exec()));
 
     let test_config = TestConfig::default();
 
     // Test: with parquet
-    let expected_parquet = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  LocalLimitExec: fetch=100",
-        // data is sorted so can't repartition here
-        "    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // Doesn't parallelize for SortExec without preserve_partitioning
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib, @r"
+    GlobalLimitExec: skip=0, fetch=100
+      LocalLimitExec: fetch=100
+        SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // data is sorted so can't repartition here
+    // Doesn't parallelize for SortExec without preserve_partitioning
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  LocalLimitExec: fetch=100",
-        // data is sorted so can't repartition here
-        "    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // Doesn't parallelize for SortExec without preserve_partitioning
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      LocalLimitExec: fetch=100
+        SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    // data is sorted so can't repartition here
+    // Doesn't parallelize for SortExec without preserve_partitioning
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2686,54 +2727,53 @@ fn parallelization_sorted_limit() -> Result<()> {
 #[test]
 fn parallelization_limit_with_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let plan_parquet = limit_exec(filter_exec(sort_exec(
-        sort_key.clone(),
-        parquet_exec(),
-        false,
-    )));
-    let plan_csv = limit_exec(filter_exec(sort_exec(sort_key, csv_exec(), false)));
+    }]
+    .into();
+    let plan_parquet =
+        limit_exec(filter_exec(sort_exec(sort_key.clone(), parquet_exec())));
+    let plan_csv = limit_exec(filter_exec(sort_exec(sort_key, csv_exec())));
 
     let test_config = TestConfig::default();
 
     // Test: with parquet
-    let expected_parquet = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  CoalescePartitionsExec",
-        "    LocalLimitExec: fetch=100",
-        "      FilterExec: c@2 = 0",
-        // even though data is sorted, we can use repartition here. Since
-        // ordering is not used in subsequent stages anyway.
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // SortExec doesn't benefit from input partitioning
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    // even though data is sorted, we can use repartition here. Since
+    // ordering is not used in subsequent stages anyway.
+    // SortExec doesn't benefit from input partitioning
+    assert_plan!(plan_parquet_distrib,
+        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      CoalescePartitionsExec
+        LocalLimitExec: fetch=100
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+              SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "GlobalLimitExec: skip=0, fetch=100",
-        "  CoalescePartitionsExec",
-        "    LocalLimitExec: fetch=100",
-        "      FilterExec: c@2 = 0",
-        // even though data is sorted, we can use repartition here. Since
-        // ordering is not used in subsequent stages anyway.
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // SortExec doesn't benefit from input partitioning
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    // even though data is sorted, we can use repartition here. Since
+    // ordering is not used in subsequent stages anyway.
+    // SortExec doesn't benefit from input partitioning
+    assert_plan!(plan_csv_distrib,
+                                                                                    @r"
+    GlobalLimitExec: skip=0, fetch=100
+      CoalescePartitionsExec
+        LocalLimitExec: fetch=100
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+              SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2751,48 +2791,49 @@ fn parallelization_ignores_limit() -> Result<()> {
     let test_config = TestConfig::default();
 
     // Test: with parquet
-    let expected_parquet = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        GlobalLimitExec: skip=0, fetch=100",
-        "          CoalescePartitionsExec",
-        "            LocalLimitExec: fetch=100",
-        "              FilterExec: c@2 = 0",
-        // repartition should happen prior to the filter to maximize parallelism
-        "                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                  GlobalLimitExec: skip=0, fetch=100",
-        // Limit doesn't benefit from input partitioning - no parallelism
-        "                    LocalLimitExec: fetch=100",
-        "                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            GlobalLimitExec: skip=0, fetch=100
+              CoalescePartitionsExec
+                LocalLimitExec: fetch=100
+                  FilterExec: c@2 = 0
+                    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                      GlobalLimitExec: skip=0, fetch=100
+                        LocalLimitExec: fetch=100
+                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // repartition should happen prior to the filter to maximize parallelism
+    // Limit doesn't benefit from input partitioning - no parallelism
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10",
-        "    AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        GlobalLimitExec: skip=0, fetch=100",
-        "          CoalescePartitionsExec",
-        "            LocalLimitExec: fetch=100",
-        "              FilterExec: c@2 = 0",
-        // repartition should happen prior to the filter to maximize parallelism
-        "                RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "                  GlobalLimitExec: skip=0, fetch=100",
-        // Limit doesn't benefit from input partitioning - no parallelism
-        "                    LocalLimitExec: fetch=100",
-        "                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            GlobalLimitExec: skip=0, fetch=100
+              CoalescePartitionsExec
+                LocalLimitExec: fetch=100
+                  FilterExec: c@2 = 0
+                    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                      GlobalLimitExec: skip=0, fetch=100
+                        LocalLimitExec: fetch=100
+                          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    // repartition should happen prior to the filter to maximize parallelism
+    // Limit doesn't benefit from input partitioning - no parallelism
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2805,34 +2846,35 @@ fn parallelization_union_inputs() -> Result<()> {
     let test_config = TestConfig::default();
 
     // Test: with parquet
-    let expected_parquet = &[
-        "UnionExec",
-        // Union doesn't benefit from input partitioning - no parallelism
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+                                                                                    @r"
+    UnionExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    // Union doesn't benefit from input partitioning - no parallelism
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "UnionExec",
-        // Union doesn't benefit from input partitioning - no parallelism
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+                                                                                    @r"
+    UnionExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    ");
+    // Union doesn't benefit from input partitioning - no parallelism
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2840,14 +2882,15 @@ fn parallelization_union_inputs() -> Result<()> {
 #[test]
 fn parallelization_prior_to_sort_preserving_merge() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     // sort preserving merge already sorted input,
     let plan_parquet = sort_preserving_merge_exec(
         sort_key.clone(),
-        parquet_exec_with_sort(vec![sort_key.clone()]),
+        parquet_exec_with_sort(schema, vec![sort_key.clone()]),
     );
     let plan_csv =
         sort_preserving_merge_exec(sort_key.clone(), csv_exec_with_sort(vec![sort_key]));
@@ -2858,22 +2901,21 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> {
     // parallelization is not beneficial for SortPreservingMerge
 
     // Test: with parquet
-    let expected_parquet = &[
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet"
+    );
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false"
+    );
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2881,13 +2923,17 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> {
 #[test]
 fn parallelization_sort_preserving_merge_with_union() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     // 2 sorted parquet files unioned (partitions are concatenated, sort is preserved)
     let input_parquet =
-        union_exec(vec![parquet_exec_with_sort(vec![sort_key.clone()]); 2]);
+        union_exec(vec![
+            parquet_exec_with_sort(schema, vec![sort_key.clone()]);
+            2
+        ]);
     let input_csv = union_exec(vec![csv_exec_with_sort(vec![sort_key.clone()]); 2]);
     let plan_parquet = sort_preserving_merge_exec(sort_key.clone(), input_parquet);
     let plan_csv = sort_preserving_merge_exec(sort_key, input_csv);
@@ -2899,54 +2945,47 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> {
     // should not sort (as the data was already sorted)
 
     // Test: with parquet
-    let expected_parquet = &[
-        "SortPreservingMergeExec: [c@2 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    let expected_parquet_first_sort_enforcement = &[
-        // no SPM
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // has coalesce
-        "  CoalescePartitionsExec",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet_first_sort_enforcement,
-        plan_parquet,
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+        @r"
+    SortPreservingMergeExec: [c@2 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_sort,
+        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    // no SPM
+    // has coalesce
 
     // Test: with csv
-    let expected_csv = &[
-        "SortPreservingMergeExec: [c@2 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    let expected_csv_first_sort_enforcement = &[
-        // no SPM
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        // has coalesce
-        "  CoalescePartitionsExec",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    test_config.run(
-        expected_csv_first_sort_enforcement,
-        plan_csv.clone(),
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+        @r"
+    SortPreservingMergeExec: [c@2 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+    ");
+    let plan_csv_sort = test_config.to_plan(plan_csv.clone(), &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_sort,
+        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+    ");
+    // no SPM
+    // has coalesce
 
     Ok(())
 }
@@ -2954,14 +2993,15 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> {
 #[test]
 fn parallelization_does_not_benefit() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     //  SortRequired
     //    Parquet(sorted)
     let plan_parquet = sort_required_exec_with_req(
-        parquet_exec_with_sort(vec![sort_key.clone()]),
+        parquet_exec_with_sort(schema, vec![sort_key.clone()]),
         sort_key.clone(),
     );
     let plan_csv =
@@ -2973,24 +3013,25 @@ fn parallelization_does_not_benefit() -> Result<()> {
     // no parallelization, because SortRequiredExec doesn't benefit from increased parallelism
 
     // Test: with parquet
-    let expected_parquet = &[
-        "SortRequiredExec: [c@2 ASC]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_parquet_distrib,
+        @r"
+    SortRequiredExec: [c@2 ASC]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     // Test: with csv
-    let expected_csv = &[
-        "SortRequiredExec: [c@2 ASC]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_csv_distrib,
+        @r"
+    SortRequiredExec: [c@2 ASC]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+    ");
+    let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_csv_distrib, plan_csv_sort);
 
     Ok(())
 }
@@ -2999,44 +3040,48 @@ fn parallelization_does_not_benefit() -> Result<()> {
 fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> {
     // sorted input
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
 
     //Projection(a as a2, b as b2)
     let alias_pairs: Vec<(String, String)> = vec![
         ("a".to_string(), "a2".to_string()),
         ("c".to_string(), "c2".to_string()),
     ];
-    let proj_parquet =
-        projection_exec_with_alias(parquet_exec_with_sort(vec![sort_key]), alias_pairs);
-    let sort_key_after_projection = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c2", &proj_parquet.schema()).unwrap(),
+    let proj_parquet = projection_exec_with_alias(
+        parquet_exec_with_sort(schema, vec![sort_key]),
+        alias_pairs,
+    );
+    let sort_key_after_projection = [PhysicalSortExpr {
+        expr: col("c2", &proj_parquet.schema())?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan_parquet =
         sort_preserving_merge_exec(sort_key_after_projection, proj_parquet);
-    let expected = &[
-        "SortPreservingMergeExec: [c2@1 ASC]",
-        "  ProjectionExec: expr=[a@0 as a2, c@2 as c2]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    plans_matches_expected!(expected, &plan_parquet);
 
+    assert_plan!(plan_parquet,
+        @r"
+    SortPreservingMergeExec: [c2@1 ASC]
+      ProjectionExec: expr=[a@0 as a2, c@2 as c2]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+
+    let test_config = TestConfig::default();
+    let plan_parquet_distrib =
+        test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT);
     // Expected Outcome:
     // data should not be repartitioned / resorted
-    let expected_parquet = &[
-        "ProjectionExec: expr=[a@0 as a2, c@2 as c2]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    let test_config = TestConfig::default();
-    test_config.run(
-        expected_parquet,
-        plan_parquet.clone(),
-        &DISTRIB_DISTRIB_SORT,
-    )?;
-    test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?;
+    assert_plan!(plan_parquet_distrib,
+                                                                                    @r"
+    ProjectionExec: expr=[a@0 as a2, c@2 as c2]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_parquet_distrib, plan_parquet_sort);
 
     Ok(())
 }
@@ -3045,10 +3090,11 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()>
 fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> {
     // sorted input
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
 
     //Projection(a as a2, b as b2)
     let alias_pairs: Vec<(String, String)> = vec![
@@ -3058,27 +3104,30 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> {
 
     let proj_csv =
         projection_exec_with_alias(csv_exec_with_sort(vec![sort_key]), alias_pairs);
-    let sort_key_after_projection = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c2", &proj_csv.schema()).unwrap(),
+    let sort_key_after_projection = [PhysicalSortExpr {
+        expr: col("c2", &proj_csv.schema())?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv);
-    let expected = &[
-        "SortPreservingMergeExec: [c2@1 ASC]",
-        "  ProjectionExec: expr=[a@0 as a2, c@2 as c2]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    plans_matches_expected!(expected, &plan_csv);
+    assert_plan!(plan_csv,
+                                                                                        @r"
+    SortPreservingMergeExec: [c2@1 ASC]
+      ProjectionExec: expr=[a@0 as a2, c@2 as c2]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+    ");
 
+    let test_config = TestConfig::default();
+    let plan_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    ProjectionExec: expr=[a@0 as a2, c@2 as c2]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false
+    ");
     // Expected Outcome:
     // data should not be repartitioned / resorted
-    let expected_csv = &[
-        "ProjectionExec: expr=[a@0 as a2, c@2 as c2]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false",
-    ];
-    let test_config = TestConfig::default();
-    test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?;
+    let plan_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3088,24 +3137,25 @@ fn remove_redundant_roundrobins() -> Result<()> {
     let input = parquet_exec();
     let repartition = repartition_exec(repartition_exec(input));
     let physical_plan = repartition_exec(filter_exec(repartition));
-    let expected = &[
-        "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    plans_matches_expected!(expected, &physical_plan);
-
-    let expected = &[
-        "FilterExec: c@2 = 0",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
+    assert_plan!(physical_plan,
+                                                                                        @r"
+    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     let test_config = TestConfig::default();
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    FilterExec: c@2 = 0
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3114,28 +3164,30 @@ fn remove_redundant_roundrobins() -> Result<()> {
 #[test]
 fn remove_unnecessary_spm_after_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
     let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input));
 
     // TestConfig: Prefer existing sort.
     let test_config = TestConfig::default().with_prefer_existing_sort();
 
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
     // Expected Outcome:
     // Original plan expects its output to be ordered by c@2 ASC.
     // This is still satisfied since, after filter that column is constant.
-    let expected = &[
-        "CoalescePartitionsExec",
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC",
-        "      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    CoalescePartitionsExec
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC
+          DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3144,24 +3196,27 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> {
 #[test]
 fn preserve_ordering_through_repartition() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("d", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("d", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
     let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input));
 
     // TestConfig: Prefer existing sort.
     let test_config = TestConfig::default().with_prefer_existing_sort();
 
-    let expected = &[
-        "SortPreservingMergeExec: [d@3 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC",
-        "      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [d@3 ASC]
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC
+          DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3169,38 +3224,37 @@ fn preserve_ordering_through_repartition() -> Result<()> {
 #[test]
 fn do_not_preserve_ordering_through_repartition() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
     let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input));
 
     let test_config = TestConfig::default();
 
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
     // Test: run EnforceDistribution, then EnforceSort.
-    let expected = &[
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "    FilterExec: c@2 = 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [a@0 ASC]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        FilterExec: c@2 = 0
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+            DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    FilterExec: c@2 = 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_first_sort_enforcement,
-        physical_plan,
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        FilterExec: c@2 = 0
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+            DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -3208,24 +3262,26 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> {
 #[test]
 fn no_need_for_sort_after_filter() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
     let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input));
 
-    let expected = &[
-        // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied.
-        "CoalescePartitionsExec",
-        // Since after this stage c is constant. c@2 ASC ordering is already satisfied.
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "      DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib, @r"
+    CoalescePartitionsExec
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+          DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
+    // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied.
+    // Since after this stage c is constant. c@2 ASC ordering is already satisfied.
 
     Ok(())
 }
@@ -3233,44 +3289,44 @@ fn no_need_for_sort_after_filter() -> Result<()> {
 #[test]
 fn do_not_preserve_ordering_through_repartition2() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key]);
 
-    let sort_req = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_req = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let physical_plan = sort_preserving_merge_exec(sort_req, filter_exec(input));
 
     let test_config = TestConfig::default();
 
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
     // Test: run EnforceDistribution, then EnforceSort.
-    let expected = &[
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "    FilterExec: c@2 = 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    SortPreservingMergeExec: [a@0 ASC]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        FilterExec: c@2 = 0
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+            DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
 
     // Test: result IS DIFFERENT, if EnforceSorting is run first:
-    let expected_first_sort_enforcement = &[
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "      FilterExec: c@2 = 0",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "          DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(
-        expected_first_sort_enforcement,
-        physical_plan,
-        &SORT_DISTRIB_DISTRIB,
-    )?;
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_sort,
+                                                                                        @r"
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+              DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -3278,21 +3334,24 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> {
 #[test]
 fn do_not_preserve_ordering_through_repartition3() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key]);
     let physical_plan = filter_exec(input);
 
-    let expected = &[
-        "FilterExec: c@2 = 0",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "    DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    FilterExec: c@2 = 0
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3300,36 +3359,34 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> {
 #[test]
 fn do_not_put_sort_when_input_is_invalid() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec();
     let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key);
-    let expected = &[
-        // Ordering requirement of sort required exec is NOT satisfied
-        // by existing ordering at the source.
-        "SortRequiredExec: [a@0 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    assert_plan_txt!(expected, physical_plan);
-
-    let expected = &[
-        "SortRequiredExec: [a@0 ASC]",
-        // Since at the start of the rule ordering requirement is not satisfied
-        // EnforceDistribution rule doesn't satisfy this requirement either.
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
+    // Ordering requirement of sort required exec is NOT satisfied
+    // by existing ordering at the source.
+    assert_plan!(physical_plan, @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     let mut config = ConfigOptions::new();
     config.execution.target_partitions = 10;
     config.optimizer.enable_round_robin_repartition = true;
     config.optimizer.prefer_existing_sort = false;
     let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?;
-    assert_plan_txt!(expected, dist_plan);
+    // Since at the start of the rule ordering requirement is not satisfied
+    // EnforceDistribution rule doesn't satisfy this requirement either.
+    assert_plan!(dist_plan, @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -3337,36 +3394,34 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> {
 #[test]
 fn put_sort_when_input_is_valid() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema).unwrap(),
+    let sort_key: LexOrdering = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]);
     let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key);
 
-    let expected = &[
-        // Ordering requirement of sort required exec is satisfied
-        // by existing ordering at the source.
-        "SortRequiredExec: [a@0 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
-    assert_plan_txt!(expected, physical_plan);
-
-    let expected = &[
-        // Since at the start of the rule ordering requirement is satisfied
-        // EnforceDistribution rule satisfy this requirement also.
-        "SortRequiredExec: [a@0 ASC]",
-        "  FilterExec: c@2 = 0",
-        "    DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
+    // Ordering requirement of sort required exec is satisfied
+    // by existing ordering at the source.
+    assert_plan!(physical_plan, @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     let mut config = ConfigOptions::new();
     config.execution.target_partitions = 10;
     config.optimizer.enable_round_robin_repartition = true;
     config.optimizer.prefer_existing_sort = false;
     let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?;
-    assert_plan_txt!(expected, dist_plan);
+    // Since at the start of the rule ordering requirement is satisfied
+    // EnforceDistribution rule satisfy this requirement also.
+    assert_plan!(dist_plan, @r"
+    SortRequiredExec: [a@0 ASC]
+      FilterExec: c@2 = 0
+        DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -3374,25 +3429,28 @@ fn put_sort_when_input_is_valid() -> Result<()> {
 #[test]
 fn do_not_add_unnecessary_hash() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let alias = vec![("a".to_string(), "a".to_string())];
-    let input = parquet_exec_with_sort(vec![sort_key]);
+    let input = parquet_exec_with_sort(schema, vec![sort_key]);
     let physical_plan = aggregate_exec_with_alias(input, alias);
 
     // TestConfig:
     // Make sure target partition number is 1. In this case hash repartition is unnecessary.
     let test_config = TestConfig::default().with_query_execution_partitions(1);
 
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "  AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3400,10 +3458,11 @@ fn do_not_add_unnecessary_hash() -> Result<()> {
 #[test]
 fn do_not_add_unnecessary_hash2() -> Result<()> {
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
+    let sort_key = [PhysicalSortExpr {
+        expr: col("c", &schema)?,
         options: SortOptions::default(),
-    }]);
+    }]
+    .into();
     let alias = vec![("a".to_string(), "a".to_string())];
     let input = parquet_exec_multiple_sorted(vec![sort_key]);
     let aggregate = aggregate_exec_with_alias(input, alias.clone());
@@ -3413,19 +3472,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> {
     // Make sure target partition number is larger than 2 (e.g partition number at the source).
     let test_config = TestConfig::default().with_query_execution_partitions(4);
 
-    let expected = &[
-        "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        // Since hash requirements of this operator is satisfied. There shouldn't be
-        // a hash repartition here
-        "  AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]",
-        "      RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4",
-        "        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]",
-        "          RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2",
-        "            DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+      AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+        AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]
+          RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+            AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]
+              RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+                DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet
+    ");
+    // Since hash requirements of this operator is satisfied. There shouldn't be
+    // a hash repartition here
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3433,19 +3494,19 @@ fn do_not_add_unnecessary_hash2() -> Result<()> {
 #[test]
 fn optimize_away_unnecessary_repartition() -> Result<()> {
     let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec()));
-    let expected = &[
-        "CoalescePartitionsExec",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    plans_matches_expected!(expected, physical_plan.clone());
-
-    let expected =
-        &["DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"];
+    assert_plan!(physical_plan,
+                                                                                        @r"
+    CoalescePartitionsExec
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     let test_config = TestConfig::default();
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3455,25 +3516,27 @@ fn optimize_away_unnecessary_repartition2() -> Result<()> {
     let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec(
         filter_exec(repartition_exec(parquet_exec())),
     )));
-    let expected = &[
-        "FilterExec: c@2 = 0",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    CoalescePartitionsExec",
-        "      FilterExec: c@2 = 0",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    plans_matches_expected!(expected, physical_plan.clone());
+    assert_plan!(physical_plan,
+                                                                                        @r"
+    FilterExec: c@2 = 0
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        CoalescePartitionsExec
+          FilterExec: c@2 = 0
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
-    let expected = &[
-        "FilterExec: c@2 = 0",
-        "  FilterExec: c@2 = 0",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
     let test_config = TestConfig::default();
-    test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?;
-    test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?;
+    let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT);
+    assert_plan!(plan_distrib,
+                                                                                        @r"
+    FilterExec: c@2 = 0
+      FilterExec: c@2 = 0
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
+    let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB);
+    assert_plan!(plan_distrib, plan_sort);
 
     Ok(())
 }
@@ -3489,34 +3552,35 @@ async fn test_distribute_sort_parquet() -> Result<()> {
     );
 
     let schema = schema();
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("c", &schema).unwrap(),
-        options: SortOptions::default(),
-    }]);
-    let physical_plan = sort_exec(sort_key, parquet_exec_with_stats(10000 * 8192), false);
+    let sort_key = [PhysicalSortExpr::new_default(col("c", &schema)?)].into();
+    let physical_plan = sort_exec(sort_key, parquet_exec_with_stats(10000 * 8192));
 
     // prior to optimization, this is the starting plan
-    let starting = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    plans_matches_expected!(starting, physical_plan.clone());
+    assert_plan!(physical_plan,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     // what the enforce distribution run does.
-    let expected = &[
-        "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan.clone(), &[Run::Distribution])?;
+    let plan_distribution =
+        test_config.to_plan(physical_plan.clone(), &[Run::Distribution]);
+    assert_plan!(plan_distribution,
+                                                                                        @r"
+    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
 
     // what the sort parallelization (in enforce sorting), does after the enforce distribution changes
-    let expected = &[
-        "SortPreservingMergeExec: [c@2 ASC]",
-        "  SortExec: expr=[c@2 ASC], preserve_partitioning=[true]",
-        "    DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet",
-    ];
-    test_config.run(expected, physical_plan, &[Run::Distribution, Run::Sorting])?;
+    let plan_both =
+        test_config.to_plan(physical_plan, &[Run::Distribution, Run::Sorting]);
+    assert_plan!(plan_both,
+                                                                                        @r"
+    SortPreservingMergeExec: [c@2 ASC]
+      SortExec: expr=[c@2 ASC], preserve_partitioning=[true]
+        DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet
+    ");
     Ok(())
 }
 
@@ -3541,12 +3605,12 @@ async fn test_distribute_sort_memtable() -> Result<()> {
     let physical_plan = dataframe.create_physical_plan().await?;
 
     // this is the final, optimized plan
-    let expected = &[
-        "SortPreservingMergeExec: [id@0 ASC NULLS LAST]",
-        "  SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]",
-        "    DataSourceExec: partitions=3, partition_sizes=[34, 33, 33]",
-    ];
-    plans_matches_expected!(expected, physical_plan);
+    assert_plan!(physical_plan,
+                                                                                        @r"
+    SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+      SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+        DataSourceExec: partitions=3, partition_sizes=[34, 33, 33]
+    ");
 
     Ok(())
 }
@@ -3583,16 +3647,12 @@ fn test_replace_order_preserving_variants_with_fetch() -> Result<()> {
     // Create a base plan
     let parquet_exec = parquet_exec();
 
-    let sort_expr = PhysicalSortExpr {
-        expr: Arc::new(Column::new("id", 0)),
-        options: SortOptions::default(),
-    };
-
-    let ordering = LexOrdering::new(vec![sort_expr]);
+    let sort_expr = PhysicalSortExpr::new_default(Arc::new(Column::new("id", 0)));
 
     // Create a SortPreservingMergeExec with fetch=5
     let spm_exec = Arc::new(
-        SortPreservingMergeExec::new(ordering, parquet_exec.clone()).with_fetch(Some(5)),
+        SortPreservingMergeExec::new([sort_expr].into(), parquet_exec.clone())
+            .with_fetch(Some(5)),
     );
 
     // Create distribution context
diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
index f7668c8aab11f..6349ff1cd109f 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
@@ -17,130 +17,119 @@
 
 use std::sync::Arc;
 
+use crate::memory_limit::DummyStreamPartition;
 use crate::physical_optimizer::test_utils::{
-    aggregate_exec, bounded_window_exec, check_integrity, coalesce_batches_exec,
-    coalesce_partitions_exec, create_test_schema, create_test_schema2,
-    create_test_schema3, filter_exec, global_limit_exec, hash_join_exec, limit_exec,
-    local_limit_exec, memory_exec, parquet_exec, repartition_exec, sort_exec,
+    RequirementsTestExec, aggregate_exec, bounded_window_exec,
+    bounded_window_exec_with_partition, check_integrity, coalesce_partitions_exec,
+    create_test_schema, create_test_schema2, create_test_schema3, filter_exec,
+    global_limit_exec, hash_join_exec, local_limit_exec, memory_exec, parquet_exec,
+    parquet_exec_with_sort, projection_exec, repartition_exec, sort_exec,
     sort_exec_with_fetch, sort_expr, sort_expr_options, sort_merge_join_exec,
     sort_preserving_merge_exec, sort_preserving_merge_exec_with_fetch,
-    spr_repartition_exec, stream_exec_ordered, union_exec, RequirementsTestExec,
+    spr_repartition_exec, stream_exec_ordered, union_exec,
 };
 
-use arrow::compute::SortOptions;
+use arrow::compute::{SortOptions};
 use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::config::ConfigOptions;
+use datafusion_common::config::{ConfigOptions, CsvOptions};
 use datafusion_common::tree_node::{TreeNode, TransformedResult};
-use datafusion_common::{Result, ScalarValue};
-use datafusion_expr::{JoinType, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition};
+use datafusion_common::{create_array, Result, TableReference};
+use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
+use datafusion_datasource::source::DataSourceExec;
+use datafusion_expr_common::operator::Operator;
+use datafusion_expr::{JoinType, SortExpr};
 use datafusion_execution::object_store::ObjectStoreUrl;
-use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
-use datafusion_physical_expr::expressions::{col, Column, NotExpr};
-use datafusion_physical_expr::Partitioning;
-use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, PhysicalSortExpr, PhysicalSortRequirement, OrderingRequirements
+};
+use datafusion_physical_expr::{Distribution, Partitioning};
+use datafusion_physical_expr::expressions::{col, BinaryExpr, Column, NotExpr};
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion_physical_plan::repartition::RepartitionExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::sorts::sort::SortExec;
-use datafusion_physical_plan::windows::{create_window_expr, BoundedWindowAggExec, WindowAggExec};
-use datafusion_physical_plan::{displayable, get_plan_string, ExecutionPlan, InputOrderMode};
-use datafusion::datasource::physical_plan::{CsvSource, ParquetSource};
+use datafusion_physical_plan::{displayable, get_plan_string, ExecutionPlan};
+use datafusion::datasource::physical_plan::CsvSource;
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion_physical_optimizer::enforce_sorting::{EnforceSorting, PlanWithCorrespondingCoalescePartitions, PlanWithCorrespondingSort, parallelize_sorts, ensure_sorting};
 use datafusion_physical_optimizer::enforce_sorting::replace_with_order_preserving_variants::{replace_with_order_preserving_variants, OrderPreservationContext};
 use datafusion_physical_optimizer::enforce_sorting::sort_pushdown::{SortPushDown, assign_initial_requirements, pushdown_sorts};
 use datafusion_physical_optimizer::enforce_distribution::EnforceDistribution;
+use datafusion_physical_optimizer::output_requirements::OutputRequirementExec;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
-use datafusion_functions_aggregate::average::avg_udaf;
-use datafusion_functions_aggregate::count::count_udaf;
-use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf};
-
-use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-use datafusion_datasource::source::DataSourceExec;
-use rstest::rstest;
-
-/// Create a csv exec for tests
-fn csv_exec_ordered(
-    schema: &SchemaRef,
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
-) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema.clone(),
-        Arc::new(CsvSource::new(true, 0, b'"')),
-    )
-    .with_file(PartitionedFile::new("file_path".to_string(), 100))
-    .with_output_ordering(vec![sort_exprs])
-    .build();
-
-    DataSourceExec::from_data_source(config)
-}
-
-/// Created a sorted parquet exec
-pub fn parquet_exec_sorted(
-    schema: &SchemaRef,
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
-) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-
-    let source = Arc::new(ParquetSource::default());
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema.clone(),
-        source,
-    )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_output_ordering(vec![sort_exprs])
-    .build();
+use datafusion::prelude::*;
+use arrow::array::{record_batch, ArrayRef, Int32Array, RecordBatch};
+use arrow::datatypes::{Field};
+use arrow_schema::Schema;
+use datafusion_execution::TaskContext;
+use datafusion_catalog::streaming::StreamingTable;
 
-    DataSourceExec::from_data_source(config)
-}
+use futures::StreamExt;
+use insta::{Settings, assert_snapshot};
 
 /// Create a sorted Csv exec
 fn csv_exec_sorted(
     schema: &SchemaRef,
     sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-
-    let config = FileScanConfigBuilder::new(
+    let options = CsvOptions {
+        has_header: Some(false),
+        delimiter: 0,
+        quote: 0,
+        ..Default::default()
+    };
+    let mut builder = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema.clone(),
-        Arc::new(CsvSource::new(false, 0, 0)),
+        Arc::new(CsvSource::new(schema.clone()).with_csv_options(options)),
     )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_output_ordering(vec![sort_exprs])
-    .build();
+    .with_file(PartitionedFile::new("x".to_string(), 100));
+    if let Some(ordering) = LexOrdering::new(sort_exprs) {
+        builder = builder.with_output_ordering(vec![ordering]);
+    }
 
+    let config = builder.build();
     DataSourceExec::from_data_source(config)
 }
 
 /// Runs the sort enforcement optimizer and asserts the plan
 /// against the original and expected plans
-///
-/// `$EXPECTED_PLAN_LINES`: input plan
-/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan
-/// `$PLAN`: the plan to optimized
-/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option.
-///
-macro_rules! assert_optimized {
-    ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr) => {
+pub(crate) struct EnforceSortingTest {
+    plan: Arc<dyn ExecutionPlan>,
+    repartition_sorts: bool,
+}
+
+impl EnforceSortingTest {
+    pub(crate) fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
+        Self {
+            plan,
+            repartition_sorts: false,
+        }
+    }
+
+    /// Set whether to repartition sorts
+    pub(crate) fn with_repartition_sorts(mut self, repartition_sorts: bool) -> Self {
+        self.repartition_sorts = repartition_sorts;
+        self
+    }
+
+    /// Runs the enforce sorting test and returns a string with the input and
+    /// optimized plan as strings for snapshot comparison using insta
+    pub(crate) fn run(&self) -> String {
         let mut config = ConfigOptions::new();
-        config.optimizer.repartition_sorts = $REPARTITION_SORTS;
+        config.optimizer.repartition_sorts = self.repartition_sorts;
 
         // This file has 4 rules that use tree node, apply these rules as in the
         // EnforceSorting::optimize implementation
         // After these operations tree nodes should be in a consistent state.
         // This code block makes sure that these rules doesn't violate tree node integrity.
         {
-            let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone());
+            let plan_requirements =
+                PlanWithCorrespondingSort::new_default(Arc::clone(&self.plan));
             let adjusted = plan_requirements
                 .transform_up(ensure_sorting)
                 .data()
-                .and_then(check_integrity)?;
+                .and_then(check_integrity)
+                .expect("check_integrity failed after ensure_sorting");
             // TODO: End state payloads will be checked here.
 
             let new_plan = if config.optimizer.repartition_sorts {
@@ -149,60 +138,60 @@ macro_rules! assert_optimized {
                 let parallel = plan_with_coalesce_partitions
                     .transform_up(parallelize_sorts)
                     .data()
-                    .and_then(check_integrity)?;
+                    .and_then(check_integrity)
+                    .expect("check_integrity failed after parallelize_sorts");
                 // TODO: End state payloads will be checked here.
                 parallel.plan
             } else {
                 adjusted.plan
             };
 
-            let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan);
+            let plan_with_pipeline_fixer =
+                OrderPreservationContext::new_default(new_plan);
             let updated_plan = plan_with_pipeline_fixer
                 .transform_up(|plan_with_pipeline_fixer| {
                     replace_with_order_preserving_variants(
                         plan_with_pipeline_fixer,
                         false,
                         true,
-                       &config,
+                        &config,
                     )
                 })
                 .data()
-                .and_then(check_integrity)?;
+                .and_then(check_integrity)
+                .expect(
+                    "check_integrity failed after replace_with_order_preserving_variants",
+                );
             // TODO: End state payloads will be checked here.
 
             let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan);
             assign_initial_requirements(&mut sort_pushdown);
-            check_integrity(pushdown_sorts(sort_pushdown)?)?;
+            check_integrity(
+                pushdown_sorts(sort_pushdown).expect("pushdown_sorts failed"),
+            )
+            .expect("check_integrity failed after pushdown_sorts");
             // TODO: End state payloads will be checked here.
         }
-
-        let physical_plan = $PLAN;
-        let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-        let actual: Vec<&str> = formatted.trim().lines().collect();
-
-        let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES
-            .iter().map(|s| *s).collect();
-
-        assert_eq!(
-            expected_plan_lines, actual,
-            "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
-
-        let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES
-            .iter().map(|s| *s).collect();
+        let input_plan_string = displayable(self.plan.as_ref()).indent(true).to_string();
 
         // Run the actual optimizer
-        let optimized_physical_plan =
-            EnforceSorting::new().optimize(physical_plan,&config)?;
+        let optimized_physical_plan = EnforceSorting::new()
+            .optimize(Arc::clone(&self.plan), &config)
+            .expect("enforce_sorting failed");
 
         // Get string representation of the plan
-        let actual = get_plan_string(&optimized_physical_plan);
-        assert_eq!(
-            expected_optimized_lines, actual,
-            "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
+        let optimized_plan_string = displayable(optimized_physical_plan.as_ref())
+            .indent(true)
+            .to_string();
 
-    };
+        if input_plan_string == optimized_plan_string {
+            format!("Input / Optimized Plan:\n{input_plan_string}",)
+        } else {
+            format!(
+                "Input Plan:\n{input_plan_string}\nOptimized Plan:\n{optimized_plan_string}",
+            )
+        }
+    }
 }
 
 #[tokio::test]
@@ -210,96 +199,97 @@ async fn test_remove_unnecessary_sort5() -> Result<()> {
     let left_schema = create_test_schema2()?;
     let right_schema = create_test_schema3()?;
     let left_input = memory_exec(&left_schema);
-    let parquet_sort_exprs = vec![sort_expr("a", &right_schema)];
-    let right_input = parquet_exec_sorted(&right_schema, parquet_sort_exprs);
-
+    let parquet_ordering = [sort_expr("a", &right_schema)].into();
+    let right_input =
+        parquet_exec_with_sort(right_schema.clone(), vec![parquet_ordering]);
     let on = vec![(
         Arc::new(Column::new_with_schema("col_a", &left_schema)?) as _,
         Arc::new(Column::new_with_schema("c", &right_schema)?) as _,
     )];
     let join = hash_join_exec(left_input, right_input, on, None, &JoinType::Inner)?;
-    let physical_plan = sort_exec(vec![sort_expr("a", &join.schema())], join);
-
-    let expected_input = ["SortExec: expr=[a@2 ASC], preserve_partitioning=[false]",
-        "  HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col_a@0, c@2)]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet"];
-
-    let expected_optimized = ["HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col_a@0, c@2)]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
-
+    let physical_plan = sort_exec([sort_expr("a", &join.schema())].into(), join);
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@2 ASC], preserve_partitioning=[false]
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col_a@0, c@2)]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+
+    Optimized Plan:
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col_a@0, c@2)]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    ");
     Ok(())
 }
 
 #[tokio::test]
 async fn test_do_not_remove_sort_with_limit() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort = sort_exec(sort_exprs.clone(), source1);
-    let limit = limit_exec(sort);
-
-    let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs);
-
+    ]
+    .into();
+    let sort = sort_exec(ordering.clone(), source1);
+    let limit = local_limit_exec(sort, 100);
+    let parquet_ordering = [sort_expr("nullable_col", &schema)].into();
+    let source2 = parquet_exec_with_sort(schema, vec![parquet_ordering]);
     let union = union_exec(vec![source2, limit]);
     let repartition = repartition_exec(union);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, repartition);
-
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "      GlobalLimitExec: skip=0, fetch=100",
-        "        LocalLimitExec: fetch=100",
-        "          SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-
+    let physical_plan = sort_preserving_merge_exec(ordering, repartition);
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+          LocalLimitExec: fetch=100
+            SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+          UnionExec
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+            LocalLimitExec: fetch=100
+              SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // We should keep the bottom `SortExec`.
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-        "      UnionExec",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "        GlobalLimitExec: skip=0, fetch=100",
-        "          LocalLimitExec: fetch=100",
-        "            SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
-
     Ok(())
 }
 
 #[tokio::test]
 async fn test_union_inputs_sorted() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), source1);
-
-    let source2 = parquet_exec_sorted(&schema, sort_exprs.clone());
-
+    let source1 = parquet_exec(schema.clone());
+    let ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let sort = sort_exec(ordering.clone(), source1);
+    let source2 = parquet_exec_with_sort(schema, vec![ordering.clone()]);
     let union = union_exec(vec![source2, sort]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, union);
+    let physical_plan = sort_preserving_merge_exec(ordering, union);
 
     // one input to the union is already sorted, one is not.
-    let expected_input = vec![
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-    ];
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // should not add a sort at the output of the union, input plan should not be changed
-    let expected_optimized = expected_input.clone();
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
 
     Ok(())
 }
@@ -307,31 +297,30 @@ async fn test_union_inputs_sorted() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), source1);
-
-    let parquet_sort_exprs = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let sort = sort_exec(ordering.clone(), source1);
+    let parquet_ordering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs);
-
+    ]
+    .into();
+    let source2 = parquet_exec_with_sort(schema, vec![parquet_ordering]);
     let union = union_exec(vec![source2, sort]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, union);
+    let physical_plan = sort_preserving_merge_exec(ordering, union);
 
     // one input to the union is already sorted, one is not.
-    let expected_input = vec![
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-    ];
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // should not add a sort at the output of the union, input plan should not be changed
-    let expected_optimized = expected_input.clone();
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
 
     Ok(())
 }
@@ -339,120 +328,216 @@ async fn test_union_inputs_different_sorted() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted2() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs = vec![
+    let source1 = parquet_exec(schema.clone());
+    let sort_exprs: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
+    ]
+    .into();
     let sort = sort_exec(sort_exprs.clone(), source1);
-
-    let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs);
-
+    let parquet_ordering = [sort_expr("nullable_col", &schema)].into();
+    let source2 = parquet_exec_with_sort(schema, vec![parquet_ordering]);
     let union = union_exec(vec![source2, sort]);
     let physical_plan = sort_preserving_merge_exec(sort_exprs, union);
 
     // Input is an invalid plan. In this case rule should add required sorting in appropriate places.
     // First DataSourceExec has output ordering(nullable_col@0 ASC). However, it doesn't satisfy the
     // required ordering of SortPreservingMergeExec.
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  UnionExec",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
 
     Ok(())
 }
 
 #[tokio::test]
-async fn test_union_inputs_different_sorted3() -> Result<()> {
+// Test with `repartition_sorts` enabled to preserve pre-sorted partitions and avoid resorting
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_true()
+-> Result<()> {
+    assert_snapshot!(
+        union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(true).await?,
+        @r"
+    Input Plan:
+    OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition
+      CoalescePartitionsExec
+        UnionExec
+          SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+
+    Optimized Plan:
+    OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition
+      SortPreservingMergeExec: [nullable_col@0 ASC]
+        UnionExec
+          SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+// Test with `repartition_sorts` disabled, causing a full resort of the data
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_with_repartition_sorts_false()
+-> Result<()> {
+    assert_snapshot!(
+        union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(false).await?,
+        @r"
+    Input Plan:
+    OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition
+      CoalescePartitionsExec
+        UnionExec
+          SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+
+    Optimized Plan:
+    OutputRequirementExec: order_by=[(nullable_col@0, asc)], dist_by=SinglePartition
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+        CoalescePartitionsExec
+          UnionExec
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+    ");
+    Ok(())
+}
+
+async fn union_with_mix_of_presorted_and_explicitly_resorted_inputs_impl(
+    repartition_sorts: bool,
+) -> Result<String> {
     let schema = create_test_schema()?;
 
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
-        sort_expr("nullable_col", &schema),
-        sort_expr("non_nullable_col", &schema),
-    ];
-    let sort1 = sort_exec(sort_exprs1, source1.clone());
-    let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
-    let sort2 = sort_exec(sort_exprs2, source1);
+    // Source 1, will be sorted explicitly (on `nullable_col`)
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [sort_expr("nullable_col", &schema)].into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+
+    // Source 2, pre-sorted (on `nullable_col`)
+    let parquet_ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let source2 = parquet_exec_with_sort(schema.clone(), vec![parquet_ordering.clone()]);
+
+    let union = union_exec(vec![sort1, source2]);
+
+    let coalesced = coalesce_partitions_exec(union);
+
+    // Required sorted / single partitioned output
+    let requirement = [PhysicalSortRequirement::new(
+        col("nullable_col", &schema)?,
+        Some(SortOptions::new(false, true)),
+    )]
+    .into();
+    let physical_plan = Arc::new(OutputRequirementExec::new(
+        coalesced,
+        Some(OrderingRequirements::new(requirement)),
+        Distribution::SinglePartition,
+        None,
+    ));
 
-    let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone());
+    let test =
+        EnforceSortingTest::new(physical_plan).with_repartition_sorts(repartition_sorts);
+    Ok(test.run())
+}
 
+#[tokio::test]
+async fn test_union_inputs_different_sorted3() -> Result<()> {
+    let schema = create_test_schema()?;
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [
+        sort_expr("nullable_col", &schema),
+        sort_expr("non_nullable_col", &schema),
+    ]
+    .into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+    let ordering2 = [sort_expr("nullable_col", &schema)].into();
+    let sort2 = sort_exec(ordering2, source1);
+    let parquet_ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let source2 = parquet_exec_with_sort(schema, vec![parquet_ordering.clone()]);
     let union = union_exec(vec![sort1, source2, sort2]);
-    let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union);
+    let physical_plan = sort_preserving_merge_exec(parquet_ordering, union);
 
     // First input to the union is not Sorted (SortExec is finer than required ordering by the SortPreservingMergeExec above).
     // Second input to the union is already Sorted (matches with the required ordering by the SortPreservingMergeExec above).
     // Third input to the union is not Sorted (SortExec is matches required ordering by the SortPreservingMergeExec above).
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // should adjust sorting in the first input of the union such that it is not unnecessarily fine
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
-
     Ok(())
 }
 
 #[tokio::test]
 async fn test_union_inputs_different_sorted4() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
-    let sort1 = sort_exec(sort_exprs2.clone(), source1.clone());
-    let sort2 = sort_exec(sort_exprs2.clone(), source1);
-
-    let source2 = parquet_exec_sorted(&schema, sort_exprs2);
-
+    ]
+    .into();
+    let ordering2: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let sort1 = sort_exec(ordering2.clone(), source1.clone());
+    let sort2 = sort_exec(ordering2.clone(), source1);
+    let source2 = parquet_exec_with_sort(schema, vec![ordering2]);
     let union = union_exec(vec![sort1, source2, sort2]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs1, union);
+    let physical_plan = sort_preserving_merge_exec(ordering1, union);
 
     // Ordering requirement of the `SortPreservingMergeExec` is not met.
     // Should modify the plan to ensure that all three inputs to the
     // `UnionExec` satisfy the ordering, OR add a single sort after
     // the `UnionExec` (both of which are equally good for this example).
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -460,13 +545,13 @@ async fn test_union_inputs_different_sorted4() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted5() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs2 = vec![
+    ]
+    .into();
+    let ordering2 = [
         sort_expr("nullable_col", &schema),
         sort_expr_options(
             "non_nullable_col",
@@ -476,30 +561,35 @@ async fn test_union_inputs_different_sorted5() -> Result<()> {
                 nulls_first: false,
             },
         ),
-    ];
-    let sort_exprs3 = vec![sort_expr("nullable_col", &schema)];
-    let sort1 = sort_exec(sort_exprs1, source1.clone());
-    let sort2 = sort_exec(sort_exprs2, source1);
-
+    ]
+    .into();
+    let ordering3 = [sort_expr("nullable_col", &schema)].into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+    let sort2 = sort_exec(ordering2, source1);
     let union = union_exec(vec![sort1, sort2]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs3, union);
+    let physical_plan = sort_preserving_merge_exec(ordering3, union);
 
     // The `UnionExec` doesn't preserve any of the inputs ordering in the
     // example below. However, we should be able to change the unnecessarily
     // fine `SortExec`s below with required `SortExec`s that are absolutely necessary.
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -507,22 +597,20 @@ async fn test_union_inputs_different_sorted5() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted6() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![sort_expr("nullable_col", &schema)];
-    let sort1 = sort_exec(sort_exprs1, source1.clone());
-    let sort_exprs2 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [sort_expr("nullable_col", &schema)].into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+    let ordering2 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
+    ]
+    .into();
     let repartition = repartition_exec(source1);
-    let spm = sort_preserving_merge_exec(sort_exprs2, repartition);
-
-    let parquet_sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let source2 = parquet_exec_sorted(&schema, parquet_sort_exprs.clone());
-
+    let spm = sort_preserving_merge_exec(ordering2, repartition);
+    let parquet_ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let source2 = parquet_exec_with_sort(schema, vec![parquet_ordering.clone()]);
     let union = union_exec(vec![sort1, source2, spm]);
-    let physical_plan = sort_preserving_merge_exec(parquet_sort_exprs, union);
+    let physical_plan = sort_preserving_merge_exec(parquet_ordering, union);
 
     // The plan is not valid as it is -- the input ordering requirement
     // of the `SortPreservingMergeExec` under the third child of the
@@ -530,25 +618,30 @@ async fn test_union_inputs_different_sorted6() -> Result<()> {
     // At the same time, this ordering requirement is unnecessarily fine.
     // The final plan should be valid AND the ordering of the third child
     // shouldn't be finer than necessary.
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // Should adjust the requirement in the third input of the union so
     // that it is not unnecessarily fine.
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
 
     Ok(())
 }
@@ -556,34 +649,30 @@ async fn test_union_inputs_different_sorted6() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted7() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs3 = vec![sort_expr("nullable_col", &schema)];
-    let sort1 = sort_exec(sort_exprs1.clone(), source1.clone());
-    let sort2 = sort_exec(sort_exprs1, source1);
-
+    ]
+    .into();
+    let sort1 = sort_exec(ordering1.clone(), source1.clone());
+    let sort2 = sort_exec(ordering1, source1);
     let union = union_exec(vec![sort1, sort2]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs3, union);
+    let ordering2 = [sort_expr("nullable_col", &schema)].into();
+    let physical_plan = sort_preserving_merge_exec(ordering2, union);
 
     // Union has unnecessarily fine ordering below it. We should be able to replace them with absolutely necessary ordering.
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    // Union preserves the inputs ordering and we should not change any of the SortExecs under UnionExec
-    let expected_output = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_output, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
+    // Union preserves the inputs ordering, and we should not change any of the SortExecs under UnionExec
 
     Ok(())
 }
@@ -591,13 +680,13 @@ async fn test_union_inputs_different_sorted7() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted8() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs2 = vec![
+    ]
+    .into();
+    let ordering2 = [
         sort_expr_options(
             "nullable_col",
             &schema,
@@ -614,75 +703,484 @@ async fn test_union_inputs_different_sorted8() -> Result<()> {
                 nulls_first: false,
             },
         ),
-    ];
-    let sort1 = sort_exec(sort_exprs1, source1.clone());
-    let sort2 = sort_exec(sort_exprs2, source1);
-
+    ]
+    .into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+    let sort2 = sort_exec(ordering2, source1);
     let physical_plan = union_exec(vec![sort1, sort2]);
 
     // The `UnionExec` doesn't preserve any of the inputs ordering in the
     // example below.
-    let expected_input = ["UnionExec",
-        "  SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "  SortExec: expr=[nullable_col@0 DESC NULLS LAST, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    UnionExec
+      SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+      SortExec: expr=[nullable_col@0 DESC NULLS LAST, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    UnionExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
     // Since `UnionExec` doesn't preserve ordering in the plan above.
     // We shouldn't keep SortExecs in the plan.
-    let expected_optimized = ["UnionExec",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
 
     Ok(())
 }
 
 #[tokio::test]
-async fn test_window_multi_path_sort() -> Result<()> {
+async fn test_soft_hard_requirements_remove_soft_requirement() -> Result<()> {
+    let schema = create_test_schema()?;
+    let source = parquet_exec(schema.clone());
+    let sort_exprs = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(sort_exprs, source);
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let physical_plan =
+        bounded_window_exec_with_partition("nullable_col", vec![], partition_bys, sort);
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns()
+-> Result<()> {
     let schema = create_test_schema()?;
+    let source = parquet_exec(schema.clone());
+    let ordering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source.clone());
+    let proj_exprs = vec![(
+        Arc::new(BinaryExpr::new(
+            col("nullable_col", &schema)?,
+            Operator::Plus,
+            col("non_nullable_col", &schema)?,
+        )) as _,
+        "count".to_string(),
+    )];
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let bounded_window =
+        bounded_window_exec_with_partition("nullable_col", vec![], partition_bys, sort);
+    let physical_plan = projection_exec(proj_exprs, bounded_window)?;
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count]",
+    //     "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+
+    let ordering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source);
+    let proj_exprs = vec![(
+        Arc::new(BinaryExpr::new(
+            col("nullable_col", &schema)?,
+            Operator::Plus,
+            col("non_nullable_col", &schema)?,
+        )) as _,
+        "nullable_col".to_string(),
+    )];
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let projection = projection_exec(proj_exprs, sort)?;
+    let physical_plan = bounded_window_exec_with_partition(
+        "nullable_col",
+        vec![],
+        partition_bys,
+        projection,
+    );
 
-    let sort_exprs1 = vec![
-        sort_expr("nullable_col", &schema),
-        sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
-    // reverse sorting of sort_exprs2
-    let sort_exprs3 = vec![sort_expr_options(
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+        SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+        ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+          SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "  ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]",
+    //     "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> {
+    let schema = create_test_schema()?;
+    let source = parquet_exec(schema.clone());
+    let ordering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source.clone());
+    let proj_exprs = vec![(
+        Arc::new(BinaryExpr::new(
+            col("nullable_col", &schema)?,
+            Operator::Plus,
+            col("non_nullable_col", &schema)?,
+        )) as _,
+        "nullable_col".to_string(),
+    )];
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let projection = projection_exec(proj_exprs, sort)?;
+    let bounded_window = bounded_window_exec_with_partition(
+        "nullable_col",
+        vec![],
+        partition_bys,
+        projection,
+    );
+    let physical_plan = bounded_window_exec_with_partition(
+        "count",
+        vec![],
+        partition_bys,
+        bounded_window,
+    );
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+          SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+          ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+            SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "    ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]",
+    //     "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+
+    let ordering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source);
+    let proj_exprs = vec![(
+        Arc::new(BinaryExpr::new(
+            col("nullable_col", &schema)?,
+            Operator::Plus,
+            col("non_nullable_col", &schema)?,
+        )) as _,
+        "nullable_col".to_string(),
+    )];
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let projection = projection_exec(proj_exprs, sort)?;
+    let bounded_window = bounded_window_exec_with_partition(
+        "nullable_col",
+        vec![],
+        partition_bys,
+        projection,
+    );
+
+    let ordering2: LexOrdering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort2 = sort_exec(ordering2.clone(), bounded_window);
+    let sort3 = sort_exec(ordering2, sort2);
+    let physical_plan =
+        bounded_window_exec_with_partition("count", vec![], partition_bys, sort3);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+        SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+          BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+              SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+                DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+          ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+            SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "    ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]",
+    //     "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_soft_hard_requirements_multiple_sorts() -> Result<()> {
+    let schema = create_test_schema()?;
+    let source = parquet_exec(schema.clone());
+    let ordering = [sort_expr_options(
         "nullable_col",
         &schema,
         SortOptions {
             descending: true,
             nulls_first: false,
         },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source);
+    let proj_exprs = vec![(
+        Arc::new(BinaryExpr::new(
+            col("nullable_col", &schema)?,
+            Operator::Plus,
+            col("non_nullable_col", &schema)?,
+        )) as _,
+        "nullable_col".to_string(),
     )];
-    let source1 = parquet_exec_sorted(&schema, sort_exprs1);
-    let source2 = parquet_exec_sorted(&schema, sort_exprs2);
-    let sort1 = sort_exec(sort_exprs3.clone(), source1);
-    let sort2 = sort_exec(sort_exprs3.clone(), source2);
+    let partition_bys = &[col("nullable_col", &schema)?];
+    let projection = projection_exec(proj_exprs, sort)?;
+    let bounded_window = bounded_window_exec_with_partition(
+        "nullable_col",
+        vec![],
+        partition_bys,
+        projection,
+    );
+    let ordering2: LexOrdering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort2 = sort_exec(ordering2.clone(), bounded_window);
+    let physical_plan = sort_exec(ordering2, sort2);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+      SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+        BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+            SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+          ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]
+            SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "  ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col]",
+    //     "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_requirement()
+-> Result<()> {
+    let schema = create_test_schema()?;
+    let source = parquet_exec(schema.clone());
+    let ordering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let sort = sort_exec(ordering, source);
+    let partition_bys1 = &[col("nullable_col", &schema)?];
+    let bounded_window =
+        bounded_window_exec_with_partition("nullable_col", vec![], partition_bys1, sort);
+    let partition_bys2 = &[col("non_nullable_col", &schema)?];
+    let bounded_window2 = bounded_window_exec_with_partition(
+        "non_nullable_col",
+        vec![],
+        partition_bys2,
+        bounded_window,
+    );
+    let requirement = [PhysicalSortRequirement::new(
+        col("non_nullable_col", &schema)?,
+        Some(SortOptions::new(false, true)),
+    )]
+    .into();
+    let physical_plan = Arc::new(OutputRequirementExec::new(
+        bounded_window2,
+        Some(OrderingRequirements::new(requirement)),
+        Distribution::SinglePartition,
+        None,
+    ));
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+          BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    "#);
+    // TODO When sort pushdown respects to the alternatives, and removes soft SortExecs this should be changed
+    // let expected_optimized = [
+    //     "OutputRequirementExec",
+    //     "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
+    //     "    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
+    //     "      BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]",
+    //     "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
+    // ];
+    Ok(())
+}
 
+#[tokio::test]
+async fn test_window_multi_path_sort() -> Result<()> {
+    let schema = create_test_schema()?;
+    let ordering1 = [
+        sort_expr("nullable_col", &schema),
+        sort_expr("non_nullable_col", &schema),
+    ]
+    .into();
+    let ordering2 = [sort_expr("nullable_col", &schema)].into();
+    // Reverse of the above
+    let ordering3: LexOrdering = [sort_expr_options(
+        "nullable_col",
+        &schema,
+        SortOptions {
+            descending: true,
+            nulls_first: false,
+        },
+    )]
+    .into();
+    let source1 = parquet_exec_with_sort(schema.clone(), vec![ordering1]);
+    let source2 = parquet_exec_with_sort(schema, vec![ordering2]);
+    let sort1 = sort_exec(ordering3.clone(), source1);
+    let sort2 = sort_exec(ordering3.clone(), source2);
     let union = union_exec(vec![sort1, sort2]);
-    let spm = sort_preserving_merge_exec(sort_exprs3.clone(), union);
-    let physical_plan = bounded_window_exec("nullable_col", sort_exprs3, spm);
+    let spm = sort_preserving_merge_exec(ordering3.clone(), union);
+    let physical_plan = bounded_window_exec("nullable_col", ordering3, spm);
 
     // The `WindowAggExec` gets its sorting from multiple children jointly.
     // During the removal of `SortExec`s, it should be able to remove the
     // corresponding SortExecs together. Also, the inputs of these `SortExec`s
     // are not necessarily the same to be able to remove them.
-    let expected_input = [
-        "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST]",
-        "    UnionExec",
-        "      SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet",
-        "      SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet"];
-    let expected_optimized = [
-        "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-        "  SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST]
+        UnionExec
+          SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet
+          SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      SortPreservingMergeExec: [nullable_col@0 ASC]
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+    "#);
 
     Ok(())
 }
@@ -690,36 +1188,40 @@ async fn test_window_multi_path_sort() -> Result<()> {
 #[tokio::test]
 async fn test_window_multi_path_sort2() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let sort_exprs1 = LexOrdering::new(vec![
+    let ordering1: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ]);
-    let sort_exprs2 = vec![sort_expr("nullable_col", &schema)];
-    let source1 = parquet_exec_sorted(&schema, sort_exprs2.clone());
-    let source2 = parquet_exec_sorted(&schema, sort_exprs2.clone());
-    let sort1 = sort_exec(sort_exprs1.clone(), source1);
-    let sort2 = sort_exec(sort_exprs1.clone(), source2);
-
+    ]
+    .into();
+    let ordering2: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let source1 = parquet_exec_with_sort(schema.clone(), vec![ordering2.clone()]);
+    let source2 = parquet_exec_with_sort(schema, vec![ordering2.clone()]);
+    let sort1 = sort_exec(ordering1.clone(), source1);
+    let sort2 = sort_exec(ordering1.clone(), source2);
     let union = union_exec(vec![sort1, sort2]);
-    let spm = Arc::new(SortPreservingMergeExec::new(sort_exprs1, union)) as _;
-    let physical_plan = bounded_window_exec("nullable_col", sort_exprs2, spm);
+    let spm = Arc::new(SortPreservingMergeExec::new(ordering1, union)) as _;
+    let physical_plan = bounded_window_exec("nullable_col", ordering2, spm);
 
     // The `WindowAggExec` can get its required sorting from the leaf nodes directly.
     // The unnecessary SortExecs should be removed
-    let expected_input = ["BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "    UnionExec",
-        "      SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "      SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet"];
-    let expected_optimized = ["BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "    UnionExec",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+        UnionExec
+          SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+          SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortPreservingMergeExec: [nullable_col@0 ASC]
+        UnionExec
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet
+    "#);
 
     Ok(())
 }
@@ -727,13 +1229,13 @@ async fn test_window_multi_path_sort2() -> Result<()> {
 #[tokio::test]
 async fn test_union_inputs_different_sorted_with_limit() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
-    let sort_exprs1 = vec![
+    let source1 = parquet_exec(schema.clone());
+    let ordering1 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort_exprs2 = vec![
+    ]
+    .into();
+    let ordering2 = [
         sort_expr("nullable_col", &schema),
         sort_expr_options(
             "non_nullable_col",
@@ -743,35 +1245,39 @@ async fn test_union_inputs_different_sorted_with_limit() -> Result<()> {
                 nulls_first: false,
             },
         ),
-    ];
-    let sort_exprs3 = vec![sort_expr("nullable_col", &schema)];
-    let sort1 = sort_exec(sort_exprs1, source1.clone());
-
-    let sort2 = sort_exec(sort_exprs2, source1);
-    let limit = local_limit_exec(sort2);
-    let limit = global_limit_exec(limit);
-
+    ]
+    .into();
+    let sort1 = sort_exec(ordering1, source1.clone());
+    let sort2 = sort_exec(ordering2, source1);
+    let limit = local_limit_exec(sort2, 100);
+    let limit = global_limit_exec(limit, 0, Some(100));
     let union = union_exec(vec![sort1, limit]);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs3, union);
+    let ordering3 = [sort_expr("nullable_col", &schema)].into();
+    let physical_plan = sort_preserving_merge_exec(ordering3, union);
 
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
     // Should not change the unnecessarily fine `SortExec`s because there is `LimitExec`
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    GlobalLimitExec: skip=0, fetch=100",
-        "      LocalLimitExec: fetch=100",
-        "        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  UnionExec",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    GlobalLimitExec: skip=0, fetch=100",
-        "      LocalLimitExec: fetch=100",
-        "        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        GlobalLimitExec: skip=0, fetch=100
+          LocalLimitExec: fetch=100
+            SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      UnionExec
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        GlobalLimitExec: skip=0, fetch=100
+          LocalLimitExec: fetch=100
+            SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
 
     Ok(())
 }
@@ -781,15 +1287,17 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> {
     let left_schema = create_test_schema()?;
     let right_schema = create_test_schema2()?;
 
-    let left = parquet_exec(&left_schema);
-    let right = parquet_exec(&right_schema);
+    let left = parquet_exec(left_schema);
+    let right = parquet_exec(right_schema);
 
     // Join on (nullable_col == col_a)
     let join_on = vec![(
-        Arc::new(Column::new_with_schema("nullable_col", &left.schema()).unwrap()) as _,
-        Arc::new(Column::new_with_schema("col_a", &right.schema()).unwrap()) as _,
+        Arc::new(Column::new_with_schema("nullable_col", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("col_a", &right.schema())?) as _,
     )];
 
+    let settings = Settings::clone_current();
+
     let join_types = vec![
         JoinType::Inner,
         JoinType::Left,
@@ -801,49 +1309,69 @@ async fn test_sort_merge_join_order_by_left() -> Result<()> {
     for join_type in join_types {
         let join =
             sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type);
-        let sort_exprs = vec![
+        let ordering = [
             sort_expr("nullable_col", &join.schema()),
             sort_expr("non_nullable_col", &join.schema()),
-        ];
-        let physical_plan = sort_preserving_merge_exec(sort_exprs.clone(), join);
+        ]
+        .into();
+        let physical_plan = sort_preserving_merge_exec(ordering, join);
 
-        let join_plan = format!(
-            "SortMergeJoin: join_type={join_type}, on=[(nullable_col@0, col_a@0)]"
-        );
-        let join_plan2 = format!(
-            "  SortMergeJoin: join_type={join_type}, on=[(nullable_col@0, col_a@0)]"
+        let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+        let mut settings = settings.clone();
+
+        settings.add_filter(
+            // join_type={} replace with join_type=... to avoid snapshot name issue
+            format!("join_type={join_type}").as_str(),
+            "join_type=...",
         );
-        let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-            join_plan2.as_str(),
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-        let expected_optimized = match join_type {
+
+        insta::allow_duplicates! {
+            settings.bind( || {
+
+
+        match join_type {
             JoinType::Inner
             | JoinType::Left
             | JoinType::LeftSemi
             | JoinType::LeftAnti => {
                 // can push down the sort requirements and save 1 SortExec
-                vec![
-                    join_plan.as_str(),
-                    "  SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-                    "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-                    "  SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]",
-                    "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet",
-                ]
+                assert_snapshot!(test.run(), @r"
+                Input Plan:
+                SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+                Optimized Plan:
+                SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                  SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                  SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+                ");
             }
             _ => {
                 // can not push down the sort requirements
-                vec![
-                    "SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-                    join_plan2.as_str(),
-                    "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-                    "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-                    "    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]",
-                    "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet",
-                ]
+                assert_snapshot!(test.run(), @r"
+                Input Plan:
+                SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+                Optimized Plan:
+                SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]
+                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+                ");
             }
         };
-        assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+        })
+        }
     }
     Ok(())
 }
@@ -853,15 +1381,17 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> {
     let left_schema = create_test_schema()?;
     let right_schema = create_test_schema2()?;
 
-    let left = parquet_exec(&left_schema);
-    let right = parquet_exec(&right_schema);
+    let left = parquet_exec(left_schema);
+    let right = parquet_exec(right_schema);
 
     // Join on (nullable_col == col_a)
     let join_on = vec![(
-        Arc::new(Column::new_with_schema("nullable_col", &left.schema()).unwrap()) as _,
-        Arc::new(Column::new_with_schema("col_a", &right.schema()).unwrap()) as _,
+        Arc::new(Column::new_with_schema("nullable_col", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("col_a", &right.schema())?) as _,
     )];
 
+    let settings = Settings::clone_current();
+
     let join_types = vec![
         JoinType::Inner,
         JoinType::Left,
@@ -872,50 +1402,83 @@ async fn test_sort_merge_join_order_by_right() -> Result<()> {
     for join_type in join_types {
         let join =
             sort_merge_join_exec(left.clone(), right.clone(), &join_on, &join_type);
-        let sort_exprs = vec![
+        let ordering = [
             sort_expr("col_a", &join.schema()),
             sort_expr("col_b", &join.schema()),
-        ];
-        let physical_plan = sort_preserving_merge_exec(sort_exprs, join);
+        ]
+        .into();
+        let physical_plan = sort_preserving_merge_exec(ordering, join);
 
-        let join_plan = format!(
-            "SortMergeJoin: join_type={join_type}, on=[(nullable_col@0, col_a@0)]"
-        );
-        let spm_plan = match join_type {
-            JoinType::RightAnti => "SortPreservingMergeExec: [col_a@0 ASC, col_b@1 ASC]",
-            _ => "SortPreservingMergeExec: [col_a@2 ASC, col_b@3 ASC]",
-        };
-        let join_plan2 = format!(
-            "  SortMergeJoin: join_type={join_type}, on=[(nullable_col@0, col_a@0)]"
+        let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+        let mut settings = settings.clone();
+
+        settings.add_filter(
+            // join_type={} replace with join_type=... to avoid snapshot name issue
+            format!("join_type={join_type}").as_str(),
+            "join_type=...",
         );
-        let expected_input = [spm_plan,
-            join_plan2.as_str(),
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-        let expected_optimized = match join_type {
-            JoinType::Inner | JoinType::Right | JoinType::RightAnti => {
+
+        insta::allow_duplicates! {
+            settings.bind( || {
+
+
+        match join_type {
+            JoinType::Inner | JoinType::Right => {
+                // can push down the sort requirements and save 1 SortExec
+                assert_snapshot!(test.run(), @r"
+                Input Plan:
+                SortPreservingMergeExec: [col_a@2 ASC, col_b@3 ASC]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+                Optimized Plan:
+                SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                  SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+                ");
+            }
+            JoinType::RightAnti => {
                 // can push down the sort requirements and save 1 SortExec
-                vec![
-                    join_plan.as_str(),
-                    "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-                    "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-                    "  SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false]",
-                    "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet",
-                ]
+                assert_snapshot!(test.run(), @r"
+                Input Plan:
+                SortPreservingMergeExec: [col_a@0 ASC, col_b@1 ASC]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+                Optimized Plan:
+                SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                  SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+                ");
             }
             _ => {
                 // can not push down the sort requirements for Left and Full join.
-                vec![
-                    "SortExec: expr=[col_a@2 ASC, col_b@3 ASC], preserve_partitioning=[false]",
-                    join_plan2.as_str(),
-                    "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-                    "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-                    "    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]",
-                    "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet",
-                ]
+                assert_snapshot!(test.run(), @r"
+                Input Plan:
+                SortPreservingMergeExec: [col_a@2 ASC, col_b@3 ASC]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+                Optimized Plan:
+                SortExec: expr=[col_a@2 ASC, col_b@3 ASC], preserve_partitioning=[false]
+                  SortMergeJoinExec: join_type=..., on=[(nullable_col@0, col_a@0)]
+                    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+                    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]
+                      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+                ");
             }
         };
-        assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+        })
+        }
     }
     Ok(())
 }
@@ -925,59 +1488,69 @@ async fn test_sort_merge_join_complex_order_by() -> Result<()> {
     let left_schema = create_test_schema()?;
     let right_schema = create_test_schema2()?;
 
-    let left = parquet_exec(&left_schema);
-    let right = parquet_exec(&right_schema);
+    let left = parquet_exec(left_schema);
+    let right = parquet_exec(right_schema);
 
     // Join on (nullable_col == col_a)
     let join_on = vec![(
-        Arc::new(Column::new_with_schema("nullable_col", &left.schema()).unwrap()) as _,
-        Arc::new(Column::new_with_schema("col_a", &right.schema()).unwrap()) as _,
+        Arc::new(Column::new_with_schema("nullable_col", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("col_a", &right.schema())?) as _,
     )];
 
     let join = sort_merge_join_exec(left, right, &join_on, &JoinType::Inner);
 
     // order by (col_b, col_a)
-    let sort_exprs1 = vec![
+    let ordering = [
         sort_expr("col_b", &join.schema()),
         sort_expr("col_a", &join.schema()),
-    ];
-    let physical_plan = sort_preserving_merge_exec(sort_exprs1, join.clone());
-
-    let expected_input = ["SortPreservingMergeExec: [col_b@3 ASC, col_a@2 ASC]",
-        "  SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-
+    ]
+    .into();
+    let physical_plan = sort_preserving_merge_exec(ordering, join.clone());
+
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [col_b@3 ASC, col_a@2 ASC]
+      SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[col_b@3 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+      SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)]
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+    ");
     // can not push down the sort requirements, need to add SortExec
-    let expected_optimized = ["SortExec: expr=[col_b@3 ASC, col_a@2 ASC], preserve_partitioning=[false]",
-        "  SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)]",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
 
     // order by (nullable_col, col_b, col_a)
-    let sort_exprs2 = vec![
+    let ordering2 = [
         sort_expr("nullable_col", &join.schema()),
         sort_expr("col_b", &join.schema()),
         sort_expr("col_a", &join.schema()),
-    ];
-    let physical_plan = sort_preserving_merge_exec(sort_exprs2, join);
-
-    let expected_input = ["SortPreservingMergeExec: [nullable_col@0 ASC, col_b@3 ASC, col_a@2 ASC]",
-        "  SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)]",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-
-    // can not push down the sort requirements, need to add SortExec
-    let expected_optimized = ["SortExec: expr=[nullable_col@0 ASC, col_b@3 ASC, col_a@2 ASC], preserve_partitioning=[false]",
-        "  SortMergeJoin: join_type=Inner, on=[(nullable_col@0, col_a@0)]",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet",
-        "    SortExec: expr=[col_a@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    ]
+    .into();
+    let physical_plan = sort_preserving_merge_exec(ordering2, join);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, col_b@3 ASC, col_a@2 ASC]
+      SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+
+    Optimized Plan:
+    SortMergeJoinExec: join_type=Inner, on=[(nullable_col@0, col_a@0)]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+      SortExec: expr=[col_a@0 ASC, col_b@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[col_a, col_b], file_type=parquet
+    ");
+    // Can push down the sort requirements since col_a = nullable_col
 
     Ok(())
 }
@@ -985,152 +1558,136 @@ async fn test_sort_merge_join_complex_order_by() -> Result<()> {
 #[tokio::test]
 async fn test_multilayer_coalesce_partitions() -> Result<()> {
     let schema = create_test_schema()?;
-
-    let source1 = parquet_exec(&schema);
+    let source1 = parquet_exec(schema.clone());
     let repartition = repartition_exec(source1);
-    let coalesce = Arc::new(CoalescePartitionsExec::new(repartition)) as _;
+    let coalesce = coalesce_partitions_exec(repartition) as _;
     // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before
     let filter = filter_exec(
-        Arc::new(NotExpr::new(
-            col("non_nullable_col", schema.as_ref()).unwrap(),
-        )),
+        Arc::new(NotExpr::new(col("non_nullable_col", schema.as_ref())?)),
         coalesce,
     );
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let physical_plan = sort_exec(sort_exprs, filter);
+    let ordering = [sort_expr("nullable_col", &schema)].into();
+    let physical_plan = sort_exec(ordering, filter);
 
     // CoalescePartitionsExec and SortExec are not directly consecutive. In this case
     // we should be able to parallelize Sorting also (given that executors in between don't require)
     // single partition.
-    let expected_input = ["SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  FilterExec: NOT non_nullable_col@1",
-        "    CoalescePartitionsExec",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]",
-        "    FilterExec: NOT non_nullable_col@1",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      FilterExec: NOT non_nullable_col@1
+        CoalescePartitionsExec
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]
+        FilterExec: NOT non_nullable_col@1
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet
+    ");
 
     Ok(())
 }
 
-#[tokio::test]
-async fn test_with_lost_ordering_bounded() -> Result<()> {
+fn create_lost_ordering_plan(source_unbounded: bool) -> Result<Arc<dyn ExecutionPlan>> {
     let schema = create_test_schema3()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = csv_exec_sorted(&schema, sort_exprs);
-    let repartition_rr = repartition_exec(source);
-    let repartition_hash = Arc::new(RepartitionExec::try_new(
-        repartition_rr,
-        Partitioning::Hash(vec![col("c", &schema).unwrap()], 10),
-    )?) as _;
-    let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-    let physical_plan = sort_exec(vec![sort_expr("a", &schema)], coalesce_partitions);
-
-    let expected_input = ["SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false"];
-    let expected_optimized = ["SortPreservingMergeExec: [a@0 ASC]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
-
-    Ok(())
-}
-
-#[rstest]
-#[tokio::test]
-async fn test_with_lost_ordering_unbounded_bounded(
-    #[values(false, true)] source_unbounded: bool,
-) -> Result<()> {
-    let schema = create_test_schema3()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
+    let sort_exprs = [sort_expr("a", &schema)];
     // create either bounded or unbounded source
     let source = if source_unbounded {
-        stream_exec_ordered(&schema, sort_exprs)
+        stream_exec_ordered(&schema, sort_exprs.clone().into())
     } else {
-        csv_exec_ordered(&schema, sort_exprs)
+        csv_exec_sorted(&schema, sort_exprs.clone())
     };
     let repartition_rr = repartition_exec(source);
     let repartition_hash = Arc::new(RepartitionExec::try_new(
         repartition_rr,
-        Partitioning::Hash(vec![col("c", &schema).unwrap()], 10),
+        Partitioning::Hash(vec![col("c", &schema)?], 10),
     )?) as _;
     let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-    let physical_plan = sort_exec(vec![sort_expr("a", &schema)], coalesce_partitions);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = vec![
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
-    ];
-    let expected_input_bounded = vec![
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=true",
-    ];
+    let physical_plan = sort_exec(sort_exprs.into(), coalesce_partitions);
+    Ok(physical_plan)
+}
 
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = vec![
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
-    ];
+#[tokio::test]
+async fn test_with_lost_ordering_unbounded() -> Result<()> {
+    let physical_plan = create_lost_ordering_plan(true)?;
+
+    let test_no_repartition_sorts =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(false);
+
+    assert_snapshot!(test_no_repartition_sorts.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+
+    Optimized Plan:
+    SortPreservingMergeExec: [a@0 ASC]
+      RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+    ");
+
+    let test_with_repartition_sorts =
+        EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test_with_repartition_sorts.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+
+    Optimized Plan:
+    SortPreservingMergeExec: [a@0 ASC]
+      RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+    ");
 
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = vec![
-        "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "  CoalescePartitionsExec",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=true",
-    ];
-    let expected_optimized_bounded_parallelize_sort = vec![
-        "SortPreservingMergeExec: [a@0 ASC]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-        "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        DataSourceExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=true",
-    ];
-    let (expected_input, expected_optimized, expected_optimized_sort_parallelize) =
-        if source_unbounded {
-            (
-                expected_input_unbounded,
-                expected_optimized_unbounded.clone(),
-                expected_optimized_unbounded,
-            )
-        } else {
-            (
-                expected_input_bounded,
-                expected_optimized_bounded,
-                expected_optimized_bounded_parallelize_sort,
-            )
-        };
-    assert_optimized!(
-        expected_input,
-        expected_optimized,
-        physical_plan.clone(),
-        false
-    );
-    assert_optimized!(
-        expected_input,
-        expected_optimized_sort_parallelize,
-        physical_plan,
-        true
-    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_with_lost_ordering_bounded() -> Result<()> {
+    let physical_plan = create_lost_ordering_plan(false)?;
+
+    let test_no_repartition_sorts =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(false);
+
+    assert_snapshot!(test_no_repartition_sorts.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false
+    ");
+
+    let test_with_repartition_sorts =
+        EnforceSortingTest::new(physical_plan).with_repartition_sorts(true);
+
+    assert_snapshot!(test_with_repartition_sorts.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+      CoalescePartitionsExec
+        RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false
+
+    Optimized Plan:
+    SortPreservingMergeExec: [a@0 ASC]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+        RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=csv, has_header=false
+    ");
 
     Ok(())
 }
@@ -1138,21 +1695,21 @@ async fn test_with_lost_ordering_unbounded_bounded(
 #[tokio::test]
 async fn test_do_not_pushdown_through_spm() -> Result<()> {
     let schema = create_test_schema3()?;
-    let sort_exprs = vec![sort_expr("a", &schema), sort_expr("b", &schema)];
+    let sort_exprs = [sort_expr("a", &schema), sort_expr("b", &schema)];
     let source = csv_exec_sorted(&schema, sort_exprs.clone());
     let repartition_rr = repartition_exec(source);
-    let spm = sort_preserving_merge_exec(sort_exprs, repartition_rr);
-    let physical_plan = sort_exec(vec![sort_expr("b", &schema)], spm);
-
-    let expected_input = ["SortExec: expr=[b@1 ASC], preserve_partitioning=[false]",
-        "  SortPreservingMergeExec: [a@0 ASC, b@1 ASC]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false",];
-    let expected_optimized = ["SortExec: expr=[b@1 ASC], preserve_partitioning=[false]",
-        "  SortPreservingMergeExec: [a@0 ASC, b@1 ASC]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false",];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, false);
+    let spm = sort_preserving_merge_exec(sort_exprs.into(), repartition_rr);
+    let physical_plan = sort_exec([sort_expr("b", &schema)].into(), spm);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+      SortPreservingMergeExec: [a@0 ASC, b@1 ASC]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false
+    ");
 
     Ok(())
 }
@@ -1160,192 +1717,115 @@ async fn test_do_not_pushdown_through_spm() -> Result<()> {
 #[tokio::test]
 async fn test_pushdown_through_spm() -> Result<()> {
     let schema = create_test_schema3()?;
-    let sort_exprs = vec![sort_expr("a", &schema), sort_expr("b", &schema)];
+    let sort_exprs = [sort_expr("a", &schema), sort_expr("b", &schema)];
     let source = csv_exec_sorted(&schema, sort_exprs.clone());
     let repartition_rr = repartition_exec(source);
-    let spm = sort_preserving_merge_exec(sort_exprs, repartition_rr);
+    let spm = sort_preserving_merge_exec(sort_exprs.into(), repartition_rr);
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("a", &schema),
             sort_expr("b", &schema),
             sort_expr("c", &schema),
-        ],
+        ]
+        .into(),
         spm,
     );
-
-    let expected_input = ["SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  SortPreservingMergeExec: [a@0 ASC, b@1 ASC]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false",];
-    let expected_optimized = ["SortPreservingMergeExec: [a@0 ASC, b@1 ASC]",
-        "  SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[true]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false",];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, false);
-
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      SortPreservingMergeExec: [a@0 ASC, b@1 ASC]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false
+
+    Optimized Plan:
+    SortPreservingMergeExec: [a@0 ASC, b@1 ASC]
+      SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[true]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=csv, has_header=false
+    ");
     Ok(())
 }
 
 #[tokio::test]
 async fn test_window_multi_layer_requirement() -> Result<()> {
     let schema = create_test_schema3()?;
-    let sort_exprs = vec![sort_expr("a", &schema), sort_expr("b", &schema)];
+    let sort_exprs = [sort_expr("a", &schema), sort_expr("b", &schema)];
     let source = csv_exec_sorted(&schema, vec![]);
-    let sort = sort_exec(sort_exprs.clone(), source);
+    let sort = sort_exec(sort_exprs.clone().into(), source);
     let repartition = repartition_exec(sort);
     let repartition = spr_repartition_exec(repartition);
-    let spm = sort_preserving_merge_exec(sort_exprs.clone(), repartition);
-
+    let spm = sort_preserving_merge_exec(sort_exprs.clone().into(), repartition);
     let physical_plan = bounded_window_exec("a", sort_exprs, spm);
 
-    let expected_input = [
-        "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortPreservingMergeExec: [a@0 ASC, b@1 ASC]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    let expected_optimized = [
-        "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]",
-        "    CoalescePartitionsExec",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, false);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortPreservingMergeExec: [a@0 ASC, b@1 ASC]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortPreservingMergeExec: [a@0 ASC, b@1 ASC]
+        SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "#);
 
     Ok(())
 }
 #[tokio::test]
 async fn test_not_replaced_with_partial_sort_for_bounded_input() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("b", &schema), sort_expr("c", &schema)];
-    let parquet_input = parquet_exec_sorted(&schema, input_sort_exprs);
-
+    let parquet_ordering = [sort_expr("b", &schema), sort_expr("c", &schema)].into();
+    let parquet_input = parquet_exec_with_sort(schema.clone(), vec![parquet_ordering]);
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("a", &schema),
             sort_expr("b", &schema),
             sort_expr("c", &schema),
-        ],
+        ]
+        .into(),
         parquet_input,
     );
-    let expected_input = [
-        "SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[b@1 ASC, c@2 ASC], file_type=parquet"
-    ];
-    let expected_no_change = expected_input;
-    assert_optimized!(expected_input, expected_no_change, physical_plan, false);
-    Ok(())
-}
-
-/// Runs the sort enforcement optimizer and asserts the plan
-/// against the original and expected plans
-///
-/// `$EXPECTED_PLAN_LINES`: input plan
-/// `$EXPECTED_OPTIMIZED_PLAN_LINES`: optimized plan
-/// `$PLAN`: the plan to optimized
-/// `REPARTITION_SORTS`: Flag to set `config.options.optimizer.repartition_sorts` option.
-/// `$CASE_NUMBER` (optional): The test case number to print on failure.
-macro_rules! assert_optimized {
-    ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $REPARTITION_SORTS: expr $(, $CASE_NUMBER: expr)?) => {
-        let mut config = ConfigOptions::new();
-        config.optimizer.repartition_sorts = $REPARTITION_SORTS;
-
-        // This file has 4 rules that use tree node, apply these rules as in the
-        // EnforceSorting::optimize implementation
-        // After these operations tree nodes should be in a consistent state.
-        // This code block makes sure that these rules doesn't violate tree node integrity.
-        {
-            let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone());
-            let adjusted = plan_requirements
-                .transform_up(ensure_sorting)
-                .data()
-                .and_then(check_integrity)?;
-            // TODO: End state payloads will be checked here.
-
-            let new_plan = if config.optimizer.repartition_sorts {
-                let plan_with_coalesce_partitions =
-                    PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan);
-                let parallel = plan_with_coalesce_partitions
-                    .transform_up(parallelize_sorts)
-                    .data()
-                    .and_then(check_integrity)?;
-                // TODO: End state payloads will be checked here.
-                parallel.plan
-            } else {
-                adjusted.plan
-            };
-
-            let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan);
-            let updated_plan = plan_with_pipeline_fixer
-                .transform_up(|plan_with_pipeline_fixer| {
-                    replace_with_order_preserving_variants(
-                        plan_with_pipeline_fixer,
-                        false,
-                        true,
-                        &config,
-                    )
-                })
-                .data()
-                .and_then(check_integrity)?;
-            // TODO: End state payloads will be checked here.
-
-            let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan);
-            assign_initial_requirements(&mut sort_pushdown);
-            check_integrity(pushdown_sorts(sort_pushdown)?)?;
-            // TODO: End state payloads will be checked here.
-        }
-
-        let physical_plan = $PLAN;
-        let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-        let actual: Vec<&str> = formatted.trim().lines().collect();
-
-        let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES
-            .iter().map(|s| *s).collect();
-
-        if expected_plan_lines != actual {
-            $(println!("\n**Original Plan Mismatch in case {}**", $CASE_NUMBER);)?
-            println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_plan_lines, actual);
-            assert_eq!(expected_plan_lines, actual);
-        }
-
-        let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES
-            .iter().map(|s| *s).collect();
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(false);
 
-        // Run the actual optimizer
-        let optimized_physical_plan =
-            EnforceSorting::new().optimize(physical_plan, &config)?;
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[b@1 ASC, c@2 ASC], file_type=parquet
+    ");
 
-        // Get string representation of the plan
-        let actual = get_plan_string(&optimized_physical_plan);
-        if expected_optimized_lines != actual {
-            $(println!("\n**Optimized Plan Mismatch in case {}**", $CASE_NUMBER);)?
-            println!("\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_optimized_lines, actual);
-            assert_eq!(expected_optimized_lines, actual);
-        }
-    };
+    Ok(())
 }
 
 #[tokio::test]
 async fn test_remove_unnecessary_sort() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source);
-    let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], input);
-
-    let expected_input = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let input = sort_exec([sort_expr("non_nullable_col", &schema)].into(), source);
+    let physical_plan = sort_exec([sort_expr("nullable_col", &schema)].into(), input);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1354,58 +1834,52 @@ async fn test_remove_unnecessary_sort() -> Result<()> {
 async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-
-    let sort_exprs = vec![sort_expr_options(
+    let ordering: LexOrdering = [sort_expr_options(
         "non_nullable_col",
         &source.schema(),
         SortOptions {
             descending: true,
             nulls_first: true,
         },
-    )];
-    let sort = sort_exec(sort_exprs.clone(), source);
-    // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before
-    let coalesce_batches = coalesce_batches_exec(sort);
-
-    let window_agg =
-        bounded_window_exec("non_nullable_col", sort_exprs, coalesce_batches);
-
-    let sort_exprs = vec![sort_expr_options(
+    )]
+    .into();
+    let sort = sort_exec(ordering.clone(), source);
+    let window_agg = bounded_window_exec("non_nullable_col", ordering, sort);
+    let ordering2: LexOrdering = [sort_expr_options(
         "non_nullable_col",
         &window_agg.schema(),
         SortOptions {
             descending: false,
             nulls_first: false,
         },
-    )];
-
-    let sort = sort_exec(sort_exprs.clone(), window_agg);
-
+    )]
+    .into();
+    let sort = sort_exec(ordering2.clone(), window_agg);
     // Add dummy layer propagating Sort above, to test whether sort can be removed from multi layer before
     let filter = filter_exec(
-        Arc::new(NotExpr::new(
-            col("non_nullable_col", schema.as_ref()).unwrap(),
-        )),
+        Arc::new(NotExpr::new(col("non_nullable_col", schema.as_ref())?)),
         sort,
     );
-
-    let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs, filter);
-
-    let expected_input = ["BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  FilterExec: NOT non_nullable_col@1",
-        "    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-        "      BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "        CoalesceBatchesExec: target_batch_size=128",
-        "          SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false]",
-        "            DataSourceExec: partitions=1, partition_sizes=[0]"];
-
-    let expected_optimized = ["WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-        "  FilterExec: NOT non_nullable_col@1",
-        "    BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "      CoalesceBatchesExec: target_batch_size=128",
-        "        SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false]",
-        "          DataSourceExec: partitions=1, partition_sizes=[0]"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let physical_plan = bounded_window_exec("non_nullable_col", ordering2, filter);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      FilterExec: NOT non_nullable_col@1
+        SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+          BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false]
+              DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      FilterExec: NOT non_nullable_col@1
+        BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+    "#);
 
     Ok(())
 }
@@ -1414,20 +1888,20 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> {
 async fn test_add_required_sort() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
+    let ordering = [sort_expr("nullable_col", &schema)].into();
+    let physical_plan = sort_preserving_merge_exec(ordering, source);
 
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, source);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      DataSourceExec: partitions=1, partition_sizes=[0]
 
-    let expected_input = [
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1436,25 +1910,26 @@ async fn test_add_required_sort() -> Result<()> {
 async fn test_remove_unnecessary_sort1() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), source);
-    let spm = sort_preserving_merge_exec(sort_exprs, sort);
-
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), spm);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, sort);
-    let expected_input = [
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "        DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
+    let sort = sort_exec(ordering.clone(), source);
+    let spm = sort_preserving_merge_exec(ordering.clone(), sort);
+    let sort = sort_exec(ordering.clone(), spm);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+        SortPreservingMergeExec: [nullable_col@0 ASC]
+          SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1463,38 +1938,38 @@ async fn test_remove_unnecessary_sort1() -> Result<()> {
 async fn test_remove_unnecessary_sort2() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let sort_exprs = vec![sort_expr("non_nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), source);
-    let spm = sort_preserving_merge_exec(sort_exprs, sort);
-
-    let sort_exprs = vec![
+    let ordering: LexOrdering = [sort_expr("non_nullable_col", &schema)].into();
+    let sort = sort_exec(ordering.clone(), source);
+    let spm = sort_preserving_merge_exec(ordering, sort);
+    let ordering2: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let sort2 = sort_exec(sort_exprs.clone(), spm);
-    let spm2 = sort_preserving_merge_exec(sort_exprs, sort2);
-
-    let sort_exprs = vec![sort_expr("nullable_col", &schema)];
-    let sort3 = sort_exec(sort_exprs, spm2);
+    ]
+    .into();
+    let sort2 = sort_exec(ordering2.clone(), spm);
+    let spm2 = sort_preserving_merge_exec(ordering2, sort2);
+    let ordering3 = [sort_expr("nullable_col", &schema)].into();
+    let sort3 = sort_exec(ordering3, spm2);
     let physical_plan = repartition_exec(repartition_exec(sort3));
 
-    let expected_input = [
-        "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "      SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "          SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "            SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "              DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-
-    let expected_optimized = [
-        "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+          SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+            SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+              SortPreservingMergeExec: [non_nullable_col@1 ASC]
+                SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+                  DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1503,43 +1978,43 @@ async fn test_remove_unnecessary_sort2() -> Result<()> {
 async fn test_remove_unnecessary_sort3() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let sort_exprs = vec![sort_expr("non_nullable_col", &schema)];
-    let sort = sort_exec(sort_exprs.clone(), source);
-    let spm = sort_preserving_merge_exec(sort_exprs, sort);
-
-    let sort_exprs = LexOrdering::new(vec![
+    let ordering: LexOrdering = [sort_expr("non_nullable_col", &schema)].into();
+    let sort = sort_exec(ordering.clone(), source);
+    let spm = sort_preserving_merge_exec(ordering, sort);
+    let repartition_exec = repartition_exec(spm);
+    let ordering2: LexOrdering = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ]);
-    let repartition_exec = repartition_exec(spm);
+    ]
+    .into();
     let sort2 = Arc::new(
-        SortExec::new(sort_exprs.clone(), repartition_exec)
+        SortExec::new(ordering2.clone(), repartition_exec)
             .with_preserve_partitioning(true),
     ) as _;
-    let spm2 = sort_preserving_merge_exec(sort_exprs, sort2);
-
+    let spm2 = sort_preserving_merge_exec(ordering2, sort2);
     let physical_plan = aggregate_exec(spm2);
 
     // When removing a `SortPreservingMergeExec`, make sure that partitioning
     // requirements are not violated. In some cases, we may need to replace
     // it with a `CoalescePartitionsExec` instead of directly removing it.
-    let expected_input = [
-        "AggregateExec: mode=Final, gby=[], aggr=[]",
-        "  SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "          SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "            DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-
-    let expected_optimized = [
-        "AggregateExec: mode=Final, gby=[], aggr=[]",
-        "  CoalescePartitionsExec",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    AggregateExec: mode=Final, gby=[], aggr=[]
+      SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+            SortPreservingMergeExec: [non_nullable_col@1 ASC]
+              SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+                DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    AggregateExec: mode=Final, gby=[], aggr=[]
+      CoalescePartitionsExec
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1548,52 +2023,51 @@ async fn test_remove_unnecessary_sort3() -> Result<()> {
 async fn test_remove_unnecessary_sort4() -> Result<()> {
     let schema = create_test_schema()?;
     let source1 = repartition_exec(memory_exec(&schema));
-
     let source2 = repartition_exec(memory_exec(&schema));
     let union = union_exec(vec![source1, source2]);
-
-    let sort_exprs = LexOrdering::new(vec![sort_expr("non_nullable_col", &schema)]);
-    // let sort = sort_exec(sort_exprs.clone(), union);
-    let sort = Arc::new(
-        SortExec::new(sort_exprs.clone(), union).with_preserve_partitioning(true),
-    ) as _;
-    let spm = sort_preserving_merge_exec(sort_exprs, sort);
-
+    let ordering: LexOrdering = [sort_expr("non_nullable_col", &schema)].into();
+    let sort =
+        Arc::new(SortExec::new(ordering.clone(), union).with_preserve_partitioning(true))
+            as _;
+    let spm = sort_preserving_merge_exec(ordering, sort);
     let filter = filter_exec(
-        Arc::new(NotExpr::new(
-            col("non_nullable_col", schema.as_ref()).unwrap(),
-        )),
+        Arc::new(NotExpr::new(col("non_nullable_col", schema.as_ref())?)),
         spm,
     );
-
-    let sort_exprs = vec![
+    let ordering2 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
-    ];
-    let physical_plan = sort_exec(sort_exprs, filter);
+    ]
+    .into();
+    let physical_plan = sort_exec(ordering2, filter);
 
     // When removing a `SortPreservingMergeExec`, make sure that partitioning
     // requirements are not violated. In some cases, we may need to replace
     // it with a `CoalescePartitionsExec` instead of directly removing it.
-    let expected_input = ["SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "  FilterExec: NOT non_nullable_col@1",
-        "    SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "      SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[true]",
-        "        UnionExec",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            DataSourceExec: partitions=1, partition_sizes=[0]",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            DataSourceExec: partitions=1, partition_sizes=[0]"];
-
-    let expected_optimized = ["SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]",
-        "    FilterExec: NOT non_nullable_col@1",
-        "      UnionExec",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: partitions=1, partition_sizes=[0]",
-        "        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "          DataSourceExec: partitions=1, partition_sizes=[0]"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+      FilterExec: NOT non_nullable_col@1
+        SortPreservingMergeExec: [non_nullable_col@1 ASC]
+          SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[true]
+            UnionExec
+              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                DataSourceExec: partitions=1, partition_sizes=[0]
+              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[true]
+        FilterExec: NOT non_nullable_col@1
+          UnionExec
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: partitions=1, partition_sizes=[0]
+            RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+              DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1602,31 +2076,31 @@ async fn test_remove_unnecessary_sort4() -> Result<()> {
 async fn test_remove_unnecessary_sort6() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let input = Arc::new(
-        SortExec::new(
-            LexOrdering::new(vec![sort_expr("non_nullable_col", &schema)]),
-            source,
-        )
-        .with_fetch(Some(2)),
+    let input = sort_exec_with_fetch(
+        [sort_expr("non_nullable_col", &schema)].into(),
+        Some(2),
+        source,
     );
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("non_nullable_col", &schema),
             sort_expr("nullable_col", &schema),
-        ],
+        ]
+        .into(),
         input,
     );
-
-    let expected_input = [
-        "SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+      SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1635,33 +2109,33 @@ async fn test_remove_unnecessary_sort6() -> Result<()> {
 async fn test_remove_unnecessary_sort7() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let input = Arc::new(SortExec::new(
-        LexOrdering::new(vec![
+    let input = sort_exec(
+        [
             sort_expr("non_nullable_col", &schema),
             sort_expr("nullable_col", &schema),
-        ]),
+        ]
+        .into(),
         source,
-    ));
+    );
+    let physical_plan = sort_exec_with_fetch(
+        [sort_expr("non_nullable_col", &schema)].into(),
+        Some(2),
+        input,
+    );
 
-    let physical_plan = Arc::new(
-        SortExec::new(
-            LexOrdering::new(vec![sort_expr("non_nullable_col", &schema)]),
-            input,
-        )
-        .with_fetch(Some(2)),
-    ) as Arc<dyn ExecutionPlan>;
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false], sort_prefix=[non_nullable_col@1 ASC]
+      SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
 
-    let expected_input = [
-        "SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false], sort_prefix=[non_nullable_col@1 ASC]",
-        "  SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "GlobalLimitExec: skip=0, fetch=2",
-        "  SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    Optimized Plan:
+    GlobalLimitExec: skip=0, fetch=2
+      SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1670,31 +2144,31 @@ async fn test_remove_unnecessary_sort7() -> Result<()> {
 async fn test_remove_unnecessary_sort8() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let input = Arc::new(SortExec::new(
-        LexOrdering::new(vec![sort_expr("non_nullable_col", &schema)]),
-        source,
-    ));
+    let input = sort_exec([sort_expr("non_nullable_col", &schema)].into(), source);
     let limit = Arc::new(LocalLimitExec::new(input, 2));
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("non_nullable_col", &schema),
             sort_expr("nullable_col", &schema),
-        ],
+        ]
+        .into(),
         limit,
     );
 
-    let expected_input = [
-        "SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  LocalLimitExec: fetch=2",
-        "    SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "LocalLimitExec: fetch=2",
-        "  SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+      LocalLimitExec: fetch=2
+        SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    LocalLimitExec: fetch=2
+      SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1703,27 +2177,19 @@ async fn test_remove_unnecessary_sort8() -> Result<()> {
 async fn test_do_not_pushdown_through_limit() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    // let input = sort_exec(vec![sort_expr("non_nullable_col", &schema)], source);
-    let input = Arc::new(SortExec::new(
-        LexOrdering::new(vec![sort_expr("non_nullable_col", &schema)]),
-        source,
-    ));
+    let input = sort_exec([sort_expr("non_nullable_col", &schema)].into(), source);
     let limit = Arc::new(GlobalLimitExec::new(input, 0, Some(5))) as _;
-    let physical_plan = sort_exec(vec![sort_expr("nullable_col", &schema)], limit);
-
-    let expected_input = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  GlobalLimitExec: skip=0, fetch=5",
-        "    SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  GlobalLimitExec: skip=0, fetch=5",
-        "    SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let physical_plan = sort_exec([sort_expr("nullable_col", &schema)].into(), limit);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      GlobalLimitExec: skip=0, fetch=5
+        SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1732,24 +2198,25 @@ async fn test_do_not_pushdown_through_limit() -> Result<()> {
 async fn test_remove_unnecessary_spm1() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let input =
-        sort_preserving_merge_exec(vec![sort_expr("non_nullable_col", &schema)], source);
-    let input2 =
-        sort_preserving_merge_exec(vec![sort_expr("non_nullable_col", &schema)], input);
+    let ordering: LexOrdering = [sort_expr("non_nullable_col", &schema)].into();
+    let input = sort_preserving_merge_exec(ordering.clone(), source);
+    let input2 = sort_preserving_merge_exec(ordering, input);
     let physical_plan =
-        sort_preserving_merge_exec(vec![sort_expr("nullable_col", &schema)], input2);
-
-    let expected_input = [
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "    SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+        sort_preserving_merge_exec([sort_expr("nullable_col", &schema)].into(), input2);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      SortPreservingMergeExec: [non_nullable_col@1 ASC]
+        SortPreservingMergeExec: [non_nullable_col@1 ASC]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1759,21 +2226,22 @@ async fn test_remove_unnecessary_spm2() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
     let input = sort_preserving_merge_exec_with_fetch(
-        vec![sort_expr("non_nullable_col", &schema)],
+        [sort_expr("non_nullable_col", &schema)].into(),
         source,
         100,
     );
 
-    let expected_input = [
-        "SortPreservingMergeExec: [non_nullable_col@1 ASC], fetch=100",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "LocalLimitExec: fetch=100",
-        "  SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, input, true);
+    let test = EnforceSortingTest::new(input.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [non_nullable_col@1 ASC], fetch=100
+      DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    LocalLimitExec: fetch=100
+      SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1782,22 +2250,25 @@ async fn test_remove_unnecessary_spm2() -> Result<()> {
 async fn test_change_wrong_sorting() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let sort_exprs = vec![
+    let sort_exprs = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
     ];
-    let sort = sort_exec(vec![sort_exprs[0].clone()], source);
-    let physical_plan = sort_preserving_merge_exec(sort_exprs, sort);
-    let expected_input = [
-        "SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let sort = sort_exec([sort_exprs[0].clone()].into(), source);
+    let physical_plan = sort_preserving_merge_exec(sort_exprs.into(), sort);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1806,25 +2277,26 @@ async fn test_change_wrong_sorting() -> Result<()> {
 async fn test_change_wrong_sorting2() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-    let sort_exprs = vec![
+    let sort_exprs = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
     ];
-    let spm1 = sort_preserving_merge_exec(sort_exprs.clone(), source);
-    let sort2 = sort_exec(vec![sort_exprs[0].clone()], spm1);
-    let physical_plan = sort_preserving_merge_exec(vec![sort_exprs[1].clone()], sort2);
-
-    let expected_input = [
-        "SortPreservingMergeExec: [non_nullable_col@1 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "    SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let spm1 = sort_preserving_merge_exec(sort_exprs.clone().into(), source);
+    let sort2 = sort_exec([sort_exprs[0].clone()].into(), spm1);
+    let physical_plan = sort_preserving_merge_exec([sort_exprs[1].clone()].into(), sort2);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortPreservingMergeExec: [non_nullable_col@1 ASC]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+        SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1833,32 +2305,34 @@ async fn test_change_wrong_sorting2() -> Result<()> {
 async fn test_multiple_sort_window_exec() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
-
-    let sort_exprs1 = vec![sort_expr("nullable_col", &schema)];
-    let sort_exprs2 = vec![
+    let ordering1 = [sort_expr("nullable_col", &schema)];
+    let sort1 = sort_exec(ordering1.clone().into(), source);
+    let window_agg1 = bounded_window_exec("non_nullable_col", ordering1.clone(), sort1);
+    let ordering2 = [
         sort_expr("nullable_col", &schema),
         sort_expr("non_nullable_col", &schema),
     ];
-
-    let sort1 = sort_exec(sort_exprs1.clone(), source);
-    let window_agg1 = bounded_window_exec("non_nullable_col", sort_exprs1.clone(), sort1);
-    let window_agg2 = bounded_window_exec("non_nullable_col", sort_exprs2, window_agg1);
-    // let filter_exec = sort_exec;
-    let physical_plan = bounded_window_exec("non_nullable_col", sort_exprs1, window_agg2);
-
-    let expected_input = ["BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "    BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "        DataSourceExec: partitions=1, partition_sizes=[0]"];
-
-    let expected_optimized = ["BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "    SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]",
-        "      BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "          DataSourceExec: partitions=1, partition_sizes=[0]"];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let window_agg2 = bounded_window_exec("non_nullable_col", ordering2, window_agg1);
+    let physical_plan = bounded_window_exec("non_nullable_col", ordering1, window_agg2);
+
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r#"
+    Input Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false]
+          BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+              DataSourceExec: partitions=1, partition_sizes=[0]
+    "#);
 
     Ok(())
 }
@@ -1871,47 +2345,38 @@ async fn test_multiple_sort_window_exec() -> Result<()> {
 // EnforceDistribution may invalidate ordering invariant.
 async fn test_commutativity() -> Result<()> {
     let schema = create_test_schema()?;
-    let config = ConfigOptions::new();
-
     let memory_exec = memory_exec(&schema);
-    let sort_exprs = LexOrdering::new(vec![sort_expr("nullable_col", &schema)]);
+    let sort_exprs = [sort_expr("nullable_col", &schema)];
     let window = bounded_window_exec("nullable_col", sort_exprs.clone(), memory_exec);
     let repartition = repartition_exec(window);
+    let orig_plan = sort_exec(sort_exprs.into(), repartition);
 
-    let orig_plan =
-        Arc::new(SortExec::new(sort_exprs, repartition)) as Arc<dyn ExecutionPlan>;
-    let actual = get_plan_string(&orig_plan);
-    let expected_input = vec![
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "    BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_eq!(
-        expected_input, actual,
-        "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_input:#?}\nactual:\n\n{actual:#?}\n\n"
-    );
+    assert_snapshot!(displayable(orig_plan.as_ref()).indent(true), @r#"
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    "#);
 
-    let mut plan = orig_plan.clone();
+    let config = ConfigOptions::new();
     let rules = vec![
         Arc::new(EnforceDistribution::new()) as Arc<dyn PhysicalOptimizerRule>,
         Arc::new(EnforceSorting::new()) as Arc<dyn PhysicalOptimizerRule>,
     ];
+    let mut first_plan = orig_plan.clone();
     for rule in rules {
-        plan = rule.optimize(plan, &config)?;
+        first_plan = rule.optimize(first_plan, &config)?;
     }
-    let first_plan = plan.clone();
 
-    let mut plan = orig_plan.clone();
     let rules = vec![
         Arc::new(EnforceSorting::new()) as Arc<dyn PhysicalOptimizerRule>,
         Arc::new(EnforceDistribution::new()) as Arc<dyn PhysicalOptimizerRule>,
         Arc::new(EnforceSorting::new()) as Arc<dyn PhysicalOptimizerRule>,
     ];
+    let mut second_plan = orig_plan.clone();
     for rule in rules {
-        plan = rule.optimize(plan, &config)?;
+        second_plan = rule.optimize(second_plan, &config)?;
     }
-    let second_plan = plan.clone();
 
     assert_eq!(get_plan_string(&first_plan), get_plan_string(&second_plan));
     Ok(())
@@ -1922,35 +2387,37 @@ async fn test_coalesce_propagate() -> Result<()> {
     let schema = create_test_schema()?;
     let source = memory_exec(&schema);
     let repartition = repartition_exec(source);
-    let coalesce_partitions = Arc::new(CoalescePartitionsExec::new(repartition));
+    let coalesce_partitions = coalesce_partitions_exec(repartition);
     let repartition = repartition_exec(coalesce_partitions);
-    let sort_exprs = LexOrdering::new(vec![sort_expr("nullable_col", &schema)]);
+    let ordering: LexOrdering = [sort_expr("nullable_col", &schema)].into();
     // Add local sort
     let sort = Arc::new(
-        SortExec::new(sort_exprs.clone(), repartition).with_preserve_partitioning(true),
+        SortExec::new(ordering.clone(), repartition).with_preserve_partitioning(true),
     ) as _;
-    let spm = sort_preserving_merge_exec(sort_exprs.clone(), sort);
-    let sort = sort_exec(sort_exprs, spm);
+    let spm = sort_preserving_merge_exec(ordering.clone(), sort);
+    let sort = sort_exec(ordering, spm);
 
     let physical_plan = sort.clone();
     // Sort Parallelize rule should end Coalesce + Sort linkage when Sort is Global Sort
     // Also input plan is not valid as it is. We need to add SortExec before SortPreservingMergeExec.
-    let expected_input = [
-        "SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]",
-        "  SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]",
-        "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "        CoalescePartitionsExec",
-        "          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "            DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    let expected_optimized = [
-        "SortPreservingMergeExec: [nullable_col@0 ASC]",
-        "  SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]",
-        "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false]
+      SortPreservingMergeExec: [nullable_col@0 ASC]
+        SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+            CoalescePartitionsExec
+              RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+                DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    SortPreservingMergeExec: [nullable_col@0 ASC]
+      SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[true]
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
 
     Ok(())
 }
@@ -1958,1425 +2425,167 @@ async fn test_coalesce_propagate() -> Result<()> {
 #[tokio::test]
 async fn test_replace_with_partial_sort2() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("a", &schema), sort_expr("c", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs);
-
+    let input_ordering = [sort_expr("a", &schema), sort_expr("c", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering);
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("a", &schema),
             sort_expr("c", &schema),
             sort_expr("d", &schema),
-        ],
+        ]
+        .into(),
         unbounded_input,
     );
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], preserve_partitioning=[false]
+      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]
+
+    Optimized Plan:
+    PartialSortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], common_prefix_length=[2]
+      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]
+    ");
 
-    let expected_input = [
-        "SortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], preserve_partitioning=[false]",
-        "  StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]"
-    ];
-    // let optimized
-    let expected_optimized = [
-        "PartialSortExec: expr=[a@0 ASC, c@2 ASC, d@3 ASC], common_prefix_length=[2]",
-        "  StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC, c@2 ASC]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
     Ok(())
 }
 
 #[tokio::test]
 async fn test_push_with_required_input_ordering_prohibited() -> Result<()> {
-    // SortExec: expr=[b]            <-- can't push this down
-    //  RequiredInputOrder expr=[a]  <-- this requires input sorted by a, and preserves the input order
-    //    SortExec: expr=[a]
-    //      DataSourceExec
     let schema = create_test_schema3()?;
-    let sort_exprs_a = LexOrdering::new(vec![sort_expr("a", &schema)]);
-    let sort_exprs_b = LexOrdering::new(vec![sort_expr("b", &schema)]);
+    let ordering_a: LexOrdering = [sort_expr("a", &schema)].into();
+    let ordering_b: LexOrdering = [sort_expr("b", &schema)].into();
     let plan = memory_exec(&schema);
-    let plan = sort_exec(sort_exprs_a.clone(), plan);
+    let plan = sort_exec(ordering_a.clone(), plan);
     let plan = RequirementsTestExec::new(plan)
-        .with_required_input_ordering(sort_exprs_a)
+        .with_required_input_ordering(Some(ordering_a))
         .with_maintains_input_order(true)
         .into_arc();
-    let plan = sort_exec(sort_exprs_b, plan);
-
-    let expected_input = [
-        "SortExec: expr=[b@1 ASC], preserve_partitioning=[false]",
-        "  RequiredInputOrderingExec",
-        "    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "      DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
+    let plan = sort_exec(ordering_b, plan);
+    let test = EnforceSortingTest::new(plan.clone()).with_repartition_sorts(true);
     // should not be able to push shorts
-    let expected_no_change = expected_input;
-    assert_optimized!(expected_input, expected_no_change, plan, true);
+
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+      RequiredInputOrderingExec
+        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
     Ok(())
 }
 
 // test when the required input ordering is satisfied so could push through
 #[tokio::test]
 async fn test_push_with_required_input_ordering_allowed() -> Result<()> {
-    // SortExec: expr=[a,b]          <-- can push this down (as it is compatible with the required input ordering)
-    //  RequiredInputOrder expr=[a]  <-- this requires input sorted by a, and preserves the input order
-    //    SortExec: expr=[a]
-    //      DataSourceExec
     let schema = create_test_schema3()?;
-    let sort_exprs_a = LexOrdering::new(vec![sort_expr("a", &schema)]);
-    let sort_exprs_ab =
-        LexOrdering::new(vec![sort_expr("a", &schema), sort_expr("b", &schema)]);
+    let ordering_a: LexOrdering = [sort_expr("a", &schema)].into();
+    let ordering_ab = [sort_expr("a", &schema), sort_expr("b", &schema)].into();
     let plan = memory_exec(&schema);
-    let plan = sort_exec(sort_exprs_a.clone(), plan);
+    let plan = sort_exec(ordering_a.clone(), plan);
     let plan = RequirementsTestExec::new(plan)
-        .with_required_input_ordering(sort_exprs_a)
+        .with_required_input_ordering(Some(ordering_a))
         .with_maintains_input_order(true)
         .into_arc();
-    let plan = sort_exec(sort_exprs_ab, plan);
+    let plan = sort_exec(ordering_ab, plan);
 
+    /*
     let expected_input = [
-        "SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]",
-        "  RequiredInputOrderingExec",
+        "SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]", // <-- can push this down (as it is compatible with the required input ordering)
+        "  RequiredInputOrderingExec", // <-- this requires input sorted by a, and preserves the input order
         "    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
         "      DataSourceExec: partitions=1, partition_sizes=[0]",
     ];
-    // should able to push shorts
-    let expected = [
-        "RequiredInputOrderingExec",
-        "  SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]",
-    ];
-    assert_optimized!(expected_input, expected, plan, true);
+    */
+    let test = EnforceSortingTest::new(plan.clone()).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]
+      RequiredInputOrderingExec
+        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+
+    Optimized Plan:
+    RequiredInputOrderingExec
+      SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    ");
+    // Should be able to push down
     Ok(())
 }
 
 #[tokio::test]
 async fn test_replace_with_partial_sort() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("a", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs);
-
+    let input_ordering = [sort_expr("a", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering);
     let physical_plan = sort_exec(
-        vec![sort_expr("a", &schema), sort_expr("c", &schema)],
+        [sort_expr("a", &schema), sort_expr("c", &schema)].into(),
         unbounded_input,
     );
 
-    let expected_input = [
-        "SortExec: expr=[a@0 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]"
-    ];
-    let expected_optimized = [
-        "PartialSortExec: expr=[a@0 ASC, c@2 ASC], common_prefix_length=[1]",
-        "  StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, physical_plan, true);
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[a@0 ASC, c@2 ASC], preserve_partitioning=[false]
+      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+
+    Optimized Plan:
+    PartialSortExec: expr=[a@0 ASC, c@2 ASC], common_prefix_length=[1]
+      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]
+    ");
     Ok(())
 }
 
 #[tokio::test]
 async fn test_not_replaced_with_partial_sort_for_unbounded_input() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("b", &schema), sort_expr("c", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs);
-
+    let input_ordering = [sort_expr("b", &schema), sort_expr("c", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering);
     let physical_plan = sort_exec(
-        vec![
+        [
             sort_expr("a", &schema),
             sort_expr("b", &schema),
             sort_expr("c", &schema),
-        ],
+        ]
+        .into(),
         unbounded_input,
     );
-    let expected_input = [
-        "SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]"
-    ];
-    let expected_no_change = expected_input;
-    assert_optimized!(expected_input, expected_no_change, physical_plan, true);
-    Ok(())
-}
-
-#[tokio::test]
-async fn test_window_partial_constant_and_set_monotonicity() -> Result<()> {
-    let input_schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr_options(
-        "nullable_col",
-        &input_schema,
-        SortOptions {
-            descending: false,
-            nulls_first: false,
-        },
-    )];
-    let source = parquet_exec_sorted(&input_schema, sort_exprs);
-
-    // Function definition - Alias of the resulting column - Arguments of the function
-    #[derive(Clone)]
-    struct WindowFuncParam(WindowFunctionDefinition, String, Vec<Arc<dyn PhysicalExpr>>);
-    let function_arg_ordered = vec![col("nullable_col", &input_schema)?];
-    let function_arg_unordered = vec![col("non_nullable_col", &input_schema)?];
-    let fn_count_on_ordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(count_udaf()),
-        "count".to_string(),
-        function_arg_ordered.clone(),
-    );
-    let fn_max_on_ordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(max_udaf()),
-        "max".to_string(),
-        function_arg_ordered.clone(),
-    );
-    let fn_min_on_ordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(min_udaf()),
-        "min".to_string(),
-        function_arg_ordered.clone(),
-    );
-    let fn_avg_on_ordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(avg_udaf()),
-        "avg".to_string(),
-        function_arg_ordered,
-    );
-    let fn_count_on_unordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(count_udaf()),
-        "count".to_string(),
-        function_arg_unordered.clone(),
-    );
-    let fn_max_on_unordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(max_udaf()),
-        "max".to_string(),
-        function_arg_unordered.clone(),
-    );
-    let fn_min_on_unordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(min_udaf()),
-        "min".to_string(),
-        function_arg_unordered.clone(),
-    );
-    let fn_avg_on_unordered = WindowFuncParam(
-        WindowFunctionDefinition::AggregateUDF(avg_udaf()),
-        "avg".to_string(),
-        function_arg_unordered,
-    );
-    struct TestCase<'a> {
-        // Whether window expression has a partition_by expression or not.
-        // If it does, it will be on the ordered column -- `nullable_col`.
-        partition_by: bool,
-        // Whether the frame is unbounded in both directions, or unbounded in
-        // only one direction (when set-monotonicity has a meaning), or it is
-        // a sliding window.
-        window_frame: Arc<WindowFrame>,
-        // Function definition - Alias of the resulting column - Arguments of the function
-        func: WindowFuncParam,
-        // Global sort requirement at the root and its direction,
-        // which is required to be removed or preserved -- (asc, nulls_first)
-        required_sort_columns: Vec<(&'a str, bool, bool)>,
-        initial_plan: Vec<&'a str>,
-        expected_plan: Vec<&'a str>,
-    }
-    let test_cases = vec![
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column
-        // Case 0:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 1:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 2:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 3:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on unordered column
-        // Case 4:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("non_nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 5:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("non_nullable_col", false, false), ("max", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 6:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", true, false), ("non_nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 7:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("avg", false, false), ("nullable_col", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on ordered column
-        // Case 8:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 9:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 10:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 11:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column
-        // Case 12:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("non_nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 13:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("non_nullable_col", true, false), ("max", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 14:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("non_nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 15:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(None)),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("avg", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on ordered column
-        // Case 16:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 17:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("max", false, true), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 18:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", true, true), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 19:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on unordered column
-        // Case 20:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 21:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", false, true)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 22:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 23:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("avg", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Sliding(current row, unbounded following) + partition_by + on ordered column
-        // Case 24:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 25:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 26:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 27:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // WindowAggExec + Sliding(current row, unbounded following) + partition_by + on unordered column
-        // Case 28:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("count", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-        },
-        // Case 29:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", false, true)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "WindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 30:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 31:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  WindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(NULL), is_causal: false }]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column
-        // Case 32:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 33:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("max", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-        },
-        // Case 34:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 35:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on unordered column
-        // Case 36:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, true)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 37:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("max", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 38:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", false, true), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 39:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on ordered column
-        // Case 40:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 41:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("max", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-        },
-        // Case 42:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 43:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column
-        // Case 44:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![ ("count", true, true)],
-            initial_plan: vec![
-                "SortExec: expr=[count@2 ASC], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[count@2 ASC], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",  ],
-        },
-        // Case 45:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 46:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("min", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 47:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new(Some(true))),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on ordered column
-        // Case 48:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("count", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 49:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("max", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 50:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("min", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 51:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on unordered column
-        // Case 52:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("count", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet"
-            ],
-        },
-        // Case 53:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 54:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("min", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 55:
-        TestCase {
-            partition_by: false,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on ordered column
-        // Case 56:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_count_on_ordered.clone(),
-            required_sort_columns: vec![("count", true, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 57:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32)?))),
-            func: fn_max_on_ordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: Following(UInt32(1)), is_causal: false }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 58:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_min_on_ordered.clone(),
-            required_sort_columns: vec![("min", false, false), ("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 59:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_avg_on_ordered.clone(),
-            required_sort_columns: vec![("avg", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-        // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        // ============================================REGION STARTS============================================
-        // BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on unordered column
-        // Case 60:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_count_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("count", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 61:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_max_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("max", true, true)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[max: Ok(Field { name: \"max\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 62:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_min_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false), ("min", false, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[min: Ok(Field { name: \"min\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // Case 63:
-        TestCase {
-            partition_by: true,
-            window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32)?), WindowFrameBound::CurrentRow)),
-            func: fn_avg_on_unordered.clone(),
-            required_sort_columns: vec![("nullable_col", true, false)],
-            initial_plan: vec![
-                "SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]",
-                "  BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-            expected_plan: vec![
-                "BoundedWindowAggExec: wdw=[avg: Ok(Field { name: \"avg\", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt32(1)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-                "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet",
-            ],
-        },
-        // =============================================REGION ENDS=============================================
-    ];
-
-    for (case_idx, case) in test_cases.into_iter().enumerate() {
-        let partition_by = if case.partition_by {
-            vec![col("nullable_col", &input_schema)?]
-        } else {
-            vec![]
-        };
-        let window_expr = create_window_expr(
-            &case.func.0,
-            case.func.1,
-            &case.func.2,
-            &partition_by,
-            &LexOrdering::default(),
-            case.window_frame,
-            input_schema.as_ref(),
-            false,
-        )?;
-        let window_exec = if window_expr.uses_bounded_memory() {
-            Arc::new(BoundedWindowAggExec::try_new(
-                vec![window_expr],
-                Arc::clone(&source),
-                InputOrderMode::Sorted,
-                case.partition_by,
-            )?) as Arc<dyn ExecutionPlan>
-        } else {
-            Arc::new(WindowAggExec::try_new(
-                vec![window_expr],
-                Arc::clone(&source),
-                case.partition_by,
-            )?) as _
-        };
-        let output_schema = window_exec.schema();
-        let sort_expr = case
-            .required_sort_columns
-            .iter()
-            .map(|(col_name, asc, nf)| {
-                sort_expr_options(
-                    col_name,
-                    &output_schema,
-                    SortOptions {
-                        descending: !asc,
-                        nulls_first: *nf,
-                    },
-                )
-            })
-            .collect::<Vec<_>>();
-        let physical_plan = sort_exec(sort_expr, window_exec);
-
-        assert_optimized!(
-            case.initial_plan,
-            case.expected_plan,
-            physical_plan,
-            true,
-            case_idx
-        );
-    }
-
+    let test =
+        EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
+    ");
     Ok(())
 }
 
 #[test]
 fn test_removes_unused_orthogonal_sort() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("b", &schema), sort_expr("c", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs.clone());
-
-    let orthogonal_sort = sort_exec(vec![sort_expr("a", &schema)], unbounded_input);
-    let output_sort = sort_exec(input_sort_exprs, orthogonal_sort); // same sort as data source
+    let input_ordering: LexOrdering =
+        [sort_expr("b", &schema), sort_expr("c", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering.clone());
+    let orthogonal_sort = sort_exec([sort_expr("a", &schema)].into(), unbounded_input);
+    let output_sort = sort_exec(input_ordering, orthogonal_sort); // same sort as data source
 
     // Test scenario/input has an orthogonal sort:
-    let expected_input = [
-        "SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "    StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]"
-    ];
-    assert_eq!(get_plan_string(&output_sort), expected_input,);
+    let test = EnforceSortingTest::new(output_sort).with_repartition_sorts(true);
+
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
 
+    Optimized Plan:
+    StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
+    ");
     // Test: should remove orthogonal sort, and the uppermost (unneeded) sort:
-    let expected_optimized = [
-        "StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]"
-    ];
-    assert_optimized!(expected_input, expected_optimized, output_sort, true);
 
     Ok(())
 }
@@ -3384,24 +2593,23 @@ fn test_removes_unused_orthogonal_sort() -> Result<()> {
 #[test]
 fn test_keeps_used_orthogonal_sort() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("b", &schema), sort_expr("c", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs.clone());
-
+    let input_ordering: LexOrdering =
+        [sort_expr("b", &schema), sort_expr("c", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering.clone());
     let orthogonal_sort =
-        sort_exec_with_fetch(vec![sort_expr("a", &schema)], Some(3), unbounded_input); // has fetch, so this orthogonal sort changes the output
-    let output_sort = sort_exec(input_sort_exprs, orthogonal_sort);
+        sort_exec_with_fetch([sort_expr("a", &schema)].into(), Some(3), unbounded_input); // has fetch, so this orthogonal sort changes the output
+    let output_sort = sort_exec(input_ordering, orthogonal_sort);
 
     // Test scenario/input has an orthogonal sort:
-    let expected_input = [
-        "SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]",
-        "    StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]"
-    ];
-    assert_eq!(get_plan_string(&output_sort), expected_input,);
+    let test = EnforceSortingTest::new(output_sort).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input / Optimized Plan:
+    SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]
+        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
+    ");
 
     // Test: should keep the orthogonal sort, since it modifies the output:
-    let expected_optimized = expected_input;
-    assert_optimized!(expected_input, expected_optimized, output_sort, true);
 
     Ok(())
 }
@@ -3409,35 +2617,36 @@ fn test_keeps_used_orthogonal_sort() -> Result<()> {
 #[test]
 fn test_handles_multiple_orthogonal_sorts() -> Result<()> {
     let schema = create_test_schema3()?;
-    let input_sort_exprs = vec![sort_expr("b", &schema), sort_expr("c", &schema)];
-    let unbounded_input = stream_exec_ordered(&schema, input_sort_exprs.clone());
-
-    let orthogonal_sort_0 = sort_exec(vec![sort_expr("c", &schema)], unbounded_input); // has no fetch, so can be removed
+    let input_ordering: LexOrdering =
+        [sort_expr("b", &schema), sort_expr("c", &schema)].into();
+    let unbounded_input = stream_exec_ordered(&schema, input_ordering.clone());
+    let ordering0: LexOrdering = [sort_expr("c", &schema)].into();
+    let orthogonal_sort_0 = sort_exec(ordering0.clone(), unbounded_input); // has no fetch, so can be removed
+    let ordering1: LexOrdering = [sort_expr("a", &schema)].into();
     let orthogonal_sort_1 =
-        sort_exec_with_fetch(vec![sort_expr("a", &schema)], Some(3), orthogonal_sort_0); // has fetch, so this orthogonal sort changes the output
-    let orthogonal_sort_2 = sort_exec(vec![sort_expr("c", &schema)], orthogonal_sort_1); // has no fetch, so can be removed
-    let orthogonal_sort_3 = sort_exec(vec![sort_expr("a", &schema)], orthogonal_sort_2); // has no fetch, so can be removed
-    let output_sort = sort_exec(input_sort_exprs, orthogonal_sort_3); // final sort
+        sort_exec_with_fetch(ordering1.clone(), Some(3), orthogonal_sort_0); // has fetch, so this orthogonal sort changes the output
+    let orthogonal_sort_2 = sort_exec(ordering0, orthogonal_sort_1); // has no fetch, so can be removed
+    let orthogonal_sort_3 = sort_exec(ordering1, orthogonal_sort_2); // has no fetch, so can be removed
+    let output_sort = sort_exec(input_ordering, orthogonal_sort_3); // final sort
 
     // Test scenario/input has an orthogonal sort:
-    let expected_input = [
-        "SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-        "    SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "      SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]",
-        "        SortExec: expr=[c@2 ASC], preserve_partitioning=[false]",
-        "          StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]",
-    ];
-    assert_eq!(get_plan_string(&output_sort), expected_input,);
+    let test = EnforceSortingTest::new(output_sort.clone()).with_repartition_sorts(true);
+    assert_snapshot!(test.run(), @r"
+    Input Plan:
+    SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+          SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]
+            SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+              StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
+
+    Optimized Plan:
+    SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]
+      SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]
+        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]
+    ");
 
     // Test: should keep only the needed orthogonal sort, and remove the unneeded ones:
-    let expected_optimized = [
-        "SortExec: expr=[b@1 ASC, c@2 ASC], preserve_partitioning=[false]",
-        "  SortExec: TopK(fetch=3), expr=[a@0 ASC], preserve_partitioning=[false]",
-        "    StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[b@1 ASC, c@2 ASC]",
-    ];
-    assert_optimized!(expected_input, expected_optimized, output_sort, true);
-
     Ok(())
 }
 
@@ -3445,13 +2654,14 @@ fn test_handles_multiple_orthogonal_sorts() -> Result<()> {
 fn test_parallelize_sort_preserves_fetch() -> Result<()> {
     // Create a schema
     let schema = create_test_schema3()?;
-    let parquet_exec = parquet_exec(&schema);
-    let coalesced = Arc::new(CoalescePartitionsExec::new(parquet_exec.clone()));
-    let top_coalesced =
-        Arc::new(CoalescePartitionsExec::new(coalesced.clone()).with_fetch(Some(10)));
+    let parquet_exec = parquet_exec(schema);
+    let coalesced = coalesce_partitions_exec(parquet_exec.clone());
+    let top_coalesced = coalesce_partitions_exec(coalesced.clone())
+        .with_fetch(Some(10))
+        .unwrap();
 
     let requirements = PlanWithCorrespondingCoalescePartitions::new(
-        top_coalesced.clone(),
+        top_coalesced,
         true,
         vec![PlanWithCorrespondingCoalescePartitions::new(
             coalesced,
@@ -3474,3 +2684,168 @@ fn test_parallelize_sort_preserves_fetch() -> Result<()> {
     );
     Ok(())
 }
+
+#[tokio::test]
+async fn test_partial_sort_with_homogeneous_batches() -> Result<()> {
+    // Create schema for the table
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+        Field::new("c", DataType::Int32, false),
+    ]));
+
+    // Create homogeneous batches - each batch has the same values for columns a and b
+    let batch1 = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 1, 1])),
+            Arc::new(Int32Array::from(vec![1, 1, 1])),
+            Arc::new(Int32Array::from(vec![3, 2, 1])),
+        ],
+    )?;
+    let batch2 = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![2, 2, 2])),
+            Arc::new(Int32Array::from(vec![2, 2, 2])),
+            Arc::new(Int32Array::from(vec![4, 6, 5])),
+        ],
+    )?;
+    let batch3 = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from(vec![3, 3, 3])),
+            Arc::new(Int32Array::from(vec![3, 3, 3])),
+            Arc::new(Int32Array::from(vec![9, 7, 8])),
+        ],
+    )?;
+
+    // Create session with batch size of 3 to match our homogeneous batch pattern
+    let session_config = SessionConfig::new()
+        .with_batch_size(3)
+        .with_target_partitions(1);
+    let ctx = SessionContext::new_with_config(session_config);
+
+    let sort_order = vec![
+        SortExpr::new(
+            Expr::Column(datafusion_common::Column::new(
+                Option::<TableReference>::None,
+                "a",
+            )),
+            true,
+            false,
+        ),
+        SortExpr::new(
+            Expr::Column(datafusion_common::Column::new(
+                Option::<TableReference>::None,
+                "b",
+            )),
+            true,
+            false,
+        ),
+    ];
+    let batches = Arc::new(DummyStreamPartition {
+        schema: schema.clone(),
+        batches: vec![batch1, batch2, batch3],
+    }) as _;
+    let provider = StreamingTable::try_new(schema.clone(), vec![batches])?
+        .with_sort_order(sort_order)
+        .with_infinite_table(true);
+    ctx.register_table("test_table", Arc::new(provider))?;
+
+    let sql = "SELECT * FROM test_table ORDER BY a ASC, c ASC";
+    let df = ctx.sql(sql).await?;
+
+    let physical_plan = df.create_physical_plan().await?;
+
+    // Verify that PartialSortExec is used
+    let plan_str = displayable(physical_plan.as_ref()).indent(true).to_string();
+    assert!(
+        plan_str.contains("PartialSortExec"),
+        "Expected PartialSortExec in plan:\n{plan_str}",
+    );
+
+    let task_ctx = Arc::new(TaskContext::default());
+    let mut stream = physical_plan.execute(0, task_ctx.clone())?;
+
+    let mut collected_batches = Vec::new();
+    while let Some(batch) = stream.next().await {
+        let batch = batch?;
+        if batch.num_rows() > 0 {
+            collected_batches.push(batch);
+        }
+    }
+
+    // Assert we got 3 separate batches (not concatenated into fewer)
+    assert_eq!(
+        collected_batches.len(),
+        3,
+        "Expected 3 separate batches, got {}",
+        collected_batches.len()
+    );
+
+    // Verify each batch has been sorted within itself
+    let expected_values = [vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]];
+
+    for (i, batch) in collected_batches.iter().enumerate() {
+        let c_array = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        let actual = c_array.values().iter().copied().collect::<Vec<i32>>();
+        assert_eq!(actual, expected_values[i], "Batch {i} not sorted correctly",);
+    }
+
+    assert_eq!(
+        task_ctx.runtime_env().memory_pool.reserved(),
+        0,
+        "Memory should be released after execution"
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_sort_with_streaming_table() -> Result<()> {
+    let batch = record_batch!(("a", Int32, [1, 2, 3]), ("b", Int32, [1, 2, 3]))?;
+
+    let ctx = SessionContext::new();
+
+    let sort_order = vec![
+        SortExpr::new(
+            Expr::Column(datafusion_common::Column::new(
+                Option::<TableReference>::None,
+                "a",
+            )),
+            true,
+            false,
+        ),
+        SortExpr::new(
+            Expr::Column(datafusion_common::Column::new(
+                Option::<TableReference>::None,
+                "b",
+            )),
+            true,
+            false,
+        ),
+    ];
+    let schema = batch.schema();
+    let batches = Arc::new(DummyStreamPartition {
+        schema: schema.clone(),
+        batches: vec![batch],
+    }) as _;
+    let provider = StreamingTable::try_new(schema.clone(), vec![batches])?
+        .with_sort_order(sort_order);
+    ctx.register_table("test_table", Arc::new(provider))?;
+
+    let sql = "SELECT a FROM test_table GROUP BY a ORDER BY a";
+    let results = ctx.sql(sql).await?.collect().await?;
+
+    assert_eq!(results.len(), 1);
+    assert_eq!(results[0].num_columns(), 1);
+    let expected = create_array!(Int32, vec![1, 2, 3]) as ArrayRef;
+    assert_eq!(results[0].column(0), &expected);
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs
new file mode 100644
index 0000000000000..de7611ff211a5
--- /dev/null
+++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs
@@ -0,0 +1,1715 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::physical_optimizer::test_utils::{
+    create_test_schema, parquet_exec_with_sort, sort_exec, sort_expr_options,
+};
+use arrow::datatypes::DataType;
+use arrow_schema::SortOptions;
+use datafusion::common::ScalarValue;
+use datafusion::logical_expr::WindowFrameBound;
+use datafusion::logical_expr::WindowFrameUnits;
+use datafusion_expr::{WindowFrame, WindowFunctionDefinition};
+use datafusion_functions_aggregate::average::avg_udaf;
+use datafusion_functions_aggregate::count::count_udaf;
+use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf};
+use datafusion_physical_expr::expressions::col;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_plan::windows::{
+    BoundedWindowAggExec, WindowAggExec, create_window_expr,
+};
+use datafusion_physical_plan::{ExecutionPlan, InputOrderMode};
+use insta::assert_snapshot;
+use std::sync::{Arc, LazyLock};
+
+// Function definition - Alias of the resulting column - Arguments of the function
+#[derive(Clone)]
+struct WindowFuncParam(
+    WindowFunctionDefinition,
+    &'static str,
+    Vec<Arc<dyn PhysicalExpr>>,
+);
+
+fn function_arg_ordered() -> Vec<Arc<dyn PhysicalExpr>> {
+    let input_schema = create_test_schema().unwrap();
+    vec![col("nullable_col", &input_schema).unwrap()]
+}
+fn function_arg_unordered() -> Vec<Arc<dyn PhysicalExpr>> {
+    let input_schema = create_test_schema().unwrap();
+    vec![col("non_nullable_col", &input_schema).unwrap()]
+}
+
+fn fn_count_on_ordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(count_udaf()),
+        "count",
+        function_arg_ordered(),
+    )
+}
+
+fn fn_max_on_ordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(max_udaf()),
+        "max",
+        function_arg_ordered(),
+    )
+}
+
+fn fn_min_on_ordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(min_udaf()),
+        "min",
+        function_arg_ordered(),
+    )
+}
+
+fn fn_avg_on_ordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(avg_udaf()),
+        "avg",
+        function_arg_ordered(),
+    )
+}
+
+fn fn_count_on_unordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(count_udaf()),
+        "count",
+        function_arg_unordered(),
+    )
+}
+
+fn fn_max_on_unordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(max_udaf()),
+        "max",
+        function_arg_unordered(),
+    )
+}
+fn fn_min_on_unordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(min_udaf()),
+        "min",
+        function_arg_unordered(),
+    )
+}
+
+fn fn_avg_on_unordered() -> WindowFuncParam {
+    WindowFuncParam(
+        WindowFunctionDefinition::AggregateUDF(avg_udaf()),
+        "avg",
+        function_arg_unordered(),
+    )
+}
+
+struct TestWindowCase {
+    partition_by: bool,
+    window_frame: Arc<WindowFrame>,
+    func: WindowFuncParam,
+    required_sort: Vec<(&'static str, bool, bool)>, // (column name, ascending, nulls_first)
+}
+impl TestWindowCase {
+    fn source() -> Arc<dyn ExecutionPlan> {
+        static SOURCE: LazyLock<Arc<dyn ExecutionPlan>> = LazyLock::new(|| {
+            let input_schema = create_test_schema().unwrap();
+            let ordering = [sort_expr_options(
+                "nullable_col",
+                &input_schema,
+                SortOptions {
+                    descending: false,
+                    nulls_first: false,
+                },
+            )]
+            .into();
+            parquet_exec_with_sort(input_schema.clone(), vec![ordering])
+        });
+        Arc::clone(&SOURCE)
+    }
+
+    // runs the window test case and returns the string representation of the plan
+    fn run(self) -> String {
+        let input_schema = create_test_schema().unwrap();
+        let source = Self::source();
+
+        let Self {
+            partition_by,
+            window_frame,
+            func: WindowFuncParam(func_def, func_name, func_args),
+            required_sort,
+        } = self;
+        let partition_by_exprs = if partition_by {
+            vec![col("nullable_col", &input_schema).unwrap()]
+        } else {
+            vec![]
+        };
+
+        let window_expr = create_window_expr(
+            &func_def,
+            func_name.to_string(),
+            &func_args,
+            &partition_by_exprs,
+            &[],
+            window_frame,
+            Arc::clone(&input_schema),
+            false,
+            false,
+            None,
+        )
+        .unwrap();
+
+        let window_exec = if window_expr.uses_bounded_memory() {
+            Arc::new(
+                BoundedWindowAggExec::try_new(
+                    vec![window_expr],
+                    Arc::clone(&source),
+                    InputOrderMode::Sorted,
+                    partition_by,
+                )
+                .unwrap(),
+            ) as Arc<dyn ExecutionPlan>
+        } else {
+            Arc::new(
+                WindowAggExec::try_new(
+                    vec![window_expr],
+                    Arc::clone(&source),
+                    partition_by,
+                )
+                .unwrap(),
+            ) as Arc<dyn ExecutionPlan>
+        };
+
+        let output_schema = window_exec.schema();
+        let sort_expr = required_sort.into_iter().map(|(col, asc, nulls_first)| {
+            sort_expr_options(
+                col,
+                &output_schema,
+                SortOptions {
+                    descending: !asc,
+                    nulls_first,
+                },
+            )
+        });
+        let ordering = LexOrdering::new(sort_expr).unwrap();
+        let physical_plan = sort_exec(ordering, window_exec);
+
+        crate::physical_optimizer::enforce_sorting::EnforceSortingTest::new(physical_plan)
+            .with_repartition_sorts(true)
+            .run()
+    }
+}
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_0() {
+    // ============================================REGION STARTS============================================
+    // WindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column
+    // Case 0:
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_1() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_2() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_3() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("avg", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_4() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("non_nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_5() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("non_nullable_col", false, false),
+            ("max", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_6() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", true, false),
+            ("non_nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_7() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("avg", false, false),
+            ("nullable_col", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+            );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_8() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_9() {
+    assert_snapshot!(TestWindowCase  {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_10() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_11() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("avg", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// WindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column
+// Case 12:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_12() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("non_nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 13:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_13() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("non_nullable_col", true, false),
+            ("max", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 14:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_14() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("non_nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 15:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_15() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(None)),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("avg", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on ordered column
+// Case 16:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_16() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 17:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_17() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("max", false, true),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 18:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_18() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", true, true),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 19:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_19() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("avg", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// WindowAggExec + Sliding(current row, unbounded following) + no partition_by + on unordered column
+// Case 20:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_20() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 21:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_21() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", false, true),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 22:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_22() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 23:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_23() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("avg", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// WindowAggExec + Sliding(current row, unbounded following) + partition_by + on ordered column
+// Case 24:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_24() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 25:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_25() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 26:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_26() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#);
+}
+
+// Case 27:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_27() {
+    assert_snapshot!(
+        TestWindowCase {
+            partition_by: true,
+            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+            func: fn_avg_on_ordered(),
+            required_sort: vec![
+                ("avg", false, false),
+            ],
+        }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#);
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// WindowAggExec + Sliding(current row, unbounded following) + partition_by + on unordered column
+
+// Case 28:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_28() {
+    assert_snapshot!(
+        TestWindowCase {
+            partition_by: true,
+            window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+            func: fn_count_on_unordered(),
+            required_sort: vec![
+                ("count", false, false),
+                ("nullable_col", true, false),
+            ],
+        }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 29:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_29() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", false, true),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false]
+      WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#)
+}
+
+// Case 30:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_30() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#);
+}
+
+// Case 31:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_31() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true)).reverse()),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on ordered column
+
+// Case 32:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_32() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 33:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_33() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("max", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 34:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_34() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+// Case 35:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_35() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+           ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + no partition_by + on unordered column
+
+// Case 36:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_36() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, true),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 37:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_37() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("max", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 38:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_38() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", false, true),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 39:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_39() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on ordered column
+
+// Case 40:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_40() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 41:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_41() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("max", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 42:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_42() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 43:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_43() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Plain(unbounded preceding, unbounded following) + partition_by + on unordered column
+
+// Case 44:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_44() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("count", true, true),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[count@2 ASC], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 45:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_45() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 46:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_46() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("min", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 47:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_47() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new(Some(true))),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on ordered column
+
+// Case 48:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_48() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("count", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 49:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_49() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("max", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 50:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_50() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("min", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 51:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_51() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + no partition_by + on unordered column
+
+// Case 52:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_52() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("count", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 53:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_53() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 54:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_54() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("min", true, false),
+            ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 55:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_55() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: false,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on ordered column
+
+// Case 56:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_56() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_count_on_ordered(),
+        required_sort: vec![
+            ("count", true, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 57:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_57() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::Following(ScalarValue::new_one(&DataType::UInt32).unwrap()))),
+        func: fn_max_on_ordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 58:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_58() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_min_on_ordered(),
+        required_sort: vec![
+            ("min", false, false),
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 59:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_59() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_avg_on_ordered(),
+        required_sort: vec![
+            ("avg", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// =============================================REGION ENDS=============================================
+// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+// ============================================REGION STARTS============================================
+// BoundedWindowAggExec + Sliding(bounded preceding, bounded following) + partition_by + on unordered column
+
+// Case 60:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_60() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_count_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("count", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 61:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_61() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_max_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("max", true, true),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 62:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_62() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_min_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+            ("min", false, false),
+        ],
+    }.run(),
+        @ r#"
+    Input / Optimized Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+
+// Case 63:
+#[test]
+fn test_window_partial_constant_and_set_monotonicity_63() {
+    assert_snapshot!(TestWindowCase {
+        partition_by: true,
+        window_frame: Arc::new(WindowFrame::new_bounds(WindowFrameUnits::Rows, WindowFrameBound::Preceding(ScalarValue::new_one(&DataType::UInt32).unwrap()), WindowFrameBound::CurrentRow)),
+        func: fn_avg_on_unordered(),
+        required_sort: vec![
+            ("nullable_col", true, false),
+        ],
+    }.run(),
+        @ r#"
+    Input Plan:
+    SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false]
+      BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+
+    Optimized Plan:
+    BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet
+    "#
+    );
+}
+// =============================================REGION ENDS=============================================
diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs
new file mode 100644
index 0000000000000..8f430f7753ef6
--- /dev/null
+++ b/datafusion/core/tests/physical_optimizer/filter_pushdown.rs
@@ -0,0 +1,4463 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::{Arc, LazyLock};
+
+use arrow::{
+    array::{Float64Array, Int32Array, RecordBatch, StringArray, record_batch},
+    datatypes::{DataType, Field, Schema, SchemaRef},
+    util::pretty::pretty_format_batches,
+};
+use arrow_schema::SortOptions;
+use datafusion::{
+    assert_batches_eq,
+    logical_expr::Operator,
+    physical_plan::{
+        PhysicalExpr,
+        expressions::{BinaryExpr, Column, Literal},
+    },
+    prelude::{ParquetReadOptions, SessionConfig, SessionContext},
+    scalar::ScalarValue,
+};
+use datafusion_catalog::memory::DataSourceExec;
+use datafusion_common::config::ConfigOptions;
+use datafusion_datasource::{
+    PartitionedFile, file_groups::FileGroup, file_scan_config::FileScanConfigBuilder,
+};
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_expr::ScalarUDF;
+use datafusion_functions::math::random::RandomFunc;
+use datafusion_functions_aggregate::{
+    count::count_udaf,
+    min_max::{max_udaf, min_udaf},
+};
+use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr, expressions::col};
+use datafusion_physical_expr::{
+    Partitioning, ScalarFunctionExpr,
+    aggregate::{AggregateExprBuilder, AggregateFunctionExpr},
+};
+use datafusion_physical_optimizer::{
+    PhysicalOptimizerRule, filter_pushdown::FilterPushdown,
+};
+use datafusion_physical_plan::{
+    ExecutionPlan,
+    aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
+    coalesce_partitions::CoalescePartitionsExec,
+    collect,
+    filter::{FilterExec, FilterExecBuilder},
+    projection::ProjectionExec,
+    repartition::RepartitionExec,
+    sorts::sort::SortExec,
+};
+
+use super::pushdown_utils::{
+    OptimizationTest, TestNode, TestScanBuilder, TestSource, format_plan_for_test,
+};
+use datafusion_physical_plan::union::UnionExec;
+use futures::StreamExt;
+use object_store::{ObjectStore, memory::InMemory};
+use regex::Regex;
+
+#[test]
+fn test_pushdown_into_scan() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+
+    // expect the predicate to be pushed down into the DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_volatile_functions_not_allowed() {
+    // Test that we do not push down filters with volatile functions
+    // Use random() as an example of a volatile function
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let cfg = Arc::new(ConfigOptions::default());
+    let predicate = Arc::new(BinaryExpr::new(
+        Arc::new(Column::new_with_schema("a", &schema()).unwrap()),
+        Operator::Eq,
+        Arc::new(
+            ScalarFunctionExpr::try_new(
+                Arc::new(ScalarUDF::from(RandomFunc::new())),
+                vec![],
+                &schema(),
+                cfg,
+            )
+            .unwrap(),
+        ),
+    )) as Arc<dyn PhysicalExpr>;
+    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+    // expect the filter to not be pushed down
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = random()
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = random()
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    ",
+    );
+}
+
+/// Show that we can use config options to determine how to do pushdown.
+#[test]
+fn test_pushdown_into_scan_with_config_options() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()) as _;
+
+    let mut cfg = ConfigOptions::default();
+    insta::assert_snapshot!(
+        OptimizationTest::new(
+            Arc::clone(&plan),
+            FilterPushdown::new(),
+            false
+        ),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = foo
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    "
+    );
+
+    cfg.execution.parquet.pushdown_filters = true;
+    insta::assert_snapshot!(
+        OptimizationTest::new(
+            plan,
+            FilterPushdown::new(),
+            true
+        ),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8View, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8View, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("d", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("e", Utf8View, ["ba", "bb", "bc", "bd"]),
+            ("f", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("d", DataType::Utf8, false),
+        Field::new("e", DataType::Utf8View, false),
+        Field::new("f", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec
+    let on = vec![(
+        col("a", &build_side_schema).unwrap(),
+        col("d", &probe_side_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+
+    // Finally let's add a SortExec on the outside to test pushdown of dynamic filters
+    let sort_expr =
+        PhysicalSortExpr::new(col("e", &join_schema).unwrap(), SortOptions::default());
+    let plan = Arc::new(
+        SortExec::new(LexOrdering::new(vec![sort_expr]).unwrap(), join)
+            .with_fetch(Some(2)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    let mut config = ConfigOptions::default();
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    config.execution.parquet.pushdown_filters = true;
+
+    // Apply the FilterPushdown optimizer rule
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(Arc::clone(&plan), &config)
+        .unwrap();
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false]
+    -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ]
+    "
+    );
+
+    // Put some data through the plan to check that the filter is updated to reflect the TopK state
+    let session_ctx = SessionContext::new_with_config(SessionConfig::new());
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    // Iterate one batch
+    stream.next().await.unwrap().unwrap();
+
+    // Test that filters are pushed down correctly to each side of the join
+    // NOTE: We dropped the CASE expression here because we now optimize that away if there's only 1 partition
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb]
+    -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab AND d@0 IN (SET) ([aa, ab]) ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ]
+    "
+    );
+}
+
+// Test both static and dynamic filter pushdown in HashJoinExec.
+// Note that static filter pushdown is rare: it should have already happened in the logical optimizer phase.
+// However users may manually construct plans that could result in a FilterExec -> HashJoinExec -> Scan setup.
+// Dynamic filters arise in cases such as nested inner joins or TopK -> HashJoinExec -> Scan setups.
+#[tokio::test]
+async fn test_static_filter_pushdown_through_hash_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8View, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8View, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("d", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("e", Utf8View, ["ba", "bb", "bc", "bd"]),
+            ("f", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("d", DataType::Utf8, false),
+        Field::new("e", DataType::Utf8View, false),
+        Field::new("f", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec
+    let on = vec![(
+        col("a", &build_side_schema).unwrap(),
+        col("d", &probe_side_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Create filters that can be pushed down to different sides
+    // We need to create filters in the context of the join output schema
+    let join_schema = join.schema();
+
+    // Filter on build side column: a = 'aa'
+    let left_filter = col_lit_predicate("a", "aa", &join_schema);
+    // Filter on probe side column: e = 'ba'
+    let right_filter = col_lit_predicate("e", "ba", &join_schema);
+    // Filter that references both sides: a = d (should not be pushed down)
+    let cross_filter = Arc::new(BinaryExpr::new(
+        col("a", &join_schema).unwrap(),
+        Operator::Eq,
+        col("d", &join_schema).unwrap(),
+    )) as Arc<dyn PhysicalExpr>;
+
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let filter = Arc::new(FilterExec::try_new(right_filter, filter).unwrap());
+    let plan = Arc::new(FilterExec::try_new(cross_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = d@3
+        -   FilterExec: e@4 = ba
+        -     FilterExec: a@0 = aa
+        -       HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = d@3
+          -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=e@1 = ba
+    "
+    );
+
+    // Test left join: filter on preserved (build) side is pushed down,
+    // filter on non-preserved (probe) side is NOT pushed down.
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            TestScanBuilder::new(Arc::clone(&build_side_schema))
+                .with_support(true)
+                .build(),
+            TestScanBuilder::new(Arc::clone(&probe_side_schema))
+                .with_support(true)
+                .build(),
+            vec![(
+                col("a", &build_side_schema).unwrap(),
+                col("d", &probe_side_schema).unwrap(),
+            )],
+            None,
+            &JoinType::Left,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+    // Filter on build side column (preserved): should be pushed down
+    let left_filter = col_lit_predicate("a", "aa", &join_schema);
+    // Filter on probe side column (not preserved): should NOT be pushed down
+    let right_filter = col_lit_predicate("e", "ba", &join_schema);
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(right_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: e@4 = ba
+        -   FilterExec: a@0 = aa
+        -     HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: e@4 = ba
+          -   HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+#[test]
+fn test_filter_collapse() {
+    // filter should be pushed down into the parquet scan with two filters
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate1 = col_lit_predicate("a", "foo", &schema());
+    let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap());
+    let predicate2 = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   FilterExec: a@0 = foo
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_filter_with_projection() {
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let projection = vec![1, 0];
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, Arc::clone(&scan))
+            .apply_projection(Some(projection))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, projection=[b@1, a@0]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - ProjectionExec: expr=[b@1 as b, a@0 as a]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    ",
+    );
+
+    // add a test where the filter is on a column that isn't included in the output
+    let projection = vec![1];
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, scan)
+            .apply_projection(Some(projection))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(),true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo, projection=[b@1]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - ProjectionExec: expr=[b@1 as b]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_push_down_through_transparent_nodes() {
+    // expect the predicate to be pushed down into the DataSource
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+    let repartition = Arc::new(
+        RepartitionExec::try_new(filter, Partitioning::RoundRobinBatch(1)).unwrap(),
+    );
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, repartition).unwrap());
+
+    // expect the predicate to be pushed down into the DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(),true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
+        -     FilterExec: a@0 = foo
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_through_aggregates_on_grouping_columns() {
+    // Test that filters on grouping columns can be pushed through AggregateExec.
+    // This test has two filters:
+    // 1. An inner filter (a@0 = foo) below the aggregate - gets pushed to DataSource
+    // 2. An outer filter (b@1 = bar) above the aggregate - also gets pushed through because 'b' is a grouping column
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let filter = Arc::new(
+        FilterExecBuilder::new(col_lit_predicate("a", "foo", &schema()), scan)
+            .with_batch_size(10)
+            .build()
+            .unwrap(),
+    );
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+    let group_by = PhysicalGroupBy::new_single(vec![
+        (col("a", &schema()).unwrap(), "a".to_string()),
+        (col("b", &schema()).unwrap(), "b".to_string()),
+    ]);
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            filter,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(
+        FilterExecBuilder::new(predicate, aggregate)
+            .with_batch_size(100)
+            .build()
+            .unwrap(),
+    );
+
+    // Both filters should be pushed down to the DataSource since both reference grouping columns
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0])
+        -     FilterExec: a@0 = foo
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=Sorted
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
+    "
+    );
+}
+
+/// Test various combinations of handling of child pushdown results
+/// in an ExecutionPlan in combination with support/not support in a DataSource.
+#[test]
+fn test_node_handles_child_pushdown_result() {
+    // If we set `with_support(true)` + `inject_filter = true` then the filter is pushed down to the DataSource
+    // and no FilterExec is created.
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate));
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - TestInsertExec { inject_filter: true }
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - TestInsertExec { inject_filter: true }
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    ",
+    );
+
+    // If we set `with_support(false)` + `inject_filter = true` then the filter is not pushed down to the DataSource
+    // and a FilterExec is created.
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate));
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - TestInsertExec { inject_filter: true }
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - TestInsertExec { inject_filter: false }
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    ",
+    );
+
+    // If we set `with_support(false)` + `inject_filter = false` then the filter is not pushed down to the DataSource
+    // and no FilterExec is created.
+    let scan = TestScanBuilder::new(schema()).with_support(false).build();
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate));
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - TestInsertExec { inject_filter: false }
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - TestInsertExec { inject_filter: false }
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    ",
+    );
+}
+
+#[tokio::test]
+async fn test_topk_dynamic_filter_pushdown() {
+    let batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["bd", "bc"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+        record_batch!(
+            ("a", Utf8, ["ac", "ad"]),
+            ("b", Utf8, ["bb", "ba"]),
+            ("c", Float64, [2.0, 1.0])
+        )
+        .unwrap(),
+    ];
+    let scan = TestScanBuilder::new(schema())
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+    let plan = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new(
+                col("b", &schema()).unwrap(),
+                SortOptions::new(true, false), // descending, nulls_first
+            )])
+            .unwrap(),
+            Arc::clone(&scan),
+        )
+        .with_fetch(Some(1)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(2);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    // Iterate one batch
+    stream.next().await.unwrap().unwrap();
+    // Now check what our filter looks like
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false], filter=[b@1 > bd]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 > bd ]
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_topk_dynamic_filter_pushdown_multi_column_sort() {
+    let batches = vec![
+        // We are going to do ORDER BY b ASC NULLS LAST, a DESC
+        // And we put the values in such a way that the first batch will fill the TopK
+        // and we skip the second batch.
+        record_batch!(
+            ("a", Utf8, ["ac", "ad"]),
+            ("b", Utf8, ["bb", "ba"]),
+            ("c", Float64, [2.0, 1.0])
+        )
+        .unwrap(),
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["bc", "bd"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let scan = TestScanBuilder::new(schema())
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+    let plan = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![
+                PhysicalSortExpr::new(
+                    col("b", &schema()).unwrap(),
+                    SortOptions::default().asc().nulls_last(),
+                ),
+                PhysicalSortExpr::new(
+                    col("a", &schema()).unwrap(),
+                    SortOptions::default().desc().nulls_first(),
+                ),
+            ])
+            .unwrap(),
+            Arc::clone(&scan),
+        )
+        .with_fetch(Some(2)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(2);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    // Iterate one batch
+    let res = stream.next().await.unwrap().unwrap();
+    #[rustfmt::skip]
+    let expected = [
+        "+----+----+-----+",
+        "| a  | b  | c   |",
+        "+----+----+-----+",
+        "| ad | ba | 1.0 |",
+        "| ac | bb | 2.0 |",
+        "+----+----+-----+",
+    ];
+    assert_batches_eq!(expected, &[res]);
+    // Now check what our filter looks like
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - SortExec: TopK(fetch=2), expr=[b@1 ASC NULLS LAST, a@0 DESC], preserve_partitioning=[false], filter=[b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@1 < bb OR b@1 = bb AND (a@0 IS NULL OR a@0 > ac) ]
+    "
+    );
+    // There should be no more batches
+    assert!(stream.next().await.is_none());
+}
+
+#[tokio::test]
+async fn test_topk_filter_passes_through_coalesce_partitions() {
+    // Create multiple batches for different partitions
+    let batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["bd", "bc"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+        record_batch!(
+            ("a", Utf8, ["ac", "ad"]),
+            ("b", Utf8, ["bb", "ba"]),
+            ("c", Float64, [2.0, 1.0])
+        )
+        .unwrap(),
+    ];
+
+    // Create a source that supports all batches
+    let source = Arc::new(TestSource::new(schema(), true, batches));
+
+    let base_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source)
+            .with_file_groups(vec![
+                // Partition 0
+                FileGroup::new(vec![PartitionedFile::new("test1.parquet", 123)]),
+                // Partition 1
+                FileGroup::new(vec![PartitionedFile::new("test2.parquet", 123)]),
+            ])
+            .build();
+
+    let scan = DataSourceExec::from_data_source(base_config);
+
+    // Add CoalescePartitionsExec to merge the two partitions
+    let coalesce = Arc::new(CoalescePartitionsExec::new(scan)) as Arc<dyn ExecutionPlan>;
+
+    // Add SortExec with TopK
+    let plan = Arc::new(
+        SortExec::new(
+            LexOrdering::new(vec![PhysicalSortExpr::new(
+                col("b", &schema()).unwrap(),
+                SortOptions::new(true, false),
+            )])
+            .unwrap(),
+            coalesce,
+        )
+        .with_fetch(Some(1)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Test optimization - the filter SHOULD pass through CoalescePartitionsExec
+    // if it properly implements from_children (not all_unsupported)
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     DataSourceExec: file_groups={2 groups: [[test1.parquet], [test2.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec with dynamic filter
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    ",
+    );
+
+    // Actually apply the optimization to the plan and execute to see the filter in action
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    // Test for https://github.com/apache/datafusion/pull/17371: dynamic filter linking survives `with_new_children`
+    let children = plan.children().into_iter().map(Arc::clone).collect();
+    let plan = plan.with_new_children(children).unwrap();
+
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    // Iterate one batch
+    stream.next().await.unwrap().unwrap();
+
+    // Now check what our filter looks like
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_partitioned() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Rough sketch of the MRE we're trying to recreate:
+    // COPY (select i as k from generate_series(1, 10000000) as t(i))
+    // TO 'test_files/scratch/push_down_filter/t1.parquet'
+    // STORED AS PARQUET;
+    // COPY (select i as k, i as v from generate_series(1, 10000000) as t(i))
+    // TO 'test_files/scratch/push_down_filter/t2.parquet'
+    // STORED AS PARQUET;
+    // create external table t1 stored as parquet location 'test_files/scratch/push_down_filter/t1.parquet';
+    // create external table t2 stored as parquet location 'test_files/scratch/push_down_filter/t2.parquet';
+    // explain
+    // select *
+    // from t1
+    // join t2 on t1.k = t2.k;
+    // +---------------+------------------------------------------------------------+
+    // | plan_type     | plan                                                       |
+    // +---------------+------------------------------------------------------------+
+    // | physical_plan | ┌───────────────────────────┐                              |
+    // |               | │        HashJoinExec       │                              |
+    // |               | │    --------------------   ├──────────────┐               |
+    // |               | │        on: (k = k)        │              │               |
+    // |               | └─────────────┬─────────────┘              │               |
+    // |               | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ |
+    // |               | │      RepartitionExec      ││      RepartitionExec      │ |
+    // |               | │    --------------------   ││    --------------------   │ |
+    // |               | │ partition_count(in->out): ││ partition_count(in->out): │ |
+    // |               | │          12 -> 12         ││          12 -> 12         │ |
+    // |               | │                           ││                           │ |
+    // |               | │    partitioning_scheme:   ││    partitioning_scheme:   │ |
+    // |               | │      Hash([k@0], 12)      ││      Hash([k@0], 12)      │ |
+    // |               | └─────────────┬─────────────┘└─────────────┬─────────────┘ |
+    // |               | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ |
+    // |               | │       DataSourceExec      ││       DataSourceExec      │ |
+    // |               | │    --------------------   ││    --------------------   │ |
+    // |               | │         files: 12         ││         files: 12         │ |
+    // |               | │      format: parquet      ││      format: parquet      │ |
+    // |               | │                           ││      predicate: true      │ |
+    // |               | └───────────────────────────┘└───────────────────────────┘ |
+    // |               |                                                            |
+    // +---------------+------------------------------------------------------------+
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Build side: DataSource -> RepartitionExec (Hash)
+    let build_hash_exprs = vec![
+        col("a", &build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Probe side: DataSource -> RepartitionExec (Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec with partitioned inputs
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+          -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+          -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+          -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Actually apply the optimization to the plan and execute to see the filter in action
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Now check what our filter looks like
+    #[cfg(not(feature = "force_hash_collisions"))]
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+    -   CoalescePartitionsExec
+    -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+    -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+    -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ CASE hash_repartition % 12 WHEN 5 THEN a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:ab,c1:bb}]) WHEN 8 THEN a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}]) ELSE false END ]
+    "
+    );
+
+    // When hash collisions force all data into a single partition, we optimize away the CASE expression.
+    // This avoids calling create_hashes() for every row on the probe side, since hash % 1 == 0 always,
+    // meaning the WHEN 0 branch would always match. This optimization is also important for primary key
+    // joins or any scenario where all build-side data naturally lands in one partition.
+    #[cfg(feature = "force_hash_collisions")]
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+    -   CoalescePartitionsExec
+    -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+    -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+    -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]
+    "
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    // The number of output rows from the probe side scan should stay consistent across executions.
+    // Issue: https://github.com/apache/datafusion/issues/17451
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_collect_left() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Probe side: DataSource -> RepartitionExec(Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count), // create multi partitions on probSide
+        )
+        .unwrap(),
+    );
+
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // expect the predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+          -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+          -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Actually apply the optimization to the plan and execute to see the filter in action
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Now check what our filter looks like
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+    -   CoalescePartitionsExec
+    -     HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -       RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
+    -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]
+    "
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    // The number of output rows from the probe side scan should stay consistent across executions.
+    // Issue: https://github.com/apache/datafusion/issues/17451
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+#[tokio::test]
+async fn test_nested_hashjoin_dynamic_filter_pushdown() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create test data for three tables: t1, t2, t3
+    // t1: small table with limited values (will be build side of outer join)
+    let t1_batches = vec![
+        record_batch!(("a", Utf8, ["aa", "ab"]), ("x", Float64, [1.0, 2.0])).unwrap(),
+    ];
+    let t1_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("x", DataType::Float64, false),
+    ]));
+    let t1_scan = TestScanBuilder::new(Arc::clone(&t1_schema))
+        .with_support(true)
+        .with_batches(t1_batches)
+        .build();
+
+    // t2: larger table (will be probe side of inner join, build side of outer join)
+    let t2_batches = vec![
+        record_batch!(
+            ("b", Utf8, ["aa", "ab", "ac", "ad", "ae"]),
+            ("c", Utf8, ["ca", "cb", "cc", "cd", "ce"]),
+            ("y", Float64, [1.0, 2.0, 3.0, 4.0, 5.0])
+        )
+        .unwrap(),
+    ];
+    let t2_schema = Arc::new(Schema::new(vec![
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Utf8, false),
+        Field::new("y", DataType::Float64, false),
+    ]));
+    let t2_scan = TestScanBuilder::new(Arc::clone(&t2_schema))
+        .with_support(true)
+        .with_batches(t2_batches)
+        .build();
+
+    // t3: largest table (will be probe side of inner join)
+    let t3_batches = vec![
+        record_batch!(
+            ("d", Utf8, ["ca", "cb", "cc", "cd", "ce", "cf", "cg", "ch"]),
+            ("z", Float64, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
+        )
+        .unwrap(),
+    ];
+    let t3_schema = Arc::new(Schema::new(vec![
+        Field::new("d", DataType::Utf8, false),
+        Field::new("z", DataType::Float64, false),
+    ]));
+    let t3_scan = TestScanBuilder::new(Arc::clone(&t3_schema))
+        .with_support(true)
+        .with_batches(t3_batches)
+        .build();
+
+    // Create nested join structure:
+    // Join (t1.a = t2.b)
+    // /        \
+    // t1    Join(t2.c = t3.d)
+    //         /    \
+    //        t2   t3
+
+    // First create inner join: t2.c = t3.d
+    let inner_join_on =
+        vec![(col("c", &t2_schema).unwrap(), col("d", &t3_schema).unwrap())];
+    let inner_join = Arc::new(
+        HashJoinExec::try_new(
+            t2_scan,
+            t3_scan,
+            inner_join_on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Then create outer join: t1.a = t2.b (from inner join result)
+    let outer_join_on = vec![(
+        col("a", &t1_schema).unwrap(),
+        col("b", &inner_join.schema()).unwrap(),
+    )];
+    let outer_join = Arc::new(
+        HashJoinExec::try_new(
+            t1_scan,
+            inner_join as Arc<dyn ExecutionPlan>,
+            outer_join_on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Test that dynamic filters are pushed down correctly through nested joins
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&outer_join), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true
+        -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true
+          -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    ",
+    );
+
+    // Execute the plan to verify the dynamic filters are properly updated
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(outer_join, &config)
+        .unwrap();
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    // Execute to populate the dynamic filters
+    stream.next().await.unwrap().unwrap();
+
+    // Verify that both the inner and outer join have updated dynamic filters
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true
+    -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)]
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab AND b@0 IN (SET) ([aa, ab]) ]
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb AND d@0 IN (SET) ([ca, cb]) ]
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_parent_filter_pushdown() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("d", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("e", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("f", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("d", DataType::Utf8, false),
+        Field::new("e", DataType::Utf8, false),
+        Field::new("f", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec
+    let on = vec![(
+        col("a", &build_side_schema).unwrap(),
+        col("d", &probe_side_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Create filters that can be pushed down to different sides
+    // We need to create filters in the context of the join output schema
+    let join_schema = join.schema();
+
+    // Filter on build side column: a = 'aa'
+    let left_filter = col_lit_predicate("a", "aa", &join_schema);
+    // Filter on probe side column: e = 'ba'
+    let right_filter = col_lit_predicate("e", "ba", &join_schema);
+    // Filter that references both sides: a = d (should not be pushed down)
+    let cross_filter = Arc::new(BinaryExpr::new(
+        col("a", &join_schema).unwrap(),
+        Operator::Eq,
+        col("d", &join_schema).unwrap(),
+    )) as Arc<dyn PhysicalExpr>;
+
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let filter = Arc::new(FilterExec::try_new(right_filter, filter).unwrap());
+    let plan = Arc::new(FilterExec::try_new(cross_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = d@3
+        -   FilterExec: e@4 = ba
+        -     FilterExec: a@0 = aa
+        -       HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -         DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = d@3
+          -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = aa
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=e@1 = ba
+    "
+    );
+}
+
+#[test]
+fn test_hashjoin_parent_filter_pushdown_same_column_names() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("build_val", DataType::Utf8, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .build();
+
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("probe_val", DataType::Utf8, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("id", &build_side_schema).unwrap(),
+        col("id", &probe_side_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+
+    let build_id_filter = col_lit_predicate("id", "aa", &join_schema);
+    let probe_val_filter = col_lit_predicate("probe_val", "x", &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(build_id_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(probe_val_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: probe_val@3 = x
+        -   FilterExec: id@0 = aa
+        -     HashJoinExec: mode=Partitioned, join_type=Inner, on=[(id@0, id@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, build_val], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, probe_val], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(id@0, id@0)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, build_val], file_type=test, pushdown_supported=true, predicate=id@0 = aa
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, probe_val], file_type=test, pushdown_supported=true, predicate=probe_val@1 = x
+    "
+    );
+}
+
+#[test]
+fn test_hashjoin_parent_filter_pushdown_mark_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let left_schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("val", DataType::Utf8, false),
+    ]));
+    let left_scan = TestScanBuilder::new(Arc::clone(&left_schema))
+        .with_support(true)
+        .build();
+
+    let right_schema =
+        Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)]));
+    let right_scan = TestScanBuilder::new(Arc::clone(&right_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("id", &left_schema).unwrap(),
+        col("id", &right_schema).unwrap(),
+    )];
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            left_scan,
+            right_scan,
+            on,
+            None,
+            &JoinType::LeftMark,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+
+    let left_filter = col_lit_predicate("val", "x", &join_schema);
+    let mark_filter = col_lit_predicate("mark", true, &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(left_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(mark_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: mark@2 = true
+        -   FilterExec: val@1 = x
+        -     HashJoinExec: mode=Partitioned, join_type=LeftMark, on=[(id@0, id@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, val], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: mark@2 = true
+          -   HashJoinExec: mode=Partitioned, join_type=LeftMark, on=[(id@0, id@0)]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id, val], file_type=test, pushdown_supported=true, predicate=val@1 = x
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[id], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+/// Test that filters on join key columns are pushed to both sides of semi/anti joins.
+/// For LeftSemi/LeftAnti, the output only contains left columns, but filters on
+/// join key columns can also be pushed to the right (non-preserved) side because
+/// the equijoin condition guarantees the key values match.
+#[test]
+fn test_hashjoin_parent_filter_pushdown_semi_anti_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let left_schema = Arc::new(Schema::new(vec![
+        Field::new("k", DataType::Utf8, false),
+        Field::new("v", DataType::Utf8, false),
+    ]));
+    let left_scan = TestScanBuilder::new(Arc::clone(&left_schema))
+        .with_support(true)
+        .build();
+
+    let right_schema = Arc::new(Schema::new(vec![
+        Field::new("k", DataType::Utf8, false),
+        Field::new("w", DataType::Utf8, false),
+    ]));
+    let right_scan = TestScanBuilder::new(Arc::clone(&right_schema))
+        .with_support(true)
+        .build();
+
+    let on = vec![(
+        col("k", &left_schema).unwrap(),
+        col("k", &right_schema).unwrap(),
+    )];
+
+    let join = Arc::new(
+        HashJoinExec::try_new(
+            left_scan,
+            right_scan,
+            on,
+            None,
+            &JoinType::LeftSemi,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    let join_schema = join.schema();
+    // Filter on join key column: k = 'x' — should be pushed to BOTH sides
+    let key_filter = col_lit_predicate("k", "x", &join_schema);
+    // Filter on non-key column: v = 'y' — should only be pushed to the left side
+    let val_filter = col_lit_predicate("v", "y", &join_schema);
+
+    let filter =
+        Arc::new(FilterExec::try_new(key_filter, Arc::clone(&join) as _).unwrap());
+    let plan = Arc::new(FilterExec::try_new(val_filter, filter).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: v@1 = y
+        -   FilterExec: k@0 = x
+        -     HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(k@0, k@0)]
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, v], file_type=test, pushdown_supported=true
+        -       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, w], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(k@0, k@0)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, v], file_type=test, pushdown_supported=true, predicate=k@0 = x AND v@1 = y
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[k, w], file_type=test, pushdown_supported=true, predicate=k@0 = x
+    "
+    );
+}
+
+/// Integration test for dynamic filter pushdown with TopK.
+/// We use an integration test because there are complex interactions in the optimizer rules
+/// that the unit tests applying a single optimizer rule do not cover.
+#[tokio::test]
+async fn test_topk_dynamic_filter_pushdown_integration() {
+    let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+    let mut cfg = SessionConfig::new();
+    cfg.options_mut().execution.parquet.pushdown_filters = true;
+    cfg.options_mut().execution.parquet.max_row_group_size = 128;
+    let ctx = SessionContext::new_with_config(cfg);
+    ctx.register_object_store(
+        ObjectStoreUrl::parse("memory://").unwrap().as_ref(),
+        Arc::clone(&store),
+    );
+    ctx.sql(
+        r"
+COPY  (
+  SELECT 1372708800 + value AS t
+  FROM generate_series(0, 99999)
+  ORDER BY t
+ ) TO 'memory:///1.parquet'
+STORED AS PARQUET;
+  ",
+    )
+    .await
+    .unwrap()
+    .collect()
+    .await
+    .unwrap();
+
+    // Register the file with the context
+    ctx.register_parquet(
+        "topk_pushdown",
+        "memory:///1.parquet",
+        ParquetReadOptions::default(),
+    )
+    .await
+    .unwrap();
+
+    // Create a TopK query that will use dynamic filter pushdown
+    // Note that we use t * t as the order by expression to avoid
+    // the order pushdown optimizer from optimizing away the TopK.
+    let df = ctx
+        .sql(r"EXPLAIN ANALYZE SELECT t FROM topk_pushdown ORDER BY t * t LIMIT 10;")
+        .await
+        .unwrap();
+    let batches = df.collect().await.unwrap();
+    let explain = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    assert!(explain.contains("output_rows=128")); // Read 1 row group
+    assert!(explain.contains("t@0 < 1884329474306198481")); // Dynamic filter was applied
+    assert!(
+        explain.contains("pushdown_rows_matched=128, pushdown_rows_pruned=99.87 K"),
+        "{explain}"
+    );
+    // Pushdown pruned most rows
+}
+
+#[test]
+fn test_filter_pushdown_through_union() {
+    let scan1 = TestScanBuilder::new(schema()).with_support(true).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - UnionExec
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_union_mixed_support() {
+    // Test case where one child supports filter pushdown and one doesn't
+    let scan1 = TestScanBuilder::new(schema()).with_support(true).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(false).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - UnionExec
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+#[test]
+fn test_filter_pushdown_through_union_does_not_support() {
+    // Test case where one child supports filter pushdown and one doesn't
+    let scan1 = TestScanBuilder::new(schema()).with_support(false).build();
+    let scan2 = TestScanBuilder::new(schema()).with_support(false).build();
+
+    let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   UnionExec
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+      output:
+        Ok:
+          - UnionExec
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+          -   FilterExec: a@0 = foo
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
+    "
+    );
+}
+
+/// Schema:
+/// a: String
+/// b: String
+/// c: f64
+static TEST_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+    let fields = vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ];
+    Arc::new(Schema::new(fields))
+});
+
+fn schema() -> SchemaRef {
+    Arc::clone(&TEST_SCHEMA)
+}
+
+struct ProjectionDynFilterTestCase {
+    schema: SchemaRef,
+    batches: Vec<RecordBatch>,
+    projection: Vec<(Arc<dyn PhysicalExpr>, String)>,
+    sort_expr: PhysicalSortExpr,
+    expected_plans: Vec<String>,
+}
+
+async fn run_projection_dyn_filter_case(case: ProjectionDynFilterTestCase) {
+    let ProjectionDynFilterTestCase {
+        schema,
+        batches,
+        projection,
+        sort_expr,
+        expected_plans,
+    } = case;
+
+    let scan = TestScanBuilder::new(Arc::clone(&schema))
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+
+    let projection_exec = Arc::new(ProjectionExec::try_new(projection, scan).unwrap());
+
+    let sort = Arc::new(
+        SortExec::new(LexOrdering::new(vec![sort_expr]).unwrap(), projection_exec)
+            .with_fetch(Some(2)),
+    ) as Arc<dyn ExecutionPlan>;
+
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+
+    let optimized_plan = FilterPushdown::new_post_optimization()
+        .optimize(Arc::clone(&sort), &config)
+        .unwrap();
+
+    pretty_assertions::assert_eq!(
+        format_plan_for_test(&optimized_plan).trim(),
+        expected_plans[0].trim()
+    );
+
+    let config = SessionConfig::new().with_batch_size(2);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let mut stream = optimized_plan.execute(0, Arc::clone(&task_ctx)).unwrap();
+    for (idx, expected_plan) in expected_plans.iter().enumerate().skip(1) {
+        stream.next().await.unwrap().unwrap();
+        let formatted_plan = format_plan_for_test(&optimized_plan);
+        pretty_assertions::assert_eq!(
+            formatted_plan.trim(),
+            expected_plan.trim(),
+            "Mismatch at iteration {}",
+            idx
+        );
+    }
+}
+
+#[tokio::test]
+async fn test_topk_with_projection_transformation_on_dyn_filter() {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let simple_abc = vec![
+        record_batch!(
+            ("a", Int32, [1, 2, 3]),
+            ("b", Utf8, ["x", "y", "z"]),
+            ("c", Float64, [1.0, 2.0, 3.0])
+        )
+        .unwrap(),
+    ];
+
+    // Case 1: Reordering [b, a]
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![
+            (col("b", &schema).unwrap(), "b".to_string()),
+            (col("a", &schema).unwrap(), "a".to_string()),
+        ],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("a", 1)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+r#"  - SortExec: TopK(fetch=2), expr=[a@1 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[b@1 as b, a@0 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+r#"  - SortExec: TopK(fetch=2), expr=[a@1 ASC], preserve_partitioning=[false], filter=[a@1 IS NULL OR a@1 < 2]
+  -   ProjectionExec: expr=[b@1 as b, a@0 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string()]
+    })
+    .await;
+
+    // Case 2: Pruning [a]
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![(col("a", &schema).unwrap(), "a".to_string())],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("a", 0)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[a@0 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 2]
+  -   ProjectionExec: expr=[a@0 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
+        ],
+    })
+    .await;
+
+    // Case 3: Identity [a, b]
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![
+            (col("a", &schema).unwrap(), "a".to_string()),
+            (col("b", &schema).unwrap(), "b".to_string()),
+        ],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("a", 0)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 2]
+  -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
+        ],
+    })
+    .await;
+
+    // Case 4: Expressions [a + 1, b]
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![
+            (
+                Arc::new(BinaryExpr::new(
+                    col("a", &schema).unwrap(),
+                    Operator::Plus,
+                    Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+                )),
+                "a_plus_1".to_string(),
+            ),
+            (col("b", &schema).unwrap(), "b".to_string()),
+        ],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("a_plus_1", 0)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+            r#"  - SortExec: TopK(fetch=2), expr=[a_plus_1@0 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[a@0 + 1 as a_plus_1, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+            r#"  - SortExec: TopK(fetch=2), expr=[a_plus_1@0 ASC], preserve_partitioning=[false], filter=[a_plus_1@0 IS NULL OR a_plus_1@0 < 3]
+  -   ProjectionExec: expr=[a@0 + 1 as a_plus_1, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 + 1 IS NULL OR a@0 + 1 < 3 ]"#.to_string(),
+        ],
+    })
+    .await;
+
+    // Case 5: [a as b, b as a] (swapped columns)
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![
+            (col("a", &schema).unwrap(), "b".to_string()),
+            (col("b", &schema).unwrap(), "a".to_string()),
+        ],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("b", 0)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+            r#"  - SortExec: TopK(fetch=2), expr=[b@0 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[a@0 as b, b@1 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+            r#"  - SortExec: TopK(fetch=2), expr=[b@0 ASC], preserve_partitioning=[false], filter=[b@0 IS NULL OR b@0 < 2]
+  -   ProjectionExec: expr=[a@0 as b, b@1 as a]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
+        ],
+    })
+    .await;
+
+    // Case 6: Confusing expr [a + 1 as a, b]
+    run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
+        schema: Arc::clone(&schema),
+        batches: simple_abc.clone(),
+        projection: vec![
+            (
+                Arc::new(BinaryExpr::new(
+                    col("a", &schema).unwrap(),
+                    Operator::Plus,
+                    Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+                )),
+                "a".to_string(),
+            ),
+            (col("b", &schema).unwrap(), "b".to_string()),
+        ],
+        sort_expr: PhysicalSortExpr::new(
+            Arc::new(Column::new("a", 0)),
+            SortOptions::default(),
+        ),
+        expected_plans: vec![
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
+  -   ProjectionExec: expr=[a@0 + 1 as a, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
+            r#"  - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 3]
+  -   ProjectionExec: expr=[a@0 + 1 as a, b@1 as b]
+  -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 + 1 IS NULL OR a@0 + 1 < 3 ]"#.to_string(),
+        ],
+    })
+    .await;
+}
+
+/// Returns a predicate that is a binary expression col = lit
+fn col_lit_predicate(
+    column_name: &str,
+    scalar_value: impl Into<ScalarValue>,
+    schema: &Schema,
+) -> Arc<dyn PhysicalExpr> {
+    let scalar_value = scalar_value.into();
+    Arc::new(BinaryExpr::new(
+        Arc::new(Column::new_with_schema(column_name, schema).unwrap()),
+        Operator::Eq,
+        Arc::new(Literal::new(scalar_value)),
+    ))
+}
+
+// ==== Aggregate Dynamic Filter tests ====
+
+// ---- Test Utilities ----
+struct AggregateDynFilterCase<'a> {
+    schema: SchemaRef,
+    batches: Vec<RecordBatch>,
+    aggr_exprs: Vec<AggregateFunctionExpr>,
+    expected_before: Option<&'a str>,
+    expected_after: Option<&'a str>,
+    scan_support: bool,
+}
+
+async fn run_aggregate_dyn_filter_case(case: AggregateDynFilterCase<'_>) {
+    let AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs,
+        expected_before,
+        expected_after,
+        scan_support,
+    } = case;
+
+    let scan = TestScanBuilder::new(Arc::clone(&schema))
+        .with_support(scan_support)
+        .with_batches(batches)
+        .build();
+
+    let aggr_exprs: Vec<_> = aggr_exprs
+        .into_iter()
+        .map(|expr| Arc::new(expr) as Arc<AggregateFunctionExpr>)
+        .collect();
+    let aggr_len = aggr_exprs.len();
+
+    let plan: Arc<dyn ExecutionPlan> = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(vec![]),
+            aggr_exprs,
+            vec![None; aggr_len],
+            scan,
+            Arc::clone(&schema),
+        )
+        .unwrap(),
+    );
+
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+
+    let optimized = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    let before = format_plan_for_test(&optimized);
+    if let Some(expected) = expected_before {
+        assert!(
+            before.contains(expected),
+            "expected `{expected}` before execution, got: {before}"
+        );
+    } else {
+        assert!(
+            !before.contains("DynamicFilter ["),
+            "dynamic filter unexpectedly present before execution: {before}"
+        );
+    }
+
+    let session_ctx = SessionContext::new();
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let task_ctx = session_ctx.state().task_ctx();
+    let mut stream = optimized.execute(0, Arc::clone(&task_ctx)).unwrap();
+    let _ = stream.next().await.transpose().unwrap();
+
+    let after = format_plan_for_test(&optimized);
+    if let Some(expected) = expected_after {
+        assert!(
+            after.contains(expected),
+            "expected `{expected}` after execution, got: {after}"
+        );
+    } else {
+        assert!(
+            !after.contains("DynamicFilter ["),
+            "dynamic filter unexpectedly present after execution: {after}"
+        );
+    }
+}
+
+// ---- Test Cases ----
+// Cases covered below:
+// 1. `min(a)` and `max(a)` baseline.
+// 2. Unsupported expression input (`min(a+1)`).
+// 3. Multiple supported columns (same column vs different columns).
+// 4. Mixed supported + unsupported aggregates.
+// 5. Entirely NULL input to surface current bound behavior.
+// 6. End-to-end tests on parquet files
+
+/// `MIN(a)`: able to pushdown dynamic filter
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_min_simple() {
+    // Single min(a) showcases the base case.
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        expected_after: Some("DynamicFilter [ a@0 < 1 ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// `MAX(a)`: able to pushdown dynamic filter
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_max_simple() {
+    // Single max(a) mirrors the base case on the upper bound.
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let max_expr =
+        AggregateExprBuilder::new(max_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("max_a")
+            .build()
+            .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![max_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        expected_after: Some("DynamicFilter [ a@0 > 8 ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// `MIN(a+1)`: Can't pushdown dynamic filter
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_min_expression_not_supported() {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+        col("a", &schema).unwrap(),
+        Operator::Plus,
+        Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+    ));
+    let min_expr = AggregateExprBuilder::new(min_udaf(), vec![expr])
+        .schema(Arc::clone(&schema))
+        .alias("min_a_plus_one")
+        .build()
+        .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_expr],
+        expected_before: None,
+        expected_after: None,
+        scan_support: true,
+    })
+    .await;
+}
+
+/// `MIN(a), MAX(a)`: Pushdown dynamic filter like `(a<1) or (a>8)`
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_min_max_same_column() {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+    let max_expr =
+        AggregateExprBuilder::new(max_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("max_a")
+            .build()
+            .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_expr, max_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        expected_after: Some("DynamicFilter [ a@0 < 1 OR a@0 > 8 ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// `MIN(a), MAX(b)`: Pushdown dynamic filter like `(a<1) or (b>9)`
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_min_max_different_columns() {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, true),
+        Field::new("b", DataType::Int32, true),
+    ]));
+    let batches = vec![
+        record_batch!(("a", Int32, [5, 1, 3, 8]), ("b", Int32, [7, 2, 4, 9])).unwrap(),
+    ];
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+    let max_expr =
+        AggregateExprBuilder::new(max_udaf(), vec![col("b", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("max_b")
+            .build()
+            .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_expr, max_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        expected_after: Some("DynamicFilter [ a@0 < 1 OR b@1 > 9 ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// Mix of supported/unsupported aggregates retains only the valid ones.
+/// `MIN(a), MAX(a), MAX(b), MIN(c+1)`: Pushdown dynamic filter like `(a<1) or (a>8) OR (b>12)`
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_multiple_mixed_expressions() {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, true),
+        Field::new("b", DataType::Int32, true),
+        Field::new("c", DataType::Int32, true),
+    ]));
+    let batches = vec![
+        record_batch!(
+            ("a", Int32, [5, 1, 3, 8]),
+            ("b", Int32, [10, 4, 6, 12]),
+            ("c", Int32, [100, 70, 90, 110])
+        )
+        .unwrap(),
+    ];
+
+    let min_a = AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+        .schema(Arc::clone(&schema))
+        .alias("min_a")
+        .build()
+        .unwrap();
+    let max_a = AggregateExprBuilder::new(max_udaf(), vec![col("a", &schema).unwrap()])
+        .schema(Arc::clone(&schema))
+        .alias("max_a")
+        .build()
+        .unwrap();
+    let max_b = AggregateExprBuilder::new(max_udaf(), vec![col("b", &schema).unwrap()])
+        .schema(Arc::clone(&schema))
+        .alias("max_b")
+        .build()
+        .unwrap();
+    let expr_c: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+        col("c", &schema).unwrap(),
+        Operator::Plus,
+        Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+    ));
+    let min_c_expr = AggregateExprBuilder::new(min_udaf(), vec![expr_c])
+        .schema(Arc::clone(&schema))
+        .alias("min_c_plus_one")
+        .build()
+        .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_a, max_a, max_b, min_c_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        expected_after: Some("DynamicFilter [ a@0 < 1 OR a@0 > 8 OR b@1 > 12 ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// Don't tighten the dynamic filter if all inputs are null
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_min_all_nulls() {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [None, None, None, None])).unwrap()];
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+
+    run_aggregate_dyn_filter_case(AggregateDynFilterCase {
+        schema,
+        batches,
+        aggr_exprs: vec![min_expr],
+        expected_before: Some("DynamicFilter [ empty ]"),
+        // After reading the input it hasn't a meaningful bound to update, so the
+        // predicate `true` means don't filter out anything
+        expected_after: Some("DynamicFilter [ true ]"),
+        scan_support: true,
+    })
+    .await;
+}
+
+/// Test aggregate dynamic filter is working when reading parquet files
+///
+/// Runs 'select max(id) from test_table where id > 1', and ensure some file ranges
+/// pruned by the dynamic filter.
+#[tokio::test]
+async fn test_aggregate_dynamic_filter_parquet_e2e() {
+    let config = SessionConfig::new()
+        .with_collect_statistics(true)
+        .with_target_partitions(2)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true);
+    let ctx = SessionContext::new_with_config(config);
+
+    let data_path = format!(
+        "{}/tests/data/test_statistics_per_partition/",
+        env!("CARGO_MANIFEST_DIR")
+    );
+
+    ctx.register_parquet("test_table", &data_path, ParquetReadOptions::default())
+        .await
+        .unwrap();
+
+    // partition 1:
+    //   files: ..03-01(id=4), ..03-02(id=3)
+    // partition 1:
+    //   files: ..03-03(id=2), ..03-04(id=1)
+    //
+    // In partition 1, after reading the first file, the dynamic filter will be update
+    // to "id > 4", so the `..03-02` file must be able to get pruned out
+    let df = ctx
+        .sql("explain analyze select max(id) from test_table where id > 1")
+        .await
+        .unwrap();
+
+    let result = df.collect().await.unwrap();
+
+    let formatted = pretty_format_batches(&result).unwrap();
+    let explain_analyze = format!("{formatted}");
+
+    // Capture "2" from "files_ranges_pruned_statistics=4 total → 2 matched"
+    let re = Regex::new(
+        r"files_ranges_pruned_statistics\s*=\s*(\d+)\s*total\s*[→>\-]\s*(\d+)\s*matched",
+    )
+    .unwrap();
+
+    if let Some(caps) = re.captures(&explain_analyze) {
+        let matched_num: i32 = caps[2].parse().unwrap();
+        assert!(
+            matched_num < 4,
+            "Total 4 files, if some pruned, the matched count is < 4"
+        );
+    } else {
+        unreachable!("metrics should exist")
+    }
+}
+
+/// Non-partial (Single) aggregates should skip dynamic filter initialization.
+#[test]
+fn test_aggregate_dynamic_filter_not_created_for_single_mode() {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+    let batches = vec![record_batch!(("a", Int32, [5, 1, 3, 8])).unwrap()];
+
+    let scan = TestScanBuilder::new(Arc::clone(&schema))
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+
+    let min_expr =
+        AggregateExprBuilder::new(min_udaf(), vec![col("a", &schema).unwrap()])
+            .schema(Arc::clone(&schema))
+            .alias("min_a")
+            .build()
+            .unwrap();
+
+    let plan: Arc<dyn ExecutionPlan> = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Single,
+            PhysicalGroupBy::new_single(vec![]),
+            vec![min_expr.into()],
+            vec![None],
+            scan,
+            Arc::clone(&schema),
+        )
+        .unwrap(),
+    );
+
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+
+    let optimized = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    let formatted = format_plan_for_test(&optimized);
+    assert!(
+        !formatted.contains("DynamicFilter ["),
+        "dynamic filter should not be created for AggregateMode::Single: {formatted}"
+    );
+}
+
+#[tokio::test]
+async fn test_aggregate_filter_pushdown() {
+    // Test that filters can pass through AggregateExec even with aggregate functions
+    // when the filter references grouping columns
+    // Simulates: SELECT a, COUNT(b) FROM table WHERE a = 'x' GROUP BY a
+
+    let batches = vec![
+        record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(),
+    ];
+
+    let scan = TestScanBuilder::new(schema())
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+
+    // Create an aggregate: GROUP BY a with COUNT(b)
+    let group_by = PhysicalGroupBy::new_single(vec![(
+        col("a", &schema()).unwrap(),
+        "a".to_string(),
+    )]);
+
+    // Add COUNT aggregate
+    let count_expr =
+        AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()])
+            .schema(schema())
+            .alias("count")
+            .build()
+            .unwrap();
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            vec![count_expr.into()], // Has aggregate function
+            vec![None],              // No filter on the aggregate function
+            Arc::clone(&scan),
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // Add a filter on the grouping column 'a'
+    let predicate = col_lit_predicate("a", "x", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    // Even with aggregate functions, filter on grouping column should be pushed through
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = x
+        -   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count], ordering_mode=Sorted
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = x
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_no_pushdown_filter_on_aggregate_result() {
+    // Test that filters on aggregate results (not grouping columns) are NOT pushed through
+    // SELECT a, COUNT(b) as cnt FROM table GROUP BY a HAVING cnt > 5
+    // The filter on 'cnt' cannot be pushed down because it's an aggregate result
+
+    let batches = vec![
+        record_batch!(("a", Utf8, ["x", "y"]), ("b", Utf8, ["foo", "bar"])).unwrap(),
+    ];
+
+    let scan = TestScanBuilder::new(schema())
+        .with_support(true)
+        .with_batches(batches)
+        .build();
+
+    // Create an aggregate: GROUP BY a with COUNT(b)
+    let group_by = PhysicalGroupBy::new_single(vec![(
+        col("a", &schema()).unwrap(),
+        "a".to_string(),
+    )]);
+
+    // Add COUNT aggregate
+    let count_expr =
+        AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema()).unwrap()])
+            .schema(schema())
+            .alias("count")
+            .build()
+            .unwrap();
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            vec![count_expr.into()],
+            vec![None],
+            Arc::clone(&scan),
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // Add a filter on the aggregate output column
+    // This simulates filtering on COUNT result, which should NOT be pushed through
+    let agg_schema = aggregate.schema();
+    let predicate = Arc::new(BinaryExpr::new(
+        Arc::new(Column::new_with_schema("count[count]", &agg_schema).unwrap()),
+        Operator::Gt,
+        Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
+    ));
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap())
+        as Arc<dyn ExecutionPlan>;
+
+    // The filter should NOT be pushed through the aggregate since it's on an aggregate result
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: count[count]@1 > 5
+        -   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: count[count]@1 > 5
+          -   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_filter_on_non_first_grouping_column() {
+    // Test that filters on non-first grouping columns are still pushed down
+    // SELECT a, b, count(*) as cnt FROM table GROUP BY a, b HAVING b = 'bar'
+    // The filter is on 'b' (second grouping column), should push down
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    let group_by = PhysicalGroupBy::new_single(vec![
+        (col("a", &schema()).unwrap(), "a".to_string()),
+        (col("b", &schema()).unwrap(), "b".to_string()),
+    ]);
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            scan,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([1])
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_no_pushdown_grouping_sets_filter_on_missing_column() {
+    // Test that filters on columns missing from some grouping sets are NOT pushed through
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    // Create GROUPING SETS with (a, b) and (b)
+    let group_by = PhysicalGroupBy::new(
+        vec![
+            (col("a", &schema()).unwrap(), "a".to_string()),
+            (col("b", &schema()).unwrap(), "b".to_string()),
+        ],
+        vec![
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "a".to_string(),
+            ),
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "b".to_string(),
+            ),
+        ],
+        vec![
+            vec![false, false], // (a, b) - both present
+            vec![true, false],  // (b) - a is NULL, b present
+        ],
+        true,
+    );
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            scan,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // Filter on column 'a' which is missing in the second grouping set, should not be pushed down
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - FilterExec: a@0 = foo
+          -   AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
+          -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_grouping_sets_filter_on_common_column() {
+    // Test that filters on columns present in ALL grouping sets ARE pushed through
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    // Create GROUPING SETS with (a, b) and (b)
+    let group_by = PhysicalGroupBy::new(
+        vec![
+            (col("a", &schema()).unwrap(), "a".to_string()),
+            (col("b", &schema()).unwrap(), "b".to_string()),
+        ],
+        vec![
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "a".to_string(),
+            ),
+            (
+                Arc::new(Literal::new(ScalarValue::Utf8(None))),
+                "b".to_string(),
+            ),
+        ],
+        vec![
+            vec![false, false], // (a, b) - both present
+            vec![true, false],  // (b) - a is NULL, b present
+        ],
+        true,
+    );
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            scan,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // Filter on column 'b' which is present in all grouping sets will be pushed down
+    let predicate = col_lit_predicate("b", "bar", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: b@1 = bar
+        -   AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[(a@0 as a, b@1 as b), (NULL as a, b@1 as b)], aggr=[cnt], ordering_mode=PartiallySorted([1])
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=b@1 = bar
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_with_empty_group_by() {
+    // Test that filters can be pushed down when GROUP BY is empty (no grouping columns)
+    // SELECT count(*) as cnt FROM table WHERE a = 'foo'
+    // There are no grouping columns, so the filter should still push down
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("c", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    // Empty GROUP BY - no grouping columns
+    let group_by = PhysicalGroupBy::new_single(vec![]);
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            scan,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // Filter on 'a'
+    let predicate = col_lit_predicate("a", "foo", &schema());
+    let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
+
+    // The filter should be pushed down even with empty GROUP BY
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - FilterExec: a@0 = foo
+        -   AggregateExec: mode=Final, gby=[], aggr=[cnt]
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[], aggr=[cnt]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_with_computed_grouping_key() {
+    // Test filter pushdown with computed grouping expression
+    // SELECT (c + 1.0) as c_plus_1, count(*) FROM table WHERE c > 5.0 GROUP BY (c + 1.0)
+
+    let scan = TestScanBuilder::new(schema()).with_support(true).build();
+
+    let predicate = Arc::new(BinaryExpr::new(
+        col("c", &schema()).unwrap(),
+        Operator::Gt,
+        Arc::new(Literal::new(ScalarValue::Float64(Some(5.0)))),
+    )) as Arc<dyn PhysicalExpr>;
+    let filter = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
+
+    let aggregate_expr = vec![
+        AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()])
+            .schema(schema())
+            .alias("cnt")
+            .build()
+            .map(Arc::new)
+            .unwrap(),
+    ];
+
+    let c_plus_one = Arc::new(BinaryExpr::new(
+        col("c", &schema()).unwrap(),
+        Operator::Plus,
+        Arc::new(Literal::new(ScalarValue::Float64(Some(1.0)))),
+    )) as Arc<dyn PhysicalExpr>;
+
+    let group_by =
+        PhysicalGroupBy::new_single(vec![(c_plus_one, "c_plus_1".to_string())]);
+
+    let plan = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            aggregate_expr.clone(),
+            vec![None],
+            filter,
+            schema(),
+        )
+        .unwrap(),
+    );
+
+    // The filter should be pushed down because 'c' is extracted from the grouping expression (c + 1.0)
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, FilterPushdown::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
+        -   FilterExec: c@2 > 5
+        -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - AggregateExec: mode=Final, gby=[c@2 + 1 as c_plus_1], aggr=[cnt]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=c@2 > 5
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_all_partitions_empty() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Test scenario where all build-side partitions are empty
+    // This validates the code path that sets the filter to `false` when no rows can match
+
+    // Create empty build side
+    let build_batches = vec![];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with some data
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac"]),
+            ("b", Utf8, ["ba", "bb", "bc"])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides
+    let partition_count = 4;
+
+    let build_hash_exprs = vec![
+        col("a", &build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Apply the filter pushdown optimizer
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    let optimizer = FilterPushdown::new_post_optimization();
+    let plan = optimizer.optimize(plan, config.options()).unwrap();
+
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Put some data through the plan to check that the filter is updated to reflect the TopK state
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    // Execute all partitions (required for partitioned hash join coordination)
+    let _batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=1
+    -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ false ]
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_with_nulls() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Test scenario where build side has NULL values in join keys
+    // This validates NULL handling in bounds computation and filter generation
+
+    // Create build side with NULL values
+    let build_batch = RecordBatch::try_new(
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, true),  // nullable
+            Field::new("b", DataType::Int32, true), // nullable
+        ])),
+        vec![
+            Arc::new(StringArray::from(vec![Some("aa"), None, Some("ab")])),
+            Arc::new(Int32Array::from(vec![Some(1), Some(2), None])),
+        ],
+    )
+    .unwrap();
+    let build_batches = vec![build_batch];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, true),
+        Field::new("b", DataType::Int32, true),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with nullable fields
+    let probe_batch = RecordBatch::try_new(
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Float64, false),
+        ])),
+        vec![
+            Arc::new(StringArray::from(vec![
+                Some("aa"),
+                Some("ab"),
+                Some("ac"),
+                None,
+            ])),
+            Arc::new(Int32Array::from(vec![Some(1), Some(3), Some(4), Some(5)])),
+            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])),
+        ],
+    )
+    .unwrap();
+    let probe_batches = vec![probe_batch];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, true),
+        Field::new("b", DataType::Int32, true),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec in CollectLeft mode (simpler for this test)
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            Arc::clone(&probe_scan),
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Apply the filter pushdown optimizer
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.parquet.pushdown_filters = true;
+    let optimizer = FilterPushdown::new_post_optimization();
+    let plan = optimizer.optimize(plan, config.options()).unwrap();
+
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    "
+    );
+
+    // Put some data through the plan to check that the filter is updated to reflect the TopK state
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    // Execute all partitions (required for partitioned hash join coordination)
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Test that filters are pushed down correctly to each side of the join
+    insta::assert_snapshot!(
+        format_plan_for_test(&plan),
+        @r"
+    - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b], file_type=test, pushdown_supported=true
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= 1 AND b@1 <= 2 AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:1}, {c0:,c1:2}, {c0:ab,c1:}]) ]
+    "
+    );
+
+    #[rustfmt::skip]
+    let expected = [
+        "+----+---+----+---+-----+",
+        "| a  | b | a  | b | c   |",
+        "+----+---+----+---+-----+",
+        "| aa | 1 | aa | 1 | 1.0 |",
+        "+----+---+----+---+-----+",
+    ];
+    assert_batches_eq!(&expected, &batches);
+}
+
+/// Test that when hash_join_inlist_pushdown_max_size is set to a very small value,
+/// the HashTable strategy is used instead of InList strategy, even with small build sides.
+/// This test is identical to test_hashjoin_dynamic_filter_pushdown_partitioned except
+/// for the config setting that forces the HashTable strategy.
+#[tokio::test]
+async fn test_hashjoin_hash_table_pushdown_partitioned() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Build side: DataSource -> RepartitionExec (Hash)
+    let build_hash_exprs = vec![
+        col("a", &build_side_schema).unwrap(),
+        col("b", &build_side_schema).unwrap(),
+    ];
+    let build_repartition = Arc::new(
+        RepartitionExec::try_new(
+            build_scan,
+            Partitioning::Hash(build_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Probe side: DataSource -> RepartitionExec (Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count),
+        )
+        .unwrap(),
+    );
+
+    // Create HashJoinExec with partitioned inputs
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_repartition,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // Apply the optimization with config setting that forces HashTable strategy
+    let session_config = SessionConfig::default()
+        .with_batch_size(10)
+        .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, session_config.options())
+        .unwrap();
+    let session_ctx = SessionContext::new_with_config(session_config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Verify that hash_lookup is used instead of IN (SET)
+    let plan_str = format_plan_for_test(&plan).to_string();
+    assert!(
+        plan_str.contains("hash_lookup"),
+        "Expected hash_lookup in plan but got: {plan_str}"
+    );
+    assert!(
+        !plan_str.contains("IN (SET)"),
+        "Expected no IN (SET) in plan but got: {plan_str}"
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    // Results should be identical to the InList version
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+/// Test that when hash_join_inlist_pushdown_max_size is set to a very small value,
+/// the HashTable strategy is used instead of InList strategy in CollectLeft mode.
+/// This test is identical to test_hashjoin_dynamic_filter_pushdown_collect_left except
+/// for the config setting that forces the HashTable strategy.
+#[tokio::test]
+async fn test_hashjoin_hash_table_pushdown_collect_left() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0]) // Extra column not used in join
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create RepartitionExec nodes for both sides with hash partitioning on join keys
+    let partition_count = 12;
+
+    // Probe side: DataSource -> RepartitionExec(Hash)
+    let probe_hash_exprs = vec![
+        col("a", &probe_side_schema).unwrap(),
+        col("b", &probe_side_schema).unwrap(),
+    ];
+    let probe_repartition = Arc::new(
+        RepartitionExec::try_new(
+            Arc::clone(&probe_scan),
+            Partitioning::Hash(probe_hash_exprs, partition_count), // create multi partitions on probSide
+        )
+        .unwrap(),
+    );
+
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let hash_join = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_repartition,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Top-level CoalescePartitionsExec
+    let cp = Arc::new(CoalescePartitionsExec::new(hash_join)) as Arc<dyn ExecutionPlan>;
+    // Add a sort for deterministic output
+    let plan = Arc::new(SortExec::new(
+        LexOrdering::new(vec![PhysicalSortExpr::new(
+            col("a", &probe_side_schema).unwrap(),
+            SortOptions::new(true, false), // descending, nulls_first
+        )])
+        .unwrap(),
+        cp,
+    )) as Arc<dyn ExecutionPlan>;
+
+    // Apply the optimization with config setting that forces HashTable strategy
+    let session_config = SessionConfig::default()
+        .with_batch_size(10)
+        .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, session_config.options())
+        .unwrap();
+    let session_ctx = SessionContext::new_with_config(session_config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Verify that hash_lookup is used instead of IN (SET)
+    let plan_str = format_plan_for_test(&plan).to_string();
+    assert!(
+        plan_str.contains("hash_lookup"),
+        "Expected hash_lookup in plan but got: {plan_str}"
+    );
+    assert!(
+        !plan_str.contains("IN (SET)"),
+        "Expected no IN (SET) in plan but got: {plan_str}"
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+
+    // The probe side had 4 rows, but after applying the dynamic filter only 2 rows should remain.
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    // Results should be identical to the InList version
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    +----+----+-----+----+----+-----+
+    ",
+    );
+}
+
+/// Test HashTable strategy with integer multi-column join keys.
+/// Verifies that hash_lookup works correctly with integer data types.
+#[tokio::test]
+async fn test_hashjoin_hash_table_pushdown_integer_keys() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with integer keys
+    let build_batches = vec![
+        record_batch!(
+            ("id1", Int32, [1, 2]),
+            ("id2", Int32, [10, 20]),
+            ("value", Float64, [100.0, 200.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id1", DataType::Int32, false),
+        Field::new("id2", DataType::Int32, false),
+        Field::new("value", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more integer rows
+    let probe_batches = vec![
+        record_batch!(
+            ("id1", Int32, [1, 2, 3, 4]),
+            ("id2", Int32, [10, 20, 30, 40]),
+            ("data", Utf8, ["a", "b", "c", "d"])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("id1", DataType::Int32, false),
+        Field::new("id2", DataType::Int32, false),
+        Field::new("data", DataType::Utf8, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create join on multiple integer columns
+    let on = vec![
+        (
+            col("id1", &build_side_schema).unwrap(),
+            col("id1", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("id2", &build_side_schema).unwrap(),
+            col("id2", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            Arc::clone(&probe_scan),
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    );
+
+    // Apply optimization with forced HashTable strategy
+    let session_config = SessionConfig::default()
+        .with_batch_size(10)
+        .set_usize("datafusion.optimizer.hash_join_inlist_pushdown_max_size", 1)
+        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
+        .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true);
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, session_config.options())
+        .unwrap();
+    let session_ctx = SessionContext::new_with_config(session_config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // Verify hash_lookup is used
+    let plan_str = format_plan_for_test(&plan).to_string();
+    assert!(
+        plan_str.contains("hash_lookup"),
+        "Expected hash_lookup in plan but got: {plan_str}"
+    );
+    assert!(
+        !plan_str.contains("IN (SET)"),
+        "Expected no IN (SET) in plan but got: {plan_str}"
+    );
+
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+
+    let probe_scan_metrics = probe_scan.metrics().unwrap();
+    // Only 2 rows from probe side match the build side
+    assert_eq!(probe_scan_metrics.output_rows().unwrap(), 2);
+
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +-----+-----+-------+-----+-----+------+
+    | id1 | id2 | value | id1 | id2 | data |
+    +-----+-----+-------+-----+-----+------+
+    | 1   | 10  | 100.0 | 1   | 10  | a    |
+    | 2   | 20  | 200.0 | 2   | 20  | b    |
+    +-----+-----+-------+-----+-----+------+
+    ",
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_is_used() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Test both cases: probe side with and without filter pushdown support
+    for (probe_supports_pushdown, expected_is_used) in [(false, false), (true, true)] {
+        let build_side_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Utf8, false),
+        ]));
+        let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+            .with_support(true)
+            .with_batches(vec![
+                record_batch!(("a", Utf8, ["aa", "ab"]), ("b", Utf8, ["ba", "bb"]))
+                    .unwrap(),
+            ])
+            .build();
+
+        let probe_side_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Utf8, false),
+        ]));
+        let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+            .with_support(probe_supports_pushdown)
+            .with_batches(vec![
+                record_batch!(
+                    ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+                    ("b", Utf8, ["ba", "bb", "bc", "bd"])
+                )
+                .unwrap(),
+            ])
+            .build();
+
+        let on = vec![
+            (
+                col("a", &build_side_schema).unwrap(),
+                col("a", &probe_side_schema).unwrap(),
+            ),
+            (
+                col("b", &build_side_schema).unwrap(),
+                col("b", &probe_side_schema).unwrap(),
+            ),
+        ];
+        let plan = Arc::new(
+            HashJoinExec::try_new(
+                build_scan,
+                probe_scan,
+                on,
+                None,
+                &JoinType::Inner,
+                None,
+                PartitionMode::CollectLeft,
+                datafusion_common::NullEquality::NullEqualsNothing,
+                false,
+            )
+            .unwrap(),
+        ) as Arc<dyn ExecutionPlan>;
+
+        // Apply filter pushdown optimization
+        let mut config = ConfigOptions::default();
+        config.execution.parquet.pushdown_filters = true;
+        config.optimizer.enable_dynamic_filter_pushdown = true;
+        let plan = FilterPushdown::new_post_optimization()
+            .optimize(plan, &config)
+            .unwrap();
+
+        // Get the HashJoinExec to check the dynamic filter
+        let hash_join = plan
+            .as_any()
+            .downcast_ref::<HashJoinExec>()
+            .expect("Plan should be HashJoinExec");
+
+        // Verify that a dynamic filter was created
+        let dynamic_filter = hash_join
+            .dynamic_filter_for_test()
+            .expect("Dynamic filter should be created");
+
+        // Verify that is_used() returns the expected value based on probe side support.
+        // When probe_supports_pushdown=false: no consumer holds a reference (is_used=false)
+        // When probe_supports_pushdown=true: probe side holds a reference (is_used=true)
+        assert_eq!(
+            dynamic_filter.is_used(),
+            expected_is_used,
+            "is_used() should return {expected_is_used} when probe side support is {probe_supports_pushdown}"
+        );
+    }
+}
+
+/// Regression test for https://github.com/apache/datafusion/issues/20109
+#[tokio::test]
+async fn test_filter_with_projection_pushdown() {
+    use arrow::array::{Int64Array, RecordBatch, StringArray};
+    use datafusion_physical_plan::collect;
+    use datafusion_physical_plan::filter::FilterExecBuilder;
+
+    // Create schema: [time, event, size]
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("time", DataType::Int64, false),
+        Field::new("event", DataType::Utf8, false),
+        Field::new("size", DataType::Int64, false),
+    ]));
+
+    // Create sample data
+    let timestamps = vec![100i64, 200, 300, 400, 500];
+    let events = vec!["Ingestion", "Ingestion", "Query", "Ingestion", "Query"];
+    let sizes = vec![10i64, 20, 30, 40, 50];
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int64Array::from(timestamps)),
+            Arc::new(StringArray::from(events)),
+            Arc::new(Int64Array::from(sizes)),
+        ],
+    )
+    .unwrap();
+
+    // Create data source
+    let memory_exec = datafusion_datasource::memory::MemorySourceConfig::try_new_exec(
+        &[vec![batch]],
+        schema.clone(),
+        None,
+    )
+    .unwrap();
+
+    // First FilterExec: time < 350 with projection=[event@1, size@2]
+    let time_col = col("time", &memory_exec.schema()).unwrap();
+    let time_filter = Arc::new(BinaryExpr::new(
+        time_col,
+        Operator::Lt,
+        Arc::new(Literal::new(ScalarValue::Int64(Some(350)))),
+    ));
+    let filter1 = Arc::new(
+        FilterExecBuilder::new(time_filter, memory_exec)
+            .apply_projection(Some(vec![1, 2]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // Second FilterExec: event = 'Ingestion' with projection=[size@1]
+    let event_col = col("event", &filter1.schema()).unwrap();
+    let event_filter = Arc::new(BinaryExpr::new(
+        event_col,
+        Operator::Eq,
+        Arc::new(Literal::new(ScalarValue::Utf8(Some(
+            "Ingestion".to_string(),
+        )))),
+    ));
+    let filter2 = Arc::new(
+        FilterExecBuilder::new(event_filter, filter1)
+            .apply_projection(Some(vec![1]))
+            .unwrap()
+            .build()
+            .unwrap(),
+    );
+
+    // Apply filter pushdown optimization
+    let config = ConfigOptions::default();
+    let optimized_plan = FilterPushdown::new()
+        .optimize(Arc::clone(&filter2) as Arc<dyn ExecutionPlan>, &config)
+        .unwrap();
+
+    // Execute the optimized plan - this should not error
+    let ctx = SessionContext::new();
+    let result = collect(optimized_plan, ctx.task_ctx()).await.unwrap();
+
+    // Verify results: should return rows where time < 350 AND event = 'Ingestion'
+    // That's rows with time=100,200 (both have event='Ingestion'), so sizes 10,20
+    let expected = [
+        "+------+", "| size |", "+------+", "| 10   |", "| 20   |", "+------+",
+    ];
+    assert_batches_eq!(expected, &result);
+}
+
+/// Test that ExecutionPlan::apply_expressions() can discover dynamic filters across the plan tree
+#[tokio::test]
+async fn test_discover_dynamic_filters_via_expressions_api() {
+    use datafusion_common::JoinType;
+    use datafusion_common::tree_node::TreeNodeRecursion;
+    use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    fn count_dynamic_filters(plan: &Arc<dyn ExecutionPlan>) -> usize {
+        let mut count = 0;
+
+        // Check expressions from this node using apply_expressions
+        let _ = plan.apply_expressions(&mut |expr| {
+            if let Some(_df) = expr.as_any().downcast_ref::<DynamicFilterPhysicalExpr>() {
+                count += 1;
+            }
+            Ok(TreeNodeRecursion::Continue)
+        });
+
+        // Recursively visit children
+        for child in plan.children() {
+            count += count_dynamic_filters(child);
+        }
+
+        count
+    }
+
+    // Create build side (left)
+    let build_batches =
+        vec![record_batch!(("a", Utf8, ["foo", "bar"]), ("b", Int32, [1, 2])).unwrap()];
+    let build_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let build_scan = TestScanBuilder::new(build_schema.clone())
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side (right)
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["foo", "bar", "baz", "qux"]),
+            ("c", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(probe_schema.clone())
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            probe_scan,
+            vec![(
+                col("a", &build_schema).unwrap(),
+                col("a", &probe_schema).unwrap(),
+            )],
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Before optimization: no dynamic filters
+    let count_before = count_dynamic_filters(&plan);
+    assert_eq!(
+        count_before, 0,
+        "Before optimization, should have no dynamic filters"
+    );
+
+    // Apply filter pushdown optimization (this creates dynamic filters)
+    let mut config = ConfigOptions::default();
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    config.execution.parquet.pushdown_filters = true;
+    let optimized_plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    // After optimization: should discover dynamic filters
+    // We expect 2 dynamic filters:
+    // 1. In the HashJoinExec (producer)
+    // 2. In the DataSourceExec (consumer, pushed down to the probe side)
+    let count_after = count_dynamic_filters(&optimized_plan);
+    assert_eq!(
+        count_after, 2,
+        "After optimization, should discover exactly 2 dynamic filters (1 in HashJoinExec, 1 in DataSourceExec), found {count_after}"
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_left_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values (some won't match)
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec with Left join and CollectLeft mode
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            Arc::clone(&probe_scan),
+            on,
+            None,
+            &JoinType::Left,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Expect the dynamic filter predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@0), (b@1, b@1)]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@0), (b@1, b@1)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    ",
+    );
+
+    // Actually apply the optimization and execute the plan
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    // Test that dynamic filter linking survives with_new_children
+    let children = plan.children().into_iter().map(Arc::clone).collect();
+    let plan = plan.with_new_children(children).unwrap();
+
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // After execution, verify the dynamic filter was populated with bounds and IN-list
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, a@0), (b@1, b@1)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]
+    "
+    );
+
+    // Verify result correctness: left join preserves all build (left) rows.
+    // All build rows match probe rows here, so we get 2 matched rows.
+    // The dynamic filter pruned unmatched probe rows (ac, ad) at scan time,
+    // which is safe because those probe rows would never match any build row.
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+----+----+-----+
+    | a  | b  | c   | a  | b  | e   |
+    +----+----+-----+----+----+-----+
+    | aa | ba | 1.0 | aa | ba | 1.0 |
+    | ab | bb | 2.0 | ab | bb | 2.0 |
+    +----+----+-----+----+----+-----+
+    "
+    );
+}
+
+#[tokio::test]
+async fn test_hashjoin_dynamic_filter_pushdown_left_semi_join() {
+    use datafusion_common::JoinType;
+    use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
+
+    // Create build side with limited values
+    let build_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab"]),
+            ("b", Utf8, ["ba", "bb"]),
+            ("c", Float64, [1.0, 2.0])
+        )
+        .unwrap(),
+    ];
+    let build_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("c", DataType::Float64, false),
+    ]));
+    let build_scan = TestScanBuilder::new(Arc::clone(&build_side_schema))
+        .with_support(true)
+        .with_batches(build_batches)
+        .build();
+
+    // Create probe side with more values (some won't match)
+    let probe_batches = vec![
+        record_batch!(
+            ("a", Utf8, ["aa", "ab", "ac", "ad"]),
+            ("b", Utf8, ["ba", "bb", "bc", "bd"]),
+            ("e", Float64, [1.0, 2.0, 3.0, 4.0])
+        )
+        .unwrap(),
+    ];
+    let probe_side_schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, false),
+        Field::new("b", DataType::Utf8, false),
+        Field::new("e", DataType::Float64, false),
+    ]));
+    let probe_scan = TestScanBuilder::new(Arc::clone(&probe_side_schema))
+        .with_support(true)
+        .with_batches(probe_batches)
+        .build();
+
+    // Create HashJoinExec with LeftSemi join and CollectLeft mode
+    let on = vec![
+        (
+            col("a", &build_side_schema).unwrap(),
+            col("a", &probe_side_schema).unwrap(),
+        ),
+        (
+            col("b", &build_side_schema).unwrap(),
+            col("b", &probe_side_schema).unwrap(),
+        ),
+    ];
+    let plan = Arc::new(
+        HashJoinExec::try_new(
+            build_scan,
+            Arc::clone(&probe_scan),
+            on,
+            None,
+            &JoinType::LeftSemi,
+            None,
+            PartitionMode::CollectLeft,
+            datafusion_common::NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap(),
+    ) as Arc<dyn ExecutionPlan>;
+
+    // Expect the dynamic filter predicate to be pushed down into the probe side DataSource
+    insta::assert_snapshot!(
+        OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(a@0, a@0), (b@1, b@1)]
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+        -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true
+      output:
+        Ok:
+          - HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(a@0, a@0), (b@1, b@1)]
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+          -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]
+    ",
+    );
+
+    // Actually apply the optimization and execute the plan
+    let mut config = ConfigOptions::default();
+    config.execution.parquet.pushdown_filters = true;
+    config.optimizer.enable_dynamic_filter_pushdown = true;
+    let plan = FilterPushdown::new_post_optimization()
+        .optimize(plan, &config)
+        .unwrap();
+
+    // Test that dynamic filter linking survives with_new_children
+    let children = plan.children().into_iter().map(Arc::clone).collect();
+    let plan = plan.with_new_children(children).unwrap();
+
+    let config = SessionConfig::new().with_batch_size(10);
+    let session_ctx = SessionContext::new_with_config(config);
+    session_ctx.register_object_store(
+        ObjectStoreUrl::parse("test://").unwrap().as_ref(),
+        Arc::new(InMemory::new()),
+    );
+    let state = session_ctx.state();
+    let task_ctx = state.task_ctx();
+    let batches = collect(Arc::clone(&plan), Arc::clone(&task_ctx))
+        .await
+        .unwrap();
+
+    // After execution, verify the dynamic filter was populated with bounds and IN-list
+    insta::assert_snapshot!(
+        format!("{}", format_plan_for_test(&plan)),
+        @r"
+    - HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(a@0, a@0), (b@1, b@1)]
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
+    -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb AND struct(a@0, b@1) IN (SET) ([{c0:aa,c1:ba}, {c0:ab,c1:bb}]) ]
+    "
+    );
+
+    // Verify result correctness: left semi join returns only build (left) rows
+    // that have at least one matching probe row. Output schema is build-side columns only.
+    let result = format!("{}", pretty_format_batches(&batches).unwrap());
+    insta::assert_snapshot!(
+        result,
+        @r"
+    +----+----+-----+
+    | a  | b  | c   |
+    +----+----+-----+
+    | aa | ba | 1.0 |
+    | ab | bb | 2.0 |
+    +----+----+-----+
+    "
+    );
+}
diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
deleted file mode 100644
index a28933d97bcd1..0000000000000
--- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
+++ /dev/null
@@ -1,378 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::{Arc, LazyLock};
-
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::{
-    logical_expr::Operator,
-    physical_plan::{
-        expressions::{BinaryExpr, Column, Literal},
-        PhysicalExpr,
-    },
-    scalar::ScalarValue,
-};
-use datafusion_common::config::ConfigOptions;
-use datafusion_functions_aggregate::count::count_udaf;
-use datafusion_physical_expr::expressions::col;
-use datafusion_physical_expr::{aggregate::AggregateExprBuilder, Partitioning};
-use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
-use datafusion_physical_plan::{
-    aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy},
-    coalesce_batches::CoalesceBatchesExec,
-    filter::FilterExec,
-    repartition::RepartitionExec,
-};
-
-use util::{OptimizationTest, TestNode, TestScanBuilder};
-
-mod util;
-
-#[test]
-fn test_pushdown_into_scan() {
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap());
-
-    // expect the predicate to be pushed down into the DataSource
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    "
-    );
-}
-
-/// Show that we can use config options to determine how to do pushdown.
-#[test]
-fn test_pushdown_into_scan_with_config_options() {
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, scan).unwrap()) as _;
-
-    let mut cfg = ConfigOptions::default();
-    insta::assert_snapshot!(
-        OptimizationTest::new(
-            Arc::clone(&plan),
-            FilterPushdown {},
-            false
-        ),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - FilterExec: a@0 = foo
-          -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-    "
-    );
-
-    cfg.execution.parquet.pushdown_filters = true;
-    insta::assert_snapshot!(
-        OptimizationTest::new(
-            plan,
-            FilterPushdown {},
-            true
-        ),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    "
-    );
-}
-
-#[test]
-fn test_filter_collapse() {
-    // filter should be pushed down into the parquet scan with two filters
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let predicate1 = col_lit_predicate("a", "foo", &schema());
-    let filter1 = Arc::new(FilterExec::try_new(predicate1, scan).unwrap());
-    let predicate2 = col_lit_predicate("b", "bar", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap());
-
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: b@1 = bar
-        -   FilterExec: a@0 = foo
-        -     DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
-    "
-    );
-}
-
-#[test]
-fn test_filter_with_projection() {
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let projection = vec![1, 0];
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(
-        FilterExec::try_new(predicate, Arc::clone(&scan))
-            .unwrap()
-            .with_projection(Some(projection))
-            .unwrap(),
-    );
-
-    // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo, projection=[b@1, a@0]
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - ProjectionExec: expr=[b@1 as b, a@0 as a]
-          -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    ",
-    );
-
-    // add a test where the filter is on a column that isn't included in the output
-    let projection = vec![1];
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(
-        FilterExec::try_new(predicate, scan)
-            .unwrap()
-            .with_projection(Some(projection))
-            .unwrap(),
-    );
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{},true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: a@0 = foo, projection=[b@1]
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - ProjectionExec: expr=[b@1 as b]
-          -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    "
-    );
-}
-
-#[test]
-fn test_push_down_through_transparent_nodes() {
-    // expect the predicate to be pushed down into the DataSource
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 1));
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let filter = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap());
-    let repartition = Arc::new(
-        RepartitionExec::try_new(filter, Partitioning::RoundRobinBatch(1)).unwrap(),
-    );
-    let predicate = col_lit_predicate("b", "bar", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, repartition).unwrap());
-
-    // expect the predicate to be pushed down into the DataSource
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{},true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: b@1 = bar
-        -   RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
-        -     FilterExec: a@0 = foo
-        -       CoalesceBatchesExec: target_batch_size=1
-        -         DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=1
-          -   CoalesceBatchesExec: target_batch_size=1
-          -     DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo AND b@1 = bar
-    "
-    );
-}
-
-#[test]
-fn test_no_pushdown_through_aggregates() {
-    // There are 2 important points here:
-    // 1. The outer filter **is not** pushed down at all because we haven't implemented pushdown support
-    //    yet for AggregateExec.
-    // 2. The inner filter **is** pushed down into the DataSource.
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-
-    let coalesce = Arc::new(CoalesceBatchesExec::new(scan, 10));
-
-    let filter = Arc::new(
-        FilterExec::try_new(col_lit_predicate("a", "foo", &schema()), coalesce).unwrap(),
-    );
-
-    let aggregate_expr =
-        vec![
-            AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema()).unwrap()])
-                .schema(schema())
-                .alias("cnt")
-                .build()
-                .map(Arc::new)
-                .unwrap(),
-        ];
-    let group_by = PhysicalGroupBy::new_single(vec![
-        (col("a", &schema()).unwrap(), "a".to_string()),
-        (col("b", &schema()).unwrap(), "b".to_string()),
-    ]);
-    let aggregate = Arc::new(
-        AggregateExec::try_new(
-            AggregateMode::Final,
-            group_by,
-            aggregate_expr.clone(),
-            vec![None],
-            filter,
-            schema(),
-        )
-        .unwrap(),
-    );
-
-    let coalesce = Arc::new(CoalesceBatchesExec::new(aggregate, 100));
-
-    let predicate = col_lit_predicate("b", "bar", &schema());
-    let plan = Arc::new(FilterExec::try_new(predicate, coalesce).unwrap());
-
-    // expect the predicate to be pushed down into the DataSource
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - FilterExec: b@1 = bar
-        -   CoalesceBatchesExec: target_batch_size=100
-        -     AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt], ordering_mode=PartiallySorted([0])
-        -       FilterExec: a@0 = foo
-        -         CoalesceBatchesExec: target_batch_size=10
-        -           DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - FilterExec: b@1 = bar
-          -   CoalesceBatchesExec: target_batch_size=100
-          -     AggregateExec: mode=Final, gby=[a@0 as a, b@1 as b], aggr=[cnt]
-          -       CoalesceBatchesExec: target_batch_size=10
-          -         DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    "
-    );
-}
-
-/// Test various combinations of handling of child pushdown results
-/// in an ExectionPlan in combination with support/not support in a DataSource.
-#[test]
-fn test_node_handles_child_pushdown_result() {
-    // If we set `with_support(true)` + `inject_filter = true` then the filter is pushed down to the DataSource
-    // and no FilterExec is created.
-    let scan = TestScanBuilder::new(schema()).with_support(true).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate));
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - TestInsertExec { inject_filter: true }
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
-      output:
-        Ok:
-          - TestInsertExec { inject_filter: true }
-          -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo
-    ",
-    );
-
-    // If we set `with_support(false)` + `inject_filter = true` then the filter is not pushed down to the DataSource
-    // and a FilterExec is created.
-    let scan = TestScanBuilder::new(schema()).with_support(false).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate));
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - TestInsertExec { inject_filter: true }
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
-      output:
-        Ok:
-          - TestInsertExec { inject_filter: false }
-          -   FilterExec: a@0 = foo
-          -     DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
-    ",
-    );
-
-    // If we set `with_support(false)` + `inject_filter = false` then the filter is not pushed down to the DataSource
-    // and no FilterExec is created.
-    let scan = TestScanBuilder::new(schema()).with_support(false).build();
-    let predicate = col_lit_predicate("a", "foo", &schema());
-    let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate));
-    insta::assert_snapshot!(
-        OptimizationTest::new(plan, FilterPushdown{}, true),
-        @r"
-    OptimizationTest:
-      input:
-        - TestInsertExec { inject_filter: false }
-        -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
-      output:
-        Ok:
-          - TestInsertExec { inject_filter: false }
-          -   DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false
-    ",
-    );
-}
-
-/// Schema:
-/// a: String
-/// b: String
-/// c: f64
-static TEST_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
-    let fields = vec![
-        Field::new("a", DataType::Utf8, false),
-        Field::new("b", DataType::Utf8, false),
-        Field::new("c", DataType::Float64, false),
-    ];
-    Arc::new(Schema::new(fields))
-});
-
-fn schema() -> SchemaRef {
-    Arc::clone(&TEST_SCHEMA)
-}
-
-/// Returns a predicate that is a binary expression col = lit
-fn col_lit_predicate(
-    column_name: &str,
-    scalar_value: impl Into<ScalarValue>,
-    schema: &Schema,
-) -> Arc<dyn PhysicalExpr> {
-    let scalar_value = scalar_value.into();
-    Arc::new(BinaryExpr::new(
-        Arc::new(Column::new_with_schema(column_name, schema).unwrap()),
-        Operator::Eq,
-        Arc::new(Literal::new(scalar_value)),
-    ))
-}
diff --git a/datafusion/core/tests/physical_optimizer/join_selection.rs b/datafusion/core/tests/physical_optimizer/join_selection.rs
index d8c0c142f7fb6..1c94a7bd1e91c 100644
--- a/datafusion/core/tests/physical_optimizer/join_selection.rs
+++ b/datafusion/core/tests/physical_optimizer/join_selection.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use insta::assert_snapshot;
 use std::sync::Arc;
 use std::{
     any::Any,
@@ -25,29 +26,28 @@ use std::{
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::JoinSide;
-use datafusion_common::{stats::Precision, ColumnStatistics, JoinType, ScalarValue};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{ColumnStatistics, JoinType, ScalarValue, stats::Precision};
+use datafusion_common::{JoinSide, NullEquality};
 use datafusion_common::{Result, Statistics};
 use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
 use datafusion_expr::Operator;
+use datafusion_physical_expr::PhysicalExprRef;
 use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::expressions::{BinaryExpr, Column, NegativeExpr};
 use datafusion_physical_expr::intervals::utils::check_support;
-use datafusion_physical_expr::PhysicalExprRef;
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr};
-use datafusion_physical_optimizer::join_selection::{
-    hash_join_swap_subrule, JoinSelection,
-};
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::join_selection::JoinSelection;
+use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::displayable;
 use datafusion_physical_plan::joins::utils::ColumnIndex;
 use datafusion_physical_plan::joins::utils::JoinFilter;
 use datafusion_physical_plan::joins::{HashJoinExec, NestedLoopJoinExec, PartitionMode};
 use datafusion_physical_plan::projection::ProjectionExec;
-use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::{
-    execution_plan::{Boundedness, EmissionType},
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
+    execution_plan::{Boundedness, EmissionType},
 };
 
 use futures::Stream;
@@ -222,6 +222,7 @@ async fn test_join_with_swap() {
             &JoinType::Left,
             None,
             PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap(),
@@ -237,12 +238,12 @@ async fn test_join_with_swap() {
         .expect("A proj is required to swap columns back to their original order");
 
     assert_eq!(swapping_projection.expr().len(), 2);
-    let (col, name) = &swapping_projection.expr()[0];
-    assert_eq!(name, "big_col");
-    assert_col_expr(col, "big_col", 1);
-    let (col, name) = &swapping_projection.expr()[1];
-    assert_eq!(name, "small_col");
-    assert_col_expr(col, "small_col", 0);
+    let proj_expr = &swapping_projection.expr()[0];
+    assert_eq!(proj_expr.alias, "big_col");
+    assert_col_expr(&proj_expr.expr, "big_col", 1);
+    let proj_expr = &swapping_projection.expr()[1];
+    assert_eq!(proj_expr.alias, "small_col");
+    assert_col_expr(&proj_expr.expr, "small_col", 0);
 
     let swapped_join = swapping_projection
         .input()
@@ -284,6 +285,7 @@ async fn test_left_join_no_swap() {
             &JoinType::Left,
             None,
             PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap(),
@@ -333,6 +335,7 @@ async fn test_join_with_swap_semi() {
             &join_type,
             None,
             PartitionMode::Partitioned,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap();
@@ -371,10 +374,65 @@ async fn test_join_with_swap_semi() {
     }
 }
 
+#[tokio::test]
+async fn test_join_with_swap_mark() {
+    let join_types = [JoinType::LeftMark, JoinType::RightMark];
+    for join_type in join_types {
+        let (big, small) = create_big_and_small();
+
+        let join = HashJoinExec::try_new(
+            Arc::clone(&big),
+            Arc::clone(&small),
+            vec![(
+                Arc::new(Column::new_with_schema("big_col", &big.schema()).unwrap()),
+                Arc::new(Column::new_with_schema("small_col", &small.schema()).unwrap()),
+            )],
+            None,
+            &join_type,
+            None,
+            PartitionMode::Partitioned,
+            NullEquality::NullEqualsNothing,
+            false,
+        )
+        .unwrap();
+
+        let original_schema = join.schema();
+
+        let optimized_join = JoinSelection::new()
+            .optimize(Arc::new(join), &ConfigOptions::new())
+            .unwrap();
+
+        let swapped_join = optimized_join
+            .as_any()
+            .downcast_ref::<HashJoinExec>()
+            .expect(
+                "A proj is not required to swap columns back to their original order",
+            );
+
+        assert_eq!(swapped_join.schema().fields().len(), 2);
+        assert_eq!(
+            swapped_join
+                .left()
+                .partition_statistics(None)
+                .unwrap()
+                .total_byte_size,
+            Precision::Inexact(8192)
+        );
+        assert_eq!(
+            swapped_join
+                .right()
+                .partition_statistics(None)
+                .unwrap()
+                .total_byte_size,
+            Precision::Inexact(2097152)
+        );
+        assert_eq!(original_schema, swapped_join.schema());
+    }
+}
+
 /// Compare the input plan with the plan after running the probe order optimizer.
 macro_rules! assert_optimized {
-    ($EXPECTED_LINES: expr, $PLAN: expr) => {
-        let expected_lines = $EXPECTED_LINES.iter().map(|s| *s).collect::<Vec<&str>>();
+    ($PLAN: expr, @$EXPECTED_LINES: literal $(,)?) => {
 
         let plan = Arc::new($PLAN);
         let optimized = JoinSelection::new()
@@ -382,12 +440,11 @@ macro_rules! assert_optimized {
             .unwrap();
 
         let plan_string = displayable(optimized.as_ref()).indent(true).to_string();
-        let actual_lines = plan_string.split("\n").collect::<Vec<&str>>();
+        let actual = plan_string.trim();
 
-        assert_eq!(
-            &expected_lines, &actual_lines,
-            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
-            expected_lines, actual_lines
+        assert_snapshot!(
+            actual,
+            @$EXPECTED_LINES
         );
     };
 }
@@ -408,6 +465,7 @@ async fn test_nested_join_swap() {
         &JoinType::Inner,
         None,
         PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNothing,
         false,
     )
     .unwrap();
@@ -425,6 +483,7 @@ async fn test_nested_join_swap() {
         &JoinType::Left,
         None,
         PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNothing,
         false,
     )
     .unwrap();
@@ -436,17 +495,18 @@ async fn test_nested_join_swap() {
     // The first hash join's left is 'small' table (with 1000 rows), and the second hash join's
     // left is the F(small IJ big) which has an estimated cardinality of 2000 rows (vs medium which
     // has an exact cardinality of 10_000 rows).
-    let expected = [
-            "ProjectionExec: expr=[medium_col@2 as medium_col, big_col@0 as big_col, small_col@1 as small_col]",
-            "  HashJoinExec: mode=CollectLeft, join_type=Right, on=[(small_col@1, medium_col@0)]",
-            "    ProjectionExec: expr=[big_col@1 as big_col, small_col@0 as small_col]",
-            "      HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(small_col@0, big_col@0)]",
-            "        StatisticsExec: col_count=1, row_count=Inexact(1000)",
-            "        StatisticsExec: col_count=1, row_count=Inexact(100000)",
-            "    StatisticsExec: col_count=1, row_count=Inexact(10000)",
-            "",
-        ];
-    assert_optimized!(expected, join);
+    assert_optimized!(
+        join,
+        @r"
+    ProjectionExec: expr=[medium_col@2 as medium_col, big_col@0 as big_col, small_col@1 as small_col]
+      HashJoinExec: mode=CollectLeft, join_type=Right, on=[(small_col@1, medium_col@0)]
+        ProjectionExec: expr=[big_col@1 as big_col, small_col@0 as small_col]
+          HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(small_col@0, big_col@0)]
+            StatisticsExec: col_count=1, row_count=Inexact(1000)
+            StatisticsExec: col_count=1, row_count=Inexact(100000)
+        StatisticsExec: col_count=1, row_count=Inexact(10000)
+    "
+    );
 }
 
 #[tokio::test]
@@ -464,6 +524,7 @@ async fn test_join_no_swap() {
             &JoinType::Inner,
             None,
             PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap(),
@@ -528,12 +589,12 @@ async fn test_nl_join_with_swap(join_type: JoinType) {
         .expect("A proj is required to swap columns back to their original order");
 
     assert_eq!(swapping_projection.expr().len(), 2);
-    let (col, name) = &swapping_projection.expr()[0];
-    assert_eq!(name, "big_col");
-    assert_col_expr(col, "big_col", 1);
-    let (col, name) = &swapping_projection.expr()[1];
-    assert_eq!(name, "small_col");
-    assert_col_expr(col, "small_col", 0);
+    let proj_expr = &swapping_projection.expr()[0];
+    assert_eq!(proj_expr.alias, "big_col");
+    assert_col_expr(&proj_expr.expr, "big_col", 1);
+    let proj_expr = &swapping_projection.expr()[1];
+    assert_eq!(proj_expr.alias, "small_col");
+    assert_col_expr(&proj_expr.expr, "small_col", 0);
 
     let swapped_join = swapping_projection
         .input()
@@ -578,7 +639,8 @@ async fn test_nl_join_with_swap(join_type: JoinType) {
     case::left_semi(JoinType::LeftSemi),
     case::left_anti(JoinType::LeftAnti),
     case::right_semi(JoinType::RightSemi),
-    case::right_anti(JoinType::RightAnti)
+    case::right_anti(JoinType::RightAnti),
+    case::right_mark(JoinType::RightMark)
 )]
 #[tokio::test]
 async fn test_nl_join_with_swap_no_proj(join_type: JoinType) {
@@ -690,6 +752,7 @@ async fn test_hash_join_swap_on_joins_with_projections(
         &join_type,
         Some(projection),
         PartitionMode::Partitioned,
+        NullEquality::NullEqualsNothing,
         false,
     )?);
 
@@ -700,7 +763,7 @@ async fn test_hash_join_swap_on_joins_with_projections(
             "ProjectionExec won't be added above if HashJoinExec contains embedded projection",
         );
 
-    assert_eq!(swapped_join.projection, Some(vec![0_usize]));
+    assert_eq!(swapped_join.projection.as_deref().unwrap(), &[0_usize]);
     assert_eq!(swapped.schema().fields.len(), 1);
     assert_eq!(swapped.schema().fields[0].name(), "small_col");
     Ok(())
@@ -851,6 +914,7 @@ fn check_join_partition_mode(
             &JoinType::Inner,
             None,
             PartitionMode::Auto,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap(),
@@ -895,10 +959,10 @@ impl Stream for UnboundedStream {
         mut self: Pin<&mut Self>,
         _cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
-        if let Some(val) = self.batch_produce {
-            if val <= self.count {
-                return Poll::Ready(None);
-            }
+        if let Some(val) = self.batch_produce
+            && val <= self.count
+        {
+            return Poll::Ready(None);
         }
         self.count += 1;
         Poll::Ready(Some(Ok(self.batch.clone())))
@@ -916,7 +980,7 @@ impl RecordBatchStream for UnboundedStream {
 pub struct UnboundedExec {
     batch_produce: Option<usize>,
     batch: RecordBatch,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl UnboundedExec {
@@ -932,7 +996,7 @@ impl UnboundedExec {
         Self {
             batch_produce,
             batch,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -989,7 +1053,7 @@ impl ExecutionPlan for UnboundedExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -1015,6 +1079,20 @@ impl ExecutionPlan for UnboundedExec {
             batch: self.batch.clone(),
         }))
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 #[derive(Eq, PartialEq, Debug)]
@@ -1028,20 +1106,21 @@ pub enum SourceType {
 pub struct StatisticsExec {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl StatisticsExec {
     pub fn new(stats: Statistics, schema: Schema) -> Self {
         assert_eq!(
-                stats.column_statistics.len(), schema.fields().len(),
-                "if defined, the column statistics vector length should be the number of fields"
-            );
+            stats.column_statistics.len(),
+            schema.fields().len(),
+            "if defined, the column statistics vector length should be the number of fields"
+        );
         let cache = Self::compute_properties(Arc::new(schema.clone()));
         Self {
             stats,
             schema: Arc::new(schema),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -1089,7 +1168,7 @@ impl ExecutionPlan for StatisticsExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -1112,16 +1191,26 @@ impl ExecutionPlan for StatisticsExec {
         unimplemented!("This plan only serves for testing statistics")
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.stats.clone())
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        Ok(if partition.is_some() {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        Ok(Arc::new(if partition.is_some() {
             Statistics::new_unknown(&self.schema)
         } else {
             self.stats.clone()
-        })
+        }))
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
@@ -1498,10 +1587,12 @@ async fn test_join_with_maybe_swap_unbounded_case(t: TestCase) -> Result<()> {
         &t.initial_join_type,
         None,
         t.initial_mode,
+        NullEquality::NullEqualsNothing,
         false,
     )?) as _;
 
-    let optimized_join_plan = hash_join_swap_subrule(join, &ConfigOptions::new())?;
+    let optimized_join_plan =
+        JoinSelection::new().optimize(Arc::clone(&join), &ConfigOptions::new())?;
 
     // If swap did happen
     let projection_added = optimized_join_plan.as_any().is::<ProjectionExec>();
diff --git a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
index dd2c1960a6580..b8c4d6d6f0d7a 100644
--- a/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/limit_pushdown.rs
@@ -17,28 +17,28 @@
 
 use std::sync::Arc;
 
+use crate::physical_optimizer::test_utils::{
+    coalesce_partitions_exec, global_limit_exec, hash_join_exec, local_limit_exec,
+    sort_exec, sort_preserving_merge_exec, stream_exec,
+};
+
 use arrow::compute::SortOptions;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
-use datafusion_execution::{SendableRecordBatchStream, TaskContext};
-use datafusion_expr::Operator;
-use datafusion_physical_expr::expressions::BinaryExpr;
-use datafusion_physical_expr::expressions::{col, lit};
-use datafusion_physical_expr::{Partitioning, PhysicalSortExpr};
-use datafusion_physical_optimizer::limit_pushdown::LimitPushdown;
+use datafusion_expr::{JoinType, Operator};
+use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::expressions::{BinaryExpr, col, lit};
+use datafusion_physical_expr_common::physical_expr::PhysicalExprRef;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
-use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
-use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_optimizer::limit_pushdown::LimitPushdown;
 use datafusion_physical_plan::empty::EmptyExec;
 use datafusion_physical_plan::filter::FilterExec;
-use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion_physical_plan::joins::NestedLoopJoinExec;
 use datafusion_physical_plan::projection::ProjectionExec;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::sorts::sort::SortExec;
-use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
-use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
-use datafusion_physical_plan::{get_plan_string, ExecutionPlan, ExecutionPlanProperties};
+use datafusion_physical_plan::{ExecutionPlan, get_plan_string};
 
 fn create_schema() -> SchemaRef {
     Arc::new(Schema::new(vec![
@@ -48,48 +48,6 @@ fn create_schema() -> SchemaRef {
     ]))
 }
 
-fn streaming_table_exec(schema: SchemaRef) -> Result<Arc<dyn ExecutionPlan>> {
-    Ok(Arc::new(StreamingTableExec::try_new(
-        Arc::clone(&schema),
-        vec![Arc::new(DummyStreamPartition { schema }) as _],
-        None,
-        None,
-        true,
-        None,
-    )?))
-}
-
-fn global_limit_exec(
-    input: Arc<dyn ExecutionPlan>,
-    skip: usize,
-    fetch: Option<usize>,
-) -> Arc<dyn ExecutionPlan> {
-    Arc::new(GlobalLimitExec::new(input, skip, fetch))
-}
-
-fn local_limit_exec(
-    input: Arc<dyn ExecutionPlan>,
-    fetch: usize,
-) -> Arc<dyn ExecutionPlan> {
-    Arc::new(LocalLimitExec::new(input, fetch))
-}
-
-fn sort_exec(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
-    input: Arc<dyn ExecutionPlan>,
-) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(SortExec::new(sort_exprs, input))
-}
-
-fn sort_preserving_merge_exec(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
-    input: Arc<dyn ExecutionPlan>,
-) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(SortPreservingMergeExec::new(sort_exprs, input))
-}
-
 fn projection_exec(
     schema: SchemaRef,
     input: Arc<dyn ExecutionPlan>,
@@ -118,16 +76,6 @@ fn filter_exec(
     )?))
 }
 
-fn coalesce_batches_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalesceBatchesExec::new(input, 8192))
-}
-
-fn coalesce_partitions_exec(
-    local_limit: Arc<dyn ExecutionPlan>,
-) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalescePartitionsExec::new(local_limit))
-}
-
 fn repartition_exec(
     streaming_table: Arc<dyn ExecutionPlan>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
@@ -141,168 +89,272 @@ fn empty_exec(schema: SchemaRef) -> Arc<dyn ExecutionPlan> {
     Arc::new(EmptyExec::new(schema))
 }
 
-#[derive(Debug)]
-struct DummyStreamPartition {
-    schema: SchemaRef,
+fn nested_loop_join_exec(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    join_type: JoinType,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    Ok(Arc::new(NestedLoopJoinExec::try_new(
+        left, right, None, &join_type, None,
+    )?))
 }
-impl PartitionStream for DummyStreamPartition {
-    fn schema(&self) -> &SchemaRef {
-        &self.schema
-    }
-    fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
-        unreachable!()
-    }
+
+fn format_plan(plan: &Arc<dyn ExecutionPlan>) -> String {
+    get_plan_string(plan).join("\n")
 }
 
 #[test]
 fn transforms_streaming_table_exec_into_fetching_version_when_skip_is_zero() -> Result<()>
 {
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(schema)?;
+    let streaming_table = stream_exec(&schema);
     let global_limit = global_limit_exec(streaming_table, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-            "StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=5"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @"StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=5"
+    );
 
     Ok(())
 }
 
 #[test]
-fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero(
-) -> Result<()> {
+fn transforms_streaming_table_exec_into_fetching_version_and_keeps_the_global_limit_when_skip_is_nonzero()
+-> Result<()> {
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(schema)?;
+    let streaming_table = stream_exec(&schema);
     let global_limit = global_limit_exec(streaming_table, 2, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=2, fetch=5",
-            "  StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=2, fetch=5
+      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-            "GlobalLimitExec: skip=2, fetch=5",
-            "  StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=2, fetch=5
+      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true, fetch=7
+    "
+    );
 
     Ok(())
 }
 
+fn join_on_columns(
+    left_col: &str,
+    right_col: &str,
+) -> Vec<(PhysicalExprRef, PhysicalExprRef)> {
+    vec![(
+        Arc::new(datafusion_physical_expr::expressions::Column::new(
+            left_col, 0,
+        )) as _,
+        Arc::new(datafusion_physical_expr::expressions::Column::new(
+            right_col, 0,
+        )) as _,
+    )]
+}
+
 #[test]
-fn transforms_coalesce_batches_exec_into_fetching_version_and_removes_local_limit(
-) -> Result<()> {
+fn absorbs_limit_into_hash_join_inner() -> Result<()> {
+    // HashJoinExec with Inner join should absorb limit via with_fetch
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(Arc::clone(&schema))?;
-    let repartition = repartition_exec(streaming_table)?;
-    let filter = filter_exec(schema, repartition)?;
-    let coalesce_batches = coalesce_batches_exec(filter);
-    let local_limit = local_limit_exec(coalesce_batches, 5);
-    let coalesce_partitions = coalesce_partitions_exec(local_limit);
-    let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5));
-
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  CoalescePartitionsExec",
-            "    LocalLimitExec: fetch=5",
-            "      CoalesceBatchesExec: target_batch_size=8192",
-            "        FilterExec: c3@2 > 0",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(initial, expected_initial);
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Inner)?;
+    let global_limit = global_limit_exec(hash_join, 0, Some(5));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
-
-    let expected = [
-        "CoalescePartitionsExec: fetch=5",
-        "  CoalesceBatchesExec: target_batch_size=8192, fetch=5",
-        "    FilterExec: c3@2 > 0",
-        "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-        "        StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    // The limit should be absorbed by the hash join (not pushed to children)
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)], fetch=5
+      EmptyExec
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
 
 #[test]
-fn pushes_global_limit_exec_through_projection_exec() -> Result<()> {
+fn absorbs_limit_into_hash_join_right() -> Result<()> {
+    // HashJoinExec with Right join should absorb limit via with_fetch
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(Arc::clone(&schema))?;
-    let filter = filter_exec(Arc::clone(&schema), streaming_table)?;
-    let projection = projection_exec(schema, filter)?;
-    let global_limit = global_limit_exec(projection, 0, Some(5));
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Right)?;
+    let global_limit = global_limit_exec(hash_join, 0, Some(10));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=10
+      HashJoinExec: mode=Partitioned, join_type=Right, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    // The limit should be absorbed by the hash join
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Right, on=[(c1@0, c1@0)], fetch=10
+      EmptyExec
+      EmptyExec
+    "
+    );
+
+    Ok(())
+}
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "    FilterExec: c3@2 > 0",
-            "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(initial, expected_initial);
+#[test]
+fn absorbs_limit_into_hash_join_left() -> Result<()> {
+    // during probing, then unmatched rows at the end, stopping when limit is reached
+    let schema = create_schema();
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Left)?;
+    let global_limit = global_limit_exec(hash_join, 0, Some(5));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      HashJoinExec: mode=Partitioned, join_type=Left, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    // Left join now absorbs the limit
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Left, on=[(c1@0, c1@0)], fetch=5
+      EmptyExec
+      EmptyExec
+    "
+    );
 
-    let expected = [
-            "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "  GlobalLimitExec: skip=0, fetch=5",
-            "    FilterExec: c3@2 > 0",
-            "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    Ok(())
+}
+
+#[test]
+fn absorbs_limit_with_skip_into_hash_join() -> Result<()> {
+    let schema = create_schema();
+    let left = empty_exec(Arc::clone(&schema));
+    let right = empty_exec(Arc::clone(&schema));
+    let on = join_on_columns("c1", "c1");
+    let hash_join = hash_join_exec(left, right, on, None, &JoinType::Inner)?;
+    let global_limit = global_limit_exec(hash_join, 3, Some(5));
+
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=3, fetch=5
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)]
+        EmptyExec
+        EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    // With skip, GlobalLimit is kept but fetch (skip + limit = 8) is absorbed by the join
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=3, fetch=5
+      HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c1@0)], fetch=8
+        EmptyExec
+        EmptyExec
+    "
+    );
 
     Ok(())
 }
 
 #[test]
-fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batches_exec_into_fetching_version(
-) -> Result<()> {
+fn pushes_global_limit_exec_through_projection_exec() -> Result<()> {
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(Arc::clone(&schema)).unwrap();
-    let coalesce_batches = coalesce_batches_exec(streaming_table);
-    let projection = projection_exec(schema, coalesce_batches)?;
+    let streaming_table = stream_exec(&schema);
+    let filter = filter_exec(Arc::clone(&schema), streaming_table)?;
+    let projection = projection_exec(schema, filter)?;
     let global_limit = global_limit_exec(projection, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+        FilterExec: c3@2 > 0
+          StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-            "ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "  CoalesceBatchesExec: target_batch_size=8192, fetch=5",
-            "    StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+      FilterExec: c3@2 > 0, fetch=5
+        StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     Ok(())
 }
@@ -310,45 +362,45 @@ fn pushes_global_limit_exec_through_projection_exec_and_transforms_coalesce_batc
 #[test]
 fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> {
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(Arc::clone(&schema)).unwrap();
-    let coalesce_batches = coalesce_batches_exec(streaming_table);
-    let projection = projection_exec(Arc::clone(&schema), coalesce_batches)?;
+    let streaming_table = stream_exec(&schema);
+    let projection = projection_exec(Arc::clone(&schema), streaming_table)?;
     let repartition = repartition_exec(projection)?;
-    let sort = sort_exec(
-        vec![PhysicalSortExpr {
-            expr: col("c1", &schema)?,
-            options: SortOptions::default(),
-        }],
-        repartition,
-    );
-    let spm = sort_preserving_merge_exec(sort.output_ordering().unwrap().to_vec(), sort);
+    let ordering: LexOrdering = [PhysicalSortExpr {
+        expr: col("c1", &schema)?,
+        options: SortOptions::default(),
+    }]
+    .into();
+    let sort = sort_exec(ordering.clone(), repartition);
+    let spm = sort_preserving_merge_exec(ordering, sort);
     let global_limit = global_limit_exec(spm, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  SortPreservingMergeExec: [c1@0 ASC]",
-            "    SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "          CoalesceBatchesExec: target_batch_size=8192",
-            "            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      SortPreservingMergeExec: [c1@0 ASC]
+        SortExec: expr=[c1@0 ASC], preserve_partitioning=[false]
+          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+            ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+              StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-            "SortPreservingMergeExec: [c1@0 ASC], fetch=5",
-            "  SortExec: TopK(fetch=5), expr=[c1@0 ASC], preserve_partitioning=[false]",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]",
-            "        CoalesceBatchesExec: target_batch_size=8192",
-            "          StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    SortPreservingMergeExec: [c1@0 ASC], fetch=5
+      SortExec: TopK(fetch=5), expr=[c1@0 ASC], preserve_partitioning=[false]
+        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+          ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3]
+            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     Ok(())
 }
@@ -357,32 +409,37 @@ fn pushes_global_limit_into_multiple_fetch_plans() -> Result<()> {
 fn keeps_pushed_local_limit_exec_when_there_are_multiple_input_partitions() -> Result<()>
 {
     let schema = create_schema();
-    let streaming_table = streaming_table_exec(Arc::clone(&schema))?;
+    let streaming_table = stream_exec(&schema);
     let repartition = repartition_exec(streaming_table)?;
     let filter = filter_exec(schema, repartition)?;
     let coalesce_partitions = coalesce_partitions_exec(filter);
     let global_limit = global_limit_exec(coalesce_partitions, 0, Some(5));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-            "GlobalLimitExec: skip=0, fetch=5",
-            "  CoalescePartitionsExec",
-            "    FilterExec: c3@2 > 0",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=0, fetch=5
+      CoalescePartitionsExec
+        FilterExec: c3@2 > 0
+          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+            StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = [
-            "CoalescePartitionsExec: fetch=5",
-            "  FilterExec: c3@2 > 0",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    CoalescePartitionsExec: fetch=5
+      FilterExec: c3@2 > 0, fetch=5
+        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+          StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+    "
+    );
 
     Ok(())
 }
@@ -394,20 +451,27 @@ fn merges_local_limit_with_local_limit() -> Result<()> {
     let child_local_limit = local_limit_exec(empty_exec, 10);
     let parent_local_limit = local_limit_exec(child_local_limit, 20);
 
-    let initial = get_plan_string(&parent_local_limit);
-    let expected_initial = [
-        "LocalLimitExec: fetch=20",
-        "  LocalLimitExec: fetch=10",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&parent_local_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    LocalLimitExec: fetch=20
+      LocalLimitExec: fetch=10
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(parent_local_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=0, fetch=10", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=0, fetch=10
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
@@ -419,20 +483,27 @@ fn merges_global_limit_with_global_limit() -> Result<()> {
     let child_global_limit = global_limit_exec(empty_exec, 10, Some(30));
     let parent_global_limit = global_limit_exec(child_global_limit, 10, Some(20));
 
-    let initial = get_plan_string(&parent_global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=10, fetch=20",
-        "  GlobalLimitExec: skip=10, fetch=30",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&parent_global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=10, fetch=20
+      GlobalLimitExec: skip=10, fetch=30
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(parent_global_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
@@ -444,20 +515,27 @@ fn merges_global_limit_with_local_limit() -> Result<()> {
     let local_limit = local_limit_exec(empty_exec, 40);
     let global_limit = global_limit_exec(local_limit, 20, Some(30));
 
-    let initial = get_plan_string(&global_limit);
-    let expected_initial = [
-        "GlobalLimitExec: skip=20, fetch=30",
-        "  LocalLimitExec: fetch=40",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&global_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=20, fetch=30
+      LocalLimitExec: fetch=40
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(global_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
 
     Ok(())
 }
@@ -469,20 +547,138 @@ fn merges_local_limit_with_global_limit() -> Result<()> {
     let global_limit = global_limit_exec(empty_exec, 20, Some(30));
     let local_limit = local_limit_exec(global_limit, 20);
 
-    let initial = get_plan_string(&local_limit);
-    let expected_initial = [
-        "LocalLimitExec: fetch=20",
-        "  GlobalLimitExec: skip=20, fetch=30",
-        "    EmptyExec",
-    ];
-
-    assert_eq!(initial, expected_initial);
+    let initial = format_plan(&local_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    LocalLimitExec: fetch=20
+      GlobalLimitExec: skip=20, fetch=30
+        EmptyExec
+    "
+    );
 
     let after_optimize =
         LimitPushdown::new().optimize(local_limit, &ConfigOptions::new())?;
 
-    let expected = ["GlobalLimitExec: skip=20, fetch=20", "  EmptyExec"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=20, fetch=20
+      EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn preserves_nested_global_limit() -> Result<()> {
+    // If there are multiple limits in an execution plan, they all need to be
+    // preserved in the optimized plan.
+    //
+    // Plan structure:
+    // GlobalLimitExec: skip=1, fetch=1
+    //   NestedLoopJoinExec (Left)
+    //     EmptyExec (left side)
+    //     GlobalLimitExec: skip=2, fetch=1
+    //       NestedLoopJoinExec (Right)
+    //         EmptyExec (left side)
+    //         EmptyExec (right side)
+    let schema = create_schema();
+
+    // Build inner join: NestedLoopJoin(Empty, Empty)
+    let inner_left = empty_exec(Arc::clone(&schema));
+    let inner_right = empty_exec(Arc::clone(&schema));
+    let inner_join = nested_loop_join_exec(inner_left, inner_right, JoinType::Right)?;
+
+    // Add inner limit: GlobalLimitExec: skip=2, fetch=1
+    let inner_limit = global_limit_exec(inner_join, 2, Some(1));
+
+    // Build outer join: NestedLoopJoin(Empty, GlobalLimit)
+    let outer_left = empty_exec(Arc::clone(&schema));
+    let outer_join = nested_loop_join_exec(outer_left, inner_limit, JoinType::Left)?;
+
+    // Add outer limit: GlobalLimitExec: skip=1, fetch=1
+    let outer_limit = global_limit_exec(outer_join, 1, Some(1));
+
+    let initial = format_plan(&outer_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=1, fetch=1
+      NestedLoopJoinExec: join_type=Left
+        EmptyExec
+        GlobalLimitExec: skip=2, fetch=1
+          NestedLoopJoinExec: join_type=Right
+            EmptyExec
+            EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(outer_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=1, fetch=1
+      NestedLoopJoinExec: join_type=Left
+        EmptyExec
+        GlobalLimitExec: skip=2, fetch=1
+          NestedLoopJoinExec: join_type=Right
+            EmptyExec
+            EmptyExec
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn preserves_skip_before_sort() -> Result<()> {
+    // If there's a limit with skip before a node that (1) supports fetch but
+    // (2) does not support limit pushdown, that limit should not be removed.
+    //
+    // Plan structure:
+    // GlobalLimitExec: skip=1, fetch=None
+    //   SortExec: TopK(fetch=4)
+    //     EmptyExec
+    let schema = create_schema();
+
+    let empty = empty_exec(Arc::clone(&schema));
+
+    let ordering = [PhysicalSortExpr {
+        expr: col("c1", &schema)?,
+        options: SortOptions::default(),
+    }];
+    let sort = sort_exec(ordering.into(), empty)
+        .with_fetch(Some(4))
+        .unwrap();
+
+    let outer_limit = global_limit_exec(sort, 1, None);
+
+    let initial = format_plan(&outer_limit);
+    insta::assert_snapshot!(
+        initial,
+        @r"
+    GlobalLimitExec: skip=1, fetch=None
+      SortExec: TopK(fetch=4), expr=[c1@0 ASC], preserve_partitioning=[false]
+        EmptyExec
+    "
+    );
+
+    let after_optimize =
+        LimitPushdown::new().optimize(outer_limit, &ConfigOptions::new())?;
+    let optimized = format_plan(&after_optimize);
+    insta::assert_snapshot!(
+        optimized,
+        @r"
+    GlobalLimitExec: skip=1, fetch=3
+      SortExec: TopK(fetch=4), expr=[c1@0 ASC], preserve_partitioning=[false]
+        EmptyExec
+    "
+    );
 
     Ok(())
 }
diff --git a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
index f9810eab8f594..c523b4a752a82 100644
--- a/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
+++ b/datafusion/core/tests/physical_optimizer/limited_distinct_aggregation.rs
@@ -17,11 +17,12 @@
 
 //! Integration tests for [`LimitedDistinctAggregation`] physical optimizer rule
 
+use insta::assert_snapshot;
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
-    assert_plan_matches_expected, build_group_by, mock_data, parquet_exec_with_sort,
-    schema, TestAggregate,
+    TestAggregate, build_group_by, get_optimized_plan, mock_data, parquet_exec_with_sort,
+    schema,
 };
 
 use arrow::datatypes::DataType;
@@ -30,26 +31,21 @@ use datafusion::prelude::SessionContext;
 use datafusion_common::Result;
 use datafusion_execution::config::SessionConfig;
 use datafusion_expr::Operator;
-use datafusion_physical_expr::expressions::cast;
-use datafusion_physical_expr::{expressions, expressions::col, PhysicalSortExpr};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr::expressions::{self, cast, col};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use datafusion_physical_plan::{
+    ExecutionPlan,
     aggregates::{AggregateExec, AggregateMode},
     collect,
     limit::{GlobalLimitExec, LocalLimitExec},
-    ExecutionPlan,
 };
 
-async fn assert_results_match_expected(
-    plan: Arc<dyn ExecutionPlan>,
-    expected: &str,
-) -> Result<()> {
+async fn run_plan_and_format(plan: Arc<dyn ExecutionPlan>) -> Result<String> {
     let cfg = SessionConfig::new().with_target_partitions(1);
     let ctx = SessionContext::new_with_config(cfg);
     let batches = collect(plan, ctx.task_ctx()).await?;
     let actual = format!("{}", pretty_format_batches(&batches)?);
-    assert_eq!(actual, expected);
-    Ok(())
+    Ok(actual)
 }
 
 #[tokio::test]
@@ -78,27 +74,33 @@ async fn test_partial_final() -> Result<()> {
         Arc::new(final_agg),
         4, // fetch
     );
-    // expected to push the limit to the Partial and Final AggregateExecs
-    let expected = [
-        "LocalLimitExec: fetch=4",
-        "AggregateExec: mode=Final, gby=[a@0 as a], aggr=[], lim=[4]",
-        "AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[], lim=[4]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
-    let expected = r#"
-+---+
-| a |
-+---+
-| 1 |
-| 2 |
-|   |
-| 4 |
-+---+
-"#
-    .trim();
-    assert_results_match_expected(plan, expected).await?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=4
+      AggregateExec: mode=Final, gby=[a@0 as a], aggr=[], lim=[4]
+        AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[], lim=[4]
+          DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
+    let expected = run_plan_and_format(plan).await?;
+    assert_snapshot!(
+        expected,
+        @r"
+    +---+
+    | a |
+    +---+
+    | 1 |
+    | 2 |
+    |   |
+    | 4 |
+    +---+
+    "
+    );
+
     Ok(())
 }
 
@@ -121,25 +123,31 @@ async fn test_single_local() -> Result<()> {
         4, // fetch
     );
     // expected to push the limit to the AggregateExec
-    let expected = [
-        "LocalLimitExec: fetch=4",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
-    let expected = r#"
-+---+
-| a |
-+---+
-| 1 |
-| 2 |
-|   |
-| 4 |
-+---+
-"#
-    .trim();
-    assert_results_match_expected(plan, expected).await?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=4
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
+    let expected = run_plan_and_format(plan).await?;
+    assert_snapshot!(
+        expected,
+        @r"
+    +---+
+    | a |
+    +---+
+    | 1 |
+    | 2 |
+    |   |
+    | 4 |
+    +---+
+    "
+    );
     Ok(())
 }
 
@@ -163,24 +171,30 @@ async fn test_single_global() -> Result<()> {
         Some(3), // fetch
     );
     // expected to push the skip+fetch limit to the AggregateExec
-    let expected = [
-        "GlobalLimitExec: skip=1, fetch=3",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
-    let expected = r#"
-+---+
-| a |
-+---+
-| 2 |
-|   |
-| 4 |
-+---+
-"#
-    .trim();
-    assert_results_match_expected(plan, expected).await?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    GlobalLimitExec: skip=1, fetch=3
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
+    let expected = run_plan_and_format(plan).await?;
+    assert_snapshot!(
+        expected,
+        @r"
+    +---+
+    | a |
+    +---+
+    | 2 |
+    |   |
+    | 4 |
+    +---+
+    "
+    );
     Ok(())
 }
 
@@ -211,37 +225,44 @@ async fn test_distinct_cols_different_than_group_by_cols() -> Result<()> {
         4, // fetch
     );
     // expected to push the limit to the outer AggregateExec only
-    let expected = [
-        "LocalLimitExec: fetch=4",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]",
-        "AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
-    let expected = r#"
-+---+
-| a |
-+---+
-| 1 |
-| 2 |
-|   |
-| 4 |
-+---+
-"#
-    .trim();
-    assert_results_match_expected(plan, expected).await?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=4
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], lim=[4]
+        AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b], aggr=[]
+          DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
+    let expected = run_plan_and_format(plan).await?;
+    assert_snapshot!(
+        expected,
+        @r"
+    +---+
+    | a |
+    +---+
+    | 1 |
+    | 2 |
+    |   |
+    | 4 |
+    +---+
+    "
+    );
     Ok(())
 }
 
 #[test]
 fn test_has_order_by() -> Result<()> {
-    let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-        expr: col("a", &schema()).unwrap(),
+    let schema = schema();
+    let sort_key = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
         options: SortOptions::default(),
-    }]);
-    let source = parquet_exec_with_sort(vec![sort_key]);
-    let schema = source.schema();
+    }]
+    .into();
+    let source = parquet_exec_with_sort(schema.clone(), vec![sort_key]);
 
     // `SELECT a FROM DataSourceExec WHERE a > 1 GROUP BY a LIMIT 10;`, Single AggregateExec
     // the `a > 1` filter is applied in the AggregateExec
@@ -258,13 +279,17 @@ fn test_has_order_by() -> Result<()> {
         10, // fetch
     );
     // expected not to push the limit to the AggregateExec
-    let expected = [
-        "LocalLimitExec: fetch=10",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], ordering_mode=Sorted",
-        "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=10
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[], ordering_mode=Sorted
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    "
+    );
     Ok(())
 }
 
@@ -287,13 +312,17 @@ fn test_no_group_by() -> Result<()> {
         10, // fetch
     );
     // expected not to push the limit to the AggregateExec
-    let expected = [
-        "LocalLimitExec: fetch=10",
-        "AggregateExec: mode=Single, gby=[], aggr=[]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=10
+      AggregateExec: mode=Single, gby=[], aggr=[]
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
     Ok(())
 }
 
@@ -317,13 +346,17 @@ fn test_has_aggregate_expression() -> Result<()> {
         10, // fetch
     );
     // expected not to push the limit to the AggregateExec
-    let expected = [
-        "LocalLimitExec: fetch=10",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=10
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
     Ok(())
 }
 
@@ -355,12 +388,16 @@ fn test_has_filter() -> Result<()> {
     );
     // expected not to push the limit to the AggregateExec
     // TODO(msirek): open an issue for `filter_expr` of `AggregateExec` not printing out
-    let expected = [
-        "LocalLimitExec: fetch=10",
-        "AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]",
-        "DataSourceExec: partitions=1, partition_sizes=[1]",
-    ];
     let plan: Arc<dyn ExecutionPlan> = Arc::new(limit_exec);
-    assert_plan_matches_expected(&plan, &expected)?;
+    let formatted = get_optimized_plan(&plan)?;
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=10
+      AggregateExec: mode=Single, gby=[a@0 as a], aggr=[COUNT(*)]
+        DataSourceExec: partitions=1, partition_sizes=[1]
+    "
+    );
     Ok(())
 }
diff --git a/datafusion/core/tests/physical_optimizer/mod.rs b/datafusion/core/tests/physical_optimizer/mod.rs
index 98e7b87ad2157..cf179cb727cf1 100644
--- a/datafusion/core/tests/physical_optimizer/mod.rs
+++ b/datafusion/core/tests/physical_optimizer/mod.rs
@@ -17,16 +17,25 @@
 
 //! Physical Optimizer integration tests
 
+#[expect(clippy::needless_pass_by_value)]
 mod aggregate_statistics;
 mod combine_partial_final_agg;
+#[expect(clippy::needless_pass_by_value)]
 mod enforce_distribution;
 mod enforce_sorting;
+mod enforce_sorting_monotonicity;
 mod filter_pushdown;
 mod join_selection;
+#[expect(clippy::needless_pass_by_value)]
 mod limit_pushdown;
 mod limited_distinct_aggregation;
 mod partition_statistics;
 mod projection_pushdown;
+mod pushdown_sort;
 mod replace_with_order_preserving_variants;
 mod sanity_checker;
+#[expect(clippy::needless_pass_by_value)]
 mod test_utils;
+mod window_optimize;
+
+mod pushdown_utils;
diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
index 62f04f2fe740e..42c1e84534b6d 100644
--- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
@@ -17,40 +17,52 @@
 
 #[cfg(test)]
 mod test {
+    use insta::assert_snapshot;
+    use std::sync::Arc;
+
     use arrow::array::{Int32Array, RecordBatch};
     use arrow_schema::{DataType, Field, Schema, SortOptions};
     use datafusion::datasource::listing::ListingTable;
     use datafusion::prelude::SessionContext;
     use datafusion_catalog::TableProvider;
-    use datafusion_common::stats::Precision;
     use datafusion_common::Result;
-    use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
-    use datafusion_execution::config::SessionConfig;
+    use datafusion_common::stats::Precision;
+    use datafusion_common::{
+        ColumnStatistics, JoinType, NullEquality, ScalarValue, Statistics,
+    };
     use datafusion_execution::TaskContext;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_expr::{WindowFrame, WindowFunctionDefinition};
     use datafusion_expr_common::operator::Operator;
     use datafusion_functions_aggregate::count::count_udaf;
+    use datafusion_physical_expr::Partitioning;
     use datafusion_physical_expr::aggregate::AggregateExprBuilder;
-    use datafusion_physical_expr::expressions::{binary, col, lit, Column};
+    use datafusion_physical_expr::expressions::{Column, binary, col, lit};
     use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
     use datafusion_physical_plan::aggregates::{
         AggregateExec, AggregateMode, PhysicalGroupBy,
     };
-    use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
     use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+    use datafusion_physical_plan::common::compute_record_batch_statistics;
     use datafusion_physical_plan::empty::EmptyExec;
     use datafusion_physical_plan::filter::FilterExec;
-    use datafusion_physical_plan::joins::CrossJoinExec;
+    use datafusion_physical_plan::joins::{
+        CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode,
+    };
     use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
-    use datafusion_physical_plan::projection::ProjectionExec;
+    use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
+    use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
+    use datafusion_physical_plan::repartition::RepartitionExec;
     use datafusion_physical_plan::sorts::sort::SortExec;
-    use datafusion_physical_plan::union::UnionExec;
+    use datafusion_physical_plan::union::{InterleaveExec, UnionExec};
+    use datafusion_physical_plan::windows::{WindowAggExec, create_window_expr};
     use datafusion_physical_plan::{
-        execute_stream_partitioned, get_plan_string, ExecutionPlan,
-        ExecutionPlanProperties,
+        ExecutionPlan, ExecutionPlanProperties, execute_stream_partitioned,
+        get_plan_string,
     };
+
     use futures::TryStreamExt;
-    use std::sync::Arc;
 
     /// Creates a test table with statistics from the test data directory.
     ///
@@ -60,8 +72,9 @@ mod test {
     /// - Each partition has an "id" column (INT) with the following values:
     ///   - First partition: [3, 4]
     ///   - Second partition: [1, 2]
-    /// - Each row is 110 bytes in size
+    /// - Each partition has 16 bytes total (Int32 id: 4 bytes × 2 rows + Date32 date: 4 bytes × 2 rows)
     ///
+    /// @param create_table_sql Optional parameter to set the create table SQL
     /// @param target_partition Optional parameter to set the target partitions
     /// @return ExecutionPlan representing the scan of the table with statistics
     async fn create_scan_exec_with_statistics(
@@ -104,29 +117,53 @@ mod test {
             .unwrap()
     }
 
+    // Date32 values for test data (days since 1970-01-01):
+    // 2025-03-01 = 20148
+    // 2025-03-02 = 20149
+    // 2025-03-03 = 20150
+    // 2025-03-04 = 20151
+    const DATE_2025_03_01: i32 = 20148;
+    const DATE_2025_03_02: i32 = 20149;
+    const DATE_2025_03_03: i32 = 20150;
+    const DATE_2025_03_04: i32 = 20151;
+
     /// Helper function to create expected statistics for a partition with Int32 column
+    ///
+    /// If `date_range` is provided, includes exact statistics for the partition date column.
+    /// Partition column statistics are exact because all rows in a partition share the same value.
     fn create_partition_statistics(
         num_rows: usize,
         total_byte_size: usize,
         min_value: i32,
         max_value: i32,
-        include_date_column: bool,
+        date_range: Option<(i32, i32)>,
     ) -> Statistics {
+        // Int32 is 4 bytes per row
+        let int32_byte_size = num_rows * 4;
         let mut column_stats = vec![ColumnStatistics {
             null_count: Precision::Exact(0),
             max_value: Precision::Exact(ScalarValue::Int32(Some(max_value))),
             min_value: Precision::Exact(ScalarValue::Int32(Some(min_value))),
             sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
+            byte_size: Precision::Exact(int32_byte_size),
         }];
 
-        if include_date_column {
+        if let Some((min_date, max_date)) = date_range {
+            // Partition column stats are computed from partition values:
+            // - null_count = 0 (partition values from paths are never null)
+            // - min/max are the merged partition values across files in the group
+            // - byte_size = num_rows * 4 (Date32 is 4 bytes per row)
+            // - distinct_count = Inexact(1) per partition file (single partition value per file),
+            //   preserved via max() when merging stats across partitions
+            let date32_byte_size = num_rows * 4;
             column_stats.push(ColumnStatistics {
-                null_count: Precision::Absent,
-                max_value: Precision::Absent,
-                min_value: Precision::Absent,
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Date32(Some(max_date))),
+                min_value: Precision::Exact(ScalarValue::Date32(Some(min_date))),
                 sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
+                distinct_count: Precision::Inexact(1),
+                byte_size: Precision::Exact(date32_byte_size),
             });
         }
 
@@ -206,14 +243,26 @@ mod test {
         let statistics = (0..scan.output_partitioning().partition_count())
             .map(|idx| scan.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
-        let expected_statistic_partition_1 =
-            create_partition_statistics(2, 110, 3, 4, true);
-        let expected_statistic_partition_2 =
-            create_partition_statistics(2, 110, 1, 2, true);
+        // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let expected_statistic_partition_1 = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04]
+        let expected_statistic_partition_2 = create_partition_statistics(
+            2,
+            16,
+            1,
+            2,
+            Some((DATE_2025_03_03, DATE_2025_03_04)),
+        );
         // Check the statistics of each partition
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_statistic_partition_1);
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
@@ -229,21 +278,24 @@ mod test {
     async fn test_statistics_by_partition_of_projection() -> Result<()> {
         let scan = create_scan_exec_with_statistics(None, Some(2)).await;
         // Add projection execution plan
-        let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> =
-            vec![(Arc::new(Column::new("id", 0)), "id".to_string())];
+        let exprs = vec![ProjectionExpr {
+            expr: Arc::new(Column::new("id", 0)) as Arc<dyn PhysicalExpr>,
+            alias: "id".to_string(),
+        }];
         let projection: Arc<dyn ExecutionPlan> =
             Arc::new(ProjectionExec::try_new(exprs, scan)?);
         let statistics = (0..projection.output_partitioning().partition_count())
             .map(|idx| projection.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
+        // Projection only includes id column, not the date partition column
         let expected_statistic_partition_1 =
-            create_partition_statistics(2, 8, 3, 4, false);
+            create_partition_statistics(2, 8, 3, 4, None);
         let expected_statistic_partition_2 =
-            create_partition_statistics(2, 8, 1, 2, false);
+            create_partition_statistics(2, 8, 1, 2, None);
         // Check the statistics of each partition
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_statistic_partition_1);
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
@@ -258,24 +310,25 @@ mod test {
     async fn test_statistics_by_partition_of_sort() -> Result<()> {
         let scan_1 = create_scan_exec_with_statistics(None, Some(1)).await;
         // Add sort execution plan
-        let sort = SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::new(Column::new("id", 0)),
-                options: SortOptions {
-                    descending: false,
-                    nulls_first: false,
-                },
-            }]),
-            scan_1,
-        );
-        let sort_exec: Arc<dyn ExecutionPlan> = Arc::new(sort.clone());
+        let ordering = [PhysicalSortExpr::new(
+            Arc::new(Column::new("id", 0)),
+            SortOptions::new(false, false),
+        )];
+        let sort = SortExec::new(ordering.clone().into(), scan_1);
+        let sort_exec: Arc<dyn ExecutionPlan> = Arc::new(sort);
         let statistics = (0..sort_exec.output_partitioning().partition_count())
             .map(|idx| sort_exec.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
-        let expected_statistic_partition =
-            create_partition_statistics(4, 220, 1, 4, true);
+        // All 4 files merged: ids [1-4], dates [2025-03-01, 2025-03-04]
+        let expected_statistic_partition = create_partition_statistics(
+            4,
+            32,
+            1,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_04)),
+        );
         assert_eq!(statistics.len(), 1);
-        assert_eq!(statistics[0], expected_statistic_partition);
+        assert_eq!(*statistics[0], expected_statistic_partition);
         // Check the statistics_by_partition with real results
         let expected_stats = vec![ExpectedStatistics::NonEmpty(1, 4, 4)];
         validate_statistics_with_data(sort_exec.clone(), expected_stats, 0).await?;
@@ -284,28 +337,30 @@ mod test {
         let scan_2 = create_scan_exec_with_statistics(None, Some(2)).await;
         // Add sort execution plan
         let sort_exec: Arc<dyn ExecutionPlan> = Arc::new(
-            SortExec::new(
-                LexOrdering::new(vec![PhysicalSortExpr {
-                    expr: Arc::new(Column::new("id", 0)),
-                    options: SortOptions {
-                        descending: false,
-                        nulls_first: false,
-                    },
-                }]),
-                scan_2,
-            )
-            .with_preserve_partitioning(true),
+            SortExec::new(ordering.into(), scan_2).with_preserve_partitioning(true),
+        );
+        // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let expected_statistic_partition_1 = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04]
+        let expected_statistic_partition_2 = create_partition_statistics(
+            2,
+            16,
+            1,
+            2,
+            Some((DATE_2025_03_03, DATE_2025_03_04)),
         );
-        let expected_statistic_partition_1 =
-            create_partition_statistics(2, 110, 3, 4, true);
-        let expected_statistic_partition_2 =
-            create_partition_statistics(2, 110, 1, 2, true);
         let statistics = (0..sort_exec.output_partitioning().partition_count())
             .map(|idx| sort_exec.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_statistic_partition_1);
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
@@ -329,34 +384,61 @@ mod test {
         let filter: Arc<dyn ExecutionPlan> =
             Arc::new(FilterExec::try_new(predicate, scan)?);
         let full_statistics = filter.partition_statistics(None)?;
+        // Filter preserves original total_rows and byte_size from input
+        // (4 total rows = 2 partitions * 2 rows each, byte_size = 4 * 4 = 16 bytes for int32)
         let expected_full_statistic = Statistics {
             num_rows: Precision::Inexact(0),
             total_byte_size: Precision::Inexact(0),
             column_statistics: vec![
                 ColumnStatistics {
                     null_count: Precision::Exact(0),
-                    max_value: Precision::Exact(ScalarValue::Null),
-                    min_value: Precision::Exact(ScalarValue::Null),
-                    sum_value: Precision::Exact(ScalarValue::Null),
+                    max_value: Precision::Exact(ScalarValue::Int32(None)),
+                    min_value: Precision::Exact(ScalarValue::Int32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Int32(None)),
                     distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(16),
                 },
                 ColumnStatistics {
                     null_count: Precision::Exact(0),
-                    max_value: Precision::Exact(ScalarValue::Null),
-                    min_value: Precision::Exact(ScalarValue::Null),
-                    sum_value: Precision::Exact(ScalarValue::Null),
+                    max_value: Precision::Exact(ScalarValue::Date32(None)),
+                    min_value: Precision::Exact(ScalarValue::Date32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Date32(None)),
                     distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(16), // 4 rows * 4 bytes (Date32)
                 },
             ],
         };
-        assert_eq!(full_statistics, expected_full_statistic);
+        assert_eq!(*full_statistics, expected_full_statistic);
 
         let statistics = (0..filter.output_partitioning().partition_count())
             .map(|idx| filter.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_full_statistic);
-        assert_eq!(statistics[1], expected_full_statistic);
+        // Per-partition stats: each partition has 2 rows, byte_size = 2 * 4 = 8
+        let expected_partition_statistic = Statistics {
+            num_rows: Precision::Inexact(0),
+            total_byte_size: Precision::Inexact(0),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(None)),
+                    min_value: Precision::Exact(ScalarValue::Int32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Int32(None)),
+                    distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(None)),
+                    min_value: Precision::Exact(ScalarValue::Date32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Date32(None)),
+                    distinct_count: Precision::Exact(0),
+                    byte_size: Precision::Exact(8), // 2 rows * 4 bytes (Date32)
+                },
+            ],
+        };
+        assert_eq!(*statistics[0], expected_partition_statistic);
+        assert_eq!(*statistics[1], expected_partition_statistic);
         Ok(())
     }
 
@@ -364,24 +446,36 @@ mod test {
     async fn test_statistic_by_partition_of_union() -> Result<()> {
         let scan = create_scan_exec_with_statistics(None, Some(2)).await;
         let union_exec: Arc<dyn ExecutionPlan> =
-            Arc::new(UnionExec::new(vec![scan.clone(), scan]));
+            UnionExec::try_new(vec![scan.clone(), scan])?;
         let statistics = (0..union_exec.output_partitioning().partition_count())
             .map(|idx| union_exec.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         // Check that we have 4 partitions (2 from each scan)
         assert_eq!(statistics.len(), 4);
-        let expected_statistic_partition_1 =
-            create_partition_statistics(2, 110, 3, 4, true);
-        let expected_statistic_partition_2 =
-            create_partition_statistics(2, 110, 1, 2, true);
+        // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let expected_statistic_partition_1 = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04]
+        let expected_statistic_partition_2 = create_partition_statistics(
+            2,
+            16,
+            1,
+            2,
+            Some((DATE_2025_03_03, DATE_2025_03_04)),
+        );
         // Verify first partition (from first scan)
-        assert_eq!(statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
         // Verify second partition (from first scan)
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
         // Verify third partition (from second scan - same as first partition)
-        assert_eq!(statistics[2], expected_statistic_partition_1);
+        assert_eq!(*statistics[2], expected_statistic_partition_1);
         // Verify fourth partition (from second scan - same as second partition)
-        assert_eq!(statistics[3], expected_statistic_partition_2);
+        assert_eq!(*statistics[3], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
@@ -394,6 +488,64 @@ mod test {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_statistics_by_partition_of_interleave() -> Result<()> {
+        let scan1 = create_scan_exec_with_statistics(None, Some(1)).await;
+        let scan2 = create_scan_exec_with_statistics(None, Some(1)).await;
+
+        // Create same hash partitioning on the 'id' column as InterleaveExec
+        // requires all children have a consistent hash partitioning
+        let hash_expr1 = vec![col("id", &scan1.schema())?];
+        let repartition1 = Arc::new(RepartitionExec::try_new(
+            scan1,
+            Partitioning::Hash(hash_expr1, 2),
+        )?);
+        let hash_expr2 = vec![col("id", &scan2.schema())?];
+        let repartition2 = Arc::new(RepartitionExec::try_new(
+            scan2,
+            Partitioning::Hash(hash_expr2, 2),
+        )?);
+
+        let interleave: Arc<dyn ExecutionPlan> =
+            Arc::new(InterleaveExec::try_new(vec![repartition1, repartition2])?);
+
+        // Verify the result of partition statistics
+        let stats = (0..interleave.output_partitioning().partition_count())
+            .map(|idx| interleave.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+        assert_eq!(stats.len(), 2);
+
+        // Each partition gets half of combined input, total_rows per partition = 4
+        let expected_stats = Statistics {
+            num_rows: Precision::Inexact(4),
+            total_byte_size: Precision::Inexact(32),
+            column_statistics: vec![
+                ColumnStatistics::new_unknown(),
+                ColumnStatistics::new_unknown(),
+            ],
+        };
+        assert_eq!(*stats[0], expected_stats);
+        assert_eq!(*stats[1], expected_stats);
+
+        // Verify the execution results
+        let partitions = execute_stream_partitioned(
+            interleave.clone(),
+            Arc::new(TaskContext::default()),
+        )?;
+        assert_eq!(partitions.len(), 2);
+
+        let mut partition_row_counts = Vec::new();
+        for partition_stream in partitions.into_iter() {
+            let results: Vec<RecordBatch> = partition_stream.try_collect().await?;
+            let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
+            partition_row_counts.push(total_rows);
+        }
+        assert_eq!(partition_row_counts.len(), 2);
+        assert_eq!(partition_row_counts.iter().sum::<usize>(), 8);
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_statistic_by_partition_of_cross_join() -> Result<()> {
         let left_scan = create_scan_exec_with_statistics(None, Some(1)).await;
@@ -409,30 +561,78 @@ mod test {
             .collect::<Result<Vec<_>>>()?;
         // Check that we have 2 partitions
         assert_eq!(statistics.len(), 2);
-        let mut expected_statistic_partition_1 =
-            create_partition_statistics(8, 48400, 1, 4, true);
-        expected_statistic_partition_1
-            .column_statistics
-            .push(ColumnStatistics {
-                null_count: Precision::Exact(0),
-                max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
-                min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
-                sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
-            });
-        let mut expected_statistic_partition_2 =
-            create_partition_statistics(8, 48400, 1, 4, true);
-        expected_statistic_partition_2
-            .column_statistics
-            .push(ColumnStatistics {
-                null_count: Precision::Exact(0),
-                max_value: Precision::Exact(ScalarValue::Int32(Some(2))),
-                min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
-                sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
-            });
-        assert_eq!(statistics[0], expected_statistic_partition_1);
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        // Cross join output schema: [left.id, left.date, right.id]
+        // Cross join doesn't propagate Column's byte_size
+        let expected_statistic_partition_1 = Statistics {
+            num_rows: Precision::Exact(8),
+            total_byte_size: Precision::Exact(512),
+            column_statistics: vec![
+                // column 0: left.id (Int32, file column from t1)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                // column 1: left.date (Date32, partition column from t1)
+                // Partition column statistics are exact because all rows in a partition share the same value.
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Absent,
+                },
+                // column 2: right.id (Int32, file column from t2) - right partition 0: ids [3,4]
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+        let expected_statistic_partition_2 = Statistics {
+            num_rows: Precision::Exact(8),
+            total_byte_size: Precision::Exact(512),
+            column_statistics: vec![
+                // column 0: left.id (Int32, file column from t1)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                // column 1: left.date (Date32, partition column from t1)
+                // Partition column statistics are exact because all rows in a partition share the same value.
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Absent,
+                },
+                // column 2: right.id (Int32, file column from t2) - right partition 1: ids [1,2]
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(2))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
@@ -444,28 +644,77 @@ mod test {
     }
 
     #[tokio::test]
-    async fn test_statistic_by_partition_of_coalesce_batches() -> Result<()> {
-        let scan = create_scan_exec_with_statistics(None, Some(2)).await;
-        dbg!(scan.partition_statistics(Some(0))?);
-        let coalesce_batches: Arc<dyn ExecutionPlan> =
-            Arc::new(CoalesceBatchesExec::new(scan, 2));
-        let expected_statistic_partition_1 =
-            create_partition_statistics(2, 110, 3, 4, true);
-        let expected_statistic_partition_2 =
-            create_partition_statistics(2, 110, 1, 2, true);
-        let statistics = (0..coalesce_batches.output_partitioning().partition_count())
-            .map(|idx| coalesce_batches.partition_statistics(Some(idx)))
+    async fn test_statistic_by_partition_of_nested_loop_join() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left_scan = create_scan_exec_with_statistics(None, Some(2)).await;
+        let left_scan_coalesced: Arc<dyn ExecutionPlan> =
+            Arc::new(CoalescePartitionsExec::new(left_scan));
+
+        let right_scan = create_scan_exec_with_statistics(None, Some(2)).await;
+
+        let nested_loop_join: Arc<dyn ExecutionPlan> =
+            Arc::new(NestedLoopJoinExec::try_new(
+                left_scan_coalesced,
+                right_scan,
+                None,
+                &JoinType::RightSemi,
+                None,
+            )?);
+
+        // Test partition_statistics(None) - returns overall statistics
+        // For RightSemi join, output columns come from right side only
+        let full_statistics = nested_loop_join.partition_statistics(None)?;
+        // With empty join columns, estimate_join_statistics returns Inexact row count
+        // based on the outer side (right side for RightSemi)
+        let mut expected_full_statistics = create_partition_statistics(
+            4,
+            32,
+            1,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_04)),
+        );
+        expected_full_statistics.num_rows = Precision::Inexact(4);
+        expected_full_statistics.total_byte_size = Precision::Absent;
+        assert_eq!(*full_statistics, expected_full_statistics);
+
+        // Test partition_statistics(Some(idx)) - returns partition-specific statistics
+        // Partition 1: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let mut expected_statistic_partition_1 = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        expected_statistic_partition_1.num_rows = Precision::Inexact(2);
+        expected_statistic_partition_1.total_byte_size = Precision::Absent;
+
+        // Partition 2: ids [1,2], dates [2025-03-03, 2025-03-04]
+        let mut expected_statistic_partition_2 = create_partition_statistics(
+            2,
+            16,
+            1,
+            2,
+            Some((DATE_2025_03_03, DATE_2025_03_04)),
+        );
+        expected_statistic_partition_2.num_rows = Precision::Inexact(2);
+        expected_statistic_partition_2.total_byte_size = Precision::Absent;
+
+        let statistics = (0..nested_loop_join.output_partitioning().partition_count())
+            .map(|idx| nested_loop_join.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 2);
-        assert_eq!(statistics[0], expected_statistic_partition_1);
-        assert_eq!(statistics[1], expected_statistic_partition_2);
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![
             ExpectedStatistics::NonEmpty(3, 4, 2),
             ExpectedStatistics::NonEmpty(1, 2, 2),
         ];
-        validate_statistics_with_data(coalesce_batches, expected_stats, 0).await?;
+        validate_statistics_with_data(nested_loop_join, expected_stats, 0).await?;
+
         Ok(())
     }
 
@@ -474,13 +723,19 @@ mod test {
         let scan = create_scan_exec_with_statistics(None, Some(2)).await;
         let coalesce_partitions: Arc<dyn ExecutionPlan> =
             Arc::new(CoalescePartitionsExec::new(scan));
-        let expected_statistic_partition =
-            create_partition_statistics(4, 220, 1, 4, true);
+        // All files merged: ids [1-4], dates [2025-03-01, 2025-03-04]
+        let expected_statistic_partition = create_partition_statistics(
+            4,
+            32,
+            1,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_04)),
+        );
         let statistics = (0..coalesce_partitions.output_partitioning().partition_count())
             .map(|idx| coalesce_partitions.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 1);
-        assert_eq!(statistics[0], expected_statistic_partition);
+        assert_eq!(*statistics[0], expected_statistic_partition);
 
         // Check the statistics_by_partition with real results
         let expected_stats = vec![ExpectedStatistics::NonEmpty(1, 4, 4)];
@@ -497,11 +752,20 @@ mod test {
             .map(|idx| local_limit.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 2);
-        let schema = scan.schema();
-        let mut expected_statistic_partition = Statistics::new_unknown(&schema);
-        expected_statistic_partition.num_rows = Precision::Exact(1);
-        assert_eq!(statistics[0], expected_statistic_partition);
-        assert_eq!(statistics[1], expected_statistic_partition);
+        let mut expected_0 = Statistics::clone(&statistics[0]);
+        expected_0.column_statistics = expected_0
+            .column_statistics
+            .into_iter()
+            .map(|c| c.to_inexact())
+            .collect();
+        let mut expected_1 = Statistics::clone(&statistics[1]);
+        expected_1.column_statistics = expected_1
+            .column_statistics
+            .into_iter()
+            .map(|c| c.to_inexact())
+            .collect();
+        assert_eq!(*statistics[0], expected_0);
+        assert_eq!(*statistics[1], expected_1);
         Ok(())
     }
 
@@ -515,9 +779,15 @@ mod test {
             .map(|idx| global_limit.partition_statistics(Some(idx)))
             .collect::<Result<Vec<_>>>()?;
         assert_eq!(statistics.len(), 1);
-        let expected_statistic_partition =
-            create_partition_statistics(2, 110, 3, 4, true);
-        assert_eq!(statistics[0], expected_statistic_partition);
+        // GlobalLimit takes from first partition: ids [3,4], dates [2025-03-01, 2025-03-02]
+        let expected_statistic_partition = create_partition_statistics(
+            2,
+            16,
+            3,
+            4,
+            Some((DATE_2025_03_01, DATE_2025_03_02)),
+        );
+        assert_eq!(*statistics[0], expected_statistic_partition);
         Ok(())
     }
 
@@ -541,34 +811,36 @@ mod test {
             ),
         ]);
 
-        let aggr_expr = vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1)])
-            .schema(Arc::clone(&scan_schema))
-            .alias(String::from("COUNT(c)"))
-            .build()
-            .map(Arc::new)?];
-
-        let aggregate_exec_partial = Arc::new(AggregateExec::try_new(
-            AggregateMode::Partial,
-            group_by.clone(),
-            aggr_expr.clone(),
-            vec![None],
-            Arc::clone(&scan),
-            scan_schema.clone(),
-        )?) as _;
-
-        let mut plan_string = get_plan_string(&aggregate_exec_partial);
-        let _ = plan_string.swap_remove(1);
-        let expected_plan = vec![
-            "AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]",
-            //"  DataSourceExec: file_groups={2 groups: [[.../datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, .../datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [.../datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, .../datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id, date], file_type=parquet
+        let aggr_expr = vec![
+            AggregateExprBuilder::new(count_udaf(), vec![lit(1)])
+                .schema(Arc::clone(&scan_schema))
+                .alias(String::from("COUNT(c)"))
+                .build()
+                .map(Arc::new)?,
         ];
-        assert_eq!(plan_string, expected_plan);
+
+        let aggregate_exec_partial: Arc<dyn ExecutionPlan> =
+            Arc::new(AggregateExec::try_new(
+                AggregateMode::Partial,
+                group_by.clone(),
+                aggr_expr.clone(),
+                vec![None],
+                Arc::clone(&scan),
+                scan_schema.clone(),
+            )?) as _;
+
+        let plan_string = get_plan_string(&aggregate_exec_partial).swap_remove(0);
+        assert_snapshot!(
+            plan_string,
+            @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]"
+        );
 
         let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?;
 
+        // Aggregate doesn't propagate num_rows and ColumnStatistics byte_size from input
         let expected_p0_statistics = Statistics {
             num_rows: Precision::Inexact(2),
-            total_byte_size: Precision::Absent,
+            total_byte_size: Precision::Inexact(16),
             column_statistics: vec![
                 ColumnStatistics {
                     null_count: Precision::Absent,
@@ -576,17 +848,18 @@ mod test {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
                     sum_value: Precision::Absent,
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics::new_unknown(),
                 ColumnStatistics::new_unknown(),
             ],
         };
 
-        assert_eq!(&p0_statistics, &expected_p0_statistics);
+        assert_eq!(*p0_statistics, expected_p0_statistics);
 
         let expected_p1_statistics = Statistics {
             num_rows: Precision::Inexact(2),
-            total_byte_size: Precision::Absent,
+            total_byte_size: Precision::Inexact(16),
             column_statistics: vec![
                 ColumnStatistics {
                     null_count: Precision::Absent,
@@ -594,6 +867,7 @@ mod test {
                     min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
                     sum_value: Precision::Absent,
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics::new_unknown(),
                 ColumnStatistics::new_unknown(),
@@ -601,7 +875,7 @@ mod test {
         };
 
         let p1_statistics = aggregate_exec_partial.partition_statistics(Some(1))?;
-        assert_eq!(&p1_statistics, &expected_p1_statistics);
+        assert_eq!(*p1_statistics, expected_p1_statistics);
 
         validate_statistics_with_data(
             aggregate_exec_partial.clone(),
@@ -623,10 +897,10 @@ mod test {
         )?);
 
         let p0_statistics = agg_final.partition_statistics(Some(0))?;
-        assert_eq!(&p0_statistics, &expected_p0_statistics);
+        assert_eq!(*p0_statistics, expected_p0_statistics);
 
         let p1_statistics = agg_final.partition_statistics(Some(1))?;
-        assert_eq!(&p1_statistics, &expected_p1_statistics);
+        assert_eq!(*p1_statistics, expected_p1_statistics);
 
         validate_statistics_with_data(
             agg_final.clone(),
@@ -652,7 +926,10 @@ mod test {
         )?) as _;
 
         let agg_plan = get_plan_string(&agg_partial).remove(0);
-        assert_eq!("AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]",agg_plan);
+        assert_snapshot!(
+            agg_plan,
+            @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]"
+        );
 
         let empty_stat = Statistics {
             num_rows: Precision::Exact(0),
@@ -664,8 +941,8 @@ mod test {
             ],
         };
 
-        assert_eq!(&empty_stat, &agg_partial.partition_statistics(Some(0))?);
-        assert_eq!(&empty_stat, &agg_partial.partition_statistics(Some(1))?);
+        assert_eq!(empty_stat, *agg_partial.partition_statistics(Some(0))?);
+        assert_eq!(empty_stat, *agg_partial.partition_statistics(Some(1))?);
         validate_statistics_with_data(
             agg_partial.clone(),
             vec![ExpectedStatistics::Empty, ExpectedStatistics::Empty],
@@ -691,8 +968,8 @@ mod test {
             agg_partial.schema(),
         )?);
 
-        assert_eq!(&empty_stat, &agg_final.partition_statistics(Some(0))?);
-        assert_eq!(&empty_stat, &agg_final.partition_statistics(Some(1))?);
+        assert_eq!(empty_stat, *agg_final.partition_statistics(Some(0))?);
+        assert_eq!(empty_stat, *agg_final.partition_statistics(Some(1))?);
 
         validate_statistics_with_data(
             agg_final,
@@ -728,7 +1005,7 @@ mod test {
             column_statistics: vec![ColumnStatistics::new_unknown()],
         };
 
-        assert_eq!(&expect_stat, &agg_final.partition_statistics(Some(0))?);
+        assert_eq!(expect_stat, *agg_final.partition_statistics(Some(0))?);
 
         // Verify that the aggregate final result has exactly one partition with one row
         let mut partitions = execute_stream_partitioned(
@@ -741,4 +1018,594 @@ mod test {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_placeholder_rows() -> Result<()> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let plan = Arc::new(PlaceholderRowExec::new(schema).with_partitions(2))
+            as Arc<dyn ExecutionPlan>;
+        let schema = plan.schema();
+
+        let ctx = TaskContext::default();
+        let partitions = execute_stream_partitioned(Arc::clone(&plan), Arc::new(ctx))?;
+
+        let mut all_batches = vec![];
+        for (i, partition_stream) in partitions.into_iter().enumerate() {
+            let batches: Vec<RecordBatch> = partition_stream.try_collect().await?;
+            let actual = plan.partition_statistics(Some(i))?;
+            let expected = compute_record_batch_statistics(
+                std::slice::from_ref(&batches),
+                &schema,
+                None,
+            );
+            assert_eq!(*actual, expected);
+            all_batches.push(batches);
+        }
+
+        let actual = plan.partition_statistics(None)?;
+        let expected = compute_record_batch_statistics(&all_batches, &schema, None);
+        assert_eq!(*actual, expected);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_repartition() -> Result<()> {
+        let scan = create_scan_exec_with_statistics(None, Some(2)).await;
+
+        let repartition = Arc::new(RepartitionExec::try_new(
+            scan.clone(),
+            Partitioning::RoundRobinBatch(3),
+        )?);
+
+        let statistics = (0..repartition.partitioning().partition_count())
+            .map(|idx| repartition.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+        assert_eq!(statistics.len(), 3);
+
+        // Repartition preserves original total_rows from input (4 rows total)
+        let expected_stats = Statistics {
+            num_rows: Precision::Inexact(1),
+            total_byte_size: Precision::Inexact(10),
+            column_statistics: vec![
+                ColumnStatistics::new_unknown(),
+                ColumnStatistics::new_unknown(),
+            ],
+        };
+
+        // All partitions should have the same statistics
+        for stat in statistics.iter() {
+            assert_eq!(**stat, expected_stats);
+        }
+
+        // Verify that the result has exactly 3 partitions
+        let partitions = execute_stream_partitioned(
+            repartition.clone(),
+            Arc::new(TaskContext::default()),
+        )?;
+        assert_eq!(partitions.len(), 3);
+
+        // Collect row counts from each partition
+        let mut partition_row_counts = Vec::new();
+        for partition_stream in partitions.into_iter() {
+            let results: Vec<RecordBatch> = partition_stream.try_collect().await?;
+            let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
+            partition_row_counts.push(total_rows);
+        }
+        assert_eq!(partition_row_counts.len(), 3);
+        assert_eq!(partition_row_counts[0], 1);
+        assert_eq!(partition_row_counts[1], 2);
+        assert_eq!(partition_row_counts[2], 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_repartition_invalid_partition() -> Result<()>
+    {
+        let scan = create_scan_exec_with_statistics(None, Some(2)).await;
+
+        let repartition = Arc::new(RepartitionExec::try_new(
+            scan.clone(),
+            Partitioning::RoundRobinBatch(2),
+        )?);
+
+        let result = repartition.partition_statistics(Some(2));
+        assert!(result.is_err());
+        let error = result.unwrap_err();
+        assert!(
+            error
+                .to_string()
+                .contains("RepartitionExec invalid partition 2 (expected less than 2)")
+        );
+
+        let partitions = execute_stream_partitioned(
+            repartition.clone(),
+            Arc::new(TaskContext::default()),
+        )?;
+        assert_eq!(partitions.len(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_repartition_zero_partitions() -> Result<()> {
+        let scan = create_scan_exec_with_statistics(None, Some(2)).await;
+        let scan_schema = scan.schema();
+
+        // Create a repartition with 0 partitions
+        let repartition = Arc::new(RepartitionExec::try_new(
+            Arc::new(EmptyExec::new(scan_schema.clone())),
+            Partitioning::RoundRobinBatch(0),
+        )?);
+
+        let result = repartition.partition_statistics(Some(0))?;
+        assert_eq!(*result, Statistics::new_unknown(&scan_schema));
+
+        // Verify that the result has exactly 0 partitions
+        let partitions = execute_stream_partitioned(
+            repartition.clone(),
+            Arc::new(TaskContext::default()),
+        )?;
+        assert_eq!(partitions.len(), 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_repartition_hash_partitioning() -> Result<()>
+    {
+        let scan = create_scan_exec_with_statistics(None, Some(1)).await;
+
+        // Create hash partitioning on the 'id' column
+        let hash_expr = vec![col("id", &scan.schema())?];
+        let repartition = Arc::new(RepartitionExec::try_new(
+            scan,
+            Partitioning::Hash(hash_expr, 2),
+        )?);
+
+        // Verify the result of partition statistics of repartition
+        let stats = (0..repartition.partitioning().partition_count())
+            .map(|idx| repartition.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+        assert_eq!(stats.len(), 2);
+
+        // Repartition preserves original total_rows from input (4 rows total)
+        let expected_stats = Statistics {
+            num_rows: Precision::Inexact(2),
+            total_byte_size: Precision::Inexact(16),
+            column_statistics: vec![
+                ColumnStatistics::new_unknown(),
+                ColumnStatistics::new_unknown(),
+            ],
+        };
+        assert_eq!(*stats[0], expected_stats);
+        assert_eq!(*stats[1], expected_stats);
+
+        // Verify the repartition execution results
+        let partitions =
+            execute_stream_partitioned(repartition, Arc::new(TaskContext::default()))?;
+        assert_eq!(partitions.len(), 2);
+
+        let mut partition_row_counts = Vec::new();
+        for partition_stream in partitions.into_iter() {
+            let results: Vec<RecordBatch> = partition_stream.try_collect().await?;
+            let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
+            partition_row_counts.push(total_rows);
+        }
+        assert_eq!(partition_row_counts.len(), 2);
+        assert_eq!(partition_row_counts.iter().sum::<usize>(), 4);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistic_by_partition_of_window_agg() -> Result<()> {
+        let scan = create_scan_exec_with_statistics(None, Some(2)).await;
+
+        let window_expr = create_window_expr(
+            &WindowFunctionDefinition::AggregateUDF(count_udaf()),
+            "count".to_owned(),
+            &[col("id", &scan.schema())?],
+            &[], // no partition by
+            &[PhysicalSortExpr::new(
+                col("id", &scan.schema())?,
+                SortOptions::default(),
+            )],
+            Arc::new(WindowFrame::new(Some(false))),
+            scan.schema(),
+            false,
+            false,
+            None,
+        )?;
+
+        let window_agg: Arc<dyn ExecutionPlan> =
+            Arc::new(WindowAggExec::try_new(vec![window_expr], scan, true)?);
+
+        // Verify partition statistics are properly propagated (not unknown)
+        let statistics = (0..window_agg.output_partitioning().partition_count())
+            .map(|idx| window_agg.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        assert_eq!(statistics.len(), 2);
+
+        // Window functions preserve input row counts and column statistics
+        // but add unknown statistics for the new window column
+        let expected_statistic_partition_1 = Statistics {
+            num_rows: Precision::Exact(2),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_02,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics::new_unknown(), // window column
+            ],
+        };
+
+        let expected_statistic_partition_2 = Statistics {
+            num_rows: Precision::Exact(2),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(2))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_04,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_03,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(8),
+                },
+                ColumnStatistics::new_unknown(), // window column
+            ],
+        };
+
+        assert_eq!(*statistics[0], expected_statistic_partition_1);
+        assert_eq!(*statistics[1], expected_statistic_partition_2);
+
+        // Verify the statistics match actual execution results
+        let expected_stats = vec![
+            ExpectedStatistics::NonEmpty(3, 4, 2),
+            ExpectedStatistics::NonEmpty(1, 2, 2),
+        ];
+        validate_statistics_with_data(window_agg, expected_stats, 0).await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_statistics_by_partition_of_empty_exec() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]));
+
+        // Try to test with single partition
+        let empty_single = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        let stats = empty_single.partition_statistics(Some(0))?;
+        assert_eq!(stats.num_rows, Precision::Exact(0));
+        assert_eq!(stats.total_byte_size, Precision::Exact(0));
+        assert_eq!(stats.column_statistics.len(), 2);
+
+        for col_stat in &stats.column_statistics {
+            assert_eq!(col_stat.null_count, Precision::Exact(0));
+            assert_eq!(col_stat.distinct_count, Precision::Exact(0));
+            assert_eq!(col_stat.byte_size, Precision::Exact(0));
+            assert_eq!(col_stat.min_value, Precision::<ScalarValue>::Absent);
+            assert_eq!(col_stat.max_value, Precision::<ScalarValue>::Absent);
+            assert_eq!(col_stat.sum_value, Precision::<ScalarValue>::Absent);
+            assert_eq!(col_stat.byte_size, Precision::Exact(0));
+        }
+
+        let overall_stats = empty_single.partition_statistics(None)?;
+        assert_eq!(stats, overall_stats);
+
+        validate_statistics_with_data(empty_single, vec![ExpectedStatistics::Empty], 0)
+            .await?;
+
+        // Test with multiple partitions
+        let empty_multi: Arc<dyn ExecutionPlan> =
+            Arc::new(EmptyExec::new(Arc::clone(&schema)).with_partitions(3));
+
+        let statistics = (0..empty_multi.output_partitioning().partition_count())
+            .map(|idx| empty_multi.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        assert_eq!(statistics.len(), 3);
+
+        for stat in &statistics {
+            assert_eq!(stat.num_rows, Precision::Exact(0));
+            assert_eq!(stat.total_byte_size, Precision::Exact(0));
+        }
+
+        validate_statistics_with_data(
+            empty_multi,
+            vec![
+                ExpectedStatistics::Empty,
+                ExpectedStatistics::Empty,
+                ExpectedStatistics::Empty,
+            ],
+            0,
+        )
+        .await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_hash_join_partition_statistics() -> Result<()> {
+        // Create left table scan and coalesce to 1 partition for CollectLeft mode
+        let left_scan = create_scan_exec_with_statistics(None, Some(2)).await;
+        let left_scan_coalesced = Arc::new(CoalescePartitionsExec::new(left_scan.clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        // Create right table scan with different table name
+        let right_create_table_sql = "CREATE EXTERNAL TABLE t2 (id INT NOT NULL, date DATE) \
+                                     STORED AS PARQUET LOCATION './tests/data/test_statistics_per_partition'\
+                                     PARTITIONED BY (date) \
+                                     WITH ORDER (id ASC);";
+        let right_scan =
+            create_scan_exec_with_statistics(Some(right_create_table_sql), Some(2)).await;
+
+        // Create join condition: t1.id = t2.id
+        let on = vec![(
+            Arc::new(Column::new("id", 0)) as Arc<dyn PhysicalExpr>,
+            Arc::new(Column::new("id", 0)) as Arc<dyn PhysicalExpr>,
+        )];
+
+        // Test CollectLeft mode - left child must have 1 partition
+        let collect_left_join = Arc::new(HashJoinExec::try_new(
+            left_scan_coalesced,
+            Arc::clone(&right_scan),
+            on.clone(),
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?) as Arc<dyn ExecutionPlan>;
+
+        // Test partition statistics for CollectLeft mode
+        let statistics = (0..collect_left_join.output_partitioning().partition_count())
+            .map(|idx| collect_left_join.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check that we have the expected number of partitions
+        assert_eq!(statistics.len(), 2);
+
+        // For collect left mode, the min/max values are from the entire left table and the specific partition of the right table.
+        let expected_p0_statistics = Statistics {
+            num_rows: Precision::Inexact(2),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                // Left id column: all partitions (id 1..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(16),
+                },
+                // Left date column: all partitions (2025-03-01..2025-03-04)
+                // NDV is Inexact(1) because each Hive partition has exactly 1 distinct date value,
+                // and merging takes max as a conservative lower bound
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_04,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(16),
+                },
+                // Right id column: partition 0 only (id 3..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8),
+                },
+                // Right date column: partition 0 only (2025-03-01..2025-03-02)
+                // NDV is Inexact(1) from the single Hive partition's date value
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_02,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(8),
+                },
+            ],
+        };
+        assert_eq!(*statistics[0], expected_p0_statistics);
+
+        // Test Partitioned mode
+        let partitioned_join = Arc::new(HashJoinExec::try_new(
+            Arc::clone(&left_scan),
+            Arc::clone(&right_scan),
+            on.clone(),
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Partitioned,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?) as Arc<dyn ExecutionPlan>;
+
+        // Test partition statistics for Partitioned mode
+        let statistics = (0..partitioned_join.output_partitioning().partition_count())
+            .map(|idx| partitioned_join.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check that we have the expected number of partitions
+        assert_eq!(statistics.len(), 2);
+
+        // For partitioned mode, the min/max values are from the specific partition for each side.
+        let expected_p0_statistics = Statistics {
+            num_rows: Precision::Inexact(2),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                // Left id column: partition 0 only (id 3..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8),
+                },
+                // Left date column: partition 0 only (2025-03-01..2025-03-02)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_02,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(8),
+                },
+                // Right id column: partition 0 only (id 3..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(8),
+                },
+                // Right date column: partition 0 only (2025-03-01..2025-03-02)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_02,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(8),
+                },
+            ],
+        };
+        assert_eq!(*statistics[0], expected_p0_statistics);
+
+        // Test Auto mode - should fall back to getting all partition statistics
+        let auto_join = Arc::new(HashJoinExec::try_new(
+            Arc::clone(&left_scan),
+            Arc::clone(&right_scan),
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::Auto,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?) as Arc<dyn ExecutionPlan>;
+
+        // Test partition statistics for Auto mode
+        let statistics = (0..auto_join.output_partitioning().partition_count())
+            .map(|idx| auto_join.partition_statistics(Some(idx)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check that we have the expected number of partitions
+        assert_eq!(statistics.len(), 2);
+
+        // For auto mode, the min/max values are from the entire left and right tables.
+        let expected_p0_statistics = Statistics {
+            num_rows: Precision::Inexact(4),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                // Left id column: all partitions (id 1..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(16),
+                },
+                // Left date column: all partitions (2025-03-01..2025-03-04)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_04,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(16),
+                },
+                // Right id column: all partitions (id 1..4)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Exact(16),
+                },
+                // Right date column: all partitions (2025-03-01..2025-03-04)
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_04,
+                    ))),
+                    min_value: Precision::Exact(ScalarValue::Date32(Some(
+                        DATE_2025_03_01,
+                    ))),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Inexact(1),
+                    byte_size: Precision::Exact(16),
+                },
+            ],
+        };
+        assert_eq!(*statistics[0], expected_p0_statistics);
+        Ok(())
+    }
 }
diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
index 7c00d323a8e69..00e016ae02cad 100644
--- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
@@ -24,47 +24,48 @@ use datafusion::datasource::listing::PartitionedFile;
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::physical_plan::CsvSource;
 use datafusion::datasource::source::DataSourceExec;
-use datafusion_common::config::ConfigOptions;
-use datafusion_common::Result;
-use datafusion_common::{JoinSide, JoinType, ScalarValue};
+use datafusion_common::config::{ConfigOptions, CsvOptions};
+use datafusion_common::{JoinSide, JoinType, NullEquality, Result, ScalarValue};
+use datafusion_datasource::TableSchema;
+use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::{
     Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
 };
+use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_physical_expr::expressions::{
-    binary, cast, col, BinaryExpr, CaseExpr, CastExpr, Column, Literal, NegativeExpr,
+    BinaryExpr, CaseExpr, CastExpr, Column, Literal, NegativeExpr, binary, cast, col,
 };
-use datafusion_physical_expr::ScalarFunctionExpr;
-use datafusion_physical_expr::{
-    Distribution, Partitioning, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement,
+use datafusion_physical_expr::{Distribution, Partitioning, ScalarFunctionExpr};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{
+    OrderingRequirements, PhysicalSortExpr, PhysicalSortRequirement,
 };
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_optimizer::output_requirements::OutputRequirementExec;
 use datafusion_physical_optimizer::projection_pushdown::ProjectionPushdown;
-use datafusion_physical_optimizer::PhysicalOptimizerRule;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_plan::coop::CooperativeExec;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter};
 use datafusion_physical_plan::joins::{
     HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode,
     SymmetricHashJoinExec,
 };
-use datafusion_physical_plan::projection::{update_expr, ProjectionExec};
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr, update_expr};
 use datafusion_physical_plan::repartition::RepartitionExec;
 use datafusion_physical_plan::sorts::sort::SortExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
-use datafusion_physical_plan::streaming::PartitionStream;
-use datafusion_physical_plan::streaming::StreamingTableExec;
+use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
 use datafusion_physical_plan::union::UnionExec;
-use datafusion_physical_plan::{get_plan_string, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, displayable};
 
-use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
-use datafusion_expr_common::columnar_value::ColumnarValue;
+use insta::assert_snapshot;
 use itertools::Itertools;
 
 /// Mocked UDF
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct DummyUDF {
     signature: Signature,
 }
@@ -129,6 +130,7 @@ fn test_update_matching_exprs() -> Result<()> {
                 )),
             ],
             Field::new("f", DataType::Int32, true).into(),
+            Arc::new(ConfigOptions::default()),
         )),
         Arc::new(CaseExpr::try_new(
             Some(Arc::new(Column::new("d", 2))),
@@ -194,6 +196,7 @@ fn test_update_matching_exprs() -> Result<()> {
                 )),
             ],
             Field::new("f", DataType::Int32, true).into(),
+            Arc::new(ConfigOptions::default()),
         )),
         Arc::new(CaseExpr::try_new(
             Some(Arc::new(Column::new("d", 3))),
@@ -223,10 +226,16 @@ fn test_update_matching_exprs() -> Result<()> {
         )?),
     ];
 
+    let child_exprs: Vec<ProjectionExpr> = child
+        .iter()
+        .map(|(expr, alias)| ProjectionExpr::new(expr.clone(), alias.clone()))
+        .collect();
     for (expr, expected_expr) in exprs.into_iter().zip(expected_exprs.into_iter()) {
-        assert!(update_expr(&expr, &child, true)?
-            .unwrap()
-            .eq(&expected_expr));
+        assert!(
+            update_expr(&expr, &child_exprs, true)?
+                .unwrap()
+                .eq(&expected_expr)
+        );
     }
 
     Ok(())
@@ -262,6 +271,7 @@ fn test_update_projected_exprs() -> Result<()> {
                 )),
             ],
             Field::new("f", DataType::Int32, true).into(),
+            Arc::new(ConfigOptions::default()),
         )),
         Arc::new(CaseExpr::try_new(
             Some(Arc::new(Column::new("d", 2))),
@@ -327,6 +337,7 @@ fn test_update_projected_exprs() -> Result<()> {
                 )),
             ],
             Field::new("f", DataType::Int32, true).into(),
+            Arc::new(ConfigOptions::default()),
         )),
         Arc::new(CaseExpr::try_new(
             Some(Arc::new(Column::new("d_new", 3))),
@@ -356,10 +367,16 @@ fn test_update_projected_exprs() -> Result<()> {
         )?),
     ];
 
+    let proj_exprs: Vec<ProjectionExpr> = projected_exprs
+        .iter()
+        .map(|(expr, alias)| ProjectionExpr::new(expr.clone(), alias.clone()))
+        .collect();
     for (expr, expected_expr) in exprs.into_iter().zip(expected_exprs.into_iter()) {
-        assert!(update_expr(&expr, &projected_exprs, false)?
-            .unwrap()
-            .eq(&expected_expr));
+        assert!(
+            update_expr(&expr, &proj_exprs, false)?
+                .unwrap()
+                .eq(&expected_expr)
+        );
     }
 
     Ok(())
@@ -373,14 +390,20 @@ fn create_simple_csv_exec() -> Arc<dyn ExecutionPlan> {
         Field::new("d", DataType::Int32, true),
         Field::new("e", DataType::Int32, true),
     ]));
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema,
-        Arc::new(CsvSource::new(false, 0, 0)),
-    )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_projection(Some(vec![0, 1, 2, 3, 4]))
-    .build();
+    let config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), {
+            let options = CsvOptions {
+                has_header: Some(false),
+                delimiter: 0,
+                quote: 0,
+                ..Default::default()
+            };
+            Arc::new(CsvSource::new(schema.clone()).with_csv_options(options))
+        })
+        .with_file(PartitionedFile::new("x", 100))
+        .with_projection_indices(Some(vec![0, 1, 2, 3, 4]))
+        .unwrap()
+        .build();
 
     DataSourceExec::from_data_source(config)
 }
@@ -392,14 +415,20 @@ fn create_projecting_csv_exec() -> Arc<dyn ExecutionPlan> {
         Field::new("c", DataType::Int32, true),
         Field::new("d", DataType::Int32, true),
     ]));
-    let config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::parse("test:///").unwrap(),
-        schema,
-        Arc::new(CsvSource::new(false, 0, 0)),
-    )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_projection(Some(vec![3, 2, 1]))
-    .build();
+    let config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::parse("test:///").unwrap(), {
+            let options = CsvOptions {
+                has_header: Some(false),
+                delimiter: 0,
+                quote: 0,
+                ..Default::default()
+            };
+            Arc::new(CsvSource::new(schema.clone()).with_csv_options(options))
+        })
+        .with_file(PartitionedFile::new("x", 100))
+        .with_projection_indices(Some(vec![3, 2, 1]))
+        .unwrap()
+        .build();
 
     DataSourceExec::from_data_source(config)
 }
@@ -421,24 +450,34 @@ fn test_csv_after_projection() -> Result<()> {
     let csv = create_projecting_csv_exec();
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("b", 2)), "b".to_string()),
-            (Arc::new(Column::new("d", 0)), "d".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("b", 2)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 0)), "d"),
         ],
         csv.clone(),
     )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-        "ProjectionExec: expr=[b@2 as b, d@0 as d]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[d, c, b], file_type=csv, has_header=false",
-    ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[b@2 as b, d@0 as d]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[d, c, b], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected =
-        ["DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, d], file_type=csv, has_header=false"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, d], file_type=csv, has_header=false"
+    );
 
     Ok(())
 }
@@ -448,24 +487,36 @@ fn test_memory_after_projection() -> Result<()> {
     let memory = create_projecting_memory_exec();
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("d", 2)), "d".to_string()),
-            (Arc::new(Column::new("e", 3)), "e".to_string()),
-            (Arc::new(Column::new("a", 1)), "a".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("d", 2)), "d"),
+            ProjectionExpr::new(Arc::new(Column::new("e", 3)), "e"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 1)), "a"),
         ],
         memory.clone(),
     )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-        "ProjectionExec: expr=[d@2 as d, e@3 as e, a@1 as a]",
-        "  DataSourceExec: partitions=0, partition_sizes=[]",
-    ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[d@2 as d, e@3 as e, a@1 as a]
+      DataSourceExec: partitions=0, partition_sizes=[]
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = ["DataSourceExec: partitions=0, partition_sizes=[]"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @"DataSourceExec: partitions=0, partition_sizes=[]"
+    );
+
     assert_eq!(
         after_optimize
             .clone()
@@ -519,7 +570,7 @@ fn test_streaming_table_after_projection() -> Result<()> {
         }) as _],
         Some(&vec![0_usize, 2, 4, 3]),
         vec![
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: Arc::new(Column::new("e", 2)),
                     options: SortOptions::default(),
@@ -528,11 +579,13 @@ fn test_streaming_table_after_projection() -> Result<()> {
                     expr: Arc::new(Column::new("a", 0)),
                     options: SortOptions::default(),
                 },
-            ]),
-            LexOrdering::new(vec![PhysicalSortExpr {
+            ]
+            .into(),
+            [PhysicalSortExpr {
                 expr: Arc::new(Column::new("d", 3)),
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
         ]
         .into_iter(),
         true,
@@ -540,9 +593,9 @@ fn test_streaming_table_after_projection() -> Result<()> {
     )?;
     let projection = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("d", 3)), "d".to_string()),
-            (Arc::new(Column::new("e", 2)), "e".to_string()),
-            (Arc::new(Column::new("a", 0)), "a".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"),
+            ProjectionExpr::new(Arc::new(Column::new("e", 2)), "e"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
         ],
         Arc::new(streaming_table) as _,
     )?) as _;
@@ -579,7 +632,7 @@ fn test_streaming_table_after_projection() -> Result<()> {
     assert_eq!(
         result.projected_output_ordering().into_iter().collect_vec(),
         vec![
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: Arc::new(Column::new("e", 1)),
                     options: SortOptions::default(),
@@ -588,11 +641,13 @@ fn test_streaming_table_after_projection() -> Result<()> {
                     expr: Arc::new(Column::new("a", 2)),
                     options: SortOptions::default(),
                 },
-            ]),
-            LexOrdering::new(vec![PhysicalSortExpr {
+            ]
+            .into(),
+            [PhysicalSortExpr {
                 expr: Arc::new(Column::new("d", 0)),
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
         ]
     );
     assert!(result.is_infinite());
@@ -605,45 +660,55 @@ fn test_projection_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
     let child_projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("e", 4)), "new_e".to_string()),
-            (Arc::new(Column::new("a", 0)), "a".to_string()),
-            (Arc::new(Column::new("b", 1)), "new_b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("e", 4)), "new_e"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "new_b"),
         ],
         csv.clone(),
     )?);
     let top_projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("new_b", 3)), "new_b".to_string()),
-            (
+            ProjectionExpr::new(Arc::new(Column::new("new_b", 3)), "new_b"),
+            ProjectionExpr::new(
                 Arc::new(BinaryExpr::new(
                     Arc::new(Column::new("c", 0)),
                     Operator::Plus,
                     Arc::new(Column::new("new_e", 1)),
                 )),
-                "binary".to_string(),
+                "binary",
             ),
-            (Arc::new(Column::new("new_b", 3)), "newest_b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("new_b", 3)), "newest_b"),
         ],
         child_projection.clone(),
     )?);
 
-    let initial = get_plan_string(&top_projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[new_b@3 as new_b, c@0 + new_e@1 as binary, new_b@3 as newest_b]",
-            "  ProjectionExec: expr=[c@2 as c, e@4 as new_e, a@0 as a, b@1 as new_b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(top_projection.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[new_b@3 as new_b, c@0 + new_e@1 as binary, new_b@3 as newest_b]
+      ProjectionExec: expr=[c@2 as c, e@4 as new_e, a@0 as a, b@1 as new_b]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(top_projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "ProjectionExec: expr=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b]",
-            "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b], file_type=csv, has_header=false"
+    );
 
     Ok(())
 }
@@ -652,67 +717,84 @@ fn test_projection_after_projection() -> Result<()> {
 fn test_output_req_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
     let sort_req: Arc<dyn ExecutionPlan> = Arc::new(OutputRequirementExec::new(
-        csv.clone(),
-        Some(LexRequirement::new(vec![
-            PhysicalSortRequirement {
-                expr: Arc::new(Column::new("b", 1)),
-                options: Some(SortOptions::default()),
-            },
-            PhysicalSortRequirement {
-                expr: Arc::new(BinaryExpr::new(
-                    Arc::new(Column::new("c", 2)),
-                    Operator::Plus,
-                    Arc::new(Column::new("a", 0)),
-                )),
-                options: Some(SortOptions::default()),
-            },
-        ])),
+        csv,
+        Some(OrderingRequirements::new(
+            [
+                PhysicalSortRequirement::new(
+                    Arc::new(Column::new("b", 1)),
+                    Some(SortOptions::default()),
+                ),
+                PhysicalSortRequirement::new(
+                    Arc::new(BinaryExpr::new(
+                        Arc::new(Column::new("c", 2)),
+                        Operator::Plus,
+                        Arc::new(Column::new("a", 0)),
+                    )),
+                    Some(SortOptions::default()),
+                ),
+            ]
+            .into(),
+        )),
         Distribution::HashPartitioned(vec![
             Arc::new(Column::new("a", 0)),
             Arc::new(Column::new("b", 1)),
         ]),
+        None,
     ));
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("a", 0)), "new_a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
         ],
         sort_req.clone(),
     )?);
 
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "  OutputRequirementExec",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]
+      OutputRequirementExec: order_by=[(b@1, asc), (c@2 + a@0, asc)], dist_by=HashPartitioned[[a@0, b@1]])
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected: [&str; 3] = [
-            "OutputRequirementExec",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-        ];
-
-    assert_eq!(get_plan_string(&after_optimize), expected);
-    let expected_reqs = LexRequirement::new(vec![
-        PhysicalSortRequirement {
-            expr: Arc::new(Column::new("b", 2)),
-            options: Some(SortOptions::default()),
-        },
-        PhysicalSortRequirement {
-            expr: Arc::new(BinaryExpr::new(
-                Arc::new(Column::new("c", 0)),
-                Operator::Plus,
-                Arc::new(Column::new("new_a", 1)),
-            )),
-            options: Some(SortOptions::default()),
-        },
-    ]);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    OutputRequirementExec: order_by=[(b@2, asc), (c@0 + new_a@1, asc)], dist_by=HashPartitioned[[new_a@1, b@2]])
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+    "
+    );
+
+    let expected_reqs = OrderingRequirements::new(
+        [
+            PhysicalSortRequirement::new(
+                Arc::new(Column::new("b", 2)),
+                Some(SortOptions::default()),
+            ),
+            PhysicalSortRequirement::new(
+                Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("c", 0)),
+                    Operator::Plus,
+                    Arc::new(Column::new("new_a", 1)),
+                )),
+                Some(SortOptions::default()),
+            ),
+        ]
+        .into(),
+    );
     assert_eq!(
         after_optimize
             .as_any()
@@ -734,10 +816,11 @@ fn test_output_req_after_projection() -> Result<()> {
         .required_input_distribution()[0]
         .clone()
     {
-        assert!(vec
-            .iter()
-            .zip(expected_distribution)
-            .all(|(actual, expected)| actual.eq(&expected)));
+        assert!(
+            vec.iter()
+                .zip(expected_distribution)
+                .all(|(actual, expected)| actual.eq(&expected))
+        );
     } else {
         panic!("Expected HashPartitioned distribution!");
     };
@@ -752,29 +835,39 @@ fn test_coalesce_partitions_after_projection() -> Result<()> {
         Arc::new(CoalescePartitionsExec::new(csv));
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
-            (Arc::new(Column::new("a", 0)), "a_new".to_string()),
-            (Arc::new(Column::new("d", 3)), "d".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"),
         ],
         coalesce_partitions,
     )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-                "ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d]",
-                "  CoalescePartitionsExec",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d]
+      CoalescePartitionsExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-                "CoalescePartitionsExec",
-                "  ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    CoalescePartitionsExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, a@0 as a_new, d], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -795,33 +888,43 @@ fn test_filter_after_projection() -> Result<()> {
             Arc::new(Column::new("a", 0)),
         )),
     ));
-    let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(predicate, csv)?);
+    let filter = Arc::new(FilterExec::try_new(predicate, csv)?);
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("a", 0)), "a_new".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
-            (Arc::new(Column::new("d", 3)), "d".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_new"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"),
         ],
         filter.clone(),
-    )?);
+    )?) as _;
+
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
 
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-                "ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d]",
-                "  FilterExec: b@1 - a@0 > d@3 - a@0",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(initial, expected_initial);
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d]
+      FilterExec: b@1 - a@0 > d@3 - a@0
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-                "FilterExec: b@1 - a_new@0 > d@2 - a_new@0",
-                "  ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    FilterExec: b@1 - a_new@0 > d@2 - a_new@0
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_new, b, d], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -875,41 +978,50 @@ fn test_join_after_projection() -> Result<()> {
             ])),
         )),
         &JoinType::Inner,
-        true,
+        NullEquality::NullEqualsNull,
         None,
         None,
         StreamJoinPartitionMode::SinglePartition,
     )?);
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c_from_left".to_string()),
-            (Arc::new(Column::new("b", 1)), "b_from_left".to_string()),
-            (Arc::new(Column::new("a", 0)), "a_from_left".to_string()),
-            (Arc::new(Column::new("a", 5)), "a_from_right".to_string()),
-            (Arc::new(Column::new("c", 7)), "c_from_right".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 5)), "a_from_right"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c_from_right"),
         ],
         join,
-    )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, a@5 as a_from_right, c@7 as c_from_right]",
-            "  SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    )?) as _;
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, a@5 as a_from_right, c@7 as c_from_right]
+      SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2",
-            "  ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "  ProjectionExec: expr=[a@0 as a_from_right, c@2 as c_from_right]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_from_right, c@2 as c_from_right], file_type=csv, has_header=false
+    "
+    );
 
     let expected_filter_col_ind = vec![
         ColumnIndex {
@@ -945,7 +1057,7 @@ fn test_join_after_required_projection() -> Result<()> {
     let left_csv = create_simple_csv_exec();
     let right_csv = create_simple_csv_exec();
 
-    let join: Arc<dyn ExecutionPlan> = Arc::new(SymmetricHashJoinExec::try_new(
+    let join = Arc::new(SymmetricHashJoinExec::try_new(
         left_csv,
         right_csv,
         vec![(Arc::new(Column::new("b", 1)), Arc::new(Column::new("c", 2)))],
@@ -989,45 +1101,56 @@ fn test_join_after_required_projection() -> Result<()> {
             ])),
         )),
         &JoinType::Inner,
-        true,
+        NullEquality::NullEqualsNull,
         None,
         None,
         StreamJoinPartitionMode::SinglePartition,
     )?);
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("a", 5)), "a".to_string()),
-            (Arc::new(Column::new("b", 6)), "b".to_string()),
-            (Arc::new(Column::new("c", 7)), "c".to_string()),
-            (Arc::new(Column::new("d", 8)), "d".to_string()),
-            (Arc::new(Column::new("e", 9)), "e".to_string()),
-            (Arc::new(Column::new("a", 0)), "a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("d", 3)), "d".to_string()),
-            (Arc::new(Column::new("e", 4)), "e".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("a", 5)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 6)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 8)), "d"),
+            ProjectionExpr::new(Arc::new(Column::new("e", 9)), "e"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d"),
+            ProjectionExpr::new(Arc::new(Column::new("e", 4)), "e"),
         ],
         join,
-    )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]",
-            "  SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    )?) as _;
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]
+      SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]",
-            "  SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[a@5 as a, b@6 as b, c@7 as c, d@8 as d, e@9 as e, a@0 as a, b@1 as b, c@2 as c, d@3 as d, e@4 as e]
+      SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
     Ok(())
 }
 
@@ -1061,7 +1184,7 @@ fn test_nested_loop_join_after_projection() -> Result<()> {
         Field::new("c", DataType::Int32, true),
     ]);
 
-    let join: Arc<dyn ExecutionPlan> = Arc::new(NestedLoopJoinExec::try_new(
+    let join = Arc::new(NestedLoopJoinExec::try_new(
         left_csv,
         right_csv,
         Some(JoinFilter::new(
@@ -1071,29 +1194,39 @@ fn test_nested_loop_join_after_projection() -> Result<()> {
         )),
         &JoinType::Inner,
         None,
-    )?);
+    )?) as _;
 
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
-        vec![(col_left_c, "c".to_string())],
+        vec![ProjectionExpr::new(col_left_c, "c")],
         Arc::clone(&join),
-    )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c]",
-            "  NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            ];
-    assert_eq!(initial, expected_initial);
+    )?) as _;
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c]
+      NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
-    let after_optimize =
+    let after_optimize_string =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
-    let expected = [
-            "NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1, projection=[c@2]",
-            "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize_string.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    NestedLoopJoinExec: join_type=Inner, filter=a@0 < b@1, projection=[c@2]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+
+    );
     Ok(())
 }
 
@@ -1104,7 +1237,7 @@ fn test_hash_join_after_projection() -> Result<()> {
     let left_csv = create_simple_csv_exec();
     let right_csv = create_simple_csv_exec();
 
-    let join: Arc<dyn ExecutionPlan> = Arc::new(HashJoinExec::try_new(
+    let join = Arc::new(HashJoinExec::try_new(
         left_csv,
         right_csv,
         vec![(Arc::new(Column::new("b", 1)), Arc::new(Column::new("c", 2)))],
@@ -1150,46 +1283,74 @@ fn test_hash_join_after_projection() -> Result<()> {
         &JoinType::Inner,
         None,
         PartitionMode::Auto,
-        true,
+        NullEquality::NullEqualsNothing,
+        false,
     )?);
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c_from_left".to_string()),
-            (Arc::new(Column::new("b", 1)), "b_from_left".to_string()),
-            (Arc::new(Column::new("a", 0)), "a_from_left".to_string()),
-            (Arc::new(Column::new("c", 7)), "c_from_right".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a_from_left"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c_from_right"),
         ],
         join.clone(),
-    )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-			"ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@7 as c_from_right]", "  HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2", "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    )?) as _;
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@7 as c_from_right]
+      HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
 
     // HashJoinExec only returns result after projection. Because there are some alias columns in the projection, the ProjectionExec is not removed.
-    let expected = ["ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@3 as c_from_right]", "  HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]", "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left, c@3 as c_from_right]
+      HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
-    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
+    let projection = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("a", 0)), "a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("c", 7)), "c".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("c", 7)), "c"),
         ],
         join.clone(),
     )?);
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
 
     // Comparing to the previous result, this projection don't have alias columns either change the order of output fields. So the ProjectionExec is removed.
-    let expected = ["HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]", "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    assert_snapshot!(
+        actual,
+        @r"
+    HashJoinExec: mode=Auto, join_type=Inner, on=[(b@1, c@2)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2, projection=[a@0, b@1, c@2, c@7]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -1197,7 +1358,7 @@ fn test_hash_join_after_projection() -> Result<()> {
 #[test]
 fn test_repartition_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
-    let repartition: Arc<dyn ExecutionPlan> = Arc::new(RepartitionExec::try_new(
+    let repartition = Arc::new(RepartitionExec::try_new(
         csv,
         Partitioning::Hash(
             vec![
@@ -1210,29 +1371,37 @@ fn test_repartition_after_projection() -> Result<()> {
     )?);
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("b", 1)), "b_new".to_string()),
-            (Arc::new(Column::new("a", 0)), "a".to_string()),
-            (Arc::new(Column::new("d", 3)), "d_new".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b_new"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("d", 3)), "d_new"),
         ],
         repartition,
-    )?);
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-                "ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new]",
-                "  RepartitionExec: partitioning=Hash([a@0, b@1, d@3], 6), input_partitions=1",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(initial, expected_initial);
+    )?) as _;
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new]
+      RepartitionExec: partitioning=Hash([a@0, b@1, d@3], 6), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-                "RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1",
-                "  ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new]",
-                "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as b_new, a, d@3 as d_new], file_type=csv, has_header=false
+    "
+    );
 
     assert_eq!(
         after_optimize
@@ -1257,49 +1426,52 @@ fn test_repartition_after_projection() -> Result<()> {
 #[test]
 fn test_sort_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
-    let sort_req: Arc<dyn ExecutionPlan> = Arc::new(SortExec::new(
-        LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("b", 1)),
-                options: SortOptions::default(),
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(BinaryExpr::new(
-                    Arc::new(Column::new("c", 2)),
-                    Operator::Plus,
-                    Arc::new(Column::new("a", 0)),
-                )),
-                options: SortOptions::default(),
-            },
-        ]),
-        csv.clone(),
-    ));
+    let sort_exec = SortExec::new(
+        [
+            PhysicalSortExpr::new_default(Arc::new(Column::new("b", 1))),
+            PhysicalSortExpr::new_default(Arc::new(BinaryExpr::new(
+                Arc::new(Column::new("c", 2)),
+                Operator::Plus,
+                Arc::new(Column::new("a", 0)),
+            ))),
+        ]
+        .into(),
+        csv,
+    );
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("a", 0)), "new_a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
         ],
-        sort_req.clone(),
-    )?);
+        Arc::new(sort_exec),
+    )?) as _;
 
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "  SortExec: expr=[b@1 ASC, c@2 + a@0 ASC], preserve_partitioning=[false]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]
+      SortExec: expr=[b@1 ASC, c@2 + a@0 ASC], preserve_partitioning=[false]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false]",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -1307,49 +1479,52 @@ fn test_sort_after_projection() -> Result<()> {
 #[test]
 fn test_sort_preserving_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
-    let sort_req: Arc<dyn ExecutionPlan> = Arc::new(SortPreservingMergeExec::new(
-        LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("b", 1)),
-                options: SortOptions::default(),
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(BinaryExpr::new(
-                    Arc::new(Column::new("c", 2)),
-                    Operator::Plus,
-                    Arc::new(Column::new("a", 0)),
-                )),
-                options: SortOptions::default(),
-            },
-        ]),
-        csv.clone(),
-    ));
+    let sort_exec = SortPreservingMergeExec::new(
+        [
+            PhysicalSortExpr::new_default(Arc::new(Column::new("b", 1))),
+            PhysicalSortExpr::new_default(Arc::new(BinaryExpr::new(
+                Arc::new(Column::new("c", 2)),
+                Operator::Plus,
+                Arc::new(Column::new("a", 0)),
+            ))),
+        ]
+        .into(),
+        csv,
+    );
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("a", 0)), "new_a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
         ],
-        sort_req.clone(),
-    )?);
+        Arc::new(sort_exec),
+    )?) as _;
 
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "  SortPreservingMergeExec: [b@1 ASC, c@2 + a@0 ASC]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]
+      SortPreservingMergeExec: [b@1 ASC, c@2 + a@0 ASC]
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC]",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -1357,40 +1532,45 @@ fn test_sort_preserving_after_projection() -> Result<()> {
 #[test]
 fn test_union_after_projection() -> Result<()> {
     let csv = create_simple_csv_exec();
-    let union: Arc<dyn ExecutionPlan> =
-        Arc::new(UnionExec::new(vec![csv.clone(), csv.clone(), csv]));
+    let union = UnionExec::try_new(vec![csv.clone(), csv.clone(), csv])?;
     let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (Arc::new(Column::new("c", 2)), "c".to_string()),
-            (Arc::new(Column::new("a", 0)), "new_a".to_string()),
-            (Arc::new(Column::new("b", 1)), "b".to_string()),
+            ProjectionExpr::new(Arc::new(Column::new("c", 2)), "c"),
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "new_a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
         ],
         union.clone(),
-    )?);
+    )?) as _;
 
-    let initial = get_plan_string(&projection);
-    let expected_initial = [
-            "ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "  UnionExec",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-            ];
-    assert_eq!(initial, expected_initial);
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]
+      UnionExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-            "UnionExec",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false",
-            "  ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b]",
-            "    DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false"
-        ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    UnionExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
@@ -1403,14 +1583,23 @@ fn partitioned_data_source() -> Arc<DataSourceExec> {
         Field::new("string_col", DataType::Utf8, true),
     ]));
 
+    let options = CsvOptions {
+        has_header: Some(false),
+        delimiter: b',',
+        quote: b'"',
+        ..Default::default()
+    };
+    let table_schema = TableSchema::new(
+        Arc::clone(&file_schema),
+        vec![Arc::new(Field::new("partition_col", DataType::Utf8, true))],
+    );
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        file_schema.clone(),
-        Arc::new(CsvSource::default()),
+        Arc::new(CsvSource::new(table_schema).with_csv_options(options)),
     )
-    .with_file(PartitionedFile::new("x".to_string(), 100))
-    .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)])
-    .with_projection(Some(vec![0, 1, 2]))
+    .with_file(PartitionedFile::new("x", 100))
+    .with_projection_indices(Some(vec![0, 1, 2]))
+    .unwrap()
     .build();
 
     DataSourceExec::from_data_source(config)
@@ -1421,20 +1610,17 @@ fn test_partition_col_projection_pushdown() -> Result<()> {
     let source = partitioned_data_source();
     let partitioned_schema = source.schema();
 
-    let projection = Arc::new(ProjectionExec::try_new(
+    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (
+            ProjectionExpr::new(
                 col("string_col", partitioned_schema.as_ref())?,
-                "string_col".to_string(),
+                "string_col",
             ),
-            (
+            ProjectionExpr::new(
                 col("partition_col", partitioned_schema.as_ref())?,
-                "partition_col".to_string(),
-            ),
-            (
-                col("int_col", partitioned_schema.as_ref())?,
-                "int_col".to_string(),
+                "partition_col",
             ),
+            ProjectionExpr::new(col("int_col", partitioned_schema.as_ref())?, "int_col"),
         ],
         source,
     )?);
@@ -1442,11 +1628,14 @@ fn test_partition_col_projection_pushdown() -> Result<()> {
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-        "ProjectionExec: expr=[string_col@1 as string_col, partition_col@2 as partition_col, int_col@0 as int_col]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, partition_col, int_col], file_type=csv, has_header=false"
+    );
 
     Ok(())
 }
@@ -1456,25 +1645,22 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> {
     let source = partitioned_data_source();
     let partitioned_schema = source.schema();
 
-    let projection = Arc::new(ProjectionExec::try_new(
+    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
         vec![
-            (
+            ProjectionExpr::new(
                 col("string_col", partitioned_schema.as_ref())?,
-                "string_col".to_string(),
+                "string_col",
             ),
-            (
+            ProjectionExpr::new(
                 // CAST(partition_col, Utf8View)
                 cast(
                     col("partition_col", partitioned_schema.as_ref())?,
                     partitioned_schema.as_ref(),
                     DataType::Utf8View,
                 )?,
-                "partition_col".to_string(),
-            ),
-            (
-                col("int_col", partitioned_schema.as_ref())?,
-                "int_col".to_string(),
+                "partition_col",
             ),
+            ProjectionExpr::new(col("int_col", partitioned_schema.as_ref())?, "int_col"),
         ],
         source,
     )?);
@@ -1482,11 +1668,102 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> {
     let after_optimize =
         ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
 
-    let expected = [
-        "ProjectionExec: expr=[string_col@1 as string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col@0 as int_col]",
-        "  DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false"
-    ];
-    assert_eq!(get_plan_string(&after_optimize), expected);
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+    assert_snapshot!(
+        actual,
+        @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col], file_type=csv, has_header=false"
+    );
+
+    Ok(())
+}
+
+#[test]
+fn test_cooperative_exec_after_projection() -> Result<()> {
+    let csv = create_simple_csv_exec();
+    let cooperative: Arc<dyn ExecutionPlan> = Arc::new(CooperativeExec::new(csv));
+    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
+        vec![
+            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+        ],
+        cooperative,
+    )?);
+
+    let initial = displayable(projection.as_ref()).indent(true).to_string();
+    let actual = initial.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    ProjectionExec: expr=[a@0 as a, b@1 as b]
+      CooperativeExec
+        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
+
+    let after_optimize =
+        ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
+
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    // Projection should be pushed down through CooperativeExec
+    assert_snapshot!(
+        actual,
+        @r"
+    CooperativeExec
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b], file_type=csv, has_header=false
+    "
+    );
+
+    Ok(())
+}
+
+#[test]
+fn test_hash_join_empty_projection_embeds() -> Result<()> {
+    let left_csv = create_simple_csv_exec();
+    let right_csv = create_simple_csv_exec();
+
+    let join = Arc::new(HashJoinExec::try_new(
+        left_csv,
+        right_csv,
+        vec![(Arc::new(Column::new("a", 0)), Arc::new(Column::new("a", 0)))],
+        None,
+        &JoinType::Right,
+        None,
+        PartitionMode::CollectLeft,
+        NullEquality::NullEqualsNothing,
+        false,
+    )?);
+
+    // Empty projection: no columns needed from the join output
+    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
+        vec![] as Vec<ProjectionExpr>,
+        join,
+    )?);
+
+    let after_optimize =
+        ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
+    let after_optimize_string = displayable(after_optimize.as_ref())
+        .indent(true)
+        .to_string();
+    let actual = after_optimize_string.trim();
+
+    // The empty projection should be embedded into the HashJoinExec,
+    // resulting in projection=[] on the join and no ProjectionExec wrapper.
+    assert_snapshot!(
+        actual,
+        @r"
+    HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@0, a@0)], projection=[]
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
+    "
+    );
 
     Ok(())
 }
diff --git a/datafusion/core/tests/physical_optimizer/pushdown_sort.rs b/datafusion/core/tests/physical_optimizer/pushdown_sort.rs
new file mode 100644
index 0000000000000..d6fd4d8d00ae4
--- /dev/null
+++ b/datafusion/core/tests/physical_optimizer/pushdown_sort.rs
@@ -0,0 +1,998 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for sort pushdown optimizer rule (Phase 1)
+//!
+//! Phase 1 tests verify that:
+//! 1. Reverse scan is enabled (reverse_row_groups=true)
+//! 2. SortExec is kept (because ordering is inexact)
+//! 3. output_ordering remains unchanged
+//! 4. Early termination is enabled for TopK queries
+//! 5. Prefix matching works correctly
+
+use datafusion_physical_expr::expressions;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::pushdown_sort::PushdownSort;
+use std::sync::Arc;
+
+use crate::physical_optimizer::test_utils::{
+    OptimizationTest, coalesce_partitions_exec, parquet_exec, parquet_exec_with_sort,
+    projection_exec, projection_exec_with_alias, repartition_exec, schema,
+    simple_projection_exec, sort_exec, sort_exec_with_fetch, sort_expr, sort_expr_named,
+    test_scan_with_ordering,
+};
+
+#[test]
+fn test_sort_pushdown_disabled() {
+    // When pushdown is disabled, plan should remain unchanged
+    let schema = schema();
+    let source = parquet_exec(schema.clone());
+    let sort_exprs = LexOrdering::new(vec![sort_expr("a", &schema)]).unwrap();
+    let plan = sort_exec(sort_exprs, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), false),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_basic_phase1() {
+    // Phase 1: Reverse scan enabled, Sort kept, output_ordering unchanged
+    let schema = schema();
+
+    // Source has ASC NULLS LAST ordering (default)
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request DESC NULLS LAST ordering (exact reverse)
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_with_limit_phase1() {
+    // Phase 1: Sort with fetch enables early termination but keeps Sort
+    let schema = schema();
+
+    // Source has ASC ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request DESC ordering with limit
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec_with_fetch(desc_ordering, Some(10), source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_multiple_columns_phase1() {
+    // Phase 1: Sort on multiple columns - reverse multi-column ordering
+    let schema = schema();
+
+    // Source has [a DESC NULLS LAST, b ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request [a ASC NULLS FIRST, b DESC] ordering (exact reverse)
+    let reverse_ordering =
+        LexOrdering::new(vec![a.clone().asc().nulls_first(), b.reverse()]).unwrap();
+    let plan = sort_exec(reverse_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+// ============================================================================
+// PREFIX MATCHING TESTS
+// ============================================================================
+
+#[test]
+fn test_prefix_match_single_column() {
+    // Test prefix matching: source has [a DESC, b ASC], query needs [a ASC]
+    // After reverse: [a ASC, b DESC] which satisfies [a ASC] prefix
+    let schema = schema();
+
+    // Source has [a DESC NULLS LAST, b ASC NULLS LAST] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request only [a ASC NULLS FIRST] - a prefix of the reversed ordering
+    let prefix_ordering = LexOrdering::new(vec![a.clone().asc().nulls_first()]).unwrap();
+    let plan = sort_exec(prefix_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_prefix_match_with_limit() {
+    // Test prefix matching with LIMIT - important for TopK optimization
+    let schema = schema();
+
+    // Source has [a ASC, b DESC, c ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let c = sort_expr("c", &schema);
+    let source_ordering =
+        LexOrdering::new(vec![a.clone(), b.clone().reverse(), c]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request [a DESC NULLS LAST, b ASC NULLS FIRST] with LIMIT 100
+    // This is a prefix (2 columns) of the reversed 3-column ordering
+    let prefix_ordering =
+        LexOrdering::new(vec![a.reverse(), b.clone().asc().nulls_first()]).unwrap();
+    let plan = sort_exec_with_fetch(prefix_ordering, Some(100), source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=100), expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 DESC NULLS LAST, c@2 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: TopK(fetch=100), expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_prefix_match_through_transparent_nodes() {
+    // Test prefix matching works through transparent nodes
+    let schema = schema();
+
+    // Source has [a DESC NULLS LAST, b ASC, c DESC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let c = sort_expr("c", &schema);
+    let source_ordering =
+        LexOrdering::new(vec![a.clone().reverse(), b, c.reverse()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+    let repartition = repartition_exec(source);
+
+    // Request only [a ASC NULLS FIRST] - prefix of reversed ordering
+    let prefix_ordering = LexOrdering::new(vec![a.clone().asc().nulls_first()]).unwrap();
+    let plan = sort_exec(prefix_ordering, repartition);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -   RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC, c@2 DESC NULLS LAST], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_no_prefix_match_wrong_direction() {
+    // Test that prefix matching does NOT work if the direction is wrong
+    let schema = schema();
+
+    // Source has [a DESC, b ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone().reverse(), b]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request [a DESC] - same direction as source, NOT a reverse prefix
+    let same_direction = LexOrdering::new(vec![a.clone().reverse()]).unwrap();
+    let plan = sort_exec(same_direction, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST, b@1 ASC], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_no_prefix_match_longer_than_source() {
+    // Test that prefix matching does NOT work if requested is longer than source
+    let schema = schema();
+
+    // Source has [a DESC] ordering (single column)
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone().reverse()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request [a ASC, b DESC] - longer than source, can't be a prefix
+    let longer_ordering =
+        LexOrdering::new(vec![a.clone().asc().nulls_first(), b.reverse()]).unwrap();
+    let plan = sort_exec(longer_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 DESC NULLS LAST], file_type=parquet
+    "
+    );
+}
+
+// ============================================================================
+// ORIGINAL TESTS
+// ============================================================================
+
+#[test]
+fn test_sort_through_repartition() {
+    // Sort should push through RepartitionExec
+    let schema = schema();
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+    let repartition = repartition_exec(source);
+
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, repartition);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_nested_sorts() {
+    // Nested sort operations - only innermost can be optimized
+    let schema = schema();
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let inner_sort = sort_exec(desc_ordering, source);
+
+    let sort_exprs2 = LexOrdering::new(vec![b]).unwrap();
+    let plan = sort_exec(sort_exprs2, inner_sort);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+        -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+          -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_non_sort_plans_unchanged() {
+    // Plans without SortExec should pass through unchanged
+    let schema = schema();
+    let plan = parquet_exec(schema.clone());
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      output:
+        Ok:
+          - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_optimizer_properties() {
+    // Test optimizer metadata
+    let optimizer = PushdownSort::new();
+
+    assert_eq!(optimizer.name(), "PushdownSort");
+    assert!(optimizer.schema_check());
+}
+
+#[test]
+fn test_sort_through_coalesce_partitions() {
+    // Sort should push through CoalescePartitionsExec
+    let schema = schema();
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+    let repartition = repartition_exec(source);
+    let coalesce_parts = coalesce_partitions_exec(repartition);
+
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, coalesce_parts);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_complex_plan_with_multiple_operators() {
+    // Test a complex plan with multiple operators between sort and source
+    let schema = schema();
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+    let repartition = repartition_exec(source);
+    let coalesce_parts = coalesce_partitions_exec(repartition);
+
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, coalesce_parts);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   CoalescePartitionsExec
+        -     RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   CoalescePartitionsExec
+          -     RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+          -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_multiple_sorts_different_columns() {
+    // Test nested sorts on different columns - only innermost can optimize
+    let schema = schema();
+    let a = sort_expr("a", &schema);
+    let c = sort_expr("c", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // First sort by column 'a' DESC (reverse of source)
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let sort1 = sort_exec(desc_ordering, source);
+
+    // Then sort by column 'c' (different column, can't optimize)
+    let sort_exprs2 = LexOrdering::new(vec![c]).unwrap();
+    let plan = sort_exec(sort_exprs2, sort1);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+        -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[c@2 ASC], preserve_partitioning=[false]
+          -   SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_no_pushdown_for_unordered_source() {
+    // Verify pushdown does NOT happen for sources without ordering
+    let schema = schema();
+    let source = parquet_exec(schema.clone()); // No output_ordering
+    let sort_exprs = LexOrdering::new(vec![sort_expr("a", &schema)]).unwrap();
+    let plan = sort_exec(sort_exprs, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_no_pushdown_for_non_reverse_sort() {
+    // Verify pushdown does NOT happen when sort doesn't reverse source ordering
+    let schema = schema();
+
+    // Source sorted by 'a' ASC
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Request sort by 'b' (different column)
+    let sort_exprs = LexOrdering::new(vec![b]).unwrap();
+    let plan = sort_exec(sort_exprs, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+        -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[b@1 ASC], preserve_partitioning=[false]
+          -   DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_pushdown_through_blocking_node() {
+    // Test that pushdown works for inner sort even when outer sort is blocked
+    // Structure: Sort -> Aggregate (blocks pushdown) -> Sort -> Scan
+    // The outer sort can't push through aggregate, but the inner sort should still optimize
+    use datafusion_functions_aggregate::count::count_udaf;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_plan::aggregates::{
+        AggregateExec, AggregateMode, PhysicalGroupBy,
+    };
+    use std::sync::Arc;
+
+    let schema = schema();
+
+    // Bottom: DataSource with [a ASC NULLS LAST] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Inner Sort: [a DESC NULLS FIRST] - exact reverse, CAN push down to source
+    let inner_sort_ordering = LexOrdering::new(vec![a.clone().reverse()]).unwrap();
+    let inner_sort = sort_exec(inner_sort_ordering, source);
+
+    // Middle: Aggregate (blocks pushdown from outer sort)
+    // GROUP BY a, COUNT(b)
+    let group_by = PhysicalGroupBy::new_single(vec![(
+        Arc::new(expressions::Column::new("a", 0)) as _,
+        "a".to_string(),
+    )]);
+
+    let count_expr = Arc::new(
+        AggregateExprBuilder::new(
+            count_udaf(),
+            vec![Arc::new(expressions::Column::new("b", 1)) as _],
+        )
+        .schema(Arc::clone(&schema))
+        .alias("COUNT(b)")
+        .build()
+        .unwrap(),
+    );
+
+    let aggregate = Arc::new(
+        AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by,
+            vec![count_expr],
+            vec![None],
+            inner_sort,
+            Arc::clone(&schema),
+        )
+        .unwrap(),
+    );
+
+    // Outer Sort: [a ASC] - this CANNOT push down through aggregate
+    let outer_sort_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let plan = sort_exec(outer_sort_ordering, aggregate);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+        -   AggregateExec: mode=Final, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted
+        -     SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          -   AggregateExec: mode=Final, gby=[a@0 as a], aggr=[COUNT(b)], ordering_mode=Sorted
+          -     SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -       DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+// ============================================================================
+// PROJECTION TESTS
+// ============================================================================
+
+#[test]
+fn test_sort_pushdown_through_simple_projection() {
+    // Sort pushes through projection with simple column references
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a, b (simple column references)
+    let projection = simple_projection_exec(source, vec![0, 1]); // columns a, b
+
+    // Request [a DESC] - should push through projection to source
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_through_projection_with_alias() {
+    // Sort pushes through projection with column aliases
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a AS id, b AS value
+    let projection = projection_exec_with_alias(source, vec![(0, "id"), (1, "value")]);
+
+    // Request [id DESC] - should map to [a DESC] and push down
+    let id_expr = sort_expr_named("id", 0);
+    let desc_ordering = LexOrdering::new(vec![id_expr.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[id@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 as id, b@1 as value]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[id@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 as id, b@1 as value]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_no_sort_pushdown_through_computed_projection() {
+    use datafusion_expr::Operator;
+
+    // Sort should NOT push through projection with computed columns
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a+b as sum, c
+    let projection = projection_exec(
+        vec![
+            (
+                Arc::new(expressions::BinaryExpr::new(
+                    Arc::new(expressions::Column::new("a", 0)),
+                    Operator::Plus,
+                    Arc::new(expressions::Column::new("b", 1)),
+                )) as Arc<dyn PhysicalExpr>,
+                "sum".to_string(),
+            ),
+            (
+                Arc::new(expressions::Column::new("c", 2)) as Arc<dyn PhysicalExpr>,
+                "c".to_string(),
+            ),
+        ],
+        source,
+    )
+    .unwrap();
+
+    // Request [sum DESC] - should NOT push down (sum is computed)
+    let sum_expr = sort_expr_named("sum", 0);
+    let desc_ordering = LexOrdering::new(vec![sum_expr.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[sum@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 + b@1 as sum, c@2 as c]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[sum@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 + b@1 as sum, c@2 as c]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_projection_reordered_columns() {
+    // Sort pushes through projection that reorders columns
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT c, b, a (columns reordered)
+    let projection = simple_projection_exec(source, vec![2, 1, 0]); // c, b, a
+
+    // Request [a DESC] where a is now at index 2 in projection output
+    let a_expr_at_2 = sort_expr_named("a", 2);
+    let desc_ordering = LexOrdering::new(vec![a_expr_at_2.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@2 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[c@2 as c, b@1 as b, a@0 as a]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@2 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[c@2 as c, b@1 as b, a@0 as a]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_projection_with_limit() {
+    // Sort with LIMIT pushes through simple projection
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a, b
+    let projection = simple_projection_exec(source, vec![0, 1]);
+
+    // Request [a DESC] with LIMIT 10
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec_with_fetch(desc_ordering, Some(10), projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: TopK(fetch=10), expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_through_projection() {
+    // Sort pushes through both projection and coalesce batches
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a, b
+    let projection = simple_projection_exec(source, vec![0, 1]);
+
+    // Request [a DESC]
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 as a, b@1 as b]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_projection_subset_of_columns() {
+    // Sort pushes through projection that selects subset of columns
+    let schema = schema();
+
+    // Source has [a ASC, b ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone(), b.clone()]).unwrap();
+    let source = parquet_exec_with_sort(schema.clone(), vec![source_ordering]);
+
+    // Projection: SELECT a (subset of columns)
+    let projection = simple_projection_exec(source, vec![0]);
+
+    // Request [a DESC]
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, projection);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   ProjectionExec: expr=[a@0 as a]
+        -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], file_type=parquet
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   ProjectionExec: expr=[a@0 as a]
+          -     DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet, reverse_row_groups=true
+    "
+    );
+}
+
+// ============================================================================
+// TESTSCAN DEMONSTRATION TESTS
+// ============================================================================
+// These tests use TestScan to demonstrate how sort pushdown works more clearly
+// than ParquetExec. TestScan can accept ANY ordering (not just reverse) and
+// displays the requested ordering explicitly in the output.
+
+#[test]
+fn test_sort_pushdown_with_test_scan_basic() {
+    // Demonstrates TestScan showing requested ordering clearly
+    let schema = schema();
+
+    // Source has [a ASC] ordering
+    let a = sort_expr("a", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone()]).unwrap();
+    let source = test_scan_with_ordering(schema.clone(), source_ordering);
+
+    // Request [a DESC] ordering
+    let desc_ordering = LexOrdering::new(vec![a.reverse()]).unwrap();
+    let plan = sort_exec(desc_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+        -   TestScan: output_ordering=[a@0 ASC]
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST], preserve_partitioning=[false]
+          -   TestScan: output_ordering=[a@0 ASC], requested_ordering=[a@0 DESC NULLS LAST]
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_with_test_scan_multi_column() {
+    // Demonstrates TestScan with multi-column ordering
+    let schema = schema();
+
+    // Source has [a ASC, b DESC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone(), b.clone().reverse()]).unwrap();
+    let source = test_scan_with_ordering(schema.clone(), source_ordering);
+
+    // Request [a DESC, b ASC] ordering (reverse of source)
+    let reverse_ordering = LexOrdering::new(vec![a.reverse(), b]).unwrap();
+    let plan = sort_exec(reverse_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false]
+        -   TestScan: output_ordering=[a@0 ASC, b@1 DESC NULLS LAST]
+      output:
+        Ok:
+          - SortExec: expr=[a@0 DESC NULLS LAST, b@1 ASC], preserve_partitioning=[false]
+          -   TestScan: output_ordering=[a@0 ASC, b@1 DESC NULLS LAST], requested_ordering=[a@0 DESC NULLS LAST, b@1 ASC]
+    "
+    );
+}
+
+#[test]
+fn test_sort_pushdown_with_test_scan_arbitrary_ordering() {
+    // Demonstrates that TestScan can accept ANY ordering (not just reverse)
+    // This is different from ParquetExec which only supports reverse scans
+    let schema = schema();
+
+    // Source has [a ASC, b ASC] ordering
+    let a = sort_expr("a", &schema);
+    let b = sort_expr("b", &schema);
+    let source_ordering = LexOrdering::new(vec![a.clone(), b.clone()]).unwrap();
+    let source = test_scan_with_ordering(schema.clone(), source_ordering);
+
+    // Request [a ASC, b DESC] - NOT a simple reverse, but TestScan accepts it
+    let mixed_ordering = LexOrdering::new(vec![a, b.reverse()]).unwrap();
+    let plan = sort_exec(mixed_ordering, source);
+
+    insta::assert_snapshot!(
+        OptimizationTest::new(plan, PushdownSort::new(), true),
+        @r"
+    OptimizationTest:
+      input:
+        - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+        -   TestScan: output_ordering=[a@0 ASC, b@1 ASC]
+      output:
+        Ok:
+          - SortExec: expr=[a@0 ASC, b@1 DESC NULLS LAST], preserve_partitioning=[false]
+          -   TestScan: output_ordering=[a@0 ASC, b@1 ASC], requested_ordering=[a@0 ASC, b@1 DESC NULLS LAST]
+    "
+    );
+}
diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/pushdown_utils.rs
similarity index 65%
rename from datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs
rename to datafusion/core/tests/physical_optimizer/pushdown_utils.rs
index dc4d77194c082..ce2cb04b64a5f 100644
--- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs
+++ b/datafusion/core/tests/physical_optimizer/pushdown_utils.rs
@@ -16,31 +16,30 @@
 // under the License.
 
 use arrow::datatypes::SchemaRef;
-use arrow::error::ArrowError;
 use arrow::{array::RecordBatch, compute::concat_batches};
 use datafusion::{datasource::object_store::ObjectStoreUrl, physical_plan::PhysicalExpr};
-use datafusion_common::{config::ConfigOptions, internal_err, Result, Statistics};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, config::ConfigOptions, internal_err};
 use datafusion_datasource::{
-    file::FileSource, file_meta::FileMeta, file_scan_config::FileScanConfig,
+    PartitionedFile, file::FileSource, file_scan_config::FileScanConfig,
     file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture,
-    file_stream::FileOpener, impl_schema_adapter_methods,
-    schema_adapter::DefaultSchemaAdapterFactory, schema_adapter::SchemaAdapterFactory,
-    source::DataSourceExec, PartitionedFile,
+    file_stream::FileOpener, source::DataSourceExec,
 };
-use datafusion_physical_expr::conjunction;
+use datafusion_physical_expr::projection::ProjectionExprs;
 use datafusion_physical_expr_common::physical_expr::fmt_sql;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_plan::filter::batch_filter;
+use datafusion_physical_plan::filter_pushdown::{FilterPushdownPhase, PushedDown};
 use datafusion_physical_plan::{
-    displayable,
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, displayable,
     filter::FilterExec,
     filter_pushdown::{
-        ChildPushdownResult, FilterDescription, FilterPushdownPropagation,
-        PredicateSupport, PredicateSupports,
+        ChildFilterDescription, ChildPushdownResult, FilterDescription,
+        FilterPushdownPropagation,
     },
     metrics::ExecutionPlanMetricsSet,
-    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
 };
-use futures::stream::BoxStream;
+use futures::StreamExt;
 use futures::{FutureExt, Stream};
 use object_store::ObjectStore;
 use std::{
@@ -53,13 +52,17 @@ use std::{
 pub struct TestOpener {
     batches: Vec<RecordBatch>,
     batch_size: Option<usize>,
-    schema: Option<SchemaRef>,
-    projection: Option<Vec<usize>>,
+    projection: Option<ProjectionExprs>,
+    predicate: Option<Arc<dyn PhysicalExpr>>,
 }
 
 impl FileOpener for TestOpener {
-    fn open(&self, _file_meta: FileMeta) -> Result<FileOpenFuture> {
+    fn open(&self, _partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
         let mut batches = self.batches.clone();
+        if self.batches.is_empty() {
+            return Ok((async { Ok(TestStream::new(vec![]).boxed()) }).boxed());
+        }
+        let schema = self.batches[0].schema();
         if let Some(batch_size) = self.batch_size {
             let batch = concat_batches(&batches[0].schema(), &batches)?;
             let mut new_batches = Vec::new();
@@ -70,56 +73,55 @@ impl FileOpener for TestOpener {
             }
             batches = new_batches.into_iter().collect();
         }
-        if let Some(schema) = &self.schema {
-            let factory = DefaultSchemaAdapterFactory::from_schema(Arc::clone(schema));
-            let (mapper, projection) = factory.map_schema(&batches[0].schema()).unwrap();
-            let mut new_batches = Vec::new();
-            for batch in batches {
-                let batch = batch.project(&projection).unwrap();
-                let batch = mapper.map_batch(batch).unwrap();
-                new_batches.push(batch);
-            }
-            batches = new_batches;
+
+        let mut new_batches = Vec::new();
+        for batch in batches {
+            let batch = if let Some(predicate) = &self.predicate {
+                batch_filter(&batch, predicate)?
+            } else {
+                batch
+            };
+            new_batches.push(batch);
         }
+        batches = new_batches;
+
         if let Some(projection) = &self.projection {
+            let projector = projection.make_projector(&schema)?;
             batches = batches
                 .into_iter()
-                .map(|batch| batch.project(projection).unwrap())
+                .map(|batch| projector.project_batch(&batch).unwrap())
                 .collect();
         }
 
         let stream = TestStream::new(batches);
 
-        Ok((async {
-            let stream: BoxStream<'static, Result<RecordBatch, ArrowError>> =
-                Box::pin(stream);
-            Ok(stream)
-        })
-        .boxed())
+        Ok((async { Ok(stream.boxed()) }).boxed())
     }
 }
 
 /// A placeholder data source that accepts filter pushdown
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub struct TestSource {
     support: bool,
     predicate: Option<Arc<dyn PhysicalExpr>>,
-    statistics: Option<Statistics>,
     batch_size: Option<usize>,
     batches: Vec<RecordBatch>,
-    schema: Option<SchemaRef>,
     metrics: ExecutionPlanMetricsSet,
-    projection: Option<Vec<usize>>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    projection: Option<ProjectionExprs>,
+    table_schema: datafusion_datasource::TableSchema,
 }
 
 impl TestSource {
-    fn new(support: bool, batches: Vec<RecordBatch>) -> Self {
+    pub fn new(schema: SchemaRef, support: bool, batches: Vec<RecordBatch>) -> Self {
+        let table_schema = datafusion_datasource::TableSchema::new(schema, vec![]);
         Self {
             support,
             metrics: ExecutionPlanMetricsSet::new(),
             batches,
-            ..Default::default()
+            predicate: None,
+            batch_size: None,
+            projection: None,
+            table_schema,
         }
     }
 }
@@ -130,13 +132,17 @@ impl FileSource for TestSource {
         _object_store: Arc<dyn ObjectStore>,
         _base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(TestOpener {
+    ) -> Result<Arc<dyn FileOpener>> {
+        Ok(Arc::new(TestOpener {
             batches: self.batches.clone(),
             batch_size: self.batch_size,
-            schema: self.schema.clone(),
             projection: self.projection.clone(),
-        })
+            predicate: self.predicate.clone(),
+        }))
+    }
+
+    fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.predicate.clone()
     }
 
     fn as_any(&self) -> &dyn Any {
@@ -150,39 +156,10 @@ impl FileSource for TestSource {
         })
     }
 
-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(TestSource {
-            schema: Some(schema),
-            ..self.clone()
-        })
-    }
-
-    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(TestSource {
-            projection: config.projection.clone(),
-            ..self.clone()
-        })
-    }
-
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        Arc::new(TestSource {
-            statistics: Some(statistics),
-            ..self.clone()
-        })
-    }
-
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self
-            .statistics
-            .as_ref()
-            .expect("statistics not set")
-            .clone())
-    }
-
     fn file_type(&self) -> &str {
         "test"
     }
@@ -220,19 +197,68 @@ impl FileSource for TestSource {
                 filters.push(Arc::clone(internal));
             }
             let new_node = Arc::new(TestSource {
-                predicate: Some(conjunction(filters.clone())),
+                predicate: datafusion_physical_expr::utils::conjunction_opt(
+                    filters.clone(),
+                ),
                 ..self.clone()
             });
-            Ok(FilterPushdownPropagation {
-                filters: PredicateSupports::all_supported(filters),
-                updated_node: Some(new_node),
-            })
+            Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+                vec![PushedDown::Yes; filters.len()],
+            )
+            .with_updated_node(new_node))
         } else {
-            Ok(FilterPushdownPropagation::unsupported(filters))
+            Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+                vec![PushedDown::No; filters.len()],
+            ))
         }
     }
 
-    impl_schema_adapter_methods!();
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        if let Some(existing_projection) = &self.projection {
+            // Combine existing projection with new projection
+            let combined_projection = existing_projection.try_merge(projection)?;
+            Ok(Some(Arc::new(TestSource {
+                projection: Some(combined_projection),
+                table_schema: self.table_schema.clone(),
+                ..self.clone()
+            })))
+        } else {
+            Ok(Some(Arc::new(TestSource {
+                projection: Some(projection.clone()),
+                ..self.clone()
+            })))
+        }
+    }
+
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        self.projection.as_ref()
+    }
+
+    fn table_schema(&self) -> &datafusion_datasource::TableSchema {
+        &self.table_schema
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit predicate (filter) expression if present
+        if let Some(predicate) = &self.predicate {
+            f(predicate.as_ref())?;
+        }
+
+        // Visit projection expressions if present
+        if let Some(projection) = &self.projection {
+            for proj_expr in projection {
+                f(proj_expr.expr.as_ref())?;
+            }
+        }
+
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -256,15 +282,21 @@ impl TestScanBuilder {
         self
     }
 
+    pub fn with_batches(mut self, batches: Vec<RecordBatch>) -> Self {
+        self.batches = batches;
+        self
+    }
+
     pub fn build(self) -> Arc<dyn ExecutionPlan> {
-        let source = Arc::new(TestSource::new(self.support, self.batches));
-        let base_config = FileScanConfigBuilder::new(
-            ObjectStoreUrl::parse("test://").unwrap(),
+        let source = Arc::new(TestSource::new(
             Arc::clone(&self.schema),
-            source,
-        )
-        .with_file(PartitionedFile::new("test.paqruet", 123))
-        .build();
+            self.support,
+            self.batches,
+        ));
+        let base_config =
+            FileScanConfigBuilder::new(ObjectStoreUrl::parse("test://").unwrap(), source)
+                .with_file(PartitionedFile::new("test.parquet", 123))
+                .build();
         DataSourceExec::from_data_source(base_config)
     }
 }
@@ -303,11 +335,12 @@ impl TestStream {
     /// least one entry in data (for the schema)
     pub fn new(data: Vec<RecordBatch>) -> Self {
         // check that there is at least one entry in data and that all batches have the same schema
-        assert!(!data.is_empty(), "data must not be empty");
-        assert!(
-            data.iter().all(|batch| batch.schema() == data[0].schema()),
-            "all batches must have the same schema"
-        );
+        if let Some(first) = data.first() {
+            assert!(
+                data.iter().all(|batch| batch.schema() == first.schema()),
+                "all batches must have the same schema"
+            );
+        }
         Self {
             data,
             ..Default::default()
@@ -316,7 +349,7 @@ impl TestStream {
 }
 
 impl Stream for TestStream {
-    type Item = Result<RecordBatch, ArrowError>;
+    type Item = Result<RecordBatch>;
 
     fn poll_next(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<Option<Self::Item>> {
         let next_batch = self.index.value();
@@ -345,6 +378,7 @@ pub struct OptimizationTest {
 }
 
 impl OptimizationTest {
+    #[expect(clippy::needless_pass_by_value)]
     pub fn new<O>(
         input_plan: Arc<dyn ExecutionPlan>,
         opt: O,
@@ -411,6 +445,15 @@ fn format_lines(s: &str) -> Vec<String> {
     s.trim().split('\n').map(|s| s.to_string()).collect()
 }
 
+pub fn format_plan_for_test(plan: &Arc<dyn ExecutionPlan>) -> String {
+    let mut out = String::new();
+    for line in format_execution_plan(plan) {
+        out.push_str(&format!("  - {line}\n"));
+    }
+    out.push('\n');
+    out
+}
+
 #[derive(Debug)]
 pub(crate) struct TestNode {
     inject_filter: bool,
@@ -451,7 +494,7 @@ impl ExecutionPlan for TestNode {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         self.input.properties()
     }
 
@@ -481,16 +524,21 @@ impl ExecutionPlan for TestNode {
 
     fn gather_filters_for_pushdown(
         &self,
+        _phase: FilterPushdownPhase,
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterDescription> {
-        Ok(FilterDescription::new_with_child_count(1)
-            .all_parent_filters_supported(parent_filters)
-            .with_self_filter(Arc::clone(&self.predicate)))
+        // Since TestNode marks all parent filters as supported and adds its own filter,
+        // we use from_child to create a description with all parent filters supported
+        let child = &self.input;
+        let child_desc = ChildFilterDescription::from_child(&parent_filters, child)?
+            .with_self_filter(Arc::clone(&self.predicate));
+        Ok(FilterDescription::new().with_child(child_desc))
     }
 
     fn handle_child_pushdown_result(
         &self,
+        _phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
@@ -502,29 +550,41 @@ impl ExecutionPlan for TestNode {
             let self_pushdown_result = child_pushdown_result.self_filters[0].clone();
             // And pushed down 1 filter
             assert_eq!(self_pushdown_result.len(), 1);
-            let self_pushdown_result = self_pushdown_result.into_inner();
+            let self_pushdown_result: Vec<_> = self_pushdown_result.into_iter().collect();
+
+            let first_pushdown_result = self_pushdown_result[0].clone();
 
-            match &self_pushdown_result[0] {
-                PredicateSupport::Unsupported(filter) => {
+            match &first_pushdown_result.discriminant {
+                PushedDown::No => {
                     // We have a filter to push down
-                    let new_child =
-                        FilterExec::try_new(Arc::clone(filter), Arc::clone(&self.input))?;
+                    let new_child = FilterExec::try_new(
+                        Arc::clone(&first_pushdown_result.predicate),
+                        Arc::clone(&self.input),
+                    )?;
                     let new_self =
                         TestNode::new(false, Arc::new(new_child), self.predicate.clone());
                     let mut res =
-                        FilterPushdownPropagation::transparent(child_pushdown_result);
+                        FilterPushdownPropagation::if_all(child_pushdown_result);
                     res.updated_node = Some(Arc::new(new_self) as Arc<dyn ExecutionPlan>);
                     Ok(res)
                 }
-                PredicateSupport::Supported(_) => {
-                    let res =
-                        FilterPushdownPropagation::transparent(child_pushdown_result);
+                PushedDown::Yes => {
+                    let res = FilterPushdownPropagation::if_all(child_pushdown_result);
                     Ok(res)
                 }
             }
         } else {
-            let res = FilterPushdownPropagation::transparent(child_pushdown_result);
+            let res = FilterPushdownPropagation::if_all(child_pushdown_result);
             Ok(res)
         }
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit the predicate expression
+        f(self.predicate.as_ref())?;
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
diff --git a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
index 71b9757604ecf..cdfed5011696e 100644
--- a/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
+++ b/datafusion/core/tests/physical_optimizer/replace_with_order_preserving_variants.rs
@@ -18,7 +18,9 @@
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
-    check_integrity, create_test_schema3, sort_preserving_merge_exec,
+    check_integrity, coalesce_partitions_exec, create_test_schema3,
+    parquet_exec_with_sort, sort_exec, sort_exec_with_preserve_partitioning,
+    sort_preserving_merge_exec, sort_preserving_merge_exec_with_fetch,
     stream_exec_ordered_with_projection,
 };
 
@@ -27,1101 +29,1044 @@ use arrow::array::{ArrayRef, Int32Array};
 use arrow::compute::SortOptions;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
+use insta::{allow_duplicates, assert_snapshot};
+use datafusion_common::tree_node::{TransformedResult, TreeNode};
+use datafusion_common::{assert_contains, NullEquality, Result};
+use datafusion_common::config::ConfigOptions;
+use datafusion_datasource::source::DataSourceExec;
 use datafusion_execution::TaskContext;
-use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
-use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
-use datafusion_physical_plan::collect;
+use datafusion_expr::{JoinType, Operator};
+use datafusion_physical_expr::expressions::{self, col, Column};
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_optimizer::enforce_sorting::replace_with_order_preserving_variants::{
+    plan_with_order_breaking_variants, plan_with_order_preserving_variants, replace_with_order_preserving_variants, OrderPreservationContext
+};
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode};
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::sorts::sort::SortExec;
 use datafusion_physical_plan::{
-    displayable, get_plan_string, ExecutionPlan, Partitioning,
+    collect, displayable, ExecutionPlan, Partitioning,
 };
-use datafusion::datasource::source::DataSourceExec;
-use datafusion_common::tree_node::{TransformedResult, TreeNode};
-use datafusion_common::{assert_contains, Result};
-use datafusion_expr::{JoinType, Operator};
-use datafusion_physical_expr::expressions::{self, col, Column};
-use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_optimizer::enforce_sorting::replace_with_order_preserving_variants::{plan_with_order_breaking_variants, plan_with_order_preserving_variants, replace_with_order_preserving_variants, OrderPreservationContext};
-use datafusion_common::config::ConfigOptions;
 
-use crate::physical_optimizer::enforce_sorting::parquet_exec_sorted;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use object_store::ObjectStoreExt;
 use object_store::memory::InMemory;
-use object_store::ObjectStore;
 use rstest::rstest;
 use url::Url;
 
-/// Runs the `replace_with_order_preserving_variants` sub-rule and asserts
-/// the plan against the original and expected plans.
-///
-/// # Parameters
-///
-/// * `$EXPECTED_PLAN_LINES`: Expected input plan.
-/// * `EXPECTED_OPTIMIZED_PLAN_LINES`: Optimized plan when the flag
-///   `prefer_existing_sort` is `false`.
-/// * `EXPECTED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES`: Optimized plan when
-///   the flag `prefer_existing_sort` is `true`.
-/// * `$PLAN`: The plan to optimize.
-macro_rules! assert_optimized_prefer_sort_on_off {
-    ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $EXPECTED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $PREFER_EXISTING_SORT: expr, $SOURCE_UNBOUNDED: expr) => {
-        if $PREFER_EXISTING_SORT {
-            assert_optimized!(
-                $EXPECTED_PLAN_LINES,
-                $EXPECTED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES,
-                $PLAN,
-                $PREFER_EXISTING_SORT,
-                $SOURCE_UNBOUNDED
-            );
-        } else {
-            assert_optimized!(
-                $EXPECTED_PLAN_LINES,
-                $EXPECTED_OPTIMIZED_PLAN_LINES,
-                $PLAN,
-                $PREFER_EXISTING_SORT,
-                $SOURCE_UNBOUNDED
-            );
-        }
-    };
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Boundedness {
+    Unbounded,
+    Bounded,
 }
 
-/// Runs the `replace_with_order_preserving_variants` sub-rule and asserts
-/// the plan against the original and expected plans for both bounded and
-/// unbounded cases.
-///
-/// # Parameters
-///
-/// * `EXPECTED_UNBOUNDED_PLAN_LINES`: Expected input unbounded plan.
-/// * `EXPECTED_BOUNDED_PLAN_LINES`: Expected input bounded plan.
-/// * `EXPECTED_UNBOUNDED_OPTIMIZED_PLAN_LINES`: Optimized plan, which is
-///   the same regardless of the value of the `prefer_existing_sort` flag.
-/// * `EXPECTED_BOUNDED_OPTIMIZED_PLAN_LINES`: Optimized plan when the flag
-///   `prefer_existing_sort` is `false` for bounded cases.
-/// * `EXPECTED_BOUNDED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES`: Optimized plan
-///   when the flag `prefer_existing_sort` is `true` for bounded cases.
-/// * `$PLAN`: The plan to optimize.
-/// * `$SOURCE_UNBOUNDED`: Whether the given plan contains an unbounded source.
-macro_rules! assert_optimized_in_all_boundedness_situations {
-    ($EXPECTED_UNBOUNDED_PLAN_LINES: expr,  $EXPECTED_BOUNDED_PLAN_LINES: expr, $EXPECTED_UNBOUNDED_OPTIMIZED_PLAN_LINES: expr, $EXPECTED_BOUNDED_OPTIMIZED_PLAN_LINES: expr, $EXPECTED_BOUNDED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $SOURCE_UNBOUNDED: expr, $PREFER_EXISTING_SORT: expr) => {
-        if $SOURCE_UNBOUNDED {
-            assert_optimized_prefer_sort_on_off!(
-                $EXPECTED_UNBOUNDED_PLAN_LINES,
-                $EXPECTED_UNBOUNDED_OPTIMIZED_PLAN_LINES,
-                $EXPECTED_UNBOUNDED_OPTIMIZED_PLAN_LINES,
-                $PLAN,
-                $PREFER_EXISTING_SORT,
-                $SOURCE_UNBOUNDED
-            );
-        } else {
-            assert_optimized_prefer_sort_on_off!(
-                $EXPECTED_BOUNDED_PLAN_LINES,
-                $EXPECTED_BOUNDED_OPTIMIZED_PLAN_LINES,
-                $EXPECTED_BOUNDED_PREFER_SORT_ON_OPTIMIZED_PLAN_LINES,
-                $PLAN,
-                $PREFER_EXISTING_SORT,
-                $SOURCE_UNBOUNDED
-            );
-        }
-    };
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum SortPreference {
+    PreserveOrder,
+    MaximizeParallelism,
 }
 
-/// Runs the `replace_with_order_preserving_variants` sub-rule and asserts
-/// the plan against the original and expected plans.
-///
-/// # Parameters
-///
-/// * `$EXPECTED_PLAN_LINES`: Expected input plan.
-/// * `$EXPECTED_OPTIMIZED_PLAN_LINES`: Expected optimized plan.
-/// * `$PLAN`: The plan to optimize.
-/// * `$PREFER_EXISTING_SORT`: Value of the `prefer_existing_sort` flag.
-#[macro_export]
-macro_rules! assert_optimized {
-        ($EXPECTED_PLAN_LINES: expr, $EXPECTED_OPTIMIZED_PLAN_LINES: expr, $PLAN: expr, $PREFER_EXISTING_SORT: expr, $SOURCE_UNBOUNDED: expr) => {
-            let physical_plan = $PLAN;
-            let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-            let actual: Vec<&str> = formatted.trim().lines().collect();
-
-            let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES
-                .iter().map(|s| *s).collect();
-
-            assert_eq!(
-                expected_plan_lines, actual,
-                "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n"
-            );
+struct ReplaceTest {
+    plan: Arc<dyn ExecutionPlan>,
+    boundedness: Boundedness,
+    sort_preference: SortPreference,
+}
 
-            let expected_optimized_lines: Vec<&str> = $EXPECTED_OPTIMIZED_PLAN_LINES.iter().map(|s| *s).collect();
+impl ReplaceTest {
+    fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
+        Self {
+            plan,
+            boundedness: Boundedness::Bounded,
+            sort_preference: SortPreference::MaximizeParallelism,
+        }
+    }
 
-            // Run the rule top-down
-            let mut config = ConfigOptions::new();
-            config.optimizer.prefer_existing_sort=$PREFER_EXISTING_SORT;
-            let plan_with_pipeline_fixer = OrderPreservationContext::new_default(physical_plan);
-            let parallel = plan_with_pipeline_fixer.transform_up(|plan_with_pipeline_fixer| replace_with_order_preserving_variants(plan_with_pipeline_fixer, false, false, &config)).data().and_then(check_integrity)?;
-            let optimized_physical_plan = parallel.plan;
+    fn with_boundedness(mut self, boundedness: Boundedness) -> Self {
+        self.boundedness = boundedness;
+        self
+    }
 
-            // Get string representation of the plan
-            let actual = get_plan_string(&optimized_physical_plan);
-            assert_eq!(
-                expected_optimized_lines, actual,
-                "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_optimized_lines:#?}\nactual:\n\n{actual:#?}\n\n"
+    fn with_sort_preference(mut self, sort_preference: SortPreference) -> Self {
+        self.sort_preference = sort_preference;
+        self
+    }
+
+    async fn execute_plan(&self) -> String {
+        let mut config = ConfigOptions::new();
+        config.optimizer.prefer_existing_sort =
+            self.sort_preference == SortPreference::PreserveOrder;
+
+        let plan_with_pipeline_fixer = OrderPreservationContext::new_default(
+            self.plan.clone().reset_state().unwrap(),
+        );
+
+        let parallel = plan_with_pipeline_fixer
+            .transform_up(|plan_with_pipeline_fixer| {
+                replace_with_order_preserving_variants(
+                    plan_with_pipeline_fixer,
+                    false,
+                    false,
+                    &config,
+                )
+            })
+            .data()
+            .and_then(check_integrity)
+            .unwrap();
+
+        let optimized_physical_plan = parallel.plan;
+        let optimized_plan_string = displayable(optimized_physical_plan.as_ref())
+            .indent(true)
+            .to_string();
+
+        if self.boundedness == Boundedness::Bounded {
+            let ctx = SessionContext::new();
+            let object_store = InMemory::new();
+            object_store
+                .put(
+                    &object_store::path::Path::from("file_path"),
+                    bytes::Bytes::from("").into(),
+                )
+                .await
+                .expect("could not create object store");
+            ctx.register_object_store(
+                &Url::parse("test://").unwrap(),
+                Arc::new(object_store),
             );
+            let task_ctx = Arc::new(TaskContext::from(&ctx));
+            let res = collect(optimized_physical_plan, task_ctx).await;
+            assert!(
+                res.is_ok(),
+                "Some errors occurred while executing the optimized physical plan: {:?}\nPlan: {}",
+                res.unwrap_err(),
+                optimized_plan_string
+            );
+        }
+
+        optimized_plan_string
+    }
+
+    async fn run(&self) -> String {
+        let input_plan_string = displayable(self.plan.as_ref()).indent(true).to_string();
 
-            if !$SOURCE_UNBOUNDED {
-                let ctx = SessionContext::new();
-                let object_store = InMemory::new();
-                object_store.put(&object_store::path::Path::from("file_path"), bytes::Bytes::from("").into()).await?;
-                ctx.register_object_store(&Url::parse("test://").unwrap(), Arc::new(object_store));
-                let task_ctx = Arc::new(TaskContext::from(&ctx));
-                let res = collect(optimized_physical_plan, task_ctx).await;
-                assert!(
-                    res.is_ok(),
-                    "Some errors occurred while executing the optimized physical plan: {:?}", res.unwrap_err()
-                );
-            }
-        };
+        let optimized = self.execute_plan().await;
+
+        if input_plan_string == optimized {
+            format!("Input / Optimized:\n{input_plan_string}")
+        } else {
+            format!("Input:\n{input_plan_string}\nOptimized:\n{optimized}")
+        }
     }
+}
 
 #[rstest]
 #[tokio::test]
 // Searches for a simple sort and a repartition just after it, the second repartition with 1 input partition should not be affected
 async fn test_replace_multiple_input_repartition_1(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let sort_exprs: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, sort_exprs.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, sort_exprs.clone()),
     };
     let repartition = repartition_exec_hash(repartition_exec_round_robin(source));
-    let sort = sort_exec(vec![sort_expr("a", &schema)], repartition, true);
-
-    let physical_plan = sort_preserving_merge_exec(vec![sort_expr("a", &schema)], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort = sort_exec_with_preserve_partitioning(sort_exprs.clone(), repartition);
+    let physical_plan = sort_preserving_merge_exec(sort_exprs, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                  StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_with_inter_children_change_only(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr_default("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr_default("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-    let sort = sort_exec(
-        vec![sort_expr_default("a", &coalesce_partitions.schema())],
-        coalesce_partitions,
-        false,
-    );
+    let sort = sort_exec(ordering.clone(), coalesce_partitions);
     let repartition_rr2 = repartition_exec_round_robin(sort);
     let repartition_hash2 = repartition_exec_hash(repartition_rr2);
     let filter = filter_exec(repartition_hash2);
-    let sort2 = sort_exec(vec![sort_expr_default("a", &filter.schema())], filter, true);
-
-    let physical_plan =
-        sort_preserving_merge_exec(vec![sort_expr_default("a", &sort2.schema())], sort2);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  FilterExec: c@1 > 3",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        SortPreservingMergeExec: [a@0 ASC]",
-            "          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  FilterExec: c@1 > 3",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        SortPreservingMergeExec: [a@0 ASC]",
-            "          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort2 = sort_exec_with_preserve_partitioning(ordering.clone(), filter);
+
+    let physical_plan = sort_preserving_merge_exec(ordering, sort2);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC]
+              SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    SortPreservingMergeExec: [a@0 ASC]
+                      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC
+                        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                          StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC]
+              SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC]
+              SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    SortPreservingMergeExec: [a@0 ASC]
+                      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC
+                        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_replace_multiple_input_repartition_2(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let filter = filter_exec(repartition_rr);
     let repartition_hash = repartition_exec_hash(filter);
-    let sort = sort_exec(vec![sort_expr("a", &schema)], repartition_hash, true);
-
-    let physical_plan = sort_preserving_merge_exec(vec![sort_expr("a", &schema)], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded =  [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded =  [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded =  [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort = sort_exec_with_preserve_partitioning(ordering.clone(), repartition_hash);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_replace_multiple_input_repartition_with_extra_steps(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let filter = filter_exec(repartition_hash);
-    let coalesce_batches_exec: Arc<dyn ExecutionPlan> = coalesce_batches_exec(filter);
-    let sort = sort_exec(vec![sort_expr("a", &schema)], coalesce_batches_exec, true);
-
-    let physical_plan = sort_preserving_merge_exec(vec![sort_expr("a", &schema)], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort = sort_exec_with_preserve_partitioning(ordering.clone(), filter);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_replace_multiple_input_repartition_with_extra_steps_2(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
-    let coalesce_batches_exec_1 = coalesce_batches_exec(repartition_rr);
-    let repartition_hash = repartition_exec_hash(coalesce_batches_exec_1);
+    let repartition_hash = repartition_exec_hash(repartition_rr);
     let filter = filter_exec(repartition_hash);
-    let coalesce_batches_exec_2 = coalesce_batches_exec(filter);
-    let sort = sort_exec(vec![sort_expr("a", &schema)], coalesce_batches_exec_2, true);
-
-    let physical_plan = sort_preserving_merge_exec(vec![sort_expr("a", &schema)], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          CoalesceBatchesExec: target_batch_size=8192",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          CoalesceBatchesExec: target_batch_size=8192",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "        CoalesceBatchesExec: target_batch_size=8192",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          CoalesceBatchesExec: target_batch_size=8192",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "        CoalesceBatchesExec: target_batch_size=8192",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort = sort_exec_with_preserve_partitioning(ordering.clone(), filter);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_not_replacing_when_no_need_to_preserve_sorting(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => stream_exec_ordered_with_projection(&schema, ordering),
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let filter = filter_exec(repartition_hash);
-    let coalesce_batches_exec: Arc<dyn ExecutionPlan> = coalesce_batches_exec(filter);
-
-    let physical_plan: Arc<dyn ExecutionPlan> =
-        coalesce_partitions_exec(coalesce_batches_exec);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "CoalescePartitionsExec",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "CoalescePartitionsExec",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "CoalescePartitionsExec",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results same with and without flag, because there is no executor  with ordering requirement
-    let expected_optimized_bounded = [
-            "CoalescePartitionsExec",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = expected_optimized_bounded;
-
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let physical_plan = coalesce_partitions_exec(filter);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            CoalescePartitionsExec
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            CoalescePartitionsExec
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+                // Expected bounded results same with and without flag, because there is no executor  with ordering requirement
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            CoalescePartitionsExec
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
-async fn test_with_multiple_replacable_repartitions(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+async fn test_with_multiple_replaceable_repartitions(
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let filter = filter_exec(repartition_hash);
-    let coalesce_batches = coalesce_batches_exec(filter);
-    let repartition_hash_2 = repartition_exec_hash(coalesce_batches);
-    let sort = sort_exec(vec![sort_expr("a", &schema)], repartition_hash_2, true);
-
-    let physical_plan = sort_preserving_merge_exec(vec![sort_expr("a", &schema)], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      CoalesceBatchesExec: target_batch_size=8192",
-            "        FilterExec: c@1 > 3",
-            "          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      CoalesceBatchesExec: target_batch_size=8192",
-            "        FilterExec: c@1 > 3",
-            "          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      CoalesceBatchesExec: target_batch_size=8192",
-            "        FilterExec: c@1 > 3",
-            "          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    CoalesceBatchesExec: target_batch_size=8192",
-            "      FilterExec: c@1 > 3",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let repartition_hash_2 = repartition_exec_hash(filter);
+    let sort = sort_exec_with_preserve_partitioning(ordering.clone(), repartition_hash_2);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  FilterExec: c@1 > 3
+                    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_not_replace_with_different_orderings(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
+    use datafusion_physical_expr::LexOrdering;
+
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering_a = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering_a)
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering_a),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
-    let sort = sort_exec(
-        vec![sort_expr_default("c", &repartition_hash.schema())],
-        repartition_hash,
-        true,
-    );
+    let ordering_c: LexOrdering =
+        [sort_expr_default("c", &repartition_hash.schema())].into();
+    let sort = sort_exec_with_preserve_partitioning(ordering_c.clone(), repartition_hash);
+    let physical_plan = sort_preserving_merge_exec(ordering_c, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+                // Expected bounded results same with and without flag, because ordering requirement of the executor is
+                // different from the existing ordering.
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
 
-    let physical_plan =
-        sort_preserving_merge_exec(vec![sort_expr_default("c", &sort.schema())], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results same with and without flag, because ordering requirement of the executor is different than the existing ordering.
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = expected_optimized_bounded;
-
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_with_lost_ordering(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering: LexOrdering = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering.clone())
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering.clone()),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-    let physical_plan =
-        sort_exec(vec![sort_expr("a", &schema)], coalesce_partitions, false);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]",
-            "  CoalescePartitionsExec",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]",
-            "  CoalescePartitionsExec",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]",
-            "  CoalescePartitionsExec",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [a@0 ASC NULLS LAST]",
-            "  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST",
-            "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let physical_plan = sort_exec(ordering, coalesce_partitions);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
+              CoalescePartitionsExec
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                  StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
+              CoalescePartitionsExec
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
+              CoalescePartitionsExec
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST
+                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_with_lost_and_kept_ordering(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
+    use datafusion_physical_expr::LexOrdering;
+
     let schema = create_test_schema()?;
-    let sort_exprs = vec![sort_expr("a", &schema)];
-    let source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, sort_exprs)
+    let ordering_a = [sort_expr("a", &schema)].into();
+    let source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, ordering_a)
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, ordering_a),
     };
     let repartition_rr = repartition_exec_round_robin(source);
     let repartition_hash = repartition_exec_hash(repartition_rr);
     let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-    let sort = sort_exec(
-        vec![sort_expr_default("c", &coalesce_partitions.schema())],
-        coalesce_partitions,
-        false,
-    );
+    let ordering_c: LexOrdering =
+        [sort_expr_default("c", &coalesce_partitions.schema())].into();
+    let sort = sort_exec(ordering_c.clone(), coalesce_partitions);
     let repartition_rr2 = repartition_exec_round_robin(sort);
     let repartition_hash2 = repartition_exec_hash(repartition_rr2);
     let filter = filter_exec(repartition_hash2);
-    let sort2 = sort_exec(vec![sort_expr_default("c", &filter.schema())], filter, true);
-
-    let physical_plan =
-        sort_preserving_merge_exec(vec![sort_expr_default("c", &sort2.schema())], sort2);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[c@1 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[c@1 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  FilterExec: c@1 > 3",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        SortExec: expr=[c@1 ASC], preserve_partitioning=[false]",
-            "          CoalescePartitionsExec",
-            "            RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "              RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results with and without flag
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  SortExec: expr=[c@1 ASC], preserve_partitioning=[true]",
-            "    FilterExec: c@1 > 3",
-            "      RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "        RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "          SortExec: expr=[c@1 ASC], preserve_partitioning=[false]",
-            "            CoalescePartitionsExec",
-            "              RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "                RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                  DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = [
-            "SortPreservingMergeExec: [c@1 ASC]",
-            "  FilterExec: c@1 > 3",
-            "    RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "        SortExec: expr=[c@1 ASC], preserve_partitioning=[false]",
-            "          CoalescePartitionsExec",
-            "            RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "              RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "                DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let sort2 = sort_exec_with_preserve_partitioning(ordering_c.clone(), filter);
+    let physical_plan = sort_preserving_merge_exec(ordering_c, sort2);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[c@1 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+
+            Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    SortExec: expr=[c@1 ASC], preserve_partitioning=[false]
+                      CoalescePartitionsExec
+                        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::MaximizeParallelism) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[c@1 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        },
+        (Boundedness::Bounded, SortPreference::PreserveOrder) => {
+            assert_snapshot!(physical_plan, @r"
+            Input:
+            SortPreservingMergeExec: [c@1 ASC]
+              SortExec: expr=[c@1 ASC], preserve_partitioning=[true]
+                FilterExec: c@1 > 3
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      SortExec: expr=[c@1 ASC], preserve_partitioning=[false]
+                        CoalescePartitionsExec
+                          RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                            RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                              DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+
+            Optimized:
+            SortPreservingMergeExec: [c@1 ASC]
+              FilterExec: c@1 > 3
+                RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC
+                  RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                    SortExec: expr=[c@1 ASC], preserve_partitioning=[false]
+                      CoalescePartitionsExec
+                        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+        }
+    }
+    }
+
     Ok(())
 }
 
 #[rstest]
 #[tokio::test]
 async fn test_with_multiple_child_trees(
-    #[values(false, true)] source_unbounded: bool,
-    #[values(false, true)] prefer_existing_sort: bool,
+    #[values(Boundedness::Unbounded, Boundedness::Bounded)] boundedness: Boundedness,
+    #[values(SortPreference::PreserveOrder, SortPreference::MaximizeParallelism)]
+    sort_pref: SortPreference,
 ) -> Result<()> {
     let schema = create_test_schema()?;
 
-    let left_sort_exprs = vec![sort_expr("a", &schema)];
-    let left_source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, left_sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, left_sort_exprs)
+    let left_ordering = [sort_expr("a", &schema)].into();
+    let left_source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, left_ordering)
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, left_ordering),
     };
     let left_repartition_rr = repartition_exec_round_robin(left_source);
     let left_repartition_hash = repartition_exec_hash(left_repartition_rr);
-    let left_coalesce_partitions =
-        Arc::new(CoalesceBatchesExec::new(left_repartition_hash, 4096));
-
-    let right_sort_exprs = vec![sort_expr("a", &schema)];
-    let right_source = if source_unbounded {
-        stream_exec_ordered_with_projection(&schema, right_sort_exprs)
-    } else {
-        memory_exec_sorted(&schema, right_sort_exprs)
+
+    let right_ordering = [sort_expr("a", &schema)].into();
+    let right_source = match boundedness {
+        Boundedness::Unbounded => {
+            stream_exec_ordered_with_projection(&schema, right_ordering)
+        }
+        Boundedness::Bounded => memory_exec_sorted(&schema, right_ordering),
     };
     let right_repartition_rr = repartition_exec_round_robin(right_source);
     let right_repartition_hash = repartition_exec_hash(right_repartition_rr);
-    let right_coalesce_partitions =
-        Arc::new(CoalesceBatchesExec::new(right_repartition_hash, 4096));
-
-    let hash_join_exec =
-        hash_join_exec(left_coalesce_partitions, right_coalesce_partitions);
-    let sort = sort_exec(
-        vec![sort_expr_default("a", &hash_join_exec.schema())],
-        hash_join_exec,
-        true,
-    );
 
-    let physical_plan =
-        sort_preserving_merge_exec(vec![sort_expr_default("a", &sort.schema())], sort);
-
-    // Expected inputs unbounded and bounded
-    let expected_input_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-    let expected_input_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-
-    // Expected unbounded result (same for with and without flag)
-    let expected_optimized_unbounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]",
-        ];
-
-    // Expected bounded results same with and without flag, because ordering get lost during intermediate executor anyway. Hence no need to preserve
-    // existing ordering.
-    let expected_optimized_bounded = [
-            "SortPreservingMergeExec: [a@0 ASC]",
-            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[true]",
-            "    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-            "      CoalesceBatchesExec: target_batch_size=4096",
-            "        RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8",
-            "          RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            "            DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST",
-        ];
-    let expected_optimized_bounded_sort_preserve = expected_optimized_bounded;
-
-    assert_optimized_in_all_boundedness_situations!(
-        expected_input_unbounded,
-        expected_input_bounded,
-        expected_optimized_unbounded,
-        expected_optimized_bounded,
-        expected_optimized_bounded_sort_preserve,
-        physical_plan,
-        source_unbounded,
-        prefer_existing_sort
-    );
+    let hash_join_exec = hash_join_exec(left_repartition_hash, right_repartition_hash);
+    let ordering: LexOrdering = [sort_expr_default("a", &hash_join_exec.schema())].into();
+    let sort = sort_exec_with_preserve_partitioning(ordering.clone(), hash_join_exec);
+    let physical_plan = sort_preserving_merge_exec(ordering, sort);
+
+    let run = ReplaceTest::new(physical_plan)
+        .with_boundedness(boundedness)
+        .with_sort_preference(sort_pref);
+
+    let physical_plan = run.run().await;
+
+    allow_duplicates! {
+    match (boundedness, sort_pref) {
+        (Boundedness::Unbounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC]
+              SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      StreamingTableExec: partition_sizes=1, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST]
+            ");
+        },
+        (Boundedness::Bounded, _) => {
+            assert_snapshot!(physical_plan, @r"
+            Input / Optimized:
+            SortPreservingMergeExec: [a@0 ASC]
+              SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
+                HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, c@1)]
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+                  RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8
+                    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+                      DataSourceExec: partitions=1, partition_sizes=[1], output_ordering=a@0 ASC NULLS LAST
+            ");
+                // Expected bounded results same with and without flag, because ordering get lost during intermediate executor anyway.
+                //  Hence, no need to preserve existing ordering.
+        }
+    }
+    }
+
     Ok(())
 }
 
@@ -1149,18 +1094,6 @@ fn sort_expr_options(
     }
 }
 
-fn sort_exec(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
-    input: Arc<dyn ExecutionPlan>,
-    preserve_partitioning: bool,
-) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(
-        SortExec::new(sort_exprs, input)
-            .with_preserve_partitioning(preserve_partitioning),
-    )
-}
-
 fn repartition_exec_round_robin(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     Arc::new(RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(8)).unwrap())
 }
@@ -1188,14 +1121,6 @@ fn filter_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     Arc::new(FilterExec::try_new(predicate, input).unwrap())
 }
 
-fn coalesce_batches_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalesceBatchesExec::new(input, 8192))
-}
-
-fn coalesce_partitions_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalescePartitionsExec::new(input))
-}
-
 fn hash_join_exec(
     left: Arc<dyn ExecutionPlan>,
     right: Arc<dyn ExecutionPlan>,
@@ -1213,6 +1138,7 @@ fn hash_join_exec(
             &JoinType::Inner,
             None,
             PartitionMode::Partitioned,
+            NullEquality::NullEqualsNothing,
             false,
         )
         .unwrap(),
@@ -1233,7 +1159,7 @@ fn create_test_schema() -> Result<SchemaRef> {
 // projection parameter is given static due to testing needs
 fn memory_exec_sorted(
     schema: &SchemaRef,
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
 ) -> Arc<dyn ExecutionPlan> {
     pub fn make_partition(schema: &SchemaRef, sz: i32) -> RecordBatch {
         let values = (0..sz).collect::<Vec<_>>();
@@ -1249,7 +1175,6 @@ fn memory_exec_sorted(
 
     let rows = 5;
     let partitions = 1;
-    let sort_exprs = sort_exprs.into_iter().collect();
     Arc::new({
         let data: Vec<Vec<_>> = (0..partitions)
             .map(|_| vec![make_partition(schema, rows)])
@@ -1258,7 +1183,7 @@ fn memory_exec_sorted(
         DataSourceExec::new(Arc::new(
             MemorySourceConfig::try_new(&data, schema.clone(), Some(projection))
                 .unwrap()
-                .try_with_sort_information(vec![sort_exprs])
+                .try_with_sort_information(vec![ordering])
                 .unwrap(),
         ))
     })
@@ -1268,12 +1193,11 @@ fn memory_exec_sorted(
 fn test_plan_with_order_preserving_variants_preserves_fetch() -> Result<()> {
     // Create a schema
     let schema = create_test_schema3()?;
-    let parquet_sort_exprs = vec![crate::physical_optimizer::test_utils::sort_expr(
-        "a", &schema,
-    )];
-    let parquet_exec = parquet_exec_sorted(&schema, parquet_sort_exprs);
-    let coalesced =
-        Arc::new(CoalescePartitionsExec::new(parquet_exec.clone()).with_fetch(Some(10)));
+    let parquet_sort_exprs = vec![[sort_expr("a", &schema)].into()];
+    let parquet_exec = parquet_exec_with_sort(schema, parquet_sort_exprs);
+    let coalesced = coalesce_partitions_exec(parquet_exec.clone())
+        .with_fetch(Some(10))
+        .unwrap();
 
     // Test sort's fetch is greater than coalesce fetch, return error because it's not reasonable
     let requirements = OrderPreservationContext::new(
@@ -1286,7 +1210,10 @@ fn test_plan_with_order_preserving_variants_preserves_fetch() -> Result<()> {
         )],
     );
     let res = plan_with_order_preserving_variants(requirements, false, true, Some(15));
-    assert_contains!(res.unwrap_err().to_string(), "CoalescePartitionsExec fetch [10] should be greater than or equal to SortExec fetch [15]");
+    assert_contains!(
+        res.unwrap_err().to_string(),
+        "CoalescePartitionsExec fetch [10] should be greater than or equal to SortExec fetch [15]"
+    );
 
     // Test sort is without fetch, expected to get the fetch value from the coalesced
     let requirements = OrderPreservationContext::new(
@@ -1315,17 +1242,15 @@ fn test_plan_with_order_preserving_variants_preserves_fetch() -> Result<()> {
 #[test]
 fn test_plan_with_order_breaking_variants_preserves_fetch() -> Result<()> {
     let schema = create_test_schema3()?;
-    let parquet_sort_exprs = vec![crate::physical_optimizer::test_utils::sort_expr(
-        "a", &schema,
-    )];
-    let parquet_exec = parquet_exec_sorted(&schema, parquet_sort_exprs.clone());
-    let spm = SortPreservingMergeExec::new(
-        LexOrdering::new(parquet_sort_exprs),
+    let parquet_sort_exprs: LexOrdering = [sort_expr("a", &schema)].into();
+    let parquet_exec = parquet_exec_with_sort(schema, vec![parquet_sort_exprs.clone()]);
+    let spm = sort_preserving_merge_exec_with_fetch(
+        parquet_sort_exprs,
         parquet_exec.clone(),
-    )
-    .with_fetch(Some(10));
+        10,
+    );
     let requirements = OrderPreservationContext::new(
-        Arc::new(spm),
+        spm,
         true,
         vec![OrderPreservationContext::new(
             parquet_exec.clone(),
diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
index a73d084a081f3..217570846d56e 100644
--- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs
+++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use insta::assert_snapshot;
 use std::sync::Arc;
 
 use crate::physical_optimizer::test_utils::{
     bounded_window_exec, global_limit_exec, local_limit_exec, memory_exec,
-    repartition_exec, sort_exec, sort_expr_options, sort_merge_join_exec,
+    projection_exec, repartition_exec, sort_exec, sort_expr, sort_expr_options,
+    sort_merge_join_exec, sort_preserving_merge_exec, union_exec,
 };
 
 use arrow::compute::SortOptions;
@@ -27,13 +29,14 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
 use datafusion::prelude::{CsvReadOptions, SessionContext};
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{JoinType, Result};
-use datafusion_physical_expr::expressions::col;
+use datafusion_common::{JoinType, Result, ScalarValue};
 use datafusion_physical_expr::Partitioning;
-use datafusion_physical_optimizer::sanity_checker::SanityCheckPlan;
+use datafusion_physical_expr::expressions::{Literal, col};
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::sanity_checker::SanityCheckPlan;
 use datafusion_physical_plan::repartition::RepartitionExec;
-use datafusion_physical_plan::{displayable, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, displayable};
 
 use async_trait::async_trait;
 
@@ -397,34 +400,32 @@ fn assert_sanity_check(plan: &Arc<dyn ExecutionPlan>, is_sane: bool) {
     );
 }
 
-/// Check if the plan we created is as expected by comparing the plan
-/// formatted as a string.
-fn assert_plan(plan: &dyn ExecutionPlan, expected_lines: Vec<&str>) {
-    let plan_str = displayable(plan).indent(true).to_string();
-    let actual_lines: Vec<&str> = plan_str.trim().lines().collect();
-    assert_eq!(actual_lines, expected_lines);
-}
-
 #[tokio::test]
 /// Tests that plan is valid when the sort requirements are satisfied.
 async fn test_bounded_window_agg_sort_requirement() -> Result<()> {
     let schema = create_test_schema();
     let source = memory_exec(&schema);
-    let sort_exprs = vec![sort_expr_options(
+    let ordering: LexOrdering = [sort_expr_options(
         "c9",
         &source.schema(),
         SortOptions {
             descending: false,
             nulls_first: false,
         },
-    )];
-    let sort = sort_exec(sort_exprs.clone(), source);
-    let bw = bounded_window_exec("c9", sort_exprs, sort);
-    assert_plan(bw.as_ref(), vec![
-        "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]",
-        "    DataSourceExec: partitions=1, partition_sizes=[0]"
-    ]);
+    )]
+    .into();
+    let sort = sort_exec(ordering.clone(), source);
+    let bw = bounded_window_exec("c9", ordering, sort);
+    let plan_str = displayable(bw.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    "#
+    );
     assert_sanity_check(&bw, true);
     Ok(())
 }
@@ -443,10 +444,15 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> {
         },
     )];
     let bw = bounded_window_exec("c9", sort_exprs, source);
-    assert_plan(bw.as_ref(), vec![
-        "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]",
-        "  DataSourceExec: partitions=1, partition_sizes=[0]"
-    ]);
+    let plan_str = displayable(bw.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    "#
+    );
     // Order requirement of the `BoundedWindowAggExec` is not satisfied. We expect to receive error during sanity check.
     assert_sanity_check(&bw, false);
     Ok(())
@@ -458,14 +464,16 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> {
 async fn test_global_limit_single_partition() -> Result<()> {
     let schema = create_test_schema();
     let source = memory_exec(&schema);
-    let limit = global_limit_exec(source);
-
-    assert_plan(
-        limit.as_ref(),
-        vec![
-            "GlobalLimitExec: skip=0, fetch=100",
-            "  DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let limit = global_limit_exec(source, 0, Some(100));
+
+    let plan_str = displayable(limit.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     assert_sanity_check(&limit, true);
     Ok(())
@@ -477,15 +485,17 @@ async fn test_global_limit_single_partition() -> Result<()> {
 async fn test_global_limit_multi_partition() -> Result<()> {
     let schema = create_test_schema();
     let source = memory_exec(&schema);
-    let limit = global_limit_exec(repartition_exec(source));
-
-    assert_plan(
-        limit.as_ref(),
-        vec![
-            "GlobalLimitExec: skip=0, fetch=100",
-            "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "    DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let limit = global_limit_exec(repartition_exec(source), 0, Some(100));
+
+    let plan_str = displayable(limit.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    GlobalLimitExec: skip=0, fetch=100
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     // Distribution requirement of the `GlobalLimitExec` is not satisfied. We expect to receive error during sanity check.
     assert_sanity_check(&limit, false);
@@ -497,14 +507,16 @@ async fn test_global_limit_multi_partition() -> Result<()> {
 async fn test_local_limit() -> Result<()> {
     let schema = create_test_schema();
     let source = memory_exec(&schema);
-    let limit = local_limit_exec(source);
-
-    assert_plan(
-        limit.as_ref(),
-        vec![
-            "LocalLimitExec: fetch=100",
-            "  DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let limit = local_limit_exec(source, 100);
+
+    let plan_str = displayable(limit.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    LocalLimitExec: fetch=100
+      DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     assert_sanity_check(&limit, true);
     Ok(())
@@ -518,12 +530,12 @@ async fn test_sort_merge_join_satisfied() -> Result<()> {
     let source1 = memory_exec(&schema1);
     let source2 = memory_exec(&schema2);
     let sort_opts = SortOptions::default();
-    let sort_exprs1 = vec![sort_expr_options("c9", &source1.schema(), sort_opts)];
-    let sort_exprs2 = vec![sort_expr_options("a", &source2.schema(), sort_opts)];
-    let left = sort_exec(sort_exprs1, source1);
-    let right = sort_exec(sort_exprs2, source2);
-    let left_jcol = col("c9", &left.schema()).unwrap();
-    let right_jcol = col("a", &right.schema()).unwrap();
+    let ordering1 = [sort_expr_options("c9", &source1.schema(), sort_opts)].into();
+    let ordering2 = [sort_expr_options("a", &source2.schema(), sort_opts)].into();
+    let left = sort_exec(ordering1, source1);
+    let right = sort_exec(ordering2, source2);
+    let left_jcol = col("c9", &left.schema())?;
+    let right_jcol = col("a", &right.schema())?;
     let left = Arc::new(RepartitionExec::try_new(
         left,
         Partitioning::Hash(vec![left_jcol.clone()], 10),
@@ -538,17 +550,19 @@ async fn test_sort_merge_join_satisfied() -> Result<()> {
     let join_ty = JoinType::Inner;
     let smj = sort_merge_join_exec(left, right, &join_on, &join_ty);
 
-    assert_plan(
-        smj.as_ref(),
-        vec![
-            "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]",
-            "  RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1",
-            "    SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]",
-            "      DataSourceExec: partitions=1, partition_sizes=[0]",
-            "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1",
-            "    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-            "      DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let plan_str = displayable(smj.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)]
+      RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     assert_sanity_check(&smj, true);
     Ok(())
@@ -562,15 +576,16 @@ async fn test_sort_merge_join_order_missing() -> Result<()> {
     let schema2 = create_test_schema2();
     let source1 = memory_exec(&schema1);
     let right = memory_exec(&schema2);
-    let sort_exprs1 = vec![sort_expr_options(
+    let ordering1 = [sort_expr_options(
         "c9",
         &source1.schema(),
         SortOptions::default(),
-    )];
-    let left = sort_exec(sort_exprs1, source1);
+    )]
+    .into();
+    let left = sort_exec(ordering1, source1);
     // Missing sort of the right child here..
-    let left_jcol = col("c9", &left.schema()).unwrap();
-    let right_jcol = col("a", &right.schema()).unwrap();
+    let left_jcol = col("c9", &left.schema())?;
+    let right_jcol = col("a", &right.schema())?;
     let left = Arc::new(RepartitionExec::try_new(
         left,
         Partitioning::Hash(vec![left_jcol.clone()], 10),
@@ -585,16 +600,18 @@ async fn test_sort_merge_join_order_missing() -> Result<()> {
     let join_ty = JoinType::Inner;
     let smj = sort_merge_join_exec(left, right, &join_on, &join_ty);
 
-    assert_plan(
-        smj.as_ref(),
-        vec![
-            "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]",
-            "  RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1",
-            "    SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]",
-            "      DataSourceExec: partitions=1, partition_sizes=[0]",
-            "  RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1",
-            "    DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let plan_str = displayable(smj.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)]
+      RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+      RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=1
+        DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     // Order requirement for the `SortMergeJoin` is not satisfied for right child. We expect to receive error during sanity check.
     assert_sanity_check(&smj, false);
@@ -610,16 +627,16 @@ async fn test_sort_merge_join_dist_missing() -> Result<()> {
     let source1 = memory_exec(&schema1);
     let source2 = memory_exec(&schema2);
     let sort_opts = SortOptions::default();
-    let sort_exprs1 = vec![sort_expr_options("c9", &source1.schema(), sort_opts)];
-    let sort_exprs2 = vec![sort_expr_options("a", &source2.schema(), sort_opts)];
-    let left = sort_exec(sort_exprs1, source1);
-    let right = sort_exec(sort_exprs2, source2);
+    let ordering1 = [sort_expr_options("c9", &source1.schema(), sort_opts)].into();
+    let ordering2 = [sort_expr_options("a", &source2.schema(), sort_opts)].into();
+    let left = sort_exec(ordering1, source1);
+    let right = sort_exec(ordering2, source2);
     let right = Arc::new(RepartitionExec::try_new(
         right,
         Partitioning::RoundRobinBatch(10),
     )?);
-    let left_jcol = col("c9", &left.schema()).unwrap();
-    let right_jcol = col("a", &right.schema()).unwrap();
+    let left_jcol = col("c9", &left.schema())?;
+    let right_jcol = col("a", &right.schema())?;
     let left = Arc::new(RepartitionExec::try_new(
         left,
         Partitioning::Hash(vec![left_jcol.clone()], 10),
@@ -631,19 +648,95 @@ async fn test_sort_merge_join_dist_missing() -> Result<()> {
     let join_ty = JoinType::Inner;
     let smj = sort_merge_join_exec(left, right, &join_on, &join_ty);
 
-    assert_plan(
-        smj.as_ref(),
-        vec![
-            "SortMergeJoin: join_type=Inner, on=[(c9@0, a@0)]",
-            "  RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1",
-            "    SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]",
-            "      DataSourceExec: partitions=1, partition_sizes=[0]",
-            "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "    SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
-            "      DataSourceExec: partitions=1, partition_sizes=[0]",
-        ],
+    let plan_str = displayable(smj.as_ref()).indent(true).to_string();
+    let actual = plan_str.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    SortMergeJoinExec: join_type=Inner, on=[(c9@0, a@0)]
+      RepartitionExec: partitioning=Hash([c9@0], 10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[c9@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+        SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
+          DataSourceExec: partitions=1, partition_sizes=[0]
+    "
     );
     // Distribution requirement for the `SortMergeJoin` is not satisfied for right child (has round-robin partitioning). We expect to receive error during sanity check.
     assert_sanity_check(&smj, false);
     Ok(())
 }
+
+/// A particular edge case.
+///
+/// See <https://github.com/apache/datafusion/issues/17372>.
+#[tokio::test]
+async fn test_union_with_sorts_and_constants() -> Result<()> {
+    let schema_in = create_test_schema2();
+
+    let proj_exprs_1 = vec![
+        (
+            Arc::new(Literal::new(ScalarValue::Utf8(Some("foo".to_owned())))) as _,
+            "const_1".to_owned(),
+        ),
+        (
+            Arc::new(Literal::new(ScalarValue::Utf8(Some("foo".to_owned())))) as _,
+            "const_2".to_owned(),
+        ),
+        (col("a", &schema_in).unwrap(), "a".to_owned()),
+    ];
+    let proj_exprs_2 = vec![
+        (
+            Arc::new(Literal::new(ScalarValue::Utf8(Some("foo".to_owned())))) as _,
+            "const_1".to_owned(),
+        ),
+        (
+            Arc::new(Literal::new(ScalarValue::Utf8(Some("bar".to_owned())))) as _,
+            "const_2".to_owned(),
+        ),
+        (col("a", &schema_in).unwrap(), "a".to_owned()),
+    ];
+
+    let source_1 = memory_exec(&schema_in);
+    let source_1 = projection_exec(proj_exprs_1.clone(), source_1).unwrap();
+    let schema_sources = source_1.schema();
+    let ordering_sources: LexOrdering =
+        [sort_expr("a", &schema_sources).nulls_last()].into();
+    let source_1 = sort_exec(ordering_sources.clone(), source_1);
+
+    let source_2 = memory_exec(&schema_in);
+    let source_2 = projection_exec(proj_exprs_2, source_2).unwrap();
+    let source_2 = sort_exec(ordering_sources.clone(), source_2);
+
+    let plan = union_exec(vec![source_1, source_2]);
+
+    let schema_out = plan.schema();
+    let ordering_out: LexOrdering = [
+        sort_expr("const_1", &schema_out).nulls_last(),
+        sort_expr("const_2", &schema_out).nulls_last(),
+        sort_expr("a", &schema_out).nulls_last(),
+    ]
+    .into();
+
+    let plan = sort_preserving_merge_exec(ordering_out, plan);
+
+    let plan_str = displayable(plan.as_ref()).indent(true).to_string();
+    let plan_str = plan_str.trim();
+    assert_snapshot!(
+        plan_str,
+        @r"
+    SortPreservingMergeExec: [const_1@0 ASC NULLS LAST, const_2@1 ASC NULLS LAST, a@2 ASC NULLS LAST]
+      UnionExec
+        SortExec: expr=[a@2 ASC NULLS LAST], preserve_partitioning=[false]
+          ProjectionExec: expr=[foo as const_1, foo as const_2, a@0 as a]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+        SortExec: expr=[a@2 ASC NULLS LAST], preserve_partitioning=[false]
+          ProjectionExec: expr=[foo as const_1, bar as const_2, a@0 as a]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+    "
+    );
+
+    assert_sanity_check(&plan, true);
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs
index 955486a310309..8d9e7b68b8c96 100644
--- a/datafusion/core/tests/physical_optimizer/test_utils.rs
+++ b/datafusion/core/tests/physical_optimizer/test_utils.rs
@@ -18,8 +18,8 @@
 //! Test utilities for physical optimizer tests
 
 use std::any::Any;
-use std::fmt::Formatter;
-use std::sync::Arc;
+use std::fmt::{Display, Formatter};
+use std::sync::{Arc, LazyLock};
 
 use arrow::array::Int32Array;
 use arrow::compute::SortOptions;
@@ -31,49 +31,54 @@ use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::stats::Precision;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
 use datafusion_common::utils::expr::COUNT_STAR_EXPANSION;
-use datafusion_common::{ColumnStatistics, JoinType, Result, Statistics};
+use datafusion_common::{
+    ColumnStatistics, JoinType, NullEquality, Result, Statistics, internal_err,
+};
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::{WindowFrame, WindowFunctionDefinition};
 use datafusion_functions_aggregate::count::count_udaf;
+use datafusion_physical_expr::EquivalenceProperties;
 use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
-use datafusion_physical_expr::expressions::col;
-use datafusion_physical_expr::{expressions, PhysicalExpr};
+use datafusion_physical_expr::expressions::{self, col};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_expr_common::sort_expr::{
-    LexOrdering, LexRequirement, PhysicalSortExpr,
+    LexOrdering, OrderingRequirements, PhysicalSortExpr,
 };
-use datafusion_physical_optimizer::limited_distinct_aggregation::LimitedDistinctAggregation;
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_optimizer::limited_distinct_aggregation::LimitedDistinctAggregation;
 use datafusion_physical_plan::aggregates::{
     AggregateExec, AggregateMode, PhysicalGroupBy,
 };
-use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::{JoinFilter, JoinOn};
 use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec};
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use datafusion_physical_plan::repartition::RepartitionExec;
 use datafusion_physical_plan::sorts::sort::SortExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec};
 use datafusion_physical_plan::tree_node::PlanContext;
 use datafusion_physical_plan::union::UnionExec;
-use datafusion_physical_plan::windows::{create_window_expr, BoundedWindowAggExec};
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, create_window_expr};
 use datafusion_physical_plan::{
-    displayable, DisplayAs, DisplayFormatType, ExecutionPlan, InputOrderMode,
-    Partitioning, PlanProperties,
+    DisplayAs, DisplayFormatType, ExecutionPlan, InputOrderMode, Partitioning,
+    PlanProperties, SortOrderPushdownResult, displayable,
 };
 
 /// Create a non sorted parquet exec
-pub fn parquet_exec(schema: &SchemaRef) -> Arc<DataSourceExec> {
+pub fn parquet_exec(schema: SchemaRef) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema.clone(),
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(schema)),
     )
     .with_file(PartitionedFile::new("x".to_string(), 100))
     .build();
@@ -83,12 +88,12 @@ pub fn parquet_exec(schema: &SchemaRef) -> Arc<DataSourceExec> {
 
 /// Create a single parquet file that is sorted
 pub(crate) fn parquet_exec_with_sort(
+    schema: SchemaRef,
     output_ordering: Vec<LexOrdering>,
 ) -> Arc<DataSourceExec> {
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(ParquetSource::default()),
+        Arc::new(ParquetSource::new(schema)),
     )
     .with_file(PartitionedFile::new("x".to_string(), 100))
     .with_output_ordering(output_ordering)
@@ -104,6 +109,7 @@ fn int64_stats() -> ColumnStatistics {
         max_value: Precision::Exact(1_000_000.into()),
         min_value: Precision::Exact(0.into()),
         distinct_count: Precision::Absent,
+        byte_size: Precision::Absent,
     }
 }
 
@@ -125,52 +131,60 @@ pub(crate) fn parquet_exec_with_stats(file_size: u64) -> Arc<DataSourceExec> {
 
     let config = FileScanConfigBuilder::new(
         ObjectStoreUrl::parse("test:///").unwrap(),
-        schema(),
-        Arc::new(ParquetSource::new(Default::default())),
+        Arc::new(ParquetSource::new(schema())),
     )
     .with_file(PartitionedFile::new("x".to_string(), file_size))
     .with_statistics(statistics)
     .build();
 
-    assert_eq!(
-        config.file_source.statistics().unwrap().num_rows,
-        Precision::Inexact(10000)
-    );
+    assert_eq!(config.statistics().num_rows, Precision::Inexact(10000));
     DataSourceExec::from_data_source(config)
 }
 
 pub fn schema() -> SchemaRef {
-    Arc::new(Schema::new(vec![
-        Field::new("a", DataType::Int64, true),
-        Field::new("b", DataType::Int64, true),
-        Field::new("c", DataType::Int64, true),
-        Field::new("d", DataType::Int32, true),
-        Field::new("e", DataType::Boolean, true),
-    ]))
+    static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, true),
+            Field::new("b", DataType::Int64, true),
+            Field::new("c", DataType::Int64, true),
+            Field::new("d", DataType::Int32, true),
+            Field::new("e", DataType::Boolean, true),
+        ]))
+    });
+    Arc::clone(&SCHEMA)
 }
 
 pub fn create_test_schema() -> Result<SchemaRef> {
-    let nullable_column = Field::new("nullable_col", DataType::Int32, true);
-    let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false);
-    let schema = Arc::new(Schema::new(vec![nullable_column, non_nullable_column]));
+    static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+        let nullable_column = Field::new("nullable_col", DataType::Int32, true);
+        let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false);
+        Arc::new(Schema::new(vec![nullable_column, non_nullable_column]))
+    });
+    let schema = Arc::clone(&SCHEMA);
     Ok(schema)
 }
 
 pub fn create_test_schema2() -> Result<SchemaRef> {
-    let col_a = Field::new("col_a", DataType::Int32, true);
-    let col_b = Field::new("col_b", DataType::Int32, true);
-    let schema = Arc::new(Schema::new(vec![col_a, col_b]));
+    static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+        let col_a = Field::new("col_a", DataType::Int32, true);
+        let col_b = Field::new("col_b", DataType::Int32, true);
+        Arc::new(Schema::new(vec![col_a, col_b]))
+    });
+    let schema = Arc::clone(&SCHEMA);
     Ok(schema)
 }
 
 // Generate a schema which consists of 5 columns (a, b, c, d, e)
 pub fn create_test_schema3() -> Result<SchemaRef> {
-    let a = Field::new("a", DataType::Int32, true);
-    let b = Field::new("b", DataType::Int32, false);
-    let c = Field::new("c", DataType::Int32, true);
-    let d = Field::new("d", DataType::Int32, false);
-    let e = Field::new("e", DataType::Int32, false);
-    let schema = Arc::new(Schema::new(vec![a, b, c, d, e]));
+    static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
+        let a = Field::new("a", DataType::Int32, true);
+        let b = Field::new("b", DataType::Int32, false);
+        let c = Field::new("c", DataType::Int32, true);
+        let d = Field::new("d", DataType::Int32, false);
+        let e = Field::new("e", DataType::Int32, false);
+        Arc::new(Schema::new(vec![a, b, c, d, e]))
+    });
+    let schema = Arc::clone(&SCHEMA);
     Ok(schema)
 }
 
@@ -188,7 +202,7 @@ pub fn sort_merge_join_exec(
             None,
             *join_type,
             vec![SortOptions::default(); join_on.len()],
-            false,
+            NullEquality::NullEqualsNothing,
         )
         .unwrap(),
     )
@@ -234,7 +248,8 @@ pub fn hash_join_exec(
         join_type,
         None,
         PartitionMode::Partitioned,
-        true,
+        NullEquality::NullEqualsNothing,
+        false,
     )?))
 }
 
@@ -243,17 +258,28 @@ pub fn bounded_window_exec(
     sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
     input: Arc<dyn ExecutionPlan>,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs: LexOrdering = sort_exprs.into_iter().collect();
+    bounded_window_exec_with_partition(col_name, sort_exprs, &[], input)
+}
+
+pub fn bounded_window_exec_with_partition(
+    col_name: &str,
+    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    partition_by: &[Arc<dyn PhysicalExpr>],
+    input: Arc<dyn ExecutionPlan>,
+) -> Arc<dyn ExecutionPlan> {
+    let sort_exprs = sort_exprs.into_iter().collect::<Vec<_>>();
     let schema = input.schema();
     let window_expr = create_window_expr(
         &WindowFunctionDefinition::AggregateUDF(count_udaf()),
         "count".to_owned(),
         &[col(col_name, &schema).unwrap()],
-        &[],
-        sort_exprs.as_ref(),
+        partition_by,
+        &sort_exprs,
         Arc::new(WindowFrame::new(Some(false))),
-        schema.as_ref(),
+        schema,
+        false,
         false,
+        None,
     )
     .unwrap();
 
@@ -276,36 +302,37 @@ pub fn filter_exec(
 }
 
 pub fn sort_preserving_merge_exec(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
     input: Arc<dyn ExecutionPlan>,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(SortPreservingMergeExec::new(sort_exprs, input))
+    Arc::new(SortPreservingMergeExec::new(ordering, input))
 }
 
 pub fn sort_preserving_merge_exec_with_fetch(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
     input: Arc<dyn ExecutionPlan>,
     fetch: usize,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(SortPreservingMergeExec::new(sort_exprs, input).with_fetch(Some(fetch)))
+    Arc::new(SortPreservingMergeExec::new(ordering, input).with_fetch(Some(fetch)))
 }
 
 pub fn union_exec(input: Vec<Arc<dyn ExecutionPlan>>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(UnionExec::new(input))
-}
-
-pub fn limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    global_limit_exec(local_limit_exec(input))
+    UnionExec::try_new(input).unwrap()
 }
 
-pub fn local_limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(LocalLimitExec::new(input, 100))
+pub fn local_limit_exec(
+    input: Arc<dyn ExecutionPlan>,
+    fetch: usize,
+) -> Arc<dyn ExecutionPlan> {
+    Arc::new(LocalLimitExec::new(input, fetch))
 }
 
-pub fn global_limit_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(GlobalLimitExec::new(input, 0, Some(100)))
+pub fn global_limit_exec(
+    input: Arc<dyn ExecutionPlan>,
+    skip: usize,
+    fetch: Option<usize>,
+) -> Arc<dyn ExecutionPlan> {
+    Arc::new(GlobalLimitExec::new(input, skip, fetch))
 }
 
 pub fn repartition_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
@@ -335,30 +362,43 @@ pub fn aggregate_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     )
 }
 
-pub fn coalesce_batches_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    Arc::new(CoalesceBatchesExec::new(input, 128))
+pub fn sort_exec(
+    ordering: LexOrdering,
+    input: Arc<dyn ExecutionPlan>,
+) -> Arc<dyn ExecutionPlan> {
+    sort_exec_with_fetch(ordering, None, input)
 }
 
-pub fn sort_exec(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+pub fn sort_exec_with_preserve_partitioning(
+    ordering: LexOrdering,
     input: Arc<dyn ExecutionPlan>,
 ) -> Arc<dyn ExecutionPlan> {
-    sort_exec_with_fetch(sort_exprs, None, input)
+    Arc::new(SortExec::new(ordering, input).with_preserve_partitioning(true))
 }
 
 pub fn sort_exec_with_fetch(
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
     fetch: Option<usize>,
     input: Arc<dyn ExecutionPlan>,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-    Arc::new(SortExec::new(sort_exprs, input).with_fetch(fetch))
+    Arc::new(SortExec::new(ordering, input).with_fetch(fetch))
+}
+
+pub fn projection_exec(
+    expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
+    input: Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    let proj_exprs: Vec<ProjectionExpr> = expr
+        .into_iter()
+        .map(|(expr, alias)| ProjectionExpr { expr, alias })
+        .collect();
+    Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
 }
 
 /// A test [`ExecutionPlan`] whose requirements can be configured.
 #[derive(Debug)]
 pub struct RequirementsTestExec {
-    required_input_ordering: LexOrdering,
+    required_input_ordering: Option<LexOrdering>,
     maintains_input_order: bool,
     input: Arc<dyn ExecutionPlan>,
 }
@@ -366,7 +406,7 @@ pub struct RequirementsTestExec {
 impl RequirementsTestExec {
     pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
         Self {
-            required_input_ordering: LexOrdering::default(),
+            required_input_ordering: None,
             maintains_input_order: true,
             input,
         }
@@ -375,7 +415,7 @@ impl RequirementsTestExec {
     /// sets the required input ordering
     pub fn with_required_input_ordering(
         mut self,
-        required_input_ordering: LexOrdering,
+        required_input_ordering: Option<LexOrdering>,
     ) -> Self {
         self.required_input_ordering = required_input_ordering;
         self
@@ -416,13 +456,16 @@ impl ExecutionPlan for RequirementsTestExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         self.input.properties()
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
-        let requirement = LexRequirement::from(self.required_input_ordering.clone());
-        vec![Some(requirement)]
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
+        vec![
+            self.required_input_ordering
+                .as_ref()
+                .map(|ordering| OrderingRequirements::from(ordering.clone())),
+        ]
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -451,6 +494,20 @@ impl ExecutionPlan for RequirementsTestExec {
     ) -> Result<SendableRecordBatchStream> {
         unimplemented!("Test exec does not support execution")
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in required_input_ordering if present
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = &self.required_input_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 /// A [`PlanContext`] object is susceptible to being left in an inconsistent state after
@@ -479,13 +536,6 @@ pub fn check_integrity<T: Clone>(context: PlanContext<T>) -> Result<PlanContext<
         .data()
 }
 
-pub fn trim_plan_display(plan: &str) -> Vec<&str> {
-    plan.split('\n')
-        .map(|s| s.trim())
-        .filter(|s| !s.is_empty())
-        .collect()
-}
-
 // construct a stream partition for test purposes
 #[derive(Debug)]
 pub struct TestStreamPartition {
@@ -501,13 +551,28 @@ impl PartitionStream for TestStreamPartition {
     }
 }
 
-/// Create an unbounded stream exec
+/// Create an unbounded stream table without data ordering.
+pub fn stream_exec(schema: &SchemaRef) -> Arc<dyn ExecutionPlan> {
+    Arc::new(
+        StreamingTableExec::try_new(
+            Arc::clone(schema),
+            vec![Arc::new(TestStreamPartition {
+                schema: Arc::clone(schema),
+            }) as _],
+            None,
+            vec![],
+            true,
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+/// Create an unbounded stream table with data ordering.
 pub fn stream_exec_ordered(
     schema: &SchemaRef,
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
-
     Arc::new(
         StreamingTableExec::try_new(
             Arc::clone(schema),
@@ -515,7 +580,7 @@ pub fn stream_exec_ordered(
                 schema: Arc::clone(schema),
             }) as _],
             None,
-            vec![sort_exprs],
+            vec![ordering],
             true,
             None,
         )
@@ -523,12 +588,11 @@ pub fn stream_exec_ordered(
     )
 }
 
-// Creates a stream exec source for the test purposes
+/// Create an unbounded stream table with data ordering and built-in projection.
 pub fn stream_exec_ordered_with_projection(
     schema: &SchemaRef,
-    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ordering: LexOrdering,
 ) -> Arc<dyn ExecutionPlan> {
-    let sort_exprs = sort_exprs.into_iter().collect();
     let projection: Vec<usize> = vec![0, 2, 3];
 
     Arc::new(
@@ -538,7 +602,7 @@ pub fn stream_exec_ordered_with_projection(
                 schema: Arc::clone(schema),
             }) as _],
             Some(&projection),
-            vec![sort_exprs],
+            vec![ordering],
             true,
             None,
         )
@@ -585,25 +649,15 @@ pub fn build_group_by(input_schema: &SchemaRef, columns: Vec<String>) -> Physica
     PhysicalGroupBy::new_single(group_by_expr.clone())
 }
 
-pub fn assert_plan_matches_expected(
-    plan: &Arc<dyn ExecutionPlan>,
-    expected: &[&str],
-) -> Result<()> {
-    let expected_lines: Vec<&str> = expected.to_vec();
+pub fn get_optimized_plan(plan: &Arc<dyn ExecutionPlan>) -> Result<String> {
     let config = ConfigOptions::new();
 
     let optimized =
         LimitedDistinctAggregation::new().optimize(Arc::clone(plan), &config)?;
 
     let optimized_result = displayable(optimized.as_ref()).indent(true).to_string();
-    let actual_lines = trim_plan_display(&optimized_result);
 
-    assert_eq!(
-        &expected_lines, &actual_lines,
-        "\n\nexpected:\n\n{expected_lines:#?}\nactual:\n\n{actual_lines:#?}\n\n"
-    );
-
-    Ok(())
+    Ok(optimized_result)
 }
 
 /// Describe the type of aggregate being tested
@@ -659,3 +713,300 @@ impl TestAggregate {
         }
     }
 }
+
+/// A harness for testing physical optimizers.
+#[derive(Debug)]
+pub struct OptimizationTest {
+    input: Vec<String>,
+    output: Result<Vec<String>, String>,
+}
+
+impl OptimizationTest {
+    pub fn new<O>(
+        input_plan: Arc<dyn ExecutionPlan>,
+        opt: O,
+        enable_sort_pushdown: bool,
+    ) -> Self
+    where
+        O: PhysicalOptimizerRule,
+    {
+        let input = format_execution_plan(&input_plan);
+        let input_schema = input_plan.schema();
+
+        let mut config = ConfigOptions::new();
+        config.optimizer.enable_sort_pushdown = enable_sort_pushdown;
+        let output_result = opt.optimize(input_plan, &config);
+        let output = output_result
+            .and_then(|plan| {
+                if opt.schema_check() && (plan.schema() != input_schema) {
+                    internal_err!(
+                        "Schema mismatch:\n\nBefore:\n{:?}\n\nAfter:\n{:?}",
+                        input_schema,
+                        plan.schema()
+                    )
+                } else {
+                    Ok(plan)
+                }
+            })
+            .map(|plan| format_execution_plan(&plan))
+            .map_err(|e| e.to_string());
+
+        Self { input, output }
+    }
+}
+
+impl Display for OptimizationTest {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "OptimizationTest:")?;
+        writeln!(f, "  input:")?;
+        for line in &self.input {
+            writeln!(f, "    - {line}")?;
+        }
+        writeln!(f, "  output:")?;
+        match &self.output {
+            Ok(output) => {
+                writeln!(f, "    Ok:")?;
+                for line in output {
+                    writeln!(f, "      - {line}")?;
+                }
+            }
+            Err(err) => {
+                writeln!(f, "    Err: {err}")?;
+            }
+        }
+        Ok(())
+    }
+}
+
+pub fn format_execution_plan(plan: &Arc<dyn ExecutionPlan>) -> Vec<String> {
+    format_lines(&displayable(plan.as_ref()).indent(false).to_string())
+}
+
+fn format_lines(s: &str) -> Vec<String> {
+    s.trim().split('\n').map(|s| s.to_string()).collect()
+}
+
+/// Create a simple ProjectionExec with column indices (simplified version)
+pub fn simple_projection_exec(
+    input: Arc<dyn ExecutionPlan>,
+    columns: Vec<usize>,
+) -> Arc<dyn ExecutionPlan> {
+    let schema = input.schema();
+    let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = columns
+        .iter()
+        .map(|&i| {
+            let field = schema.field(i);
+            (
+                Arc::new(expressions::Column::new(field.name(), i))
+                    as Arc<dyn PhysicalExpr>,
+                field.name().to_string(),
+            )
+        })
+        .collect();
+
+    projection_exec(exprs, input).unwrap()
+}
+
+/// Create a ProjectionExec with column aliases
+pub fn projection_exec_with_alias(
+    input: Arc<dyn ExecutionPlan>,
+    columns: Vec<(usize, &str)>,
+) -> Arc<dyn ExecutionPlan> {
+    let schema = input.schema();
+    let exprs: Vec<(Arc<dyn PhysicalExpr>, String)> = columns
+        .iter()
+        .map(|&(i, alias)| {
+            (
+                Arc::new(expressions::Column::new(schema.field(i).name(), i))
+                    as Arc<dyn PhysicalExpr>,
+                alias.to_string(),
+            )
+        })
+        .collect();
+
+    projection_exec(exprs, input).unwrap()
+}
+
+/// Create a sort expression with custom name and index
+pub fn sort_expr_named(name: &str, index: usize) -> PhysicalSortExpr {
+    PhysicalSortExpr {
+        expr: Arc::new(expressions::Column::new(name, index)),
+        options: SortOptions::default(),
+    }
+}
+
+/// A test data source that can display any requested ordering
+/// This is useful for testing sort pushdown behavior
+#[derive(Debug, Clone)]
+pub struct TestScan {
+    schema: SchemaRef,
+    output_ordering: Vec<LexOrdering>,
+    plan_properties: Arc<PlanProperties>,
+    // Store the requested ordering for display
+    requested_ordering: Option<LexOrdering>,
+}
+
+impl TestScan {
+    /// Create a new TestScan with the given schema and output ordering
+    pub fn new(schema: SchemaRef, output_ordering: Vec<LexOrdering>) -> Self {
+        let eq_properties = if !output_ordering.is_empty() {
+            // Convert Vec<LexOrdering> to the format expected by new_with_orderings
+            // We need to extract the inner Vec<PhysicalSortExpr> from each LexOrdering
+            let orderings: Vec<Vec<PhysicalSortExpr>> = output_ordering
+                .iter()
+                .map(|lex_ordering| {
+                    // LexOrdering implements IntoIterator, so we can collect it
+                    lex_ordering.iter().cloned().collect()
+                })
+                .collect();
+
+            EquivalenceProperties::new_with_orderings(Arc::clone(&schema), orderings)
+        } else {
+            EquivalenceProperties::new(Arc::clone(&schema))
+        };
+
+        let plan_properties = PlanProperties::new(
+            eq_properties,
+            Partitioning::UnknownPartitioning(1),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        );
+
+        Self {
+            schema,
+            output_ordering,
+            plan_properties: Arc::new(plan_properties),
+            requested_ordering: None,
+        }
+    }
+
+    /// Create a TestScan with a single output ordering
+    pub fn with_ordering(schema: SchemaRef, ordering: LexOrdering) -> Self {
+        Self::new(schema, vec![ordering])
+    }
+}
+
+impl DisplayAs for TestScan {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "TestScan")?;
+                if !self.output_ordering.is_empty() {
+                    write!(f, ": output_ordering=[")?;
+                    // Format the ordering in a readable way
+                    for (i, sort_expr) in self.output_ordering[0].iter().enumerate() {
+                        if i > 0 {
+                            write!(f, ", ")?;
+                        }
+                        write!(f, "{sort_expr}")?;
+                    }
+                    write!(f, "]")?;
+                }
+                // This is the key part - show what ordering was requested
+                if let Some(ref req) = self.requested_ordering {
+                    write!(f, ", requested_ordering=[")?;
+                    for (i, sort_expr) in req.iter().enumerate() {
+                        if i > 0 {
+                            write!(f, ", ")?;
+                        }
+                        write!(f, "{sort_expr}")?;
+                    }
+                    write!(f, "]")?;
+                }
+                Ok(())
+            }
+            DisplayFormatType::TreeRender => {
+                write!(f, "TestScan")
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for TestScan {
+    fn name(&self) -> &str {
+        "TestScan"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.plan_properties
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if children.is_empty() {
+            Ok(self)
+        } else {
+            internal_err!("TestScan should have no children")
+        }
+    }
+
+    fn execute(
+        &self,
+        _partition: usize,
+        _context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        internal_err!("TestScan is for testing optimizer only, not for execution")
+    }
+
+    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Arc<Statistics>> {
+        Ok(Arc::new(Statistics::new_unknown(&self.schema)))
+    }
+
+    // This is the key method - implement sort pushdown
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // For testing purposes, accept ANY ordering request
+        // and create a new TestScan that shows what was requested
+        let requested_ordering = LexOrdering::new(order.to_vec());
+
+        let mut new_scan = self.clone();
+        new_scan.requested_ordering = requested_ordering;
+
+        // Always return Inexact to keep the Sort node (like Phase 1 behavior)
+        Ok(SortOrderPushdownResult::Inexact {
+            inner: Arc::new(new_scan),
+        })
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in output_ordering
+        let mut tnr = TreeNodeRecursion::Continue;
+        for ordering in &self.output_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+
+        // Visit expressions in requested_ordering if present
+        if let Some(ordering) = &self.requested_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+
+        Ok(tnr)
+    }
+}
+
+/// Helper function to create a TestScan with ordering
+pub fn test_scan_with_ordering(
+    schema: SchemaRef,
+    ordering: LexOrdering,
+) -> Arc<dyn ExecutionPlan> {
+    Arc::new(TestScan::with_ordering(schema, ordering))
+}
diff --git a/datafusion/core/tests/physical_optimizer/window_optimize.rs b/datafusion/core/tests/physical_optimizer/window_optimize.rs
new file mode 100644
index 0000000000000..796f6b6259716
--- /dev/null
+++ b/datafusion/core/tests/physical_optimizer/window_optimize.rs
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(test)]
+mod test {
+    use arrow::array::{Int32Array, RecordBatch};
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion_common::Result;
+    use datafusion_datasource::memory::MemorySourceConfig;
+    use datafusion_datasource::source::DataSourceExec;
+    use datafusion_execution::TaskContext;
+    use datafusion_expr::WindowFrame;
+    use datafusion_functions_aggregate::count::count_udaf;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_expr::expressions::{Column, col};
+    use datafusion_physical_expr::window::PlainAggregateWindowExpr;
+    use datafusion_physical_plan::windows::BoundedWindowAggExec;
+    use datafusion_physical_plan::{ExecutionPlan, InputOrderMode, common};
+    use std::sync::Arc;
+
+    /// Test case for <https://github.com/apache/datafusion/issues/16308>
+    #[tokio::test]
+    async fn test_window_constant_aggregate() -> Result<()> {
+        let source = mock_data()?;
+        let schema = source.schema();
+        let c = Arc::new(Column::new("b", 1));
+        let cnt = AggregateExprBuilder::new(count_udaf(), vec![c])
+            .schema(schema.clone())
+            .alias("t")
+            .build()?;
+        let partition = [col("a", &schema)?];
+        let frame = WindowFrame::new(None);
+        let plain = PlainAggregateWindowExpr::new(
+            Arc::new(cnt),
+            &partition,
+            &[],
+            Arc::new(frame),
+            None,
+        );
+
+        let bounded_agg_exec = BoundedWindowAggExec::try_new(
+            vec![Arc::new(plain)],
+            source,
+            InputOrderMode::Linear,
+            true,
+        )?;
+        let task_ctx = Arc::new(TaskContext::default());
+        common::collect(bounded_agg_exec.execute(0, task_ctx)?).await?;
+
+        Ok(())
+    }
+
+    pub fn mock_data() -> Result<Arc<DataSourceExec>> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]));
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![
+                    Some(1),
+                    Some(1),
+                    Some(3),
+                    Some(2),
+                    Some(1),
+                ])),
+                Arc::new(Int32Array::from(vec![
+                    Some(1),
+                    Some(6),
+                    Some(2),
+                    Some(8),
+                    Some(9),
+                ])),
+            ],
+        )?;
+
+        MemorySourceConfig::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)
+    }
+}
diff --git a/datafusion/core/tests/set_comparison.rs b/datafusion/core/tests/set_comparison.rs
new file mode 100644
index 0000000000000..464d6c937b328
--- /dev/null
+++ b/datafusion/core/tests/set_comparison.rs
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{Int32Array, StringArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use datafusion::prelude::SessionContext;
+use datafusion_common::{Result, assert_batches_eq, assert_contains};
+
+fn build_table(values: &[i32]) -> Result<RecordBatch> {
+    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)]));
+    let array =
+        Arc::new(Int32Array::from(values.to_vec())) as Arc<dyn arrow::array::Array>;
+    RecordBatch::try_new(schema, vec![array]).map_err(Into::into)
+}
+
+#[tokio::test]
+async fn set_comparison_any() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[1, 6, 10])?)?;
+    // Include a NULL in the subquery input to ensure we propagate UNKNOWN correctly.
+    ctx.register_batch("s", {
+        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)]));
+        let array = Arc::new(Int32Array::from(vec![Some(5), None]))
+            as Arc<dyn arrow::array::Array>;
+        RecordBatch::try_new(schema, vec![array])?
+    })?;
+
+    let df = ctx
+        .sql("select v from t where v > any(select v from s)")
+        .await?;
+    let results = df.collect().await?;
+
+    assert_batches_eq!(
+        &["+----+", "| v  |", "+----+", "| 6  |", "| 10 |", "+----+",],
+        &results
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn set_comparison_any_aggregate_subquery() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[1, 7])?)?;
+    ctx.register_batch("s", build_table(&[1, 2, 3])?)?;
+
+    let df = ctx
+        .sql(
+            "select v from t where v > any(select sum(v) from s group by v % 2) order by v",
+        )
+        .await?;
+    let results = df.collect().await?;
+
+    assert_batches_eq!(&["+---+", "| v |", "+---+", "| 7 |", "+---+",], &results);
+    Ok(())
+}
+
+#[tokio::test]
+async fn set_comparison_all_empty() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[1, 6, 10])?)?;
+    ctx.register_batch(
+        "e",
+        RecordBatch::new_empty(Arc::new(Schema::new(vec![Field::new(
+            "v",
+            DataType::Int32,
+            true,
+        )]))),
+    )?;
+
+    let df = ctx
+        .sql("select v from t where v < all(select v from e)")
+        .await?;
+    let results = df.collect().await?;
+
+    assert_batches_eq!(
+        &[
+            "+----+", "| v  |", "+----+", "| 1  |", "| 6  |", "| 10 |", "+----+",
+        ],
+        &results
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn set_comparison_type_mismatch() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[1])?)?;
+    ctx.register_batch("strings", {
+        let schema = Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, true)]));
+        let array = Arc::new(StringArray::from(vec![Some("a"), Some("b")]))
+            as Arc<dyn arrow::array::Array>;
+        RecordBatch::try_new(schema, vec![array])?
+    })?;
+
+    let df = ctx
+        .sql("select v from t where v > any(select s from strings)")
+        .await?;
+    let err = df.collect().await.unwrap_err();
+    assert_contains!(
+        err.to_string(),
+        "expr type Int32 can't cast to Utf8 in SetComparison"
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn set_comparison_multiple_operators() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[1, 2, 3, 4])?)?;
+    ctx.register_batch("s", build_table(&[2, 3])?)?;
+
+    let df = ctx
+        .sql("select v from t where v = any(select v from s) order by v")
+        .await?;
+    let results = df.collect().await?;
+    assert_batches_eq!(
+        &["+---+", "| v |", "+---+", "| 2 |", "| 3 |", "+---+",],
+        &results
+    );
+
+    let df = ctx
+        .sql("select v from t where v != all(select v from s) order by v")
+        .await?;
+    let results = df.collect().await?;
+    assert_batches_eq!(
+        &["+---+", "| v |", "+---+", "| 1 |", "| 4 |", "+---+",],
+        &results
+    );
+
+    let df = ctx
+        .sql("select v from t where v >= all(select v from s) order by v")
+        .await?;
+    let results = df.collect().await?;
+    assert_batches_eq!(
+        &["+---+", "| v |", "+---+", "| 3 |", "| 4 |", "+---+",],
+        &results
+    );
+
+    let df = ctx
+        .sql("select v from t where v <= any(select v from s) order by v")
+        .await?;
+    let results = df.collect().await?;
+    assert_batches_eq!(
+        &[
+            "+---+", "| v |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+",
+        ],
+        &results
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn set_comparison_null_semantics_all() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    ctx.register_batch("t", build_table(&[5])?)?;
+    ctx.register_batch("s", {
+        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, true)]));
+        let array = Arc::new(Int32Array::from(vec![Some(1), None]))
+            as Arc<dyn arrow::array::Array>;
+        RecordBatch::try_new(schema, vec![array])?
+    })?;
+
+    let df = ctx
+        .sql("select v from t where v != all(select v from s)")
+        .await?;
+    let results = df.collect().await?;
+    let row_count: usize = results.iter().map(|batch| batch.num_rows()).sum();
+    assert_eq!(0, row_count);
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates/basic.rs
similarity index 78%
rename from datafusion/core/tests/sql/aggregates.rs
rename to datafusion/core/tests/sql/aggregates/basic.rs
index 52372e01d41ac..d1b376b735ab9 100644
--- a/datafusion/core/tests/sql/aggregates.rs
+++ b/datafusion/core/tests/sql/aggregates/basic.rs
@@ -16,7 +16,10 @@
 // under the License.
 
 use super::*;
-use datafusion::scalar::ScalarValue;
+use datafusion::common::test_util::batches_to_string;
+use datafusion_catalog::MemTable;
+use datafusion_common::ScalarValue;
+use insta::assert_snapshot;
 
 #[tokio::test]
 async fn csv_query_array_agg_distinct() -> Result<()> {
@@ -45,11 +48,11 @@ async fn csv_query_array_agg_distinct() -> Result<()> {
     let column = actual[0].column(0);
     assert_eq!(column.len(), 1);
     let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&column)?;
-    let mut scalars = scalar_vec[0].clone();
+    let mut scalars = scalar_vec[0].as_ref().unwrap().clone();
 
     // workaround lack of Ord of ScalarValue
     let cmp = |a: &ScalarValue, b: &ScalarValue| {
-        a.partial_cmp(b).expect("Can compare ScalarValues")
+        a.try_cmp(b).expect("Can compare ScalarValues")
     };
     scalars.sort_by(cmp);
     assert_eq!(
@@ -321,3 +324,120 @@ async fn test_accumulator_row_accumulator() -> Result<()> {
 
     Ok(())
 }
+
+/// Test that COUNT(DISTINCT) correctly handles dictionary arrays with all null values.
+/// Verifies behavior across both single and multiple partitions.
+#[tokio::test]
+async fn count_distinct_dictionary_all_null_values() -> Result<()> {
+    let n: usize = 5;
+    let num = Arc::new(Int32Array::from_iter(0..n as i32)) as ArrayRef;
+
+    // Create dictionary where all indices point to a null value (index 0)
+    let dict_values = StringArray::from(vec![None, Some("abc")]);
+    let dict_indices = Int32Array::from(vec![0; n]);
+    let dict = DictionaryArray::new(dict_indices, Arc::new(dict_values));
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("num1", DataType::Int32, false),
+        Field::new("num2", DataType::Int32, false),
+        Field::new(
+            "dict",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+        ),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![num.clone(), num.clone(), Arc::new(dict)],
+    )?;
+
+    // Test with single partition
+    let ctx =
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(1));
+    let provider = MemTable::try_new(schema.clone(), vec![vec![batch.clone()]])?;
+    ctx.register_table("t", Arc::new(provider))?;
+
+    let df = ctx
+        .sql("SELECT count(distinct dict) as cnt, count(num2) FROM t GROUP BY num1")
+        .await?;
+    let results = df.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +-----+---------------+
+    | cnt | count(t.num2) |
+    +-----+---------------+
+    | 0   | 1             |
+    | 0   | 1             |
+    | 0   | 1             |
+    | 0   | 1             |
+    | 0   | 1             |
+    +-----+---------------+
+    "
+    );
+
+    // Test with multiple partitions
+    let ctx_multi =
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(2));
+    let provider_multi = MemTable::try_new(schema, vec![vec![batch]])?;
+    ctx_multi.register_table("t", Arc::new(provider_multi))?;
+
+    let df_multi = ctx_multi
+        .sql("SELECT count(distinct dict) as cnt, count(num2) FROM t GROUP BY num1")
+        .await?;
+    let results_multi = df_multi.collect().await?;
+
+    // Results should be identical across partition configurations
+    assert_eq!(
+        batches_to_string(&results),
+        batches_to_string(&results_multi)
+    );
+
+    Ok(())
+}
+
+/// Test COUNT(DISTINCT) with mixed null and non-null dictionary values
+#[tokio::test]
+async fn count_distinct_dictionary_mixed_values() -> Result<()> {
+    let n: usize = 6;
+    let num = Arc::new(Int32Array::from_iter(0..n as i32)) as ArrayRef;
+
+    // Dictionary values array with nulls and non-nulls
+    let dict_values = StringArray::from(vec![None, Some("abc"), Some("def"), None]);
+    // Create indices that point to both null and non-null values
+    let dict_indices = Int32Array::from(vec![0, 1, 2, 0, 1, 3]);
+    let dict = DictionaryArray::new(dict_indices, Arc::new(dict_values));
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("num1", DataType::Int32, false),
+        Field::new(
+            "dict",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+        ),
+    ]));
+
+    let batch = RecordBatch::try_new(schema.clone(), vec![num, Arc::new(dict)])?;
+    let provider = MemTable::try_new(schema, vec![vec![batch]])?;
+    let ctx = SessionContext::new();
+    ctx.register_table("t", Arc::new(provider))?;
+
+    // COUNT(DISTINCT) should only count non-null values "abc" and "def"
+    let df = ctx.sql("SELECT count(distinct dict) FROM t").await?;
+    let results = df.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +------------------------+
+    | count(DISTINCT t.dict) |
+    +------------------------+
+    | 2                      |
+    +------------------------+
+    "
+    );
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/aggregates/dict_nulls.rs b/datafusion/core/tests/sql/aggregates/dict_nulls.rs
new file mode 100644
index 0000000000000..f9e15a71a20f8
--- /dev/null
+++ b/datafusion/core/tests/sql/aggregates/dict_nulls.rs
@@ -0,0 +1,454 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+use datafusion::common::test_util::batches_to_string;
+use insta::assert_snapshot;
+
+/// Comprehensive test for aggregate functions with null values and dictionary columns
+/// Tests COUNT, SUM, MIN, and MEDIAN null handling in single comprehensive test
+#[tokio::test]
+async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
+    let test_data_basic = TestData::new();
+    let test_data_extended = TestData::new_extended();
+    let test_data_min_max = TestData::new_for_min_max();
+    let test_data_median = TestData::new_for_median();
+
+    // Test COUNT null exclusion with basic data
+    let sql_count = "SELECT dict_null_keys, COUNT(value) as cnt FROM t GROUP BY dict_null_keys ORDER BY dict_null_keys NULLS FIRST";
+    let results_count = run_snapshot_test(&test_data_basic, sql_count).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results_count),
+        @r"
+    +----------------+-----+
+    | dict_null_keys | cnt |
+    +----------------+-----+
+    |                | 0   |
+    | group_a        | 2   |
+    | group_b        | 1   |
+    +----------------+-----+
+    "
+    );
+
+    // Test SUM null handling with extended data
+    let sql_sum = "SELECT dict_null_vals, SUM(value) as total FROM t GROUP BY dict_null_vals ORDER BY dict_null_vals NULLS FIRST";
+    let results_sum = run_snapshot_test(&test_data_extended, sql_sum).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results_sum),
+        @r"
+    +----------------+-------+
+    | dict_null_vals | total |
+    +----------------+-------+
+    |                | 4     |
+    | group_x        | 4     |
+    | group_y        | 2     |
+    | group_z        | 5     |
+    +----------------+-------+
+    "
+    );
+
+    // Test MIN null handling with min/max data
+    let sql_min = "SELECT dict_null_keys, MIN(value) as minimum FROM t GROUP BY dict_null_keys ORDER BY dict_null_keys NULLS FIRST";
+    let results_min = run_snapshot_test(&test_data_min_max, sql_min).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results_min),
+        @r"
+    +----------------+---------+
+    | dict_null_keys | minimum |
+    +----------------+---------+
+    |                | 2       |
+    | group_a        | 3       |
+    | group_b        | 1       |
+    | group_c        | 7       |
+    +----------------+---------+
+    "
+    );
+
+    // Test MEDIAN null handling with median data
+    let sql_median = "SELECT dict_null_vals, MEDIAN(value) as median_value FROM t GROUP BY dict_null_vals ORDER BY dict_null_vals NULLS FIRST";
+    let results_median = run_snapshot_test(&test_data_median, sql_median).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results_median),
+        @r"
+    +----------------+--------------+
+    | dict_null_vals | median_value |
+    +----------------+--------------+
+    |                | 3            |
+    | group_x        | 1            |
+    | group_y        | 5            |
+    | group_z        | 7            |
+    +----------------+--------------+
+    ");
+
+    Ok(())
+}
+
+/// Test FIRST_VAL and LAST_VAL with null values and GROUP BY dict with null keys and null values - may return null if first/last value is null (single and multiple partitions)
+#[tokio::test]
+async fn test_first_last_val_null_handling() -> Result<()> {
+    let test_data = TestData::new_for_first_last();
+
+    // Test FIRST_VALUE and LAST_VALUE with window functions over groups
+    let sql = "SELECT dict_null_keys, value, FIRST_VALUE(value) OVER (PARTITION BY dict_null_keys ORDER BY value NULLS FIRST) as first_val, LAST_VALUE(value) OVER (PARTITION BY dict_null_keys ORDER BY value NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_val FROM t ORDER BY dict_null_keys NULLS FIRST, value NULLS FIRST";
+
+    let results_single = run_snapshot_test(&test_data, sql).await?;
+
+    assert_snapshot!(batches_to_string(&results_single), @r"
+    +----------------+-------+-----------+----------+
+    | dict_null_keys | value | first_val | last_val |
+    +----------------+-------+-----------+----------+
+    |                | 1     | 1         | 3        |
+    |                | 3     | 1         | 3        |
+    | group_a        |       |           |          |
+    | group_a        |       |           |          |
+    | group_b        | 2     | 2         | 2        |
+    +----------------+-------+-----------+----------+
+    ");
+
+    Ok(())
+}
+
+/// Test FIRST_VALUE and LAST_VALUE with ORDER BY - comprehensive null handling
+#[tokio::test]
+async fn test_first_last_value_order_by_null_handling() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // Create test data with nulls mixed in
+    let dict_keys = create_test_dict(
+        &[Some("group_a"), Some("group_b"), Some("group_c")],
+        &[Some(0), Some(1), Some(2), Some(0), Some(1)],
+    );
+
+    let values = Int32Array::from(vec![None, Some(10), Some(20), Some(5), None]);
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("dict_group", string_dict_type(), true),
+        Field::new("value", DataType::Int32, true),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(dict_keys), Arc::new(values)],
+    )?;
+
+    let table = MemTable::try_new(schema, vec![vec![batch]])?;
+    ctx.register_table("test_data", Arc::new(table))?;
+
+    // Test all combinations of FIRST_VALUE and LAST_VALUE with null handling
+    let sql = "SELECT 
+        dict_group,
+        value,
+        FIRST_VALUE(value IGNORE NULLS) OVER (ORDER BY value NULLS LAST) as first_ignore_nulls,
+        FIRST_VALUE(value RESPECT NULLS) OVER (ORDER BY value NULLS FIRST) as first_respect_nulls,
+        LAST_VALUE(value IGNORE NULLS) OVER (ORDER BY value NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_ignore_nulls,
+        LAST_VALUE(value RESPECT NULLS) OVER (ORDER BY value NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_respect_nulls
+    FROM test_data 
+    ORDER BY value NULLS LAST";
+
+    let df = ctx.sql(sql).await?;
+    let results = df.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +------------+-------+--------------------+---------------------+-------------------+--------------------+
+    | dict_group | value | first_ignore_nulls | first_respect_nulls | last_ignore_nulls | last_respect_nulls |
+    +------------+-------+--------------------+---------------------+-------------------+--------------------+
+    | group_a    | 5     | 5                  |                     | 20                |                    |
+    | group_b    | 10    | 5                  |                     | 20                |                    |
+    | group_c    | 20    | 5                  |                     | 20                |                    |
+    | group_a    |       | 5                  |                     | 20                |                    |
+    | group_b    |       | 5                  |                     | 20                |                    |
+    +------------+-------+--------------------+---------------------+-------------------+--------------------+
+    "
+    );
+
+    Ok(())
+}
+
+/// Test GROUP BY with dictionary columns containing null keys and values for FIRST_VALUE/LAST_VALUE
+#[tokio::test]
+async fn test_first_last_value_group_by_dict_nulls() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // Create dictionary with null keys
+    let dict_null_keys = create_test_dict(
+        &[Some("group_a"), Some("group_b")],
+        &[
+            Some(0), // group_a
+            None,    // null key
+            Some(1), // group_b
+            None,    // null key
+            Some(0), // group_a
+        ],
+    );
+
+    // Create dictionary with null values
+    let dict_null_vals = create_test_dict(
+        &[Some("val_x"), None, Some("val_y")],
+        &[
+            Some(0), // val_x
+            Some(1), // null value
+            Some(2), // val_y
+            Some(1), // null value
+            Some(0), // val_x
+        ],
+    );
+
+    // Create test values
+    let values = Int32Array::from(vec![Some(10), Some(20), Some(30), Some(40), Some(50)]);
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("dict_null_keys", string_dict_type(), true),
+        Field::new("dict_null_vals", string_dict_type(), true),
+        Field::new("value", DataType::Int32, true),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(dict_null_keys),
+            Arc::new(dict_null_vals),
+            Arc::new(values),
+        ],
+    )?;
+
+    let table = MemTable::try_new(schema, vec![vec![batch]])?;
+    ctx.register_table("test_data", Arc::new(table))?;
+
+    // Test GROUP BY with null keys
+    let sql = "SELECT 
+        dict_null_keys,
+        FIRST_VALUE(value) as first_val,
+        LAST_VALUE(value) as last_val,
+        COUNT(*) as cnt
+    FROM test_data 
+    GROUP BY dict_null_keys 
+    ORDER BY dict_null_keys NULLS FIRST";
+
+    let df = ctx.sql(sql).await?;
+    let results = df.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +----------------+-----------+----------+-----+
+    | dict_null_keys | first_val | last_val | cnt |
+    +----------------+-----------+----------+-----+
+    |                | 20        | 40       | 2   |
+    | group_a        | 10        | 50       | 2   |
+    | group_b        | 30        | 30       | 1   |
+    +----------------+-----------+----------+-----+
+    "
+    );
+
+    // Test GROUP BY with null values in dictionary
+    let sql2 = "SELECT 
+        dict_null_vals,
+        FIRST_VALUE(value) as first_val,
+        LAST_VALUE(value) as last_val,
+        COUNT(*) as cnt
+    FROM test_data 
+    GROUP BY dict_null_vals 
+    ORDER BY dict_null_vals NULLS FIRST";
+
+    let df2 = ctx.sql(sql2).await?;
+    let results2 = df2.collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&results2),
+        @r"
+    +----------------+-----------+----------+-----+
+    | dict_null_vals | first_val | last_val | cnt |
+    +----------------+-----------+----------+-----+
+    |                | 20        | 40       | 2   |
+    | val_x          | 10        | 50       | 2   |
+    | val_y          | 30        | 30       | 1   |
+    +----------------+-----------+----------+-----+
+    "
+    );
+
+    Ok(())
+}
+
+/// Test MAX with dictionary columns containing null keys and values as specified in the SQL query
+#[tokio::test]
+async fn test_max_with_fuzz_table_dict_nulls() -> Result<()> {
+    let (ctx_single, ctx_multi) = setup_fuzz_test_contexts().await?;
+
+    // Execute the SQL query with MAX aggregations
+    let sql = "SELECT
+        u8_low,
+        dictionary_utf8_low,
+        utf8_low,
+        max(utf8_low) as col1,
+        max(utf8) as col2
+    FROM
+        fuzz_table
+    GROUP BY
+        u8_low,
+        dictionary_utf8_low,
+        utf8_low
+    ORDER BY u8_low, dictionary_utf8_low NULLS FIRST, utf8_low";
+
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +--------+---------------------+----------+-------+---------+
+    | u8_low | dictionary_utf8_low | utf8_low | col1  | col2    |
+    +--------+---------------------+----------+-------+---------+
+    | 1      |                     | str_b    | str_b | value_2 |
+    | 1      | dict_a              | str_a    | str_a | value_5 |
+    | 2      |                     | str_c    | str_c | value_7 |
+    | 2      |                     | str_d    | str_d | value_4 |
+    | 2      | dict_b              | str_c    | str_c | value_3 |
+    | 3      |                     | str_e    | str_e |         |
+    | 3      | dict_c              | str_f    | str_f | value_6 |
+    +--------+---------------------+----------+-------+---------+
+    ");
+
+    Ok(())
+}
+
+/// Test MIN with fuzz table containing dictionary columns with null keys and values and timestamp data (single and multiple partitions)
+#[tokio::test]
+async fn test_min_timestamp_with_fuzz_table_dict_nulls() -> Result<()> {
+    let (ctx_single, ctx_multi) = setup_fuzz_timestamp_test_contexts().await?;
+
+    // Execute the SQL query with MIN aggregation on timestamp
+    let sql = "SELECT
+        utf8_low,
+        u8_low,
+        dictionary_utf8_low,
+        min(timestamp_us) as col1
+    FROM
+        fuzz_table
+    GROUP BY
+        utf8_low,
+        u8_low,
+        dictionary_utf8_low
+    ORDER BY utf8_low, u8_low, dictionary_utf8_low NULLS FIRST";
+
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +----------+--------+---------------------+-------------------------+
+    | utf8_low | u8_low | dictionary_utf8_low | col1                    |
+    +----------+--------+---------------------+-------------------------+
+    | alpha    | 10     | dict_x              | 1970-01-01T00:00:01     |
+    | beta     | 20     |                     | 1970-01-01T00:00:02     |
+    | delta    | 20     |                     | 1970-01-01T00:00:03.500 |
+    | epsilon  | 40     |                     | 1970-01-01T00:00:04     |
+    | gamma    | 30     | dict_y              | 1970-01-01T00:00:02.800 |
+    | zeta     | 30     | dict_z              | 1970-01-01T00:00:02.500 |
+    +----------+--------+---------------------+-------------------------+
+    "
+    );
+
+    Ok(())
+}
+
+/// Test COUNT and COUNT DISTINCT with fuzz table containing dictionary columns with null keys and values (single and multiple partitions)
+#[tokio::test]
+async fn test_count_distinct_with_fuzz_table_dict_nulls() -> Result<()> {
+    let (ctx_single, ctx_multi) = setup_fuzz_count_test_contexts().await?;
+
+    // Execute the SQL query with COUNT and COUNT DISTINCT aggregations
+    let sql = "SELECT
+        u8_low,
+        utf8_low,
+        dictionary_utf8_low,
+        count(duration_nanosecond) as col1,
+        count(DISTINCT large_binary) as col2
+    FROM
+        fuzz_table
+    GROUP BY
+        u8_low,
+        utf8_low,
+        dictionary_utf8_low
+    ORDER BY u8_low, utf8_low, dictionary_utf8_low NULLS FIRST";
+
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +--------+----------+---------------------+------+------+
+    | u8_low | utf8_low | dictionary_utf8_low | col1 | col2 |
+    +--------+----------+---------------------+------+------+
+    | 5      | text_a   | group_alpha         | 3    | 1    |
+    | 10     | text_b   |                     | 1    | 1    |
+    | 10     | text_d   |                     | 2    | 0    |
+    | 15     | text_c   | group_beta          | 1    | 1    |
+    | 20     | text_e   |                     | 0    | 1    |
+    | 25     | text_f   | group_gamma         | 1    | 1    |
+    +--------+----------+---------------------+------+------+
+    "
+    );
+
+    Ok(())
+}
+
+/// Test MEDIAN and MEDIAN DISTINCT with fuzz table containing various numeric types and dictionary columns with null keys and values (single and multiple partitions)
+#[tokio::test]
+async fn test_median_distinct_with_fuzz_table_dict_nulls() -> Result<()> {
+    let (ctx_single, ctx_multi) = setup_fuzz_median_test_contexts().await?;
+
+    // Execute the SQL query with MEDIAN and MEDIAN DISTINCT aggregations
+    let sql = "SELECT
+        u8_low,
+        dictionary_utf8_low,
+        median(DISTINCT u64) as col1,
+        median(DISTINCT u16) as col2,
+        median(u64) as col3,
+        median(decimal128) as col4,
+        median(DISTINCT u32) as col5
+    FROM
+        fuzz_table
+    GROUP BY
+        u8_low,
+        dictionary_utf8_low
+    ORDER BY u8_low, dictionary_utf8_low NULLS FIRST";
+
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+
+    assert_snapshot!(
+        batches_to_string(&results),
+        @r"
+    +--------+---------------------+------+------+------+--------+--------+
+    | u8_low | dictionary_utf8_low | col1 | col2 | col3 | col4   | col5   |
+    +--------+---------------------+------+------+------+--------+--------+
+    | 50     |                     |      | 30   |      | 987.65 | 400000 |
+    | 50     | group_three         | 5000 | 50   | 5000 | 555.55 | 500000 |
+    | 75     |                     | 4000 |      | 4000 |        | 450000 |
+    | 100    | group_one           | 1100 | 11   | 1000 | 123.45 | 110000 |
+    | 100    | group_two           | 1500 | 15   | 1500 | 111.11 | 150000 |
+    | 200    |                     | 2500 | 22   | 2500 | 506.11 | 250000 |
+    +--------+---------------------+------+------+------+--------+--------+
+    "
+    );
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/aggregates/mod.rs b/datafusion/core/tests/sql/aggregates/mod.rs
new file mode 100644
index 0000000000000..ede40d5c4ceca
--- /dev/null
+++ b/datafusion/core/tests/sql/aggregates/mod.rs
@@ -0,0 +1,1026 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Aggregate function tests
+
+use super::*;
+use arrow::{
+    array::{
+        Decimal128Array, DictionaryArray, DurationNanosecondArray, Int32Array,
+        LargeBinaryArray, StringArray, TimestampMicrosecondArray, UInt8Array,
+        UInt16Array, UInt32Array, UInt64Array, types::UInt32Type,
+    },
+    datatypes::{DataType, Field, Schema, TimeUnit},
+    record_batch::RecordBatch,
+};
+use datafusion::{
+    common::{Result, test_util::batches_to_string},
+    execution::{config::SessionConfig, context::SessionContext},
+};
+use datafusion_catalog::MemTable;
+use std::{cmp::min, sync::Arc};
+/// Helper function to create the commonly used UInt32 indexed UTF-8 dictionary data type
+pub fn string_dict_type() -> DataType {
+    DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8))
+}
+
+/// Helper functions for aggregate tests with dictionary columns and nulls
+/// Creates a dictionary array with null values in the dictionary
+pub fn create_test_dict(
+    values: &[Option<&str>],
+    indices: &[Option<u32>],
+) -> DictionaryArray<UInt32Type> {
+    let dict_values = StringArray::from(values.to_vec());
+    let dict_indices = UInt32Array::from(indices.to_vec());
+    DictionaryArray::new(dict_indices, Arc::new(dict_values))
+}
+
+/// Creates test data with both dictionary columns and value column
+pub struct TestData {
+    pub dict_null_keys: DictionaryArray<UInt32Type>,
+    pub dict_null_vals: DictionaryArray<UInt32Type>,
+    pub values: Int32Array,
+    pub schema: Arc<Schema>,
+}
+
+impl TestData {
+    pub fn new() -> Self {
+        // Create dictionary with null keys
+        let dict_null_keys = create_test_dict(
+            &[Some("group_a"), Some("group_b")],
+            &[
+                Some(0), // group_a
+                None,    // null key
+                Some(1), // group_b
+                None,    // null key
+                Some(0), // group_a
+            ],
+        );
+
+        // Create dictionary with null values
+        let dict_null_vals = create_test_dict(
+            &[Some("group_x"), None, Some("group_y")],
+            &[
+                Some(0), // group_x
+                Some(1), // null value
+                Some(2), // group_y
+                Some(1), // null value
+                Some(0), // group_x
+            ],
+        );
+
+        // Create test data with nulls
+        let values = Int32Array::from(vec![Some(1), None, Some(2), None, Some(3)]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("dict_null_keys", string_dict_type(), true),
+            Field::new("dict_null_vals", string_dict_type(), true),
+            Field::new("value", DataType::Int32, true),
+        ]));
+
+        Self {
+            dict_null_keys,
+            dict_null_vals,
+            values,
+            schema,
+        }
+    }
+
+    /// Creates extended test data for more comprehensive testing
+    pub fn new_extended() -> Self {
+        // Create dictionary with null values in the dictionary array
+        let dict_null_vals = create_test_dict(
+            &[Some("group_a"), None, Some("group_b")],
+            &[
+                Some(0), // group_a
+                Some(1), // null value
+                Some(2), // group_b
+                Some(1), // null value
+                Some(0), // group_a
+                Some(1), // null value
+                Some(2), // group_b
+                Some(1), // null value
+            ],
+        );
+
+        // Create dictionary with null keys
+        let dict_null_keys = create_test_dict(
+            &[Some("group_x"), Some("group_y"), Some("group_z")],
+            &[
+                Some(0), // group_x
+                None,    // null key
+                Some(1), // group_y
+                None,    // null key
+                Some(0), // group_x
+                None,    // null key
+                Some(2), // group_z
+                None,    // null key
+            ],
+        );
+
+        // Create test data with nulls
+        let values = Int32Array::from(vec![
+            Some(1),
+            None,
+            Some(2),
+            None,
+            Some(3),
+            Some(4),
+            Some(5),
+            None,
+        ]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("dict_null_vals", string_dict_type(), true),
+            Field::new("dict_null_keys", string_dict_type(), true),
+            Field::new("value", DataType::Int32, true),
+        ]));
+
+        Self {
+            dict_null_keys,
+            dict_null_vals,
+            values,
+            schema,
+        }
+    }
+
+    /// Creates test data for MIN/MAX testing with varied values
+    pub fn new_for_min_max() -> Self {
+        let dict_null_keys = create_test_dict(
+            &[Some("group_a"), Some("group_b"), Some("group_c")],
+            &[
+                Some(0),
+                Some(1),
+                Some(0),
+                Some(2),
+                None,
+                None, // group_a, group_b, group_a, group_c, null, null
+            ],
+        );
+
+        let dict_null_vals = create_test_dict(
+            &[Some("group_x"), None, Some("group_y")],
+            &[
+                Some(0),
+                Some(1),
+                Some(0),
+                Some(2),
+                Some(1),
+                Some(1), // group_x, null, group_x, group_y, null, null
+            ],
+        );
+
+        let values =
+            Int32Array::from(vec![Some(5), Some(1), Some(3), Some(7), Some(2), None]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("dict_null_keys", string_dict_type(), true),
+            Field::new("dict_null_vals", string_dict_type(), true),
+            Field::new("value", DataType::Int32, true),
+        ]));
+
+        Self {
+            dict_null_keys,
+            dict_null_vals,
+            values,
+            schema,
+        }
+    }
+
+    /// Creates test data for MEDIAN testing with varied values
+    pub fn new_for_median() -> Self {
+        let dict_null_vals = create_test_dict(
+            &[Some("group_a"), None, Some("group_b")],
+            &[Some(0), Some(1), Some(2), Some(1), Some(0)],
+        );
+
+        let dict_null_keys = create_test_dict(
+            &[Some("group_x"), Some("group_y"), Some("group_z")],
+            &[Some(0), None, Some(1), None, Some(2)],
+        );
+
+        let values = Int32Array::from(vec![Some(1), None, Some(5), Some(3), Some(7)]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("dict_null_vals", string_dict_type(), true),
+            Field::new("dict_null_keys", string_dict_type(), true),
+            Field::new("value", DataType::Int32, true),
+        ]));
+
+        Self {
+            dict_null_keys,
+            dict_null_vals,
+            values,
+            schema,
+        }
+    }
+
+    /// Creates test data for FIRST_VALUE/LAST_VALUE testing
+    pub fn new_for_first_last() -> Self {
+        let dict_null_keys = create_test_dict(
+            &[Some("group_a"), Some("group_b")],
+            &[Some(0), None, Some(1), None, Some(0)],
+        );
+
+        let dict_null_vals = create_test_dict(
+            &[Some("group_x"), None, Some("group_y")],
+            &[Some(0), Some(1), Some(2), Some(1), Some(0)],
+        );
+
+        let values = Int32Array::from(vec![None, Some(1), Some(2), Some(3), None]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("dict_null_keys", string_dict_type(), true),
+            Field::new("dict_null_vals", string_dict_type(), true),
+            Field::new("value", DataType::Int32, true),
+        ]));
+
+        Self {
+            dict_null_keys,
+            dict_null_vals,
+            values,
+            schema,
+        }
+    }
+}
+
+/// Sets up test contexts for TestData with both single and multiple partitions
+pub async fn setup_test_contexts(
+    test_data: &TestData,
+) -> Result<(SessionContext, SessionContext)> {
+    // Single partition context
+    let ctx_single = create_context_with_partitions(test_data, 1).await?;
+
+    // Multiple partition context
+    let ctx_multi = create_context_with_partitions(test_data, 3).await?;
+
+    Ok((ctx_single, ctx_multi))
+}
+
+/// Creates a session context with the specified number of partitions and registers test data
+pub async fn create_context_with_partitions(
+    test_data: &TestData,
+    num_partitions: usize,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().with_target_partitions(num_partitions),
+    );
+
+    let batches = split_test_data_into_batches(test_data, num_partitions)?;
+    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
+    ctx.register_table("t", Arc::new(provider))?;
+
+    Ok(ctx)
+}
+
+/// Splits test data into multiple batches for partitioning
+pub fn split_test_data_into_batches(
+    test_data: &TestData,
+    num_partitions: usize,
+) -> Result<Vec<Vec<RecordBatch>>> {
+    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
+    let total_len = test_data.values.len();
+    let chunk_size = total_len.div_ceil(num_partitions); // Ensure we cover all data
+
+    let mut batches = Vec::new();
+    let mut start = 0;
+
+    while start < total_len {
+        let end = min(start + chunk_size, total_len);
+        let len = end - start;
+
+        if len > 0 {
+            let batch = RecordBatch::try_new(
+                test_data.schema.clone(),
+                vec![
+                    Arc::new(test_data.dict_null_keys.slice(start, len)),
+                    Arc::new(test_data.dict_null_vals.slice(start, len)),
+                    Arc::new(test_data.values.slice(start, len)),
+                ],
+            )?;
+            batches.push(vec![batch]);
+        }
+        start = end;
+    }
+
+    Ok(batches)
+}
+
+/// Executes a query on both single and multi-partition contexts and verifies consistency
+pub async fn test_query_consistency(
+    ctx_single: &SessionContext,
+    ctx_multi: &SessionContext,
+    sql: &str,
+) -> Result<Vec<RecordBatch>> {
+    let df_single = ctx_single.sql(sql).await?;
+    let results_single = df_single.collect().await?;
+
+    let df_multi = ctx_multi.sql(sql).await?;
+    let results_multi = df_multi.collect().await?;
+
+    // Verify results are consistent between single and multiple partitions
+    assert_eq!(
+        batches_to_string(&results_single),
+        batches_to_string(&results_multi),
+        "Results should be identical between single and multiple partitions"
+    );
+
+    Ok(results_single)
+}
+
+/// Helper function to run snapshot tests with consistent setup, execution, and assertion
+/// This reduces the repetitive pattern of "setup data → SQL → assert_snapshot!"
+pub async fn run_snapshot_test(
+    test_data: &TestData,
+    sql: &str,
+) -> Result<Vec<RecordBatch>> {
+    let (ctx_single, ctx_multi) = setup_test_contexts(test_data).await?;
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+    Ok(results)
+}
+
+/// Test data structure for fuzz table with dictionary columns containing nulls
+pub struct FuzzTestData {
+    pub schema: Arc<Schema>,
+    pub u8_low: UInt8Array,
+    pub dictionary_utf8_low: DictionaryArray<UInt32Type>,
+    pub utf8_low: StringArray,
+    pub utf8: StringArray,
+}
+
+impl FuzzTestData {
+    pub fn new() -> Self {
+        // Create dictionary columns with null keys and values
+        let dictionary_utf8_low = create_test_dict(
+            &[Some("dict_a"), None, Some("dict_b"), Some("dict_c")],
+            &[
+                Some(0), // dict_a
+                Some(1), // null value
+                Some(2), // dict_b
+                None,    // null key
+                Some(0), // dict_a
+                Some(1), // null value
+                Some(3), // dict_c
+                None,    // null key
+            ],
+        );
+
+        let u8_low = UInt8Array::from(vec![
+            Some(1),
+            Some(1),
+            Some(2),
+            Some(2),
+            Some(1),
+            Some(3),
+            Some(3),
+            Some(2),
+        ]);
+
+        let utf8_low = StringArray::from(vec![
+            Some("str_a"),
+            Some("str_b"),
+            Some("str_c"),
+            Some("str_d"),
+            Some("str_a"),
+            Some("str_e"),
+            Some("str_f"),
+            Some("str_c"),
+        ]);
+
+        let utf8 = StringArray::from(vec![
+            Some("value_1"),
+            Some("value_2"),
+            Some("value_3"),
+            Some("value_4"),
+            Some("value_5"),
+            None,
+            Some("value_6"),
+            Some("value_7"),
+        ]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("u8_low", DataType::UInt8, true),
+            Field::new("dictionary_utf8_low", string_dict_type(), true),
+            Field::new("utf8_low", DataType::Utf8, true),
+            Field::new("utf8", DataType::Utf8, true),
+        ]));
+
+        Self {
+            schema,
+            u8_low,
+            dictionary_utf8_low,
+            utf8_low,
+            utf8,
+        }
+    }
+}
+
+/// Sets up test contexts for fuzz table with both single and multiple partitions
+pub async fn setup_fuzz_test_contexts() -> Result<(SessionContext, SessionContext)> {
+    let test_data = FuzzTestData::new();
+
+    // Single partition context
+    let ctx_single = create_fuzz_context_with_partitions(&test_data, 1).await?;
+
+    // Multiple partition context
+    let ctx_multi = create_fuzz_context_with_partitions(&test_data, 3).await?;
+
+    Ok((ctx_single, ctx_multi))
+}
+
+/// Creates a session context with fuzz table partitioned into specified number of partitions
+pub async fn create_fuzz_context_with_partitions(
+    test_data: &FuzzTestData,
+    num_partitions: usize,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().with_target_partitions(num_partitions),
+    );
+
+    let batches = split_fuzz_data_into_batches(test_data, num_partitions)?;
+    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
+    ctx.register_table("fuzz_table", Arc::new(provider))?;
+
+    Ok(ctx)
+}
+
+/// Splits fuzz test data into multiple batches for partitioning
+pub fn split_fuzz_data_into_batches(
+    test_data: &FuzzTestData,
+    num_partitions: usize,
+) -> Result<Vec<Vec<RecordBatch>>> {
+    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
+    let total_len = test_data.u8_low.len();
+    let chunk_size = total_len.div_ceil(num_partitions);
+
+    let mut batches = Vec::new();
+    let mut start = 0;
+
+    while start < total_len {
+        let end = min(start + chunk_size, total_len);
+        let len = end - start;
+
+        if len > 0 {
+            let batch = RecordBatch::try_new(
+                test_data.schema.clone(),
+                vec![
+                    Arc::new(test_data.u8_low.slice(start, len)),
+                    Arc::new(test_data.dictionary_utf8_low.slice(start, len)),
+                    Arc::new(test_data.utf8_low.slice(start, len)),
+                    Arc::new(test_data.utf8.slice(start, len)),
+                ],
+            )?;
+            batches.push(vec![batch]);
+        }
+        start = end;
+    }
+
+    Ok(batches)
+}
+
+/// Test data structure for fuzz table with duration, large_binary and dictionary columns containing nulls
+pub struct FuzzCountTestData {
+    pub schema: Arc<Schema>,
+    pub u8_low: UInt8Array,
+    pub utf8_low: StringArray,
+    pub dictionary_utf8_low: DictionaryArray<UInt32Type>,
+    pub duration_nanosecond: DurationNanosecondArray,
+    pub large_binary: LargeBinaryArray,
+}
+
+impl FuzzCountTestData {
+    pub fn new() -> Self {
+        // Create dictionary columns with null keys and values
+        let dictionary_utf8_low = create_test_dict(
+            &[
+                Some("group_alpha"),
+                None,
+                Some("group_beta"),
+                Some("group_gamma"),
+            ],
+            &[
+                Some(0), // group_alpha
+                Some(1), // null value
+                Some(2), // group_beta
+                None,    // null key
+                Some(0), // group_alpha
+                Some(1), // null value
+                Some(3), // group_gamma
+                None,    // null key
+                Some(2), // group_beta
+                Some(0), // group_alpha
+            ],
+        );
+
+        let u8_low = UInt8Array::from(vec![
+            Some(5),
+            Some(10),
+            Some(15),
+            Some(10),
+            Some(5),
+            Some(20),
+            Some(25),
+            Some(10),
+            Some(15),
+            Some(5),
+        ]);
+
+        let utf8_low = StringArray::from(vec![
+            Some("text_a"),
+            Some("text_b"),
+            Some("text_c"),
+            Some("text_d"),
+            Some("text_a"),
+            Some("text_e"),
+            Some("text_f"),
+            Some("text_d"),
+            Some("text_c"),
+            Some("text_a"),
+        ]);
+
+        // Create duration data with some nulls (nanoseconds)
+        let duration_nanosecond = DurationNanosecondArray::from(vec![
+            Some(1000000000), // 1 second
+            Some(2000000000), // 2 seconds
+            None,             // null duration
+            Some(3000000000), // 3 seconds
+            Some(1500000000), // 1.5 seconds
+            None,             // null duration
+            Some(4000000000), // 4 seconds
+            Some(2500000000), // 2.5 seconds
+            Some(3500000000), // 3.5 seconds
+            Some(1200000000), // 1.2 seconds
+        ]);
+
+        // Create large binary data with some nulls and duplicates
+        let large_binary = LargeBinaryArray::from(vec![
+            Some(b"binary_data_1".as_slice()),
+            Some(b"binary_data_2".as_slice()),
+            Some(b"binary_data_3".as_slice()),
+            None,                              // null binary
+            Some(b"binary_data_1".as_slice()), // duplicate
+            Some(b"binary_data_4".as_slice()),
+            Some(b"binary_data_5".as_slice()),
+            None,                              // null binary
+            Some(b"binary_data_3".as_slice()), // duplicate
+            Some(b"binary_data_1".as_slice()), // duplicate
+        ]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("u8_low", DataType::UInt8, true),
+            Field::new("utf8_low", DataType::Utf8, true),
+            Field::new("dictionary_utf8_low", string_dict_type(), true),
+            Field::new(
+                "duration_nanosecond",
+                DataType::Duration(TimeUnit::Nanosecond),
+                true,
+            ),
+            Field::new("large_binary", DataType::LargeBinary, true),
+        ]));
+
+        Self {
+            schema,
+            u8_low,
+            utf8_low,
+            dictionary_utf8_low,
+            duration_nanosecond,
+            large_binary,
+        }
+    }
+}
+
+/// Sets up test contexts for fuzz table with duration/binary columns and both single and multiple partitions
+pub async fn setup_fuzz_count_test_contexts() -> Result<(SessionContext, SessionContext)>
+{
+    let test_data = FuzzCountTestData::new();
+
+    // Single partition context
+    let ctx_single = create_fuzz_count_context_with_partitions(&test_data, 1).await?;
+
+    // Multiple partition context
+    let ctx_multi = create_fuzz_count_context_with_partitions(&test_data, 3).await?;
+
+    Ok((ctx_single, ctx_multi))
+}
+
+/// Creates a session context with fuzz count table partitioned into specified number of partitions
+pub async fn create_fuzz_count_context_with_partitions(
+    test_data: &FuzzCountTestData,
+    num_partitions: usize,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().with_target_partitions(num_partitions),
+    );
+
+    let batches = split_fuzz_count_data_into_batches(test_data, num_partitions)?;
+    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
+    ctx.register_table("fuzz_table", Arc::new(provider))?;
+
+    Ok(ctx)
+}
+
+/// Splits fuzz count test data into multiple batches for partitioning
+pub fn split_fuzz_count_data_into_batches(
+    test_data: &FuzzCountTestData,
+    num_partitions: usize,
+) -> Result<Vec<Vec<RecordBatch>>> {
+    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
+    let total_len = test_data.u8_low.len();
+    let chunk_size = total_len.div_ceil(num_partitions);
+
+    let mut batches = Vec::new();
+    let mut start = 0;
+
+    while start < total_len {
+        let end = min(start + chunk_size, total_len);
+        let len = end - start;
+
+        if len > 0 {
+            let batch = RecordBatch::try_new(
+                test_data.schema.clone(),
+                vec![
+                    Arc::new(test_data.u8_low.slice(start, len)),
+                    Arc::new(test_data.utf8_low.slice(start, len)),
+                    Arc::new(test_data.dictionary_utf8_low.slice(start, len)),
+                    Arc::new(test_data.duration_nanosecond.slice(start, len)),
+                    Arc::new(test_data.large_binary.slice(start, len)),
+                ],
+            )?;
+            batches.push(vec![batch]);
+        }
+        start = end;
+    }
+
+    Ok(batches)
+}
+
+/// Test data structure for fuzz table with numeric types for median testing and dictionary columns containing nulls
+pub struct FuzzMedianTestData {
+    pub schema: Arc<Schema>,
+    pub u8_low: UInt8Array,
+    pub dictionary_utf8_low: DictionaryArray<UInt32Type>,
+    pub u64: UInt64Array,
+    pub u16: UInt16Array,
+    pub u32: UInt32Array,
+    pub decimal128: Decimal128Array,
+}
+
+impl FuzzMedianTestData {
+    pub fn new() -> Self {
+        // Create dictionary columns with null keys and values
+        let dictionary_utf8_low = create_test_dict(
+            &[
+                Some("group_one"),
+                None,
+                Some("group_two"),
+                Some("group_three"),
+            ],
+            &[
+                Some(0), // group_one
+                Some(1), // null value
+                Some(2), // group_two
+                None,    // null key
+                Some(0), // group_one
+                Some(1), // null value
+                Some(3), // group_three
+                None,    // null key
+                Some(2), // group_two
+                Some(0), // group_one
+                Some(1), // null value
+                Some(3), // group_three
+            ],
+        );
+
+        let u8_low = UInt8Array::from(vec![
+            Some(100),
+            Some(200),
+            Some(100),
+            Some(200),
+            Some(100),
+            Some(50),
+            Some(50),
+            Some(200),
+            Some(100),
+            Some(100),
+            Some(75),
+            Some(50),
+        ]);
+
+        // Create u64 data with some nulls and duplicates for DISTINCT testing
+        let u64 = UInt64Array::from(vec![
+            Some(1000),
+            Some(2000),
+            Some(1500),
+            Some(3000),
+            Some(1000), // duplicate
+            None,       // null
+            Some(5000),
+            Some(2500),
+            Some(1500), // duplicate
+            Some(1200),
+            Some(4000),
+            Some(5000), // duplicate
+        ]);
+
+        // Create u16 data with some nulls and duplicates
+        let u16 = UInt16Array::from(vec![
+            Some(10),
+            Some(20),
+            Some(15),
+            None,     // null
+            Some(10), // duplicate
+            Some(30),
+            Some(50),
+            Some(25),
+            Some(15), // duplicate
+            Some(12),
+            None,     // null
+            Some(50), // duplicate
+        ]);
+
+        // Create u32 data with some nulls and duplicates
+        let u32 = UInt32Array::from(vec![
+            Some(100000),
+            Some(200000),
+            Some(150000),
+            Some(300000),
+            Some(100000), // duplicate
+            Some(400000),
+            Some(500000),
+            None,         // null
+            Some(150000), // duplicate
+            Some(120000),
+            Some(450000),
+            None, // null
+        ]);
+
+        // Create decimal128 data with precision 10, scale 2
+        let decimal128 = Decimal128Array::from(vec![
+            Some(12345), // 123.45
+            Some(67890), // 678.90
+            Some(11111), // 111.11
+            None,        // null
+            Some(12345), // 123.45 duplicate
+            Some(98765), // 987.65
+            Some(55555), // 555.55
+            Some(33333), // 333.33
+            Some(11111), // 111.11 duplicate
+            Some(12500), // 125.00
+            None,        // null
+            Some(55555), // 555.55 duplicate
+        ])
+        .with_precision_and_scale(10, 2)
+        .unwrap();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("u8_low", DataType::UInt8, true),
+            Field::new("dictionary_utf8_low", string_dict_type(), true),
+            Field::new("u64", DataType::UInt64, true),
+            Field::new("u16", DataType::UInt16, true),
+            Field::new("u32", DataType::UInt32, true),
+            Field::new("decimal128", DataType::Decimal128(10, 2), true),
+        ]));
+
+        Self {
+            schema,
+            u8_low,
+            dictionary_utf8_low,
+            u64,
+            u16,
+            u32,
+            decimal128,
+        }
+    }
+}
+
+/// Sets up test contexts for fuzz table with numeric types for median testing and both single and multiple partitions
+pub async fn setup_fuzz_median_test_contexts() -> Result<(SessionContext, SessionContext)>
+{
+    let test_data = FuzzMedianTestData::new();
+
+    // Single partition context
+    let ctx_single = create_fuzz_median_context_with_partitions(&test_data, 1).await?;
+
+    // Multiple partition context
+    let ctx_multi = create_fuzz_median_context_with_partitions(&test_data, 3).await?;
+
+    Ok((ctx_single, ctx_multi))
+}
+
+/// Creates a session context with fuzz median table partitioned into specified number of partitions
+pub async fn create_fuzz_median_context_with_partitions(
+    test_data: &FuzzMedianTestData,
+    num_partitions: usize,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().with_target_partitions(num_partitions),
+    );
+
+    let batches = split_fuzz_median_data_into_batches(test_data, num_partitions)?;
+    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
+    ctx.register_table("fuzz_table", Arc::new(provider))?;
+
+    Ok(ctx)
+}
+
+/// Splits fuzz median test data into multiple batches for partitioning
+pub fn split_fuzz_median_data_into_batches(
+    test_data: &FuzzMedianTestData,
+    num_partitions: usize,
+) -> Result<Vec<Vec<RecordBatch>>> {
+    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
+    let total_len = test_data.u8_low.len();
+    let chunk_size = total_len.div_ceil(num_partitions);
+
+    let mut batches = Vec::new();
+    let mut start = 0;
+
+    while start < total_len {
+        let end = min(start + chunk_size, total_len);
+        let len = end - start;
+
+        if len > 0 {
+            let batch = RecordBatch::try_new(
+                test_data.schema.clone(),
+                vec![
+                    Arc::new(test_data.u8_low.slice(start, len)),
+                    Arc::new(test_data.dictionary_utf8_low.slice(start, len)),
+                    Arc::new(test_data.u64.slice(start, len)),
+                    Arc::new(test_data.u16.slice(start, len)),
+                    Arc::new(test_data.u32.slice(start, len)),
+                    Arc::new(test_data.decimal128.slice(start, len)),
+                ],
+            )?;
+            batches.push(vec![batch]);
+        }
+        start = end;
+    }
+
+    Ok(batches)
+}
+
+/// Test data structure for fuzz table with timestamp and dictionary columns containing nulls
+pub struct FuzzTimestampTestData {
+    pub schema: Arc<Schema>,
+    pub utf8_low: StringArray,
+    pub u8_low: UInt8Array,
+    pub dictionary_utf8_low: DictionaryArray<UInt32Type>,
+    pub timestamp_us: TimestampMicrosecondArray,
+}
+
+impl FuzzTimestampTestData {
+    pub fn new() -> Self {
+        // Create dictionary columns with null keys and values
+        let dictionary_utf8_low = create_test_dict(
+            &[Some("dict_x"), None, Some("dict_y"), Some("dict_z")],
+            &[
+                Some(0), // dict_x
+                Some(1), // null value
+                Some(2), // dict_y
+                None,    // null key
+                Some(0), // dict_x
+                Some(1), // null value
+                Some(3), // dict_z
+                None,    // null key
+                Some(2), // dict_y
+            ],
+        );
+
+        let utf8_low = StringArray::from(vec![
+            Some("alpha"),
+            Some("beta"),
+            Some("gamma"),
+            Some("delta"),
+            Some("alpha"),
+            Some("epsilon"),
+            Some("zeta"),
+            Some("delta"),
+            Some("gamma"),
+        ]);
+
+        let u8_low = UInt8Array::from(vec![
+            Some(10),
+            Some(20),
+            Some(30),
+            Some(20),
+            Some(10),
+            Some(40),
+            Some(30),
+            Some(20),
+            Some(30),
+        ]);
+
+        // Create timestamp data with some nulls
+        let timestamp_us = TimestampMicrosecondArray::from(vec![
+            Some(1000000), // 1970-01-01 00:00:01
+            Some(2000000), // 1970-01-01 00:00:02
+            Some(3000000), // 1970-01-01 00:00:03
+            None,          // null timestamp
+            Some(1500000), // 1970-01-01 00:00:01.5
+            Some(4000000), // 1970-01-01 00:00:04
+            Some(2500000), // 1970-01-01 00:00:02.5
+            Some(3500000), // 1970-01-01 00:00:03.5
+            Some(2800000), // 1970-01-01 00:00:02.8
+        ]);
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("utf8_low", DataType::Utf8, true),
+            Field::new("u8_low", DataType::UInt8, true),
+            Field::new("dictionary_utf8_low", string_dict_type(), true),
+            Field::new(
+                "timestamp_us",
+                DataType::Timestamp(TimeUnit::Microsecond, None),
+                true,
+            ),
+        ]));
+
+        Self {
+            schema,
+            utf8_low,
+            u8_low,
+            dictionary_utf8_low,
+            timestamp_us,
+        }
+    }
+}
+
+/// Sets up test contexts for fuzz table with timestamps and both single and multiple partitions
+pub async fn setup_fuzz_timestamp_test_contexts()
+-> Result<(SessionContext, SessionContext)> {
+    let test_data = FuzzTimestampTestData::new();
+
+    // Single partition context
+    let ctx_single = create_fuzz_timestamp_context_with_partitions(&test_data, 1).await?;
+
+    // Multiple partition context
+    let ctx_multi = create_fuzz_timestamp_context_with_partitions(&test_data, 3).await?;
+
+    Ok((ctx_single, ctx_multi))
+}
+
+/// Creates a session context with fuzz timestamp table partitioned into specified number of partitions
+pub async fn create_fuzz_timestamp_context_with_partitions(
+    test_data: &FuzzTimestampTestData,
+    num_partitions: usize,
+) -> Result<SessionContext> {
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().with_target_partitions(num_partitions),
+    );
+
+    let batches = split_fuzz_timestamp_data_into_batches(test_data, num_partitions)?;
+    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
+    ctx.register_table("fuzz_table", Arc::new(provider))?;
+
+    Ok(ctx)
+}
+
+/// Splits fuzz timestamp test data into multiple batches for partitioning
+pub fn split_fuzz_timestamp_data_into_batches(
+    test_data: &FuzzTimestampTestData,
+    num_partitions: usize,
+) -> Result<Vec<Vec<RecordBatch>>> {
+    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
+    let total_len = test_data.utf8_low.len();
+    let chunk_size = total_len.div_ceil(num_partitions);
+
+    let mut batches = Vec::new();
+    let mut start = 0;
+
+    while start < total_len {
+        let end = min(start + chunk_size, total_len);
+        let len = end - start;
+
+        if len > 0 {
+            let batch = RecordBatch::try_new(
+                test_data.schema.clone(),
+                vec![
+                    Arc::new(test_data.utf8_low.slice(start, len)),
+                    Arc::new(test_data.u8_low.slice(start, len)),
+                    Arc::new(test_data.dictionary_utf8_low.slice(start, len)),
+                    Arc::new(test_data.timestamp_us.slice(start, len)),
+                ],
+            )?;
+            batches.push(vec![batch]);
+        }
+        start = end;
+    }
+
+    Ok(batches)
+}
+
+pub mod basic;
+pub mod dict_nulls;
diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs
index 83712053b9542..4a60a79ff5de3 100644
--- a/datafusion/core/tests/sql/create_drop.rs
+++ b/datafusion/core/tests/sql/create_drop.rs
@@ -61,8 +61,31 @@ async fn create_external_table_with_ddl() -> Result<()> {
     assert_eq!(3, table_schema.fields().len());
 
     assert_eq!(&DataType::Int32, table_schema.field(0).data_type());
-    assert_eq!(&DataType::Utf8, table_schema.field(1).data_type());
+    assert_eq!(&DataType::Utf8View, table_schema.field(1).data_type());
     assert_eq!(&DataType::Boolean, table_schema.field(2).data_type());
 
     Ok(())
 }
+
+#[tokio::test]
+async fn create_drop_table() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    let sql = "CREATE TABLE dt (a_id integer, a_str string, a_bool boolean);";
+    ctx.sql(sql).await.unwrap();
+
+    let cat = ctx.catalog("datafusion").unwrap();
+    let schema = cat.schema("public").unwrap();
+
+    let exists = schema.table_exist("dt");
+    assert!(exists, "Table should have been created!");
+
+    // Drop the table
+    let sql = "DROP TABLE dt;";
+    ctx.sql(sql).await.unwrap();
+
+    let exists = schema.table_exist("dt");
+    assert!(!exists, "Table should have been dropped!");
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs
index 70e94227cfad8..5f62f7204eff1 100644
--- a/datafusion/core/tests/sql/explain_analyze.rs
+++ b/datafusion/core/tests/sql/explain_analyze.rs
@@ -16,11 +16,14 @@
 // under the License.
 
 use super::*;
+use insta::assert_snapshot;
 use rstest::rstest;
 
 use datafusion::config::ConfigOptions;
 use datafusion::physical_plan::display::DisplayableExecutionPlan;
 use datafusion::physical_plan::metrics::Timestamp;
+use datafusion_common::format::ExplainAnalyzeLevel;
+use object_store::path::Path;
 
 #[tokio::test]
 async fn explain_analyze_baseline_metrics() {
@@ -52,42 +55,84 @@ async fn explain_analyze_baseline_metrics() {
     let formatted = arrow::util::pretty::pretty_format_batches(&results)
         .unwrap()
         .to_string();
+
     println!("Query Output:\n\n{formatted}");
 
     assert_metrics!(
         &formatted,
         "AggregateExec: mode=Partial, gby=[]",
-        "metrics=[output_rows=3, elapsed_compute="
+        "metrics=[output_rows=3, elapsed_compute=",
+        "output_bytes=",
+        "output_batches=3"
     );
+
     assert_metrics!(
         &formatted,
-        "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]",
-        "metrics=[output_rows=5, elapsed_compute="
+        "AggregateExec: mode=Partial, gby=[c1@0 as c1]",
+        "reduction_factor=5.1% (5/99)"
     );
+
+    {
+        let expected_batch_count_after_repartition =
+            if cfg!(not(feature = "force_hash_collisions")) {
+                "output_batches=3"
+            } else {
+                "output_batches=1"
+            };
+
+        assert_metrics!(
+            &formatted,
+            "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]",
+            "metrics=[output_rows=5, elapsed_compute=",
+            "output_bytes=",
+            expected_batch_count_after_repartition
+        );
+
+        assert_metrics!(
+            &formatted,
+            "RepartitionExec: partitioning=Hash([c1@0], 3), input_partitions=3",
+            "metrics=[output_rows=5, elapsed_compute=",
+            "output_bytes=",
+            expected_batch_count_after_repartition
+        );
+
+        assert_metrics!(
+            &formatted,
+            "ProjectionExec: expr=[]",
+            "metrics=[output_rows=5, elapsed_compute=",
+            "output_bytes=",
+            expected_batch_count_after_repartition
+        );
+    }
+
     assert_metrics!(
         &formatted,
         "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434",
-        "metrics=[output_rows=99, elapsed_compute="
-    );
-    assert_metrics!(
-        &formatted,
-        "ProjectionExec: expr=[]",
-        "metrics=[output_rows=5, elapsed_compute="
+        "metrics=[output_rows=99, elapsed_compute=",
+        "output_bytes=",
+        "output_batches=1"
     );
+
     assert_metrics!(
         &formatted,
-        "CoalesceBatchesExec: target_batch_size=4096",
-        "metrics=[output_rows=5, elapsed_compute"
+        "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434",
+        "selectivity=99% (99/100)"
     );
+
     assert_metrics!(
         &formatted,
         "UnionExec",
-        "metrics=[output_rows=3, elapsed_compute="
+        "metrics=[output_rows=3, elapsed_compute=",
+        "output_bytes=",
+        "output_batches=3"
     );
+
     assert_metrics!(
         &formatted,
         "WindowAggExec",
-        "metrics=[output_rows=1, elapsed_compute="
+        "metrics=[output_rows=1, elapsed_compute=",
+        "output_bytes=",
+        "output_batches=1"
     );
 
     fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool {
@@ -99,7 +144,6 @@ async fn explain_analyze_baseline_metrics() {
             || plan.as_any().downcast_ref::<physical_plan::filter::FilterExec>().is_some()
             || plan.as_any().downcast_ref::<physical_plan::limit::LocalLimitExec>().is_some()
             || plan.as_any().downcast_ref::<physical_plan::projection::ProjectionExec>().is_some()
-            || plan.as_any().downcast_ref::<physical_plan::coalesce_batches::CoalesceBatchesExec>().is_some()
             || plan.as_any().downcast_ref::<physical_plan::coalesce_partitions::CoalescePartitionsExec>().is_some()
             || plan.as_any().downcast_ref::<physical_plan::union::UnionExec>().is_some()
             || plan.as_any().downcast_ref::<physical_plan::windows::WindowAggExec>().is_some()
@@ -155,6 +199,116 @@ async fn explain_analyze_baseline_metrics() {
 fn nanos_from_timestamp(ts: &Timestamp) -> i64 {
     ts.value().unwrap().timestamp_nanos_opt().unwrap()
 }
+
+// Test different detail level for config `datafusion.explain.analyze_level`
+
+async fn collect_plan_with_context(
+    sql_str: &str,
+    ctx: &SessionContext,
+    level: ExplainAnalyzeLevel,
+) -> String {
+    {
+        let state = ctx.state_ref();
+        let mut state = state.write();
+        state.config_mut().options_mut().explain.analyze_level = level;
+    }
+    let dataframe = ctx.sql(sql_str).await.unwrap();
+    let batches = dataframe.collect().await.unwrap();
+    arrow::util::pretty::pretty_format_batches(&batches)
+        .unwrap()
+        .to_string()
+}
+
+async fn collect_plan(sql_str: &str, level: ExplainAnalyzeLevel) -> String {
+    let ctx = SessionContext::new();
+    collect_plan_with_context(sql_str, &ctx, level).await
+}
+
+#[tokio::test]
+async fn explain_analyze_level() {
+    let sql = "EXPLAIN ANALYZE \
+            SELECT * \
+            FROM generate_series(10) as t1(v1) \
+            ORDER BY v1 DESC";
+
+    for (level, needle, should_contain) in [
+        (ExplainAnalyzeLevel::Summary, "spill_count", false),
+        (ExplainAnalyzeLevel::Summary, "output_batches", false),
+        (ExplainAnalyzeLevel::Summary, "output_rows", true),
+        (ExplainAnalyzeLevel::Summary, "output_bytes", true),
+        (ExplainAnalyzeLevel::Dev, "spill_count", true),
+        (ExplainAnalyzeLevel::Dev, "output_rows", true),
+        (ExplainAnalyzeLevel::Dev, "output_bytes", true),
+        (ExplainAnalyzeLevel::Dev, "output_batches", true),
+    ] {
+        let plan = collect_plan(sql, level).await;
+        assert_eq!(
+            plan.contains(needle),
+            should_contain,
+            "plan for level {level:?} unexpected content: {plan}"
+        );
+    }
+}
+
+#[tokio::test]
+async fn explain_analyze_level_datasource_parquet() {
+    let table_name = "tpch_lineitem_small";
+    let parquet_path = "tests/data/tpch_lineitem_small.parquet";
+    let sql = format!("EXPLAIN ANALYZE SELECT * FROM {table_name}");
+
+    // Register test parquet file into context
+    let ctx = SessionContext::new();
+    ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default())
+        .await
+        .expect("register parquet table for explain analyze test");
+
+    for (level, needle, should_contain) in [
+        (ExplainAnalyzeLevel::Summary, "metadata_load_time", true),
+        (ExplainAnalyzeLevel::Summary, "page_index_eval_time", false),
+        (ExplainAnalyzeLevel::Dev, "metadata_load_time", true),
+        (ExplainAnalyzeLevel::Dev, "page_index_eval_time", true),
+    ] {
+        let plan = collect_plan_with_context(&sql, &ctx, level).await;
+
+        assert_eq!(
+            plan.contains(needle),
+            should_contain,
+            "plan for level {level:?} unexpected content: {plan}"
+        );
+    }
+}
+
+#[tokio::test]
+async fn explain_analyze_parquet_pruning_metrics() {
+    let table_name = "tpch_lineitem_small";
+    let parquet_path = "tests/data/tpch_lineitem_small.parquet";
+    let ctx = SessionContext::new();
+    ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default())
+        .await
+        .expect("register parquet table for explain analyze test");
+
+    // Test scenario:
+    // This table's l_orderkey has range [1, 7]
+    // So the following query can't prune the file:
+    //  select * from tpch_lineitem_small where l_orderkey = 5;
+    // If change filter to `l_orderkey=10`, the whole file can be pruned using stat.
+    for (l_orderkey, expected_pruning_metrics) in
+        [(5, "1 total → 1 matched"), (10, "1 total → 0 matched")]
+    {
+        let sql = format!(
+            "explain analyze select * from {table_name} where l_orderkey = {l_orderkey};"
+        );
+
+        let plan =
+            collect_plan_with_context(&sql, &ctx, ExplainAnalyzeLevel::Summary).await;
+
+        let expected_metrics =
+            format!("files_ranges_pruned_statistics={expected_pruning_metrics}");
+
+        assert_metrics!(&plan, "DataSourceExec", &expected_metrics);
+    }
+}
+
 #[tokio::test]
 async fn csv_explain_plans() {
     // This test verify the look of each plan in its full cycle plan creation
@@ -174,69 +328,66 @@ async fn csv_explain_plans() {
     println!("SQL: {sql}");
     //
     // Verify schema
-    let expected = vec![
-        "Explain [plan_type:Utf8, plan:Utf8]",
-        "  Projection: aggregate_test_100.c1 [c1:Utf8View]",
-        "    Filter: aggregate_test_100.c2 > Int64(10) [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]",
-        "      TableScan: aggregate_test_100 [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]",
-    ];
     let formatted = plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain [plan_type:Utf8, plan:Utf8]
+      Projection: aggregate_test_100.c1 [c1:Utf8View]
+        Filter: aggregate_test_100.c2 > Int64(10) [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]
+          TableScan: aggregate_test_100 [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]
+    "
     );
     //
     // Verify the text format of the plan
-    let expected = vec![
-        "Explain",
-        "  Projection: aggregate_test_100.c1",
-        "    Filter: aggregate_test_100.c2 > Int64(10)",
-        "      TableScan: aggregate_test_100",
-    ];
     let formatted = plan.display_indent().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain
+      Projection: aggregate_test_100.c1
+        Filter: aggregate_test_100.c2 > Int64(10)
+          TableScan: aggregate_test_100
+    "
     );
     //
     // verify the grahviz format of the plan
-    let expected = vec![
-        "// Begin DataFusion GraphViz Plan,",
-        "// display it online here: https://dreampuf.github.io/GraphvizOnline",
-        "",
-        "digraph {",
-        "  subgraph cluster_1",
-        "  {",
-        "    graph[label=\"LogicalPlan\"]",
-        "    2[shape=box label=\"Explain\"]",
-        "    3[shape=box label=\"Projection: aggregate_test_100.c1\"]",
-        "    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    4[shape=box label=\"Filter: aggregate_test_100.c2 > Int64(10)\"]",
-        "    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    5[shape=box label=\"TableScan: aggregate_test_100\"]",
-        "    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "  subgraph cluster_6",
-        "  {",
-        "    graph[label=\"Detailed LogicalPlan\"]",
-        "    7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]",
-        "    8[shape=box label=\"Projection: aggregate_test_100.c1\\nSchema: [c1:Utf8View]\"]",
-        "    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    9[shape=box label=\"Filter: aggregate_test_100.c2 > Int64(10)\\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]\"]",
-        "    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    10[shape=box label=\"TableScan: aggregate_test_100\\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]\"]",
-        "    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "}",
-        "// End DataFusion GraphViz Plan",
-    ];
     let formatted = plan.display_graphviz().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    // Begin DataFusion GraphViz Plan,
+    // display it online here: https://dreampuf.github.io/GraphvizOnline
+
+    digraph {
+      subgraph cluster_1
+      {
+        graph[label="LogicalPlan"]
+        2[shape=box label="Explain"]
+        3[shape=box label="Projection: aggregate_test_100.c1"]
+        2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+        4[shape=box label="Filter: aggregate_test_100.c2 > Int64(10)"]
+        3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+        5[shape=box label="TableScan: aggregate_test_100"]
+        4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+      subgraph cluster_6
+      {
+        graph[label="Detailed LogicalPlan"]
+        7[shape=box label="Explain\nSchema: [plan_type:Utf8, plan:Utf8]"]
+        8[shape=box label="Projection: aggregate_test_100.c1\nSchema: [c1:Utf8View]"]
+        7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+        9[shape=box label="Filter: aggregate_test_100.c2 > Int64(10)\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]"]
+        8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+        10[shape=box label="TableScan: aggregate_test_100\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]"]
+        9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+    }
+    // End DataFusion GraphViz Plan
+    "#
     );
 
     // Optimized logical plan
@@ -248,69 +399,66 @@ async fn csv_explain_plans() {
     assert_eq!(logical_schema, optimized_logical_schema.as_ref());
     //
     // Verify schema
-    let expected = vec![
-        "Explain [plan_type:Utf8, plan:Utf8]",
-        "  Projection: aggregate_test_100.c1 [c1:Utf8View]",
-        "    Filter: aggregate_test_100.c2 > Int8(10) [c1:Utf8View, c2:Int8]",
-        "      TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] [c1:Utf8View, c2:Int8]",
-    ];
     let formatted = plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain [plan_type:Utf8, plan:Utf8]
+      Projection: aggregate_test_100.c1 [c1:Utf8View]
+        Filter: aggregate_test_100.c2 > Int8(10) [c1:Utf8View, c2:Int8]
+          TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] [c1:Utf8View, c2:Int8]
+    "
     );
     //
     // Verify the text format of the plan
-    let expected = vec![
-        "Explain",
-        "  Projection: aggregate_test_100.c1",
-        "    Filter: aggregate_test_100.c2 > Int8(10)",
-        "      TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]",
-    ];
     let formatted = plan.display_indent().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain
+      Projection: aggregate_test_100.c1
+        Filter: aggregate_test_100.c2 > Int8(10)
+          TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]
+    "
     );
     //
     // verify the grahviz format of the plan
-    let expected = vec![
-        "// Begin DataFusion GraphViz Plan,",
-        "// display it online here: https://dreampuf.github.io/GraphvizOnline",
-        "",
-        "digraph {",
-        "  subgraph cluster_1",
-        "  {",
-        "    graph[label=\"LogicalPlan\"]",
-        "    2[shape=box label=\"Explain\"]",
-        "    3[shape=box label=\"Projection: aggregate_test_100.c1\"]",
-        "    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    4[shape=box label=\"Filter: aggregate_test_100.c2 > Int8(10)\"]",
-        "    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    5[shape=box label=\"TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\"]",
-        "    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "  subgraph cluster_6",
-        "  {",
-        "    graph[label=\"Detailed LogicalPlan\"]",
-        "    7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]",
-        "    8[shape=box label=\"Projection: aggregate_test_100.c1\\nSchema: [c1:Utf8View]\"]",
-        "    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    9[shape=box label=\"Filter: aggregate_test_100.c2 > Int8(10)\\nSchema: [c1:Utf8View, c2:Int8]\"]",
-        "    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    10[shape=box label=\"TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\\nSchema: [c1:Utf8View, c2:Int8]\"]",
-        "    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "}",
-        "// End DataFusion GraphViz Plan",
-    ];
     let formatted = plan.display_graphviz().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    // Begin DataFusion GraphViz Plan,
+    // display it online here: https://dreampuf.github.io/GraphvizOnline
+
+    digraph {
+      subgraph cluster_1
+      {
+        graph[label="LogicalPlan"]
+        2[shape=box label="Explain"]
+        3[shape=box label="Projection: aggregate_test_100.c1"]
+        2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+        4[shape=box label="Filter: aggregate_test_100.c2 > Int8(10)"]
+        3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+        5[shape=box label="TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]"]
+        4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+      subgraph cluster_6
+      {
+        graph[label="Detailed LogicalPlan"]
+        7[shape=box label="Explain\nSchema: [plan_type:Utf8, plan:Utf8]"]
+        8[shape=box label="Projection: aggregate_test_100.c1\nSchema: [c1:Utf8View]"]
+        7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+        9[shape=box label="Filter: aggregate_test_100.c2 > Int8(10)\nSchema: [c1:Utf8View, c2:Int8]"]
+        8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+        10[shape=box label="TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\nSchema: [c1:Utf8View, c2:Int8]"]
+        9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+    }
+    // End DataFusion GraphViz Plan
+    "#
     );
 
     // Physical plan
@@ -396,69 +544,66 @@ async fn csv_explain_verbose_plans() {
 
     //
     // Verify schema
-    let expected = vec![
-        "Explain [plan_type:Utf8, plan:Utf8]",
-        "  Projection: aggregate_test_100.c1 [c1:Utf8View]",
-        "    Filter: aggregate_test_100.c2 > Int64(10) [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]",
-        "      TableScan: aggregate_test_100 [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]",
-    ];
     let formatted = dataframe.logical_plan().display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain [plan_type:Utf8, plan:Utf8]
+      Projection: aggregate_test_100.c1 [c1:Utf8View]
+        Filter: aggregate_test_100.c2 > Int64(10) [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]
+          TableScan: aggregate_test_100 [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]
+    "
     );
     //
     // Verify the text format of the plan
-    let expected = vec![
-        "Explain",
-        "  Projection: aggregate_test_100.c1",
-        "    Filter: aggregate_test_100.c2 > Int64(10)",
-        "      TableScan: aggregate_test_100",
-    ];
     let formatted = dataframe.logical_plan().display_indent().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain
+      Projection: aggregate_test_100.c1
+        Filter: aggregate_test_100.c2 > Int64(10)
+          TableScan: aggregate_test_100
+    "
     );
     //
     // verify the grahviz format of the plan
-    let expected = vec![
-        "// Begin DataFusion GraphViz Plan,",
-        "// display it online here: https://dreampuf.github.io/GraphvizOnline",
-        "",
-        "digraph {",
-        "  subgraph cluster_1",
-        "  {",
-        "    graph[label=\"LogicalPlan\"]",
-        "    2[shape=box label=\"Explain\"]",
-        "    3[shape=box label=\"Projection: aggregate_test_100.c1\"]",
-        "    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    4[shape=box label=\"Filter: aggregate_test_100.c2 > Int64(10)\"]",
-        "    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    5[shape=box label=\"TableScan: aggregate_test_100\"]",
-        "    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "  subgraph cluster_6",
-        "  {",
-        "    graph[label=\"Detailed LogicalPlan\"]",
-        "    7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]",
-        "    8[shape=box label=\"Projection: aggregate_test_100.c1\\nSchema: [c1:Utf8View]\"]",
-        "    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    9[shape=box label=\"Filter: aggregate_test_100.c2 > Int64(10)\\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]\"]",
-        "    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    10[shape=box label=\"TableScan: aggregate_test_100\\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]\"]",
-        "    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "}",
-        "// End DataFusion GraphViz Plan",
-    ];
     let formatted = dataframe.logical_plan().display_graphviz().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    // Begin DataFusion GraphViz Plan,
+    // display it online here: https://dreampuf.github.io/GraphvizOnline
+
+    digraph {
+      subgraph cluster_1
+      {
+        graph[label="LogicalPlan"]
+        2[shape=box label="Explain"]
+        3[shape=box label="Projection: aggregate_test_100.c1"]
+        2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+        4[shape=box label="Filter: aggregate_test_100.c2 > Int64(10)"]
+        3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+        5[shape=box label="TableScan: aggregate_test_100"]
+        4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+      subgraph cluster_6
+      {
+        graph[label="Detailed LogicalPlan"]
+        7[shape=box label="Explain\nSchema: [plan_type:Utf8, plan:Utf8]"]
+        8[shape=box label="Projection: aggregate_test_100.c1\nSchema: [c1:Utf8View]"]
+        7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+        9[shape=box label="Filter: aggregate_test_100.c2 > Int64(10)\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]"]
+        8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+        10[shape=box label="TableScan: aggregate_test_100\nSchema: [c1:Utf8View, c2:Int8, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:UInt32, c10:UInt64, c11:Float32, c12:Float64, c13:Utf8View]"]
+        9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+    }
+    // End DataFusion GraphViz Plan
+    "#
     );
 
     // Optimized logical plan
@@ -470,69 +615,66 @@ async fn csv_explain_verbose_plans() {
     assert_eq!(&logical_schema, optimized_logical_schema.as_ref());
     //
     // Verify schema
-    let expected = vec![
-        "Explain [plan_type:Utf8, plan:Utf8]",
-        "  Projection: aggregate_test_100.c1 [c1:Utf8View]",
-        "    Filter: aggregate_test_100.c2 > Int8(10) [c1:Utf8View, c2:Int8]",
-        "      TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] [c1:Utf8View, c2:Int8]",
-    ];
     let formatted = plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain [plan_type:Utf8, plan:Utf8]
+      Projection: aggregate_test_100.c1 [c1:Utf8View]
+        Filter: aggregate_test_100.c2 > Int8(10) [c1:Utf8View, c2:Int8]
+          TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)] [c1:Utf8View, c2:Int8]
+    "
     );
     //
     // Verify the text format of the plan
-    let expected = vec![
-        "Explain",
-        "  Projection: aggregate_test_100.c1",
-        "    Filter: aggregate_test_100.c2 > Int8(10)",
-        "      TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]",
-    ];
     let formatted = plan.display_indent().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r"
+    Explain
+      Projection: aggregate_test_100.c1
+        Filter: aggregate_test_100.c2 > Int8(10)
+          TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]
+    "
     );
     //
     // verify the grahviz format of the plan
-    let expected = vec![
-        "// Begin DataFusion GraphViz Plan,",
-        "// display it online here: https://dreampuf.github.io/GraphvizOnline",
-        "",
-        "digraph {",
-        "  subgraph cluster_1",
-        "  {",
-        "    graph[label=\"LogicalPlan\"]",
-        "    2[shape=box label=\"Explain\"]",
-        "    3[shape=box label=\"Projection: aggregate_test_100.c1\"]",
-        "    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    4[shape=box label=\"Filter: aggregate_test_100.c2 > Int8(10)\"]",
-        "    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    5[shape=box label=\"TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\"]",
-        "    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "  subgraph cluster_6",
-        "  {",
-        "    graph[label=\"Detailed LogicalPlan\"]",
-        "    7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]",
-        "    8[shape=box label=\"Projection: aggregate_test_100.c1\\nSchema: [c1:Utf8View]\"]",
-        "    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    9[shape=box label=\"Filter: aggregate_test_100.c2 > Int8(10)\\nSchema: [c1:Utf8View, c2:Int8]\"]",
-        "    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]",
-        "    10[shape=box label=\"TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\\nSchema: [c1:Utf8View, c2:Int8]\"]",
-        "    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]",
-        "  }",
-        "}",
-        "// End DataFusion GraphViz Plan",
-    ];
     let formatted = plan.display_graphviz().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r#"
+    // Begin DataFusion GraphViz Plan,
+    // display it online here: https://dreampuf.github.io/GraphvizOnline
+
+    digraph {
+      subgraph cluster_1
+      {
+        graph[label="LogicalPlan"]
+        2[shape=box label="Explain"]
+        3[shape=box label="Projection: aggregate_test_100.c1"]
+        2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+        4[shape=box label="Filter: aggregate_test_100.c2 > Int8(10)"]
+        3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+        5[shape=box label="TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]"]
+        4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+      subgraph cluster_6
+      {
+        graph[label="Detailed LogicalPlan"]
+        7[shape=box label="Explain\nSchema: [plan_type:Utf8, plan:Utf8]"]
+        8[shape=box label="Projection: aggregate_test_100.c1\nSchema: [c1:Utf8View]"]
+        7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+        9[shape=box label="Filter: aggregate_test_100.c2 > Int8(10)\nSchema: [c1:Utf8View, c2:Int8]"]
+        8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+        10[shape=box label="TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]\nSchema: [c1:Utf8View, c2:Int8]"]
+        9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+      }
+    }
+    // End DataFusion GraphViz Plan
+    "#
     );
 
     // Physical plan
@@ -602,19 +744,6 @@ async fn test_physical_plan_display_indent() {
                LIMIT 10";
     let dataframe = ctx.sql(sql).await.unwrap();
     let physical_plan = dataframe.create_physical_plan().await.unwrap();
-    let expected = vec![
-        "SortPreservingMergeExec: [the_min@2 DESC], fetch=10",
-        "  SortExec: TopK(fetch=10), expr=[the_min@2 DESC], preserve_partitioning=[true]",
-        "    ProjectionExec: expr=[c1@0 as c1, max(aggregate_test_100.c12)@1 as max(aggregate_test_100.c12), min(aggregate_test_100.c12)@2 as the_min]",
-        "      AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)]",
-        "        CoalesceBatchesExec: target_batch_size=4096",
-        "          RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000",
-        "            AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)]",
-        "              CoalesceBatchesExec: target_batch_size=4096",
-        "                FilterExec: c12@1 < 10",
-        "                  RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1",
-        "                    DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1, c12], file_type=csv, has_header=true",
-    ];
 
     let normalizer = ExplainNormalizer::new();
     let actual = format!("{}", displayable(physical_plan.as_ref()).indent(true))
@@ -622,10 +751,22 @@ async fn test_physical_plan_display_indent() {
         .lines()
         // normalize paths
         .map(|s| normalizer.normalize(s))
-        .collect::<Vec<_>>();
-    assert_eq!(
-        expected, actual,
-        "expected:\n{expected:#?}\nactual:\n\n{actual:#?}\n"
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SortPreservingMergeExec: [the_min@2 DESC], fetch=10
+      SortExec: TopK(fetch=10), expr=[the_min@2 DESC], preserve_partitioning=[true]
+        ProjectionExec: expr=[c1@0 as c1, max(aggregate_test_100.c12)@1 as max(aggregate_test_100.c12), min(aggregate_test_100.c12)@2 as the_min]
+          AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)]
+            RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000
+              AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[max(aggregate_test_100.c12), min(aggregate_test_100.c12)]
+                FilterExec: c12@1 < 10
+                  RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1
+                    DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1, c12], file_type=csv, has_header=true
+    "
     );
 }
 
@@ -647,19 +788,6 @@ async fn test_physical_plan_display_indent_multi_children() {
 
     let dataframe = ctx.sql(sql).await.unwrap();
     let physical_plan = dataframe.create_physical_plan().await.unwrap();
-    let expected = vec![
-		"CoalesceBatchesExec: target_batch_size=4096",
-    	"  HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c2@0)], projection=[c1@0]",
-    	"    CoalesceBatchesExec: target_batch_size=4096",
-    	"      RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=9000",
-    	"        RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1",
-    	"          DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true",
-    	"    CoalesceBatchesExec: target_batch_size=4096",
-    	"      RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=9000",
-    	"        RepartitionExec: partitioning=RoundRobinBatch(9000), input_partitions=1",
-    	"          ProjectionExec: expr=[c1@0 as c2]",
-    	"            DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true",
-    ];
 
     let normalizer = ExplainNormalizer::new();
     let actual = format!("{}", displayable(physical_plan.as_ref()).indent(true))
@@ -667,11 +795,18 @@ async fn test_physical_plan_display_indent_multi_children() {
         .lines()
         // normalize paths
         .map(|s| normalizer.normalize(s))
-        .collect::<Vec<_>>();
+        .collect::<Vec<_>>()
+        .join("\n");
 
-    assert_eq!(
-        expected, actual,
-        "expected:\n{expected:#?}\nactual:\n\n{actual:#?}\n"
+    assert_snapshot!(
+        actual,
+        @r"
+    HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c1@0, c2@0)], projection=[c1@0]
+      RepartitionExec: partitioning=Hash([c1@0], 9000), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
+      RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=1
+        DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1@0 as c2], file_type=csv, has_header=true
+    "
     );
 }
 
@@ -710,8 +845,7 @@ async fn csv_explain_analyze_order_by() {
 
     // Ensure that the ordering is not optimized away from the plan
     // https://github.com/apache/datafusion/issues/6379
-    let needle =
-        "SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=100, elapsed_compute";
+    let needle = "SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=100, elapsed_compute";
     assert_contains!(&formatted, needle);
 }
 
@@ -729,10 +863,153 @@ async fn parquet_explain_analyze() {
 
     // should contain aggregated stats
     assert_contains!(&formatted, "output_rows=8");
-    assert_contains!(&formatted, "row_groups_matched_bloom_filter=0");
-    assert_contains!(&formatted, "row_groups_pruned_bloom_filter=0");
-    assert_contains!(&formatted, "row_groups_matched_statistics=1");
-    assert_contains!(&formatted, "row_groups_pruned_statistics=0");
+    assert_contains!(
+        &formatted,
+        "row_groups_pruned_bloom_filter=1 total \u{2192} 1 matched"
+    );
+    assert_contains!(
+        &formatted,
+        "row_groups_pruned_statistics=1 total \u{2192} 1 matched"
+    );
+    assert_contains!(&formatted, "scan_efficiency_ratio=14%");
+
+    // The order of metrics is expected to be the same as the actual pruning order
+    // (file-> row-group -> page)
+    let i_file = formatted.find("files_ranges_pruned_statistics").unwrap();
+    let i_rowgroup_stat = formatted.find("row_groups_pruned_statistics").unwrap();
+    let i_rowgroup_bloomfilter =
+        formatted.find("row_groups_pruned_bloom_filter").unwrap();
+    let i_page_rows = formatted.find("page_index_rows_pruned").unwrap();
+    let i_page_pages = formatted.find("page_index_pages_pruned").unwrap();
+
+    assert!(
+        (i_file < i_rowgroup_stat)
+            && (i_rowgroup_stat < i_rowgroup_bloomfilter)
+            && (i_rowgroup_bloomfilter < i_page_pages && i_page_pages < i_page_rows),
+        "The parquet pruning metrics should be displayed in an order of: file range -> row group statistics -> row group bloom filter -> page index."
+    );
+}
+
+// This test reproduces the behavior described in
+// https://github.com/apache/datafusion/issues/16684 where projection
+// pushdown with recursive CTEs could fail to remove unused columns
+// (e.g. nested/recursive expansion causing full schema to be scanned).
+// Keeping this test ensures we don't regress that behavior.
+#[tokio::test]
+#[cfg_attr(tarpaulin, ignore)]
+async fn parquet_recursive_projection_pushdown() -> Result<()> {
+    use parquet::arrow::arrow_writer::ArrowWriter;
+    use parquet::file::properties::WriterProperties;
+
+    let temp_dir = TempDir::new().unwrap();
+    let parquet_path = temp_dir.path().join("hierarchy.parquet");
+
+    let ids = Int64Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+    let parent_ids = Int64Array::from(vec![0, 1, 1, 2, 2, 3, 4, 5, 6, 7]);
+    let values = Int64Array::from(vec![10, 20, 30, 40, 50, 60, 70, 80, 90, 100]);
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("parent_id", DataType::Int64, true),
+        Field::new("value", DataType::Int64, false),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![Arc::new(ids), Arc::new(parent_ids), Arc::new(values)],
+    )
+    .unwrap();
+
+    let file = File::create(&parquet_path).unwrap();
+    let props = WriterProperties::builder().build();
+    let mut writer = ArrowWriter::try_new(file, schema, Some(props)).unwrap();
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    let ctx = SessionContext::new();
+    ctx.register_parquet(
+        "hierarchy",
+        parquet_path.to_str().unwrap(),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    let sql = r#"
+        WITH RECURSIVE number_series AS (
+            SELECT id, 1 as level
+            FROM hierarchy
+            WHERE id = 1
+
+            UNION ALL
+
+            SELECT ns.id + 1, ns.level + 1
+            FROM number_series ns
+            WHERE ns.id < 10
+        )
+        SELECT * FROM number_series ORDER BY id
+    "#;
+
+    let dataframe = ctx.sql(sql).await?;
+    let physical_plan = dataframe.create_physical_plan().await?;
+
+    let normalizer = ExplainNormalizer::new();
+    let mut actual = format!("{}", displayable(physical_plan.as_ref()).indent(true))
+        .trim()
+        .lines()
+        .map(|line| normalizer.normalize(line))
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    fn replace_path_variants(actual: &mut String, path: &str) {
+        let mut candidates = vec![path.to_string()];
+
+        let trimmed = path.trim_start_matches(std::path::MAIN_SEPARATOR);
+        if trimmed != path {
+            candidates.push(trimmed.to_string());
+        }
+
+        let forward_slash = path.replace('\\', "/");
+        if forward_slash != path {
+            candidates.push(forward_slash.clone());
+
+            let trimmed_forward = forward_slash.trim_start_matches('/');
+            if trimmed_forward != forward_slash {
+                candidates.push(trimmed_forward.to_string());
+            }
+        }
+
+        for candidate in candidates {
+            *actual = actual.replace(&candidate, "TMP_DIR");
+        }
+    }
+
+    let temp_dir_path = temp_dir.path();
+    let fs_path = temp_dir_path.to_string_lossy().to_string();
+    replace_path_variants(&mut actual, &fs_path);
+
+    if let Ok(url_path) = Path::from_filesystem_path(temp_dir_path) {
+        replace_path_variants(&mut actual, url_path.as_ref());
+    }
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+      RecursiveQueryExec: name=number_series, is_distinct=false
+        CoalescePartitionsExec
+          ProjectionExec: expr=[id@0 as id, 1 as level]
+            FilterExec: id@0 = 1
+              RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1
+                DataSourceExec: file_groups={1 group: [[TMP_DIR/hierarchy.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 = 1, pruning_predicate=id_null_count@2 != row_count@3 AND id_min@0 <= 1 AND 1 <= id_max@1, required_guarantees=[id in (1)]
+        CoalescePartitionsExec
+          ProjectionExec: expr=[id@0 + 1 as ns.id + Int64(1), level@1 + 1 as ns.level + Int64(1)]
+            FilterExec: id@0 < 10
+              RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES), input_partitions=1
+                WorkTableExec: name=number_series
+    "
+    );
+
+    Ok(())
 }
 
 #[tokio::test]
@@ -748,9 +1025,7 @@ async fn parquet_explain_analyze_verbose() {
         .to_string();
 
     // should contain the raw per file stats (with the label)
-    assert_contains!(&formatted, "row_groups_matched_bloom_filter{partition=0");
     assert_contains!(&formatted, "row_groups_pruned_bloom_filter{partition=0");
-    assert_contains!(&formatted, "row_groups_matched_statistics{partition=0");
     assert_contains!(&formatted, "row_groups_pruned_statistics{partition=0");
 }
 
@@ -779,14 +1054,19 @@ async fn explain_logical_plan_only() {
     let sql = "EXPLAIN select count(*) from (values ('a', 1, 100), ('a', 2, 150)) as t (c1,c2,c3)";
     let actual = execute(&ctx, sql).await;
     let actual = normalize_vec_for_explain(actual);
+    let actual = actual.into_iter().map(|r| r.join("\n")).collect::<String>();
 
-    let expected = vec![
-        vec!["logical_plan", "Projection: count(Int64(1)) AS count(*)\
-        \n  Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]\
-        \n    SubqueryAlias: t\
-        \n      Projection:\
-        \n        Values: (Utf8(\"a\"), Int64(1), Int64(100)), (Utf8(\"a\"), Int64(2), Int64(150))"]];
-    assert_eq!(expected, actual);
+    assert_snapshot!(
+        actual,
+        @r#"
+    logical_plan
+    Projection: count(Int64(1)) AS count(*)
+      Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+        SubqueryAlias: t
+          Projection:
+            Values: (Utf8("a"), Int64(1), Int64(100)), (Utf8("a"), Int64(2), Int64(150))
+    "#
+    );
 }
 
 #[tokio::test]
@@ -797,14 +1077,16 @@ async fn explain_physical_plan_only() {
     let sql = "EXPLAIN select count(*) from (values ('a', 1, 100), ('a', 2, 150)) as t (c1,c2,c3)";
     let actual = execute(&ctx, sql).await;
     let actual = normalize_vec_for_explain(actual);
+    let actual = actual.into_iter().map(|r| r.join("\n")).collect::<String>();
 
-    let expected = vec![vec![
-        "physical_plan",
-        "ProjectionExec: expr=[2 as count(*)]\
-        \n  PlaceholderRowExec\
-        \n",
-    ]];
-    assert_eq!(expected, actual);
+    assert_snapshot!(
+        actual,
+        @r"
+    physical_plan
+    ProjectionExec: expr=[2 as count(*)]
+      PlaceholderRowExec
+    "
+    );
 }
 
 #[tokio::test]
@@ -827,3 +1109,54 @@ async fn csv_explain_analyze_with_statistics() {
         ", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]"
     );
 }
+
+#[tokio::test]
+async fn nested_loop_join_selectivity() {
+    for (join_type, expected_selectivity) in [
+        ("INNER", "1% (1/100)"),
+        ("LEFT", "10% (10/100)"),
+        ("RIGHT", "10% (10/100)"),
+        // 1 match + 9 left + 9 right = 19
+        ("FULL", "19% (19/100)"),
+    ] {
+        let ctx = SessionContext::new();
+        let sql = format!(
+            "EXPLAIN ANALYZE SELECT * \
+                FROM generate_series(1, 10) as t1(a) \
+                {join_type} JOIN generate_series(1, 10) as t2(b) \
+                ON (t1.a + t2.b) = 20"
+        );
+
+        let actual = execute_to_batches(&ctx, sql.as_str()).await;
+        let formatted = arrow::util::pretty::pretty_format_batches(&actual)
+            .unwrap()
+            .to_string();
+
+        assert_metrics!(
+            &formatted,
+            "NestedLoopJoinExec",
+            &format!("selectivity={expected_selectivity}")
+        );
+    }
+}
+
+#[tokio::test]
+async fn explain_analyze_hash_join() {
+    let sql = "EXPLAIN ANALYZE \
+            SELECT * \
+            FROM generate_series(10) as t1(a) \
+            JOIN generate_series(20) as t2(b) \
+            ON t1.a=t2.b";
+
+    for (level, needle, should_contain) in [
+        (ExplainAnalyzeLevel::Summary, "probe_hit_rate", true),
+        (ExplainAnalyzeLevel::Summary, "avg_fanout", true),
+    ] {
+        let plan = collect_plan(sql, level).await;
+        assert_eq!(
+            plan.contains(needle),
+            should_contain,
+            "plan for level {level:?} unexpected content: {plan}"
+        );
+    }
+}
diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs
index 77eec20eac006..7c0e89ee96418 100644
--- a/datafusion/core/tests/sql/joins.rs
+++ b/datafusion/core/tests/sql/joins.rs
@@ -15,8 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use insta::assert_snapshot;
+
+use datafusion::assert_batches_eq;
+use datafusion::catalog::MemTable;
 use datafusion::datasource::stream::{FileStreamProvider, StreamConfig, StreamTable};
 use datafusion::test_util::register_unbounded_file_with_ordering;
+use datafusion_sql::unparser::plan_to_sql;
 
 use super::*;
 
@@ -33,14 +38,16 @@ async fn join_change_in_planner() -> Result<()> {
         Field::new("a2", DataType::UInt32, false),
     ]));
     // Specify the ordering:
-    let file_sort_order = vec![[col("a1")]
-        .into_iter()
-        .map(|e| {
-            let ascending = true;
-            let nulls_first = false;
-            e.sort(ascending, nulls_first)
-        })
-        .collect::<Vec<_>>()];
+    let file_sort_order = vec![
+        [col("a1")]
+            .into_iter()
+            .map(|e| {
+                let ascending = true;
+                let nulls_first = false;
+                e.sort(ascending, nulls_first)
+            })
+            .collect::<Vec<_>>(),
+    ];
     register_unbounded_file_with_ordering(
         &ctx,
         schema.clone(),
@@ -61,28 +68,17 @@ async fn join_change_in_planner() -> Result<()> {
     let dataframe = ctx.sql(sql).await?;
     let physical_plan = dataframe.create_physical_plan().await?;
     let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-    let expected = {
-        [
-            "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a1@0 ASC NULLS LAST",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false"
-        ]
-    };
-    let mut actual: Vec<&str> = formatted.trim().lines().collect();
-    // Remove CSV lines
-    actual.remove(4);
-    actual.remove(7);
-
-    assert_eq!(
-        expected,
-        actual[..],
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+    "
     );
     Ok(())
 }
@@ -101,14 +97,16 @@ async fn join_no_order_on_filter() -> Result<()> {
         Field::new("a3", DataType::UInt32, false),
     ]));
     // Specify the ordering:
-    let file_sort_order = vec![[col("a1")]
-        .into_iter()
-        .map(|e| {
-            let ascending = true;
-            let nulls_first = false;
-            e.sort(ascending, nulls_first)
-        })
-        .collect::<Vec<_>>()];
+    let file_sort_order = vec![
+        [col("a1")]
+            .into_iter()
+            .map(|e| {
+                let ascending = true;
+                let nulls_first = false;
+                e.sort(ascending, nulls_first)
+            })
+            .collect::<Vec<_>>(),
+    ];
     register_unbounded_file_with_ordering(
         &ctx,
         schema.clone(),
@@ -129,28 +127,17 @@ async fn join_no_order_on_filter() -> Result<()> {
     let dataframe = ctx.sql(sql).await?;
     let physical_plan = dataframe.create_physical_plan().await?;
     let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-    let expected = {
-        [
-            "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false"
-        ]
-    };
-    let mut actual: Vec<&str> = formatted.trim().lines().collect();
-    // Remove CSV lines
-    actual.remove(4);
-    actual.remove(7);
-
-    assert_eq!(
-        expected,
-        actual[..],
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a3@0 AS Int64) > CAST(a3@1 AS Int64) + 3 AND CAST(a3@0 AS Int64) < CAST(a3@1 AS Int64) + 10
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1, maintains_sort_order=true
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2, a3], infinite_source=true, output_ordering=[a1@0 ASC NULLS LAST]
+    "
     );
     Ok(())
 }
@@ -179,28 +166,17 @@ async fn join_change_in_planner_without_sort() -> Result<()> {
     let dataframe = ctx.sql(sql).await?;
     let physical_plan = dataframe.create_physical_plan().await?;
     let formatted = displayable(physical_plan.as_ref()).indent(true).to_string();
-    let expected = {
-        [
-            "SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/left.csv]]}, projection=[a1, a2], file_type=csv, has_header=false",
-            "  CoalesceBatchesExec: target_batch_size=8192",
-            "    RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=8",
-            "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
-            // "     DataSourceExec: file_groups={1 group: [[tempdir/right.csv]]}, projection=[a1, a2], file_type=csv, has_header=false"
-        ]
-    };
-    let mut actual: Vec<&str> = formatted.trim().lines().collect();
-    // Remove CSV lines
-    actual.remove(4);
-    actual.remove(7);
-
-    assert_eq!(
-        expected,
-        actual[..],
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r"
+    SymmetricHashJoinExec: mode=Partitioned, join_type=Full, on=[(a2@1, a2@1)], filter=CAST(a1@0 AS Int64) > CAST(a1@1 AS Int64) + 3 AND CAST(a1@0 AS Int64) < CAST(a1@1 AS Int64) + 10
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true
+      RepartitionExec: partitioning=Hash([a2@1], 8), input_partitions=1
+        StreamingTableExec: partition_sizes=1, projection=[a1, a2], infinite_source=true
+    "
     );
     Ok(())
 }
@@ -230,8 +206,96 @@ async fn join_change_in_planner_without_sort_not_allowed() -> Result<()> {
     match df.create_physical_plan().await {
         Ok(_) => panic!("Expecting error."),
         Err(e) => {
-            assert_eq!(e.strip_backtrace(), "SanityCheckPlan\ncaused by\nError during planning: Join operation cannot operate on a non-prunable stream without enabling the 'allow_symmetric_joins_without_pruning' configuration flag")
+            assert_eq!(
+                e.strip_backtrace(),
+                "SanityCheckPlan\ncaused by\nError during planning: Join operation cannot operate on a non-prunable stream without enabling the 'allow_symmetric_joins_without_pruning' configuration flag"
+            )
         }
     }
     Ok(())
 }
+
+#[tokio::test]
+async fn join_using_uppercase_column() -> Result<()> {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "UPPER",
+        DataType::UInt32,
+        false,
+    )]));
+    let tmp_dir = TempDir::new()?;
+    let file_path = tmp_dir.path().join("uppercase-column.csv");
+    let mut file = File::create(file_path.clone())?;
+    file.write_all("0".as_bytes())?;
+    drop(file);
+
+    let ctx = SessionContext::new();
+    ctx.register_csv(
+        "test",
+        file_path.to_str().unwrap(),
+        CsvReadOptions::new().schema(&schema).has_header(false),
+    )
+    .await?;
+
+    let dataframe = ctx
+        .sql(
+            r#"
+        SELECT test."UPPER" FROM "test"
+        INNER JOIN (
+            SELECT test."UPPER" FROM "test"
+        ) AS selection USING ("UPPER")
+        ;
+        "#,
+        )
+        .await?;
+
+    assert_batches_eq!(
+        [
+            "+-------+",
+            "| UPPER |",
+            "+-------+",
+            "| 0     |",
+            "+-------+",
+        ],
+        &dataframe.collect().await?
+    );
+
+    Ok(())
+}
+
+// Issue #17359: https://github.com/apache/datafusion/issues/17359
+#[tokio::test]
+async fn unparse_cross_join() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    let j1_schema = Arc::new(Schema::new(vec![
+        Field::new("j1_id", DataType::Int32, true),
+        Field::new("j1_string", DataType::Utf8, true),
+    ]));
+    let j2_schema = Arc::new(Schema::new(vec![
+        Field::new("j2_id", DataType::Int32, true),
+        Field::new("j2_string", DataType::Utf8, true),
+    ]));
+
+    ctx.register_table("j1", Arc::new(MemTable::try_new(j1_schema, vec![vec![]])?))?;
+    ctx.register_table("j2", Arc::new(MemTable::try_new(j2_schema, vec![vec![]])?))?;
+
+    let df = ctx
+        .sql(
+            r#"
+            select j1.j1_id, j2.j2_string
+            from j1, j2
+            where j2.j2_id = 0
+            "#,
+        )
+        .await?;
+
+    let unopt_sql = plan_to_sql(df.logical_plan())?;
+    assert_snapshot!(unopt_sql, @"SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0)");
+
+    let optimized_plan = df.into_optimized_plan()?;
+
+    let opt_sql = plan_to_sql(&optimized_plan)?;
+    assert_snapshot!(opt_sql, @"SELECT j1.j1_id, j2.j2_string FROM j1 CROSS JOIN j2 WHERE (j2.j2_id = 0)");
+
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index 2a5597b9fb7ee..9a1dc5502ee60 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -24,36 +24,40 @@ use arrow::{
 
 use datafusion::error::Result;
 use datafusion::logical_expr::{Aggregate, LogicalPlan, TableScan};
-use datafusion::physical_plan::collect;
-use datafusion::physical_plan::metrics::MetricValue;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::physical_plan::ExecutionPlanVisitor;
+use datafusion::physical_plan::collect;
+use datafusion::physical_plan::metrics::MetricValue;
 use datafusion::prelude::*;
 use datafusion::test_util;
 use datafusion::{execution::context::SessionContext, physical_plan::displayable};
 use datafusion_common::test_util::batches_to_sort_string;
 use datafusion_common::utils::get_available_parallelism;
 use datafusion_common::{assert_contains, assert_not_contains};
-use insta::assert_snapshot;
 use object_store::path::Path;
 use std::fs::File;
 use std::io::Write;
 use std::path::PathBuf;
 use tempfile::TempDir;
 
-/// A macro to assert that some particular line contains two substrings
-///
-/// Usage: `assert_metrics!(actual, operator_name, metrics)`
+/// A macro to assert that some particular line contains the given substrings
 ///
+/// Usage: `assert_metrics!(actual, operator_name, metrics_1, metrics_2, ...)`
 macro_rules! assert_metrics {
-    ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => {
+    ($ACTUAL: expr, $OPERATOR_NAME: expr, $($METRICS: expr),+) => {
         let found = $ACTUAL
             .lines()
-            .any(|line| line.contains($OPERATOR_NAME) && line.contains($METRICS));
+            .any(|line| line.contains($OPERATOR_NAME) $( && line.contains($METRICS))+);
+
+        let mut metrics = String::new();
+        $(metrics.push_str(format!(" '{}',", $METRICS).as_str());)+
+        // remove the last `,` from the string
+        metrics.pop();
+
         assert!(
             found,
-            "Can not find a line with both '{}' and '{}' in\n\n{}",
-            $OPERATOR_NAME, $METRICS, $ACTUAL
+            "Cannot find a line with operator name '{}' and metrics containing values {} in :\n\n{}",
+            $OPERATOR_NAME, metrics, $ACTUAL
         );
     };
 }
@@ -66,6 +70,7 @@ mod path_partition;
 mod runtime_config;
 pub mod select;
 mod sql_api;
+mod unparser;
 
 async fn register_aggregate_csv_by_sql(ctx: &SessionContext) {
     let testdata = test_util::arrow_test_data();
@@ -331,8 +336,7 @@ async fn nyc() -> Result<()> {
     match &optimized_plan {
         LogicalPlan::Aggregate(Aggregate { input, .. }) => match input.as_ref() {
             LogicalPlan::TableScan(TableScan {
-                ref projected_schema,
-                ..
+                projected_schema, ..
             }) => {
                 assert_eq!(2, projected_schema.fields().len());
                 assert_eq!(projected_schema.field(0).name(), "passenger_count");
diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs
index 5e9748d23d8cd..1afab529f019c 100644
--- a/datafusion/core/tests/sql/path_partition.rs
+++ b/datafusion/core/tests/sql/path_partition.rs
@@ -20,7 +20,6 @@
 use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
-use std::ops::Range;
 use std::sync::Arc;
 
 use arrow::datatypes::DataType;
@@ -31,26 +30,28 @@ use datafusion::{
         listing::{ListingOptions, ListingTable, ListingTableConfig},
     },
     error::Result,
-    physical_plan::ColumnStatistics,
     prelude::SessionContext,
     test_util::{self, arrow_test_data, parquet_test_data},
 };
 use datafusion_catalog::TableProvider;
+use datafusion_common::ScalarValue;
 use datafusion_common::stats::Precision;
 use datafusion_common::test_util::batches_to_sort_string;
-use datafusion_common::ScalarValue;
 use datafusion_execution::config::SessionConfig;
 
 use async_trait::async_trait;
 use bytes::Bytes;
 use chrono::{TimeZone, Utc};
+use futures::StreamExt;
 use futures::stream::{self, BoxStream};
 use insta::assert_snapshot;
 use object_store::{
-    path::Path, GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta,
-    ObjectStore, PutOptions, PutResult,
+    Attributes, CopyOptions, GetRange, MultipartUpload, PutMultipartOptions, PutPayload,
+};
+use object_store::{
+    GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore,
+    PutOptions, PutResult, path::Path,
 };
-use object_store::{Attributes, MultipartUpload, PutMultipartOpts, PutPayload};
 use url::Url;
 
 #[tokio::test]
@@ -460,14 +461,26 @@ async fn parquet_statistics() -> Result<()> {
     let schema = physical_plan.schema();
     assert_eq!(schema.fields().len(), 4);
 
-    let stat_cols = physical_plan.partition_statistics(None)?.column_statistics;
+    let stat_cols = physical_plan
+        .partition_statistics(None)?
+        .column_statistics
+        .clone();
     assert_eq!(stat_cols.len(), 4);
     // stats for the first col are read from the parquet file
     assert_eq!(stat_cols[0].null_count, Precision::Exact(3));
-    // TODO assert partition column (1,2,3) stats once implemented (#1186)
-    assert_eq!(stat_cols[1], ColumnStatistics::new_unknown(),);
-    assert_eq!(stat_cols[2], ColumnStatistics::new_unknown(),);
-    assert_eq!(stat_cols[3], ColumnStatistics::new_unknown(),);
+    // Partition column statistics (year=2021 for all 3 rows)
+    assert_eq!(stat_cols[1].null_count, Precision::Exact(0));
+    assert_eq!(
+        stat_cols[1].min_value,
+        Precision::Exact(ScalarValue::Int32(Some(2021)))
+    );
+    assert_eq!(
+        stat_cols[1].max_value,
+        Precision::Exact(ScalarValue::Int32(Some(2021)))
+    );
+    // month and day are Utf8 partition columns with statistics
+    assert_eq!(stat_cols[2].null_count, Precision::Exact(0));
+    assert_eq!(stat_cols[3].null_count, Precision::Exact(0));
 
     //// WITH PROJECTION ////
     let dataframe = ctx.sql("SELECT mycol, day FROM t WHERE day='28'").await?;
@@ -475,12 +488,23 @@ async fn parquet_statistics() -> Result<()> {
     let schema = physical_plan.schema();
     assert_eq!(schema.fields().len(), 2);
 
-    let stat_cols = physical_plan.partition_statistics(None)?.column_statistics;
+    let stat_cols = physical_plan
+        .partition_statistics(None)?
+        .column_statistics
+        .clone();
     assert_eq!(stat_cols.len(), 2);
     // stats for the first col are read from the parquet file
     assert_eq!(stat_cols[0].null_count, Precision::Exact(1));
-    // TODO assert partition column stats once implemented (#1186)
-    assert_eq!(stat_cols[1], ColumnStatistics::new_unknown());
+    // Partition column statistics for day='28' (1 row)
+    assert_eq!(stat_cols[1].null_count, Precision::Exact(0));
+    assert_eq!(
+        stat_cols[1].min_value,
+        Precision::Exact(ScalarValue::Utf8(Some("28".to_string())))
+    );
+    assert_eq!(
+        stat_cols[1].max_value,
+        Precision::Exact(ScalarValue::Utf8(Some("28".to_string())))
+    );
 
     Ok(())
 }
@@ -604,7 +628,7 @@ async fn create_partitioned_alltypes_parquet_table(
 }
 
 #[derive(Debug)]
-/// An object store implem that is mirrors a given file to multiple paths.
+/// An object store implem that mirrors a given file to multiple paths.
 pub struct MirroringObjectStore {
     /// The `(path,size)` of the files that "exist" in the store
     files: Vec<Path>,
@@ -645,7 +669,7 @@ impl ObjectStore for MirroringObjectStore {
     async fn put_multipart_opts(
         &self,
         _location: &Path,
-        _opts: PutMultipartOpts,
+        _opts: PutMultipartOptions,
     ) -> object_store::Result<Box<dyn MultipartUpload>> {
         unimplemented!()
     }
@@ -653,12 +677,13 @@ impl ObjectStore for MirroringObjectStore {
     async fn get_opts(
         &self,
         location: &Path,
-        _options: GetOptions,
+        options: GetOptions,
     ) -> object_store::Result<GetResult> {
         self.files.iter().find(|x| *x == location).unwrap();
         let path = std::path::PathBuf::from(&self.mirrored_file);
         let file = File::open(&path).unwrap();
         let metadata = file.metadata().unwrap();
+
         let meta = ObjectMeta {
             location: location.clone(),
             last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(),
@@ -667,37 +692,35 @@ impl ObjectStore for MirroringObjectStore {
             version: None,
         };
 
+        let payload = if options.head {
+            // no content for head requests
+            GetResultPayload::Stream(stream::empty().boxed())
+        } else if let Some(range) = options.range {
+            let GetRange::Bounded(range) = range else {
+                unimplemented!("Unbounded range not supported in MirroringObjectStore");
+            };
+            let mut file = File::open(path).unwrap();
+            file.seek(SeekFrom::Start(range.start)).unwrap();
+
+            let to_read = range.end - range.start;
+            let to_read: usize = to_read.try_into().unwrap();
+            let mut data = Vec::with_capacity(to_read);
+            let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
+            assert_eq!(read, to_read);
+            let stream = stream::once(async move { Ok(Bytes::from(data)) }).boxed();
+            GetResultPayload::Stream(stream)
+        } else {
+            GetResultPayload::File(file, path)
+        };
+
         Ok(GetResult {
             range: 0..meta.size,
-            payload: GetResultPayload::File(file, path),
+            payload,
             meta,
             attributes: Attributes::default(),
         })
     }
 
-    async fn get_range(
-        &self,
-        location: &Path,
-        range: Range<u64>,
-    ) -> object_store::Result<Bytes> {
-        self.files.iter().find(|x| *x == location).unwrap();
-        let path = std::path::PathBuf::from(&self.mirrored_file);
-        let mut file = File::open(path).unwrap();
-        file.seek(SeekFrom::Start(range.start)).unwrap();
-
-        let to_read = range.end - range.start;
-        let to_read: usize = to_read.try_into().unwrap();
-        let mut data = Vec::with_capacity(to_read);
-        let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
-        assert_eq!(read, to_read);
-
-        Ok(data.into())
-    }
-
-    async fn delete(&self, _location: &Path) -> object_store::Result<()> {
-        unimplemented!()
-    }
-
     fn list(
         &self,
         prefix: Option<&Path>,
@@ -712,6 +735,8 @@ impl ObjectStore for MirroringObjectStore {
                     .map(|mut x| x.next().is_some())
                     .unwrap_or(false);
 
+                #[expect(clippy::result_large_err)]
+                // closure only ever returns Ok; Err type is never constructed
                 filter.then(|| {
                     Ok(ObjectMeta {
                         location,
@@ -767,14 +792,18 @@ impl ObjectStore for MirroringObjectStore {
         })
     }
 
-    async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
+    fn delete_stream(
+        &self,
+        _locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
         unimplemented!()
     }
 
-    async fn copy_if_not_exists(
+    async fn copy_opts(
         &self,
         _from: &Path,
         _to: &Path,
+        _options: CopyOptions,
     ) -> object_store::Result<()> {
         unimplemented!()
     }
diff --git a/datafusion/core/tests/sql/runtime_config.rs b/datafusion/core/tests/sql/runtime_config.rs
index 18e07bb61ed94..cf5237d725805 100644
--- a/datafusion/core/tests/sql/runtime_config.rs
+++ b/datafusion/core/tests/sql/runtime_config.rs
@@ -18,9 +18,14 @@
 //! Tests for runtime configuration SQL interface
 
 use std::sync::Arc;
+use std::time::Duration;
 
 use datafusion::execution::context::SessionContext;
 use datafusion::execution::context::TaskContext;
+use datafusion::prelude::SessionConfig;
+use datafusion_execution::cache::DefaultListFilesCache;
+use datafusion_execution::cache::cache_manager::CacheManagerConfig;
+use datafusion_execution::runtime_env::RuntimeEnvBuilder;
 use datafusion_physical_plan::common::collect;
 
 #[tokio::test]
@@ -140,7 +145,7 @@ async fn test_memory_limit_enforcement() {
 }
 
 #[tokio::test]
-async fn test_invalid_memory_limit() {
+async fn test_invalid_memory_limit_when_unit_is_invalid() {
     let ctx = SessionContext::new();
 
     let result = ctx
@@ -149,7 +154,194 @@ async fn test_invalid_memory_limit() {
 
     assert!(result.is_err());
     let error_message = result.unwrap_err().to_string();
-    assert!(error_message.contains("Unsupported unit 'X'"));
+    assert!(
+        error_message
+            .contains("Unsupported unit 'X' in 'datafusion.runtime.memory_limit'")
+            && error_message.contains("Unit must be one of: 'K', 'M', 'G'")
+    );
+}
+
+#[tokio::test]
+async fn test_invalid_memory_limit_when_limit_is_not_numeric() {
+    let ctx = SessionContext::new();
+
+    let result = ctx
+        .sql("SET datafusion.runtime.memory_limit = 'invalid_memory_limit'")
+        .await;
+
+    assert!(result.is_err());
+    let error_message = result.unwrap_err().to_string();
+    assert!(error_message.contains(
+        "Failed to parse number from 'datafusion.runtime.memory_limit', limit 'invalid_memory_limit'"
+    ));
+}
+
+#[tokio::test]
+async fn test_max_temp_directory_size_enforcement() {
+    let ctx = SessionContext::new();
+
+    ctx.sql("SET datafusion.runtime.memory_limit = '1M'")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    ctx.sql("SET datafusion.execution.sort_spill_reservation_bytes = 0")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    ctx.sql("SET datafusion.runtime.max_temp_directory_size = '0K'")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let query = "select * from generate_series(1,100000) as t1(v1) order by v1;";
+    let result = ctx.sql(query).await.unwrap().collect().await;
+
+    assert!(
+        result.is_err(),
+        "Should fail due to max temp directory size limit"
+    );
+
+    ctx.sql("SET datafusion.runtime.max_temp_directory_size = '1M'")
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+
+    let result = ctx.sql(query).await.unwrap().collect().await;
+
+    assert!(
+        result.is_ok(),
+        "Should not fail due to max temp directory size limit"
+    );
+}
+
+#[tokio::test]
+async fn test_test_metadata_cache_limit() {
+    let ctx = SessionContext::new();
+
+    let update_limit = async |ctx: &SessionContext, limit: &str| {
+        ctx.sql(
+            format!("SET datafusion.runtime.metadata_cache_limit = '{limit}'").as_str(),
+        )
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    };
+
+    let get_limit = |ctx: &SessionContext| -> usize {
+        ctx.task_ctx()
+            .runtime_env()
+            .cache_manager
+            .get_file_metadata_cache()
+            .cache_limit()
+    };
+
+    update_limit(&ctx, "100M").await;
+    assert_eq!(get_limit(&ctx), 100 * 1024 * 1024);
+
+    update_limit(&ctx, "2G").await;
+    assert_eq!(get_limit(&ctx), 2 * 1024 * 1024 * 1024);
+
+    update_limit(&ctx, "123K").await;
+    assert_eq!(get_limit(&ctx), 123 * 1024);
+}
+
+#[tokio::test]
+async fn test_list_files_cache_limit() {
+    let list_files_cache = Arc::new(DefaultListFilesCache::default());
+
+    let rt = RuntimeEnvBuilder::new()
+        .with_cache_manager(
+            CacheManagerConfig::default().with_list_files_cache(Some(list_files_cache)),
+        )
+        .build_arc()
+        .unwrap();
+
+    let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt);
+
+    let update_limit = async |ctx: &SessionContext, limit: &str| {
+        ctx.sql(
+            format!("SET datafusion.runtime.list_files_cache_limit = '{limit}'").as_str(),
+        )
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    };
+
+    let get_limit = |ctx: &SessionContext| -> usize {
+        ctx.task_ctx()
+            .runtime_env()
+            .cache_manager
+            .get_list_files_cache()
+            .unwrap()
+            .cache_limit()
+    };
+
+    update_limit(&ctx, "100M").await;
+    assert_eq!(get_limit(&ctx), 100 * 1024 * 1024);
+
+    update_limit(&ctx, "2G").await;
+    assert_eq!(get_limit(&ctx), 2 * 1024 * 1024 * 1024);
+
+    update_limit(&ctx, "123K").await;
+    assert_eq!(get_limit(&ctx), 123 * 1024);
+}
+
+#[tokio::test]
+async fn test_list_files_cache_ttl() {
+    let list_files_cache = Arc::new(DefaultListFilesCache::default());
+
+    let rt = RuntimeEnvBuilder::new()
+        .with_cache_manager(
+            CacheManagerConfig::default().with_list_files_cache(Some(list_files_cache)),
+        )
+        .build_arc()
+        .unwrap();
+
+    let ctx = SessionContext::new_with_config_rt(SessionConfig::default(), rt);
+
+    let update_limit = async |ctx: &SessionContext, limit: &str| {
+        ctx.sql(
+            format!("SET datafusion.runtime.list_files_cache_ttl = '{limit}'").as_str(),
+        )
+        .await
+        .unwrap()
+        .collect()
+        .await
+        .unwrap();
+    };
+
+    let get_limit = |ctx: &SessionContext| -> Duration {
+        ctx.task_ctx()
+            .runtime_env()
+            .cache_manager
+            .get_list_files_cache()
+            .unwrap()
+            .cache_ttl()
+            .unwrap()
+    };
+
+    update_limit(&ctx, "1m").await;
+    assert_eq!(get_limit(&ctx), Duration::from_secs(60));
+
+    update_limit(&ctx, "30s").await;
+    assert_eq!(get_limit(&ctx), Duration::from_secs(30));
+
+    update_limit(&ctx, "1m30s").await;
+    assert_eq!(get_limit(&ctx), Duration::from_secs(90));
 }
 
 #[tokio::test]
diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs
index f874dd7c08428..6126793145efd 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -15,8 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashMap;
+
 use super::*;
-use datafusion_common::ScalarValue;
+use datafusion_common::{ParamValues, ScalarValue, metadata::ScalarAndMetadata};
+use insta::assert_snapshot;
 
 #[tokio::test]
 async fn test_list_query_parameters() -> Result<()> {
@@ -217,10 +220,12 @@ async fn test_parameter_invalid_types() -> Result<()> {
         .with_param_values(vec![ScalarValue::from(4_i32)])?
         .collect()
         .await;
-    assert_eq!(
-        results.unwrap_err().strip_backtrace(),
-        "type_coercion\ncaused by\nError during planning: Cannot infer common argument type for comparison operation List(Field { name: \"item\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) = Int32"
-);
+    assert_snapshot!(results.unwrap_err().strip_backtrace(),
+        @r"
+    type_coercion
+    caused by
+    Error during planning: Cannot infer common argument type for comparison operation List(Int32) = Int32
+    ");
     Ok(())
 }
 
@@ -314,6 +319,47 @@ async fn test_named_parameter_not_bound() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn test_query_parameters_with_metadata() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    let df = ctx.sql("SELECT $1, $2").await.unwrap();
+
+    let metadata1 = HashMap::from([("some_key".to_string(), "some_value".to_string())]);
+    let metadata2 =
+        HashMap::from([("some_other_key".to_string(), "some_other_value".to_string())]);
+
+    let df_with_params_replaced = df
+        .with_param_values(ParamValues::List(vec![
+            ScalarAndMetadata::new(
+                ScalarValue::UInt32(Some(1)),
+                Some(metadata1.clone().into()),
+            ),
+            ScalarAndMetadata::new(
+                ScalarValue::Utf8(Some("two".to_string())),
+                Some(metadata2.clone().into()),
+            ),
+        ]))
+        .unwrap();
+
+    let schema = df_with_params_replaced.schema();
+    assert_eq!(schema.field(0).data_type(), &DataType::UInt32);
+    assert_eq!(schema.field(0).metadata(), &metadata1);
+    assert_eq!(schema.field(1).data_type(), &DataType::Utf8);
+    assert_eq!(schema.field(1).metadata(), &metadata2);
+
+    let batches = df_with_params_replaced.collect().await.unwrap();
+    assert_snapshot!(batches_to_sort_string(&batches), @r"
+    +----+-----+
+    | $1 | $2  |
+    +----+-----+
+    | 1  | two |
+    +----+-----+
+    ");
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_version_function() {
     let expected_version = format!(
@@ -343,3 +389,45 @@ async fn test_version_function() {
 
     assert_eq!(version.value(0), expected_version);
 }
+
+/// Regression test for https://github.com/apache/datafusion/issues/17513
+/// See https://github.com/apache/datafusion/pull/17520
+#[tokio::test]
+async fn test_select_no_projection() -> Result<()> {
+    let tmp_dir = TempDir::new()?;
+    // `create_ctx_with_partition` creates 10 rows per partition and we chose 1 partition
+    let ctx = create_ctx_with_partition(&tmp_dir, 1).await?;
+
+    let results = ctx.sql("SELECT FROM test").await?.collect().await?;
+    // We should get all of the rows, just without any columns
+    let total_rows: usize = results.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(total_rows, 10);
+    // Check that none of the batches have any columns
+    for batch in &results {
+        assert_eq!(batch.num_columns(), 0);
+    }
+    // Sanity check the output, should be just empty columns
+    assert_snapshot!(batches_to_sort_string(&results), @r"
+    ++
+    ++
+    ++
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_select_cast_date_literal_to_timestamp_overflow() -> Result<()> {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .sql("SELECT CAST(DATE '9999-12-31' AS TIMESTAMP)")
+        .await?
+        .collect()
+        .await
+        .unwrap_err();
+
+    assert_contains!(
+        err.to_string(),
+        "Cannot cast Date32 value 2932896 to Timestamp(ns): converted value exceeds the representable i64 range"
+    );
+    Ok(())
+}
diff --git a/datafusion/core/tests/sql/sql_api.rs b/datafusion/core/tests/sql/sql_api.rs
index ec086bcc50c76..b87afd27ddea7 100644
--- a/datafusion/core/tests/sql/sql_api.rs
+++ b/datafusion/core/tests/sql/sql_api.rs
@@ -84,8 +84,8 @@ async fn dml_output_schema() {
     ctx.sql("CREATE TABLE test (x int)").await.unwrap();
     let sql = "INSERT INTO test VALUES (1)";
     let df = ctx.sql(sql).await.unwrap();
-    let count_schema = Schema::new(vec![Field::new("count", DataType::UInt64, false)]);
-    assert_eq!(Schema::from(df.schema()), count_schema);
+    let count_schema = &Schema::new(vec![Field::new("count", DataType::UInt64, false)]);
+    assert_eq!(df.schema().as_arrow(), count_schema);
 }
 
 #[tokio::test]
diff --git a/datafusion/core/tests/sql/unparser.rs b/datafusion/core/tests/sql/unparser.rs
new file mode 100644
index 0000000000000..e9bad71843ff2
--- /dev/null
+++ b/datafusion/core/tests/sql/unparser.rs
@@ -0,0 +1,456 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! SQL Unparser Roundtrip Integration Tests
+//!
+//! This module tests the [`Unparser`] by running queries through a complete roundtrip:
+//! the original SQL is parsed into a logical plan, unparsed back to SQL, then that
+//! generated SQL is parsed and executed. The results are compared to verify semantic
+//! equivalence.
+//!
+//! ## Test Strategy
+//!
+//! Uses real-world benchmark queries (TPC-H and Clickbench) to validate that:
+//! 1. The unparser produces syntactically valid SQL
+//! 2. The unparsed SQL is semantically equivalent (produces identical results)
+//!
+//! ## Query Suites
+//!
+//! - **TPC-H**: Standard decision-support benchmark with 22 complex analytical queries
+//! - **Clickbench**: Web analytics benchmark with 43 queries against a denormalized schema
+//!
+//! [`Unparser`]: datafusion_sql::unparser::Unparser
+
+use std::fs::ReadDir;
+use std::future::Future;
+
+use arrow::array::RecordBatch;
+use datafusion::common::Result;
+use datafusion::prelude::{ParquetReadOptions, SessionContext};
+use datafusion_common::Column;
+use datafusion_expr::Expr;
+use datafusion_sql::unparser::Unparser;
+use datafusion_sql::unparser::dialect::DefaultDialect;
+use itertools::Itertools;
+use recursive::{set_minimum_stack_size, set_stack_allocation_size};
+
+/// Paths to benchmark query files (supports running from repo root or different working directories).
+const BENCHMARK_PATHS: &[&str] = &["../../benchmarks/", "./benchmarks/"];
+
+/// Reads all `.sql` files from a directory and converts them to test queries.
+///
+/// Skips files that:
+/// - Are not regular files
+/// - Don't have a `.sql` extension
+/// - Contain multiple SQL statements (indicated by `;\n`)
+///
+/// Multi-statement files are skipped because the unparser doesn't support
+/// DML statements like `CREATE VIEW` that appear in multi-statement Clickbench queries.
+fn iterate_queries(dir: ReadDir) -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for entry in dir.flatten() {
+        let Ok(file_type) = entry.file_type() else {
+            continue;
+        };
+        if !file_type.is_file() {
+            continue;
+        }
+        let path = entry.path();
+        let Some(ext) = path.extension() else {
+            continue;
+        };
+        if ext != "sql" {
+            continue;
+        }
+        let name = path.file_stem().unwrap().to_string_lossy().to_string();
+        if let Ok(mut contents) = std::fs::read_to_string(entry.path()) {
+            // If the query contains ;\n it has DML statements like CREATE VIEW which the unparser doesn't support; skip it
+            contents = contents.trim().to_string();
+            if contents.contains(";\n") {
+                println!("Skipping query with multiple statements: {name}");
+                continue;
+            }
+            queries.push(TestQuery {
+                sql: contents,
+                name,
+            });
+        }
+    }
+    queries
+}
+
+/// A SQL query loaded from a benchmark file for roundtrip testing.
+///
+/// Each query is identified by its filename (without extension) and contains
+/// the full SQL text to be tested.
+struct TestQuery {
+    /// The SQL query text to test.
+    sql: String,
+    /// The query identifier (typically the filename without .sql extension).
+    name: String,
+}
+
+/// Collect SQL for Clickbench queries.
+fn clickbench_queries() -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for path in BENCHMARK_PATHS {
+        let dir = format!("{path}queries/clickbench/queries/");
+        println!("Reading Clickbench queries from {dir}");
+        if let Ok(dir) = std::fs::read_dir(dir) {
+            let read = iterate_queries(dir);
+            println!("Found {} Clickbench queries", read.len());
+            queries.extend(read);
+        }
+    }
+    queries.sort_unstable_by_key(|q| {
+        q.name
+            .split('q')
+            .next_back()
+            .and_then(|num| num.parse::<u32>().ok())
+    });
+    queries
+}
+
+/// Collect SQL for TPC-H queries.
+fn tpch_queries() -> Vec<TestQuery> {
+    let mut queries = vec![];
+    for path in BENCHMARK_PATHS {
+        let dir = format!("{path}queries/");
+        println!("Reading TPC-H queries from {dir}");
+        if let Ok(dir) = std::fs::read_dir(dir) {
+            let read = iterate_queries(dir);
+            queries.extend(read);
+        }
+    }
+    println!("Total TPC-H queries found: {}", queries.len());
+    queries.sort_unstable_by_key(|q| q.name.clone());
+    queries
+}
+
+/// Create a new SessionContext for testing that has all Clickbench tables registered.
+async fn clickbench_test_context() -> Result<SessionContext> {
+    let ctx = SessionContext::new();
+    ctx.register_parquet(
+        "hits",
+        "tests/data/clickbench_hits_10.parquet",
+        ParquetReadOptions::default(),
+    )
+    .await?;
+    // Sanity check we found the table by querying it's schema, it should not be empty
+    // Otherwise if the path is wrong the tests will all fail in confusing ways
+    let df = ctx.sql("SELECT * FROM hits LIMIT 1").await?;
+    assert!(
+        !df.schema().fields().is_empty(),
+        "Clickbench 'hits' table not registered correctly"
+    );
+    Ok(ctx)
+}
+
+/// Create a new SessionContext for testing that has all TPC-H tables registered.
+async fn tpch_test_context() -> Result<SessionContext> {
+    let ctx = SessionContext::new();
+    let data_dir = "tests/data/";
+    // All tables have the pattern "tpch_<table_name>_small.parquet"
+    for table in [
+        "customer", "lineitem", "nation", "orders", "part", "partsupp", "region",
+        "supplier",
+    ] {
+        let path = format!("{data_dir}tpch_{table}_small.parquet");
+        ctx.register_parquet(table, &path, ParquetReadOptions::default())
+            .await?;
+        // Sanity check we found the table by querying it's schema, it should not be empty
+        // Otherwise if the path is wrong the tests will all fail in confusing ways
+        let df = ctx.sql(&format!("SELECT * FROM {table} LIMIT 1")).await?;
+        assert!(
+            !df.schema().fields().is_empty(),
+            "TPC-H '{table}' table not registered correctly"
+        );
+    }
+    Ok(ctx)
+}
+
+/// Sorts record batches by all columns for deterministic comparison.
+///
+/// When comparing query results, we need a canonical ordering so that
+/// semantically equivalent results compare as equal. This function sorts
+/// by all columns in the schema to achieve that.
+async fn sort_batches(
+    ctx: &SessionContext,
+    batches: Vec<RecordBatch>,
+) -> Result<Vec<RecordBatch>> {
+    let mut df = ctx.read_batches(batches)?;
+    let schema = df.schema().as_arrow().clone();
+    let sort_exprs = schema
+        .fields()
+        .iter()
+        // Use Column directly, col() causes the column names to be normalized to lowercase
+        .map(|f| {
+            Expr::Column(Column::new_unqualified(f.name().to_string())).sort(true, false)
+        })
+        .collect_vec();
+    if !sort_exprs.is_empty() {
+        df = df.sort(sort_exprs)?;
+    }
+    df.collect().await
+}
+
+/// The outcome of running a single roundtrip test.
+///
+/// A successful test produces [`TestCaseResult::Success`].
+/// All other variants capture different failure modes with enough context to diagnose the issue.
+enum TestCaseResult {
+    /// The unparsed SQL produced identical results to the original.
+    Success,
+
+    /// Both queries executed but produced different results.
+    ///
+    /// This indicates a semantic bug in the unparser where the generated SQL
+    /// has different meaning than the original.
+    ResultsMismatch { original: String, unparsed: String },
+
+    /// The unparser failed to convert the logical plan to SQL.
+    ///
+    /// This may indicate an unsupported SQL feature or a bug in the unparser.
+    UnparseError { original: String, error: String },
+
+    /// The original SQL failed to execute.
+    ///
+    /// This indicates a problem with the test setup (missing tables,
+    /// invalid test data) rather than an unparser issue.
+    ExecutionError { original: String, error: String },
+
+    /// The unparsed SQL failed to execute, even though the original succeeded.
+    ///
+    /// This indicates the unparser generated syntactically invalid SQL or SQL
+    /// that references non-existent columns/tables.
+    UnparsedExecutionError {
+        original: String,
+        unparsed: String,
+        error: String,
+    },
+}
+
+impl TestCaseResult {
+    /// Returns true if the test case represents a failure
+    /// (anything other than [`TestCaseResult::Success`]).
+    fn is_failure(&self) -> bool {
+        !matches!(self, TestCaseResult::Success)
+    }
+
+    /// Formats a detailed error message for the test case into a string.
+    fn format_error(&self, name: &str) -> String {
+        match self {
+            TestCaseResult::Success => String::new(),
+            TestCaseResult::ResultsMismatch { original, unparsed } => {
+                format!(
+                    "Results mismatch for {name}.\nOriginal SQL:\n{original}\n\nUnparsed SQL:\n{unparsed}"
+                )
+            }
+            TestCaseResult::UnparseError { original, error } => {
+                format!("Unparse error for {name}: {error}\nOriginal SQL:\n{original}")
+            }
+            TestCaseResult::ExecutionError { original, error } => {
+                format!("Execution error for {name}: {error}\nOriginal SQL:\n{original}")
+            }
+            TestCaseResult::UnparsedExecutionError {
+                original,
+                unparsed,
+                error,
+            } => {
+                format!(
+                    "Unparsed execution error for {name}: {error}\nOriginal SQL:\n{original}\n\nUnparsed SQL:\n{unparsed}"
+                )
+            }
+        }
+    }
+}
+
+/// Executes a roundtrip test for a single SQL query.
+///
+/// This is the core test logic that:
+/// 1. Parses the original SQL and creates a logical plan
+/// 2. Unparses the logical plan back to SQL
+/// 3. Executes both the original and unparsed queries
+/// 4. Compares the results (sorting if the query has no ORDER BY)
+///
+/// This always uses [`DefaultDialect`] for unparsing.
+///
+/// # Arguments
+///
+/// * `ctx` - Session context with tables registered
+/// * `original` - The original SQL query to test
+///
+/// # Returns
+///
+/// A [`TestCaseResult`] indicating success or the specific failure mode.
+async fn collect_results(ctx: &SessionContext, original: &str) -> TestCaseResult {
+    let unparser = Unparser::new(&DefaultDialect {});
+
+    // Parse and create logical plan from original SQL
+    let df = match ctx.sql(original).await {
+        Ok(df) => df,
+        Err(e) => {
+            return TestCaseResult::ExecutionError {
+                original: original.to_string(),
+                error: e.to_string(),
+            };
+        }
+    };
+
+    // Unparse the logical plan back to SQL
+    let unparsed = match unparser.plan_to_sql(df.logical_plan()) {
+        Ok(sql) => format!("{sql:#}"),
+        Err(e) => {
+            return TestCaseResult::UnparseError {
+                original: original.to_string(),
+                error: e.to_string(),
+            };
+        }
+    };
+
+    // Collect results from original query
+    let mut expected = match df.collect().await {
+        Ok(batches) => batches,
+        Err(e) => {
+            return TestCaseResult::ExecutionError {
+                original: original.to_string(),
+                error: e.to_string(),
+            };
+        }
+    };
+
+    // Parse and execute the unparsed SQL
+    let actual_df = match ctx.sql(&unparsed).await {
+        Ok(df) => df,
+        Err(e) => {
+            return TestCaseResult::UnparsedExecutionError {
+                original: original.to_string(),
+                unparsed,
+                error: e.to_string(),
+            };
+        }
+    };
+
+    // Collect results from unparsed query
+    let mut actual = match actual_df.collect().await {
+        Ok(batches) => batches,
+        Err(e) => {
+            return TestCaseResult::UnparsedExecutionError {
+                original: original.to_string(),
+                unparsed,
+                error: e.to_string(),
+            };
+        }
+    };
+
+    // Always sort for deterministic comparison — even "sorted" results can have
+    // tied rows in different order between original and unparsed SQL.
+    {
+        expected = match sort_batches(ctx, expected).await {
+            Ok(batches) => batches,
+            Err(e) => {
+                return TestCaseResult::ExecutionError {
+                    original: original.to_string(),
+                    error: format!("Failed to sort expected results: {e}"),
+                };
+            }
+        };
+        actual = match sort_batches(ctx, actual).await {
+            Ok(batches) => batches,
+            Err(e) => {
+                return TestCaseResult::UnparsedExecutionError {
+                    original: original.to_string(),
+                    unparsed,
+                    error: format!("Failed to sort actual results: {e}"),
+                };
+            }
+        };
+    }
+
+    if expected != actual {
+        TestCaseResult::ResultsMismatch {
+            original: original.to_string(),
+            unparsed,
+        }
+    } else {
+        TestCaseResult::Success
+    }
+}
+
+/// Runs roundtrip tests for a collection of queries and reports results.
+///
+/// Iterates through all queries, running each through [`collect_results`].
+/// Prints colored status (green checkmark for success, red X for failure)
+/// and panics at the end if any tests failed, with detailed error messages.
+///
+/// # Type Parameters
+///
+/// * `F` - Factory function that creates fresh session contexts
+/// * `Fut` - Future type returned by the context factory
+///
+/// # Panics
+///
+/// Panics if any query fails the roundtrip test, displaying all failures.
+async fn run_roundtrip_tests<F, Fut>(
+    suite_name: &str,
+    queries: Vec<TestQuery>,
+    create_context: F,
+) where
+    F: Fn() -> Fut,
+    Fut: Future<Output = Result<SessionContext>>,
+{
+    let mut errors: Vec<String> = vec![];
+    for sql in queries {
+        let ctx = match create_context().await {
+            Ok(ctx) => ctx,
+            Err(e) => {
+                println!("\x1b[31m✗\x1b[0m {} query: {}", suite_name, sql.name);
+                errors.push(format!("Failed to create context for {}: {}", sql.name, e));
+                continue;
+            }
+        };
+        let result = collect_results(&ctx, &sql.sql).await;
+        if result.is_failure() {
+            println!("\x1b[31m✗\x1b[0m {} query: {}", suite_name, sql.name);
+            errors.push(result.format_error(&sql.name));
+        } else {
+            println!("\x1b[32m✓\x1b[0m {} query: {}", suite_name, sql.name);
+        }
+    }
+    if !errors.is_empty() {
+        panic!(
+            "{} {} test(s) failed:\n\n{}",
+            errors.len(),
+            suite_name,
+            errors.join("\n\n---\n\n")
+        );
+    }
+}
+
+#[tokio::test]
+async fn test_clickbench_unparser_roundtrip() {
+    run_roundtrip_tests("Clickbench", clickbench_queries(), clickbench_test_context)
+        .await;
+}
+
+#[tokio::test]
+async fn test_tpch_unparser_roundtrip() {
+    // Grow stacker segments earlier to avoid deep unparser recursion overflow in q20.
+    set_minimum_stack_size(512 * 1024);
+    set_stack_allocation_size(8 * 1024 * 1024);
+    run_roundtrip_tests("TPC-H", tpch_queries(), tpch_test_context).await;
+}
diff --git a/datafusion/core/tests/test_adapter_updated.rs b/datafusion/core/tests/test_adapter_updated.rs
deleted file mode 100644
index c85b9a3447361..0000000000000
--- a/datafusion/core/tests/test_adapter_updated.rs
+++ /dev/null
@@ -1,214 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use arrow::record_batch::RecordBatch;
-use datafusion_common::{ColumnStatistics, DataFusionError, Result, Statistics};
-use datafusion_datasource::file::FileSource;
-use datafusion_datasource::file_scan_config::FileScanConfig;
-use datafusion_datasource::file_stream::FileOpener;
-use datafusion_datasource::schema_adapter::{
-    SchemaAdapter, SchemaAdapterFactory, SchemaMapper,
-};
-use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-use object_store::ObjectStore;
-use std::any::Any;
-use std::fmt::Debug;
-use std::sync::Arc;
-
-/// A test source for testing schema adapters
-#[derive(Debug, Clone)]
-struct TestSource {
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
-}
-
-impl TestSource {
-    fn new() -> Self {
-        Self {
-            schema_adapter_factory: None,
-        }
-    }
-}
-
-impl FileSource for TestSource {
-    fn file_type(&self) -> &str {
-        "test"
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn create_file_opener(
-        &self,
-        _store: Arc<dyn ObjectStore>,
-        _conf: &FileScanConfig,
-        _index: usize,
-    ) -> Arc<dyn FileOpener> {
-        unimplemented!("Not needed for this test")
-    }
-
-    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_projection(&self, _projection: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn with_statistics(&self, _statistics: Statistics) -> Arc<dyn FileSource> {
-        Arc::new(self.clone())
-    }
-
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        unimplemented!("Not needed for this test")
-    }
-
-    fn statistics(&self) -> Result<Statistics, DataFusionError> {
-        Ok(Statistics::default())
-    }
-
-    fn with_schema_adapter_factory(
-        &self,
-        schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
-    ) -> Arc<dyn FileSource> {
-        Arc::new(Self {
-            schema_adapter_factory: Some(schema_adapter_factory),
-        })
-    }
-
-    fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
-        self.schema_adapter_factory.clone()
-    }
-}
-
-/// A test schema adapter factory
-#[derive(Debug)]
-struct TestSchemaAdapterFactory {}
-
-impl SchemaAdapterFactory for TestSchemaAdapterFactory {
-    fn create(
-        &self,
-        projected_table_schema: SchemaRef,
-        _table_schema: SchemaRef,
-    ) -> Box<dyn SchemaAdapter> {
-        Box::new(TestSchemaAdapter {
-            table_schema: projected_table_schema,
-        })
-    }
-}
-
-/// A test schema adapter implementation
-#[derive(Debug)]
-struct TestSchemaAdapter {
-    table_schema: SchemaRef,
-}
-
-impl SchemaAdapter for TestSchemaAdapter {
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.table_schema.field(index);
-        file_schema.fields.find(field.name()).map(|(i, _)| i)
-    }
-
-    fn map_schema(
-        &self,
-        file_schema: &Schema,
-    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        for (file_idx, file_field) in file_schema.fields().iter().enumerate() {
-            if self.table_schema.fields().find(file_field.name()).is_some() {
-                projection.push(file_idx);
-            }
-        }
-
-        Ok((Arc::new(TestSchemaMapping {}), projection))
-    }
-}
-
-/// A test schema mapper implementation
-#[derive(Debug)]
-struct TestSchemaMapping {}
-
-impl SchemaMapper for TestSchemaMapping {
-    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-        // For testing, just return the original batch
-        Ok(batch)
-    }
-
-    fn map_column_statistics(
-        &self,
-        stats: &[ColumnStatistics],
-    ) -> Result<Vec<ColumnStatistics>> {
-        // For testing, just return the input statistics
-        Ok(stats.to_vec())
-    }
-}
-
-#[test]
-fn test_schema_adapter() {
-    // This test verifies the functionality of the SchemaAdapter and SchemaAdapterFactory
-    // components used in DataFusion's file sources.
-    //
-    // The test specifically checks:
-    // 1. Creating and attaching a schema adapter factory to a file source
-    // 2. Creating a schema adapter using the factory
-    // 3. The schema adapter's ability to map column indices between a table schema and a file schema
-    // 4. The schema adapter's ability to create a projection that selects only the columns
-    //    from the file schema that are present in the table schema
-    //
-    // Schema adapters are used when the schema of data in files doesn't exactly match
-    // the schema expected by the query engine, allowing for field mapping and data transformation.
-
-    // Create a test schema
-    let table_schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-    ]));
-
-    // Create a file schema
-    let file_schema = Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-        Field::new("extra", DataType::Int64, true),
-    ]);
-
-    // Create a TestSource
-    let source = TestSource::new();
-    assert!(source.schema_adapter_factory().is_none());
-
-    // Add a schema adapter factory
-    let factory = Arc::new(TestSchemaAdapterFactory {});
-    let source_with_adapter = source.with_schema_adapter_factory(factory);
-    assert!(source_with_adapter.schema_adapter_factory().is_some());
-
-    // Create a schema adapter
-    let adapter_factory = source_with_adapter.schema_adapter_factory().unwrap();
-    let adapter =
-        adapter_factory.create(Arc::clone(&table_schema), Arc::clone(&table_schema));
-
-    // Test mapping column index
-    assert_eq!(adapter.map_column_index(0, &file_schema), Some(0));
-    assert_eq!(adapter.map_column_index(1, &file_schema), Some(1));
-
-    // Test creating schema mapper
-    let (_mapper, projection) = adapter.map_schema(&file_schema).unwrap();
-    assert_eq!(projection, vec![0, 1]);
-}
diff --git a/datafusion/core/tests/tpc-ds/30.sql b/datafusion/core/tests/tpc-ds/30.sql
index 78f34b807e5b5..80624f49006a9 100644
--- a/datafusion/core/tests/tpc-ds/30.sql
+++ b/datafusion/core/tests/tpc-ds/30.sql
@@ -14,7 +14,7 @@ with customer_total_return as
          ,ca_state)
   select  c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
        ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
-       ,c_last_review_date_sk,ctr_total_return
+       ,c_last_review_date,ctr_total_return
  from customer_total_return ctr1
      ,customer_address
      ,customer
@@ -26,7 +26,7 @@ with customer_total_return as
        and ctr1.ctr_customer_sk = c_customer_sk
  order by c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
                   ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
-                  ,c_last_review_date_sk,ctr_total_return
+                  ,c_last_review_date,ctr_total_return
 limit 100;
 
 
diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs
index 252d76d0f9d92..3ad74962bc2c0 100644
--- a/datafusion/core/tests/tpcds_planning.rs
+++ b/datafusion/core/tests/tpcds_planning.rs
@@ -1052,9 +1052,12 @@ async fn regression_test(query_no: u8, create_physical: bool) -> Result<()> {
     for sql in &sql {
         let df = ctx.sql(sql).await?;
         let (state, plan) = df.into_parts();
-        let plan = state.optimize(&plan)?;
         if create_physical {
             let _ = state.create_physical_plan(&plan).await?;
+        } else {
+            // Run the logical optimizer even if we are not creating the physical plan
+            // to ensure it will properly succeed
+            let _ = state.optimize(&plan)?;
         }
     }
 
diff --git a/datafusion/core/tests/tracing/asserting_tracer.rs b/datafusion/core/tests/tracing/asserting_tracer.rs
index 292e066e5f121..700f9f3308466 100644
--- a/datafusion/core/tests/tracing/asserting_tracer.rs
+++ b/datafusion/core/tests/tracing/asserting_tracer.rs
@@ -21,7 +21,7 @@ use std::ops::Deref;
 use std::sync::{Arc, LazyLock};
 
 use datafusion_common::{HashMap, HashSet};
-use datafusion_common_runtime::{set_join_set_tracer, JoinSetTracer};
+use datafusion_common_runtime::{JoinSetTracer, set_join_set_tracer};
 use futures::future::BoxFuture;
 use tokio::sync::{Mutex, MutexGuard};
 
diff --git a/datafusion/core/tests/tracing/mod.rs b/datafusion/core/tests/tracing/mod.rs
index df8a28c021d1c..0b66a49eea9f4 100644
--- a/datafusion/core/tests/tracing/mod.rs
+++ b/datafusion/core/tests/tracing/mod.rs
@@ -76,7 +76,13 @@ async fn run_query() {
     let ctx = SessionContext::new();
 
     // Get the test data directory
-    let test_data = parquet_test_data();
+    let test_data = if cfg!(target_os = "windows") {
+        // Prefix Windows paths with "/", since they start with <Drive>:/ but the URI should be
+        // test:///C:/... (https://datatracker.ietf.org/doc/html/rfc8089#appendix-E.2)
+        format!("/{}", parquet_test_data())
+    } else {
+        parquet_test_data()
+    };
 
     // Define a Parquet file format with pruning enabled
     let file_format = ParquetFormat::default().with_enable_pruning(true);
diff --git a/datafusion/core/tests/tracing/traceable_object_store.rs b/datafusion/core/tests/tracing/traceable_object_store.rs
index dfcafc3a63da1..71a61dbf8772a 100644
--- a/datafusion/core/tests/tracing/traceable_object_store.rs
+++ b/datafusion/core/tests/tracing/traceable_object_store.rs
@@ -18,10 +18,11 @@
 //! Object store implementation used for testing
 
 use crate::tracing::asserting_tracer::assert_traceability;
+use futures::StreamExt;
 use futures::stream::BoxStream;
 use object_store::{
-    path::Path, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
-    ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult,
+    CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
+    ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
 };
 use std::fmt::{Debug, Display, Formatter};
 use std::sync::Arc;
@@ -68,7 +69,7 @@ impl ObjectStore for TraceableObjectStore {
     async fn put_multipart_opts(
         &self,
         location: &Path,
-        opts: PutMultipartOpts,
+        opts: PutMultipartOptions,
     ) -> object_store::Result<Box<dyn MultipartUpload>> {
         assert_traceability().await;
         self.inner.put_multipart_opts(location, opts).await
@@ -83,14 +84,17 @@ impl ObjectStore for TraceableObjectStore {
         self.inner.get_opts(location, options).await
     }
 
-    async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
-        assert_traceability().await;
-        self.inner.head(location).await
-    }
-
-    async fn delete(&self, location: &Path) -> object_store::Result<()> {
-        assert_traceability().await;
-        self.inner.delete(location).await
+    fn delete_stream(
+        &self,
+        locations: BoxStream<'static, object_store::Result<Path>>,
+    ) -> BoxStream<'static, object_store::Result<Path>> {
+        self.inner
+            .delete_stream(locations)
+            .then(|res| async {
+                futures::executor::block_on(assert_traceability());
+                res
+            })
+            .boxed()
     }
 
     fn list(
@@ -109,17 +113,13 @@ impl ObjectStore for TraceableObjectStore {
         self.inner.list_with_delimiter(prefix).await
     }
 
-    async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
-        assert_traceability().await;
-        self.inner.copy(from, to).await
-    }
-
-    async fn copy_if_not_exists(
+    async fn copy_opts(
         &self,
         from: &Path,
         to: &Path,
+        options: CopyOptions,
     ) -> object_store::Result<()> {
         assert_traceability().await;
-        self.inner.copy_if_not_exists(from, to).await
+        self.inner.copy_opts(from, to, options).await
     }
 }
diff --git a/datafusion/core/tests/user_defined/expr_planner.rs b/datafusion/core/tests/user_defined/expr_planner.rs
index 1fc6d14c5b229..c5e5af731359f 100644
--- a/datafusion/core/tests/user_defined/expr_planner.rs
+++ b/datafusion/core/tests/user_defined/expr_planner.rs
@@ -26,9 +26,9 @@ use datafusion::logical_expr::Operator;
 use datafusion::prelude::*;
 use datafusion::sql::sqlparser::ast::BinaryOperator;
 use datafusion_common::ScalarValue;
+use datafusion_expr::BinaryExpr;
 use datafusion_expr::expr::Alias;
 use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawBinaryExpr};
-use datafusion_expr::BinaryExpr;
 
 #[derive(Debug)]
 struct MyCustomPlanner;
@@ -56,7 +56,7 @@ impl ExprPlanner for MyCustomPlanner {
             }
             BinaryOperator::Question => {
                 Ok(PlannerResult::Planned(Expr::Alias(Alias::new(
-                    Expr::Literal(ScalarValue::Boolean(Some(true))),
+                    Expr::Literal(ScalarValue::Boolean(Some(true)), None),
                     None::<&str>,
                     format!("{} ? {}", expr.left, expr.right),
                 ))))
@@ -77,25 +77,25 @@ async fn plan_and_collect(sql: &str) -> Result<Vec<RecordBatch>> {
 #[tokio::test]
 async fn test_custom_operators_arrow() {
     let actual = plan_and_collect("select 'foo'->'bar';").await.unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r#"
     +----------------------------+
     | Utf8("foo") || Utf8("bar") |
     +----------------------------+
     | foobar                     |
     +----------------------------+
-    "###);
+    "#);
 }
 
 #[tokio::test]
 async fn test_custom_operators_long_arrow() {
     let actual = plan_and_collect("select 1->>2;").await.unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +---------------------+
     | Int64(1) + Int64(2) |
     +---------------------+
     | 3                   |
     +---------------------+
-    "###);
+    ");
 }
 
 #[tokio::test]
@@ -103,13 +103,13 @@ async fn test_question_select() {
     let actual = plan_and_collect("select a ? 2 from (select 1 as a);")
         .await
         .unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +--------------+
     | a ? Int64(2) |
     +--------------+
     | true         |
     +--------------+
-    "###);
+    ");
 }
 
 #[tokio::test]
@@ -117,11 +117,11 @@ async fn test_question_filter() {
     let actual = plan_and_collect("select a from (select 1 as a) where a ? 2;")
         .await
         .unwrap();
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +---+
     | a |
     +---+
     | 1 |
     +---+
-    "###);
+    ");
 }
diff --git a/datafusion/core/tests/user_defined/insert_operation.rs b/datafusion/core/tests/user_defined/insert_operation.rs
index 12f700ce572ba..2a2aed82f0af3 100644
--- a/datafusion/core/tests/user_defined/insert_operation.rs
+++ b/datafusion/core/tests/user_defined/insert_operation.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, sync::Arc};
+use std::{any::Any, str::FromStr, sync::Arc};
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use async_trait::async_trait;
@@ -24,11 +24,14 @@ use datafusion::{
     prelude::{SessionConfig, SessionContext},
 };
 use datafusion_catalog::{Session, TableProvider};
-use datafusion_expr::{dml::InsertOp, Expr, TableType};
+use datafusion_common::config::Dialect;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_expr::{Expr, TableType, dml::InsertOp};
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
+use datafusion_physical_plan::execution_plan::SchedulingType;
 use datafusion_physical_plan::{
-    execution_plan::{Boundedness, EmissionType},
     DisplayAs, ExecutionPlan, PlanProperties,
+    execution_plan::{Boundedness, EmissionType},
 };
 
 #[tokio::test]
@@ -62,7 +65,7 @@ async fn assert_insert_op(ctx: &SessionContext, sql: &str, insert_op: InsertOp)
 fn session_ctx_with_dialect(dialect: impl Into<String>) -> SessionContext {
     let mut config = SessionConfig::new();
     let options = config.options_mut();
-    options.sql_parser.dialect = dialect.into();
+    options.sql_parser.dialect = Dialect::from_str(&dialect.into()).unwrap();
     SessionContext::new_with_config(config)
 }
 
@@ -120,18 +123,21 @@ impl TableProvider for TestInsertTableProvider {
 #[derive(Debug)]
 struct TestInsertExec {
     op: InsertOp,
-    plan_properties: PlanProperties,
+    plan_properties: Arc<PlanProperties>,
 }
 
 impl TestInsertExec {
     fn new(op: InsertOp) -> Self {
         Self {
             op,
-            plan_properties: PlanProperties::new(
-                EquivalenceProperties::new(make_count_schema()),
-                Partitioning::UnknownPartitioning(1),
-                EmissionType::Incremental,
-                Boundedness::Bounded,
+            plan_properties: Arc::new(
+                PlanProperties::new(
+                    EquivalenceProperties::new(make_count_schema()),
+                    Partitioning::UnknownPartitioning(1),
+                    EmissionType::Incremental,
+                    Boundedness::Bounded,
+                )
+                .with_scheduling_type(SchedulingType::Cooperative),
             ),
         }
     }
@@ -156,7 +162,7 @@ impl ExecutionPlan for TestInsertExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.plan_properties
     }
 
@@ -179,6 +185,22 @@ impl ExecutionPlan for TestInsertExec {
     ) -> Result<datafusion_execution::SendableRecordBatchStream> {
         unimplemented!("TestInsertExec is a stub for testing.")
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.plan_properties.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 fn make_count_schema() -> SchemaRef {
diff --git a/datafusion/core/tests/user_defined/mod.rs b/datafusion/core/tests/user_defined/mod.rs
index 5d84cdb692830..bc9949f5d681c 100644
--- a/datafusion/core/tests/user_defined/mod.rs
+++ b/datafusion/core/tests/user_defined/mod.rs
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+/// Tests for user defined Async Scalar functions
+mod user_defined_async_scalar_functions;
+
 /// Tests for user defined Scalar functions
 mod user_defined_scalar_functions;
 
@@ -33,5 +36,8 @@ mod user_defined_table_functions;
 /// Tests for Expression Planner
 mod expr_planner;
 
+/// Tests for Relation Planner extensions
+mod relation_planner;
+
 /// Tests for insert operations
 mod insert_operation;
diff --git a/datafusion/core/tests/user_defined/relation_planner.rs b/datafusion/core/tests/user_defined/relation_planner.rs
new file mode 100644
index 0000000000000..54af53ad858d4
--- /dev/null
+++ b/datafusion/core/tests/user_defined/relation_planner.rs
@@ -0,0 +1,531 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests for the RelationPlanner extension point
+
+use std::sync::Arc;
+
+use arrow::array::{Int64Array, RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::catalog::memory::MemTable;
+use datafusion::common::test_util::batches_to_string;
+use datafusion::prelude::*;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::Expr;
+use datafusion_expr::logical_plan::builder::LogicalPlanBuilder;
+use datafusion_expr::planner::{
+    PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+};
+use datafusion_sql::sqlparser::ast::TableFactor;
+use insta::assert_snapshot;
+
+// ============================================================================
+// Test Planners - Example Implementations
+// ============================================================================
+
+// The planners in this section are deliberately minimal, static examples used
+// only for tests. In real applications a `RelationPlanner` would typically
+// construct richer logical plans tailored to external systems or custom
+// semantics rather than hard-coded in-memory tables.
+//
+// For more realistic examples, see `datafusion-examples/examples/relation_planner/`:
+// - `table_sample.rs`: Full TABLESAMPLE implementation (parsing → execution)
+// - `pivot_unpivot.rs`: PIVOT/UNPIVOT via SQL rewriting
+// - `match_recognize.rs`: MATCH_RECOGNIZE logical planning
+
+/// Helper to build simple static values-backed virtual tables used by the
+/// example planners below.
+fn plan_static_values_table(
+    relation: TableFactor,
+    table_name: &str,
+    column_name: &str,
+    values: Vec<ScalarValue>,
+) -> Result<RelationPlanning> {
+    match relation {
+        TableFactor::Table { name, alias, .. }
+            if name.to_string().eq_ignore_ascii_case(table_name) =>
+        {
+            let rows = values
+                .into_iter()
+                .map(|v| vec![Expr::Literal(v, None)])
+                .collect::<Vec<_>>();
+
+            let plan = LogicalPlanBuilder::values(rows)?
+                .project(vec![col("column1").alias(column_name)])?
+                .build()?;
+
+            Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                plan, alias,
+            ))))
+        }
+        other => Ok(RelationPlanning::Original(Box::new(other))),
+    }
+}
+
+/// Example planner that provides a virtual `numbers` table with values
+/// 1, 2, 3.
+#[derive(Debug)]
+struct NumbersPlanner;
+
+impl RelationPlanner for NumbersPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        _context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        plan_static_values_table(
+            relation,
+            "numbers",
+            "number",
+            vec![
+                ScalarValue::Int64(Some(1)),
+                ScalarValue::Int64(Some(2)),
+                ScalarValue::Int64(Some(3)),
+            ],
+        )
+    }
+}
+
+/// Example planner that provides a virtual `colors` table with three string
+/// values: `red`, `green`, `blue`.
+#[derive(Debug)]
+struct ColorsPlanner;
+
+impl RelationPlanner for ColorsPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        _context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        plan_static_values_table(
+            relation,
+            "colors",
+            "color",
+            vec![
+                ScalarValue::Utf8(Some("red".into())),
+                ScalarValue::Utf8(Some("green".into())),
+                ScalarValue::Utf8(Some("blue".into())),
+            ],
+        )
+    }
+}
+
+/// Alternative implementation of `numbers` (returns 100, 200) used to
+/// demonstrate planner precedence (last registered planner wins).
+#[derive(Debug)]
+struct AlternativeNumbersPlanner;
+
+impl RelationPlanner for AlternativeNumbersPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        _context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        plan_static_values_table(
+            relation,
+            "numbers",
+            "number",
+            vec![ScalarValue::Int64(Some(100)), ScalarValue::Int64(Some(200))],
+        )
+    }
+}
+
+/// Example planner that intercepts nested joins and samples both sides (limit 2)
+/// before joining, demonstrating recursive planning with `context.plan()`.
+#[derive(Debug)]
+struct SamplingJoinPlanner;
+
+impl RelationPlanner for SamplingJoinPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        match relation {
+            TableFactor::NestedJoin {
+                table_with_joins,
+                alias,
+                ..
+            } if table_with_joins.joins.len() == 1 => {
+                // Use context.plan() to recursively plan both sides
+                // This ensures other planners (like NumbersPlanner) can handle them
+                let left = context.plan(table_with_joins.relation.clone())?;
+                let right = context.plan(table_with_joins.joins[0].relation.clone())?;
+
+                // Sample each table to 2 rows
+                let left_sampled =
+                    LogicalPlanBuilder::from(left).limit(0, Some(2))?.build()?;
+
+                let right_sampled =
+                    LogicalPlanBuilder::from(right).limit(0, Some(2))?.build()?;
+
+                // Cross join: 2 rows × 2 rows = 4 rows (instead of 3×3=9 without sampling)
+                let plan = LogicalPlanBuilder::from(left_sampled)
+                    .cross_join(right_sampled)?
+                    .build()?;
+
+                Ok(RelationPlanning::Planned(Box::new(PlannedRelation::new(
+                    plan, alias,
+                ))))
+            }
+            other => Ok(RelationPlanning::Original(Box::new(other))),
+        }
+    }
+}
+
+/// Example planner that never handles any relation and always delegates by
+/// returning `RelationPlanning::Original`.
+#[derive(Debug)]
+struct PassThroughPlanner;
+
+impl RelationPlanner for PassThroughPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        _context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        // Never handles anything - always delegates
+        Ok(RelationPlanning::Original(Box::new(relation)))
+    }
+}
+
+/// Example planner that shows how planners can block specific constructs and
+/// surface custom error messages by rejecting `UNNEST` relations (here framed
+/// as a mock premium feature check).
+#[derive(Debug)]
+struct PremiumFeaturePlanner;
+
+impl RelationPlanner for PremiumFeaturePlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        _context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        match relation {
+            TableFactor::UNNEST { .. } => Err(datafusion_common::DataFusionError::Plan(
+                "UNNEST is a premium feature! Please upgrade to DataFusion Pro™ \
+                     to unlock advanced array operations."
+                    .to_string(),
+            )),
+            other => Ok(RelationPlanning::Original(Box::new(other))),
+        }
+    }
+}
+
+// ============================================================================
+// Test Helpers - SQL Execution
+// ============================================================================
+
+/// Execute SQL and return results with better error messages.
+async fn execute_sql(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> {
+    let df = ctx.sql(sql).await?;
+    df.collect().await
+}
+
+/// Execute SQL and convert to string format for snapshot comparison.
+async fn execute_sql_to_string(ctx: &SessionContext, sql: &str) -> String {
+    let batches = execute_sql(ctx, sql)
+        .await
+        .expect("SQL execution should succeed");
+    batches_to_string(&batches)
+}
+
+// ============================================================================
+// Test Helpers - Context Builders
+// ============================================================================
+
+/// Create a SessionContext with a catalog table containing Int64 and Utf8 columns.
+///
+/// Creates a table with the specified name and sample data for fallback/integration tests.
+fn create_context_with_catalog_table(
+    table_name: &str,
+    id_values: Vec<i64>,
+    name_values: Vec<&str>,
+) -> SessionContext {
+    let ctx = SessionContext::new();
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("name", DataType::Utf8, false),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int64Array::from(id_values)),
+            Arc::new(StringArray::from(name_values)),
+        ],
+    )
+    .unwrap();
+
+    let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
+    ctx.register_table(table_name, Arc::new(table)).unwrap();
+
+    ctx
+}
+
+/// Create a SessionContext with a simple single-column Int64 table.
+///
+/// Useful for basic tests that need a real catalog table.
+fn create_context_with_simple_table(
+    table_name: &str,
+    values: Vec<i64>,
+) -> SessionContext {
+    let ctx = SessionContext::new();
+
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "value",
+        DataType::Int64,
+        true,
+    )]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![Arc::new(Int64Array::from(values))])
+            .unwrap();
+
+    let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
+    ctx.register_table(table_name, Arc::new(table)).unwrap();
+
+    ctx
+}
+
+// ============================================================================
+// TESTS: Ordered from Basic to Complex
+// ============================================================================
+
+/// Comprehensive test suite for RelationPlanner extension point.
+/// Tests are ordered from simplest smoke test to most complex scenarios.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Small extension trait to make test setup read fluently.
+    trait TestSessionExt {
+        fn with_planner<P: RelationPlanner + 'static>(self, planner: P) -> Self;
+    }
+
+    impl TestSessionExt for SessionContext {
+        fn with_planner<P: RelationPlanner + 'static>(self, planner: P) -> Self {
+            self.register_relation_planner(Arc::new(planner)).unwrap();
+            self
+        }
+    }
+
+    /// Session context with only the `NumbersPlanner` registered.
+    fn ctx_with_numbers() -> SessionContext {
+        SessionContext::new().with_planner(NumbersPlanner)
+    }
+
+    /// Session context with virtual tables (`numbers`, `colors`) and the
+    /// `SamplingJoinPlanner` registered for nested joins.
+    fn ctx_with_virtual_tables_and_sampling() -> SessionContext {
+        SessionContext::new()
+            .with_planner(NumbersPlanner)
+            .with_planner(ColorsPlanner)
+            .with_planner(SamplingJoinPlanner)
+    }
+
+    // Basic smoke test: virtual table can be queried like a regular table.
+    #[tokio::test]
+    async fn virtual_table_basic_select() {
+        let ctx = ctx_with_numbers();
+
+        let result = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await;
+
+        assert_snapshot!(result, @r"
+        +--------+
+        | number |
+        +--------+
+        | 1      |
+        | 2      |
+        | 3      |
+        +--------+
+        ");
+    }
+
+    // Virtual table supports standard SQL operations (projection, filter, aggregation).
+    #[tokio::test]
+    async fn virtual_table_filters_and_aggregation() {
+        let ctx = ctx_with_numbers();
+
+        let filtered = execute_sql_to_string(
+            &ctx,
+            "SELECT number * 10 AS scaled FROM numbers WHERE number > 1",
+        )
+        .await;
+
+        assert_snapshot!(filtered, @r"
+        +--------+
+        | scaled |
+        +--------+
+        | 20     |
+        | 30     |
+        +--------+
+        ");
+
+        let aggregated = execute_sql_to_string(
+            &ctx,
+            "SELECT COUNT(*) as count, SUM(number) as total, AVG(number) as average \
+             FROM numbers",
+        )
+        .await;
+
+        assert_snapshot!(aggregated, @r"
+        +-------+-------+---------+
+        | count | total | average |
+        +-------+-------+---------+
+        | 3     | 6     | 2.0     |
+        +-------+-------+---------+
+        ");
+    }
+
+    // Multiple planners can coexist and each handles its own virtual table.
+    #[tokio::test]
+    async fn multiple_planners_virtual_tables() {
+        let ctx = SessionContext::new()
+            .with_planner(NumbersPlanner)
+            .with_planner(ColorsPlanner);
+
+        let result1 = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await;
+        assert_snapshot!(result1, @r"
+        +--------+
+        | number |
+        +--------+
+        | 1      |
+        | 2      |
+        | 3      |
+        +--------+
+        ");
+
+        let result2 = execute_sql_to_string(&ctx, "SELECT * FROM colors").await;
+        assert_snapshot!(result2, @r"
+        +-------+
+        | color |
+        +-------+
+        | red   |
+        | green |
+        | blue  |
+        +-------+
+        ");
+    }
+
+    // Last registered planner for the same table name takes precedence (LIFO).
+    #[tokio::test]
+    async fn lifo_precedence_last_planner_wins() {
+        let ctx = SessionContext::new()
+            .with_planner(AlternativeNumbersPlanner)
+            .with_planner(NumbersPlanner);
+
+        let result = execute_sql_to_string(&ctx, "SELECT * FROM numbers").await;
+
+        // CustomValuesPlanner registered last, should win (returns 1,2,3 not 100,200)
+        assert_snapshot!(result, @r"
+        +--------+
+        | number |
+        +--------+
+        | 1      |
+        | 2      |
+        | 3      |
+        +--------+
+        ");
+    }
+
+    // Pass-through planner delegates to the catalog without changing behavior.
+    #[tokio::test]
+    async fn delegation_pass_through_to_catalog() {
+        let ctx = create_context_with_simple_table("real_table", vec![42])
+            .with_planner(PassThroughPlanner);
+
+        let result = execute_sql_to_string(&ctx, "SELECT * FROM real_table").await;
+
+        assert_snapshot!(result, @r"
+        +-------+
+        | value |
+        +-------+
+        | 42    |
+        +-------+
+        ");
+    }
+
+    // Catalog is used when no planner claims the relation.
+    #[tokio::test]
+    async fn catalog_fallback_when_no_planner() {
+        let ctx =
+            create_context_with_catalog_table("users", vec![1, 2], vec!["Alice", "Bob"])
+                .with_planner(NumbersPlanner);
+
+        let result = execute_sql_to_string(&ctx, "SELECT * FROM users ORDER BY id").await;
+
+        assert_snapshot!(result, @r"
+        +----+-------+
+        | id | name  |
+        +----+-------+
+        | 1  | Alice |
+        | 2  | Bob   |
+        +----+-------+
+        ");
+    }
+
+    // Planners can block specific constructs and surface custom error messages.
+    #[tokio::test]
+    async fn error_handling_premium_feature_blocking() {
+        // Verify UNNEST works without planner
+        let ctx_without_planner = SessionContext::new();
+        let result =
+            execute_sql(&ctx_without_planner, "SELECT * FROM UNNEST(ARRAY[1, 2, 3])")
+                .await
+                .expect("UNNEST should work by default");
+        assert_eq!(result.len(), 1);
+
+        // Same query with blocking planner registered
+        let ctx = SessionContext::new().with_planner(PremiumFeaturePlanner);
+
+        // Verify UNNEST is now rejected
+        let error = execute_sql(&ctx, "SELECT * FROM UNNEST(ARRAY[1, 2, 3])")
+            .await
+            .expect_err("UNNEST should be rejected");
+
+        let error_msg = error.to_string();
+        assert!(
+            error_msg.contains("premium feature") && error_msg.contains("DataFusion Pro"),
+            "Expected custom rejection message, got: {error_msg}"
+        );
+    }
+
+    // SamplingJoinPlanner recursively calls `context.plan()` on both sides of a
+    // nested join before sampling, exercising recursive relation planning.
+    #[tokio::test]
+    async fn recursive_planning_sampling_join() {
+        let ctx = ctx_with_virtual_tables_and_sampling();
+
+        let result =
+            execute_sql_to_string(&ctx, "SELECT * FROM (numbers JOIN colors ON true)")
+                .await;
+
+        // SamplingJoinPlanner limits each side to 2 rows: 2×2=4 (not 3×3=9)
+        assert_snapshot!(result, @r"
+        +--------+-------+
+        | number | color |
+        +--------+-------+
+        | 1      | red   |
+        | 1      | green |
+        | 2      | red   |
+        | 2      | green |
+        +--------+-------+
+        ");
+    }
+}
diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index ae517795ab955..e7bd2241398ad 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -20,16 +20,16 @@
 
 use std::any::Any;
 use std::collections::HashMap;
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::{Hash, Hasher};
 use std::mem::{size_of, size_of_val};
 use std::sync::{
-    atomic::{AtomicBool, Ordering},
     Arc,
+    atomic::{AtomicBool, Ordering},
 };
 
 use arrow::array::{
-    record_batch, types::UInt64Type, Array, AsArray, Int32Array, PrimitiveArray,
-    StringArray, StructArray, UInt64Array,
+    Array, AsArray, Int32Array, PrimitiveArray, StringArray, StructArray, UInt64Array,
+    record_batch, types::UInt64Type,
 };
 use arrow::datatypes::{Fields, Schema};
 use arrow_schema::FieldRef;
@@ -53,10 +53,11 @@ use datafusion::{
 };
 use datafusion_common::{assert_contains, exec_datafusion_err};
 use datafusion_common::{cast::as_primitive_array, exec_err};
+
 use datafusion_expr::expr::WindowFunction;
 use datafusion_expr::{
-    col, create_udaf, function::AccumulatorArgs, AggregateUDFImpl, Expr,
-    GroupsAccumulator, LogicalPlanBuilder, SimpleAggregateUDF, WindowFunctionDefinition,
+    AggregateUDFImpl, Expr, GroupsAccumulator, LogicalPlanBuilder, SimpleAggregateUDF,
+    WindowFunctionDefinition, col, create_udaf, function::AccumulatorArgs,
 };
 use datafusion_functions_aggregate::average::AvgAccumulator;
 
@@ -68,7 +69,7 @@ async fn test_setup() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +-------+----------------------------+
     | value | time                       |
     +-------+----------------------------+
@@ -78,7 +79,7 @@ async fn test_setup() {
     | 5.0   | 1970-01-01T00:00:00.000005 |
     | 5.0   | 1970-01-01T00:00:00.000005 |
     +-------+----------------------------+
-    "###);
+    ");
 }
 
 /// Basic user defined aggregate
@@ -90,13 +91,13 @@ async fn test_udaf() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +----------------------------+
     | time_sum(t.time)           |
     +----------------------------+
     | 1970-01-01T00:00:00.000019 |
     +----------------------------+
-    "###);
+    ");
 
     // normal aggregates call update_batch
     assert!(test_state.update_batch());
@@ -111,7 +112,7 @@ async fn test_udaf_as_window() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +----------------------------+
     | time_sum                   |
     +----------------------------+
@@ -121,7 +122,7 @@ async fn test_udaf_as_window() {
     | 1970-01-01T00:00:00.000019 |
     | 1970-01-01T00:00:00.000019 |
     +----------------------------+
-    "###);
+    ");
 
     // aggregate over the entire window function call update_batch
     assert!(test_state.update_batch());
@@ -136,7 +137,7 @@ async fn test_udaf_as_window_with_frame() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +----------------------------+
     | time_sum                   |
     +----------------------------+
@@ -146,7 +147,7 @@ async fn test_udaf_as_window_with_frame() {
     | 1970-01-01T00:00:00.000014 |
     | 1970-01-01T00:00:00.000010 |
     +----------------------------+
-    "###);
+    ");
 
     // user defined aggregates with window frame should be calling retract batch
     assert!(test_state.update_batch());
@@ -163,7 +164,10 @@ async fn test_udaf_as_window_with_frame_without_retract_batch() {
     let sql = "SELECT time_sum(time) OVER(ORDER BY time ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as time_sum from t";
     // Note if this query ever does start working
     let err = execute(&ctx, sql).await.unwrap_err();
-    assert_contains!(err.to_string(), "This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: time_sum(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING");
+    assert_contains!(
+        err.to_string(),
+        "This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented: time_sum(t.time) ORDER BY [t.time ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING"
+    );
 }
 
 /// Basic query for with a udaf returning a structure
@@ -174,13 +178,13 @@ async fn test_udaf_returning_struct() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +------------------------------------------------+
     | first(t.value,t.time)                          |
     +------------------------------------------------+
     | {value: 2.0, time: 1970-01-01T00:00:00.000002} |
     +------------------------------------------------+
-    "###);
+    ");
 }
 
 /// Demonstrate extracting the fields from a structure using a subquery
@@ -191,13 +195,13 @@ async fn test_udaf_returning_struct_subquery() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +-----------------+----------------------------+
     | sq.first[value] | sq.first[time]             |
     +-----------------+----------------------------+
     | 2.0             | 1970-01-01T00:00:00.000002 |
     +-----------------+----------------------------+
-    "###);
+    ");
 }
 
 #[tokio::test]
@@ -211,13 +215,13 @@ async fn test_udaf_shadows_builtin_fn() {
     // compute with builtin `sum` aggregator
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r#"
     +---------------------------------------+
     | sum(arrow_cast(t.time,Utf8("Int64"))) |
     +---------------------------------------+
     | 19000                                 |
     +---------------------------------------+
-    "###);
+    "#);
 
     // Register `TimeSum` with name `sum`. This will shadow the builtin one
     TimeSum::register(&mut ctx, test_state.clone(), "sum");
@@ -225,13 +229,13 @@ async fn test_udaf_shadows_builtin_fn() {
 
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +----------------------------+
     | sum(t.time)                |
     +----------------------------+
     | 1970-01-01T00:00:00.000019 |
     +----------------------------+
-    "###);
+    ");
 }
 
 async fn execute(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> {
@@ -271,13 +275,13 @@ async fn simple_udaf() -> Result<()> {
 
     let result = ctx.sql("SELECT MY_AVG(a) FROM t").await?.collect().await?;
 
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +-------------+
     | my_avg(t.a) |
     +-------------+
     | 3.0         |
     +-------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -297,10 +301,12 @@ async fn deregister_udaf() -> Result<()> {
     ctx.register_udaf(my_avg);
 
     assert!(ctx.state().aggregate_functions().contains_key("my_avg"));
+    assert!(datafusion_execution::FunctionRegistry::udafs(&ctx).contains("my_avg"));
 
     ctx.deregister_udaf("my_avg");
 
     assert!(!ctx.state().aggregate_functions().contains_key("my_avg"));
+    assert!(!datafusion_execution::FunctionRegistry::udafs(&ctx).contains("my_avg"));
 
     Ok(())
 }
@@ -326,9 +332,10 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> {
 
     // doesn't work as it was registered as non lowercase
     let err = ctx.sql("SELECT MY_AVG(i) FROM t").await.unwrap_err();
-    assert!(err
-        .to_string()
-        .contains("Error during planning: Invalid function \'my_avg\'"));
+    assert!(
+        err.to_string()
+            .contains("Error during planning: Invalid function \'my_avg\'")
+    );
 
     // Can call it if you put quotes
     let result = ctx
@@ -337,13 +344,13 @@ async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> {
         .collect()
         .await?;
 
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +-------------+
     | MY_AVG(t.i) |
     +-------------+
     | 1.0         |
     +-------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -369,23 +376,23 @@ async fn test_user_defined_functions_with_alias() -> Result<()> {
 
     let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?;
 
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +------------+
     | dummy(t.i) |
     +------------+
     | 1.0        |
     +------------+
-    "###);
+    ");
 
     let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?;
 
-    insta::assert_snapshot!(batches_to_string(&alias_result), @r###"
-    +------------+
-    | dummy(t.i) |
-    +------------+
-    | 1.0        |
-    +------------+
-    "###);
+    insta::assert_snapshot!(batches_to_string(&alias_result), @r"
+    +------------------+
+    | dummy_alias(t.i) |
+    +------------------+
+    | 1.0              |
+    +------------------+
+    ");
 
     Ok(())
 }
@@ -446,13 +453,13 @@ async fn test_parameterized_aggregate_udf() -> Result<()> {
 
     let actual = DataFrame::new(ctx.state(), plan).collect().await?;
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +------+---+---+
     | text | a | b |
     +------+---+---+
     | foo  | 1 | 2 |
     +------+---+---+
-    "###);
+    ");
 
     ctx.deregister_table("t")?;
     Ok(())
@@ -566,6 +573,7 @@ impl TimeSum {
         Self { sum: 0, test_state }
     }
 
+    #[expect(clippy::needless_pass_by_value)]
     fn register(ctx: &mut SessionContext, test_state: Arc<TestState>, name: &str) {
         let timestamp_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
         let input_type = vec![timestamp_type.clone()];
@@ -757,11 +765,11 @@ impl Accumulator for FirstSelector {
 
         // Update the actual values
         for (value, time) in v.iter().zip(t.iter()) {
-            if let (Some(time), Some(value)) = (time, value) {
-                if time < self.time {
-                    self.value = value;
-                    self.time = time;
-                }
+            if let (Some(time), Some(value)) = (time, value)
+                && time < self.time
+            {
+                self.value = value;
+                self.time = time;
             }
         }
 
@@ -778,7 +786,7 @@ impl Accumulator for FirstSelector {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 struct TestGroupsAccumulator {
     signature: Signature,
     result: u64,
@@ -816,21 +824,6 @@ impl AggregateUDFImpl for TestGroupsAccumulator {
     ) -> Result<Box<dyn GroupsAccumulator>> {
         Ok(Box::new(self.clone()))
     }
-
-    fn equals(&self, other: &dyn AggregateUDFImpl) -> bool {
-        if let Some(other) = other.as_any().downcast_ref::<TestGroupsAccumulator>() {
-            self.result == other.result && self.signature == other.signature
-        } else {
-            false
-        }
-    }
-
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.signature.hash(hasher);
-        self.result.hash(hasher);
-        hasher.finish()
-    }
 }
 
 impl Accumulator for TestGroupsAccumulator {
@@ -902,6 +895,32 @@ struct MetadataBasedAggregateUdf {
     metadata: HashMap<String, String>,
 }
 
+impl PartialEq for MetadataBasedAggregateUdf {
+    fn eq(&self, other: &Self) -> bool {
+        let Self {
+            name,
+            signature,
+            metadata,
+        } = self;
+        name == &other.name
+            && signature == &other.signature
+            && metadata == &other.metadata
+    }
+}
+impl Eq for MetadataBasedAggregateUdf {}
+impl Hash for MetadataBasedAggregateUdf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
+            name,
+            signature,
+            metadata: _, // unhashable
+        } = self;
+        std::any::type_name::<Self>().hash(state);
+        name.hash(state);
+        signature.hash(state);
+    }
+}
+
 impl MetadataBasedAggregateUdf {
     fn new(metadata: HashMap<String, String>) -> Self {
         // The name we return must be unique. Otherwise we will not call distinct
@@ -940,13 +959,7 @@ impl AggregateUDFImpl for MetadataBasedAggregateUdf {
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let input_expr = acc_args
-            .exprs
-            .first()
-            .ok_or(exec_datafusion_err!("Expected one argument"))?;
-        let input_field = input_expr.return_field(acc_args.schema)?;
-
-        let double_output = input_field
+        let double_output = acc_args.expr_fields[0]
             .metadata()
             .get("modify_values")
             .map(|v| v == "double_output")
@@ -1106,22 +1119,22 @@ async fn test_metadata_based_aggregate_as_window() -> Result<()> {
         )));
 
     let df = df.select(vec![
-        Expr::WindowFunction(WindowFunction::new(
+        Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(Arc::clone(&no_output_meta_udf)),
             vec![col("no_metadata")],
         ))
         .alias("meta_no_in_no_out"),
-        Expr::WindowFunction(WindowFunction::new(
+        Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(no_output_meta_udf),
             vec![col("with_metadata")],
         ))
         .alias("meta_with_in_no_out"),
-        Expr::WindowFunction(WindowFunction::new(
+        Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(Arc::clone(&with_output_meta_udf)),
             vec![col("no_metadata")],
         ))
         .alias("meta_no_in_with_out"),
-        Expr::WindowFunction(WindowFunction::new(
+        Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(with_output_meta_udf),
             vec![col("with_metadata")],
         ))
diff --git a/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs
new file mode 100644
index 0000000000000..31af4445ace08
--- /dev/null
+++ b/datafusion/core/tests/user_defined/user_defined_async_scalar_functions.rs
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{Int32Array, RecordBatch, StringArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use async_trait::async_trait;
+use datafusion::prelude::*;
+use datafusion_common::test_util::format_batches;
+use datafusion_common::{Result, assert_batches_eq};
+use datafusion_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+
+fn register_table_and_udf() -> Result<SessionContext> {
+    let num_rows = 3;
+    let batch_size = 2;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("prompt", DataType::Utf8, false),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int32Array::from((0..num_rows).collect::<Vec<i32>>())),
+            Arc::new(StringArray::from(
+                (0..num_rows)
+                    .map(|i| format!("prompt{i}"))
+                    .collect::<Vec<_>>(),
+            )),
+        ],
+    )?;
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("test_table", batch)?;
+
+    ctx.register_udf(
+        AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl::new(batch_size)))
+            .into_scalar_udf(),
+    );
+
+    Ok(ctx)
+}
+
+// This test checks the case where batch_size doesn't evenly divide
+// the number of rows.
+#[tokio::test]
+async fn test_async_udf_with_non_modular_batch_size() -> Result<()> {
+    let ctx = register_table_and_udf()?;
+
+    let df = ctx
+        .sql("SELECT id, test_async_udf(prompt) as result FROM test_table")
+        .await?;
+
+    let result = df.collect().await?;
+
+    assert_batches_eq!(
+        &[
+            "+----+---------+",
+            "| id | result  |",
+            "+----+---------+",
+            "| 0  | prompt0 |",
+            "| 1  | prompt1 |",
+            "| 2  | prompt2 |",
+            "+----+---------+"
+        ],
+        &result
+    );
+
+    Ok(())
+}
+
+// This test checks if metrics are printed for `AsyncFuncExec`
+#[tokio::test]
+async fn test_async_udf_metrics() -> Result<()> {
+    let ctx = register_table_and_udf()?;
+
+    let df = ctx
+        .sql(
+            "EXPLAIN ANALYZE SELECT id, test_async_udf(prompt) as result FROM test_table",
+        )
+        .await?;
+
+    let result = df.collect().await?;
+
+    let explain_analyze_str = format_batches(&result)?.to_string();
+    let async_func_exec_without_metrics =
+        explain_analyze_str.split("\n").any(|metric_line| {
+            metric_line.contains("AsyncFuncExec")
+                && !metric_line.contains("output_rows=3")
+        });
+
+    assert!(!async_func_exec_without_metrics);
+
+    Ok(())
+}
+
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+struct TestAsyncUDFImpl {
+    batch_size: usize,
+    signature: Signature,
+}
+
+impl TestAsyncUDFImpl {
+    fn new(batch_size: usize) -> Self {
+        Self {
+            batch_size,
+            signature: Signature::exact(vec![DataType::Utf8], Volatility::Volatile),
+        }
+    }
+}
+
+impl ScalarUDFImpl for TestAsyncUDFImpl {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "test_async_udf"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        panic!("Call invoke_async_with_args instead")
+    }
+}
+
+#[async_trait]
+impl AsyncScalarUDFImpl for TestAsyncUDFImpl {
+    fn ideal_batch_size(&self) -> Option<usize> {
+        Some(self.batch_size)
+    }
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let arg1 = &args.args[0];
+        let results = call_external_service(arg1.clone()).await?;
+        Ok(results)
+    }
+}
+
+/// Simulates calling an async external service
+async fn call_external_service(arg1: ColumnarValue) -> Result<ColumnarValue> {
+    Ok(arg1)
+}
diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs
index b68ef6aca0931..6e4ed69e508d3 100644
--- a/datafusion/core/tests/user_defined/user_defined_plan.rs
+++ b/datafusion/core/tests/user_defined/user_defined_plan.rs
@@ -56,7 +56,6 @@
 //!
 //! The same answer can be produced by simply keeping track of the top
 //! N elements, reducing the total amount of required buffer memory.
-//!
 
 use std::fmt::Debug;
 use std::hash::Hash;
@@ -71,7 +70,7 @@ use arrow::{
 use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion::{
     common::cast::as_int64_array,
-    common::{arrow_datafusion_err, internal_err, DFSchemaRef},
+    common::{DFSchemaRef, arrow_datafusion_err},
     error::{DataFusionError, Result},
     execution::{
         context::{QueryPlanner, SessionState, TaskContext},
@@ -85,17 +84,19 @@ use datafusion::{
     physical_expr::EquivalenceProperties,
     physical_plan::{
         DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
-        PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
+        PlanProperties, RecordBatchStream, SendableRecordBatchStream,
     },
     physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
     prelude::{SessionConfig, SessionContext},
 };
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::ScalarValue;
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
+use datafusion_common::{ScalarValue, assert_eq_or_internal_err, assert_or_internal_err};
 use datafusion_expr::{FetchType, InvariantLevel, Projection, SortExpr};
-use datafusion_optimizer::optimizer::ApplyOrder;
 use datafusion_optimizer::AnalyzerRule;
+use datafusion_optimizer::optimizer::ApplyOrder;
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 
 use async_trait::async_trait;
@@ -162,7 +163,7 @@ async fn run_and_compare_query(ctx: SessionContext, description: &str) -> Result
         insta::with_settings!({
             description => description,
         }, {
-            insta::assert_snapshot!(actual, @r###"
+            insta::assert_snapshot!(actual, @r"
             +-------------+---------+
             | customer_id | revenue |
             +-------------+---------+
@@ -170,7 +171,7 @@ async fn run_and_compare_query(ctx: SessionContext, description: &str) -> Result
             | jorge       | 200     |
             | andy        | 150     |
             +-------------+---------+
-        "###);
+            ");
         });
     }
 
@@ -189,13 +190,13 @@ async fn run_and_compare_query_with_analyzer_rule(
     insta::with_settings!({
         description => description,
     }, {
-        insta::assert_snapshot!(actual, @r###"
+        insta::assert_snapshot!(actual, @r"
         +------------+--------------------------+
         | UInt64(42) | arrow_typeof(UInt64(42)) |
         +------------+--------------------------+
         | 42         | UInt64                   |
         +------------+--------------------------+
-        "###);
+        ");
     });
 
     Ok(())
@@ -213,7 +214,7 @@ async fn run_and_compare_query_with_auto_schemas(
     insta::with_settings!({
             description => description,
         }, {
-            insta::assert_snapshot!(actual, @r###"
+            insta::assert_snapshot!(actual, @r"
             +----------+----------+
             | column_1 | column_2 |
             +----------+----------+
@@ -221,7 +222,7 @@ async fn run_and_compare_query_with_auto_schemas(
             | jorge    | 200      |
             | andy     | 150      |
             +----------+----------+
-        "###);
+            ");
     });
 
     Ok(())
@@ -434,21 +435,21 @@ impl OptimizerRule for OptimizerMakeExtensionNodeInvalid {
         plan: LogicalPlan,
         _config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>, DataFusionError> {
-        if let LogicalPlan::Extension(Extension { node }) = &plan {
-            if let Some(prev) = node.as_any().downcast_ref::<TopKPlanNode>() {
-                return Ok(Transformed::yes(LogicalPlan::Extension(Extension {
-                    node: Arc::new(TopKPlanNode {
-                        k: prev.k,
-                        input: prev.input.clone(),
-                        expr: prev.expr.clone(),
-                        // In a real use case, this rewriter could have change the number of inputs, etc
-                        invariant_mock: Some(InvariantMock {
-                            should_fail_invariant: true,
-                            kind: InvariantLevel::Always,
-                        }),
+        if let LogicalPlan::Extension(Extension { node }) = &plan
+            && let Some(prev) = node.as_any().downcast_ref::<TopKPlanNode>()
+        {
+            return Ok(Transformed::yes(LogicalPlan::Extension(Extension {
+                node: Arc::new(TopKPlanNode {
+                    k: prev.k,
+                    input: prev.input.clone(),
+                    expr: prev.expr.clone(),
+                    // In a real use case, this rewriter could have change the number of inputs, etc
+                    invariant_mock: Some(InvariantMock {
+                        should_fail_invariant: true,
+                        kind: InvariantLevel::Always,
                     }),
-                })));
-            }
+                }),
+            })));
         };
 
         Ok(Transformed::no(plan))
@@ -516,23 +517,18 @@ impl OptimizerRule for TopKOptimizerRule {
             return Ok(Transformed::no(plan));
         };
 
-        if let LogicalPlan::Sort(Sort {
-            ref expr,
-            ref input,
-            ..
-        }) = limit.input.as_ref()
+        if let LogicalPlan::Sort(Sort { expr, input, .. }) = limit.input.as_ref()
+            && expr.len() == 1
         {
-            if expr.len() == 1 {
-                // we found a sort with a single sort expr, replace with a a TopK
-                return Ok(Transformed::yes(LogicalPlan::Extension(Extension {
-                    node: Arc::new(TopKPlanNode {
-                        k: fetch,
-                        input: input.as_ref().clone(),
-                        expr: expr[0].clone(),
-                        invariant_mock: self.invariant_mock.clone(),
-                    }),
-                })));
-            }
+            // we found a sort with a single sort expr, replace with a a TopK
+            return Ok(Transformed::yes(LogicalPlan::Extension(Extension {
+                node: Arc::new(TopKPlanNode {
+                    k: fetch,
+                    input: input.as_ref().clone(),
+                    expr: expr[0].clone(),
+                    invariant_mock: self.invariant_mock.clone(),
+                }),
+            })));
         }
 
         Ok(Transformed::no(plan))
@@ -580,15 +576,16 @@ impl UserDefinedLogicalNodeCore for TopKPlanNode {
         self.input.schema()
     }
 
-    fn check_invariants(&self, check: InvariantLevel, _plan: &LogicalPlan) -> Result<()> {
+    fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
         if let Some(InvariantMock {
             should_fail_invariant,
             kind,
         }) = self.invariant_mock.clone()
         {
-            if should_fail_invariant && check == kind {
-                return internal_err!("node fails check, such as improper inputs");
-            }
+            assert_or_internal_err!(
+                !(should_fail_invariant && check == kind),
+                "node fails check, such as improper inputs"
+            );
         }
         Ok(())
     }
@@ -658,13 +655,17 @@ struct TopKExec {
     input: Arc<dyn ExecutionPlan>,
     /// The maximum number of values
     k: usize,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl TopKExec {
     fn new(input: Arc<dyn ExecutionPlan>, k: usize) -> Self {
         let cache = Self::compute_properties(input.schema());
-        Self { input, k, cache }
+        Self {
+            input,
+            k,
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -709,7 +710,7 @@ impl ExecutionPlan for TopKExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -734,9 +735,11 @@ impl ExecutionPlan for TopKExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if 0 != partition {
-            return internal_err!("TopKExec invalid partition {partition}");
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "TopKExec invalid partition {partition}"
+        );
 
         Ok(Box::pin(TopKReader {
             input: self.input.execute(partition, context)?,
@@ -746,10 +749,20 @@ impl ExecutionPlan for TopKExec {
         }))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        // to improve the optimizability of this plan
-        // better statistics inference could be provided
-        Ok(Statistics::new_unknown(&self.schema()))
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion::physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.cache.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
     }
 }
 
@@ -912,11 +925,12 @@ impl MyAnalyzerRule {
             .map(|e| {
                 e.transform(|e| {
                     Ok(match e {
-                        Expr::Literal(ScalarValue::Int64(i)) => {
+                        Expr::Literal(ScalarValue::Int64(i), _) => {
                             // transform to UInt64
-                            Transformed::yes(Expr::Literal(ScalarValue::UInt64(
-                                i.map(|i| i as u64),
-                            )))
+                            Transformed::yes(Expr::Literal(
+                                ScalarValue::UInt64(i.map(|i| i as u64)),
+                                None,
+                            ))
                         }
                         _ => Transformed::no(e),
                     })
diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
index 25458efa4fa55..025ee9767c694 100644
--- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
@@ -17,35 +17,38 @@
 
 use std::any::Any;
 use std::collections::HashMap;
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use arrow::array::{as_string_array, record_batch, Int8Array, UInt64Array};
 use arrow::array::{
-    builder::BooleanBuilder, cast::AsArray, Array, ArrayRef, Float32Array, Float64Array,
-    Int32Array, RecordBatch, StringArray,
+    Array, ArrayRef, Float32Array, Float64Array, Int32Array, RecordBatch, StringArray,
+    builder::BooleanBuilder, cast::AsArray,
 };
+use arrow::array::{Int8Array, UInt64Array, as_string_array, create_array, record_batch};
 use arrow::compute::kernels::numeric::add;
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow_schema::extension::{Bool8, CanonicalExtensionType, ExtensionType};
-use arrow_schema::{ArrowError, FieldRef};
+use arrow_schema::{ArrowError, FieldRef, SchemaRef};
 use datafusion::common::test_util::batches_to_string;
 use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionState};
 use datafusion::prelude::*;
 use datafusion::{execution::registry::FunctionRegistry, test_util};
 use datafusion_common::cast::{as_float64_array, as_int32_array};
+use datafusion_common::metadata::FieldMetadata;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::utils::take_function_args;
 use datafusion_common::{
-    assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err, not_impl_err,
-    plan_err, DFSchema, DataFusionError, Result, ScalarValue,
+    DFSchema, DataFusionError, Result, ScalarValue, assert_batches_eq,
+    assert_batches_sorted_eq, assert_contains, exec_datafusion_err, exec_err,
+    not_impl_err, plan_err,
 };
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
     Accumulator, ColumnarValue, CreateFunction, CreateFunctionBody, LogicalPlanBuilder,
     OperateFunctionArg, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
-    Signature, Volatility,
+    Signature, Volatility, lit_with_metadata,
 };
+use datafusion_expr_common::signature::TypeSignature;
 use datafusion_functions_nested::range::range_udf;
 use parking_lot::Mutex;
 use regex::Regex;
@@ -62,13 +65,13 @@ async fn csv_query_custom_udf_with_cast() -> Result<()> {
     let sql = "SELECT avg(custom_sqrt(c11)) FROM aggregate_test_100";
     let actual = plan_and_collect(&ctx, sql).await?;
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +------------------------------------------+
     | avg(custom_sqrt(aggregate_test_100.c11)) |
     +------------------------------------------+
     | 0.6584408483418835                       |
     +------------------------------------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -81,13 +84,13 @@ async fn csv_query_avg_sqrt() -> Result<()> {
     let sql = "SELECT avg(custom_sqrt(c12)) FROM aggregate_test_100";
     let actual = plan_and_collect(&ctx, sql).await?;
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
     +------------------------------------------+
     | avg(custom_sqrt(aggregate_test_100.c12)) |
     +------------------------------------------+
     | 0.6706002946036459                       |
     +------------------------------------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -152,7 +155,7 @@ async fn scalar_udf() -> Result<()> {
 
     let result = DataFrame::new(ctx.state(), plan).collect().await?;
 
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +-----+-----+-----------------+
     | a   | b   | my_add(t.a,t.b) |
     +-----+-----+-----------------+
@@ -161,7 +164,7 @@ async fn scalar_udf() -> Result<()> {
     | 10  | 12  | 22              |
     | 100 | 120 | 220             |
     +-----+-----+-----------------+
-    "###);
+    ");
 
     let batch = &result[0];
     let a = as_int32_array(batch.column(0))?;
@@ -180,6 +183,7 @@ async fn scalar_udf() -> Result<()> {
     Ok(())
 }
 
+#[derive(PartialEq, Eq, Hash)]
 struct Simple0ArgsScalarUDF {
     name: String,
     signature: Signature,
@@ -277,7 +281,7 @@ async fn scalar_udf_zero_params() -> Result<()> {
     ctx.register_udf(ScalarUDF::from(get_100_udf));
 
     let result = plan_and_collect(&ctx, "select get_100() a from t").await?;
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +-----+
     | a   |
     +-----+
@@ -286,22 +290,22 @@ async fn scalar_udf_zero_params() -> Result<()> {
     | 100 |
     | 100 |
     +-----+
-    "###);
+    ");
 
     let result = plan_and_collect(&ctx, "select get_100() a").await?;
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +-----+
     | a   |
     +-----+
     | 100 |
     +-----+
-    "###);
+    ");
 
     let result = plan_and_collect(&ctx, "select get_100() from t where a=999").await?;
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     ++
     ++
-    "###);
+    ");
 
     Ok(())
 }
@@ -328,13 +332,13 @@ async fn scalar_udf_override_built_in_scalar_function() -> Result<()> {
 
     // Make sure that the UDF is used instead of the built-in function
     let result = plan_and_collect(&ctx, "select abs(a) a from t").await?;
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +---+
     | a |
     +---+
     | 1 |
     +---+
-    "###);
+    ");
 
     Ok(())
 }
@@ -423,20 +427,21 @@ async fn case_sensitive_identifiers_user_defined_functions() -> Result<()> {
     let err = plan_and_collect(&ctx, "SELECT MY_FUNC(i) FROM t")
         .await
         .unwrap_err();
-    assert!(err
-        .to_string()
-        .contains("Error during planning: Invalid function \'my_func\'"));
+    assert!(
+        err.to_string()
+            .contains("Error during planning: Invalid function \'my_func\'")
+    );
 
     // Can call it if you put quotes
     let result = plan_and_collect(&ctx, "SELECT \"MY_FUNC\"(i) FROM t").await?;
 
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +--------------+
     | MY_FUNC(t.i) |
     +--------------+
     | 1            |
     +--------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -467,28 +472,28 @@ async fn test_user_defined_functions_with_alias() -> Result<()> {
     ctx.register_udf(udf);
 
     let result = plan_and_collect(&ctx, "SELECT dummy(i) FROM t").await?;
-    insta::assert_snapshot!(batches_to_string(&result), @r###"
+    insta::assert_snapshot!(batches_to_string(&result), @r"
     +------------+
     | dummy(t.i) |
     +------------+
     | 1          |
     +------------+
-    "###);
+    ");
 
     let alias_result = plan_and_collect(&ctx, "SELECT dummy_alias(i) FROM t").await?;
-    insta::assert_snapshot!(batches_to_string(&alias_result), @r###"
-    +------------+
-    | dummy(t.i) |
-    +------------+
-    | 1          |
-    +------------+
-    "###);
+    insta::assert_snapshot!(batches_to_string(&alias_result), @r"
+    +------------------+
+    | dummy_alias(t.i) |
+    +------------------+
+    | 1                |
+    +------------------+
+    ");
 
     Ok(())
 }
 
 /// Volatile UDF that should append a different value to each row
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct AddIndexToStringVolatileScalarUDF {
     name: String,
     signature: Signature,
@@ -659,7 +664,7 @@ async fn volatile_scalar_udf_with_params() -> Result<()> {
     Ok(())
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct CastToI64UDF {
     signature: Signature,
 }
@@ -694,7 +699,7 @@ impl ScalarUDFImpl for CastToI64UDF {
     fn simplify(
         &self,
         mut args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         // DataFusion should have ensured the function is called with just a
         // single argument
@@ -710,10 +715,7 @@ impl ScalarUDFImpl for CastToI64UDF {
             arg
         } else {
             // need to use an actual cast to get the correct type
-            Expr::Cast(datafusion_expr::Cast {
-                expr: Box::new(arg),
-                data_type: DataType::Int64,
-            })
+            Expr::Cast(datafusion_expr::Cast::new(Box::new(arg), DataType::Int64))
         };
         // return the newly written argument to DataFusion
         Ok(ExprSimplifyResult::Simplified(new_expr))
@@ -773,15 +775,17 @@ async fn deregister_udf() -> Result<()> {
     ctx.register_udf(cast2i64);
 
     assert!(ctx.udfs().contains("cast_to_i64"));
+    assert!(FunctionRegistry::udfs(&ctx).contains("cast_to_i64"));
 
     ctx.deregister_udf("cast_to_i64");
 
     assert!(!ctx.udfs().contains("cast_to_i64"));
+    assert!(!FunctionRegistry::udfs(&ctx).contains("cast_to_i64"));
 
     Ok(())
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct TakeUDF {
     signature: Signature,
 }
@@ -935,12 +939,13 @@ impl FunctionFactory for CustomFunctionFactory {
 //
 // it also defines custom [ScalarUDFImpl::simplify()]
 // to replace ScalarUDF expression with one instance contains.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct ScalarFunctionWrapper {
     name: String,
     expr: Expr,
     signature: Signature,
     return_type: DataType,
+    defaults: Vec<Option<Expr>>,
 }
 
 impl ScalarUDFImpl for ScalarFunctionWrapper {
@@ -967,21 +972,21 @@ impl ScalarUDFImpl for ScalarFunctionWrapper {
     fn simplify(
         &self,
         args: Vec<Expr>,
-        _info: &dyn SimplifyInfo,
+        _info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        let replacement = Self::replacement(&self.expr, &args)?;
+        let replacement = Self::replacement(&self.expr, &args, &self.defaults)?;
 
         Ok(ExprSimplifyResult::Simplified(replacement))
     }
-
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
 }
 
 impl ScalarFunctionWrapper {
     // replaces placeholders with actual arguments
-    fn replacement(expr: &Expr, args: &[Expr]) -> Result<Expr> {
+    fn replacement(
+        expr: &Expr,
+        args: &[Expr],
+        defaults: &[Option<Expr>],
+    ) -> Result<Expr> {
         let result = expr.clone().transform(|e| {
             let r = match e {
                 Expr::Placeholder(placeholder) => {
@@ -989,11 +994,19 @@ impl ScalarFunctionWrapper {
                         Self::parse_placeholder_identifier(&placeholder.id)?;
                     if placeholder_position < args.len() {
                         Transformed::yes(args[placeholder_position].clone())
-                    } else {
+                    } else if placeholder_position >= defaults.len() {
                         exec_err!(
-                            "Function argument {} not provided, argument missing!",
+                            "Invalid placeholder, out of range: {}",
                             placeholder.id
                         )?
+                    } else {
+                        match defaults[placeholder_position] {
+                            Some(ref default) => Transformed::yes(default.clone()),
+                            None => exec_err!(
+                                "Function argument {} not provided, argument missing!",
+                                placeholder.id
+                            )?,
+                        }
                     }
                 }
                 _ => Transformed::no(e),
@@ -1009,9 +1022,7 @@ impl ScalarFunctionWrapper {
     fn parse_placeholder_identifier(placeholder: &str) -> Result<usize> {
         if let Some(value) = placeholder.strip_prefix('$') {
             Ok(value.parse().map(|v: usize| v - 1).map_err(|e| {
-                DataFusionError::Execution(format!(
-                    "Placeholder `{placeholder}` parsing error: {e}!"
-                ))
+                exec_datafusion_err!("Placeholder `{placeholder}` parsing error: {e}!")
             })?)
         } else {
             exec_err!("Placeholder should start with `$`!")
@@ -1023,6 +1034,32 @@ impl TryFrom<CreateFunction> for ScalarFunctionWrapper {
     type Error = DataFusionError;
 
     fn try_from(definition: CreateFunction) -> std::result::Result<Self, Self::Error> {
+        let args = definition.args.unwrap_or_default();
+        let defaults: Vec<Option<Expr>> =
+            args.iter().map(|a| a.default_expr.clone()).collect();
+        let signature: Signature = match defaults.iter().position(|v| v.is_some()) {
+            Some(pos) => {
+                let mut type_signatures: Vec<TypeSignature> = vec![];
+                // Generate all valid signatures
+                for n in pos..defaults.len() + 1 {
+                    if n == 0 {
+                        type_signatures.push(TypeSignature::Nullary)
+                    } else {
+                        type_signatures.push(TypeSignature::Exact(
+                            args.iter().take(n).map(|a| a.data_type.clone()).collect(),
+                        ))
+                    }
+                }
+                Signature::one_of(
+                    type_signatures,
+                    definition.params.behavior.unwrap_or(Volatility::Volatile),
+                )
+            }
+            None => Signature::exact(
+                args.iter().map(|a| a.data_type.clone()).collect(),
+                definition.params.behavior.unwrap_or(Volatility::Volatile),
+            ),
+        };
         Ok(Self {
             name: definition.name,
             expr: definition
@@ -1032,15 +1069,8 @@ impl TryFrom<CreateFunction> for ScalarFunctionWrapper {
             return_type: definition
                 .return_type
                 .expect("Return type has to be defined!"),
-            signature: Signature::exact(
-                definition
-                    .args
-                    .unwrap_or_default()
-                    .into_iter()
-                    .map(|a| a.data_type)
-                    .collect(),
-                definition.params.behavior.unwrap_or(Volatility::Volatile),
-            ),
+            signature,
+            defaults,
         })
     }
 }
@@ -1063,10 +1093,11 @@ async fn create_scalar_function_from_sql_statement() -> Result<()> {
     // Create the `better_add` function dynamically via CREATE FUNCTION statement
     assert!(ctx.sql(sql).await.is_ok());
     // try to `drop function` when sql options have allow ddl disabled
-    assert!(ctx
-        .sql_with_options("drop function better_add", options)
-        .await
-        .is_err());
+    assert!(
+        ctx.sql_with_options("drop function better_add", options)
+            .await
+            .is_err()
+    );
 
     let result = ctx
         .sql("select better_add(2.0, 2.0)")
@@ -1111,6 +1142,175 @@ async fn create_scalar_function_from_sql_statement() -> Result<()> {
     "#;
     assert!(ctx.sql(bad_definition_sql).await.is_err());
 
+    // FIXME: Definitions with invalid placeholders are allowed, fail at runtime
+    let bad_expression_sql = r#"
+    CREATE FUNCTION better_add(DOUBLE, DOUBLE)
+        RETURNS DOUBLE
+        RETURN $1 + $3
+    "#;
+    assert!(ctx.sql(bad_expression_sql).await.is_ok());
+
+    let err = ctx
+        .sql("select better_add(2.0, 2.0)")
+        .await?
+        .collect()
+        .await
+        .expect_err("unknown placeholder");
+    let expected = "Optimizer rule 'simplify_expressions' failed\ncaused by\nExecution error: Invalid placeholder, out of range: $3";
+    assert!(expected.starts_with(&err.strip_backtrace()));
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn create_scalar_function_from_sql_statement_named_arguments() -> Result<()> {
+    let function_factory = Arc::new(CustomFunctionFactory::default());
+    let ctx = SessionContext::new().with_function_factory(function_factory.clone());
+
+    let sql = r#"
+    CREATE FUNCTION better_add(a DOUBLE, b DOUBLE)
+        RETURNS DOUBLE
+        RETURN $a + $b
+    "#;
+
+    assert!(ctx.sql(sql).await.is_ok());
+
+    let result = ctx
+        .sql("select better_add(2.0, 2.0)")
+        .await?
+        .collect()
+        .await?;
+
+    assert_batches_eq!(
+        &[
+            "+-----------------------------------+",
+            "| better_add(Float64(2),Float64(2)) |",
+            "+-----------------------------------+",
+            "| 4.0                               |",
+            "+-----------------------------------+",
+        ],
+        &result
+    );
+
+    // cannot mix named and positional style
+    let bad_expression_sql = r#"
+    CREATE FUNCTION bad_expression_fun(DOUBLE, b DOUBLE)
+        RETURNS DOUBLE
+        RETURN $1 + $b
+    "#;
+    let err = ctx
+        .sql(bad_expression_sql)
+        .await
+        .expect_err("cannot mix named and positional style");
+    let expected = "Error during planning: All function arguments must use either named or positional style.";
+    assert!(expected.starts_with(&err.strip_backtrace()));
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn create_scalar_function_from_sql_statement_default_arguments() -> Result<()> {
+    let function_factory = Arc::new(CustomFunctionFactory::default());
+    let ctx = SessionContext::new().with_function_factory(function_factory.clone());
+
+    let sql = r#"
+    CREATE FUNCTION better_add(a DOUBLE = 2.0, b DOUBLE = 2.0)
+        RETURNS DOUBLE
+        RETURN $a + $b
+    "#;
+
+    assert!(ctx.sql(sql).await.is_ok());
+
+    // Check all function arity supported
+    let result = ctx.sql("select better_add()").await?.collect().await?;
+
+    assert_batches_eq!(
+        &[
+            "+--------------+",
+            "| better_add() |",
+            "+--------------+",
+            "| 4.0          |",
+            "+--------------+",
+        ],
+        &result
+    );
+
+    let result = ctx.sql("select better_add(2.0)").await?.collect().await?;
+
+    assert_batches_eq!(
+        &[
+            "+------------------------+",
+            "| better_add(Float64(2)) |",
+            "+------------------------+",
+            "| 4.0                    |",
+            "+------------------------+",
+        ],
+        &result
+    );
+
+    let result = ctx
+        .sql("select better_add(2.0, 2.0)")
+        .await?
+        .collect()
+        .await?;
+
+    assert_batches_eq!(
+        &[
+            "+-----------------------------------+",
+            "| better_add(Float64(2),Float64(2)) |",
+            "+-----------------------------------+",
+            "| 4.0                               |",
+            "+-----------------------------------+",
+        ],
+        &result
+    );
+
+    assert!(ctx.sql("select better_add(2.0, 2.0, 2.0)").await.is_err());
+    assert!(ctx.sql("drop function better_add").await.is_ok());
+
+    // works with positional style
+    let sql = r#"
+    CREATE FUNCTION better_add(DOUBLE, DOUBLE = 2.0)
+        RETURNS DOUBLE
+        RETURN $1 + $2
+    "#;
+    assert!(ctx.sql(sql).await.is_ok());
+
+    assert!(ctx.sql("select better_add()").await.is_err());
+    let result = ctx.sql("select better_add(2.0)").await?.collect().await?;
+    assert_batches_eq!(
+        &[
+            "+------------------------+",
+            "| better_add(Float64(2)) |",
+            "+------------------------+",
+            "| 4.0                    |",
+            "+------------------------+",
+        ],
+        &result
+    );
+
+    // non-default argument cannot follow default argument
+    let bad_expression_sql = r#"
+    CREATE FUNCTION bad_expression_fun(a DOUBLE = 2.0, b DOUBLE)
+        RETURNS DOUBLE
+        RETURN $a + $b
+    "#;
+    let err = ctx
+        .sql(bad_expression_sql)
+        .await
+        .expect_err("non-default argument cannot follow default argument");
+    let expected =
+        "Error during planning: Non-default arguments cannot follow default arguments.";
+    assert!(expected.starts_with(&err.strip_backtrace()));
+
+    let expression_sql = r#"
+    CREATE FUNCTION bad_expression_fun(DOUBLE, DOUBLE DEFAULT 2.0)
+        RETURNS DOUBLE
+        RETURN $1 + $2
+    "#;
+    let result = ctx.sql(expression_sql).await;
+
+    assert!(result.is_ok());
     Ok(())
 }
 
@@ -1184,7 +1384,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<(
                 quote_style: None,
                 span: Span::empty(),
             }),
-            data_type: DataType::Utf8,
+            data_type: DataType::Utf8View,
             default_expr: None,
         }]),
         return_type: Some(DataType::Int32),
@@ -1211,6 +1411,22 @@ struct MyRegexUdf {
     regex: Regex,
 }
 
+impl PartialEq for MyRegexUdf {
+    fn eq(&self, other: &Self) -> bool {
+        let Self { signature, regex } = self;
+        signature == &other.signature && regex.as_str() == other.regex.as_str()
+    }
+}
+impl Eq for MyRegexUdf {}
+
+impl Hash for MyRegexUdf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self { signature, regex } = self;
+        signature.hash(state);
+        regex.as_str().hash(state);
+    }
+}
+
 impl MyRegexUdf {
     fn new(pattern: &str) -> Self {
         Self {
@@ -1262,20 +1478,6 @@ impl ScalarUDFImpl for MyRegexUdf {
             _ => exec_err!("regex_udf only accepts a Utf8 arguments"),
         }
     }
-
-    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        if let Some(other) = other.as_any().downcast_ref::<MyRegexUdf>() {
-            self.regex.as_str() == other.regex.as_str()
-        } else {
-            false
-        }
-    }
-
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.regex.as_str().hash(hasher);
-        hasher.finish()
-    }
 }
 
 #[tokio::test]
@@ -1373,13 +1575,25 @@ async fn plan_and_collect(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordB
     ctx.sql(sql).await?.collect().await
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 struct MetadataBasedUdf {
     name: String,
     signature: Signature,
     metadata: HashMap<String, String>,
 }
 
+impl Hash for MetadataBasedUdf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
+            name,
+            signature,
+            metadata: _, // unhashable
+        } = self;
+        name.hash(state);
+        signature.hash(state);
+    }
+}
+
 impl MetadataBasedUdf {
     fn new(metadata: HashMap<String, String>) -> Self {
         // The name we return must be unique. Otherwise we will not call distinct
@@ -1426,7 +1640,7 @@ impl ScalarUDFImpl for MetadataBasedUdf {
             .get("modify_values")
             .map(|v| v == "double_output")
             .unwrap_or(false);
-        let mulitplier = if should_double { 2 } else { 1 };
+        let multiplier = if should_double { 2 } else { 1 };
 
         match &args.args[0] {
             ColumnarValue::Array(array) => {
@@ -1435,7 +1649,7 @@ impl ScalarUDFImpl for MetadataBasedUdf {
                     .downcast_ref::<UInt64Array>()
                     .unwrap()
                     .iter()
-                    .map(|v| v.map(|x| x * mulitplier))
+                    .map(|v| v.map(|x| x * multiplier))
                     .collect();
                 let array_ref = Arc::new(UInt64Array::from(array_values)) as ArrayRef;
                 Ok(ColumnarValue::Array(array_ref))
@@ -1446,15 +1660,11 @@ impl ScalarUDFImpl for MetadataBasedUdf {
                 };
 
                 Ok(ColumnarValue::Scalar(ScalarValue::UInt64(
-                    value.map(|v| v * mulitplier),
+                    value.map(|v| v * multiplier),
                 )))
             }
         }
     }
-
-    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        self.name == other.name()
-    }
 }
 
 #[tokio::test]
@@ -1529,11 +1739,71 @@ async fn test_metadata_based_udf() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn test_metadata_based_udf_with_literal() -> Result<()> {
+    let ctx = SessionContext::new();
+    let input_metadata: HashMap<String, String> =
+        [("modify_values".to_string(), "double_output".to_string())]
+            .into_iter()
+            .collect();
+    let input_metadata = FieldMetadata::from(input_metadata);
+    let df = ctx.sql("select 0;").await?.select(vec![
+        lit(5u64).alias_with_metadata("lit_with_doubling", Some(input_metadata.clone())),
+        lit(5u64).alias("lit_no_doubling"),
+        lit_with_metadata(5u64, Some(input_metadata))
+            .alias("lit_with_double_no_alias_metadata"),
+    ])?;
+
+    let output_metadata: HashMap<String, String> =
+        [("output_metatype".to_string(), "custom_value".to_string())]
+            .into_iter()
+            .collect();
+    let custom_udf = ScalarUDF::from(MetadataBasedUdf::new(output_metadata.clone()));
+
+    let plan = LogicalPlanBuilder::from(df.into_optimized_plan()?)
+        .project(vec![
+            custom_udf
+                .call(vec![col("lit_with_doubling")])
+                .alias("doubled_output"),
+            custom_udf
+                .call(vec![col("lit_no_doubling")])
+                .alias("not_doubled_output"),
+            custom_udf
+                .call(vec![col("lit_with_double_no_alias_metadata")])
+                .alias("double_without_alias_metadata"),
+        ])?
+        .build()?;
+
+    let actual = DataFrame::new(ctx.state(), plan).collect().await?;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("doubled_output", DataType::UInt64, false)
+            .with_metadata(output_metadata.clone()),
+        Field::new("not_doubled_output", DataType::UInt64, false)
+            .with_metadata(output_metadata.clone()),
+        Field::new("double_without_alias_metadata", DataType::UInt64, false)
+            .with_metadata(output_metadata.clone()),
+    ]));
+
+    let expected = RecordBatch::try_new(
+        schema,
+        vec![
+            create_array!(UInt64, [10]),
+            create_array!(UInt64, [5]),
+            create_array!(UInt64, [10]),
+        ],
+    )?;
+
+    assert_eq!(expected, actual[0]);
+
+    Ok(())
+}
+
 /// This UDF is to test extension handling, both on the input and output
 /// sides. For the input, we will handle the data differently if there is
 /// the canonical extension type Bool8. For the output we will add a
 /// user defined extension type.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct ExtensionBasedUdf {
     name: String,
     signature: Signature,
@@ -1566,7 +1836,7 @@ impl ScalarUDFImpl for ExtensionBasedUdf {
 
     fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result<FieldRef> {
         Ok(Field::new("canonical_extension_udf", DataType::Utf8, true)
-            .with_extension_type(MyUserExtentionType {})
+            .with_extension_type(MyUserExtensionType {})
             .into())
     }
 
@@ -1612,16 +1882,12 @@ impl ScalarUDFImpl for ExtensionBasedUdf {
             }
         }
     }
-
-    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        self.name == other.name()
-    }
 }
 
-struct MyUserExtentionType {}
+struct MyUserExtensionType {}
 
-impl ExtensionType for MyUserExtentionType {
-    const NAME: &'static str = "my_user_extention_type";
+impl ExtensionType for MyUserExtensionType {
+    const NAME: &'static str = "my_user_Extension_type";
     type Metadata = ();
 
     fn metadata(&self) -> &Self::Metadata {
@@ -1693,9 +1959,9 @@ async fn test_extension_based_udf() -> Result<()> {
     // To test for input extensions handling, we check the strings returned
     let expected_schema = Schema::new(vec![
         Field::new("without_bool8_extension", DataType::Utf8, true)
-            .with_extension_type(MyUserExtentionType {}),
+            .with_extension_type(MyUserExtensionType {}),
         Field::new("with_bool8_extension", DataType::Utf8, true)
-            .with_extension_type(MyUserExtentionType {}),
+            .with_extension_type(MyUserExtensionType {}),
     ]);
 
     let expected = record_batch!(
@@ -1713,3 +1979,237 @@ async fn test_extension_based_udf() -> Result<()> {
     ctx.deregister_table("t")?;
     Ok(())
 }
+
+#[tokio::test]
+async fn test_config_options_work_for_scalar_func() -> Result<()> {
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct TestScalarUDF {
+        signature: Signature,
+    }
+
+    impl ScalarUDFImpl for TestScalarUDF {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+        fn name(&self) -> &str {
+            "TestScalarUDF"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            Ok(DataType::Utf8)
+        }
+
+        fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            let tz = args.config_options.execution.time_zone.clone();
+            Ok(ColumnarValue::Scalar(ScalarValue::from(tz)))
+        }
+    }
+
+    let udf = ScalarUDF::from(TestScalarUDF {
+        signature: Signature::uniform(1, vec![DataType::Utf8], Volatility::Stable),
+    });
+
+    let mut config = SessionConfig::new();
+    config.options_mut().execution.time_zone = Some("AEST".into());
+
+    let ctx = SessionContext::new_with_config(config);
+
+    ctx.register_udf(udf.clone());
+
+    let df = ctx.read_empty()?;
+    let df = df.select(vec![udf.call(vec![lit("a")]).alias("a")])?;
+    let actual = df.collect().await?;
+
+    let expected_schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+    let expected = RecordBatch::try_new(
+        SchemaRef::from(expected_schema),
+        vec![create_array!(Utf8, ["AEST"])],
+    )?;
+
+    assert_eq!(expected, actual[0]);
+
+    Ok(())
+}
+
+/// https://github.com/apache/datafusion/issues/17425
+#[tokio::test]
+async fn test_extension_metadata_preserve_in_sql_values() -> Result<()> {
+    #[derive(Debug, Hash, PartialEq, Eq)]
+    struct MakeExtension {
+        signature: Signature,
+    }
+
+    impl Default for MakeExtension {
+        fn default() -> Self {
+            Self {
+                signature: Signature::user_defined(Volatility::Immutable),
+            }
+        }
+    }
+
+    impl ScalarUDFImpl for MakeExtension {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "make_extension"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+            Ok(arg_types.to_vec())
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            unreachable!("This shouldn't have been called")
+        }
+
+        fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+            Ok(args.arg_fields[0]
+                .as_ref()
+                .clone()
+                .with_metadata(HashMap::from([(
+                    "ARROW:extension:metadata".to_string(),
+                    "foofy.foofy".to_string(),
+                )]))
+                .into())
+        }
+
+        fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            Ok(args.args[0].clone())
+        }
+    }
+
+    let ctx = SessionContext::new();
+    ctx.register_udf(MakeExtension::default().into());
+
+    let batches = ctx
+        .sql(
+            "
+SELECT extension FROM (VALUES
+    ('one', make_extension('foofy one')),
+    ('two', make_extension('foofy two')),
+    ('three', make_extension('foofy three')))
+AS t(string, extension)
+        ",
+        )
+        .await?
+        .collect()
+        .await?;
+
+    assert_eq!(
+        batches[0]
+            .schema()
+            .field(0)
+            .metadata()
+            .get("ARROW:extension:metadata"),
+        Some(&"foofy.foofy".into())
+    );
+    Ok(())
+}
+
+/// https://github.com/apache/datafusion/issues/17422
+#[tokio::test]
+async fn test_extension_metadata_preserve_in_subquery() -> Result<()> {
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct ExtensionScalarPredicate {
+        signature: Signature,
+    }
+
+    impl Default for ExtensionScalarPredicate {
+        fn default() -> Self {
+            Self {
+                signature: Signature::user_defined(Volatility::Immutable),
+            }
+        }
+    }
+
+    impl ScalarUDFImpl for ExtensionScalarPredicate {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "extension_predicate"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+            Ok(arg_types.to_vec())
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            unreachable!("This shouldn't have been called")
+        }
+
+        fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+            for arg in args.arg_fields {
+                assert!(arg.metadata().contains_key("ARROW:extension:name"));
+            }
+
+            Ok(Field::new("", DataType::Boolean, true).into())
+        }
+
+        fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            for arg in args.arg_fields {
+                assert!(arg.metadata().contains_key("ARROW:extension:name"));
+            }
+
+            let array =
+                ScalarValue::Boolean(Some(true)).to_array_of_size(args.number_rows)?;
+            Ok(ColumnarValue::Array(array))
+        }
+    }
+
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Int64, true),
+        Field::new("geometry", DataType::Utf8, true).with_metadata(HashMap::from([(
+            "ARROW:extension:name".to_string(),
+            "foofy.foofy".to_string(),
+        )])),
+    ]);
+
+    let batch_lhs = RecordBatch::try_new(
+        schema.clone().into(),
+        vec![
+            create_array!(Int64, [1, 2]),
+            create_array!(Utf8, [Some("item1"), Some("item2")]),
+        ],
+    )?;
+
+    let batch_rhs = RecordBatch::try_new(
+        schema.clone().into(),
+        vec![
+            create_array!(Int64, [2, 3]),
+            create_array!(Utf8, [Some("item2"), Some("item3")]),
+        ],
+    )?;
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("l", batch_lhs)?;
+    ctx.register_batch("r", batch_rhs)?;
+    ctx.register_udf(ExtensionScalarPredicate::default().into());
+
+    let df = ctx
+        .sql(
+            "
+        SELECT L.id l_id FROM L
+        WHERE EXISTS (SELECT 1 FROM R WHERE extension_predicate(L.geometry, R.geometry))
+        ORDER BY l_id
+        ",
+        )
+        .await?;
+    assert!(!df.collect().await?.is_empty());
+    Ok(())
+}
diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs
index e4aff0b00705d..95694d00a6c30 100644
--- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs
@@ -21,17 +21,17 @@ use std::path::Path;
 use std::sync::Arc;
 
 use arrow::array::Int64Array;
-use arrow::csv::reader::Format;
 use arrow::csv::ReaderBuilder;
+use arrow::csv::reader::Format;
 
 use datafusion::arrow::datatypes::SchemaRef;
 use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::common::test_util::batches_to_string;
-use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::TableProvider;
+use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::error::Result;
 use datafusion::execution::TaskContext;
-use datafusion::physical_plan::{collect, ExecutionPlan};
+use datafusion::physical_plan::{ExecutionPlan, collect};
 use datafusion::prelude::SessionContext;
 use datafusion_catalog::Session;
 use datafusion_catalog::TableFunctionImpl;
@@ -55,7 +55,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> {
         .collect()
         .await?;
 
-    insta::assert_snapshot!(batches_to_string(&rbs), @r###"
+    insta::assert_snapshot!(batches_to_string(&rbs), @r"
     +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+
     | n_nationkey | n_name    | n_regionkey | n_comment                                                                                                   |
     +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+
@@ -65,7 +65,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> {
     | 4           | EGYPT     | 4           | y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d         |
     | 5           | ETHIOPIA  | 0           | ven packages wake quickly. regu                                                                             |
     +-------------+-----------+-------------+-------------------------------------------------------------------------------------------------------------+
-    "###);
+    ");
 
     // just run, return all rows
     let rbs = ctx
@@ -74,7 +74,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> {
         .collect()
         .await?;
 
-    insta::assert_snapshot!(batches_to_string(&rbs), @r###"
+    insta::assert_snapshot!(batches_to_string(&rbs), @r"
     +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+
     | n_nationkey | n_name    | n_regionkey | n_comment                                                                                                          |
     +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+
@@ -89,7 +89,7 @@ async fn test_simple_read_csv_udtf() -> Result<()> {
     | 9           | INDONESIA | 2           |  slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull |
     | 10          | IRAN      | 4           | efully alongside of the slyly final dependencies.                                                                  |
     +-------------+-----------+-------------+--------------------------------------------------------------------------------------------------------------------+
-    "###);
+    ");
 
     Ok(())
 }
@@ -205,7 +205,7 @@ impl TableFunctionImpl for SimpleCsvTableFunc {
         let mut filepath = String::new();
         for expr in exprs {
             match expr {
-                Expr::Literal(ScalarValue::Utf8(Some(ref path))) => {
+                Expr::Literal(ScalarValue::Utf8(Some(path)), _) => {
                     filepath.clone_from(path);
                 }
                 expr => new_exprs.push(expr.clone()),
@@ -221,6 +221,31 @@ impl TableFunctionImpl for SimpleCsvTableFunc {
     }
 }
 
+/// Test that expressions passed to UDTFs are properly type-coerced
+/// This is a regression test for https://github.com/apache/datafusion/issues/19914
+#[tokio::test]
+async fn test_udtf_type_coercion() -> Result<()> {
+    use datafusion::datasource::MemTable;
+
+    #[derive(Debug)]
+    struct NoOpTableFunc;
+
+    impl TableFunctionImpl for NoOpTableFunc {
+        fn call(&self, _: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+            let schema = Arc::new(arrow::datatypes::Schema::empty());
+            Ok(Arc::new(MemTable::try_new(schema, vec![vec![]])?))
+        }
+    }
+
+    let ctx = SessionContext::new();
+    ctx.register_udtf("f", Arc::new(NoOpTableFunc));
+
+    // This should not panic - the array elements should be coerced to Float64
+    let _ = ctx.sql("SELECT * FROM f(ARRAY[0.1, 1, 2])").await?;
+
+    Ok(())
+}
+
 fn read_csv_batches(csv_path: impl AsRef<Path>) -> Result<(SchemaRef, Vec<RecordBatch>)> {
     let mut file = File::open(csv_path)?;
     let (schema, _) = Format::default()
diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs
index bcd2c3945e392..775325a337184 100644
--- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs
@@ -19,8 +19,8 @@
 //! user defined window functions
 
 use arrow::array::{
-    record_batch, Array, ArrayRef, AsArray, Int64Array, RecordBatch, StringArray,
-    UInt64Array,
+    Array, ArrayRef, AsArray, Int64Array, RecordBatch, StringArray, UInt64Array,
+    record_batch,
 };
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow_schema::FieldRef;
@@ -28,24 +28,27 @@ use datafusion::common::test_util::batches_to_string;
 use datafusion::common::{Result, ScalarValue};
 use datafusion::prelude::SessionContext;
 use datafusion_common::exec_datafusion_err;
+use datafusion_expr::ptr_eq::PtrEq;
 use datafusion_expr::{
-    PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDF, WindowUDFImpl,
+    LimitEffect, PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDF,
+    WindowUDFImpl,
 };
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use datafusion_functions_window_common::{
     expr::ExpressionArgs, field::WindowUDFFieldArgs,
 };
 use datafusion_physical_expr::{
-    expressions::{col, lit},
     PhysicalExpr,
+    expressions::{col, lit},
 };
 use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
 use std::{
     any::Any,
     ops::Range,
     sync::{
-        atomic::{AtomicUsize, Ordering},
         Arc,
+        atomic::{AtomicUsize, Ordering},
     },
 };
 
@@ -59,8 +62,7 @@ const UNBOUNDED_WINDOW_QUERY_WITH_ALIAS: &str = "SELECT x, y, val, \
      from t ORDER BY x, y";
 
 /// A query with a window function evaluated over a moving window
-const BOUNDED_WINDOW_QUERY:  &str  =
-    "SELECT x, y, val, \
+const BOUNDED_WINDOW_QUERY: &str = "SELECT x, y, val, \
      odd_counter(val) OVER (PARTITION BY x ORDER BY y ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) \
      from t ORDER BY x, y";
 
@@ -72,22 +74,22 @@ async fn test_setup() {
     let sql = "SELECT * from t order by x, y";
     let actual = execute(&ctx, sql).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+
-         | x | y | val |
-         +---+---+-----+
-         | 1 | a | 0   |
-         | 1 | b | 1   |
-         | 1 | c | 2   |
-         | 2 | d | 3   |
-         | 2 | e | 4   |
-         | 2 | f | 5   |
-         | 2 | g | 6   |
-         | 2 | h | 6   |
-         | 2 | i | 6   |
-         | 2 | j | 6   |
-         +---+---+-----+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+
+    | x | y | val |
+    +---+---+-----+
+    | 1 | a | 0   |
+    | 1 | b | 1   |
+    | 1 | c | 2   |
+    | 2 | d | 3   |
+    | 2 | e | 4   |
+    | 2 | f | 5   |
+    | 2 | g | 6   |
+    | 2 | h | 6   |
+    | 2 | i | 6   |
+    | 2 | j | 6   |
+    +---+---+-----+
+    ");
 }
 
 /// Basic user defined window function
@@ -98,22 +100,22 @@ async fn test_udwf() {
 
     let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                                     |
-         | 1 | b | 1   | 1                                                                                                                     |
-         | 1 | c | 2   | 1                                                                                                                     |
-         | 2 | d | 3   | 2                                                                                                                     |
-         | 2 | e | 4   | 2                                                                                                                     |
-         | 2 | f | 5   | 2                                                                                                                     |
-         | 2 | g | 6   | 2                                                                                                                     |
-         | 2 | h | 6   | 2                                                                                                                     |
-         | 2 | i | 6   | 2                                                                                                                     |
-         | 2 | j | 6   | 2                                                                                                                     |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 1                                                                                                                     |
+    | 1 | b | 1   | 1                                                                                                                     |
+    | 1 | c | 2   | 1                                                                                                                     |
+    | 2 | d | 3   | 2                                                                                                                     |
+    | 2 | e | 4   | 2                                                                                                                     |
+    | 2 | f | 5   | 2                                                                                                                     |
+    | 2 | g | 6   | 2                                                                                                                     |
+    | 2 | h | 6   | 2                                                                                                                     |
+    | 2 | i | 6   | 2                                                                                                                     |
+    | 2 | j | 6   | 2                                                                                                                     |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    ");
 
     // evaluated on two distinct batches
     assert_eq!(test_state.evaluate_all_called(), 2);
@@ -126,10 +128,12 @@ async fn test_deregister_udwf() -> Result<()> {
     OddCounter::register(&mut ctx, Arc::clone(&test_state));
 
     assert!(ctx.state().window_functions().contains_key("odd_counter"));
+    assert!(datafusion_execution::FunctionRegistry::udwfs(&ctx).contains("odd_counter"));
 
     ctx.deregister_udwf("odd_counter");
 
     assert!(!ctx.state().window_functions().contains_key("odd_counter"));
+    assert!(!datafusion_execution::FunctionRegistry::udwfs(&ctx).contains("odd_counter"));
 
     Ok(())
 }
@@ -143,22 +147,22 @@ async fn test_udwf_with_alias() {
         .await
         .unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                                     |
-         | 1 | b | 1   | 1                                                                                                                     |
-         | 1 | c | 2   | 1                                                                                                                     |
-         | 2 | d | 3   | 2                                                                                                                     |
-         | 2 | e | 4   | 2                                                                                                                     |
-         | 2 | f | 5   | 2                                                                                                                     |
-         | 2 | g | 6   | 2                                                                                                                     |
-         | 2 | h | 6   | 2                                                                                                                     |
-         | 2 | i | 6   | 2                                                                                                                     |
-         | 2 | j | 6   | 2                                                                                                                     |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------+
+    | x | y | val | odd_counter_alias(t.val) |
+    +---+---+-----+--------------------------+
+    | 1 | a | 0   | 1                        |
+    | 1 | b | 1   | 1                        |
+    | 1 | c | 2   | 1                        |
+    | 2 | d | 3   | 2                        |
+    | 2 | e | 4   | 2                        |
+    | 2 | f | 5   | 2                        |
+    | 2 | g | 6   | 2                        |
+    | 2 | h | 6   | 2                        |
+    | 2 | i | 6   | 2                        |
+    | 2 | j | 6   | 2                        |
+    +---+---+-----+--------------------------+
+    ");
 }
 
 /// Basic user defined window function with bounded window
@@ -170,22 +174,22 @@ async fn test_udwf_bounded_window_ignores_frame() {
     // Since the UDWF doesn't say it needs the window frame, the frame is ignored
     let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                            |
-         | 1 | b | 1   | 1                                                                                                            |
-         | 1 | c | 2   | 1                                                                                                            |
-         | 2 | d | 3   | 2                                                                                                            |
-         | 2 | e | 4   | 2                                                                                                            |
-         | 2 | f | 5   | 2                                                                                                            |
-         | 2 | g | 6   | 2                                                                                                            |
-         | 2 | h | 6   | 2                                                                                                            |
-         | 2 | i | 6   | 2                                                                                                            |
-         | 2 | j | 6   | 2                                                                                                            |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 1                                                                                                            |
+    | 1 | b | 1   | 1                                                                                                            |
+    | 1 | c | 2   | 1                                                                                                            |
+    | 2 | d | 3   | 2                                                                                                            |
+    | 2 | e | 4   | 2                                                                                                            |
+    | 2 | f | 5   | 2                                                                                                            |
+    | 2 | g | 6   | 2                                                                                                            |
+    | 2 | h | 6   | 2                                                                                                            |
+    | 2 | i | 6   | 2                                                                                                            |
+    | 2 | j | 6   | 2                                                                                                            |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    ");
 
     // evaluated on 2 distinct batches (when x=1 and x=2)
     assert_eq!(test_state.evaluate_called(), 0);
@@ -200,22 +204,22 @@ async fn test_udwf_bounded_window() {
 
     let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                            |
-         | 1 | b | 1   | 1                                                                                                            |
-         | 1 | c | 2   | 1                                                                                                            |
-         | 2 | d | 3   | 1                                                                                                            |
-         | 2 | e | 4   | 2                                                                                                            |
-         | 2 | f | 5   | 1                                                                                                            |
-         | 2 | g | 6   | 1                                                                                                            |
-         | 2 | h | 6   | 0                                                                                                            |
-         | 2 | i | 6   | 0                                                                                                            |
-         | 2 | j | 6   | 0                                                                                                            |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 1                                                                                                            |
+    | 1 | b | 1   | 1                                                                                                            |
+    | 1 | c | 2   | 1                                                                                                            |
+    | 2 | d | 3   | 1                                                                                                            |
+    | 2 | e | 4   | 2                                                                                                            |
+    | 2 | f | 5   | 1                                                                                                            |
+    | 2 | g | 6   | 1                                                                                                            |
+    | 2 | h | 6   | 0                                                                                                            |
+    | 2 | i | 6   | 0                                                                                                            |
+    | 2 | j | 6   | 0                                                                                                            |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    ");
 
     // Evaluate is called for each input rows
     assert_eq!(test_state.evaluate_called(), 10);
@@ -232,22 +236,22 @@ async fn test_stateful_udwf() {
 
     let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 0                                                                                                                     |
-         | 1 | b | 1   | 1                                                                                                                     |
-         | 1 | c | 2   | 1                                                                                                                     |
-         | 2 | d | 3   | 1                                                                                                                     |
-         | 2 | e | 4   | 1                                                                                                                     |
-         | 2 | f | 5   | 2                                                                                                                     |
-         | 2 | g | 6   | 2                                                                                                                     |
-         | 2 | h | 6   | 2                                                                                                                     |
-         | 2 | i | 6   | 2                                                                                                                     |
-         | 2 | j | 6   | 2                                                                                                                     |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 0                                                                                                                     |
+    | 1 | b | 1   | 1                                                                                                                     |
+    | 1 | c | 2   | 1                                                                                                                     |
+    | 2 | d | 3   | 1                                                                                                                     |
+    | 2 | e | 4   | 1                                                                                                                     |
+    | 2 | f | 5   | 2                                                                                                                     |
+    | 2 | g | 6   | 2                                                                                                                     |
+    | 2 | h | 6   | 2                                                                                                                     |
+    | 2 | i | 6   | 2                                                                                                                     |
+    | 2 | j | 6   | 2                                                                                                                     |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    ");
 
     assert_eq!(test_state.evaluate_called(), 10);
     assert_eq!(test_state.evaluate_all_called(), 0);
@@ -263,22 +267,22 @@ async fn test_stateful_udwf_bounded_window() {
 
     let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                            |
-         | 1 | b | 1   | 1                                                                                                            |
-         | 1 | c | 2   | 1                                                                                                            |
-         | 2 | d | 3   | 1                                                                                                            |
-         | 2 | e | 4   | 2                                                                                                            |
-         | 2 | f | 5   | 1                                                                                                            |
-         | 2 | g | 6   | 1                                                                                                            |
-         | 2 | h | 6   | 0                                                                                                            |
-         | 2 | i | 6   | 0                                                                                                            |
-         | 2 | j | 6   | 0                                                                                                            |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 1                                                                                                            |
+    | 1 | b | 1   | 1                                                                                                            |
+    | 1 | c | 2   | 1                                                                                                            |
+    | 2 | d | 3   | 1                                                                                                            |
+    | 2 | e | 4   | 2                                                                                                            |
+    | 2 | f | 5   | 1                                                                                                            |
+    | 2 | g | 6   | 1                                                                                                            |
+    | 2 | h | 6   | 0                                                                                                            |
+    | 2 | i | 6   | 0                                                                                                            |
+    | 2 | j | 6   | 0                                                                                                            |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    ");
 
     // Evaluate and update_state is called for each input row
     assert_eq!(test_state.evaluate_called(), 10);
@@ -293,22 +297,22 @@ async fn test_udwf_query_include_rank() {
 
     let actual = execute(&ctx, UNBOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 3                                                                                                                     |
-         | 1 | b | 1   | 2                                                                                                                     |
-         | 1 | c | 2   | 1                                                                                                                     |
-         | 2 | d | 3   | 7                                                                                                                     |
-         | 2 | e | 4   | 6                                                                                                                     |
-         | 2 | f | 5   | 5                                                                                                                     |
-         | 2 | g | 6   | 4                                                                                                                     |
-         | 2 | h | 6   | 3                                                                                                                     |
-         | 2 | i | 6   | 2                                                                                                                     |
-         | 2 | j | 6   | 1                                                                                                                     |
-         +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 3                                                                                                                     |
+    | 1 | b | 1   | 2                                                                                                                     |
+    | 1 | c | 2   | 1                                                                                                                     |
+    | 2 | d | 3   | 7                                                                                                                     |
+    | 2 | e | 4   | 6                                                                                                                     |
+    | 2 | f | 5   | 5                                                                                                                     |
+    | 2 | g | 6   | 4                                                                                                                     |
+    | 2 | h | 6   | 3                                                                                                                     |
+    | 2 | i | 6   | 2                                                                                                                     |
+    | 2 | j | 6   | 1                                                                                                                     |
+    +---+---+-----+-----------------------------------------------------------------------------------------------------------------------+
+    ");
 
     assert_eq!(test_state.evaluate_called(), 0);
     assert_eq!(test_state.evaluate_all_called(), 0);
@@ -324,22 +328,22 @@ async fn test_udwf_bounded_query_include_rank() {
 
     let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 3                                                                                                            |
-         | 1 | b | 1   | 2                                                                                                            |
-         | 1 | c | 2   | 1                                                                                                            |
-         | 2 | d | 3   | 7                                                                                                            |
-         | 2 | e | 4   | 6                                                                                                            |
-         | 2 | f | 5   | 5                                                                                                            |
-         | 2 | g | 6   | 4                                                                                                            |
-         | 2 | h | 6   | 3                                                                                                            |
-         | 2 | i | 6   | 2                                                                                                            |
-         | 2 | j | 6   | 1                                                                                                            |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 3                                                                                                            |
+    | 1 | b | 1   | 2                                                                                                            |
+    | 1 | c | 2   | 1                                                                                                            |
+    | 2 | d | 3   | 7                                                                                                            |
+    | 2 | e | 4   | 6                                                                                                            |
+    | 2 | f | 5   | 5                                                                                                            |
+    | 2 | g | 6   | 4                                                                                                            |
+    | 2 | h | 6   | 3                                                                                                            |
+    | 2 | i | 6   | 2                                                                                                            |
+    | 2 | j | 6   | 1                                                                                                            |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    ");
 
     assert_eq!(test_state.evaluate_called(), 0);
     assert_eq!(test_state.evaluate_all_called(), 0);
@@ -357,22 +361,22 @@ async fn test_udwf_bounded_window_returns_null() {
 
     let actual = execute(&ctx, BOUNDED_WINDOW_QUERY).await.unwrap();
 
-    insta::assert_snapshot!(batches_to_string(&actual), @r###"
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         | 1 | a | 0   | 1                                                                                                            |
-         | 1 | b | 1   | 1                                                                                                            |
-         | 1 | c | 2   | 1                                                                                                            |
-         | 2 | d | 3   | 1                                                                                                            |
-         | 2 | e | 4   | 2                                                                                                            |
-         | 2 | f | 5   | 1                                                                                                            |
-         | 2 | g | 6   | 1                                                                                                            |
-         | 2 | h | 6   |                                                                                                              |
-         | 2 | i | 6   |                                                                                                              |
-         | 2 | j | 6   |                                                                                                              |
-         +---+---+-----+--------------------------------------------------------------------------------------------------------------+
-         "###);
+    insta::assert_snapshot!(batches_to_string(&actual), @r"
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | x | y | val | odd_counter(t.val) PARTITION BY [t.x] ORDER BY [t.y ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    | 1 | a | 0   | 1                                                                                                            |
+    | 1 | b | 1   | 1                                                                                                            |
+    | 1 | c | 2   | 1                                                                                                            |
+    | 2 | d | 3   | 1                                                                                                            |
+    | 2 | e | 4   | 2                                                                                                            |
+    | 2 | f | 5   | 1                                                                                                            |
+    | 2 | g | 6   | 1                                                                                                            |
+    | 2 | h | 6   |                                                                                                              |
+    | 2 | i | 6   |                                                                                                              |
+    | 2 | j | 6   |                                                                                                              |
+    +---+---+-----+--------------------------------------------------------------------------------------------------------------+
+    ");
 
     // Evaluate is called for each input rows
     assert_eq!(test_state.evaluate_called(), 10);
@@ -522,20 +526,20 @@ impl OddCounter {
     }
 
     fn register(ctx: &mut SessionContext, test_state: Arc<TestState>) {
-        #[derive(Debug, Clone)]
+        #[derive(Debug, Clone, PartialEq, Eq, Hash)]
         struct SimpleWindowUDF {
             signature: Signature,
-            test_state: Arc<TestState>,
+            test_state: PtrEq<Arc<TestState>>,
             aliases: Vec<String>,
         }
 
         impl SimpleWindowUDF {
             fn new(test_state: Arc<TestState>) -> Self {
                 let signature =
-                    Signature::exact(vec![DataType::Float64], Volatility::Immutable);
+                    Signature::exact(vec![DataType::Int64], Volatility::Immutable);
                 Self {
                     signature,
-                    test_state,
+                    test_state: test_state.into(),
                     aliases: vec!["odd_counter_alias".to_string()],
                 }
             }
@@ -568,6 +572,10 @@ impl OddCounter {
             fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
                 Ok(Field::new(field_args.name(), DataType::Int64, true).into())
             }
+
+            fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+                LimitEffect::Unknown
+            }
         }
 
         ctx.register_udwf(WindowUDF::from(SimpleWindowUDF::new(test_state)))
@@ -607,7 +615,9 @@ impl PartitionEvaluator for OddCounter {
         ranks_in_partition: &[Range<usize>],
     ) -> Result<ArrayRef> {
         self.test_state.inc_evaluate_all_with_rank_called();
-        println!("evaluate_all_with_rank, values: {num_rows:#?}, ranks_in_partitions: {ranks_in_partition:?}");
+        println!(
+            "evaluate_all_with_rank, values: {num_rows:#?}, ranks_in_partitions: {ranks_in_partition:?}"
+        );
         // when evaluating with ranks, just return the inverse rank instead
         let array: Int64Array = ranks_in_partition
             .iter()
@@ -643,7 +653,7 @@ fn odd_count_arr(arr: &Int64Array, num_rows: usize) -> ArrayRef {
     Arc::new(array)
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct VariadicWindowUDF {
     signature: Signature,
 }
@@ -687,6 +697,10 @@ impl WindowUDFImpl for VariadicWindowUDF {
     fn field(&self, _: WindowUDFFieldArgs) -> Result<FieldRef> {
         unimplemented!("unnecessary for testing");
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 #[test]
@@ -765,6 +779,31 @@ struct MetadataBasedWindowUdf {
     metadata: HashMap<String, String>,
 }
 
+impl PartialEq for MetadataBasedWindowUdf {
+    fn eq(&self, other: &Self) -> bool {
+        let Self {
+            name,
+            signature,
+            metadata,
+        } = self;
+        name == &other.name
+            && signature == &other.signature
+            && metadata == &other.metadata
+    }
+}
+impl Eq for MetadataBasedWindowUdf {}
+impl Hash for MetadataBasedWindowUdf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
+            name,
+            signature,
+            metadata: _, // unhashable
+        } = self;
+        name.hash(state);
+        signature.hash(state);
+    }
+}
+
 impl MetadataBasedWindowUdf {
     fn new(metadata: HashMap<String, String>) -> Self {
         // The name we return must be unique. Otherwise we will not call distinct
@@ -815,6 +854,10 @@ impl WindowUDFImpl for MetadataBasedWindowUdf {
             .with_metadata(self.metadata.clone())
             .into())
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 #[derive(Debug)]
diff --git a/datafusion/datasource-arrow/Cargo.toml b/datafusion/datasource-arrow/Cargo.toml
new file mode 100644
index 0000000000000..fbadc8708ca69
--- /dev/null
+++ b/datafusion/datasource-arrow/Cargo.toml
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion-datasource-arrow"
+description = "datafusion-datasource-arrow"
+readme = "README.md"
+authors.workspace = true
+edition.workspace = true
+homepage.workspace = true
+license.workspace = true
+repository.workspace = true
+rust-version.workspace = true
+version.workspace = true
+
+[package.metadata.docs.rs]
+all-features = true
+
+[dependencies]
+arrow = { workspace = true }
+arrow-ipc = { workspace = true }
+async-trait = { workspace = true }
+bytes = { workspace = true }
+datafusion-common = { workspace = true, features = ["object_store"] }
+datafusion-common-runtime = { workspace = true }
+datafusion-datasource = { workspace = true }
+datafusion-execution = { workspace = true }
+datafusion-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
+datafusion-session = { workspace = true }
+futures = { workspace = true }
+itertools = { workspace = true }
+object_store = { workspace = true }
+tokio = { workspace = true }
+
+[dev-dependencies]
+chrono = { workspace = true }
+
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
+[lints]
+workspace = true
+
+[lib]
+name = "datafusion_datasource_arrow"
+path = "src/mod.rs"
+
+[features]
+compression = [
+    "arrow-ipc/zstd",
+]
diff --git a/datafusion/datasource-arrow/LICENSE.txt b/datafusion/datasource-arrow/LICENSE.txt
new file mode 100644
index 0000000000000..d74c6b599d2ae
--- /dev/null
+++ b/datafusion/datasource-arrow/LICENSE.txt
@@ -0,0 +1,212 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+This project includes code from Apache Aurora.
+
+* dev/release/{release,changelog,release-candidate} are based on the scripts from
+  Apache Aurora
+
+Copyright: 2016 The Apache Software Foundation.
+Home page: https://aurora.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/datafusion/datasource-arrow/NOTICE.txt b/datafusion/datasource-arrow/NOTICE.txt
new file mode 100644
index 0000000000000..0bd2d52368fea
--- /dev/null
+++ b/datafusion/datasource-arrow/NOTICE.txt
@@ -0,0 +1,5 @@
+Apache DataFusion
+Copyright 2019-2026 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
diff --git a/datafusion/datasource-arrow/README.md b/datafusion/datasource-arrow/README.md
new file mode 100644
index 0000000000000..9901b52105dd4
--- /dev/null
+++ b/datafusion/datasource-arrow/README.md
@@ -0,0 +1,34 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Arrow DataSource
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that defines a Arrow based file source.
+It works with files following the [Arrow IPC format].
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
+[arrow ipc format]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs
new file mode 100644
index 0000000000000..f60bce3249935
--- /dev/null
+++ b/datafusion/datasource-arrow/src/file_format.rs
@@ -0,0 +1,782 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ArrowFormat`]: Apache Arrow [`FileFormat`] abstractions
+//!
+//! Works with files following the [Arrow IPC format](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format)
+
+use std::any::Any;
+use std::collections::HashMap;
+use std::fmt::{self, Debug};
+use std::io::{Seek, SeekFrom};
+use std::sync::Arc;
+
+use arrow::datatypes::{Schema, SchemaRef};
+use arrow::error::ArrowError;
+use arrow::ipc::convert::fb_to_schema;
+use arrow::ipc::reader::{FileReader, StreamReader};
+use arrow::ipc::writer::IpcWriteOptions;
+use arrow::ipc::{CompressionType, root_as_message};
+use datafusion_common::error::Result;
+use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::{
+    DEFAULT_ARROW_EXTENSION, DataFusionError, GetExt, Statistics,
+    internal_datafusion_err, not_impl_err,
+};
+use datafusion_common_runtime::{JoinSet, SpawnedTask};
+use datafusion_datasource::TableSchema;
+use datafusion_datasource::display::FileGroupDisplay;
+use datafusion_datasource::file::FileSource;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::sink::{DataSink, DataSinkExec};
+use datafusion_datasource::write::{
+    ObjectWriterBuilder, SharedBuffer, get_writer_schema,
+};
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_expr::dml::InsertOp;
+use datafusion_physical_expr_common::sort_expr::LexRequirement;
+
+use crate::source::ArrowSource;
+use async_trait::async_trait;
+use bytes::Bytes;
+use datafusion_datasource::file_compression_type::FileCompressionType;
+use datafusion_datasource::file_format::{FileFormat, FileFormatFactory};
+use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
+use datafusion_datasource::source::DataSourceExec;
+use datafusion_datasource::write::demux::DemuxedStreamReceiver;
+use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
+use datafusion_session::Session;
+use futures::StreamExt;
+use futures::stream::BoxStream;
+use object_store::{
+    GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt,
+    path::Path,
+};
+use tokio::io::AsyncWriteExt;
+
+/// Initial writing buffer size. Note this is just a size hint for efficiency. It
+/// will grow beyond the set value if needed.
+const INITIAL_BUFFER_BYTES: usize = 1048576;
+
+/// If the buffered Arrow data exceeds this size, it is flushed to object store
+const BUFFER_FLUSH_BYTES: usize = 1024000;
+
+/// Factory struct used to create [`ArrowFormat`]
+#[derive(Default, Debug)]
+pub struct ArrowFormatFactory;
+
+impl ArrowFormatFactory {
+    /// Creates an instance of [ArrowFormatFactory]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl FileFormatFactory for ArrowFormatFactory {
+    fn create(
+        &self,
+        _state: &dyn Session,
+        _format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        Ok(Arc::new(ArrowFormat))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(ArrowFormat)
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+impl GetExt for ArrowFormatFactory {
+    fn get_ext(&self) -> String {
+        // Removes the dot, i.e. ".parquet" -> "parquet"
+        DEFAULT_ARROW_EXTENSION[1..].to_string()
+    }
+}
+
+/// Arrow [`FileFormat`] implementation.
+#[derive(Default, Debug)]
+pub struct ArrowFormat;
+
+#[async_trait]
+impl FileFormat for ArrowFormat {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn get_ext(&self) -> String {
+        ArrowFormatFactory::new().get_ext()
+    }
+
+    fn get_ext_with_compression(
+        &self,
+        file_compression_type: &FileCompressionType,
+    ) -> Result<String> {
+        let ext = self.get_ext();
+        match file_compression_type.get_variant() {
+            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
+            _ => Err(internal_datafusion_err!(
+                "Arrow FileFormat does not support compression."
+            )),
+        }
+    }
+
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        None
+    }
+
+    async fn infer_schema(
+        &self,
+        _state: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        objects: &[ObjectMeta],
+    ) -> Result<SchemaRef> {
+        let mut schemas = vec![];
+        for object in objects {
+            let r = store.as_ref().get(&object.location).await?;
+            let schema = match r.payload {
+                #[cfg(not(target_arch = "wasm32"))]
+                GetResultPayload::File(mut file, _) => {
+                    match FileReader::try_new(&mut file, None) {
+                        Ok(reader) => reader.schema(),
+                        Err(file_error) => {
+                            // not in the file format, but FileReader read some bytes
+                            // while trying to parse the file and so we need to rewind
+                            // it to the beginning of the file
+                            file.seek(SeekFrom::Start(0))?;
+                            match StreamReader::try_new(&mut file, None) {
+                                Ok(reader) => reader.schema(),
+                                Err(stream_error) => {
+                                    return Err(internal_datafusion_err!(
+                                        "Failed to parse Arrow file as either file format or stream format. File format error: {file_error}. Stream format error: {stream_error}"
+                                    ));
+                                }
+                            }
+                        }
+                    }
+                }
+                GetResultPayload::Stream(stream) => infer_stream_schema(stream).await?,
+            };
+            schemas.push(schema.as_ref().clone());
+        }
+        let merged_schema = Schema::try_merge(schemas)?;
+        Ok(Arc::new(merged_schema))
+    }
+
+    async fn infer_stats(
+        &self,
+        _state: &dyn Session,
+        _store: &Arc<dyn ObjectStore>,
+        table_schema: SchemaRef,
+        _object: &ObjectMeta,
+    ) -> Result<Statistics> {
+        Ok(Statistics::new_unknown(&table_schema))
+    }
+
+    async fn create_physical_plan(
+        &self,
+        state: &dyn Session,
+        conf: FileScanConfig,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let object_store = state.runtime_env().object_store(&conf.object_store_url)?;
+        let object_location = &conf
+            .file_groups
+            .first()
+            .ok_or_else(|| internal_datafusion_err!("No files found in file group"))?
+            .files()
+            .first()
+            .ok_or_else(|| internal_datafusion_err!("No files found in file group"))?
+            .object_meta
+            .location;
+
+        let table_schema = TableSchema::new(
+            Arc::clone(conf.file_schema()),
+            conf.table_partition_cols().clone(),
+        );
+
+        let mut source: Arc<dyn FileSource> =
+            match is_object_in_arrow_ipc_file_format(object_store, object_location).await
+            {
+                Ok(true) => Arc::new(ArrowSource::new_file_source(table_schema)),
+                Ok(false) => Arc::new(ArrowSource::new_stream_file_source(table_schema)),
+                Err(e) => Err(e)?,
+            };
+
+        // Preserve projection from the original file source
+        if let Some(projection) = conf.file_source.projection()
+            && let Some(new_source) = source.try_pushdown_projection(projection)?
+        {
+            source = new_source;
+        }
+
+        let config = FileScanConfigBuilder::from(conf)
+            .with_source(source)
+            .build();
+
+        Ok(DataSourceExec::from_data_source(config))
+    }
+
+    async fn create_writer_physical_plan(
+        &self,
+        input: Arc<dyn ExecutionPlan>,
+        _state: &dyn Session,
+        conf: FileSinkConfig,
+        order_requirements: Option<LexRequirement>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if conf.insert_op != InsertOp::Append {
+            return not_impl_err!("Overwrites are not implemented yet for Arrow format");
+        }
+
+        let sink = Arc::new(ArrowFileSink::new(conf));
+
+        Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
+    }
+
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        Arc::new(ArrowSource::new_file_source(table_schema))
+    }
+}
+
+/// Implements [`FileSink`] for Arrow IPC files
+struct ArrowFileSink {
+    config: FileSinkConfig,
+}
+
+impl ArrowFileSink {
+    fn new(config: FileSinkConfig) -> Self {
+        Self { config }
+    }
+}
+
+#[async_trait]
+impl FileSink for ArrowFileSink {
+    fn config(&self) -> &FileSinkConfig {
+        &self.config
+    }
+
+    async fn spawn_writer_tasks_and_join(
+        &self,
+        context: &Arc<TaskContext>,
+        demux_task: SpawnedTask<Result<()>>,
+        mut file_stream_rx: DemuxedStreamReceiver,
+        object_store: Arc<dyn ObjectStore>,
+    ) -> Result<u64> {
+        let mut file_write_tasks: JoinSet<std::result::Result<usize, DataFusionError>> =
+            JoinSet::new();
+
+        let ipc_options =
+            IpcWriteOptions::try_new(64, false, arrow_ipc::MetadataVersion::V5)?
+                .try_with_compression(Some(CompressionType::LZ4_FRAME))?;
+        while let Some((path, mut rx)) = file_stream_rx.recv().await {
+            let shared_buffer = SharedBuffer::new(INITIAL_BUFFER_BYTES);
+            let mut arrow_writer = arrow_ipc::writer::FileWriter::try_new_with_options(
+                shared_buffer.clone(),
+                &get_writer_schema(&self.config),
+                ipc_options.clone(),
+            )?;
+            let mut object_store_writer = ObjectWriterBuilder::new(
+                FileCompressionType::UNCOMPRESSED,
+                &path,
+                Arc::clone(&object_store),
+            )
+            .with_buffer_size(Some(
+                context
+                    .session_config()
+                    .options()
+                    .execution
+                    .objectstore_writer_buffer_size,
+            ))
+            .build()?;
+            file_write_tasks.spawn(async move {
+                let mut row_count = 0;
+                while let Some(batch) = rx.recv().await {
+                    row_count += batch.num_rows();
+                    arrow_writer.write(&batch)?;
+                    let mut buff_to_flush = shared_buffer.buffer.try_lock().unwrap();
+                    if buff_to_flush.len() > BUFFER_FLUSH_BYTES {
+                        object_store_writer
+                            .write_all(buff_to_flush.as_slice())
+                            .await?;
+                        buff_to_flush.clear();
+                    }
+                }
+                arrow_writer.finish()?;
+                let final_buff = shared_buffer.buffer.try_lock().unwrap();
+
+                object_store_writer.write_all(final_buff.as_slice()).await?;
+                object_store_writer.shutdown().await?;
+                Ok(row_count)
+            });
+        }
+
+        let mut row_count = 0;
+        while let Some(result) = file_write_tasks.join_next().await {
+            match result {
+                Ok(r) => {
+                    row_count += r?;
+                }
+                Err(e) => {
+                    if e.is_panic() {
+                        std::panic::resume_unwind(e.into_panic());
+                    } else {
+                        unreachable!();
+                    }
+                }
+            }
+        }
+
+        demux_task
+            .join_unwind()
+            .await
+            .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+        Ok(row_count as u64)
+    }
+}
+
+impl Debug for ArrowFileSink {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ArrowFileSink").finish()
+    }
+}
+
+impl DisplayAs for ArrowFileSink {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "ArrowFileSink(file_groups=",)?;
+                FileGroupDisplay(&self.config.file_group).fmt_as(t, f)?;
+                write!(f, ")")
+            }
+            DisplayFormatType::TreeRender => {
+                writeln!(f, "format: arrow")?;
+                write!(f, "file={}", &self.config.original_url)
+            }
+        }
+    }
+}
+
+#[async_trait]
+impl DataSink for ArrowFileSink {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> &SchemaRef {
+        self.config.output_schema()
+    }
+
+    async fn write_all(
+        &self,
+        data: SendableRecordBatchStream,
+        context: &Arc<TaskContext>,
+    ) -> Result<u64> {
+        FileSink::write_all(self, data, context).await
+    }
+}
+
+// Custom implementation of inferring schema. Should eventually be moved upstream to arrow-rs.
+// See <https://github.com/apache/arrow-rs/issues/5021>
+
+const ARROW_MAGIC: [u8; 6] = [b'A', b'R', b'R', b'O', b'W', b'1'];
+const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
+
+async fn infer_stream_schema(
+    mut stream: BoxStream<'static, object_store::Result<Bytes>>,
+) -> Result<SchemaRef> {
+    // IPC streaming format.
+    // See https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
+    //
+    //   <SCHEMA>
+    //   <DICTIONARY 0>
+    //   ...
+    //   <DICTIONARY k - 1>
+    //   <RECORD BATCH 0>
+    //   ...
+    //   <DICTIONARY x DELTA>
+    //   ...
+    //   <DICTIONARY y DELTA>
+    //   ...
+    //   <RECORD BATCH n - 1>
+    //   <EOS [optional]: 0xFFFFFFFF 0x00000000>
+
+    // The streaming format is made up of a sequence of encapsulated messages.
+    // See https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format
+    //
+    //   <continuation: 0xFFFFFFFF>  (added in v0.15.0)
+    //   <metadata_size: int32>
+    //   <metadata_flatbuffer: bytes>
+    //   <padding>
+    //   <message body>
+    //
+    // The first message is the schema.
+
+    // IPC file format is a wrapper around the streaming format with indexing information.
+    // See https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
+    //
+    //   <magic number "ARROW1">
+    //   <empty padding bytes [to 8 byte boundary]>
+    //   <STREAMING FORMAT with EOS>
+    //   <FOOTER>
+    //   <FOOTER SIZE: int32>
+    //   <magic number "ARROW1">
+
+    // For the purposes of this function, the arrow "preamble" is the magic number, padding,
+    // and the continuation marker. 16 bytes covers the preamble and metadata length
+    // no matter which version or format is used.
+    let bytes = extend_bytes_to_n_length_from_stream(vec![], 16, &mut stream).await?;
+
+    // The preamble length is everything before the metadata length
+    let preamble_len = if bytes[0..6] == ARROW_MAGIC {
+        // File format starts with magic number "ARROW1"
+        if bytes[8..12] == CONTINUATION_MARKER {
+            // Continuation marker was added in v0.15.0
+            12
+        } else {
+            // File format before v0.15.0
+            8
+        }
+    } else if bytes[0..4] == CONTINUATION_MARKER {
+        // Stream format after v0.15.0 starts with continuation marker
+        4
+    } else {
+        // Stream format before v0.15.0 does not have a preamble
+        0
+    };
+
+    let meta_len_bytes: [u8; 4] = bytes[preamble_len..preamble_len + 4]
+        .try_into()
+        .map_err(|err| {
+            ArrowError::ParseError(format!(
+                "Unable to read IPC message metadata length: {err:?}"
+            ))
+        })?;
+
+    let meta_len = i32::from_le_bytes([
+        meta_len_bytes[0],
+        meta_len_bytes[1],
+        meta_len_bytes[2],
+        meta_len_bytes[3],
+    ]);
+
+    if meta_len < 0 {
+        return Err(ArrowError::ParseError(
+            "IPC message metadata length is negative".to_string(),
+        )
+        .into());
+    }
+
+    let bytes = extend_bytes_to_n_length_from_stream(
+        bytes,
+        preamble_len + 4 + (meta_len as usize),
+        &mut stream,
+    )
+    .await?;
+
+    let message = root_as_message(&bytes[preamble_len + 4..]).map_err(|err| {
+        ArrowError::ParseError(format!("Unable to read IPC message metadata: {err:?}"))
+    })?;
+    let fb_schema = message.header_as_schema().ok_or_else(|| {
+        ArrowError::IpcError("Unable to read IPC message schema".to_string())
+    })?;
+    let schema = fb_to_schema(fb_schema);
+
+    Ok(Arc::new(schema))
+}
+
+async fn extend_bytes_to_n_length_from_stream(
+    bytes: Vec<u8>,
+    n: usize,
+    stream: &mut BoxStream<'static, object_store::Result<Bytes>>,
+) -> Result<Vec<u8>> {
+    if bytes.len() >= n {
+        return Ok(bytes);
+    }
+
+    let mut buf = bytes;
+
+    while let Some(b) = stream.next().await.transpose()? {
+        buf.extend_from_slice(&b);
+
+        if buf.len() >= n {
+            break;
+        }
+    }
+
+    if buf.len() < n {
+        return Err(ArrowError::ParseError(
+            "Unexpected end of byte stream for Arrow IPC file".to_string(),
+        )
+        .into());
+    }
+
+    Ok(buf)
+}
+
+async fn is_object_in_arrow_ipc_file_format(
+    store: Arc<dyn ObjectStore>,
+    object_location: &Path,
+) -> Result<bool> {
+    let get_opts = GetOptions {
+        range: Some(GetRange::Bounded(0..6)),
+        ..Default::default()
+    };
+    let bytes = store
+        .get_opts(object_location, get_opts)
+        .await?
+        .bytes()
+        .await?;
+    Ok(bytes.len() >= 6 && bytes[0..6] == ARROW_MAGIC)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use chrono::DateTime;
+    use datafusion_common::DFSchema;
+    use datafusion_common::config::TableOptions;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_execution::runtime_env::RuntimeEnv;
+    use datafusion_expr::execution_props::ExecutionProps;
+    use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF};
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use object_store::{chunked::ChunkedStore, memory::InMemory, path::Path};
+
+    struct MockSession {
+        config: SessionConfig,
+        runtime_env: Arc<RuntimeEnv>,
+    }
+
+    impl MockSession {
+        fn new() -> Self {
+            Self {
+                config: SessionConfig::new(),
+                runtime_env: Arc::new(RuntimeEnv::default()),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl Session for MockSession {
+        fn session_id(&self) -> &str {
+            unimplemented!()
+        }
+
+        fn config(&self) -> &SessionConfig {
+            &self.config
+        }
+
+        async fn create_physical_plan(
+            &self,
+            _logical_plan: &LogicalPlan,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            unimplemented!()
+        }
+
+        fn create_physical_expr(
+            &self,
+            _expr: Expr,
+            _df_schema: &DFSchema,
+        ) -> Result<Arc<dyn PhysicalExpr>> {
+            unimplemented!()
+        }
+
+        fn scalar_functions(&self) -> &HashMap<String, Arc<ScalarUDF>> {
+            unimplemented!()
+        }
+
+        fn aggregate_functions(&self) -> &HashMap<String, Arc<AggregateUDF>> {
+            unimplemented!()
+        }
+
+        fn window_functions(&self) -> &HashMap<String, Arc<WindowUDF>> {
+            unimplemented!()
+        }
+
+        fn runtime_env(&self) -> &Arc<RuntimeEnv> {
+            &self.runtime_env
+        }
+
+        fn execution_props(&self) -> &ExecutionProps {
+            unimplemented!()
+        }
+
+        fn as_any(&self) -> &dyn Any {
+            unimplemented!()
+        }
+
+        fn table_options(&self) -> &TableOptions {
+            unimplemented!()
+        }
+
+        fn table_options_mut(&mut self) -> &mut TableOptions {
+            unimplemented!()
+        }
+
+        fn task_ctx(&self) -> Arc<TaskContext> {
+            unimplemented!()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_infer_schema_stream() -> Result<()> {
+        for file in ["example.arrow", "example_stream.arrow"] {
+            let mut bytes = std::fs::read(format!("tests/data/{file}"))?;
+            bytes.truncate(bytes.len() - 20); // mangle end to show we don't need to read whole file
+            let location = Path::parse(file)?;
+            let in_memory_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+            in_memory_store.put(&location, bytes.into()).await?;
+
+            let state = MockSession::new();
+            let object_meta = ObjectMeta {
+                location,
+                last_modified: DateTime::default(),
+                size: u64::MAX,
+                e_tag: None,
+                version: None,
+            };
+
+            let arrow_format = ArrowFormat {};
+            let expected = vec!["f0: Int64", "f1: Utf8", "f2: Boolean"];
+
+            // Test chunk sizes where too small so we keep having to read more bytes
+            // And when large enough that first read contains all we need
+            for chunk_size in [7, 3000] {
+                let store =
+                    Arc::new(ChunkedStore::new(in_memory_store.clone(), chunk_size));
+                let inferred_schema = arrow_format
+                    .infer_schema(
+                        &state,
+                        &(store.clone() as Arc<dyn ObjectStore>),
+                        std::slice::from_ref(&object_meta),
+                    )
+                    .await?;
+                let actual_fields = inferred_schema
+                    .fields()
+                    .iter()
+                    .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+                    .collect::<Vec<_>>();
+                assert_eq!(expected, actual_fields);
+            }
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_infer_schema_short_stream() -> Result<()> {
+        for file in ["example.arrow", "example_stream.arrow"] {
+            let mut bytes = std::fs::read(format!("tests/data/{file}"))?;
+            bytes.truncate(20); // should cause error that file shorter than expected
+            let location = Path::parse(file)?;
+            let in_memory_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+            in_memory_store.put(&location, bytes.into()).await?;
+
+            let state = MockSession::new();
+            let object_meta = ObjectMeta {
+                location,
+                last_modified: DateTime::default(),
+                size: u64::MAX,
+                e_tag: None,
+                version: None,
+            };
+
+            let arrow_format = ArrowFormat {};
+
+            let store = Arc::new(ChunkedStore::new(in_memory_store.clone(), 7));
+            let err = arrow_format
+                .infer_schema(
+                    &state,
+                    &(store.clone() as Arc<dyn ObjectStore>),
+                    std::slice::from_ref(&object_meta),
+                )
+                .await;
+
+            assert!(err.is_err());
+            assert_eq!(
+                "Arrow error: Parser error: Unexpected end of byte stream for Arrow IPC file",
+                err.unwrap_err().to_string().lines().next().unwrap()
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_format_detection_file_format() -> Result<()> {
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test.arrow");
+
+        let file_bytes = std::fs::read("tests/data/example.arrow")?;
+        store.put(&path, file_bytes.into()).await?;
+
+        let is_file = is_object_in_arrow_ipc_file_format(store.clone(), &path).await?;
+        assert!(is_file, "Should detect file format");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_format_detection_stream_format() -> Result<()> {
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test_stream.arrow");
+
+        let stream_bytes = std::fs::read("tests/data/example_stream.arrow")?;
+        store.put(&path, stream_bytes.into()).await?;
+
+        let is_file = is_object_in_arrow_ipc_file_format(store.clone(), &path).await?;
+
+        assert!(!is_file, "Should detect stream format (not file)");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_format_detection_corrupted_file() -> Result<()> {
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("corrupted.arrow");
+
+        store
+            .put(&path, Bytes::from(vec![0x43, 0x4f, 0x52, 0x41]).into())
+            .await?;
+
+        let is_file = is_object_in_arrow_ipc_file_format(store.clone(), &path).await?;
+
+        assert!(
+            !is_file,
+            "Corrupted file should not be detected as file format"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_format_detection_empty_file() -> Result<()> {
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("empty.arrow");
+
+        store.put(&path, Bytes::new().into()).await?;
+
+        let result = is_object_in_arrow_ipc_file_format(store.clone(), &path).await;
+
+        // currently errors because it tries to read 0..6 from an empty file
+        assert!(result.is_err(), "Empty file should error");
+
+        Ok(())
+    }
+}
diff --git a/datafusion/datasource-arrow/src/mod.rs b/datafusion/datasource-arrow/src/mod.rs
new file mode 100644
index 0000000000000..4816a45942e5a
--- /dev/null
+++ b/datafusion/datasource-arrow/src/mod.rs
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
+// Make sure fast / cheap clones on Arc are explicit:
+// https://github.com/apache/datafusion/issues/11143
+#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+
+//! [`ArrowFormat`]: Apache Arrow file format abstractions
+
+pub mod file_format;
+pub mod source;
+
+pub use file_format::*;
diff --git a/datafusion/datasource-arrow/src/source.rs b/datafusion/datasource-arrow/src/source.rs
new file mode 100644
index 0000000000000..0226dadd3d950
--- /dev/null
+++ b/datafusion/datasource-arrow/src/source.rs
@@ -0,0 +1,665 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Execution plan for reading Arrow IPC files
+//!
+//! # Naming Note
+//!
+//! The naming in this module can be confusing:
+//! - `ArrowFileOpener` handles the Arrow IPC **file format**
+//!   (with footer, supports parallel reading)
+//! - `ArrowStreamFileOpener` handles the Arrow IPC **stream format**
+//!   (without footer, sequential only)
+//! - `ArrowSource` is the unified `FileSource` implementation that uses either opener
+//!   depending on the format specified at construction
+//!
+//! Despite the name "ArrowStreamFileOpener", it still reads from files - the "Stream"
+//! refers to the Arrow IPC stream format, not streaming I/O. Both formats can be stored
+//! in files on disk or object storage.
+
+use std::sync::Arc;
+use std::{any::Any, io::Cursor};
+
+use datafusion_datasource::{TableSchema, as_file_source};
+
+use arrow::buffer::Buffer;
+use arrow::ipc::reader::{FileDecoder, FileReader, StreamReader};
+use datafusion_common::error::Result;
+use datafusion_common::exec_datafusion_err;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_datasource::PartitionedFile;
+use datafusion_datasource::file::FileSource;
+use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_datasource::projection::{ProjectionOpener, SplitProjection};
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_physical_plan::projection::ProjectionExprs;
+
+use datafusion_datasource::file_stream::FileOpenFuture;
+use datafusion_datasource::file_stream::FileOpener;
+use futures::StreamExt;
+use itertools::Itertools;
+use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore, ObjectStoreExt};
+
+/// Enum indicating which Arrow IPC format to use
+#[derive(Clone, Copy, Debug)]
+enum ArrowFormat {
+    /// Arrow IPC file format (with footer, supports parallel reading)
+    File,
+    /// Arrow IPC stream format (without footer, sequential only)
+    Stream,
+}
+
+/// `FileOpener` for Arrow IPC stream format. Supports only sequential reading.
+pub(crate) struct ArrowStreamFileOpener {
+    object_store: Arc<dyn ObjectStore>,
+    projection: Option<Vec<usize>>,
+}
+
+impl FileOpener for ArrowStreamFileOpener {
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
+        if partitioned_file.range.is_some() {
+            return Err(exec_datafusion_err!(
+                "ArrowStreamFileOpener does not support range-based reading"
+            ));
+        }
+        let object_store = Arc::clone(&self.object_store);
+        let projection = self.projection.clone();
+
+        Ok(Box::pin(async move {
+            let r = object_store
+                .get(&partitioned_file.object_meta.location)
+                .await?;
+
+            let stream = match r.payload {
+                #[cfg(not(target_arch = "wasm32"))]
+                GetResultPayload::File(file, _) => futures::stream::iter(
+                    StreamReader::try_new(file.try_clone()?, projection.clone())?,
+                )
+                .map(|r| r.map_err(Into::into))
+                .boxed(),
+                GetResultPayload::Stream(_) => {
+                    let bytes = r.bytes().await?;
+                    let cursor = Cursor::new(bytes);
+                    futures::stream::iter(StreamReader::try_new(
+                        cursor,
+                        projection.clone(),
+                    )?)
+                    .map(|r| r.map_err(Into::into))
+                    .boxed()
+                }
+            };
+
+            Ok(stream)
+        }))
+    }
+}
+
+/// `FileOpener` for Arrow IPC file format. Supports range-based parallel reading.
+pub(crate) struct ArrowFileOpener {
+    object_store: Arc<dyn ObjectStore>,
+    projection: Option<Vec<usize>>,
+}
+
+impl FileOpener for ArrowFileOpener {
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
+        let object_store = Arc::clone(&self.object_store);
+        let projection = self.projection.clone();
+
+        Ok(Box::pin(async move {
+            let range = partitioned_file.range.clone();
+            match range {
+                None => {
+                    let r = object_store
+                        .get(&partitioned_file.object_meta.location)
+                        .await?;
+                    let stream = match r.payload {
+                        #[cfg(not(target_arch = "wasm32"))]
+                        GetResultPayload::File(file, _) => futures::stream::iter(
+                            FileReader::try_new(file.try_clone()?, projection.clone())?,
+                        )
+                        .map(|r| r.map_err(Into::into))
+                        .boxed(),
+                        GetResultPayload::Stream(_) => {
+                            let bytes = r.bytes().await?;
+                            let cursor = Cursor::new(bytes);
+                            futures::stream::iter(FileReader::try_new(
+                                cursor,
+                                projection.clone(),
+                            )?)
+                            .map(|r| r.map_err(Into::into))
+                            .boxed()
+                        }
+                    };
+
+                    Ok(stream)
+                }
+                Some(range) => {
+                    // range is not none, the file maybe split into multiple parts to scan in parallel
+                    // get footer_len firstly
+                    let get_option = GetOptions {
+                        range: Some(GetRange::Suffix(10)),
+                        ..Default::default()
+                    };
+                    let get_result = object_store
+                        .get_opts(&partitioned_file.object_meta.location, get_option)
+                        .await?;
+                    let footer_len_buf = get_result.bytes().await?;
+                    let footer_len = arrow_ipc::reader::read_footer_length(
+                        footer_len_buf[..].try_into().unwrap(),
+                    )?;
+                    // read footer according to footer_len
+                    let get_option = GetOptions {
+                        range: Some(GetRange::Suffix(10 + (footer_len as u64))),
+                        ..Default::default()
+                    };
+                    let get_result = object_store
+                        .get_opts(&partitioned_file.object_meta.location, get_option)
+                        .await?;
+                    let footer_buf = get_result.bytes().await?;
+                    let footer = arrow_ipc::root_as_footer(
+                        footer_buf[..footer_len].try_into().unwrap(),
+                    )
+                    .map_err(|err| {
+                        exec_datafusion_err!("Unable to get root as footer: {err:?}")
+                    })?;
+                    // build decoder according to footer & projection
+                    let schema =
+                        arrow_ipc::convert::fb_to_schema(footer.schema().unwrap());
+                    let mut decoder = FileDecoder::new(schema.into(), footer.version());
+                    if let Some(projection) = projection {
+                        decoder = decoder.with_projection(projection);
+                    }
+                    let dict_ranges = footer
+                        .dictionaries()
+                        .iter()
+                        .flatten()
+                        .map(|block| {
+                            let block_len =
+                                block.bodyLength() as u64 + block.metaDataLength() as u64;
+                            let block_offset = block.offset() as u64;
+                            block_offset..block_offset + block_len
+                        })
+                        .collect_vec();
+                    let dict_results = object_store
+                        .get_ranges(&partitioned_file.object_meta.location, &dict_ranges)
+                        .await?;
+                    for (dict_block, dict_result) in
+                        footer.dictionaries().iter().flatten().zip(dict_results)
+                    {
+                        decoder
+                            .read_dictionary(dict_block, &Buffer::from(dict_result))?;
+                    }
+
+                    // filter recordbatches according to range
+                    let recordbatches = footer
+                        .recordBatches()
+                        .iter()
+                        .flatten()
+                        .filter(|block| {
+                            let block_offset = block.offset() as u64;
+                            block_offset >= range.start as u64
+                                && block_offset < range.end as u64
+                        })
+                        .copied()
+                        .collect_vec();
+
+                    let recordbatch_ranges = recordbatches
+                        .iter()
+                        .map(|block| {
+                            let block_len =
+                                block.bodyLength() as u64 + block.metaDataLength() as u64;
+                            let block_offset = block.offset() as u64;
+                            block_offset..block_offset + block_len
+                        })
+                        .collect_vec();
+
+                    let recordbatch_results = object_store
+                        .get_ranges(
+                            &partitioned_file.object_meta.location,
+                            &recordbatch_ranges,
+                        )
+                        .await?;
+
+                    let stream = futures::stream::iter(
+                        recordbatches
+                            .into_iter()
+                            .zip(recordbatch_results)
+                            .filter_map(move |(block, data)| {
+                                decoder
+                                    .read_record_batch(&block, &Buffer::from(data))
+                                    .transpose()
+                            }),
+                    )
+                    .map(|r| r.map_err(Into::into))
+                    .boxed();
+
+                    Ok(stream)
+                }
+            }
+        }))
+    }
+}
+
+/// `FileSource` for both Arrow IPC file and stream formats
+#[derive(Clone)]
+pub struct ArrowSource {
+    format: ArrowFormat,
+    metrics: ExecutionPlanMetricsSet,
+    projection: SplitProjection,
+    table_schema: TableSchema,
+}
+
+impl ArrowSource {
+    /// Creates an [`ArrowSource`] for file format
+    pub fn new_file_source(table_schema: impl Into<TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        Self {
+            format: ArrowFormat::File,
+            metrics: ExecutionPlanMetricsSet::new(),
+            projection: SplitProjection::unprojected(&table_schema),
+            table_schema,
+        }
+    }
+
+    /// Creates an [`ArrowSource`] for stream format
+    pub fn new_stream_file_source(table_schema: impl Into<TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        Self {
+            format: ArrowFormat::Stream,
+            metrics: ExecutionPlanMetricsSet::new(),
+            projection: SplitProjection::unprojected(&table_schema),
+            table_schema,
+        }
+    }
+}
+
+impl FileSource for ArrowSource {
+    fn create_file_opener(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        _base_config: &FileScanConfig,
+        _partition: usize,
+    ) -> Result<Arc<dyn FileOpener>> {
+        let split_projection = self.projection.clone();
+
+        let opener: Arc<dyn FileOpener> = match self.format {
+            ArrowFormat::File => Arc::new(ArrowFileOpener {
+                object_store,
+                projection: Some(split_projection.file_indices.clone()),
+            }),
+            ArrowFormat::Stream => Arc::new(ArrowStreamFileOpener {
+                object_store,
+                projection: Some(split_projection.file_indices.clone()),
+            }),
+        };
+        ProjectionOpener::try_new(
+            split_projection,
+            opener,
+            self.table_schema.file_schema(),
+        )
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
+        Arc::new(Self { ..self.clone() })
+    }
+
+    fn metrics(&self) -> &ExecutionPlanMetricsSet {
+        &self.metrics
+    }
+
+    fn file_type(&self) -> &str {
+        match self.format {
+            ArrowFormat::File => "arrow",
+            ArrowFormat::Stream => "arrow_stream",
+        }
+    }
+
+    fn repartitioned(
+        &self,
+        target_partitions: usize,
+        repartition_file_min_size: usize,
+        output_ordering: Option<LexOrdering>,
+        config: &FileScanConfig,
+    ) -> Result<Option<FileScanConfig>> {
+        match self.format {
+            ArrowFormat::Stream => {
+                // The Arrow IPC stream format doesn't support range-based parallel reading
+                // because it lacks a footer with the information that would be needed to
+                // make range-based parallel reading practical. Without the data in the
+                // footer you would either need to read the the entire file and record the
+                // offsets of the record batches and dictionaries, essentially recreating
+                // the footer's contents, or else each partition would need to read the
+                // entire file up to the correct offset which is a lot of duplicate I/O.
+                // We're opting to avoid that entirely by only acting on a single partition
+                // and reading sequentially.
+                Ok(None)
+            }
+            ArrowFormat::File => {
+                // Use the default trait implementation logic for file format
+                use datafusion_datasource::file_groups::FileGroupPartitioner;
+
+                if config.file_compression_type.is_compressed() {
+                    return Ok(None);
+                }
+
+                let repartitioned_file_groups_option = FileGroupPartitioner::new()
+                    .with_target_partitions(target_partitions)
+                    .with_repartition_file_min_size(repartition_file_min_size)
+                    .with_preserve_order_within_groups(output_ordering.is_some())
+                    .repartition_file_groups(&config.file_groups);
+
+                if let Some(repartitioned_file_groups) = repartitioned_file_groups_option
+                {
+                    let mut source = config.clone();
+                    source.file_groups = repartitioned_file_groups;
+                    return Ok(Some(source));
+                }
+                Ok(None)
+            }
+        }
+    }
+
+    fn table_schema(&self) -> &TableSchema {
+        &self.table_schema
+    }
+
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        source.projection = SplitProjection::new(
+            self.table_schema().file_schema(),
+            &source.projection.source.try_merge(projection)?,
+        );
+        Ok(Some(Arc::new(source)))
+    }
+
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        Some(&self.projection.source)
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit projection expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for proj_expr in &self.projection.source {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
+}
+
+/// `FileOpener` wrapper for both Arrow IPC file and stream formats
+pub struct ArrowOpener {
+    pub inner: Arc<dyn FileOpener>,
+}
+
+impl FileOpener for ArrowOpener {
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
+        self.inner.open(partitioned_file)
+    }
+}
+
+impl ArrowOpener {
+    /// Creates a new [`ArrowOpener`]
+    pub fn new(inner: Arc<dyn FileOpener>) -> Self {
+        Self { inner }
+    }
+
+    pub fn new_file_opener(
+        object_store: Arc<dyn ObjectStore>,
+        projection: Option<Vec<usize>>,
+    ) -> Self {
+        Self {
+            inner: Arc::new(ArrowFileOpener {
+                object_store,
+                projection,
+            }),
+        }
+    }
+
+    pub fn new_stream_file_opener(
+        object_store: Arc<dyn ObjectStore>,
+        projection: Option<Vec<usize>>,
+    ) -> Self {
+        Self {
+            inner: Arc::new(ArrowStreamFileOpener {
+                object_store,
+                projection,
+            }),
+        }
+    }
+}
+
+impl From<ArrowSource> for Arc<dyn FileSource> {
+    fn from(source: ArrowSource) -> Self {
+        as_file_source(source)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{fs::File, io::Read};
+
+    use arrow::datatypes::{DataType, Field, Schema};
+    use arrow_ipc::reader::{FileReader, StreamReader};
+    use bytes::Bytes;
+    use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
+    use datafusion_execution::object_store::ObjectStoreUrl;
+    use object_store::memory::InMemory;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_file_opener_without_ranges() -> Result<()> {
+        for filename in ["example.arrow", "example_stream.arrow"] {
+            let path = format!("tests/data/{filename}");
+            let path_str = path.as_str();
+            let mut file = File::open(path_str)?;
+            let file_size = file.metadata()?.len();
+
+            let mut buffer = Vec::new();
+            file.read_to_end(&mut buffer)?;
+            let bytes = Bytes::from(buffer);
+
+            let object_store = Arc::new(InMemory::new());
+            let partitioned_file = PartitionedFile::new(filename, file_size);
+            object_store
+                .put(&partitioned_file.object_meta.location, bytes.into())
+                .await?;
+
+            let schema = match FileReader::try_new(File::open(path_str)?, None) {
+                Ok(reader) => reader.schema(),
+                Err(_) => StreamReader::try_new(File::open(path_str)?, None)?.schema(),
+            };
+
+            let source: Arc<dyn FileSource> = if filename.contains("stream") {
+                Arc::new(ArrowSource::new_stream_file_source(schema))
+            } else {
+                Arc::new(ArrowSource::new_file_source(schema))
+            };
+
+            let scan_config = FileScanConfigBuilder::new(
+                ObjectStoreUrl::local_filesystem(),
+                source.clone(),
+            )
+            .build();
+
+            let file_opener = source.create_file_opener(object_store, &scan_config, 0)?;
+            let mut stream = file_opener.open(partitioned_file)?.await?;
+
+            assert!(stream.next().await.is_some());
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_file_opener_with_ranges() -> Result<()> {
+        let filename = "example.arrow";
+        let path = format!("tests/data/{filename}");
+        let path_str = path.as_str();
+        let mut file = File::open(path_str)?;
+        let file_size = file.metadata()?.len();
+
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+        let bytes = Bytes::from(buffer);
+
+        let object_store = Arc::new(InMemory::new());
+        let partitioned_file = PartitionedFile::new_with_range(
+            filename.into(),
+            file_size,
+            0,
+            (file_size - 1) as i64,
+        );
+        object_store
+            .put(&partitioned_file.object_meta.location, bytes.into())
+            .await?;
+
+        let schema = FileReader::try_new(File::open(path_str)?, None)?.schema();
+
+        let source = Arc::new(ArrowSource::new_file_source(schema));
+
+        let scan_config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::local_filesystem(),
+            source.clone(),
+        )
+        .build();
+
+        let file_opener = source.create_file_opener(object_store, &scan_config, 0)?;
+        let mut stream = file_opener.open(partitioned_file)?.await?;
+
+        assert!(stream.next().await.is_some());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_stream_opener_errors_with_ranges() -> Result<()> {
+        let filename = "example_stream.arrow";
+        let path = format!("tests/data/{filename}");
+        let path_str = path.as_str();
+        let mut file = File::open(path_str)?;
+        let file_size = file.metadata()?.len();
+
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+        let bytes = Bytes::from(buffer);
+
+        let object_store = Arc::new(InMemory::new());
+        let partitioned_file = PartitionedFile::new_with_range(
+            filename.into(),
+            file_size,
+            0,
+            (file_size - 1) as i64,
+        );
+        object_store
+            .put(&partitioned_file.object_meta.location, bytes.into())
+            .await?;
+
+        let schema = StreamReader::try_new(File::open(path_str)?, None)?.schema();
+
+        let source = Arc::new(ArrowSource::new_stream_file_source(schema));
+
+        let scan_config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::local_filesystem(),
+            source.clone(),
+        )
+        .build();
+
+        let file_opener = source.create_file_opener(object_store, &scan_config, 0)?;
+        let result = file_opener.open(partitioned_file);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_arrow_stream_repartitioning_not_supported() -> Result<()> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("f0", DataType::Int64, false)]));
+        let source = ArrowSource::new_stream_file_source(schema);
+
+        let config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::local_filesystem(),
+            Arc::new(source.clone()) as Arc<dyn FileSource>,
+        )
+        .build();
+
+        for target_partitions in [2, 4, 8, 16] {
+            let result =
+                source.repartitioned(target_partitions, 1024 * 1024, None, &config)?;
+
+            assert!(
+                result.is_none(),
+                "Stream format should not support repartitioning with {target_partitions} partitions",
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_stream_opener_with_projection() -> Result<()> {
+        let filename = "example_stream.arrow";
+        let path = format!("tests/data/{filename}");
+        let path_str = path.as_str();
+        let mut file = File::open(path_str)?;
+        let file_size = file.metadata()?.len();
+
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+        let bytes = Bytes::from(buffer);
+
+        let object_store = Arc::new(InMemory::new());
+        let partitioned_file = PartitionedFile::new(filename, file_size);
+        object_store
+            .put(&partitioned_file.object_meta.location, bytes.into())
+            .await?;
+
+        let opener = ArrowStreamFileOpener {
+            object_store,
+            projection: Some(vec![0]), // just the first column
+        };
+
+        let mut stream = opener.open(partitioned_file)?.await?;
+
+        if let Some(batch) = stream.next().await {
+            let batch = batch?;
+            assert_eq!(
+                batch.num_columns(),
+                1,
+                "Projection should result in 1 column"
+            );
+        } else {
+            panic!("Expected at least one batch");
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/core/tests/data/example.arrow b/datafusion/datasource-arrow/tests/data/example.arrow
similarity index 100%
rename from datafusion/core/tests/data/example.arrow
rename to datafusion/datasource-arrow/tests/data/example.arrow
diff --git a/datafusion/datasource-arrow/tests/data/example_stream.arrow b/datafusion/datasource-arrow/tests/data/example_stream.arrow
new file mode 100644
index 0000000000000..dbe10596f3a9d
Binary files /dev/null and b/datafusion/datasource-arrow/tests/data/example_stream.arrow differ
diff --git a/datafusion/datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow b/datafusion/datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow
new file mode 100644
index 0000000000000..78e56749d7f0d
Binary files /dev/null and b/datafusion/datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow differ
diff --git a/datafusion/datasource-arrow/tests/data/example_stream_empty.arrow b/datafusion/datasource-arrow/tests/data/example_stream_empty.arrow
new file mode 100644
index 0000000000000..3fa48d7669d91
Binary files /dev/null and b/datafusion/datasource-arrow/tests/data/example_stream_empty.arrow differ
diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml
index 064f9f87ee9fe..c9299aeb101da 100644
--- a/datafusion/datasource-avro/Cargo.toml
+++ b/datafusion/datasource-avro/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-datasource-avro"
 description = "datafusion-datasource-avro"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -35,24 +35,21 @@ apache-avro = { workspace = true }
 arrow = { workspace = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
-chrono = { workspace = true }
-datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true, features = ["object_store", "avro"] }
 datafusion-datasource = { workspace = true }
-datafusion-execution = { workspace = true }
-datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
 futures = { workspace = true }
-num-traits = { version = "0.2" }
+num-traits = { workspace = true }
 object_store = { workspace = true }
-tokio = { workspace = true }
 
 [dev-dependencies]
-rstest = { workspace = true }
 serde_json = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
diff --git a/datafusion/datasource-avro/README.md b/datafusion/datasource-avro/README.md
index f8d7aebdcad18..e9b8affe60e36 100644
--- a/datafusion/datasource-avro/README.md
+++ b/datafusion/datasource-avro/README.md
@@ -17,10 +17,17 @@
   under the License.
 -->
 
-# DataFusion datasource
+# Apache DataFusion Avro DataSource
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate is a submodule of DataFusion that defines a Avro based file source.
+This crate is a submodule of DataFusion that defines an [Apache Avro] based file source.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[apache avro]: https://avro.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs
index 36553b36bc6ce..ea676a7611db9 100644
--- a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs
+++ b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs
@@ -19,24 +19,25 @@
 
 use apache_avro::schema::RecordSchema;
 use apache_avro::{
+    Error as AvroError, Reader as AvroReader,
+    error::Details as AvroErrorDetails,
     schema::{Schema as AvroSchema, SchemaKind},
     types::Value,
-    Error as AvroError, Reader as AvroReader,
 };
 use arrow::array::{
-    make_array, Array, ArrayBuilder, ArrayData, ArrayDataBuilder, ArrayRef,
-    BooleanBuilder, LargeStringArray, ListBuilder, NullArray, OffsetSizeTrait,
-    PrimitiveArray, StringArray, StringBuilder, StringDictionaryBuilder,
+    Array, ArrayBuilder, ArrayData, ArrayDataBuilder, ArrayRef, BooleanBuilder,
+    LargeStringArray, ListBuilder, NullArray, OffsetSizeTrait, PrimitiveArray,
+    StringArray, StringBuilder, StringDictionaryBuilder, make_array,
 };
 use arrow::array::{BinaryArray, FixedSizeBinaryArray, GenericListArray};
 use arrow::buffer::{Buffer, MutableBuffer};
 use arrow::datatypes::{
     ArrowDictionaryKeyType, ArrowNumericType, ArrowPrimitiveType, DataType, Date32Type,
-    Date64Type, Field, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
-    Int8Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Date64Type, Field, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type,
+    Int64Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
     Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
-    TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type,
-    UInt8Type,
+    TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type,
+    UInt64Type,
 };
 use arrow::datatypes::{Fields, SchemaRef};
 use arrow::error::ArrowError;
@@ -45,7 +46,7 @@ use arrow::error::Result as ArrowResult;
 use arrow::record_batch::RecordBatch;
 use arrow::util::bit_util;
 use datafusion_common::arrow_err;
-use datafusion_common::error::{DataFusionError, Result};
+use datafusion_common::error::Result;
 use num_traits::NumCast;
 use std::collections::BTreeMap;
 use std::io::Read;
@@ -102,16 +103,16 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                     )
                     .is_some();
                 let sub_schemas = us.variants();
-                if has_nullable && sub_schemas.len() == 2 {
-                    if let Some(sub_schema) =
+                if has_nullable
+                    && sub_schemas.len() == 2
+                    && let Some(sub_schema) =
                         sub_schemas.iter().find(|&s| !matches!(s, AvroSchema::Null))
-                    {
-                        Self::child_schema_lookup(
-                            parent_field_name,
-                            sub_schema,
-                            schema_lookup,
-                        )?;
-                    }
+                {
+                    Self::child_schema_lookup(
+                        parent_field_name,
+                        sub_schema,
+                        schema_lookup,
+                    )?;
                 }
             }
             AvroSchema::Record(RecordSchema { fields, lookup, .. }) => {
@@ -131,9 +132,8 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                 }
             }
             AvroSchema::Array(schema) => {
-                let sub_parent_field_name = format!("{parent_field_name}.element");
                 Self::child_schema_lookup(
-                    &sub_parent_field_name,
+                    parent_field_name,
                     &schema.items,
                     schema_lookup,
                 )?;
@@ -152,7 +152,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
             .map(|value| match value {
                 Ok(Value::Record(v)) => Ok(v),
                 Err(e) => Err(ArrowError::ParseError(format!(
-                    "Failed to parse avro value: {e:?}"
+                    "Failed to parse avro value: {e}"
                 ))),
                 other => Err(ArrowError::ParseError(format!(
                     "Row needs to be of type object, got: {other:?}"
@@ -280,7 +280,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                 self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
             }
             ref e => Err(SchemaError(format!(
-                "Data type is currently not supported for dictionaries in list : {e:?}"
+                "Data type is currently not supported for dictionaries in list : {e}"
             ))),
         }
     }
@@ -307,8 +307,8 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
             }
             e => {
                 return Err(SchemaError(format!(
-                    "Nested list data builder type is not supported: {e:?}"
-                )))
+                    "Nested list data builder type is not supported: {e}"
+                )));
             }
         };
 
@@ -372,8 +372,8 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                     }
                     e => {
                         return Err(SchemaError(format!(
-                            "Nested list data builder type is not supported: {e:?}"
-                        )))
+                            "Nested list data builder type is not supported: {e}"
+                        )));
                     }
                 }
             }
@@ -517,7 +517,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
             DataType::UInt32 => self.read_primitive_list_values::<UInt32Type>(rows),
             DataType::UInt64 => self.read_primitive_list_values::<UInt64Type>(rows),
             DataType::Float16 => {
-                return Err(SchemaError("Float16 not supported".to_string()))
+                return Err(SchemaError("Float16 not supported".to_string()));
             }
             DataType::Float32 => self.read_primitive_list_values::<Float32Type>(rows),
             DataType::Float64 => self.read_primitive_list_values::<Float64Type>(rows),
@@ -528,7 +528,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
             | DataType::Time64(_) => {
                 return Err(SchemaError(
                     "Temporal types are not yet supported, see ARROW-4803".to_string(),
-                ))
+                ));
             }
             DataType::Utf8 => flatten_string_values(rows)
                 .into_iter()
@@ -595,10 +595,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                     })
                     .collect();
 
-                let sub_parent_field_name =
-                    format!("{}.{}", parent_field_name, list_field.name());
-                let arrays =
-                    self.build_struct_array(&rows, &sub_parent_field_name, fields)?;
+                let arrays = self.build_struct_array(&rows, parent_field_name, fields)?;
                 let data_type = DataType::Struct(fields.clone());
                 ArrayDataBuilder::new(data_type)
                     .len(rows.len())
@@ -609,7 +606,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
             }
             datatype => {
                 return Err(SchemaError(format!(
-                    "Nested list of {datatype:?} not supported"
+                    "Nested list of {datatype} not supported"
                 )));
             }
         };
@@ -718,7 +715,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                         t => {
                             return Err(SchemaError(format!(
                                 "TimeUnit {t:?} not supported with Time64"
-                            )))
+                            )));
                         }
                     },
                     DataType::Time32(unit) => match unit {
@@ -732,7 +729,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                         t => {
                             return Err(SchemaError(format!(
                                 "TimeUnit {t:?} not supported with Time32"
-                            )))
+                            )));
                         }
                     },
                     DataType::Utf8 | DataType::LargeUtf8 => Arc::new(
@@ -756,7 +753,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                             .collect::<BinaryArray>(),
                     )
                         as ArrayRef,
-                    DataType::FixedSizeBinary(ref size) => {
+                    DataType::FixedSizeBinary(size) => {
                         Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(
                             rows.iter().map(|row| {
                                 let maybe_value = self.field_lookup(&field_path, row);
@@ -765,9 +762,9 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                             *size,
                         )?) as ArrayRef
                     }
-                    DataType::List(ref list_field) => {
+                    DataType::List(list_field) => {
                         match list_field.data_type() {
-                            DataType::Dictionary(ref key_ty, _) => {
+                            DataType::Dictionary(key_ty, _) => {
                                 self.build_wrapped_list_array(rows, &field_path, key_ty)?
                             }
                             _ => {
@@ -787,7 +784,7 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                             }
                         }
                     }
-                    DataType::Dictionary(ref key_ty, ref val_ty) => self
+                    DataType::Dictionary(key_ty, val_ty) => self
                         .build_string_dictionary_array(
                             rows,
                             &field_path,
@@ -830,9 +827,9 @@ impl<R: Read> AvroArrowArrayReader<'_, R> {
                     }
                     _ => {
                         return Err(SchemaError(format!(
-                            "type {:?} not supported",
+                            "type {} not supported",
                             field.data_type()
-                        )))
+                        )));
                     }
                 };
                 Ok(arr)
@@ -929,13 +926,13 @@ fn resolve_string(v: &Value) -> ArrowResult<Option<String>> {
     match v {
         Value::String(s) => Ok(Some(s.clone())),
         Value::Bytes(bytes) => String::from_utf8(bytes.to_vec())
-            .map_err(AvroError::ConvertToUtf8)
+            .map_err(|e| AvroError::new(AvroErrorDetails::ConvertToUtf8(e)))
             .map(Some),
         Value::Enum(_, s) => Ok(Some(s.clone())),
         Value::Null => Ok(None),
-        other => Err(AvroError::GetString(other.into())),
+        other => Err(AvroError::new(AvroErrorDetails::GetString(other.clone()))),
     }
-    .map_err(|e| SchemaError(format!("expected resolvable string : {e:?}")))
+    .map_err(|e| SchemaError(format!("expected resolvable string : {e}")))
 }
 
 fn resolve_u8(v: &Value) -> Option<u8> {
@@ -1037,7 +1034,7 @@ where
 mod test {
     use crate::avro_to_arrow::{Reader, ReaderBuilder};
     use arrow::array::Array;
-    use arrow::datatypes::DataType;
+    use arrow::datatypes::{DataType, Fields};
     use arrow::datatypes::{Field, TimeUnit};
     use datafusion_common::assert_batches_eq;
     use datafusion_common::cast::{
@@ -1046,7 +1043,7 @@ mod test {
     use std::fs::File;
     use std::sync::Arc;
 
-    fn build_reader(name: &str, batch_size: usize) -> Reader<File> {
+    fn build_reader(name: &'_ str, batch_size: usize) -> Reader<'_, File> {
         let testdata = datafusion_common::test_util::arrow_test_data();
         let filename = format!("{testdata}/avro/{name}");
         let builder = ReaderBuilder::new()
@@ -1719,4 +1716,92 @@ mod test {
         assert_eq!(2, num_batches);
         assert_eq!(28, sum_id);
     }
+
+    #[test]
+    fn test_list_of_structs_with_custom_field_name() {
+        let schema = apache_avro::Schema::parse_str(
+            r#"
+        {
+          "type": "record",
+          "name": "root",
+          "fields": [
+            {
+              "name": "items",
+              "type": {
+                "type": "array",
+                "items": {
+                  "type": "record",
+                  "name": "item_record",
+                  "fields": [
+                    {
+                      "name": "id",
+                      "type": "long"
+                    },
+                    {
+                      "name": "name",
+                      "type": "string"
+                    }
+                  ]
+                }
+              }
+            }
+          ]
+        }"#,
+        )
+        .unwrap();
+
+        let r1 = apache_avro::to_value(serde_json::json!({
+            "items": [
+                {
+                    "id": 1,
+                    "name": "first"
+                },
+                {
+                    "id": 2,
+                    "name": "second"
+                }
+            ]
+        }))
+        .unwrap()
+        .resolve(&schema)
+        .unwrap();
+
+        let mut w = apache_avro::Writer::new(&schema, vec![]);
+        w.append(r1).unwrap();
+        let bytes = w.into_inner().unwrap();
+
+        // Create an Arrow schema where the list field is NOT named "element"
+        let arrow_schema = Arc::new(arrow::datatypes::Schema::new(vec![Field::new(
+            "items",
+            DataType::List(Arc::new(Field::new(
+                "item", // This is NOT "element"
+                DataType::Struct(Fields::from(vec![
+                    Field::new("id", DataType::Int64, false),
+                    Field::new("name", DataType::Utf8, false),
+                ])),
+                false,
+            ))),
+            false,
+        )]));
+
+        let mut reader = ReaderBuilder::new()
+            .with_schema(arrow_schema)
+            .with_batch_size(10)
+            .build(std::io::Cursor::new(bytes))
+            .unwrap();
+
+        // This used to fail because schema_lookup would have "items.element.id" and "items.element.name"
+        // but build_struct_array will try to look up "items.item.id" and "items.item.name",
+        // Now it it is simply "items.id" and "items.name"
+        let batch = reader.next().unwrap().unwrap();
+
+        let expected = [
+            "+-----------------------------------------------+",
+            "| items                                         |",
+            "+-----------------------------------------------+",
+            "| [{id: 1, name: first}, {id: 2, name: second}] |",
+            "+-----------------------------------------------+",
+        ];
+        assert_batches_eq!(expected, &[batch]);
+    }
 }
diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs
index 7f5900605a060..bd96b47aea9e6 100644
--- a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs
+++ b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs
@@ -64,13 +64,9 @@ impl ReaderBuilder {
     ///     let file = File::open("test/data/basic.avro").unwrap();
     ///
     ///     // create a builder, inferring the schema with the first 100 records
-    ///     let builder = ReaderBuilder::new()
-    ///       .read_schema()
-    ///       .with_batch_size(100);
+    ///     let builder = ReaderBuilder::new().read_schema().with_batch_size(100);
     ///
-    ///     let reader = builder
-    ///       .build::<File>(file)
-    ///       .unwrap();
+    ///     let reader = builder.build::<File>(file).unwrap();
     ///
     ///     reader
     /// }
@@ -117,7 +113,7 @@ impl ReaderBuilder {
             None => Arc::new(super::read_avro_schema_from_reader(&mut source)?),
         };
         source.rewind()?;
-        Reader::try_new(source, schema, self.batch_size, self.projection)
+        Reader::try_new(source, &schema, self.batch_size, self.projection.as_ref())
     }
 }
 
@@ -139,12 +135,12 @@ impl<R: Read> Reader<'_, R> {
     /// useful if plucking values from a struct, e.g. getting `a.b.c.e` from `a.b.c.{d, e}`.
     pub fn try_new(
         reader: R,
-        schema: SchemaRef,
+        schema: &SchemaRef,
         batch_size: usize,
-        projection: Option<Vec<String>>,
+        projection: Option<&Vec<String>>,
     ) -> Result<Self> {
         let projected_schema = projection.as_ref().filter(|p| !p.is_empty()).map_or_else(
-            || Arc::clone(&schema),
+            || Arc::clone(schema),
             |proj| {
                 Arc::new(arrow::datatypes::Schema::new(
                     proj.iter()
@@ -195,7 +191,7 @@ mod tests {
     use arrow::datatypes::{DataType, Field};
     use std::fs::File;
 
-    fn build_reader(name: &str, projection: Option<Vec<String>>) -> Reader<File> {
+    fn build_reader(name: &'_ str, projection: Option<Vec<String>>) -> Reader<'_, File> {
         let testdata = datafusion_common::test_util::arrow_test_data();
         let filename = format!("{testdata}/avro/{name}");
         let mut builder = ReaderBuilder::new().read_schema().with_batch_size(64);
diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
index f53d38e51d1fe..053be3c9aff94 100644
--- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
+++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
@@ -15,11 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use apache_avro::Schema as AvroSchema;
 use apache_avro::schema::{
     Alias, DecimalSchema, EnumSchema, FixedSchema, Name, RecordSchema,
 };
 use apache_avro::types::Value;
-use apache_avro::Schema as AvroSchema;
 use arrow::datatypes::{DataType, IntervalUnit, Schema, TimeUnit, UnionMode};
 use arrow::datatypes::{Field, UnionFields};
 use datafusion_common::error::Result;
@@ -107,15 +107,18 @@ fn schema_to_field_with_props(
                         .data_type()
                         .clone()
                 } else {
-                    return Err(apache_avro::Error::GetUnionDuplicate.into());
+                    return Err(apache_avro::Error::new(
+                        apache_avro::error::Details::GetUnionDuplicate,
+                    )
+                    .into());
                 }
             } else {
                 let fields = sub_schemas
                     .iter()
                     .map(|s| schema_to_field_with_props(s, None, has_nullable, None))
                     .collect::<Result<Vec<Field>>>()?;
-                let type_ids = 0_i8..fields.len() as i8;
-                DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense)
+                // Assign type_ids based on the order in which they appear
+                DataType::Union(UnionFields::from_fields(fields), UnionMode::Dense)
             }
         }
         AvroSchema::Record(RecordSchema { fields, .. }) => {
@@ -235,6 +238,8 @@ fn default_field_name(dt: &DataType) -> &str {
         | DataType::LargeListView(_) => {
             unimplemented!("View support not implemented")
         }
+        DataType::Decimal32(_, _) => "decimal",
+        DataType::Decimal64(_, _) => "decimal",
         DataType::Decimal128(_, _) => "decimal",
         DataType::Decimal256(_, _) => "decimal",
     }
@@ -243,15 +248,9 @@ fn default_field_name(dt: &DataType) -> &str {
 fn external_props(schema: &AvroSchema) -> HashMap<String, String> {
     let mut props = HashMap::new();
     match &schema {
-        AvroSchema::Record(RecordSchema {
-            doc: Some(ref doc), ..
-        })
-        | AvroSchema::Enum(EnumSchema {
-            doc: Some(ref doc), ..
-        })
-        | AvroSchema::Fixed(FixedSchema {
-            doc: Some(ref doc), ..
-        }) => {
+        AvroSchema::Record(RecordSchema { doc: Some(doc), .. })
+        | AvroSchema::Enum(EnumSchema { doc: Some(doc), .. })
+        | AvroSchema::Fixed(FixedSchema { doc: Some(doc), .. }) => {
             props.insert("avro::doc".to_string(), doc.clone());
         }
         _ => {}
@@ -307,8 +306,8 @@ pub fn aliased(
 #[cfg(test)]
 mod test {
     use super::{aliased, external_props, to_arrow_schema};
-    use apache_avro::schema::{Alias, EnumSchema, FixedSchema, Name, RecordSchema};
     use apache_avro::Schema as AvroSchema;
+    use apache_avro::schema::{Alias, EnumSchema, FixedSchema, Name, RecordSchema};
     use arrow::datatypes::DataType::{Binary, Float32, Float64, Timestamp, Utf8};
     use arrow::datatypes::DataType::{Boolean, Int32, Int64};
     use arrow::datatypes::TimeUnit::Microsecond;
diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs
index 47f8d9daca0ad..c4960dbcc99bb 100644
--- a/datafusion/datasource-avro/src/file_format.rs
+++ b/datafusion/datasource-avro/src/file_format.rs
@@ -27,21 +27,21 @@ use crate::source::AvroSource;
 
 use arrow::datatypes::Schema;
 use arrow::datatypes::SchemaRef;
+use datafusion_common::DEFAULT_AVRO_EXTENSION;
+use datafusion_common::GetExt;
 use datafusion_common::internal_err;
 use datafusion_common::parsers::CompressionTypeVariant;
-use datafusion_common::GetExt;
-use datafusion_common::DEFAULT_AVRO_EXTENSION;
 use datafusion_common::{Result, Statistics};
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_compression_type::FileCompressionType;
 use datafusion_datasource::file_format::{FileFormat, FileFormatFactory};
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_datasource::source::DataSourceExec;
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_session::Session;
 
 use async_trait::async_trait;
-use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
+use object_store::{GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt};
 
 #[derive(Default)]
 /// Factory struct used to create [`AvroFormat`]
@@ -110,6 +110,10 @@ impl FileFormat for AvroFormat {
         }
     }
 
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        None
+    }
+
     async fn infer_schema(
         &self,
         _state: &dyn Session,
@@ -150,13 +154,13 @@ impl FileFormat for AvroFormat {
         _state: &dyn Session,
         conf: FileScanConfig,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let config = FileScanConfigBuilder::from(conf)
-            .with_source(self.file_source())
-            .build();
-        Ok(DataSourceExec::from_data_source(config))
+        Ok(DataSourceExec::from_data_source(conf))
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(AvroSource::new())
+    fn file_source(
+        &self,
+        table_schema: datafusion_datasource::TableSchema,
+    ) -> Arc<dyn FileSource> {
+        Arc::new(AvroSource::new(table_schema))
     }
 }
diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs
index 71996f3f0eaa2..5ad209591e380 100644
--- a/datafusion/datasource-avro/src/mod.rs
+++ b/datafusion/datasource-avro/src/mod.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! An [Avro](https://avro.apache.org/) based [`FileSource`](datafusion_datasource::file::FileSource) implementation and related functionality.
 
@@ -30,4 +31,5 @@ pub mod avro_to_arrow;
 pub mod file_format;
 pub mod source;
 
+pub use apache_avro;
 pub use file_format::*;
diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs
index 2fdf34b3cc56d..525d72e620fc6 100644
--- a/datafusion/datasource-avro/src/source.rs
+++ b/datafusion/datasource-avro/src/source.rs
@@ -22,42 +22,54 @@ use std::sync::Arc;
 
 use crate::avro_to_arrow::Reader as AvroReader;
 
-use arrow::datatypes::SchemaRef;
 use datafusion_common::error::Result;
-use datafusion_common::Statistics;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_datasource::file_stream::FileOpener;
-use datafusion_datasource::impl_schema_adapter_methods;
-use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use datafusion_datasource::projection::{ProjectionOpener, SplitProjection};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_physical_plan::projection::ProjectionExprs;
 
 use object_store::ObjectStore;
 
 /// AvroSource holds the extra configuration that is necessary for opening avro files
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub struct AvroSource {
-    schema: Option<SchemaRef>,
+    table_schema: TableSchema,
     batch_size: Option<usize>,
-    projection: Option<Vec<String>>,
+    projection: SplitProjection,
     metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
 }
 
 impl AvroSource {
-    /// Initialize an AvroSource with default values
-    pub fn new() -> Self {
-        Self::default()
+    /// Initialize an AvroSource with the provided schema
+    pub fn new(table_schema: impl Into<TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        Self {
+            projection: SplitProjection::unprojected(&table_schema),
+            table_schema,
+            batch_size: None,
+            metrics: ExecutionPlanMetricsSet::new(),
+        }
     }
 
     fn open<R: std::io::Read>(&self, reader: R) -> Result<AvroReader<'static, R>> {
+        let file_schema = self.table_schema.file_schema();
+        let projection = Some(
+            self.projection
+                .file_indices
+                .iter()
+                .map(|&idx| file_schema.field(idx).name().clone())
+                .collect::<Vec<_>>(),
+        );
         AvroReader::try_new(
             reader,
-            Arc::clone(self.schema.as_ref().expect("Schema must set before open")),
+            &Arc::clone(self.table_schema.file_schema()),
             self.batch_size.expect("Batch size must set before open"),
-            self.projection.clone(),
+            projection.as_ref(),
         )
     }
 }
@@ -68,51 +80,53 @@ impl FileSource for AvroSource {
         object_store: Arc<dyn ObjectStore>,
         _base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(private::AvroOpener {
+    ) -> Result<Arc<dyn FileOpener>> {
+        let mut opener = Arc::new(private::AvroOpener {
             config: Arc::new(self.clone()),
             object_store,
-        })
+        }) as Arc<dyn FileOpener>;
+        opener = ProjectionOpener::try_new(
+            self.projection.clone(),
+            Arc::clone(&opener),
+            self.table_schema.file_schema(),
+        )?;
+        Ok(opener)
     }
 
     fn as_any(&self) -> &dyn Any {
         self
     }
 
+    fn table_schema(&self) -> &TableSchema {
+        &self.table_schema
+    }
+
     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
         let mut conf = self.clone();
         conf.batch_size = Some(batch_size);
         Arc::new(conf)
     }
 
-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.schema = Some(schema);
-        Arc::new(conf)
-    }
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
-        Arc::new(conf)
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        let new_projection = self.projection.source.try_merge(projection)?;
+        let split_projection =
+            SplitProjection::new(self.table_schema.file_schema(), &new_projection);
+        source.projection = split_projection;
+        Ok(Some(Arc::new(source)))
     }
 
-    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projection = config.projected_file_column_names();
-        Arc::new(conf)
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        Some(&self.projection.source)
     }
 
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        Ok(statistics
-            .clone()
-            .expect("projected_statistics must be set"))
-    }
-
     fn file_type(&self) -> &str {
         "avro"
     }
@@ -127,16 +141,28 @@ impl FileSource for AvroSource {
         Ok(None)
     }
 
-    impl_schema_adapter_methods!();
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit projection expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for proj_expr in &self.projection.source {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
 }
 
 mod private {
     use super::*;
 
     use bytes::Buf;
-    use datafusion_datasource::{file_meta::FileMeta, file_stream::FileOpenFuture};
+    use datafusion_datasource::{PartitionedFile, file_stream::FileOpenFuture};
     use futures::StreamExt;
-    use object_store::{GetResultPayload, ObjectStore};
+    use object_store::{GetResultPayload, ObjectStore, ObjectStoreExt};
 
     pub struct AvroOpener {
         pub config: Arc<AvroSource>,
@@ -144,20 +170,26 @@ mod private {
     }
 
     impl FileOpener for AvroOpener {
-        fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture> {
+        fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
             let config = Arc::clone(&self.config);
             let object_store = Arc::clone(&self.object_store);
             Ok(Box::pin(async move {
-                let r = object_store.get(file_meta.location()).await?;
+                let r = object_store
+                    .get(&partitioned_file.object_meta.location)
+                    .await?;
                 match r.payload {
                     GetResultPayload::File(file, _) => {
                         let reader = config.open(file)?;
-                        Ok(futures::stream::iter(reader).boxed())
+                        Ok(futures::stream::iter(reader)
+                            .map(|r| r.map_err(Into::into))
+                            .boxed())
                     }
                     GetResultPayload::Stream(_) => {
                         let bytes = r.bytes().await?;
                         let reader = config.open(bytes.reader())?;
-                        Ok(futures::stream::iter(reader).boxed())
+                        Ok(futures::stream::iter(reader)
+                            .map(|r| r.map_err(Into::into))
+                            .boxed())
                     }
                 }
             }))
diff --git a/datafusion/datasource-csv/Cargo.toml b/datafusion/datasource-csv/Cargo.toml
index c9e4649bdc25d..295092512742b 100644
--- a/datafusion/datasource-csv/Cargo.toml
+++ b/datafusion/datasource-csv/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-datasource-csv"
 description = "datafusion-datasource-csv"
+readme = "README.md"
 authors.workspace = true
-edition.workspace = true
+edition = { workspace = true }
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -34,13 +34,11 @@ all-features = true
 arrow = { workspace = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
-datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true, features = ["object_store"] }
 datafusion-common-runtime = { workspace = true }
 datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
@@ -49,6 +47,9 @@ object_store = { workspace = true }
 regex = { workspace = true }
 tokio = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
diff --git a/datafusion/datasource-csv/README.md b/datafusion/datasource-csv/README.md
index c5944f9e438fa..8bdadd0fe2c13 100644
--- a/datafusion/datasource-csv/README.md
+++ b/datafusion/datasource-csv/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion datasource
+# Apache DataFusion CSV DataSource
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that defines a CSV based file source.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs
index c9cd09bf676b7..7a253d81db9f8 100644
--- a/datafusion/datasource-csv/src/file_format.rs
+++ b/datafusion/datasource-csv/src/file_format.rs
@@ -31,23 +31,24 @@ use arrow::error::ArrowError;
 use datafusion_common::config::{ConfigField, ConfigFileType, CsvOptions};
 use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::{
-    exec_err, not_impl_err, DataFusionError, GetExt, Result, Statistics,
-    DEFAULT_CSV_EXTENSION,
+    DEFAULT_CSV_EXTENSION, DataFusionError, GetExt, Result, Statistics, exec_err,
+    not_impl_err,
 };
 use datafusion_common_runtime::SpawnedTask;
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::decoder::Decoder;
 use datafusion_datasource::display::FileGroupDisplay;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_compression_type::FileCompressionType;
 use datafusion_datasource::file_format::{
-    FileFormat, FileFormatFactory, DEFAULT_SCHEMA_INFER_MAX_RECORD,
+    DEFAULT_SCHEMA_INFER_MAX_RECORD, FileFormat, FileFormatFactory,
 };
 use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
 use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
 use datafusion_datasource::sink::{DataSink, DataSinkExec};
+use datafusion_datasource::write::BatchSerializer;
 use datafusion_datasource::write::demux::DemuxedStreamReceiver;
 use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join;
-use datafusion_datasource::write::BatchSerializer;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::dml::InsertOp;
 use datafusion_physical_expr_common::sort_expr::LexRequirement;
@@ -58,8 +59,10 @@ use async_trait::async_trait;
 use bytes::{Buf, Bytes};
 use datafusion_datasource::source::DataSourceExec;
 use futures::stream::BoxStream;
-use futures::{pin_mut, Stream, StreamExt, TryStreamExt};
-use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore};
+use futures::{Stream, StreamExt, TryStreamExt, pin_mut};
+use object_store::{
+    ObjectMeta, ObjectStore, ObjectStoreExt, delimited::newline_delimited_stream,
+};
 use regex::Regex;
 
 #[derive(Default)]
@@ -151,13 +154,13 @@ impl CsvFormat {
         let stream = store
             .get(&object.location)
             .await
-            .map_err(DataFusionError::ObjectStore);
+            .map_err(|e| DataFusionError::ObjectStore(Box::new(e)));
         let stream = match stream {
             Ok(stream) => self
                 .read_to_delimited_chunks_from_stream(
                     stream
                         .into_stream()
-                        .map_err(DataFusionError::ObjectStore)
+                        .map_err(|e| DataFusionError::ObjectStore(Box::new(e)))
                         .boxed(),
                 )
                 .await
@@ -181,7 +184,7 @@ impl CsvFormat {
         let stream = match decoder {
             Ok(decoded_stream) => {
                 newline_delimited_stream(decoded_stream.map_err(|e| match e {
-                    DataFusionError::ObjectStore(e) => e,
+                    DataFusionError::ObjectStore(e) => *e,
                     err => object_store::Error::Generic {
                         store: "read to delimited chunks failed",
                         source: Box::new(err),
@@ -210,6 +213,11 @@ impl CsvFormat {
 
     /// Set a limit in terms of records to scan to infer the schema
     /// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
+    ///
+    /// # Behavior when set to 0
+    ///
+    /// When `max_rec` is set to 0, schema inference is disabled and all fields
+    /// will be inferred as `Utf8` (string) type, regardless of their actual content.
     pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
         self.options.schema_infer_max_rec = Some(max_rec);
         self
@@ -222,6 +230,11 @@ impl CsvFormat {
         self
     }
 
+    pub fn with_truncated_rows(mut self, truncated_rows: bool) -> Self {
+        self.options.truncated_rows = Some(truncated_rows);
+        self
+    }
+
     /// Set the regex to use for null values in the CSV reader.
     /// - default to treat empty values as null.
     pub fn with_null_regex(mut self, null_regex: Option<String>) -> Self {
@@ -291,6 +304,13 @@ impl CsvFormat {
         self
     }
 
+    /// Set whether rows should be truncated to the column width
+    /// - defaults to false
+    pub fn with_truncate_rows(mut self, truncate_rows: bool) -> Self {
+        self.options.truncated_rows = Some(truncate_rows);
+        self
+    }
+
     /// The delimiter character.
     pub fn delimiter(&self) -> u8 {
         self.options.delimiter
@@ -358,6 +378,10 @@ impl FileFormat for CsvFormat {
         Ok(format!("{}{}", ext, file_compression_type.get_ext()))
     }
 
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        Some(self.options.compression.into())
+    }
+
     async fn infer_schema(
         &self,
         state: &dyn Session,
@@ -418,18 +442,23 @@ impl FileFormat for CsvFormat {
             .newlines_in_values
             .unwrap_or_else(|| state.config_options().catalog.newlines_in_values);
 
-        let conf_builder = FileScanConfigBuilder::from(conf)
-            .with_file_compression_type(self.options.compression.into())
-            .with_newlines_in_values(newlines_in_values);
+        let mut csv_options = self.options.clone();
+        csv_options.has_header = Some(has_header);
+        csv_options.newlines_in_values = Some(newlines_in_values);
 
-        let source = Arc::new(
-            CsvSource::new(has_header, self.options.delimiter, self.options.quote)
-                .with_escape(self.options.escape)
-                .with_terminator(self.options.terminator)
-                .with_comment(self.options.comment),
-        );
+        // Get the existing CsvSource and update its options
+        // We need to preserve the table_schema from the original source (which includes partition columns)
+        let csv_source = conf
+            .file_source
+            .as_any()
+            .downcast_ref::<CsvSource>()
+            .expect("file_source should be a CsvSource");
+        let source = Arc::new(csv_source.clone().with_csv_options(csv_options));
 
-        let config = conf_builder.with_source(source).build();
+        let config = FileScanConfigBuilder::from(conf)
+            .with_file_compression_type(self.options.compression.into())
+            .with_source(source)
+            .build();
 
         Ok(DataSourceExec::from_data_source(config))
     }
@@ -471,15 +500,32 @@ impl FileFormat for CsvFormat {
         Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(CsvSource::default())
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        let mut csv_options = self.options.clone();
+        if csv_options.has_header.is_none() {
+            csv_options.has_header = Some(true);
+        }
+        Arc::new(CsvSource::new(table_schema).with_csv_options(csv_options))
     }
 }
 
 impl CsvFormat {
     /// Return the inferred schema reading up to records_to_read from a
     /// stream of delimited chunks returning the inferred schema and the
-    /// number of lines that were read
+    /// number of lines that were read.
+    ///
+    /// This method can handle CSV files with different numbers of columns.
+    /// The inferred schema will be the union of all columns found across all files.
+    /// Files with fewer columns will have missing columns filled with null values.
+    ///
+    /// # Example
+    ///
+    /// If you have two CSV files:
+    /// - `file1.csv`: `col1,col2,col3`
+    /// - `file2.csv`: `col1,col2,col3,col4,col5`
+    ///
+    /// The inferred schema will contain all 5 columns, with files that don't
+    /// have columns 4 and 5 having null values for those columns.
     pub async fn infer_schema_from_stream(
         &self,
         state: &dyn Session,
@@ -490,6 +536,7 @@ impl CsvFormat {
         let mut column_names = vec![];
         let mut column_type_possibilities = vec![];
         let mut record_number = -1;
+        let initial_records_to_read = records_to_read;
 
         pin_mut!(stream);
 
@@ -505,7 +552,8 @@ impl CsvFormat {
                             .unwrap_or_else(|| state.config_options().catalog.has_header),
                 )
                 .with_delimiter(self.options.delimiter)
-                .with_quote(self.options.quote);
+                .with_quote(self.options.quote)
+                .with_truncated_rows(self.options.truncated_rows.unwrap_or(false));
 
             if let Some(null_regex) = &self.options.null_regex {
                 let regex = Regex::new(null_regex.as_str())
@@ -541,21 +589,37 @@ impl CsvFormat {
                     })
                     .unzip();
             } else {
-                if fields.len() != column_type_possibilities.len() {
+                if fields.len() != column_type_possibilities.len()
+                    && !self.options.truncated_rows.unwrap_or(false)
+                {
                     return exec_err!(
-                            "Encountered unequal lengths between records on CSV file whilst inferring schema. \
-                             Expected {} fields, found {} fields at record {}",
-                            column_type_possibilities.len(),
-                            fields.len(),
-                            record_number + 1
-                        );
+                        "Encountered unequal lengths between records on CSV file whilst inferring schema. \
+                         Expected {} fields, found {} fields at record {}",
+                        column_type_possibilities.len(),
+                        fields.len(),
+                        record_number + 1
+                    );
                 }
 
+                // First update type possibilities for existing columns using zip
                 column_type_possibilities.iter_mut().zip(&fields).for_each(
                     |(possibilities, field)| {
                         possibilities.insert(field.data_type().clone());
                     },
                 );
+
+                // Handle files with different numbers of columns by extending the schema
+                if fields.len() > column_type_possibilities.len() {
+                    // New columns found - extend our tracking structures
+                    for field in fields.iter().skip(column_type_possibilities.len()) {
+                        column_names.push(field.name().clone());
+                        let mut possibilities = HashSet::new();
+                        if records_read > 0 {
+                            possibilities.insert(field.data_type().clone());
+                        }
+                        column_type_possibilities.push(possibilities);
+                    }
+                }
             }
 
             if records_to_read == 0 {
@@ -563,20 +627,54 @@ impl CsvFormat {
             }
         }
 
-        let schema = build_schema_helper(column_names, &column_type_possibilities);
+        let schema = build_schema_helper(
+            column_names,
+            column_type_possibilities,
+            initial_records_to_read == 0,
+        );
         Ok((schema, total_records_read))
     }
 }
 
-fn build_schema_helper(names: Vec<String>, types: &[HashSet<DataType>]) -> Schema {
+/// Builds a schema from column names and their possible data types.
+///
+/// # Arguments
+///
+/// * `names` - Vector of column names
+/// * `types` - Vector of possible data types for each column (as HashSets)
+/// * `disable_inference` - When true, forces all columns with no inferred types to be Utf8.
+///   This should be set to true when `schema_infer_max_rec` is explicitly
+///   set to 0, indicating the user wants to skip type inference and treat
+///   all fields as strings. When false, columns with no inferred types
+///   will be set to Null, allowing schema merging to work properly.
+fn build_schema_helper(
+    names: Vec<String>,
+    types: Vec<HashSet<DataType>>,
+    disable_inference: bool,
+) -> Schema {
     let fields = names
         .into_iter()
         .zip(types)
-        .map(|(field_name, data_type_possibilities)| {
+        .map(|(field_name, mut data_type_possibilities)| {
             // ripped from arrow::csv::reader::infer_reader_schema_with_csv_options
             // determine data type based on possible types
             // if there are incompatible types, use DataType::Utf8
+
+            // ignore nulls, to avoid conflicting datatypes (e.g. [nulls, int]) being inferred as Utf8.
+            data_type_possibilities.remove(&DataType::Null);
+
             match data_type_possibilities.len() {
+                // When no types were inferred (empty HashSet):
+                // - If schema_infer_max_rec was explicitly set to 0, return Utf8
+                // - Otherwise return Null (whether from reading null values or empty files)
+                //   This allows schema merging to work when reading folders with empty files
+                0 => {
+                    if disable_inference {
+                        Field::new(field_name, DataType::Utf8, true)
+                    } else {
+                        Field::new(field_name, DataType::Null, true)
+                    }
+                }
                 1 => Field::new(
                     field_name,
                     data_type_possibilities.iter().next().unwrap().clone(),
@@ -716,6 +814,7 @@ impl FileSink for CsvSink {
             context,
             serializer,
             self.writer_options.compression.into(),
+            self.writer_options.compression_level,
             object_store,
             demux_task,
             file_stream_rx,
@@ -742,3 +841,82 @@ impl DataSink for CsvSink {
         FileSink::write_all(self, data, context).await
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::build_schema_helper;
+    use arrow::datatypes::DataType;
+    use std::collections::HashSet;
+
+    #[test]
+    fn test_build_schema_helper_different_column_counts() {
+        // Test the core schema building logic with different column counts
+        let mut column_names =
+            vec!["col1".to_string(), "col2".to_string(), "col3".to_string()];
+
+        // Simulate adding two more columns from another file
+        column_names.push("col4".to_string());
+        column_names.push("col5".to_string());
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Int64]),
+            HashSet::from([DataType::Utf8]),
+            HashSet::from([DataType::Float64]),
+            HashSet::from([DataType::Utf8]), // col4
+            HashSet::from([DataType::Utf8]), // col5
+        ];
+
+        let schema = build_schema_helper(column_names, column_type_possibilities, false);
+
+        // Verify schema has 5 columns
+        assert_eq!(schema.fields().len(), 5);
+        assert_eq!(schema.field(0).name(), "col1");
+        assert_eq!(schema.field(1).name(), "col2");
+        assert_eq!(schema.field(2).name(), "col3");
+        assert_eq!(schema.field(3).name(), "col4");
+        assert_eq!(schema.field(4).name(), "col5");
+
+        // All fields should be nullable
+        for field in schema.fields() {
+            assert!(
+                field.is_nullable(),
+                "Field {} should be nullable",
+                field.name()
+            );
+        }
+    }
+
+    #[test]
+    fn test_build_schema_helper_type_merging() {
+        // Test type merging logic
+        let column_names = vec!["col1".to_string(), "col2".to_string()];
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Int64, DataType::Float64]), // Should resolve to Float64
+            HashSet::from([DataType::Utf8]),                     // Should remain Utf8
+        ];
+
+        let schema = build_schema_helper(column_names, column_type_possibilities, false);
+
+        // col1 should be Float64 due to Int64 + Float64 = Float64
+        assert_eq!(*schema.field(0).data_type(), DataType::Float64);
+
+        // col2 should remain Utf8
+        assert_eq!(*schema.field(1).data_type(), DataType::Utf8);
+    }
+
+    #[test]
+    fn test_build_schema_helper_conflicting_types() {
+        // Test when we have incompatible types - should default to Utf8
+        let column_names = vec!["col1".to_string()];
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Boolean, DataType::Int64, DataType::Utf8]), // Should resolve to Utf8 due to conflicts
+        ];
+
+        let schema = build_schema_helper(column_names, column_type_possibilities, false);
+
+        // Should default to Utf8 for conflicting types
+        assert_eq!(*schema.field(0).data_type(), DataType::Utf8);
+    }
+}
diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs
index 90538d0808b1a..fdfee05d86a79 100644
--- a/datafusion/datasource-csv/src/mod.rs
+++ b/datafusion/datasource-csv/src/mod.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
@@ -24,7 +25,7 @@ pub mod source;
 
 use std::sync::Arc;
 
-use arrow::datatypes::SchemaRef;
+use datafusion_common::Result;
 use datafusion_datasource::file_groups::FileGroup;
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
 use datafusion_datasource::{file::FileSource, file_scan_config::FileScanConfig};
@@ -33,11 +34,12 @@ pub use file_format::*;
 
 /// Returns a [`FileScanConfig`] for given `file_groups`
 pub fn partitioned_csv_config(
-    schema: SchemaRef,
     file_groups: Vec<FileGroup>,
     file_source: Arc<dyn FileSource>,
-) -> FileScanConfig {
-    FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source)
-        .with_file_groups(file_groups)
-        .build()
+) -> Result<FileScanConfig> {
+    Ok(
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(file_groups)
+            .build(),
+    )
 }
diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs
index d45080dc20776..77a0dc9cf7995 100644
--- a/datafusion/datasource-csv/src/source.rs
+++ b/datafusion/datasource-csv/src/source.rs
@@ -17,30 +17,31 @@
 
 //! Execution plan for reading CSV files
 
-use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use datafusion_datasource::projection::{ProjectionOpener, SplitProjection};
+use datafusion_physical_plan::projection::ProjectionExprs;
 use std::any::Any;
 use std::fmt;
 use std::io::{Read, Seek, SeekFrom};
 use std::sync::Arc;
 use std::task::Poll;
 
-use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer};
+use datafusion_datasource::decoder::{DecoderDeserializer, deserialize_stream};
 use datafusion_datasource::file_compression_type::FileCompressionType;
-use datafusion_datasource::file_meta::FileMeta;
 use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener};
 use datafusion_datasource::{
-    as_file_source, calculate_range, impl_schema_adapter_methods, FileRange,
-    ListingTableUrl, RangeCalculation,
+    FileRange, ListingTableUrl, PartitionedFile, RangeCalculation, TableSchema,
+    as_file_source, calculate_range,
 };
 
 use arrow::csv;
-use arrow::datatypes::SchemaRef;
-use datafusion_common::{DataFusionError, Result, Statistics};
+use datafusion_common::config::CsvOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, Result};
 use datafusion_common_runtime::JoinSet;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_execution::TaskContext;
-use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet};
 use datafusion_physical_plan::{
     DisplayFormatType, ExecutionPlan, ExecutionPlanProperties,
 };
@@ -62,100 +63,122 @@ use tokio::io::AsyncWriteExt;
 /// # use datafusion_datasource_csv::source::CsvSource;
 /// # use datafusion_execution::object_store::ObjectStoreUrl;
 /// # use datafusion_datasource::source::DataSourceExec;
+/// # use datafusion_common::config::CsvOptions;
 ///
 /// # let object_store_url = ObjectStoreUrl::local_filesystem();
 /// # let file_schema = Arc::new(Schema::empty());
 ///
-/// let source = Arc::new(CsvSource::new(
-///         true,
-///         b',',
-///         b'"',
-///     )
-///     .with_terminator(Some(b'#')
-/// ));
+/// let options = CsvOptions {
+///     has_header: Some(true),
+///     delimiter: b',',
+///     quote: b'"',
+///     newlines_in_values: Some(true), // The file contains newlines in values
+///     ..Default::default()
+/// };
+/// let source = Arc::new(CsvSource::new(file_schema.clone())
+///     .with_csv_options(options)
+///     .with_terminator(Some(b'#'))
+/// );
 /// // Create a DataSourceExec for reading the first 100MB of `file1.csv`
-/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+/// let config = FileScanConfigBuilder::new(object_store_url, source)
 ///     .with_file(PartitionedFile::new("file1.csv", 100*1024*1024))
-///     .with_newlines_in_values(true) // The file contains newlines in values;
 ///     .build();
 /// let exec = (DataSourceExec::from_data_source(config));
 /// ```
-#[derive(Debug, Clone, Default)]
+#[derive(Debug, Clone)]
 pub struct CsvSource {
+    options: CsvOptions,
     batch_size: Option<usize>,
-    file_schema: Option<SchemaRef>,
-    file_projection: Option<Vec<usize>>,
-    pub(crate) has_header: bool,
-    delimiter: u8,
-    quote: u8,
-    terminator: Option<u8>,
-    escape: Option<u8>,
-    comment: Option<u8>,
+    table_schema: TableSchema,
+    projection: SplitProjection,
     metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
 }
 
 impl CsvSource {
     /// Returns a [`CsvSource`]
-    pub fn new(has_header: bool, delimiter: u8, quote: u8) -> Self {
+    pub fn new(table_schema: impl Into<TableSchema>) -> Self {
+        let table_schema = table_schema.into();
         Self {
-            has_header,
-            delimiter,
-            quote,
-            ..Self::default()
+            options: CsvOptions::default(),
+            projection: SplitProjection::unprojected(&table_schema),
+            table_schema,
+            batch_size: None,
+            metrics: ExecutionPlanMetricsSet::new(),
         }
     }
 
+    /// Sets the CSV options
+    pub fn with_csv_options(mut self, options: CsvOptions) -> Self {
+        self.options = options;
+        self
+    }
+
     /// true if the first line of each file is a header
     pub fn has_header(&self) -> bool {
-        self.has_header
+        self.options.has_header.unwrap_or(true)
+    }
+
+    // true if rows length support truncate
+    pub fn truncate_rows(&self) -> bool {
+        self.options.truncated_rows.unwrap_or(false)
     }
     /// A column delimiter
     pub fn delimiter(&self) -> u8 {
-        self.delimiter
+        self.options.delimiter
     }
 
     /// The quote character
     pub fn quote(&self) -> u8 {
-        self.quote
+        self.options.quote
     }
 
     /// The line terminator
     pub fn terminator(&self) -> Option<u8> {
-        self.terminator
+        self.options.terminator
     }
 
     /// Lines beginning with this byte are ignored.
     pub fn comment(&self) -> Option<u8> {
-        self.comment
+        self.options.comment
     }
 
     /// The escape character
     pub fn escape(&self) -> Option<u8> {
-        self.escape
+        self.options.escape
     }
 
     /// Initialize a CsvSource with escape
     pub fn with_escape(&self, escape: Option<u8>) -> Self {
         let mut conf = self.clone();
-        conf.escape = escape;
+        conf.options.escape = escape;
         conf
     }
 
     /// Initialize a CsvSource with terminator
     pub fn with_terminator(&self, terminator: Option<u8>) -> Self {
         let mut conf = self.clone();
-        conf.terminator = terminator;
+        conf.options.terminator = terminator;
         conf
     }
 
     /// Initialize a CsvSource with comment
     pub fn with_comment(&self, comment: Option<u8>) -> Self {
         let mut conf = self.clone();
-        conf.comment = comment;
+        conf.options.comment = comment;
+        conf
+    }
+
+    /// Whether to support truncate rows when read csv file
+    pub fn with_truncate_rows(&self, truncate_rows: bool) -> Self {
+        let mut conf = self.clone();
+        conf.options.truncated_rows = Some(truncate_rows);
         conf
     }
+
+    /// Whether values may contain newline characters
+    pub fn newlines_in_values(&self) -> bool {
+        self.options.newlines_in_values.unwrap_or(false)
+    }
 }
 
 impl CsvSource {
@@ -164,28 +187,24 @@ impl CsvSource {
     }
 
     fn builder(&self) -> csv::ReaderBuilder {
-        let mut builder = csv::ReaderBuilder::new(Arc::clone(
-            self.file_schema
-                .as_ref()
-                .expect("Schema must be set before initializing builder"),
-        ))
-        .with_delimiter(self.delimiter)
-        .with_batch_size(
-            self.batch_size
-                .expect("Batch size must be set before initializing builder"),
-        )
-        .with_header(self.has_header)
-        .with_quote(self.quote);
-        if let Some(terminator) = self.terminator {
+        let mut builder =
+            csv::ReaderBuilder::new(Arc::clone(self.table_schema.file_schema()))
+                .with_delimiter(self.delimiter())
+                .with_batch_size(
+                    self.batch_size
+                        .expect("Batch size must be set before initializing builder"),
+                )
+                .with_header(self.has_header())
+                .with_quote(self.quote())
+                .with_truncated_rows(self.truncate_rows());
+        if let Some(terminator) = self.terminator() {
             builder = builder.with_terminator(terminator);
         }
-        if let Some(proj) = &self.file_projection {
-            builder = builder.with_projection(proj.clone());
-        }
-        if let Some(escape) = self.escape {
+        builder = builder.with_projection(self.projection.file_indices.clone());
+        if let Some(escape) = self.escape() {
             builder = builder.with_escape(escape)
         }
-        if let Some(comment) = self.comment {
+        if let Some(comment) = self.comment() {
             builder = builder.with_comment(comment);
         }
 
@@ -198,6 +217,7 @@ pub struct CsvOpener {
     config: Arc<CsvSource>,
     file_compression_type: FileCompressionType,
     object_store: Arc<dyn ObjectStore>,
+    partition_index: usize,
 }
 
 impl CsvOpener {
@@ -211,6 +231,7 @@ impl CsvOpener {
             config,
             file_compression_type,
             object_store,
+            partition_index: 0,
         }
     }
 }
@@ -226,65 +247,88 @@ impl FileSource for CsvSource {
         &self,
         object_store: Arc<dyn ObjectStore>,
         base_config: &FileScanConfig,
-        _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(CsvOpener {
+        partition_index: usize,
+    ) -> Result<Arc<dyn FileOpener>> {
+        let mut opener = Arc::new(CsvOpener {
             config: Arc::new(self.clone()),
             file_compression_type: base_config.file_compression_type,
             object_store,
-        })
+            partition_index,
+        }) as Arc<dyn FileOpener>;
+        opener = ProjectionOpener::try_new(
+            self.projection.clone(),
+            Arc::clone(&opener),
+            self.table_schema.file_schema(),
+        )?;
+        Ok(opener)
     }
 
     fn as_any(&self) -> &dyn Any {
         self
     }
 
-    fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.batch_size = Some(batch_size);
-        Arc::new(conf)
+    fn table_schema(&self) -> &TableSchema {
+        &self.table_schema
     }
 
-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
+    fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
         let mut conf = self.clone();
-        conf.file_schema = Some(schema);
+        conf.batch_size = Some(batch_size);
         Arc::new(conf)
     }
 
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
-        Arc::new(conf)
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        let new_projection = self.projection.source.try_merge(projection)?;
+        let split_projection =
+            SplitProjection::new(self.table_schema.file_schema(), &new_projection);
+        source.projection = split_projection;
+        Ok(Some(Arc::new(source)))
     }
 
-    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.file_projection = config.file_column_projection_indices();
-        Arc::new(conf)
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        Some(&self.projection.source)
     }
 
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }
-    fn statistics(&self) -> Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        Ok(statistics
-            .clone()
-            .expect("projected_statistics must be set"))
-    }
+
     fn file_type(&self) -> &str {
         "csv"
     }
+
+    fn supports_repartitioning(&self) -> bool {
+        // Cannot repartition if values may contain newlines, as record
+        // boundaries cannot be determined by byte offset alone
+        !self.options.newlines_in_values.unwrap_or(false)
+    }
+
     fn fmt_extra(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                write!(f, ", has_header={}", self.has_header)
+                write!(f, ", has_header={}", self.has_header())
             }
             DisplayFormatType::TreeRender => Ok(()),
         }
     }
 
-    impl_schema_adapter_methods!();
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit projection expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for proj_expr in &self.projection.source {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
 }
 
 impl FileOpener for CsvOpener {
@@ -311,25 +355,24 @@ impl FileOpener for CsvOpener {
     ///  A,1,2,3,4,5,6,7,8,9\n
     ///  A},1,2,3,4,5,6,7,8,9\n
     ///  The lines read would be: [1, 2]
-    fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture> {
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
         // `self.config.has_header` controls whether to skip reading the 1st line header
         // If the .csv file is read in parallel and this `CsvOpener` is only reading some middle
         // partition, then don't skip first line
-        let mut csv_has_header = self.config.has_header;
-        if let Some(FileRange { start, .. }) = file_meta.range {
-            if start != 0 {
-                csv_has_header = false;
-            }
+        let mut csv_has_header = self.config.has_header();
+        if let Some(FileRange { start, .. }) = partitioned_file.range
+            && start != 0
+        {
+            csv_has_header = false;
         }
 
-        let config = CsvSource {
-            has_header: csv_has_header,
-            ..(*self.config).clone()
-        };
+        let mut config = (*self.config).clone();
+        config.options.has_header = Some(csv_has_header);
+        config.options.truncated_rows = Some(config.truncate_rows());
 
         let file_compression_type = self.file_compression_type.to_owned();
 
-        if file_meta.range.is_some() {
+        if partitioned_file.range.is_some() {
             assert!(
                 !file_compression_type.is_compressed(),
                 "Reading compressed .csv in parallel is not supported"
@@ -337,13 +380,16 @@ impl FileOpener for CsvOpener {
         }
 
         let store = Arc::clone(&self.object_store);
-        let terminator = self.config.terminator;
+        let terminator = self.config.terminator();
+
+        let baseline_metrics =
+            BaselineMetrics::new(&self.config.metrics, self.partition_index);
 
         Ok(Box::pin(async move {
             // Current partition contains bytes [start_byte, end_byte) (might contain incomplete lines at boundaries)
 
             let calculated_range =
-                calculate_range(&file_meta, &store, terminator).await?;
+                calculate_range(&partitioned_file, &store, terminator).await?;
 
             let range = match calculated_range {
                 RangeCalculation::Range(None) => None,
@@ -351,7 +397,7 @@ impl FileOpener for CsvOpener {
                 RangeCalculation::TerminateEarly => {
                     return Ok(
                         futures::stream::poll_fn(move |_| Poll::Ready(None)).boxed()
-                    )
+                    );
                 }
             };
 
@@ -360,12 +406,14 @@ impl FileOpener for CsvOpener {
                 ..Default::default()
             };
 
-            let result = store.get_opts(file_meta.location(), options).await?;
+            let result = store
+                .get_opts(&partitioned_file.object_meta.location, options)
+                .await?;
 
             match result.payload {
                 #[cfg(not(target_arch = "wasm32"))]
                 GetResultPayload::File(mut file, _) => {
-                    let is_whole_file_scanned = file_meta.range.is_none();
+                    let is_whole_file_scanned = partitioned_file.range.is_none();
                     let decoder = if is_whole_file_scanned {
                         // Don't seek if no range as breaks FIFO files
                         file_compression_type.convert_read(file)?
@@ -376,17 +424,30 @@ impl FileOpener for CsvOpener {
                         )?
                     };
 
-                    Ok(futures::stream::iter(config.open(decoder)?).boxed())
+                    let mut reader = config.open(decoder)?;
+
+                    // Use std::iter::from_fn to wrap execution of iterator's next() method.
+                    let iterator = std::iter::from_fn(move || {
+                        let mut timer = baseline_metrics.elapsed_compute().timer();
+                        let result = reader.next();
+                        timer.stop();
+                        result
+                    });
+
+                    Ok(futures::stream::iter(iterator)
+                        .map(|r| r.map_err(Into::into))
+                        .boxed())
                 }
                 GetResultPayload::Stream(s) => {
                     let decoder = config.builder().build_decoder();
                     let s = s.map_err(DataFusionError::from);
                     let input = file_compression_type.convert_stream(s.boxed())?.fuse();
 
-                    Ok(deserialize_stream(
+                    let stream = deserialize_stream(
                         input,
                         DecoderDeserializer::new(CsvDecoder::new(decoder)),
-                    ))
+                    );
+                    Ok(stream.map_err(Into::into).boxed())
                 }
             }
         }))
diff --git a/datafusion/datasource-json/Cargo.toml b/datafusion/datasource-json/Cargo.toml
index 6c74923ff79e9..b5947ea5c4c67 100644
--- a/datafusion/datasource-json/Cargo.toml
+++ b/datafusion/datasource-json/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-datasource-json"
 description = "datafusion-datasource-json"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -34,24 +34,28 @@ all-features = true
 arrow = { workspace = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
-datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true, features = ["object_store"] }
 datafusion-common-runtime = { workspace = true }
 datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
 futures = { workspace = true }
 object_store = { workspace = true }
-serde_json = { workspace = true }
 tokio = { workspace = true }
+tokio-stream = { workspace = true, features = ["sync"] }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
+[dev-dependencies]
+serde_json = { workspace = true }
+
 [lib]
 name = "datafusion_datasource_json"
 path = "src/mod.rs"
diff --git a/datafusion/datasource-json/README.md b/datafusion/datasource-json/README.md
index 64181814736df..ca2771b9d67e4 100644
--- a/datafusion/datasource-json/README.md
+++ b/datafusion/datasource-json/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion datasource
+# Apache DataFusion JSON DataSource
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that defines a JSON based file source.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs
index f6b758b5bc51c..8fe445705a21c 100644
--- a/datafusion/datasource-json/src/file_format.rs
+++ b/datafusion/datasource-json/src/file_format.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`JsonFormat`]: Line delimited JSON [`FileFormat`] abstractions
+//! [`JsonFormat`]: Line delimited and array JSON [`FileFormat`] abstractions
 
 use std::any::Any;
 use std::collections::HashMap;
 use std::fmt;
 use std::fmt::Debug;
-use std::io::BufReader;
+use std::io::{BufReader, Read};
 use std::sync::Arc;
 
 use crate::source::JsonSource;
@@ -30,36 +30,38 @@ use arrow::array::RecordBatch;
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow::error::ArrowError;
 use arrow::json;
-use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
+use arrow::json::reader::{ValueIter, infer_json_schema_from_iterator};
+use bytes::{Buf, Bytes};
 use datafusion_common::config::{ConfigField, ConfigFileType, JsonOptions};
 use datafusion_common::file_options::json_writer::JsonWriterOptions;
 use datafusion_common::{
-    not_impl_err, GetExt, Result, Statistics, DEFAULT_JSON_EXTENSION,
+    DEFAULT_JSON_EXTENSION, GetExt, Result, Statistics, not_impl_err,
 };
 use datafusion_common_runtime::SpawnedTask;
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::decoder::Decoder;
 use datafusion_datasource::display::FileGroupDisplay;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_compression_type::FileCompressionType;
 use datafusion_datasource::file_format::{
-    FileFormat, FileFormatFactory, DEFAULT_SCHEMA_INFER_MAX_RECORD,
+    DEFAULT_SCHEMA_INFER_MAX_RECORD, FileFormat, FileFormatFactory,
 };
 use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
 use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
 use datafusion_datasource::sink::{DataSink, DataSinkExec};
+use datafusion_datasource::source::DataSourceExec;
+use datafusion_datasource::write::BatchSerializer;
 use datafusion_datasource::write::demux::DemuxedStreamReceiver;
 use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join;
-use datafusion_datasource::write::BatchSerializer;
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::dml::InsertOp;
 use datafusion_physical_expr_common::sort_expr::LexRequirement;
 use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
 use datafusion_session::Session;
 
+use crate::utils::JsonArrayToNdjsonReader;
 use async_trait::async_trait;
-use bytes::{Buf, Bytes};
-use datafusion_datasource::source::DataSourceExec;
-use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
+use object_store::{GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt};
 
 #[derive(Default)]
 /// Factory struct used to create [JsonFormat]
@@ -131,7 +133,26 @@ impl Debug for JsonFormatFactory {
     }
 }
 
-/// New line delimited JSON `FileFormat` implementation.
+/// JSON `FileFormat` implementation supporting both line-delimited and array formats.
+///
+/// # Supported Formats
+///
+/// ## Line-Delimited JSON (default, `newline_delimited = true`)
+/// ```text
+/// {"key1": 1, "key2": "val"}
+/// {"key1": 2, "key2": "vals"}
+/// ```
+///
+/// ## JSON Array Format (`newline_delimited = false`)
+/// ```text
+/// [
+///     {"key1": 1, "key2": "val"},
+///     {"key1": 2, "key2": "vals"}
+/// ]
+/// ```
+///
+/// Note: JSON array format is processed using streaming conversion,
+/// which is memory-efficient even for large files.
 #[derive(Debug, Default)]
 pub struct JsonFormat {
     options: JsonOptions,
@@ -165,6 +186,57 @@ impl JsonFormat {
         self.options.compression = file_compression_type.into();
         self
     }
+
+    /// Set whether to read as newline-delimited JSON (NDJSON).
+    ///
+    /// When `true` (default), expects newline-delimited format:
+    /// ```text
+    /// {"a": 1}
+    /// {"a": 2}
+    /// ```
+    ///
+    /// When `false`, expects JSON array format:
+    /// ```text
+    /// [{"a": 1}, {"a": 2}]
+    /// ```
+    pub fn with_newline_delimited(mut self, newline_delimited: bool) -> Self {
+        self.options.newline_delimited = newline_delimited;
+        self
+    }
+
+    /// Returns whether this format expects newline-delimited JSON.
+    pub fn is_newline_delimited(&self) -> bool {
+        self.options.newline_delimited
+    }
+}
+
+/// Infer schema from JSON array format using streaming conversion.
+///
+/// This function converts JSON array format to NDJSON on-the-fly and uses
+/// arrow-json's schema inference. It properly tracks the number of records
+/// processed for correct `records_to_read` management.
+///
+/// # Returns
+/// A tuple of (Schema, records_consumed) where records_consumed is the
+/// number of records that were processed for schema inference.
+fn infer_schema_from_json_array<R: Read>(
+    reader: R,
+    max_records: usize,
+) -> Result<(Schema, usize)> {
+    let ndjson_reader = JsonArrayToNdjsonReader::new(reader);
+
+    let iter = ValueIter::new(ndjson_reader, None);
+    let mut count = 0;
+
+    let schema = infer_json_schema_from_iterator(iter.take_while(|_| {
+        let should_take = count < max_records;
+        if should_take {
+            count += 1;
+        }
+        should_take
+    }))?;
+
+    Ok((schema, count))
 }
 
 #[async_trait]
@@ -185,6 +257,10 @@ impl FileFormat for JsonFormat {
         Ok(format!("{}{}", ext, file_compression_type.get_ext()))
     }
 
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        Some(self.options.compression.into())
+    }
+
     async fn infer_schema(
         &self,
         _state: &dyn Session,
@@ -197,37 +273,67 @@ impl FileFormat for JsonFormat {
             .schema_infer_max_rec
             .unwrap_or(DEFAULT_SCHEMA_INFER_MAX_RECORD);
         let file_compression_type = FileCompressionType::from(self.options.compression);
+        let newline_delimited = self.options.newline_delimited;
+
         for object in objects {
-            let mut take_while = || {
-                let should_take = records_to_read > 0;
-                if should_take {
-                    records_to_read -= 1;
-                }
-                should_take
-            };
+            // Early exit if we've read enough records
+            if records_to_read == 0 {
+                break;
+            }
 
             let r = store.as_ref().get(&object.location).await?;
-            let schema = match r.payload {
+
+            let (schema, records_consumed) = match r.payload {
                 #[cfg(not(target_arch = "wasm32"))]
                 GetResultPayload::File(file, _) => {
                     let decoder = file_compression_type.convert_read(file)?;
-                    let mut reader = BufReader::new(decoder);
-                    let iter = ValueIter::new(&mut reader, None);
-                    infer_json_schema_from_iterator(iter.take_while(|_| take_while()))?
+                    let reader = BufReader::new(decoder);
+
+                    if newline_delimited {
+                        // NDJSON: use ValueIter directly
+                        let iter = ValueIter::new(reader, None);
+                        let mut count = 0;
+                        let schema =
+                            infer_json_schema_from_iterator(iter.take_while(|_| {
+                                let should_take = count < records_to_read;
+                                if should_take {
+                                    count += 1;
+                                }
+                                should_take
+                            }))?;
+                        (schema, count)
+                    } else {
+                        // JSON array format: use streaming converter
+                        infer_schema_from_json_array(reader, records_to_read)?
+                    }
                 }
                 GetResultPayload::Stream(_) => {
                     let data = r.bytes().await?;
                     let decoder = file_compression_type.convert_read(data.reader())?;
-                    let mut reader = BufReader::new(decoder);
-                    let iter = ValueIter::new(&mut reader, None);
-                    infer_json_schema_from_iterator(iter.take_while(|_| take_while()))?
+                    let reader = BufReader::new(decoder);
+
+                    if newline_delimited {
+                        let iter = ValueIter::new(reader, None);
+                        let mut count = 0;
+                        let schema =
+                            infer_json_schema_from_iterator(iter.take_while(|_| {
+                                let should_take = count < records_to_read;
+                                if should_take {
+                                    count += 1;
+                                }
+                                should_take
+                            }))?;
+                        (schema, count)
+                    } else {
+                        // JSON array format: use streaming converter
+                        infer_schema_from_json_array(reader, records_to_read)?
+                    }
                 }
             };
 
             schemas.push(schema);
-            if records_to_read == 0 {
-                break;
-            }
+            // Correctly decrement records_to_read
+            records_to_read = records_to_read.saturating_sub(records_consumed);
         }
 
         let schema = Schema::try_merge(schemas)?;
@@ -249,12 +355,10 @@ impl FileFormat for JsonFormat {
         _state: &dyn Session,
         conf: FileScanConfig,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let source = Arc::new(JsonSource::new());
         let conf = FileScanConfigBuilder::from(conf)
             .with_file_compression_type(FileCompressionType::from(
                 self.options.compression,
             ))
-            .with_source(source)
             .build();
         Ok(DataSourceExec::from_data_source(conf))
     }
@@ -277,8 +381,11 @@ impl FileFormat for JsonFormat {
         Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(JsonSource::default())
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        Arc::new(
+            JsonSource::new(table_schema)
+                .with_newline_delimited(self.options.newline_delimited),
+        )
     }
 }
 
@@ -370,6 +477,7 @@ impl FileSink for JsonSink {
             context,
             serializer,
             self.writer_options.compression.into(),
+            self.writer_options.compression_level,
             object_store,
             demux_task,
             file_stream_rx,
diff --git a/datafusion/datasource-json/src/mod.rs b/datafusion/datasource-json/src/mod.rs
index 18bb8792c3ffe..7dc0a0c7ba0f9 100644
--- a/datafusion/datasource-json/src/mod.rs
+++ b/datafusion/datasource-json/src/mod.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
 
 pub mod file_format;
 pub mod source;
+pub mod utils;
 
 pub use file_format::*;
diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs
index 187876522e48e..2f1d5abbee599 100644
--- a/datafusion/datasource-json/src/source.rs
+++ b/datafusion/datasource-json/src/source.rs
@@ -15,40 +15,87 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Execution plan for reading line-delimited JSON files
+//! Execution plan for reading JSON files (line-delimited and array formats)
 
 use std::any::Any;
 use std::io::{BufReader, Read, Seek, SeekFrom};
+use std::pin::Pin;
 use std::sync::Arc;
-use std::task::Poll;
+use std::task::{Context, Poll};
 
 use crate::file_format::JsonDecoder;
+use crate::utils::{ChannelReader, JsonArrayToNdjsonReader};
 
 use datafusion_common::error::{DataFusionError, Result};
-use datafusion_common_runtime::JoinSet;
-use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common_runtime::{JoinSet, SpawnedTask};
+use datafusion_datasource::decoder::{DecoderDeserializer, deserialize_stream};
 use datafusion_datasource::file_compression_type::FileCompressionType;
-use datafusion_datasource::file_meta::FileMeta;
 use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener};
-use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use datafusion_datasource::projection::{ProjectionOpener, SplitProjection};
 use datafusion_datasource::{
-    as_file_source, calculate_range, impl_schema_adapter_methods, ListingTableUrl,
-    RangeCalculation,
+    ListingTableUrl, PartitionedFile, RangeCalculation, as_file_source, calculate_range,
 };
+use datafusion_physical_plan::projection::ProjectionExprs;
 use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
+use arrow::array::RecordBatch;
 use arrow::json::ReaderBuilder;
 use arrow::{datatypes::SchemaRef, json};
-use datafusion_common::Statistics;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_execution::TaskContext;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 
-use futures::{StreamExt, TryStreamExt};
+use futures::{Stream, StreamExt, TryStreamExt};
 use object_store::buffered::BufWriter;
 use object_store::{GetOptions, GetResultPayload, ObjectStore};
 use tokio::io::AsyncWriteExt;
+use tokio_stream::wrappers::ReceiverStream;
+
+/// Channel buffer size for streaming JSON array processing.
+/// With ~128KB average chunk size, 128 chunks ≈ 16MB buffer.
+const CHANNEL_BUFFER_SIZE: usize = 128;
+
+/// Buffer size for JsonArrayToNdjsonReader (2MB each, 4MB total for input+output)
+const JSON_CONVERTER_BUFFER_SIZE: usize = 2 * 1024 * 1024;
+
+// ============================================================================
+// JsonArrayStream - Custom stream wrapper to hold SpawnedTask handles
+// ============================================================================
+
+/// A stream wrapper that holds SpawnedTask handles to keep them alive
+/// until the stream is fully consumed or dropped.
+///
+/// This ensures cancel-safety: when the stream is dropped, the tasks
+/// are properly aborted via SpawnedTask's Drop implementation.
+struct JsonArrayStream {
+    inner: ReceiverStream<std::result::Result<RecordBatch, arrow::error::ArrowError>>,
+    /// Task that reads from object store and sends bytes to channel.
+    /// Kept alive until stream is consumed or dropped.
+    _read_task: SpawnedTask<()>,
+    /// Task that parses JSON and sends RecordBatches.
+    /// Kept alive until stream is consumed or dropped.
+    _parse_task: SpawnedTask<()>,
+}
+
+impl Stream for JsonArrayStream {
+    type Item = std::result::Result<RecordBatch, arrow::error::ArrowError>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        Pin::new(&mut self.inner).poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+// ============================================================================
+// JsonOpener and JsonSource
+// ============================================================================
 
 /// A [`FileOpener`] that opens a JSON file and yields a [`FileOpenFuture`]
 pub struct JsonOpener {
@@ -56,38 +103,62 @@ pub struct JsonOpener {
     projected_schema: SchemaRef,
     file_compression_type: FileCompressionType,
     object_store: Arc<dyn ObjectStore>,
+    /// When `true` (default), expects newline-delimited JSON (NDJSON).
+    /// When `false`, expects JSON array format `[{...}, {...}]`.
+    newline_delimited: bool,
 }
 
 impl JsonOpener {
-    /// Returns a  [`JsonOpener`]
+    /// Returns a [`JsonOpener`]
     pub fn new(
         batch_size: usize,
         projected_schema: SchemaRef,
         file_compression_type: FileCompressionType,
         object_store: Arc<dyn ObjectStore>,
+        newline_delimited: bool,
     ) -> Self {
         Self {
             batch_size,
             projected_schema,
             file_compression_type,
             object_store,
+            newline_delimited,
         }
     }
 }
 
 /// JsonSource holds the extra configuration that is necessary for [`JsonOpener`]
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub struct JsonSource {
+    table_schema: datafusion_datasource::TableSchema,
     batch_size: Option<usize>,
     metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    projection: SplitProjection,
+    /// When `true` (default), expects newline-delimited JSON (NDJSON).
+    /// When `false`, expects JSON array format `[{...}, {...}]`.
+    newline_delimited: bool,
 }
 
 impl JsonSource {
-    /// Initialize a JsonSource with default values
-    pub fn new() -> Self {
-        Self::default()
+    /// Initialize a JsonSource with the provided schema
+    pub fn new(table_schema: impl Into<datafusion_datasource::TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        Self {
+            projection: SplitProjection::unprojected(&table_schema),
+            table_schema,
+            batch_size: None,
+            metrics: ExecutionPlanMetricsSet::new(),
+            newline_delimited: true,
+        }
+    }
+
+    /// Set whether to read as newline-delimited JSON.
+    ///
+    /// When `true` (default), expects newline-delimited format.
+    /// When `false`, expects JSON array format `[{...}, {...}]`.
+    pub fn with_newline_delimited(mut self, newline_delimited: bool) -> Self {
+        self.newline_delimited = newline_delimited;
+        self
     }
 }
 
@@ -103,59 +174,87 @@ impl FileSource for JsonSource {
         object_store: Arc<dyn ObjectStore>,
         base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        Arc::new(JsonOpener {
+    ) -> Result<Arc<dyn FileOpener>> {
+        // Get the projected file schema for JsonOpener
+        let file_schema = self.table_schema.file_schema();
+        let projected_schema =
+            Arc::new(file_schema.project(&self.projection.file_indices)?);
+
+        let mut opener = Arc::new(JsonOpener {
             batch_size: self
                 .batch_size
                 .expect("Batch size must set before creating opener"),
-            projected_schema: base_config.projected_file_schema(),
+            projected_schema,
             file_compression_type: base_config.file_compression_type,
             object_store,
-        })
+            newline_delimited: self.newline_delimited,
+        }) as Arc<dyn FileOpener>;
+
+        // Wrap with ProjectionOpener
+        opener = ProjectionOpener::try_new(
+            self.projection.clone(),
+            Arc::clone(&opener),
+            self.table_schema.file_schema(),
+        )?;
+
+        Ok(opener)
     }
 
     fn as_any(&self) -> &dyn Any {
         self
     }
 
+    fn table_schema(&self) -> &datafusion_datasource::TableSchema {
+        &self.table_schema
+    }
+
     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
         let mut conf = self.clone();
         conf.batch_size = Some(batch_size);
         Arc::new(conf)
     }
 
-    fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
-        Arc::new(conf)
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        let new_projection = self.projection.source.try_merge(projection)?;
+        let split_projection =
+            SplitProjection::new(self.table_schema.file_schema(), &new_projection);
+        source.projection = split_projection;
+        Ok(Some(Arc::new(source)))
     }
 
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        Some(&self.projection.source)
     }
 
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        Ok(statistics
-            .clone()
-            .expect("projected_statistics must be set to call"))
-    }
-
     fn file_type(&self) -> &str {
         "json"
     }
-    impl_schema_adapter_methods!();
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit projection expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for proj_expr in &self.projection.source {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
 }
 
 impl FileOpener for JsonOpener {
-    /// Open a partitioned NDJSON file.
+    /// Open a partitioned JSON file.
     ///
     /// If `file_meta.range` is `None`, the entire file is opened.
     /// Else `file_meta.range` is `Some(FileRange{start, end})`, which corresponds to the byte range [start, end) within the file.
@@ -164,14 +263,27 @@ impl FileOpener for JsonOpener {
     /// are applied to determine which lines to read:
     /// 1. The first line of the partition is the line in which the index of the first character >= `start`.
     /// 2. The last line of the partition is the line in which the byte at position `end - 1` resides.
-    fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture> {
+    ///
+    /// Note: JSON array format does not support range-based scanning.
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
         let store = Arc::clone(&self.object_store);
         let schema = Arc::clone(&self.projected_schema);
         let batch_size = self.batch_size;
         let file_compression_type = self.file_compression_type.to_owned();
+        let newline_delimited = self.newline_delimited;
+
+        // JSON array format requires reading the complete file
+        if !newline_delimited && partitioned_file.range.is_some() {
+            return Err(DataFusionError::NotImplemented(
+                "JSON array format does not support range-based file scanning. \
+                 Disable repartition_file_scans or use newline-delimited JSON format."
+                    .to_string(),
+            ));
+        }
 
         Ok(Box::pin(async move {
-            let calculated_range = calculate_range(&file_meta, &store, None).await?;
+            let calculated_range =
+                calculate_range(&partitioned_file, &store, None).await?;
 
             let range = match calculated_range {
                 RangeCalculation::Range(None) => None,
@@ -179,7 +291,7 @@ impl FileOpener for JsonOpener {
                 RangeCalculation::TerminateEarly => {
                     return Ok(
                         futures::stream::poll_fn(move |_| Poll::Ready(None)).boxed()
-                    )
+                    );
                 }
             };
 
@@ -188,38 +300,162 @@ impl FileOpener for JsonOpener {
                 ..Default::default()
             };
 
-            let result = store.get_opts(file_meta.location(), options).await?;
+            let result = store
+                .get_opts(&partitioned_file.object_meta.location, options)
+                .await?;
 
             match result.payload {
                 #[cfg(not(target_arch = "wasm32"))]
                 GetResultPayload::File(mut file, _) => {
-                    let bytes = match file_meta.range {
+                    let bytes = match partitioned_file.range {
                         None => file_compression_type.convert_read(file)?,
                         Some(_) => {
                             file.seek(SeekFrom::Start(result.range.start as _))?;
                             let limit = result.range.end - result.range.start;
-                            file_compression_type.convert_read(file.take(limit as u64))?
+                            file_compression_type.convert_read(file.take(limit))?
                         }
                     };
 
-                    let reader = ReaderBuilder::new(schema)
-                        .with_batch_size(batch_size)
-                        .build(BufReader::new(bytes))?;
-
-                    Ok(futures::stream::iter(reader).boxed())
+                    if newline_delimited {
+                        // NDJSON: use BufReader directly
+                        let reader = BufReader::new(bytes);
+                        let arrow_reader = ReaderBuilder::new(schema)
+                            .with_batch_size(batch_size)
+                            .build(reader)?;
+
+                        Ok(futures::stream::iter(arrow_reader)
+                            .map(|r| r.map_err(Into::into))
+                            .boxed())
+                    } else {
+                        // JSON array format: wrap with streaming converter
+                        let ndjson_reader = JsonArrayToNdjsonReader::with_capacity(
+                            bytes,
+                            JSON_CONVERTER_BUFFER_SIZE,
+                        );
+                        let arrow_reader = ReaderBuilder::new(schema)
+                            .with_batch_size(batch_size)
+                            .build(ndjson_reader)?;
+
+                        Ok(futures::stream::iter(arrow_reader)
+                            .map(|r| r.map_err(Into::into))
+                            .boxed())
+                    }
                 }
                 GetResultPayload::Stream(s) => {
-                    let s = s.map_err(DataFusionError::from);
-
-                    let decoder = ReaderBuilder::new(schema)
-                        .with_batch_size(batch_size)
-                        .build_decoder()?;
-                    let input = file_compression_type.convert_stream(s.boxed())?.fuse();
-
-                    Ok(deserialize_stream(
-                        input,
-                        DecoderDeserializer::new(JsonDecoder::new(decoder)),
-                    ))
+                    if newline_delimited {
+                        // Newline-delimited JSON (NDJSON) streaming reader
+                        let s = s.map_err(DataFusionError::from);
+                        let decoder = ReaderBuilder::new(schema)
+                            .with_batch_size(batch_size)
+                            .build_decoder()?;
+                        let input =
+                            file_compression_type.convert_stream(s.boxed())?.fuse();
+                        let stream = deserialize_stream(
+                            input,
+                            DecoderDeserializer::new(JsonDecoder::new(decoder)),
+                        );
+                        Ok(stream.map_err(Into::into).boxed())
+                    } else {
+                        // JSON array format: streaming conversion with channel-based byte transfer
+                        //
+                        // Architecture:
+                        // 1. Async task reads from object store stream, decompresses, sends to channel
+                        // 2. Blocking task receives bytes, converts JSON array to NDJSON, parses to Arrow
+                        // 3. RecordBatches are sent back via another channel
+                        //
+                        // Memory budget (~32MB):
+                        // - sync_channel: CHANNEL_BUFFER_SIZE chunks (~16MB)
+                        // - JsonArrayToNdjsonReader: 2 × JSON_CONVERTER_BUFFER_SIZE (~4MB)
+                        // - Arrow JsonReader internal buffer (~8MB)
+                        // - Miscellaneous (~4MB)
+
+                        let s = s.map_err(DataFusionError::from);
+                        let decompressed_stream =
+                            file_compression_type.convert_stream(s.boxed())?;
+
+                        // Channel for bytes: async producer -> blocking consumer
+                        // Uses tokio::sync::mpsc so the async send never blocks a
+                        // tokio worker thread; the consumer calls blocking_recv()
+                        // inside spawn_blocking.
+                        let (byte_tx, byte_rx) = tokio::sync::mpsc::channel::<bytes::Bytes>(
+                            CHANNEL_BUFFER_SIZE,
+                        );
+
+                        // Channel for results: sync producer -> async consumer
+                        let (result_tx, result_rx) = tokio::sync::mpsc::channel(2);
+                        let error_tx = result_tx.clone();
+
+                        // Async task: read from object store stream and send bytes to channel
+                        // Store the SpawnedTask to keep it alive until stream is dropped
+                        let read_task = SpawnedTask::spawn(async move {
+                            tokio::pin!(decompressed_stream);
+                            while let Some(chunk) = decompressed_stream.next().await {
+                                match chunk {
+                                    Ok(bytes) => {
+                                        if byte_tx.send(bytes).await.is_err() {
+                                            break; // Consumer dropped
+                                        }
+                                    }
+                                    Err(e) => {
+                                        let _ = error_tx
+                                            .send(Err(
+                                                arrow::error::ArrowError::ExternalError(
+                                                    Box::new(e),
+                                                ),
+                                            ))
+                                            .await;
+                                        break;
+                                    }
+                                }
+                            }
+                            // byte_tx dropped here, signals EOF to ChannelReader
+                        });
+
+                        // Blocking task: receive bytes from channel and parse JSON
+                        // Store the SpawnedTask to keep it alive until stream is dropped
+                        let parse_task = SpawnedTask::spawn_blocking(move || {
+                            let channel_reader = ChannelReader::new(byte_rx);
+                            let mut ndjson_reader =
+                                JsonArrayToNdjsonReader::with_capacity(
+                                    channel_reader,
+                                    JSON_CONVERTER_BUFFER_SIZE,
+                                );
+
+                            match ReaderBuilder::new(schema)
+                                .with_batch_size(batch_size)
+                                .build(&mut ndjson_reader)
+                            {
+                                Ok(arrow_reader) => {
+                                    for batch_result in arrow_reader {
+                                        if result_tx.blocking_send(batch_result).is_err()
+                                        {
+                                            break; // Receiver dropped
+                                        }
+                                    }
+                                }
+                                Err(e) => {
+                                    let _ = result_tx.blocking_send(Err(e));
+                                }
+                            }
+
+                            // Validate the JSON array was properly formed
+                            if let Err(e) = ndjson_reader.validate_complete() {
+                                let _ = result_tx.blocking_send(Err(
+                                    arrow::error::ArrowError::JsonError(e.to_string()),
+                                ));
+                            }
+                            // result_tx dropped here, closes the stream
+                        });
+
+                        // Wrap in JsonArrayStream to keep tasks alive until stream is consumed
+                        let stream = JsonArrayStream {
+                            inner: ReceiverStream::new(result_rx),
+                            _read_task: read_task,
+                            _parse_task: parse_task,
+                        };
+
+                        Ok(stream.map(|r| r.map_err(Into::into)).boxed())
+                    }
                 }
             }
         }))
@@ -280,3 +516,307 @@ pub async fn plan_to_json(
 
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use bytes::Bytes;
+    use datafusion_datasource::FileRange;
+    use futures::TryStreamExt;
+    use object_store::memory::InMemory;
+    use object_store::path::Path;
+    use object_store::{ObjectStoreExt, PutPayload};
+
+    /// Helper to create a test schema
+    fn test_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, true),
+            Field::new("name", DataType::Utf8, true),
+        ]))
+    }
+
+    #[tokio::test]
+    async fn test_json_array_from_file() -> Result<()> {
+        // Test reading JSON array format from a file
+        let json_data = r#"[{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]"#;
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test.json");
+        store
+            .put(&path, PutPayload::from_static(json_data.as_bytes()))
+            .await?;
+
+        let opener = JsonOpener::new(
+            1024,
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false, // JSON array format
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_from_stream() -> Result<()> {
+        // Test reading JSON array format from object store stream (simulates S3)
+        let json_data = r#"[{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}, {"id": 3, "name": "charlie"}]"#;
+
+        // Use InMemory store which returns Stream payload
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test_stream.json");
+        store
+            .put(&path, PutPayload::from_static(json_data.as_bytes()))
+            .await?;
+
+        let opener = JsonOpener::new(
+            2, // small batch size to test multiple batches
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false, // JSON array format
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 3);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_nested_objects() -> Result<()> {
+        // Test JSON array with nested objects and arrays
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int64, true),
+            Field::new("data", DataType::Utf8, true),
+        ]));
+
+        let json_data = r#"[
+            {"id": 1, "data": "{\"nested\": true}"},
+            {"id": 2, "data": "[1, 2, 3]"}
+        ]"#;
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("nested.json");
+        store
+            .put(&path, PutPayload::from_static(json_data.as_bytes()))
+            .await?;
+
+        let opener = JsonOpener::new(
+            1024,
+            schema,
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false,
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        assert_eq!(batches[0].num_rows(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_empty() -> Result<()> {
+        // Test empty JSON array
+        let json_data = "[]";
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("empty.json");
+        store
+            .put(&path, PutPayload::from_static(json_data.as_bytes()))
+            .await?;
+
+        let opener = JsonOpener::new(
+            1024,
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false,
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_range_not_supported() {
+        // Test that range-based scanning returns error for JSON array format
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test.json");
+        store
+            .put(&path, PutPayload::from_static(b"[]"))
+            .await
+            .unwrap();
+
+        let opener = JsonOpener::new(
+            1024,
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false, // JSON array format
+        );
+
+        let meta = store.head(&path).await.unwrap();
+        let mut file = PartitionedFile::new(path.to_string(), meta.size);
+        file.range = Some(FileRange { start: 0, end: 10 });
+
+        let result = opener.open(file);
+        match result {
+            Ok(_) => panic!("Expected error for range-based JSON array scanning"),
+            Err(e) => {
+                assert!(
+                    e.to_string().contains("does not support range-based"),
+                    "Unexpected error message: {e}"
+                );
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_ndjson_still_works() -> Result<()> {
+        // Ensure NDJSON format still works correctly
+        let json_data =
+            "{\"id\": 1, \"name\": \"alice\"}\n{\"id\": 2, \"name\": \"bob\"}\n";
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("test.ndjson");
+        store
+            .put(&path, PutPayload::from_static(json_data.as_bytes()))
+            .await?;
+
+        let opener = JsonOpener::new(
+            1024,
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            true, // NDJSON format
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_large_file() -> Result<()> {
+        // Test with a larger JSON array to verify streaming works
+        let mut json_data = String::from("[");
+        for i in 0..1000 {
+            if i > 0 {
+                json_data.push(',');
+            }
+            json_data.push_str(&format!(r#"{{"id": {i}, "name": "user{i}"}}"#));
+        }
+        json_data.push(']');
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("large.json");
+        store
+            .put(&path, PutPayload::from(Bytes::from(json_data)))
+            .await?;
+
+        let opener = JsonOpener::new(
+            100, // batch size of 100
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false,
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let stream = opener.open(file)?.await?;
+        let batches: Vec<_> = stream.try_collect().await?;
+
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 1000);
+
+        // Should have multiple batches due to batch_size=100
+        assert!(batches.len() >= 10);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_json_array_stream_cancellation() -> Result<()> {
+        // Test that cancellation works correctly (tasks are aborted when stream is dropped)
+        let mut json_data = String::from("[");
+        for i in 0..10000 {
+            if i > 0 {
+                json_data.push(',');
+            }
+            json_data.push_str(&format!(r#"{{"id": {i}, "name": "user{i}"}}"#));
+        }
+        json_data.push(']');
+
+        let store = Arc::new(InMemory::new());
+        let path = Path::from("cancel_test.json");
+        store
+            .put(&path, PutPayload::from(Bytes::from(json_data)))
+            .await?;
+
+        let opener = JsonOpener::new(
+            10, // small batch size
+            test_schema(),
+            FileCompressionType::UNCOMPRESSED,
+            store.clone(),
+            false,
+        );
+
+        let meta = store.head(&path).await?;
+        let file = PartitionedFile::new(path.to_string(), meta.size);
+
+        let mut stream = opener.open(file)?.await?;
+
+        // Read only first batch, then drop the stream (simulating cancellation)
+        let first_batch = stream.next().await;
+        assert!(first_batch.is_some());
+
+        // Drop the stream - this should abort the spawned tasks via SpawnedTask's Drop
+        drop(stream);
+
+        // Give tasks time to be aborted
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        // If we reach here without hanging, cancellation worked
+        Ok(())
+    }
+}
diff --git a/datafusion/datasource-json/src/utils.rs b/datafusion/datasource-json/src/utils.rs
new file mode 100644
index 0000000000000..bc75799edff73
--- /dev/null
+++ b/datafusion/datasource-json/src/utils.rs
@@ -0,0 +1,778 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utility types for JSON processing
+
+use std::io::{BufRead, Read};
+
+use bytes::Bytes;
+
+// ============================================================================
+// JsonArrayToNdjsonReader - Streaming JSON Array to NDJSON Converter
+// ============================================================================
+//
+// Architecture:
+//
+// ```text
+// ┌─────────────────────────────────────────────────────────────┐
+// │  JSON Array File (potentially very large, e.g. 33GB)       │
+// │  [{"a":1}, {"a":2}, {"a":3}, ...... {"a":1000000}]         │
+// └─────────────────────────────────────────────────────────────┘
+//                           │
+//                           ▼ read chunks via ChannelReader
+//                 ┌───────────────────┐
+//                 │ JsonArrayToNdjson │  ← character substitution only:
+//                 │      Reader       │    '[' skip, ',' → '\n', ']' stop
+//                 └───────────────────┘
+//                           │
+//                           ▼ outputs NDJSON format
+//                 ┌───────────────────┐
+//                 │   Arrow Reader    │  ← internal buffer, batch parsing
+//                 │  batch_size=8192  │
+//                 └───────────────────┘
+//                           │
+//                           ▼ outputs RecordBatch
+//                 ┌───────────────────┐
+//                 │   RecordBatch     │
+//                 └───────────────────┘
+// ```
+//
+// Memory Efficiency:
+//
+// | Approach                              | Memory for 33GB file | Parse count |
+// |---------------------------------------|----------------------|-------------|
+// | Load entire file + serde_json         | ~100GB+              | 3x          |
+// | Streaming with JsonArrayToNdjsonReader| ~32MB (configurable) | 1x          |
+//
+// Design Note:
+//
+// This implementation uses `inner: R` directly (not `BufReader<R>`) and manages
+// its own input buffer. This is critical for compatibility with `SyncIoBridge`
+// and `ChannelReader` in `spawn_blocking` contexts.
+//
+
+/// Default buffer size for JsonArrayToNdjsonReader (2MB for better throughput)
+const DEFAULT_BUF_SIZE: usize = 2 * 1024 * 1024;
+
+/// Parser state for JSON array streaming
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum JsonArrayState {
+    /// Initial state, looking for opening '['
+    Start,
+    /// Inside the JSON array, processing objects
+    InArray,
+    /// Reached the closing ']', finished
+    Done,
+}
+
+/// A streaming reader that converts JSON array format to NDJSON format.
+///
+/// This reader wraps an underlying reader containing JSON array data
+/// `[{...}, {...}, ...]` and transforms it on-the-fly to newline-delimited
+/// JSON format that Arrow's JSON reader can process.
+///
+/// Implements both `Read` and `BufRead` traits for compatibility with Arrow's
+/// `ReaderBuilder::build()` which requires `BufRead`.
+///
+/// # Transformation Rules
+///
+/// - Skip leading `[` and whitespace before it
+/// - Convert top-level `,` (between objects) to `\n`
+/// - Skip whitespace at top level (between objects)
+/// - Stop at trailing `]`
+/// - Preserve everything inside objects (including nested `[`, `]`, `,`)
+/// - Properly handle strings (ignore special chars inside quotes)
+///
+/// # Example
+///
+/// ```text
+/// Input:  [{"a":1}, {"b":[1,2]}, {"c":"x,y"}]
+/// Output: {"a":1}
+///         {"b":[1,2]}
+///         {"c":"x,y"}
+/// ```
+pub struct JsonArrayToNdjsonReader<R: Read> {
+    /// Inner reader - we use R directly (not `BufReader<R>`) for SyncIoBridge compatibility
+    inner: R,
+    state: JsonArrayState,
+    /// Tracks nesting depth of `{` and `[` to identify top-level commas
+    depth: i32,
+    /// Whether we're currently inside a JSON string
+    in_string: bool,
+    /// Whether the next character is escaped (after `\`)
+    escape_next: bool,
+    /// Input buffer - stores raw bytes read from inner reader
+    input_buffer: Vec<u8>,
+    /// Current read position in input buffer
+    input_pos: usize,
+    /// Number of valid bytes in input buffer
+    input_filled: usize,
+    /// Output buffer - stores transformed NDJSON bytes
+    output_buffer: Vec<u8>,
+    /// Current read position in output buffer
+    output_pos: usize,
+    /// Number of valid bytes in output buffer
+    output_filled: usize,
+    /// Whether trailing non-whitespace content was detected after ']'
+    has_trailing_content: bool,
+    /// Whether leading non-whitespace content was detected before '['
+    has_leading_content: bool,
+}
+
+impl<R: Read> JsonArrayToNdjsonReader<R> {
+    /// Create a new streaming reader that converts JSON array to NDJSON.
+    pub fn new(reader: R) -> Self {
+        Self::with_capacity(reader, DEFAULT_BUF_SIZE)
+    }
+
+    /// Create a new streaming reader with custom buffer size.
+    ///
+    /// Larger buffers improve throughput but use more memory.
+    /// Total memory usage is approximately 2 * capacity (input + output buffers).
+    pub fn with_capacity(reader: R, capacity: usize) -> Self {
+        Self {
+            inner: reader,
+            state: JsonArrayState::Start,
+            depth: 0,
+            in_string: false,
+            escape_next: false,
+            input_buffer: vec![0; capacity],
+            input_pos: 0,
+            input_filled: 0,
+            output_buffer: vec![0; capacity],
+            output_pos: 0,
+            output_filled: 0,
+            has_trailing_content: false,
+            has_leading_content: false,
+        }
+    }
+
+    /// Check if the JSON array was properly terminated.
+    ///
+    /// This should be called after all data has been read.
+    ///
+    /// Returns an error if:
+    /// - Unbalanced braces/brackets (depth != 0)
+    /// - Unterminated string
+    /// - Missing closing `]`
+    /// - Unexpected trailing content after `]`
+    pub fn validate_complete(&self) -> std::io::Result<()> {
+        if self.has_leading_content {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "Malformed JSON: unexpected leading content before '['",
+            ));
+        }
+        if self.depth != 0 {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "Malformed JSON array: unbalanced braces or brackets",
+            ));
+        }
+        if self.in_string {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "Malformed JSON array: unterminated string",
+            ));
+        }
+        if self.state != JsonArrayState::Done {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "Incomplete JSON array: expected closing bracket ']'",
+            ));
+        }
+        if self.has_trailing_content {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "Malformed JSON: unexpected trailing content after ']'",
+            ));
+        }
+        Ok(())
+    }
+
+    /// Process a single byte and return the transformed byte (if any)
+    #[inline]
+    fn process_byte(&mut self, byte: u8) -> Option<u8> {
+        match self.state {
+            JsonArrayState::Start => {
+                // Looking for the opening '[', skip whitespace
+                if byte == b'[' {
+                    self.state = JsonArrayState::InArray;
+                } else if !byte.is_ascii_whitespace() {
+                    self.has_leading_content = true;
+                }
+                None
+            }
+            JsonArrayState::InArray => {
+                // Handle escape sequences in strings
+                if self.escape_next {
+                    self.escape_next = false;
+                    return Some(byte);
+                }
+
+                if self.in_string {
+                    // Inside a string: handle escape and closing quote
+                    match byte {
+                        b'\\' => self.escape_next = true,
+                        b'"' => self.in_string = false,
+                        _ => {}
+                    }
+                    Some(byte)
+                } else {
+                    // Outside strings: track depth and transform
+                    match byte {
+                        b'"' => {
+                            self.in_string = true;
+                            Some(byte)
+                        }
+                        b'{' | b'[' => {
+                            self.depth += 1;
+                            Some(byte)
+                        }
+                        b'}' => {
+                            self.depth -= 1;
+                            Some(byte)
+                        }
+                        b']' => {
+                            if self.depth == 0 {
+                                // Top-level ']' means end of array
+                                self.state = JsonArrayState::Done;
+                                None
+                            } else {
+                                // Nested ']' inside an object
+                                self.depth -= 1;
+                                Some(byte)
+                            }
+                        }
+                        b',' if self.depth == 0 => {
+                            // Top-level comma between objects → newline
+                            Some(b'\n')
+                        }
+                        _ => {
+                            // At depth 0, skip whitespace between objects
+                            if self.depth == 0 && byte.is_ascii_whitespace() {
+                                None
+                            } else {
+                                Some(byte)
+                            }
+                        }
+                    }
+                }
+            }
+            JsonArrayState::Done => {
+                // After ']', check for non-whitespace trailing content
+                if !byte.is_ascii_whitespace() {
+                    self.has_trailing_content = true;
+                }
+                None
+            }
+        }
+    }
+
+    /// Refill input buffer from inner reader if needed.
+    /// Returns true if there's data available, false on EOF.
+    fn refill_input_if_needed(&mut self) -> std::io::Result<bool> {
+        if self.input_pos >= self.input_filled {
+            // Input buffer exhausted, read more from inner
+            let bytes_read = self.inner.read(&mut self.input_buffer)?;
+            if bytes_read == 0 {
+                return Ok(false); // EOF
+            }
+            self.input_pos = 0;
+            self.input_filled = bytes_read;
+        }
+        Ok(true)
+    }
+
+    /// Fill the output buffer with transformed data.
+    ///
+    /// This method manages its own input buffer, reading from the inner reader
+    /// as needed. When the output buffer is full, we stop processing but preserve
+    /// the current position in the input buffer for the next call.
+    fn fill_output_buffer(&mut self) -> std::io::Result<()> {
+        let mut write_pos = 0;
+
+        while write_pos < self.output_buffer.len() {
+            // Refill input buffer if exhausted
+            if !self.refill_input_if_needed()? {
+                break; // EOF
+            }
+
+            // Process bytes from input buffer
+            while self.input_pos < self.input_filled
+                && write_pos < self.output_buffer.len()
+            {
+                let byte = self.input_buffer[self.input_pos];
+                self.input_pos += 1;
+
+                if let Some(transformed) = self.process_byte(byte) {
+                    self.output_buffer[write_pos] = transformed;
+                    write_pos += 1;
+                }
+            }
+        }
+
+        self.output_pos = 0;
+        self.output_filled = write_pos;
+        Ok(())
+    }
+}
+
+impl<R: Read> Read for JsonArrayToNdjsonReader<R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        // If output buffer is empty, fill it
+        if self.output_pos >= self.output_filled {
+            self.fill_output_buffer()?;
+            if self.output_filled == 0 {
+                return Ok(0); // EOF
+            }
+        }
+
+        // Copy from output buffer to caller's buffer
+        let available = self.output_filled - self.output_pos;
+        let to_copy = std::cmp::min(available, buf.len());
+        buf[..to_copy].copy_from_slice(
+            &self.output_buffer[self.output_pos..self.output_pos + to_copy],
+        );
+        self.output_pos += to_copy;
+        Ok(to_copy)
+    }
+}
+
+impl<R: Read> BufRead for JsonArrayToNdjsonReader<R> {
+    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+        if self.output_pos >= self.output_filled {
+            self.fill_output_buffer()?;
+        }
+        Ok(&self.output_buffer[self.output_pos..self.output_filled])
+    }
+
+    fn consume(&mut self, amt: usize) {
+        self.output_pos = std::cmp::min(self.output_pos + amt, self.output_filled);
+    }
+}
+
+// ============================================================================
+// ChannelReader - Sync reader that receives bytes from async channel
+// ============================================================================
+//
+// Architecture:
+//
+// ```text
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │                         S3 / MinIO (async)                              │
+// │                    (33GB JSON Array File)                               │
+// └─────────────────────────────────────────────────────────────────────────┘
+//                                 │
+//                                 ▼ async stream (Bytes chunks)
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │                      Async Task (tokio runtime)                         │
+// │              while let Some(chunk) = stream.next().await                │
+// │                     byte_tx.send(chunk)                                 │
+// └─────────────────────────────────────────────────────────────────────────┘
+//                                 │
+//                                 ▼ tokio::sync::mpsc::channel<Bytes>
+//                                 │   (bounded, ~32MB buffer)
+//                                 ▼
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │                   Blocking Task (spawn_blocking)                        │
+// │  ┌──────────────┐   ┌────────────────────────┐   ┌──────────────────┐  │
+// │  │ChannelReader │ → │JsonArrayToNdjsonReader │ → │ Arrow JsonReader │  │
+// │  │   (Read)     │   │  [{},...] → {}\n{}     │   │  (RecordBatch)   │  │
+// │  └──────────────┘   └────────────────────────┘   └──────────────────┘  │
+// └─────────────────────────────────────────────────────────────────────────┘
+//                                 │
+//                                 ▼ tokio::sync::mpsc::channel<RecordBatch>
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │                      ReceiverStream (async)                             │
+// │                   → DataFusion execution engine                         │
+// └─────────────────────────────────────────────────────────────────────────┘
+// ```
+//
+// Memory Budget (~32MB total):
+// - sync_channel buffer: 128 chunks × ~128KB = ~16MB
+// - JsonArrayToNdjsonReader: 2 × 2MB = 4MB
+// - Arrow JsonReader internal: ~8MB
+// - Miscellaneous: ~4MB
+//
+
+/// A synchronous `Read` implementation that receives bytes from an async channel.
+///
+/// This enables true streaming between async and sync contexts without
+/// loading the entire file into memory. Uses `tokio::sync::mpsc::Receiver`
+/// with `blocking_recv()` so the async producer never blocks a tokio worker
+/// thread, while the sync consumer (running in `spawn_blocking`) safely blocks.
+pub struct ChannelReader {
+    rx: tokio::sync::mpsc::Receiver<Bytes>,
+    current: Option<Bytes>,
+    pos: usize,
+}
+
+impl ChannelReader {
+    /// Create a new ChannelReader from a tokio mpsc receiver.
+    pub fn new(rx: tokio::sync::mpsc::Receiver<Bytes>) -> Self {
+        Self {
+            rx,
+            current: None,
+            pos: 0,
+        }
+    }
+}
+
+impl Read for ChannelReader {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        loop {
+            // If we have current chunk with remaining data, read from it
+            if let Some(ref chunk) = self.current {
+                let remaining = chunk.len() - self.pos;
+                if remaining > 0 {
+                    let to_copy = std::cmp::min(remaining, buf.len());
+                    buf[..to_copy].copy_from_slice(&chunk[self.pos..self.pos + to_copy]);
+                    self.pos += to_copy;
+                    return Ok(to_copy);
+                }
+            }
+
+            // Current chunk exhausted, get next from channel
+            match self.rx.blocking_recv() {
+                Some(bytes) => {
+                    self.current = Some(bytes);
+                    self.pos = 0;
+                    // Loop back to read from new chunk
+                }
+                None => return Ok(0), // Channel closed = EOF
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_json_array_to_ndjson_simple() {
+        let input = r#"[{"a":1}, {"a":2}, {"a":3}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":1}\n{\"a\":2}\n{\"a\":3}");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_nested() {
+        let input = r#"[{"a":{"b":1}}, {"c":[1,2,3]}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":{\"b\":1}}\n{\"c\":[1,2,3]}");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_strings_with_special_chars() {
+        let input = r#"[{"a":"[1,2]"}, {"b":"x,y"}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":\"[1,2]\"}\n{\"b\":\"x,y\"}");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_escaped_quotes() {
+        let input = r#"[{"a":"say \"hello\""}, {"b":1}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":\"say \\\"hello\\\"\"}\n{\"b\":1}");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_empty() {
+        let input = r#"[]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_single_element() {
+        let input = r#"[{"a":1}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":1}");
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_bufread() {
+        let input = r#"[{"a":1}, {"a":2}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+
+        let buf = reader.fill_buf().unwrap();
+        assert!(!buf.is_empty());
+
+        let first_len = buf.len();
+        reader.consume(first_len);
+
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+    }
+
+    #[test]
+    fn test_json_array_to_ndjson_whitespace() {
+        let input = r#"  [  {"a":1}  ,  {"a":2}  ]  "#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        // Top-level whitespace is skipped, internal whitespace preserved
+        assert_eq!(output, "{\"a\":1}\n{\"a\":2}");
+    }
+
+    #[test]
+    fn test_validate_complete_valid_json() {
+        let valid_json = r#"[{"a":1},{"a":2}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(valid_json.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        reader.validate_complete().unwrap();
+    }
+
+    #[test]
+    fn test_json_array_with_trailing_junk() {
+        let input = r#" [ {"a":1} , {"a":2} ] some { junk [ here ] "#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        // Should extract the valid array content
+        assert_eq!(output, "{\"a\":1}\n{\"a\":2}");
+
+        // But validation should catch the trailing junk
+        let result = reader.validate_complete();
+        assert!(result.is_err());
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("trailing content")
+                || err_msg.contains("Unexpected trailing"),
+            "Expected trailing content error, got: {err_msg}"
+        );
+    }
+
+    #[test]
+    fn test_validate_complete_incomplete_array() {
+        let invalid_json = r#"[{"a":1},{"a":2}"#; // Missing closing ]
+        let mut reader = JsonArrayToNdjsonReader::new(invalid_json.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        let result = reader.validate_complete();
+        assert!(result.is_err());
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("expected closing bracket")
+                || err_msg.contains("missing closing"),
+            "Expected missing bracket error, got: {err_msg}"
+        );
+    }
+
+    #[test]
+    fn test_validate_complete_unbalanced_braces() {
+        let invalid_json = r#"[{"a":1},{"a":2]"#; // Wrong closing bracket
+        let mut reader = JsonArrayToNdjsonReader::new(invalid_json.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        let result = reader.validate_complete();
+        assert!(result.is_err());
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("unbalanced")
+                || err_msg.contains("expected closing bracket"),
+            "Expected unbalanced or missing bracket error, got: {err_msg}"
+        );
+    }
+
+    #[test]
+    fn test_json_array_with_leading_junk() {
+        let input = r#"junk[{"a":1}, {"a":2}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        // Should still extract the valid array content
+        assert_eq!(output, "{\"a\":1}\n{\"a\":2}");
+
+        // But validation should catch the leading junk
+        let result = reader.validate_complete();
+        assert!(result.is_err());
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("leading content"),
+            "Expected leading content error, got: {err_msg}"
+        );
+    }
+
+    #[test]
+    fn test_json_array_with_leading_whitespace_ok() {
+        let input = r#"
+  [{"a":1}, {"a":2}]"#;
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+        assert_eq!(output, "{\"a\":1}\n{\"a\":2}");
+
+        // Leading whitespace should be fine
+        reader.validate_complete().unwrap();
+    }
+
+    #[test]
+    fn test_validate_complete_valid_with_trailing_whitespace() {
+        let input = r#"[{"a":1},{"a":2}]
+    "#; // Trailing whitespace is OK
+        let mut reader = JsonArrayToNdjsonReader::new(input.as_bytes());
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        // Whitespace after ] should be allowed
+        reader.validate_complete().unwrap();
+    }
+
+    /// Test that data is not lost at buffer boundaries.
+    ///
+    /// This test creates input larger than the internal buffer to verify
+    /// that newline characters are not dropped when they occur at buffer boundaries.
+    #[test]
+    fn test_buffer_boundary_no_data_loss() {
+        // Create objects ~9KB each, so 10 objects = ~90KB
+        let large_value = "x".repeat(9000);
+
+        let mut objects = vec![];
+        for i in 0..10 {
+            objects.push(format!(r#"{{"id":{i},"data":"{large_value}"}}"#));
+        }
+
+        let input = format!("[{}]", objects.join(","));
+
+        // Use small buffer to force multiple fill cycles
+        let mut reader = JsonArrayToNdjsonReader::with_capacity(input.as_bytes(), 8192);
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        // Verify correct number of newlines (9 newlines separate 10 objects)
+        let newline_count = output.matches('\n').count();
+        assert_eq!(
+            newline_count, 9,
+            "Expected 9 newlines separating 10 objects, got {newline_count}"
+        );
+
+        // Verify each line is valid JSON
+        for (i, line) in output.lines().enumerate() {
+            let parsed: Result<serde_json::Value, _> = serde_json::from_str(line);
+            assert!(
+                parsed.is_ok(),
+                "Line {} is not valid JSON: {}...",
+                i,
+                &line[..100.min(line.len())]
+            );
+
+            // Verify the id field matches expected value
+            let value = parsed.unwrap();
+            assert_eq!(
+                value["id"].as_i64(),
+                Some(i as i64),
+                "Object {i} has wrong id"
+            );
+        }
+    }
+
+    /// Test with real-world-like data format (with leading whitespace and newlines)
+    #[test]
+    fn test_real_world_format_large() {
+        let large_value = "x".repeat(8000);
+
+        // Format similar to real files: opening bracket on its own line,
+        // each object indented with 2 spaces
+        let mut objects = vec![];
+        for i in 0..10 {
+            objects.push(format!(r#"  {{"id":{i},"data":"{large_value}"}}"#));
+        }
+
+        let input = format!("[\n{}\n]", objects.join(",\n"));
+
+        let mut reader = JsonArrayToNdjsonReader::with_capacity(input.as_bytes(), 8192);
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        let lines: Vec<&str> = output.lines().collect();
+        assert_eq!(lines.len(), 10, "Expected 10 objects");
+
+        for (i, line) in lines.iter().enumerate() {
+            assert!(
+                line.starts_with("{\"id\""),
+                "Line {} should start with object, got: {}...",
+                i,
+                &line[..50.min(line.len())]
+            );
+        }
+    }
+
+    /// Test ChannelReader
+    #[test]
+    fn test_channel_reader() {
+        let (tx, rx) = tokio::sync::mpsc::channel(4);
+
+        // Send some chunks (try_send is non-async)
+        tx.try_send(Bytes::from("Hello, ")).unwrap();
+        tx.try_send(Bytes::from("World!")).unwrap();
+        drop(tx); // Close channel
+
+        let mut reader = ChannelReader::new(rx);
+        let mut output = String::new();
+        reader.read_to_string(&mut output).unwrap();
+
+        assert_eq!(output, "Hello, World!");
+    }
+
+    /// Test ChannelReader with small reads
+    #[test]
+    fn test_channel_reader_small_reads() {
+        let (tx, rx) = tokio::sync::mpsc::channel(4);
+
+        tx.try_send(Bytes::from("ABCDEFGHIJ")).unwrap();
+        drop(tx);
+
+        let mut reader = ChannelReader::new(rx);
+        let mut buf = [0u8; 3];
+
+        // Read in small chunks
+        assert_eq!(reader.read(&mut buf).unwrap(), 3);
+        assert_eq!(&buf, b"ABC");
+
+        assert_eq!(reader.read(&mut buf).unwrap(), 3);
+        assert_eq!(&buf, b"DEF");
+
+        assert_eq!(reader.read(&mut buf).unwrap(), 3);
+        assert_eq!(&buf, b"GHI");
+
+        assert_eq!(reader.read(&mut buf).unwrap(), 1);
+        assert_eq!(&buf[..1], b"J");
+
+        // EOF
+        assert_eq!(reader.read(&mut buf).unwrap(), 0);
+    }
+}
diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml
index b6a548c998dc2..a5855af17a536 100644
--- a/datafusion/datasource-parquet/Cargo.toml
+++ b/datafusion/datasource-parquet/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-datasource-parquet"
 description = "datafusion-datasource-parquet"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -34,17 +34,18 @@ all-features = true
 arrow = { workspace = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
-datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true, features = ["object_store", "parquet"] }
 datafusion-common-runtime = { workspace = true }
-datafusion-datasource = { workspace = true, features = ["parquet"] }
+datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-functions-aggregate = { workspace = true }
+datafusion-functions = { workspace = true }
+datafusion-functions-aggregate-common = { workspace = true }
 datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-adapter = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
-datafusion-physical-optimizer = { workspace = true }
 datafusion-physical-plan = { workspace = true }
+datafusion-pruning = { workspace = true }
 datafusion-session = { workspace = true }
 futures = { workspace = true }
 itertools = { workspace = true }
@@ -52,15 +53,36 @@ log = { workspace = true }
 object_store = { workspace = true }
 parking_lot = { workspace = true }
 parquet = { workspace = true }
-rand = { workspace = true }
 tokio = { workspace = true }
 
 [dev-dependencies]
 chrono = { workspace = true }
+criterion = { workspace = true }
+datafusion-functions = { workspace = true }
+datafusion-functions-nested = { workspace = true }
+tempfile = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [lib]
 name = "datafusion_datasource_parquet"
 path = "src/mod.rs"
+
+[features]
+parquet_encryption = [
+    "parquet/encryption",
+    "datafusion-common/parquet_encryption",
+    "datafusion-execution/parquet_encryption",
+]
+
+[[bench]]
+name = "parquet_nested_filter_pushdown"
+harness = false
+
+[[bench]]
+name = "parquet_struct_filter_pushdown"
+harness = false
diff --git a/datafusion/datasource-parquet/README.md b/datafusion/datasource-parquet/README.md
index abcdd5ab13402..833fc74a258b3 100644
--- a/datafusion/datasource-parquet/README.md
+++ b/datafusion/datasource-parquet/README.md
@@ -17,10 +17,17 @@
   under the License.
 -->
 
-# DataFusion datasource
+# Apache DataFusion Parquet DataSource
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate is a submodule of DataFusion that defines a Parquet based file source.
+This crate is a submodule of DataFusion that defines an [Apache Parquet] based file source.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[apache parquet]: https://parquet.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs
new file mode 100644
index 0000000000000..02137b5a1d288
--- /dev/null
+++ b/datafusion/datasource-parquet/benches/parquet_nested_filter_pushdown.rs
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, LazyLock};
+
+use arrow::array::{
+    BinaryBuilder, BooleanArray, ListBuilder, RecordBatch, StringBuilder,
+};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter};
+use datafusion_expr::{Expr, col};
+use datafusion_functions_nested::expr_fn::array_has;
+use datafusion_physical_expr::planner::logical2physical;
+use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::{ArrowWriter, ProjectionMask};
+use parquet::file::properties::WriterProperties;
+use tempfile::TempDir;
+
+const ROW_GROUP_ROW_COUNT: usize = 10_000;
+const TOTAL_ROW_GROUPS: usize = 10;
+const TOTAL_ROWS: usize = ROW_GROUP_ROW_COUNT * TOTAL_ROW_GROUPS;
+const TARGET_VALUE: &str = "target_value";
+const COLUMN_NAME: &str = "list_col";
+const PAYLOAD_COLUMN_NAME: &str = "payload";
+// Large binary payload to emphasize decoding overhead when pushdown is disabled.
+const PAYLOAD_BYTES: usize = 8 * 1024;
+
+struct BenchmarkDataset {
+    _tempdir: TempDir,
+    file_path: PathBuf,
+}
+
+impl BenchmarkDataset {
+    fn path(&self) -> &Path {
+        &self.file_path
+    }
+}
+
+static DATASET: LazyLock<BenchmarkDataset> = LazyLock::new(|| {
+    create_dataset().expect("failed to prepare parquet benchmark dataset")
+});
+
+fn parquet_nested_filter_pushdown(c: &mut Criterion) {
+    let dataset_path = DATASET.path().to_owned();
+    let mut group = c.benchmark_group("parquet_nested_filter_pushdown");
+    group.throughput(Throughput::Elements(TOTAL_ROWS as u64));
+
+    group.bench_function("no_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&create_predicate(), &file_schema);
+        b.iter(|| {
+            let matched = scan_with_predicate(&dataset_path, &predicate, false)
+                .expect("baseline parquet scan with filter succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    group.bench_function("with_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&create_predicate(), &file_schema);
+        b.iter(|| {
+            let matched = scan_with_predicate(&dataset_path, &predicate, true)
+                .expect("pushdown parquet scan with filter succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    group.finish();
+}
+
+fn setup_reader(path: &Path) -> SchemaRef {
+    let file = std::fs::File::open(path).expect("failed to open file");
+    let builder =
+        ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader");
+    Arc::clone(builder.schema())
+}
+
+fn create_predicate() -> Expr {
+    array_has(
+        col(COLUMN_NAME),
+        Expr::Literal(ScalarValue::Utf8(Some(TARGET_VALUE.to_string())), None),
+    )
+}
+
+fn scan_with_predicate(
+    path: &Path,
+    predicate: &Arc<dyn datafusion_physical_expr::PhysicalExpr>,
+    pushdown: bool,
+) -> datafusion_common::Result<usize> {
+    let file = std::fs::File::open(path)?;
+    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+    let metadata = builder.metadata().clone();
+    let file_schema = builder.schema();
+    let projection = ProjectionMask::all();
+
+    let metrics = ExecutionPlanMetricsSet::new();
+    let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics);
+
+    let builder = if pushdown {
+        if let Some(row_filter) =
+            build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)?
+        {
+            builder.with_row_filter(row_filter)
+        } else {
+            builder
+        }
+    } else {
+        builder
+    };
+
+    let reader = builder.with_projection(projection).build()?;
+
+    let mut matched_rows = 0usize;
+    for batch in reader {
+        let batch = batch?;
+        matched_rows += count_matches(predicate, &batch)?;
+    }
+
+    if pushdown {
+        let pruned_rows = file_metrics.pushdown_rows_pruned.value();
+        assert_eq!(
+            pruned_rows,
+            TOTAL_ROWS - matched_rows,
+            "row-level pushdown should prune 90% of rows"
+        );
+    }
+
+    Ok(matched_rows)
+}
+
+fn count_matches(
+    expr: &Arc<dyn datafusion_physical_expr::PhysicalExpr>,
+    batch: &RecordBatch,
+) -> datafusion_common::Result<usize> {
+    let values = expr.evaluate(batch)?.into_array(batch.num_rows())?;
+    let bools = values
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .expect("boolean filter result");
+
+    Ok(bools.iter().filter(|v| matches!(v, Some(true))).count())
+}
+
+fn create_dataset() -> datafusion_common::Result<BenchmarkDataset> {
+    let tempdir = TempDir::new()?;
+    let file_path = tempdir.path().join("nested_lists.parquet");
+
+    let field = Arc::new(Field::new("item", DataType::Utf8, true));
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(COLUMN_NAME, DataType::List(field), false),
+        Field::new(PAYLOAD_COLUMN_NAME, DataType::Binary, false),
+    ]));
+
+    let writer_props = WriterProperties::builder()
+        .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
+        .build();
+
+    let mut writer = ArrowWriter::try_new(
+        std::fs::File::create(&file_path)?,
+        Arc::clone(&schema),
+        Some(writer_props),
+    )?;
+
+    // Create sorted row groups with distinct values so that min/max statistics
+    // allow skipping most groups when applying a selective predicate.
+    let sorted_values = [
+        "alpha",
+        "bravo",
+        "charlie",
+        "delta",
+        "echo",
+        "foxtrot",
+        "golf",
+        "hotel",
+        "india",
+        TARGET_VALUE,
+    ];
+
+    for value in sorted_values {
+        let batch = build_list_batch(&schema, value, ROW_GROUP_ROW_COUNT)?;
+        writer.write(&batch)?;
+    }
+
+    writer.close()?;
+
+    // Ensure the writer respected the requested row group size
+    let reader =
+        ParquetRecordBatchReaderBuilder::try_new(std::fs::File::open(&file_path)?)?;
+    assert_eq!(reader.metadata().row_groups().len(), TOTAL_ROW_GROUPS);
+
+    Ok(BenchmarkDataset {
+        _tempdir: tempdir,
+        file_path,
+    })
+}
+
+fn build_list_batch(
+    schema: &SchemaRef,
+    value: &str,
+    len: usize,
+) -> datafusion_common::Result<RecordBatch> {
+    let mut builder = ListBuilder::new(StringBuilder::new());
+    let mut payload_builder = BinaryBuilder::new();
+    let payload = vec![1u8; PAYLOAD_BYTES];
+    for _ in 0..len {
+        builder.values().append_value(value);
+        builder.append(true);
+        payload_builder.append_value(&payload);
+    }
+
+    let array = builder.finish();
+    let payload_array = payload_builder.finish();
+    Ok(RecordBatch::try_new(
+        Arc::clone(schema),
+        vec![Arc::new(array), Arc::new(payload_array)],
+    )?)
+}
+
+criterion_group!(benches, parquet_nested_filter_pushdown);
+criterion_main!(benches);
diff --git a/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs
new file mode 100644
index 0000000000000..b52408d4222d8
--- /dev/null
+++ b/datafusion/datasource-parquet/benches/parquet_struct_filter_pushdown.rs
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for struct field filter pushdown in Parquet.
+//!
+//! Compares scanning with vs without row-level filter pushdown for
+//! predicates on struct sub-fields (e.g. `get_field(s, 'id') = 42`).
+//!
+//! The dataset schema (in SQL-like notation):
+//!
+//! ```sql
+//! CREATE TABLE t (
+//!     id       INT,          -- top-level id, useful for correctness checks
+//!     large_string TEXT,     -- wide column so SELECT * is expensive
+//!     s STRUCT<
+//!         id: INT,           -- mirrors top-level id
+//!         large_string: TEXT -- wide sub-field; pushdown with proper projection
+//!                            -- should avoid reading this when filtering on s.id
+//!     >
+//! );
+//! ```
+//!
+//! Benchmark queries:
+//!
+//! 1. `SELECT * FROM t WHERE get_field(s, 'id') = 42`
+//!     - no pushdown vs. row-level filter pushdown
+//! 2. `SELECT * FROM t WHERE get_field(s, 'id') = id`
+//!     - cross-column predicate; no pushdown vs. row-level filter pushdown
+//! 3. `SELECT id FROM t WHERE get_field(s, 'id') = 42`
+//!     - narrow projection; pushdown should avoid reading s.large_string
+
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, LazyLock};
+
+use arrow::array::{BooleanArray, Int32Array, RecordBatch, StringBuilder, StructArray};
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_datasource_parquet::{ParquetFileMetrics, build_row_filter};
+use datafusion_expr::{Expr, col};
+use datafusion_physical_expr::planner::logical2physical;
+use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::{ArrowWriter, ProjectionMask};
+use parquet::file::properties::WriterProperties;
+use tempfile::TempDir;
+
+const ROW_GROUP_ROW_COUNT: usize = 10_000;
+const TOTAL_ROW_GROUPS: usize = 10;
+const TOTAL_ROWS: usize = ROW_GROUP_ROW_COUNT * TOTAL_ROW_GROUPS;
+/// Only one row group will contain the target value.
+const TARGET_VALUE: i32 = 42;
+const ID_COLUMN_NAME: &str = "id";
+const LARGE_STRING_COLUMN_NAME: &str = "large_string";
+const STRUCT_COLUMN_NAME: &str = "s";
+// Large string payload to emphasize decoding overhead when pushdown is disabled.
+const LARGE_STRING_LEN: usize = 8 * 1024;
+
+struct BenchmarkDataset {
+    _tempdir: TempDir,
+    file_path: PathBuf,
+}
+
+impl BenchmarkDataset {
+    fn path(&self) -> &Path {
+        &self.file_path
+    }
+}
+
+static DATASET: LazyLock<BenchmarkDataset> = LazyLock::new(|| {
+    create_dataset().expect("failed to prepare parquet benchmark dataset")
+});
+
+fn parquet_struct_filter_pushdown(c: &mut Criterion) {
+    let dataset_path = DATASET.path().to_owned();
+    let mut group = c.benchmark_group("parquet_struct_filter_pushdown");
+    group.throughput(Throughput::Elements(TOTAL_ROWS as u64));
+
+    // Scenario 1: SELECT * FROM t WHERE get_field(s, 'id') = 42
+    group.bench_function("select_star/no_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_literal(), &file_schema);
+        b.iter(|| {
+            let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all())
+                .expect("scan succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    group.bench_function("select_star/with_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_literal(), &file_schema);
+        b.iter(|| {
+            let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all())
+                .expect("scan succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    // Scenario 2: SELECT * FROM t WHERE get_field(s, 'id') = id
+    group.bench_function("select_star_cross_col/no_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema);
+        b.iter(|| {
+            let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all())
+                .expect("scan succeeded");
+            assert_eq!(matched, TOTAL_ROWS);
+        });
+    });
+
+    group.bench_function("select_star_cross_col/with_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_top_id(), &file_schema);
+        b.iter(|| {
+            let matched = scan(&dataset_path, &predicate, true, ProjectionMask::all())
+                .expect("scan succeeded");
+            assert_eq!(matched, TOTAL_ROWS);
+        });
+    });
+
+    // Scenario 3: SELECT id FROM t WHERE get_field(s, 'id') = 42
+    group.bench_function("select_id/no_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_literal(), &file_schema);
+        b.iter(|| {
+            // Without pushdown we must read all columns to evaluate the predicate.
+            let matched = scan(&dataset_path, &predicate, false, ProjectionMask::all())
+                .expect("scan succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    group.bench_function("select_id/with_pushdown", |b| {
+        let file_schema = setup_reader(&dataset_path);
+        let predicate = logical2physical(&struct_id_eq_literal(), &file_schema);
+        let id_only = id_projection(&dataset_path);
+        b.iter(|| {
+            // With pushdown the filter runs first, then we only project `id`.
+            let matched = scan(&dataset_path, &predicate, true, id_only.clone())
+                .expect("scan succeeded");
+            assert_eq!(matched, ROW_GROUP_ROW_COUNT);
+        });
+    });
+
+    group.finish();
+}
+
+fn setup_reader(path: &Path) -> SchemaRef {
+    let file = std::fs::File::open(path).expect("failed to open file");
+    let builder =
+        ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader");
+    Arc::clone(builder.schema())
+}
+
+/// `get_field(s, 'id') = TARGET_VALUE`
+fn struct_id_eq_literal() -> Expr {
+    let get_field_expr = datafusion_functions::core::get_field().call(vec![
+        col(STRUCT_COLUMN_NAME),
+        Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None),
+    ]);
+    get_field_expr.eq(Expr::Literal(ScalarValue::Int32(Some(TARGET_VALUE)), None))
+}
+
+/// `get_field(s, 'id') = id`
+fn struct_id_eq_top_id() -> Expr {
+    let get_field_expr = datafusion_functions::core::get_field().call(vec![
+        col(STRUCT_COLUMN_NAME),
+        Expr::Literal(ScalarValue::Utf8(Some("id".to_string())), None),
+    ]);
+    get_field_expr.eq(col(ID_COLUMN_NAME))
+}
+
+/// Build a [`ProjectionMask`] that only reads the top-level `id` leaf column.
+fn id_projection(path: &Path) -> ProjectionMask {
+    let file = std::fs::File::open(path).expect("failed to open file");
+    let builder =
+        ParquetRecordBatchReaderBuilder::try_new(file).expect("failed to build reader");
+    let parquet_schema = builder.metadata().file_metadata().schema_descr_ptr();
+    // Leaf index 0 corresponds to the top-level `id` column.
+    ProjectionMask::leaves(&parquet_schema, [0])
+}
+
+fn scan(
+    path: &Path,
+    predicate: &Arc<dyn datafusion_physical_expr::PhysicalExpr>,
+    pushdown: bool,
+    projection: ProjectionMask,
+) -> datafusion_common::Result<usize> {
+    let file = std::fs::File::open(path)?;
+    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+    let metadata = builder.metadata().clone();
+    let file_schema = builder.schema();
+
+    let metrics = ExecutionPlanMetricsSet::new();
+    let file_metrics = ParquetFileMetrics::new(0, &path.display().to_string(), &metrics);
+
+    let mut filter_applied = false;
+    let builder = if pushdown {
+        if let Some(row_filter) =
+            build_row_filter(predicate, file_schema, &metadata, false, &file_metrics)?
+        {
+            filter_applied = true;
+            builder.with_row_filter(row_filter)
+        } else {
+            builder
+        }
+    } else {
+        builder
+    };
+
+    // Only apply a narrow projection when the filter was actually pushed down.
+    // Otherwise we need all columns to evaluate the predicate manually.
+    let output_projection = if filter_applied {
+        projection
+    } else {
+        ProjectionMask::all()
+    };
+    let reader = builder.with_projection(output_projection).build()?;
+
+    let mut matched_rows = 0usize;
+    for batch in reader {
+        let batch = batch?;
+        if filter_applied {
+            // When the row filter was applied, rows are already filtered.
+            matched_rows += batch.num_rows();
+        } else {
+            matched_rows += count_matches(predicate, &batch)?;
+        }
+    }
+
+    Ok(matched_rows)
+}
+
+fn count_matches(
+    expr: &Arc<dyn datafusion_physical_expr::PhysicalExpr>,
+    batch: &RecordBatch,
+) -> datafusion_common::Result<usize> {
+    let values = expr.evaluate(batch)?.into_array(batch.num_rows())?;
+    let bools = values
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .expect("boolean filter result");
+
+    Ok(bools.iter().filter(|v| matches!(v, Some(true))).count())
+}
+
+fn schema() -> SchemaRef {
+    let struct_fields = Fields::from(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false),
+    ]);
+    Arc::new(Schema::new(vec![
+        Field::new(ID_COLUMN_NAME, DataType::Int32, false),
+        Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false),
+        Field::new(STRUCT_COLUMN_NAME, DataType::Struct(struct_fields), false),
+    ]))
+}
+
+fn create_dataset() -> datafusion_common::Result<BenchmarkDataset> {
+    let tempdir = TempDir::new()?;
+    let file_path = tempdir.path().join("struct_filter.parquet");
+
+    let schema = schema();
+    let writer_props = WriterProperties::builder()
+        .set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
+        .build();
+
+    let mut writer = ArrowWriter::try_new(
+        std::fs::File::create(&file_path)?,
+        Arc::clone(&schema),
+        Some(writer_props),
+    )?;
+
+    // Each row group has a distinct `s.id` value. Only one row group
+    // matches the target, so pushdown should prune 90% of rows.
+    for rg_idx in 0..TOTAL_ROW_GROUPS {
+        let id_value = if rg_idx == TOTAL_ROW_GROUPS - 1 {
+            TARGET_VALUE
+        } else {
+            (rg_idx as i32 + 1) * 1000
+        };
+        let batch = build_struct_batch(&schema, id_value, ROW_GROUP_ROW_COUNT)?;
+        writer.write(&batch)?;
+    }
+
+    writer.close()?;
+
+    let reader =
+        ParquetRecordBatchReaderBuilder::try_new(std::fs::File::open(&file_path)?)?;
+    assert_eq!(reader.metadata().row_groups().len(), TOTAL_ROW_GROUPS);
+
+    Ok(BenchmarkDataset {
+        _tempdir: tempdir,
+        file_path,
+    })
+}
+
+fn build_struct_batch(
+    schema: &SchemaRef,
+    id_value: i32,
+    len: usize,
+) -> datafusion_common::Result<RecordBatch> {
+    let large_string: String = "x".repeat(LARGE_STRING_LEN);
+
+    // Top-level columns
+    let top_id_array = Arc::new(Int32Array::from(vec![id_value; len]));
+    let mut top_string_builder = StringBuilder::new();
+    for _ in 0..len {
+        top_string_builder.append_value(&large_string);
+    }
+    let top_string_array = Arc::new(top_string_builder.finish());
+
+    // Struct sub-fields: s.id mirrors top-level id, s.large_string is the same payload
+    let struct_id_array = Arc::new(Int32Array::from(vec![id_value; len]));
+    let mut struct_string_builder = StringBuilder::new();
+    for _ in 0..len {
+        struct_string_builder.append_value(&large_string);
+    }
+    let struct_string_array = Arc::new(struct_string_builder.finish());
+
+    let struct_array = StructArray::from(vec![
+        (
+            Arc::new(Field::new("id", DataType::Int32, false)),
+            struct_id_array as Arc<dyn arrow::array::Array>,
+        ),
+        (
+            Arc::new(Field::new(LARGE_STRING_COLUMN_NAME, DataType::Utf8, false)),
+            struct_string_array as Arc<dyn arrow::array::Array>,
+        ),
+    ]);
+
+    Ok(RecordBatch::try_new(
+        Arc::clone(schema),
+        vec![top_id_array, top_string_array, Arc::new(struct_array)],
+    )?)
+}
+
+criterion_group!(benches, parquet_struct_filter_pushdown);
+criterion_main!(benches);
diff --git a/datafusion/datasource-parquet/src/access_plan.rs b/datafusion/datasource-parquet/src/access_plan.rs
index 0c30f3ff85b6d..8e5f55277e00d 100644
--- a/datafusion/datasource-parquet/src/access_plan.rs
+++ b/datafusion/datasource-parquet/src/access_plan.rs
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion_common::{internal_err, Result};
+use crate::sort::reverse_row_selection;
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_datafusion_err};
 use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
-use parquet::file::metadata::RowGroupMetaData;
+use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+use std::collections::VecDeque;
 
 /// A selection of rows and row groups within a ParquetFile to decode.
 ///
@@ -82,6 +84,10 @@ use parquet::file::metadata::RowGroupMetaData;
 /// └───────────────────┘
 ///  Row Group 3
 /// ```
+///
+/// For more background, please also see the [Embedding User-Defined Indexes in Apache Parquet Files blog]
+///
+/// [Embedding User-Defined Indexes in Apache Parquet Files blog]: https://datafusion.apache.org/blog/2025/07/14/user-defined-parquet-indexes
 #[derive(Debug, Clone, PartialEq)]
 pub struct ParquetAccessPlan {
     /// How to access the i-th row group
@@ -269,13 +275,13 @@ impl ParquetAccessPlan {
                 .sum::<usize>();
 
             let row_group_row_count = rg_meta.num_rows();
-            if rows_in_selection as i64 != row_group_row_count {
-                return internal_err!(
-                    "Invalid ParquetAccessPlan Selection. Row group {idx} has {row_group_row_count} rows \
+            assert_eq_or_internal_err!(
+                rows_in_selection as i64,
+                row_group_row_count,
+                "Invalid ParquetAccessPlan Selection. Row group {idx} has {row_group_row_count} rows \
                     but selection only specifies {rows_in_selection} rows. \
                     Selection: {selection:?}"
-                );
-            }
+            );
         }
 
         let total_selection: RowSelection = self
@@ -302,13 +308,10 @@ impl ParquetAccessPlan {
 
     /// Return an iterator over the row group indexes that should be scanned
     pub fn row_group_index_iter(&self) -> impl Iterator<Item = usize> + '_ {
-        self.row_groups.iter().enumerate().filter_map(|(idx, b)| {
-            if b.should_scan() {
-                Some(idx)
-            } else {
-                None
-            }
-        })
+        self.row_groups
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, b)| if b.should_scan() { Some(idx) } else { None })
     }
 
     /// Return a vec of all row group indexes to scan
@@ -336,6 +339,138 @@ impl ParquetAccessPlan {
     pub fn into_inner(self) -> Vec<RowGroupAccess> {
         self.row_groups
     }
+
+    /// Prepare this plan and resolve to the final `PreparedAccessPlan`
+    pub(crate) fn prepare(
+        self,
+        row_group_meta_data: &[RowGroupMetaData],
+    ) -> Result<PreparedAccessPlan> {
+        let row_group_indexes = self.row_group_indexes();
+        let row_selection = self.into_overall_row_selection(row_group_meta_data)?;
+
+        PreparedAccessPlan::new(row_group_indexes, row_selection)
+    }
+}
+
+/// Represents a prepared, fully resolved [`ParquetAccessPlan`]
+///
+/// The [`RowSelection`] represents the result of applying all pruning such as
+/// user provided scans, Row Group statistics, DataPage statistics, and Bloom
+/// Filters.
+///
+/// This plan is what is passed to the parquet reader
+pub(crate) struct PreparedAccessPlan {
+    /// Row group indexes to read
+    pub(crate) row_group_indexes: Vec<usize>,
+    /// Optional row selection for filtering within row groups
+    pub(crate) row_selection: Option<RowSelection>,
+}
+
+impl PreparedAccessPlan {
+    /// Create a new prepared access plan
+    fn new(
+        row_group_indexes: Vec<usize>,
+        row_selection: Option<RowSelection>,
+    ) -> Result<Self> {
+        Ok(Self {
+            row_group_indexes,
+            row_selection,
+        })
+    }
+
+    /// Reverse the access plan for reverse scanning
+    pub(crate) fn reverse(mut self, file_metadata: &ParquetMetaData) -> Result<Self> {
+        // Get the row group indexes before reversing
+        let row_groups_to_scan = self.row_group_indexes.clone();
+
+        // Reverse the row group indexes
+        self.row_group_indexes = self.row_group_indexes.into_iter().rev().collect();
+
+        // If we have a row selection, reverse it to match the new row group order
+        if let Some(row_selection) = self.row_selection {
+            self.row_selection = Some(reverse_row_selection(
+                &row_selection,
+                file_metadata,
+                &row_groups_to_scan, // Pass the original (non-reversed) row group indexes
+            )?);
+        }
+
+        Ok(self)
+    }
+
+    /// Split this access plan into one plan per selected row group.
+    ///
+    /// The returned plans preserve the current row-group ordering. If
+    /// `row_selection` is present, it is partitioned so each returned plan
+    /// contains only the selection entries for its single row group.
+    pub(crate) fn into_single_row_group_plans(
+        self,
+        file_metadata: &ParquetMetaData,
+    ) -> Result<Vec<Self>> {
+        let Self {
+            row_group_indexes,
+            row_selection,
+        } = self;
+
+        let Some(row_selection) = row_selection else {
+            return Ok(row_group_indexes
+                .into_iter()
+                .map(|row_group_index| Self {
+                    row_group_indexes: vec![row_group_index],
+                    row_selection: None,
+                })
+                .collect());
+        };
+
+        let mut selectors: VecDeque<RowSelector> =
+            Vec::<RowSelector>::from(row_selection).into();
+        let mut plans = Vec::with_capacity(row_group_indexes.len());
+
+        for row_group_index in row_group_indexes {
+            let mut remaining_rows =
+                file_metadata.row_groups()[row_group_index].num_rows() as usize;
+            let mut row_group_selectors = Vec::new();
+
+            while remaining_rows > 0 {
+                let selector = selectors.pop_front().ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "PreparedAccessPlan row selection ended before row group {row_group_index} was fully described"
+                    )
+                })?;
+
+                let rows_for_group = selector.row_count.min(remaining_rows);
+                row_group_selectors.push(if selector.skip {
+                    RowSelector::skip(rows_for_group)
+                } else {
+                    RowSelector::select(rows_for_group)
+                });
+
+                if selector.row_count > rows_for_group {
+                    let remaining_selector_rows = selector.row_count - rows_for_group;
+                    selectors.push_front(if selector.skip {
+                        RowSelector::skip(remaining_selector_rows)
+                    } else {
+                        RowSelector::select(remaining_selector_rows)
+                    });
+                }
+
+                remaining_rows -= rows_for_group;
+            }
+
+            plans.push(Self {
+                row_group_indexes: vec![row_group_index],
+                row_selection: Some(row_group_selectors.into()),
+            });
+        }
+
+        if !selectors.is_empty() {
+            return Err(internal_datafusion_err!(
+                "PreparedAccessPlan row selection had leftover selectors after splitting by row group"
+            ));
+        }
+
+        Ok(plans)
+    }
 }
 
 #[cfg(test)]
@@ -482,7 +617,10 @@ mod test {
             .unwrap_err()
             .to_string();
         assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
-        assert_contains!(err, "Internal error: Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 12 rows");
+        assert_contains!(
+            err,
+            "Row group 1 has 20 rows but selection only specifies 12 rows"
+        );
     }
 
     #[test]
@@ -508,7 +646,10 @@ mod test {
             .unwrap_err()
             .to_string();
         assert_eq!(row_group_indexes, vec![0, 1, 2, 3]);
-        assert_contains!(err, "Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 22 rows");
+        assert_contains!(
+            err,
+            "Invalid ParquetAccessPlan Selection. Row group 1 has 20 rows but selection only specifies 22 rows"
+        );
     }
 
     /// [`RowGroupMetaData`] that returns 4 row groups with 10, 20, 30, 40 rows
diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs
index 851e33644381d..4e2affd98d551 100644
--- a/datafusion/datasource-parquet/src/file_format.rs
+++ b/datafusion/datasource-parquet/src/file_format.rs
@@ -19,31 +19,31 @@
 
 use std::any::Any;
 use std::cell::RefCell;
-use std::fmt;
 use std::fmt::Debug;
 use std::ops::Range;
 use std::rc::Rc;
 use std::sync::Arc;
+use std::{fmt, vec};
 
 use arrow::array::RecordBatch;
 use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit};
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::file_compression_type::FileCompressionType;
 use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
 use datafusion_datasource::write::{
-    get_writer_schema, ObjectWriterBuilder, SharedBuffer,
+    ObjectWriterBuilder, SharedBuffer, get_writer_schema,
 };
 
 use datafusion_datasource::file_format::{FileFormat, FileFormatFactory};
 use datafusion_datasource::write::demux::DemuxedStreamReceiver;
 
-use arrow::compute::sum;
 use arrow::datatypes::{DataType, Field, FieldRef};
 use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions};
+use datafusion_common::encryption::FileDecryptionProperties;
 use datafusion_common::parsers::CompressionTypeVariant;
-use datafusion_common::stats::Precision;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, ColumnStatistics,
-    DataFusionError, GetExt, HashSet, Result, DEFAULT_PARQUET_EXTENSION,
+    DEFAULT_PARQUET_EXTENSION, DataFusionError, GetExt, HashSet, Result,
+    internal_datafusion_err, internal_err, not_impl_err,
 };
 use datafusion_common::{HashMap, Statistics};
 use datafusion_common_runtime::{JoinSet, SpawnedTask};
@@ -54,35 +54,41 @@ use datafusion_datasource::sink::{DataSink, DataSinkExec};
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_expr::dml::InsertOp;
-use datafusion_functions_aggregate::min_max::{MaxAccumulator, MinAccumulator};
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
-use datafusion_physical_plan::Accumulator;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_plan::metrics::{
+    ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
+};
 use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
 use datafusion_session::Session;
 
-use crate::source::{parse_coerce_int96_string, ParquetSource};
+use crate::metadata::{DFParquetMetadata, lex_ordering_to_sorting_columns};
+use crate::reader::CachedParquetFileReaderFactory;
+use crate::source::{ParquetSource, parse_coerce_int96_string};
 use async_trait::async_trait;
 use bytes::Bytes;
 use datafusion_datasource::source::DataSourceExec;
+use datafusion_execution::cache::cache_manager::FileMetadataCache;
+use datafusion_execution::runtime_env::RuntimeEnv;
 use futures::future::BoxFuture;
 use futures::{FutureExt, StreamExt, TryStreamExt};
-use log::debug;
 use object_store::buffered::BufWriter;
 use object_store::path::Path;
-use object_store::{ObjectMeta, ObjectStore};
-use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
 use parquet::arrow::arrow_writer::{
-    compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter,
-    ArrowLeafColumn, ArrowWriterOptions,
+    ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn, ArrowRowGroupWriterFactory,
+    ArrowWriterOptions, compute_leaves,
 };
 use parquet::arrow::async_reader::MetadataFetch;
-use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter};
+use parquet::arrow::{ArrowWriter, AsyncArrowWriter};
 use parquet::basic::Type;
+#[cfg(feature = "parquet_encryption")]
+use parquet::encryption::encrypt::FileEncryptionProperties;
 use parquet::errors::ParquetError;
-use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData};
-use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder};
+use parquet::file::metadata::{ParquetMetaData, SortingColumn};
+use parquet::file::properties::{
+    DEFAULT_MAX_ROW_GROUP_ROW_COUNT, WriterProperties, WriterPropertiesBuilder,
+};
 use parquet::file::writer::SerializedFileWriter;
-use parquet::format::FileMetaData;
 use parquet::schema::types::SchemaDescriptor;
 use tokio::io::{AsyncWrite, AsyncWriteExt};
 use tokio::sync::mpsc::{self, Receiver, Sender};
@@ -299,15 +305,37 @@ fn clear_metadata(
     })
 }
 
-async fn fetch_schema_with_location(
-    store: &dyn ObjectStore,
-    file: &ObjectMeta,
-    metadata_size_hint: Option<usize>,
-    coerce_int96: Option<TimeUnit>,
-) -> Result<(Path, Schema)> {
-    let loc_path = file.location.clone();
-    let schema = fetch_schema(store, file, metadata_size_hint, coerce_int96).await?;
-    Ok((loc_path, schema))
+#[cfg(feature = "parquet_encryption")]
+async fn get_file_decryption_properties(
+    state: &dyn Session,
+    options: &TableParquetOptions,
+    file_path: &Path,
+) -> Result<Option<Arc<FileDecryptionProperties>>> {
+    Ok(match &options.crypto.file_decryption {
+        Some(cfd) => Some(Arc::new(FileDecryptionProperties::from(cfd.clone()))),
+        None => match &options.crypto.factory_id {
+            Some(factory_id) => {
+                let factory =
+                    state.runtime_env().parquet_encryption_factory(factory_id)?;
+                factory
+                    .get_file_decryption_properties(
+                        &options.crypto.factory_options,
+                        file_path,
+                    )
+                    .await?
+            }
+            None => None,
+        },
+    })
+}
+
+#[cfg(not(feature = "parquet_encryption"))]
+async fn get_file_decryption_properties(
+    _state: &dyn Session,
+    _options: &TableParquetOptions,
+    _file_path: &Path,
+) -> Result<Option<Arc<FileDecryptionProperties>>> {
+    Ok(None)
 }
 
 #[async_trait]
@@ -331,6 +359,10 @@ impl FileFormat for ParquetFormat {
         }
     }
 
+    fn compression_type(&self) -> Option<FileCompressionType> {
+        None
+    }
+
     async fn infer_schema(
         &self,
         state: &dyn Session,
@@ -341,17 +373,30 @@ impl FileFormat for ParquetFormat {
             Some(time_unit) => Some(parse_coerce_int96_string(time_unit.as_str())?),
             None => None,
         };
+
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+
         let mut schemas: Vec<_> = futures::stream::iter(objects)
-            .map(|object| {
-                fetch_schema_with_location(
-                    store.as_ref(),
-                    object,
-                    self.metadata_size_hint(),
-                    coerce_int96,
+            .map(|object| async {
+                let file_decryption_properties = get_file_decryption_properties(
+                    state,
+                    &self.options,
+                    &object.location,
                 )
+                .await?;
+                let result = DFParquetMetadata::new(store.as_ref(), object)
+                    .with_metadata_size_hint(self.metadata_size_hint())
+                    .with_decryption_properties(file_decryption_properties)
+                    .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+                    .with_coerce_int96(coerce_int96)
+                    .fetch_schema_with_location()
+                    .await?;
+                Ok::<_, DataFusionError>(result)
             })
             .boxed() // Workaround https://github.com/rust-lang/rust/issues/64552
-            .buffered(state.config_options().execution.meta_fetch_concurrency)
+            // fetch schemas concurrently, if requested
+            .buffer_unordered(state.config_options().execution.meta_fetch_concurrency)
             .try_collect()
             .await?;
 
@@ -361,12 +406,10 @@ impl FileFormat for ParquetFormat {
         // is not deterministic. Thus, to ensure deterministic schema inference
         // sort the files first.
         // https://github.com/apache/datafusion/pull/6629
-        schemas.sort_by(|(location1, _), (location2, _)| location1.cmp(location2));
+        schemas
+            .sort_unstable_by(|(location1, _), (location2, _)| location1.cmp(location2));
 
-        let schemas = schemas
-            .into_iter()
-            .map(|(_, schema)| schema)
-            .collect::<Vec<_>>();
+        let schemas = schemas.into_iter().map(|(_, schema)| schema);
 
         let schema = if self.skip_metadata() {
             Schema::try_merge(clear_metadata(schemas))
@@ -391,24 +434,78 @@ impl FileFormat for ParquetFormat {
 
     async fn infer_stats(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         store: &Arc<dyn ObjectStore>,
         table_schema: SchemaRef,
         object: &ObjectMeta,
     ) -> Result<Statistics> {
-        let stats = fetch_statistics(
-            store.as_ref(),
-            table_schema,
-            object,
-            self.metadata_size_hint(),
+        let file_decryption_properties =
+            get_file_decryption_properties(state, &self.options, &object.location)
+                .await?;
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        DFParquetMetadata::new(store, object)
+            .with_metadata_size_hint(self.metadata_size_hint())
+            .with_decryption_properties(file_decryption_properties)
+            .with_file_metadata_cache(Some(file_metadata_cache))
+            .fetch_statistics(&table_schema)
+            .await
+    }
+
+    async fn infer_ordering(
+        &self,
+        state: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        table_schema: SchemaRef,
+        object: &ObjectMeta,
+    ) -> Result<Option<LexOrdering>> {
+        let file_decryption_properties =
+            get_file_decryption_properties(state, &self.options, &object.location)
+                .await?;
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let metadata = DFParquetMetadata::new(store, object)
+            .with_metadata_size_hint(self.metadata_size_hint())
+            .with_decryption_properties(file_decryption_properties)
+            .with_file_metadata_cache(Some(file_metadata_cache))
+            .fetch_metadata()
+            .await?;
+        crate::metadata::ordering_from_parquet_metadata(&metadata, &table_schema)
+    }
+
+    async fn infer_stats_and_ordering(
+        &self,
+        state: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        table_schema: SchemaRef,
+        object: &ObjectMeta,
+    ) -> Result<datafusion_datasource::file_format::FileMeta> {
+        let file_decryption_properties =
+            get_file_decryption_properties(state, &self.options, &object.location)
+                .await?;
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let metadata = DFParquetMetadata::new(store, object)
+            .with_metadata_size_hint(self.metadata_size_hint())
+            .with_decryption_properties(file_decryption_properties)
+            .with_file_metadata_cache(Some(file_metadata_cache))
+            .fetch_metadata()
+            .await?;
+        let statistics = DFParquetMetadata::statistics_from_parquet_metadata(
+            &metadata,
+            &table_schema,
+        )?;
+        let ordering =
+            crate::metadata::ordering_from_parquet_metadata(&metadata, &table_schema)?;
+        Ok(
+            datafusion_datasource::file_format::FileMeta::new(statistics)
+                .with_ordering(ordering),
         )
-        .await?;
-        Ok(stats)
     }
 
     async fn create_physical_plan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         conf: FileScanConfig,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let mut metadata_size_hint = None;
@@ -417,16 +514,31 @@ impl FileFormat for ParquetFormat {
             metadata_size_hint = Some(metadata);
         }
 
-        let mut source = ParquetSource::new(self.options.clone());
+        let mut source = conf
+            .file_source()
+            .as_any()
+            .downcast_ref::<ParquetSource>()
+            .cloned()
+            .ok_or_else(|| internal_datafusion_err!("Expected ParquetSource"))?;
+        source = source.with_table_parquet_options(self.options.clone());
+
+        // Use the CachedParquetFileReaderFactory
+        let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache();
+        let store = state
+            .runtime_env()
+            .object_store(conf.object_store_url.clone())?;
+        let cached_parquet_read_factory =
+            Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
+        source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);
 
         if let Some(metadata_size_hint) = metadata_size_hint {
             source = source.with_metadata_size_hint(metadata_size_hint)
         }
-        // Apply schema adapter factory before building the new config
-        let file_source = source.apply_schema_adapter(&conf);
+
+        source = self.set_source_encryption_factory(source, state)?;
 
         let conf = FileScanConfigBuilder::from(conf)
-            .with_source(file_source)
+            .with_source(Arc::new(source))
             .build();
         Ok(DataSourceExec::from_data_source(conf))
     }
@@ -442,13 +554,67 @@ impl FileFormat for ParquetFormat {
             return not_impl_err!("Overwrites are not implemented yet for Parquet");
         }
 
-        let sink = Arc::new(ParquetSink::new(conf, self.options.clone()));
+        // Convert ordering requirements to Parquet SortingColumns for file metadata
+        let sorting_columns = if let Some(ref requirements) = order_requirements {
+            let ordering: LexOrdering = requirements.clone().into();
+            // In cases like `COPY (... ORDER BY ...) TO ...` the ORDER BY clause
+            // may not be compatible with Parquet sorting columns (e.g. ordering on `random()`).
+            // So if we cannot create a Parquet sorting column from the ordering requirement,
+            // we skip setting sorting columns on the Parquet sink.
+            lex_ordering_to_sorting_columns(&ordering).ok()
+        } else {
+            None
+        };
+
+        let sink = Arc::new(
+            ParquetSink::new(conf, self.options.clone())
+                .with_sorting_columns(sorting_columns),
+        );
 
         Ok(Arc::new(DataSinkExec::new(input, sink, order_requirements)) as _)
     }
 
-    fn file_source(&self) -> Arc<dyn FileSource> {
-        Arc::new(ParquetSource::default())
+    fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+        Arc::new(
+            ParquetSource::new(table_schema)
+                .with_table_parquet_options(self.options.clone()),
+        )
+    }
+}
+
+#[cfg(feature = "parquet_encryption")]
+impl ParquetFormat {
+    fn set_source_encryption_factory(
+        &self,
+        source: ParquetSource,
+        state: &dyn Session,
+    ) -> Result<ParquetSource> {
+        if let Some(encryption_factory_id) = &self.options.crypto.factory_id {
+            Ok(source.with_encryption_factory(
+                state
+                    .runtime_env()
+                    .parquet_encryption_factory(encryption_factory_id)?,
+            ))
+        } else {
+            Ok(source)
+        }
+    }
+}
+
+#[cfg(not(feature = "parquet_encryption"))]
+impl ParquetFormat {
+    fn set_source_encryption_factory(
+        &self,
+        source: ParquetSource,
+        _state: &dyn Session,
+    ) -> Result<ParquetSource> {
+        if let Some(encryption_factory_id) = &self.options.crypto.factory_id {
+            Err(DataFusionError::Configuration(format!(
+                "Parquet encryption factory id is set to '{encryption_factory_id}' but the parquet_encryption feature is disabled"
+            )))
+        } else {
+            Ok(source)
+        }
     }
 }
 
@@ -897,13 +1063,13 @@ pub fn transform_binary_to_string(schema: &Schema) -> Schema {
 }
 
 /// [`MetadataFetch`] adapter for reading bytes from an [`ObjectStore`]
-struct ObjectStoreFetch<'a> {
+pub struct ObjectStoreFetch<'a> {
     store: &'a dyn ObjectStore,
     meta: &'a ObjectMeta,
 }
 
 impl<'a> ObjectStoreFetch<'a> {
-    fn new(store: &'a dyn ObjectStore, meta: &'a ObjectMeta) -> Self {
+    pub fn new(store: &'a dyn ObjectStore, meta: &'a ObjectMeta) -> Self {
         Self { store, meta }
     }
 }
@@ -926,218 +1092,60 @@ impl MetadataFetch for ObjectStoreFetch<'_> {
 /// through [`ParquetFileReaderFactory`].
 ///
 /// [`ParquetFileReaderFactory`]: crate::ParquetFileReaderFactory
+#[deprecated(
+    since = "50.0.0",
+    note = "Use `DFParquetMetadata::fetch_metadata` instead"
+)]
 pub async fn fetch_parquet_metadata(
     store: &dyn ObjectStore,
-    meta: &ObjectMeta,
+    object_meta: &ObjectMeta,
     size_hint: Option<usize>,
-) -> Result<ParquetMetaData> {
-    let file_size = meta.size;
-    let fetch = ObjectStoreFetch::new(store, meta);
-
-    ParquetMetaDataReader::new()
-        .with_prefetch_hint(size_hint)
-        .load_and_finish(fetch, file_size)
+    decryption_properties: Option<&FileDecryptionProperties>,
+    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+) -> Result<Arc<ParquetMetaData>> {
+    let decryption_properties = decryption_properties.cloned().map(Arc::new);
+    DFParquetMetadata::new(store, object_meta)
+        .with_metadata_size_hint(size_hint)
+        .with_decryption_properties(decryption_properties)
+        .with_file_metadata_cache(file_metadata_cache)
+        .fetch_metadata()
         .await
-        .map_err(DataFusionError::from)
-}
-
-/// Read and parse the schema of the Parquet file at location `path`
-async fn fetch_schema(
-    store: &dyn ObjectStore,
-    file: &ObjectMeta,
-    metadata_size_hint: Option<usize>,
-    coerce_int96: Option<TimeUnit>,
-) -> Result<Schema> {
-    let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?;
-    let file_metadata = metadata.file_metadata();
-    let schema = parquet_to_arrow_schema(
-        file_metadata.schema_descr(),
-        file_metadata.key_value_metadata(),
-    )?;
-    let schema = coerce_int96
-        .and_then(|time_unit| {
-            coerce_int96_to_resolution(file_metadata.schema_descr(), &schema, &time_unit)
-        })
-        .unwrap_or(schema);
-    Ok(schema)
 }
 
 /// Read and parse the statistics of the Parquet file at location `path`
 ///
 /// See [`statistics_from_parquet_meta_calc`] for more details
+#[deprecated(
+    since = "50.0.0",
+    note = "Use `DFParquetMetadata::fetch_statistics` instead"
+)]
 pub async fn fetch_statistics(
     store: &dyn ObjectStore,
     table_schema: SchemaRef,
     file: &ObjectMeta,
     metadata_size_hint: Option<usize>,
+    decryption_properties: Option<&FileDecryptionProperties>,
+    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
 ) -> Result<Statistics> {
-    let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?;
-    statistics_from_parquet_meta_calc(&metadata, table_schema)
+    let decryption_properties = decryption_properties.cloned().map(Arc::new);
+    DFParquetMetadata::new(store, file)
+        .with_metadata_size_hint(metadata_size_hint)
+        .with_decryption_properties(decryption_properties)
+        .with_file_metadata_cache(file_metadata_cache)
+        .fetch_statistics(&table_schema)
+        .await
 }
 
-/// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using [`StatisticsConverter`]
-///
-/// The statistics are calculated for each column in the table schema
-/// using the row group statistics in the parquet metadata.
-///
-/// # Key behaviors:
-///
-/// 1. Extracts row counts and byte sizes from all row groups
-/// 2. Applies schema type coercions to align file schema with table schema
-/// 3. Collects and aggregates statistics across row groups when available
-///
-/// # When there are no statistics:
-///
-/// If the Parquet file doesn't contain any statistics (has_statistics is false), the function returns a Statistics object with:
-/// - Exact row count
-/// - Exact byte size
-/// - All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
-/// # When only some columns have statistics:
-///
-/// For columns with statistics:
-/// - Min/max values are properly extracted and represented as Precision::Exact
-/// - Null counts are calculated by summing across row groups
-///
-/// For columns without statistics,
-/// - For min/max, there are two situations:
-///     1. The column isn't in arrow schema, then min/max values are set to Precision::Absent
-///     2. The column is in arrow schema, but not in parquet schema due to schema revolution, min/max values are set to Precision::Exact(null)
-/// - Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)
+#[deprecated(
+    since = "50.0.0",
+    note = "Use `DFParquetMetadata::statistics_from_parquet_metadata` instead"
+)]
+#[expect(clippy::needless_pass_by_value)]
 pub fn statistics_from_parquet_meta_calc(
     metadata: &ParquetMetaData,
     table_schema: SchemaRef,
 ) -> Result<Statistics> {
-    let row_groups_metadata = metadata.row_groups();
-
-    let mut statistics = Statistics::new_unknown(&table_schema);
-    let mut has_statistics = false;
-    let mut num_rows = 0_usize;
-    let mut total_byte_size = 0_usize;
-    for row_group_meta in row_groups_metadata {
-        num_rows += row_group_meta.num_rows() as usize;
-        total_byte_size += row_group_meta.total_byte_size() as usize;
-
-        if !has_statistics {
-            has_statistics = row_group_meta
-                .columns()
-                .iter()
-                .any(|column| column.statistics().is_some());
-        }
-    }
-    statistics.num_rows = Precision::Exact(num_rows);
-    statistics.total_byte_size = Precision::Exact(total_byte_size);
-
-    let file_metadata = metadata.file_metadata();
-    let mut file_schema = parquet_to_arrow_schema(
-        file_metadata.schema_descr(),
-        file_metadata.key_value_metadata(),
-    )?;
-
-    if let Some(merged) = apply_file_schema_type_coercions(&table_schema, &file_schema) {
-        file_schema = merged;
-    }
-
-    statistics.column_statistics = if has_statistics {
-        let (mut max_accs, mut min_accs) = create_max_min_accs(&table_schema);
-        let mut null_counts_array =
-            vec![Precision::Exact(0); table_schema.fields().len()];
-
-        table_schema
-            .fields()
-            .iter()
-            .enumerate()
-            .for_each(|(idx, field)| {
-                match StatisticsConverter::try_new(
-                    field.name(),
-                    &file_schema,
-                    file_metadata.schema_descr(),
-                ) {
-                    Ok(stats_converter) => {
-                        summarize_min_max_null_counts(
-                            &mut min_accs,
-                            &mut max_accs,
-                            &mut null_counts_array,
-                            idx,
-                            num_rows,
-                            &stats_converter,
-                            row_groups_metadata,
-                        )
-                        .ok();
-                    }
-                    Err(e) => {
-                        debug!("Failed to create statistics converter: {e}");
-                        null_counts_array[idx] = Precision::Exact(num_rows);
-                    }
-                }
-            });
-
-        get_col_stats(
-            &table_schema,
-            null_counts_array,
-            &mut max_accs,
-            &mut min_accs,
-        )
-    } else {
-        Statistics::unknown_column(&table_schema)
-    };
-
-    Ok(statistics)
-}
-
-fn get_col_stats(
-    schema: &Schema,
-    null_counts: Vec<Precision<usize>>,
-    max_values: &mut [Option<MaxAccumulator>],
-    min_values: &mut [Option<MinAccumulator>],
-) -> Vec<ColumnStatistics> {
-    (0..schema.fields().len())
-        .map(|i| {
-            let max_value = match max_values.get_mut(i).unwrap() {
-                Some(max_value) => max_value.evaluate().ok(),
-                None => None,
-            };
-            let min_value = match min_values.get_mut(i).unwrap() {
-                Some(min_value) => min_value.evaluate().ok(),
-                None => None,
-            };
-            ColumnStatistics {
-                null_count: null_counts[i],
-                max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
-                min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
-                sum_value: Precision::Absent,
-                distinct_count: Precision::Absent,
-            }
-        })
-        .collect()
-}
-
-fn summarize_min_max_null_counts(
-    min_accs: &mut [Option<MinAccumulator>],
-    max_accs: &mut [Option<MaxAccumulator>],
-    null_counts_array: &mut [Precision<usize>],
-    arrow_schema_index: usize,
-    num_rows: usize,
-    stats_converter: &StatisticsConverter,
-    row_groups_metadata: &[RowGroupMetaData],
-) -> Result<()> {
-    let max_values = stats_converter.row_group_maxes(row_groups_metadata)?;
-    let min_values = stats_converter.row_group_mins(row_groups_metadata)?;
-    let null_counts = stats_converter.row_group_null_counts(row_groups_metadata)?;
-
-    if let Some(max_acc) = &mut max_accs[arrow_schema_index] {
-        max_acc.update_batch(&[max_values])?;
-    }
-
-    if let Some(min_acc) = &mut min_accs[arrow_schema_index] {
-        min_acc.update_batch(&[min_values])?;
-    }
-
-    null_counts_array[arrow_schema_index] = Precision::Exact(match sum(&null_counts) {
-        Some(null_count) => null_count as usize,
-        None => num_rows,
-    });
-
-    Ok(())
+    DFParquetMetadata::statistics_from_parquet_metadata(metadata, &table_schema)
 }
 
 /// Implements [`DataSink`] for writing to a parquet file.
@@ -1148,7 +1156,11 @@ pub struct ParquetSink {
     parquet_options: TableParquetOptions,
     /// File metadata from successfully produced parquet files. The Mutex is only used
     /// to allow inserting to HashMap from behind borrowed reference in DataSink::write_all.
-    written: Arc<parking_lot::Mutex<HashMap<Path, FileMetaData>>>,
+    written: Arc<parking_lot::Mutex<HashMap<Path, ParquetMetaData>>>,
+    /// Optional sorting columns to write to Parquet metadata
+    sorting_columns: Option<Vec<SortingColumn>>,
+    /// Metrics for tracking write operations
+    metrics: ExecutionPlanMetricsSet,
 }
 
 impl Debug for ParquetSink {
@@ -1180,26 +1192,34 @@ impl ParquetSink {
             config,
             parquet_options,
             written: Default::default(),
+            sorting_columns: None,
+            metrics: ExecutionPlanMetricsSet::new(),
         }
     }
 
+    /// Set sorting columns for the Parquet file metadata.
+    pub fn with_sorting_columns(
+        mut self,
+        sorting_columns: Option<Vec<SortingColumn>>,
+    ) -> Self {
+        self.sorting_columns = sorting_columns;
+        self
+    }
+
     /// Retrieve the file metadata for the written files, keyed to the path
     /// which may be partitioned (in the case of hive style partitioning).
-    pub fn written(&self) -> HashMap<Path, FileMetaData> {
+    pub fn written(&self) -> HashMap<Path, ParquetMetaData> {
         self.written.lock().clone()
     }
 
     /// Create writer properties based upon configuration settings,
     /// including partitioning and the inclusion of arrow schema metadata.
-    fn create_writer_props(&self) -> Result<WriterProperties> {
-        let schema = if self.parquet_options.global.allow_single_file_parallelism {
-            // If parallelizing writes, we may be also be doing hive style partitioning
-            // into multiple files which impacts the schema per file.
-            // Refer to `get_writer_schema()`
-            &get_writer_schema(&self.config)
-        } else {
-            self.config.output_schema()
-        };
+    async fn create_writer_props(
+        &self,
+        runtime: &Arc<RuntimeEnv>,
+        path: &Path,
+    ) -> Result<WriterProperties> {
+        let schema = self.config.output_schema();
 
         // TODO: avoid this clone in follow up PR, where the writer properties & schema
         // are calculated once on `ParquetSink::new`
@@ -1208,7 +1228,22 @@ impl ParquetSink {
             parquet_opts.arrow_schema(schema);
         }
 
-        Ok(WriterPropertiesBuilder::try_from(&parquet_opts)?.build())
+        let mut builder = WriterPropertiesBuilder::try_from(&parquet_opts)?;
+
+        // Set sorting columns if configured
+        if let Some(ref sorting_columns) = self.sorting_columns {
+            builder = builder.set_sorting_columns(Some(sorting_columns.clone()));
+        }
+
+        builder = set_writer_encryption_properties(
+            builder,
+            runtime,
+            parquet_opts,
+            schema,
+            path,
+        )
+        .await?;
+        Ok(builder.build())
     }
 
     /// Creates an AsyncArrowWriter which serializes a parquet file to an ObjectStore
@@ -1247,6 +1282,50 @@ impl ParquetSink {
     }
 }
 
+#[cfg(feature = "parquet_encryption")]
+async fn set_writer_encryption_properties(
+    builder: WriterPropertiesBuilder,
+    runtime: &Arc<RuntimeEnv>,
+    parquet_opts: TableParquetOptions,
+    schema: &Arc<Schema>,
+    path: &Path,
+) -> Result<WriterPropertiesBuilder> {
+    if let Some(file_encryption_properties) = parquet_opts.crypto.file_encryption {
+        // Encryption properties have been specified directly
+        return Ok(builder.with_file_encryption_properties(Arc::new(
+            FileEncryptionProperties::from(file_encryption_properties),
+        )));
+    } else if let Some(encryption_factory_id) = &parquet_opts.crypto.factory_id.as_ref() {
+        // Encryption properties will be generated by an encryption factory
+        let encryption_factory =
+            runtime.parquet_encryption_factory(encryption_factory_id)?;
+        let file_encryption_properties = encryption_factory
+            .get_file_encryption_properties(
+                &parquet_opts.crypto.factory_options,
+                schema,
+                path,
+            )
+            .await?;
+        if let Some(file_encryption_properties) = file_encryption_properties {
+            return Ok(
+                builder.with_file_encryption_properties(file_encryption_properties)
+            );
+        }
+    }
+    Ok(builder)
+}
+
+#[cfg(not(feature = "parquet_encryption"))]
+async fn set_writer_encryption_properties(
+    builder: WriterPropertiesBuilder,
+    _runtime: &Arc<RuntimeEnv>,
+    _parquet_opts: TableParquetOptions,
+    _schema: &Arc<Schema>,
+    _path: &Path,
+) -> Result<WriterPropertiesBuilder> {
+    Ok(builder)
+}
+
 #[async_trait]
 impl FileSink for ParquetSink {
     fn config(&self) -> &FileSinkConfig {
@@ -1260,15 +1339,24 @@ impl FileSink for ParquetSink {
         mut file_stream_rx: DemuxedStreamReceiver,
         object_store: Arc<dyn ObjectStore>,
     ) -> Result<u64> {
+        let rows_written_counter =
+            MetricBuilder::new(&self.metrics).global_counter("rows_written");
+        // Note: bytes_written is the sum of compressed row group sizes, which
+        // may differ slightly from the actual on-disk file size (excludes footer,
+        // page indexes, and other Parquet metadata overhead).
+        let bytes_written_counter =
+            MetricBuilder::new(&self.metrics).global_counter("bytes_written");
+        let elapsed_compute = MetricBuilder::new(&self.metrics).elapsed_compute(0);
+
+        let write_start = datafusion_common::instant::Instant::now();
+
         let parquet_opts = &self.parquet_options;
-        let allow_single_file_parallelism =
-            parquet_opts.global.allow_single_file_parallelism;
 
         let mut file_write_tasks: JoinSet<
-            std::result::Result<(Path, FileMetaData), DataFusionError>,
+            std::result::Result<(Path, ParquetMetaData), DataFusionError>,
         > = JoinSet::new();
 
-        let parquet_props = self.create_writer_props()?;
+        let runtime = context.runtime_env();
         let parallel_options = ParallelParquetWriterOptions {
             max_parallel_row_groups: parquet_opts
                 .global
@@ -1279,7 +1367,8 @@ impl FileSink for ParquetSink {
         };
 
         while let Some((path, mut rx)) = file_stream_rx.recv().await {
-            if !allow_single_file_parallelism {
+            let parquet_props = self.create_writer_props(&runtime, &path).await?;
+            if !parquet_opts.global.allow_single_file_parallelism {
                 let mut writer = self
                     .create_async_arrow_writer(
                         &path,
@@ -1288,18 +1377,18 @@ impl FileSink for ParquetSink {
                         parquet_props.clone(),
                     )
                     .await?;
-                let mut reservation = MemoryConsumer::new(format!("ParquetSink[{path}]"))
+                let reservation = MemoryConsumer::new(format!("ParquetSink[{path}]"))
                     .register(context.memory_pool());
                 file_write_tasks.spawn(async move {
                     while let Some(batch) = rx.recv().await {
                         writer.write(&batch).await?;
                         reservation.try_resize(writer.memory_size())?;
                     }
-                    let file_metadata = writer
+                    let parquet_meta_data = writer
                         .close()
                         .await
-                        .map_err(DataFusionError::ParquetError)?;
-                    Ok((path, file_metadata))
+                        .map_err(|e| DataFusionError::ParquetError(Box::new(e)))?;
+                    Ok((path, parquet_meta_data))
                 });
             } else {
                 let writer = ObjectWriterBuilder::new(
@@ -1319,32 +1408,40 @@ impl FileSink for ParquetSink {
                 .build()?;
                 let schema = get_writer_schema(&self.config);
                 let props = parquet_props.clone();
+                let skip_arrow_metadata = self.parquet_options.global.skip_arrow_metadata;
                 let parallel_options_clone = parallel_options.clone();
                 let pool = Arc::clone(context.memory_pool());
                 file_write_tasks.spawn(async move {
-                    let file_metadata = output_single_parquet_file_parallelized(
+                    let parquet_meta_data = output_single_parquet_file_parallelized(
                         writer,
                         rx,
                         schema,
                         &props,
+                        skip_arrow_metadata,
                         parallel_options_clone,
                         pool,
                     )
                     .await?;
-                    Ok((path, file_metadata))
+                    Ok((path, parquet_meta_data))
                 });
             }
         }
 
-        let mut row_count = 0;
         while let Some(result) = file_write_tasks.join_next().await {
             match result {
                 Ok(r) => {
-                    let (path, file_metadata) = r?;
-                    row_count += file_metadata.num_rows;
+                    let (path, parquet_meta_data) = r?;
+                    let file_rows = parquet_meta_data.file_metadata().num_rows() as usize;
+                    let file_bytes: usize = parquet_meta_data
+                        .row_groups()
+                        .iter()
+                        .map(|rg| rg.compressed_size() as usize)
+                        .sum();
+                    rows_written_counter.add(file_rows);
+                    bytes_written_counter.add(file_bytes);
                     let mut written_files = self.written.lock();
                     written_files
-                        .try_insert(path.clone(), file_metadata)
+                        .try_insert(path.clone(), parquet_meta_data)
                         .map_err(|e| internal_datafusion_err!("duplicate entry detected for partitioned file {path}: {e}"))?;
                     drop(written_files);
                 }
@@ -1361,9 +1458,11 @@ impl FileSink for ParquetSink {
         demux_task
             .join_unwind()
             .await
-            .map_err(DataFusionError::ExecutionJoin)??;
+            .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+
+        elapsed_compute.add_elapsed(write_start);
 
-        Ok(row_count as u64)
+        Ok(rows_written_counter.value() as u64)
     }
 }
 
@@ -1373,6 +1472,10 @@ impl DataSink for ParquetSink {
         self
     }
 
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
     fn schema(&self) -> &SchemaRef {
         self.config.output_schema()
     }
@@ -1391,7 +1494,7 @@ impl DataSink for ParquetSink {
 async fn column_serializer_task(
     mut rx: Receiver<ArrowLeafColumn>,
     mut writer: ArrowColumnWriter,
-    mut reservation: MemoryReservation,
+    reservation: MemoryReservation,
 ) -> Result<(ArrowColumnWriter, MemoryReservation)> {
     while let Some(col) = rx.recv().await {
         writer.write(&col)?;
@@ -1407,13 +1510,10 @@ type ColSender = Sender<ArrowLeafColumn>;
 /// Returns join handles for each columns serialization task along with a send channel
 /// to send arrow arrays to each serialization task.
 fn spawn_column_parallel_row_group_writer(
-    schema: Arc<Schema>,
-    parquet_props: Arc<WriterProperties>,
+    col_writers: Vec<ArrowColumnWriter>,
     max_buffer_size: usize,
     pool: &Arc<dyn MemoryPool>,
 ) -> Result<(Vec<ColumnWriterTask>, Vec<ColSender>)> {
-    let schema_desc = ArrowSchemaConverter::new().convert(&schema)?;
-    let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?;
     let num_columns = col_writers.len();
 
     let mut col_writer_tasks = Vec::with_capacity(num_columns);
@@ -1479,7 +1579,7 @@ fn spawn_rg_join_and_finalize_task(
     rg_rows: usize,
     pool: &Arc<dyn MemoryPool>,
 ) -> SpawnedTask<RBStreamSerializeResult> {
-    let mut rg_reservation =
+    let rg_reservation =
         MemoryConsumer::new("ParquetSink(SerializedRowGroupWriter)").register(pool);
 
     SpawnedTask::spawn(async move {
@@ -1489,7 +1589,7 @@ fn spawn_rg_join_and_finalize_task(
             let (writer, _col_reservation) = task
                 .join_unwind()
                 .await
-                .map_err(DataFusionError::ExecutionJoin)??;
+                .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
             let encoded_size = writer.get_estimated_total_bytes();
             rg_reservation.grow(encoded_size);
             finalized_rg.push(writer.close()?);
@@ -1508,23 +1608,24 @@ fn spawn_rg_join_and_finalize_task(
 /// across both columns and row_groups, with a theoretical max number of parallel tasks
 /// given by n_columns * num_row_groups.
 fn spawn_parquet_parallel_serialization_task(
+    row_group_writer_factory: ArrowRowGroupWriterFactory,
     mut data: Receiver<RecordBatch>,
     serialize_tx: Sender<SpawnedTask<RBStreamSerializeResult>>,
     schema: Arc<Schema>,
     writer_props: Arc<WriterProperties>,
-    parallel_options: ParallelParquetWriterOptions,
+    parallel_options: Arc<ParallelParquetWriterOptions>,
     pool: Arc<dyn MemoryPool>,
 ) -> SpawnedTask<Result<(), DataFusionError>> {
     SpawnedTask::spawn(async move {
         let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream;
-        let max_row_group_rows = writer_props.max_row_group_size();
+        let max_row_group_rows = writer_props
+            .max_row_group_row_count()
+            .unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT);
+        let mut row_group_index = 0;
+        let col_writers =
+            row_group_writer_factory.create_column_writers(row_group_index)?;
         let (mut column_writer_handles, mut col_array_channels) =
-            spawn_column_parallel_row_group_writer(
-                Arc::clone(&schema),
-                Arc::clone(&writer_props),
-                max_buffer_rb,
-                &pool,
-            )?;
+            spawn_column_parallel_row_group_writer(col_writers, max_buffer_rb, &pool)?;
         let mut current_rg_rows = 0;
 
         while let Some(mut rb) = data.recv().await {
@@ -1570,10 +1671,12 @@ fn spawn_parquet_parallel_serialization_task(
                     current_rg_rows = 0;
                     rb = rb.slice(rows_left, rb.num_rows() - rows_left);
 
+                    row_group_index += 1;
+                    let col_writers = row_group_writer_factory
+                        .create_column_writers(row_group_index)?;
                     (column_writer_handles, col_array_channels) =
                         spawn_column_parallel_row_group_writer(
-                            Arc::clone(&schema),
-                            Arc::clone(&writer_props),
+                            col_writers,
                             max_buffer_rb,
                             &pool,
                         )?;
@@ -1604,29 +1707,21 @@ fn spawn_parquet_parallel_serialization_task(
 /// Consume RowGroups serialized by other parallel tasks and concatenate them in
 /// to the final parquet file, while flushing finalized bytes to an [ObjectStore]
 async fn concatenate_parallel_row_groups(
+    mut parquet_writer: SerializedFileWriter<SharedBuffer>,
+    merged_buff: SharedBuffer,
     mut serialize_rx: Receiver<SpawnedTask<RBStreamSerializeResult>>,
-    schema: Arc<Schema>,
-    writer_props: Arc<WriterProperties>,
     mut object_store_writer: Box<dyn AsyncWrite + Send + Unpin>,
     pool: Arc<dyn MemoryPool>,
-) -> Result<FileMetaData> {
-    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
-
-    let mut file_reservation =
+) -> Result<ParquetMetaData> {
+    let file_reservation =
         MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool);
 
-    let schema_desc = ArrowSchemaConverter::new().convert(schema.as_ref())?;
-    let mut parquet_writer = SerializedFileWriter::new(
-        merged_buff.clone(),
-        schema_desc.root_schema_ptr(),
-        writer_props,
-    )?;
-
     while let Some(task) = serialize_rx.recv().await {
         let result = task.join_unwind().await;
+        let (serialized_columns, rg_reservation, _cnt) =
+            result.map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+
         let mut rg_out = parquet_writer.next_row_group()?;
-        let (serialized_columns, mut rg_reservation, _cnt) =
-            result.map_err(DataFusionError::ExecutionJoin)??;
         for chunk in serialized_columns {
             chunk.append_to_row_group(&mut rg_out)?;
             rg_reservation.free();
@@ -1645,14 +1740,14 @@ async fn concatenate_parallel_row_groups(
         rg_out.close()?;
     }
 
-    let file_metadata = parquet_writer.close()?;
+    let parquet_meta_data = parquet_writer.close()?;
     let final_buff = merged_buff.buffer.try_lock().unwrap();
 
     object_store_writer.write_all(final_buff.as_slice()).await?;
     object_store_writer.shutdown().await?;
     file_reservation.free();
 
-    Ok(file_metadata)
+    Ok(parquet_meta_data)
 }
 
 /// Parallelizes the serialization of a single parquet file, by first serializing N
@@ -1664,27 +1759,40 @@ async fn output_single_parquet_file_parallelized(
     data: Receiver<RecordBatch>,
     output_schema: Arc<Schema>,
     parquet_props: &WriterProperties,
+    skip_arrow_metadata: bool,
     parallel_options: ParallelParquetWriterOptions,
     pool: Arc<dyn MemoryPool>,
-) -> Result<FileMetaData> {
+) -> Result<ParquetMetaData> {
     let max_rowgroups = parallel_options.max_parallel_row_groups;
     // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel
     let (serialize_tx, serialize_rx) =
         mpsc::channel::<SpawnedTask<RBStreamSerializeResult>>(max_rowgroups);
 
     let arc_props = Arc::new(parquet_props.clone());
+    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
+    let options = ArrowWriterOptions::new()
+        .with_properties(parquet_props.clone())
+        .with_skip_arrow_metadata(skip_arrow_metadata);
+    let writer = ArrowWriter::try_new_with_options(
+        merged_buff.clone(),
+        Arc::clone(&output_schema),
+        options,
+    )?;
+    let (writer, row_group_writer_factory) = writer.into_serialized_writer()?;
+
     let launch_serialization_task = spawn_parquet_parallel_serialization_task(
+        row_group_writer_factory,
         data,
         serialize_tx,
         Arc::clone(&output_schema),
         Arc::clone(&arc_props),
-        parallel_options,
+        parallel_options.into(),
         Arc::clone(&pool),
     );
-    let file_metadata = concatenate_parallel_row_groups(
+    let parquet_meta_data = concatenate_parallel_row_groups(
+        writer,
+        merged_buff,
         serialize_rx,
-        Arc::clone(&output_schema),
-        Arc::clone(&arc_props),
         object_store_writer,
         pool,
     )
@@ -1693,44 +1801,13 @@ async fn output_single_parquet_file_parallelized(
     launch_serialization_task
         .join_unwind()
         .await
-        .map_err(DataFusionError::ExecutionJoin)??;
-    Ok(file_metadata)
-}
-
-/// Min/max aggregation can take Dictionary encode input but always produces unpacked
-/// (aka non Dictionary) output. We need to adjust the output data type to reflect this.
-/// The reason min/max aggregate produces unpacked output because there is only one
-/// min/max value per group; there is no needs to keep them Dictionary encode
-fn min_max_aggregate_data_type(input_type: &DataType) -> &DataType {
-    if let DataType::Dictionary(_, value_type) = input_type {
-        value_type.as_ref()
-    } else {
-        input_type
-    }
-}
-
-fn create_max_min_accs(
-    schema: &Schema,
-) -> (Vec<Option<MaxAccumulator>>, Vec<Option<MinAccumulator>>) {
-    let max_values: Vec<Option<MaxAccumulator>> = schema
-        .fields()
-        .iter()
-        .map(|field| {
-            MaxAccumulator::try_new(min_max_aggregate_data_type(field.data_type())).ok()
-        })
-        .collect();
-    let min_values: Vec<Option<MinAccumulator>> = schema
-        .fields()
-        .iter()
-        .map(|field| {
-            MinAccumulator::try_new(min_max_aggregate_data_type(field.data_type())).ok()
-        })
-        .collect();
-    (max_values, min_values)
+        .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+    Ok(parquet_meta_data)
 }
 
 #[cfg(test)]
 mod tests {
+    use parquet::arrow::parquet_to_arrow_schema;
     use std::sync::Arc;
 
     use super::*;
diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs
new file mode 100644
index 0000000000000..e5781ad68ddf4
--- /dev/null
+++ b/datafusion/datasource-parquet/src/metadata.rs
@@ -0,0 +1,1234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`DFParquetMetadata`] for fetching Parquet file metadata, statistics
+//! and schema information.
+
+use crate::{
+    ObjectStoreFetch, apply_file_schema_type_coercions, coerce_int96_to_resolution,
+};
+use arrow::array::{Array, ArrayRef, BooleanArray};
+use arrow::compute::and;
+use arrow::compute::kernels::cmp::eq;
+use arrow::compute::sum;
+use arrow::datatypes::{DataType, Schema, SchemaRef, TimeUnit};
+use datafusion_common::encryption::FileDecryptionProperties;
+use datafusion_common::stats::Precision;
+use datafusion_common::{
+    ColumnStatistics, DataFusionError, Result, ScalarValue, Statistics,
+};
+use datafusion_execution::cache::cache_manager::{
+    CachedFileMetadataEntry, FileMetadata, FileMetadataCache,
+};
+use datafusion_functions_aggregate_common::min_max::{MaxAccumulator, MinAccumulator};
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_plan::Accumulator;
+use log::debug;
+use object_store::path::Path;
+use object_store::{ObjectMeta, ObjectStore};
+use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+use parquet::arrow::{parquet_column, parquet_to_arrow_schema};
+use parquet::file::metadata::{
+    PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData,
+    SortingColumn,
+};
+use parquet::schema::types::SchemaDescriptor;
+use std::any::Any;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Minimum fraction of row groups that must report NDV statistics for the
+/// merged result to be `Inexact` rather than `Absent`, as the estimate
+/// would be too unreliable otherwise.
+const PARTIAL_NDV_THRESHOLD: f64 = 0.75;
+
+/// Handles fetching Parquet file schema, metadata and statistics
+/// from object store.
+///
+/// This component is exposed for low level integrations through
+/// [`ParquetFileReaderFactory`].
+///
+/// [`ParquetFileReaderFactory`]: crate::ParquetFileReaderFactory
+#[derive(Debug)]
+pub struct DFParquetMetadata<'a> {
+    store: &'a dyn ObjectStore,
+    object_meta: &'a ObjectMeta,
+    metadata_size_hint: Option<usize>,
+    decryption_properties: Option<Arc<FileDecryptionProperties>>,
+    file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    /// timeunit to coerce INT96 timestamps to
+    pub coerce_int96: Option<TimeUnit>,
+}
+
+impl<'a> DFParquetMetadata<'a> {
+    pub fn new(store: &'a dyn ObjectStore, object_meta: &'a ObjectMeta) -> Self {
+        Self {
+            store,
+            object_meta,
+            metadata_size_hint: None,
+            decryption_properties: None,
+            file_metadata_cache: None,
+            coerce_int96: None,
+        }
+    }
+
+    /// set metadata size hint
+    pub fn with_metadata_size_hint(mut self, metadata_size_hint: Option<usize>) -> Self {
+        self.metadata_size_hint = metadata_size_hint;
+        self
+    }
+
+    /// set decryption properties
+    pub fn with_decryption_properties(
+        mut self,
+        decryption_properties: Option<Arc<FileDecryptionProperties>>,
+    ) -> Self {
+        self.decryption_properties = decryption_properties;
+        self
+    }
+
+    /// set file metadata cache
+    pub fn with_file_metadata_cache(
+        mut self,
+        file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    ) -> Self {
+        self.file_metadata_cache = file_metadata_cache;
+        self
+    }
+
+    /// Set timeunit to coerce INT96 timestamps to
+    pub fn with_coerce_int96(mut self, time_unit: Option<TimeUnit>) -> Self {
+        self.coerce_int96 = time_unit;
+        self
+    }
+
+    /// Fetch parquet metadata from the remote object store
+    pub async fn fetch_metadata(&self) -> Result<Arc<ParquetMetaData>> {
+        let Self {
+            store,
+            object_meta,
+            metadata_size_hint,
+            decryption_properties,
+            file_metadata_cache,
+            coerce_int96: _,
+        } = self;
+
+        let fetch = ObjectStoreFetch::new(*store, object_meta);
+
+        // implementation to fetch parquet metadata
+        let cache_metadata =
+            !cfg!(feature = "parquet_encryption") || decryption_properties.is_none();
+
+        if cache_metadata
+            && let Some(file_metadata_cache) = file_metadata_cache.as_ref()
+            && let Some(cached) = file_metadata_cache.get(&object_meta.location)
+            && cached.is_valid_for(object_meta)
+            && let Some(cached_parquet) = cached
+                .file_metadata
+                .as_any()
+                .downcast_ref::<CachedParquetMetaData>()
+        {
+            return Ok(Arc::clone(cached_parquet.parquet_metadata()));
+        }
+
+        let mut reader =
+            ParquetMetaDataReader::new().with_prefetch_hint(*metadata_size_hint);
+
+        #[cfg(feature = "parquet_encryption")]
+        if let Some(decryption_properties) = decryption_properties {
+            reader = reader
+                .with_decryption_properties(Some(Arc::clone(decryption_properties)));
+        }
+
+        if cache_metadata && file_metadata_cache.is_some() {
+            // Need to retrieve the entire metadata for the caching to be effective.
+            reader = reader.with_page_index_policy(PageIndexPolicy::Optional);
+        }
+
+        let metadata = Arc::new(
+            reader
+                .load_and_finish(fetch, object_meta.size)
+                .await
+                .map_err(DataFusionError::from)?,
+        );
+
+        if cache_metadata && let Some(file_metadata_cache) = file_metadata_cache {
+            file_metadata_cache.put(
+                &object_meta.location,
+                CachedFileMetadataEntry::new(
+                    (*object_meta).clone(),
+                    Arc::new(CachedParquetMetaData::new(Arc::clone(&metadata))),
+                ),
+            );
+        }
+
+        Ok(metadata)
+    }
+
+    /// Read and parse the schema of the Parquet file
+    pub async fn fetch_schema(&self) -> Result<Schema> {
+        let metadata = self.fetch_metadata().await?;
+
+        let file_metadata = metadata.file_metadata();
+        let schema = parquet_to_arrow_schema(
+            file_metadata.schema_descr(),
+            file_metadata.key_value_metadata(),
+        )?;
+        let schema = self
+            .coerce_int96
+            .as_ref()
+            .and_then(|time_unit| {
+                coerce_int96_to_resolution(
+                    file_metadata.schema_descr(),
+                    &schema,
+                    time_unit,
+                )
+            })
+            .unwrap_or(schema);
+        Ok(schema)
+    }
+
+    /// Return (path, schema) tuple by fetching the schema from Parquet file
+    pub(crate) async fn fetch_schema_with_location(&self) -> Result<(Path, Schema)> {
+        let loc_path = self.object_meta.location.clone();
+        let schema = self.fetch_schema().await?;
+        Ok((loc_path, schema))
+    }
+
+    /// Fetch the metadata from the Parquet file via [`Self::fetch_metadata`] and convert
+    /// the statistics in the metadata using [`Self::statistics_from_parquet_metadata`]
+    pub async fn fetch_statistics(&self, table_schema: &SchemaRef) -> Result<Statistics> {
+        let metadata = self.fetch_metadata().await?;
+        Self::statistics_from_parquet_metadata(&metadata, table_schema)
+    }
+
+    /// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using [`StatisticsConverter`]
+    ///
+    /// The statistics are calculated for each column in the table schema
+    /// using the row group statistics in the parquet metadata.
+    ///
+    /// # Key behaviors:
+    ///
+    /// 1. Extracts row counts and byte sizes from all row groups
+    /// 2. Applies schema type coercions to align file schema with table schema
+    /// 3. Collects and aggregates statistics across row groups when available
+    ///
+    /// # When there are no statistics:
+    ///
+    /// If the Parquet file doesn't contain any statistics (has_statistics is false), the function returns a Statistics object with:
+    /// - Exact row count
+    /// - Exact byte size
+    /// - All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
+    /// - Column byte sizes are still calculated and recorded
+    ///
+    /// # When only some columns have statistics:
+    ///
+    /// For columns with statistics:
+    /// - Min/max values are properly extracted and represented as Precision::Exact
+    /// - Null counts are calculated by summing across row groups
+    /// - Byte sizes are calculated and recorded
+    ///
+    /// For columns without statistics,
+    /// - For min/max, there are two situations:
+    ///     1. The column isn't in arrow schema, then min/max values are set to Precision::Absent
+    ///     2. The column is in arrow schema, but not in parquet schema due to schema revolution, min/max values are set to Precision::Exact(null)
+    /// - Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)
+    ///
+    /// # Byte Size Calculation:
+    ///
+    /// - For primitive types with known fixed size, exact byte size is calculated as (byte width * number of rows)
+    /// - For other types, uncompressed Parquet size is used as an estimate for in-memory size
+    /// - If neither method is applicable, byte size is marked as Precision::Absent
+    pub fn statistics_from_parquet_metadata(
+        metadata: &ParquetMetaData,
+        logical_file_schema: &SchemaRef,
+    ) -> Result<Statistics> {
+        let row_groups_metadata = metadata.row_groups();
+
+        // Use Statistics::default() as opposed to Statistics::new_unknown()
+        // because we are going to replace the column statistics below
+        // and we don't want to initialize them twice.
+        let mut statistics = Statistics::default();
+        let mut has_statistics = false;
+        let mut num_rows = 0_usize;
+        for row_group_meta in row_groups_metadata {
+            num_rows += row_group_meta.num_rows() as usize;
+
+            if !has_statistics {
+                has_statistics = row_group_meta
+                    .columns()
+                    .iter()
+                    .any(|column| column.statistics().is_some());
+            }
+        }
+        statistics.num_rows = Precision::Exact(num_rows);
+
+        let file_metadata = metadata.file_metadata();
+        let mut physical_file_schema = parquet_to_arrow_schema(
+            file_metadata.schema_descr(),
+            file_metadata.key_value_metadata(),
+        )?;
+
+        if let Some(merged) =
+            apply_file_schema_type_coercions(logical_file_schema, &physical_file_schema)
+        {
+            physical_file_schema = merged;
+        }
+
+        statistics.column_statistics =
+            if has_statistics {
+                let (mut max_accs, mut min_accs) =
+                    create_max_min_accs(logical_file_schema);
+                let mut null_counts_array =
+                    vec![Precision::Absent; logical_file_schema.fields().len()];
+                let mut column_byte_sizes =
+                    vec![Precision::Absent; logical_file_schema.fields().len()];
+                let mut is_max_value_exact =
+                    vec![Some(true); logical_file_schema.fields().len()];
+                let mut is_min_value_exact =
+                    vec![Some(true); logical_file_schema.fields().len()];
+                let mut distinct_counts_array =
+                    vec![Precision::Absent; logical_file_schema.fields().len()];
+                logical_file_schema.fields().iter().enumerate().for_each(
+                    |(idx, field)| match StatisticsConverter::try_new(
+                        field.name(),
+                        &physical_file_schema,
+                        file_metadata.schema_descr(),
+                    ) {
+                        Ok(stats_converter) => {
+                            let mut accumulators = StatisticsAccumulators {
+                                min_accs: &mut min_accs,
+                                max_accs: &mut max_accs,
+                                null_counts_array: &mut null_counts_array,
+                                is_min_value_exact: &mut is_min_value_exact,
+                                is_max_value_exact: &mut is_max_value_exact,
+                                column_byte_sizes: &mut column_byte_sizes,
+                                distinct_counts_array: &mut distinct_counts_array,
+                            };
+                            summarize_column_statistics(
+                                file_metadata.schema_descr(),
+                                logical_file_schema,
+                                &physical_file_schema,
+                                &mut accumulators,
+                                idx,
+                                &stats_converter,
+                                row_groups_metadata,
+                            )
+                            .ok();
+                        }
+                        Err(e) => {
+                            debug!("Failed to create statistics converter: {e}");
+                            null_counts_array[idx] = Precision::Exact(num_rows);
+                        }
+                    },
+                );
+
+                let mut accumulators = StatisticsAccumulators {
+                    min_accs: &mut min_accs,
+                    max_accs: &mut max_accs,
+                    null_counts_array: &mut null_counts_array,
+                    is_min_value_exact: &mut is_min_value_exact,
+                    is_max_value_exact: &mut is_max_value_exact,
+                    column_byte_sizes: &mut column_byte_sizes,
+                    distinct_counts_array: &mut distinct_counts_array,
+                };
+                accumulators.build_column_statistics(logical_file_schema)
+            } else {
+                // Record column sizes
+                logical_file_schema
+                    .fields()
+                    .iter()
+                    .enumerate()
+                    .map(|(logical_file_schema_index, field)| {
+                        let arrow_field =
+                            logical_file_schema.field(logical_file_schema_index);
+                        let parquet_idx = parquet_column(
+                            file_metadata.schema_descr(),
+                            &physical_file_schema,
+                            arrow_field.name(),
+                        )
+                        .map(|(idx, _)| idx);
+                        let byte_size = compute_arrow_column_size(
+                            field.data_type(),
+                            row_groups_metadata,
+                            parquet_idx,
+                            num_rows,
+                        );
+                        ColumnStatistics::new_unknown().with_byte_size(byte_size)
+                    })
+                    .collect()
+            };
+
+        #[cfg(debug_assertions)]
+        {
+            // Check that the column statistics length matches the table schema fields length
+            assert_eq!(
+                statistics.column_statistics.len(),
+                logical_file_schema.fields().len(),
+                "Column statistics length does not match table schema fields length"
+            );
+        }
+
+        Ok(statistics)
+    }
+}
+
+/// Min/max aggregation can take Dictionary encode input but always produces unpacked
+/// (aka non Dictionary) output. We need to adjust the output data type to reflect this.
+/// The reason min/max aggregate produces unpacked output because there is only one
+/// min/max value per group; there is no needs to keep them Dictionary encoded
+fn min_max_aggregate_data_type(input_type: &DataType) -> &DataType {
+    if let DataType::Dictionary(_, value_type) = input_type {
+        value_type.as_ref()
+    } else {
+        input_type
+    }
+}
+
+fn create_max_min_accs(
+    schema: &Schema,
+) -> (Vec<Option<MaxAccumulator>>, Vec<Option<MinAccumulator>>) {
+    let max_values: Vec<Option<MaxAccumulator>> = schema
+        .fields()
+        .iter()
+        .map(|field| {
+            MaxAccumulator::try_new(min_max_aggregate_data_type(field.data_type())).ok()
+        })
+        .collect();
+    let min_values: Vec<Option<MinAccumulator>> = schema
+        .fields()
+        .iter()
+        .map(|field| {
+            MinAccumulator::try_new(min_max_aggregate_data_type(field.data_type())).ok()
+        })
+        .collect();
+    (max_values, min_values)
+}
+
+/// Holds the accumulator state for collecting statistics from row groups
+struct StatisticsAccumulators<'a> {
+    min_accs: &'a mut [Option<MinAccumulator>],
+    max_accs: &'a mut [Option<MaxAccumulator>],
+    null_counts_array: &'a mut [Precision<usize>],
+    is_min_value_exact: &'a mut [Option<bool>],
+    is_max_value_exact: &'a mut [Option<bool>],
+    column_byte_sizes: &'a mut [Precision<usize>],
+    distinct_counts_array: &'a mut [Precision<usize>],
+}
+
+impl StatisticsAccumulators<'_> {
+    /// Converts the accumulated statistics into a vector of `ColumnStatistics`
+    fn build_column_statistics(&mut self, schema: &Schema) -> Vec<ColumnStatistics> {
+        (0..schema.fields().len())
+            .map(|i| {
+                let max_value = match (
+                    self.max_accs.get_mut(i).unwrap(),
+                    self.is_max_value_exact.get(i).unwrap(),
+                ) {
+                    (Some(max_value), Some(true)) => {
+                        max_value.evaluate().ok().map(Precision::Exact)
+                    }
+                    (Some(max_value), Some(false)) | (Some(max_value), None) => {
+                        max_value.evaluate().ok().map(Precision::Inexact)
+                    }
+                    (None, _) => None,
+                };
+                let min_value = match (
+                    self.min_accs.get_mut(i).unwrap(),
+                    self.is_min_value_exact.get(i).unwrap(),
+                ) {
+                    (Some(min_value), Some(true)) => {
+                        min_value.evaluate().ok().map(Precision::Exact)
+                    }
+                    (Some(min_value), Some(false)) | (Some(min_value), None) => {
+                        min_value.evaluate().ok().map(Precision::Inexact)
+                    }
+                    (None, _) => None,
+                };
+                ColumnStatistics {
+                    null_count: self.null_counts_array[i],
+                    max_value: max_value.unwrap_or(Precision::Absent),
+                    min_value: min_value.unwrap_or(Precision::Absent),
+                    sum_value: Precision::Absent,
+                    distinct_count: self.distinct_counts_array[i],
+                    byte_size: self.column_byte_sizes[i],
+                }
+            })
+            .collect()
+    }
+}
+
+fn summarize_column_statistics(
+    parquet_schema: &SchemaDescriptor,
+    logical_file_schema: &Schema,
+    physical_file_schema: &Schema,
+    accumulators: &mut StatisticsAccumulators,
+    logical_schema_index: usize,
+    stats_converter: &StatisticsConverter,
+    row_groups_metadata: &[RowGroupMetaData],
+) -> Result<()> {
+    let max_values = stats_converter.row_group_maxes(row_groups_metadata)?;
+    let min_values = stats_converter.row_group_mins(row_groups_metadata)?;
+    let null_counts = stats_converter.row_group_null_counts(row_groups_metadata)?;
+    let is_max_value_exact_stat =
+        stats_converter.row_group_is_max_value_exact(row_groups_metadata)?;
+    let is_min_value_exact_stat =
+        stats_converter.row_group_is_min_value_exact(row_groups_metadata)?;
+
+    if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
+        max_acc.update_batch(&[Arc::clone(&max_values)])?;
+
+        // handle the common special case when all row groups have exact statistics
+        let exactness = &is_max_value_exact_stat;
+        if !exactness.is_empty()
+            && exactness.null_count() == 0
+            && exactness.true_count() == exactness.len()
+        {
+            accumulators.is_max_value_exact[logical_schema_index] = Some(true);
+        } else if exactness.true_count() == 0 {
+            accumulators.is_max_value_exact[logical_schema_index] = Some(false);
+        } else {
+            let val = max_acc.evaluate()?;
+            accumulators.is_max_value_exact[logical_schema_index] =
+                has_any_exact_match(&val, &max_values, exactness);
+        }
+    }
+
+    if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
+        min_acc.update_batch(&[Arc::clone(&min_values)])?;
+
+        // handle the common special case when all row groups have exact statistics
+        let exactness = &is_min_value_exact_stat;
+        if !exactness.is_empty()
+            && exactness.null_count() == 0
+            && exactness.true_count() == exactness.len()
+        {
+            accumulators.is_min_value_exact[logical_schema_index] = Some(true);
+        } else if exactness.true_count() == 0 {
+            accumulators.is_min_value_exact[logical_schema_index] = Some(false);
+        } else {
+            let val = min_acc.evaluate()?;
+            accumulators.is_min_value_exact[logical_schema_index] =
+                has_any_exact_match(&val, &min_values, exactness);
+        }
+    }
+
+    accumulators.null_counts_array[logical_schema_index] = match sum(&null_counts) {
+        Some(null_count) => Precision::Exact(null_count as usize),
+        None => match null_counts.len() {
+            // If sum() returned None we either have no rows or all values are null
+            0 => Precision::Exact(0),
+            _ => Precision::Absent,
+        },
+    };
+
+    // This is the same logic as parquet_column but we start from arrow schema index
+    // instead of looking up by name.
+    let parquet_index = parquet_column(
+        parquet_schema,
+        physical_file_schema,
+        logical_file_schema.field(logical_schema_index).name(),
+    )
+    .map(|(idx, _)| idx);
+
+    // Extract distinct counts from row group column statistics
+    accumulators.distinct_counts_array[logical_schema_index] =
+        if let Some(parquet_idx) = parquet_index {
+            let num_row_groups = row_groups_metadata.len();
+            let distinct_counts: Vec<u64> = row_groups_metadata
+                .iter()
+                .filter_map(|rg| {
+                    rg.columns()
+                        .get(parquet_idx)
+                        .and_then(|col| col.statistics())
+                        .and_then(|stats| stats.distinct_count_opt())
+                })
+                .collect();
+
+            let coverage = distinct_counts.len() as f64 / num_row_groups.max(1) as f64;
+
+            if coverage < PARTIAL_NDV_THRESHOLD {
+                Precision::Absent
+            } else if distinct_counts.len() == 1 && num_row_groups == 1 {
+                // Single row group with distinct count - use exact value
+                Precision::Exact(distinct_counts[0] as usize)
+            } else {
+                // Multiple row groups - use max as a lower bound estimate
+                // (can't accurately merge NDV since duplicates may exist across row groups)
+                match distinct_counts.iter().max() {
+                    Some(&max_ndv) => Precision::Inexact(max_ndv as usize),
+                    None => Precision::Absent,
+                }
+            }
+        } else {
+            Precision::Absent
+        };
+
+    let arrow_field = logical_file_schema.field(logical_schema_index);
+    accumulators.column_byte_sizes[logical_schema_index] = compute_arrow_column_size(
+        arrow_field.data_type(),
+        row_groups_metadata,
+        parquet_index,
+        row_groups_metadata
+            .iter()
+            .map(|rg| rg.num_rows() as usize)
+            .sum(),
+    );
+
+    Ok(())
+}
+
+/// Compute the Arrow in-memory size for a single column
+fn compute_arrow_column_size(
+    data_type: &DataType,
+    row_groups_metadata: &[RowGroupMetaData],
+    parquet_idx: Option<usize>,
+    num_rows: usize,
+) -> Precision<usize> {
+    // For primitive types with known fixed size, compute exact size
+    if let Some(byte_width) = data_type.primitive_width() {
+        return Precision::Exact(byte_width * num_rows);
+    }
+
+    // Use the uncompressed Parquet size as an estimate for other types
+    if let Some(parquet_idx) = parquet_idx {
+        let uncompressed_bytes: i64 = row_groups_metadata
+            .iter()
+            .filter_map(|rg| rg.columns().get(parquet_idx))
+            .map(|col| col.uncompressed_size())
+            .sum();
+        return Precision::Inexact(uncompressed_bytes as usize);
+    }
+
+    // Otherwise, we cannot determine the size
+    Precision::Absent
+}
+
+/// Checks if any occurrence of `value` in `array` corresponds to a `true`
+/// entry in the `exactness` array.
+///
+/// This is used to determine if a calculated statistic (e.g., min or max)
+/// is exact, by checking if at least one of its source values was exact.
+///
+/// # Example
+/// - `value`: `0`
+/// - `array`: `[0, 1, 0, 3, 0, 5]`
+/// - `exactness`: `[true, false, false, false, false, false]`
+///
+/// The value `0` appears at indices `[0, 2, 4]`. The corresponding exactness
+/// values are `[true, false, false]`. Since at least one is `true`, the
+/// function returns `Some(true)`.
+fn has_any_exact_match(
+    value: &ScalarValue,
+    array: &ArrayRef,
+    exactness: &BooleanArray,
+) -> Option<bool> {
+    if value.is_null() {
+        return Some(false);
+    }
+
+    // Shortcut for single row group
+    if array.len() == 1 {
+        return Some(exactness.is_valid(0) && exactness.value(0));
+    }
+
+    let scalar_array = value.to_scalar().ok()?;
+    let eq_mask = eq(&scalar_array, &array).ok()?;
+    let combined_mask = and(&eq_mask, exactness).ok()?;
+    Some(combined_mask.true_count() > 0)
+}
+
+/// Wrapper to implement [`FileMetadata`] for [`ParquetMetaData`].
+pub struct CachedParquetMetaData(Arc<ParquetMetaData>);
+
+impl CachedParquetMetaData {
+    pub fn new(metadata: Arc<ParquetMetaData>) -> Self {
+        Self(metadata)
+    }
+
+    pub fn parquet_metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.0
+    }
+}
+
+impl FileMetadata for CachedParquetMetaData {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn memory_size(&self) -> usize {
+        self.0.memory_size()
+    }
+
+    fn extra_info(&self) -> HashMap<String, String> {
+        let page_index =
+            self.0.column_index().is_some() && self.0.offset_index().is_some();
+        HashMap::from([("page_index".to_owned(), page_index.to_string())])
+    }
+}
+
+/// Convert a [`PhysicalSortExpr`] to a Parquet [`SortingColumn`].
+///
+/// Returns `Err` if the expression is not a simple column reference.
+pub(crate) fn sort_expr_to_sorting_column(
+    sort_expr: &PhysicalSortExpr,
+) -> Result<SortingColumn> {
+    let column = sort_expr
+        .expr
+        .as_any()
+        .downcast_ref::<Column>()
+        .ok_or_else(|| {
+            DataFusionError::Plan(format!(
+                "Parquet sorting_columns only supports simple column references, \
+                 but got expression: {}",
+                sort_expr.expr
+            ))
+        })?;
+
+    let column_idx: i32 = column.index().try_into().map_err(|_| {
+        DataFusionError::Plan(format!(
+            "Column index {} is too large to be represented as i32",
+            column.index()
+        ))
+    })?;
+
+    Ok(SortingColumn {
+        column_idx,
+        descending: sort_expr.options.descending,
+        nulls_first: sort_expr.options.nulls_first,
+    })
+}
+
+/// Convert a [`LexOrdering`] to `Vec<SortingColumn>` for Parquet.
+///
+/// Returns `Err` if any expression is not a simple column reference.
+pub(crate) fn lex_ordering_to_sorting_columns(
+    ordering: &LexOrdering,
+) -> Result<Vec<SortingColumn>> {
+    ordering.iter().map(sort_expr_to_sorting_column).collect()
+}
+
+/// Extracts ordering information from Parquet metadata.
+///
+/// This function reads the sorting_columns from the first row group's metadata
+/// and converts them into a [`LexOrdering`] that can be used by the query engine.
+///
+/// # Arguments
+/// * `metadata` - The Parquet metadata containing sorting_columns information
+/// * `schema` - The Arrow schema to use for column lookup
+///
+/// # Returns
+/// * `Ok(Some(ordering))` if valid ordering information was found
+/// * `Ok(None)` if no sorting columns were specified or they couldn't be resolved
+pub fn ordering_from_parquet_metadata(
+    metadata: &ParquetMetaData,
+    schema: &SchemaRef,
+) -> Result<Option<LexOrdering>> {
+    // Get the sorting columns from the first row group metadata.
+    // If no row groups exist or no sorting columns are specified, return None.
+    let sorting_columns = metadata
+        .row_groups()
+        .first()
+        .and_then(|rg| rg.sorting_columns())
+        .filter(|cols| !cols.is_empty());
+
+    let Some(sorting_columns) = sorting_columns else {
+        return Ok(None);
+    };
+
+    let parquet_schema = metadata.file_metadata().schema_descr();
+
+    let sort_exprs =
+        sorting_columns_to_physical_exprs(sorting_columns, parquet_schema, schema);
+
+    if sort_exprs.is_empty() {
+        return Ok(None);
+    }
+
+    Ok(LexOrdering::new(sort_exprs))
+}
+
+/// Converts Parquet sorting columns to physical sort expressions.
+fn sorting_columns_to_physical_exprs(
+    sorting_columns: &[SortingColumn],
+    parquet_schema: &SchemaDescriptor,
+    arrow_schema: &SchemaRef,
+) -> Vec<PhysicalSortExpr> {
+    use arrow::compute::SortOptions;
+
+    sorting_columns
+        .iter()
+        .filter_map(|sc| {
+            let parquet_column = parquet_schema.column(sc.column_idx as usize);
+            let name = parquet_column.name();
+
+            // Find the column in the arrow schema
+            let (index, _) = arrow_schema.column_with_name(name)?;
+
+            let expr = Arc::new(Column::new(name, index));
+            let options = SortOptions {
+                descending: sc.descending,
+                nulls_first: sc.nulls_first,
+            };
+            Some(PhysicalSortExpr::new(expr, options))
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{ArrayRef, BooleanArray, Int32Array};
+    use datafusion_common::ScalarValue;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_has_any_exact_match() {
+        // Case 1: Mixed exact and inexact matches
+        {
+            let computed_min = ScalarValue::Int32(Some(0));
+            let row_group_mins =
+                Arc::new(Int32Array::from(vec![0, 1, 0, 3, 0, 5])) as ArrayRef;
+            let exactness =
+                BooleanArray::from(vec![true, false, false, false, false, false]);
+
+            let result = has_any_exact_match(&computed_min, &row_group_mins, &exactness);
+            assert_eq!(result, Some(true));
+        }
+        // Case 2: All inexact matches
+        {
+            let computed_min = ScalarValue::Int32(Some(0));
+            let row_group_mins =
+                Arc::new(Int32Array::from(vec![0, 1, 0, 3, 0, 5])) as ArrayRef;
+            let exactness =
+                BooleanArray::from(vec![false, false, false, false, false, false]);
+
+            let result = has_any_exact_match(&computed_min, &row_group_mins, &exactness);
+            assert_eq!(result, Some(false));
+        }
+        // Case 3: All exact matches
+        {
+            let computed_max = ScalarValue::Int32(Some(5));
+            let row_group_maxes =
+                Arc::new(Int32Array::from(vec![1, 5, 3, 5, 2, 5])) as ArrayRef;
+            let exactness =
+                BooleanArray::from(vec![false, true, true, true, false, true]);
+
+            let result = has_any_exact_match(&computed_max, &row_group_maxes, &exactness);
+            assert_eq!(result, Some(true));
+        }
+        // Case 4: All maxes are null values
+        {
+            let computed_max = ScalarValue::Int32(None);
+            let row_group_maxes =
+                Arc::new(Int32Array::from(vec![None, None, None, None])) as ArrayRef;
+            let exactness = BooleanArray::from(vec![None, Some(true), None, Some(false)]);
+
+            let result = has_any_exact_match(&computed_max, &row_group_maxes, &exactness);
+            assert_eq!(result, Some(false));
+        }
+    }
+
+    mod ndv_tests {
+        use super::*;
+        use arrow::datatypes::Field;
+        use parquet::arrow::parquet_to_arrow_schema;
+        use parquet::basic::Type as PhysicalType;
+        use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+        use parquet::file::statistics::Statistics as ParquetStatistics;
+        use parquet::schema::types::{SchemaDescriptor, Type as SchemaType};
+        use std::fs::File;
+        use std::path::PathBuf;
+
+        fn create_schema_descr(num_columns: usize) -> Arc<SchemaDescriptor> {
+            let fields: Vec<Arc<SchemaType>> = (0..num_columns)
+                .map(|i| {
+                    Arc::new(
+                        SchemaType::primitive_type_builder(
+                            &format!("col_{i}"),
+                            PhysicalType::INT32,
+                        )
+                        .build()
+                        .unwrap(),
+                    )
+                })
+                .collect();
+
+            let schema = SchemaType::group_type_builder("schema")
+                .with_fields(fields)
+                .build()
+                .unwrap();
+
+            Arc::new(SchemaDescriptor::new(Arc::new(schema)))
+        }
+
+        fn create_arrow_schema(num_columns: usize) -> SchemaRef {
+            let fields: Vec<Field> = (0..num_columns)
+                .map(|i| Field::new(format!("col_{i}"), DataType::Int32, true))
+                .collect();
+            Arc::new(Schema::new(fields))
+        }
+
+        fn create_row_group_with_stats(
+            schema_descr: &Arc<SchemaDescriptor>,
+            column_stats: Vec<Option<ParquetStatistics>>,
+            num_rows: i64,
+        ) -> RowGroupMetaData {
+            let columns: Vec<ColumnChunkMetaData> = column_stats
+                .into_iter()
+                .enumerate()
+                .map(|(i, stats)| {
+                    let mut builder =
+                        ColumnChunkMetaData::builder(schema_descr.column(i));
+                    if let Some(s) = stats {
+                        builder = builder.set_statistics(s);
+                    }
+                    builder.set_num_values(num_rows).build().unwrap()
+                })
+                .collect();
+
+            RowGroupMetaData::builder(schema_descr.clone())
+                .set_num_rows(num_rows)
+                .set_total_byte_size(1000)
+                .set_column_metadata(columns)
+                .build()
+                .unwrap()
+        }
+
+        fn create_parquet_metadata(
+            schema_descr: Arc<SchemaDescriptor>,
+            row_groups: Vec<RowGroupMetaData>,
+        ) -> ParquetMetaData {
+            use parquet::file::metadata::FileMetaData;
+
+            let num_rows: i64 = row_groups.iter().map(|rg| rg.num_rows()).sum();
+            let file_meta = FileMetaData::new(
+                1,            // version
+                num_rows,     // num_rows
+                None,         // created_by
+                None,         // key_value_metadata
+                schema_descr, // schema_descr
+                None,         // column_orders
+            );
+
+            ParquetMetaData::new(file_meta, row_groups)
+        }
+
+        #[test]
+        fn test_distinct_count_single_row_group_with_ndv() {
+            // Single row group with distinct count should return Exact
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            // Create statistics with distinct_count = 42
+            let stats = ParquetStatistics::int32(
+                Some(1),   // min
+                Some(100), // max
+                Some(42),  // distinct_count
+                Some(0),   // null_count
+                false,     // is_deprecated
+            );
+
+            let row_group =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats)], 1000);
+            let metadata = create_parquet_metadata(schema_descr, vec![row_group]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Exact(42)
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_multiple_row_groups_with_ndv() {
+            // Multiple row groups with distinct counts should return Inexact (sum)
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            // Row group 1: distinct_count = 10
+            let stats1 = ParquetStatistics::int32(
+                Some(1),
+                Some(50),
+                Some(10), // distinct_count
+                Some(0),
+                false,
+            );
+
+            // Row group 2: distinct_count = 20
+            let stats2 = ParquetStatistics::int32(
+                Some(51),
+                Some(100),
+                Some(20), // distinct_count
+                Some(0),
+                false,
+            );
+
+            let row_group1 =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats1)], 500);
+            let row_group2 =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats2)], 500);
+            let metadata =
+                create_parquet_metadata(schema_descr, vec![row_group1, row_group2]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            // Max of distinct counts (lower bound since we can't accurately merge NDV)
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Inexact(20)
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_no_ndv_available() {
+            // No distinct count in statistics should return Absent
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            // Create statistics without distinct_count (None)
+            let stats = ParquetStatistics::int32(
+                Some(1),
+                Some(100),
+                None, // no distinct_count
+                Some(0),
+                false,
+            );
+
+            let row_group =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats)], 1000);
+            let metadata = create_parquet_metadata(schema_descr, vec![row_group]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Absent
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_partial_ndv_below_threshold() {
+            // 1 of 2 row groups has NDV (50% < 75% threshold) -> Absent
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            let stats1 =
+                ParquetStatistics::int32(Some(1), Some(50), Some(15), Some(0), false);
+            let stats2 =
+                ParquetStatistics::int32(Some(51), Some(100), None, Some(0), false);
+
+            let row_group1 =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats1)], 500);
+            let row_group2 =
+                create_row_group_with_stats(&schema_descr, vec![Some(stats2)], 500);
+            let metadata =
+                create_parquet_metadata(schema_descr, vec![row_group1, row_group2]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Absent
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_partial_ndv_above_threshold() {
+            // 3 of 4 row groups have NDV (75% >= 75% threshold) -> Inexact
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            let stats_with = |ndv| {
+                ParquetStatistics::int32(Some(1), Some(100), Some(ndv), Some(0), false)
+            };
+            let stats_without =
+                ParquetStatistics::int32(Some(1), Some(100), None, Some(0), false);
+
+            let rg1 = create_row_group_with_stats(
+                &schema_descr,
+                vec![Some(stats_with(10))],
+                250,
+            );
+            let rg2 = create_row_group_with_stats(
+                &schema_descr,
+                vec![Some(stats_with(20))],
+                250,
+            );
+            let rg3 = create_row_group_with_stats(
+                &schema_descr,
+                vec![Some(stats_with(15))],
+                250,
+            );
+            let rg4 = create_row_group_with_stats(
+                &schema_descr,
+                vec![Some(stats_without)],
+                250,
+            );
+            let metadata =
+                create_parquet_metadata(schema_descr, vec![rg1, rg2, rg3, rg4]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Inexact(20)
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_multiple_columns() {
+            // Test with multiple columns, each with different NDV
+            let schema_descr = create_schema_descr(3);
+            let arrow_schema = create_arrow_schema(3);
+
+            // col_0: distinct_count = 5
+            let stats0 =
+                ParquetStatistics::int32(Some(1), Some(10), Some(5), Some(0), false);
+            // col_1: no distinct_count
+            let stats1 =
+                ParquetStatistics::int32(Some(1), Some(100), None, Some(0), false);
+            // col_2: distinct_count = 100
+            let stats2 =
+                ParquetStatistics::int32(Some(1), Some(1000), Some(100), Some(0), false);
+
+            let row_group = create_row_group_with_stats(
+                &schema_descr,
+                vec![Some(stats0), Some(stats1), Some(stats2)],
+                1000,
+            );
+            let metadata = create_parquet_metadata(schema_descr, vec![row_group]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Exact(5)
+            );
+            assert_eq!(
+                result.column_statistics[1].distinct_count,
+                Precision::Absent
+            );
+            assert_eq!(
+                result.column_statistics[2].distinct_count,
+                Precision::Exact(100)
+            );
+        }
+
+        #[test]
+        fn test_distinct_count_no_statistics_at_all() {
+            // No statistics in row group should return Absent for all stats
+            let schema_descr = create_schema_descr(1);
+            let arrow_schema = create_arrow_schema(1);
+
+            // Create row group without any statistics
+            let row_group = create_row_group_with_stats(&schema_descr, vec![None], 1000);
+            let metadata = create_parquet_metadata(schema_descr, vec![row_group]);
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                &metadata,
+                &arrow_schema,
+            )
+            .unwrap();
+
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Absent
+            );
+        }
+
+        /// Integration test that reads a real Parquet file with distinct_count statistics.
+        /// The test file was created with DuckDB and has known NDV values:
+        /// - id: NULL (high cardinality, not tracked)
+        /// - category: 10 distinct values
+        /// - name: 5 distinct values
+        #[test]
+        fn test_distinct_count_from_real_parquet_file() {
+            // Path to test file created by DuckDB with distinct_count statistics
+            let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+            path.push("src/test_data/ndv_test.parquet");
+
+            let file = File::open(&path).expect("Failed to open test parquet file");
+            let reader =
+                SerializedFileReader::new(file).expect("Failed to create reader");
+            let parquet_metadata = reader.metadata();
+
+            // Derive Arrow schema from parquet file metadata
+            let arrow_schema = Arc::new(
+                parquet_to_arrow_schema(
+                    parquet_metadata.file_metadata().schema_descr(),
+                    None,
+                )
+                .expect("Failed to convert schema"),
+            );
+
+            let result = DFParquetMetadata::statistics_from_parquet_metadata(
+                parquet_metadata,
+                &arrow_schema,
+            )
+            .expect("Failed to extract statistics");
+
+            // id: no distinct_count (high cardinality)
+            assert_eq!(
+                result.column_statistics[0].distinct_count,
+                Precision::Absent,
+                "id column should have Absent distinct_count"
+            );
+
+            // category: 10 distinct values
+            assert_eq!(
+                result.column_statistics[1].distinct_count,
+                Precision::Exact(10),
+                "category column should have Exact(10) distinct_count"
+            );
+
+            // name: 5 distinct values
+            assert_eq!(
+                result.column_statistics[2].distinct_count,
+                Precision::Exact(5),
+                "name column should have Exact(5) distinct_count"
+            );
+        }
+    }
+}
diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs
index 3213d0201295a..2d6fb69270bf3 100644
--- a/datafusion/datasource-parquet/src/metrics.rs
+++ b/datafusion/datasource-parquet/src/metrics.rs
@@ -16,7 +16,8 @@
 // under the License.
 
 use datafusion_physical_plan::metrics::{
-    Count, ExecutionPlanMetricsSet, MetricBuilder, Time,
+    Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, MetricType, PruningMetrics,
+    RatioMergeStrategy, RatioMetrics, Time,
 };
 
 /// Stores metrics about the parquet execution for a particular parquet file.
@@ -27,16 +28,29 @@ use datafusion_physical_plan::metrics::{
 /// [`ParquetFileReaderFactory`]: super::ParquetFileReaderFactory
 #[derive(Debug, Clone)]
 pub struct ParquetFileMetrics {
+    /// Number of file **ranges** pruned or matched by partition or file level statistics.
+    /// Pruning of files often happens at planning time but may happen at execution time
+    /// if dynamic filters (e.g. from a join) result in additional pruning.
+    ///
+    /// This does **not** necessarily equal the number of files pruned:
+    /// files may be scanned in sub-ranges to increase parallelism,
+    /// in which case this will represent the number of sub-ranges pruned, not the number of files.
+    /// The number of files pruned will always be less than or equal to this number.
+    ///
+    /// A single file may have some ranges that are not pruned and some that are pruned.
+    /// For example, with a query like `ORDER BY col LIMIT 10`, the TopK dynamic filter
+    /// pushdown optimization may fill up the TopK heap when reading the first part of a file,
+    /// then skip the second part if file statistics indicate it cannot contain rows
+    /// that would be in the TopK.
+    pub files_ranges_pruned_statistics: PruningMetrics,
     /// Number of times the predicate could not be evaluated
     pub predicate_evaluation_errors: Count,
-    /// Number of row groups whose bloom filters were checked and matched (not pruned)
-    pub row_groups_matched_bloom_filter: Count,
     /// Number of row groups pruned by bloom filters
-    pub row_groups_pruned_bloom_filter: Count,
-    /// Number of row groups whose statistics were checked and matched (not pruned)
-    pub row_groups_matched_statistics: Count,
+    pub row_groups_pruned_bloom_filter: PruningMetrics,
+    /// Number of row groups pruned due to limit pruning.
+    pub limit_pruned_row_groups: PruningMetrics,
     /// Number of row groups pruned by statistics
-    pub row_groups_pruned_statistics: Count,
+    pub row_groups_pruned_statistics: PruningMetrics,
     /// Total number of bytes scanned
     pub bytes_scanned: Count,
     /// Total rows filtered out by predicates pushed into parquet scan
@@ -49,14 +63,34 @@ pub struct ParquetFileMetrics {
     pub statistics_eval_time: Time,
     /// Total time spent evaluating row group Bloom Filters
     pub bloom_filter_eval_time: Time,
-    /// Total rows filtered out by parquet page index
-    pub page_index_rows_pruned: Count,
-    /// Total rows passed through the parquet page index
-    pub page_index_rows_matched: Count,
+    /// Total rows filtered or matched by parquet page index
+    pub page_index_rows_pruned: PruningMetrics,
+    /// Total pages filtered or matched by parquet page index
+    pub page_index_pages_pruned: PruningMetrics,
     /// Total time spent evaluating parquet page index filters
     pub page_index_eval_time: Time,
     /// Total time spent reading and parsing metadata from the footer
     pub metadata_load_time: Time,
+    /// Scan Efficiency Ratio, calculated as bytes_scanned / total_file_size
+    pub scan_efficiency_ratio: RatioMetrics,
+    /// Predicate Cache: Total number of rows physically read and decoded from the Parquet file.
+    ///
+    /// This metric tracks "cache misses" in the predicate pushdown optimization.
+    /// When the specialized predicate reader cannot find the requested data in its cache,
+    /// it must fall back to the "inner reader" to physically decode the data from the
+    /// Parquet.
+    ///
+    /// This is the expensive path (IO + Decompression + Decoding).
+    ///
+    /// We use a Gauge here as arrow-rs reports absolute numbers rather
+    /// than incremental readings, we want a `set` operation here rather
+    /// than `add`. Earlier it was `Count`, which led to this issue:
+    /// github.com/apache/datafusion/issues/19334
+    pub predicate_cache_inner_records: Gauge,
+    /// Predicate Cache: number of records read from the cache. This is the
+    /// number of rows that were stored in the cache after evaluating predicates
+    /// reused for the output.
+    pub predicate_cache_records: Gauge,
 }
 
 impl ParquetFileMetrics {
@@ -66,30 +100,59 @@ impl ParquetFileMetrics {
         filename: &str,
         metrics: &ExecutionPlanMetricsSet,
     ) -> Self {
-        let predicate_evaluation_errors = MetricBuilder::new(metrics)
-            .with_new_label("filename", filename.to_string())
-            .counter("predicate_evaluation_errors", partition);
-
-        let row_groups_matched_bloom_filter = MetricBuilder::new(metrics)
-            .with_new_label("filename", filename.to_string())
-            .counter("row_groups_matched_bloom_filter", partition);
-
+        // -----------------------
+        // 'summary' level metrics
+        // -----------------------
         let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .counter("row_groups_pruned_bloom_filter", partition);
+            .with_type(MetricType::SUMMARY)
+            .pruning_metrics("row_groups_pruned_bloom_filter", partition);
 
-        let row_groups_matched_statistics = MetricBuilder::new(metrics)
+        let limit_pruned_row_groups = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .counter("row_groups_matched_statistics", partition);
+            .with_type(MetricType::SUMMARY)
+            .pruning_metrics("limit_pruned_row_groups", partition);
 
         let row_groups_pruned_statistics = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .counter("row_groups_pruned_statistics", partition);
+            .with_type(MetricType::SUMMARY)
+            .pruning_metrics("row_groups_pruned_statistics", partition);
+
+        let page_index_pages_pruned = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .with_type(MetricType::SUMMARY)
+            .pruning_metrics("page_index_pages_pruned", partition);
 
         let bytes_scanned = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
+            .with_type(MetricType::SUMMARY)
             .counter("bytes_scanned", partition);
 
+        let metadata_load_time = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .with_type(MetricType::SUMMARY)
+            .subset_time("metadata_load_time", partition);
+
+        let files_ranges_pruned_statistics = MetricBuilder::new(metrics)
+            .with_type(MetricType::SUMMARY)
+            .pruning_metrics("files_ranges_pruned_statistics", partition);
+
+        let scan_efficiency_ratio = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .with_type(MetricType::SUMMARY)
+            .ratio_metrics_with_strategy(
+                "scan_efficiency_ratio",
+                partition,
+                RatioMergeStrategy::AddPartSetTotal,
+            );
+
+        // -----------------------
+        // 'dev' level metrics
+        // -----------------------
+        let predicate_evaluation_errors = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .counter("predicate_evaluation_errors", partition);
+
         let pushdown_rows_pruned = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
             .counter("pushdown_rows_pruned", partition);
@@ -107,37 +170,41 @@ impl ParquetFileMetrics {
             .with_new_label("filename", filename.to_string())
             .subset_time("bloom_filter_eval_time", partition);
 
-        let page_index_rows_pruned = MetricBuilder::new(metrics)
+        let page_index_eval_time = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .counter("page_index_rows_pruned", partition);
-        let page_index_rows_matched = MetricBuilder::new(metrics)
+            .subset_time("page_index_eval_time", partition);
+
+        let page_index_rows_pruned = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .counter("page_index_rows_matched", partition);
+            .pruning_metrics("page_index_rows_pruned", partition);
 
-        let page_index_eval_time = MetricBuilder::new(metrics)
+        let predicate_cache_inner_records = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .subset_time("page_index_eval_time", partition);
+            .gauge("predicate_cache_inner_records", partition);
 
-        let metadata_load_time = MetricBuilder::new(metrics)
+        let predicate_cache_records = MetricBuilder::new(metrics)
             .with_new_label("filename", filename.to_string())
-            .subset_time("metadata_load_time", partition);
+            .gauge("predicate_cache_records", partition);
 
         Self {
+            files_ranges_pruned_statistics,
             predicate_evaluation_errors,
-            row_groups_matched_bloom_filter,
             row_groups_pruned_bloom_filter,
-            row_groups_matched_statistics,
             row_groups_pruned_statistics,
+            limit_pruned_row_groups,
             bytes_scanned,
             pushdown_rows_pruned,
             pushdown_rows_matched,
             row_pushdown_eval_time,
             page_index_rows_pruned,
-            page_index_rows_matched,
+            page_index_pages_pruned,
             statistics_eval_time,
             bloom_filter_eval_time,
             page_index_eval_time,
             metadata_load_time,
+            scan_efficiency_ratio,
+            predicate_cache_inner_records,
+            predicate_cache_records,
         }
     }
 }
diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs
index 0b4e862403837..4d46f84aa8167 100644
--- a/datafusion/datasource-parquet/src/mod.rs
+++ b/datafusion/datasource-parquet/src/mod.rs
@@ -18,23 +18,28 @@
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 pub mod access_plan;
 pub mod file_format;
+pub mod metadata;
 mod metrics;
 mod opener;
 mod page_filter;
 mod reader;
 mod row_filter;
 mod row_group_filter;
+mod sort;
 pub mod source;
+mod supported_predicates;
 mod writer;
 
 pub use access_plan::{ParquetAccessPlan, RowGroupAccess};
 pub use file_format::*;
 pub use metrics::ParquetFileMetrics;
+pub use opener::ParquetMorselizer;
 pub use page_filter::PagePruningAccessPlanFilter;
-pub use reader::{DefaultParquetFileReaderFactory, ParquetFileReaderFactory};
+pub use reader::*; // Expose so downstream crates can use it
 pub use row_filter::build_row_filter;
 pub use row_filter::can_expr_be_pushed_down_with_schemas;
 pub use row_group_filter::RowGroupAccessPlanFilter;
diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
index 9e14425074f78..9f17184b19373 100644
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -15,49 +15,99 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`ParquetOpener`] for opening Parquet files
-
-use std::sync::Arc;
+//! [`ParquetMorselizer`] for morselizing Parquet files
 
 use crate::page_filter::PagePruningAccessPlanFilter;
 use crate::row_group_filter::RowGroupAccessPlanFilter;
 use crate::{
-    apply_file_schema_type_coercions, coerce_int96_to_resolution, row_filter,
     ParquetAccessPlan, ParquetFileMetrics, ParquetFileReaderFactory,
+    apply_file_schema_type_coercions, coerce_int96_to_resolution, row_filter,
 };
-use datafusion_datasource::file_meta::FileMeta;
-use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener};
-use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
+use arrow::array::{RecordBatch, RecordBatchOptions};
+use arrow::datatypes::{DataType, Schema};
+use datafusion_datasource::morsel::{Morsel, MorselPlan, MorselPlanner, Morselizer};
+use datafusion_physical_expr::projection::{ProjectionExprs, Projector};
+use datafusion_physical_expr::utils::reassign_expr_columns;
+use datafusion_physical_expr_adapter::replace_columns_with_literals;
+use parquet::errors::ParquetError;
+use std::collections::HashMap;
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::future::Future;
+use std::mem;
+use std::ops::Deref;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
 
 use arrow::datatypes::{SchemaRef, TimeUnit};
-use arrow::error::ArrowError;
-use datafusion_common::{exec_err, Result};
-use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_optimizer::pruning::PruningPredicate;
-use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder};
+use datafusion_common::encryption::FileDecryptionProperties;
+use datafusion_common::stats::Precision;
+use datafusion_common::{
+    ColumnStatistics, DataFusionError, Result, ScalarValue, Statistics, exec_err,
+};
+use datafusion_datasource::{PartitionedFile, TableSchema};
+use datafusion_physical_expr::simplifier::PhysicalExprSimplifier;
+use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory;
+use datafusion_physical_expr_common::physical_expr::{
+    PhysicalExpr, is_dynamic_physical_expr,
+};
+use datafusion_physical_plan::metrics::{
+    Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, PruningMetrics,
+};
+use datafusion_pruning::{FilePruner, PruningPredicate, build_pruning_predicate};
 
-use futures::{StreamExt, TryStreamExt};
+#[cfg(feature = "parquet_encryption")]
+use datafusion_common::config::EncryptionFactoryOptions;
+#[cfg(feature = "parquet_encryption")]
+use datafusion_execution::parquet_encryption::EncryptionFactory;
+use futures::{FutureExt, Stream, StreamExt, ready, stream::BoxStream};
 use log::debug;
-use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
+use parquet::DecodeResult;
+use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics;
+use parquet::arrow::arrow_reader::{
+    ArrowReaderMetadata, ArrowReaderOptions, RowSelectionPolicy,
+};
 use parquet::arrow::async_reader::AsyncFileReader;
+use parquet::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder};
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::file::metadata::ParquetMetaDataReader;
+use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader};
+use tokio::sync::oneshot;
+
+/// Implements [`Morselizer`] for a parquet file.
+///
+/// The current implementation preserves parity with the existing opener path:
+///
+/// 1. `morselize` creates a single planner for the input file
+/// 2. the planner's first `plan` call returns an I/O future
+/// 3. that future runs the copied parquet open/setup flow:
+///    file pruning, metadata loading, optional page-index / bloom-filter work,
+///    row-group pruning, decoder construction, and final stream setup
+/// 4. the next `plan` call emits a single ready morsel wrapping that prepared stream
+///
+/// This keeps the behavioral parity of `opener.rs` while routing execution
+/// through the new `Morselizer` / `MorselPlanner` API.
+#[derive(Clone)]
+pub struct ParquetMorselizer {
+    state: Arc<ParquetMorselizerState>,
+}
 
-/// Implements [`FileOpener`] for a parquet file
-pub(super) struct ParquetOpener {
+/// State needed to plan Parquet morsels
+pub struct ParquetMorselizerState {
     /// Execution partition index
-    pub partition_index: usize,
-    /// Column indexes in `table_schema` needed by the query
-    pub projection: Arc<[usize]>,
+    pub(crate) partition_index: usize,
+    /// Projection to apply on top of the table schema (i.e. can reference partition columns).
+    pub projection: ProjectionExprs,
     /// Target number of rows in each output RecordBatch
     pub batch_size: usize,
     /// Optional limit on the number of rows to read
-    pub limit: Option<usize>,
+    pub(crate) limit: Option<usize>,
+    /// If should keep the output rows in order
+    pub preserve_order: bool,
     /// Optional predicate to apply during the scan
     pub predicate: Option<Arc<dyn PhysicalExpr>>,
-    /// Schema of the output table without partition columns.
-    /// This is the schema we coerce the physical file schema into.
-    pub logical_file_schema: SchemaRef,
+    /// Table schema, including partition columns.
+    pub table_schema: TableSchema,
     /// Optional hint for how large the initial request to read parquet metadata
     /// should be
     pub metadata_size_hint: Option<usize>,
@@ -65,260 +115,1544 @@ pub(super) struct ParquetOpener {
     pub metrics: ExecutionPlanMetricsSet,
     /// Factory for instantiating parquet reader
     pub parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
-    /// Should the filters be evaluated during the parquet scan using
-    /// [`DataFusionArrowPredicate`](row_filter::DatafusionArrowPredicate)?
+    /// Should the filters be evaluated during the parquet scan using the
+    /// parquet row-filter predicate machinery?
     pub pushdown_filters: bool,
     /// Should the filters be reordered to optimize the scan?
     pub reorder_filters: bool,
+    /// Should we force the reader to use RowSelections for filtering
+    pub force_filter_selections: bool,
     /// Should the page index be read from parquet files, if present, to skip
     /// data pages
     pub enable_page_index: bool,
     /// Should the bloom filter be read from parquet, if present, to skip row
     /// groups
     pub enable_bloom_filter: bool,
-    /// Schema adapter factory
-    pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
     /// Should row group pruning be applied
     pub enable_row_group_stats_pruning: bool,
     /// Coerce INT96 timestamps to specific TimeUnit
     pub coerce_int96: Option<TimeUnit>,
+    /// Rewrite expressions in the context of the file schema
+    pub(crate) expr_adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+    /// Encryption configuration used to resolve per-file decryption properties.
+    pub(crate) encryption_context: EncryptionContext,
+    /// Maximum size of the predicate cache, in bytes. If none, uses
+    /// the arrow-rs default.
+    pub max_predicate_cache_size: Option<usize>,
+    /// Whether to read row groups in reverse order
+    pub reverse_row_groups: bool,
+}
+
+impl ParquetMorselizer {
+    pub(crate) fn new(state: ParquetMorselizerState) -> Self {
+        Self {
+            state: Arc::new(state),
+        }
+    }
+}
+
+impl Deref for ParquetMorselizer {
+    type Target = ParquetMorselizerState;
+
+    fn deref(&self) -> &Self::Target {
+        &self.state
+    }
+}
+
+impl Debug for ParquetMorselizer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ParquetMorselizer")
+            .field("partition_index", &self.partition_index)
+            .field("batch_size", &self.batch_size)
+            .field("limit", &self.limit)
+            .field("preserve_order", &self.preserve_order)
+            .field("metadata_size_hint", &self.metadata_size_hint)
+            .field("pushdown_filters", &self.pushdown_filters)
+            .field("reorder_filters", &self.reorder_filters)
+            .field("force_filter_selections", &self.force_filter_selections)
+            .field("enable_page_index", &self.enable_page_index)
+            .field("enable_bloom_filter", &self.enable_bloom_filter)
+            .field(
+                "enable_row_group_stats_pruning",
+                &self.enable_row_group_stats_pruning,
+            )
+            .field("coerce_int96", &self.coerce_int96)
+            .field("max_predicate_cache_size", &self.max_predicate_cache_size)
+            .field("reverse_row_groups", &self.reverse_row_groups)
+            .finish()
+    }
+}
+
+/// Result of preparing a PartitionedFile using CPU before any I/O.
+///
+/// This captures the state computed from `PartitionedFile`, the table schema,
+/// and scan configuration so that later planner states only need to perform
+/// async work such as metadata loading and stream construction.
+struct PreparedParquetOpen {
+    state: Arc<ParquetMorselizerState>,
+    partitioned_file: PartitionedFile,
+    file_range: Option<datafusion_datasource::FileRange>,
+    extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
+    file_metrics: ParquetFileMetrics,
+    file_pruner: Option<FilePruner>,
+    metadata_size_hint: Option<usize>,
+    async_file_reader: Box<dyn AsyncFileReader>,
+    logical_file_schema: SchemaRef,
+    output_schema: Arc<Schema>,
+    projection: ProjectionExprs,
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    #[cfg(feature = "parquet_encryption")]
+    file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
+}
+
+/// Result of loading parquet metadata after file-level pruning has completed.
+struct MetadataLoadedParquetOpen {
+    prepared: PreparedParquetOpen,
+    reader_metadata: ArrowReaderMetadata,
+    options: ArrowReaderOptions,
+}
+
+/// Result of CPU-only preparation after metadata has been loaded.
+///
+/// This captures the file schema coercions and file-specific pruning predicates
+/// so the next async step only has to fetch any missing page index data.
+struct FiltersPreparedParquetOpen {
+    loaded: MetadataLoadedParquetOpen,
+    physical_file_schema: SchemaRef,
+    projection: ProjectionExprs,
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    pruning_predicate: Option<Arc<PruningPredicate>>,
+    page_pruning_predicate: Option<Arc<PagePruningAccessPlanFilter>>,
 }
 
-impl FileOpener for ParquetOpener {
-    fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture> {
-        let file_range = file_meta.range.clone();
-        let extensions = file_meta.extensions.clone();
-        let file_name = file_meta.location().to_string();
+/// Result of CPU-only row-group pruning using parquet metadata.
+///
+/// This captures the row groups that remain after range, statistics, and
+/// limit-based pruning so the next async step can optionally load and apply
+/// bloom filters before the final stream is built.
+struct RowGroupsPreparedParquetOpen {
+    prepared: FiltersPreparedParquetOpen,
+    row_groups: RowGroupAccessPlanFilter,
+}
+
+impl ParquetMorselizerState {
+    /// Perform the CPU-only setup for opening a parquet file.
+    fn prepare_open_file(
+        self: &Arc<Self>,
+        partitioned_file: PartitionedFile,
+    ) -> Result<PreparedParquetOpen> {
+        // -----------------------------------
+        // Step: prepare configurations, etc.
+        // -----------------------------------
+        let file_range = partitioned_file.range.clone();
+        let extensions = partitioned_file.extensions.clone();
+        let file_name = partitioned_file.object_meta.location.to_string();
         let file_metrics =
             ParquetFileMetrics::new(self.partition_index, &file_name, &self.metrics);
 
-        let metadata_size_hint = file_meta.metadata_size_hint.or(self.metadata_size_hint);
+        let metadata_size_hint = partitioned_file
+            .metadata_size_hint
+            .or(self.metadata_size_hint);
 
-        let mut async_file_reader: Box<dyn AsyncFileReader> =
+        let async_file_reader: Box<dyn AsyncFileReader> =
             self.parquet_file_reader_factory.create_reader(
                 self.partition_index,
-                file_meta,
+                partitioned_file.clone(),
                 metadata_size_hint,
                 &self.metrics,
             )?;
 
-        let batch_size = self.batch_size;
-
-        let projected_schema =
-            SchemaRef::from(self.logical_file_schema.project(&self.projection)?);
-        let schema_adapter_factory = Arc::clone(&self.schema_adapter_factory);
-        let schema_adapter = self
-            .schema_adapter_factory
-            .create(projected_schema, Arc::clone(&self.logical_file_schema));
-        let predicate = self.predicate.clone();
-        let logical_file_schema = Arc::clone(&self.logical_file_schema);
-        let reorder_predicates = self.reorder_filters;
-        let pushdown_filters = self.pushdown_filters;
-        let coerce_int96 = self.coerce_int96;
-        let enable_bloom_filter = self.enable_bloom_filter;
-        let enable_row_group_stats_pruning = self.enable_row_group_stats_pruning;
-        let limit = self.limit;
+        // Calculate the output schema from the original projection (before literal replacement)
+        // so we get correct field names from column references
+        let logical_file_schema = Arc::clone(self.table_schema.file_schema());
+        let output_schema = Arc::new(
+            self.projection
+                .project_schema(self.table_schema.table_schema())?,
+        );
+
+        // Build a combined map for replacing column references with literal values.
+        // This includes:
+        // 1. Partition column values from the file path (e.g., region=us-west-2)
+        // 2. Constant columns detected from file statistics (where min == max)
+        //
+        // Although partition columns *are* constant columns, we don't want to rely on
+        // statistics for them being populated if we can use the partition values
+        // (which are guaranteed to be present).
+        //
+        // For example, given a partition column `region` and predicate
+        // `region IN ('us-east-1', 'eu-central-1')` with file path
+        // `/data/region=us-west-2/...`, the predicate is rewritten to
+        // `'us-west-2' IN ('us-east-1', 'eu-central-1')` which simplifies to FALSE.
+        //
+        // While partition column optimization is done during logical planning,
+        // there are cases where partition columns may appear in more complex
+        // predicates that cannot be simplified until we open the file (such as
+        // dynamic predicates).
+        let mut literal_columns: HashMap<String, ScalarValue> = self
+            .table_schema
+            .table_partition_cols()
+            .iter()
+            .zip(partitioned_file.partition_values.iter())
+            .map(|(field, value)| (field.name().clone(), value.clone()))
+            .collect();
+        // Add constant columns from file statistics.
+        // Note that if there are statistics for partition columns there will be overlap,
+        // but since we use a HashMap, we'll just overwrite the partition values with the
+        // constant values from statistics (which should be the same).
+        literal_columns.extend(constant_columns_from_stats(
+            partitioned_file.statistics.as_deref(),
+            &logical_file_schema,
+        ));
+
+        // Apply literal replacements to projection and predicate
+        let mut projection = self.projection.clone();
+        let mut predicate = self.predicate.clone();
+        if !literal_columns.is_empty() {
+            projection = projection.try_map_exprs(|expr| {
+                replace_columns_with_literals(Arc::clone(&expr), &literal_columns)
+            })?;
+            predicate = predicate
+                .map(|p| replace_columns_with_literals(p, &literal_columns))
+                .transpose()?;
+        }
 
         let predicate_creation_errors = MetricBuilder::new(&self.metrics)
             .global_counter("num_predicate_creation_errors");
 
-        let enable_page_index = self.enable_page_index;
-
-        Ok(Box::pin(async move {
-            // Don't load the page index yet. Since it is not stored inline in
-            // the footer, loading the page index if it is not needed will do
-            // unecessary I/O. We decide later if it is needed to evaluate the
-            // pruning predicates. Thus default to not requesting if from the
-            // underlying reader.
-            let mut options = ArrowReaderOptions::new().with_page_index(false);
-            let mut metadata_timer = file_metrics.metadata_load_time.timer();
-
-            // Begin by loading the metadata from the underlying reader (note
-            // the returned metadata may actually include page indexes as some
-            // readers may return page indexes even when not requested -- for
-            // example when they are cached)
-            let mut reader_metadata =
-                ArrowReaderMetadata::load_async(&mut async_file_reader, options.clone())
-                    .await?;
-
-            // Note about schemas: we are actually dealing with **3 different schemas** here:
-            // - The table schema as defined by the TableProvider.
-            //   This is what the user sees, what they get when they `SELECT * FROM table`, etc.
-            // - The logical file schema: this is the table schema minus any hive partition columns and projections.
-            //   This is what the physicalfile schema is coerced to.
-            // - The physical file schema: this is the schema as defined by the parquet file. This is what the parquet file actually contains.
-            let mut physical_file_schema = Arc::clone(reader_metadata.schema());
-
-            // The schema loaded from the file may not be the same as the
-            // desired schema (for example if we want to instruct the parquet
-            // reader to read strings using Utf8View instead). Update if necessary
-            if let Some(merged) = apply_file_schema_type_coercions(
-                &logical_file_schema,
+        let file_pruner = predicate
+            .as_ref()
+            .filter(|p| is_dynamic_physical_expr(p) || partitioned_file.has_statistics())
+            .and_then(|p| {
+                FilePruner::try_new(
+                    Arc::clone(p),
+                    &logical_file_schema,
+                    &partitioned_file,
+                    predicate_creation_errors.clone(),
+                )
+            });
+
+        Ok(PreparedParquetOpen {
+            state: Arc::clone(self),
+            partitioned_file,
+            file_range,
+            extensions,
+            file_metrics,
+            file_pruner,
+            metadata_size_hint,
+            async_file_reader,
+            logical_file_schema,
+            output_schema,
+            projection,
+            predicate,
+            #[cfg(feature = "parquet_encryption")]
+            file_decryption_properties: None,
+        })
+    }
+}
+
+impl PreparedParquetOpen {
+    /// CPU-only file pruning performed before metadata I/O begins.
+    ///
+    /// Returns `None` if the file was completely pruned.
+    fn prune_file(mut self) -> Result<Option<Self>> {
+        if let Some(file_pruner) = &mut self.file_pruner
+            && file_pruner.should_prune()?
+        {
+            self.file_metrics
+                .files_ranges_pruned_statistics
+                .add_pruned(1);
+            return Ok(None);
+        }
+
+        self.file_metrics
+            .files_ranges_pruned_statistics
+            .add_matched(1);
+        Ok(Some(self))
+    }
+
+    /// Fetch parquet metadata once file-level pruning is complete.
+    async fn load(mut self) -> Result<MetadataLoadedParquetOpen> {
+        let options =
+            ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Skip);
+        #[cfg(feature = "parquet_encryption")]
+        let mut options = options;
+        #[cfg(feature = "parquet_encryption")]
+        if let Some(fd_val) = &self.file_decryption_properties {
+            options = options.with_file_decryption_properties(Arc::clone(fd_val));
+        }
+        let reader_metadata = {
+            let mut metadata_timer = self.file_metrics.metadata_load_time.timer();
+            let reader_metadata = ArrowReaderMetadata::load_async(
+                &mut self.async_file_reader,
+                options.clone(),
+            )
+            .await?;
+            metadata_timer.stop();
+            reader_metadata
+        };
+        Ok(MetadataLoadedParquetOpen {
+            prepared: self,
+            reader_metadata,
+            options,
+        })
+    }
+}
+
+impl MetadataLoadedParquetOpen {
+    /// Prepare file-specific filters and schema coercions after metadata is loaded.
+    fn prepare_filters(self) -> Result<FiltersPreparedParquetOpen> {
+        let MetadataLoadedParquetOpen {
+            mut prepared,
+            mut reader_metadata,
+            mut options,
+        } = self;
+        let state = Arc::clone(&prepared.state);
+        let coerce_int96 = state.coerce_int96;
+        let predicate_creation_errors = MetricBuilder::new(&state.metrics)
+            .global_counter("num_predicate_creation_errors");
+        let expr_adapter_factory = Arc::clone(&state.expr_adapter_factory);
+
+        // Note about schemas: we are actually dealing with **3 different schemas** here:
+        // - The table schema as defined by the TableProvider.
+        //   This is what the user sees, what they get when they `SELECT * FROM table`, etc.
+        // - The logical file schema: this is the table schema minus any hive partition columns and projections.
+        //   This is what the physical file schema is coerced to.
+        // - The physical file schema: this is the schema that the arrow-rs
+        //   parquet reader will actually produce.
+        let logical_file_schema = Arc::clone(&prepared.logical_file_schema);
+        let mut physical_file_schema = Arc::clone(reader_metadata.schema());
+
+        // The schema loaded from the file may not be the same as the
+        // desired schema (for example if we want to instruct the parquet
+        // reader to read strings using Utf8View instead). Update if necessary.
+        if let Some(merged) =
+            apply_file_schema_type_coercions(&logical_file_schema, &physical_file_schema)
+        {
+            physical_file_schema = Arc::new(merged);
+            options = options.with_schema(Arc::clone(&physical_file_schema));
+            reader_metadata = ArrowReaderMetadata::try_new(
+                Arc::clone(reader_metadata.metadata()),
+                options.clone(),
+            )?;
+        }
+
+        if let Some(ref coerce) = coerce_int96
+            && let Some(merged) = coerce_int96_to_resolution(
+                reader_metadata.parquet_schema(),
                 &physical_file_schema,
-            ) {
-                physical_file_schema = Arc::new(merged);
-                options = options.with_schema(Arc::clone(&physical_file_schema));
-                reader_metadata = ArrowReaderMetadata::try_new(
-                    Arc::clone(reader_metadata.metadata()),
-                    options.clone(),
-                )?;
+                coerce,
+            )
+        {
+            physical_file_schema = Arc::new(merged);
+            options = options.with_schema(Arc::clone(&physical_file_schema));
+            reader_metadata = ArrowReaderMetadata::try_new(
+                Arc::clone(reader_metadata.metadata()),
+                options.clone(),
+            )?;
+        }
+
+        // Adapt the projection & filter predicate to the physical file schema.
+        // This evaluates missing columns and inserts any necessary casts.
+        // After rewriting to the file schema, further simplifications may be possible.
+        // For example, if `'a' = col_that_is_missing` becomes `'a' = NULL` that can then be simplified to `FALSE`
+        // and we can avoid doing any more work on the file (bloom filters, loading the page index, etc.).
+        // Additionally, if any casts were inserted we can move casts from the column to the literal side:
+        // `CAST(col AS INT) = 5` can become `col = CAST(5 AS <col type>)`, which can be evaluated statically.
+        let rewriter = expr_adapter_factory.create(
+            Arc::clone(&logical_file_schema),
+            Arc::clone(&physical_file_schema),
+        )?;
+        let simplifier = PhysicalExprSimplifier::new(&physical_file_schema);
+        prepared.predicate = prepared
+            .predicate
+            .map(|p| simplifier.simplify(rewriter.rewrite(p)?))
+            .transpose()?;
+        // Adapt projections to the physical file schema as well
+        prepared.projection = prepared
+            .projection
+            .try_map_exprs(|p| simplifier.simplify(rewriter.rewrite(p)?))?;
+
+        // Build predicates for this specific file
+        let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates(
+            prepared.predicate.as_ref(),
+            &physical_file_schema,
+            &predicate_creation_errors,
+        );
+
+        let projection = prepared.projection.clone();
+        let predicate = prepared.predicate.clone();
+
+        Ok(FiltersPreparedParquetOpen {
+            loaded: MetadataLoadedParquetOpen {
+                prepared,
+                reader_metadata,
+                options,
+            },
+            physical_file_schema,
+            projection,
+            predicate,
+            pruning_predicate,
+            page_pruning_predicate,
+        })
+    }
+}
+
+impl FiltersPreparedParquetOpen {
+    /// Fetch the page index if it is needed and missing from the loaded metadata.
+    async fn load_page_index(mut self) -> Result<Self> {
+        let enable_page_index = self.loaded.prepared.state.enable_page_index;
+        // The page index is not stored inline in the parquet footer so the
+        // metadata load above may not have read the page index structures yet.
+        // If we need them for reading and they aren't yet loaded, we need to
+        // load them now.
+        if should_enable_page_index(enable_page_index, &self.page_pruning_predicate) {
+            self.loaded.reader_metadata = load_page_index(
+                self.loaded.reader_metadata,
+                &mut self.loaded.prepared.async_file_reader,
+                // Since we're manually loading the page index the option here
+                // should not matter but we pass it in for consistency.
+                self.loaded
+                    .options
+                    .clone()
+                    .with_page_index_policy(PageIndexPolicy::Optional),
+            )
+            .await?;
+        }
+        Ok(self)
+    }
+
+    /// Prune row groups using file ranges and parquet metadata.
+    fn prepare_row_groups(self) -> Result<RowGroupsPreparedParquetOpen> {
+        let loaded = &self.loaded;
+        let inner = &loaded.prepared;
+        let state = &inner.state;
+
+        // Determine which row groups to actually read. The idea is to skip
+        // as many row groups as possible based on the metadata and query.
+        let file_metadata = Arc::clone(loaded.reader_metadata.metadata());
+        let rg_metadata = file_metadata.row_groups();
+        let file_name = inner.partitioned_file.object_meta.location.to_string();
+        let pruning_pred = self.pruning_predicate.as_ref().map(|p| p.as_ref());
+
+        let access_plan =
+            create_initial_plan(&file_name, inner.extensions.clone(), rg_metadata.len())?;
+        let mut row_groups = RowGroupAccessPlanFilter::new(access_plan);
+
+        // If there is a range restricting what parts of the file to read.
+        if let Some(range) = inner.file_range.as_ref() {
+            row_groups.prune_by_range(rg_metadata, range);
+        }
+
+        // If there is a predicate that can be evaluated against the metadata.
+        if let Some(pruning_pred) = pruning_pred {
+            if state.enable_row_group_stats_pruning {
+                row_groups.prune_by_statistics(
+                    &self.physical_file_schema,
+                    loaded.reader_metadata.parquet_schema(),
+                    rg_metadata,
+                    pruning_pred,
+                    &inner.file_metrics,
+                );
+            } else {
+                inner
+                    .file_metrics
+                    .row_groups_pruned_statistics
+                    .add_matched(row_groups.remaining_row_group_count());
+            }
+
+            if !state.enable_bloom_filter || row_groups.is_empty() {
+                inner
+                    .file_metrics
+                    .row_groups_pruned_bloom_filter
+                    .add_matched(row_groups.remaining_row_group_count());
             }
+        } else {
+            let n_remaining_row_groups = row_groups.remaining_row_group_count();
+            inner
+                .file_metrics
+                .row_groups_pruned_statistics
+                .add_matched(n_remaining_row_groups);
+            inner
+                .file_metrics
+                .row_groups_pruned_bloom_filter
+                .add_matched(n_remaining_row_groups);
+        }
+
+        Ok(RowGroupsPreparedParquetOpen {
+            prepared: self,
+            row_groups,
+        })
+    }
+}
+
+impl RowGroupsPreparedParquetOpen {
+    /// Apply bloom filter pruning when it is enabled and a pruning predicate exists.
+    async fn prune_bloom_filters(mut self) -> Result<Self> {
+        let loaded = &mut self.prepared.loaded;
+        let inner = &mut loaded.prepared;
+        let state = &inner.state;
+        let pruning_pred = self.prepared.pruning_predicate.as_ref().map(|p| p.as_ref());
+
+        if let Some(pruning_pred) = pruning_pred
+            && state.enable_bloom_filter
+            && !self.row_groups.is_empty()
+        {
+            let bf_reader = mem::replace(
+                &mut inner.async_file_reader,
+                state.parquet_file_reader_factory.create_reader(
+                    state.partition_index,
+                    inner.partitioned_file.clone(),
+                    inner.metadata_size_hint,
+                    &state.metrics,
+                )?,
+            );
+            let mut bf_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
+                bf_reader,
+                loaded.reader_metadata.clone(),
+            );
+            self.row_groups
+                .prune_by_bloom_filters(
+                    &self.prepared.physical_file_schema,
+                    &mut bf_builder,
+                    pruning_pred,
+                    &inner.file_metrics,
+                )
+                .await;
+        }
+
+        Ok(self)
+    }
+
+    /// Build one or more parquet reading states once all pruning work is complete.
+    ///
+    /// In the common case, this returns one child reader state per selected row
+    /// group. That lets each row group become its own morsel planner, which
+    /// exposes more CPU work to `FileStream`.
+    ///
+    /// The current implementation keeps a conservative fallback to a single
+    /// combined reader when the scan has a file-scoped dynamic pruner or a
+    /// global `LIMIT`, as both of those semantics are currently tracked at the
+    /// file level rather than the row-group level.
+    fn build_stream_readers(self) -> Result<Vec<ReadingParquetState>> {
+        let RowGroupsPreparedParquetOpen {
+            prepared,
+            mut row_groups,
+        } = self;
+        let FiltersPreparedParquetOpen {
+            loaded,
+            physical_file_schema,
+            projection,
+            predicate,
+            pruning_predicate: _,
+            page_pruning_predicate,
+        } = prepared;
+        let MetadataLoadedParquetOpen {
+            prepared,
+            reader_metadata,
+            options: _,
+        } = loaded;
+        let PreparedParquetOpen {
+            state,
+            partitioned_file,
+            file_range: _,
+            extensions: _,
+            file_metrics,
+            mut file_pruner,
+            metadata_size_hint,
+            async_file_reader,
+            logical_file_schema: _,
+            output_schema,
+            projection: _,
+            predicate: _,
+            #[cfg(feature = "parquet_encryption")]
+                file_decryption_properties: _,
+        } = prepared;
 
-            if coerce_int96.is_some() {
-                if let Some(merged) = coerce_int96_to_resolution(
-                    reader_metadata.parquet_schema(),
+        let batch_size = state.batch_size;
+        let reorder_predicates = state.reorder_filters;
+        let pushdown_filters = state.pushdown_filters;
+        let force_filter_selections = state.force_filter_selections;
+        let limit = state.limit;
+        let max_predicate_cache_size = state.max_predicate_cache_size;
+        let reverse_row_groups = state.reverse_row_groups;
+        let preserve_order = state.preserve_order;
+        let file_metadata = Arc::clone(reader_metadata.metadata());
+        let rg_metadata = file_metadata.row_groups();
+
+        // Prune by limit if limit is set and limit order is not sensitive.
+        if let (Some(limit), false) = (limit, preserve_order) {
+            row_groups.prune_by_limit(limit, rg_metadata, &file_metrics);
+        }
+
+        // --------------------------------------------------------
+        // Step: prune pages from the kept row groups
+        //
+        // Page index pruning: if all data on individual pages can
+        // be ruled out using page metadata, rows from other columns
+        // with that range can be skipped as well.
+        // --------------------------------------------------------
+        let mut access_plan = row_groups.build();
+        if !access_plan.is_empty()
+            && let Some(ref p) = page_pruning_predicate
+        {
+            access_plan = p.prune_plan_with_page_index(
+                access_plan,
+                &physical_file_schema,
+                reader_metadata.parquet_schema(),
+                file_metadata.as_ref(),
+                &file_metrics,
+            );
+        }
+
+        // Prepare the access plan (extract row groups and row selection).
+        let mut prepared_plan = access_plan.prepare(rg_metadata)?;
+
+        // ----------------------------------------------------------
+        // Step: potentially reverse the access plan for performance.
+        // See `ParquetSource::try_pushdown_sort` for the rationale.
+        // ----------------------------------------------------------
+        if reverse_row_groups {
+            prepared_plan = prepared_plan.reverse(file_metadata.as_ref())?;
+        }
+
+        if prepared_plan.row_group_indexes.is_empty() {
+            return Ok(vec![]);
+        }
+
+        // Row-group fanout is currently only safe for plain unordered scans.
+        // Filter pushdown and page-index pruning tests compare full output
+        // batches across scan modes, and allowing individual row-group planners
+        // to overtake one another changes the observable row order within a
+        // single file. Keep those scans on the single-planner path until the
+        // scheduler has explicit within-file ordering support for child
+        // planners.
+        let split_by_row_group = limit.is_none()
+            && file_pruner.is_none()
+            && !preserve_order
+            && predicate.is_none()
+            && !pushdown_filters
+            && page_pruning_predicate.is_none();
+        let prepared_plans = if split_by_row_group {
+            prepared_plan.into_single_row_group_plans(file_metadata.as_ref())?
+        } else {
+            vec![prepared_plan]
+        };
+
+        let mut reusable_reader = Some(async_file_reader);
+        let mut reading_states = Vec::with_capacity(prepared_plans.len());
+        for prepared_plan in prepared_plans {
+            // ---------------------------------------------------------
+            // Step: construct builder for the final RecordBatch stream
+            // ---------------------------------------------------------
+            let mut builder =
+                ParquetPushDecoderBuilder::new_with_metadata(reader_metadata.clone())
+                    .with_batch_size(batch_size);
+
+            // -----------------------------------------------------------------
+            // Step: optionally add row filter to the builder
+            //
+            // Row filter is used for late materialization in parquet decoding,
+            // see `row_filter` for details.
+            // -----------------------------------------------------------------
+            if let Some(predicate) =
+                pushdown_filters.then_some(predicate.as_ref()).flatten()
+            {
+                let row_filter = row_filter::build_row_filter(
+                    predicate,
                     &physical_file_schema,
-                    &(coerce_int96.unwrap()),
-                ) {
-                    physical_file_schema = Arc::new(merged);
-                    options = options.with_schema(Arc::clone(&physical_file_schema));
-                    reader_metadata = ArrowReaderMetadata::try_new(
-                        Arc::clone(reader_metadata.metadata()),
-                        options.clone(),
-                    )?;
+                    file_metadata.as_ref(),
+                    reorder_predicates,
+                    &file_metrics,
+                );
+
+                match row_filter {
+                    Ok(Some(filter)) => {
+                        builder = builder.with_row_filter(filter);
+                    }
+                    Ok(None) => {}
+                    Err(e) => {
+                        debug!(
+                            "Ignoring error building row filter for '{predicate:?}': {e}"
+                        );
+                    }
+                };
+            };
+            if force_filter_selections {
+                builder =
+                    builder.with_row_selection_policy(RowSelectionPolicy::Selectors);
+            }
+
+            if let Some(row_selection) = prepared_plan.row_selection {
+                builder = builder.with_row_selection(row_selection);
+            }
+            builder = builder.with_row_groups(prepared_plan.row_group_indexes);
+
+            if let Some(limit) = limit {
+                builder = builder.with_limit(limit)
+            }
+
+            if let Some(max_predicate_cache_size) = max_predicate_cache_size {
+                builder = builder.with_max_predicate_cache_size(max_predicate_cache_size);
+            }
+
+            // Metrics from the arrow reader itself.
+            let arrow_reader_metrics = ArrowReaderMetrics::enabled();
+
+            let indices = projection.column_indices();
+            let mask =
+                ProjectionMask::roots(reader_metadata.parquet_schema(), indices.clone());
+
+            let decoder = builder
+                .with_projection(mask)
+                .with_metrics(arrow_reader_metrics.clone())
+                .build()?;
+
+            let predicate_cache_inner_records =
+                file_metrics.predicate_cache_inner_records.clone();
+            let predicate_cache_records = file_metrics.predicate_cache_records.clone();
+
+            // Rebase column indices to match the narrowed stream schema.
+            // The projection expressions have indices based on
+            // `physical_file_schema`, but the stream only contains the columns
+            // selected by the `ProjectionMask`.
+            let stream_schema = Arc::new(physical_file_schema.project(&indices)?);
+            let replace_schema = stream_schema != output_schema;
+            let projection = projection
+                .clone()
+                .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?;
+            let projector = projection.make_projector(&stream_schema)?;
+            let push_decoder_state = PushDecoderStreamState {
+                decoder,
+                reader: if let Some(reader) = reusable_reader.take() {
+                    reader
+                } else {
+                    state.parquet_file_reader_factory.create_reader(
+                        state.partition_index,
+                        partitioned_file.clone(),
+                        metadata_size_hint,
+                        &state.metrics,
+                    )?
+                },
+                projector,
+                output_schema: Arc::clone(&output_schema),
+                replace_schema,
+                arrow_reader_metrics,
+                predicate_cache_inner_records,
+                predicate_cache_records,
+            };
+
+            // Keep file-scoped early-stop behavior only on the single-planner
+            // fallback path. A row-group split path would need a file-shared
+            // pruner to preserve the exact semantics across child planners.
+            if let Some(file_pruner) = file_pruner.take() {
+                reading_states.push(ReadingParquetState::with_early_stop(
+                    push_decoder_state,
+                    file_pruner,
+                    file_metrics.files_ranges_pruned_statistics.clone(),
+                ));
+            } else {
+                reading_states.push(ReadingParquetState::new(push_decoder_state));
+            }
+        }
+
+        Ok(reading_states)
+    }
+}
+
+impl ParquetMorselizerState {
+    /// Resolve file-specific decryption properties before metadata I/O.
+    #[cfg(feature = "parquet_encryption")]
+    async fn load_file_decryption_properties(
+        self: &ParquetMorselizerState,
+        file_location: object_store::path::Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>> {
+        let encryption_context = self.get_encryption_context();
+        encryption_context
+            .get_file_decryption_properties(&file_location)
+            .await
+    }
+
+    /// Resolve file-specific decryption properties before metadata I/O.
+    #[cfg(not(feature = "parquet_encryption"))]
+    #[expect(dead_code)]
+    async fn load_file_decryption_properties(
+        self: &ParquetMorselizerState,
+        _file_location: object_store::path::Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>> {
+        Ok(None)
+    }
+}
+
+impl Morselizer for ParquetMorselizer {
+    fn morselize(&self, file: PartitionedFile) -> Result<Vec<Box<dyn MorselPlanner>>> {
+        Ok(vec![Box::new(ParquetMorselPlanner::new(
+            Arc::clone(&self.state),
+            file,
+        ))])
+    }
+}
+
+/// CPU-only states for [`ParquetMorselPlanner`].
+///
+/// These are the states when the MorselPlanner has more CPU work to do
+enum ReadyState {
+    /// Planner has not started any work yet.
+    Start(Box<PartitionedFile>),
+    /// Planner is ready to resolve any file-specific encryption properties.
+    #[cfg(feature = "parquet_encryption")]
+    PrepareEncryption(Box<PreparedParquetOpen>),
+    /// Planner can do file-level pruning before requesting parquet metadata.
+    PruneFiles(Box<PreparedParquetOpen>),
+    /// Planner has loaded parquet metadata and can do CPU-only filter preparation.
+    PrepareFilters(Box<MetadataLoadedParquetOpen>),
+    /// Planner has prepared filters and can request any missing page index data.
+    Prepared(Box<FiltersPreparedParquetOpen>),
+    /// Planner has prepared row-group pruning and can optionally load bloom filters.
+    BuildStream(Box<RowGroupsPreparedParquetOpen>),
+    /// Planner has one or more per-row-group reading states ready to turn into
+    /// the current planner plus any child planners.
+    FanoutRowGroups(Vec<ReadingParquetState>),
+    /// Planner has a prepared push decoder and is trying to produce the first
+    /// record batch before yielding a morsel.
+    ReadingParquet(Box<ReadingParquetState>),
+    /// Planner has a fully prepared morsel ready to emit.
+    EmitMorsel(BoxStream<'static, Result<RecordBatch>>),
+}
+
+impl ReadyState {
+    fn start(file: PartitionedFile) -> Self {
+        Self::Start(Box::new(file))
+    }
+
+    #[cfg(feature = "parquet_encryption")]
+    fn prepare_encryption(prepared: PreparedParquetOpen) -> Self {
+        Self::PrepareEncryption(Box::new(prepared))
+    }
+
+    fn prune_files(prepared: PreparedParquetOpen) -> Self {
+        Self::PruneFiles(Box::new(prepared))
+    }
+
+    fn prepared(prepared: MetadataLoadedParquetOpen) -> Self {
+        Self::PrepareFilters(prepared.into())
+    }
+
+    fn filters_prepared(prepared: FiltersPreparedParquetOpen) -> Self {
+        Self::Prepared(prepared.into())
+    }
+
+    fn build_stream(prepared: RowGroupsPreparedParquetOpen) -> Self {
+        Self::BuildStream(prepared.into())
+    }
+
+    fn fanout_row_groups(states: Vec<ReadingParquetState>) -> Self {
+        Self::FanoutRowGroups(states)
+    }
+
+    fn reading_parquet(state: ReadingParquetState) -> Self {
+        Self::ReadingParquet(Box::new(state))
+    }
+
+    fn emit_morsel(stream: BoxStream<'static, Result<RecordBatch>>) -> Self {
+        Self::EmitMorsel(stream)
+    }
+}
+
+/// Scheduler-visible state for [`ParquetMorselPlanner`].
+///
+/// This allows tracking outstanding IOs
+enum ParquetMorselPlannerState {
+    /// Planner can make progress using CPU only.
+    Ready(Box<ReadyState>),
+    /// Planner has outstanding async I/O and will become ready again when it completes.
+    WaitingIo(WaitingIoState),
+    /// Planner has emitted its morsel and has no further work.
+    Done,
+}
+
+impl ParquetMorselPlannerState {
+    fn ready(ready_state: ReadyState) -> Self {
+        Self::Ready(Box::new(ready_state))
+    }
+
+    /// Return a planner state that emits an empty morsel stream.
+    ///
+    /// This is used when file-level pruning determines the file can be skipped
+    /// before any parquet metadata or row-group work is needed, while still
+    /// flowing through the normal morsel emission path in `FileStream`.
+    fn empty_file() -> Self {
+        Self::ready(ReadyState::emit_morsel(futures::stream::empty().boxed()))
+    }
+}
+
+/// Result of an in-flight planner I/O phase.
+struct WaitingIoState {
+    /// Waiting for an async step to produce the next CPU-ready planner state.
+    receiver: oneshot::Receiver<Result<ReadyState>>,
+}
+
+impl ParquetMorselPlannerState {
+    fn name(&self) -> &'static str {
+        match self {
+            Self::Ready(ready_state) => match ready_state.as_ref() {
+                ReadyState::Start(_) => "Ready(Start)",
+                #[cfg(feature = "parquet_encryption")]
+                ReadyState::PrepareEncryption(_) => "Ready(PrepareEncryption)",
+                ReadyState::PruneFiles(_) => "Ready(PruneFiles)",
+                ReadyState::PrepareFilters(_) => "Ready(PrepareFilters)",
+                ReadyState::Prepared(_) => "Ready(Prepared)",
+                ReadyState::BuildStream(_) => "Ready(BuildStream)",
+                ReadyState::FanoutRowGroups(_) => "Ready(FanoutRowGroups)",
+                ReadyState::ReadingParquet(_) => "Ready(ReadingParquet)",
+                ReadyState::EmitMorsel(_) => "Ready(EmitMorsel)",
+            },
+            Self::WaitingIo(_) => "WaitingIo",
+            Self::Done => "Done",
+        }
+    }
+}
+
+/// Planner wrapper that exposes the copied opener logic through the generic
+/// morsel-planning API.
+struct ParquetMorselPlanner {
+    morselizer: Arc<ParquetMorselizerState>,
+    state: ParquetMorselPlannerState,
+}
+
+impl Debug for ParquetMorselPlanner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ParquetMorselPlanner")
+            .field("morselizer", &"...")
+            .field("state", &self.state.name())
+            .finish()
+    }
+}
+
+impl ParquetMorselPlanner {
+    fn new(morselizer: Arc<ParquetMorselizerState>, file: PartitionedFile) -> Self {
+        Self {
+            morselizer,
+            state: ParquetMorselPlannerState::ready(ReadyState::start(file)),
+        }
+    }
+
+    fn with_ready_state(
+        morselizer: Arc<ParquetMorselizerState>,
+        ready_state: ReadyState,
+    ) -> Self {
+        Self {
+            morselizer,
+            state: ParquetMorselPlannerState::ready(ready_state),
+        }
+    }
+
+    #[cfg(feature = "parquet_encryption")]
+    fn needs_file_decryption_properties(&self) -> bool {
+        self.morselizer
+            .encryption_context
+            .needs_file_decryption_properties()
+    }
+
+    /// Schedule an async step and transition the planner into `WaitingIo`.
+    ///
+    /// Sets `self.state` to ParquetMorselPlannerState::WaitingIo
+    fn schedule_io<F>(&mut self, future: F) -> Option<MorselPlan>
+    where
+        F: Future<Output = Result<ReadyState>> + Send + 'static,
+    {
+        let (sender, receiver) = oneshot::channel();
+        let io_future = async move {
+            let next_state = future.await?;
+            // Ignore error as it means the receiver shutdown (likely due to a
+            // real error) and we don't want to confuse error reporting by
+            // reporting a closed channel.
+            let _ = sender.send(Ok(next_state));
+            Ok(())
+        };
+        self.state = ParquetMorselPlannerState::WaitingIo(WaitingIoState { receiver });
+        Some(MorselPlan::new().with_io_future(io_future.boxed()))
+    }
+
+    /// Drive the initial push-decoder loop until it either needs more I/O or
+    /// can yield a stream with at least its first batch ready.
+    ///
+    /// This relies on the push decoder not doing IO after it has begun to produce
+    /// RecordBatches, which is only true when reading a single record batch
+    fn prepare_reading_parquet(
+        &mut self,
+        reading: ReadingParquetState,
+    ) -> Result<Option<MorselPlan>> {
+        let ReadingParquetState {
+            mut state,
+            file_pruner,
+            files_ranges_pruned_statistics,
+        } = reading;
+
+        match state.decoder.try_decode() {
+            Ok(DecodeResult::NeedsData(ranges)) => Ok(self.schedule_io(async move {
+                let data = state.reader.get_byte_ranges(ranges.clone()).await?;
+                state.decoder.push_ranges(ranges, data)?;
+                Ok(ReadyState::reading_parquet(ReadingParquetState {
+                    state,
+                    file_pruner,
+                    files_ranges_pruned_statistics,
+                }))
+            })),
+            Ok(DecodeResult::Data(batch)) => {
+                state.copy_arrow_reader_metrics();
+                let batch = state.project_batch(&batch)?;
+                let stream = ReadingParquetState {
+                    state,
+                    file_pruner,
+                    files_ranges_pruned_statistics,
+                }
+                .into_stream(Some(batch));
+                Ok(Some(MorselPlan::new().with_morsels(vec![Box::new(
+                    ParquetStreamMorsel::new(stream),
+                )])))
+            }
+            Ok(DecodeResult::Finished) => {
+                let stream = ReadingParquetState {
+                    state,
+                    file_pruner,
+                    files_ranges_pruned_statistics,
                 }
+                .into_stream(None);
+                Ok(Some(MorselPlan::new().with_morsels(vec![Box::new(
+                    ParquetStreamMorsel::new(stream),
+                )])))
             }
+            Err(e) => Err(DataFusionError::from(e)),
+        }
+    }
+}
 
-            // Build predicates for this specific file
-            let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates(
-                predicate.as_ref(),
-                &logical_file_schema,
-                &predicate_creation_errors,
-            );
+impl MorselPlanner for ParquetMorselPlanner {
+    fn plan(&mut self) -> Result<Option<MorselPlan>> {
+        // Core state machine transition
+        let state = mem::replace(&mut self.state, ParquetMorselPlannerState::Done);
+        match state {
+            ParquetMorselPlannerState::Ready(ready_state) => match *ready_state {
+                ReadyState::Start(file) => {
+                    let prepared = self.morselizer.prepare_open_file(*file)?;
+                    #[cfg(feature = "parquet_encryption")]
+                    {
+                        if self.needs_file_decryption_properties() {
+                            self.state = ParquetMorselPlannerState::ready(
+                                ReadyState::prepare_encryption(prepared),
+                            );
+                        } else {
+                            self.state = ParquetMorselPlannerState::ready(
+                                ReadyState::prune_files(prepared),
+                            );
+                        }
+                    }
+                    #[cfg(not(feature = "parquet_encryption"))]
+                    {
+                        self.state = ParquetMorselPlannerState::ready(
+                            ReadyState::prune_files(prepared),
+                        );
+                    }
+                    Ok(Some(MorselPlan::new()))
+                }
+                #[cfg(feature = "parquet_encryption")]
+                ReadyState::PrepareEncryption(mut prepared) => {
+                    let file_location =
+                        prepared.partitioned_file.object_meta.location.clone();
+                    let state = Arc::clone(&prepared.state);
+                    Ok(self.schedule_io(async move {
+                        let properties =
+                            state.load_file_decryption_properties(file_location).await?;
+                        prepared.file_decryption_properties = properties;
+                        Ok(ReadyState::prune_files(*prepared))
+                    }))
+                }
+                ReadyState::PruneFiles(prepared) => {
+                    let Some(prepared) = prepared.prune_file()? else {
+                        // File was totally pruned
+                        self.state = ParquetMorselPlannerState::empty_file();
+                        return Ok(Some(MorselPlan::new()));
+                    };
+                    Ok(self.schedule_io(async move {
+                        let loaded = prepared.load().await?;
+                        Ok(ReadyState::prepared(loaded))
+                    }))
+                }
+                ReadyState::PrepareFilters(prepared) => {
+                    let prepared = prepared.prepare_filters()?;
+                    self.state = ParquetMorselPlannerState::ready(
+                        ReadyState::filters_prepared(prepared),
+                    );
+                    Ok(Some(MorselPlan::new()))
+                }
+                ReadyState::Prepared(prepared) => Ok(self.schedule_io(async move {
+                    let prepared = prepared.load_page_index().await?;
+                    let prepared = prepared.prepare_row_groups()?;
+                    Ok(ReadyState::build_stream(prepared))
+                })),
+                ReadyState::BuildStream(prepared) => {
+                    let should_prune_bloom = prepared
+                        .prepared
+                        .pruning_predicate
+                        .is_some()
+                        && prepared.prepared.loaded.prepared.state.enable_bloom_filter
+                        && !prepared.row_groups.is_empty();
+                    if should_prune_bloom {
+                        Ok(self.schedule_io(async move {
+                            let prepared = prepared.prune_bloom_filters().await?;
+                            let reading_states = prepared.build_stream_readers()?;
+                            Ok(ReadyState::fanout_row_groups(reading_states))
+                        }))
+                    } else {
+                        let reading_states = prepared.build_stream_readers()?;
+                        self.state = ParquetMorselPlannerState::ready(
+                            ReadyState::fanout_row_groups(reading_states),
+                        );
+                        Ok(Some(MorselPlan::new()))
+                    }
+                }
+                ReadyState::FanoutRowGroups(reading_states) => {
+                    let mut reading_states: VecDeque<_> = reading_states.into();
+                    let Some(first_state) = reading_states.pop_front() else {
+                        self.state = ParquetMorselPlannerState::empty_file();
+                        return Ok(Some(MorselPlan::new()));
+                    };
+
+                    let child_planners = reading_states
+                        .into_iter()
+                        .map(|reading_state| {
+                            Box::new(ParquetMorselPlanner::with_ready_state(
+                                Arc::clone(&self.morselizer),
+                                ReadyState::reading_parquet(reading_state),
+                            )) as Box<dyn MorselPlanner>
+                        })
+                        .collect();
+
+                    self.state = ParquetMorselPlannerState::ready(
+                        ReadyState::reading_parquet(first_state),
+                    );
+                    Ok(Some(MorselPlan::new().with_planners(child_planners)))
+                }
+                ReadyState::ReadingParquet(reading) => {
+                    self.prepare_reading_parquet(*reading)
+                }
+                ReadyState::EmitMorsel(stream) => Ok(Some(
+                    MorselPlan::new()
+                        .with_morsels(vec![Box::new(ParquetStreamMorsel::new(stream))]),
+                )),
+            },
+            ParquetMorselPlannerState::WaitingIo(WaitingIoState { mut receiver }) => {
+                match receiver.try_recv() {
+                    Ok(next_state) => {
+                        self.state = ParquetMorselPlannerState::ready(next_state?);
+                        Ok(Some(MorselPlan::new()))
+                    }
+                    Err(oneshot::error::TryRecvError::Empty) => {
+                        self.state =
+                            ParquetMorselPlannerState::WaitingIo(WaitingIoState {
+                                receiver,
+                            });
+                        Ok(None)
+                    }
+                    Err(oneshot::error::TryRecvError::Closed) => {
+                        Err(DataFusionError::Execution(
+                            "Parquet morsel planner I/O completion channel closed"
+                                .to_string(),
+                        ))
+                    }
+                }
+            }
+            ParquetMorselPlannerState::Done => Ok(None),
+        }
+    }
+}
+
+struct ReadingParquetState {
+    state: PushDecoderStreamState,
+    file_pruner: Option<FilePruner>,
+    files_ranges_pruned_statistics: Option<PruningMetrics>,
+}
+
+impl ReadingParquetState {
+    fn new(state: PushDecoderStreamState) -> Self {
+        Self {
+            state,
+            file_pruner: None,
+            files_ranges_pruned_statistics: None,
+        }
+    }
+
+    fn with_early_stop(
+        state: PushDecoderStreamState,
+        file_pruner: FilePruner,
+        files_ranges_pruned_statistics: PruningMetrics,
+    ) -> Self {
+        Self {
+            state,
+            file_pruner: Some(file_pruner),
+            files_ranges_pruned_statistics: Some(files_ranges_pruned_statistics),
+        }
+    }
+
+    fn into_stream(
+        self,
+        first_batch: Option<RecordBatch>,
+    ) -> BoxStream<'static, Result<RecordBatch>> {
+        let stream = stream_from_push_decoder_state(self.state, first_batch).boxed();
+        wrap_stream_with_early_stop(
+            stream,
+            self.file_pruner,
+            self.files_ranges_pruned_statistics,
+        )
+    }
+}
+
+struct ParquetStreamMorsel {
+    stream: BoxStream<'static, Result<RecordBatch>>,
+}
+
+impl Debug for ParquetStreamMorsel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ParquetStreamMorsel")
+            .field("stream", &"...")
+            .finish()
+    }
+}
+
+impl ParquetStreamMorsel {
+    fn new(stream: BoxStream<'static, Result<RecordBatch>>) -> Self {
+        Self { stream }
+    }
+}
+
+impl Morsel for ParquetStreamMorsel {
+    fn into_stream(self: Box<Self>) -> BoxStream<'static, Result<RecordBatch>> {
+        self.stream
+    }
+
+    fn split(&mut self) -> Result<Vec<Box<dyn Morsel>>> {
+        Ok(vec![])
+    }
+}
+
+/// State for a stream that decodes a single Parquet file using a push-based decoder.
+///
+/// The [`transition`](Self::transition) method drives the decoder in a loop: it requests
+/// byte ranges from the [`AsyncFileReader`], pushes the fetched data into the
+/// [`ParquetPushDecoder`], and yields projected [`RecordBatch`]es until the file is
+/// fully consumed.
+struct PushDecoderStreamState {
+    decoder: ParquetPushDecoder,
+    reader: Box<dyn AsyncFileReader>,
+    projector: Projector,
+    output_schema: Arc<Schema>,
+    replace_schema: bool,
+    arrow_reader_metrics: ArrowReaderMetrics,
+    predicate_cache_inner_records: Gauge,
+    predicate_cache_records: Gauge,
+}
+
+impl PushDecoderStreamState {
+    /// Advances the decoder state machine until the next [`RecordBatch`] is
+    /// produced, the file is fully consumed, or an error occurs.
+    ///
+    /// On each iteration the decoder is polled via [`ParquetPushDecoder::try_decode`]:
+    /// - [`NeedsData`](DecodeResult::NeedsData) – the requested byte ranges are
+    ///   fetched from the [`AsyncFileReader`] and fed back into the decoder.
+    /// - [`Data`](DecodeResult::Data) – a decoded batch is projected and returned.
+    /// - [`Finished`](DecodeResult::Finished) – signals end-of-stream (`None`).
+    async fn transition(&mut self) -> Option<Result<RecordBatch>> {
+        loop {
+            match self.decoder.try_decode() {
+                Ok(DecodeResult::NeedsData(ranges)) => {
+                    let fetch = async {
+                        let data = self.reader.get_byte_ranges(ranges.clone()).await?;
+                        self.decoder.push_ranges(ranges, data)?;
+                        Ok::<_, ParquetError>(())
+                    };
+                    if let Err(e) = fetch.await {
+                        return Some(Err(DataFusionError::from(e)));
+                    }
+                }
+                Ok(DecodeResult::Data(batch)) => {
+                    self.copy_arrow_reader_metrics();
+                    return Some(self.project_batch(&batch));
+                }
+                Ok(DecodeResult::Finished) => {
+                    return None;
+                }
+                Err(e) => {
+                    return Some(Err(DataFusionError::from(e)));
+                }
+            }
+        }
+    }
+
+    /// Copies metrics from ArrowReaderMetrics (the metrics collected by the
+    /// arrow-rs parquet reader) to the parquet file metrics for DataFusion
+    fn copy_arrow_reader_metrics(&self) {
+        if let Some(v) = self.arrow_reader_metrics.records_read_from_inner() {
+            self.predicate_cache_inner_records.set(v);
+        }
+        if let Some(v) = self.arrow_reader_metrics.records_read_from_cache() {
+            self.predicate_cache_records.set(v);
+        }
+    }
+
+    fn project_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let mut batch = self.projector.project_batch(batch)?;
+        if self.replace_schema {
+            let (_schema, arrays, num_rows) = batch.into_parts();
+            let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
+            batch = RecordBatch::try_new_with_options(
+                Arc::clone(&self.output_schema),
+                arrays,
+                &options,
+            )?;
+        }
+        Ok(batch)
+    }
+}
+
+fn stream_from_push_decoder_state(
+    state: PushDecoderStreamState,
+    first_batch: Option<RecordBatch>,
+) -> impl Stream<Item = Result<RecordBatch>> + Send + 'static {
+    let first = first_batch
+        .map(|batch| futures::stream::once(async move { Ok(batch) }).left_stream())
+        .unwrap_or_else(|| futures::stream::empty().right_stream());
+
+    first.chain(futures::stream::unfold(state, |mut state| async move {
+        let result = state.transition().await;
+        result.map(|r| (r, state))
+    }))
+}
+
+fn wrap_stream_with_early_stop(
+    stream: BoxStream<'static, Result<RecordBatch>>,
+    file_pruner: Option<FilePruner>,
+    files_ranges_pruned_statistics: Option<PruningMetrics>,
+) -> BoxStream<'static, Result<RecordBatch>> {
+    match (file_pruner, files_ranges_pruned_statistics) {
+        (Some(file_pruner), Some(files_ranges_pruned_statistics)) => {
+            EarlyStoppingStream::new(stream, file_pruner, files_ranges_pruned_statistics)
+                .boxed()
+        }
+        _ => stream,
+    }
+}
+
+type ConstantColumns = HashMap<String, ScalarValue>;
+
+/// Extract constant column values from statistics, keyed by column name in the logical file schema.
+fn constant_columns_from_stats(
+    statistics: Option<&Statistics>,
+    file_schema: &SchemaRef,
+) -> ConstantColumns {
+    let mut constants = HashMap::new();
+    let Some(statistics) = statistics else {
+        return constants;
+    };
+
+    let num_rows = match statistics.num_rows {
+        Precision::Exact(num_rows) => Some(num_rows),
+        _ => None,
+    };
+
+    for (idx, column_stats) in statistics
+        .column_statistics
+        .iter()
+        .take(file_schema.fields().len())
+        .enumerate()
+    {
+        let field = file_schema.field(idx);
+        if let Some(value) =
+            constant_value_from_stats(column_stats, num_rows, field.data_type())
+        {
+            constants.insert(field.name().clone(), value);
+        }
+    }
+
+    constants
+}
+
+fn constant_value_from_stats(
+    column_stats: &ColumnStatistics,
+    num_rows: Option<usize>,
+    data_type: &DataType,
+) -> Option<ScalarValue> {
+    if let (Precision::Exact(min), Precision::Exact(max)) =
+        (&column_stats.min_value, &column_stats.max_value)
+        && min == max
+        && !min.is_null()
+        && matches!(column_stats.null_count, Precision::Exact(0))
+    {
+        // Cast to the expected data type if needed (e.g., Utf8 -> Dictionary)
+        if min.data_type() != *data_type {
+            return min.cast_to(data_type).ok();
+        }
+        return Some(min.clone());
+    }
 
-            // The page index is not stored inline in the parquet footer so the
-            // code above may not have read the page index structures yet. If we
-            // need them for reading and they aren't yet loaded, we need to load them now.
-            if should_enable_page_index(enable_page_index, &page_pruning_predicate) {
-                reader_metadata = load_page_index(
-                    reader_metadata,
-                    &mut async_file_reader,
-                    // Since we're manually loading the page index the option here should not matter but we pass it in for consistency
-                    options.with_page_index(true),
-                )
-                .await?;
-            }
+    if let (Some(num_rows), Precision::Exact(nulls)) =
+        (num_rows, &column_stats.null_count)
+        && *nulls == num_rows
+    {
+        return ScalarValue::try_new_null(data_type).ok();
+    }
 
-            metadata_timer.stop();
+    None
+}
 
-            let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
-                async_file_reader,
-                reader_metadata,
-            );
+/// Wraps an inner RecordBatchStream and a [`FilePruner`]
+///
+/// This can terminate the scan early when some dynamic filters is updated after
+/// the scan starts, so we discover after the scan starts that the file can be
+/// pruned (can't have matching rows).
+struct EarlyStoppingStream<S> {
+    /// Has the stream finished processing? All subsequent polls will return
+    /// None
+    done: bool,
+    file_pruner: FilePruner,
+    files_ranges_pruned_statistics: PruningMetrics,
+    /// The inner stream
+    inner: S,
+}
 
-            let (schema_mapping, adapted_projections) =
-                schema_adapter.map_schema(&physical_file_schema)?;
+impl<S> EarlyStoppingStream<S> {
+    pub fn new(
+        stream: S,
+        file_pruner: FilePruner,
+        files_ranges_pruned_statistics: PruningMetrics,
+    ) -> Self {
+        Self {
+            done: false,
+            inner: stream,
+            file_pruner,
+            files_ranges_pruned_statistics,
+        }
+    }
+}
 
-            let mask = ProjectionMask::roots(
-                builder.parquet_schema(),
-                adapted_projections.iter().cloned(),
-            );
+impl<S> EarlyStoppingStream<S>
+where
+    S: Stream<Item = Result<RecordBatch>> + Unpin,
+{
+    fn check_prune(&mut self, input: Result<RecordBatch>) -> Result<Option<RecordBatch>> {
+        let batch = input?;
 
-            // Filter pushdown: evaluate predicates during scan
-            if let Some(predicate) = pushdown_filters.then_some(predicate).flatten() {
-                let row_filter = row_filter::build_row_filter(
-                    &predicate,
-                    &physical_file_schema,
-                    &logical_file_schema,
-                    builder.metadata(),
-                    reorder_predicates,
-                    &file_metrics,
-                    &schema_adapter_factory,
-                );
+        // Since dynamic filters may have been updated, see if we can stop
+        // reading this stream entirely.
+        if self.file_pruner.should_prune()? {
+            self.files_ranges_pruned_statistics.add_pruned(1);
+            // Previously this file range has been counted as matched
+            self.files_ranges_pruned_statistics.subtract_matched(1);
+            self.done = true;
+            Ok(None)
+        } else {
+            // Return the adapted batch
+            Ok(Some(batch))
+        }
+    }
+}
 
-                match row_filter {
-                    Ok(Some(filter)) => {
-                        builder = builder.with_row_filter(filter);
-                    }
-                    Ok(None) => {}
-                    Err(e) => {
-                        debug!(
-                            "Ignoring error building row filter for '{predicate:?}': {e}"
-                        );
-                    }
-                };
-            };
+impl<S> Stream for EarlyStoppingStream<S>
+where
+    S: Stream<Item = Result<RecordBatch>> + Unpin,
+{
+    type Item = Result<RecordBatch>;
 
-            // Determine which row groups to actually read. The idea is to skip
-            // as many row groups as possible based on the metadata and query
-            let file_metadata = Arc::clone(builder.metadata());
-            let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
-            let rg_metadata = file_metadata.row_groups();
-            // track which row groups to actually read
-            let access_plan =
-                create_initial_plan(&file_name, extensions, rg_metadata.len())?;
-            let mut row_groups = RowGroupAccessPlanFilter::new(access_plan);
-            // if there is a range restricting what parts of the file to read
-            if let Some(range) = file_range.as_ref() {
-                row_groups.prune_by_range(rg_metadata, range);
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        if self.done {
+            return Poll::Ready(None);
+        }
+        match ready!(self.inner.poll_next_unpin(cx)) {
+            None => {
+                // input done
+                self.done = true;
+                Poll::Ready(None)
             }
-            // If there is a predicate that can be evaluated against the metadata
-            if let Some(predicate) = predicate.as_ref() {
-                if enable_row_group_stats_pruning {
-                    row_groups.prune_by_statistics(
-                        &physical_file_schema,
-                        builder.parquet_schema(),
-                        rg_metadata,
-                        predicate,
-                        &file_metrics,
-                    );
-                }
-
-                if enable_bloom_filter && !row_groups.is_empty() {
-                    row_groups
-                        .prune_by_bloom_filters(
-                            &physical_file_schema,
-                            &mut builder,
-                            predicate,
-                            &file_metrics,
-                        )
-                        .await;
-                }
+            Some(input_batch) => {
+                let output = self.check_prune(input_batch);
+                Poll::Ready(output.transpose())
             }
+        }
+    }
+}
 
-            let mut access_plan = row_groups.build();
-
-            // page index pruning: if all data on individual pages can
-            // be ruled using page metadata, rows from other columns
-            // with that range can be skipped as well
-            if enable_page_index && !access_plan.is_empty() {
-                if let Some(p) = page_pruning_predicate {
-                    access_plan = p.prune_plan_with_page_index(
-                        access_plan,
-                        &physical_file_schema,
-                        builder.parquet_schema(),
-                        file_metadata.as_ref(),
-                        &file_metrics,
-                    );
-                }
-            }
+#[derive(Default, Clone)]
+pub(crate) struct EncryptionContext {
+    #[cfg(feature = "parquet_encryption")]
+    file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
+    #[cfg(feature = "parquet_encryption")]
+    encryption_factory: Option<(Arc<dyn EncryptionFactory>, EncryptionFactoryOptions)>,
+}
 
-            let row_group_indexes = access_plan.row_group_indexes();
-            if let Some(row_selection) =
-                access_plan.into_overall_row_selection(rg_metadata)?
-            {
-                builder = builder.with_row_selection(row_selection);
-            }
+#[cfg(feature = "parquet_encryption")]
+impl EncryptionContext {
+    fn needs_file_decryption_properties(&self) -> bool {
+        self.file_decryption_properties.is_some() || self.encryption_factory.is_some()
+    }
 
-            if let Some(limit) = limit {
-                builder = builder.with_limit(limit)
+    pub(crate) fn new(
+        file_decryption_properties: Option<Arc<FileDecryptionProperties>>,
+        encryption_factory: Option<(
+            Arc<dyn EncryptionFactory>,
+            EncryptionFactoryOptions,
+        )>,
+    ) -> Self {
+        Self {
+            file_decryption_properties,
+            encryption_factory,
+        }
+    }
+
+    async fn get_file_decryption_properties(
+        &self,
+        file_location: &object_store::path::Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>> {
+        match &self.file_decryption_properties {
+            Some(file_decryption_properties) => {
+                Ok(Some(Arc::clone(file_decryption_properties)))
             }
+            None => match &self.encryption_factory {
+                Some((encryption_factory, encryption_config)) => Ok(encryption_factory
+                    .get_file_decryption_properties(encryption_config, file_location)
+                    .await?),
+                None => Ok(None),
+            },
+        }
+    }
+}
 
-            let stream = builder
-                .with_projection(mask)
-                .with_batch_size(batch_size)
-                .with_row_groups(row_group_indexes)
-                .build()?;
+#[cfg(not(feature = "parquet_encryption"))]
+#[expect(dead_code)]
+impl EncryptionContext {
+    fn needs_file_decryption_properties(&self) -> bool {
+        false
+    }
+
+    async fn get_file_decryption_properties(
+        &self,
+        _file_location: &object_store::path::Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>> {
+        Ok(None)
+    }
+}
 
-            let adapted = stream
-                .map_err(|e| ArrowError::ExternalError(Box::new(e)))
-                .map(move |maybe_batch| {
-                    maybe_batch
-                        .and_then(|b| schema_mapping.map_batch(b).map_err(Into::into))
-                });
+impl ParquetMorselizerState {
+    #[cfg(feature = "parquet_encryption")]
+    fn get_encryption_context(&self) -> EncryptionContext {
+        self.encryption_context.clone()
+    }
 
-            Ok(adapted.boxed())
-        }))
+    #[cfg(not(feature = "parquet_encryption"))]
+    #[expect(dead_code)]
+    fn get_encryption_context(&self) -> EncryptionContext {
+        self.encryption_context.clone()
     }
 }
 
@@ -355,30 +1689,6 @@ fn create_initial_plan(
     Ok(ParquetAccessPlan::new_all(row_group_count))
 }
 
-/// Build a pruning predicate from an optional predicate expression.
-/// If the predicate is None or the predicate cannot be converted to a pruning
-/// predicate, return None.
-/// If there is an error creating the pruning predicate it is recorded by incrementing
-/// the `predicate_creation_errors` counter.
-pub(crate) fn build_pruning_predicate(
-    predicate: Arc<dyn PhysicalExpr>,
-    file_schema: &SchemaRef,
-    predicate_creation_errors: &Count,
-) -> Option<Arc<PruningPredicate>> {
-    match PruningPredicate::try_new(predicate, Arc::clone(file_schema)) {
-        Ok(pruning_predicate) => {
-            if !pruning_predicate.always_true() {
-                return Some(Arc::new(pruning_predicate));
-            }
-        }
-        Err(e) => {
-            debug!("Could not create pruning predicate for: {e}");
-            predicate_creation_errors.add(1);
-        }
-    }
-    None
-}
-
 /// Build a page pruning predicate from an optional predicate expression.
 /// If the predicate is None or the predicate cannot be converted to a page pruning
 /// predicate, return None.
@@ -430,8 +1740,8 @@ async fn load_page_index<T: AsyncFileReader>(
     if missing_column_index || missing_offset_index {
         let m = Arc::try_unwrap(Arc::clone(parquet_metadata))
             .unwrap_or_else(|e| e.as_ref().clone());
-        let mut reader =
-            ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true);
+        let mut reader = ParquetMetaDataReader::new_with_metadata(m)
+            .with_page_index_policy(PageIndexPolicy::Optional);
         reader.load_page_index(input).await?;
         let new_parquet_metadata = reader.finish()?;
         let new_arrow_reader =
@@ -454,3 +1764,1060 @@ fn should_enable_page_index(
             .map(|p| p.filter_number() > 0)
             .unwrap_or(false)
 }
+
+#[cfg(test)]
+mod test {
+    use std::collections::VecDeque;
+    use std::sync::Arc;
+
+    use super::{
+        ConstantColumns, EncryptionContext, ParquetMorselizerState,
+        constant_columns_from_stats,
+    };
+    use crate::{DefaultParquetFileReaderFactory, ParquetMorselizer, RowGroupAccess};
+    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+    use arrow::record_batch::RecordBatch;
+    use bytes::{BufMut, BytesMut};
+    use datafusion_common::{
+        ColumnStatistics, DataFusionError, ScalarValue, Statistics, record_batch,
+        stats::Precision,
+    };
+    use datafusion_datasource::{
+        PartitionedFile, TableSchema,
+        morsel::{MorselPlanner, Morselizer},
+    };
+    use datafusion_expr::{col, lit};
+    use datafusion_physical_expr::{
+        PhysicalExpr,
+        expressions::{Column, DynamicFilterPhysicalExpr, Literal},
+        planner::logical2physical,
+        projection::ProjectionExprs,
+    };
+    use datafusion_physical_expr_adapter::{
+        DefaultPhysicalExprAdapterFactory, replace_columns_with_literals,
+    };
+    use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+    use futures::{Stream, StreamExt};
+    use object_store::{ObjectStore, ObjectStoreExt, memory::InMemory, path::Path};
+    use parquet::arrow::ArrowWriter;
+    use parquet::file::properties::WriterProperties;
+
+    /// Builder for creating [`ParquetMorselizer`] instances with sensible defaults for tests.
+    /// This helps reduce code duplication and makes it clear what differs between test cases.
+    struct ParquetMorselizerBuilder {
+        store: Option<Arc<dyn ObjectStore>>,
+        table_schema: Option<TableSchema>,
+        partition_index: usize,
+        projection_indices: Option<Vec<usize>>,
+        projection: Option<ProjectionExprs>,
+        batch_size: usize,
+        limit: Option<usize>,
+        predicate: Option<Arc<dyn PhysicalExpr>>,
+        metadata_size_hint: Option<usize>,
+        metrics: ExecutionPlanMetricsSet,
+        pushdown_filters: bool,
+        reorder_filters: bool,
+        force_filter_selections: bool,
+        enable_page_index: bool,
+        enable_bloom_filter: bool,
+        enable_row_group_stats_pruning: bool,
+        coerce_int96: Option<arrow::datatypes::TimeUnit>,
+        max_predicate_cache_size: Option<usize>,
+        reverse_row_groups: bool,
+        preserve_order: bool,
+    }
+
+    impl ParquetMorselizerBuilder {
+        /// Create a new builder with sensible defaults for tests.
+        fn new() -> Self {
+            Self {
+                store: None,
+                table_schema: None,
+                partition_index: 0,
+                projection_indices: None,
+                projection: None,
+                batch_size: 1024,
+                limit: None,
+                predicate: None,
+                metadata_size_hint: None,
+                metrics: ExecutionPlanMetricsSet::new(),
+                pushdown_filters: false,
+                reorder_filters: false,
+                force_filter_selections: false,
+                enable_page_index: false,
+                enable_bloom_filter: false,
+                enable_row_group_stats_pruning: false,
+                coerce_int96: None,
+                max_predicate_cache_size: None,
+                reverse_row_groups: false,
+                preserve_order: false,
+            }
+        }
+
+        /// Set the object store (required for building).
+        fn with_store(mut self, store: Arc<dyn ObjectStore>) -> Self {
+            self.store = Some(store);
+            self
+        }
+
+        /// Create a simple table schema from a file schema (for files without partition columns).
+        fn with_schema(mut self, file_schema: SchemaRef) -> Self {
+            self.table_schema = Some(TableSchema::from_file_schema(file_schema));
+            self
+        }
+
+        /// Set a custom table schema (for files with partition columns).
+        fn with_table_schema(mut self, table_schema: TableSchema) -> Self {
+            self.table_schema = Some(table_schema);
+            self
+        }
+
+        /// Set projection by column indices (convenience method for common case).
+        fn with_projection_indices(mut self, indices: &[usize]) -> Self {
+            self.projection_indices = Some(indices.to_vec());
+            self
+        }
+
+        /// Set the predicate.
+        fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
+            self.predicate = Some(predicate);
+            self
+        }
+
+        /// Enable pushdown filters.
+        fn with_pushdown_filters(mut self, enable: bool) -> Self {
+            self.pushdown_filters = enable;
+            self
+        }
+
+        /// Enable filter reordering.
+        fn with_reorder_filters(mut self, enable: bool) -> Self {
+            self.reorder_filters = enable;
+            self
+        }
+
+        /// Enable row group stats pruning.
+        fn with_row_group_stats_pruning(mut self, enable: bool) -> Self {
+            self.enable_row_group_stats_pruning = enable;
+            self
+        }
+
+        /// Set reverse row groups flag.
+        fn with_reverse_row_groups(mut self, enable: bool) -> Self {
+            self.reverse_row_groups = enable;
+            self
+        }
+
+        /// Build the ParquetMorselizer instance.
+        ///
+        /// # Panics
+        ///
+        /// Panics if required fields (store, schema/table_schema) are not set.
+        fn build(self) -> ParquetMorselizer {
+            let store = self
+                .store
+                .expect("ParquetMorselizerBuilder: store must be set via with_store()");
+            let table_schema = self.table_schema.expect(
+                "ParquetMorselizerBuilder: table_schema must be set via with_schema() or with_table_schema()",
+            );
+            let file_schema = Arc::clone(table_schema.file_schema());
+
+            let projection = if let Some(projection) = self.projection {
+                projection
+            } else if let Some(indices) = self.projection_indices {
+                ProjectionExprs::from_indices(&indices, &file_schema)
+            } else {
+                // Default: project all columns
+                let all_indices: Vec<usize> = (0..file_schema.fields().len()).collect();
+                ProjectionExprs::from_indices(&all_indices, &file_schema)
+            };
+
+            ParquetMorselizer::new(ParquetMorselizerState {
+                partition_index: self.partition_index,
+                projection,
+                batch_size: self.batch_size,
+                limit: self.limit,
+                predicate: self.predicate,
+                table_schema,
+                metadata_size_hint: self.metadata_size_hint,
+                metrics: self.metrics,
+                parquet_file_reader_factory: Arc::new(
+                    DefaultParquetFileReaderFactory::new(store),
+                ),
+                pushdown_filters: self.pushdown_filters,
+                reorder_filters: self.reorder_filters,
+                force_filter_selections: self.force_filter_selections,
+                enable_page_index: self.enable_page_index,
+                enable_bloom_filter: self.enable_bloom_filter,
+                enable_row_group_stats_pruning: self.enable_row_group_stats_pruning,
+                coerce_int96: self.coerce_int96,
+                expr_adapter_factory: Arc::new(DefaultPhysicalExprAdapterFactory),
+                encryption_context: EncryptionContext::default(),
+                max_predicate_cache_size: self.max_predicate_cache_size,
+                reverse_row_groups: self.reverse_row_groups,
+                preserve_order: self.preserve_order,
+            })
+        }
+    }
+
+    fn constant_int_stats() -> (Statistics, SchemaRef) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+        let statistics = Statistics {
+            num_rows: Precision::Exact(3),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(ScalarValue::from(5i32)),
+                    min_value: Precision::Exact(ScalarValue::from(5i32)),
+                    sum_value: Precision::Absent,
+                    distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics::new_unknown(),
+            ],
+        };
+        (statistics, schema)
+    }
+
+    #[test]
+    fn extract_constant_columns_non_null() {
+        let (statistics, schema) = constant_int_stats();
+        let constants = constant_columns_from_stats(Some(&statistics), &schema);
+        assert_eq!(constants.len(), 1);
+        assert_eq!(constants.get("a"), Some(&ScalarValue::from(5i32)));
+        assert!(!constants.contains_key("b"));
+    }
+
+    #[test]
+    fn extract_constant_columns_all_null() {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)]));
+        let statistics = Statistics {
+            num_rows: Precision::Exact(2),
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(2),
+                max_value: Precision::Absent,
+                min_value: Precision::Absent,
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Absent,
+            }],
+        };
+
+        let constants = constant_columns_from_stats(Some(&statistics), &schema);
+        assert_eq!(
+            constants.get("a"),
+            Some(&ScalarValue::Utf8(None)),
+            "all-null column should be treated as constant null"
+        );
+    }
+
+    #[test]
+    fn rewrite_projection_to_literals() {
+        let (statistics, schema) = constant_int_stats();
+        let constants = constant_columns_from_stats(Some(&statistics), &schema);
+        let projection = ProjectionExprs::from_indices(&[0, 1], &schema);
+
+        let rewritten = projection
+            .try_map_exprs(|expr| replace_columns_with_literals(expr, &constants))
+            .unwrap();
+        let exprs = rewritten.as_ref();
+        assert!(exprs[0].expr.as_any().downcast_ref::<Literal>().is_some());
+        assert!(exprs[1].expr.as_any().downcast_ref::<Column>().is_some());
+
+        // Only column `b` should remain in the projection mask
+        assert_eq!(rewritten.column_indices(), vec![1]);
+    }
+
+    #[test]
+    fn rewrite_physical_expr_literal() {
+        let mut constants = ConstantColumns::new();
+        constants.insert("a".to_string(), ScalarValue::from(7i32));
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+
+        let rewritten = replace_columns_with_literals(expr, &constants).unwrap();
+        assert!(rewritten.as_any().downcast_ref::<Literal>().is_some());
+    }
+
+    async fn count_batches_and_rows(
+        mut stream: std::pin::Pin<
+            Box<dyn Stream<Item = Result<RecordBatch, DataFusionError>> + Send>,
+        >,
+    ) -> (usize, usize) {
+        let mut num_batches = 0;
+        let mut num_rows = 0;
+        while let Some(Ok(batch)) = stream.next().await {
+            num_rows += batch.num_rows();
+            num_batches += 1;
+        }
+        (num_batches, num_rows)
+    }
+
+    /// Helper to collect all int32 values from the first column of batches
+    async fn collect_int32_values(
+        mut stream: std::pin::Pin<
+            Box<dyn Stream<Item = Result<RecordBatch, DataFusionError>> + Send>,
+        >,
+    ) -> Vec<i32> {
+        use arrow::array::Array;
+        let mut values = vec![];
+        while let Some(Ok(batch)) = stream.next().await {
+            let array = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<arrow::array::Int32Array>()
+                .unwrap();
+            for i in 0..array.len() {
+                if !array.is_null(i) {
+                    values.push(array.value(i));
+                }
+            }
+        }
+        values
+    }
+
+    async fn open_via_morselizer(
+        morselizer: &ParquetMorselizer,
+        file: PartitionedFile,
+    ) -> std::pin::Pin<Box<dyn Stream<Item = Result<RecordBatch, DataFusionError>> + Send>>
+    {
+        let mut planners: VecDeque<Box<dyn MorselPlanner>> =
+            morselizer.morselize(file).unwrap().into();
+        let mut streams = Vec::new();
+
+        while let Some(mut planner) = planners.pop_front() {
+            while let Some(mut plan) = planner.plan().unwrap() {
+                if let Some(io_future) = plan.take_io_future() {
+                    io_future.await.unwrap();
+                    continue;
+                }
+
+                streams.extend(
+                    plan.take_morsels()
+                        .into_iter()
+                        .map(|morsel| morsel.into_stream()),
+                );
+                planners.extend(plan.take_planners());
+            }
+        }
+
+        futures::stream::iter(streams).flatten().boxed()
+    }
+
+    async fn write_parquet(
+        store: Arc<dyn ObjectStore>,
+        filename: &str,
+        batch: RecordBatch,
+    ) -> usize {
+        write_parquet_batches(store, filename, vec![batch], None).await
+    }
+
+    /// Write multiple batches to a parquet file with optional writer properties
+    async fn write_parquet_batches(
+        store: Arc<dyn ObjectStore>,
+        filename: &str,
+        batches: Vec<RecordBatch>,
+        props: Option<WriterProperties>,
+    ) -> usize {
+        let mut out = BytesMut::new().writer();
+        {
+            let schema = batches[0].schema();
+            let mut writer = ArrowWriter::try_new(&mut out, schema, props).unwrap();
+            for batch in batches {
+                writer.write(&batch).unwrap();
+            }
+            writer.finish().unwrap();
+        }
+        let data = out.into_inner().freeze();
+        let data_len = data.len();
+        store.put(&Path::from(filename), data.into()).await.unwrap();
+        data_len
+    }
+
+    fn make_dynamic_expr(expr: Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
+        Arc::new(DynamicFilterPhysicalExpr::new(
+            expr.children().into_iter().map(Arc::clone).collect(),
+            expr,
+        ))
+    }
+
+    #[tokio::test]
+    async fn test_prune_on_statistics() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let batch = record_batch!(
+            ("a", Int32, vec![Some(1), Some(2), Some(2)]),
+            ("b", Float32, vec![Some(1.0), Some(2.0), None])
+        )
+        .unwrap();
+
+        let data_size =
+            write_parquet(Arc::clone(&store), "test.parquet", batch.clone()).await;
+
+        let schema = batch.schema();
+        let file = PartitionedFile::new(
+            "test.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        )
+        .with_statistics(Arc::new(
+            Statistics::new_unknown(&schema)
+                .add_column_statistics(ColumnStatistics::new_unknown())
+                .add_column_statistics(
+                    ColumnStatistics::new_unknown()
+                        .with_min_value(Precision::Exact(ScalarValue::Float32(Some(1.0))))
+                        .with_max_value(Precision::Exact(ScalarValue::Float32(Some(2.0))))
+                        .with_null_count(Precision::Exact(1)),
+                ),
+        ));
+
+        let make_opener = |predicate| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_schema(Arc::clone(&schema))
+                .with_projection_indices(&[0, 1])
+                .with_predicate(predicate)
+                .with_row_group_stats_pruning(true)
+                .build()
+        };
+
+        // A filter on "a" should not exclude any rows even if it matches the data
+        let expr = col("a").eq(lit(1));
+        let predicate = logical2physical(&expr, &schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 3);
+
+        // A filter on `b = 5.0` should exclude all rows
+        let expr = col("b").eq(lit(ScalarValue::Float32(Some(5.0))));
+        let predicate = logical2physical(&expr, &schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+    }
+
+    #[tokio::test]
+    async fn test_prune_on_partition_statistics_with_dynamic_expression() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let batch = record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap();
+        let data_size =
+            write_parquet(Arc::clone(&store), "part=1/file.parquet", batch.clone()).await;
+
+        let file_schema = batch.schema();
+        let mut file = PartitionedFile::new(
+            "part=1/file.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        );
+        file.partition_values = vec![ScalarValue::Int32(Some(1))];
+
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("part", DataType::Int32, false),
+            Field::new("a", DataType::Int32, false),
+        ]));
+
+        let table_schema_for_opener = TableSchema::new(
+            file_schema.clone(),
+            vec![Arc::new(Field::new("part", DataType::Int32, false))],
+        );
+        let make_opener = |predicate| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_table_schema(table_schema_for_opener.clone())
+                .with_projection_indices(&[0])
+                .with_predicate(predicate)
+                .with_row_group_stats_pruning(true)
+                .build()
+        };
+
+        // Filter should match the partition value
+        let expr = col("part").eq(lit(1));
+        // Mark the expression as dynamic even if it's not to force partition pruning to happen
+        // Otherwise we assume it already happened at the planning stage and won't re-do the work here
+        let predicate = make_dynamic_expr(logical2physical(&expr, &table_schema));
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 3);
+
+        // Filter should not match the partition value
+        let expr = col("part").eq(lit(2));
+        // Mark the expression as dynamic even if it's not to force partition pruning to happen
+        // Otherwise we assume it already happened at the planning stage and won't re-do the work here
+        let predicate = make_dynamic_expr(logical2physical(&expr, &table_schema));
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+    }
+
+    #[tokio::test]
+    async fn test_prune_on_partition_values_and_file_statistics() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let batch = record_batch!(
+            ("a", Int32, vec![Some(1), Some(2), Some(3)]),
+            ("b", Float64, vec![Some(1.0), Some(2.0), None])
+        )
+        .unwrap();
+        let data_size =
+            write_parquet(Arc::clone(&store), "part=1/file.parquet", batch.clone()).await;
+        let file_schema = batch.schema();
+        let mut file = PartitionedFile::new(
+            "part=1/file.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        );
+        file.partition_values = vec![ScalarValue::Int32(Some(1))];
+        file.statistics = Some(Arc::new(
+            Statistics::new_unknown(&file_schema)
+                .add_column_statistics(ColumnStatistics::new_unknown())
+                .add_column_statistics(
+                    ColumnStatistics::new_unknown()
+                        .with_min_value(Precision::Exact(ScalarValue::Float64(Some(1.0))))
+                        .with_max_value(Precision::Exact(ScalarValue::Float64(Some(2.0))))
+                        .with_null_count(Precision::Exact(1)),
+                ),
+        ));
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("part", DataType::Int32, false),
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Float32, true),
+        ]));
+        let table_schema_for_opener = TableSchema::new(
+            file_schema.clone(),
+            vec![Arc::new(Field::new("part", DataType::Int32, false))],
+        );
+        let make_opener = |predicate| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_table_schema(table_schema_for_opener.clone())
+                .with_projection_indices(&[0])
+                .with_predicate(predicate)
+                .with_row_group_stats_pruning(true)
+                .build()
+        };
+
+        // Filter should match the partition value and file statistics
+        let expr = col("part").eq(lit(1)).and(col("b").eq(lit(1.0)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 3);
+
+        // Should prune based on partition value but not file statistics
+        let expr = col("part").eq(lit(2)).and(col("b").eq(lit(1.0)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+
+        // Should prune based on file statistics but not partition value
+        let expr = col("part").eq(lit(1)).and(col("b").eq(lit(7.0)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+
+        // Should prune based on both partition value and file statistics
+        let expr = col("part").eq(lit(2)).and(col("b").eq(lit(7.0)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+    }
+
+    #[tokio::test]
+    async fn test_prune_on_partition_value_and_data_value() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // Note: number 3 is missing!
+        let batch = record_batch!(("a", Int32, vec![Some(1), Some(2), Some(4)])).unwrap();
+        let data_size =
+            write_parquet(Arc::clone(&store), "part=1/file.parquet", batch.clone()).await;
+
+        let file_schema = batch.schema();
+        let mut file = PartitionedFile::new(
+            "part=1/file.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        );
+        file.partition_values = vec![ScalarValue::Int32(Some(1))];
+
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("part", DataType::Int32, false),
+            Field::new("a", DataType::Int32, false),
+        ]));
+
+        let table_schema_for_opener = TableSchema::new(
+            file_schema.clone(),
+            vec![Arc::new(Field::new("part", DataType::Int32, false))],
+        );
+        let make_opener = |predicate| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_table_schema(table_schema_for_opener.clone())
+                .with_projection_indices(&[0])
+                .with_predicate(predicate)
+                .with_pushdown_filters(true) // note that this is true!
+                .with_reorder_filters(true)
+                .build()
+        };
+
+        // Filter should match the partition value and data value
+        let expr = col("part").eq(lit(1)).or(col("a").eq(lit(1)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 3);
+
+        // Filter should match the partition value but not the data value
+        let expr = col("part").eq(lit(1)).or(col("a").eq(lit(3)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 3);
+
+        // Filter should not match the partition value but match the data value
+        let expr = col("part").eq(lit(2)).or(col("a").eq(lit(1)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 1);
+        assert_eq!(num_rows, 1);
+
+        // Filter should not match the partition value or the data value
+        let expr = col("part").eq(lit(2)).or(col("a").eq(lit(3)));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+    }
+
+    /// Test that if the filter is not a dynamic filter and we have no stats we don't do extra pruning work at the file level.
+    #[tokio::test]
+    async fn test_opener_pruning_skipped_on_static_filters() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let batch = record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap();
+        let data_size =
+            write_parquet(Arc::clone(&store), "part=1/file.parquet", batch.clone()).await;
+
+        let file_schema = batch.schema();
+        let mut file = PartitionedFile::new(
+            "part=1/file.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        );
+        file.partition_values = vec![ScalarValue::Int32(Some(1))];
+        file.statistics = Some(Arc::new(
+            Statistics::default().add_column_statistics(
+                ColumnStatistics::new_unknown()
+                    .with_min_value(Precision::Exact(ScalarValue::Int32(Some(1))))
+                    .with_max_value(Precision::Exact(ScalarValue::Int32(Some(3))))
+                    .with_null_count(Precision::Exact(0)),
+            ),
+        ));
+
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("part", DataType::Int32, false),
+        ]));
+
+        let table_schema_for_opener = TableSchema::new(
+            file_schema.clone(),
+            vec![Arc::new(Field::new("part", DataType::Int32, false))],
+        );
+        let make_opener = |predicate| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_table_schema(table_schema_for_opener.clone())
+                .with_projection_indices(&[0])
+                .with_predicate(predicate)
+                .build()
+        };
+
+        // This filter could prune based on statistics, but since it's not dynamic it's not applied for pruning
+        // (the assumption is this happened already at planning time)
+        let expr = col("a").eq(lit(42));
+        let predicate = logical2physical(&expr, &table_schema);
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+
+        // If we make the filter dynamic, it should prune.
+        // This allows dynamic filters to prune partitions/files even if they are populated late into execution.
+        let predicate = make_dynamic_expr(logical2physical(&expr, &table_schema));
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+
+        // If we have a filter that touches partition columns only and is dynamic, it should prune even if there are no stats.
+        file.statistics = Some(Arc::new(Statistics::new_unknown(&file_schema)));
+        let expr = col("part").eq(lit(2));
+        let predicate = make_dynamic_expr(logical2physical(&expr, &table_schema));
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+
+        // Similarly a filter that combines partition and data columns should prune even if there are no stats.
+        let expr = col("part").eq(lit(2)).and(col("a").eq(lit(42)));
+        let predicate = make_dynamic_expr(logical2physical(&expr, &table_schema));
+        let opener = make_opener(predicate);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let (num_batches, num_rows) = count_batches_and_rows(stream).await;
+        assert_eq!(num_batches, 0);
+        assert_eq!(num_rows, 0);
+    }
+
+    #[tokio::test]
+    async fn test_reverse_scan_row_groups() {
+        use parquet::file::properties::WriterProperties;
+
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // Create multiple batches to ensure multiple row groups
+        let batch1 =
+            record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap();
+        let batch2 =
+            record_batch!(("a", Int32, vec![Some(4), Some(5), Some(6)])).unwrap();
+        let batch3 =
+            record_batch!(("a", Int32, vec![Some(7), Some(8), Some(9)])).unwrap();
+
+        // Write parquet file with multiple row groups
+        // Force small row groups by setting max_row_group_size
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(3)) // Force each batch into its own row group
+            .build();
+
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "test.parquet",
+            vec![batch1.clone(), batch2, batch3],
+            Some(props),
+        )
+        .await;
+
+        let schema = batch1.schema();
+        let file = PartitionedFile::new(
+            "test.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let make_opener = |reverse_scan: bool| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_schema(Arc::clone(&schema))
+                .with_projection_indices(&[0])
+                .with_reverse_row_groups(reverse_scan)
+                .build()
+        };
+
+        // Test normal scan (forward)
+        let opener = make_opener(false);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let forward_values = collect_int32_values(stream).await;
+
+        // Test reverse scan
+        let opener = make_opener(true);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let reverse_values = collect_int32_values(stream).await;
+
+        // The forward scan should return data in the order written
+        assert_eq!(forward_values, vec![1, 2, 3, 4, 5, 6, 7, 8, 9]);
+
+        // With reverse scan, row groups are reversed, so we expect:
+        // Row group 3 (7,8,9), then row group 2 (4,5,6), then row group 1 (1,2,3)
+        assert_eq!(reverse_values, vec![7, 8, 9, 4, 5, 6, 1, 2, 3]);
+    }
+
+    #[tokio::test]
+    async fn test_morselizer_basic_parity() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let batch1 =
+            record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap();
+        let batch2 = record_batch!(("a", Int32, vec![Some(4), Some(5)])).unwrap();
+
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "morselizer_basic_parity.parquet",
+            vec![batch1, batch2],
+            None,
+        )
+        .await;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(schema)
+            .with_projection_indices(&[0])
+            .build();
+
+        let file = PartitionedFile::new(
+            "morselizer_basic_parity.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let opener_values =
+            collect_int32_values(open_via_morselizer(&opener, file.clone()).await).await;
+        let morsel_values =
+            collect_int32_values(open_via_morselizer(&opener, file).await).await;
+
+        assert_eq!(opener_values, morsel_values);
+    }
+
+    #[tokio::test]
+    async fn test_reverse_scan_single_row_group() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // Create a single batch (single row group)
+        let batch = record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap();
+        let data_size =
+            write_parquet(Arc::clone(&store), "test.parquet", batch.clone()).await;
+
+        let schema = batch.schema();
+        let file = PartitionedFile::new(
+            "test.parquet".to_string(),
+            u64::try_from(data_size).unwrap(),
+        );
+
+        let make_opener = |reverse_scan: bool| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_schema(Arc::clone(&schema))
+                .with_projection_indices(&[0])
+                .with_reverse_row_groups(reverse_scan)
+                .build()
+        };
+
+        // With a single row group, forward and reverse should be the same
+        // (only the row group order is reversed, not the rows within)
+        let opener_forward = make_opener(false);
+        let stream_forward = open_via_morselizer(&opener_forward, file.clone()).await;
+        let (batches_forward, _) = count_batches_and_rows(stream_forward).await;
+
+        let opener_reverse = make_opener(true);
+        let stream_reverse = open_via_morselizer(&opener_reverse, file).await;
+        let (batches_reverse, _) = count_batches_and_rows(stream_reverse).await;
+
+        // Both should have the same number of batches since there's only one row group
+        assert_eq!(batches_forward, batches_reverse);
+    }
+
+    #[tokio::test]
+    async fn test_reverse_scan_with_row_selection() {
+        use parquet::file::properties::WriterProperties;
+
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // Create 3 batches with DIFFERENT selection patterns
+        let batch1 =
+            record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3), Some(4)]))
+                .unwrap(); // 4 rows
+        let batch2 =
+            record_batch!(("a", Int32, vec![Some(5), Some(6), Some(7), Some(8)]))
+                .unwrap(); // 4 rows
+        let batch3 =
+            record_batch!(("a", Int32, vec![Some(9), Some(10), Some(11), Some(12)]))
+                .unwrap(); // 4 rows
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(4))
+            .build();
+
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "test.parquet",
+            vec![batch1.clone(), batch2, batch3],
+            Some(props),
+        )
+        .await;
+
+        let schema = batch1.schema();
+
+        use crate::ParquetAccessPlan;
+        use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+        // Row group 0: skip first 2, select last 2 (should get: 3, 4)
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(2)]),
+        );
+        // Row group 1: select all (should get: 5, 6, 7, 8)
+        // Row group 2: select first 2, skip last 2 (should get: 9, 10)
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![RowSelector::select(2), RowSelector::skip(2)]),
+        );
+
+        let file = PartitionedFile::new(
+            "test.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        )
+        .with_extensions(Arc::new(access_plan));
+
+        let make_opener = |reverse_scan: bool| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_schema(Arc::clone(&schema))
+                .with_projection_indices(&[0])
+                .with_reverse_row_groups(reverse_scan)
+                .build()
+        };
+
+        // Forward scan: RG0(3,4), RG1(5,6,7,8), RG2(9,10)
+        let opener = make_opener(false);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let forward_values = collect_int32_values(stream).await;
+
+        // Forward scan should produce: RG0(3,4), RG1(5,6,7,8), RG2(9,10)
+        assert_eq!(
+            forward_values,
+            vec![3, 4, 5, 6, 7, 8, 9, 10],
+            "Forward scan should select correct rows based on RowSelection"
+        );
+
+        // Reverse scan
+        // CORRECT behavior: reverse row groups AND their corresponding selections
+        // - RG2 is read first, WITH RG2's selection (select 2, skip 2) -> 9, 10
+        // - RG1 is read second, WITH RG1's selection (select all) -> 5, 6, 7, 8
+        // - RG0 is read third, WITH RG0's selection (skip 2, select 2) -> 3, 4
+        let opener = make_opener(true);
+        let stream = open_via_morselizer(&opener, file).await;
+        let reverse_values = collect_int32_values(stream).await;
+
+        // Correct expected result: row groups reversed but each keeps its own selection
+        // RG2 with its selection (9,10), RG1 with its selection (5,6,7,8), RG0 with its selection (3,4)
+        assert_eq!(
+            reverse_values,
+            vec![9, 10, 5, 6, 7, 8, 3, 4],
+            "Reverse scan should reverse row group order while maintaining correct RowSelection for each group"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_reverse_scan_with_non_contiguous_row_groups() {
+        use parquet::file::properties::WriterProperties;
+
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // Create 4 batches (4 row groups)
+        let batch0 = record_batch!(("a", Int32, vec![Some(1), Some(2)])).unwrap();
+        let batch1 = record_batch!(("a", Int32, vec![Some(3), Some(4)])).unwrap();
+        let batch2 = record_batch!(("a", Int32, vec![Some(5), Some(6)])).unwrap();
+        let batch3 = record_batch!(("a", Int32, vec![Some(7), Some(8)])).unwrap();
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(2))
+            .build();
+
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "test.parquet",
+            vec![batch0.clone(), batch1, batch2, batch3],
+            Some(props),
+        )
+        .await;
+
+        let schema = batch0.schema();
+
+        use crate::ParquetAccessPlan;
+        use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+
+        // KEY: Skip RG1 (non-contiguous!)
+        // Only scan row groups: [0, 2, 3]
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Scan, // RG0
+            RowGroupAccess::Skip, // RG1 - SKIPPED!
+            RowGroupAccess::Scan, // RG2
+            RowGroupAccess::Scan, // RG3
+        ]);
+
+        // Add RowSelection for each scanned row group
+        // RG0: select first row (1), skip second (2)
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]),
+        );
+        // RG1: skipped, no selection needed
+        // RG2: select first row (5), skip second (6)
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]),
+        );
+        // RG3: select first row (7), skip second (8)
+        access_plan.scan_selection(
+            3,
+            RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]),
+        );
+
+        let file = PartitionedFile::new(
+            "test.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        )
+        .with_extensions(Arc::new(access_plan));
+
+        let make_opener = |reverse_scan: bool| {
+            ParquetMorselizerBuilder::new()
+                .with_store(Arc::clone(&store))
+                .with_schema(Arc::clone(&schema))
+                .with_projection_indices(&[0])
+                .with_reverse_row_groups(reverse_scan)
+                .build()
+        };
+
+        // Forward scan: RG0(1), RG2(5), RG3(7)
+        // Note: RG1 is completely skipped
+        let opener = make_opener(false);
+        let stream = open_via_morselizer(&opener, file.clone()).await;
+        let forward_values = collect_int32_values(stream).await;
+
+        assert_eq!(
+            forward_values,
+            vec![1, 5, 7],
+            "Forward scan with non-contiguous row groups"
+        );
+
+        // Reverse scan: RG3(7), RG2(5), RG0(1)
+        // WITHOUT the bug fix, this would return WRONG values
+        // because the RowSelection would be incorrectly mapped
+        let opener = make_opener(true);
+        let stream = open_via_morselizer(&opener, file).await;
+        let reverse_values = collect_int32_values(stream).await;
+
+        assert_eq!(
+            reverse_values,
+            vec![7, 5, 1],
+            "Reverse scan with non-contiguous row groups should correctly map RowSelection"
+        );
+    }
+}
diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs
index 84f5c4c2d6d5f..194e6e94fba3a 100644
--- a/datafusion/datasource-parquet/src/page_filter.rs
+++ b/datafusion/datasource-parquet/src/page_filter.rs
@@ -28,15 +28,15 @@ use arrow::{
     array::ArrayRef,
     datatypes::{Schema, SchemaRef},
 };
-use datafusion_common::pruning::PruningStatistics;
 use datafusion_common::ScalarValue;
-use datafusion_physical_expr::{split_conjunction, PhysicalExpr};
-use datafusion_physical_optimizer::pruning::PruningPredicate;
+use datafusion_common::pruning::PruningStatistics;
+use datafusion_physical_expr::{PhysicalExpr, split_conjunction};
+use datafusion_pruning::PruningPredicate;
 
 use log::{debug, trace};
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex};
-use parquet::format::PageLocation;
+use parquet::file::page_index::offset_index::PageLocation;
 use parquet::schema::types::SchemaDescriptor;
 use parquet::{
     arrow::arrow_reader::{RowSelection, RowSelector},
@@ -90,7 +90,6 @@ use parquet::{
 ///  ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
 ///
 ///   Total rows: 300
-///
 /// ```
 ///
 /// Given the predicate `A > 35 AND B = 'F'`:
@@ -119,6 +118,7 @@ pub struct PagePruningAccessPlanFilter {
 impl PagePruningAccessPlanFilter {
     /// Create a new [`PagePruningAccessPlanFilter`] from a physical
     /// expression.
+    #[expect(clippy::needless_pass_by_value)]
     pub fn new(expr: &Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Self {
         // extract any single column predicates
         let predicates = split_conjunction(expr)
@@ -178,9 +178,10 @@ impl PagePruningAccessPlanFilter {
             || parquet_metadata.column_index().is_none()
         {
             debug!(
-                    "Can not prune pages due to lack of indexes. Have offset: {}, column index: {}",
-                    parquet_metadata.offset_index().is_some(), parquet_metadata.column_index().is_some()
-                );
+                "Can not prune pages due to lack of indexes. Have offset: {}, column index: {}",
+                parquet_metadata.offset_index().is_some(),
+                parquet_metadata.column_index().is_some()
+            );
             return access_plan;
         };
 
@@ -188,6 +189,10 @@ impl PagePruningAccessPlanFilter {
         let mut total_skip = 0;
         // track the total number of rows that should not be skipped
         let mut total_select = 0;
+        // track the total number of pages that should be skipped
+        let mut total_pages_skip = 0;
+        // track the total number of pages that should not be skipped
+        let mut total_pages_select = 0;
 
         // for each row group specified in the access plan
         let row_group_indexes = access_plan.row_group_indexes();
@@ -225,12 +230,15 @@ impl PagePruningAccessPlanFilter {
                     file_metrics,
                 );
 
-                let Some(selection) = selection else {
+                let Some((selection, total_pages, matched_pages)) = selection else {
                     trace!("No pages pruned in prune_pages_in_one_row_group");
                     continue;
                 };
+                total_pages_select += matched_pages;
+                total_pages_skip += total_pages - matched_pages;
 
-                debug!("Use filter and page index to create RowSelection {:?} from predicate: {:?}",
+                debug!(
+                    "Use filter and page index to create RowSelection {:?} from predicate: {:?}",
                     &selection,
                     predicate.predicate_expr(),
                 );
@@ -253,7 +261,9 @@ impl PagePruningAccessPlanFilter {
                 let rows_selected = overall_selection.row_count();
                 if rows_selected > 0 {
                     let rows_skipped = overall_selection.skipped_row_count();
-                    trace!("Overall selection from predicate skipped {rows_skipped}, selected {rows_selected}: {overall_selection:?}");
+                    trace!(
+                        "Overall selection from predicate skipped {rows_skipped}, selected {rows_selected}: {overall_selection:?}"
+                    );
                     total_skip += rows_skipped;
                     total_select += rows_selected;
                     access_plan.scan_selection(row_group_index, overall_selection)
@@ -270,8 +280,16 @@ impl PagePruningAccessPlanFilter {
             }
         }
 
-        file_metrics.page_index_rows_pruned.add(total_skip);
-        file_metrics.page_index_rows_matched.add(total_select);
+        file_metrics.page_index_rows_pruned.add_pruned(total_skip);
+        file_metrics
+            .page_index_rows_pruned
+            .add_matched(total_select);
+        file_metrics
+            .page_index_pages_pruned
+            .add_pruned(total_pages_skip);
+        file_metrics
+            .page_index_pages_pruned
+            .add_matched(total_pages_select);
         access_plan
     }
 
@@ -291,7 +309,8 @@ fn update_selection(
     }
 }
 
-/// Returns a [`RowSelection`] for the rows in this row group to scan.
+/// Returns a [`RowSelection`] for the rows in this row group to scan, in addition to the number of
+/// total and matched pages.
 ///
 /// This Row Selection is formed from the page index and the predicate skips row
 /// ranges that can be ruled out based on the predicate.
@@ -304,7 +323,7 @@ fn prune_pages_in_one_row_group(
     converter: StatisticsConverter<'_>,
     parquet_metadata: &ParquetMetaData,
     metrics: &ParquetFileMetrics,
-) -> Option<RowSelection> {
+) -> Option<(RowSelection, usize, usize)> {
     let pruning_stats =
         PagesPruningStatistics::try_new(row_group_index, converter, parquet_metadata)?;
 
@@ -356,7 +375,11 @@ fn prune_pages_in_one_row_group(
         RowSelector::skip(sum_row)
     };
     vec.push(selector);
-    Some(RowSelection::from(vec))
+
+    let total_pages = values.len();
+    let matched_pages = values.iter().filter(|v| **v).count();
+
+    Some((RowSelection::from(vec), total_pages, matched_pages))
 }
 
 /// Implement [`PruningStatistics`] for one column's PageIndex (column_index + offset_index)
diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs
index 27ec843c1991d..4291c9af76a63 100644
--- a/datafusion/datasource-parquet/src/reader.rs
+++ b/datafusion/datasource-parquet/src/reader.rs
@@ -19,14 +19,20 @@
 //! low level control of parquet file readers
 
 use crate::ParquetFileMetrics;
+use crate::metadata::DFParquetMetadata;
 use bytes::Bytes;
-use datafusion_datasource::file_meta::FileMeta;
+use datafusion_datasource::PartitionedFile;
+use datafusion_execution::cache::cache_manager::FileMetadata;
+use datafusion_execution::cache::cache_manager::FileMetadataCache;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use futures::FutureExt;
 use futures::future::BoxFuture;
 use object_store::ObjectStore;
 use parquet::arrow::arrow_reader::ArrowReaderOptions;
 use parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader};
 use parquet::file::metadata::ParquetMetaData;
+use std::any::Any;
+use std::collections::HashMap;
 use std::fmt::Debug;
 use std::ops::Range;
 use std::sync::Arc;
@@ -50,13 +56,13 @@ pub trait ParquetFileReaderFactory: Debug + Send + Sync + 'static {
     ///
     /// # Arguments
     /// * partition_index - Index of the partition (for reporting metrics)
-    /// * file_meta - The file to be read
+    /// * file - The file to be read
     /// * metadata_size_hint - If specified, the first IO reads this many bytes from the footer
     /// * metrics - Execution metrics
     fn create_reader(
         &self,
         partition_index: usize,
-        file_meta: FileMeta,
+        partitioned_file: PartitionedFile,
         metadata_size_hint: Option<usize>,
         metrics: &ExecutionPlanMetricsSet,
     ) -> datafusion_common::Result<Box<dyn AsyncFileReader + Send>>;
@@ -88,9 +94,10 @@ impl DefaultParquetFileReaderFactory {
 /// This implementation does not coalesce I/O operations or cache bytes. Such
 /// optimizations can be done either at the object store level or by providing a
 /// custom implementation of [`ParquetFileReaderFactory`].
-pub(crate) struct ParquetFileReader {
+pub struct ParquetFileReader {
     pub file_metrics: ParquetFileMetrics,
     pub inner: ParquetObjectReader,
+    pub partitioned_file: PartitionedFile,
 }
 
 impl AsyncFileReader for ParquetFileReader {
@@ -123,22 +130,37 @@ impl AsyncFileReader for ParquetFileReader {
     }
 }
 
+impl Drop for ParquetFileReader {
+    fn drop(&mut self) {
+        self.file_metrics
+            .scan_efficiency_ratio
+            .add_part(self.file_metrics.bytes_scanned.value());
+        // Multiple ParquetFileReaders may run, so we set_total to avoid adding the total multiple times
+        self.file_metrics
+            .scan_efficiency_ratio
+            .set_total(self.partitioned_file.object_meta.size as usize);
+    }
+}
+
 impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
     fn create_reader(
         &self,
         partition_index: usize,
-        file_meta: FileMeta,
+        partitioned_file: PartitionedFile,
         metadata_size_hint: Option<usize>,
         metrics: &ExecutionPlanMetricsSet,
     ) -> datafusion_common::Result<Box<dyn AsyncFileReader + Send>> {
         let file_metrics = ParquetFileMetrics::new(
             partition_index,
-            file_meta.location().as_ref(),
+            partitioned_file.object_meta.location.as_ref(),
             metrics,
         );
         let store = Arc::clone(&self.store);
-        let mut inner = ParquetObjectReader::new(store, file_meta.object_meta.location)
-            .with_file_size(file_meta.object_meta.size);
+        let mut inner = ParquetObjectReader::new(
+            store,
+            partitioned_file.object_meta.location.clone(),
+        )
+        .with_file_size(partitioned_file.object_meta.size);
 
         if let Some(hint) = metadata_size_hint {
             inner = inner.with_footer_size_hint(hint)
@@ -147,6 +169,195 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
         Ok(Box::new(ParquetFileReader {
             inner,
             file_metrics,
+            partitioned_file,
         }))
     }
 }
+
+/// Implementation of [`ParquetFileReaderFactory`] supporting the caching of footer and page
+/// metadata. Reads and updates the [`FileMetadataCache`] with the [`ParquetMetaData`] data.
+/// This reader always loads the entire metadata (including page index, unless the file is
+/// encrypted), even if not required by the current query, to ensure it is always available for
+/// those that need it.
+#[derive(Debug)]
+pub struct CachedParquetFileReaderFactory {
+    store: Arc<dyn ObjectStore>,
+    metadata_cache: Arc<dyn FileMetadataCache>,
+}
+
+impl CachedParquetFileReaderFactory {
+    pub fn new(
+        store: Arc<dyn ObjectStore>,
+        metadata_cache: Arc<dyn FileMetadataCache>,
+    ) -> Self {
+        Self {
+            store,
+            metadata_cache,
+        }
+    }
+}
+
+impl ParquetFileReaderFactory for CachedParquetFileReaderFactory {
+    fn create_reader(
+        &self,
+        partition_index: usize,
+        partitioned_file: PartitionedFile,
+        metadata_size_hint: Option<usize>,
+        metrics: &ExecutionPlanMetricsSet,
+    ) -> datafusion_common::Result<Box<dyn AsyncFileReader + Send>> {
+        let file_metrics = ParquetFileMetrics::new(
+            partition_index,
+            partitioned_file.object_meta.location.as_ref(),
+            metrics,
+        );
+        let store = Arc::clone(&self.store);
+
+        let mut inner = ParquetObjectReader::new(
+            store,
+            partitioned_file.object_meta.location.clone(),
+        )
+        .with_file_size(partitioned_file.object_meta.size);
+
+        if let Some(hint) = metadata_size_hint {
+            inner = inner.with_footer_size_hint(hint)
+        };
+
+        Ok(Box::new(CachedParquetFileReader::new(
+            file_metrics,
+            Arc::clone(&self.store),
+            inner,
+            partitioned_file,
+            Arc::clone(&self.metadata_cache),
+            metadata_size_hint,
+        )))
+    }
+}
+
+/// Implements [`AsyncFileReader`] for a Parquet file in object storage. Reads the file metadata
+/// from the [`FileMetadataCache`], if available, otherwise reads it directly from the file and then
+/// updates the cache.
+pub struct CachedParquetFileReader {
+    pub file_metrics: ParquetFileMetrics,
+    store: Arc<dyn ObjectStore>,
+    pub inner: ParquetObjectReader,
+    partitioned_file: PartitionedFile,
+    metadata_cache: Arc<dyn FileMetadataCache>,
+    metadata_size_hint: Option<usize>,
+}
+
+impl CachedParquetFileReader {
+    pub fn new(
+        file_metrics: ParquetFileMetrics,
+        store: Arc<dyn ObjectStore>,
+        inner: ParquetObjectReader,
+        partitioned_file: PartitionedFile,
+        metadata_cache: Arc<dyn FileMetadataCache>,
+        metadata_size_hint: Option<usize>,
+    ) -> Self {
+        Self {
+            file_metrics,
+            store,
+            inner,
+            partitioned_file,
+            metadata_cache,
+            metadata_size_hint,
+        }
+    }
+}
+
+impl AsyncFileReader for CachedParquetFileReader {
+    fn get_bytes(
+        &mut self,
+        range: Range<u64>,
+    ) -> BoxFuture<'_, parquet::errors::Result<Bytes>> {
+        let bytes_scanned = range.end - range.start;
+        self.file_metrics.bytes_scanned.add(bytes_scanned as usize);
+        self.inner.get_bytes(range)
+    }
+
+    fn get_byte_ranges(
+        &mut self,
+        ranges: Vec<Range<u64>>,
+    ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>>
+    where
+        Self: Send,
+    {
+        let total: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+        self.file_metrics.bytes_scanned.add(total as usize);
+        self.inner.get_byte_ranges(ranges)
+    }
+
+    fn get_metadata<'a>(
+        &'a mut self,
+        #[cfg_attr(not(feature = "parquet_encryption"), expect(unused_variables))]
+        options: Option<&'a ArrowReaderOptions>,
+    ) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
+        let object_meta = self.partitioned_file.object_meta.clone();
+        let metadata_cache = Arc::clone(&self.metadata_cache);
+
+        async move {
+            #[cfg(feature = "parquet_encryption")]
+            let file_decryption_properties = options
+                .and_then(|o| o.file_decryption_properties())
+                .map(Arc::clone);
+
+            #[cfg(not(feature = "parquet_encryption"))]
+            let file_decryption_properties = None;
+
+            DFParquetMetadata::new(&self.store, &object_meta)
+                .with_decryption_properties(file_decryption_properties)
+                .with_file_metadata_cache(Some(Arc::clone(&metadata_cache)))
+                .with_metadata_size_hint(self.metadata_size_hint)
+                .fetch_metadata()
+                .await
+                .map_err(|e| {
+                    parquet::errors::ParquetError::General(format!(
+                        "Failed to fetch metadata for file {}: {e}",
+                        object_meta.location,
+                    ))
+                })
+        }
+        .boxed()
+    }
+}
+
+impl Drop for CachedParquetFileReader {
+    fn drop(&mut self) {
+        self.file_metrics
+            .scan_efficiency_ratio
+            .add_part(self.file_metrics.bytes_scanned.value());
+        // Multiple ParquetFileReaders may run, so we set_total to avoid adding the total multiple times
+        self.file_metrics
+            .scan_efficiency_ratio
+            .set_total(self.partitioned_file.object_meta.size as usize);
+    }
+}
+
+/// Wrapper to implement [`FileMetadata`] for [`ParquetMetaData`].
+pub struct CachedParquetMetaData(Arc<ParquetMetaData>);
+
+impl CachedParquetMetaData {
+    pub fn new(metadata: Arc<ParquetMetaData>) -> Self {
+        Self(metadata)
+    }
+
+    pub fn parquet_metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.0
+    }
+}
+
+impl FileMetadata for CachedParquetMetaData {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn memory_size(&self) -> usize {
+        self.0.memory_size()
+    }
+
+    fn extra_info(&self) -> HashMap<String, String> {
+        let page_index =
+            self.0.column_index().is_some() && self.0.offset_index().is_some();
+        HashMap::from([("page_index".to_owned(), page_index.to_string())])
+    }
+}
diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs
index cde9e56c92800..d120f743fa1d5 100644
--- a/datafusion/datasource-parquet/src/row_filter.rs
+++ b/datafusion/datasource-parquet/src/row_filter.rs
@@ -50,38 +50,46 @@
 //! 2. Determine whether each predicate can be evaluated as an `ArrowPredicate`.
 //! 3. Determine, for each predicate, the total compressed size of all
 //!    columns required to evaluate the predicate.
-//! 4. Determine, for each predicate, whether all columns required to
-//!    evaluate the expression are sorted.
-//! 5. Re-order the predicate by total size (from step 3).
-//! 6. Partition the predicates according to whether they are sorted (from step 4)
-//! 7. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`.
-//! 8. Build the `RowFilter` with the sorted predicates followed by
-//!    the unsorted predicates. Within each partition, predicates are
-//!    still be sorted by size.
-
-use std::cmp::Ordering;
+//! 4. Re-order predicates by total size (from step 3).
+//! 5. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`.
+//! 6. Build the `RowFilter` from the ordered predicates.
+//!
+//! List-aware predicates (for example, `array_has`, `array_has_all`, and
+//! `array_has_any`) can be evaluated directly during Parquet decoding.
+//! Struct field access via `get_field` is also supported when the accessed
+//! leaf is a primitive type. Filters that reference entire struct columns
+//! rather than individual fields cannot be pushed down and are instead
+//! evaluated after the full batches are materialized.
+//!
+//! For example, given a struct column `s {name: Utf8, value: Int32}`:
+//! - `WHERE s['value'] > 5` — pushed down (accesses a primitive leaf)
+//! - `WHERE s IS NOT NULL`  — not pushed down (references the whole struct)
+
 use std::collections::BTreeSet;
 use std::sync::Arc;
 
 use arrow::array::BooleanArray;
-use arrow::datatypes::{DataType, Schema, SchemaRef};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::error::{ArrowError, Result as ArrowResult};
 use arrow::record_batch::RecordBatch;
-use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter};
+use datafusion_functions::core::getfield::GetFieldFunc;
 use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter};
 use parquet::file::metadata::ParquetMetaData;
+use parquet::schema::types::SchemaDescriptor;
 
+use datafusion_common::Result;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor};
-use datafusion_common::Result;
-use datafusion_datasource::schema_adapter::{SchemaAdapterFactory, SchemaMapper};
-use datafusion_physical_expr::expressions::Column;
-use datafusion_physical_expr::utils::reassign_predicate_columns;
-use datafusion_physical_expr::{split_conjunction, PhysicalExpr};
+use datafusion_physical_expr::ScalarFunctionExpr;
+use datafusion_physical_expr::expressions::{Column, Literal};
+use datafusion_physical_expr::utils::reassign_expr_columns;
+use datafusion_physical_expr::{PhysicalExpr, split_conjunction};
 
 use datafusion_physical_plan::metrics;
 
 use super::ParquetFileMetrics;
+use super::supported_predicates::supports_list_predicates;
 
 /// A "compiled" predicate passed to `ParquetRecordBatchStream` to perform
 /// row-level filtering during parquet decoding.
@@ -92,12 +100,17 @@ use super::ParquetFileMetrics;
 ///
 /// An expression can be evaluated as a `DatafusionArrowPredicate` if it:
 /// * Does not reference any projected columns
-/// * Does not reference columns with non-primitive types (e.g. structs / lists)
+/// * References either primitive columns or list columns used by
+///   supported predicates (such as `array_has_all` or NULL checks).
+/// * References struct fields via `get_field` where the accessed leaf
+///   is a primitive type (e.g. `get_field(struct_col, 'field') > 5`).
+///   Direct references to whole struct columns are still evaluated after
+///   decoding.
 #[derive(Debug)]
 pub(crate) struct DatafusionArrowPredicate {
     /// the filter expression
     physical_expr: Arc<dyn PhysicalExpr>,
-    /// Path to the columns in the parquet schema required to evaluate the
+    /// Path to the leaf columns in the parquet schema required to evaluate the
     /// expression
     projection_mask: ProjectionMask,
     /// how many rows were filtered out by this predicate
@@ -106,33 +119,25 @@ pub(crate) struct DatafusionArrowPredicate {
     rows_matched: metrics::Count,
     /// how long was spent evaluating this predicate
     time: metrics::Time,
-    /// used to perform type coercion while filtering rows
-    schema_mapper: Arc<dyn SchemaMapper>,
 }
 
 impl DatafusionArrowPredicate {
     /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`
     pub fn try_new(
         candidate: FilterCandidate,
-        metadata: &ParquetMetaData,
         rows_pruned: metrics::Count,
         rows_matched: metrics::Count,
         time: metrics::Time,
     ) -> Result<Self> {
-        let projected_schema = Arc::clone(&candidate.filter_schema);
         let physical_expr =
-            reassign_predicate_columns(candidate.expr, &projected_schema, true)?;
+            reassign_expr_columns(candidate.expr, &candidate.read_plan.projected_schema)?;
 
         Ok(Self {
             physical_expr,
-            projection_mask: ProjectionMask::roots(
-                metadata.file_metadata().schema_descr(),
-                candidate.projection,
-            ),
+            projection_mask: candidate.read_plan.projection_mask,
             rows_pruned,
             rows_matched,
             time,
-            schema_mapper: candidate.schema_mapper,
         })
     }
 }
@@ -143,8 +148,6 @@ impl ArrowPredicate for DatafusionArrowPredicate {
     }
 
     fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult<BooleanArray> {
-        let batch = self.schema_mapper.map_batch(batch)?;
-
         // scoped timer updates on drop
         let mut timer = self.time.timer();
 
@@ -181,75 +184,45 @@ pub(crate) struct FilterCandidate {
     /// the filter and to order the filters when `reorder_predicates` is true.
     /// This is generated by summing the compressed size of all columns that the filter references.
     required_bytes: usize,
-    /// Can this filter use an index (e.g. a page index) to prune rows?
-    can_use_index: bool,
-    /// The projection to read from the file schema to get the columns
-    /// required to pass thorugh a `SchemaMapper` to the table schema
-    /// upon which we then evaluate the filter expression.
-    projection: Vec<usize>,
-    ///  A `SchemaMapper` used to map batches read from the file schema to
-    /// the filter's projection of the table schema.
-    schema_mapper: Arc<dyn SchemaMapper>,
-    /// The projected table schema that this filter references
-    filter_schema: SchemaRef,
+    /// The resolved Parquet read plan (leaf indices + projected schema).
+    read_plan: ParquetReadPlan,
+}
+
+/// The result of resolving which Parquet leaf columns and Arrow schema fields
+/// are needed to evaluate an expression against a Parquet file
+///
+/// This is the shared output of the column resolution pipeline used by both
+/// the row filter to build `ArrowPredicate`s and the opener to build `ProjectionMask`s
+#[derive(Debug, Clone)]
+pub(crate) struct ParquetReadPlan {
+    /// Projection mask built from leaf column indices in the Parquet schema.
+    /// Using a `ProjectionMask` directly (rather than raw indices) prevents
+    /// bugs from accidentally mixing up root vs leaf indices.
+    pub projection_mask: ProjectionMask,
+    /// The projected Arrow schema containing only the columns/fields required
+    /// Struct types are pruned to include only the accessed sub-fields
+    pub projected_schema: SchemaRef,
 }
 
 /// Helper to build a `FilterCandidate`.
 ///
-/// This will do several things
+/// This will do several things:
 /// 1. Determine the columns required to evaluate the expression
 /// 2. Calculate data required to estimate the cost of evaluating the filter
-/// 3. Rewrite column expressions in the predicate which reference columns not
-///    in the particular file schema.
-///
-/// # Schema Rewrite
-///
-/// When parquet files are read in the context of "schema evolution" there are
-/// potentially wo schemas:
-///
-/// 1. The table schema (the columns of the table that the parquet file is part of)
-/// 2. The file schema (the columns actually in the parquet file)
-///
-/// There are times when the table schema contains columns that are not in the
-/// file schema, such as when new columns have been added in new parquet files
-/// but old files do not have the columns.
-///
-/// When a file is missing a column from the table schema, the value of the
-/// missing column is filled in by a `SchemaAdapter` (by default as `NULL`).
 ///
-/// When a predicate is pushed down to the parquet reader, the predicate is
-/// evaluated in the context of the file schema.
-/// For each predicate we build a filter schema which is the projection of the table
-/// schema that contains only the columns that this filter references.
-/// If any columns from the file schema are missing from a particular file they are
-/// added by the `SchemaAdapter`, by default as `NULL`.
+/// Note: This does *not* handle any adaptation of the expression to the file schema.
+/// The expression must already be adapted before being passed in here, generally using
+/// [`PhysicalExprAdapter`](datafusion_physical_expr_adapter::PhysicalExprAdapter).
 struct FilterCandidateBuilder {
     expr: Arc<dyn PhysicalExpr>,
-    /// The schema of this parquet file.
-    /// Columns may have different types from the table schema and there may be
-    /// columns in the file schema that are not in the table schema or columns that
-    /// are in the table schema that are not in the file schema.
+    /// The Arrow schema of this parquet file (the result of converting the
+    /// parquet schema to Arrow, potentially with type coercions applied).
     file_schema: SchemaRef,
-    /// The schema of the table (merged schema) -- columns may be in different
-    /// order than in the file and have columns that are not in the file schema
-    table_schema: SchemaRef,
-    /// A `SchemaAdapterFactory` used to map the file schema to the table schema.
-    schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
 }
 
 impl FilterCandidateBuilder {
-    pub fn new(
-        expr: Arc<dyn PhysicalExpr>,
-        file_schema: Arc<Schema>,
-        table_schema: Arc<Schema>,
-        schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
-    ) -> Self {
-        Self {
-            expr,
-            file_schema,
-            table_schema,
-            schema_adapter_factory,
-        }
+    pub fn new(expr: Arc<dyn PhysicalExpr>, file_schema: Arc<Schema>) -> Self {
+        Self { expr, file_schema }
     }
 
     /// Attempt to build a `FilterCandidate` from the expression
@@ -260,118 +233,583 @@ impl FilterCandidateBuilder {
     /// * `Ok(None)` if the expression cannot be used as an ArrowFilter
     /// * `Err(e)` if an error occurs while building the candidate
     pub fn build(self, metadata: &ParquetMetaData) -> Result<Option<FilterCandidate>> {
-        let Some(required_indices_into_table_schema) =
-            pushdown_columns(&self.expr, &self.table_schema)?
-        else {
-            return Ok(None);
-        };
-
-        let projected_table_schema = Arc::new(
-            self.table_schema
-                .project(&required_indices_into_table_schema)?,
-        );
-
-        let (schema_mapper, projection_into_file_schema) = self
-            .schema_adapter_factory
-            .create(Arc::clone(&projected_table_schema), self.table_schema)
-            .map_schema(&self.file_schema)?;
-
-        let required_bytes = size_of_columns(&projection_into_file_schema, metadata)?;
-        let can_use_index = columns_sorted(&projection_into_file_schema, metadata)?;
-
-        Ok(Some(FilterCandidate {
-            expr: self.expr,
-            required_bytes,
-            can_use_index,
-            projection: projection_into_file_schema,
-            schema_mapper: Arc::clone(&schema_mapper),
-            filter_schema: Arc::clone(&projected_table_schema),
-        }))
+        Ok(
+            build_parquet_read_plan(&self.expr, &self.file_schema, metadata)?.map(
+                |(read_plan, required_bytes)| FilterCandidate {
+                    expr: self.expr,
+                    required_bytes,
+                    read_plan,
+                },
+            ),
+        )
     }
 }
 
-// a struct that implements TreeNodeRewriter to traverse a PhysicalExpr tree structure to determine
-// if any column references in the expression would prevent it from being predicate-pushed-down.
-// if non_primitive_columns || projected_columns, it can't be pushed down.
-// can't be reused between calls to `rewrite`; each construction must be used only once.
+/// Traverses a `PhysicalExpr` tree to determine if any column references would
+/// prevent the expression from being pushed down to the parquet decoder.
+///
+/// An expression cannot be pushed down if it references:
+/// - Unsupported nested columns (whole struct references or list fields that are
+///   not covered by the supported predicate set)
+/// - Columns that don't exist in the file schema
+///
+/// Struct field access via `get_field` is supported when the resolved leaf type
+/// is primitive (e.g. `get_field(struct_col, 'field') > 5`).
 struct PushdownChecker<'schema> {
     /// Does the expression require any non-primitive columns (like structs)?
     non_primitive_columns: bool,
-    /// Does the expression reference any columns that are in the table
-    /// schema but not in the file schema?
-    /// This includes partition columns and projected columns.
+    /// Does the expression reference any columns not present in the file schema?
     projected_columns: bool,
-    // Indices into the table schema of the columns required to evaluate the expression
-    required_columns: BTreeSet<usize>,
-    table_schema: &'schema Schema,
+    /// Indices into the file schema of columns required to evaluate the expression.
+    /// Does not include struct columns accessed via `get_field`.
+    required_columns: Vec<usize>,
+    /// Struct field accesses via `get_field`.
+    struct_field_accesses: Vec<StructFieldAccess>,
+    /// Whether nested list columns are supported by the predicate semantics.
+    allow_list_columns: bool,
+    /// The Arrow schema of the parquet file.
+    file_schema: &'schema Schema,
 }
 
 impl<'schema> PushdownChecker<'schema> {
-    fn new(table_schema: &'schema Schema) -> Self {
+    fn new(file_schema: &'schema Schema, allow_list_columns: bool) -> Self {
         Self {
             non_primitive_columns: false,
             projected_columns: false,
-            required_columns: BTreeSet::default(),
-            table_schema,
+            required_columns: Vec::new(),
+            struct_field_accesses: Vec::new(),
+            allow_list_columns,
+            file_schema,
         }
     }
 
+    /// Checks whether a struct's root column exists in the file schema and, if so,
+    /// records its index so the entire struct is decoded for filter evaluation.
+    ///
+    /// This is called when we see a `get_field` expression that resolves to a
+    /// primitive leaf type. We only need the *root* column index because the
+    /// Parquet reader decodes all leaves of a struct together.
+    ///
+    /// # Example
+    ///
+    /// Given file schema `{a: Int32, s: Struct(foo: Utf8, bar: Int64)}` and the
+    /// expression `get_field(s, 'foo') = 'hello'`:
+    ///
+    /// - `column_name` = `"s"` (the root struct column)
+    /// - `file_schema.index_of("s")` returns `1`
+    /// - We push `1` into `required_columns`
+    /// - Return `None` (no issue — traversal continues in the caller)
+    ///
+    /// If `"s"` is not in the file schema (e.g. a projected-away column), we set
+    /// `projected_columns = true` and return `Jump` to skip the subtree.
+    fn check_struct_field_column(
+        &mut self,
+        column_name: &str,
+        field_path: Vec<String>,
+    ) -> Option<TreeNodeRecursion> {
+        let Ok(idx) = self.file_schema.index_of(column_name) else {
+            self.projected_columns = true;
+            return Some(TreeNodeRecursion::Jump);
+        };
+
+        self.struct_field_accesses.push(StructFieldAccess {
+            root_index: idx,
+            field_path,
+        });
+
+        None
+    }
+
     fn check_single_column(&mut self, column_name: &str) -> Option<TreeNodeRecursion> {
-        if let Ok(idx) = self.table_schema.index_of(column_name) {
-            self.required_columns.insert(idx);
-            if DataType::is_nested(self.table_schema.field(idx).data_type()) {
-                self.non_primitive_columns = true;
+        let idx = match self.file_schema.index_of(column_name) {
+            Ok(idx) => idx,
+            Err(_) => {
+                // Column does not exist in the file schema, so we can't push this down.
+                self.projected_columns = true;
                 return Some(TreeNodeRecursion::Jump);
             }
+        };
+
+        // Duplicates are handled by dedup() in into_sorted_columns()
+        self.required_columns.push(idx);
+        let data_type = self.file_schema.field(idx).data_type();
+
+        if DataType::is_nested(data_type) {
+            self.handle_nested_type(data_type)
         } else {
-            // If the column does not exist in the (un-projected) table schema then
-            // it must be a projected column.
-            self.projected_columns = true;
-            return Some(TreeNodeRecursion::Jump);
+            None
         }
+    }
 
-        None
+    /// Determines whether a nested data type can be pushed down to Parquet decoding.
+    ///
+    /// Returns `Some(TreeNodeRecursion::Jump)` if the nested type prevents pushdown,
+    /// `None` if the type is supported and pushdown can continue.
+    fn handle_nested_type(&mut self, data_type: &DataType) -> Option<TreeNodeRecursion> {
+        if self.is_nested_type_supported(data_type) {
+            None
+        } else {
+            // Block pushdown for unsupported nested types:
+            // - Structs (regardless of predicate support)
+            // - Lists without supported predicates
+            self.non_primitive_columns = true;
+            Some(TreeNodeRecursion::Jump)
+        }
+    }
+
+    /// Checks if a nested data type is supported for list column pushdown.
+    ///
+    /// List columns are only supported if:
+    /// 1. The data type is a list variant (List, LargeList, or FixedSizeList)
+    /// 2. The expression contains supported list predicates (e.g., array_has_all)
+    fn is_nested_type_supported(&self, data_type: &DataType) -> bool {
+        let is_list = matches!(
+            data_type,
+            DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _)
+        );
+        self.allow_list_columns && is_list
     }
 
     #[inline]
     fn prevents_pushdown(&self) -> bool {
         self.non_primitive_columns || self.projected_columns
     }
+
+    /// Consumes the checker and returns sorted, deduplicated column indices
+    /// wrapped in a `PushdownColumns` struct.
+    ///
+    /// This method sorts the column indices and removes duplicates. The sort
+    /// is required because downstream code relies on column indices being in
+    /// ascending order for correct schema projection.
+    fn into_sorted_columns(mut self) -> PushdownColumns {
+        self.required_columns.sort_unstable();
+        self.required_columns.dedup();
+        PushdownColumns {
+            required_columns: self.required_columns,
+            struct_field_accesses: self.struct_field_accesses,
+        }
+    }
 }
 
 impl TreeNodeVisitor<'_> for PushdownChecker<'_> {
     type Node = Arc<dyn PhysicalExpr>;
 
     fn f_down(&mut self, node: &Self::Node) -> Result<TreeNodeRecursion> {
-        if let Some(column) = node.as_any().downcast_ref::<Column>() {
-            if let Some(recursion) = self.check_single_column(column.name()) {
-                return Ok(recursion);
+        // Handle struct field access like `s['foo']['bar'] > 10`.
+        //
+        // DataFusion represents nested field access as `get_field(Column("s"), "foo")`
+        // (or chained: `get_field(get_field(Column("s"), "foo"), "bar")`).
+        //
+        // We intercept the outermost `get_field` on the way *down* the tree so
+        // the visitor never reaches the raw `Column("s")` node. Without this,
+        // `check_single_column` would see that `s` is a Struct and reject it.
+        //
+        // The strategy:
+        //   1. Match `get_field` whose first arg is a `Column` (the struct root).
+        //   2. Check that the *resolved* return type is primitive — meaning we've
+        //      drilled all the way to a leaf (e.g. `s['foo']` → Utf8).
+        //   3. Record the root column index via `check_struct_field_column` and
+        //      return `Jump` to skip visiting the children (the Column and the
+        //      literal field-name args), since we've already handled them.
+        //
+        // If the return type is still nested (e.g. `s['nested_struct']` → Struct),
+        // we fall through and let normal traversal continue, which will
+        // eventually reject the expression when it hits the struct Column.
+        if let Some(func) =
+            ScalarFunctionExpr::try_downcast_func::<GetFieldFunc>(node.as_ref())
+        {
+            let args = func.args();
+
+            if let Some(column) = args
+                .first()
+                .and_then(|a| a.as_any().downcast_ref::<Column>())
+            {
+                let return_type = func.return_type();
+
+                if !DataType::is_nested(return_type)
+                    || self.is_nested_type_supported(return_type)
+                {
+                    // try to resolve all field name arguments to strinrg literals
+                    // if any argument is not a string literal, we can not determine the exact
+                    // leaf path so we fall back to reading the entire struct root column
+                    let field_path = args[1..]
+                        .iter()
+                        .map(|arg| {
+                            arg.as_any().downcast_ref::<Literal>().and_then(|lit| {
+                                lit.value().try_as_str().flatten().map(|s| s.to_string())
+                            })
+                        })
+                        .collect();
+
+                    match field_path {
+                        Some(path) => {
+                            if let Some(recursion) =
+                                self.check_struct_field_column(column.name(), path)
+                            {
+                                return Ok(recursion);
+                            }
+                        }
+                        None => {
+                            // Could not resolve field path — fall back to
+                            // reading the entire struct root column.
+                            if let Some(recursion) =
+                                self.check_single_column(column.name())
+                            {
+                                return Ok(recursion);
+                            }
+                        }
+                    }
+
+                    return Ok(TreeNodeRecursion::Jump);
+                }
             }
         }
 
+        if let Some(column) = node.as_any().downcast_ref::<Column>()
+            && let Some(recursion) = self.check_single_column(column.name())
+        {
+            return Ok(recursion);
+        }
+
         Ok(TreeNodeRecursion::Continue)
     }
 }
 
-// Checks if a given expression can be pushed down into `DataSourceExec` as opposed to being evaluated
-// post-parquet-scan in a `FilterExec`. If it can be pushed down, this returns all the
-// columns in the given expression so that they can be used in the parquet scanning, along with the
-// expression rewritten as defined in [`PushdownChecker::f_up`]
+/// Describes the nested column behavior for filter pushdown.
+///
+/// This enum makes explicit the different states a predicate can be in
+/// with respect to nested column handling during Parquet decoding.
+/// Result of checking which columns are required for filter pushdown.
+#[derive(Debug)]
+struct PushdownColumns {
+    /// Sorted, unique column indices into the file schema required to evaluate
+    /// the filter expression. Must be in ascending order for correct schema
+    /// projection matching. Does not include struct columns accessed via `get_field`.
+    required_columns: Vec<usize>,
+    /// Struct field accesses via `get_field`. Each entry records the root struct
+    /// column index and the field path being accessed.
+    struct_field_accesses: Vec<StructFieldAccess>,
+}
+
+/// Records a struct field access via `get_field(struct_col, 'field1', 'field2', ...)`.
+///
+/// This allows the row filter to project only the specific Parquet leaf columns
+/// needed by the filter, rather than all leaves of the struct.
+#[derive(Debug, Clone)]
+struct StructFieldAccess {
+    /// Arrow root column index of the struct in the file schema.
+    root_index: usize,
+    /// Field names forming the path into the struct.
+    /// e.g., `["value"]` for `s['value']`, `["outer", "inner"]` for `s['outer']['inner']`.
+    field_path: Vec<String>,
+}
+
+/// Checks if a given expression can be pushed down to the parquet decoder.
+///
+/// Returns `Some(PushdownColumns)` if the expression can be pushed down,
+/// where the struct contains the indices into the file schema of all columns
+/// required to evaluate the expression.
+///
+/// Returns `None` if the expression cannot be pushed down (e.g., references
+/// unsupported nested types or columns not in the file).
 fn pushdown_columns(
     expr: &Arc<dyn PhysicalExpr>,
-    table_schema: &Schema,
-) -> Result<Option<Vec<usize>>> {
-    let mut checker = PushdownChecker::new(table_schema);
+    file_schema: &Schema,
+) -> Result<Option<PushdownColumns>> {
+    let allow_list_columns = supports_list_predicates(expr);
+    let mut checker = PushdownChecker::new(file_schema, allow_list_columns);
     expr.visit(&mut checker)?;
-    Ok((!checker.prevents_pushdown())
-        .then_some(checker.required_columns.into_iter().collect()))
+    Ok((!checker.prevents_pushdown()).then(|| checker.into_sorted_columns()))
+}
+
+/// Resolves which Parquet leaf columns and Arrow schema fields are needed
+/// to evaluate `expr` against a Parquet file
+///
+/// Returns `Ok(Some((plan, required_bytes)))` when the expression can be
+/// evaluated using only pushdown-compatible columns. `Ok(None)` when it
+/// cannot (it references whole struct columns or columns missing from disk).
+///
+/// The `required_bytes` is the total compressed size of all referenced columns
+/// across all row groups, used to estimate filter evaluation cost.
+///
+/// Note: this is a shared entry point used by both row filter construction and
+/// the opener's projection logic
+pub(crate) fn build_parquet_read_plan(
+    expr: &Arc<dyn PhysicalExpr>,
+    file_schema: &Schema,
+    metadata: &ParquetMetaData,
+) -> Result<Option<(ParquetReadPlan, usize)>> {
+    let schema_descr = metadata.file_metadata().schema_descr();
+
+    let Some(required_columns) = pushdown_columns(expr, file_schema)? else {
+        return Ok(None);
+    };
+
+    let root_indices = &required_columns.required_columns;
+
+    let mut leaf_indices =
+        leaf_indices_for_roots(root_indices.iter().copied(), schema_descr);
+
+    let struct_leaf_indices = resolve_struct_field_leaves(
+        &required_columns.struct_field_accesses,
+        file_schema,
+        schema_descr,
+    );
+    leaf_indices.extend_from_slice(&struct_leaf_indices);
+    leaf_indices.sort_unstable();
+    leaf_indices.dedup();
+
+    let required_bytes = size_of_columns(&leaf_indices, metadata)?;
+
+    let projection_mask =
+        ProjectionMask::leaves(schema_descr, leaf_indices.iter().copied());
+
+    let projected_schema = build_filter_schema(
+        file_schema,
+        root_indices,
+        &required_columns.struct_field_accesses,
+    );
+
+    Ok(Some((
+        ParquetReadPlan {
+            projection_mask,
+            projected_schema,
+        },
+        required_bytes,
+    )))
 }
 
-/// Recurses through expr as a tree, finds all `column`s, and checks if any of them would prevent
-/// this expression from being predicate pushed down. If any of them would, this returns false.
-/// Otherwise, true.
-/// Note that the schema passed in here is *not* the physical file schema (as it is not available at that point in time);
-/// it is the schema of the table that this expression is being evaluated against minus any projected columns and partition columns.
+fn leaf_indices_for_roots<I>(
+    root_indices: I,
+    schema_descr: &SchemaDescriptor,
+) -> Vec<usize>
+where
+    I: IntoIterator<Item = usize>,
+{
+    // Always map root (Arrow) indices to Parquet leaf indices via the schema
+    // descriptor. Arrow root indices only equal Parquet leaf indices when the
+    // schema has no group columns (Struct, Map, etc.); when group columns
+    // exist, their children become separate leaves and shift all subsequent
+    // leaf indices.
+    // Struct columns are unsupported.
+    let root_set: BTreeSet<_> = root_indices.into_iter().collect();
+
+    (0..schema_descr.num_columns())
+        .filter(|leaf_idx| {
+            root_set.contains(&schema_descr.get_column_root_idx(*leaf_idx))
+        })
+        .collect()
+}
+
+/// Resolves struct field access to specific Parquet leaf column indices
+///
+/// For every `StructFieldAccess`, finds the leaf columns in the Parquet schema
+/// whose path matches the struct root name + field path. This avoids reading all
+/// leaves of a struct when only specific fields are needed
+fn resolve_struct_field_leaves(
+    accesses: &[StructFieldAccess],
+    file_schema: &Schema,
+    schema_descr: &SchemaDescriptor,
+) -> Vec<usize> {
+    let mut leaf_indices = Vec::new();
+
+    for access in accesses {
+        let root_name = file_schema.field(access.root_index).name();
+        let prefix = std::iter::once(root_name.as_str())
+            .chain(access.field_path.iter().map(|p| p.as_str()))
+            .collect::<Vec<_>>();
+
+        for leaf_idx in 0..schema_descr.num_columns() {
+            let col = schema_descr.column(leaf_idx);
+            let col_path = col.path().parts();
+
+            // A leaf matches if its path starts with our prefix.
+            // e.g., prefix=["s", "value"] matches leaf path ["s", "value"]
+            // prefix=["s", "outer"] matches ["s", "outer", "inner"]
+
+            // a leaf matches iff its path starts with our prefix
+            // for example: prefix=["s", "value"] matches leaf path ["s", "value"]
+            //              prefix=["s", "outer"] matches ["s", "outer", "inner"]
+            let leaf_matches_path = col_path.len() >= prefix.len()
+                && col_path.iter().zip(prefix.iter()).all(|(a, b)| a == b);
+
+            if leaf_matches_path {
+                leaf_indices.push(leaf_idx);
+            }
+        }
+    }
+
+    leaf_indices
+}
+
+/// Builds a filter schema that includes only the fields actually accessed by the
+/// filter expression.
+///
+/// For regular (non-struct) columns, the full field type is used.
+/// For struct columns accessed via `get_field`, a pruned struct type is created
+/// containing only the fields along the access path. Note: it must match the schema
+/// that the Parquet reader produces when projecting specific struct leaves
+fn build_filter_schema(
+    file_schema: &Schema,
+    regular_indices: &[usize],
+    struct_field_accesses: &[StructFieldAccess],
+) -> SchemaRef {
+    let all_indices = regular_indices
+        .iter()
+        .copied()
+        .chain(
+            struct_field_accesses
+                .iter()
+                .map(|&StructFieldAccess { root_index, .. }| root_index),
+        )
+        .collect::<BTreeSet<_>>();
+
+    let fields = all_indices
+        .iter()
+        .map(|&idx| {
+            let field = file_schema.field(idx);
+
+            // collect all field paths that access this root struct column
+            let field_paths = struct_field_accesses
+                .iter()
+                .filter_map(
+                    |&StructFieldAccess {
+                         root_index,
+                         ref field_path,
+                     }| {
+                        (root_index == idx).then_some(field_path.as_slice())
+                    },
+                )
+                .collect::<Vec<_>>();
+
+            if field_paths.is_empty() {
+                // its a regular column - use the full type
+                return Arc::new(field.clone());
+            }
+
+            let pruned_data_type = prune_struct_type(field.data_type(), &field_paths);
+            Arc::new(Field::new(
+                field.name(),
+                pruned_data_type,
+                field.is_nullable(),
+            ))
+        })
+        .collect::<Vec<_>>();
+
+    Arc::new(Schema::new(fields))
+}
+
+fn prune_struct_type(dt: &DataType, paths: &[&[String]]) -> DataType {
+    let DataType::Struct(fields) = dt else {
+        return dt.clone();
+    };
+
+    let needed = paths
+        .iter()
+        .filter_map(|p| p.first().map(|s| s.as_str()))
+        .collect::<BTreeSet<_>>();
+
+    let pruned_fields = fields
+        .iter()
+        .filter_map(|f| {
+            if !needed.contains(f.name().as_str()) {
+                return None;
+            }
+
+            let sub_paths = paths
+                .iter()
+                .filter_map(|path| {
+                    if path.first().map(|s| s.as_str()) == Some(f.name()) {
+                        Some(&path[1..])
+                    } else {
+                        None
+                    }
+                })
+                .filter(|sub| !sub.is_empty())
+                .collect::<Vec<_>>();
+
+            let out = if sub_paths.is_empty() {
+                // Leaf of access path — keep the field as-is.
+                Arc::clone(f)
+            } else {
+                // Recurse into nested struct.
+                let pruned = prune_struct_type(f.data_type(), &sub_paths);
+                Arc::new(Field::new(f.name(), pruned, f.is_nullable()))
+            };
+
+            Some(out)
+        })
+        .collect::<Vec<_>>();
+
+    DataType::Struct(pruned_fields.into())
+}
+
+/// Checks if a predicate expression can be pushed down to the parquet decoder.
+///
+/// Returns `true` if all columns referenced by the expression:
+/// - Exist in the provided schema
+/// - Are primitive types OR list columns with supported predicates
+///   (e.g., `array_has`, `array_has_all`, `array_has_any`, IS NULL, IS NOT NULL)
+/// - Are struct columns accessed via `get_field` where the leaf type is primitive
+/// - Direct references to whole struct columns will prevent pushdown
+///
+/// # Arguments
+/// * `expr` - The filter expression to check
+/// * `file_schema` - The Arrow schema of the parquet file (or table schema when
+///   the file schema is not yet available during planning)
+///
+/// # Examples
+///
+/// Primitive column filters can be pushed down:
+/// ```ignore
+/// use datafusion_expr::{col, Expr};
+/// use datafusion_common::ScalarValue;
+/// use arrow::datatypes::{DataType, Field, Schema};
+/// use std::sync::Arc;
+///
+/// let schema = Arc::new(Schema::new(vec![
+///     Field::new("age", DataType::Int32, false),
+/// ]));
+///
+/// // Primitive filter: can be pushed down
+/// let expr = col("age").gt(Expr::Literal(ScalarValue::Int32(Some(30)), None));
+/// let expr = logical2physical(&expr, &schema);
+/// assert!(can_expr_be_pushed_down_with_schemas(&expr, &schema));
+/// ```
+///
+/// Struct column filters cannot be pushed down:
+/// ```ignore
+/// use arrow::datatypes::Fields;
+///
+/// let schema = Arc::new(Schema::new(vec![
+///     Field::new("person", DataType::Struct(
+///         Fields::from(vec![Field::new("name", DataType::Utf8, true)])
+///     ), true),
+/// ]));
+///
+/// // Struct filter: cannot be pushed down
+/// let expr = col("person").is_not_null();
+/// let expr = logical2physical(&expr, &schema);
+/// assert!(!can_expr_be_pushed_down_with_schemas(&expr, &schema));
+/// ```
+///
+/// List column filters with supported predicates can be pushed down:
+/// ```ignore
+/// use datafusion_functions_nested::expr_fn::{array_has_all, make_array};
+///
+/// let schema = Arc::new(Schema::new(vec![
+///     Field::new("tags", DataType::List(
+///         Arc::new(Field::new("item", DataType::Utf8, true))
+///     ), true),
+/// ]));
+///
+/// // Array filter with supported predicate: can be pushed down
+/// let expr = array_has_all(col("tags"), make_array(vec![
+///     Expr::Literal(ScalarValue::Utf8(Some("rust".to_string())), None)
+/// ]));
+/// let expr = logical2physical(&expr, &schema);
+/// assert!(can_expr_be_pushed_down_with_schemas(&expr, &schema));
+/// ```
 pub fn can_expr_be_pushed_down_with_schemas(
     expr: &Arc<dyn PhysicalExpr>,
     file_schema: &Schema,
@@ -382,7 +820,7 @@ pub fn can_expr_be_pushed_down_with_schemas(
     }
 }
 
-/// Calculate the total compressed size of all `Column`'s required for
+/// Calculate the total compressed size of all leaf columns required for
 /// predicate `Expr`.
 ///
 /// This value represents the total amount of IO required to evaluate the
@@ -399,38 +837,33 @@ fn size_of_columns(columns: &[usize], metadata: &ParquetMetaData) -> Result<usiz
     Ok(total_size)
 }
 
-/// For a given set of `Column`s required for predicate `Expr` determine whether
-/// all columns are sorted.
+/// Build a [`RowFilter`] from the given predicate expression if possible.
 ///
-/// Sorted columns may be queried more efficiently in the presence of
-/// a PageIndex.
-fn columns_sorted(_columns: &[usize], _metadata: &ParquetMetaData) -> Result<bool> {
-    // TODO How do we know this?
-    Ok(false)
-}
-
-/// Build a [`RowFilter`] from the given predicate `Expr` if possible
+/// # Arguments
+/// * `expr` - The filter predicate, already adapted to reference columns in `file_schema`
+/// * `file_schema` - The Arrow schema of the parquet file (the result of converting
+///   the parquet schema to Arrow, potentially with type coercions applied)
+/// * `metadata` - Parquet file metadata used for cost estimation
+/// * `reorder_predicates` - If true, reorder predicates to minimize I/O
+/// * `file_metrics` - Metrics for tracking filter performance
 ///
-/// # returns
-/// * `Ok(Some(row_filter))` if the expression can be used as RowFilter
-/// * `Ok(None)` if the expression cannot be used as an RowFilter
+/// # Returns
+/// * `Ok(Some(row_filter))` if the expression can be used as a RowFilter
+/// * `Ok(None)` if the expression cannot be used as a RowFilter
 /// * `Err(e)` if an error occurs while building the filter
 ///
-/// Note that the returned `RowFilter` may not contains all conjuncts in the
-/// original expression. This is because some conjuncts may not be able to be
-/// evaluated as an `ArrowPredicate` and will be ignored.
+/// Note: The returned `RowFilter` may not contain all conjuncts from the original
+/// expression. Conjuncts that cannot be evaluated as an `ArrowPredicate` are ignored.
 ///
 /// For example, if the expression is `a = 1 AND b = 2 AND c = 3` and `b = 2`
-/// can not be evaluated for some reason, the returned `RowFilter` will contain
-/// `a = 1` and `c = 3`.
+/// cannot be evaluated for some reason, the returned `RowFilter` will contain
+/// only `a = 1` and `c = 3`.
 pub fn build_row_filter(
     expr: &Arc<dyn PhysicalExpr>,
-    physical_file_schema: &SchemaRef,
-    logical_file_schema: &SchemaRef,
+    file_schema: &SchemaRef,
     metadata: &ParquetMetaData,
     reorder_predicates: bool,
     file_metrics: &ParquetFileMetrics,
-    schema_adapter_factory: &Arc<dyn SchemaAdapterFactory>,
 ) -> Result<Option<RowFilter>> {
     let rows_pruned = &file_metrics.pushdown_rows_pruned;
     let rows_matched = &file_metrics.pushdown_rows_matched;
@@ -444,13 +877,8 @@ pub fn build_row_filter(
     let mut candidates: Vec<FilterCandidate> = predicates
         .into_iter()
         .map(|expr| {
-            FilterCandidateBuilder::new(
-                Arc::clone(expr),
-                Arc::clone(physical_file_schema),
-                Arc::clone(logical_file_schema),
-                Arc::clone(schema_adapter_factory),
-            )
-            .build(metadata)
+            FilterCandidateBuilder::new(Arc::clone(expr), Arc::clone(file_schema))
+                .build(metadata)
         })
         .collect::<Result<Vec<_>, _>>()?
         .into_iter()
@@ -463,22 +891,35 @@ pub fn build_row_filter(
     }
 
     if reorder_predicates {
-        candidates.sort_unstable_by(|c1, c2| {
-            match c1.can_use_index.cmp(&c2.can_use_index) {
-                Ordering::Equal => c1.required_bytes.cmp(&c2.required_bytes),
-                ord => ord,
-            }
-        });
+        candidates.sort_unstable_by_key(|c| c.required_bytes);
     }
 
+    // To avoid double-counting metrics when multiple predicates are used:
+    // - All predicates should count rows_pruned (cumulative pruned rows)
+    // - Only the last predicate should count rows_matched (final result)
+    // This ensures: rows_matched + rows_pruned = total rows processed
+    let total_candidates = candidates.len();
+
     candidates
         .into_iter()
-        .map(|candidate| {
+        .enumerate()
+        .map(|(idx, candidate)| {
+            let is_last = idx == total_candidates - 1;
+
+            // All predicates share the pruned counter (cumulative)
+            let predicate_rows_pruned = rows_pruned.clone();
+
+            // Only the last predicate tracks matched rows (final result)
+            let predicate_rows_matched = if is_last {
+                rows_matched.clone()
+            } else {
+                metrics::Count::new()
+            };
+
             DatafusionArrowPredicate::try_new(
                 candidate,
-                metadata,
-                rows_pruned.clone(),
-                rows_matched.clone(),
+                predicate_rows_pruned,
+                predicate_rows_matched,
                 time.clone(),
             )
             .map(|pred| Box::new(pred) as _)
@@ -490,21 +931,36 @@ pub fn build_row_filter(
 #[cfg(test)]
 mod test {
     use super::*;
+    use arrow::datatypes::Fields;
     use datafusion_common::ScalarValue;
 
+    use arrow::array::{
+        Int32Array, ListBuilder, StringArray, StringBuilder, StructArray,
+    };
     use arrow::datatypes::{Field, TimeUnit::Nanosecond};
-    use datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory;
-    use datafusion_expr::{col, Expr};
+    use datafusion_expr::{Expr, col};
+    use datafusion_functions::core::get_field;
+    use datafusion_functions_nested::array_has::{
+        array_has_all_udf, array_has_any_udf, array_has_udf,
+    };
+    use datafusion_functions_nested::expr_fn::{
+        array_has, array_has_all, array_has_any, make_array,
+    };
     use datafusion_physical_expr::planner::logical2physical;
-    use datafusion_physical_plan::metrics::{Count, Time};
+    use datafusion_physical_expr_adapter::{
+        DefaultPhysicalExprAdapterFactory, PhysicalExprAdapterFactory,
+    };
+    use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, Time};
 
+    use parquet::arrow::ArrowWriter;
     use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
     use parquet::arrow::parquet_to_arrow_schema;
     use parquet::file::reader::{FileReader, SerializedFileReader};
+    use tempfile::NamedTempFile;
 
-    // We should ignore predicate that read non-primitive columns
+    // List predicates used by the decoder should be accepted for pushdown
     #[test]
-    fn test_filter_candidate_builder_ignore_complex_types() {
+    fn test_filter_candidate_builder_supports_list_types() {
         let testdata = datafusion_common::test_util::parquet_test_data();
         let file = std::fs::File::open(format!("{testdata}/list_columns.parquet"))
             .expect("opening file");
@@ -520,19 +976,20 @@ mod test {
         let expr = col("int64_list").is_not_null();
         let expr = logical2physical(&expr, &table_schema);
 
-        let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory);
         let table_schema = Arc::new(table_schema.clone());
 
-        let candidate = FilterCandidateBuilder::new(
-            expr,
-            table_schema.clone(),
-            table_schema,
-            schema_adapter_factory,
-        )
-        .build(metadata)
-        .expect("building candidate");
+        let list_index = table_schema
+            .index_of("int64_list")
+            .expect("list column should exist");
 
-        assert!(candidate.is_none());
+        let candidate = FilterCandidateBuilder::new(expr, table_schema)
+            .build(metadata)
+            .expect("building candidate")
+            .expect("list pushdown should be supported");
+
+        let expected_mask =
+            ProjectionMask::leaves(metadata.file_metadata().schema_descr(), [list_index]);
+        assert_eq!(candidate.read_plan.projection_mask, expected_mask);
     }
 
     #[test]
@@ -557,23 +1014,21 @@ mod test {
         // Test all should fail
         let expr = col("timestamp_col").lt(Expr::Literal(
             ScalarValue::TimestampNanosecond(Some(1), Some(Arc::from("UTC"))),
+            None,
         ));
         let expr = logical2physical(&expr, &table_schema);
-        let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory);
-        let table_schema = Arc::new(table_schema.clone());
-        let candidate = FilterCandidateBuilder::new(
-            expr,
-            file_schema.clone(),
-            table_schema.clone(),
-            schema_adapter_factory,
-        )
-        .build(&metadata)
-        .expect("building candidate")
-        .expect("candidate expected");
+        let expr = DefaultPhysicalExprAdapterFactory {}
+            .create(Arc::new(table_schema.clone()), Arc::clone(&file_schema))
+            .expect("creating expr adapter")
+            .rewrite(expr)
+            .expect("rewriting expression");
+        let candidate = FilterCandidateBuilder::new(expr, file_schema.clone())
+            .build(&metadata)
+            .expect("building candidate")
+            .expect("candidate expected");
 
         let mut row_filter = DatafusionArrowPredicate::try_new(
             candidate,
-            &metadata,
             Count::new(),
             Count::new(),
             Time::new(),
@@ -597,22 +1052,22 @@ mod test {
         // Test all should pass
         let expr = col("timestamp_col").gt(Expr::Literal(
             ScalarValue::TimestampNanosecond(Some(0), Some(Arc::from("UTC"))),
+            None,
         ));
         let expr = logical2physical(&expr, &table_schema);
-        let schema_adapter_factory = Arc::new(DefaultSchemaAdapterFactory);
-        let candidate = FilterCandidateBuilder::new(
-            expr,
-            file_schema,
-            table_schema,
-            schema_adapter_factory,
-        )
-        .build(&metadata)
-        .expect("building candidate")
-        .expect("candidate expected");
+        // Rewrite the expression to add CastExpr for type coercion
+        let expr = DefaultPhysicalExprAdapterFactory {}
+            .create(Arc::new(table_schema), Arc::clone(&file_schema))
+            .expect("creating expr adapter")
+            .rewrite(expr)
+            .expect("rewriting expression");
+        let candidate = FilterCandidateBuilder::new(expr, file_schema)
+            .build(&metadata)
+            .expect("building candidate")
+            .expect("candidate expected");
 
         let mut row_filter = DatafusionArrowPredicate::try_new(
             candidate,
-            &metadata,
             Count::new(),
             Count::new(),
             Time::new(),
@@ -624,14 +1079,233 @@ mod test {
     }
 
     #[test]
-    fn nested_data_structures_prevent_pushdown() {
+    fn struct_data_structures_prevent_pushdown() {
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "struct_col",
+            DataType::Struct(
+                vec![Arc::new(Field::new("a", DataType::Int32, true))].into(),
+            ),
+            true,
+        )]));
+
+        let expr = col("struct_col").is_not_null();
+        let expr = logical2physical(&expr, &table_schema);
+
+        assert!(!can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    #[test]
+    fn mixed_primitive_and_struct_prevents_pushdown() {
+        // Even when a predicate contains both primitive and unsupported nested columns,
+        // the entire predicate should not be pushed down because the struct column
+        // cannot be evaluated during Parquet decoding.
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "struct_col",
+                DataType::Struct(
+                    vec![Arc::new(Field::new("a", DataType::Int32, true))].into(),
+                ),
+                true,
+            ),
+            Field::new("int_col", DataType::Int32, false),
+        ]));
+
+        // Expression: (struct_col IS NOT NULL) AND (int_col = 5)
+        // Even though int_col is primitive, the presence of struct_col in the
+        // conjunction should prevent pushdown of the entire expression.
+        let expr = col("struct_col")
+            .is_not_null()
+            .and(col("int_col").eq(Expr::Literal(ScalarValue::Int32(Some(5)), None)));
+        let expr = logical2physical(&expr, &table_schema);
+
+        // The entire expression should not be pushed down
+        assert!(!can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+
+        // However, just the int_col predicate alone should be pushable
+        let expr_int_only =
+            col("int_col").eq(Expr::Literal(ScalarValue::Int32(Some(5)), None));
+        let expr_int_only = logical2physical(&expr_int_only, &table_schema);
+        assert!(can_expr_be_pushed_down_with_schemas(
+            &expr_int_only,
+            &table_schema
+        ));
+    }
+
+    #[test]
+    fn nested_lists_allow_pushdown_checks() {
         let table_schema = Arc::new(get_lists_table_schema());
 
         let expr = col("utf8_list").is_not_null();
         let expr = logical2physical(&expr, &table_schema);
         check_expression_can_evaluate_against_schema(&expr, &table_schema);
 
-        assert!(!can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+        assert!(can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    #[test]
+    fn array_has_all_pushdown_filters_rows() {
+        // Test array_has_all: checks if array contains all of ["c"]
+        // Rows with "c": row 1 and row 2
+        let expr = array_has_all(
+            col("letters"),
+            make_array(vec![Expr::Literal(
+                ScalarValue::Utf8(Some("c".to_string())),
+                None,
+            )]),
+        );
+        test_array_predicate_pushdown("array_has_all", expr, 1, 2, true);
+    }
+
+    /// Helper function to test array predicate pushdown functionality.
+    ///
+    /// Creates a Parquet file with a list column, applies the given predicate,
+    /// and verifies that rows are correctly filtered during decoding.
+    fn test_array_predicate_pushdown(
+        func_name: &str,
+        predicate_expr: Expr,
+        expected_pruned: usize,
+        expected_matched: usize,
+        expect_list_support: bool,
+    ) {
+        let item_field = Arc::new(Field::new("item", DataType::Utf8, true));
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "letters",
+            DataType::List(item_field),
+            true,
+        )]));
+
+        let mut builder = ListBuilder::new(StringBuilder::new());
+        // Row 0: ["a", "b"]
+        builder.values().append_value("a");
+        builder.values().append_value("b");
+        builder.append(true);
+
+        // Row 1: ["c"]
+        builder.values().append_value("c");
+        builder.append(true);
+
+        // Row 2: ["c", "d"]
+        builder.values().append_value("c");
+        builder.values().append_value("d");
+        builder.append(true);
+
+        let batch =
+            RecordBatch::try_new(schema.clone(), vec![Arc::new(builder.finish())])
+                .expect("record batch");
+
+        let file = NamedTempFile::new().expect("temp file");
+        let mut writer =
+            ArrowWriter::try_new(file.reopen().unwrap(), schema, None).expect("writer");
+        writer.write(&batch).expect("write batch");
+        writer.close().expect("close writer");
+
+        let reader_file = file.reopen().expect("reopen file");
+        let parquet_reader_builder =
+            ParquetRecordBatchReaderBuilder::try_new(reader_file)
+                .expect("reader builder");
+        let metadata = parquet_reader_builder.metadata().clone();
+        let file_schema = parquet_reader_builder.schema().clone();
+
+        let expr = logical2physical(&predicate_expr, &file_schema);
+        if expect_list_support {
+            assert!(supports_list_predicates(&expr));
+        }
+
+        let metrics = ExecutionPlanMetricsSet::new();
+        let file_metrics =
+            ParquetFileMetrics::new(0, &format!("{func_name}.parquet"), &metrics);
+
+        let row_filter =
+            build_row_filter(&expr, &file_schema, &metadata, false, &file_metrics)
+                .expect("building row filter")
+                .expect("row filter should exist");
+
+        let reader = parquet_reader_builder
+            .with_row_filter(row_filter)
+            .build()
+            .expect("build reader");
+
+        let mut total_rows = 0;
+        for batch in reader {
+            let batch = batch.expect("record batch");
+            total_rows += batch.num_rows();
+        }
+
+        assert_eq!(
+            file_metrics.pushdown_rows_pruned.value(),
+            expected_pruned,
+            "{func_name}: expected {expected_pruned} pruned rows"
+        );
+        assert_eq!(
+            file_metrics.pushdown_rows_matched.value(),
+            expected_matched,
+            "{func_name}: expected {expected_matched} matched rows"
+        );
+        assert_eq!(
+            total_rows, expected_matched,
+            "{func_name}: expected {expected_matched} total rows"
+        );
+    }
+
+    #[test]
+    fn array_has_pushdown_filters_rows() {
+        // Test array_has: checks if "c" is in the array
+        // Rows with "c": row 1 and row 2
+        let expr = array_has(
+            col("letters"),
+            Expr::Literal(ScalarValue::Utf8(Some("c".to_string())), None),
+        );
+        test_array_predicate_pushdown("array_has", expr, 1, 2, true);
+    }
+
+    #[test]
+    fn array_has_any_pushdown_filters_rows() {
+        // Test array_has_any: checks if array contains any of ["a", "d"]
+        // Row 0 has "a", row 2 has "d" - both should match
+        let expr = array_has_any(
+            col("letters"),
+            make_array(vec![
+                Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None),
+                Expr::Literal(ScalarValue::Utf8(Some("d".to_string())), None),
+            ]),
+        );
+        test_array_predicate_pushdown("array_has_any", expr, 1, 2, true);
+    }
+
+    #[test]
+    fn array_has_udf_pushdown_filters_rows() {
+        let expr = array_has_udf().call(vec![
+            col("letters"),
+            Expr::Literal(ScalarValue::Utf8(Some("c".to_string())), None),
+        ]);
+
+        test_array_predicate_pushdown("array_has_udf", expr, 1, 2, true);
+    }
+
+    #[test]
+    fn array_has_all_udf_pushdown_filters_rows() {
+        let expr = array_has_all_udf().call(vec![
+            col("letters"),
+            make_array(vec![Expr::Literal(
+                ScalarValue::Utf8(Some("c".to_string())),
+                None,
+            )]),
+        ]);
+
+        test_array_predicate_pushdown("array_has_all_udf", expr, 1, 2, true);
+    }
+
+    #[test]
+    fn array_has_any_udf_pushdown_filters_rows() {
+        let expr = array_has_any_udf().call(vec![
+            col("letters"),
+            make_array(vec![
+                Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None),
+                Expr::Literal(ScalarValue::Utf8(Some("d".to_string())), None),
+            ]),
+        ]);
+
+        test_array_predicate_pushdown("array_has_any_udf", expr, 1, 2, true);
     }
 
     #[test]
@@ -660,7 +1334,7 @@ mod test {
 
         let expr = col("string_col")
             .is_not_null()
-            .or(col("bigint_col").gt(Expr::Literal(ScalarValue::Int64(Some(5)))));
+            .or(col("bigint_col").gt(Expr::Literal(ScalarValue::Int64(Some(5)), None)));
         let expr = logical2physical(&expr, &table_schema);
 
         assert!(can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
@@ -692,6 +1366,454 @@ mod test {
             .expect("parsing schema")
     }
 
+    /// Regression test: when a schema has Struct columns, Arrow field indices diverge
+    /// from Parquet leaf indices (Struct children become separate leaves). The
+    /// `PrimitiveOnly` fast-path in `leaf_indices_for_roots` assumes they are equal,
+    /// so a filter on a primitive column *after* a Struct gets the wrong leaf index.
+    ///
+    /// Schema:
+    ///   Arrow indices:   col_a=0  struct_col=1  col_b=2
+    ///   Parquet leaves:  col_a=0  struct_col.x=1  struct_col.y=2  col_b=3
+    ///
+    /// A filter on col_b should project Parquet leaf 3, but the bug causes it to
+    /// project leaf 2 (struct_col.y).
+    #[test]
+    fn test_filter_pushdown_leaf_index_with_struct_in_schema() {
+        use arrow::array::{Int32Array, StringArray, StructArray};
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("col_a", DataType::Int32, false),
+            Field::new(
+                "struct_col",
+                DataType::Struct(
+                    vec![
+                        Arc::new(Field::new("x", DataType::Int32, true)),
+                        Arc::new(Field::new("y", DataType::Int32, true)),
+                    ]
+                    .into(),
+                ),
+                true,
+            ),
+            Field::new("col_b", DataType::Utf8, false),
+        ]));
+
+        let col_a = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let struct_col = Arc::new(StructArray::from(vec![
+            (
+                Arc::new(Field::new("x", DataType::Int32, true)),
+                Arc::new(Int32Array::from(vec![10, 20, 30])) as _,
+            ),
+            (
+                Arc::new(Field::new("y", DataType::Int32, true)),
+                Arc::new(Int32Array::from(vec![100, 200, 300])) as _,
+            ),
+        ]));
+        let col_b = Arc::new(StringArray::from(vec!["aaa", "target", "zzz"]));
+
+        let batch =
+            RecordBatch::try_new(Arc::clone(&schema), vec![col_a, struct_col, col_b])
+                .unwrap();
+
+        let file = NamedTempFile::new().expect("temp file");
+        let mut writer =
+            ArrowWriter::try_new(file.reopen().unwrap(), Arc::clone(&schema), None)
+                .expect("writer");
+        writer.write(&batch).expect("write batch");
+        writer.close().expect("close writer");
+
+        let reader_file = file.reopen().expect("reopen file");
+        let builder = ParquetRecordBatchReaderBuilder::try_new(reader_file)
+            .expect("reader builder");
+        let metadata = builder.metadata().clone();
+        let file_schema = builder.schema().clone();
+
+        // sanity check: 4 Parquet leaves, 3 Arrow fields
+        assert_eq!(metadata.file_metadata().schema_descr().num_columns(), 4);
+        assert_eq!(file_schema.fields().len(), 3);
+
+        // build a filter candidate for `col_b = 'target'` through the public API
+        let expr = col("col_b").eq(Expr::Literal(
+            ScalarValue::Utf8(Some("target".to_string())),
+            None,
+        ));
+        let expr = logical2physical(&expr, &file_schema);
+
+        let candidate = FilterCandidateBuilder::new(expr, file_schema)
+            .build(&metadata)
+            .expect("building candidate")
+            .expect("filter on primitive col_b should be pushable");
+
+        // col_b is Parquet leaf 3 (shifted by struct_col's two children).
+        let expected_mask =
+            ProjectionMask::leaves(metadata.file_metadata().schema_descr(), [3]);
+        assert_eq!(
+            candidate.read_plan.projection_mask, expected_mask,
+            "projection_mask should select only leaf 3 for col_b"
+        );
+    }
+
+    /// get_field(struct_col, 'a') on a struct with a primitive leaf should allow pushdown.
+    #[test]
+    fn get_field_on_struct_allows_pushdown() {
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "struct_col",
+            DataType::Struct(
+                vec![Arc::new(Field::new("a", DataType::Int32, true))].into(),
+            ),
+            true,
+        )]));
+
+        // get_field(struct_col, 'a') > 5
+        let get_field_expr = get_field().call(vec![
+            col("struct_col"),
+            Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None),
+        ]);
+        let expr = get_field_expr.gt(Expr::Literal(ScalarValue::Int32(Some(5)), None));
+        let expr = logical2physical(&expr, &table_schema);
+
+        assert!(can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    /// get_field on a struct field that resolves to a nested type should still block pushdown.
+    #[test]
+    fn get_field_on_nested_leaf_prevents_pushdown() {
+        let inner_struct = DataType::Struct(
+            vec![Arc::new(Field::new("x", DataType::Int32, true))].into(),
+        );
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "struct_col",
+            DataType::Struct(
+                vec![Arc::new(Field::new("nested", inner_struct, true))].into(),
+            ),
+            true,
+        )]));
+
+        // get_field(struct_col, 'nested') IS NOT NULL — the leaf is still a struct
+        let get_field_expr = get_field().call(vec![
+            col("struct_col"),
+            Expr::Literal(ScalarValue::Utf8(Some("nested".to_string())), None),
+        ]);
+        let expr = get_field_expr.is_not_null();
+        let expr = logical2physical(&expr, &table_schema);
+
+        assert!(!can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    /// get_field returning a list inside a struct should allow pushdown when
+    /// wrapped in a supported list predicate like `array_has_any`.
+    /// e.g. `array_has_any(get_field(s, 'items'), make_array('x'))`
+    #[test]
+    fn get_field_list_leaf_with_array_predicate_allows_pushdown() {
+        let item_field = Arc::new(Field::new("item", DataType::Utf8, true));
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "s",
+            DataType::Struct(
+                vec![
+                    Arc::new(Field::new("id", DataType::Int32, true)),
+                    Arc::new(Field::new("items", DataType::List(item_field), true)),
+                ]
+                .into(),
+            ),
+            true,
+        )]));
+
+        // array_has_any(get_field(s, 'items'), make_array('x'))
+        let get_field_expr = get_field().call(vec![
+            col("s"),
+            Expr::Literal(ScalarValue::Utf8(Some("items".to_string())), None),
+        ]);
+        let expr = array_has_any(
+            get_field_expr,
+            make_array(vec![Expr::Literal(
+                ScalarValue::Utf8(Some("x".to_string())),
+                None,
+            )]),
+        );
+        let expr = logical2physical(&expr, &table_schema);
+
+        assert!(can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    /// get_field on a struct produces correct Parquet leaf indices.
+    #[test]
+    fn get_field_filter_candidate_has_correct_leaf_indices() {
+        use arrow::array::{Int32Array, StringArray, StructArray};
+
+        // Schema: id (Int32), s (Struct{value: Int32, label: Utf8})
+        // Parquet leaves: id=0, s.value=1, s.label=2
+        let struct_fields: Fields = vec![
+            Arc::new(Field::new("value", DataType::Int32, false)),
+            Arc::new(Field::new("label", DataType::Utf8, false)),
+        ]
+        .into();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("s", DataType::Struct(struct_fields.clone()), false),
+        ]));
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StructArray::new(
+                    struct_fields,
+                    vec![
+                        Arc::new(Int32Array::from(vec![10, 20, 30])) as _,
+                        Arc::new(StringArray::from(vec!["a", "b", "c"])) as _,
+                    ],
+                    None,
+                )),
+            ],
+        )
+        .unwrap();
+
+        let file = NamedTempFile::new().expect("temp file");
+        let mut writer =
+            ArrowWriter::try_new(file.reopen().unwrap(), Arc::clone(&schema), None)
+                .expect("writer");
+        writer.write(&batch).expect("write batch");
+        writer.close().expect("close writer");
+
+        let reader_file = file.reopen().expect("reopen file");
+        let builder = ParquetRecordBatchReaderBuilder::try_new(reader_file)
+            .expect("reader builder");
+        let metadata = builder.metadata().clone();
+        let file_schema = builder.schema().clone();
+
+        // get_field(s, 'value') > 5
+        let get_field_expr = get_field().call(vec![
+            col("s"),
+            Expr::Literal(ScalarValue::Utf8(Some("value".to_string())), None),
+        ]);
+        let expr = get_field_expr.gt(Expr::Literal(ScalarValue::Int32(Some(5)), None));
+        let expr = logical2physical(&expr, &file_schema);
+
+        let candidate = FilterCandidateBuilder::new(expr, file_schema)
+            .build(&metadata)
+            .expect("building candidate")
+            .expect("get_field filter on struct should be pushable");
+
+        // The filter accesses only s.value, so only Parquet leaf 1 is needed.
+        // Leaf 2 (s.label) is not read, reducing unnecessary I/O.
+        let expected_mask =
+            ProjectionMask::leaves(metadata.file_metadata().schema_descr(), [1]);
+        assert_eq!(
+            candidate.read_plan.projection_mask, expected_mask,
+            "projection_mask should select only the accessed struct field leaf"
+        );
+    }
+
+    /// Deeply nested get_field: get_field(struct_col, 'outer', 'inner') where the
+    /// leaf is primitive should allow pushdown. The logical simplifier flattens
+    /// nested get_field(get_field(col, 'a'), 'b') into get_field(col, 'a', 'b').
+    #[test]
+    fn get_field_deeply_nested_allows_pushdown() {
+        let table_schema = Arc::new(Schema::new(vec![Field::new(
+            "s",
+            DataType::Struct(
+                vec![Arc::new(Field::new(
+                    "outer",
+                    DataType::Struct(
+                        vec![Arc::new(Field::new("inner", DataType::Int32, true))].into(),
+                    ),
+                    true,
+                ))]
+                .into(),
+            ),
+            true,
+        )]));
+
+        // s['outer']['inner'] > 5
+        let get_field_expr = get_field().call(vec![
+            col("s"),
+            Expr::Literal(ScalarValue::Utf8(Some("outer".to_string())), None),
+            Expr::Literal(ScalarValue::Utf8(Some("inner".to_string())), None),
+        ]);
+        let expr = get_field_expr.gt(Expr::Literal(ScalarValue::Int32(Some(5)), None));
+        let expr = logical2physical(&expr, &table_schema);
+
+        assert!(can_expr_be_pushed_down_with_schemas(&expr, &table_schema));
+    }
+
+    /// End-to-end: deeply nested get_field filter produces correct leaf indices
+    /// and the filter actually works against a Parquet file.
+    #[test]
+    fn get_field_deeply_nested_filter_candidate() {
+        use arrow::array::{Int32Array, StringArray, StructArray};
+
+        // Schema: id (Int32), s (Struct{outer: Struct{extra: Int32, inner: Int32}, tag: Utf8})
+        // Parquet leaves: id=0, s.outer.extra=1, s.outer.inner=2, s.tag=3
+        let inner_fields: Fields = vec![
+            Arc::new(Field::new("extra", DataType::Int32, false)),
+            Arc::new(Field::new("inner", DataType::Int32, false)),
+        ]
+        .into();
+        let outer_fields: Fields = vec![
+            Arc::new(Field::new(
+                "outer",
+                DataType::Struct(inner_fields.clone()),
+                false,
+            )),
+            Arc::new(Field::new("tag", DataType::Utf8, false)),
+        ]
+        .into();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("s", DataType::Struct(outer_fields.clone()), false),
+        ]));
+
+        let inner_struct = StructArray::new(
+            inner_fields,
+            vec![
+                Arc::new(Int32Array::from(vec![100, 200, 300])) as _,
+                Arc::new(Int32Array::from(vec![10, 20, 30])) as _,
+            ],
+            None,
+        );
+        let outer_struct = StructArray::new(
+            outer_fields,
+            vec![
+                Arc::new(inner_struct) as _,
+                Arc::new(StringArray::from(vec!["x", "y", "z"])) as _,
+            ],
+            None,
+        );
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(outer_struct),
+            ],
+        )
+        .unwrap();
+
+        let file = NamedTempFile::new().expect("temp file");
+        let mut writer =
+            ArrowWriter::try_new(file.reopen().unwrap(), Arc::clone(&schema), None)
+                .expect("writer");
+        writer.write(&batch).expect("write batch");
+        writer.close().expect("close writer");
+
+        let reader_file = file.reopen().expect("reopen file");
+        let builder = ParquetRecordBatchReaderBuilder::try_new(reader_file)
+            .expect("reader builder");
+        let metadata = builder.metadata().clone();
+        let file_schema = builder.schema().clone();
+
+        // Parquet should have 4 leaves: id=0, s.outer.extra=1, s.outer.inner=2, s.tag=3
+        assert_eq!(metadata.file_metadata().schema_descr().num_columns(), 4);
+
+        // get_field(s, 'outer', 'inner') > 15
+        // Should only need leaf 2 (s.outer.inner), not leaf 1 (s.outer.extra) or leaf 3 (s.tag).
+        let get_field_expr = get_field().call(vec![
+            col("s"),
+            Expr::Literal(ScalarValue::Utf8(Some("outer".to_string())), None),
+            Expr::Literal(ScalarValue::Utf8(Some("inner".to_string())), None),
+        ]);
+        let expr = get_field_expr.gt(Expr::Literal(ScalarValue::Int32(Some(15)), None));
+        let expr = logical2physical(&expr, &file_schema);
+
+        let candidate = FilterCandidateBuilder::new(expr, file_schema)
+            .build(&metadata)
+            .expect("building candidate")
+            .expect("deeply nested get_field filter should be pushable");
+
+        // Only s.outer.inner (leaf 2) should be projected,
+        let expected_mask =
+            ProjectionMask::leaves(metadata.file_metadata().schema_descr(), [2]);
+        assert_eq!(
+            candidate.read_plan.projection_mask, expected_mask,
+            "projection_mask should select only leaf 2 for s.outer.inner, skipping sibling and cousin leaves"
+        );
+    }
+
+    /// End-to-end: get_field filter on a struct column with multiple fields
+    /// reads only the needed leaf and correctly filters rows during Parquet decoding.
+    #[test]
+    fn get_field_end_to_end_filters_rows() {
+        // Schema: id (Int32), s (Struct{value: Int32, label: Utf8})
+        // Parquet leaves: id=0, s.value=1, s.label=2
+        let struct_fields: Fields = vec![
+            Arc::new(Field::new("value", DataType::Int32, false)),
+            Arc::new(Field::new("label", DataType::Utf8, false)),
+        ]
+        .into();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("s", DataType::Struct(struct_fields.clone()), false),
+        ]));
+
+        // +----+--------------------------+
+        // | id | s                        |
+        // +----+--------------------------+
+        // |  1 | {value: 10, label: "a"}  |
+        // |  2 | {value: 20, label: "b"}  |
+        // |  3 | {value: 30, label: "c"}  |
+        // +----+--------------------------+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StructArray::new(
+                    struct_fields,
+                    vec![
+                        Arc::new(Int32Array::from(vec![10, 20, 30])) as _,
+                        Arc::new(StringArray::from(vec!["a", "b", "c"])) as _,
+                    ],
+                    None,
+                )),
+            ],
+        )
+        .unwrap();
+
+        let file = NamedTempFile::new().expect("temp file");
+        let mut writer =
+            ArrowWriter::try_new(file.reopen().unwrap(), Arc::clone(&schema), None)
+                .expect("writer");
+        writer.write(&batch).expect("write batch");
+        writer.close().expect("close writer");
+
+        let reader_file = file.reopen().expect("reopen file");
+        let parquet_reader_builder =
+            ParquetRecordBatchReaderBuilder::try_new(reader_file)
+                .expect("reader builder");
+        let metadata = parquet_reader_builder.metadata().clone();
+        let file_schema = parquet_reader_builder.schema().clone();
+
+        // get_field(s, 'value') > 15  — should match rows with value=20 and value=30
+        let get_field_expr = get_field().call(vec![
+            col("s"),
+            Expr::Literal(ScalarValue::Utf8(Some("value".to_string())), None),
+        ]);
+        let predicate_expr =
+            get_field_expr.gt(Expr::Literal(ScalarValue::Int32(Some(15)), None));
+        let expr = logical2physical(&predicate_expr, &file_schema);
+
+        let metrics = ExecutionPlanMetricsSet::new();
+        let file_metrics = ParquetFileMetrics::new(0, "struct_e2e.parquet", &metrics);
+
+        let row_filter =
+            build_row_filter(&expr, &file_schema, &metadata, false, &file_metrics)
+                .expect("building row filter")
+                .expect("row filter should exist");
+
+        let reader = parquet_reader_builder
+            .with_row_filter(row_filter)
+            .build()
+            .expect("build reader");
+
+        let mut total_rows = 0;
+        for batch in reader {
+            let batch = batch.expect("record batch");
+            total_rows += batch.num_rows();
+        }
+
+        assert_eq!(total_rows, 2, "expected 2 rows matching value > 15");
+        assert_eq!(file_metrics.pushdown_rows_pruned.value(), 1);
+        assert_eq!(file_metrics.pushdown_rows_matched.value(), 2);
+    }
+
     /// Sanity check that the given expression could be evaluated against the given schema without any errors.
     /// This will fail if the expression references columns that are not in the schema or if the types of the columns are incompatible, etc.
     fn check_expression_can_evaluate_against_schema(
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index d44fa16843201..932988af051e4 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -24,14 +24,16 @@ use arrow::datatypes::Schema;
 use datafusion_common::pruning::PruningStatistics;
 use datafusion_common::{Column, Result, ScalarValue};
 use datafusion_datasource::FileRange;
-use datafusion_physical_optimizer::pruning::PruningPredicate;
+use datafusion_physical_expr::PhysicalExprSimplifier;
+use datafusion_physical_expr::expressions::NotExpr;
+use datafusion_pruning::PruningPredicate;
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::parquet_column;
 use parquet::basic::Type;
 use parquet::data_type::Decimal;
 use parquet::schema::types::SchemaDescriptor;
 use parquet::{
-    arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder},
+    arrow::{ParquetRecordBatchStreamBuilder, async_reader::AsyncFileReader},
     bloom_filter::Sbbf,
     file::metadata::RowGroupMetaData,
 };
@@ -46,13 +48,20 @@ use parquet::{
 pub struct RowGroupAccessPlanFilter {
     /// which row groups should be accessed
     access_plan: ParquetAccessPlan,
+    /// Row groups where ALL rows are known to match the pruning predicate
+    /// (the predicate does not filter any rows)
+    is_fully_matched: Vec<bool>,
 }
 
 impl RowGroupAccessPlanFilter {
     /// Create a new `RowGroupPlanBuilder` for pruning out the groups to scan
     /// based on metadata and statistics
     pub fn new(access_plan: ParquetAccessPlan) -> Self {
-        Self { access_plan }
+        let num_row_groups = access_plan.len();
+        Self {
+            access_plan,
+            is_fully_matched: vec![false; num_row_groups],
+        }
     }
 
     /// Return true if there are no row groups
@@ -60,11 +69,149 @@ impl RowGroupAccessPlanFilter {
         self.access_plan.is_empty()
     }
 
+    /// Return the number of row groups that are currently expected to be scanned
+    pub fn remaining_row_group_count(&self) -> usize {
+        self.access_plan.row_group_index_iter().count()
+    }
+
     /// Returns the inner access plan
     pub fn build(self) -> ParquetAccessPlan {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row groups.
+    ///
+    /// The pruning works by leveraging the concept of fully matched row groups. Consider a query like:
+    /// `WHERE species LIKE 'Alpine%' AND s >= 50 LIMIT N`
+    ///
+    /// After initial filtering, row groups can be classified into three states:
+    ///
+    /// 1. Not Matching / Pruned
+    /// 2. Partially Matching (Row Group/Page contains some matches)
+    /// 3. Fully Matching (Entire range is within predicate)
+    ///
+    /// +-----------------------------------------------------------------------+
+    /// |                            NOT MATCHING                               |
+    /// |  Row group 1                                                          |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// |  | SPECIES                           | S                           |  |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// |  | Snow Vole                         | 7                           |  |
+    /// |  | Brown Bear                        | 133 ✅                      |  |
+    /// |  | Gray Wolf                         | 82  ✅                      |  |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// +-----------------------------------------------------------------------+
+    ///
+    /// +---------------------------------------------------------------------------+
+    /// |                          PARTIALLY MATCHING                               |
+    /// |                                                                           |
+    /// |  Row group 2                              Row group 4                     |
+    /// |  +------------------+--------------+      +------------------+----------+ |
+    /// |  | SPECIES          | S            |      | SPECIES          | S        | |
+    /// |  +------------------+--------------+      +------------------+----------+ |
+    /// |  | Lynx             | 71 ✅        |      | Europ. Mole      | 4        | |
+    /// |  | Red Fox          | 40           |      | Polecat          | 16       | |
+    /// |  | Alpine Bat  ✅   | 6            |      | Alpine Ibex ✅  | 97 ✅    | |
+    /// |  +------------------+--------------+      +------------------+----------+ |
+    /// +---------------------------------------------------------------------------+
+    ///
+    /// +-----------------------------------------------------------------------+
+    /// |                           FULLY MATCHING                              |
+    /// |  Row group 3                                                          |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// |  | SPECIES                           | S                           |  |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// |  | Alpine Ibex  ✅                  | 101    ✅                   |  |
+    /// |  | Alpine Goat  ✅                  | 76     ✅                   |  |
+    /// |  | Alpine Sheep ✅                  | 83     ✅                   |  |
+    /// |  +-----------------------------------+-----------------------------+  |
+    /// +-----------------------------------------------------------------------+
+    ///
+    /// ### Identification of Fully Matching Row Groups
+    ///
+    /// DataFusion identifies row groups where ALL rows satisfy the filter by inverting the
+    /// predicate and checking if statistics prove the inverted version is false for the group.
+    ///
+    /// For example, prefix matches like `species LIKE 'Alpine%'` are pruned using ranges:
+    /// 1. Candidate Range: `species >= 'Alpine' AND species < 'Alpinf'`
+    /// 2. Inverted Condition (to prove full match): `species < 'Alpine' OR species >= 'Alpinf'`
+    /// 3. Statistical Evaluation (check if any row *could* satisfy the inverted condition):
+    ///    `min < 'Alpine' OR max >= 'Alpinf'`
+    ///
+    /// If this evaluation is **false**, it proves no row can fail the original filter,
+    /// so the row group is **FULLY MATCHING**.
+    ///
+    /// ### Impact of Statistics Truncation
+    ///
+    /// The precision of pruning depends on the metadata quality. Truncated statistics
+    /// may prevent the system from proving a full match.
+    ///
+    /// **Example**: `WHERE species LIKE 'Alpine%'` (Target range: `['Alpine', 'Alpinf')`)
+    ///
+    /// | Truncation Length | min / max           | Inverted Evaluation                                                 | Status                 |
+    /// |-------------------|---------------------|---------------------------------------------------------------------|------------------------|
+    /// | **Length 6**      | `Alpine` / `Alpine` | `"Alpine" < "Alpine" (F) OR "Alpine" >= "Alpinf" (F)` -> **false**  | **FULLY MATCHING**     |
+    /// | **Length 3**      | `Alp` / `Alq`       | `"Alp" < "Alpine" (T) OR "Alq" >= "Alpinf" (T)` -> **true**         | **PARTIALLY MATCHING** |
+    ///
+    /// Even though Row Group 3 only contains matching rows, truncation to length 3 makes
+    /// the statistics `[Alp, Alq]` too broad to prove it (they could include "Alpha").
+    /// The system must conservatively scan the group.
+    ///
+    /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
+    /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,
+    /// skip Partitions 2 and 4 entirely and go directly to Partition 3.
+    ///
+    /// This optimization is particularly effective when:
+    /// - The limit is small relative to the total dataset size
+    /// - There are row groups that are fully matched by the filter predicates
+    /// - The fully matched row groups contain sufficient rows to satisfy the limit
+    ///
+    /// For more information, see the [paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part
+    pub fn prune_by_limit(
+        &mut self,
+        limit: usize,
+        rg_metadata: &[RowGroupMetaData],
+        metrics: &ParquetFileMetrics,
+    ) {
+        let mut fully_matched_row_group_indexes: Vec<usize> = Vec::new();
+        let mut fully_matched_rows_count: usize = 0;
+
+        // Iterate through the currently accessible row groups and try to
+        // find a set of matching row groups that can satisfy the limit
+        for &idx in self.access_plan.row_group_indexes().iter() {
+            if self.is_fully_matched[idx] {
+                let row_group_row_count = rg_metadata[idx].num_rows() as usize;
+                fully_matched_row_group_indexes.push(idx);
+                fully_matched_rows_count += row_group_row_count;
+                if fully_matched_rows_count >= limit {
+                    break;
+                }
+            }
+        }
+
+        // If we can satisfy the limit with fully matching row groups,
+        // rewrite the plan to do so
+        if fully_matched_rows_count >= limit {
+            let original_num_accessible_row_groups =
+                self.access_plan.row_group_indexes().len();
+            let new_num_accessible_row_groups = fully_matched_row_group_indexes.len();
+            let pruned_count = original_num_accessible_row_groups
+                .saturating_sub(new_num_accessible_row_groups);
+            metrics.limit_pruned_row_groups.add_pruned(pruned_count);
+
+            let mut new_access_plan = ParquetAccessPlan::new_none(rg_metadata.len());
+            for &idx in &fully_matched_row_group_indexes {
+                new_access_plan.scan(idx);
+            }
+            self.access_plan = new_access_plan;
+        }
+    }
+
     /// Prune remaining row groups to only those  within the specified range.
     ///
     /// Updates this set to mark row groups that should not be scanned
@@ -130,15 +277,26 @@ impl RowGroupAccessPlanFilter {
         // try to prune the row groups in a single call
         match predicate.prune(&pruning_stats) {
             Ok(values) => {
-                // values[i] is false means the predicate could not be true for row group i
+                let mut fully_contained_candidates_original_idx: Vec<usize> = Vec::new();
                 for (idx, &value) in row_group_indexes.iter().zip(values.iter()) {
                     if !value {
                         self.access_plan.skip(*idx);
-                        metrics.row_groups_pruned_statistics.add(1);
+                        metrics.row_groups_pruned_statistics.add_pruned(1);
                     } else {
-                        metrics.row_groups_matched_statistics.add(1);
+                        metrics.row_groups_pruned_statistics.add_matched(1);
+                        fully_contained_candidates_original_idx.push(*idx);
                     }
                 }
+
+                // Check if any of the matched row groups are fully contained by the predicate
+                self.identify_fully_matched_row_groups(
+                    &fully_contained_candidates_original_idx,
+                    arrow_schema,
+                    parquet_schema,
+                    groups,
+                    predicate,
+                    metrics,
+                );
             }
             // stats filter array could not be built, so we can't prune
             Err(e) => {
@@ -148,6 +306,68 @@ impl RowGroupAccessPlanFilter {
         }
     }
 
+    /// Identifies row groups that are fully matched by the predicate.
+    ///
+    /// This optimization checks whether all rows in a row group satisfy the predicate
+    /// by inverting the predicate and checking if it prunes the row group. If the
+    /// inverted predicate prunes a row group, it means no rows match the inverted
+    /// predicate, which implies all rows match the original predicate.
+    ///
+    /// Note: This optimization is relatively inexpensive for a limited number of row groups.
+    fn identify_fully_matched_row_groups(
+        &mut self,
+        candidate_row_group_indices: &[usize],
+        arrow_schema: &Schema,
+        parquet_schema: &SchemaDescriptor,
+        groups: &[RowGroupMetaData],
+        predicate: &PruningPredicate,
+        metrics: &ParquetFileMetrics,
+    ) {
+        if candidate_row_group_indices.is_empty() {
+            return;
+        }
+
+        // Use NotExpr to create the inverted predicate
+        let inverted_expr = Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr())));
+
+        // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0)
+        // before building the pruning predicate
+        let simplifier = PhysicalExprSimplifier::new(arrow_schema);
+        let Ok(inverted_expr) = simplifier.simplify(inverted_expr) else {
+            return;
+        };
+
+        let Ok(inverted_predicate) =
+            PruningPredicate::try_new(inverted_expr, Arc::clone(predicate.schema()))
+        else {
+            return;
+        };
+
+        let inverted_pruning_stats = RowGroupPruningStatistics {
+            parquet_schema,
+            row_group_metadatas: candidate_row_group_indices
+                .iter()
+                .map(|&i| &groups[i])
+                .collect::<Vec<_>>(),
+            arrow_schema,
+        };
+
+        let Ok(inverted_values) = inverted_predicate.prune(&inverted_pruning_stats)
+        else {
+            return;
+        };
+
+        for (i, &original_row_group_idx) in candidate_row_group_indices.iter().enumerate()
+        {
+            // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false),
+            // it implies that *all* rows in this group satisfy the original predicate.
+            if !inverted_values[i] {
+                self.is_fully_matched[original_row_group_idx] = true;
+                metrics.row_groups_pruned_statistics.add_fully_matched(1);
+            }
+        }
+    }
+
     /// Prune remaining row groups using available bloom filters and the
     /// [`PruningPredicate`].
     ///
@@ -215,10 +435,10 @@ impl RowGroupAccessPlanFilter {
             };
 
             if prune_group {
-                metrics.row_groups_pruned_bloom_filter.add(1);
+                metrics.row_groups_pruned_bloom_filter.add_pruned(1);
                 self.access_plan.skip(idx)
-            } else if !stats.column_sbbf.is_empty() {
-                metrics.row_groups_matched_bloom_filter.add(1);
+            } else {
+                metrics.row_groups_pruned_bloom_filter.add_matched(1);
             }
         }
     }
@@ -439,11 +659,12 @@ mod tests {
     use arrow::datatypes::DataType::Decimal128;
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::Result;
-    use datafusion_expr::{cast, col, lit, Expr};
+    use datafusion_expr::{Expr, cast, col, lit};
     use datafusion_physical_expr::planner::logical2physical;
     use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-    use parquet::arrow::async_reader::ParquetObjectReader;
+    use object_store::ObjectStoreExt;
     use parquet::arrow::ArrowSchemaConverter;
+    use parquet::arrow::async_reader::ParquetObjectReader;
     use parquet::basic::LogicalType;
     use parquet::data_type::{ByteArray, FixedLenByteArray};
     use parquet::file::metadata::ColumnChunkMetaData;
@@ -494,6 +715,18 @@ mod tests {
         }
     }
 
+    #[test]
+    fn remaining_row_group_count_reports_non_skipped_groups() {
+        let mut filter = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(4));
+        assert_eq!(filter.remaining_row_group_count(), 4);
+
+        filter.access_plan.skip(1);
+        assert_eq!(filter.remaining_row_group_count(), 3);
+
+        filter.access_plan.skip(3);
+        assert_eq!(filter.remaining_row_group_count(), 2);
+    }
+
     #[test]
     fn row_group_pruning_predicate_simple_expr() {
         use datafusion_expr::{col, lit};
@@ -1242,12 +1475,16 @@ mod tests {
             .run(
                 lit("1").eq(lit("1")).and(
                     col(r#""String""#)
-                        .eq(Expr::Literal(ScalarValue::Utf8View(Some(String::from(
-                            "Hello_Not_Exists",
-                        )))))
-                        .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View(
-                            Some(String::from("Hello_Not_Exists2")),
-                        )))),
+                        .eq(Expr::Literal(
+                            ScalarValue::Utf8View(Some(String::from("Hello_Not_Exists"))),
+                            None,
+                        ))
+                        .or(col(r#""String""#).eq(Expr::Literal(
+                            ScalarValue::Utf8View(Some(String::from(
+                                "Hello_Not_Exists2",
+                            ))),
+                            None,
+                        ))),
                 ),
             )
             .await
@@ -1327,15 +1564,18 @@ mod tests {
             // generate pruning predicate `(String = "Hello") OR (String = "the quick") OR (String = "are you")`
             .run(
                 col(r#""String""#)
-                    .eq(Expr::Literal(ScalarValue::Utf8View(Some(String::from(
-                        "Hello",
-                    )))))
-                    .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View(
-                        Some(String::from("the quick")),
-                    ))))
-                    .or(col(r#""String""#).eq(Expr::Literal(ScalarValue::Utf8View(
-                        Some(String::from("are you")),
-                    )))),
+                    .eq(Expr::Literal(
+                        ScalarValue::Utf8View(Some(String::from("Hello"))),
+                        None,
+                    ))
+                    .or(col(r#""String""#).eq(Expr::Literal(
+                        ScalarValue::Utf8View(Some(String::from("the quick"))),
+                        None,
+                    )))
+                    .or(col(r#""String""#).eq(Expr::Literal(
+                        ScalarValue::Utf8View(Some(String::from("are you"))),
+                        None,
+                    ))),
             )
             .await
     }
@@ -1401,7 +1641,10 @@ mod tests {
                 }
                 ExpectedPruning::Some(expected) => {
                     let actual = row_groups.access_plan.row_group_indexes();
-                    assert_eq!(expected, &actual, "Unexpected row groups pruned. Expected {expected:?}, got {actual:?}");
+                    assert_eq!(
+                        expected, &actual,
+                        "Unexpected row groups pruned. Expected {expected:?}, got {actual:?}"
+                    );
                 }
             }
         }
@@ -1509,7 +1752,8 @@ mod tests {
         data: bytes::Bytes,
         pruning_predicate: &PruningPredicate,
     ) -> Result<RowGroupAccessPlanFilter> {
-        use object_store::{ObjectMeta, ObjectStore};
+        use datafusion_datasource::PartitionedFile;
+        use object_store::ObjectMeta;
 
         let object_meta = ObjectMeta {
             location: object_store::path::Path::parse(file_name).expect("creating path"),
@@ -1527,12 +1771,16 @@ mod tests {
         let metrics = ExecutionPlanMetricsSet::new();
         let file_metrics =
             ParquetFileMetrics::new(0, object_meta.location.as_ref(), &metrics);
-        let inner = ParquetObjectReader::new(Arc::new(in_memory), object_meta.location)
-            .with_file_size(object_meta.size);
+        let inner =
+            ParquetObjectReader::new(Arc::new(in_memory), object_meta.location.clone())
+                .with_file_size(object_meta.size);
+
+        let partitioned_file = PartitionedFile::new_from_meta(object_meta);
 
         let reader = ParquetFileReader {
             inner,
             file_metrics: file_metrics.clone(),
+            partitioned_file,
         };
         let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
 
diff --git a/datafusion/datasource-parquet/src/sort.rs b/datafusion/datasource-parquet/src/sort.rs
new file mode 100644
index 0000000000000..db22363aa3746
--- /dev/null
+++ b/datafusion/datasource-parquet/src/sort.rs
@@ -0,0 +1,1021 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort-related utilities for Parquet scanning
+
+use datafusion_common::Result;
+use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+use parquet::file::metadata::ParquetMetaData;
+use std::collections::HashMap;
+
+/// Reverse a row selection to match reversed row group order.
+///
+/// When scanning row groups in reverse order, we need to adjust the row selection
+/// to account for the new ordering. This function:
+/// 1. Maps each selection to its corresponding row group
+/// 2. Reverses the order of row groups
+/// 3. Reconstructs the row selection for the new order
+///
+/// # Arguments
+/// * `row_selection` - Original row selection (only covers row groups that are scanned)
+/// * `parquet_metadata` - Metadata containing row group information
+/// * `row_groups_to_scan` - Indexes of row groups that will be scanned (in original order)
+///
+/// # Returns
+/// A new `RowSelection` adjusted for reversed row group order
+///
+/// # Important Notes
+/// The input `row_selection` only covers the row groups specified in `row_groups_to_scan`.
+/// Row groups that are skipped (not in `row_groups_to_scan`) are not represented in the
+/// `row_selection` at all. This function needs `row_groups_to_scan` to correctly map
+/// the selection back to the original row groups.
+pub fn reverse_row_selection(
+    row_selection: &RowSelection,
+    parquet_metadata: &ParquetMetaData,
+    row_groups_to_scan: &[usize],
+) -> Result<RowSelection> {
+    let rg_metadata = parquet_metadata.row_groups();
+
+    // Build a mapping of row group index to its row range, but ONLY for
+    // the row groups that are actually being scanned.
+    //
+    // IMPORTANT: The row numbers in this mapping are RELATIVE to the scanned row groups,
+    // not absolute positions in the file.
+    //
+    // Example: If row_groups_to_scan = [0, 2, 3] and each has 100 rows:
+    //   RG0: rows 0-99 (relative to scanned data)
+    //   RG2: rows 100-199 (relative to scanned data, NOT 200-299 in file!)
+    //   RG3: rows 200-299 (relative to scanned data, NOT 300-399 in file!)
+    let mut rg_row_ranges: Vec<(usize, usize, usize)> =
+        Vec::with_capacity(row_groups_to_scan.len());
+    let mut current_row = 0;
+    for &rg_idx in row_groups_to_scan {
+        let rg = &rg_metadata[rg_idx];
+        let num_rows = rg.num_rows() as usize;
+        rg_row_ranges.push((rg_idx, current_row, current_row + num_rows));
+        current_row += num_rows; // This is relative row number, NOT absolute file position
+    }
+
+    // Map selections to row groups
+    let mut rg_selections: HashMap<usize, Vec<RowSelector>> = HashMap::new();
+
+    let mut current_file_row = 0;
+    for selector in row_selection.iter() {
+        let selector_end = current_file_row + selector.row_count;
+
+        // Find which row groups this selector spans
+        for (rg_idx, rg_start, rg_end) in rg_row_ranges.iter() {
+            if current_file_row < *rg_end && selector_end > *rg_start {
+                // This selector overlaps with this row group
+                let overlap_start = current_file_row.max(*rg_start);
+                let overlap_end = selector_end.min(*rg_end);
+                let overlap_count = overlap_end - overlap_start;
+
+                if overlap_count > 0 {
+                    let entry = rg_selections.entry(*rg_idx).or_default();
+                    if selector.skip {
+                        entry.push(RowSelector::skip(overlap_count));
+                    } else {
+                        entry.push(RowSelector::select(overlap_count));
+                    }
+                }
+            }
+        }
+
+        current_file_row = selector_end;
+    }
+
+    // Build new selection for reversed row group order
+    // Only iterate over the row groups that are being scanned, in reverse order
+    let mut reversed_selectors = Vec::new();
+    for &rg_idx in row_groups_to_scan.iter().rev() {
+        if let Some(selectors) = rg_selections.get(&rg_idx) {
+            reversed_selectors.extend(selectors.iter().cloned());
+        } else {
+            // No specific selection for this row group means select all rows in it
+            if let Some((_, start, end)) =
+                rg_row_ranges.iter().find(|(idx, _, _)| *idx == rg_idx)
+            {
+                reversed_selectors.push(RowSelector::select(end - start));
+            }
+        }
+    }
+
+    Ok(RowSelection::from(reversed_selectors))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::ParquetAccessPlan;
+    use crate::RowGroupAccess;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use bytes::Bytes;
+    use parquet::arrow::ArrowWriter;
+    use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+    use parquet::file::reader::FileReader;
+    use parquet::file::serialized_reader::SerializedFileReader;
+    use std::sync::Arc;
+
+    /// Helper function to create a ParquetMetaData with specified row group sizes
+    /// by actually writing a parquet file in memory
+    fn create_test_metadata(
+        row_group_sizes: Vec<i64>,
+    ) -> parquet::file::metadata::ParquetMetaData {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+        let mut buffer = Vec::new();
+        {
+            let props = parquet::file::properties::WriterProperties::builder().build();
+            let mut writer =
+                ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap();
+
+            for &size in &row_group_sizes {
+                let array = arrow::array::Int32Array::from(vec![1; size as usize]);
+                let batch = arrow::record_batch::RecordBatch::try_new(
+                    schema.clone(),
+                    vec![Arc::new(array)],
+                )
+                .unwrap();
+                writer.write(&batch).unwrap();
+                writer.flush().unwrap();
+            }
+            writer.close().unwrap();
+        }
+
+        let bytes = Bytes::from(buffer);
+        let reader = SerializedFileReader::new(bytes).unwrap();
+        reader.metadata().clone()
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_simple() {
+        // Test: all row groups are scanned, no row selection
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let access_plan = ParquetAccessPlan::new_all(3);
+        let rg_metadata = metadata.row_groups();
+
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        // Verify original plan
+        assert_eq!(prepared_plan.row_group_indexes, vec![0, 1, 2]);
+
+        // No row selection originally due to scanning all rows
+        assert_eq!(prepared_plan.row_selection, None);
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // Verify row groups are reversed
+        assert_eq!(reversed_plan.row_group_indexes, vec![2, 1, 0]);
+
+        // If no selection originally, after reversal should still select all rows,
+        // and the selection should be None
+        assert_eq!(reversed_plan.row_selection, None);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_with_selection() {
+        // Test: simple row selection that spans multiple row groups
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Select first 50 rows from first row group, skip rest
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(50), RowSelector::skip(50)]),
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(
+            original_selected, reversed_selected,
+            "Total selected rows should remain the same"
+        );
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_multi_row_group_selection() {
+        // Test: row selection spanning multiple row groups
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Create selection that spans RG0 and RG1
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::skip(50), RowSelector::select(50)]),
+        );
+        access_plan.scan_selection(
+            1,
+            RowSelection::from(vec![RowSelector::select(50), RowSelector::skip(50)]),
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_empty_selection() {
+        // Test: all rows are skipped
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Skip all rows in all row groups
+        for i in 0..3 {
+            access_plan
+                .scan_selection(i, RowSelection::from(vec![RowSelector::skip(100)]));
+        }
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // Should still skip all rows
+        let total_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(total_selected, 0);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_different_row_group_sizes() {
+        // Test: row groups with different sizes
+        let metadata = create_test_metadata(vec![50, 150, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Create complex selection pattern
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::skip(25), RowSelector::select(25)]),
+        );
+        access_plan.scan_selection(1, RowSelection::from(vec![RowSelector::select(150)]));
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![RowSelector::select(50), RowSelector::skip(50)]),
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_single_row_group() {
+        // Test: single row group case
+        let metadata = create_test_metadata(vec![100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(1);
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(50), RowSelector::skip(50)]),
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // With single row group, row_group_indexes should remain [0]
+        assert_eq!(reversed_plan.row_group_indexes, vec![0]);
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+        assert_eq!(original_selected, 50);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_complex_pattern() {
+        // Test: complex pattern with multiple select/skip segments
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Complex pattern: select some, skip some, select some more
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![
+                RowSelector::select(30),
+                RowSelector::skip(40),
+                RowSelector::select(30),
+            ]),
+        );
+        access_plan.scan_selection(
+            1,
+            RowSelection::from(vec![RowSelector::skip(50), RowSelector::select(50)]),
+        );
+        access_plan.scan_selection(2, RowSelection::from(vec![RowSelector::select(100)]));
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+        assert_eq!(original_selected, 210); // 30 + 30 + 50 + 100
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_with_skipped_row_groups() {
+        // This is the KEY test case for the bug fix!
+        // Test scenario where some row groups are completely skipped (not in scan plan)
+        let metadata = create_test_metadata(vec![100, 100, 100, 100]);
+
+        // Scenario: RG0 (scan all), RG1 (completely skipped), RG2 (partial), RG3 (scan all)
+        // Only row groups [0, 2, 3] are in the scan plan
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Scan, // RG0
+            RowGroupAccess::Skip, // RG1 - NOT in scan plan!
+            RowGroupAccess::Scan, // RG2
+            RowGroupAccess::Scan, // RG3
+        ]);
+
+        // Add row selections for the scanned row groups
+        // Note: The RowSelection only covers row groups [0, 2, 3] (300 rows total)
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(100)]), // RG0: all 100 rows
+        );
+        // RG1 is skipped, no selection needed
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![
+                RowSelector::select(25), // RG2: first 25 rows
+                RowSelector::skip(75),   // RG2: skip last 75 rows
+            ]),
+        );
+        access_plan.scan_selection(
+            3,
+            RowSelection::from(vec![RowSelector::select(100)]), // RG3: all 100 rows
+        );
+
+        let rg_metadata = metadata.row_groups();
+
+        // Step 1: Create PreparedAccessPlan
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        // Verify original plan
+        assert_eq!(prepared_plan.row_group_indexes, vec![0, 2, 3]);
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+        assert_eq!(original_selected, 225); // 100 + 25 + 100
+
+        // Step 2: Reverse the plan (this is the production code path)
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // Verify reversed results
+        // Row group order should be reversed: [3, 2, 0]
+        assert_eq!(
+            reversed_plan.row_group_indexes,
+            vec![3, 2, 0],
+            "Row groups should be reversed"
+        );
+
+        // Verify row selection is also correctly reversed
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(
+            reversed_selected, 225,
+            "Total selected rows should remain the same"
+        );
+
+        // Verify the reversed selection structure
+        // After reversal, the order becomes: RG3, RG2, RG0
+        // - RG3: select(100)
+        // - RG2: select(25), skip(75)  (note: internal order preserved, not reversed)
+        // - RG0: select(100)
+        //
+        // After RowSelection::from() merges adjacent selectors of the same type:
+        // - RG3's select(100) + RG2's select(25) = select(125)
+        // - RG2's skip(75) remains as skip(75)
+        // - RG0's select(100) remains as select(100)
+        let selectors: Vec<_> = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+        assert_eq!(selectors.len(), 3);
+
+        // RG3 (100) + RG2 first part (25) merged into select(125)
+        assert!(!selectors[0].skip);
+        assert_eq!(selectors[0].row_count, 125);
+
+        // RG2: skip last 75 rows
+        assert!(selectors[1].skip);
+        assert_eq!(selectors[1].row_count, 75);
+
+        // RG0: select all 100 rows
+        assert!(!selectors[2].skip);
+        assert_eq!(selectors[2].row_count, 100);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_alternating_row_groups() {
+        // Test with alternating scan/skip pattern
+        let metadata = create_test_metadata(vec![100, 100, 100, 100]);
+
+        // Scan RG0 and RG2, skip RG1 and RG3
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Scan, // RG0
+            RowGroupAccess::Skip, // RG1
+            RowGroupAccess::Scan, // RG2
+            RowGroupAccess::Skip, // RG3
+        ]);
+
+        access_plan.scan_selection(0, RowSelection::from(vec![RowSelector::select(100)]));
+        access_plan.scan_selection(2, RowSelection::from(vec![RowSelector::select(100)]));
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        // Original: [0, 2]
+        assert_eq!(prepared_plan.row_group_indexes, vec![0, 2]);
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // After reverse: [2, 0]
+        assert_eq!(reversed_plan.row_group_indexes, vec![2, 0]);
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+        assert_eq!(original_selected, 200);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_middle_row_group_only() {
+        // Test selecting only the middle row group
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Skip, // RG0
+            RowGroupAccess::Scan, // RG1
+            RowGroupAccess::Skip, // RG2
+        ]);
+
+        access_plan.scan_selection(
+            1,
+            RowSelection::from(vec![RowSelector::select(100)]), // Select all of RG1
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        let original_selected: usize = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        // Original: [1]
+        assert_eq!(prepared_plan.row_group_indexes, vec![1]);
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // After reverse: still [1] (only one row group)
+        assert_eq!(reversed_plan.row_group_indexes, vec![1]);
+
+        let reversed_selected: usize = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(original_selected, reversed_selected);
+        assert_eq!(original_selected, 100);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_with_skipped_row_groups_detailed() {
+        // This is the KEY test case for the bug fix!
+        // Test scenario where some row groups are completely skipped (not in scan plan)
+        // This version includes DETAILED verification of the selector distribution
+        let metadata = create_test_metadata(vec![100, 100, 100, 100]);
+
+        // Scenario: RG0 (scan all), RG1 (completely skipped), RG2 (partial), RG3 (scan all)
+        // Only row groups [0, 2, 3] are in the scan plan
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Scan, // RG0
+            RowGroupAccess::Skip, // RG1 - NOT in scan plan!
+            RowGroupAccess::Scan, // RG2
+            RowGroupAccess::Scan, // RG3
+        ]);
+
+        // Add row selections for the scanned row groups
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(100)]), // RG0: all 100 rows
+        );
+        // RG1 is skipped, no selection needed
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![
+                RowSelector::select(25), // RG2: first 25 rows
+                RowSelector::skip(75),   // RG2: skip last 75 rows
+            ]),
+        );
+        access_plan.scan_selection(
+            3,
+            RowSelection::from(vec![RowSelector::select(100)]), // RG3: all 100 rows
+        );
+
+        let rg_metadata = metadata.row_groups();
+
+        // Step 1: Create PreparedAccessPlan
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        // Verify original plan in detail
+        assert_eq!(prepared_plan.row_group_indexes, vec![0, 2, 3]);
+
+        // Detailed verification of original selection
+        let orig_selectors: Vec<_> = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // Original structure should be:
+        // RG0: select(100)
+        // RG2: select(25), skip(75)
+        // RG3: select(100)
+        // After merging by RowSelection::from(): select(125), skip(75), select(100)
+        assert_eq!(
+            orig_selectors.len(),
+            3,
+            "Original should have 3 selectors after merging"
+        );
+        assert!(
+            !orig_selectors[0].skip && orig_selectors[0].row_count == 125,
+            "Original: First selector should be select(125) from RG0(100) + RG2(25)"
+        );
+        assert!(
+            orig_selectors[1].skip && orig_selectors[1].row_count == 75,
+            "Original: Second selector should be skip(75) from RG2"
+        );
+        assert!(
+            !orig_selectors[2].skip && orig_selectors[2].row_count == 100,
+            "Original: Third selector should be select(100) from RG3"
+        );
+
+        let original_selected: usize = orig_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+        assert_eq!(original_selected, 225); // 100 + 25 + 100
+
+        // Step 2: Reverse the plan (this is the production code path)
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // Verify reversed results
+        // Row group order should be reversed: [3, 2, 0]
+        assert_eq!(
+            reversed_plan.row_group_indexes,
+            vec![3, 2, 0],
+            "Row groups should be reversed"
+        );
+
+        // Detailed verification of reversed selection
+        let rev_selectors: Vec<_> = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // After reversal, the order becomes: RG3, RG2, RG0
+        // - RG3: select(100)
+        // - RG2: select(25), skip(75)  (note: internal order preserved, not reversed)
+        // - RG0: select(100)
+        //
+        // After RowSelection::from() merges adjacent selectors of the same type:
+        // - RG3's select(100) + RG2's select(25) = select(125)
+        // - RG2's skip(75) remains as skip(75)
+        // - RG0's select(100) remains as select(100)
+
+        assert_eq!(
+            rev_selectors.len(),
+            3,
+            "Reversed should have 3 selectors after merging"
+        );
+
+        // First selector: RG3 (100) + RG2 first part (25) merged into select(125)
+        assert!(
+            !rev_selectors[0].skip && rev_selectors[0].row_count == 125,
+            "Reversed: First selector should be select(125) from RG3(100) + RG2(25), got skip={} count={}",
+            rev_selectors[0].skip,
+            rev_selectors[0].row_count
+        );
+
+        // Second selector: RG2 skip last 75 rows
+        assert!(
+            rev_selectors[1].skip && rev_selectors[1].row_count == 75,
+            "Reversed: Second selector should be skip(75) from RG2, got skip={} count={}",
+            rev_selectors[1].skip,
+            rev_selectors[1].row_count
+        );
+
+        // Third selector: RG0 select all 100 rows
+        assert!(
+            !rev_selectors[2].skip && rev_selectors[2].row_count == 100,
+            "Reversed: Third selector should be select(100) from RG0, got skip={} count={}",
+            rev_selectors[2].skip,
+            rev_selectors[2].row_count
+        );
+
+        // Verify row selection is also correctly reversed (total count)
+        let reversed_selected: usize = rev_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(
+            reversed_selected, 225,
+            "Total selected rows should remain the same"
+        );
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_complex_pattern_detailed() {
+        // Test: complex pattern with detailed verification
+        let metadata = create_test_metadata(vec![100, 100, 100]);
+
+        let mut access_plan = ParquetAccessPlan::new_all(3);
+
+        // Complex pattern: select some, skip some, select some more
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![
+                RowSelector::select(30),
+                RowSelector::skip(40),
+                RowSelector::select(30),
+            ]),
+        );
+        access_plan.scan_selection(
+            1,
+            RowSelection::from(vec![RowSelector::skip(50), RowSelector::select(50)]),
+        );
+        access_plan.scan_selection(2, RowSelection::from(vec![RowSelector::select(100)]));
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        // Verify original selection structure in detail
+        let orig_selectors: Vec<_> = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // RG0: select(30), skip(40), select(30)
+        // RG1: skip(50), select(50)
+        // RG2: select(100)
+        // Sequential: sel(30), skip(40), sel(30), skip(50), sel(50), sel(100)
+        // After merge: sel(30), skip(40), sel(30), skip(50), sel(150)
+
+        let original_selected: usize = orig_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+        assert_eq!(original_selected, 210); // 30 + 30 + 50 + 100
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // Verify reversed selection structure
+        let rev_selectors: Vec<_> = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // After reversal: RG2, RG1, RG0
+        // RG2: select(100)
+        // RG1: skip(50), select(50)
+        // RG0: select(30), skip(40), select(30)
+        // Sequential: sel(100), skip(50), sel(50), sel(30), skip(40), sel(30)
+        // After merge: sel(100), skip(50), sel(80), skip(40), sel(30)
+
+        let reversed_selected: usize = rev_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(
+            reversed_selected, 210,
+            "Total selected rows should remain the same (30 + 30 + 50 + 100)"
+        );
+
+        // Verify row group order
+        assert_eq!(reversed_plan.row_group_indexes, vec![2, 1, 0]);
+    }
+
+    #[test]
+    fn test_prepared_access_plan_reverse_alternating_detailed() {
+        // Test with alternating scan/skip pattern with detailed verification
+        let metadata = create_test_metadata(vec![100, 100, 100, 100]);
+
+        // Scan RG0 and RG2, skip RG1 and RG3
+        let mut access_plan = ParquetAccessPlan::new(vec![
+            RowGroupAccess::Scan, // RG0
+            RowGroupAccess::Skip, // RG1
+            RowGroupAccess::Scan, // RG2
+            RowGroupAccess::Skip, // RG3
+        ]);
+
+        access_plan.scan_selection(
+            0,
+            RowSelection::from(vec![RowSelector::select(30), RowSelector::skip(70)]),
+        );
+        access_plan.scan_selection(
+            2,
+            RowSelection::from(vec![RowSelector::skip(20), RowSelector::select(80)]),
+        );
+
+        let rg_metadata = metadata.row_groups();
+        let prepared_plan = access_plan
+            .prepare(rg_metadata)
+            .expect("Failed to create PreparedAccessPlan");
+
+        // Original: [0, 2]
+        assert_eq!(prepared_plan.row_group_indexes, vec![0, 2]);
+
+        // Verify original selection
+        let orig_selectors: Vec<_> = prepared_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // Original:
+        // RG0: select(30), skip(70)
+        // RG2: skip(20), select(80)
+        // Sequential: sel(30), skip(90), sel(80)
+        //   (RG0's skip(70) + RG2's skip(20) = skip(90))
+
+        let original_selected: usize = orig_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+        assert_eq!(original_selected, 110); // 30 + 80
+
+        let reversed_plan = prepared_plan
+            .reverse(&metadata)
+            .expect("Failed to reverse PreparedAccessPlan");
+
+        // After reverse: [2, 0]
+        assert_eq!(reversed_plan.row_group_indexes, vec![2, 0]);
+
+        // Verify reversed selection
+        let rev_selectors: Vec<_> = reversed_plan
+            .row_selection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .collect();
+
+        // After reversal: RG2, RG0
+        // RG2: skip(20), select(80)
+        // RG0: select(30), skip(70)
+        // Sequential: skip(20), sel(110), skip(70)
+        //   (RG2's select(80) + RG0's select(30) = select(110))
+
+        let reversed_selected: usize = rev_selectors
+            .iter()
+            .filter(|s| !s.skip)
+            .map(|s| s.row_count)
+            .sum();
+
+        assert_eq!(reversed_selected, 110); // Should still be 30 + 80
+
+        // Detailed verification of structure
+        assert_eq!(rev_selectors.len(), 3, "Reversed should have 3 selectors");
+
+        assert!(
+            rev_selectors[0].skip && rev_selectors[0].row_count == 20,
+            "First selector should be skip(20) from RG2"
+        );
+
+        assert!(
+            !rev_selectors[1].skip && rev_selectors[1].row_count == 110,
+            "Second selector should be select(110) from RG2(80) + RG0(30)"
+        );
+
+        assert!(
+            rev_selectors[2].skip && rev_selectors[2].row_count == 70,
+            "Third selector should be skip(70) from RG0"
+        );
+    }
+}
diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs
index 30b774d08f1d6..d439bc62252c2 100644
--- a/datafusion/datasource-parquet/src/source.rs
+++ b/datafusion/datasource-parquet/src/source.rs
@@ -21,36 +21,49 @@ use std::fmt::Debug;
 use std::fmt::Formatter;
 use std::sync::Arc;
 
-use crate::opener::build_pruning_predicates;
-use crate::opener::ParquetOpener;
-use crate::row_filter::can_expr_be_pushed_down_with_schemas;
 use crate::DefaultParquetFileReaderFactory;
 use crate::ParquetFileReaderFactory;
+use crate::ParquetMorselizer;
+use crate::opener::{
+    EncryptionContext, ParquetMorselizerState, build_pruning_predicates,
+};
+use crate::row_filter::can_expr_be_pushed_down_with_schemas;
 use datafusion_common::config::ConfigOptions;
+#[cfg(feature = "parquet_encryption")]
+use datafusion_common::config::EncryptionFactoryOptions;
 use datafusion_datasource::as_file_source;
 use datafusion_datasource::file_stream::FileOpener;
-use datafusion_datasource::impl_schema_adapter_methods;
-use datafusion_datasource::schema_adapter::{
-    DefaultSchemaAdapterFactory, SchemaAdapterFactory,
-};
+use datafusion_datasource::morsel::Morselizer;
 
-use arrow::datatypes::{SchemaRef, TimeUnit};
+use arrow::datatypes::TimeUnit;
 use datafusion_common::config::TableParquetOptions;
-use datafusion_common::{DataFusionError, Statistics};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, internal_err};
+use datafusion_datasource::TableSchema;
 use datafusion_datasource::file::FileSource;
 use datafusion_datasource::file_scan_config::FileScanConfig;
-use datafusion_physical_expr::conjunction;
-use datafusion_physical_expr_common::physical_expr::fmt_sql;
+use datafusion_physical_expr::projection::ProjectionExprs;
+use datafusion_physical_expr::{EquivalenceProperties, conjunction};
+use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_plan::filter_pushdown::FilterPushdownPropagation;
-use datafusion_physical_plan::filter_pushdown::PredicateSupport;
-use datafusion_physical_plan::filter_pushdown::PredicateSupports;
+use datafusion_physical_expr_common::physical_expr::fmt_sql;
+use datafusion_physical_plan::DisplayFormatType;
+use datafusion_physical_plan::SortOrderPushdownResult;
+use datafusion_physical_plan::filter_pushdown::PushedDown;
+use datafusion_physical_plan::filter_pushdown::{
+    FilterPushdownPropagation, PushedDownPredicate,
+};
 use datafusion_physical_plan::metrics::Count;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-use datafusion_physical_plan::DisplayFormatType;
 
+#[cfg(feature = "parquet_encryption")]
+use datafusion_execution::parquet_encryption::EncryptionFactory;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use itertools::Itertools;
 use object_store::ObjectStore;
+#[cfg(feature = "parquet_encryption")]
+use parquet::encryption::decrypt::FileDecryptionProperties;
+
 /// Execution plan for reading one or more Parquet files.
 ///
 /// ```text
@@ -77,7 +90,6 @@ use object_store::ObjectStore;
 ///  │.───────────────────.│
 ///  │                     )
 ///   `───────────────────'
-///
 /// ```
 ///
 /// # Example: Create a `DataSourceExec`
@@ -96,11 +108,11 @@ use object_store::ObjectStore;
 /// # let object_store_url = ObjectStoreUrl::local_filesystem();
 /// # let predicate = lit(true);
 /// let source = Arc::new(
-///     ParquetSource::default()
-///     .with_predicate(predicate)
+///     ParquetSource::new(Arc::clone(&file_schema))
+///         .with_predicate(predicate)
 /// );
 /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB
-/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source)
+/// let config = FileScanConfigBuilder::new(object_store_url, source)
 ///    .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)).build();
 /// let exec = DataSourceExec::from_data_source(config);
 /// ```
@@ -125,7 +137,7 @@ use object_store::ObjectStore;
 ///   details.
 ///
 /// * Schema evolution: read parquet files with different schemas into a unified
-///   table schema. See [`SchemaAdapterFactory`] for more details.
+///   table schema. See [`DefaultPhysicalExprAdapterFactory`] for more details.
 ///
 /// * metadata_size_hint: controls the number of bytes read from the end of the
 ///   file in the initial I/O when the default [`ParquetFileReaderFactory`]. If a
@@ -166,7 +178,7 @@ use object_store::ObjectStore;
 /// ```no_run
 /// # use std::sync::Arc;
 /// # use arrow::datatypes::Schema;
-/// # use datafusion_datasource::file_scan_config::FileScanConfig;
+/// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
 /// # use datafusion_datasource::PartitionedFile;
 /// # use datafusion_datasource::source::DataSourceExec;
 ///
@@ -180,9 +192,9 @@ use object_store::ObjectStore;
 ///   .iter()
 ///   .map(|file_group| {
 ///     // create a new exec by copying the existing exec's source config
-///     let new_config = base_config
-///         .clone()
-///        .with_file_groups(vec![file_group.clone()]);
+///     let new_config = FileScanConfigBuilder::from(base_config.clone())
+///        .with_file_groups(vec![file_group.clone()])
+///       .build();
 ///
 ///     (DataSourceExec::from_data_source(new_config))
 ///   })
@@ -223,7 +235,7 @@ use object_store::ObjectStore;
 /// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234)
 ///   .with_extensions(Arc::new(access_plan));
 /// // create a FileScanConfig to scan this file
-/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default()))
+/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), Arc::new(ParquetSource::new(schema())))
 ///     .with_file(partitioned_file).build();
 /// // this parquet DataSourceExec will not even try to read row groups 2 and 4. Additional
 /// // pruning based on predicates may also happen
@@ -232,7 +244,7 @@ use object_store::ObjectStore;
 ///
 /// For a complete example, see the [`advanced_parquet_index` example]).
 ///
-/// [`parquet_index_advanced` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_parquet_index.rs
+/// [`parquet_index_advanced` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/parquet_advanced_index.rs
 ///
 /// # Execution Overview
 ///
@@ -252,13 +264,13 @@ use object_store::ObjectStore;
 ///   [`Self::with_pushdown_filters`]).
 ///
 /// * Step 5: As each [`RecordBatch`] is read, it may be adapted by a
-///   [`SchemaAdapter`] to match the table schema. By default missing columns are
-///   filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
+///   [`DefaultPhysicalExprAdapterFactory`] to match the table schema. By default missing columns are
+///   filled with nulls, but this can be customized via [`PhysicalExprAdapterFactory`].
 ///
 /// [`RecordBatch`]: arrow::record_batch::RecordBatch
-/// [`SchemaAdapter`]: datafusion_datasource::schema_adapter::SchemaAdapter
 /// [`ParquetMetadata`]: parquet::file::metadata::ParquetMetaData
-#[derive(Clone, Default, Debug)]
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+#[derive(Clone, Debug)]
 pub struct ParquetSource {
     /// Options for reading Parquet files
     pub(crate) table_parquet_options: TableParquetOptions,
@@ -267,31 +279,61 @@ pub struct ParquetSource {
     /// The schema of the file.
     /// In particular, this is the schema of the table without partition columns,
     /// *not* the physical schema of the file.
-    pub(crate) file_schema: Option<SchemaRef>,
+    pub(crate) table_schema: TableSchema,
     /// Optional predicate for row filtering during parquet scan
     pub(crate) predicate: Option<Arc<dyn PhysicalExpr>>,
     /// Optional user defined parquet file reader factory
     pub(crate) parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
-    /// Optional user defined schema adapter
-    pub(crate) schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
     /// Batch size configuration
     pub(crate) batch_size: Option<usize>,
     /// Optional hint for the size of the parquet metadata
     pub(crate) metadata_size_hint: Option<usize>,
-    pub(crate) projected_statistics: Option<Statistics>,
+    /// Projection to apply to the output.
+    pub(crate) projection: ProjectionExprs,
+    #[cfg(feature = "parquet_encryption")]
+    pub(crate) encryption_factory: Option<Arc<dyn EncryptionFactory>>,
+    /// If true, read files in reverse order and reverse row groups within files.
+    /// But it's not guaranteed that rows within row groups are in reverse order,
+    /// so we still need to sort them after reading, so the reverse scan is inexact.
+    /// Used to optimize ORDER BY ... DESC on sorted data.
+    reverse_row_groups: bool,
 }
 
 impl ParquetSource {
     /// Create a new ParquetSource to read the data specified in the file scan
-    /// configuration with the provided `TableParquetOptions`.
-    /// if default values are going to be used, use `ParguetConfig::default()` instead
-    pub fn new(table_parquet_options: TableParquetOptions) -> Self {
+    /// configuration with the provided schema.
+    ///
+    /// Uses default `TableParquetOptions`.
+    /// To set custom options, use [ParquetSource::with_table_parquet_options`].
+    pub fn new(table_schema: impl Into<TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        // Projection over the full table schema (file columns + partition columns)
+        let full_schema = table_schema.table_schema();
+        let indices: Vec<usize> = (0..full_schema.fields().len()).collect();
         Self {
-            table_parquet_options,
-            ..Self::default()
+            projection: ProjectionExprs::from_indices(&indices, full_schema),
+            table_schema,
+            table_parquet_options: TableParquetOptions::default(),
+            metrics: ExecutionPlanMetricsSet::new(),
+            predicate: None,
+            parquet_file_reader_factory: None,
+            batch_size: None,
+            metadata_size_hint: None,
+            #[cfg(feature = "parquet_encryption")]
+            encryption_factory: None,
+            reverse_row_groups: false,
         }
     }
 
+    /// Set the `TableParquetOptions` for this ParquetSource.
+    pub fn with_table_parquet_options(
+        mut self,
+        table_parquet_options: TableParquetOptions,
+    ) -> Self {
+        self.table_parquet_options = table_parquet_options;
+        self
+    }
+
     /// Set the metadata size hint
     ///
     /// This value determines how many bytes at the end of the file the default
@@ -303,26 +345,31 @@ impl ParquetSource {
         self
     }
 
-    fn with_metrics(mut self, metrics: ExecutionPlanMetricsSet) -> Self {
-        self.metrics = metrics;
-        self
-    }
-
     /// Set predicate information
+    #[expect(clippy::needless_pass_by_value)]
     pub fn with_predicate(&self, predicate: Arc<dyn PhysicalExpr>) -> Self {
         let mut conf = self.clone();
-        let metrics = ExecutionPlanMetricsSet::new();
-        conf = conf.with_metrics(metrics);
         conf.predicate = Some(Arc::clone(&predicate));
         conf
     }
 
+    /// Set the encryption factory to use to generate file decryption properties
+    #[cfg(feature = "parquet_encryption")]
+    pub fn with_encryption_factory(
+        mut self,
+        encryption_factory: Arc<dyn EncryptionFactory>,
+    ) -> Self {
+        self.encryption_factory = Some(encryption_factory);
+        self
+    }
+
     /// Options passed to the parquet reader for this scan
     pub fn table_parquet_options(&self) -> &TableParquetOptions {
         &self.table_parquet_options
     }
 
     /// Optional predicate.
+    #[deprecated(since = "50.2.0", note = "use `filter` instead")]
     pub fn predicate(&self) -> Option<&Arc<dyn PhysicalExpr>> {
         self.predicate.as_ref()
     }
@@ -335,7 +382,6 @@ impl ParquetSource {
     }
 
     /// Optional user defined parquet file reader factory.
-    ///
     pub fn with_parquet_file_reader_factory(
         mut self,
         parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
@@ -345,9 +391,7 @@ impl ParquetSource {
     }
 
     /// If true, the predicate will be used during the parquet scan.
-    /// Defaults to false
-    ///
-    /// [`Expr`]: datafusion_expr::Expr
+    /// Defaults to false.
     pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self {
         self.table_parquet_options.global.pushdown_filters = pushdown_filters;
         self
@@ -374,6 +418,11 @@ impl ParquetSource {
         self.table_parquet_options.global.reorder_filters
     }
 
+    /// Return the value of [`datafusion_common::config::ParquetOptions::force_filter_selections`]
+    fn force_filter_selections(&self) -> bool {
+        self.table_parquet_options.global.force_filter_selections
+    }
+
     /// If enabled, the reader will read the page index
     /// This is used to optimize filter pushdown
     /// via `RowSelector` and `RowFilter` by
@@ -409,71 +458,60 @@ impl ParquetSource {
         self.table_parquet_options.global.bloom_filter_on_read
     }
 
-    /// Applies schema adapter factory from the FileScanConfig if present.
-    ///
-    /// # Arguments
-    /// * `conf` - FileScanConfig that may contain a schema adapter factory
-    /// # Returns
-    /// The converted FileSource with schema adapter factory applied if provided
-    pub fn apply_schema_adapter(self, conf: &FileScanConfig) -> Arc<dyn FileSource> {
-        let file_source: Arc<dyn FileSource> = self.into();
-
-        // If the FileScanConfig.file_source() has a schema adapter factory, apply it
-        if let Some(factory) = conf.file_source().schema_adapter_factory() {
-            file_source.with_schema_adapter_factory(
-                Arc::<dyn SchemaAdapterFactory>::clone(&factory),
-            )
-        } else {
-            file_source
-        }
-    }
-}
-
-/// Parses datafusion.common.config.ParquetOptions.coerce_int96 String to a arrow_schema.datatype.TimeUnit
-pub(crate) fn parse_coerce_int96_string(
-    str_setting: &str,
-) -> datafusion_common::Result<TimeUnit> {
-    let str_setting_lower: &str = &str_setting.to_lowercase();
-
-    match str_setting_lower {
-        "ns" => Ok(TimeUnit::Nanosecond),
-        "us" => Ok(TimeUnit::Microsecond),
-        "ms" => Ok(TimeUnit::Millisecond),
-        "s" => Ok(TimeUnit::Second),
-        _ => Err(DataFusionError::Configuration(format!(
-            "Unknown or unsupported parquet coerce_int96: \
-        {str_setting}. Valid values are: ns, us, ms, and s."
-        ))),
+    /// Return the maximum predicate cache size, in bytes, used when
+    /// `pushdown_filters`
+    pub fn max_predicate_cache_size(&self) -> Option<usize> {
+        self.table_parquet_options.global.max_predicate_cache_size
     }
-}
 
-/// Allows easy conversion from ParquetSource to Arc&lt;dyn FileSource&gt;
-impl From<ParquetSource> for Arc<dyn FileSource> {
-    fn from(source: ParquetSource) -> Self {
-        as_file_source(source)
+    #[cfg(feature = "parquet_encryption")]
+    fn get_encryption_factory_with_config(
+        &self,
+    ) -> Option<(Arc<dyn EncryptionFactory>, EncryptionFactoryOptions)> {
+        match &self.encryption_factory {
+            None => None,
+            Some(factory) => Some((
+                Arc::clone(factory),
+                self.table_parquet_options.crypto.factory_options.clone(),
+            )),
+        }
     }
-}
 
-impl FileSource for ParquetSource {
-    fn create_file_opener(
+    fn create_parquet_morselizer(
         &self,
         object_store: Arc<dyn ObjectStore>,
         base_config: &FileScanConfig,
         partition: usize,
-    ) -> Arc<dyn FileOpener> {
-        let projection = base_config
-            .file_column_projection_indices()
-            .unwrap_or_else(|| (0..base_config.file_schema.fields().len()).collect());
-        let schema_adapter_factory = self
-            .schema_adapter_factory
+    ) -> datafusion_common::Result<ParquetMorselizer> {
+        let expr_adapter_factory = base_config
+            .expr_adapter_factory
             .clone()
-            .unwrap_or_else(|| Arc::new(DefaultSchemaAdapterFactory));
+            .unwrap_or_else(|| Arc::new(DefaultPhysicalExprAdapterFactory) as _);
 
         let parquet_file_reader_factory =
             self.parquet_file_reader_factory.clone().unwrap_or_else(|| {
                 Arc::new(DefaultParquetFileReaderFactory::new(object_store)) as _
             });
 
+        #[cfg(not(feature = "parquet_encryption"))]
+        let encryption_context = EncryptionContext::default();
+
+        #[cfg(feature = "parquet_encryption")]
+        let encryption_context = {
+            let file_decryption_properties = self
+                .table_parquet_options()
+                .crypto
+                .file_decryption
+                .clone()
+                .map(FileDecryptionProperties::from)
+                .map(Arc::new);
+
+            EncryptionContext::new(
+                file_decryption_properties,
+                self.get_encryption_factory_with_config(),
+            )
+        };
+
         let coerce_int96 = self
             .table_parquet_options
             .global
@@ -481,74 +519,124 @@ impl FileSource for ParquetSource {
             .as_ref()
             .map(|time_unit| parse_coerce_int96_string(time_unit.as_str()).unwrap());
 
-        Arc::new(ParquetOpener {
+        Ok(ParquetMorselizer::new(ParquetMorselizerState {
             partition_index: partition,
-            projection: Arc::from(projection),
+            projection: self.projection.clone(),
             batch_size: self
                 .batch_size
-                .expect("Batch size must set before creating ParquetOpener"),
+                .expect("Batch size must set before creating ParquetMorselizer"),
             limit: base_config.limit,
+            preserve_order: base_config.preserve_order,
             predicate: self.predicate.clone(),
-            logical_file_schema: Arc::clone(&base_config.file_schema),
+            table_schema: self.table_schema.clone(),
             metadata_size_hint: self.metadata_size_hint,
             metrics: self.metrics().clone(),
             parquet_file_reader_factory,
             pushdown_filters: self.pushdown_filters(),
             reorder_filters: self.reorder_filters(),
+            force_filter_selections: self.force_filter_selections(),
             enable_page_index: self.enable_page_index(),
             enable_bloom_filter: self.bloom_filter_on_read(),
             enable_row_group_stats_pruning: self.table_parquet_options.global.pruning,
-            schema_adapter_factory,
             coerce_int96,
-        })
+            expr_adapter_factory,
+            encryption_context,
+            max_predicate_cache_size: self.max_predicate_cache_size(),
+            reverse_row_groups: self.reverse_row_groups,
+        }))
+    }
+
+    pub(crate) fn with_reverse_row_groups(mut self, reverse_row_groups: bool) -> Self {
+        self.reverse_row_groups = reverse_row_groups;
+        self
+    }
+    #[cfg(test)]
+    pub(crate) fn reverse_row_groups(&self) -> bool {
+        self.reverse_row_groups
+    }
+}
+
+/// Parses datafusion.common.config.ParquetOptions.coerce_int96 String to a arrow_schema.datatype.TimeUnit
+pub(crate) fn parse_coerce_int96_string(
+    str_setting: &str,
+) -> datafusion_common::Result<TimeUnit> {
+    let str_setting_lower: &str = &str_setting.to_lowercase();
+
+    match str_setting_lower {
+        "ns" => Ok(TimeUnit::Nanosecond),
+        "us" => Ok(TimeUnit::Microsecond),
+        "ms" => Ok(TimeUnit::Millisecond),
+        "s" => Ok(TimeUnit::Second),
+        _ => Err(DataFusionError::Configuration(format!(
+            "Unknown or unsupported parquet coerce_int96: \
+        {str_setting}. Valid values are: ns, us, ms, and s."
+        ))),
+    }
+}
+
+/// Allows easy conversion from ParquetSource to Arc&lt;dyn FileSource&gt;
+impl From<ParquetSource> for Arc<dyn FileSource> {
+    fn from(source: ParquetSource) -> Self {
+        as_file_source(source)
+    }
+}
+
+impl FileSource for ParquetSource {
+    fn create_file_opener(
+        &self,
+        _object_store: Arc<dyn ObjectStore>,
+        _base_config: &FileScanConfig,
+        _partition: usize,
+    ) -> datafusion_common::Result<Arc<dyn FileOpener>> {
+        internal_err!(
+            "ParquetSource::create_file_opener called but it supports Morsel API"
+        )
+    }
+
+    fn create_morselizer(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> datafusion_common::Result<Box<dyn Morselizer>> {
+        let morselizer =
+            self.create_parquet_morselizer(object_store, base_config, partition)?;
+        Ok(Box::new(morselizer))
     }
 
     fn as_any(&self) -> &dyn Any {
         self
     }
 
-    fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
-        let mut conf = self.clone();
-        conf.batch_size = Some(batch_size);
-        Arc::new(conf)
+    fn table_schema(&self) -> &TableSchema {
+        &self.table_schema
     }
 
-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
-        Arc::new(Self {
-            file_schema: Some(schema),
-            ..self.clone()
-        })
+    fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.predicate.clone()
     }
 
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
+    fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
         let mut conf = self.clone();
-        conf.projected_statistics = Some(statistics);
+        conf.batch_size = Some(batch_size);
         Arc::new(conf)
     }
 
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> datafusion_common::Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        source.projection = self.projection.try_merge(projection)?;
+        Ok(Some(Arc::new(source)))
     }
 
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        &self.metrics
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        Some(&self.projection)
     }
 
-    fn statistics(&self) -> datafusion_common::Result<Statistics> {
-        let statistics = &self.projected_statistics;
-        let statistics = statistics
-            .clone()
-            .expect("projected_statistics must be set");
-        // When filters are pushed down, we have no way of knowing the exact statistics.
-        // Note that pruning predicate is also a kind of filter pushdown.
-        // (bloom filters use `pruning_predicate` too).
-        // Because filter pushdown may happen dynamically as long as there is a predicate
-        // if we have *any* predicate applied, we can't guarantee the statistics are exact.
-        if self.predicate().is_some() {
-            Ok(statistics.to_inexact())
-        } else {
-            Ok(statistics)
-        }
+    fn metrics(&self) -> &ExecutionPlanMetricsSet {
+        &self.metrics
     }
 
     fn file_type(&self) -> &str {
@@ -559,12 +647,17 @@ impl FileSource for ParquetSource {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
                 let predicate_string = self
-                    .predicate()
+                    .filter()
                     .map(|p| format!(", predicate={p}"))
                     .unwrap_or_default();
 
                 write!(f, "{predicate_string}")?;
 
+                // Add reverse_scan info if enabled
+                if self.reverse_row_groups {
+                    write!(f, ", reverse_row_groups=true")?;
+                }
+
                 // Try to build a the pruning predicates.
                 // These are only generated here because it's useful to have *some*
                 // idea of what pushdown is happening when viewing plans.
@@ -573,13 +666,11 @@ impl FileSource for ParquetSource {
                 // the actual predicates are built in reference to the physical schema of
                 // each file, which we do not have at this point and hence cannot use.
                 // Instead we use the logical schema of the file (the table schema without partition columns).
-                if let (Some(file_schema), Some(predicate)) =
-                    (&self.file_schema, &self.predicate)
-                {
+                if let Some(predicate) = &self.predicate {
                     let predicate_creation_errors = Count::new();
                     if let (Some(pruning_predicate), _) = build_pruning_predicates(
                         Some(predicate),
-                        file_schema,
+                        self.table_schema.table_schema(),
                         &predicate_creation_errors,
                     ) {
                         let mut guarantees = pruning_predicate
@@ -588,7 +679,7 @@ impl FileSource for ParquetSource {
                             .map(|item| format!("{item}"))
                             .collect_vec();
                         guarantees.sort();
-                        writeln!(
+                        write!(
                             f,
                             ", pruning_predicate={}, required_guarantees=[{}]",
                             pruning_predicate.predicate_expr(),
@@ -599,7 +690,7 @@ impl FileSource for ParquetSource {
                 Ok(())
             }
             DisplayFormatType::TreeRender => {
-                if let Some(predicate) = self.predicate() {
+                if let Some(predicate) = self.filter() {
                     writeln!(f, "predicate={}", fmt_sql(predicate.as_ref()))?;
                 }
                 Ok(())
@@ -612,57 +703,270 @@ impl FileSource for ParquetSource {
         filters: Vec<Arc<dyn PhysicalExpr>>,
         config: &ConfigOptions,
     ) -> datafusion_common::Result<FilterPushdownPropagation<Arc<dyn FileSource>>> {
-        let Some(file_schema) = self.file_schema.clone() else {
-            return Ok(FilterPushdownPropagation::unsupported(filters));
-        };
-        // Can we push down the filters themselves into the scan or only use stats pruning?
+        let table_schema = self.table_schema.table_schema();
+        // Determine if based on configs we should push filters down.
+        // If either the table / scan itself or the config has pushdown enabled,
+        // we will push down the filters.
+        // If both are disabled, we will not push down the filters.
+        // By default they are both disabled.
+        // Regardless of pushdown, we will update the predicate to include the filters
+        // because even if scan pushdown is disabled we can still use the filters for stats pruning.
         let config_pushdown_enabled = config.execution.parquet.pushdown_filters;
         let table_pushdown_enabled = self.pushdown_filters();
         let pushdown_filters = table_pushdown_enabled || config_pushdown_enabled;
 
         let mut source = self.clone();
-        let mut allowed_filters = vec![];
-        let mut remaining_filters = vec![];
-        for filter in &filters {
-            if can_expr_be_pushed_down_with_schemas(filter, &file_schema) {
-                // This filter can be pushed down
-                allowed_filters.push(Arc::clone(filter));
-            } else {
-                // This filter cannot be pushed down
-                remaining_filters.push(Arc::clone(filter));
-            }
-        }
-        if allowed_filters.is_empty() {
+        let filters: Vec<PushedDownPredicate> = filters
+            .into_iter()
+            .map(|filter| {
+                if can_expr_be_pushed_down_with_schemas(&filter, table_schema) {
+                    PushedDownPredicate::supported(filter)
+                } else {
+                    PushedDownPredicate::unsupported(filter)
+                }
+            })
+            .collect();
+        if filters
+            .iter()
+            .all(|f| matches!(f.discriminant, PushedDown::No))
+        {
             // No filters can be pushed down, so we can just return the remaining filters
             // and avoid replacing the source in the physical plan.
-            return Ok(FilterPushdownPropagation::unsupported(filters));
+            return Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+                vec![PushedDown::No; filters.len()],
+            ));
         }
+        let allowed_filters = filters
+            .iter()
+            .filter_map(|f| match f.discriminant {
+                PushedDown::Yes => Some(Arc::clone(&f.predicate)),
+                PushedDown::No => None,
+            })
+            .collect_vec();
         let predicate = match source.predicate {
-            Some(predicate) => conjunction(
-                std::iter::once(predicate).chain(allowed_filters.iter().cloned()),
-            ),
-            None => conjunction(allowed_filters.iter().cloned()),
+            Some(predicate) => {
+                conjunction(std::iter::once(predicate).chain(allowed_filters))
+            }
+            None => conjunction(allowed_filters),
         };
         source.predicate = Some(predicate);
+        source = source.with_pushdown_filters(pushdown_filters);
         let source = Arc::new(source);
-        let filters = PredicateSupports::new(
-            allowed_filters
-                .into_iter()
-                .map(|f| {
-                    if pushdown_filters {
-                        PredicateSupport::Supported(f)
-                    } else {
-                        PredicateSupport::Unsupported(f)
-                    }
+        // If pushdown_filters is false we tell our parents that they still have to handle the filters,
+        // even if we updated the predicate to include the filters (they will only be used for stats pruning).
+        if !pushdown_filters {
+            return Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+                vec![PushedDown::No; filters.len()],
+            )
+            .with_updated_node(source));
+        }
+        Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+            filters.iter().map(|f| f.discriminant).collect(),
+        )
+        .with_updated_node(source))
+    }
+
+    /// Try to optimize the scan to produce data in the requested sort order.
+    ///
+    /// This method receives:
+    /// 1. The query's required ordering (`order` parameter)
+    /// 2. The file's natural ordering (via `self.file_ordering`, set by FileScanConfig)
+    ///
+    /// With both pieces of information, ParquetSource can decide what optimizations to apply.
+    ///
+    /// # Phase 1 Behavior (Current)
+    /// Returns `Inexact` when reversing the row group scan order would help satisfy the
+    /// requested ordering. We still need a Sort operator at a higher level because:
+    /// - We only reverse row group read order, not rows within row groups
+    /// - This provides approximate ordering that benefits limit pushdown
+    ///
+    /// # Phase 2 (Future)
+    /// Could return `Exact` when we can guarantee perfect ordering through techniques like:
+    /// - File reordering based on statistics
+    /// - Detecting already-sorted data
+    ///   This would allow removing the Sort operator entirely.
+    ///
+    /// # Returns
+    /// - `Inexact`: Created an optimized source (e.g., reversed scan) that approximates the order
+    /// - `Unsupported`: Cannot optimize for this ordering
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+        eq_properties: &EquivalenceProperties,
+    ) -> datafusion_common::Result<SortOrderPushdownResult<Arc<dyn FileSource>>> {
+        if order.is_empty() {
+            return Ok(SortOrderPushdownResult::Unsupported);
+        }
+
+        // Build new equivalence properties with the reversed ordering.
+        // This allows us to check if the reversed ordering satisfies the request
+        // by leveraging:
+        // - Function monotonicity (e.g., extract_year_month preserves ordering)
+        // - Constant columns (from filters)
+        // - Other equivalence relationships
+        //
+        // Example flow:
+        // 1. File ordering: [extract_year_month(ws) DESC, ws DESC]
+        // 2. After reversal: [extract_year_month(ws) ASC, ws ASC]
+        // 3. Requested: [ws ASC]
+        // 4. Through extract_year_month's monotonicity property, the reversed
+        //    ordering satisfies [ws ASC] even though it has additional prefix
+        let reversed_eq_properties = {
+            let mut new = eq_properties.clone();
+            new.clear_orderings();
+
+            // Reverse each ordering in the equivalence properties
+            let reversed_orderings = eq_properties
+                .oeq_class()
+                .iter()
+                .map(|ordering| {
+                    ordering
+                        .iter()
+                        .map(|expr| expr.reverse())
+                        .collect::<Vec<_>>()
                 })
-                .chain(
-                    remaining_filters
-                        .into_iter()
-                        .map(PredicateSupport::Unsupported),
-                )
-                .collect(),
-        );
-        Ok(FilterPushdownPropagation::with_filters(filters).with_updated_node(source))
-    }
-    impl_schema_adapter_methods!();
+                .collect::<Vec<_>>();
+
+            new.add_orderings(reversed_orderings);
+            new
+        };
+
+        // Check if the reversed ordering satisfies the requested ordering
+        if !reversed_eq_properties.ordering_satisfy(order.iter().cloned())? {
+            return Ok(SortOrderPushdownResult::Unsupported);
+        }
+
+        // Return Inexact because we're only reversing row group order,
+        // not guaranteeing perfect row-level ordering
+        let new_source = self.clone().with_reverse_row_groups(true);
+        Ok(SortOrderPushdownResult::Inexact {
+            inner: Arc::new(new_source) as Arc<dyn FileSource>,
+        })
+
+        // TODO Phase 2: Add support for other optimizations:
+        // - File reordering based on min/max statistics
+        // - Detection of exact ordering (return Exact to remove Sort operator)
+        // - Partial sort pushdown for prefix matches
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn PhysicalExpr,
+        ) -> datafusion_common::Result<TreeNodeRecursion>,
+    ) -> datafusion_common::Result<TreeNodeRecursion> {
+        // Visit predicate (filter) expression if present
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(predicate) = &self.predicate {
+            tnr = tnr.visit_sibling(|| f(predicate.as_ref()))?;
+        }
+
+        // Visit projection expressions
+        for proj_expr in &self.projection {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+
+        Ok(tnr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Schema;
+    use datafusion_physical_expr::expressions::lit;
+
+    #[test]
+    #[expect(deprecated)]
+    fn test_parquet_source_predicate_same_as_filter() {
+        let predicate = lit(true);
+
+        let parquet_source =
+            ParquetSource::new(Arc::new(Schema::empty())).with_predicate(predicate);
+        // same value. but filter() call Arc::clone internally
+        assert_eq!(parquet_source.predicate(), parquet_source.filter().as_ref());
+    }
+
+    #[test]
+    fn test_reverse_scan_default_value() {
+        use arrow::datatypes::Schema;
+
+        let schema = Arc::new(Schema::empty());
+        let source = ParquetSource::new(schema);
+
+        assert!(!source.reverse_row_groups());
+    }
+
+    #[test]
+    fn test_reverse_scan_with_setter() {
+        use arrow::datatypes::Schema;
+
+        let schema = Arc::new(Schema::empty());
+
+        let source = ParquetSource::new(schema.clone()).with_reverse_row_groups(true);
+        assert!(source.reverse_row_groups());
+
+        let source = source.with_reverse_row_groups(false);
+        assert!(!source.reverse_row_groups());
+    }
+
+    #[test]
+    fn test_reverse_scan_clone_preserves_value() {
+        use arrow::datatypes::Schema;
+
+        let schema = Arc::new(Schema::empty());
+
+        let source = ParquetSource::new(schema).with_reverse_row_groups(true);
+        let cloned = source.clone();
+
+        assert!(cloned.reverse_row_groups());
+        assert_eq!(source.reverse_row_groups(), cloned.reverse_row_groups());
+    }
+
+    #[test]
+    fn test_reverse_scan_with_other_options() {
+        use arrow::datatypes::Schema;
+        use datafusion_common::config::TableParquetOptions;
+
+        let schema = Arc::new(Schema::empty());
+        let options = TableParquetOptions::default();
+
+        let source = ParquetSource::new(schema)
+            .with_table_parquet_options(options)
+            .with_metadata_size_hint(8192)
+            .with_reverse_row_groups(true);
+
+        assert!(source.reverse_row_groups());
+        assert_eq!(source.metadata_size_hint, Some(8192));
+    }
+
+    #[test]
+    fn test_reverse_scan_builder_pattern() {
+        use arrow::datatypes::Schema;
+
+        let schema = Arc::new(Schema::empty());
+
+        let source = ParquetSource::new(schema)
+            .with_reverse_row_groups(true)
+            .with_reverse_row_groups(false)
+            .with_reverse_row_groups(true);
+
+        assert!(source.reverse_row_groups());
+    }
+
+    #[test]
+    fn test_reverse_scan_independent_of_predicate() {
+        use arrow::datatypes::Schema;
+        use datafusion_physical_expr::expressions::lit;
+
+        let schema = Arc::new(Schema::empty());
+        let predicate = lit(true);
+
+        let source = ParquetSource::new(schema)
+            .with_predicate(predicate)
+            .with_reverse_row_groups(true);
+
+        assert!(source.reverse_row_groups());
+        assert!(source.filter().is_some());
+    }
 }
diff --git a/datafusion/datasource-parquet/src/supported_predicates.rs b/datafusion/datasource-parquet/src/supported_predicates.rs
new file mode 100644
index 0000000000000..a205c12dd06aa
--- /dev/null
+++ b/datafusion/datasource-parquet/src/supported_predicates.rs
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Registry of physical expressions that support nested list column pushdown
+//! to the Parquet decoder.
+//!
+//! This module provides a trait-based approach for determining which predicates
+//! can be safely evaluated on nested list columns during Parquet decoding.
+
+use std::sync::Arc;
+
+use datafusion_physical_expr::expressions::{IsNotNullExpr, IsNullExpr};
+use datafusion_physical_expr::{PhysicalExpr, ScalarFunctionExpr};
+
+/// Trait for physical expressions that support list column pushdown during
+/// Parquet decoding.
+///
+/// This trait provides a type-safe mechanism for identifying expressions that
+/// can be safely pushed down to the Parquet decoder for evaluation on nested
+/// list columns.
+///
+/// # Implementation Notes
+///
+/// Expression types in external crates cannot directly implement this trait
+/// due to Rust's orphan rules. Instead, we use a blanket implementation that
+/// delegates to a registration mechanism.
+///
+/// # Examples
+///
+/// ```ignore
+/// use datafusion_physical_expr::PhysicalExpr;
+/// use datafusion_datasource_parquet::SupportsListPushdown;
+///
+/// let expr: Arc<dyn PhysicalExpr> = ...;
+/// if expr.supports_list_pushdown() {
+///     // Can safely push down to Parquet decoder
+/// }
+/// ```
+pub trait SupportsListPushdown {
+    /// Returns `true` if this expression supports list column pushdown.
+    fn supports_list_pushdown(&self) -> bool;
+}
+
+/// Blanket implementation for all physical expressions.
+///
+/// This delegates to specialized predicates that check whether the concrete
+/// expression type is registered as supporting list pushdown. This design
+/// allows the trait to work with expression types defined in external crates.
+impl SupportsListPushdown for dyn PhysicalExpr {
+    fn supports_list_pushdown(&self) -> bool {
+        is_null_check(self) || is_supported_scalar_function(self)
+    }
+}
+
+/// Checks if an expression is a NULL or NOT NULL check.
+///
+/// These checks are universally supported for all column types.
+fn is_null_check(expr: &dyn PhysicalExpr) -> bool {
+    expr.as_any().downcast_ref::<IsNullExpr>().is_some()
+        || expr.as_any().downcast_ref::<IsNotNullExpr>().is_some()
+}
+
+/// Checks if an expression is a scalar function registered for list pushdown.
+///
+/// Returns `true` if the expression is a `ScalarFunctionExpr` whose function
+/// is in the registry of supported operations.
+fn is_supported_scalar_function(expr: &dyn PhysicalExpr) -> bool {
+    scalar_function_name(expr).is_some_and(|name| {
+        // Registry of verified array functions
+        matches!(name, "array_has" | "array_has_all" | "array_has_any")
+    })
+}
+
+fn scalar_function_name(expr: &dyn PhysicalExpr) -> Option<&str> {
+    expr.as_any()
+        .downcast_ref::<ScalarFunctionExpr>()
+        .map(ScalarFunctionExpr::name)
+}
+
+/// Checks whether the given physical expression contains a supported nested
+/// predicate (for example, `array_has_all`).
+///
+/// This function recursively traverses the expression tree to determine if
+/// any node contains predicates that support list column pushdown to the
+/// Parquet decoder.
+///
+/// # Supported predicates
+///
+/// - `IS NULL` and `IS NOT NULL` checks on any column type
+/// - Array functions: `array_has`, `array_has_all`, `array_has_any`
+///
+/// # Returns
+///
+/// `true` if the expression or any of its children contain supported predicates.
+pub fn supports_list_predicates(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    expr.supports_list_pushdown()
+        || expr
+            .children()
+            .iter()
+            .any(|child| supports_list_predicates(child))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_null_check_detection() {
+        use datafusion_physical_expr::expressions::Column;
+
+        let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("test", 0));
+        assert!(!is_null_check(col_expr.as_ref()));
+
+        // IsNullExpr and IsNotNullExpr detection requires actual instances
+        // which need schema setup - tested in integration tests
+    }
+
+    #[test]
+    fn test_supported_scalar_functions() {
+        use datafusion_physical_expr::expressions::Column;
+
+        let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("test", 0));
+
+        // Non-function expressions should return false
+        assert!(!is_supported_scalar_function(col_expr.as_ref()));
+
+        // Testing with actual ScalarFunctionExpr requires function setup
+        // and is better suited for integration tests
+    }
+}
diff --git a/datafusion/datasource-parquet/src/test_data/ndv_test.parquet b/datafusion/datasource-parquet/src/test_data/ndv_test.parquet
new file mode 100644
index 0000000000000..3ecbe320f506e
Binary files /dev/null and b/datafusion/datasource-parquet/src/test_data/ndv_test.parquet differ
diff --git a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs b/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
deleted file mode 100644
index 89406fb742dce..0000000000000
--- a/datafusion/datasource-parquet/tests/apply_schema_adapter_tests.rs
+++ /dev/null
@@ -1,206 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-mod parquet_adapter_tests {
-    use arrow::{
-        datatypes::{DataType, Field, Schema, SchemaRef},
-        record_batch::RecordBatch,
-    };
-    use datafusion_common::{ColumnStatistics, DataFusionError, Result};
-    use datafusion_datasource::{
-        file::FileSource,
-        file_scan_config::FileScanConfigBuilder,
-        schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper},
-    };
-    use datafusion_datasource_parquet::source::ParquetSource;
-    use datafusion_execution::object_store::ObjectStoreUrl;
-    use std::{fmt::Debug, sync::Arc};
-
-    /// A test schema adapter factory that adds prefix to column names
-    #[derive(Debug)]
-    struct PrefixAdapterFactory {
-        prefix: String,
-    }
-
-    impl SchemaAdapterFactory for PrefixAdapterFactory {
-        fn create(
-            &self,
-            projected_table_schema: SchemaRef,
-            _table_schema: SchemaRef,
-        ) -> Box<dyn SchemaAdapter> {
-            Box::new(PrefixAdapter {
-                input_schema: projected_table_schema,
-                prefix: self.prefix.clone(),
-            })
-        }
-    }
-
-    /// A test schema adapter that adds prefix to column names
-    #[derive(Debug)]
-    struct PrefixAdapter {
-        input_schema: SchemaRef,
-        prefix: String,
-    }
-
-    impl SchemaAdapter for PrefixAdapter {
-        fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-            let field = self.input_schema.field(index);
-            file_schema.fields.find(field.name()).map(|(i, _)| i)
-        }
-
-        fn map_schema(
-            &self,
-            file_schema: &Schema,
-        ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-            let mut projection = Vec::with_capacity(file_schema.fields().len());
-            for (file_idx, file_field) in file_schema.fields().iter().enumerate() {
-                if self.input_schema.fields().find(file_field.name()).is_some() {
-                    projection.push(file_idx);
-                }
-            }
-
-            // Create a schema mapper that adds a prefix to column names
-            #[derive(Debug)]
-            struct PrefixSchemaMapping {
-                // Keep only the prefix field which is actually used in the implementation
-                prefix: String,
-            }
-
-            impl SchemaMapper for PrefixSchemaMapping {
-                fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch> {
-                    // Create a new schema with prefixed field names
-                    let prefixed_fields: Vec<Field> = batch
-                        .schema()
-                        .fields()
-                        .iter()
-                        .map(|field| {
-                            Field::new(
-                                format!("{}{}", self.prefix, field.name()),
-                                field.data_type().clone(),
-                                field.is_nullable(),
-                            )
-                        })
-                        .collect();
-                    let prefixed_schema = Arc::new(Schema::new(prefixed_fields));
-
-                    // Create a new batch with the prefixed schema but the same data
-                    let options = arrow::record_batch::RecordBatchOptions::default();
-                    RecordBatch::try_new_with_options(
-                        prefixed_schema,
-                        batch.columns().to_vec(),
-                        &options,
-                    )
-                    .map_err(|e| DataFusionError::ArrowError(e, None))
-                }
-
-                fn map_column_statistics(
-                    &self,
-                    stats: &[ColumnStatistics],
-                ) -> Result<Vec<ColumnStatistics>> {
-                    // For testing, just return the input statistics
-                    Ok(stats.to_vec())
-                }
-            }
-
-            Ok((
-                Arc::new(PrefixSchemaMapping {
-                    prefix: self.prefix.clone(),
-                }),
-                projection,
-            ))
-        }
-    }
-
-    #[test]
-    fn test_apply_schema_adapter_with_factory() {
-        // Create a schema
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-        ]));
-
-        // Create a parquet source
-        let source = ParquetSource::default();
-
-        // Create a file scan config with source that has a schema adapter factory
-        let factory = Arc::new(PrefixAdapterFactory {
-            prefix: "test_".to_string(),
-        });
-
-        let file_source = source.clone().with_schema_adapter_factory(factory);
-
-        let config = FileScanConfigBuilder::new(
-            ObjectStoreUrl::local_filesystem(),
-            schema.clone(),
-            file_source,
-        )
-        .build();
-
-        // Apply schema adapter to a new source
-        let result_source = source.apply_schema_adapter(&config);
-
-        // Verify the adapter was applied
-        assert!(result_source.schema_adapter_factory().is_some());
-
-        // Create adapter and test it produces expected schema
-        let adapter_factory = result_source.schema_adapter_factory().unwrap();
-        let adapter = adapter_factory.create(schema.clone(), schema.clone());
-
-        // Create a dummy batch to test the schema mapping
-        let dummy_batch = RecordBatch::new_empty(schema.clone());
-
-        // Get the file schema (which is the same as the table schema in this test)
-        let (mapper, _) = adapter.map_schema(&schema).unwrap();
-
-        // Apply the mapping to get the output schema
-        let mapped_batch = mapper.map_batch(dummy_batch).unwrap();
-        let output_schema = mapped_batch.schema();
-
-        // Check the column names have the prefix
-        assert_eq!(output_schema.field(0).name(), "test_id");
-        assert_eq!(output_schema.field(1).name(), "test_name");
-    }
-
-    #[test]
-    fn test_apply_schema_adapter_without_factory() {
-        // Create a schema
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("name", DataType::Utf8, true),
-        ]));
-
-        // Create a parquet source
-        let source = ParquetSource::default();
-
-        // Convert to Arc<dyn FileSource>
-        let file_source: Arc<dyn FileSource> = Arc::new(source.clone());
-
-        // Create a file scan config without a schema adapter factory
-        let config = FileScanConfigBuilder::new(
-            ObjectStoreUrl::local_filesystem(),
-            schema.clone(),
-            file_source,
-        )
-        .build();
-
-        // Apply schema adapter function - should pass through the source unchanged
-        let result_source = source.apply_schema_adapter(&config);
-
-        // Verify no adapter was applied
-        assert!(result_source.schema_adapter_factory().is_none());
-    }
-}
diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml
index c936e4c1004c7..f7a8f01746081 100644
--- a/datafusion/datasource/Cargo.toml
+++ b/datafusion/datasource/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-datasource"
 description = "datafusion-datasource"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -31,13 +31,12 @@ version.workspace = true
 all-features = true
 
 [features]
-parquet = ["dep:parquet", "tempfile"]
-compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"]
+compression = ["async-compression", "liblzma", "bzip2", "flate2", "zstd", "tokio-util"]
 default = ["compression"]
 
 [dependencies]
 arrow = { workspace = true }
-async-compression = { version = "0.4.19", features = [
+async-compression = { version = "0.4.40", features = [
     "bzip2",
     "gzip",
     "xz",
@@ -46,35 +45,40 @@ async-compression = { version = "0.4.19", features = [
 ], optional = true }
 async-trait = { workspace = true }
 bytes = { workspace = true }
-bzip2 = { version = "0.5.2", optional = true }
+bzip2 = { workspace = true, optional = true }
 chrono = { workspace = true }
+crossbeam-queue = "0.3"
 datafusion-common = { workspace = true, features = ["object_store"] }
 datafusion-common-runtime = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-adapter = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-session = { workspace = true }
-flate2 = { version = "1.1.1", optional = true }
+flate2 = { workspace = true, optional = true }
 futures = { workspace = true }
-glob = "0.3.0"
+glob = { workspace = true }
 itertools = { workspace = true }
+liblzma = { workspace = true, optional = true }
 log = { workspace = true }
 object_store = { workspace = true }
-parquet = { workspace = true, optional = true }
 rand = { workspace = true }
 tempfile = { workspace = true, optional = true }
 tokio = { workspace = true }
-tokio-util = { version = "0.7.15", features = ["io"], optional = true }
+tokio-util = { version = "0.7.17", features = ["io"], optional = true }
 url = { workspace = true }
-xz2 = { version = "0.1", optional = true, features = ["static"] }
-zstd = { version = "0.13", optional = true, default-features = false }
+zstd = { workspace = true, optional = true }
 
 [dev-dependencies]
 criterion = { workspace = true }
+insta = { workspace = true }
 tempfile = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
diff --git a/datafusion/datasource/README.md b/datafusion/datasource/README.md
index 750ee9375154f..cf0bb7547c078 100644
--- a/datafusion/datasource/README.md
+++ b/datafusion/datasource/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion datasource
+# Apache DataFusion DataSource
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that defines common DataSource related components like FileScanConfig, FileCompression etc.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/datasource/benches/split_groups_by_statistics.rs b/datafusion/datasource/benches/split_groups_by_statistics.rs
index 3876b0b1217b5..e2ae4a9753df8 100644
--- a/datafusion/datasource/benches/split_groups_by_statistics.rs
+++ b/datafusion/datasource/benches/split_groups_by_statistics.rs
@@ -15,14 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::sync::Arc;
+use std::time::Duration;
+
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use datafusion_datasource::file_scan_config::FileScanConfig;
 use datafusion_datasource::{generate_test_files, verify_sort_integrity};
-use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use std::sync::Arc;
-use std::time::Duration;
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
 
 pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
     let file_schema = Arc::new(Schema::new(vec![Field::new(
@@ -31,13 +33,8 @@ pub fn compare_split_groups_by_statistics_algorithms(c: &mut Criterion) {
         false,
     )]));
 
-    let sort_expr = PhysicalSortExpr {
-        expr: Arc::new(datafusion_physical_expr::expressions::Column::new(
-            "value", 0,
-        )),
-        options: arrow::compute::SortOptions::default(),
-    };
-    let sort_ordering = LexOrdering::from(vec![sort_expr]);
+    let sort_expr = PhysicalSortExpr::new_default(Arc::new(Column::new("value", 0)));
+    let sort_ordering = LexOrdering::from([sort_expr]);
 
     // Small, medium, large number of files
     let file_counts = [10, 100, 1000];
diff --git a/datafusion/datasource/src/decoder.rs b/datafusion/datasource/src/decoder.rs
index 654569f741138..9f9fc0d94bb1c 100644
--- a/datafusion/datasource/src/decoder.rs
+++ b/datafusion/datasource/src/decoder.rs
@@ -24,9 +24,9 @@ use arrow::error::ArrowError;
 use bytes::Buf;
 use bytes::Bytes;
 use datafusion_common::Result;
-use futures::stream::BoxStream;
 use futures::StreamExt as _;
-use futures::{ready, Stream};
+use futures::stream::BoxStream;
+use futures::{Stream, ready};
 use std::collections::VecDeque;
 use std::fmt;
 use std::task::Poll;
@@ -175,17 +175,19 @@ pub fn deserialize_stream<'a>(
     mut input: impl Stream<Item = Result<Bytes>> + Unpin + Send + 'a,
     mut deserializer: impl BatchDeserializer<Bytes> + 'a,
 ) -> BoxStream<'a, Result<RecordBatch, ArrowError>> {
-    futures::stream::poll_fn(move |cx| loop {
-        match ready!(input.poll_next_unpin(cx)).transpose()? {
-            Some(b) => _ = deserializer.digest(b),
-            None => deserializer.finish(),
-        };
-
-        return match deserializer.next()? {
-            DeserializerOutput::RecordBatch(rb) => Poll::Ready(Some(Ok(rb))),
-            DeserializerOutput::InputExhausted => Poll::Ready(None),
-            DeserializerOutput::RequiresMoreData => continue,
-        };
+    futures::stream::poll_fn(move |cx| {
+        loop {
+            match ready!(input.poll_next_unpin(cx)).transpose()? {
+                Some(b) => _ = deserializer.digest(b),
+                None => deserializer.finish(),
+            };
+
+            return match deserializer.next()? {
+                DeserializerOutput::RecordBatch(rb) => Poll::Ready(Some(Ok(rb))),
+                DeserializerOutput::InputExhausted => Poll::Ready(None),
+                DeserializerOutput::RequiresMoreData => continue,
+            };
+        }
     })
     .boxed()
 }
diff --git a/datafusion/datasource/src/display.rs b/datafusion/datasource/src/display.rs
index c9e979535963c..0f59e33ff9eac 100644
--- a/datafusion/datasource/src/display.rs
+++ b/datafusion/datasource/src/display.rs
@@ -135,7 +135,7 @@ mod tests {
     use super::*;
 
     use datafusion_physical_plan::{DefaultDisplay, VerboseDisplay};
-    use object_store::{path::Path, ObjectMeta};
+    use object_store::{ObjectMeta, path::Path};
 
     use crate::PartitionedFile;
     use chrono::Utc;
@@ -287,13 +287,6 @@ mod tests {
             version: None,
         };
 
-        PartitionedFile {
-            object_meta,
-            partition_values: vec![],
-            range: None,
-            statistics: None,
-            extensions: None,
-            metadata_size_hint: None,
-        }
+        PartitionedFile::new_from_meta(object_meta)
     }
 }
diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs
index d0557e9f0872c..f13cc863c7761 100644
--- a/datafusion/datasource/src/file.rs
+++ b/datafusion/datasource/src/file.rs
@@ -25,23 +25,35 @@ use std::sync::Arc;
 use crate::file_groups::FileGroupPartitioner;
 use crate::file_scan_config::FileScanConfig;
 use crate::file_stream::FileOpener;
+use crate::morsel::{FileOpenerMorselizer, Morselizer};
+#[expect(deprecated)]
 use crate::schema_adapter::SchemaAdapterFactory;
-use arrow::datatypes::SchemaRef;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{Result, Statistics};
-use datafusion_physical_expr::{LexOrdering, PhysicalExpr};
-use datafusion_physical_plan::filter_pushdown::FilterPushdownPropagation;
-use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, not_impl_err};
+use datafusion_physical_expr::projection::ProjectionExprs;
+use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr};
 use datafusion_physical_plan::DisplayFormatType;
+use datafusion_physical_plan::SortOrderPushdownResult;
+use datafusion_physical_plan::filter_pushdown::{FilterPushdownPropagation, PushedDown};
+use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use object_store::ObjectStore;
 
-/// Helper function to convert any type implementing FileSource to Arc&lt;dyn FileSource&gt;
+/// Helper function to convert any type implementing [`FileSource`] to `Arc<dyn FileSource>`
 pub fn as_file_source<T: FileSource + 'static>(source: T) -> Arc<dyn FileSource> {
     Arc::new(source)
 }
 
-/// file format specific behaviors for elements in [`DataSource`]
+/// File format specific behaviors for [`DataSource`]
+///
+/// # Schema information
+/// There are two important schemas for a [`FileSource`]:
+/// 1. [`Self::table_schema`] -- the schema for the overall table
+///    (file data plus partition columns)
+/// 2. The logical output schema, comprised of [`Self::table_schema`] with
+///    [`Self::projection`] applied
 ///
 /// See more details on specific implementations:
 /// * [`ArrowSource`](https://docs.rs/datafusion/latest/datafusion/datasource/physical_plan/struct.ArrowSource.html)
@@ -52,34 +64,94 @@ pub fn as_file_source<T: FileSource + 'static>(source: T) -> Arc<dyn FileSource>
 ///
 /// [`DataSource`]: crate::source::DataSource
 pub trait FileSource: Send + Sync {
-    /// Creates a `dyn FileOpener` based on given parameters
+    /// Creates a `dyn FileOpener` based on given parameters.
+    ///
+    /// `FileSource`s that implement the Morsel API should return a "Not
+    /// Implemented" or "Internal" error for this API.
+    ///
+    /// TODO: deprecate
     fn create_file_opener(
         &self,
         object_store: Arc<dyn ObjectStore>,
         base_config: &FileScanConfig,
         partition: usize,
-    ) -> Arc<dyn FileOpener>;
+    ) -> Result<Arc<dyn FileOpener>>;
+
+    /// Creates a `dyn Morselizer` based on given parameters.
+    ///
+    /// The default implementation preserves existing behavior by adapting the
+    /// legacy [`FileOpener`] API into a [`Morselizer`]. File formats with a
+    /// native morsel-driven implementation should override this method to
+    /// return a [`Morselizer`] and not implement the [`FileOpener`] API.
+    fn create_morselizer(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> Result<Box<dyn Morselizer>> {
+        let opener = self.create_file_opener(object_store, base_config, partition)?;
+        Ok(Box::new(FileOpenerMorselizer::new(opener)))
+    }
     /// Any
     fn as_any(&self) -> &dyn Any;
+
+    /// Returns the table schema for the overall table (including partition columns, if any)
+    ///
+    /// This method returns the unprojected schema: the full schema of the data
+    /// without [`Self::projection`] applied.
+    ///
+    /// The output schema of this `FileSource` is this TableSchema
+    /// with [`Self::projection`] applied.
+    ///
+    /// Use [`ProjectionExprs::project_schema`] to get the projected schema
+    /// after applying the projection.
+    fn table_schema(&self) -> &crate::table_schema::TableSchema;
+
     /// Initialize new type with batch size configuration
     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource>;
-    /// Initialize new instance with a new schema
-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource>;
-    /// Initialize new instance with projection information
-    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource>;
-    /// Initialize new instance with projected statistics
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource>;
+
+    /// Returns the filter expression that will be applied *during* the file scan.
+    ///
+    /// These expressions are in terms of the unprojected [`Self::table_schema`].
+    fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        None
+    }
+
+    /// Return the projection that will be applied to the output stream on top
+    /// of [`Self::table_schema`].
+    ///
+    /// Note you can use [`ProjectionExprs::project_schema`] on the table
+    /// schema to get the effective output schema of this source.
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        None
+    }
+
     /// Return execution plan metrics
     fn metrics(&self) -> &ExecutionPlanMetricsSet;
-    /// Return projected statistics
-    fn statistics(&self) -> Result<Statistics>;
+
     /// String representation of file source such as "csv", "json", "parquet"
     fn file_type(&self) -> &str;
+
     /// Format FileType specific information
     fn fmt_extra(&self, _t: DisplayFormatType, _f: &mut Formatter) -> fmt::Result {
         Ok(())
     }
 
+    /// Returns whether this file source supports repartitioning files by byte ranges.
+    ///
+    /// When this returns `true`, files can be split into multiple partitions
+    /// based on byte offsets for parallel reading.
+    ///
+    /// When this returns `false`, files cannot be repartitioned (e.g., CSV files
+    /// with `newlines_in_values` enabled cannot be split because record boundaries
+    /// cannot be determined by byte offset alone).
+    ///
+    /// The default implementation returns `true`. File sources that cannot support
+    /// repartitioning should override this method.
+    fn supports_repartitioning(&self) -> bool {
+        true
+    }
+
     /// If supported by the [`FileSource`], redistribute files across partitions
     /// according to their size. Allows custom file formats to implement their
     /// own repartitioning logic.
@@ -93,7 +165,8 @@ pub trait FileSource: Send + Sync {
         output_ordering: Option<LexOrdering>,
         config: &FileScanConfig,
     ) -> Result<Option<FileScanConfig>> {
-        if config.file_compression_type.is_compressed() || config.new_lines_in_values {
+        if config.file_compression_type.is_compressed() || !self.supports_repartitioning()
+        {
             return Ok(None);
         }
 
@@ -112,6 +185,19 @@ pub trait FileSource: Send + Sync {
     }
 
     /// Try to push down filters into this FileSource.
+    ///
+    /// `filters` must be in terms of the unprojected table schema (file schema
+    /// plus partition columns), before any projection is applied.
+    ///
+    /// Any filters that this FileSource chooses to evaluate itself should be
+    /// returned as `PushedDown::Yes` in the result, along with a FileSource
+    /// instance that incorporates those filters. Such filters are logically
+    /// applied "during" the file scan, meaning they may refer to columns not
+    /// included in the final output projection.
+    ///
+    /// Filters that cannot be pushed down should be marked as `PushedDown::No`,
+    /// and will be evaluated by an execution plan after the file source.
+    ///
     /// See [`ExecutionPlan::handle_child_pushdown_result`] for more details.
     ///
     /// [`ExecutionPlan::handle_child_pushdown_result`]: datafusion_physical_plan::ExecutionPlan::handle_child_pushdown_result
@@ -120,25 +206,161 @@ pub trait FileSource: Send + Sync {
         filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn FileSource>>> {
-        Ok(FilterPushdownPropagation::unsupported(filters))
+        Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+            vec![PushedDown::No; filters.len()],
+        ))
+    }
+
+    /// Try to create a new FileSource that can produce data in the specified sort order.
+    ///
+    /// This method attempts to optimize data retrieval to match the requested ordering.
+    /// It receives both the requested ordering and equivalence properties that describe
+    /// the output data from this file source.
+    ///
+    /// # Parameters
+    /// * `order` - The requested sort ordering from the query
+    /// * `eq_properties` - Equivalence properties of the data that will be produced by this
+    ///   file source. These properties describe the ordering, constant columns, and other
+    ///   relationships in the output data, allowing the implementation to determine if
+    ///   optimizations like reversed scanning can help satisfy the requested ordering.
+    ///   This includes information about:
+    ///   - The file's natural ordering (from output_ordering in FileScanConfig)
+    ///   - Constant columns (e.g., from filters like `ticker = 'AAPL'`)
+    ///   - Monotonic functions (e.g., `extract_year_month(timestamp)`)
+    ///   - Other equivalence relationships
+    ///
+    /// # Examples
+    ///
+    /// ## Example 1: Simple reverse
+    /// ```text
+    /// File ordering: [a ASC, b DESC]
+    /// Requested:     [a DESC]
+    /// Reversed file: [a DESC, b ASC]
+    /// Result: Satisfies request (prefix match) → Inexact
+    /// ```
+    ///
+    /// ## Example 2: Monotonic function
+    /// ```text
+    /// File ordering: [extract_year_month(ts) ASC, ts ASC]
+    /// Requested:     [ts DESC]
+    /// Reversed file: [extract_year_month(ts) DESC, ts DESC]
+    /// Result: Through monotonicity, satisfies [ts DESC] → Inexact
+    /// ```
+    ///
+    /// # Returns
+    /// * `Exact` - Created a source that guarantees perfect ordering
+    /// * `Inexact` - Created a source optimized for ordering (e.g., reversed row groups) but not perfectly sorted
+    /// * `Unsupported` - Cannot optimize for this ordering
+    ///
+    /// # Deprecation / migration notes
+    /// - [`Self::try_reverse_output`] was renamed to this method and deprecated since `53.0.0`.
+    ///   Per DataFusion's deprecation guidelines, it will be removed in `59.0.0` or later
+    ///   (6 major versions or 6 months, whichever is longer).
+    /// - New implementations should override [`Self::try_pushdown_sort`] directly.
+    /// - For backwards compatibility, the default implementation of
+    ///   [`Self::try_pushdown_sort`] delegates to the deprecated
+    ///   [`Self::try_reverse_output`] until it is removed. After that point, the
+    ///   default implementation will return [`SortOrderPushdownResult::Unsupported`].
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+        eq_properties: &EquivalenceProperties,
+    ) -> Result<SortOrderPushdownResult<Arc<dyn FileSource>>> {
+        #[expect(deprecated)]
+        self.try_reverse_output(order, eq_properties)
+    }
+
+    /// Deprecated: Renamed to [`Self::try_pushdown_sort`].
+    #[deprecated(
+        since = "53.0.0",
+        note = "Renamed to try_pushdown_sort. This method was never limited to reversing output. It will be removed in 59.0.0 or later."
+    )]
+    fn try_reverse_output(
+        &self,
+        _order: &[PhysicalSortExpr],
+        _eq_properties: &EquivalenceProperties,
+    ) -> Result<SortOrderPushdownResult<Arc<dyn FileSource>>> {
+        Ok(SortOrderPushdownResult::Unsupported)
     }
 
-    /// Set optional schema adapter factory.
+    /// Try to push down a projection into this FileSource.
     ///
-    /// [`SchemaAdapterFactory`] allows user to specify how fields from the
-    /// file get mapped to that of the table schema. The default implementation
-    /// returns the original source.
+    /// `FileSource` implementations that support projection pushdown should
+    /// override this method and return a new `FileSource` instance with the
+    /// projection incorporated.
     ///
-    /// Note: You can implement this method and `schema_adapter_factory`
-    /// automatically using the [`crate::impl_schema_adapter_methods`] macro.
+    /// If a `FileSource` does accept a projection it is expected to handle
+    /// the projection in it's entirety, including partition columns.
+    /// For example, the `FileSource` may translate that projection into a
+    /// file format specific projection (e.g. Parquet can push down struct field access,
+    /// some other file formats like Vortex can push down computed expressions into un-decoded data)
+    /// and also need to handle partition column projection (generally done by replacing partition column
+    /// references with literal values derived from each files partition values).
+    ///
+    /// Not all FileSource's can handle complex expression pushdowns. For example,
+    /// a CSV file source may only support simple column selections. In such cases,
+    /// the `FileSource` can use [`SplitProjection`] and [`ProjectionOpener`]
+    /// to split the projection into a pushdownable part and a non-pushdownable part.
+    /// These helpers also handle partition column projection.
+    ///
+    /// [`SplitProjection`]: crate::projection::SplitProjection
+    /// [`ProjectionOpener`]: crate::projection::ProjectionOpener
+    fn try_pushdown_projection(
+        &self,
+        _projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        Ok(None)
+    }
+
+    /// Deprecated: Set optional schema adapter factory.
+    ///
+    /// `SchemaAdapterFactory` has been removed. Use `PhysicalExprAdapterFactory` instead.
+    /// See `upgrading.md` for more details.
+    #[deprecated(
+        since = "53.0.0",
+        note = "SchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    #[expect(deprecated)]
     fn with_schema_adapter_factory(
         &self,
-        factory: Arc<dyn SchemaAdapterFactory>,
-    ) -> Arc<dyn FileSource>;
+        _factory: Arc<dyn SchemaAdapterFactory>,
+    ) -> Result<Arc<dyn FileSource>> {
+        not_impl_err!(
+            "SchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+        )
+    }
 
-    /// Returns the current schema adapter factory if set
+    /// Deprecated: Returns the current schema adapter factory if set.
     ///
-    /// Note: You can implement this method and `with_schema_adapter_factory`
-    /// automatically using the [`crate::impl_schema_adapter_methods`] macro.
-    fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>>;
+    /// `SchemaAdapterFactory` has been removed. Use `PhysicalExprAdapterFactory` instead.
+    /// See `upgrading.md` for more details.
+    #[deprecated(
+        since = "53.0.0",
+        note = "SchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    #[expect(deprecated)]
+    fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> {
+        None
+    }
+
+    /// Apply a function to all physical expressions used by this file source.
+    ///
+    /// This includes:
+    /// - Filter predicates (which may contain dynamic filters)
+    /// - Projection expressions
+    ///
+    /// The function `f` is called once for each expression. The function should
+    /// return `TreeNodeRecursion::Continue` to continue visiting other expressions,
+    /// or `TreeNodeRecursion::Stop` to stop visiting expressions early.
+    ///
+    /// Implementations must explicitly visit all expressions. There is no default
+    /// implementation to ensure that all FileSource implementations handle this correctly.
+    ///
+    /// See [`ExecutionPlan::apply_expressions`] for more details and examples.
+    ///
+    /// [`ExecutionPlan::apply_expressions`]: datafusion_physical_plan::ExecutionPlan::apply_expressions
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion>;
 }
diff --git a/datafusion/datasource/src/file_compression_type.rs b/datafusion/datasource/src/file_compression_type.rs
index 7cc3142564e9b..89efb580652b1 100644
--- a/datafusion/datasource/src/file_compression_type.rs
+++ b/datafusion/datasource/src/file_compression_type.rs
@@ -21,8 +21,8 @@ use std::str::FromStr;
 
 use datafusion_common::error::{DataFusionError, Result};
 
-use datafusion_common::parsers::CompressionTypeVariant::{self, *};
 use datafusion_common::GetExt;
+use datafusion_common::parsers::CompressionTypeVariant::{self, *};
 
 #[cfg(feature = "compression")]
 use async_compression::tokio::bufread::{
@@ -39,17 +39,17 @@ use bytes::Bytes;
 use bzip2::read::MultiBzDecoder;
 #[cfg(feature = "compression")]
 use flate2::read::MultiGzDecoder;
-use futures::stream::BoxStream;
 use futures::StreamExt;
 #[cfg(feature = "compression")]
 use futures::TryStreamExt;
+use futures::stream::BoxStream;
+#[cfg(feature = "compression")]
+use liblzma::read::XzDecoder;
 use object_store::buffered::BufWriter;
 use tokio::io::AsyncWrite;
 #[cfg(feature = "compression")]
 use tokio_util::io::{ReaderStream, StreamReader};
 #[cfg(feature = "compression")]
-use xz2::read::XzDecoder;
-#[cfg(feature = "compression")]
 use zstd::Decoder as ZstdDecoder;
 
 /// Readable file compression type
@@ -148,32 +148,70 @@ impl FileCompressionType {
             GZIP | BZIP2 | XZ | ZSTD => {
                 return Err(DataFusionError::NotImplemented(
                     "Compression feature is not enabled".to_owned(),
-                ))
+                ));
             }
             UNCOMPRESSED => s.boxed(),
         })
     }
 
     /// Wrap the given `BufWriter` so that it performs compressed writes
-    /// according to this `FileCompressionType`.
+    /// according to this `FileCompressionType` using the default compression level.
     pub fn convert_async_writer(
         &self,
         w: BufWriter,
     ) -> Result<Box<dyn AsyncWrite + Send + Unpin>> {
+        self.convert_async_writer_with_level(w, None)
+    }
+
+    /// Wrap the given `BufWriter` so that it performs compressed writes
+    /// according to this `FileCompressionType`.
+    ///
+    /// If `compression_level` is `Some`, the encoder will use the specified
+    /// compression level. If `None`, the default level for each algorithm is used.
+    pub fn convert_async_writer_with_level(
+        &self,
+        w: BufWriter,
+        compression_level: Option<u32>,
+    ) -> Result<Box<dyn AsyncWrite + Send + Unpin>> {
+        #[cfg(feature = "compression")]
+        use async_compression::Level;
+
         Ok(match self.variant {
             #[cfg(feature = "compression")]
-            GZIP => Box::new(GzipEncoder::new(w)),
+            GZIP => match compression_level {
+                Some(level) => {
+                    Box::new(GzipEncoder::with_quality(w, Level::Precise(level as i32)))
+                }
+                None => Box::new(GzipEncoder::new(w)),
+            },
             #[cfg(feature = "compression")]
-            BZIP2 => Box::new(BzEncoder::new(w)),
+            BZIP2 => match compression_level {
+                Some(level) => {
+                    Box::new(BzEncoder::with_quality(w, Level::Precise(level as i32)))
+                }
+                None => Box::new(BzEncoder::new(w)),
+            },
             #[cfg(feature = "compression")]
-            XZ => Box::new(XzEncoder::new(w)),
+            XZ => match compression_level {
+                Some(level) => {
+                    Box::new(XzEncoder::with_quality(w, Level::Precise(level as i32)))
+                }
+                None => Box::new(XzEncoder::new(w)),
+            },
             #[cfg(feature = "compression")]
-            ZSTD => Box::new(ZstdEncoder::new(w)),
+            ZSTD => match compression_level {
+                Some(level) => {
+                    Box::new(ZstdEncoder::with_quality(w, Level::Precise(level as i32)))
+                }
+                None => Box::new(ZstdEncoder::new(w)),
+            },
             #[cfg(not(feature = "compression"))]
             GZIP | BZIP2 | XZ | ZSTD => {
+                // compression_level is not used when compression feature is disabled
+                let _ = compression_level;
                 return Err(DataFusionError::NotImplemented(
                     "Compression feature is not enabled".to_owned(),
-                ))
+                ));
             }
             UNCOMPRESSED => Box::new(w),
         })
@@ -210,7 +248,7 @@ impl FileCompressionType {
             GZIP | BZIP2 | XZ | ZSTD => {
                 return Err(DataFusionError::NotImplemented(
                     "Compression feature is not enabled".to_owned(),
-                ))
+                ));
             }
             UNCOMPRESSED => s.boxed(),
         })
@@ -237,7 +275,7 @@ impl FileCompressionType {
             GZIP | BZIP2 | XZ | ZSTD => {
                 return Err(DataFusionError::NotImplemented(
                     "Compression feature is not enabled".to_owned(),
-                ))
+                ));
             }
             UNCOMPRESSED => Box::new(r),
         })
diff --git a/datafusion/datasource/src/file_format.rs b/datafusion/datasource/src/file_format.rs
index b2caf5277a25f..9f8fa622d2587 100644
--- a/datafusion/datasource/src/file_format.rs
+++ b/datafusion/datasource/src/file_format.rs
@@ -30,8 +30,9 @@ use crate::file_sink_config::FileSinkConfig;
 
 use arrow::datatypes::SchemaRef;
 use datafusion_common::file_options::file_type::FileType;
-use datafusion_common::{internal_err, not_impl_err, GetExt, Result, Statistics};
+use datafusion_common::{GetExt, Result, Statistics, internal_err, not_impl_err};
 use datafusion_physical_expr::LexRequirement;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use datafusion_physical_plan::ExecutionPlan;
 use datafusion_session::Session;
 
@@ -41,6 +42,35 @@ use object_store::{ObjectMeta, ObjectStore};
 /// Default max records to scan to infer the schema
 pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
 
+/// Metadata fetched from a file, including statistics and ordering.
+///
+/// This struct is returned by [`FileFormat::infer_stats_and_ordering`] to
+/// provide all metadata in a single read, avoiding duplicate I/O operations.
+#[derive(Debug, Clone)]
+#[non_exhaustive]
+pub struct FileMeta {
+    /// Statistics for the file (row counts, byte sizes, column statistics).
+    pub statistics: Statistics,
+    /// The ordering (sort order) of the file, if known.
+    pub ordering: Option<LexOrdering>,
+}
+
+impl FileMeta {
+    /// Creates a new `FileMeta` with the given statistics and no ordering.
+    pub fn new(statistics: Statistics) -> Self {
+        Self {
+            statistics,
+            ordering: None,
+        }
+    }
+
+    /// Sets the ordering for this file metadata.
+    pub fn with_ordering(mut self, ordering: Option<LexOrdering>) -> Self {
+        self.ordering = ordering;
+        self
+    }
+}
+
 /// This trait abstracts all the file format specific implementations
 /// from the [`TableProvider`]. This helps code re-utilization across
 /// providers that support the same file formats.
@@ -48,7 +78,7 @@ pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
 /// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html
 #[async_trait]
 pub trait FileFormat: Send + Sync + fmt::Debug {
-    /// Returns the table provider as [`Any`](std::any::Any) so that it can be
+    /// Returns the table provider as [`Any`] so that it can be
     /// downcast to a specific implementation.
     fn as_any(&self) -> &dyn Any;
 
@@ -61,6 +91,9 @@ pub trait FileFormat: Send + Sync + fmt::Debug {
         _file_compression_type: &FileCompressionType,
     ) -> Result<String>;
 
+    /// Returns whether this instance uses compression if applicable
+    fn compression_type(&self) -> Option<FileCompressionType>;
+
     /// Infer the common schema of the provided objects. The objects will usually
     /// be analysed up to a given number of records or files (as specified in the
     /// format config) then give the estimated common schema. This might fail if
@@ -87,6 +120,52 @@ pub trait FileFormat: Send + Sync + fmt::Debug {
         object: &ObjectMeta,
     ) -> Result<Statistics>;
 
+    /// Infer the ordering (sort order) for the provided object from file metadata.
+    ///
+    /// Returns `Ok(None)` if the file format does not support ordering inference
+    /// or if the file does not have ordering information.
+    ///
+    /// `table_schema` is the (combined) schema of the overall table
+    /// and may be a superset of the schema contained in this file.
+    ///
+    /// The default implementation returns `Ok(None)`.
+    async fn infer_ordering(
+        &self,
+        _state: &dyn Session,
+        _store: &Arc<dyn ObjectStore>,
+        _table_schema: SchemaRef,
+        _object: &ObjectMeta,
+    ) -> Result<Option<LexOrdering>> {
+        Ok(None)
+    }
+
+    /// Infer both statistics and ordering from a single metadata read.
+    ///
+    /// This is more efficient than calling [`Self::infer_stats`] and
+    /// [`Self::infer_ordering`] separately when both are needed, as it avoids
+    /// reading file metadata twice.
+    ///
+    /// The default implementation calls both methods separately. File formats
+    /// that can extract both from a single read should override this method.
+    async fn infer_stats_and_ordering(
+        &self,
+        state: &dyn Session,
+        store: &Arc<dyn ObjectStore>,
+        table_schema: SchemaRef,
+        object: &ObjectMeta,
+    ) -> Result<FileMeta> {
+        let statistics = self
+            .infer_stats(state, store, Arc::clone(&table_schema), object)
+            .await?;
+        let ordering = self
+            .infer_ordering(state, store, table_schema, object)
+            .await?;
+        Ok(FileMeta {
+            statistics,
+            ordering,
+        })
+    }
+
     /// Take a list of files and convert it to the appropriate executor
     /// according to this file format.
     async fn create_physical_plan(
@@ -108,7 +187,10 @@ pub trait FileFormat: Send + Sync + fmt::Debug {
     }
 
     /// Return the related FileSource such as `CsvSource`, `JsonSource`, etc.
-    fn file_source(&self) -> Arc<dyn FileSource>;
+    ///
+    /// # Arguments
+    /// * `table_schema` - The table schema to use for the FileSource (includes partition columns)
+    fn file_source(&self, table_schema: crate::TableSchema) -> Arc<dyn FileSource>;
 }
 
 /// Factory for creating [`FileFormat`] instances based on session and command level options
diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs
index 8bfadbef775cf..28a403ab92ad8 100644
--- a/datafusion/datasource/src/file_groups.rs
+++ b/datafusion/datasource/src/file_groups.rs
@@ -18,13 +18,15 @@
 //! Logic for managing groups of [`PartitionedFile`]s in DataFusion
 
 use crate::{FileRange, PartitionedFile};
+use arrow::compute::SortOptions;
 use datafusion_common::Statistics;
+use datafusion_common::utils::compare_rows;
 use itertools::Itertools;
-use std::cmp::min;
-use std::collections::BinaryHeap;
+use std::cmp::{Ordering, min};
+use std::collections::{BinaryHeap, HashMap};
 use std::iter::repeat_with;
 use std::mem;
-use std::ops::{Index, IndexMut};
+use std::ops::{Deref, DerefMut, Index, IndexMut};
 use std::sync::Arc;
 
 /// Repartition input files into `target_partitions` partitions, if total file size exceed
@@ -189,15 +191,6 @@ impl FileGroupPartitioner {
             return None;
         }
 
-        // Perform redistribution only in case all files should be read from beginning to end
-        let has_ranges = file_groups
-            .iter()
-            .flat_map(FileGroup::iter)
-            .any(|f| f.range.is_some());
-        if has_ranges {
-            return None;
-        }
-
         //  special case when order must be preserved
         if self.preserve_order_within_groups {
             self.repartition_preserving_order(file_groups)
@@ -218,14 +211,13 @@ impl FileGroupPartitioner {
 
         let total_size = flattened_files
             .iter()
-            .map(|f| f.object_meta.size as i64)
-            .sum::<i64>();
-        if total_size < (repartition_file_min_size as i64) || total_size == 0 {
+            .map(|f| f.effective_size())
+            .sum::<u64>();
+        if total_size < (repartition_file_min_size as u64) || total_size == 0 {
             return None;
         }
 
-        let target_partition_size =
-            (total_size as u64).div_ceil(target_partitions as u64);
+        let target_partition_size = total_size.div_ceil(target_partitions as u64);
 
         let current_partition_index: usize = 0;
         let current_partition_size: u64 = 0;
@@ -235,13 +227,14 @@ impl FileGroupPartitioner {
             .into_iter()
             .scan(
                 (current_partition_index, current_partition_size),
-                |state, source_file| {
+                |(current_partition_index, current_partition_size), source_file| {
                     let mut produced_files = vec![];
-                    let mut range_start = 0;
-                    while range_start < source_file.object_meta.size {
+                    let (mut range_start, file_end) = source_file.range();
+                    while range_start < file_end {
                         let range_end = min(
-                            range_start + (target_partition_size - state.1),
-                            source_file.object_meta.size,
+                            range_start
+                                + (target_partition_size - *current_partition_size),
+                            file_end,
                         );
 
                         let mut produced_file = source_file.clone();
@@ -249,13 +242,15 @@ impl FileGroupPartitioner {
                             start: range_start as i64,
                             end: range_end as i64,
                         });
-                        produced_files.push((state.0, produced_file));
+                        produced_files.push((*current_partition_index, produced_file));
 
-                        if state.1 + (range_end - range_start) >= target_partition_size {
-                            state.0 += 1;
-                            state.1 = 0;
+                        if *current_partition_size + (range_end - range_start)
+                            >= target_partition_size
+                        {
+                            *current_partition_index += 1;
+                            *current_partition_size = 0;
                         } else {
-                            state.1 += range_end - range_start;
+                            *current_partition_size += range_end - range_start;
                         }
                         range_start = range_end;
                     }
@@ -297,13 +292,14 @@ impl FileGroupPartitioner {
                 if group.len() == 1 {
                     Some(ToRepartition {
                         source_index: group_index,
-                        file_size: group[0].object_meta.size,
+                        file_size: group[0].effective_size(),
                         new_groups: vec![group_index],
                     })
                 } else {
                     None
                 }
             })
+            .map(CompareByRangeSize)
             .collect();
 
         // No files can be redistributed
@@ -332,28 +328,31 @@ impl FileGroupPartitioner {
 
         // Distribute files to their newly assigned groups
         while let Some(to_repartition) = heap.pop() {
-            let range_size = to_repartition.range_size() as i64;
+            let range_size = to_repartition.range_size();
             let ToRepartition {
                 source_index,
-                file_size,
+                file_size: _,
                 new_groups,
-            } = to_repartition;
+            } = to_repartition.into_inner();
             assert_eq!(file_groups[source_index].len(), 1);
             let original_file = file_groups[source_index].pop().unwrap();
 
             let last_group = new_groups.len() - 1;
-            let mut range_start: i64 = 0;
-            let mut range_end: i64 = range_size;
+            let (mut range_start, file_end) = original_file.range();
+            let mut range_end = range_start + range_size;
             for (i, group_index) in new_groups.into_iter().enumerate() {
                 let target_group = &mut file_groups[group_index];
                 assert!(target_group.is_empty());
 
                 // adjust last range to include the entire file
                 if i == last_group {
-                    range_end = file_size as i64;
+                    range_end = file_end;
                 }
-                target_group
-                    .push(original_file.clone().with_range(range_start, range_end));
+                target_group.push(
+                    original_file
+                        .clone()
+                        .with_range(range_start as i64, range_end as i64),
+                );
                 range_start = range_end;
                 range_end += range_size;
             }
@@ -365,11 +364,27 @@ impl FileGroupPartitioner {
 
 /// Represents a group of partitioned files that'll be processed by a single thread.
 /// Maintains optional statistics across all files in the group.
+///
+/// # Statistics
+///
+/// The group-level [`FileGroup::file_statistics`] field contains merged statistics from all files
+/// in the group for the **full table schema** (file columns + partition columns).
+///
+/// Partition column statistics are derived from the individual file partition values:
+/// - `min` = minimum partition value across all files in the group
+/// - `max` = maximum partition value across all files in the group
+/// - `null_count` = 0 (partition values are never null)
+///
+/// This allows query optimizers to prune entire file groups based on partition bounds.
 #[derive(Debug, Clone)]
 pub struct FileGroup {
     /// The files in this group
     files: Vec<PartitionedFile>,
-    /// Optional statistics for the data across all files in the group
+    /// Optional statistics for the data across all files in the group.
+    ///
+    /// These statistics cover the full table schema: file columns plus partition columns.
+    /// Partition column statistics are merged from individual [`PartitionedFile::statistics`],
+    /// which compute exact values from [`PartitionedFile::partition_values`].
     statistics: Option<Arc<Statistics>>,
 }
 
@@ -416,8 +431,8 @@ impl FileGroup {
     }
 
     /// Adds a file to the group
-    pub fn push(&mut self, file: PartitionedFile) {
-        self.files.push(file);
+    pub fn push(&mut self, partitioned_file: PartitionedFile) {
+        self.files.push(partitioned_file);
     }
 
     /// Get the specific file statistics for the given index
@@ -467,6 +482,64 @@ impl FileGroup {
 
         chunks
     }
+
+    /// Groups files by their partition values, ensuring all files with same
+    /// partition values are in the same group.
+    ///
+    /// Note: May return fewer groups than `max_target_partitions` when the
+    /// number of unique partition values is less than the target.
+    pub fn group_by_partition_values(
+        self,
+        max_target_partitions: usize,
+    ) -> Vec<FileGroup> {
+        if self.is_empty() || max_target_partitions == 0 {
+            return vec![];
+        }
+
+        let mut partition_groups: HashMap<
+            Vec<datafusion_common::ScalarValue>,
+            Vec<PartitionedFile>,
+        > = HashMap::new();
+
+        for file in self.files {
+            partition_groups
+                .entry(file.partition_values.clone())
+                .or_default()
+                .push(file);
+        }
+
+        let num_unique_partitions = partition_groups.len();
+
+        // Sort for deterministic bucket assignment across query executions.
+        let mut sorted_partitions: Vec<_> = partition_groups.into_iter().collect();
+        let sort_options =
+            vec![
+                SortOptions::default();
+                sorted_partitions.first().map(|(k, _)| k.len()).unwrap_or(0)
+            ];
+        sorted_partitions.sort_by(|a, b| {
+            compare_rows(&a.0, &b.0, &sort_options).unwrap_or(Ordering::Equal)
+        });
+
+        if num_unique_partitions <= max_target_partitions {
+            sorted_partitions
+                .into_iter()
+                .map(|(_, files)| FileGroup::new(files))
+                .collect()
+        } else {
+            // Merge into max_target_partitions buckets using round-robin.
+            // This maintains grouping by partition value as we are merging groups which already
+            // contain all values for a partition key.
+            let mut target_groups = vec![vec![]; max_target_partitions];
+
+            for (idx, (_, files)) in sorted_partitions.into_iter().enumerate() {
+                let bucket = idx % max_target_partitions;
+                target_groups[bucket].extend(files);
+            }
+
+            target_groups.into_iter().map(FileGroup::new).collect()
+        }
+    }
 }
 
 impl Index<usize> for FileGroup {
@@ -503,7 +576,7 @@ impl Default for FileGroup {
 }
 
 /// Tracks how a individual file will be repartitioned
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone)]
 struct ToRepartition {
     /// the index from which the original file will be taken
     source_index: usize,
@@ -520,22 +593,45 @@ impl ToRepartition {
     }
 }
 
-impl PartialOrd for ToRepartition {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+struct CompareByRangeSize(ToRepartition);
+impl CompareByRangeSize {
+    fn into_inner(self) -> ToRepartition {
+        self.0
+    }
+}
+impl Ord for CompareByRangeSize {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.range_size().cmp(&other.0.range_size())
+    }
+}
+impl PartialOrd for CompareByRangeSize {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
-
-/// Order based on individual range
-impl Ord for ToRepartition {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.range_size().cmp(&other.range_size())
+impl PartialEq for CompareByRangeSize {
+    fn eq(&self, other: &Self) -> bool {
+        // PartialEq must be consistent with PartialOrd
+        self.cmp(other) == Ordering::Equal
+    }
+}
+impl Eq for CompareByRangeSize {}
+impl Deref for CompareByRangeSize {
+    type Target = ToRepartition;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+impl DerefMut for CompareByRangeSize {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
     }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
+    use datafusion_common::ScalarValue;
 
     /// Empty file won't get partitioned
     #[test]
@@ -622,6 +718,68 @@ mod test {
         assert_partitioned_files(expected, actual);
     }
 
+    #[test]
+    fn repartition_single_file_with_range() {
+        // Single file, single partition into multiple partitions
+        let single_partition =
+            vec![FileGroup::new(vec![pfile("a", 123).with_range(0, 123)])];
+
+        let actual = FileGroupPartitioner::new()
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&single_partition);
+
+        let expected = Some(vec![
+            FileGroup::new(vec![pfile("a", 123).with_range(0, 31)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(31, 62)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(62, 93)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(93, 123)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
+    #[test]
+    fn repartition_single_file_with_incomplete_range() {
+        // Single file, single partition into multiple partitions
+        let single_partition =
+            vec![FileGroup::new(vec![pfile("a", 123).with_range(10, 100)])];
+
+        let actual = FileGroupPartitioner::new()
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&single_partition);
+
+        let expected = Some(vec![
+            FileGroup::new(vec![pfile("a", 123).with_range(10, 33)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(33, 56)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(56, 79)]),
+            FileGroup::new(vec![pfile("a", 123).with_range(79, 100)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
+    #[test]
+    fn repartition_single_file_duplicated_with_range() {
+        // Single file, two partitions into multiple partitions
+        let single_partition = vec![FileGroup::new(vec![
+            pfile("a", 100).with_range(0, 50),
+            pfile("a", 100).with_range(50, 100),
+        ])];
+
+        let actual = FileGroupPartitioner::new()
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&single_partition);
+
+        let expected = Some(vec![
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 25)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(25, 50)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(50, 75)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(75, 100)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
     #[test]
     fn repartition_too_much_partitions() {
         // Single file, single partition into 96 partitions
@@ -694,22 +852,6 @@ mod test {
         assert_partitioned_files(expected, actual);
     }
 
-    #[test]
-    fn repartition_no_action_ranges() {
-        // No action due to Some(range) in second file
-        let source_partitions = vec![
-            FileGroup::new(vec![pfile("a", 123)]),
-            FileGroup::new(vec![pfile("b", 144).with_range(1, 50)]),
-        ];
-
-        let actual = FileGroupPartitioner::new()
-            .with_target_partitions(65)
-            .with_repartition_file_min_size(10)
-            .repartition_file_groups(&source_partitions);
-
-        assert_partitioned_files(None, actual)
-    }
-
     #[test]
     fn repartition_no_action_min_size() {
         // No action due to target_partition_size
@@ -786,6 +928,26 @@ mod test {
         assert_partitioned_files(expected, actual);
     }
 
+    #[test]
+    fn repartition_ordered_one_large_file_with_range() {
+        // "Rebalance" the single large file across partitions
+        let source_partitions =
+            vec![FileGroup::new(vec![pfile("a", 100).with_range(0, 100)])];
+
+        let actual = FileGroupPartitioner::new()
+            .with_preserve_order_within_groups(true)
+            .with_target_partitions(3)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&source_partitions);
+
+        let expected = Some(vec![
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 34)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(34, 68)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(68, 100)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
     #[test]
     fn repartition_ordered_one_large_one_small_file() {
         // "Rebalance" the single large file across empty partitions, but can't split
@@ -814,6 +976,91 @@ mod test {
         assert_partitioned_files(expected, actual);
     }
 
+    #[test]
+    fn repartition_ordered_one_large_one_small_file_with_full_range() {
+        // "Rebalance" the single large file across empty partitions, but can't split
+        // small file
+        let source_partitions = vec![
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 100)]),
+            FileGroup::new(vec![pfile("b", 30)]),
+        ];
+
+        let actual = FileGroupPartitioner::new()
+            .with_preserve_order_within_groups(true)
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&source_partitions);
+
+        let expected = Some(vec![
+            // scan first third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 33)]),
+            // only b in this group (can't do this)
+            FileGroup::new(vec![pfile("b", 30).with_range(0, 30)]),
+            // second third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(33, 66)]),
+            // final third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(66, 100)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
+    #[test]
+    fn repartition_ordered_one_large_one_small_file_with_split_range() {
+        // "Rebalance" the single large file across empty partitions, but can't split
+        // small file
+        let source_partitions = vec![
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 50)]),
+            FileGroup::new(vec![pfile("a", 100).with_range(50, 100)]),
+            FileGroup::new(vec![pfile("b", 30)]),
+        ];
+
+        let actual = FileGroupPartitioner::new()
+            .with_preserve_order_within_groups(true)
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&source_partitions);
+
+        let expected = Some(vec![
+            // scan first half of first "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(0, 25)]),
+            // second "a" fully (not split)
+            FileGroup::new(vec![pfile("a", 100).with_range(50, 100)]),
+            // only b in this group (can't do this)
+            FileGroup::new(vec![pfile("b", 30).with_range(0, 30)]),
+            // second half of first "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(25, 50)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
+    #[test]
+    fn repartition_ordered_one_large_one_small_file_with_non_full_range() {
+        // "Rebalance" the single large file across empty partitions, but can't split
+        // small file
+        let source_partitions = vec![
+            FileGroup::new(vec![pfile("a", 100).with_range(20, 80)]),
+            FileGroup::new(vec![pfile("b", 30).with_range(5, 25)]),
+        ];
+
+        let actual = FileGroupPartitioner::new()
+            .with_preserve_order_within_groups(true)
+            .with_target_partitions(4)
+            .with_repartition_file_min_size(10)
+            .repartition_file_groups(&source_partitions);
+
+        let expected = Some(vec![
+            // scan first third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(20, 40)]),
+            // only b in this group (can't split this)
+            FileGroup::new(vec![pfile("b", 30).with_range(5, 25)]),
+            // second third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(40, 60)]),
+            // final third of "a"
+            FileGroup::new(vec![pfile("a", 100).with_range(60, 80)]),
+        ]);
+        assert_partitioned_files(expected, actual);
+    }
+
     #[test]
     fn repartition_ordered_two_large_files() {
         // "Rebalance" two large files across empty partitions, but can't mix them
@@ -975,6 +1222,13 @@ mod test {
         PartitionedFile::new(path, file_size)
     }
 
+    /// Creates a file with partition value with a static size of 10.
+    fn pfile_with_pv(path: &str, pv: &str) -> PartitionedFile {
+        let mut file = pfile(path, 10);
+        file.partition_values = vec![ScalarValue::from(pv)];
+        file
+    }
+
     /// repartition the file groups both with and without preserving order
     /// asserting they return the same value and returns that value
     fn repartition_test(
@@ -990,4 +1244,50 @@ mod test {
         assert_partitioned_files(repartitioned.clone(), repartitioned_preserving_sort);
         repartitioned
     }
+
+    #[test]
+    fn test_group_by_partition_values_edge_cases() {
+        // Edge cases: empty and zero target
+        assert!(FileGroup::default().group_by_partition_values(4).is_empty());
+        assert!(
+            FileGroup::new(vec![pfile("a", 100)])
+                .group_by_partition_values(0)
+                .is_empty()
+        );
+    }
+
+    #[test]
+    fn test_group_by_partition_values_less_groups_than_target() {
+        // File a and b have partition value p1.
+        // File c has partition value p2.
+        // Grouping by partition value should not redistribute any files since the number of partition
+        // values <= max_target_partitions.
+        let fg = FileGroup::new(vec![
+            pfile_with_pv("a", "p1"),
+            pfile_with_pv("b", "p1"),
+            pfile_with_pv("c", "p2"),
+        ]);
+        let groups = fg.group_by_partition_values(4);
+        assert_eq!(groups.len(), 2);
+        assert_eq!(groups[0].len(), 2);
+        assert_eq!(groups[1].len(), 1);
+    }
+
+    #[test]
+    fn test_group_by_partition_values_more_groups_than_target() {
+        // Each file has a single partition value. The number of partition values > max_target_partitions, so
+        // they should be round-robin distributed into groups.
+        let fg = FileGroup::new(vec![
+            pfile_with_pv("a", "p1"),
+            pfile_with_pv("b", "p2"),
+            pfile_with_pv("c", "p3"),
+            pfile_with_pv("d", "p4"),
+            pfile_with_pv("e", "p5"),
+        ]);
+        let groups = fg.group_by_partition_values(3);
+        assert_eq!(groups.len(), 3);
+        assert_eq!(groups[0].len(), 2);
+        assert_eq!(groups[1].len(), 2);
+        assert_eq!(groups[2].len(), 1);
+    }
 }
diff --git a/datafusion/datasource/src/file_meta.rs b/datafusion/datasource/src/file_meta.rs
deleted file mode 100644
index 098a15eeb38a2..0000000000000
--- a/datafusion/datasource/src/file_meta.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-
-use object_store::{path::Path, ObjectMeta};
-
-use crate::FileRange;
-
-/// A single file or part of a file that should be read, along with its schema, statistics
-pub struct FileMeta {
-    /// Path for the file (e.g. URL, filesystem path, etc)
-    pub object_meta: ObjectMeta,
-    /// An optional file range for a more fine-grained parallel execution
-    pub range: Option<FileRange>,
-    /// An optional field for user defined per object metadata
-    pub extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
-    /// Size hint for the metadata of this file
-    pub metadata_size_hint: Option<usize>,
-}
-
-impl FileMeta {
-    /// The full path to the object
-    pub fn location(&self) -> &Path {
-        &self.object_meta.location
-    }
-}
-
-impl From<ObjectMeta> for FileMeta {
-    fn from(object_meta: ObjectMeta) -> Self {
-        Self {
-            object_meta,
-            range: None,
-            extensions: None,
-            metadata_size_hint: None,
-        }
-    }
-}
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
index 39431f3e9f36a..c310d2bdf04ad 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -18,56 +18,59 @@
 //! [`FileScanConfig`] to configure scanning of possibly partitioned
 //! file sources.
 
-use std::{
-    any::Any, borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter,
-    fmt::Result as FmtResult, marker::PhantomData, sync::Arc,
-};
-
 use crate::file_groups::FileGroup;
-#[allow(unused_imports)]
-use crate::schema_adapter::SchemaAdapterFactory;
 use crate::{
-    display::FileGroupsDisplay,
-    file::FileSource,
-    file_compression_type::FileCompressionType,
-    file_stream::FileStream,
-    source::{DataSource, DataSourceExec},
-    statistics::MinMaxStatistics,
-    PartitionedFile,
-};
-use arrow::{
-    array::{
-        ArrayData, ArrayRef, BufferBuilder, DictionaryArray, RecordBatch,
-        RecordBatchOptions,
-    },
-    buffer::Buffer,
-    datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type},
+    PartitionedFile, display::FileGroupsDisplay, file::FileSource,
+    file_compression_type::FileCompressionType, file_stream::FileStreamBuilder,
+    source::DataSource, statistics::MinMaxStatistics,
 };
+use arrow::datatypes::FieldRef;
+use arrow::datatypes::{DataType, Schema, SchemaRef};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{
-    config::ConfigOptions, exec_err, ColumnStatistics, Constraints, Result, Statistics,
+    Constraints, Result, ScalarValue, Statistics, internal_datafusion_err, internal_err,
 };
-use datafusion_common::{DataFusionError, ScalarValue};
 use datafusion_execution::{
-    object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext,
-};
-use datafusion_physical_expr::PhysicalExpr;
-use datafusion_physical_expr::{
-    expressions::Column, EquivalenceProperties, LexOrdering, Partitioning,
-    PhysicalSortExpr,
+    SendableRecordBatchStream, TaskContext, object_store::ObjectStoreUrl,
 };
-use datafusion_physical_plan::filter_pushdown::FilterPushdownPropagation;
+use datafusion_expr::Operator;
+
+use datafusion_physical_expr::equivalence::project_orderings;
+use datafusion_physical_expr::expressions::{BinaryExpr, Column};
+use datafusion_physical_expr::projection::ProjectionExprs;
+use datafusion_physical_expr::utils::reassign_expr_columns;
+use datafusion_physical_expr::{EquivalenceProperties, Partitioning, split_conjunction};
+use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_plan::SortOrderPushdownResult;
+use datafusion_physical_plan::coop::cooperative;
+use datafusion_physical_plan::execution_plan::SchedulingType;
 use datafusion_physical_plan::{
-    display::{display_orderings, ProjectSchemaDisplay},
+    DisplayAs, DisplayFormatType,
+    display::{ProjectSchemaDisplay, display_orderings},
+    filter_pushdown::FilterPushdownPropagation,
     metrics::ExecutionPlanMetricsSet,
-    projection::{all_alias_free_columns, new_projections_for_columns, ProjectionExec},
-    DisplayAs, DisplayFormatType, ExecutionPlan,
 };
 use log::{debug, warn};
+use std::{any::Any, fmt::Debug, fmt::Formatter, fmt::Result as FmtResult, sync::Arc};
 
-/// The base configurations for a [`DataSourceExec`], the a physical plan for
-/// any given file format.
+/// [`FileScanConfig`] represents scanning data from a group of files
 ///
-/// Use [`Self::build`] to create a [`DataSourceExec`] from a ``FileScanConfig`.
+/// `FileScanConfig` is used to create a [`DataSourceExec`], the physical plan
+/// for scanning files with a particular file format.
+///
+/// The [`FileSource`] (e.g. `ParquetSource`, `CsvSource`, etc.) is responsible
+/// for creating the actual execution plan to read the files based on a
+/// `FileScanConfig`. Fields in a `FileScanConfig` such as Statistics represent
+/// information about the files **before** any projection or filtering is
+/// applied in the file source.
+///
+/// Use [`FileScanConfigBuilder`] to construct a `FileScanConfig`.
+///
+/// Use [`DataSourceExec::from_data_source`] to create a [`DataSourceExec`] from
+/// a `FileScanConfig`.
 ///
 /// # Example
 /// ```
@@ -75,17 +78,20 @@ use log::{debug, warn};
 /// # use std::sync::Arc;
 /// # use arrow::datatypes::{Field, Fields, DataType, Schema, SchemaRef};
 /// # use object_store::ObjectStore;
-/// # use datafusion_common::Statistics;
+/// # use datafusion_common::Result;
+/// # use datafusion_common::tree_node::TreeNodeRecursion;
 /// # use datafusion_datasource::file::FileSource;
+/// # use datafusion_physical_plan::PhysicalExpr;
 /// # use datafusion_datasource::file_groups::FileGroup;
 /// # use datafusion_datasource::PartitionedFile;
 /// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
 /// # use datafusion_datasource::file_stream::FileOpener;
 /// # use datafusion_datasource::source::DataSourceExec;
+/// # use datafusion_datasource::table_schema::TableSchema;
 /// # use datafusion_execution::object_store::ObjectStoreUrl;
+/// # use datafusion_physical_expr::projection::ProjectionExprs;
 /// # use datafusion_physical_plan::ExecutionPlan;
 /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-/// # use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
 /// # let file_schema = Arc::new(Schema::new(vec![
 /// #  Field::new("c1", DataType::Int32, false),
 /// #  Field::new("c2", DataType::Int32, false),
@@ -95,31 +101,29 @@ use log::{debug, warn};
 /// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate
 /// #[derive(Clone)]
 /// # struct ParquetSource {
-/// #    projected_statistics: Option<Statistics>,
-/// #    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>
+/// #    table_schema: TableSchema,
 /// # };
 /// # impl FileSource for ParquetSource {
-/// #  fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Arc<dyn FileOpener> { unimplemented!() }
+/// #  fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Result<Arc<dyn FileOpener>> { unimplemented!() }
 /// #  fn as_any(&self) -> &dyn Any { self  }
+/// #  fn table_schema(&self) -> &TableSchema { &self.table_schema }
 /// #  fn with_batch_size(&self, _: usize) -> Arc<dyn FileSource> { unimplemented!() }
-/// #  fn with_schema(&self, _: SchemaRef) -> Arc<dyn FileSource> { Arc::new(self.clone()) as Arc<dyn FileSource> }
-/// #  fn with_projection(&self, _: &FileScanConfig) -> Arc<dyn FileSource> { unimplemented!() }
-/// #  fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) }
 /// #  fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() }
-/// #  fn statistics(&self) -> datafusion_common::Result<Statistics> { Ok(self.projected_statistics.clone().expect("projected_statistics should be set")) }
 /// #  fn file_type(&self) -> &str { "parquet" }
-/// #  fn with_schema_adapter_factory(&self, factory: Arc<dyn SchemaAdapterFactory>) -> Arc<dyn FileSource> { Arc::new(Self {projected_statistics: self.projected_statistics.clone(), schema_adapter_factory: Some(factory)} ) }
-/// #  fn schema_adapter_factory(&self) -> Option<Arc<dyn SchemaAdapterFactory>> { self.schema_adapter_factory.clone() }
+/// #  // Note that this implementation drops the projection on the floor, it is not complete!
+/// #  fn try_pushdown_projection(&self, projection: &ProjectionExprs) -> Result<Option<Arc<dyn FileSource>>> { Ok(Some(Arc::new(self.clone()) as Arc<dyn FileSource>)) }
+/// #  fn apply_expressions(&self, _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>) -> Result<TreeNodeRecursion> { Ok(TreeNodeRecursion::Continue) }
 /// #  }
 /// # impl ParquetSource {
-/// #  fn new() -> Self { Self {projected_statistics: None, schema_adapter_factory: None} }
+/// #  fn new(table_schema: impl Into<TableSchema>) -> Self { Self {table_schema: table_schema.into()} }
 /// # }
 /// // create FileScan config for reading parquet files from file://
 /// let object_store_url = ObjectStoreUrl::local_filesystem();
-/// let file_source = Arc::new(ParquetSource::new());
-/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
+/// let file_source = Arc::new(ParquetSource::new(file_schema.clone()));
+/// let config = FileScanConfigBuilder::new(object_store_url, file_source)
 ///   .with_limit(Some(1000))            // read only the first 1000 records
-///   .with_projection(Some(vec![2, 3])) // project columns 2 and 3
+///   .with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3
+///   .expect("Failed to push down projection")
 ///    // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
 ///   .with_file(PartitionedFile::new("file1.parquet", 1234))
 ///   // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes
@@ -131,6 +135,9 @@ use log::{debug, warn};
 /// // create an execution plan from the config
 /// let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
 /// ```
+///
+/// [`DataSourceExec`]: crate::source::DataSourceExec
+/// [`DataSourceExec::from_data_source`]: crate::source::DataSourceExec::from_data_source
 #[derive(Clone)]
 pub struct FileScanConfig {
     /// Object store URL, used to get an [`ObjectStore`] instance from
@@ -145,13 +152,6 @@ pub struct FileScanConfig {
     /// [`RuntimeEnv::register_object_store`]: datafusion_execution::runtime_env::RuntimeEnv::register_object_store
     /// [`RuntimeEnv::object_store`]: datafusion_execution::runtime_env::RuntimeEnv::object_store
     pub object_store_url: ObjectStoreUrl,
-    /// Schema before `projection` is applied. It contains the all columns that may
-    /// appear in the files. It does not include table partition columns
-    /// that may be added.
-    /// Note that this is **not** the schema of the physical files.
-    /// This is the schema that the physical file schema will be
-    /// mapped onto, and the schema that the [`DataSourceExec`] will return.
-    pub file_schema: SchemaRef,
     /// List of files to be processed, grouped into partitions
     ///
     /// Each file must have a schema of `file_schema` or a subset. If
@@ -164,25 +164,50 @@ pub struct FileScanConfig {
     pub file_groups: Vec<FileGroup>,
     /// Table constraints
     pub constraints: Constraints,
-    /// Columns on which to project the data. Indexes that are higher than the
-    /// number of columns of `file_schema` refer to `table_partition_cols`.
-    pub projection: Option<Vec<usize>>,
     /// The maximum number of records to read from this plan. If `None`,
     /// all records after filtering are returned.
     pub limit: Option<usize>,
-    /// The partitioning columns
-    pub table_partition_cols: Vec<Field>,
-    /// All equivalent lexicographical orderings that describe the schema.
+    /// Whether the scan's limit is order sensitive
+    /// When `true`, files must be read in the exact order specified to produce
+    /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`,
+    /// DataFusion may reorder file processing for optimization without affecting correctness.
+    pub preserve_order: bool,
+    /// All equivalent lexicographical output orderings of this file scan, in terms of
+    /// [`FileSource::table_schema`]. See [`FileScanConfigBuilder::with_output_ordering`] for more
+    /// details.
+    ///
+    /// [`Self::eq_properties`] uses this information along with projection
+    /// and filtering information to compute the effective
+    /// [`EquivalenceProperties`]
     pub output_ordering: Vec<LexOrdering>,
     /// File compression type
     pub file_compression_type: FileCompressionType,
-    /// Are new lines in values supported for CSVOptions
-    pub new_lines_in_values: bool,
     /// File source such as `ParquetSource`, `CsvSource`, `JsonSource`, etc.
     pub file_source: Arc<dyn FileSource>,
     /// Batch size while creating new batches
     /// Defaults to [`datafusion_common::config::ExecutionOptions`] batch_size.
     pub batch_size: Option<usize>,
+    /// Expression adapter used to adapt filters and projections that are pushed down into the scan
+    /// from the logical schema to the physical schema of the file.
+    pub expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
+    /// Statistics for the entire table (file schema + partition columns).
+    /// See [`FileScanConfigBuilder::with_statistics`] for more details.
+    ///
+    /// The effective statistics are computed on-demand via
+    /// [`ProjectionExprs::project_statistics`].
+    ///
+    /// Note that this field is pub(crate) because accessing it directly from outside
+    /// would be incorrect if there are filters being applied, thus this should be accessed
+    /// via [`FileScanConfig::statistics`].
+    pub(crate) statistics: Statistics,
+    /// When true, file_groups are organized by partition column values
+    /// and output_partitioning will return Hash partitioning on partition columns.
+    /// This allows the optimizer to skip hash repartitioning for aggregates and joins
+    /// on partition columns.
+    ///
+    /// If the number of file partitions > target_partitions, the file partitions will be grouped
+    /// in a round-robin fashion such that number of file partitions = target_partitions.
+    pub partitioned_by_file_group: bool,
 }
 
 /// A builder for [`FileScanConfig`]'s.
@@ -196,6 +221,7 @@ pub struct FileScanConfig {
 /// # use datafusion_datasource::file_compression_type::FileCompressionType;
 /// # use datafusion_datasource::file_groups::FileGroup;
 /// # use datafusion_datasource::PartitionedFile;
+/// # use datafusion_datasource::table_schema::TableSchema;
 /// # use datafusion_execution::object_store::ObjectStoreUrl;
 /// # use datafusion_common::Statistics;
 /// # use datafusion_datasource::file::FileSource;
@@ -203,25 +229,29 @@ pub struct FileScanConfig {
 /// # fn main() {
 /// # fn with_source(file_source: Arc<dyn FileSource>) {
 ///     // Create a schema for our Parquet files
-///     let schema = Arc::new(Schema::new(vec![
+///     let file_schema = Arc::new(Schema::new(vec![
 ///         Field::new("id", DataType::Int32, false),
 ///         Field::new("value", DataType::Utf8, false),
 ///     ]));
 ///
+///     // Create partition columns
+///     let partition_cols = vec![
+///         Arc::new(Field::new("date", DataType::Utf8, false)),
+///     ];
+///
+///     // Create table schema with file schema and partition columns
+///     let table_schema = TableSchema::new(file_schema, partition_cols);
+///
 ///     // Create a builder for scanning Parquet files from a local filesystem
 ///     let config = FileScanConfigBuilder::new(
 ///         ObjectStoreUrl::local_filesystem(),
-///         schema,
 ///         file_source,
 ///     )
 ///     // Set a limit of 1000 rows
 ///     .with_limit(Some(1000))
 ///     // Project only the first column
-///     .with_projection(Some(vec![0]))
-///     // Add partition columns
-///     .with_table_partition_cols(vec![
-///         Field::new("date", DataType::Utf8, false),
-///     ])
+///     .with_projection_indices(Some(vec![0]))
+///     .expect("Failed to push down projection")
 ///     // Add a file group with two files
 ///     .with_file_group(FileGroup::new(vec![
 ///         PartitionedFile::new("data/date=2024-01-01/file1.parquet", 1024),
@@ -237,29 +267,17 @@ pub struct FileScanConfig {
 #[derive(Clone)]
 pub struct FileScanConfigBuilder {
     object_store_url: ObjectStoreUrl,
-    /// Table schema before any projections or partition columns are applied.
-    ///
-    /// This schema is used to read the files, but is **not** necessarily the
-    /// schema of the physical files. Rather this is the schema that the
-    /// physical file schema will be mapped onto, and the schema that the
-    /// [`DataSourceExec`] will return.
-    ///
-    /// This is usually the same as the table schema as specified by the `TableProvider` minus any partition columns.
-    ///
-    /// This probably would be better named `table_schema`
-    file_schema: SchemaRef,
     file_source: Arc<dyn FileSource>,
-
     limit: Option<usize>,
-    projection: Option<Vec<usize>>,
-    table_partition_cols: Vec<Field>,
+    preserve_order: bool,
     constraints: Option<Constraints>,
     file_groups: Vec<FileGroup>,
     statistics: Option<Statistics>,
     output_ordering: Vec<LexOrdering>,
     file_compression_type: Option<FileCompressionType>,
-    new_lines_in_values: Option<bool>,
     batch_size: Option<usize>,
+    expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
+    partitioned_by_file_group: bool,
 }
 
 impl FileScanConfigBuilder {
@@ -267,57 +285,117 @@ impl FileScanConfigBuilder {
     ///
     /// # Parameters:
     /// * `object_store_url`: See [`FileScanConfig::object_store_url`]
-    /// * `file_schema`: See [`FileScanConfig::file_schema`]
-    /// * `file_source`: See [`FileScanConfig::file_source`]
+    /// * `file_source`: See [`FileScanConfig::file_source`]. The file source must have
+    ///   a schema set via its constructor.
     pub fn new(
         object_store_url: ObjectStoreUrl,
-        file_schema: SchemaRef,
         file_source: Arc<dyn FileSource>,
     ) -> Self {
         Self {
             object_store_url,
-            file_schema,
             file_source,
             file_groups: vec![],
             statistics: None,
             output_ordering: vec![],
             file_compression_type: None,
-            new_lines_in_values: None,
             limit: None,
-            projection: None,
-            table_partition_cols: vec![],
+            preserve_order: false,
             constraints: None,
             batch_size: None,
+            expr_adapter_factory: None,
+            partitioned_by_file_group: false,
         }
     }
 
-    /// Set the maximum number of records to read from this plan. If `None`,
-    /// all records after filtering are returned.
+    /// Set the maximum number of records to read from this plan.
+    ///
+    /// If `None`, all records after filtering are returned.
     pub fn with_limit(mut self, limit: Option<usize>) -> Self {
         self.limit = limit;
         self
     }
 
+    /// Set whether the limit should be order-sensitive.
+    ///
+    /// When `true`, files must be read in the exact order specified to produce
+    /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`,
+    /// DataFusion may reorder file processing for optimization without
+    /// affecting correctness.
+    pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self {
+        self.preserve_order = order_sensitive;
+        self
+    }
+
     /// Set the file source for scanning files.
     ///
-    /// This method allows you to change the file source implementation (e.g. ParquetSource, CsvSource, etc.)
-    /// after the builder has been created.
+    /// This method allows you to change the file source implementation (e.g.
+    /// ParquetSource, CsvSource, etc.) after the builder has been created.
     pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
         self.file_source = file_source;
         self
     }
 
+    /// Return the table schema
+    pub fn table_schema(&self) -> &SchemaRef {
+        self.file_source.table_schema().table_schema()
+    }
+
     /// Set the columns on which to project the data. Indexes that are higher than the
     /// number of columns of `file_schema` refer to `table_partition_cols`.
-    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
-        self.projection = projection;
-        self
+    ///
+    /// # Deprecated
+    /// Use [`Self::with_projection_indices`] instead. This method will be removed in a future release.
+    #[deprecated(since = "51.0.0", note = "Use with_projection_indices instead")]
+    pub fn with_projection(self, indices: Option<Vec<usize>>) -> Self {
+        match self.clone().with_projection_indices(indices) {
+            Ok(builder) => builder,
+            Err(e) => {
+                warn!(
+                    "Failed to push down projection in FileScanConfigBuilder::with_projection: {e}"
+                );
+                self
+            }
+        }
     }
 
-    /// Set the partitioning columns
-    pub fn with_table_partition_cols(mut self, table_partition_cols: Vec<Field>) -> Self {
-        self.table_partition_cols = table_partition_cols;
-        self
+    /// Set the columns on which to project the data using column indices.
+    ///
+    /// This method attempts to push down the projection to the underlying file
+    /// source if supported. If the file source does not support projection
+    /// pushdown, an error is returned.
+    ///
+    /// Indexes that are higher than the number of columns of `file_schema`
+    /// refer to `table_partition_cols`.
+    pub fn with_projection_indices(
+        mut self,
+        indices: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        let projection_exprs = indices.map(|indices| {
+            ProjectionExprs::from_indices(
+                &indices,
+                self.file_source.table_schema().table_schema(),
+            )
+        });
+        let Some(projection_exprs) = projection_exprs else {
+            return Ok(self);
+        };
+        let new_source = self
+            .file_source
+            .try_pushdown_projection(&projection_exprs)
+            .map_err(|e| {
+                internal_datafusion_err!(
+                    "Failed to push down projection in FileScanConfigBuilder::build: {e}"
+                )
+            })?;
+        if let Some(new_source) = new_source {
+            self.file_source = new_source;
+        } else {
+            internal_err!(
+                "FileSource {} does not support projection pushdown",
+                self.file_source.file_type()
+            )?;
+        }
+        Ok(self)
     }
 
     /// Set the table constraints
@@ -326,8 +404,18 @@ impl FileScanConfigBuilder {
         self
     }
 
-    /// Set the estimated overall statistics of the files, taking `filters` into account.
-    /// Defaults to [`Statistics::new_unknown`].
+    /// Set the statistics of the files, including partition
+    /// columns. Defaults to [`Statistics::new_unknown`].
+    ///
+    /// These statistics are for the entire table (file schema + partition
+    /// columns) before any projection or filtering is applied. Projections are
+    /// applied when statistics are retrieved, and if a filter is present,
+    /// [`FileScanConfig::statistics`] will mark the statistics as inexact
+    /// (counts are not adjusted).
+    ///
+    /// Projections and filters may be applied by the file source, either by
+    /// [`Self::with_projection_indices`] or a preexisting
+    /// [`FileSource::projection`] or [`FileSource::filter`].
     pub fn with_statistics(mut self, statistics: Statistics) -> Self {
         self.statistics = Some(statistics);
         self
@@ -358,11 +446,18 @@ impl FileScanConfigBuilder {
     /// Add a file as a single group
     ///
     /// See [`Self::with_file_groups`] for more information.
-    pub fn with_file(self, file: PartitionedFile) -> Self {
-        self.with_file_group(FileGroup::new(vec![file]))
+    pub fn with_file(self, partitioned_file: PartitionedFile) -> Self {
+        self.with_file_group(FileGroup::new(vec![partitioned_file]))
     }
 
     /// Set the output ordering of the files
+    ///
+    /// The expressions are in terms of the entire table schema (file schema +
+    /// partition columns), before any projection or filtering from the file
+    /// scan is applied.
+    ///
+    /// This is used for optimization purposes, e.g. to determine if a file scan
+    /// can satisfy an `ORDER BY` without an additional sort.
     pub fn with_output_ordering(mut self, output_ordering: Vec<LexOrdering>) -> Self {
         self.output_ordering = output_ordering;
         self
@@ -377,67 +472,84 @@ impl FileScanConfigBuilder {
         self
     }
 
-    /// Set whether new lines in values are supported for CSVOptions
-    ///
-    /// Parsing newlines in quoted values may be affected by execution behaviour such as
-    /// parallel file scanning. Setting this to `true` ensures that newlines in values are
-    /// parsed successfully, which may reduce performance.
-    pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self {
-        self.new_lines_in_values = Some(new_lines_in_values);
-        self
-    }
-
     /// Set the batch_size property
     pub fn with_batch_size(mut self, batch_size: Option<usize>) -> Self {
         self.batch_size = batch_size;
         self
     }
 
+    /// Register an expression adapter used to adapt filters and projections that are pushed down into the scan
+    /// from the logical schema to the physical schema of the file.
+    /// This can include things like:
+    /// - Column ordering changes
+    /// - Handling of missing columns
+    /// - Rewriting expression to use pre-computed values or file format specific optimizations
+    pub fn with_expr_adapter(
+        mut self,
+        expr_adapter: Option<Arc<dyn PhysicalExprAdapterFactory>>,
+    ) -> Self {
+        self.expr_adapter_factory = expr_adapter;
+        self
+    }
+
+    /// Set whether file groups are organized by partition column values.
+    ///
+    /// When set to true, the output partitioning will be declared as Hash partitioning
+    /// on the partition columns.
+    pub fn with_partitioned_by_file_group(
+        mut self,
+        partitioned_by_file_group: bool,
+    ) -> Self {
+        self.partitioned_by_file_group = partitioned_by_file_group;
+        self
+    }
+
     /// Build the final [`FileScanConfig`] with all the configured settings.
     ///
     /// This method takes ownership of the builder and returns the constructed `FileScanConfig`.
     /// Any unset optional fields will use their default values.
+    ///
+    /// # Errors
+    /// Returns an error if projection pushdown fails or if schema operations fail.
     pub fn build(self) -> FileScanConfig {
         let Self {
             object_store_url,
-            file_schema,
             file_source,
             limit,
-            projection,
-            table_partition_cols,
+            preserve_order,
             constraints,
             file_groups,
             statistics,
             output_ordering,
             file_compression_type,
-            new_lines_in_values,
             batch_size,
+            expr_adapter_factory: expr_adapter,
+            partitioned_by_file_group,
         } = self;
 
         let constraints = constraints.unwrap_or_default();
-        let statistics =
-            statistics.unwrap_or_else(|| Statistics::new_unknown(&file_schema));
-
-        let file_source = file_source
-            .with_statistics(statistics.clone())
-            .with_schema(Arc::clone(&file_schema));
+        let statistics = statistics.unwrap_or_else(|| {
+            Statistics::new_unknown(file_source.table_schema().table_schema())
+        });
         let file_compression_type =
             file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);
-        let new_lines_in_values = new_lines_in_values.unwrap_or(false);
+
+        // If there is an output ordering, we should preserve it.
+        let preserve_order = preserve_order || !output_ordering.is_empty();
 
         FileScanConfig {
             object_store_url,
-            file_schema,
             file_source,
             limit,
-            projection,
-            table_partition_cols,
+            preserve_order,
             constraints,
             file_groups,
             output_ordering,
             file_compression_type,
-            new_lines_in_values,
             batch_size,
+            expr_adapter_factory: expr_adapter,
+            statistics,
+            partitioned_by_file_group,
         }
     }
 }
@@ -446,18 +558,17 @@ impl From<FileScanConfig> for FileScanConfigBuilder {
     fn from(config: FileScanConfig) -> Self {
         Self {
             object_store_url: config.object_store_url,
-            file_schema: config.file_schema,
             file_source: Arc::<dyn FileSource>::clone(&config.file_source),
             file_groups: config.file_groups,
-            statistics: config.file_source.statistics().ok(),
+            statistics: Some(config.statistics),
             output_ordering: config.output_ordering,
             file_compression_type: Some(config.file_compression_type),
-            new_lines_in_values: Some(config.new_lines_in_values),
             limit: config.limit,
-            projection: config.projection,
-            table_partition_cols: config.table_partition_cols,
+            preserve_order: config.preserve_order,
             constraints: Some(config.constraints),
             batch_size: config.batch_size,
+            expr_adapter_factory: config.expr_adapter_factory,
+            partitioned_by_file_group: config.partitioned_by_file_group,
         }
     }
 }
@@ -473,15 +584,18 @@ impl DataSource for FileScanConfig {
             .batch_size
             .unwrap_or_else(|| context.session_config().batch_size());
 
-        let source = self
-            .file_source
-            .with_batch_size(batch_size)
-            .with_projection(self);
+        let source = self.file_source.with_batch_size(batch_size);
 
-        let opener = source.create_file_opener(object_store, self, partition);
+        let morselizer = source.create_morselizer(object_store, self, partition)?;
 
-        let stream = FileStream::new(self, partition, opener, source.metrics())?;
-        Ok(Box::pin(stream))
+        let stream = FileStreamBuilder::new_with_morselizer(
+            self,
+            partition,
+            morselizer,
+            source.metrics(),
+        )
+        .build()?;
+        Ok(Box::pin(cooperative(stream)))
     }
 
     fn as_any(&self) -> &dyn Any {
@@ -491,14 +605,40 @@ impl DataSource for FileScanConfig {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                let schema = self.projected_schema();
+                let schema = self.projected_schema().map_err(|_| std::fmt::Error {})?;
                 let orderings = get_projected_output_ordering(self, &schema);
 
                 write!(f, "file_groups=")?;
                 FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?;
 
                 if !schema.fields().is_empty() {
-                    write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?;
+                    if let Some(projection) = self.file_source.projection() {
+                        // This matches what ProjectionExec does.
+                        // TODO: can we put this into ProjectionExprs so that it's shared code?
+                        let expr: Vec<String> = projection
+                            .as_ref()
+                            .iter()
+                            .map(|proj_expr| {
+                                if let Some(column) =
+                                    proj_expr.expr.as_any().downcast_ref::<Column>()
+                                {
+                                    if column.name() == proj_expr.alias {
+                                        column.name().to_string()
+                                    } else {
+                                        format!(
+                                            "{} as {}",
+                                            proj_expr.expr, proj_expr.alias
+                                        )
+                                    }
+                                } else {
+                                    format!("{} as {}", proj_expr.expr, proj_expr.alias)
+                                }
+                            })
+                            .collect();
+                        write!(f, ", projection=[{}]", expr.join(", "))?;
+                    } else {
+                        write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?;
+                    }
                 }
 
                 if let Some(limit) = self.limit {
@@ -530,6 +670,13 @@ impl DataSource for FileScanConfig {
         repartition_file_min_size: usize,
         output_ordering: Option<LexOrdering>,
     ) -> Result<Option<Arc<dyn DataSource>>> {
+        // When files are grouped by partition values, we cannot allow byte-range
+        // splitting. It would mix rows from different partition values across
+        // file groups, breaking the Hash partitioning.
+        if self.partitioned_by_file_group {
+            return Ok(None);
+        }
+
         let source = self.file_source.repartitioned(
             target_partitions,
             repartition_file_min_size,
@@ -540,18 +687,140 @@ impl DataSource for FileScanConfig {
         Ok(source.map(|s| Arc::new(s) as _))
     }
 
+    /// Returns the output partitioning for this file scan.
+    ///
+    /// When `partitioned_by_file_group` is true, this returns `Partitioning::Hash` on
+    /// the Hive partition columns, allowing the optimizer to skip hash repartitioning
+    /// for aggregates and joins on those columns.
+    ///
+    /// Tradeoffs
+    /// - Benefit: Eliminates `RepartitionExec` and `SortExec` for queries with
+    ///   `GROUP BY` or `ORDER BY` on partition columns.
+    /// - Cost: Files are grouped by partition values rather than split by byte
+    ///   ranges, which may reduce I/O parallelism when partition sizes are uneven.
+    ///   For simple aggregations without `ORDER BY`, this cost may outweigh the benefit.
+    ///
+    /// Follow-up Work
+    /// - Idea: Could allow byte-range splitting within partition-aware groups,
+    ///   preserving I/O parallelism while maintaining partition semantics.
     fn output_partitioning(&self) -> Partitioning {
+        if self.partitioned_by_file_group {
+            let partition_cols = self.table_partition_cols();
+            if !partition_cols.is_empty() {
+                let projected_schema = match self.projected_schema() {
+                    Ok(schema) => schema,
+                    Err(_) => {
+                        debug!(
+                            "Could not get projected schema, falling back to UnknownPartitioning."
+                        );
+                        return Partitioning::UnknownPartitioning(self.file_groups.len());
+                    }
+                };
+
+                // Build Column expressions for partition columns based on their
+                // position in the projected schema
+                let mut exprs: Vec<Arc<dyn PhysicalExpr>> = Vec::new();
+                for partition_col in partition_cols {
+                    if let Some((idx, _)) = projected_schema
+                        .fields()
+                        .iter()
+                        .enumerate()
+                        .find(|(_, f)| f.name() == partition_col.name())
+                    {
+                        exprs.push(Arc::new(Column::new(partition_col.name(), idx)));
+                    }
+                }
+
+                if exprs.len() == partition_cols.len() {
+                    return Partitioning::Hash(exprs, self.file_groups.len());
+                }
+            }
+        }
         Partitioning::UnknownPartitioning(self.file_groups.len())
     }
 
+    /// Computes the effective equivalence properties of this file scan, taking
+    /// into account the file schema, any projections or filters applied by the
+    /// file source, and the output ordering.
     fn eq_properties(&self) -> EquivalenceProperties {
-        let (schema, constraints, _, orderings) = self.project();
-        EquivalenceProperties::new_with_orderings(schema, orderings.as_slice())
-            .with_constraints(constraints)
+        let schema = self.file_source.table_schema().table_schema();
+        let mut eq_properties = EquivalenceProperties::new_with_orderings(
+            Arc::clone(schema),
+            self.validated_output_ordering(),
+        )
+        .with_constraints(self.constraints.clone());
+
+        if let Some(filter) = self.file_source.filter() {
+            // We need to remap column indexes to match the projected schema since that's what the equivalence properties deal with.
+            // Note that this will *ignore* any non-projected columns: these don't factor into ordering / equivalence.
+            match Self::add_filter_equivalence_info(&filter, &mut eq_properties, schema) {
+                Ok(()) => {}
+                Err(e) => {
+                    warn!("Failed to add filter equivalence info: {e}");
+                    #[cfg(debug_assertions)]
+                    panic!("Failed to add filter equivalence info: {e}");
+                }
+            }
+        }
+
+        if let Some(projection) = self.file_source.projection() {
+            match (
+                projection.project_schema(schema),
+                projection.projection_mapping(schema),
+            ) {
+                (Ok(output_schema), Ok(mapping)) => {
+                    eq_properties =
+                        eq_properties.project(&mapping, Arc::new(output_schema));
+                }
+                (Err(e), _) | (_, Err(e)) => {
+                    warn!("Failed to project equivalence properties: {e}");
+                    #[cfg(debug_assertions)]
+                    panic!("Failed to project equivalence properties: {e}");
+                }
+            }
+        }
+
+        eq_properties
+    }
+
+    fn scheduling_type(&self) -> SchedulingType {
+        SchedulingType::Cooperative
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.projected_stats())
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        if let Some(partition) = partition {
+            // Get statistics for a specific partition
+            // Note: FileGroup statistics include partition columns (computed from partition_values)
+            if let Some(file_group) = self.file_groups.get(partition)
+                && let Some(stat) = file_group.file_statistics(None)
+            {
+                // Project the statistics based on the projection
+                let output_schema = self.projected_schema()?;
+                return if let Some(projection) = self.file_source.projection() {
+                    Ok(Arc::new(
+                        projection.project_statistics(stat.clone(), &output_schema)?,
+                    ))
+                } else {
+                    Ok(Arc::new(stat.clone()))
+                };
+            }
+            // If no statistics available for this partition, return unknown
+            Ok(Arc::new(Statistics::new_unknown(
+                self.projected_schema()?.as_ref(),
+            )))
+        } else {
+            // Return aggregate statistics across all partitions
+            let statistics = self.statistics();
+            let projection = self.file_source.projection();
+            let output_schema = self.projected_schema()?;
+            if let Some(projection) = &projection {
+                Ok(Arc::new(
+                    projection.project_statistics(statistics.clone(), &output_schema)?,
+                ))
+            } else {
+                Ok(Arc::new(statistics))
+            }
+        }
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
@@ -571,39 +840,16 @@ impl DataSource for FileScanConfig {
 
     fn try_swapping_with_projection(
         &self,
-        projection: &ProjectionExec,
-    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
-        // This process can be moved into CsvExec, but it would be an overlap of their responsibility.
-
-        // Must be all column references, with no table partition columns (which can not be projected)
-        let partitioned_columns_in_proj = projection.expr().iter().any(|(expr, _)| {
-            expr.as_any()
-                .downcast_ref::<Column>()
-                .map(|expr| expr.index() >= self.file_schema.fields().len())
-                .unwrap_or(false)
-        });
-
-        // If there is any non-column or alias-carrier expression, Projection should not be removed.
-        let no_aliases = all_alias_free_columns(projection.expr());
-
-        Ok((no_aliases && !partitioned_columns_in_proj).then(|| {
-            let file_scan = self.clone();
-            let source = Arc::clone(&file_scan.file_source);
-            let new_projections = new_projections_for_columns(
-                projection,
-                &file_scan
-                    .projection
-                    .clone()
-                    .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect()),
-            );
-            DataSourceExec::from_data_source(
-                FileScanConfigBuilder::from(file_scan)
-                    // Assign projected statistics to source
-                    .with_projection(Some(new_projections))
-                    .with_source(source)
-                    .build(),
-            ) as _
-        }))
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn DataSource>>> {
+        match self.file_source.try_pushdown_projection(projection)? {
+            Some(new_source) => {
+                let mut new_file_scan_config = self.clone();
+                new_file_scan_config.file_source = new_source;
+                Ok(Some(Arc::new(new_file_scan_config) as Arc<dyn DataSource>))
+            }
+            None => Ok(None),
+        }
     }
 
     fn try_pushdown_filters(
@@ -611,15 +857,39 @@ impl DataSource for FileScanConfig {
         filters: Vec<Arc<dyn PhysicalExpr>>,
         config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>> {
-        let result = self.file_source.try_pushdown_filters(filters, config)?;
+        // Remap filter Column indices to match the table schema (file + partition columns).
+        // This is necessary because filters refer to the output schema of this `DataSource`
+        // (e.g., after projection pushdown has been applied) and need to be remapped to the table schema
+        // before being passed to the file source
+        //
+        // For example, consider a filter `c1_c2 > 5` being pushed down. If the
+        // `DataSource` has a projection `c1 + c2 as c1_c2`, the filter must be rewritten
+        // to refer to the table schema `c1 + c2 > 5`
+        let table_schema = self.file_source.table_schema().table_schema();
+        let filters_to_remap = if let Some(projection) = self.file_source.projection() {
+            filters
+                .into_iter()
+                .map(|filter| projection.unproject_expr(&filter))
+                .collect::<Result<Vec<_>>>()?
+        } else {
+            filters
+        };
+        // Now remap column indices to match the table schema.
+        let remapped_filters = filters_to_remap
+            .into_iter()
+            .map(|filter| reassign_expr_columns(filter, table_schema))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = self
+            .file_source
+            .try_pushdown_filters(remapped_filters, config)?;
         match result.updated_node {
             Some(new_file_source) => {
-                let file_scan_config = FileScanConfigBuilder::from(self.clone())
-                    .with_source(new_file_source)
-                    .build();
+                let mut new_file_scan_config = self.clone();
+                new_file_scan_config.file_source = new_file_source;
                 Ok(FilterPushdownPropagation {
                     filters: result.filters,
-                    updated_node: Some(Arc::new(file_scan_config) as _),
+                    updated_node: Some(Arc::new(new_file_scan_config) as _),
                 })
             }
             None => {
@@ -631,275 +901,182 @@ impl DataSource for FileScanConfig {
             }
         }
     }
-}
-
-impl FileScanConfig {
-    /// Create a new [`FileScanConfig`] with default settings for scanning files.
-    ///
-    /// See example on [`FileScanConfig`]
-    ///
-    /// No file groups are added by default. See [`Self::with_file`], [`Self::with_file_group`] and
-    /// [`Self::with_file_groups`].
-    ///
-    /// # Parameters:
-    /// * `object_store_url`: See [`Self::object_store_url`]
-    /// * `file_schema`: See [`Self::file_schema`]
-    #[allow(deprecated)] // `new` will be removed same time as `with_source`
-    pub fn new(
-        object_store_url: ObjectStoreUrl,
-        file_schema: SchemaRef,
-        file_source: Arc<dyn FileSource>,
-    ) -> Self {
-        let statistics = Statistics::new_unknown(&file_schema);
-        let file_source = file_source
-            .with_statistics(statistics.clone())
-            .with_schema(Arc::clone(&file_schema));
-        Self {
-            object_store_url,
-            file_schema,
-            file_groups: vec![],
-            constraints: Constraints::empty(),
-            projection: None,
-            limit: None,
-            table_partition_cols: vec![],
-            output_ordering: vec![],
-            file_compression_type: FileCompressionType::UNCOMPRESSED,
-            new_lines_in_values: false,
-            file_source: Arc::clone(&file_source),
-            batch_size: None,
-        }
-    }
-
-    /// Set the file source
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
-        self.file_source =
-            file_source.with_statistics(Statistics::new_unknown(&self.file_schema));
-        self
-    }
-
-    /// Set the table constraints of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
-        self.constraints = constraints;
-        self
-    }
 
-    /// Set the statistics of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_statistics(mut self, statistics: Statistics) -> Self {
-        self.file_source = self.file_source.with_statistics(statistics);
-        self
-    }
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>> {
+        // Delegate to FileSource to see if it can optimize for the requested ordering.
+        let pushdown_result = self
+            .file_source
+            .try_pushdown_sort(order, &self.eq_properties())?;
 
-    fn projection_indices(&self) -> Vec<usize> {
-        match &self.projection {
-            Some(proj) => proj.clone(),
-            None => (0..self.file_schema.fields().len()
-                + self.table_partition_cols.len())
-                .collect(),
+        match pushdown_result {
+            SortOrderPushdownResult::Exact { inner } => {
+                Ok(SortOrderPushdownResult::Exact {
+                    inner: self.rebuild_with_source(inner, true, order)?,
+                })
+            }
+            SortOrderPushdownResult::Inexact { inner } => {
+                Ok(SortOrderPushdownResult::Inexact {
+                    inner: self.rebuild_with_source(inner, false, order)?,
+                })
+            }
+            SortOrderPushdownResult::Unsupported => {
+                Ok(SortOrderPushdownResult::Unsupported)
+            }
         }
     }
 
-    pub fn projected_stats(&self) -> Statistics {
-        let statistics = self.file_source.statistics().unwrap();
-
-        let table_cols_stats = self
-            .projection_indices()
-            .into_iter()
-            .map(|idx| {
-                if idx < self.file_schema.fields().len() {
-                    statistics.column_statistics[idx].clone()
-                } else {
-                    // TODO provide accurate stat for partition column (#1186)
-                    ColumnStatistics::new_unknown()
-                }
-            })
-            .collect();
-
-        Statistics {
-            num_rows: statistics.num_rows,
-            // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
-            total_byte_size: statistics.total_byte_size,
-            column_statistics: table_cols_stats,
+    fn with_preserve_order(&self, preserve_order: bool) -> Option<Arc<dyn DataSource>> {
+        if self.preserve_order == preserve_order {
+            return Some(Arc::new(self.clone()));
         }
-    }
-
-    pub fn projected_schema(&self) -> Arc<Schema> {
-        let table_fields: Vec<_> = self
-            .projection_indices()
-            .into_iter()
-            .map(|idx| {
-                if idx < self.file_schema.fields().len() {
-                    self.file_schema.field(idx).clone()
-                } else {
-                    let partition_idx = idx - self.file_schema.fields().len();
-                    self.table_partition_cols[partition_idx].clone()
-                }
-            })
-            .collect();
-
-        Arc::new(Schema::new_with_metadata(
-            table_fields,
-            self.file_schema.metadata().clone(),
-        ))
-    }
-
-    pub fn projected_constraints(&self) -> Constraints {
-        let indexes = self.projection_indices();
 
-        self.constraints
-            .project(&indexes)
-            .unwrap_or_else(Constraints::empty)
-    }
-
-    /// Set the projection of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
-        self.projection = projection;
-        self
+        let new_config = FileScanConfig {
+            preserve_order,
+            ..self.clone()
+        };
+        Some(Arc::new(new_config))
     }
 
-    /// Set the limit of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
-        self.limit = limit;
-        self
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Delegate to the file source
+        self.file_source.apply_expressions(f)
     }
+}
 
-    /// Add a file as a single group
+impl FileScanConfig {
+    /// Returns only the output orderings that are validated against actual
+    /// file group statistics.
     ///
-    /// See [Self::file_groups] for more information.
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    #[allow(deprecated)]
-    pub fn with_file(self, file: PartitionedFile) -> Self {
-        self.with_file_group(FileGroup::new(vec![file]))
-    }
-
-    /// Add the file groups
+    /// For example, individual files may be ordered by `col1 ASC`,
+    /// but if we have files with these min/max statistics in a single partition / file group:
     ///
-    /// See [Self::file_groups] for more information.
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_file_groups(mut self, mut file_groups: Vec<FileGroup>) -> Self {
-        self.file_groups.append(&mut file_groups);
-        self
-    }
-
-    /// Add a new file group
+    /// - file1: min(col1) = 10, max(col1) = 20
+    /// - file2: min(col1) = 5, max(col1) = 15
     ///
-    /// See [Self::file_groups] for more information
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_file_group(mut self, file_group: FileGroup) -> Self {
-        self.file_groups.push(file_group);
-        self
-    }
-
-    /// Set the partitioning columns of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_table_partition_cols(mut self, table_partition_cols: Vec<Field>) -> Self {
-        self.table_partition_cols = table_partition_cols;
-        self
-    }
-
-    /// Set the output ordering of the files
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_output_ordering(mut self, output_ordering: Vec<LexOrdering>) -> Self {
-        self.output_ordering = output_ordering;
-        self
-    }
-
-    /// Set the file compression type
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_file_compression_type(
-        mut self,
-        file_compression_type: FileCompressionType,
-    ) -> Self {
-        self.file_compression_type = file_compression_type;
-        self
+    /// Because reading file1 followed by file2 would produce out-of-order output (there is overlap
+    /// in the ranges), we cannot retain `col1 ASC` as a valid output ordering.
+    ///
+    /// Similarly this would not be a valid order (non-overlapping ranges but not ordered):
+    ///
+    /// - file1: min(col1) = 20, max(col1) = 30
+    /// - file2: min(col1) = 10, max(col1) = 15
+    ///
+    /// On the other hand if we had:
+    ///
+    /// - file1: min(col1) = 5, max(col1) = 15
+    /// - file2: min(col1) = 16, max(col1) = 25
+    ///
+    /// Then we know that reading file1 followed by file2 will produce ordered output,
+    /// so `col1 ASC` would be retained.
+    ///
+    /// Note that we are checking for ordering *within* *each* file group / partition,
+    /// files in different partitions are read independently and do not affect each other's ordering.
+    /// Merging of the multiple partition streams into a single ordered stream is handled
+    /// upstream e.g. by `SortPreservingMergeExec`.
+    fn validated_output_ordering(&self) -> Vec<LexOrdering> {
+        let schema = self.file_source.table_schema().table_schema();
+        validate_orderings(&self.output_ordering, schema, &self.file_groups, None)
     }
 
-    /// Set the new_lines_in_values property
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self {
-        self.new_lines_in_values = new_lines_in_values;
-        self
+    /// Get the file schema (schema of the files without partition columns)
+    pub fn file_schema(&self) -> &SchemaRef {
+        self.file_source.table_schema().file_schema()
     }
 
-    /// Set the batch_size property
-    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
-    pub fn with_batch_size(mut self, batch_size: Option<usize>) -> Self {
-        self.batch_size = batch_size;
-        self
+    /// Get the table partition columns
+    pub fn table_partition_cols(&self) -> &Vec<FieldRef> {
+        self.file_source.table_schema().table_partition_cols()
     }
 
-    /// Specifies whether newlines in (quoted) values are supported.
+    /// Returns the unprojected table statistics, marking them as inexact if filters are present.
     ///
-    /// Parsing newlines in quoted values may be affected by execution behaviour such as
-    /// parallel file scanning. Setting this to `true` ensures that newlines in values are
-    /// parsed successfully, which may reduce performance.
-    ///
-    /// The default behaviour depends on the `datafusion.catalog.newlines_in_values` setting.
-    pub fn newlines_in_values(&self) -> bool {
-        self.new_lines_in_values
+    /// When filters are pushed down (including pruning predicates and bloom filters),
+    /// we can't guarantee the statistics are exact because we don't know how many
+    /// rows will be filtered out.
+    pub fn statistics(&self) -> Statistics {
+        if self.file_source.filter().is_some() {
+            self.statistics.clone().to_inexact()
+        } else {
+            self.statistics.clone()
+        }
     }
 
-    /// Project the schema, constraints, and the statistics on the given column indices
-    pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec<LexOrdering>) {
-        if self.projection.is_none() && self.table_partition_cols.is_empty() {
-            return (
-                Arc::clone(&self.file_schema),
-                self.constraints.clone(),
-                self.file_source.statistics().unwrap().clone(),
-                self.output_ordering.clone(),
-            );
+    pub fn projected_schema(&self) -> Result<Arc<Schema>> {
+        let schema = self.file_source.table_schema().table_schema();
+        match self.file_source.projection() {
+            Some(proj) => Ok(Arc::new(proj.project_schema(schema)?)),
+            None => Ok(Arc::clone(schema)),
         }
+    }
 
-        let schema = self.projected_schema();
-        let constraints = self.projected_constraints();
-        let stats = self.projected_stats();
+    fn add_filter_equivalence_info(
+        filter: &Arc<dyn PhysicalExpr>,
+        eq_properties: &mut EquivalenceProperties,
+        schema: &Schema,
+    ) -> Result<()> {
+        // Gather valid equality pairs from the filter expression
+        let equal_pairs = split_conjunction(filter).into_iter().filter_map(|expr| {
+            // Ignore any binary expressions that reference non-existent columns in the current schema
+            // (e.g. due to unnecessary projections being removed)
+            reassign_expr_columns(Arc::clone(expr), schema)
+                .ok()
+                .and_then(|expr| match expr.as_any().downcast_ref::<BinaryExpr>() {
+                    Some(expr) if expr.op() == &Operator::Eq => {
+                        Some((Arc::clone(expr.left()), Arc::clone(expr.right())))
+                    }
+                    _ => None,
+                })
+        });
 
-        let output_ordering = get_projected_output_ordering(self, &schema);
+        for (lhs, rhs) in equal_pairs {
+            eq_properties.add_equal_conditions(lhs, rhs)?
+        }
 
-        (schema, constraints, stats, output_ordering)
+        Ok(())
     }
 
-    pub fn projected_file_column_names(&self) -> Option<Vec<String>> {
-        self.projection.as_ref().map(|p| {
-            p.iter()
-                .filter(|col_idx| **col_idx < self.file_schema.fields().len())
-                .map(|col_idx| self.file_schema.field(*col_idx).name())
-                .cloned()
-                .collect()
-        })
+    /// Returns whether newlines in values are supported.
+    ///
+    /// This method always returns `false`. The actual newlines_in_values setting
+    /// has been moved to [`CsvSource`] and should be accessed via
+    /// [`CsvSource::csv_options()`] instead.
+    ///
+    /// [`CsvSource`]: https://docs.rs/datafusion/latest/datafusion/datasource/physical_plan/struct.CsvSource.html
+    /// [`CsvSource::csv_options()`]: https://docs.rs/datafusion/latest/datafusion/datasource/physical_plan/struct.CsvSource.html#method.csv_options
+    #[deprecated(
+        since = "52.0.0",
+        note = "newlines_in_values has moved to CsvSource. Access it via CsvSource::csv_options().newlines_in_values instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first."
+    )]
+    pub fn newlines_in_values(&self) -> bool {
+        false
     }
 
-    /// Projects only file schema, ignoring partition columns
-    pub fn projected_file_schema(&self) -> SchemaRef {
-        let fields = self.file_column_projection_indices().map(|indices| {
-            indices
-                .iter()
-                .map(|col_idx| self.file_schema.field(*col_idx))
-                .cloned()
-                .collect::<Vec<_>>()
-        });
-
-        fields.map_or_else(
-            || Arc::clone(&self.file_schema),
-            |f| {
-                Arc::new(Schema::new_with_metadata(
-                    f,
-                    self.file_schema.metadata.clone(),
-                ))
-            },
-        )
+    #[deprecated(
+        since = "52.0.0",
+        note = "This method is no longer used, use eq_properties instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first."
+    )]
+    pub fn projected_constraints(&self) -> Constraints {
+        let props = self.eq_properties();
+        props.constraints().clone()
     }
 
+    #[deprecated(
+        since = "52.0.0",
+        note = "This method is no longer used, use eq_properties instead. It will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first."
+    )]
     pub fn file_column_projection_indices(&self) -> Option<Vec<usize>> {
-        self.projection.as_ref().map(|p| {
-            p.iter()
-                .filter(|col_idx| **col_idx < self.file_schema.fields().len())
-                .copied()
-                .collect()
+        #[expect(deprecated)]
+        self.file_source.projection().as_ref().map(|p| {
+            p.ordered_column_indices()
+                .into_iter()
+                .filter(|&i| i < self.file_schema().fields().len())
+                .collect::<Vec<_>>()
         })
     }
 
@@ -931,8 +1108,8 @@ impl FileScanConfig {
         target_partitions: usize,
     ) -> Result<Vec<FileGroup>> {
         if target_partitions == 0 {
-            return Err(DataFusionError::Internal(
-                "target_partitions must be greater than 0".to_string(),
+            return Err(internal_datafusion_err!(
+                "target_partitions must be greater than 0"
             ));
         }
 
@@ -1061,12 +1238,6 @@ impl FileScanConfig {
             .collect())
     }
 
-    /// Returns a new [`DataSourceExec`] to scan the files specified by this config
-    #[deprecated(since = "47.0.0", note = "use DataSourceExec::new instead")]
-    pub fn build(self) -> Arc<DataSourceExec> {
-        DataSourceExec::from_data_source(self)
-    }
-
     /// Write the data_type based on file_source
     fn fmt_file_source(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult {
         write!(f, ", file_type={}", self.file_source.file_type())?;
@@ -1077,18 +1248,69 @@ impl FileScanConfig {
     pub fn file_source(&self) -> &Arc<dyn FileSource> {
         &self.file_source
     }
-}
-
-impl Debug for FileScanConfig {
-    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
-        write!(f, "FileScanConfig {{")?;
-        write!(f, "object_store_url={:?}, ", self.object_store_url)?;
-
-        write!(
-            f,
-            "statistics={:?}, ",
-            self.file_source.statistics().unwrap()
-        )?;
+
+    /// Helper: Rebuild FileScanConfig with new file source
+    fn rebuild_with_source(
+        &self,
+        new_file_source: Arc<dyn FileSource>,
+        is_exact: bool,
+        order: &[PhysicalSortExpr],
+    ) -> Result<Arc<dyn DataSource>> {
+        let mut new_config = self.clone();
+
+        // Reverse file order (within each group) if the caller is requesting a reversal of this
+        // scan's declared output ordering.
+        //
+        // Historically this function always reversed `file_groups` because it was only reached
+        // via `FileSource::try_reverse_output` (where a reversal was the only supported
+        // optimization).
+        //
+        // Now that `FileSource::try_pushdown_sort` is generic, we must not assume reversal: other
+        // optimizations may become possible (e.g. already-sorted data, statistics-based file
+        // reordering). Therefore we only reverse files when it is known to help satisfy the
+        // requested ordering.
+        let reverse_file_groups = if self.output_ordering.is_empty() {
+            false
+        } else if let Some(requested) = LexOrdering::new(order.iter().cloned()) {
+            let projected_schema = self.projected_schema()?;
+            let orderings = project_orderings(&self.output_ordering, &projected_schema);
+            orderings
+                .iter()
+                .any(|ordering| ordering.is_reverse(&requested))
+        } else {
+            false
+        };
+
+        if reverse_file_groups {
+            new_config.file_groups = new_config
+                .file_groups
+                .into_iter()
+                .map(|group| {
+                    let mut files = group.into_inner();
+                    files.reverse();
+                    files.into()
+                })
+                .collect();
+        }
+
+        new_config.file_source = new_file_source;
+
+        // Phase 1: Clear output_ordering for Inexact
+        // (we're only reversing row groups, not guaranteeing perfect ordering)
+        if !is_exact {
+            new_config.output_ordering = vec![];
+        }
+
+        Ok(Arc::new(new_config))
+    }
+}
+
+impl Debug for FileScanConfig {
+    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+        write!(f, "FileScanConfig {{")?;
+        write!(f, "object_store_url={:?}, ", self.object_store_url)?;
+
+        write!(f, "statistics={:?}, ", self.statistics())?;
 
         DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?;
         write!(f, "}}")
@@ -1097,7 +1319,7 @@ impl Debug for FileScanConfig {
 
 impl DisplayAs for FileScanConfig {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult {
-        let schema = self.projected_schema();
+        let schema = self.projected_schema().map_err(|_| std::fmt::Error {})?;
         let orderings = get_projected_output_ordering(self, &schema);
 
         write!(f, "file_groups=")?;
@@ -1121,252 +1343,64 @@ impl DisplayAs for FileScanConfig {
     }
 }
 
-/// A helper that projects partition columns into the file record batches.
-///
-/// One interesting trick is the usage of a cache for the key buffers of the partition column
-/// dictionaries. Indeed, the partition columns are constant, so the dictionaries that represent them
-/// have all their keys equal to 0. This enables us to re-use the same "all-zero" buffer across batches,
-/// which makes the space consumption of the partition columns O(batch_size) instead of O(record_count).
-pub struct PartitionColumnProjector {
-    /// An Arrow buffer initialized to zeros that represents the key array of all partition
-    /// columns (partition columns are materialized by dictionary arrays with only one
-    /// value in the dictionary, thus all the keys are equal to zero).
-    key_buffer_cache: ZeroBufferGenerators,
-    /// Mapping between the indexes in the list of partition columns and the target
-    /// schema. Sorted by index in the target schema so that we can iterate on it to
-    /// insert the partition columns in the target record batch.
-    projected_partition_indexes: Vec<(usize, usize)>,
-    /// The schema of the table once the projection was applied.
-    projected_schema: SchemaRef,
+/// Get the indices of columns in a projection if the projection is a simple
+/// list of columns.
+/// If there are any expressions other than columns, returns None.
+fn ordered_column_indices_from_projection(
+    projection: &ProjectionExprs,
+) -> Option<Vec<usize>> {
+    projection
+        .expr_iter()
+        .map(|e| {
+            let index = e.as_any().downcast_ref::<Column>()?.index();
+            Some(index)
+        })
+        .collect::<Option<Vec<usize>>>()
 }
 
-impl PartitionColumnProjector {
-    // Create a projector to insert the partitioning columns into batches read from files
-    // - `projected_schema`: the target schema with both file and partitioning columns
-    // - `table_partition_cols`: all the partitioning column names
-    pub fn new(projected_schema: SchemaRef, table_partition_cols: &[String]) -> Self {
-        let mut idx_map = HashMap::new();
-        for (partition_idx, partition_name) in table_partition_cols.iter().enumerate() {
-            if let Ok(schema_idx) = projected_schema.index_of(partition_name) {
-                idx_map.insert(partition_idx, schema_idx);
-            }
-        }
-
-        let mut projected_partition_indexes: Vec<_> = idx_map.into_iter().collect();
-        projected_partition_indexes.sort_by(|(_, a), (_, b)| a.cmp(b));
-
-        Self {
-            projected_partition_indexes,
-            key_buffer_cache: Default::default(),
-            projected_schema,
+/// Check whether a given ordering is valid for all file groups by verifying
+/// that files within each group are sorted according to their min/max statistics.
+///
+/// For single-file (or empty) groups, the ordering is trivially valid.
+/// For multi-file groups, we check that the min/max statistics for the sort
+/// columns are in order and non-overlapping (or touching at boundaries).
+///
+/// `projection` maps projected column indices back to table-schema indices
+/// when validating after projection; pass `None` when validating at
+/// table-schema level.
+fn is_ordering_valid_for_file_groups(
+    file_groups: &[FileGroup],
+    ordering: &LexOrdering,
+    schema: &SchemaRef,
+    projection: Option<&[usize]>,
+) -> bool {
+    file_groups.iter().all(|group| {
+        if group.len() <= 1 {
+            return true; // single-file groups are trivially sorted
         }
-    }
-
-    // Transform the batch read from the file by inserting the partitioning columns
-    // to the right positions as deduced from `projected_schema`
-    // - `file_batch`: batch read from the file, with internal projection applied
-    // - `partition_values`: the list of partition values, one for each partition column
-    pub fn project(
-        &mut self,
-        file_batch: RecordBatch,
-        partition_values: &[ScalarValue],
-    ) -> Result<RecordBatch> {
-        let expected_cols =
-            self.projected_schema.fields().len() - self.projected_partition_indexes.len();
-
-        if file_batch.columns().len() != expected_cols {
-            return exec_err!(
-                "Unexpected batch schema from file, expected {} cols but got {}",
-                expected_cols,
-                file_batch.columns().len()
-            );
+        match MinMaxStatistics::new_from_files(ordering, schema, projection, group.iter())
+        {
+            Ok(stats) => stats.is_sorted(),
+            Err(_) => false, // can't prove sorted → reject
         }
-
-        let mut cols = file_batch.columns().to_vec();
-        for &(pidx, sidx) in &self.projected_partition_indexes {
-            let p_value =
-                partition_values
-                    .get(pidx)
-                    .ok_or(DataFusionError::Execution(
-                        "Invalid partitioning found on disk".to_string(),
-                    ))?;
-
-            let mut partition_value = Cow::Borrowed(p_value);
-
-            // check if user forgot to dict-encode the partition value
-            let field = self.projected_schema.field(sidx);
-            let expected_data_type = field.data_type();
-            let actual_data_type = partition_value.data_type();
-            if let DataType::Dictionary(key_type, _) = expected_data_type {
-                if !matches!(actual_data_type, DataType::Dictionary(_, _)) {
-                    warn!("Partition value for column {} was not dictionary-encoded, applied auto-fix.", field.name());
-                    partition_value = Cow::Owned(ScalarValue::Dictionary(
-                        key_type.clone(),
-                        Box::new(partition_value.as_ref().clone()),
-                    ));
-                }
-            }
-
-            cols.insert(
-                sidx,
-                create_output_array(
-                    &mut self.key_buffer_cache,
-                    partition_value.as_ref(),
-                    file_batch.num_rows(),
-                )?,
-            )
-        }
-
-        RecordBatch::try_new_with_options(
-            Arc::clone(&self.projected_schema),
-            cols,
-            &RecordBatchOptions::new().with_row_count(Some(file_batch.num_rows())),
-        )
-        .map_err(Into::into)
-    }
-}
-
-#[derive(Debug, Default)]
-struct ZeroBufferGenerators {
-    gen_i8: ZeroBufferGenerator<i8>,
-    gen_i16: ZeroBufferGenerator<i16>,
-    gen_i32: ZeroBufferGenerator<i32>,
-    gen_i64: ZeroBufferGenerator<i64>,
-    gen_u8: ZeroBufferGenerator<u8>,
-    gen_u16: ZeroBufferGenerator<u16>,
-    gen_u32: ZeroBufferGenerator<u32>,
-    gen_u64: ZeroBufferGenerator<u64>,
+    })
 }
 
-/// Generate a arrow [`Buffer`] that contains zero values.
-#[derive(Debug, Default)]
-struct ZeroBufferGenerator<T>
-where
-    T: ArrowNativeType,
-{
-    cache: Option<Buffer>,
-    _t: PhantomData<T>,
-}
-
-impl<T> ZeroBufferGenerator<T>
-where
-    T: ArrowNativeType,
-{
-    const SIZE: usize = size_of::<T>();
-
-    fn get_buffer(&mut self, n_vals: usize) -> Buffer {
-        match &mut self.cache {
-            Some(buf) if buf.len() >= n_vals * Self::SIZE => {
-                buf.slice_with_length(0, n_vals * Self::SIZE)
-            }
-            _ => {
-                let mut key_buffer_builder = BufferBuilder::<T>::new(n_vals);
-                key_buffer_builder.advance(n_vals); // keys are all 0
-                self.cache.insert(key_buffer_builder.finish()).clone()
-            }
-        }
-    }
-}
-
-fn create_dict_array<T>(
-    buffer_gen: &mut ZeroBufferGenerator<T>,
-    dict_val: &ScalarValue,
-    len: usize,
-    data_type: DataType,
-) -> Result<ArrayRef>
-where
-    T: ArrowNativeType,
-{
-    let dict_vals = dict_val.to_array()?;
-
-    let sliced_key_buffer = buffer_gen.get_buffer(len);
-
-    // assemble pieces together
-    let mut builder = ArrayData::builder(data_type)
-        .len(len)
-        .add_buffer(sliced_key_buffer);
-    builder = builder.add_child_data(dict_vals.to_data());
-    Ok(Arc::new(DictionaryArray::<UInt16Type>::from(
-        builder.build().unwrap(),
-    )))
-}
-
-fn create_output_array(
-    key_buffer_cache: &mut ZeroBufferGenerators,
-    val: &ScalarValue,
-    len: usize,
-) -> Result<ArrayRef> {
-    if let ScalarValue::Dictionary(key_type, dict_val) = &val {
-        match key_type.as_ref() {
-            DataType::Int8 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_i8,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::Int16 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_i16,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::Int32 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_i32,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::Int64 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_i64,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::UInt8 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_u8,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::UInt16 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_u16,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::UInt32 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_u32,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            DataType::UInt64 => {
-                return create_dict_array(
-                    &mut key_buffer_cache.gen_u64,
-                    dict_val,
-                    len,
-                    val.data_type(),
-                );
-            }
-            _ => {}
-        }
-    }
-
-    val.to_array_of_size(len)
+/// Filters orderings to retain only those valid for all file groups,
+/// verified via min/max statistics.
+fn validate_orderings(
+    orderings: &[LexOrdering],
+    schema: &SchemaRef,
+    file_groups: &[FileGroup],
+    projection: Option<&[usize]>,
+) -> Vec<LexOrdering> {
+    orderings
+        .iter()
+        .filter(|ordering| {
+            is_ordering_valid_for_file_groups(file_groups, ordering, schema, projection)
+        })
+        .cloned()
+        .collect()
 }
 
 /// The various listing tables does not attempt to read all files
@@ -1383,25 +1417,25 @@ fn create_output_array(
 /// correctly sorted on `(A, B, C)`
 ///
 /// ```text
-///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓
-///  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐
-///┃   ┌───────────────┐     ┌──────────────┐ │   ┌──────────────┐ │   ┌─────────────┐   ┃
-///  │ │   1.parquet   │ │ │ │  2.parquet   │   │ │  3.parquet   │   │ │  4.parquet  │ │
-///┃   │ Sort: A, B, C │     │Sort: A, B, C │ │   │Sort: A, B, C │ │   │Sort: A, B, C│   ┃
-///  │ └───────────────┘ │ │ └──────────────┘   │ └──────────────┘   │ └─────────────┘ │
-///┃                                          │                    │                     ┃
-///  │                   │ │                    │                    │                 │
-///┃                                          │                    │                     ┃
-///  │                   │ │                    │                    │                 │
-///┃                                          │                    │                     ┃
-///  │                   │ │                    │                    │                 │
-///┃  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘  ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘  ─ ─ ─ ─ ─ ─ ─ ─ ─  ┃
-///     DataFusion           DataFusion           DataFusion           DataFusion
-///┃    Partition 1          Partition 2          Partition 3          Partition 4       ┃
-/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━
+/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓
+///   ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐
+/// ┃   ┌───────────────┐     ┌──────────────┐ │   ┌──────────────┐ │   ┌─────────────┐   ┃
+///   │ │   1.parquet   │ │ │ │  2.parquet   │   │ │  3.parquet   │   │ │  4.parquet  │ │
+/// ┃   │ Sort: A, B, C │     │Sort: A, B, C │ │   │Sort: A, B, C │ │   │Sort: A, B, C│   ┃
+///   │ └───────────────┘ │ │ └──────────────┘   │ └──────────────┘   │ └─────────────┘ │
+/// ┃                                          │                    │                     ┃
+///   │                   │ │                    │                    │                 │
+/// ┃                                          │                    │                     ┃
+///   │                   │ │                    │                    │                 │
+/// ┃                                          │                    │                     ┃
+///   │                   │ │                    │                    │                 │
+/// ┃  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘  ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘  ─ ─ ─ ─ ─ ─ ─ ─ ─  ┃
+///      DataFusion           DataFusion           DataFusion           DataFusion
+/// ┃    Partition 1          Partition 2          Partition 3          Partition 4       ┃
+///  ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━
 ///
 ///                                      DataSourceExec
-///```
+/// ```
 ///
 /// However, when more than 1 file is assigned to each partition, each
 /// partition is NOT correctly sorted on `(A, B, C)`. Once the second
@@ -1409,89 +1443,73 @@ fn create_output_array(
 /// the same sorted stream
 ///
 ///```text
-///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━
-///  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┃
-///┃   ┌───────────────┐     ┌──────────────┐ │
-///  │ │   1.parquet   │ │ │ │  2.parquet   │   ┃
-///┃   │ Sort: A, B, C │     │Sort: A, B, C │ │
-///  │ └───────────────┘ │ │ └──────────────┘   ┃
-///┃   ┌───────────────┐     ┌──────────────┐ │
-///  │ │   3.parquet   │ │ │ │  4.parquet   │   ┃
-///┃   │ Sort: A, B, C │     │Sort: A, B, C │ │
-///  │ └───────────────┘ │ │ └──────────────┘   ┃
-///┃                                          │
-///  │                   │ │                    ┃
-///┃  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
-///     DataFusion           DataFusion         ┃
-///┃    Partition 1          Partition 2
-/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛
+/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━
+///   ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─  ┃
+/// ┃   ┌───────────────┐     ┌──────────────┐ │
+///   │ │   1.parquet   │ │ │ │  2.parquet   │   ┃
+/// ┃   │ Sort: A, B, C │     │Sort: A, B, C │ │
+///   │ └───────────────┘ │ │ └──────────────┘   ┃
+/// ┃   ┌───────────────┐     ┌──────────────┐ │
+///   │ │   3.parquet   │ │ │ │  4.parquet   │   ┃
+/// ┃   │ Sort: A, B, C │     │Sort: A, B, C │ │
+///   │ └───────────────┘ │ │ └──────────────┘   ┃
+/// ┃                                          │
+///   │                   │ │                    ┃
+/// ┃  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
+///      DataFusion           DataFusion         ┃
+/// ┃    Partition 1          Partition 2
+///  ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛
 ///
 ///              DataSourceExec
-///```
+/// ```
 fn get_projected_output_ordering(
     base_config: &FileScanConfig,
     projected_schema: &SchemaRef,
 ) -> Vec<LexOrdering> {
-    let mut all_orderings = vec![];
-    for output_ordering in &base_config.output_ordering {
-        let mut new_ordering = LexOrdering::default();
-        for PhysicalSortExpr { expr, options } in output_ordering.iter() {
-            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
-                let name = col.name();
-                if let Some((idx, _)) = projected_schema.column_with_name(name) {
-                    // Compute the new sort expression (with correct index) after projection:
-                    new_ordering.push(PhysicalSortExpr {
-                        expr: Arc::new(Column::new(name, idx)),
-                        options: *options,
-                    });
-                    continue;
-                }
-            }
-            // Cannot find expression in the projected_schema, stop iterating
-            // since rest of the orderings are violated
-            break;
+    let projected_orderings =
+        project_orderings(&base_config.output_ordering, projected_schema);
+
+    let indices = base_config
+        .file_source
+        .projection()
+        .as_ref()
+        .map(|p| ordered_column_indices_from_projection(p));
+
+    match indices {
+        Some(Some(indices)) => {
+            // Simple column projection — validate with statistics
+            validate_orderings(
+                &projected_orderings,
+                projected_schema,
+                &base_config.file_groups,
+                Some(indices.as_slice()),
+            )
         }
-
-        // do not push empty entries
-        // otherwise we may have `Some(vec![])` at the output ordering.
-        if new_ordering.is_empty() {
-            continue;
+        None => {
+            // No projection — validate with statistics (no remapping needed)
+            validate_orderings(
+                &projected_orderings,
+                projected_schema,
+                &base_config.file_groups,
+                None,
+            )
         }
-
-        // Check if any file groups are not sorted
-        if base_config.file_groups.iter().any(|group| {
-            if group.len() <= 1 {
-                // File groups with <= 1 files are always sorted
-                return false;
+        Some(None) => {
+            // Complex projection (expressions, not simple columns) — can't
+            // determine column indices for statistics. Still valid if all
+            // file groups have at most one file.
+            if base_config.file_groups.iter().all(|g| g.len() <= 1) {
+                projected_orderings
+            } else {
+                debug!(
+                    "Skipping specified output orderings. \
+                     Some file groups couldn't be determined to be sorted: {:?}",
+                    base_config.file_groups
+                );
+                vec![]
             }
-
-            let statistics = match MinMaxStatistics::new_from_files(
-                &new_ordering,
-                projected_schema,
-                base_config.projection.as_deref(),
-                group.iter(),
-            ) {
-                Ok(statistics) => statistics,
-                Err(e) => {
-                    log::trace!("Error fetching statistics for file group: {e}");
-                    // we can't prove that it's ordered, so we have to reject it
-                    return true;
-                }
-            };
-
-            !statistics.is_sorted()
-        }) {
-            debug!(
-                "Skipping specified output ordering {:?}. \
-                Some file groups couldn't be determined to be sorted: {:?}",
-                base_config.output_ordering[0], base_config.file_groups
-            );
-            continue;
         }
-
-        all_orderings.push(new_ordering);
     }
-    all_orderings
 }
 
 /// Convert type to a type suitable for use as a `ListingTable`
@@ -1517,78 +1535,86 @@ pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
 
 #[cfg(test)]
 mod tests {
+    use std::collections::HashMap;
+
+    use super::*;
+    use crate::TableSchema;
+    use crate::test_util::col;
     use crate::{
         generate_test_files, test_util::MockSource, tests::aggr_test_schema,
         verify_sort_integrity,
     };
 
-    use super::*;
-    use arrow::{
-        array::{Int32Array, RecordBatch},
-        compute::SortOptions,
-    };
-
+    use arrow::datatypes::Field;
     use datafusion_common::stats::Precision;
-    use datafusion_common::{assert_batches_eq, DFSchema};
-    use datafusion_expr::{execution_props::ExecutionProps, SortExpr};
-    use datafusion_physical_expr::create_physical_expr;
-    use std::collections::HashMap;
-
-    fn create_physical_sort_expr(
-        e: &SortExpr,
-        input_dfschema: &DFSchema,
-        execution_props: &ExecutionProps,
-    ) -> Result<PhysicalSortExpr> {
-        let SortExpr {
-            expr,
-            asc,
-            nulls_first,
-        } = e;
-        Ok(PhysicalSortExpr {
-            expr: create_physical_expr(expr, input_dfschema, execution_props)?,
-            options: SortOptions {
-                descending: !asc,
-                nulls_first: *nulls_first,
-            },
-        })
+    use datafusion_common::{ColumnStatistics, internal_err};
+    use datafusion_expr::{Operator, SortExpr};
+    use datafusion_physical_expr::create_physical_sort_expr;
+    use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal};
+    use datafusion_physical_expr::projection::ProjectionExpr;
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
+    #[derive(Clone)]
+    struct InexactSortPushdownSource {
+        metrics: ExecutionPlanMetricsSet,
+        table_schema: TableSchema,
+    }
+
+    impl InexactSortPushdownSource {
+        fn new(table_schema: TableSchema) -> Self {
+            Self {
+                metrics: ExecutionPlanMetricsSet::new(),
+                table_schema,
+            }
+        }
     }
 
-    /// Returns the column names on the schema
-    pub fn columns(schema: &Schema) -> Vec<String> {
-        schema.fields().iter().map(|f| f.name().clone()).collect()
-    }
+    impl FileSource for InexactSortPushdownSource {
+        fn create_file_opener(
+            &self,
+            _object_store: Arc<dyn object_store::ObjectStore>,
+            _base_config: &FileScanConfig,
+            _partition: usize,
+        ) -> Result<Arc<dyn crate::file_stream::FileOpener>> {
+            unimplemented!()
+        }
 
-    #[test]
-    fn physical_plan_config_no_projection() {
-        let file_schema = aggr_test_schema();
-        let conf = config_for_projection(
-            Arc::clone(&file_schema),
-            None,
-            Statistics::new_unknown(&file_schema),
-            to_partition_cols(vec![(
-                "date".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            )]),
-        );
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
 
-        let (proj_schema, _, proj_statistics, _) = conf.project();
-        assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1);
-        assert_eq!(
-            proj_schema.field(file_schema.fields().len()).name(),
-            "date",
-            "partition columns are the last columns"
-        );
-        assert_eq!(
-            proj_statistics.column_statistics.len(),
-            file_schema.fields().len() + 1
-        );
-        // TODO implement tests for partition column statistics once implemented
+        fn table_schema(&self) -> &TableSchema {
+            &self.table_schema
+        }
+
+        fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
+            Arc::new(self.clone())
+        }
+
+        fn metrics(&self) -> &ExecutionPlanMetricsSet {
+            &self.metrics
+        }
+
+        fn file_type(&self) -> &str {
+            "mock"
+        }
 
-        let col_names = conf.projected_file_column_names();
-        assert_eq!(col_names, None);
+        fn try_pushdown_sort(
+            &self,
+            _order: &[PhysicalSortExpr],
+            _eq_properties: &EquivalenceProperties,
+        ) -> Result<SortOrderPushdownResult<Arc<dyn FileSource>>> {
+            Ok(SortOrderPushdownResult::Inexact {
+                inner: Arc::new(self.clone()) as Arc<dyn FileSource>,
+            })
+        }
 
-        let col_indices = conf.file_column_projection_indices();
-        assert_eq!(col_indices, None);
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
     }
 
     #[test]
@@ -1611,7 +1637,7 @@ mod tests {
         );
 
         // verify the proj_schema includes the last column and exactly the same the field it is defined
-        let proj_schema = conf.projected_schema();
+        let proj_schema = conf.projected_schema().unwrap();
         assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1);
         assert_eq!(
             *proj_schema.field(file_schema.fields().len()),
@@ -1620,300 +1646,38 @@ mod tests {
         );
     }
 
-    #[test]
-    fn physical_plan_config_with_projection() {
-        let file_schema = aggr_test_schema();
-        let conf = config_for_projection(
-            Arc::clone(&file_schema),
-            Some(vec![file_schema.fields().len(), 0]),
-            Statistics {
-                num_rows: Precision::Inexact(10),
-                // assign the column index to distinct_count to help assert
-                // the source statistic after the projection
-                column_statistics: (0..file_schema.fields().len())
-                    .map(|i| ColumnStatistics {
-                        distinct_count: Precision::Inexact(i),
-                        ..Default::default()
-                    })
-                    .collect(),
-                total_byte_size: Precision::Absent,
-            },
-            to_partition_cols(vec![(
-                "date".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            )]),
-        );
-
-        let (proj_schema, _, proj_statistics, _) = conf.project();
-        assert_eq!(
-            columns(&proj_schema),
-            vec!["date".to_owned(), "c1".to_owned()]
-        );
-        let proj_stat_cols = proj_statistics.column_statistics;
-        assert_eq!(proj_stat_cols.len(), 2);
-        // TODO implement tests for proj_stat_cols[0] once partition column
-        // statistics are implemented
-        assert_eq!(proj_stat_cols[1].distinct_count, Precision::Inexact(0));
-
-        let col_names = conf.projected_file_column_names();
-        assert_eq!(col_names, Some(vec!["c1".to_owned()]));
-
-        let col_indices = conf.file_column_projection_indices();
-        assert_eq!(col_indices, Some(vec![0]));
-    }
-
-    #[test]
-    fn partition_column_projector() {
-        let file_batch = build_table_i32(
-            ("a", &vec![0, 1, 2]),
-            ("b", &vec![-2, -1, 0]),
-            ("c", &vec![10, 11, 12]),
-        );
-        let partition_cols = vec![
-            (
-                "year".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-            (
-                "month".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-            (
-                "day".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-        ];
-        // create a projected schema
-        let statistics = Statistics {
-            num_rows: Precision::Inexact(3),
-            total_byte_size: Precision::Absent,
-            column_statistics: Statistics::unknown_column(&file_batch.schema()),
-        };
-
-        let conf = config_for_projection(
-            file_batch.schema(),
-            // keep all cols from file and 2 from partitioning
-            Some(vec![
-                0,
-                1,
-                2,
-                file_batch.schema().fields().len(),
-                file_batch.schema().fields().len() + 2,
-            ]),
-            statistics.clone(),
-            to_partition_cols(partition_cols.clone()),
-        );
-
-        let source_statistics = conf.file_source.statistics().unwrap();
-        let conf_stats = conf.statistics().unwrap();
-
-        // projection should be reflected in the file source statistics
-        assert_eq!(conf_stats.num_rows, Precision::Inexact(3));
-
-        // 3 original statistics + 2 partition statistics
-        assert_eq!(conf_stats.column_statistics.len(), 5);
-
-        // file statics should not be modified
-        assert_eq!(source_statistics, statistics);
-        assert_eq!(source_statistics.column_statistics.len(), 3);
-
-        let proj_schema = conf.projected_schema();
-        // created a projector for that projected schema
-        let mut proj = PartitionColumnProjector::new(
-            proj_schema,
-            &partition_cols
-                .iter()
-                .map(|x| x.0.clone())
-                .collect::<Vec<_>>(),
-        );
-
-        // project first batch
-        let projected_batch = proj
-            .project(
-                // file_batch is ok here because we kept all the file cols in the projection
-                file_batch,
-                &[
-                    wrap_partition_value_in_dict(ScalarValue::from("2021")),
-                    wrap_partition_value_in_dict(ScalarValue::from("10")),
-                    wrap_partition_value_in_dict(ScalarValue::from("26")),
-                ],
-            )
-            .expect("Projection of partition columns into record batch failed");
-        let expected = [
-            "+---+----+----+------+-----+",
-            "| a | b  | c  | year | day |",
-            "+---+----+----+------+-----+",
-            "| 0 | -2 | 10 | 2021 | 26  |",
-            "| 1 | -1 | 11 | 2021 | 26  |",
-            "| 2 | 0  | 12 | 2021 | 26  |",
-            "+---+----+----+------+-----+",
-        ];
-        assert_batches_eq!(expected, &[projected_batch]);
-
-        // project another batch that is larger than the previous one
-        let file_batch = build_table_i32(
-            ("a", &vec![5, 6, 7, 8, 9]),
-            ("b", &vec![-10, -9, -8, -7, -6]),
-            ("c", &vec![12, 13, 14, 15, 16]),
-        );
-        let projected_batch = proj
-            .project(
-                // file_batch is ok here because we kept all the file cols in the projection
-                file_batch,
-                &[
-                    wrap_partition_value_in_dict(ScalarValue::from("2021")),
-                    wrap_partition_value_in_dict(ScalarValue::from("10")),
-                    wrap_partition_value_in_dict(ScalarValue::from("27")),
-                ],
-            )
-            .expect("Projection of partition columns into record batch failed");
-        let expected = [
-            "+---+-----+----+------+-----+",
-            "| a | b   | c  | year | day |",
-            "+---+-----+----+------+-----+",
-            "| 5 | -10 | 12 | 2021 | 27  |",
-            "| 6 | -9  | 13 | 2021 | 27  |",
-            "| 7 | -8  | 14 | 2021 | 27  |",
-            "| 8 | -7  | 15 | 2021 | 27  |",
-            "| 9 | -6  | 16 | 2021 | 27  |",
-            "+---+-----+----+------+-----+",
-        ];
-        assert_batches_eq!(expected, &[projected_batch]);
-
-        // project another batch that is smaller than the previous one
-        let file_batch = build_table_i32(
-            ("a", &vec![0, 1, 3]),
-            ("b", &vec![2, 3, 4]),
-            ("c", &vec![4, 5, 6]),
-        );
-        let projected_batch = proj
-            .project(
-                // file_batch is ok here because we kept all the file cols in the projection
-                file_batch,
-                &[
-                    wrap_partition_value_in_dict(ScalarValue::from("2021")),
-                    wrap_partition_value_in_dict(ScalarValue::from("10")),
-                    wrap_partition_value_in_dict(ScalarValue::from("28")),
-                ],
-            )
-            .expect("Projection of partition columns into record batch failed");
-        let expected = [
-            "+---+---+---+------+-----+",
-            "| a | b | c | year | day |",
-            "+---+---+---+------+-----+",
-            "| 0 | 2 | 4 | 2021 | 28  |",
-            "| 1 | 3 | 5 | 2021 | 28  |",
-            "| 3 | 4 | 6 | 2021 | 28  |",
-            "+---+---+---+------+-----+",
-        ];
-        assert_batches_eq!(expected, &[projected_batch]);
-
-        // forgot to dictionary-wrap the scalar value
-        let file_batch = build_table_i32(
-            ("a", &vec![0, 1, 2]),
-            ("b", &vec![-2, -1, 0]),
-            ("c", &vec![10, 11, 12]),
-        );
-        let projected_batch = proj
-            .project(
-                // file_batch is ok here because we kept all the file cols in the projection
-                file_batch,
-                &[
-                    ScalarValue::from("2021"),
-                    ScalarValue::from("10"),
-                    ScalarValue::from("26"),
-                ],
-            )
-            .expect("Projection of partition columns into record batch failed");
-        let expected = [
-            "+---+----+----+------+-----+",
-            "| a | b  | c  | year | day |",
-            "+---+----+----+------+-----+",
-            "| 0 | -2 | 10 | 2021 | 26  |",
-            "| 1 | -1 | 11 | 2021 | 26  |",
-            "| 2 | 0  | 12 | 2021 | 26  |",
-            "+---+----+----+------+-----+",
-        ];
-        assert_batches_eq!(expected, &[projected_batch]);
-    }
-
-    #[test]
-    fn test_projected_file_schema_with_partition_col() {
-        let schema = aggr_test_schema();
-        let partition_cols = vec![
-            (
-                "part1".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-            (
-                "part2".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-        ];
-
-        // Projected file schema for config with projection including partition column
-        let projection = config_for_projection(
-            schema.clone(),
-            Some(vec![0, 3, 5, schema.fields().len()]),
-            Statistics::new_unknown(&schema),
-            to_partition_cols(partition_cols),
-        )
-        .projected_file_schema();
-
-        // Assert partition column filtered out in projected file schema
-        let expected_columns = vec!["c1", "c4", "c6"];
-        let actual_columns = projection
-            .fields()
-            .iter()
-            .map(|f| f.name().clone())
-            .collect::<Vec<_>>();
-        assert_eq!(expected_columns, actual_columns);
-    }
-
-    #[test]
-    fn test_projected_file_schema_without_projection() {
-        let schema = aggr_test_schema();
-        let partition_cols = vec![
-            (
-                "part1".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-            (
-                "part2".to_owned(),
-                wrap_partition_type_in_dict(DataType::Utf8),
-            ),
-        ];
-
-        // Projected file schema for config without projection
-        let projection = config_for_projection(
-            schema.clone(),
-            None,
-            Statistics::new_unknown(&schema),
-            to_partition_cols(partition_cols),
-        )
-        .projected_file_schema();
-
-        // Assert projected file schema is equal to file schema
-        assert_eq!(projection.fields(), schema.fields());
-    }
-
     #[test]
     fn test_split_groups_by_statistics() -> Result<()> {
         use chrono::TimeZone;
         use datafusion_common::DFSchema;
         use datafusion_expr::execution_props::ExecutionProps;
-        use object_store::{path::Path, ObjectMeta};
+        use object_store::{ObjectMeta, path::Path};
 
         struct File {
             name: &'static str,
             date: &'static str,
-            statistics: Vec<Option<(f64, f64)>>,
+            statistics: Vec<Option<(Option<f64>, Option<f64>)>>,
         }
         impl File {
             fn new(
                 name: &'static str,
                 date: &'static str,
                 statistics: Vec<Option<(f64, f64)>>,
+            ) -> Self {
+                Self::new_nullable(
+                    name,
+                    date,
+                    statistics
+                        .into_iter()
+                        .map(|opt| opt.map(|(min, max)| (Some(min), Some(max))))
+                        .collect(),
+                )
+            }
+
+            fn new_nullable(
+                name: &'static str,
+                date: &'static str,
+                statistics: Vec<Option<(Option<f64>, Option<f64>)>>,
             ) -> Self {
                 Self {
                     name,
@@ -1980,21 +1744,43 @@ mod tests {
                 sort: vec![col("value").sort(false, true)],
                 expected_result: Ok(vec![vec!["1", "0"], vec!["2"]]),
             },
-            // reject nullable sort columns
             TestCase {
-                name: "no nullable sort columns",
+                name: "nullable sort columns, nulls last",
                 file_schema: Schema::new(vec![Field::new(
                     "value".to_string(),
                     DataType::Float64,
-                    true, // should fail because nullable
+                    true,
                 )]),
                 files: vec![
-                    File::new("0", "2023-01-01", vec![Some((0.00, 0.49))]),
-                    File::new("1", "2023-01-01", vec![Some((0.50, 1.00))]),
-                    File::new("2", "2023-01-02", vec![Some((0.00, 1.00))]),
+                    File::new_nullable(
+                        "0",
+                        "2023-01-01",
+                        vec![Some((Some(0.00), Some(0.49)))],
+                    ),
+                    File::new_nullable("1", "2023-01-01", vec![Some((Some(0.50), None))]),
+                    File::new_nullable("2", "2023-01-02", vec![Some((Some(0.00), None))]),
                 ],
                 sort: vec![col("value").sort(true, false)],
-                expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\nbuild min rows\ncaused by\ncreate sorting columns\ncaused by\nError during planning: cannot sort by nullable column")
+                expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]),
+            },
+            TestCase {
+                name: "nullable sort columns, nulls first",
+                file_schema: Schema::new(vec![Field::new(
+                    "value".to_string(),
+                    DataType::Float64,
+                    true,
+                )]),
+                files: vec![
+                    File::new_nullable("0", "2023-01-01", vec![Some((None, Some(0.49)))]),
+                    File::new_nullable(
+                        "1",
+                        "2023-01-01",
+                        vec![Some((Some(0.50), Some(1.00)))],
+                    ),
+                    File::new_nullable("2", "2023-01-02", vec![Some((None, Some(1.00)))]),
+                ],
+                sort: vec![col("value").sort(true, true)],
+                expected_result: Ok(vec![vec!["0", "1"], vec!["2"]]),
             },
             TestCase {
                 name: "all three non-overlapping",
@@ -2050,7 +1836,9 @@ mod tests {
                     File::new("2", "2023-01-02", vec![None]),
                 ],
                 sort: vec![col("value").sort(true, false)],
-                expected_result: Err("construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found"),
+                expected_result: Err(
+                    "construct min/max statistics for split_groups_by_statistics\ncaused by\ncollect min/max values\ncaused by\nget min/max for column: 'value'\ncaused by\nError during planning: statistics not found",
+                ),
             },
         ];
 
@@ -2068,25 +1856,27 @@ mod tests {
                     ))))
                     .collect::<Vec<_>>(),
             ));
-            let sort_order = LexOrdering::from(
+            let Some(sort_order) = LexOrdering::new(
                 case.sort
                     .into_iter()
                     .map(|expr| {
                         create_physical_sort_expr(
                             &expr,
-                            &DFSchema::try_from(table_schema.as_ref().clone())?,
+                            &DFSchema::try_from(Arc::clone(&table_schema))?,
                             &ExecutionProps::default(),
                         )
                     })
                     .collect::<Result<Vec<_>>>()?,
-            );
+            ) else {
+                return internal_err!("This test should always use an ordering");
+            };
 
             let partitioned_files = FileGroup::new(
                 case.files.into_iter().map(From::from).collect::<Vec<_>>(),
             );
             let result = FileScanConfig::split_groups_by_statistics(
                 &table_schema,
-                &[partitioned_files.clone()],
+                std::slice::from_ref(&partitioned_files),
                 &sort_order,
             );
             let results_by_name = result
@@ -2130,43 +1920,40 @@ mod tests {
 
         impl From<File> for PartitionedFile {
             fn from(file: File) -> Self {
-                PartitionedFile {
-                    object_meta: ObjectMeta {
-                        location: Path::from(format!(
-                            "data/date={}/{}.parquet",
-                            file.date, file.name
-                        )),
-                        last_modified: chrono::Utc.timestamp_nanos(0),
-                        size: 0,
-                        e_tag: None,
-                        version: None,
-                    },
-                    partition_values: vec![ScalarValue::from(file.date)],
-                    range: None,
-                    statistics: Some(Arc::new(Statistics {
-                        num_rows: Precision::Absent,
-                        total_byte_size: Precision::Absent,
-                        column_statistics: file
-                            .statistics
-                            .into_iter()
-                            .map(|stats| {
-                                stats
-                                    .map(|(min, max)| ColumnStatistics {
-                                        min_value: Precision::Exact(ScalarValue::from(
-                                            min,
-                                        )),
-                                        max_value: Precision::Exact(ScalarValue::from(
-                                            max,
-                                        )),
-                                        ..Default::default()
-                                    })
-                                    .unwrap_or_default()
-                            })
-                            .collect::<Vec<_>>(),
-                    })),
-                    extensions: None,
-                    metadata_size_hint: None,
-                }
+                let object_meta = ObjectMeta {
+                    location: Path::from(format!(
+                        "data/date={}/{}.parquet",
+                        file.date, file.name
+                    )),
+                    last_modified: chrono::Utc.timestamp_nanos(0),
+                    size: 0,
+                    e_tag: None,
+                    version: None,
+                };
+                let statistics = Arc::new(Statistics {
+                    num_rows: Precision::Absent,
+                    total_byte_size: Precision::Absent,
+                    column_statistics: file
+                        .statistics
+                        .into_iter()
+                        .map(|stats| {
+                            stats
+                                .map(|(min, max)| ColumnStatistics {
+                                    min_value: Precision::Exact(ScalarValue::Float64(
+                                        min,
+                                    )),
+                                    max_value: Precision::Exact(ScalarValue::Float64(
+                                        max,
+                                    )),
+                                    ..Default::default()
+                                })
+                                .unwrap_or_default()
+                        })
+                        .collect::<Vec<_>>(),
+                });
+                PartitionedFile::new_from_meta(object_meta)
+                    .with_partition_values(vec![ScalarValue::from(file.date)])
+                    .with_statistics(statistics)
             }
         }
     }
@@ -2178,88 +1965,76 @@ mod tests {
         statistics: Statistics,
         table_partition_cols: Vec<Field>,
     ) -> FileScanConfig {
+        let table_schema = TableSchema::new(
+            file_schema,
+            table_partition_cols.into_iter().map(Arc::new).collect(),
+        );
         FileScanConfigBuilder::new(
             ObjectStoreUrl::parse("test:///").unwrap(),
-            file_schema,
-            Arc::new(MockSource::default()),
+            Arc::new(MockSource::new(table_schema.clone())),
         )
-        .with_projection(projection)
+        .with_projection_indices(projection)
+        .unwrap()
         .with_statistics(statistics)
-        .with_table_partition_cols(table_partition_cols)
         .build()
     }
 
-    /// Convert partition columns from Vec<String DataType> to Vec<Field>
-    fn to_partition_cols(table_partition_cols: Vec<(String, DataType)>) -> Vec<Field> {
-        table_partition_cols
-            .iter()
-            .map(|(name, dtype)| Field::new(name, dtype.clone(), false))
-            .collect::<Vec<_>>()
-    }
-
-    /// returns record batch with 3 columns of i32 in memory
-    pub fn build_table_i32(
-        a: (&str, &Vec<i32>),
-        b: (&str, &Vec<i32>),
-        c: (&str, &Vec<i32>),
-    ) -> RecordBatch {
-        let schema = Schema::new(vec![
-            Field::new(a.0, DataType::Int32, false),
-            Field::new(b.0, DataType::Int32, false),
-            Field::new(c.0, DataType::Int32, false),
-        ]);
-
-        RecordBatch::try_new(
-            Arc::new(schema),
-            vec![
-                Arc::new(Int32Array::from(a.1.clone())),
-                Arc::new(Int32Array::from(b.1.clone())),
-                Arc::new(Int32Array::from(c.1.clone())),
-            ],
-        )
-        .unwrap()
-    }
-
     #[test]
     fn test_file_scan_config_builder() {
         let file_schema = aggr_test_schema();
         let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
-        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
+
+        let table_schema = TableSchema::new(
+            Arc::clone(&file_schema),
+            vec![Arc::new(Field::new(
+                "date",
+                wrap_partition_type_in_dict(DataType::Utf8),
+                false,
+            ))],
+        );
+
+        let file_source: Arc<dyn FileSource> =
+            Arc::new(MockSource::new(table_schema.clone()));
 
         // Create a builder with required parameters
         let builder = FileScanConfigBuilder::new(
             object_store_url.clone(),
-            Arc::clone(&file_schema),
             Arc::clone(&file_source),
         );
 
         // Build with various configurations
         let config = builder
             .with_limit(Some(1000))
-            .with_projection(Some(vec![0, 1]))
-            .with_table_partition_cols(vec![Field::new(
-                "date",
-                wrap_partition_type_in_dict(DataType::Utf8),
-                false,
-            )])
-            .with_constraints(Constraints::empty())
+            .with_projection_indices(Some(vec![0, 1]))
+            .unwrap()
             .with_statistics(Statistics::new_unknown(&file_schema))
             .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
                 "test.parquet".to_string(),
                 1024,
             )])])
-            .with_output_ordering(vec![LexOrdering::default()])
+            .with_output_ordering(vec![
+                [PhysicalSortExpr::new_default(Arc::new(Column::new(
+                    "date", 0,
+                )))]
+                .into(),
+            ])
             .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
-            .with_newlines_in_values(true)
             .build();
 
         // Verify the built config has all the expected values
         assert_eq!(config.object_store_url, object_store_url);
-        assert_eq!(config.file_schema, file_schema);
+        assert_eq!(*config.file_schema(), file_schema);
         assert_eq!(config.limit, Some(1000));
-        assert_eq!(config.projection, Some(vec![0, 1]));
-        assert_eq!(config.table_partition_cols.len(), 1);
-        assert_eq!(config.table_partition_cols[0].name(), "date");
+        assert_eq!(
+            config
+                .file_source
+                .projection()
+                .as_ref()
+                .map(|p| p.column_indices()),
+            Some(vec![0, 1])
+        );
+        assert_eq!(config.table_partition_cols().len(), 1);
+        assert_eq!(config.table_partition_cols()[0].name(), "date");
         assert_eq!(config.file_groups.len(), 1);
         assert_eq!(config.file_groups[0].len(), 1);
         assert_eq!(
@@ -2270,58 +2045,111 @@ mod tests {
             config.file_compression_type,
             FileCompressionType::UNCOMPRESSED
         );
-        assert!(config.new_lines_in_values);
         assert_eq!(config.output_ordering.len(), 1);
     }
 
+    #[test]
+    fn equivalence_properties_after_schema_change() {
+        let file_schema = aggr_test_schema();
+        let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
+
+        let table_schema = TableSchema::new(Arc::clone(&file_schema), vec![]);
+
+        // Create a file source with a filter
+        let file_source: Arc<dyn FileSource> = Arc::new(
+            MockSource::new(table_schema.clone()).with_filter(Arc::new(BinaryExpr::new(
+                col("c2", &file_schema).unwrap(),
+                Operator::Eq,
+                Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+            ))),
+        );
+
+        let config = FileScanConfigBuilder::new(
+            object_store_url.clone(),
+            Arc::clone(&file_source),
+        )
+        .with_projection_indices(Some(vec![0, 1, 2]))
+        .unwrap()
+        .build();
+
+        // Simulate projection being updated. Since the filter has already been pushed down,
+        // the new projection won't include the filtered column.
+        let exprs = ProjectionExprs::new(vec![ProjectionExpr::new(
+            col("c1", &file_schema).unwrap(),
+            "c1",
+        )]);
+        let data_source = config
+            .try_swapping_with_projection(&exprs)
+            .unwrap()
+            .unwrap();
+
+        // Gather the equivalence properties from the new data source. There should
+        // be no equivalence class for column c2 since it was removed by the projection.
+        let eq_properties = data_source.eq_properties();
+        let eq_group = eq_properties.eq_group();
+
+        for class in eq_group.iter() {
+            for expr in class.iter() {
+                if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+                    assert_ne!(
+                        col.name(),
+                        "c2",
+                        "c2 should not be present in any equivalence class"
+                    );
+                }
+            }
+        }
+    }
+
     #[test]
     fn test_file_scan_config_builder_defaults() {
         let file_schema = aggr_test_schema();
         let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
-        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
+
+        let table_schema = TableSchema::new(Arc::clone(&file_schema), vec![]);
+
+        let file_source: Arc<dyn FileSource> =
+            Arc::new(MockSource::new(table_schema.clone()));
 
         // Create a builder with only required parameters and build without any additional configurations
         let config = FileScanConfigBuilder::new(
             object_store_url.clone(),
-            Arc::clone(&file_schema),
             Arc::clone(&file_source),
         )
         .build();
 
         // Verify default values
         assert_eq!(config.object_store_url, object_store_url);
-        assert_eq!(config.file_schema, file_schema);
+        assert_eq!(*config.file_schema(), file_schema);
         assert_eq!(config.limit, None);
-        assert_eq!(config.projection, None);
-        assert!(config.table_partition_cols.is_empty());
+        // When no projection is specified, the file source should have an unprojected projection
+        // (i.e., all columns)
+        let expected_projection: Vec<usize> = (0..file_schema.fields().len()).collect();
+        assert_eq!(
+            config
+                .file_source
+                .projection()
+                .as_ref()
+                .map(|p| p.column_indices()),
+            Some(expected_projection)
+        );
+        assert!(config.table_partition_cols().is_empty());
         assert!(config.file_groups.is_empty());
         assert_eq!(
             config.file_compression_type,
             FileCompressionType::UNCOMPRESSED
         );
-        assert!(!config.new_lines_in_values);
         assert!(config.output_ordering.is_empty());
         assert!(config.constraints.is_empty());
 
         // Verify statistics are set to unknown
+        assert_eq!(config.statistics().num_rows, Precision::Absent);
+        assert_eq!(config.statistics().total_byte_size, Precision::Absent);
         assert_eq!(
-            config.file_source.statistics().unwrap().num_rows,
-            Precision::Absent
-        );
-        assert_eq!(
-            config.file_source.statistics().unwrap().total_byte_size,
-            Precision::Absent
-        );
-        assert_eq!(
-            config
-                .file_source
-                .statistics()
-                .unwrap()
-                .column_statistics
-                .len(),
+            config.statistics().column_statistics.len(),
             file_schema.fields().len()
         );
-        for stat in config.file_source.statistics().unwrap().column_statistics {
+        for stat in config.statistics().column_statistics {
             assert_eq!(stat.distinct_count, Precision::Absent);
             assert_eq!(stat.min_value, Precision::Absent);
             assert_eq!(stat.max_value, Precision::Absent);
@@ -2333,7 +2161,6 @@ mod tests {
     fn test_file_scan_config_builder_new_from() {
         let schema = aggr_test_schema();
         let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
-        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
         let partition_cols = vec![Field::new(
             "date",
             wrap_partition_type_in_dict(DataType::Utf8),
@@ -2341,18 +2168,24 @@ mod tests {
         )];
         let file = PartitionedFile::new("test_file.parquet", 100);
 
+        let table_schema = TableSchema::new(
+            Arc::clone(&schema),
+            partition_cols.iter().map(|f| Arc::new(f.clone())).collect(),
+        );
+
+        let file_source: Arc<dyn FileSource> =
+            Arc::new(MockSource::new(table_schema.clone()));
+
         // Create a config with non-default values
         let original_config = FileScanConfigBuilder::new(
             object_store_url.clone(),
-            Arc::clone(&schema),
             Arc::clone(&file_source),
         )
-        .with_projection(Some(vec![0, 2]))
+        .with_projection_indices(Some(vec![0, 2]))
+        .unwrap()
         .with_limit(Some(10))
-        .with_table_partition_cols(partition_cols.clone())
         .with_file(file.clone())
         .with_constraints(Constraints::default())
-        .with_newlines_in_values(true)
         .build();
 
         // Create a new builder from the config
@@ -2362,11 +2195,19 @@ mod tests {
         let new_config = new_builder.build();
 
         // Verify properties match
+        let partition_cols = partition_cols.into_iter().map(Arc::new).collect::<Vec<_>>();
         assert_eq!(new_config.object_store_url, object_store_url);
-        assert_eq!(new_config.file_schema, schema);
-        assert_eq!(new_config.projection, Some(vec![0, 2]));
+        assert_eq!(*new_config.file_schema(), schema);
+        assert_eq!(
+            new_config
+                .file_source
+                .projection()
+                .as_ref()
+                .map(|p| p.column_indices()),
+            Some(vec![0, 2])
+        );
         assert_eq!(new_config.limit, Some(10));
-        assert_eq!(new_config.table_partition_cols, partition_cols);
+        assert_eq!(*new_config.table_partition_cols(), partition_cols);
         assert_eq!(new_config.file_groups.len(), 1);
         assert_eq!(new_config.file_groups[0].len(), 1);
         assert_eq!(
@@ -2374,7 +2215,6 @@ mod tests {
             "test_file.parquet"
         );
         assert_eq!(new_config.constraints, Constraints::default());
-        assert!(new_config.new_lines_in_values);
     }
 
     #[test]
@@ -2391,14 +2231,12 @@ mod tests {
         // Setup sort expression
         let exec_props = ExecutionProps::new();
         let df_schema = DFSchema::try_from_qualified_schema("test", schema.as_ref())?;
-        let sort_expr = vec![col("value").sort(true, false)];
-
-        let physical_sort_exprs: Vec<_> = sort_expr
-            .iter()
-            .map(|expr| create_physical_sort_expr(expr, &df_schema, &exec_props).unwrap())
-            .collect();
-
-        let sort_ordering = LexOrdering::from(physical_sort_exprs);
+        let sort_expr = [col("value").sort(true, false)];
+        let sort_ordering = sort_expr
+            .map(|expr| {
+                create_physical_sort_expr(&expr, &df_schema, &exec_props).unwrap()
+            })
+            .into();
 
         // Test case parameters
         struct TestCase {
@@ -2532,4 +2370,248 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_partition_statistics_projection() {
+        // This test verifies that partition_statistics applies projection correctly.
+        // The old implementation had a bug where it returned file group statistics
+        // without applying the projection, returning all column statistics instead
+        // of just the projected ones.
+
+        use crate::source::DataSourceExec;
+        use datafusion_physical_plan::ExecutionPlan;
+
+        // Create a schema with 4 columns
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("col0", DataType::Int32, false),
+            Field::new("col1", DataType::Int32, false),
+            Field::new("col2", DataType::Int32, false),
+            Field::new("col3", DataType::Int32, false),
+        ]));
+
+        // Create statistics for all 4 columns
+        let file_group_stats = Statistics {
+            num_rows: Precision::Exact(100),
+            total_byte_size: Precision::Exact(1024),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    ..ColumnStatistics::new_unknown()
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(5),
+                    ..ColumnStatistics::new_unknown()
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(10),
+                    ..ColumnStatistics::new_unknown()
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(15),
+                    ..ColumnStatistics::new_unknown()
+                },
+            ],
+        };
+
+        // Create a file group with statistics
+        let file_group = FileGroup::new(vec![PartitionedFile::new("test.parquet", 1024)])
+            .with_statistics(Arc::new(file_group_stats));
+
+        let table_schema = TableSchema::new(Arc::clone(&schema), vec![]);
+
+        // Create a FileScanConfig with projection: only keep columns 0 and 2
+        let config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::parse("test:///").unwrap(),
+            Arc::new(MockSource::new(table_schema.clone())),
+        )
+        .with_projection_indices(Some(vec![0, 2]))
+        .unwrap() // Only project columns 0 and 2
+        .with_file_groups(vec![file_group])
+        .build();
+
+        // Create a DataSourceExec from the config
+        let exec = DataSourceExec::from_data_source(config);
+
+        // Get statistics for partition 0
+        let partition_stats = exec.partition_statistics(Some(0)).unwrap();
+
+        // Verify that only 2 columns are in the statistics (the projected ones)
+        assert_eq!(
+            partition_stats.column_statistics.len(),
+            2,
+            "Expected 2 column statistics (projected), but got {}",
+            partition_stats.column_statistics.len()
+        );
+
+        // Verify the column statistics are for columns 0 and 2
+        assert_eq!(
+            partition_stats.column_statistics[0].null_count,
+            Precision::Exact(0),
+            "First projected column should be col0 with 0 nulls"
+        );
+        assert_eq!(
+            partition_stats.column_statistics[1].null_count,
+            Precision::Exact(10),
+            "Second projected column should be col2 with 10 nulls"
+        );
+
+        // Verify row count and byte size
+        assert_eq!(partition_stats.num_rows, Precision::Exact(100));
+        assert_eq!(partition_stats.total_byte_size, Precision::Exact(800));
+    }
+
+    #[test]
+    fn test_output_partitioning_not_partitioned_by_file_group() {
+        let file_schema = aggr_test_schema();
+        let partition_col =
+            Field::new("date", wrap_partition_type_in_dict(DataType::Utf8), false);
+
+        let config = config_for_projection(
+            Arc::clone(&file_schema),
+            None,
+            Statistics::new_unknown(&file_schema),
+            vec![partition_col],
+        );
+
+        // partitioned_by_file_group defaults to false
+        let partitioning = config.output_partitioning();
+        assert!(matches!(partitioning, Partitioning::UnknownPartitioning(_)));
+    }
+
+    #[test]
+    fn test_output_partitioning_no_partition_columns() {
+        let file_schema = aggr_test_schema();
+        let mut config = config_for_projection(
+            Arc::clone(&file_schema),
+            None,
+            Statistics::new_unknown(&file_schema),
+            vec![], // No partition columns
+        );
+        config.partitioned_by_file_group = true;
+
+        let partitioning = config.output_partitioning();
+        assert!(matches!(partitioning, Partitioning::UnknownPartitioning(_)));
+    }
+
+    #[test]
+    fn test_output_partitioning_with_partition_columns() {
+        let file_schema = aggr_test_schema();
+
+        // Test single partition column
+        let single_partition_col = vec![Field::new(
+            "date",
+            wrap_partition_type_in_dict(DataType::Utf8),
+            false,
+        )];
+
+        let mut config = config_for_projection(
+            Arc::clone(&file_schema),
+            None,
+            Statistics::new_unknown(&file_schema),
+            single_partition_col,
+        );
+        config.partitioned_by_file_group = true;
+        config.file_groups = vec![
+            FileGroup::new(vec![PartitionedFile::new("f1.parquet".to_string(), 1024)]),
+            FileGroup::new(vec![PartitionedFile::new("f2.parquet".to_string(), 1024)]),
+            FileGroup::new(vec![PartitionedFile::new("f3.parquet".to_string(), 1024)]),
+        ];
+
+        let partitioning = config.output_partitioning();
+        match partitioning {
+            Partitioning::Hash(exprs, num_partitions) => {
+                assert_eq!(num_partitions, 3);
+                assert_eq!(exprs.len(), 1);
+                assert_eq!(
+                    exprs[0].as_any().downcast_ref::<Column>().unwrap().name(),
+                    "date"
+                );
+            }
+            _ => panic!("Expected Hash partitioning"),
+        }
+
+        // Test multiple partition columns
+        let multiple_partition_cols = vec![
+            Field::new("year", wrap_partition_type_in_dict(DataType::Utf8), false),
+            Field::new("month", wrap_partition_type_in_dict(DataType::Utf8), false),
+        ];
+
+        config = config_for_projection(
+            Arc::clone(&file_schema),
+            None,
+            Statistics::new_unknown(&file_schema),
+            multiple_partition_cols,
+        );
+        config.partitioned_by_file_group = true;
+        config.file_groups = vec![
+            FileGroup::new(vec![PartitionedFile::new("f1.parquet".to_string(), 1024)]),
+            FileGroup::new(vec![PartitionedFile::new("f2.parquet".to_string(), 1024)]),
+        ];
+
+        let partitioning = config.output_partitioning();
+        match partitioning {
+            Partitioning::Hash(exprs, num_partitions) => {
+                assert_eq!(num_partitions, 2);
+                assert_eq!(exprs.len(), 2);
+                let col_names: Vec<_> = exprs
+                    .iter()
+                    .map(|e| e.as_any().downcast_ref::<Column>().unwrap().name())
+                    .collect();
+                assert_eq!(col_names, vec!["year", "month"]);
+            }
+            _ => panic!("Expected Hash partitioning"),
+        }
+    }
+
+    #[test]
+    fn try_pushdown_sort_reverses_file_groups_only_when_requested_is_reverse()
+    -> Result<()> {
+        let file_schema =
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+
+        let table_schema = TableSchema::new(Arc::clone(&file_schema), vec![]);
+        let file_source = Arc::new(InexactSortPushdownSource::new(table_schema));
+
+        let file_groups = vec![FileGroup::new(vec![
+            PartitionedFile::new("file1", 1),
+            PartitionedFile::new("file2", 1),
+        ])];
+
+        let sort_expr_asc = PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)));
+        let config =
+            FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+                .with_file_groups(file_groups)
+                .with_output_ordering(vec![
+                    LexOrdering::new(vec![sort_expr_asc.clone()]).unwrap(),
+                ])
+                .build();
+
+        let requested_asc = vec![sort_expr_asc.clone()];
+        let result = config.try_pushdown_sort(&requested_asc)?;
+        let SortOrderPushdownResult::Inexact { inner } = result else {
+            panic!("Expected Inexact result");
+        };
+        let pushed_config = inner
+            .as_any()
+            .downcast_ref::<FileScanConfig>()
+            .expect("Expected FileScanConfig");
+        let pushed_files = pushed_config.file_groups[0].files();
+        assert_eq!(pushed_files[0].object_meta.location.as_ref(), "file1");
+        assert_eq!(pushed_files[1].object_meta.location.as_ref(), "file2");
+
+        let requested_desc = vec![sort_expr_asc.reverse()];
+        let result = config.try_pushdown_sort(&requested_desc)?;
+        let SortOrderPushdownResult::Inexact { inner } = result else {
+            panic!("Expected Inexact result");
+        };
+        let pushed_config = inner
+            .as_any()
+            .downcast_ref::<FileScanConfig>()
+            .expect("Expected FileScanConfig");
+        let pushed_files = pushed_config.file_groups[0].files();
+        assert_eq!(pushed_files[0].object_meta.location.as_ref(), "file2");
+        assert_eq!(pushed_files[1].object_meta.location.as_ref(), "file1");
+
+        Ok(())
+    }
 }
diff --git a/datafusion/datasource/src/file_sink_config.rs b/datafusion/datasource/src/file_sink_config.rs
index 2968bd1ee0449..1abce86a3565f 100644
--- a/datafusion/datasource/src/file_sink_config.rs
+++ b/datafusion/datasource/src/file_sink_config.rs
@@ -17,10 +17,10 @@
 
 use std::sync::Arc;
 
+use crate::ListingTableUrl;
 use crate::file_groups::FileGroup;
 use crate::sink::DataSink;
-use crate::write::demux::{start_demuxer_task, DemuxedStreamReceiver};
-use crate::ListingTableUrl;
+use crate::write::demux::{DemuxedStreamReceiver, start_demuxer_task};
 
 use arrow::datatypes::{DataType, SchemaRef};
 use datafusion_common::Result;
@@ -32,6 +32,52 @@ use datafusion_expr::dml::InsertOp;
 use async_trait::async_trait;
 use object_store::ObjectStore;
 
+/// Determines how `FileSink` output paths are interpreted.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum FileOutputMode {
+    /// Infer output mode from the output URL (for example, by extension / trailing `/`).
+    #[default]
+    Automatic,
+    /// Write to a single output file at the exact output path.
+    SingleFile,
+    /// Write to a directory under the output path with generated filenames.
+    Directory,
+}
+
+impl FileOutputMode {
+    /// Resolve this mode into a `single_file_output` boolean for the demuxer.
+    pub fn single_file_output(self, base_output_path: &ListingTableUrl) -> bool {
+        match self {
+            Self::Automatic => {
+                !base_output_path.is_collection()
+                    && base_output_path.file_extension().is_some()
+            }
+            Self::SingleFile => true,
+            Self::Directory => false,
+        }
+    }
+}
+
+impl From<Option<bool>> for FileOutputMode {
+    fn from(value: Option<bool>) -> Self {
+        match value {
+            None => Self::Automatic,
+            Some(true) => Self::SingleFile,
+            Some(false) => Self::Directory,
+        }
+    }
+}
+
+impl From<FileOutputMode> for Option<bool> {
+    fn from(value: FileOutputMode) -> Self {
+        match value {
+            FileOutputMode::Automatic => None,
+            FileOutputMode::SingleFile => Some(true),
+            FileOutputMode::Directory => Some(false),
+        }
+    }
+}
+
 /// General behaviors for files that do `DataSink` operations
 #[async_trait]
 pub trait FileSink: DataSink {
@@ -112,6 +158,8 @@ pub struct FileSinkConfig {
     pub keep_partition_by_columns: bool,
     /// File extension without a dot(.)
     pub file_extension: String,
+    /// Determines how the output path is interpreted.
+    pub file_output_mode: FileOutputMode,
 }
 
 impl FileSinkConfig {
diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs
deleted file mode 100644
index 1dc53bd6b9319..0000000000000
--- a/datafusion/datasource/src/file_stream.rs
+++ /dev/null
@@ -1,986 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! A generic stream over file format readers that can be used by
-//! any file format that read its files from start to end.
-//!
-//! Note: Most traits here need to be marked `Sync + Send` to be
-//! compliant with the `SendableRecordBatchStream` trait.
-
-use std::collections::VecDeque;
-use std::mem;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-use crate::file_meta::FileMeta;
-use crate::file_scan_config::{FileScanConfig, PartitionColumnProjector};
-use crate::PartitionedFile;
-use arrow::datatypes::SchemaRef;
-use datafusion_common::error::Result;
-use datafusion_execution::RecordBatchStream;
-use datafusion_physical_plan::metrics::{
-    BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, Time,
-};
-
-use arrow::error::ArrowError;
-use arrow::record_batch::RecordBatch;
-use datafusion_common::instant::Instant;
-use datafusion_common::ScalarValue;
-
-use futures::future::BoxFuture;
-use futures::stream::BoxStream;
-use futures::{ready, FutureExt as _, Stream, StreamExt as _};
-
-/// A stream that iterates record batch by record batch, file over file.
-pub struct FileStream {
-    /// An iterator over input files.
-    file_iter: VecDeque<PartitionedFile>,
-    /// The stream schema (file schema including partition columns and after
-    /// projection).
-    projected_schema: SchemaRef,
-    /// The remaining number of records to parse, None if no limit
-    remain: Option<usize>,
-    /// A dynamic [`FileOpener`]. Calling `open()` returns a [`FileOpenFuture`],
-    /// which can be resolved to a stream of `RecordBatch`.
-    file_opener: Arc<dyn FileOpener>,
-    /// The partition column projector
-    pc_projector: PartitionColumnProjector,
-    /// The stream state
-    state: FileStreamState,
-    /// File stream specific metrics
-    file_stream_metrics: FileStreamMetrics,
-    /// runtime baseline metrics
-    baseline_metrics: BaselineMetrics,
-    /// Describes the behavior of the `FileStream` if file opening or scanning fails
-    on_error: OnError,
-}
-
-impl FileStream {
-    /// Create a new `FileStream` using the give `FileOpener` to scan underlying files
-    pub fn new(
-        config: &FileScanConfig,
-        partition: usize,
-        file_opener: Arc<dyn FileOpener>,
-        metrics: &ExecutionPlanMetricsSet,
-    ) -> Result<Self> {
-        let projected_schema = config.projected_schema();
-        let pc_projector = PartitionColumnProjector::new(
-            Arc::clone(&projected_schema),
-            &config
-                .table_partition_cols
-                .iter()
-                .map(|x| x.name().clone())
-                .collect::<Vec<_>>(),
-        );
-
-        let file_group = config.file_groups[partition].clone();
-
-        Ok(Self {
-            file_iter: file_group.into_inner().into_iter().collect(),
-            projected_schema,
-            remain: config.limit,
-            file_opener,
-            pc_projector,
-            state: FileStreamState::Idle,
-            file_stream_metrics: FileStreamMetrics::new(metrics, partition),
-            baseline_metrics: BaselineMetrics::new(metrics, partition),
-            on_error: OnError::Fail,
-        })
-    }
-
-    /// Specify the behavior when an error occurs opening or scanning a file
-    ///
-    /// If `OnError::Skip` the stream will skip files which encounter an error and continue
-    /// If `OnError:Fail` (default) the stream will fail and stop processing when an error occurs
-    pub fn with_on_error(mut self, on_error: OnError) -> Self {
-        self.on_error = on_error;
-        self
-    }
-
-    /// Begin opening the next file in parallel while decoding the current file in FileStream.
-    ///
-    /// Since file opening is mostly IO (and may involve a
-    /// bunch of sequential IO), it can be parallelized with decoding.
-    fn start_next_file(&mut self) -> Option<Result<(FileOpenFuture, Vec<ScalarValue>)>> {
-        let part_file = self.file_iter.pop_front()?;
-
-        let file_meta = FileMeta {
-            object_meta: part_file.object_meta,
-            range: part_file.range,
-            extensions: part_file.extensions,
-            metadata_size_hint: part_file.metadata_size_hint,
-        };
-
-        Some(
-            self.file_opener
-                .open(file_meta)
-                .map(|future| (future, part_file.partition_values)),
-        )
-    }
-
-    fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll<Option<Result<RecordBatch>>> {
-        loop {
-            match &mut self.state {
-                FileStreamState::Idle => {
-                    self.file_stream_metrics.time_opening.start();
-
-                    match self.start_next_file().transpose() {
-                        Ok(Some((future, partition_values))) => {
-                            self.state = FileStreamState::Open {
-                                future,
-                                partition_values,
-                            }
-                        }
-                        Ok(None) => return Poll::Ready(None),
-                        Err(e) => {
-                            self.state = FileStreamState::Error;
-                            return Poll::Ready(Some(Err(e)));
-                        }
-                    }
-                }
-                FileStreamState::Open {
-                    future,
-                    partition_values,
-                } => match ready!(future.poll_unpin(cx)) {
-                    Ok(reader) => {
-                        let partition_values = mem::take(partition_values);
-
-                        // include time needed to start opening in `start_next_file`
-                        self.file_stream_metrics.time_opening.stop();
-                        let next = self.start_next_file().transpose();
-                        self.file_stream_metrics.time_scanning_until_data.start();
-                        self.file_stream_metrics.time_scanning_total.start();
-
-                        match next {
-                            Ok(Some((next_future, next_partition_values))) => {
-                                self.state = FileStreamState::Scan {
-                                    partition_values,
-                                    reader,
-                                    next: Some((
-                                        NextOpen::Pending(next_future),
-                                        next_partition_values,
-                                    )),
-                                };
-                            }
-                            Ok(None) => {
-                                self.state = FileStreamState::Scan {
-                                    reader,
-                                    partition_values,
-                                    next: None,
-                                };
-                            }
-                            Err(e) => {
-                                self.state = FileStreamState::Error;
-                                return Poll::Ready(Some(Err(e)));
-                            }
-                        }
-                    }
-                    Err(e) => {
-                        self.file_stream_metrics.file_open_errors.add(1);
-                        match self.on_error {
-                            OnError::Skip => {
-                                self.file_stream_metrics.time_opening.stop();
-                                self.state = FileStreamState::Idle
-                            }
-                            OnError::Fail => {
-                                self.state = FileStreamState::Error;
-                                return Poll::Ready(Some(Err(e)));
-                            }
-                        }
-                    }
-                },
-                FileStreamState::Scan {
-                    reader,
-                    partition_values,
-                    next,
-                } => {
-                    // We need to poll the next `FileOpenFuture` here to drive it forward
-                    if let Some((next_open_future, _)) = next {
-                        if let NextOpen::Pending(f) = next_open_future {
-                            if let Poll::Ready(reader) = f.as_mut().poll(cx) {
-                                *next_open_future = NextOpen::Ready(reader);
-                            }
-                        }
-                    }
-                    match ready!(reader.poll_next_unpin(cx)) {
-                        Some(Ok(batch)) => {
-                            self.file_stream_metrics.time_scanning_until_data.stop();
-                            self.file_stream_metrics.time_scanning_total.stop();
-                            let result = self
-                                .pc_projector
-                                .project(batch, partition_values)
-                                .map_err(|e| ArrowError::ExternalError(e.into()))
-                                .map(|batch| match &mut self.remain {
-                                    Some(remain) => {
-                                        if *remain > batch.num_rows() {
-                                            *remain -= batch.num_rows();
-                                            batch
-                                        } else {
-                                            let batch = batch.slice(0, *remain);
-                                            self.state = FileStreamState::Limit;
-                                            *remain = 0;
-                                            batch
-                                        }
-                                    }
-                                    None => batch,
-                                });
-
-                            if result.is_err() {
-                                // If the partition value projection fails, this is not governed by
-                                // the `OnError` behavior
-                                self.state = FileStreamState::Error
-                            }
-                            self.file_stream_metrics.time_scanning_total.start();
-                            return Poll::Ready(Some(result.map_err(Into::into)));
-                        }
-                        Some(Err(err)) => {
-                            self.file_stream_metrics.file_scan_errors.add(1);
-                            self.file_stream_metrics.time_scanning_until_data.stop();
-                            self.file_stream_metrics.time_scanning_total.stop();
-
-                            match self.on_error {
-                                // If `OnError::Skip` we skip the file as soon as we hit the first error
-                                OnError::Skip => match mem::take(next) {
-                                    Some((future, partition_values)) => {
-                                        self.file_stream_metrics.time_opening.start();
-
-                                        match future {
-                                            NextOpen::Pending(future) => {
-                                                self.state = FileStreamState::Open {
-                                                    future,
-                                                    partition_values,
-                                                }
-                                            }
-                                            NextOpen::Ready(reader) => {
-                                                self.state = FileStreamState::Open {
-                                                    future: Box::pin(std::future::ready(
-                                                        reader,
-                                                    )),
-                                                    partition_values,
-                                                }
-                                            }
-                                        }
-                                    }
-                                    None => return Poll::Ready(None),
-                                },
-                                OnError::Fail => {
-                                    self.state = FileStreamState::Error;
-                                    return Poll::Ready(Some(Err(err.into())));
-                                }
-                            }
-                        }
-                        None => {
-                            self.file_stream_metrics.time_scanning_until_data.stop();
-                            self.file_stream_metrics.time_scanning_total.stop();
-
-                            match mem::take(next) {
-                                Some((future, partition_values)) => {
-                                    self.file_stream_metrics.time_opening.start();
-
-                                    match future {
-                                        NextOpen::Pending(future) => {
-                                            self.state = FileStreamState::Open {
-                                                future,
-                                                partition_values,
-                                            }
-                                        }
-                                        NextOpen::Ready(reader) => {
-                                            self.state = FileStreamState::Open {
-                                                future: Box::pin(std::future::ready(
-                                                    reader,
-                                                )),
-                                                partition_values,
-                                            }
-                                        }
-                                    }
-                                }
-                                None => return Poll::Ready(None),
-                            }
-                        }
-                    }
-                }
-                FileStreamState::Error | FileStreamState::Limit => {
-                    return Poll::Ready(None)
-                }
-            }
-        }
-    }
-}
-
-impl Stream for FileStream {
-    type Item = Result<RecordBatch>;
-
-    fn poll_next(
-        mut self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Self::Item>> {
-        self.file_stream_metrics.time_processing.start();
-        let result = self.poll_inner(cx);
-        self.file_stream_metrics.time_processing.stop();
-        self.baseline_metrics.record_poll(result)
-    }
-}
-
-impl RecordBatchStream for FileStream {
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.projected_schema)
-    }
-}
-
-/// A fallible future that resolves to a stream of [`RecordBatch`]
-pub type FileOpenFuture =
-    BoxFuture<'static, Result<BoxStream<'static, Result<RecordBatch, ArrowError>>>>;
-
-/// Describes the behavior of the `FileStream` if file opening or scanning fails
-pub enum OnError {
-    /// Fail the entire stream and return the underlying error
-    Fail,
-    /// Continue scanning, ignoring the failed file
-    Skip,
-}
-
-impl Default for OnError {
-    fn default() -> Self {
-        Self::Fail
-    }
-}
-
-/// Generic API for opening a file using an [`ObjectStore`] and resolving to a
-/// stream of [`RecordBatch`]
-///
-/// [`ObjectStore`]: object_store::ObjectStore
-pub trait FileOpener: Unpin + Send + Sync {
-    /// Asynchronously open the specified file and return a stream
-    /// of [`RecordBatch`]
-    fn open(&self, file_meta: FileMeta) -> Result<FileOpenFuture>;
-}
-
-/// Represents the state of the next `FileOpenFuture`. Since we need to poll
-/// this future while scanning the current file, we need to store the result if it
-/// is ready
-pub enum NextOpen {
-    Pending(FileOpenFuture),
-    Ready(Result<BoxStream<'static, Result<RecordBatch, ArrowError>>>),
-}
-
-pub enum FileStreamState {
-    /// The idle state, no file is currently being read
-    Idle,
-    /// Currently performing asynchronous IO to obtain a stream of RecordBatch
-    /// for a given file
-    Open {
-        /// A [`FileOpenFuture`] returned by [`FileOpener::open`]
-        future: FileOpenFuture,
-        /// The partition values for this file
-        partition_values: Vec<ScalarValue>,
-    },
-    /// Scanning the [`BoxStream`] returned by the completion of a [`FileOpenFuture`]
-    /// returned by [`FileOpener::open`]
-    Scan {
-        /// Partitioning column values for the current batch_iter
-        partition_values: Vec<ScalarValue>,
-        /// The reader instance
-        reader: BoxStream<'static, Result<RecordBatch, ArrowError>>,
-        /// A [`FileOpenFuture`] for the next file to be processed,
-        /// and its corresponding partition column values, if any.
-        /// This allows the next file to be opened in parallel while the
-        /// current file is read.
-        next: Option<(NextOpen, Vec<ScalarValue>)>,
-    },
-    /// Encountered an error
-    Error,
-    /// Reached the row limit
-    Limit,
-}
-
-/// A timer that can be started and stopped.
-pub struct StartableTime {
-    pub metrics: Time,
-    // use for record each part cost time, will eventually add into 'metrics'.
-    pub start: Option<Instant>,
-}
-
-impl StartableTime {
-    pub fn start(&mut self) {
-        assert!(self.start.is_none());
-        self.start = Some(Instant::now());
-    }
-
-    pub fn stop(&mut self) {
-        if let Some(start) = self.start.take() {
-            self.metrics.add_elapsed(start);
-        }
-    }
-}
-
-#[allow(rustdoc::broken_intra_doc_links)]
-/// Metrics for [`FileStream`]
-///
-/// Note that all of these metrics are in terms of wall clock time
-/// (not cpu time) so they include time spent waiting on I/O as well
-/// as other operators.
-///
-/// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_stream.rs>
-pub struct FileStreamMetrics {
-    /// Wall clock time elapsed for file opening.
-    ///
-    /// Time between when [`FileOpener::open`] is called and when the
-    /// [`FileStream`] receives a stream for reading.
-    ///
-    /// If there are multiple files being scanned, the stream
-    /// will open the next file in the background while scanning the
-    /// current file. This metric will only capture time spent opening
-    /// while not also scanning.
-    /// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_stream.rs>
-    pub time_opening: StartableTime,
-    /// Wall clock time elapsed for file scanning + first record batch of decompression + decoding
-    ///
-    /// Time between when the [`FileStream`] requests data from the
-    /// stream and when the first [`RecordBatch`] is produced.
-    /// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_stream.rs>
-    pub time_scanning_until_data: StartableTime,
-    /// Total elapsed wall clock time for scanning + record batch decompression / decoding
-    ///
-    /// Sum of time between when the [`FileStream`] requests data from
-    /// the stream and when a [`RecordBatch`] is produced for all
-    /// record batches in the stream. Note that this metric also
-    /// includes the time of the parent operator's execution.
-    pub time_scanning_total: StartableTime,
-    /// Wall clock time elapsed for data decompression + decoding
-    ///
-    /// Time spent waiting for the FileStream's input.
-    pub time_processing: StartableTime,
-    /// Count of errors opening file.
-    ///
-    /// If using `OnError::Skip` this will provide a count of the number of files
-    /// which were skipped and will not be included in the scan results.
-    pub file_open_errors: Count,
-    /// Count of errors scanning file
-    ///
-    /// If using `OnError::Skip` this will provide a count of the number of files
-    /// which were skipped and will not be included in the scan results.
-    pub file_scan_errors: Count,
-}
-
-impl FileStreamMetrics {
-    pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
-        let time_opening = StartableTime {
-            metrics: MetricBuilder::new(metrics)
-                .subset_time("time_elapsed_opening", partition),
-            start: None,
-        };
-
-        let time_scanning_until_data = StartableTime {
-            metrics: MetricBuilder::new(metrics)
-                .subset_time("time_elapsed_scanning_until_data", partition),
-            start: None,
-        };
-
-        let time_scanning_total = StartableTime {
-            metrics: MetricBuilder::new(metrics)
-                .subset_time("time_elapsed_scanning_total", partition),
-            start: None,
-        };
-
-        let time_processing = StartableTime {
-            metrics: MetricBuilder::new(metrics)
-                .subset_time("time_elapsed_processing", partition),
-            start: None,
-        };
-
-        let file_open_errors =
-            MetricBuilder::new(metrics).counter("file_open_errors", partition);
-
-        let file_scan_errors =
-            MetricBuilder::new(metrics).counter("file_scan_errors", partition);
-
-        Self {
-            time_opening,
-            time_scanning_until_data,
-            time_scanning_total,
-            time_processing,
-            file_open_errors,
-            file_scan_errors,
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::file_scan_config::FileScanConfigBuilder;
-    use crate::tests::make_partition;
-    use crate::PartitionedFile;
-    use arrow::error::ArrowError;
-    use datafusion_common::error::Result;
-    use datafusion_execution::object_store::ObjectStoreUrl;
-    use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-    use futures::{FutureExt as _, StreamExt as _};
-    use std::sync::atomic::{AtomicUsize, Ordering};
-    use std::sync::Arc;
-
-    use crate::file_meta::FileMeta;
-    use crate::file_stream::{FileOpenFuture, FileOpener, FileStream, OnError};
-    use crate::test_util::MockSource;
-    use arrow::array::RecordBatch;
-    use arrow::datatypes::Schema;
-
-    use datafusion_common::{assert_batches_eq, internal_err};
-
-    /// Test `FileOpener` which will simulate errors during file opening or scanning
-    #[derive(Default)]
-    struct TestOpener {
-        /// Index in stream of files which should throw an error while opening
-        error_opening_idx: Vec<usize>,
-        /// Index in stream of files which should throw an error while scanning
-        error_scanning_idx: Vec<usize>,
-        /// Index of last file in stream
-        current_idx: AtomicUsize,
-        /// `RecordBatch` to return
-        records: Vec<RecordBatch>,
-    }
-
-    impl FileOpener for TestOpener {
-        fn open(&self, _file_meta: FileMeta) -> Result<FileOpenFuture> {
-            let idx = self.current_idx.fetch_add(1, Ordering::SeqCst);
-
-            if self.error_opening_idx.contains(&idx) {
-                Ok(futures::future::ready(internal_err!("error opening")).boxed())
-            } else if self.error_scanning_idx.contains(&idx) {
-                let error = futures::future::ready(Err(ArrowError::IpcError(
-                    "error scanning".to_owned(),
-                )));
-                let stream = futures::stream::once(error).boxed();
-                Ok(futures::future::ready(Ok(stream)).boxed())
-            } else {
-                let iterator = self.records.clone().into_iter().map(Ok);
-                let stream = futures::stream::iter(iterator).boxed();
-                Ok(futures::future::ready(Ok(stream)).boxed())
-            }
-        }
-    }
-
-    #[derive(Default)]
-    struct FileStreamTest {
-        /// Number of files in the stream
-        num_files: usize,
-        /// Global limit of records emitted by the stream
-        limit: Option<usize>,
-        /// Error-handling behavior of the stream
-        on_error: OnError,
-        /// Mock `FileOpener`
-        opener: TestOpener,
-    }
-
-    impl FileStreamTest {
-        pub fn new() -> Self {
-            Self::default()
-        }
-
-        /// Specify the number of files in the stream
-        pub fn with_num_files(mut self, num_files: usize) -> Self {
-            self.num_files = num_files;
-            self
-        }
-
-        /// Specify the limit
-        pub fn with_limit(mut self, limit: Option<usize>) -> Self {
-            self.limit = limit;
-            self
-        }
-
-        /// Specify the index of files in the stream which should
-        /// throw an error when opening
-        pub fn with_open_errors(mut self, idx: Vec<usize>) -> Self {
-            self.opener.error_opening_idx = idx;
-            self
-        }
-
-        /// Specify the index of files in the stream which should
-        /// throw an error when scanning
-        pub fn with_scan_errors(mut self, idx: Vec<usize>) -> Self {
-            self.opener.error_scanning_idx = idx;
-            self
-        }
-
-        /// Specify the behavior of the stream when an error occurs
-        pub fn with_on_error(mut self, on_error: OnError) -> Self {
-            self.on_error = on_error;
-            self
-        }
-
-        /// Specify the record batches that should be returned from each
-        /// file that is successfully scanned
-        pub fn with_records(mut self, records: Vec<RecordBatch>) -> Self {
-            self.opener.records = records;
-            self
-        }
-
-        /// Collect the results of the `FileStream`
-        pub async fn result(self) -> Result<Vec<RecordBatch>> {
-            let file_schema = self
-                .opener
-                .records
-                .first()
-                .map(|batch| batch.schema())
-                .unwrap_or_else(|| Arc::new(Schema::empty()));
-
-            // let ctx = SessionContext::new();
-            let mock_files: Vec<(String, u64)> = (0..self.num_files)
-                .map(|idx| (format!("mock_file{idx}"), 10_u64))
-                .collect();
-
-            // let mock_files_ref: Vec<(&str, u64)> = mock_files
-            //     .iter()
-            //     .map(|(name, size)| (name.as_str(), *size))
-            //     .collect();
-
-            let file_group = mock_files
-                .into_iter()
-                .map(|(name, size)| PartitionedFile::new(name, size))
-                .collect();
-
-            let on_error = self.on_error;
-
-            let config = FileScanConfigBuilder::new(
-                ObjectStoreUrl::parse("test:///").unwrap(),
-                file_schema,
-                Arc::new(MockSource::default()),
-            )
-            .with_file_group(file_group)
-            .with_limit(self.limit)
-            .build();
-            let metrics_set = ExecutionPlanMetricsSet::new();
-            let file_stream =
-                FileStream::new(&config, 0, Arc::new(self.opener), &metrics_set)
-                    .unwrap()
-                    .with_on_error(on_error);
-
-            file_stream
-                .collect::<Vec<_>>()
-                .await
-                .into_iter()
-                .collect::<Result<Vec<_>>>()
-        }
-    }
-
-    /// helper that creates a stream of 2 files with the same pair of batches in each ([0,1,2] and [0,1])
-    async fn create_and_collect(limit: Option<usize>) -> Vec<RecordBatch> {
-        FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_limit(limit)
-            .result()
-            .await
-            .expect("error executing stream")
-    }
-
-    #[tokio::test]
-    async fn on_error_opening() -> Result<()> {
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![0])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![0, 1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "++",
-            "++",
-        ], &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn on_error_scanning_fail() -> Result<()> {
-        let result = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Fail)
-            .with_scan_errors(vec![1])
-            .result()
-            .await;
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn on_error_opening_fail() -> Result<()> {
-        let result = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Fail)
-            .with_open_errors(vec![1])
-            .result()
-            .await;
-
-        assert!(result.is_err());
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn on_error_scanning() -> Result<()> {
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_scan_errors(vec![0])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_scan_errors(vec![1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(2)
-            .with_on_error(OnError::Skip)
-            .with_scan_errors(vec![0, 1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "++",
-            "++",
-        ], &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn on_error_mixed() -> Result<()> {
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(3)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![1])
-            .with_scan_errors(vec![0])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(3)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![0])
-            .with_scan_errors(vec![1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(3)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![2])
-            .with_scan_errors(vec![0, 1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "++",
-            "++",
-        ], &batches);
-
-        let batches = FileStreamTest::new()
-            .with_records(vec![make_partition(3), make_partition(2)])
-            .with_num_files(3)
-            .with_on_error(OnError::Skip)
-            .with_open_errors(vec![0, 2])
-            .with_scan_errors(vec![1])
-            .result()
-            .await?;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "++",
-            "++",
-        ], &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn without_limit() -> Result<()> {
-        let batches = create_and_collect(None).await;
-
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn with_limit_between_files() -> Result<()> {
-        let batches = create_and_collect(Some(5)).await;
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "+---+",
-        ], &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn with_limit_at_middle_of_batch() -> Result<()> {
-        let batches = create_and_collect(Some(6)).await;
-        #[rustfmt::skip]
-        assert_batches_eq!(&[
-            "+---+",
-            "| i |",
-            "+---+",
-            "| 0 |",
-            "| 1 |",
-            "| 2 |",
-            "| 0 |",
-            "| 1 |",
-            "| 0 |",
-            "+---+",
-        ], &batches);
-
-        Ok(())
-    }
-}
diff --git a/datafusion/datasource/src/file_stream/mod.rs b/datafusion/datasource/src/file_stream/mod.rs
new file mode 100644
index 0000000000000..17f8a99b9e59a
--- /dev/null
+++ b/datafusion/datasource/src/file_stream/mod.rs
@@ -0,0 +1,2340 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! A generic stream over file format readers that can be used by
+//! any file format that read its files from start to end.
+//!
+//! Note: Most traits here need to be marked `Sync + Send` to be
+//! compliant with the `SendableRecordBatchStream` trait.
+
+pub mod shared_state;
+mod trace;
+
+pub use shared_state::{
+    FileStreamId, OutstandingIoPermit, SharedFileStreamMode, SharedFileStreamState,
+};
+
+use std::collections::VecDeque;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use crate::PartitionedFile;
+use crate::file_scan_config::FileScanConfig;
+use arrow::datatypes::SchemaRef;
+use datafusion_common::{Result, internal_datafusion_err};
+use datafusion_execution::RecordBatchStream;
+use datafusion_physical_plan::metrics::{
+    BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, Time,
+};
+
+use arrow::record_batch::RecordBatch;
+use datafusion_common::instant::Instant;
+
+use crate::morsel::{FileOpenerMorselizer, Morsel, MorselPlanner, Morselizer};
+use datafusion_common_runtime::SpawnedTask;
+use futures::future::BoxFuture;
+use futures::stream::BoxStream;
+use futures::{FutureExt, Stream, StreamExt as _};
+use trace::{ReadTrace, file_stream_trace_enabled};
+
+const DEFAULT_OUTSTANDING_IOS_PER_PARTITION: usize = 2;
+
+/// Keep at most this many morsels buffered before pausing additional planning.
+///
+/// TODO make this a config option
+fn max_buffered_morsels() -> usize {
+    2
+}
+
+/// Resolve the shared outstanding-I/O budget for one `DataSourceExec`.
+///
+/// This is temporary wiring until the datasource layer constructs and passes
+/// the shared state directly into each sibling `FileStream`.
+fn target_datasource_outstanding_ios(num_partitions: usize) -> usize {
+    std::env::var("DATAFUSION_DATASOURCE_OUTSTANDING_IOS")
+        .ok()
+        .and_then(|value| value.parse::<usize>().ok())
+        .filter(|value| *value > 0)
+        .unwrap_or_else(|| {
+            DEFAULT_OUTSTANDING_IOS_PER_PARTITION * std::cmp::max(num_partitions, 1)
+        })
+}
+
+/// Build a default shared state object for streams created directly from a
+/// `FileScanConfig`.
+pub(crate) fn shared_file_stream_state_for(
+    config: &FileScanConfig,
+) -> SharedFileStreamState {
+    SharedFileStreamState::new(
+        target_datasource_outstanding_ios(config.file_groups.len()),
+        if config.preserve_order {
+            SharedFileStreamMode::PreserveOrder
+        } else {
+            SharedFileStreamMode::Unordered
+        },
+    )
+}
+
+/// A stream that iterates record batch by record batch, file over file.
+///
+/// When running, a FileStream has some number of waiting planners (that are
+/// waiting on IO) and some number of read_planners that are waiting on CPU.
+///
+/// When the next batch is requested, the FileStream will first poll any
+/// outstanding io_requests to ensure I/O is making progress in parallel with
+/// batch processing.
+///
+/// It then tries to prioritize processing data it has in its cache by read from
+/// the active stream, if any. If that is not ready, it will use the CPU to
+/// prepare more morsels or discover new IO before launching the next morsel.
+///
+/// Sibling `FileStream`s created for the same `DataSourceExec` may also share a
+/// [`SharedFileStreamState`]. That shared state coordinates resources such as
+/// the total number of outstanding planner I/O operations across all sibling
+/// streams. Each `FileStream` registers itself with the shared state during
+/// construction (via `with_shared_state`) and must acquire a shared permit
+/// before advancing a ready planner toward its next I/O phase. If the planner
+/// actually issues an `io_future`, the permit remains attached to that waiting
+/// planner until the future resolves.
+///
+/// Future feature:
+///  Other FileStreams may steal morsels from this stream to increase parallelism and resource utilization.
+pub struct FileStream {
+    /// Local file/planner/morsel queues owned by this stream.
+    queues: MorselQueue,
+    /// The current reader, if any
+    reader: Option<BoxStream<'static, Result<RecordBatch>>>,
+    /// The stream schema (file schema including partition columns and after
+    /// projection).
+    projected_schema: SchemaRef,
+    /// The remaining number of records to parse until limit is reached, None if no limit
+    remain: Option<usize>,
+    /// A type specific [`Morselizer`] that examines the input files and produces a stream of `Morsels`
+    morselizer: Box<dyn Morselizer>,
+    /// File stream specific metrics
+    file_stream_metrics: FileStreamMetrics,
+    /// runtime baseline metrics
+    baseline_metrics: BaselineMetrics,
+    /// Describes the behavior of the `FileStream` if file opening or scanning fails
+    on_error: OnError,
+    /// Preserve the logical planner/morsel order defined by the
+    /// [`MorselPlan`] API?
+    ///
+    /// If false (the default) morsels will be produced in the order
+    /// that they are ready to be run.
+    ///
+    /// If true, Morsels will be produced in the logical order defined on
+    /// [`MorselPlan`]
+    ///
+    /// [`MorselPlan`]: crate::morsel::MorselPlan
+    preserve_order: bool,
+    /// Preserve output partition boundaries.
+    ///
+    /// If false (the default), morsels may be run by a sibling `FileStream`
+    ///
+    /// If true, morsels will be produced by the same stream that created
+    /// planned them.
+    preserve_partitions: bool,
+    /// Shared scheduling state across all sibling `FileStream`s for the same
+    /// `DataSourceExec`.
+    ///
+    /// This shared state enforces the cross-stream outstanding-I/O budget and
+    /// wakes blocked sibling streams when capacity becomes available again.
+    shared_file_stream_state: SharedFileStreamState,
+    /// This stream's identity within `shared_file_stream_state`.
+    ///
+    /// The id is assigned when the stream registers itself with the shared
+    /// state and is cleared once the stream unregisters after reaching a
+    /// terminal state.
+    stream_id: Option<FileStreamId>,
+    /// Optional runtime trace for observing scheduler behavior.
+    trace: ReadTrace,
+    /// Is the stream complete?
+    state: StreamState,
+}
+
+enum StreamState {
+    /// Stream can make progress when polled
+    Active,
+    /// Stream is done
+    Done,
+    /// Stream is done, and errord
+    Error,
+}
+
+/// Queues for a file/planner/morsel
+///
+/// The flow is:
+/// 1. Read each file  and turn it into morsels (potentially in parallel)
+/// 2. Read each morsel individually and produce `RecordBatch`es for processing
+#[derive(Debug)]
+pub(super) struct MorselQueue {
+    /// Input files that have not yet been morselized.
+    file_iter: VecDeque<PartitionedFile>,
+    /// Planners that are currently waiting on an outstanding I/O phase.
+    waiting_planners: VecDeque<WaitingPlanner>,
+    /// Planners that are CPU-ready and may be advanced by calling `plan()`.
+    ready_planners: VecDeque<Box<dyn MorselPlanner>>,
+    /// Morsels that are ready to be scanned into `RecordBatch`es.
+    morsels: VecDeque<Box<dyn Morsel>>,
+}
+
+impl MorselQueue {
+    /// Create an empty queue set for one file group.
+    pub(super) fn new(file_iter: VecDeque<PartitionedFile>) -> Self {
+        Self {
+            file_iter,
+            waiting_planners: VecDeque::new(),
+            ready_planners: VecDeque::new(),
+            morsels: VecDeque::new(),
+        }
+    }
+
+    /// Clear all planner and morsel work currently owned by this stream.
+    pub(super) fn clear(&mut self) {
+        self.waiting_planners.clear();
+        self.ready_planners.clear();
+        self.morsels.clear();
+    }
+
+    /// Return true if the stream has no remaining queued file or morsel work.
+    pub(super) fn is_empty(&self) -> bool {
+        self.file_iter.is_empty()
+            && self.waiting_planners.is_empty()
+            && self.ready_planners.is_empty()
+            && self.morsels.is_empty()
+    }
+
+    /// Return the number of queued ready morsels.
+    pub(super) fn morsel_len(&self) -> usize {
+        self.morsels.len()
+    }
+
+    /// Return true if there is at least one queued ready morsel.
+    pub(super) fn has_morsels(&self) -> bool {
+        !self.morsels.is_empty()
+    }
+
+    /// Return true if there is at least one queued ready planner.
+    pub(super) fn has_ready_planners(&self) -> bool {
+        !self.ready_planners.is_empty()
+    }
+
+    /// Return true if there is at least one queued waiting planner.
+    pub(super) fn has_waiting_planners(&self) -> bool {
+        !self.waiting_planners.is_empty()
+    }
+
+    /// Return the total number of queued ready and waiting planners.
+    pub(super) fn planner_count(&self) -> usize {
+        self.waiting_planners.len() + self.ready_planners.len()
+    }
+
+    /// Push one CPU-ready planner into the local queue.
+    pub(super) fn push_ready_planner(&mut self, planner: Box<dyn MorselPlanner>) {
+        self.ready_planners.push_back(planner);
+    }
+
+    /// Extend the local queue with CPU-ready planners.
+    pub(super) fn extend_ready_planners(
+        &mut self,
+        planners: impl IntoIterator<Item = Box<dyn MorselPlanner>>,
+    ) {
+        self.ready_planners.extend(planners);
+    }
+
+    /// Pop the next CPU-ready planner from the local queue.
+    pub(super) fn pop_ready_planner(&mut self) -> Option<Box<dyn MorselPlanner>> {
+        self.ready_planners.pop_front()
+    }
+
+    /// Push one waiting planner into the local queue.
+    fn push_waiting_planner(&mut self, planner: WaitingPlanner) {
+        self.waiting_planners.push_back(planner);
+    }
+
+    /// Drain all waiting planners from the queue.
+    fn take_waiting_planners(&mut self) -> VecDeque<WaitingPlanner> {
+        std::mem::take(&mut self.waiting_planners)
+    }
+
+    /// Push one ready morsel into the local queue.
+    pub(super) fn push_morsel(&mut self, morsel: Box<dyn Morsel>) {
+        self.morsels.push_back(morsel);
+    }
+
+    /// Extend the local queue with ready morsels.
+    pub(super) fn extend_morsels(
+        &mut self,
+        morsels: impl IntoIterator<Item = Box<dyn Morsel>>,
+    ) {
+        self.morsels.extend(morsels);
+    }
+
+    /// Pop the next ready morsel from the local queue.
+    pub(super) fn pop_morsel(&mut self) -> Option<Box<dyn Morsel>> {
+        self.morsels.pop_front()
+    }
+
+    /// Pop the next input file from the local queue.
+    pub(super) fn pop_file(&mut self) -> Option<PartitionedFile> {
+        self.file_iter.pop_front()
+    }
+}
+
+/// Builder for constructing a [`FileStream`].
+pub struct FileStreamBuilder<'a> {
+    config: &'a FileScanConfig,
+    partition: usize,
+    morselizer: Box<dyn Morselizer>,
+    metrics: &'a ExecutionPlanMetricsSet,
+    on_error: OnError,
+    preserve_order: bool,
+    preserve_partitions: bool,
+    shared_file_stream_state: Option<SharedFileStreamState>,
+}
+
+impl<'a> FileStreamBuilder<'a> {
+    /// Create a new builder using a legacy [`FileOpener`].
+    pub fn new(
+        config: &'a FileScanConfig,
+        partition: usize,
+        file_opener: Arc<dyn FileOpener>,
+        metrics: &'a ExecutionPlanMetricsSet,
+    ) -> Self {
+        Self::new_with_morselizer(
+            config,
+            partition,
+            Box::new(FileOpenerMorselizer::new(file_opener)),
+            metrics,
+        )
+    }
+
+    /// Create a new builder using a [`Morselizer`].
+    pub fn new_with_morselizer(
+        config: &'a FileScanConfig,
+        partition: usize,
+        morselizer: Box<dyn Morselizer>,
+        metrics: &'a ExecutionPlanMetricsSet,
+    ) -> Self {
+        Self {
+            config,
+            partition,
+            morselizer,
+            on_error: OnError::Fail,
+            preserve_order: config.preserve_order,
+            preserve_partitions: config.partitioned_by_file_group,
+            metrics,
+            shared_file_stream_state: None,
+        }
+    }
+
+    /// Configure the behavior when opening or scanning a file fails.
+    pub fn with_on_error(mut self, on_error: OnError) -> Self {
+        self.on_error = on_error;
+        self
+    }
+
+    /// Configure whether this stream should preserve logical planner order.
+    pub fn with_preserve_order(mut self, preserve_order: bool) -> Self {
+        self.preserve_order = preserve_order;
+        self
+    }
+
+    /// Use the provided shared scheduler state instead of the default one.
+    pub fn with_shared_state(
+        mut self,
+        shared_file_stream_state: SharedFileStreamState,
+    ) -> Self {
+        self.shared_file_stream_state = Some(shared_file_stream_state);
+        self
+    }
+
+    /// Build the configured [`FileStream`].
+    pub fn build(self) -> Result<FileStream> {
+        let shared_file_stream_state = self
+            .shared_file_stream_state
+            .unwrap_or_else(|| shared_file_stream_state_for(self.config));
+        let projected_schema = self.config.projected_schema()?;
+        let file_group = self.config.file_groups[self.partition].clone();
+        let stream_id = shared_file_stream_state.register_stream();
+        let trace =
+            ReadTrace::new(file_stream_trace_enabled(), self.partition, stream_id);
+
+        Ok(FileStream {
+            queues: MorselQueue::new(file_group.into_inner().into_iter().collect()),
+            reader: None,
+            projected_schema,
+            remain: self.config.limit,
+            morselizer: self.morselizer,
+            file_stream_metrics: FileStreamMetrics::new(self.metrics, self.partition),
+            baseline_metrics: BaselineMetrics::new(self.metrics, self.partition),
+            on_error: self.on_error,
+            preserve_order: self.preserve_order,
+            preserve_partitions: self.preserve_partitions,
+            shared_file_stream_state,
+            stream_id: Some(stream_id),
+            trace,
+            state: StreamState::Active,
+        })
+    }
+}
+
+impl FileStream {
+    /// Return true if this stream may publish and steal ready work from the
+    /// shared queue.
+    ///
+    /// The shared queue permits output to be reordered, so only do this when
+    /// stream does not need to preserve ordering within the stream or across
+    /// partitions.
+    fn can_share_ready_work(&self) -> bool {
+        !self.preserve_order
+            && !self.preserve_partitions
+            && self.shared_file_stream_state.registered_stream_count() > 1
+    }
+
+    /// Enqueue ready planners either locally or into the shared queue.
+    fn push_ready_planners(
+        &mut self,
+        planners: impl IntoIterator<Item = Box<dyn MorselPlanner>>,
+    ) {
+        let planners: Vec<_> = planners.into_iter().collect();
+        if planners.is_empty() {
+            return;
+        }
+        if self.can_share_ready_work() {
+            self.trace.planners_ready(planners.len(), true);
+            for planner in planners {
+                self.shared_file_stream_state.push_ready_planner(planner);
+            }
+        } else {
+            self.trace.planners_ready(planners.len(), false);
+            self.queues.extend_ready_planners(planners);
+        }
+    }
+
+    /// Enqueue ready morsels either locally or into the shared queue.
+    fn push_ready_morsels(&mut self, morsels: impl IntoIterator<Item = Box<dyn Morsel>>) {
+        let morsels: Vec<_> = morsels.into_iter().collect();
+        if morsels.is_empty() {
+            return;
+        }
+        if self.can_share_ready_work() {
+            self.trace.morsels_ready(morsels.len(), true);
+            for morsel in morsels {
+                self.shared_file_stream_state.push_ready_morsel(morsel);
+            }
+        } else {
+            self.trace.morsels_ready(morsels.len(), false);
+            self.queues.extend_morsels(morsels);
+        }
+    }
+
+    /// Try to steal one ready morsel or planner from the shared queue and place
+    /// them in the local queue.
+    ///
+    /// Morsels are preferred because they are already fully prepared CPU work.
+    fn try_steal_ready_work(&mut self) -> bool {
+        if !self.can_share_ready_work()
+            || self.reader.is_some()
+            || self.queues.has_morsels()
+            || self.queues.has_ready_planners()
+        {
+            return false;
+        }
+
+        if let Some(morsel) = self.shared_file_stream_state.pop_ready_morsel() {
+            self.queues.push_morsel(morsel);
+            self.trace.stole_work("morsel");
+            return true;
+        }
+
+        if let Some(planner) = self.shared_file_stream_state.pop_ready_planner() {
+            self.queues.push_ready_planner(planner);
+            self.trace.stole_work("planner");
+            return true;
+        }
+
+        false
+    }
+
+    /// Create a new [`FileStream`] using a legacy [`FileOpener`].
+    ///
+    /// Prefer [`FileStreamBuilder`] for new code.
+    #[deprecated(since = "52.3.0", note = "use FileStreamBuilder instead")]
+    pub fn new(
+        config: &FileScanConfig,
+        partition: usize,
+        file_opener: Arc<dyn FileOpener>,
+        metrics: &ExecutionPlanMetricsSet,
+    ) -> Result<Self> {
+        FileStreamBuilder::new(config, partition, file_opener, metrics).build()
+    }
+
+    /// Return this stream's registered shared-state id.
+    fn stream_id(&self) -> Result<FileStreamId> {
+        self.stream_id.ok_or_else(|| {
+            internal_datafusion_err!("file stream is not registered with shared state")
+        })
+    }
+
+    /// Unregister this stream from the shared scheduler once it reaches a
+    /// terminal state.
+    fn unregister_stream_if_needed(&mut self) {
+        if let Some(stream_id) = self.stream_id.take() {
+            self.shared_file_stream_state.unregister_stream(stream_id);
+        }
+    }
+
+    /// Run a planner on CPU until it either needs I/O or fully completes.
+    ///
+    /// Any morsels produced along the way are appended to `self.morsels`. If
+    /// the planner needs more I/O, it is moved to `waiting_planners`.
+    fn plan_morsels(
+        &mut self,
+        mut planner: Box<dyn MorselPlanner>,
+        io_permit: OutstandingIoPermit,
+    ) -> Result<()> {
+        let max_buffered_morsels = max_buffered_morsels();
+        let mut io_permit = Some(io_permit);
+        while let Some(mut plan) = planner.plan()? {
+            let morsels = plan.take_morsels();
+            let planners = plan.take_planners();
+            let io_future = plan.take_io_future();
+            self.trace
+                .plan_result(morsels.len(), planners.len(), io_future.is_some());
+            self.push_ready_morsels(morsels);
+            self.push_ready_planners(planners);
+            if let Some(io_future) = io_future {
+                self.queues.push_waiting_planner(WaitingPlanner::new(
+                    planner,
+                    io_future,
+                    io_permit
+                        .take()
+                        .expect("planner I/O permit should be available"),
+                ));
+                self.trace.io_scheduled(self.queues.waiting_planners.len());
+                break;
+            }
+
+            if self.queues.morsel_len() >= max_buffered_morsels {
+                self.push_ready_planners(std::iter::once(planner));
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    /// Turn one file into one or more planners and immediately drive each of
+    /// them into the ready queue.
+    ///
+    /// The actual `plan()` calls happen in `poll_inner` once the stream has
+    /// acquired a shared permit to potentially issue another outstanding I/O.
+    fn morselize_next_file(&mut self, file: PartitionedFile) -> Result<()> {
+        self.trace.file_opened(&file);
+        for planner in self.morselizer.morselize(file)? {
+            self.push_ready_planners(std::iter::once(planner));
+        }
+        Ok(())
+    }
+
+    /// Pull additional files into the planner pipeline until the configured
+    /// planner concurrency target is reached.
+    ///
+    /// This is where new file-level work enters the stream. Formats that do all
+    /// of their planning synchronously may immediately populate `self.morsels`,
+    /// while formats that need metadata I/O will populate `waiting_planners`.
+    fn start_next_files(&mut self) -> Result<()> {
+        let max_buffered_morsels = max_buffered_morsels();
+        // Keep local file admission bounded per stream. The shared state
+        // controls the total outstanding I/O budget across sibling streams,
+        // but using that global budget as a per-stream admission target causes
+        // each stream to eagerly admit far too many files and planners.
+        let local_planner_target = DEFAULT_OUTSTANDING_IOS_PER_PARTITION.max(1);
+        while self.queues.planner_count() < local_planner_target {
+            // In ordered mode, do not admit later files while there is any
+            // earlier file work still buffered, waiting on I/O, or actively
+            // being scanned. This keeps file-level planning from introducing
+            // later output ahead of earlier files.
+            if self.preserve_order
+                && (self.reader.is_some()
+                    || self.queues.has_morsels()
+                    || self.queues.has_ready_planners()
+                    || self.queues.has_waiting_planners())
+            {
+                break;
+            }
+            if self.queues.morsel_len() >= max_buffered_morsels {
+                break;
+            }
+            let Some(file) = self.queues.pop_file() else {
+                break;
+            };
+            self.morselize_next_file(file)?;
+        }
+        Ok(())
+    }
+
+    /// Poll each waiting planner's outstanding I/O once.
+    ///
+    /// When a future completes successfully, the planner becomes CPU-ready
+    /// again and is moved back to `ready_planners`. Failed futures are handled
+    /// according to `OnError`.
+    fn check_io(&mut self, cx: &mut Context<'_>) -> Result<()> {
+        for mut waiting_planner in self.queues.take_waiting_planners() {
+            match waiting_planner.io_task.poll_unpin(cx) {
+                Poll::Ready(Ok(Ok(()))) => {
+                    self.file_stream_metrics.files_opened.add(1);
+                    self.push_ready_planners(std::iter::once(waiting_planner.planner));
+                    self.trace.io_completed(self.queues.planner_count());
+                }
+                Poll::Ready(Ok(Err(e))) => {
+                    self.file_stream_metrics.file_open_errors.add(1);
+                    match self.on_error {
+                        OnError::Skip => {
+                            self.file_stream_metrics.files_processed.add(1);
+                        }
+                        OnError::Fail => return Err(e),
+                    }
+                }
+                Poll::Ready(Err(join_err)) => {
+                    self.file_stream_metrics.file_open_errors.add(1);
+                    let e =
+                        datafusion_common::DataFusionError::External(Box::new(join_err));
+                    match self.on_error {
+                        OnError::Skip => {
+                            self.file_stream_metrics.files_processed.add(1);
+                        }
+                        OnError::Fail => return Err(e),
+                    }
+                }
+                Poll::Pending => self.queues.push_waiting_planner(waiting_planner),
+            }
+        }
+        Ok(())
+    }
+
+    /// Convert the next ready morsel into an active `RecordBatch` reader.
+    ///
+    /// This only happens when there is no reader currently in flight. The
+    /// corresponding scan timers start here because the morsel is now eligible
+    /// to produce batches.
+    fn start_next_morsel(&mut self) {
+        if self.reader.is_none()
+            && let Some(morsel) = self.queues.pop_morsel()
+        {
+            self.reader = Some(morsel.into_stream());
+            self.trace.morsel_started(self.queues.morsel_len());
+            self.file_stream_metrics.time_scanning_until_data.start();
+            self.file_stream_metrics.time_scanning_total.start();
+        }
+    }
+
+    /// Drive the `FileStream` scheduler forward by one poll.
+    ///
+    /// The order is important:
+    /// 1. Admit more files into the planner pipeline up to the concurrency
+    ///    target (ensures I/O are scheduled if needed)
+    /// 2. Poll outstanding planner I/O (ensure I/O completes in parallel)
+    /// 3. Spend CPU on ready planners only when there is no morsel already ready
+    ///    to execute.
+    /// 4. Launch and poll the active morsel reader.
+    fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll<Option<Result<RecordBatch>>> {
+        loop {
+            match self.state {
+                StreamState::Active => {}
+                StreamState::Done => {
+                    self.unregister_stream_if_needed();
+                    return Poll::Ready(None);
+                }
+                StreamState::Error => {
+                    self.unregister_stream_if_needed();
+                    return Poll::Ready(None);
+                }
+            }
+
+            if let Err(e) = self.start_next_files() {
+                self.queues.clear();
+                self.state = StreamState::Error;
+                self.unregister_stream_if_needed();
+                return Poll::Ready(Some(Err(e)));
+            }
+            if let Err(e) = self.check_io(cx) {
+                self.queues.clear();
+                self.state = StreamState::Error;
+                self.unregister_stream_if_needed();
+                return Poll::Ready(Some(Err(e)));
+            }
+
+            // Opportunistically refill the local queues from shared ready work
+            // before spending more CPU locally. We intentionally ignore the
+            // return value here because this is only a best-effort steal: the
+            // normal scheduler flow below will observe any newly queued work.
+            let _ = self.try_steal_ready_work();
+
+            // Give ready planners CPU whenever there is buffer space, even if a
+            // reader is currently active. This avoids starving planner work
+            // behind a reader that is itself waiting on I/O.
+            while self.queues.morsel_len() < max_buffered_morsels() {
+                // In ordered mode, once an earlier planner has produced a
+                // morsel or is blocked on I/O, do not advance later sibling
+                // planners yet. This preserves the logical `MorselPlan` order:
+                // direct morsels first, then child planners in API order.
+                if self.preserve_order
+                    && (self.reader.is_some()
+                        || self.queues.has_morsels()
+                        || self.queues.has_waiting_planners())
+                {
+                    break;
+                }
+                let stream_id = match self.stream_id() {
+                    Ok(stream_id) => stream_id,
+                    Err(e) => {
+                        self.queues.clear();
+                        self.state = StreamState::Error;
+                        self.unregister_stream_if_needed();
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                };
+                let Some(io_permit) = self
+                    .shared_file_stream_state
+                    .try_acquire_io_permit(stream_id)
+                else {
+                    self.shared_file_stream_state
+                        .register_waker(stream_id, cx.waker());
+                    break;
+                };
+                let Some(planner) = self.queues.pop_ready_planner() else {
+                    drop(io_permit);
+                    break;
+                };
+                if let Err(e) = self.plan_morsels(planner, io_permit) {
+                    self.queues.clear();
+                    self.state = StreamState::Error;
+                    self.unregister_stream_if_needed();
+                    return Poll::Ready(Some(Err(e)));
+                }
+
+                // Once a morsel is buffered and a reader is already active,
+                // return to the scan side of the scheduler rather than
+                // continuing to spend CPU on planning in this poll.
+                if self.reader.is_some() && self.queues.has_morsels() {
+                    break;
+                }
+            }
+
+            // Newly planned work may have just discovered fresh I/O. Poll it
+            // once now so the future can register the current waker before we
+            // return `Pending`; otherwise the stream can stall waiting on an
+            // I/O future that has never been polled.
+            if let Err(e) = self.check_io(cx) {
+                self.queues.clear();
+                self.state = StreamState::Error;
+                self.unregister_stream_if_needed();
+                return Poll::Ready(Some(Err(e)));
+            }
+
+            // After polling I/O, see if a sibling published newly ready work.
+            // The boolean result is ignored because this is only an
+            // opportunistic prefetch into the local queues; the subsequent
+            // checks for local planners/morsels will handle any stolen work.
+            let _ = self.try_steal_ready_work();
+
+            // The second I/O poll may have completed planner work discovered
+            // during this same call to `poll_inner`. Loop back so newly ready
+            // planners get CPU time before we consider returning `Pending`.
+            if self.queues.has_ready_planners()
+                && self.queues.morsel_len() < max_buffered_morsels()
+                // In ordered mode, only loop back for more planner CPU when
+                // there is no earlier reader, buffered morsel, or waiting I/O
+                // that should be drained first. Otherwise, drop to
+                // `start_next_morsel()` so output is produced in order.
+                && (!self.preserve_order
+                    || (self.reader.is_none()
+                        && !self.queues.has_morsels()
+                        && !self.queues.has_waiting_planners()))
+            {
+                continue;
+            }
+
+            self.start_next_morsel();
+
+            if let Some(reader) = self.reader.as_mut() {
+                match reader.poll_next_unpin(cx) {
+                    Poll::Ready(Some(Ok(batch))) => {
+                        self.file_stream_metrics.time_scanning_until_data.stop();
+                        self.file_stream_metrics.time_scanning_total.stop();
+                        let batch = match &mut self.remain {
+                            Some(remain) => {
+                                if batch.num_rows() > *remain {
+                                    let batch = batch.slice(0, *remain);
+                                    *remain = 0;
+                                    self.state = StreamState::Done;
+                                    batch
+                                } else {
+                                    *remain -= batch.num_rows();
+                                    batch
+                                }
+                            }
+                            None => batch,
+                        };
+                        self.file_stream_metrics.time_scanning_total.start();
+                        self.trace.batch_emitted(batch.num_rows());
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    Poll::Ready(Some(Err(e))) => {
+                        self.reader = None;
+                        self.file_stream_metrics.file_scan_errors.add(1);
+                        self.file_stream_metrics.time_scanning_until_data.stop();
+                        self.file_stream_metrics.time_scanning_total.stop();
+
+                        match self.on_error {
+                            OnError::Fail => {
+                                self.queues.clear();
+                                self.state = StreamState::Error;
+                                self.unregister_stream_if_needed();
+                                return Poll::Ready(Some(Err(e)));
+                            }
+                            OnError::Skip => {
+                                self.file_stream_metrics.files_processed.add(1);
+                                continue;
+                            }
+                        }
+                    }
+                    Poll::Ready(None) => {
+                        self.reader = None;
+                        self.file_stream_metrics.files_processed.add(1);
+                        self.file_stream_metrics.time_scanning_until_data.stop();
+                        self.file_stream_metrics.time_scanning_total.stop();
+                        continue;
+                    }
+                    Poll::Pending => {}
+                }
+            }
+
+            if self.reader.is_none() && self.queues.is_empty() {
+                self.state = StreamState::Done;
+                self.unregister_stream_if_needed();
+                return Poll::Ready(None);
+            }
+
+            // try and find more work if possible, but if not,wait on the waker
+            if !self.try_steal_ready_work()
+                && self.can_share_ready_work()
+                && self.reader.is_none()
+                && !self.queues.has_morsels()
+                && !self.queues.has_ready_planners()
+            {
+                self.trace.waiting("shared_work_or_io");
+                let stream_id = match self.stream_id() {
+                    Ok(stream_id) => stream_id,
+                    Err(e) => {
+                        self.queues.clear();
+                        self.state = StreamState::Error;
+                        self.unregister_stream_if_needed();
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                };
+                self.shared_file_stream_state
+                    .register_waker(stream_id, cx.waker());
+            // If the active reader just returned `Pending`, yield back to the
+            // executor instead of looping immediately. Otherwise a reader that
+            // needs more I/O can hot-loop inside `poll_inner` as long as there
+            // is buffered work behind it, repeatedly polling the same pending
+            // reader without giving the executor a chance to wake it.
+            } else if self.reader.is_none()
+                && (self.queues.has_morsels() || self.queues.has_ready_planners())
+            {
+                continue;
+            }
+
+            self.trace.waiting("reader_or_io");
+            return Poll::Pending;
+        }
+    }
+}
+
+/// A planner that has already discovered its next I/O phase.
+///
+/// The I/O future is spawned onto the tokio runtime so it progresses
+/// independently of when this `FileStream` is polled, enabling true
+/// parallel prefetch of row-group data.
+struct WaitingPlanner {
+    planner: Box<dyn MorselPlanner>,
+    io_task: SpawnedTask<Result<()>>,
+    _io_permit: OutstandingIoPermit,
+}
+
+impl WaitingPlanner {
+    fn new(
+        planner: Box<dyn MorselPlanner>,
+        io_future: BoxFuture<'static, Result<()>>,
+        io_permit: OutstandingIoPermit,
+    ) -> Self {
+        let io_task = SpawnedTask::spawn(io_future);
+        Self {
+            planner,
+            io_task,
+            _io_permit: io_permit,
+        }
+    }
+}
+
+impl std::fmt::Debug for WaitingPlanner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("WaitingPlanner").finish_non_exhaustive()
+    }
+}
+
+impl Drop for FileStream {
+    fn drop(&mut self) {
+        // Release any outstanding permits before unregistering this stream
+        // from the shared scheduler.
+        self.queues.take_waiting_planners();
+        self.unregister_stream_if_needed();
+    }
+}
+
+impl Stream for FileStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        self.file_stream_metrics.time_processing.start();
+        let result = self.poll_inner(cx);
+        self.file_stream_metrics.time_processing.stop();
+        self.baseline_metrics.record_poll(result)
+    }
+}
+
+impl RecordBatchStream for FileStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.projected_schema)
+    }
+}
+
+/// A fallible future that resolves to a stream of [`RecordBatch`]
+///
+/// This is typically an `async` function that opens the file, and returns a
+/// stream that reads the file and produces `RecordBatch`es.
+pub type FileOpenFuture =
+    BoxFuture<'static, Result<BoxStream<'static, Result<RecordBatch>>>>;
+
+/// Describes the behavior of the `FileStream` if file opening or scanning fails
+#[derive(Default)]
+pub enum OnError {
+    /// Fail the entire stream and return the underlying error
+    #[default]
+    Fail,
+    /// Continue scanning, ignoring the failed file
+    Skip,
+}
+
+/// Generic API for opening a file using an [`ObjectStore`] and resolving to a
+/// stream of [`RecordBatch`]
+///
+/// [`ObjectStore`]: object_store::ObjectStore
+pub trait FileOpener: Unpin + Send + Sync {
+    /// Asynchronously open the specified file and return a stream
+    /// of [`RecordBatch`]
+    ///
+    /// TODO: describe prefetching behavior here, and expectations around IO
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture>;
+}
+
+/// A timer that can be started and stopped.
+pub struct StartableTime {
+    pub metrics: Time,
+    // use for record each part cost time, will eventually add into 'metrics'.
+    pub start: Option<Instant>,
+}
+
+impl StartableTime {
+    pub fn start(&mut self) {
+        assert!(self.start.is_none());
+        self.start = Some(Instant::now());
+    }
+
+    pub fn stop(&mut self) {
+        if let Some(start) = self.start.take() {
+            self.metrics.add_elapsed(start);
+        }
+    }
+}
+
+/// Metrics for [`FileStream`]
+///
+/// Note that all of these metrics are in terms of wall clock time
+/// (not cpu time) so they include time spent waiting on I/O as well
+/// as other operators.
+///
+/// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/datasource/src/file_stream.rs>
+pub struct FileStreamMetrics {
+    /// Wall clock time elapsed for file opening.
+    ///
+    /// Time between when [`FileOpener::open`] is called and when the
+    /// [`FileStream`] receives a stream for reading.
+    ///
+    /// If there are multiple files being scanned, the stream
+    /// will open the next file in the background while scanning the
+    /// current file. This metric will only capture time spent opening
+    /// while not also scanning.
+    /// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/datasource/src/file_stream.rs>
+    pub time_opening: StartableTime,
+    /// Wall clock time elapsed for file scanning + first record batch of decompression + decoding
+    ///
+    /// Time between when the [`FileStream`] requests data from the
+    /// stream and when the first [`RecordBatch`] is produced.
+    /// [`FileStream`]: <https://github.com/apache/datafusion/blob/main/datafusion/datasource/src/file_stream.rs>
+    pub time_scanning_until_data: StartableTime,
+    /// Total elapsed wall clock time for scanning + record batch decompression / decoding
+    ///
+    /// Sum of time between when the [`FileStream`] requests data from
+    /// the stream and when a [`RecordBatch`] is produced for all
+    /// record batches in the stream. Note that this metric also
+    /// includes the time of the parent operator's execution.
+    pub time_scanning_total: StartableTime,
+    /// Wall clock time elapsed for data decompression + decoding
+    ///
+    /// Time spent waiting for the FileStream's input.
+    pub time_processing: StartableTime,
+    /// Count of errors opening file.
+    ///
+    /// If using `OnError::Skip` this will provide a count of the number of files
+    /// which were skipped and will not be included in the scan results.
+    pub file_open_errors: Count,
+    /// Count of errors scanning file
+    ///
+    /// If using `OnError::Skip` this will provide a count of the number of files
+    /// which were skipped and will not be included in the scan results.
+    pub file_scan_errors: Count,
+    /// Count of files successfully opened or evaluated for processing.
+    /// At t=end (completion of a query) this is equal to `files_opened`, and both values are equal
+    /// to the total number of files in the query; unless the query itself fails.
+    /// This value will always be greater than or equal to `files_open`.
+    /// Note that this value does *not* mean the file was actually scanned.
+    /// We increment this value for any processing of a file, even if that processing is
+    /// discarding it because we hit a `LIMIT` (in this case `files_opened` and `files_processed` are both incremented at the same time).
+    pub files_opened: Count,
+    /// Count of files completely processed / closed (opened, pruned, or skipped due to limit).
+    /// At t=0 (the beginning of a query) this is 0.
+    /// At t=end (completion of a query) this is equal to `files_opened`, and both values are equal
+    /// to the total number of files in the query; unless the query itself fails.
+    /// This value will always be less than or equal to `files_open`.
+    /// We increment this value for any processing of a file, even if that processing is
+    /// discarding it because we hit a `LIMIT` (in this case `files_opened` and `files_processed` are both incremented at the same time).
+    pub files_processed: Count,
+}
+
+impl FileStreamMetrics {
+    pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
+        let time_opening = StartableTime {
+            metrics: MetricBuilder::new(metrics)
+                .subset_time("time_elapsed_opening", partition),
+            start: None,
+        };
+
+        let time_scanning_until_data = StartableTime {
+            metrics: MetricBuilder::new(metrics)
+                .subset_time("time_elapsed_scanning_until_data", partition),
+            start: None,
+        };
+
+        let time_scanning_total = StartableTime {
+            metrics: MetricBuilder::new(metrics)
+                .subset_time("time_elapsed_scanning_total", partition),
+            start: None,
+        };
+
+        let time_processing = StartableTime {
+            metrics: MetricBuilder::new(metrics)
+                .subset_time("time_elapsed_processing", partition),
+            start: None,
+        };
+
+        let file_open_errors =
+            MetricBuilder::new(metrics).counter("file_open_errors", partition);
+
+        let file_scan_errors =
+            MetricBuilder::new(metrics).counter("file_scan_errors", partition);
+
+        let files_opened = MetricBuilder::new(metrics).counter("files_opened", partition);
+
+        let files_processed =
+            MetricBuilder::new(metrics).counter("files_processed", partition);
+
+        Self {
+            time_opening,
+            time_scanning_until_data,
+            time_scanning_total,
+            time_processing,
+            file_open_errors,
+            file_scan_errors,
+            files_opened,
+            files_processed,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::file_groups::FileGroup;
+    use crate::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+    use crate::morsel::test_utils::{
+        IoFutureId, MockMorselSpec, MockMorselizer, MockPlanner, MorselId,
+        MorselObserver, PlannerId, ReturnPlanBuilder,
+    };
+    use crate::tests::make_partition;
+    use crate::{PartitionedFile, TableSchema};
+    use arrow::datatypes::Int32Type;
+    use datafusion_common::error::Result;
+    use datafusion_execution::object_store::ObjectStoreUrl;
+    use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use crate::file_stream::{
+        FileOpenFuture, FileOpener, FileStream, FileStreamBuilder, OnError,
+        SharedFileStreamMode, SharedFileStreamState,
+    };
+    use crate::test_util::MockSource;
+    use arrow::array::{Array, AsArray, RecordBatch};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::{assert_batches_eq, exec_err, internal_err};
+    use futures::{FutureExt, StreamExt};
+
+    /// Test `FileOpener` which will simulate errors during file opening or scanning
+    #[derive(Default)]
+    struct TestOpener {
+        /// Index in stream of files which should throw an error while opening
+        error_opening_idx: Vec<usize>,
+        /// Index in stream of files which should throw an error while scanning
+        error_scanning_idx: Vec<usize>,
+        /// Index of last file in stream
+        current_idx: AtomicUsize,
+        /// `RecordBatch` to return
+        records: Vec<RecordBatch>,
+    }
+
+    impl FileOpener for TestOpener {
+        fn open(&self, _partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
+            let idx = self.current_idx.fetch_add(1, Ordering::SeqCst);
+
+            if self.error_opening_idx.contains(&idx) {
+                Ok(futures::future::ready(internal_err!("error opening")).boxed())
+            } else if self.error_scanning_idx.contains(&idx) {
+                let error = futures::future::ready(exec_err!("error scanning"));
+                let stream = futures::stream::once(error).boxed();
+                Ok(futures::future::ready(Ok(stream)).boxed())
+            } else {
+                let iterator = self.records.clone().into_iter().map(Ok);
+                let stream = futures::stream::iter(iterator).boxed();
+                Ok(futures::future::ready(Ok(stream)).boxed())
+            }
+        }
+    }
+
+    #[derive(Default)]
+    struct FileStreamTest {
+        /// Number of files in the stream
+        num_files: usize,
+        /// Global limit of records emitted by the stream
+        limit: Option<usize>,
+        /// Error-handling behavior of the stream
+        on_error: OnError,
+        /// Mock `FileOpener`
+        opener: TestOpener,
+    }
+
+    impl FileStreamTest {
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Specify the number of files in the stream
+        pub fn with_num_files(mut self, num_files: usize) -> Self {
+            self.num_files = num_files;
+            self
+        }
+
+        /// Specify the limit
+        pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+            self.limit = limit;
+            self
+        }
+
+        /// Specify the index of files in the stream which should
+        /// throw an error when opening
+        pub fn with_open_errors(mut self, idx: Vec<usize>) -> Self {
+            self.opener.error_opening_idx = idx;
+            self
+        }
+
+        /// Specify the index of files in the stream which should
+        /// throw an error when scanning
+        pub fn with_scan_errors(mut self, idx: Vec<usize>) -> Self {
+            self.opener.error_scanning_idx = idx;
+            self
+        }
+
+        /// Specify the behavior of the stream when an error occurs
+        pub fn with_on_error(mut self, on_error: OnError) -> Self {
+            self.on_error = on_error;
+            self
+        }
+
+        /// Specify the record batches that should be returned from each
+        /// file that is successfully scanned
+        pub fn with_records(mut self, records: Vec<RecordBatch>) -> Self {
+            self.opener.records = records;
+            self
+        }
+
+        /// Collect the results of the `FileStream`
+        pub async fn result(self) -> Result<Vec<RecordBatch>> {
+            let file_schema = self
+                .opener
+                .records
+                .first()
+                .map(|batch| batch.schema())
+                .unwrap_or_else(|| Arc::new(Schema::empty()));
+
+            // let ctx = SessionContext::new();
+            let mock_files: Vec<(String, u64)> = (0..self.num_files)
+                .map(|idx| (format!("mock_file{idx}"), 10_u64))
+                .collect();
+
+            // let mock_files_ref: Vec<(&str, u64)> = mock_files
+            //     .iter()
+            //     .map(|(name, size)| (name.as_str(), *size))
+            //     .collect();
+
+            let file_group = mock_files
+                .into_iter()
+                .map(|(name, size)| PartitionedFile::new(name, size))
+                .collect();
+
+            let on_error = self.on_error;
+
+            let table_schema = TableSchema::new(file_schema, vec![]);
+            let config = FileScanConfigBuilder::new(
+                ObjectStoreUrl::parse("test:///").unwrap(),
+                Arc::new(MockSource::new(table_schema)),
+            )
+            .with_file_group(file_group)
+            .with_limit(self.limit)
+            .build();
+            let metrics_set = ExecutionPlanMetricsSet::new();
+            let file_stream =
+                FileStreamBuilder::new(&config, 0, Arc::new(self.opener), &metrics_set)
+                    .with_on_error(on_error)
+                    .build()
+                    .unwrap();
+
+            file_stream
+                .collect::<Vec<_>>()
+                .await
+                .into_iter()
+                .collect::<Result<Vec<_>>>()
+        }
+    }
+
+    /// helper that creates a stream of 2 files with the same pair of batches in each ([0,1,2] and [0,1])
+    async fn create_and_collect(limit: Option<usize>) -> Vec<RecordBatch> {
+        FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_limit(limit)
+            .result()
+            .await
+            .expect("error executing stream")
+    }
+
+    /// Helper for morsel-driven `FileStream` tests that bundles the mock
+    /// `Morselizer` setup with the corresponding `FileScanConfig`.
+    #[derive(Clone)]
+    struct MorselTest {
+        morselizer: MockMorselizer,
+        file_names: Vec<String>,
+        preserve_order: bool,
+        event_summaries: bool,
+    }
+
+    impl MorselTest {
+        /// Create an empty morsel-driven test harness.
+        fn new() -> Self {
+            Self {
+                morselizer: MockMorselizer::new(),
+                file_names: vec![],
+                preserve_order: false,
+                event_summaries: false,
+            }
+        }
+
+        /// Add one file and its root mock planner to the test input.
+        fn with_file(mut self, path: impl Into<String>, planner: MockPlanner) -> Self {
+            let path = path.into();
+            self.morselizer = self.morselizer.with_file(path.clone(), planner);
+            self.file_names.push(path);
+            self
+        }
+
+        /// Run this test harness with ordered output semantics enabled.
+        fn with_preserve_order(mut self, preserve_order: bool) -> Self {
+            self.preserve_order = preserve_order;
+            self
+        }
+
+        /// Snapshot only the higher-level scheduler events.
+        ///
+        /// The full event trace is still useful for detailed tests, but for
+        /// more complex tests those lower level events obscure the important
+        /// events.
+        fn with_event_summaries(mut self) -> Self {
+            self.event_summaries = true;
+            self
+        }
+
+        /// Build the `FileScanConfig` corresponding to the configured mock
+        /// file set.
+        fn test_config(&self) -> FileScanConfig {
+            let file_group = self
+                .file_names
+                .iter()
+                .map(|name| PartitionedFile::new(name, 10))
+                .collect();
+            let table_schema = TableSchema::new(
+                Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)])),
+                vec![],
+            );
+            FileScanConfigBuilder::new(
+                ObjectStoreUrl::parse("test:///").unwrap(),
+                Arc::new(MockSource::new(table_schema)),
+            )
+            .with_file_group(file_group)
+            .with_preserve_order(self.preserve_order)
+            .build()
+        }
+
+        async fn run(self) -> Result<String> {
+            // handle to shared observer
+            let observer = self.morselizer.observer().clone();
+            // Clear any prior observer events before running the test, so the
+            // snapshot only includes events from this run.
+            observer.clear();
+
+            let config = self.test_config();
+            let metrics_set = ExecutionPlanMetricsSet::new();
+            let mut stream = FileStreamBuilder::new_with_morselizer(
+                &config,
+                0,
+                Box::new(self.morselizer),
+                &metrics_set,
+            )
+            .build()?;
+
+            let mut stream_contents = Vec::new();
+            while let Some(result) = stream.next().await {
+                match result {
+                    Ok(batch) => {
+                        // Each batch should have a single int32 column with the
+                        // mocked batch id, which keeps snapshot output compact.
+                        let col = batch.column(0).as_primitive::<Int32Type>();
+                        assert_eq!(col.len(), 1);
+                        assert!(col.is_valid(0));
+                        let batch_id = col.value(0);
+                        stream_contents.push(format!("Batch: {batch_id}"));
+                    }
+                    Err(e) => {
+                        stream_contents.push(format!("Error: {e}"));
+                    }
+                }
+            }
+            stream_contents.push("Done".to_string());
+            let output = stream_contents.join("\n");
+
+            // Snapshot both the produced output and the scheduler trace
+            // together. This makes scheduler changes much easier to review than
+            // maintaining long hand-written event assertions separately.
+            let mut parts = vec!["----- Output Stream -----".to_string(), output];
+            parts.push("----- File Stream Events -----".to_string());
+            let events = if self.event_summaries {
+                observer.format_summary_events()
+            } else {
+                observer.format_events()
+            };
+            parts.push(events);
+            Ok(parts.join("\n"))
+        }
+    }
+
+    /// Helper for multi-stream morsel tests that share one
+    /// [`SharedFileStreamState`].
+    #[derive(Clone)]
+    struct MultiStreamMorselTest {
+        /// Shared mock morselizer used by all sibling streams in the test.
+        morselizer: MockMorselizer,
+        /// Per-partition file assignments used to build one sibling
+        /// `FileStream` per partition.
+        partitions: Vec<Vec<String>>,
+        /// The sequence of sibling streams to poll while exercising the
+        /// stealing scenario under test.
+        reads: Vec<TestStreamId>,
+    }
+
+    /// Identifies one sibling stream in a [`MultiStreamMorselTest`].
+    #[derive(Debug, Clone, Copy)]
+    struct TestStreamId(usize);
+
+    impl MultiStreamMorselTest {
+        /// Create a sibling-stream test harness with `num_partitions`
+        /// independent `FileStream`s.
+        fn new(num_partitions: usize) -> Self {
+            Self {
+                morselizer: MockMorselizer::new(),
+                partitions: vec![vec![]; num_partitions],
+                reads: vec![],
+            }
+        }
+
+        /// Add one file and its root planner to a specific sibling stream.
+        ///
+        /// This lets tests control which stream owns the original file-local
+        /// work before stealing redistributes ready morsels or planners.
+        fn with_file_in_partition(
+            mut self,
+            partition: usize,
+            path: impl Into<String>,
+            planner: MockPlanner,
+        ) -> Self {
+            let path = path.into();
+            self.morselizer = self.morselizer.with_file(path.clone(), planner);
+            self.partitions[partition].push(path);
+            self
+        }
+
+        /// Configure the order in which sibling streams are polled while the
+        /// test scenario is executing.
+        fn with_reads(mut self, reads: Vec<TestStreamId>) -> Self {
+            self.reads = reads;
+            self
+        }
+
+        /// Build a multi-partition `FileScanConfig` matching the configured
+        /// sibling test layout.
+        fn test_config(&self) -> FileScanConfig {
+            let table_schema = TableSchema::new(
+                Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)])),
+                vec![],
+            );
+            let mut builder = FileScanConfigBuilder::new(
+                ObjectStoreUrl::parse("test:///").unwrap(),
+                Arc::new(MockSource::new(table_schema)),
+            );
+            for file_group in &self.partitions {
+                let file_group = file_group
+                    .iter()
+                    .map(|name| PartitionedFile::new(name, 10))
+                    .collect::<Vec<_>>();
+                builder = builder.with_file_group(FileGroup::new(file_group));
+            }
+            builder.build()
+        }
+
+        /// Build one `FileStream` per configured partition, all sharing the
+        /// same `SharedFileStreamState`.
+        ///
+        /// This is the core helper for stealing tests: separate streams have
+        /// distinct local queues, but share the same outstanding-I/O budget
+        /// and shared ready-work queues.
+        fn build_streams(&self) -> Result<(MorselObserver, Vec<FileStream>)> {
+            let observer = self.morselizer.observer().clone();
+            observer.clear();
+
+            let config = self.test_config();
+            let shared_state =
+                SharedFileStreamState::new(2, SharedFileStreamMode::Unordered);
+            let metrics_set = ExecutionPlanMetricsSet::new();
+            let streams = (0..self.partitions.len())
+                .map(|partition| {
+                    FileStreamBuilder::new_with_morselizer(
+                        &config,
+                        partition,
+                        Box::new(self.morselizer.clone()),
+                        &metrics_set,
+                    )
+                    .with_shared_state(shared_state.clone())
+                    .build()
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            Ok((observer, streams))
+        }
+
+        /// Run the configured poll sequence and format the per-stream outputs
+        /// plus shared scheduler events into one snapshot string.
+        async fn run(self) -> Result<String> {
+            let reads = self.reads.clone();
+            let (observer, mut streams) = self.build_streams()?;
+            let mut outputs = vec![vec![]; streams.len()];
+
+            for stream_id in reads {
+                let batch_id = next_batch_id(&mut streams[stream_id.0]).await?;
+                assert!(
+                    batch_id.is_some(),
+                    "expected stream {stream_id:?} to produce a batch"
+                );
+                outputs[stream_id.0].push(batch_id.unwrap());
+            }
+
+            for stream in &mut streams {
+                assert_eq!(next_batch_id(stream).await?, None);
+            }
+
+            let mut parts = vec![];
+            for (idx, output) in outputs.iter().enumerate() {
+                parts.push(format!("----- Stream {idx} Output -----"));
+                parts.push(
+                    output
+                        .iter()
+                        .map(|batch_id| format!("Batch: {batch_id}"))
+                        .chain(std::iter::once("Done".to_string()))
+                        .collect::<Vec<_>>()
+                        .join("\n"),
+                );
+            }
+            parts.push("----- File Stream Events -----".to_string());
+            parts.push(observer.format_summary_events());
+            Ok(parts.join("\n"))
+        }
+    }
+
+    /// Read the next single-row batch from a test stream and return its batch
+    /// id.
+    async fn next_batch_id(stream: &mut FileStream) -> Result<Option<i32>> {
+        let batch = stream.next().await.transpose()?;
+        Ok(batch.map(|batch| {
+            let col = batch.column(0).as_primitive::<Int32Type>();
+            assert_eq!(col.len(), 1);
+            assert!(col.is_valid(0));
+            col.value(0)
+        }))
+    }
+
+    /// Verifies the simplest morsel-driven flow: one planner produces one
+    /// morsel immediately, and the morsel is then scanned to completion.
+    #[tokio::test]
+    async fn morsel_framework_single_morsel_no_io() -> Result<()> {
+        let test = MorselTest::new().with_file(
+            "file1.parquet",
+            MockPlanner::builder()
+                .with_id(PlannerId(0))
+                .return_morsel(MorselId(10), 42)
+                .return_none()
+                .build(),
+        );
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        planner_called: PlannerId(0)
+        morsel_stream_started: MorselId(10)
+        morsel_stream_batch_produced: MorselId(10), BatchId(42)
+        morsel_stream_finished: MorselId(10)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies that a planner can block on one I/O phase, resume, and only
+    /// then produce its morsel.
+    #[tokio::test]
+    async fn morsel_framework_single_morsel_io() -> Result<()> {
+        let test = MorselTest::new().with_file(
+            "file1.parquet",
+            MockPlanner::builder()
+                .with_id(PlannerId(0))
+                .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 1))
+                .return_morsel(MorselId(10), 42)
+                .return_none()
+                .build(),
+        );
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        io_future_created: PlannerId(0), IoFutureId(100)
+        io_future_polled: PlannerId(0), IoFutureId(100)
+        io_future_polled: PlannerId(0), IoFutureId(100)
+        io_future_resolved: PlannerId(0), IoFutureId(100)
+        planner_called: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        planner_called: PlannerId(0)
+        morsel_stream_started: MorselId(10)
+        morsel_stream_batch_produced: MorselId(10), BatchId(42)
+        morsel_stream_finished: MorselId(10)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies that a planner can require multiple CPU-only `plan()` calls
+    /// before it discovers any morsels or I/O, matching the staged behavior of
+    /// the Parquet morsel planner.
+    #[tokio::test]
+    async fn morsel_framework_two_cpu_steps_before_morsel() -> Result<()> {
+        let test = MorselTest::new().with_file(
+            "file1.parquet",
+            MockPlanner::builder()
+                .with_id(PlannerId(0))
+                .return_plan(ReturnPlanBuilder::new())
+                .return_plan(ReturnPlanBuilder::new())
+                .return_morsel(MorselId(10), 42)
+                .return_none()
+                .build(),
+        );
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        planner_called: PlannerId(0)
+        morsel_stream_started: MorselId(10)
+        morsel_stream_batch_produced: MorselId(10), BatchId(42)
+        morsel_stream_finished: MorselId(10)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies direct morsels returned from a planner are consumed before
+    /// batches produced by any returned child planners.
+    #[tokio::test]
+    async fn morsel_framework_morsels_before_child_planner() -> Result<()> {
+        let child_planner = MockPlanner::builder()
+            .with_id(PlannerId(1))
+            .return_morsel(MorselId(11), 43)
+            .return_none()
+            .build();
+
+        // planner 0 returns batch 42
+        let parent_planner = MockPlanner::builder()
+            .with_id(PlannerId(0))
+            .return_plan(
+                ReturnPlanBuilder::new()
+                    .with_morsel(MockMorselSpec::single_batch(MorselId(10), 42))
+                    .with_planner(child_planner),
+            )
+            .return_none()
+            .build();
+
+        let test = MorselTest::new()
+            .with_file("file1.parquet", parent_planner)
+            .with_event_summaries();
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Batch: 43
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        planner_produced_child: PlannerId(0) -> PlannerId(1)
+        morsel_produced: PlannerId(1), MorselId(11)
+        morsel_stream_batch_produced: MorselId(10), BatchId(42)
+        morsel_stream_batch_produced: MorselId(11), BatchId(43)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies the non-ordered behavior for child planners: if the first child
+    /// planner blocks on I/O and the second can make progress immediately, the
+    /// second planner's batches are emitted first.
+    #[tokio::test]
+    async fn morsel_framework_child_planner_reorder() -> Result<()> {
+        let planner_1 = MockPlanner::builder()
+            .with_id(PlannerId(1))
+            // Note IO required 2 polls
+            .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 2))
+            .return_morsel(MorselId(11), 41)
+            .return_none()
+            .build();
+        let planner_2 = MockPlanner::builder()
+            .with_id(PlannerId(2))
+            // IO only requires 1 poll, so it will resolve before planner 1's IO
+            .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(101), 1)) // IO returns after 1 poll
+            .return_morsel(MorselId(12), 42)
+            .return_none()
+            .build();
+
+        let parent_planner = MockPlanner::builder()
+            .with_id(PlannerId(0))
+            .return_plan(
+                ReturnPlanBuilder::new()
+                    .with_planner(planner_1)
+                    .with_planner(planner_2),
+            )
+            .return_none()
+            .build();
+
+        let test = MorselTest::new().with_file("file1.parquet", parent_planner);
+
+        // Expect both futures to be polled, but second planner's (42) batch to be
+        // produced first
+        insta::assert_snapshot!(test.clone().run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Batch: 41
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        planner_produced_child: PlannerId(0) -> PlannerId(1)
+        planner_produced_child: PlannerId(0) -> PlannerId(2)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(1)
+        io_future_created: PlannerId(1), IoFutureId(100)
+        planner_called: PlannerId(2)
+        io_future_created: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_resolved: PlannerId(2), IoFutureId(101)
+        planner_called: PlannerId(2)
+        morsel_produced: PlannerId(2), MorselId(12)
+        planner_called: PlannerId(2)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_resolved: PlannerId(1), IoFutureId(100)
+        planner_called: PlannerId(1)
+        morsel_produced: PlannerId(1), MorselId(11)
+        morsel_stream_started: MorselId(12)
+        morsel_stream_batch_produced: MorselId(12), BatchId(42)
+        planner_called: PlannerId(1)
+        morsel_stream_finished: MorselId(12)
+        morsel_stream_started: MorselId(11)
+        morsel_stream_batch_produced: MorselId(11), BatchId(41)
+        morsel_stream_finished: MorselId(11)
+        ");
+
+        // Run same test using `with_preserve_order(true)`, but expect the first
+        // planner's batch (41) to be produced before the second's (42), even
+        // though the second planner's I/O resolves first.
+        let test = test.with_preserve_order(true);
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 41
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_called: PlannerId(0)
+        planner_produced_child: PlannerId(0) -> PlannerId(1)
+        planner_produced_child: PlannerId(0) -> PlannerId(2)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(1)
+        io_future_created: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_polled: PlannerId(1), IoFutureId(100)
+        io_future_resolved: PlannerId(1), IoFutureId(100)
+        planner_called: PlannerId(2)
+        io_future_created: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_polled: PlannerId(2), IoFutureId(101)
+        io_future_resolved: PlannerId(2), IoFutureId(101)
+        planner_called: PlannerId(1)
+        morsel_produced: PlannerId(1), MorselId(11)
+        planner_called: PlannerId(1)
+        morsel_stream_started: MorselId(11)
+        morsel_stream_batch_produced: MorselId(11), BatchId(41)
+        morsel_stream_finished: MorselId(11)
+        planner_called: PlannerId(2)
+        morsel_produced: PlannerId(2), MorselId(12)
+        planner_called: PlannerId(2)
+        morsel_stream_started: MorselId(12)
+        morsel_stream_batch_produced: MorselId(12), BatchId(42)
+        morsel_stream_finished: MorselId(12)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies that child planners still respect the global outstanding-I/O
+    /// cap. Even if a parent planner returns three ready children, only two of
+    /// them should be allowed to create waiting I/O futures at once.
+    #[tokio::test]
+    async fn morsel_framework_child_planner_io_respects_global_cap() -> Result<()> {
+        let planner_1 = MockPlanner::builder()
+            .with_id(PlannerId(1))
+            .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 1))
+            .return_morsel(MorselId(11), 41)
+            .return_none()
+            .build();
+        let planner_2 = MockPlanner::builder()
+            .with_id(PlannerId(2))
+            .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(101), 3))
+            .return_morsel(MorselId(12), 42)
+            .return_none()
+            .build();
+        let planner_3 = MockPlanner::builder()
+            .with_id(PlannerId(3))
+            .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(102), 1))
+            .return_morsel(MorselId(13), 43)
+            .return_none()
+            .build();
+
+        let parent_planner = MockPlanner::builder()
+            .with_id(PlannerId(0))
+            .return_plan(
+                ReturnPlanBuilder::new()
+                    .with_planner(planner_1)
+                    .with_planner(planner_2)
+                    .with_planner(planner_3),
+            )
+            .return_none()
+            .build();
+
+        let test = MorselTest::new()
+            .with_file("file1.parquet", parent_planner)
+            .with_event_summaries();
+
+        // Note that the future for planner 1 must resolve before planner 2 begins
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 41
+        Batch: 42
+        Batch: 43
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        planner_produced_child: PlannerId(0) -> PlannerId(1)
+        planner_produced_child: PlannerId(0) -> PlannerId(2)
+        planner_produced_child: PlannerId(0) -> PlannerId(3)
+        io_future_created: PlannerId(1), IoFutureId(100)
+        io_future_created: PlannerId(2), IoFutureId(101)
+        io_future_resolved: PlannerId(1), IoFutureId(100)
+        io_future_created: PlannerId(3), IoFutureId(102)
+        io_future_resolved: PlannerId(2), IoFutureId(101)
+        io_future_resolved: PlannerId(3), IoFutureId(102)
+        morsel_produced: PlannerId(1), MorselId(11)
+        morsel_produced: PlannerId(2), MorselId(12)
+        morsel_stream_batch_produced: MorselId(11), BatchId(41)
+        morsel_produced: PlannerId(3), MorselId(13)
+        morsel_stream_batch_produced: MorselId(12), BatchId(42)
+        morsel_stream_batch_produced: MorselId(13), BatchId(43)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies that `FileStream` overlaps planner I/O across multiple files
+    /// rather than waiting for the first file to finish before starting the
+    /// second.
+    #[tokio::test]
+    async fn morsel_framework_two_files_overlapping_io() -> Result<()> {
+        let test = MorselTest::new()
+            .with_file(
+                "file1.parquet",
+                MockPlanner::builder()
+                    .with_id(PlannerId(0))
+                    .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 1))
+                    .return_morsel(MorselId(10), 42)
+                    .return_none()
+                    .build(),
+            )
+            .with_file(
+                "file2.parquet",
+                MockPlanner::builder()
+                    .with_id(PlannerId(1))
+                    .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(101), 1))
+                    .return_morsel(MorselId(11), 43)
+                    .return_none()
+                    .build(),
+            );
+
+        insta::assert_snapshot!(test.run().await.unwrap(), @r"
+        ----- Output Stream -----
+        Batch: 42
+        Batch: 43
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        morselize_file: file2.parquet
+        planner_created: PlannerId(1)
+        planner_called: PlannerId(0)
+        io_future_created: PlannerId(0), IoFutureId(100)
+        planner_called: PlannerId(1)
+        io_future_created: PlannerId(1), IoFutureId(101)
+        io_future_polled: PlannerId(0), IoFutureId(100)
+        io_future_polled: PlannerId(1), IoFutureId(101)
+        io_future_polled: PlannerId(0), IoFutureId(100)
+        io_future_resolved: PlannerId(0), IoFutureId(100)
+        io_future_polled: PlannerId(1), IoFutureId(101)
+        io_future_resolved: PlannerId(1), IoFutureId(101)
+        planner_called: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        planner_called: PlannerId(0)
+        planner_called: PlannerId(1)
+        morsel_produced: PlannerId(1), MorselId(11)
+        morsel_stream_started: MorselId(10)
+        morsel_stream_batch_produced: MorselId(10), BatchId(42)
+        planner_called: PlannerId(1)
+        morsel_stream_finished: MorselId(10)
+        morsel_stream_started: MorselId(11)
+        morsel_stream_batch_produced: MorselId(11), BatchId(43)
+        morsel_stream_finished: MorselId(11)
+        ");
+
+        Ok(())
+    }
+
+    /// Verifies that an idle sibling stream can steal ready morsels even when
+    /// it has no local files of its own.
+    #[tokio::test]
+    async fn morsel_framework_sibling_stream_steals_when_only_one_has_files() -> Result<()>
+    {
+        let test = MultiStreamMorselTest::new(2)
+            .with_file_in_partition(
+                0,
+                "file1.parquet",
+                MockPlanner::builder()
+                    .with_id(PlannerId(0))
+                    .return_plan(
+                        ReturnPlanBuilder::new()
+                            .return_morsel(MorselId(10), 41)
+                            .return_morsel(MorselId(11), 42),
+                    )
+                    .return_none()
+                    .build(),
+            )
+            // Poll sibling 0 first so it discovers the file and publishes
+            // ready morsels. Poll sibling 1 next: because it has no local
+            // files, any batch it returns must have been stolen from sibling 0.
+            .with_reads(vec![TestStreamId(0), TestStreamId(1)]);
+
+        insta::assert_snapshot!(
+            test.run().await.unwrap(),
+            @r"
+        ----- Stream 0 Output -----
+        Batch: 41
+        Done
+        ----- Stream 1 Output -----
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: file1.parquet
+        planner_created: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        morsel_produced: PlannerId(0), MorselId(11)
+        morsel_stream_batch_produced: MorselId(10), BatchId(41)
+        morsel_stream_batch_produced: MorselId(11), BatchId(42)
+        "
+        );
+
+        Ok(())
+    }
+
+    /// Verifies that a sibling stream waiting on its own file's I/O can steal
+    /// ready work from a faster sibling and continue making progress.
+    #[tokio::test]
+    async fn morsel_framework_sibling_stream_steals_while_own_file_waits_on_io()
+    -> Result<()> {
+        let test = MultiStreamMorselTest::new(2)
+            .with_file_in_partition(
+                0,
+                "fast.parquet",
+                MockPlanner::builder()
+                    .with_id(PlannerId(0))
+                    .return_plan(
+                        ReturnPlanBuilder::new()
+                            .return_morsel(MorselId(10), 41)
+                            .return_morsel(MorselId(11), 42),
+                    )
+                    .return_none()
+                    .build(),
+            )
+            .with_file_in_partition(
+                1,
+                "slow.parquet",
+                MockPlanner::builder()
+                    .with_id(PlannerId(1))
+                    .return_plan(ReturnPlanBuilder::new().with_io(IoFutureId(100), 2))
+                    .return_morsel(MorselId(12), 51)
+                    .return_none()
+                    .build(),
+            )
+            // Poll sibling 0 first so it publishes one ready morsel from the
+            // fast file. Poll sibling 1 next while its own file is still
+            // blocked on I/O: the batch it returns at that point must have
+            // been stolen from sibling 0. Poll sibling 0 again last so it can
+            // finish once sibling 1's local I/O has resolved.
+            .with_reads(vec![TestStreamId(0), TestStreamId(1), TestStreamId(0)]);
+
+        insta::assert_snapshot!(
+            test.run().await.unwrap(),
+            @r"
+        ----- Stream 0 Output -----
+        Batch: 41
+        Batch: 51
+        Done
+        ----- Stream 1 Output -----
+        Batch: 42
+        Done
+        ----- File Stream Events -----
+        morselize_file: fast.parquet
+        planner_created: PlannerId(0)
+        morsel_produced: PlannerId(0), MorselId(10)
+        morsel_produced: PlannerId(0), MorselId(11)
+        morsel_stream_batch_produced: MorselId(10), BatchId(41)
+        morselize_file: slow.parquet
+        planner_created: PlannerId(1)
+        morsel_stream_batch_produced: MorselId(11), BatchId(42)
+        io_future_created: PlannerId(1), IoFutureId(100)
+        io_future_resolved: PlannerId(1), IoFutureId(100)
+        morsel_produced: PlannerId(1), MorselId(12)
+        morsel_stream_batch_produced: MorselId(12), BatchId(51)
+        "
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn on_error_opening() -> Result<()> {
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![0])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![0, 1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "++",
+            "++",
+        ], &batches);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn on_error_scanning_fail() -> Result<()> {
+        let result = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Fail)
+            .with_scan_errors(vec![1])
+            .result()
+            .await;
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn on_error_opening_fail() -> Result<()> {
+        let result = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Fail)
+            .with_open_errors(vec![1])
+            .result()
+            .await;
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn on_error_scanning() -> Result<()> {
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_scan_errors(vec![0])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_scan_errors(vec![1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(2)
+            .with_on_error(OnError::Skip)
+            .with_scan_errors(vec![0, 1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "++",
+            "++",
+        ], &batches);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn on_error_mixed() -> Result<()> {
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(3)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![1])
+            .with_scan_errors(vec![0])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(3)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![0])
+            .with_scan_errors(vec![1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(3)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![2])
+            .with_scan_errors(vec![0, 1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "++",
+            "++",
+        ], &batches);
+
+        let batches = FileStreamTest::new()
+            .with_records(vec![make_partition(3), make_partition(2)])
+            .with_num_files(3)
+            .with_on_error(OnError::Skip)
+            .with_open_errors(vec![0, 2])
+            .with_scan_errors(vec![1])
+            .result()
+            .await?;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "++",
+            "++",
+        ], &batches);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn without_limit() -> Result<()> {
+        let batches = create_and_collect(None).await;
+
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn with_limit_between_files() -> Result<()> {
+        let batches = create_and_collect(Some(5)).await;
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "+---+",
+        ], &batches);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn with_limit_at_middle_of_batch() -> Result<()> {
+        let batches = create_and_collect(Some(6)).await;
+        #[rustfmt::skip]
+        assert_batches_eq!(&[
+            "+---+",
+            "| i |",
+            "+---+",
+            "| 0 |",
+            "| 1 |",
+            "| 2 |",
+            "| 0 |",
+            "| 1 |",
+            "| 0 |",
+            "+---+",
+        ], &batches);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/datasource/src/file_stream/shared_state.rs b/datafusion/datasource/src/file_stream/shared_state.rs
new file mode 100644
index 0000000000000..dede280cb06b8
--- /dev/null
+++ b/datafusion/datasource/src/file_stream/shared_state.rs
@@ -0,0 +1,597 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Shared state for sibling [`crate::file_stream::FileStream`] instances.
+//!
+//! A single `DataSourceExec` may create multiple sibling `FileStream`s, one per
+//! output partition. These streams need a place to coordinate shared scan
+//! resources such as:
+//!
+//! - the total number of outstanding planner I/O operations
+//! - ready morsels and planners that may be stolen by idle siblings
+//!
+//! [`SharedFileStreamState`] is that shared home.
+//!
+//! # Outstanding I/O Scheduling Modes
+//!
+//! The shared state currently controls planner I/O in one of two modes:
+//!
+//! - [`SharedFileStreamMode::Unordered`]: Every registered stream has a chance
+//!   to hold one outstanding request before another stream is allowed to start
+//!   a second. After that first request per stream, remaining capacity may be
+//!   used in any order.
+//!
+//! - [`SharedFileStreamMode::PreserveOrder`]: Outstanding I/Os are split fairly
+//!   across all active streams, even if some streams do not currently want to
+//!   issue I/O. This prevents a subset of streams from consuming the full budget
+//!   and is intended for scans that require stable cross-stream ordering.
+//!
+//! # Ready Work Stealing
+//!
+//! In unordered mode, sibling streams may also exchange CPU-ready work through
+//! this shared state:
+//!
+//! - ready morsels may be published into a shared morsel queue
+//! - ready planners may be published into a shared planner queue
+//! - idle siblings will try to steal a ready morsel first, then a ready
+//!   planner
+//!
+//! In preserve-order mode, streams keep their ready morsels and planners on
+//! their local per-stream queues so later siblings cannot overtake earlier
+//! output.
+//!
+//! Streams can call [`SharedFileStreamState::unregister_stream`] once they know
+//! they will never need another I/O permit. Unregistered streams are removed
+//! from future fairness calculations so their share of the budget can be
+//! redistributed.
+
+use crate::morsel::{Morsel, MorselPlanner};
+use crossbeam_queue::SegQueue;
+use std::collections::BTreeMap;
+use std::fmt;
+use std::sync::{Arc, RwLock};
+use std::task::Waker;
+
+/// Shared scheduling mode for sibling `FileStream`s.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SharedFileStreamMode {
+    /// Allow streams to run in any order after each active, I/O-hungry stream
+    /// has had a chance to start one outstanding I/O.
+    Unordered,
+    /// Split outstanding I/O budget fairly across all active streams.
+    PreserveOrder,
+}
+
+/// Stable identifier for one sibling `FileStream` registered with a shared
+/// state object.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct FileStreamId(usize);
+
+/// Shared state for all sibling `FileStream`s that belong to one `DataSourceExec`.
+///
+/// # Intended Usage
+///
+/// Create one `SharedFileStreamState` for the whole `DataSourceExec` and clone
+/// it into each sibling `FileStream`.
+///
+/// # IO Behavior
+///
+/// Each stream should register itself with [`Self::register_stream`] and then:
+///
+/// 1. call [`Self::try_acquire_io_permit`] before moving a planner into a
+///    waiting-I/O state
+/// 2. keep the returned [`OutstandingIoPermit`] alive for as long as that I/O
+///    operation is outstanding
+/// 3. call [`Self::unregister_stream`] once the stream knows it will never need
+///    another I/O permit
+///
+/// # Ready Work Behavior
+///
+/// In unordered mode, streams may also publish ready morsels and planners into
+/// the shared queues via [`Self::push_ready_morsel`] and
+/// [`Self::push_ready_planner`]. Idle siblings can then steal that ready work
+/// with [`Self::pop_ready_morsel`] and [`Self::pop_ready_planner`].
+///
+/// In preserve-order mode, streams should keep ready morsels and planners on
+/// their local per-stream queues rather than publishing them into the shared
+/// queues.
+///
+/// If no permit is available, the caller should typically:
+///
+/// 1. keep the planner in a CPU-ready state locally
+/// 2. register the current task's waker with [`Self::register_waker`]
+/// 3. return `Poll::Pending`
+///
+/// The shared state will wake waiting tasks whenever shared capacity may have
+/// become available again, or when new shared ready work is published.
+#[derive(Clone)]
+pub struct SharedFileStreamState {
+    /// Shared outstanding-I/O accounting and fairness state.
+    io_state: Arc<RwLock<IoState>>,
+    /// Shared ready morsels used for unordered morsel stealing.
+    ready_morsels: Arc<SegQueue<Box<dyn Morsel>>>,
+    /// Shared ready planners used for unordered morsel stealing.
+    ready_planners: Arc<SegQueue<Box<dyn MorselPlanner>>>,
+    /// Tasks waiting to be woken when shared capacity or ready work appears.
+    waiters: Arc<SegQueue<Waker>>,
+}
+
+impl fmt::Debug for SharedFileStreamState {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("SharedFileStreamState")
+            .field("io_state", &self.io_state)
+            .finish_non_exhaustive()
+    }
+}
+
+/// Shared outstanding-I/O accounting and fairness state for sibling streams.
+#[derive(Debug)]
+struct IoState {
+    /// Shared scheduling policy for sibling streams.
+    mode: SharedFileStreamMode,
+    /// Total number of planner I/O phases currently in flight.
+    outstanding_ios: usize,
+    /// Global cap on outstanding planner I/O phases across sibling streams.
+    max_outstanding_ios: usize,
+    /// Monotonic counter used to assign stable `FileStreamId`s.
+    next_stream_id: usize,
+    /// Per-stream scheduling state for each registered sibling stream.
+    streams: BTreeMap<FileStreamId, StreamIOState>,
+}
+
+#[derive(Debug, Default)]
+struct StreamIOState {
+    outstanding_ios: usize,
+    waker: Option<Waker>,
+}
+
+impl SharedFileStreamState {
+    /// Create a new shared state object with the provided global I/O limit and
+    /// scheduling mode.
+    ///
+    /// The limit applies across all sibling `FileStream`s that share this
+    /// state, not per individual stream.
+    pub fn new(max_outstanding_ios: usize, mode: SharedFileStreamMode) -> Self {
+        Self {
+            io_state: Arc::new(RwLock::new(IoState {
+                mode,
+                outstanding_ios: 0,
+                max_outstanding_ios,
+                next_stream_id: 0,
+                streams: BTreeMap::new(),
+            })),
+            ready_morsels: Arc::new(SegQueue::new()),
+            ready_planners: Arc::new(SegQueue::new()),
+            waiters: Arc::new(SegQueue::new()),
+        }
+    }
+
+    /// Register a sibling stream with this shared state and return its stable id.
+    pub fn register_stream(&self) -> FileStreamId {
+        self.io_state
+            .write()
+            .expect("shared file stream state poisoned")
+            .register_stream()
+    }
+
+    /// Returns the configured shared scheduling mode.
+    pub fn mode(&self) -> SharedFileStreamMode {
+        self.io_state
+            .read()
+            .expect("shared file stream state poisoned")
+            .mode
+    }
+
+    /// Returns the number of currently registered sibling streams.
+    pub fn registered_stream_count(&self) -> usize {
+        self.io_state
+            .read()
+            .expect("shared file stream state poisoned")
+            .streams
+            .len()
+    }
+
+    /// Returns the maximum number of outstanding planner I/O operations
+    /// allowed across all sibling streams.
+    pub fn max_outstanding_ios(&self) -> usize {
+        self.io_state
+            .read()
+            .expect("shared file stream state poisoned")
+            .max_outstanding_ios
+    }
+
+    /// Returns the number of currently outstanding planner I/O operations
+    /// across all sibling streams.
+    pub fn outstanding_ios(&self) -> usize {
+        self.io_state
+            .read()
+            .expect("shared file stream state poisoned")
+            .outstanding_ios
+    }
+
+    /// Unregister a stream that will never request another I/O permit.
+    ///
+    /// This removes the stream from future fairness calculations, allowing its
+    /// share of the I/O budget to be redistributed to sibling streams.
+    ///
+    /// The stream must not have any outstanding permits when it unregisters.
+    pub fn unregister_stream(&self, stream_id: FileStreamId) {
+        let waiters = {
+            let mut io_state = self
+                .io_state
+                .write()
+                .expect("shared file stream state poisoned");
+            if let Some(stream) = io_state.streams.remove(&stream_id) {
+                assert_eq!(
+                    stream.outstanding_ios, 0,
+                    "stream must not unregister while it still holds I/O permits"
+                );
+            }
+            Self::take_waiters(&mut io_state, &self.waiters)
+        };
+
+        Self::wake_waiters(waiters);
+    }
+
+    /// Register a task waker to be notified when shared capacity may have
+    /// become available.
+    ///
+    /// Callers should typically register a waker after failing to acquire an
+    /// I/O permit and before returning `Poll::Pending`.
+    pub fn register_waker(&self, stream_id: FileStreamId, waker: &Waker) {
+        let mut io_state = self
+            .io_state
+            .write()
+            .expect("shared file stream state poisoned");
+
+        self.waiters.push(waker.clone());
+        if let Some(stream) = io_state.streams.get_mut(&stream_id) {
+            stream.waker = Some(waker.clone());
+        }
+    }
+
+    /// Try to reserve one shared outstanding-I/O slot for `stream_id`.
+    ///
+    /// Returns `Some(permit)` if the stream is currently eligible to issue a
+    /// new I/O under the configured fairness mode, or `None` otherwise.
+    pub fn try_acquire_io_permit(
+        &self,
+        stream_id: FileStreamId,
+    ) -> Option<OutstandingIoPermit> {
+        let mut io_state = self
+            .io_state
+            .write()
+            .expect("shared file stream state poisoned");
+
+        if !io_state.can_issue_io(stream_id) {
+            return None;
+        }
+
+        io_state.acquire_io(stream_id);
+        drop(io_state);
+
+        Some(OutstandingIoPermit {
+            state: Some(self.clone()),
+            stream_id,
+        })
+    }
+
+    /// Publish one ready morsel into the shared queue.
+    pub fn push_ready_morsel(&self, morsel: Box<dyn Morsel>) {
+        let waiters = {
+            self.ready_morsels.push(morsel);
+            let mut io_state = self
+                .io_state
+                .write()
+                .expect("shared file stream state poisoned");
+            Self::take_waiters(&mut io_state, &self.waiters)
+        };
+        Self::wake_waiters(waiters);
+    }
+
+    /// Publish one ready planner into the shared queue.
+    pub fn push_ready_planner(&self, planner: Box<dyn MorselPlanner>) {
+        let waiters = {
+            self.ready_planners.push(planner);
+            let mut io_state = self
+                .io_state
+                .write()
+                .expect("shared file stream state poisoned");
+            Self::take_waiters(&mut io_state, &self.waiters)
+        };
+        Self::wake_waiters(waiters);
+    }
+
+    /// Try to steal one ready morsel from the shared queue.
+    pub fn pop_ready_morsel(&self) -> Option<Box<dyn Morsel>> {
+        self.ready_morsels.pop()
+    }
+
+    /// Try to steal one ready planner from the shared queue.
+    pub fn pop_ready_planner(&self) -> Option<Box<dyn MorselPlanner>> {
+        self.ready_planners.pop()
+    }
+
+    fn release_io_permit(&self, stream_id: FileStreamId) {
+        let waiters = {
+            let mut io_state = self
+                .io_state
+                .write()
+                .expect("shared file stream state poisoned");
+            io_state.release_io(stream_id);
+            Self::take_waiters(&mut io_state, &self.waiters)
+        };
+
+        Self::wake_waiters(waiters);
+    }
+
+    fn take_waiters(
+        io_state: &mut IoState,
+        shared_waiters: &SegQueue<Waker>,
+    ) -> Vec<Waker> {
+        let mut waiters = Vec::new();
+        while let Some(waiter) = shared_waiters.pop() {
+            waiters.push(waiter);
+        }
+        for stream in io_state.streams.values_mut() {
+            if let Some(waker) = stream.waker.take() {
+                waiters.push(waker);
+            }
+        }
+        waiters
+    }
+
+    fn wake_waiters(waiters: Vec<Waker>) {
+        for waiter in waiters {
+            waiter.wake();
+        }
+    }
+}
+
+impl IoState {
+    fn register_stream(&mut self) -> FileStreamId {
+        let id = FileStreamId(self.next_stream_id);
+        self.next_stream_id += 1;
+        self.streams.insert(id, StreamIOState::default());
+        id
+    }
+
+    fn can_issue_io(&self, stream_id: FileStreamId) -> bool {
+        if self.outstanding_ios >= self.max_outstanding_ios {
+            return false;
+        }
+
+        if !self.streams.contains_key(&stream_id) {
+            return false;
+        }
+
+        match self.mode {
+            SharedFileStreamMode::Unordered => self.can_issue_unordered(stream_id),
+            SharedFileStreamMode::PreserveOrder => {
+                self.can_issue_preserve_order(stream_id)
+            }
+        }
+    }
+
+    fn acquire_io(&mut self, stream_id: FileStreamId) {
+        self.outstanding_ios += 1;
+        self.streams
+            .get_mut(&stream_id)
+            .expect("unregistered file stream")
+            .outstanding_ios += 1;
+    }
+
+    fn release_io(&mut self, stream_id: FileStreamId) {
+        self.outstanding_ios = self
+            .outstanding_ios
+            .checked_sub(1)
+            .expect("outstanding I/O count underflow");
+        self.streams
+            .get_mut(&stream_id)
+            .expect("unregistered file stream")
+            .outstanding_ios = self
+            .streams
+            .get(&stream_id)
+            .expect("unregistered file stream")
+            .outstanding_ios
+            .checked_sub(1)
+            .expect("per-stream outstanding I/O count underflow");
+    }
+
+    fn can_issue_unordered(&self, stream_id: FileStreamId) -> bool {
+        let stream = self
+            .streams
+            .get(&stream_id)
+            .expect("unregistered file stream");
+
+        if stream.outstanding_ios == 0 {
+            return true;
+        }
+
+        // Once a stream already has one outstanding I/O, it may only start a
+        // second if every other registered sibling stream has also reached at
+        // least one outstanding request.
+        self.streams.values().all(|state| state.outstanding_ios > 0)
+    }
+
+    fn can_issue_preserve_order(&self, stream_id: FileStreamId) -> bool {
+        let share = self.fair_share_for(stream_id);
+        let stream = self
+            .streams
+            .get(&stream_id)
+            .expect("unregistered file stream");
+        stream.outstanding_ios < share
+    }
+
+    fn fair_share_for(&self, stream_id: FileStreamId) -> usize {
+        let active_streams = self.streams.keys().collect::<Vec<_>>();
+
+        if active_streams.is_empty() {
+            return 0;
+        }
+
+        let active_count = active_streams.len();
+        let base_share = self.max_outstanding_ios / active_count;
+        let remainder = self.max_outstanding_ios % active_count;
+
+        let position = active_streams
+            .iter()
+            .position(|id| **id == stream_id)
+            .expect("stream should be active");
+
+        base_share + usize::from(position < remainder)
+    }
+}
+
+/// RAII guard representing one shared outstanding-I/O slot.
+///
+/// Hold this permit for exactly as long as the corresponding planner I/O
+/// future remains outstanding. Dropping the permit releases the slot back to
+/// the shared state.
+#[derive(Debug)]
+pub struct OutstandingIoPermit {
+    state: Option<SharedFileStreamState>,
+    stream_id: FileStreamId,
+}
+
+impl Drop for OutstandingIoPermit {
+    fn drop(&mut self) {
+        if let Some(state) = self.state.take() {
+            state.release_io_permit(self.stream_id);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::task::{Wake, Waker};
+
+    #[test]
+    /// In unordered mode, each stream that currently wants I/O must get a
+    /// chance to issue its first request before another stream is allowed to
+    /// issue a second. Once every interested stream has one outstanding I/O,
+    /// any remaining permits may be consumed in any order.
+    fn unordered_every_stream_gets_one_before_extras() {
+        let state = SharedFileStreamState::new(3, SharedFileStreamMode::Unordered);
+        let stream1 = state.register_stream();
+        let stream2 = state.register_stream();
+
+        let permit1 = state.try_acquire_io_permit(stream1).unwrap();
+        assert!(state.try_acquire_io_permit(stream1).is_none());
+
+        let permit2 = state.try_acquire_io_permit(stream2).unwrap();
+        let permit3 = state.try_acquire_io_permit(stream1).unwrap();
+        assert_eq!(state.outstanding_ios(), 3);
+        assert!(state.try_acquire_io_permit(stream2).is_none());
+
+        drop(permit2);
+        drop(permit3);
+        drop(permit1);
+    }
+
+    #[test]
+    /// In unordered mode, unregistering a stream removes it from the
+    /// "everyone gets one first" rule, so its reserved share may be
+    /// immediately reused by remaining active streams.
+    fn unordered_closed_stream_releases_capacity_to_others() {
+        let state = SharedFileStreamState::new(2, SharedFileStreamMode::Unordered);
+        let stream1 = state.register_stream();
+        let stream2 = state.register_stream();
+
+        let _permit1 = state.try_acquire_io_permit(stream1).unwrap();
+        state.unregister_stream(stream2);
+
+        assert!(state.try_acquire_io_permit(stream1).is_some());
+    }
+
+    #[test]
+    /// In preserve-order mode, the outstanding-I/O budget is split fairly
+    /// across all active streams. With three streams and three permits total,
+    /// each stream is capped at one concurrent I/O.
+    fn preserve_order_splits_evenly() {
+        let state = SharedFileStreamState::new(3, SharedFileStreamMode::PreserveOrder);
+        let stream1 = state.register_stream();
+        let stream2 = state.register_stream();
+        let stream3 = state.register_stream();
+
+        let _permit1 = state.try_acquire_io_permit(stream1).unwrap();
+        let _permit2 = state.try_acquire_io_permit(stream2).unwrap();
+        let _permit3 = state.try_acquire_io_permit(stream3).unwrap();
+
+        assert!(state.try_acquire_io_permit(stream1).is_none());
+        assert!(state.try_acquire_io_permit(stream2).is_none());
+        assert!(state.try_acquire_io_permit(stream3).is_none());
+    }
+
+    #[test]
+    /// In preserve-order mode, once a stream has released its final permit and
+    /// unregisters itself, its fair share is redistributed to the remaining
+    /// active streams.
+    fn preserve_order_redistributes_closed_stream_share() {
+        let state = SharedFileStreamState::new(3, SharedFileStreamMode::PreserveOrder);
+        let stream1 = state.register_stream();
+        let stream2 = state.register_stream();
+        let stream3 = state.register_stream();
+
+        let _permit1 = state.try_acquire_io_permit(stream1).unwrap();
+        let _permit2 = state.try_acquire_io_permit(stream2).unwrap();
+        let permit3 = state.try_acquire_io_permit(stream3).unwrap();
+
+        drop(permit3);
+        state.unregister_stream(stream3);
+
+        // Shares rebalance from [1,1,1] to [2,1] in registration order.
+        assert!(state.try_acquire_io_permit(stream1).is_some());
+        assert!(state.try_acquire_io_permit(stream2).is_none());
+    }
+
+    #[test]
+    /// Releasing an outstanding permit should wake blocked sibling streams so
+    /// they can retry permit acquisition on a future poll.
+    fn releasing_permit_wakes_waiters() {
+        let state = SharedFileStreamState::new(1, SharedFileStreamMode::Unordered);
+        let stream = state.register_stream();
+        let permit = state.try_acquire_io_permit(stream).unwrap();
+
+        let wake_counter = Arc::new(WakeCounter::default());
+        let waker = Waker::from(Arc::clone(&wake_counter));
+        state.register_waker(stream, &waker);
+
+        drop(permit);
+
+        assert_eq!(wake_counter.wake_count.load(Ordering::SeqCst), 2);
+        assert_eq!(state.outstanding_ios(), 0);
+    }
+
+    #[derive(Default)]
+    struct WakeCounter {
+        wake_count: AtomicUsize,
+    }
+
+    impl Wake for WakeCounter {
+        fn wake(self: Arc<Self>) {
+            self.wake_count.fetch_add(1, Ordering::SeqCst);
+        }
+
+        fn wake_by_ref(self: &Arc<Self>) {
+            self.wake_count.fetch_add(1, Ordering::SeqCst);
+        }
+    }
+}
diff --git a/datafusion/datasource/src/file_stream/trace.rs b/datafusion/datasource/src/file_stream/trace.rs
new file mode 100644
index 0000000000000..fcac226adc3da
--- /dev/null
+++ b/datafusion/datasource/src/file_stream/trace.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Runtime tracing for [`crate::file_stream::FileStream`].
+//!
+//! This module provides a lightweight, opt-in stderr trace for observing
+//! `FileStream` behavior during a real query. The trace is intended for
+//! performance debugging and scheduler analysis rather than correctness tests.
+//!
+//! # Enabling
+//!
+//! Set:
+//!
+//! ```text
+//! DATAFUSION_FILE_STREAM_TRACE=true
+//! ```
+//!
+//! before running the CLI or any DataFusion-based application.
+//!
+//! Example:
+//!
+//! ```text
+//! DATAFUSION_FILE_STREAM_TRACE=true ../datafusion-cli-morsels -f q23.sql
+//! ```
+//!
+//! # Output
+//!
+//! The trace writes one structured line per event to stderr. Events include:
+//!
+//! - file admission
+//! - planner CPU steps
+//! - planner I/O scheduling and completion
+//! - ready work publication and stealing
+//! - morsel start
+//! - batch emission
+//! - waiting / idle points
+//!
+//! Timestamps are relative to the first traced `FileStream` event in the
+//! process so traces from sibling partitions can be compared directly.
+
+use crate::PartitionedFile;
+use crate::file_stream::FileStreamId;
+use datafusion_common::instant::Instant;
+use std::sync::OnceLock;
+
+static FILE_STREAM_TRACE_START: OnceLock<Instant> = OnceLock::new();
+
+/// Return true if file stream tracing is enabled for this process.
+pub(super) fn file_stream_trace_enabled() -> bool {
+    std::env::var("DATAFUSION_FILE_STREAM_TRACE")
+        .ok()
+        .map(|value| {
+            let value = value.trim();
+            value == "1"
+                || value.eq_ignore_ascii_case("true")
+                || value.eq_ignore_ascii_case("yes")
+                || value.eq_ignore_ascii_case("on")
+        })
+        .unwrap_or(false)
+}
+
+/// Lightweight stderr trace for observing `FileStream` runtime behavior.
+///
+/// The trace is intentionally simple: when disabled it returns immediately,
+/// and when enabled it emits timestamped scheduler events to stderr so runtime
+/// behavior can be inspected after a real query finishes.
+#[derive(Debug, Clone)]
+pub(super) struct ReadTrace {
+    enabled: bool,
+    partition: usize,
+    stream_id: String,
+}
+
+impl ReadTrace {
+    /// Create a new trace for one `FileStream`.
+    pub(super) fn new(enabled: bool, partition: usize, stream_id: FileStreamId) -> Self {
+        Self {
+            enabled,
+            partition,
+            stream_id: format!("{stream_id:?}"),
+        }
+    }
+
+    /// Emit one structured scheduler event to stderr.
+    fn emit(&self, event: &str, details: impl AsRef<str>) {
+        if !self.enabled {
+            return;
+        }
+
+        let start = FILE_STREAM_TRACE_START.get_or_init(Instant::now);
+        let elapsed = start.elapsed();
+        let details = details.as_ref();
+        if details.is_empty() {
+            eprintln!(
+                "+{:>8.3}s partition={} stream={} event={}",
+                elapsed.as_secs_f64(),
+                self.partition,
+                self.stream_id,
+                event
+            );
+        } else {
+            eprintln!(
+                "+{:>8.3}s partition={} stream={} event={} {}",
+                elapsed.as_secs_f64(),
+                self.partition,
+                self.stream_id,
+                event,
+                details
+            );
+        }
+    }
+
+    /// Emit an event for one admitted file.
+    pub(super) fn file_opened(&self, file: &PartitionedFile) {
+        self.emit("file_opened", format!("file={}", file.object_meta.location));
+    }
+
+    /// Emit an event for ready work produced by planning.
+    pub(super) fn plan_result(&self, morsels: usize, planners: usize, has_io: bool) {
+        self.emit(
+            "planner_step",
+            format!("morsels={morsels} planners={planners} io={has_io}"),
+        );
+    }
+
+    /// Emit an event when a new planner I/O future is scheduled.
+    pub(super) fn io_scheduled(&self, waiting_planners: usize) {
+        self.emit(
+            "io_scheduled",
+            format!("waiting_planners={waiting_planners}"),
+        );
+    }
+
+    /// Emit an event when a planner I/O future completes.
+    pub(super) fn io_completed(&self, ready_planners: usize) {
+        self.emit("io_completed", format!("ready_planners={ready_planners}"));
+    }
+
+    /// Emit an event when ready morsels are shared or queued locally.
+    pub(super) fn morsels_ready(&self, morsels: usize, shared: bool) {
+        self.emit("morsels_ready", format!("count={morsels} shared={shared}"));
+    }
+
+    /// Emit an event when ready planners are shared or queued locally.
+    pub(super) fn planners_ready(&self, planners: usize, shared: bool) {
+        self.emit(
+            "planners_ready",
+            format!("count={planners} shared={shared}"),
+        );
+    }
+
+    /// Emit an event when shared work is stolen by this stream.
+    pub(super) fn stole_work(&self, kind: &str) {
+        self.emit("stole_work", format!("kind={kind}"));
+    }
+
+    /// Emit an event when a morsel becomes the active reader.
+    pub(super) fn morsel_started(&self, buffered_morsels: usize) {
+        self.emit(
+            "morsel_started",
+            format!("buffered_morsels={buffered_morsels}"),
+        );
+    }
+
+    /// Emit an event when a batch is produced to the consumer.
+    pub(super) fn batch_emitted(&self, rows: usize) {
+        self.emit("batch_emitted", format!("rows={rows}"));
+    }
+
+    /// Emit an event when the stream is blocked waiting for more work.
+    pub(super) fn waiting(&self, reason: &str) {
+        self.emit("waiting", reason);
+    }
+}
diff --git a/datafusion/datasource/src/macros.rs b/datafusion/datasource/src/macros.rs
deleted file mode 100644
index c7a4058f2310e..0000000000000
--- a/datafusion/datasource/src/macros.rs
+++ /dev/null
@@ -1,145 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Macros for the datafusion-datasource crate
-
-/// Helper macro to generate schema adapter methods for FileSource implementations
-///
-/// Place this inside *any* `impl FileSource for YourType { … }` to
-/// avoid copy-pasting `with_schema_adapter_factory` and
-/// `schema_adapter_factory`.
-///
-/// # Availability
-///
-/// This macro is exported at the crate root level via `#[macro_export]`, so it can be
-/// imported directly from the crate:
-///
-/// ```rust,no_run
-/// use datafusion_datasource::impl_schema_adapter_methods;
-/// ```
-///
-/// # Note on path resolution
-/// When this macro is used:
-/// - `$crate` expands to `datafusion_datasource` (the crate root)
-/// - `$crate::file::FileSource` refers to the FileSource trait from this crate
-/// - `$crate::schema_adapter::SchemaAdapterFactory` refers to the SchemaAdapterFactory trait
-///
-/// # Example Usage
-///
-/// ```rust,no_run
-/// use std::sync::Arc;
-/// use std::any::Any;
-/// use std::fmt::{Formatter, Display, self};
-/// use arrow::datatypes::SchemaRef;
-/// use datafusion_common::{Result, Statistics};
-/// use object_store::ObjectStore;
-/// use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
-/// use datafusion_physical_plan::DisplayFormatType;
-/// use datafusion_physical_expr_common::sort_expr::LexOrdering;
-/// use datafusion_datasource::file::FileSource;
-/// use datafusion_datasource::file_stream::FileOpener;
-/// use datafusion_datasource::file_scan_config::FileScanConfig;
-/// use datafusion_datasource::impl_schema_adapter_methods;
-/// use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
-///
-/// #[derive(Clone)]
-/// struct MyFileSource {
-///     schema: SchemaRef,
-///     batch_size: usize,
-///     statistics: Statistics,
-///     projection: Option<Vec<usize>>,
-///     schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
-///     metrics: ExecutionPlanMetricsSet,
-/// }
-///
-/// impl FileSource for MyFileSource {
-///     fn create_file_opener(
-///         &self,
-///         object_store: Arc<dyn ObjectStore>,
-///         base_config: &FileScanConfig,
-///         partition: usize,
-///     ) -> Arc<dyn FileOpener> {
-///         // Implementation here
-///         unimplemented!()
-///     }
-///     
-///     fn as_any(&self) -> &dyn Any {
-///         self
-///     }
-///     
-///     fn with_batch_size(&self, batch_size: usize) -> Arc<dyn FileSource> {
-///         let mut new_source = self.clone();
-///         new_source.batch_size = batch_size;
-///         Arc::new(new_source)
-///     }
-///     
-///     fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
-///         let mut new_source = self.clone();
-///         new_source.schema = schema;
-///         Arc::new(new_source)
-///     }
-///     
-///     fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
-///         let mut new_source = self.clone();
-///         new_source.projection = config.file_column_projection_indices();
-///         Arc::new(new_source)
-///     }
-///     
-///     fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-///         let mut new_source = self.clone();
-///         new_source.statistics = statistics;
-///         Arc::new(new_source)
-///     }
-///     
-///     fn metrics(&self) -> &ExecutionPlanMetricsSet {
-///         &self.metrics
-///     }
-///     
-///     fn statistics(&self) -> Result<Statistics> {
-///         Ok(self.statistics.clone())
-///     }
-///     
-///     fn file_type(&self) -> &str {
-///         "my_file_type"
-///     }
-///     
-///     // Use the macro to implement schema adapter methods
-///     impl_schema_adapter_methods!();
-/// }
-/// ```
-#[macro_export(local_inner_macros)]
-macro_rules! impl_schema_adapter_methods {
-    () => {
-        fn with_schema_adapter_factory(
-            &self,
-            schema_adapter_factory: std::sync::Arc<
-                dyn $crate::schema_adapter::SchemaAdapterFactory,
-            >,
-        ) -> std::sync::Arc<dyn $crate::file::FileSource> {
-            std::sync::Arc::new(Self {
-                schema_adapter_factory: Some(schema_adapter_factory),
-                ..self.clone()
-            })
-        }
-
-        fn schema_adapter_factory(
-            &self,
-        ) -> Option<std::sync::Arc<dyn $crate::schema_adapter::SchemaAdapterFactory>> {
-            self.schema_adapter_factory.clone()
-        }
-    };
-}
diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs
index 54cea71843eec..aca943ed096b9 100644
--- a/datafusion/datasource/src/memory.rs
+++ b/datafusion/datasource/src/memory.rs
@@ -20,28 +20,36 @@ use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::fmt;
 use std::fmt::Debug;
+use std::ops::Deref;
+use std::slice::from_ref;
 use std::sync::Arc;
 
 use crate::sink::DataSink;
 use crate::source::{DataSource, DataSourceExec};
-use async_trait::async_trait;
-use datafusion_physical_plan::memory::MemoryStream;
-use datafusion_physical_plan::projection::{
-    all_alias_free_columns, new_projections_for_columns, ProjectionExec,
-};
-use datafusion_physical_plan::{
-    common, ColumnarValue, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    PhysicalExpr, SendableRecordBatchStream, Statistics,
-};
 
 use arrow::array::{RecordBatch, RecordBatchOptions};
 use arrow::datatypes::{Schema, SchemaRef};
-use datafusion_common::{internal_err, plan_err, project_schema, Result, ScalarValue};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    Result, ScalarValue, assert_or_internal_err, plan_err, project_schema,
+};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::equivalence::ProjectionMapping;
-use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr::equivalence::project_orderings;
+use datafusion_physical_expr::projection::ProjectionExprs;
 use datafusion_physical_expr::utils::collect_columns;
 use datafusion_physical_expr::{EquivalenceProperties, LexOrdering};
+use datafusion_physical_plan::memory::MemoryStream;
+use datafusion_physical_plan::projection::{
+    all_alias_free_columns, new_projections_for_columns,
+};
+use datafusion_physical_plan::{
+    ColumnarValue, DisplayAs, DisplayFormatType, Partitioning, PhysicalExpr,
+    SendableRecordBatchStream, Statistics, common,
+};
+
+use async_trait::async_trait;
+use datafusion_physical_plan::coop::cooperative;
+use datafusion_physical_plan::execution_plan::SchedulingType;
 use futures::StreamExt;
 use itertools::Itertools;
 use tokio::sync::RwLock;
@@ -74,14 +82,14 @@ impl DataSource for MemorySourceConfig {
         partition: usize,
         _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        Ok(Box::pin(
+        Ok(Box::pin(cooperative(
             MemoryStream::try_new(
                 self.partitions[partition].clone(),
                 Arc::clone(&self.projected_schema),
                 self.projection.clone(),
             )?
             .with_fetch(self.fetch),
-        ))
+        )))
     }
 
     fn as_any(&self) -> &dyn Any {
@@ -113,10 +121,10 @@ impl DataSource for MemorySourceConfig {
                     .map_or(String::new(), |limit| format!(", fetch={limit}"));
                 if self.show_sizes {
                     write!(
-                                f,
-                                "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
-                                partition_sizes.len(),
-                            )
+                        f,
+                        "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
                 } else {
                     write!(
                         f,
@@ -181,16 +189,35 @@ impl DataSource for MemorySourceConfig {
     fn eq_properties(&self) -> EquivalenceProperties {
         EquivalenceProperties::new_with_orderings(
             Arc::clone(&self.projected_schema),
-            self.sort_information.as_slice(),
+            self.sort_information.clone(),
         )
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(common::compute_record_batch_statistics(
-            &self.partitions,
-            &self.schema,
-            self.projection.clone(),
-        ))
+    fn scheduling_type(&self) -> SchedulingType {
+        SchedulingType::Cooperative
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        if let Some(partition) = partition {
+            // Compute statistics for a specific partition
+            if let Some(batches) = self.partitions.get(partition) {
+                Ok(Arc::new(common::compute_record_batch_statistics(
+                    from_ref(batches),
+                    &self.schema,
+                    self.projection.clone(),
+                )))
+            } else {
+                // Invalid partition index
+                Ok(Arc::new(Statistics::new_unknown(&self.projected_schema)))
+            }
+        } else {
+            // Compute statistics across all partitions
+            Ok(Arc::new(common::compute_record_batch_statistics(
+                &self.partitions,
+                &self.schema,
+                self.projection.clone(),
+            )))
+        }
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
@@ -204,27 +231,42 @@ impl DataSource for MemorySourceConfig {
 
     fn try_swapping_with_projection(
         &self,
-        projection: &ProjectionExec,
-    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn DataSource>>> {
         // If there is any non-column or alias-carrier expression, Projection should not be removed.
         // This process can be moved into MemoryExec, but it would be an overlap of their responsibility.
-        all_alias_free_columns(projection.expr())
+        let exprs = projection.iter().cloned().collect_vec();
+        all_alias_free_columns(exprs.as_slice())
             .then(|| {
                 let all_projections = (0..self.schema.fields().len()).collect();
                 let new_projections = new_projections_for_columns(
-                    projection,
+                    &exprs,
                     self.projection().as_ref().unwrap_or(&all_projections),
                 );
 
-                MemorySourceConfig::try_new_exec(
+                MemorySourceConfig::try_new(
                     self.partitions(),
                     self.original_schema(),
                     Some(new_projections),
                 )
-                .map(|e| e as _)
+                .map(|s| Arc::new(s) as Arc<dyn DataSource>)
             })
             .transpose()
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in sort_information
+        let mut tnr = TreeNodeRecursion::Continue;
+        for ordering in &self.sort_information {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
 impl MemorySourceConfig {
@@ -259,6 +301,7 @@ impl MemorySourceConfig {
     }
 
     /// Create a new execution plan from a list of constant values (`ValuesExec`)
+    #[expect(clippy::needless_pass_by_value)]
     pub fn try_new_as_values(
         schema: SchemaRef,
         data: Vec<Vec<Arc<dyn PhysicalExpr>>>,
@@ -316,6 +359,7 @@ impl MemorySourceConfig {
     ///
     /// Errors if any of the batches don't match the provided schema, or if no
     /// batches are provided.
+    #[expect(clippy::needless_pass_by_value)]
     pub fn try_new_from_batches(
         schema: SchemaRef,
         batches: Vec<RecordBatch>,
@@ -415,33 +459,16 @@ impl MemorySourceConfig {
                     .map(|field| field.name() != col.name())
                     .unwrap_or(true)
             });
-        if let Some(col) = ambiguous_column {
-            return internal_err!(
-                "Column {:?} is not found in the original schema of the MemorySourceConfig",
-                col
-            );
-        }
+        assert_or_internal_err!(
+            ambiguous_column.is_none(),
+            "Column {:?} is not found in the original schema of the MemorySourceConfig",
+            ambiguous_column.as_ref().unwrap()
+        );
 
         // If there is a projection on the source, we also need to project orderings
-        if let Some(projection) = &self.projection {
-            let base_eqp = EquivalenceProperties::new_with_orderings(
-                self.original_schema(),
-                &sort_information,
-            );
-            let proj_exprs = projection
-                .iter()
-                .map(|idx| {
-                    let base_schema = self.original_schema();
-                    let name = base_schema.field(*idx).name();
-                    (Arc::new(Column::new(name, *idx)) as _, name.to_string())
-                })
-                .collect::<Vec<_>>();
-            let projection_mapping =
-                ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?;
-            sort_information = base_eqp
-                .project(&projection_mapping, Arc::clone(&self.projected_schema))
-                .into_oeq_class()
-                .into_inner();
+        if self.projection.is_some() {
+            sort_information =
+                project_orderings(&sort_information, &self.projected_schema);
         }
 
         self.sort_information = sort_information;
@@ -463,7 +490,7 @@ impl MemorySourceConfig {
         target_partitions: usize,
         output_ordering: LexOrdering,
     ) -> Result<Option<Vec<Vec<RecordBatch>>>> {
-        if !self.eq_properties().ordering_satisfy(&output_ordering) {
+        if !self.eq_properties().ordering_satisfy(output_ordering)? {
             Ok(None)
         } else {
             let total_num_batches =
@@ -492,7 +519,7 @@ impl MemorySourceConfig {
             // by count of rows.
             let mut max_heap = BinaryHeap::with_capacity(target_partitions);
             for rep in to_repartition {
-                max_heap.push(rep);
+                max_heap.push(CompareByRowCount(rep));
             }
 
             // Split the largest partitions into smaller partitions. Maintaining the output
@@ -508,10 +535,10 @@ impl MemorySourceConfig {
                     };
 
                     // Split the partition. The new partitions will be ordered with idx and idx+1.
-                    let mut new_partitions = to_split.split();
+                    let mut new_partitions = to_split.into_inner().split();
                     if new_partitions.len() > 1 {
                         for new_partition in new_partitions {
-                            max_heap.push(new_partition);
+                            max_heap.push(CompareByRowCount(new_partition));
                         }
                         // Successful repartition. Break inner loop, and return to outer `cnt_to_repartition` loop.
                         break;
@@ -520,7 +547,10 @@ impl MemorySourceConfig {
                     }
                 }
             }
-            let mut partitions = max_heap.drain().collect_vec();
+            let mut partitions = max_heap
+                .drain()
+                .map(CompareByRowCount::into_inner)
+                .collect_vec();
             partitions.extend(cannot_split_further);
 
             // Finally, sort all partitions by the output ordering.
@@ -642,26 +672,6 @@ impl RePartition {
     }
 }
 
-impl PartialOrd for RePartition {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.row_count.cmp(&other.row_count))
-    }
-}
-
-impl Ord for RePartition {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.row_count.cmp(&other.row_count)
-    }
-}
-
-impl PartialEq for RePartition {
-    fn eq(&self, other: &Self) -> bool {
-        self.row_count.eq(&other.row_count)
-    }
-}
-
-impl Eq for RePartition {}
-
 impl fmt::Display for RePartition {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
@@ -674,6 +684,36 @@ impl fmt::Display for RePartition {
     }
 }
 
+struct CompareByRowCount(RePartition);
+impl CompareByRowCount {
+    fn into_inner(self) -> RePartition {
+        self.0
+    }
+}
+impl Ord for CompareByRowCount {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.row_count.cmp(&other.0.row_count)
+    }
+}
+impl PartialOrd for CompareByRowCount {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl PartialEq for CompareByRowCount {
+    fn eq(&self, other: &Self) -> bool {
+        // PartialEq must be consistent with PartialOrd
+        self.cmp(other) == Ordering::Equal
+    }
+}
+impl Eq for CompareByRowCount {}
+impl Deref for CompareByRowCount {
+    type Target = RePartition;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
 /// Type alias for partition data
 pub type PartitionData = Arc<RwLock<Vec<RecordBatch>>>;
 
@@ -765,22 +805,22 @@ mod memory_source_tests {
 
     use crate::memory::MemorySourceConfig;
     use crate::source::DataSourceExec;
-    use datafusion_physical_plan::ExecutionPlan;
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::Result;
     use datafusion_physical_expr::expressions::col;
-    use datafusion_physical_expr::PhysicalSortExpr;
-    use datafusion_physical_expr_common::sort_expr::LexOrdering;
+    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_plan::ExecutionPlan;
 
     #[test]
-    fn test_memory_order_eq() -> datafusion_common::Result<()> {
+    fn test_memory_order_eq() -> Result<()> {
         let schema = Arc::new(Schema::new(vec![
             Field::new("a", DataType::Int64, false),
             Field::new("b", DataType::Int64, false),
             Field::new("c", DataType::Int64, false),
         ]));
-        let sort1 = LexOrdering::new(vec![
+        let sort1: LexOrdering = [
             PhysicalSortExpr {
                 expr: col("a", &schema)?,
                 options: SortOptions::default(),
@@ -789,13 +829,14 @@ mod memory_source_tests {
                 expr: col("b", &schema)?,
                 options: SortOptions::default(),
             },
-        ]);
-        let sort2 = LexOrdering::new(vec![PhysicalSortExpr {
+        ]
+        .into();
+        let sort2: LexOrdering = [PhysicalSortExpr {
             expr: col("c", &schema)?,
             options: SortOptions::default(),
-        }]);
-        let mut expected_output_order = LexOrdering::default();
-        expected_output_order.extend(sort1.clone());
+        }]
+        .into();
+        let mut expected_output_order = sort1.clone();
         expected_output_order.extend(sort2.clone());
 
         let sort_information = vec![sort1.clone(), sort2.clone()];
@@ -817,19 +858,18 @@ mod memory_source_tests {
 
 #[cfg(test)]
 mod tests {
+    use super::*;
     use crate::test_util::col;
     use crate::tests::{aggr_test_schema, make_partition};
 
-    use super::*;
-
     use arrow::array::{ArrayRef, Int32Array, Int64Array, StringArray};
-    use arrow::compute::SortOptions;
-    use datafusion_physical_expr::PhysicalSortExpr;
-    use datafusion_physical_plan::expressions::lit;
-
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::assert_batches_eq;
     use datafusion_common::stats::{ColumnStatistics, Precision};
+    use datafusion_physical_expr::PhysicalSortExpr;
+    use datafusion_physical_plan::expressions::lit;
+
+    use datafusion_physical_plan::ExecutionPlan;
     use futures::StreamExt;
 
     #[tokio::test]
@@ -923,13 +963,12 @@ mod tests {
             vec![lit(ScalarValue::Null)],
         ];
         let rows = data.len();
-        let values = MemorySourceConfig::try_new_as_values(
-            Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])),
-            data,
-        )?;
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)]));
+        let values = MemorySourceConfig::try_new_as_values(schema, data)?;
 
         assert_eq!(
-            values.partition_statistics(None)?,
+            *values.partition_statistics(None)?,
             Statistics {
                 num_rows: Precision::Exact(rows),
                 total_byte_size: Precision::Exact(8), // not important
@@ -939,6 +978,7 @@ mod tests {
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },],
             }
         );
@@ -1059,8 +1099,7 @@ mod tests {
         let actual = partitioned_datasrc
             .map(|datasrc| datasrc.output_partitioning().partition_count());
         assert_eq!(
-            actual,
-            partition_cnt,
+            actual, partition_cnt,
             "partitioned datasrc does not match expected, we expected {should_exist}, instead found {actual:?}"
         );
     }
@@ -1246,8 +1285,8 @@ mod tests {
     }
 
     #[test]
-    fn test_repartition_no_sort_information_no_output_ordering_lopsized_batches(
-    ) -> Result<()> {
+    fn test_repartition_no_sort_information_no_output_ordering_lopsized_batches()
+    -> Result<()> {
         let no_sort = vec![];
         let no_output_ordering = None;
 
@@ -1310,10 +1349,8 @@ mod tests {
     #[test]
     fn test_repartition_with_sort_information() -> Result<()> {
         let schema = schema();
-        let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("c", &schema).unwrap(),
-            options: SortOptions::default(),
-        }]);
+        let sort_key: LexOrdering =
+            [PhysicalSortExpr::new_default(col("c", &schema)?)].into();
         let has_sort = vec![sort_key.clone()];
         let output_ordering = Some(sort_key);
 
@@ -1360,10 +1397,8 @@ mod tests {
     #[test]
     fn test_repartition_with_batch_ordering_not_matching_sizing() -> Result<()> {
         let schema = schema();
-        let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("c", &schema).unwrap(),
-            options: SortOptions::default(),
-        }]);
+        let sort_key: LexOrdering =
+            [PhysicalSortExpr::new_default(col("c", &schema)?)].into();
         let has_sort = vec![sort_key.clone()];
         let output_ordering = Some(sort_key);
 
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
index 1c27cd4922c39..6bb172f86e38d 100644
--- a/datafusion/datasource/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! A table that uses the `ObjectStore` listing capability
 //! to get the list of files to process.
@@ -33,16 +34,17 @@ pub mod file;
 pub mod file_compression_type;
 pub mod file_format;
 pub mod file_groups;
-pub mod file_meta;
 pub mod file_scan_config;
 pub mod file_sink_config;
 pub mod file_stream;
-pub mod macros;
 pub mod memory;
+pub mod morsel;
+pub mod projection;
 pub mod schema_adapter;
 pub mod sink;
 pub mod source;
 mod statistics;
+pub mod table_schema;
 
 #[cfg(test)]
 pub mod test_util;
@@ -54,14 +56,15 @@ pub use self::url::ListingTableUrl;
 use crate::file_groups::FileGroup;
 use chrono::TimeZone;
 use datafusion_common::stats::Precision;
-use datafusion_common::{exec_datafusion_err, ColumnStatistics, Result};
+use datafusion_common::{ColumnStatistics, Result, exec_datafusion_err};
 use datafusion_common::{ScalarValue, Statistics};
-use file_meta::FileMeta;
+use datafusion_physical_expr::LexOrdering;
 use futures::{Stream, StreamExt};
-use object_store::{path::Path, ObjectMeta};
 use object_store::{GetOptions, GetRange, ObjectStore};
+use object_store::{ObjectMeta, path::Path};
+pub use table_schema::TableSchema;
 // Remove when add_row_stats is remove
-#[allow(deprecated)]
+#[expect(deprecated)]
 pub use statistics::add_row_stats;
 pub use statistics::compute_all_files_statistics;
 use std::ops::Range;
@@ -69,6 +72,10 @@ use std::pin::Pin;
 use std::sync::Arc;
 
 /// Stream of files get listed from object store
+#[deprecated(
+    since = "54.0.0",
+    note = "This type is unused and will be removed in a future release"
+)]
 pub type PartitionedFileStream =
     Pin<Box<dyn Stream<Item = Result<PartitionedFile>> + Send + Sync + 'static>>;
 
@@ -93,6 +100,19 @@ impl FileRange {
 #[derive(Debug, Clone)]
 /// A single file or part of a file that should be read, along with its schema, statistics
 /// and partition column values that need to be appended to each row.
+///
+/// # Statistics
+///
+/// The [`Self::statistics`] field contains statistics for the **full table schema**,
+/// which includes both file columns and partition columns. When statistics are set via
+/// [`Self::with_statistics`], exact statistics for partition columns are automatically
+/// computed from [`Self::partition_values`]:
+///
+/// - `min = max = partition_value` (all rows in a file share the same partition value)
+/// - `null_count = 0` (partition values extracted from paths are never null)
+/// - `distinct_count = 1` (single distinct value per file for each partition column)
+///
+/// This enables query optimizers to use partition column bounds for pruning and planning.
 pub struct PartitionedFile {
     /// Path for the file (e.g. URL, filesystem path, etc)
     pub object_meta: ObjectMeta,
@@ -103,17 +123,32 @@ pub struct PartitionedFile {
     /// You may use [`wrap_partition_value_in_dict`] to wrap them if you have used [`wrap_partition_type_in_dict`] to wrap the column type.
     ///
     ///
-    /// [`wrap_partition_type_in_dict`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L55
-    /// [`wrap_partition_value_in_dict`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L62
-    /// [`table_partition_cols`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/file_format/options.rs#L190
+    /// [`wrap_partition_type_in_dict`]: crate::file_scan_config::wrap_partition_type_in_dict
+    /// [`wrap_partition_value_in_dict`]: crate::file_scan_config::wrap_partition_value_in_dict
+    /// [`table_partition_cols`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/file_format/options.rs#L87
     pub partition_values: Vec<ScalarValue>,
-    /// An optional file range for a more fine-grained parallel execution
+    /// An optional file range for this file. This is used to statically
+    /// schedule non-overlapping sections of a file to be read in parallel.
     pub range: Option<FileRange>,
     /// Optional statistics that describe the data in this file if known.
     ///
     /// DataFusion relies on these statistics for planning (in particular to sort file groups),
     /// so if they are incorrect, incorrect answers may result.
+    ///
+    /// These statistics cover the full table schema: file columns plus partition columns.
+    /// When set via [`Self::with_statistics`], partition column statistics are automatically
+    /// computed from [`Self::partition_values`] with exact min/max/null_count/distinct_count.
     pub statistics: Option<Arc<Statistics>>,
+    /// The known lexicographical ordering of the rows in this file, if any.
+    ///
+    /// This describes how the data within the file is sorted with respect to one or more
+    /// columns, and is used by the optimizer for planning operations that depend on input
+    /// ordering (e.g. merges, sorts, and certain aggregations).
+    ///
+    /// When available, this is typically inferred from file-level metadata exposed by the
+    /// underlying format (for example, Parquet `sorting_columns`), but it may also be set
+    /// explicitly via [`Self::with_ordering`].
+    pub ordering: Option<LexOrdering>,
     /// An optional field for user defined per object metadata
     pub extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
     /// The estimated size of the parquet metadata, in bytes
@@ -134,6 +169,20 @@ impl PartitionedFile {
             partition_values: vec![],
             range: None,
             statistics: None,
+            ordering: None,
+            extensions: None,
+            metadata_size_hint: None,
+        }
+    }
+
+    /// Create a file from a known ObjectMeta without partition
+    pub fn new_from_meta(object_meta: ObjectMeta) -> Self {
+        Self {
+            object_meta,
+            partition_values: vec![],
+            range: None,
+            statistics: None,
+            ordering: None,
             extensions: None,
             metadata_size_hint: None,
         }
@@ -152,12 +201,38 @@ impl PartitionedFile {
             partition_values: vec![],
             range: Some(FileRange { start, end }),
             statistics: None,
+            ordering: None,
             extensions: None,
             metadata_size_hint: None,
         }
         .with_range(start, end)
     }
 
+    /// Attach partition values to this file.
+    /// This replaces any existing partition values.
+    pub fn with_partition_values(mut self, partition_values: Vec<ScalarValue>) -> Self {
+        self.partition_values = partition_values;
+        self
+    }
+
+    /// Size of the file to be scanned (taking into account the range, if present).
+    pub fn effective_size(&self) -> u64 {
+        if let Some(range) = &self.range {
+            (range.end - range.start) as u64
+        } else {
+            self.object_meta.size
+        }
+    }
+
+    /// Effective range of the file to be scanned.
+    pub fn range(&self) -> (u64, u64) {
+        if let Some(range) = &self.range {
+            (range.start as u64, range.end as u64)
+        } else {
+            (0, self.object_meta.size)
+        }
+    }
+
     /// Provide a hint to the size of the file metadata. If a hint is provided
     /// the reader will try and fetch the last `size_hint` bytes of the parquet file optimistically.
     /// Without an appropriate hint, two read may be required to fetch the metadata.
@@ -194,9 +269,64 @@ impl PartitionedFile {
         self
     }
 
-    // Update the statistics for this file.
-    pub fn with_statistics(mut self, statistics: Arc<Statistics>) -> Self {
-        self.statistics = Some(statistics);
+    /// Update the statistics for this file.
+    ///
+    /// The provided `statistics` should cover only the file schema columns.
+    /// This method will automatically append exact statistics for partition columns
+    /// based on `partition_values`:
+    /// - `min = max = partition_value` (all rows have the same value)
+    /// - `null_count = 0` (partition values from paths are never null)
+    /// - `distinct_count = 1` (all rows have the same partition value)
+    pub fn with_statistics(mut self, file_statistics: Arc<Statistics>) -> Self {
+        if self.partition_values.is_empty() {
+            // No partition columns, use stats as-is
+            self.statistics = Some(file_statistics);
+        } else {
+            // Extend stats with exact partition column statistics
+            let mut stats = Arc::unwrap_or_clone(file_statistics);
+            for partition_value in &self.partition_values {
+                let col_stats = ColumnStatistics {
+                    null_count: Precision::Exact(0),
+                    max_value: Precision::Exact(partition_value.clone()),
+                    min_value: Precision::Exact(partition_value.clone()),
+                    distinct_count: Precision::Exact(1),
+                    sum_value: Precision::Absent,
+                    byte_size: partition_value
+                        .data_type()
+                        .primitive_width()
+                        .map(|w| stats.num_rows.multiply(&Precision::Exact(w)))
+                        .unwrap_or_else(|| Precision::Absent),
+                };
+                stats.column_statistics.push(col_stats);
+            }
+            self.statistics = Some(Arc::new(stats));
+        }
+        self
+    }
+
+    /// Check if this file has any statistics.
+    /// This returns `true` if the file has any Exact or Inexact statistics
+    /// and `false` if all statistics are `Precision::Absent`.
+    pub fn has_statistics(&self) -> bool {
+        if let Some(stats) = &self.statistics {
+            stats.column_statistics.iter().any(|col_stats| {
+                col_stats.null_count != Precision::Absent
+                    || col_stats.max_value != Precision::Absent
+                    || col_stats.min_value != Precision::Absent
+                    || col_stats.sum_value != Precision::Absent
+                    || col_stats.distinct_count != Precision::Absent
+            })
+        } else {
+            false
+        }
+    }
+
+    /// Set the known ordering of data in this file.
+    ///
+    /// The ordering represents the lexicographical sort order of the data,
+    /// typically inferred from file metadata (e.g., Parquet sorting_columns).
+    pub fn with_ordering(mut self, ordering: Option<LexOrdering>) -> Self {
+        self.ordering = ordering;
         self
     }
 }
@@ -208,6 +338,7 @@ impl From<ObjectMeta> for PartitionedFile {
             partition_values: vec![],
             range: None,
             statistics: None,
+            ordering: None,
             extensions: None,
             metadata_size_hint: None,
         }
@@ -235,23 +366,23 @@ pub enum RangeCalculation {
 /// Calculates an appropriate byte range for reading from an object based on the
 /// provided metadata.
 ///
-/// This asynchronous function examines the `FileMeta` of an object in an object store
+/// This asynchronous function examines the [`PartitionedFile`] of an object in an object store
 /// and determines the range of bytes to be read. The range calculation may adjust
 /// the start and end points to align with meaningful data boundaries (like newlines).
 ///
-/// Returns a `Result` wrapping a `RangeCalculation`, which is either a calculated byte range or an indication to terminate early.
+/// Returns a `Result` wrapping a [`RangeCalculation`], which is either a calculated byte range or an indication to terminate early.
 ///
 /// Returns an `Error` if any part of the range calculation fails, such as issues in reading from the object store or invalid range boundaries.
 pub async fn calculate_range(
-    file_meta: &FileMeta,
+    file: &PartitionedFile,
     store: &Arc<dyn ObjectStore>,
     terminator: Option<u8>,
 ) -> Result<RangeCalculation> {
-    let location = file_meta.location();
-    let file_size = file_meta.object_meta.size;
+    let location = &file.object_meta.location;
+    let file_size = file.object_meta.size;
     let newline = terminator.unwrap_or(b'\n');
 
-    match file_meta.range {
+    match file.range {
         None => Ok(RangeCalculation::Range(None)),
         Some(FileRange { start, end }) => {
             let start: u64 = start.try_into().map_err(|_| {
@@ -267,6 +398,10 @@ pub async fn calculate_range(
                 0
             };
 
+            if start + start_delta > end {
+                return Ok(RangeCalculation::TerminateEarly);
+            }
+
             let end_delta = if end != file_size {
                 find_first_newline(store, location, end - 1, file_size, newline).await?
             } else {
@@ -275,7 +410,7 @@ pub async fn calculate_range(
 
             let range = start + start_delta..end + end_delta;
 
-            if range.start == range.end {
+            if range.start >= range.end {
                 return Ok(RangeCalculation::TerminateEarly);
             }
 
@@ -294,7 +429,6 @@ pub async fn calculate_range(
 /// Returns a `Result` wrapping a `usize` that represents the position of the first newline character found within the specified range. If no newline is found, it returns the length of the scanned data, effectively indicating the end of the range.
 ///
 /// The function returns an `Error` if any issues arise while reading from the object store or processing the data stream.
-///
 async fn find_first_newline(
     object_store: &Arc<dyn ObjectStore>,
     location: &Path,
@@ -398,8 +532,10 @@ pub fn generate_test_files(num_files: usize, overlap_factor: f64) -> Vec<FileGro
                     min_value: Precision::Exact(ScalarValue::Float64(Some(min))),
                     sum_value: Precision::Absent,
                     distinct_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 }],
             })),
+            ordering: None,
             extensions: None,
             metadata_size_hint: None,
         };
@@ -443,7 +579,7 @@ mod tests {
     use datafusion_execution::object_store::{
         DefaultObjectStoreRegistry, ObjectStoreRegistry,
     };
-    use object_store::{local::LocalFileSystem, path::Path};
+    use object_store::{ObjectStoreExt, local::LocalFileSystem, path::Path};
     use std::{collections::HashMap, ops::Not, sync::Arc};
     use url::Url;
 
@@ -524,6 +660,70 @@ mod tests {
         sut.get_store(url.as_ref()).unwrap();
     }
 
+    #[test]
+    fn test_with_statistics_appends_partition_column_stats() {
+        use crate::PartitionedFile;
+        use datafusion_common::stats::Precision;
+        use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
+
+        // Create a PartitionedFile with partition values
+        let mut pf = PartitionedFile::new(
+            "test.parquet",
+            100, // file size
+        );
+        pf.partition_values = vec![
+            ScalarValue::Date32(Some(20148)), // 2025-03-01
+        ];
+
+        // Create file-only statistics (1 column for 'id')
+        let file_stats = Arc::new(Statistics {
+            num_rows: Precision::Exact(2),
+            total_byte_size: Precision::Exact(16),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(0),
+                max_value: Precision::Exact(ScalarValue::Int32(Some(4))),
+                min_value: Precision::Exact(ScalarValue::Int32(Some(3))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+                byte_size: Precision::Absent,
+            }],
+        });
+
+        // Call with_statistics - should append partition column stats
+        let pf = pf.with_statistics(file_stats);
+
+        // Verify the statistics now have 2 columns
+        let stats = pf.statistics.unwrap();
+        assert_eq!(
+            stats.column_statistics.len(),
+            2,
+            "Expected 2 columns (id + date partition)"
+        );
+
+        // Verify partition column statistics
+        let partition_col_stats = &stats.column_statistics[1];
+        assert_eq!(
+            partition_col_stats.null_count,
+            Precision::Exact(0),
+            "Partition column null_count should be Exact(0)"
+        );
+        assert_eq!(
+            partition_col_stats.min_value,
+            Precision::Exact(ScalarValue::Date32(Some(20148))),
+            "Partition column min should match partition value"
+        );
+        assert_eq!(
+            partition_col_stats.max_value,
+            Precision::Exact(ScalarValue::Date32(Some(20148))),
+            "Partition column max should match partition value"
+        );
+        assert_eq!(
+            partition_col_stats.distinct_count,
+            Precision::Exact(1),
+            "Partition column distinct_count should be Exact(1)"
+        );
+    }
+
     #[test]
     fn test_url_contains() {
         let url = ListingTableUrl::parse("file:///var/data/mytable/").unwrap();
@@ -542,12 +742,13 @@ mod tests {
 
         // as per documentation, when `ignore_subdirectory` is true, we should ignore files that aren't
         // a direct child of the `url`
-        assert!(url
-            .contains(
+        assert!(
+            url.contains(
                 &Path::parse("/var/data/mytable/mysubfolder/data.parquet").unwrap(),
                 true
             )
-            .not());
+            .not()
+        );
 
         // when we set `ignore_subdirectory` to false, we should not ignore the file
         assert!(url.contains(
@@ -575,4 +776,31 @@ mod tests {
         // testing an empty path with `ignore_subdirectory` set to false
         assert!(url.contains(&Path::parse("/var/data/mytable/").unwrap(), false));
     }
+
+    /// Regression test for <https://github.com/apache/datafusion/issues/19605>
+    #[tokio::test]
+    async fn test_calculate_range_single_line_file() {
+        use super::{PartitionedFile, RangeCalculation, calculate_range};
+        use object_store::ObjectStore;
+        use object_store::memory::InMemory;
+
+        let content = r#"{"id":1,"data":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}"#;
+        let file_size = content.len() as u64;
+
+        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+        let path = Path::from("test.json");
+        store.put(&path, content.into()).await.unwrap();
+
+        let mid = file_size / 2;
+        let partitioned_file = PartitionedFile::new_with_range(
+            path.to_string(),
+            file_size,
+            mid as i64,
+            file_size as i64,
+        );
+
+        let result = calculate_range(&partitioned_file, &store, None).await;
+
+        assert!(matches!(result, Ok(RangeCalculation::TerminateEarly)));
+    }
 }
diff --git a/datafusion/datasource/src/morsel/adapters.rs b/datafusion/datasource/src/morsel/adapters.rs
new file mode 100644
index 0000000000000..460ba314f3264
--- /dev/null
+++ b/datafusion/datasource/src/morsel/adapters.rs
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::PartitionedFile;
+use crate::file_stream::FileOpener;
+use crate::morsel::{Morsel, MorselPlan, MorselPlanner, Morselizer};
+use arrow::array::RecordBatch;
+use datafusion_common::Result;
+use futures::FutureExt;
+use futures::stream::BoxStream;
+use std::fmt::Debug;
+use std::sync::{Arc, Mutex};
+
+/// An adapter for `FileOpener` that allows it to be used as a `Morselizer` for
+/// backwards compatibility.
+///
+/// This is useful for file formats that do not support morselization, where we
+/// can treat the entire file as a single morsel.
+pub struct FileOpenerMorselizer {
+    file_opener: Arc<dyn FileOpener>,
+}
+
+impl Debug for FileOpenerMorselizer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FileOpenerMorselizer")
+            .field("file_opener", &"...")
+            .finish()
+    }
+}
+
+impl FileOpenerMorselizer {
+    pub fn new(file_opener: Arc<dyn FileOpener>) -> Self {
+        Self { file_opener }
+    }
+}
+
+impl Morselizer for FileOpenerMorselizer {
+    fn morselize(&self, file: PartitionedFile) -> Result<Vec<Box<dyn MorselPlanner>>> {
+        let opener = Arc::clone(&self.file_opener);
+        let planner = FileOpenFutureMorselPlanner::new(opener, file);
+        Ok(vec![Box::new(planner)])
+    }
+}
+
+/// Adapter for `FileOpenFuture` that allows it to be used as a `MorselPlanner`
+/// for backwards compatibility.
+struct FileOpenFutureMorselPlanner {
+    file_opener: Arc<dyn FileOpener>,
+    stream: Arc<Mutex<Option<BoxStream<'static, Result<RecordBatch>>>>>,
+    file: Mutex<Option<PartitionedFile>>,
+}
+
+impl Debug for FileOpenFutureMorselPlanner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FileOpenFutureMorselPlanner")
+            .field("file_opener", &"...")
+            .field("stream", &"...")
+            .field("file", &self.file)
+            .finish()
+    }
+}
+
+impl FileOpenFutureMorselPlanner {
+    pub fn new(file_opener: Arc<dyn FileOpener>, file: PartitionedFile) -> Self {
+        Self {
+            file_opener,
+            stream: Arc::new(Mutex::new(None)),
+            file: Mutex::new(Some(file)),
+        }
+    }
+}
+
+impl MorselPlanner for FileOpenFutureMorselPlanner {
+    fn plan(&mut self) -> Result<Option<MorselPlan>> {
+        let mut morsel_plan = MorselPlan::new();
+        let mut made_progress = false;
+
+        // Note that plan should **not** do IO work so setup a callback if needed
+        if let Some(file) = self.file.lock().unwrap().take() {
+            let file_opener = Arc::clone(&self.file_opener);
+            let output_stream = Arc::clone(&self.stream);
+            let load_future = async move {
+                let stream = file_opener
+                    // open the file to get a stream
+                    .open(file)?
+                    // create the stream
+                    .await?;
+                // store the stream for later retrieval
+                *(output_stream.lock().unwrap()) = Some(stream);
+                Ok(())
+            };
+            morsel_plan = morsel_plan.with_io_future(load_future.boxed());
+            made_progress = true;
+        }
+
+        // If the stream is ready, return it as a morsel
+        if let Some(stream) = self.stream.lock().unwrap().take() {
+            let morsel = FileStreamMorsel::new(stream);
+            morsel_plan = morsel_plan.with_morsels(vec![Box::new(morsel)]);
+            made_progress = true;
+        }
+
+        Ok(made_progress.then_some(morsel_plan))
+    }
+}
+
+struct FileStreamMorsel {
+    stream: BoxStream<'static, Result<RecordBatch>>,
+}
+
+impl Debug for FileStreamMorsel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FileStreamMorsel")
+            .field("stream", &"...")
+            .finish()
+    }
+}
+
+impl FileStreamMorsel {
+    pub fn new(stream: BoxStream<'static, Result<RecordBatch>>) -> Self {
+        Self { stream }
+    }
+}
+
+impl Morsel for FileStreamMorsel {
+    fn into_stream(self: Box<Self>) -> BoxStream<'static, Result<RecordBatch>> {
+        self.stream
+    }
+
+    fn split(&mut self) -> Result<Vec<Box<dyn Morsel>>> {
+        Ok(vec![]) // no splitting supported
+    }
+}
diff --git a/datafusion/datasource/src/morsel/mod.rs b/datafusion/datasource/src/morsel/mod.rs
new file mode 100644
index 0000000000000..5a6569d5307c4
--- /dev/null
+++ b/datafusion/datasource/src/morsel/mod.rs
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Structures for Morsel Driven IO
+//!
+//! Morsel Driven IO is a technique for parallelizing the reading of large files
+//! by dividing them into smaller "morsels" that can be processed independently.
+//! It is inspired by the paper [Morsel-Driven Parallelism: A NUMA-Aware Query
+//! Evaluation Framework for the Many-Core Age](https://db.in.tum.de/~leis/papers/morsels.pdf)
+
+mod adapters;
+#[cfg(test)]
+pub(crate) mod test_utils;
+
+use crate::PartitionedFile;
+use arrow::array::RecordBatch;
+use datafusion_common::error::Result;
+use futures::future::BoxFuture;
+use futures::stream::BoxStream;
+use std::fmt::Debug;
+
+pub use adapters::FileOpenerMorselizer;
+
+/// A Morsel of work ready to resolve to a stream of [`RecordBatch`]es
+///
+/// This represents a single morsel of work that is ready to be processed. It
+/// has all data necessary (does not need any I/O) and is ready to be turned
+/// into a stream of RecordBatches for processing by the execution engine.
+pub trait Morsel: Send + Debug {
+    /// Consume this morsel and produce a stream of RecordBatches for processing.
+    ///
+    /// This should not do any IO work, such as reading from the file.
+    fn into_stream(self: Box<Self>) -> BoxStream<'static, Result<RecordBatch>>;
+
+    /// If supported, split this morsel into smaller morsels.
+    ///
+    /// If not possible or not supported, return an empty Vector.
+    ///
+    /// This is used for dynamic load balancing of work where there are some
+    /// tasks that have nothing else scheduled.
+    fn split(&mut self) -> Result<Vec<Box<dyn Morsel>>>;
+}
+
+/// A Morselizer takes a single PartitionedFile and breaks it down into smaller chunks
+/// that can be planned and read in parallel by the execution engine. This is the entry point for
+/// morsel driven IO.
+pub trait Morselizer: Send + Sync + Debug {
+    /// Return MorselPlanners for this file.
+    ///
+    /// Each MorselPlanner is responsible for I/O and planning morsels for a
+    /// single scan of the file. Returning multiple MorselPlanners allows for
+    /// multiple concurrent scans of the same file.
+    ///
+    /// This may involve CPU work, such as parsing parquet metadata and
+    /// evaluating pruning predicates. It should NOT do any IO work, such as
+    /// reading from the file. If IO is required, it should return a future that
+    /// the caller can poll to drive the IO work to completion, and once the
+    /// future is complete, the caller can call `morselize` again to get the
+    /// next morsels.
+    fn morselize(&self, file: PartitionedFile) -> Result<Vec<Box<dyn MorselPlanner>>>;
+}
+
+/// A Morsel Planner is responsible for creating morsels for a given scan.
+///
+/// The MorselPlanner is the unit of I/O -- there is only ever a single IO
+/// outstanding for a specific  MorselPlanner. DataFusion will potentially run
+/// multiple MorselPlanners in parallel which corresponds to multiple parallel
+/// I/O requests.
+///
+/// It is not a Rust `Stream` so that it can explicitly separate CPU bound
+/// work from IO work.
+///
+/// The design is similar to `ParquetPushDecoder` -- when `plan` is called, it
+/// should do CPU work to produce the next morsels or discover the next I/O
+/// phase.
+///
+/// Best practice is to spawn IO in a tokio Task in a separate IO runtime to
+/// ensure that CPU work doesn't block/slowdown IO work, but this is not
+/// strictly required by the API.
+pub trait MorselPlanner: Send + Debug {
+    /// Attempt to plan morsels. This may involve CPU work, such as parsing
+    /// parquet metadata and evaluating pruning predicates.
+    ///
+    /// It should NOT do any IO work, such as reading from the file. If IO is
+    /// required, the returned [`MorselPlan`] should contain a future that the
+    /// caller polls to drive the IO work to completion. Once the future is
+    /// complete, the caller can call `plan` again to get the next morsels.
+    ///
+    /// Note this function is not async to make it clear explicitly that if IO
+    /// is required, it should be done in the returned `io_future`.
+    ///
+    /// Returns `None` if the MorselPlanner has no more work to do (is done).
+    ///
+    /// # Empty Morsel Plans
+    ///
+    /// It may return Some(..) with an empty MorselPlan, which means it is ready
+    /// for more CPU work and should be called again.
+    ///
+    /// # Output Ordering
+    ///
+    /// See the comments on [`MorselPlan`] for the logical output order
+    fn plan(&mut self) -> Result<Option<MorselPlan>>;
+}
+
+/// Return result of [`MorselPlanner::plan`]
+///
+/// # Logical Ordering
+/// For plans where the output order of rows is maintained, the output order of
+/// a [`MorselPlanner`] is logically defined as follows:
+/// 1. All morsels that are directly produced
+/// 2. (recursively) All morsels produced by the returned `planners`
+#[derive(Default)]
+pub struct MorselPlan {
+    /// Any Morsels that are ready for processing.
+    morsels: Vec<Box<dyn Morsel>>,
+    /// Any newly-created planners that are ready for CPU work.
+    planners: Vec<Box<dyn MorselPlanner>>,
+    /// A future that will drive any IO work to completion
+    ///
+    /// DataFusion will poll this future occasionally to drive the IO work to
+    /// completion. Once the future resolves, DataFusion will call  `plan` again
+    /// to get the next morsels. Best practice is to run this in a task on a
+    /// separate IO runtime to ensure that CPU work is not blocked by IO work,
+    /// but this is not strictly required by the API.
+    io_future: Option<BoxFuture<'static, Result<()>>>,
+}
+
+impl MorselPlan {
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    pub fn with_morsels(mut self, morsels: Vec<Box<dyn Morsel>>) -> Self {
+        self.morsels = morsels;
+        self
+    }
+
+    pub fn with_planners(mut self, planners: Vec<Box<dyn MorselPlanner>>) -> Self {
+        self.planners = planners;
+        self
+    }
+
+    pub fn with_io_future(mut self, io_future: BoxFuture<'static, Result<()>>) -> Self {
+        self.io_future = Some(io_future);
+        self
+    }
+
+    pub fn take_io_future(&mut self) -> Option<BoxFuture<'static, Result<()>>> {
+        self.io_future.take()
+    }
+
+    pub fn set_io_future(&mut self, io_future: BoxFuture<'static, Result<()>>) {
+        self.io_future = Some(io_future);
+    }
+
+    pub fn take_morsels(&mut self) -> Vec<Box<dyn Morsel>> {
+        std::mem::take(&mut self.morsels)
+    }
+
+    pub fn take_planners(&mut self) -> Vec<Box<dyn MorselPlanner>> {
+        std::mem::take(&mut self.planners)
+    }
+
+    pub fn has_io_future(&self) -> bool {
+        self.io_future.is_some()
+    }
+}
diff --git a/datafusion/datasource/src/morsel/test_utils.rs b/datafusion/datasource/src/morsel/test_utils.rs
new file mode 100644
index 0000000000000..a5a066d16dc5a
--- /dev/null
+++ b/datafusion/datasource/src/morsel/test_utils.rs
@@ -0,0 +1,705 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Test-only mocks for exercising `FileStream`'s morsel scheduler.
+//!
+//! These utilities let tests describe morsel-planning behavior directly,
+//! without depending on a particular file format implementation. They are used
+//! to verify:
+//! - the sequence in which `FileStream` calls `morselize`, `plan`, and polls I/O
+//! - the order in which morsels and child planners are consumed
+//! - the eventual order of `RecordBatch` output produced by the scheduler
+
+use crate::PartitionedFile;
+use crate::morsel::{Morsel, MorselPlan, MorselPlanner, Morselizer};
+use arrow::array::{Int32Array, RecordBatch};
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_common::Result;
+use futures::future::BoxFuture;
+use futures::stream::BoxStream;
+use futures::{Future, Stream};
+use std::collections::{HashMap, VecDeque};
+use std::fmt::{Debug, Display, Formatter};
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::task::{Context, Poll};
+
+/// Identifier for a mock planner in test traces.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct PlannerId(pub usize);
+
+/// Identifier for a mock morsel in test traces.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct MorselId(pub usize);
+
+/// Identifier for a mock I/O future in test traces.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct IoFutureId(pub usize);
+
+/// Identifier for a produced batch in test traces.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct BatchId(pub usize);
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum MorselEvent {
+    /// `FileStream` asked the `Morselizer` to start work for a file.
+    MorselizeFile { path: String },
+    /// A root planner was created from a test specification.
+    PlannerCreated { planner_id: PlannerId },
+    /// `MorselPlanner::plan()` was invoked.
+    PlannerPlanCalled { planner_id: PlannerId },
+    /// A planner returned one or more child planners.
+    PlannerProducedChild {
+        planner_id: PlannerId,
+        child_planner_id: PlannerId,
+    },
+    /// A planner returned an I/O future.
+    IoFutureCreated {
+        planner_id: PlannerId,
+        io_future_id: IoFutureId,
+    },
+    /// `FileStream` polled a planner's outstanding I/O future.
+    IoFuturePolled {
+        planner_id: PlannerId,
+        io_future_id: IoFutureId,
+    },
+    /// A planner's I/O future completed successfully.
+    IoFutureResolved {
+        planner_id: PlannerId,
+        io_future_id: IoFutureId,
+    },
+    /// A planner produced a morsel that is ready for CPU work.
+    MorselProduced {
+        planner_id: PlannerId,
+        morsel_id: MorselId,
+    },
+    /// `FileStream` began reading a morsel.
+    MorselStreamStarted { morsel_id: MorselId },
+    /// A morsel stream yielded one batch.
+    MorselStreamBatchProduced {
+        morsel_id: MorselId,
+        batch_id: BatchId,
+    },
+    /// A morsel stream reached EOF.
+    MorselStreamFinished { morsel_id: MorselId },
+}
+
+/// Observer of `MorselEvent`s emitted by the test harness.
+#[derive(Debug, Default, Clone)]
+pub struct MorselObserver {
+    events: Arc<Mutex<Vec<MorselEvent>>>,
+}
+
+impl MorselObserver {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Clear any previously buffered events
+    pub fn clear(&self) {
+        self.events.lock().unwrap().clear();
+    }
+
+    /// Push a new [`MorselEvent`]
+    pub fn push(&self, event: MorselEvent) {
+        self.events.lock().unwrap().push(event);
+    }
+
+    /// Return a copy of the current list of [`MorselEvents`]
+    pub fn events(&self) -> Vec<MorselEvent> {
+        self.events.lock().unwrap().clone()
+    }
+
+    /// Format the recorded events as a stable, human-readable snapshot.
+    ///
+    /// We prefer snapshotting the event trace in tests rather than asserting it
+    /// programmatically because `FileStream` scheduling behavior is easier to
+    /// review as a full ordered trace. When the scheduler changes, updating the
+    /// snapshot is typically simpler and more informative than rebuilding a
+    /// hand-authored sequence of enum constructors.
+    pub fn format_events(&self) -> String {
+        self.events()
+            .into_iter()
+            .map(|event| event.to_string())
+            .collect::<Vec<_>>()
+            .join("\n")
+    }
+
+    /// Format only the high-level scheduler events.
+    ///
+    /// This is useful for tests where the exact `plan()` / `poll()` interleave
+    /// is not the main point of the assertion. Summary snapshots are easier to
+    /// review when validating broader behavior such as ordering or outstanding
+    /// I/O limits.
+    pub fn format_summary_events(&self) -> String {
+        self.events()
+            .into_iter()
+            .filter(MorselEvent::is_summary_event)
+            .map(|event| event.to_string())
+            .collect::<Vec<_>>()
+            .join("\n")
+    }
+}
+
+impl MorselEvent {
+    /// Return true for the higher-level events that are most useful in compact
+    /// scheduler snapshots.
+    pub fn is_summary_event(&self) -> bool {
+        matches!(
+            self,
+            MorselEvent::MorselizeFile { .. }
+                | MorselEvent::PlannerCreated { .. }
+                | MorselEvent::PlannerProducedChild { .. }
+                | MorselEvent::IoFutureCreated { .. }
+                | MorselEvent::IoFutureResolved { .. }
+                | MorselEvent::MorselProduced { .. }
+                | MorselEvent::MorselStreamBatchProduced { .. }
+        )
+    }
+}
+
+/// Test [`Morselizer`] that maps file paths to a fixed set of planner specs.
+///
+/// This lets tests describe file-level morselization behavior without any real
+/// file metadata or object-store access.
+#[derive(Debug, Clone)]
+pub struct MockMorselizer {
+    observer: MorselObserver,
+    files: HashMap<String, MockPlanner>,
+}
+
+impl MockMorselizer {
+    pub fn new() -> Self {
+        Self {
+            observer: MorselObserver::new(),
+            files: HashMap::new(),
+        }
+    }
+
+    /// Return a reference to the observer
+    pub fn observer(&self) -> &MorselObserver {
+        &self.observer
+    }
+
+    /// Add the description of how a file will be planned
+    pub fn with_file(mut self, path: impl Into<String>, planner: MockPlanner) -> Self {
+        self.files.insert(path.into(), planner);
+        self
+    }
+}
+
+impl Morselizer for MockMorselizer {
+    fn morselize(&self, file: PartitionedFile) -> Result<Vec<Box<dyn MorselPlanner>>> {
+        let path = file.object_meta.location.to_string();
+        self.observer
+            .push(MorselEvent::MorselizeFile { path: path.clone() });
+
+        Ok(self
+            .files
+            .get(&path)
+            .cloned()
+            .into_iter()
+            .map(|planner| {
+                self.observer.push(MorselEvent::PlannerCreated {
+                    planner_id: planner.planner_id,
+                });
+                Box::new(MockMorselPlanner::new(self.observer.clone(), planner))
+                    as Box<dyn MorselPlanner>
+            })
+            .collect())
+    }
+}
+
+/// Steps for a MockPlanner
+///
+/// Tests build these specs up front, and `MockMorselizer` turns them into real
+/// planners when `FileStream` starts work on a file.
+#[derive(Debug, Clone)]
+pub struct MockPlanner {
+    planner_id: PlannerId,
+    steps: Vec<PlannerStep>,
+}
+
+impl MockPlanner {
+    /// Create a fluent builder for a planner specification.
+    pub fn builder() -> MockPlannerBuilder {
+        MockPlannerBuilder::default()
+    }
+}
+
+/// One scheduler-visible step in a mock planner's lifecycle.
+///
+/// A single step can produce morsels, child planners, and an I/O future, which
+/// makes it possible to model the generic `MorselPlan` API closely in tests.
+#[derive(Debug, Clone)]
+pub enum PlannerStep {
+    ReturnPlan {
+        morsels: Vec<MockMorselSpec>,
+        planners: Vec<MockPlanner>,
+        /// Optional identifier for the I/O future returned by this step.
+        ///
+        /// Tests use this to assert the ordering of multiple outstanding I/O
+        /// phases. It must be `Some` when `io_polls > 0`.
+        io_future_id: Option<IoFutureId>,
+        io_polls: usize,
+    },
+    ReturnNone,
+}
+
+/// Builder for [`MockPlanner`].
+///
+/// This keeps `FileStream` scheduler tests readable as they grow to include
+/// child planners and multiple I/O phases.
+#[derive(Debug, Default)]
+pub struct MockPlannerBuilder {
+    planner_id: Option<PlannerId>,
+    steps: Vec<PlannerStep>,
+}
+
+impl MockPlannerBuilder {
+    pub fn with_id(mut self, planner_id: PlannerId) -> Self {
+        self.planner_id = Some(planner_id);
+        self
+    }
+
+    pub fn return_morsel(self, morsel_id: MorselId, batch_id: i32) -> Self {
+        self.return_plan(
+            ReturnPlanBuilder::new()
+                .with_morsel(MockMorselSpec::single_batch(morsel_id, batch_id)),
+        )
+    }
+
+    pub fn return_plan(mut self, plan: ReturnPlanBuilder) -> Self {
+        self.steps.push(plan.build());
+        self
+    }
+
+    pub fn return_none(mut self) -> Self {
+        self.steps.push(PlannerStep::ReturnNone);
+        self
+    }
+
+    pub fn build(self) -> MockPlanner {
+        MockPlanner {
+            planner_id: self
+                .planner_id
+                .expect("MockPlannerBuilder requires planner_id"),
+            steps: self.steps,
+        }
+    }
+}
+
+/// Builder for `PlannerStep::ReturnPlan`.
+#[derive(Debug, Default)]
+pub struct ReturnPlanBuilder {
+    /// Morsels that should be returned immediately by this planner step.
+    morsels: Vec<MockMorselSpec>,
+    /// Child planners that should be returned immediately by this planner step.
+    planners: Vec<MockPlanner>,
+    /// Identifier for the mock I/O future returned by this step, if any.
+    io_future_id: Option<IoFutureId>,
+    /// Number of `Poll::Pending` results the mock I/O future should yield
+    /// before resolving successfully.
+    ///
+    /// This is a deterministic test-only knob. It does not model elapsed time
+    /// or bytes read; it only controls how many scheduler polls are required
+    /// before the mock I/O future becomes ready.
+    io_polls: usize,
+}
+
+impl ReturnPlanBuilder {
+    /// Create an empty return-plan builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add a single-batch morsel that should be returned immediately by this
+    /// planner step.
+    pub fn return_morsel(self, morsel_id: MorselId, batch_id: i32) -> Self {
+        self.with_morsel(MockMorselSpec::single_batch(morsel_id, batch_id))
+    }
+
+    /// Add a morsel that should be returned immediately by this planner step.
+    pub fn with_morsel(mut self, morsel: MockMorselSpec) -> Self {
+        self.morsels.push(morsel);
+        self
+    }
+
+    /// Add a child planner that should be returned immediately by this step.
+    pub fn with_planner(mut self, planner: MockPlanner) -> Self {
+        self.planners.push(planner);
+        self
+    }
+
+    /// Return a mock I/O future from this step.
+    ///
+    /// `io_future_id` is recorded in the emitted `MorselEvent`s so tests can
+    /// distinguish multiple I/O phases from the same planner.
+    ///
+    /// `io_polls` controls how many times that future returns `Poll::Pending`
+    /// before it resolves with `Poll::Ready(Ok(()))`.
+    ///
+    /// For example, `with_io(id, 1)` means:
+    /// - first poll: `Poll::Pending`
+    /// - second poll: `Poll::Ready(Ok(()))`
+    ///
+    /// So the total number of polls observed in the trace is `io_polls + 1`.
+    pub fn with_io(mut self, io_future_id: IoFutureId, io_polls: usize) -> Self {
+        self.io_future_id = Some(io_future_id);
+        self.io_polls = io_polls;
+        self
+    }
+
+    /// Build the corresponding [`PlannerStep::ReturnPlan`]
+    pub fn build(self) -> PlannerStep {
+        PlannerStep::ReturnPlan {
+            morsels: self.morsels,
+            planners: self.planners,
+            io_future_id: self.io_future_id,
+            io_polls: self.io_polls,
+        }
+    }
+}
+
+/// Declarative description of a mock morsel and the batches it should yield.
+///
+/// Each batch id is turned into a one-row `RecordBatch`, which makes output
+/// order easy to assert in `FileStream` tests.
+#[derive(Debug, Clone)]
+pub struct MockMorselSpec {
+    morsel_id: MorselId,
+    batch_ids: Vec<i32>,
+}
+
+impl MockMorselSpec {
+    pub fn single_batch(morsel_id: MorselId, batch_id: i32) -> Self {
+        Self {
+            morsel_id,
+            batch_ids: vec![batch_id],
+        }
+    }
+}
+
+/// Concrete `MorselPlanner` used by `FileStream` tests.
+///
+/// It consumes a queue of `PlannerStep`s so tests can deterministically control
+/// when a planner emits morsels, yields child planners, blocks on I/O, or
+/// finishes.
+struct MockMorselPlanner {
+    observer: MorselObserver,
+    planner_id: PlannerId,
+    steps: VecDeque<PlannerStep>,
+}
+
+impl MockMorselPlanner {
+    fn new(observer: MorselObserver, spec: MockPlanner) -> Self {
+        Self {
+            observer,
+            planner_id: spec.planner_id,
+            steps: spec.steps.into(),
+        }
+    }
+}
+
+impl Debug for MockMorselPlanner {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("MockMorselPlanner")
+            .field("planner_id", &self.planner_id)
+            .finish()
+    }
+}
+
+impl MorselPlanner for MockMorselPlanner {
+    fn plan(&mut self) -> Result<Option<MorselPlan>> {
+        self.observer.push(MorselEvent::PlannerPlanCalled {
+            planner_id: self.planner_id,
+        });
+
+        let Some(step) = self.steps.pop_front() else {
+            return Ok(None);
+        };
+
+        match step {
+            PlannerStep::ReturnPlan {
+                morsels,
+                planners,
+                io_future_id,
+                io_polls,
+            } => {
+                let mut plan = MorselPlan::new();
+
+                if !morsels.is_empty() {
+                    for morsel in &morsels {
+                        self.observer.push(MorselEvent::MorselProduced {
+                            planner_id: self.planner_id,
+                            morsel_id: morsel.morsel_id,
+                        });
+                    }
+                    plan = plan.with_morsels(
+                        morsels
+                            .into_iter()
+                            .map(|morsel| {
+                                Box::new(MockMorsel::new(self.observer.clone(), morsel))
+                                    as Box<dyn Morsel>
+                            })
+                            .collect(),
+                    );
+                }
+
+                if !planners.is_empty() {
+                    for planner in &planners {
+                        self.observer.push(MorselEvent::PlannerProducedChild {
+                            planner_id: self.planner_id,
+                            child_planner_id: planner.planner_id,
+                        });
+                    }
+                    plan = plan.with_planners(
+                        planners
+                            .into_iter()
+                            .map(|planner| {
+                                Box::new(MockMorselPlanner::new(
+                                    self.observer.clone(),
+                                    planner,
+                                ))
+                                    as Box<dyn MorselPlanner>
+                            })
+                            .collect(),
+                    );
+                }
+
+                if io_polls > 0 {
+                    let io_future_id = io_future_id.expect(
+                        "PlannerStep::ReturnPlan with io_polls > 0 must specify io_future_id",
+                    );
+                    self.observer.push(MorselEvent::IoFutureCreated {
+                        planner_id: self.planner_id,
+                        io_future_id,
+                    });
+                    plan = plan.with_io_future(Box::pin(MockIoFuture::new(
+                        self.observer.clone(),
+                        self.planner_id,
+                        io_future_id,
+                        io_polls,
+                    ))
+                        as BoxFuture<'static, Result<()>>);
+                }
+
+                Ok(Some(plan))
+            }
+            PlannerStep::ReturnNone => Ok(None),
+        }
+    }
+}
+
+/// Concrete `Morsel` used by the test harness.
+///
+/// It yields a deterministic sequence of one-row batches and records lifecycle
+/// events so tests can correlate scheduler activity with produced output.
+struct MockMorsel {
+    observer: MorselObserver,
+    morsel_id: MorselId,
+    batch_ids: Vec<i32>,
+}
+
+impl MockMorsel {
+    fn new(observer: MorselObserver, spec: MockMorselSpec) -> Self {
+        Self {
+            observer,
+            morsel_id: spec.morsel_id,
+            batch_ids: spec.batch_ids,
+        }
+    }
+}
+
+impl Debug for MockMorsel {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("MockMorsel")
+            .field("morsel_id", &self.morsel_id)
+            .finish()
+    }
+}
+
+impl Morsel for MockMorsel {
+    fn into_stream(self: Box<Self>) -> BoxStream<'static, Result<RecordBatch>> {
+        self.observer.push(MorselEvent::MorselStreamStarted {
+            morsel_id: self.morsel_id,
+        });
+        Box::pin(MockMorselStream::new(
+            self.observer.clone(),
+            self.morsel_id,
+            self.batch_ids,
+        ))
+    }
+
+    fn split(&mut self) -> Result<Vec<Box<dyn Morsel>>> {
+        Ok(vec![])
+    }
+}
+
+/// Stream returned by `MockMorsel::into_stream`.
+///
+/// This stream exists so tests can observe exactly when a morsel starts,
+/// produces batches, and finishes.
+struct MockMorselStream {
+    observer: MorselObserver,
+    morsel_id: MorselId,
+    batches: VecDeque<i32>,
+    finished: bool,
+}
+
+impl MockMorselStream {
+    fn new(observer: MorselObserver, morsel_id: MorselId, batch_ids: Vec<i32>) -> Self {
+        Self {
+            observer,
+            morsel_id,
+            batches: batch_ids.into(),
+            finished: false,
+        }
+    }
+}
+
+impl Stream for MockMorselStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        _cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        if let Some(batch_id) = self.batches.pop_front() {
+            self.observer.push(MorselEvent::MorselStreamBatchProduced {
+                morsel_id: self.morsel_id,
+                batch_id: BatchId(batch_id as usize),
+            });
+            return Poll::Ready(Some(Ok(single_value_batch(batch_id))));
+        }
+
+        if !self.finished {
+            self.finished = true;
+            self.observer.push(MorselEvent::MorselStreamFinished {
+                morsel_id: self.morsel_id,
+            });
+        }
+
+        Poll::Ready(None)
+    }
+}
+
+/// Deterministic future used to simulate planner I/O in tests.
+///
+/// It resolves after a configured number of pending polls and self-wakes so
+/// `FileStream` can make forward progress without timers or real async I/O.
+struct MockIoFuture {
+    observer: MorselObserver,
+    planner_id: PlannerId,
+    io_future_id: IoFutureId,
+    pending_polls_remaining: usize,
+}
+
+impl MockIoFuture {
+    fn new(
+        observer: MorselObserver,
+        planner_id: PlannerId,
+        io_future_id: IoFutureId,
+        pending_polls: usize,
+    ) -> Self {
+        Self {
+            observer,
+            planner_id,
+            io_future_id,
+            pending_polls_remaining: pending_polls,
+        }
+    }
+}
+
+impl Future for MockIoFuture {
+    type Output = Result<()>;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        self.observer.push(MorselEvent::IoFuturePolled {
+            planner_id: self.planner_id,
+            io_future_id: self.io_future_id,
+        });
+        if self.pending_polls_remaining > 0 {
+            self.pending_polls_remaining -= 1;
+            cx.waker().wake_by_ref();
+            return Poll::Pending;
+        }
+
+        self.observer.push(MorselEvent::IoFutureResolved {
+            planner_id: self.planner_id,
+            io_future_id: self.io_future_id,
+        });
+        Poll::Ready(Ok(()))
+    }
+}
+
+fn single_value_batch(value: i32) -> RecordBatch {
+    let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)]));
+    RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![value]))]).unwrap()
+}
+
+impl Display for MorselEvent {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            MorselEvent::MorselizeFile { path } => write!(f, "morselize_file: {path}"),
+            MorselEvent::PlannerCreated { planner_id } => {
+                write!(f, "planner_created: {planner_id:?}")
+            }
+            MorselEvent::PlannerPlanCalled { planner_id } => {
+                write!(f, "planner_called: {planner_id:?}")
+            }
+            MorselEvent::PlannerProducedChild {
+                planner_id,
+                child_planner_id,
+            } => write!(
+                f,
+                "planner_produced_child: {planner_id:?} -> {child_planner_id:?}"
+            ),
+            MorselEvent::IoFutureCreated {
+                planner_id,
+                io_future_id,
+            } => write!(f, "io_future_created: {planner_id:?}, {io_future_id:?}"),
+            MorselEvent::IoFuturePolled {
+                planner_id,
+                io_future_id,
+            } => write!(f, "io_future_polled: {planner_id:?}, {io_future_id:?}"),
+            MorselEvent::IoFutureResolved {
+                planner_id,
+                io_future_id,
+            } => write!(f, "io_future_resolved: {planner_id:?}, {io_future_id:?}"),
+            MorselEvent::MorselProduced {
+                planner_id,
+                morsel_id,
+            } => write!(f, "morsel_produced: {planner_id:?}, {morsel_id:?}"),
+            MorselEvent::MorselStreamStarted { morsel_id } => {
+                write!(f, "morsel_stream_started: {morsel_id:?}")
+            }
+            MorselEvent::MorselStreamBatchProduced {
+                morsel_id,
+                batch_id,
+            } => {
+                write!(
+                    f,
+                    "morsel_stream_batch_produced: {morsel_id:?}, {batch_id:?}"
+                )
+            }
+            MorselEvent::MorselStreamFinished { morsel_id } => {
+                write!(f, "morsel_stream_finished: {morsel_id:?}")
+            }
+        }
+    }
+}
diff --git a/datafusion/datasource/src/projection.rs b/datafusion/datasource/src/projection.rs
new file mode 100644
index 0000000000000..9a0cb494e495f
--- /dev/null
+++ b/datafusion/datasource/src/projection.rs
@@ -0,0 +1,630 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::datatypes::{Schema, SchemaRef};
+use datafusion_common::{
+    Result, ScalarValue,
+    tree_node::{Transformed, TransformedResult, TreeNode},
+};
+use datafusion_physical_expr::{
+    expressions::{Column, Literal},
+    projection::{ProjectionExpr, ProjectionExprs},
+};
+use futures::{FutureExt, StreamExt};
+use itertools::Itertools;
+
+use crate::{
+    PartitionedFile, TableSchema,
+    file_stream::{FileOpenFuture, FileOpener},
+};
+
+/// A file opener that handles applying a projection on top of an inner opener.
+///
+/// This includes handling partition columns.
+///
+/// Any projection pushed down will be split up into:
+/// - Simple column indices / column selection
+/// - A remainder projection that this opener applies on top of it
+///
+/// This is meant to simplify projection pushdown for sources like CSV
+/// that can only handle "simple" column selection.
+pub struct ProjectionOpener {
+    inner: Arc<dyn FileOpener>,
+    projection: ProjectionExprs,
+    input_schema: SchemaRef,
+    partition_columns: Vec<PartitionColumnIndex>,
+}
+
+impl ProjectionOpener {
+    pub fn try_new(
+        projection: SplitProjection,
+        inner: Arc<dyn FileOpener>,
+        file_schema: &Schema,
+    ) -> Result<Arc<dyn FileOpener>> {
+        Ok(Arc::new(ProjectionOpener {
+            inner,
+            projection: projection.remapped_projection,
+            input_schema: Arc::new(file_schema.project(&projection.file_indices)?),
+            partition_columns: projection.partition_columns,
+        }))
+    }
+}
+
+impl FileOpener for ProjectionOpener {
+    fn open(&self, partitioned_file: PartitionedFile) -> Result<FileOpenFuture> {
+        let partition_values = partitioned_file.partition_values.clone();
+        // Modify any references to partition columns in the projection expressions
+        // and substitute them with literal values from PartitionedFile.partition_values
+        let projection = if self.partition_columns.is_empty() {
+            self.projection.clone()
+        } else {
+            inject_partition_columns_into_projection(
+                &self.projection,
+                &self.partition_columns,
+                partition_values,
+            )
+        };
+        let projector = projection.make_projector(&self.input_schema)?;
+
+        let inner = self.inner.open(partitioned_file)?;
+
+        Ok(async move {
+            let stream = inner.await?;
+            let stream = stream.map(move |batch| {
+                let batch = batch?;
+                let batch = projector.project_batch(&batch)?;
+                Ok(batch)
+            });
+            Ok(stream.boxed())
+        }
+        .boxed())
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct PartitionColumnIndex {
+    /// The index of this partition column in the remainder projection (>= num_file_columns)
+    pub in_remainder_projection: usize,
+    /// The index of this partition column in the partition_values array
+    pub in_partition_values: usize,
+}
+
+fn inject_partition_columns_into_projection(
+    projection: &ProjectionExprs,
+    partition_columns: &[PartitionColumnIndex],
+    partition_values: Vec<ScalarValue>,
+) -> ProjectionExprs {
+    // Pre-create all literals for partition columns to avoid cloning ScalarValues multiple times.
+    let partition_literals: Vec<Arc<Literal>> = partition_values
+        .into_iter()
+        .map(|value| Arc::new(Literal::new(value)))
+        .collect();
+
+    let projections = projection
+        .iter()
+        .map(|projection| {
+            let expr = Arc::clone(&projection.expr)
+                .transform(|expr| {
+                    let original_expr = Arc::clone(&expr);
+                    if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                        // Check if this column index corresponds to a partition column
+                        if let Some(pci) = partition_columns
+                            .iter()
+                            .find(|pci| pci.in_remainder_projection == column.index())
+                        {
+                            let literal =
+                                Arc::clone(&partition_literals[pci.in_partition_values]);
+                            return Ok(Transformed::yes(literal));
+                        }
+                    }
+                    Ok(Transformed::no(original_expr))
+                })
+                .data()
+                .expect("infallible transform");
+            ProjectionExpr::new(expr, projection.alias.clone())
+        })
+        .collect_vec();
+    ProjectionExprs::new(projections)
+}
+
+/// At a high level the goal of SplitProjection is to take a ProjectionExprs meant to be applied to the table schema
+/// and split that into:
+/// - The projection indices into the file schema (file_indices)
+/// - The projection indices into the partition values (partition_value_indices), which pre-compute both the index into the table schema
+///   and the index into the partition values array
+/// - A remapped projection that can be applied after the file projection is applied
+///   This remapped projection has the following properties:
+///     - Column indices referring to file columns are remapped to [0..file_indices.len())
+///     - Column indices referring to partition columns are remapped to [file_indices.len()..)
+///
+///   This allows the ProjectionOpener to easily identify which columns in the remapped projection
+///   refer to partition columns and substitute them with literals from the partition values.
+#[derive(Debug, Clone)]
+pub struct SplitProjection {
+    /// The original projection this [`SplitProjection`] was derived from
+    pub source: ProjectionExprs,
+    /// Column indices to read from file (public for file sources)
+    pub file_indices: Vec<usize>,
+    /// Pre-computed partition column mappings (internal, used by ProjectionOpener)
+    pub(crate) partition_columns: Vec<PartitionColumnIndex>,
+    /// The remapped projection (internal, used by ProjectionOpener)
+    pub(crate) remapped_projection: ProjectionExprs,
+}
+
+impl SplitProjection {
+    pub fn unprojected(table_schema: &TableSchema) -> Self {
+        let projection = ProjectionExprs::from_indices(
+            &(0..table_schema.table_schema().fields().len()).collect_vec(),
+            table_schema.table_schema(),
+        );
+        Self::new(table_schema.file_schema(), &projection)
+    }
+
+    /// Creates a new [`SplitProjection`] by splitting a projection into
+    /// simple file column indices and a remainder projection that is applied after reading the file.
+    ///
+    /// In other words: we get a `Vec<usize>` projection that is meant to be applied on top of `file_schema`
+    /// and a remainder projection that is applied to the result of that first projection.
+    ///
+    /// Here `file_schema` is expected to be the *logical* schema of the file, that is the
+    /// table schema minus any partition columns.
+    /// Partition columns are always expected to be at the end of the table schema.
+    /// Note that `file_schema` is *not* the physical schema of the file.
+    pub fn new(logical_file_schema: &Schema, projection: &ProjectionExprs) -> Self {
+        let num_file_schema_columns = logical_file_schema.fields().len();
+
+        // Collect all unique columns and classify as file or partition
+        let mut file_columns = Vec::new();
+        let mut partition_columns = Vec::new();
+        let mut all_columns = std::collections::HashMap::new();
+
+        // Extract all unique column references (index -> name)
+        for proj_expr in projection {
+            proj_expr
+                .expr
+                .apply(|expr| {
+                    if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                        all_columns
+                            .entry(column.index())
+                            .or_insert_with(|| column.name().to_string());
+                    }
+                    Ok(datafusion_common::tree_node::TreeNodeRecursion::Continue)
+                })
+                .expect("infallible apply");
+        }
+
+        // Sort by index and classify into file vs partition columns
+        let mut sorted_columns: Vec<_> = all_columns
+            .into_iter()
+            .map(|(idx, name)| (name, idx))
+            .collect();
+        sorted_columns.sort_by_key(|(_, idx)| *idx);
+
+        // Separate file and partition columns, assigning final indices
+        // Pre-create all remapped columns to avoid duplicate Arc'd expressions
+        let mut column_mapping = std::collections::HashMap::new();
+        let mut file_idx = 0;
+        let mut partition_idx = 0;
+
+        for (name, original_index) in sorted_columns {
+            let new_index = if original_index < num_file_schema_columns {
+                // File column: gets index [0..num_file_columns)
+                file_columns.push(original_index);
+                let idx = file_idx;
+                file_idx += 1;
+                idx
+            } else {
+                // Partition column: gets index [num_file_columns..)
+                partition_columns.push(original_index);
+                let idx = file_idx + partition_idx;
+                partition_idx += 1;
+                idx
+            };
+
+            // Pre-create the remapped column so all references can share the same Arc
+            let new_column: Arc<dyn datafusion_physical_plan::PhysicalExpr> =
+                Arc::new(Column::new(&name, new_index));
+            column_mapping.insert(original_index, new_column);
+        }
+
+        // Single tree transformation: remap all column references using pre-created columns
+        let remapped_projection = projection
+            .iter()
+            .map(|proj_expr| {
+                let expr = Arc::clone(&proj_expr.expr)
+                    .transform(|expr| {
+                        let original_expr = Arc::clone(&expr);
+                        if let Some(column) = expr.as_any().downcast_ref::<Column>()
+                            && let Some(new_column) = column_mapping.get(&column.index())
+                        {
+                            return Ok(Transformed::yes(Arc::clone(new_column)));
+                        }
+                        Ok(Transformed::no(original_expr))
+                    })
+                    .data()
+                    .expect("infallible transform");
+                ProjectionExpr::new(expr, proj_expr.alias.clone())
+            })
+            .collect_vec();
+
+        // Pre-compute partition column mappings for ProjectionOpener
+        let num_file_columns = file_columns.len();
+        let partition_column_mappings = partition_columns
+            .iter()
+            .enumerate()
+            .map(|(partition_idx, &table_index)| PartitionColumnIndex {
+                in_remainder_projection: num_file_columns + partition_idx,
+                in_partition_values: table_index - num_file_schema_columns,
+            })
+            .collect_vec();
+
+        Self {
+            source: projection.clone(),
+            file_indices: file_columns,
+            partition_columns: partition_column_mappings,
+            remapped_projection: ProjectionExprs::from(remapped_projection),
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::sync::Arc;
+
+    use arrow::array::AsArray;
+    use arrow::datatypes::{DataType, SchemaRef};
+    use datafusion_common::{DFSchema, ScalarValue, record_batch};
+    use datafusion_expr::{Expr, col, execution_props::ExecutionProps};
+    use datafusion_physical_expr::{create_physical_exprs, projection::ProjectionExpr};
+    use itertools::Itertools;
+
+    use super::*;
+
+    fn create_projection_exprs<'a>(
+        exprs: impl IntoIterator<Item = &'a Expr>,
+        schema: &SchemaRef,
+    ) -> ProjectionExprs {
+        let df_schema = DFSchema::try_from(Arc::clone(schema)).unwrap();
+        let physical_exprs =
+            create_physical_exprs(exprs, &df_schema, &ExecutionProps::default()).unwrap();
+        let projection_exprs = physical_exprs
+            .into_iter()
+            .enumerate()
+            .map(|(i, e)| ProjectionExpr::new(Arc::clone(&e), format!("col{i}")))
+            .collect_vec();
+        ProjectionExprs::from(projection_exprs)
+    }
+
+    #[test]
+    fn test_split_projection_with_partition_columns() {
+        use arrow::array::AsArray;
+        use arrow::datatypes::Field;
+        // Simulate the avro_exec_with_partition test scenario:
+        // file_schema has 3 fields
+        let file_schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("bool_col", DataType::Boolean, false),
+            Field::new("tinyint_col", DataType::Int8, false),
+        ]));
+
+        // table_schema has 4 fields (3 file + 1 partition)
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("bool_col", DataType::Boolean, false),
+            Field::new("tinyint_col", DataType::Int8, false),
+            Field::new("date", DataType::Utf8, false), // partition column at index 3
+        ]));
+
+        // projection indices: [0, 1, 3, 2]
+        // This should select: id (0), bool_col (1), date (3-partition), tinyint_col (2)
+        let projection_indices = vec![0, 1, 3, 2];
+
+        // Create projection expressions from indices using the table schema
+        let projection =
+            ProjectionExprs::from_indices(&projection_indices, &table_schema);
+
+        // Call SplitProjection to separate file and partition columns
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // The file_indices should be [0, 1, 2] (all file columns needed)
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+
+        // Should have 1 partition column at in_partition_values index 0
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+
+        // Now create a batch with only the file columns
+        let file_batch = record_batch!(
+            ("id", Int32, vec![4]),
+            ("bool_col", Boolean, vec![true]),
+            ("tinyint_col", Int8, vec![0])
+        )
+        .unwrap();
+
+        // After the fix, the remainder projection should have remapped indices:
+        // - File columns: [0, 1, 2] (unchanged since they're already in order)
+        // - Partition column: [3] (stays at index 3, which is >= num_file_columns)
+        // So the remainder expects input columns [0, 1, 2] and references column [3] for partition
+
+        // Verify that we can inject partition columns and apply the projection
+        let partition_values = vec![ScalarValue::from("2021-10-26")];
+
+        // Create partition column mapping
+        let partition_columns = vec![PartitionColumnIndex {
+            in_remainder_projection: 3, // partition column is at index 3 in remainder
+            in_partition_values: 0,     // first partition value
+        }];
+
+        // Inject partition columns (replaces Column(3) with Literal)
+        let injected_projection = inject_partition_columns_into_projection(
+            &split.remapped_projection,
+            &partition_columns,
+            partition_values,
+        );
+
+        // Now the projection should work on the file batch
+        let projector = injected_projection
+            .make_projector(&file_batch.schema())
+            .unwrap();
+        let result = projector.project_batch(&file_batch).unwrap();
+
+        // Verify the output has the correct column order: id, bool_col, date, tinyint_col
+        assert_eq!(result.num_columns(), 4);
+        assert_eq!(
+            result
+                .column(0)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            4
+        );
+        assert!(result.column(1).as_boolean().value(0));
+        assert_eq!(result.column(2).as_string::<i32>().value(0), "2021-10-26");
+        assert_eq!(
+            result
+                .column(3)
+                .as_primitive::<arrow::datatypes::Int8Type>()
+                .value(0),
+            0
+        );
+    }
+
+    // ========================================================================
+    // Comprehensive Test Suite for SplitProjection
+    // ========================================================================
+
+    // Helper to create test schemas with file and partition columns
+    fn create_test_schemas(
+        file_cols: usize,
+        partition_cols: usize,
+    ) -> (SchemaRef, SchemaRef) {
+        use arrow::datatypes::Field;
+
+        let file_fields: Vec<_> = (0..file_cols)
+            .map(|i| Field::new(format!("col_{i}"), DataType::Int32, false))
+            .collect();
+
+        let mut table_fields = file_fields.clone();
+        table_fields.extend(
+            (0..partition_cols)
+                .map(|i| Field::new(format!("part_{i}"), DataType::Utf8, false)),
+        );
+
+        (
+            Arc::new(Schema::new(file_fields)),
+            Arc::new(Schema::new(table_fields)),
+        )
+    }
+
+    // ========================================================================
+    // Partition Column Handling Tests
+    // ========================================================================
+
+    #[test]
+    fn test_split_projection_only_file_columns() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Select only file columns [0, 1, 2]
+        let projection = ProjectionExprs::from_indices(&[0, 1, 2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+        assert_eq!(split.partition_columns.len(), 0);
+    }
+
+    #[test]
+    fn test_split_projection_only_partition_columns() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Select only partition columns [3, 4]
+        let projection = ProjectionExprs::from_indices(&[3, 4], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 2);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+    }
+
+    #[test]
+    fn test_split_projection_multiple_partition_columns() {
+        let (file_schema, table_schema) = create_test_schemas(2, 3);
+        // File cols: 0, 1; Partition cols: 2, 3, 4
+        // Select: [0, 2, 4, 1, 3] (mixed file and partition)
+        let projection = ProjectionExprs::from_indices(&[0, 2, 4, 1, 3], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1]);
+        assert_eq!(split.partition_columns.len(), 3);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+        assert_eq!(split.partition_columns[2].in_partition_values, 2);
+
+        // Verify remapped projection has correct indices
+        // File columns should be at [0, 1], partition columns at [2, 3, 4]
+        assert_eq!(split.remapped_projection.iter().count(), 5);
+    }
+
+    #[test]
+    fn test_split_projection_partition_columns_reverse_order() {
+        let (file_schema, table_schema) = create_test_schemas(2, 2);
+        // File cols: 0, 1; Partition cols: 2, 3
+        // Select: [3, 2] (partitions in reverse)
+        let projection = ProjectionExprs::from_indices(&[3, 2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 2);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+    }
+
+    #[test]
+    fn test_split_projection_interleaved_file_and_partition() {
+        let (file_schema, table_schema) = create_test_schemas(3, 3);
+        // File cols: 0, 1, 2; Partition cols: 3, 4, 5
+        // Select: [0, 3, 1, 4, 2, 5] (alternating)
+        let projection =
+            ProjectionExprs::from_indices(&[0, 3, 1, 4, 2, 5], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+        assert_eq!(split.partition_columns.len(), 3);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+        assert_eq!(split.partition_columns[2].in_partition_values, 2);
+    }
+
+    #[test]
+    fn test_split_projection_expression_with_file_and_partition_columns() {
+        use arrow::datatypes::Field;
+
+        // Create schemas: 2 file columns, 1 partition column
+        let file_schema = Arc::new(Schema::new(vec![
+            Field::new("file_a", DataType::Int32, false),
+            Field::new("file_b", DataType::Int32, false),
+        ]));
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("file_a", DataType::Int32, false),
+            Field::new("file_b", DataType::Int32, false),
+            Field::new("part_c", DataType::Int32, false),
+        ]));
+
+        // Create expression: file_a + part_c
+        let exprs = [col("file_a") + col("part_c")];
+        let projection = create_projection_exprs(exprs.iter(), &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // Should extract both columns
+        assert_eq!(split.file_indices, vec![0]);
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+    }
+
+    // ========================================================================
+    // Category 4: Boundary Conditions
+    // ========================================================================
+
+    #[test]
+    fn test_split_projection_boundary_last_file_column() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Last file column is index 2
+        let projection = ProjectionExprs::from_indices(&[2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![2]);
+        assert_eq!(split.partition_columns.len(), 0);
+    }
+
+    #[test]
+    fn test_split_projection_boundary_first_partition_column() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // First partition column is index 3
+        let projection = ProjectionExprs::from_indices(&[3], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+    }
+
+    // ========================================================================
+    // Category 6: Integration Tests
+    // ========================================================================
+
+    #[test]
+    fn test_inject_partition_columns_multiple_partitions() {
+        let data =
+            record_batch!(("col_0", Int32, vec![1]), ("col_1", Int32, vec![2])).unwrap();
+
+        // Create projection that references file columns and partition columns
+        let (file_schema, table_schema) = create_test_schemas(2, 2);
+        // Projection: [0, 2, 1, 3] = [file_0, part_0, file_1, part_1]
+        let projection = ProjectionExprs::from_indices(&[0, 2, 1, 3], &table_schema);
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // Create partition column mappings
+        let partition_columns = vec![
+            PartitionColumnIndex {
+                in_remainder_projection: 2, // First partition column at index 2
+                in_partition_values: 0,
+            },
+            PartitionColumnIndex {
+                in_remainder_projection: 3, // Second partition column at index 3
+                in_partition_values: 1,
+            },
+        ];
+
+        let partition_values =
+            vec![ScalarValue::from("part_a"), ScalarValue::from("part_b")];
+
+        let injected = inject_partition_columns_into_projection(
+            &split.remapped_projection,
+            &partition_columns,
+            partition_values,
+        );
+
+        // Apply projection
+        let projector = injected.make_projector(&data.schema()).unwrap();
+        let result = projector.project_batch(&data).unwrap();
+
+        assert_eq!(result.num_columns(), 4);
+        assert_eq!(
+            result
+                .column(0)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            1
+        );
+        assert_eq!(result.column(1).as_string::<i32>().value(0), "part_a");
+        assert_eq!(
+            result
+                .column(2)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            2
+        );
+        assert_eq!(result.column(3).as_string::<i32>().value(0), "part_b");
+    }
+}
diff --git a/datafusion/datasource/src/schema_adapter.rs b/datafusion/datasource/src/schema_adapter.rs
index bacec7f4f9f00..c995fa58d6c89 100644
--- a/datafusion/datasource/src/schema_adapter.rs
+++ b/datafusion/datasource/src/schema_adapter.rs
@@ -15,451 +15,218 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`SchemaAdapter`] and [`SchemaAdapterFactory`] to adapt file-level record batches to a table schema.
+//! Deprecated: [`SchemaAdapter`] and [`SchemaAdapterFactory`] have been removed.
 //!
-//! Adapter provides a method of translating the RecordBatches that come out of the
-//! physical format into how they should be used by DataFusion.  For instance, a schema
-//! can be stored external to a parquet file that maps parquet logical types to arrow types.
+//! Use [`PhysicalExprAdapterFactory`] instead. See `upgrading.md` for more details.
+//!
+//! [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+
+#![allow(deprecated)]
 
-use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions};
-use arrow::compute::{can_cast_types, cast};
-use arrow::datatypes::{Schema, SchemaRef};
-use datafusion_common::{plan_err, ColumnStatistics};
+use arrow::array::{ArrayRef, RecordBatch};
+use arrow::datatypes::{Field, Schema, SchemaRef};
+use datafusion_common::{ColumnStatistics, Result, not_impl_err};
+use log::warn;
 use std::fmt::Debug;
 use std::sync::Arc;
 
-/// Factory for creating [`SchemaAdapter`]
+/// Deprecated: Function type for casting columns.
+///
+/// This type has been removed. Use [`PhysicalExprAdapterFactory`] instead.
+/// See `upgrading.md` for more details.
+///
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+#[deprecated(
+    since = "52.0.0",
+    note = "SchemaAdapter has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
+pub type CastColumnFn = dyn Fn(&ArrayRef, &Field, &arrow::compute::CastOptions) -> Result<ArrayRef>
+    + Send
+    + Sync;
+
+/// Deprecated: Factory for creating [`SchemaAdapter`].
 ///
-/// This interface provides a way to implement custom schema adaptation logic
-/// for DataSourceExec (for example, to fill missing columns with default value
-/// other than null).
+/// This trait has been removed. Use [`PhysicalExprAdapterFactory`] instead.
+/// See `upgrading.md` for more details.
 ///
-/// Most users should use [`DefaultSchemaAdapterFactory`]. See that struct for
-/// more details and examples.
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+#[deprecated(
+    since = "52.0.0",
+    note = "SchemaAdapter has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
 pub trait SchemaAdapterFactory: Debug + Send + Sync + 'static {
     /// Create a [`SchemaAdapter`]
-    ///
-    /// Arguments:
-    ///
-    /// * `projected_table_schema`: The schema for the table, projected to
-    ///   include only the fields being output (projected) by the this mapping.
-    ///
-    /// * `table_schema`: The entire table schema for the table
     fn create(
         &self,
         projected_table_schema: SchemaRef,
         table_schema: SchemaRef,
     ) -> Box<dyn SchemaAdapter>;
+
+    /// Create a [`SchemaAdapter`] using only the projected table schema.
+    fn create_with_projected_schema(
+        &self,
+        projected_table_schema: SchemaRef,
+    ) -> Box<dyn SchemaAdapter> {
+        self.create(Arc::clone(&projected_table_schema), projected_table_schema)
+    }
 }
 
-/// Creates [`SchemaMapper`]s to map file-level [`RecordBatch`]es to a table
-/// schema, which may have a schema obtained from merging multiple file-level
-/// schemas.
+/// Deprecated: Creates [`SchemaMapper`]s to map file-level [`RecordBatch`]es to a table schema.
 ///
-/// This is useful for implementing schema evolution in partitioned datasets.
+/// This trait has been removed. Use [`PhysicalExprAdapterFactory`] instead.
+/// See `upgrading.md` for more details.
 ///
-/// See [`DefaultSchemaAdapterFactory`] for more details and examples.
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+#[deprecated(
+    since = "52.0.0",
+    note = "SchemaAdapter has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
 pub trait SchemaAdapter: Send + Sync {
-    /// Map a column index in the table schema to a column index in a particular
-    /// file schema
-    ///
-    /// This is used while reading a file to push down projections by mapping
-    /// projected column indexes from the table schema to the file schema
-    ///
-    /// Panics if index is not in range for the table schema
+    /// Map a column index in the table schema to a column index in a particular file schema.
     fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize>;
 
-    /// Creates a mapping for casting columns from the file schema to the table
-    /// schema.
-    ///
-    /// This is used after reading a record batch. The returned [`SchemaMapper`]:
-    ///
-    /// 1. Maps columns to the expected columns indexes
-    /// 2. Handles missing values (e.g. fills nulls or a default value) for
-    ///    columns in the in the table schema not in the file schema
-    /// 2. Handles different types: if the column in the file schema has a
-    ///    different type than `table_schema`, the mapper will resolve this
-    ///    difference (e.g. by casting to the appropriate type)
-    ///
-    /// Returns:
-    /// * a [`SchemaMapper`]
-    /// * an ordered list of columns to project from the file
+    /// Creates a mapping for casting columns from the file schema to the table schema.
     fn map_schema(
         &self,
         file_schema: &Schema,
-    ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
 }
 
-/// Maps, columns from a specific file schema to the table schema.
+/// Deprecated: Maps columns from a specific file schema to the table schema.
+///
+/// This trait has been removed. Use [`PhysicalExprAdapterFactory`] instead.
+/// See `upgrading.md` for more details.
 ///
-/// See [`DefaultSchemaAdapterFactory`] for more details and examples.
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+#[deprecated(
+    since = "52.0.0",
+    note = "SchemaMapper has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
 pub trait SchemaMapper: Debug + Send + Sync {
-    /// Adapts a `RecordBatch` to match the `table_schema`
-    fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
+    /// Adapts a `RecordBatch` to match the `table_schema`.
+    fn map_batch(&self, batch: RecordBatch) -> Result<RecordBatch>;
 
-    /// Adapts file-level column `Statistics` to match the `table_schema`
+    /// Adapts file-level column `Statistics` to match the `table_schema`.
     fn map_column_statistics(
         &self,
         file_col_statistics: &[ColumnStatistics],
-    ) -> datafusion_common::Result<Vec<ColumnStatistics>>;
+    ) -> Result<Vec<ColumnStatistics>>;
 }
 
-/// Default  [`SchemaAdapterFactory`] for mapping schemas.
-///
-/// This can be used to adapt file-level record batches to a table schema and
-/// implement schema evolution.
-///
-/// Given an input file schema and a table schema, this factory returns
-/// [`SchemaAdapter`] that return [`SchemaMapper`]s that:
-///
-/// 1. Reorder columns
-/// 2. Cast columns to the correct type
-/// 3. Fill missing columns with nulls
-///
-/// # Errors:
-///
-/// * If a column in the table schema is non-nullable but is not present in the
-///   file schema (i.e. it is missing), the returned mapper tries to fill it with
-///   nulls resulting in a schema error.
-///
-/// # Illustration of Schema Mapping
+/// Deprecated: Default [`SchemaAdapterFactory`] for mapping schemas.
 ///
-/// ```text
-/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─                  ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
-///  ┌───────┐   ┌───────┐ │                  ┌───────┐   ┌───────┐   ┌───────┐ │
-/// ││  1.0  │   │ "foo" │                   ││ NULL  │   │ "foo" │   │ "1.0" │
-///  ├───────┤   ├───────┤ │ Schema mapping   ├───────┤   ├───────┤   ├───────┤ │
-/// ││  2.0  │   │ "bar" │                   ││  NULL │   │ "bar" │   │ "2.0" │
-///  └───────┘   └───────┘ │────────────────▶ └───────┘   └───────┘   └───────┘ │
-/// │                                        │
-///  column "c"  column "b"│                  column "a"  column "b"  column "c"│
-/// │ Float64       Utf8                     │  Int32        Utf8        Utf8
-///  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘                  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
-///     Input Record Batch                         Output Record Batch
+/// This struct has been removed.
 ///
-///     Schema {                                   Schema {
-///      "c": Float64,                              "a": Int32,
-///      "b": Utf8,                                 "b": Utf8,
-///     }                                           "c": Utf8,
-///                                                }
-/// ```
+/// Use [`PhysicalExprAdapterFactory`] instead to customize scans via
+/// [`FileScanConfigBuilder`], i.e. if you had implemented a custom [`SchemaAdapter`]
+/// and passed that into [`FileScanConfigBuilder`] / [`ParquetSource`].
+/// Use [`BatchAdapter`] if you want to map a stream of [`RecordBatch`]es
+/// between one schema and another, i.e. if you were calling [`SchemaMapper::map_batch`] manually.
 ///
-/// # Example of using the `DefaultSchemaAdapterFactory` to map [`RecordBatch`]s
+/// See `upgrading.md` for more details.
 ///
-/// Note `SchemaMapping` also supports mapping partial batches, which is used as
-/// part of predicate pushdown.
-///
-/// ```
-/// # use std::sync::Arc;
-/// # use arrow::datatypes::{DataType, Field, Schema};
-/// # use datafusion_datasource::schema_adapter::{DefaultSchemaAdapterFactory, SchemaAdapterFactory};
-/// # use datafusion_common::record_batch;
-/// // Table has fields "a",  "b" and "c"
-/// let table_schema = Schema::new(vec![
-///     Field::new("a", DataType::Int32, true),
-///     Field::new("b", DataType::Utf8, true),
-///     Field::new("c", DataType::Utf8, true),
-/// ]);
-///
-/// // create an adapter to map the table schema to the file schema
-/// let adapter = DefaultSchemaAdapterFactory::from_schema(Arc::new(table_schema));
-///
-/// // The file schema has fields "c" and "b" but "b" is stored as an 'Float64'
-/// // instead of 'Utf8'
-/// let file_schema = Schema::new(vec![
-///    Field::new("c", DataType::Utf8, true),
-///    Field::new("b", DataType::Float64, true),
-/// ]);
-///
-/// // Get a mapping from the file schema to the table schema
-/// let (mapper, _indices) = adapter.map_schema(&file_schema).unwrap();
-///
-/// let file_batch = record_batch!(
-///     ("c", Utf8, vec!["foo", "bar"]),
-///     ("b", Float64, vec![1.0, 2.0])
-/// ).unwrap();
-///
-/// let mapped_batch = mapper.map_batch(file_batch).unwrap();
-///
-/// // the mapped batch has the correct schema and the "b" column has been cast to Utf8
-/// let expected_batch = record_batch!(
-///    ("a", Int32, vec![None, None]),  // missing column filled with nulls
-///    ("b", Utf8, vec!["1.0", "2.0"]), // b was cast to string and order was changed
-///    ("c", Utf8, vec!["foo", "bar"])
-/// ).unwrap();
-/// assert_eq!(mapped_batch, expected_batch);
-/// ```
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+/// [`FileScanConfigBuilder`]: crate::file_scan_config::FileScanConfigBuilder
+/// [`ParquetSource`]: https://docs.rs/datafusion-datasource-parquet/latest/datafusion_datasource_parquet/source/struct.ParquetSource.html
+/// [`BatchAdapter`]: datafusion_physical_expr_adapter::BatchAdapter
+#[deprecated(
+    since = "52.0.0",
+    note = "DefaultSchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
 #[derive(Clone, Debug, Default)]
 pub struct DefaultSchemaAdapterFactory;
 
-impl DefaultSchemaAdapterFactory {
-    /// Create a new factory for mapping batches from a file schema to a table
-    /// schema.
-    ///
-    /// This is a convenience for [`DefaultSchemaAdapterFactory::create`] with
-    /// the same schema for both the projected table schema and the table
-    /// schema.
-    pub fn from_schema(table_schema: SchemaRef) -> Box<dyn SchemaAdapter> {
-        Self.create(Arc::clone(&table_schema), table_schema)
-    }
-}
-
 impl SchemaAdapterFactory for DefaultSchemaAdapterFactory {
     fn create(
         &self,
         projected_table_schema: SchemaRef,
         _table_schema: SchemaRef,
     ) -> Box<dyn SchemaAdapter> {
-        Box::new(DefaultSchemaAdapter {
-            projected_table_schema,
+        Box::new(DeprecatedSchemaAdapter {
+            _projected_table_schema: projected_table_schema,
+        })
+    }
+}
+
+impl DefaultSchemaAdapterFactory {
+    /// Deprecated: Create a new factory for mapping batches from a file schema to a table schema.
+    #[deprecated(
+        since = "52.0.0",
+        note = "DefaultSchemaAdapterFactory has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+    )]
+    pub fn from_schema(table_schema: SchemaRef) -> Box<dyn SchemaAdapter> {
+        // Note: this method did not return an error thus the errors are raised from the returned adapter
+        warn!(
+            "DefaultSchemaAdapterFactory::from_schema is deprecated. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+        );
+        Box::new(DeprecatedSchemaAdapter {
+            _projected_table_schema: table_schema,
         })
     }
 }
 
-/// This SchemaAdapter requires both the table schema and the projected table
-/// schema. See  [`SchemaMapping`] for more details
-#[derive(Clone, Debug)]
-pub(crate) struct DefaultSchemaAdapter {
-    /// The schema for the table, projected to include only the fields being output (projected) by the
-    /// associated ParquetSource
-    projected_table_schema: SchemaRef,
+/// Internal deprecated adapter that returns errors when methods are called.
+struct DeprecatedSchemaAdapter {
+    _projected_table_schema: SchemaRef,
 }
 
-impl SchemaAdapter for DefaultSchemaAdapter {
-    /// Map a column index in the table schema to a column index in a particular
-    /// file schema
-    ///
-    /// Panics if index is not in range for the table schema
-    fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
-        let field = self.projected_table_schema.field(index);
-        Some(file_schema.fields.find(field.name())?.0)
+impl SchemaAdapter for DeprecatedSchemaAdapter {
+    fn map_column_index(&self, _index: usize, _file_schema: &Schema) -> Option<usize> {
+        None // Safe no-op
     }
 
-    /// Creates a `SchemaMapping` for casting or mapping the columns from the
-    /// file schema to the table schema.
-    ///
-    /// If the provided `file_schema` contains columns of a different type to
-    /// the expected `table_schema`, the method will attempt to cast the array
-    /// data from the file schema to the table schema where possible.
-    ///
-    /// Returns a [`SchemaMapping`] that can be applied to the output batch
-    /// along with an ordered list of columns to project from the file
     fn map_schema(
         &self,
-        file_schema: &Schema,
-    ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
-        let mut projection = Vec::with_capacity(file_schema.fields().len());
-        let mut field_mappings = vec![None; self.projected_table_schema.fields().len()];
-
-        for (file_idx, file_field) in file_schema.fields.iter().enumerate() {
-            if let Some((table_idx, table_field)) =
-                self.projected_table_schema.fields().find(file_field.name())
-            {
-                match can_cast_types(file_field.data_type(), table_field.data_type()) {
-                    true => {
-                        field_mappings[table_idx] = Some(projection.len());
-                        projection.push(file_idx);
-                    }
-                    false => {
-                        return plan_err!(
-                            "Cannot cast file schema field {} of type {:?} to table schema field of type {:?}",
-                            file_field.name(),
-                            file_field.data_type(),
-                            table_field.data_type()
-                        )
-                    }
-                }
-            }
-        }
-
-        Ok((
-            Arc::new(SchemaMapping {
-                projected_table_schema: Arc::clone(&self.projected_table_schema),
-                field_mappings,
-            }),
-            projection,
-        ))
+        _file_schema: &Schema,
+    ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
+        not_impl_err!(
+            "SchemaAdapter has been removed. Use PhysicalExprAdapterFactory instead. \
+            See upgrading.md for more details."
+        )
     }
 }
 
-/// The SchemaMapping struct holds a mapping from the file schema to the table
-/// schema and any necessary type conversions.
+/// Deprecated: The SchemaMapping struct held a mapping from the file schema to the table schema.
 ///
-/// [`map_batch`] is used by the ParquetOpener to produce a RecordBatch which
-/// has the projected schema, since that's the schema which is supposed to come
-/// out of the execution of this query. Thus `map_batch` uses
-/// `projected_table_schema` as it can only operate on the projected fields.
+/// This struct has been removed.
 ///
-/// [`map_batch`]: Self::map_batch
+/// Use [`PhysicalExprAdapterFactory`] instead to customize scans via
+/// [`FileScanConfigBuilder`], i.e. if you had implemented a custom [`SchemaAdapter`]
+/// and passed that into [`FileScanConfigBuilder`] / [`ParquetSource`].
+/// Use [`BatchAdapter`] if you want to map a stream of [`RecordBatch`]es
+/// between one schema and another, i.e. if you were calling [`SchemaMapper::map_batch`] manually.
+///
+/// See `upgrading.md` for more details.
+///
+/// [`PhysicalExprAdapterFactory`]: datafusion_physical_expr_adapter::PhysicalExprAdapterFactory
+/// [`FileScanConfigBuilder`]: crate::file_scan_config::FileScanConfigBuilder
+/// [`ParquetSource`]: https://docs.rs/datafusion-datasource-parquet/latest/datafusion_datasource_parquet/source/struct.ParquetSource.html
+/// [`BatchAdapter`]: datafusion_physical_expr_adapter::BatchAdapter
+#[deprecated(
+    since = "52.0.0",
+    note = "SchemaMapping has been removed. Use PhysicalExprAdapterFactory instead. See upgrading.md for more details."
+)]
 #[derive(Debug)]
 pub struct SchemaMapping {
-    /// The schema of the table. This is the expected schema after conversion
-    /// and it should match the schema of the query result.
-    projected_table_schema: SchemaRef,
-    /// Mapping from field index in `projected_table_schema` to index in
-    /// projected file_schema.
-    ///
-    /// They are Options instead of just plain `usize`s because the table could
-    /// have fields that don't exist in the file.
-    field_mappings: Vec<Option<usize>>,
+    // Private fields removed - this is a skeleton for deprecation purposes only
+    _private: (),
 }
 
 impl SchemaMapper for SchemaMapping {
-    /// Adapts a `RecordBatch` to match the `projected_table_schema` using the stored mapping and
-    /// conversions.
-    /// The produced RecordBatch has a schema that contains only the projected columns.
-    fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch> {
-        let batch_rows = batch.num_rows();
-        let batch_cols = batch.columns().to_vec();
-
-        let cols = self
-            .projected_table_schema
-            // go through each field in the projected schema
-            .fields()
-            .iter()
-            // and zip it with the index that maps fields from the projected table schema to the
-            // projected file schema in `batch`
-            .zip(&self.field_mappings)
-            // and for each one...
-            .map(|(field, file_idx)| {
-                file_idx.map_or_else(
-                    // If this field only exists in the table, and not in the file, then we know
-                    // that it's null, so just return that.
-                    || Ok(new_null_array(field.data_type(), batch_rows)),
-                    // However, if it does exist in both, then try to cast it to the correct output
-                    // type
-                    |batch_idx| cast(&batch_cols[batch_idx], field.data_type()),
-                )
-            })
-            .collect::<datafusion_common::Result<Vec<_>, _>>()?;
-
-        // Necessary to handle empty batches
-        let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
-
-        let schema = Arc::clone(&self.projected_table_schema);
-        let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?;
-        Ok(record_batch)
+    fn map_batch(&self, _batch: RecordBatch) -> Result<RecordBatch> {
+        not_impl_err!(
+            "SchemaMapping has been removed. Use PhysicalExprAdapterFactory instead. \
+            See upgrading.md for more details."
+        )
     }
 
-    /// Adapts file-level column `Statistics` to match the `table_schema`
     fn map_column_statistics(
         &self,
-        file_col_statistics: &[ColumnStatistics],
-    ) -> datafusion_common::Result<Vec<ColumnStatistics>> {
-        let mut table_col_statistics = vec![];
-
-        // Map the statistics for each field in the file schema to the corresponding field in the
-        // table schema, if a field is not present in the file schema, we need to fill it with `ColumnStatistics::new_unknown`
-        for (_, file_col_idx) in self
-            .projected_table_schema
-            .fields()
-            .iter()
-            .zip(&self.field_mappings)
-        {
-            if let Some(file_col_idx) = file_col_idx {
-                table_col_statistics.push(
-                    file_col_statistics
-                        .get(*file_col_idx)
-                        .cloned()
-                        .unwrap_or_default(),
-                );
-            } else {
-                table_col_statistics.push(ColumnStatistics::new_unknown());
-            }
-        }
-
-        Ok(table_col_statistics)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use arrow::datatypes::{DataType, Field};
-    use datafusion_common::{stats::Precision, Statistics};
-
-    use super::*;
-
-    #[test]
-    fn test_schema_mapping_map_statistics_basic() {
-        // Create table schema (a, b, c)
-        let table_schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-            Field::new("c", DataType::Float64, true),
-        ]));
-
-        // Create file schema (b, a) - different order, missing c
-        let file_schema = Schema::new(vec![
-            Field::new("b", DataType::Utf8, true),
-            Field::new("a", DataType::Int32, true),
-        ]);
-
-        // Create SchemaAdapter
-        let adapter = DefaultSchemaAdapter {
-            projected_table_schema: Arc::clone(&table_schema),
-        };
-
-        // Get mapper and projection
-        let (mapper, projection) = adapter.map_schema(&file_schema).unwrap();
-
-        // Should project columns 0,1 from file
-        assert_eq!(projection, vec![0, 1]);
-
-        // Create file statistics
-        let mut file_stats = Statistics::default();
-
-        // Statistics for column b (index 0 in file)
-        let b_stats = ColumnStatistics {
-            null_count: Precision::Exact(5),
-            ..Default::default()
-        };
-
-        // Statistics for column a (index 1 in file)
-        let a_stats = ColumnStatistics {
-            null_count: Precision::Exact(10),
-            ..Default::default()
-        };
-
-        file_stats.column_statistics = vec![b_stats, a_stats];
-
-        // Map statistics
-        let table_col_stats = mapper
-            .map_column_statistics(&file_stats.column_statistics)
-            .unwrap();
-
-        // Verify stats
-        assert_eq!(table_col_stats.len(), 3);
-        assert_eq!(table_col_stats[0].null_count, Precision::Exact(10)); // a from file idx 1
-        assert_eq!(table_col_stats[1].null_count, Precision::Exact(5)); // b from file idx 0
-        assert_eq!(table_col_stats[2].null_count, Precision::Absent); // c (unknown)
-    }
-
-    #[test]
-    fn test_schema_mapping_map_statistics_empty() {
-        // Create schemas
-        let table_schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-        ]));
-        let file_schema = Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-        ]);
-
-        let adapter = DefaultSchemaAdapter {
-            projected_table_schema: Arc::clone(&table_schema),
-        };
-        let (mapper, _) = adapter.map_schema(&file_schema).unwrap();
-
-        // Empty file statistics
-        let file_stats = Statistics::default();
-        let table_col_stats = mapper
-            .map_column_statistics(&file_stats.column_statistics)
-            .unwrap();
-
-        // All stats should be unknown
-        assert_eq!(table_col_stats.len(), 2);
-        assert_eq!(table_col_stats[0], ColumnStatistics::new_unknown(),);
-        assert_eq!(table_col_stats[1], ColumnStatistics::new_unknown(),);
+        _file_col_statistics: &[ColumnStatistics],
+    ) -> Result<Vec<ColumnStatistics>> {
+        not_impl_err!(
+            "SchemaMapping has been removed. Use PhysicalExprAdapterFactory instead. \
+            See upgrading.md for more details."
+        )
     }
 }
diff --git a/datafusion/datasource/src/sink.rs b/datafusion/datasource/src/sink.rs
index 0552370d8ed0c..155c951fe5756 100644
--- a/datafusion/datasource/src/sink.rs
+++ b/datafusion/datasource/src/sink.rs
@@ -22,22 +22,22 @@ use std::fmt;
 use std::fmt::Debug;
 use std::sync::Arc;
 
+use arrow::array::{ArrayRef, RecordBatch, UInt64Array};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err};
+use datafusion_execution::TaskContext;
+use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalExpr};
+use datafusion_physical_expr_common::sort_expr::{LexRequirement, OrderingRequirements};
 use datafusion_physical_plan::metrics::MetricsSet;
 use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
-use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::{
-    execute_input_stream, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
-    PlanProperties, SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning,
+    PlanProperties, SendableRecordBatchStream, execute_input_stream,
 };
 
-use arrow::array::{ArrayRef, RecordBatch, UInt64Array};
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion_common::{internal_err, Result};
-use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{Distribution, EquivalenceProperties};
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
-
 use async_trait::async_trait;
+use datafusion_physical_plan::execution_plan::{EvaluationType, SchedulingType};
 use futures::StreamExt;
 
 /// `DataSink` implements writing streams of [`RecordBatch`]es to
@@ -47,7 +47,7 @@ use futures::StreamExt;
 /// output.
 #[async_trait]
 pub trait DataSink: DisplayAs + Debug + Send + Sync {
-    /// Returns the data sink as [`Any`](std::any::Any) so that it can be
+    /// Returns the data sink as [`Any`] so that it can be
     /// downcast to a specific implementation.
     fn as_any(&self) -> &dyn Any;
 
@@ -90,17 +90,22 @@ pub struct DataSinkExec {
     count_schema: SchemaRef,
     /// Optional required sort order for output data.
     sort_order: Option<LexRequirement>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl Debug for DataSinkExec {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "DataSinkExec schema: {:?}", self.count_schema)
+        write!(f, "DataSinkExec schema: {}", self.count_schema)
     }
 }
 
 impl DataSinkExec {
     /// Create a plan to write to `sink`
+    /// Note: DataSinkExec requires its input to have a single partition.
+    /// If the input has multiple partitions, the physical optimizer will
+    /// automatically insert a Merge-related operator to merge them.
+    /// If you construct PhysicalPlan without going through the physical optimizer,
+    /// you must ensure that the input has a single partition.
     pub fn new(
         input: Arc<dyn ExecutionPlan>,
         sink: Arc<dyn DataSink>,
@@ -113,7 +118,7 @@ impl DataSinkExec {
             sink,
             count_schema: make_count_schema(),
             sort_order,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -143,6 +148,8 @@ impl DataSinkExec {
             input.pipeline_behavior(),
             input.boundedness(),
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
+        .with_evaluation_type(EvaluationType::Eager)
     }
 }
 
@@ -168,7 +175,7 @@ impl ExecutionPlan for DataSinkExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -184,10 +191,10 @@ impl ExecutionPlan for DataSinkExec {
         vec![Distribution::SinglePartition; self.children().len()]
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         // The required input ordering is set externally (e.g. by a `ListingTable`).
-        // Otherwise, there is no specific requirement (i.e. `sort_expr` is `None`).
-        vec![self.sort_order.as_ref().cloned()]
+        // Otherwise, there is no specific requirement (i.e. `sort_order` is `None`).
+        vec![self.sort_order.as_ref().cloned().map(Into::into)]
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -213,6 +220,19 @@ impl ExecutionPlan for DataSinkExec {
         )))
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to sort order requirements if present
+        if let Some(sort_order) = &self.sort_order {
+            for req in sort_order.iter() {
+                f(req.expr.as_ref())?;
+            }
+        }
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     /// Execute the plan and return a stream of `RecordBatch`es for
     /// the specified partition.
     fn execute(
@@ -220,9 +240,11 @@ impl ExecutionPlan for DataSinkExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if partition != 0 {
-            return internal_err!("DataSinkExec can only be called on partition 0!");
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "DataSinkExec can only be called on partition 0!"
+        );
         let data = execute_input_stream(
             Arc::clone(&self.input),
             Arc::clone(self.sink.schema()),
diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs
index 30ecc38709f49..e5945245a408f 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -22,21 +22,33 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
-use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion_physical_expr::projection::ProjectionExprs;
+use datafusion_physical_plan::execution_plan::{
+    Boundedness, EmissionType, SchedulingType,
+};
+use datafusion_physical_plan::metrics::SplitMetrics;
 use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
 use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::stream::BatchSplitStream;
 use datafusion_physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
 };
+use itertools::Itertools;
 
 use crate::file_scan_config::FileScanConfig;
+use crate::file_stream::{
+    FileStreamBuilder, SharedFileStreamState, shared_file_stream_state_for,
+};
 use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{Constraints, Result, Statistics};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_plan::SortOrderPushdownResult;
+use datafusion_physical_plan::coop::cooperative;
 use datafusion_physical_plan::filter_pushdown::{
-    ChildPushdownResult, FilterPushdownPropagation,
+    ChildPushdownResult, FilterPushdownPhase, FilterPushdownPropagation, PushedDown,
 };
 
 /// A source of data, typically a list of files or memory
@@ -67,8 +79,8 @@ use datafusion_physical_plan::filter_pushdown::{
 /// ```text
 ///                       ┌─────────────────────┐                              -----► execute path
 ///                       │                     │                              ┄┄┄┄┄► init path
-///                       │   DataSourceExec    │  
-///                       │                     │    
+///                       │   DataSourceExec    │
+///                       │                     │
 ///                       └───────▲─────────────┘
 ///                               ┊  │
 ///                               ┊  │
@@ -143,7 +155,14 @@ pub trait DataSource: Send + Sync + Debug {
 
     fn output_partitioning(&self) -> Partitioning;
     fn eq_properties(&self) -> EquivalenceProperties;
-    fn statistics(&self) -> Result<Statistics>;
+    fn scheduling_type(&self) -> SchedulingType {
+        SchedulingType::NonCooperative
+    }
+
+    /// Returns statistics for a specific partition, or aggregate statistics
+    /// across all partitions if `partition` is `None`.
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>>;
+
     /// Return a copy of this DataSource with a new fetch limit
     fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn DataSource>>;
     fn fetch(&self) -> Option<usize>;
@@ -152,9 +171,15 @@ pub trait DataSource: Send + Sync + Debug {
     }
     fn try_swapping_with_projection(
         &self,
-        _projection: &ProjectionExec,
-    ) -> Result<Option<Arc<dyn ExecutionPlan>>>;
+        _projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn DataSource>>>;
+
     /// Try to push down filters into this DataSource.
+    ///
+    /// These filters are in terms of the output schema of this DataSource (e.g.
+    /// [`Self::eq_properties`] and output of any projections pushed into the
+    /// source), not the original table schema.
+    ///
     /// See [`ExecutionPlan::handle_child_pushdown_result`] for more details.
     ///
     /// [`ExecutionPlan::handle_child_pushdown_result`]: datafusion_physical_plan::ExecutionPlan::handle_child_pushdown_result
@@ -163,7 +188,65 @@ pub trait DataSource: Send + Sync + Debug {
         filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>> {
-        Ok(FilterPushdownPropagation::unsupported(filters))
+        Ok(FilterPushdownPropagation::with_parent_pushdown_result(
+            vec![PushedDown::No; filters.len()],
+        ))
+    }
+
+    /// Try to create a new DataSource that produces data in the specified sort order.
+    ///
+    /// # Arguments
+    /// * `order` - The desired output ordering
+    ///
+    /// # Returns
+    /// * `Ok(SortOrderPushdownResult::Exact { .. })` - Created a source that guarantees exact ordering
+    /// * `Ok(SortOrderPushdownResult::Inexact { .. })` - Created a source optimized for the ordering
+    /// * `Ok(SortOrderPushdownResult::Unsupported)` - Cannot optimize for this ordering
+    /// * `Err(e)` - Error occurred
+    ///
+    /// Default implementation returns `Unsupported`.
+    fn try_pushdown_sort(
+        &self,
+        _order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>> {
+        Ok(SortOrderPushdownResult::Unsupported)
+    }
+
+    /// Returns a variant of this `DataSource` that is aware of order-sensitivity.
+    fn with_preserve_order(&self, _preserve_order: bool) -> Option<Arc<dyn DataSource>> {
+        None
+    }
+
+    /// Apply a closure to each expression used by this data source.
+    ///
+    /// This includes filter predicates (which may contain dynamic filters) and any
+    /// other expressions used during data scanning.
+    ///
+    /// Implementations must override this method. If the data source has no expressions,
+    /// return `Ok(TreeNodeRecursion::Continue)` immediately.
+    ///
+    /// See [`ExecutionPlan::apply_expressions`] for more details and implementation examples.
+    ///
+    /// [`ExecutionPlan::apply_expressions`]: datafusion_physical_plan::ExecutionPlan::apply_expressions
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion>;
+
+    /// Injects arbitrary run-time state into this DataSource, returning a new instance
+    /// that incorporates that state *if* it is relevant to the concrete DataSource implementation.
+    ///
+    /// This is a generic entry point: the `state` can be any type wrapped in
+    /// `Arc<dyn Any + Send + Sync>`.  A data source that cares about the state should
+    /// down-cast it to the concrete type it expects and, if successful, return a
+    /// modified copy of itself that captures the provided value.  If the state is
+    /// not applicable, the default behaviour is to return `None` so that parent
+    /// nodes can continue propagating the attempt further down the plan tree.
+    fn with_new_state(
+        &self,
+        _state: Arc<dyn Any + Send + Sync>,
+    ) -> Option<Arc<dyn DataSource>> {
+        None
     }
 }
 
@@ -184,7 +267,180 @@ pub struct DataSourceExec {
     /// The source of the data -- for example, `FileScanConfig` or `MemorySourceConfig`
     data_source: Arc<dyn DataSource>,
     /// Cached plan properties such as sort order
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+}
+
+/// `FileScanConfig` wrapper that shares one `SharedFileStreamState` across all
+/// sibling `FileStream`s opened by the same `DataSourceExec`.
+///
+/// This keeps cross-partition I/O budgeting and ready-work stealing truly
+/// shared at runtime instead of each partition constructing its own fallback
+/// local scheduler state.
+#[derive(Clone, Debug)]
+struct SharedStateFileScanConfig {
+    config: FileScanConfig,
+    shared_file_stream_state: SharedFileStreamState,
+}
+
+impl SharedStateFileScanConfig {
+    fn new(config: FileScanConfig) -> Self {
+        let shared_file_stream_state = shared_file_stream_state_for(&config);
+        Self {
+            config,
+            shared_file_stream_state,
+        }
+    }
+}
+
+impl DisplayAs for SharedStateFileScanConfig {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
+        DataSource::fmt_as(&self.config, t, f)
+    }
+}
+
+impl DataSource for SharedStateFileScanConfig {
+    fn open(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let object_store = context
+            .runtime_env()
+            .object_store(&self.config.object_store_url)?;
+        let batch_size = self
+            .config
+            .batch_size
+            .unwrap_or_else(|| context.session_config().batch_size());
+
+        let source = self.config.file_source.with_batch_size(batch_size);
+        let morselizer =
+            source.create_morselizer(object_store, &self.config, partition)?;
+
+        let stream = FileStreamBuilder::new_with_morselizer(
+            &self.config,
+            partition,
+            morselizer,
+            source.metrics(),
+        )
+        .with_shared_state(self.shared_file_stream_state.clone())
+        .build()?;
+        Ok(Box::pin(cooperative(stream)))
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
+        DataSource::fmt_as(&self.config, t, f)
+    }
+
+    fn repartitioned(
+        &self,
+        target_partitions: usize,
+        repartition_file_min_size: usize,
+        output_ordering: Option<LexOrdering>,
+    ) -> Result<Option<Arc<dyn DataSource>>> {
+        Ok(self
+            .config
+            .repartitioned(
+                target_partitions,
+                repartition_file_min_size,
+                output_ordering,
+            )?
+            .map(wrap_file_scan_shared_state))
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        self.config.output_partitioning()
+    }
+
+    fn eq_properties(&self) -> EquivalenceProperties {
+        self.config.eq_properties()
+    }
+
+    fn scheduling_type(&self) -> SchedulingType {
+        self.config.scheduling_type()
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        self.config.partition_statistics(partition)
+    }
+
+    fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
+        self.config
+            .with_fetch(limit)
+            .map(wrap_file_scan_shared_state)
+    }
+
+    fn fetch(&self) -> Option<usize> {
+        self.config.fetch()
+    }
+
+    fn metrics(&self) -> ExecutionPlanMetricsSet {
+        self.config.metrics()
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn DataSource>>> {
+        Ok(self
+            .config
+            .try_swapping_with_projection(projection)?
+            .map(wrap_file_scan_shared_state))
+    }
+
+    fn try_pushdown_filters(
+        &self,
+        filters: Vec<Arc<dyn PhysicalExpr>>,
+        config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn DataSource>>> {
+        let result = self.config.try_pushdown_filters(filters, config)?;
+        Ok(FilterPushdownPropagation {
+            filters: result.filters,
+            updated_node: result.updated_node.map(wrap_file_scan_shared_state),
+        })
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn DataSource>>> {
+        self.config
+            .try_pushdown_sort(order)?
+            .try_map(|data_source| Ok(wrap_file_scan_shared_state(data_source)))
+    }
+
+    fn with_preserve_order(&self, preserve_order: bool) -> Option<Arc<dyn DataSource>> {
+        self.config
+            .with_preserve_order(preserve_order)
+            .map(wrap_file_scan_shared_state)
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        self.config.apply_expressions(f)
+    }
+
+    fn with_new_state(
+        &self,
+        state: Arc<dyn Any + Send + Sync>,
+    ) -> Option<Arc<dyn DataSource>> {
+        self.config
+            .with_new_state(state)
+            .map(wrap_file_scan_shared_state)
+    }
+}
+
+fn wrap_file_scan_shared_state(data_source: Arc<dyn DataSource>) -> Arc<dyn DataSource> {
+    if let Some(config) = data_source.as_any().downcast_ref::<FileScanConfig>() {
+        Arc::new(SharedStateFileScanConfig::new(config.clone()))
+    } else {
+        data_source
+    }
 }
 
 impl DisplayAs for DataSourceExec {
@@ -208,7 +464,7 @@ impl ExecutionPlan for DataSourceExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -216,6 +472,14 @@ impl ExecutionPlan for DataSourceExec {
         Vec::new()
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Delegate to the underlying data source
+        self.data_source.apply_expressions(f)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -238,17 +502,15 @@ impl ExecutionPlan for DataSourceExec {
             self.properties().eq_properties.output_ordering(),
         )?;
 
-        if let Some(source) = data_source {
+        Ok(data_source.map(|source| {
             let output_partitioning = source.output_partitioning();
             let plan = self
                 .clone()
                 .with_data_source(source)
                 // Changing source partitioning may invalidate output partitioning. Update it also
                 .with_partitioning(output_partitioning);
-            Ok(Some(Arc::new(plan)))
-        } else {
-            Ok(Some(Arc::new(self.clone())))
-        }
+            Arc::new(plan) as _
+        }))
     }
 
     fn execute(
@@ -256,38 +518,31 @@ impl ExecutionPlan for DataSourceExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        self.data_source.open(partition, context)
+        let stream = self.data_source.open(partition, Arc::clone(&context))?;
+        let batch_size = context.session_config().batch_size();
+        log::debug!(
+            "Batch splitting enabled for partition {partition}: batch_size={batch_size}"
+        );
+        let metrics = self.data_source.metrics();
+        let split_metrics = SplitMetrics::new(&metrics, partition);
+        Ok(Box::pin(BatchSplitStream::new(
+            stream,
+            batch_size,
+            split_metrics,
+        )))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.data_source.metrics().clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.data_source.statistics()
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if let Some(partition) = partition {
-            let mut statistics = Statistics::new_unknown(&self.schema());
-            if let Some(file_config) =
-                self.data_source.as_any().downcast_ref::<FileScanConfig>()
-            {
-                if let Some(file_group) = file_config.file_groups.get(partition) {
-                    if let Some(stat) = file_group.file_statistics(None) {
-                        statistics = stat.clone();
-                    }
-                }
-            }
-            Ok(statistics)
-        } else {
-            Ok(self.data_source.statistics()?)
-        }
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        self.data_source.partition_statistics(partition)
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
         let data_source = self.data_source.with_fetch(limit)?;
-        let cache = self.cache.clone();
+        let cache = Arc::clone(&self.cache);
 
         Some(Arc::new(Self { data_source, cache }))
     }
@@ -300,25 +555,40 @@ impl ExecutionPlan for DataSourceExec {
         &self,
         projection: &ProjectionExec,
     ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
-        self.data_source.try_swapping_with_projection(projection)
+        match self
+            .data_source
+            .try_swapping_with_projection(projection.projection_expr())?
+        {
+            Some(new_data_source) => {
+                Ok(Some(Arc::new(DataSourceExec::new(new_data_source))))
+            }
+            None => Ok(None),
+        }
     }
 
     fn handle_child_pushdown_result(
         &self,
+        _phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
         // Push any remaining filters into our data source
-        let res = self.data_source.try_pushdown_filters(
-            child_pushdown_result.parent_filters.collect_all(),
-            config,
-        )?;
+        let parent_filters = child_pushdown_result
+            .parent_filters
+            .into_iter()
+            .map(|f| f.filter)
+            .collect_vec();
+        let res = self
+            .data_source
+            .try_pushdown_filters(parent_filters, config)?;
         match res.updated_node {
             Some(data_source) => {
                 let mut new_node = self.clone();
                 new_node.data_source = data_source;
+                // Re-compute properties since we have new filters which will impact equivalence info
                 new_node.cache =
-                    Self::compute_properties(Arc::clone(&new_node.data_source));
+                    Arc::new(Self::compute_properties(&new_node.data_source));
+
                 Ok(FilterPushdownPropagation {
                     filters: res.filters,
                     updated_node: Some(Arc::new(new_node)),
@@ -330,6 +600,43 @@ impl ExecutionPlan for DataSourceExec {
             }),
         }
     }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // Delegate to the data source and wrap result with DataSourceExec
+        self.data_source
+            .try_pushdown_sort(order)?
+            .try_map(|new_data_source| {
+                let new_exec = self.clone().with_data_source(new_data_source);
+                Ok(Arc::new(new_exec) as Arc<dyn ExecutionPlan>)
+            })
+    }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.data_source
+            .with_preserve_order(preserve_order)
+            .map(|new_data_source| {
+                Arc::new(self.clone().with_data_source(new_data_source))
+                    as Arc<dyn ExecutionPlan>
+            })
+    }
+
+    fn with_new_state(
+        &self,
+        state: Arc<dyn Any + Send + Sync>,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.data_source
+            .with_new_state(state)
+            .map(|new_data_source| {
+                Arc::new(self.clone().with_data_source(new_data_source))
+                    as Arc<dyn ExecutionPlan>
+            })
+    }
 }
 
 impl DataSourceExec {
@@ -337,9 +644,14 @@ impl DataSourceExec {
         Arc::new(Self::new(Arc::new(data_source)))
     }
 
+    // Default constructor for `DataSourceExec`, setting the `cooperative` flag to `true`.
     pub fn new(data_source: Arc<dyn DataSource>) -> Self {
-        let cache = Self::compute_properties(Arc::clone(&data_source));
-        Self { data_source, cache }
+        let data_source = wrap_file_scan_shared_state(data_source);
+        let cache = Self::compute_properties(&data_source);
+        Self {
+            data_source,
+            cache: Arc::new(cache),
+        }
     }
 
     /// Return the source object
@@ -348,30 +660,32 @@ impl DataSourceExec {
     }
 
     pub fn with_data_source(mut self, data_source: Arc<dyn DataSource>) -> Self {
-        self.cache = Self::compute_properties(Arc::clone(&data_source));
+        let data_source = wrap_file_scan_shared_state(data_source);
+        self.cache = Arc::new(Self::compute_properties(&data_source));
         self.data_source = data_source;
         self
     }
 
     /// Assign constraints
     pub fn with_constraints(mut self, constraints: Constraints) -> Self {
-        self.cache = self.cache.with_constraints(constraints);
+        Arc::make_mut(&mut self.cache).set_constraints(constraints);
         self
     }
 
     /// Assign output partitioning
     pub fn with_partitioning(mut self, partitioning: Partitioning) -> Self {
-        self.cache = self.cache.with_partitioning(partitioning);
+        Arc::make_mut(&mut self.cache).partitioning = partitioning;
         self
     }
 
-    fn compute_properties(data_source: Arc<dyn DataSource>) -> PlanProperties {
+    fn compute_properties(data_source: &Arc<dyn DataSource>) -> PlanProperties {
         PlanProperties::new(
             data_source.eq_properties(),
             data_source.output_partitioning(),
             EmissionType::Incremental,
             Boundedness::Bounded,
         )
+        .with_scheduling_type(data_source.scheduling_type())
     }
 
     /// Downcast the `DataSourceExec`'s `data_source` to a specific file source
diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs
index b42d3bb361b71..b1a56e096c222 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -20,24 +20,25 @@
 //! Currently, this module houses code to sort file groups if they are non-overlapping with
 //! respect to the required sort order. See [`MinMaxStatistics`]
 
-use futures::{Stream, StreamExt};
 use std::sync::Arc;
 
-use crate::file_groups::FileGroup;
 use crate::PartitionedFile;
+use crate::file_groups::FileGroup;
 
 use arrow::array::RecordBatch;
+use arrow::compute::SortColumn;
 use arrow::datatypes::SchemaRef;
-use arrow::{
-    compute::SortColumn,
-    row::{Row, Rows},
-};
+use arrow::row::{Row, Rows};
 use datafusion_common::stats::Precision;
-use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError, Result};
-use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, plan_datafusion_err, plan_err,
+};
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
 use datafusion_physical_plan::{ColumnStatistics, Statistics};
 
+use futures::{Stream, StreamExt};
+
 /// A normalized representation of file min/max statistics that allows for efficient sorting & comparison.
 /// The min/max values are ordered by [`Self::sort_order`].
 /// Furthermore, any columns that are reversed in the sort order have their min/max values swapped.
@@ -49,19 +50,19 @@ pub(crate) struct MinMaxStatistics {
 
 impl MinMaxStatistics {
     /// Sort order used to sort the statistics
-    #[allow(unused)]
+    #[expect(unused)]
     pub fn sort_order(&self) -> &LexOrdering {
         &self.sort_order
     }
 
     /// Min value at index
-    #[allow(unused)]
-    pub fn min(&self, idx: usize) -> Row {
+    #[expect(unused)]
+    pub fn min(&'_ self, idx: usize) -> Row<'_> {
         self.min_by_sort_order.row(idx)
     }
 
     /// Max value at index
-    pub fn max(&self, idx: usize) -> Row {
+    pub fn max(&'_ self, idx: usize) -> Row<'_> {
         self.max_by_sort_order.row(idx)
     }
 
@@ -71,9 +72,7 @@ impl MinMaxStatistics {
         projection: Option<&[usize]>, // Indices of projection in full table schema (None = all columns)
         files: impl IntoIterator<Item = &'a PartitionedFile>,
     ) -> Result<Self> {
-        use datafusion_common::ScalarValue;
-
-        let statistics_and_partition_values = files
+        let Some(statistics_and_partition_values) = files
             .into_iter()
             .map(|file| {
                 file.statistics
@@ -81,9 +80,9 @@ impl MinMaxStatistics {
                     .zip(Some(file.partition_values.as_slice()))
             })
             .collect::<Option<Vec<_>>>()
-            .ok_or_else(|| {
-                DataFusionError::Plan("Parquet file missing statistics".to_string())
-            })?;
+        else {
+            return plan_err!("Parquet file missing statistics");
+        };
 
         // Helper function to get min/max statistics for a given column of projected_schema
         let get_min_max = |i: usize| -> Result<(Vec<ScalarValue>, Vec<ScalarValue>)> {
@@ -96,9 +95,7 @@ impl MinMaxStatistics {
                             .get_value()
                             .cloned()
                             .zip(s.column_statistics[i].max_value.get_value().cloned())
-                            .ok_or_else(|| {
-                                DataFusionError::Plan("statistics not found".to_string())
-                            })
+                            .ok_or_else(|| plan_datafusion_err!("statistics not found"))
                     } else {
                         let partition_value = &pv[i - s.column_statistics.len()];
                         Ok((partition_value.clone(), partition_value.clone()))
@@ -109,27 +106,28 @@ impl MinMaxStatistics {
                 .unzip())
         };
 
-        let sort_columns = sort_columns_from_physical_sort_exprs(projected_sort_order)
-            .ok_or(DataFusionError::Plan(
-                "sort expression must be on column".to_string(),
-            ))?;
+        let Some(sort_columns) =
+            sort_columns_from_physical_sort_exprs(projected_sort_order)
+        else {
+            return plan_err!("sort expression must be on column");
+        };
 
         // Project the schema & sort order down to just the relevant columns
         let min_max_schema = Arc::new(
             projected_schema
                 .project(&(sort_columns.iter().map(|c| c.index()).collect::<Vec<_>>()))?,
         );
-        let min_max_sort_order = LexOrdering::from(
-            sort_columns
-                .iter()
-                .zip(projected_sort_order.iter())
-                .enumerate()
-                .map(|(i, (col, sort))| PhysicalSortExpr {
-                    expr: Arc::new(Column::new(col.name(), i)),
-                    options: sort.options,
-                })
-                .collect::<Vec<_>>(),
-        );
+
+        let min_max_sort_order = projected_sort_order
+            .iter()
+            .zip(sort_columns.iter())
+            .enumerate()
+            .map(|(idx, (sort_expr, col))| {
+                let expr = Arc::new(Column::new(col.name(), idx));
+                PhysicalSortExpr::new(expr, sort_expr.options)
+            });
+        // Safe to `unwrap` as we know that sort columns are non-empty:
+        let min_max_sort_order = LexOrdering::new(min_max_sort_order).unwrap();
 
         let (min_values, max_values): (Vec<_>, Vec<_>) = sort_columns
             .iter()
@@ -154,22 +152,25 @@ impl MinMaxStatistics {
             .into_iter()
             .unzip();
 
-        Self::new(
-            &min_max_sort_order,
-            &min_max_schema,
-            RecordBatch::try_new(Arc::clone(&min_max_schema), min_values).map_err(
-                |e| {
-                    DataFusionError::ArrowError(e, Some("\ncreate min batch".to_string()))
-                },
-            )?,
-            RecordBatch::try_new(Arc::clone(&min_max_schema), max_values).map_err(
-                |e| {
-                    DataFusionError::ArrowError(e, Some("\ncreate max batch".to_string()))
-                },
-            )?,
-        )
+        let min_batch = RecordBatch::try_new(Arc::clone(&min_max_schema), min_values)
+            .map_err(|e| {
+                DataFusionError::ArrowError(
+                    Box::new(e),
+                    Some("\ncreate min batch".to_string()),
+                )
+            })?;
+        let max_batch = RecordBatch::try_new(Arc::clone(&min_max_schema), max_values)
+            .map_err(|e| {
+                DataFusionError::ArrowError(
+                    Box::new(e),
+                    Some("\ncreate max batch".to_string()),
+                )
+            })?;
+
+        Self::new(&min_max_sort_order, &min_max_schema, min_batch, max_batch)
     }
 
+    #[expect(clippy::needless_pass_by_value)]
     pub fn new(
         sort_order: &LexOrdering,
         schema: &SchemaRef,
@@ -189,25 +190,23 @@ impl MinMaxStatistics {
             .map_err(|e| e.context("create sort fields"))?;
         let converter = RowConverter::new(sort_fields)?;
 
-        let sort_columns = sort_columns_from_physical_sort_exprs(sort_order).ok_or(
-            DataFusionError::Plan("sort expression must be on column".to_string()),
-        )?;
+        let Some(sort_columns) = sort_columns_from_physical_sort_exprs(sort_order) else {
+            return plan_err!("sort expression must be on column");
+        };
 
         // swap min/max if they're reversed in the ordering
         let (new_min_cols, new_max_cols): (Vec<_>, Vec<_>) = sort_order
             .iter()
             .zip(sort_columns.iter().copied())
             .map(|(sort_expr, column)| {
-                if sort_expr.options.descending {
-                    max_values
-                        .column_by_name(column.name())
-                        .zip(min_values.column_by_name(column.name()))
+                let maxes = max_values.column_by_name(column.name());
+                let mins = min_values.column_by_name(column.name());
+                let opt_value = if sort_expr.options.descending {
+                    maxes.zip(mins)
                 } else {
-                    min_values
-                        .column_by_name(column.name())
-                        .zip(max_values.column_by_name(column.name()))
-                }
-                .ok_or_else(|| {
+                    mins.zip(maxes)
+                };
+                opt_value.ok_or_else(|| {
                     plan_datafusion_err!(
                         "missing column in MinMaxStatistics::new: '{}'",
                         column.name()
@@ -228,14 +227,7 @@ impl MinMaxStatistics {
                 .zip(sort_columns.iter().copied())
                 .map(|(sort_expr, column)| {
                     let schema = values.schema();
-
                     let idx = schema.index_of(column.name())?;
-                    let field = schema.field(idx);
-
-                    // check that sort columns are non-nullable
-                    if field.is_nullable() {
-                        return plan_err!("cannot sort by nullable column");
-                    }
 
                     Ok(SortColumn {
                         values: Arc::clone(values.column(idx)),
@@ -252,7 +244,10 @@ impl MinMaxStatistics {
                         .collect::<Vec<_>>(),
                 )
                 .map_err(|e| {
-                    DataFusionError::ArrowError(e, Some("convert columns".to_string()))
+                    DataFusionError::ArrowError(
+                        Box::new(e),
+                        Some("convert columns".to_string()),
+                    )
                 })
         });
 
@@ -271,11 +266,12 @@ impl MinMaxStatistics {
     }
 
     /// Check if the min/max statistics are in order and non-overlapping
+    /// (or touching at boundaries)
     pub fn is_sorted(&self) -> bool {
         self.max_by_sort_order
             .iter()
             .zip(self.min_by_sort_order.iter().skip(1))
-            .all(|(max, next_min)| max < next_min)
+            .all(|(max, next_min)| max <= next_min)
     }
 }
 
@@ -285,7 +281,7 @@ fn sort_columns_from_physical_sort_exprs(
     sort_order
         .iter()
         .map(|expr| expr.expr.as_any().downcast_ref::<Column>())
-        .collect::<Option<Vec<_>>>()
+        .collect()
 }
 
 /// Get all files as well as the file level summary statistics (no statistic for partition columns).
@@ -297,7 +293,7 @@ fn sort_columns_from_physical_sort_exprs(
     since = "47.0.0",
     note = "Please use `get_files_with_limit` and  `compute_all_files_statistics` instead"
 )]
-#[allow(unused)]
+#[expect(unused)]
 pub async fn get_statistics_with_limit(
     all_files: impl Stream<Item = Result<(PartitionedFile, Arc<Statistics>)>>,
     file_schema: SchemaRef,
@@ -372,12 +368,14 @@ pub async fn get_statistics_with_limit(
                         min_value: file_min,
                         sum_value: file_sum,
                         distinct_count: _,
+                        byte_size: file_sbs,
                     } = file_col_stats;
 
                     col_stats.null_count = col_stats.null_count.add(file_nc);
                     col_stats.max_value = col_stats.max_value.max(file_max);
                     col_stats.min_value = col_stats.min_value.min(file_min);
                     col_stats.sum_value = col_stats.sum_value.add(file_sum);
+                    col_stats.byte_size = col_stats.byte_size.add(file_sbs);
                 }
 
                 // If the number of rows exceeds the limit, we can stop processing
@@ -423,6 +421,7 @@ pub async fn get_statistics_with_limit(
 ///
 /// # Returns
 /// A new file group with summary statistics attached
+#[expect(clippy::needless_pass_by_value)]
 pub fn compute_file_group_statistics(
     file_group: FileGroup,
     file_schema: SchemaRef,
@@ -458,6 +457,7 @@ pub fn compute_file_group_statistics(
 /// A tuple containing:
 /// * The processed file groups with their individual statistics attached
 /// * The summary statistics across all file groups, aka all files summary statistics
+#[expect(clippy::needless_pass_by_value)]
 pub fn compute_all_files_statistics(
     file_groups: Vec<FileGroup>,
     table_schema: SchemaRef,
diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs
new file mode 100644
index 0000000000000..5b7fc4727df05
--- /dev/null
+++ b/datafusion/datasource/src/table_schema.rs
@@ -0,0 +1,279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Helper struct to manage table schemas with partition columns
+
+use arrow::datatypes::{FieldRef, SchemaBuilder, SchemaRef};
+use std::sync::Arc;
+
+/// The overall schema for potentially partitioned data sources.
+///
+/// When reading partitioned data (such as Hive-style partitioning), a [`TableSchema`]
+/// consists of two parts:
+/// 1. **File schema**: The schema of the actual data files on disk
+/// 2. **Partition columns**: Columns whose values are encoded in the directory structure,
+///    but not stored in the files themselves
+///
+/// # Example: Partitioned Table
+///
+/// Consider a table with the following directory structure:
+/// ```text
+/// /data/date=2025-10-10/region=us-west/data.parquet
+/// /data/date=2025-10-11/region=us-east/data.parquet
+/// ```
+///
+/// In this case:
+/// - **File schema**: The schema of `data.parquet` files (e.g., `[user_id, amount]`)
+/// - **Partition columns**: `[date, region]` extracted from the directory path
+/// - **Table schema**: The full schema combining both (e.g., `[user_id, amount, date, region]`)
+///
+/// # When to Use
+///
+/// Use `TableSchema` when:
+/// - Reading partitioned data sources (Parquet, CSV, etc. with Hive-style partitioning)
+/// - You need to efficiently access different schema representations without reconstructing them
+/// - You want to avoid repeatedly concatenating file and partition schemas
+///
+/// For non-partitioned data or when working with a single schema representation,
+/// working directly with Arrow's `Schema` or `SchemaRef` is simpler.
+///
+/// # Performance
+///
+/// This struct pre-computes and caches the full table schema, allowing cheap references
+/// to any representation without repeated allocations or reconstructions.
+#[derive(Debug, Clone)]
+pub struct TableSchema {
+    /// The schema of the data files themselves, without partition columns.
+    ///
+    /// For example, if your Parquet files contain `[user_id, amount]`,
+    /// this field holds that schema.
+    file_schema: SchemaRef,
+
+    /// Columns that are derived from the directory structure (partitioning scheme).
+    ///
+    /// For Hive-style partitioning like `/date=2025-10-10/region=us-west/`,
+    /// this contains the `date` and `region` fields.
+    ///
+    /// These columns are NOT present in the data files but are appended to each
+    /// row during query execution based on the file's location.
+    table_partition_cols: Arc<Vec<FieldRef>>,
+
+    /// The complete table schema: file_schema columns followed by partition columns.
+    ///
+    /// This is pre-computed during construction by concatenating `file_schema`
+    /// and `table_partition_cols`, so it can be returned as a cheap reference.
+    table_schema: SchemaRef,
+}
+
+impl TableSchema {
+    /// Create a new TableSchema from a file schema and partition columns.
+    ///
+    /// The table schema is automatically computed by appending the partition columns
+    /// to the file schema.
+    ///
+    /// You should prefer calling this method over
+    /// chaining [`TableSchema::from_file_schema`] and [`TableSchema::with_table_partition_cols`]
+    /// if you have both the file schema and partition columns available at construction time
+    /// since it avoids re-computing the table schema.
+    ///
+    /// # Arguments
+    ///
+    /// * `file_schema` - Schema of the data files (without partition columns)
+    /// * `table_partition_cols` - Partition columns to append to each row
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow::datatypes::{Schema, Field, DataType};
+    /// # use datafusion_datasource::TableSchema;
+    /// let file_schema = Arc::new(Schema::new(vec![
+    ///     Field::new("user_id", DataType::Int64, false),
+    ///     Field::new("amount", DataType::Float64, false),
+    /// ]));
+    ///
+    /// let partition_cols = vec![
+    ///     Arc::new(Field::new("date", DataType::Utf8, false)),
+    ///     Arc::new(Field::new("region", DataType::Utf8, false)),
+    /// ];
+    ///
+    /// let table_schema = TableSchema::new(file_schema, partition_cols);
+    ///
+    /// // Table schema will have 4 columns: user_id, amount, date, region
+    /// assert_eq!(table_schema.table_schema().fields().len(), 4);
+    /// ```
+    pub fn new(file_schema: SchemaRef, table_partition_cols: Vec<FieldRef>) -> Self {
+        let mut builder = SchemaBuilder::from(file_schema.as_ref());
+        builder.extend(table_partition_cols.iter().cloned());
+        Self {
+            file_schema,
+            table_partition_cols: Arc::new(table_partition_cols),
+            table_schema: Arc::new(builder.finish()),
+        }
+    }
+
+    /// Create a new TableSchema with no partition columns.
+    ///
+    /// You should prefer calling [`TableSchema::new`] if you have partition columns at
+    /// construction time since it avoids re-computing the table schema.
+    pub fn from_file_schema(file_schema: SchemaRef) -> Self {
+        Self::new(file_schema, vec![])
+    }
+
+    /// Add partition columns to an existing TableSchema, returning a new instance.
+    ///
+    /// You should prefer calling [`TableSchema::new`] instead of chaining [`TableSchema::from_file_schema`]
+    /// into [`TableSchema::with_table_partition_cols`] if you have partition columns at construction time
+    /// since it avoids re-computing the table schema.
+    pub fn with_table_partition_cols(mut self, partition_cols: Vec<FieldRef>) -> Self {
+        if self.table_partition_cols.is_empty() {
+            self.table_partition_cols = Arc::new(partition_cols);
+        } else {
+            // Append to existing partition columns
+            let table_partition_cols = Arc::get_mut(&mut self.table_partition_cols).expect(
+                "Expected to be the sole owner of table_partition_cols since this function accepts mut self",
+            );
+            table_partition_cols.extend(partition_cols);
+        }
+        let mut builder = SchemaBuilder::from(self.file_schema.as_ref());
+        builder.extend(self.table_partition_cols.iter().cloned());
+        self.table_schema = Arc::new(builder.finish());
+        self
+    }
+
+    /// Get the file schema (without partition columns).
+    ///
+    /// This is the schema of the actual data files on disk.
+    pub fn file_schema(&self) -> &SchemaRef {
+        &self.file_schema
+    }
+
+    /// Get the table partition columns.
+    ///
+    /// These are the columns derived from the directory structure that
+    /// will be appended to each row during query execution.
+    pub fn table_partition_cols(&self) -> &Vec<FieldRef> {
+        &self.table_partition_cols
+    }
+
+    /// Get the full table schema (file schema + partition columns).
+    ///
+    /// This is the complete schema that will be seen by queries, combining
+    /// both the columns from the files and the partition columns.
+    pub fn table_schema(&self) -> &SchemaRef {
+        &self.table_schema
+    }
+}
+
+impl From<SchemaRef> for TableSchema {
+    fn from(schema: SchemaRef) -> Self {
+        Self::from_file_schema(schema)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::TableSchema;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use std::sync::Arc;
+
+    #[test]
+    fn test_table_schema_creation() {
+        let file_schema = Arc::new(Schema::new(vec![
+            Field::new("user_id", DataType::Int64, false),
+            Field::new("amount", DataType::Float64, false),
+        ]));
+
+        let partition_cols = vec![
+            Arc::new(Field::new("date", DataType::Utf8, false)),
+            Arc::new(Field::new("region", DataType::Utf8, false)),
+        ];
+
+        let table_schema = TableSchema::new(file_schema.clone(), partition_cols.clone());
+
+        // Verify file schema
+        assert_eq!(table_schema.file_schema().as_ref(), file_schema.as_ref());
+
+        // Verify partition columns
+        assert_eq!(table_schema.table_partition_cols().len(), 2);
+        assert_eq!(table_schema.table_partition_cols()[0], partition_cols[0]);
+        assert_eq!(table_schema.table_partition_cols()[1], partition_cols[1]);
+
+        // Verify full table schema
+        let expected_fields = vec![
+            Field::new("user_id", DataType::Int64, false),
+            Field::new("amount", DataType::Float64, false),
+            Field::new("date", DataType::Utf8, false),
+            Field::new("region", DataType::Utf8, false),
+        ];
+        let expected_schema = Schema::new(expected_fields);
+        assert_eq!(table_schema.table_schema().as_ref(), &expected_schema);
+    }
+
+    #[test]
+    fn test_add_multiple_partition_columns() {
+        let file_schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+
+        let initial_partition_cols =
+            vec![Arc::new(Field::new("country", DataType::Utf8, false))];
+
+        let table_schema = TableSchema::new(file_schema.clone(), initial_partition_cols);
+
+        let additional_partition_cols = vec![
+            Arc::new(Field::new("city", DataType::Utf8, false)),
+            Arc::new(Field::new("year", DataType::Int32, false)),
+        ];
+
+        let updated_table_schema =
+            table_schema.with_table_partition_cols(additional_partition_cols);
+
+        // Verify file schema remains unchanged
+        assert_eq!(
+            updated_table_schema.file_schema().as_ref(),
+            file_schema.as_ref()
+        );
+
+        // Verify partition columns
+        assert_eq!(updated_table_schema.table_partition_cols().len(), 3);
+        assert_eq!(
+            updated_table_schema.table_partition_cols()[0].name(),
+            "country"
+        );
+        assert_eq!(
+            updated_table_schema.table_partition_cols()[1].name(),
+            "city"
+        );
+        assert_eq!(
+            updated_table_schema.table_partition_cols()[2].name(),
+            "year"
+        );
+
+        // Verify full table schema
+        let expected_fields = vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("country", DataType::Utf8, false),
+            Field::new("city", DataType::Utf8, false),
+            Field::new("year", DataType::Int32, false),
+        ];
+        let expected_schema = Schema::new(expected_fields);
+        assert_eq!(
+            updated_table_schema.table_schema().as_ref(),
+            &expected_schema
+        );
+    }
+}
diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs
index aac61c7812a41..3a9e78943b07b 100644
--- a/datafusion/datasource/src/test_util.rs
+++ b/datafusion/datasource/src/test_util.rs
@@ -17,23 +17,53 @@
 
 use crate::{
     file::FileSource, file_scan_config::FileScanConfig, file_stream::FileOpener,
-    impl_schema_adapter_methods, schema_adapter::SchemaAdapterFactory,
 };
 
 use std::sync::Arc;
 
-use arrow::datatypes::{Schema, SchemaRef};
-use datafusion_common::{Result, Statistics};
-use datafusion_physical_expr::{expressions::Column, PhysicalExpr};
+use arrow::datatypes::Schema;
+use datafusion_common::{Result, tree_node::TreeNodeRecursion};
+use datafusion_physical_expr::{PhysicalExpr, expressions::Column};
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 use object_store::ObjectStore;
 
 /// Minimal [`crate::file::FileSource`] implementation for use in tests.
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub(crate) struct MockSource {
     metrics: ExecutionPlanMetricsSet,
-    projected_statistics: Option<Statistics>,
-    schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    filter: Option<Arc<dyn PhysicalExpr>>,
+    table_schema: crate::table_schema::TableSchema,
+    projection: crate::projection::SplitProjection,
+}
+
+impl Default for MockSource {
+    fn default() -> Self {
+        let table_schema =
+            crate::table_schema::TableSchema::new(Arc::new(Schema::empty()), vec![]);
+        Self {
+            metrics: ExecutionPlanMetricsSet::new(),
+            filter: None,
+            projection: crate::projection::SplitProjection::unprojected(&table_schema),
+            table_schema,
+        }
+    }
+}
+
+impl MockSource {
+    pub fn new(table_schema: impl Into<crate::table_schema::TableSchema>) -> Self {
+        let table_schema = table_schema.into();
+        Self {
+            metrics: ExecutionPlanMetricsSet::new(),
+            filter: None,
+            projection: crate::projection::SplitProjection::unprojected(&table_schema),
+            table_schema,
+        }
+    }
+
+    pub fn with_filter(mut self, filter: Arc<dyn PhysicalExpr>) -> Self {
+        self.filter = Some(filter);
+        self
+    }
 }
 
 impl FileSource for MockSource {
@@ -42,7 +72,7 @@ impl FileSource for MockSource {
         _object_store: Arc<dyn ObjectStore>,
         _base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
+    ) -> Result<Arc<dyn FileOpener>> {
         unimplemented!()
     }
 
@@ -50,41 +80,52 @@ impl FileSource for MockSource {
         self
     }
 
-    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
+    fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
+        self.filter.clone()
     }
 
-    fn with_schema(&self, _schema: SchemaRef) -> Arc<dyn FileSource> {
+    fn with_batch_size(&self, _batch_size: usize) -> Arc<dyn FileSource> {
         Arc::new(Self { ..self.clone() })
     }
 
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
+    fn metrics(&self) -> &ExecutionPlanMetricsSet {
+        &self.metrics
     }
 
-    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
-        let mut source = self.clone();
-        source.projected_statistics = Some(statistics);
-        Arc::new(source)
+    fn file_type(&self) -> &str {
+        "mock"
     }
 
-    fn metrics(&self) -> &ExecutionPlanMetricsSet {
-        &self.metrics
+    fn table_schema(&self) -> &crate::table_schema::TableSchema {
+        &self.table_schema
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self
-            .projected_statistics
-            .as_ref()
-            .expect("projected_statistics must be set")
-            .clone())
+    fn try_pushdown_projection(
+        &self,
+        projection: &datafusion_physical_plan::projection::ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        let new_projection = self.projection.source.try_merge(projection)?;
+        let split_projection = crate::projection::SplitProjection::new(
+            self.table_schema.file_schema(),
+            &new_projection,
+        );
+        source.projection = split_projection;
+        Ok(Some(Arc::new(source)))
     }
 
-    fn file_type(&self) -> &str {
-        "mock"
+    fn projection(
+        &self,
+    ) -> Option<&datafusion_physical_plan::projection::ProjectionExprs> {
+        Some(&self.projection.source)
     }
 
-    impl_schema_adapter_methods!();
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
 
 /// Create a column expression
diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs
index bddfdbcc06d13..39d1047984ff6 100644
--- a/datafusion/datasource/src/url.rs
+++ b/datafusion/datasource/src/url.rs
@@ -17,7 +17,9 @@
 
 use std::sync::Arc;
 
-use datafusion_common::{DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, TableReference};
+use datafusion_execution::cache::TableScopedPath;
+use datafusion_execution::cache::cache_manager::CachedFileList;
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_session::Session;
 
@@ -26,9 +28,9 @@ use futures::{StreamExt, TryStreamExt};
 use glob::Pattern;
 use itertools::Itertools;
 use log::debug;
-use object_store::path::Path;
 use object_store::path::DELIMITER;
-use object_store::{ObjectMeta, ObjectStore};
+use object_store::path::Path;
+use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
 use url::Url;
 
 /// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`]
@@ -41,6 +43,8 @@ pub struct ListingTableUrl {
     prefix: Path,
     /// An optional glob expression used to filter files
     glob: Option<Pattern>,
+    /// Optional table reference for the table this url belongs to
+    table_ref: Option<TableReference>,
 }
 
 impl ListingTableUrl {
@@ -145,7 +149,12 @@ impl ListingTableUrl {
     /// to create a [`ListingTableUrl`].
     pub fn try_new(url: Url, glob: Option<Pattern>) -> Result<Self> {
         let prefix = Path::from_url_path(url.path())?;
-        Ok(Self { url, prefix, glob })
+        Ok(Self {
+            url,
+            prefix,
+            glob,
+            table_ref: None,
+        })
     }
 
     /// Returns the URL scheme
@@ -209,12 +218,12 @@ impl ListingTableUrl {
     /// assert_eq!(url.file_extension(), None);
     /// ```
     pub fn file_extension(&self) -> Option<&str> {
-        if let Some(mut segments) = self.url.path_segments() {
-            if let Some(last_segment) = segments.next_back() {
-                if last_segment.contains(".") && !last_segment.ends_with(".") {
-                    return last_segment.split('.').next_back();
-                }
-            }
+        if let Some(mut segments) = self.url.path_segments()
+            && let Some(last_segment) = segments.next_back()
+            && last_segment.contains(".")
+            && !last_segment.ends_with(".")
+        {
+            return last_segment.split('.').next_back();
         }
 
         None
@@ -233,34 +242,57 @@ impl ListingTableUrl {
         Some(stripped.split_terminator(DELIMITER))
     }
 
-    /// List all files identified by this [`ListingTableUrl`] for the provided `file_extension`
-    pub async fn list_all_files<'a>(
+    /// List all files identified by this [`ListingTableUrl`] for the provided `file_extension`,
+    /// optionally filtering by a path prefix
+    pub async fn list_prefixed_files<'a>(
         &'a self,
         ctx: &'a dyn Session,
         store: &'a dyn ObjectStore,
+        prefix: Option<Path>,
         file_extension: &'a str,
     ) -> Result<BoxStream<'a, Result<ObjectMeta>>> {
         let exec_options = &ctx.config_options().execution;
         let ignore_subdirectory = exec_options.listing_table_ignore_subdirectory;
-        // If the prefix is a file, use a head request, otherwise list
-        let list = match self.is_collection() {
-            true => match ctx.runtime_env().cache_manager.get_list_files_cache() {
-                None => store.list(Some(&self.prefix)),
-                Some(cache) => {
-                    if let Some(res) = cache.get(&self.prefix) {
-                        debug!("Hit list all files cache");
-                        futures::stream::iter(res.as_ref().clone().into_iter().map(Ok))
-                            .boxed()
-                    } else {
-                        let list_res = store.list(Some(&self.prefix));
-                        let vec = list_res.try_collect::<Vec<ObjectMeta>>().await?;
-                        cache.put(&self.prefix, Arc::new(vec.clone()));
-                        futures::stream::iter(vec.into_iter().map(Ok)).boxed()
-                    }
+
+        // Build full_prefix for non-cached path and head() calls
+        let full_prefix = if let Some(ref p) = prefix {
+            let mut parts = self.prefix.parts().collect::<Vec<_>>();
+            parts.extend(p.parts());
+            Path::from_iter(parts.into_iter())
+        } else {
+            self.prefix.clone()
+        };
+
+        let list: BoxStream<'a, Result<ObjectMeta>> = if self.is_collection() {
+            list_with_cache(
+                ctx,
+                store,
+                self.table_ref.as_ref(),
+                &self.prefix,
+                prefix.as_ref(),
+            )
+            .await?
+        } else {
+            match store.head(&full_prefix).await {
+                Ok(meta) => futures::stream::once(async { Ok(meta) })
+                    .map_err(|e| DataFusionError::ObjectStore(Box::new(e)))
+                    .boxed(),
+                // If the head command fails, it is likely that object doesn't exist.
+                // Retry as though it were a prefix (aka a collection)
+                Err(object_store::Error::NotFound { .. }) => {
+                    list_with_cache(
+                        ctx,
+                        store,
+                        self.table_ref.as_ref(),
+                        &self.prefix,
+                        prefix.as_ref(),
+                    )
+                    .await?
                 }
-            },
-            false => futures::stream::once(store.head(&self.prefix)).boxed(),
+                Err(e) => return Err(e.into()),
+            }
         };
+
         Ok(list
             .try_filter(move |meta| {
                 let path = &meta.location;
@@ -268,10 +300,20 @@ impl ListingTableUrl {
                 let glob_match = self.contains(path, ignore_subdirectory);
                 futures::future::ready(extension_match && glob_match)
             })
-            .map_err(DataFusionError::ObjectStore)
             .boxed())
     }
 
+    /// List all files identified by this [`ListingTableUrl`] for the provided `file_extension`
+    pub async fn list_all_files<'a>(
+        &'a self,
+        ctx: &'a dyn Session,
+        store: &'a dyn ObjectStore,
+        file_extension: &'a str,
+    ) -> Result<BoxStream<'a, Result<ObjectMeta>>> {
+        self.list_prefixed_files(ctx, store, None, file_extension)
+            .await
+    }
+
     /// Returns this [`ListingTableUrl`] as a string
     pub fn as_str(&self) -> &str {
         self.as_ref()
@@ -282,6 +324,112 @@ impl ListingTableUrl {
         let url = &self.url[url::Position::BeforeScheme..url::Position::BeforePath];
         ObjectStoreUrl::parse(url).unwrap()
     }
+
+    /// Returns true if the [`ListingTableUrl`] points to the folder
+    pub fn is_folder(&self) -> bool {
+        self.url.scheme() == "file" && self.is_collection()
+    }
+
+    /// Return the `url` for [`ListingTableUrl`]
+    pub fn get_url(&self) -> &Url {
+        &self.url
+    }
+
+    /// Return the `glob` for [`ListingTableUrl`]
+    pub fn get_glob(&self) -> &Option<Pattern> {
+        &self.glob
+    }
+
+    /// Returns a copy of current [`ListingTableUrl`] with a specified `glob`
+    pub fn with_glob(mut self, glob: &str) -> Result<Self> {
+        self.glob =
+            Some(Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?);
+        Ok(self)
+    }
+
+    /// Set the table reference for this [`ListingTableUrl`]
+    pub fn with_table_ref(mut self, table_ref: TableReference) -> Self {
+        self.table_ref = Some(table_ref);
+        self
+    }
+
+    /// Return the table reference for this [`ListingTableUrl`]
+    pub fn get_table_ref(&self) -> &Option<TableReference> {
+        &self.table_ref
+    }
+}
+
+/// Lists files with cache support, using prefix-aware lookups.
+///
+/// # Arguments
+/// * `ctx` - The session context
+/// * `store` - The object store to list from
+/// * `table_base_path` - The table's base path (the stable cache key)
+/// * `prefix` - Optional prefix relative to table base for filtering results
+///
+/// # Cache Behavior:
+/// The cache key is always `table_base_path`. When a prefix-filtered listing
+/// is requested via `prefix`, the cache:
+/// - Looks up `table_base_path` in the cache
+/// - Filters results to match `table_base_path/prefix`
+/// - Returns filtered results without a storage call
+///
+/// On cache miss, the full table is always listed and cached, ensuring
+/// subsequent prefix queries can be served from cache.
+async fn list_with_cache<'b>(
+    ctx: &'b dyn Session,
+    store: &'b dyn ObjectStore,
+    table_ref: Option<&TableReference>,
+    table_base_path: &Path,
+    prefix: Option<&Path>,
+) -> Result<BoxStream<'b, Result<ObjectMeta>>> {
+    // Build the full listing path (table_base + prefix)
+    let full_prefix = match prefix {
+        Some(p) => {
+            let mut parts: Vec<_> = table_base_path.parts().collect();
+            parts.extend(p.parts());
+            Path::from_iter(parts)
+        }
+        None => table_base_path.clone(),
+    };
+
+    match ctx.runtime_env().cache_manager.get_list_files_cache() {
+        None => Ok(store
+            .list(Some(&full_prefix))
+            .map(|res| res.map_err(|e| DataFusionError::ObjectStore(Box::new(e))))
+            .boxed()),
+        Some(cache) => {
+            // Build the filter prefix (only Some if prefix was requested)
+            let filter_prefix = prefix.is_some().then(|| full_prefix.clone());
+
+            let table_scoped_base_path = TableScopedPath {
+                table: table_ref.cloned(),
+                path: table_base_path.clone(),
+            };
+
+            // Try cache lookup - get returns CachedFileList
+            let vec = if let Some(cached) = cache.get(&table_scoped_base_path) {
+                debug!("Hit list files cache");
+                cached.files_matching_prefix(&filter_prefix)
+            } else {
+                // Cache miss - always list and cache the full table
+                // This ensures we have complete data for future prefix queries
+                let mut vec = store
+                    .list(Some(table_base_path))
+                    .try_collect::<Vec<ObjectMeta>>()
+                    .await?;
+                vec.shrink_to_fit(); // Right-size before caching
+                let cached: CachedFileList = vec.into();
+                let result = cached.files_matching_prefix(&filter_prefix);
+                cache.put(&table_scoped_base_path, cached);
+                result
+            };
+            Ok(
+                futures::stream::iter(Arc::unwrap_or_clone(vec).into_iter().map(Ok))
+                    .boxed(),
+            )
+        }
+    }
 }
 
 /// Creates a file URL from a potentially relative filesystem path
@@ -339,7 +487,6 @@ const GLOB_START_CHARS: [char; 3] = ['?', '*', '['];
 ///
 /// Path delimiters are determined using [`std::path::is_separator`] which
 /// permits `/` as a path delimiter even on Windows platforms.
-///
 #[cfg(not(target_arch = "wasm32"))]
 fn split_glob_expression(path: &str) -> Option<(&str, &str)> {
     let mut last_separator = 0;
@@ -362,6 +509,25 @@ fn split_glob_expression(path: &str) -> Option<(&str, &str)> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use async_trait::async_trait;
+    use bytes::Bytes;
+    use datafusion_common::DFSchema;
+    use datafusion_common::config::TableOptions;
+    use datafusion_execution::TaskContext;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_execution::runtime_env::RuntimeEnv;
+    use datafusion_expr::execution_props::ExecutionProps;
+    use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF};
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_plan::ExecutionPlan;
+    use object_store::{
+        CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload,
+        PutMultipartOptions, PutPayload,
+    };
+    use std::any::Any;
+    use std::collections::HashMap;
+    use std::ops::Range;
+    use std::sync::Arc;
     use tempfile::tempdir;
 
     #[test]
@@ -575,4 +741,503 @@ mod tests {
             "file path ends with .ext - extension is ext",
         );
     }
+
+    #[tokio::test]
+    async fn test_list_files() -> Result<()> {
+        let store = MockObjectStore {
+            in_mem: object_store::memory::InMemory::new(),
+            forbidden_paths: vec!["forbidden/e.parquet".into()],
+        };
+
+        // Create some files:
+        create_file(&store, "a.parquet").await;
+        create_file(&store, "/t/b.parquet").await;
+        create_file(&store, "/t/c.csv").await;
+        create_file(&store, "/t/d.csv").await;
+
+        // This file returns a permission error.
+        create_file(&store, "/forbidden/e.parquet").await;
+
+        assert_eq!(
+            list_all_files("/", &store, "parquet").await?,
+            vec!["a.parquet"],
+        );
+
+        // test with and without trailing slash
+        assert_eq!(
+            list_all_files("/t/", &store, "parquet").await?,
+            vec!["t/b.parquet"],
+        );
+        assert_eq!(
+            list_all_files("/t", &store, "parquet").await?,
+            vec!["t/b.parquet"],
+        );
+
+        // test with and without trailing slash
+        assert_eq!(
+            list_all_files("/t", &store, "csv").await?,
+            vec!["t/c.csv", "t/d.csv"],
+        );
+        assert_eq!(
+            list_all_files("/t/", &store, "csv").await?,
+            vec!["t/c.csv", "t/d.csv"],
+        );
+
+        // Test a non existing prefix
+        assert_eq!(
+            list_all_files("/NonExisting", &store, "csv").await?,
+            vec![] as Vec<String>
+        );
+        assert_eq!(
+            list_all_files("/NonExisting/", &store, "csv").await?,
+            vec![] as Vec<String>
+        );
+
+        // Including forbidden.parquet generates an error.
+        let Err(DataFusionError::ObjectStore(err)) =
+            list_all_files("/forbidden/e.parquet", &store, "parquet").await
+        else {
+            panic!("Expected ObjectStore error");
+        };
+
+        let object_store::Error::PermissionDenied { .. } = &*err else {
+            panic!("Expected PermissionDenied error");
+        };
+
+        // Test prefix filtering with partition-style paths
+        create_file(&store, "/data/a=1/file1.parquet").await;
+        create_file(&store, "/data/a=1/b=100/file2.parquet").await;
+        create_file(&store, "/data/a=2/b=200/file3.parquet").await;
+        create_file(&store, "/data/a=2/b=200/file4.csv").await;
+
+        assert_eq!(
+            list_prefixed_files("/data/", &store, Some(Path::from("a=1")), "parquet")
+                .await?,
+            vec!["data/a=1/b=100/file2.parquet", "data/a=1/file1.parquet"],
+        );
+
+        assert_eq!(
+            list_prefixed_files(
+                "/data/",
+                &store,
+                Some(Path::from("a=1/b=100")),
+                "parquet"
+            )
+            .await?,
+            vec!["data/a=1/b=100/file2.parquet"],
+        );
+
+        assert_eq!(
+            list_prefixed_files("/data/", &store, Some(Path::from("a=2")), "parquet")
+                .await?,
+            vec!["data/a=2/b=200/file3.parquet"],
+        );
+
+        Ok(())
+    }
+
+    /// Tests that the cached code path produces identical results to the non-cached path.
+    ///
+    /// This is critical: the cache is a transparent optimization, so both paths
+    /// MUST return the same files. Note: order is not guaranteed by ObjectStore::list,
+    /// so we sort results before comparison.
+    #[tokio::test]
+    async fn test_cache_path_equivalence() -> Result<()> {
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        let store = MockObjectStore {
+            in_mem: object_store::memory::InMemory::new(),
+            forbidden_paths: vec![],
+        };
+
+        // Create test files with partition-style paths
+        create_file(&store, "/table/year=2023/data1.parquet").await;
+        create_file(&store, "/table/year=2023/month=01/data2.parquet").await;
+        create_file(&store, "/table/year=2024/data3.parquet").await;
+        create_file(&store, "/table/year=2024/month=06/data4.parquet").await;
+        create_file(&store, "/table/year=2024/month=12/data5.parquet").await;
+
+        // Session WITHOUT cache
+        let session_no_cache = MockSession::new();
+
+        // Session WITH cache - use RuntimeEnvBuilder with cache limit (no TTL needed for this test)
+        let runtime_with_cache = RuntimeEnvBuilder::new()
+            .with_object_list_cache_limit(1024 * 1024) // 1MB limit
+            .build_arc()?;
+        let session_with_cache = MockSession::with_runtime_env(runtime_with_cache);
+
+        // Test cases: (url, prefix, description)
+        let test_cases = vec![
+            ("/table/", None, "full table listing"),
+            (
+                "/table/",
+                Some(Path::from("year=2023")),
+                "single partition filter",
+            ),
+            (
+                "/table/",
+                Some(Path::from("year=2024")),
+                "different partition filter",
+            ),
+            (
+                "/table/",
+                Some(Path::from("year=2024/month=06")),
+                "nested partition filter",
+            ),
+            (
+                "/table/",
+                Some(Path::from("year=2025")),
+                "non-existent partition",
+            ),
+        ];
+
+        for (url_str, prefix, description) in test_cases {
+            let url = ListingTableUrl::parse(url_str)?;
+
+            // Get results WITHOUT cache (sorted for comparison)
+            let mut results_no_cache: Vec<String> = url
+                .list_prefixed_files(&session_no_cache, &store, prefix.clone(), "parquet")
+                .await?
+                .try_collect::<Vec<_>>()
+                .await?
+                .into_iter()
+                .map(|m| m.location.to_string())
+                .collect();
+            results_no_cache.sort();
+
+            // Get results WITH cache (first call - cache miss, sorted for comparison)
+            let mut results_with_cache_miss: Vec<String> = url
+                .list_prefixed_files(
+                    &session_with_cache,
+                    &store,
+                    prefix.clone(),
+                    "parquet",
+                )
+                .await?
+                .try_collect::<Vec<_>>()
+                .await?
+                .into_iter()
+                .map(|m| m.location.to_string())
+                .collect();
+            results_with_cache_miss.sort();
+
+            // Get results WITH cache (second call - cache hit, sorted for comparison)
+            let mut results_with_cache_hit: Vec<String> = url
+                .list_prefixed_files(&session_with_cache, &store, prefix, "parquet")
+                .await?
+                .try_collect::<Vec<_>>()
+                .await?
+                .into_iter()
+                .map(|m| m.location.to_string())
+                .collect();
+            results_with_cache_hit.sort();
+
+            // All three should contain the same files
+            assert_eq!(
+                results_no_cache, results_with_cache_miss,
+                "Cache miss path should match non-cached path for: {description}"
+            );
+            assert_eq!(
+                results_no_cache, results_with_cache_hit,
+                "Cache hit path should match non-cached path for: {description}"
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Tests that prefix queries can be served from a cached full-table listing
+    #[tokio::test]
+    async fn test_cache_serves_partition_from_full_listing() -> Result<()> {
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        let store = MockObjectStore {
+            in_mem: object_store::memory::InMemory::new(),
+            forbidden_paths: vec![],
+        };
+
+        // Create test files
+        create_file(&store, "/sales/region=US/q1.parquet").await;
+        create_file(&store, "/sales/region=US/q2.parquet").await;
+        create_file(&store, "/sales/region=EU/q1.parquet").await;
+
+        // Create session with cache (no TTL needed for this test)
+        let runtime = RuntimeEnvBuilder::new()
+            .with_object_list_cache_limit(1024 * 1024) // 1MB limit
+            .build_arc()?;
+        let session = MockSession::with_runtime_env(runtime);
+
+        let url = ListingTableUrl::parse("/sales/")?;
+
+        // First: query full table (populates cache)
+        let full_results: Vec<String> = url
+            .list_prefixed_files(&session, &store, None, "parquet")
+            .await?
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|m| m.location.to_string())
+            .collect();
+        assert_eq!(full_results.len(), 3);
+
+        // Second: query with prefix (should be served from cache)
+        let mut us_results: Vec<String> = url
+            .list_prefixed_files(
+                &session,
+                &store,
+                Some(Path::from("region=US")),
+                "parquet",
+            )
+            .await?
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|m| m.location.to_string())
+            .collect();
+        us_results.sort();
+
+        assert_eq!(
+            us_results,
+            vec!["sales/region=US/q1.parquet", "sales/region=US/q2.parquet"]
+        );
+
+        // Third: different prefix (also from cache)
+        let eu_results: Vec<String> = url
+            .list_prefixed_files(
+                &session,
+                &store,
+                Some(Path::from("region=EU")),
+                "parquet",
+            )
+            .await?
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|m| m.location.to_string())
+            .collect();
+
+        assert_eq!(eu_results, vec!["sales/region=EU/q1.parquet"]);
+
+        Ok(())
+    }
+
+    /// Creates a file with "hello world" content at the specified path
+    async fn create_file(object_store: &dyn ObjectStore, path: &str) {
+        object_store
+            .put(&Path::from(path), PutPayload::from_static(b"hello world"))
+            .await
+            .expect("failed to create test file");
+    }
+
+    /// Runs "list_prefixed_files"  with no prefix to list all files and returns their paths
+    ///
+    /// Panic's on error
+    async fn list_all_files(
+        url: &str,
+        store: &dyn ObjectStore,
+        file_extension: &str,
+    ) -> Result<Vec<String>> {
+        try_list_prefixed_files(url, store, None, file_extension).await
+    }
+
+    /// Runs "list_prefixed_files" and returns their paths
+    ///
+    /// Panic's on error
+    async fn list_prefixed_files(
+        url: &str,
+        store: &dyn ObjectStore,
+        prefix: Option<Path>,
+        file_extension: &str,
+    ) -> Result<Vec<String>> {
+        try_list_prefixed_files(url, store, prefix, file_extension).await
+    }
+
+    /// Runs "list_prefixed_files" and returns their paths
+    async fn try_list_prefixed_files(
+        url: &str,
+        store: &dyn ObjectStore,
+        prefix: Option<Path>,
+        file_extension: &str,
+    ) -> Result<Vec<String>> {
+        let session = MockSession::new();
+        let url = ListingTableUrl::parse(url)?;
+        let files = url
+            .list_prefixed_files(&session, store, prefix, file_extension)
+            .await?
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|meta| meta.location.as_ref().to_string())
+            .collect();
+        Ok(files)
+    }
+
+    #[derive(Debug)]
+    struct MockObjectStore {
+        in_mem: object_store::memory::InMemory,
+        forbidden_paths: Vec<Path>,
+    }
+
+    impl std::fmt::Display for MockObjectStore {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            self.in_mem.fmt(f)
+        }
+    }
+
+    #[async_trait]
+    impl ObjectStore for MockObjectStore {
+        async fn put_opts(
+            &self,
+            location: &Path,
+            payload: PutPayload,
+            opts: object_store::PutOptions,
+        ) -> object_store::Result<object_store::PutResult> {
+            self.in_mem.put_opts(location, payload, opts).await
+        }
+
+        async fn put_multipart_opts(
+            &self,
+            location: &Path,
+            opts: PutMultipartOptions,
+        ) -> object_store::Result<Box<dyn MultipartUpload>> {
+            self.in_mem.put_multipart_opts(location, opts).await
+        }
+
+        async fn get_opts(
+            &self,
+            location: &Path,
+            options: GetOptions,
+        ) -> object_store::Result<GetResult> {
+            if options.head && self.forbidden_paths.contains(location) {
+                Err(object_store::Error::PermissionDenied {
+                    path: location.to_string(),
+                    source: "forbidden".into(),
+                })
+            } else {
+                self.in_mem.get_opts(location, options).await
+            }
+        }
+
+        async fn get_ranges(
+            &self,
+            location: &Path,
+            ranges: &[Range<u64>],
+        ) -> object_store::Result<Vec<Bytes>> {
+            self.in_mem.get_ranges(location, ranges).await
+        }
+
+        fn delete_stream(
+            &self,
+            locations: BoxStream<'static, object_store::Result<Path>>,
+        ) -> BoxStream<'static, object_store::Result<Path>> {
+            self.in_mem.delete_stream(locations)
+        }
+
+        fn list(
+            &self,
+            prefix: Option<&Path>,
+        ) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
+            self.in_mem.list(prefix)
+        }
+
+        async fn list_with_delimiter(
+            &self,
+            prefix: Option<&Path>,
+        ) -> object_store::Result<ListResult> {
+            self.in_mem.list_with_delimiter(prefix).await
+        }
+
+        async fn copy_opts(
+            &self,
+            from: &Path,
+            to: &Path,
+            options: CopyOptions,
+        ) -> object_store::Result<()> {
+            self.in_mem.copy_opts(from, to, options).await
+        }
+    }
+
+    struct MockSession {
+        config: SessionConfig,
+        runtime_env: Arc<RuntimeEnv>,
+    }
+
+    impl MockSession {
+        fn new() -> Self {
+            Self {
+                config: SessionConfig::new(),
+                runtime_env: Arc::new(RuntimeEnv::default()),
+            }
+        }
+
+        /// Create a MockSession with a custom RuntimeEnv (for cache testing)
+        fn with_runtime_env(runtime_env: Arc<RuntimeEnv>) -> Self {
+            Self {
+                config: SessionConfig::new(),
+                runtime_env,
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl Session for MockSession {
+        fn session_id(&self) -> &str {
+            unimplemented!()
+        }
+
+        fn config(&self) -> &SessionConfig {
+            &self.config
+        }
+
+        async fn create_physical_plan(
+            &self,
+            _logical_plan: &LogicalPlan,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            unimplemented!()
+        }
+
+        fn create_physical_expr(
+            &self,
+            _expr: Expr,
+            _df_schema: &DFSchema,
+        ) -> Result<Arc<dyn PhysicalExpr>> {
+            unimplemented!()
+        }
+
+        fn scalar_functions(&self) -> &HashMap<String, Arc<ScalarUDF>> {
+            unimplemented!()
+        }
+
+        fn aggregate_functions(&self) -> &HashMap<String, Arc<AggregateUDF>> {
+            unimplemented!()
+        }
+
+        fn window_functions(&self) -> &HashMap<String, Arc<WindowUDF>> {
+            unimplemented!()
+        }
+
+        fn runtime_env(&self) -> &Arc<RuntimeEnv> {
+            &self.runtime_env
+        }
+
+        fn execution_props(&self) -> &ExecutionProps {
+            unimplemented!()
+        }
+
+        fn as_any(&self) -> &dyn Any {
+            unimplemented!()
+        }
+
+        fn table_options(&self) -> &TableOptions {
+            unimplemented!()
+        }
+
+        fn table_options_mut(&mut self) -> &mut TableOptions {
+            unimplemented!()
+        }
+
+        fn task_ctx(&self) -> Arc<TaskContext> {
+            unimplemented!()
+        }
+    }
 }
diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs
index 75fb557b63d2f..1648624747af2 100644
--- a/datafusion/datasource/src/write/demux.rs
+++ b/datafusion/datasource/src/write/demux.rs
@@ -28,21 +28,21 @@ use datafusion_common::error::Result;
 use datafusion_physical_plan::SendableRecordBatchStream;
 
 use arrow::array::{
-    builder::UInt64Builder, cast::AsArray, downcast_dictionary_array, ArrayAccessor,
-    RecordBatch, StringArray, StructArray,
+    ArrayAccessor, RecordBatch, StringArray, StructArray, builder::UInt64Builder,
+    cast::AsArray, downcast_dictionary_array,
 };
 use arrow::datatypes::{DataType, Schema};
 use datafusion_common::cast::{
     as_boolean_array, as_date32_array, as_date64_array, as_float16_array,
-    as_float32_array, as_float64_array, as_int16_array, as_int32_array, as_int64_array,
-    as_int8_array, as_string_array, as_string_view_array, as_uint16_array,
-    as_uint32_array, as_uint64_array, as_uint8_array,
+    as_float32_array, as_float64_array, as_int8_array, as_int16_array, as_int32_array,
+    as_int64_array, as_large_string_array, as_string_array, as_string_view_array,
+    as_uint8_array, as_uint16_array, as_uint32_array, as_uint64_array,
 };
-use datafusion_common::{exec_datafusion_err, not_impl_err, DataFusionError};
+use datafusion_common::{exec_datafusion_err, internal_datafusion_err, not_impl_err};
 use datafusion_common_runtime::SpawnedTask;
-use datafusion_execution::TaskContext;
 
 use chrono::NaiveDate;
+use datafusion_execution::TaskContext;
 use futures::StreamExt;
 use object_store::path::Path;
 use rand::distr::SampleString;
@@ -68,6 +68,11 @@ pub type DemuxedStreamReceiver = UnboundedReceiver<(Path, RecordBatchReceiver)>;
 /// be written with the extension from the path. Otherwise the default extension
 /// will be used and the output will be split into multiple files.
 ///
+/// Output file guarantees:
+///  - Partitioned files: Files are created only for non-empty partitions.
+///  - Single-file output: 1 file is always written, even when the stream is empty.
+///  - Multi-file output: Depending on the number of record batches, 0 or more files are written.
+///
 /// Examples of `base_output_path`
 ///  * `tmp/dataset/` -> is a folder since it ends in `/`
 ///  * `tmp/dataset` -> is still a folder since it does not end in `/` but has no valid file extension
@@ -101,8 +106,9 @@ pub(crate) fn start_demuxer_task(
     let file_extension = config.file_extension.clone();
     let base_output_path = config.table_paths[0].clone();
     let task = if config.table_partition_cols.is_empty() {
-        let single_file_output = !base_output_path.is_collection()
-            && base_output_path.file_extension().is_some();
+        let single_file_output = config
+            .file_output_mode
+            .single_file_output(&base_output_path);
         SpawnedTask::spawn(async move {
             row_count_demuxer(
                 tx,
@@ -171,7 +177,26 @@ async fn row_count_demuxer(
         max_rows_per_file
     };
 
+    if single_file_output {
+        // ensure we have one file open, even when the input stream is empty
+        open_file_streams.push(create_new_file_stream(
+            &base_output_path,
+            &write_id,
+            part_idx,
+            &file_extension,
+            single_file_output,
+            max_buffered_batches,
+            &mut tx,
+        )?);
+        row_counts.push(0);
+        part_idx += 1;
+    }
+
+    let schema = input.schema();
+    let mut is_batch_received = false;
+
     while let Some(rb) = input.next().await.transpose()? {
+        is_batch_received = true;
         // ensure we have at least minimum_parallel_files open
         if open_file_streams.len() < minimum_parallel_files {
             open_file_streams.push(create_new_file_stream(
@@ -203,13 +228,24 @@ async fn row_count_demuxer(
             .send(rb)
             .await
             .map_err(|_| {
-                DataFusionError::Execution(
-                    "Error sending RecordBatch to file stream!".into(),
-                )
+                exec_datafusion_err!("Error sending RecordBatch to file stream!")
             })?;
 
         next_send_steam = (next_send_steam + 1) % minimum_parallel_files;
     }
+
+    // if there is no batch send but with a single file, send an empty batch
+    if single_file_output && !is_batch_received {
+        open_file_streams
+            .first_mut()
+            .ok_or_else(|| internal_datafusion_err!("Expected a single output file"))?
+            .send(RecordBatch::new_empty(schema))
+            .await
+            .map_err(|_| {
+                exec_datafusion_err!("Error sending empty RecordBatch to file stream!")
+            })?;
+    }
+
     Ok(())
 }
 
@@ -248,9 +284,8 @@ fn create_new_file_stream(
         single_file_output,
     );
     let (tx_file, rx_file) = mpsc::channel(max_buffered_batches / 2);
-    tx.send((file_path, rx_file)).map_err(|_| {
-        DataFusionError::Execution("Error sending RecordBatch to file stream!".into())
-    })?;
+    tx.send((file_path, rx_file))
+        .map_err(|_| exec_datafusion_err!("Error sending RecordBatch to file stream!"))?;
     Ok(tx_file)
 }
 
@@ -279,7 +314,7 @@ async fn hive_style_partitions_demuxer(
         let all_partition_values = compute_partition_keys_by_row(&rb, &partition_by)?;
 
         // Next compute how the batch should be split up to take each distinct key to its own batch
-        let take_map = compute_take_arrays(&rb, all_partition_values);
+        let take_map = compute_take_arrays(&rb, &all_partition_values);
 
         // Divide up the batch into distinct partition key batches and send each batch
         for (part_key, mut builder) in take_map.into_iter() {
@@ -307,17 +342,13 @@ async fn hive_style_partitions_demuxer(
                     );
 
                     tx.send((file_path, part_rx)).map_err(|_| {
-                        DataFusionError::Execution(
-                            "Error sending new file stream!".into(),
-                        )
+                        exec_datafusion_err!("Error sending new file stream!")
                     })?;
 
                     value_map.insert(part_key.clone(), part_tx);
-                    value_map
-                        .get_mut(&part_key)
-                        .ok_or(DataFusionError::Internal(
-                            "Key must exist since it was just inserted!".into(),
-                        ))?
+                    value_map.get_mut(&part_key).ok_or_else(|| {
+                        exec_datafusion_err!("Key must exist since it was just inserted!")
+                    })?
                 }
             };
 
@@ -329,7 +360,7 @@ async fn hive_style_partitions_demuxer(
 
             // Finally send the partial batch partitioned by distinct value!
             part_tx.send(final_batch_to_send).await.map_err(|_| {
-                DataFusionError::Internal("Unexpected error sending parted batch!".into())
+                internal_datafusion_err!("Unexpected error sending parted batch!")
             })?;
         }
     }
@@ -367,6 +398,12 @@ fn compute_partition_keys_by_row<'a>(
                     partition_values.push(Cow::from(array.value(i)));
                 }
             }
+            DataType::LargeUtf8 => {
+                let array = as_large_string_array(col_array)?;
+                for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i)));
+                }
+            }
             DataType::Utf8View => {
                 let array = as_string_view_array(col_array)?;
                 for i in 0..rb.num_rows() {
@@ -489,9 +526,9 @@ fn compute_partition_keys_by_row<'a>(
             }
             _ => {
                 return not_impl_err!(
-                "it is not yet supported to write to hive partitions with datatype {}",
-                dtype
-            )
+                    "it is not yet supported to write to hive partitions with datatype {}",
+                    dtype
+                );
             }
         }
 
@@ -503,7 +540,7 @@ fn compute_partition_keys_by_row<'a>(
 
 fn compute_take_arrays(
     rb: &RecordBatch,
-    all_partition_values: Vec<Vec<Cow<str>>>,
+    all_partition_values: &[Vec<Cow<str>>],
 ) -> HashMap<Vec<String>, UInt64Builder> {
     let mut take_map = HashMap::new();
     for i in 0..rb.num_rows() {
diff --git a/datafusion/datasource/src/write/mod.rs b/datafusion/datasource/src/write/mod.rs
index 3694568682a5d..e8d2d17da8ee8 100644
--- a/datafusion/datasource/src/write/mod.rs
+++ b/datafusion/datasource/src/write/mod.rs
@@ -28,9 +28,9 @@ use datafusion_common::error::Result;
 use arrow::array::RecordBatch;
 use arrow::datatypes::Schema;
 use bytes::Bytes;
+use object_store::ObjectStore;
 use object_store::buffered::BufWriter;
 use object_store::path::Path;
-use object_store::ObjectStore;
 use tokio::io::AsyncWrite;
 
 pub mod demux;
@@ -131,6 +131,8 @@ pub struct ObjectWriterBuilder {
     object_store: Arc<dyn ObjectStore>,
     /// The size of the buffer for the object writer.
     buffer_size: Option<usize>,
+    /// The compression level for the object writer.
+    compression_level: Option<u32>,
 }
 
 impl ObjectWriterBuilder {
@@ -145,6 +147,7 @@ impl ObjectWriterBuilder {
             location: location.clone(),
             object_store,
             buffer_size: None,
+            compression_level: None,
         }
     }
 
@@ -162,7 +165,11 @@ impl ObjectWriterBuilder {
     /// # let object_store = Arc::new(InMemory::new());
     /// let mut builder = ObjectWriterBuilder::new(compression_type, &location, object_store);
     /// builder.set_buffer_size(Some(20 * 1024 * 1024)); //20 MiB
-    /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match");
+    /// assert_eq!(
+    ///     builder.get_buffer_size(),
+    ///     Some(20 * 1024 * 1024),
+    ///     "Internal error: Builder buffer size doesn't match"
+    /// );
     /// ```
     pub fn set_buffer_size(&mut self, buffer_size: Option<usize>) {
         self.buffer_size = buffer_size;
@@ -182,7 +189,11 @@ impl ObjectWriterBuilder {
     /// # let object_store = Arc::new(InMemory::new());
     /// let builder = ObjectWriterBuilder::new(compression_type, &location, object_store)
     ///     .with_buffer_size(Some(20 * 1024 * 1024)); //20 MiB
-    /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match");
+    /// assert_eq!(
+    ///     builder.get_buffer_size(),
+    ///     Some(20 * 1024 * 1024),
+    ///     "Internal error: Builder buffer size doesn't match"
+    /// );
     /// ```
     pub fn with_buffer_size(mut self, buffer_size: Option<usize>) -> Self {
         self.buffer_size = buffer_size;
@@ -194,6 +205,22 @@ impl ObjectWriterBuilder {
         self.buffer_size
     }
 
+    /// Set compression level for object writer.
+    pub fn set_compression_level(&mut self, compression_level: Option<u32>) {
+        self.compression_level = compression_level;
+    }
+
+    /// Set compression level for object writer, returning the builder.
+    pub fn with_compression_level(mut self, compression_level: Option<u32>) -> Self {
+        self.compression_level = compression_level;
+        self
+    }
+
+    /// Currently specified compression level.
+    pub fn get_compression_level(&self) -> Option<u32> {
+        self.compression_level
+    }
+
     /// Return a writer object that writes to the object store location.
     ///
     /// If a buffer size has not been set, the default buffer buffer size will
@@ -207,6 +234,7 @@ impl ObjectWriterBuilder {
             location,
             object_store,
             buffer_size,
+            compression_level,
         } = self;
 
         let buf_writer = match buffer_size {
@@ -214,6 +242,7 @@ impl ObjectWriterBuilder {
             None => BufWriter::new(object_store, location),
         };
 
-        file_compression_type.convert_async_writer(buf_writer)
+        file_compression_type
+            .convert_async_writer_with_level(buf_writer, compression_level)
     }
 }
diff --git a/datafusion/datasource/src/write/orchestration.rs b/datafusion/datasource/src/write/orchestration.rs
index a09509ac58626..39c91a1c0d676 100644
--- a/datafusion/datasource/src/write/orchestration.rs
+++ b/datafusion/datasource/src/write/orchestration.rs
@@ -27,7 +27,9 @@ use crate::file_compression_type::FileCompressionType;
 use datafusion_common::error::Result;
 
 use arrow::array::RecordBatch;
-use datafusion_common::{internal_datafusion_err, internal_err, DataFusionError};
+use datafusion_common::{
+    DataFusionError, exec_datafusion_err, internal_datafusion_err, internal_err,
+};
 use datafusion_common_runtime::{JoinSet, SpawnedTask};
 use datafusion_execution::TaskContext;
 
@@ -117,10 +119,8 @@ pub(crate) async fn serialize_rb_stream_to_object_store(
                     Err(e) => {
                         return SerializedRecordBatchResult::failure(
                             None,
-                            DataFusionError::Execution(format!(
-                                "Error writing to object store: {e}"
-                            )),
-                        )
+                            exec_datafusion_err!("Error writing to object store: {e}"),
+                        );
                     }
                 };
                 row_count += cnt;
@@ -133,9 +133,9 @@ pub(crate) async fn serialize_rb_stream_to_object_store(
                 // Handle task panic or cancellation
                 return SerializedRecordBatchResult::failure(
                     Some(writer),
-                    DataFusionError::Execution(format!(
+                    exec_datafusion_err!(
                         "Serialization task panicked or was cancelled: {e}"
-                    )),
+                    ),
                 );
             }
         }
@@ -148,7 +148,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store(
             return SerializedRecordBatchResult::failure(
                 Some(writer),
                 internal_datafusion_err!("Unknown error writing to object store"),
-            )
+            );
         }
     }
     SerializedRecordBatchResult::success(writer, row_count)
@@ -216,12 +216,20 @@ pub(crate) async fn stateless_serialize_and_write_files(
     }
 
     if any_errors {
-        match any_abort_errors{
-            true => return internal_err!("Error encountered during writing to ObjectStore and failed to abort all writers. Partial result may have been written."),
+        match any_abort_errors {
+            true => {
+                return internal_err!(
+                    "Error encountered during writing to ObjectStore and failed to abort all writers. Partial result may have been written."
+                );
+            }
             false => match triggering_error {
                 Some(e) => return Err(e),
-                None => return internal_err!("Unknown Error encountered during writing to ObjectStore. All writers successfully aborted.")
-            }
+                None => {
+                    return internal_err!(
+                        "Unknown Error encountered during writing to ObjectStore. All writers successfully aborted."
+                    );
+                }
+            },
         }
     }
 
@@ -240,6 +248,7 @@ pub async fn spawn_writer_tasks_and_join(
     context: &Arc<TaskContext>,
     serializer: Arc<dyn BatchSerializer>,
     compression: FileCompressionType,
+    compression_level: Option<u32>,
     object_store: Arc<dyn ObjectStore>,
     demux_task: SpawnedTask<Result<()>>,
     mut file_stream_rx: DemuxedStreamReceiver,
@@ -265,6 +274,7 @@ pub async fn spawn_writer_tasks_and_join(
                         .execution
                         .objectstore_writer_buffer_size,
                 ))
+                .with_compression_level(compression_level)
                 .build()?;
 
         if tx_file_bundle
@@ -285,8 +295,8 @@ pub async fn spawn_writer_tasks_and_join(
         write_coordinator_task.join_unwind(),
         demux_task.join_unwind()
     );
-    r1.map_err(DataFusionError::ExecutionJoin)??;
-    r2.map_err(DataFusionError::ExecutionJoin)??;
+    r1.map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+    r2.map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
 
     // Return total row count:
     rx_row_cnt.await.map_err(|_| {
diff --git a/datafusion/doc/Cargo.toml b/datafusion/doc/Cargo.toml
index fa316348a6daa..c1368c1531533 100644
--- a/datafusion/doc/Cargo.toml
+++ b/datafusion/doc/Cargo.toml
@@ -19,6 +19,7 @@
 name = "datafusion-doc"
 description = "Documentation module for DataFusion query engine"
 keywords = ["datafusion", "query", "sql"]
+readme = "README.md"
 version = { workspace = true }
 edition = { workspace = true }
 homepage = { workspace = true }
@@ -30,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
diff --git a/datafusion/doc/README.md b/datafusion/doc/README.md
new file mode 100644
index 0000000000000..f137a273e31ab
--- /dev/null
+++ b/datafusion/doc/README.md
@@ -0,0 +1,33 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Documentation
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that provides structures and macros
+for documenting user defined functions.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/doc/src/lib.rs b/datafusion/doc/src/lib.rs
index f9b916c2b3aba..591a5a62f3b20 100644
--- a/datafusion/doc/src/lib.rs
+++ b/datafusion/doc/src/lib.rs
@@ -15,21 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 
-#[allow(rustdoc::broken_intra_doc_links)]
-/// Documentation for use by [`ScalarUDFImpl`](ScalarUDFImpl),
-/// [`AggregateUDFImpl`](AggregateUDFImpl) and [`WindowUDFImpl`](WindowUDFImpl) functions.
+mod udaf;
+mod udf;
+mod udwf;
+
+pub use udaf::aggregate_doc_sections;
+pub use udf::scalar_doc_sections;
+pub use udwf::window_doc_sections;
+
+/// Documentation for use by `ScalarUDFImpl`, `AggregateUDFImpl` and `WindowUDFImpl` functions.
 ///
 /// See the [`DocumentationBuilder`] to create a new [`Documentation`] struct.
 ///
 /// The DataFusion [SQL function documentation] is automatically  generated from these structs.
-/// The name of the udf will be pulled from the [`ScalarUDFImpl::name`](ScalarUDFImpl::name),
-/// [`AggregateUDFImpl::name`](AggregateUDFImpl::name) or [`WindowUDFImpl::name`](WindowUDFImpl::name)
+/// The name of the udf will be pulled from the `ScalarUDFImpl::name`,
+/// `AggregateUDFImpl::name` or `WindowUDFImpl::name`
 /// function as appropriate.
 ///
 /// All strings in the documentation are required to be
@@ -39,7 +46,7 @@
 /// thus all text should be in English.
 ///
 /// [SQL function documentation]: https://datafusion.apache.org/user-guide/sql/index.html
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct Documentation {
     /// The section in the documentation where the UDF will be documented
     pub doc_section: DocSection,
@@ -158,7 +165,7 @@ impl Documentation {
     }
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct DocSection {
     /// True to include this doc section in the public
     /// documentation, false otherwise
@@ -212,15 +219,6 @@ pub struct DocumentationBuilder {
 }
 
 impl DocumentationBuilder {
-    #[allow(clippy::new_without_default)]
-    #[deprecated(
-        since = "44.0.0",
-        note = "please use `DocumentationBuilder::new_with_details` instead"
-    )]
-    pub fn new() -> Self {
-        Self::new_with_details(DocSection::default(), "<no description>", "<no example>")
-    }
-
     /// Creates a new [`DocumentationBuilder`] with all required fields
     pub fn new_with_details(
         doc_section: DocSection,
diff --git a/datafusion/doc/src/udaf.rs b/datafusion/doc/src/udaf.rs
new file mode 100644
index 0000000000000..c3a0b4adbcb1e
--- /dev/null
+++ b/datafusion/doc/src/udaf.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Aggregate UDF doc sections for use in public documentation
+pub mod aggregate_doc_sections {
+    use crate::DocSection;
+
+    pub fn doc_sections() -> Vec<DocSection> {
+        vec![
+            DOC_SECTION_GENERAL,
+            DOC_SECTION_STATISTICAL,
+            DOC_SECTION_APPROXIMATE,
+        ]
+    }
+
+    pub const DOC_SECTION_GENERAL: DocSection = DocSection {
+        include: true,
+        label: "General Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_STATISTICAL: DocSection = DocSection {
+        include: true,
+        label: "Statistical Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection {
+        include: true,
+        label: "Approximate Functions",
+        description: None,
+    };
+}
diff --git a/datafusion/doc/src/udf.rs b/datafusion/doc/src/udf.rs
new file mode 100644
index 0000000000000..d1f51d919478d
--- /dev/null
+++ b/datafusion/doc/src/udf.rs
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Scalar UDF doc sections for use in public documentation
+pub mod scalar_doc_sections {
+    use crate::DocSection;
+
+    pub fn doc_sections() -> Vec<DocSection> {
+        vec![
+            DOC_SECTION_MATH,
+            DOC_SECTION_CONDITIONAL,
+            DOC_SECTION_STRING,
+            DOC_SECTION_BINARY_STRING,
+            DOC_SECTION_REGEX,
+            DOC_SECTION_DATETIME,
+            DOC_SECTION_ARRAY,
+            DOC_SECTION_STRUCT,
+            DOC_SECTION_MAP,
+            DOC_SECTION_HASHING,
+            DOC_SECTION_UNION,
+            DOC_SECTION_OTHER,
+        ]
+    }
+
+    pub const fn doc_sections_const() -> &'static [DocSection] {
+        &[
+            DOC_SECTION_MATH,
+            DOC_SECTION_CONDITIONAL,
+            DOC_SECTION_STRING,
+            DOC_SECTION_BINARY_STRING,
+            DOC_SECTION_REGEX,
+            DOC_SECTION_DATETIME,
+            DOC_SECTION_ARRAY,
+            DOC_SECTION_STRUCT,
+            DOC_SECTION_MAP,
+            DOC_SECTION_HASHING,
+            DOC_SECTION_UNION,
+            DOC_SECTION_OTHER,
+        ]
+    }
+
+    pub const DOC_SECTION_MATH: DocSection = DocSection {
+        include: true,
+        label: "Math Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection {
+        include: true,
+        label: "Conditional Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_STRING: DocSection = DocSection {
+        include: true,
+        label: "String Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection {
+        include: true,
+        label: "Binary String Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_REGEX: DocSection = DocSection {
+        include: true,
+        label: "Regular Expression Functions",
+        description: Some(
+            r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions)
+regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
+(minus support for several features including look-around and backreferences).
+The following regular expression functions are supported:"#,
+        ),
+    };
+
+    pub const DOC_SECTION_DATETIME: DocSection = DocSection {
+        include: true,
+        label: "Time and Date Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_ARRAY: DocSection = DocSection {
+        include: true,
+        label: "Array Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_STRUCT: DocSection = DocSection {
+        include: true,
+        label: "Struct Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_MAP: DocSection = DocSection {
+        include: true,
+        label: "Map Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_HASHING: DocSection = DocSection {
+        include: true,
+        label: "Hashing Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_OTHER: DocSection = DocSection {
+        include: true,
+        label: "Other Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_UNION: DocSection = DocSection {
+        include: true,
+        label: "Union Functions",
+        description: Some(
+            "Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. Note: Not related to the SQL UNION operator",
+        ),
+    };
+}
diff --git a/datafusion/doc/src/udwf.rs b/datafusion/doc/src/udwf.rs
new file mode 100644
index 0000000000000..0257ce5ba66b5
--- /dev/null
+++ b/datafusion/doc/src/udwf.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Window UDF doc sections for use in public documentation
+pub mod window_doc_sections {
+    use crate::DocSection;
+
+    pub fn doc_sections() -> Vec<DocSection> {
+        vec![
+            DOC_SECTION_AGGREGATE,
+            DOC_SECTION_RANKING,
+            DOC_SECTION_ANALYTICAL,
+        ]
+    }
+
+    pub const DOC_SECTION_AGGREGATE: DocSection = DocSection {
+        include: true,
+        label: "Aggregate Functions",
+        description: Some("All aggregate functions can be used as window functions."),
+    };
+
+    pub const DOC_SECTION_RANKING: DocSection = DocSection {
+        include: true,
+        label: "Ranking Functions",
+        description: None,
+    };
+
+    pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection {
+        include: true,
+        label: "Analytical Functions",
+        description: None,
+    };
+}
diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml
index 5988d3a336602..06c84d8acb493 100644
--- a/datafusion/execution/Cargo.toml
+++ b/datafusion/execution/Cargo.toml
@@ -31,21 +31,39 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [lib]
 name = "datafusion_execution"
 
+[features]
+default = ["sql"]
+
+parquet_encryption = [
+    "parquet/encryption",
+]
+arrow_buffer_pool = [
+    "arrow-buffer/pool",
+]
+sql = []
+
 [dependencies]
 arrow = { workspace = true }
+arrow-buffer = { workspace = true }
+async-trait = { workspace = true }
 dashmap = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
-datafusion-expr = { workspace = true }
+datafusion-common = { workspace = true, default-features = false }
+datafusion-expr = { workspace = true, default-features = false }
+datafusion-physical-expr-common = { workspace = true, default-features = false }
 futures = { workspace = true }
 log = { workspace = true }
 object_store = { workspace = true, features = ["fs"] }
 parking_lot = { workspace = true }
+parquet = { workspace = true, optional = true }
 rand = { workspace = true }
 tempfile = { workspace = true }
 url = { workspace = true }
diff --git a/datafusion/execution/README.md b/datafusion/execution/README.md
index 8a03255ee4ad3..5b1528b0daab9 100644
--- a/datafusion/execution/README.md
+++ b/datafusion/execution/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Execution
+# Apache DataFusion Execution
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides execution runtime such as the memory pools and disk manager.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs
index c2403e34c6657..bd34c441bdbde 100644
--- a/datafusion/execution/src/cache/cache_manager.rs
+++ b/datafusion/execution/src/cache/cache_manager.rs
@@ -16,89 +16,564 @@
 // under the License.
 
 use crate::cache::CacheAccessor;
+use crate::cache::DefaultListFilesCache;
+use crate::cache::cache_unit::DefaultFilesMetadataCache;
+use crate::cache::list_files_cache::ListFilesEntry;
+use crate::cache::list_files_cache::TableScopedPath;
+use datafusion_common::TableReference;
+use datafusion_common::stats::Precision;
 use datafusion_common::{Result, Statistics};
-use object_store::path::Path;
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use object_store::ObjectMeta;
+use object_store::path::Path;
+use std::any::Any;
+use std::collections::HashMap;
 use std::fmt::{Debug, Formatter};
+use std::ops::Deref;
 use std::sync::Arc;
+use std::time::Duration;
+
+pub use super::list_files_cache::{
+    DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL,
+};
+
+/// Cached metadata for a file, including statistics and ordering.
+///
+/// This struct embeds the [`ObjectMeta`] used for cache validation,
+/// along with the cached statistics and ordering information.
+#[derive(Debug, Clone)]
+pub struct CachedFileMetadata {
+    /// File metadata used for cache validation (size, last_modified).
+    pub meta: ObjectMeta,
+    /// Cached statistics for the file, if available.
+    pub statistics: Arc<Statistics>,
+    /// Cached ordering for the file.
+    pub ordering: Option<LexOrdering>,
+}
+
+impl CachedFileMetadata {
+    /// Create a new cached file metadata entry.
+    pub fn new(
+        meta: ObjectMeta,
+        statistics: Arc<Statistics>,
+        ordering: Option<LexOrdering>,
+    ) -> Self {
+        Self {
+            meta,
+            statistics,
+            ordering,
+        }
+    }
+
+    /// Check if this cached entry is still valid for the given metadata.
+    ///
+    /// Returns true if the file size and last modified time match.
+    pub fn is_valid_for(&self, current_meta: &ObjectMeta) -> bool {
+        self.meta.size == current_meta.size
+            && self.meta.last_modified == current_meta.last_modified
+    }
+}
+
+/// A cache for file statistics and orderings.
+///
+/// This cache stores [`CachedFileMetadata`] which includes:
+/// - File metadata for validation (size, last_modified)
+/// - Statistics for the file
+/// - Ordering information for the file
+///
+/// If enabled via [`CacheManagerConfig::with_files_statistics_cache`] this
+/// cache avoids inferring the same file statistics repeatedly during the
+/// session lifetime.
+///
+/// The typical usage pattern is:
+/// 1. Call `get(path)` to check for cached value
+/// 2. If `Some(cached)`, validate with `cached.is_valid_for(&current_meta)`
+/// 3. If invalid or missing, compute new value and call `put(path, new_value)`
+///
+/// See [`crate::runtime_env::RuntimeEnv`] for more details
+pub trait FileStatisticsCache: CacheAccessor<Path, CachedFileMetadata> {
+    /// Retrieves the information about the entries currently cached.
+    fn list_entries(&self) -> HashMap<Path, FileStatisticsCacheEntry>;
+}
+
+/// Represents information about a cached statistics entry.
+/// This is used to expose the statistics cache contents to outside modules.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct FileStatisticsCacheEntry {
+    pub object_meta: ObjectMeta,
+    /// Number of table rows.
+    pub num_rows: Precision<usize>,
+    /// Number of table columns.
+    pub num_columns: usize,
+    /// Total table size, in bytes.
+    pub table_size_bytes: Precision<usize>,
+    /// Size of the statistics entry, in bytes.
+    pub statistics_size_bytes: usize,
+    /// Whether ordering information is cached for this file.
+    pub has_ordering: bool,
+}
+
+/// Cached file listing.
+///
+/// TTL expiration is handled internally by the cache implementation.
+#[derive(Debug, Clone, PartialEq)]
+pub struct CachedFileList {
+    /// The cached file list.
+    pub files: Arc<Vec<ObjectMeta>>,
+}
+
+impl CachedFileList {
+    /// Create a new cached file list.
+    pub fn new(files: Vec<ObjectMeta>) -> Self {
+        Self {
+            files: Arc::new(files),
+        }
+    }
+
+    /// Filter the files by prefix.
+    fn filter_by_prefix(&self, prefix: &Option<Path>) -> Vec<ObjectMeta> {
+        match prefix {
+            Some(prefix) => self
+                .files
+                .iter()
+                .filter(|meta| meta.location.as_ref().starts_with(prefix.as_ref()))
+                .cloned()
+                .collect(),
+            None => self.files.as_ref().clone(),
+        }
+    }
+
+    /// Returns files matching the given prefix.
+    ///
+    /// When prefix is `None`, returns a clone of the `Arc` (no data copy).
+    /// When filtering is needed, returns a new `Arc` with filtered results (clones each matching [`ObjectMeta`]).
+    pub fn files_matching_prefix(&self, prefix: &Option<Path>) -> Arc<Vec<ObjectMeta>> {
+        match prefix {
+            None => Arc::clone(&self.files),
+            Some(p) => Arc::new(self.filter_by_prefix(&Some(p.clone()))),
+        }
+    }
+}
+
+impl Deref for CachedFileList {
+    type Target = Arc<Vec<ObjectMeta>>;
+    fn deref(&self) -> &Self::Target {
+        &self.files
+    }
+}
+
+impl From<Vec<ObjectMeta>> for CachedFileList {
+    fn from(files: Vec<ObjectMeta>) -> Self {
+        Self::new(files)
+    }
+}
+
+/// Cache for storing the [`ObjectMeta`]s that result from listing a path
+///
+/// Listing a path means doing an object store "list" operation or `ls`
+/// command on the local filesystem. This operation can be expensive,
+/// especially when done over remote object stores.
+///
+/// The cache key is always the table's base path, ensuring a stable cache key.
+/// The cached value is a [`CachedFileList`] containing the files and a timestamp.
+///
+/// Partition filtering is done after retrieval using [`CachedFileList::files_matching_prefix`].
+///
+/// See [`crate::runtime_env::RuntimeEnv`] for more details.
+pub trait ListFilesCache: CacheAccessor<TableScopedPath, CachedFileList> {
+    /// Returns the cache's memory limit in bytes.
+    fn cache_limit(&self) -> usize;
+
+    /// Returns the TTL (time-to-live) for cache entries, if configured.
+    fn cache_ttl(&self) -> Option<Duration>;
+
+    /// Updates the cache with a new memory limit in bytes.
+    fn update_cache_limit(&self, limit: usize);
+
+    /// Updates the cache with a new TTL (time-to-live).
+    fn update_cache_ttl(&self, ttl: Option<Duration>);
+
+    /// Retrieves the information about the entries currently cached.
+    fn list_entries(&self) -> HashMap<TableScopedPath, ListFilesEntry>;
+
+    /// Drop all entries for the given table reference.
+    fn drop_table_entries(&self, table_ref: &Option<TableReference>) -> Result<()>;
+}
+
+/// Generic file-embedded metadata used with [`FileMetadataCache`].
+///
+/// For example, Parquet footers and page metadata can be represented
+/// using this trait.
+///
+/// See [`crate::runtime_env::RuntimeEnv`] for more details
+pub trait FileMetadata: Any + Send + Sync {
+    /// Returns the file metadata as [`Any`] so that it can be downcast to a specific
+    /// implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// Returns the size of the metadata in bytes.
+    fn memory_size(&self) -> usize;
+
+    /// Returns extra information about this entry (used by [`FileMetadataCache::list_entries`]).
+    fn extra_info(&self) -> HashMap<String, String>;
+}
 
-/// The cache of listing files statistics.
-/// if set [`CacheManagerConfig::with_files_statistics_cache`]
-/// Will avoid infer same file statistics repeatedly during the session lifetime,
-/// this cache will store in [`crate::runtime_env::RuntimeEnv`].
-pub type FileStatisticsCache =
-    Arc<dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta>>;
+/// Cached file metadata entry with validation information.
+#[derive(Clone)]
+pub struct CachedFileMetadataEntry {
+    /// File metadata used for cache validation (size, last_modified).
+    pub meta: ObjectMeta,
+    /// The cached file metadata.
+    pub file_metadata: Arc<dyn FileMetadata>,
+}
+
+impl CachedFileMetadataEntry {
+    /// Create a new cached file metadata entry.
+    pub fn new(meta: ObjectMeta, file_metadata: Arc<dyn FileMetadata>) -> Self {
+        Self {
+            meta,
+            file_metadata,
+        }
+    }
 
-pub type ListFilesCache =
-    Arc<dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>>;
+    /// Check if this cached entry is still valid for the given metadata.
+    pub fn is_valid_for(&self, current_meta: &ObjectMeta) -> bool {
+        self.meta.size == current_meta.size
+            && self.meta.last_modified == current_meta.last_modified
+    }
+}
 
-impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
+impl Debug for CachedFileMetadataEntry {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CachedFileMetadataEntry")
+            .field("meta", &self.meta)
+            .field("memory_size", &self.file_metadata.memory_size())
+            .finish()
+    }
+}
+
+/// Cache for file-embedded metadata.
+///
+/// This cache stores per-file metadata in the form of [`CachedFileMetadataEntry`],
+/// which includes the [`ObjectMeta`] for validation.
+///
+/// For example, the built in [`ListingTable`] uses this cache to avoid parsing
+/// Parquet footers multiple times for the same file.
+///
+/// DataFusion provides a default implementation, [`DefaultFilesMetadataCache`],
+/// and users can also provide their own implementations to implement custom
+/// caching strategies.
+///
+/// The typical usage pattern is:
+/// 1. Call `get(path)` to check for cached value
+/// 2. If `Some(cached)`, validate with `cached.is_valid_for(&current_meta)`
+/// 3. If invalid or missing, compute new value and call `put(path, new_value)`
+///
+/// See [`crate::runtime_env::RuntimeEnv`] for more details.
+///
+/// [`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
+pub trait FileMetadataCache: CacheAccessor<Path, CachedFileMetadataEntry> {
+    /// Returns the cache's memory limit in bytes.
+    fn cache_limit(&self) -> usize;
+
+    /// Updates the cache with a new memory limit in bytes.
+    fn update_cache_limit(&self, limit: usize);
+
+    /// Retrieves the information about the entries currently cached.
+    fn list_entries(&self) -> HashMap<Path, FileMetadataCacheEntry>;
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+/// Represents information about a cached metadata entry.
+/// This is used to expose the metadata cache contents to outside modules.
+pub struct FileMetadataCacheEntry {
+    pub object_meta: ObjectMeta,
+    /// Size of the cached metadata, in bytes.
+    pub size_bytes: usize,
+    /// Number of times this entry was retrieved.
+    pub hits: usize,
+    /// Additional object-specific information.
+    pub extra: HashMap<String, String>,
+}
+
+impl Debug for dyn FileStatisticsCache {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Cache name: {} with length: {}", self.name(), self.len())
+    }
+}
+
+impl Debug for dyn ListFilesCache {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(f, "Cache name: {} with length: {}", self.name(), self.len())
     }
 }
 
-impl Debug for dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta> {
+impl Debug for dyn FileMetadataCache {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(f, "Cache name: {} with length: {}", self.name(), self.len())
     }
 }
 
-#[derive(Default, Debug)]
+/// Manages various caches used in DataFusion.
+///
+/// Following DataFusion design principles, DataFusion provides default cache
+/// implementations, while also allowing users to provide their own custom cache
+/// implementations by implementing the relevant traits.
+///
+/// See [`CacheManagerConfig`] for configuration options.
+#[derive(Debug)]
 pub struct CacheManager {
-    file_statistic_cache: Option<FileStatisticsCache>,
-    list_files_cache: Option<ListFilesCache>,
+    file_statistic_cache: Option<Arc<dyn FileStatisticsCache>>,
+    list_files_cache: Option<Arc<dyn ListFilesCache>>,
+    file_metadata_cache: Arc<dyn FileMetadataCache>,
 }
 
 impl CacheManager {
     pub fn try_new(config: &CacheManagerConfig) -> Result<Arc<Self>> {
-        let mut manager = CacheManager::default();
-        if let Some(cc) = &config.table_files_statistics_cache {
-            manager.file_statistic_cache = Some(Arc::clone(cc))
-        }
-        if let Some(lc) = &config.list_files_cache {
-            manager.list_files_cache = Some(Arc::clone(lc))
-        }
-        Ok(Arc::new(manager))
+        let file_statistic_cache =
+            config.table_files_statistics_cache.as_ref().map(Arc::clone);
+
+        let list_files_cache = match &config.list_files_cache {
+            Some(lfc) if config.list_files_cache_limit > 0 => {
+                // the cache memory limit or ttl might have changed, ensure they are updated
+                lfc.update_cache_limit(config.list_files_cache_limit);
+                // Only update TTL if explicitly set in config, otherwise preserve the cache's existing TTL
+                if let Some(ttl) = config.list_files_cache_ttl {
+                    lfc.update_cache_ttl(Some(ttl));
+                }
+                Some(Arc::clone(lfc))
+            }
+            None if config.list_files_cache_limit > 0 => {
+                let lfc: Arc<dyn ListFilesCache> = Arc::new(DefaultListFilesCache::new(
+                    config.list_files_cache_limit,
+                    config.list_files_cache_ttl,
+                ));
+                Some(lfc)
+            }
+            _ => None,
+        };
+
+        let file_metadata_cache = config
+            .file_metadata_cache
+            .as_ref()
+            .map(Arc::clone)
+            .unwrap_or_else(|| {
+                Arc::new(DefaultFilesMetadataCache::new(config.metadata_cache_limit))
+            });
+
+        // the cache memory limit might have changed, ensure the limit is updated
+        file_metadata_cache.update_cache_limit(config.metadata_cache_limit);
+
+        Ok(Arc::new(CacheManager {
+            file_statistic_cache,
+            list_files_cache,
+            file_metadata_cache,
+        }))
     }
 
     /// Get the cache of listing files statistics.
-    pub fn get_file_statistic_cache(&self) -> Option<FileStatisticsCache> {
+    pub fn get_file_statistic_cache(&self) -> Option<Arc<dyn FileStatisticsCache>> {
         self.file_statistic_cache.clone()
     }
 
-    /// Get the cache of objectMeta under same path.
-    pub fn get_list_files_cache(&self) -> Option<ListFilesCache> {
+    /// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path.
+    pub fn get_list_files_cache(&self) -> Option<Arc<dyn ListFilesCache>> {
         self.list_files_cache.clone()
     }
+
+    /// Get the memory limit of the list files cache.
+    pub fn get_list_files_cache_limit(&self) -> usize {
+        self.list_files_cache
+            .as_ref()
+            .map_or(0, |c| c.cache_limit())
+    }
+
+    /// Get the TTL (time-to-live) of the list files cache.
+    pub fn get_list_files_cache_ttl(&self) -> Option<Duration> {
+        self.list_files_cache.as_ref().and_then(|c| c.cache_ttl())
+    }
+
+    /// Get the file embedded metadata cache.
+    pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> {
+        Arc::clone(&self.file_metadata_cache)
+    }
+
+    /// Get the limit of the file embedded metadata cache.
+    pub fn get_metadata_cache_limit(&self) -> usize {
+        self.file_metadata_cache.cache_limit()
+    }
 }
 
-#[derive(Clone, Default)]
+pub const DEFAULT_METADATA_CACHE_LIMIT: usize = 50 * 1024 * 1024; // 50M
+
+#[derive(Clone)]
 pub struct CacheManagerConfig {
-    /// Enable cache of files statistics when listing files.
-    /// Avoid get same file statistics repeatedly in same datafusion session.
-    /// Default is disable. Fow now only supports Parquet files.
-    pub table_files_statistics_cache: Option<FileStatisticsCache>,
-    /// Enable cache of file metadata when listing files.
-    /// This setting avoids listing file meta of the same path repeatedly
-    /// in same session, which may be expensive in certain situations (e.g. remote object storage).
+    /// Enable caching of file statistics when listing files.
+    /// Enabling the cache avoids repeatedly reading file statistics in a DataFusion session.
+    /// Default is disabled. Currently only Parquet files are supported.
+    pub table_files_statistics_cache: Option<Arc<dyn FileStatisticsCache>>,
+    /// Enable caching of file metadata when listing files.
+    /// Enabling the cache avoids repeat list and object metadata fetch operations, which may be
+    /// expensive in certain situations (e.g. remote object storage), for objects under paths that
+    /// are cached.
     /// Note that if this option is enabled, DataFusion will not see any updates to the underlying
-    /// location.  
-    /// Default is disable.
-    pub list_files_cache: Option<ListFilesCache>,
+    /// storage for at least `list_files_cache_ttl` duration.
+    /// Default is disabled.
+    pub list_files_cache: Option<Arc<dyn ListFilesCache>>,
+    /// Limit of the `list_files_cache`, in bytes. Default: 1MiB.
+    pub list_files_cache_limit: usize,
+    /// The duration the list files cache will consider an entry valid after insertion. Note that
+    /// changes to the underlying storage system, such as adding or removing data, will not be
+    /// visible until an entry expires. Default: None (infinite).
+    pub list_files_cache_ttl: Option<Duration>,
+    /// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a
+    /// data file (e.g., Parquet footer and page metadata).
+    /// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
+    pub file_metadata_cache: Option<Arc<dyn FileMetadataCache>>,
+    /// Limit of the file-embedded metadata cache, in bytes.
+    pub metadata_cache_limit: usize,
+}
+
+impl Default for CacheManagerConfig {
+    fn default() -> Self {
+        Self {
+            table_files_statistics_cache: Default::default(),
+            list_files_cache: Default::default(),
+            list_files_cache_limit: DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT,
+            list_files_cache_ttl: DEFAULT_LIST_FILES_CACHE_TTL,
+            file_metadata_cache: Default::default(),
+            metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT,
+        }
+    }
 }
 
 impl CacheManagerConfig {
+    /// Set the cache for files statistics.
+    ///
+    /// Default is `None` (disabled).
     pub fn with_files_statistics_cache(
         mut self,
-        cache: Option<FileStatisticsCache>,
+        cache: Option<Arc<dyn FileStatisticsCache>>,
     ) -> Self {
         self.table_files_statistics_cache = cache;
         self
     }
 
-    pub fn with_list_files_cache(mut self, cache: Option<ListFilesCache>) -> Self {
+    /// Set the cache for listing files.
+    ///
+    /// Default is `None` (disabled).
+    pub fn with_list_files_cache(
+        mut self,
+        cache: Option<Arc<dyn ListFilesCache>>,
+    ) -> Self {
         self.list_files_cache = cache;
         self
     }
+
+    /// Sets the limit of the list files cache, in bytes.
+    ///
+    /// Default: 1MiB (1,048,576 bytes).
+    pub fn with_list_files_cache_limit(mut self, limit: usize) -> Self {
+        self.list_files_cache_limit = limit;
+        self
+    }
+
+    /// Sets the TTL (time-to-live) for entries in the list files cache.
+    ///
+    /// Default: None (infinite).
+    pub fn with_list_files_cache_ttl(mut self, ttl: Option<Duration>) -> Self {
+        self.list_files_cache_ttl = ttl;
+        self
+    }
+
+    /// Sets the cache for file-embedded metadata.
+    ///
+    /// Default is a [`DefaultFilesMetadataCache`].
+    pub fn with_file_metadata_cache(
+        mut self,
+        cache: Option<Arc<dyn FileMetadataCache>>,
+    ) -> Self {
+        self.file_metadata_cache = cache;
+        self
+    }
+
+    /// Sets the limit of the file-embedded metadata cache, in bytes.
+    pub fn with_metadata_cache_limit(mut self, limit: usize) -> Self {
+        self.metadata_cache_limit = limit;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::cache::DefaultListFilesCache;
+
+    /// Test to verify that TTL is preserved when not explicitly set in config.
+    /// This fixes issue #19396 where TTL was being unset from DefaultListFilesCache
+    /// when CacheManagerConfig::list_files_cache_ttl was not set explicitly.
+    #[test]
+    fn test_ttl_preserved_when_not_set_in_config() {
+        use std::time::Duration;
+
+        // Create a cache with TTL = 1 second
+        let list_file_cache =
+            DefaultListFilesCache::new(1024, Some(Duration::from_secs(1)));
+
+        // Verify the cache has TTL set initially
+        assert_eq!(
+            list_file_cache.cache_ttl(),
+            Some(Duration::from_secs(1)),
+            "Cache should have TTL = 1 second initially"
+        );
+
+        // Put cache in config WITHOUT setting list_files_cache_ttl
+        let config = CacheManagerConfig::default()
+            .with_list_files_cache(Some(Arc::new(list_file_cache)));
+
+        // Create CacheManager from config
+        let cache_manager = CacheManager::try_new(&config).unwrap();
+
+        // Verify TTL is preserved (not unset)
+        let cache_ttl = cache_manager.get_list_files_cache().unwrap().cache_ttl();
+
+        assert!(
+            cache_ttl.is_some(),
+            "TTL should be preserved when not set in config. Expected Some(Duration::from_secs(1)), got {cache_ttl:?}"
+        );
+
+        // Verify it's the correct TTL value
+        assert_eq!(
+            cache_ttl,
+            Some(Duration::from_secs(1)),
+            "TTL should be exactly 1 second"
+        );
+    }
+
+    /// Test to verify that TTL can still be overridden when explicitly set in config.
+    #[test]
+    fn test_ttl_overridden_when_set_in_config() {
+        use std::time::Duration;
+
+        // Create a cache with TTL = 1 second
+        let list_file_cache =
+            DefaultListFilesCache::new(1024, Some(Duration::from_secs(1)));
+
+        // Put cache in config WITH a different TTL set
+        let config = CacheManagerConfig::default()
+            .with_list_files_cache(Some(Arc::new(list_file_cache)))
+            .with_list_files_cache_ttl(Some(Duration::from_secs(60)));
+
+        // Create CacheManager from config
+        let cache_manager = CacheManager::try_new(&config).unwrap();
+
+        // Verify TTL is overridden to the config value
+        let cache_ttl = cache_manager.get_list_files_cache().unwrap().cache_ttl();
+
+        assert_eq!(
+            cache_ttl,
+            Some(Duration::from_secs(60)),
+            "TTL should be overridden to 60 seconds when set in config"
+        );
+    }
 }
diff --git a/datafusion/execution/src/cache/cache_unit.rs b/datafusion/execution/src/cache/cache_unit.rs
index a9291659a3efa..d98d23821ec7f 100644
--- a/datafusion/execution/src/cache/cache_unit.rs
+++ b/datafusion/execution/src/cache/cache_unit.rs
@@ -15,221 +15,402 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::sync::Arc;
+use std::collections::HashMap;
 
 use crate::cache::CacheAccessor;
-
-use datafusion_common::Statistics;
+use crate::cache::cache_manager::{
+    CachedFileMetadata, FileStatisticsCache, FileStatisticsCacheEntry,
+};
 
 use dashmap::DashMap;
 use object_store::path::Path;
-use object_store::ObjectMeta;
 
-/// Collected statistics for files
-/// Cache is invalided when file size or last modification has changed
+pub use crate::cache::DefaultFilesMetadataCache;
+
+/// Default implementation of [`FileStatisticsCache`]
+///
+/// Stores cached file metadata (statistics and orderings) for files.
+///
+/// The typical usage pattern is:
+/// 1. Call `get(path)` to check for cached value
+/// 2. If `Some(cached)`, validate with `cached.is_valid_for(&current_meta)`
+/// 3. If invalid or missing, compute new value and call `put(path, new_value)`
+///
+/// Uses DashMap for lock-free concurrent access.
+///
+/// [`FileStatisticsCache`]: crate::cache::cache_manager::FileStatisticsCache
 #[derive(Default)]
 pub struct DefaultFileStatisticsCache {
-    statistics: DashMap<Path, (ObjectMeta, Arc<Statistics>)>,
+    cache: DashMap<Path, CachedFileMetadata>,
 }
 
-impl CacheAccessor<Path, Arc<Statistics>> for DefaultFileStatisticsCache {
-    type Extra = ObjectMeta;
-
-    /// Get `Statistics` for file location.
-    fn get(&self, k: &Path) -> Option<Arc<Statistics>> {
-        self.statistics
-            .get(k)
-            .map(|s| Some(Arc::clone(&s.value().1)))
-            .unwrap_or(None)
-    }
-
-    /// Get `Statistics` for file location. Returns None if file has changed or not found.
-    fn get_with_extra(&self, k: &Path, e: &Self::Extra) -> Option<Arc<Statistics>> {
-        self.statistics
-            .get(k)
-            .map(|s| {
-                let (saved_meta, statistics) = s.value();
-                if saved_meta.size != e.size
-                    || saved_meta.last_modified != e.last_modified
-                {
-                    // file has changed
-                    None
-                } else {
-                    Some(Arc::clone(statistics))
-                }
-            })
-            .unwrap_or(None)
-    }
-
-    /// Save collected file statistics
-    fn put(&self, _key: &Path, _value: Arc<Statistics>) -> Option<Arc<Statistics>> {
-        panic!("Put cache in DefaultFileStatisticsCache without Extra not supported.")
+impl CacheAccessor<Path, CachedFileMetadata> for DefaultFileStatisticsCache {
+    fn get(&self, key: &Path) -> Option<CachedFileMetadata> {
+        self.cache.get(key).map(|entry| entry.value().clone())
     }
 
-    fn put_with_extra(
-        &self,
-        key: &Path,
-        value: Arc<Statistics>,
-        e: &Self::Extra,
-    ) -> Option<Arc<Statistics>> {
-        self.statistics
-            .insert(key.clone(), (e.clone(), value))
-            .map(|x| x.1)
+    fn put(&self, key: &Path, value: CachedFileMetadata) -> Option<CachedFileMetadata> {
+        self.cache.insert(key.clone(), value)
     }
 
-    fn remove(&mut self, k: &Path) -> Option<Arc<Statistics>> {
-        self.statistics.remove(k).map(|x| x.1 .1)
+    fn remove(&self, k: &Path) -> Option<CachedFileMetadata> {
+        self.cache.remove(k).map(|(_, entry)| entry)
     }
 
     fn contains_key(&self, k: &Path) -> bool {
-        self.statistics.contains_key(k)
+        self.cache.contains_key(k)
     }
 
     fn len(&self) -> usize {
-        self.statistics.len()
+        self.cache.len()
     }
 
     fn clear(&self) {
-        self.statistics.clear()
+        self.cache.clear();
     }
+
     fn name(&self) -> String {
         "DefaultFileStatisticsCache".to_string()
     }
 }
 
-/// Collected files metadata for listing files.
-/// Cache will not invalided until user call remove or clear.
-#[derive(Default)]
-pub struct DefaultListFilesCache {
-    statistics: DashMap<Path, Arc<Vec<ObjectMeta>>>,
+impl FileStatisticsCache for DefaultFileStatisticsCache {
+    fn list_entries(&self) -> HashMap<Path, FileStatisticsCacheEntry> {
+        let mut entries = HashMap::<Path, FileStatisticsCacheEntry>::new();
+
+        for entry in self.cache.iter() {
+            let path = entry.key();
+            let cached = entry.value();
+            entries.insert(
+                path.clone(),
+                FileStatisticsCacheEntry {
+                    object_meta: cached.meta.clone(),
+                    num_rows: cached.statistics.num_rows,
+                    num_columns: cached.statistics.column_statistics.len(),
+                    table_size_bytes: cached.statistics.total_byte_size,
+                    statistics_size_bytes: 0, // TODO: set to the real size in the future
+                    has_ordering: cached.ordering.is_some(),
+                },
+            );
+        }
+
+        entries
+    }
 }
 
-impl CacheAccessor<Path, Arc<Vec<ObjectMeta>>> for DefaultListFilesCache {
-    type Extra = ObjectMeta;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::cache::CacheAccessor;
+    use crate::cache::cache_manager::{
+        CachedFileMetadata, FileStatisticsCache, FileStatisticsCacheEntry,
+    };
+    use arrow::array::RecordBatch;
+    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+    use chrono::DateTime;
+    use datafusion_common::Statistics;
+    use datafusion_common::stats::Precision;
+    use datafusion_expr::ColumnarValue;
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use object_store::ObjectMeta;
+    use object_store::path::Path;
+    use std::sync::Arc;
 
-    fn get(&self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
-        self.statistics.get(k).map(|x| Arc::clone(x.value()))
+    fn create_test_meta(path: &str, size: u64) -> ObjectMeta {
+        ObjectMeta {
+            location: Path::from(path),
+            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
+                .unwrap()
+                .into(),
+            size,
+            e_tag: None,
+            version: None,
+        }
     }
 
-    fn get_with_extra(
-        &self,
-        _k: &Path,
-        _e: &Self::Extra,
-    ) -> Option<Arc<Vec<ObjectMeta>>> {
-        panic!("Not supported DefaultListFilesCache get_with_extra")
-    }
+    #[test]
+    fn test_statistics_cache() {
+        let meta = create_test_meta("test", 1024);
+        let cache = DefaultFileStatisticsCache::default();
 
-    fn put(
-        &self,
-        key: &Path,
-        value: Arc<Vec<ObjectMeta>>,
-    ) -> Option<Arc<Vec<ObjectMeta>>> {
-        self.statistics.insert(key.clone(), value)
-    }
+        let schema = Schema::new(vec![Field::new(
+            "test_column",
+            DataType::Timestamp(TimeUnit::Second, None),
+            false,
+        )]);
 
-    fn put_with_extra(
-        &self,
-        _key: &Path,
-        _value: Arc<Vec<ObjectMeta>>,
-        _e: &Self::Extra,
-    ) -> Option<Arc<Vec<ObjectMeta>>> {
-        panic!("Not supported DefaultListFilesCache put_with_extra")
-    }
+        // Cache miss
+        assert!(cache.get(&meta.location).is_none());
 
-    fn remove(&mut self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
-        self.statistics.remove(k).map(|x| x.1)
+        // Put a value
+        let cached_value = CachedFileMetadata::new(
+            meta.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None,
+        );
+        cache.put(&meta.location, cached_value);
+
+        // Cache hit
+        let result = cache.get(&meta.location);
+        assert!(result.is_some());
+        let cached = result.unwrap();
+        assert!(cached.is_valid_for(&meta));
+
+        // File size changed - validation should fail
+        let meta2 = create_test_meta("test", 2048);
+        let cached = cache.get(&meta2.location).unwrap();
+        assert!(!cached.is_valid_for(&meta2));
+
+        // Update with new value
+        let cached_value2 = CachedFileMetadata::new(
+            meta2.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None,
+        );
+        cache.put(&meta2.location, cached_value2);
+
+        // Test list_entries
+        let entries = cache.list_entries();
+        assert_eq!(entries.len(), 1);
+        let entry = entries.get(&Path::from("test")).unwrap();
+        assert_eq!(entry.object_meta.size, 2048); // Should be updated value
     }
 
-    fn contains_key(&self, k: &Path) -> bool {
-        self.statistics.contains_key(k)
+    #[derive(Clone, Debug, PartialEq, Eq, Hash)]
+    struct MockExpr {}
+
+    impl std::fmt::Display for MockExpr {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "MockExpr")
+        }
     }
 
-    fn len(&self) -> usize {
-        self.statistics.len()
+    impl PhysicalExpr for MockExpr {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn data_type(
+            &self,
+            _input_schema: &Schema,
+        ) -> datafusion_common::Result<DataType> {
+            Ok(DataType::Int32)
+        }
+
+        fn nullable(&self, _input_schema: &Schema) -> datafusion_common::Result<bool> {
+            Ok(false)
+        }
+
+        fn evaluate(
+            &self,
+            _batch: &RecordBatch,
+        ) -> datafusion_common::Result<ColumnarValue> {
+            unimplemented!()
+        }
+
+        fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+            vec![]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            children: Vec<Arc<dyn PhysicalExpr>>,
+        ) -> datafusion_common::Result<Arc<dyn PhysicalExpr>> {
+            assert!(children.is_empty());
+            Ok(self)
+        }
+
+        fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "MockExpr")
+        }
     }
 
-    fn clear(&self) {
-        self.statistics.clear()
+    fn ordering() -> LexOrdering {
+        let expr = Arc::new(MockExpr {}) as Arc<dyn PhysicalExpr>;
+        LexOrdering::new(vec![PhysicalSortExpr::new_default(expr)]).unwrap()
     }
 
-    fn name(&self) -> String {
-        "DefaultListFilesCache".to_string()
+    #[test]
+    fn test_ordering_cache() {
+        let meta = create_test_meta("test.parquet", 100);
+        let cache = DefaultFileStatisticsCache::default();
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+
+        // Cache statistics with no ordering
+        let cached_value = CachedFileMetadata::new(
+            meta.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None, // No ordering yet
+        );
+        cache.put(&meta.location, cached_value);
+
+        let result = cache.get(&meta.location).unwrap();
+        assert!(result.ordering.is_none());
+
+        // Update to add ordering
+        let mut cached = cache.get(&meta.location).unwrap();
+        if cached.is_valid_for(&meta) && cached.ordering.is_none() {
+            cached.ordering = Some(ordering());
+        }
+        cache.put(&meta.location, cached);
+
+        let result2 = cache.get(&meta.location).unwrap();
+        assert!(result2.ordering.is_some());
+
+        // Verify list_entries shows has_ordering = true
+        let entries = cache.list_entries();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.get(&meta.location).unwrap().has_ordering);
     }
-}
 
-#[cfg(test)]
-mod tests {
-    use crate::cache::cache_unit::{DefaultFileStatisticsCache, DefaultListFilesCache};
-    use crate::cache::CacheAccessor;
-    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
-    use chrono::DateTime;
-    use datafusion_common::Statistics;
-    use object_store::path::Path;
-    use object_store::ObjectMeta;
+    #[test]
+    fn test_cache_invalidation_on_file_modification() {
+        let cache = DefaultFileStatisticsCache::default();
+        let path = Path::from("test.parquet");
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+
+        let meta_v1 = create_test_meta("test.parquet", 100);
+
+        // Cache initial value
+        let cached_value = CachedFileMetadata::new(
+            meta_v1.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None,
+        );
+        cache.put(&path, cached_value);
+
+        // File modified (size changed)
+        let meta_v2 = create_test_meta("test.parquet", 200);
+
+        let cached = cache.get(&path).unwrap();
+        // Should not be valid for new meta
+        assert!(!cached.is_valid_for(&meta_v2));
+
+        // Compute new value and update
+        let new_cached = CachedFileMetadata::new(
+            meta_v2.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None,
+        );
+        cache.put(&path, new_cached);
+
+        // Should have new metadata
+        let result = cache.get(&path).unwrap();
+        assert_eq!(result.meta.size, 200);
+    }
 
     #[test]
-    fn test_statistics_cache() {
-        let meta = ObjectMeta {
-            location: Path::from("test"),
+    fn test_ordering_cache_invalidation_on_file_modification() {
+        let cache = DefaultFileStatisticsCache::default();
+        let path = Path::from("test.parquet");
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+
+        // Cache with original metadata and ordering
+        let meta_v1 = ObjectMeta {
+            location: path.clone(),
             last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
                 .unwrap()
                 .into(),
-            size: 1024,
+            size: 100,
             e_tag: None,
             version: None,
         };
-        let cache = DefaultFileStatisticsCache::default();
-        assert!(cache.get_with_extra(&meta.location, &meta).is_none());
-
-        cache.put_with_extra(
-            &meta.location,
-            Statistics::new_unknown(&Schema::new(vec![Field::new(
-                "test_column",
-                DataType::Timestamp(TimeUnit::Second, None),
-                false,
-            )]))
-            .into(),
-            &meta,
+        let ordering_v1 = ordering();
+        let cached_v1 = CachedFileMetadata::new(
+            meta_v1.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            Some(ordering_v1),
         );
-        assert!(cache.get_with_extra(&meta.location, &meta).is_some());
-
-        // file size changed
-        let mut meta2 = meta.clone();
-        meta2.size = 2048;
-        assert!(cache.get_with_extra(&meta2.location, &meta2).is_none());
+        cache.put(&path, cached_v1);
 
-        // file last_modified changed
-        let mut meta2 = meta.clone();
-        meta2.last_modified = DateTime::parse_from_rfc3339("2022-09-27T22:40:00+02:00")
-            .unwrap()
-            .into();
-        assert!(cache.get_with_extra(&meta2.location, &meta2).is_none());
+        // Verify cached ordering is valid
+        let cached = cache.get(&path).unwrap();
+        assert!(cached.is_valid_for(&meta_v1));
+        assert!(cached.ordering.is_some());
 
-        // different file
-        let mut meta2 = meta;
-        meta2.location = Path::from("test2");
-        assert!(cache.get_with_extra(&meta2.location, &meta2).is_none());
-    }
-
-    #[test]
-    fn test_list_file_cache() {
-        let meta = ObjectMeta {
-            location: Path::from("test"),
-            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
+        // File modified (size changed)
+        let meta_v2 = ObjectMeta {
+            location: path.clone(),
+            last_modified: DateTime::parse_from_rfc3339("2022-09-28T10:00:00+02:00")
                 .unwrap()
                 .into(),
-            size: 1024,
+            size: 200, // Changed
             e_tag: None,
             version: None,
         };
 
-        let cache = DefaultListFilesCache::default();
-        assert!(cache.get(&meta.location).is_none());
+        // Cache entry exists but should be invalid for new metadata
+        let cached = cache.get(&path).unwrap();
+        assert!(!cached.is_valid_for(&meta_v2));
+
+        // Cache new version with different ordering
+        let ordering_v2 = ordering(); // New ordering instance
+        let cached_v2 = CachedFileMetadata::new(
+            meta_v2.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            Some(ordering_v2),
+        );
+        cache.put(&path, cached_v2);
+
+        // Old metadata should be invalid
+        let cached = cache.get(&path).unwrap();
+        assert!(!cached.is_valid_for(&meta_v1));
+
+        // New metadata should be valid
+        assert!(cached.is_valid_for(&meta_v2));
+        assert!(cached.ordering.is_some());
+    }
+
+    #[test]
+    fn test_list_entries() {
+        let cache = DefaultFileStatisticsCache::default();
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+
+        let meta1 = create_test_meta("test1.parquet", 100);
+
+        let cached_value = CachedFileMetadata::new(
+            meta1.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            None,
+        );
+        cache.put(&meta1.location, cached_value);
+        let meta2 = create_test_meta("test2.parquet", 200);
+        let cached_value = CachedFileMetadata::new(
+            meta2.clone(),
+            Arc::new(Statistics::new_unknown(&schema)),
+            Some(ordering()),
+        );
+        cache.put(&meta2.location, cached_value);
 
-        cache.put(&meta.location, vec![meta.clone()].into());
+        let entries = cache.list_entries();
         assert_eq!(
-            cache.get(&meta.location).unwrap().first().unwrap().clone(),
-            meta.clone()
+            entries,
+            HashMap::from([
+                (
+                    Path::from("test1.parquet"),
+                    FileStatisticsCacheEntry {
+                        object_meta: meta1,
+                        num_rows: Precision::Absent,
+                        num_columns: 1,
+                        table_size_bytes: Precision::Absent,
+                        statistics_size_bytes: 0,
+                        has_ordering: false,
+                    }
+                ),
+                (
+                    Path::from("test2.parquet"),
+                    FileStatisticsCacheEntry {
+                        object_meta: meta2,
+                        num_rows: Precision::Absent,
+                        num_columns: 1,
+                        table_size_bytes: Precision::Absent,
+                        statistics_size_bytes: 0,
+                        has_ordering: true,
+                    }
+                ),
+            ])
         );
     }
 }
diff --git a/datafusion/execution/src/cache/file_metadata_cache.rs b/datafusion/execution/src/cache/file_metadata_cache.rs
new file mode 100644
index 0000000000000..5e899d7dd9f8b
--- /dev/null
+++ b/datafusion/execution/src/cache/file_metadata_cache.rs
@@ -0,0 +1,764 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{collections::HashMap, sync::Mutex};
+
+use object_store::path::Path;
+
+use crate::cache::{
+    CacheAccessor,
+    cache_manager::{CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry},
+    lru_queue::LruQueue,
+};
+
+/// Handles the inner state of the [`DefaultFilesMetadataCache`] struct.
+struct DefaultFilesMetadataCacheState {
+    lru_queue: LruQueue<Path, CachedFileMetadataEntry>,
+    memory_limit: usize,
+    memory_used: usize,
+    cache_hits: HashMap<Path, usize>,
+}
+
+impl DefaultFilesMetadataCacheState {
+    fn new(memory_limit: usize) -> Self {
+        Self {
+            lru_queue: LruQueue::new(),
+            memory_limit,
+            memory_used: 0,
+            cache_hits: HashMap::new(),
+        }
+    }
+
+    /// Returns the respective entry from the cache, if it exists.
+    /// If the entry exists, it becomes the most recently used.
+    fn get(&mut self, k: &Path) -> Option<CachedFileMetadataEntry> {
+        self.lru_queue.get(k).cloned().inspect(|_| {
+            *self.cache_hits.entry(k.clone()).or_insert(0) += 1;
+        })
+    }
+
+    /// Checks if the metadata is currently cached.
+    /// The LRU queue is not updated.
+    fn contains_key(&self, k: &Path) -> bool {
+        self.lru_queue.peek(k).is_some()
+    }
+
+    /// Adds a new key-value pair to cache, meaning LRU entries might be evicted if required.
+    /// If the key is already in the cache, the previous metadata is returned.
+    /// If the size of the metadata is greater than the `memory_limit`, the value is not inserted.
+    fn put(
+        &mut self,
+        key: Path,
+        value: CachedFileMetadataEntry,
+    ) -> Option<CachedFileMetadataEntry> {
+        let value_size = value.file_metadata.memory_size();
+
+        // no point in trying to add this value to the cache if it cannot fit entirely
+        if value_size > self.memory_limit {
+            return None;
+        }
+
+        self.cache_hits.insert(key.clone(), 0);
+        // if the key is already in the cache, the old value is removed
+        let old_value = self.lru_queue.put(key, value);
+        self.memory_used += value_size;
+        if let Some(ref old_entry) = old_value {
+            self.memory_used -= old_entry.file_metadata.memory_size();
+        }
+
+        self.evict_entries();
+
+        old_value
+    }
+
+    /// Evicts entries from the LRU cache until `memory_used` is lower than `memory_limit`.
+    fn evict_entries(&mut self) {
+        while self.memory_used > self.memory_limit {
+            if let Some(removed) = self.lru_queue.pop() {
+                self.memory_used -= removed.1.file_metadata.memory_size();
+            } else {
+                // cache is empty while memory_used > memory_limit, cannot happen
+                debug_assert!(
+                    false,
+                    "cache is empty while memory_used > memory_limit, cannot happen"
+                );
+                return;
+            }
+        }
+    }
+
+    /// Removes an entry from the cache and returns it, if it exists.
+    fn remove(&mut self, k: &Path) -> Option<CachedFileMetadataEntry> {
+        if let Some(old_entry) = self.lru_queue.remove(k) {
+            self.memory_used -= old_entry.file_metadata.memory_size();
+            self.cache_hits.remove(k);
+            Some(old_entry)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the number of entries currently cached.
+    fn len(&self) -> usize {
+        self.lru_queue.len()
+    }
+
+    /// Removes all entries from the cache.
+    fn clear(&mut self) {
+        self.lru_queue.clear();
+        self.memory_used = 0;
+        self.cache_hits.clear();
+    }
+}
+
+/// Default implementation of [`FileMetadataCache`]
+///
+/// Collected file embedded metadata cache.
+///
+/// The metadata for each file is validated by comparing the cached [`ObjectMeta`]
+/// (size and last_modified) against the current file state using `cached.is_valid_for(&current_meta)`.
+///
+/// # Internal details
+///
+/// The `memory_limit` controls the maximum size of the cache, which uses a
+/// Least Recently Used eviction algorithm. When adding a new entry, if the total
+/// size of the cached entries exceeds `memory_limit`, the least recently used entries
+/// are evicted until the total size is lower than `memory_limit`.
+///
+/// [`ObjectMeta`]: object_store::ObjectMeta
+pub struct DefaultFilesMetadataCache {
+    // the state is wrapped in a Mutex to ensure the operations are atomic
+    state: Mutex<DefaultFilesMetadataCacheState>,
+}
+
+impl DefaultFilesMetadataCache {
+    /// Create a new instance of [`DefaultFilesMetadataCache`].
+    ///
+    /// # Arguments
+    /// `memory_limit`:  the maximum size of the cache, in bytes
+    //
+    pub fn new(memory_limit: usize) -> Self {
+        Self {
+            state: Mutex::new(DefaultFilesMetadataCacheState::new(memory_limit)),
+        }
+    }
+
+    /// Returns the size of the cached memory, in bytes.
+    pub fn memory_used(&self) -> usize {
+        let state = self.state.lock().unwrap();
+        state.memory_used
+    }
+}
+
+impl CacheAccessor<Path, CachedFileMetadataEntry> for DefaultFilesMetadataCache {
+    fn get(&self, key: &Path) -> Option<CachedFileMetadataEntry> {
+        let mut state = self.state.lock().unwrap();
+        state.get(key)
+    }
+
+    fn put(
+        &self,
+        key: &Path,
+        value: CachedFileMetadataEntry,
+    ) -> Option<CachedFileMetadataEntry> {
+        let mut state = self.state.lock().unwrap();
+        state.put(key.clone(), value)
+    }
+
+    fn remove(&self, k: &Path) -> Option<CachedFileMetadataEntry> {
+        let mut state = self.state.lock().unwrap();
+        state.remove(k)
+    }
+
+    fn contains_key(&self, k: &Path) -> bool {
+        let state = self.state.lock().unwrap();
+        state.contains_key(k)
+    }
+
+    fn len(&self) -> usize {
+        let state = self.state.lock().unwrap();
+        state.len()
+    }
+
+    fn clear(&self) {
+        let mut state = self.state.lock().unwrap();
+        state.clear();
+    }
+
+    fn name(&self) -> String {
+        "DefaultFilesMetadataCache".to_string()
+    }
+}
+
+impl FileMetadataCache for DefaultFilesMetadataCache {
+    fn cache_limit(&self) -> usize {
+        let state = self.state.lock().unwrap();
+        state.memory_limit
+    }
+
+    fn update_cache_limit(&self, limit: usize) {
+        let mut state = self.state.lock().unwrap();
+        state.memory_limit = limit;
+        state.evict_entries();
+    }
+
+    fn list_entries(&self) -> HashMap<Path, FileMetadataCacheEntry> {
+        let state = self.state.lock().unwrap();
+        let mut entries = HashMap::<Path, FileMetadataCacheEntry>::new();
+
+        for (path, entry) in state.lru_queue.list_entries() {
+            entries.insert(
+                path.clone(),
+                FileMetadataCacheEntry {
+                    object_meta: entry.meta.clone(),
+                    size_bytes: entry.file_metadata.memory_size(),
+                    hits: *state.cache_hits.get(path).expect("entry must exist"),
+                    extra: entry.file_metadata.extra_info(),
+                },
+            );
+        }
+
+        entries
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
+
+    use crate::cache::CacheAccessor;
+    use crate::cache::cache_manager::{
+        CachedFileMetadataEntry, FileMetadata, FileMetadataCache, FileMetadataCacheEntry,
+    };
+    use crate::cache::file_metadata_cache::DefaultFilesMetadataCache;
+    use object_store::ObjectMeta;
+    use object_store::path::Path;
+
+    pub struct TestFileMetadata {
+        metadata: String,
+    }
+
+    impl FileMetadata for TestFileMetadata {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn memory_size(&self) -> usize {
+            self.metadata.len()
+        }
+
+        fn extra_info(&self) -> HashMap<String, String> {
+            HashMap::from([("extra_info".to_owned(), "abc".to_owned())])
+        }
+    }
+
+    fn create_test_object_meta(path: &str, size: usize) -> ObjectMeta {
+        ObjectMeta {
+            location: Path::from(path),
+            last_modified: chrono::DateTime::parse_from_rfc3339(
+                "2025-07-29T12:12:12+00:00",
+            )
+            .unwrap()
+            .into(),
+            size: size as u64,
+            e_tag: None,
+            version: None,
+        }
+    }
+
+    #[test]
+    fn test_default_file_metadata_cache() {
+        let object_meta = create_test_object_meta("test", 1024);
+
+        let metadata: Arc<dyn FileMetadata> = Arc::new(TestFileMetadata {
+            metadata: "retrieved_metadata".to_owned(),
+        });
+
+        let cache = DefaultFilesMetadataCache::new(1024 * 1024);
+
+        // Cache miss
+        assert!(cache.get(&object_meta.location).is_none());
+
+        // Put a value
+        let cached_entry =
+            CachedFileMetadataEntry::new(object_meta.clone(), Arc::clone(&metadata));
+        cache.put(&object_meta.location, cached_entry);
+
+        // Verify the cached value
+        assert!(cache.contains_key(&object_meta.location));
+        let result = cache.get(&object_meta.location).unwrap();
+        let test_file_metadata = Arc::downcast::<TestFileMetadata>(result.file_metadata);
+        assert!(test_file_metadata.is_ok());
+        assert_eq!(test_file_metadata.unwrap().metadata, "retrieved_metadata");
+
+        // Cache hit - check validation
+        let result2 = cache.get(&object_meta.location).unwrap();
+        assert!(result2.is_valid_for(&object_meta));
+
+        // File size changed - closure should detect invalidity
+        let object_meta2 = create_test_object_meta("test", 2048);
+        let result3 = cache.get(&object_meta2.location).unwrap();
+        // Cached entry should NOT be valid for new meta
+        assert!(!result3.is_valid_for(&object_meta2));
+
+        // Return new entry
+        let new_entry =
+            CachedFileMetadataEntry::new(object_meta2.clone(), Arc::clone(&metadata));
+        cache.put(&object_meta2.location, new_entry);
+
+        let result4 = cache.get(&object_meta2.location).unwrap();
+        assert_eq!(result4.meta.size, 2048);
+
+        // remove
+        cache.remove(&object_meta.location);
+        assert!(!cache.contains_key(&object_meta.location));
+
+        // len and clear
+        let object_meta3 = create_test_object_meta("test3", 100);
+        cache.put(
+            &object_meta.location,
+            CachedFileMetadataEntry::new(object_meta.clone(), Arc::clone(&metadata)),
+        );
+        cache.put(
+            &object_meta3.location,
+            CachedFileMetadataEntry::new(object_meta3.clone(), Arc::clone(&metadata)),
+        );
+        assert_eq!(cache.len(), 2);
+        cache.clear();
+        assert_eq!(cache.len(), 0);
+    }
+
+    fn generate_test_metadata_with_size(
+        path: &str,
+        size: usize,
+    ) -> (ObjectMeta, Arc<dyn FileMetadata>) {
+        let object_meta = ObjectMeta {
+            location: Path::from(path),
+            last_modified: chrono::Utc::now(),
+            size: size as u64,
+            e_tag: None,
+            version: None,
+        };
+        let metadata: Arc<dyn FileMetadata> = Arc::new(TestFileMetadata {
+            metadata: "a".repeat(size),
+        });
+
+        (object_meta, metadata)
+    }
+
+    #[test]
+    fn test_default_file_metadata_cache_with_limit() {
+        let cache = DefaultFilesMetadataCache::new(1000);
+        let (object_meta1, metadata1) = generate_test_metadata_with_size("1", 100);
+        let (object_meta2, metadata2) = generate_test_metadata_with_size("2", 500);
+        let (object_meta3, metadata3) = generate_test_metadata_with_size("3", 300);
+
+        cache.put(
+            &object_meta1.location,
+            CachedFileMetadataEntry::new(object_meta1.clone(), metadata1),
+        );
+        cache.put(
+            &object_meta2.location,
+            CachedFileMetadataEntry::new(object_meta2.clone(), metadata2),
+        );
+        cache.put(
+            &object_meta3.location,
+            CachedFileMetadataEntry::new(object_meta3.clone(), metadata3),
+        );
+
+        // all entries will fit
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.memory_used(), 900);
+        assert!(cache.contains_key(&object_meta1.location));
+        assert!(cache.contains_key(&object_meta2.location));
+        assert!(cache.contains_key(&object_meta3.location));
+
+        // add a new entry which will remove the least recently used ("1")
+        let (object_meta4, metadata4) = generate_test_metadata_with_size("4", 200);
+        cache.put(
+            &object_meta4.location,
+            CachedFileMetadataEntry::new(object_meta4.clone(), metadata4),
+        );
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.memory_used(), 1000);
+        assert!(!cache.contains_key(&object_meta1.location));
+        assert!(cache.contains_key(&object_meta4.location));
+
+        // get entry "2", which will move it to the top of the queue, and add a new one which will
+        // remove the new least recently used ("3")
+        let _ = cache.get(&object_meta2.location);
+        let (object_meta5, metadata5) = generate_test_metadata_with_size("5", 100);
+        cache.put(
+            &object_meta5.location,
+            CachedFileMetadataEntry::new(object_meta5.clone(), metadata5),
+        );
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.memory_used(), 800);
+        assert!(!cache.contains_key(&object_meta3.location));
+        assert!(cache.contains_key(&object_meta5.location));
+
+        // new entry which will not be able to fit in the 1000 bytes allocated
+        let (object_meta6, metadata6) = generate_test_metadata_with_size("6", 1200);
+        cache.put(
+            &object_meta6.location,
+            CachedFileMetadataEntry::new(object_meta6.clone(), metadata6),
+        );
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.memory_used(), 800);
+        assert!(!cache.contains_key(&object_meta6.location));
+
+        // new entry which is able to fit without removing any entry
+        let (object_meta7, metadata7) = generate_test_metadata_with_size("7", 200);
+        cache.put(
+            &object_meta7.location,
+            CachedFileMetadataEntry::new(object_meta7.clone(), metadata7),
+        );
+        assert_eq!(cache.len(), 4);
+        assert_eq!(cache.memory_used(), 1000);
+        assert!(cache.contains_key(&object_meta7.location));
+
+        // new entry which will remove all other entries
+        let (object_meta8, metadata8) = generate_test_metadata_with_size("8", 999);
+        cache.put(
+            &object_meta8.location,
+            CachedFileMetadataEntry::new(object_meta8.clone(), metadata8),
+        );
+        assert_eq!(cache.len(), 1);
+        assert_eq!(cache.memory_used(), 999);
+        assert!(cache.contains_key(&object_meta8.location));
+
+        // when updating an entry, the previous ones are not unnecessarily removed
+        let (object_meta9, metadata9) = generate_test_metadata_with_size("9", 300);
+        let (object_meta10, metadata10) = generate_test_metadata_with_size("10", 200);
+        let (object_meta11_v1, metadata11_v1) =
+            generate_test_metadata_with_size("11", 400);
+        cache.put(
+            &object_meta9.location,
+            CachedFileMetadataEntry::new(object_meta9.clone(), metadata9),
+        );
+        cache.put(
+            &object_meta10.location,
+            CachedFileMetadataEntry::new(object_meta10.clone(), metadata10),
+        );
+        cache.put(
+            &object_meta11_v1.location,
+            CachedFileMetadataEntry::new(object_meta11_v1.clone(), metadata11_v1),
+        );
+        assert_eq!(cache.memory_used(), 900);
+        assert_eq!(cache.len(), 3);
+        let (object_meta11_v2, metadata11_v2) =
+            generate_test_metadata_with_size("11", 500);
+        cache.put(
+            &object_meta11_v2.location,
+            CachedFileMetadataEntry::new(object_meta11_v2.clone(), metadata11_v2),
+        );
+        assert_eq!(cache.memory_used(), 1000);
+        assert_eq!(cache.len(), 3);
+        assert!(cache.contains_key(&object_meta9.location));
+        assert!(cache.contains_key(&object_meta10.location));
+        assert!(cache.contains_key(&object_meta11_v2.location));
+
+        // when updating an entry that now exceeds the limit, the LRU ("9") needs to be removed
+        let (object_meta11_v3, metadata11_v3) =
+            generate_test_metadata_with_size("11", 501);
+        cache.put(
+            &object_meta11_v3.location,
+            CachedFileMetadataEntry::new(object_meta11_v3.clone(), metadata11_v3),
+        );
+        assert_eq!(cache.memory_used(), 701);
+        assert_eq!(cache.len(), 2);
+        assert!(cache.contains_key(&object_meta10.location));
+        assert!(cache.contains_key(&object_meta11_v3.location));
+
+        // manually removing an entry that is not the LRU
+        cache.remove(&object_meta11_v3.location);
+        assert_eq!(cache.len(), 1);
+        assert_eq!(cache.memory_used(), 200);
+        assert!(cache.contains_key(&object_meta10.location));
+        assert!(!cache.contains_key(&object_meta11_v3.location));
+
+        // clear
+        cache.clear();
+        assert_eq!(cache.len(), 0);
+        assert_eq!(cache.memory_used(), 0);
+
+        // resizing the cache should clear the extra entries
+        let (object_meta12, metadata12) = generate_test_metadata_with_size("12", 300);
+        let (object_meta13, metadata13) = generate_test_metadata_with_size("13", 200);
+        let (object_meta14, metadata14) = generate_test_metadata_with_size("14", 500);
+        cache.put(
+            &object_meta12.location,
+            CachedFileMetadataEntry::new(object_meta12.clone(), metadata12),
+        );
+        cache.put(
+            &object_meta13.location,
+            CachedFileMetadataEntry::new(object_meta13.clone(), metadata13),
+        );
+        cache.put(
+            &object_meta14.location,
+            CachedFileMetadataEntry::new(object_meta14.clone(), metadata14),
+        );
+        assert_eq!(cache.len(), 3);
+        assert_eq!(cache.memory_used(), 1000);
+        cache.update_cache_limit(600);
+        assert_eq!(cache.len(), 1);
+        assert_eq!(cache.memory_used(), 500);
+        assert!(!cache.contains_key(&object_meta12.location));
+        assert!(!cache.contains_key(&object_meta13.location));
+        assert!(cache.contains_key(&object_meta14.location));
+    }
+
+    #[test]
+    fn test_default_file_metadata_cache_entries_info() {
+        let cache = DefaultFilesMetadataCache::new(1000);
+        let (object_meta1, metadata1) = generate_test_metadata_with_size("1", 100);
+        let (object_meta2, metadata2) = generate_test_metadata_with_size("2", 200);
+        let (object_meta3, metadata3) = generate_test_metadata_with_size("3", 300);
+
+        // initial entries, all will have hits = 0
+        cache.put(
+            &object_meta1.location,
+            CachedFileMetadataEntry::new(object_meta1.clone(), metadata1),
+        );
+        cache.put(
+            &object_meta2.location,
+            CachedFileMetadataEntry::new(object_meta2.clone(), metadata2),
+        );
+        cache.put(
+            &object_meta3.location,
+            CachedFileMetadataEntry::new(object_meta3.clone(), metadata3),
+        );
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    Path::from("1"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta1.clone(),
+                        size_bytes: 100,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("2"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta2.clone(),
+                        size_bytes: 200,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("3"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta3.clone(),
+                        size_bytes: 300,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                )
+            ])
+        );
+
+        // new hit on "1"
+        let _ = cache.get(&object_meta1.location);
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    Path::from("1"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta1.clone(),
+                        size_bytes: 100,
+                        hits: 1,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("2"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta2.clone(),
+                        size_bytes: 200,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("3"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta3.clone(),
+                        size_bytes: 300,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                )
+            ])
+        );
+
+        // new entry, will evict "2"
+        let (object_meta4, metadata4) = generate_test_metadata_with_size("4", 600);
+        cache.put(
+            &object_meta4.location,
+            CachedFileMetadataEntry::new(object_meta4.clone(), metadata4),
+        );
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    Path::from("1"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta1.clone(),
+                        size_bytes: 100,
+                        hits: 1,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("3"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta3.clone(),
+                        size_bytes: 300,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("4"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta4.clone(),
+                        size_bytes: 600,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                )
+            ])
+        );
+
+        // replace entry "1"
+        let (object_meta1_new, metadata1_new) = generate_test_metadata_with_size("1", 50);
+        cache.put(
+            &object_meta1_new.location,
+            CachedFileMetadataEntry::new(object_meta1_new.clone(), metadata1_new),
+        );
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    Path::from("1"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta1_new.clone(),
+                        size_bytes: 50,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("3"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta3.clone(),
+                        size_bytes: 300,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("4"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta4.clone(),
+                        size_bytes: 600,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                )
+            ])
+        );
+
+        // remove entry "4"
+        cache.remove(&object_meta4.location);
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    Path::from("1"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta1_new.clone(),
+                        size_bytes: 50,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                ),
+                (
+                    Path::from("3"),
+                    FileMetadataCacheEntry {
+                        object_meta: object_meta3.clone(),
+                        size_bytes: 300,
+                        hits: 0,
+                        extra: HashMap::from([(
+                            "extra_info".to_owned(),
+                            "abc".to_owned()
+                        )]),
+                    }
+                )
+            ])
+        );
+
+        // clear
+        cache.clear();
+        assert_eq!(cache.list_entries(), HashMap::from([]));
+    }
+}
diff --git a/datafusion/execution/src/cache/list_files_cache.rs b/datafusion/execution/src/cache/list_files_cache.rs
new file mode 100644
index 0000000000000..b1b8e6b500169
--- /dev/null
+++ b/datafusion/execution/src/cache/list_files_cache.rs
@@ -0,0 +1,1218 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::mem::size_of;
+use std::{
+    collections::HashMap,
+    sync::{Arc, Mutex},
+    time::Duration,
+};
+
+use datafusion_common::TableReference;
+use datafusion_common::instant::Instant;
+use object_store::{ObjectMeta, path::Path};
+
+use crate::cache::{
+    CacheAccessor,
+    cache_manager::{CachedFileList, ListFilesCache},
+    lru_queue::LruQueue,
+};
+
+pub trait TimeProvider: Send + Sync + 'static {
+    fn now(&self) -> Instant;
+}
+
+#[derive(Debug, Default)]
+pub struct SystemTimeProvider;
+
+impl TimeProvider for SystemTimeProvider {
+    fn now(&self) -> Instant {
+        Instant::now()
+    }
+}
+
+/// Default implementation of [`ListFilesCache`]
+///
+/// Caches file metadata for file listing operations.
+///
+/// # Internal details
+///
+/// The `memory_limit` parameter controls the maximum size of the cache, which uses a Least
+/// Recently Used eviction algorithm. When adding a new entry, if the total number of entries in
+/// the cache exceeds `memory_limit`, the least recently used entries are evicted until the total
+/// size is lower than the `memory_limit`.
+///
+/// # Cache API
+///
+/// Uses `get` and `put` methods for cache operations. TTL validation is handled internally -
+/// expired entries return `None` from `get`.
+pub struct DefaultListFilesCache {
+    state: Mutex<DefaultListFilesCacheState>,
+    time_provider: Arc<dyn TimeProvider>,
+}
+
+impl Default for DefaultListFilesCache {
+    fn default() -> Self {
+        Self::new(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, None)
+    }
+}
+
+impl DefaultListFilesCache {
+    /// Creates a new instance of [`DefaultListFilesCache`].
+    ///
+    /// # Arguments
+    /// * `memory_limit` - The maximum size of the cache, in bytes.
+    /// * `ttl` - The TTL (time-to-live) of entries in the cache.
+    pub fn new(memory_limit: usize, ttl: Option<Duration>) -> Self {
+        Self {
+            state: Mutex::new(DefaultListFilesCacheState::new(memory_limit, ttl)),
+            time_provider: Arc::new(SystemTimeProvider),
+        }
+    }
+
+    #[cfg(test)]
+    pub(crate) fn with_time_provider(mut self, provider: Arc<dyn TimeProvider>) -> Self {
+        self.time_provider = provider;
+        self
+    }
+}
+
+#[derive(Clone, PartialEq, Debug)]
+pub struct ListFilesEntry {
+    pub metas: CachedFileList,
+    pub size_bytes: usize,
+    pub expires: Option<Instant>,
+}
+
+impl ListFilesEntry {
+    fn try_new(
+        cached_file_list: CachedFileList,
+        ttl: Option<Duration>,
+        now: Instant,
+    ) -> Option<Self> {
+        let size_bytes = (cached_file_list.files.capacity() * size_of::<ObjectMeta>())
+            + cached_file_list
+                .files
+                .iter()
+                .map(meta_heap_bytes)
+                .reduce(|acc, b| acc + b)?;
+
+        Some(Self {
+            metas: cached_file_list,
+            size_bytes,
+            expires: ttl.map(|t| now + t),
+        })
+    }
+}
+
+/// Calculates the number of bytes an [`ObjectMeta`] occupies in the heap.
+fn meta_heap_bytes(object_meta: &ObjectMeta) -> usize {
+    let mut size = object_meta.location.as_ref().len();
+
+    if let Some(e) = &object_meta.e_tag {
+        size += e.len();
+    }
+    if let Some(v) = &object_meta.version {
+        size += v.len();
+    }
+
+    size
+}
+
+/// The default memory limit for the [`DefaultListFilesCache`]
+pub const DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT: usize = 1024 * 1024; // 1MiB
+
+/// The default cache TTL for the [`DefaultListFilesCache`]
+pub const DEFAULT_LIST_FILES_CACHE_TTL: Option<Duration> = None; // Infinite
+
+/// Key for [`DefaultListFilesCache`]
+///
+/// Each entry is scoped to its use within a specific table so that the cache
+/// can differentiate between identical paths in different tables, and
+/// table-level cache invalidation.
+#[derive(PartialEq, Eq, Hash, Clone, Debug)]
+pub struct TableScopedPath {
+    pub table: Option<TableReference>,
+    pub path: Path,
+}
+
+/// Handles the inner state of the [`DefaultListFilesCache`] struct.
+pub struct DefaultListFilesCacheState {
+    lru_queue: LruQueue<TableScopedPath, ListFilesEntry>,
+    memory_limit: usize,
+    memory_used: usize,
+    ttl: Option<Duration>,
+}
+
+impl Default for DefaultListFilesCacheState {
+    fn default() -> Self {
+        Self {
+            lru_queue: LruQueue::new(),
+            memory_limit: DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT,
+            memory_used: 0,
+            ttl: DEFAULT_LIST_FILES_CACHE_TTL,
+        }
+    }
+}
+
+impl DefaultListFilesCacheState {
+    fn new(memory_limit: usize, ttl: Option<Duration>) -> Self {
+        Self {
+            lru_queue: LruQueue::new(),
+            memory_limit,
+            memory_used: 0,
+            ttl,
+        }
+    }
+
+    /// Gets an entry from the cache, checking for expiration.
+    ///
+    /// Returns the cached file list if it exists and hasn't expired.
+    /// If the entry has expired, it is removed from the cache.
+    fn get(&mut self, key: &TableScopedPath, now: Instant) -> Option<CachedFileList> {
+        let entry = self.lru_queue.get(key)?;
+
+        // Check expiration
+        if let Some(exp) = entry.expires
+            && now > exp
+        {
+            self.remove(key);
+            return None;
+        }
+
+        Some(entry.metas.clone())
+    }
+
+    /// Checks if the respective entry is currently cached.
+    ///
+    /// If the entry has expired by `now` it is removed from the cache.
+    ///
+    /// The LRU queue is not updated.
+    fn contains_key(&mut self, k: &TableScopedPath, now: Instant) -> bool {
+        let Some(entry) = self.lru_queue.peek(k) else {
+            return false;
+        };
+
+        match entry.expires {
+            Some(exp) if now > exp => {
+                self.remove(k);
+                false
+            }
+            _ => true,
+        }
+    }
+
+    /// Adds a new key-value pair to cache expiring at `now` + the TTL.
+    ///
+    /// This means that LRU entries might be evicted if required.
+    /// If the key is already in the cache, the previous entry is returned.
+    /// If the size of the entry is greater than the `memory_limit`, the value is not inserted.
+    fn put(
+        &mut self,
+        key: &TableScopedPath,
+        value: CachedFileList,
+        now: Instant,
+    ) -> Option<CachedFileList> {
+        let entry = ListFilesEntry::try_new(value, self.ttl, now)?;
+        let entry_size = entry.size_bytes;
+
+        // no point in trying to add this value to the cache if it cannot fit entirely
+        if entry_size > self.memory_limit {
+            return None;
+        }
+
+        // if the key is already in the cache, the old value is removed
+        let old_value = self.lru_queue.put(key.clone(), entry);
+        self.memory_used += entry_size;
+
+        if let Some(entry) = &old_value {
+            self.memory_used -= entry.size_bytes;
+        }
+
+        self.evict_entries();
+
+        old_value.map(|v| v.metas)
+    }
+
+    /// Evicts entries from the LRU cache until `memory_used` is lower than `memory_limit`.
+    fn evict_entries(&mut self) {
+        while self.memory_used > self.memory_limit {
+            if let Some(removed) = self.lru_queue.pop() {
+                self.memory_used -= removed.1.size_bytes;
+            } else {
+                // cache is empty while memory_used > memory_limit, cannot happen
+                debug_assert!(
+                    false,
+                    "cache is empty while memory_used > memory_limit, cannot happen"
+                );
+                return;
+            }
+        }
+    }
+
+    /// Removes an entry from the cache and returns it, if it exists.
+    fn remove(&mut self, k: &TableScopedPath) -> Option<CachedFileList> {
+        if let Some(entry) = self.lru_queue.remove(k) {
+            self.memory_used -= entry.size_bytes;
+            Some(entry.metas)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the number of entries currently cached.
+    fn len(&self) -> usize {
+        self.lru_queue.len()
+    }
+
+    /// Removes all entries from the cache.
+    fn clear(&mut self) {
+        self.lru_queue.clear();
+        self.memory_used = 0;
+    }
+}
+
+impl CacheAccessor<TableScopedPath, CachedFileList> for DefaultListFilesCache {
+    fn get(&self, key: &TableScopedPath) -> Option<CachedFileList> {
+        let mut state = self.state.lock().unwrap();
+        let now = self.time_provider.now();
+        state.get(key, now)
+    }
+
+    fn put(
+        &self,
+        key: &TableScopedPath,
+        value: CachedFileList,
+    ) -> Option<CachedFileList> {
+        let mut state = self.state.lock().unwrap();
+        let now = self.time_provider.now();
+        state.put(key, value, now)
+    }
+
+    fn remove(&self, k: &TableScopedPath) -> Option<CachedFileList> {
+        let mut state = self.state.lock().unwrap();
+        state.remove(k)
+    }
+
+    fn contains_key(&self, k: &TableScopedPath) -> bool {
+        let mut state = self.state.lock().unwrap();
+        let now = self.time_provider.now();
+        state.contains_key(k, now)
+    }
+
+    fn len(&self) -> usize {
+        let state = self.state.lock().unwrap();
+        state.len()
+    }
+
+    fn clear(&self) {
+        let mut state = self.state.lock().unwrap();
+        state.clear();
+    }
+
+    fn name(&self) -> String {
+        String::from("DefaultListFilesCache")
+    }
+}
+
+impl ListFilesCache for DefaultListFilesCache {
+    fn cache_limit(&self) -> usize {
+        let state = self.state.lock().unwrap();
+        state.memory_limit
+    }
+
+    fn cache_ttl(&self) -> Option<Duration> {
+        let state = self.state.lock().unwrap();
+        state.ttl
+    }
+
+    fn update_cache_limit(&self, limit: usize) {
+        let mut state = self.state.lock().unwrap();
+        state.memory_limit = limit;
+        state.evict_entries();
+    }
+
+    fn update_cache_ttl(&self, ttl: Option<Duration>) {
+        let mut state = self.state.lock().unwrap();
+        state.ttl = ttl;
+        state.evict_entries();
+    }
+
+    fn list_entries(&self) -> HashMap<TableScopedPath, ListFilesEntry> {
+        let state = self.state.lock().unwrap();
+        let mut entries = HashMap::<TableScopedPath, ListFilesEntry>::new();
+        for (path, entry) in state.lru_queue.list_entries() {
+            entries.insert(path.clone(), entry.clone());
+        }
+        entries
+    }
+
+    fn drop_table_entries(
+        &self,
+        table_ref: &Option<TableReference>,
+    ) -> datafusion_common::Result<()> {
+        let mut state = self.state.lock().unwrap();
+        let mut table_paths = vec![];
+        for (path, _) in state.lru_queue.list_entries() {
+            if path.table == *table_ref {
+                table_paths.push(path.clone());
+            }
+        }
+        for path in table_paths {
+            state.remove(&path);
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::DateTime;
+    use std::thread;
+
+    struct MockTimeProvider {
+        base: Instant,
+        offset: Mutex<Duration>,
+    }
+
+    impl MockTimeProvider {
+        fn new() -> Self {
+            Self {
+                base: Instant::now(),
+                offset: Mutex::new(Duration::ZERO),
+            }
+        }
+
+        fn inc(&self, duration: Duration) {
+            let mut offset = self.offset.lock().unwrap();
+            *offset += duration;
+        }
+    }
+
+    impl TimeProvider for MockTimeProvider {
+        fn now(&self) -> Instant {
+            self.base + *self.offset.lock().unwrap()
+        }
+    }
+
+    /// Helper function to create a test ObjectMeta with a specific path and location string size
+    fn create_test_object_meta(path: &str, location_size: usize) -> ObjectMeta {
+        // Create a location string of the desired size by padding with zeros
+        let location_str = if location_size > path.len() {
+            format!("{}{}", path, "0".repeat(location_size - path.len()))
+        } else {
+            path.to_string()
+        };
+
+        ObjectMeta {
+            location: Path::from(location_str),
+            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
+                .unwrap()
+                .into(),
+            size: 1024,
+            e_tag: None,
+            version: None,
+        }
+    }
+
+    /// Helper function to create a CachedFileList with at least meta_size bytes
+    fn create_test_list_files_entry(
+        path: &str,
+        count: usize,
+        meta_size: usize,
+    ) -> (Path, CachedFileList, usize) {
+        let metas: Vec<ObjectMeta> = (0..count)
+            .map(|i| create_test_object_meta(&format!("file{i}"), meta_size))
+            .collect();
+
+        // Calculate actual size using the same logic as ListFilesEntry::try_new
+        let size = (metas.capacity() * size_of::<ObjectMeta>())
+            + metas.iter().map(meta_heap_bytes).sum::<usize>();
+
+        (Path::from(path), CachedFileList::new(metas), size)
+    }
+
+    #[test]
+    fn test_basic_operations() {
+        let cache = DefaultListFilesCache::default();
+        let table_ref = Some(TableReference::from("table"));
+        let path = Path::from("test_path");
+        let key = TableScopedPath {
+            table: table_ref.clone(),
+            path,
+        };
+
+        // Initially cache is empty
+        assert!(!cache.contains_key(&key));
+        assert_eq!(cache.len(), 0);
+
+        // Cache miss - get returns None
+        assert!(cache.get(&key).is_none());
+
+        // Put a value
+        let meta = create_test_object_meta("file1", 50);
+        cache.put(&key, CachedFileList::new(vec![meta]));
+
+        // Entry should be cached
+        assert!(cache.contains_key(&key));
+        assert_eq!(cache.len(), 1);
+        let result = cache.get(&key).unwrap();
+        assert_eq!(result.files.len(), 1);
+
+        // Remove the entry
+        let removed = cache.remove(&key).unwrap();
+        assert_eq!(removed.files.len(), 1);
+        assert!(!cache.contains_key(&key));
+        assert_eq!(cache.len(), 0);
+
+        // Put multiple entries
+        let (path1, value1, size1) = create_test_list_files_entry("path1", 2, 50);
+        let (path2, value2, size2) = create_test_list_files_entry("path2", 3, 50);
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref,
+            path: path2,
+        };
+        cache.put(&key1, value1.clone());
+        cache.put(&key2, value2.clone());
+        assert_eq!(cache.len(), 2);
+
+        // List cache entries
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    key1.clone(),
+                    ListFilesEntry {
+                        metas: value1,
+                        size_bytes: size1,
+                        expires: None,
+                    }
+                ),
+                (
+                    key2.clone(),
+                    ListFilesEntry {
+                        metas: value2,
+                        size_bytes: size2,
+                        expires: None,
+                    }
+                )
+            ])
+        );
+
+        // Clear all entries
+        cache.clear();
+        assert_eq!(cache.len(), 0);
+        assert!(!cache.contains_key(&key1));
+        assert!(!cache.contains_key(&key2));
+    }
+
+    #[test]
+    fn test_lru_eviction_basic() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 100);
+
+        // Set cache limit to exactly fit all three entries
+        let cache = DefaultListFilesCache::new(size * 3, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path3,
+        };
+
+        // All three entries should fit
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        cache.put(&key3, value3);
+        assert_eq!(cache.len(), 3);
+        assert!(cache.contains_key(&key1));
+        assert!(cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+
+        // Adding a new entry should evict path1 (LRU)
+        let (path4, value4, _) = create_test_list_files_entry("path4", 1, 100);
+        let key4 = TableScopedPath {
+            table: table_ref,
+            path: path4,
+        };
+        cache.put(&key4, value4);
+
+        assert_eq!(cache.len(), 3);
+        assert!(!cache.contains_key(&key1)); // Evicted
+        assert!(cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+        assert!(cache.contains_key(&key4));
+    }
+
+    #[test]
+    fn test_lru_ordering_after_access() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 100);
+
+        // Set cache limit to fit exactly three entries
+        let cache = DefaultListFilesCache::new(size * 3, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path3,
+        };
+
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        cache.put(&key3, value3);
+        assert_eq!(cache.len(), 3);
+
+        // Access path1 to move it to front (MRU)
+        // Order is now: path2 (LRU), path3, path1 (MRU)
+        let _ = cache.get(&key1);
+
+        // Adding a new entry should evict path2 (the LRU)
+        let (path4, value4, _) = create_test_list_files_entry("path4", 1, 100);
+        let key4 = TableScopedPath {
+            table: table_ref,
+            path: path4,
+        };
+        cache.put(&key4, value4);
+
+        assert_eq!(cache.len(), 3);
+        assert!(cache.contains_key(&key1)); // Still present (recently accessed)
+        assert!(!cache.contains_key(&key2)); // Evicted (was LRU)
+        assert!(cache.contains_key(&key3));
+        assert!(cache.contains_key(&key4));
+    }
+
+    #[test]
+    fn test_reject_too_large() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+
+        // Set cache limit to fit both entries
+        let cache = DefaultListFilesCache::new(size * 2, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        assert_eq!(cache.len(), 2);
+
+        // Try to add an entry that's too large to fit in the cache
+        // The entry is not stored (too large)
+        let (path_large, value_large, _) = create_test_list_files_entry("large", 1, 1000);
+        let key_large = TableScopedPath {
+            table: table_ref,
+            path: path_large,
+        };
+        cache.put(&key_large, value_large);
+
+        // Large entry should not be added
+        assert!(!cache.contains_key(&key_large));
+        assert_eq!(cache.len(), 2);
+        assert!(cache.contains_key(&key1));
+        assert!(cache.contains_key(&key2));
+    }
+
+    #[test]
+    fn test_multiple_evictions() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 100);
+
+        // Set cache limit for exactly 3 entries
+        let cache = DefaultListFilesCache::new(size * 3, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path3,
+        };
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        cache.put(&key3, value3);
+        assert_eq!(cache.len(), 3);
+
+        // Add a large entry that requires evicting 2 entries
+        let (path_large, value_large, _) = create_test_list_files_entry("large", 1, 200);
+        let key_large = TableScopedPath {
+            table: table_ref,
+            path: path_large,
+        };
+        cache.put(&key_large, value_large);
+
+        // path1 and path2 should be evicted (both LRU), path3 and path_large remain
+        assert_eq!(cache.len(), 2);
+        assert!(!cache.contains_key(&key1)); // Evicted
+        assert!(!cache.contains_key(&key2)); // Evicted
+        assert!(cache.contains_key(&key3));
+        assert!(cache.contains_key(&key_large));
+    }
+
+    #[test]
+    fn test_cache_limit_resize() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 100);
+
+        let cache = DefaultListFilesCache::new(size * 3, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref,
+            path: path3,
+        };
+        // Add three entries
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        cache.put(&key3, value3);
+        assert_eq!(cache.len(), 3);
+
+        // Resize cache to only fit one entry
+        cache.update_cache_limit(size);
+
+        // Should keep only the most recent entry (path3, the MRU)
+        assert_eq!(cache.len(), 1);
+        assert!(cache.contains_key(&key3));
+        // Earlier entries (LRU) should be evicted
+        assert!(!cache.contains_key(&key1));
+        assert!(!cache.contains_key(&key2));
+    }
+
+    #[test]
+    fn test_entry_update_with_size_change() {
+        let (path1, value1, size) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, size2) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3_v1, _) = create_test_list_files_entry("path3", 1, 100);
+
+        let cache = DefaultListFilesCache::new(size * 3, None);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref,
+            path: path3,
+        };
+        // Add three entries
+        cache.put(&key1, value1);
+        cache.put(&key2, value2.clone());
+        cache.put(&key3, value3_v1);
+        assert_eq!(cache.len(), 3);
+
+        // Update path3 with same size - should not cause eviction
+        let (_, value3_v2, _) = create_test_list_files_entry("path3", 1, 100);
+        cache.put(&key3, value3_v2);
+
+        assert_eq!(cache.len(), 3);
+        assert!(cache.contains_key(&key1));
+        assert!(cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+
+        // Update path3 with larger size that requires evicting path1 (LRU)
+        let (_, value3_v3, size3_v3) = create_test_list_files_entry("path3", 1, 200);
+        cache.put(&key3, value3_v3.clone());
+
+        assert_eq!(cache.len(), 2);
+        assert!(!cache.contains_key(&key1)); // Evicted (was LRU)
+        assert!(cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+
+        // List cache entries
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    key2,
+                    ListFilesEntry {
+                        metas: value2,
+                        size_bytes: size2,
+                        expires: None,
+                    }
+                ),
+                (
+                    key3,
+                    ListFilesEntry {
+                        metas: value3_v3,
+                        size_bytes: size3_v3,
+                        expires: None,
+                    }
+                )
+            ])
+        );
+    }
+
+    #[test]
+    fn test_cache_with_ttl() {
+        let ttl = Duration::from_millis(100);
+
+        let mock_time = Arc::new(MockTimeProvider::new());
+        let cache = DefaultListFilesCache::new(10000, Some(ttl))
+            .with_time_provider(Arc::clone(&mock_time) as Arc<dyn TimeProvider>);
+
+        let (path1, value1, size1) = create_test_list_files_entry("path1", 2, 50);
+        let (path2, value2, size2) = create_test_list_files_entry("path2", 2, 50);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref,
+            path: path2,
+        };
+        cache.put(&key1, value1.clone());
+        cache.put(&key2, value2.clone());
+
+        // Entries should be accessible immediately
+        assert!(cache.get(&key1).is_some());
+        assert!(cache.get(&key2).is_some());
+        // List cache entries
+        assert_eq!(
+            cache.list_entries(),
+            HashMap::from([
+                (
+                    key1.clone(),
+                    ListFilesEntry {
+                        metas: value1,
+                        size_bytes: size1,
+                        expires: mock_time.now().checked_add(ttl),
+                    }
+                ),
+                (
+                    key2.clone(),
+                    ListFilesEntry {
+                        metas: value2,
+                        size_bytes: size2,
+                        expires: mock_time.now().checked_add(ttl),
+                    }
+                )
+            ])
+        );
+        // Wait for TTL to expire
+        mock_time.inc(Duration::from_millis(150));
+
+        // Entries should now return None when observed through contains_key
+        assert!(!cache.contains_key(&key1));
+        assert_eq!(cache.len(), 1); // key1 was removed by contains_key()
+        assert!(!cache.contains_key(&key2));
+        assert_eq!(cache.len(), 0); // key2 was removed by contains_key()
+    }
+
+    #[test]
+    fn test_cache_with_ttl_and_lru() {
+        let ttl = Duration::from_millis(200);
+
+        let mock_time = Arc::new(MockTimeProvider::new());
+        let cache = DefaultListFilesCache::new(1000, Some(ttl))
+            .with_time_provider(Arc::clone(&mock_time) as Arc<dyn TimeProvider>);
+
+        let (path1, value1, _) = create_test_list_files_entry("path1", 1, 400);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 400);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 400);
+
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        let key3 = TableScopedPath {
+            table: table_ref,
+            path: path3,
+        };
+        cache.put(&key1, value1);
+        mock_time.inc(Duration::from_millis(50));
+        cache.put(&key2, value2);
+        mock_time.inc(Duration::from_millis(50));
+
+        // path3 should evict path1 due to size limit
+        cache.put(&key3, value3);
+        assert!(!cache.contains_key(&key1)); // Evicted by LRU
+        assert!(cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+
+        mock_time.inc(Duration::from_millis(151));
+
+        assert!(!cache.contains_key(&key2)); // Expired
+        assert!(cache.contains_key(&key3)); // Still valid
+    }
+
+    #[test]
+    fn test_ttl_expiration_in_get() {
+        let ttl = Duration::from_millis(100);
+        let cache = DefaultListFilesCache::new(10000, Some(ttl));
+
+        let (path, value, _) = create_test_list_files_entry("path", 2, 50);
+        let table_ref = Some(TableReference::from("table"));
+        let key = TableScopedPath {
+            table: table_ref,
+            path,
+        };
+
+        // Cache the entry
+        cache.put(&key, value.clone());
+
+        // Entry should be accessible immediately
+        let result = cache.get(&key);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap().files.len(), 2);
+
+        // Wait for TTL to expire
+        thread::sleep(Duration::from_millis(150));
+
+        // Get should return None because entry expired
+        let result2 = cache.get(&key);
+        assert!(result2.is_none());
+    }
+
+    #[test]
+    fn test_meta_heap_bytes_calculation() {
+        // Test with minimal ObjectMeta (no e_tag, no version)
+        let meta1 = ObjectMeta {
+            location: Path::from("test"),
+            last_modified: chrono::Utc::now(),
+            size: 100,
+            e_tag: None,
+            version: None,
+        };
+        assert_eq!(meta_heap_bytes(&meta1), 4); // Just the location string "test"
+
+        // Test with e_tag
+        let meta2 = ObjectMeta {
+            location: Path::from("test"),
+            last_modified: chrono::Utc::now(),
+            size: 100,
+            e_tag: Some("etag123".to_string()),
+            version: None,
+        };
+        assert_eq!(meta_heap_bytes(&meta2), 4 + 7); // location (4) + e_tag (7)
+
+        // Test with version
+        let meta3 = ObjectMeta {
+            location: Path::from("test"),
+            last_modified: chrono::Utc::now(),
+            size: 100,
+            e_tag: None,
+            version: Some("v1.0".to_string()),
+        };
+        assert_eq!(meta_heap_bytes(&meta3), 4 + 4); // location (4) + version (4)
+
+        // Test with both e_tag and version
+        let meta4 = ObjectMeta {
+            location: Path::from("test"),
+            last_modified: chrono::Utc::now(),
+            size: 100,
+            e_tag: Some("tag".to_string()),
+            version: Some("ver".to_string()),
+        };
+        assert_eq!(meta_heap_bytes(&meta4), 4 + 3 + 3); // location (4) + e_tag (3) + version (3)
+    }
+
+    #[test]
+    fn test_entry_creation() {
+        // Test with empty vector
+        let empty_list = CachedFileList::new(vec![]);
+        let now = Instant::now();
+        let entry = ListFilesEntry::try_new(empty_list, None, now);
+        assert!(entry.is_none());
+
+        // Validate entry size
+        let metas: Vec<ObjectMeta> = (0..5)
+            .map(|i| create_test_object_meta(&format!("file{i}"), 30))
+            .collect();
+        let cached_list = CachedFileList::new(metas);
+        let entry = ListFilesEntry::try_new(cached_list, None, now).unwrap();
+        assert_eq!(entry.metas.files.len(), 5);
+        // Size should be: capacity * sizeof(ObjectMeta) + (5 * 30) for heap bytes
+        let expected_size = (entry.metas.files.capacity() * size_of::<ObjectMeta>())
+            + (entry.metas.files.len() * 30);
+        assert_eq!(entry.size_bytes, expected_size);
+
+        // Test with TTL
+        let meta = create_test_object_meta("file", 50);
+        let ttl = Duration::from_secs(10);
+        let cached_list = CachedFileList::new(vec![meta]);
+        let entry = ListFilesEntry::try_new(cached_list, Some(ttl), now).unwrap();
+        assert!(entry.expires.unwrap() > now);
+    }
+
+    #[test]
+    fn test_memory_tracking() {
+        let cache = DefaultListFilesCache::new(1000, None);
+
+        // Verify cache starts with 0 memory used
+        {
+            let state = cache.state.lock().unwrap();
+            assert_eq!(state.memory_used, 0);
+        }
+
+        // Add entry and verify memory tracking
+        let (path1, value1, size1) = create_test_list_files_entry("path1", 1, 100);
+        let table_ref = Some(TableReference::from("table"));
+        let key1 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path1,
+        };
+        cache.put(&key1, value1);
+        {
+            let state = cache.state.lock().unwrap();
+            assert_eq!(state.memory_used, size1);
+        }
+
+        // Add another entry
+        let (path2, value2, size2) = create_test_list_files_entry("path2", 1, 200);
+        let key2 = TableScopedPath {
+            table: table_ref.clone(),
+            path: path2,
+        };
+        cache.put(&key2, value2);
+        {
+            let state = cache.state.lock().unwrap();
+            assert_eq!(state.memory_used, size1 + size2);
+        }
+
+        // Remove first entry and verify memory decreases
+        cache.remove(&key1);
+        {
+            let state = cache.state.lock().unwrap();
+            assert_eq!(state.memory_used, size2);
+        }
+
+        // Clear and verify memory is 0
+        cache.clear();
+        {
+            let state = cache.state.lock().unwrap();
+            assert_eq!(state.memory_used, 0);
+        }
+    }
+
+    // Prefix filtering tests using CachedFileList::filter_by_prefix
+
+    /// Helper function to create ObjectMeta with a specific location path
+    fn create_object_meta_with_path(location: &str) -> ObjectMeta {
+        ObjectMeta {
+            location: Path::from(location),
+            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
+                .unwrap()
+                .into(),
+            size: 1024,
+            e_tag: None,
+            version: None,
+        }
+    }
+
+    #[test]
+    fn test_prefix_filtering() {
+        let cache = DefaultListFilesCache::new(100000, None);
+
+        // Create files for a partitioned table
+        let table_base = Path::from("my_table");
+        let files = vec![
+            create_object_meta_with_path("my_table/a=1/file1.parquet"),
+            create_object_meta_with_path("my_table/a=1/file2.parquet"),
+            create_object_meta_with_path("my_table/a=2/file3.parquet"),
+            create_object_meta_with_path("my_table/a=2/file4.parquet"),
+        ];
+
+        // Cache the full table listing
+        let table_ref = Some(TableReference::from("table"));
+        let key = TableScopedPath {
+            table: table_ref,
+            path: table_base,
+        };
+        cache.put(&key, CachedFileList::new(files));
+
+        let result = cache.get(&key).unwrap();
+
+        // Filter for partition a=1
+        let prefix_a1 = Some(Path::from("my_table/a=1"));
+        let filtered = result.files_matching_prefix(&prefix_a1);
+        assert_eq!(filtered.len(), 2);
+        assert!(
+            filtered
+                .iter()
+                .all(|m| m.location.as_ref().starts_with("my_table/a=1"))
+        );
+
+        // Filter for partition a=2
+        let prefix_a2 = Some(Path::from("my_table/a=2"));
+        let filtered_2 = result.files_matching_prefix(&prefix_a2);
+        assert_eq!(filtered_2.len(), 2);
+        assert!(
+            filtered_2
+                .iter()
+                .all(|m| m.location.as_ref().starts_with("my_table/a=2"))
+        );
+
+        // No filter returns all
+        let all = result.files_matching_prefix(&None);
+        assert_eq!(all.len(), 4);
+    }
+
+    #[test]
+    fn test_prefix_no_matching_files() {
+        let cache = DefaultListFilesCache::new(100000, None);
+
+        let table_base = Path::from("my_table");
+        let files = vec![
+            create_object_meta_with_path("my_table/a=1/file1.parquet"),
+            create_object_meta_with_path("my_table/a=2/file2.parquet"),
+        ];
+
+        let table_ref = Some(TableReference::from("table"));
+        let key = TableScopedPath {
+            table: table_ref,
+            path: table_base,
+        };
+        cache.put(&key, CachedFileList::new(files));
+        let result = cache.get(&key).unwrap();
+
+        // Query for partition a=3 which doesn't exist
+        let prefix_a3 = Some(Path::from("my_table/a=3"));
+        let filtered = result.files_matching_prefix(&prefix_a3);
+        assert!(filtered.is_empty());
+    }
+
+    #[test]
+    fn test_nested_partitions() {
+        let cache = DefaultListFilesCache::new(100000, None);
+
+        let table_base = Path::from("events");
+        let files = vec![
+            create_object_meta_with_path(
+                "events/year=2024/month=01/day=01/file1.parquet",
+            ),
+            create_object_meta_with_path(
+                "events/year=2024/month=01/day=02/file2.parquet",
+            ),
+            create_object_meta_with_path(
+                "events/year=2024/month=02/day=01/file3.parquet",
+            ),
+            create_object_meta_with_path(
+                "events/year=2025/month=01/day=01/file4.parquet",
+            ),
+        ];
+
+        let table_ref = Some(TableReference::from("table"));
+        let key = TableScopedPath {
+            table: table_ref,
+            path: table_base,
+        };
+        cache.put(&key, CachedFileList::new(files));
+        let result = cache.get(&key).unwrap();
+
+        // Filter for year=2024/month=01
+        let prefix_month = Some(Path::from("events/year=2024/month=01"));
+        let filtered = result.files_matching_prefix(&prefix_month);
+        assert_eq!(filtered.len(), 2);
+
+        // Filter for year=2024
+        let prefix_year = Some(Path::from("events/year=2024"));
+        let filtered_year = result.files_matching_prefix(&prefix_year);
+        assert_eq!(filtered_year.len(), 3);
+    }
+
+    #[test]
+    fn test_drop_table_entries() {
+        let cache = DefaultListFilesCache::default();
+
+        let (path1, value1, _) = create_test_list_files_entry("path1", 1, 100);
+        let (path2, value2, _) = create_test_list_files_entry("path2", 1, 100);
+        let (path3, value3, _) = create_test_list_files_entry("path3", 1, 100);
+
+        let table_ref1 = Some(TableReference::from("table1"));
+        let key1 = TableScopedPath {
+            table: table_ref1.clone(),
+            path: path1,
+        };
+        let key2 = TableScopedPath {
+            table: table_ref1.clone(),
+            path: path2,
+        };
+
+        let table_ref2 = Some(TableReference::from("table2"));
+        let key3 = TableScopedPath {
+            table: table_ref2.clone(),
+            path: path3,
+        };
+
+        cache.put(&key1, value1);
+        cache.put(&key2, value2);
+        cache.put(&key3, value3);
+
+        cache.drop_table_entries(&table_ref1).unwrap();
+
+        assert!(!cache.contains_key(&key1));
+        assert!(!cache.contains_key(&key2));
+        assert!(cache.contains_key(&key3));
+    }
+}
diff --git a/datafusion/execution/src/cache/lru_queue.rs b/datafusion/execution/src/cache/lru_queue.rs
new file mode 100644
index 0000000000000..fb3d158ced425
--- /dev/null
+++ b/datafusion/execution/src/cache/lru_queue.rs
@@ -0,0 +1,542 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{
+    collections::HashMap,
+    hash::Hash,
+    sync::{Arc, Weak},
+};
+
+use parking_lot::Mutex;
+
+#[derive(Default)]
+/// Provides a Least Recently Used queue with unbounded capacity.
+///
+/// # Examples
+///
+/// ```
+/// use datafusion_execution::cache::lru_queue::LruQueue;
+///
+/// let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+/// lru_queue.put(1, 10);
+/// lru_queue.put(2, 20);
+/// lru_queue.put(3, 30);
+/// assert_eq!(lru_queue.get(&2), Some(&20));
+/// assert_eq!(lru_queue.pop(), Some((1, 10)));
+/// assert_eq!(lru_queue.pop(), Some((3, 30)));
+/// assert_eq!(lru_queue.pop(), Some((2, 20)));
+/// assert_eq!(lru_queue.pop(), None);
+/// ```
+pub struct LruQueue<K: Eq + Hash + Clone, V> {
+    data: LruData<K, V>,
+    queue: LruList<K>,
+}
+
+/// Maps the key to the [`LruNode`] in queue and the value.
+type LruData<K, V> = HashMap<K, (Arc<Mutex<LruNode<K>>>, V)>;
+
+#[derive(Default)]
+/// Doubly-linked list that maintains the LRU order
+struct LruList<K> {
+    head: Link<K>,
+    tail: Link<K>,
+}
+
+/// Doubly-linked list node.
+struct LruNode<K> {
+    key: K,
+    prev: Link<K>,
+    next: Link<K>,
+}
+
+/// Weak pointer to a [`LruNode`], used to connect nodes in the doubly-linked list.
+/// The strong reference is guaranteed to be stored in the `data` map of the [`LruQueue`].
+type Link<K> = Option<Weak<Mutex<LruNode<K>>>>;
+
+impl<K: Eq + Hash + Clone, V> LruQueue<K, V> {
+    pub fn new() -> Self {
+        Self {
+            data: HashMap::new(),
+            queue: LruList {
+                head: None,
+                tail: None,
+            },
+        }
+    }
+
+    /// Returns a reference to value mapped by `key`, if it exists.
+    /// If the entry exists, it becomes the most recently used.
+    pub fn get(&mut self, key: &K) -> Option<&V> {
+        if let Some(value) = self.remove(key) {
+            self.put(key.clone(), value);
+        }
+        self.data.get(key).map(|(_, value)| value)
+    }
+
+    /// Returns a reference to value mapped by `key`, if it exists.
+    /// Does not affect the queue order.
+    pub fn peek(&self, key: &K) -> Option<&V> {
+        self.data.get(key).map(|(_, value)| value)
+    }
+
+    /// Checks whether there is an entry with key `key` in the queue.
+    /// Does not affect the queue order.
+    pub fn contains_key(&self, key: &K) -> bool {
+        self.data.contains_key(key)
+    }
+
+    /// Inserts an entry in the queue, becoming the most recently used.
+    /// If the entry already exists, returns the previous value.
+    pub fn put(&mut self, key: K, value: V) -> Option<V> {
+        let old_value = self.remove(&key);
+
+        let node = Arc::new(Mutex::new(LruNode {
+            key: key.clone(),
+            prev: None,
+            next: None,
+        }));
+
+        match self.queue.head {
+            // queue is not empty
+            Some(ref old_head) => {
+                old_head
+                    .upgrade()
+                    .expect("value has been unexpectedly dropped")
+                    .lock()
+                    .prev = Some(Arc::downgrade(&node));
+                node.lock().next = Some(Weak::clone(old_head));
+                self.queue.head = Some(Arc::downgrade(&node));
+            }
+            // queue is empty
+            _ => {
+                self.queue.head = Some(Arc::downgrade(&node));
+                self.queue.tail = Some(Arc::downgrade(&node));
+            }
+        }
+
+        self.data.insert(key, (node, value));
+
+        old_value
+    }
+
+    /// Removes and returns the least recently used value.
+    /// Returns `None` if the queue is empty.
+    pub fn pop(&mut self) -> Option<(K, V)> {
+        let key_to_remove = self.queue.tail.as_ref().map(|n| {
+            n.upgrade()
+                .expect("value has been unexpectedly dropped")
+                .lock()
+                .key
+                .clone()
+        });
+        if let Some(k) = key_to_remove {
+            let value = self.remove(&k).unwrap(); // confirmed above that the entry exists
+            Some((k, value))
+        } else {
+            None
+        }
+    }
+
+    /// Removes a specific entry from the queue, if it exists.
+    pub fn remove(&mut self, key: &K) -> Option<V> {
+        if let Some((old_node, old_value)) = self.data.remove(key) {
+            let LruNode { key: _, prev, next } = &*old_node.lock();
+            match (prev, next) {
+                // single node in the queue
+                (None, None) => {
+                    self.queue.head = None;
+                    self.queue.tail = None;
+                }
+                // removed the head node
+                (None, Some(n)) => {
+                    let n_strong =
+                        n.upgrade().expect("value has been unexpectedly dropped");
+                    n_strong.lock().prev = None;
+                    self.queue.head = Some(Weak::clone(n));
+                }
+                // removed the tail node
+                (Some(p), None) => {
+                    let p_strong =
+                        p.upgrade().expect("value has been unexpectedly dropped");
+                    p_strong.lock().next = None;
+                    self.queue.tail = Some(Weak::clone(p));
+                }
+                // removed a middle node
+                (Some(p), Some(n)) => {
+                    let n_strong =
+                        n.upgrade().expect("value has been unexpectedly dropped");
+                    let p_strong =
+                        p.upgrade().expect("value has been unexpectedly dropped");
+                    n_strong.lock().prev = Some(Weak::clone(p));
+                    p_strong.lock().next = Some(Weak::clone(n));
+                }
+            };
+            Some(old_value)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the number of entries in the queue.
+    pub fn len(&self) -> usize {
+        self.data.len()
+    }
+
+    /// Checks whether the queue has no items.
+    pub fn is_empty(&self) -> bool {
+        self.data.is_empty()
+    }
+
+    /// Removes all entries from the queue.
+    pub fn clear(&mut self) {
+        self.queue.head = None;
+        self.queue.tail = None;
+        self.data.clear();
+    }
+
+    /// Returns a reference to the entries currently in the queue.
+    pub fn list_entries(&self) -> HashMap<&K, &V> {
+        self.data.iter().map(|(k, (_, v))| (k, v)).collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use rand::seq::IndexedRandom;
+
+    use crate::cache::lru_queue::LruQueue;
+
+    #[test]
+    fn test_get() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // value does not exist
+        assert_eq!(lru_queue.get(&1), None);
+
+        // value exists
+        lru_queue.put(1, 10);
+        assert_eq!(lru_queue.get(&1), Some(&10));
+        assert_eq!(lru_queue.get(&1), Some(&10));
+
+        // value is removed
+        lru_queue.remove(&1);
+        assert_eq!(lru_queue.get(&1), None);
+    }
+
+    #[test]
+    fn test_peek() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // value does not exist
+        assert_eq!(lru_queue.peek(&1), None);
+
+        // value exists
+        lru_queue.put(1, 10);
+        assert_eq!(lru_queue.peek(&1), Some(&10));
+        assert_eq!(lru_queue.peek(&1), Some(&10));
+
+        // value is removed
+        lru_queue.remove(&1);
+        assert_eq!(lru_queue.peek(&1), None);
+    }
+
+    #[test]
+    fn test_put() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // no previous value
+        assert_eq!(lru_queue.put(1, 10), None);
+
+        // update, the previous value is returned
+        assert_eq!(lru_queue.put(1, 11), Some(10));
+        assert_eq!(lru_queue.put(1, 12), Some(11));
+        assert_eq!(lru_queue.put(1, 13), Some(12));
+    }
+
+    #[test]
+    fn test_remove() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // value does not exist
+        assert_eq!(lru_queue.remove(&1), None);
+
+        // value exists and is returned
+        lru_queue.put(1, 10);
+        assert_eq!(lru_queue.remove(&1), Some(10));
+
+        // value does not exist
+        assert_eq!(lru_queue.remove(&1), None);
+    }
+
+    #[test]
+    fn test_contains_key() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // value does not exist
+        assert!(!lru_queue.contains_key(&1));
+
+        // value exists
+        lru_queue.put(1, 10);
+        assert!(lru_queue.contains_key(&1));
+
+        // value is removed
+        lru_queue.remove(&1);
+        assert!(!lru_queue.contains_key(&1));
+    }
+
+    #[test]
+    fn test_len() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // empty
+        assert_eq!(lru_queue.len(), 0);
+
+        // puts
+        lru_queue.put(1, 10);
+        assert_eq!(lru_queue.len(), 1);
+        lru_queue.put(2, 20);
+        assert_eq!(lru_queue.len(), 2);
+        lru_queue.put(3, 30);
+        assert_eq!(lru_queue.len(), 3);
+        lru_queue.put(1, 11);
+        lru_queue.put(3, 31);
+        assert_eq!(lru_queue.len(), 3);
+
+        // removes
+        lru_queue.remove(&1);
+        assert_eq!(lru_queue.len(), 2);
+        lru_queue.remove(&1);
+        assert_eq!(lru_queue.len(), 2);
+        lru_queue.remove(&4);
+        assert_eq!(lru_queue.len(), 2);
+        lru_queue.remove(&3);
+        assert_eq!(lru_queue.len(), 1);
+        lru_queue.remove(&2);
+        assert_eq!(lru_queue.len(), 0);
+        lru_queue.remove(&2);
+        assert_eq!(lru_queue.len(), 0);
+
+        // clear
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        assert_eq!(lru_queue.len(), 3);
+        lru_queue.clear();
+        assert_eq!(lru_queue.len(), 0);
+    }
+
+    #[test]
+    fn test_is_empty() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // empty
+        assert!(lru_queue.is_empty());
+
+        // puts
+        lru_queue.put(1, 10);
+        assert!(!lru_queue.is_empty());
+        lru_queue.put(2, 20);
+        assert!(!lru_queue.is_empty());
+
+        // removes
+        lru_queue.remove(&1);
+        assert!(!lru_queue.is_empty());
+        lru_queue.remove(&1);
+        assert!(!lru_queue.is_empty());
+        lru_queue.remove(&2);
+        assert!(lru_queue.is_empty());
+
+        // clear
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        assert!(!lru_queue.is_empty());
+        lru_queue.clear();
+        assert!(lru_queue.is_empty());
+    }
+
+    #[test]
+    fn test_clear() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // empty
+        lru_queue.clear();
+
+        // filled
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        assert_eq!(lru_queue.get(&1), Some(&10));
+        assert_eq!(lru_queue.get(&2), Some(&20));
+        assert_eq!(lru_queue.get(&3), Some(&30));
+        lru_queue.clear();
+        assert_eq!(lru_queue.get(&1), None);
+        assert_eq!(lru_queue.get(&2), None);
+        assert_eq!(lru_queue.get(&3), None);
+        assert_eq!(lru_queue.len(), 0);
+    }
+
+    #[test]
+    fn test_pop() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+
+        // empty queue
+        assert_eq!(lru_queue.pop(), None);
+
+        // simplest case
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'get' changes the order
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.get(&2);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // multiple 'gets'
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.get(&2);
+        lru_queue.get(&3);
+        lru_queue.get(&1);
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'peak' does not change the order
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.peek(&2);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'contains' does not change the order
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.contains_key(&2);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'put' on the same key promotes it
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.put(2, 21);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), Some((2, 21)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // multiple 'puts'
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.put(2, 21);
+        lru_queue.put(3, 31);
+        lru_queue.put(1, 11);
+        assert_eq!(lru_queue.pop(), Some((2, 21)));
+        assert_eq!(lru_queue.pop(), Some((3, 31)));
+        assert_eq!(lru_queue.pop(), Some((1, 11)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'remove' an element in the middle of the queue
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.remove(&2);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'remove' the LRU
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.remove(&1);
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), Some((3, 30)));
+        assert_eq!(lru_queue.pop(), None);
+
+        // 'remove' the MRU
+        lru_queue.put(1, 10);
+        lru_queue.put(2, 20);
+        lru_queue.put(3, 30);
+        lru_queue.remove(&3);
+        assert_eq!(lru_queue.pop(), Some((1, 10)));
+        assert_eq!(lru_queue.pop(), Some((2, 20)));
+        assert_eq!(lru_queue.pop(), None);
+    }
+
+    #[test]
+    /// Fuzzy test using an hashmap as the base to check the methods.
+    fn test_fuzzy() {
+        let mut lru_queue: LruQueue<i32, i32> = LruQueue::new();
+        let mut map: HashMap<i32, i32> = HashMap::new();
+        let max_keys = 1_000;
+        let methods = ["get", "put", "remove", "pop", "contains", "len"];
+        let mut rng = rand::rng();
+
+        for i in 0..1_000_000 {
+            match *methods.choose(&mut rng).unwrap() {
+                "get" => {
+                    assert_eq!(lru_queue.get(&(i % max_keys)), map.get(&(i % max_keys)))
+                }
+                "put" => assert_eq!(
+                    lru_queue.put(i % max_keys, i),
+                    map.insert(i % max_keys, i)
+                ),
+                "remove" => assert_eq!(
+                    lru_queue.remove(&(i % max_keys)),
+                    map.remove(&(i % max_keys))
+                ),
+                "pop" => {
+                    let removed = lru_queue.pop();
+                    if let Some((k, v)) = removed {
+                        assert_eq!(Some(v), map.remove(&k))
+                    }
+                }
+                "contains" => {
+                    assert_eq!(
+                        lru_queue.contains_key(&(i % max_keys)),
+                        map.contains_key(&(i % max_keys))
+                    )
+                }
+                "len" => assert_eq!(lru_queue.len(), map.len()),
+                _ => unreachable!(),
+            }
+        }
+    }
+}
diff --git a/datafusion/execution/src/cache/mod.rs b/datafusion/execution/src/cache/mod.rs
index 4271bebd0b326..0380e50c0935c 100644
--- a/datafusion/execution/src/cache/mod.rs
+++ b/datafusion/execution/src/cache/mod.rs
@@ -17,35 +17,64 @@
 
 pub mod cache_manager;
 pub mod cache_unit;
+pub mod lru_queue;
 
-/// The cache accessor, users usually working on this interface while manipulating caches.
-/// This interface does not get `mut` references and thus has to handle its own
-/// locking via internal mutability. It can be accessed via multiple concurrent queries
-/// during planning and execution.
+mod file_metadata_cache;
+mod list_files_cache;
+
+pub use file_metadata_cache::DefaultFilesMetadataCache;
+pub use list_files_cache::DefaultListFilesCache;
+pub use list_files_cache::ListFilesEntry;
+pub use list_files_cache::TableScopedPath;
+
+/// Base trait for cache implementations with common operations.
+///
+/// This trait provides the fundamental cache operations (`get`, `put`, `remove`, etc.)
+/// that all cache types share. Specific cache traits like [`cache_manager::FileStatisticsCache`],
+/// [`cache_manager::ListFilesCache`], and [`cache_manager::FileMetadataCache`] extend this
+/// trait with their specialized methods.
+///
+/// ## Thread Safety
+///
+/// Implementations must handle their own locking via internal mutability, as methods do not
+/// take mutable references and may be accessed by multiple concurrent queries.
+///
+/// ## Validation Pattern
+///
+/// Validation metadata (e.g., file size, last modified time) should be embedded in the
+/// value type `V`. The typical usage pattern is:
+/// 1. Call `get(key)` to check for cached value
+/// 2. If `Some(cached)`, validate with `cached.is_valid_for(&current_meta)`
+/// 3. If invalid or missing, compute new value and call `put(key, new_value)`
 pub trait CacheAccessor<K, V>: Send + Sync {
-    // Extra info but not part of the cache key or cache value.
-    type Extra: Clone;
-
-    /// Get value from cache.
-    fn get(&self, k: &K) -> Option<V>;
-    /// Get value from cache.
-    fn get_with_extra(&self, k: &K, e: &Self::Extra) -> Option<V>;
-    /// Put value into cache. Returns the old value associated with the key if there was one.
+    /// Get a cached entry if it exists.
+    ///
+    /// Returns the cached value without any validation. The caller should
+    /// validate the returned value if freshness matters.
+    fn get(&self, key: &K) -> Option<V>;
+
+    /// Store a value in the cache.
+    ///
+    /// Returns the previous value if one existed.
     fn put(&self, key: &K, value: V) -> Option<V>;
-    /// Put value into cache. Returns the old value associated with the key if there was one.
-    fn put_with_extra(&self, key: &K, value: V, e: &Self::Extra) -> Option<V>;
-    /// Remove an entry from the cache, returning value if they existed in the map.
-    fn remove(&mut self, k: &K) -> Option<V>;
+
+    /// Remove an entry from the cache, returning the value if it existed.
+    fn remove(&self, k: &K) -> Option<V>;
+
     /// Check if the cache contains a specific key.
     fn contains_key(&self, k: &K) -> bool;
+
     /// Fetch the total number of cache entries.
     fn len(&self) -> usize;
-    /// Check if the Cache collection is empty or not.
+
+    /// Check if the cache collection is empty.
     fn is_empty(&self) -> bool {
         self.len() == 0
     }
+
     /// Remove all entries from the cache.
     fn clear(&self);
+
     /// Return the cache name.
     fn name(&self) -> String;
 }
diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs
index 1e00a1ce4725e..854d239236766 100644
--- a/datafusion/execution/src/config.rs
+++ b/datafusion/execution/src/config.rs
@@ -23,8 +23,8 @@ use std::{
 };
 
 use datafusion_common::{
-    config::{ConfigExtension, ConfigOptions},
     Result, ScalarValue,
+    config::{ConfigExtension, ConfigOptions, SpillCompression},
 };
 
 /// Configuration options for [`SessionContext`].
@@ -44,12 +44,15 @@ use datafusion_common::{
 /// shorthand for setting `datafusion.execution.batch_size`.
 ///
 /// ```
-/// use datafusion_execution::config::SessionConfig;
 /// use datafusion_common::ScalarValue;
+/// use datafusion_execution::config::SessionConfig;
 ///
 /// let config = SessionConfig::new()
-///    .set("datafusion.execution.batch_size", &ScalarValue::UInt64(Some(1234)))
-///    .set_bool("datafusion.execution.parquet.pushdown_filters", true);
+///     .set(
+///         "datafusion.execution.batch_size",
+///         &ScalarValue::UInt64(Some(1234)),
+///     )
+///     .set_bool("datafusion.execution.parquet.pushdown_filters", true);
 ///
 /// assert_eq!(config.batch_size(), 1234);
 /// assert_eq!(config.options().execution.batch_size, 1234);
@@ -91,8 +94,11 @@ use datafusion_common::{
 /// [`SessionContext::new_with_config`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.new_with_config
 #[derive(Clone, Debug)]
 pub struct SessionConfig {
-    /// Configuration options
-    options: ConfigOptions,
+    /// Configuration options for the current session.
+    ///
+    /// A new copy is created on write, if there are other outstanding
+    /// references to the same options.
+    options: Arc<ConfigOptions>,
     /// Opaque extensions.
     extensions: AnyMap,
 }
@@ -100,7 +106,7 @@ pub struct SessionConfig {
 impl Default for SessionConfig {
     fn default() -> Self {
         Self {
-            options: ConfigOptions::new(),
+            options: Arc::new(ConfigOptions::new()),
             // Assume no extensions by default.
             extensions: HashMap::with_capacity_and_hasher(
                 0,
@@ -117,6 +123,9 @@ impl SessionConfig {
     }
 
     /// Create an execution config with config options read from the environment
+    ///
+    /// See [`ConfigOptions::from_env`] for details on how environment variables
+    /// are mapped to config options.
     pub fn from_env() -> Result<Self> {
         Ok(ConfigOptions::from_env()?.into())
     }
@@ -136,7 +145,7 @@ impl SessionConfig {
     /// let config = SessionConfig::new();
     /// assert!(config.options().execution.batch_size > 0);
     /// ```
-    pub fn options(&self) -> &ConfigOptions {
+    pub fn options(&self) -> &Arc<ConfigOptions> {
         &self.options
     }
 
@@ -152,7 +161,7 @@ impl SessionConfig {
     /// assert_eq!(config.options().execution.batch_size, 1024);
     /// ```
     pub fn options_mut(&mut self) -> &mut ConfigOptions {
-        &mut self.options
+        Arc::make_mut(&mut self.options)
     }
 
     /// Set a configuration option
@@ -177,7 +186,7 @@ impl SessionConfig {
 
     /// Set a generic `str` configuration option
     pub fn set_str(mut self, key: &str, value: &str) -> Self {
-        self.options.set(key, value).unwrap();
+        self.options_mut().set(key, value).unwrap();
         self
     }
 
@@ -185,7 +194,7 @@ impl SessionConfig {
     pub fn with_batch_size(mut self, n: usize) -> Self {
         // batch size must be greater than zero
         assert!(n > 0);
-        self.options.execution.batch_size = n;
+        self.options_mut().execution.batch_size = n;
         self
     }
 
@@ -193,7 +202,7 @@ impl SessionConfig {
     ///
     /// [`target_partitions`]: datafusion_common::config::ExecutionOptions::target_partitions
     pub fn with_target_partitions(mut self, n: usize) -> Self {
-        self.options.execution.target_partitions = if n == 0 {
+        self.options_mut().execution.target_partitions = if n == 0 {
             datafusion_common::config::ExecutionOptions::default().target_partitions
         } else {
             n
@@ -258,68 +267,75 @@ impl SessionConfig {
         self.options.execution.collect_statistics
     }
 
+    /// Compression codec for spill file
+    pub fn spill_compression(&self) -> SpillCompression {
+        self.options.execution.spill_compression
+    }
+
     /// Selects a name for the default catalog and schema
     pub fn with_default_catalog_and_schema(
         mut self,
         catalog: impl Into<String>,
         schema: impl Into<String>,
     ) -> Self {
-        self.options.catalog.default_catalog = catalog.into();
-        self.options.catalog.default_schema = schema.into();
+        self.options_mut().catalog.default_catalog = catalog.into();
+        self.options_mut().catalog.default_schema = schema.into();
         self
     }
 
     /// Controls whether the default catalog and schema will be automatically created
     pub fn with_create_default_catalog_and_schema(mut self, create: bool) -> Self {
-        self.options.catalog.create_default_catalog_and_schema = create;
+        self.options_mut().catalog.create_default_catalog_and_schema = create;
         self
     }
 
     /// Enables or disables the inclusion of `information_schema` virtual tables
     pub fn with_information_schema(mut self, enabled: bool) -> Self {
-        self.options.catalog.information_schema = enabled;
+        self.options_mut().catalog.information_schema = enabled;
         self
     }
 
     /// Enables or disables the use of repartitioning for joins to improve parallelism
     pub fn with_repartition_joins(mut self, enabled: bool) -> Self {
-        self.options.optimizer.repartition_joins = enabled;
+        self.options_mut().optimizer.repartition_joins = enabled;
         self
     }
 
     /// Enables or disables the use of repartitioning for aggregations to improve parallelism
     pub fn with_repartition_aggregations(mut self, enabled: bool) -> Self {
-        self.options.optimizer.repartition_aggregations = enabled;
+        self.options_mut().optimizer.repartition_aggregations = enabled;
         self
     }
 
     /// Sets minimum file range size for repartitioning scans
     pub fn with_repartition_file_min_size(mut self, size: usize) -> Self {
-        self.options.optimizer.repartition_file_min_size = size;
+        self.options_mut().optimizer.repartition_file_min_size = size;
         self
     }
 
     /// Enables or disables the allowing unordered symmetric hash join
     pub fn with_allow_symmetric_joins_without_pruning(mut self, enabled: bool) -> Self {
-        self.options.optimizer.allow_symmetric_joins_without_pruning = enabled;
+        self.options_mut()
+            .optimizer
+            .allow_symmetric_joins_without_pruning = enabled;
         self
     }
 
     /// Enables or disables the use of repartitioning for file scans
     pub fn with_repartition_file_scans(mut self, enabled: bool) -> Self {
-        self.options.optimizer.repartition_file_scans = enabled;
+        self.options_mut().optimizer.repartition_file_scans = enabled;
         self
     }
 
     /// Enables or disables the use of repartitioning for window functions to improve parallelism
     pub fn with_repartition_windows(mut self, enabled: bool) -> Self {
-        self.options.optimizer.repartition_windows = enabled;
+        self.options_mut().optimizer.repartition_windows = enabled;
         self
     }
 
     /// Enables or disables the use of per-partition sorting to improve parallelism
     pub fn with_repartition_sorts(mut self, enabled: bool) -> Self {
-        self.options.optimizer.repartition_sorts = enabled;
+        self.options_mut().optimizer.repartition_sorts = enabled;
         self
     }
 
@@ -328,7 +344,7 @@ impl SessionConfig {
     ///
     /// [prefer_existing_sort]: datafusion_common::config::OptimizerOptions::prefer_existing_sort
     pub fn with_prefer_existing_sort(mut self, enabled: bool) -> Self {
-        self.options.optimizer.prefer_existing_sort = enabled;
+        self.options_mut().optimizer.prefer_existing_sort = enabled;
         self
     }
 
@@ -336,13 +352,13 @@ impl SessionConfig {
     ///
     /// [prefer_existing_union]: datafusion_common::config::OptimizerOptions::prefer_existing_union
     pub fn with_prefer_existing_union(mut self, enabled: bool) -> Self {
-        self.options.optimizer.prefer_existing_union = enabled;
+        self.options_mut().optimizer.prefer_existing_union = enabled;
         self
     }
 
     /// Enables or disables the use of pruning predicate for parquet readers to skip row groups
     pub fn with_parquet_pruning(mut self, enabled: bool) -> Self {
-        self.options.execution.parquet.pruning = enabled;
+        self.options_mut().execution.parquet.pruning = enabled;
         self
     }
 
@@ -358,7 +374,7 @@ impl SessionConfig {
 
     /// Enables or disables the use of bloom filter for parquet readers to skip row groups
     pub fn with_parquet_bloom_filter_pruning(mut self, enabled: bool) -> Self {
-        self.options.execution.parquet.bloom_filter_on_read = enabled;
+        self.options_mut().execution.parquet.bloom_filter_on_read = enabled;
         self
     }
 
@@ -369,13 +385,13 @@ impl SessionConfig {
 
     /// Enables or disables the use of page index for parquet readers to skip parquet data pages
     pub fn with_parquet_page_index_pruning(mut self, enabled: bool) -> Self {
-        self.options.execution.parquet.enable_page_index = enabled;
+        self.options_mut().execution.parquet.enable_page_index = enabled;
         self
     }
 
     /// Enables or disables the collection of statistics after listing files
     pub fn with_collect_statistics(mut self, enabled: bool) -> Self {
-        self.options.execution.collect_statistics = enabled;
+        self.options_mut().execution.collect_statistics = enabled;
         self
     }
 
@@ -386,7 +402,7 @@ impl SessionConfig {
 
     /// Enables or disables the coalescence of small batches into larger batches
     pub fn with_coalesce_batches(mut self, enabled: bool) -> Self {
-        self.options.execution.coalesce_batches = enabled;
+        self.options_mut().execution.coalesce_batches = enabled;
         self
     }
 
@@ -398,7 +414,7 @@ impl SessionConfig {
 
     /// Enables or disables the round robin repartition for increasing parallelism
     pub fn with_round_robin_repartition(mut self, enabled: bool) -> Self {
-        self.options.optimizer.enable_round_robin_repartition = enabled;
+        self.options_mut().optimizer.enable_round_robin_repartition = enabled;
         self
     }
 
@@ -408,6 +424,13 @@ impl SessionConfig {
         self.options.optimizer.enable_round_robin_repartition
     }
 
+    /// Enables or disables sort pushdown optimization, and currently only
+    /// applies to Parquet data source.
+    pub fn with_enable_sort_pushdown(mut self, enabled: bool) -> Self {
+        self.options_mut().optimizer.enable_sort_pushdown = enabled;
+        self
+    }
+
     /// Set the size of [`sort_spill_reservation_bytes`] to control
     /// memory pre-reservation
     ///
@@ -416,11 +439,19 @@ impl SessionConfig {
         mut self,
         sort_spill_reservation_bytes: usize,
     ) -> Self {
-        self.options.execution.sort_spill_reservation_bytes =
+        self.options_mut().execution.sort_spill_reservation_bytes =
             sort_spill_reservation_bytes;
         self
     }
 
+    /// Set the compression codec [`spill_compression`] used when spilling data to disk.
+    ///
+    /// [`spill_compression`]: datafusion_common::config::ExecutionOptions::spill_compression
+    pub fn with_spill_compression(mut self, spill_compression: SpillCompression) -> Self {
+        self.options_mut().execution.spill_compression = spill_compression;
+        self
+    }
+
     /// Set the size of [`sort_in_place_threshold_bytes`] to control
     /// how sort does things.
     ///
@@ -429,7 +460,7 @@ impl SessionConfig {
         mut self,
         sort_in_place_threshold_bytes: usize,
     ) -> Self {
-        self.options.execution.sort_in_place_threshold_bytes =
+        self.options_mut().execution.sort_in_place_threshold_bytes =
             sort_in_place_threshold_bytes;
         self
     }
@@ -439,7 +470,8 @@ impl SessionConfig {
         mut self,
         enforce_batch_size_in_joins: bool,
     ) -> Self {
-        self.options.execution.enforce_batch_size_in_joins = enforce_batch_size_in_joins;
+        self.options_mut().execution.enforce_batch_size_in_joins =
+            enforce_batch_size_in_joins;
         self
     }
 
@@ -448,6 +480,12 @@ impl SessionConfig {
         self.options.execution.enforce_batch_size_in_joins
     }
 
+    /// Toggle SQL ANSI mode for expressions, casting, and error handling
+    pub fn with_enable_ansi_mode(mut self, enable_ansi_mode: bool) -> Self {
+        self.options_mut().execution.enable_ansi_mode = enable_ansi_mode;
+        self
+    }
+
     /// Convert configuration options to name-value pairs with values
     /// converted to strings.
     ///
@@ -480,8 +518,8 @@ impl SessionConfig {
     ///
     /// # Example
     /// ```
-    /// use std::sync::Arc;
     /// use datafusion_execution::config::SessionConfig;
+    /// use std::sync::Arc;
     ///
     /// // application-specific extension types
     /// struct Ext1(u8);
@@ -523,8 +561,8 @@ impl SessionConfig {
     ///
     /// # Example
     /// ```
-    /// use std::sync::Arc;
     /// use datafusion_execution::config::SessionConfig;
+    /// use std::sync::Arc;
     ///
     /// // application-specific extension types
     /// struct Ext1(u8);
@@ -577,6 +615,7 @@ impl SessionConfig {
 
 impl From<ConfigOptions> for SessionConfig {
     fn from(options: ConfigOptions) -> Self {
+        let options = Arc::new(options);
         Self {
             options,
             ..Default::default()
diff --git a/datafusion/execution/src/disk_manager.rs b/datafusion/execution/src/disk_manager.rs
index 1810601fd362a..1a14bd239a61a 100644
--- a/datafusion/execution/src/disk_manager.rs
+++ b/datafusion/execution/src/disk_manager.rs
@@ -18,19 +18,19 @@
 //! [`DiskManager`]: Manages files generated during query execution
 
 use datafusion_common::{
-    config_err, resources_datafusion_err, resources_err, DataFusionError, Result,
+    DataFusionError, Result, config_err, resources_datafusion_err, resources_err,
 };
 use log::debug;
 use parking_lot::Mutex;
-use rand::{rng, Rng};
+use rand::{Rng, rng};
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
 use tempfile::{Builder, NamedTempFile, TempDir};
 
-use crate::memory_pool::human_readable_size;
+use datafusion_common::human_readable_size;
 
-const DEFAULT_MAX_TEMP_DIRECTORY_SIZE: u64 = 100 * 1024 * 1024 * 1024; // 100GB
+pub const DEFAULT_MAX_TEMP_DIRECTORY_SIZE: u64 = 100 * 1024 * 1024 * 1024; // 100GB
 
 /// Builder pattern for the [DiskManager] structure
 #[derive(Clone, Debug)]
@@ -77,9 +77,10 @@ impl DiskManagerBuilder {
                 local_dirs: Mutex::new(Some(vec![])),
                 max_temp_directory_size: self.max_temp_directory_size,
                 used_disk_space: Arc::new(AtomicU64::new(0)),
+                active_files_count: Arc::new(AtomicUsize::new(0)),
             }),
             DiskManagerMode::Directories(conf_dirs) => {
-                let local_dirs = create_local_dirs(conf_dirs)?;
+                let local_dirs = create_local_dirs(&conf_dirs)?;
                 debug!(
                     "Created local dirs {local_dirs:?} as DataFusion working directory"
                 );
@@ -87,21 +88,24 @@ impl DiskManagerBuilder {
                     local_dirs: Mutex::new(Some(local_dirs)),
                     max_temp_directory_size: self.max_temp_directory_size,
                     used_disk_space: Arc::new(AtomicU64::new(0)),
+                    active_files_count: Arc::new(AtomicUsize::new(0)),
                 })
             }
             DiskManagerMode::Disabled => Ok(DiskManager {
                 local_dirs: Mutex::new(None),
                 max_temp_directory_size: self.max_temp_directory_size,
                 used_disk_space: Arc::new(AtomicU64::new(0)),
+                active_files_count: Arc::new(AtomicUsize::new(0)),
             }),
         }
     }
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub enum DiskManagerMode {
     /// Create a new [DiskManager] that creates temporary files within
     /// a temporary directory chosen by the OS
+    #[default]
     OsTmpDirectory,
 
     /// Create a new [DiskManager] that creates temporary files within
@@ -113,21 +117,18 @@ pub enum DiskManagerMode {
     Disabled,
 }
 
-impl Default for DiskManagerMode {
-    fn default() -> Self {
-        Self::OsTmpDirectory
-    }
-}
-
 /// Configuration for temporary disk access
 #[deprecated(since = "48.0.0", note = "Use DiskManagerBuilder instead")]
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
+#[allow(clippy::allow_attributes)]
+#[allow(deprecated)]
 pub enum DiskManagerConfig {
     /// Use the provided [DiskManager] instance
     Existing(Arc<DiskManager>),
 
     /// Create a new [DiskManager] that creates temporary files within
     /// a temporary directory chosen by the OS
+    #[default]
     NewOs,
 
     /// Create a new [DiskManager] that creates temporary files within
@@ -138,14 +139,7 @@ pub enum DiskManagerConfig {
     Disabled,
 }
 
-#[allow(deprecated)]
-impl Default for DiskManagerConfig {
-    fn default() -> Self {
-        Self::NewOs
-    }
-}
-
-#[allow(deprecated)]
+#[expect(deprecated)]
 impl DiskManagerConfig {
     /// Create temporary files in a temporary directory chosen by the OS
     pub fn new() -> Self {
@@ -178,6 +172,17 @@ pub struct DiskManager {
     /// Used disk space in the temporary directories. Now only spilled data for
     /// external executors are counted.
     used_disk_space: Arc<AtomicU64>,
+    /// Number of active temporary files created by this disk manager
+    active_files_count: Arc<AtomicUsize>,
+}
+
+/// Information about the current disk usage for spilling
+#[derive(Debug, Clone, Copy)]
+pub struct SpillingProgress {
+    /// Total bytes currently used on disk for spilling
+    pub current_bytes: u64,
+    /// Total number of active spill files
+    pub active_files_count: usize,
 }
 
 impl DiskManager {
@@ -187,7 +192,7 @@ impl DiskManager {
     }
 
     /// Create a DiskManager given the configuration
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     #[deprecated(since = "48.0.0", note = "Use DiskManager::builder() instead")]
     pub fn try_new(config: DiskManagerConfig) -> Result<Arc<Self>> {
         match config {
@@ -196,9 +201,10 @@ impl DiskManager {
                 local_dirs: Mutex::new(Some(vec![])),
                 max_temp_directory_size: DEFAULT_MAX_TEMP_DIRECTORY_SIZE,
                 used_disk_space: Arc::new(AtomicU64::new(0)),
+                active_files_count: Arc::new(AtomicUsize::new(0)),
             })),
             DiskManagerConfig::NewSpecified(conf_dirs) => {
-                let local_dirs = create_local_dirs(conf_dirs)?;
+                let local_dirs = create_local_dirs(&conf_dirs)?;
                 debug!(
                     "Created local dirs {local_dirs:?} as DataFusion working directory"
                 );
@@ -206,12 +212,14 @@ impl DiskManager {
                     local_dirs: Mutex::new(Some(local_dirs)),
                     max_temp_directory_size: DEFAULT_MAX_TEMP_DIRECTORY_SIZE,
                     used_disk_space: Arc::new(AtomicU64::new(0)),
+                    active_files_count: Arc::new(AtomicUsize::new(0)),
                 }))
             }
             DiskManagerConfig::Disabled => Ok(Arc::new(Self {
                 local_dirs: Mutex::new(None),
                 max_temp_directory_size: DEFAULT_MAX_TEMP_DIRECTORY_SIZE,
                 used_disk_space: Arc::new(AtomicU64::new(0)),
+                active_files_count: Arc::new(AtomicUsize::new(0)),
             })),
         }
     }
@@ -256,6 +264,32 @@ impl DiskManager {
         self.used_disk_space.load(Ordering::Relaxed)
     }
 
+    /// Returns the maximum temporary directory size in bytes
+    pub fn max_temp_directory_size(&self) -> u64 {
+        self.max_temp_directory_size
+    }
+
+    /// Returns the current spilling progress
+    pub fn spilling_progress(&self) -> SpillingProgress {
+        SpillingProgress {
+            current_bytes: self.used_disk_space.load(Ordering::Relaxed),
+            active_files_count: self.active_files_count.load(Ordering::Relaxed),
+        }
+    }
+
+    /// Returns the temporary directory paths
+    pub fn temp_dir_paths(&self) -> Vec<PathBuf> {
+        self.local_dirs
+            .lock()
+            .as_ref()
+            .map(|dirs| {
+                dirs.iter()
+                    .map(|temp_dir| temp_dir.path().to_path_buf())
+                    .collect()
+            })
+            .unwrap_or_default()
+    }
+
     /// Return true if this disk manager supports creating temporary
     /// files. If this returns false, any call to `create_tmp_file`
     /// will error.
@@ -292,12 +326,15 @@ impl DiskManager {
         }
 
         let dir_index = rng().random_range(0..local_dirs.len());
+        self.active_files_count.fetch_add(1, Ordering::Relaxed);
         Ok(RefCountedTempFile {
-            _parent_temp_dir: Arc::clone(&local_dirs[dir_index]),
-            tempfile: Builder::new()
-                .tempfile_in(local_dirs[dir_index].as_ref())
-                .map_err(DataFusionError::IoError)?,
-            current_file_disk_usage: 0,
+            parent_temp_dir: Arc::clone(&local_dirs[dir_index]),
+            tempfile: Arc::new(
+                Builder::new()
+                    .tempfile_in(local_dirs[dir_index].as_ref())
+                    .map_err(DataFusionError::IoError)?,
+            ),
+            current_file_disk_usage: Arc::new(AtomicU64::new(0)),
             disk_manager: Arc::clone(self),
         })
     }
@@ -311,26 +348,50 @@ impl DiskManager {
 /// must invoke [`Self::update_disk_usage`] to update the global disk usage counter.
 /// This ensures the disk manager can properly enforce usage limits configured by
 /// [`DiskManager::with_max_temp_directory_size`].
+///
+/// This type is Clone-able, allowing multiple references to the same underlying file.
+/// The file is deleted only when the last reference is dropped.
+///
+/// The parent temporary directory is also kept alive as long as any reference to
+/// this file exists, preventing premature cleanup of the directory.
+///
+/// Once all references to this file are dropped, the file is deleted, and the
+/// disk usage is subtracted from the disk manager's total.
 #[derive(Debug)]
 pub struct RefCountedTempFile {
     /// The reference to the directory in which temporary files are created to ensure
     /// it is not cleaned up prior to the NamedTempFile
-    _parent_temp_dir: Arc<TempDir>,
-    tempfile: NamedTempFile,
+    parent_temp_dir: Arc<TempDir>,
+    /// The underlying temporary file, wrapped in Arc to allow cloning
+    tempfile: Arc<NamedTempFile>,
     /// Tracks the current disk usage of this temporary file. See
     /// [`Self::update_disk_usage`] for more details.
-    current_file_disk_usage: u64,
+    ///
+    /// This is wrapped in `Arc<AtomicU64>` so that all clones share the same
+    /// disk usage tracking, preventing incorrect accounting when clones are dropped.
+    current_file_disk_usage: Arc<AtomicU64>,
     /// The disk manager that created and manages this temporary file
     disk_manager: Arc<DiskManager>,
 }
 
+impl Clone for RefCountedTempFile {
+    fn clone(&self) -> Self {
+        Self {
+            parent_temp_dir: Arc::clone(&self.parent_temp_dir),
+            tempfile: Arc::clone(&self.tempfile),
+            current_file_disk_usage: Arc::clone(&self.current_file_disk_usage),
+            disk_manager: Arc::clone(&self.disk_manager),
+        }
+    }
+}
+
 impl RefCountedTempFile {
     pub fn path(&self) -> &Path {
         self.tempfile.path()
     }
 
     pub fn inner(&self) -> &NamedTempFile {
-        &self.tempfile
+        self.tempfile.as_ref()
     }
 
     /// Updates the global disk usage counter after modifications to the underlying file.
@@ -342,11 +403,14 @@ impl RefCountedTempFile {
         let metadata = self.tempfile.as_file().metadata()?;
         let new_disk_usage = metadata.len();
 
+        // Get the old disk usage
+        let old_disk_usage = self.current_file_disk_usage.load(Ordering::Relaxed);
+
         // Update the global disk usage by:
         // 1. Subtracting the old file size from the global counter
         self.disk_manager
             .used_disk_space
-            .fetch_sub(self.current_file_disk_usage, Ordering::Relaxed);
+            .fetch_sub(old_disk_usage, Ordering::Relaxed);
         // 2. Adding the new file size to the global counter
         self.disk_manager
             .used_disk_space
@@ -356,30 +420,44 @@ impl RefCountedTempFile {
         let global_disk_usage = self.disk_manager.used_disk_space.load(Ordering::Relaxed);
         if global_disk_usage > self.disk_manager.max_temp_directory_size {
             return resources_err!(
-                "The used disk space during the spilling process has exceeded the allowable limit of {}. Try increasing the `max_temp_directory_size` in the disk manager configuration.",
+                "The used disk space during the spilling process has exceeded the allowable limit of {}. \
+                Please try increasing the config: `datafusion.runtime.max_temp_directory_size`.",
                 human_readable_size(self.disk_manager.max_temp_directory_size as usize)
             );
         }
 
         // 4. Update the local file size tracking
-        self.current_file_disk_usage = new_disk_usage;
+        self.current_file_disk_usage
+            .store(new_disk_usage, Ordering::Relaxed);
 
         Ok(())
     }
+
+    pub fn current_disk_usage(&self) -> u64 {
+        self.current_file_disk_usage.load(Ordering::Relaxed)
+    }
 }
 
 /// When the temporary file is dropped, subtract its disk usage from the disk manager's total
 impl Drop for RefCountedTempFile {
     fn drop(&mut self) {
-        // Subtract the current file's disk usage from the global counter
-        self.disk_manager
-            .used_disk_space
-            .fetch_sub(self.current_file_disk_usage, Ordering::Relaxed);
+        // Only subtract disk usage when this is the last reference to the file
+        // Check if we're the last one by seeing if there's only one strong reference
+        // left to the underlying tempfile (the one we're holding)
+        if Arc::strong_count(&self.tempfile) == 1 {
+            let current_usage = self.current_file_disk_usage.load(Ordering::Relaxed);
+            self.disk_manager
+                .used_disk_space
+                .fetch_sub(current_usage, Ordering::Relaxed);
+            self.disk_manager
+                .active_files_count
+                .fetch_sub(1, Ordering::Relaxed);
+        }
     }
 }
 
 /// Setup local dirs by creating one new dir in each of the given dirs
-fn create_local_dirs(local_dirs: Vec<PathBuf>) -> Result<Vec<Arc<TempDir>>> {
+fn create_local_dirs(local_dirs: &[PathBuf]) -> Result<Vec<Arc<TempDir>>> {
     local_dirs
         .iter()
         .map(|root| {
@@ -461,7 +539,10 @@ mod tests {
         );
         assert!(!manager.tmp_files_enabled());
         assert_eq!(
-            manager.create_tmp_file("Testing").unwrap_err().strip_backtrace(),
+            manager
+                .create_tmp_file("Testing")
+                .unwrap_err()
+                .strip_backtrace(),
             "Resources exhausted: Memory Exhausted while Testing (DiskManager is disabled)",
         )
     }
@@ -529,4 +610,190 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_disk_usage_basic() -> Result<()> {
+        use std::io::Write;
+
+        let dm = Arc::new(DiskManagerBuilder::default().build()?);
+        let mut temp_file = dm.create_tmp_file("Testing")?;
+
+        // Initially, disk usage should be 0
+        assert_eq!(dm.used_disk_space(), 0);
+        assert_eq!(temp_file.current_disk_usage(), 0);
+
+        // Write some data to the file
+        temp_file.inner().as_file().write_all(b"hello world")?;
+        temp_file.update_disk_usage()?;
+
+        // Disk usage should now reflect the written data
+        let expected_usage = temp_file.current_disk_usage();
+        assert!(expected_usage > 0);
+        assert_eq!(dm.used_disk_space(), expected_usage);
+
+        // Write more data
+        temp_file.inner().as_file().write_all(b" more data")?;
+        temp_file.update_disk_usage()?;
+
+        // Disk usage should increase
+        let new_usage = temp_file.current_disk_usage();
+        assert!(new_usage > expected_usage);
+        assert_eq!(dm.used_disk_space(), new_usage);
+
+        // Drop the file
+        drop(temp_file);
+
+        // Disk usage should return to 0
+        assert_eq!(dm.used_disk_space(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_disk_usage_with_clones() -> Result<()> {
+        use std::io::Write;
+
+        let dm = Arc::new(DiskManagerBuilder::default().build()?);
+        let mut temp_file = dm.create_tmp_file("Testing")?;
+
+        // Write some data
+        temp_file.inner().as_file().write_all(b"test data")?;
+        temp_file.update_disk_usage()?;
+
+        let usage_after_write = temp_file.current_disk_usage();
+        assert!(usage_after_write > 0);
+        assert_eq!(dm.used_disk_space(), usage_after_write);
+
+        // Clone the file
+        let clone1 = temp_file.clone();
+        let clone2 = temp_file.clone();
+
+        // All clones should see the same disk usage
+        assert_eq!(clone1.current_disk_usage(), usage_after_write);
+        assert_eq!(clone2.current_disk_usage(), usage_after_write);
+
+        // Global disk usage should still be the same (not multiplied by number of clones)
+        assert_eq!(dm.used_disk_space(), usage_after_write);
+
+        // Write more data through one clone
+        clone1.inner().as_file().write_all(b" more data")?;
+        let mut mutable_clone1 = clone1;
+        mutable_clone1.update_disk_usage()?;
+
+        let new_usage = mutable_clone1.current_disk_usage();
+        assert!(new_usage > usage_after_write);
+
+        // All clones should see the updated disk usage
+        assert_eq!(temp_file.current_disk_usage(), new_usage);
+        assert_eq!(clone2.current_disk_usage(), new_usage);
+        assert_eq!(mutable_clone1.current_disk_usage(), new_usage);
+
+        // Global disk usage should reflect the new size (not multiplied)
+        assert_eq!(dm.used_disk_space(), new_usage);
+
+        // Drop one clone
+        drop(mutable_clone1);
+
+        // Disk usage should NOT change (other clones still exist)
+        assert_eq!(dm.used_disk_space(), new_usage);
+        assert_eq!(temp_file.current_disk_usage(), new_usage);
+        assert_eq!(clone2.current_disk_usage(), new_usage);
+
+        // Drop another clone
+        drop(clone2);
+
+        // Disk usage should still NOT change (original still exists)
+        assert_eq!(dm.used_disk_space(), new_usage);
+        assert_eq!(temp_file.current_disk_usage(), new_usage);
+
+        // Drop the original
+        drop(temp_file);
+
+        // Now disk usage should return to 0 (last reference dropped)
+        assert_eq!(dm.used_disk_space(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_disk_usage_clones_dropped_out_of_order() -> Result<()> {
+        use std::io::Write;
+
+        let dm = Arc::new(DiskManagerBuilder::default().build()?);
+        let mut temp_file = dm.create_tmp_file("Testing")?;
+
+        // Write data
+        temp_file.inner().as_file().write_all(b"test")?;
+        temp_file.update_disk_usage()?;
+
+        let usage = temp_file.current_disk_usage();
+        assert_eq!(dm.used_disk_space(), usage);
+
+        // Create multiple clones
+        let clone1 = temp_file.clone();
+        let clone2 = temp_file.clone();
+        let clone3 = temp_file.clone();
+
+        // Drop the original first (out of order)
+        drop(temp_file);
+
+        // Disk usage should still be tracked (clones exist)
+        assert_eq!(dm.used_disk_space(), usage);
+        assert_eq!(clone1.current_disk_usage(), usage);
+
+        // Drop clones in different order
+        drop(clone2);
+        assert_eq!(dm.used_disk_space(), usage);
+
+        drop(clone1);
+        assert_eq!(dm.used_disk_space(), usage);
+
+        // Drop the last clone
+        drop(clone3);
+
+        // Now disk usage should be 0
+        assert_eq!(dm.used_disk_space(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_disk_usage_multiple_files() -> Result<()> {
+        use std::io::Write;
+
+        let dm = Arc::new(DiskManagerBuilder::default().build()?);
+
+        // Create multiple temp files
+        let mut file1 = dm.create_tmp_file("Testing1")?;
+        let mut file2 = dm.create_tmp_file("Testing2")?;
+
+        // Write to first file
+        file1.inner().as_file().write_all(b"file1")?;
+        file1.update_disk_usage()?;
+        let usage1 = file1.current_disk_usage();
+
+        assert_eq!(dm.used_disk_space(), usage1);
+
+        // Write to second file
+        file2.inner().as_file().write_all(b"file2 data")?;
+        file2.update_disk_usage()?;
+        let usage2 = file2.current_disk_usage();
+
+        // Global usage should be sum of both files
+        assert_eq!(dm.used_disk_space(), usage1 + usage2);
+
+        // Drop first file
+        drop(file1);
+
+        // Usage should only reflect second file
+        assert_eq!(dm.used_disk_space(), usage2);
+
+        // Drop second file
+        drop(file2);
+
+        // Usage should be 0
+        assert_eq!(dm.used_disk_space(), 0);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/execution/src/lib.rs b/datafusion/execution/src/lib.rs
index 6a0a4b6322ee8..1a8da9459ae10 100644
--- a/datafusion/execution/src/lib.rs
+++ b/datafusion/execution/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! DataFusion execution configuration and runtime structures
 
@@ -31,6 +32,8 @@ pub mod config;
 pub mod disk_manager;
 pub mod memory_pool;
 pub mod object_store;
+#[cfg(feature = "parquet_encryption")]
+pub mod parquet_encryption;
 pub mod runtime_env;
 mod stream;
 mod task;
@@ -44,4 +47,4 @@ pub mod registry {
 pub use disk_manager::DiskManager;
 pub use registry::FunctionRegistry;
 pub use stream::{RecordBatchStream, SendableRecordBatchStream};
-pub use task::TaskContext;
+pub use task::{TaskContext, TaskContextProvider};
diff --git a/datafusion/execution/src/memory_pool/arrow.rs b/datafusion/execution/src/memory_pool/arrow.rs
new file mode 100644
index 0000000000000..4e8d986f1f5e3
--- /dev/null
+++ b/datafusion/execution/src/memory_pool/arrow.rs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Adapter for integrating DataFusion's [`MemoryPool`] with Arrow's memory tracking APIs.
+
+use crate::memory_pool::{MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation};
+use std::fmt::Debug;
+use std::sync::Arc;
+
+/// An adapter that implements Arrow's [`arrow_buffer::MemoryPool`] trait
+/// by wrapping a DataFusion [`MemoryPool`].
+///
+/// This allows DataFusion's memory management system to be used with Arrow's
+/// memory allocation APIs. Each reservation made through this pool will be
+/// tracked using the provided [`MemoryConsumer`], enabling DataFusion to
+/// monitor and limit memory usage across Arrow operations.
+///
+/// This is useful when you want Arrow operations (such as array builders
+/// or compute kernels) to participate in DataFusion's memory management
+/// and respect the same memory limits as DataFusion operators.
+#[derive(Debug)]
+pub struct ArrowMemoryPool {
+    inner: Arc<dyn MemoryPool>,
+    consumer: MemoryConsumer,
+}
+
+impl ArrowMemoryPool {
+    /// Creates a new [`ArrowMemoryPool`] that wraps the given DataFusion [`MemoryPool`]
+    /// and tracks allocations under the specified [`MemoryConsumer`].
+    pub fn new(inner: Arc<dyn MemoryPool>, consumer: MemoryConsumer) -> Self {
+        Self { inner, consumer }
+    }
+}
+
+impl arrow_buffer::MemoryReservation for MemoryReservation {
+    fn size(&self) -> usize {
+        MemoryReservation::size(self)
+    }
+
+    fn resize(&mut self, new_size: usize) {
+        MemoryReservation::resize(self, new_size)
+    }
+}
+
+impl arrow_buffer::MemoryPool for ArrowMemoryPool {
+    fn reserve(&self, size: usize) -> Box<dyn arrow_buffer::MemoryReservation> {
+        let consumer = self.consumer.clone_with_new_id();
+        let mut reservation = consumer.register(&self.inner);
+        reservation.grow(size);
+
+        Box::new(reservation)
+    }
+
+    fn available(&self) -> isize {
+        // The pool may be overfilled, so this method might return a negative value.
+        (self.capacity() as i128 - self.used() as i128)
+            .try_into()
+            .unwrap_or(isize::MIN)
+    }
+
+    fn used(&self) -> usize {
+        self.inner.reserved()
+    }
+
+    fn capacity(&self) -> usize {
+        match self.inner.memory_limit() {
+            MemoryLimit::Infinite | MemoryLimit::Unknown => usize::MAX,
+            MemoryLimit::Finite(capacity) => capacity,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::memory_pool::{GreedyMemoryPool, UnboundedMemoryPool};
+    use arrow::array::{Array, Int32Array};
+    use arrow_buffer::MemoryPool;
+
+    // Until https://github.com/apache/arrow-rs/pull/8918 lands, we need to iterate all
+    // buffers in the array. Change once the PR is released.
+    fn claim_array(array: &dyn Array, pool: &dyn MemoryPool) {
+        for buffer in array.to_data().buffers() {
+            buffer.claim(pool);
+        }
+    }
+
+    #[test]
+    pub fn can_claim_array() {
+        let pool = Arc::new(UnboundedMemoryPool::default());
+
+        let consumer = MemoryConsumer::new("arrow");
+        let arrow_pool = ArrowMemoryPool::new(pool, consumer);
+
+        let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        claim_array(&array, &arrow_pool);
+
+        assert_eq!(arrow_pool.used(), array.get_buffer_memory_size());
+
+        let slice = array.slice(0, 2);
+
+        // This should be a no-op
+        claim_array(&slice, &arrow_pool);
+
+        assert_eq!(arrow_pool.used(), array.get_buffer_memory_size());
+    }
+
+    #[test]
+    pub fn can_claim_array_with_finite_limit() {
+        let pool_capacity = 1024;
+        let pool = Arc::new(GreedyMemoryPool::new(pool_capacity));
+
+        let consumer = MemoryConsumer::new("arrow");
+        let arrow_pool = ArrowMemoryPool::new(pool, consumer);
+
+        assert_eq!(arrow_pool.capacity(), pool_capacity);
+        assert_eq!(arrow_pool.available(), pool_capacity as isize);
+
+        let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        claim_array(&array, &arrow_pool);
+
+        assert_eq!(arrow_pool.used(), array.get_buffer_memory_size());
+        assert_eq!(
+            arrow_pool.available(),
+            (pool_capacity - array.get_buffer_memory_size()) as isize
+        );
+    }
+}
diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs
index 19e509d263ea2..a544cdfdb02e8 100644
--- a/datafusion/execution/src/memory_pool/mod.rs
+++ b/datafusion/execution/src/memory_pool/mod.rs
@@ -18,17 +18,22 @@
 //! [`MemoryPool`] for memory management during query execution, [`proxy`] for
 //! help with allocation accounting.
 
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, internal_datafusion_err};
 use std::hash::{Hash, Hasher};
-use std::{cmp::Ordering, sync::atomic, sync::Arc};
+use std::{cmp::Ordering, sync::Arc, sync::atomic};
 
 mod pool;
+
+#[cfg(feature = "arrow_buffer_pool")]
+pub mod arrow;
+
 pub mod proxy {
-    pub use datafusion_common::utils::proxy::{
-        HashTableAllocExt, RawTableAllocExt, VecAllocExt,
-    };
+    pub use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
 }
 
+pub use datafusion_common::{
+    human_readable_count, human_readable_duration, human_readable_size, units,
+};
 pub use pool::*;
 
 /// Tracks and potentially limits memory use across operators during execution.
@@ -57,8 +62,8 @@ pub use pool::*;
 /// `GroupByHashExec`. It does NOT track and limit memory used internally by
 /// other operators such as `DataSourceExec` or the `RecordBatch`es that flow
 /// between operators. Furthermore, operators should not reserve memory for the
-/// batches they produce. Instead, if a parent operator needs to hold batches
-/// from its children in memory for an extended period, it is the parent
+/// batches they produce. Instead, if a consumer operator needs to hold batches
+/// from its producers in memory for an extended period, it is the consumer
 /// operator's responsibility to reserve the necessary memory for those batches.
 ///
 /// In order to avoid allocating memory until the OS or the container system
@@ -98,6 +103,67 @@ pub use pool::*;
 /// operator will spill the intermediate buffers to disk, and release memory
 /// from the memory pool, and continue to retry memory reservation.
 ///
+/// # Related Structs
+///
+/// To better understand memory management in DataFusion, here are the key structs
+/// and their relationships:
+///
+/// - [`MemoryConsumer`]: A named allocation traced by a particular operator. If an
+///   execution is parallelized, and there are multiple partitions of the same
+///   operator, each partition will have a separate `MemoryConsumer`.
+/// - `SharedRegistration`: A registration of a `MemoryConsumer` with a `MemoryPool`.
+///   `SharedRegistration` and `MemoryPool` have a many-to-one relationship. `MemoryPool`
+///   implementation can decide how to allocate memory based on the registered consumers.
+///   (e.g. `FairSpillPool` will try to share available memory evenly among all registered
+///   consumers)
+/// - [`MemoryReservation`]: Each `MemoryConsumer`/operator can have multiple
+///   `MemoryReservation`s for different internal data structures. The relationship
+///   between `MemoryConsumer` and `MemoryReservation` is one-to-many. This design
+///   enables cleaner operator implementations:
+///   - Different `MemoryReservation`s can be used for different purposes
+///   - `MemoryReservation` follows RAII principles - to release a reservation,
+///     simply drop the `MemoryReservation` object. When all `MemoryReservation`s
+///     for a `SharedRegistration` are dropped, the `SharedRegistration` is dropped
+///     when its reference count reaches zero, automatically unregistering the
+///     `MemoryConsumer` from the `MemoryPool`.
+///
+/// ## Relationship Diagram
+///
+/// ```text
+/// ┌──────────────────┐     ┌──────────────────┐
+/// │MemoryReservation │     │MemoryReservation │
+/// └───┬──────────────┘     └──────────────────┘ ......
+///     │belongs to                    │
+///     │      ┌───────────────────────┘           │  │
+///     │      │                                   │  │
+///     ▼      ▼                                   ▼  ▼
+/// ┌────────────────────────┐       ┌────────────────────────┐
+/// │   SharedRegistration   │       │   SharedRegistration   │
+/// │   ┌────────────────┐   │       │   ┌────────────────┐   │
+/// │   │                │   │       │   │                │   │
+/// │   │ MemoryConsumer │   │       │   │ MemoryConsumer │   │
+/// │   │                │   │       │   │                │   │
+/// │   └────────────────┘   │       │   └────────────────┘   │
+/// └────────────┬───────────┘       └────────────┬───────────┘
+///              │                                │
+///              │                        register│into
+///              │                                │
+///              └─────────────┐   ┌──────────────┘
+///                            │   │
+///                            ▼   ▼
+///    ╔═══════════════════════════════════════════════════╗
+///    ║                                                   ║
+///    ║                    MemoryPool                     ║
+///    ║                                                   ║
+///    ╚═══════════════════════════════════════════════════╝
+/// ```
+///
+/// For example, there are two parallel partitions of an operator X: each partition
+/// corresponds to a `MemoryConsumer` in the above diagram. Inside each partition of
+/// operator X, there are typically several `MemoryReservation`s - one for each
+/// internal data structure that needs memory tracking (e.g., 1 reservation for the hash
+/// table, and 1 reservation for buffered input, etc.).
+///
 /// # Implementing `MemoryPool`
 ///
 /// You can implement a custom allocation policy by implementing the
@@ -260,7 +326,7 @@ impl MemoryConsumer {
                 pool: Arc::clone(pool),
                 consumer: self,
             }),
-            size: 0,
+            size: atomic::AtomicUsize::new(0),
         }
     }
 }
@@ -289,13 +355,13 @@ impl Drop for SharedRegistration {
 #[derive(Debug)]
 pub struct MemoryReservation {
     registration: Arc<SharedRegistration>,
-    size: usize,
+    size: atomic::AtomicUsize,
 }
 
 impl MemoryReservation {
     /// Returns the size of this reservation in bytes
     pub fn size(&self) -> usize {
-        self.size
+        self.size.load(atomic::Ordering::Relaxed)
     }
 
     /// Returns [MemoryConsumer] for this [MemoryReservation]
@@ -305,10 +371,10 @@ impl MemoryReservation {
 
     /// Frees all bytes from this reservation back to the underlying
     /// pool, returning the number of bytes freed.
-    pub fn free(&mut self) -> usize {
-        let size = self.size;
+    pub fn free(&self) -> usize {
+        let size = self.size.swap(0, atomic::Ordering::Relaxed);
         if size != 0 {
-            self.shrink(size)
+            self.registration.pool.shrink(self, size);
         }
         size
     }
@@ -318,60 +384,76 @@ impl MemoryReservation {
     /// # Panics
     ///
     /// Panics if `capacity` exceeds [`Self::size`]
-    pub fn shrink(&mut self, capacity: usize) {
-        let new_size = self.size.checked_sub(capacity).unwrap();
+    pub fn shrink(&self, capacity: usize) {
+        self.size
+            .fetch_update(
+                atomic::Ordering::Relaxed,
+                atomic::Ordering::Relaxed,
+                |prev| prev.checked_sub(capacity),
+            )
+            .unwrap_or_else(|prev| {
+                panic!("Cannot free the capacity {capacity} out of allocated size {prev}")
+            });
         self.registration.pool.shrink(self, capacity);
-        self.size = new_size
     }
 
     /// Tries to free `capacity` bytes from this reservation
-    /// if `capacity` does not exceed [`Self::size`]
-    /// Returns new reservation size
-    /// or error if shrinking capacity is more than allocated size
-    pub fn try_shrink(&mut self, capacity: usize) -> Result<usize> {
-        if let Some(new_size) = self.size.checked_sub(capacity) {
-            self.registration.pool.shrink(self, capacity);
-            self.size = new_size;
-            Ok(new_size)
-        } else {
-            internal_err!(
-                "Cannot free the capacity {capacity} out of allocated size {}",
-                self.size
+    /// if `capacity` does not exceed [`Self::size`].
+    /// Returns new reservation size,
+    /// or error if shrinking capacity is more than allocated size.
+    pub fn try_shrink(&self, capacity: usize) -> Result<usize> {
+        let prev = self
+            .size
+            .fetch_update(
+                atomic::Ordering::Relaxed,
+                atomic::Ordering::Relaxed,
+                |prev| prev.checked_sub(capacity),
             )
-        }
+            .map_err(|prev| {
+                internal_datafusion_err!(
+                    "Cannot free the capacity {capacity} out of allocated size {prev}"
+                )
+            })?;
+
+        self.registration.pool.shrink(self, capacity);
+        Ok(prev - capacity)
     }
 
     /// Sets the size of this reservation to `capacity`
-    pub fn resize(&mut self, capacity: usize) {
-        match capacity.cmp(&self.size) {
-            Ordering::Greater => self.grow(capacity - self.size),
-            Ordering::Less => self.shrink(self.size - capacity),
+    pub fn resize(&self, capacity: usize) {
+        let size = self.size.load(atomic::Ordering::Relaxed);
+        match capacity.cmp(&size) {
+            Ordering::Greater => self.grow(capacity - size),
+            Ordering::Less => self.shrink(size - capacity),
             _ => {}
         }
     }
 
     /// Try to set the size of this reservation to `capacity`
-    pub fn try_resize(&mut self, capacity: usize) -> Result<()> {
-        match capacity.cmp(&self.size) {
-            Ordering::Greater => self.try_grow(capacity - self.size)?,
-            Ordering::Less => self.shrink(self.size - capacity),
+    pub fn try_resize(&self, capacity: usize) -> Result<()> {
+        let size = self.size.load(atomic::Ordering::Relaxed);
+        match capacity.cmp(&size) {
+            Ordering::Greater => self.try_grow(capacity - size)?,
+            Ordering::Less => {
+                self.try_shrink(size - capacity)?;
+            }
             _ => {}
         };
         Ok(())
     }
 
     /// Increase the size of this reservation by `capacity` bytes
-    pub fn grow(&mut self, capacity: usize) {
+    pub fn grow(&self, capacity: usize) {
         self.registration.pool.grow(self, capacity);
-        self.size += capacity;
+        self.size.fetch_add(capacity, atomic::Ordering::Relaxed);
     }
 
     /// Try to increase the size of this reservation by `capacity`
     /// bytes, returning error if there is insufficient capacity left
     /// in the pool.
-    pub fn try_grow(&mut self, capacity: usize) -> Result<()> {
+    pub fn try_grow(&self, capacity: usize) -> Result<()> {
         self.registration.pool.try_grow(self, capacity)?;
-        self.size += capacity;
+        self.size.fetch_add(capacity, atomic::Ordering::Relaxed);
         Ok(())
     }
 
@@ -385,10 +467,16 @@ impl MemoryReservation {
     /// # Panics
     ///
     /// Panics if `capacity` exceeds [`Self::size`]
-    pub fn split(&mut self, capacity: usize) -> MemoryReservation {
-        self.size = self.size.checked_sub(capacity).unwrap();
+    pub fn split(&self, capacity: usize) -> MemoryReservation {
+        self.size
+            .fetch_update(
+                atomic::Ordering::Relaxed,
+                atomic::Ordering::Relaxed,
+                |prev| prev.checked_sub(capacity),
+            )
+            .unwrap();
         Self {
-            size: capacity,
+            size: atomic::AtomicUsize::new(capacity),
             registration: Arc::clone(&self.registration),
         }
     }
@@ -396,7 +484,7 @@ impl MemoryReservation {
     /// Returns a new empty [`MemoryReservation`] with the same [`MemoryConsumer`]
     pub fn new_empty(&self) -> Self {
         Self {
-            size: 0,
+            size: atomic::AtomicUsize::new(0),
             registration: Arc::clone(&self.registration),
         }
     }
@@ -404,7 +492,7 @@ impl MemoryReservation {
     /// Splits off all the bytes from this [`MemoryReservation`] into
     /// a new [`MemoryReservation`] with the same [`MemoryConsumer`]
     pub fn take(&mut self) -> MemoryReservation {
-        self.split(self.size)
+        self.split(self.size.load(atomic::Ordering::Relaxed))
     }
 }
 
@@ -414,34 +502,6 @@ impl Drop for MemoryReservation {
     }
 }
 
-pub mod units {
-    pub const TB: u64 = 1 << 40;
-    pub const GB: u64 = 1 << 30;
-    pub const MB: u64 = 1 << 20;
-    pub const KB: u64 = 1 << 10;
-}
-
-/// Present size in human-readable form
-pub fn human_readable_size(size: usize) -> String {
-    use units::*;
-
-    let size = size as u64;
-    let (value, unit) = {
-        if size >= 2 * TB {
-            (size as f64 / TB as f64, "TB")
-        } else if size >= 2 * GB {
-            (size as f64 / GB as f64, "GB")
-        } else if size >= 2 * MB {
-            (size as f64 / MB as f64, "MB")
-        } else if size >= 2 * KB {
-            (size as f64 / KB as f64, "KB")
-        } else {
-            (size as f64, "B")
-        }
-    };
-    format!("{value:.1} {unit}")
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -458,7 +518,7 @@ mod tests {
     #[test]
     fn test_memory_pool_underflow() {
         let pool = Arc::new(GreedyMemoryPool::new(50)) as _;
-        let mut a1 = MemoryConsumer::new("a1").register(&pool);
+        let a1 = MemoryConsumer::new("a1").register(&pool);
         assert_eq!(pool.reserved(), 0);
 
         a1.grow(100);
@@ -473,7 +533,7 @@ mod tests {
         a1.try_grow(30).unwrap();
         assert_eq!(pool.reserved(), 30);
 
-        let mut a2 = MemoryConsumer::new("a2").register(&pool);
+        let a2 = MemoryConsumer::new("a2").register(&pool);
         a2.try_grow(25).unwrap_err();
         assert_eq!(pool.reserved(), 30);
 
@@ -487,7 +547,7 @@ mod tests {
     #[test]
     fn test_split() {
         let pool = Arc::new(GreedyMemoryPool::new(50)) as _;
-        let mut r1 = MemoryConsumer::new("r1").register(&pool);
+        let r1 = MemoryConsumer::new("r1").register(&pool);
 
         r1.try_grow(20).unwrap();
         assert_eq!(r1.size(), 20);
@@ -508,10 +568,10 @@ mod tests {
     #[test]
     fn test_new_empty() {
         let pool = Arc::new(GreedyMemoryPool::new(50)) as _;
-        let mut r1 = MemoryConsumer::new("r1").register(&pool);
+        let r1 = MemoryConsumer::new("r1").register(&pool);
 
         r1.try_grow(20).unwrap();
-        let mut r2 = r1.new_empty();
+        let r2 = r1.new_empty();
         r2.try_grow(5).unwrap();
 
         assert_eq!(r1.size(), 20);
@@ -525,7 +585,7 @@ mod tests {
         let mut r1 = MemoryConsumer::new("r1").register(&pool);
 
         r1.try_grow(20).unwrap();
-        let mut r2 = r1.take();
+        let r2 = r1.take();
         r2.try_grow(5).unwrap();
 
         assert_eq!(r1.size(), 0);
@@ -538,4 +598,37 @@ mod tests {
         assert_eq!(r2.size(), 25);
         assert_eq!(pool.reserved(), 28);
     }
+
+    #[test]
+    fn test_try_shrink() {
+        let pool = Arc::new(GreedyMemoryPool::new(100)) as _;
+        let r1 = MemoryConsumer::new("r1").register(&pool);
+
+        r1.try_grow(50).unwrap();
+        assert_eq!(r1.size(), 50);
+        assert_eq!(pool.reserved(), 50);
+
+        // Successful shrink returns new size and frees pool memory
+        let new_size = r1.try_shrink(30).unwrap();
+        assert_eq!(new_size, 20);
+        assert_eq!(r1.size(), 20);
+        assert_eq!(pool.reserved(), 20);
+
+        // Freed pool memory is now available to other consumers
+        let r2 = MemoryConsumer::new("r2").register(&pool);
+        r2.try_grow(80).unwrap();
+        assert_eq!(pool.reserved(), 100);
+
+        // Shrinking more than allocated fails without changing state
+        let err = r1.try_shrink(25);
+        assert!(err.is_err());
+        assert_eq!(r1.size(), 20);
+        assert_eq!(pool.reserved(), 100);
+
+        // Shrink to exactly zero
+        let new_size = r1.try_shrink(20).unwrap();
+        assert_eq!(new_size, 0);
+        assert_eq!(r1.size(), 0);
+        assert_eq!(pool.reserved(), 80);
+    }
 }
diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs
index 11467f69be1ca..b10270851cc06 100644
--- a/datafusion/execution/src/memory_pool/pool.rs
+++ b/datafusion/execution/src/memory_pool/pool.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use crate::memory_pool::{
-    human_readable_size, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation,
+    MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation, human_readable_size,
 };
 use datafusion_common::HashMap;
-use datafusion_common::{resources_datafusion_err, DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, resources_datafusion_err};
 use log::debug;
 use parking_lot::Mutex;
 use std::{
@@ -212,7 +212,7 @@ impl MemoryPool for FairSpillPool {
                     .checked_div(state.num_spill)
                     .unwrap_or(spill_available);
 
-                if reservation.size + additional > available {
+                if reservation.size() + additional > available {
                     return Err(insufficient_capacity_err(
                         reservation,
                         additional,
@@ -260,8 +260,13 @@ fn insufficient_capacity_err(
     additional: usize,
     available: usize,
 ) -> DataFusionError {
-    resources_datafusion_err!("Failed to allocate additional {} for {} with {} already allocated for this reservation - {} remain available for the total pool", 
-    human_readable_size(additional), reservation.registration.consumer.name, human_readable_size(reservation.size), human_readable_size(available))
+    resources_datafusion_err!(
+        "Failed to allocate additional {} for {} with {} already allocated for this reservation - {} remain available for the total pool",
+        human_readable_size(additional),
+        reservation.registration.consumer.name,
+        human_readable_size(reservation.size()),
+        human_readable_size(available)
+    )
 }
 
 #[derive(Debug)]
@@ -269,6 +274,7 @@ struct TrackedConsumer {
     name: String,
     can_spill: bool,
     reserved: AtomicUsize,
+    peak: AtomicUsize,
 }
 
 impl TrackedConsumer {
@@ -277,10 +283,16 @@ impl TrackedConsumer {
         self.reserved.load(Ordering::Relaxed)
     }
 
+    /// Return the peak value
+    fn peak(&self) -> usize {
+        self.peak.load(Ordering::Relaxed)
+    }
+
     /// Grows the tracked consumer's reserved size,
     /// should be called after the pool has successfully performed the grow().
     fn grow(&self, additional: usize) {
         self.reserved.fetch_add(additional, Ordering::Relaxed);
+        self.peak.fetch_max(self.reserved(), Ordering::Relaxed);
     }
 
     /// Reduce the tracked consumer's reserved size,
@@ -295,9 +307,25 @@ impl TrackedConsumer {
 ///
 /// By tracking memory reservations more carefully this pool
 /// can provide better error messages on the largest memory users
+/// when memory allocation fails.
 ///
 /// Tracking is per hashed [`MemoryConsumer`], not per [`MemoryReservation`].
 /// The same consumer can have multiple reservations.
+///
+/// # Automatic Usage via [`RuntimeEnvBuilder`]
+///
+/// The easiest way to use `TrackConsumersPool` is via
+/// [`RuntimeEnvBuilder::with_memory_limit()`].
+///
+/// [`RuntimeEnvBuilder`]: crate::runtime_env::RuntimeEnvBuilder
+/// [`RuntimeEnvBuilder::with_memory_limit()`]: crate::runtime_env::RuntimeEnvBuilder::with_memory_limit
+///
+/// # Usage Examples
+///
+/// For more examples of using `TrackConsumersPool`, see the [memory_pool_tracking.rs] example
+///
+/// [memory_pool_tracking.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/execution_monitoring/memory_pool_tracking.rs
+/// [memory_pool_execution_plan.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
 #[derive(Debug)]
 pub struct TrackConsumersPool<I> {
     /// The wrapped memory pool that actually handles reservation logic
@@ -311,6 +339,38 @@ pub struct TrackConsumersPool<I> {
 impl<I: MemoryPool> TrackConsumersPool<I> {
     /// Creates a new [`TrackConsumersPool`].
     ///
+    /// # Arguments
+    /// * `inner` - The underlying memory pool that handles actual memory allocation
+    /// * `top` - The number of top memory consumers to include in error messages
+    ///
+    /// # Note
+    /// In most cases, you should use [`RuntimeEnvBuilder::with_memory_limit()`](crate::runtime_env::RuntimeEnvBuilder::with_memory_limit)
+    /// instead of creating this pool manually, as it automatically sets up tracking with
+    /// sensible defaults (top 5 consumers).
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use datafusion_execution::memory_pool::{
+    ///     FairSpillPool, GreedyMemoryPool, TrackConsumersPool,
+    /// };
+    /// use std::num::NonZeroUsize;
+    ///
+    /// // Create with a greedy pool backend, reporting top 3 consumers in error messages
+    /// let tracked_greedy = TrackConsumersPool::new(
+    ///     GreedyMemoryPool::new(1024 * 1024), // 1MB limit
+    ///     NonZeroUsize::new(3).unwrap(),
+    /// );
+    ///
+    /// // Create with a fair spill pool backend, reporting top 5 consumers in error messages
+    /// let tracked_fair = TrackConsumersPool::new(
+    ///     FairSpillPool::new(2 * 1024 * 1024), // 2MB limit
+    ///     NonZeroUsize::new(5).unwrap(),
+    /// );
+    /// ```
+    ///
+    /// # Impact on Error Messages
+    ///
     /// The `top` determines how many Top K [`MemoryConsumer`]s to include
     /// in the reported [`DataFusionError::ResourcesExhausted`].
     pub fn new(inner: I, top: NonZeroUsize) -> Self {
@@ -321,7 +381,7 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
         }
     }
 
-    /// The top consumers in a report string.
+    /// Returns a formatted string with the top memory consumers.
     pub fn report_top(&self, top: usize) -> String {
         let mut consumers = self
             .tracked_consumers
@@ -333,6 +393,7 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
                         *consumer_id,
                         tracked_consumer.name.to_owned(),
                         tracked_consumer.can_spill,
+                        tracked_consumer.peak(),
                     ),
                     tracked_consumer.reserved(),
                 )
@@ -342,10 +403,11 @@ impl<I: MemoryPool> TrackConsumersPool<I> {
 
         consumers[0..std::cmp::min(top, consumers.len())]
             .iter()
-            .map(|((id, name, can_spill), size)| {
+            .map(|((id, name, can_spill, peak), size)| {
                 format!(
-                    "  {name}#{id}(can spill: {can_spill}) consumed {}",
-                    human_readable_size(*size)
+                    "  {name}#{id}(can spill: {can_spill}) consumed {}, peak {}",
+                    human_readable_size(*size),
+                    human_readable_size(*peak),
                 )
             })
             .collect::<Vec<_>>()
@@ -365,6 +427,7 @@ impl<I: MemoryPool> MemoryPool for TrackConsumersPool<I> {
                 name: consumer.name().to_string(),
                 can_spill: consumer.can_spill(),
                 reserved: Default::default(),
+                peak: Default::default(),
             },
         );
 
@@ -407,8 +470,9 @@ impl<I: MemoryPool> MemoryPool for TrackConsumersPool<I> {
                     // wrap OOM message in top consumers
                     DataFusionError::ResourcesExhausted(
                         provide_top_memory_consumers_to_error_msg(
-                            e,
-                            self.report_top(self.top.into()),
+                            &reservation.consumer().name,
+                            &e,
+                            &self.report_top(self.top.into()),
                         ),
                     )
                 }
@@ -434,16 +498,19 @@ impl<I: MemoryPool> MemoryPool for TrackConsumersPool<I> {
 }
 
 fn provide_top_memory_consumers_to_error_msg(
-    error_msg: String,
-    top_consumers: String,
+    consumer_name: &str,
+    error_msg: &str,
+    top_consumers: &str,
 ) -> String {
-    format!("Additional allocation failed with top memory consumers (across reservations) as:\n{top_consumers}\nError: {error_msg}")
+    format!(
+        "Additional allocation failed for {consumer_name} with top memory consumers (across reservations) as:\n{top_consumers}\nError: {error_msg}"
+    )
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use insta::{allow_duplicates, assert_snapshot, Settings};
+    use insta::{Settings, allow_duplicates, assert_snapshot};
     use std::sync::Arc;
 
     fn make_settings() -> Settings {
@@ -459,12 +526,12 @@ mod tests {
     fn test_fair() {
         let pool = Arc::new(FairSpillPool::new(100)) as _;
 
-        let mut r1 = MemoryConsumer::new("unspillable").register(&pool);
+        let r1 = MemoryConsumer::new("unspillable").register(&pool);
         // Can grow beyond capacity of pool
         r1.grow(2000);
         assert_eq!(pool.reserved(), 2000);
 
-        let mut r2 = MemoryConsumer::new("r2")
+        let r2 = MemoryConsumer::new("r2")
             .with_can_spill(true)
             .register(&pool);
         // Can grow beyond capacity of pool
@@ -496,7 +563,7 @@ mod tests {
         assert_eq!(r2.size(), 10);
         assert_eq!(pool.reserved(), 30);
 
-        let mut r3 = MemoryConsumer::new("r3")
+        let r3 = MemoryConsumer::new("r3")
             .with_can_spill(true)
             .register(&pool);
 
@@ -517,7 +584,7 @@ mod tests {
         r1.free();
         assert_eq!(pool.reserved(), 80);
 
-        let mut r4 = MemoryConsumer::new("s4").register(&pool);
+        let r4 = MemoryConsumer::new("s4").register(&pool);
         let err = r4.try_grow(30).unwrap_err().strip_backtrace();
         assert_snapshot!(err, @"Resources exhausted: Failed to allocate additional 30.0 B for s4 with 0.0 B already allocated for this reservation - 20.0 B remain available for the total pool");
     }
@@ -534,17 +601,18 @@ mod tests {
         // Test: use all the different interfaces to change reservation size
 
         // set r1=50, using grow and shrink
-        let mut r1 = MemoryConsumer::new("r1").register(&pool);
-        r1.grow(70);
+        let r1 = MemoryConsumer::new("r1").register(&pool);
+        r1.grow(50);
+        r1.grow(20);
         r1.shrink(20);
 
         // set r2=15 using try_grow
-        let mut r2 = MemoryConsumer::new("r2").register(&pool);
+        let r2 = MemoryConsumer::new("r2").register(&pool);
         r2.try_grow(15)
             .expect("should succeed in memory allotment for r2");
 
         // set r3=20 using try_resize
-        let mut r3 = MemoryConsumer::new("r3").register(&pool);
+        let r3 = MemoryConsumer::new("r3").register(&pool);
         r3.try_resize(25)
             .expect("should succeed in memory allotment for r3");
         r3.try_resize(20)
@@ -552,20 +620,20 @@ mod tests {
 
         // set r4=10
         // this should not be reported in top 3
-        let mut r4 = MemoryConsumer::new("r4").register(&pool);
+        let r4 = MemoryConsumer::new("r4").register(&pool);
         r4.grow(10);
 
         // Test: reports if new reservation causes error
         // using the previously set sizes for other consumers
-        let mut r5 = MemoryConsumer::new("r5").register(&pool);
+        let r5 = MemoryConsumer::new("r5").register(&pool);
         let res = r5.try_grow(150);
         assert!(res.is_err());
         let error = res.unwrap_err().strip_backtrace();
         assert_snapshot!(error, @r"
-        Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-          r1#[ID](can spill: false) consumed 50.0 B,
-          r3#[ID](can spill: false) consumed 20.0 B,
-          r2#[ID](can spill: false) consumed 15.0 B.
+        Resources exhausted: Additional allocation failed for r5 with top memory consumers (across reservations) as:
+          r1#[ID](can spill: false) consumed 50.0 B, peak 70.0 B,
+          r3#[ID](can spill: false) consumed 20.0 B, peak 25.0 B,
+          r2#[ID](can spill: false) consumed 15.0 B, peak 15.0 B.
         Error: Failed to allocate additional 150.0 B for r5 with 0.0 B already allocated for this reservation - 5.0 B remain available for the total pool
         ");
     }
@@ -582,13 +650,13 @@ mod tests {
         let same_name = "foo";
 
         // Test: see error message when no consumers recorded yet
-        let mut r0 = MemoryConsumer::new(same_name).register(&pool);
+        let r0 = MemoryConsumer::new(same_name).register(&pool);
         let res = r0.try_grow(150);
         assert!(res.is_err());
         let error = res.unwrap_err().strip_backtrace();
         assert_snapshot!(error, @r"
-        Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-          foo#[ID](can spill: false) consumed 0.0 B.
+        Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as:
+          foo#[ID](can spill: false) consumed 0.0 B, peak 0.0 B.
         Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 100.0 B remain available for the total pool
         ");
 
@@ -597,16 +665,16 @@ mod tests {
 
         r0.grow(10); // make r0=10, pool available=90
         let new_consumer_same_name = MemoryConsumer::new(same_name);
-        let mut r1 = new_consumer_same_name.register(&pool);
+        let r1 = new_consumer_same_name.register(&pool);
         // TODO: the insufficient_capacity_err() message is per reservation, not per consumer.
         // a followup PR will clarify this message "0 bytes already allocated for this reservation"
         let res = r1.try_grow(150);
         assert!(res.is_err());
         let error = res.unwrap_err().strip_backtrace();
         assert_snapshot!(error, @r"
-        Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-          foo#[ID](can spill: false) consumed 10.0 B,
-          foo#[ID](can spill: false) consumed 0.0 B.
+        Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as:
+          foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B,
+          foo#[ID](can spill: false) consumed 0.0 B, peak 0.0 B.
         Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 90.0 B remain available for the total pool
         ");
 
@@ -617,9 +685,9 @@ mod tests {
         assert!(res.is_err());
         let error = res.unwrap_err().strip_backtrace();
         assert_snapshot!(error, @r"
-        Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-          foo#[ID](can spill: false) consumed 20.0 B,
-          foo#[ID](can spill: false) consumed 10.0 B.
+        Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as:
+          foo#[ID](can spill: false) consumed 20.0 B, peak 20.0 B,
+          foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B.
         Error: Failed to allocate additional 150.0 B for foo with 20.0 B already allocated for this reservation - 70.0 B remain available for the total pool
         ");
 
@@ -627,15 +695,15 @@ mod tests {
         // will be recognized as different in the TrackConsumersPool
         let consumer_with_same_name_but_different_hash =
             MemoryConsumer::new(same_name).with_can_spill(true);
-        let mut r2 = consumer_with_same_name_but_different_hash.register(&pool);
+        let r2 = consumer_with_same_name_but_different_hash.register(&pool);
         let res = r2.try_grow(150);
         assert!(res.is_err());
         let error = res.unwrap_err().strip_backtrace();
         assert_snapshot!(error, @r"
-        Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-          foo#[ID](can spill: false) consumed 20.0 B,
-          foo#[ID](can spill: false) consumed 10.0 B,
-          foo#[ID](can spill: true) consumed 0.0 B.
+        Resources exhausted: Additional allocation failed for foo with top memory consumers (across reservations) as:
+          foo#[ID](can spill: false) consumed 20.0 B, peak 20.0 B,
+          foo#[ID](can spill: false) consumed 10.0 B, peak 10.0 B,
+          foo#[ID](can spill: true) consumed 0.0 B, peak 0.0 B.
         Error: Failed to allocate additional 150.0 B for foo with 0.0 B already allocated for this reservation - 70.0 B remain available for the total pool
         ");
     }
@@ -646,21 +714,21 @@ mod tests {
             // Baseline: see the 2 memory consumers
             let setting = make_settings();
             let _bound = setting.bind_to_scope();
-            let mut r0 = MemoryConsumer::new("r0").register(&pool);
+            let r0 = MemoryConsumer::new("r0").register(&pool);
             r0.grow(10);
             let r1_consumer = MemoryConsumer::new("r1");
-            let mut r1 = r1_consumer.register(&pool);
+            let r1 = r1_consumer.register(&pool);
             r1.grow(20);
 
             let res = r0.try_grow(150);
             assert!(res.is_err());
             let error = res.unwrap_err().strip_backtrace();
             allow_duplicates!(assert_snapshot!(error, @r"
-                Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-                  r1#[ID](can spill: false) consumed 20.0 B,
-                  r0#[ID](can spill: false) consumed 10.0 B.
-                Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 70.0 B remain available for the total pool
-                "));
+            Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as:
+              r1#[ID](can spill: false) consumed 20.0 B, peak 20.0 B,
+              r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B.
+            Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 70.0 B remain available for the total pool
+            "));
 
             // Test: unregister one
             // only the remaining one should be listed
@@ -669,10 +737,10 @@ mod tests {
             assert!(res.is_err());
             let error = res.unwrap_err().strip_backtrace();
             allow_duplicates!(assert_snapshot!(error, @r"
-                Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-                  r0#[ID](can spill: false) consumed 10.0 B.
-                Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
-                "));
+            Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as:
+              r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B.
+            Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
+            "));
 
             // Test: actual message we see is the `available is 70`. When it should be `available is 90`.
             // This is because the pool.shrink() does not automatically occur within the inner_pool.deregister().
@@ -680,10 +748,10 @@ mod tests {
             assert!(res.is_err());
             let error = res.unwrap_err().strip_backtrace();
             allow_duplicates!(assert_snapshot!(error, @r"
-                Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-                  r0#[ID](can spill: false) consumed 10.0 B.
-                Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
-                "));
+            Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as:
+              r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B.
+            Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
+            "));
 
             // Test: the registration needs to free itself (or be dropped),
             // for the proper error message
@@ -691,10 +759,10 @@ mod tests {
             assert!(res.is_err());
             let error = res.unwrap_err().strip_backtrace();
             allow_duplicates!(assert_snapshot!(error, @r"
-                Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:
-                  r0#[ID](can spill: false) consumed 10.0 B.
-                Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
-                "));
+            Resources exhausted: Additional allocation failed for r0 with top memory consumers (across reservations) as:
+              r0#[ID](can spill: false) consumed 10.0 B, peak 10.0 B.
+            Error: Failed to allocate additional 150.0 B for r0 with 10.0 B already allocated for this reservation - 90.0 B remain available for the total pool
+            "));
         }
 
         let tracked_spill_pool: Arc<dyn MemoryPool> = Arc::new(TrackConsumersPool::new(
@@ -723,13 +791,13 @@ mod tests {
             .downcast::<TrackConsumersPool<GreedyMemoryPool>>()
             .unwrap();
         // set r1=20
-        let mut r1 = MemoryConsumer::new("r1").register(&pool);
+        let r1 = MemoryConsumer::new("r1").register(&pool);
         r1.grow(20);
         // set r2=15
-        let mut r2 = MemoryConsumer::new("r2").register(&pool);
+        let r2 = MemoryConsumer::new("r2").register(&pool);
         r2.grow(15);
         // set r3=45
-        let mut r3 = MemoryConsumer::new("r3").register(&pool);
+        let r3 = MemoryConsumer::new("r3").register(&pool);
         r3.grow(45);
 
         let downcasted = upcasted
@@ -739,8 +807,8 @@ mod tests {
         // Test: can get runtime metrics, even without an error thrown
         let res = downcasted.report_top(2);
         assert_snapshot!(res, @r"
-        r3#[ID](can spill: false) consumed 45.0 B,
-        r1#[ID](can spill: false) consumed 20.0 B.
+        r3#[ID](can spill: false) consumed 45.0 B, peak 45.0 B,
+        r1#[ID](can spill: false) consumed 20.0 B, peak 20.0 B.
         ");
     }
 }
diff --git a/datafusion/execution/src/object_store.rs b/datafusion/execution/src/object_store.rs
index cd75c9f3c49ee..22ce1f0cf2bbf 100644
--- a/datafusion/execution/src/object_store.rs
+++ b/datafusion/execution/src/object_store.rs
@@ -20,10 +20,12 @@
 //! and query data inside these systems.
 
 use dashmap::DashMap;
-use datafusion_common::{exec_err, DataFusionError, Result};
+use datafusion_common::{
+    DataFusionError, Result, exec_err, internal_datafusion_err, not_impl_err,
+};
+use object_store::ObjectStore;
 #[cfg(not(target_arch = "wasm32"))]
 use object_store::local::LocalFileSystem;
-use object_store::ObjectStore;
 use std::sync::Arc;
 use url::Url;
 
@@ -154,6 +156,15 @@ pub trait ObjectStoreRegistry: Send + Sync + std::fmt::Debug + 'static {
         store: Arc<dyn ObjectStore>,
     ) -> Option<Arc<dyn ObjectStore>>;
 
+    /// Deregister the store previously registered with the same key. Returns the
+    /// deregistered store if it existed.
+    #[expect(unused_variables)]
+    fn deregister_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
+        not_impl_err!(
+            "ObjectStoreRegistry::deregister_store is not implemented for this ObjectStoreRegistry"
+        )
+    }
+
     /// Get a suitable store for the provided URL. For example:
     ///
     /// - URL with scheme `file:///` or no scheme will return the default LocalFS store
@@ -230,15 +241,24 @@ impl ObjectStoreRegistry for DefaultObjectStoreRegistry {
         self.object_stores.insert(s, store)
     }
 
+    fn deregister_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
+        let s = get_url_key(url);
+        let (_, object_store) = self.object_stores
+            .remove(&s)
+            .ok_or_else(|| {
+                internal_datafusion_err!("Failed to deregister object store. No suitable object store found for {url}. See `RuntimeEnv::register_object_store`")
+            })?;
+
+        Ok(object_store)
+    }
+
     fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
         let s = get_url_key(url);
         self.object_stores
             .get(&s)
             .map(|o| Arc::clone(o.value()))
             .ok_or_else(|| {
-                DataFusionError::Internal(format!(
-                    "No suitable object store found for {url}. See `RuntimeEnv::register_object_store`"
-                ))
+                internal_datafusion_err!("No suitable object store found for {url}. See `RuntimeEnv::register_object_store`")
             })
     }
 }
@@ -272,17 +292,29 @@ mod tests {
         assert_eq!(err.strip_backtrace(), "External error: invalid port number");
 
         let err = ObjectStoreUrl::parse("s3://bucket?").unwrap_err();
-        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?"
+        );
 
         let err = ObjectStoreUrl::parse("s3://bucket?foo=bar").unwrap_err();
-        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?foo=bar");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?foo=bar"
+        );
 
         let err = ObjectStoreUrl::parse("s3://host:123/foo").unwrap_err();
-        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo"
+        );
 
         let err =
             ObjectStoreUrl::parse("s3://username:password@host:123/foo").unwrap_err();
-        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo"
+        );
     }
 
     #[test]
diff --git a/datafusion/execution/src/parquet_encryption.rs b/datafusion/execution/src/parquet_encryption.rs
new file mode 100644
index 0000000000000..45eac10264e88
--- /dev/null
+++ b/datafusion/execution/src/parquet_encryption.rs
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::SchemaRef;
+use async_trait::async_trait;
+use dashmap::DashMap;
+use datafusion_common::config::EncryptionFactoryOptions;
+use datafusion_common::error::Result;
+use datafusion_common::internal_datafusion_err;
+use object_store::path::Path;
+use parquet::encryption::decrypt::FileDecryptionProperties;
+use parquet::encryption::encrypt::FileEncryptionProperties;
+use std::sync::Arc;
+
+/// Trait for types that generate file encryption and decryption properties to
+/// write and read encrypted Parquet files.
+/// This allows flexibility in how encryption keys are managed, for example, to
+/// integrate with a user's key management service (KMS).
+/// For example usage, see the [`parquet_encrypted_with_kms` example].
+///
+/// [`parquet_encrypted_with_kms` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/parquet_encrypted_with_kms.rs
+#[async_trait]
+pub trait EncryptionFactory: Send + Sync + std::fmt::Debug + 'static {
+    /// Generate file encryption properties to use when writing a Parquet file.
+    async fn get_file_encryption_properties(
+        &self,
+        config: &EncryptionFactoryOptions,
+        schema: &SchemaRef,
+        file_path: &Path,
+    ) -> Result<Option<Arc<FileEncryptionProperties>>>;
+
+    /// Generate file decryption properties to use when reading a Parquet file.
+    async fn get_file_decryption_properties(
+        &self,
+        config: &EncryptionFactoryOptions,
+        file_path: &Path,
+    ) -> Result<Option<Arc<FileDecryptionProperties>>>;
+}
+
+/// Stores [`EncryptionFactory`] implementations that can be retrieved by a unique string identifier
+#[derive(Clone, Debug, Default)]
+pub struct EncryptionFactoryRegistry {
+    factories: DashMap<String, Arc<dyn EncryptionFactory>>,
+}
+
+impl EncryptionFactoryRegistry {
+    /// Register an [`EncryptionFactory`] with an associated identifier that can be later
+    /// used to configure encryption when reading or writing Parquet.
+    /// If an encryption factory with the same identifier was already registered, it is replaced and returned.
+    pub fn register_factory(
+        &self,
+        id: &str,
+        factory: Arc<dyn EncryptionFactory>,
+    ) -> Option<Arc<dyn EncryptionFactory>> {
+        self.factories.insert(id.to_owned(), factory)
+    }
+
+    /// Retrieve an [`EncryptionFactory`] by its identifier
+    pub fn get_factory(&self, id: &str) -> Result<Arc<dyn EncryptionFactory>> {
+        self.factories
+            .get(id)
+            .map(|f| Arc::clone(f.value()))
+            .ok_or_else(|| {
+                internal_datafusion_err!(
+                    "No Parquet encryption factory found for id '{id}'"
+                )
+            })
+    }
+}
diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs
index b086430a4ef71..67604c424c766 100644
--- a/datafusion/execution/src/runtime_env.rs
+++ b/datafusion/execution/src/runtime_env.rs
@@ -18,8 +18,8 @@
 //! Execution [`RuntimeEnv`] environment that manages access to object
 //! store, memory manager, disk manager.
 
-#[allow(deprecated)]
-use crate::disk_manager::DiskManagerConfig;
+#[expect(deprecated)]
+use crate::disk_manager::{DiskManagerConfig, SpillingProgress};
 use crate::{
     disk_manager::{DiskManager, DiskManagerBuilder, DiskManagerMode},
     memory_pool::{
@@ -29,14 +29,16 @@ use crate::{
 };
 
 use crate::cache::cache_manager::{CacheManager, CacheManagerConfig};
-use datafusion_common::{config::ConfigEntry, Result};
+#[cfg(feature = "parquet_encryption")]
+use crate::parquet_encryption::{EncryptionFactory, EncryptionFactoryRegistry};
+use datafusion_common::{Result, config::ConfigEntry};
 use object_store::ObjectStore;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::{
     fmt::{Debug, Formatter},
     num::NonZeroUsize,
 };
+use std::{path::PathBuf, time::Duration};
 use url::Url;
 
 #[derive(Clone)]
@@ -65,9 +67,9 @@ use url::Url;
 /// // restrict to using at most 100MB of memory
 /// let pool_size = 100 * 1024 * 1024;
 /// let runtime_env = RuntimeEnvBuilder::new()
-///   .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size)))
-///   .build()
-///   .unwrap();
+///     .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size)))
+///     .build()
+///     .unwrap();
 /// ```
 pub struct RuntimeEnv {
     /// Runtime memory management
@@ -78,6 +80,9 @@ pub struct RuntimeEnv {
     pub cache_manager: Arc<CacheManager>,
     /// Object Store Registry
     pub object_store_registry: Arc<dyn ObjectStoreRegistry>,
+    /// Parquet encryption factory registry
+    #[cfg(feature = "parquet_encryption")]
+    pub parquet_encryption_factory_registry: Arc<EncryptionFactoryRegistry>,
 }
 
 impl Debug for RuntimeEnv {
@@ -86,19 +91,54 @@ impl Debug for RuntimeEnv {
     }
 }
 
-impl RuntimeEnv {
-    #[deprecated(since = "43.0.0", note = "please use `RuntimeEnvBuilder` instead")]
-    #[allow(deprecated)]
-    pub fn new(config: RuntimeConfig) -> Result<Self> {
-        Self::try_new(config)
-    }
-    /// Create env based on configuration
-    #[deprecated(since = "44.0.0", note = "please use `RuntimeEnvBuilder` instead")]
-    #[allow(deprecated)]
-    pub fn try_new(config: RuntimeConfig) -> Result<Self> {
-        config.build()
-    }
+/// Creates runtime configuration entries with the provided values
+///
+/// This helper function defines the structure and metadata for all runtime configuration
+/// entries to avoid duplication between `RuntimeEnv::config_entries()` and
+/// `RuntimeEnvBuilder::entries()`.
+fn create_runtime_config_entries(
+    memory_limit: Option<String>,
+    max_temp_directory_size: Option<String>,
+    temp_directory: Option<String>,
+    metadata_cache_limit: Option<String>,
+    list_files_cache_limit: Option<String>,
+    list_files_cache_ttl: Option<String>,
+) -> Vec<ConfigEntry> {
+    vec![
+        ConfigEntry {
+            key: "datafusion.runtime.memory_limit".to_string(),
+            value: memory_limit,
+            description: "Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.",
+        },
+        ConfigEntry {
+            key: "datafusion.runtime.max_temp_directory_size".to_string(),
+            value: max_temp_directory_size,
+            description: "Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.",
+        },
+        ConfigEntry {
+            key: "datafusion.runtime.temp_directory".to_string(),
+            value: temp_directory,
+            description: "The path to the temporary file directory.",
+        },
+        ConfigEntry {
+            key: "datafusion.runtime.metadata_cache_limit".to_string(),
+            value: metadata_cache_limit,
+            description: "Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.",
+        },
+        ConfigEntry {
+            key: "datafusion.runtime.list_files_cache_limit".to_string(),
+            value: list_files_cache_limit,
+            description: "Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.",
+        },
+        ConfigEntry {
+            key: "datafusion.runtime.list_files_cache_ttl".to_string(),
+            value: list_files_cache_ttl,
+            description: "TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes.",
+        },
+    ]
+}
 
+impl RuntimeEnv {
     /// Registers a custom `ObjectStore` to be used with a specific url.
     /// This allows DataFusion to create external tables from urls that do not have
     /// built in support such as `hdfs://namenode:port/...`.
@@ -121,8 +161,6 @@ impl RuntimeEnv {
     /// ```
     ///
     /// # Example: Register remote URL object store like [Github](https://github.com)
-    ///
-    ///
     /// ```
     /// # use std::sync::Arc;
     /// # use url::Url;
@@ -148,12 +186,125 @@ impl RuntimeEnv {
         self.object_store_registry.register_store(url, object_store)
     }
 
+    /// Deregisters a custom `ObjectStore` previously registered for a specific url.
+    /// See [`ObjectStoreRegistry::deregister_store`] for more details.
+    pub fn deregister_object_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
+        self.object_store_registry.deregister_store(url)
+    }
+
     /// Retrieves a `ObjectStore` instance for a url by consulting the
     /// registry. See [`ObjectStoreRegistry::get_store`] for more
     /// details.
     pub fn object_store(&self, url: impl AsRef<Url>) -> Result<Arc<dyn ObjectStore>> {
         self.object_store_registry.get_store(url.as_ref())
     }
+
+    /// Returns the current spilling progress
+    pub fn spilling_progress(&self) -> SpillingProgress {
+        self.disk_manager.spilling_progress()
+    }
+
+    /// Register an [`EncryptionFactory`] with an associated identifier that can be later
+    /// used to configure encryption when reading or writing Parquet.
+    /// If an encryption factory with the same identifier was already registered, it is replaced and returned.
+    #[cfg(feature = "parquet_encryption")]
+    pub fn register_parquet_encryption_factory(
+        &self,
+        id: &str,
+        encryption_factory: Arc<dyn EncryptionFactory>,
+    ) -> Option<Arc<dyn EncryptionFactory>> {
+        self.parquet_encryption_factory_registry
+            .register_factory(id, encryption_factory)
+    }
+
+    /// Retrieve an [`EncryptionFactory`] by its identifier
+    #[cfg(feature = "parquet_encryption")]
+    pub fn parquet_encryption_factory(
+        &self,
+        id: &str,
+    ) -> Result<Arc<dyn EncryptionFactory>> {
+        self.parquet_encryption_factory_registry.get_factory(id)
+    }
+
+    /// Returns the current runtime configuration entries
+    pub fn config_entries(&self) -> Vec<ConfigEntry> {
+        use crate::memory_pool::MemoryLimit;
+
+        /// Convert bytes to a human-readable format
+        fn format_byte_size(size: u64) -> String {
+            const GB: u64 = 1024 * 1024 * 1024;
+            const MB: u64 = 1024 * 1024;
+            const KB: u64 = 1024;
+
+            match size {
+                s if s >= GB => format!("{}G", s / GB),
+                s if s >= MB => format!("{}M", s / MB),
+                s if s >= KB => format!("{}K", s / KB),
+                s => format!("{s}"),
+            }
+        }
+
+        fn format_duration(duration: Duration) -> String {
+            let total = duration.as_secs();
+            let mins = total / 60;
+            let secs = total % 60;
+
+            format!("{mins}m{secs}s")
+        }
+
+        let memory_limit_value = match self.memory_pool.memory_limit() {
+            MemoryLimit::Finite(size) => Some(format_byte_size(
+                size.try_into()
+                    .expect("Memory limit size conversion failed"),
+            )),
+            MemoryLimit::Infinite => Some("unlimited".to_string()),
+            MemoryLimit::Unknown => None,
+        };
+
+        let max_temp_dir_size = self.disk_manager.max_temp_directory_size();
+        let max_temp_dir_value = format_byte_size(max_temp_dir_size);
+
+        let temp_paths = self.disk_manager.temp_dir_paths();
+        let temp_dir_value = if temp_paths.is_empty() {
+            None
+        } else {
+            Some(
+                temp_paths
+                    .iter()
+                    .map(|p| p.display().to_string())
+                    .collect::<Vec<_>>()
+                    .join(","),
+            )
+        };
+
+        let metadata_cache_limit = self.cache_manager.get_metadata_cache_limit();
+        let metadata_cache_value = format_byte_size(
+            metadata_cache_limit
+                .try_into()
+                .expect("Metadata cache size conversion failed"),
+        );
+
+        let list_files_cache_limit = self.cache_manager.get_list_files_cache_limit();
+        let list_files_cache_value = format_byte_size(
+            list_files_cache_limit
+                .try_into()
+                .expect("List files cache size conversion failed"),
+        );
+
+        let list_files_cache_ttl = self
+            .cache_manager
+            .get_list_files_cache_ttl()
+            .map(format_duration);
+
+        create_runtime_config_entries(
+            memory_limit_value,
+            Some(max_temp_dir_value),
+            temp_dir_value,
+            Some(metadata_cache_value),
+            Some(list_files_cache_value),
+            list_files_cache_ttl,
+        )
+    }
 }
 
 impl Default for RuntimeEnv {
@@ -162,17 +313,12 @@ impl Default for RuntimeEnv {
     }
 }
 
-/// Please see: <https://github.com/apache/datafusion/issues/12156>
-/// This a type alias for backwards compatibility.
-#[deprecated(since = "43.0.0", note = "please use `RuntimeEnvBuilder` instead")]
-pub type RuntimeConfig = RuntimeEnvBuilder;
-
-#[derive(Clone)]
 /// Execution runtime configuration builder.
 ///
 /// See example on [`RuntimeEnv`]
+#[derive(Clone)]
 pub struct RuntimeEnvBuilder {
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     /// DiskManager to manage temporary disk file usage
     pub disk_manager: DiskManagerConfig,
     /// DiskManager builder to manager temporary disk file usage
@@ -185,6 +331,9 @@ pub struct RuntimeEnvBuilder {
     pub cache_manager: CacheManagerConfig,
     /// ObjectStoreRegistry to get object store based on url
     pub object_store_registry: Arc<dyn ObjectStoreRegistry>,
+    /// Parquet encryption factory registry
+    #[cfg(feature = "parquet_encryption")]
+    pub parquet_encryption_factory_registry: Arc<EncryptionFactoryRegistry>,
 }
 
 impl Default for RuntimeEnvBuilder {
@@ -202,10 +351,12 @@ impl RuntimeEnvBuilder {
             memory_pool: Default::default(),
             cache_manager: Default::default(),
             object_store_registry: Arc::new(DefaultObjectStoreRegistry::default()),
+            #[cfg(feature = "parquet_encryption")]
+            parquet_encryption_factory_registry: Default::default(),
         }
     }
 
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     #[deprecated(since = "48.0.0", note = "Use with_disk_manager_builder instead")]
     /// Customize disk manager
     pub fn with_disk_manager(mut self, disk_manager: DiskManagerConfig) -> Self {
@@ -243,7 +394,8 @@ impl RuntimeEnvBuilder {
     /// Specify the total memory to use while running the DataFusion
     /// plan to `max_memory * memory_fraction` in bytes.
     ///
-    /// This defaults to using [`GreedyMemoryPool`]
+    /// This defaults to using [`GreedyMemoryPool`] wrapped in the
+    /// [`TrackConsumersPool`] with a maximum of 5 consumers.
     ///
     /// Note DataFusion does not yet respect this limit in all cases.
     pub fn with_memory_limit(self, max_memory: usize, memory_fraction: f64) -> Self {
@@ -255,13 +407,37 @@ impl RuntimeEnvBuilder {
     }
 
     /// Use the specified path to create any needed temporary files
-    pub fn with_temp_file_path(self, path: impl Into<PathBuf>) -> Self {
+    pub fn with_temp_file_path(mut self, path: impl Into<PathBuf>) -> Self {
+        let builder = self.disk_manager_builder.take().unwrap_or_default();
         self.with_disk_manager_builder(
-            DiskManagerBuilder::default()
-                .with_mode(DiskManagerMode::Directories(vec![path.into()])),
+            builder.with_mode(DiskManagerMode::Directories(vec![path.into()])),
         )
     }
 
+    /// Specify a limit on the size of the temporary file directory in bytes
+    pub fn with_max_temp_directory_size(mut self, size: u64) -> Self {
+        let builder = self.disk_manager_builder.take().unwrap_or_default();
+        self.with_disk_manager_builder(builder.with_max_temp_directory_size(size))
+    }
+
+    /// Specify the limit of the file-embedded metadata cache, in bytes.
+    pub fn with_metadata_cache_limit(mut self, limit: usize) -> Self {
+        self.cache_manager = self.cache_manager.with_metadata_cache_limit(limit);
+        self
+    }
+
+    /// Specifies the memory limit for the object list cache, in bytes.
+    pub fn with_object_list_cache_limit(mut self, limit: usize) -> Self {
+        self.cache_manager = self.cache_manager.with_list_files_cache_limit(limit);
+        self
+    }
+
+    /// Specifies the duration entries in the object list cache will be considered valid.
+    pub fn with_object_list_cache_ttl(mut self, ttl: Option<Duration>) -> Self {
+        self.cache_manager = self.cache_manager.with_list_files_cache_ttl(ttl);
+        self
+    }
+
     /// Build a RuntimeEnv
     pub fn build(self) -> Result<RuntimeEnv> {
         let Self {
@@ -270,6 +446,8 @@ impl RuntimeEnvBuilder {
             memory_pool,
             cache_manager,
             object_store_registry,
+            #[cfg(feature = "parquet_encryption")]
+            parquet_encryption_factory_registry,
         } = self;
         let memory_pool =
             memory_pool.unwrap_or_else(|| Arc::new(UnboundedMemoryPool::default()));
@@ -279,11 +457,13 @@ impl RuntimeEnvBuilder {
             disk_manager: if let Some(builder) = disk_manager_builder {
                 Arc::new(builder.build()?)
             } else {
-                #[allow(deprecated)]
+                #[expect(deprecated)]
                 DiskManager::try_new(disk_manager)?
             },
             cache_manager: CacheManager::try_new(&cache_manager)?,
             object_store_registry,
+            #[cfg(feature = "parquet_encryption")]
+            parquet_encryption_factory_registry,
         })
     }
 
@@ -299,10 +479,18 @@ impl RuntimeEnvBuilder {
                 .cache_manager
                 .get_file_statistic_cache(),
             list_files_cache: runtime_env.cache_manager.get_list_files_cache(),
+            list_files_cache_limit: runtime_env
+                .cache_manager
+                .get_list_files_cache_limit(),
+            list_files_cache_ttl: runtime_env.cache_manager.get_list_files_cache_ttl(),
+            file_metadata_cache: Some(
+                runtime_env.cache_manager.get_file_metadata_cache(),
+            ),
+            metadata_cache_limit: runtime_env.cache_manager.get_metadata_cache_limit(),
         };
 
         Self {
-            #[allow(deprecated)]
+            #[expect(deprecated)]
             disk_manager: DiskManagerConfig::Existing(Arc::clone(
                 &runtime_env.disk_manager,
             )),
@@ -310,17 +498,23 @@ impl RuntimeEnvBuilder {
             memory_pool: Some(Arc::clone(&runtime_env.memory_pool)),
             cache_manager: cache_config,
             object_store_registry: Arc::clone(&runtime_env.object_store_registry),
+            #[cfg(feature = "parquet_encryption")]
+            parquet_encryption_factory_registry: Arc::clone(
+                &runtime_env.parquet_encryption_factory_registry,
+            ),
         }
     }
 
     /// Returns a list of all available runtime configurations with their current values and descriptions
     pub fn entries(&self) -> Vec<ConfigEntry> {
-        // Memory pool configuration
-        vec![ConfigEntry {
-            key: "datafusion.runtime.memory_limit".to_string(),
-            value: None, // Default is system-dependent
-            description: "Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.",
-        }]
+        create_runtime_config_entries(
+            None,
+            Some("100G".to_string()),
+            None,
+            Some("50M".to_owned()),
+            Some("1M".to_owned()),
+            None,
+        )
     }
 
     /// Generate documentation that can be included in the user guide
diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs
index b11596c4a30f4..38f31cf4629eb 100644
--- a/datafusion/execution/src/task.rs
+++ b/datafusion/execution/src/task.rs
@@ -19,7 +19,7 @@ use crate::{
     config::SessionConfig, memory_pool::MemoryPool, registry::FunctionRegistry,
     runtime_env::RuntimeEnv,
 };
-use datafusion_common::{plan_datafusion_err, DataFusionError, Result};
+use datafusion_common::{Result, internal_datafusion_err, plan_datafusion_err};
 use datafusion_expr::planner::ExprPlanner;
 use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF};
 use std::collections::HashSet;
@@ -168,9 +168,9 @@ impl FunctionRegistry for TaskContext {
         let result = self.window_functions.get(name);
 
         result.cloned().ok_or_else(|| {
-            DataFusionError::Internal(format!(
+            internal_datafusion_err!(
                 "There is no UDWF named \"{name}\" in the TaskContext"
-            ))
+            )
         })
     }
     fn register_udaf(
@@ -201,6 +201,19 @@ impl FunctionRegistry for TaskContext {
     fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> {
         vec![]
     }
+
+    fn udafs(&self) -> HashSet<String> {
+        self.aggregate_functions.keys().cloned().collect()
+    }
+
+    fn udwfs(&self) -> HashSet<String> {
+        self.window_functions.keys().cloned().collect()
+    }
+}
+
+/// Produce the [`TaskContext`].
+pub trait TaskContextProvider {
+    fn task_ctx(&self) -> Arc<TaskContext>;
 }
 
 #[cfg(test)]
diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml
index 14717dd78135d..072c8f14da503 100644
--- a/datafusion/expr-common/Cargo.toml
+++ b/datafusion/expr-common/Cargo.toml
@@ -19,6 +19,7 @@
 name = "datafusion-expr-common"
 description = "Logical plan and expression representation for DataFusion query engine"
 keywords = ["datafusion", "logical", "plan", "expressions"]
+readme = "README.md"
 version = { workspace = true }
 edition = { workspace = true }
 homepage = { workspace = true }
@@ -30,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -41,4 +45,6 @@ arrow = { workspace = true }
 datafusion-common = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true }
-paste = "^1.0"
+
+[dev-dependencies]
+insta = { workspace = true }
diff --git a/datafusion/expr-common/README.md b/datafusion/expr-common/README.md
new file mode 100644
index 0000000000000..97006702542a0
--- /dev/null
+++ b/datafusion/expr-common/README.md
@@ -0,0 +1,32 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Common Logical Plan and Expressions
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that provides common logical expressions
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/expr-common/src/accumulator.rs b/datafusion/expr-common/src/accumulator.rs
index 2829a9416f033..59fb6a595206a 100644
--- a/datafusion/expr-common/src/accumulator.rs
+++ b/datafusion/expr-common/src/accumulator.rs
@@ -18,7 +18,7 @@
 //! Accumulator module contains the trait definition for aggregation function's accumulators.
 
 use arrow::array::ArrayRef;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, internal_err};
 use std::fmt::Debug;
 
 /// Tracks an aggregate function's state.
@@ -48,7 +48,7 @@ use std::fmt::Debug;
 /// [`evaluate`]: Self::evaluate
 /// [`merge_batch`]: Self::merge_batch
 /// [window function]: https://en.wikipedia.org/wiki/Window_function_(SQL)
-pub trait Accumulator: Send + Sync + Debug {
+pub trait Accumulator: Send + Sync + Debug + std::any::Any {
     /// Updates the accumulator's state from its input.
     ///
     /// `values` contains the arguments to this aggregate function.
@@ -58,17 +58,30 @@ pub trait Accumulator: Send + Sync + Debug {
     /// running sum.
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()>;
 
-    /// Returns the final aggregate value, consuming the internal state.
+    /// Returns the final aggregate value.
     ///
     /// For example, the `SUM` accumulator maintains a running sum,
     /// and `evaluate` will produce that running sum as its output.
     ///
-    /// This function should not be called twice, otherwise it will
-    /// result in potentially non-deterministic behavior.
-    ///
     /// This function gets `&mut self` to allow for the accumulator to build
     /// arrow-compatible internal state that can be returned without copying
-    /// when possible (for example distinct strings)
+    /// when possible (for example distinct strings).
+    ///
+    /// ## Correctness
+    ///
+    /// This function must not consume the internal state, as it is also used in window
+    /// aggregate functions where it can be executed multiple times depending on the
+    /// current window frame. Consuming the internal state can cause the next invocation
+    /// to have incorrect results.
+    ///
+    /// - Even if this accumulator doesn't implement [`retract_batch`] it may still be used
+    ///   in window aggregate functions where the window frame is
+    ///   `ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`
+    ///
+    /// It is fine to modify the state (e.g. re-order elements within internal state vec) so long
+    /// as this doesn't cause an incorrect computation on the next call of evaluate.
+    ///
+    /// [`retract_batch`]: Self::retract_batch
     fn evaluate(&mut self) -> Result<ScalarValue>;
 
     /// Returns the allocated size required for this accumulator, in
diff --git a/datafusion/expr-common/src/casts.rs b/datafusion/expr-common/src/casts.rs
new file mode 100644
index 0000000000000..dc0bd74b1f286
--- /dev/null
+++ b/datafusion/expr-common/src/casts.rs
@@ -0,0 +1,1292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for casting scalar literals to different data types
+//!
+//! This module contains functions for casting ScalarValue literals
+//! to different data types, originally extracted from the optimizer's
+//! unwrap_cast module to be shared between logical and physical layers.
+
+use std::cmp::Ordering;
+
+use arrow::datatypes::{
+    DataType, MAX_DECIMAL32_FOR_EACH_PRECISION, MAX_DECIMAL64_FOR_EACH_PRECISION,
+    MAX_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL32_FOR_EACH_PRECISION,
+    MIN_DECIMAL64_FOR_EACH_PRECISION, MIN_DECIMAL128_FOR_EACH_PRECISION, TimeUnit,
+};
+use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS};
+use datafusion_common::ScalarValue;
+
+/// Convert a literal value from one data type to another
+pub fn try_cast_literal_to_type(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    let lit_data_type = lit_value.data_type();
+    if !is_supported_type(&lit_data_type) || !is_supported_type(target_type) {
+        return None;
+    }
+    if lit_value.is_null() {
+        // null value can be cast to any type of null value
+        return ScalarValue::try_from(target_type).ok();
+    }
+    try_cast_numeric_literal(lit_value, target_type)
+        .or_else(|| try_cast_string_literal(lit_value, target_type))
+        .or_else(|| try_cast_dictionary(lit_value, target_type))
+        .or_else(|| try_cast_binary(lit_value, target_type))
+}
+
+/// Returns true if unwrap_cast_in_comparison supports this data type
+pub fn is_supported_type(data_type: &DataType) -> bool {
+    is_supported_numeric_type(data_type)
+        || is_supported_string_type(data_type)
+        || is_supported_dictionary_type(data_type)
+        || is_supported_binary_type(data_type)
+}
+
+/// Returns true if unwrap_cast_in_comparison support this numeric type
+fn is_supported_numeric_type(data_type: &DataType) -> bool {
+    matches!(
+        data_type,
+        DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::Decimal32(_, _)
+            | DataType::Decimal64(_, _)
+            | DataType::Decimal128(_, _)
+            | DataType::Timestamp(_, _)
+    )
+}
+
+/// Returns true if unwrap_cast_in_comparison supports casting this value as a string
+fn is_supported_string_type(data_type: &DataType) -> bool {
+    matches!(
+        data_type,
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
+    )
+}
+
+/// Returns true if unwrap_cast_in_comparison supports casting this value as a dictionary
+fn is_supported_dictionary_type(data_type: &DataType) -> bool {
+    matches!(data_type,
+                    DataType::Dictionary(_, inner) if is_supported_type(inner))
+}
+
+fn is_supported_binary_type(data_type: &DataType) -> bool {
+    matches!(data_type, DataType::Binary | DataType::FixedSizeBinary(_))
+}
+
+/// Convert a numeric value from one numeric data type to another
+fn try_cast_numeric_literal(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    let lit_data_type = lit_value.data_type();
+    if !is_supported_numeric_type(&lit_data_type)
+        || !is_supported_numeric_type(target_type)
+    {
+        return None;
+    }
+
+    let mul = match target_type {
+        DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64 => 1_i128,
+        DataType::Timestamp(_, _) => 1_i128,
+        DataType::Decimal32(_, scale) => 10_i128.pow(*scale as u32),
+        DataType::Decimal64(_, scale) => 10_i128.pow(*scale as u32),
+        DataType::Decimal128(_, scale) => 10_i128.pow(*scale as u32),
+        _ => return None,
+    };
+    let (target_min, target_max) = match target_type {
+        DataType::UInt8 => (u8::MIN as i128, u8::MAX as i128),
+        DataType::UInt16 => (u16::MIN as i128, u16::MAX as i128),
+        DataType::UInt32 => (u32::MIN as i128, u32::MAX as i128),
+        DataType::UInt64 => (u64::MIN as i128, u64::MAX as i128),
+        DataType::Int8 => (i8::MIN as i128, i8::MAX as i128),
+        DataType::Int16 => (i16::MIN as i128, i16::MAX as i128),
+        DataType::Int32 => (i32::MIN as i128, i32::MAX as i128),
+        DataType::Int64 => (i64::MIN as i128, i64::MAX as i128),
+        DataType::Timestamp(_, _) => (i64::MIN as i128, i64::MAX as i128),
+        DataType::Decimal32(precision, _) => (
+            // Different precision for decimal32 can store different range of value.
+            // For example, the precision is 3, the max of value is `999` and the min
+            // value is `-999`
+            MIN_DECIMAL32_FOR_EACH_PRECISION[*precision as usize] as i128,
+            MAX_DECIMAL32_FOR_EACH_PRECISION[*precision as usize] as i128,
+        ),
+        DataType::Decimal64(precision, _) => (
+            // Different precision for decimal64 can store different range of value.
+            // For example, the precision is 3, the max of value is `999` and the min
+            // value is `-999`
+            MIN_DECIMAL64_FOR_EACH_PRECISION[*precision as usize] as i128,
+            MAX_DECIMAL64_FOR_EACH_PRECISION[*precision as usize] as i128,
+        ),
+        DataType::Decimal128(precision, _) => (
+            // Different precision for decimal128 can store different range of value.
+            // For example, the precision is 3, the max of value is `999` and the min
+            // value is `-999`
+            MIN_DECIMAL128_FOR_EACH_PRECISION[*precision as usize],
+            MAX_DECIMAL128_FOR_EACH_PRECISION[*precision as usize],
+        ),
+        _ => return None,
+    };
+    let lit_value_target_type = match lit_value {
+        ScalarValue::Int8(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::Int16(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::Int32(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::Int64(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::UInt8(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::UInt16(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::UInt32(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::UInt64(Some(v)) => (*v as i128).checked_mul(mul),
+        ScalarValue::TimestampSecond(Some(v), _) => (*v as i128).checked_mul(mul),
+        ScalarValue::TimestampMillisecond(Some(v), _) => (*v as i128).checked_mul(mul),
+        ScalarValue::TimestampMicrosecond(Some(v), _) => (*v as i128).checked_mul(mul),
+        ScalarValue::TimestampNanosecond(Some(v), _) => (*v as i128).checked_mul(mul),
+        ScalarValue::Decimal32(Some(v), _, scale) => {
+            let v = *v as i128;
+            let lit_scale_mul = 10_i128.pow(*scale as u32);
+            if mul >= lit_scale_mul {
+                // Example:
+                // lit is decimal(123,3,2)
+                // target type is decimal(5,3)
+                // the lit can be converted to the decimal(1230,5,3)
+                v.checked_mul(mul / lit_scale_mul)
+            } else if v % (lit_scale_mul / mul) == 0 {
+                // Example:
+                // lit is decimal(123000,10,3)
+                // target type is int32: the lit can be converted to INT32(123)
+                // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2)
+                Some(v / (lit_scale_mul / mul))
+            } else {
+                // can't convert the lit decimal to the target data type
+                None
+            }
+        }
+        ScalarValue::Decimal64(Some(v), _, scale) => {
+            let v = *v as i128;
+            let lit_scale_mul = 10_i128.pow(*scale as u32);
+            if mul >= lit_scale_mul {
+                // Example:
+                // lit is decimal(123,3,2)
+                // target type is decimal(5,3)
+                // the lit can be converted to the decimal(1230,5,3)
+                v.checked_mul(mul / lit_scale_mul)
+            } else if v % (lit_scale_mul / mul) == 0 {
+                // Example:
+                // lit is decimal(123000,10,3)
+                // target type is int32: the lit can be converted to INT32(123)
+                // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2)
+                Some(v / (lit_scale_mul / mul))
+            } else {
+                // can't convert the lit decimal to the target data type
+                None
+            }
+        }
+        ScalarValue::Decimal128(Some(v), _, scale) => {
+            let lit_scale_mul = 10_i128.pow(*scale as u32);
+            if mul >= lit_scale_mul {
+                // Example:
+                // lit is decimal(123,3,2)
+                // target type is decimal(5,3)
+                // the lit can be converted to the decimal(1230,5,3)
+                (*v).checked_mul(mul / lit_scale_mul)
+            } else if (*v) % (lit_scale_mul / mul) == 0 {
+                // Example:
+                // lit is decimal(123000,10,3)
+                // target type is int32: the lit can be converted to INT32(123)
+                // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2)
+                Some(*v / (lit_scale_mul / mul))
+            } else {
+                // can't convert the lit decimal to the target data type
+                None
+            }
+        }
+        _ => None,
+    };
+
+    match lit_value_target_type {
+        None => None,
+        Some(value) => {
+            if value >= target_min && value <= target_max {
+                // the value casted from lit to the target type is in the range of target type.
+                // return the target type of scalar value
+                let result_scalar = match target_type {
+                    DataType::Int8 => ScalarValue::Int8(Some(value as i8)),
+                    DataType::Int16 => ScalarValue::Int16(Some(value as i16)),
+                    DataType::Int32 => ScalarValue::Int32(Some(value as i32)),
+                    DataType::Int64 => ScalarValue::Int64(Some(value as i64)),
+                    DataType::UInt8 => ScalarValue::UInt8(Some(value as u8)),
+                    DataType::UInt16 => ScalarValue::UInt16(Some(value as u16)),
+                    DataType::UInt32 => ScalarValue::UInt32(Some(value as u32)),
+                    DataType::UInt64 => ScalarValue::UInt64(Some(value as u64)),
+                    DataType::Timestamp(TimeUnit::Second, tz) => {
+                        let value = cast_between_timestamp(
+                            &lit_data_type,
+                            &DataType::Timestamp(TimeUnit::Second, tz.clone()),
+                            value,
+                        );
+                        ScalarValue::TimestampSecond(value, tz.clone())
+                    }
+                    DataType::Timestamp(TimeUnit::Millisecond, tz) => {
+                        let value = cast_between_timestamp(
+                            &lit_data_type,
+                            &DataType::Timestamp(TimeUnit::Millisecond, tz.clone()),
+                            value,
+                        );
+                        ScalarValue::TimestampMillisecond(value, tz.clone())
+                    }
+                    DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+                        let value = cast_between_timestamp(
+                            &lit_data_type,
+                            &DataType::Timestamp(TimeUnit::Microsecond, tz.clone()),
+                            value,
+                        );
+                        ScalarValue::TimestampMicrosecond(value, tz.clone())
+                    }
+                    DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
+                        let value = cast_between_timestamp(
+                            &lit_data_type,
+                            &DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()),
+                            value,
+                        );
+                        ScalarValue::TimestampNanosecond(value, tz.clone())
+                    }
+                    DataType::Decimal32(p, s) => {
+                        ScalarValue::Decimal32(Some(value as i32), *p, *s)
+                    }
+                    DataType::Decimal64(p, s) => {
+                        ScalarValue::Decimal64(Some(value as i64), *p, *s)
+                    }
+                    DataType::Decimal128(p, s) => {
+                        ScalarValue::Decimal128(Some(value), *p, *s)
+                    }
+                    _ => {
+                        return None;
+                    }
+                };
+                Some(result_scalar)
+            } else {
+                None
+            }
+        }
+    }
+}
+
+fn try_cast_string_literal(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    let string_value = lit_value.try_as_str()?.map(|s| s.to_string());
+    let scalar_value = match target_type {
+        DataType::Utf8 => ScalarValue::Utf8(string_value),
+        DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value),
+        DataType::Utf8View => ScalarValue::Utf8View(string_value),
+        _ => return None,
+    };
+    Some(scalar_value)
+}
+
+/// Attempt to cast to/from a dictionary type by wrapping/unwrapping the dictionary
+fn try_cast_dictionary(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    let lit_value_type = lit_value.data_type();
+    let result_scalar = match (lit_value, target_type) {
+        // Unwrap dictionary when inner type matches target type
+        (ScalarValue::Dictionary(_, inner_value), _)
+            if inner_value.data_type() == *target_type =>
+        {
+            (**inner_value).clone()
+        }
+        // Wrap type when target type is dictionary
+        (_, DataType::Dictionary(index_type, inner_type))
+            if **inner_type == lit_value_type =>
+        {
+            ScalarValue::Dictionary(index_type.clone(), Box::new(lit_value.clone()))
+        }
+        _ => {
+            return None;
+        }
+    };
+    Some(result_scalar)
+}
+
+/// Cast a timestamp value from one unit to another
+fn cast_between_timestamp(from: &DataType, to: &DataType, value: i128) -> Option<i64> {
+    let value = value as i64;
+    let from_scale = match from {
+        DataType::Timestamp(TimeUnit::Second, _) => 1,
+        DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS,
+        DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS,
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS,
+        _ => return Some(value),
+    };
+
+    let to_scale = match to {
+        DataType::Timestamp(TimeUnit::Second, _) => 1,
+        DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS,
+        DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS,
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS,
+        _ => return Some(value),
+    };
+
+    match from_scale.cmp(&to_scale) {
+        Ordering::Less => value.checked_mul(to_scale / from_scale),
+        Ordering::Greater => Some(value / (from_scale / to_scale)),
+        Ordering::Equal => Some(value),
+    }
+}
+
+fn try_cast_binary(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    match (lit_value, target_type) {
+        (ScalarValue::Binary(Some(v)), DataType::FixedSizeBinary(n))
+            if v.len() == *n as usize =>
+        {
+            Some(ScalarValue::FixedSizeBinary(*n, Some(v.clone())))
+        }
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::compute::{CastOptions, cast_with_options};
+    use arrow::datatypes::{Field, Fields, TimeUnit};
+    use std::sync::Arc;
+
+    #[derive(Debug, Clone)]
+    enum ExpectedCast {
+        /// test successfully cast value and it is as specified
+        Value(ScalarValue),
+        /// test returned OK, but could not cast the value
+        NoValue,
+    }
+
+    /// Runs try_cast_literal_to_type with the specified inputs and
+    /// ensure it computes the expected output, and ensures the
+    /// casting is consistent with the Arrow kernels
+    fn expect_cast(
+        literal: ScalarValue,
+        target_type: DataType,
+        expected_result: ExpectedCast,
+    ) {
+        let actual_value = try_cast_literal_to_type(&literal, &target_type);
+
+        println!("expect_cast: ");
+        println!("  {literal:?} --> {target_type}");
+        println!("  expected_result: {expected_result:?}");
+        println!("  actual_result:   {actual_value:?}");
+
+        match expected_result {
+            ExpectedCast::Value(expected_value) => {
+                let actual_value =
+                    actual_value.expect("Expected cast value but got None");
+
+                assert_eq!(actual_value, expected_value);
+
+                // Verify that calling the arrow
+                // cast kernel yields the same results
+                // input array
+                let literal_array = literal
+                    .to_array_of_size(1)
+                    .expect("Failed to convert to array of size");
+                let expected_array = expected_value
+                    .to_array_of_size(1)
+                    .expect("Failed to convert to array of size");
+                let cast_array = cast_with_options(
+                    &literal_array,
+                    &target_type,
+                    &CastOptions::default(),
+                )
+                .expect("Expected to be cast array with arrow cast kernel");
+
+                assert_eq!(
+                    &expected_array, &cast_array,
+                    "Result of casting {literal:?} with arrow was\n {cast_array:#?}\nbut expected\n{expected_array:#?}"
+                );
+
+                // Verify that for timestamp types the timezones are the same
+                // (ScalarValue::cmp doesn't account for timezones);
+                if let (
+                    DataType::Timestamp(left_unit, left_tz),
+                    DataType::Timestamp(right_unit, right_tz),
+                ) = (actual_value.data_type(), expected_value.data_type())
+                {
+                    assert_eq!(left_unit, right_unit);
+                    assert_eq!(left_tz, right_tz);
+                }
+            }
+            ExpectedCast::NoValue => {
+                assert!(
+                    actual_value.is_none(),
+                    "Expected no cast value, but got {actual_value:?}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_try_cast_to_type_nulls() {
+        // test that nulls can be cast to/from all integer types
+        let scalars = vec![
+            ScalarValue::Int8(None),
+            ScalarValue::Int16(None),
+            ScalarValue::Int32(None),
+            ScalarValue::Int64(None),
+            ScalarValue::UInt8(None),
+            ScalarValue::UInt16(None),
+            ScalarValue::UInt32(None),
+            ScalarValue::UInt64(None),
+            ScalarValue::Decimal128(None, 3, 0),
+            ScalarValue::Decimal128(None, 8, 2),
+            ScalarValue::Utf8(None),
+            ScalarValue::LargeUtf8(None),
+        ];
+
+        for s1 in &scalars {
+            for s2 in &scalars {
+                let expected_value = ExpectedCast::Value(s2.clone());
+
+                expect_cast(s1.clone(), s2.data_type(), expected_value);
+            }
+        }
+    }
+
+    #[test]
+    fn test_try_cast_to_type_int_in_range() {
+        // test values that can be cast to/from all integer types
+        let scalars = vec![
+            ScalarValue::Int8(Some(123)),
+            ScalarValue::Int16(Some(123)),
+            ScalarValue::Int32(Some(123)),
+            ScalarValue::Int64(Some(123)),
+            ScalarValue::UInt8(Some(123)),
+            ScalarValue::UInt16(Some(123)),
+            ScalarValue::UInt32(Some(123)),
+            ScalarValue::UInt64(Some(123)),
+            ScalarValue::Decimal128(Some(123), 3, 0),
+            ScalarValue::Decimal128(Some(12300), 8, 2),
+        ];
+
+        for s1 in &scalars {
+            for s2 in &scalars {
+                let expected_value = ExpectedCast::Value(s2.clone());
+
+                expect_cast(s1.clone(), s2.data_type(), expected_value);
+            }
+        }
+
+        let max_i32 = ScalarValue::Int32(Some(i32::MAX));
+        expect_cast(
+            max_i32,
+            DataType::UInt64,
+            ExpectedCast::Value(ScalarValue::UInt64(Some(i32::MAX as u64))),
+        );
+
+        let min_i32 = ScalarValue::Int32(Some(i32::MIN));
+        expect_cast(
+            min_i32,
+            DataType::Int64,
+            ExpectedCast::Value(ScalarValue::Int64(Some(i32::MIN as i64))),
+        );
+
+        let max_i64 = ScalarValue::Int64(Some(i64::MAX));
+        expect_cast(
+            max_i64,
+            DataType::UInt64,
+            ExpectedCast::Value(ScalarValue::UInt64(Some(i64::MAX as u64))),
+        );
+    }
+
+    #[test]
+    fn test_try_cast_to_type_int_out_of_range() {
+        let min_i32 = ScalarValue::Int32(Some(i32::MIN));
+        let min_i64 = ScalarValue::Int64(Some(i64::MIN));
+        let max_i64 = ScalarValue::Int64(Some(i64::MAX));
+        let max_u64 = ScalarValue::UInt64(Some(u64::MAX));
+
+        expect_cast(max_i64.clone(), DataType::Int8, ExpectedCast::NoValue);
+
+        expect_cast(max_i64.clone(), DataType::Int16, ExpectedCast::NoValue);
+
+        expect_cast(max_i64, DataType::Int32, ExpectedCast::NoValue);
+
+        expect_cast(max_u64, DataType::Int64, ExpectedCast::NoValue);
+
+        expect_cast(min_i64, DataType::UInt64, ExpectedCast::NoValue);
+
+        expect_cast(min_i32, DataType::UInt64, ExpectedCast::NoValue);
+
+        // decimal out of range
+        expect_cast(
+            ScalarValue::Decimal128(Some(99999999999999999999999999999999999900), 38, 0),
+            DataType::Int64,
+            ExpectedCast::NoValue,
+        );
+
+        expect_cast(
+            ScalarValue::Decimal128(Some(-9999999999999999999999999999999999), 37, 1),
+            DataType::Int64,
+            ExpectedCast::NoValue,
+        );
+    }
+
+    #[test]
+    fn test_try_decimal_cast_in_range() {
+        expect_cast(
+            ScalarValue::Decimal128(Some(12300), 5, 2),
+            DataType::Decimal128(3, 0),
+            ExpectedCast::Value(ScalarValue::Decimal128(Some(123), 3, 0)),
+        );
+
+        expect_cast(
+            ScalarValue::Decimal128(Some(12300), 5, 2),
+            DataType::Decimal128(8, 0),
+            ExpectedCast::Value(ScalarValue::Decimal128(Some(123), 8, 0)),
+        );
+
+        expect_cast(
+            ScalarValue::Decimal128(Some(12300), 5, 2),
+            DataType::Decimal128(8, 5),
+            ExpectedCast::Value(ScalarValue::Decimal128(Some(12300000), 8, 5)),
+        );
+    }
+
+    #[test]
+    fn test_try_decimal_cast_out_of_range() {
+        // decimal would lose precision
+        expect_cast(
+            ScalarValue::Decimal128(Some(12345), 5, 2),
+            DataType::Decimal128(3, 0),
+            ExpectedCast::NoValue,
+        );
+
+        // decimal would lose precision
+        expect_cast(
+            ScalarValue::Decimal128(Some(12300), 5, 2),
+            DataType::Decimal128(2, 0),
+            ExpectedCast::NoValue,
+        );
+    }
+
+    #[test]
+    fn test_try_cast_to_type_timestamps() {
+        for time_unit in [
+            TimeUnit::Second,
+            TimeUnit::Millisecond,
+            TimeUnit::Microsecond,
+            TimeUnit::Nanosecond,
+        ] {
+            let utc = Some("+00:00".into());
+            // No timezone, utc timezone
+            let (lit_tz_none, lit_tz_utc) = match time_unit {
+                TimeUnit::Second => (
+                    ScalarValue::TimestampSecond(Some(12345), None),
+                    ScalarValue::TimestampSecond(Some(12345), utc),
+                ),
+
+                TimeUnit::Millisecond => (
+                    ScalarValue::TimestampMillisecond(Some(12345), None),
+                    ScalarValue::TimestampMillisecond(Some(12345), utc),
+                ),
+
+                TimeUnit::Microsecond => (
+                    ScalarValue::TimestampMicrosecond(Some(12345), None),
+                    ScalarValue::TimestampMicrosecond(Some(12345), utc),
+                ),
+
+                TimeUnit::Nanosecond => (
+                    ScalarValue::TimestampNanosecond(Some(12345), None),
+                    ScalarValue::TimestampNanosecond(Some(12345), utc),
+                ),
+            };
+
+            // DataFusion ignores timezones for comparisons of ScalarValue
+            // so double check it here
+            assert_eq!(lit_tz_none, lit_tz_utc);
+
+            // e.g. DataType::Timestamp(_, None)
+            let dt_tz_none = lit_tz_none.data_type();
+
+            // e.g. DataType::Timestamp(_, Some(utc))
+            let dt_tz_utc = lit_tz_utc.data_type();
+
+            // None <--> None
+            expect_cast(
+                lit_tz_none.clone(),
+                dt_tz_none.clone(),
+                ExpectedCast::Value(lit_tz_none.clone()),
+            );
+
+            // None <--> Utc
+            expect_cast(
+                lit_tz_none.clone(),
+                dt_tz_utc.clone(),
+                ExpectedCast::Value(lit_tz_utc.clone()),
+            );
+
+            // Utc <--> None
+            expect_cast(
+                lit_tz_utc.clone(),
+                dt_tz_none.clone(),
+                ExpectedCast::Value(lit_tz_none.clone()),
+            );
+
+            // Utc <--> Utc
+            expect_cast(
+                lit_tz_utc.clone(),
+                dt_tz_utc.clone(),
+                ExpectedCast::Value(lit_tz_utc.clone()),
+            );
+
+            // timestamp to int64
+            expect_cast(
+                lit_tz_utc.clone(),
+                DataType::Int64,
+                ExpectedCast::Value(ScalarValue::Int64(Some(12345))),
+            );
+
+            // int64 to timestamp
+            expect_cast(
+                ScalarValue::Int64(Some(12345)),
+                dt_tz_none.clone(),
+                ExpectedCast::Value(lit_tz_none.clone()),
+            );
+
+            // int64 to timestamp
+            expect_cast(
+                ScalarValue::Int64(Some(12345)),
+                dt_tz_utc.clone(),
+                ExpectedCast::Value(lit_tz_utc.clone()),
+            );
+
+            // timestamp to string (not supported yet)
+            expect_cast(
+                lit_tz_utc.clone(),
+                DataType::LargeUtf8,
+                ExpectedCast::NoValue,
+            );
+        }
+    }
+
+    #[test]
+    fn test_try_cast_to_type_unsupported() {
+        // int64 to list
+        expect_cast(
+            ScalarValue::Int64(Some(12345)),
+            DataType::List(Arc::new(Field::new("f", DataType::Int32, true))),
+            ExpectedCast::NoValue,
+        );
+    }
+
+    #[test]
+    fn test_try_cast_literal_to_timestamp() {
+        // same timestamp
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampNanosecond(Some(123456), None),
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        )
+        .unwrap();
+
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampNanosecond(Some(123456), None)
+        );
+
+        // TimestampNanosecond to TimestampMicrosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampNanosecond(Some(123456), None),
+            &DataType::Timestamp(TimeUnit::Microsecond, None),
+        )
+        .unwrap();
+
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampMicrosecond(Some(123), None)
+        );
+
+        // TimestampNanosecond to TimestampMillisecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampNanosecond(Some(123456), None),
+            &DataType::Timestamp(TimeUnit::Millisecond, None),
+        )
+        .unwrap();
+
+        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(Some(0), None));
+
+        // TimestampNanosecond to TimestampSecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampNanosecond(Some(123456), None),
+            &DataType::Timestamp(TimeUnit::Second, None),
+        )
+        .unwrap();
+
+        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(0), None));
+
+        // TimestampMicrosecond to TimestampNanosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMicrosecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        )
+        .unwrap();
+
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampNanosecond(Some(123000), None)
+        );
+
+        // TimestampMicrosecond to TimestampMillisecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMicrosecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Millisecond, None),
+        )
+        .unwrap();
+
+        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(Some(0), None));
+
+        // TimestampMicrosecond to TimestampSecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMicrosecond(Some(123456789), None),
+            &DataType::Timestamp(TimeUnit::Second, None),
+        )
+        .unwrap();
+        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123), None));
+
+        // TimestampMillisecond to TimestampNanosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMillisecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        )
+        .unwrap();
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampNanosecond(Some(123000000), None)
+        );
+
+        // TimestampMillisecond to TimestampMicrosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMillisecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Microsecond, None),
+        )
+        .unwrap();
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampMicrosecond(Some(123000), None)
+        );
+        // TimestampMillisecond to TimestampSecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampMillisecond(Some(123456789), None),
+            &DataType::Timestamp(TimeUnit::Second, None),
+        )
+        .unwrap();
+        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123456), None));
+
+        // TimestampSecond to TimestampNanosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampSecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Nanosecond, None),
+        )
+        .unwrap();
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampNanosecond(Some(123000000000), None)
+        );
+
+        // TimestampSecond to TimestampMicrosecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampSecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Microsecond, None),
+        )
+        .unwrap();
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampMicrosecond(Some(123000000), None)
+        );
+
+        // TimestampSecond to TimestampMillisecond
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampSecond(Some(123), None),
+            &DataType::Timestamp(TimeUnit::Millisecond, None),
+        )
+        .unwrap();
+        assert_eq!(
+            new_scalar,
+            ScalarValue::TimestampMillisecond(Some(123000), None)
+        );
+
+        // overflow
+        let new_scalar = try_cast_literal_to_type(
+            &ScalarValue::TimestampSecond(Some(i64::MAX), None),
+            &DataType::Timestamp(TimeUnit::Millisecond, None),
+        )
+        .unwrap();
+        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(None, None));
+    }
+
+    #[test]
+    fn test_try_cast_to_string_type() {
+        let scalars = vec![
+            ScalarValue::from("string"),
+            ScalarValue::LargeUtf8(Some("string".to_owned())),
+        ];
+
+        for s1 in &scalars {
+            for s2 in &scalars {
+                let expected_value = ExpectedCast::Value(s2.clone());
+
+                expect_cast(s1.clone(), s2.data_type(), expected_value);
+            }
+        }
+    }
+
+    #[test]
+    fn test_try_cast_to_dictionary_type() {
+        fn dictionary_type(t: DataType) -> DataType {
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(t))
+        }
+        fn dictionary_value(value: ScalarValue) -> ScalarValue {
+            ScalarValue::Dictionary(Box::new(DataType::Int32), Box::new(value))
+        }
+        let scalars = vec![
+            ScalarValue::from("string"),
+            ScalarValue::LargeUtf8(Some("string".to_owned())),
+        ];
+        for s in &scalars {
+            expect_cast(
+                s.clone(),
+                dictionary_type(s.data_type()),
+                ExpectedCast::Value(dictionary_value(s.clone())),
+            );
+            expect_cast(
+                dictionary_value(s.clone()),
+                s.data_type(),
+                ExpectedCast::Value(s.clone()),
+            )
+        }
+    }
+
+    #[test]
+    fn test_try_cast_to_fixed_size_binary() {
+        expect_cast(
+            ScalarValue::Binary(Some(vec![1, 2, 3])),
+            DataType::FixedSizeBinary(3),
+            ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1, 2, 3]))),
+        )
+    }
+
+    #[test]
+    fn test_numeric_boundary_values() {
+        // Test exact boundary values for signed integers
+        expect_cast(
+            ScalarValue::Int8(Some(i8::MAX)),
+            DataType::UInt8,
+            ExpectedCast::Value(ScalarValue::UInt8(Some(i8::MAX as u8))),
+        );
+
+        expect_cast(
+            ScalarValue::Int8(Some(i8::MIN)),
+            DataType::UInt8,
+            ExpectedCast::NoValue,
+        );
+
+        expect_cast(
+            ScalarValue::UInt8(Some(u8::MAX)),
+            DataType::Int8,
+            ExpectedCast::NoValue,
+        );
+
+        // Test cross-type boundary scenarios
+        expect_cast(
+            ScalarValue::Int32(Some(i32::MAX)),
+            DataType::Int64,
+            ExpectedCast::Value(ScalarValue::Int64(Some(i32::MAX as i64))),
+        );
+
+        expect_cast(
+            ScalarValue::Int64(Some(i64::MIN)),
+            DataType::UInt64,
+            ExpectedCast::NoValue,
+        );
+
+        // Test unsigned to signed edge cases
+        expect_cast(
+            ScalarValue::UInt32(Some(u32::MAX)),
+            DataType::Int32,
+            ExpectedCast::NoValue,
+        );
+
+        expect_cast(
+            ScalarValue::UInt64(Some(u64::MAX)),
+            DataType::Int64,
+            ExpectedCast::NoValue,
+        );
+    }
+
+    #[test]
+    fn test_decimal_precision_limits() {
+        use arrow::datatypes::{
+            MAX_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL128_FOR_EACH_PRECISION,
+        };
+
+        // Test maximum precision values
+        expect_cast(
+            ScalarValue::Decimal128(Some(MAX_DECIMAL128_FOR_EACH_PRECISION[3]), 3, 0),
+            DataType::Decimal128(5, 0),
+            ExpectedCast::Value(ScalarValue::Decimal128(
+                Some(MAX_DECIMAL128_FOR_EACH_PRECISION[3]),
+                5,
+                0,
+            )),
+        );
+
+        // Test minimum precision values
+        expect_cast(
+            ScalarValue::Decimal128(Some(MIN_DECIMAL128_FOR_EACH_PRECISION[3]), 3, 0),
+            DataType::Decimal128(5, 0),
+            ExpectedCast::Value(ScalarValue::Decimal128(
+                Some(MIN_DECIMAL128_FOR_EACH_PRECISION[3]),
+                5,
+                0,
+            )),
+        );
+
+        // Test scale increase
+        expect_cast(
+            ScalarValue::Decimal128(Some(123), 3, 0),
+            DataType::Decimal128(5, 2),
+            ExpectedCast::Value(ScalarValue::Decimal128(Some(12300), 5, 2)),
+        );
+
+        // Test precision overflow (value too large for target precision)
+        expect_cast(
+            ScalarValue::Decimal128(Some(MAX_DECIMAL128_FOR_EACH_PRECISION[10]), 10, 0),
+            DataType::Decimal128(3, 0),
+            ExpectedCast::NoValue,
+        );
+
+        // Test non-divisible decimal conversion (should fail)
+        expect_cast(
+            ScalarValue::Decimal128(Some(12345), 5, 3), // 12.345
+            DataType::Int32,
+            ExpectedCast::NoValue, // Can't convert 12.345 to integer without loss
+        );
+
+        // Test edge case: scale reduction with precision loss
+        expect_cast(
+            ScalarValue::Decimal128(Some(12345), 5, 2), // 123.45
+            DataType::Decimal128(3, 0),                 // Can only hold up to 999
+            ExpectedCast::NoValue,
+        );
+    }
+
+    #[test]
+    fn test_timestamp_overflow_scenarios() {
+        // Test overflow in timestamp conversions
+        let max_seconds = i64::MAX / 1_000_000_000; // Avoid overflow when converting to nanos
+
+        // This should work - within safe range
+        expect_cast(
+            ScalarValue::TimestampSecond(Some(max_seconds), None),
+            DataType::Timestamp(TimeUnit::Nanosecond, None),
+            ExpectedCast::Value(ScalarValue::TimestampNanosecond(
+                Some(max_seconds * 1_000_000_000),
+                None,
+            )),
+        );
+
+        // Test very large nanosecond value conversion to smaller units
+        expect_cast(
+            ScalarValue::TimestampNanosecond(Some(i64::MAX), None),
+            DataType::Timestamp(TimeUnit::Second, None),
+            ExpectedCast::Value(ScalarValue::TimestampSecond(
+                Some(i64::MAX / 1_000_000_000),
+                None,
+            )),
+        );
+
+        // Test precision loss in downscaling
+        expect_cast(
+            ScalarValue::TimestampNanosecond(Some(1), None),
+            DataType::Timestamp(TimeUnit::Second, None),
+            ExpectedCast::Value(ScalarValue::TimestampSecond(Some(0), None)),
+        );
+
+        expect_cast(
+            ScalarValue::TimestampMicrosecond(Some(999), None),
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            ExpectedCast::Value(ScalarValue::TimestampMillisecond(Some(0), None)),
+        );
+    }
+
+    #[test]
+    fn test_string_view() {
+        // Test Utf8View to other string types
+        expect_cast(
+            ScalarValue::Utf8View(Some("test".to_string())),
+            DataType::Utf8,
+            ExpectedCast::Value(ScalarValue::Utf8(Some("test".to_string()))),
+        );
+
+        expect_cast(
+            ScalarValue::Utf8View(Some("test".to_string())),
+            DataType::LargeUtf8,
+            ExpectedCast::Value(ScalarValue::LargeUtf8(Some("test".to_string()))),
+        );
+
+        // Test other string types to Utf8View
+        expect_cast(
+            ScalarValue::Utf8(Some("hello".to_string())),
+            DataType::Utf8View,
+            ExpectedCast::Value(ScalarValue::Utf8View(Some("hello".to_string()))),
+        );
+
+        expect_cast(
+            ScalarValue::LargeUtf8(Some("world".to_string())),
+            DataType::Utf8View,
+            ExpectedCast::Value(ScalarValue::Utf8View(Some("world".to_string()))),
+        );
+
+        // Test empty string
+        expect_cast(
+            ScalarValue::Utf8(Some("".to_string())),
+            DataType::Utf8View,
+            ExpectedCast::Value(ScalarValue::Utf8View(Some("".to_string()))),
+        );
+
+        // Test large string
+        let large_string = "x".repeat(1000);
+        expect_cast(
+            ScalarValue::LargeUtf8(Some(large_string.clone())),
+            DataType::Utf8View,
+            ExpectedCast::Value(ScalarValue::Utf8View(Some(large_string))),
+        );
+    }
+
+    #[test]
+    fn test_binary_size_edge_cases() {
+        // Test size mismatch - too small
+        expect_cast(
+            ScalarValue::Binary(Some(vec![1, 2])),
+            DataType::FixedSizeBinary(3),
+            ExpectedCast::NoValue,
+        );
+
+        // Test size mismatch - too large
+        expect_cast(
+            ScalarValue::Binary(Some(vec![1, 2, 3, 4])),
+            DataType::FixedSizeBinary(3),
+            ExpectedCast::NoValue,
+        );
+
+        // Test empty binary
+        expect_cast(
+            ScalarValue::Binary(Some(vec![])),
+            DataType::FixedSizeBinary(0),
+            ExpectedCast::Value(ScalarValue::FixedSizeBinary(0, Some(vec![]))),
+        );
+
+        // Test exact size match
+        expect_cast(
+            ScalarValue::Binary(Some(vec![1, 2, 3])),
+            DataType::FixedSizeBinary(3),
+            ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1, 2, 3]))),
+        );
+
+        // Test single byte
+        expect_cast(
+            ScalarValue::Binary(Some(vec![42])),
+            DataType::FixedSizeBinary(1),
+            ExpectedCast::Value(ScalarValue::FixedSizeBinary(1, Some(vec![42]))),
+        );
+    }
+
+    #[test]
+    fn test_dictionary_index_types() {
+        // Test different dictionary index types
+        let string_value = ScalarValue::Utf8(Some("test".to_string()));
+
+        // Int8 index dictionary
+        let dict_int8 =
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8));
+        expect_cast(
+            string_value.clone(),
+            dict_int8,
+            ExpectedCast::Value(ScalarValue::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(string_value.clone()),
+            )),
+        );
+
+        // Int16 index dictionary
+        let dict_int16 =
+            DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8));
+        expect_cast(
+            string_value.clone(),
+            dict_int16,
+            ExpectedCast::Value(ScalarValue::Dictionary(
+                Box::new(DataType::Int16),
+                Box::new(string_value.clone()),
+            )),
+        );
+
+        // Int64 index dictionary
+        let dict_int64 =
+            DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8));
+        expect_cast(
+            string_value.clone(),
+            dict_int64,
+            ExpectedCast::Value(ScalarValue::Dictionary(
+                Box::new(DataType::Int64),
+                Box::new(string_value.clone()),
+            )),
+        );
+
+        // Test dictionary unwrapping
+        let dict_value = ScalarValue::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(ScalarValue::LargeUtf8(Some("unwrap_test".to_string()))),
+        );
+        expect_cast(
+            dict_value,
+            DataType::LargeUtf8,
+            ExpectedCast::Value(ScalarValue::LargeUtf8(Some("unwrap_test".to_string()))),
+        );
+    }
+
+    #[test]
+    fn test_type_support_functions() {
+        // Test numeric type support
+        assert!(is_supported_numeric_type(&DataType::Int8));
+        assert!(is_supported_numeric_type(&DataType::UInt64));
+        assert!(is_supported_numeric_type(&DataType::Decimal128(10, 2)));
+        assert!(is_supported_numeric_type(&DataType::Timestamp(
+            TimeUnit::Nanosecond,
+            None
+        )));
+        assert!(!is_supported_numeric_type(&DataType::Float32));
+        assert!(!is_supported_numeric_type(&DataType::Float64));
+
+        // Test string type support
+        assert!(is_supported_string_type(&DataType::Utf8));
+        assert!(is_supported_string_type(&DataType::LargeUtf8));
+        assert!(is_supported_string_type(&DataType::Utf8View));
+        assert!(!is_supported_string_type(&DataType::Binary));
+
+        // Test binary type support
+        assert!(is_supported_binary_type(&DataType::Binary));
+        assert!(is_supported_binary_type(&DataType::FixedSizeBinary(10)));
+        assert!(!is_supported_binary_type(&DataType::Utf8));
+
+        // Test dictionary type support with nested types
+        assert!(is_supported_dictionary_type(&DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::Utf8)
+        )));
+        assert!(is_supported_dictionary_type(&DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::Int64)
+        )));
+        assert!(!is_supported_dictionary_type(&DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Int32,
+                true
+            ))))
+        )));
+
+        // Test overall type support
+        assert!(is_supported_type(&DataType::Int32));
+        assert!(is_supported_type(&DataType::Utf8));
+        assert!(is_supported_type(&DataType::Binary));
+        assert!(is_supported_type(&DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::Utf8)
+        )));
+        assert!(!is_supported_type(&DataType::List(Arc::new(Field::new(
+            "item",
+            DataType::Int32,
+            true
+        )))));
+        assert!(!is_supported_type(&DataType::Struct(Fields::empty())));
+    }
+
+    #[test]
+    fn test_error_conditions() {
+        // Test unsupported source type
+        expect_cast(
+            ScalarValue::Float32(Some(1.5)),
+            DataType::Int32,
+            ExpectedCast::NoValue,
+        );
+
+        // Test unsupported target type
+        expect_cast(
+            ScalarValue::Int32(Some(123)),
+            DataType::Float64,
+            ExpectedCast::NoValue,
+        );
+
+        // Test both types unsupported
+        expect_cast(
+            ScalarValue::Float64(Some(1.5)),
+            DataType::Float32,
+            ExpectedCast::NoValue,
+        );
+
+        // Test complex unsupported types
+        let list_type =
+            DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));
+        expect_cast(
+            ScalarValue::Int32(Some(123)),
+            list_type,
+            ExpectedCast::NoValue,
+        );
+
+        // Test dictionary with unsupported inner type
+        let bad_dict = DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::List(Arc::new(Field::new(
+                "item",
+                DataType::Int32,
+                true,
+            )))),
+        );
+        expect_cast(
+            ScalarValue::Int32(Some(123)),
+            bad_dict,
+            ExpectedCast::NoValue,
+        );
+    }
+}
diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs
index a21ad5bbbcc30..1aa42470a1481 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -17,12 +17,19 @@
 
 //! [`ColumnarValue`] represents the result of evaluating an expression.
 
-use arrow::array::{Array, ArrayRef, NullArray};
-use arrow::compute::{kernels, CastOptions};
-use arrow::datatypes::DataType;
-use arrow::util::pretty::pretty_format_columns;
-use datafusion_common::format::DEFAULT_CAST_OPTIONS;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use arrow::{
+    array::{Array, ArrayRef, Date32Array, Date64Array, NullArray},
+    compute::{CastOptions, kernels, max, min},
+    datatypes::{DataType, Field},
+    util::pretty::pretty_format_columns,
+};
+use datafusion_common::internal_datafusion_err;
+use datafusion_common::{
+    Result, ScalarValue,
+    format::DEFAULT_CAST_OPTIONS,
+    internal_err,
+    scalar::{date_to_timestamp_multiplier, ensure_timestamp_in_bounds},
+};
 use std::fmt;
 use std::sync::Arc;
 
@@ -113,10 +120,12 @@ impl ColumnarValue {
         }
     }
 
-    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
-    /// number of rows. [`Self::Scalar`] is converted by repeating the same
-    /// scalar multiple times which is not as efficient as handling the scalar
-    /// directly.
+    /// Convert any [`Self::Scalar`] into an Arrow [`ArrayRef`] with the specified
+    /// number of rows  by repeating the same scalar multiple times,
+    /// which is not as efficient as handling the scalar directly.
+    /// [`Self::Array`] will just be returned as is.
+    ///
+    /// See [`Self::into_array_of_size`] if you need to validate the length of the output array.
     ///
     /// See [`Self::values_to_arrays`] to convert multiple columnar values into
     /// arrays of the same length.
@@ -135,6 +144,38 @@ impl ColumnarValue {
     /// number of rows. [`Self::Scalar`] is converted by repeating the same
     /// scalar multiple times which is not as efficient as handling the scalar
     /// directly.
+    /// This validates that if this is [`Self::Array`], it has the expected length.
+    ///
+    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
+    /// arrays of the same length.
+    ///
+    /// # Errors
+    ///
+    /// Errors if `self` is a Scalar that fails to be converted into an array of size or
+    /// if the array length does not match the expected length
+    pub fn into_array_of_size(self, num_rows: usize) -> Result<ArrayRef> {
+        match self {
+            ColumnarValue::Array(array) => {
+                if array.len() == num_rows {
+                    Ok(array)
+                } else {
+                    internal_err!(
+                        "Array length {} does not match expected length {}",
+                        array.len(),
+                        num_rows
+                    )
+                }
+            }
+            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows),
+        }
+    }
+
+    /// Convert any [`Self::Scalar`] into an Arrow [`ArrayRef`] with the specified
+    /// number of rows  by repeating the same scalar multiple times,
+    /// which is not as efficient as handling the scalar directly.
+    /// [`Self::Array`] will just be returned as is.
+    ///
+    /// See [`Self::to_array_of_size`] if you need to validate the length of the output array.
     ///
     /// See [`Self::values_to_arrays`] to convert multiple columnar values into
     /// arrays of the same length.
@@ -149,6 +190,36 @@ impl ColumnarValue {
         })
     }
 
+    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
+    /// number of rows. [`Self::Scalar`] is converted by repeating the same
+    /// scalar multiple times which is not as efficient as handling the scalar
+    /// directly.
+    /// This validates that if this is [`Self::Array`], it has the expected length.
+    ///
+    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
+    /// arrays of the same length.
+    ///
+    /// # Errors
+    ///
+    /// Errors if `self` is a Scalar that fails to be converted into an array of size or
+    /// if the array length does not match the expected length
+    pub fn to_array_of_size(&self, num_rows: usize) -> Result<ArrayRef> {
+        match self {
+            ColumnarValue::Array(array) => {
+                if array.len() == num_rows {
+                    Ok(Arc::clone(array))
+                } else {
+                    internal_err!(
+                        "Array length {} does not match expected length {}",
+                        array.len(),
+                        num_rows
+                    )
+                }
+            }
+            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows),
+        }
+    }
+
     /// Null columnar values are implemented as a null array in order to pass batch
     /// num_rows
     pub fn create_null_array(num_rows: usize) -> Self {
@@ -183,7 +254,8 @@ impl ColumnarValue {
                         Some(array_len)
                     } else {
                         return internal_err!(
-                            "Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len()
+                            "Arguments has mixed length. Expected length: {array_len}, found length: {}",
+                            a.len()
                         );
                     }
                 }
@@ -202,7 +274,17 @@ impl ColumnarValue {
         Ok(args)
     }
 
-    /// Cast's this [ColumnarValue] to the specified `DataType`
+    /// Cast this [ColumnarValue] to the specified `DataType`
+    ///
+    /// # Struct Casting Behavior
+    ///
+    /// When casting struct types, fields are matched **by name** rather than position:
+    /// - Source fields are matched to target fields using case-sensitive name comparison
+    /// - Fields are reordered to match the target schema
+    /// - Missing target fields are filled with null arrays
+    /// - Extra source fields are ignored
+    ///
+    /// For non-struct types, uses Arrow's standard positional casting.
     pub fn cast_to(
         &self,
         cast_type: &DataType,
@@ -210,9 +292,10 @@ impl ColumnarValue {
     ) -> Result<ColumnarValue> {
         let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
         match self {
-            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
-                kernels::cast::cast_with_options(array, cast_type, &cast_options)?,
-            )),
+            ColumnarValue::Array(array) => {
+                let casted = cast_array_by_name(array, cast_type, &cast_options)?;
+                Ok(ColumnarValue::Array(casted))
+            }
             ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
                 scalar.cast_to_with_options(cast_type, &cast_options)?,
             )),
@@ -220,6 +303,90 @@ impl ColumnarValue {
     }
 }
 
+fn cast_array_by_name(
+    array: &ArrayRef,
+    cast_type: &DataType,
+    cast_options: &CastOptions<'static>,
+) -> Result<ArrayRef> {
+    // If types are already equal, no cast needed
+    if array.data_type() == cast_type {
+        return Ok(Arc::clone(array));
+    }
+
+    match cast_type {
+        DataType::Struct(_) => {
+            // Field name is unused; only the struct's inner field names matter
+            let target_field = Field::new("_", cast_type.clone(), true);
+            datafusion_common::nested_struct::cast_column(
+                array,
+                &target_field,
+                cast_options,
+            )
+        }
+        _ => {
+            ensure_date_array_timestamp_bounds(array, cast_type)?;
+            Ok(kernels::cast::cast_with_options(
+                array,
+                cast_type,
+                cast_options,
+            )?)
+        }
+    }
+}
+
+fn ensure_date_array_timestamp_bounds(
+    array: &ArrayRef,
+    cast_type: &DataType,
+) -> Result<()> {
+    let source_type = array.data_type().clone();
+    let Some(multiplier) = date_to_timestamp_multiplier(&source_type, cast_type) else {
+        return Ok(());
+    };
+
+    if multiplier <= 1 {
+        return Ok(());
+    }
+
+    // Use compute kernels to find min/max instead of iterating all elements
+    let (min_val, max_val): (Option<i64>, Option<i64>) = match &source_type {
+        DataType::Date32 => {
+            let arr = array
+                .as_any()
+                .downcast_ref::<Date32Array>()
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Expected Date32Array but found {}",
+                        array.data_type()
+                    )
+                })?;
+            (min(arr).map(|v| v as i64), max(arr).map(|v| v as i64))
+        }
+        DataType::Date64 => {
+            let arr = array
+                .as_any()
+                .downcast_ref::<Date64Array>()
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Expected Date64Array but found {}",
+                        array.data_type()
+                    )
+                })?;
+            (min(arr), max(arr))
+        }
+        _ => return Ok(()), // Not a date type, nothing to do
+    };
+
+    // Only validate the min and max values instead of all elements
+    if let Some(min) = min_val {
+        ensure_timestamp_in_bounds(min, multiplier, &source_type, cast_type)?;
+    }
+    if let Some(max) = max_val {
+        ensure_timestamp_in_bounds(max, multiplier, &source_type, cast_type)?;
+    }
+
+    Ok(())
+}
+
 // Implement Display trait for ColumnarValue
 impl fmt::Display for ColumnarValue {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -247,7 +414,38 @@ impl fmt::Display for ColumnarValue {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::Int32Array;
+    use arrow::{
+        array::{Date64Array, Int32Array, StructArray},
+        datatypes::{Field, Fields, TimeUnit},
+    };
+
+    #[test]
+    fn into_array_of_size() {
+        // Array case
+        let arr = make_array(1, 3);
+        let arr_columnar_value = ColumnarValue::Array(Arc::clone(&arr));
+        assert_eq!(&arr_columnar_value.into_array_of_size(3).unwrap(), &arr);
+
+        // Scalar case
+        let scalar_columnar_value = ColumnarValue::Scalar(ScalarValue::Int32(Some(42)));
+        let expected_array = make_array(42, 100);
+        assert_eq!(
+            &scalar_columnar_value.into_array_of_size(100).unwrap(),
+            &expected_array
+        );
+
+        // Array case with wrong size
+        let arr = make_array(1, 3);
+        let arr_columnar_value = ColumnarValue::Array(Arc::clone(&arr));
+        let result = arr_columnar_value.into_array_of_size(5);
+        let err = result.unwrap_err();
+        assert!(
+            err.to_string().starts_with(
+                "Internal error: Array length 3 does not match expected length 5"
+            ),
+            "Found: {err}"
+        );
+    }
 
     #[test]
     fn values_to_arrays() {
@@ -391,4 +589,115 @@ mod tests {
             )
         );
     }
+
+    #[test]
+    fn cast_struct_by_field_name() {
+        let source_fields = Fields::from(vec![
+            Field::new("b", DataType::Int32, true),
+            Field::new("a", DataType::Int32, true),
+        ]);
+
+        let target_fields = Fields::from(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]);
+
+        let struct_array = StructArray::new(
+            source_fields,
+            vec![
+                Arc::new(Int32Array::from(vec![Some(3)])),
+                Arc::new(Int32Array::from(vec![Some(4)])),
+            ],
+            None,
+        );
+
+        let value = ColumnarValue::Array(Arc::new(struct_array));
+        let casted = value
+            .cast_to(&DataType::Struct(target_fields.clone()), None)
+            .expect("struct cast should succeed");
+
+        let ColumnarValue::Array(arr) = casted else {
+            panic!("expected array after cast");
+        };
+
+        let struct_array = arr
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .expect("expected StructArray");
+
+        let field_a = struct_array
+            .column_by_name("a")
+            .expect("expected field a in cast result");
+        let field_b = struct_array
+            .column_by_name("b")
+            .expect("expected field b in cast result");
+
+        assert_eq!(
+            field_a
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .expect("expected Int32 array")
+                .value(0),
+            4
+        );
+        assert_eq!(
+            field_b
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .expect("expected Int32 array")
+                .value(0),
+            3
+        );
+    }
+
+    #[test]
+    fn cast_struct_missing_field_inserts_nulls() {
+        let source_fields = Fields::from(vec![Field::new("a", DataType::Int32, true)]);
+
+        let target_fields = Fields::from(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]);
+
+        let struct_array = StructArray::new(
+            source_fields,
+            vec![Arc::new(Int32Array::from(vec![Some(5)]))],
+            None,
+        );
+
+        let value = ColumnarValue::Array(Arc::new(struct_array));
+        let casted = value
+            .cast_to(&DataType::Struct(target_fields.clone()), None)
+            .expect("struct cast should succeed");
+
+        let ColumnarValue::Array(arr) = casted else {
+            panic!("expected array after cast");
+        };
+
+        let struct_array = arr
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .expect("expected StructArray");
+
+        let field_b = struct_array
+            .column_by_name("b")
+            .expect("expected missing field to be added");
+
+        assert!(field_b.is_null(0));
+    }
+
+    #[test]
+    fn cast_date64_array_to_timestamp_overflow() {
+        let overflow_value = i64::MAX / 1_000_000 + 1;
+        let array: ArrayRef = Arc::new(Date64Array::from(vec![Some(overflow_value)]));
+        let value = ColumnarValue::Array(array);
+        let result =
+            value.cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None), None);
+        let err = result.expect_err("expected overflow to be detected");
+        assert!(
+            err.to_string()
+                .contains("converted value exceeds the representable i64 range"),
+            "unexpected error: {err}"
+        );
+    }
 }
diff --git a/datafusion/expr-common/src/dyn_eq.rs b/datafusion/expr-common/src/dyn_eq.rs
new file mode 100644
index 0000000000000..75d9c06d67f56
--- /dev/null
+++ b/datafusion/expr-common/src/dyn_eq.rs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::hash::{Hash, Hasher};
+
+/// A dyn-compatible version of [`Eq`] trait.
+/// The implementation constraints for this trait are the same as for [`Eq`]:
+/// the implementation must be reflexive, symmetric, and transitive.
+/// Additionally, if two values can be compared with [`DynEq`] and [`PartialEq`] then
+/// they must be [`DynEq`]-equal if and only if they are [`PartialEq`]-equal.
+/// It is therefore strongly discouraged to implement this trait for types
+/// that implement `PartialEq<Other>` or `Eq<Other>` for any type `Other` other than `Self`.
+///
+/// Note: This trait should not be implemented directly. Implement `Eq` and `Any` and use
+/// the blanket implementation.
+#[expect(private_bounds)]
+pub trait DynEq: private::EqSealed {
+    fn dyn_eq(&self, other: &dyn Any) -> bool;
+}
+
+impl<T: Eq + Any> private::EqSealed for T {}
+impl<T: Eq + Any> DynEq for T {
+    fn dyn_eq(&self, other: &dyn Any) -> bool {
+        other.downcast_ref::<Self>() == Some(self)
+    }
+}
+
+/// A dyn-compatible version of [`Hash`] trait.
+/// If two values are equal according to [`DynEq`], they must produce the same hash value.
+///
+/// Note: This trait should not be implemented directly. Implement `Hash` and `Any` and use
+/// the blanket implementation.
+#[expect(private_bounds)]
+pub trait DynHash: private::HashSealed {
+    fn dyn_hash(&self, _state: &mut dyn Hasher);
+}
+
+impl<T: Hash + Any> private::HashSealed for T {}
+impl<T: Hash + Any> DynHash for T {
+    fn dyn_hash(&self, mut state: &mut dyn Hasher) {
+        self.type_id().hash(&mut state);
+        self.hash(&mut state)
+    }
+}
+
+mod private {
+    pub(super) trait EqSealed {}
+    pub(super) trait HashSealed {}
+}
diff --git a/datafusion/expr-common/src/groups_accumulator.rs b/datafusion/expr-common/src/groups_accumulator.rs
index 5ff1c1d072164..9053f7a8eab9f 100644
--- a/datafusion/expr-common/src/groups_accumulator.rs
+++ b/datafusion/expr-common/src/groups_accumulator.rs
@@ -18,10 +18,10 @@
 //! Vectorized [`GroupsAccumulator`]
 
 use arrow::array::{ArrayRef, BooleanArray};
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{Result, not_impl_err};
 
 /// Describes how many rows should be emitted during grouping.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum EmitTo {
     /// Emit all groups
     All,
@@ -89,6 +89,9 @@ impl EmitTo {
 /// optional and is harder to implement than `Accumulator`, but can be much
 /// faster for queries with many group values.  See the [Aggregating Millions of
 /// Groups Fast blog] for more background.
+/// For more background, please also see the [Aggregating Millions of Groups Fast in Apache Arrow DataFusion 28.0.0 blog]
+///
+/// [Aggregating Millions of Groups Fast in Apache Arrow DataFusion 28.0.0 blog]: https://datafusion.apache.org/blog/2023/08/05/datafusion_fast_grouping
 ///
 /// [`NullState`] can help keep the state for groups that have not seen any
 /// values and produce the correct output for those groups.
@@ -105,7 +108,7 @@ impl EmitTo {
 ///
 /// [`Accumulator`]: crate::accumulator::Accumulator
 /// [Aggregating Millions of Groups Fast blog]: https://arrow.apache.org/blog/2023/08/05/datafusion_fast_grouping/
-pub trait GroupsAccumulator: Send {
+pub trait GroupsAccumulator: Send + std::any::Any {
     /// Updates the accumulator's state from its arguments, encoded as
     /// a vector of [`ArrayRef`]s.
     ///
diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs
index d656c676bd01d..0f88723d116f5 100644
--- a/datafusion/expr-common/src/interval_arithmetic.rs
+++ b/datafusion/expr-common/src/interval_arithmetic.rs
@@ -22,19 +22,22 @@ use std::fmt::{self, Display, Formatter};
 use std::ops::{AddAssign, SubAssign};
 
 use crate::operator::Operator;
-use crate::type_coercion::binary::{comparison_coercion_numeric, BinaryTypeCoercer};
+use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion_numeric};
 
-use arrow::compute::{cast_with_options, CastOptions};
+use arrow::compute::{CastOptions, cast_with_options};
 use arrow::datatypes::{
-    DataType, IntervalDayTime, IntervalMonthDayNano, IntervalUnit, TimeUnit,
+    DataType, IntervalDayTime, IntervalMonthDayNano, IntervalUnit,
     MAX_DECIMAL128_FOR_EACH_PRECISION, MAX_DECIMAL256_FOR_EACH_PRECISION,
-    MIN_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL256_FOR_EACH_PRECISION,
+    MIN_DECIMAL128_FOR_EACH_PRECISION, MIN_DECIMAL256_FOR_EACH_PRECISION, TimeUnit,
 };
 use datafusion_common::rounding::{alter_fp_rounding_mode, next_down, next_up};
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, assert_eq_or_internal_err,
+    assert_or_internal_err, internal_err,
+};
 
 macro_rules! get_extreme_value {
-    ($extreme:ident, $value:expr) => {
+    ($extreme:ident, $DECIMAL128_ARRAY:ident, $DECIMAL256_ARRAY:ident, $value:expr) => {
         match $value {
             DataType::UInt8 => ScalarValue::UInt8(Some(u8::$extreme)),
             DataType::UInt16 => ScalarValue::UInt16(Some(u16::$extreme)),
@@ -80,18 +83,12 @@ macro_rules! get_extreme_value {
                 ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano::$extreme))
             }
             DataType::Decimal128(precision, scale) => ScalarValue::Decimal128(
-                Some(
-                    paste::paste! {[<$extreme _DECIMAL128_FOR_EACH_PRECISION>]}
-                        [*precision as usize],
-                ),
+                Some($DECIMAL128_ARRAY[*precision as usize]),
                 *precision,
                 *scale,
             ),
             DataType::Decimal256(precision, scale) => ScalarValue::Decimal256(
-                Some(
-                    paste::paste! {[<$extreme _DECIMAL256_FOR_EACH_PRECISION>]}
-                        [*precision as usize],
-                ),
+                Some($DECIMAL256_ARRAY[*precision as usize]),
                 *precision,
                 *scale,
             ),
@@ -266,24 +263,23 @@ impl Interval {
     ///   - Floating-point endpoints with `NaN`, `INF`, or `NEG_INF` are converted
     ///     to `NULL`s.
     pub fn try_new(lower: ScalarValue, upper: ScalarValue) -> Result<Self> {
-        if lower.data_type() != upper.data_type() {
-            return internal_err!("Endpoints of an Interval should have the same type");
-        }
+        assert_eq_or_internal_err!(
+            lower.data_type(),
+            upper.data_type(),
+            "Endpoints of an Interval should have the same type"
+        );
 
         let interval = Self::new(lower, upper);
 
-        if interval.lower.is_null()
-            || interval.upper.is_null()
-            || interval.lower <= interval.upper
-        {
-            Ok(interval)
-        } else {
-            internal_err!(
-                "Interval's lower bound {} is greater than the upper bound {}",
-                interval.lower,
-                interval.upper
-            )
-        }
+        assert_or_internal_err!(
+            interval.lower.is_null()
+                || interval.upper.is_null()
+                || interval.lower <= interval.upper,
+            "Interval's lower bound {} is greater than the upper bound {}",
+            interval.lower,
+            interval.upper
+        );
+        Ok(interval)
     }
 
     /// Only for internal usage. Responsible for standardizing booleans and
@@ -430,21 +426,33 @@ impl Interval {
         )
     }
 
-    pub const CERTAINLY_FALSE: Self = Self {
+    /// An interval containing only the 'false' truth value.
+    pub const FALSE: Self = Self {
         lower: ScalarValue::Boolean(Some(false)),
         upper: ScalarValue::Boolean(Some(false)),
     };
 
-    pub const UNCERTAIN: Self = Self {
+    #[deprecated(since = "52.0.0", note = "Use `FALSE` instead")]
+    pub const CERTAINLY_FALSE: Self = Self::FALSE;
+
+    /// An interval containing both the 'true', and 'false' truth values.
+    pub const TRUE_OR_FALSE: Self = Self {
         lower: ScalarValue::Boolean(Some(false)),
         upper: ScalarValue::Boolean(Some(true)),
     };
 
-    pub const CERTAINLY_TRUE: Self = Self {
+    #[deprecated(since = "52.0.0", note = "Use `TRUE_OR_FALSE` instead")]
+    pub const UNCERTAIN: Self = Self::TRUE_OR_FALSE;
+
+    /// An interval containing only the 'true' truth value.
+    pub const TRUE: Self = Self {
         lower: ScalarValue::Boolean(Some(true)),
         upper: ScalarValue::Boolean(Some(true)),
     };
 
+    #[deprecated(since = "52.0.0", note = "Use `TRUE` instead")]
+    pub const CERTAINLY_TRUE: Self = Self::TRUE;
+
     /// Decide if this interval is certainly greater than, possibly greater than,
     /// or can't be greater than `other` by returning `[true, true]`,
     /// `[false, true]` or `[false, false]` respectively.
@@ -454,27 +462,28 @@ impl Interval {
     ///       to an error.
     pub fn gt<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        if self.data_type().ne(&rhs.data_type()) {
-            internal_err!(
-                "Only intervals with the same data type are comparable, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            )
-        } else if !(self.upper.is_null() || rhs.lower.is_null())
-            && self.upper <= rhs.lower
-        {
+        let lhs_type = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            lhs_type,
+            rhs_type,
+            "Only intervals with the same data type are comparable, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
+        if !(self.upper.is_null() || rhs.lower.is_null()) && self.upper <= rhs.lower {
             // Values in this interval are certainly less than or equal to
             // those in the given interval.
-            Ok(Self::CERTAINLY_FALSE)
+            Ok(Self::FALSE)
         } else if !(self.lower.is_null() || rhs.upper.is_null())
             && (self.lower > rhs.upper)
         {
             // Values in this interval are certainly greater than those in the
             // given interval.
-            Ok(Self::CERTAINLY_TRUE)
+            Ok(Self::TRUE)
         } else {
             // All outcomes are possible.
-            Ok(Self::UNCERTAIN)
+            Ok(Self::TRUE_OR_FALSE)
         }
     }
 
@@ -487,27 +496,28 @@ impl Interval {
     ///       to an error.
     pub fn gt_eq<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        if self.data_type().ne(&rhs.data_type()) {
-            internal_err!(
-                "Only intervals with the same data type are comparable, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            )
-        } else if !(self.lower.is_null() || rhs.upper.is_null())
-            && self.lower >= rhs.upper
-        {
+        let lhs_type = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            lhs_type,
+            rhs_type,
+            "Only intervals with the same data type are comparable, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
+        if !(self.lower.is_null() || rhs.upper.is_null()) && self.lower >= rhs.upper {
             // Values in this interval are certainly greater than or equal to
             // those in the given interval.
-            Ok(Self::CERTAINLY_TRUE)
+            Ok(Self::TRUE)
         } else if !(self.upper.is_null() || rhs.lower.is_null())
             && (self.upper < rhs.lower)
         {
             // Values in this interval are certainly less than those in the
             // given interval.
-            Ok(Self::CERTAINLY_FALSE)
+            Ok(Self::FALSE)
         } else {
             // All outcomes are possible.
-            Ok(Self::UNCERTAIN)
+            Ok(Self::TRUE_OR_FALSE)
         }
     }
 
@@ -542,25 +552,26 @@ impl Interval {
     ///       to an error.
     pub fn equal<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        if BinaryTypeCoercer::new(&self.data_type(), &Operator::Eq, &rhs.data_type())
-            .get_result_type()
-            .is_err()
-        {
-            internal_err!(
-                "Interval data types must be compatible for equality checks, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            )
-        } else if !self.lower.is_null()
+        let types_compatible =
+            BinaryTypeCoercer::new(&self.data_type(), &Operator::Eq, &rhs.data_type())
+                .get_result_type()
+                .is_ok();
+        assert_or_internal_err!(
+            types_compatible,
+            "Interval data types must be compatible for equality checks, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
+        if !self.lower.is_null()
             && (self.lower == self.upper)
             && (rhs.lower == rhs.upper)
             && (self.lower == rhs.lower)
         {
-            Ok(Self::CERTAINLY_TRUE)
+            Ok(Self::TRUE)
         } else if self.intersect(rhs)?.is_none() {
-            Ok(Self::CERTAINLY_FALSE)
+            Ok(Self::FALSE)
         } else {
-            Ok(Self::UNCERTAIN)
+            Ok(Self::TRUE_OR_FALSE)
         }
     }
 
@@ -583,7 +594,9 @@ impl Interval {
                     upper: ScalarValue::Boolean(Some(upper)),
                 })
             }
-            _ => internal_err!("Incompatible data types for logical conjunction"),
+
+            // Return TRUE_OR_FALSE when intervals don't have concrete boolean bounds
+            _ => Ok(Self::TRUE_OR_FALSE),
         }
     }
 
@@ -606,20 +619,25 @@ impl Interval {
                     upper: ScalarValue::Boolean(Some(upper)),
                 })
             }
-            _ => internal_err!("Incompatible data types for logical disjunction"),
+
+            // Return TRUE_OR_FALSE when intervals don't have concrete boolean bounds
+            _ => Ok(Self::TRUE_OR_FALSE),
         }
     }
 
     /// Compute the logical negation of this (boolean) interval.
     pub fn not(&self) -> Result<Self> {
-        if self.data_type().ne(&DataType::Boolean) {
-            internal_err!("Cannot apply logical negation to a non-boolean interval")
-        } else if self == &Self::CERTAINLY_TRUE {
-            Ok(Self::CERTAINLY_FALSE)
-        } else if self == &Self::CERTAINLY_FALSE {
-            Ok(Self::CERTAINLY_TRUE)
+        assert_eq_or_internal_err!(
+            self.data_type(),
+            DataType::Boolean,
+            "Cannot apply logical negation to a non-boolean interval"
+        );
+        if self == &Self::TRUE {
+            Ok(Self::FALSE)
+        } else if self == &Self::FALSE {
+            Ok(Self::TRUE)
         } else {
-            Ok(Self::UNCERTAIN)
+            Ok(Self::TRUE_OR_FALSE)
         }
     }
 
@@ -631,13 +649,15 @@ impl Interval {
     ///       to an error.
     pub fn intersect<T: Borrow<Self>>(&self, other: T) -> Result<Option<Self>> {
         let rhs = other.borrow();
-        if self.data_type().ne(&rhs.data_type()) {
-            return internal_err!(
-                "Only intervals with the same data type are intersectable, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            );
-        };
+        let lhs_type = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            lhs_type,
+            rhs_type,
+            "Only intervals with the same data type are intersectable, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
 
         // If it is evident that the result is an empty interval, short-circuit
         // and directly return `None`.
@@ -666,13 +686,15 @@ impl Interval {
     ///       to an error.
     pub fn union<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        if self.data_type().ne(&rhs.data_type()) {
-            return internal_err!(
-                "Cannot calculate the union of intervals with different data types, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            );
-        };
+        let lhs_type = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            lhs_type,
+            rhs_type,
+            "Cannot calculate the union of intervals with different data types, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
 
         let lower = if self.lower.is_null()
             || (!rhs.lower.is_null() && self.lower <= rhs.lower)
@@ -702,27 +724,28 @@ impl Interval {
     pub fn contains_value<T: Borrow<ScalarValue>>(&self, other: T) -> Result<bool> {
         let rhs = other.borrow();
 
-        let (lhs_lower, lhs_upper, rhs) = if self.data_type().eq(&rhs.data_type()) {
-            (&self.lower, &self.upper, rhs)
-        } else if let Some(common_type) =
-            comparison_coercion_numeric(&self.data_type(), &rhs.data_type())
-        {
-            (
-                &self.lower.cast_to(&common_type)?,
-                &self.upper.cast_to(&common_type)?,
-                &rhs.cast_to(&common_type)?,
-            )
+        let (lhs_lower, lhs_upper, rhs_value) = if self.data_type().eq(&rhs.data_type()) {
+            (self.lower.clone(), self.upper.clone(), rhs.clone())
         } else {
-            return internal_err!(
+            let maybe_common_type =
+                comparison_coercion_numeric(&self.data_type(), &rhs.data_type());
+            assert_or_internal_err!(
+                maybe_common_type.is_some(),
                 "Data types must be compatible for containment checks, lhs:{}, rhs:{}",
                 self.data_type(),
                 rhs.data_type()
             );
+            let common_type = maybe_common_type.expect("checked for Some");
+            (
+                self.lower.cast_to(&common_type)?,
+                self.upper.cast_to(&common_type)?,
+                rhs.cast_to(&common_type)?,
+            )
         };
 
         // We only check the upper bound for a `None` value because `None`
         // values are less than `Some` values according to Rust.
-        Ok(lhs_lower <= rhs && (lhs_upper.is_null() || rhs <= lhs_upper))
+        Ok(lhs_lower <= rhs_value && (lhs_upper.is_null() || rhs_value <= lhs_upper))
     }
 
     /// Decide if this interval is a superset of, overlaps with, or
@@ -734,26 +757,38 @@ impl Interval {
     ///       to an error.
     pub fn contains<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        if self.data_type().ne(&rhs.data_type()) {
-            return internal_err!(
-                "Interval data types must match for containment checks, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            );
-        };
+        let lhs_type = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            lhs_type,
+            rhs_type,
+            "Interval data types must match for containment checks, lhs:{}, rhs:{}",
+            self.data_type(),
+            rhs.data_type()
+        );
 
         match self.intersect(rhs)? {
             Some(intersection) => {
                 if &intersection == rhs {
-                    Ok(Self::CERTAINLY_TRUE)
+                    Ok(Self::TRUE)
                 } else {
-                    Ok(Self::UNCERTAIN)
+                    Ok(Self::TRUE_OR_FALSE)
                 }
             }
-            None => Ok(Self::CERTAINLY_FALSE),
+            None => Ok(Self::FALSE),
         }
     }
 
+    /// Decide if this interval is a superset of `other`. If argument `strict`
+    /// is `true`, only returns `true` if this interval is a strict superset.
+    ///
+    /// NOTE: This function only works with intervals of the same data type.
+    ///       Attempting to compare intervals of different data types will lead
+    ///       to an error.
+    pub fn is_superset(&self, other: &Interval, strict: bool) -> Result<bool> {
+        Ok(!(strict && self.eq(other)) && (self.contains(other)? == Interval::TRUE))
+    }
+
     /// Add the given interval (`other`) to this interval. Say we have intervals
     /// `[a1, b1]` and `[a2, b2]`, then their sum is `[a1 + a2, b1 + b2]`. Note
     /// that this represents all possible values the sum can take if one can
@@ -798,15 +833,15 @@ impl Interval {
     ///       to an error.
     pub fn mul<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        let dt = if self.data_type().eq(&rhs.data_type()) {
-            self.data_type()
-        } else {
-            return internal_err!(
-                "Intervals must have the same data type for multiplication, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            );
-        };
+        let dt = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            dt.clone(),
+            rhs_type.clone(),
+            "Intervals must have the same data type for multiplication, lhs:{}, rhs:{}",
+            dt.clone(),
+            rhs_type.clone()
+        );
 
         let zero = ScalarValue::new_zero(&dt)?;
 
@@ -817,12 +852,12 @@ impl Interval {
         ) {
             (true, true, false) => mul_helper_multi_zero_inclusive(&dt, self, rhs),
             (true, false, false) => {
-                mul_helper_single_zero_inclusive(&dt, self, rhs, zero)
+                mul_helper_single_zero_inclusive(&dt, self, rhs, &zero)
             }
             (false, true, false) => {
-                mul_helper_single_zero_inclusive(&dt, rhs, self, zero)
+                mul_helper_single_zero_inclusive(&dt, rhs, self, &zero)
             }
-            _ => mul_helper_zero_exclusive(&dt, self, rhs, zero),
+            _ => mul_helper_zero_exclusive(&dt, self, rhs, &zero),
         };
         Ok(result)
     }
@@ -841,15 +876,15 @@ impl Interval {
     ///           zero should result in an interval set, not the universal set.
     pub fn div<T: Borrow<Self>>(&self, other: T) -> Result<Self> {
         let rhs = other.borrow();
-        let dt = if self.data_type().eq(&rhs.data_type()) {
-            self.data_type()
-        } else {
-            return internal_err!(
-                "Intervals must have the same data type for division, lhs:{}, rhs:{}",
-                self.data_type(),
-                rhs.data_type()
-            );
-        };
+        let dt = self.data_type();
+        let rhs_type = rhs.data_type();
+        assert_eq_or_internal_err!(
+            dt.clone(),
+            rhs_type.clone(),
+            "Intervals must have the same data type for division, lhs:{}, rhs:{}",
+            dt.clone(),
+            rhs_type.clone()
+        );
 
         let zero = ScalarValue::new_zero(&dt)?;
         // We want 0 to be approachable from both negative and positive sides.
@@ -860,15 +895,12 @@ impl Interval {
 
         // Exit early with an unbounded interval if zero is strictly inside the
         // right hand side:
-        if rhs.contains(&zero_point)? == Self::CERTAINLY_TRUE && !dt.is_unsigned_integer()
-        {
+        if rhs.contains(&zero_point)? == Self::TRUE && !dt.is_unsigned_integer() {
             Self::make_unbounded(&dt)
         }
         // At this point, we know that only one endpoint of the right hand side
         // can be zero.
-        else if self.contains(&zero_point)? == Self::CERTAINLY_TRUE
-            && !dt.is_unsigned_integer()
-        {
+        else if self.contains(&zero_point)? == Self::TRUE && !dt.is_unsigned_integer() {
             Ok(div_helper_lhs_zero_inclusive(&dt, self, rhs, &zero_point))
         } else {
             Ok(div_helper_zero_exclusive(&dt, self, rhs, &zero_point))
@@ -1124,10 +1156,20 @@ fn handle_overflow<const UPPER: bool>(
     match (UPPER, positive_sign) {
         (true, true) | (false, false) => ScalarValue::try_from(dt).unwrap(),
         (true, false) => {
-            get_extreme_value!(MIN, dt)
+            get_extreme_value!(
+                MIN,
+                MIN_DECIMAL128_FOR_EACH_PRECISION,
+                MIN_DECIMAL256_FOR_EACH_PRECISION,
+                dt
+            )
         }
         (false, true) => {
-            get_extreme_value!(MAX, dt)
+            get_extreme_value!(
+                MAX,
+                MAX_DECIMAL128_FOR_EACH_PRECISION,
+                MAX_DECIMAL256_FOR_EACH_PRECISION,
+                dt
+            )
         }
     }
 }
@@ -1302,13 +1344,15 @@ pub fn satisfy_greater(
     right: &Interval,
     strict: bool,
 ) -> Result<Option<(Interval, Interval)>> {
-    if left.data_type().ne(&right.data_type()) {
-        return internal_err!(
-            "Intervals must have the same data type, lhs:{}, rhs:{}",
-            left.data_type(),
-            right.data_type()
-        );
-    }
+    let lhs_type = left.data_type();
+    let rhs_type = right.data_type();
+    assert_eq_or_internal_err!(
+        lhs_type.clone(),
+        rhs_type.clone(),
+        "Intervals must have the same data type, lhs:{}, rhs:{}",
+        lhs_type,
+        rhs_type
+    );
 
     if !left.upper.is_null() && left.upper <= right.lower {
         if !strict && left.upper == right.lower {
@@ -1422,10 +1466,10 @@ fn mul_helper_single_zero_inclusive(
     dt: &DataType,
     lhs: &Interval,
     rhs: &Interval,
-    zero: ScalarValue,
+    zero: &ScalarValue,
 ) -> Interval {
     // With the following interval bounds, there is no possibility to create an invalid interval.
-    if rhs.upper <= zero && !rhs.upper.is_null() {
+    if rhs.upper <= *zero && !rhs.upper.is_null() {
         // <-------=====0=====------->
         // <--======----0------------>
         let lower = mul_bounds::<false>(dt, &lhs.upper, &rhs.lower);
@@ -1474,11 +1518,11 @@ fn mul_helper_zero_exclusive(
     dt: &DataType,
     lhs: &Interval,
     rhs: &Interval,
-    zero: ScalarValue,
+    zero: &ScalarValue,
 ) -> Interval {
     let (lower, upper) = match (
-        lhs.upper <= zero && !lhs.upper.is_null(),
-        rhs.upper <= zero && !rhs.upper.is_null(),
+        lhs.upper <= *zero && !lhs.upper.is_null(),
+        rhs.upper <= *zero && !rhs.upper.is_null(),
     ) {
         // With the following interval bounds, there is no possibility to create an invalid interval.
         (true, true) => (
@@ -1659,22 +1703,23 @@ fn cast_scalar_value(
 ///
 /// // [1, 2) U {NULL}
 /// let maybe_null = NullableInterval::MaybeNull {
-///    values: Interval::try_new(
-///            ScalarValue::Int32(Some(1)),
-///            ScalarValue::Int32(Some(2)),
-///        ).unwrap(),
+///     values: Interval::try_new(
+///         ScalarValue::Int32(Some(1)),
+///         ScalarValue::Int32(Some(2)),
+///     )
+///     .unwrap(),
 /// };
 ///
 /// // (0, ∞)
 /// let not_null = NullableInterval::NotNull {
-///   values: Interval::try_new(
-///            ScalarValue::Int32(Some(0)),
-///            ScalarValue::Int32(None),
-///        ).unwrap(),
+///     values: Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(None))
+///         .unwrap(),
 /// };
 ///
 /// // {NULL}
-/// let null_interval = NullableInterval::Null { datatype: DataType::Int32 };
+/// let null_interval = NullableInterval::Null {
+///     datatype: DataType::Int32,
+/// };
 ///
 /// // {4}
 /// let single_value = NullableInterval::from(ScalarValue::Int32(Some(4)));
@@ -1722,6 +1767,44 @@ impl From<ScalarValue> for NullableInterval {
 }
 
 impl NullableInterval {
+    /// An interval containing only the 'false' truth value.
+    /// This interval is semantically equivalent to [Interval::FALSE].
+    pub const FALSE: Self = NullableInterval::NotNull {
+        values: Interval::FALSE,
+    };
+
+    /// An interval containing only the 'true' truth value.
+    /// This interval is semantically equivalent to [Interval::TRUE].
+    pub const TRUE: Self = NullableInterval::NotNull {
+        values: Interval::TRUE,
+    };
+
+    /// An interval containing only the 'unknown' truth value.
+    pub const UNKNOWN: Self = NullableInterval::Null {
+        datatype: DataType::Boolean,
+    };
+
+    /// An interval containing both the 'true', and 'false' truth values.
+    /// This interval is semantically equivalent to [Interval::TRUE_OR_FALSE].
+    pub const TRUE_OR_FALSE: Self = NullableInterval::NotNull {
+        values: Interval::TRUE_OR_FALSE,
+    };
+
+    /// An interval containing both the 'true' and 'unknown' truth values.
+    pub const TRUE_OR_UNKNOWN: Self = NullableInterval::MaybeNull {
+        values: Interval::TRUE,
+    };
+
+    /// An interval containing both the 'false' and 'unknown' truth values.
+    pub const FALSE_OR_UNKNOWN: Self = NullableInterval::MaybeNull {
+        values: Interval::FALSE,
+    };
+
+    /// An interval that contains all possible truth values: 'true', 'false' and 'unknown'.
+    pub const ANY_TRUTH_VALUE: Self = NullableInterval::MaybeNull {
+        values: Interval::TRUE_OR_FALSE,
+    };
+
     /// Get the values interval, or None if this interval is definitely null.
     pub fn values(&self) -> Option<&Interval> {
         match self {
@@ -1740,27 +1823,98 @@ impl NullableInterval {
 
     /// Return true if the value is definitely true (and not null).
     pub fn is_certainly_true(&self) -> bool {
-        match self {
-            Self::Null { .. } | Self::MaybeNull { .. } => false,
-            Self::NotNull { values } => values == &Interval::CERTAINLY_TRUE,
+        self == &Self::TRUE
+    }
+
+    /// Returns the set of possible values after applying the `is true` test on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'.
+    pub fn is_true(&self) -> Result<Self> {
+        let (t, f, u) = self.is_true_false_unknown()?;
+
+        match (t, f, u) {
+            (true, false, false) => Ok(Self::TRUE),
+            (true, _, _) => Ok(Self::TRUE_OR_FALSE),
+            (false, _, _) => Ok(Self::FALSE),
         }
     }
 
     /// Return true if the value is definitely false (and not null).
     pub fn is_certainly_false(&self) -> bool {
-        match self {
-            Self::Null { .. } => false,
-            Self::MaybeNull { .. } => false,
-            Self::NotNull { values } => values == &Interval::CERTAINLY_FALSE,
+        self == &Self::FALSE
+    }
+
+    /// Returns the set of possible values after applying the `is false` test on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'.
+    pub fn is_false(&self) -> Result<Self> {
+        let (t, f, u) = self.is_true_false_unknown()?;
+
+        match (t, f, u) {
+            (false, true, false) => Ok(Self::TRUE),
+            (_, true, _) => Ok(Self::TRUE_OR_FALSE),
+            (_, false, _) => Ok(Self::FALSE),
+        }
+    }
+
+    /// Return true if the value is definitely null (and not true or false).
+    pub fn is_certainly_unknown(&self) -> bool {
+        self == &Self::UNKNOWN
+    }
+
+    /// Returns the set of possible values after applying the `is unknown` test on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'.
+    pub fn is_unknown(&self) -> Result<Self> {
+        let (t, f, u) = self.is_true_false_unknown()?;
+
+        match (t, f, u) {
+            (false, false, true) => Ok(Self::TRUE),
+            (_, _, true) => Ok(Self::TRUE_OR_FALSE),
+            (_, _, false) => Ok(Self::FALSE),
         }
     }
 
-    /// Perform logical negation on a boolean nullable interval.
-    fn not(&self) -> Result<Self> {
+    /// Returns a tuple of booleans indicating if this interval contains the
+    /// true, false, and unknown truth values respectively.
+    fn is_true_false_unknown(&self) -> Result<(bool, bool, bool), DataFusionError> {
+        Ok(match self {
+            NullableInterval::Null { .. } => (false, false, true),
+            NullableInterval::MaybeNull { values } => (
+                values.contains_value(ScalarValue::Boolean(Some(true)))?,
+                values.contains_value(ScalarValue::Boolean(Some(false)))?,
+                true,
+            ),
+            NullableInterval::NotNull { values } => (
+                values.contains_value(ScalarValue::Boolean(Some(true)))?,
+                values.contains_value(ScalarValue::Boolean(Some(false)))?,
+                false,
+            ),
+        })
+    }
+
+    /// Returns an interval representing the set of possible values after applying
+    /// SQL three-valued logical NOT on possible value in this interval.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    ///  A  | ¬A
+    /// ----|----
+    ///  F  |  T
+    ///  U  |  U
+    ///  T  |  F
+    /// ```
+    pub fn not(&self) -> Result<Self> {
         match self {
-            Self::Null { datatype } => Ok(Self::Null {
-                datatype: datatype.clone(),
-            }),
+            Self::Null { datatype } => {
+                assert_eq_or_internal_err!(
+                    datatype,
+                    &DataType::Boolean,
+                    "Cannot apply logical negation to a non-boolean interval"
+                );
+                Ok(Self::UNKNOWN)
+            }
             Self::MaybeNull { values } => Ok(Self::MaybeNull {
                 values: values.not()?,
             }),
@@ -1770,28 +1924,112 @@ impl NullableInterval {
         }
     }
 
+    /// Returns an interval representing the set of possible values after applying SQL
+    /// three-valued logical AND on each combination of possible values from `self` and `other`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    ///       │   B
+    /// A ∧ B ├──────
+    ///       │ F U T
+    /// ──┬───┼──────
+    ///   │ F │ F F F
+    /// A │ U │ F U U
+    ///   │ T │ F U T
+    /// ```
+    pub fn and<T: Borrow<Self>>(&self, rhs: T) -> Result<Self> {
+        if self == &Self::FALSE || rhs.borrow() == &Self::FALSE {
+            return Ok(Self::FALSE);
+        }
+
+        match (self.values(), rhs.borrow().values()) {
+            (Some(l), Some(r)) => {
+                let values = l.and(r)?;
+                match (self, rhs.borrow()) {
+                    (Self::NotNull { .. }, Self::NotNull { .. }) => {
+                        Ok(Self::NotNull { values })
+                    }
+                    _ => Ok(Self::MaybeNull { values }),
+                }
+            }
+            (Some(v), None) | (None, Some(v)) => {
+                if v.contains_value(ScalarValue::Boolean(Some(false)))? {
+                    Ok(Self::FALSE_OR_UNKNOWN)
+                } else {
+                    Ok(Self::UNKNOWN)
+                }
+            }
+            _ => Ok(Self::UNKNOWN),
+        }
+    }
+
+    /// Returns an interval representing the set of possible values after applying SQL three-valued
+    /// logical OR on each combination of possible values from `self` and `other`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    ///       │   B
+    /// A ∨ B ├──────
+    ///       │ F U T
+    /// ──┬───┼──────
+    ///   │ F │ F U T
+    /// A │ U │ U U T
+    ///   │ T │ T T T
+    /// ```
+    pub fn or<T: Borrow<Self>>(&self, rhs: T) -> Result<Self> {
+        if self == &Self::TRUE || rhs.borrow() == &Self::TRUE {
+            return Ok(Self::TRUE);
+        }
+
+        match (self.values(), rhs.borrow().values()) {
+            (Some(l), Some(r)) => {
+                let values = l.or(r)?;
+                match (self, rhs.borrow()) {
+                    (Self::NotNull { .. }, Self::NotNull { .. }) => {
+                        Ok(Self::NotNull { values })
+                    }
+                    _ => Ok(Self::MaybeNull { values }),
+                }
+            }
+            (Some(v), None) | (None, Some(v)) => {
+                if v.contains_value(ScalarValue::Boolean(Some(true)))? {
+                    Ok(Self::TRUE_OR_UNKNOWN)
+                } else {
+                    Ok(Self::UNKNOWN)
+                }
+            }
+            _ => Ok(Self::UNKNOWN),
+        }
+    }
+
     /// Apply the given operator to this interval and the given interval.
     ///
     /// # Examples
     ///
     /// ```
     /// use datafusion_common::ScalarValue;
-    /// use datafusion_expr_common::operator::Operator;
     /// use datafusion_expr_common::interval_arithmetic::Interval;
     /// use datafusion_expr_common::interval_arithmetic::NullableInterval;
+    /// use datafusion_expr_common::operator::Operator;
     ///
     /// // 4 > 3 -> true
     /// let lhs = NullableInterval::from(ScalarValue::Int32(Some(4)));
     /// let rhs = NullableInterval::from(ScalarValue::Int32(Some(3)));
     /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap();
-    /// assert_eq!(result, NullableInterval::from(ScalarValue::Boolean(Some(true))));
+    /// assert_eq!(
+    ///     result,
+    ///     NullableInterval::from(ScalarValue::Boolean(Some(true)))
+    /// );
     ///
     /// // [1, 3) > NULL -> NULL
     /// let lhs = NullableInterval::NotNull {
     ///     values: Interval::try_new(
-    ///            ScalarValue::Int32(Some(1)),
-    ///            ScalarValue::Int32(Some(3)),
-    ///        ).unwrap(),
+    ///         ScalarValue::Int32(Some(1)),
+    ///         ScalarValue::Int32(Some(3)),
+    ///     )
+    ///     .unwrap(),
     /// };
     /// let rhs = NullableInterval::from(ScalarValue::Int32(None));
     /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap();
@@ -1800,29 +2038,34 @@ impl NullableInterval {
     /// // [1, 3] > [2, 4] -> [false, true]
     /// let lhs = NullableInterval::NotNull {
     ///     values: Interval::try_new(
-    ///            ScalarValue::Int32(Some(1)),
-    ///            ScalarValue::Int32(Some(3)),
-    ///        ).unwrap(),
+    ///         ScalarValue::Int32(Some(1)),
+    ///         ScalarValue::Int32(Some(3)),
+    ///     )
+    ///     .unwrap(),
     /// };
     /// let rhs = NullableInterval::NotNull {
-    ///    values: Interval::try_new(
-    ///            ScalarValue::Int32(Some(2)),
-    ///            ScalarValue::Int32(Some(4)),
-    ///        ).unwrap(),
+    ///     values: Interval::try_new(
+    ///         ScalarValue::Int32(Some(2)),
+    ///         ScalarValue::Int32(Some(4)),
+    ///     )
+    ///     .unwrap(),
     /// };
     /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap();
     /// // Both inputs are valid (non-null), so result must be non-null
-    /// assert_eq!(result, NullableInterval::NotNull {
-    /// // Uncertain whether inequality is true or false
-    ///    values: Interval::UNCERTAIN,
-    /// });
+    /// assert_eq!(
+    ///     result,
+    ///     NullableInterval::NotNull {
+    ///         // Uncertain whether inequality is true or false
+    ///         values: Interval::TRUE_OR_FALSE,
+    ///     }
+    /// );
     /// ```
     pub fn apply_operator(&self, op: &Operator, rhs: &Self) -> Result<Self> {
         match op {
             Operator::IsDistinctFrom => {
                 let values = match (self, rhs) {
                     // NULL is distinct from NULL -> False
-                    (Self::Null { .. }, Self::Null { .. }) => Interval::CERTAINLY_FALSE,
+                    (Self::Null { .. }, Self::Null { .. }) => Interval::FALSE,
                     // x is distinct from y -> x != y,
                     // if at least one of them is never null.
                     (Self::NotNull { .. }, _) | (_, Self::NotNull { .. }) => {
@@ -1832,11 +2075,11 @@ impl NullableInterval {
                             (Some(lhs_values), Some(rhs_values)) => {
                                 lhs_values.equal(rhs_values)?.not()?
                             }
-                            (Some(_), None) | (None, Some(_)) => Interval::CERTAINLY_TRUE,
+                            (Some(_), None) | (None, Some(_)) => Interval::TRUE,
                             (None, None) => unreachable!("Null case handled above"),
                         }
                     }
-                    _ => Interval::UNCERTAIN,
+                    _ => Interval::TRUE_OR_FALSE,
                 };
                 // IsDistinctFrom never returns null.
                 Ok(Self::NotNull { values })
@@ -1844,6 +2087,8 @@ impl NullableInterval {
             Operator::IsNotDistinctFrom => self
                 .apply_operator(&Operator::IsDistinctFrom, rhs)
                 .map(|i| i.not())?,
+            Operator::And => self.and(rhs),
+            Operator::Or => self.or(rhs),
             _ => {
                 if let (Some(left_values), Some(right_values)) =
                     (self.values(), rhs.values())
@@ -1893,6 +2138,30 @@ impl NullableInterval {
         }
     }
 
+    /// Determines if this interval contains a [`ScalarValue`] or not.
+    pub fn contains_value<T: Borrow<ScalarValue>>(&self, value: T) -> Result<bool> {
+        match value.borrow() {
+            ScalarValue::Null => match self {
+                NullableInterval::Null { .. } | NullableInterval::MaybeNull { .. } => {
+                    Ok(true)
+                }
+                NullableInterval::NotNull { .. } => Ok(false),
+            },
+            s if s.is_null() => match self {
+                NullableInterval::Null { datatype } => Ok(datatype.eq(&s.data_type())),
+                NullableInterval::MaybeNull { values } => {
+                    Ok(values.data_type().eq(&s.data_type()))
+                }
+                NullableInterval::NotNull { .. } => Ok(false),
+            },
+            s => match self {
+                NullableInterval::Null { .. } => Ok(false),
+                NullableInterval::MaybeNull { values }
+                | NullableInterval::NotNull { values } => values.contains_value(s),
+            },
+        }
+    }
+
     /// If the interval has collapsed to a single value, return that value.
     /// Otherwise, returns `None`.
     ///
@@ -1913,7 +2182,8 @@ impl NullableInterval {
     ///     values: Interval::try_new(
     ///         ScalarValue::Int32(Some(1)),
     ///         ScalarValue::Int32(Some(4)),
-    ///     ).unwrap(),
+    ///     )
+    ///     .unwrap(),
     /// };
     /// assert_eq!(interval.single_value(), None);
     /// ```
@@ -1936,11 +2206,12 @@ impl NullableInterval {
 mod tests {
     use crate::{
         interval_arithmetic::{
-            handle_overflow, next_value, prev_value, satisfy_greater, Interval,
+            Interval, handle_overflow, next_value, prev_value, satisfy_greater,
         },
         operator::Operator,
     };
 
+    use crate::interval_arithmetic::NullableInterval;
     use arrow::datatypes::DataType;
     use datafusion_common::rounding::{next_down, next_up};
     use datafusion_common::{Result, ScalarValue};
@@ -1981,10 +2252,12 @@ mod tests {
             ScalarValue::Float64(Some(1e-6)),
         ];
         values.into_iter().zip(eps).for_each(|(value, eps)| {
-            assert!(next_value(value.clone())
-                .sub(value.clone())
-                .unwrap()
-                .lt(&eps));
+            assert!(
+                next_value(value.clone())
+                    .sub(value.clone())
+                    .unwrap()
+                    .lt(&eps)
+            );
             assert!(value.sub(prev_value(value.clone())).unwrap().lt(&eps));
             assert_ne!(next_value(value.clone()), value);
             assert_ne!(prev_value(value.clone()), value);
@@ -2162,8 +2435,8 @@ mod tests {
             ),
         ];
         for (first, second) in exactly_gt_cases {
-            assert_eq!(first.gt(second.clone())?, Interval::CERTAINLY_TRUE);
-            assert_eq!(second.lt(first)?, Interval::CERTAINLY_TRUE);
+            assert_eq!(first.gt(second.clone())?, Interval::TRUE);
+            assert_eq!(second.lt(first)?, Interval::TRUE);
         }
 
         let possibly_gt_cases = vec![
@@ -2199,8 +2472,8 @@ mod tests {
             ),
         ];
         for (first, second) in possibly_gt_cases {
-            assert_eq!(first.gt(second.clone())?, Interval::UNCERTAIN);
-            assert_eq!(second.lt(first)?, Interval::UNCERTAIN);
+            assert_eq!(first.gt(second.clone())?, Interval::TRUE_OR_FALSE);
+            assert_eq!(second.lt(first)?, Interval::TRUE_OR_FALSE);
         }
 
         let not_gt_cases = vec![
@@ -2236,8 +2509,8 @@ mod tests {
             ),
         ];
         for (first, second) in not_gt_cases {
-            assert_eq!(first.gt(second.clone())?, Interval::CERTAINLY_FALSE);
-            assert_eq!(second.lt(first)?, Interval::CERTAINLY_FALSE);
+            assert_eq!(first.gt(second.clone())?, Interval::FALSE);
+            assert_eq!(second.lt(first)?, Interval::FALSE);
         }
 
         Ok(())
@@ -2282,8 +2555,8 @@ mod tests {
             ),
         ];
         for (first, second) in exactly_gteq_cases {
-            assert_eq!(first.gt_eq(second.clone())?, Interval::CERTAINLY_TRUE);
-            assert_eq!(second.lt_eq(first)?, Interval::CERTAINLY_TRUE);
+            assert_eq!(first.gt_eq(second.clone())?, Interval::TRUE);
+            assert_eq!(second.lt_eq(first)?, Interval::TRUE);
         }
 
         let possibly_gteq_cases = vec![
@@ -2319,8 +2592,8 @@ mod tests {
             ),
         ];
         for (first, second) in possibly_gteq_cases {
-            assert_eq!(first.gt_eq(second.clone())?, Interval::UNCERTAIN);
-            assert_eq!(second.lt_eq(first)?, Interval::UNCERTAIN);
+            assert_eq!(first.gt_eq(second.clone())?, Interval::TRUE_OR_FALSE);
+            assert_eq!(second.lt_eq(first)?, Interval::TRUE_OR_FALSE);
         }
 
         let not_gteq_cases = vec![
@@ -2352,8 +2625,8 @@ mod tests {
             ),
         ];
         for (first, second) in not_gteq_cases {
-            assert_eq!(first.gt_eq(second.clone())?, Interval::CERTAINLY_FALSE);
-            assert_eq!(second.lt_eq(first)?, Interval::CERTAINLY_FALSE);
+            assert_eq!(first.gt_eq(second.clone())?, Interval::FALSE);
+            assert_eq!(second.lt_eq(first)?, Interval::FALSE);
         }
 
         Ok(())
@@ -2380,8 +2653,8 @@ mod tests {
             ),
         ];
         for (first, second) in exactly_eq_cases {
-            assert_eq!(first.equal(second.clone())?, Interval::CERTAINLY_TRUE);
-            assert_eq!(second.equal(first)?, Interval::CERTAINLY_TRUE);
+            assert_eq!(first.equal(second.clone())?, Interval::TRUE);
+            assert_eq!(second.equal(first)?, Interval::TRUE);
         }
 
         let possibly_eq_cases = vec![
@@ -2417,8 +2690,8 @@ mod tests {
             ),
         ];
         for (first, second) in possibly_eq_cases {
-            assert_eq!(first.equal(second.clone())?, Interval::UNCERTAIN);
-            assert_eq!(second.equal(first)?, Interval::UNCERTAIN);
+            assert_eq!(first.equal(second.clone())?, Interval::TRUE_OR_FALSE);
+            assert_eq!(second.equal(first)?, Interval::TRUE_OR_FALSE);
         }
 
         let not_eq_cases = vec![
@@ -2450,8 +2723,8 @@ mod tests {
             ),
         ];
         for (first, second) in not_eq_cases {
-            assert_eq!(first.equal(second.clone())?, Interval::CERTAINLY_FALSE);
-            assert_eq!(second.equal(first)?, Interval::CERTAINLY_FALSE);
+            assert_eq!(first.equal(second.clone())?, Interval::FALSE);
+            assert_eq!(second.equal(first)?, Interval::FALSE);
         }
 
         Ok(())
@@ -2460,41 +2733,182 @@ mod tests {
     #[test]
     fn and_test() -> Result<()> {
         let cases = vec![
-            (false, true, false, false, false, false),
-            (false, false, false, true, false, false),
-            (false, true, false, true, false, true),
-            (false, true, true, true, false, true),
-            (false, false, false, false, false, false),
-            (true, true, true, true, true, true),
+            (Interval::TRUE_OR_FALSE, Interval::FALSE, Interval::FALSE),
+            (
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (Interval::FALSE, Interval::FALSE, Interval::FALSE),
+            (Interval::FALSE, Interval::TRUE_OR_FALSE, Interval::FALSE),
+            (Interval::FALSE, Interval::TRUE, Interval::FALSE),
+            (Interval::TRUE, Interval::FALSE, Interval::FALSE),
+            (
+                Interval::TRUE,
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (Interval::TRUE, Interval::TRUE, Interval::TRUE),
         ];
 
         for case in cases {
             assert_eq!(
-                Interval::make(Some(case.0), Some(case.1))?
-                    .and(Interval::make(Some(case.2), Some(case.3))?)?,
-                Interval::make(Some(case.4), Some(case.5))?
+                case.0.and(&case.1)?,
+                case.2,
+                "Failed for {} AND {}",
+                case.0,
+                case.1
             );
         }
         Ok(())
     }
 
     #[test]
-    fn not_test() -> Result<()> {
+    fn or_test() -> Result<()> {
         let cases = vec![
-            (false, true, false, true),
-            (false, false, true, true),
-            (true, true, false, false),
+            (
+                Interval::TRUE_OR_FALSE,
+                Interval::FALSE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (Interval::TRUE_OR_FALSE, Interval::TRUE, Interval::TRUE),
+            (Interval::FALSE, Interval::FALSE, Interval::FALSE),
+            (
+                Interval::FALSE,
+                Interval::TRUE_OR_FALSE,
+                Interval::TRUE_OR_FALSE,
+            ),
+            (Interval::FALSE, Interval::TRUE, Interval::TRUE),
+            (Interval::TRUE, Interval::FALSE, Interval::TRUE),
+            (Interval::TRUE, Interval::TRUE_OR_FALSE, Interval::TRUE),
+            (Interval::TRUE, Interval::TRUE, Interval::TRUE),
         ];
 
         for case in cases {
             assert_eq!(
-                Interval::make(Some(case.0), Some(case.1))?.not()?,
-                Interval::make(Some(case.2), Some(case.3))?
+                case.0.or(&case.1)?,
+                case.2,
+                "Failed for {} OR {}",
+                case.0,
+                case.1
             );
         }
         Ok(())
     }
 
+    #[test]
+    fn not_test() -> Result<()> {
+        let cases = vec![
+            (Interval::TRUE_OR_FALSE, Interval::TRUE_OR_FALSE),
+            (Interval::FALSE, Interval::TRUE),
+            (Interval::TRUE, Interval::FALSE),
+        ];
+
+        for case in cases {
+            assert_eq!(case.0.not()?, case.1, "Failed for NOT {}", case.0);
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn test_and_or_with_normalized_boolean_intervals() -> Result<()> {
+        // Verify that NULL boolean bounds are normalized and don't cause errors
+        let from_nulls =
+            Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))?;
+
+        assert!(from_nulls.or(&Interval::TRUE).is_ok());
+        assert!(from_nulls.and(&Interval::FALSE).is_ok());
+
+        Ok(())
+    }
+
+    // Tests that there's no such thing as a 'null' boolean interval.
+    // An interval with two `Boolean(None)` boundaries is normalised to `Interval::TRUE_OR_FALSE`.
+    #[test]
+    fn test_null_boolean_interval() {
+        let null_interval =
+            Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))
+                .unwrap();
+
+        assert_eq!(null_interval, Interval::TRUE_OR_FALSE);
+    }
+
+    // Asserts that `Interval::TRUE_OR_FALSE` represents a set that contains `true`, `false`, and does
+    // not contain `null`.
+    #[test]
+    fn test_uncertain_boolean_interval() {
+        assert!(
+            Interval::TRUE_OR_FALSE
+                .contains_value(ScalarValue::Boolean(Some(true)))
+                .unwrap()
+        );
+        assert!(
+            Interval::TRUE_OR_FALSE
+                .contains_value(ScalarValue::Boolean(Some(false)))
+                .unwrap()
+        );
+        assert!(
+            !Interval::TRUE_OR_FALSE
+                .contains_value(ScalarValue::Boolean(None))
+                .unwrap()
+        );
+        assert!(
+            !Interval::TRUE_OR_FALSE
+                .contains_value(ScalarValue::Null)
+                .unwrap()
+        );
+    }
+
+    #[test]
+    fn test_and_uncertain_boolean_intervals() -> Result<()> {
+        let and_result = Interval::TRUE_OR_FALSE.and(&Interval::FALSE)?;
+        assert_eq!(and_result, Interval::FALSE);
+
+        let and_result = Interval::FALSE.and(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(and_result, Interval::FALSE);
+
+        let and_result = Interval::TRUE_OR_FALSE.and(&Interval::TRUE)?;
+        assert_eq!(and_result, Interval::TRUE_OR_FALSE);
+
+        let and_result = Interval::TRUE.and(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(and_result, Interval::TRUE_OR_FALSE);
+
+        let and_result = Interval::TRUE_OR_FALSE.and(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(and_result, Interval::TRUE_OR_FALSE);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_or_uncertain_boolean_intervals() -> Result<()> {
+        let or_result = Interval::TRUE_OR_FALSE.or(&Interval::FALSE)?;
+        assert_eq!(or_result, Interval::TRUE_OR_FALSE);
+
+        let or_result = Interval::FALSE.or(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(or_result, Interval::TRUE_OR_FALSE);
+
+        let or_result = Interval::TRUE_OR_FALSE.or(&Interval::TRUE)?;
+        assert_eq!(or_result, Interval::TRUE);
+
+        let or_result = Interval::TRUE.or(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(or_result, Interval::TRUE);
+
+        let or_result = Interval::TRUE_OR_FALSE.or(&Interval::TRUE_OR_FALSE)?;
+        assert_eq!(or_result, Interval::TRUE_OR_FALSE);
+
+        Ok(())
+    }
+
     #[test]
     fn intersect_test() -> Result<()> {
         let possible_cases = vec![
@@ -2733,37 +3147,37 @@ mod tests {
             (
                 Interval::make::<i64>(None, None)?,
                 Interval::make::<i64>(None, None)?,
-                Interval::CERTAINLY_TRUE,
+                Interval::TRUE,
             ),
             (
                 Interval::make(Some(1500_i64), Some(2000_i64))?,
                 Interval::make(Some(1501_i64), Some(1999_i64))?,
-                Interval::CERTAINLY_TRUE,
+                Interval::TRUE,
             ),
             (
                 Interval::make(Some(1000_i64), None)?,
                 Interval::make::<i64>(None, None)?,
-                Interval::UNCERTAIN,
+                Interval::TRUE_OR_FALSE,
             ),
             (
                 Interval::make(Some(1000_i64), Some(2000_i64))?,
                 Interval::make(Some(500), Some(1500_i64))?,
-                Interval::UNCERTAIN,
+                Interval::TRUE_OR_FALSE,
             ),
             (
                 Interval::make(Some(16.0), Some(32.0))?,
                 Interval::make(Some(32.0), Some(64.0))?,
-                Interval::UNCERTAIN,
+                Interval::TRUE_OR_FALSE,
             ),
             (
                 Interval::make(Some(1000_i64), None)?,
                 Interval::make(None, Some(0_i64))?,
-                Interval::CERTAINLY_FALSE,
+                Interval::FALSE,
             ),
             (
                 Interval::make(Some(1500_i64), Some(2000_i64))?,
                 Interval::make(Some(1000_i64), Some(1499_i64))?,
-                Interval::CERTAINLY_FALSE,
+                Interval::FALSE,
             ),
             (
                 Interval::try_new(
@@ -2771,7 +3185,7 @@ mod tests {
                     prev_value(ScalarValue::Float32(Some(1.0))),
                 )?,
                 Interval::make(Some(1.0_f32), Some(1.0_f32))?,
-                Interval::CERTAINLY_FALSE,
+                Interval::FALSE,
             ),
             (
                 Interval::try_new(
@@ -2779,7 +3193,7 @@ mod tests {
                     next_value(ScalarValue::Float32(Some(1.0))),
                 )?,
                 Interval::make(Some(1.0_f32), Some(1.0_f32))?,
-                Interval::CERTAINLY_FALSE,
+                Interval::FALSE,
             ),
         ];
         for (first, second, expected) in possible_cases {
@@ -3525,7 +3939,7 @@ mod tests {
         assert_eq!(interval.cardinality().unwrap(), 9178336040581070850);
 
         let interval = Interval::try_new(
-            ScalarValue::UInt64(Some(u64::MIN + 1)),
+            ScalarValue::UInt64(Some(1)),
             ScalarValue::UInt64(Some(u64::MAX)),
         )?;
         assert_eq!(interval.cardinality().unwrap(), u64::MAX);
@@ -3687,6 +4101,76 @@ mod tests {
                 Interval::make(Some(-500.0_f64), Some(1000.0_f64))?,
                 Interval::make(Some(-500.0_f64), Some(500.0_f64))?,
             ),
+            (
+                Interval::make(Some(0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+                true,
+                Interval::make(Some(0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+            ),
+            (
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(-0_i64))?,
+                true,
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(-0_i64))?,
+            ),
+            (
+                Interval::make(Some(0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+                true,
+                Interval::make(Some(0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+            ),
+            (
+                Interval::make(Some(0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+                false,
+                Interval::make(Some(0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(-0.0_f64))?,
+            ),
+            (
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(-0.0_f64))?,
+                true,
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(-0.0_f64))?,
+            ),
+            (
+                Interval::make(Some(-0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(-0.0_f64))?,
+                false,
+                Interval::make(Some(0.0_f64), Some(0.0_f64))?,
+                Interval::make(Some(-0.0_f64), Some(-0.0_f64))?,
+            ),
+            (
+                Interval::make(Some(0_i64), None)?,
+                Interval::make(Some(-0_i64), None)?,
+                true,
+                Interval::make(Some(0_i64), None)?,
+                Interval::make(Some(-0_i64), None)?,
+            ),
+            (
+                Interval::make(Some(0_i64), None)?,
+                Interval::make(Some(-0_i64), None)?,
+                false,
+                Interval::make(Some(1_i64), None)?,
+                Interval::make(Some(-0_i64), None)?,
+            ),
+            (
+                Interval::make(Some(0.0_f64), None)?,
+                Interval::make(Some(-0.0_f64), None)?,
+                true,
+                Interval::make(Some(0.0_f64), None)?,
+                Interval::make(Some(-0.0_f64), None)?,
+            ),
+            (
+                Interval::make(Some(0.0_f64), None)?,
+                Interval::make(Some(-0.0_f64), None)?,
+                false,
+                Interval::make(Some(0.0_f64), None)?,
+                Interval::make(Some(-0.0_f64), None)?,
+            ),
         ];
         for (first, second, includes_endpoints, left_modified, right_modified) in cases {
             assert_eq!(
@@ -3706,6 +4190,16 @@ mod tests {
                 Interval::make(Some(1500.0_f32), Some(2000.0_f32))?,
                 false,
             ),
+            (
+                Interval::make(Some(0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+                false,
+            ),
+            (
+                Interval::make(Some(-0_i64), Some(0_i64))?,
+                Interval::make(Some(-0_i64), Some(-0_i64))?,
+                false,
+            ),
         ];
         for (first, second, includes_endpoints) in infeasible_cases {
             assert_eq!(satisfy_greater(&first, &second, !includes_endpoints)?, None);
@@ -3728,12 +4222,8 @@ mod tests {
     }
 
     macro_rules! capture_mode_change {
-        ($TYPE:ty) => {
-            paste::item! {
-                capture_mode_change_helper!([<capture_mode_change_ $TYPE>],
-                                            [<create_interval_ $TYPE>],
-                                            $TYPE);
-            }
+        ($TYPE:ty, $TEST_FN_NAME:ident, $CREATE_FN_NAME:ident) => {
+            capture_mode_change_helper!($TEST_FN_NAME, $CREATE_FN_NAME, $TYPE);
         };
     }
 
@@ -3761,8 +4251,8 @@ mod tests {
         };
     }
 
-    capture_mode_change!(f32);
-    capture_mode_change!(f64);
+    capture_mode_change!(f32, capture_mode_change_f32, create_interval_f32);
+    capture_mode_change!(f64, capture_mode_change_f64, create_interval_f64);
 
     #[cfg(all(
         any(target_arch = "x86_64", target_arch = "aarch64"),
@@ -3805,4 +4295,465 @@ mod tests {
         let upper = 1.5;
         capture_mode_change_f32((lower, upper), true, true);
     }
+
+    #[test]
+    fn test_is_superset() -> Result<()> {
+        // Test cases: (interval1, interval2, strict, expected)
+        let test_cases = vec![
+            // Equal intervals - non-strict should be true, strict should be false
+            (
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                true,
+                false,
+            ),
+            // Unbounded intervals
+            (
+                Interval::make::<i32>(None, None)?,
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make::<i32>(None, None)?,
+                Interval::make::<i32>(None, None)?,
+                false,
+                true,
+            ),
+            (
+                Interval::make::<i32>(None, None)?,
+                Interval::make::<i32>(None, None)?,
+                true,
+                false,
+            ),
+            // Half-bounded intervals
+            (
+                Interval::make(Some(0_i32), None)?,
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(None, Some(100_i32))?,
+                Interval::make(Some(10_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            // Non-superset cases - partial overlap
+            (
+                Interval::make(Some(0_i32), Some(50_i32))?,
+                Interval::make(Some(25_i32), Some(75_i32))?,
+                false,
+                false,
+            ),
+            (
+                Interval::make(Some(0_i32), Some(50_i32))?,
+                Interval::make(Some(25_i32), Some(75_i32))?,
+                true,
+                false,
+            ),
+            // Non-superset cases - disjoint intervals
+            (
+                Interval::make(Some(0_i32), Some(50_i32))?,
+                Interval::make(Some(60_i32), Some(100_i32))?,
+                false,
+                false,
+            ),
+            // Subset relationship (reversed)
+            (
+                Interval::make(Some(20_i32), Some(80_i32))?,
+                Interval::make(Some(0_i32), Some(100_i32))?,
+                false,
+                false,
+            ),
+            // Float cases
+            (
+                Interval::make(Some(0.0_f32), Some(100.0_f32))?,
+                Interval::make(Some(25.5_f32), Some(75.5_f32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(Some(0.0_f64), Some(100.0_f64))?,
+                Interval::make(Some(0.0_f64), Some(100.0_f64))?,
+                true,
+                false,
+            ),
+            // Edge cases with single point intervals
+            (
+                Interval::make(Some(0_i32), Some(100_i32))?,
+                Interval::make(Some(50_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(Some(50_i32), Some(50_i32))?,
+                Interval::make(Some(50_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(Some(50_i32), Some(50_i32))?,
+                Interval::make(Some(50_i32), Some(50_i32))?,
+                true,
+                false,
+            ),
+            // Boundary touch cases
+            (
+                Interval::make(Some(0_i32), Some(50_i32))?,
+                Interval::make(Some(0_i32), Some(25_i32))?,
+                false,
+                true,
+            ),
+            (
+                Interval::make(Some(0_i32), Some(50_i32))?,
+                Interval::make(Some(25_i32), Some(50_i32))?,
+                false,
+                true,
+            ),
+        ];
+
+        for (interval1, interval2, strict, expected) in test_cases {
+            let result = interval1.is_superset(&interval2, strict)?;
+            assert_eq!(
+                result, expected,
+                "Failed for interval1: {interval1}, interval2: {interval2}, strict: {strict}",
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn nullable_and_test() -> Result<()> {
+        // Test cases: (lhs, rhs, expected) => lhs AND rhs = expected
+        #[rustfmt::skip]
+        let cases = vec![
+            (NullableInterval::TRUE, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::TRUE, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::TRUE, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::TRUE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::FALSE, NullableInterval::TRUE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE_OR_UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                case.0.apply_operator(&Operator::And, &case.1).unwrap(),
+                case.2,
+                "Failed for {} AND {}",
+                case.0,
+                case.1
+            );
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn nullable_or_test() -> Result<()> {
+        // Test cases: (lhs, rhs, expected) => lhs OR rhs = expected
+        #[rustfmt::skip]
+        let cases = vec![
+            (NullableInterval::TRUE, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::FALSE, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::UNKNOWN, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE),
+            (NullableInterval::TRUE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE),
+            (NullableInterval::FALSE, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::FALSE, NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::FALSE, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::FALSE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::FALSE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE_OR_FALSE, NullableInterval::ANY_TRUTH_VALUE),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                case.0.apply_operator(&Operator::Or, &case.1).unwrap(),
+                case.2,
+                "Failed for {} OR {}",
+                case.0,
+                case.1
+            );
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn nullable_not_test() -> Result<()> {
+        // Test cases: (interval, expected) => NOT interval = expected
+        #[rustfmt::skip]
+        let cases = vec![
+            (NullableInterval::TRUE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::TRUE),
+            (NullableInterval::UNKNOWN, NullableInterval::UNKNOWN),
+            (NullableInterval::TRUE_OR_FALSE,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_UNKNOWN,NullableInterval::FALSE_OR_UNKNOWN),
+            (NullableInterval::FALSE_OR_UNKNOWN,NullableInterval::TRUE_OR_UNKNOWN),
+            (NullableInterval::ANY_TRUTH_VALUE, NullableInterval::ANY_TRUTH_VALUE),
+        ];
+
+        for case in cases {
+            assert_eq!(case.0.not().unwrap(), case.1, "Failed for NOT {}", case.0,);
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn nullable_interval_is_certainly_true() {
+        // Test cases: (interval, expected) => interval.is_certainly_true() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, true),
+            (NullableInterval::FALSE, false),
+            (NullableInterval::UNKNOWN, false),
+            (NullableInterval::TRUE_OR_FALSE, false),
+            (NullableInterval::TRUE_OR_UNKNOWN, false),
+            (NullableInterval::FALSE_OR_UNKNOWN, false),
+            (NullableInterval::ANY_TRUTH_VALUE, false),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_certainly_true();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_is_true() {
+        // Test cases: (interval, expected) => interval.is_true() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, NullableInterval::TRUE),
+            (NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::TRUE_OR_FALSE,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_UNKNOWN,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::FALSE_OR_UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::ANY_TRUTH_VALUE,NullableInterval::TRUE_OR_FALSE),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_true().unwrap();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_is_certainly_false() {
+        // Test cases: (interval, expected) => interval.is_certainly_false() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, false),
+            (NullableInterval::FALSE, true),
+            (NullableInterval::UNKNOWN, false),
+            (NullableInterval::TRUE_OR_FALSE, false),
+            (NullableInterval::TRUE_OR_UNKNOWN, false),
+            (NullableInterval::FALSE_OR_UNKNOWN, false),
+            (NullableInterval::ANY_TRUTH_VALUE, false),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_certainly_false();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_is_false() {
+        // Test cases: (interval, expected) => interval.is_false() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::TRUE),
+            (NullableInterval::UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::TRUE_OR_FALSE,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::TRUE_OR_UNKNOWN, NullableInterval::FALSE),
+            (NullableInterval::FALSE_OR_UNKNOWN,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::ANY_TRUTH_VALUE,NullableInterval::TRUE_OR_FALSE),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_false().unwrap();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_is_certainly_unknown() {
+        // Test cases: (interval, expected) => interval.is_certainly_unknown() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, false),
+            (NullableInterval::FALSE, false),
+            (NullableInterval::UNKNOWN, true),
+            (NullableInterval::TRUE_OR_FALSE, false),
+            (NullableInterval::TRUE_OR_UNKNOWN, false),
+            (NullableInterval::FALSE_OR_UNKNOWN, false),
+            (NullableInterval::ANY_TRUTH_VALUE, false),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_certainly_unknown();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_is_unknown() {
+        // Test cases: (interval, expected) => interval.is_unknown() = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, NullableInterval::FALSE),
+            (NullableInterval::FALSE, NullableInterval::FALSE),
+            (NullableInterval::UNKNOWN, NullableInterval::TRUE),
+            (NullableInterval::TRUE_OR_FALSE, NullableInterval::FALSE),
+            (NullableInterval::TRUE_OR_UNKNOWN,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::FALSE_OR_UNKNOWN,NullableInterval::TRUE_OR_FALSE),
+            (NullableInterval::ANY_TRUTH_VALUE,NullableInterval::TRUE_OR_FALSE),
+        ];
+
+        for (interval, expected) in test_cases {
+            let result = interval.is_unknown().unwrap();
+            assert_eq!(result, expected, "Failed for interval: {interval}",);
+        }
+    }
+
+    #[test]
+    fn nullable_interval_contains_value() {
+        // Test cases: (interval, value, expected) => interval.contains_value(value) = expected
+        #[rustfmt::skip]
+        let test_cases = vec![
+            (NullableInterval::TRUE, ScalarValue::Boolean(Some(true)), true),
+            (NullableInterval::TRUE, ScalarValue::Boolean(Some(false)), false),
+            (NullableInterval::TRUE, ScalarValue::Boolean(None), false),
+            (NullableInterval::TRUE, ScalarValue::Null, false),
+            (NullableInterval::TRUE, ScalarValue::UInt32(None), false),
+            (NullableInterval::FALSE, ScalarValue::Boolean(Some(true)), false),
+            (NullableInterval::FALSE, ScalarValue::Boolean(Some(false)), true),
+            (NullableInterval::FALSE, ScalarValue::Boolean(None), false),
+            (NullableInterval::FALSE, ScalarValue::Null, false),
+            (NullableInterval::FALSE, ScalarValue::UInt32(None), false),
+            (NullableInterval::UNKNOWN, ScalarValue::Boolean(Some(true)), false),
+            (NullableInterval::UNKNOWN, ScalarValue::Boolean(Some(false)), false),
+            (NullableInterval::UNKNOWN, ScalarValue::Boolean(None), true),
+            (NullableInterval::UNKNOWN, ScalarValue::Null, true),
+            (NullableInterval::UNKNOWN, ScalarValue::UInt32(None), false),
+            (NullableInterval::TRUE_OR_FALSE, ScalarValue::Boolean(Some(true)), true),
+            (NullableInterval::TRUE_OR_FALSE, ScalarValue::Boolean(Some(false)), true),
+            (NullableInterval::TRUE_OR_FALSE, ScalarValue::Boolean(None), false),
+            (NullableInterval::TRUE_OR_FALSE, ScalarValue::Null, false),
+            (NullableInterval::TRUE_OR_FALSE, ScalarValue::UInt32(None), false),
+            (NullableInterval::TRUE_OR_UNKNOWN, ScalarValue::Boolean(Some(true)), true),
+            (NullableInterval::TRUE_OR_UNKNOWN, ScalarValue::Boolean(Some(false)), false),
+            (NullableInterval::TRUE_OR_UNKNOWN, ScalarValue::Boolean(None), true),
+            (NullableInterval::TRUE_OR_UNKNOWN, ScalarValue::Null, true),
+            (NullableInterval::TRUE_OR_UNKNOWN, ScalarValue::UInt32(None), false),
+            (NullableInterval::FALSE_OR_UNKNOWN, ScalarValue::Boolean(Some(true)), false),
+            (NullableInterval::FALSE_OR_UNKNOWN, ScalarValue::Boolean(Some(false)), true),
+            (NullableInterval::FALSE_OR_UNKNOWN, ScalarValue::Boolean(None), true),
+            (NullableInterval::FALSE_OR_UNKNOWN, ScalarValue::Null, true),
+            (NullableInterval::FALSE_OR_UNKNOWN, ScalarValue::UInt32(None), false),
+            (NullableInterval::ANY_TRUTH_VALUE, ScalarValue::Boolean(Some(true)), true),
+            (NullableInterval::ANY_TRUTH_VALUE, ScalarValue::Boolean(Some(false)), true),
+            (NullableInterval::ANY_TRUTH_VALUE, ScalarValue::Boolean(None), true),
+            (NullableInterval::ANY_TRUTH_VALUE, ScalarValue::Null, true),
+            (NullableInterval::ANY_TRUTH_VALUE, ScalarValue::UInt32(None), false),
+        ];
+
+        for (interval, value, expected) in test_cases {
+            let result = interval.contains_value(value.clone()).unwrap();
+            assert_eq!(
+                result, expected,
+                "Failed for interval: {interval} and value {value:?}",
+            );
+        }
+    }
 }
diff --git a/datafusion/expr-common/src/lib.rs b/datafusion/expr-common/src/lib.rs
index 961670a3b7f45..c9a95fd294503 100644
--- a/datafusion/expr-common/src/lib.rs
+++ b/datafusion/expr-common/src/lib.rs
@@ -27,17 +27,23 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 pub mod accumulator;
+pub mod casts;
 pub mod columnar_value;
+pub mod dyn_eq;
 pub mod groups_accumulator;
 pub mod interval_arithmetic;
 pub mod operator;
+pub mod placement;
 pub mod signature;
 pub mod sort_properties;
 pub mod statistics;
 pub mod type_coercion;
+
+pub use placement::ExpressionPlacement;
diff --git a/datafusion/expr-common/src/operator.rs b/datafusion/expr-common/src/operator.rs
index 19fc6b80745e2..427069b326f9d 100644
--- a/datafusion/expr-common/src/operator.rs
+++ b/datafusion/expr-common/src/operator.rs
@@ -140,6 +140,10 @@ pub enum Operator {
     ///
     /// Not implemented in DataFusion yet.
     QuestionPipe,
+    /// Colon operator, like `:`
+    ///
+    /// Not implemented in DataFusion yet.
+    Colon,
 }
 
 impl Operator {
@@ -188,7 +192,8 @@ impl Operator {
             | Operator::AtQuestion
             | Operator::Question
             | Operator::QuestionAnd
-            | Operator::QuestionPipe => None,
+            | Operator::QuestionPipe
+            | Operator::Colon => None,
         }
     }
 
@@ -229,15 +234,6 @@ impl Operator {
         )
     }
 
-    /// Return true if the comparison operator can be used in interval arithmetic and constraint
-    /// propagation
-    ///
-    /// For example, 'Binary(a, >, b)' expression supports propagation.
-    #[deprecated(since = "43.0.0", note = "please use `supports_propagation` instead")]
-    pub fn is_comparison_operator(&self) -> bool {
-        self.supports_propagation()
-    }
-
     /// Return true if the operator is a logic operator.
     ///
     /// For example, 'Binary(Binary(a, >, b), AND, Binary(a, <, b + 3))' would
@@ -292,7 +288,8 @@ impl Operator {
             | Operator::AtQuestion
             | Operator::Question
             | Operator::QuestionAnd
-            | Operator::QuestionPipe => None,
+            | Operator::QuestionPipe
+            | Operator::Colon => None,
         }
     }
 
@@ -332,11 +329,67 @@ impl Operator {
             | Operator::AtQuestion
             | Operator::Question
             | Operator::QuestionAnd
-            | Operator::QuestionPipe => 30,
+            | Operator::QuestionPipe
+            | Operator::Colon => 30,
             Operator::Plus | Operator::Minus => 40,
             Operator::Multiply | Operator::Divide | Operator::Modulo => 45,
         }
     }
+
+    /// Returns true if the `Expr::BinaryOperator` with this operator
+    /// is guaranteed to return null if either side is null.
+    pub fn returns_null_on_null(&self) -> bool {
+        match self {
+            Operator::Eq
+            | Operator::NotEq
+            | Operator::Lt
+            | Operator::LtEq
+            | Operator::Gt
+            | Operator::GtEq
+            | Operator::Plus
+            | Operator::Minus
+            | Operator::Multiply
+            | Operator::Divide
+            | Operator::Modulo
+            | Operator::RegexMatch
+            | Operator::RegexIMatch
+            | Operator::RegexNotMatch
+            | Operator::RegexNotIMatch
+            | Operator::LikeMatch
+            | Operator::ILikeMatch
+            | Operator::NotLikeMatch
+            | Operator::NotILikeMatch
+            | Operator::BitwiseAnd
+            | Operator::BitwiseOr
+            | Operator::BitwiseXor
+            | Operator::BitwiseShiftRight
+            | Operator::BitwiseShiftLeft
+            | Operator::AtArrow
+            | Operator::ArrowAt
+            | Operator::Arrow
+            | Operator::LongArrow
+            | Operator::HashArrow
+            | Operator::HashLongArrow
+            | Operator::AtAt
+            | Operator::IntegerDivide
+            | Operator::HashMinus
+            | Operator::AtQuestion
+            | Operator::Question
+            | Operator::QuestionAnd
+            | Operator::QuestionPipe
+            | Operator::Colon => true,
+
+            // E.g. `TRUE OR NULL` is `TRUE`
+            Operator::Or
+            // E.g. `FALSE AND NULL` is `FALSE`
+            | Operator::And
+            // IS DISTINCT FROM and IS NOT DISTINCT FROM always return a TRUE/FALSE value, never NULL
+            | Operator::IsDistinctFrom
+            | Operator::IsNotDistinctFrom
+            // DataFusion string concatenation operator treats NULL as an empty string
+            | Operator::StringConcat => false,
+        }
+    }
 }
 
 impl fmt::Display for Operator {
@@ -384,6 +437,7 @@ impl fmt::Display for Operator {
             Operator::Question => "?",
             Operator::QuestionAnd => "?&",
             Operator::QuestionPipe => "?|",
+            Operator::Colon => ":",
         };
         write!(f, "{display}")
     }
diff --git a/datafusion/expr-common/src/placement.rs b/datafusion/expr-common/src/placement.rs
new file mode 100644
index 0000000000000..8212ba618e322
--- /dev/null
+++ b/datafusion/expr-common/src/placement.rs
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Expression placement information for optimization decisions.
+
+/// Describes where an expression should be placed in the query plan for
+/// optimal execution. This is used by optimizers to make decisions about
+/// expression placement, such as whether to push expressions down through
+/// projections.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ExpressionPlacement {
+    /// A constant literal value.
+    Literal,
+    /// A simple column reference.
+    Column,
+    /// A cheap expression that can be pushed to leaf nodes in the plan.
+    /// Examples include `get_field` for struct field access.
+    /// Pushing these expressions down in the plan can reduce data early
+    /// at low compute cost.
+    /// See [`ExpressionPlacement::should_push_to_leaves`] for details.
+    MoveTowardsLeafNodes,
+    /// An expensive expression that should stay where it is in the plan.
+    /// Examples include complex scalar functions or UDFs.
+    KeepInPlace,
+}
+
+impl ExpressionPlacement {
+    /// Returns true if the expression can be pushed down to leaf nodes
+    /// in the query plan.
+    ///
+    /// This returns true for:
+    /// - [`ExpressionPlacement::Column`]: Simple column references can be pushed down. They do no compute and do not increase or
+    ///   decrease the amount of data being processed.
+    ///   A projection that reduces the number of columns can eliminate unnecessary data early,
+    ///   but this method only considers one expression at a time, not a projection as a whole.
+    /// - [`ExpressionPlacement::MoveTowardsLeafNodes`]: Cheap expressions can be pushed down to leaves to take advantage of
+    ///   early computation and potential optimizations at the data source level.
+    ///   For example `struct_col['field']` is cheap to compute (just an Arc clone of the nested array for `'field'`)
+    ///   and thus can reduce data early in the plan at very low compute cost.
+    ///   It may even be possible to eliminate the expression entirely if the data source can project only the needed field
+    ///   (as e.g. Parquet can).
+    pub fn should_push_to_leaves(&self) -> bool {
+        matches!(
+            self,
+            ExpressionPlacement::Column | ExpressionPlacement::MoveTowardsLeafNodes
+        )
+    }
+}
diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs
index 5e1705d8ff615..82759be9f75e8 100644
--- a/datafusion/expr-common/src/signature.rs
+++ b/datafusion/expr-common/src/signature.rs
@@ -15,17 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Signature module contains foundational types that are used to represent signatures, types,
-//! and return types of functions in DataFusion.
+//! Function signatures: [`Volatility`], [`Signature`] and [`TypeSignature`]
 
 use std::fmt::Display;
 use std::hash::Hash;
+use std::sync::Arc;
 
 use crate::type_coercion::aggregates::NUMERICS;
-use arrow::datatypes::{DataType, IntervalUnit, TimeUnit};
-use datafusion_common::internal_err;
+use arrow::datatypes::{
+    DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType,
+    Decimal128Type, DecimalType, Field, IntervalUnit, TimeUnit,
+};
 use datafusion_common::types::{LogicalType, LogicalTypeRef, NativeType};
 use datafusion_common::utils::ListCoercion;
+use datafusion_common::{Result, internal_err, plan_err};
 use indexmap::IndexSet;
 use itertools::Itertools;
 
@@ -44,42 +47,97 @@ pub const TIMEZONE_WILDCARD: &str = "+TZ";
 /// valid length. It exists to avoid the need to enumerate all possible fixed size list lengths.
 pub const FIXED_SIZE_LIST_WILDCARD: i32 = i32::MIN;
 
-/// A function's volatility, which defines the functions eligibility for certain optimizations
+/// How a function's output changes with respect to a fixed input
+///
+/// The volatility of a function determines eligibility for certain
+/// optimizations. You should always define your function to have the strictest
+/// possible volatility to maximize performance and avoid unexpected
+/// results.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub enum Volatility {
-    /// An immutable function will always return the same output when given the same
-    /// input. DataFusion will attempt to inline immutable functions during planning.
+    /// Always returns the same output when given the same input.
+    ///
+    /// DataFusion will inline immutable functions during planning.
+    ///
+    /// For example, the `abs` function is immutable, so `abs(-1)` will be
+    /// evaluated and replaced  with `1` during planning rather than invoking
+    /// the function at runtime.
     Immutable,
-    /// A stable function may return different values given the same input across different
-    /// queries but must return the same value for a given input within a query. An example of
-    /// this is the `Now` function. DataFusion will attempt to inline `Stable` functions
-    /// during planning, when possible.
-    /// For query `select col1, now() from t1`, it might take a while to execute but
-    /// `now()` column will be the same for each output row, which is evaluated
-    /// during planning.
+    /// May return different values given the same input across different
+    /// queries but must return the same value for a given input within a query.
+    ///
+    /// For example, the `now()` function is stable, because the query `select
+    /// col1, now() from t1`, will return different results each time it is run,
+    /// but within the same query, the output of the `now()` function has the
+    /// same value for each output row.
+    ///
+    /// DataFusion will inline `Stable` functions when possible. For example,
+    /// `Stable` functions are inlined when planning a query for execution, but
+    /// not in View definitions or prepared statements.
     Stable,
-    /// A volatile function may change the return value from evaluation to evaluation.
-    /// Multiple invocations of a volatile function may return different results when used in the
-    /// same query. An example of this is the random() function. DataFusion
-    /// can not evaluate such functions during planning.
-    /// In the query `select col1, random() from t1`, `random()` function will be evaluated
-    /// for each output row, resulting in a unique random value for each row.
+    /// May change the return value from evaluation to evaluation.
+    ///
+    /// Multiple invocations of a volatile function may return different results
+    /// when used in the same query on different rows. An example of this is the
+    /// `random()` function.
+    ///
+    /// DataFusion can not evaluate such functions during planning or push these
+    /// predicates into scans. In the query `select col1, random() from t1`,
+    /// `random()` function will be evaluated for each output row, resulting in
+    /// a unique random value for each row.
     Volatile,
 }
 
-/// A function's type signature defines the types of arguments the function supports.
+/// Represents the arity (number of arguments) of a function signature
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Arity {
+    /// Fixed number of arguments
+    Fixed(usize),
+    /// Variable number of arguments (e.g., Variadic, VariadicAny, UserDefined)
+    Variable,
+}
+
+/// The types of arguments for which a function has implementations.
+///
+/// [`TypeSignature`] **DOES NOT** define the types that a user query could call the
+/// function with. DataFusion will automatically coerce (cast) argument types to
+/// one of the supported function signatures, if possible.
+///
+/// # Overview
+/// Functions typically provide implementations for a small number of different
+/// argument [`DataType`]s, rather than all possible combinations. If a user
+/// calls a function with arguments that do not match any of the declared types,
+/// DataFusion will attempt to automatically coerce (add casts to) function
+/// arguments so they match the [`TypeSignature`]. See the [`type_coercion`] module
+/// for more details
+///
+/// # Example: Numeric Functions
+/// For example, a function like `cos` may only provide an implementation for
+/// [`DataType::Float64`]. When users call `cos` with a different argument type,
+/// such as `cos(int_column)`, and type coercion automatically adds a cast such
+/// as `cos(CAST int_column AS DOUBLE)` during planning.
 ///
-/// Functions typically support only a few different types of arguments compared to the
-/// different datatypes in Arrow. To make functions easy to use, when possible DataFusion
-/// automatically coerces (add casts to) function arguments so they match the type signature.
+/// [`type_coercion`]: crate::type_coercion
 ///
-/// For example, a function like `cos` may only be implemented for `Float64` arguments. To support a query
-/// that calls `cos` with a different argument type, such as `cos(int_column)`, type coercion automatically
-/// adds a cast such as `cos(CAST int_column AS DOUBLE)` during planning.
+/// ## Example: Strings
 ///
-/// # Data Types
+/// There are several different string types in Arrow, such as
+/// [`DataType::Utf8`], [`DataType::LargeUtf8`], and [`DataType::Utf8View`].
 ///
-/// ## Timestamps
+/// Some functions may have specialized implementations for these types, while others
+/// may be able to handle only one of them. For example, a function that
+/// only works with [`DataType::Utf8View`] would have the following signature:
+///
+/// ```
+/// # use arrow::datatypes::DataType;
+/// # use datafusion_expr_common::signature::{TypeSignature};
+/// // Declares the function must be invoked with a single argument of type `Utf8View`.
+/// // if a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will
+/// // automatically add a cast to `Utf8View` during planning.
+/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]);
+/// ```
+///
+/// # Example: Timestamps
 ///
 /// Types to match are represented using Arrow's [`DataType`].  [`DataType::Timestamp`] has an optional variable
 /// timezone specification. To specify a function can handle a timestamp with *ANY* timezone, use
@@ -89,11 +147,11 @@ pub enum Volatility {
 /// # use arrow::datatypes::{DataType, TimeUnit};
 /// # use datafusion_expr_common::signature::{TIMEZONE_WILDCARD, TypeSignature};
 /// let type_signature = TypeSignature::Exact(vec![
-///   // A nanosecond precision timestamp with ANY timezone
-///   // matches  Timestamp(Nanosecond, Some("+0:00"))
-///   // matches  Timestamp(Nanosecond, Some("+5:00"))
-///   // does not match  Timestamp(Nanosecond, None)
-///   DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())),
+///     // A nanosecond precision timestamp with ANY timezone
+///     // matches  Timestamp(Nanosecond, Some("+0:00"))
+///     // matches  Timestamp(Nanosecond, Some("+5:00"))
+///     // does not match  Timestamp(Nanosecond, None)
+///     DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())),
 /// ]);
 /// ```
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
@@ -130,8 +188,9 @@ pub enum TypeSignature {
     Exact(Vec<DataType>),
     /// One or more arguments belonging to the [`TypeSignatureClass`], in order.
     ///
-    /// [`Coercion`] contains not only the desired type but also the allowed casts.
-    /// For example, if you expect a function has string type, but you also allow it to be casted from binary type.
+    /// [`Coercion`] contains not only the desired type but also the allowed
+    /// casts. For example, if you expect a function has string type, but you
+    /// also allow it to be casted from binary type.
     ///
     /// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`].
     Coercible(Vec<Coercion>),
@@ -170,7 +229,7 @@ pub enum TypeSignature {
     OneOf(Vec<TypeSignature>),
     /// A function that has an [`ArrayFunctionSignature`]
     ArraySignature(ArrayFunctionSignature),
-    /// One or more arguments of numeric types.
+    /// One or more arguments of numeric types, coerced to a common numeric type.
     ///
     /// See [`NativeType::is_numeric`] to know which type is considered numeric
     ///
@@ -198,6 +257,106 @@ impl TypeSignature {
     pub fn is_one_of(&self) -> bool {
         matches!(self, TypeSignature::OneOf(_))
     }
+
+    /// Returns the arity (expected number of arguments) for this type signature.
+    ///
+    /// Returns `Arity::Fixed(n)` for signatures with a specific argument count,
+    /// or `Arity::Variable` for variable-arity signatures like `Variadic`, `VariadicAny`, `UserDefined`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datafusion_expr_common::signature::{TypeSignature, Arity};
+    /// # use arrow::datatypes::DataType;
+    /// // Exact signature has fixed arity
+    /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]);
+    /// assert_eq!(sig.arity(), Arity::Fixed(2));
+    ///
+    /// // Variadic signature has variable arity
+    /// let sig = TypeSignature::VariadicAny;
+    /// assert_eq!(sig.arity(), Arity::Variable);
+    /// ```
+    pub fn arity(&self) -> Arity {
+        match self {
+            TypeSignature::Exact(types) => Arity::Fixed(types.len()),
+            TypeSignature::Uniform(count, _) => Arity::Fixed(*count),
+            TypeSignature::Numeric(count) => Arity::Fixed(*count),
+            TypeSignature::String(count) => Arity::Fixed(*count),
+            TypeSignature::Comparable(count) => Arity::Fixed(*count),
+            TypeSignature::Any(count) => Arity::Fixed(*count),
+            TypeSignature::Coercible(types) => Arity::Fixed(types.len()),
+            TypeSignature::Nullary => Arity::Fixed(0),
+            TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                arguments,
+                ..
+            }) => Arity::Fixed(arguments.len()),
+            TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray) => {
+                Arity::Fixed(1)
+            }
+            TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray) => {
+                Arity::Fixed(1)
+            }
+            TypeSignature::OneOf(variants) => {
+                // If any variant is Variable, the whole OneOf is Variable
+                let has_variable = variants.iter().any(|v| v.arity() == Arity::Variable);
+                if has_variable {
+                    return Arity::Variable;
+                }
+                // Otherwise, get max arity from all fixed arity variants
+                let max_arity = variants
+                    .iter()
+                    .filter_map(|v| match v.arity() {
+                        Arity::Fixed(n) => Some(n),
+                        Arity::Variable => None,
+                    })
+                    .max();
+                match max_arity {
+                    Some(n) => Arity::Fixed(n),
+                    None => Arity::Variable,
+                }
+            }
+            TypeSignature::Variadic(_)
+            | TypeSignature::VariadicAny
+            | TypeSignature::UserDefined => Arity::Variable,
+        }
+    }
+}
+
+impl Display for TypeSignature {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            TypeSignature::Variadic(types) => {
+                write!(f, "Variadic({})", types.iter().join(", "))
+            }
+            TypeSignature::UserDefined => write!(f, "UserDefined"),
+            TypeSignature::VariadicAny => write!(f, "VariadicAny"),
+            TypeSignature::Uniform(count, types) => {
+                write!(f, "Uniform({count}, [{}])", types.iter().join(", "))
+            }
+            TypeSignature::Exact(types) => {
+                write!(f, "Exact({})", types.iter().join(", "))
+            }
+            TypeSignature::Coercible(coercions) => {
+                write!(f, "Coercible({})", coercions.iter().join(", "))
+            }
+            TypeSignature::Comparable(count) => write!(f, "Comparable({count})"),
+            TypeSignature::Any(count) => write!(f, "Any({count})"),
+            TypeSignature::OneOf(sigs) => {
+                write!(f, "OneOf(")?;
+                for (i, sig) in sigs.iter().enumerate() {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    write!(f, "{sig}")?;
+                }
+                write!(f, ")")
+            }
+            TypeSignature::ArraySignature(sig) => write!(f, "ArraySignature({sig})"),
+            TypeSignature::Numeric(count) => write!(f, "Numeric({count})"),
+            TypeSignature::String(count) => write!(f, "String({count})"),
+            TypeSignature::Nullary => write!(f, "Nullary"),
+        }
+    }
 }
 
 /// Represents the class of types that can be used in a function signature.
@@ -206,23 +365,49 @@ impl TypeSignature {
 /// just listing specific DataTypes. For example, TypeSignatureClass::Timestamp matches any timestamp
 /// type regardless of timezone or precision.
 ///
-/// Used primarily with TypeSignature::Coercible to define function signatures that can accept
+/// Used primarily with [`TypeSignature::Coercible`] to define function signatures that can accept
 /// arguments that can be coerced to a particular class of types.
 #[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)]
 pub enum TypeSignatureClass {
+    /// Allows an arbitrary type argument without coercing the argument.
+    Any,
+    /// Timestamps, allowing arbitrary (or no) timezones
     Timestamp,
+    /// All time types
     Time,
+    /// All interval types
     Interval,
+    /// All duration types
     Duration,
+    /// A specific native type
     Native(LogicalTypeRef),
-    // TODO:
-    // Numeric
+    /// Signed and unsigned integers
     Integer,
+    /// All float types
+    Float,
+    /// All decimal types, allowing arbitrary precision & scale
+    Decimal,
+    /// Integers, floats and decimals
+    Numeric,
+    /// Encompasses both the native Binary/LargeBinary types as well as arbitrarily sized FixedSizeBinary types
+    Binary,
 }
 
 impl Display for TypeSignatureClass {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "TypeSignatureClass::{self:?}")
+        match self {
+            Self::Any => write!(f, "Any"),
+            Self::Timestamp => write!(f, "Timestamp"),
+            Self::Time => write!(f, "Time"),
+            Self::Interval => write!(f, "Interval"),
+            Self::Duration => write!(f, "Duration"),
+            Self::Native(logical_type) => write!(f, "{logical_type}"),
+            Self::Integer => write!(f, "Integer"),
+            Self::Float => write!(f, "Float"),
+            Self::Decimal => write!(f, "Decimal"),
+            Self::Numeric => write!(f, "Numeric"),
+            Self::Binary => write!(f, "Binary"),
+        }
     }
 }
 
@@ -233,6 +418,9 @@ impl TypeSignatureClass {
     /// documentation or error messages.
     fn get_example_types(&self) -> Vec<DataType> {
         match self {
+            // TODO: might be too much info to return every single type here
+            //       maybe https://github.com/apache/datafusion/issues/14761 will help here?
+            TypeSignatureClass::Any => vec![],
             TypeSignatureClass::Native(l) => get_data_types(l.native()),
             TypeSignatureClass::Timestamp => {
                 vec![
@@ -255,25 +443,37 @@ impl TypeSignatureClass {
             TypeSignatureClass::Integer => {
                 vec![DataType::Int64]
             }
+            TypeSignatureClass::Binary => {
+                vec![DataType::Binary]
+            }
+            TypeSignatureClass::Decimal => vec![Decimal128Type::DEFAULT_TYPE],
+            TypeSignatureClass::Float => vec![DataType::Float64],
+            TypeSignatureClass::Numeric => vec![
+                DataType::Float64,
+                DataType::Int64,
+                Decimal128Type::DEFAULT_TYPE,
+            ],
         }
     }
 
     /// Does the specified `NativeType` match this type signature class?
-    pub fn matches_native_type(
-        self: &TypeSignatureClass,
-        logical_type: &NativeType,
-    ) -> bool {
+    pub fn matches_native_type(&self, logical_type: &NativeType) -> bool {
         if logical_type == &NativeType::Null {
             return true;
         }
 
         match self {
+            TypeSignatureClass::Any => true,
             TypeSignatureClass::Native(t) if t.native() == logical_type => true,
             TypeSignatureClass::Timestamp if logical_type.is_timestamp() => true,
             TypeSignatureClass::Time if logical_type.is_time() => true,
             TypeSignatureClass::Interval if logical_type.is_interval() => true,
             TypeSignatureClass::Duration if logical_type.is_duration() => true,
             TypeSignatureClass::Integer if logical_type.is_integer() => true,
+            TypeSignatureClass::Binary if logical_type.is_binary() => true,
+            TypeSignatureClass::Decimal if logical_type.is_decimal() => true,
+            TypeSignatureClass::Float if logical_type.is_float() => true,
+            TypeSignatureClass::Numeric if logical_type.is_numeric() => true,
             _ => false,
         }
     }
@@ -283,8 +483,9 @@ impl TypeSignatureClass {
         &self,
         native_type: &NativeType,
         origin_type: &DataType,
-    ) -> datafusion_common::Result<DataType> {
+    ) -> Result<DataType> {
         match self {
+            TypeSignatureClass::Any => Ok(origin_type.to_owned()),
             TypeSignatureClass::Native(logical_type) => {
                 logical_type.native().default_cast_for(origin_type)
             }
@@ -304,6 +505,19 @@ impl TypeSignatureClass {
             TypeSignatureClass::Integer if native_type.is_integer() => {
                 Ok(origin_type.to_owned())
             }
+            TypeSignatureClass::Binary if native_type.is_binary() => {
+                Ok(origin_type.to_owned())
+            }
+            TypeSignatureClass::Decimal if native_type.is_decimal() => {
+                Ok(origin_type.to_owned())
+            }
+            TypeSignatureClass::Float if native_type.is_float() => {
+                Ok(origin_type.to_owned())
+            }
+            TypeSignatureClass::Numeric if native_type.is_numeric() => {
+                Ok(origin_type.to_owned())
+            }
+            _ if native_type.is_null() => Ok(origin_type.to_owned()),
             _ => internal_err!("May miss the matching logic in `matches_native_type`"),
         }
     }
@@ -413,9 +627,11 @@ impl TypeSignature {
                 vec![Self::join_types(types, ", ")]
             }
             TypeSignature::Any(arg_count) => {
-                vec![std::iter::repeat_n("Any", *arg_count)
-                    .collect::<Vec<&str>>()
-                    .join(", ")]
+                vec![
+                    std::iter::repeat_n("Any", *arg_count)
+                        .collect::<Vec<&str>>()
+                        .join(", "),
+                ]
             }
             TypeSignature::UserDefined => {
                 vec!["UserDefined".to_string()]
@@ -430,6 +646,196 @@ impl TypeSignature {
         }
     }
 
+    /// Return string representation of the function signature with parameter names.
+    ///
+    /// This method is similar to [`Self::to_string_repr`] but uses parameter names
+    /// instead of types when available. This is useful for generating more helpful
+    /// error messages.
+    ///
+    /// # Arguments
+    /// * `parameter_names` - Optional slice of parameter names. When provided, these
+    ///   names will be used instead of type names in the output.
+    ///
+    /// # Examples
+    /// ```
+    /// # use datafusion_expr_common::signature::TypeSignature;
+    /// # use arrow::datatypes::DataType;
+    /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]);
+    ///
+    /// // Without names: shows types only
+    /// assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]);
+    ///
+    /// // With names: shows parameter names with types
+    /// assert_eq!(
+    ///     sig.to_string_repr_with_names(Some(&["id".to_string(), "name".to_string()])),
+    ///     vec!["id: Int32, name: Utf8"]
+    /// );
+    /// ```
+    pub fn to_string_repr_with_names(
+        &self,
+        parameter_names: Option<&[String]>,
+    ) -> Vec<String> {
+        match self {
+            TypeSignature::Exact(types) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .zip(types.iter())
+                            .map(|(name, typ)| format!("{name}: {typ}"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    vec![Self::join_types(types, ", ")]
+                }
+            }
+            TypeSignature::Any(count) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .take(*count)
+                            .map(|name| format!("{name}: Any"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    vec![
+                        std::iter::repeat_n("Any", *count)
+                            .collect::<Vec<&str>>()
+                            .join(", "),
+                    ]
+                }
+            }
+            TypeSignature::Uniform(count, types) => {
+                if let Some(names) = parameter_names {
+                    let type_str = Self::join_types(types, "/");
+                    vec![
+                        names
+                            .iter()
+                            .take(*count)
+                            .map(|name| format!("{name}: {type_str}"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            TypeSignature::Coercible(coercions) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .zip(coercions.iter())
+                            .map(|(name, coercion)| format!("{name}: {coercion}"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    vec![Self::join_types(coercions, ", ")]
+                }
+            }
+            TypeSignature::Comparable(count) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .take(*count)
+                            .map(|name| format!("{name}: Comparable"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            TypeSignature::Numeric(count) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .take(*count)
+                            .map(|name| format!("{name}: Numeric"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            TypeSignature::String(count) => {
+                if let Some(names) = parameter_names {
+                    vec![
+                        names
+                            .iter()
+                            .take(*count)
+                            .map(|name| format!("{name}: String"))
+                            .collect::<Vec<_>>()
+                            .join(", "),
+                    ]
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            TypeSignature::Nullary => self.to_string_repr(),
+            TypeSignature::ArraySignature(array_sig) => {
+                if let Some(names) = parameter_names {
+                    match array_sig {
+                        ArrayFunctionSignature::Array { arguments, .. } => {
+                            vec![
+                                names
+                                    .iter()
+                                    .zip(arguments.iter())
+                                    .map(|(name, arg_type)| format!("{name}: {arg_type}"))
+                                    .collect::<Vec<_>>()
+                                    .join(", "),
+                            ]
+                        }
+                        ArrayFunctionSignature::RecursiveArray => {
+                            vec![
+                                names
+                                    .iter()
+                                    .take(1)
+                                    .map(|name| format!("{name}: recursive_array"))
+                                    .collect::<Vec<_>>()
+                                    .join(", "),
+                            ]
+                        }
+                        ArrayFunctionSignature::MapArray => {
+                            vec![
+                                names
+                                    .iter()
+                                    .take(1)
+                                    .map(|name| format!("{name}: map_array"))
+                                    .collect::<Vec<_>>()
+                                    .join(", "),
+                            ]
+                        }
+                    }
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            TypeSignature::OneOf(sigs) => sigs
+                .iter()
+                .flat_map(|s| s.to_string_repr_with_names(parameter_names))
+                .collect(),
+            TypeSignature::UserDefined => {
+                if let Some(names) = parameter_names {
+                    vec![names.join(", ")]
+                } else {
+                    self.to_string_repr()
+                }
+            }
+            // Variable arity signatures cannot use parameter names
+            TypeSignature::Variadic(_) | TypeSignature::VariadicAny => {
+                self.to_string_repr()
+            }
+        }
+    }
+
     /// Helper function to join types with specified delimiter.
     pub fn join_types<T: Display>(types: &[T], delimiter: &str) -> String {
         types
@@ -551,8 +957,56 @@ fn get_data_types(native_type: &NativeType) -> Vec<DataType> {
         NativeType::String => {
             vec![DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View]
         }
-        // TODO: support other native types
-        _ => vec![],
+        NativeType::Decimal(precision, scale) => {
+            // We assume incoming NativeType is valid already, in terms of precision & scale
+            let mut types = vec![DataType::Decimal256(*precision, *scale)];
+            if *precision <= DECIMAL32_MAX_PRECISION {
+                types.push(DataType::Decimal32(*precision, *scale));
+            }
+            if *precision <= DECIMAL64_MAX_PRECISION {
+                types.push(DataType::Decimal64(*precision, *scale));
+            }
+            if *precision <= DECIMAL128_MAX_PRECISION {
+                types.push(DataType::Decimal128(*precision, *scale));
+            }
+            types
+        }
+        NativeType::Timestamp(time_unit, timezone) => {
+            vec![DataType::Timestamp(*time_unit, timezone.to_owned())]
+        }
+        NativeType::Time(TimeUnit::Second) => vec![DataType::Time32(TimeUnit::Second)],
+        NativeType::Time(TimeUnit::Millisecond) => {
+            vec![DataType::Time32(TimeUnit::Millisecond)]
+        }
+        NativeType::Time(TimeUnit::Microsecond) => {
+            vec![DataType::Time64(TimeUnit::Microsecond)]
+        }
+        NativeType::Time(TimeUnit::Nanosecond) => {
+            vec![DataType::Time64(TimeUnit::Nanosecond)]
+        }
+        NativeType::Duration(time_unit) => vec![DataType::Duration(*time_unit)],
+        NativeType::Interval(interval_unit) => vec![DataType::Interval(*interval_unit)],
+        NativeType::FixedSizeBinary(size) => vec![DataType::FixedSizeBinary(*size)],
+        NativeType::FixedSizeList(logical_field, size) => {
+            get_data_types(logical_field.logical_type.native())
+                .iter()
+                .map(|child_dt| {
+                    let field = Field::new(
+                        logical_field.name.clone(),
+                        child_dt.clone(),
+                        logical_field.nullable,
+                    );
+                    DataType::FixedSizeList(Arc::new(field), *size)
+                })
+                .collect()
+        }
+        // TODO: implement for nested types
+        NativeType::List(_)
+        | NativeType::Struct(_)
+        | NativeType::Union(_)
+        | NativeType::Map(_) => {
+            vec![]
+        }
     }
 }
 
@@ -562,8 +1016,8 @@ fn get_data_types(native_type: &NativeType) -> Vec<DataType> {
 /// # Examples
 ///
 /// ```
+/// use datafusion_common::types::{logical_binary, logical_string, NativeType};
 /// use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
-/// use datafusion_common::types::{NativeType, logical_binary, logical_string};
 ///
 /// // Exact coercion that only accepts timestamp types
 /// let exact = Coercion::new_exact(TypeSignatureClass::Timestamp);
@@ -572,7 +1026,7 @@ fn get_data_types(native_type: &NativeType) -> Vec<DataType> {
 /// let implicit = Coercion::new_implicit(
 ///     TypeSignatureClass::Native(logical_string()),
 ///     vec![TypeSignatureClass::Native(logical_binary())],
-///     NativeType::String
+///     NativeType::String,
 /// );
 /// ```
 ///
@@ -657,12 +1111,7 @@ impl Coercion {
 
 impl Display for Coercion {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Coercion({}", self.desired_type())?;
-        if let Some(implicit_coercion) = self.implicit_coercion() {
-            write!(f, ", implicit_coercion={implicit_coercion}",)
-        } else {
-            write!(f, ")")
-        }
+        write!(f, "{}", self.desired_type())
     }
 }
 
@@ -714,11 +1163,14 @@ pub struct ImplicitCoercion {
 
 impl Display for ImplicitCoercion {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "ImplicitCoercion({:?}, default_type={:?})",
-            self.allowed_source_types, self.default_casted_type
-        )
+        write!(f, "ImplicitCoercion(")?;
+        for (i, source_type) in self.allowed_source_types.iter().enumerate() {
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "{source_type}")?;
+        }
+        write!(f, "; default={}", self.default_casted_type)
     }
 }
 
@@ -736,16 +1188,25 @@ impl Hash for ImplicitCoercion {
     }
 }
 
-/// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function.
+/// Provides  information necessary for calling a function.
+///
+/// - [`TypeSignature`] defines the argument types that a function has implementations
+///   for.
 ///
-/// DataFusion will automatically coerce (cast) argument types to one of the supported
-/// function signatures, if possible.
+/// - [`Volatility`] defines how the output of the function changes with the input.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
 pub struct Signature {
     /// The data types that the function accepts. See [TypeSignature] for more information.
     pub type_signature: TypeSignature,
     /// The volatility of the function. See [Volatility] for more information.
     pub volatility: Volatility,
+    /// Optional parameter names for the function arguments.
+    ///
+    /// If provided, enables named argument notation for function calls (e.g., `func(a => 1, b => 2)`).
+    /// The length must match the number of arguments defined by `type_signature`.
+    ///
+    /// Defaults to `None`, meaning only positional arguments are supported.
+    pub parameter_names: Option<Vec<String>>,
 }
 
 impl Signature {
@@ -754,6 +1215,7 @@ impl Signature {
         Signature {
             type_signature,
             volatility,
+            parameter_names: None,
         }
     }
     /// An arbitrary number of arguments with the same type, from those listed in `common_types`.
@@ -761,6 +1223,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::Variadic(common_types),
             volatility,
+            parameter_names: None,
         }
     }
     /// User-defined coercion rules for the function.
@@ -768,6 +1231,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::UserDefined,
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -776,14 +1240,16 @@ impl Signature {
         Self {
             type_signature: TypeSignature::Numeric(arg_count),
             volatility,
+            parameter_names: None,
         }
     }
 
-    /// A specified number of numeric arguments
+    /// A specified number of string arguments
     pub fn string(arg_count: usize, volatility: Volatility) -> Self {
         Self {
             type_signature: TypeSignature::String(arg_count),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -792,6 +1258,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::VariadicAny,
             volatility,
+            parameter_names: None,
         }
     }
     /// A fixed number of arguments of the same type, from those listed in `valid_types`.
@@ -803,6 +1270,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::Uniform(arg_count, valid_types),
             volatility,
+            parameter_names: None,
         }
     }
     /// Exactly matches the types in `exact_types`, in order.
@@ -810,6 +1278,7 @@ impl Signature {
         Signature {
             type_signature: TypeSignature::Exact(exact_types),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -818,6 +1287,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::Coercible(target_types),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -826,6 +1296,7 @@ impl Signature {
         Self {
             type_signature: TypeSignature::Comparable(arg_count),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -833,6 +1304,7 @@ impl Signature {
         Signature {
             type_signature: TypeSignature::Nullary,
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -841,6 +1313,7 @@ impl Signature {
         Signature {
             type_signature: TypeSignature::Any(arg_count),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -849,6 +1322,7 @@ impl Signature {
         Signature {
             type_signature: TypeSignature::OneOf(type_signatures),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -865,6 +1339,7 @@ impl Signature {
                 },
             ),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -881,6 +1356,7 @@ impl Signature {
                 },
             ),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -898,6 +1374,7 @@ impl Signature {
                 },
             ),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -910,7 +1387,7 @@ impl Signature {
                         ArrayFunctionArgument::Array,
                         ArrayFunctionArgument::Element,
                     ],
-                    array_coercion: None,
+                    array_coercion: Some(ListCoercion::FixedSizedListToList),
                 }),
                 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
                     arguments: vec![
@@ -918,10 +1395,11 @@ impl Signature {
                         ArrayFunctionArgument::Element,
                         ArrayFunctionArgument::Index,
                     ],
-                    array_coercion: None,
+                    array_coercion: Some(ListCoercion::FixedSizedListToList),
                 }),
             ]),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -938,6 +1416,7 @@ impl Signature {
                 },
             ),
             volatility,
+            parameter_names: None,
         }
     }
 
@@ -945,13 +1424,75 @@ impl Signature {
     pub fn array(volatility: Volatility) -> Self {
         Signature::arrays(1, Some(ListCoercion::FixedSizedListToList), volatility)
     }
+
+    /// Add parameter names to this signature, enabling named argument notation.
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion_expr_common::signature::{Signature, Volatility};
+    /// # use arrow::datatypes::DataType;
+    /// let sig =
+    ///     Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable)
+    ///         .with_parameter_names(vec!["count".to_string(), "name".to_string()]);
+    /// ```
+    ///
+    /// # Errors
+    /// Returns an error if the number of parameter names doesn't match the signature's arity.
+    /// For signatures with variable arity (e.g., `Variadic`, `VariadicAny`), parameter names
+    /// cannot be specified.
+    pub fn with_parameter_names(mut self, names: Vec<impl Into<String>>) -> Result<Self> {
+        let names = names.into_iter().map(Into::into).collect::<Vec<String>>();
+        // Validate that the number of names matches the signature
+        self.validate_parameter_names(&names)?;
+        self.parameter_names = Some(names);
+        Ok(self)
+    }
+
+    /// Validate that parameter names are compatible with this signature
+    fn validate_parameter_names(&self, names: &[String]) -> Result<()> {
+        match self.type_signature.arity() {
+            Arity::Fixed(expected) => {
+                if names.len() != expected {
+                    return plan_err!(
+                        "Parameter names count ({}) does not match signature arity ({})",
+                        names.len(),
+                        expected
+                    );
+                }
+            }
+            Arity::Variable => {
+                // For UserDefined signatures, allow parameter names
+                // The function implementer is responsible for validating the names match the actual arguments
+                if self.type_signature != TypeSignature::UserDefined {
+                    return plan_err!(
+                        "Cannot specify parameter names for variable arity signature: {:?}",
+                        self.type_signature
+                    );
+                }
+            }
+        }
+
+        let mut seen = std::collections::HashSet::new();
+        for name in names {
+            if !seen.insert(name) {
+                return plan_err!("Duplicate parameter name: '{}'", name);
+            }
+        }
+
+        Ok(())
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use datafusion_common::types::{logical_int64, logical_string};
+    use datafusion_common::types::{
+        NativeType, logical_float64, logical_int32, logical_int64, logical_string,
+    };
 
     use super::*;
+    use crate::signature::{
+        ArrayFunctionArgument, ArrayFunctionSignature, Coercion, TypeSignatureClass,
+    };
 
     #[test]
     fn supports_zero_argument_tests() {
@@ -1093,6 +1634,7 @@ mod tests {
                 vec![DataType::UInt16, DataType::UInt16],
                 vec![DataType::UInt32, DataType::UInt32],
                 vec![DataType::UInt64, DataType::UInt64],
+                vec![DataType::Float16, DataType::Float16],
                 vec![DataType::Float32, DataType::Float32],
                 vec![DataType::Float64, DataType::Float64]
             ]
@@ -1109,4 +1651,556 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_signature_with_parameter_names() {
+        let sig = Signature::exact(
+            vec![DataType::Int32, DataType::Utf8],
+            Volatility::Immutable,
+        )
+        .with_parameter_names(vec!["count".to_string(), "name".to_string()])
+        .unwrap();
+
+        assert_eq!(
+            sig.parameter_names,
+            Some(vec!["count".to_string(), "name".to_string()])
+        );
+        assert_eq!(
+            sig.type_signature,
+            TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8])
+        );
+    }
+
+    #[test]
+    fn test_signature_parameter_names_wrong_count() {
+        let result = Signature::exact(
+            vec![DataType::Int32, DataType::Utf8],
+            Volatility::Immutable,
+        )
+        .with_parameter_names(vec!["count".to_string()]); // Only 1 name for 2 args
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("does not match signature arity")
+        );
+    }
+
+    #[test]
+    fn test_signature_parameter_names_duplicate() {
+        let result = Signature::exact(
+            vec![DataType::Int32, DataType::Int32],
+            Volatility::Immutable,
+        )
+        .with_parameter_names(vec!["count".to_string(), "count".to_string()]);
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Duplicate parameter name")
+        );
+    }
+
+    #[test]
+    fn test_signature_parameter_names_variadic() {
+        let result = Signature::variadic(vec![DataType::Int32], Volatility::Immutable)
+            .with_parameter_names(vec!["arg".to_string()]);
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("variable arity signature")
+        );
+    }
+
+    #[test]
+    fn test_signature_without_parameter_names() {
+        let sig = Signature::exact(
+            vec![DataType::Int32, DataType::Utf8],
+            Volatility::Immutable,
+        );
+
+        assert_eq!(sig.parameter_names, None);
+    }
+
+    #[test]
+    fn test_signature_uniform_with_parameter_names() {
+        let sig = Signature::uniform(3, vec![DataType::Float64], Volatility::Immutable)
+            .with_parameter_names(vec!["x".to_string(), "y".to_string(), "z".to_string()])
+            .unwrap();
+
+        assert_eq!(
+            sig.parameter_names,
+            Some(vec!["x".to_string(), "y".to_string(), "z".to_string()])
+        );
+    }
+
+    #[test]
+    fn test_signature_numeric_with_parameter_names() {
+        let sig = Signature::numeric(2, Volatility::Immutable)
+            .with_parameter_names(vec!["a".to_string(), "b".to_string()])
+            .unwrap();
+
+        assert_eq!(
+            sig.parameter_names,
+            Some(vec!["a".to_string(), "b".to_string()])
+        );
+    }
+
+    #[test]
+    fn test_signature_nullary_with_empty_names() {
+        let sig = Signature::nullary(Volatility::Immutable)
+            .with_parameter_names(Vec::<String>::new())
+            .unwrap();
+
+        assert_eq!(sig.parameter_names, Some(vec![]));
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_exact() {
+        let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]);
+
+        assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]);
+
+        let names = vec!["id".to_string(), "name".to_string()];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["id: Int32, name: Utf8"]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_any() {
+        let sig = TypeSignature::Any(3);
+
+        assert_eq!(sig.to_string_repr_with_names(None), vec!["Any, Any, Any"]);
+
+        let names = vec!["x".to_string(), "y".to_string(), "z".to_string()];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["x: Any, y: Any, z: Any"]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_one_of() {
+        let sig =
+            TypeSignature::OneOf(vec![TypeSignature::Any(2), TypeSignature::Any(3)]);
+
+        assert_eq!(
+            sig.to_string_repr_with_names(None),
+            vec!["Any, Any", "Any, Any, Any"]
+        );
+
+        let names = vec![
+            "str".to_string(),
+            "start_pos".to_string(),
+            "length".to_string(),
+        ];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec![
+                "str: Any, start_pos: Any",
+                "str: Any, start_pos: Any, length: Any"
+            ]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_partial() {
+        // This simulates providing max arity names for a OneOf signature
+        let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]);
+
+        // Provide 3 names for 2-parameter signature (extra name is ignored via zip)
+        let names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["a: Int32, b: Utf8"]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_uniform() {
+        let sig = TypeSignature::Uniform(2, vec![DataType::Float64]);
+
+        assert_eq!(
+            sig.to_string_repr_with_names(None),
+            vec!["Float64, Float64"]
+        );
+
+        let names = vec!["x".to_string(), "y".to_string()];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["x: Float64, y: Float64"]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_coercible() {
+        let sig = TypeSignature::Coercible(vec![
+            Coercion::new_exact(TypeSignatureClass::Native(logical_int32())),
+            Coercion::new_exact(TypeSignatureClass::Native(logical_int32())),
+        ]);
+
+        let names = vec!["a".to_string(), "b".to_string()];
+        let result = sig.to_string_repr_with_names(Some(&names));
+        // Check that it contains the parameter names with type annotations
+        assert_eq!(result.len(), 1);
+        assert!(result[0].starts_with("a: "));
+        assert!(result[0].contains(", b: "));
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_comparable_numeric_string() {
+        let comparable = TypeSignature::Comparable(3);
+        let numeric = TypeSignature::Numeric(2);
+        let string_sig = TypeSignature::String(2);
+
+        let names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+
+        // All should show parameter names with type annotations
+        assert_eq!(
+            comparable.to_string_repr_with_names(Some(&names)),
+            vec!["a: Comparable, b: Comparable, c: Comparable"]
+        );
+        assert_eq!(
+            numeric.to_string_repr_with_names(Some(&names)),
+            vec!["a: Numeric, b: Numeric"]
+        );
+        assert_eq!(
+            string_sig.to_string_repr_with_names(Some(&names)),
+            vec!["a: String, b: String"]
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_variadic_fallback() {
+        let variadic = TypeSignature::Variadic(vec![DataType::Utf8, DataType::LargeUtf8]);
+        let names = vec!["x".to_string()];
+        assert_eq!(
+            variadic.to_string_repr_with_names(Some(&names)),
+            variadic.to_string_repr()
+        );
+
+        let variadic_any = TypeSignature::VariadicAny;
+        assert_eq!(
+            variadic_any.to_string_repr_with_names(Some(&names)),
+            variadic_any.to_string_repr()
+        );
+
+        // UserDefined now shows parameter names when available
+        let user_defined = TypeSignature::UserDefined;
+        assert_eq!(
+            user_defined.to_string_repr_with_names(Some(&names)),
+            vec!["x"]
+        );
+        assert_eq!(
+            user_defined.to_string_repr_with_names(None),
+            user_defined.to_string_repr()
+        );
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_nullary() {
+        let sig = TypeSignature::Nullary;
+        let names = vec!["x".to_string()];
+
+        // Should return empty representation, names don't apply
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["NullAry()"]
+        );
+        assert_eq!(sig.to_string_repr_with_names(None), vec!["NullAry()"]);
+    }
+
+    #[test]
+    fn test_to_string_repr_with_names_array_signature() {
+        let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+            arguments: vec![
+                ArrayFunctionArgument::Array,
+                ArrayFunctionArgument::Index,
+                ArrayFunctionArgument::Element,
+            ],
+            array_coercion: None,
+        });
+
+        assert_eq!(
+            sig.to_string_repr_with_names(None),
+            vec!["array, index, element"]
+        );
+
+        let names = vec!["arr".to_string(), "idx".to_string(), "val".to_string()];
+        assert_eq!(
+            sig.to_string_repr_with_names(Some(&names)),
+            vec!["arr: array, idx: index, val: element"]
+        );
+
+        let recursive =
+            TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray);
+        let names = vec!["array".to_string()];
+        assert_eq!(
+            recursive.to_string_repr_with_names(Some(&names)),
+            vec!["array: recursive_array"]
+        );
+
+        // Test MapArray (1 argument)
+        let map_array = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray);
+        let names = vec!["map".to_string()];
+        assert_eq!(
+            map_array.to_string_repr_with_names(Some(&names)),
+            vec!["map: map_array"]
+        );
+    }
+
+    #[test]
+    fn test_type_signature_arity_exact() {
+        let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]);
+        assert_eq!(sig.arity(), Arity::Fixed(2));
+
+        let sig = TypeSignature::Exact(vec![]);
+        assert_eq!(sig.arity(), Arity::Fixed(0));
+    }
+
+    #[test]
+    fn test_type_signature_arity_uniform() {
+        let sig = TypeSignature::Uniform(3, vec![DataType::Float64]);
+        assert_eq!(sig.arity(), Arity::Fixed(3));
+
+        let sig = TypeSignature::Uniform(1, vec![DataType::Int32]);
+        assert_eq!(sig.arity(), Arity::Fixed(1));
+    }
+
+    #[test]
+    fn test_type_signature_arity_numeric() {
+        let sig = TypeSignature::Numeric(2);
+        assert_eq!(sig.arity(), Arity::Fixed(2));
+    }
+
+    #[test]
+    fn test_type_signature_arity_string() {
+        let sig = TypeSignature::String(3);
+        assert_eq!(sig.arity(), Arity::Fixed(3));
+    }
+
+    #[test]
+    fn test_type_signature_arity_comparable() {
+        let sig = TypeSignature::Comparable(2);
+        assert_eq!(sig.arity(), Arity::Fixed(2));
+    }
+
+    #[test]
+    fn test_type_signature_arity_any() {
+        let sig = TypeSignature::Any(4);
+        assert_eq!(sig.arity(), Arity::Fixed(4));
+    }
+
+    #[test]
+    fn test_type_signature_arity_coercible() {
+        let sig = TypeSignature::Coercible(vec![
+            Coercion::new_exact(TypeSignatureClass::Native(logical_int32())),
+            Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+        ]);
+        assert_eq!(sig.arity(), Arity::Fixed(2));
+    }
+
+    #[test]
+    fn test_type_signature_arity_nullary() {
+        let sig = TypeSignature::Nullary;
+        assert_eq!(sig.arity(), Arity::Fixed(0));
+    }
+
+    #[test]
+    fn test_type_signature_arity_array_signature() {
+        // Test Array variant with 2 arguments
+        let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+            arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Index],
+            array_coercion: None,
+        });
+        assert_eq!(sig.arity(), Arity::Fixed(2));
+
+        // Test Array variant with 3 arguments
+        let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+            arguments: vec![
+                ArrayFunctionArgument::Array,
+                ArrayFunctionArgument::Element,
+                ArrayFunctionArgument::Index,
+            ],
+            array_coercion: None,
+        });
+        assert_eq!(sig.arity(), Arity::Fixed(3));
+
+        // Test RecursiveArray variant
+        let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray);
+        assert_eq!(sig.arity(), Arity::Fixed(1));
+
+        // Test MapArray variant
+        let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray);
+        assert_eq!(sig.arity(), Arity::Fixed(1));
+    }
+
+    #[test]
+    fn test_type_signature_arity_one_of_fixed() {
+        // OneOf with all fixed arity variants should return max arity
+        let sig = TypeSignature::OneOf(vec![
+            TypeSignature::Exact(vec![DataType::Int32]),
+            TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]),
+            TypeSignature::Exact(vec![
+                DataType::Int32,
+                DataType::Utf8,
+                DataType::Float64,
+            ]),
+        ]);
+        assert_eq!(sig.arity(), Arity::Fixed(3));
+    }
+
+    #[test]
+    fn test_type_signature_arity_one_of_variable() {
+        // OneOf with variable arity variant should return Variable
+        let sig = TypeSignature::OneOf(vec![
+            TypeSignature::Exact(vec![DataType::Int32]),
+            TypeSignature::VariadicAny,
+        ]);
+        assert_eq!(sig.arity(), Arity::Variable);
+    }
+
+    #[test]
+    fn test_type_signature_arity_variadic() {
+        let sig = TypeSignature::Variadic(vec![DataType::Int32]);
+        assert_eq!(sig.arity(), Arity::Variable);
+
+        let sig = TypeSignature::VariadicAny;
+        assert_eq!(sig.arity(), Arity::Variable);
+    }
+
+    #[test]
+    fn test_type_signature_arity_user_defined() {
+        let sig = TypeSignature::UserDefined;
+        assert_eq!(sig.arity(), Arity::Variable);
+    }
+
+    #[test]
+    fn test_type_signature_display() {
+        use insta::assert_snapshot;
+
+        assert_snapshot!(TypeSignature::Nullary, @"Nullary");
+        assert_snapshot!(TypeSignature::Any(2), @"Any(2)");
+        assert_snapshot!(TypeSignature::Numeric(3), @"Numeric(3)");
+        assert_snapshot!(TypeSignature::String(1), @"String(1)");
+        assert_snapshot!(TypeSignature::Comparable(2), @"Comparable(2)");
+        assert_snapshot!(TypeSignature::VariadicAny, @"VariadicAny");
+        assert_snapshot!(TypeSignature::UserDefined, @"UserDefined");
+
+        assert_snapshot!(
+            TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]),
+            @"Exact(Int32, Utf8)"
+        );
+        assert_snapshot!(
+            TypeSignature::Variadic(vec![DataType::Utf8, DataType::LargeUtf8]),
+            @"Variadic(Utf8, LargeUtf8)"
+        );
+        assert_snapshot!(
+            TypeSignature::Uniform(2, vec![DataType::Float32, DataType::Float64]),
+            @"Uniform(2, [Float32, Float64])"
+        );
+
+        assert_snapshot!(
+            TypeSignature::Coercible(vec![
+                Coercion::new_exact(TypeSignatureClass::Native(logical_float64())),
+                Coercion::new_exact(TypeSignatureClass::Native(logical_int32())),
+            ]),
+            @"Coercible(Float64, Int32)"
+        );
+
+        assert_snapshot!(
+            TypeSignature::OneOf(vec![
+                TypeSignature::Nullary,
+                TypeSignature::VariadicAny,
+            ]),
+            @"OneOf(Nullary, VariadicAny)"
+        );
+    }
+
+    #[test]
+    fn test_type_signature_class_display() {
+        use insta::assert_snapshot;
+
+        assert_snapshot!(TypeSignatureClass::Any, @"Any");
+        assert_snapshot!(TypeSignatureClass::Numeric, @"Numeric");
+        assert_snapshot!(TypeSignatureClass::Integer, @"Integer");
+        assert_snapshot!(TypeSignatureClass::Float, @"Float");
+        assert_snapshot!(TypeSignatureClass::Decimal, @"Decimal");
+        assert_snapshot!(TypeSignatureClass::Timestamp, @"Timestamp");
+        assert_snapshot!(TypeSignatureClass::Time, @"Time");
+        assert_snapshot!(TypeSignatureClass::Interval, @"Interval");
+        assert_snapshot!(TypeSignatureClass::Duration, @"Duration");
+        assert_snapshot!(TypeSignatureClass::Binary, @"Binary");
+        assert_snapshot!(TypeSignatureClass::Native(logical_int32()), @"Int32");
+        assert_snapshot!(TypeSignatureClass::Native(logical_string()), @"String");
+    }
+
+    #[test]
+    fn test_coercion_display() {
+        use insta::assert_snapshot;
+
+        let exact_int = Coercion::new_exact(TypeSignatureClass::Native(logical_int32()));
+        assert_snapshot!(exact_int, @"Int32");
+
+        let exact_numeric = Coercion::new_exact(TypeSignatureClass::Numeric);
+        assert_snapshot!(exact_numeric, @"Numeric");
+
+        let implicit = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
+        assert_snapshot!(implicit, @"Float64");
+
+        let implicit_with_multiple_sources = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Integer, TypeSignatureClass::Numeric],
+            NativeType::Int64,
+        );
+        assert_snapshot!(implicit_with_multiple_sources, @"Int64");
+    }
+
+    #[test]
+    fn test_to_string_repr_coercible() {
+        use insta::assert_snapshot;
+
+        // Simulates a function like round(Float64, Int64) with coercion
+        let sig = TypeSignature::Coercible(vec![
+            Coercion::new_implicit(
+                TypeSignatureClass::Native(logical_float64()),
+                vec![TypeSignatureClass::Numeric],
+                NativeType::Float64,
+            ),
+            Coercion::new_implicit(
+                TypeSignatureClass::Native(logical_int64()),
+                vec![TypeSignatureClass::Integer],
+                NativeType::Int64,
+            ),
+        ]);
+        let repr = sig.to_string_repr();
+        assert_eq!(repr.len(), 1);
+        assert_snapshot!(repr[0], @"Float64, Int64");
+    }
+
+    #[test]
+    fn test_to_string_repr_coercible_exact() {
+        use insta::assert_snapshot;
+
+        let sig = TypeSignature::Coercible(vec![
+            Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+            Coercion::new_exact(TypeSignatureClass::Native(logical_int64())),
+        ]);
+        let repr = sig.to_string_repr();
+        assert_eq!(repr.len(), 1);
+        assert_snapshot!(repr[0], @"String, Int64");
+    }
 }
diff --git a/datafusion/expr-common/src/statistics.rs b/datafusion/expr-common/src/statistics.rs
index 14f2f331ef5b0..6c8cef35b3a71 100644
--- a/datafusion/expr-common/src/statistics.rs
+++ b/datafusion/expr-common/src/statistics.rs
@@ -17,14 +17,17 @@
 
 use std::f64::consts::LN_2;
 
-use crate::interval_arithmetic::{apply_operator, Interval};
+use crate::interval_arithmetic::{Interval, apply_operator};
 use crate::operator::Operator;
 use crate::type_coercion::binary::binary_numeric_coercion;
 
 use arrow::array::ArrowNativeTypeOp;
 use arrow::datatypes::DataType;
 use datafusion_common::rounding::alter_fp_rounding_mode;
-use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::{
+    Result, ScalarValue, assert_eq_or_internal_err, assert_ne_or_internal_err,
+    assert_or_internal_err, internal_err, not_impl_err,
+};
 
 /// This object defines probabilistic distributions that encode uncertain
 /// information about a single, scalar value. Currently, we support five core
@@ -159,9 +162,9 @@ impl Distribution {
     /// - A [`Uniform`] distribution's range is simply its interval.
     /// - An [`Exponential`] distribution's range is `[offset, +∞)`.
     /// - A [`Gaussian`] distribution's range is unbounded.
-    /// - A [`Bernoulli`] distribution's range is [`Interval::UNCERTAIN`], if
-    ///   `p` is neither `0` nor `1`. Otherwise, it is [`Interval::CERTAINLY_FALSE`]
-    ///   and [`Interval::CERTAINLY_TRUE`], respectively.
+    /// - A [`Bernoulli`] distribution's range is [`Interval::TRUE_OR_FALSE`], if
+    ///   `p` is neither `0` nor `1`. Otherwise, it is [`Interval::FALSE`]
+    ///   and [`Interval::TRUE`], respectively.
     /// - A [`Generic`] distribution is unbounded by default, but more information
     ///   may be present.
     pub fn range(&self) -> Result<Interval> {
@@ -189,7 +192,7 @@ impl Distribution {
     pub fn target_type(args: &[&ScalarValue]) -> Result<DataType> {
         let mut arg_types = args
             .iter()
-            .filter(|&&arg| (arg != &ScalarValue::Null))
+            .filter(|&&arg| arg != &ScalarValue::Null)
             .map(|&arg| arg.data_type());
 
         let Some(dt) = arg_types.next().map_or_else(
@@ -275,11 +278,11 @@ pub struct GenericDistribution {
 
 impl UniformDistribution {
     fn try_new(interval: Interval) -> Result<Self> {
-        if interval.data_type().eq(&DataType::Boolean) {
-            return internal_err!(
-                "Construction of a boolean `Uniform` distribution is prohibited, create a `Bernoulli` distribution instead."
-            );
-        }
+        assert_ne_or_internal_err!(
+            interval.data_type(),
+            DataType::Boolean,
+            "Construction of a boolean `Uniform` distribution is prohibited, create a `Bernoulli` distribution instead."
+        );
 
         Ok(Self { interval })
     }
@@ -337,21 +340,29 @@ impl ExponentialDistribution {
         positive_tail: bool,
     ) -> Result<Self> {
         let dt = rate.data_type();
-        if offset.data_type() != dt {
-            internal_err!("Rate and offset must have the same data type")
-        } else if offset.is_null() {
-            internal_err!("Offset of an `ExponentialDistribution` cannot be null")
-        } else if rate.is_null() {
-            internal_err!("Rate of an `ExponentialDistribution` cannot be null")
-        } else if rate.le(&ScalarValue::new_zero(&dt)?) {
-            internal_err!("Rate of an `ExponentialDistribution` must be positive")
-        } else {
-            Ok(Self {
-                rate,
-                offset,
-                positive_tail,
-            })
-        }
+        assert_eq_or_internal_err!(
+            offset.data_type(),
+            dt,
+            "Rate and offset must have the same data type"
+        );
+        assert_or_internal_err!(
+            !offset.is_null(),
+            "Offset of an `ExponentialDistribution` cannot be null"
+        );
+        assert_or_internal_err!(
+            !rate.is_null(),
+            "Rate of an `ExponentialDistribution` cannot be null"
+        );
+        let zero = ScalarValue::new_zero(&dt)?;
+        assert_or_internal_err!(
+            !rate.le(&zero),
+            "Rate of an `ExponentialDistribution` must be positive"
+        );
+        Ok(Self {
+            rate,
+            offset,
+            positive_tail,
+        })
     }
 
     pub fn data_type(&self) -> DataType {
@@ -412,15 +423,21 @@ impl ExponentialDistribution {
 impl GaussianDistribution {
     fn try_new(mean: ScalarValue, variance: ScalarValue) -> Result<Self> {
         let dt = mean.data_type();
-        if variance.data_type() != dt {
-            internal_err!("Mean and variance must have the same data type")
-        } else if variance.is_null() {
-            internal_err!("Variance of a `GaussianDistribution` cannot be null")
-        } else if variance.lt(&ScalarValue::new_zero(&dt)?) {
-            internal_err!("Variance of a `GaussianDistribution` must be positive")
-        } else {
-            Ok(Self { mean, variance })
-        }
+        assert_eq_or_internal_err!(
+            variance.data_type(),
+            dt,
+            "Mean and variance must have the same data type"
+        );
+        assert_or_internal_err!(
+            !variance.is_null(),
+            "Variance of a `GaussianDistribution` cannot be null"
+        );
+        let zero = ScalarValue::new_zero(&dt)?;
+        assert_or_internal_err!(
+            !variance.lt(&zero),
+            "Variance of a `GaussianDistribution` must be positive"
+        );
+        Ok(Self { mean, variance })
     }
 
     pub fn data_type(&self) -> DataType {
@@ -447,19 +464,16 @@ impl GaussianDistribution {
 impl BernoulliDistribution {
     fn try_new(p: ScalarValue) -> Result<Self> {
         if p.is_null() {
-            Ok(Self { p })
-        } else {
-            let dt = p.data_type();
-            let zero = ScalarValue::new_zero(&dt)?;
-            let one = ScalarValue::new_one(&dt)?;
-            if p.ge(&zero) && p.le(&one) {
-                Ok(Self { p })
-            } else {
-                internal_err!(
-                    "Success probability of a `BernoulliDistribution` must be in [0, 1]"
-                )
-            }
+            return Ok(Self { p });
         }
+        let dt = p.data_type();
+        let zero = ScalarValue::new_zero(&dt)?;
+        let one = ScalarValue::new_one(&dt)?;
+        assert_or_internal_err!(
+            p.ge(&zero) && p.le(&one),
+            "Success probability of a `BernoulliDistribution` must be in [0, 1]"
+        );
+        Ok(Self { p })
     }
 
     pub fn data_type(&self) -> DataType {
@@ -505,11 +519,11 @@ impl BernoulliDistribution {
         // Unwraps are safe as the constructor guarantees that the data type
         // supports zero and one values.
         if ScalarValue::new_zero(&dt).unwrap().eq(&self.p) {
-            Interval::CERTAINLY_FALSE
+            Interval::FALSE
         } else if ScalarValue::new_one(&dt).unwrap().eq(&self.p) {
-            Interval::CERTAINLY_TRUE
+            Interval::TRUE
         } else {
-            Interval::UNCERTAIN
+            Interval::TRUE_OR_FALSE
         }
     }
 }
@@ -521,11 +535,11 @@ impl GenericDistribution {
         variance: ScalarValue,
         range: Interval,
     ) -> Result<Self> {
-        if range.data_type().eq(&DataType::Boolean) {
-            return internal_err!(
-                "Construction of a boolean `Generic` distribution is prohibited, create a `Bernoulli` distribution instead."
-            );
-        }
+        assert_ne_or_internal_err!(
+            range.data_type(),
+            DataType::Boolean,
+            "Construction of a boolean `Generic` distribution is prohibited, create a `Bernoulli` distribution instead."
+        );
 
         let validate_location = |m: &ScalarValue| -> Result<bool> {
             // Checks whether the given location estimate is within the range.
@@ -536,20 +550,24 @@ impl GenericDistribution {
             }
         };
 
-        if !validate_location(&mean)?
-            || !validate_location(&median)?
-            || (!variance.is_null()
-                && variance.lt(&ScalarValue::new_zero(&variance.data_type())?))
-        {
-            internal_err!("Tried to construct an invalid `GenericDistribution` instance")
+        let locations_valid = validate_location(&mean)? && validate_location(&median)?;
+        let variance_non_negative = if variance.is_null() {
+            true
         } else {
-            Ok(Self {
-                mean,
-                median,
-                variance,
-                range,
-            })
-        }
+            let zero = ScalarValue::new_zero(&variance.data_type())?;
+            !variance.lt(&zero)
+        };
+        assert_or_internal_err!(
+            locations_valid && variance_non_negative,
+            "Tried to construct an invalid `GenericDistribution` instance"
+        );
+
+        Ok(Self {
+            mean,
+            median,
+            variance,
+            range,
+        })
     }
 
     pub fn data_type(&self) -> DataType {
@@ -718,11 +736,11 @@ pub fn create_bernoulli_from_comparison(
     }
     let (li, ri) = (left.range()?, right.range()?);
     let range_evaluation = apply_operator(op, &li, &ri)?;
-    if range_evaluation.eq(&Interval::CERTAINLY_FALSE) {
+    if range_evaluation.eq(&Interval::FALSE) {
         Distribution::new_bernoulli(ScalarValue::from(0.0))
-    } else if range_evaluation.eq(&Interval::CERTAINLY_TRUE) {
+    } else if range_evaluation.eq(&Interval::TRUE) {
         Distribution::new_bernoulli(ScalarValue::from(1.0))
-    } else if range_evaluation.eq(&Interval::UNCERTAIN) {
+    } else if range_evaluation.eq(&Interval::TRUE_OR_FALSE) {
         Distribution::new_bernoulli(ScalarValue::try_from(&DataType::Float64)?)
     } else {
         internal_err!("This function must be called with a comparison operator")
@@ -860,11 +878,11 @@ pub fn compute_variance(
 #[cfg(test)]
 mod tests {
     use super::{
+        BernoulliDistribution, Distribution, GaussianDistribution, UniformDistribution,
         combine_bernoullis, combine_gaussians, compute_mean, compute_median,
         compute_variance, create_bernoulli_from_comparison, new_generic_from_binary_op,
-        BernoulliDistribution, Distribution, GaussianDistribution, UniformDistribution,
     };
-    use crate::interval_arithmetic::{apply_operator, Interval};
+    use crate::interval_arithmetic::{Interval, apply_operator};
     use crate::operator::Operator;
 
     use arrow::datatypes::DataType;
@@ -879,7 +897,7 @@ mod tests {
             })
         );
 
-        assert!(Distribution::new_uniform(Interval::UNCERTAIN).is_err());
+        assert!(Distribution::new_uniform(Interval::TRUE_OR_FALSE).is_err());
         Ok(())
     }
 
@@ -992,7 +1010,7 @@ mod tests {
                     ScalarValue::Null,
                     ScalarValue::Null,
                     ScalarValue::Null,
-                    Interval::UNCERTAIN,
+                    Interval::TRUE_OR_FALSE,
                 ),
                 false,
             ),
diff --git a/datafusion/expr-common/src/type_coercion/aggregates.rs b/datafusion/expr-common/src/type_coercion/aggregates.rs
index e9377ce7de5a2..df86ff582d658 100644
--- a/datafusion/expr-common/src/type_coercion/aggregates.rs
+++ b/datafusion/expr-common/src/type_coercion/aggregates.rs
@@ -16,30 +16,12 @@
 // under the License.
 
 use crate::signature::TypeSignature;
-use arrow::datatypes::{
-    DataType, FieldRef, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
-    DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
-};
+use arrow::datatypes::{DataType, FieldRef};
 
-use datafusion_common::{internal_err, plan_err, Result};
-
-pub static STRINGS: &[DataType] =
-    &[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
-
-pub static SIGNED_INTEGERS: &[DataType] = &[
-    DataType::Int8,
-    DataType::Int16,
-    DataType::Int32,
-    DataType::Int64,
-];
-
-pub static UNSIGNED_INTEGERS: &[DataType] = &[
-    DataType::UInt8,
-    DataType::UInt16,
-    DataType::UInt32,
-    DataType::UInt64,
-];
+use datafusion_common::{Result, internal_err, plan_err};
 
+// TODO: remove usage of these (INTEGERS and NUMERICS) in favour of signatures
+//       see https://github.com/apache/datafusion/issues/18092
 pub static INTEGERS: &[DataType] = &[
     DataType::Int8,
     DataType::Int16,
@@ -60,28 +42,11 @@ pub static NUMERICS: &[DataType] = &[
     DataType::UInt16,
     DataType::UInt32,
     DataType::UInt64,
+    DataType::Float16,
     DataType::Float32,
     DataType::Float64,
 ];
 
-pub static TIMESTAMPS: &[DataType] = &[
-    DataType::Timestamp(TimeUnit::Second, None),
-    DataType::Timestamp(TimeUnit::Millisecond, None),
-    DataType::Timestamp(TimeUnit::Microsecond, None),
-    DataType::Timestamp(TimeUnit::Nanosecond, None),
-];
-
-pub static DATES: &[DataType] = &[DataType::Date32, DataType::Date64];
-
-pub static BINARYS: &[DataType] = &[DataType::Binary, DataType::LargeBinary];
-
-pub static TIMES: &[DataType] = &[
-    DataType::Time32(TimeUnit::Second),
-    DataType::Time32(TimeUnit::Millisecond),
-    DataType::Time64(TimeUnit::Microsecond),
-    DataType::Time64(TimeUnit::Nanosecond),
-];
-
 /// Validate the length of `input_fields` matches the `signature` for `agg_fun`.
 ///
 /// This method DOES NOT validate the argument fields - only that (at least one,
@@ -96,8 +61,7 @@ pub fn check_arg_count(
         TypeSignature::Uniform(agg_count, _) | TypeSignature::Any(agg_count) => {
             if input_fields.len() != *agg_count {
                 return plan_err!(
-                    "The function {func_name} expects {:?} arguments, but {:?} were provided",
-                    agg_count,
+                    "The function {func_name} expects {agg_count} arguments, but {} were provided",
                     input_fields.len()
                 );
             }
@@ -105,7 +69,7 @@ pub fn check_arg_count(
         TypeSignature::Exact(types) => {
             if types.len() != input_fields.len() {
                 return plan_err!(
-                    "The function {func_name} expects {:?} arguments, but {:?} were provided",
+                    "The function {func_name} expects {} arguments, but {} were provided",
                     types.len(),
                     input_fields.len()
                 );
@@ -117,7 +81,7 @@ pub fn check_arg_count(
                 .any(|v| check_arg_count(func_name, input_fields, v).is_ok());
             if !ok {
                 return plan_err!(
-                    "The function {func_name} does not accept {:?} function arguments.",
+                    "The function {func_name} does not accept {} function arguments.",
                     input_fields.len()
                 );
             }
@@ -136,229 +100,8 @@ pub fn check_arg_count(
             // Numeric and Coercible signature is validated in `get_valid_types`
         }
         _ => {
-            return internal_err!(
-                "Aggregate functions do not support this {signature:?}"
-            );
+            return internal_err!("Aggregate functions do not support this {signature}");
         }
     }
     Ok(())
 }
-
-/// Function return type of a sum
-pub fn sum_return_type(arg_type: &DataType) -> Result<DataType> {
-    match arg_type {
-        DataType::Int64 => Ok(DataType::Int64),
-        DataType::UInt64 => Ok(DataType::UInt64),
-        DataType::Float64 => Ok(DataType::Float64),
-        DataType::Decimal128(precision, scale) => {
-            // In the spark, the result type is DECIMAL(min(38,precision+10), s)
-            // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
-            let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 10);
-            Ok(DataType::Decimal128(new_precision, *scale))
-        }
-        DataType::Decimal256(precision, scale) => {
-            // In the spark, the result type is DECIMAL(min(38,precision+10), s)
-            // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
-            let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 10);
-            Ok(DataType::Decimal256(new_precision, *scale))
-        }
-        other => plan_err!("SUM does not support type \"{other:?}\""),
-    }
-}
-
-/// Function return type of variance
-pub fn variance_return_type(arg_type: &DataType) -> Result<DataType> {
-    if NUMERICS.contains(arg_type) {
-        Ok(DataType::Float64)
-    } else {
-        plan_err!("VAR does not support {arg_type:?}")
-    }
-}
-
-/// Function return type of covariance
-pub fn covariance_return_type(arg_type: &DataType) -> Result<DataType> {
-    if NUMERICS.contains(arg_type) {
-        Ok(DataType::Float64)
-    } else {
-        plan_err!("COVAR does not support {arg_type:?}")
-    }
-}
-
-/// Function return type of correlation
-pub fn correlation_return_type(arg_type: &DataType) -> Result<DataType> {
-    if NUMERICS.contains(arg_type) {
-        Ok(DataType::Float64)
-    } else {
-        plan_err!("CORR does not support {arg_type:?}")
-    }
-}
-
-/// Function return type of an average
-pub fn avg_return_type(func_name: &str, arg_type: &DataType) -> Result<DataType> {
-    match arg_type {
-        DataType::Decimal128(precision, scale) => {
-            // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
-            // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
-            let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 4);
-            let new_scale = DECIMAL128_MAX_SCALE.min(*scale + 4);
-            Ok(DataType::Decimal128(new_precision, new_scale))
-        }
-        DataType::Decimal256(precision, scale) => {
-            // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
-            // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
-            let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 4);
-            let new_scale = DECIMAL256_MAX_SCALE.min(*scale + 4);
-            Ok(DataType::Decimal256(new_precision, new_scale))
-        }
-        DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
-        arg_type if NUMERICS.contains(arg_type) => Ok(DataType::Float64),
-        DataType::Dictionary(_, dict_value_type) => {
-            avg_return_type(func_name, dict_value_type.as_ref())
-        }
-        other => plan_err!("{func_name} does not support {other:?}"),
-    }
-}
-
-/// Internal sum type of an average
-pub fn avg_sum_type(arg_type: &DataType) -> Result<DataType> {
-    match arg_type {
-        DataType::Decimal128(precision, scale) => {
-            // In the spark, the sum type of avg is DECIMAL(min(38,precision+10), s)
-            let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 10);
-            Ok(DataType::Decimal128(new_precision, *scale))
-        }
-        DataType::Decimal256(precision, scale) => {
-            // In Spark the sum type of avg is DECIMAL(min(38,precision+10), s)
-            let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 10);
-            Ok(DataType::Decimal256(new_precision, *scale))
-        }
-        DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
-        arg_type if NUMERICS.contains(arg_type) => Ok(DataType::Float64),
-        DataType::Dictionary(_, dict_value_type) => {
-            avg_sum_type(dict_value_type.as_ref())
-        }
-        other => plan_err!("AVG does not support {other:?}"),
-    }
-}
-
-pub fn is_sum_support_arg_type(arg_type: &DataType) -> bool {
-    match arg_type {
-        DataType::Dictionary(_, dict_value_type) => {
-            is_sum_support_arg_type(dict_value_type.as_ref())
-        }
-        _ => matches!(
-            arg_type,
-            arg_type if NUMERICS.contains(arg_type)
-            || matches!(arg_type, DataType::Decimal128(_, _) | DataType::Decimal256(_, _))
-        ),
-    }
-}
-
-pub fn is_avg_support_arg_type(arg_type: &DataType) -> bool {
-    match arg_type {
-        DataType::Dictionary(_, dict_value_type) => {
-            is_avg_support_arg_type(dict_value_type.as_ref())
-        }
-        _ => matches!(
-            arg_type,
-            arg_type if NUMERICS.contains(arg_type)
-                || matches!(arg_type, DataType::Decimal128(_, _)| DataType::Decimal256(_, _))
-        ),
-    }
-}
-
-pub fn is_variance_support_arg_type(arg_type: &DataType) -> bool {
-    matches!(
-        arg_type,
-        arg_type if NUMERICS.contains(arg_type)
-    )
-}
-
-pub fn is_covariance_support_arg_type(arg_type: &DataType) -> bool {
-    matches!(
-        arg_type,
-        arg_type if NUMERICS.contains(arg_type)
-    )
-}
-
-pub fn is_correlation_support_arg_type(arg_type: &DataType) -> bool {
-    matches!(
-        arg_type,
-        arg_type if NUMERICS.contains(arg_type)
-    )
-}
-
-pub fn is_integer_arg_type(arg_type: &DataType) -> bool {
-    arg_type.is_integer()
-}
-
-pub fn coerce_avg_type(func_name: &str, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-    // Supported types smallint, int, bigint, real, double precision, decimal, or interval
-    // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
-    fn coerced_type(func_name: &str, data_type: &DataType) -> Result<DataType> {
-        match &data_type {
-            DataType::Decimal128(p, s) => Ok(DataType::Decimal128(*p, *s)),
-            DataType::Decimal256(p, s) => Ok(DataType::Decimal256(*p, *s)),
-            d if d.is_numeric() => Ok(DataType::Float64),
-            DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
-            DataType::Dictionary(_, v) => coerced_type(func_name, v.as_ref()),
-            _ => {
-                plan_err!(
-                    "The function {:?} does not support inputs of type {:?}.",
-                    func_name,
-                    data_type
-                )
-            }
-        }
-    }
-    Ok(vec![coerced_type(func_name, &arg_types[0])?])
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_variance_return_data_type() -> Result<()> {
-        let data_type = DataType::Float64;
-        let result_type = variance_return_type(&data_type)?;
-        assert_eq!(DataType::Float64, result_type);
-
-        let data_type = DataType::Decimal128(36, 10);
-        assert!(variance_return_type(&data_type).is_err());
-        Ok(())
-    }
-
-    #[test]
-    fn test_sum_return_data_type() -> Result<()> {
-        let data_type = DataType::Decimal128(10, 5);
-        let result_type = sum_return_type(&data_type)?;
-        assert_eq!(DataType::Decimal128(20, 5), result_type);
-
-        let data_type = DataType::Decimal128(36, 10);
-        let result_type = sum_return_type(&data_type)?;
-        assert_eq!(DataType::Decimal128(38, 10), result_type);
-        Ok(())
-    }
-
-    #[test]
-    fn test_covariance_return_data_type() -> Result<()> {
-        let data_type = DataType::Float64;
-        let result_type = covariance_return_type(&data_type)?;
-        assert_eq!(DataType::Float64, result_type);
-
-        let data_type = DataType::Decimal128(36, 10);
-        assert!(covariance_return_type(&data_type).is_err());
-        Ok(())
-    }
-
-    #[test]
-    fn test_correlation_return_data_type() -> Result<()> {
-        let data_type = DataType::Float64;
-        let result_type = correlation_return_type(&data_type)?;
-        assert_eq!(DataType::Float64, result_type);
-
-        let data_type = DataType::Decimal128(36, 10);
-        assert!(correlation_return_type(&data_type).is_err());
-        Ok(())
-    }
-}
diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs
index d0fcda9733811..fa109e38a4382 100644
--- a/datafusion/expr-common/src/type_coercion/binary.rs
+++ b/datafusion/expr-common/src/type_coercion/binary.rs
@@ -17,21 +17,26 @@
 
 //! Coercion rules for matching argument types for binary operators
 
+use std::collections::HashMap;
 use std::collections::HashSet;
 use std::sync::Arc;
 
 use crate::operator::Operator;
 
-use arrow::array::{new_empty_array, Array};
+use arrow::array::{Array, new_empty_array};
 use arrow::compute::can_cast_types;
+use arrow::datatypes::IntervalUnit::MonthDayNano;
+use arrow::datatypes::TimeUnit::*;
 use arrow::datatypes::{
-    DataType, Field, FieldRef, Fields, TimeUnit, DECIMAL128_MAX_PRECISION,
-    DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
+    DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION,
+    DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
+    DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DataType, Field, FieldRef, Fields,
+    TimeUnit,
 };
 use datafusion_common::types::NativeType;
 use datafusion_common::{
-    exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, Diagnostic,
-    Result, Span, Spans,
+    Diagnostic, Result, Span, Spans, exec_err, internal_err, not_impl_err,
+    plan_datafusion_err, plan_err,
 };
 use itertools::Itertools;
 
@@ -124,8 +129,66 @@ impl<'a> BinaryTypeCoercer<'a> {
 
     /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs`
     fn signature(&'a self) -> Result<Signature> {
-        use arrow::datatypes::DataType::*;
+        // Special handling for arithmetic operations with both `lhs` and `rhs` NULL:
+        // When both operands are NULL, we are providing a concrete numeric type (Int64)
+        // to allow the arithmetic operation to proceed. This ensures NULL `op` NULL returns NULL
+        // instead of failing during planning.
+        if matches!((self.lhs, self.rhs), (DataType::Null, DataType::Null))
+            && self.op.is_numerical_operators()
+        {
+            return Ok(Signature::uniform(DataType::Int64));
+        }
+
+        if let Some(coerced) = null_coercion(self.lhs, self.rhs) {
+            // Special handling for arithmetic + null coercion:
+            // For arithmetic operators on non-temporal types, we must handle the result type here using Arrow's numeric kernel.
+            // This is because Arrow expects concrete numeric types, and this ensures the correct result type (e.g., for NULL + Int32, result is Int32).
+            // For all other cases (including temporal arithmetic and non-arithmetic operators),
+            // we can delegate to signature_inner(&coerced, &coerced), which handles the necessary logic for those operators.
+            // In those cases, signature_inner is designed to work with the coerced type, even if it originated from a NULL.
+            if self.op.is_numerical_operators() && !coerced.is_temporal() {
+                let ret = self.get_result(&coerced, &coerced).map_err(|e| {
+                    plan_datafusion_err!(
+                        "Cannot get result type for arithmetic operation {coerced} {} {coerced}: {e}",
+                        self.op
+                    )
+                })?;
+
+                return Ok(Signature {
+                    lhs: coerced.clone(),
+                    rhs: coerced,
+                    ret,
+                });
+            }
+            return self.signature_inner(&coerced, &coerced);
+        }
+        self.signature_inner(self.lhs, self.rhs)
+    }
+
+    /// Returns the result type for arithmetic operations
+    fn get_result(
+        &self,
+        lhs: &DataType,
+        rhs: &DataType,
+    ) -> arrow::error::Result<DataType> {
+        use arrow::compute::kernels::numeric::*;
+        let l = new_empty_array(lhs);
+        let r = new_empty_array(rhs);
+
+        let result = match self.op {
+            Operator::Plus => add_wrapping(&l, &r),
+            Operator::Minus => sub_wrapping(&l, &r),
+            Operator::Multiply => mul_wrapping(&l, &r),
+            Operator::Divide => div(&l, &r),
+            Operator::Modulo => rem(&l, &r),
+            _ => unreachable!(),
+        };
+        result.map(|x| x.data_type().clone())
+    }
+
+    fn signature_inner(&'a self, lhs: &DataType, rhs: &DataType) -> Result<Signature> {
         use Operator::*;
+        use arrow::datatypes::DataType::*;
         let result = match self.op {
         Eq |
         NotEq |
@@ -135,7 +198,7 @@ impl<'a> BinaryTypeCoercer<'a> {
         GtEq |
         IsDistinctFrom |
         IsNotDistinctFrom => {
-            comparison_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| {
+            comparison_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common argument type for comparison operation {} {} {}",
                     self.lhs,
@@ -144,9 +207,9 @@ impl<'a> BinaryTypeCoercer<'a> {
                 )
             })
         }
-        And | Or => if matches!((self.lhs, self.rhs), (Boolean | Null, Boolean | Null)) {
+        And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) {
             // Logical binary boolean operators can only be evaluated for
-            // boolean or null arguments.                   
+            // boolean or null arguments.
             Ok(Signature::uniform(Boolean))
         } else {
             plan_err!(
@@ -154,28 +217,28 @@ impl<'a> BinaryTypeCoercer<'a> {
             )
         }
         RegexMatch | RegexIMatch | RegexNotMatch | RegexNotIMatch => {
-            regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| {
+            regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs
                 )
             })
         }
         LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => {
-            regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| {
+            regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs
                 )
             })
         }
         BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => {
-            bitwise_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| {
+            bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common type for bitwise operation {} {} {}", self.lhs, self.op, self.rhs
                 )
             })
         }
         StringConcat => {
-            string_concat_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| {
+            string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common string type for string concat operation {} {} {}", self.lhs, self.op, self.rhs
                 )
@@ -183,8 +246,8 @@ impl<'a> BinaryTypeCoercer<'a> {
         }
         AtArrow | ArrowAt => {
             // Array contains or search (similar to LIKE) operation
-            array_coercion(self.lhs, self.rhs)
-                .or_else(|| like_coercion(self.lhs, self.rhs)).map(Signature::comparison).ok_or_else(|| {
+            array_coercion(lhs, rhs)
+                .or_else(|| like_coercion(lhs, rhs)).map(Signature::comparison).ok_or_else(|| {
                     plan_datafusion_err!(
                         "Cannot infer common argument type for operation {} {} {}", self.lhs, self.op, self.rhs
                     )
@@ -192,40 +255,45 @@ impl<'a> BinaryTypeCoercer<'a> {
         }
         AtAt => {
             // text search has similar signature to LIKE
-            like_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| {
+            like_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| {
                 plan_datafusion_err!(
                     "Cannot infer common argument type for AtAt operation {} {} {}", self.lhs, self.op, self.rhs
                 )
             })
         }
+        Minus if is_date_minus_date(lhs, rhs) => {
+            return Ok(Signature {
+                lhs: lhs.clone(),
+                rhs: rhs.clone(),
+                ret: Int64,
+            });
+        }
         Plus | Minus | Multiply | Divide | Modulo  =>  {
-            let get_result = |lhs, rhs| {
-                use arrow::compute::kernels::numeric::*;
-                let l = new_empty_array(lhs);
-                let r = new_empty_array(rhs);
-
-                let result = match self.op {
-                    Plus => add_wrapping(&l, &r),
-                    Minus => sub_wrapping(&l, &r),
-                    Multiply => mul_wrapping(&l, &r),
-                    Divide => div(&l, &r),
-                    Modulo => rem(&l, &r),
-                    _ => unreachable!(),
-                };
-                result.map(|x| x.data_type().clone())
-            };
+            if let Ok(ret) = self.get_result(lhs, rhs) {
 
-            if let Ok(ret) = get_result(self.lhs, self.rhs) {
                 // Temporal arithmetic, e.g. Date32 + Interval
                 Ok(Signature{
-                    lhs: self.lhs.clone(),
-                    rhs: self.rhs.clone(),
+                    lhs: lhs.clone(),
+                    rhs: rhs.clone(),
+                    ret,
+                })
+            } else if let Some((lhs, rhs)) = temporal_math_coercion(lhs, rhs) {
+                // Temporal arithmetic, e.g. Date32 + int64, Timestamp + duration, etc
+                let ret = self.get_result(&lhs, &rhs).map_err(|e| {
+                    plan_datafusion_err!(
+                        "Cannot get result type for temporal operation {} {} {}: {e}", self.lhs, self.op, self.rhs
+                    )
+                })?;
+                Ok(Signature {
+                    lhs,
+                    rhs,
                     ret,
                 })
-            } else if let Some(coerced) = temporal_coercion_strict_timezone(self.lhs, self.rhs) {
+            } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) {
+
                 // Temporal arithmetic by first coercing to a common time representation
                 // e.g. Date32 - Timestamp
-                let ret = get_result(&coerced, &coerced).map_err(|e| {
+                let ret = self.get_result(&coerced, &coerced).map_err(|e| {
                     plan_datafusion_err!(
                         "Cannot get result type for temporal operation {coerced} {} {coerced}: {e}", self.op
                     )
@@ -235,9 +303,9 @@ impl<'a> BinaryTypeCoercer<'a> {
                     rhs: coerced,
                     ret,
                 })
-            } else if let Some((lhs, rhs)) = math_decimal_coercion(self.lhs, self.rhs) {
+            } else if let Some((lhs, rhs)) = math_decimal_coercion(lhs, rhs) {
                 // Decimal arithmetic, e.g. Decimal(10, 2) + Decimal(10, 0)
-                let ret = get_result(&lhs, &rhs).map_err(|e| {
+                let ret = self.get_result(&lhs, &rhs).map_err(|e| {
                     plan_datafusion_err!(
                         "Cannot get result type for decimal operation {} {} {}: {e}", self.lhs, self.op, self.rhs
                     )
@@ -247,7 +315,7 @@ impl<'a> BinaryTypeCoercer<'a> {
                     rhs,
                     ret,
                 })
-            } else if let Some(numeric) = mathematics_numerical_coercion(self.lhs, self.rhs) {
+            } else if let Some(numeric) = mathematics_numerical_coercion(lhs, rhs) {
                 // Numeric arithmetic, e.g. Int32 + Int32
                 Ok(Signature::uniform(numeric))
             } else {
@@ -256,6 +324,9 @@ impl<'a> BinaryTypeCoercer<'a> {
                 )
             }
         },
+        Colon => {
+            Ok(Signature { lhs: lhs.clone(), rhs: rhs.clone(), ret: lhs.clone() })
+        },
         IntegerDivide | Arrow | LongArrow | HashArrow | HashLongArrow
         | HashMinus | AtQuestion | Question | QuestionAnd | QuestionPipe => {
             not_impl_err!("Operator {} is not yet supported", self.op)
@@ -283,6 +354,15 @@ impl<'a> BinaryTypeCoercer<'a> {
 
 // TODO Move the rest inside of BinaryTypeCoercer
 
+/// Returns true if both operands are Date types (Date32 or Date64)
+/// Used to detect Date - Date operations which should return Int64 (days difference)
+fn is_date_minus_date(lhs: &DataType, rhs: &DataType) -> bool {
+    matches!(
+        (lhs, rhs),
+        (DataType::Date32, DataType::Date32) | (DataType::Date64, DataType::Date64)
+    )
+}
+
 /// Coercion rules for mathematics operators between decimal and non-decimal types.
 fn math_decimal_coercion(
     lhs_type: &DataType,
@@ -299,25 +379,84 @@ fn math_decimal_coercion(
             let (lhs_type, value_type) = math_decimal_coercion(lhs_type, value_type)?;
             Some((lhs_type, value_type))
         }
-        (Null, dec_type @ Decimal128(_, _)) | (dec_type @ Decimal128(_, _), Null) => {
-            Some((dec_type.clone(), dec_type.clone()))
-        }
-        (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => {
+        (
+            Null,
+            Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _),
+        ) => Some((rhs_type.clone(), rhs_type.clone())),
+        (
+            Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _),
+            Null,
+        ) => Some((lhs_type.clone(), lhs_type.clone())),
+        (Decimal32(_, _), Decimal32(_, _))
+        | (Decimal64(_, _), Decimal64(_, _))
+        | (Decimal128(_, _), Decimal128(_, _))
+        | (Decimal256(_, _), Decimal256(_, _)) => {
             Some((lhs_type.clone(), rhs_type.clone()))
         }
+        // Cross-variant decimal coercion - choose larger variant with appropriate precision/scale
+        (lhs, rhs)
+            if lhs.is_decimal()
+                && rhs.is_decimal()
+                && std::mem::discriminant(lhs) != std::mem::discriminant(rhs) =>
+        {
+            let coerced_type = get_wider_decimal_type_cross_variant(lhs_type, rhs_type)?;
+            Some((coerced_type.clone(), coerced_type))
+        }
         // Unlike with comparison we don't coerce to a decimal in the case of floating point
         // numbers, instead falling back to floating point arithmetic instead
-        (Decimal128(_, _), Int8 | Int16 | Int32 | Int64) => {
-            Some((lhs_type.clone(), coerce_numeric_type_to_decimal(rhs_type)?))
-        }
-        (Int8 | Int16 | Int32 | Int64, Decimal128(_, _)) => {
-            Some((coerce_numeric_type_to_decimal(lhs_type)?, rhs_type.clone()))
-        }
-        (Decimal256(_, _), Int8 | Int16 | Int32 | Int64) => Some((
+        (
+            Decimal32(_, _),
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+        ) => Some((
+            lhs_type.clone(),
+            coerce_numeric_type_to_decimal32(rhs_type)?,
+        )),
+        (
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+            Decimal32(_, _),
+        ) => Some((
+            coerce_numeric_type_to_decimal32(lhs_type)?,
+            rhs_type.clone(),
+        )),
+        (
+            Decimal64(_, _),
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+        ) => Some((
+            lhs_type.clone(),
+            coerce_numeric_type_to_decimal64(rhs_type)?,
+        )),
+        (
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+            Decimal64(_, _),
+        ) => Some((
+            coerce_numeric_type_to_decimal64(lhs_type)?,
+            rhs_type.clone(),
+        )),
+        (
+            Decimal128(_, _),
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+        ) => Some((
+            lhs_type.clone(),
+            coerce_numeric_type_to_decimal128(rhs_type)?,
+        )),
+        (
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+            Decimal128(_, _),
+        ) => Some((
+            coerce_numeric_type_to_decimal128(lhs_type)?,
+            rhs_type.clone(),
+        )),
+        (
+            Decimal256(_, _),
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+        ) => Some((
             lhs_type.clone(),
             coerce_numeric_type_to_decimal256(rhs_type)?,
         )),
-        (Int8 | Int16 | Int32 | Int64, Decimal256(_, _)) => Some((
+        (
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+            Decimal256(_, _),
+        ) => Some((
             coerce_numeric_type_to_decimal256(lhs_type)?,
             rhs_type.clone(),
         )),
@@ -334,7 +473,9 @@ fn bitwise_coercion(left_type: &DataType, right_type: &DataType) -> Option<DataT
         return None;
     }
 
-    if left_type == right_type {
+    let is_integer_dictionary =
+        matches!(left_type, Dictionary(_, value_type) if value_type.is_integer());
+    if left_type == right_type && (left_type.is_integer() || is_integer_dictionary) {
         return Some(left_type.clone());
     }
 
@@ -388,7 +529,7 @@ impl From<&DataType> for TypeCategory {
                     return TypeCategory::Numeric;
                 }
 
-                if matches!(data_type, DataType::Boolean) {
+                if *data_type == DataType::Boolean {
                     return TypeCategory::Boolean;
                 }
 
@@ -462,7 +603,7 @@ pub fn type_union_resolution(data_types: &[DataType]) -> Option<DataType> {
 
     // If all the data_types are null, return string
     if data_types.iter().all(|t| t == &DataType::Null) {
-        return Some(DataType::Utf8);
+        return Some(DataType::Utf8View);
     }
 
     // Ignore Nulls, if any data_type category is not the same, return None
@@ -615,15 +756,15 @@ fn type_union_resolution_coercion(
 
 /// Handle type union resolution including struct type and others.
 pub fn try_type_union_resolution(data_types: &[DataType]) -> Result<Vec<DataType>> {
-    let err = match try_type_union_resolution_with_struct(data_types) {
+    let struct_err = match try_type_union_resolution_with_struct(data_types) {
         Ok(struct_types) => return Ok(struct_types),
-        Err(e) => Some(e),
+        Err(e) => e,
     };
 
     if let Some(new_type) = type_union_resolution(data_types) {
         Ok(vec![new_type; data_types.len()])
     } else {
-        exec_err!("Fail to find the coerced type, errors: {:?}", err)
+        exec_err!("Fail to find the coerced type, errors: {struct_err}")
     }
 }
 
@@ -638,13 +779,17 @@ pub fn try_type_union_resolution_with_struct(
             let keys = fields.iter().map(|f| f.name().to_owned()).join(",");
             if let Some(ref k) = keys_string {
                 if *k != keys {
-                    return exec_err!("Expect same keys for struct type but got mismatched pair {} and {}", *k, keys);
+                    return exec_err!(
+                        "Expect same keys for struct type but got mismatched pair {} and {}",
+                        *k,
+                        keys
+                    );
                 }
             } else {
                 keys_string = Some(keys);
             }
         } else {
-            return exec_err!("Expect to get struct but got {}", data_type);
+            return exec_err!("Expect to get struct but got {data_type}");
         }
     }
 
@@ -652,7 +797,9 @@ pub fn try_type_union_resolution_with_struct(
     {
         fields.iter().map(|f| f.data_type().to_owned()).collect()
     } else {
-        return internal_err!("Struct type is checked is the previous function, so this should be unreachable");
+        return internal_err!(
+            "Struct type is checked is the previous function, so this should be unreachable"
+        );
     };
 
     for data_type in data_types.iter().skip(1) {
@@ -676,7 +823,7 @@ pub fn try_type_union_resolution_with_struct(
                 }
             }
         } else {
-            return exec_err!("Expect to get struct but got {}", data_type);
+            return exec_err!("Expect to get struct but got {data_type}");
         }
     }
 
@@ -725,6 +872,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D
     }
     binary_numeric_coercion(lhs_type, rhs_type)
         .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, true))
+        .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, true))
         .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type))
         .or_else(|| string_coercion(lhs_type, rhs_type))
         .or_else(|| list_coercion(lhs_type, rhs_type))
@@ -753,6 +901,8 @@ pub fn comparison_coercion_numeric(
         return Some(lhs_type.clone());
     }
     binary_numeric_coercion(lhs_type, rhs_type)
+        .or_else(|| dictionary_comparison_coercion_numeric(lhs_type, rhs_type, true))
+        .or_else(|| ree_comparison_coercion_numeric(lhs_type, rhs_type, true))
         .or_else(|| string_coercion(lhs_type, rhs_type))
         .or_else(|| null_coercion(lhs_type, rhs_type))
         .or_else(|| string_numeric_coercion_as_numeric(lhs_type, rhs_type))
@@ -817,13 +967,13 @@ fn string_temporal_coercion(
                 match temporal {
                     Date32 | Date64 => Some(temporal.clone()),
                     Time32(_) | Time64(_) => {
-                        if is_time_with_valid_unit(temporal.to_owned()) {
+                        if is_time_with_valid_unit(temporal) {
                             Some(temporal.to_owned())
                         } else {
                             None
                         }
                     }
-                    Timestamp(_, tz) => Some(Timestamp(TimeUnit::Nanosecond, tz.clone())),
+                    Timestamp(_, tz) => Some(Timestamp(Nanosecond, tz.clone())),
                     _ => None,
                 }
             }
@@ -859,21 +1009,92 @@ pub fn binary_numeric_coercion(
 pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
 
+    // Prefer decimal data type over floating point for comparison operation
     match (lhs_type, rhs_type) {
-        // Prefer decimal data type over floating point for comparison operation
-        (Decimal128(_, _), Decimal128(_, _)) => {
+        // Same decimal types
+        (lhs_type, rhs_type)
+            if lhs_type.is_decimal()
+                && rhs_type.is_decimal()
+                && std::mem::discriminant(lhs_type)
+                    == std::mem::discriminant(rhs_type) =>
+        {
             get_wider_decimal_type(lhs_type, rhs_type)
         }
-        (Decimal128(_, _), _) => get_common_decimal_type(lhs_type, rhs_type),
-        (_, Decimal128(_, _)) => get_common_decimal_type(rhs_type, lhs_type),
-        (Decimal256(_, _), Decimal256(_, _)) => {
-            get_wider_decimal_type(lhs_type, rhs_type)
+        // Mismatched decimal types
+        (lhs_type, rhs_type)
+            if lhs_type.is_decimal()
+                && rhs_type.is_decimal()
+                && std::mem::discriminant(lhs_type)
+                    != std::mem::discriminant(rhs_type) =>
+        {
+            get_wider_decimal_type_cross_variant(lhs_type, rhs_type)
+        }
+        // Decimal + non-decimal types
+        (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), _) => {
+            get_common_decimal_type(lhs_type, rhs_type)
+        }
+        (_, Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) => {
+            get_common_decimal_type(rhs_type, lhs_type)
         }
-        (Decimal256(_, _), _) => get_common_decimal_type(lhs_type, rhs_type),
-        (_, Decimal256(_, _)) => get_common_decimal_type(rhs_type, lhs_type),
         (_, _) => None,
     }
 }
+/// Handle cross-variant decimal widening by choosing the larger variant
+fn get_wider_decimal_type_cross_variant(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    let (p1, s1) = match lhs_type {
+        Decimal32(p, s) => (*p, *s),
+        Decimal64(p, s) => (*p, *s),
+        Decimal128(p, s) => (*p, *s),
+        Decimal256(p, s) => (*p, *s),
+        _ => return None,
+    };
+
+    let (p2, s2) = match rhs_type {
+        Decimal32(p, s) => (*p, *s),
+        Decimal64(p, s) => (*p, *s),
+        Decimal128(p, s) => (*p, *s),
+        Decimal256(p, s) => (*p, *s),
+        _ => return None,
+    };
+
+    // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+    let s = s1.max(s2);
+    let range = (p1 as i8 - s1).max(p2 as i8 - s2);
+    let required_precision = (range + s) as u8;
+
+    // Choose the larger variant between the two input types, while making sure we don't overflow the precision.
+    match (lhs_type, rhs_type) {
+        (Decimal32(_, _), Decimal64(_, _)) | (Decimal64(_, _), Decimal32(_, _))
+            if required_precision <= DECIMAL64_MAX_PRECISION =>
+        {
+            Some(Decimal64(required_precision, s))
+        }
+        (Decimal32(_, _), Decimal128(_, _))
+        | (Decimal128(_, _), Decimal32(_, _))
+        | (Decimal64(_, _), Decimal128(_, _))
+        | (Decimal128(_, _), Decimal64(_, _))
+            if required_precision <= DECIMAL128_MAX_PRECISION =>
+        {
+            Some(Decimal128(required_precision, s))
+        }
+        (Decimal32(_, _), Decimal256(_, _))
+        | (Decimal256(_, _), Decimal32(_, _))
+        | (Decimal64(_, _), Decimal256(_, _))
+        | (Decimal256(_, _), Decimal64(_, _))
+        | (Decimal128(_, _), Decimal256(_, _))
+        | (Decimal256(_, _), Decimal128(_, _))
+            if required_precision <= DECIMAL256_MAX_PRECISION =>
+        {
+            Some(Decimal256(required_precision, s))
+        }
+        _ => None,
+    }
+}
 
 /// Coerce `lhs_type` and `rhs_type` to a common type.
 fn get_common_decimal_type(
@@ -882,8 +1103,16 @@ fn get_common_decimal_type(
 ) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     match decimal_type {
+        Decimal32(_, _) => {
+            let other_decimal_type = coerce_numeric_type_to_decimal32(other_type)?;
+            get_wider_decimal_type(decimal_type, &other_decimal_type)
+        }
+        Decimal64(_, _) => {
+            let other_decimal_type = coerce_numeric_type_to_decimal64(other_type)?;
+            get_wider_decimal_type(decimal_type, &other_decimal_type)
+        }
         Decimal128(_, _) => {
-            let other_decimal_type = coerce_numeric_type_to_decimal(other_type)?;
+            let other_decimal_type = coerce_numeric_type_to_decimal128(other_type)?;
             get_wider_decimal_type(decimal_type, &other_decimal_type)
         }
         Decimal256(_, _) => {
@@ -894,7 +1123,7 @@ fn get_common_decimal_type(
     }
 }
 
-/// Returns a `DataType::Decimal128` that can store any value from either
+/// Returns a decimal [`DataType`] variant that can store any value from either
 /// `lhs_decimal_type` and `rhs_decimal_type`
 ///
 /// The result decimal type is `(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))`.
@@ -903,11 +1132,23 @@ fn get_wider_decimal_type(
     rhs_type: &DataType,
 ) -> Option<DataType> {
     match (lhs_decimal_type, rhs_type) {
+        (DataType::Decimal32(p1, s1), DataType::Decimal32(p2, s2)) => {
+            // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+            let s = *s1.max(s2);
+            let range = (*p1 as i8 - s1).max(*p2 as i8 - s2);
+            Some(create_decimal32_type((range + s) as u8, s))
+        }
+        (DataType::Decimal64(p1, s1), DataType::Decimal64(p2, s2)) => {
+            // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+            let s = *s1.max(s2);
+            let range = (*p1 as i8 - s1).max(*p2 as i8 - s2);
+            Some(create_decimal64_type((range + s) as u8, s))
+        }
         (DataType::Decimal128(p1, s1), DataType::Decimal128(p2, s2)) => {
             // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
             let s = *s1.max(s2);
             let range = (*p1 as i8 - s1).max(*p2 as i8 - s2);
-            Some(create_decimal_type((range + s) as u8, s))
+            Some(create_decimal128_type((range + s) as u8, s))
         }
         (DataType::Decimal256(p1, s1), DataType::Decimal256(p2, s2)) => {
             // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
@@ -921,7 +1162,39 @@ fn get_wider_decimal_type(
 
 /// Convert the numeric data type to the decimal data type.
 /// We support signed and unsigned integer types and floating-point type.
-fn coerce_numeric_type_to_decimal(numeric_type: &DataType) -> Option<DataType> {
+fn coerce_numeric_type_to_decimal32(numeric_type: &DataType) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    // This conversion rule is from spark
+    // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127
+    match numeric_type {
+        Int8 | UInt8 => Some(Decimal32(3, 0)),
+        Int16 | UInt16 => Some(Decimal32(5, 0)),
+        // TODO if we convert the floating-point data to the decimal type, it maybe overflow.
+        Float16 => Some(Decimal32(6, 3)),
+        _ => None,
+    }
+}
+
+/// Convert the numeric data type to the decimal data type.
+/// We support signed and unsigned integer types and floating-point type.
+fn coerce_numeric_type_to_decimal64(numeric_type: &DataType) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    // This conversion rule is from spark
+    // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127
+    match numeric_type {
+        Int8 | UInt8 => Some(Decimal64(3, 0)),
+        Int16 | UInt16 => Some(Decimal64(5, 0)),
+        Int32 | UInt32 => Some(Decimal64(10, 0)),
+        // TODO if we convert the floating-point data to the decimal type, it maybe overflow.
+        Float16 => Some(Decimal64(6, 3)),
+        Float32 => Some(Decimal64(14, 7)),
+        _ => None,
+    }
+}
+
+/// Convert the numeric data type to the decimal data type.
+/// We support signed and unsigned integer types and floating-point type.
+fn coerce_numeric_type_to_decimal128(numeric_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     // This conversion rule is from spark
     // https://github.com/apache/spark/blob/1c81ad20296d34f137238dadd67cc6ae405944eb/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L127
@@ -959,30 +1232,123 @@ fn coerce_numeric_type_to_decimal256(numeric_type: &DataType) -> Option<DataType
 
 fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
+
     match (lhs_type, rhs_type) {
         (Struct(lhs_fields), Struct(rhs_fields)) => {
+            // Field count must match for coercion
             if lhs_fields.len() != rhs_fields.len() {
                 return None;
             }
 
-            let coerced_types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter())
-                .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type()))
-                .collect::<Option<Vec<DataType>>>()?;
-
-            // preserve the field name and nullability
-            let orig_fields = std::iter::zip(lhs_fields.iter(), rhs_fields.iter());
+            // If the two structs have exactly the same set of field names (possibly in
+            // different order), prefer name-based coercion. Otherwise fall back to
+            // positional coercion which preserves backward compatibility.
+            //
+            // Name-based coercion is used in:
+            // 1. Array construction: [s1, s2] where s1 and s2 have reordered fields
+            // 2. UNION operations: different field orders unified by name
+            // 3. VALUES clauses: heterogeneous struct rows unified by field name
+            // 4. JOIN conditions: structs with matching field names
+            // 5. Window functions: partitions/orders by struct fields
+            // 6. Aggregate functions: collecting structs with reordered fields
+            //
+            // See docs/source/user-guide/sql/struct_coercion.md for detailed examples.
+            if fields_have_same_names(lhs_fields, rhs_fields) {
+                return coerce_struct_by_name(lhs_fields, rhs_fields);
+            }
 
-            let fields: Vec<FieldRef> = coerced_types
-                .into_iter()
-                .zip(orig_fields)
-                .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs))
-                .collect();
-            Some(Struct(fields.into()))
+            coerce_struct_by_position(lhs_fields, rhs_fields)
         }
         _ => None,
     }
 }
 
+/// Return true if every left-field name exists in the right fields (and lengths are equal).
+///
+/// # Assumptions
+/// **This function assumes field names within each struct are unique.** This assumption is safe
+/// because field name uniqueness is enforced at multiple levels:
+/// - **Arrow level:** `StructType` construction enforces unique field names at the schema level
+/// - **DataFusion level:** SQL parser rejects duplicate field names in `CREATE TABLE` and struct type definitions
+/// - **Runtime level:** `StructArray::try_new()` validates field uniqueness
+///
+/// Therefore, we don't need to handle degenerate cases like:
+/// - `struct<c1 int> -> struct<c1 int, c1 int>` (target has duplicate field names)
+/// - `struct<c1 int, c1 int> -> struct<c1 int>` (source has duplicate field names)
+fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool {
+    // Debug assertions: field names should be unique within each struct
+    #[cfg(debug_assertions)]
+    {
+        let lhs_names: HashSet<_> = lhs_fields.iter().map(|f| f.name()).collect();
+        assert_eq!(
+            lhs_names.len(),
+            lhs_fields.len(),
+            "Struct has duplicate field names (should be caught by Arrow schema validation)"
+        );
+
+        let rhs_names_check: HashSet<_> = rhs_fields.iter().map(|f| f.name()).collect();
+        assert_eq!(
+            rhs_names_check.len(),
+            rhs_fields.len(),
+            "Struct has duplicate field names (should be caught by Arrow schema validation)"
+        );
+    }
+
+    let rhs_names: HashSet<&str> = rhs_fields.iter().map(|f| f.name().as_str()).collect();
+    lhs_fields
+        .iter()
+        .all(|lf| rhs_names.contains(lf.name().as_str()))
+}
+
+/// Coerce two structs by matching fields by name. Assumes the name-sets match.
+fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    let rhs_by_name: HashMap<&str, &FieldRef> =
+        rhs_fields.iter().map(|f| (f.name().as_str(), f)).collect();
+
+    let mut coerced: Vec<FieldRef> = Vec::with_capacity(lhs_fields.len());
+
+    for lhs in lhs_fields.iter() {
+        let rhs = rhs_by_name.get(lhs.name().as_str()).unwrap(); // safe: caller ensured names match
+        let coerced_type = comparison_coercion(lhs.data_type(), rhs.data_type())?;
+        let is_nullable = lhs.is_nullable() || rhs.is_nullable();
+        coerced.push(Arc::new(Field::new(
+            lhs.name().clone(),
+            coerced_type,
+            is_nullable,
+        )));
+    }
+
+    Some(Struct(coerced.into()))
+}
+
+/// Coerce two structs positionally (left-to-right). This preserves field names from
+/// the left struct and uses the combined nullability.
+fn coerce_struct_by_position(
+    lhs_fields: &Fields,
+    rhs_fields: &Fields,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    // First coerce individual types; fail early if any pair cannot be coerced.
+    let coerced_types: Vec<DataType> = lhs_fields
+        .iter()
+        .zip(rhs_fields.iter())
+        .map(|(l, r)| comparison_coercion(l.data_type(), r.data_type()))
+        .collect::<Option<Vec<DataType>>>()?;
+
+    // Build final fields preserving left-side names and combined nullability.
+    let orig_pairs = lhs_fields.iter().zip(rhs_fields.iter());
+    let fields: Vec<FieldRef> = coerced_types
+        .into_iter()
+        .zip(orig_pairs)
+        .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs))
+        .collect();
+
+    Some(Struct(fields.into()))
+}
+
 /// returns the result of coercing two fields to a common type
 fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> FieldRef {
     let is_nullable = lhs.is_nullable() || rhs.is_nullable();
@@ -1070,7 +1436,21 @@ fn numerical_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTy
     }
 }
 
-fn create_decimal_type(precision: u8, scale: i8) -> DataType {
+fn create_decimal32_type(precision: u8, scale: i8) -> DataType {
+    DataType::Decimal32(
+        DECIMAL32_MAX_PRECISION.min(precision),
+        DECIMAL32_MAX_SCALE.min(scale),
+    )
+}
+
+fn create_decimal64_type(precision: u8, scale: i8) -> DataType {
+    DataType::Decimal64(
+        DECIMAL64_MAX_PRECISION.min(precision),
+        DECIMAL64_MAX_SCALE.min(scale),
+    )
+}
+
+fn create_decimal128_type(precision: u8, scale: i8) -> DataType {
     DataType::Decimal128(
         DECIMAL128_MAX_PRECISION.min(precision),
         DECIMAL128_MAX_SCALE.min(scale),
@@ -1103,38 +1483,142 @@ fn both_numeric_or_null_and_numeric(lhs_type: &DataType, rhs_type: &DataType) ->
     }
 }
 
-/// Coercion rules for Dictionaries: the type that both lhs and rhs
+/// Generic coercion rules for Dictionaries: the type that both lhs and rhs
 /// can be casted to for the purpose of a computation.
 ///
 /// Not all operators support dictionaries, if `preserve_dictionaries` is true
-/// dictionaries will be preserved if possible
-fn dictionary_comparison_coercion(
+/// dictionaries will be preserved if possible.
+///
+/// The `coerce_fn` parameter determines which comparison coercion function to use
+/// for comparing the dictionary value types.
+fn dictionary_comparison_coercion_generic(
     lhs_type: &DataType,
     rhs_type: &DataType,
     preserve_dictionaries: bool,
+    coerce_fn: fn(&DataType, &DataType) -> Option<DataType>,
 ) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     match (lhs_type, rhs_type) {
         (
             Dictionary(_lhs_index_type, lhs_value_type),
             Dictionary(_rhs_index_type, rhs_value_type),
-        ) => comparison_coercion(lhs_value_type, rhs_value_type),
+        ) => coerce_fn(lhs_value_type, rhs_value_type),
         (d @ Dictionary(_, value_type), other_type)
         | (other_type, d @ Dictionary(_, value_type))
             if preserve_dictionaries && value_type.as_ref() == other_type =>
         {
             Some(d.clone())
         }
-        (Dictionary(_index_type, value_type), _) => {
-            comparison_coercion(value_type, rhs_type)
+        (Dictionary(_index_type, value_type), _) => coerce_fn(value_type, rhs_type),
+        (_, Dictionary(_index_type, value_type)) => coerce_fn(lhs_type, value_type),
+        _ => None,
+    }
+}
+
+/// Coercion rules for Dictionaries: the type that both lhs and rhs
+/// can be casted to for the purpose of a computation.
+///
+/// Not all operators support dictionaries, if `preserve_dictionaries` is true
+/// dictionaries will be preserved if possible
+fn dictionary_comparison_coercion(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_dictionaries: bool,
+) -> Option<DataType> {
+    dictionary_comparison_coercion_generic(
+        lhs_type,
+        rhs_type,
+        preserve_dictionaries,
+        comparison_coercion,
+    )
+}
+
+/// Coercion rules for Dictionaries with numeric preference: similar to
+/// [`dictionary_comparison_coercion`] but uses [`comparison_coercion_numeric`]
+/// which prefers numeric types over strings when both are present.
+///
+/// This is used by [`comparison_coercion_numeric`] to maintain consistent
+/// numeric-preferring semantics when dealing with dictionary types.
+fn dictionary_comparison_coercion_numeric(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_dictionaries: bool,
+) -> Option<DataType> {
+    dictionary_comparison_coercion_generic(
+        lhs_type,
+        rhs_type,
+        preserve_dictionaries,
+        comparison_coercion_numeric,
+    )
+}
+
+/// Coercion rules for RunEndEncoded: the type that both lhs and rhs
+/// can be casted to for the purpose of a computation.
+///
+/// Not all operators support REE, if `preserve_ree` is true
+/// REE will be preserved if possible
+///
+/// The `coerce_fn` parameter determines which comparison coercion function to use
+/// for comparing the REE value types.
+fn ree_comparison_coercion_generic(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_ree: bool,
+    coerce_fn: fn(&DataType, &DataType) -> Option<DataType>,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    match (lhs_type, rhs_type) {
+        (RunEndEncoded(_, lhs_values_field), RunEndEncoded(_, rhs_values_field)) => {
+            coerce_fn(lhs_values_field.data_type(), rhs_values_field.data_type())
+        }
+        (ree @ RunEndEncoded(_, values_field), other_type)
+        | (other_type, ree @ RunEndEncoded(_, values_field))
+            if preserve_ree && values_field.data_type() == other_type =>
+        {
+            Some(ree.clone())
+        }
+        (RunEndEncoded(_, values_field), _) => {
+            coerce_fn(values_field.data_type(), rhs_type)
         }
-        (_, Dictionary(_index_type, value_type)) => {
-            comparison_coercion(lhs_type, value_type)
+        (_, RunEndEncoded(_, values_field)) => {
+            coerce_fn(lhs_type, values_field.data_type())
         }
         _ => None,
     }
 }
 
+/// Coercion rules for RunEndEncoded: the type that both lhs and rhs
+/// can be casted to for the purpose of a computation.
+///
+/// Not all operators support REE, if `preserve_ree` is true
+/// REE will be preserved if possible
+fn ree_comparison_coercion(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_ree: bool,
+) -> Option<DataType> {
+    ree_comparison_coercion_generic(lhs_type, rhs_type, preserve_ree, comparison_coercion)
+}
+
+/// Coercion rules for RunEndEncoded with numeric preference: similar to
+/// [`ree_comparison_coercion`] but uses [`comparison_coercion_numeric`]
+/// which prefers numeric types over strings when both are present.
+///
+/// This is used by [`comparison_coercion_numeric`] to maintain consistent
+/// numeric-preferring semantics when dealing with REE types.
+fn ree_comparison_coercion_numeric(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+    preserve_ree: bool,
+) -> Option<DataType> {
+    ree_comparison_coercion_generic(
+        lhs_type,
+        rhs_type,
+        preserve_ree,
+        comparison_coercion_numeric,
+    )
+}
+
 /// Coercion rules for string concat.
 /// This is a union of string coercion rules and specified rules:
 /// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8)
@@ -1202,7 +1686,8 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
 fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     match (lhs_type, rhs_type) {
-        (Utf8 | LargeUtf8, other_type) | (other_type, Utf8 | LargeUtf8)
+        (Utf8 | LargeUtf8 | Utf8View, other_type)
+        | (other_type, Utf8 | LargeUtf8 | Utf8View)
             if other_type.is_numeric() =>
         {
             Some(other_type.clone())
@@ -1312,12 +1797,14 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
 }
 
 /// Coercion rules for like operations.
-/// This is a union of string coercion rules and dictionary coercion rules
+/// This is a union of string coercion rules, dictionary coercion rules, and REE coercion rules
+/// Note: list_coercion is intentionally NOT included here because LIKE is a string pattern
+/// matching operation and is not supported for nested types (List, Struct, etc.)
 pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     string_coercion(lhs_type, rhs_type)
-        .or_else(|| list_coercion(lhs_type, rhs_type))
         .or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
         .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
+        .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, false))
         .or_else(|| regex_null_coercion(lhs_type, rhs_type))
         .or_else(|| null_coercion(lhs_type, rhs_type))
 }
@@ -1344,13 +1831,13 @@ pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTy
 /// Checks if the TimeUnit associated with a Time32 or Time64 type is consistent,
 /// as Time32 can only be used to Second and Millisecond accuracy, while Time64
 /// is exclusively used to Microsecond and Nanosecond accuracy
-fn is_time_with_valid_unit(datatype: DataType) -> bool {
+fn is_time_with_valid_unit(datatype: &DataType) -> bool {
     matches!(
         datatype,
-        DataType::Time32(TimeUnit::Second)
-            | DataType::Time32(TimeUnit::Millisecond)
-            | DataType::Time64(TimeUnit::Microsecond)
-            | DataType::Time64(TimeUnit::Nanosecond)
+        &DataType::Time32(Second)
+            | &DataType::Time32(Millisecond)
+            | &DataType::Time64(Microsecond)
+            | &DataType::Time64(Nanosecond)
     )
 }
 
@@ -1436,6 +1923,73 @@ fn temporal_coercion_strict_timezone(
     }
 }
 
+fn temporal_math_coercion(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<(DataType, DataType)> {
+    use DataType::*;
+
+    match (lhs_type, rhs_type) {
+        // Coerce Date + int -> Date + Interval
+        (Date32, Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64) => {
+            Some((Date32, Interval(MonthDayNano)))
+        }
+        (Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, Date32) => {
+            Some((Interval(MonthDayNano), Date32))
+        }
+        (Date64, Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64) => {
+            Some((Date64, Interval(MonthDayNano)))
+        }
+        (Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, Date64) => {
+            Some((Interval(MonthDayNano), Date64))
+        }
+        // Coerce Date + time -> timestamp + Duration
+        (Date32, Time32(_)) => Some((Timestamp(Nanosecond, None), Duration(Nanosecond))),
+        (Time32(_), Date32) => Some((Duration(Nanosecond), Timestamp(Nanosecond, None))),
+
+        (Date32, Time64(_)) => Some((Timestamp(Nanosecond, None), Duration(Nanosecond))),
+        (Time64(_), Date32) => Some((Duration(Nanosecond), Timestamp(Nanosecond, None))),
+
+        (Date64, Time32(_)) => Some((Timestamp(Nanosecond, None), Duration(Nanosecond))),
+        (Time32(_), Date64) => Some((Duration(Nanosecond), Timestamp(Nanosecond, None))),
+
+        (Date64, Time64(_)) => Some((Timestamp(Nanosecond, None), Duration(Nanosecond))),
+        (Time64(_), Date64) => Some((Duration(Nanosecond), Timestamp(Nanosecond, None))),
+
+        // Coerce Duration to match Timestamp's unit,
+        // e.g. Timestamp(ms) + Duration(s) → Timestamp(ms) + Duration(ms)
+        (Timestamp(ts_unit, tz), Duration(_)) => {
+            Some((Timestamp(*ts_unit, tz.clone()), Duration(*ts_unit)))
+        }
+        (Duration(_), Timestamp(ts_unit, tz)) => {
+            Some((Duration(*ts_unit), Timestamp(*ts_unit, tz.clone())))
+        }
+        // time - time -> Interval
+        (Time32(_) | Time64(_), Time32(_) | Time64(_)) => {
+            Some((Interval(MonthDayNano), Interval(MonthDayNano)))
+        }
+        // time + interval -> Interval
+        (Time32(_) | Time64(_), Interval(_)) => {
+            Some((Interval(MonthDayNano), Interval(MonthDayNano)))
+        }
+        (Interval(_), Time32(_) | Time64(_)) => {
+            Some((Interval(MonthDayNano), Interval(MonthDayNano)))
+        }
+        // Interval * number => Interval
+        (
+            Interval(_),
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 | Float16
+            | Float32 | Float64,
+        ) => Some((Interval(MonthDayNano), Interval(MonthDayNano))),
+        (
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 | Float16
+            | Float32 | Float64,
+            Interval(_),
+        ) => Some((Interval(MonthDayNano), Interval(MonthDayNano))),
+        _ => None,
+    }
+}
+
 fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     use arrow::datatypes::DataType::*;
     use arrow::datatypes::IntervalUnit::*;
@@ -1445,7 +1999,19 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
         (Interval(_) | Duration(_), Interval(_) | Duration(_)) => {
             Some(Interval(MonthDayNano))
         }
+        (Date32, Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64)
+        | (Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, Date32) => {
+            Some(Date32)
+        }
+        (Date64, Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64)
+        | (Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64, Date64) => {
+            Some(Date64)
+        }
         (Date64, Date32) | (Date32, Date64) => Some(Date64),
+        (Date32, Time32(_)) | (Time32(_), Date32) => Some(Timestamp(Nanosecond, None)),
+        (Date32, Time64(_)) | (Time64(_), Date32) => Some(Timestamp(Nanosecond, None)),
+        (Date64, Time32(_)) | (Time32(_), Date64) => Some(Timestamp(Nanosecond, None)),
+        (Date64, Time64(_)) | (Time64(_), Date64) => Some(Timestamp(Nanosecond, None)),
         (Timestamp(_, None), Date64) | (Date64, Timestamp(_, None)) => {
             Some(Timestamp(Nanosecond, None))
         }
@@ -1484,8 +2050,8 @@ fn timeunit_coercion(lhs_unit: &TimeUnit, rhs_unit: &TimeUnit) -> TimeUnit {
     }
 }
 
-/// Coercion rules from NULL type. Since NULL can be casted to any other type in arrow,
-/// either lhs or rhs is NULL, if NULL can be casted to type of the other side, the coercion is valid.
+/// Coercion rules from NULL type. Since NULL can be cast to any other type in arrow,
+/// either lhs or rhs is NULL, if NULL can be cast to type of the other side, the coercion is valid.
 fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
     match (lhs_type, rhs_type) {
         (DataType::Null, other_type) | (other_type, DataType::Null) => {
@@ -1500,1085 +2066,4 @@ fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
 }
 
 #[cfg(test)]
-mod tests {
-    use super::*;
-
-    use datafusion_common::assert_contains;
-
-    #[test]
-    fn test_coercion_error() -> Result<()> {
-        let coercer =
-            BinaryTypeCoercer::new(&DataType::Float32, &Operator::Plus, &DataType::Utf8);
-        let result_type = coercer.get_input_types();
-
-        let e = result_type.unwrap_err();
-        assert_eq!(e.strip_backtrace(), "Error during planning: Cannot coerce arithmetic expression Float32 + Utf8 to valid types");
-        Ok(())
-    }
-
-    #[test]
-    fn test_decimal_binary_comparison_coercion() -> Result<()> {
-        let input_decimal = DataType::Decimal128(20, 3);
-        let input_types = [
-            DataType::Int8,
-            DataType::Int16,
-            DataType::Int32,
-            DataType::Int64,
-            DataType::Float32,
-            DataType::Float64,
-            DataType::Decimal128(38, 10),
-            DataType::Decimal128(20, 8),
-            DataType::Null,
-        ];
-        let result_types = [
-            DataType::Decimal128(20, 3),
-            DataType::Decimal128(20, 3),
-            DataType::Decimal128(20, 3),
-            DataType::Decimal128(23, 3),
-            DataType::Decimal128(24, 7),
-            DataType::Decimal128(32, 15),
-            DataType::Decimal128(38, 10),
-            DataType::Decimal128(25, 8),
-            DataType::Decimal128(20, 3),
-        ];
-        let comparison_op_types = [
-            Operator::NotEq,
-            Operator::Eq,
-            Operator::Gt,
-            Operator::GtEq,
-            Operator::Lt,
-            Operator::LtEq,
-        ];
-        for (i, input_type) in input_types.iter().enumerate() {
-            let expect_type = &result_types[i];
-            for op in comparison_op_types {
-                let (lhs, rhs) = BinaryTypeCoercer::new(&input_decimal, &op, input_type)
-                    .get_input_types()?;
-                assert_eq!(expect_type, &lhs);
-                assert_eq!(expect_type, &rhs);
-            }
-        }
-        // negative test
-        let result_type =
-            BinaryTypeCoercer::new(&input_decimal, &Operator::Eq, &DataType::Boolean)
-                .get_input_types();
-        assert!(result_type.is_err());
-        Ok(())
-    }
-
-    #[test]
-    fn test_decimal_mathematics_op_type() {
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Int8).unwrap(),
-            DataType::Decimal128(3, 0)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Int16).unwrap(),
-            DataType::Decimal128(5, 0)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Int32).unwrap(),
-            DataType::Decimal128(10, 0)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Int64).unwrap(),
-            DataType::Decimal128(20, 0)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Float16).unwrap(),
-            DataType::Decimal128(6, 3)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Float32).unwrap(),
-            DataType::Decimal128(14, 7)
-        );
-        assert_eq!(
-            coerce_numeric_type_to_decimal(&DataType::Float64).unwrap(),
-            DataType::Decimal128(30, 15)
-        );
-    }
-
-    #[test]
-    fn test_dictionary_type_coercion() {
-        use DataType::*;
-
-        let lhs_type = Dictionary(Box::new(Int8), Box::new(Int32));
-        let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
-            Some(Int32)
-        );
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
-            Some(Int32)
-        );
-
-        // Since we can coerce values of Int16 to Utf8 can support this
-        let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
-        let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
-            Some(Utf8)
-        );
-
-        // Since we can coerce values of Utf8 to Binary can support this
-        let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
-        let rhs_type = Dictionary(Box::new(Int8), Box::new(Binary));
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
-            Some(Binary)
-        );
-
-        let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
-        let rhs_type = Utf8;
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
-            Some(Utf8)
-        );
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
-            Some(lhs_type.clone())
-        );
-
-        let lhs_type = Utf8;
-        let rhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
-            Some(Utf8)
-        );
-        assert_eq!(
-            dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
-            Some(rhs_type.clone())
-        );
-    }
-
-    /// Test coercion rules for binary operators
-    ///
-    /// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that
-    /// the result type is `$RESULT_TYPE`
-    macro_rules! test_coercion_binary_rule {
-        ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{
-            let (lhs, rhs) =
-                BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &$RHS_TYPE).get_input_types()?;
-            assert_eq!(lhs, $RESULT_TYPE);
-            assert_eq!(rhs, $RESULT_TYPE);
-        }};
-    }
-
-    /// Test coercion rules for binary operators
-    ///
-    /// Applies coercion rules for each RHS_TYPE in $RHS_TYPES such that
-    /// `$LHS_TYPE $OP RHS_TYPE` and asserts that the result type is `$RESULT_TYPE`.
-    /// Also tests that the inverse `RHS_TYPE $OP $LHS_TYPE` is true
-    macro_rules! test_coercion_binary_rule_multiple {
-        ($LHS_TYPE:expr, $RHS_TYPES:expr, $OP:expr, $RESULT_TYPE:expr) => {{
-            for rh_type in $RHS_TYPES {
-                let (lhs, rhs) = BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &rh_type)
-                    .get_input_types()?;
-                assert_eq!(lhs, $RESULT_TYPE);
-                assert_eq!(rhs, $RESULT_TYPE);
-
-                BinaryTypeCoercer::new(&rh_type, &$OP, &$LHS_TYPE).get_input_types()?;
-                assert_eq!(lhs, $RESULT_TYPE);
-                assert_eq!(rhs, $RESULT_TYPE);
-            }
-        }};
-    }
-
-    /// Test coercion rules for like
-    ///
-    /// Applies coercion rules for both
-    /// * `$LHS_TYPE LIKE $RHS_TYPE`
-    /// * `$RHS_TYPE LIKE $LHS_TYPE`
-    ///
-    /// And asserts the result type is `$RESULT_TYPE`
-    macro_rules! test_like_rule {
-        ($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{
-            println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE);
-            let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE);
-            assert_eq!(result, $RESULT_TYPE);
-            // reverse the order
-            let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE);
-            assert_eq!(result, $RESULT_TYPE);
-        }};
-    }
-
-    #[test]
-    fn test_date_timestamp_arithmetic_error() -> Result<()> {
-        let (lhs, rhs) = BinaryTypeCoercer::new(
-            &DataType::Timestamp(TimeUnit::Nanosecond, None),
-            &Operator::Minus,
-            &DataType::Timestamp(TimeUnit::Millisecond, None),
-        )
-        .get_input_types()?;
-        assert_eq!(lhs.to_string(), "Timestamp(Millisecond, None)");
-        assert_eq!(rhs.to_string(), "Timestamp(Millisecond, None)");
-
-        let err =
-            BinaryTypeCoercer::new(&DataType::Date32, &Operator::Plus, &DataType::Date64)
-                .get_input_types()
-                .unwrap_err()
-                .to_string();
-
-        assert_contains!(
-            &err,
-            "Cannot get result type for temporal operation Date64 + Date64"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_like_coercion() {
-        // string coerce to strings
-        test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8));
-        test_like_rule!(
-            DataType::LargeUtf8,
-            DataType::Utf8,
-            Some(DataType::LargeUtf8)
-        );
-        test_like_rule!(
-            DataType::Utf8,
-            DataType::LargeUtf8,
-            Some(DataType::LargeUtf8)
-        );
-        test_like_rule!(
-            DataType::LargeUtf8,
-            DataType::LargeUtf8,
-            Some(DataType::LargeUtf8)
-        );
-
-        // Also coerce binary to strings
-        test_like_rule!(DataType::Binary, DataType::Utf8, Some(DataType::Utf8));
-        test_like_rule!(
-            DataType::LargeBinary,
-            DataType::Utf8,
-            Some(DataType::LargeUtf8)
-        );
-        test_like_rule!(
-            DataType::Binary,
-            DataType::LargeUtf8,
-            Some(DataType::LargeUtf8)
-        );
-        test_like_rule!(
-            DataType::LargeBinary,
-            DataType::LargeUtf8,
-            Some(DataType::LargeUtf8)
-        );
-    }
-
-    #[test]
-    fn test_type_coercion() -> Result<()> {
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Date32,
-            Operator::Eq,
-            DataType::Date32
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Date64,
-            Operator::Lt,
-            DataType::Date64
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Time32(TimeUnit::Second),
-            Operator::Eq,
-            DataType::Time32(TimeUnit::Second)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Time32(TimeUnit::Millisecond),
-            Operator::Eq,
-            DataType::Time32(TimeUnit::Millisecond)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Time64(TimeUnit::Microsecond),
-            Operator::Eq,
-            DataType::Time64(TimeUnit::Microsecond)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Time64(TimeUnit::Nanosecond),
-            Operator::Eq,
-            DataType::Time64(TimeUnit::Nanosecond)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Timestamp(TimeUnit::Second, None),
-            Operator::Lt,
-            DataType::Timestamp(TimeUnit::Nanosecond, None)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Timestamp(TimeUnit::Millisecond, None),
-            Operator::Lt,
-            DataType::Timestamp(TimeUnit::Nanosecond, None)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Timestamp(TimeUnit::Microsecond, None),
-            Operator::Lt,
-            DataType::Timestamp(TimeUnit::Nanosecond, None)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Timestamp(TimeUnit::Nanosecond, None),
-            Operator::Lt,
-            DataType::Timestamp(TimeUnit::Nanosecond, None)
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8,
-            Operator::RegexMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8View,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8View,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8,
-            Operator::RegexNotMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8View,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8View,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8,
-            Operator::RegexNotIMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Utf8View,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8View,
-            DataType::Utf8View,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8,
-            Operator::RegexMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8View,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8View,
-            Operator::RegexMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8,
-            Operator::RegexIMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8,
-            Operator::RegexIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8View,
-            Operator::RegexIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8View,
-            Operator::RegexIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8,
-            Operator::RegexNotMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8View,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8View,
-            Operator::RegexNotMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8,
-            Operator::RegexNotIMatch,
-            DataType::Utf8
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
-            DataType::Utf8View,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
-            DataType::Utf8View,
-            Operator::RegexNotIMatch,
-            DataType::Utf8View
-        );
-        test_coercion_binary_rule!(
-            DataType::Int16,
-            DataType::Int64,
-            Operator::BitwiseAnd,
-            DataType::Int64
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt64,
-            DataType::UInt64,
-            Operator::BitwiseAnd,
-            DataType::UInt64
-        );
-        test_coercion_binary_rule!(
-            DataType::Int8,
-            DataType::UInt32,
-            Operator::BitwiseAnd,
-            DataType::Int64
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt32,
-            DataType::Int32,
-            Operator::BitwiseAnd,
-            DataType::Int64
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt16,
-            DataType::Int16,
-            Operator::BitwiseAnd,
-            DataType::Int32
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt32,
-            DataType::UInt32,
-            Operator::BitwiseAnd,
-            DataType::UInt32
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt16,
-            DataType::UInt32,
-            Operator::BitwiseAnd,
-            DataType::UInt32
-        );
-        Ok(())
-    }
-
-    #[test]
-    fn test_type_coercion_arithmetic() -> Result<()> {
-        use DataType::*;
-
-        // (Float64, _) | (_, Float64) => Some(Float64),
-        test_coercion_binary_rule_multiple!(
-            Float64,
-            [
-                Float64, Float32, Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16,
-                Int8, UInt8
-            ],
-            Operator::Plus,
-            Float64
-        );
-        // (_, Float32) | (Float32, _) => Some(Float32),
-        test_coercion_binary_rule_multiple!(
-            Float32,
-            [
-                Float32, Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16, Int8,
-                UInt8
-            ],
-            Operator::Plus,
-            Float32
-        );
-        // (_, Float16) | (Float16, _) => Some(Float16),
-        test_coercion_binary_rule_multiple!(
-            Float16,
-            [Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16, Int8, UInt8],
-            Operator::Plus,
-            Float16
-        );
-        // (UInt64, Int64 | Int32 | Int16 | Int8) | (Int64 | Int32 | Int16 | Int8, UInt64)  => Some(Decimal128(20, 0)),
-        test_coercion_binary_rule_multiple!(
-            UInt64,
-            [Int64, Int32, Int16, Int8],
-            Operator::Divide,
-            Decimal128(20, 0)
-        );
-        // (UInt64, _) | (_, UInt64) => Some(UInt64),
-        test_coercion_binary_rule_multiple!(
-            UInt64,
-            [UInt64, UInt32, UInt16, UInt8],
-            Operator::Modulo,
-            UInt64
-        );
-        // (Int64, _) | (_, Int64) => Some(Int64),
-        test_coercion_binary_rule_multiple!(
-            Int64,
-            [Int64, Int32, UInt32, Int16, UInt16, Int8, UInt8],
-            Operator::Modulo,
-            Int64
-        );
-        // (UInt32, Int32 | Int16 | Int8) | (Int32 | Int16 | Int8, UInt32) => Some(Int64)
-        test_coercion_binary_rule_multiple!(
-            UInt32,
-            [Int32, Int16, Int8],
-            Operator::Modulo,
-            Int64
-        );
-        // (UInt32, _) | (_, UInt32) => Some(UInt32),
-        test_coercion_binary_rule_multiple!(
-            UInt32,
-            [UInt32, UInt16, UInt8],
-            Operator::Modulo,
-            UInt32
-        );
-        // (Int32, _) | (_, Int32) => Some(Int32),
-        test_coercion_binary_rule_multiple!(
-            Int32,
-            [Int32, Int16, Int8],
-            Operator::Modulo,
-            Int32
-        );
-        // (UInt16, Int16 | Int8) | (Int16 | Int8, UInt16) => Some(Int32)
-        test_coercion_binary_rule_multiple!(
-            UInt16,
-            [Int16, Int8],
-            Operator::Minus,
-            Int32
-        );
-        // (UInt16, _) | (_, UInt16) => Some(UInt16),
-        test_coercion_binary_rule_multiple!(
-            UInt16,
-            [UInt16, UInt8, UInt8],
-            Operator::Plus,
-            UInt16
-        );
-        // (Int16, _) | (_, Int16) => Some(Int16),
-        test_coercion_binary_rule_multiple!(Int16, [Int16, Int8], Operator::Plus, Int16);
-        // (UInt8, Int8) | (Int8, UInt8) => Some(Int16)
-        test_coercion_binary_rule!(Int8, UInt8, Operator::Minus, Int16);
-        test_coercion_binary_rule!(UInt8, Int8, Operator::Multiply, Int16);
-        // (UInt8, _) | (_, UInt8) => Some(UInt8),
-        test_coercion_binary_rule!(UInt8, UInt8, Operator::Minus, UInt8);
-        // (Int8, _) | (_, Int8) => Some(Int8),
-        test_coercion_binary_rule!(Int8, Int8, Operator::Plus, Int8);
-
-        Ok(())
-    }
-
-    fn test_math_decimal_coercion_rule(
-        lhs_type: DataType,
-        rhs_type: DataType,
-        expected_lhs_type: DataType,
-        expected_rhs_type: DataType,
-    ) {
-        // The coerced types for lhs and rhs, if any of them is not decimal
-        let (lhs_type, rhs_type) = math_decimal_coercion(&lhs_type, &rhs_type).unwrap();
-        assert_eq!(lhs_type, expected_lhs_type);
-        assert_eq!(rhs_type, expected_rhs_type);
-    }
-
-    #[test]
-    fn test_coercion_arithmetic_decimal() -> Result<()> {
-        test_math_decimal_coercion_rule(
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 2),
-        );
-
-        test_math_decimal_coercion_rule(
-            DataType::Int32,
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 0),
-            DataType::Decimal128(10, 2),
-        );
-
-        test_math_decimal_coercion_rule(
-            DataType::Int32,
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 0),
-            DataType::Decimal128(10, 2),
-        );
-
-        test_math_decimal_coercion_rule(
-            DataType::Int32,
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 0),
-            DataType::Decimal128(10, 2),
-        );
-
-        test_math_decimal_coercion_rule(
-            DataType::Int32,
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 0),
-            DataType::Decimal128(10, 2),
-        );
-
-        test_math_decimal_coercion_rule(
-            DataType::Int32,
-            DataType::Decimal128(10, 2),
-            DataType::Decimal128(10, 0),
-            DataType::Decimal128(10, 2),
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_type_coercion_compare() -> Result<()> {
-        // boolean
-        test_coercion_binary_rule!(
-            DataType::Boolean,
-            DataType::Boolean,
-            Operator::Eq,
-            DataType::Boolean
-        );
-        // float
-        test_coercion_binary_rule!(
-            DataType::Float16,
-            DataType::Int64,
-            Operator::Eq,
-            DataType::Float16
-        );
-        test_coercion_binary_rule!(
-            DataType::Float16,
-            DataType::Float64,
-            Operator::Eq,
-            DataType::Float64
-        );
-        test_coercion_binary_rule!(
-            DataType::Float32,
-            DataType::Int64,
-            Operator::Eq,
-            DataType::Float32
-        );
-        test_coercion_binary_rule!(
-            DataType::Float32,
-            DataType::Float64,
-            Operator::GtEq,
-            DataType::Float64
-        );
-        // signed integer
-        test_coercion_binary_rule!(
-            DataType::Int8,
-            DataType::Int32,
-            Operator::LtEq,
-            DataType::Int32
-        );
-        test_coercion_binary_rule!(
-            DataType::Int64,
-            DataType::Int32,
-            Operator::LtEq,
-            DataType::Int64
-        );
-        // unsigned integer
-        test_coercion_binary_rule!(
-            DataType::UInt32,
-            DataType::UInt8,
-            Operator::Gt,
-            DataType::UInt32
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt64,
-            DataType::UInt8,
-            Operator::Eq,
-            DataType::UInt64
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt64,
-            DataType::Int64,
-            Operator::Eq,
-            DataType::Decimal128(20, 0)
-        );
-        // numeric/decimal
-        test_coercion_binary_rule!(
-            DataType::Int64,
-            DataType::Decimal128(10, 0),
-            Operator::Eq,
-            DataType::Decimal128(20, 0)
-        );
-        test_coercion_binary_rule!(
-            DataType::Int64,
-            DataType::Decimal128(10, 2),
-            Operator::Lt,
-            DataType::Decimal128(22, 2)
-        );
-        test_coercion_binary_rule!(
-            DataType::Float64,
-            DataType::Decimal128(10, 3),
-            Operator::Gt,
-            DataType::Decimal128(30, 15)
-        );
-        test_coercion_binary_rule!(
-            DataType::Int64,
-            DataType::Decimal128(10, 0),
-            Operator::Eq,
-            DataType::Decimal128(20, 0)
-        );
-        test_coercion_binary_rule!(
-            DataType::Decimal128(14, 2),
-            DataType::Decimal128(10, 3),
-            Operator::GtEq,
-            DataType::Decimal128(15, 3)
-        );
-        test_coercion_binary_rule!(
-            DataType::UInt64,
-            DataType::Decimal128(20, 0),
-            Operator::Eq,
-            DataType::Decimal128(20, 0)
-        );
-
-        // Binary
-        test_coercion_binary_rule!(
-            DataType::Binary,
-            DataType::Binary,
-            Operator::Eq,
-            DataType::Binary
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::Binary,
-            Operator::Eq,
-            DataType::Binary
-        );
-        test_coercion_binary_rule!(
-            DataType::Binary,
-            DataType::Utf8,
-            Operator::Eq,
-            DataType::Binary
-        );
-
-        // LargeBinary
-        test_coercion_binary_rule!(
-            DataType::LargeBinary,
-            DataType::LargeBinary,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::Binary,
-            DataType::LargeBinary,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeBinary,
-            DataType::Binary,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::Utf8,
-            DataType::LargeBinary,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeBinary,
-            DataType::Utf8,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeUtf8,
-            DataType::LargeBinary,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeBinary,
-            DataType::LargeUtf8,
-            Operator::Eq,
-            DataType::LargeBinary
-        );
-
-        // Timestamps
-        let utc: Option<Arc<str>> = Some("UTC".into());
-        test_coercion_binary_rule!(
-            DataType::Timestamp(TimeUnit::Second, utc.clone()),
-            DataType::Timestamp(TimeUnit::Second, utc.clone()),
-            Operator::Eq,
-            DataType::Timestamp(TimeUnit::Second, utc.clone())
-        );
-        test_coercion_binary_rule!(
-            DataType::Timestamp(TimeUnit::Second, utc.clone()),
-            DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
-            Operator::Eq,
-            DataType::Timestamp(TimeUnit::Second, utc.clone())
-        );
-        test_coercion_binary_rule!(
-            DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into())),
-            DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
-            Operator::Eq,
-            DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into()))
-        );
-        test_coercion_binary_rule!(
-            DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
-            DataType::Timestamp(TimeUnit::Second, utc),
-            Operator::Eq,
-            DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into()))
-        );
-
-        // list
-        let inner_field = Arc::new(Field::new_list_field(DataType::Int64, true));
-        test_coercion_binary_rule!(
-            DataType::List(Arc::clone(&inner_field)),
-            DataType::List(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::List(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::List(Arc::clone(&inner_field)),
-            DataType::LargeList(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::LargeList(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeList(Arc::clone(&inner_field)),
-            DataType::List(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::LargeList(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeList(Arc::clone(&inner_field)),
-            DataType::LargeList(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::LargeList(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            Operator::Eq,
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10)
-        );
-        test_coercion_binary_rule!(
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            DataType::LargeList(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::LargeList(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::LargeList(Arc::clone(&inner_field)),
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            Operator::Eq,
-            DataType::LargeList(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::List(Arc::clone(&inner_field)),
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            Operator::Eq,
-            DataType::List(Arc::clone(&inner_field))
-        );
-        test_coercion_binary_rule!(
-            DataType::FixedSizeList(Arc::clone(&inner_field), 10),
-            DataType::List(Arc::clone(&inner_field)),
-            Operator::Eq,
-            DataType::List(Arc::clone(&inner_field))
-        );
-
-        // Negative test: inner_timestamp_field and inner_field are not compatible because their inner types are not compatible
-        let inner_timestamp_field = Arc::new(Field::new_list_field(
-            DataType::Timestamp(TimeUnit::Microsecond, None),
-            true,
-        ));
-        let result_type = BinaryTypeCoercer::new(
-            &DataType::List(Arc::clone(&inner_field)),
-            &Operator::Eq,
-            &DataType::List(Arc::clone(&inner_timestamp_field)),
-        )
-        .get_input_types();
-        assert!(result_type.is_err());
-
-        // TODO add other data type
-        Ok(())
-    }
-
-    #[test]
-    fn test_list_coercion() {
-        let lhs_type = DataType::List(Arc::new(Field::new("lhs", DataType::Int8, false)));
-
-        let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true)));
-
-        let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap();
-        assert_eq!(
-            coerced_type,
-            DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true)))
-        ); // nullable because the RHS is nullable
-    }
-
-    #[test]
-    fn test_type_coercion_logical_op() -> Result<()> {
-        test_coercion_binary_rule!(
-            DataType::Boolean,
-            DataType::Boolean,
-            Operator::And,
-            DataType::Boolean
-        );
-
-        test_coercion_binary_rule!(
-            DataType::Boolean,
-            DataType::Boolean,
-            Operator::Or,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Boolean,
-            DataType::Null,
-            Operator::And,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Boolean,
-            DataType::Null,
-            Operator::Or,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Null,
-            DataType::Null,
-            Operator::Or,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Null,
-            DataType::Null,
-            Operator::And,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Null,
-            DataType::Boolean,
-            Operator::And,
-            DataType::Boolean
-        );
-        test_coercion_binary_rule!(
-            DataType::Null,
-            DataType::Boolean,
-            Operator::Or,
-            DataType::Boolean
-        );
-        Ok(())
-    }
-
-    #[test]
-    fn test_map_coercion() -> Result<()> {
-        let lhs = Field::new_map(
-            "lhs",
-            "entries",
-            Arc::new(Field::new("keys", DataType::Utf8, false)),
-            Arc::new(Field::new("values", DataType::LargeUtf8, false)),
-            true,
-            false,
-        );
-        let rhs = Field::new_map(
-            "rhs",
-            "kvp",
-            Arc::new(Field::new("k", DataType::Utf8, false)),
-            Arc::new(Field::new("v", DataType::Utf8, true)),
-            false,
-            true,
-        );
-
-        let expected = Field::new_map(
-            "expected",
-            "entries", // struct coercion takes lhs name
-            Arc::new(Field::new(
-                "keys", // struct coercion takes lhs name
-                DataType::Utf8,
-                false,
-            )),
-            Arc::new(Field::new(
-                "values",            // struct coercion takes lhs name
-                DataType::LargeUtf8, // lhs is large string
-                true,                // rhs is nullable
-            )),
-            false, // both sides must be sorted
-            true,  // rhs is nullable
-        );
-
-        test_coercion_binary_rule!(
-            lhs.data_type(),
-            rhs.data_type(),
-            Operator::Eq,
-            expected.data_type().clone()
-        );
-        Ok(())
-    }
-}
+mod tests;
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs b/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs
new file mode 100644
index 0000000000000..eb5622fedb8aa
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/arithmetic.rs
@@ -0,0 +1,477 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+use datafusion_common::assert_contains;
+
+#[test]
+fn test_coercion_error() -> Result<()> {
+    let coercer =
+        BinaryTypeCoercer::new(&DataType::Float32, &Operator::Plus, &DataType::Utf8);
+    let result_type = coercer.get_input_types();
+
+    let e = result_type.unwrap_err();
+    assert_eq!(
+        e.strip_backtrace(),
+        "Error during planning: Cannot coerce arithmetic expression Float32 + Utf8 to valid types"
+    );
+    Ok(())
+}
+
+#[test]
+fn test_date_timestamp_arithmetic_error() -> Result<()> {
+    let (lhs, rhs) = BinaryTypeCoercer::new(
+        &DataType::Timestamp(Nanosecond, None),
+        &Operator::Minus,
+        &DataType::Timestamp(Millisecond, None),
+    )
+    .get_input_types()?;
+    assert_eq!(lhs, DataType::Timestamp(Millisecond, None));
+    assert_eq!(rhs, DataType::Timestamp(Millisecond, None));
+
+    let err =
+        BinaryTypeCoercer::new(&DataType::Date32, &Operator::Plus, &DataType::Date64)
+            .get_input_types()
+            .unwrap_err()
+            .to_string();
+
+    assert_contains!(
+        &err,
+        "Cannot get result type for temporal operation Date64 + Date64"
+    );
+
+    Ok(())
+}
+
+#[test]
+fn test_decimal_mathematics_op_type() {
+    // Decimal32
+    assert_eq!(
+        coerce_numeric_type_to_decimal32(&DataType::Int8).unwrap(),
+        DataType::Decimal32(3, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal32(&DataType::Int16).unwrap(),
+        DataType::Decimal32(5, 0)
+    );
+    assert!(coerce_numeric_type_to_decimal32(&DataType::Int32).is_none());
+    assert!(coerce_numeric_type_to_decimal32(&DataType::Int64).is_none(),);
+    assert_eq!(
+        coerce_numeric_type_to_decimal32(&DataType::Float16).unwrap(),
+        DataType::Decimal32(6, 3)
+    );
+    assert!(coerce_numeric_type_to_decimal32(&DataType::Float32).is_none(),);
+    assert!(coerce_numeric_type_to_decimal32(&DataType::Float64).is_none());
+
+    // Decimal64
+    assert_eq!(
+        coerce_numeric_type_to_decimal64(&DataType::Int8).unwrap(),
+        DataType::Decimal64(3, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal64(&DataType::Int16).unwrap(),
+        DataType::Decimal64(5, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal64(&DataType::Int32).unwrap(),
+        DataType::Decimal64(10, 0)
+    );
+    assert!(coerce_numeric_type_to_decimal64(&DataType::Int64).is_none(),);
+    assert_eq!(
+        coerce_numeric_type_to_decimal64(&DataType::Float16).unwrap(),
+        DataType::Decimal64(6, 3)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal64(&DataType::Float32).unwrap(),
+        DataType::Decimal64(14, 7)
+    );
+    assert!(coerce_numeric_type_to_decimal64(&DataType::Float64).is_none());
+
+    // Decimal128
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Int8).unwrap(),
+        DataType::Decimal128(3, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Int16).unwrap(),
+        DataType::Decimal128(5, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Int32).unwrap(),
+        DataType::Decimal128(10, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Int64).unwrap(),
+        DataType::Decimal128(20, 0)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Float16).unwrap(),
+        DataType::Decimal128(6, 3)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Float32).unwrap(),
+        DataType::Decimal128(14, 7)
+    );
+    assert_eq!(
+        coerce_numeric_type_to_decimal128(&DataType::Float64).unwrap(),
+        DataType::Decimal128(30, 15)
+    );
+}
+
+#[test]
+fn test_type_coercion_arithmetic() -> Result<()> {
+    use DataType::*;
+
+    // (Float64, _) | (_, Float64) => Some(Float64)
+    test_coercion_binary_rule_multiple!(
+        Float64,
+        [
+            Float64, Float32, Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16, Int8,
+            UInt8
+        ],
+        Operator::Plus,
+        Float64
+    );
+    // (_, Float32) | (Float32, _) => Some(Float32)
+    test_coercion_binary_rule_multiple!(
+        Float32,
+        [
+            Float32, Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16, Int8, UInt8
+        ],
+        Operator::Plus,
+        Float32
+    );
+    // (_, Float16) | (Float16, _) => Some(Float16)
+    test_coercion_binary_rule_multiple!(
+        Float16,
+        [
+            Float16, Int64, UInt64, Int32, UInt32, Int16, UInt16, Int8, UInt8
+        ],
+        Operator::Plus,
+        Float16
+    );
+    // (UInt64, Int64 | Int32 | Int16 | Int8) | (Int64 | Int32 | Int16 | Int8, UInt64)  => Some(Decimal128(20, 0))
+    test_coercion_binary_rule_multiple!(
+        UInt64,
+        [Int64, Int32, Int16, Int8],
+        Operator::Divide,
+        Decimal128(20, 0)
+    );
+    // (UInt64, _) | (_, UInt64) => Some(UInt64)
+    test_coercion_binary_rule_multiple!(
+        UInt64,
+        [UInt64, UInt32, UInt16, UInt8],
+        Operator::Modulo,
+        UInt64
+    );
+    // (Int64, _) | (_, Int64) => Some(Int64)
+    test_coercion_binary_rule_multiple!(
+        Int64,
+        [Int64, Int32, UInt32, Int16, UInt16, Int8, UInt8],
+        Operator::Modulo,
+        Int64
+    );
+    // (UInt32, Int32 | Int16 | Int8) | (Int32 | Int16 | Int8, UInt32) => Some(Int64)
+    test_coercion_binary_rule_multiple!(
+        UInt32,
+        [Int32, Int16, Int8],
+        Operator::Modulo,
+        Int64
+    );
+    // (UInt32, _) | (_, UInt32) => Some(UInt32)
+    test_coercion_binary_rule_multiple!(
+        UInt32,
+        [UInt32, UInt16, UInt8],
+        Operator::Modulo,
+        UInt32
+    );
+    // (Int32, _) | (_, Int32) => Some(Int32)
+    test_coercion_binary_rule_multiple!(
+        Int32,
+        [Int32, Int16, Int8],
+        Operator::Modulo,
+        Int32
+    );
+    // (UInt16, Int16 | Int8) | (Int16 | Int8, UInt16) => Some(Int32)
+    test_coercion_binary_rule_multiple!(UInt16, [Int16, Int8], Operator::Minus, Int32);
+    // (UInt16, _) | (_, UInt16) => Some(UInt16)
+    test_coercion_binary_rule_multiple!(
+        UInt16,
+        [UInt16, UInt8, UInt8],
+        Operator::Plus,
+        UInt16
+    );
+    // (Int16, _) | (_, Int16) => Some(Int16)
+    test_coercion_binary_rule_multiple!(Int16, [Int16, Int8], Operator::Plus, Int16);
+    // (UInt8, Int8) | (Int8, UInt8) => Some(Int16)
+    test_coercion_binary_rule!(Int8, UInt8, Operator::Minus, Int16);
+    test_coercion_binary_rule!(UInt8, Int8, Operator::Multiply, Int16);
+    // (UInt8, _) | (_, UInt8) => Some(UInt8)
+    test_coercion_binary_rule!(UInt8, UInt8, Operator::Minus, UInt8);
+    // (Int8, _) | (_, Int8) => Some(Int8)
+    test_coercion_binary_rule!(Int8, Int8, Operator::Plus, Int8);
+
+    Ok(())
+}
+
+#[test]
+fn test_bitwise_coercion_non_integer_types() -> Result<()> {
+    let err = BinaryTypeCoercer::new(
+        &DataType::Float32,
+        &Operator::BitwiseAnd,
+        &DataType::Float32,
+    )
+    .get_input_types()
+    .unwrap_err()
+    .to_string();
+    assert_contains!(
+        &err,
+        "Cannot infer common type for bitwise operation Float32 & Float32"
+    );
+
+    let err = BinaryTypeCoercer::new(
+        &DataType::Float32,
+        &Operator::BitwiseAnd,
+        &DataType::Float64,
+    )
+    .get_input_types()
+    .unwrap_err()
+    .to_string();
+    assert_contains!(
+        &err,
+        "Cannot infer common type for bitwise operation Float32 & Float64"
+    );
+
+    let err = BinaryTypeCoercer::new(
+        &DataType::Decimal128(10, 2),
+        &Operator::BitwiseAnd,
+        &DataType::Decimal128(10, 2),
+    )
+    .get_input_types()
+    .unwrap_err()
+    .to_string();
+    assert_contains!(
+        &err,
+        "Cannot infer common type for bitwise operation Decimal128(10, 2) & Decimal128(10, 2)"
+    );
+
+    let dict_int8 = DataType::Dictionary(DataType::Int8.into(), DataType::Int8.into());
+    test_coercion_binary_rule!(dict_int8, dict_int8, Operator::BitwiseAnd, dict_int8);
+
+    Ok(())
+}
+
+fn test_math_decimal_coercion_rule(
+    lhs_type: DataType,
+    rhs_type: DataType,
+    expected_lhs_type: DataType,
+    expected_rhs_type: DataType,
+) {
+    let (lhs_type, rhs_type) = math_decimal_coercion(&lhs_type, &rhs_type).unwrap();
+    assert_eq!(lhs_type, expected_lhs_type);
+    assert_eq!(rhs_type, expected_rhs_type);
+}
+
+#[test]
+fn test_coercion_arithmetic_decimal() -> Result<()> {
+    test_math_decimal_coercion_rule(
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::Int32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::Int32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::Int32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::Int32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::Int32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+
+    test_math_decimal_coercion_rule(
+        DataType::UInt32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+        DataType::Decimal128(10, 2),
+    );
+    test_math_decimal_coercion_rule(
+        DataType::Decimal128(10, 2),
+        DataType::UInt32,
+        DataType::Decimal128(10, 2),
+        DataType::Decimal128(10, 0),
+    );
+
+    Ok(())
+}
+
+#[test]
+fn test_coercion_arithmetic_decimal_cross_variant() -> Result<()> {
+    let test_cases = [
+        (
+            DataType::Decimal32(5, 2),
+            DataType::Decimal64(10, 3),
+            DataType::Decimal64(10, 3),
+            DataType::Decimal64(10, 3),
+        ),
+        (
+            DataType::Decimal32(7, 1),
+            DataType::Decimal128(15, 4),
+            DataType::Decimal128(15, 4),
+            DataType::Decimal128(15, 4),
+        ),
+        (
+            DataType::Decimal32(9, 0),
+            DataType::Decimal256(20, 5),
+            DataType::Decimal256(20, 5),
+            DataType::Decimal256(20, 5),
+        ),
+        (
+            DataType::Decimal64(12, 3),
+            DataType::Decimal128(18, 2),
+            DataType::Decimal128(19, 3),
+            DataType::Decimal128(19, 3),
+        ),
+        (
+            DataType::Decimal64(15, 4),
+            DataType::Decimal256(25, 6),
+            DataType::Decimal256(25, 6),
+            DataType::Decimal256(25, 6),
+        ),
+        (
+            DataType::Decimal128(20, 5),
+            DataType::Decimal256(30, 8),
+            DataType::Decimal256(30, 8),
+            DataType::Decimal256(30, 8),
+        ),
+        // Reverse order cases
+        (
+            DataType::Decimal64(10, 3),
+            DataType::Decimal32(5, 2),
+            DataType::Decimal64(10, 3),
+            DataType::Decimal64(10, 3),
+        ),
+        (
+            DataType::Decimal128(15, 4),
+            DataType::Decimal32(7, 1),
+            DataType::Decimal128(15, 4),
+            DataType::Decimal128(15, 4),
+        ),
+        (
+            DataType::Decimal256(20, 5),
+            DataType::Decimal32(9, 0),
+            DataType::Decimal256(20, 5),
+            DataType::Decimal256(20, 5),
+        ),
+        (
+            DataType::Decimal128(18, 2),
+            DataType::Decimal64(12, 3),
+            DataType::Decimal128(19, 3),
+            DataType::Decimal128(19, 3),
+        ),
+        (
+            DataType::Decimal256(25, 6),
+            DataType::Decimal64(15, 4),
+            DataType::Decimal256(25, 6),
+            DataType::Decimal256(25, 6),
+        ),
+        (
+            DataType::Decimal256(30, 8),
+            DataType::Decimal128(20, 5),
+            DataType::Decimal256(30, 8),
+            DataType::Decimal256(30, 8),
+        ),
+    ];
+
+    for (lhs_type, rhs_type, expected_lhs_type, expected_rhs_type) in test_cases {
+        test_math_decimal_coercion_rule(
+            lhs_type,
+            rhs_type,
+            expected_lhs_type,
+            expected_rhs_type,
+        );
+    }
+
+    Ok(())
+}
+
+#[test]
+fn test_decimal_precision_overflow_cross_variant() -> Result<()> {
+    // s = max(0, 1) = 1, range = max(76-0, 38-1) = 76, required_precision = 76 + 1 = 77 (overflow)
+    let result = get_wider_decimal_type_cross_variant(
+        &DataType::Decimal256(76, 0),
+        &DataType::Decimal128(38, 1),
+    );
+    assert!(result.is_none());
+
+    // s = max(0, 10) = 10, range = max(9-0, 18-10) = 9, required_precision = 9 + 10 = 19 (overflow > 18)
+    let result = get_wider_decimal_type_cross_variant(
+        &DataType::Decimal32(9, 0),
+        &DataType::Decimal64(18, 10),
+    );
+    assert!(result.is_none());
+
+    // s = max(5, 26) = 26, range = max(18-5, 38-26) = 13, required_precision = 13 + 26 = 39 (overflow > 38)
+    let result = get_wider_decimal_type_cross_variant(
+        &DataType::Decimal64(18, 5),
+        &DataType::Decimal128(38, 26),
+    );
+    assert!(result.is_none());
+
+    // s = max(10, 49) = 49, range = max(38-10, 76-49) = 28, required_precision = 28 + 49 = 77 (overflow > 76)
+    let result = get_wider_decimal_type_cross_variant(
+        &DataType::Decimal128(38, 10),
+        &DataType::Decimal256(76, 49),
+    );
+    assert!(result.is_none());
+
+    // s = max(2, 3) = 3, range = max(5-2, 10-3) = 7, required_precision = 7 + 3 = 10 (valid <= 18)
+    let result = get_wider_decimal_type_cross_variant(
+        &DataType::Decimal32(5, 2),
+        &DataType::Decimal64(10, 3),
+    );
+    assert!(result.is_some());
+
+    Ok(())
+}
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs
new file mode 100644
index 0000000000000..5d1b3bea75b0a
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs
@@ -0,0 +1,793 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+
+#[test]
+fn test_decimal_binary_comparison_coercion() -> Result<()> {
+    let input_decimal = DataType::Decimal128(20, 3);
+    let input_types = [
+        DataType::Int8,
+        DataType::Int16,
+        DataType::Int32,
+        DataType::Int64,
+        DataType::Float32,
+        DataType::Float64,
+        DataType::Decimal128(38, 10),
+        DataType::Decimal128(20, 8),
+        DataType::Null,
+    ];
+    let result_types = [
+        DataType::Decimal128(20, 3),
+        DataType::Decimal128(20, 3),
+        DataType::Decimal128(20, 3),
+        DataType::Decimal128(23, 3),
+        DataType::Decimal128(24, 7),
+        DataType::Decimal128(32, 15),
+        DataType::Decimal128(38, 10),
+        DataType::Decimal128(25, 8),
+        DataType::Decimal128(20, 3),
+    ];
+    let comparison_op_types = [
+        Operator::NotEq,
+        Operator::Eq,
+        Operator::Gt,
+        Operator::GtEq,
+        Operator::Lt,
+        Operator::LtEq,
+    ];
+    for (i, input_type) in input_types.iter().enumerate() {
+        let expect_type = &result_types[i];
+        for op in comparison_op_types {
+            let (lhs, rhs) = BinaryTypeCoercer::new(&input_decimal, &op, input_type)
+                .get_input_types()?;
+            assert_eq!(expect_type, &lhs);
+            assert_eq!(expect_type, &rhs);
+        }
+    }
+    // negative test
+    let result_type =
+        BinaryTypeCoercer::new(&input_decimal, &Operator::Eq, &DataType::Boolean)
+            .get_input_types();
+    assert!(result_type.is_err());
+    Ok(())
+}
+
+#[test]
+fn test_like_coercion() {
+    // string coerce to strings
+    test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8));
+    test_like_rule!(
+        DataType::LargeUtf8,
+        DataType::Utf8,
+        Some(DataType::LargeUtf8)
+    );
+    test_like_rule!(
+        DataType::Utf8,
+        DataType::LargeUtf8,
+        Some(DataType::LargeUtf8)
+    );
+    test_like_rule!(
+        DataType::LargeUtf8,
+        DataType::LargeUtf8,
+        Some(DataType::LargeUtf8)
+    );
+
+    // Also coerce binary to strings
+    test_like_rule!(DataType::Binary, DataType::Utf8, Some(DataType::Utf8));
+    test_like_rule!(
+        DataType::LargeBinary,
+        DataType::Utf8,
+        Some(DataType::LargeUtf8)
+    );
+    test_like_rule!(
+        DataType::Binary,
+        DataType::LargeUtf8,
+        Some(DataType::LargeUtf8)
+    );
+    test_like_rule!(
+        DataType::LargeBinary,
+        DataType::LargeUtf8,
+        Some(DataType::LargeUtf8)
+    );
+}
+
+#[test]
+fn test_type_coercion() -> Result<()> {
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Date32,
+        Operator::Eq,
+        DataType::Date32
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Date64,
+        Operator::Lt,
+        DataType::Date64
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Time32(Second),
+        Operator::Eq,
+        DataType::Time32(Second)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Time32(Millisecond),
+        Operator::Eq,
+        DataType::Time32(Millisecond)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Time64(Microsecond),
+        Operator::Eq,
+        DataType::Time64(Microsecond)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Time64(Nanosecond),
+        Operator::Eq,
+        DataType::Time64(Nanosecond)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Timestamp(Second, None),
+        Operator::Lt,
+        DataType::Timestamp(Nanosecond, None)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Timestamp(Millisecond, None),
+        Operator::Lt,
+        DataType::Timestamp(Nanosecond, None)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Timestamp(Microsecond, None),
+        Operator::Lt,
+        DataType::Timestamp(Nanosecond, None)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Timestamp(Nanosecond, None),
+        Operator::Lt,
+        DataType::Timestamp(Nanosecond, None)
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8,
+        Operator::RegexMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8View,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8View,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8,
+        Operator::RegexNotMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8View,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8View,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8,
+        Operator::RegexNotIMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Utf8View,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8View,
+        DataType::Utf8View,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8,
+        Operator::RegexMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8View,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8View,
+        Operator::RegexMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8,
+        Operator::RegexIMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8,
+        Operator::RegexIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8View,
+        Operator::RegexIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8View,
+        Operator::RegexIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8,
+        Operator::RegexNotMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8View,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8View,
+        Operator::RegexNotMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8,
+        Operator::RegexNotIMatch,
+        DataType::Utf8
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
+        DataType::Utf8View,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
+        DataType::Utf8View,
+        Operator::RegexNotIMatch,
+        DataType::Utf8View
+    );
+    test_coercion_binary_rule!(
+        DataType::Int16,
+        DataType::Int64,
+        Operator::BitwiseAnd,
+        DataType::Int64
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt64,
+        DataType::UInt64,
+        Operator::BitwiseAnd,
+        DataType::UInt64
+    );
+    test_coercion_binary_rule!(
+        DataType::Int8,
+        DataType::UInt32,
+        Operator::BitwiseAnd,
+        DataType::Int64
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt32,
+        DataType::Int32,
+        Operator::BitwiseAnd,
+        DataType::Int64
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt16,
+        DataType::Int16,
+        Operator::BitwiseAnd,
+        DataType::Int32
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt32,
+        DataType::UInt32,
+        Operator::BitwiseAnd,
+        DataType::UInt32
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt16,
+        DataType::UInt32,
+        Operator::BitwiseAnd,
+        DataType::UInt32
+    );
+    Ok(())
+}
+
+#[test]
+fn test_type_coercion_compare() -> Result<()> {
+    // boolean
+    test_coercion_binary_rule!(
+        DataType::Boolean,
+        DataType::Boolean,
+        Operator::Eq,
+        DataType::Boolean
+    );
+    // float
+    test_coercion_binary_rule!(
+        DataType::Float16,
+        DataType::Int64,
+        Operator::Eq,
+        DataType::Float16
+    );
+    test_coercion_binary_rule!(
+        DataType::Float16,
+        DataType::Float64,
+        Operator::Eq,
+        DataType::Float64
+    );
+    test_coercion_binary_rule!(
+        DataType::Float32,
+        DataType::Int64,
+        Operator::Eq,
+        DataType::Float32
+    );
+    test_coercion_binary_rule!(
+        DataType::Float32,
+        DataType::Float64,
+        Operator::GtEq,
+        DataType::Float64
+    );
+    // signed integer
+    test_coercion_binary_rule!(
+        DataType::Int8,
+        DataType::Int32,
+        Operator::LtEq,
+        DataType::Int32
+    );
+    test_coercion_binary_rule!(
+        DataType::Int64,
+        DataType::Int32,
+        Operator::LtEq,
+        DataType::Int64
+    );
+    // unsigned integer
+    test_coercion_binary_rule!(
+        DataType::UInt32,
+        DataType::UInt8,
+        Operator::Gt,
+        DataType::UInt32
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt64,
+        DataType::UInt8,
+        Operator::Eq,
+        DataType::UInt64
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt64,
+        DataType::Int64,
+        Operator::Eq,
+        DataType::Decimal128(20, 0)
+    );
+    // numeric/decimal
+    test_coercion_binary_rule!(
+        DataType::Int64,
+        DataType::Decimal128(10, 0),
+        Operator::Eq,
+        DataType::Decimal128(20, 0)
+    );
+    test_coercion_binary_rule!(
+        DataType::Int64,
+        DataType::Decimal128(10, 2),
+        Operator::Lt,
+        DataType::Decimal128(22, 2)
+    );
+    test_coercion_binary_rule!(
+        DataType::Float64,
+        DataType::Decimal128(10, 3),
+        Operator::Gt,
+        DataType::Decimal128(30, 15)
+    );
+    test_coercion_binary_rule!(
+        DataType::Int64,
+        DataType::Decimal128(10, 0),
+        Operator::Eq,
+        DataType::Decimal128(20, 0)
+    );
+    test_coercion_binary_rule!(
+        DataType::Decimal128(14, 2),
+        DataType::Decimal128(10, 3),
+        Operator::GtEq,
+        DataType::Decimal128(15, 3)
+    );
+    test_coercion_binary_rule!(
+        DataType::UInt64,
+        DataType::Decimal128(20, 0),
+        Operator::Eq,
+        DataType::Decimal128(20, 0)
+    );
+
+    // Binary
+    test_coercion_binary_rule!(
+        DataType::Binary,
+        DataType::Binary,
+        Operator::Eq,
+        DataType::Binary
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::Binary,
+        Operator::Eq,
+        DataType::Binary
+    );
+    test_coercion_binary_rule!(
+        DataType::Binary,
+        DataType::Utf8,
+        Operator::Eq,
+        DataType::Binary
+    );
+
+    // LargeBinary
+    test_coercion_binary_rule!(
+        DataType::LargeBinary,
+        DataType::LargeBinary,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::Binary,
+        DataType::LargeBinary,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeBinary,
+        DataType::Binary,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::Utf8,
+        DataType::LargeBinary,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeBinary,
+        DataType::Utf8,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeUtf8,
+        DataType::LargeBinary,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeBinary,
+        DataType::LargeUtf8,
+        Operator::Eq,
+        DataType::LargeBinary
+    );
+
+    // Timestamps
+    let utc: Option<Arc<str>> = Some("UTC".into());
+    test_coercion_binary_rule!(
+        DataType::Timestamp(Second, utc.clone()),
+        DataType::Timestamp(Second, utc.clone()),
+        Operator::Eq,
+        DataType::Timestamp(Second, utc.clone())
+    );
+    test_coercion_binary_rule!(
+        DataType::Timestamp(Second, utc.clone()),
+        DataType::Timestamp(Second, Some("Europe/Brussels".into())),
+        Operator::Eq,
+        DataType::Timestamp(Second, utc.clone())
+    );
+    test_coercion_binary_rule!(
+        DataType::Timestamp(Second, Some("America/New_York".into())),
+        DataType::Timestamp(Second, Some("Europe/Brussels".into())),
+        Operator::Eq,
+        DataType::Timestamp(Second, Some("America/New_York".into()))
+    );
+    test_coercion_binary_rule!(
+        DataType::Timestamp(Second, Some("Europe/Brussels".into())),
+        DataType::Timestamp(Second, utc),
+        Operator::Eq,
+        DataType::Timestamp(Second, Some("Europe/Brussels".into()))
+    );
+
+    // list
+    let inner_field = Arc::new(Field::new_list_field(DataType::Int64, true));
+    test_coercion_binary_rule!(
+        DataType::List(Arc::clone(&inner_field)),
+        DataType::List(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::List(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::List(Arc::clone(&inner_field)),
+        DataType::LargeList(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::LargeList(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeList(Arc::clone(&inner_field)),
+        DataType::List(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::LargeList(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeList(Arc::clone(&inner_field)),
+        DataType::LargeList(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::LargeList(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        Operator::Eq,
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10)
+    );
+    test_coercion_binary_rule!(
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        DataType::LargeList(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::LargeList(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::LargeList(Arc::clone(&inner_field)),
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        Operator::Eq,
+        DataType::LargeList(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::List(Arc::clone(&inner_field)),
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        Operator::Eq,
+        DataType::List(Arc::clone(&inner_field))
+    );
+    test_coercion_binary_rule!(
+        DataType::FixedSizeList(Arc::clone(&inner_field), 10),
+        DataType::List(Arc::clone(&inner_field)),
+        Operator::Eq,
+        DataType::List(Arc::clone(&inner_field))
+    );
+
+    let inner_timestamp_field = Arc::new(Field::new_list_field(
+        DataType::Timestamp(Microsecond, None),
+        true,
+    ));
+    let result_type = BinaryTypeCoercer::new(
+        &DataType::List(Arc::clone(&inner_field)),
+        &Operator::Eq,
+        &DataType::List(Arc::clone(&inner_timestamp_field)),
+    )
+    .get_input_types();
+    assert!(result_type.is_err());
+
+    Ok(())
+}
+
+#[test]
+fn test_list_coercion() {
+    let lhs_type = DataType::List(Arc::new(Field::new("lhs", DataType::Int8, false)));
+
+    let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true)));
+
+    let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap();
+    assert_eq!(
+        coerced_type,
+        DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true)))
+    );
+}
+
+#[test]
+fn test_map_coercion() -> Result<()> {
+    let lhs = Field::new_map(
+        "lhs",
+        "entries",
+        Arc::new(Field::new("keys", DataType::Utf8, false)),
+        Arc::new(Field::new("values", DataType::LargeUtf8, false)),
+        true,
+        false,
+    );
+    let rhs = Field::new_map(
+        "rhs",
+        "kvp",
+        Arc::new(Field::new("k", DataType::Utf8, false)),
+        Arc::new(Field::new("v", DataType::Utf8, true)),
+        false,
+        true,
+    );
+
+    let expected = Field::new_map(
+        "expected",
+        "entries",
+        Arc::new(Field::new("keys", DataType::Utf8, false)),
+        Arc::new(Field::new("values", DataType::LargeUtf8, true)),
+        false,
+        true,
+    );
+
+    test_coercion_binary_rule!(
+        lhs.data_type(),
+        rhs.data_type(),
+        Operator::Eq,
+        expected.data_type().clone()
+    );
+    Ok(())
+}
+
+#[test]
+fn test_decimal_cross_variant_comparison_coercion() -> Result<()> {
+    let test_cases = [
+        // (lhs, rhs, expected_result)
+        (
+            DataType::Decimal32(5, 2),
+            DataType::Decimal64(10, 3),
+            DataType::Decimal64(10, 3),
+        ),
+        (
+            DataType::Decimal32(7, 1),
+            DataType::Decimal128(15, 4),
+            DataType::Decimal128(15, 4),
+        ),
+        (
+            DataType::Decimal32(9, 0),
+            DataType::Decimal256(20, 5),
+            DataType::Decimal256(20, 5),
+        ),
+        (
+            DataType::Decimal64(12, 3),
+            DataType::Decimal128(18, 2),
+            DataType::Decimal128(19, 3),
+        ),
+        (
+            DataType::Decimal64(15, 4),
+            DataType::Decimal256(25, 6),
+            DataType::Decimal256(25, 6),
+        ),
+        (
+            DataType::Decimal128(20, 5),
+            DataType::Decimal256(30, 8),
+            DataType::Decimal256(30, 8),
+        ),
+        // Reverse order cases
+        (
+            DataType::Decimal64(10, 3),
+            DataType::Decimal32(5, 2),
+            DataType::Decimal64(10, 3),
+        ),
+        (
+            DataType::Decimal128(15, 4),
+            DataType::Decimal32(7, 1),
+            DataType::Decimal128(15, 4),
+        ),
+        (
+            DataType::Decimal256(20, 5),
+            DataType::Decimal32(9, 0),
+            DataType::Decimal256(20, 5),
+        ),
+        (
+            DataType::Decimal128(18, 2),
+            DataType::Decimal64(12, 3),
+            DataType::Decimal128(19, 3),
+        ),
+        (
+            DataType::Decimal256(25, 6),
+            DataType::Decimal64(15, 4),
+            DataType::Decimal256(25, 6),
+        ),
+        (
+            DataType::Decimal256(30, 8),
+            DataType::Decimal128(20, 5),
+            DataType::Decimal256(30, 8),
+        ),
+    ];
+
+    let comparison_op_types = [
+        Operator::NotEq,
+        Operator::Eq,
+        Operator::Gt,
+        Operator::GtEq,
+        Operator::Lt,
+        Operator::LtEq,
+    ];
+
+    for (lhs_type, rhs_type, expected_type) in test_cases {
+        for op in comparison_op_types {
+            let (lhs, rhs) =
+                BinaryTypeCoercer::new(&lhs_type, &op, &rhs_type).get_input_types()?;
+            assert_eq!(
+                expected_type, lhs,
+                "Coercion of type {lhs_type:?} with {rhs_type:?} resulted in unexpected type: {lhs:?}"
+            );
+            assert_eq!(
+                expected_type, rhs,
+                "Coercion of type {rhs_type:?} with {lhs_type:?} resulted in unexpected type: {rhs:?}"
+            );
+        }
+    }
+
+    Ok(())
+}
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs
new file mode 100644
index 0000000000000..0fb56a4a2c536
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+
+#[test]
+fn test_dictionary_type_coercion() {
+    use DataType::*;
+
+    let lhs_type = Dictionary(Box::new(Int8), Box::new(Int32));
+    let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Int32)
+    );
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Int32)
+    );
+
+    // Since we can coerce values of Int16 to Utf8 can support this
+    let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
+    let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Utf8)
+    );
+
+    // Since we can coerce values of Utf8 to Binary can support this
+    let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
+    let rhs_type = Dictionary(Box::new(Int8), Box::new(Binary));
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Binary)
+    );
+
+    let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
+    let rhs_type = Utf8;
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Utf8)
+    );
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(lhs_type.clone())
+    );
+
+    let lhs_type = Utf8;
+    let rhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Utf8)
+    );
+    assert_eq!(
+        dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(rhs_type.clone())
+    );
+}
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/mod.rs b/datafusion/expr-common/src/type_coercion/binary/tests/mod.rs
new file mode 100644
index 0000000000000..e4653d4955eb0
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/mod.rs
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+
+// Common test macros
+
+/// Tests that coercion for a binary operator between two types yields the expected result type for both sides.
+///
+/// Usage: test_coercion_binary_rule!(lhs_type, rhs_type, op, expected_type)
+/// - lhs_type: The left-hand side data type
+/// - rhs_type: The right-hand side data type
+/// - op: The binary operator (e.g., "+", "-", etc.)
+/// - expected_type: The type both sides should be coerced to
+macro_rules! test_coercion_binary_rule {
+    ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{
+        let (lhs, rhs) =
+            BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &$RHS_TYPE).get_input_types()?;
+        assert_eq!(lhs, $RESULT_TYPE);
+        assert_eq!(rhs, $RESULT_TYPE);
+    }};
+}
+
+/// Tests that coercion for a binary operator between one type and multiple right-hand side types
+/// yields the expected result type for both sides, in both lhs/rhs and rhs/lhs order.
+///
+/// Usage: test_coercion_binary_rule_multiple!(lhs_type, rhs_types, op, expected_type)
+/// - lhs_type: The left-hand side data type
+/// - rhs_types: An iterable of right-hand side data types
+/// - op: The binary operator
+/// - expected_type: The type both sides should be coerced to
+macro_rules! test_coercion_binary_rule_multiple {
+    ($LHS_TYPE:expr, $RHS_TYPES:expr, $OP:expr, $RESULT_TYPE:expr) => {{
+        for rh_type in $RHS_TYPES {
+            let (lhs, rhs) =
+                BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &rh_type).get_input_types()?;
+            assert_eq!(lhs, $RESULT_TYPE);
+            assert_eq!(rhs, $RESULT_TYPE);
+
+            BinaryTypeCoercer::new(&rh_type, &$OP, &$LHS_TYPE).get_input_types()?;
+            assert_eq!(lhs, $RESULT_TYPE);
+            assert_eq!(rhs, $RESULT_TYPE);
+        }
+    }};
+}
+
+/// Tests that the like_coercion function returns the expected result type for both lhs/rhs and rhs/lhs order.
+///
+/// Usage: test_like_rule!(lhs_type, rhs_type, expected_type)
+/// - lhs_type: The left-hand side data type
+/// - rhs_type: The right-hand side data type
+/// - expected_type: The expected result type from like_coercion
+macro_rules! test_like_rule {
+    ($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{
+        let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE);
+        assert_eq!(result, $RESULT_TYPE);
+        let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE);
+        assert_eq!(result, $RESULT_TYPE);
+    }};
+}
+
+mod arithmetic;
+mod comparison;
+mod dictionary;
+mod null_coercion;
+mod run_end_encoded;
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/null_coercion.rs b/datafusion/expr-common/src/type_coercion/binary/tests/null_coercion.rs
new file mode 100644
index 0000000000000..91c826b563c7c
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/null_coercion.rs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+
+#[test]
+fn test_type_coercion_logical_op() -> Result<()> {
+    test_coercion_binary_rule!(
+        DataType::Boolean,
+        DataType::Boolean,
+        Operator::And,
+        DataType::Boolean
+    );
+
+    test_coercion_binary_rule!(
+        DataType::Boolean,
+        DataType::Boolean,
+        Operator::Or,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Boolean,
+        DataType::Null,
+        Operator::And,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Boolean,
+        DataType::Null,
+        Operator::Or,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Null,
+        DataType::Null,
+        Operator::Or,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Null,
+        DataType::Null,
+        Operator::And,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Null,
+        DataType::Boolean,
+        Operator::And,
+        DataType::Boolean
+    );
+    test_coercion_binary_rule!(
+        DataType::Null,
+        DataType::Boolean,
+        Operator::Or,
+        DataType::Boolean
+    );
+    Ok(())
+}
diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs
new file mode 100644
index 0000000000000..9997db7a82688
--- /dev/null
+++ b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::*;
+
+#[test]
+fn test_ree_type_coercion() {
+    use DataType::*;
+
+    let lhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Int32, false)),
+    );
+    let rhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Int16, false)),
+    );
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Int32)
+    );
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Int32)
+    );
+
+    // Since we can coerce values of Int16 to Utf8 can support this: Coercion of Int16 to Utf8
+    let lhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Utf8, false)),
+    );
+    let rhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Int16, false)),
+    );
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Utf8)
+    );
+
+    // Since we can coerce values of Utf8 to Binary can support this
+    let lhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Utf8, false)),
+    );
+    let rhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Binary, false)),
+    );
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(Binary)
+    );
+    let lhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Utf8, false)),
+    );
+    let rhs_type = Utf8;
+    // Don't preserve REE
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Utf8)
+    );
+    // Preserve REE
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(lhs_type.clone())
+    );
+
+    let lhs_type = Utf8;
+    let rhs_type = RunEndEncoded(
+        Arc::new(Field::new("run_ends", Int8, false)),
+        Arc::new(Field::new("values", Utf8, false)),
+    );
+    // Don't preserve REE
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, false),
+        Some(Utf8)
+    );
+    // Preserve REE
+    assert_eq!(
+        ree_comparison_coercion(&lhs_type, &rhs_type, true),
+        Some(rhs_type.clone())
+    );
+}
diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml
index d77c59ff64e17..6990714585001 100644
--- a/datafusion/expr/Cargo.toml
+++ b/datafusion/expr/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -38,22 +41,25 @@ workspace = true
 name = "datafusion_expr"
 
 [features]
+default = ["sql"]
 recursive_protection = ["dep:recursive"]
+sql = ["sqlparser"]
 
 [dependencies]
 arrow = { workspace = true }
+async-trait = { workspace = true }
 chrono = { workspace = true }
-datafusion-common = { workspace = true }
+datafusion-common = { workspace = true, default-features = false }
 datafusion-doc = { workspace = true }
 datafusion-expr-common = { workspace = true }
 datafusion-functions-aggregate-common = { workspace = true }
 datafusion-functions-window-common = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 indexmap = { workspace = true }
-paste = "^1.0"
+itertools = { workspace = true }
 recursive = { workspace = true, optional = true }
 serde_json = { workspace = true }
-sqlparser = { workspace = true }
+sqlparser = { workspace = true, optional = true }
 
 [dev-dependencies]
 ctor = { workspace = true }
diff --git a/datafusion/expr/README.md b/datafusion/expr/README.md
index b086f930e871b..b3ab9a383dbbd 100644
--- a/datafusion/expr/README.md
+++ b/datafusion/expr/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Logical Plan and Expressions
+# Apache DataFusion Logical Plan and Expressions
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides data types and utilities for logical plans and expressions.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/expr/src/arguments.rs b/datafusion/expr/src/arguments.rs
new file mode 100644
index 0000000000000..f10cf50f60b24
--- /dev/null
+++ b/datafusion/expr/src/arguments.rs
@@ -0,0 +1,674 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Argument resolution logic for named function parameters
+
+use crate::Expr;
+use datafusion_common::{Result, plan_err};
+
+/// Represents a named function argument with its original case and quote information.
+///
+/// This struct preserves whether an identifier was quoted in the SQL, which determines
+/// whether case-sensitive or case-insensitive matching should be used per SQL standards.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ArgumentName {
+    /// The argument name in its original case as it appeared in the SQL
+    pub value: String,
+    /// Whether the identifier was quoted (e.g., "STR" vs STR)
+    /// - true: quoted identifier, requires case-sensitive matching
+    /// - false: unquoted identifier, uses case-insensitive matching
+    pub is_quoted: bool,
+}
+
+/// Resolves function arguments, handling named and positional notation.
+///
+/// This function validates and reorders arguments to match the function's parameter names
+/// when named arguments are used.
+///
+/// # Rules
+/// - All positional arguments must come before named arguments
+/// - Named arguments can be in any order after positional arguments
+/// - Parameter names follow SQL identifier rules: unquoted names are case-insensitive
+///   (normalized to lowercase), quoted names are case-sensitive
+/// - No duplicate parameter names allowed
+///
+/// # Arguments
+/// * `param_names` - The function's parameter names in order
+/// * `args` - The argument expressions
+/// * `arg_names` - Optional parameter name for each argument
+///
+/// # Returns
+/// A vector of expressions in the correct order matching the parameter names
+///
+/// # Examples
+/// ```text
+/// Given parameters ["a", "b", "c"]
+/// And call: func(10, c => 30, b => 20)
+/// Returns: [Expr(10), Expr(20), Expr(30)]
+/// ```
+pub fn resolve_function_arguments(
+    param_names: &[String],
+    args: Vec<Expr>,
+    arg_names: Vec<Option<ArgumentName>>,
+) -> Result<Vec<Expr>> {
+    if args.len() != arg_names.len() {
+        return plan_err!(
+            "Internal error: args length ({}) != arg_names length ({})",
+            args.len(),
+            arg_names.len()
+        );
+    }
+
+    // Check if all arguments are positional (fast path)
+    if arg_names.iter().all(|name| name.is_none()) {
+        return Ok(args);
+    }
+
+    validate_argument_order(&arg_names)?;
+
+    reorder_named_arguments(param_names, args, arg_names)
+}
+
+/// Validates that positional arguments come before named arguments
+fn validate_argument_order(arg_names: &[Option<ArgumentName>]) -> Result<()> {
+    let mut seen_named = false;
+    for (i, arg_name) in arg_names.iter().enumerate() {
+        match arg_name {
+            Some(_) => seen_named = true,
+            None if seen_named => {
+                return plan_err!(
+                    "Positional argument at position {} follows named argument. \
+                     All positional arguments must come before named arguments.",
+                    i
+                );
+            }
+            None => {}
+        }
+    }
+    Ok(())
+}
+
+/// Reorders arguments based on named parameters to match signature order
+fn reorder_named_arguments(
+    param_names: &[String],
+    args: Vec<Expr>,
+    arg_names: Vec<Option<ArgumentName>>,
+) -> Result<Vec<Expr>> {
+    let positional_count = arg_names.iter().filter(|n| n.is_none()).count();
+
+    // Capture args length before consuming the vector
+    let args_len = args.len();
+
+    let expected_arg_count = param_names.len();
+
+    if positional_count > expected_arg_count {
+        return plan_err!(
+            "Too many positional arguments: expected at most {}, got {}",
+            expected_arg_count,
+            positional_count
+        );
+    }
+
+    let mut result: Vec<Option<Expr>> = vec![None; expected_arg_count];
+
+    for (i, (arg, arg_name)) in args.into_iter().zip(arg_names).enumerate() {
+        if let Some(arg_name) = arg_name {
+            // Named argument - find parameter index using linear search
+            // Match based on SQL identifier rules:
+            // - Quoted identifiers: case-sensitive (exact match)
+            // - Unquoted identifiers: case-insensitive match
+            let param_index = param_names
+                .iter()
+                .position(|p| {
+                    if arg_name.is_quoted {
+                        // Quoted: exact case match
+                        p == &arg_name.value
+                    } else {
+                        // Unquoted: case-insensitive match
+                        p.eq_ignore_ascii_case(&arg_name.value)
+                    }
+                })
+                .ok_or_else(|| {
+                    datafusion_common::plan_datafusion_err!(
+                        "Unknown parameter name '{}'. Valid parameters are: [{}]",
+                        arg_name.value,
+                        param_names.join(", ")
+                    )
+                })?;
+
+            if result[param_index].is_some() {
+                return plan_err!(
+                    "Parameter '{}' specified multiple times",
+                    arg_name.value
+                );
+            }
+
+            result[param_index] = Some(arg);
+        } else {
+            result[i] = Some(arg);
+        }
+    }
+
+    // Only require parameters up to the number of arguments provided (supports optional parameters)
+    let required_count = args_len;
+    for i in 0..required_count {
+        if result[i].is_none() {
+            return plan_err!("Missing required parameter '{}'", param_names[i]);
+        }
+    }
+
+    // Return only the assigned parameters (handles optional trailing parameters)
+    Ok(result.into_iter().take(required_count).flatten().collect())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::lit;
+
+    #[test]
+    fn test_all_positional() {
+        let param_names = vec!["a".to_string(), "b".to_string()];
+
+        let args = vec![lit(1), lit("hello")];
+        let arg_names = vec![None, None];
+
+        let result =
+            resolve_function_arguments(&param_names, args.clone(), arg_names).unwrap();
+        assert_eq!(result.len(), 2);
+    }
+
+    #[test]
+    fn test_all_named() {
+        let param_names = vec!["a".to_string(), "b".to_string()];
+
+        let args = vec![lit(1), lit("hello")];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "b".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names).unwrap();
+        assert_eq!(result.len(), 2);
+    }
+
+    #[test]
+    fn test_case_insensitive_parameter_matching() {
+        // Parameter names in function signature (lowercase)
+        let param_names = vec!["startpos".to_string(), "length".to_string()];
+
+        // Unquoted arguments with different casing should match case-insensitively
+        let args = vec![lit(1), lit(10)];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "STARTPOS".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "LENGTH".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names).unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0], lit(1));
+        assert_eq!(result[1], lit(10));
+
+        // Test with reordering and different cases
+        let args2 = vec![lit(20), lit(5)];
+        let arg_names2 = vec![
+            Some(ArgumentName {
+                value: "Length".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "StartPos".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result2 =
+            resolve_function_arguments(&param_names, args2, arg_names2).unwrap();
+        assert_eq!(result2.len(), 2);
+        assert_eq!(result2[0], lit(5)); // startpos
+        assert_eq!(result2[1], lit(20)); // length
+    }
+
+    #[test]
+    fn test_quoted_parameter_case_sensitive() {
+        // Parameter names in function signature (lowercase)
+        let param_names = vec!["str".to_string(), "start_pos".to_string()];
+
+        // Quoted identifiers with wrong case should fail
+        let args = vec![lit("hello"), lit(1)];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "STR".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "start_pos".to_string(),
+                is_quoted: true,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Unknown parameter")
+        );
+
+        // Quoted identifiers with correct case should succeed
+        let args2 = vec![lit("hello"), lit(1)];
+        let arg_names2 = vec![
+            Some(ArgumentName {
+                value: "str".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "start_pos".to_string(),
+                is_quoted: true,
+            }),
+        ];
+
+        let result2 =
+            resolve_function_arguments(&param_names, args2, arg_names2).unwrap();
+        assert_eq!(result2.len(), 2);
+        assert_eq!(result2[0], lit("hello"));
+        assert_eq!(result2[1], lit(1));
+    }
+
+    #[test]
+    fn test_named_reordering() {
+        let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+
+        // Call with: func(c => 3.0, a => 1, b => "hello")
+        let args = vec![lit(3.0), lit(1), lit("hello")];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "c".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "b".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names).unwrap();
+
+        // Should be reordered to [a, b, c] = [1, "hello", 3.0]
+        assert_eq!(result.len(), 3);
+        assert_eq!(result[0], lit(1));
+        assert_eq!(result[1], lit("hello"));
+        assert_eq!(result[2], lit(3.0));
+    }
+
+    #[test]
+    fn test_mixed_positional_and_named() {
+        let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+
+        // Call with: func(1, c => 3.0, b => "hello")
+        let args = vec![lit(1), lit(3.0), lit("hello")];
+        let arg_names = vec![
+            None,
+            Some(ArgumentName {
+                value: "c".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "b".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names).unwrap();
+
+        // Should be reordered to [a, b, c] = [1, "hello", 3.0]
+        assert_eq!(result.len(), 3);
+        assert_eq!(result[0], lit(1));
+        assert_eq!(result[1], lit("hello"));
+        assert_eq!(result[2], lit(3.0));
+    }
+
+    #[test]
+    fn test_positional_after_named_error() {
+        let param_names = vec!["a".to_string(), "b".to_string()];
+
+        // Call with: func(a => 1, "hello") - ERROR
+        let args = vec![lit(1), lit("hello")];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+            None,
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Positional argument")
+        );
+    }
+
+    #[test]
+    fn test_unknown_parameter_name() {
+        let param_names = vec!["a".to_string(), "b".to_string()];
+
+        // Call with: func(x => 1, b => "hello") - ERROR
+        let args = vec![lit(1), lit("hello")];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "x".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "b".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Unknown parameter")
+        );
+    }
+
+    #[test]
+    fn test_duplicate_parameter_name() {
+        let param_names = vec!["a".to_string(), "b".to_string()];
+
+        // Call with: func(a => 1, a => 2) - ERROR
+        let args = vec![lit(1), lit(2)];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("specified multiple times")
+        );
+    }
+
+    #[test]
+    fn test_missing_required_parameter() {
+        let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()];
+
+        // Call with: func(a => 1, c => 3.0) - missing 'b'
+        let args = vec![lit(1), lit(3.0)];
+        let arg_names = vec![
+            Some(ArgumentName {
+                value: "a".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "c".to_string(),
+                is_quoted: false,
+            }),
+        ];
+
+        let result = resolve_function_arguments(&param_names, args, arg_names);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Missing required parameter")
+        );
+    }
+
+    #[test]
+    fn test_mixed_case_signature_unquoted_matching() {
+        // Test with mixed-case signature parameters (lowercase, camelCase, UPPERCASE)
+        // This proves case-insensitive matching works for unquoted identifiers
+        let param_names = vec![
+            "prefix".to_string(),   // lowercase
+            "startPos".to_string(), // camelCase
+            "LENGTH".to_string(),   // UPPERCASE
+        ];
+
+        // Test 1: All lowercase unquoted arguments should match
+        let args1 = vec![lit("a"), lit(1), lit(5)];
+        let arg_names1 = vec![
+            Some(ArgumentName {
+                value: "prefix".to_string(),
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "startpos".to_string(), // lowercase version of startPos
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "length".to_string(), // lowercase version of LENGTH
+                is_quoted: false,
+            }),
+        ];
+
+        let result1 =
+            resolve_function_arguments(&param_names, args1, arg_names1).unwrap();
+        assert_eq!(result1.len(), 3);
+        assert_eq!(result1[0], lit("a"));
+        assert_eq!(result1[1], lit(1));
+        assert_eq!(result1[2], lit(5));
+
+        // Test 2: All uppercase unquoted arguments should match
+        let args2 = vec![lit("b"), lit(2), lit(10)];
+        let arg_names2 = vec![
+            Some(ArgumentName {
+                value: "PREFIX".to_string(), // uppercase version of prefix
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "STARTPOS".to_string(), // uppercase version of startPos
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "LENGTH".to_string(), // matches UPPERCASE
+                is_quoted: false,
+            }),
+        ];
+
+        let result2 =
+            resolve_function_arguments(&param_names, args2, arg_names2).unwrap();
+        assert_eq!(result2.len(), 3);
+        assert_eq!(result2[0], lit("b"));
+        assert_eq!(result2[1], lit(2));
+        assert_eq!(result2[2], lit(10));
+
+        // Test 3: Mixed case unquoted arguments should match
+        let args3 = vec![lit("c"), lit(3), lit(15)];
+        let arg_names3 = vec![
+            Some(ArgumentName {
+                value: "Prefix".to_string(), // Title case
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "StartPos".to_string(), // matches camelCase
+                is_quoted: false,
+            }),
+            Some(ArgumentName {
+                value: "Length".to_string(), // Title case
+                is_quoted: false,
+            }),
+        ];
+
+        let result3 =
+            resolve_function_arguments(&param_names, args3, arg_names3).unwrap();
+        assert_eq!(result3.len(), 3);
+        assert_eq!(result3[0], lit("c"));
+        assert_eq!(result3[1], lit(3));
+        assert_eq!(result3[2], lit(15));
+    }
+
+    #[test]
+    fn test_mixed_case_signature_quoted_matching() {
+        // Test that quoted identifiers require exact case match with signature
+        let param_names = vec![
+            "prefix".to_string(),   // lowercase
+            "startPos".to_string(), // camelCase
+            "LENGTH".to_string(),   // UPPERCASE
+        ];
+
+        // Test 1: Quoted with wrong case should fail for "prefix"
+        let args_wrong_prefix = vec![lit("a"), lit(1), lit(5)];
+        let arg_names_wrong_prefix = vec![
+            Some(ArgumentName {
+                value: "PREFIX".to_string(), // Wrong case
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "startPos".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "LENGTH".to_string(),
+                is_quoted: true,
+            }),
+        ];
+
+        let result = resolve_function_arguments(
+            &param_names,
+            args_wrong_prefix,
+            arg_names_wrong_prefix,
+        );
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Unknown parameter")
+        );
+
+        // Test 2: Quoted with wrong case should fail for "startPos"
+        let args_wrong_startpos = vec![lit("a"), lit(1), lit(5)];
+        let arg_names_wrong_startpos = vec![
+            Some(ArgumentName {
+                value: "prefix".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "STARTPOS".to_string(), // Wrong case
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "LENGTH".to_string(),
+                is_quoted: true,
+            }),
+        ];
+
+        let result2 = resolve_function_arguments(
+            &param_names,
+            args_wrong_startpos,
+            arg_names_wrong_startpos,
+        );
+        assert!(result2.is_err());
+        assert!(
+            result2
+                .unwrap_err()
+                .to_string()
+                .contains("Unknown parameter")
+        );
+
+        // Test 3: Quoted with wrong case should fail for "LENGTH"
+        let args_wrong_length = vec![lit("a"), lit(1), lit(5)];
+        let arg_names_wrong_length = vec![
+            Some(ArgumentName {
+                value: "prefix".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "startPos".to_string(),
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "length".to_string(), // Wrong case
+                is_quoted: true,
+            }),
+        ];
+
+        let result3 = resolve_function_arguments(
+            &param_names,
+            args_wrong_length,
+            arg_names_wrong_length,
+        );
+        assert!(result3.is_err());
+        assert!(
+            result3
+                .unwrap_err()
+                .to_string()
+                .contains("Unknown parameter")
+        );
+
+        // Test 4: Quoted with exact case should succeed
+        let args_correct = vec![lit("a"), lit(1), lit(5)];
+        let arg_names_correct = vec![
+            Some(ArgumentName {
+                value: "prefix".to_string(), // Exact match
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "startPos".to_string(), // Exact match
+                is_quoted: true,
+            }),
+            Some(ArgumentName {
+                value: "LENGTH".to_string(), // Exact match
+                is_quoted: true,
+            }),
+        ];
+
+        let result4 =
+            resolve_function_arguments(&param_names, args_correct, arg_names_correct)
+                .unwrap();
+        assert_eq!(result4.len(), 3);
+        assert_eq!(result4[0], lit("a"));
+        assert_eq!(result4[1], lit(1));
+        assert_eq!(result4[2], lit(5));
+    }
+}
diff --git a/datafusion/expr/src/async_udf.rs b/datafusion/expr/src/async_udf.rs
new file mode 100644
index 0000000000000..8afdfda68dea0
--- /dev/null
+++ b/datafusion/expr/src/async_udf.rs
@@ -0,0 +1,260 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl};
+use arrow::datatypes::{DataType, FieldRef};
+use async_trait::async_trait;
+use datafusion_common::error::Result;
+use datafusion_common::internal_err;
+use datafusion_expr_common::columnar_value::ColumnarValue;
+use datafusion_expr_common::signature::Signature;
+use std::any::Any;
+use std::fmt::{Debug, Display};
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+/// A scalar UDF that can invoke using async methods
+///
+/// Note this is less efficient than the ScalarUDFImpl, but it can be used
+/// to register remote functions in the context.
+///
+/// The name is chosen to mirror ScalarUDFImpl
+#[async_trait]
+pub trait AsyncScalarUDFImpl: ScalarUDFImpl {
+    /// The ideal batch size for this function.
+    ///
+    /// This is used to determine what size of data to be evaluated at once.
+    /// If None, the whole batch will be evaluated at once.
+    fn ideal_batch_size(&self) -> Option<usize> {
+        None
+    }
+
+    /// Invoke the function asynchronously with the async arguments
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue>;
+}
+
+/// A scalar UDF that must be invoked using async methods
+///
+/// Note this is not meant to be used directly, but is meant to be an implementation detail
+/// for AsyncUDFImpl.
+#[derive(Debug)]
+pub struct AsyncScalarUDF {
+    inner: Arc<dyn AsyncScalarUDFImpl>,
+}
+
+impl PartialEq for AsyncScalarUDF {
+    fn eq(&self, other: &Self) -> bool {
+        // Deconstruct to catch any new fields added in future
+        let Self { inner } = self;
+        inner.dyn_eq(other.inner.as_any())
+    }
+}
+impl Eq for AsyncScalarUDF {}
+
+impl Hash for AsyncScalarUDF {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        // Deconstruct to catch any new fields added in future
+        let Self { inner } = self;
+        inner.dyn_hash(state);
+    }
+}
+
+impl AsyncScalarUDF {
+    pub fn new(inner: Arc<dyn AsyncScalarUDFImpl>) -> Self {
+        Self { inner }
+    }
+
+    /// The ideal batch size for this function
+    pub fn ideal_batch_size(&self) -> Option<usize> {
+        self.inner.ideal_batch_size()
+    }
+
+    /// Turn this AsyncUDF into a ScalarUDF, suitable for
+    /// registering in the context
+    pub fn into_scalar_udf(self) -> ScalarUDF {
+        ScalarUDF::new_from_impl(self)
+    }
+
+    /// Invoke the function asynchronously with the async arguments
+    pub async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        self.inner.invoke_async_with_args(args).await
+    }
+}
+
+impl ScalarUDFImpl for AsyncScalarUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        self.inner.name()
+    }
+
+    fn signature(&self) -> &Signature {
+        self.inner.signature()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        self.inner.return_type(arg_types)
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        self.inner.return_field_from_args(args)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("async functions should not be called directly")
+    }
+}
+
+impl Display for AsyncScalarUDF {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "AsyncScalarUDF: {}", self.inner.name())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        hash::{DefaultHasher, Hash, Hasher},
+        sync::Arc,
+    };
+
+    use arrow::datatypes::DataType;
+    use async_trait::async_trait;
+    use datafusion_common::error::Result;
+    use datafusion_expr_common::{columnar_value::ColumnarValue, signature::Signature};
+
+    use crate::{
+        ScalarFunctionArgs, ScalarUDFImpl,
+        async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl},
+    };
+
+    #[derive(Debug, PartialEq, Eq, Hash, Clone)]
+    struct TestAsyncUDFImpl1 {
+        a: i32,
+    }
+
+    impl ScalarUDFImpl for TestAsyncUDFImpl1 {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            todo!()
+        }
+
+        fn signature(&self) -> &Signature {
+            todo!()
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            todo!()
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            todo!()
+        }
+    }
+
+    #[async_trait]
+    impl AsyncScalarUDFImpl for TestAsyncUDFImpl1 {
+        async fn invoke_async_with_args(
+            &self,
+            _args: ScalarFunctionArgs,
+        ) -> Result<ColumnarValue> {
+            todo!()
+        }
+    }
+
+    #[derive(Debug, PartialEq, Eq, Hash, Clone)]
+    struct TestAsyncUDFImpl2 {
+        a: i32,
+    }
+
+    impl ScalarUDFImpl for TestAsyncUDFImpl2 {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            todo!()
+        }
+
+        fn signature(&self) -> &Signature {
+            todo!()
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            todo!()
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            todo!()
+        }
+    }
+
+    #[async_trait]
+    impl AsyncScalarUDFImpl for TestAsyncUDFImpl2 {
+        async fn invoke_async_with_args(
+            &self,
+            _args: ScalarFunctionArgs,
+        ) -> Result<ColumnarValue> {
+            todo!()
+        }
+    }
+
+    fn hash<T: Hash>(value: &T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
+
+    #[test]
+    fn test_async_udf_partial_eq_and_hash() {
+        // Inner is same cloned arc -> equal
+        let inner = Arc::new(TestAsyncUDFImpl1 { a: 1 });
+        let a = AsyncScalarUDF::new(Arc::clone(&inner) as Arc<dyn AsyncScalarUDFImpl>);
+        let b = AsyncScalarUDF::new(inner);
+        assert_eq!(a, b);
+        assert_eq!(hash(&a), hash(&b));
+
+        // Inner is distinct arc -> still equal
+        let a = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl1 { a: 1 }));
+        let b = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl1 { a: 1 }));
+        assert_eq!(a, b);
+        assert_eq!(hash(&a), hash(&b));
+
+        // Negative case: inner is different value -> not equal
+        let a = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl1 { a: 1 }));
+        let b = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl1 { a: 2 }));
+        assert_ne!(a, b);
+        assert_ne!(hash(&a), hash(&b));
+
+        // Negative case: different functions -> not equal
+        let a = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl1 { a: 1 }));
+        let b = AsyncScalarUDF::new(Arc::new(TestAsyncUDFImpl2 { a: 1 }));
+        assert_ne!(a, b);
+        assert_ne!(hash(&a), hash(&b));
+    }
+}
diff --git a/datafusion/expr/src/conditional_expressions.rs b/datafusion/expr/src/conditional_expressions.rs
index 9cb51612d0cab..10a9fd6948e4f 100644
--- a/datafusion/expr/src/conditional_expressions.rs
+++ b/datafusion/expr/src/conditional_expressions.rs
@@ -17,11 +17,13 @@
 
 //! Conditional expressions
 use crate::expr::Case;
-use crate::{expr_schema::ExprSchemable, Expr};
+use crate::{Expr, expr_schema::ExprSchemable};
 use arrow::datatypes::DataType;
-use datafusion_common::{plan_err, DFSchema, HashSet, Result};
+use datafusion_common::{DFSchema, HashSet, Result, plan_err};
+use itertools::Itertools as _;
 
 /// Helper struct for building [Expr::Case]
+#[derive(Debug, Clone)]
 pub struct CaseBuilder {
     expr: Option<Box<Expr>>,
     when_expr: Vec<Expr>,
@@ -72,7 +74,7 @@ impl CaseBuilder {
         let then_types: Vec<DataType> = then_expr
             .iter()
             .map(|e| match e {
-                Expr::Literal(_) => e.get_type(&DFSchema::empty()),
+                Expr::Literal(_, _) => e.get_type(&DFSchema::empty()),
                 _ => Ok(DataType::Null),
             })
             .collect::<Result<Vec<_>>>()?;
@@ -81,9 +83,12 @@ impl CaseBuilder {
             // Cannot verify types until execution type
         } else {
             let unique_types: HashSet<&DataType> = then_types.iter().collect();
-            if unique_types.len() != 1 {
+            if unique_types.is_empty() {
+                return plan_err!("CASE expression 'then' values had no data types");
+            } else if unique_types.len() != 1 {
                 return plan_err!(
-                    "CASE expression 'then' values had multiple data types: {unique_types:?}"
+                    "CASE expression 'then' values had multiple data types: {}",
+                    unique_types.iter().join(", ")
                 );
             }
         }
diff --git a/datafusion/expr/src/execution_props.rs b/datafusion/expr/src/execution_props.rs
index d672bd1acc460..3bf6978eb60ee 100644
--- a/datafusion/expr/src/execution_props.rs
+++ b/datafusion/expr/src/execution_props.rs
@@ -16,9 +16,10 @@
 // under the License.
 
 use crate::var_provider::{VarProvider, VarType};
-use chrono::{DateTime, TimeZone, Utc};
-use datafusion_common::alias::AliasGenerator;
+use chrono::{DateTime, Utc};
 use datafusion_common::HashMap;
+use datafusion_common::alias::AliasGenerator;
+use datafusion_common::config::ConfigOptions;
 use std::sync::Arc;
 
 /// Holds per-query execution properties and data (such as statement
@@ -32,9 +33,13 @@ use std::sync::Arc;
 /// done so during predicate pruning and expression simplification
 #[derive(Clone, Debug)]
 pub struct ExecutionProps {
-    pub query_execution_start_time: DateTime<Utc>,
+    /// The time at which the query execution started. If `None`,
+    /// functions like `now()` will not be simplified during optimization.
+    pub query_execution_start_time: Option<DateTime<Utc>>,
     /// Alias generator used by subquery optimizer rules
     pub alias_generator: Arc<AliasGenerator>,
+    /// Snapshot of config options when the query started
+    pub config_options: Option<Arc<ConfigOptions>>,
     /// Providers for scalar variables
     pub var_providers: Option<HashMap<VarType, Arc<dyn VarProvider + Send + Sync>>>,
 }
@@ -49,10 +54,9 @@ impl ExecutionProps {
     /// Creates a new execution props
     pub fn new() -> Self {
         ExecutionProps {
-            // Set this to a fixed sentinel to make it obvious if this is
-            // not being updated / propagated correctly
-            query_execution_start_time: Utc.timestamp_nanos(0),
+            query_execution_start_time: None,
             alias_generator: Arc::new(AliasGenerator::new()),
+            config_options: None,
             var_providers: None,
         }
     }
@@ -62,15 +66,22 @@ impl ExecutionProps {
         mut self,
         query_execution_start_time: DateTime<Utc>,
     ) -> Self {
-        self.query_execution_start_time = query_execution_start_time;
+        self.query_execution_start_time = Some(query_execution_start_time);
         self
     }
 
+    #[deprecated(since = "50.0.0", note = "Use mark_start_execution instead")]
+    pub fn start_execution(&mut self) -> &Self {
+        let default_config = Arc::new(ConfigOptions::default());
+        self.mark_start_execution(default_config)
+    }
+
     /// Marks the execution of query started timestamp.
     /// This also instantiates a new alias generator.
-    pub fn start_execution(&mut self) -> &Self {
-        self.query_execution_start_time = Utc::now();
+    pub fn mark_start_execution(&mut self, config_options: Arc<ConfigOptions>) -> &Self {
+        self.query_execution_start_time = Some(Utc::now());
         self.alias_generator = Arc::new(AliasGenerator::new());
+        self.config_options = Some(config_options);
         &*self
     }
 
@@ -91,6 +102,7 @@ impl ExecutionProps {
     }
 
     /// Returns the provider for the `var_type`, if any
+    #[expect(clippy::needless_pass_by_value)]
     pub fn get_var_provider(
         &self,
         var_type: VarType,
@@ -99,6 +111,12 @@ impl ExecutionProps {
             .as_ref()
             .and_then(|var_providers| var_providers.get(&var_type).cloned())
     }
+
+    /// Returns the configuration properties for this execution
+    /// if the execution has started
+    pub fn config_options(&self) -> Option<&Arc<ConfigOptions>> {
+        self.config_options.as_ref()
+    }
 }
 
 #[cfg(test)]
@@ -107,6 +125,9 @@ mod test {
     #[test]
     fn debug() {
         let props = ExecutionProps::new();
-        assert_eq!("ExecutionProps { query_execution_start_time: 1970-01-01T00:00:00Z, alias_generator: AliasGenerator { next_id: 1 }, var_providers: None }", format!("{props:?}"));
+        assert_eq!(
+            "ExecutionProps { query_execution_start_time: None, alias_generator: AliasGenerator { next_id: 1 }, config_options: None, var_providers: None }",
+            format!("{props:?}")
+        );
     }
 }
diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index fe5ea2ecd5b8b..12c879a515716 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -17,6 +17,7 @@
 
 //! Logical Expressions: [`Expr`]
 
+use std::cmp::Ordering;
 use std::collections::HashSet;
 use std::fmt::{self, Display, Formatter, Write};
 use std::hash::{Hash, Hasher};
@@ -24,24 +25,60 @@ use std::mem;
 use std::sync::Arc;
 
 use crate::expr_fn::binary_expr;
+use crate::function::WindowFunctionSimplification;
 use crate::logical_plan::Subquery;
-use crate::Volatility;
-use crate::{udaf, ExprSchemable, Operator, Signature, WindowFrame, WindowUDF};
+use crate::{AggregateUDF, Volatility};
+use crate::{ExprSchemable, Operator, Signature, WindowFrame, WindowUDF};
 
-use arrow::datatypes::{DataType, FieldRef};
+use arrow::datatypes::{DataType, Field, FieldRef};
 use datafusion_common::cse::{HashNode, NormalizeEq, Normalizeable};
+use datafusion_common::datatype::DataTypeExt;
+use datafusion_common::metadata::format_type_and_metadata;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeContainer, TreeNodeRecursion,
 };
 use datafusion_common::{
     Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference,
 };
+use datafusion_expr_common::placement::ExpressionPlacement;
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
+#[cfg(feature = "sql")]
 use sqlparser::ast::{
-    display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem,
-    NullTreatment, RenameSelectItem, ReplaceSelectElement,
+    ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, RenameSelectItem,
+    ReplaceSelectElement,
 };
 
+// Moved in 51.0.0 to datafusion_common
+pub use datafusion_common::metadata::FieldMetadata;
+use datafusion_common::metadata::ScalarAndMetadata;
+
+// This mirrors sqlparser::ast::NullTreatment but we need our own variant
+// for when the sql feature is disabled.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)]
+pub enum NullTreatment {
+    IgnoreNulls,
+    RespectNulls,
+}
+
+impl Display for NullTreatment {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.write_str(match self {
+            NullTreatment::IgnoreNulls => "IGNORE NULLS",
+            NullTreatment::RespectNulls => "RESPECT NULLS",
+        })
+    }
+}
+
+#[cfg(feature = "sql")]
+impl From<sqlparser::ast::NullTreatment> for NullTreatment {
+    fn from(value: sqlparser::ast::NullTreatment) -> Self {
+        match value {
+            sqlparser::ast::NullTreatment::IgnoreNulls => Self::IgnoreNulls,
+            sqlparser::ast::NullTreatment::RespectNulls => Self::RespectNulls,
+        }
+    }
+}
+
 /// Represents logical expressions such as `A + 1`, or `CAST(c1 AS int)`.
 ///
 /// For example the expression `A + 1` will be represented as
@@ -50,7 +87,7 @@ use sqlparser::ast::{
 ///  BinaryExpr {
 ///    left: Expr::Column("A"),
 ///    op: Operator::Plus,
-///    right: Expr::Literal(ScalarValue::Int32(Some(1)))
+///    right: Expr::Literal(ScalarValue::Int32(Some(1)), None)
 /// }
 /// ```
 ///
@@ -66,7 +103,7 @@ use sqlparser::ast::{
 ///
 /// # Printing Expressions
 ///
-/// You can print `Expr`s using the the `Debug` trait, `Display` trait, or
+/// You can print `Expr`s using the `Debug` trait, `Display` trait, or
 /// [`Self::human_display`]. See the [examples](#examples-displaying-exprs) below.
 ///
 /// If you need  SQL to pass to other systems, consider using [`Unparser`].
@@ -112,10 +149,10 @@ use sqlparser::ast::{
 /// # use datafusion_expr::{lit, col, Expr};
 /// // All literals are strongly typed in DataFusion. To make an `i64` 42:
 /// let expr = lit(42i64);
-/// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42))));
-/// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42))));
+/// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42)), None));
+/// assert_eq!(expr, Expr::Literal(ScalarValue::Int64(Some(42)), None));
 /// // To make a (typed) NULL:
-/// let expr = Expr::Literal(ScalarValue::Int64(None));
+/// let expr = Expr::Literal(ScalarValue::Int64(None), None);
 /// // to make an (untyped) NULL (the optimizer will coerce this to the correct type):
 /// let expr = lit(ScalarValue::Null);
 /// ```
@@ -130,11 +167,11 @@ use sqlparser::ast::{
 /// # use datafusion_expr::{lit, col, Operator, Expr};
 /// // Use the `+` operator to add two columns together
 /// let expr = col("c1") + col("c2");
-/// assert!(matches!(expr, Expr::BinaryExpr { ..} ));
+/// assert!(matches!(expr, Expr::BinaryExpr { .. }));
 /// if let Expr::BinaryExpr(binary_expr) = expr {
-///   assert_eq!(*binary_expr.left, col("c1"));
-///   assert_eq!(*binary_expr.right, col("c2"));
-///   assert_eq!(binary_expr.op, Operator::Plus);
+///     assert_eq!(*binary_expr.left, col("c1"));
+///     assert_eq!(*binary_expr.right, col("c2"));
+///     assert_eq!(binary_expr.op, Operator::Plus);
 /// }
 /// ```
 ///
@@ -145,12 +182,12 @@ use sqlparser::ast::{
 /// # use datafusion_common::ScalarValue;
 /// # use datafusion_expr::{lit, col, Operator, Expr};
 /// let expr = col("c1").eq(lit(42_i32));
-/// assert!(matches!(expr, Expr::BinaryExpr { .. } ));
+/// assert!(matches!(expr, Expr::BinaryExpr { .. }));
 /// if let Expr::BinaryExpr(binary_expr) = expr {
-///   assert_eq!(*binary_expr.left, col("c1"));
-///   let scalar = ScalarValue::Int32(Some(42));
-///   assert_eq!(*binary_expr.right, Expr::Literal(scalar));
-///   assert_eq!(binary_expr.op, Operator::Eq);
+///     assert_eq!(*binary_expr.left, col("c1"));
+///     let scalar = ScalarValue::Int32(Some(42));
+///     assert_eq!(*binary_expr.right, Expr::Literal(scalar, None));
+///     assert_eq!(binary_expr.op, Operator::Eq);
 /// }
 /// ```
 ///
@@ -163,22 +200,22 @@ use sqlparser::ast::{
 /// # use datafusion_expr::Expr;
 /// // Create a schema c1(int, c2 float)
 /// let arrow_schema = Schema::new(vec![
-///    Field::new("c1", DataType::Int32, false),
-///    Field::new("c2", DataType::Float64, false),
+///     Field::new("c1", DataType::Int32, false),
+///     Field::new("c2", DataType::Float64, false),
 /// ]);
 /// // DFSchema is a an Arrow schema with optional relation name
-/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema)
-///   .unwrap();
+/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap();
 ///
 /// // Form Vec<Expr> with an expression for each column in the schema
-/// let exprs: Vec<_> = df_schema.iter()
-///   .map(Expr::from)
-///   .collect();
+/// let exprs: Vec<_> = df_schema.iter().map(Expr::from).collect();
 ///
-/// assert_eq!(exprs, vec![
-///   Expr::from(Column::from_qualified_name("t1.c1")),
-///   Expr::from(Column::from_qualified_name("t1.c2")),
-/// ]);
+/// assert_eq!(
+///     exprs,
+///     vec![
+///         Expr::from(Column::from_qualified_name("t1.c1")),
+///         Expr::from(Column::from_qualified_name("t1.c2")),
+///     ]
+/// );
 /// ```
 ///
 /// # Examples: Displaying `Exprs`
@@ -193,7 +230,7 @@ use sqlparser::ast::{
 /// ```
 /// # use datafusion_expr::{lit, col};
 /// let expr = col("c1") + lit(42);
-/// assert_eq!(format!("{expr:?}"), "BinaryExpr(BinaryExpr { left: Column(Column { relation: None, name: \"c1\" }), op: Plus, right: Literal(Int32(42)) })");
+/// assert_eq!(format!("{expr:?}"), "BinaryExpr(BinaryExpr { left: Column(Column { relation: None, name: \"c1\" }), op: Plus, right: Literal(Int32(42), None) })");
 /// ```
 ///
 /// ## Use the `Display` trait  (detailed expression)
@@ -239,12 +276,13 @@ use sqlparser::ast::{
 /// let mut scalars = HashSet::new();
 /// // apply recursively visits all nodes in the expression tree
 /// expr.apply(|e| {
-///    if let Expr::Literal(scalar) = e {
-///       scalars.insert(scalar);
-///    }
-///    // The return value controls whether to continue visiting the tree
-///    Ok(TreeNodeRecursion::Continue)
-/// }).unwrap();
+///     if let Expr::Literal(scalar, _) = e {
+///         scalars.insert(scalar);
+///     }
+///     // The return value controls whether to continue visiting the tree
+///     Ok(TreeNodeRecursion::Continue)
+/// })
+/// .unwrap();
 /// // All subtrees have been visited and literals found
 /// assert_eq!(scalars.len(), 2);
 /// assert!(scalars.contains(&ScalarValue::Int32(Some(5))));
@@ -274,16 +312,17 @@ use sqlparser::ast::{
 /// assert!(rewritten.transformed);
 /// // to 42 = 5 AND b = 6
 /// assert_eq!(rewritten.data, lit(42).eq(lit(5)).and(col("b").eq(lit(6))));
-#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+/// ```
+#[derive(Clone, PartialEq, PartialOrd, Eq, Debug, Hash)]
 pub enum Expr {
     /// An expression with a specific name.
     Alias(Alias),
     /// A named reference to a qualified field in a schema.
     Column(Column),
     /// A named reference to a variable in a registry.
-    ScalarVariable(DataType, Vec<String>),
-    /// A constant value.
-    Literal(ScalarValue),
+    ScalarVariable(FieldRef, Vec<String>),
+    /// A constant value along with associated [`FieldMetadata`].
+    Literal(ScalarValue, Option<FieldMetadata>),
     /// A binary expression such as "age > 21"
     BinaryExpr(BinaryExpr),
     /// LIKE expression
@@ -330,13 +369,15 @@ pub enum Expr {
     /// [`ExprFunctionExt`]: crate::expr_fn::ExprFunctionExt
     AggregateFunction(AggregateFunction),
     /// Call a window function with a set of arguments.
-    WindowFunction(WindowFunction),
+    WindowFunction(Box<WindowFunction>),
     /// Returns whether the list contains the expr value.
     InList(InList),
     /// EXISTS subquery
     Exists(Exists),
     /// IN subquery
     InSubquery(InSubquery),
+    /// Set comparison subquery (e.g. `= ANY`, `> ALL`)
+    SetComparison(SetComparison),
     /// Scalar subquery
     ScalarSubquery(Subquery),
     /// Represents a reference to all available fields in a specific schema,
@@ -360,14 +401,20 @@ pub enum Expr {
     Placeholder(Placeholder),
     /// A placeholder which holds a reference to a qualified field
     /// in the outer query, used for correlated sub queries.
-    OuterReferenceColumn(DataType, Column),
+    OuterReferenceColumn(FieldRef, Column),
     /// Unnest expression
     Unnest(Unnest),
 }
 
 impl Default for Expr {
     fn default() -> Self {
-        Expr::Literal(ScalarValue::Null)
+        Expr::Literal(ScalarValue::Null, None)
+    }
+}
+
+impl AsRef<Expr> for Expr {
+    fn as_ref(&self) -> &Expr {
+        self
     }
 }
 
@@ -378,6 +425,21 @@ impl From<Column> for Expr {
     }
 }
 
+/// Create an [`Expr`] from a [`WindowFunction`]
+impl From<WindowFunction> for Expr {
+    fn from(value: WindowFunction) -> Self {
+        Expr::WindowFunction(Box::new(value))
+    }
+}
+
+/// Create an [`Expr`] from an [`ScalarAndMetadata`]
+impl From<ScalarAndMetadata> for Expr {
+    fn from(value: ScalarAndMetadata) -> Self {
+        let (value, metadata) = value.into_inner();
+        Expr::Literal(value, metadata)
+    }
+}
+
 /// Create an [`Expr`] from an optional qualifier and a [`FieldRef`]. This is
 /// useful for creating [`Expr`] from a [`DFSchema`].
 ///
@@ -404,6 +466,65 @@ impl<'a> TreeNodeContainer<'a, Self> for Expr {
     }
 }
 
+/// The metadata used in [`Field::metadata`].
+///
+/// This represents the metadata associated with an Arrow [`Field`]. The metadata consists of key-value pairs.
+///
+/// # Common Use Cases
+///
+/// Field metadata is commonly used to store:
+/// - Default values for columns when data is missing
+/// - Column descriptions or documentation
+/// - Data lineage information
+/// - Custom application-specific annotations
+/// - Encoding hints or display formatting preferences
+///
+/// # Example: Storing Default Values
+///
+/// A practical example of using field metadata is storing default values for columns
+/// that may be missing in the physical data but present in the logical schema.
+/// See the [default_column_values.rs] example implementation.
+///
+/// [default_column_values.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/default_column_values.rs
+pub type SchemaFieldMetadata = std::collections::HashMap<String, String>;
+
+/// Intersects multiple metadata instances for UNION operations.
+///
+/// This function implements the intersection strategy used by UNION operations,
+/// where only metadata keys that exist in ALL inputs with identical values
+/// are preserved in the result.
+///
+/// # Union Metadata Behavior
+///
+/// Union operations require consistent metadata across all branches:
+/// - Only metadata keys present in ALL union branches are kept
+/// - For each kept key, the value must be identical across all branches
+/// - If a key has different values across branches, it is excluded from the result
+/// - If any input has no metadata, the result will be empty
+///
+/// # Arguments
+///
+/// * `metadatas` - An iterator of `SchemaFieldMetadata` instances to intersect
+///
+/// # Returns
+///
+/// A new `SchemaFieldMetadata` containing only the intersected metadata
+pub fn intersect_metadata_for_union<'a>(
+    metadatas: impl IntoIterator<Item = &'a SchemaFieldMetadata>,
+) -> SchemaFieldMetadata {
+    let mut metadatas = metadatas.into_iter();
+    let Some(mut intersected) = metadatas.next().cloned() else {
+        return Default::default();
+    };
+
+    for metadata in metadatas {
+        // Only keep keys that exist in both with the same value
+        intersected.retain(|k, v| metadata.get(k) == Some(v));
+    }
+
+    intersected
+}
+
 /// UNNEST expression.
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
 pub struct Unnest {
@@ -430,7 +551,7 @@ pub struct Alias {
     pub expr: Box<Expr>,
     pub relation: Option<TableReference>,
     pub name: String,
-    pub metadata: Option<std::collections::HashMap<String, String>>,
+    pub metadata: Option<FieldMetadata>,
 }
 
 impl Hash for Alias {
@@ -442,16 +563,19 @@ impl Hash for Alias {
 }
 
 impl PartialOrd for Alias {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         let cmp = self.expr.partial_cmp(&other.expr);
-        let Some(std::cmp::Ordering::Equal) = cmp else {
+        let Some(Ordering::Equal) = cmp else {
             return cmp;
         };
         let cmp = self.relation.partial_cmp(&other.relation);
-        let Some(std::cmp::Ordering::Equal) = cmp else {
+        let Some(Ordering::Equal) = cmp else {
             return cmp;
         };
-        self.name.partial_cmp(&other.name)
+        self.name
+            .partial_cmp(&other.name)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -470,16 +594,13 @@ impl Alias {
         }
     }
 
-    pub fn with_metadata(
-        mut self,
-        metadata: Option<std::collections::HashMap<String, String>>,
-    ) -> Self {
+    pub fn with_metadata(mut self, metadata: Option<FieldMetadata>) -> Self {
         self.metadata = metadata;
         self
     }
 }
 
-/// Binary expression
+/// Binary expression for [`Expr::BinaryExpr`]
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
 pub struct BinaryExpr {
     /// Left-hand side of the expression
@@ -681,13 +802,20 @@ pub struct Cast {
     /// The expression being cast
     pub expr: Box<Expr>,
     /// The `DataType` the expression will yield
-    pub data_type: DataType,
+    pub field: FieldRef,
 }
 
 impl Cast {
     /// Create a new Cast expression
     pub fn new(expr: Box<Expr>, data_type: DataType) -> Self {
-        Self { expr, data_type }
+        Self {
+            expr,
+            field: data_type.into_nullable_field_ref(),
+        }
+    }
+
+    pub fn new_from_field(expr: Box<Expr>, field: FieldRef) -> Self {
+        Self { expr, field }
     }
 }
 
@@ -697,13 +825,20 @@ pub struct TryCast {
     /// The expression being cast
     pub expr: Box<Expr>,
     /// The `DataType` the expression will yield
-    pub data_type: DataType,
+    pub field: FieldRef,
 }
 
 impl TryCast {
     /// Create a new TryCast expression
     pub fn new(expr: Box<Expr>, data_type: DataType) -> Self {
-        Self { expr, data_type }
+        Self {
+            expr,
+            field: data_type.into_nullable_field_ref(),
+        }
+    }
+
+    pub fn new_from_field(expr: Box<Expr>, field: FieldRef) -> Self {
+        Self { expr, field }
     }
 }
 
@@ -790,7 +925,7 @@ impl<'a> TreeNodeContainer<'a, Expr> for Sort {
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
 pub struct AggregateFunction {
     /// Name of the function
-    pub func: Arc<crate::AggregateUDF>,
+    pub func: Arc<AggregateUDF>,
     pub params: AggregateFunctionParams,
 }
 
@@ -802,18 +937,18 @@ pub struct AggregateFunctionParams {
     /// Optional filter
     pub filter: Option<Box<Expr>>,
     /// Optional ordering
-    pub order_by: Option<Vec<Sort>>,
+    pub order_by: Vec<Sort>,
     pub null_treatment: Option<NullTreatment>,
 }
 
 impl AggregateFunction {
     /// Create a new AggregateFunction expression with a user-defined function (UDF)
     pub fn new_udf(
-        func: Arc<crate::AggregateUDF>,
+        func: Arc<AggregateUDF>,
         args: Vec<Expr>,
         distinct: bool,
         filter: Option<Box<Expr>>,
-        order_by: Option<Vec<Sort>>,
+        order_by: Vec<Sort>,
         null_treatment: Option<NullTreatment>,
     ) -> Self {
         Self {
@@ -837,8 +972,8 @@ impl AggregateFunction {
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
 pub enum WindowFunctionDefinition {
     /// A user defined aggregate function
-    AggregateUDF(Arc<crate::AggregateUDF>),
-    /// A user defined aggregate function
+    AggregateUDF(Arc<AggregateUDF>),
+    /// A user defined window function
     WindowUDF(Arc<WindowUDF>),
 }
 
@@ -847,7 +982,6 @@ impl WindowFunctionDefinition {
     pub fn return_field(
         &self,
         input_expr_fields: &[FieldRef],
-        _input_expr_nullable: &[bool],
         display_name: &str,
     ) -> Result<FieldRef> {
         match self {
@@ -875,6 +1009,16 @@ impl WindowFunctionDefinition {
             WindowFunctionDefinition::AggregateUDF(fun) => fun.name(),
         }
     }
+
+    /// Returns this window function's simplification hook, if any.
+    ///
+    /// See [`WindowFunctionSimplification`] for more information
+    pub fn simplify(&self) -> Option<WindowFunctionSimplification> {
+        match self {
+            WindowFunctionDefinition::AggregateUDF(_) => None,
+            WindowFunctionDefinition::WindowUDF(udwf) => udwf.simplify(),
+        }
+    }
 }
 
 impl Display for WindowFunctionDefinition {
@@ -886,8 +1030,8 @@ impl Display for WindowFunctionDefinition {
     }
 }
 
-impl From<Arc<crate::AggregateUDF>> for WindowFunctionDefinition {
-    fn from(value: Arc<crate::AggregateUDF>) -> Self {
+impl From<Arc<AggregateUDF>> for WindowFunctionDefinition {
+    fn from(value: Arc<AggregateUDF>) -> Self {
         Self::AggregateUDF(value)
     }
 }
@@ -927,8 +1071,12 @@ pub struct WindowFunctionParams {
     pub order_by: Vec<Sort>,
     /// Window frame
     pub window_frame: WindowFrame,
+    /// Optional filter expression (FILTER (WHERE ...))
+    pub filter: Option<Box<Expr>>,
     /// Specifies how NULL value is treated: ignore or respect
     pub null_treatment: Option<NullTreatment>,
+    /// Distinct flag
+    pub distinct: bool,
 }
 
 impl WindowFunction {
@@ -942,10 +1090,19 @@ impl WindowFunction {
                 partition_by: Vec::default(),
                 order_by: Vec::default(),
                 window_frame: WindowFrame::new(None),
+                filter: None,
                 null_treatment: None,
+                distinct: false,
             },
         }
     }
+
+    /// Returns this window function's simplification hook, if any.
+    ///
+    /// See [`WindowFunctionSimplification`] for more information
+    pub fn simplify(&self) -> Option<WindowFunctionSimplification> {
+        self.fun.simplify()
+    }
 }
 
 /// EXISTS expression
@@ -964,34 +1121,50 @@ impl Exists {
     }
 }
 
-/// User Defined Aggregate Function
-///
-/// See [`udaf::AggregateUDF`] for more information.
-#[derive(Clone, PartialEq, Eq, Hash, Debug)]
-pub struct AggregateUDF {
-    /// The function
-    pub fun: Arc<udaf::AggregateUDF>,
-    /// List of expressions to feed to the functions as arguments
-    pub args: Vec<Expr>,
-    /// Optional filter
-    pub filter: Option<Box<Expr>>,
-    /// Optional ORDER BY applied prior to aggregating
-    pub order_by: Option<Vec<Expr>>,
+/// Whether the set comparison uses `ANY`/`SOME` or `ALL`
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Hash, Debug)]
+pub enum SetQuantifier {
+    /// `ANY` (or `SOME`)
+    Any,
+    /// `ALL`
+    All,
+}
+
+impl Display for SetQuantifier {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        match self {
+            SetQuantifier::Any => write!(f, "ANY"),
+            SetQuantifier::All => write!(f, "ALL"),
+        }
+    }
 }
 
-impl AggregateUDF {
-    /// Create a new AggregateUDF expression
+/// Set comparison subquery (e.g. `= ANY`, `> ALL`)
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+pub struct SetComparison {
+    /// The expression to compare
+    pub expr: Box<Expr>,
+    /// Subquery that will produce a single column of data to compare against
+    pub subquery: Subquery,
+    /// Comparison operator (e.g. `=`, `>`, `<`)
+    pub op: Operator,
+    /// Quantifier (`ANY`/`ALL`)
+    pub quantifier: SetQuantifier,
+}
+
+impl SetComparison {
+    /// Create a new set comparison expression
     pub fn new(
-        fun: Arc<udaf::AggregateUDF>,
-        args: Vec<Expr>,
-        filter: Option<Box<Expr>>,
-        order_by: Option<Vec<Expr>>,
+        expr: Box<Expr>,
+        subquery: Subquery,
+        op: Operator,
+        quantifier: SetQuantifier,
     ) -> Self {
         Self {
-            fun,
-            args,
-            filter,
-            order_by,
+            expr,
+            subquery,
+            op,
+            quantifier,
         }
     }
 }
@@ -1049,13 +1222,22 @@ pub struct Placeholder {
     /// The identifier of the parameter, including the leading `$` (e.g, `"$1"` or `"$foo"`)
     pub id: String,
     /// The type the parameter will be filled in with
-    pub data_type: Option<DataType>,
+    pub field: Option<FieldRef>,
 }
 
 impl Placeholder {
     /// Create a new Placeholder expression
+    #[deprecated(since = "51.0.0", note = "Use new_with_field instead")]
     pub fn new(id: String, data_type: Option<DataType>) -> Self {
-        Self { id, data_type }
+        Self {
+            id,
+            field: data_type.map(|dt| Arc::new(Field::new("", dt, true))),
+        }
+    }
+
+    /// Create a new Placeholder expression from a Field
+    pub fn new_with_field(id: String, field: Option<FieldRef>) -> Self {
+        Self { id, field }
     }
 }
 
@@ -1097,6 +1279,129 @@ impl GroupingSet {
     }
 }
 
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub struct IlikeSelectItem {
+    pub pattern: String,
+}
+#[cfg(not(feature = "sql"))]
+impl Display for IlikeSelectItem {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "ILIKE '{}'", &self.pattern)?;
+        Ok(())
+    }
+}
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub enum ExcludeSelectItem {
+    Single(Ident),
+    Multiple(Vec<Ident>),
+}
+#[cfg(not(feature = "sql"))]
+impl Display for ExcludeSelectItem {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "EXCLUDE")?;
+        match self {
+            Self::Single(column) => {
+                write!(f, " {column}")?;
+            }
+            Self::Multiple(columns) => {
+                write!(f, " ({})", display_comma_separated(columns))?;
+            }
+        }
+        Ok(())
+    }
+}
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub struct ExceptSelectItem {
+    pub first_element: Ident,
+    pub additional_elements: Vec<Ident>,
+}
+#[cfg(not(feature = "sql"))]
+impl Display for ExceptSelectItem {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "EXCEPT ")?;
+        if self.additional_elements.is_empty() {
+            write!(f, "({})", self.first_element)?;
+        } else {
+            write!(
+                f,
+                "({}, {})",
+                self.first_element,
+                display_comma_separated(&self.additional_elements)
+            )?;
+        }
+        Ok(())
+    }
+}
+
+pub fn display_comma_separated<T>(slice: &[T]) -> String
+where
+    T: Display,
+{
+    use itertools::Itertools;
+    slice.iter().map(|v| format!("{v}")).join(", ")
+}
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub enum RenameSelectItem {
+    Single(String),
+    Multiple(Vec<String>),
+}
+#[cfg(not(feature = "sql"))]
+impl Display for RenameSelectItem {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "RENAME")?;
+        match self {
+            Self::Single(column) => {
+                write!(f, " {column}")?;
+            }
+            Self::Multiple(columns) => {
+                write!(f, " ({})", display_comma_separated(columns))?;
+            }
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub struct Ident {
+    /// The value of the identifier without quotes.
+    pub value: String,
+    /// The starting quote if any. Valid quote characters are the single quote,
+    /// double quote, backtick, and opening square bracket.
+    pub quote_style: Option<char>,
+    /// The span of the identifier in the original SQL string.
+    pub span: String,
+}
+#[cfg(not(feature = "sql"))]
+impl Display for Ident {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "[{}]", self.value)
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
+#[cfg(not(feature = "sql"))]
+pub struct ReplaceSelectElement {
+    pub expr: String,
+    pub column_name: Ident,
+    pub as_keyword: bool,
+}
+#[cfg(not(feature = "sql"))]
+impl Display for ReplaceSelectElement {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        if self.as_keyword {
+            write!(f, "{} AS {}", self.expr, self.column_name)
+        } else {
+            write!(f, "{} {}", self.expr, self.column_name)
+        }
+    }
+}
+
 /// Additional options for wildcards, e.g. Snowflake `EXCLUDE`/`RENAME` and Bigquery `EXCEPT`.
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug, Default)]
 pub struct WildcardOptions {
@@ -1249,10 +1554,22 @@ impl Expr {
         }
     }
 
-    /// Returns a full and complete string representation of this expression.
-    #[deprecated(since = "42.0.0", note = "use format! instead")]
-    pub fn canonical_name(&self) -> String {
-        format!("{self}")
+    /// Returns placement information for this expression.
+    ///
+    /// This is used by optimizers to make decisions about expression placement,
+    /// such as whether to push expressions down through projections.
+    pub fn placement(&self) -> ExpressionPlacement {
+        match self {
+            Expr::Column(_) => ExpressionPlacement::Column,
+            Expr::Literal(_, _) => ExpressionPlacement::Literal,
+            Expr::Alias(inner) => inner.expr.placement(),
+            Expr::ScalarFunction(func) => {
+                let arg_placements: Vec<_> =
+                    func.args.iter().map(|arg| arg.placement()).collect();
+                func.func.placement(&arg_placements)
+            }
+            _ => ExpressionPlacement::KeepInPlace,
+        }
     }
 
     /// Return String representation of the variant represented by `self`
@@ -1271,6 +1588,7 @@ impl Expr {
             Expr::GroupingSet(..) => "GroupingSet",
             Expr::InList { .. } => "InList",
             Expr::InSubquery(..) => "InSubquery",
+            Expr::SetComparison(..) => "SetComparison",
             Expr::IsNotNull(..) => "IsNotNull",
             Expr::IsNull(..) => "IsNull",
             Expr::Like { .. } => "Like",
@@ -1403,15 +1721,16 @@ impl Expr {
     /// # Example
     /// ```
     /// # use datafusion_expr::col;
-    /// use std::collections::HashMap;
+    /// # use std::collections::HashMap;
+    /// # use datafusion_common::metadata::FieldMetadata;
     /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]);
+    /// let metadata = FieldMetadata::from(metadata);
     /// let expr = col("foo").alias_with_metadata("bar", Some(metadata));
     /// ```
-    ///
     pub fn alias_with_metadata(
         self,
         name: impl Into<String>,
-        metadata: Option<std::collections::HashMap<String, String>>,
+        metadata: Option<FieldMetadata>,
     ) -> Expr {
         Expr::Alias(Alias::new(self, None::<&str>, name.into()).with_metadata(metadata))
     }
@@ -1433,16 +1752,18 @@ impl Expr {
     /// # Example
     /// ```
     /// # use datafusion_expr::col;
-    /// use std::collections::HashMap;
+    /// # use std::collections::HashMap;
+    /// # use datafusion_common::metadata::FieldMetadata;
     /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]);
-    /// let expr = col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata));
+    /// let metadata = FieldMetadata::from(metadata);
+    /// let expr =
+    ///     col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata));
     /// ```
-    ///
     pub fn alias_qualified_with_metadata(
         self,
         relation: Option<impl Into<TableReference>>,
         name: impl Into<String>,
-        metadata: Option<std::collections::HashMap<String, String>>,
+        metadata: Option<FieldMetadata>,
     ) -> Expr {
         Expr::Alias(Alias::new(self, relation, name.into()).with_metadata(metadata))
     }
@@ -1512,8 +1833,16 @@ impl Expr {
             |expr| {
                 // f_up: unalias on up so we can remove nested aliases like
                 // `(x as foo) as bar`
-                if let Expr::Alias(Alias { expr, .. }) = expr {
-                    Ok(Transformed::yes(*expr))
+                if let Expr::Alias(alias) = expr {
+                    match alias
+                        .metadata
+                        .as_ref()
+                        .map(|h| h.is_empty())
+                        .unwrap_or(true)
+                    {
+                        true => Ok(Transformed::yes(*alias.expr)),
+                        false => Ok(Transformed::no(Expr::Alias(alias))),
+                    }
                 } else {
                     Ok(Transformed::no(expr))
                 }
@@ -1815,6 +2144,7 @@ impl Expr {
             | Expr::GroupingSet(..)
             | Expr::InList(..)
             | Expr::InSubquery(..)
+            | Expr::SetComparison(..)
             | Expr::IsFalse(..)
             | Expr::IsNotFalse(..)
             | Expr::IsNotNull(..)
@@ -1848,11 +2178,20 @@ impl Expr {
             _ => None,
         }
     }
+
+    /// Check if the Expr is literal and get the literal value if it is.
+    pub fn as_literal(&self) -> Option<&ScalarValue> {
+        if let Expr::Literal(lit, _) = self {
+            Some(lit)
+        } else {
+            None
+        }
+    }
 }
 
 impl Normalizeable for Expr {
     fn can_normalize(&self) -> bool {
-        #[allow(clippy::match_like_matches_macro)]
+        #[expect(clippy::match_like_matches_macro)]
         match self {
             Expr::BinaryExpr(BinaryExpr {
                 op:
@@ -2000,23 +2339,23 @@ impl NormalizeEq for Expr {
             (
                 Expr::Cast(Cast {
                     expr: self_expr,
-                    data_type: self_data_type,
+                    field: self_field,
                 }),
                 Expr::Cast(Cast {
                     expr: other_expr,
-                    data_type: other_data_type,
+                    field: other_field,
                 }),
             )
             | (
                 Expr::TryCast(TryCast {
                     expr: self_expr,
-                    data_type: self_data_type,
+                    field: self_field,
                 }),
                 Expr::TryCast(TryCast {
                     expr: other_expr,
-                    data_type: other_data_type,
+                    field: other_field,
                 }),
-            ) => self_data_type == other_data_type && self_expr.normalize_eq(other_expr),
+            ) => self_field == other_field && self_expr.normalize_eq(other_expr),
             (
                 Expr::ScalarFunction(ScalarFunction {
                     func: self_func,
@@ -2073,48 +2412,51 @@ impl NormalizeEq for Expr {
                         (None, None) => true,
                         _ => false,
                     }
-                    && match (self_order_by, other_order_by) {
-                        (Some(self_order_by), Some(other_order_by)) => self_order_by
-                            .iter()
-                            .zip(other_order_by.iter())
-                            .all(|(a, b)| {
-                                a.asc == b.asc
-                                    && a.nulls_first == b.nulls_first
-                                    && a.expr.normalize_eq(&b.expr)
-                            }),
-                        (None, None) => true,
-                        _ => false,
-                    }
+                    && self_order_by
+                        .iter()
+                        .zip(other_order_by.iter())
+                        .all(|(a, b)| {
+                            a.asc == b.asc
+                                && a.nulls_first == b.nulls_first
+                                && a.expr.normalize_eq(&b.expr)
+                        })
+                    && self_order_by.len() == other_order_by.len()
             }
-            (
-                Expr::WindowFunction(WindowFunction {
+            (Expr::WindowFunction(left), Expr::WindowFunction(other)) => {
+                let WindowFunction {
                     fun: self_fun,
-                    params: self_params,
-                }),
-                Expr::WindowFunction(WindowFunction {
+                    params:
+                        WindowFunctionParams {
+                            args: self_args,
+                            window_frame: self_window_frame,
+                            partition_by: self_partition_by,
+                            order_by: self_order_by,
+                            filter: self_filter,
+                            null_treatment: self_null_treatment,
+                            distinct: self_distinct,
+                        },
+                } = left.as_ref();
+                let WindowFunction {
                     fun: other_fun,
-                    params: other_params,
-                }),
-            ) => {
-                let (
-                    WindowFunctionParams {
-                        args: self_args,
-                        window_frame: self_window_frame,
-                        partition_by: self_partition_by,
-                        order_by: self_order_by,
-                        null_treatment: self_null_treatment,
-                    },
-                    WindowFunctionParams {
-                        args: other_args,
-                        window_frame: other_window_frame,
-                        partition_by: other_partition_by,
-                        order_by: other_order_by,
-                        null_treatment: other_null_treatment,
-                    },
-                ) = (self_params, other_params);
+                    params:
+                        WindowFunctionParams {
+                            args: other_args,
+                            window_frame: other_window_frame,
+                            partition_by: other_partition_by,
+                            order_by: other_order_by,
+                            filter: other_filter,
+                            null_treatment: other_null_treatment,
+                            distinct: other_distinct,
+                        },
+                } = other.as_ref();
 
                 self_fun.name() == other_fun.name()
                     && self_window_frame == other_window_frame
+                    && match (self_filter, other_filter) {
+                        (Some(a), Some(b)) => a.normalize_eq(b),
+                        (None, None) => true,
+                        _ => false,
+                    }
                     && self_null_treatment == other_null_treatment
                     && self_args.len() == other_args.len()
                     && self_args
@@ -2133,6 +2475,7 @@ impl NormalizeEq for Expr {
                                 && a.nulls_first == b.nulls_first
                                 && a.expr.normalize_eq(&b.expr)
                         })
+                    && self_distinct == other_distinct
             }
             (
                 Expr::Exists(Exists {
@@ -2273,11 +2616,11 @@ impl HashNode for Expr {
             Expr::Column(column) => {
                 column.hash(state);
             }
-            Expr::ScalarVariable(data_type, name) => {
-                data_type.hash(state);
+            Expr::ScalarVariable(field, name) => {
+                field.hash(state);
                 name.hash(state);
             }
-            Expr::Literal(scalar_value) => {
+            Expr::Literal(scalar_value, _) => {
                 scalar_value.hash(state);
             }
             Expr::BinaryExpr(BinaryExpr {
@@ -2328,15 +2671,9 @@ impl HashNode for Expr {
                 when_then_expr: _when_then_expr,
                 else_expr: _else_expr,
             }) => {}
-            Expr::Cast(Cast {
-                expr: _expr,
-                data_type,
-            })
-            | Expr::TryCast(TryCast {
-                expr: _expr,
-                data_type,
-            }) => {
-                data_type.hash(state);
+            Expr::Cast(Cast { expr: _expr, field })
+            | Expr::TryCast(TryCast { expr: _expr, field }) => {
+                field.hash(state);
             }
             Expr::ScalarFunction(ScalarFunction { func, args: _args }) => {
                 func.hash(state);
@@ -2356,17 +2693,25 @@ impl HashNode for Expr {
                 distinct.hash(state);
                 null_treatment.hash(state);
             }
-            Expr::WindowFunction(WindowFunction { fun, params }) => {
-                let WindowFunctionParams {
-                    args: _args,
-                    partition_by: _,
-                    order_by: _,
-                    window_frame,
-                    null_treatment,
-                } = params;
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction {
+                    fun,
+                    params:
+                        WindowFunctionParams {
+                            args: _args,
+                            partition_by: _,
+                            order_by: _,
+                            window_frame,
+                            filter,
+                            null_treatment,
+                            distinct,
+                        },
+                } = window_fun.as_ref();
                 fun.hash(state);
                 window_frame.hash(state);
+                filter.hash(state);
                 null_treatment.hash(state);
+                distinct.hash(state);
             }
             Expr::InList(InList {
                 expr: _expr,
@@ -2387,6 +2732,16 @@ impl HashNode for Expr {
                 subquery.hash(state);
                 negated.hash(state);
             }
+            Expr::SetComparison(SetComparison {
+                expr: _,
+                subquery,
+                op,
+                quantifier,
+            }) => {
+                subquery.hash(state);
+                op.hash(state);
+                quantifier.hash(state);
+            }
             Expr::ScalarSubquery(subquery) => {
                 subquery.hash(state);
             }
@@ -2405,8 +2760,8 @@ impl HashNode for Expr {
             Expr::Placeholder(place_holder) => {
                 place_holder.hash(state);
             }
-            Expr::OuterReferenceColumn(data_type, column) => {
-                data_type.hash(state);
+            Expr::OuterReferenceColumn(field, column) => {
+                field.hash(state);
                 column.hash(state);
             }
             Expr::Unnest(Unnest { expr: _expr }) => {}
@@ -2414,23 +2769,26 @@ impl HashNode for Expr {
     }
 }
 
-// Modifies expr if it is a placeholder with datatype of right
+// Modifies expr to match the DataType, metadata, and nullability of other if it is
+// a placeholder with previously unspecified type information (i.e., most placeholders)
 fn rewrite_placeholder(expr: &mut Expr, other: &Expr, schema: &DFSchema) -> Result<()> {
-    if let Expr::Placeholder(Placeholder { id: _, data_type }) = expr {
-        if data_type.is_none() {
-            let other_dt = other.get_type(schema);
-            match other_dt {
-                Err(e) => {
-                    Err(e.context(format!(
-                        "Can not find type of {other} needed to infer type of {expr}"
-                    )))?;
-                }
-                Ok(dt) => {
-                    *data_type = Some(dt);
-                }
+    if let Expr::Placeholder(Placeholder { id: _, field }) = expr
+        && field.is_none()
+    {
+        let other_field = other.to_field(schema);
+        match other_field {
+            Err(e) => {
+                Err(e.context(format!(
+                    "Can not find type of {other} needed to infer type of {expr}"
+                )))?;
             }
-        };
-    }
+            Ok((_, other_field)) => {
+                // We can't infer the nullability of the future parameter that might
+                // be bound, so ensure this is set to true
+                *field = Some(other_field.as_ref().clone().with_nullable(true).into());
+            }
+        }
+    };
     Ok(())
 }
 
@@ -2453,7 +2811,7 @@ impl Display for SchemaDisplay<'_> {
             // TODO: remove the next line after `Expr::Wildcard` is removed
             #[expect(deprecated)]
             Expr::Column(_)
-            | Expr::Literal(_)
+            | Expr::Literal(_, _)
             | Expr::ScalarVariable(..)
             | Expr::OuterReferenceColumn(..)
             | Expr::Placeholder(_)
@@ -2574,6 +2932,12 @@ impl Display for SchemaDisplay<'_> {
                 write!(f, "NOT IN")
             }
             Expr::InSubquery(InSubquery { negated: false, .. }) => write!(f, "IN"),
+            Expr::SetComparison(SetComparison {
+                expr,
+                op,
+                quantifier,
+                ..
+            }) => write!(f, "{} {op} {quantifier}", SchemaDisplay(expr.as_ref())),
             Expr::IsTrue(expr) => write!(f, "{} IS TRUE", SchemaDisplay(expr)),
             Expr::IsFalse(expr) => write!(f, "{} IS FALSE", SchemaDisplay(expr)),
             Expr::IsNotTrue(expr) => {
@@ -2646,52 +3010,79 @@ impl Display for SchemaDisplay<'_> {
 
                 Ok(())
             }
-            Expr::WindowFunction(WindowFunction { fun, params }) => match fun {
-                WindowFunctionDefinition::AggregateUDF(fun) => {
-                    match fun.window_function_schema_name(params) {
-                        Ok(name) => {
-                            write!(f, "{name}")
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction { fun, params } = window_fun.as_ref();
+                match fun {
+                    WindowFunctionDefinition::AggregateUDF(fun) => {
+                        match fun.window_function_schema_name(params) {
+                            Ok(name) => {
+                                write!(f, "{name}")
+                            }
+                            Err(e) => {
+                                write!(
+                                    f,
+                                    "got error from window_function_schema_name {e}"
+                                )
+                            }
                         }
-                        Err(e) => {
-                            write!(f, "got error from window_function_schema_name {e}")
-                        }
-                    }
-                }
-                _ => {
-                    let WindowFunctionParams {
-                        args,
-                        partition_by,
-                        order_by,
-                        window_frame,
-                        null_treatment,
-                    } = params;
-
-                    write!(
-                        f,
-                        "{}({})",
-                        fun,
-                        schema_name_from_exprs_comma_separated_without_space(args)?
-                    )?;
-
-                    if let Some(null_treatment) = null_treatment {
-                        write!(f, " {null_treatment}")?;
                     }
+                    _ => {
+                        let WindowFunctionParams {
+                            args,
+                            partition_by,
+                            order_by,
+                            window_frame,
+                            filter,
+                            null_treatment,
+                            distinct,
+                        } = params;
+
+                        // Write function name and open parenthesis
+                        write!(f, "{fun}(")?;
+
+                        // If DISTINCT, emit the keyword
+                        if *distinct {
+                            write!(f, "DISTINCT ")?;
+                        }
 
-                    if !partition_by.is_empty() {
+                        // Write the comma‑separated argument list
                         write!(
                             f,
-                            " PARTITION BY [{}]",
-                            schema_name_from_exprs(partition_by)?
+                            "{}",
+                            schema_name_from_exprs_comma_separated_without_space(args)?
                         )?;
-                    }
 
-                    if !order_by.is_empty() {
-                        write!(f, " ORDER BY [{}]", schema_name_from_sorts(order_by)?)?;
-                    };
+                        // **Close the argument parenthesis**
+                        write!(f, ")")?;
+
+                        if let Some(null_treatment) = null_treatment {
+                            write!(f, " {null_treatment}")?;
+                        }
+
+                        if let Some(filter) = filter {
+                            write!(f, " FILTER (WHERE {filter})")?;
+                        }
+
+                        if !partition_by.is_empty() {
+                            write!(
+                                f,
+                                " PARTITION BY [{}]",
+                                schema_name_from_exprs(partition_by)?
+                            )?;
+                        }
+
+                        if !order_by.is_empty() {
+                            write!(
+                                f,
+                                " ORDER BY [{}]",
+                                schema_name_from_sorts(order_by)?
+                            )?;
+                        };
 
-                    write!(f, " {window_frame}")
+                        write!(f, " {window_frame}")
+                    }
                 }
-            },
+            }
         }
     }
 }
@@ -2702,7 +3093,7 @@ struct SqlDisplay<'a>(&'a Expr);
 impl Display for SqlDisplay<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         match self.0 {
-            Expr::Literal(scalar) => scalar.fmt(f),
+            Expr::Literal(scalar, _) => scalar.fmt(f),
             Expr::Alias(Alias { name, .. }) => write!(f, "{name}"),
             Expr::Between(Between {
                 expr,
@@ -2969,7 +3360,12 @@ impl Display for Expr {
                 write!(f, "{OUTER_REFERENCE_COLUMN_PREFIX}({c})")
             }
             Expr::ScalarVariable(_, var_names) => write!(f, "{}", var_names.join(".")),
-            Expr::Literal(v) => write!(f, "{v:?}"),
+            Expr::Literal(v, metadata) => {
+                match metadata.as_ref().map(|m| m.is_empty()).unwrap_or(true) {
+                    false => write!(f, "{v:?} {:?}", metadata.as_ref().unwrap()),
+                    true => write!(f, "{v:?}"),
+                }
+            }
             Expr::Case(case) => {
                 write!(f, "CASE ")?;
                 if let Some(e) = &case.expr {
@@ -2983,11 +3379,15 @@ impl Display for Expr {
                 }
                 write!(f, "END")
             }
-            Expr::Cast(Cast { expr, data_type }) => {
-                write!(f, "CAST({expr} AS {data_type:?})")
+            Expr::Cast(Cast { expr, field }) => {
+                let formatted =
+                    format_type_and_metadata(field.data_type(), Some(field.metadata()));
+                write!(f, "CAST({expr} AS {formatted})")
             }
-            Expr::TryCast(TryCast { expr, data_type }) => {
-                write!(f, "TRY_CAST({expr} AS {data_type:?})")
+            Expr::TryCast(TryCast { expr, field }) => {
+                let formatted =
+                    format_type_and_metadata(field.data_type(), Some(field.metadata()));
+                write!(f, "TRY_CAST({expr} AS {formatted})")
             }
             Expr::Not(expr) => write!(f, "NOT {expr}"),
             Expr::Negative(expr) => write!(f, "(- {expr})"),
@@ -3017,56 +3417,70 @@ impl Display for Expr {
                 subquery,
                 negated: false,
             }) => write!(f, "{expr} IN ({subquery:?})"),
+            Expr::SetComparison(SetComparison {
+                expr,
+                subquery,
+                op,
+                quantifier,
+            }) => write!(f, "{expr} {op} {quantifier} ({subquery:?})"),
             Expr::ScalarSubquery(subquery) => write!(f, "({subquery:?})"),
             Expr::BinaryExpr(expr) => write!(f, "{expr}"),
             Expr::ScalarFunction(fun) => {
                 fmt_function(f, fun.name(), false, &fun.args, true)
             }
-            // TODO: use udf's display_name, need to fix the separator issue, <https://github.com/apache/datafusion/issues/10364>
-            // Expr::ScalarFunction(ScalarFunction { func, args }) => {
-            //     write!(f, "{}", func.display_name(args).unwrap())
-            // }
-            Expr::WindowFunction(WindowFunction { fun, params }) => match fun {
-                WindowFunctionDefinition::AggregateUDF(fun) => {
-                    match fun.window_function_display_name(params) {
-                        Ok(name) => {
-                            write!(f, "{name}")
-                        }
-                        Err(e) => {
-                            write!(f, "got error from window_function_display_name {e}")
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction { fun, params } = window_fun.as_ref();
+                match fun {
+                    WindowFunctionDefinition::AggregateUDF(fun) => {
+                        match fun.window_function_display_name(params) {
+                            Ok(name) => {
+                                write!(f, "{name}")
+                            }
+                            Err(e) => {
+                                write!(
+                                    f,
+                                    "got error from window_function_display_name {e}"
+                                )
+                            }
                         }
                     }
-                }
-                WindowFunctionDefinition::WindowUDF(fun) => {
-                    let WindowFunctionParams {
-                        args,
-                        partition_by,
-                        order_by,
-                        window_frame,
-                        null_treatment,
-                    } = params;
-
-                    fmt_function(f, &fun.to_string(), false, args, true)?;
+                    WindowFunctionDefinition::WindowUDF(fun) => {
+                        let WindowFunctionParams {
+                            args,
+                            partition_by,
+                            order_by,
+                            window_frame,
+                            filter,
+                            null_treatment,
+                            distinct,
+                        } = params;
+
+                        fmt_function(f, &fun.to_string(), *distinct, args, true)?;
+
+                        if let Some(nt) = null_treatment {
+                            write!(f, "{nt}")?;
+                        }
 
-                    if let Some(nt) = null_treatment {
-                        write!(f, "{nt}")?;
-                    }
+                        if let Some(fe) = filter {
+                            write!(f, " FILTER (WHERE {fe})")?;
+                        }
 
-                    if !partition_by.is_empty() {
-                        write!(f, " PARTITION BY [{}]", expr_vec_fmt!(partition_by))?;
-                    }
-                    if !order_by.is_empty() {
-                        write!(f, " ORDER BY [{}]", expr_vec_fmt!(order_by))?;
+                        if !partition_by.is_empty() {
+                            write!(f, " PARTITION BY [{}]", expr_vec_fmt!(partition_by))?;
+                        }
+                        if !order_by.is_empty() {
+                            write!(f, " ORDER BY [{}]", expr_vec_fmt!(order_by))?;
+                        }
+                        write!(
+                            f,
+                            " {} BETWEEN {} AND {}",
+                            window_frame.units,
+                            window_frame.start_bound,
+                            window_frame.end_bound
+                        )
                     }
-                    write!(
-                        f,
-                        " {} BETWEEN {} AND {}",
-                        window_frame.units,
-                        window_frame.start_bound,
-                        window_frame.end_bound
-                    )
                 }
-            },
+            }
             Expr::AggregateFunction(AggregateFunction { func, params }) => {
                 match func.display_name(params) {
                     Ok(name) => {
@@ -3203,8 +3617,8 @@ pub fn physical_name(expr: &Expr) -> Result<String> {
 mod test {
     use crate::expr_fn::col;
     use crate::{
-        case, lit, qualified_wildcard, wildcard, wildcard_with_options, ColumnarValue,
-        ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Volatility,
+        ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Volatility, case,
+        lit, placeholder, qualified_wildcard, wildcard, wildcard_with_options,
     };
     use arrow::datatypes::{Field, Schema};
     use sqlparser::ast;
@@ -3218,15 +3632,15 @@ mod test {
         let param_placeholders = vec![
             Expr::Placeholder(Placeholder {
                 id: "$1".to_string(),
-                data_type: None,
+                field: None,
             }),
             Expr::Placeholder(Placeholder {
                 id: "$2".to_string(),
-                data_type: None,
+                field: None,
             }),
             Expr::Placeholder(Placeholder {
                 id: "$3".to_string(),
-                data_type: None,
+                field: None,
             }),
         ];
         let in_list = Expr::InList(InList {
@@ -3252,8 +3666,8 @@ mod test {
                     match expr {
                         Expr::Placeholder(placeholder) => {
                             assert_eq!(
-                                placeholder.data_type,
-                                Some(DataType::Int32),
+                                placeholder.field.unwrap().data_type(),
+                                &DataType::Int32,
                                 "Placeholder {} should infer Int32",
                                 placeholder.id
                             );
@@ -3277,7 +3691,7 @@ mod test {
             expr: Box::new(col("name")),
             pattern: Box::new(Expr::Placeholder(Placeholder {
                 id: "$1".to_string(),
-                data_type: None,
+                field: None,
             })),
             negated: false,
             case_insensitive: false,
@@ -3290,7 +3704,7 @@ mod test {
         match inferred_expr {
             Expr::Like(like) => match *like.pattern {
                 Expr::Placeholder(placeholder) => {
-                    assert_eq!(placeholder.data_type, Some(DataType::Utf8));
+                    assert_eq!(placeholder.field.unwrap().data_type(), &DataType::Utf8);
                 }
                 _ => panic!("Expected Placeholder"),
             },
@@ -3305,8 +3719,8 @@ mod test {
             Expr::SimilarTo(like) => match *like.pattern {
                 Expr::Placeholder(placeholder) => {
                     assert_eq!(
-                        placeholder.data_type,
-                        Some(DataType::Utf8),
+                        placeholder.field.unwrap().data_type(),
+                        &DataType::Utf8,
                         "Placeholder {} should infer Utf8",
                         placeholder.id
                     );
@@ -3318,27 +3732,56 @@ mod test {
     }
 
     #[test]
-    #[allow(deprecated)]
+    fn infer_placeholder_with_metadata() {
+        // name == $1, where name is a non-nullable string
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("name", DataType::Utf8, false).with_metadata(
+                [("some_key".to_string(), "some_value".to_string())].into(),
+            ),
+        ]));
+        let df_schema = DFSchema::try_from(schema).unwrap();
+
+        let expr = binary_expr(col("name"), Operator::Eq, placeholder("$1"));
+
+        let (inferred_expr, _) = expr.infer_placeholder_types(&df_schema).unwrap();
+        match inferred_expr {
+            Expr::BinaryExpr(BinaryExpr { right, .. }) => match *right {
+                Expr::Placeholder(placeholder) => {
+                    assert_eq!(
+                        placeholder.field.as_ref().unwrap().data_type(),
+                        &DataType::Utf8
+                    );
+                    assert_eq!(
+                        placeholder.field.as_ref().unwrap().metadata(),
+                        df_schema.field(0).metadata()
+                    );
+                    // Inferred placeholder should still be nullable
+                    assert!(placeholder.field.as_ref().unwrap().is_nullable());
+                }
+                _ => panic!("Expected Placeholder"),
+            },
+            _ => panic!("Expected BinaryExpr"),
+        }
+    }
+
+    #[test]
     fn format_case_when() -> Result<()> {
         let expr = case(col("a"))
             .when(lit(1), lit(true))
             .when(lit(0), lit(false))
             .otherwise(lit(ScalarValue::Null))?;
         let expected = "CASE a WHEN Int32(1) THEN Boolean(true) WHEN Int32(0) THEN Boolean(false) ELSE NULL END";
-        assert_eq!(expected, expr.canonical_name());
         assert_eq!(expected, format!("{expr}"));
         Ok(())
     }
 
     #[test]
-    #[allow(deprecated)]
     fn format_cast() -> Result<()> {
         let expr = Expr::Cast(Cast {
-            expr: Box::new(Expr::Literal(ScalarValue::Float32(Some(1.23)))),
-            data_type: DataType::Utf8,
+            expr: Box::new(Expr::Literal(ScalarValue::Float32(Some(1.23)), None)),
+            field: DataType::Utf8.into_nullable_field_ref(),
         });
         let expected_canonical = "CAST(Float32(1.23) AS Utf8)";
-        assert_eq!(expected_canonical, expr.canonical_name());
         assert_eq!(expected_canonical, format!("{expr}"));
         // Note that CAST intentionally has a name that is different from its `Display`
         // representation. CAST does not change the name of expressions.
@@ -3420,7 +3863,7 @@ mod test {
     #[test]
     fn test_is_volatile_scalar_func() {
         // UDF
-        #[derive(Debug)]
+        #[derive(Debug, PartialEq, Eq, Hash)]
         struct TestScalarUDF {
             signature: Signature,
         }
@@ -3463,6 +3906,7 @@ mod test {
     }
 
     use super::*;
+    use crate::logical_plan::{EmptyRelation, LogicalPlan};
 
     #[test]
     fn test_display_wildcard() {
@@ -3553,6 +3997,28 @@ mod test {
         )
     }
 
+    #[test]
+    fn test_display_set_comparison() {
+        let subquery = Subquery {
+            subquery: Arc::new(LogicalPlan::EmptyRelation(EmptyRelation {
+                produce_one_row: false,
+                schema: Arc::new(DFSchema::empty()),
+            })),
+            outer_ref_columns: vec![],
+            spans: Spans::new(),
+        };
+
+        let expr = Expr::SetComparison(SetComparison::new(
+            Box::new(Expr::Column(Column::from_name("a"))),
+            subquery,
+            Operator::Gt,
+            SetQuantifier::Any,
+        ));
+
+        assert_eq!(format!("{expr}"), "a > ANY (<subquery>)");
+        assert_eq!(format!("{}", expr.human_display()), "a > ANY (<subquery>)");
+    }
+
     #[test]
     fn test_schema_display_alias_with_relation() {
         assert_eq!(
@@ -3592,4 +4058,73 @@ mod test {
             rename: opt_rename,
         }
     }
+
+    #[test]
+    fn test_size_of_expr() {
+        // because Expr is such a widely used struct in DataFusion
+        // it is important to keep its size as small as possible
+        //
+        // If this test fails when you change `Expr`, please try
+        // `Box`ing the fields to make `Expr` smaller
+        // See https://github.com/apache/datafusion/issues/16199 for details
+        assert_eq!(size_of::<Expr>(), 112);
+        assert_eq!(size_of::<ScalarValue>(), 64);
+        assert_eq!(size_of::<DataType>(), 24); // 3 ptrs
+        assert_eq!(size_of::<Vec<Expr>>(), 24);
+        assert_eq!(size_of::<Arc<Expr>>(), 8);
+    }
+
+    #[test]
+    fn test_accept_exprs() {
+        fn accept_exprs<E: AsRef<Expr>>(_: &[E]) {}
+
+        let expr = || -> Expr { lit(1) };
+
+        // Call accept_exprs with owned expressions
+        let owned_exprs = vec![expr(), expr()];
+        accept_exprs(&owned_exprs);
+
+        // Call accept_exprs with expressions from expr tree
+        let udf = Expr::ScalarFunction(ScalarFunction {
+            func: Arc::new(ScalarUDF::new_from_impl(TestUDF {})),
+            args: vec![expr(), expr()],
+        });
+        let Expr::ScalarFunction(scalar) = &udf else {
+            unreachable!()
+        };
+        accept_exprs(&scalar.args);
+
+        // Call accept_exprs with expressions collected from expr tree, without cloning
+        let mut collected_refs: Vec<&Expr> = scalar.args.iter().collect();
+        collected_refs.extend(&owned_exprs);
+        accept_exprs(&collected_refs);
+
+        // test helpers
+        #[derive(Debug, PartialEq, Eq, Hash)]
+        struct TestUDF {}
+        impl ScalarUDFImpl for TestUDF {
+            fn as_any(&self) -> &dyn Any {
+                unimplemented!()
+            }
+
+            fn name(&self) -> &str {
+                unimplemented!()
+            }
+
+            fn signature(&self) -> &Signature {
+                unimplemented!()
+            }
+
+            fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+                unimplemented!()
+            }
+
+            fn invoke_with_args(
+                &self,
+                _args: ScalarFunctionArgs,
+            ) -> Result<ColumnarValue> {
+                unimplemented!()
+            }
+        }
+    }
 }
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 67e80a8d9bba9..4254602d7c555 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -19,17 +19,18 @@
 
 use crate::expr::{
     AggregateFunction, BinaryExpr, Cast, Exists, GroupingSet, InList, InSubquery,
-    Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction, WindowFunctionParams,
+    NullTreatment, Placeholder, TryCast, Unnest, WildcardOptions, WindowFunction,
 };
 use crate::function::{
     AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory,
     StateFieldsArgs,
 };
+use crate::ptr_eq::PtrEq;
 use crate::select_expr::SelectExpr;
 use crate::{
+    AggregateUDF, Expr, LimitEffect, LogicalPlan, Operator, PartitionEvaluator,
+    ScalarFunctionArgs, ScalarFunctionImplementation, ScalarUDF, Signature, Volatility,
     conditional_expressions::CaseBuilder, expr::Sort, logical_plan::Subquery,
-    AggregateUDF, Expr, LogicalPlan, Operator, PartitionEvaluator, ScalarFunctionArgs,
-    ScalarFunctionImplementation, ScalarUDF, Signature, Volatility,
 };
 use crate::{
     AggregateUDFImpl, ColumnarValue, ScalarUDFImpl, WindowFrame, WindowUDF, WindowUDFImpl,
@@ -38,12 +39,14 @@ use arrow::compute::kernels::cast_utils::{
     parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month,
 };
 use arrow::datatypes::{DataType, Field, FieldRef};
-use datafusion_common::{plan_err, Column, Result, ScalarValue, Spans, TableReference};
+use datafusion_common::{Column, Result, ScalarValue, Spans, TableReference, plan_err};
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
-use sqlparser::ast::NullTreatment;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use std::any::Any;
+use std::collections::HashMap;
 use std::fmt::Debug;
+use std::hash::Hash;
 use std::ops::Not;
 use std::sync::Arc;
 
@@ -68,8 +71,22 @@ pub fn col(ident: impl Into<Column>) -> Expr {
 
 /// Create an out reference column which hold a reference that has been resolved to a field
 /// outside of the current plan.
+/// The expression created by this function does not preserve the metadata of the outer column.
+/// Please use `out_ref_col_with_metadata` if you want to preserve the metadata.
 pub fn out_ref_col(dt: DataType, ident: impl Into<Column>) -> Expr {
-    Expr::OuterReferenceColumn(dt, ident.into())
+    out_ref_col_with_metadata(dt, HashMap::new(), ident)
+}
+
+/// Create an out reference column from an existing field (preserving metadata)
+pub fn out_ref_col_with_metadata(
+    dt: DataType,
+    metadata: HashMap<String, String>,
+    ident: impl Into<Column>,
+) -> Expr {
+    let column = ident.into();
+    let field: FieldRef =
+        Arc::new(Field::new(column.name(), dt, true).with_metadata(metadata));
+    Expr::OuterReferenceColumn(field, column)
 }
 
 /// Create an unqualified column expression from the provided name, without normalizing
@@ -102,13 +119,13 @@ pub fn ident(name: impl Into<String>) -> Expr {
 ///
 /// ```rust
 /// # use datafusion_expr::{placeholder};
-/// let p = placeholder("$0"); // $0, refers to parameter 1
-/// assert_eq!(p.to_string(), "$0")
+/// let p = placeholder("$1"); // $1, refers to parameter 1
+/// assert_eq!(p.to_string(), "$1")
 /// ```
 pub fn placeholder(id: impl Into<String>) -> Expr {
     Expr::Placeholder(Placeholder {
         id: id.into(),
-        data_type: None,
+        field: None,
     })
 }
 
@@ -324,6 +341,11 @@ pub fn is_null(expr: Expr) -> Expr {
     Expr::IsNull(Box::new(expr))
 }
 
+/// Create is not null expression
+pub fn is_not_null(expr: Expr) -> Expr {
+    Expr::IsNotNull(Box::new(expr))
+}
+
 /// Create is true expression
 pub fn is_true(expr: Expr) -> Expr {
     Expr::IsTrue(Box::new(expr))
@@ -401,11 +423,12 @@ pub fn create_udf(
 
 /// Implements [`ScalarUDFImpl`] for functions that have a single signature and
 /// return type.
+#[derive(PartialEq, Eq, Hash)]
 pub struct SimpleScalarUDF {
     name: String,
     signature: Signature,
     return_type: DataType,
-    fun: ScalarFunctionImplementation,
+    fun: PtrEq<ScalarFunctionImplementation>,
 }
 
 impl Debug for SimpleScalarUDF {
@@ -449,7 +472,7 @@ impl SimpleScalarUDF {
             name: name.into(),
             signature,
             return_type,
-            fun,
+            fun: fun.into(),
         }
     }
 }
@@ -506,11 +529,12 @@ pub fn create_udaf(
 
 /// Implements [`AggregateUDFImpl`] for functions that have a single signature and
 /// return type.
+#[derive(PartialEq, Eq, Hash)]
 pub struct SimpleAggregateUDF {
     name: String,
     signature: Signature,
     return_type: DataType,
-    accumulator: AccumulatorFactoryFunction,
+    accumulator: PtrEq<AccumulatorFactoryFunction>,
     state_fields: Vec<FieldRef>,
 }
 
@@ -542,7 +566,7 @@ impl SimpleAggregateUDF {
             name,
             signature,
             return_type,
-            accumulator,
+            accumulator: accumulator.into(),
             state_fields,
         }
     }
@@ -561,7 +585,7 @@ impl SimpleAggregateUDF {
             name,
             signature,
             return_type,
-            accumulator,
+            accumulator: accumulator.into(),
             state_fields,
         }
     }
@@ -620,11 +644,12 @@ pub fn create_udwf(
 
 /// Implements [`WindowUDFImpl`] for functions that have a single signature and
 /// return type.
+#[derive(PartialEq, Eq, Hash)]
 pub struct SimpleWindowUDF {
     name: String,
     signature: Signature,
     return_type: DataType,
-    partition_evaluator_factory: PartitionEvaluatorFactory,
+    partition_evaluator_factory: PtrEq<PartitionEvaluatorFactory>,
 }
 
 impl Debug for SimpleWindowUDF {
@@ -654,7 +679,7 @@ impl SimpleWindowUDF {
             name,
             signature,
             return_type,
-            partition_evaluator_factory,
+            partition_evaluator_factory: partition_evaluator_factory.into(),
         }
     }
 }
@@ -686,21 +711,25 @@ impl WindowUDFImpl for SimpleWindowUDF {
             true,
         )))
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 pub fn interval_year_month_lit(value: &str) -> Expr {
     let interval = parse_interval_year_month(value).ok();
-    Expr::Literal(ScalarValue::IntervalYearMonth(interval))
+    Expr::Literal(ScalarValue::IntervalYearMonth(interval), None)
 }
 
 pub fn interval_datetime_lit(value: &str) -> Expr {
     let interval = parse_interval_day_time(value).ok();
-    Expr::Literal(ScalarValue::IntervalDayTime(interval))
+    Expr::Literal(ScalarValue::IntervalDayTime(interval), None)
 }
 
 pub fn interval_month_day_nano_lit(value: &str) -> Expr {
     let interval = parse_interval_month_day_nano(value).ok();
-    Expr::Literal(ScalarValue::IntervalMonthDayNano(interval))
+    Expr::Literal(ScalarValue::IntervalMonthDayNano(interval), None)
 }
 
 /// Extensions for configuring [`Expr::AggregateFunction`] or [`Expr::WindowFunction`]
@@ -711,8 +740,8 @@ pub fn interval_month_day_nano_lit(value: &str) -> Expr {
 /// # Example
 /// ```no_run
 /// # use datafusion_common::Result;
+/// # use datafusion_expr::expr::NullTreatment;
 /// # use datafusion_expr::test::function_stub::count;
-/// # use sqlparser::ast::NullTreatment;
 /// # use datafusion_expr::{ExprFunctionExt, lit, Expr, col};
 /// # // first_value is an aggregate function in another crate
 /// # fn first_value(_arg: Expr) -> Expr {
@@ -765,7 +794,7 @@ pub trait ExprFunctionExt {
 #[derive(Debug, Clone)]
 pub enum ExprFuncKind {
     Aggregate(AggregateFunction),
-    Window(WindowFunction),
+    Window(Box<WindowFunction>),
 }
 
 /// Implementation of [`ExprFunctionExt`].
@@ -821,28 +850,22 @@ impl ExprFuncBuilder {
 
         let fun_expr = match fun {
             ExprFuncKind::Aggregate(mut udaf) => {
-                udaf.params.order_by = order_by;
+                udaf.params.order_by = order_by.unwrap_or_default();
                 udaf.params.filter = filter.map(Box::new);
                 udaf.params.distinct = distinct;
                 udaf.params.null_treatment = null_treatment;
                 Expr::AggregateFunction(udaf)
             }
-            ExprFuncKind::Window(WindowFunction {
-                fun,
-                params: WindowFunctionParams { args, .. },
-            }) => {
+            ExprFuncKind::Window(mut udwf) => {
                 let has_order_by = order_by.as_ref().map(|o| !o.is_empty());
-                Expr::WindowFunction(WindowFunction {
-                    fun,
-                    params: WindowFunctionParams {
-                        args,
-                        partition_by: partition_by.unwrap_or_default(),
-                        order_by: order_by.unwrap_or_default(),
-                        window_frame: window_frame
-                            .unwrap_or_else(|| WindowFrame::new(has_order_by)),
-                        null_treatment,
-                    },
-                })
+                udwf.params.partition_by = partition_by.unwrap_or_default();
+                udwf.params.order_by = order_by.unwrap_or_default();
+                udwf.params.window_frame =
+                    window_frame.unwrap_or_else(|| WindowFrame::new(has_order_by));
+                udwf.params.filter = filter.map(Box::new);
+                udwf.params.null_treatment = null_treatment;
+                udwf.params.distinct = distinct;
+                Expr::WindowFunction(udwf)
             }
         };
 
diff --git a/datafusion/expr/src/expr_rewriter/guarantees.rs b/datafusion/expr/src/expr_rewriter/guarantees.rs
new file mode 100644
index 0000000000000..30c79f6529ba3
--- /dev/null
+++ b/datafusion/expr/src/expr_rewriter/guarantees.rs
@@ -0,0 +1,668 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Rewrite expressions based on external expression value range guarantees.
+
+use crate::{Between, BinaryExpr, Expr, expr::InList, lit};
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
+use datafusion_common::{DataFusionError, HashMap, Result, ScalarValue};
+use datafusion_expr_common::interval_arithmetic::{Interval, NullableInterval};
+use std::borrow::Cow;
+
+/// Rewrite expressions to incorporate guarantees.
+///
+/// See [`rewrite_with_guarantees`] for more information
+pub struct GuaranteeRewriter<'a> {
+    guarantees: HashMap<&'a Expr, &'a NullableInterval>,
+}
+
+impl<'a> GuaranteeRewriter<'a> {
+    pub fn new(
+        guarantees: impl IntoIterator<Item = &'a (Expr, NullableInterval)>,
+    ) -> Self {
+        Self {
+            guarantees: guarantees.into_iter().map(|(k, v)| (k, v)).collect(),
+        }
+    }
+}
+
+/// Rewrite expressions to incorporate guarantees.
+///
+/// Guarantees are a mapping from an expression (which currently is always a
+/// column reference) to a [NullableInterval] that represents the known possible
+/// values of the expression.
+///
+/// Rewriting expressions using this type of guarantee can make the work of other expression
+/// simplifications, like const evaluation, easier.
+///
+/// For example, if we know that a column is not null and has values in the
+/// range [1, 10), we can rewrite `x IS NULL` to `false` or `x < 10` to `true`.
+///
+/// If the set of guarantees will be used to rewrite more than one expression, consider using
+/// [rewrite_with_guarantees_map] instead.
+///
+/// A full example of using this rewrite rule can be found in
+/// [`ExprSimplifier::with_guarantees()`](https://docs.rs/datafusion/latest/datafusion/optimizer/simplify_expressions/struct.ExprSimplifier.html#method.with_guarantees).
+pub fn rewrite_with_guarantees<'a>(
+    expr: Expr,
+    guarantees: impl IntoIterator<Item = &'a (Expr, NullableInterval)>,
+) -> Result<Transformed<Expr>> {
+    let guarantees_map: HashMap<&Expr, &NullableInterval> =
+        guarantees.into_iter().map(|(k, v)| (k, v)).collect();
+    rewrite_with_guarantees_map(expr, &guarantees_map)
+}
+
+/// Rewrite expressions to incorporate guarantees.
+///
+/// Guarantees are a mapping from an expression (which currently is always a
+/// column reference) to a [NullableInterval]. The interval represents the known
+/// possible values of the column.
+///
+/// For example, if we know that a column is not null and has values in the
+/// range [1, 10), we can rewrite `x IS NULL` to `false` or `x < 10` to `true`.
+pub fn rewrite_with_guarantees_map<'a>(
+    expr: Expr,
+    guarantees: &'a HashMap<&'a Expr, &'a NullableInterval>,
+) -> Result<Transformed<Expr>> {
+    if guarantees.is_empty() {
+        return Ok(Transformed::no(expr));
+    }
+
+    expr.transform_up(|e| rewrite_expr(e, guarantees))
+}
+
+impl TreeNodeRewriter for GuaranteeRewriter<'_> {
+    type Node = Expr;
+
+    fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
+        if self.guarantees.is_empty() {
+            return Ok(Transformed::no(expr));
+        }
+
+        rewrite_expr(expr, &self.guarantees)
+    }
+}
+
+fn rewrite_expr(
+    expr: Expr,
+    guarantees: &HashMap<&Expr, &NullableInterval>,
+) -> Result<Transformed<Expr>> {
+    // If an expression collapses to a single value, replace it with a literal
+    if let Some(interval) = guarantees.get(&expr)
+        && let Some(value) = interval.single_value()
+    {
+        return Ok(Transformed::yes(lit(value)));
+    }
+
+    let result = match expr {
+        Expr::IsNull(inner) => match guarantees.get(inner.as_ref()) {
+            Some(NullableInterval::Null { .. }) => Transformed::yes(lit(true)),
+            Some(NullableInterval::NotNull { .. }) => Transformed::yes(lit(false)),
+            _ => Transformed::no(Expr::IsNull(inner)),
+        },
+        Expr::IsNotNull(inner) => match guarantees.get(inner.as_ref()) {
+            Some(NullableInterval::Null { .. }) => Transformed::yes(lit(false)),
+            Some(NullableInterval::NotNull { .. }) => Transformed::yes(lit(true)),
+            _ => Transformed::no(Expr::IsNotNull(inner)),
+        },
+        Expr::Between(b) => rewrite_between(b, guarantees)?,
+        Expr::BinaryExpr(b) => rewrite_binary_expr(b, guarantees)?,
+        Expr::InList(i) => rewrite_inlist(i, guarantees)?,
+        expr => Transformed::no(expr),
+    };
+    Ok(result)
+}
+
+fn rewrite_between(
+    between: Between,
+    guarantees: &HashMap<&Expr, &NullableInterval>,
+) -> Result<Transformed<Expr>> {
+    let (Some(expr_interval), Expr::Literal(low, _), Expr::Literal(high, _)) = (
+        guarantees.get(between.expr.as_ref()),
+        between.low.as_ref(),
+        between.high.as_ref(),
+    ) else {
+        return Ok(Transformed::no(Expr::Between(between)));
+    };
+
+    // Ensure that, if low or high are null, their type matches the other bound
+    let low = ensure_typed_null(low, high)?;
+    let high = ensure_typed_null(high, &low)?;
+
+    let Ok(between_interval) = Interval::try_new(low, high) else {
+        // If we can't create an interval from the literals, be conservative and simply leave
+        // the expression unmodified.
+        return Ok(Transformed::no(Expr::Between(between)));
+    };
+
+    if between_interval.lower().is_null() && between_interval.upper().is_null() {
+        return Ok(Transformed::yes(lit(between_interval.lower().clone())));
+    }
+
+    let expr_interval = match expr_interval {
+        NullableInterval::Null { datatype } => {
+            // Value is guaranteed to be null, so we can simplify to null.
+            return Ok(Transformed::yes(lit(
+                ScalarValue::try_new_null(datatype).unwrap_or(ScalarValue::Null)
+            )));
+        }
+        NullableInterval::MaybeNull { .. } => {
+            // Value may or may not be null, so we can't simplify the expression.
+            return Ok(Transformed::no(Expr::Between(between)));
+        }
+        NullableInterval::NotNull { values } => values,
+    };
+
+    let result = if between_interval.lower().is_null() {
+        // <expr> (NOT) BETWEEN NULL AND <high>
+        let upper_bound = Interval::from(between_interval.upper().clone());
+        if expr_interval.gt(&upper_bound)?.eq(&Interval::TRUE) {
+            // if <expr> > high, then certainly false
+            Transformed::yes(lit(between.negated))
+        } else if expr_interval.lt_eq(&upper_bound)?.eq(&Interval::TRUE) {
+            // if <expr> <= high, then certainly null
+            Transformed::yes(lit(ScalarValue::try_new_null(&expr_interval.data_type())
+                .unwrap_or(ScalarValue::Null)))
+        } else {
+            // otherwise unknown
+            Transformed::no(Expr::Between(between))
+        }
+    } else if between_interval.upper().is_null() {
+        // <expr> (NOT) BETWEEN <low> AND NULL
+        let lower_bound = Interval::from(between_interval.lower().clone());
+        if expr_interval.lt(&lower_bound)?.eq(&Interval::TRUE) {
+            // if <expr> < low, then certainly false
+            Transformed::yes(lit(between.negated))
+        } else if expr_interval.gt_eq(&lower_bound)?.eq(&Interval::TRUE) {
+            // if <expr> >= low, then certainly null
+            Transformed::yes(lit(ScalarValue::try_new_null(&expr_interval.data_type())
+                .unwrap_or(ScalarValue::Null)))
+        } else {
+            // otherwise unknown
+            Transformed::no(Expr::Between(between))
+        }
+    } else {
+        let contains = between_interval.contains(expr_interval)?;
+        if contains.eq(&Interval::TRUE) {
+            Transformed::yes(lit(!between.negated))
+        } else if contains.eq(&Interval::FALSE) {
+            Transformed::yes(lit(between.negated))
+        } else {
+            Transformed::no(Expr::Between(between))
+        }
+    };
+    Ok(result)
+}
+
+fn ensure_typed_null(
+    value: &ScalarValue,
+    other: &ScalarValue,
+) -> Result<ScalarValue, DataFusionError> {
+    Ok(
+        if value.data_type().is_null() && !other.data_type().is_null() {
+            ScalarValue::try_new_null(&other.data_type())?
+        } else {
+            value.clone()
+        },
+    )
+}
+
+fn rewrite_binary_expr(
+    binary: BinaryExpr,
+    guarantees: &HashMap<&Expr, &NullableInterval>,
+) -> Result<Transformed<Expr>, DataFusionError> {
+    // The left or right side of expression might either have a guarantee
+    // or be a literal. Either way, we can resolve them to a NullableInterval.
+    let left_interval = guarantees
+        .get(binary.left.as_ref())
+        .map(|interval| Cow::Borrowed(*interval))
+        .or_else(|| {
+            if let Expr::Literal(value, _) = binary.left.as_ref() {
+                Some(Cow::Owned(value.clone().into()))
+            } else {
+                None
+            }
+        });
+    let right_interval = guarantees
+        .get(binary.right.as_ref())
+        .map(|interval| Cow::Borrowed(*interval))
+        .or_else(|| {
+            if let Expr::Literal(value, _) = binary.right.as_ref() {
+                Some(Cow::Owned(value.clone().into()))
+            } else {
+                None
+            }
+        });
+
+    if let (Some(left_interval), Some(right_interval)) = (left_interval, right_interval) {
+        let result = left_interval.apply_operator(&binary.op, right_interval.as_ref())?;
+        if result.is_certainly_true() {
+            return Ok(Transformed::yes(lit(true)));
+        } else if result.is_certainly_false() {
+            return Ok(Transformed::yes(lit(false)));
+        }
+    }
+    Ok(Transformed::no(Expr::BinaryExpr(binary)))
+}
+
+fn rewrite_inlist(
+    inlist: InList,
+    guarantees: &HashMap<&Expr, &NullableInterval>,
+) -> Result<Transformed<Expr>, DataFusionError> {
+    let Some(interval) = guarantees.get(inlist.expr.as_ref()) else {
+        return Ok(Transformed::no(Expr::InList(inlist)));
+    };
+
+    let InList {
+        expr,
+        list,
+        negated,
+    } = inlist;
+
+    // Can remove items from the list that don't match the guarantee
+    let list: Vec<Expr> = list
+        .into_iter()
+        .filter_map(|expr| {
+            if let Expr::Literal(item, _) = &expr {
+                match interval.contains(NullableInterval::from(item.clone())) {
+                    // If we know for certain the value isn't in the column's interval,
+                    // we can skip checking it.
+                    Ok(interval) if interval.is_certainly_false() => None,
+                    Ok(_) => Some(Ok(expr)),
+                    Err(e) => Some(Err(e)),
+                }
+            } else {
+                Some(Ok(expr))
+            }
+        })
+        .collect::<Result<_, DataFusionError>>()?;
+
+    Ok(Transformed::yes(Expr::InList(InList {
+        expr,
+        list,
+        negated,
+    })))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::{Operator, col};
+    use datafusion_common::ScalarValue;
+    use datafusion_common::tree_node::TransformedResult;
+
+    #[test]
+    fn test_not_null_guarantee() {
+        // IsNull / IsNotNull can be rewritten to true / false
+        let guarantees = [
+            // Note: AlwaysNull case handled by test_column_single_value test,
+            // since it's a special case of a column with a single value.
+            (
+                col("x"),
+                NullableInterval::NotNull {
+                    values: Interval::make(Some(1), Some(3)).unwrap(),
+                },
+            ),
+        ];
+
+        let is_null_cases = vec![
+            // x IS NULL => guaranteed false
+            (col("x").is_null(), Some(lit(false))),
+            // x IS NOT NULL => guaranteed true
+            (col("x").is_not_null(), Some(lit(true))),
+            // [1, 3] BETWEEN 0 AND 10 => guaranteed true
+            (col("x").between(lit(0), lit(10)), Some(lit(true))),
+            // x BETWEEN 1 AND -2 => unknown (actually guaranteed false)
+            (col("x").between(lit(1), lit(-2)), None),
+            // [1, 3] BETWEEN NULL AND 0 => guaranteed false
+            (
+                col("x").between(lit(ScalarValue::Null), lit(0)),
+                Some(lit(false)),
+            ),
+            // [1, 3] BETWEEN NULL AND 1 => unknown
+            (col("x").between(lit(ScalarValue::Null), lit(1)), None),
+            // [1, 3] BETWEEN NULL AND 2 => unknown
+            (col("x").between(lit(ScalarValue::Null), lit(2)), None),
+            // [1, 3] BETWEEN NULL AND 3 => guaranteed NULL
+            (
+                col("x").between(lit(ScalarValue::Null), lit(3)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] BETWEEN NULL AND 4 => guaranteed NULL
+            (
+                col("x").between(lit(ScalarValue::Null), lit(4)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] BETWEEN 1 AND NULL => guaranteed NULL
+            (
+                col("x").between(lit(0), lit(ScalarValue::Null)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] BETWEEN 1 AND NULL => guaranteed NULL
+            (
+                col("x").between(lit(1), lit(ScalarValue::Null)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] BETWEEN 2 AND NULL => unknown
+            (col("x").between(lit(2), lit(ScalarValue::Null)), None),
+            // [1, 3] BETWEEN 3 AND NULL => unknown
+            (col("x").between(lit(3), lit(ScalarValue::Null)), None),
+            // [1, 3] BETWEEN 4 AND NULL => guaranteed false
+            (
+                col("x").between(lit(4), lit(ScalarValue::Null)),
+                Some(lit(false)),
+            ),
+            // [1, 3] NOT BETWEEN NULL AND 0 => guaranteed false
+            (
+                col("x").not_between(lit(ScalarValue::Null), lit(0)),
+                Some(lit(true)),
+            ),
+            // [1, 3] NOT BETWEEN NULL AND 1 => unknown
+            (col("x").not_between(lit(ScalarValue::Null), lit(1)), None),
+            // [1, 3] NOT BETWEEN NULL AND 2 => unknown
+            (col("x").not_between(lit(ScalarValue::Null), lit(2)), None),
+            // [1, 3] NOT BETWEEN NULL AND 3 => guaranteed NULL
+            (
+                col("x").not_between(lit(ScalarValue::Null), lit(3)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] NOT BETWEEN NULL AND 4 => guaranteed NULL
+            (
+                col("x").not_between(lit(ScalarValue::Null), lit(4)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] NOT BETWEEN 1 AND NULL => guaranteed NULL
+            (
+                col("x").not_between(lit(0), lit(ScalarValue::Null)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] NOT BETWEEN 1 AND NULL => guaranteed NULL
+            (
+                col("x").not_between(lit(1), lit(ScalarValue::Null)),
+                Some(lit(ScalarValue::Int32(None))),
+            ),
+            // [1, 3] NOT BETWEEN 2 AND NULL => unknown
+            (col("x").not_between(lit(2), lit(ScalarValue::Null)), None),
+            // [1, 3] NOT BETWEEN 3 AND NULL => unknown
+            (col("x").not_between(lit(3), lit(ScalarValue::Null)), None),
+            // [1, 3] NOT BETWEEN 4 AND NULL => guaranteed false
+            (
+                col("x").not_between(lit(4), lit(ScalarValue::Null)),
+                Some(lit(true)),
+            ),
+        ];
+
+        for case in is_null_cases {
+            let output = rewrite_with_guarantees(case.0.clone(), guarantees.iter())
+                .data()
+                .unwrap();
+            let expected = match case.1 {
+                None => case.0.clone(),
+                Some(expected) => expected,
+            };
+
+            assert_eq!(output, expected, "Failed for {}", case.0);
+        }
+    }
+
+    fn validate_simplified_cases<T>(
+        guarantees: &[(Expr, NullableInterval)],
+        cases: &[(Expr, T)],
+    ) where
+        ScalarValue: From<T>,
+        T: Clone,
+    {
+        for (expr, expected_value) in cases {
+            let output = rewrite_with_guarantees(expr.clone(), guarantees.iter())
+                .data()
+                .unwrap();
+            let expected = lit(ScalarValue::from(expected_value.clone()));
+            assert_eq!(
+                output, expected,
+                "{expr} simplified to {output}, but expected {expected}"
+            );
+        }
+    }
+
+    fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Expr]) {
+        for expr in cases {
+            let output = rewrite_with_guarantees(expr.clone(), guarantees.iter())
+                .data()
+                .unwrap();
+            assert_eq!(
+                &output, expr,
+                "{expr} was simplified to {output}, but expected it to be unchanged"
+            );
+        }
+    }
+
+    #[test]
+    fn test_inequalities_non_null_unbounded() {
+        let guarantees = [
+            // y ∈ [2021-01-01, ∞) (not null)
+            (
+                col("x"),
+                NullableInterval::NotNull {
+                    values: Interval::try_new(
+                        ScalarValue::Date32(Some(18628)),
+                        ScalarValue::Date32(None),
+                    )
+                    .unwrap(),
+                },
+            ),
+        ];
+
+        // (original_expr, expected_simplification)
+        let simplified_cases = &[
+            (col("x").lt(lit(ScalarValue::Date32(Some(18628)))), false),
+            (col("x").lt_eq(lit(ScalarValue::Date32(Some(17000)))), false),
+            (col("x").gt(lit(ScalarValue::Date32(Some(18627)))), true),
+            (col("x").gt_eq(lit(ScalarValue::Date32(Some(18628)))), true),
+            (col("x").eq(lit(ScalarValue::Date32(Some(17000)))), false),
+            (col("x").not_eq(lit(ScalarValue::Date32(Some(17000)))), true),
+            (
+                col("x").between(
+                    lit(ScalarValue::Date32(Some(16000))),
+                    lit(ScalarValue::Date32(Some(17000))),
+                ),
+                false,
+            ),
+            (
+                col("x").not_between(
+                    lit(ScalarValue::Date32(Some(16000))),
+                    lit(ScalarValue::Date32(Some(17000))),
+                ),
+                true,
+            ),
+            (
+                Expr::BinaryExpr(BinaryExpr {
+                    left: Box::new(col("x")),
+                    op: Operator::IsDistinctFrom,
+                    right: Box::new(lit(ScalarValue::Null)),
+                }),
+                true,
+            ),
+            (
+                Expr::BinaryExpr(BinaryExpr {
+                    left: Box::new(col("x")),
+                    op: Operator::IsDistinctFrom,
+                    right: Box::new(lit(ScalarValue::Date32(Some(17000)))),
+                }),
+                true,
+            ),
+        ];
+
+        validate_simplified_cases(&guarantees, simplified_cases);
+
+        let unchanged_cases = &[
+            col("x").lt(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").lt_eq(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").gt(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").gt_eq(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").eq(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").not_eq(lit(ScalarValue::Date32(Some(19000)))),
+            col("x").between(
+                lit(ScalarValue::Date32(Some(18000))),
+                lit(ScalarValue::Date32(Some(19000))),
+            ),
+            col("x").not_between(
+                lit(ScalarValue::Date32(Some(18000))),
+                lit(ScalarValue::Date32(Some(19000))),
+            ),
+        ];
+
+        validate_unchanged_cases(&guarantees, unchanged_cases);
+    }
+
+    #[test]
+    fn test_inequalities_maybe_null() {
+        let guarantees = [
+            // x ∈ ("abc", "def"]? (maybe null)
+            (
+                col("x"),
+                NullableInterval::MaybeNull {
+                    values: Interval::try_new(
+                        ScalarValue::from("abc"),
+                        ScalarValue::from("def"),
+                    )
+                    .unwrap(),
+                },
+            ),
+        ];
+
+        // (original_expr, expected_simplification)
+        let simplified_cases = &[
+            (
+                Expr::BinaryExpr(BinaryExpr {
+                    left: Box::new(col("x")),
+                    op: Operator::IsDistinctFrom,
+                    right: Box::new(lit("z")),
+                }),
+                true,
+            ),
+            (
+                Expr::BinaryExpr(BinaryExpr {
+                    left: Box::new(col("x")),
+                    op: Operator::IsNotDistinctFrom,
+                    right: Box::new(lit("z")),
+                }),
+                false,
+            ),
+        ];
+
+        validate_simplified_cases(&guarantees, simplified_cases);
+
+        let unchanged_cases = &[
+            col("x").lt(lit("z")),
+            col("x").lt_eq(lit("z")),
+            col("x").gt(lit("a")),
+            col("x").gt_eq(lit("a")),
+            col("x").eq(lit("abc")),
+            col("x").not_eq(lit("a")),
+            col("x").between(lit("a"), lit("z")),
+            col("x").not_between(lit("a"), lit("z")),
+            Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(col("x")),
+                op: Operator::IsDistinctFrom,
+                right: Box::new(lit(ScalarValue::Null)),
+            }),
+        ];
+
+        validate_unchanged_cases(&guarantees, unchanged_cases);
+    }
+
+    #[test]
+    fn test_column_single_value() {
+        let scalars = [
+            ScalarValue::Null,
+            ScalarValue::Int32(Some(1)),
+            ScalarValue::Boolean(Some(true)),
+            ScalarValue::Boolean(None),
+            ScalarValue::from("abc"),
+            ScalarValue::LargeUtf8(Some("def".to_string())),
+            ScalarValue::Date32(Some(18628)),
+            ScalarValue::Date32(None),
+            ScalarValue::Decimal128(Some(1000), 19, 2),
+        ];
+
+        for scalar in scalars {
+            let guarantees = [(col("x"), NullableInterval::from(scalar.clone()))];
+
+            let output = rewrite_with_guarantees(col("x"), guarantees.iter())
+                .data()
+                .unwrap();
+            assert_eq!(output, Expr::Literal(scalar.clone(), None));
+        }
+    }
+
+    #[test]
+    fn test_in_list() {
+        let guarantees = [
+            // x ∈ [1, 10] (not null)
+            (
+                col("x"),
+                NullableInterval::NotNull {
+                    values: Interval::try_new(
+                        ScalarValue::Int32(Some(1)),
+                        ScalarValue::Int32(Some(10)),
+                    )
+                    .unwrap(),
+                },
+            ),
+        ];
+
+        // These cases should be simplified so the list doesn't contain any
+        // values the guarantee says are outside the range.
+        // (column_name, starting_list, negated, expected_list)
+        let cases = &[
+            // x IN (9, 11) => x IN (9)
+            ("x", vec![9, 11], false, vec![9]),
+            // x IN (10, 2) => x IN (10, 2)
+            ("x", vec![10, 2], false, vec![10, 2]),
+            // x NOT IN (9, 11) => x NOT IN (9)
+            ("x", vec![9, 11], true, vec![9]),
+            // x NOT IN (0, 22) => x NOT IN ()
+            ("x", vec![0, 22], true, vec![]),
+        ];
+
+        for (column_name, starting_list, negated, expected_list) in cases {
+            let expr = col(*column_name).in_list(
+                starting_list
+                    .iter()
+                    .map(|v| lit(ScalarValue::Int32(Some(*v))))
+                    .collect(),
+                *negated,
+            );
+            let output = rewrite_with_guarantees(expr.clone(), guarantees.iter())
+                .data()
+                .unwrap();
+            let expected_list = expected_list
+                .iter()
+                .map(|v| lit(ScalarValue::Int32(Some(*v))))
+                .collect();
+            assert_eq!(
+                output,
+                Expr::InList(InList {
+                    expr: Box::new(col(*column_name)),
+                    list: expected_list,
+                    negated: *negated,
+                })
+            );
+        }
+    }
+}
diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs
index 90dcbce46b017..32a88ab8cf310 100644
--- a/datafusion/expr/src/expr_rewriter/mod.rs
+++ b/datafusion/expr/src/expr_rewriter/mod.rs
@@ -26,12 +26,17 @@ use crate::expr::{Alias, Sort, Unnest};
 use crate::logical_plan::Projection;
 use crate::{Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder};
 
+use datafusion_common::TableReference;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::TableReference;
 use datafusion_common::{Column, DFSchema, Result};
 
+mod guarantees;
+pub use guarantees::GuaranteeRewriter;
+pub use guarantees::rewrite_with_guarantees;
+pub use guarantees::rewrite_with_guarantees_map;
 mod order_by;
+
 pub use order_by::rewrite_sort_cols_by_aggs;
 
 /// Trait for rewriting [`Expr`]s into function calls.
@@ -255,7 +260,18 @@ fn coerce_exprs_for_schema(
                     }
                     #[expect(deprecated)]
                     Expr::Wildcard { .. } => Ok(expr),
-                    _ => expr.cast_to(new_type, src_schema),
+                    _ => {
+                        match expr {
+                            // maintain the original name when casting a column, to avoid the
+                            // tablename being added to it when not explicitly set by the query
+                            // (see: https://github.com/apache/datafusion/issues/18818)
+                            Expr::Column(ref column) => {
+                                let name = column.name().to_owned();
+                                Ok(expr.cast_to(new_type, src_schema)?.alias(name))
+                            }
+                            _ => Ok(expr.cast_to(new_type, src_schema)?),
+                        }
+                    }
                 }
             } else {
                 Ok(expr)
@@ -354,10 +370,11 @@ mod test {
     use std::ops::Add;
 
     use super::*;
-    use crate::{col, lit, Cast};
+    use crate::literal::lit_with_metadata;
+    use crate::{Cast, col, lit};
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::tree_node::TreeNodeRewriter;
     use datafusion_common::ScalarValue;
+    use datafusion_common::tree_node::TreeNodeRewriter;
 
     #[derive(Default)]
     struct RecordingRewriter {
@@ -383,13 +400,13 @@ mod test {
         // rewrites all "foo" string literals to "bar"
         let transformer = |expr: Expr| -> Result<Transformed<Expr>> {
             match expr {
-                Expr::Literal(ScalarValue::Utf8(Some(utf8_val))) => {
+                Expr::Literal(ScalarValue::Utf8(Some(utf8_val)), metadata) => {
                     let utf8_val = if utf8_val == "foo" {
                         "bar".to_string()
                     } else {
                         utf8_val
                     };
-                    Ok(Transformed::yes(lit(utf8_val)))
+                    Ok(Transformed::yes(lit_with_metadata(utf8_val, metadata)))
                 }
                 // otherwise, return None
                 _ => Ok(Transformed::no(expr)),
@@ -433,7 +450,7 @@ mod test {
             vec![Some("tableC".into()), Some("tableC".into())],
             vec!["f", "ff"],
         );
-        let schemas = vec![schema_c, schema_f, schema_b, schema_a];
+        let schemas = [schema_c, schema_f, schema_b, schema_a];
         let schemas = schemas.iter().collect::<Vec<_>>();
 
         let normalized_expr =
@@ -476,7 +493,7 @@ mod test {
     ) -> DFSchema {
         let fields = fields
             .iter()
-            .map(|f| Arc::new(Field::new(f.to_string(), DataType::Int8, false)))
+            .map(|f| Arc::new(Field::new((*f).to_string(), DataType::Int8, false)))
             .collect::<Vec<_>>();
         let schema = Arc::new(Schema::new(fields));
         DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap()
diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs
index 6db95555502da..720788113c6cb 100644
--- a/datafusion/expr/src/expr_rewriter/order_by.rs
+++ b/datafusion/expr/src/expr_rewriter/order_by.rs
@@ -19,11 +19,9 @@
 
 use crate::expr::Alias;
 use crate::expr_rewriter::normalize_col;
-use crate::{expr::Sort, Cast, Expr, LogicalPlan, TryCast};
+use crate::{Cast, Expr, LogicalPlan, TryCast, expr::Sort};
 
-use datafusion_common::tree_node::{
-    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
-};
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{Column, Result};
 
 /// Rewrite sort on aggregate expressions to sort on the column of aggregate output
@@ -52,7 +50,7 @@ fn rewrite_sort_col_by_aggs(expr: Expr, plan: &LogicalPlan) -> Result<Expr> {
     // on top of them)
     if plan_inputs.len() == 1 {
         let proj_exprs = plan.expressions();
-        rewrite_in_terms_of_projection(expr, proj_exprs, plan_inputs[0])
+        rewrite_in_terms_of_projection(expr, &proj_exprs, plan_inputs[0])
     } else {
         Ok(expr)
     }
@@ -71,14 +69,16 @@ fn rewrite_sort_col_by_aggs(expr: Expr, plan: &LogicalPlan) -> Result<Expr> {
 /// 2. t produces an output schema with two columns "a", "b + c"
 fn rewrite_in_terms_of_projection(
     expr: Expr,
-    proj_exprs: Vec<Expr>,
+    proj_exprs: &[Expr],
     input: &LogicalPlan,
 ) -> Result<Expr> {
     // assumption is that each item in exprs, such as "b + c" is
     // available as an output column named "b + c"
     expr.transform(|expr| {
-        // search for unnormalized names first such as "c1" (such as aliases)
-        if let Some(found) = proj_exprs.iter().find(|a| (**a) == expr) {
+        // search for unnormalized names first such as "c1" (such as aliases).
+        // Also look inside aliases so e.g. `count(Int64(1))` matches
+        // `count(Int64(1)) AS count(*)`.
+        if let Some(found) = proj_exprs.iter().find(|a| expr_match(&expr, a)) {
             let (qualifier, field_name) = found.qualified_name();
             let col = Expr::Column(Column::new(qualifier, field_name));
             return Ok(Transformed::yes(col));
@@ -102,29 +102,27 @@ fn rewrite_in_terms_of_projection(
 
         let search_col = Expr::Column(Column::new_unqualified(name));
 
-        // look for the column named the same as this expr
-        let mut found = None;
-        for proj_expr in &proj_exprs {
-            proj_expr.apply(|e| {
-                if expr_match(&search_col, e) {
-                    found = Some(e.clone());
-                    return Ok(TreeNodeRecursion::Stop);
-                }
-                Ok(TreeNodeRecursion::Continue)
-            })?;
-        }
+        // Search only top-level projection expressions for a match.
+        // We intentionally avoid a recursive search (e.g. `apply`) to
+        // prevent matching sub-expressions of composites like
+        // `min(c2) + max(c3)` when the ORDER BY is just `min(c2)`.
+        let found = proj_exprs
+            .iter()
+            .find(|proj_expr| expr_match(&search_col, proj_expr));
 
         if let Some(found) = found {
+            let (qualifier, field_name) = found.qualified_name();
+            let col = Expr::Column(Column::new(qualifier, field_name));
             return Ok(Transformed::yes(match normalized_expr {
-                Expr::Cast(Cast { expr: _, data_type }) => Expr::Cast(Cast {
-                    expr: Box::new(found),
-                    data_type,
+                Expr::Cast(Cast { expr: _, field }) => Expr::Cast(Cast {
+                    expr: Box::new(col),
+                    field,
                 }),
-                Expr::TryCast(TryCast { expr: _, data_type }) => Expr::TryCast(TryCast {
-                    expr: Box::new(found),
-                    data_type,
+                Expr::TryCast(TryCast { expr: _, field }) => Expr::TryCast(TryCast {
+                    expr: Box::new(col),
+                    field,
                 }),
-                _ => found,
+                _ => col,
             }));
         }
 
@@ -152,13 +150,16 @@ mod test {
     use arrow::datatypes::{DataType, Field, Schema};
 
     use crate::{
-        cast, col, lit, logical_plan::builder::LogicalTableSource, try_cast,
-        LogicalPlanBuilder,
+        LogicalPlanBuilder, cast, col, lit, logical_plan::builder::LogicalTableSource,
+        try_cast,
     };
 
     use super::*;
     use crate::test::function_stub::avg;
+    use crate::test::function_stub::count;
+    use crate::test::function_stub::max;
     use crate::test::function_stub::min;
+    use crate::test::function_stub::sum;
 
     #[test]
     fn rewrite_sort_cols_by_agg() {
@@ -235,18 +236,19 @@ mod test {
             TestCase {
                 desc: r#"min(c2) --> "min(c2)" -- (column *named* "min(t.c2)"!)"#,
                 input: sort(min(col("c2"))),
-                expected: sort(col("min(t.c2)")),
+                expected: sort(Expr::Column(Column::new_unqualified("min(t.c2)"))),
             },
             TestCase {
                 desc: r#"c1 + min(c2) --> "c1 + min(c2)" -- (column *named* "min(t.c2)"!)"#,
                 input: sort(col("c1") + min(col("c2"))),
-                // should be "c1" not t.c1
-                expected: sort(col("c1") + col("min(t.c2)")),
+                expected: sort(
+                    col("c1") + Expr::Column(Column::new_unqualified("min(t.c2)")),
+                ),
             },
             TestCase {
-                desc: r#"avg(c3) --> "avg(t.c3)" as average (column *named* "avg(t.c3)", aliased)"#,
+                desc: r#"avg(c3) --> "average" (column *named* "average", from alias)"#,
                 input: sort(avg(col("c3"))),
-                expected: sort(col("avg(t.c3)").alias("average")),
+                expected: sort(col("average")),
             },
         ];
 
@@ -255,6 +257,202 @@ mod test {
         }
     }
 
+    /// When an aggregate is aliased in the projection,
+    /// ORDER BY on the original aggregate expression should resolve to
+    /// a Column reference using the alias name — not leak the inner
+    /// Alias expression node or resolve to a descendant subtree.
+    #[test]
+    fn rewrite_sort_resolves_alias_to_column_ref() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![min(col("c2")), max(col("c3"))])
+            .unwrap()
+            .project(vec![
+                col("c1"),
+                min(col("c2")).alias("min_val"),
+                max(col("c3")).alias("max_val"),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![
+            TestCase {
+                desc: "min(c2) with alias 'min_val' should resolve to col(min_val)",
+                input: sort(min(col("c2"))),
+                expected: sort(col("min_val")),
+            },
+            TestCase {
+                desc: "max(c3) with alias 'max_val' should resolve to col(max_val)",
+                input: sort(max(col("c3"))),
+                expected: sort(col("max_val")),
+            },
+        ];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn composite_proj_expr_containing_sort_col_as_subexpr() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![min(col("c2")), max(col("c3"))])
+            .unwrap()
+            .project(vec![
+                col("c1"),
+                (min(col("c2")) + max(col("c3"))).alias("range"),
+                min(col("c2")).alias("min_val"),
+                max(col("c3")).alias("max_val"),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![
+            TestCase {
+                desc: "sort by min(c2) should resolve to col(min_val), not col(range)",
+                input: sort(min(col("c2"))),
+                expected: sort(col("min_val")),
+            },
+            TestCase {
+                desc: "sort by max(c3) should resolve to col(max_val), not col(range)",
+                input: sort(max(col("c3"))),
+                expected: sort(col("max_val")),
+            },
+        ];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn composite_before_standalone_should_not_shadow() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![min(col("c2")), max(col("c2"))])
+            .unwrap()
+            .project(vec![
+                col("c1"),
+                (min(col("c2")) + max(col("c2"))).alias("combined"),
+                min(col("c2")),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![TestCase {
+            desc: "sort by min(c2) should resolve to col(min(t.c2)), not col(combined)",
+            input: sort(min(col("c2"))),
+            expected: sort(Expr::Column(Column::new_unqualified("min(t.c2)"))),
+        }];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn duplicate_aggregate_in_multiple_proj_exprs() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![min(col("c2"))])
+            .unwrap()
+            .project(vec![
+                col("c1"),
+                min(col("c2")).alias("first_alias"),
+                min(col("c2")).alias("second_alias"),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![TestCase {
+            desc: "sort by min(c2) with two aliases picks first_alias",
+            input: sort(min(col("c2"))),
+            expected: sort(col("first_alias")),
+        }];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn sort_agg_not_in_select_with_aliased_aggs() {
+        let plan = make_input()
+            .aggregate(
+                vec![col("c1")],
+                vec![min(col("c2")), max(col("c3")), sum(col("c3"))],
+            )
+            .unwrap()
+            .project(vec![
+                col("c1"),
+                min(col("c2")).alias("min_val"),
+                max(col("c3")).alias("max_val"),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![TestCase {
+            desc: "sort by sum(c3) not in projection should not be rewritten",
+            input: sort(sum(col("c3"))),
+            expected: sort(sum(col("c3"))),
+        }];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn cast_on_aliased_aggregate() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![min(col("c2"))])
+            .unwrap()
+            .project(vec![col("c1"), min(col("c2")).alias("min_val")])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![
+            TestCase {
+                desc: "CAST on aliased aggregate should preserve cast and resolve alias",
+                input: sort(cast(min(col("c2")), DataType::Int64)),
+                expected: sort(cast(col("min_val"), DataType::Int64)),
+            },
+            TestCase {
+                desc: "TryCast on aliased aggregate should preserve try_cast and resolve alias",
+                input: sort(try_cast(min(col("c2")), DataType::Int64)),
+                expected: sort(try_cast(col("min_val"), DataType::Int64)),
+            },
+        ];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
+    #[test]
+    fn count_star_with_alias() {
+        let plan = make_input()
+            .aggregate(vec![col("c1")], vec![count(lit(1))])
+            .unwrap()
+            .project(vec![col("c1"), count(lit(1)).alias("cnt")])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let cases = vec![TestCase {
+            desc: "sort by count(1) should resolve to cnt alias",
+            input: sort(count(lit(1))),
+            expected: sort(col("cnt")),
+        }];
+
+        for case in cases {
+            case.run(&plan)
+        }
+    }
+
     #[test]
     fn preserve_cast() {
         let plan = make_input()
@@ -269,12 +467,12 @@ mod test {
             TestCase {
                 desc: "Cast is preserved by rewrite_sort_cols_by_aggs",
                 input: sort(cast(col("c2"), DataType::Int64)),
-                expected: sort(cast(col("c2").alias("c2"), DataType::Int64)),
+                expected: sort(cast(col("c2"), DataType::Int64)),
             },
             TestCase {
                 desc: "TryCast is preserved by rewrite_sort_cols_by_aggs",
                 input: sort(try_cast(col("c2"), DataType::Int64)),
-                expected: sort(try_cast(col("c2").alias("c2"), DataType::Int64)),
+                expected: sort(try_cast(col("c2"), DataType::Int64)),
             },
         ];
 
diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
index bdf9911b006c7..92b78b157904f 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -15,26 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use super::{Between, Expr, Like};
+use super::{Between, Expr, Like, predicate_bounds};
 use crate::expr::{
     AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, InList,
     InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction,
     WindowFunctionParams,
 };
-use crate::type_coercion::functions::{
-    data_types_with_scalar_udf, fields_with_aggregate_udf, fields_with_window_udf,
-};
+use crate::type_coercion::functions::{UDFCoercionExt, fields_with_udf};
 use crate::udf::ReturnFieldArgs;
-use crate::{utils, LogicalPlan, Projection, Subquery, WindowFunctionDefinition};
+use crate::{LogicalPlan, Projection, Subquery, WindowFunctionDefinition, utils};
 use arrow::compute::can_cast_types;
 use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::datatype::FieldExt;
+use datafusion_common::metadata::FieldMetadata;
 use datafusion_common::{
-    not_impl_err, plan_datafusion_err, plan_err, Column, DataFusionError, ExprSchema,
-    Result, Spans, TableReference,
+    Column, DataFusionError, ExprSchema, Result, ScalarValue, Spans, TableReference,
+    not_impl_err, plan_datafusion_err, plan_err,
 };
 use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
-use std::collections::HashMap;
 use std::sync::Arc;
 
 /// Trait to allow expr to typable with respect to a schema
@@ -46,7 +45,7 @@ pub trait ExprSchemable {
     fn nullable(&self, input_schema: &dyn ExprSchema) -> Result<bool>;
 
     /// Given a schema, return the expr's optional metadata
-    fn metadata(&self, schema: &dyn ExprSchema) -> Result<HashMap<String, String>>;
+    fn metadata(&self, schema: &dyn ExprSchema) -> Result<FieldMetadata>;
 
     /// Convert to a field with respect to a schema
     fn to_field(
@@ -58,8 +57,12 @@ pub trait ExprSchemable {
     fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result<Expr>;
 
     /// Given a schema, return the type and nullability of the expr
+    #[deprecated(
+        since = "51.0.0",
+        note = "Use `to_field().1.is_nullable` and `to_field().1.data_type()` directly instead"
+    )]
     fn data_type_and_nullable(&self, schema: &dyn ExprSchema)
-        -> Result<(DataType, bool)>;
+    -> Result<(DataType, bool)>;
 }
 
 impl ExprSchemable for Expr {
@@ -82,15 +85,17 @@ impl ExprSchemable for Expr {
     /// # use std::collections::HashMap;
     ///
     /// fn main() {
-    ///   let expr = col("c1") + col("c2");
-    ///   let schema = DFSchema::from_unqualified_fields(
-    ///     vec![
-    ///       Field::new("c1", DataType::Int32, true),
-    ///       Field::new("c2", DataType::Float32, true),
-    ///       ].into(),
-    ///       HashMap::new(),
-    ///   ).unwrap();
-    ///   assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
+    ///     let expr = col("c1") + col("c2");
+    ///     let schema = DFSchema::from_unqualified_fields(
+    ///         vec![
+    ///             Field::new("c1", DataType::Int32, true),
+    ///             Field::new("c2", DataType::Float32, true),
+    ///         ]
+    ///         .into(),
+    ///         HashMap::new(),
+    ///     )
+    ///     .unwrap();
+    ///     assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
     /// }
     /// ```
     ///
@@ -105,17 +110,17 @@ impl ExprSchemable for Expr {
     fn get_type(&self, schema: &dyn ExprSchema) -> Result<DataType> {
         match self {
             Expr::Alias(Alias { expr, name, .. }) => match &**expr {
-                Expr::Placeholder(Placeholder { data_type, .. }) => match &data_type {
+                Expr::Placeholder(Placeholder { field, .. }) => match &field {
                     None => schema.data_type(&Column::from_name(name)).cloned(),
-                    Some(dt) => Ok(dt.clone()),
+                    Some(field) => Ok(field.data_type().clone()),
                 },
                 _ => expr.get_type(schema),
             },
             Expr::Negative(expr) => expr.get_type(schema),
             Expr::Column(c) => Ok(schema.data_type(c)?.clone()),
-            Expr::OuterReferenceColumn(ty, _) => Ok(ty.clone()),
-            Expr::ScalarVariable(ty, _) => Ok(ty.clone()),
-            Expr::Literal(l) => Ok(l.data_type()),
+            Expr::OuterReferenceColumn(field, _) => Ok(field.data_type().clone()),
+            Expr::ScalarVariable(field, _) => Ok(field.data_type().clone()),
+            Expr::Literal(l, _) => Ok(l.data_type()),
             Expr::Case(case) => {
                 for (_, then_expr) in &case.when_then_expr {
                     let then_type = then_expr.get_type(schema)?;
@@ -127,15 +132,18 @@ impl ExprSchemable for Expr {
                     .as_ref()
                     .map_or(Ok(DataType::Null), |e| e.get_type(schema))
             }
-            Expr::Cast(Cast { data_type, .. })
-            | Expr::TryCast(TryCast { data_type, .. }) => Ok(data_type.clone()),
+            Expr::Cast(Cast { field, .. }) | Expr::TryCast(TryCast { field, .. }) => {
+                Ok(field.data_type().clone())
+            }
             Expr::Unnest(Unnest { expr }) => {
                 let arg_data_type = expr.get_type(schema)?;
                 // Unnest's output type is the inner type of the list
                 match arg_data_type {
                     DataType::List(field)
                     | DataType::LargeList(field)
-                    | DataType::FixedSizeList(field, _) => Ok(field.data_type().clone()),
+                    | DataType::FixedSizeList(field, _)
+                    | DataType::ListView(field)
+                    | DataType::LargeListView(field) => Ok(field.data_type().clone()),
                     DataType::Struct(_) => Ok(arg_data_type),
                     DataType::Null => {
                         not_impl_err!("unnest() does not support null yet")
@@ -147,48 +155,16 @@ impl ExprSchemable for Expr {
                     }
                 }
             }
-            Expr::ScalarFunction(_func) => {
-                let (return_type, _) = self.data_type_and_nullable(schema)?;
-                Ok(return_type)
-            }
-            Expr::WindowFunction(window_function) => self
-                .data_type_and_nullable_with_window_function(schema, window_function)
-                .map(|(return_type, _)| return_type),
-            Expr::AggregateFunction(AggregateFunction {
-                func,
-                params: AggregateFunctionParams { args, .. },
-            }) => {
-                let fields = args
-                    .iter()
-                    .map(|e| e.to_field(schema).map(|(_, f)| f))
-                    .collect::<Result<Vec<_>>>()?;
-                let new_fields = fields_with_aggregate_udf(&fields, func)
-                    .map_err(|err| {
-                        let data_types = fields
-                            .iter()
-                            .map(|f| f.data_type().clone())
-                            .collect::<Vec<_>>();
-                        plan_datafusion_err!(
-                            "{} {}",
-                            match err {
-                                DataFusionError::Plan(msg) => msg,
-                                err => err.to_string(),
-                            },
-                            utils::generate_signature_error_msg(
-                                func.name(),
-                                func.signature().clone(),
-                                &data_types
-                            )
-                        )
-                    })?
-                    .into_iter()
-                    .collect::<Vec<_>>();
-                Ok(func.return_field(&new_fields)?.data_type().clone())
+            Expr::ScalarFunction(_)
+            | Expr::WindowFunction(_)
+            | Expr::AggregateFunction(_) => {
+                Ok(self.to_field(schema)?.1.data_type().clone())
             }
             Expr::Not(_)
             | Expr::IsNull(_)
             | Expr::Exists { .. }
             | Expr::InSubquery(_)
+            | Expr::SetComparison(_)
             | Expr::Between { .. }
             | Expr::InList { .. }
             | Expr::IsNotNull(_)
@@ -201,20 +177,16 @@ impl ExprSchemable for Expr {
             Expr::ScalarSubquery(subquery) => {
                 Ok(subquery.subquery.schema().field(0).data_type().clone())
             }
-            Expr::BinaryExpr(BinaryExpr {
-                ref left,
-                ref right,
-                ref op,
-            }) => BinaryTypeCoercer::new(
+            Expr::BinaryExpr(BinaryExpr { left, right, op }) => BinaryTypeCoercer::new(
                 &left.get_type(schema)?,
                 op,
                 &right.get_type(schema)?,
             )
             .get_result_type(),
             Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean),
-            Expr::Placeholder(Placeholder { data_type, .. }) => {
-                if let Some(dtype) = data_type {
-                    Ok(dtype.clone())
+            Expr::Placeholder(Placeholder { field, .. }) => {
+                if let Some(field) = field {
+                    Ok(field.data_type().clone())
                 } else {
                     // If the placeholder's type hasn't been specified, treat it as
                     // null (unspecified placeholders generate an error during planning)
@@ -277,18 +249,68 @@ impl ExprSchemable for Expr {
                 || high.nullable(input_schema)?),
 
             Expr::Column(c) => input_schema.nullable(c),
-            Expr::OuterReferenceColumn(_, _) => Ok(true),
-            Expr::Literal(value) => Ok(value.is_null()),
+            Expr::OuterReferenceColumn(field, _) => Ok(field.is_nullable()),
+            Expr::Literal(value, _) => Ok(value.is_null()),
             Expr::Case(case) => {
-                // This expression is nullable if any of the input expressions are nullable
-                let then_nullable = case
+                let nullable_then = case
                     .when_then_expr
                     .iter()
-                    .map(|(_, t)| t.nullable(input_schema))
-                    .collect::<Result<Vec<_>>>()?;
-                if then_nullable.contains(&true) {
-                    Ok(true)
+                    .filter_map(|(w, t)| {
+                        let is_nullable = match t.nullable(input_schema) {
+                            Err(e) => return Some(Err(e)),
+                            Ok(n) => n,
+                        };
+
+                        // Branches with a then expression that is not nullable do not impact the
+                        // nullability of the case expression.
+                        if !is_nullable {
+                            return None;
+                        }
+
+                        // For case-with-expression assume all 'then' expressions are reachable
+                        if case.expr.is_some() {
+                            return Some(Ok(()));
+                        }
+
+                        // For branches with a nullable 'then' expression, try to determine
+                        // if the 'then' expression is ever reachable in the situation where
+                        // it would evaluate to null.
+                        let bounds = match predicate_bounds::evaluate_bounds(
+                            w,
+                            Some(unwrap_certainly_null_expr(t)),
+                            input_schema,
+                        ) {
+                            Err(e) => return Some(Err(e)),
+                            Ok(b) => b,
+                        };
+
+                        let can_be_true = match bounds
+                            .contains_value(ScalarValue::Boolean(Some(true)))
+                        {
+                            Err(e) => return Some(Err(e)),
+                            Ok(b) => b,
+                        };
+
+                        if !can_be_true {
+                            // If the derived 'when' expression can never evaluate to true, the
+                            // 'then' expression is not reachable when it would evaluate to NULL.
+                            // The most common pattern for this is `WHEN x IS NOT NULL THEN x`.
+                            None
+                        } else {
+                            // The branch might be taken
+                            Some(Ok(()))
+                        }
+                    })
+                    .next();
+
+                if let Some(nullable_then) = nullable_then {
+                    // There is at least one reachable nullable 'then' expression, so the case
+                    // expression itself is nullable.
+                    // Use `Result::map` to propagate the error from `nullable_then` if there is one.
+                    nullable_then.map(|_| true)
                 } else if let Some(e) = &case.else_expr {
+                    // There are no reachable nullable 'then' expressions, so all we still need to
+                    // check is the 'else' expression's nullability.
                     e.nullable(input_schema)
                 } else {
                     // CASE produces NULL if there is no `else` expr
@@ -297,23 +319,11 @@ impl ExprSchemable for Expr {
                 }
             }
             Expr::Cast(Cast { expr, .. }) => expr.nullable(input_schema),
-            Expr::ScalarFunction(_func) => {
-                let (_, nullable) = self.data_type_and_nullable(input_schema)?;
-                Ok(nullable)
-            }
-            Expr::AggregateFunction(AggregateFunction { func, .. }) => {
-                Ok(func.is_nullable())
-            }
-            Expr::WindowFunction(window_function) => self
-                .data_type_and_nullable_with_window_function(
-                    input_schema,
-                    window_function,
-                )
-                .map(|(_, nullable)| nullable),
-            Expr::ScalarVariable(_, _)
-            | Expr::TryCast { .. }
-            | Expr::Unnest(_)
-            | Expr::Placeholder(_) => Ok(true),
+            Expr::ScalarFunction(_)
+            | Expr::AggregateFunction(_)
+            | Expr::WindowFunction(_) => Ok(self.to_field(input_schema)?.1.is_nullable()),
+            Expr::ScalarVariable(field, _) => Ok(field.is_nullable()),
+            Expr::TryCast { .. } | Expr::Unnest(_) | Expr::Placeholder(_) => Ok(true),
             Expr::IsNull(_)
             | Expr::IsNotNull(_)
             | Expr::IsTrue(_)
@@ -323,15 +333,14 @@ impl ExprSchemable for Expr {
             | Expr::IsNotFalse(_)
             | Expr::IsNotUnknown(_)
             | Expr::Exists { .. } => Ok(false),
+            Expr::SetComparison(_) => Ok(true),
             Expr::InSubquery(InSubquery { expr, .. }) => expr.nullable(input_schema),
             Expr::ScalarSubquery(subquery) => {
                 Ok(subquery.subquery.schema().field(0).is_nullable())
             }
-            Expr::BinaryExpr(BinaryExpr {
-                ref left,
-                ref right,
-                ..
-            }) => Ok(left.nullable(input_schema)? || right.nullable(input_schema)?),
+            Expr::BinaryExpr(BinaryExpr { left, right, .. }) => {
+                Ok(left.nullable(input_schema)? || right.nullable(input_schema)?)
+            }
             Expr::Like(Like { expr, pattern, .. })
             | Expr::SimilarTo(Like { expr, pattern, .. }) => {
                 Ok(expr.nullable(input_schema)? || pattern.nullable(input_schema)?)
@@ -346,9 +355,9 @@ impl ExprSchemable for Expr {
         }
     }
 
-    fn metadata(&self, schema: &dyn ExprSchema) -> Result<HashMap<String, String>> {
+    fn metadata(&self, schema: &dyn ExprSchema) -> Result<FieldMetadata> {
         self.to_field(schema)
-            .map(|(_, field)| field.metadata().clone())
+            .map(|(_, field)| FieldMetadata::from(field.metadata()))
     }
 
     /// Returns the datatype and nullability of the expression based on [ExprSchema].
@@ -372,59 +381,87 @@ impl ExprSchemable for Expr {
 
     /// Returns a [arrow::datatypes::Field] compatible with this expression.
     ///
+    /// This function converts an expression into a field with appropriate metadata
+    /// and nullability based on the expression type and context. It is the primary
+    /// mechanism for determining field-level schemas.
+    ///
+    /// # Field Property Resolution
+    ///
+    /// For each expression, the following properties are determined:
+    ///
+    /// ## Data Type Resolution
+    /// - **Column references**: Data type from input schema field
+    /// - **Literals**: Data type inferred from literal value
+    /// - **Aliases**: Data type inherited from the underlying expression (the aliased expression)
+    /// - **Binary expressions**: Result type from type coercion rules
+    /// - **Boolean expressions**: Always a boolean type
+    /// - **Cast expressions**: Target data type from cast operation
+    /// - **Function calls**: Return type based on function signature and argument types
+    ///
+    /// ## Nullability Determination
+    /// - **Column references**: Inherit nullability from input schema field
+    /// - **Literals**: Nullable only if literal value is NULL
+    /// - **Aliases**: Inherit nullability from the underlying expression (the aliased expression)
+    /// - **Binary expressions**: Nullable if either operand is nullable
+    /// - **Boolean expressions**: Always non-nullable (IS NULL, EXISTS, etc.)
+    /// - **Cast expressions**: determined by the input expression's nullability rules
+    /// - **Function calls**: Based on function nullability rules and input nullability
+    ///
+    /// ## Metadata Handling
+    /// - **Column references**: Preserve original field metadata from input schema
+    /// - **Literals**: Use explicitly provided metadata, otherwise empty
+    /// - **Aliases**: Merge underlying expr metadata with alias-specific metadata, preferring the alias metadata
+    /// - **Binary expressions**: field metadata is empty
+    /// - **Boolean expressions**: field metadata is empty
+    /// - **Cast expressions**: determined by the input expression's field metadata handling
+    /// - **Scalar functions**: Generate metadata via function's [`return_field_from_args`] method,
+    ///   with the default implementation returning empty field metadata
+    /// - **Aggregate functions**: Generate metadata via function's [`return_field`] method,
+    ///   with the default implementation returning empty field metadata
+    /// - **Window functions**: field metadata follows the function's return field
+    ///
+    /// ## Table Reference Scoping
+    /// - Establishes proper qualified field references when columns belong to specific tables
+    /// - Maintains table context for accurate field resolution in multi-table scenarios
+    ///
     /// So for example, a projected expression `col(c1) + col(c2)` is
     /// placed in an output field **named** col("c1 + c2")
+    ///
+    /// [`return_field_from_args`]: crate::ScalarUDF::return_field_from_args
+    /// [`return_field`]: crate::AggregateUDF::return_field
     fn to_field(
         &self,
         schema: &dyn ExprSchema,
     ) -> Result<(Option<TableReference>, Arc<Field>)> {
         let (relation, schema_name) = self.qualified_name();
-        #[allow(deprecated)]
+        #[expect(deprecated)]
         let field = match self {
             Expr::Alias(Alias {
                 expr,
-                name,
+                name: _,
                 metadata,
                 ..
             }) => {
-                let field = match &**expr {
-                    Expr::Placeholder(Placeholder { data_type, .. }) => {
-                        match &data_type {
-                            None => schema
-                                .data_type_and_nullable(&Column::from_name(name))
-                                .map(|(d, n)| Field::new(&schema_name, d.clone(), n)),
-                            Some(dt) => Ok(Field::new(
-                                &schema_name,
-                                dt.clone(),
-                                expr.nullable(schema)?,
-                            )),
-                        }
-                    }
-                    _ => expr.to_field(schema).map(|(_, f)| f.as_ref().clone()),
-                }?;
-
                 let mut combined_metadata = expr.metadata(schema)?;
                 if let Some(metadata) = metadata {
-                    if !metadata.is_empty() {
-                        combined_metadata.extend(metadata.clone());
-                    }
+                    combined_metadata.extend(metadata.clone());
                 }
 
-                Ok(Arc::new(field.with_metadata(combined_metadata)))
+                Ok(expr
+                    .to_field(schema)
+                    .map(|(_, f)| f)?
+                    .with_field_metadata(&combined_metadata))
             }
             Expr::Negative(expr) => expr.to_field(schema).map(|(_, f)| f),
-            Expr::Column(c) => schema.field_from_column(c).map(|f| Arc::new(f.clone())),
-            Expr::OuterReferenceColumn(ty, _) => {
-                Ok(Arc::new(Field::new(&schema_name, ty.clone(), true)))
-            }
-            Expr::ScalarVariable(ty, _) => {
-                Ok(Arc::new(Field::new(&schema_name, ty.clone(), true)))
+            Expr::Column(c) => schema.field_from_column(c).map(Arc::clone),
+            Expr::OuterReferenceColumn(field, _) => {
+                Ok(Arc::clone(field).renamed(&schema_name))
             }
-            Expr::Literal(l) => Ok(Arc::new(Field::new(
-                &schema_name,
-                l.data_type(),
-                l.is_null(),
-            ))),
+            Expr::ScalarVariable(field, _) => Ok(Arc::clone(field).renamed(&schema_name)),
+            Expr::Literal(l, metadata) => Ok(Arc::new(
+                Field::new(&schema_name, l.data_type(), l.is_null())
+                    .with_field_metadata_opt(metadata.as_ref()),
+            )),
             Expr::IsNull(_)
             | Expr::IsNotNull(_)
             | Expr::IsTrue(_)
@@ -437,16 +474,17 @@ impl ExprSchemable for Expr {
                 Ok(Arc::new(Field::new(&schema_name, DataType::Boolean, false)))
             }
             Expr::ScalarSubquery(subquery) => {
-                Ok(Arc::new(subquery.subquery.schema().field(0).clone()))
+                Ok(Arc::clone(&subquery.subquery.schema().fields()[0]))
             }
-            Expr::BinaryExpr(BinaryExpr {
-                ref left,
-                ref right,
-                ref op,
-            }) => {
-                let (lhs_type, lhs_nullable) = left.data_type_and_nullable(schema)?;
-                let (rhs_type, rhs_nullable) = right.data_type_and_nullable(schema)?;
-                let mut coercer = BinaryTypeCoercer::new(&lhs_type, op, &rhs_type);
+            Expr::BinaryExpr(BinaryExpr { left, right, op }) => {
+                let (left_field, right_field) =
+                    (left.to_field(schema)?.1, right.to_field(schema)?.1);
+
+                let (lhs_type, lhs_nullable) =
+                    (left_field.data_type(), left_field.is_nullable());
+                let (rhs_type, rhs_nullable) =
+                    (right_field.data_type(), right_field.is_nullable());
+                let mut coercer = BinaryTypeCoercer::new(lhs_type, op, rhs_type);
                 coercer.set_lhs_spans(left.spans().cloned().unwrap_or_default());
                 coercer.set_rhs_spans(right.spans().cloned().unwrap_or_default());
                 Ok(Arc::new(Field::new(
@@ -456,84 +494,54 @@ impl ExprSchemable for Expr {
                 )))
             }
             Expr::WindowFunction(window_function) => {
-                let (dt, nullable) = self.data_type_and_nullable_with_window_function(
-                    schema,
-                    window_function,
-                )?;
-                Ok(Arc::new(Field::new(&schema_name, dt, nullable)))
-            }
-            Expr::AggregateFunction(aggregate_function) => {
-                let AggregateFunction {
-                    func,
-                    params: AggregateFunctionParams { args, .. },
+                let WindowFunction {
+                    fun,
+                    params: WindowFunctionParams { args, .. },
                     ..
-                } = aggregate_function;
+                } = window_function.as_ref();
 
                 let fields = args
                     .iter()
                     .map(|e| e.to_field(schema).map(|(_, f)| f))
                     .collect::<Result<Vec<_>>>()?;
-                // Verify that function is invoked with correct number and type of arguments as defined in `TypeSignature`
-                let new_fields = fields_with_aggregate_udf(&fields, func)
-                    .map_err(|err| {
-                        let arg_types = fields
-                            .iter()
-                            .map(|f| f.data_type())
-                            .cloned()
-                            .collect::<Vec<_>>();
-                        plan_datafusion_err!(
-                            "{} {}",
-                            match err {
-                                DataFusionError::Plan(msg) => msg,
-                                err => err.to_string(),
-                            },
-                            utils::generate_signature_error_msg(
-                                func.name(),
-                                func.signature().clone(),
-                                &arg_types,
-                            )
-                        )
-                    })?
-                    .into_iter()
-                    .collect::<Vec<_>>();
-
+                match fun {
+                    WindowFunctionDefinition::AggregateUDF(udaf) => {
+                        let new_fields =
+                            verify_function_arguments(udaf.as_ref(), &fields)?;
+                        let return_field = udaf.return_field(&new_fields)?;
+                        Ok(return_field)
+                    }
+                    WindowFunctionDefinition::WindowUDF(udwf) => {
+                        let new_fields =
+                            verify_function_arguments(udwf.as_ref(), &fields)?;
+                        let return_field = udwf
+                            .field(WindowUDFFieldArgs::new(&new_fields, &schema_name))?;
+                        Ok(return_field)
+                    }
+                }
+            }
+            Expr::AggregateFunction(AggregateFunction {
+                func,
+                params: AggregateFunctionParams { args, .. },
+            }) => {
+                let fields = args
+                    .iter()
+                    .map(|e| e.to_field(schema).map(|(_, f)| f))
+                    .collect::<Result<Vec<_>>>()?;
+                let new_fields = verify_function_arguments(func.as_ref(), &fields)?;
                 func.return_field(&new_fields)
             }
             Expr::ScalarFunction(ScalarFunction { func, args }) => {
-                let (arg_types, fields): (Vec<DataType>, Vec<Arc<Field>>) = args
+                let fields = args
                     .iter()
                     .map(|e| e.to_field(schema).map(|(_, f)| f))
-                    .collect::<Result<Vec<_>>>()?
-                    .into_iter()
-                    .map(|f| (f.data_type().clone(), f))
-                    .unzip();
-                // Verify that function is invoked with correct number and type of arguments as defined in `TypeSignature`
-                let new_data_types = data_types_with_scalar_udf(&arg_types, func)
-                    .map_err(|err| {
-                        plan_datafusion_err!(
-                            "{} {}",
-                            match err {
-                                DataFusionError::Plan(msg) => msg,
-                                err => err.to_string(),
-                            },
-                            utils::generate_signature_error_msg(
-                                func.name(),
-                                func.signature().clone(),
-                                &arg_types,
-                            )
-                        )
-                    })?;
-                let new_fields = fields
-                    .into_iter()
-                    .zip(new_data_types)
-                    .map(|(f, d)| f.as_ref().clone().with_data_type(d))
-                    .map(Arc::new)
-                    .collect::<Vec<FieldRef>>();
+                    .collect::<Result<Vec<_>>>()?;
+                let new_fields = verify_function_arguments(func.as_ref(), &fields)?;
 
                 let arguments = args
                     .iter()
                     .map(|e| match e {
-                        Expr::Literal(sv) => Some(sv),
+                        Expr::Literal(sv, _) => Some(sv),
                         _ => None,
                     })
                     .collect::<Vec<_>>();
@@ -545,10 +553,27 @@ impl ExprSchemable for Expr {
                 func.return_field_from_args(args)
             }
             // _ => Ok((self.get_type(schema)?, self.nullable(schema)?)),
-            Expr::Cast(Cast { expr, data_type }) => expr
+            Expr::Cast(Cast { expr, field }) => expr
                 .to_field(schema)
-                .map(|(_, f)| f.as_ref().clone().with_data_type(data_type.clone()))
+                .map(|(_table_ref, destination_field)| {
+                    // This propagates the nullability of the input rather than
+                    // force the nullability of the destination field. This is
+                    // usually the desired behaviour (i.e., specifying a cast
+                    // destination type usually does not force a user to pick
+                    // nullability, and assuming `true` would prevent the non-nullability
+                    // of the parent expression to make the result eligible for
+                    // optimizations that only apply to non-nullable values).
+                    destination_field
+                        .as_ref()
+                        .clone()
+                        .with_data_type(field.data_type().clone())
+                        .with_metadata(destination_field.metadata().clone())
+                })
                 .map(Arc::new),
+            Expr::Placeholder(Placeholder {
+                id: _,
+                field: Some(field),
+            }) => Ok(Arc::clone(field).renamed(&schema_name)),
             Expr::Like(_)
             | Expr::SimilarTo(_)
             | Expr::Not(_)
@@ -557,6 +582,7 @@ impl ExprSchemable for Expr {
             | Expr::TryCast(_)
             | Expr::InList(_)
             | Expr::InSubquery(_)
+            | Expr::SetComparison(_)
             | Expr::Wildcard { .. }
             | Expr::GroupingSet(_)
             | Expr::Placeholder(_)
@@ -569,7 +595,8 @@ impl ExprSchemable for Expr {
 
         Ok((
             relation,
-            Arc::new(field.as_ref().clone().with_name(schema_name)),
+            // todo avoid this rename / use the name above
+            field.renamed(&schema_name),
         ))
     }
 
@@ -589,7 +616,16 @@ impl ExprSchemable for Expr {
         // like all of the binary expressions below. Perhaps Expr should track the
         // type of the expression?
 
-        if can_cast_types(&this_type, cast_to_type) {
+        // Special handling for struct-to-struct casts with name-based field matching
+        let can_cast = match (&this_type, cast_to_type) {
+            (DataType::Struct(_), DataType::Struct(_)) => {
+                // Always allow struct-to-struct casts; field matching happens at runtime
+                true
+            }
+            _ => can_cast_types(&this_type, cast_to_type),
+        };
+
+        if can_cast {
             match self {
                 Expr::ScalarSubquery(subquery) => {
                     Ok(Expr::ScalarSubquery(cast_subquery(subquery, cast_to_type)?))
@@ -597,95 +633,45 @@ impl ExprSchemable for Expr {
                 _ => Ok(Expr::Cast(Cast::new(Box::new(self), cast_to_type.clone()))),
             }
         } else {
-            plan_err!("Cannot automatically convert {this_type:?} to {cast_to_type:?}")
+            plan_err!("Cannot automatically convert {this_type} to {cast_to_type}")
         }
     }
 }
 
-impl Expr {
-    /// Common method for window functions that applies type coercion
-    /// to all arguments of the window function to check if it matches
-    /// its signature.
-    ///
-    /// If successful, this method returns the data type and
-    /// nullability of the window function's result.
-    ///
-    /// Otherwise, returns an error if there's a type mismatch between
-    /// the window function's signature and the provided arguments.
-    fn data_type_and_nullable_with_window_function(
-        &self,
-        schema: &dyn ExprSchema,
-        window_function: &WindowFunction,
-    ) -> Result<(DataType, bool)> {
-        let WindowFunction {
-            fun,
-            params: WindowFunctionParams { args, .. },
-            ..
-        } = window_function;
-
-        let fields = args
+/// Verify that function is invoked with correct number and type of arguments as
+/// defined in `TypeSignature`.
+fn verify_function_arguments<F: UDFCoercionExt>(
+    function: &F,
+    input_fields: &[FieldRef],
+) -> Result<Vec<FieldRef>> {
+    fields_with_udf(input_fields, function).map_err(|err| {
+        let data_types = input_fields
             .iter()
-            .map(|e| e.to_field(schema).map(|(_, f)| f))
-            .collect::<Result<Vec<_>>>()?;
-        match fun {
-            WindowFunctionDefinition::AggregateUDF(udaf) => {
-                let data_types = fields
-                    .iter()
-                    .map(|f| f.data_type())
-                    .cloned()
-                    .collect::<Vec<_>>();
-                let new_fields = fields_with_aggregate_udf(&fields, udaf)
-                    .map_err(|err| {
-                        plan_datafusion_err!(
-                            "{} {}",
-                            match err {
-                                DataFusionError::Plan(msg) => msg,
-                                err => err.to_string(),
-                            },
-                            utils::generate_signature_error_msg(
-                                fun.name(),
-                                fun.signature(),
-                                &data_types
-                            )
-                        )
-                    })?
-                    .into_iter()
-                    .collect::<Vec<_>>();
-
-                let return_field = udaf.return_field(&new_fields)?;
-
-                Ok((return_field.data_type().clone(), return_field.is_nullable()))
-            }
-            WindowFunctionDefinition::WindowUDF(udwf) => {
-                let data_types = fields
-                    .iter()
-                    .map(|f| f.data_type())
-                    .cloned()
-                    .collect::<Vec<_>>();
-                let new_fields = fields_with_window_udf(&fields, udwf)
-                    .map_err(|err| {
-                        plan_datafusion_err!(
-                            "{} {}",
-                            match err {
-                                DataFusionError::Plan(msg) => msg,
-                                err => err.to_string(),
-                            },
-                            utils::generate_signature_error_msg(
-                                fun.name(),
-                                fun.signature(),
-                                &data_types
-                            )
-                        )
-                    })?
-                    .into_iter()
-                    .collect::<Vec<_>>();
-                let (_, function_name) = self.qualified_name();
-                let field_args = WindowUDFFieldArgs::new(&new_fields, &function_name);
+            .map(|f| f.data_type())
+            .cloned()
+            .collect::<Vec<_>>();
+        plan_datafusion_err!(
+            "{}. {}",
+            match err {
+                DataFusionError::Plan(msg) => msg,
+                err => err.to_string(),
+            },
+            utils::generate_signature_error_message(
+                function.name(),
+                function.signature(),
+                &data_types
+            )
+        )
+    })
+}
 
-                udwf.field(field_args)
-                    .map(|field| (field.data_type().clone(), field.is_nullable()))
-            }
-        }
+/// Returns the innermost [Expr] that is provably null if `expr` is null.
+fn unwrap_certainly_null_expr(expr: &Expr) -> &Expr {
+    match expr {
+        Expr::Not(e) => unwrap_certainly_null_expr(e),
+        Expr::Negative(e) => unwrap_certainly_null_expr(e),
+        Expr::Cast(e) => unwrap_certainly_null_expr(e.expr.as_ref()),
+        _ => expr,
     }
 }
 
@@ -696,7 +682,6 @@ impl Expr {
 ///    new projection with the casted expression.
 /// 2. **Non-projection plan**: If the subquery isn't a projection, it adds a projection to the plan
 ///    with the casted first column.
-///
 pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result<Subquery> {
     if subquery.subquery.schema().field(0).data_type() == cast_to_type {
         return Ok(subquery);
@@ -731,10 +716,13 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result<Subq
 
 #[cfg(test)]
 mod tests {
+    use std::collections::HashMap;
+
     use super::*;
-    use crate::{col, lit};
+    use crate::{and, col, lit, not, or, out_ref_col_with_metadata, when};
 
-    use datafusion_common::{internal_err, DFSchema, ScalarValue};
+    use arrow::datatypes::FieldRef;
+    use datafusion_common::{DFSchema, ScalarValue, assert_or_internal_err};
 
     macro_rules! test_is_expr_nullable {
         ($EXPR_TYPE:ident) => {{
@@ -747,9 +735,10 @@ mod tests {
     fn expr_schema_nullability() {
         let expr = col("foo").eq(lit(1));
         assert!(!expr.nullable(&MockExprSchema::new()).unwrap());
-        assert!(expr
-            .nullable(&MockExprSchema::new().with_nullable(true))
-            .unwrap());
+        assert!(
+            expr.nullable(&MockExprSchema::new().with_nullable(true))
+                .unwrap()
+        );
 
         test_is_expr_nullable!(is_null);
         test_is_expr_nullable!(is_not_null);
@@ -785,6 +774,137 @@ mod tests {
         assert!(expr.nullable(&get_schema(false)).unwrap());
     }
 
+    fn assert_nullability(expr: &Expr, schema: &dyn ExprSchema, expected: bool) {
+        assert_eq!(
+            expr.nullable(schema).unwrap(),
+            expected,
+            "Nullability of '{expr}' should be {expected}"
+        );
+    }
+
+    fn assert_not_nullable(expr: &Expr, schema: &dyn ExprSchema) {
+        assert_nullability(expr, schema, false);
+    }
+
+    fn assert_nullable(expr: &Expr, schema: &dyn ExprSchema) {
+        assert_nullability(expr, schema, true);
+    }
+
+    #[test]
+    fn test_case_expression_nullability() -> Result<()> {
+        let nullable_schema = MockExprSchema::new()
+            .with_data_type(DataType::Int32)
+            .with_nullable(true);
+
+        let not_nullable_schema = MockExprSchema::new()
+            .with_data_type(DataType::Int32)
+            .with_nullable(false);
+
+        // CASE WHEN x IS NOT NULL THEN x ELSE 0
+        let e = when(col("x").is_not_null(), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN NOT x IS NULL THEN x ELSE 0
+        let e = when(not(col("x").is_null()), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN X = 5 THEN x ELSE 0
+        let e = when(col("x").eq(lit(5)), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS NOT NULL AND x = 5 THEN x ELSE 0
+        let e = when(and(col("x").is_not_null(), col("x").eq(lit(5))), col("x"))
+            .otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x = 5 AND x IS NOT NULL THEN x ELSE 0
+        let e = when(and(col("x").eq(lit(5)), col("x").is_not_null()), col("x"))
+            .otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS NOT NULL OR x = 5 THEN x ELSE 0
+        let e = when(or(col("x").is_not_null(), col("x").eq(lit(5))), col("x"))
+            .otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x = 5 OR x IS NOT NULL THEN x ELSE 0
+        let e = when(or(col("x").eq(lit(5)), col("x").is_not_null()), col("x"))
+            .otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN (x = 5 AND x IS NOT NULL) OR (x = bar AND x IS NOT NULL) THEN x ELSE 0
+        let e = when(
+            or(
+                and(col("x").eq(lit(5)), col("x").is_not_null()),
+                and(col("x").eq(col("bar")), col("x").is_not_null()),
+            ),
+            col("x"),
+        )
+        .otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x = 5 OR x IS NULL THEN x ELSE 0
+        let e = when(or(col("x").eq(lit(5)), col("x").is_null()), col("x"))
+            .otherwise(lit(0))?;
+        assert_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS TRUE THEN x ELSE 0
+        let e = when(col("x").is_true(), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS NOT TRUE THEN x ELSE 0
+        let e = when(col("x").is_not_true(), col("x")).otherwise(lit(0))?;
+        assert_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS FALSE THEN x ELSE 0
+        let e = when(col("x").is_false(), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS NOT FALSE THEN x ELSE 0
+        let e = when(col("x").is_not_false(), col("x")).otherwise(lit(0))?;
+        assert_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS UNKNOWN THEN x ELSE 0
+        let e = when(col("x").is_unknown(), col("x")).otherwise(lit(0))?;
+        assert_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x IS NOT UNKNOWN THEN x ELSE 0
+        let e = when(col("x").is_not_unknown(), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN x LIKE 'x' THEN x ELSE 0
+        let e = when(col("x").like(lit("x")), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN 0 THEN x ELSE 0
+        let e = when(lit(0), col("x")).otherwise(lit(0))?;
+        assert_not_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        // CASE WHEN 1 THEN x ELSE 0
+        let e = when(lit(1), col("x")).otherwise(lit(0))?;
+        assert_nullable(&e, &nullable_schema);
+        assert_not_nullable(&e, &not_nullable_schema);
+
+        Ok(())
+    }
+
     #[test]
     fn test_inlist_nullability() {
         let get_schema = |nullable| {
@@ -797,9 +917,10 @@ mod tests {
         assert!(!expr.nullable(&get_schema(false)).unwrap());
         assert!(expr.nullable(&get_schema(true)).unwrap());
         // Testing nullable() returns an error.
-        assert!(expr
-            .nullable(&get_schema(false).with_error_on_nullable(true))
-            .is_err());
+        assert!(
+            expr.nullable(&get_schema(false).with_error_on_nullable(true))
+                .is_err()
+        );
 
         let null = lit(ScalarValue::Int32(None));
         let expr = col("foo").in_list(vec![null, lit(1)], false);
@@ -840,6 +961,7 @@ mod tests {
     fn test_expr_metadata() {
         let mut meta = HashMap::new();
         meta.insert("bar".to_string(), "buzz".to_string());
+        let meta = FieldMetadata::from(meta);
         let expr = col("foo");
         let schema = MockExprSchema::new()
             .with_data_type(DataType::Int32)
@@ -858,37 +980,95 @@ mod tests {
         );
 
         let schema = DFSchema::from_unqualified_fields(
-            vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())]
-                .into(),
+            vec![meta.add_to_field(Field::new("foo", DataType::Int32, true))].into(),
             HashMap::new(),
         )
         .unwrap();
 
         // verify to_field method populates metadata
-        assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata());
+        assert_eq!(meta, expr.metadata(&schema).unwrap());
+
+        // outer ref constructed by `out_ref_col_with_metadata` should be metadata-preserving
+        let outer_ref = out_ref_col_with_metadata(
+            DataType::Int32,
+            meta.to_hashmap(),
+            Column::from_name("foo"),
+        );
+        assert_eq!(meta, outer_ref.metadata(&schema).unwrap());
+    }
+
+    #[test]
+    fn test_expr_placeholder() {
+        let schema = MockExprSchema::new();
+
+        let mut placeholder_meta = HashMap::new();
+        placeholder_meta.insert("bar".to_string(), "buzz".to_string());
+        let placeholder_meta = FieldMetadata::from(placeholder_meta);
+
+        let expr = Expr::Placeholder(Placeholder::new_with_field(
+            "".to_string(),
+            Some(
+                Field::new("", DataType::Utf8, true)
+                    .with_metadata(placeholder_meta.to_hashmap())
+                    .into(),
+            ),
+        ));
+
+        let field = expr.to_field(&schema).unwrap().1;
+        assert_eq!(
+            (field.data_type(), field.is_nullable()),
+            (&DataType::Utf8, true)
+        );
+        assert_eq!(placeholder_meta, expr.metadata(&schema).unwrap());
+
+        let expr_alias = expr.alias("a placeholder by any other name");
+        let expr_alias_field = expr_alias.to_field(&schema).unwrap().1;
+        assert_eq!(
+            (expr_alias_field.data_type(), expr_alias_field.is_nullable()),
+            (&DataType::Utf8, true)
+        );
+        assert_eq!(placeholder_meta, expr_alias.metadata(&schema).unwrap());
+
+        // Non-nullable placeholder field should remain non-nullable
+        let expr = Expr::Placeholder(Placeholder::new_with_field(
+            "".to_string(),
+            Some(Field::new("", DataType::Utf8, false).into()),
+        ));
+        let expr_field = expr.to_field(&schema).unwrap().1;
+        assert_eq!(
+            (expr_field.data_type(), expr_field.is_nullable()),
+            (&DataType::Utf8, false)
+        );
+
+        let expr_alias = expr.alias("a placeholder by any other name");
+        let expr_alias_field = expr_alias.to_field(&schema).unwrap().1;
+        assert_eq!(
+            (expr_alias_field.data_type(), expr_alias_field.is_nullable()),
+            (&DataType::Utf8, false)
+        );
     }
 
     #[derive(Debug)]
     struct MockExprSchema {
-        field: Field,
+        field: FieldRef,
         error_on_nullable: bool,
     }
 
     impl MockExprSchema {
         fn new() -> Self {
             Self {
-                field: Field::new("mock_field", DataType::Null, false),
+                field: Arc::new(Field::new("mock_field", DataType::Null, false)),
                 error_on_nullable: false,
             }
         }
 
         fn with_nullable(mut self, nullable: bool) -> Self {
-            self.field = self.field.with_nullable(nullable);
+            Arc::make_mut(&mut self.field).set_nullable(nullable);
             self
         }
 
         fn with_data_type(mut self, data_type: DataType) -> Self {
-            self.field = self.field.with_data_type(data_type);
+            Arc::make_mut(&mut self.field).set_data_type(data_type);
             self
         }
 
@@ -897,23 +1077,38 @@ mod tests {
             self
         }
 
-        fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
-            self.field = self.field.with_metadata(metadata);
+        fn with_metadata(mut self, metadata: FieldMetadata) -> Self {
+            self.field =
+                Arc::new(metadata.add_to_field(Arc::unwrap_or_clone(self.field)));
             self
         }
     }
 
     impl ExprSchema for MockExprSchema {
         fn nullable(&self, _col: &Column) -> Result<bool> {
-            if self.error_on_nullable {
-                internal_err!("nullable error")
-            } else {
-                Ok(self.field.is_nullable())
-            }
+            assert_or_internal_err!(!self.error_on_nullable, "nullable error");
+            Ok(self.field.is_nullable())
         }
 
-        fn field_from_column(&self, _col: &Column) -> Result<&Field> {
+        fn field_from_column(&self, _col: &Column) -> Result<&FieldRef> {
             Ok(&self.field)
         }
     }
+
+    #[test]
+    fn test_scalar_variable() {
+        let mut meta = HashMap::new();
+        meta.insert("bar".to_string(), "buzz".to_string());
+        let meta = FieldMetadata::from(meta);
+
+        let field = Field::new("foo", DataType::Int32, true);
+        let field = meta.add_to_field(field);
+        let field = Arc::new(field);
+
+        let expr = Expr::ScalarVariable(field, vec!["foo".to_string()]);
+
+        let schema = MockExprSchema::new();
+
+        assert_eq!(meta, expr.metadata(&schema).unwrap());
+    }
 }
diff --git a/datafusion/expr/src/function.rs b/datafusion/expr/src/function.rs
index e0235d32292fa..68865cbe1ca54 100644
--- a/datafusion/expr/src/function.rs
+++ b/datafusion/expr/src/function.rs
@@ -27,6 +27,8 @@ pub use datafusion_functions_aggregate_common::accumulator::{
     AccumulatorArgs, AccumulatorFactoryFunction, StateFieldsArgs,
 };
 
+use crate::expr::{AggregateFunction, WindowFunction};
+use crate::simplify::SimplifyContext;
 pub use datafusion_functions_window_common::expr::ExpressionArgs;
 pub use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 pub use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
@@ -64,28 +66,22 @@ pub type PartitionEvaluatorFactory =
 pub type StateTypeFunction =
     Arc<dyn Fn(&DataType) -> Result<Arc<Vec<DataType>>> + Send + Sync>;
 
-/// [crate::udaf::AggregateUDFImpl::simplify] simplifier closure
-/// A closure with two arguments:
-/// * 'aggregate_function': [crate::expr::AggregateFunction] for which simplified has been invoked
-/// * 'info': [crate::simplify::SimplifyInfo]
+/// Type alias for [crate::udaf::AggregateUDFImpl::simplify].
 ///
-/// Closure returns simplified [Expr] or an error.
-pub type AggregateFunctionSimplification = Box<
-    dyn Fn(
-        crate::expr::AggregateFunction,
-        &dyn crate::simplify::SimplifyInfo,
-    ) -> Result<Expr>,
->;
+/// This closure is invoked with:
+/// * `aggregate_function`: [AggregateFunction] with already simplified arguments
+/// * `info`: [SimplifyContext]
+///
+/// It returns a simplified [Expr] or an error.
+pub type AggregateFunctionSimplification =
+    Box<dyn Fn(AggregateFunction, &SimplifyContext) -> Result<Expr>>;
 
-/// [crate::udwf::WindowUDFImpl::simplify] simplifier closure
-/// A closure with two arguments:
-/// * 'window_function': [crate::expr::WindowFunction] for which simplified has been invoked
-/// * 'info': [crate::simplify::SimplifyInfo]
+/// Type alias for [crate::udwf::WindowUDFImpl::simplify].
+///
+/// This closure is invoked with:
+/// * `window_function`: [WindowFunction] with already simplified arguments
+/// * `info`: [SimplifyContext]
 ///
-/// Closure returns simplified [Expr] or an error.
-pub type WindowFunctionSimplification = Box<
-    dyn Fn(
-        crate::expr::WindowFunction,
-        &dyn crate::simplify::SimplifyInfo,
-    ) -> Result<Expr>,
->;
+/// It returns a simplified [Expr] or an error.
+pub type WindowFunctionSimplification =
+    Box<dyn Fn(WindowFunction, &SimplifyContext) -> Result<Expr>>;
diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs
index 48931d6525af5..cb136229bf88d 100644
--- a/datafusion/expr/src/lib.rs
+++ b/datafusion/expr/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! [DataFusion](https://github.com/apache/datafusion)
 //! is an extensible query execution framework that uses
@@ -34,6 +35,8 @@
 //!
 //! The [expr_fn] module contains functions for creating expressions.
 
+extern crate core;
+
 mod literal;
 mod operation;
 mod partition_evaluator;
@@ -42,6 +45,7 @@ mod udaf;
 mod udf;
 mod udwf;
 
+pub mod arguments;
 pub mod conditional_expressions;
 pub mod execution_props;
 pub mod expr;
@@ -57,31 +61,44 @@ pub mod interval_arithmetic {
     pub use datafusion_expr_common::interval_arithmetic::*;
 }
 pub mod logical_plan;
+pub mod dml {
+    //! DML (Data Manipulation Language) types for DELETE, UPDATE operations.
+    pub use crate::logical_plan::dml::*;
+}
 pub mod planner;
 pub mod registry;
 pub mod simplify;
 pub mod sort_properties {
     pub use datafusion_expr_common::sort_properties::*;
 }
+pub mod async_udf;
 pub mod statistics {
     pub use datafusion_expr_common::statistics::*;
 }
+mod predicate_bounds;
+pub mod preimage;
+pub mod ptr_eq;
 pub mod test;
 pub mod tree_node;
 pub mod type_coercion;
+pub mod udf_eq;
 pub mod utils;
 pub mod var_provider;
 pub mod window_frame;
 pub mod window_state;
 
-pub use datafusion_doc::{DocSection, Documentation, DocumentationBuilder};
+pub use datafusion_doc::{
+    DocSection, Documentation, DocumentationBuilder, aggregate_doc_sections,
+    scalar_doc_sections, window_doc_sections,
+};
 pub use datafusion_expr_common::accumulator::Accumulator;
 pub use datafusion_expr_common::columnar_value::ColumnarValue;
 pub use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
 pub use datafusion_expr_common::operator::Operator;
+pub use datafusion_expr_common::placement::ExpressionPlacement;
 pub use datafusion_expr_common::signature::{
-    ArrayFunctionArgument, ArrayFunctionSignature, Coercion, Signature, TypeSignature,
-    TypeSignatureClass, Volatility, TIMEZONE_WILDCARD,
+    ArrayFunctionArgument, ArrayFunctionSignature, Coercion, Signature,
+    TIMEZONE_WILDCARD, TypeSignature, TypeSignatureClass, Volatility,
 };
 pub use datafusion_expr_common::type_coercion::binary;
 pub use expr::{
@@ -94,19 +111,22 @@ pub use function::{
     AccumulatorFactoryFunction, PartitionEvaluatorFactory, ReturnTypeFunction,
     ScalarFunctionImplementation, StateTypeFunction,
 };
-pub use literal::{lit, lit_timestamp_nano, Literal, TimestampLiteral};
+pub use literal::{
+    Literal, TimestampLiteral, lit, lit_timestamp_nano, lit_with_metadata,
+};
 pub use logical_plan::*;
 pub use partition_evaluator::PartitionEvaluator;
+#[cfg(feature = "sql")]
 pub use sqlparser;
 pub use table_source::{TableProviderFilterPushDown, TableSource, TableType};
 pub use udaf::{
-    aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF,
-    SetMonotonicity, StatisticsArgs,
-};
-pub use udf::{
-    scalar_doc_sections, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
+    AggregateUDF, AggregateUDFImpl, ReversedUDAF, SetMonotonicity, StatisticsArgs,
+    udaf_default_display_name, udaf_default_human_display, udaf_default_return_field,
+    udaf_default_schema_name, udaf_default_window_function_display_name,
+    udaf_default_window_function_schema_name,
 };
-pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl};
+pub use udf::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl};
+pub use udwf::{LimitEffect, ReversedUDWF, WindowUDF, WindowUDFImpl};
 pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits};
 
 #[cfg(test)]
diff --git a/datafusion/expr/src/literal.rs b/datafusion/expr/src/literal.rs
index 90ba5a9a693c7..2e2980d607648 100644
--- a/datafusion/expr/src/literal.rs
+++ b/datafusion/expr/src/literal.rs
@@ -18,14 +18,36 @@
 //! Literal module contains foundational types that are used to represent literals in DataFusion.
 
 use crate::Expr;
-use datafusion_common::ScalarValue;
+use datafusion_common::{ScalarValue, metadata::FieldMetadata};
 
 /// Create a literal expression
+#[expect(clippy::needless_pass_by_value)]
 pub fn lit<T: Literal>(n: T) -> Expr {
     n.lit()
 }
 
+#[expect(clippy::needless_pass_by_value)]
+pub fn lit_with_metadata<T: Literal>(n: T, metadata: Option<FieldMetadata>) -> Expr {
+    let Some(metadata) = metadata else {
+        return n.lit();
+    };
+
+    let Expr::Literal(sv, prior_metadata) = n.lit() else {
+        unreachable!();
+    };
+    let new_metadata = match prior_metadata {
+        Some(mut prior) => {
+            prior.extend(metadata);
+            prior
+        }
+        None => metadata,
+    };
+
+    Expr::Literal(sv, Some(new_metadata))
+}
+
 /// Create a literal timestamp expression
+#[expect(clippy::needless_pass_by_value)]
 pub fn lit_timestamp_nano<T: TimestampLiteral>(n: T) -> Expr {
     n.lit_timestamp_nano()
 }
@@ -43,37 +65,37 @@ pub trait TimestampLiteral {
 
 impl Literal for &str {
     fn lit(&self) -> Expr {
-        Expr::Literal(ScalarValue::from(*self))
+        Expr::Literal(ScalarValue::from(*self), None)
     }
 }
 
 impl Literal for String {
     fn lit(&self) -> Expr {
-        Expr::Literal(ScalarValue::from(self.as_ref()))
+        Expr::Literal(ScalarValue::from(self.as_ref()), None)
     }
 }
 
 impl Literal for &String {
     fn lit(&self) -> Expr {
-        Expr::Literal(ScalarValue::from(self.as_ref()))
+        Expr::Literal(ScalarValue::from(self.as_ref()), None)
     }
 }
 
 impl Literal for Vec<u8> {
     fn lit(&self) -> Expr {
-        Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())))
+        Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())), None)
     }
 }
 
 impl Literal for &[u8] {
     fn lit(&self) -> Expr {
-        Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())))
+        Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())), None)
     }
 }
 
 impl Literal for ScalarValue {
     fn lit(&self) -> Expr {
-        Expr::Literal(self.clone())
+        Expr::Literal(self.clone(), None)
     }
 }
 
@@ -82,7 +104,7 @@ macro_rules! make_literal {
         #[doc = $DOC]
         impl Literal for $TYPE {
             fn lit(&self) -> Expr {
-                Expr::Literal(ScalarValue::$SCALAR(Some(self.clone())))
+                Expr::Literal(ScalarValue::$SCALAR(Some(self.clone())), None)
             }
         }
     };
@@ -93,7 +115,7 @@ macro_rules! make_nonzero_literal {
         #[doc = $DOC]
         impl Literal for $TYPE {
             fn lit(&self) -> Expr {
-                Expr::Literal(ScalarValue::$SCALAR(Some(self.get())))
+                Expr::Literal(ScalarValue::$SCALAR(Some(self.get())), None)
             }
         }
     };
@@ -104,10 +126,10 @@ macro_rules! make_timestamp_literal {
         #[doc = $DOC]
         impl TimestampLiteral for $TYPE {
             fn lit_timestamp_nano(&self) -> Expr {
-                Expr::Literal(ScalarValue::TimestampNanosecond(
-                    Some((self.clone()).into()),
+                Expr::Literal(
+                    ScalarValue::TimestampNanosecond(Some((self.clone()).into()), None),
                     None,
-                ))
+                )
             }
         }
     };
diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index fabdfecef104c..2e23fef1da768 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -18,6 +18,7 @@
 //! This module provides a builder for creating LogicalPlans
 
 use std::any::Any;
+use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::{HashMap, HashSet};
 use std::iter::once;
@@ -43,20 +44,21 @@ use crate::utils::{
     group_window_expr_by_sort_keys,
 };
 use crate::{
-    and, binary_expr, lit, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery,
-    Statement, TableProviderFilterPushDown, TableSource, WriteOp,
+    DmlStatement, ExplainOption, Expr, ExprSchemable, Operator, RecursiveQuery,
+    Statement, TableProviderFilterPushDown, TableSource, WriteOp, and, binary_expr, lit,
 };
 
 use super::dml::InsertOp;
-use super::plan::{ColumnUnnestList, ExplainFormat};
 use arrow::compute::can_cast_types;
-use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef};
 use datafusion_common::display::ToStringifiedPlan;
 use datafusion_common::file_options::file_type::FileType;
+use datafusion_common::metadata::FieldMetadata;
 use datafusion_common::{
-    exec_err, get_target_functional_dependencies, internal_err, not_impl_err,
-    plan_datafusion_err, plan_err, Column, Constraints, DFSchema, DFSchemaRef,
-    DataFusionError, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions,
+    Column, Constraints, DFSchema, DFSchemaRef, NullEquality, Result, ScalarValue,
+    TableReference, ToDFSchema, UnnestOptions, exec_err,
+    get_target_functional_dependencies, internal_datafusion_err, plan_datafusion_err,
+    plan_err,
 };
 use datafusion_expr_common::type_coercion::binary::type_union_resolution;
 
@@ -178,19 +180,14 @@ impl LogicalPlanBuilder {
         recursive_term: LogicalPlan,
         is_distinct: bool,
     ) -> Result<Self> {
-        // TODO: we need to do a bunch of validation here. Maybe more.
-        if is_distinct {
-            return not_impl_err!(
-                "Recursive queries with a distinct 'UNION' (in which the previous iteration's results will be de-duplicated) is not supported"
-            );
-        }
         // Ensure that the static term and the recursive term have the same number of fields
         let static_fields_len = self.plan.schema().fields().len();
         let recursive_fields_len = recursive_term.schema().fields().len();
         if static_fields_len != recursive_fields_len {
             return plan_err!(
                 "Non-recursive term and recursive term must have the same number of columns ({} != {})",
-                static_fields_len, recursive_fields_len
+                static_fields_len,
+                recursive_fields_len
             );
         }
         // Ensure that the recursive term has the same field types as the static term
@@ -282,15 +279,14 @@ impl LogicalPlanBuilder {
                 let value = &row[j];
                 let data_type = value.get_type(schema)?;
 
-                if !data_type.equals_datatype(field_type) {
-                    if can_cast_types(&data_type, field_type) {
-                    } else {
-                        return exec_err!(
-                            "type mismatch and can't cast to got {} and {}",
-                            data_type,
-                            field_type
-                        );
-                    }
+                if !data_type.equals_datatype(field_type)
+                    && !can_cast_types(&data_type, field_type)
+                {
+                    return exec_err!(
+                        "type mismatch and can't cast to got {} and {}",
+                        data_type,
+                        field_type
+                    );
                 }
             }
             fields.push(field_type.to_owned(), field_nullable);
@@ -306,8 +302,21 @@ impl LogicalPlanBuilder {
 
         for j in 0..n_cols {
             let mut common_type: Option<DataType> = None;
+            let mut common_metadata: Option<FieldMetadata> = None;
             for (i, row) in values.iter().enumerate() {
                 let value = &row[j];
+                let metadata = value.metadata(&schema)?;
+                if let Some(ref cm) = common_metadata {
+                    if &metadata != cm {
+                        return plan_err!(
+                            "Inconsistent metadata across values list at row {i} column {j}. Was {:?} but found {:?}",
+                            cm,
+                            metadata
+                        );
+                    }
+                } else {
+                    common_metadata = Some(metadata.clone());
+                }
                 let data_type = value.get_type(&schema)?;
                 if data_type == DataType::Null {
                     continue;
@@ -317,7 +326,9 @@ impl LogicalPlanBuilder {
                     // get common type of each column values.
                     let data_types = vec![prev_type.clone(), data_type.clone()];
                     let Some(new_type) = type_union_resolution(&data_types) else {
-                        return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}");
+                        return plan_err!(
+                            "Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}"
+                        );
                     };
                     common_type = Some(new_type);
                 } else {
@@ -326,7 +337,11 @@ impl LogicalPlanBuilder {
             }
             // assuming common_type was not set, and no error, therefore the type should be NULL
             // since the code loop skips NULL
-            fields.push(common_type.unwrap_or(DataType::Null), true);
+            fields.push_with_metadata(
+                common_type.unwrap_or(DataType::Null),
+                true,
+                common_metadata,
+            );
         }
 
         Self::infer_inner(values, fields, &schema)
@@ -341,8 +356,11 @@ impl LogicalPlanBuilder {
         // wrap cast if data type is not same as common type.
         for row in &mut values {
             for (j, field_type) in fields.iter().map(|f| f.data_type()).enumerate() {
-                if let Expr::Literal(ScalarValue::Null) = row[j] {
-                    row[j] = Expr::Literal(ScalarValue::try_from(field_type)?);
+                if let Expr::Literal(ScalarValue::Null, metadata) = &row[j] {
+                    row[j] = Expr::Literal(
+                        ScalarValue::try_from(field_type)?,
+                        metadata.clone(),
+                    );
                 } else {
                     row[j] = std::mem::take(&mut row[j]).cast_to(field_type, schema)?;
                 }
@@ -403,13 +421,13 @@ impl LogicalPlanBuilder {
         options: HashMap<String, String>,
         partition_by: Vec<String>,
     ) -> Result<Self> {
-        Ok(Self::new(LogicalPlan::Copy(CopyTo {
-            input: Arc::new(input),
+        Ok(Self::new(LogicalPlan::Copy(CopyTo::new(
+            Arc::new(input),
             output_url,
             partition_by,
             file_type,
             options,
-        })))
+        ))))
     }
 
     /// Create a [`DmlStatement`] for inserting the contents of this builder into the named table.
@@ -434,14 +452,13 @@ impl LogicalPlanBuilder {
     /// # ])) as _;
     /// # let table_source = Arc::new(LogicalTableSource::new(employee_schema));
     /// // VALUES (1), (2)
-    /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])?
-    ///   .build()?;
+    /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])?.build()?;
     /// // INSERT INTO MyTable VALUES (1), (2)
     /// let insert_plan = LogicalPlanBuilder::insert_into(
-    ///   input,
-    ///   "MyTable",
-    ///   table_source,
-    ///   InsertOp::Append,
+    ///     input,
+    ///     "MyTable",
+    ///     table_source,
+    ///     InsertOp::Append,
     /// )?;
     /// # Ok(())
     /// # }
@@ -498,29 +515,27 @@ impl LogicalPlanBuilder {
             TableScan::try_new(table_name, table_source, projection, filters, fetch)?;
 
         // Inline TableScan
-        if table_scan.filters.is_empty() {
-            if let Some(p) = table_scan.source.get_logical_plan() {
-                let sub_plan = p.into_owned();
-
-                if let Some(proj) = table_scan.projection {
-                    let projection_exprs = proj
-                        .into_iter()
-                        .map(|i| {
-                            Expr::Column(Column::from(
-                                sub_plan.schema().qualified_field(i),
-                            ))
-                        })
-                        .collect::<Vec<_>>();
-                    return Self::new(sub_plan)
-                        .project(projection_exprs)?
-                        .alias(table_scan.table_name);
-                }
+        if table_scan.filters.is_empty()
+            && let Some(p) = table_scan.source.get_logical_plan()
+        {
+            let sub_plan = p.into_owned();
 
-                // Ensures that the reference to the inlined table remains the
-                // same, meaning we don't have to change any of the parent nodes
-                // that reference this table.
-                return Self::new(sub_plan).alias(table_scan.table_name);
+            if let Some(proj) = table_scan.projection {
+                let projection_exprs = proj
+                    .into_iter()
+                    .map(|i| {
+                        Expr::Column(Column::from(sub_plan.schema().qualified_field(i)))
+                    })
+                    .collect::<Vec<_>>();
+                return Self::new(sub_plan)
+                    .project(projection_exprs)?
+                    .alias(table_scan.table_name);
             }
+
+            // Ensures that the reference to the inlined table remains the
+            // same, meaning we don't have to change any of the parent nodes
+            // that reference this table.
+            return Self::new(sub_plan).alias(table_scan.table_name);
         }
 
         Ok(Self::new(LogicalPlan::TableScan(table_scan)))
@@ -607,11 +622,11 @@ impl LogicalPlanBuilder {
     }
 
     /// Make a builder for a prepare logical plan from the builder's plan
-    pub fn prepare(self, name: String, data_types: Vec<DataType>) -> Result<Self> {
+    pub fn prepare(self, name: String, fields: Vec<FieldRef>) -> Result<Self> {
         Ok(Self::new(LogicalPlan::Statement(Statement::Prepare(
             Prepare {
                 name,
-                data_types,
+                fields,
                 input: self.plan,
             },
         ))))
@@ -756,7 +771,9 @@ impl LogicalPlanBuilder {
             .map(|col| col.flat_name())
             .collect::<String>();
 
-        plan_err!("For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list")
+        plan_err!(
+            "For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list"
+        )
     }
 
     /// Apply a sort by provided expressions with default direction
@@ -900,7 +917,13 @@ impl LogicalPlanBuilder {
         join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
         filter: Option<Expr>,
     ) -> Result<Self> {
-        self.join_detailed(right, join_type, join_keys, filter, false)
+        self.join_detailed(
+            right,
+            join_type,
+            join_keys,
+            filter,
+            NullEquality::NullEqualsNothing,
+        )
     }
 
     /// Apply a join using the specified expressions.
@@ -931,8 +954,8 @@ impl LogicalPlanBuilder {
     /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)`
     /// let exprs = vec![
     ///     col("left.a").eq(col("right.a")),
-    ///     col("left.b").not_eq(col("right.b"))
-    ///  ];
+    ///     col("left.b").not_eq(col("right.b")),
+    /// ];
     ///
     /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)`
     /// // finding all pairs of rows from `left` and `right` where
@@ -956,15 +979,11 @@ impl LogicalPlanBuilder {
             join_type,
             (Vec::<Column>::new(), Vec::<Column>::new()),
             filter,
-            false,
+            NullEquality::NullEqualsNothing,
         )
     }
 
-    pub(crate) fn normalize(
-        plan: &LogicalPlan,
-        column: impl Into<Column>,
-    ) -> Result<Column> {
-        let column = column.into();
+    pub(crate) fn normalize(plan: &LogicalPlan, column: Column) -> Result<Column> {
         if column.relation.is_some() {
             // column is already normalized
             return Ok(column);
@@ -984,16 +1003,33 @@ impl LogicalPlanBuilder {
     /// The behavior is the same as [`join`](Self::join) except that it allows
     /// specifying the null equality behavior.
     ///
-    /// If `null_equals_null=true`, rows where both join keys are `null` will be
-    /// emitted. Otherwise rows where either or both join keys are `null` will be
-    /// omitted.
+    /// The `null_equality` dictates how `null` values are joined.
     pub fn join_detailed(
         self,
         right: LogicalPlan,
         join_type: JoinType,
         join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
         filter: Option<Expr>,
-        null_equals_null: bool,
+        null_equality: NullEquality,
+    ) -> Result<Self> {
+        self.join_detailed_with_options(
+            right,
+            join_type,
+            join_keys,
+            filter,
+            null_equality,
+            false,
+        )
+    }
+
+    pub fn join_detailed_with_options(
+        self,
+        right: LogicalPlan,
+        join_type: JoinType,
+        join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
+        filter: Option<Expr>,
+        null_equality: NullEquality,
+        null_aware: bool,
     ) -> Result<Self> {
         if join_keys.0.len() != join_keys.1.len() {
             return plan_err!("left_keys and right_keys were not the same length");
@@ -1110,7 +1146,8 @@ impl LogicalPlanBuilder {
             join_type,
             join_constraint: JoinConstraint::On,
             schema: DFSchemaRef::new(join_schema),
-            null_equals_null,
+            null_equality,
+            null_aware,
         })))
     }
 
@@ -1119,7 +1156,7 @@ impl LogicalPlanBuilder {
         self,
         right: LogicalPlan,
         join_type: JoinType,
-        using_keys: Vec<impl Into<Column> + Clone>,
+        using_keys: Vec<Column>,
     ) -> Result<Self> {
         let left_keys: Vec<Column> = using_keys
             .clone()
@@ -1173,7 +1210,7 @@ impl LogicalPlanBuilder {
         if join_on.is_empty() {
             let join = Self::from(self.plan).cross_join(right)?;
             join.filter(filters.ok_or_else(|| {
-                DataFusionError::Internal("filters should not be None here".to_string())
+                internal_datafusion_err!("filters should not be None here")
             })?)
         } else {
             let join = Join::try_new(
@@ -1183,7 +1220,8 @@ impl LogicalPlanBuilder {
                 filters,
                 join_type,
                 JoinConstraint::Using,
-                false,
+                NullEquality::NullEqualsNothing,
+                false, // null_aware
             )?;
 
             Ok(Self::new(LogicalPlan::Join(join)))
@@ -1199,7 +1237,8 @@ impl LogicalPlanBuilder {
             None,
             JoinType::Inner,
             JoinConstraint::On,
-            false,
+            NullEquality::NullEqualsNothing,
+            false, // null_aware
         )?;
 
         Ok(Self::new(LogicalPlan::Join(join)))
@@ -1255,12 +1294,24 @@ impl LogicalPlanBuilder {
     ///
     /// if `verbose` is true, prints out additional details.
     pub fn explain(self, verbose: bool, analyze: bool) -> Result<Self> {
+        // Keep the format default to Indent
+        self.explain_option_format(
+            ExplainOption::default()
+                .with_verbose(verbose)
+                .with_analyze(analyze),
+        )
+    }
+
+    /// Create an expression to represent the explanation of the plan
+    /// The`explain_option` is used to specify the format and verbosity of the explanation.
+    /// Details see [`ExplainOption`].
+    pub fn explain_option_format(self, explain_option: ExplainOption) -> Result<Self> {
         let schema = LogicalPlan::explain_schema();
         let schema = schema.to_dfschema_ref()?;
 
-        if analyze {
+        if explain_option.analyze {
             Ok(Self::new(LogicalPlan::Analyze(Analyze {
-                verbose,
+                verbose: explain_option.verbose,
                 input: self.plan,
                 schema,
             })))
@@ -1269,9 +1320,9 @@ impl LogicalPlanBuilder {
                 vec![self.plan.to_stringified(PlanType::InitialLogicalPlan)];
 
             Ok(Self::new(LogicalPlan::Explain(Explain {
-                verbose,
+                verbose: explain_option.verbose,
                 plan: self.plan,
-                explain_format: ExplainFormat::Indent,
+                explain_format: explain_option.format,
                 stringified_plans,
                 schema,
                 logical_optimization_succeeded: false,
@@ -1323,6 +1374,15 @@ impl LogicalPlanBuilder {
             );
         }
 
+        // Requalify sides if needed to avoid duplicate qualified field names
+        // (e.g., when both sides reference the same table)
+        let left_builder = LogicalPlanBuilder::from(left_plan);
+        let right_builder = LogicalPlanBuilder::from(right_plan);
+        let (left_builder, right_builder, _requalified) =
+            requalify_sides_if_needed(left_builder, right_builder)?;
+        let left_plan = left_builder.build()?;
+        let right_plan = right_builder.build()?;
+
         let join_keys = left_plan
             .schema()
             .fields()
@@ -1337,12 +1397,24 @@ impl LogicalPlanBuilder {
             .unzip();
         if is_all {
             LogicalPlanBuilder::from(left_plan)
-                .join_detailed(right_plan, join_type, join_keys, None, true)?
+                .join_detailed(
+                    right_plan,
+                    join_type,
+                    join_keys,
+                    None,
+                    NullEquality::NullEqualsNull,
+                )?
                 .build()
         } else {
             LogicalPlanBuilder::from(left_plan)
                 .distinct()?
-                .join_detailed(right_plan, join_type, join_keys, None, true)?
+                .join_detailed(
+                    right_plan,
+                    join_type,
+                    join_keys,
+                    None,
+                    NullEquality::NullEqualsNull,
+                )?
                 .build()
         }
     }
@@ -1420,7 +1492,8 @@ impl LogicalPlanBuilder {
             filter,
             join_type,
             JoinConstraint::On,
-            false,
+            NullEquality::NullEqualsNothing,
+            false, // null_aware
         )?;
 
         Ok(Self::new(LogicalPlan::Join(join)))
@@ -1480,10 +1553,23 @@ impl ValuesFields {
     }
 
     pub fn push(&mut self, data_type: DataType, nullable: bool) {
+        self.push_with_metadata(data_type, nullable, None);
+    }
+
+    pub fn push_with_metadata(
+        &mut self,
+        data_type: DataType,
+        nullable: bool,
+        metadata: Option<FieldMetadata>,
+    ) {
         // Naming follows the convention described here:
         // https://www.postgresql.org/docs/current/queries-values.html
         let name = format!("column{}", self.inner.len() + 1);
-        self.inner.push(Field::new(name, data_type, nullable));
+        let mut field = Field::new(name, data_type, nullable);
+        if let Some(metadata) = metadata {
+            field.set_metadata(metadata.to_hashmap());
+        }
+        self.inner.push(field);
     }
 
     pub fn into_fields(self) -> Fields {
@@ -1491,37 +1577,49 @@ impl ValuesFields {
     }
 }
 
-// `name_map` tracks a mapping between a field name and the number of appearances of that field.
-//
-// Some field names might already come to this function with the count (number of times it appeared)
-// as a sufix e.g. id:1, so there's still a chance of name collisions, for example,
-// if these three fields passed to this function: "col:1", "col" and "col", the function
-// would rename them to -> col:1, col, col:1 causing a posteriror error when building the DFSchema.
-// that's why we need the `seen` set, so the fields are always unique.
-//
-pub fn change_redundant_column(fields: &Fields) -> Vec<Field> {
-    let mut name_map = HashMap::new();
-    let mut seen: HashSet<String> = HashSet::new();
+/// Returns aliases to make field names unique.
+///
+/// Returns a vector of optional aliases, one per input field. `None` means keep the original name,
+/// `Some(alias)` means rename to the alias to ensure uniqueness.
+///
+/// Used when creating [`SubqueryAlias`] or similar operations that strip table qualifiers but need
+/// to maintain unique column names.
+///
+/// # Example
+/// Input fields: `[a, a, b, b, a, a:1]` ([`DFSchema`] valid when duplicate fields have different qualifiers)
+/// Returns: `[None, Some("a:1"), None, Some("b:1"), Some("a:2"), Some("a:1:1")]`
+pub fn unique_field_aliases(fields: &Fields) -> Vec<Option<String>> {
+    // Some field names might already come to this function with the count (number of times it appeared)
+    // as a suffix e.g. id:1, so there's still a chance of name collisions, for example,
+    // if these three fields passed to this function: "col:1", "col" and "col", the function
+    // would rename them to -> col:1, col, col:1 causing a posterior error when building the DFSchema.
+    // That's why we need the `seen` set, so the fields are always unique.
+
+    // Tracks a mapping between a field name and the number of appearances of that field.
+    let mut name_map = HashMap::<&str, usize>::new();
+    // Tracks all the fields and aliases that were previously seen.
+    let mut seen = HashSet::<Cow<String>>::new();
 
     fields
-        .into_iter()
+        .iter()
         .map(|field| {
-            let base_name = field.name();
-            let count = name_map.entry(base_name.clone()).or_insert(0);
-            let mut new_name = base_name.clone();
+            let original_name = field.name();
+            let mut name = Cow::Borrowed(original_name);
+
+            let count = name_map.entry(original_name).or_insert(0);
 
-            // Loop until we find a name that hasn't been used
-            while seen.contains(&new_name) {
+            // Loop until we find a name that hasn't been used.
+            while seen.contains(&name) {
                 *count += 1;
-                new_name = format!("{base_name}:{count}");
+                name = Cow::Owned(format!("{original_name}:{count}"));
             }
 
-            seen.insert(new_name.clone());
+            seen.insert(name.clone());
 
-            let mut modified_field =
-                Field::new(&new_name, field.data_type().clone(), field.is_nullable());
-            modified_field.set_metadata(field.metadata().clone());
-            modified_field
+            match name {
+                Cow::Borrowed(_) => None,
+                Cow::Owned(alias) => Some(alias),
+            }
         })
         .collect()
 }
@@ -1620,22 +1718,106 @@ pub fn build_join_schema(
                 .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                 .collect()
         }
+        JoinType::RightMark => right_fields
+            .map(|(q, f)| (q.cloned(), Arc::clone(f)))
+            .chain(once(mark_field(left)))
+            .collect(),
     };
     let func_dependencies = left.functional_dependencies().join(
         right.functional_dependencies(),
         join_type,
         left.fields().len(),
     );
-    let metadata = left
+
+    let (schema1, schema2) = match join_type {
+        JoinType::Right
+        | JoinType::RightSemi
+        | JoinType::RightAnti
+        | JoinType::RightMark => (left, right),
+        _ => (right, left),
+    };
+
+    let metadata = schema1
         .metadata()
         .clone()
         .into_iter()
-        .chain(right.metadata().clone())
+        .chain(schema2.metadata().clone())
         .collect();
+
     let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?;
     dfschema.with_functional_dependencies(func_dependencies)
 }
 
+/// (Re)qualify the sides of a join if needed, i.e. if the columns from one side would otherwise
+/// conflict with the columns from the other.
+/// This is especially useful for queries that come as Substrait, since Substrait doesn't currently allow specifying
+/// aliases, neither for columns nor for tables.  DataFusion requires columns to be uniquely identifiable, in some
+/// places (see e.g. DFSchema::check_names).
+/// The function returns:
+/// - The requalified or original left logical plan
+/// - The requalified or original right logical plan
+/// - If a requalification was needed or not
+pub fn requalify_sides_if_needed(
+    left: LogicalPlanBuilder,
+    right: LogicalPlanBuilder,
+) -> Result<(LogicalPlanBuilder, LogicalPlanBuilder, bool)> {
+    let left_cols = left.schema().columns();
+    let right_cols = right.schema().columns();
+
+    // Requalify if merging the schemas would cause an error during join.
+    // This can happen in several cases:
+    // 1. Duplicate qualified fields: both sides have same relation.name
+    // 2. Duplicate unqualified fields: both sides have same unqualified name
+    // 3. Ambiguous reference: one side qualified, other unqualified, same name
+    //
+    // Implementation note: This uses a simple O(n*m) nested loop rather than
+    // a HashMap-based O(n+m) approach. The nested loop is preferred because:
+    // - Schemas are typically small (in TPCH benchmark, max is 16 columns),
+    //   so n*m is negligible
+    // - Early return on first conflict makes common case very fast
+    // - Code is simpler and easier to reason about
+    // - Called only during plan construction, not in execution hot path
+    for l in &left_cols {
+        for r in &right_cols {
+            if l.name != r.name {
+                continue;
+            }
+
+            // Same name - check if this would cause a conflict
+            match (&l.relation, &r.relation) {
+                // Both qualified with same relation - duplicate qualified field
+                (Some(l_rel), Some(r_rel)) if l_rel == r_rel => {
+                    return Ok((
+                        left.alias(TableReference::bare("left"))?,
+                        right.alias(TableReference::bare("right"))?,
+                        true,
+                    ));
+                }
+                // Both unqualified - duplicate unqualified field
+                (None, None) => {
+                    return Ok((
+                        left.alias(TableReference::bare("left"))?,
+                        right.alias(TableReference::bare("right"))?,
+                        true,
+                    ));
+                }
+                // One qualified, one not - ambiguous reference
+                (Some(_), None) | (None, Some(_)) => {
+                    return Ok((
+                        left.alias(TableReference::bare("left"))?,
+                        right.alias(TableReference::bare("right"))?,
+                        true,
+                    ));
+                }
+                // Different qualifiers - OK, no conflict
+                _ => {}
+            }
+        }
+    }
+
+    // No conflicts found
+    Ok((left, right, false))
+}
 /// Add additional "synthetic" group by expressions based on functional
 /// dependencies.
 ///
@@ -1816,15 +1998,14 @@ fn replace_columns(
     replace: &PlannedReplaceSelectItem,
 ) -> Result<Vec<Expr>> {
     for expr in exprs.iter_mut() {
-        if let Expr::Column(Column { name, .. }) = expr {
-            if let Some((_, new_expr)) = replace
+        if let Expr::Column(Column { name, .. }) = expr
+            && let Some((_, new_expr)) = replace
                 .items()
                 .iter()
                 .zip(replace.expressions().iter())
                 .find(|(item, _)| item.column_name.value == *name)
-            {
-                *expr = new_expr.clone().alias(name.clone())
-            }
+        {
+            *expr = new_expr.clone().alias(name.clone())
         }
     }
     Ok(exprs)
@@ -1888,6 +2069,7 @@ pub fn table_scan_with_filter_and_fetch(
 }
 
 pub fn table_source(table_schema: &Schema) -> Arc<dyn TableSource> {
+    // TODO should we take SchemaRef and avoid cloning?
     let table_schema = Arc::new(table_schema.clone());
     Arc::new(LogicalTableSource {
         table_schema,
@@ -1899,6 +2081,7 @@ pub fn table_source_with_constraints(
     table_schema: &Schema,
     constraints: Constraints,
 ) -> Arc<dyn TableSource> {
+    // TODO should we take SchemaRef and avoid cloning?
     let table_schema = Arc::new(table_schema.clone());
     Arc::new(LogicalTableSource {
         table_schema,
@@ -2018,27 +2201,6 @@ pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> {
     unnest_with_options(input, columns, UnnestOptions::default())
 }
 
-// Get the data type of a multi-dimensional type after unnesting it
-// with a given depth
-fn get_unnested_list_datatype_recursive(
-    data_type: &DataType,
-    depth: usize,
-) -> Result<DataType> {
-    match data_type {
-        DataType::List(field)
-        | DataType::FixedSizeList(field, _)
-        | DataType::LargeList(field) => {
-            if depth == 1 {
-                return Ok(field.data_type().clone());
-            }
-            return get_unnested_list_datatype_recursive(field.data_type(), depth - 1);
-        }
-        _ => {}
-    };
-
-    internal_err!("trying to unnest on invalid data type {:?}", data_type)
-}
-
 pub fn get_struct_unnested_columns(
     col_name: &String,
     inner_fields: &Fields,
@@ -2049,53 +2211,6 @@ pub fn get_struct_unnested_columns(
         .collect()
 }
 
-// Based on data type, either struct or a variant of list
-// return a set of columns as the result of unnesting
-// the input columns.
-// For example, given a column with name "a",
-// - List(Element) returns ["a"] with data type Element
-// - Struct(field1, field2) returns ["a.field1","a.field2"]
-// For list data type, an argument depth is used to specify
-// the recursion level
-pub fn get_unnested_columns(
-    col_name: &String,
-    data_type: &DataType,
-    depth: usize,
-) -> Result<Vec<(Column, Arc<Field>)>> {
-    let mut qualified_columns = Vec::with_capacity(1);
-
-    match data_type {
-        DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => {
-            let data_type = get_unnested_list_datatype_recursive(data_type, depth)?;
-            let new_field = Arc::new(Field::new(
-                col_name, data_type,
-                // Unnesting may produce NULLs even if the list is not null.
-                // For example: unnest([1], []) -> 1, null
-                true,
-            ));
-            let column = Column::from_name(col_name);
-            // let column = Column::from((None, &new_field));
-            qualified_columns.push((column, new_field));
-        }
-        DataType::Struct(fields) => {
-            qualified_columns.extend(fields.iter().map(|f| {
-                let new_name = format!("{}.{}", col_name, f.name());
-                let column = Column::from_name(&new_name);
-                let new_field = f.as_ref().clone().with_name(new_name);
-                // let column = Column::from((None, &f));
-                (column, Arc::new(new_field))
-            }))
-        }
-        _ => {
-            return internal_err!(
-                "trying to unnest on invalid data type {:?}",
-                data_type
-            );
-        }
-    };
-    Ok(qualified_columns)
-}
-
 /// Create a [`LogicalPlan::Unnest`] plan with options
 /// This function receive a list of columns to be unnested
 /// because multiple unnest can be performed on the same column (e.g unnest with different depth)
@@ -2130,136 +2245,26 @@ pub fn unnest_with_options(
     columns_to_unnest: Vec<Column>,
     options: UnnestOptions,
 ) -> Result<LogicalPlan> {
-    let mut list_columns: Vec<(usize, ColumnUnnestList)> = vec![];
-    let mut struct_columns = vec![];
-    let indices_to_unnest = columns_to_unnest
-        .iter()
-        .map(|c| Ok((input.schema().index_of_column(c)?, c)))
-        .collect::<Result<HashMap<usize, &Column>>>()?;
-
-    let input_schema = input.schema();
-
-    let mut dependency_indices = vec![];
-    // Transform input schema into new schema
-    // Given this comprehensive example
-    //
-    // input schema:
-    // 1.col1_unnest_placeholder: list[list[int]],
-    // 2.col1: list[list[int]]
-    // 3.col2: list[int]
-    // with unnest on unnest(col1,depth=2), unnest(col1,depth=1) and unnest(col2,depth=1)
-    // output schema:
-    // 1.unnest_col1_depth_2: int
-    // 2.unnest_col1_depth_1: list[int]
-    // 3.col1: list[list[int]]
-    // 4.unnest_col2_depth_1: int
-    // Meaning the placeholder column will be replaced by its unnested variation(s), note
-    // the plural.
-    let fields = input_schema
-        .iter()
-        .enumerate()
-        .map(|(index, (original_qualifier, original_field))| {
-            match indices_to_unnest.get(&index) {
-                Some(column_to_unnest) => {
-                    let recursions_on_column = options
-                        .recursions
-                        .iter()
-                        .filter(|p| -> bool { &p.input_column == *column_to_unnest })
-                        .collect::<Vec<_>>();
-                    let mut transformed_columns = recursions_on_column
-                        .iter()
-                        .map(|r| {
-                            list_columns.push((
-                                index,
-                                ColumnUnnestList {
-                                    output_column: r.output_column.clone(),
-                                    depth: r.depth,
-                                },
-                            ));
-                            Ok(get_unnested_columns(
-                                &r.output_column.name,
-                                original_field.data_type(),
-                                r.depth,
-                            )?
-                            .into_iter()
-                            .next()
-                            .unwrap()) // because unnesting a list column always result into one result
-                        })
-                        .collect::<Result<Vec<(Column, Arc<Field>)>>>()?;
-                    if transformed_columns.is_empty() {
-                        transformed_columns = get_unnested_columns(
-                            &column_to_unnest.name,
-                            original_field.data_type(),
-                            1,
-                        )?;
-                        match original_field.data_type() {
-                            DataType::Struct(_) => {
-                                struct_columns.push(index);
-                            }
-                            DataType::List(_)
-                            | DataType::FixedSizeList(_, _)
-                            | DataType::LargeList(_) => {
-                                list_columns.push((
-                                    index,
-                                    ColumnUnnestList {
-                                        output_column: Column::from_name(
-                                            &column_to_unnest.name,
-                                        ),
-                                        depth: 1,
-                                    },
-                                ));
-                            }
-                            _ => {}
-                        };
-                    }
-
-                    // new columns dependent on the same original index
-                    dependency_indices
-                        .extend(std::iter::repeat_n(index, transformed_columns.len()));
-                    Ok(transformed_columns
-                        .iter()
-                        .map(|(col, field)| (col.relation.to_owned(), field.to_owned()))
-                        .collect())
-                }
-                None => {
-                    dependency_indices.push(index);
-                    Ok(vec![(
-                        original_qualifier.cloned(),
-                        Arc::clone(original_field),
-                    )])
-                }
-            }
-        })
-        .collect::<Result<Vec<_>>>()?
-        .into_iter()
-        .flatten()
-        .collect::<Vec<_>>();
-
-    let metadata = input_schema.metadata().clone();
-    let df_schema = DFSchema::new_with_metadata(fields, metadata)?;
-    // We can use the existing functional dependencies:
-    let deps = input_schema.functional_dependencies().clone();
-    let schema = Arc::new(df_schema.with_functional_dependencies(deps)?);
-
-    Ok(LogicalPlan::Unnest(Unnest {
-        input: Arc::new(input),
-        exec_columns: columns_to_unnest,
-        list_type_columns: list_columns,
-        struct_type_columns: struct_columns,
-        dependency_indices,
-        schema,
+    Ok(LogicalPlan::Unnest(Unnest::try_new(
+        Arc::new(input),
+        columns_to_unnest,
         options,
-    }))
+    )?))
 }
 
 #[cfg(test)]
 mod tests {
+    use std::vec;
+
     use super::*;
+    use crate::lit_with_metadata;
     use crate::logical_plan::StringifiedPlan;
     use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery};
 
     use crate::test::function_stub::sum;
-    use datafusion_common::{Constraint, RecursionUnnestOption, SchemaError};
+    use datafusion_common::{
+        Constraint, DataFusionError, RecursionUnnestOption, SchemaError,
+    };
     use insta::assert_snapshot;
 
     #[test]
@@ -2495,20 +2500,24 @@ mod tests {
         .project(vec![col("id"), col("first_name").alias("id")]);
 
         match plan {
-            Err(DataFusionError::SchemaError(
-                SchemaError::AmbiguousReference {
-                    field:
-                        Column {
-                            relation: Some(TableReference::Bare { table }),
-                            name,
-                            spans: _,
-                        },
-                },
-                _,
-            )) => {
-                assert_eq!(*"employee_csv", *table);
-                assert_eq!("id", &name);
-                Ok(())
+            Err(DataFusionError::SchemaError(err, _)) => {
+                if let SchemaError::AmbiguousReference { field } = *err {
+                    let Column {
+                        relation,
+                        name,
+                        spans: _,
+                    } = *field;
+                    let Some(TableReference::Bare { table }) = relation else {
+                        return plan_err!(
+                            "wrong relation: {relation:?}, expected table name"
+                        );
+                    };
+                    assert_eq!(*"employee_csv", *table);
+                    assert_eq!("id", &name);
+                    Ok(())
+                } else {
+                    plan_err!("Plan should have returned an DataFusionError::SchemaError")
+                }
             }
             _ => plan_err!("Plan should have returned an DataFusionError::SchemaError"),
         }
@@ -2592,12 +2601,12 @@ mod tests {
             return plan_err!("Plan should have returned an DataFusionError::Internal");
         };
 
-        let desc = desc
+        let desc = (*desc
             .split(DataFusionError::BACK_TRACE_SEP)
             .collect::<Vec<&str>>()
             .first()
-            .unwrap_or(&"")
-            .to_string();
+            .unwrap_or(&""))
+        .to_string();
 
         assert_snapshot!(desc, @"trying to unnest on invalid data type UInt32");
 
@@ -2770,12 +2779,12 @@ mod tests {
 
         assert_snapshot!(plan, @r"
         Union
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
               Values: (Int32(1))
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
@@ -2785,34 +2794,6 @@ mod tests {
         Ok(())
     }
 
-    #[test]
-    fn test_change_redundant_column() -> Result<()> {
-        let t1_field_1 = Field::new("a", DataType::Int32, false);
-        let t2_field_1 = Field::new("a", DataType::Int32, false);
-        let t2_field_3 = Field::new("a", DataType::Int32, false);
-        let t2_field_4 = Field::new("a:1", DataType::Int32, false);
-        let t1_field_2 = Field::new("b", DataType::Int32, false);
-        let t2_field_2 = Field::new("b", DataType::Int32, false);
-
-        let field_vec = vec![
-            t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4,
-        ];
-        let remove_redundant = change_redundant_column(&Fields::from(field_vec));
-
-        assert_eq!(
-            remove_redundant,
-            vec![
-                Field::new("a", DataType::Int32, false),
-                Field::new("a:1", DataType::Int32, false),
-                Field::new("b", DataType::Int32, false),
-                Field::new("b:1", DataType::Int32, false),
-                Field::new("a:2", DataType::Int32, false),
-                Field::new("a:1:1", DataType::Int32, false),
-            ]
-        );
-        Ok(())
-    }
-
     #[test]
     fn plan_builder_from_logical_plan() -> Result<()> {
         let plan =
@@ -2870,4 +2851,97 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_join_metadata() -> Result<()> {
+        let left_schema = DFSchema::new_with_metadata(
+            vec![(None, Arc::new(Field::new("a", DataType::Int32, false)))],
+            HashMap::from([("key".to_string(), "left".to_string())]),
+        )?;
+        let right_schema = DFSchema::new_with_metadata(
+            vec![(None, Arc::new(Field::new("b", DataType::Int32, false)))],
+            HashMap::from([("key".to_string(), "right".to_string())]),
+        )?;
+
+        let join_schema =
+            build_join_schema(&left_schema, &right_schema, &JoinType::Left)?;
+        assert_eq!(
+            join_schema.metadata(),
+            &HashMap::from([("key".to_string(), "left".to_string())])
+        );
+        let join_schema =
+            build_join_schema(&left_schema, &right_schema, &JoinType::Right)?;
+        assert_eq!(
+            join_schema.metadata(),
+            &HashMap::from([("key".to_string(), "right".to_string())])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_values_metadata() -> Result<()> {
+        let metadata: HashMap<String, String> =
+            [("ARROW:extension:metadata".to_string(), "test".to_string())]
+                .into_iter()
+                .collect();
+        let metadata = FieldMetadata::from(metadata);
+        let values = LogicalPlanBuilder::values(vec![
+            vec![lit_with_metadata(1, Some(metadata.clone()))],
+            vec![lit_with_metadata(2, Some(metadata.clone()))],
+        ])?
+        .build()?;
+        assert_eq!(*values.schema().field(0).metadata(), metadata.to_hashmap());
+
+        // Do not allow VALUES with different metadata mixed together
+        let metadata2: HashMap<String, String> =
+            [("ARROW:extension:metadata".to_string(), "test2".to_string())]
+                .into_iter()
+                .collect();
+        let metadata2 = FieldMetadata::from(metadata2);
+        assert!(
+            LogicalPlanBuilder::values(vec![
+                vec![lit_with_metadata(1, Some(metadata.clone()))],
+                vec![lit_with_metadata(2, Some(metadata2.clone()))],
+            ])
+            .is_err()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_unique_field_aliases() {
+        let t1_field_1 = Field::new("a", DataType::Int32, false);
+        let t2_field_1 = Field::new("a", DataType::Int32, false);
+        let t2_field_3 = Field::new("a", DataType::Int32, false);
+        let t2_field_4 = Field::new("a:1", DataType::Int32, false);
+        let t1_field_2 = Field::new("b", DataType::Int32, false);
+        let t2_field_2 = Field::new("b", DataType::Int32, false);
+
+        let fields = vec![
+            t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3, t2_field_4,
+        ];
+        let fields = Fields::from(fields);
+
+        let remove_redundant = unique_field_aliases(&fields);
+
+        // Input [a, a, b, b, a, a:1] becomes [None, a:1, None, b:1, a:2, a:1:1]
+        // First occurrence of each field name keeps original name (None), duplicates get
+        // incremental suffixes (:1, :2, etc.).
+        // Crucially in this case the 2nd occurrence of `a` gets rewritten to `a:1` which later
+        // conflicts with the last column which is _actually_ called `a:1` so we need to rename it
+        // as well to `a:1:1`.
+        assert_eq!(
+            remove_redundant,
+            vec![
+                None,
+                Some("a:1".to_string()),
+                None,
+                Some("b:1".to_string()),
+                Some("a:2".to_string()),
+                Some("a:1:1".to_string()),
+            ]
+        );
+    }
 }
diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs
index 827e2812ecae1..8a46e842a861e 100644
--- a/datafusion/expr/src/logical_plan/ddl.rs
+++ b/datafusion/expr/src/logical_plan/ddl.rs
@@ -24,12 +24,15 @@ use std::{
     hash::{Hash, Hasher},
 };
 
+#[cfg(not(feature = "sql"))]
+use crate::expr::Ident;
 use crate::expr::Sort;
 use arrow::datatypes::DataType;
 use datafusion_common::tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion};
 use datafusion_common::{
     Constraints, DFSchemaRef, Result, SchemaReference, TableReference,
 };
+#[cfg(feature = "sql")]
 use sqlparser::ast::Ident;
 
 /// Various types of DDL  (CREATE / DROP) catalog manipulation
@@ -129,7 +132,7 @@ impl DdlStatement {
             fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
                 match self.0 {
                     DdlStatement::CreateExternalTable(CreateExternalTable {
-                        ref name,
+                        name,
                         constraints,
                         ..
                     }) => {
@@ -183,7 +186,10 @@ impl DdlStatement {
                         cascade,
                         ..
                     }) => {
-                        write!(f, "DropCatalogSchema: {name:?} if not exist:={if_exists} cascade:={cascade}")
+                        write!(
+                            f,
+                            "DropCatalogSchema: {name:?} if not exist:={if_exists} cascade:={cascade}"
+                        )
                     }
                     DdlStatement::CreateFunction(CreateFunction { name, .. }) => {
                         write!(f, "CreateFunction: name {name:?}")
@@ -213,6 +219,8 @@ pub struct CreateExternalTable {
     pub table_partition_cols: Vec<String>,
     /// Option to not error if table already exists
     pub if_not_exists: bool,
+    /// Option to replace table content if table already exists
+    pub or_replace: bool,
     /// Whether the table is a temporary table
     pub temporary: bool,
     /// SQL used to create the table, if available
@@ -229,6 +237,158 @@ pub struct CreateExternalTable {
     pub column_defaults: HashMap<String, Expr>,
 }
 
+impl CreateExternalTable {
+    /// Creates a builder for [`CreateExternalTable`] with required fields.
+    ///
+    /// # Arguments
+    /// * `name` - The table name
+    /// * `location` - The physical location of the table files
+    /// * `file_type` - The file type (e.g., "parquet", "csv", "json")
+    /// * `schema` - The table schema
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion_expr::CreateExternalTable;
+    /// # use datafusion_common::{DFSchema, TableReference};
+    /// # use std::sync::Arc;
+    /// let table = CreateExternalTable::builder(
+    ///     TableReference::bare("my_table"),
+    ///     "/path/to/data",
+    ///     "parquet",
+    ///     Arc::new(DFSchema::empty())
+    /// ).build();
+    /// ```
+    pub fn builder(
+        name: impl Into<TableReference>,
+        location: impl Into<String>,
+        file_type: impl Into<String>,
+        schema: DFSchemaRef,
+    ) -> CreateExternalTableBuilder {
+        CreateExternalTableBuilder {
+            name: name.into(),
+            location: location.into(),
+            file_type: file_type.into(),
+            schema,
+            table_partition_cols: vec![],
+            if_not_exists: false,
+            or_replace: false,
+            temporary: false,
+            definition: None,
+            order_exprs: vec![],
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: Default::default(),
+            column_defaults: HashMap::new(),
+        }
+    }
+}
+
+/// Builder for [`CreateExternalTable`] that provides a fluent API for construction.
+///
+/// Created via [`CreateExternalTable::builder`].
+#[derive(Debug, Clone)]
+pub struct CreateExternalTableBuilder {
+    name: TableReference,
+    location: String,
+    file_type: String,
+    schema: DFSchemaRef,
+    table_partition_cols: Vec<String>,
+    if_not_exists: bool,
+    or_replace: bool,
+    temporary: bool,
+    definition: Option<String>,
+    order_exprs: Vec<Vec<Sort>>,
+    unbounded: bool,
+    options: HashMap<String, String>,
+    constraints: Constraints,
+    column_defaults: HashMap<String, Expr>,
+}
+
+impl CreateExternalTableBuilder {
+    /// Set the partition columns
+    pub fn with_partition_cols(mut self, cols: Vec<String>) -> Self {
+        self.table_partition_cols = cols;
+        self
+    }
+
+    /// Set the if_not_exists flag
+    pub fn with_if_not_exists(mut self, if_not_exists: bool) -> Self {
+        self.if_not_exists = if_not_exists;
+        self
+    }
+
+    /// Set the or_replace flag
+    pub fn with_or_replace(mut self, or_replace: bool) -> Self {
+        self.or_replace = or_replace;
+        self
+    }
+
+    /// Set the temporary flag
+    pub fn with_temporary(mut self, temporary: bool) -> Self {
+        self.temporary = temporary;
+        self
+    }
+
+    /// Set the SQL definition
+    pub fn with_definition(mut self, definition: Option<String>) -> Self {
+        self.definition = definition;
+        self
+    }
+
+    /// Set the order expressions
+    pub fn with_order_exprs(mut self, order_exprs: Vec<Vec<Sort>>) -> Self {
+        self.order_exprs = order_exprs;
+        self
+    }
+
+    /// Set the unbounded flag
+    pub fn with_unbounded(mut self, unbounded: bool) -> Self {
+        self.unbounded = unbounded;
+        self
+    }
+
+    /// Set the table options
+    pub fn with_options(mut self, options: HashMap<String, String>) -> Self {
+        self.options = options;
+        self
+    }
+
+    /// Set the table constraints
+    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+        self.constraints = constraints;
+        self
+    }
+
+    /// Set the column defaults
+    pub fn with_column_defaults(
+        mut self,
+        column_defaults: HashMap<String, Expr>,
+    ) -> Self {
+        self.column_defaults = column_defaults;
+        self
+    }
+
+    /// Build the [`CreateExternalTable`]
+    pub fn build(self) -> CreateExternalTable {
+        CreateExternalTable {
+            schema: self.schema,
+            name: self.name,
+            location: self.location,
+            file_type: self.file_type,
+            table_partition_cols: self.table_partition_cols,
+            if_not_exists: self.if_not_exists,
+            or_replace: self.or_replace,
+            temporary: self.temporary,
+            definition: self.definition,
+            order_exprs: self.order_exprs,
+            unbounded: self.unbounded,
+            options: self.options,
+            constraints: self.constraints,
+            column_defaults: self.column_defaults,
+        }
+    }
+}
+
 // Hashing refers to a subset of fields considered in PartialEq.
 impl Hash for CreateExternalTable {
     fn hash<H: Hasher>(&self, state: &mut H) {
@@ -292,7 +452,10 @@ impl PartialOrd for CreateExternalTable {
             unbounded: &other.unbounded,
             constraints: &other.constraints,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -348,6 +511,8 @@ impl PartialOrd for CreateCatalog {
             Some(Ordering::Equal) => self.if_not_exists.partial_cmp(&other.if_not_exists),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -369,6 +534,8 @@ impl PartialOrd for CreateCatalogSchema {
             Some(Ordering::Equal) => self.if_not_exists.partial_cmp(&other.if_not_exists),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -390,6 +557,8 @@ impl PartialOrd for DropTable {
             Some(Ordering::Equal) => self.if_exists.partial_cmp(&other.if_exists),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -411,6 +580,8 @@ impl PartialOrd for DropView {
             Some(Ordering::Equal) => self.if_exists.partial_cmp(&other.if_exists),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -437,17 +608,25 @@ impl PartialOrd for DropCatalogSchema {
             },
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
-/// Arguments passed to `CREATE FUNCTION`
+/// Arguments passed to the `CREATE FUNCTION` statement
+///
+/// These statements are turned into executable functions using [`FunctionFactory`]
 ///
-/// Note this meant to be the same as from sqlparser's [`sqlparser::ast::Statement::CreateFunction`]
+/// # Notes
+///
+/// This structure purposely mirrors the structure in sqlparser's
+/// [`sqlparser::ast::Statement::CreateFunction`], but does not use it directly
+/// to avoid a dependency on sqlparser in the core crate.
+///
+///
+/// [`FunctionFactory`]: https://docs.rs/datafusion/latest/datafusion/execution/context/trait.FunctionFactory.html
 #[derive(Clone, PartialEq, Eq, Hash, Debug)]
 pub struct CreateFunction {
-    // TODO: There is open question should we expose sqlparser types or redefine them here?
-    //       At the moment it make more sense to expose sqlparser types and leave
-    //       user to convert them as needed
     pub or_replace: bool,
     pub temporary: bool,
     pub name: String,
@@ -486,10 +665,16 @@ impl PartialOrd for CreateFunction {
             return_type: &other.return_type,
             params: &other.params,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
+/// Part of the `CREATE FUNCTION` statement
+///
+/// See [`CreateFunction`] for details
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
 pub struct OperateFunctionArg {
     // TODO: figure out how to support mode
@@ -520,6 +705,9 @@ impl<'a> TreeNodeContainer<'a, Expr> for OperateFunctionArg {
     }
 }
 
+/// Part of the `CREATE FUNCTION` statement
+///
+/// See [`CreateFunction`] for details
 #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)]
 pub struct CreateFunctionBody {
     /// LANGUAGE lang_name
@@ -566,6 +754,8 @@ impl PartialOrd for DropFunction {
             Some(Ordering::Equal) => self.if_exists.partial_cmp(&other.if_exists),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -608,7 +798,10 @@ impl PartialOrd for CreateIndex {
             unique: &other.unique,
             if_not_exists: &other.if_not_exists,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs
index f1e455f46db30..58c7feb616179 100644
--- a/datafusion/expr/src/logical_plan/display.rs
+++ b/datafusion/expr/src/logical_plan/display.rs
@@ -21,17 +21,17 @@ use std::collections::HashMap;
 use std::fmt;
 
 use crate::{
-    expr_vec_fmt, Aggregate, DescribeTable, Distinct, DistinctOn, DmlStatement, Expr,
-    Filter, Join, Limit, LogicalPlan, Partitioning, Projection, RecursiveQuery,
-    Repartition, Sort, Subquery, SubqueryAlias, TableProviderFilterPushDown, TableScan,
-    Unnest, Values, Window,
+    Aggregate, DescribeTable, Distinct, DistinctOn, DmlStatement, Expr, Filter, Join,
+    Limit, LogicalPlan, Partitioning, Projection, RecursiveQuery, Repartition, Sort,
+    Subquery, SubqueryAlias, TableProviderFilterPushDown, TableScan, Unnest, Values,
+    Window, expr_vec_fmt,
 };
 
 use crate::dml::CopyTo;
 use arrow::datatypes::Schema;
 use datafusion_common::display::GraphvizBuilder;
 use datafusion_common::tree_node::{TreeNodeRecursion, TreeNodeVisitor};
-use datafusion_common::{Column, DataFusionError};
+use datafusion_common::{Column, DataFusionError, internal_datafusion_err};
 use serde_json::json;
 
 /// Formats plans with a single line per node. For example:
@@ -72,11 +72,7 @@ impl<'n> TreeNodeVisitor<'n> for IndentVisitor<'_, '_> {
         write!(self.f, "{:indent$}", "", indent = self.indent * 2)?;
         write!(self.f, "{}", plan.display())?;
         if self.with_schema {
-            write!(
-                self.f,
-                " {}",
-                display_schema(&plan.schema().as_ref().to_owned().into())
-            )?;
+            write!(self.f, " {}", display_schema(plan.schema().as_arrow()))?;
         }
 
         self.indent += 1;
@@ -98,17 +94,17 @@ impl<'n> TreeNodeVisitor<'n> for IndentVisitor<'_, '_> {
 /// `foo:Utf8;N` if `foo` is nullable.
 ///
 /// ```
-/// use arrow::datatypes::{Field, Schema, DataType};
+/// use arrow::datatypes::{DataType, Field, Schema};
 /// # use datafusion_expr::logical_plan::display_schema;
 /// let schema = Schema::new(vec![
 ///     Field::new("id", DataType::Int32, false),
 ///     Field::new("first_name", DataType::Utf8, true),
-///  ]);
+/// ]);
 ///
-///  assert_eq!(
-///      "[id:Int32, first_name:Utf8;N]",
-///      format!("{}", display_schema(&schema))
-///  );
+/// assert_eq!(
+///     "[id:Int32, first_name:Utf8;N]",
+///     format!("{}", display_schema(&schema))
+/// );
 /// ```
 pub fn display_schema(schema: &Schema) -> impl fmt::Display + '_ {
     struct Wrapper<'a>(&'a Schema);
@@ -121,13 +117,7 @@ pub fn display_schema(schema: &Schema) -> impl fmt::Display + '_ {
                     write!(f, ", ")?;
                 }
                 let nullable_str = if field.is_nullable() { ";N" } else { "" };
-                write!(
-                    f,
-                    "{}:{:?}{}",
-                    field.name(),
-                    field.data_type(),
-                    nullable_str
-                )?;
+                write!(f, "{}:{}{}", field.name(), field.data_type(), nullable_str)?;
             }
             write!(f, "]")
         }
@@ -196,7 +186,7 @@ impl<'n> TreeNodeVisitor<'n> for GraphvizVisitor<'_, '_> {
             format!(
                 r"{}\nSchema: {}",
                 plan.display(),
-                display_schema(&plan.schema().as_ref().to_owned().into())
+                display_schema(plan.schema().as_arrow())
             )
         } else {
             format!("{}", plan.display())
@@ -204,14 +194,14 @@ impl<'n> TreeNodeVisitor<'n> for GraphvizVisitor<'_, '_> {
 
         self.graphviz_builder
             .add_node(self.f, id, &label, None)
-            .map_err(|_e| DataFusionError::Internal("Fail to format".to_string()))?;
+            .map_err(|_e| internal_datafusion_err!("Fail to format"))?;
 
         // Create an edge to our parent node, if any
         //  parent_id -> id
         if let Some(parent_id) = self.parent_ids.last() {
             self.graphviz_builder
                 .add_edge(self.f, *parent_id, id)
-                .map_err(|_e| DataFusionError::Internal("Fail to format".to_string()))?;
+                .map_err(|_e| internal_datafusion_err!("Fail to format"))?;
         }
 
         self.parent_ids.push(id);
@@ -225,7 +215,7 @@ impl<'n> TreeNodeVisitor<'n> for GraphvizVisitor<'_, '_> {
         // always be non-empty as pre_visit always pushes
         // So it should always be Ok(true)
         let res = self.parent_ids.pop();
-        res.ok_or(DataFusionError::Internal("Fail to format".to_string()))
+        res.ok_or(internal_datafusion_err!("Fail to format"))
             .map(|_| TreeNodeRecursion::Continue)
     }
 }
@@ -323,7 +313,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                     "Is Distinct": is_distinct,
                 })
             }
-            LogicalPlan::Values(Values { ref values, .. }) => {
+            LogicalPlan::Values(Values { values, .. }) => {
                 let str_values = values
                     .iter()
                     // limit to only 5 values to avoid horrible display
@@ -348,10 +338,10 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                 })
             }
             LogicalPlan::TableScan(TableScan {
-                ref source,
-                ref table_name,
-                ref filters,
-                ref fetch,
+                source,
+                table_name,
+                filters,
+                fetch,
                 ..
             }) => {
                 let mut object = json!({
@@ -407,7 +397,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
 
                 object
             }
-            LogicalPlan::Projection(Projection { ref expr, .. }) => {
+            LogicalPlan::Projection(Projection { expr, .. }) => {
                 json!({
                     "Node Type": "Projection",
                     "Expressions": expr.iter().map(|e| e.to_string()).collect::<Vec<_>>()
@@ -426,6 +416,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                 file_type,
                 partition_by: _,
                 options,
+                output_schema: _,
             }) => {
                 let op_str = options
                     .iter()
@@ -446,25 +437,22 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                 })
             }
             LogicalPlan::Filter(Filter {
-                predicate: ref expr,
-                ..
+                predicate: expr, ..
             }) => {
                 json!({
                     "Node Type": "Filter",
                     "Condition": format!("{}", expr)
                 })
             }
-            LogicalPlan::Window(Window {
-                ref window_expr, ..
-            }) => {
+            LogicalPlan::Window(Window { window_expr, .. }) => {
                 json!({
                     "Node Type": "WindowAggr",
                     "Expressions": expr_vec_fmt!(window_expr)
                 })
             }
             LogicalPlan::Aggregate(Aggregate {
-                ref group_expr,
-                ref aggr_expr,
+                group_expr,
+                aggr_expr,
                 ..
             }) => {
                 json!({
@@ -486,7 +474,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                 object
             }
             LogicalPlan::Join(Join {
-                on: ref keys,
+                on: keys,
                 filter,
                 join_constraint,
                 join_type,
@@ -537,11 +525,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                     })
                 }
             },
-            LogicalPlan::Limit(Limit {
-                ref skip,
-                ref fetch,
-                ..
-            }) => {
+            LogicalPlan::Limit(Limit { skip, fetch, .. }) => {
                 let mut object = serde_json::json!(
                     {
                         "Node Type": "Limit",
@@ -560,7 +544,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> {
                     "Node Type": "Subquery"
                 })
             }
-            LogicalPlan::SubqueryAlias(SubqueryAlias { ref alias, .. }) => {
+            LogicalPlan::SubqueryAlias(SubqueryAlias { alias, .. }) => {
                 json!({
                     "Node Type": "Subquery",
                     "Alias": alias.table(),
@@ -689,9 +673,10 @@ impl<'n> TreeNodeVisitor<'n> for PgJsonVisitor<'_, '_> {
     ) -> datafusion_common::Result<TreeNodeRecursion> {
         let id = self.parent_ids.pop().unwrap();
 
-        let current_node = self.objects.remove(&id).ok_or_else(|| {
-            DataFusionError::Internal("Missing current node!".to_string())
-        })?;
+        let current_node = self
+            .objects
+            .remove(&id)
+            .ok_or_else(|| internal_datafusion_err!("Missing current node!"))?;
 
         if let Some(parent_id) = self.parent_ids.last() {
             let parent_node = self
diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs
index f3c95e696b4b6..b668cbfe2cc35 100644
--- a/datafusion/expr/src/logical_plan/dml.rs
+++ b/datafusion/expr/src/logical_plan/dml.rs
@@ -40,6 +40,8 @@ pub struct CopyTo {
     pub file_type: Arc<dyn FileType>,
     /// SQL Options that can affect the formats
     pub options: HashMap<String, String>,
+    /// The schema of the output (a single column "count")
+    pub output_schema: DFSchemaRef,
 }
 
 impl Debug for CopyTo {
@@ -50,6 +52,7 @@ impl Debug for CopyTo {
             .field("partition_by", &self.partition_by)
             .field("file_type", &"...")
             .field("options", &self.options)
+            .field("output_schema", &self.output_schema)
             .finish_non_exhaustive()
     }
 }
@@ -78,6 +81,8 @@ impl PartialOrd for CopyTo {
             },
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -89,6 +94,26 @@ impl Hash for CopyTo {
     }
 }
 
+impl CopyTo {
+    pub fn new(
+        input: Arc<LogicalPlan>,
+        output_url: String,
+        partition_by: Vec<String>,
+        file_type: Arc<dyn FileType>,
+        options: HashMap<String, String>,
+    ) -> Self {
+        Self {
+            input,
+            output_url,
+            partition_by,
+            file_type,
+            options,
+            // The output schema is always a single column "count" with the number of rows copied
+            output_schema: make_count_schema(),
+        }
+    }
+}
+
 /// Modifies the content of a database
 ///
 /// This operator is used to perform DML operations such as INSERT, DELETE,
@@ -97,11 +122,9 @@ impl Hash for CopyTo {
 /// * `INSERT` - Appends new rows to the existing table. Calls
 ///   [`TableProvider::insert_into`]
 ///
-/// * `DELETE` - Removes rows from the table. Currently NOT supported by the
-///   [`TableProvider`] trait or builtin sources.
+/// * `DELETE` - Removes rows from the table. Calls [`TableProvider::delete_from`]
 ///
-/// * `UPDATE` - Modifies existing rows in the table. Currently NOT supported by
-///   the [`TableProvider`] trait or builtin sources.
+/// * `UPDATE` - Modifies existing rows in the table. Calls [`TableProvider::update`]
 ///
 /// * `CREATE TABLE AS SELECT` - Creates a new table and populates it with data
 ///   from a query. This is similar to the `INSERT` operation, but it creates a new
@@ -111,6 +134,8 @@ impl Hash for CopyTo {
 ///
 /// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html
 /// [`TableProvider::insert_into`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html#method.insert_into
+/// [`TableProvider::delete_from`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html#method.delete_from
+/// [`TableProvider::update`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html#method.update
 #[derive(Clone)]
 pub struct DmlStatement {
     /// The table name
@@ -194,6 +219,8 @@ impl PartialOrd for DmlStatement {
             },
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -210,6 +237,8 @@ pub enum WriteOp {
     Update,
     /// `CREATE TABLE AS SELECT` operation
     Ctas,
+    /// `TRUNCATE` operation
+    Truncate,
 }
 
 impl WriteOp {
@@ -220,6 +249,7 @@ impl WriteOp {
             WriteOp::Delete => "Delete",
             WriteOp::Update => "Update",
             WriteOp::Ctas => "Ctas",
+            WriteOp::Truncate => "Truncate",
         }
     }
 }
diff --git a/datafusion/expr/src/logical_plan/extension.rs b/datafusion/expr/src/logical_plan/extension.rs
index 5bf64a36a6540..fe324d40fd952 100644
--- a/datafusion/expr/src/logical_plan/extension.rs
+++ b/datafusion/expr/src/logical_plan/extension.rs
@@ -39,10 +39,10 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync {
     /// # struct Dummy { }
     ///
     /// # impl Dummy {
-    ///   // canonical boiler plate
-    ///   fn as_any(&self) -> &dyn Any {
-    ///      self
-    ///   }
+    /// // canonical boiler plate
+    /// fn as_any(&self) -> &dyn Any {
+    ///     self
+    /// }
     /// # }
     /// ```
     fn as_any(&self) -> &dyn Any;
@@ -57,7 +57,7 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync {
     fn schema(&self) -> &DFSchemaRef;
 
     /// Perform check of invariants for the extension node.
-    fn check_invariants(&self, check: InvariantLevel, plan: &LogicalPlan) -> Result<()>;
+    fn check_invariants(&self, check: InvariantLevel) -> Result<()>;
 
     /// Returns all expressions in the current logical plan node. This should
     /// not include expressions of any inputs (aka non-recursively).
@@ -131,18 +131,18 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync {
     /// // User defined node that derives Hash
     /// #[derive(Hash, Debug, PartialEq, Eq)]
     /// struct MyNode {
-    ///   val: u64
+    ///     val: u64,
     /// }
     ///
     /// // impl UserDefinedLogicalNode {
     /// // ...
     /// # impl MyNode {
-    ///   // Boiler plate to call the derived Hash impl
-    ///   fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) {
+    /// // Boiler plate to call the derived Hash impl
+    /// fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) {
     ///     use std::hash::Hash;
     ///     let mut s = state;
     ///     self.hash(&mut s);
-    ///   }
+    /// }
     /// // }
     /// # }
     /// ```
@@ -150,7 +150,7 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync {
     /// directly because it must remain object safe.
     fn dyn_hash(&self, state: &mut dyn Hasher);
 
-    /// Compare `other`, respecting requirements from [std::cmp::Eq].
+    /// Compare `other`, respecting requirements from [Eq].
     ///
     /// Note: consider using [`UserDefinedLogicalNodeCore`] instead of
     /// [`UserDefinedLogicalNode`] directly.
@@ -169,25 +169,28 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync {
     /// // User defined node that derives Eq
     /// #[derive(Hash, Debug, PartialEq, Eq)]
     /// struct MyNode {
-    ///   val: u64
+    ///     val: u64,
     /// }
     ///
     /// // impl UserDefinedLogicalNode {
     /// // ...
     /// # impl MyNode {
-    ///   // Boiler plate to call the derived Eq impl
-    ///   fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool {
+    /// // Boiler plate to call the derived Eq impl
+    /// fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool {
     ///     match other.as_any().downcast_ref::<Self>() {
-    ///       Some(o) => self == o,
-    ///       None => false,
+    ///         Some(o) => self == o,
+    ///         None => false,
     ///     }
-    ///   }
+    /// }
     /// // }
     /// # }
     /// ```
     /// Note: [`UserDefinedLogicalNode`] is not constrained by [`Eq`]
     /// directly because it must remain object safe.
     fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool;
+
+    /// Compare `other`, respecting requirements from [PartialOrd].
+    /// Must return `Some(Equal)` if and only if `self.dyn_eq(other)`.
     fn dyn_ord(&self, other: &dyn UserDefinedLogicalNode) -> Option<Ordering>;
 
     /// Returns `true` if a limit can be safely pushed down through this
@@ -241,11 +244,7 @@ pub trait UserDefinedLogicalNodeCore:
     /// Perform check of invariants for the extension node.
     ///
     /// This is the default implementation for extension nodes.
-    fn check_invariants(
-        &self,
-        _check: InvariantLevel,
-        _plan: &LogicalPlan,
-    ) -> Result<()> {
+    fn check_invariants(&self, _check: InvariantLevel) -> Result<()> {
         Ok(())
     }
 
@@ -316,7 +315,7 @@ pub trait UserDefinedLogicalNodeCore:
 }
 
 /// Automatically derive UserDefinedLogicalNode to `UserDefinedLogicalNode`
-/// to avoid boiler plate for implementing `as_any`, `Hash` and `PartialEq`
+/// to avoid boiler plate for implementing `as_any`, `Hash`, `PartialEq` and `PartialOrd`.
 impl<T: UserDefinedLogicalNodeCore> UserDefinedLogicalNode for T {
     fn as_any(&self) -> &dyn Any {
         self
@@ -334,8 +333,8 @@ impl<T: UserDefinedLogicalNodeCore> UserDefinedLogicalNode for T {
         self.schema()
     }
 
-    fn check_invariants(&self, check: InvariantLevel, plan: &LogicalPlan) -> Result<()> {
-        self.check_invariants(check, plan)
+    fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
+        self.check_invariants(check)
     }
 
     fn expressions(&self) -> Vec<Expr> {
diff --git a/datafusion/expr/src/logical_plan/invariants.rs b/datafusion/expr/src/logical_plan/invariants.rs
index 0c30c9785766b..0889afd08fee4 100644
--- a/datafusion/expr/src/logical_plan/invariants.rs
+++ b/datafusion/expr/src/logical_plan/invariants.rs
@@ -16,16 +16,15 @@
 // under the License.
 
 use datafusion_common::{
-    internal_err, plan_err,
+    DFSchemaRef, Result, assert_or_internal_err, plan_err,
     tree_node::{TreeNode, TreeNodeRecursion},
-    DFSchemaRef, Result,
 };
 
 use crate::{
-    expr::{Exists, InSubquery},
+    Aggregate, Expr, Filter, Join, JoinType, LogicalPlan, Window,
+    expr::{Exists, InSubquery, SetComparison},
     expr_rewriter::strip_outer_reference,
     utils::{collect_subquery_cols, split_conjunction},
-    Aggregate, Expr, Filter, Join, JoinType, LogicalPlan, Window,
 };
 
 use super::Extension;
@@ -74,7 +73,7 @@ pub fn assert_executable_invariants(plan: &LogicalPlan) -> Result<()> {
 fn assert_valid_extension_nodes(plan: &LogicalPlan, check: InvariantLevel) -> Result<()> {
     plan.apply_with_subqueries(|plan: &LogicalPlan| {
         if let LogicalPlan::Extension(Extension { node }) = plan {
-            node.check_invariants(check, plan)?;
+            node.check_invariants(check)?;
         }
         plan.apply_expressions(|expr| {
             // recursively look for subqueries
@@ -82,6 +81,7 @@ fn assert_valid_extension_nodes(plan: &LogicalPlan, check: InvariantLevel) -> Re
                 match expr {
                     Expr::Exists(Exists { subquery, .. })
                     | Expr::InSubquery(InSubquery { subquery, .. })
+                    | Expr::SetComparison(SetComparison { subquery, .. })
                     | Expr::ScalarSubquery(subquery) => {
                         assert_valid_extension_nodes(&subquery.subquery, check)?;
                     }
@@ -102,7 +102,7 @@ fn assert_unique_field_names(plan: &LogicalPlan) -> Result<()> {
     plan.schema().check_names()
 }
 
-/// Returns an error if the plan is not sematically valid.
+/// Returns an error if the plan is not semantically valid.
 fn assert_valid_semantic_plan(plan: &LogicalPlan) -> Result<()> {
     assert_subqueries_are_valid(plan)?;
 
@@ -114,15 +114,13 @@ fn assert_valid_semantic_plan(plan: &LogicalPlan) -> Result<()> {
 pub fn assert_expected_schema(schema: &DFSchemaRef, plan: &LogicalPlan) -> Result<()> {
     let compatible = plan.schema().logically_equivalent_names_and_types(schema);
 
-    if !compatible {
-        internal_err!(
-            "Failed due to a difference in schemas: original schema: {:?}, new schema: {:?}",
-            schema,
-            plan.schema()
-        )
-    } else {
-        Ok(())
-    }
+    assert_or_internal_err!(
+        compatible,
+        "Failed due to a difference in schemas: original schema: {:?}, new schema: {:?}",
+        schema,
+        plan.schema()
+    );
+    Ok(())
 }
 
 /// Asserts that the subqueries are structured properly with valid node placement.
@@ -136,6 +134,7 @@ fn assert_subqueries_are_valid(plan: &LogicalPlan) -> Result<()> {
                 match expr {
                     Expr::Exists(Exists { subquery, .. })
                     | Expr::InSubquery(InSubquery { subquery, .. })
+                    | Expr::SetComparison(SetComparison { subquery, .. })
                     | Expr::ScalarSubquery(subquery) => {
                         check_subquery_expr(plan, &subquery.subquery, expr)?;
                     }
@@ -200,21 +199,26 @@ pub fn check_subquery_expr(
                 }
             }?;
             match outer_plan {
-                LogicalPlan::Projection(_)
-                | LogicalPlan::Filter(_) => Ok(()),
-                LogicalPlan::Aggregate(Aggregate { group_expr, aggr_expr, .. }) => {
+                LogicalPlan::Projection(_) | LogicalPlan::Filter(_) => Ok(()),
+                LogicalPlan::Aggregate(Aggregate {
+                    group_expr,
+                    aggr_expr,
+                    ..
+                }) => {
                     if group_expr.contains(expr) && !aggr_expr.contains(expr) {
                         // TODO revisit this validation logic
                         plan_err!(
-                            "Correlated scalar subquery in the GROUP BY clause must also be in the aggregate expressions"
+                            "Correlated scalar subquery in the GROUP BY clause must \
+                            also be in the aggregate expressions"
                         )
                     } else {
                         Ok(())
                     }
                 }
                 _ => plan_err!(
-                    "Correlated scalar subquery can only be used in Projection, Filter, Aggregate plan nodes"
-                )
+                    "Correlated scalar subquery can only be used in Projection, \
+                    Filter, Aggregate plan nodes"
+                ),
             }?;
         }
         check_correlations_in_subquery(inner_plan)
@@ -229,6 +233,20 @@ pub fn check_subquery_expr(
                 );
             }
         }
+        if let Expr::SetComparison(set_comparison) = expr
+            && set_comparison.subquery.subquery.schema().fields().len() > 1
+        {
+            return plan_err!(
+                "Set comparison subquery should only return one column, but found {}: {}",
+                set_comparison.subquery.subquery.schema().fields().len(),
+                set_comparison
+                    .subquery
+                    .subquery
+                    .schema()
+                    .field_names()
+                    .join(", ")
+            );
+        }
         match outer_plan {
             LogicalPlan::Projection(_)
             | LogicalPlan::Filter(_)
@@ -237,7 +255,7 @@ pub fn check_subquery_expr(
             | LogicalPlan::Aggregate(_)
             | LogicalPlan::Join(_) => Ok(()),
             _ => plan_err!(
-                "In/Exist subquery can only be used in \
+                "In/Exist/SetComparison subquery can only be used in \
                 Projection, Filter, TableScan, Window functions, Aggregate and Join plan nodes, \
                 but was used in [{}]",
                 outer_plan.display()
@@ -310,7 +328,10 @@ fn check_inner_plan(inner_plan: &LogicalPlan) -> Result<()> {
                 check_inner_plan(left)?;
                 check_no_outer_references(right)
             }
-            JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => {
+            JoinType::Right
+            | JoinType::RightSemi
+            | JoinType::RightAnti
+            | JoinType::RightMark => {
                 check_no_outer_references(left)?;
                 check_inner_plan(right)
             }
diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs
index a55f4d97b2126..c2b01868c97f3 100644
--- a/datafusion/expr/src/logical_plan/mod.rs
+++ b/datafusion/expr/src/logical_plan/mod.rs
@@ -21,14 +21,15 @@ pub mod display;
 pub mod dml;
 mod extension;
 pub(crate) mod invariants;
-pub use invariants::{assert_expected_schema, check_subquery_expr, InvariantLevel};
+pub use invariants::{InvariantLevel, assert_expected_schema, check_subquery_expr};
 mod plan;
 mod statement;
 pub mod tree_node;
 
 pub use builder::{
-    build_join_schema, table_scan, union, wrap_projection_for_join_if_necessary,
     LogicalPlanBuilder, LogicalPlanBuilderOptions, LogicalTableSource, UNNAMED_TABLE,
+    build_join_schema, requalify_sides_if_needed, table_scan, union,
+    wrap_projection_for_join_if_necessary,
 };
 pub use ddl::{
     CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction,
@@ -37,17 +38,21 @@ pub use ddl::{
 };
 pub use dml::{DmlStatement, WriteOp};
 pub use plan::{
-    projection_schema, Aggregate, Analyze, ColumnUnnestList, DescribeTable, Distinct,
-    DistinctOn, EmptyRelation, Explain, ExplainFormat, Extension, FetchType, Filter,
-    Join, JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType,
-    Projection, RecursiveQuery, Repartition, SkipType, Sort, StringifiedPlan, Subquery,
+    Aggregate, Analyze, ColumnUnnestList, DescribeTable, Distinct, DistinctOn,
+    EmptyRelation, Explain, ExplainOption, Extension, FetchType, Filter, Join,
+    JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Projection,
+    RecursiveQuery, Repartition, SkipType, Sort, StringifiedPlan, Subquery,
     SubqueryAlias, TableScan, ToStringifiedPlan, Union, Unnest, Values, Window,
+    projection_schema,
 };
 pub use statement::{
-    Deallocate, Execute, Prepare, SetVariable, Statement, TransactionAccessMode,
-    TransactionConclusion, TransactionEnd, TransactionIsolationLevel, TransactionStart,
+    Deallocate, Execute, Prepare, ResetVariable, SetVariable, Statement,
+    TransactionAccessMode, TransactionConclusion, TransactionEnd,
+    TransactionIsolationLevel, TransactionStart,
 };
 
+pub use datafusion_common::format::ExplainFormat;
+
 pub use display::display_schema;
 
 pub use extension::{UserDefinedLogicalNode, UserDefinedLogicalNodeCore};
diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs
index 2fe1c0d7398f2..b2a56971837f0 100644
--- a/datafusion/expr/src/logical_plan/plan.rs
+++ b/datafusion/expr/src/logical_plan/plan.rs
@@ -21,19 +21,21 @@ use std::cmp::Ordering;
 use std::collections::{HashMap, HashSet};
 use std::fmt::{self, Debug, Display, Formatter};
 use std::hash::{Hash, Hasher};
-use std::str::FromStr;
 use std::sync::{Arc, LazyLock};
 
+use super::DdlStatement;
 use super::dml::CopyTo;
 use super::invariants::{
-    assert_always_invariants_at_current_node, assert_executable_invariants,
-    InvariantLevel,
+    InvariantLevel, assert_always_invariants_at_current_node,
+    assert_executable_invariants,
+};
+use crate::builder::{unique_field_aliases, unnest_with_options};
+use crate::expr::{
+    Alias, Placeholder, Sort as SortExpr, WindowFunction, WindowFunctionParams,
+    intersect_metadata_for_union,
 };
-use super::DdlStatement;
-use crate::builder::{change_redundant_column, unnest_with_options};
-use crate::expr::{Placeholder, Sort as SortExpr, WindowFunction, WindowFunctionParams};
 use crate::expr_rewriter::{
-    create_col_from_scalar_expr, normalize_cols, normalize_sorts, NamePreserver,
+    NamePreserver, create_col_from_scalar_expr, normalize_cols, normalize_sorts,
 };
 use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor};
 use crate::logical_plan::extension::UserDefinedLogicalNode;
@@ -43,21 +45,23 @@ use crate::utils::{
     grouping_set_expr_count, grouping_set_to_exprlist, split_conjunction,
 };
 use crate::{
-    build_join_schema, expr_vec_fmt, BinaryExpr, CreateMemoryTable, CreateView, Execute,
-    Expr, ExprSchemable, LogicalPlanBuilder, Operator, Prepare,
-    TableProviderFilterPushDown, TableSource, WindowFunctionDefinition,
+    BinaryExpr, CreateMemoryTable, CreateView, Execute, Expr, ExprSchemable,
+    LogicalPlanBuilder, Operator, Prepare, TableProviderFilterPushDown, TableSource,
+    WindowFunctionDefinition, build_join_schema, expr_vec_fmt, requalify_sides_if_needed,
 };
 
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef};
 use datafusion_common::cse::{NormalizeEq, Normalizeable};
+use datafusion_common::format::ExplainFormat;
+use datafusion_common::metadata::check_metadata_with_storage_equal;
 use datafusion_common::tree_node::{
     Transformed, TreeNode, TreeNodeContainer, TreeNodeRecursion,
 };
 use datafusion_common::{
-    aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints,
-    DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence,
-    FunctionalDependencies, ParamValues, Result, ScalarValue, Spans, TableReference,
-    UnnestOptions,
+    Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, Dependency,
+    FunctionalDependence, FunctionalDependencies, NullEquality, ParamValues, Result,
+    ScalarValue, Spans, TableReference, UnnestOptions, aggregate_functional_dependencies,
+    assert_eq_or_internal_err, assert_or_internal_err, internal_err, plan_err,
 };
 use indexmap::IndexSet;
 
@@ -198,7 +202,6 @@ pub use datafusion_common::{JoinConstraint, JoinType};
 /// # Ok(())
 /// # }
 /// ```
-///
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
 pub enum LogicalPlan {
     /// Evaluates an arbitrary list of expressions (essentially a
@@ -344,7 +347,7 @@ impl LogicalPlan {
                 output_schema
             }
             LogicalPlan::Dml(DmlStatement { output_schema, .. }) => output_schema,
-            LogicalPlan::Copy(CopyTo { input, .. }) => input.schema(),
+            LogicalPlan::Copy(CopyTo { output_schema, .. }) => output_schema,
             LogicalPlan::Ddl(ddl) => ddl.schema(),
             LogicalPlan::Unnest(Unnest { schema, .. }) => schema,
             LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, .. }) => {
@@ -556,7 +559,9 @@ impl LogicalPlan {
                 JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
                     left.head_output_expr()
                 }
-                JoinType::RightSemi | JoinType::RightAnti => right.head_output_expr(),
+                JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+                    right.head_output_expr()
+                }
             },
             LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, .. }) => {
                 static_term.head_output_expr()
@@ -655,7 +660,8 @@ impl LogicalPlan {
                 join_constraint,
                 on,
                 schema: _,
-                null_equals_null,
+                null_equality,
+                null_aware,
             }) => {
                 let schema =
                     build_join_schema(left.schema(), right.schema(), &join_type)?;
@@ -676,7 +682,8 @@ impl LogicalPlan {
                     on: new_on,
                     filter,
                     schema: DFSchemaRef::new(schema),
-                    null_equals_null,
+                    null_equality,
+                    null_aware,
                 }))
             }
             LogicalPlan::Subquery(_) => Ok(self),
@@ -807,16 +814,17 @@ impl LogicalPlan {
                 file_type,
                 options,
                 partition_by,
+                output_schema: _,
             }) => {
                 self.assert_no_expressions(expr)?;
                 let input = self.only_input(inputs)?;
-                Ok(LogicalPlan::Copy(CopyTo {
-                    input: Arc::new(input),
-                    output_url: output_url.clone(),
-                    file_type: Arc::clone(file_type),
-                    options: options.clone(),
-                    partition_by: partition_by.clone(),
-                }))
+                Ok(LogicalPlan::Copy(CopyTo::new(
+                    Arc::new(input),
+                    output_url.clone(),
+                    partition_by.clone(),
+                    Arc::clone(file_type),
+                    options.clone(),
+                )))
             }
             LogicalPlan::Values(Values { schema, .. }) => {
                 self.assert_no_inputs(inputs)?;
@@ -894,7 +902,8 @@ impl LogicalPlan {
                 join_type,
                 join_constraint,
                 on,
-                null_equals_null,
+                null_equality,
+                null_aware,
                 ..
             }) => {
                 let (left, right) = self.only_two_inputs(inputs)?;
@@ -918,7 +927,9 @@ impl LogicalPlan {
                 let mut iter = expr.into_iter();
                 while let Some(left) = iter.next() {
                     let Some(right) = iter.next() else {
-                        internal_err!("Expected a pair of expressions to construct the join on expression")?
+                        internal_err!(
+                            "Expected a pair of expressions to construct the join on expression"
+                        )?
                     };
 
                     // SimplifyExpression rule may add alias to the equi_expr.
@@ -933,7 +944,8 @@ impl LogicalPlan {
                     on: new_on,
                     filter: filter_expr,
                     schema: DFSchemaRef::new(schema),
-                    null_equals_null: *null_equals_null,
+                    null_equality: *null_equality,
+                    null_aware: *null_aware,
                 }))
             }
             LogicalPlan::Subquery(Subquery {
@@ -958,13 +970,13 @@ impl LogicalPlan {
             }
             LogicalPlan::Limit(Limit { skip, fetch, .. }) => {
                 let old_expr_len = skip.iter().chain(fetch.iter()).count();
-                if old_expr_len != expr.len() {
-                    return internal_err!(
-                        "Invalid number of new Limit expressions: expected {}, got {}",
-                        old_expr_len,
-                        expr.len()
-                    );
-                }
+                assert_eq_or_internal_err!(
+                    old_expr_len,
+                    expr.len(),
+                    "Invalid number of new Limit expressions: expected {}, got {}",
+                    old_expr_len,
+                    expr.len()
+                );
                 // `LogicalPlan::expressions()` returns in [skip, fetch] order, so we can pop from the end.
                 let new_fetch = fetch.as_ref().and_then(|_| expr.pop());
                 let new_skip = skip.as_ref().and_then(|_| expr.pop());
@@ -988,7 +1000,7 @@ impl LogicalPlan {
                 Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(
                     CreateMemoryTable {
                         input: Arc::new(input),
-                        constraints: Constraints::empty(),
+                        constraints: Constraints::default(),
                         name: name.clone(),
                         if_not_exists: *if_not_exists,
                         or_replace: *or_replace,
@@ -1046,7 +1058,10 @@ impl LogicalPlan {
                         let input = self.only_input(inputs)?;
                         let sort_expr = expr.split_off(on_expr.len() + select_expr.len());
                         let select_expr = expr.split_off(on_expr.len());
-                        assert!(sort_expr.is_empty(), "with_new_exprs for Distinct does not support sort expressions");
+                        assert!(
+                            sort_expr.is_empty(),
+                            "with_new_exprs for Distinct does not support sort expressions"
+                        );
                         Distinct::On(DistinctOn::try_new(
                             expr,
                             select_expr,
@@ -1091,15 +1106,13 @@ impl LogicalPlan {
                 }))
             }
             LogicalPlan::Statement(Statement::Prepare(Prepare {
-                name,
-                data_types,
-                ..
+                name, fields, ..
             })) => {
                 self.assert_no_expressions(expr)?;
                 let input = self.only_input(inputs)?;
                 Ok(LogicalPlan::Statement(Statement::Prepare(Prepare {
                     name: name.clone(),
-                    data_types: data_types.clone(),
+                    fields: fields.clone(),
                     input: Arc::new(input),
                 })))
             }
@@ -1151,45 +1164,49 @@ impl LogicalPlan {
 
     /// Helper for [Self::with_new_exprs] to use when no expressions are expected.
     #[inline]
-    #[allow(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again
+    #[expect(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again
     fn assert_no_expressions(&self, expr: Vec<Expr>) -> Result<()> {
-        if !expr.is_empty() {
-            return internal_err!("{self:?} should have no exprs, got {:?}", expr);
-        }
+        assert_or_internal_err!(
+            expr.is_empty(),
+            "{self:?} should have no exprs, got {:?}",
+            expr
+        );
         Ok(())
     }
 
     /// Helper for [Self::with_new_exprs] to use when no inputs are expected.
     #[inline]
-    #[allow(clippy::needless_pass_by_value)] // inputs is moved intentionally to ensure it's not used again
+    #[expect(clippy::needless_pass_by_value)] // inputs is moved intentionally to ensure it's not used again
     fn assert_no_inputs(&self, inputs: Vec<LogicalPlan>) -> Result<()> {
-        if !inputs.is_empty() {
-            return internal_err!("{self:?} should have no inputs, got: {:?}", inputs);
-        }
+        assert_or_internal_err!(
+            inputs.is_empty(),
+            "{self:?} should have no inputs, got: {:?}",
+            inputs
+        );
         Ok(())
     }
 
     /// Helper for [Self::with_new_exprs] to use when exactly one expression is expected.
     #[inline]
     fn only_expr(&self, mut expr: Vec<Expr>) -> Result<Expr> {
-        if expr.len() != 1 {
-            return internal_err!(
-                "{self:?} should have exactly one expr, got {:?}",
-                expr
-            );
-        }
+        assert_eq_or_internal_err!(
+            expr.len(),
+            1,
+            "{self:?} should have exactly one expr, got {:?}",
+            &expr
+        );
         Ok(expr.remove(0))
     }
 
     /// Helper for [Self::with_new_exprs] to use when exactly one input is expected.
     #[inline]
     fn only_input(&self, mut inputs: Vec<LogicalPlan>) -> Result<LogicalPlan> {
-        if inputs.len() != 1 {
-            return internal_err!(
-                "{self:?} should have exactly one input, got {:?}",
-                inputs
-            );
-        }
+        assert_eq_or_internal_err!(
+            inputs.len(),
+            1,
+            "{self:?} should have exactly one input, got {:?}",
+            &inputs
+        );
         Ok(inputs.remove(0))
     }
 
@@ -1199,12 +1216,12 @@ impl LogicalPlan {
         &self,
         mut inputs: Vec<LogicalPlan>,
     ) -> Result<(LogicalPlan, LogicalPlan)> {
-        if inputs.len() != 2 {
-            return internal_err!(
-                "{self:?} should have exactly two inputs, got {:?}",
-                inputs
-            );
-        }
+        assert_eq_or_internal_err!(
+            inputs.len(),
+            2,
+            "{self:?} should have exactly two inputs, got {:?}",
+            &inputs
+        );
         let right = inputs.remove(1);
         let left = inputs.remove(0);
         Ok((left, right))
@@ -1261,7 +1278,6 @@ impl LogicalPlan {
     ///    \n  TableScan: t1",
     ///    plan.display_indent().to_string()
     ///  );
-    ///
     /// ```
     pub fn with_param_values(
         self,
@@ -1275,7 +1291,7 @@ impl LogicalPlan {
             if let LogicalPlan::Statement(Statement::Prepare(prepare_lp)) =
                 plan_with_values
             {
-                param_values.verify(&prepare_lp.data_types)?;
+                param_values.verify_fields(&prepare_lp.fields)?;
                 // try and take ownership of the input if is not shared, clone otherwise
                 Arc::unwrap_or_clone(prepare_lp.input)
             } else {
@@ -1305,7 +1321,7 @@ impl LogicalPlan {
                 // Empty group_expr will return Some(1)
                 if group_expr
                     .iter()
-                    .all(|expr| matches!(expr, Expr::Literal(_)))
+                    .all(|expr| matches!(expr, Expr::Literal(_, _)))
                 {
                     Some(1)
                 } else {
@@ -1340,7 +1356,9 @@ impl LogicalPlan {
                 JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
                     left.max_rows()
                 }
-                JoinType::RightSemi | JoinType::RightAnti => right.max_rows(),
+                JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+                    right.max_rows()
+                }
             },
             LogicalPlan::Repartition(Repartition { input, .. }) => input.max_rows(),
             LogicalPlan::Union(Union { inputs, .. }) => {
@@ -1454,8 +1472,10 @@ impl LogicalPlan {
                     let original_name = name_preserver.save(&e);
                     let transformed_expr = e.transform_up(|e| {
                         if let Expr::Placeholder(Placeholder { id, .. }) = e {
-                            let value = param_values.get_placeholders_with_values(&id)?;
-                            Ok(Transformed::yes(Expr::Literal(value)))
+                            let (value, metadata) = param_values
+                                .get_placeholders_with_values(&id)?
+                                .into_inner();
+                            Ok(Transformed::yes(Expr::Literal(value, metadata)))
                         } else {
                             Ok(Transformed::no(e))
                         }
@@ -1463,11 +1483,30 @@ impl LogicalPlan {
                     // Preserve name to avoid breaking column references to this expression
                     Ok(transformed_expr.update_data(|expr| original_name.restore(expr)))
                 }
-            })
+            })?
+            .map_data(|plan| plan.update_schema_data_type())
         })
         .map(|res| res.data)
     }
 
+    /// Recompute schema fields' data type after replacing params, ensuring fields data type can be
+    /// updated according to the new parameters.
+    ///
+    /// Unlike `recompute_schema()`, this method rebuilds VALUES plans entirely to properly infer
+    /// types types from literal values after placeholder substitution.
+    fn update_schema_data_type(self) -> Result<LogicalPlan> {
+        match self {
+            // Build `LogicalPlan::Values` from the values for type inference.
+            // We can't use `recompute_schema` because it skips recomputing for
+            // `LogicalPlan::Values`.
+            LogicalPlan::Values(Values { values, schema: _ }) => {
+                LogicalPlanBuilder::values(values)?.build()
+            }
+            // other plans can just use `recompute_schema` directly.
+            plan => plan.recompute_schema(),
+        }
+    }
+
     /// Walk the logical plan, find any `Placeholder` tokens, and return a set of their names.
     pub fn get_parameter_names(&self) -> Result<HashSet<String>> {
         let mut param_names = HashSet::new();
@@ -1485,24 +1524,43 @@ impl LogicalPlan {
     }
 
     /// Walk the logical plan, find any `Placeholder` tokens, and return a map of their IDs and DataTypes
+    ///
+    /// Note that this will drop any extension or field metadata attached to parameters. Use
+    /// [`LogicalPlan::get_parameter_fields`] to keep extension metadata.
     pub fn get_parameter_types(
         &self,
     ) -> Result<HashMap<String, Option<DataType>>, DataFusionError> {
-        let mut param_types: HashMap<String, Option<DataType>> = HashMap::new();
+        let mut parameter_fields = self.get_parameter_fields()?;
+        Ok(parameter_fields
+            .drain()
+            .map(|(name, maybe_field)| {
+                (name, maybe_field.map(|field| field.data_type().clone()))
+            })
+            .collect())
+    }
+
+    /// Walk the logical plan, find any `Placeholder` tokens, and return a map of their IDs and FieldRefs
+    pub fn get_parameter_fields(
+        &self,
+    ) -> Result<HashMap<String, Option<FieldRef>>, DataFusionError> {
+        let mut param_types: HashMap<String, Option<FieldRef>> = HashMap::new();
 
         self.apply_with_subqueries(|plan| {
             plan.apply_expressions(|expr| {
                 expr.apply(|expr| {
-                    if let Expr::Placeholder(Placeholder { id, data_type }) = expr {
+                    if let Expr::Placeholder(Placeholder { id, field }) = expr {
                         let prev = param_types.get(id);
-                        match (prev, data_type) {
-                            (Some(Some(prev)), Some(dt)) => {
-                                if prev != dt {
-                                    plan_err!("Conflicting types for {id}")?;
-                                }
+                        match (prev, field) {
+                            (Some(Some(prev)), Some(field)) => {
+                                check_metadata_with_storage_equal(
+                                    (field.data_type(), Some(field.metadata())),
+                                    (prev.data_type(), Some(prev.metadata())),
+                                    "parameter",
+                                    &format!(": Conflicting types for id {id}"),
+                                )?;
                             }
-                            (_, Some(dt)) => {
-                                param_types.insert(id.clone(), Some(dt.clone()));
+                            (_, Some(field)) => {
+                                param_types.insert(id.clone(), Some(Arc::clone(field)));
                             }
                             _ => {
                                 param_types.insert(id.clone(), None);
@@ -1532,20 +1590,20 @@ impl LogicalPlan {
     /// ```
     ///
     /// ```
-    /// use arrow::datatypes::{Field, Schema, DataType};
-    /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan};
-    /// let schema = Schema::new(vec![
-    ///     Field::new("id", DataType::Int32, false),
-    /// ]);
-    /// let plan = table_scan(Some("t1"), &schema, None).unwrap()
-    ///     .filter(col("id").eq(lit(5))).unwrap()
-    ///     .build().unwrap();
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder};
+    /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+    /// let plan = table_scan(Some("t1"), &schema, None)
+    ///     .unwrap()
+    ///     .filter(col("id").eq(lit(5)))
+    ///     .unwrap()
+    ///     .build()
+    ///     .unwrap();
     ///
     /// // Format using display_indent
     /// let display_string = format!("{}", plan.display_indent());
     ///
-    /// assert_eq!("Filter: t1.id = Int32(5)\n  TableScan: t1",
-    ///             display_string);
+    /// assert_eq!("Filter: t1.id = Int32(5)\n  TableScan: t1", display_string);
     /// ```
     pub fn display_indent(&self) -> impl Display + '_ {
         // Boilerplate structure to wrap LogicalPlan with something
@@ -1574,21 +1632,24 @@ impl LogicalPlan {
     /// ```
     ///
     /// ```
-    /// use arrow::datatypes::{Field, Schema, DataType};
-    /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan};
-    /// let schema = Schema::new(vec![
-    ///     Field::new("id", DataType::Int32, false),
-    /// ]);
-    /// let plan = table_scan(Some("t1"), &schema, None).unwrap()
-    ///     .filter(col("id").eq(lit(5))).unwrap()
-    ///     .build().unwrap();
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder};
+    /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+    /// let plan = table_scan(Some("t1"), &schema, None)
+    ///     .unwrap()
+    ///     .filter(col("id").eq(lit(5)))
+    ///     .unwrap()
+    ///     .build()
+    ///     .unwrap();
     ///
     /// // Format using display_indent_schema
     /// let display_string = format!("{}", plan.display_indent_schema());
     ///
-    /// assert_eq!("Filter: t1.id = Int32(5) [id:Int32]\
+    /// assert_eq!(
+    ///     "Filter: t1.id = Int32(5) [id:Int32]\
     ///             \n  TableScan: t1 [id:Int32]",
-    ///             display_string);
+    ///     display_string
+    /// );
     /// ```
     pub fn display_indent_schema(&self) -> impl Display + '_ {
         // Boilerplate structure to wrap LogicalPlan with something
@@ -1636,14 +1697,15 @@ impl LogicalPlan {
     /// structure, and one with additional details such as schema.
     ///
     /// ```
-    /// use arrow::datatypes::{Field, Schema, DataType};
-    /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan};
-    /// let schema = Schema::new(vec![
-    ///     Field::new("id", DataType::Int32, false),
-    /// ]);
-    /// let plan = table_scan(Some("t1"), &schema, None).unwrap()
-    ///     .filter(col("id").eq(lit(5))).unwrap()
-    ///     .build().unwrap();
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder};
+    /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+    /// let plan = table_scan(Some("t1"), &schema, None)
+    ///     .unwrap()
+    ///     .filter(col("id").eq(lit(5)))
+    ///     .unwrap()
+    ///     .build()
+    ///     .unwrap();
     ///
     /// // Format using display_graphviz
     /// let graphviz_string = format!("{}", plan.display_graphviz());
@@ -1655,7 +1717,6 @@ impl LogicalPlan {
     /// ```bash
     ///   dot -Tpdf < /tmp/example.dot  > /tmp/example.pdf
     /// ```
-    ///
     pub fn display_graphviz(&self) -> impl Display + '_ {
         // Boilerplate structure to wrap LogicalPlan with something
         // that that can be formatted
@@ -1694,13 +1755,13 @@ impl LogicalPlan {
     /// Projection: id
     /// ```
     /// ```
-    /// use arrow::datatypes::{Field, Schema, DataType};
-    /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan};
-    /// let schema = Schema::new(vec![
-    ///     Field::new("id", DataType::Int32, false),
-    /// ]);
-    /// let plan = table_scan(Some("t1"), &schema, None).unwrap()
-    ///     .build().unwrap();
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder};
+    /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+    /// let plan = table_scan(Some("t1"), &schema, None)
+    ///     .unwrap()
+    ///     .build()
+    ///     .unwrap();
     ///
     /// // Format using display
     /// let display_string = format!("{}", plan.display());
@@ -1714,13 +1775,19 @@ impl LogicalPlan {
         impl Display for Wrapper<'_> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
                 match self.0 {
-                    LogicalPlan::EmptyRelation(_) => write!(f, "EmptyRelation"),
+                    LogicalPlan::EmptyRelation(EmptyRelation {
+                        produce_one_row,
+                        schema: _,
+                    }) => {
+                        let rows = if *produce_one_row { 1 } else { 0 };
+                        write!(f, "EmptyRelation: rows={rows}")
+                    }
                     LogicalPlan::RecursiveQuery(RecursiveQuery {
                         is_distinct, ..
                     }) => {
                         write!(f, "RecursiveQuery: is_distinct={is_distinct}")
                     }
-                    LogicalPlan::Values(Values { ref values, .. }) => {
+                    LogicalPlan::Values(Values { values, .. }) => {
                         let str_values: Vec<_> = values
                             .iter()
                             // limit to only 5 values to avoid horrible display
@@ -1740,11 +1807,11 @@ impl LogicalPlan {
                     }
 
                     LogicalPlan::TableScan(TableScan {
-                        ref source,
-                        ref table_name,
-                        ref projection,
-                        ref filters,
-                        ref fetch,
+                        source,
+                        table_name,
+                        projection,
+                        filters,
+                        fetch,
                         ..
                     }) => {
                         let projected_fields = match projection {
@@ -1814,7 +1881,7 @@ impl LogicalPlan {
 
                         Ok(())
                     }
-                    LogicalPlan::Projection(Projection { ref expr, .. }) => {
+                    LogicalPlan::Projection(Projection { expr, .. }) => {
                         write!(f, "Projection:")?;
                         for (i, expr_item) in expr.iter().enumerate() {
                             if i > 0 {
@@ -1840,18 +1907,19 @@ impl LogicalPlan {
                             .collect::<Vec<String>>()
                             .join(", ");
 
-                        write!(f, "CopyTo: format={} output_url={output_url} options: ({op_str})", file_type.get_ext())
+                        write!(
+                            f,
+                            "CopyTo: format={} output_url={output_url} options: ({op_str})",
+                            file_type.get_ext()
+                        )
                     }
                     LogicalPlan::Ddl(ddl) => {
                         write!(f, "{}", ddl.display())
                     }
                     LogicalPlan::Filter(Filter {
-                        predicate: ref expr,
-                        ..
+                        predicate: expr, ..
                     }) => write!(f, "Filter: {expr}"),
-                    LogicalPlan::Window(Window {
-                        ref window_expr, ..
-                    }) => {
+                    LogicalPlan::Window(Window { window_expr, .. }) => {
                         write!(
                             f,
                             "WindowAggr: windowExpr=[[{}]]",
@@ -1859,8 +1927,8 @@ impl LogicalPlan {
                         )
                     }
                     LogicalPlan::Aggregate(Aggregate {
-                        ref group_expr,
-                        ref aggr_expr,
+                        group_expr,
+                        aggr_expr,
                         ..
                     }) => write!(
                         f,
@@ -1883,7 +1951,7 @@ impl LogicalPlan {
                         Ok(())
                     }
                     LogicalPlan::Join(Join {
-                        on: ref keys,
+                        on: keys,
                         filter,
                         join_constraint,
                         join_type,
@@ -1895,20 +1963,26 @@ impl LogicalPlan {
                             .as_ref()
                             .map(|expr| format!(" Filter: {expr}"))
                             .unwrap_or_else(|| "".to_string());
-                        let join_type = if filter.is_none() && keys.is_empty() && matches!(join_type, JoinType::Inner) {
+                        let join_type = if filter.is_none()
+                            && keys.is_empty()
+                            && *join_type == JoinType::Inner
+                        {
                             "Cross".to_string()
                         } else {
                             join_type.to_string()
                         };
                         match join_constraint {
                             JoinConstraint::On => {
-                                write!(
-                                    f,
-                                    "{} Join: {}{}",
-                                    join_type,
-                                    join_expr.join(", "),
-                                    filter_expr
-                                )
+                                write!(f, "{join_type} Join:",)?;
+                                if !join_expr.is_empty() || !filter_expr.is_empty() {
+                                    write!(
+                                        f,
+                                        " {}{}",
+                                        join_expr.join(", "),
+                                        filter_expr
+                                    )?;
+                                }
+                                Ok(())
                             }
                             JoinConstraint::Using => {
                                 write!(
@@ -1952,22 +2026,25 @@ impl LogicalPlan {
                         // Attempt to display `skip` and `fetch` as literals if possible, otherwise as expressions.
                         let skip_str = match limit.get_skip_type() {
                             Ok(SkipType::Literal(n)) => n.to_string(),
-                            _ => limit.skip.as_ref().map_or_else(|| "None".to_string(), |x| x.to_string()),
+                            _ => limit
+                                .skip
+                                .as_ref()
+                                .map_or_else(|| "None".to_string(), |x| x.to_string()),
                         };
                         let fetch_str = match limit.get_fetch_type() {
                             Ok(FetchType::Literal(Some(n))) => n.to_string(),
                             Ok(FetchType::Literal(None)) => "None".to_string(),
-                            _ => limit.fetch.as_ref().map_or_else(|| "None".to_string(), |x| x.to_string())
+                            _ => limit
+                                .fetch
+                                .as_ref()
+                                .map_or_else(|| "None".to_string(), |x| x.to_string()),
                         };
-                        write!(
-                            f,
-                            "Limit: skip={skip_str}, fetch={fetch_str}",
-                        )
+                        write!(f, "Limit: skip={skip_str}, fetch={fetch_str}",)
                     }
                     LogicalPlan::Subquery(Subquery { .. }) => {
                         write!(f, "Subquery:")
                     }
-                    LogicalPlan::SubqueryAlias(SubqueryAlias { ref alias, .. }) => {
+                    LogicalPlan::SubqueryAlias(SubqueryAlias { alias, .. }) => {
                         write!(f, "SubqueryAlias: {alias}")
                     }
                     LogicalPlan::Statement(statement) => {
@@ -1985,7 +2062,11 @@ impl LogicalPlan {
                             "DistinctOn: on_expr=[[{}]], select_expr=[[{}]], sort_expr=[[{}]]",
                             expr_vec_fmt!(on_expr),
                             expr_vec_fmt!(select_expr),
-                            if let Some(sort_expr) = sort_expr { expr_vec_fmt!(sort_expr) } else { "".to_string() },
+                            if let Some(sort_expr) = sort_expr {
+                                expr_vec_fmt!(sort_expr)
+                            } else {
+                                "".to_string()
+                            },
                         ),
                     },
                     LogicalPlan::Explain { .. } => write!(f, "Explain"),
@@ -1998,22 +2079,31 @@ impl LogicalPlan {
                     LogicalPlan::Unnest(Unnest {
                         input: plan,
                         list_type_columns: list_col_indices,
-                        struct_type_columns: struct_col_indices, .. }) => {
+                        struct_type_columns: struct_col_indices,
+                        ..
+                    }) => {
                         let input_columns = plan.schema().columns();
                         let list_type_columns = list_col_indices
                             .iter()
-                            .map(|(i,unnest_info)|
-                                format!("{}|depth={}", &input_columns[*i].to_string(),
-                                unnest_info.depth))
+                            .map(|(i, unnest_info)| {
+                                format!(
+                                    "{}|depth={}",
+                                    &input_columns[*i].to_string(),
+                                    unnest_info.depth
+                                )
+                            })
                             .collect::<Vec<String>>();
                         let struct_type_columns = struct_col_indices
                             .iter()
                             .map(|i| &input_columns[*i])
                             .collect::<Vec<&Column>>();
                         // get items from input_columns indexed by list_col_indices
-                        write!(f, "Unnest: lists[{}] structs[{}]",
-                        expr_vec_fmt!(list_type_columns),
-                        expr_vec_fmt!(struct_type_columns))
+                        write!(
+                            f,
+                            "Unnest: lists[{}] structs[{}]",
+                            expr_vec_fmt!(list_type_columns),
+                            expr_vec_fmt!(struct_type_columns)
+                        )
                     }
                 }
             }
@@ -2034,7 +2124,9 @@ impl ToStringifiedPlan for LogicalPlan {
     }
 }
 
-/// Produces no rows: An empty relation with an empty schema
+/// Relationship produces 0 or 1 placeholder rows with specified output schema
+/// In most cases the output schema for `EmptyRelation` would be empty,
+/// however, it can be non-empty typically for optimizer rules
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct EmptyRelation {
     /// Whether to produce a placeholder row
@@ -2046,7 +2138,10 @@ pub struct EmptyRelation {
 // Manual implementation needed because of `schema` field. Comparison excludes this field.
 impl PartialOrd for EmptyRelation {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        self.produce_one_row.partial_cmp(&other.produce_one_row)
+        self.produce_one_row
+            .partial_cmp(&other.produce_one_row)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2100,7 +2195,10 @@ pub struct Values {
 // Manual implementation needed because of `schema` field. Comparison excludes this field.
 impl PartialOrd for Values {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        self.values.partial_cmp(&other.values)
+        self.values
+            .partial_cmp(&other.values)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2125,6 +2223,8 @@ impl PartialOrd for Projection {
             Some(Ordering::Equal) => self.input.partial_cmp(&other.input),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2145,7 +2245,11 @@ impl Projection {
         if !expr.iter().any(|e| matches!(e, Expr::Wildcard { .. }))
             && expr.len() != schema.fields().len()
         {
-            return plan_err!("Projection has mismatch between number of expressions ({}) and number of fields in schema ({})", expr.len(), schema.fields().len());
+            return plan_err!(
+                "Projection has mismatch between number of expressions ({}) and number of fields in schema ({})",
+                expr.len(),
+                schema.fields().len()
+            );
         }
         Ok(Self {
             expr,
@@ -2173,14 +2277,22 @@ impl Projection {
 ///   will be computed.
 /// * `exprs`: A slice of `Expr` expressions representing the projection operation to apply.
 ///
+/// # Metadata Handling
+///
+/// - **Schema-level metadata**: Passed through unchanged from the input schema
+/// - **Field-level metadata**: Determined by each expression via [`exprlist_to_fields`], which
+///   calls [`Expr::to_field`] to handle expression-specific metadata (literals, aliases, etc.)
+///
 /// # Returns
 ///
 /// A `Result` containing an `Arc<DFSchema>` representing the schema of the result
 /// produced by the projection operation. If the schema computation is successful,
 /// the `Result` will contain the schema; otherwise, it will contain an error.
 pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result<Arc<DFSchema>> {
+    // Preserve input schema metadata at the schema level
     let metadata = input.schema().metadata().clone();
 
+    // Convert expressions to fields with Field properties determined by `Expr::to_field`
     let schema =
         DFSchema::new_with_metadata(exprlist_to_fields(exprs, input)?, metadata)?
             .with_functional_dependencies(calc_func_dependencies_for_project(
@@ -2209,15 +2321,47 @@ impl SubqueryAlias {
         alias: impl Into<TableReference>,
     ) -> Result<Self> {
         let alias = alias.into();
-        let fields = change_redundant_column(plan.schema().fields());
-        let meta_data = plan.schema().as_ref().metadata().clone();
-        let schema: Schema =
-            DFSchema::from_unqualified_fields(fields.into(), meta_data)?.into();
-        // Since schema is the same, other than qualifier, we can use existing
-        // functional dependencies:
+
+        // Since SubqueryAlias will replace all field qualification for the output schema of `plan`,
+        // no field must share the same column name as this would lead to ambiguity when referencing
+        // columns in parent logical nodes.
+
+        // Compute unique aliases, if any, for each column of the input's schema.
+        let aliases = unique_field_aliases(plan.schema().fields());
+        let is_projection_needed = aliases.iter().any(Option::is_some);
+
+        // Insert a projection node, if needed, to make sure aliases are applied.
+        let plan = if is_projection_needed {
+            let projection_expressions = aliases
+                .iter()
+                .zip(plan.schema().iter())
+                .map(|(alias, (qualifier, field))| {
+                    let column =
+                        Expr::Column(Column::new(qualifier.cloned(), field.name()));
+                    match alias {
+                        None => column,
+                        Some(alias) => {
+                            Expr::Alias(Alias::new(column, qualifier.cloned(), alias))
+                        }
+                    }
+                })
+                .collect();
+            let projection = Projection::try_new(projection_expressions, plan)?;
+            Arc::new(LogicalPlan::Projection(projection))
+        } else {
+            plan
+        };
+
+        // Requalify fields with the new `alias`.
+        let fields = plan.schema().fields().clone();
+        let meta_data = plan.schema().metadata().clone();
         let func_dependencies = plan.schema().functional_dependencies().clone();
+
+        let schema = DFSchema::from_unqualified_fields(fields, meta_data)?;
+        let schema = schema.as_arrow();
+
         let schema = DFSchemaRef::new(
-            DFSchema::try_from_qualified_schema(alias.clone(), &schema)?
+            DFSchema::try_from_qualified_schema(alias.clone(), schema)?
                 .with_functional_dependencies(func_dependencies)?,
         );
         Ok(SubqueryAlias {
@@ -2235,6 +2379,8 @@ impl PartialOrd for SubqueryAlias {
             Some(Ordering::Equal) => self.alias.partial_cmp(&other.alias),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2290,12 +2436,12 @@ impl Filter {
         // Note that it is not always possible to resolve the predicate expression during plan
         // construction (such as with correlated subqueries) so we make a best effort here and
         // ignore errors resolving the expression against the schema.
-        if let Ok(predicate_type) = predicate.get_type(input.schema()) {
-            if !Filter::is_allowed_filter_type(&predicate_type) {
-                return plan_err!(
-                    "Cannot create filter with non-boolean predicate '{predicate}' returning {predicate_type}"
-                );
-            }
+        if let Ok(predicate_type) = predicate.get_type(input.schema())
+            && !Filter::is_allowed_filter_type(&predicate_type)
+        {
+            return plan_err!(
+                "Cannot create filter with non-boolean predicate '{predicate}' returning {predicate_type}"
+            );
         }
 
         Ok(Self {
@@ -2422,18 +2568,23 @@ impl Window {
             .iter()
             .enumerate()
             .filter_map(|(idx, expr)| {
-                if let Expr::WindowFunction(WindowFunction {
+                let Expr::WindowFunction(window_fun) = expr else {
+                    return None;
+                };
+                let WindowFunction {
                     fun: WindowFunctionDefinition::WindowUDF(udwf),
                     params: WindowFunctionParams { partition_by, .. },
-                }) = expr
-                {
-                    // When there is no PARTITION BY, row number will be unique
-                    // across the entire table.
-                    if udwf.name() == "row_number" && partition_by.is_empty() {
-                        return Some(idx + input_len);
-                    }
+                } = window_fun.as_ref()
+                else {
+                    return None;
+                };
+                // When there is no PARTITION BY, row number will be unique
+                // across the entire table.
+                if udwf.name() == "row_number" && partition_by.is_empty() {
+                    Some(idx + input_len)
+                } else {
+                    None
                 }
-                None
             })
             .map(|idx| {
                 FunctionalDependence::new(vec![idx], vec![], false)
@@ -2450,6 +2601,20 @@ impl Window {
             window_func_dependencies.extend(new_deps);
         }
 
+        // Validate that FILTER clauses are only used with aggregate window functions
+        if let Some(e) = window_expr.iter().find(|e| {
+            matches!(
+                e,
+                Expr::WindowFunction(wf)
+                    if !matches!(wf.fun, WindowFunctionDefinition::AggregateUDF(_))
+                        && wf.params.filter.is_some()
+            )
+        }) {
+            return plan_err!(
+                "FILTER clause can only be used with aggregate window functions. Found in '{e}'"
+            );
+        }
+
         Self::try_new_with_schema(
             window_expr,
             input,
@@ -2460,16 +2625,22 @@ impl Window {
         )
     }
 
+    /// Create a new window function using the provided schema to avoid the overhead of
+    /// building the schema again when the schema is already known.
+    ///
+    /// This method should only be called when you are absolutely sure that the schema being
+    /// provided is correct for the window function. If in doubt, call [try_new](Self::try_new) instead.
     pub fn try_new_with_schema(
         window_expr: Vec<Expr>,
         input: Arc<LogicalPlan>,
         schema: DFSchemaRef,
     ) -> Result<Self> {
-        if window_expr.len() != schema.fields().len() - input.schema().fields().len() {
+        let input_fields_count = input.schema().fields().len();
+        if schema.fields().len() != input_fields_count + window_expr.len() {
             return plan_err!(
-                "Window has mismatch between number of expressions ({}) and number of fields in schema ({})",
-                window_expr.len(),
-                schema.fields().len() - input.schema().fields().len()
+                "Window schema has wrong number of fields. Expected {} got {}",
+                input_fields_count + window_expr.len(),
+                schema.fields().len()
             );
         }
 
@@ -2484,9 +2655,22 @@ impl Window {
 // Manual implementation needed because of `schema` field. Comparison excludes this field.
 impl PartialOrd for Window {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        match self.input.partial_cmp(&other.input) {
-            Some(Ordering::Equal) => self.window_expr.partial_cmp(&other.window_expr),
-            cmp => cmp,
+        match self.input.partial_cmp(&other.input)? {
+            Ordering::Equal => {} // continue
+            not_equal => return Some(not_equal),
+        }
+
+        match self.window_expr.partial_cmp(&other.window_expr)? {
+            Ordering::Equal => {} // continue
+            not_equal => return Some(not_equal),
+        }
+
+        // Contract for PartialOrd and PartialEq consistency requires that
+        // a == b if and only if partial_cmp(a, b) == Some(Equal).
+        if self == other {
+            Some(Ordering::Equal)
+        } else {
+            None
         }
     }
 }
@@ -2560,7 +2744,10 @@ impl PartialOrd for TableScan {
             filters: &other.filters,
             fetch: &other.fetch,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2603,7 +2790,7 @@ impl TableScan {
                 let df_schema = DFSchema::new_with_metadata(
                     p.iter()
                         .map(|i| {
-                            (Some(table_name.clone()), Arc::new(schema.field(*i).clone()))
+                            (Some(table_name.clone()), Arc::clone(&schema.fields()[*i]))
                         })
                         .collect(),
                     schema.metadata.clone(),
@@ -2648,7 +2835,8 @@ pub struct Union {
 
 impl Union {
     /// Constructs new Union instance deriving schema from inputs.
-    fn try_new(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
+    /// Schema data types must match exactly.
+    pub fn try_new(inputs: Vec<Arc<LogicalPlan>>) -> Result<Self> {
         let schema = Self::derive_schema_from_inputs(&inputs, false, false)?;
         Ok(Union { inputs, schema })
     }
@@ -2693,7 +2881,9 @@ impl Union {
                 {
                     expr.push(Expr::Column(column));
                 } else {
-                    expr.push(Expr::Literal(ScalarValue::Null).alias(column.name()));
+                    expr.push(
+                        Expr::Literal(ScalarValue::Null, None).alias(column.name()),
+                    );
                 }
             }
             wrapped_inputs.push(Arc::new(LogicalPlan::Projection(
@@ -2781,15 +2971,16 @@ impl Union {
 
                     let mut field =
                         Field::new(name, data_type.clone(), final_is_nullable);
-                    field.set_metadata(intersect_maps(unmerged_metadata));
+                    field.set_metadata(intersect_metadata_for_union(unmerged_metadata));
 
                     (None, Arc::new(field))
                 },
             )
             .collect::<Vec<(Option<TableReference>, _)>>();
 
-        let union_schema_metadata =
-            intersect_maps(inputs.iter().map(|input| input.schema().metadata()));
+        let union_schema_metadata = intersect_metadata_for_union(
+            inputs.iter().map(|input| input.schema().metadata()),
+        );
 
         // Functional Dependencies are not preserved after UNION operation
         let schema = DFSchema::new_with_metadata(union_fields, union_schema_metadata)?;
@@ -2858,14 +3049,16 @@ impl Union {
                 };
 
                 let mut field = Field::new(&name, data_type.clone(), nullable);
-                let field_metadata =
-                    intersect_maps(fields.iter().map(|field| field.metadata()));
+                let field_metadata = intersect_metadata_for_union(
+                    fields.iter().map(|field| field.metadata()),
+                );
                 field.set_metadata(field_metadata);
                 Ok((None, Arc::new(field)))
             })
             .collect::<Result<_>>()?;
-        let union_schema_metadata =
-            intersect_maps(inputs.iter().map(|input| input.schema().metadata()));
+        let union_schema_metadata = intersect_metadata_for_union(
+            inputs.iter().map(|input| input.schema().metadata()),
+        );
 
         // Functional Dependencies are not preserved after UNION operation
         let schema = DFSchema::new_with_metadata(union_fields, union_schema_metadata)?;
@@ -2875,25 +3068,13 @@ impl Union {
     }
 }
 
-fn intersect_maps<'a>(
-    inputs: impl IntoIterator<Item = &'a HashMap<String, String>>,
-) -> HashMap<String, String> {
-    let mut inputs = inputs.into_iter();
-    let mut merged: HashMap<String, String> = inputs.next().cloned().unwrap_or_default();
-    for input in inputs {
-        // The extra dereference below (`&*v`) is a workaround for https://github.com/rkyv/rkyv/issues/434.
-        // When this crate is used in a workspace that enables the `rkyv-64` feature in the `chrono` crate,
-        // this triggers a Rust compilation error:
-        // error[E0277]: can't compare `Option<&std::string::String>` with `Option<&mut std::string::String>`.
-        merged.retain(|k, v| input.get(k) == Some(&*v));
-    }
-    merged
-}
-
 // Manual implementation needed because of `schema` field. Comparison excludes this field.
 impl PartialOrd for Union {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        self.inputs.partial_cmp(&other.inputs)
+        self.inputs
+            .partial_cmp(&other.inputs)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -2936,154 +3117,47 @@ impl PartialOrd for DescribeTable {
     }
 }
 
-/// Output formats for controlling for Explain plans
+/// Options for EXPLAIN
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum ExplainFormat {
-    /// Indent mode
-    ///
-    /// Example:
-    /// ```text
-    /// > explain format indent select x from values (1) t(x);
-    /// +---------------+-----------------------------------------------------+
-    /// | plan_type     | plan                                                |
-    /// +---------------+-----------------------------------------------------+
-    /// | logical_plan  | SubqueryAlias: t                                    |
-    /// |               |   Projection: column1 AS x                          |
-    /// |               |     Values: (Int64(1))                              |
-    /// | physical_plan | ProjectionExec: expr=[column1@0 as x]               |
-    /// |               |   DataSourceExec: partitions=1, partition_sizes=[1] |
-    /// |               |                                                     |
-    /// +---------------+-----------------------------------------------------+
-    /// ```
-    Indent,
-    /// Tree mode
-    ///
-    /// Example:
-    /// ```text
-    /// > explain format tree select x from values (1) t(x);
-    /// +---------------+-------------------------------+
-    /// | plan_type     | plan                          |
-    /// +---------------+-------------------------------+
-    /// | physical_plan | ┌───────────────────────────┐ |
-    /// |               | │       ProjectionExec      │ |
-    /// |               | │    --------------------   │ |
-    /// |               | │        x: column1@0       │ |
-    /// |               | └─────────────┬─────────────┘ |
-    /// |               | ┌─────────────┴─────────────┐ |
-    /// |               | │       DataSourceExec      │ |
-    /// |               | │    --------------------   │ |
-    /// |               | │         bytes: 128        │ |
-    /// |               | │       format: memory      │ |
-    /// |               | │          rows: 1          │ |
-    /// |               | └───────────────────────────┘ |
-    /// |               |                               |
-    /// +---------------+-------------------------------+
-    /// ```
-    Tree,
-    /// Postgres Json mode
-    ///
-    /// A displayable structure that produces plan in postgresql JSON format.
-    ///
-    /// Users can use this format to visualize the plan in existing plan
-    /// visualization tools, for example [dalibo](https://explain.dalibo.com/)
-    ///
-    /// Example:
-    /// ```text
-    /// > explain format pgjson select x from values (1) t(x);
-    /// +--------------+--------------------------------------+
-    /// | plan_type    | plan                                 |
-    /// +--------------+--------------------------------------+
-    /// | logical_plan | [                                    |
-    /// |              |   {                                  |
-    /// |              |     "Plan": {                        |
-    /// |              |       "Alias": "t",                  |
-    /// |              |       "Node Type": "Subquery",       |
-    /// |              |       "Output": [                    |
-    /// |              |         "x"                          |
-    /// |              |       ],                             |
-    /// |              |       "Plans": [                     |
-    /// |              |         {                            |
-    /// |              |           "Expressions": [           |
-    /// |              |             "column1 AS x"           |
-    /// |              |           ],                         |
-    /// |              |           "Node Type": "Projection", |
-    /// |              |           "Output": [                |
-    /// |              |             "x"                      |
-    /// |              |           ],                         |
-    /// |              |           "Plans": [                 |
-    /// |              |             {                        |
-    /// |              |               "Node Type": "Values", |
-    /// |              |               "Output": [            |
-    /// |              |                 "column1"            |
-    /// |              |               ],                     |
-    /// |              |               "Plans": [],           |
-    /// |              |               "Values": "(Int64(1))" |
-    /// |              |             }                        |
-    /// |              |           ]                          |
-    /// |              |         }                            |
-    /// |              |       ]                              |
-    /// |              |     }                                |
-    /// |              |   }                                  |
-    /// |              | ]                                    |
-    /// +--------------+--------------------------------------+
-    /// ```
-    PostgresJSON,
-    /// Graphviz mode
-    ///
-    /// Example:
-    /// ```text
-    /// > explain format graphviz select x from values (1) t(x);
-    /// +--------------+------------------------------------------------------------------------+
-    /// | plan_type    | plan                                                                   |
-    /// +--------------+------------------------------------------------------------------------+
-    /// | logical_plan |                                                                        |
-    /// |              | // Begin DataFusion GraphViz Plan,                                     |
-    /// |              | // display it online here: https://dreampuf.github.io/GraphvizOnline   |
-    /// |              |                                                                        |
-    /// |              | digraph {                                                              |
-    /// |              |   subgraph cluster_1                                                   |
-    /// |              |   {                                                                    |
-    /// |              |     graph[label="LogicalPlan"]                                         |
-    /// |              |     2[shape=box label="SubqueryAlias: t"]                              |
-    /// |              |     3[shape=box label="Projection: column1 AS x"]                      |
-    /// |              |     2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]                |
-    /// |              |     4[shape=box label="Values: (Int64(1))"]                            |
-    /// |              |     3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]                |
-    /// |              |   }                                                                    |
-    /// |              |   subgraph cluster_5                                                   |
-    /// |              |   {                                                                    |
-    /// |              |     graph[label="Detailed LogicalPlan"]                                |
-    /// |              |     6[shape=box label="SubqueryAlias: t\nSchema: [x:Int64;N]"]         |
-    /// |              |     7[shape=box label="Projection: column1 AS x\nSchema: [x:Int64;N]"] |
-    /// |              |     6 -> 7 [arrowhead=none, arrowtail=normal, dir=back]                |
-    /// |              |     8[shape=box label="Values: (Int64(1))\nSchema: [column1:Int64;N]"] |
-    /// |              |     7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]                |
-    /// |              |   }                                                                    |
-    /// |              | }                                                                      |
-    /// |              | // End DataFusion GraphViz Plan                                        |
-    /// |              |                                                                        |
-    /// +--------------+------------------------------------------------------------------------+
-    /// ```
-    Graphviz,
+pub struct ExplainOption {
+    /// Include detailed debug info
+    pub verbose: bool,
+    /// Actually execute the plan and report metrics
+    pub analyze: bool,
+    /// Output syntax/format
+    pub format: ExplainFormat,
 }
 
-/// Implement  parsing strings to `ExplainFormat`
-impl FromStr for ExplainFormat {
-    type Err = DataFusionError;
-
-    fn from_str(format: &str) -> std::result::Result<Self, Self::Err> {
-        match format.to_lowercase().as_str() {
-            "indent" => Ok(ExplainFormat::Indent),
-            "tree" => Ok(ExplainFormat::Tree),
-            "pgjson" => Ok(ExplainFormat::PostgresJSON),
-            "graphviz" => Ok(ExplainFormat::Graphviz),
-            _ => {
-                plan_err!("Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. Got '{format}'")
-            }
+impl Default for ExplainOption {
+    fn default() -> Self {
+        ExplainOption {
+            verbose: false,
+            analyze: false,
+            format: ExplainFormat::Indent,
         }
     }
 }
 
+impl ExplainOption {
+    /// Builder‐style setter for `verbose`
+    pub fn with_verbose(mut self, verbose: bool) -> Self {
+        self.verbose = verbose;
+        self
+    }
+
+    /// Builder‐style setter for `analyze`
+    pub fn with_analyze(mut self, analyze: bool) -> Self {
+        self.analyze = analyze;
+        self
+    }
+
+    /// Builder‐style setter for `format`
+    pub fn with_format(mut self, format: ExplainFormat) -> Self {
+        self.format = format;
+        self
+    }
+}
+
 /// Produces a relation with string representations of
 /// various parts of the plan
 ///
@@ -3133,7 +3207,10 @@ impl PartialOrd for Explain {
             stringified_plans: &other.stringified_plans,
             logical_optimization_succeeded: &other.logical_optimization_succeeded,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -3156,6 +3233,8 @@ impl PartialOrd for Analyze {
             Some(Ordering::Equal) => self.input.partial_cmp(&other.input),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -3163,6 +3242,7 @@ impl PartialOrd for Analyze {
 // TODO(clippy): This clippy `allow` should be removed if
 // the manual `PartialEq` is removed in favor of a derive.
 // (see `PartialEq` the impl for details.)
+#[allow(clippy::allow_attributes)]
 #[allow(clippy::derived_hash_with_manual_eq)]
 #[derive(Debug, Clone, Eq, Hash)]
 pub struct Extension {
@@ -3219,7 +3299,7 @@ impl Limit {
     pub fn get_skip_type(&self) -> Result<SkipType> {
         match self.skip.as_deref() {
             Some(expr) => match *expr {
-                Expr::Literal(ScalarValue::Int64(s)) => {
+                Expr::Literal(ScalarValue::Int64(s), _) => {
                     // `skip = NULL` is equivalent to `skip = 0`
                     let s = s.unwrap_or(0);
                     if s >= 0 {
@@ -3239,14 +3319,16 @@ impl Limit {
     pub fn get_fetch_type(&self) -> Result<FetchType> {
         match self.fetch.as_deref() {
             Some(expr) => match *expr {
-                Expr::Literal(ScalarValue::Int64(Some(s))) => {
+                Expr::Literal(ScalarValue::Int64(Some(s)), _) => {
                     if s >= 0 {
                         Ok(FetchType::Literal(Some(s as usize)))
                     } else {
                         plan_err!("LIMIT must be >= 0, '{}' was provided", s)
                     }
                 }
-                Expr::Literal(ScalarValue::Int64(None)) => Ok(FetchType::Literal(None)),
+                Expr::Literal(ScalarValue::Int64(None), _) => {
+                    Ok(FetchType::Literal(None))
+                }
                 _ => Ok(FetchType::UnsupportedExpr),
             },
             None => Ok(FetchType::Literal(None)),
@@ -3381,7 +3463,10 @@ impl PartialOrd for DistinctOn {
             sort_expr: &other.sort_expr,
             input: &other.input,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -3405,7 +3490,9 @@ pub struct Aggregate {
     pub input: Arc<LogicalPlan>,
     /// Grouping expressions
     pub group_expr: Vec<Expr>,
-    /// Aggregate expressions
+    /// Aggregate expressions.
+    ///
+    /// Note these *must* be either [`Expr::AggregateFunction`] or [`Expr::Alias`]
     pub aggr_expr: Vec<Expr>,
     /// The schema description of the aggregate output
     pub schema: DFSchemaRef,
@@ -3458,6 +3545,7 @@ impl Aggregate {
     ///
     /// This method should only be called when you are absolutely sure that the schema being
     /// provided is correct for the aggregate. If in doubt, call [try_new](Self::try_new) instead.
+    #[expect(clippy::needless_pass_by_value)]
     pub fn try_new_with_schema(
         input: Arc<LogicalPlan>,
         group_expr: Vec<Expr>,
@@ -3466,7 +3554,10 @@ impl Aggregate {
     ) -> Result<Self> {
         if group_expr.is_empty() && aggr_expr.is_empty() {
             return plan_err!(
-                "Aggregate requires at least one grouping or aggregate expression"
+                "Aggregate requires at least one grouping or aggregate expression. \
+                Aggregate without grouping expressions nor aggregate expressions is \
+                logically equivalent to, but less efficient than, VALUES producing \
+                single row. Please use VALUES instead."
             );
         }
         let group_expr_count = grouping_set_expr_count(&group_expr)?;
@@ -3565,6 +3656,8 @@ impl PartialOrd for Aggregate {
             }
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -3695,8 +3788,16 @@ pub struct Join {
     pub join_constraint: JoinConstraint,
     /// The output schema, containing fields from the left and right inputs
     pub schema: DFSchemaRef,
-    /// If null_equals_null is true, null == null else null != null
-    pub null_equals_null: bool,
+    /// Defines the null equality for the join.
+    pub null_equality: NullEquality,
+    /// Whether this is a null-aware anti join (for NOT IN semantics).
+    ///
+    /// Only applies to LeftAnti joins. When true, implements SQL NOT IN semantics where:
+    /// - If the right side (subquery) contains any NULL in join keys, no rows are output
+    /// - Left side rows with NULL in join keys are not output
+    ///
+    /// This is required for correct NOT IN subquery behavior with three-valued logic.
+    pub null_aware: bool,
 }
 
 impl Join {
@@ -3713,11 +3814,13 @@ impl Join {
     /// * `filter` - Optional filter expression (for non-equijoin conditions)
     /// * `join_type` - Type of join (Inner, Left, Right, etc.)
     /// * `join_constraint` - Join constraint (On, Using)
-    /// * `null_equals_null` - Whether NULL = NULL in join comparisons
+    /// * `null_equality` - How to handle nulls in join comparisons
+    /// * `null_aware` - Whether this is a null-aware anti join (for NOT IN semantics)
     ///
     /// # Returns
     ///
     /// A new Join operator with the computed schema
+    #[expect(clippy::too_many_arguments)]
     pub fn try_new(
         left: Arc<LogicalPlan>,
         right: Arc<LogicalPlan>,
@@ -3725,7 +3828,8 @@ impl Join {
         filter: Option<Expr>,
         join_type: JoinType,
         join_constraint: JoinConstraint,
-        null_equals_null: bool,
+        null_equality: NullEquality,
+        null_aware: bool,
     ) -> Result<Self> {
         let join_schema = build_join_schema(left.schema(), right.schema(), &join_type)?;
 
@@ -3737,41 +3841,67 @@ impl Join {
             join_type,
             join_constraint,
             schema: Arc::new(join_schema),
-            null_equals_null,
+            null_equality,
+            null_aware,
         })
     }
 
-    /// Create Join with input which wrapped with projection, this method is used to help create physical join.
+    /// Create Join with input which wrapped with projection, this method is used in physical planning only to help
+    /// create the physical join.
     pub fn try_new_with_project_input(
         original: &LogicalPlan,
         left: Arc<LogicalPlan>,
         right: Arc<LogicalPlan>,
         column_on: (Vec<Column>, Vec<Column>),
-    ) -> Result<Self> {
+    ) -> Result<(Self, bool)> {
         let original_join = match original {
             LogicalPlan::Join(join) => join,
             _ => return plan_err!("Could not create join with project input"),
         };
 
+        let mut left_sch = LogicalPlanBuilder::from(Arc::clone(&left));
+        let mut right_sch = LogicalPlanBuilder::from(Arc::clone(&right));
+
+        let mut requalified = false;
+
+        // By definition, the resulting schema of an inner/left/right & full join will have first the left side fields and then the right,
+        // potentially having duplicate field names. Note this will only qualify fields if they have not been qualified before.
+        if original_join.join_type == JoinType::Inner
+            || original_join.join_type == JoinType::Left
+            || original_join.join_type == JoinType::Right
+            || original_join.join_type == JoinType::Full
+        {
+            (left_sch, right_sch, requalified) =
+                requalify_sides_if_needed(left_sch.clone(), right_sch.clone())?;
+        }
+
         let on: Vec<(Expr, Expr)> = column_on
             .0
             .into_iter()
             .zip(column_on.1)
             .map(|(l, r)| (Expr::Column(l), Expr::Column(r)))
             .collect();
-        let join_schema =
-            build_join_schema(left.schema(), right.schema(), &original_join.join_type)?;
 
-        Ok(Join {
-            left,
-            right,
-            on,
-            filter: original_join.filter.clone(),
-            join_type: original_join.join_type,
-            join_constraint: original_join.join_constraint,
-            schema: Arc::new(join_schema),
-            null_equals_null: original_join.null_equals_null,
-        })
+        let join_schema = build_join_schema(
+            left_sch.schema(),
+            right_sch.schema(),
+            &original_join.join_type,
+        )?;
+
+        Ok((
+            Join {
+                left,
+                right,
+                on,
+                filter: original_join.filter.clone(),
+                join_type: original_join.join_type,
+                join_constraint: original_join.join_constraint,
+                schema: Arc::new(join_schema),
+                null_equality: original_join.null_equality,
+                null_aware: original_join.null_aware,
+            },
+            requalified,
+        ))
     }
 }
 
@@ -3792,8 +3922,8 @@ impl PartialOrd for Join {
             pub join_type: &'a JoinType,
             /// Join constraint
             pub join_constraint: &'a JoinConstraint,
-            /// If null_equals_null is true, null == null else null != null
-            pub null_equals_null: &'a bool,
+            /// The null handling behavior for equalities
+            pub null_equality: &'a NullEquality,
         }
         let comparable_self = ComparableJoin {
             left: &self.left,
@@ -3802,7 +3932,7 @@ impl PartialOrd for Join {
             filter: &self.filter,
             join_type: &self.join_type,
             join_constraint: &self.join_constraint,
-            null_equals_null: &self.null_equals_null,
+            null_equality: &self.null_equality,
         };
         let comparable_other = ComparableJoin {
             left: &other.left,
@@ -3811,9 +3941,12 @@ impl PartialOrd for Join {
             filter: &other.filter,
             join_type: &other.join_type,
             join_constraint: &other.join_constraint,
-            null_equals_null: &other.null_equals_null,
+            null_equality: &other.null_equality,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -3978,28 +4111,241 @@ impl PartialOrd for Unnest {
             dependency_indices: &other.dependency_indices,
             options: &other.options,
         };
-        comparable_self.partial_cmp(&comparable_other)
+        comparable_self
+            .partial_cmp(&comparable_other)
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
+    }
+}
+
+impl Unnest {
+    pub fn try_new(
+        input: Arc<LogicalPlan>,
+        exec_columns: Vec<Column>,
+        options: UnnestOptions,
+    ) -> Result<Self> {
+        if exec_columns.is_empty() {
+            return plan_err!("unnest plan requires at least 1 column to unnest");
+        }
+
+        let mut list_columns: Vec<(usize, ColumnUnnestList)> = vec![];
+        let mut struct_columns = vec![];
+        let indices_to_unnest = exec_columns
+            .iter()
+            .map(|c| Ok((input.schema().index_of_column(c)?, c)))
+            .collect::<Result<HashMap<usize, &Column>>>()?;
+
+        let input_schema = input.schema();
+
+        let mut dependency_indices = vec![];
+        // Transform input schema into new schema
+        // Given this comprehensive example
+        //
+        // input schema:
+        // 1.col1_unnest_placeholder: list[list[int]],
+        // 2.col1: list[list[int]]
+        // 3.col2: list[int]
+        // with unnest on unnest(col1,depth=2), unnest(col1,depth=1) and unnest(col2,depth=1)
+        // output schema:
+        // 1.unnest_col1_depth_2: int
+        // 2.unnest_col1_depth_1: list[int]
+        // 3.col1: list[list[int]]
+        // 4.unnest_col2_depth_1: int
+        // Meaning the placeholder column will be replaced by its unnested variation(s), note
+        // the plural.
+        let fields = input_schema
+            .iter()
+            .enumerate()
+            .map(|(index, (original_qualifier, original_field))| {
+                match indices_to_unnest.get(&index) {
+                    Some(column_to_unnest) => {
+                        let recursions_on_column = options
+                            .recursions
+                            .iter()
+                            .filter(|p| -> bool { &p.input_column == *column_to_unnest })
+                            .collect::<Vec<_>>();
+                        let mut transformed_columns = recursions_on_column
+                            .iter()
+                            .map(|r| {
+                                list_columns.push((
+                                    index,
+                                    ColumnUnnestList {
+                                        output_column: r.output_column.clone(),
+                                        depth: r.depth,
+                                    },
+                                ));
+                                Ok(get_unnested_columns(
+                                    &r.output_column.name,
+                                    original_field.data_type(),
+                                    r.depth,
+                                )?
+                                .into_iter()
+                                .next()
+                                .unwrap()) // because unnesting a list column always result into one result
+                            })
+                            .collect::<Result<Vec<(Column, Arc<Field>)>>>()?;
+                        if transformed_columns.is_empty() {
+                            transformed_columns = get_unnested_columns(
+                                &column_to_unnest.name,
+                                original_field.data_type(),
+                                1,
+                            )?;
+                            match original_field.data_type() {
+                                DataType::Struct(_) => {
+                                    struct_columns.push(index);
+                                }
+                                DataType::List(_)
+                                | DataType::FixedSizeList(_, _)
+                                | DataType::LargeList(_)
+                                | DataType::ListView(_)
+                                | DataType::LargeListView(_) => {
+                                    list_columns.push((
+                                        index,
+                                        ColumnUnnestList {
+                                            output_column: Column::from_name(
+                                                &column_to_unnest.name,
+                                            ),
+                                            depth: 1,
+                                        },
+                                    ));
+                                }
+                                _ => {}
+                            };
+                        }
+
+                        // new columns dependent on the same original index
+                        dependency_indices.extend(std::iter::repeat_n(
+                            index,
+                            transformed_columns.len(),
+                        ));
+                        Ok(transformed_columns
+                            .iter()
+                            .map(|(col, field)| {
+                                (col.relation.to_owned(), field.to_owned())
+                            })
+                            .collect())
+                    }
+                    None => {
+                        dependency_indices.push(index);
+                        Ok(vec![(
+                            original_qualifier.cloned(),
+                            Arc::clone(original_field),
+                        )])
+                    }
+                }
+            })
+            .collect::<Result<Vec<_>>>()?
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
+
+        let metadata = input_schema.metadata().clone();
+        let df_schema = DFSchema::new_with_metadata(fields, metadata)?;
+        // We can use the existing functional dependencies:
+        let deps = input_schema.functional_dependencies().clone();
+        let schema = Arc::new(df_schema.with_functional_dependencies(deps)?);
+
+        Ok(Unnest {
+            input,
+            exec_columns,
+            list_type_columns: list_columns,
+            struct_type_columns: struct_columns,
+            dependency_indices,
+            schema,
+            options,
+        })
     }
 }
 
+// Based on data type, either struct or a variant of list
+// return a set of columns as the result of unnesting
+// the input columns.
+// For example, given a column with name "a",
+// - List(Element) returns ["a"] with data type Element
+// - Struct(field1, field2) returns ["a.field1","a.field2"]
+// For list data type, an argument depth is used to specify
+// the recursion level
+fn get_unnested_columns(
+    col_name: &String,
+    data_type: &DataType,
+    depth: usize,
+) -> Result<Vec<(Column, Arc<Field>)>> {
+    let mut qualified_columns = Vec::with_capacity(1);
+
+    match data_type {
+        DataType::List(_)
+        | DataType::FixedSizeList(_, _)
+        | DataType::LargeList(_)
+        | DataType::ListView(_)
+        | DataType::LargeListView(_) => {
+            let data_type = get_unnested_list_datatype_recursive(data_type, depth)?;
+            let new_field = Arc::new(Field::new(
+                col_name, data_type,
+                // Unnesting may produce NULLs even if the list is not null.
+                // For example: unnest([1], []) -> 1, null
+                true,
+            ));
+            let column = Column::from_name(col_name);
+            // let column = Column::from((None, &new_field));
+            qualified_columns.push((column, new_field));
+        }
+        DataType::Struct(fields) => {
+            qualified_columns.extend(fields.iter().map(|f| {
+                let new_name = format!("{}.{}", col_name, f.name());
+                let column = Column::from_name(&new_name);
+                let new_field = f.as_ref().clone().with_name(new_name);
+                // let column = Column::from((None, &f));
+                (column, Arc::new(new_field))
+            }))
+        }
+        _ => {
+            return internal_err!("trying to unnest on invalid data type {data_type}");
+        }
+    };
+    Ok(qualified_columns)
+}
+
+// Get the data type of a multi-dimensional type after unnesting it
+// with a given depth
+fn get_unnested_list_datatype_recursive(
+    data_type: &DataType,
+    depth: usize,
+) -> Result<DataType> {
+    match data_type {
+        DataType::List(field)
+        | DataType::FixedSizeList(field, _)
+        | DataType::LargeList(field)
+        | DataType::ListView(field)
+        | DataType::LargeListView(field) => {
+            if depth == 1 {
+                return Ok(field.data_type().clone());
+            }
+            return get_unnested_list_datatype_recursive(field.data_type(), depth - 1);
+        }
+        _ => {}
+    };
+
+    internal_err!("trying to unnest on invalid data type {data_type}")
+}
+
 #[cfg(test)]
 mod tests {
-
     use super::*;
     use crate::builder::LogicalTableSource;
     use crate::logical_plan::table_scan;
+    use crate::select_expr::SelectExpr;
+    use crate::test::function_stub::{count, count_udaf};
     use crate::{
-        binary_expr, col, exists, in_subquery, lit, placeholder, scalar_subquery,
-        GroupingSet,
+        GroupingSet, binary_expr, col, exists, in_subquery, lit, placeholder,
+        scalar_subquery,
     };
-
+    use datafusion_common::metadata::ScalarAndMetadata;
     use datafusion_common::tree_node::{
         TransformedResult, TreeNodeRewriter, TreeNodeVisitor,
     };
-    use datafusion_common::{not_impl_err, Constraint, ScalarValue};
+    use datafusion_common::{Constraint, ScalarValue, not_impl_err};
     use insta::{assert_debug_snapshot, assert_snapshot};
-
-    use crate::test::function_stub::count;
+    use std::hash::DefaultHasher;
 
     fn employee_schema() -> Schema {
         Schema::new(vec![
@@ -4120,49 +4466,49 @@ mod tests {
         [
           {
             "Plan": {
+              "Node Type": "Projection",
               "Expressions": [
                 "employee_csv.id"
               ],
-              "Node Type": "Projection",
-              "Output": [
-                "id"
-              ],
               "Plans": [
                 {
-                  "Condition": "employee_csv.state IN (<subquery>)",
                   "Node Type": "Filter",
-                  "Output": [
-                    "id",
-                    "state"
-                  ],
+                  "Condition": "employee_csv.state IN (<subquery>)",
                   "Plans": [
                     {
                       "Node Type": "Subquery",
-                      "Output": [
-                        "state"
-                      ],
                       "Plans": [
                         {
                           "Node Type": "TableScan",
+                          "Relation Name": "employee_csv",
+                          "Plans": [],
                           "Output": [
                             "state"
-                          ],
-                          "Plans": [],
-                          "Relation Name": "employee_csv"
+                          ]
                         }
+                      ],
+                      "Output": [
+                        "state"
                       ]
                     },
                     {
                       "Node Type": "TableScan",
+                      "Relation Name": "employee_csv",
+                      "Plans": [],
                       "Output": [
                         "id",
                         "state"
-                      ],
-                      "Plans": [],
-                      "Relation Name": "employee_csv"
+                      ]
                     }
+                  ],
+                  "Output": [
+                    "id",
+                    "state"
                   ]
                 }
+              ],
+              "Output": [
+                "id"
               ]
             }
           }
@@ -4403,6 +4749,63 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_partial_eq_hash_and_partial_ord() {
+        let empty_values = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation {
+            produce_one_row: true,
+            schema: Arc::new(DFSchema::empty()),
+        }));
+
+        let count_window_function = |schema| {
+            Window::try_new_with_schema(
+                vec![Expr::WindowFunction(Box::new(WindowFunction::new(
+                    WindowFunctionDefinition::AggregateUDF(count_udaf()),
+                    vec![],
+                )))],
+                Arc::clone(&empty_values),
+                Arc::new(schema),
+            )
+            .unwrap()
+        };
+
+        let schema_without_metadata = || {
+            DFSchema::from_unqualified_fields(
+                vec![Field::new("count", DataType::Int64, false)].into(),
+                HashMap::new(),
+            )
+            .unwrap()
+        };
+
+        let schema_with_metadata = || {
+            DFSchema::from_unqualified_fields(
+                vec![Field::new("count", DataType::Int64, false)].into(),
+                [("key".to_string(), "value".to_string())].into(),
+            )
+            .unwrap()
+        };
+
+        // A Window
+        let f = count_window_function(schema_without_metadata());
+
+        // Same like `f`, different instance
+        let f2 = count_window_function(schema_without_metadata());
+        assert_eq!(f, f2);
+        assert_eq!(hash(&f), hash(&f2));
+        assert_eq!(f.partial_cmp(&f2), Some(Ordering::Equal));
+
+        // Same like `f`, except for schema metadata
+        let o = count_window_function(schema_with_metadata());
+        assert_ne!(f, o);
+        assert_ne!(hash(&f), hash(&o)); // hash can collide for different values but does not collide in this test
+        assert_eq!(f.partial_cmp(&o), None);
+    }
+
+    fn hash<T: Hash>(value: &T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
+
     #[test]
     fn projection_expr_schema_mismatch() -> Result<()> {
         let empty_schema = Arc::new(DFSchema::empty());
@@ -4477,6 +4880,67 @@ mod tests {
             .expect_err("unexpectedly succeeded to replace an invalid placeholder");
     }
 
+    #[test]
+    fn test_replace_placeholder_mismatched_metadata() {
+        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+
+        // Create a prepared statement with explicit fields that do not have metadata
+        let plan = table_scan(TableReference::none(), &schema, None)
+            .unwrap()
+            .filter(col("id").eq(placeholder("$1")))
+            .unwrap()
+            .build()
+            .unwrap();
+        let prepared_builder = LogicalPlanBuilder::new(plan)
+            .prepare(
+                "".to_string(),
+                vec![Field::new("", DataType::Int32, true).into()],
+            )
+            .unwrap();
+
+        // Attempt to bind a parameter with metadata
+        let mut scalar_meta = HashMap::new();
+        scalar_meta.insert("some_key".to_string(), "some_value".to_string());
+        let param_values = ParamValues::List(vec![ScalarAndMetadata::new(
+            ScalarValue::Int32(Some(42)),
+            Some(scalar_meta.into()),
+        )]);
+        prepared_builder
+            .plan()
+            .clone()
+            .with_param_values(param_values)
+            .expect_err("prepared field metadata mismatch unexpectedly succeeded");
+    }
+
+    #[test]
+    fn test_replace_placeholder_empty_relation_valid_schema() {
+        // SELECT $1, $2;
+        let plan = LogicalPlanBuilder::empty(false)
+            .project(vec![
+                SelectExpr::from(placeholder("$1")),
+                SelectExpr::from(placeholder("$2")),
+            ])
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // original
+        assert_snapshot!(plan.display_indent_schema(), @r"
+        Projection: $1, $2 [$1:Null;N, $2:Null;N]
+          EmptyRelation: rows=0 []
+        ");
+
+        let plan = plan
+            .with_param_values(vec![ScalarValue::from(1i32), ScalarValue::from("s")])
+            .unwrap();
+
+        // replaced
+        assert_snapshot!(plan.display_indent_schema(), @r#"
+        Projection: Int32(1) AS $1, Utf8("s") AS $2 [$1:Int32, $2:Utf8]
+          EmptyRelation: rows=0 []
+        "#);
+    }
+
     #[test]
     fn test_nullable_schema_after_grouping_set() {
         let schema = Schema::new(vec![
@@ -4499,14 +4963,18 @@ mod tests {
 
         let output_schema = plan.schema();
 
-        assert!(output_schema
-            .field_with_name(None, "foo")
-            .unwrap()
-            .is_nullable(),);
-        assert!(output_schema
-            .field_with_name(None, "bar")
-            .unwrap()
-            .is_nullable());
+        assert!(
+            output_schema
+                .field_with_name(None, "foo")
+                .unwrap()
+                .is_nullable(),
+        );
+        assert!(
+            output_schema
+                .field_with_name(None, "bar")
+                .unwrap()
+                .is_nullable()
+        );
     }
 
     #[test]
@@ -4534,7 +5002,7 @@ mod tests {
         let col = schema.field_names()[0].clone();
 
         let filter = Filter::try_new(
-            Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))),
+            Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)), None)),
             scan,
         )
         .unwrap();
@@ -4661,12 +5129,14 @@ mod tests {
                 skip: None,
                 fetch: Some(Box::new(Expr::Literal(
                     ScalarValue::new_ten(&DataType::UInt32).unwrap(),
+                    None,
                 ))),
                 input: Arc::clone(&input),
             }),
             LogicalPlan::Limit(Limit {
                 skip: Some(Box::new(Expr::Literal(
                     ScalarValue::new_ten(&DataType::UInt32).unwrap(),
+                    None,
                 ))),
                 fetch: None,
                 input: Arc::clone(&input),
@@ -4674,9 +5144,11 @@ mod tests {
             LogicalPlan::Limit(Limit {
                 skip: Some(Box::new(Expr::Literal(
                     ScalarValue::new_one(&DataType::UInt32).unwrap(),
+                    None,
                 ))),
                 fetch: Some(Box::new(Expr::Literal(
                     ScalarValue::new_ten(&DataType::UInt32).unwrap(),
+                    None,
                 ))),
                 input,
             }),
@@ -4767,7 +5239,11 @@ mod tests {
             .transform_down_with_subqueries(|plan| {
                 match plan {
                     LogicalPlan::Projection(..) => {
-                        return Ok(Transformed::new(plan, false, TreeNodeRecursion::Jump))
+                        return Ok(Transformed::new(
+                            plan,
+                            false,
+                            TreeNodeRecursion::Jump,
+                        ));
                     }
                     LogicalPlan::Filter(..) => filter_found = true,
                     _ => {}
@@ -4787,7 +5263,7 @@ mod tests {
                                 plan,
                                 false,
                                 TreeNodeRecursion::Jump,
-                            ))
+                            ));
                         }
                         LogicalPlan::Filter(..) => filter_found = true,
                         _ => {}
@@ -4817,7 +5293,11 @@ mod tests {
             fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
                 match node {
                     LogicalPlan::Projection(..) => {
-                        return Ok(Transformed::new(node, false, TreeNodeRecursion::Jump))
+                        return Ok(Transformed::new(
+                            node,
+                            false,
+                            TreeNodeRecursion::Jump,
+                        ));
                     }
                     LogicalPlan::Filter(..) => self.filter_found = true,
                     _ => {}
@@ -4845,7 +5325,7 @@ mod tests {
             .unwrap();
 
         // Check that the placeholder parameters have not received a DataType.
-        let params = plan.get_parameter_types().unwrap();
+        let params = plan.get_parameter_fields().unwrap();
         assert_eq!(params.len(), 1);
 
         let parameter_type = params.clone().get(placeholder_value).unwrap().clone();
@@ -4878,7 +5358,8 @@ mod tests {
                 join_type: JoinType::Inner,
                 join_constraint: JoinConstraint::On,
                 schema: Arc::new(left_schema.join(&right_schema)?),
-                null_equals_null: false,
+                null_equality: NullEquality::NullEqualsNothing,
+                null_aware: false,
             }))
         }
 
@@ -4989,6 +5470,7 @@ mod tests {
                 Some(col("t1.b").gt(col("t2.b"))),
                 join_type,
                 JoinConstraint::On,
+                NullEquality::NullEqualsNothing,
                 false,
             )?;
 
@@ -5099,7 +5581,7 @@ mod tests {
             assert_eq!(join.filter, Some(col("t1.b").gt(col("t2.b"))));
             assert_eq!(join.join_type, join_type);
             assert_eq!(join.join_constraint, JoinConstraint::On);
-            assert!(!join.null_equals_null);
+            assert_eq!(join.null_equality, NullEquality::NullEqualsNothing);
         }
 
         Ok(())
@@ -5134,6 +5616,7 @@ mod tests {
                 None,
                 JoinType::Inner,
                 JoinConstraint::Using,
+                NullEquality::NullEqualsNothing,
                 false,
             )?;
 
@@ -5185,6 +5668,7 @@ mod tests {
                 Some(col("t1.value").lt(col("t2.value"))), // Non-equi filter condition
                 JoinType::Inner,
                 JoinConstraint::On,
+                NullEquality::NullEqualsNothing,
                 false,
             )?;
 
@@ -5234,10 +5718,11 @@ mod tests {
                 None,
                 JoinType::Inner,
                 JoinConstraint::On,
-                true,
+                NullEquality::NullEqualsNull,
+                false,
             )?;
 
-            assert!(join.null_equals_null);
+            assert_eq!(join.null_equality, NullEquality::NullEqualsNull);
         }
 
         Ok(())
@@ -5276,11 +5761,12 @@ mod tests {
                 Some(col("t1.value").gt(lit(5.0))),
                 join_type,
                 JoinConstraint::On,
+                NullEquality::NullEqualsNothing,
                 false,
             )?;
 
             let fields = join.schema.fields();
-            assert_eq!(fields.len(), 6, "Expected 6 fields for {join_type:?} join");
+            assert_eq!(fields.len(), 6, "Expected 6 fields for {join_type} join");
 
             for (i, field) in fields.iter().enumerate() {
                 let expected_nullable = match (i, &join_type) {
@@ -5315,6 +5801,7 @@ mod tests {
             None,
             JoinType::Inner,
             JoinConstraint::Using,
+            NullEquality::NullEqualsNothing,
             false,
         )?;
 
diff --git a/datafusion/expr/src/logical_plan/statement.rs b/datafusion/expr/src/logical_plan/statement.rs
index 72eb6b39bb47b..384d99ca0899e 100644
--- a/datafusion/expr/src/logical_plan/statement.rs
+++ b/datafusion/expr/src/logical_plan/statement.rs
@@ -15,12 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::DataType;
+use arrow::datatypes::FieldRef;
+use datafusion_common::metadata::format_type_and_metadata;
 use datafusion_common::{DFSchema, DFSchemaRef};
+use itertools::Itertools as _;
 use std::fmt::{self, Display};
 use std::sync::{Arc, LazyLock};
 
-use crate::{expr_vec_fmt, Expr, LogicalPlan};
+use crate::{Expr, LogicalPlan, expr_vec_fmt};
 
 /// Various types of Statements.
 ///
@@ -37,6 +39,8 @@ pub enum Statement {
     TransactionEnd(TransactionEnd),
     /// Set a Variable
     SetVariable(SetVariable),
+    /// Reset a Variable
+    ResetVariable(ResetVariable),
     /// Prepare a statement and find any bind parameters
     /// (e.g. `?`). This is used to implement SQL-prepared statements.
     Prepare(Prepare),
@@ -64,6 +68,7 @@ impl Statement {
             Statement::TransactionStart(_) => "TransactionStart",
             Statement::TransactionEnd(_) => "TransactionEnd",
             Statement::SetVariable(_) => "SetVariable",
+            Statement::ResetVariable(_) => "ResetVariable",
             Statement::Prepare(_) => "Prepare",
             Statement::Execute(_) => "Execute",
             Statement::Deallocate(_) => "Deallocate",
@@ -107,10 +112,21 @@ impl Statement {
                     }) => {
                         write!(f, "SetVariable: set {variable:?} to {value:?}")
                     }
-                    Statement::Prepare(Prepare {
-                        name, data_types, ..
-                    }) => {
-                        write!(f, "Prepare: {name:?} {data_types:?}")
+                    Statement::ResetVariable(ResetVariable { variable }) => {
+                        write!(f, "ResetVariable: reset {variable:?}")
+                    }
+                    Statement::Prepare(Prepare { name, fields, .. }) => {
+                        write!(
+                            f,
+                            "Prepare: {name:?} [{}]",
+                            fields
+                                .iter()
+                                .map(|f| format_type_and_metadata(
+                                    f.data_type(),
+                                    Some(f.metadata())
+                                ))
+                                .join(", ")
+                        )
                     }
                     Statement::Execute(Execute {
                         name, parameters, ..
@@ -184,6 +200,12 @@ pub struct SetVariable {
     pub value: String,
 }
 
+/// Reset a configuration variable to its default
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
+pub struct ResetVariable {
+    /// The variable name
+    pub variable: String,
+}
 /// Prepare a statement but do not execute it. Prepare statements can have 0 or more
 /// `Expr::Placeholder` expressions that are filled in during execution
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
@@ -191,7 +213,7 @@ pub struct Prepare {
     /// The name of the statement
     pub name: String,
     /// Data types of the parameters ([`Expr::Placeholder`])
-    pub data_types: Vec<DataType>,
+    pub fields: Vec<FieldRef>,
     /// The logical plan of the statements
     pub input: Arc<LogicalPlan>,
 }
diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs
index 2a290e692a7b7..a1285510da569 100644
--- a/datafusion/expr/src/logical_plan/tree_node.rs
+++ b/datafusion/expr/src/logical_plan/tree_node.rs
@@ -38,20 +38,20 @@
 //! * [`LogicalPlan::expressions`]: Return a copy of the plan's expressions
 
 use crate::{
-    dml::CopyTo, Aggregate, Analyze, CreateMemoryTable, CreateView, DdlStatement,
-    Distinct, DistinctOn, DmlStatement, Execute, Explain, Expr, Extension, Filter, Join,
-    Limit, LogicalPlan, Partitioning, Prepare, Projection, RecursiveQuery, Repartition,
-    Sort, Statement, Subquery, SubqueryAlias, TableScan, Union, Unnest,
-    UserDefinedLogicalNode, Values, Window,
+    Aggregate, Analyze, CreateMemoryTable, CreateView, DdlStatement, Distinct,
+    DistinctOn, DmlStatement, Execute, Explain, Expr, Extension, Filter, Join, Limit,
+    LogicalPlan, Partitioning, Prepare, Projection, RecursiveQuery, Repartition, Sort,
+    Statement, Subquery, SubqueryAlias, TableScan, Union, Unnest, UserDefinedLogicalNode,
+    Values, Window, dml::CopyTo,
 };
 use datafusion_common::tree_node::TreeNodeRefContainer;
 
-use crate::expr::{Exists, InSubquery};
+use crate::expr::{Exists, InSubquery, SetComparison};
 use datafusion_common::tree_node::{
     Transformed, TreeNode, TreeNodeContainer, TreeNodeIterator, TreeNodeRecursion,
     TreeNodeRewriter, TreeNodeVisitor,
 };
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, internal_err};
 
 impl TreeNode for LogicalPlan {
     fn apply_children<'n, F: FnMut(&'n Self) -> Result<TreeNodeRecursion>>(
@@ -132,7 +132,8 @@ impl TreeNode for LogicalPlan {
                 join_type,
                 join_constraint,
                 schema,
-                null_equals_null,
+                null_equality,
+                null_aware,
             }) => (left, right).map_elements(f)?.update_data(|(left, right)| {
                 LogicalPlan::Join(Join {
                     left,
@@ -142,7 +143,8 @@ impl TreeNode for LogicalPlan {
                     join_type,
                     join_constraint,
                     schema,
-                    null_equals_null,
+                    null_equality,
+                    null_aware,
                 })
             }),
             LogicalPlan::Limit(Limit { skip, fetch, input }) => input
@@ -243,6 +245,7 @@ impl TreeNode for LogicalPlan {
                 partition_by,
                 file_type,
                 options,
+                output_schema,
             }) => input.map_elements(f)?.update_data(|input| {
                 LogicalPlan::Copy(CopyTo {
                     input,
@@ -250,6 +253,7 @@ impl TreeNode for LogicalPlan {
                     partition_by,
                     file_type,
                     options,
+                    output_schema,
                 })
             }),
             LogicalPlan::Ddl(ddl) => {
@@ -313,9 +317,9 @@ impl TreeNode for LogicalPlan {
                 LogicalPlan::Unnest(Unnest {
                     input,
                     exec_columns: input_columns,
-                    dependency_indices,
                     list_type_columns,
                     struct_type_columns,
+                    dependency_indices,
                     schema,
                     options,
                 })
@@ -436,11 +440,11 @@ impl LogicalPlan {
                 filters.apply_elements(f)
             }
             LogicalPlan::Unnest(unnest) => {
-                let columns = unnest.exec_columns.clone();
-
-                let exprs = columns
+                let exprs = unnest
+                    .exec_columns
                     .iter()
-                    .map(|c| Expr::Column(c.clone()))
+                    .cloned()
+                    .map(Expr::Column)
                     .collect::<Vec<_>>();
                 exprs.apply_elements(f)
             }
@@ -561,7 +565,8 @@ impl LogicalPlan {
                 join_type,
                 join_constraint,
                 schema,
-                null_equals_null,
+                null_equality,
+                null_aware,
             }) => (on, filter).map_elements(f)?.update_data(|(on, filter)| {
                 LogicalPlan::Join(Join {
                     left,
@@ -571,7 +576,8 @@ impl LogicalPlan {
                     join_type,
                     join_constraint,
                     schema,
-                    null_equals_null,
+                    null_equality,
+                    null_aware,
                 })
             }),
             LogicalPlan::Sort(Sort { expr, input, fetch }) => expr
@@ -813,6 +819,7 @@ impl LogicalPlan {
             expr.apply(|expr| match expr {
                 Expr::Exists(Exists { subquery, .. })
                 | Expr::InSubquery(InSubquery { subquery, .. })
+                | Expr::SetComparison(SetComparison { subquery, .. })
                 | Expr::ScalarSubquery(subquery) => {
                     // use a synthetic plan so the collector sees a
                     // LogicalPlan::Subquery (even though it is
@@ -854,6 +861,22 @@ impl LogicalPlan {
                     })),
                     _ => internal_err!("Transformation should return Subquery"),
                 }),
+                Expr::SetComparison(SetComparison {
+                    expr,
+                    subquery,
+                    op,
+                    quantifier,
+                }) => f(LogicalPlan::Subquery(subquery))?.map_data(|s| match s {
+                    LogicalPlan::Subquery(subquery) => {
+                        Ok(Expr::SetComparison(SetComparison {
+                            expr,
+                            subquery,
+                            op,
+                            quantifier,
+                        }))
+                    }
+                    _ => internal_err!("Transformation should return Subquery"),
+                }),
                 Expr::ScalarSubquery(subquery) => f(LogicalPlan::Subquery(subquery))?
                     .map_data(|s| match s {
                         LogicalPlan::Subquery(subquery) => {
diff --git a/datafusion/expr/src/operation.rs b/datafusion/expr/src/operation.rs
index 6b79a8248b293..3158a19dce449 100644
--- a/datafusion/expr/src/operation.rs
+++ b/datafusion/expr/src/operation.rs
@@ -17,8 +17,8 @@
 
 //! This module contains implementations of operations (unary, binary etc.) for DataFusion expressions.
 
+use crate::expr::{Exists, Expr, InList, InSubquery, Like};
 use crate::expr_fn::binary_expr;
-use crate::{Expr, Like};
 use datafusion_expr_common::operator::Operator;
 use std::ops::{self, Not};
 
@@ -153,6 +153,19 @@ impl Not for Expr {
                 escape_char,
                 case_insensitive,
             )),
+            Expr::InList(InList {
+                expr,
+                list,
+                negated,
+            }) => Expr::InList(InList::new(expr, list, !negated)),
+            Expr::Exists(Exists { subquery, negated }) => {
+                Expr::Exists(Exists::new(subquery, !negated))
+            }
+            Expr::InSubquery(InSubquery {
+                expr,
+                subquery,
+                negated,
+            }) => Expr::InSubquery(InSubquery::new(expr, subquery, !negated)),
             _ => Expr::Not(Box::new(self)),
         }
     }
diff --git a/datafusion/expr/src/partition_evaluator.rs b/datafusion/expr/src/partition_evaluator.rs
index a0f0988b4f4e5..5a4e20e5ac9ac 100644
--- a/datafusion/expr/src/partition_evaluator.rs
+++ b/datafusion/expr/src/partition_evaluator.rs
@@ -18,7 +18,7 @@
 //! Partition evaluation module
 
 use arrow::array::ArrayRef;
-use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, not_impl_err};
 use std::fmt::Debug;
 use std::ops::Range;
 
@@ -86,7 +86,11 @@ use crate::window_state::WindowAggState;
 /// [`uses_window_frame`]: Self::uses_window_frame
 /// [`include_rank`]: Self::include_rank
 /// [`supports_bounded_execution`]: Self::supports_bounded_execution
-pub trait PartitionEvaluator: Debug + Send {
+///
+/// For more background, please also see the [User defined Window Functions in DataFusion blog]
+///
+/// [User defined Window Functions in DataFusion blog]: https://datafusion.apache.org/blog/2025/04/19/user-defined-window-functions
+pub trait PartitionEvaluator: Debug + Send + std::any::Any {
     /// When the window frame has a fixed beginning (e.g UNBOUNDED
     /// PRECEDING), some functions such as FIRST_VALUE, LAST_VALUE and
     /// NTH_VALUE do not need the (unbounded) input once they have
@@ -175,7 +179,7 @@ pub trait PartitionEvaluator: Debug + Send {
     }
 
     /// Evaluate window function on a range of rows in an input
-    /// partition.x
+    /// partition.
     ///
     /// This is the simplest and most general function to implement
     /// but also the least performant as it creates output one row at
@@ -210,7 +214,7 @@ pub trait PartitionEvaluator: Debug + Send {
     ///  A  | 1
     ///  C  | 3
     ///  D  | 4
-    ///  D  | 5
+    ///  D  | 4
     /// ```
     ///
     /// For this case, `num_rows` would be `5` and the
diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs
index 4c03f919312eb..197ac8c035712 100644
--- a/datafusion/expr/src/planner.rs
+++ b/datafusion/expr/src/planner.rs
@@ -20,17 +20,21 @@
 use std::fmt::Debug;
 use std::sync::Arc;
 
-use arrow::datatypes::{DataType, Field, SchemaRef};
-use datafusion_common::{
-    config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema,
-    Result, TableReference,
-};
-use sqlparser::ast::{self, NullTreatment};
-
+use crate::expr::NullTreatment;
+#[cfg(feature = "sql")]
+use crate::logical_plan::LogicalPlan;
 use crate::{
     AggregateUDF, Expr, GetFieldAccess, ScalarUDF, SortExpr, TableSource, WindowFrame,
     WindowFunctionDefinition, WindowUDF,
 };
+use arrow::datatypes::{DataType, Field, FieldRef, SchemaRef};
+use datafusion_common::datatype::DataTypeExt;
+use datafusion_common::{
+    DFSchema, Result, TableReference, config::ConfigOptions,
+    file_options::file_type::FileType, not_impl_err,
+};
+#[cfg(feature = "sql")]
+use sqlparser::ast::{Expr as SQLExpr, Ident, ObjectName, TableAlias, TableFactor};
 
 /// Provides the `SQL` query planner meta-data about tables and
 /// functions referenced in SQL statements, without a direct dependency on the
@@ -84,7 +88,14 @@ pub trait ContextProvider {
         &[]
     }
 
+    /// Return [`RelationPlanner`] extensions for planning table factors
+    #[cfg(feature = "sql")]
+    fn get_relation_planners(&self) -> &[Arc<dyn RelationPlanner>] {
+        &[]
+    }
+
     /// Return [`TypePlanner`] extensions for planning data types
+    #[cfg(feature = "sql")]
     fn get_type_planner(&self) -> Option<Arc<dyn TypePlanner>> {
         None
     }
@@ -103,6 +114,17 @@ pub trait ContextProvider {
     /// A user defined variable is typically accessed via `@var_name`
     fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType>;
 
+    /// Return metadata about a system/user-defined variable, if any.
+    ///
+    /// By default, this wraps [`Self::get_variable_type`] in an Arrow [`Field`]
+    /// with nullable set to `true` and no metadata. Implementations that can
+    /// provide richer information (such as nullability or extension metadata)
+    /// should override this method.
+    fn get_variable_field(&self, variable_names: &[String]) -> Option<FieldRef> {
+        self.get_variable_type(variable_names)
+            .map(|data_type| data_type.into_nullable_field_ref())
+    }
+
     /// Return overall configuration options
     fn options(&self) -> &ConfigOptions;
 
@@ -117,6 +139,10 @@ pub trait ContextProvider {
 }
 
 /// Customize planning of SQL AST expressions to [`Expr`]s
+///
+/// For more background, please also see the [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]
+///
+/// [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]: https://datafusion.apache.org/blog/2026/01/12/extending-sql
 pub trait ExprPlanner: Debug + Send + Sync {
     /// Plan the binary operation between two expressions, returns original
     /// BinaryExpr if not possible
@@ -227,13 +253,6 @@ pub trait ExprPlanner: Debug + Send + Sync {
         )
     }
 
-    /// Plans `ANY` expression, such as `expr = ANY(array_expr)`
-    ///
-    /// Returns origin binary expression if not possible
-    fn plan_any(&self, expr: RawBinaryExpr) -> Result<PlannerResult<RawBinaryExpr>> {
-        Ok(PlannerResult::Original(expr))
-    }
-
     /// Plans aggregate functions, such as `COUNT(<expr>)`
     ///
     /// Returns original expression arguments if not possible
@@ -261,7 +280,10 @@ pub trait ExprPlanner: Debug + Send + Sync {
 /// custom expressions.
 #[derive(Debug, Clone)]
 pub struct RawBinaryExpr {
-    pub op: ast::BinaryOperator,
+    #[cfg(not(feature = "sql"))]
+    pub op: datafusion_expr_common::operator::Operator,
+    #[cfg(feature = "sql")]
+    pub op: sqlparser::ast::BinaryOperator,
     pub left: Expr,
     pub right: Expr,
 }
@@ -294,7 +316,7 @@ pub struct RawAggregateExpr {
     pub args: Vec<Expr>,
     pub distinct: bool,
     pub filter: Option<Box<Expr>>,
-    pub order_by: Option<Vec<SortExpr>>,
+    pub order_by: Vec<SortExpr>,
     pub null_treatment: Option<NullTreatment>,
 }
 
@@ -307,12 +329,13 @@ pub struct RawWindowExpr {
     pub partition_by: Vec<Expr>,
     pub order_by: Vec<SortExpr>,
     pub window_frame: WindowFrame,
+    pub filter: Option<Box<Expr>>,
     pub null_treatment: Option<NullTreatment>,
+    pub distinct: bool,
 }
 
 /// Result of planning a raw expr with [`ExprPlanner`]
 #[derive(Debug, Clone)]
-#[allow(clippy::large_enum_variant)]
 pub enum PlannerResult<T> {
     /// The raw expression was successfully planned as a new [`Expr`]
     Planned(Expr),
@@ -320,12 +343,119 @@ pub enum PlannerResult<T> {
     Original(T),
 }
 
+/// Result of planning a relation with [`RelationPlanner`]
+#[cfg(feature = "sql")]
+#[derive(Debug, Clone)]
+pub struct PlannedRelation {
+    /// The logical plan for the relation
+    pub plan: LogicalPlan,
+    /// Optional table alias for the relation
+    pub alias: Option<TableAlias>,
+}
+
+#[cfg(feature = "sql")]
+impl PlannedRelation {
+    /// Create a new `PlannedRelation` with the given plan and alias
+    pub fn new(plan: LogicalPlan, alias: Option<TableAlias>) -> Self {
+        Self { plan, alias }
+    }
+}
+
+/// Result of attempting to plan a relation with extension planners
+#[cfg(feature = "sql")]
+#[derive(Debug)]
+pub enum RelationPlanning {
+    /// The relation was successfully planned by an extension planner
+    Planned(Box<PlannedRelation>),
+    /// No extension planner handled the relation, return it for default processing
+    Original(Box<TableFactor>),
+}
+
+/// Customize planning SQL table factors to [`LogicalPlan`]s.
+#[cfg(feature = "sql")]
+/// For more background, please also see the [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]
+///
+/// [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]: https://datafusion.apache.org/blog/2026/01/12/extending-sql
+pub trait RelationPlanner: Debug + Send + Sync {
+    /// Plan a table factor into a [`LogicalPlan`].
+    ///
+    /// Returning [`RelationPlanning::Planned`] short-circuits further planning and uses the
+    /// provided plan. Returning [`RelationPlanning::Original`] allows the next registered planner,
+    /// or DataFusion's default logic, to handle the relation.
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        context: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning>;
+}
+
+/// Provides utilities for relation planners to interact with DataFusion's SQL
+/// planner.
+///
+/// This trait provides SQL planning utilities specific to relation planning,
+/// such as converting SQL expressions to logical expressions and normalizing
+/// identifiers. It uses composition to provide access to session context via
+/// [`ContextProvider`].
+#[cfg(feature = "sql")]
+pub trait RelationPlannerContext {
+    /// Provides access to the underlying context provider for reading session
+    /// configuration, accessing tables, functions, and other metadata.
+    fn context_provider(&self) -> &dyn ContextProvider;
+
+    /// Plans the specified relation through the full planner pipeline, starting
+    /// from the first registered relation planner.
+    fn plan(&mut self, relation: TableFactor) -> Result<LogicalPlan>;
+
+    /// Converts a SQL expression into a logical expression using the current
+    /// planner context.
+    fn sql_to_expr(&mut self, expr: SQLExpr, schema: &DFSchema) -> Result<Expr>;
+
+    /// Converts a SQL expression into a logical expression without DataFusion
+    /// rewrites.
+    fn sql_expr_to_logical_expr(
+        &mut self,
+        expr: SQLExpr,
+        schema: &DFSchema,
+    ) -> Result<Expr>;
+
+    /// Normalizes an identifier according to session settings.
+    fn normalize_ident(&self, ident: Ident) -> String;
+
+    /// Normalizes a SQL object name into a [`TableReference`].
+    fn object_name_to_table_reference(&self, name: ObjectName) -> Result<TableReference>;
+}
+
 /// Customize planning SQL types to DataFusion (Arrow) types.
+#[cfg(feature = "sql")]
+/// For more background, please also see the [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]
+///
+/// [Extending SQL in DataFusion: from ->> to TABLESAMPLE blog]: https://datafusion.apache.org/blog/2026/01/12/extending-sql
 pub trait TypePlanner: Debug + Send + Sync {
-    /// Plan SQL [`ast::DataType`] to DataFusion [`DataType`]
+    /// Plan SQL [`sqlparser::ast::DataType`] to DataFusion [`DataType`]
     ///
     /// Returns None if not possible
-    fn plan_type(&self, _sql_type: &ast::DataType) -> Result<Option<DataType>> {
+    #[deprecated(since = "53.0.0", note = "Use plan_type_field()")]
+    fn plan_type(
+        &self,
+        _sql_type: &sqlparser::ast::DataType,
+    ) -> Result<Option<DataType>> {
         Ok(None)
     }
+
+    /// Plan SQL [`sqlparser::ast::DataType`] to DataFusion [`FieldRef`]
+    ///
+    /// Returns None if not possible. Unlike [`Self::plan_type`], `plan_type_field()`
+    /// makes it possible to express extension types (e.g., `arrow.uuid`) or otherwise
+    /// insert metadata into the DataFusion type representation. The default implementation
+    /// falls back on [`Self::plan_type`] for backward compatibility and wraps the result
+    /// in a nullable field reference.
+    fn plan_type_field(
+        &self,
+        sql_type: &sqlparser::ast::DataType,
+    ) -> Result<Option<FieldRef>> {
+        #[expect(deprecated)]
+        Ok(self
+            .plan_type(sql_type)?
+            .map(|data_type| data_type.into_nullable_field_ref()))
+    }
 }
diff --git a/datafusion/expr/src/predicate_bounds.rs b/datafusion/expr/src/predicate_bounds.rs
new file mode 100644
index 0000000000000..992d9f88bb14a
--- /dev/null
+++ b/datafusion/expr/src/predicate_bounds.rs
@@ -0,0 +1,681 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{Between, BinaryExpr, Expr, ExprSchemable};
+use arrow::datatypes::DataType;
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_common::{ExprSchema, Result, ScalarValue};
+use datafusion_expr_common::interval_arithmetic::NullableInterval;
+use datafusion_expr_common::operator::Operator;
+
+/// Computes the output interval for the given boolean expression based on statically
+/// available information.
+///
+/// # Arguments
+///
+/// * `predicate` - The boolean expression to analyze
+/// * `is_null` - A callback function that provides additional nullability information for
+///   expressions. When called with an expression, it should return:
+///   - `Some(true)` if the expression is known to evaluate to NULL
+///   - `Some(false)` if the expression is known to NOT evaluate to NULL
+///   - `None` if the nullability cannot be determined
+///
+///   This callback allows the caller to provide context-specific knowledge about expression
+///   nullability that cannot be determined from the schema alone. For example, it can be used
+///   to indicate that a particular column reference is known to be NULL in a specific context,
+///   or that certain expressions will never be NULL based on runtime constraints.
+///
+/// * `input_schema` - Schema information for resolving expression types and nullability
+///
+/// # Return Value
+///
+/// The function returns a [NullableInterval] that describes the possible boolean values the
+/// predicate can evaluate to.
+///
+pub(super) fn evaluate_bounds(
+    predicate: &Expr,
+    certainly_null_expr: Option<&Expr>,
+    input_schema: &dyn ExprSchema,
+) -> Result<NullableInterval> {
+    let evaluator = PredicateBoundsEvaluator {
+        input_schema,
+        certainly_null_expr,
+    };
+    evaluator.evaluate_bounds(predicate)
+}
+
+struct PredicateBoundsEvaluator<'a> {
+    input_schema: &'a dyn ExprSchema,
+    certainly_null_expr: Option<&'a Expr>,
+}
+
+impl PredicateBoundsEvaluator<'_> {
+    /// Derives the bounds of the given boolean expression
+    fn evaluate_bounds(&self, predicate: &Expr) -> Result<NullableInterval> {
+        Ok(match predicate {
+            Expr::Literal(scalar, _) => {
+                // Interpret literals as boolean, coercing if necessary
+                match scalar {
+                    ScalarValue::Null => NullableInterval::UNKNOWN,
+                    ScalarValue::Boolean(b) => match b {
+                        Some(true) => NullableInterval::TRUE,
+                        Some(false) => NullableInterval::FALSE,
+                        None => NullableInterval::UNKNOWN,
+                    },
+                    _ => {
+                        let b = Expr::Literal(scalar.cast_to(&DataType::Boolean)?, None);
+                        self.evaluate_bounds(&b)?
+                    }
+                }
+            }
+            Expr::IsNull(e) => {
+                // If `e` is not nullable, then `e IS NULL` is provably false
+                if !e.nullable(self.input_schema)? {
+                    NullableInterval::FALSE
+                } else {
+                    match e.get_type(self.input_schema)? {
+                        // If `e` is a boolean expression, check if `e` is provably 'unknown'.
+                        DataType::Boolean => self.evaluate_bounds(e)?.is_unknown()?,
+                        // If `e` is not a boolean expression, check if `e` is provably null
+                        _ => self.is_null(e),
+                    }
+                }
+            }
+            Expr::IsNotNull(e) => {
+                // If `e` is not nullable, then `e IS NOT NULL` is provably true
+                if !e.nullable(self.input_schema)? {
+                    NullableInterval::TRUE
+                } else {
+                    match e.get_type(self.input_schema)? {
+                        // If `e` is a boolean expression, try to evaluate it and test for not unknown
+                        DataType::Boolean => {
+                            self.evaluate_bounds(e)?.is_unknown()?.not()?
+                        }
+                        // If `e` is not a boolean expression, check if `e` is provably null
+                        _ => self.is_null(e).not()?,
+                    }
+                }
+            }
+            Expr::IsTrue(e) => self.evaluate_bounds(e)?.is_true()?,
+            Expr::IsNotTrue(e) => self.evaluate_bounds(e)?.is_true()?.not()?,
+            Expr::IsFalse(e) => self.evaluate_bounds(e)?.is_false()?,
+            Expr::IsNotFalse(e) => self.evaluate_bounds(e)?.is_false()?.not()?,
+            Expr::IsUnknown(e) => self.evaluate_bounds(e)?.is_unknown()?,
+            Expr::IsNotUnknown(e) => self.evaluate_bounds(e)?.is_unknown()?.not()?,
+            Expr::Not(e) => self.evaluate_bounds(e)?.not()?,
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::And,
+                right,
+            }) => NullableInterval::and(
+                &self.evaluate_bounds(left)?,
+                &self.evaluate_bounds(right)?,
+            )?,
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::Or,
+                right,
+            }) => NullableInterval::or(
+                &self.evaluate_bounds(left)?,
+                &self.evaluate_bounds(right)?,
+            )?,
+            e => {
+                let is_null = self.is_null(e);
+
+                // If an expression is null, then it's value is UNKNOWN
+                let maybe_null =
+                    is_null.contains_value(ScalarValue::Boolean(Some(true)))?;
+
+                let maybe_not_null =
+                    is_null.contains_value(ScalarValue::Boolean(Some(false)))?;
+
+                match (maybe_null, maybe_not_null) {
+                    (true, true) | (false, false) => NullableInterval::ANY_TRUTH_VALUE,
+                    (true, false) => NullableInterval::UNKNOWN,
+                    (false, true) => NullableInterval::TRUE_OR_FALSE,
+                }
+            }
+        })
+    }
+
+    /// Determines if the given expression can evaluate to `NULL`.
+    ///
+    /// This method only returns sets containing `TRUE`, `FALSE`, or both.
+    fn is_null(&self, expr: &Expr) -> NullableInterval {
+        // Fast path for literals
+        if let Expr::Literal(scalar, _) = expr {
+            if scalar.is_null() {
+                return NullableInterval::TRUE;
+            } else {
+                return NullableInterval::FALSE;
+            }
+        }
+
+        // If `expr` is not nullable, we can be certain `expr` is not null
+        if let Ok(false) = expr.nullable(self.input_schema) {
+            return NullableInterval::FALSE;
+        }
+
+        // Check if the expression is the `certainly_null_expr` that was passed in.
+        if let Some(certainly_null_expr) = &self.certainly_null_expr
+            && expr.eq(certainly_null_expr)
+        {
+            return NullableInterval::TRUE;
+        }
+
+        // `expr` is nullable, so our default answer for `is null` is going to be `{ TRUE, FALSE }`.
+        // Try to see if we can narrow it down to just one option.
+        match expr {
+            Expr::BinaryExpr(BinaryExpr { op, .. }) if op.returns_null_on_null() => {
+                self.is_null_if_any_child_null(expr)
+            }
+            Expr::Alias(_)
+            | Expr::Cast(_)
+            | Expr::Like(_)
+            | Expr::Negative(_)
+            | Expr::Not(_)
+            | Expr::SimilarTo(_) => self.is_null_if_any_child_null(expr),
+            Expr::Between(Between {
+                expr, low, high, ..
+            }) if self.is_null(expr).is_certainly_true()
+                || (self.is_null(low.as_ref()).is_certainly_true()
+                    && self.is_null(high.as_ref()).is_certainly_true()) =>
+            {
+                // Between is always null if the left side is null
+                // or both the low and high bounds are null
+                NullableInterval::TRUE
+            }
+            _ => NullableInterval::TRUE_OR_FALSE,
+        }
+    }
+
+    fn is_null_if_any_child_null(&self, expr: &Expr) -> NullableInterval {
+        // These expressions are null if any of their direct children is null
+        // If any child is inconclusive, the result for this expression is also inconclusive
+        let mut is_null = NullableInterval::FALSE;
+
+        let _ = expr.apply_children(|child| {
+            let child_is_null = self.is_null(child);
+
+            if child_is_null.contains_value(ScalarValue::Boolean(Some(true)))? {
+                // If a child might be null, then the result may also be null
+                is_null = NullableInterval::TRUE_OR_FALSE;
+            }
+
+            if !child_is_null.contains_value(ScalarValue::Boolean(Some(false)))? {
+                // If the child is never not null, then the result can also never be not null
+                // and we can stop traversing the children
+                is_null = NullableInterval::TRUE;
+                Ok(TreeNodeRecursion::Stop)
+            } else {
+                Ok(TreeNodeRecursion::Continue)
+            }
+        });
+
+        is_null
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::expr::ScalarFunction;
+    use crate::predicate_bounds::evaluate_bounds;
+    use crate::{
+        Expr, binary_expr, col, create_udf, is_false, is_not_false, is_not_null,
+        is_not_true, is_not_unknown, is_null, is_true, is_unknown, lit, not,
+    };
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::{DFSchema, Result, ScalarValue};
+    use datafusion_expr_common::columnar_value::ColumnarValue;
+    use datafusion_expr_common::interval_arithmetic::NullableInterval;
+    use datafusion_expr_common::operator::Operator::{And, Eq, Or};
+    use datafusion_expr_common::signature::Volatility;
+    use std::ops::Neg;
+    use std::sync::Arc;
+
+    fn eval_bounds(predicate: &Expr) -> Result<NullableInterval> {
+        let schema = DFSchema::try_from(Schema::empty())?;
+        evaluate_bounds(predicate, None, &schema)
+    }
+
+    #[test]
+    fn evaluate_bounds_literal() {
+        #[rustfmt::skip]
+        let cases = vec![
+            (lit(ScalarValue::Null), NullableInterval::UNKNOWN),
+            (lit(false), NullableInterval::FALSE),
+            (lit(true), NullableInterval::TRUE),
+            (lit(0), NullableInterval::FALSE),
+            (lit(1), NullableInterval::TRUE),
+            (lit(ScalarValue::Utf8(None)), NullableInterval::UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+
+        assert!(eval_bounds(&lit("foo")).is_err());
+    }
+
+    #[test]
+    fn evaluate_bounds_and() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+        let one = lit(1);
+        let t = lit(true);
+        let f = lit(false);
+        let func = make_scalar_func_expr();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (binary_expr(null.clone(), And, null.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(null.clone(), And, one.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(null.clone(), And, zero.clone()), NullableInterval::FALSE),
+            (binary_expr(one.clone(), And, one.clone()), NullableInterval::TRUE),
+            (binary_expr(one.clone(), And, zero.clone()), NullableInterval::FALSE),
+            (binary_expr(null.clone(), And, t.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(t.clone(), And, null.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(null.clone(), And, f.clone()), NullableInterval::FALSE),
+            (binary_expr(f.clone(), And, null.clone()), NullableInterval::FALSE),
+            (binary_expr(t.clone(), And, t.clone()), NullableInterval::TRUE),
+            (binary_expr(t.clone(), And, f.clone()), NullableInterval::FALSE),
+            (binary_expr(f.clone(), And, t.clone()), NullableInterval::FALSE),
+            (binary_expr(f.clone(), And, f.clone()), NullableInterval::FALSE),
+            (binary_expr(t.clone(), And, func.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (binary_expr(func.clone(), And, t.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (binary_expr(f.clone(), And, func.clone()), NullableInterval::FALSE),
+            (binary_expr(func.clone(), And, f.clone()), NullableInterval::FALSE),
+            (binary_expr(null.clone(), And, func.clone()), NullableInterval::FALSE_OR_UNKNOWN),
+            (binary_expr(func.clone(), And, null.clone()), NullableInterval::FALSE_OR_UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_or() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+        let one = lit(1);
+        let t = lit(true);
+        let f = lit(false);
+        let func = make_scalar_func_expr();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (binary_expr(null.clone(), Or, null.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(null.clone(), Or, one.clone()), NullableInterval::TRUE),
+            (binary_expr(null.clone(), Or, zero.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(one.clone(), Or, one.clone()), NullableInterval::TRUE),
+            (binary_expr(one.clone(), Or, zero.clone()), NullableInterval::TRUE),
+            (binary_expr(null.clone(), Or, t.clone()), NullableInterval::TRUE),
+            (binary_expr(t.clone(), Or, null.clone()), NullableInterval::TRUE),
+            (binary_expr(null.clone(), Or, f.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(f.clone(), Or, null.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(t.clone(), Or, t.clone()), NullableInterval::TRUE),
+            (binary_expr(t.clone(), Or, f.clone()), NullableInterval::TRUE),
+            (binary_expr(f.clone(), Or, t.clone()), NullableInterval::TRUE),
+            (binary_expr(f.clone(), Or, f.clone()), NullableInterval::FALSE),
+            (binary_expr(t.clone(), Or, func.clone()), NullableInterval::TRUE),
+            (binary_expr(func.clone(), Or, t.clone()), NullableInterval::TRUE),
+            (binary_expr(f.clone(), Or, func.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (binary_expr(func.clone(), Or, f.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (binary_expr(null.clone(), Or, func.clone()), NullableInterval::TRUE_OR_UNKNOWN),
+            (binary_expr(func.clone(), Or, null.clone()), NullableInterval::TRUE_OR_UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_not() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+        let one = lit(1);
+        let t = lit(true);
+        let f = lit(false);
+        let func = make_scalar_func_expr();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (not(null.clone()), NullableInterval::UNKNOWN),
+            (not(one.clone()), NullableInterval::FALSE),
+            (not(zero.clone()), NullableInterval::TRUE),
+            (not(t.clone()), NullableInterval::FALSE),
+            (not(f.clone()), NullableInterval::TRUE),
+            (not(func.clone()), NullableInterval::ANY_TRUTH_VALUE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_is() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+        let one = lit(1);
+        let t = lit(true);
+        let f = lit(false);
+        let col = col("col");
+        let nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::UInt8,
+            true,
+        )]))
+        .unwrap();
+        let not_nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::UInt8,
+            false,
+        )]))
+        .unwrap();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (is_null(null.clone()), NullableInterval::TRUE),
+            (is_null(one.clone()), NullableInterval::FALSE),
+            (is_null(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::TRUE),
+            (is_not_null(null.clone()), NullableInterval::FALSE),
+            (is_not_null(one.clone()), NullableInterval::TRUE),
+            (is_not_null(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::FALSE),
+            (is_true(null.clone()), NullableInterval::FALSE),
+            (is_true(t.clone()), NullableInterval::TRUE),
+            (is_true(f.clone()), NullableInterval::FALSE),
+            (is_true(zero.clone()), NullableInterval::FALSE),
+            (is_true(one.clone()), NullableInterval::TRUE),
+            (is_true(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::FALSE),
+            (is_not_true(null.clone()), NullableInterval::TRUE),
+            (is_not_true(t.clone()), NullableInterval::FALSE),
+            (is_not_true(f.clone()), NullableInterval::TRUE),
+            (is_not_true(zero.clone()), NullableInterval::TRUE),
+            (is_not_true(one.clone()), NullableInterval::FALSE),
+            (is_not_true(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::TRUE),
+            (is_false(null.clone()), NullableInterval::FALSE),
+            (is_false(t.clone()), NullableInterval::FALSE),
+            (is_false(f.clone()), NullableInterval::TRUE),
+            (is_false(zero.clone()), NullableInterval::TRUE),
+            (is_false(one.clone()), NullableInterval::FALSE),
+            (is_false(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::FALSE),
+            (is_not_false(null.clone()), NullableInterval::TRUE),
+            (is_not_false(t.clone()), NullableInterval::TRUE),
+            (is_not_false(f.clone()), NullableInterval::FALSE),
+            (is_not_false(zero.clone()), NullableInterval::FALSE),
+            (is_not_false(one.clone()), NullableInterval::TRUE),
+            (is_not_false(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::TRUE),
+            (is_unknown(null.clone()), NullableInterval::TRUE),
+            (is_unknown(t.clone()), NullableInterval::FALSE),
+            (is_unknown(f.clone()), NullableInterval::FALSE),
+            (is_unknown(zero.clone()), NullableInterval::FALSE),
+            (is_unknown(one.clone()), NullableInterval::FALSE),
+            (is_unknown(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::TRUE),
+            (is_not_unknown(null.clone()), NullableInterval::FALSE),
+            (is_not_unknown(t.clone()), NullableInterval::TRUE),
+            (is_not_unknown(f.clone()), NullableInterval::TRUE),
+            (is_not_unknown(zero.clone()), NullableInterval::TRUE),
+            (is_not_unknown(one.clone()), NullableInterval::TRUE),
+            (is_not_unknown(binary_expr(null.clone(), Eq, null.clone())), NullableInterval::FALSE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (is_null(col.clone()), &nullable_schema, NullableInterval::TRUE_OR_FALSE),
+            (is_null(col.clone()), &not_nullable_schema, NullableInterval::FALSE),
+            (is_null(binary_expr(col.clone(), Eq, col.clone())), &nullable_schema, NullableInterval::TRUE_OR_FALSE),
+            (is_null(binary_expr(col.clone(), Eq, col.clone())), &not_nullable_schema, NullableInterval::FALSE),
+            (is_not_null(col.clone()), &nullable_schema, NullableInterval::TRUE_OR_FALSE),
+            (is_not_null(col.clone()), &not_nullable_schema, NullableInterval::TRUE),
+            (is_not_null(binary_expr(col.clone(), Eq, col.clone())), &nullable_schema, NullableInterval::TRUE_OR_FALSE),
+            (is_not_null(binary_expr(col.clone(), Eq, col.clone())), &not_nullable_schema, NullableInterval::TRUE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                evaluate_bounds(&case.0, None, case.1).unwrap(),
+                case.2,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_between() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (zero.clone().between(zero.clone(), zero.clone()), NullableInterval::TRUE_OR_FALSE),
+            (null.clone().between(zero.clone(), zero.clone()), NullableInterval::UNKNOWN),
+            (zero.clone().between(null.clone(), zero.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (zero.clone().between(zero.clone(), null.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (zero.clone().between(null.clone(), null.clone()), NullableInterval::UNKNOWN),
+            (null.clone().between(null.clone(), null.clone()), NullableInterval::UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_binary_op() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+        let col = col("col");
+        let nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::Utf8,
+            true,
+        )]))
+        .unwrap();
+        let not_nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::Utf8,
+            false,
+        )]))
+        .unwrap();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (binary_expr(zero.clone(), Eq, zero.clone()), NullableInterval::TRUE_OR_FALSE),
+            (binary_expr(null.clone(), Eq, zero.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(zero.clone(), Eq, null.clone()), NullableInterval::UNKNOWN),
+            (binary_expr(null.clone(), Eq, null.clone()), NullableInterval::UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (binary_expr(zero.clone(), Eq, col.clone()), NullableInterval::TRUE_OR_FALSE),
+            (binary_expr(col.clone(), Eq, zero.clone()), NullableInterval::TRUE_OR_FALSE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                evaluate_bounds(&case.0, None, &not_nullable_schema).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+
+            assert_eq!(
+                evaluate_bounds(&case.0, None, &nullable_schema).unwrap(),
+                NullableInterval::ANY_TRUTH_VALUE,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_negative() {
+        let null = lit(ScalarValue::Null);
+        let zero = lit(0);
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (zero.clone().neg(), NullableInterval::TRUE_OR_FALSE),
+            (null.clone().neg(), NullableInterval::UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_like() {
+        let null = lit(ScalarValue::Null);
+        let expr = lit("foo");
+        let pattern = lit("f.*");
+        let col = col("col");
+        let nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::Utf8,
+            true,
+        )]))
+        .unwrap();
+        let not_nullable_schema = DFSchema::try_from(Schema::new(vec![Field::new(
+            "col",
+            DataType::Utf8,
+            false,
+        )]))
+        .unwrap();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (expr.clone().like(pattern.clone()), NullableInterval::TRUE_OR_FALSE),
+            (null.clone().like(pattern.clone()), NullableInterval::UNKNOWN),
+            (expr.clone().like(null.clone()), NullableInterval::UNKNOWN),
+            (null.clone().like(null.clone()), NullableInterval::UNKNOWN),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                eval_bounds(&case.0).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+        }
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (col.clone().like(pattern.clone()), NullableInterval::TRUE_OR_FALSE),
+            (expr.clone().like(col.clone()), NullableInterval::TRUE_OR_FALSE),
+        ];
+
+        for case in cases {
+            assert_eq!(
+                evaluate_bounds(&case.0, None, &not_nullable_schema).unwrap(),
+                case.1,
+                "Failed for {}",
+                case.0
+            );
+
+            assert_eq!(
+                evaluate_bounds(&case.0, None, &nullable_schema).unwrap(),
+                NullableInterval::ANY_TRUTH_VALUE,
+                "Failed for {}",
+                case.0
+            );
+        }
+    }
+
+    #[test]
+    fn evaluate_bounds_udf() {
+        let func = make_scalar_func_expr();
+
+        #[rustfmt::skip]
+        let cases = vec![
+            (func.clone(), NullableInterval::ANY_TRUTH_VALUE),
+            (not(func.clone()), NullableInterval::ANY_TRUTH_VALUE),
+            (binary_expr(func.clone(), And, func.clone()), NullableInterval::ANY_TRUTH_VALUE),
+        ];
+
+        for case in cases {
+            assert_eq!(eval_bounds(&case.0).unwrap(), case.1);
+        }
+    }
+
+    fn make_scalar_func_expr() -> Expr {
+        let scalar_func_impl =
+            |_: &[ColumnarValue]| Ok(ColumnarValue::Scalar(ScalarValue::Null));
+        let udf = create_udf(
+            "foo",
+            vec![],
+            DataType::Boolean,
+            Volatility::Stable,
+            Arc::new(scalar_func_impl),
+        );
+        Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), vec![]))
+    }
+}
diff --git a/datafusion/expr/src/preimage.rs b/datafusion/expr/src/preimage.rs
new file mode 100644
index 0000000000000..67ca7a91bbf38
--- /dev/null
+++ b/datafusion/expr/src/preimage.rs
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_expr_common::interval_arithmetic::Interval;
+
+use crate::Expr;
+
+/// Return from [`crate::ScalarUDFImpl::preimage`]
+pub enum PreimageResult {
+    /// No preimage exists for the specified value
+    None,
+    /// The expression always evaluates to the specified constant
+    /// given that `expr` is within the interval
+    Range { expr: Expr, interval: Box<Interval> },
+}
diff --git a/datafusion/expr/src/ptr_eq.rs b/datafusion/expr/src/ptr_eq.rs
new file mode 100644
index 0000000000000..79ea3d7219143
--- /dev/null
+++ b/datafusion/expr/src/ptr_eq.rs
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+use std::hash::{Hash, Hasher};
+use std::ops::Deref;
+use std::sync::Arc;
+
+/// Compares two `Arc` pointers for equality based on their underlying pointers values.
+/// This is not equivalent to [`Arc::ptr_eq`] for fat pointers, see that method
+/// for more information.
+pub fn arc_ptr_eq<T: ?Sized>(a: &Arc<T>, b: &Arc<T>) -> bool {
+    std::ptr::eq(Arc::as_ptr(a), Arc::as_ptr(b))
+}
+
+/// Hashes an `Arc` pointer based on its underlying pointer value.
+/// The general contract for this function is that if [`arc_ptr_eq`] returns `true`
+/// for two `Arc`s, then this function should return the same hash value for both.
+pub fn arc_ptr_hash<T: ?Sized>(a: &Arc<T>, hasher: &mut impl Hasher) {
+    std::ptr::hash(Arc::as_ptr(a), hasher)
+}
+
+/// A wrapper around a pointer that implements `Eq` and `Hash` comparing
+/// the underlying pointer address.
+///
+/// If you have pointers to a `dyn UDF impl` consider using [`super::udf_eq::UdfEq`].
+#[derive(Clone)]
+#[expect(private_bounds)] // This is so that PtrEq can only be used with allowed pointer types (e.g. Arc), without allowing misuse.
+pub struct PtrEq<Ptr: PointerType>(Ptr);
+
+impl<T> PartialEq for PtrEq<Arc<T>>
+where
+    T: ?Sized,
+{
+    fn eq(&self, other: &Self) -> bool {
+        arc_ptr_eq(&self.0, &other.0)
+    }
+}
+impl<T> Eq for PtrEq<Arc<T>> where T: ?Sized {}
+
+impl<T> Hash for PtrEq<Arc<T>>
+where
+    T: ?Sized,
+{
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        arc_ptr_hash(&self.0, state);
+    }
+}
+
+impl<Ptr> From<Ptr> for PtrEq<Ptr>
+where
+    Ptr: PointerType,
+{
+    fn from(ptr: Ptr) -> Self {
+        PtrEq(ptr)
+    }
+}
+
+impl<T> From<PtrEq<Arc<T>>> for Arc<T>
+where
+    T: ?Sized,
+{
+    fn from(wrapper: PtrEq<Arc<T>>) -> Self {
+        wrapper.0
+    }
+}
+
+impl<Ptr> Debug for PtrEq<Ptr>
+where
+    Ptr: PointerType + Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+impl<Ptr> Deref for PtrEq<Ptr>
+where
+    Ptr: PointerType,
+{
+    type Target = Ptr;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+trait PointerType {}
+impl<T> PointerType for Arc<T> where T: ?Sized {}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::hash::DefaultHasher;
+
+    #[test]
+    pub fn test_ptr_eq_wrapper() {
+        let a = Arc::new("Hello".to_string());
+        let b = Arc::new(a.deref().clone());
+        let c = Arc::new("world".to_string());
+
+        let wrapper = PtrEq(Arc::clone(&a));
+        assert_eq!(wrapper, wrapper);
+
+        // same address (equal)
+        assert_eq!(PtrEq(Arc::clone(&a)), PtrEq(Arc::clone(&a)));
+        assert_eq!(hash(PtrEq(Arc::clone(&a))), hash(PtrEq(Arc::clone(&a))));
+
+        // different address, same content (not equal)
+        assert_ne!(PtrEq(Arc::clone(&a)), PtrEq(Arc::clone(&b)));
+
+        // different address, different content (not equal)
+        assert_ne!(PtrEq(Arc::clone(&a)), PtrEq(Arc::clone(&c)));
+    }
+
+    fn hash<T: Hash>(value: T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
+}
diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs
index 4eb49710bcf85..472e065211aac 100644
--- a/datafusion/expr/src/registry.rs
+++ b/datafusion/expr/src/registry.rs
@@ -20,16 +20,22 @@
 use crate::expr_rewriter::FunctionRewrite;
 use crate::planner::ExprPlanner;
 use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF};
-use datafusion_common::{not_impl_err, plan_datafusion_err, HashMap, Result};
+use datafusion_common::{HashMap, Result, not_impl_err, plan_datafusion_err};
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::sync::Arc;
 
 /// A registry knows how to build logical expressions out of user-defined function' names
 pub trait FunctionRegistry {
-    /// Set of all available udfs.
+    /// Returns names of all available scalar user defined functions.
     fn udfs(&self) -> HashSet<String>;
 
+    /// Returns names of all available aggregate user defined functions.
+    fn udafs(&self) -> HashSet<String>;
+
+    /// Returns names of all available window user defined functions.
+    fn udwfs(&self) -> HashSet<String>;
+
     /// Returns a reference to the user defined scalar function (udf) named
     /// `name`.
     fn udf(&self, name: &str) -> Result<Arc<ScalarUDF>>;
@@ -200,4 +206,12 @@ impl FunctionRegistry for MemoryFunctionRegistry {
     fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> {
         vec![]
     }
+
+    fn udafs(&self) -> HashSet<String> {
+        self.udafs.keys().cloned().collect()
+    }
+
+    fn udwfs(&self) -> HashSet<String> {
+        self.udwfs.keys().cloned().collect()
+    }
 }
diff --git a/datafusion/expr/src/select_expr.rs b/datafusion/expr/src/select_expr.rs
index 039df20f397b5..22b9660572a66 100644
--- a/datafusion/expr/src/select_expr.rs
+++ b/datafusion/expr/src/select_expr.rs
@@ -20,7 +20,7 @@ use std::fmt;
 use arrow::datatypes::FieldRef;
 use datafusion_common::{Column, TableReference};
 
-use crate::{expr::WildcardOptions, Expr};
+use crate::{Expr, expr::WildcardOptions};
 
 /// Represents a SELECT expression in a SQL query.
 ///
@@ -44,10 +44,8 @@ use crate::{expr::WildcardOptions, Expr};
 /// let wildcard = SelectExpr::Wildcard(WildcardOptions::default());
 ///
 /// // SELECT mytable.*
-/// let qualified = SelectExpr::QualifiedWildcard(
-///     "mytable".into(),
-///     WildcardOptions::default()
-/// );
+/// let qualified =
+///     SelectExpr::QualifiedWildcard("mytable".into(), WildcardOptions::default());
 ///
 /// // SELECT col1
 /// let expr = SelectExpr::Expression(col("col1").into());
diff --git a/datafusion/expr/src/simplify.rs b/datafusion/expr/src/simplify.rs
index 411dbbdc4034e..8c68067a55a37 100644
--- a/datafusion/expr/src/simplify.rs
+++ b/datafusion/expr/src/simplify.rs
@@ -15,102 +15,103 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Structs and traits to provide the information needed for expression simplification.
+//! Structs to provide the information needed for expression simplification.
+
+use std::sync::Arc;
 
 use arrow::datatypes::DataType;
-use datafusion_common::{DFSchemaRef, DataFusionError, Result};
+use chrono::{DateTime, Utc};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DFSchema, DFSchemaRef, Result};
 
-use crate::{execution_props::ExecutionProps, Expr, ExprSchemable};
+use crate::{Expr, ExprSchemable};
 
-/// Provides the information necessary to apply algebraic simplification to an
-/// [Expr]. See [SimplifyContext] for one concrete implementation.
-///
-/// This trait exists so that other systems can plug schema
-/// information in without having to create `DFSchema` objects. If you
-/// have a [`DFSchemaRef`] you can use [`SimplifyContext`]
-pub trait SimplifyInfo {
-    /// Returns true if this Expr has boolean type
-    fn is_boolean_type(&self, expr: &Expr) -> Result<bool>;
-
-    /// Returns true of this expr is nullable (could possibly be NULL)
-    fn nullable(&self, expr: &Expr) -> Result<bool>;
-
-    /// Returns details needed for partial expression evaluation
-    fn execution_props(&self) -> &ExecutionProps;
-
-    /// Returns data type of this expr needed for determining optimized int type of a value
-    fn get_data_type(&self, expr: &Expr) -> Result<DataType>;
-}
-
-/// Provides simplification information based on DFSchema and
-/// [`ExecutionProps`]. This is the default implementation used by DataFusion
+/// Provides simplification information based on schema, query execution time,
+/// and configuration options.
 ///
 /// # Example
 /// See the `simplify_demo` in the [`expr_api` example]
 ///
-/// [`expr_api` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
+/// [`expr_api` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
 #[derive(Debug, Clone)]
-pub struct SimplifyContext<'a> {
-    schema: Option<DFSchemaRef>,
-    props: &'a ExecutionProps,
+pub struct SimplifyContext {
+    schema: DFSchemaRef,
+    query_execution_start_time: Option<DateTime<Utc>>,
+    config_options: Arc<ConfigOptions>,
 }
 
-impl<'a> SimplifyContext<'a> {
-    /// Create a new SimplifyContext
-    pub fn new(props: &'a ExecutionProps) -> Self {
+impl Default for SimplifyContext {
+    fn default() -> Self {
         Self {
-            schema: None,
-            props,
+            schema: Arc::new(DFSchema::empty()),
+            query_execution_start_time: None,
+            config_options: Arc::new(ConfigOptions::default()),
         }
     }
+}
+
+impl SimplifyContext {
+    /// Set the [`ConfigOptions`] for this context
+    pub fn with_config_options(mut self, config_options: Arc<ConfigOptions>) -> Self {
+        self.config_options = config_options;
+        self
+    }
 
-    /// Register a [`DFSchemaRef`] with this context
+    /// Set the schema for this context
     pub fn with_schema(mut self, schema: DFSchemaRef) -> Self {
-        self.schema = Some(schema);
+        self.schema = schema;
         self
     }
-}
 
-impl SimplifyInfo for SimplifyContext<'_> {
-    /// Returns true if this Expr has boolean type
-    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
-        if let Some(schema) = &self.schema {
-            if let Ok(DataType::Boolean) = expr.get_type(schema) {
-                return Ok(true);
-            }
-        }
+    /// Set the query execution start time
+    pub fn with_query_execution_start_time(
+        mut self,
+        query_execution_start_time: Option<DateTime<Utc>>,
+    ) -> Self {
+        self.query_execution_start_time = query_execution_start_time;
+        self
+    }
 
-        Ok(false)
+    /// Set the query execution start to the current time
+    pub fn with_current_time(mut self) -> Self {
+        self.query_execution_start_time = Some(Utc::now());
+        self
+    }
+
+    /// Returns the schema
+    pub fn schema(&self) -> &DFSchemaRef {
+        &self.schema
+    }
+
+    /// Returns true if this Expr has boolean type
+    pub fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+        Ok(expr.get_type(&self.schema)? == DataType::Boolean)
     }
 
     /// Returns true if expr is nullable
-    fn nullable(&self, expr: &Expr) -> Result<bool> {
-        let schema = self.schema.as_ref().ok_or_else(|| {
-            DataFusionError::Internal(
-                "attempt to get nullability without schema".to_string(),
-            )
-        })?;
-        expr.nullable(schema.as_ref())
+    pub fn nullable(&self, expr: &Expr) -> Result<bool> {
+        expr.nullable(self.schema.as_ref())
     }
 
     /// Returns data type of this expr needed for determining optimized int type of a value
-    fn get_data_type(&self, expr: &Expr) -> Result<DataType> {
-        let schema = self.schema.as_ref().ok_or_else(|| {
-            DataFusionError::Internal(
-                "attempt to get data type without schema".to_string(),
-            )
-        })?;
-        expr.get_type(schema)
+    pub fn get_data_type(&self, expr: &Expr) -> Result<DataType> {
+        expr.get_type(&self.schema)
+    }
+
+    /// Returns the time at which the query execution started.
+    /// If `None`, time-dependent functions like `now()` will not be simplified.
+    pub fn query_execution_start_time(&self) -> Option<DateTime<Utc>> {
+        self.query_execution_start_time
     }
 
-    fn execution_props(&self) -> &ExecutionProps {
-        self.props
+    /// Returns the configuration options for the session.
+    pub fn config_options(&self) -> &Arc<ConfigOptions> {
+        &self.config_options
     }
 }
 
 /// Was the expression simplified?
 #[derive(Debug)]
-#[allow(clippy::large_enum_variant)]
 pub enum ExprSimplifyResult {
     /// The function call was simplified to an entirely new Expr
     Simplified(Expr),
diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs
index d6155cfb5dc02..d3b253c0e102c 100644
--- a/datafusion/expr/src/table_source.rs
+++ b/datafusion/expr/src/table_source.rs
@@ -32,7 +32,7 @@ use std::{any::Any, borrow::Cow};
 /// the filter") are returned. Rows that evaluate to `false` or `NULL` are
 /// omitted.
 ///
-/// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan
+/// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html#tymethod.scan
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum TableProviderFilterPushDown {
     /// The filter cannot be used by the provider and will not be pushed down.
@@ -89,7 +89,7 @@ impl std::fmt::Display for TableType {
 /// plan code be dependent on the DataFusion execution engine. Some projects use
 /// DataFusion's logical plans and have their own execution engine.
 ///
-/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html
+/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html
 /// [`DefaultTableSource`]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html
 pub trait TableSource: Sync + Send {
     fn as_any(&self) -> &dyn Any;
@@ -121,7 +121,7 @@ pub trait TableSource: Sync + Send {
     /// Get the Logical plan of this table provider, if available.
     ///
     /// For example, a view may have a logical plan, but a CSV file does not.
-    fn get_logical_plan(&self) -> Option<Cow<LogicalPlan>> {
+    fn get_logical_plan(&'_ self) -> Option<Cow<'_, LogicalPlan>> {
         None
     }
 
diff --git a/datafusion/expr/src/test/function_stub.rs b/datafusion/expr/src/test/function_stub.rs
index 673908a4d7e7d..26ac16d90d69f 100644
--- a/datafusion/expr/src/test/function_stub.rs
+++ b/datafusion/expr/src/test/function_stub.rs
@@ -22,23 +22,25 @@
 use std::any::Any;
 
 use arrow::datatypes::{
-    DataType, FieldRef, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
+    DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION,
+    DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
+    DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DataType, FieldRef,
 };
 
-use datafusion_common::{exec_err, not_impl_err, utils::take_function_args, Result};
+use datafusion_common::plan_err;
+use datafusion_common::{Result, exec_err, not_impl_err, utils::take_function_args};
 
-use crate::type_coercion::aggregates::{avg_return_type, coerce_avg_type, NUMERICS};
 use crate::Volatility::Immutable;
+use crate::type_coercion::aggregates::NUMERICS;
 use crate::{
+    Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature,
     expr::AggregateFunction,
     function::{AccumulatorArgs, StateFieldsArgs},
     utils::AggregateOrderSensitivity,
-    Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature,
 };
 
 macro_rules! create_func {
     ($UDAF:ty, $AGGREGATE_UDF_FN:ident) => {
-        paste::paste! {
             #[doc = concat!("AggregateFunction that returns a [AggregateUDF](crate::AggregateUDF) for [`", stringify!($UDAF), "`]")]
             pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<crate::AggregateUDF> {
                 // Singleton instance of [$UDAF], ensures the UDAF is only created once
@@ -48,7 +50,6 @@ macro_rules! create_func {
                     });
                 std::sync::Arc::clone(&INSTANCE)
             }
-        }
     }
 }
 
@@ -60,7 +61,7 @@ pub fn sum(expr: Expr) -> Expr {
         vec![expr],
         false,
         None,
-        None,
+        vec![],
         None,
     ))
 }
@@ -73,7 +74,7 @@ pub fn count(expr: Expr) -> Expr {
         vec![expr],
         false,
         None,
-        None,
+        vec![],
         None,
     ))
 }
@@ -86,13 +87,13 @@ pub fn avg(expr: Expr) -> Expr {
         vec![expr],
         false,
         None,
-        None,
+        vec![],
         None,
     ))
 }
 
 /// Stub `sum` used for optimizer testing
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Sum {
     signature: Signature,
 }
@@ -135,13 +136,14 @@ impl AggregateUDFImpl for Sum {
                 DataType::Dictionary(_, v) => coerced_type(v),
                 // in the spark, the result type is DECIMAL(min(38,precision+10), s)
                 // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
-                DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => {
-                    Ok(data_type.clone())
-                }
+                DataType::Decimal32(_, _)
+                | DataType::Decimal64(_, _)
+                | DataType::Decimal128(_, _)
+                | DataType::Decimal256(_, _) => Ok(data_type.clone()),
                 dt if dt.is_signed_integer() => Ok(DataType::Int64),
                 dt if dt.is_unsigned_integer() => Ok(DataType::UInt64),
                 dt if dt.is_floating() => Ok(DataType::Float64),
-                _ => exec_err!("Sum not supported for {}", data_type),
+                _ => exec_err!("Sum not supported for {data_type}"),
             }
         }
 
@@ -153,6 +155,18 @@ impl AggregateUDFImpl for Sum {
             DataType::Int64 => Ok(DataType::Int64),
             DataType::UInt64 => Ok(DataType::UInt64),
             DataType::Float64 => Ok(DataType::Float64),
+            DataType::Decimal32(precision, scale) => {
+                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
+                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+                let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal32(new_precision, *scale))
+            }
+            DataType::Decimal64(precision, scale) => {
+                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
+                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+                let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal64(new_precision, *scale))
+            }
             DataType::Decimal128(precision, scale) => {
                 // in the spark, the result type is DECIMAL(min(38,precision+10), s)
                 // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
@@ -179,10 +193,6 @@ impl AggregateUDFImpl for Sum {
         unreachable!("stub should not have state_fields()")
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
         false
     }
@@ -204,6 +214,7 @@ impl AggregateUDFImpl for Sum {
 }
 
 /// Testing stub implementation of COUNT aggregate
+#[derive(PartialEq, Eq, Hash)]
 pub struct Count {
     signature: Signature,
     aliases: Vec<String>,
@@ -286,12 +297,13 @@ pub fn min(expr: Expr) -> Expr {
         vec![expr],
         false,
         None,
-        None,
+        vec![],
         None,
     ))
 }
 
 /// Testing stub implementation of Min aggregate
+#[derive(PartialEq, Eq, Hash)]
 pub struct Min {
     signature: Signature,
 }
@@ -344,10 +356,6 @@ impl AggregateUDFImpl for Min {
         not_impl_err!("no impl for stub")
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn create_groups_accumulator(
         &self,
         _args: AccumulatorArgs,
@@ -371,12 +379,13 @@ pub fn max(expr: Expr) -> Expr {
         vec![expr],
         false,
         None,
-        None,
+        vec![],
         None,
     ))
 }
 
 /// Testing stub implementation of MAX aggregate
+#[derive(PartialEq, Eq, Hash)]
 pub struct Max {
     signature: Signature,
 }
@@ -429,10 +438,6 @@ impl AggregateUDFImpl for Max {
         not_impl_err!("no impl for stub")
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn create_groups_accumulator(
         &self,
         _args: AccumulatorArgs,
@@ -449,7 +454,7 @@ impl AggregateUDFImpl for Max {
 }
 
 /// Testing stub implementation of avg aggregate
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Avg {
     signature: Signature,
     aliases: Vec<String>,
@@ -483,8 +488,61 @@ impl AggregateUDFImpl for Avg {
         &self.signature
     }
 
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        let [args] = take_function_args(self.name(), arg_types)?;
+
+        // Supported types smallint, int, bigint, real, double precision, decimal, or interval
+        // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
+        fn coerced_type(data_type: &DataType) -> Result<DataType> {
+            match &data_type {
+                DataType::Decimal32(p, s) => Ok(DataType::Decimal32(*p, *s)),
+                DataType::Decimal64(p, s) => Ok(DataType::Decimal64(*p, *s)),
+                DataType::Decimal128(p, s) => Ok(DataType::Decimal128(*p, *s)),
+                DataType::Decimal256(p, s) => Ok(DataType::Decimal256(*p, *s)),
+                d if d.is_numeric() => Ok(DataType::Float64),
+                DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
+                DataType::Dictionary(_, v) => coerced_type(v.as_ref()),
+                _ => {
+                    plan_err!("Avg does not support inputs of type {data_type}.")
+                }
+            }
+        }
+        Ok(vec![coerced_type(args)?])
+    }
+
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        avg_return_type(self.name(), &arg_types[0])
+        match &arg_types[0] {
+            DataType::Decimal32(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL32_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal32(new_precision, new_scale))
+            }
+            DataType::Decimal64(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL64_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal64(new_precision, new_scale))
+            }
+            DataType::Decimal128(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL128_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal128(new_precision, new_scale))
+            }
+            DataType::Decimal256(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL256_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal256(new_precision, new_scale))
+            }
+            DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
+            _ => Ok(DataType::Float64),
+        }
     }
 
     fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
@@ -494,11 +552,8 @@ impl AggregateUDFImpl for Avg {
     fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         not_impl_err!("no impl for stub")
     }
+
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        coerce_avg_type(self.name(), arg_types)
-    }
 }
diff --git a/datafusion/expr/src/tree_node.rs b/datafusion/expr/src/tree_node.rs
index f20dab7e165fc..f3bec6bbf9954 100644
--- a/datafusion/expr/src/tree_node.rs
+++ b/datafusion/expr/src/tree_node.rs
@@ -17,17 +17,17 @@
 
 //! Tree node implementation for Logical Expressions
 
+use crate::Expr;
 use crate::expr::{
     AggregateFunction, AggregateFunctionParams, Alias, Between, BinaryExpr, Case, Cast,
-    GroupingSet, InList, InSubquery, Like, Placeholder, ScalarFunction, TryCast, Unnest,
-    WindowFunction, WindowFunctionParams,
+    GroupingSet, InList, InSubquery, Like, Placeholder, ScalarFunction, SetComparison,
+    TryCast, Unnest, WindowFunction, WindowFunctionParams,
 };
-use crate::{Expr, ExprFunctionExt};
 
+use datafusion_common::Result;
 use datafusion_common::tree_node::{
     Transformed, TreeNode, TreeNodeContainer, TreeNodeRecursion, TreeNodeRefContainer,
 };
-use datafusion_common::Result;
 
 /// Implementation of the [`TreeNode`] trait
 ///
@@ -58,7 +58,8 @@ impl TreeNode for Expr {
             | Expr::Negative(expr)
             | Expr::Cast(Cast { expr, .. })
             | Expr::TryCast(TryCast { expr, .. })
-            | Expr::InSubquery(InSubquery { expr, .. }) => expr.apply_elements(f),
+            | Expr::InSubquery(InSubquery { expr, .. })
+            | Expr::SetComparison(SetComparison { expr, .. }) => expr.apply_elements(f),
             Expr::GroupingSet(GroupingSet::Rollup(exprs))
             | Expr::GroupingSet(GroupingSet::Cube(exprs)) => exprs.apply_elements(f),
             Expr::ScalarFunction(ScalarFunction { args, .. }) => {
@@ -73,7 +74,7 @@ impl TreeNode for Expr {
             // Treat OuterReferenceColumn as a leaf expression
             | Expr::OuterReferenceColumn(_, _)
             | Expr::ScalarVariable(_, _)
-            | Expr::Literal(_)
+            | Expr::Literal(_, _)
             | Expr::Exists { .. }
             | Expr::ScalarSubquery(_)
             | Expr::Wildcard { .. }
@@ -92,14 +93,17 @@ impl TreeNode for Expr {
                 (expr, when_then_expr, else_expr).apply_ref_elements(f),
             Expr::AggregateFunction(AggregateFunction { params: AggregateFunctionParams { args, filter, order_by, ..}, .. }) =>
                 (args, filter, order_by).apply_ref_elements(f),
-            Expr::WindowFunction(WindowFunction {
-                params : WindowFunctionParams {
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunctionParams {
                     args,
                     partition_by,
                     order_by,
-                    ..}, ..}) => {
-                (args, partition_by, order_by).apply_ref_elements(f)
+                    filter,
+                    ..
+                } = &window_fun.as_ref().params;
+                (args, partition_by, order_by, filter).apply_ref_elements(f)
             }
+
             Expr::InList(InList { expr, list, .. }) => {
                 (expr, list).apply_ref_elements(f)
             }
@@ -124,7 +128,20 @@ impl TreeNode for Expr {
             | Expr::Exists { .. }
             | Expr::ScalarSubquery(_)
             | Expr::ScalarVariable(_, _)
-            | Expr::Literal(_) => Transformed::no(self),
+            | Expr::Literal(_, _) => Transformed::no(self),
+            Expr::SetComparison(SetComparison {
+                expr,
+                subquery,
+                op,
+                quantifier,
+            }) => expr.map_elements(f)?.update_data(|expr| {
+                Expr::SetComparison(SetComparison {
+                    expr,
+                    subquery,
+                    op,
+                    quantifier,
+                })
+            }),
             Expr::Unnest(Unnest { expr, .. }) => expr
                 .map_elements(f)?
                 .update_data(|expr| Expr::Unnest(Unnest { expr })),
@@ -217,12 +234,12 @@ impl TreeNode for Expr {
                 .update_data(|(new_expr, new_when_then_expr, new_else_expr)| {
                     Expr::Case(Case::new(new_expr, new_when_then_expr, new_else_expr))
                 }),
-            Expr::Cast(Cast { expr, data_type }) => expr
+            Expr::Cast(Cast { expr, field }) => expr
                 .map_elements(f)?
-                .update_data(|be| Expr::Cast(Cast::new(be, data_type))),
-            Expr::TryCast(TryCast { expr, data_type }) => expr
+                .update_data(|be| Expr::Cast(Cast::new_from_field(be, field))),
+            Expr::TryCast(TryCast { expr, field }) => expr
                 .map_elements(f)?
-                .update_data(|be| Expr::TryCast(TryCast::new(be, data_type))),
+                .update_data(|be| Expr::TryCast(TryCast::new_from_field(be, field))),
             Expr::ScalarFunction(ScalarFunction { func, args }) => {
                 args.map_elements(f)?.map_data(|new_args| {
                     Ok(Expr::ScalarFunction(ScalarFunction::new_udf(
@@ -230,27 +247,40 @@ impl TreeNode for Expr {
                     )))
                 })?
             }
-            Expr::WindowFunction(WindowFunction {
-                fun,
-                params:
-                    WindowFunctionParams {
-                        args,
-                        partition_by,
-                        order_by,
-                        window_frame,
-                        null_treatment,
-                    },
-            }) => (args, partition_by, order_by).map_elements(f)?.update_data(
-                |(new_args, new_partition_by, new_order_by)| {
-                    Expr::WindowFunction(WindowFunction::new(fun, new_args))
-                        .partition_by(new_partition_by)
-                        .order_by(new_order_by)
-                        .window_frame(window_frame)
-                        .null_treatment(null_treatment)
-                        .build()
-                        .unwrap()
-                },
-            ),
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction {
+                    fun,
+                    params:
+                        WindowFunctionParams {
+                            args,
+                            partition_by,
+                            order_by,
+                            window_frame,
+                            filter,
+                            null_treatment,
+                            distinct,
+                        },
+                } = *window_fun;
+
+                (args, partition_by, order_by, filter)
+                    .map_elements(f)?
+                    .map_data(
+                        |(new_args, new_partition_by, new_order_by, new_filter)| {
+                            Ok(Expr::from(WindowFunction {
+                                fun,
+                                params: WindowFunctionParams {
+                                    args: new_args,
+                                    partition_by: new_partition_by,
+                                    order_by: new_order_by,
+                                    window_frame,
+                                    filter: new_filter,
+                                    null_treatment,
+                                    distinct,
+                                },
+                            }))
+                        },
+                    )?
+            }
             Expr::AggregateFunction(AggregateFunction {
                 func,
                 params:
diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs
index 763a4e6539fd8..d5cb98a46ef43 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -17,17 +17,17 @@
 
 use super::binary::binary_numeric_coercion;
 use crate::{AggregateUDF, ScalarUDF, Signature, TypeSignature, WindowUDF};
-use arrow::datatypes::FieldRef;
+use arrow::datatypes::{Field, FieldRef};
 use arrow::{
     compute::can_cast_types,
     datatypes::{DataType, TimeUnit},
 };
 use datafusion_common::types::LogicalType;
 use datafusion_common::utils::{
-    base_type, coerced_fixed_size_list_to_list, ListCoercion,
+    ListCoercion, base_type, coerced_fixed_size_list_to_list,
 };
 use datafusion_common::{
-    exec_err, internal_err, plan_err, types::NativeType, utils::list_ndims, Result,
+    Result, exec_err, internal_err, plan_err, types::NativeType, utils::list_ndims,
 };
 use datafusion_expr_common::signature::ArrayFunctionArgument;
 use datafusion_expr_common::type_coercion::binary::type_union_resolution;
@@ -36,56 +36,74 @@ use datafusion_expr_common::{
     type_coercion::binary::comparison_coercion_numeric,
     type_coercion::binary::string_coercion,
 };
+use itertools::Itertools as _;
 use std::sync::Arc;
 
-/// Performs type coercion for scalar function arguments.
-///
-/// Returns the data types to which each argument must be coerced to
-/// match `signature`.
-///
-/// For more details on coercion in general, please see the
-/// [`type_coercion`](crate::type_coercion) module.
-pub fn data_types_with_scalar_udf(
-    current_types: &[DataType],
-    func: &ScalarUDF,
-) -> Result<Vec<DataType>> {
-    let signature = func.signature();
-    let type_signature = &signature.type_signature;
+/// Extension trait to unify common functionality between [`ScalarUDF`], [`AggregateUDF`]
+/// and [`WindowUDF`] for use by signature coercion functions.
+pub trait UDFCoercionExt {
+    /// Should delegate to [`ScalarUDF::name`], [`AggregateUDF::name`] or [`WindowUDF::name`].
+    fn name(&self) -> &str;
+    /// Should delegate to [`ScalarUDF::signature`], [`AggregateUDF::signature`]
+    /// or [`WindowUDF::signature`].
+    fn signature(&self) -> &Signature;
+    /// Should delegate to [`ScalarUDF::coerce_types`], [`AggregateUDF::coerce_types`]
+    /// or [`WindowUDF::coerce_types`].
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>>;
+}
 
-    if current_types.is_empty() && type_signature != &TypeSignature::UserDefined {
-        if type_signature.supports_zero_argument() {
-            return Ok(vec![]);
-        } else if type_signature.used_to_support_zero_arguments() {
-            // Special error to help during upgrade: https://github.com/apache/datafusion/issues/13763
-            return plan_err!("'{}' does not support zero arguments. Use TypeSignature::Nullary for zero arguments", func.name());
-        } else {
-            return plan_err!("'{}' does not support zero arguments", func.name());
-        }
+impl UDFCoercionExt for ScalarUDF {
+    fn name(&self) -> &str {
+        self.name()
     }
 
-    let valid_types =
-        get_valid_types_with_scalar_udf(type_signature, current_types, func)?;
+    fn signature(&self) -> &Signature {
+        self.signature()
+    }
 
-    if valid_types
-        .iter()
-        .any(|data_type| data_type == current_types)
-    {
-        return Ok(current_types.to_vec());
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        self.coerce_types(arg_types)
+    }
+}
+
+impl UDFCoercionExt for AggregateUDF {
+    fn name(&self) -> &str {
+        self.name()
+    }
+
+    fn signature(&self) -> &Signature {
+        self.signature()
     }
 
-    try_coerce_types(func.name(), valid_types, current_types, type_signature)
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        self.coerce_types(arg_types)
+    }
 }
 
-/// Performs type coercion for aggregate function arguments.
+impl UDFCoercionExt for WindowUDF {
+    fn name(&self) -> &str {
+        self.name()
+    }
+
+    fn signature(&self) -> &Signature {
+        self.signature()
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        self.coerce_types(arg_types)
+    }
+}
+
+/// Performs type coercion for UDF arguments.
 ///
-/// Returns the fields to which each argument must be coerced to
+/// Returns the data types to which each argument must be coerced to
 /// match `signature`.
 ///
 /// For more details on coercion in general, please see the
 /// [`type_coercion`](crate::type_coercion) module.
-pub fn fields_with_aggregate_udf(
+pub fn fields_with_udf<F: UDFCoercionExt>(
     current_fields: &[FieldRef],
-    func: &AggregateUDF,
+    func: &F,
 ) -> Result<Vec<FieldRef>> {
     let signature = func.signature();
     let type_signature = &signature.type_signature;
@@ -95,7 +113,10 @@ pub fn fields_with_aggregate_udf(
             return Ok(vec![]);
         } else if type_signature.used_to_support_zero_arguments() {
             // Special error to help during upgrade: https://github.com/apache/datafusion/issues/13763
-            return plan_err!("'{}' does not support zero arguments. Use TypeSignature::Nullary for zero arguments", func.name());
+            return plan_err!(
+                "'{}' does not support zero arguments. Use TypeSignature::Nullary for zero arguments",
+                func.name()
+            );
         } else {
             return plan_err!("'{}' does not support zero arguments", func.name());
         }
@@ -106,8 +127,7 @@ pub fn fields_with_aggregate_udf(
         .cloned()
         .collect::<Vec<_>>();
 
-    let valid_types =
-        get_valid_types_with_aggregate_udf(type_signature, &current_types, func)?;
+    let valid_types = get_valid_types_with_udf(type_signature, &current_types, func)?;
     if valid_types
         .iter()
         .any(|data_type| data_type == &current_types)
@@ -128,56 +148,56 @@ pub fn fields_with_aggregate_udf(
         .collect())
 }
 
-/// Performs type coercion for window function arguments.
+/// Performs type coercion for scalar function arguments.
 ///
 /// Returns the data types to which each argument must be coerced to
 /// match `signature`.
 ///
 /// For more details on coercion in general, please see the
 /// [`type_coercion`](crate::type_coercion) module.
-pub fn fields_with_window_udf(
-    current_fields: &[FieldRef],
-    func: &WindowUDF,
-) -> Result<Vec<FieldRef>> {
-    let signature = func.signature();
-    let type_signature = &signature.type_signature;
-
-    if current_fields.is_empty() && type_signature != &TypeSignature::UserDefined {
-        if type_signature.supports_zero_argument() {
-            return Ok(vec![]);
-        } else if type_signature.used_to_support_zero_arguments() {
-            // Special error to help during upgrade: https://github.com/apache/datafusion/issues/13763
-            return plan_err!("'{}' does not support zero arguments. Use TypeSignature::Nullary for zero arguments", func.name());
-        } else {
-            return plan_err!("'{}' does not support zero arguments", func.name());
-        }
-    }
-
-    let current_types = current_fields
+#[deprecated(since = "52.0.0", note = "use fields_with_udf")]
+pub fn data_types_with_scalar_udf(
+    current_types: &[DataType],
+    func: &ScalarUDF,
+) -> Result<Vec<DataType>> {
+    let current_fields = current_types
         .iter()
-        .map(|f| f.data_type())
-        .cloned()
+        .map(|dt| Arc::new(Field::new("f", dt.clone(), true)))
         .collect::<Vec<_>>();
-    let valid_types =
-        get_valid_types_with_window_udf(type_signature, &current_types, func)?;
-    if valid_types
+    Ok(fields_with_udf(&current_fields, func)?
         .iter()
-        .any(|data_type| data_type == &current_types)
-    {
-        return Ok(current_fields.to_vec());
-    }
+        .map(|f| f.data_type().clone())
+        .collect())
+}
 
-    let updated_types =
-        try_coerce_types(func.name(), valid_types, &current_types, type_signature)?;
+/// Performs type coercion for aggregate function arguments.
+///
+/// Returns the fields to which each argument must be coerced to
+/// match `signature`.
+///
+/// For more details on coercion in general, please see the
+/// [`type_coercion`](crate::type_coercion) module.
+#[deprecated(since = "52.0.0", note = "use fields_with_udf")]
+pub fn fields_with_aggregate_udf(
+    current_fields: &[FieldRef],
+    func: &AggregateUDF,
+) -> Result<Vec<FieldRef>> {
+    fields_with_udf(current_fields, func)
+}
 
-    Ok(current_fields
-        .iter()
-        .zip(updated_types)
-        .map(|(current_field, new_type)| {
-            current_field.as_ref().clone().with_data_type(new_type)
-        })
-        .map(Arc::new)
-        .collect())
+/// Performs type coercion for window function arguments.
+///
+/// Returns the data types to which each argument must be coerced to
+/// match `signature`.
+///
+/// For more details on coercion in general, please see the
+/// [`type_coercion`](crate::type_coercion) module.
+#[deprecated(since = "52.0.0", note = "use fields_with_udf")]
+pub fn fields_with_window_udf(
+    current_fields: &[FieldRef],
+    func: &WindowUDF,
+) -> Result<Vec<FieldRef>> {
+    fields_with_udf(current_fields, func)
 }
 
 /// Performs type coercion for function arguments.
@@ -187,6 +207,7 @@ pub fn fields_with_window_udf(
 ///
 /// For more details on coercion in general, please see the
 /// [`type_coercion`](crate::type_coercion) module.
+#[deprecated(since = "52.0.0", note = "use fields_with_udf")]
 pub fn data_types(
     function_name: impl AsRef<str>,
     current_types: &[DataType],
@@ -200,12 +221,12 @@ pub fn data_types(
         } else if type_signature.used_to_support_zero_arguments() {
             // Special error to help during upgrade: https://github.com/apache/datafusion/issues/13763
             return plan_err!(
-                "function '{}' has signature {type_signature:?} which does not support zero arguments. Use TypeSignature::Nullary for zero arguments",
+                "function '{}' has signature {type_signature} which does not support zero arguments. Use TypeSignature::Nullary for zero arguments",
                 function_name.as_ref()
             );
         } else {
             return plan_err!(
-                "Function '{}' has signature {type_signature:?} which does not support zero arguments",
+                "Function '{}' has signature {type_signature} which does not support zero arguments",
                 function_name.as_ref()
             );
         }
@@ -229,20 +250,23 @@ pub fn data_types(
 }
 
 fn is_well_supported_signature(type_signature: &TypeSignature) -> bool {
-    if let TypeSignature::OneOf(signatures) = type_signature {
-        return signatures.iter().all(is_well_supported_signature);
-    }
-
-    matches!(
-        type_signature,
+    match type_signature {
+        TypeSignature::OneOf(type_signatures) => {
+            type_signatures.iter().all(is_well_supported_signature)
+        }
         TypeSignature::UserDefined
-            | TypeSignature::Numeric(_)
-            | TypeSignature::String(_)
-            | TypeSignature::Coercible(_)
-            | TypeSignature::Any(_)
-            | TypeSignature::Nullary
-            | TypeSignature::Comparable(_)
-    )
+        | TypeSignature::Numeric(_)
+        | TypeSignature::String(_)
+        | TypeSignature::Coercible(_)
+        | TypeSignature::Any(_)
+        | TypeSignature::Nullary
+        | TypeSignature::Comparable(_) => true,
+        TypeSignature::Variadic(_)
+        | TypeSignature::VariadicAny
+        | TypeSignature::Uniform(_, _)
+        | TypeSignature::Exact(_)
+        | TypeSignature::ArraySignature(_) => false,
+    }
 }
 
 fn try_coerce_types(
@@ -278,29 +302,32 @@ fn try_coerce_types(
 
     // none possible -> Error
     plan_err!(
-        "Failed to coerce arguments to satisfy a call to '{function_name}' function: coercion from {current_types:?} to the signature {type_signature:?} failed"
+        "Failed to coerce arguments to satisfy a call to '{function_name}' function: coercion from {} to the signature {type_signature} failed",
+        current_types.iter().join(", ")
     )
 }
 
-fn get_valid_types_with_scalar_udf(
+fn get_valid_types_with_udf<F: UDFCoercionExt>(
     signature: &TypeSignature,
     current_types: &[DataType],
-    func: &ScalarUDF,
+    func: &F,
 ) -> Result<Vec<Vec<DataType>>> {
-    match signature {
+    let valid_types = match signature {
         TypeSignature::UserDefined => match func.coerce_types(current_types) {
-            Ok(coerced_types) => Ok(vec![coerced_types]),
-            Err(e) => exec_err!(
-                "Function '{}' user-defined coercion failed with {:?}",
-                func.name(),
-                e.strip_backtrace()
-            ),
+            Ok(coerced_types) => vec![coerced_types],
+            Err(e) => {
+                return exec_err!(
+                    "Function '{}' user-defined coercion failed with: {}",
+                    func.name(),
+                    e.strip_backtrace()
+                );
+            }
         },
         TypeSignature::OneOf(signatures) => {
             let mut res = vec![];
             let mut errors = vec![];
             for sig in signatures {
-                match get_valid_types_with_scalar_udf(sig, current_types, func) {
+                match get_valid_types_with_udf(sig, current_types, func) {
                     Ok(valid_types) => {
                         res.extend(valid_types);
                     }
@@ -312,69 +339,15 @@ fn get_valid_types_with_scalar_udf(
 
             // Every signature failed, return the joined error
             if res.is_empty() {
-                internal_err!(
+                return internal_err!(
                     "Function '{}' failed to match any signature, errors: {}",
                     func.name(),
                     errors.join(",")
-                )
+                );
             } else {
-                Ok(res)
+                res
             }
         }
-        _ => get_valid_types(func.name(), signature, current_types),
-    }
-}
-
-fn get_valid_types_with_aggregate_udf(
-    signature: &TypeSignature,
-    current_types: &[DataType],
-    func: &AggregateUDF,
-) -> Result<Vec<Vec<DataType>>> {
-    let valid_types = match signature {
-        TypeSignature::UserDefined => match func.coerce_types(current_types) {
-            Ok(coerced_types) => vec![coerced_types],
-            Err(e) => {
-                return exec_err!(
-                    "Function '{}' user-defined coercion failed with {:?}",
-                    func.name(),
-                    e.strip_backtrace()
-                )
-            }
-        },
-        TypeSignature::OneOf(signatures) => signatures
-            .iter()
-            .filter_map(|t| {
-                get_valid_types_with_aggregate_udf(t, current_types, func).ok()
-            })
-            .flatten()
-            .collect::<Vec<_>>(),
-        _ => get_valid_types(func.name(), signature, current_types)?,
-    };
-
-    Ok(valid_types)
-}
-
-fn get_valid_types_with_window_udf(
-    signature: &TypeSignature,
-    current_types: &[DataType],
-    func: &WindowUDF,
-) -> Result<Vec<Vec<DataType>>> {
-    let valid_types = match signature {
-        TypeSignature::UserDefined => match func.coerce_types(current_types) {
-            Ok(coerced_types) => vec![coerced_types],
-            Err(e) => {
-                return exec_err!(
-                    "Function '{}' user-defined coercion failed with {:?}",
-                    func.name(),
-                    e.strip_backtrace()
-                )
-            }
-        },
-        TypeSignature::OneOf(signatures) => signatures
-            .iter()
-            .filter_map(|t| get_valid_types_with_window_udf(t, current_types, func).ok())
-            .flatten()
-            .collect::<Vec<_>>(),
         _ => get_valid_types(func.name(), signature, current_types)?,
     };
 
@@ -401,25 +374,35 @@ fn get_valid_types(
         let mut fixed_size = array_coercion != Some(&ListCoercion::FixedSizedListToList);
         let mut list_sizes = Vec::with_capacity(arguments.len());
         let mut element_types = Vec::with_capacity(arguments.len());
+        let mut nested_item_nullability = Vec::with_capacity(arguments.len());
         for (argument, current_type) in arguments.iter().zip(current_types.iter()) {
             match argument {
-                ArrayFunctionArgument::Index | ArrayFunctionArgument::String => (),
+                ArrayFunctionArgument::Index | ArrayFunctionArgument::String => {
+                    nested_item_nullability.push(None);
+                }
                 ArrayFunctionArgument::Element => {
-                    element_types.push(current_type.clone())
+                    element_types.push(current_type.clone());
+                    nested_item_nullability.push(None);
                 }
                 ArrayFunctionArgument::Array => match current_type {
-                    DataType::Null => element_types.push(DataType::Null),
+                    DataType::Null => {
+                        element_types.push(DataType::Null);
+                        nested_item_nullability.push(None);
+                    }
                     DataType::List(field) => {
                         element_types.push(field.data_type().clone());
+                        nested_item_nullability.push(Some(field.is_nullable()));
                         fixed_size = false;
                     }
                     DataType::LargeList(field) => {
                         element_types.push(field.data_type().clone());
+                        nested_item_nullability.push(Some(field.is_nullable()));
                         large_list = true;
                         fixed_size = false;
                     }
                     DataType::FixedSizeList(field, size) => {
                         element_types.push(field.data_type().clone());
+                        nested_item_nullability.push(Some(field.is_nullable()));
                         list_sizes.push(*size)
                     }
                     arg_type => {
@@ -429,33 +412,49 @@ fn get_valid_types(
             }
         }
 
+        debug_assert_eq!(nested_item_nullability.len(), arguments.len());
+
         let Some(element_type) = type_union_resolution(&element_types) else {
             return Ok(vec![vec![]]);
         };
 
         if !fixed_size {
             list_sizes.clear()
-        }
+        };
 
         let mut list_sizes = list_sizes.into_iter();
-        let valid_types = arguments.iter().zip(current_types.iter()).map(
-            |(argument_type, current_type)| match argument_type {
-                ArrayFunctionArgument::Index => DataType::Int64,
-                ArrayFunctionArgument::String => DataType::Utf8,
-                ArrayFunctionArgument::Element => element_type.clone(),
-                ArrayFunctionArgument::Array => {
-                    if current_type.is_null() {
-                        DataType::Null
-                    } else if large_list {
-                        DataType::new_large_list(element_type.clone(), true)
-                    } else if let Some(size) = list_sizes.next() {
-                        DataType::new_fixed_size_list(element_type.clone(), size, true)
-                    } else {
-                        DataType::new_list(element_type.clone(), true)
+        let valid_types = arguments
+            .iter()
+            .zip(current_types.iter())
+            .zip(nested_item_nullability)
+            .map(|((argument_type, current_type), is_nested_item_nullable)| {
+                match argument_type {
+                    ArrayFunctionArgument::Index => DataType::Int64,
+                    ArrayFunctionArgument::String => DataType::Utf8,
+                    ArrayFunctionArgument::Element => element_type.clone(),
+                    ArrayFunctionArgument::Array => {
+                        if current_type.is_null() {
+                            DataType::Null
+                        } else if large_list {
+                            DataType::new_large_list(
+                                element_type.clone(),
+                                is_nested_item_nullable.unwrap_or(true),
+                            )
+                        } else if let Some(size) = list_sizes.next() {
+                            DataType::new_fixed_size_list(
+                                element_type.clone(),
+                                size,
+                                is_nested_item_nullable.unwrap_or(true),
+                            )
+                        } else {
+                            DataType::new_list(
+                                element_type.clone(),
+                                is_nested_item_nullable.unwrap_or(true),
+                            )
+                        }
                     }
                 }
-            },
-        );
+            });
 
         Ok(vec![valid_types.collect()])
     }
@@ -488,7 +487,7 @@ fn get_valid_types(
     let valid_types = match signature {
         TypeSignature::Variadic(valid_types) => valid_types
             .iter()
-            .map(|valid_type| current_types.iter().map(|_| valid_type.clone()).collect())
+            .map(|valid_type| vec![valid_type.clone(); current_types.len()])
             .collect(),
         TypeSignature::String(number) => {
             function_length_check(function_name, current_types.len(), *number)?;
@@ -503,7 +502,7 @@ fn get_valid_types(
                     new_types.push(DataType::Utf8);
                 } else {
                     return plan_err!(
-                        "Function '{function_name}' expects NativeType::String but received {logical_data_type}"
+                        "Function '{function_name}' expects String but received {logical_data_type}"
                     );
                 }
             }
@@ -563,7 +562,7 @@ fn get_valid_types(
 
                 if !logical_data_type.is_numeric() {
                     return plan_err!(
-                        "Function '{function_name}' expects NativeType::Numeric but received {logical_data_type}"
+                        "Function '{function_name}' expects Numeric but received {logical_data_type}"
                     );
                 }
 
@@ -584,7 +583,7 @@ fn get_valid_types(
                 valid_type = DataType::Float64;
             } else if !logical_data_type.is_numeric() {
                 return plan_err!(
-                    "Function '{function_name}' expects NativeType::Numeric but received {logical_data_type}"
+                    "Function '{function_name}' expects Numeric but received {logical_data_type}"
                 );
             }
 
@@ -597,7 +596,9 @@ fn get_valid_types(
                 if let Some(dt) = comparison_coercion_numeric(&target_type, data_type) {
                     target_type = dt;
                 } else {
-                    return plan_err!("For function '{function_name}' {target_type} and {data_type} is not comparable");
+                    return plan_err!(
+                        "For function '{function_name}' {target_type} and {data_type} is not comparable"
+                    );
                 }
             }
             // Convert null to String type.
@@ -614,24 +615,33 @@ fn get_valid_types(
             for (current_type, param) in current_types.iter().zip(param_types.iter()) {
                 let current_native_type: NativeType = current_type.into();
 
-                if param.desired_type().matches_native_type(&current_native_type) {
-                    let casted_type = param.desired_type().default_casted_type(
-                        &current_native_type,
-                        current_type,
-                    )?;
+                if param
+                    .desired_type()
+                    .matches_native_type(&current_native_type)
+                {
+                    let casted_type = param
+                        .desired_type()
+                        .default_casted_type(&current_native_type, current_type)?;
 
                     new_types.push(casted_type);
                 } else if param
-                .allowed_source_types()
-                .iter()
-                .any(|t| t.matches_native_type(&current_native_type)) {
+                    .allowed_source_types()
+                    .iter()
+                    .any(|t| t.matches_native_type(&current_native_type))
+                {
                     // If the condition is met which means `implicit coercion`` is provided so we can safely unwrap
                     let default_casted_type = param.default_casted_type().unwrap();
-                    let casted_type = default_casted_type.default_cast_for(current_type)?;
+                    let casted_type =
+                        default_casted_type.default_cast_for(current_type)?;
                     new_types.push(casted_type);
                 } else {
-                    return internal_err!(
-                        "Expect {} but received {}, DataType: {}",
+                    let hint = if matches!(current_native_type, NativeType::Binary) {
+                        "\n\nHint: Binary types are not automatically coerced to String. Use CAST(column AS VARCHAR) to convert Binary data to String."
+                    } else {
+                        ""
+                    };
+                    return plan_err!(
+                        "Function '{function_name}' requires {}, but received {} (DataType: {}).{hint}",
                         param.desired_type(),
                         current_native_type,
                         current_type
@@ -643,18 +653,20 @@ fn get_valid_types(
         }
         TypeSignature::Uniform(number, valid_types) => {
             if *number == 0 {
-                return plan_err!("The function '{function_name}' expected at least one argument");
+                return plan_err!(
+                    "The function '{function_name}' expected at least one argument"
+                );
             }
 
             valid_types
                 .iter()
-                .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect())
+                .map(|valid_type| vec![valid_type.clone(); *number])
                 .collect()
         }
         TypeSignature::UserDefined => {
             return internal_err!(
                 "Function '{function_name}' user-defined signature should be handled by function-specific coerce_types"
-            )
+            );
         }
         TypeSignature::VariadicAny => {
             if current_types.is_empty() {
@@ -665,10 +677,16 @@ fn get_valid_types(
             vec![current_types.to_vec()]
         }
         TypeSignature::Exact(valid_types) => vec![valid_types.clone()],
-        TypeSignature::ArraySignature(ref function_signature) => match function_signature {
-            ArrayFunctionSignature::Array { arguments, array_coercion, } => {
-                array_valid_types(function_name, current_types, arguments, array_coercion.as_ref())?
-            }
+        TypeSignature::ArraySignature(function_signature) => match function_signature {
+            ArrayFunctionSignature::Array {
+                arguments,
+                array_coercion,
+            } => array_valid_types(
+                function_name,
+                current_types,
+                arguments,
+                array_coercion.as_ref(),
+            )?,
             ArrayFunctionSignature::RecursiveArray => {
                 if current_types.len() != 1 {
                     return Ok(vec![vec![]]);
@@ -709,7 +727,7 @@ fn get_valid_types(
                     current_types.len()
                 );
             }
-            vec![(0..*number).map(|i| current_types[i].clone()).collect()]
+            vec![current_types.to_vec()]
         }
         TypeSignature::OneOf(types) => types
             .iter()
@@ -787,6 +805,7 @@ fn maybe_data_types_without_coercion(
 /// (losslessly converted) into a value of `type_to`
 ///
 /// See the module level documentation for more detail on coercion.
+#[deprecated(since = "53.0.0", note = "Unused internal function")]
 pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool {
     if type_into == type_from {
         return true;
@@ -833,10 +852,13 @@ fn coerced_from<'a>(
         (UInt16, Null | UInt8 | UInt16) => Some(type_into.clone()),
         (UInt32, Null | UInt8 | UInt16 | UInt32) => Some(type_into.clone()),
         (UInt64, Null | UInt8 | UInt16 | UInt32 | UInt64) => Some(type_into.clone()),
+        (Float16, Null | Int8 | Int16 | UInt8 | UInt16 | Float16) => {
+            Some(type_into.clone())
+        }
         (
             Float32,
             Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64
-            | Float32,
+            | Float16 | Float32,
         ) => Some(type_into.clone()),
         (
             Float64,
@@ -849,15 +871,19 @@ fn coerced_from<'a>(
             | UInt16
             | UInt32
             | UInt64
+            | Float16
             | Float32
             | Float64
-            | Decimal128(_, _),
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _),
         ) => Some(type_into.clone()),
         (
             Timestamp(TimeUnit::Nanosecond, None),
             Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8,
         ) => Some(type_into.clone()),
-        (Interval(_), Utf8 | LargeUtf8) => Some(type_into.clone()),
+        (Interval(_), Null | Utf8 | LargeUtf8) => Some(type_into.clone()),
         // We can go into a Utf8View from a Utf8 or LargeUtf8
         (Utf8View, Utf8 | LargeUtf8 | Null) => Some(type_into.clone()),
         // Any type can be coerced into strings
@@ -902,6 +928,13 @@ fn coerced_from<'a>(
         (Timestamp(_, Some(_)), Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8) => {
             Some(type_into.clone())
         }
+        // Null can be coerced to any target type, provided the cast is valid.
+        // This mirrors null_coercion() in binary comparison coercion
+        // (expr-common/src/type_coercion/binary.rs) and is the symmetric
+        // counterpart of the (Null, _) arm above. Without this, untyped
+        // placeholders ($1, $foo) inside function calls fail signature matching
+        // because their Null type doesn't match any Exact(...) variant.
+        (_, Null) if can_cast_types(type_from, type_into) => Some(type_into.clone()),
         _ => None,
     }
 }
@@ -911,21 +944,55 @@ mod tests {
     use crate::Volatility;
 
     use super::*;
-    use arrow::datatypes::Field;
-    use datafusion_common::assert_contains;
+    use arrow::datatypes::{Field, IntervalUnit};
+    use datafusion_common::{
+        assert_contains,
+        types::{logical_binary, logical_int64},
+    };
+    use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
 
     #[test]
     fn test_string_conversion() {
         let cases = vec![
-            (DataType::Utf8View, DataType::Utf8, true),
-            (DataType::Utf8View, DataType::LargeUtf8, true),
+            (DataType::Utf8View, DataType::Utf8),
+            (DataType::Utf8View, DataType::LargeUtf8),
         ];
 
         for case in cases {
-            assert_eq!(can_coerce_from(&case.0, &case.1), case.2);
+            assert_eq!(coerced_from(&case.0, &case.1), Some(case.0));
         }
     }
 
+    #[test]
+    fn test_coerced_from_null() {
+        // Null should coerce to Interval (the motivating case)
+        assert_eq!(
+            coerced_from(
+                &DataType::Interval(IntervalUnit::MonthDayNano),
+                &DataType::Null
+            ),
+            Some(DataType::Interval(IntervalUnit::MonthDayNano))
+        );
+
+        // Null should coerce to Date32
+        assert_eq!(
+            coerced_from(&DataType::Date32, &DataType::Null),
+            Some(DataType::Date32)
+        );
+
+        // Null should coerce to Timestamp with timezone
+        assert_eq!(
+            coerced_from(
+                &DataType::Timestamp(TimeUnit::Microsecond, Some("+00".into())),
+                &DataType::Null
+            ),
+            Some(DataType::Timestamp(
+                TimeUnit::Microsecond,
+                Some("+00".into())
+            ))
+        );
+    }
+
     #[test]
     fn test_maybe_data_types() {
         // this vec contains: arg1, arg2, expected result
@@ -1026,7 +1093,7 @@ mod tests {
         .unwrap_err();
         assert_contains!(
             got.to_string(),
-            "Function 'test' expects NativeType::Numeric but received NativeType::String"
+            "Function 'test' expects Numeric but received String"
         );
 
         // Fallbacks to float64 if the arg is of type null.
@@ -1046,7 +1113,7 @@ mod tests {
         .unwrap_err();
         assert_contains!(
             got.to_string(),
-            "Function 'test' expects NativeType::Numeric but received NativeType::Timestamp(Second, None)"
+            "Function 'test' expects Numeric but received Timestamp(s)"
         );
 
         Ok(())
@@ -1101,12 +1168,29 @@ mod tests {
         Ok(())
     }
 
+    struct MockUdf(Signature);
+
+    impl UDFCoercionExt for MockUdf {
+        fn name(&self) -> &str {
+            "test"
+        }
+        fn signature(&self) -> &Signature {
+            &self.0
+        }
+        fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> {
+            unimplemented!()
+        }
+    }
+
     #[test]
     fn test_fixed_list_wildcard_coerce() -> Result<()> {
         let inner = Arc::new(Field::new_list_field(DataType::Int32, false));
-        let current_types = vec![
-            DataType::FixedSizeList(Arc::clone(&inner), 2), // able to coerce for any size
-        ];
+        // able to coerce for any size
+        let current_fields = vec![Arc::new(Field::new(
+            "t",
+            DataType::FixedSizeList(Arc::clone(&inner), 2),
+            true,
+        ))];
 
         let signature = Signature::exact(
             vec![DataType::FixedSizeList(
@@ -1116,24 +1200,25 @@ mod tests {
             Volatility::Stable,
         );
 
-        let coerced_data_types = data_types("test", &current_types, &signature)?;
-        assert_eq!(coerced_data_types, current_types);
+        let coerced_fields = fields_with_udf(&current_fields, &MockUdf(signature))?;
+        assert_eq!(coerced_fields, current_fields);
 
         // make sure it can't coerce to a different size
         let signature = Signature::exact(
             vec![DataType::FixedSizeList(Arc::clone(&inner), 3)],
             Volatility::Stable,
         );
-        let coerced_data_types = data_types("test", &current_types, &signature);
-        assert!(coerced_data_types.is_err());
+        let coerced_fields = fields_with_udf(&current_fields, &MockUdf(signature));
+        assert!(coerced_fields.is_err());
 
         // make sure it works with the same type.
         let signature = Signature::exact(
             vec![DataType::FixedSizeList(Arc::clone(&inner), 2)],
             Volatility::Stable,
         );
-        let coerced_data_types = data_types("test", &current_types, &signature).unwrap();
-        assert_eq!(coerced_data_types, current_types);
+        let coerced_fields =
+            fields_with_udf(&current_fields, &MockUdf(signature)).unwrap();
+        assert_eq!(coerced_fields, current_fields);
 
         Ok(())
     }
@@ -1305,6 +1390,164 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_coercible_nulls() -> Result<()> {
+        fn null_input(coercion: Coercion) -> Result<Vec<DataType>> {
+            fields_with_udf(
+                &[Field::new("field", DataType::Null, true).into()],
+                &MockUdf(Signature::coercible(vec![coercion], Volatility::Immutable)),
+            )
+            .map(|v| v.into_iter().map(|f| f.data_type().clone()).collect())
+        }
+
+        // Casts Null to Int64 if we use TypeSignatureClass::Native
+        let output = null_input(Coercion::new_exact(TypeSignatureClass::Native(
+            logical_int64(),
+        )))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        let output = null_input(Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        // Null gets passed through if we use TypeSignatureClass apart from Native
+        let output = null_input(Coercion::new_exact(TypeSignatureClass::Integer))?;
+        assert_eq!(vec![DataType::Null], output);
+
+        let output = null_input(Coercion::new_implicit(
+            TypeSignatureClass::Integer,
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![DataType::Null], output);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_coercible_dictionary() -> Result<()> {
+        let dictionary =
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int64));
+        fn dictionary_input(coercion: Coercion) -> Result<Vec<DataType>> {
+            fields_with_udf(
+                &[Field::new(
+                    "field",
+                    DataType::Dictionary(
+                        Box::new(DataType::Int8),
+                        Box::new(DataType::Int64),
+                    ),
+                    true,
+                )
+                .into()],
+                &MockUdf(Signature::coercible(vec![coercion], Volatility::Immutable)),
+            )
+            .map(|v| v.into_iter().map(|f| f.data_type().clone()).collect())
+        }
+
+        // Casts Dictionary to Int64 if we use TypeSignatureClass::Native
+        let output = dictionary_input(Coercion::new_exact(TypeSignatureClass::Native(
+            logical_int64(),
+        )))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        let output = dictionary_input(Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        // Dictionary gets passed through if we use TypeSignatureClass apart from Native
+        let output = dictionary_input(Coercion::new_exact(TypeSignatureClass::Integer))?;
+        assert_eq!(vec![dictionary.clone()], output);
+
+        let output = dictionary_input(Coercion::new_implicit(
+            TypeSignatureClass::Integer,
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![dictionary.clone()], output);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_coercible_run_end_encoded() -> Result<()> {
+        let run_end_encoded = DataType::RunEndEncoded(
+            Field::new("run_ends", DataType::Int16, false).into(),
+            Field::new("values", DataType::Int64, true).into(),
+        );
+        fn run_end_encoded_input(coercion: Coercion) -> Result<Vec<DataType>> {
+            fields_with_udf(
+                &[Field::new(
+                    "field",
+                    DataType::RunEndEncoded(
+                        Field::new("run_ends", DataType::Int16, false).into(),
+                        Field::new("values", DataType::Int64, true).into(),
+                    ),
+                    true,
+                )
+                .into()],
+                &MockUdf(Signature::coercible(vec![coercion], Volatility::Immutable)),
+            )
+            .map(|v| v.into_iter().map(|f| f.data_type().clone()).collect())
+        }
+
+        // Casts REE to Int64 if we use TypeSignatureClass::Native
+        let output = run_end_encoded_input(Coercion::new_exact(
+            TypeSignatureClass::Native(logical_int64()),
+        ))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        let output = run_end_encoded_input(Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![DataType::Int64], output);
+
+        // REE gets passed through if we use TypeSignatureClass apart from Native
+        let output =
+            run_end_encoded_input(Coercion::new_exact(TypeSignatureClass::Integer))?;
+        assert_eq!(vec![run_end_encoded.clone()], output);
+
+        let output = run_end_encoded_input(Coercion::new_implicit(
+            TypeSignatureClass::Integer,
+            vec![],
+            NativeType::Int64,
+        ))?;
+        assert_eq!(vec![run_end_encoded.clone()], output);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_get_valid_types_coercible_binary() -> Result<()> {
+        let signature = Signature::coercible(
+            vec![Coercion::new_exact(TypeSignatureClass::Native(
+                logical_binary(),
+            ))],
+            Volatility::Immutable,
+        );
+
+        // Binary types should stay their original selves
+        for t in [
+            DataType::Binary,
+            DataType::BinaryView,
+            DataType::LargeBinary,
+        ] {
+            assert_eq!(
+                get_valid_types("", &signature.type_signature, std::slice::from_ref(&t))?,
+                vec![vec![t]]
+            );
+        }
+
+        Ok(())
+    }
+
     #[test]
     fn test_get_valid_types_fixed_size_arrays() -> Result<()> {
         let function = "fixed_size_arrays";
@@ -1343,6 +1586,18 @@ mod tests {
             vec![vec![]]
         );
 
+        let data_types = vec![
+            DataType::new_fixed_size_list(DataType::Int64, 3, false),
+            DataType::new_list(DataType::Int32, false),
+        ];
+        assert_eq!(
+            get_valid_types(function, &signature.type_signature, &data_types)?,
+            vec![vec![
+                DataType::new_list(DataType::Int64, false),
+                DataType::new_list(DataType::Int64, false),
+            ]]
+        );
+
         Ok(())
     }
 }
diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs
index 4fc150ef2996a..c92d434e34abe 100644
--- a/datafusion/expr/src/type_coercion/mod.rs
+++ b/datafusion/expr/src/type_coercion/mod.rs
@@ -51,16 +51,13 @@ pub fn is_signed_numeric(dt: &DataType) -> bool {
             | DataType::Float16
             | DataType::Float32
             | DataType::Float64
+            | DataType::Decimal32(_, _)
+            | DataType::Decimal64(_, _)
             | DataType::Decimal128(_, _)
             | DataType::Decimal256(_, _),
     )
 }
 
-/// Determine whether the given data type `dt` is `Null`.
-pub fn is_null(dt: &DataType) -> bool {
-    *dt == DataType::Null
-}
-
 /// Determine whether the given data type `dt` is a `Timestamp`.
 pub fn is_timestamp(dt: &DataType) -> bool {
     matches!(dt, DataType::Timestamp(_, _))
@@ -78,16 +75,3 @@ pub fn is_datetime(dt: &DataType) -> bool {
         DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _)
     )
 }
-
-/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`.
-pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool {
-    matches!(
-        dt,
-        DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
-    )
-}
-
-/// Determine whether the given data type `dt` is a `Decimal`.
-pub fn is_decimal(dt: &DataType) -> bool {
-    matches!(dt, DataType::Decimal128(_, _) | DataType::Decimal256(_, _))
-}
diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
index d1bf45ce2fe8a..245a80c02c093 100644
--- a/datafusion/expr/src/udaf.rs
+++ b/datafusion/expr/src/udaf.rs
@@ -20,27 +20,30 @@
 use std::any::Any;
 use std::cmp::Ordering;
 use std::fmt::{self, Debug, Formatter, Write};
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 use std::vec;
 
 use arrow::datatypes::{DataType, Field, FieldRef};
 
-use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue, Statistics};
+use datafusion_common::{Result, ScalarValue, Statistics, exec_err, not_impl_err};
+use datafusion_expr_common::dyn_eq::{DynEq, DynHash};
+use datafusion_expr_common::operator::Operator;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 
 use crate::expr::{
+    AggregateFunction, AggregateFunctionParams, ExprListDisplay, WindowFunctionParams,
     schema_name_from_exprs, schema_name_from_exprs_comma_separated_without_space,
-    schema_name_from_sorts, AggregateFunction, AggregateFunctionParams, ExprListDisplay,
-    WindowFunctionParams,
+    schema_name_from_sorts,
 };
 use crate::function::{
     AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs,
 };
 use crate::groups_accumulator::GroupsAccumulator;
-use crate::utils::format_state_name;
+use crate::udf_eq::UdfEq;
 use crate::utils::AggregateOrderSensitivity;
-use crate::{expr_vec_fmt, Accumulator, Expr};
+use crate::utils::format_state_name;
+use crate::{Accumulator, Expr, expr_vec_fmt};
 use crate::{Documentation, Signature};
 
 /// Logical representation of a user-defined [aggregate function] (UDAF).
@@ -70,10 +73,10 @@ use crate::{Documentation, Signature};
 ///
 /// [the examples]: https://github.com/apache/datafusion/tree/main/datafusion-examples#single-process
 /// [aggregate function]: https://en.wikipedia.org/wiki/Aggregate_function
-/// [`Accumulator`]: crate::Accumulator
+/// [`Accumulator`]: Accumulator
 /// [`create_udaf`]: crate::expr_fn::create_udaf
-/// [`simple_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udaf.rs
-/// [`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
+/// [`simple_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udaf.rs
+/// [`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
 #[derive(Debug, Clone, PartialOrd)]
 pub struct AggregateUDF {
     inner: Arc<dyn AggregateUDFImpl>,
@@ -81,7 +84,7 @@ pub struct AggregateUDF {
 
 impl PartialEq for AggregateUDF {
     fn eq(&self, other: &Self) -> bool {
-        self.inner.equals(other.inner.as_ref())
+        self.inner.dyn_eq(other.inner.as_any())
     }
 }
 
@@ -89,7 +92,7 @@ impl Eq for AggregateUDF {}
 
 impl Hash for AggregateUDF {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.inner.hash_value().hash(state)
+        self.inner.dyn_hash(state)
     }
 }
 
@@ -158,7 +161,7 @@ impl AggregateUDF {
             args,
             false,
             None,
-            None,
+            vec![],
             None,
         ))
     }
@@ -170,6 +173,11 @@ impl AggregateUDF {
         self.inner.name()
     }
 
+    /// Returns the aliases for this function.
+    pub fn aliases(&self) -> &[String] {
+        self.inner.aliases()
+    }
+
     /// See [`AggregateUDFImpl::schema_name`] for more details.
     pub fn schema_name(&self, params: &AggregateFunctionParams) -> Result<String> {
         self.inner.schema_name(params)
@@ -205,11 +213,6 @@ impl AggregateUDF {
         self.inner.is_nullable()
     }
 
-    /// Returns the aliases for this function.
-    pub fn aliases(&self) -> &[String] {
-        self.inner.aliases()
-    }
-
     /// Returns this function's signature (what input types are accepted)
     ///
     /// See [`AggregateUDFImpl::signature`] for more details.
@@ -292,13 +295,28 @@ impl AggregateUDF {
         self.inner.reverse_expr()
     }
 
-    /// Do the function rewrite
+    /// Returns this aggregate function's simplification hook, if any.
     ///
     /// See [`AggregateUDFImpl::simplify`] for more details.
     pub fn simplify(&self) -> Option<AggregateFunctionSimplification> {
         self.inner.simplify()
     }
 
+    /// Rewrite aggregate to have simpler arguments
+    ///
+    /// See  [`AggregateUDFImpl::simplify_expr_op_literal`] for more details
+    pub fn simplify_expr_op_literal(
+        &self,
+        agg_function: &AggregateFunction,
+        arg: &Expr,
+        op: Operator,
+        lit: &Expr,
+        arg_is_left: bool,
+    ) -> Result<Option<Expr>> {
+        self.inner
+            .simplify_expr_op_literal(agg_function, arg, op, lit, arg_is_left)
+    }
+
     /// Returns true if the function is max, false if the function is min
     /// None in all other cases, used in certain optimizations for
     /// or aggregate
@@ -327,9 +345,9 @@ impl AggregateUDF {
         self.inner.supports_null_handling_clause()
     }
 
-    /// See [`AggregateUDFImpl::is_ordered_set_aggregate`] for more details.
-    pub fn is_ordered_set_aggregate(&self) -> bool {
-        self.inner.is_ordered_set_aggregate()
+    /// See [`AggregateUDFImpl::supports_within_group_clause`] for more details.
+    pub fn supports_within_group_clause(&self) -> bool {
+        self.inner.supports_within_group_clause()
     }
 
     /// Returns the documentation for this Aggregate UDF.
@@ -358,7 +376,7 @@ where
 /// See [`advanced_udaf.rs`] for a full example with complete implementation and
 /// [`AggregateUDF`] for other available options.
 ///
-/// [`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
+/// [`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
 ///
 /// # Basic Example
 /// ```
@@ -372,7 +390,7 @@ where
 /// # use arrow::datatypes::Schema;
 /// # use arrow::datatypes::Field;
 ///
-/// #[derive(Debug, Clone)]
+/// #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 /// struct GeoMeanUdf {
 ///   signature: Signature,
 /// }
@@ -394,7 +412,7 @@ where
 /// fn get_doc() -> &'static Documentation {
 ///     &DOCUMENTATION
 /// }
-///    
+///
 /// /// Implement the AggregateUDFImpl trait for GeoMeanUdf
 /// impl AggregateUDFImpl for GeoMeanUdf {
 ///    fn as_any(&self) -> &dyn Any { self }
@@ -415,7 +433,7 @@ where
 ///        ])
 ///    }
 ///    fn documentation(&self) -> Option<&Documentation> {
-///        Some(get_doc())  
+///        Some(get_doc())
 ///    }
 /// }
 ///
@@ -425,108 +443,35 @@ where
 /// // Call the function `geo_mean(col)`
 /// let expr = geometric_mean.call(vec![col("a")]);
 /// ```
-pub trait AggregateUDFImpl: Debug + Send + Sync {
-    // Note: When adding any methods (with default implementations), remember to add them also
-    // into the AliasedAggregateUDFImpl below!
-
+pub trait AggregateUDFImpl: Debug + DynEq + DynHash + Send + Sync {
     /// Returns this object as an [`Any`] trait object
     fn as_any(&self) -> &dyn Any;
 
     /// Returns this function's name
     fn name(&self) -> &str;
 
+    /// Returns any aliases (alternate names) for this function.
+    ///
+    /// Note: `aliases` should only include names other than [`Self::name`].
+    /// Defaults to `[]` (no aliases)
+    fn aliases(&self) -> &[String] {
+        &[]
+    }
+
     /// Returns the name of the column this expression would create
     ///
     /// See [`Expr::schema_name`] for details
     ///
     /// Example of schema_name: count(DISTINCT column1) FILTER (WHERE column2 > 10) ORDER BY [..]
     fn schema_name(&self, params: &AggregateFunctionParams) -> Result<String> {
-        let AggregateFunctionParams {
-            args,
-            distinct,
-            filter,
-            order_by,
-            null_treatment,
-        } = params;
-
-        // exclude the first function argument(= column) in ordered set aggregate function,
-        // because it is duplicated with the WITHIN GROUP clause in schema name.
-        let args = if self.is_ordered_set_aggregate() {
-            &args[1..]
-        } else {
-            &args[..]
-        };
-
-        let mut schema_name = String::new();
-
-        schema_name.write_fmt(format_args!(
-            "{}({}{})",
-            self.name(),
-            if *distinct { "DISTINCT " } else { "" },
-            schema_name_from_exprs_comma_separated_without_space(args)?
-        ))?;
-
-        if let Some(null_treatment) = null_treatment {
-            schema_name.write_fmt(format_args!(" {null_treatment}"))?;
-        }
-
-        if let Some(filter) = filter {
-            schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?;
-        };
-
-        if let Some(order_by) = order_by {
-            let clause = match self.is_ordered_set_aggregate() {
-                true => "WITHIN GROUP",
-                false => "ORDER BY",
-            };
-
-            schema_name.write_fmt(format_args!(
-                " {} [{}]",
-                clause,
-                schema_name_from_sorts(order_by)?
-            ))?;
-        };
-
-        Ok(schema_name)
+        udaf_default_schema_name(self, params)
     }
 
     /// Returns a human readable expression.
     ///
     /// See [`Expr::human_display`] for details.
     fn human_display(&self, params: &AggregateFunctionParams) -> Result<String> {
-        let AggregateFunctionParams {
-            args,
-            distinct,
-            filter,
-            order_by,
-            null_treatment,
-        } = params;
-
-        let mut schema_name = String::new();
-
-        schema_name.write_fmt(format_args!(
-            "{}({}{})",
-            self.name(),
-            if *distinct { "DISTINCT " } else { "" },
-            ExprListDisplay::comma_separated(args.as_slice())
-        ))?;
-
-        if let Some(null_treatment) = null_treatment {
-            schema_name.write_fmt(format_args!(" {null_treatment}"))?;
-        }
-
-        if let Some(filter) = filter {
-            schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?;
-        };
-
-        if let Some(order_by) = order_by {
-            schema_name.write_fmt(format_args!(
-                " ORDER BY [{}]",
-                schema_name_from_sorts(order_by)?
-            ))?;
-        };
-
-        Ok(schema_name)
+        udaf_default_human_display(self, params)
     }
 
     /// Returns the name of the column this expression would create
@@ -540,42 +485,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         &self,
         params: &WindowFunctionParams,
     ) -> Result<String> {
-        let WindowFunctionParams {
-            args,
-            partition_by,
-            order_by,
-            window_frame,
-            null_treatment,
-        } = params;
-
-        let mut schema_name = String::new();
-        schema_name.write_fmt(format_args!(
-            "{}({})",
-            self.name(),
-            schema_name_from_exprs(args)?
-        ))?;
-
-        if let Some(null_treatment) = null_treatment {
-            schema_name.write_fmt(format_args!(" {null_treatment}"))?;
-        }
-
-        if !partition_by.is_empty() {
-            schema_name.write_fmt(format_args!(
-                " PARTITION BY [{}]",
-                schema_name_from_exprs(partition_by)?
-            ))?;
-        }
-
-        if !order_by.is_empty() {
-            schema_name.write_fmt(format_args!(
-                " ORDER BY [{}]",
-                schema_name_from_sorts(order_by)?
-            ))?;
-        };
-
-        schema_name.write_fmt(format_args!(" {window_frame}"))?;
-
-        Ok(schema_name)
+        udaf_default_window_function_schema_name(self, params)
     }
 
     /// Returns the user-defined display name of function, given the arguments
@@ -585,40 +495,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
     ///
     /// Defaults to `function_name([DISTINCT] column1, column2, ..) [null_treatment] [filter] [order_by [..]]`
     fn display_name(&self, params: &AggregateFunctionParams) -> Result<String> {
-        let AggregateFunctionParams {
-            args,
-            distinct,
-            filter,
-            order_by,
-            null_treatment,
-        } = params;
-
-        let mut display_name = String::new();
-
-        display_name.write_fmt(format_args!(
-            "{}({}{})",
-            self.name(),
-            if *distinct { "DISTINCT " } else { "" },
-            expr_vec_fmt!(args)
-        ))?;
-
-        if let Some(nt) = null_treatment {
-            display_name.write_fmt(format_args!(" {nt}"))?;
-        }
-        if let Some(fe) = filter {
-            display_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?;
-        }
-        if let Some(ob) = order_by {
-            display_name.write_fmt(format_args!(
-                " ORDER BY [{}]",
-                ob.iter()
-                    .map(|o| format!("{o}"))
-                    .collect::<Vec<String>>()
-                    .join(", ")
-            ))?;
-        }
-
-        Ok(display_name)
+        udaf_default_display_name(self, params)
     }
 
     /// Returns the user-defined display name of function, given the arguments
@@ -633,44 +510,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         &self,
         params: &WindowFunctionParams,
     ) -> Result<String> {
-        let WindowFunctionParams {
-            args,
-            partition_by,
-            order_by,
-            window_frame,
-            null_treatment,
-        } = params;
-
-        let mut display_name = String::new();
-
-        display_name.write_fmt(format_args!(
-            "{}({})",
-            self.name(),
-            expr_vec_fmt!(args)
-        ))?;
-
-        if let Some(null_treatment) = null_treatment {
-            display_name.write_fmt(format_args!(" {null_treatment}"))?;
-        }
-
-        if !partition_by.is_empty() {
-            display_name.write_fmt(format_args!(
-                " PARTITION BY [{}]",
-                expr_vec_fmt!(partition_by)
-            ))?;
-        }
-
-        if !order_by.is_empty() {
-            display_name
-                .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?;
-        };
-
-        display_name.write_fmt(format_args!(
-            " {} BETWEEN {} AND {}",
-            window_frame.units, window_frame.start_bound, window_frame.end_bound
-        ))?;
-
-        Ok(display_name)
+        udaf_default_window_function_display_name(self, params)
     }
 
     /// Returns the function's [`Signature`] for information about what input
@@ -699,15 +539,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
     ///    their **types**.
     /// 3. return types based on metadata within the fields of the inputs
     fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
-        let arg_types: Vec<_> =
-            arg_fields.iter().map(|f| f.data_type()).cloned().collect();
-        let data_type = self.return_type(&arg_types)?;
-
-        Ok(Arc::new(Field::new(
-            self.name(),
-            data_type,
-            self.is_nullable(),
-        )))
+        udaf_default_return_field(self, arg_fields)
     }
 
     /// Whether the aggregate function is nullable.
@@ -749,11 +581,12 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
     /// be derived from `name`. See [`format_state_name`] for a utility function
     /// to generate a unique name.
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        let fields = vec![args
-            .return_field
-            .as_ref()
-            .clone()
-            .with_name(format_state_name(args.name, "value"))];
+        let fields = vec![
+            args.return_field
+                .as_ref()
+                .clone()
+                .with_name(format_state_name(args.name, "value")),
+        ];
 
         Ok(fields
             .into_iter()
@@ -788,20 +621,12 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet")
     }
 
-    /// Returns any aliases (alternate names) for this function.
-    ///
-    /// Note: `aliases` should only include names other than [`Self::name`].
-    /// Defaults to `[]` (no aliases)
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     /// Sliding accumulator is an alternative accumulator that can be used for
     /// window functions. It has retract method to revert the previous update.
     ///
     /// See [retract_batch] for more details.
     ///
-    /// [retract_batch]: datafusion_expr_common::accumulator::Accumulator::retract_batch
+    /// [retract_batch]: Accumulator::retract_batch
     fn create_sliding_accumulator(
         &self,
         args: AccumulatorArgs,
@@ -842,31 +667,114 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         AggregateOrderSensitivity::HardRequirement
     }
 
-    /// Optionally apply per-UDaF simplification / rewrite rules.
+    /// Returns an optional hook for simplifying this user-defined aggregate.
+    ///
+    /// Use this hook to apply function-specific rewrites during optimization.
+    /// The default implementation returns `None`.
+    ///
+    /// For example, `percentile_cont(x, 0.0)` and `percentile_cont(x, 1.0)` can
+    /// be rewritten to `MIN(x)` or `MAX(x)` depending on the `ORDER BY`
+    /// direction.
     ///
-    /// This can be used to apply function specific simplification rules during
-    /// optimization (e.g. `arrow_cast` --> `Expr::Cast`). The default
-    /// implementation does nothing.
+    /// DataFusion already simplifies arguments and performs constant folding
+    /// (for example, `my_add(1, 2) -> 3`). For nested expressions, the optimizer
+    /// runs simplification in multiple passes, so arguments are typically
+    /// simplified before this hook is invoked. As a result, UDF implementations
+    /// usually do not need to handle argument simplification themselves.
     ///
-    /// Note that DataFusion handles simplifying arguments and  "constant
-    /// folding" (replacing a function call with constant arguments such as
-    /// `my_add(1,2) --> 3` ). Thus, there is no need to implement such
-    /// optimizations manually for specific UDFs.
+    /// See configuration `datafusion.optimizer.max_passes` for details on how many
+    /// optimization passes may be applied.
     ///
     /// # Returns
     ///
-    /// [None] if simplify is not defined or,
+    /// `None` if simplify is not defined.
     ///
-    /// Or, a closure with two arguments:
-    /// * 'aggregate_function': [crate::expr::AggregateFunction] for which simplified has been invoked
-    /// * 'info': [crate::simplify::SimplifyInfo]
+    /// Or, a closure ([`AggregateFunctionSimplification`]) invoked with:
+    /// * `aggregate_function`: [AggregateFunction] with already simplified
+    ///   arguments
+    /// * `info`: [crate::simplify::SimplifyContext]
     ///
-    /// closure returns simplified [Expr] or an error.
+    /// The closure returns a simplified [Expr] or an error.
+    ///
+    /// # Notes
     ///
+    /// The returned expression must have the same schema as the original
+    /// expression, including both the data type and nullability. For example,
+    /// if the original expression is nullable, the returned expression must
+    /// also be nullable, otherwise it may lead to schema verification errors
+    /// later in query planning.
     fn simplify(&self) -> Option<AggregateFunctionSimplification> {
         None
     }
 
+    /// Rewrite the aggregate to have simpler arguments
+    ///
+    /// This query pattern is not common in most real workloads, and most
+    /// aggregate implementations can safely ignore it. This API is included in
+    /// DataFusion because it is important for ClickBench Q29. See backstory
+    /// on <https://github.com/apache/datafusion/issues/15524>
+    ///
+    /// # Rewrite Overview
+    ///
+    /// The idea is to rewrite multiple aggregates with "complex arguments" into
+    /// ones with simpler arguments that can be optimized by common subexpression
+    /// elimination (CSE). At a high level the rewrite looks like
+    ///
+    /// * `Aggregate(SUM(x + 1), SUM(x + 2), ...)`
+    ///
+    /// Into
+    ///
+    /// * `Aggregate(SUM(x) + 1 * COUNT(x), SUM(x) + 2 * COUNT(x), ...)`
+    ///
+    /// While this rewrite may seem worse (slower) than the original as it
+    /// computes *more* aggregate expressions, the common subexpression
+    /// elimination (CSE) can then reduce the number of distinct aggregates the
+    /// query actually needs to compute with a rewrite like
+    ///
+    /// * `Projection(_A + 1*_B, _A + 2*_B)`
+    /// * `  Aggregate(_A = SUM(x), _B = COUNT(x))`
+    ///
+    /// This optimization is extremely important for ClickBench Q29, which has 90
+    /// such expressions for some reason, and so this optimization results in
+    /// only two aggregates being needed. The DataFusion optimizer will invoke
+    /// this method when it detects multiple aggregates in a query that share
+    /// arguments of the form `<arg> <op> <literal>`.
+    ///
+    /// # API
+    ///
+    /// If `agg_function` supports the rewrite, it should return a semantically
+    /// equivalent expression (likely with more aggregate expressions, but
+    /// simpler arguments)
+    ///
+    /// This is only called when:
+    /// 1. There are no "special" aggregate params (filters, null handling, etc)
+    /// 2. Aggregate functions with exactly one [`Expr`] argument
+    /// 3. There are no volatile expressions
+    ///
+    /// Arguments
+    /// * `agg_function`: the original aggregate function detected with complex
+    ///   arguments.
+    /// * `arg`: The common argument shared across multiple aggregates (e.g. `x`
+    ///   in the example above)
+    /// * `op`: the operator between the common argument and the literal (e.g.
+    ///   `+` in `x + 1` or `1 + x`)
+    /// * `lit`: the literal argument (e.g. `1` or `2` in the example above)
+    /// * `arg_is_left`: whether the common argument is on the left or right of
+    ///   the operator (e.g. `true` for `x + 1` and false for `1 + x`)
+    ///
+    /// The default implementation returns `None`, which is what most aggregates
+    /// should do.
+    fn simplify_expr_op_literal(
+        &self,
+        _agg_function: &AggregateFunction,
+        _arg: &Expr,
+        _op: Operator,
+        _lit: &Expr,
+        _arg_is_left: bool,
+    ) -> Result<Option<Expr>> {
+        Ok(None)
+    }
+
     /// Returns the reverse expression of the aggregate function.
     fn reverse_expr(&self) -> ReversedUDAF {
         ReversedUDAF::NotSupported
@@ -895,33 +803,6 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         not_impl_err!("Function {} does not implement coerce_types", self.name())
     }
 
-    /// Return true if this aggregate UDF is equal to the other.
-    ///
-    /// Allows customizing the equality of aggregate UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]:
-    ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
-    ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
-    fn equals(&self, other: &dyn AggregateUDFImpl) -> bool {
-        self.name() == other.name() && self.signature() == other.signature()
-    }
-
-    /// Returns a hash value for this aggregate UDF.
-    ///
-    /// Allows customizing the hash code of aggregate UDFs. Similarly to [`Hash`] and [`Eq`],
-    /// if [`Self::equals`] returns true for two UDFs, their `hash_value`s must be the same.
-    ///
-    /// By default, hashes [`Self::name`] and [`Self::signature`].
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.name().hash(hasher);
-        self.signature().hash(hasher);
-        hasher.finish()
-    }
-
     /// If this function is max, return true
     /// If the function is min, return false
     /// Otherwise return None (the default)
@@ -952,15 +833,66 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         ScalarValue::try_from(data_type)
     }
 
-    /// If this function supports `[IGNORE NULLS | RESPECT NULLS]` clause, return true
-    /// If the function does not, return false
+    /// If this function supports `[IGNORE NULLS | RESPECT NULLS]` SQL clause,
+    /// return `true`. Otherwise, return `false` which will cause an error to be
+    /// raised during SQL parsing if these clauses are detected for this function.
+    ///
+    /// Functions which implement this as `true` are expected to handle the resulting
+    /// null handling config present in [`AccumulatorArgs`], `ignore_nulls`.
     fn supports_null_handling_clause(&self) -> bool {
-        true
+        false
     }
 
-    /// If this function is ordered-set aggregate function, return true
-    /// If the function is not, return false
-    fn is_ordered_set_aggregate(&self) -> bool {
+    /// If this function supports the `WITHIN GROUP (ORDER BY column [ASC|DESC])`
+    /// SQL syntax, return `true`. Otherwise, return `false` (default) which will
+    /// cause an error when parsing SQL where this syntax is detected for this
+    /// function.
+    ///
+    /// This function should return `true` for ordered-set aggregate functions
+    /// only.
+    ///
+    /// # Ordered-set aggregate functions
+    ///
+    /// Ordered-set aggregate functions allow specifying a sort order that affects
+    /// how the function calculates its result, unlike other aggregate functions
+    /// like `sum` or `count`. For example, `percentile_cont` is an ordered-set
+    /// aggregate function that calculates the exact percentile value from a list
+    /// of values; the output of calculating the `0.75` percentile depends on if
+    /// you're calculating on an ascending or descending list of values.
+    ///
+    /// An example of how an ordered-set aggregate function is called with the
+    /// `WITHIN GROUP` SQL syntax:
+    ///
+    /// ```sql
+    /// -- Ascending
+    /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 ASC) FROM table;
+    /// -- Default ordering is ascending if not explicitly specified
+    /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1) FROM table;
+    /// -- Descending
+    /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 DESC) FROM table;
+    /// ```
+    ///
+    /// This calculates the `0.75` percentile of the column `c1` from `table`,
+    /// according to the specific ordering. The column specified in the `WITHIN GROUP`
+    /// ordering clause is taken as the column to calculate values on; specifying
+    /// the `WITHIN GROUP` clause is optional so these queries are equivalent:
+    ///
+    /// ```sql
+    /// -- If no WITHIN GROUP is specified then default ordering is implementation
+    /// -- dependent; in this case ascending for percentile_cont
+    /// SELECT percentile_cont(c1, 0.75) FROM table;
+    /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 ASC) FROM table;
+    /// ```
+    ///
+    /// Aggregate UDFs can define their default ordering if the function is called
+    /// without the `WITHIN GROUP` clause, though a default of ascending is the
+    /// standard practice.
+    ///
+    /// Ordered-set aggregate function implementations are responsible for handling
+    /// the input sort order themselves (e.g. `percentile_cont` must buffer and
+    /// sort the values internally). That is, DataFusion does not introduce any
+    /// kind of sort into the plan for these functions with this syntax.
+    fn supports_within_group_clause(&self) -> bool {
         false
     }
 
@@ -981,22 +913,290 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
 
 impl PartialEq for dyn AggregateUDFImpl {
     fn eq(&self, other: &Self) -> bool {
-        self.equals(other)
+        self.dyn_eq(other.as_any())
     }
 }
 
-// Manual implementation of `PartialOrd`
-// There might be some wackiness with it, but this is based on the impl of eq for AggregateUDFImpl
-// https://users.rust-lang.org/t/how-to-compare-two-trait-objects-for-equality/88063/5
 impl PartialOrd for dyn AggregateUDFImpl {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         match self.name().partial_cmp(other.name()) {
             Some(Ordering::Equal) => self.signature().partial_cmp(other.signature()),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
+/// Encapsulates default implementation of [`AggregateUDFImpl::schema_name`].
+pub fn udaf_default_schema_name<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    params: &AggregateFunctionParams,
+) -> Result<String> {
+    let AggregateFunctionParams {
+        args,
+        distinct,
+        filter,
+        order_by,
+        null_treatment,
+    } = params;
+
+    // exclude the first function argument(= column) in ordered set aggregate function,
+    // because it is duplicated with the WITHIN GROUP clause in schema name.
+    let args = if func.supports_within_group_clause() && !order_by.is_empty() {
+        &args[1..]
+    } else {
+        &args[..]
+    };
+
+    let mut schema_name = String::new();
+
+    schema_name.write_fmt(format_args!(
+        "{}({}{})",
+        func.name(),
+        if *distinct { "DISTINCT " } else { "" },
+        schema_name_from_exprs_comma_separated_without_space(args)?
+    ))?;
+
+    if let Some(null_treatment) = null_treatment {
+        schema_name.write_fmt(format_args!(" {null_treatment}"))?;
+    }
+
+    if let Some(filter) = filter {
+        schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?;
+    };
+
+    if !order_by.is_empty() {
+        let clause = match func.supports_within_group_clause() {
+            true => "WITHIN GROUP",
+            false => "ORDER BY",
+        };
+
+        schema_name.write_fmt(format_args!(
+            " {} [{}]",
+            clause,
+            schema_name_from_sorts(order_by)?
+        ))?;
+    };
+
+    Ok(schema_name)
+}
+
+/// Encapsulates default implementation of [`AggregateUDFImpl::human_display`].
+pub fn udaf_default_human_display<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    params: &AggregateFunctionParams,
+) -> Result<String> {
+    let AggregateFunctionParams {
+        args,
+        distinct,
+        filter,
+        order_by,
+        null_treatment,
+    } = params;
+
+    let mut schema_name = String::new();
+
+    schema_name.write_fmt(format_args!(
+        "{}({}{})",
+        func.name(),
+        if *distinct { "DISTINCT " } else { "" },
+        ExprListDisplay::comma_separated(args.as_slice())
+    ))?;
+
+    if let Some(null_treatment) = null_treatment {
+        schema_name.write_fmt(format_args!(" {null_treatment}"))?;
+    }
+
+    if let Some(filter) = filter {
+        schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?;
+    };
+
+    if !order_by.is_empty() {
+        schema_name.write_fmt(format_args!(
+            " ORDER BY [{}]",
+            schema_name_from_sorts(order_by)?
+        ))?;
+    };
+
+    Ok(schema_name)
+}
+
+/// Encapsulates default implementation of [`AggregateUDFImpl::window_function_schema_name`].
+pub fn udaf_default_window_function_schema_name<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    params: &WindowFunctionParams,
+) -> Result<String> {
+    let WindowFunctionParams {
+        args,
+        partition_by,
+        order_by,
+        window_frame,
+        filter,
+        null_treatment,
+        distinct,
+    } = params;
+
+    let mut schema_name = String::new();
+
+    // Inject DISTINCT into the schema name when requested
+    if *distinct {
+        schema_name.write_fmt(format_args!(
+            "{}(DISTINCT {})",
+            func.name(),
+            schema_name_from_exprs(args)?
+        ))?;
+    } else {
+        schema_name.write_fmt(format_args!(
+            "{}({})",
+            func.name(),
+            schema_name_from_exprs(args)?
+        ))?;
+    }
+
+    if let Some(null_treatment) = null_treatment {
+        schema_name.write_fmt(format_args!(" {null_treatment}"))?;
+    }
+
+    if let Some(filter) = filter {
+        schema_name.write_fmt(format_args!(" FILTER (WHERE {filter})"))?;
+    }
+
+    if !partition_by.is_empty() {
+        schema_name.write_fmt(format_args!(
+            " PARTITION BY [{}]",
+            schema_name_from_exprs(partition_by)?
+        ))?;
+    }
+
+    if !order_by.is_empty() {
+        schema_name.write_fmt(format_args!(
+            " ORDER BY [{}]",
+            schema_name_from_sorts(order_by)?
+        ))?;
+    }
+
+    schema_name.write_fmt(format_args!(" {window_frame}"))?;
+
+    Ok(schema_name)
+}
+
+/// Encapsulates default implementation of [`AggregateUDFImpl::display_name`].
+pub fn udaf_default_display_name<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    params: &AggregateFunctionParams,
+) -> Result<String> {
+    let AggregateFunctionParams {
+        args,
+        distinct,
+        filter,
+        order_by,
+        null_treatment,
+    } = params;
+
+    let mut display_name = String::new();
+
+    display_name.write_fmt(format_args!(
+        "{}({}{})",
+        func.name(),
+        if *distinct { "DISTINCT " } else { "" },
+        expr_vec_fmt!(args)
+    ))?;
+
+    if let Some(nt) = null_treatment {
+        display_name.write_fmt(format_args!(" {nt}"))?;
+    }
+    if let Some(fe) = filter {
+        display_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?;
+    }
+    if !order_by.is_empty() {
+        display_name.write_fmt(format_args!(
+            " ORDER BY [{}]",
+            order_by
+                .iter()
+                .map(|o| format!("{o}"))
+                .collect::<Vec<String>>()
+                .join(", ")
+        ))?;
+    }
+
+    Ok(display_name)
+}
+
+/// Encapsulates default implementation of [`AggregateUDFImpl::window_function_display_name`].
+pub fn udaf_default_window_function_display_name<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    params: &WindowFunctionParams,
+) -> Result<String> {
+    let WindowFunctionParams {
+        args,
+        partition_by,
+        order_by,
+        window_frame,
+        filter,
+        null_treatment,
+        distinct,
+    } = params;
+
+    let mut display_name = String::new();
+
+    if *distinct {
+        display_name.write_fmt(format_args!(
+            "{}(DISTINCT {})",
+            func.name(),
+            expr_vec_fmt!(args)
+        ))?;
+    } else {
+        display_name.write_fmt(format_args!(
+            "{}({})",
+            func.name(),
+            expr_vec_fmt!(args)
+        ))?;
+    }
+
+    if let Some(null_treatment) = null_treatment {
+        display_name.write_fmt(format_args!(" {null_treatment}"))?;
+    }
+
+    if let Some(fe) = filter {
+        display_name.write_fmt(format_args!(" FILTER (WHERE {fe})"))?;
+    }
+
+    if !partition_by.is_empty() {
+        display_name.write_fmt(format_args!(
+            " PARTITION BY [{}]",
+            expr_vec_fmt!(partition_by)
+        ))?;
+    }
+
+    if !order_by.is_empty() {
+        display_name
+            .write_fmt(format_args!(" ORDER BY [{}]", expr_vec_fmt!(order_by)))?;
+    };
+
+    display_name.write_fmt(format_args!(
+        " {} BETWEEN {} AND {}",
+        window_frame.units, window_frame.start_bound, window_frame.end_bound
+    ))?;
+
+    Ok(display_name)
+}
+
+/// Encapsulates default implementation of [`AggregateUDFImpl::return_field`].
+pub fn udaf_default_return_field<F: AggregateUDFImpl + ?Sized>(
+    func: &F,
+    arg_fields: &[FieldRef],
+) -> Result<FieldRef> {
+    let arg_types: Vec<_> = arg_fields.iter().map(|f| f.data_type()).cloned().collect();
+    let data_type = func.return_type(&arg_types)?;
+
+    Ok(Arc::new(Field::new(
+        func.name(),
+        data_type,
+        func.is_nullable(),
+    )))
+}
+
 pub enum ReversedUDAF {
     /// The expression is the same as the original expression, like SUM, COUNT
     Identical,
@@ -1008,9 +1208,9 @@ pub enum ReversedUDAF {
 
 /// AggregateUDF that adds an alias to the underlying function. It is better to
 /// implement [`AggregateUDFImpl`], which supports aliases, directly if possible.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct AliasedAggregateUDFImpl {
-    inner: Arc<dyn AggregateUDFImpl>,
+    inner: UdfEq<Arc<dyn AggregateUDFImpl>>,
     aliases: Vec<String>,
 }
 
@@ -1022,10 +1222,14 @@ impl AliasedAggregateUDFImpl {
         let mut aliases = inner.aliases().to_vec();
         aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
 
-        Self { inner, aliases }
+        Self {
+            inner: inner.into(),
+            aliases,
+        }
     }
 }
 
+#[warn(clippy::missing_trait_methods)] // Delegates, so it should implement every single trait method
 impl AggregateUDFImpl for AliasedAggregateUDFImpl {
     fn as_any(&self) -> &dyn Any {
         self
@@ -1051,6 +1255,32 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl {
         &self.aliases
     }
 
+    fn schema_name(&self, params: &AggregateFunctionParams) -> Result<String> {
+        self.inner.schema_name(params)
+    }
+
+    fn human_display(&self, params: &AggregateFunctionParams) -> Result<String> {
+        self.inner.human_display(params)
+    }
+
+    fn window_function_schema_name(
+        &self,
+        params: &WindowFunctionParams,
+    ) -> Result<String> {
+        self.inner.window_function_schema_name(params)
+    }
+
+    fn display_name(&self, params: &AggregateFunctionParams) -> Result<String> {
+        self.inner.display_name(params)
+    }
+
+    fn window_function_display_name(
+        &self,
+        params: &WindowFunctionParams,
+    ) -> Result<String> {
+        self.inner.window_function_display_name(params)
+    }
+
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         self.inner.state_fields(args)
     }
@@ -1082,7 +1312,7 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl {
             .map(|udf| {
                 udf.map(|udf| {
                     Arc::new(AliasedAggregateUDFImpl {
-                        inner: udf,
+                        inner: udf.into(),
                         aliases: self.aliases.clone(),
                     }) as Arc<dyn AggregateUDFImpl>
                 })
@@ -1097,6 +1327,18 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl {
         self.inner.simplify()
     }
 
+    fn simplify_expr_op_literal(
+        &self,
+        agg_function: &AggregateFunction,
+        arg: &Expr,
+        op: Operator,
+        lit: &Expr,
+        arg_is_left: bool,
+    ) -> Result<Option<Expr>> {
+        self.inner
+            .simplify_expr_op_literal(agg_function, arg, op, lit, arg_is_left)
+    }
+
     fn reverse_expr(&self) -> ReversedUDAF {
         self.inner.reverse_expr()
     }
@@ -1105,59 +1347,41 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl {
         self.inner.coerce_types(arg_types)
     }
 
-    fn equals(&self, other: &dyn AggregateUDFImpl) -> bool {
-        if let Some(other) = other.as_any().downcast_ref::<AliasedAggregateUDFImpl>() {
-            self.inner.equals(other.inner.as_ref()) && self.aliases == other.aliases
-        } else {
-            false
-        }
+    fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
+        self.inner.return_field(arg_fields)
     }
 
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.inner.hash_value().hash(hasher);
-        self.aliases.hash(hasher);
-        hasher.finish()
+    fn is_nullable(&self) -> bool {
+        self.inner.is_nullable()
     }
 
     fn is_descending(&self) -> Option<bool> {
         self.inner.is_descending()
     }
 
-    fn documentation(&self) -> Option<&Documentation> {
-        self.inner.documentation()
+    fn value_from_stats(&self, statistics_args: &StatisticsArgs) -> Option<ScalarValue> {
+        self.inner.value_from_stats(statistics_args)
     }
-}
 
-// Aggregate UDF doc sections for use in public documentation
-pub mod aggregate_doc_sections {
-    use crate::DocSection;
+    fn default_value(&self, data_type: &DataType) -> Result<ScalarValue> {
+        self.inner.default_value(data_type)
+    }
 
-    pub fn doc_sections() -> Vec<DocSection> {
-        vec![
-            DOC_SECTION_GENERAL,
-            DOC_SECTION_STATISTICAL,
-            DOC_SECTION_APPROXIMATE,
-        ]
+    fn supports_null_handling_clause(&self) -> bool {
+        self.inner.supports_null_handling_clause()
     }
 
-    pub const DOC_SECTION_GENERAL: DocSection = DocSection {
-        include: true,
-        label: "General Functions",
-        description: None,
-    };
+    fn supports_within_group_clause(&self) -> bool {
+        self.inner.supports_within_group_clause()
+    }
 
-    pub const DOC_SECTION_STATISTICAL: DocSection = DocSection {
-        include: true,
-        label: "Statistical Functions",
-        description: None,
-    };
+    fn set_monotonicity(&self, data_type: &DataType) -> SetMonotonicity {
+        self.inner.set_monotonicity(data_type)
+    }
 
-    pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection {
-        include: true,
-        label: "Approximate Functions",
-        description: None,
-    };
+    fn documentation(&self) -> Option<&Documentation> {
+        self.inner.documentation()
+    }
 }
 
 /// Indicates whether an aggregation function is monotonic as a set
@@ -1193,8 +1417,9 @@ mod test {
     };
     use std::any::Any;
     use std::cmp::Ordering;
+    use std::hash::{DefaultHasher, Hash, Hasher};
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct AMeanUdf {
         signature: Signature,
     }
@@ -1235,7 +1460,7 @@ mod test {
         }
     }
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct BMeanUdf {
         signature: Signature,
     }
@@ -1275,6 +1500,16 @@ mod test {
         }
     }
 
+    #[test]
+    fn test_partial_eq() {
+        let a1 = AggregateUDF::from(AMeanUdf::new());
+        let a2 = AggregateUDF::from(AMeanUdf::new());
+        let eq = a1 == a2;
+        assert!(eq);
+        assert_eq!(a1, a2);
+        assert_eq!(hash(a1), hash(a2));
+    }
+
     #[test]
     fn test_partial_ord() {
         // Test validates that partial ord is defined for AggregateUDF using the name and signature,
@@ -1287,4 +1522,10 @@ mod test {
         assert!(a1 < b1);
         assert!(!(a1 == b1));
     }
+
+    fn hash<T: Hash>(value: T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
 }
diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs
index 816929a1fba17..9286dd30b11a7 100644
--- a/datafusion/expr/src/udf.rs
+++ b/datafusion/expr/src/udf.rs
@@ -17,17 +17,25 @@
 
 //! [`ScalarUDF`]: Scalar User Defined Functions
 
+use crate::async_udf::AsyncScalarUDF;
 use crate::expr::schema_name_from_exprs_comma_separated_without_space;
-use crate::simplify::{ExprSimplifyResult, SimplifyInfo};
+use crate::preimage::PreimageResult;
+use crate::simplify::{ExprSimplifyResult, SimplifyContext};
 use crate::sort_properties::{ExprProperties, SortProperties};
+use crate::udf_eq::UdfEq;
 use crate::{ColumnarValue, Documentation, Expr, Signature};
 use arrow::datatypes::{DataType, Field, FieldRef};
-use datafusion_common::{not_impl_err, ExprSchema, Result, ScalarValue};
+#[cfg(debug_assertions)]
+use datafusion_common::assert_or_internal_err;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{ExprSchema, Result, ScalarValue, not_impl_err};
+use datafusion_expr_common::dyn_eq::{DynEq, DynHash};
 use datafusion_expr_common::interval_arithmetic::Interval;
+use datafusion_expr_common::placement::ExpressionPlacement;
 use std::any::Any;
 use std::cmp::Ordering;
 use std::fmt::Debug;
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
 /// Logical representation of a Scalar User Defined Function.
@@ -50,8 +58,8 @@ use std::sync::Arc;
 /// compatibility with the older API.
 ///
 /// [`create_udf`]: crate::expr_fn::create_udf
-/// [`simple_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udf.rs
-/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
+/// [`simple_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udf.rs
+/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
 #[derive(Debug, Clone)]
 pub struct ScalarUDF {
     inner: Arc<dyn ScalarUDFImpl>,
@@ -59,17 +67,36 @@ pub struct ScalarUDF {
 
 impl PartialEq for ScalarUDF {
     fn eq(&self, other: &Self) -> bool {
-        self.inner.equals(other.inner.as_ref())
+        self.inner.dyn_eq(other.inner.as_any())
     }
 }
 
-// Manual implementation based on `ScalarUDFImpl::equals`
 impl PartialOrd for ScalarUDF {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        match self.name().partial_cmp(other.name()) {
-            Some(Ordering::Equal) => self.signature().partial_cmp(other.signature()),
-            cmp => cmp,
+        let mut cmp = self.name().cmp(other.name());
+        if cmp == Ordering::Equal {
+            cmp = self.signature().partial_cmp(other.signature())?;
         }
+        if cmp == Ordering::Equal {
+            cmp = self.aliases().partial_cmp(other.aliases())?;
+        }
+        // Contract for PartialOrd and PartialEq consistency requires that
+        // a == b if and only if partial_cmp(a, b) == Some(Equal).
+        if cmp == Ordering::Equal && self != other {
+            // Functions may have other properties besides name and signature
+            // that differentiate two instances (e.g. type, or arbitrary parameters).
+            // We cannot return Some(Equal) in such case.
+            return None;
+        }
+        debug_assert!(
+            cmp == Ordering::Equal || self != other,
+            "Detected incorrect implementation of PartialEq when comparing functions: '{}' and '{}'. \
+            The functions compare as equal, but they are not equal based on general properties that \
+            the PartialOrd implementation observes,",
+            self.name(),
+            other.name()
+        );
+        Some(cmp)
     }
 }
 
@@ -77,7 +104,7 @@ impl Eq for ScalarUDF {}
 
 impl Hash for ScalarUDF {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.inner.hash_value().hash(state)
+        self.inner.dyn_hash(state)
     }
 }
 
@@ -140,7 +167,12 @@ impl ScalarUDF {
     /// Returns this function's display_name.
     ///
     /// See [`ScalarUDFImpl::display_name`] for more details
+    #[deprecated(
+        since = "50.0.0",
+        note = "This method is unused and will be removed in a future release"
+    )]
     pub fn display_name(&self, args: &[Expr]) -> Result<String> {
+        #[expect(deprecated)]
         self.inner.display_name(args)
     }
 
@@ -185,30 +217,75 @@ impl ScalarUDF {
         self.inner.return_field_from_args(args)
     }
 
-    /// Do the function rewrite
+    /// Returns this scalar function's simplification result.
     ///
     /// See [`ScalarUDFImpl::simplify`] for more details.
     pub fn simplify(
         &self,
         args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         self.inner.simplify(args, info)
     }
 
-    #[allow(deprecated)]
+    #[deprecated(since = "50.0.0", note = "Use `return_field_from_args` instead.")]
     pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
+        #[expect(deprecated)]
         self.inner.is_nullable(args, schema)
     }
 
+    /// Return a preimage
+    ///
+    /// See [`ScalarUDFImpl::preimage`] for more details.
+    pub fn preimage(
+        &self,
+        args: &[Expr],
+        lit_expr: &Expr,
+        info: &SimplifyContext,
+    ) -> Result<PreimageResult> {
+        self.inner.preimage(args, lit_expr, info)
+    }
+
     /// Invoke the function on `args`, returning the appropriate result.
     ///
     /// See [`ScalarUDFImpl::invoke_with_args`] for details.
     pub fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        self.inner.invoke_with_args(args)
+        #[cfg(debug_assertions)]
+        let return_field = Arc::clone(&args.return_field);
+        let result = self.inner.invoke_with_args(args)?;
+        // Maybe this could be enabled always?
+        // This doesn't use debug_assert!, but it's meant to run anywhere except on production. It's same in spirit, thus conditioning on debug_assertions.
+        #[cfg(debug_assertions)]
+        {
+            let result_data_type = result.data_type();
+            let expected_type = return_field.data_type();
+            assert_or_internal_err!(
+                result_data_type == *expected_type,
+                "Function '{}' returned value of type '{}' while the following type was promised at planning time and expected: '{}'",
+                self.name(),
+                result_data_type,
+                expected_type
+            );
+            // TODO verify return data is non-null when it was promised to be?
+        }
+        Ok(result)
+    }
+
+    /// Determines which of the arguments passed to this function are evaluated eagerly
+    /// and which may be evaluated lazily.
+    ///
+    /// See [ScalarUDFImpl::conditional_arguments] for more information.
+    pub fn conditional_arguments<'a>(
+        &self,
+        args: &'a [Expr],
+    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
+        self.inner.conditional_arguments(args)
     }
 
-    /// Get the circuits of inner implementation
+    /// Returns true if some of this `exprs` subexpressions may not be evaluated
+    /// and thus any side effects (like divide by zero) may not be encountered.
+    ///
+    /// See [ScalarUDFImpl::short_circuits] for more information.
     pub fn short_circuits(&self) -> bool {
         self.inner.short_circuits()
     }
@@ -280,6 +357,18 @@ impl ScalarUDF {
     pub fn documentation(&self) -> Option<&Documentation> {
         self.inner.documentation()
     }
+
+    /// Return true if this function is an async function
+    pub fn as_async(&self) -> Option<&AsyncScalarUDF> {
+        self.inner().as_any().downcast_ref::<AsyncScalarUDF>()
+    }
+
+    /// Returns placement information for this function.
+    ///
+    /// See [`ScalarUDFImpl::placement`] for more details.
+    pub fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+        self.inner.placement(args)
+    }
 }
 
 impl<F> From<F> for ScalarUDF
@@ -293,6 +382,7 @@ where
 
 /// Arguments passed to [`ScalarUDFImpl::invoke_with_args`] when invoking a
 /// scalar function.
+#[derive(Debug, Clone)]
 pub struct ScalarFunctionArgs {
     /// The evaluated arguments to the function
     pub args: Vec<ColumnarValue>,
@@ -304,6 +394,8 @@ pub struct ScalarFunctionArgs {
     /// or `return_field_from_args`) when creating the physical expression
     /// from the logical expression
     pub return_field: FieldRef,
+    /// The config options at execution time
+    pub config_options: Arc<ConfigOptions>,
 }
 
 impl ScalarFunctionArgs {
@@ -342,7 +434,7 @@ pub struct ReturnFieldArgs<'a> {
 /// See [`advanced_udf.rs`] for a full example with complete implementation and
 /// [`ScalarUDF`] for other available options.
 ///
-/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
+/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
 ///
 /// # Basic Example
 /// ```
@@ -354,7 +446,7 @@ pub struct ReturnFieldArgs<'a> {
 /// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF};
 /// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
 /// /// This struct for a simple UDF that adds one to an int32
-/// #[derive(Debug)]
+/// #[derive(Debug, PartialEq, Eq, Hash)]
 /// struct AddOne {
 ///   signature: Signature,
 /// }
@@ -403,22 +495,36 @@ pub struct ReturnFieldArgs<'a> {
 /// // Call the function `add_one(col)`
 /// let expr = add_one.call(vec![col("a")]);
 /// ```
-pub trait ScalarUDFImpl: Debug + Send + Sync {
-    // Note: When adding any methods (with default implementations), remember to add them also
-    // into the AliasedScalarUDFImpl below!
-
+pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync {
     /// Returns this object as an [`Any`] trait object
     fn as_any(&self) -> &dyn Any;
 
     /// Returns this function's name
     fn name(&self) -> &str;
 
+    /// Returns any aliases (alternate names) for this function.
+    ///
+    /// Aliases can be used to invoke the same function using different names.
+    /// For example in some databases `now()` and `current_timestamp()` are
+    /// aliases for the same function. This behavior can be obtained by
+    /// returning `current_timestamp` as an alias for the `now` function.
+    ///
+    /// Note: `aliases` should only include names other than [`Self::name`].
+    /// Defaults to `[]` (no aliases)
+    fn aliases(&self) -> &[String] {
+        &[]
+    }
+
     /// Returns the user-defined display name of function, given the arguments
     ///
     /// This can be used to customize the output column name generated by this
     /// function.
     ///
     /// Defaults to `name(args[0], args[1], ...)`
+    #[deprecated(
+        since = "50.0.0",
+        note = "This method is unused and will be removed in a future release"
+    )]
     fn display_name(&self, args: &[Expr]) -> Result<String> {
         let names: Vec<String> = args.iter().map(ToString::to_string).collect();
         // TODO: join with ", " to standardize the formatting of Vec<Expr>, <https://github.com/apache/datafusion/issues/10364>
@@ -436,22 +542,62 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
         ))
     }
 
-    /// Returns the function's [`Signature`] for information about what input
-    /// types are accepted and the function's Volatility.
+    /// Returns a [`Signature`] describing the argument types for which this
+    /// function has an implementation, and the function's [`Volatility`].
+    ///
+    /// See [`Signature`] for more details on argument type handling
+    /// and [`Self::return_type`] for computing the return type.
+    ///
+    /// [`Volatility`]: datafusion_expr_common::signature::Volatility
     fn signature(&self) -> &Signature;
 
-    /// What [`DataType`] will be returned by this function, given the types of
-    /// the arguments.
+    /// [`DataType`] returned by this function, given the types of the
+    /// arguments.
+    ///
+    /// # Arguments
+    ///
+    /// `arg_types` Data types of the arguments. The implementation of
+    /// `return_type` can assume that some other part of the code has coerced
+    /// the actual argument types to match [`Self::signature`].
     ///
     /// # Notes
     ///
     /// If you provide an implementation for [`Self::return_field_from_args`],
-    /// DataFusion will not call `return_type` (this function). In such cases
-    /// is recommended to return [`DataFusionError::Internal`].
+    /// DataFusion will not call `return_type` (this function). While it is
+    /// valid to to put [`unimplemented!()`] or [`unreachable!()`], it is
+    /// recommended to return [`DataFusionError::Internal`] instead, which
+    /// reduces the severity of symptoms if bugs occur (an error rather than a
+    /// panic).
     ///
     /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
 
+    /// Create a new instance of this function with updated configuration.
+    ///
+    /// This method is called when configuration options change at runtime
+    /// (e.g., via `SET` statements) to allow functions that depend on
+    /// configuration to update themselves accordingly.
+    ///
+    /// Note the current [`ConfigOptions`] are also passed to [`Self::invoke_with_args`] so
+    /// this API is not needed for functions where the values may
+    /// depend on the current options.
+    ///
+    /// This API is useful for functions where the return
+    /// **type** depends on the configuration options, such as the `now()` function
+    /// which depends on the current timezone.
+    ///
+    /// # Arguments
+    ///
+    /// * `config` - The updated configuration options
+    ///
+    /// # Returns
+    ///
+    /// * `Some(ScalarUDF)` - A new instance of this function configured with the new settings
+    /// * `None` - If this function does not change with new configuration settings (the default)
+    fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF> {
+        None
+    }
+
     /// What type will be returned by this function, given the arguments?
     ///
     /// By default, this function calls [`Self::return_type`] with the
@@ -483,10 +629,10 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
     /// # struct Example{}
     /// # impl Example {
     /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
-    ///   // report output is only nullable if any one of the arguments are nullable
-    ///   let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
-    ///   let field = Arc::new(Field::new("ignored_name", DataType::Int32, true));
-    ///   Ok(field)
+    ///     // report output is only nullable if any one of the arguments are nullable
+    ///     let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+    ///     let field = Arc::new(Field::new("ignored_name", DataType::Int32, nullable));
+    ///     Ok(field)
     /// }
     /// # }
     /// ```
@@ -536,19 +682,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
     /// to arrays, which will likely be simpler code, but be slower.
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>;
 
-    /// Returns any aliases (alternate names) for this function.
-    ///
-    /// Aliases can be used to invoke the same function using different names.
-    /// For example in some databases `now()` and `current_timestamp()` are
-    /// aliases for the same function. This behavior can be obtained by
-    /// returning `current_timestamp` as an alias for the `now` function.
-    ///
-    /// Note: `aliases` should only include names other than [`Self::name`].
-    /// Defaults to `[]` (no aliases)
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     /// Optionally apply per-UDF simplification / rewrite rules.
     ///
     /// This can be used to apply function specific simplification rules during
@@ -568,23 +701,168 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
     /// [`ExprSimplifyResult`] indicating the result of the simplification NOTE
     /// if the function cannot be simplified, the arguments *MUST* be returned
     /// unmodified
+    ///
+    /// # Notes
+    ///
+    /// The returned expression must have the same schema as the original
+    /// expression, including both the data type and nullability. For example,
+    /// if the original expression is nullable, the returned expression must
+    /// also be nullable, otherwise it may lead to schema verification errors
+    /// later in query planning.
     fn simplify(
         &self,
         args: Vec<Expr>,
-        _info: &dyn SimplifyInfo,
+        _info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         Ok(ExprSimplifyResult::Original(args))
     }
 
+    /// Returns a single contiguous preimage for this function and the specified
+    /// scalar expression, if any.
+    ///
+    /// Currently only applies to `=, !=, >, >=, <, <=, is distinct from, is not distinct from` predicates
+    /// # Return Value
+    ///
+    /// Implementations should return a half-open interval: inclusive lower
+    /// bound and exclusive upper bound. This is slightly different from normal
+    /// [`Interval`] semantics where the upper bound is closed (inclusive).
+    /// Typically this means the upper endpoint must be adjusted to the next
+    /// value not included in the preimage. See the Half-Open Intervals section
+    /// below for more details.
+    ///
+    /// # Background
+    ///
+    /// Inspired by the [ClickHouse Paper], a "preimage rewrite" transforms a
+    /// predicate containing a function call into a predicate containing an
+    /// equivalent set of input literal (constant) values. The resulting
+    /// predicate can often be further optimized by other rewrites (see
+    /// Examples).
+    ///
+    /// From the paper:
+    ///
+    /// > some functions can compute the preimage of a given function result.
+    /// > This is used to replace comparisons of constants with function calls
+    /// > on the key columns by comparing the key column value with the preimage.
+    /// > For example, `toYear(k) = 2024` can be replaced by
+    /// > `k >= 2024-01-01 && k < 2025-01-01`
+    ///
+    /// For example, given an expression like
+    /// ```sql
+    /// date_part('YEAR', k) = 2024
+    /// ```
+    ///
+    /// The interval `[2024-01-01, 2025-12-31`]` contains all possible input
+    /// values (preimage values) for which the function `date_part(YEAR, k)`
+    /// produces the output value `2024` (image value). Returning the interval
+    /// (note upper bound adjusted up) `[2024-01-01, 2025-01-01]` the expression
+    /// can be rewritten to
+    ///
+    /// ```sql
+    /// k >= '2024-01-01' AND k < '2025-01-01'
+    /// ```
+    ///
+    /// which is a simpler and a more canonical form, making it easier for other
+    /// optimizer passes to recognize and apply further transformations.
+    ///
+    /// # Examples
+    ///
+    /// Case 1:
+    ///
+    /// Original:
+    /// ```sql
+    /// date_part('YEAR', k) = 2024 AND k >= '2024-06-01'
+    /// ```
+    ///
+    /// After preimage rewrite:
+    /// ```sql
+    /// k >= '2024-01-01' AND k < '2025-01-01' AND k >= '2024-06-01'
+    /// ```
+    ///
+    /// Since this form is much simpler, the optimizer can combine and simplify
+    /// sub-expressions further into:
+    /// ```sql
+    /// k >= '2024-06-01' AND k < '2025-01-01'
+    /// ```
+    ///
+    /// Case 2:
+    ///
+    /// For min/max pruning, simpler predicates such as:
+    /// ```sql
+    /// k >= '2024-01-01' AND k < '2025-01-01'
+    /// ```
+    /// are much easier for the pruner to reason about. See [PruningPredicate]
+    /// for the backgrounds of predicate pruning.
+    ///
+    /// The trade-off with the preimage rewrite is that evaluating the rewritten
+    /// form might be slightly more expensive than evaluating the original
+    /// expression. In practice, this cost is usually outweighed by the more
+    /// aggressive optimization opportunities it enables.
+    ///
+    /// # Half-Open Intervals
+    ///
+    /// The preimage API uses half-open intervals, which makes the rewrite
+    /// easier to implement by avoiding calculations to adjust the upper bound.
+    /// For example, if a function returns its input unchanged and the desired
+    /// output is the single value `5`, a closed interval could be represented
+    /// as `[5, 5]`, but then the rewrite would require adjusting the upper
+    /// bound to `6` to create a proper range predicate. With a half-open
+    /// interval, the same range is represented as `[5, 6)`, which already
+    /// forms a valid predicate.
+    ///
+    /// [PruningPredicate]: https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/struct.PruningPredicate.html
+    /// [ClickHouse Paper]:  https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf
+    /// [image]: https://en.wikipedia.org/wiki/Image_(mathematics)#Image_of_an_element
+    /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image
+    fn preimage(
+        &self,
+        _args: &[Expr],
+        _lit_expr: &Expr,
+        _info: &SimplifyContext,
+    ) -> Result<PreimageResult> {
+        Ok(PreimageResult::None)
+    }
+
     /// Returns true if some of this `exprs` subexpressions may not be evaluated
     /// and thus any side effects (like divide by zero) may not be encountered.
     ///
     /// Setting this to true prevents certain optimizations such as common
     /// subexpression elimination
+    ///
+    /// When overriding this function to return `true`, [ScalarUDFImpl::conditional_arguments] can also be
+    /// overridden to report more accurately which arguments are eagerly evaluated and which ones
+    /// lazily.
     fn short_circuits(&self) -> bool {
         false
     }
 
+    /// Determines which of the arguments passed to this function are evaluated eagerly
+    /// and which may be evaluated lazily.
+    ///
+    /// If this function returns `None`, all arguments are eagerly evaluated.
+    /// Returning `None` is a micro optimization that saves a needless `Vec`
+    /// allocation.
+    ///
+    /// If the function returns `Some`, returns (`eager`, `lazy`) where `eager`
+    /// are the arguments that are always evaluated, and `lazy` are the
+    /// arguments that may be evaluated lazily (i.e. may not be evaluated at all
+    /// in some cases).
+    ///
+    /// Implementations must ensure that the two returned `Vec`s are disjunct,
+    /// and that each argument from `args` is present in one the two `Vec`s.
+    ///
+    /// When overriding this function, [ScalarUDFImpl::short_circuits] must
+    /// be overridden to return `true`.
+    fn conditional_arguments<'a>(
+        &self,
+        args: &'a [Expr],
+    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
+        if self.short_circuits() {
+            Some((vec![], args.iter().collect()))
+        } else {
+            None
+        }
+    }
+
     /// Computes the output [`Interval`] for a [`ScalarUDFImpl`], given the input
     /// intervals.
     ///
@@ -687,33 +965,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
         not_impl_err!("Function {} does not implement coerce_types", self.name())
     }
 
-    /// Return true if this scalar UDF is equal to the other.
-    ///
-    /// Allows customizing the equality of scalar UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]:
-    ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
-    ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
-    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        self.name() == other.name() && self.signature() == other.signature()
-    }
-
-    /// Returns a hash value for this scalar UDF.
-    ///
-    /// Allows customizing the hash code of scalar UDFs. Similarly to [`Hash`] and [`Eq`],
-    /// if [`Self::equals`] returns true for two UDFs, their `hash_value`s must be the same.
-    ///
-    /// By default, hashes [`Self::name`] and [`Self::signature`].
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.name().hash(hasher);
-        self.signature().hash(hasher);
-        hasher.finish()
-    }
-
     /// Returns the documentation for this Scalar UDF.
     ///
     /// Documentation can be accessed programmatically as well as generating
@@ -721,13 +972,27 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
     fn documentation(&self) -> Option<&Documentation> {
         None
     }
+
+    /// Returns placement information for this function.
+    ///
+    /// This is used by optimizers to make decisions about expression placement,
+    /// such as whether to push expressions down through projections.
+    ///
+    /// The default implementation returns [`ExpressionPlacement::KeepInPlace`],
+    /// meaning the expression should be kept where it is in the plan.
+    ///
+    /// Override this method to indicate that the function can be pushed down
+    /// closer to the data source.
+    fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement {
+        ExpressionPlacement::KeepInPlace
+    }
 }
 
 /// ScalarUDF that adds an alias to the underlying function. It is better to
 /// implement [`ScalarUDFImpl`], which supports aliases, directly if possible.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct AliasedScalarUDFImpl {
-    inner: Arc<dyn ScalarUDFImpl>,
+    inner: UdfEq<Arc<dyn ScalarUDFImpl>>,
     aliases: Vec<String>,
 }
 
@@ -738,10 +1003,14 @@ impl AliasedScalarUDFImpl {
     ) -> Self {
         let mut aliases = inner.aliases().to_vec();
         aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
-        Self { inner, aliases }
+        Self {
+            inner: inner.into(),
+            aliases,
+        }
     }
 }
 
+#[warn(clippy::missing_trait_methods)] // Delegates, so it should implement every single trait method
 impl ScalarUDFImpl for AliasedScalarUDFImpl {
     fn as_any(&self) -> &dyn Any {
         self
@@ -752,6 +1021,7 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
     }
 
     fn display_name(&self, args: &[Expr]) -> Result<String> {
+        #[expect(deprecated)]
         self.inner.display_name(args)
     }
 
@@ -771,10 +1041,19 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
         self.inner.return_field_from_args(args)
     }
 
+    fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
+        #[expect(deprecated)]
+        self.inner.is_nullable(args, schema)
+    }
+
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         self.inner.invoke_with_args(args)
     }
 
+    fn with_updated_config(&self, _config: &ConfigOptions) -> Option<ScalarUDF> {
+        None
+    }
+
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
@@ -782,11 +1061,27 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
     fn simplify(
         &self,
         args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         self.inner.simplify(args, info)
     }
 
+    fn preimage(
+        &self,
+        args: &[Expr],
+        lit_expr: &Expr,
+        info: &SimplifyContext,
+    ) -> Result<PreimageResult> {
+        self.inner.preimage(args, lit_expr, info)
+    }
+
+    fn conditional_arguments<'a>(
+        &self,
+        args: &'a [Expr],
+    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
+        self.inner.conditional_arguments(args)
+    }
+
     fn short_circuits(&self) -> bool {
         self.inner.short_circuits()
     }
@@ -815,138 +1110,91 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
         self.inner.coerce_types(arg_types)
     }
 
-    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        if let Some(other) = other.as_any().downcast_ref::<AliasedScalarUDFImpl>() {
-            self.inner.equals(other.inner.as_ref()) && self.aliases == other.aliases
-        } else {
-            false
+    fn documentation(&self) -> Option<&Documentation> {
+        self.inner.documentation()
+    }
+
+    fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+        self.inner.placement(args)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_expr_common::signature::Volatility;
+    use std::hash::DefaultHasher;
+
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct TestScalarUDFImpl {
+        name: &'static str,
+        field: &'static str,
+        signature: Signature,
+    }
+    impl ScalarUDFImpl for TestScalarUDFImpl {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            self.name
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            unimplemented!()
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            unimplemented!()
         }
     }
 
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.inner.hash_value().hash(hasher);
-        self.aliases.hash(hasher);
-        hasher.finish()
+    // PartialEq and Hash must be consistent, and also PartialEq and PartialOrd
+    // must be consistent, so they are tested together.
+    #[test]
+    fn test_partial_eq_hash_and_partial_ord() {
+        // A parameterized function
+        let f = test_func("foo", "a");
+
+        // Same like `f`, different instance
+        let f2 = test_func("foo", "a");
+        assert_eq!(f, f2);
+        assert_eq!(hash(&f), hash(&f2));
+        assert_eq!(f.partial_cmp(&f2), Some(Ordering::Equal));
+
+        // Different parameter
+        let b = test_func("foo", "b");
+        assert_ne!(f, b);
+        assert_ne!(hash(&f), hash(&b)); // hash can collide for different values but does not collide in this test
+        assert_eq!(f.partial_cmp(&b), None);
+
+        // Different name
+        let o = test_func("other", "a");
+        assert_ne!(f, o);
+        assert_ne!(hash(&f), hash(&o)); // hash can collide for different values but does not collide in this test
+        assert_eq!(f.partial_cmp(&o), Some(Ordering::Less));
+
+        // Different name and parameter
+        assert_ne!(b, o);
+        assert_ne!(hash(&b), hash(&o)); // hash can collide for different values but does not collide in this test
+        assert_eq!(b.partial_cmp(&o), Some(Ordering::Less));
     }
 
-    fn documentation(&self) -> Option<&Documentation> {
-        self.inner.documentation()
+    fn test_func(name: &'static str, parameter: &'static str) -> ScalarUDF {
+        ScalarUDF::from(TestScalarUDFImpl {
+            name,
+            field: parameter,
+            signature: Signature::any(1, Volatility::Immutable),
+        })
     }
-}
 
-// Scalar UDF doc sections for use in public documentation
-pub mod scalar_doc_sections {
-    use crate::DocSection;
-
-    pub fn doc_sections() -> Vec<DocSection> {
-        vec![
-            DOC_SECTION_MATH,
-            DOC_SECTION_CONDITIONAL,
-            DOC_SECTION_STRING,
-            DOC_SECTION_BINARY_STRING,
-            DOC_SECTION_REGEX,
-            DOC_SECTION_DATETIME,
-            DOC_SECTION_ARRAY,
-            DOC_SECTION_STRUCT,
-            DOC_SECTION_MAP,
-            DOC_SECTION_HASHING,
-            DOC_SECTION_UNION,
-            DOC_SECTION_OTHER,
-        ]
-    }
-
-    pub const fn doc_sections_const() -> &'static [DocSection] {
-        &[
-            DOC_SECTION_MATH,
-            DOC_SECTION_CONDITIONAL,
-            DOC_SECTION_STRING,
-            DOC_SECTION_BINARY_STRING,
-            DOC_SECTION_REGEX,
-            DOC_SECTION_DATETIME,
-            DOC_SECTION_ARRAY,
-            DOC_SECTION_STRUCT,
-            DOC_SECTION_MAP,
-            DOC_SECTION_HASHING,
-            DOC_SECTION_UNION,
-            DOC_SECTION_OTHER,
-        ]
-    }
-
-    pub const DOC_SECTION_MATH: DocSection = DocSection {
-        include: true,
-        label: "Math Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection {
-        include: true,
-        label: "Conditional Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_STRING: DocSection = DocSection {
-        include: true,
-        label: "String Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection {
-        include: true,
-        label: "Binary String Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_REGEX: DocSection = DocSection {
-        include: true,
-        label: "Regular Expression Functions",
-        description: Some(
-            r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions)
-regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
-(minus support for several features including look-around and backreferences).
-The following regular expression functions are supported:"#,
-        ),
-    };
-
-    pub const DOC_SECTION_DATETIME: DocSection = DocSection {
-        include: true,
-        label: "Time and Date Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_ARRAY: DocSection = DocSection {
-        include: true,
-        label: "Array Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_STRUCT: DocSection = DocSection {
-        include: true,
-        label: "Struct Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_MAP: DocSection = DocSection {
-        include: true,
-        label: "Map Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_HASHING: DocSection = DocSection {
-        include: true,
-        label: "Hashing Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_OTHER: DocSection = DocSection {
-        include: true,
-        label: "Other Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_UNION: DocSection = DocSection {
-        include: true,
-        label: "Union Functions",
-        description: Some("Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. Note: Not related to the SQL UNION operator"),
-    };
+    fn hash<T: Hash>(value: &T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
 }
diff --git a/datafusion/expr/src/udf_eq.rs b/datafusion/expr/src/udf_eq.rs
new file mode 100644
index 0000000000000..30cfb1d831fde
--- /dev/null
+++ b/datafusion/expr/src/udf_eq.rs
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{AggregateUDFImpl, ScalarUDFImpl, WindowUDFImpl};
+use std::fmt::Debug;
+use std::hash::{DefaultHasher, Hash, Hasher};
+use std::ops::Deref;
+use std::sync::Arc;
+
+/// A wrapper around a pointer to UDF that implements `Eq` and `Hash` delegating to
+/// corresponding methods on the UDF trait.
+///
+/// If you want to just compare pointers for equality, use [`super::ptr_eq::PtrEq`].
+#[derive(Clone)]
+#[expect(private_bounds)] // This is so that UdfEq can only be used with allowed pointer types (e.g. Arc), without allowing misuse.
+pub struct UdfEq<Ptr: UdfPointer>(Ptr);
+
+impl<Ptr> PartialEq for UdfEq<Ptr>
+where
+    Ptr: UdfPointer,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.0.equals(&other.0)
+    }
+}
+impl<Ptr> Eq for UdfEq<Ptr> where Ptr: UdfPointer {}
+impl<Ptr> Hash for UdfEq<Ptr>
+where
+    Ptr: UdfPointer,
+{
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.hash_value().hash(state);
+    }
+}
+
+impl<Ptr> From<Ptr> for UdfEq<Ptr>
+where
+    Ptr: UdfPointer,
+{
+    fn from(ptr: Ptr) -> Self {
+        UdfEq(ptr)
+    }
+}
+
+impl<Ptr> Debug for UdfEq<Ptr>
+where
+    Ptr: UdfPointer + Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+impl<Ptr> Deref for UdfEq<Ptr>
+where
+    Ptr: UdfPointer,
+{
+    type Target = Ptr;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+trait UdfPointer: Deref {
+    fn equals(&self, other: &Self::Target) -> bool;
+    fn hash_value(&self) -> u64;
+}
+
+impl UdfPointer for Arc<dyn ScalarUDFImpl + '_> {
+    fn equals(&self, other: &(dyn ScalarUDFImpl + '_)) -> bool {
+        self.as_ref().dyn_eq(other.as_any())
+    }
+
+    fn hash_value(&self) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        self.as_ref().dyn_hash(hasher);
+        hasher.finish()
+    }
+}
+
+impl UdfPointer for Arc<dyn AggregateUDFImpl + '_> {
+    fn equals(&self, other: &(dyn AggregateUDFImpl + '_)) -> bool {
+        self.as_ref().dyn_eq(other.as_any())
+    }
+
+    fn hash_value(&self) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        self.as_ref().dyn_hash(hasher);
+        hasher.finish()
+    }
+}
+
+impl UdfPointer for Arc<dyn WindowUDFImpl + '_> {
+    fn equals(&self, other: &(dyn WindowUDFImpl + '_)) -> bool {
+        self.as_ref().dyn_eq(other.as_any())
+    }
+
+    fn hash_value(&self) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        self.as_ref().dyn_hash(hasher);
+        hasher.finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ScalarFunctionArgs;
+    use arrow::datatypes::DataType;
+    use datafusion_expr_common::columnar_value::ColumnarValue;
+    use datafusion_expr_common::signature::{Signature, Volatility};
+    use std::any::Any;
+    use std::hash::DefaultHasher;
+
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct TestScalarUDF {
+        signature: Signature,
+        name: &'static str,
+    }
+    impl ScalarUDFImpl for TestScalarUDF {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            self.name
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(
+            &self,
+            _arg_types: &[DataType],
+        ) -> datafusion_common::Result<DataType> {
+            unimplemented!()
+        }
+
+        fn invoke_with_args(
+            &self,
+            _args: ScalarFunctionArgs,
+        ) -> datafusion_common::Result<ColumnarValue> {
+            unimplemented!()
+        }
+    }
+
+    #[test]
+    pub fn test_eq_eq_wrapper() {
+        let signature = Signature::any(1, Volatility::Immutable);
+
+        let a1: Arc<dyn ScalarUDFImpl> = Arc::new(TestScalarUDF {
+            signature: signature.clone(),
+            name: "a",
+        });
+        let a2: Arc<dyn ScalarUDFImpl> = Arc::new(TestScalarUDF {
+            signature: signature.clone(),
+            name: "a",
+        });
+        let b: Arc<dyn ScalarUDFImpl> = Arc::new(TestScalarUDF {
+            signature: signature.clone(),
+            name: "b",
+        });
+
+        // Reflexivity
+        let wrapper = UdfEq(Arc::clone(&a1));
+        assert_eq!(wrapper, wrapper);
+
+        // Two wrappers around equal pointer
+        assert_eq!(UdfEq(Arc::clone(&a1)), UdfEq(Arc::clone(&a1)));
+        assert_eq!(hash(UdfEq(Arc::clone(&a1))), hash(UdfEq(Arc::clone(&a1))));
+
+        // Two wrappers around different pointers but equal in ScalarUDFImpl::equals sense
+        assert_eq!(UdfEq(Arc::clone(&a1)), UdfEq(Arc::clone(&a2)));
+        assert_eq!(hash(UdfEq(Arc::clone(&a1))), hash(UdfEq(Arc::clone(&a2))));
+
+        // different functions (not equal)
+        assert_ne!(UdfEq(Arc::clone(&a1)), UdfEq(Arc::clone(&b)));
+    }
+
+    fn hash<T: Hash>(value: T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
+}
diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs
index c0187735d6025..7f86a69f8712c 100644
--- a/datafusion/expr/src/udwf.rs
+++ b/datafusion/expr/src/udwf.rs
@@ -19,7 +19,7 @@
 
 use arrow::compute::SortOptions;
 use std::cmp::Ordering;
-use std::hash::{DefaultHasher, Hash, Hasher};
+use std::hash::{Hash, Hasher};
 use std::{
     any::Any,
     fmt::{self, Debug, Display, Formatter},
@@ -29,11 +29,13 @@ use std::{
 use arrow::datatypes::{DataType, FieldRef};
 
 use crate::expr::WindowFunction;
+use crate::udf_eq::UdfEq;
 use crate::{
-    function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature,
+    Expr, PartitionEvaluator, Signature, function::WindowFunctionSimplification,
 };
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_doc::Documentation;
+use datafusion_expr_common::dyn_eq::{DynEq, DynHash};
 use datafusion_functions_window_common::expr::ExpressionArgs;
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
@@ -64,8 +66,8 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 ///
 /// [`PartitionEvaluator`]: crate::PartitionEvaluator
 /// [`create_udwf`]: crate::expr_fn::create_udwf
-/// [`simple_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udwf.rs
-/// [`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udwf.rs
+/// [`simple_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udwf.rs
+/// [`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs
 #[derive(Debug, Clone, PartialOrd)]
 pub struct WindowUDF {
     inner: Arc<dyn WindowUDFImpl>,
@@ -80,7 +82,7 @@ impl Display for WindowUDF {
 
 impl PartialEq for WindowUDF {
     fn eq(&self, other: &Self) -> bool {
-        self.inner.equals(other.inner.as_ref())
+        self.inner.dyn_eq(other.inner.as_any())
     }
 }
 
@@ -88,7 +90,7 @@ impl Eq for WindowUDF {}
 
 impl Hash for WindowUDF {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        self.inner.hash_value().hash(state)
+        self.inner.dyn_hash(state)
     }
 }
 
@@ -133,7 +135,7 @@ impl WindowUDF {
     pub fn call(&self, args: Vec<Expr>) -> Expr {
         let fun = crate::WindowFunctionDefinition::WindowUDF(Arc::new(self.clone()));
 
-        Expr::WindowFunction(WindowFunction::new(fun, args))
+        Expr::from(WindowFunction::new(fun, args))
     }
 
     /// Returns this function's name
@@ -155,7 +157,7 @@ impl WindowUDF {
         self.inner.signature()
     }
 
-    /// Do the function rewrite
+    /// Returns this window function's simplification hook, if any.
     ///
     /// See [`WindowUDFImpl::simplify`] for more details.
     pub fn simplify(&self) -> Option<WindowFunctionSimplification> {
@@ -227,24 +229,30 @@ where
 /// This trait exposes the full API for implementing user defined window functions and
 /// can be used to implement any function.
 ///
+/// While the trait depends on [`DynEq`] and [`DynHash`] traits, these should not be
+/// implemented directly. Instead, implement [`Eq`] and [`Hash`] and leverage the
+/// blanket implementations of [`DynEq`] and [`DynHash`].
+///
 /// See [`advanced_udwf.rs`] for a full example with complete implementation and
 /// [`WindowUDF`] for other available options.
 ///
 ///
-/// [`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udwf.rs
+/// [`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs
 /// # Basic Example
 /// ```
 /// # use std::any::Any;
 /// # use std::sync::LazyLock;
 /// # use arrow::datatypes::{DataType, Field, FieldRef};
 /// # use datafusion_common::{DataFusionError, plan_err, Result};
-/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation};
+/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation, LimitEffect};
 /// # use datafusion_expr::{WindowUDFImpl, WindowUDF};
 /// # use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 /// # use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 /// # use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL;
+/// # use datafusion_physical_expr_common::physical_expr;
+/// # use std::sync::Arc;
 ///
-/// #[derive(Debug, Clone)]
+/// #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 /// struct SmoothIt {
 ///   signature: Signature,
 /// }
@@ -289,6 +297,9 @@ where
 ///    fn documentation(&self) -> Option<&Documentation> {
 ///      Some(get_doc())
 ///    }
+///     fn limit_effect(&self, _args: &[Arc<dyn physical_expr::PhysicalExpr>]) -> LimitEffect {
+///         LimitEffect::Unknown
+///     }
 /// }
 ///
 /// // Create a new WindowUDF from the implementation
@@ -303,16 +314,21 @@ where
 ///     .build()
 ///     .unwrap();
 /// ```
-pub trait WindowUDFImpl: Debug + Send + Sync {
-    // Note: When adding any methods (with default implementations), remember to add them also
-    // into the AliasedWindowUDFImpl below!
-
+pub trait WindowUDFImpl: Debug + DynEq + DynHash + Send + Sync {
     /// Returns this object as an [`Any`] trait object
     fn as_any(&self) -> &dyn Any;
 
     /// Returns this function's name
     fn name(&self) -> &str;
 
+    /// Returns any aliases (alternate names) for this function.
+    ///
+    /// Note: `aliases` should only include names other than [`Self::name`].
+    /// Defaults to `[]` (no aliases)
+    fn aliases(&self) -> &[String] {
+        &[]
+    }
+
     /// Returns the function's [`Signature`] for information about what input
     /// types are accepted and the function's Volatility.
     fn signature(&self) -> &Signature;
@@ -328,62 +344,37 @@ pub trait WindowUDFImpl: Debug + Send + Sync {
         partition_evaluator_args: PartitionEvaluatorArgs,
     ) -> Result<Box<dyn PartitionEvaluator>>;
 
-    /// Returns any aliases (alternate names) for this function.
-    ///
-    /// Note: `aliases` should only include names other than [`Self::name`].
-    /// Defaults to `[]` (no aliases)
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
-    /// Optionally apply per-UDWF simplification / rewrite rules.
+    /// Returns an optional hook for simplifying this user-defined window
+    /// function.
     ///
-    /// This can be used to apply function specific simplification rules during
-    /// optimization. The default implementation does nothing.
+    /// Use this hook to apply function-specific rewrites during optimization.
+    /// The default implementation returns `None`.
     ///
-    /// Note that DataFusion handles simplifying arguments and  "constant
-    /// folding" (replacing a function call with constant arguments such as
-    /// `my_add(1,2) --> 3` ). Thus, there is no need to implement such
-    /// optimizations manually for specific UDFs.
+    /// DataFusion already simplifies arguments and performs constant folding
+    /// (for example, `my_add(1, 2) -> 3`), so there is usually no need to
+    /// implement those optimizations manually for specific UDFs.
     ///
     /// Example:
-    /// [`advanced_udwf.rs`]: <https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/advanced_udwf.rs>
+    /// `advanced_udwf.rs`: <https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs>
     ///
     /// # Returns
-    /// [None] if simplify is not defined or,
-    ///
-    /// Or, a closure with two arguments:
-    /// * 'window_function': [crate::expr::WindowFunction] for which simplified has been invoked
-    /// * 'info': [crate::simplify::SimplifyInfo]
-    fn simplify(&self) -> Option<WindowFunctionSimplification> {
-        None
-    }
-
-    /// Return true if this window UDF is equal to the other.
-    ///
-    /// Allows customizing the equality of window UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]:
+    /// `None` if simplify is not defined.
     ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
+    /// Or, a closure ([`WindowFunctionSimplification`]) invoked with:
+    /// * `window_function`: [WindowFunction] with already simplified
+    ///   arguments
+    /// * `info`: [crate::simplify::SimplifyContext]
     ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
-    fn equals(&self, other: &dyn WindowUDFImpl) -> bool {
-        self.name() == other.name() && self.signature() == other.signature()
-    }
-
-    /// Returns a hash value for this window UDF.
+    /// The closure returns a simplified [Expr] or an error.
     ///
-    /// Allows customizing the hash code of window UDFs. Similarly to [`Hash`] and [`Eq`],
-    /// if [`Self::equals`] returns true for two UDFs, their `hash_value`s must be the same.
-    ///
-    /// By default, hashes [`Self::name`] and [`Self::signature`].
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.name().hash(hasher);
-        self.signature().hash(hasher);
-        hasher.finish()
+    /// # Notes
+    /// The returned expression must have the same schema as the original
+    /// expression, including both the data type and nullability. For example,
+    /// if the original expression is nullable, the returned expression must
+    /// also be nullable, otherwise it may lead to schema verification errors
+    /// later in query planning.
+    fn simplify(&self) -> Option<WindowFunctionSimplification> {
+        None
     }
 
     /// The [`FieldRef`] of the final result of evaluating this window function.
@@ -438,6 +429,23 @@ pub trait WindowUDFImpl: Debug + Send + Sync {
     fn documentation(&self) -> Option<&Documentation> {
         None
     }
+
+    /// If not causal, returns the effect this function will have on the window
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
+}
+
+/// the effect this function will have on the limit pushdown
+pub enum LimitEffect {
+    /// Does not affect the limit (i.e. this is causal)
+    None,
+    /// Either undeclared, or dynamic (only evaluatable at run time)
+    Unknown,
+    /// Grow the limit by N rows
+    Relative(usize),
+    /// Limit needs to be at least N rows
+    Absolute(usize),
 }
 
 pub enum ReversedUDWF {
@@ -454,7 +462,7 @@ pub enum ReversedUDWF {
 
 impl PartialEq for dyn WindowUDFImpl {
     fn eq(&self, other: &Self) -> bool {
-        self.equals(other)
+        self.dyn_eq(other.as_any())
     }
 }
 
@@ -464,14 +472,16 @@ impl PartialOrd for dyn WindowUDFImpl {
             Some(Ordering::Equal) => self.signature().partial_cmp(other.signature()),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
 /// WindowUDF that adds an alias to the underlying function. It is better to
 /// implement [`WindowUDFImpl`], which supports aliases, directly if possible.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct AliasedWindowUDFImpl {
-    inner: Arc<dyn WindowUDFImpl>,
+    inner: UdfEq<Arc<dyn WindowUDFImpl>>,
     aliases: Vec<String>,
 }
 
@@ -483,10 +493,14 @@ impl AliasedWindowUDFImpl {
         let mut aliases = inner.aliases().to_vec();
         aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
 
-        Self { inner, aliases }
+        Self {
+            inner: inner.into(),
+            aliases,
+        }
     }
 }
 
+#[warn(clippy::missing_trait_methods)] // Delegates, so it should implement every single trait method
 impl WindowUDFImpl for AliasedWindowUDFImpl {
     fn as_any(&self) -> &dyn Any {
         self
@@ -522,21 +536,6 @@ impl WindowUDFImpl for AliasedWindowUDFImpl {
         self.inner.simplify()
     }
 
-    fn equals(&self, other: &dyn WindowUDFImpl) -> bool {
-        if let Some(other) = other.as_any().downcast_ref::<AliasedWindowUDFImpl>() {
-            self.inner.equals(other.inner.as_ref()) && self.aliases == other.aliases
-        } else {
-            false
-        }
-    }
-
-    fn hash_value(&self) -> u64 {
-        let hasher = &mut DefaultHasher::new();
-        self.inner.hash_value().hash(hasher);
-        self.aliases.hash(hasher);
-        hasher.finish()
-    }
-
     fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
         self.inner.field(field_args)
     }
@@ -549,54 +548,34 @@ impl WindowUDFImpl for AliasedWindowUDFImpl {
         self.inner.coerce_types(arg_types)
     }
 
+    fn reverse_expr(&self) -> ReversedUDWF {
+        self.inner.reverse_expr()
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.inner.documentation()
     }
-}
 
-// Window UDF doc sections for use in public documentation
-pub mod window_doc_sections {
-    use datafusion_doc::DocSection;
-
-    pub fn doc_sections() -> Vec<DocSection> {
-        vec![
-            DOC_SECTION_AGGREGATE,
-            DOC_SECTION_RANKING,
-            DOC_SECTION_ANALYTICAL,
-        ]
-    }
-
-    pub const DOC_SECTION_AGGREGATE: DocSection = DocSection {
-        include: true,
-        label: "Aggregate Functions",
-        description: Some("All aggregate functions can be used as window functions."),
-    };
-
-    pub const DOC_SECTION_RANKING: DocSection = DocSection {
-        include: true,
-        label: "Ranking Functions",
-        description: None,
-    };
-
-    pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection {
-        include: true,
-        label: "Analytical Functions",
-        description: None,
-    };
+    fn limit_effect(&self, args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        self.inner.limit_effect(args)
+    }
 }
 
 #[cfg(test)]
 mod test {
-    use crate::{PartitionEvaluator, WindowUDF, WindowUDFImpl};
+    use crate::{LimitEffect, PartitionEvaluator, WindowUDF, WindowUDFImpl};
     use arrow::datatypes::{DataType, FieldRef};
     use datafusion_common::Result;
     use datafusion_expr_common::signature::{Signature, Volatility};
     use datafusion_functions_window_common::field::WindowUDFFieldArgs;
     use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
     use std::any::Any;
     use std::cmp::Ordering;
+    use std::hash::{DefaultHasher, Hash, Hasher};
+    use std::sync::Arc;
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct AWindowUDF {
         signature: Signature,
     }
@@ -633,9 +612,13 @@ mod test {
         fn field(&self, _field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
             unimplemented!()
         }
+
+        fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+            LimitEffect::Unknown
+        }
     }
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct BWindowUDF {
         signature: Signature,
     }
@@ -672,6 +655,20 @@ mod test {
         fn field(&self, _field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
             unimplemented!()
         }
+
+        fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+            LimitEffect::Unknown
+        }
+    }
+
+    #[test]
+    fn test_partial_eq() {
+        let a1 = WindowUDF::from(AWindowUDF::new());
+        let a2 = WindowUDF::from(AWindowUDF::new());
+        let eq = a1 == a2;
+        assert!(eq);
+        assert_eq!(a1, a2);
+        assert_eq!(hash(a1), hash(a2));
     }
 
     #[test]
@@ -684,4 +681,10 @@ mod test {
         assert!(a1 < b1);
         assert!(!(a1 == b1));
     }
+
+    fn hash<T: Hash>(value: T) -> u64 {
+        let hasher = &mut DefaultHasher::new();
+        value.hash(hasher);
+        hasher.finish()
+    }
 }
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 552ce1502d466..81a6fd393a989 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -21,10 +21,10 @@ use std::cmp::Ordering;
 use std::collections::{BTreeSet, HashSet};
 use std::sync::Arc;
 
-use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction, WindowFunctionParams};
+use crate::expr::{Alias, Sort, WildcardOptions, WindowFunctionParams};
 use crate::expr_rewriter::strip_outer_reference;
 use crate::{
-    and, BinaryExpr, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator,
+    BinaryExpr, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator, and,
 };
 use datafusion_expr_common::signature::{Signature, TypeSignature};
 
@@ -34,11 +34,14 @@ use datafusion_common::tree_node::{
 };
 use datafusion_common::utils::get_at_indices;
 use datafusion_common::{
-    internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, HashMap,
-    Result, TableReference,
+    Column, DFSchema, DFSchemaRef, HashMap, Result, TableReference, internal_err,
+    plan_err,
 };
 
+#[cfg(not(feature = "sql"))]
+use crate::expr::{ExceptSelectItem, ExcludeSelectItem};
 use indexmap::IndexSet;
+#[cfg(feature = "sql")]
 use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem};
 
 pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
@@ -63,6 +66,23 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
     }
 }
 
+/// Internal helper that generates indices for powerset subsets using bitset iteration.
+/// Returns an iterator of index vectors, where each vector contains the indices
+/// of elements to include in that subset.
+fn powerset_indices(len: usize) -> impl Iterator<Item = Vec<usize>> {
+    (0..(1 << len)).map(move |mask| {
+        let mut indices = vec![];
+        let mut bitset = mask;
+        while bitset > 0 {
+            let rightmost: u64 = bitset & !(bitset - 1);
+            let idx = rightmost.trailing_zeros() as usize;
+            indices.push(idx);
+            bitset &= bitset - 1;
+        }
+        indices
+    })
+}
+
 /// The [power set] (or powerset) of a set S is the set of all subsets of S, \
 /// including the empty set and S itself.
 ///
@@ -80,33 +100,23 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
 ///  and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y, z}, {x, y, z}}.
 ///
 /// [power set]: https://en.wikipedia.org/wiki/Power_set
-fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>, String> {
+pub fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>> {
     if slice.len() >= 64 {
-        return Err("The size of the set must be less than 64.".into());
+        return plan_err!("The size of the set must be less than 64");
     }
 
-    let mut v = Vec::new();
-    for mask in 0..(1 << slice.len()) {
-        let mut ss = vec![];
-        let mut bitset = mask;
-        while bitset > 0 {
-            let rightmost: u64 = bitset & !(bitset - 1);
-            let idx = rightmost.trailing_zeros();
-            let item = slice.get(idx as usize).unwrap();
-            ss.push(item);
-            // zero the trailing bit
-            bitset &= bitset - 1;
-        }
-        v.push(ss);
-    }
-    Ok(v)
+    Ok(powerset_indices(slice.len())
+        .map(|indices| indices.iter().map(|&idx| &slice[idx]).collect())
+        .collect())
 }
 
 /// check the number of expressions contained in the grouping_set
 fn check_grouping_set_size_limit(size: usize) -> Result<()> {
     let max_grouping_set_size = 65535;
     if size > max_grouping_set_size {
-        return plan_err!("The number of group_expression in grouping_set exceeds the maximum limit {max_grouping_set_size}, found {size}");
+        return plan_err!(
+            "The number of group_expression in grouping_set exceeds the maximum limit {max_grouping_set_size}, found {size}"
+        );
     }
 
     Ok(())
@@ -116,7 +126,9 @@ fn check_grouping_set_size_limit(size: usize) -> Result<()> {
 fn check_grouping_sets_size_limit(size: usize) -> Result<()> {
     let max_grouping_sets_size = 4096;
     if size > max_grouping_sets_size {
-        return plan_err!("The number of grouping_set in grouping_sets exceeds the maximum limit {max_grouping_sets_size}, found {size}");
+        return plan_err!(
+            "The number of grouping_set in grouping_sets exceeds the maximum limit {max_grouping_sets_size}, found {size}"
+        );
     }
 
     Ok(())
@@ -204,8 +216,7 @@ pub fn enumerate_grouping_sets(group_expr: Vec<Expr>) -> Result<Vec<Expr>> {
                     grouping_sets.iter().map(|e| e.iter().collect()).collect()
                 }
                 Expr::GroupingSet(GroupingSet::Cube(group_exprs)) => {
-                    let grouping_sets = powerset(group_exprs)
-                        .map_err(|e| plan_datafusion_err!("{}", e))?;
+                    let grouping_sets = powerset(group_exprs)?;
                     check_grouping_sets_size_limit(grouping_sets.len())?;
                     grouping_sets
                 }
@@ -276,7 +287,7 @@ pub fn expr_to_columns(expr: &Expr, accum: &mut HashSet<Column>) -> Result<()> {
             Expr::Unnest(_)
             | Expr::ScalarVariable(_, _)
             | Expr::Alias(_)
-            | Expr::Literal(_)
+            | Expr::Literal(_, _)
             | Expr::BinaryExpr { .. }
             | Expr::Like { .. }
             | Expr::SimilarTo { .. }
@@ -301,6 +312,7 @@ pub fn expr_to_columns(expr: &Expr, accum: &mut HashSet<Column>) -> Result<()> {
             | Expr::InList { .. }
             | Expr::Exists { .. }
             | Expr::InSubquery(_)
+            | Expr::SetComparison(_)
             | Expr::ScalarSubquery(_)
             | Expr::Wildcard { .. }
             | Expr::Placeholder(_)
@@ -351,7 +363,7 @@ fn get_excluded_columns(
 /// Returns all `Expr`s in the schema, except the `Column`s in the `columns_to_skip`
 fn get_exprs_except_skipped(
     schema: &DFSchema,
-    columns_to_skip: HashSet<Column>,
+    columns_to_skip: &HashSet<Column>,
 ) -> Vec<Expr> {
     if columns_to_skip.is_empty() {
         schema.iter().map(Expr::from).collect::<Vec<Expr>>()
@@ -416,7 +428,7 @@ pub fn expand_wildcard(
     };
     // Add each excluded `Column` to columns_to_skip
     columns_to_skip.extend(excluded_columns);
-    Ok(get_exprs_except_skipped(schema, columns_to_skip))
+    Ok(get_exprs_except_skipped(schema, &columns_to_skip))
 }
 
 /// Resolves an `Expr::Wildcard` to a collection of qualified `Expr::Column`'s.
@@ -461,7 +473,7 @@ pub fn expand_qualified_wildcard(
     columns_to_skip.extend(excluded_columns);
     Ok(get_exprs_except_skipped(
         &qualified_dfschema,
-        columns_to_skip,
+        &columns_to_skip,
     ))
 }
 
@@ -579,7 +591,8 @@ pub fn group_window_expr_by_sort_keys(
 ) -> Result<Vec<(WindowSortKey, Vec<Expr>)>> {
     let mut result = vec![];
     window_expr.into_iter().try_for_each(|expr| match &expr {
-        Expr::WindowFunction( WindowFunction{ params: WindowFunctionParams { partition_by, order_by, ..}, .. }) => {
+        Expr::WindowFunction(window_fun) => {
+            let WindowFunctionParams{ partition_by, order_by, ..} = &window_fun.as_ref().params;
             let sort_key = generate_sort_key(partition_by, order_by)?;
             if let Some((_, values)) = result.iter_mut().find(
                 |group: &&mut (WindowSortKey, Vec<Expr>)| matches!(group, (key, _) if *key == sort_key),
@@ -608,7 +621,7 @@ pub fn find_aggregate_exprs<'a>(exprs: impl IntoIterator<Item = &'a Expr>) -> Ve
 
 /// Collect all deeply nested `Expr::WindowFunction`. They are returned in order of occurrence
 /// (depth first), with duplicates omitted.
-pub fn find_window_exprs(exprs: &[Expr]) -> Vec<Expr> {
+pub fn find_window_exprs<'a>(exprs: impl IntoIterator<Item = &'a Expr>) -> Vec<Expr> {
     find_exprs_in_exprs(exprs, &|nested_expr| {
         matches!(nested_expr, Expr::WindowFunction { .. })
     })
@@ -689,7 +702,23 @@ where
     err
 }
 
-/// Create field meta-data from an expression, for use in a result set schema
+/// Create schema fields from an expression list, for use in result set schema construction
+///
+/// This function converts a list of expressions into a list of complete schema fields,
+/// making comprehensive determinations about each field's properties including:
+/// - **Data type**: Resolved based on expression type and input schema context
+/// - **Nullability**: Determined by expression-specific nullability rules
+/// - **Metadata**: Computed based on expression type (preserving, merging, or generating new metadata)
+/// - **Table reference scoping**: Establishing proper qualified field references
+///
+/// Each expression is converted to a field by calling [`Expr::to_field`], which performs
+/// the complete field resolution process for all field properties.
+///
+/// # Returns
+///
+/// A `Result` containing a vector of `(Option<TableReference>, Arc<Field>)` tuples,
+/// where each Field contains complete schema information (type, nullability, metadata)
+/// and proper table reference scoping for the corresponding expression.
 pub fn exprlist_to_fields<'a>(
     exprs: impl IntoIterator<Item = &'a Expr>,
     plan: &LogicalPlan,
@@ -784,7 +813,7 @@ pub(crate) fn find_column_indexes_referenced_by_expr(
                     indexes.push(idx);
                 }
             }
-            Expr::Literal(_) => {
+            Expr::Literal(_, _) => {
                 indexes.push(usize::MAX);
             }
             _ => {}
@@ -813,6 +842,8 @@ pub fn can_hash(data_type: &DataType) -> bool {
         DataType::Float16 => true,
         DataType::Float32 => true,
         DataType::Float64 => true,
+        DataType::Decimal32(_, _) => true,
+        DataType::Decimal64(_, _) => true,
         DataType::Decimal128(_, _) => true,
         DataType::Decimal256(_, _) => true,
         DataType::Timestamp(_, _) => true,
@@ -868,7 +899,6 @@ pub fn check_all_columns_from_schema(
 ///    all referenced column of the right side is from the right schema.
 /// 2. Or opposite. All referenced column of the left side is from the right schema,
 ///    and the right side is from the left schema.
-///
 pub fn find_valid_equijoin_key_pair(
     left_key: &Expr,
     right_key: &Expr,
@@ -907,6 +937,8 @@ pub fn find_valid_equijoin_key_pair(
 ///     round(Float64)
 ///     round(Float32)
 /// ```
+#[expect(clippy::needless_pass_by_value)]
+#[deprecated(since = "53.0.0", note = "Internal function")]
 pub fn generate_signature_error_msg(
     func_name: &str,
     func_signature: Signature,
@@ -914,16 +946,38 @@ pub fn generate_signature_error_msg(
 ) -> String {
     let candidate_signatures = func_signature
         .type_signature
-        .to_string_repr()
+        .to_string_repr_with_names(func_signature.parameter_names.as_deref())
         .iter()
         .map(|args_str| format!("\t{func_name}({args_str})"))
         .collect::<Vec<String>>()
         .join("\n");
 
     format!(
-            "No function matches the given name and argument types '{}({})'. You might need to add explicit type casts.\n\tCandidate functions:\n{}",
-            func_name, TypeSignature::join_types(input_expr_types, ", "), candidate_signatures
-        )
+        "No function matches the given name and argument types '{}({})'. You might need to add explicit type casts.\n\tCandidate functions:\n{}",
+        func_name,
+        TypeSignature::join_types(input_expr_types, ", "),
+        candidate_signatures
+    )
+}
+
+/// Creates a detailed error message for a function with wrong signature.
+///
+/// For example, a query like `select round(3.14, 1.1);` would yield:
+/// ```text
+/// Error during planning: No function matches 'round(Float64, Float64)'. You might need to add explicit type casts.
+///     Candidate functions:
+///     round(Float64, Int64)
+///     round(Float32, Int64)
+///     round(Float64)
+///     round(Float32)
+/// ```
+pub(crate) fn generate_signature_error_message(
+    func_name: &str,
+    func_signature: &Signature,
+    input_expr_types: &[DataType],
+) -> String {
+    #[expect(deprecated)]
+    generate_signature_error_msg(func_name, func_signature.clone(), input_expr_types)
 }
 
 /// Splits a conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]`
@@ -1012,10 +1066,7 @@ pub fn iter_conjunction_owned(expr: Expr) -> impl Iterator<Item = Expr> {
 /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2)));
 ///
 /// // [a=1, b=2]
-/// let split = vec![
-///   col("a").eq(lit(1)),
-///   col("b").eq(lit(2)),
-/// ];
+/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))];
 ///
 /// // use split_conjunction_owned to split them
 /// assert_eq!(split_conjunction_owned(expr), split);
@@ -1038,10 +1089,7 @@ pub fn split_conjunction_owned(expr: Expr) -> Vec<Expr> {
 /// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2)));
 ///
 /// // [a=1, b=2]
-/// let split = vec![
-///   col("a").eq(lit(1)),
-///   col("b").eq(lit(2)),
-/// ];
+/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))];
 ///
 /// // use split_binary_owned to split them
 /// assert_eq!(split_binary_owned(expr, Operator::Plus), split);
@@ -1109,10 +1157,7 @@ fn split_binary_impl<'a>(
 /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2)));
 ///
 /// // [a=1, b=2]
-/// let split = vec![
-///   col("a").eq(lit(1)),
-///   col("b").eq(lit(2)),
-/// ];
+/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))];
 ///
 /// // use conjunction to join them together with `AND`
 /// assert_eq!(conjunction(split), Some(expr));
@@ -1135,10 +1180,7 @@ pub fn conjunction(filters: impl IntoIterator<Item = Expr>) -> Option<Expr> {
 /// let expr = col("a").eq(lit(1)).or(col("b").eq(lit(2)));
 ///
 /// // [a=1, b=2]
-/// let split = vec![
-///   col("a").eq(lit(1)),
-///   col("b").eq(lit(2)),
-/// ];
+/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))];
 ///
 /// // use disjunction to join them together with `OR`
 /// assert_eq!(disjunction(split), Some(expr));
@@ -1222,6 +1264,9 @@ pub fn only_or_err<T>(slice: &[T]) -> Result<&T> {
 }
 
 /// merge inputs schema into a single schema.
+///
+/// This function merges schemas from multiple logical plan inputs using [`DFSchema::merge`].
+/// Refer to that documentation for details on precedence and metadata handling.
 pub fn merge_schema(inputs: &[&LogicalPlan]) -> DFSchema {
     if inputs.len() == 1 {
         inputs[0].schema().as_ref().clone()
@@ -1263,11 +1308,13 @@ pub fn collect_subquery_cols(
 mod tests {
     use super::*;
     use crate::{
-        col, cube, expr_vec_fmt, grouping_set, lit, rollup,
-        test::function_stub::max_udaf, test::function_stub::min_udaf,
-        test::function_stub::sum_udaf, Cast, ExprFunctionExt, WindowFunctionDefinition,
+        Cast, ExprFunctionExt, WindowFunctionDefinition, col, cube,
+        expr::WindowFunction,
+        expr_vec_fmt, grouping_set, lit, rollup,
+        test::function_stub::{max_udaf, min_udaf, sum_udaf},
     };
     use arrow::datatypes::{UnionFields, UnionMode};
+    use datafusion_expr_common::signature::{TypeSignature, Volatility};
 
     #[test]
     fn test_group_window_expr_by_sort_keys_empty_case() -> Result<()> {
@@ -1279,19 +1326,19 @@ mod tests {
 
     #[test]
     fn test_group_window_expr_by_sort_keys_empty_window() -> Result<()> {
-        let max1 = Expr::WindowFunction(WindowFunction::new(
+        let max1 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("name")],
         ));
-        let max2 = Expr::WindowFunction(WindowFunction::new(
+        let max2 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("name")],
         ));
-        let min3 = Expr::WindowFunction(WindowFunction::new(
+        let min3 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(min_udaf()),
             vec![col("name")],
         ));
-        let sum4 = Expr::WindowFunction(WindowFunction::new(
+        let sum4 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(sum_udaf()),
             vec![col("age")],
         ));
@@ -1309,25 +1356,25 @@ mod tests {
         let age_asc = Sort::new(col("age"), true, true);
         let name_desc = Sort::new(col("name"), false, true);
         let created_at_desc = Sort::new(col("created_at"), false, true);
-        let max1 = Expr::WindowFunction(WindowFunction::new(
+        let max1 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("name")],
         ))
         .order_by(vec![age_asc.clone(), name_desc.clone()])
         .build()
         .unwrap();
-        let max2 = Expr::WindowFunction(WindowFunction::new(
+        let max2 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("name")],
         ));
-        let min3 = Expr::WindowFunction(WindowFunction::new(
+        let min3 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(min_udaf()),
             vec![col("name")],
         ))
         .order_by(vec![age_asc.clone(), name_desc.clone()])
         .build()
         .unwrap();
-        let sum4 = Expr::WindowFunction(WindowFunction::new(
+        let sum4 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(sum_udaf()),
             vec![col("age")],
         ))
@@ -1687,4 +1734,154 @@ mod tests {
             DataType::List(Arc::new(Field::new("my_union", union_type, true)));
         assert!(!can_hash(&list_union_type));
     }
+
+    #[test]
+    fn test_generate_signature_error_msg_with_parameter_names() {
+        let sig = Signature::one_of(
+            vec![
+                TypeSignature::Exact(vec![DataType::Utf8, DataType::Int64]),
+                TypeSignature::Exact(vec![
+                    DataType::Utf8,
+                    DataType::Int64,
+                    DataType::Int64,
+                ]),
+            ],
+            Volatility::Immutable,
+        )
+        .with_parameter_names(vec![
+            "str".to_string(),
+            "start_pos".to_string(),
+            "length".to_string(),
+        ])
+        .expect("valid parameter names");
+
+        // Generate error message with only 1 argument provided
+        let error_msg =
+            generate_signature_error_message("substr", &sig, &[DataType::Utf8]);
+
+        assert!(
+            error_msg.contains("str: Utf8, start_pos: Int64"),
+            "Expected 'str: Utf8, start_pos: Int64' in error message, got: {error_msg}"
+        );
+        assert!(
+            error_msg.contains("str: Utf8, start_pos: Int64, length: Int64"),
+            "Expected 'str: Utf8, start_pos: Int64, length: Int64' in error message, got: {error_msg}"
+        );
+    }
+
+    #[test]
+    fn test_generate_signature_error_msg_without_parameter_names() {
+        let sig = Signature::one_of(
+            vec![TypeSignature::Any(2), TypeSignature::Any(3)],
+            Volatility::Immutable,
+        );
+
+        let error_msg =
+            generate_signature_error_message("my_func", &sig, &[DataType::Int32]);
+
+        assert!(
+            error_msg.contains("Any, Any"),
+            "Expected 'Any, Any' without parameter names, got: {error_msg}"
+        );
+    }
+
+    #[test]
+    fn test_signature_error_msg_exact() {
+        use insta::assert_snapshot;
+
+        let sig = Signature::one_of(
+            vec![
+                TypeSignature::Exact(vec![DataType::Float64, DataType::Int64]),
+                TypeSignature::Exact(vec![DataType::Float32, DataType::Int64]),
+                TypeSignature::Exact(vec![DataType::Float64]),
+                TypeSignature::Exact(vec![DataType::Float32]),
+            ],
+            Volatility::Immutable,
+        );
+        let msg = generate_signature_error_message(
+            "round",
+            &sig,
+            &[DataType::Float64, DataType::Float64],
+        );
+        assert_snapshot!(msg, @r"
+        No function matches the given name and argument types 'round(Float64, Float64)'. You might need to add explicit type casts.
+        	Candidate functions:
+        	round(Float64, Int64)
+        	round(Float32, Int64)
+        	round(Float64)
+        	round(Float32)
+        ");
+    }
+
+    #[test]
+    fn test_signature_error_msg_coercible() {
+        use datafusion_common::types::NativeType;
+        use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
+        use insta::assert_snapshot;
+
+        let sig = Signature::coercible(
+            vec![
+                Coercion::new_implicit(
+                    TypeSignatureClass::Native(
+                        datafusion_common::types::logical_float64(),
+                    ),
+                    vec![TypeSignatureClass::Numeric],
+                    NativeType::Float64,
+                ),
+                Coercion::new_implicit(
+                    TypeSignatureClass::Native(datafusion_common::types::logical_int64()),
+                    vec![TypeSignatureClass::Integer],
+                    NativeType::Int64,
+                ),
+            ],
+            Volatility::Immutable,
+        );
+        let msg = generate_signature_error_message(
+            "round",
+            &sig,
+            &[DataType::Utf8, DataType::Utf8],
+        );
+        assert_snapshot!(msg, @r"
+        No function matches the given name and argument types 'round(Utf8, Utf8)'. You might need to add explicit type casts.
+        	Candidate functions:
+        	round(Float64, Int64)
+        ");
+    }
+
+    #[test]
+    fn test_signature_error_msg_with_names_coercible() {
+        use datafusion_common::types::NativeType;
+        use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
+        use insta::assert_snapshot;
+
+        let sig = Signature::coercible(
+            vec![
+                Coercion::new_exact(TypeSignatureClass::Native(
+                    datafusion_common::types::logical_string(),
+                )),
+                Coercion::new_exact(TypeSignatureClass::Native(
+                    datafusion_common::types::logical_int64(),
+                )),
+                Coercion::new_implicit(
+                    TypeSignatureClass::Native(datafusion_common::types::logical_int64()),
+                    vec![TypeSignatureClass::Integer],
+                    NativeType::Int64,
+                ),
+            ],
+            Volatility::Immutable,
+        )
+        .with_parameter_names(vec![
+            "string".to_string(),
+            "start_pos".to_string(),
+            "length".to_string(),
+        ])
+        .expect("valid parameter names");
+
+        let msg = generate_signature_error_message("substr", &sig, &[DataType::Int32]);
+        assert_snapshot!(msg, @r"
+        No function matches the given name and argument types 'substr(Int32)'. You might need to add explicit type casts.
+        	Candidate functions:
+        	substr(string: String, start_pos: Int64, length: Int64)
+        ");
+    }
 }
diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs
index 8771b25137cf2..334c1fa2a090b 100644
--- a/datafusion/expr/src/window_frame.rs
+++ b/datafusion/expr/src/window_frame.rs
@@ -24,13 +24,12 @@
 //! - An EXCLUDE clause.
 
 use crate::{expr::Sort, lit};
-use arrow::datatypes::DataType;
 use std::fmt::{self, Formatter};
 use std::hash::Hash;
 
-use datafusion_common::{plan_err, sql_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, plan_err};
+#[cfg(feature = "sql")]
 use sqlparser::ast::{self, ValueWithSpan};
-use sqlparser::parser::ParserError::ParserError;
 
 /// The frame specification determines which output rows are read by an aggregate
 /// window function. The ending frame boundary can be omitted if the `BETWEEN`
@@ -115,8 +114,9 @@ impl fmt::Debug for WindowFrame {
     }
 }
 
+#[cfg(feature = "sql")]
 impl TryFrom<ast::WindowFrame> for WindowFrame {
-    type Error = DataFusionError;
+    type Error = datafusion_common::error::DataFusionError;
 
     fn try_from(value: ast::WindowFrame) -> Result<Self> {
         let start_bound = WindowFrameBound::try_parse(value.start_bound, &value.units)?;
@@ -131,12 +131,10 @@ impl TryFrom<ast::WindowFrame> for WindowFrame {
                     "Invalid window frame: start bound cannot be UNBOUNDED FOLLOWING"
                 )?
             }
-        } else if let WindowFrameBound::Preceding(val) = &end_bound {
-            if val.is_null() {
-                plan_err!(
-                    "Invalid window frame: end bound cannot be UNBOUNDED PRECEDING"
-                )?
-            }
+        } else if let WindowFrameBound::Preceding(val) = &end_bound
+            && val.is_null()
+        {
+            plan_err!("Invalid window frame: end bound cannot be UNBOUNDED PRECEDING")?
         };
 
         let units = value.units.into();
@@ -160,7 +158,7 @@ impl WindowFrame {
                 } else {
                     WindowFrameUnits::Range
                 },
-                start_bound: WindowFrameBound::Preceding(ScalarValue::Null),
+                start_bound: WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
                 end_bound: WindowFrameBound::CurrentRow,
                 causal: strict,
             }
@@ -307,7 +305,6 @@ impl WindowFrame {
 /// 3. CURRENT ROW
 /// 4. `<expr>` FOLLOWING
 /// 5. UNBOUNDED FOLLOWING
-///
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
 pub enum WindowFrameBound {
     /// 1. UNBOUNDED PRECEDING
@@ -343,6 +340,7 @@ impl WindowFrameBound {
 }
 
 impl WindowFrameBound {
+    #[cfg(feature = "sql")]
     fn try_parse(
         value: ast::WindowFrameBound,
         units: &ast::WindowFrameUnits,
@@ -351,26 +349,34 @@ impl WindowFrameBound {
             ast::WindowFrameBound::Preceding(Some(v)) => {
                 Self::Preceding(convert_frame_bound_to_scalar_value(*v, units)?)
             }
-            ast::WindowFrameBound::Preceding(None) => Self::Preceding(ScalarValue::Null),
+            ast::WindowFrameBound::Preceding(None) => {
+                Self::Preceding(ScalarValue::UInt64(None))
+            }
             ast::WindowFrameBound::Following(Some(v)) => {
                 Self::Following(convert_frame_bound_to_scalar_value(*v, units)?)
             }
-            ast::WindowFrameBound::Following(None) => Self::Following(ScalarValue::Null),
+            ast::WindowFrameBound::Following(None) => {
+                Self::Following(ScalarValue::UInt64(None))
+            }
             ast::WindowFrameBound::CurrentRow => Self::CurrentRow,
         })
     }
 }
 
+#[cfg(feature = "sql")]
 fn convert_frame_bound_to_scalar_value(
     v: ast::Expr,
     units: &ast::WindowFrameUnits,
 ) -> Result<ScalarValue> {
+    use arrow::datatypes::DataType;
+    use datafusion_common::exec_err;
     match units {
         // For ROWS and GROUPS we are sure that the ScalarValue must be a non-negative integer ...
         ast::WindowFrameUnits::Rows | ast::WindowFrameUnits::Groups => match v {
-            ast::Expr::Value(ValueWithSpan{value: ast::Value::Number(value, false), span: _}) => {
-                Ok(ScalarValue::try_from_string(value, &DataType::UInt64)?)
-            },
+            ast::Expr::Value(ValueWithSpan {
+                value: ast::Value::Number(value, false),
+                span: _,
+            }) => Ok(ScalarValue::try_from_string(value, &DataType::UInt64)?),
             ast::Expr::Interval(ast::Interval {
                 value,
                 leading_field: None,
@@ -379,11 +385,12 @@ fn convert_frame_bound_to_scalar_value(
                 fractional_seconds_precision: None,
             }) => {
                 let value = match *value {
-                    ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item,
+                    ast::Expr::Value(ValueWithSpan {
+                        value: ast::Value::SingleQuotedString(item),
+                        span: _,
+                    }) => item,
                     e => {
-                        return sql_err!(ParserError(format!(
-                            "INTERVAL expression cannot be {e:?}"
-                        )));
+                        return exec_err!("INTERVAL expression cannot be {e:?}");
                     }
                 };
                 Ok(ScalarValue::try_from_string(value, &DataType::UInt64)?)
@@ -395,18 +402,22 @@ fn convert_frame_bound_to_scalar_value(
         // ... instead for RANGE it could be anything depending on the type of the ORDER BY clause,
         // so we use a ScalarValue::Utf8.
         ast::WindowFrameUnits::Range => Ok(ScalarValue::Utf8(Some(match v {
-            ast::Expr::Value(ValueWithSpan{value: ast::Value::Number(value, false), span: _}) => value,
+            ast::Expr::Value(ValueWithSpan {
+                value: ast::Value::Number(value, false),
+                span: _,
+            }) => value,
             ast::Expr::Interval(ast::Interval {
                 value,
                 leading_field,
                 ..
             }) => {
                 let result = match *value {
-                    ast::Expr::Value(ValueWithSpan{value: ast::Value::SingleQuotedString(item), span: _}) => item,
+                    ast::Expr::Value(ValueWithSpan {
+                        value: ast::Value::SingleQuotedString(item),
+                        span: _,
+                    }) => item,
                     e => {
-                        return sql_err!(ParserError(format!(
-                            "INTERVAL expression cannot be {e:?}"
-                        )));
+                        return exec_err!("INTERVAL expression cannot be {e:?}");
                     }
                 };
                 if let Some(leading_field) = leading_field {
@@ -473,6 +484,7 @@ impl fmt::Display for WindowFrameUnits {
     }
 }
 
+#[cfg(feature = "sql")]
 impl From<ast::WindowFrameUnits> for WindowFrameUnits {
     fn from(value: ast::WindowFrameUnits) -> Self {
         match value {
@@ -570,9 +582,9 @@ mod tests {
     #[test]
     fn test_window_frame_bound_creation() -> Result<()> {
         //  Unbounded
-        test_bound!(Rows, None, ScalarValue::Null);
-        test_bound!(Groups, None, ScalarValue::Null);
-        test_bound!(Range, None, ScalarValue::Null);
+        test_bound!(Rows, None, ScalarValue::UInt64(None));
+        test_bound!(Groups, None, ScalarValue::UInt64(None));
+        test_bound!(Range, None, ScalarValue::UInt64(None));
 
         // Number
         let number = Some(Box::new(ast::Expr::Value(
@@ -596,8 +608,16 @@ mod tests {
             last_field: None,
             leading_precision: None,
         })));
-        test_bound_err!(Rows, number.clone(), "Error during planning: Invalid window frame: frame offsets for ROWS / GROUPS must be non negative integers");
-        test_bound_err!(Groups, number.clone(), "Error during planning: Invalid window frame: frame offsets for ROWS / GROUPS must be non negative integers");
+        test_bound_err!(
+            Rows,
+            number.clone(),
+            "Error during planning: Invalid window frame: frame offsets for ROWS / GROUPS must be non negative integers"
+        );
+        test_bound_err!(
+            Groups,
+            number.clone(),
+            "Error during planning: Invalid window frame: frame offsets for ROWS / GROUPS must be non negative integers"
+        );
         test_bound!(
             Range,
             number.clone(),
diff --git a/datafusion/expr/src/window_state.rs b/datafusion/expr/src/window_state.rs
index 4c37cc6b60137..d7da7a778b011 100644
--- a/datafusion/expr/src/window_state.rs
+++ b/datafusion/expr/src/window_state.rs
@@ -23,18 +23,17 @@ use crate::{WindowFrame, WindowFrameBound, WindowFrameUnits};
 
 use arrow::{
     array::ArrayRef,
-    compute::{concat, concat_batches, SortOptions},
+    compute::{SortOptions, concat, concat_batches},
     datatypes::{DataType, SchemaRef},
     record_batch::RecordBatch,
 };
 use datafusion_common::{
-    internal_err,
+    Result, ScalarValue, internal_datafusion_err, internal_err,
     utils::{compare_rows, get_row_at_idx, search_in_slice},
-    DataFusionError, Result, ScalarValue,
 };
 
 /// Holds the state of evaluating a window function
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct WindowAggState {
     /// The range that we calculate the window function
     pub window_frame_range: Range<usize>,
@@ -90,7 +89,12 @@ impl WindowAggState {
         partition_batch_state: &PartitionBatchState,
     ) -> Result<()> {
         self.last_calculated_index += out_col.len();
-        self.out_col = concat(&[&self.out_col, &out_col])?;
+        // no need to use concat if the current `out_col` is empty
+        if self.out_col.is_empty() {
+            self.out_col = Arc::clone(out_col);
+        } else {
+            self.out_col = concat(&[&self.out_col, &out_col])?;
+        }
         self.n_row_result_missing =
             partition_batch_state.record_batch.num_rows() - self.last_calculated_index;
         self.is_end = partition_batch_state.is_end;
@@ -112,7 +116,7 @@ impl WindowAggState {
 }
 
 /// This object stores the window frame state for use in incremental calculations.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub enum WindowFrameContext {
     /// ROWS frames are inherently stateless.
     Rows(Arc<WindowFrame>),
@@ -165,7 +169,7 @@ impl WindowFrameContext {
             // comparison of rows.
             WindowFrameContext::Range {
                 window_frame,
-                ref mut state,
+                state,
             } => state.calculate_range(
                 window_frame,
                 last_range,
@@ -178,7 +182,7 @@ impl WindowFrameContext {
             // or position of NULLs do not impact inequality.
             WindowFrameContext::Groups {
                 window_frame,
-                ref mut state,
+                state,
             } => state.calculate_range(window_frame, range_columns, length, idx),
         }
     }
@@ -200,14 +204,14 @@ impl WindowFrameContext {
             WindowFrameBound::Following(ScalarValue::UInt64(None)) => {
                 return internal_err!(
                     "Frame start cannot be UNBOUNDED FOLLOWING '{window_frame:?}'"
-                )
+                );
             }
             WindowFrameBound::Following(ScalarValue::UInt64(Some(n))) => {
                 std::cmp::min(idx + n as usize, length)
             }
             // ERRONEOUS FRAMES
             WindowFrameBound::Preceding(_) | WindowFrameBound::Following(_) => {
-                return internal_err!("Rows should be Uint")
+                return internal_err!("Rows should be UInt64");
             }
         };
         let end = match window_frame.end_bound {
@@ -215,7 +219,7 @@ impl WindowFrameContext {
             WindowFrameBound::Preceding(ScalarValue::UInt64(None)) => {
                 return internal_err!(
                     "Frame end cannot be UNBOUNDED PRECEDING '{window_frame:?}'"
-                )
+                );
             }
             WindowFrameBound::Preceding(ScalarValue::UInt64(Some(n))) => {
                 if idx >= n as usize {
@@ -232,7 +236,7 @@ impl WindowFrameContext {
             }
             // ERRONEOUS FRAMES
             WindowFrameBound::Preceding(_) | WindowFrameBound::Following(_) => {
-                return internal_err!("Rows should be Uint")
+                return internal_err!("Rows should be UInt64");
             }
         };
         Ok(Range { start, end })
@@ -240,7 +244,7 @@ impl WindowFrameContext {
 }
 
 /// State for each unique partition determined according to PARTITION BY column(s)
-#[derive(Debug)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct PartitionBatchState {
     /// The record batch belonging to current partition
     pub record_batch: RecordBatch,
@@ -265,6 +269,15 @@ impl PartitionBatchState {
         }
     }
 
+    pub fn new_with_batch(batch: RecordBatch) -> Self {
+        Self {
+            record_batch: batch,
+            most_recent_row: None,
+            is_end: false,
+            n_out_row: 0,
+        }
+    }
+
     pub fn extend(&mut self, batch: &RecordBatch) -> Result<()> {
         self.record_batch =
             concat_batches(&self.record_batch.schema(), [&self.record_batch, batch])?;
@@ -282,7 +295,7 @@ impl PartitionBatchState {
 /// ranges of data while processing RANGE frames.
 /// Attribute `sort_options` stores the column ordering specified by the ORDER
 /// BY clause. This information is used to calculate the range.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub struct WindowFrameStateRange {
     sort_options: Vec<SortOptions>,
 }
@@ -388,8 +401,8 @@ impl WindowFrameStateRange {
                 .sort_options
                 .first()
                 .ok_or_else(|| {
-                    DataFusionError::Internal(
-                        "Sort options unexpectedly absent in a window frame".to_string(),
+                    internal_datafusion_err!(
+                        "Sort options unexpectedly absent in a window frame"
                     )
                 })?
                 .descending;
@@ -454,7 +467,7 @@ impl WindowFrameStateRange {
 
 /// This structure encapsulates all the state information we require as we
 /// scan groups of data while processing window frames.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub struct WindowFrameStateGroups {
     /// A tuple containing group values and the row index where the group ends.
     /// Example: [[1, 1], [1, 1], [2, 1], [2, 1], ...] would correspond to
@@ -675,9 +688,9 @@ mod tests {
         (range_columns, sort_options)
     }
 
-    fn assert_expected(
-        expected_results: Vec<(Range<usize>, usize)>,
+    fn assert_group_ranges(
         window_frame: &Arc<WindowFrame>,
+        expected_results: Vec<(Range<usize>, usize)>,
     ) -> Result<()> {
         let mut window_frame_groups = WindowFrameStateGroups::default();
         let (range_columns, _) = get_test_data();
@@ -697,6 +710,136 @@ mod tests {
         Ok(())
     }
 
+    fn assert_frame_ranges(
+        window_frame: &Arc<WindowFrame>,
+        expected_results: Vec<Range<usize>>,
+    ) -> Result<()> {
+        let mut window_frame_context =
+            WindowFrameContext::new(Arc::clone(window_frame), vec![]);
+        let (range_columns, _) = get_test_data();
+        let n_row = range_columns[0].len();
+        let mut last_range = Range { start: 0, end: 0 };
+        for (idx, expected_range) in expected_results.into_iter().enumerate() {
+            let range = window_frame_context.calculate_range(
+                &range_columns,
+                &last_range,
+                n_row,
+                idx,
+            )?;
+            assert_eq!(range, expected_range);
+            last_range = range;
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn test_default_window_frame_group_boundaries() -> Result<()> {
+        let window_frame = Arc::new(WindowFrame::new(None));
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+                (Range { start: 0, end: 9 }, 0),
+            ],
+        )?;
+
+        assert_frame_ranges(
+            &window_frame,
+            vec![
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+            ],
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_unordered_window_frame_group_boundaries() -> Result<()> {
+        let window_frame = Arc::new(WindowFrame::new(Some(false)));
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range { start: 0, end: 1 }, 0),
+                (Range { start: 0, end: 2 }, 1),
+                (Range { start: 0, end: 4 }, 2),
+                (Range { start: 0, end: 4 }, 2),
+                (Range { start: 0, end: 5 }, 3),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 9 }, 5),
+            ],
+        )?;
+
+        assert_frame_ranges(
+            &window_frame,
+            vec![
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+                Range { start: 0, end: 9 },
+            ],
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ordered_window_frame_group_boundaries() -> Result<()> {
+        let window_frame = Arc::new(WindowFrame::new(Some(true)));
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range { start: 0, end: 1 }, 0),
+                (Range { start: 0, end: 2 }, 1),
+                (Range { start: 0, end: 4 }, 2),
+                (Range { start: 0, end: 4 }, 2),
+                (Range { start: 0, end: 5 }, 3),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 8 }, 4),
+                (Range { start: 0, end: 9 }, 5),
+            ],
+        )?;
+
+        assert_frame_ranges(
+            &window_frame,
+            vec![
+                Range { start: 0, end: 1 },
+                Range { start: 0, end: 2 },
+                Range { start: 0, end: 3 },
+                Range { start: 0, end: 4 },
+                Range { start: 0, end: 5 },
+                Range { start: 0, end: 6 },
+                Range { start: 0, end: 7 },
+                Range { start: 0, end: 8 },
+                Range { start: 0, end: 9 },
+            ],
+        )?;
+
+        Ok(())
+    }
+
     #[test]
     fn test_window_frame_group_boundaries() -> Result<()> {
         let window_frame = Arc::new(WindowFrame::new_bounds(
@@ -704,18 +847,20 @@ mod tests {
             WindowFrameBound::Preceding(ScalarValue::UInt64(Some(1))),
             WindowFrameBound::Following(ScalarValue::UInt64(Some(1))),
         ));
-        let expected_results = vec![
-            (Range { start: 0, end: 2 }, 0),
-            (Range { start: 0, end: 4 }, 1),
-            (Range { start: 1, end: 5 }, 2),
-            (Range { start: 1, end: 5 }, 2),
-            (Range { start: 2, end: 8 }, 3),
-            (Range { start: 4, end: 9 }, 4),
-            (Range { start: 4, end: 9 }, 4),
-            (Range { start: 4, end: 9 }, 4),
-            (Range { start: 5, end: 9 }, 5),
-        ];
-        assert_expected(expected_results, &window_frame)
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range { start: 0, end: 2 }, 0),
+                (Range { start: 0, end: 4 }, 1),
+                (Range { start: 1, end: 5 }, 2),
+                (Range { start: 1, end: 5 }, 2),
+                (Range { start: 2, end: 8 }, 3),
+                (Range { start: 4, end: 9 }, 4),
+                (Range { start: 4, end: 9 }, 4),
+                (Range { start: 4, end: 9 }, 4),
+                (Range { start: 5, end: 9 }, 5),
+            ],
+        )
     }
 
     #[test]
@@ -725,18 +870,20 @@ mod tests {
             WindowFrameBound::Following(ScalarValue::UInt64(Some(1))),
             WindowFrameBound::Following(ScalarValue::UInt64(Some(2))),
         ));
-        let expected_results = vec![
-            (Range::<usize> { start: 1, end: 4 }, 0),
-            (Range::<usize> { start: 2, end: 5 }, 1),
-            (Range::<usize> { start: 4, end: 8 }, 2),
-            (Range::<usize> { start: 4, end: 8 }, 2),
-            (Range::<usize> { start: 5, end: 9 }, 3),
-            (Range::<usize> { start: 8, end: 9 }, 4),
-            (Range::<usize> { start: 8, end: 9 }, 4),
-            (Range::<usize> { start: 8, end: 9 }, 4),
-            (Range::<usize> { start: 9, end: 9 }, 5),
-        ];
-        assert_expected(expected_results, &window_frame)
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range::<usize> { start: 1, end: 4 }, 0),
+                (Range::<usize> { start: 2, end: 5 }, 1),
+                (Range::<usize> { start: 4, end: 8 }, 2),
+                (Range::<usize> { start: 4, end: 8 }, 2),
+                (Range::<usize> { start: 5, end: 9 }, 3),
+                (Range::<usize> { start: 8, end: 9 }, 4),
+                (Range::<usize> { start: 8, end: 9 }, 4),
+                (Range::<usize> { start: 8, end: 9 }, 4),
+                (Range::<usize> { start: 9, end: 9 }, 5),
+            ],
+        )
     }
 
     #[test]
@@ -746,17 +893,19 @@ mod tests {
             WindowFrameBound::Preceding(ScalarValue::UInt64(Some(2))),
             WindowFrameBound::Preceding(ScalarValue::UInt64(Some(1))),
         ));
-        let expected_results = vec![
-            (Range::<usize> { start: 0, end: 0 }, 0),
-            (Range::<usize> { start: 0, end: 1 }, 1),
-            (Range::<usize> { start: 0, end: 2 }, 2),
-            (Range::<usize> { start: 0, end: 2 }, 2),
-            (Range::<usize> { start: 1, end: 4 }, 3),
-            (Range::<usize> { start: 2, end: 5 }, 4),
-            (Range::<usize> { start: 2, end: 5 }, 4),
-            (Range::<usize> { start: 2, end: 5 }, 4),
-            (Range::<usize> { start: 4, end: 8 }, 5),
-        ];
-        assert_expected(expected_results, &window_frame)
+        assert_group_ranges(
+            &window_frame,
+            vec![
+                (Range::<usize> { start: 0, end: 0 }, 0),
+                (Range::<usize> { start: 0, end: 1 }, 1),
+                (Range::<usize> { start: 0, end: 2 }, 2),
+                (Range::<usize> { start: 0, end: 2 }, 2),
+                (Range::<usize> { start: 1, end: 4 }, 3),
+                (Range::<usize> { start: 2, end: 5 }, 4),
+                (Range::<usize> { start: 2, end: 5 }, 4),
+                (Range::<usize> { start: 2, end: 5 }, 4),
+                (Range::<usize> { start: 4, end: 8 }, 5),
+            ],
+        )
     }
 }
diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml
index 29f40df51444c..28e1b2ee5681f 100644
--- a/datafusion/ffi/Cargo.toml
+++ b/datafusion/ffi/Cargo.toml
@@ -30,6 +30,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -37,22 +40,50 @@ workspace = true
 name = "datafusion_ffi"
 crate-type = ["cdylib", "rlib"]
 
+# Note to developers: do *not* add `datafusion` as a dependency in this crate.
+# It increases build times and library binary size for users.
+
 [dependencies]
 abi_stable = "0.11.3"
 arrow = { workspace = true, features = ["ffi"] }
 arrow-schema = { workspace = true }
 async-ffi = { version = "0.5.0", features = ["abi_stable"] }
 async-trait = { workspace = true }
-datafusion = { workspace = true, default-features = false }
+datafusion-catalog = { workspace = true }
+datafusion-common = { workspace = true }
+datafusion-datasource = { workspace = true }
+datafusion-execution = { workspace = true }
+datafusion-expr = { workspace = true }
+datafusion-functions = { workspace = true, optional = true }
+datafusion-functions-aggregate = { workspace = true, optional = true }
+datafusion-functions-aggregate-common = { workspace = true }
+datafusion-functions-table = { workspace = true, optional = true }
+datafusion-functions-window = { workspace = true, optional = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
 datafusion-proto = { workspace = true }
+datafusion-proto-common = { workspace = true }
+datafusion-session = { workspace = true }
 futures = { workspace = true }
 log = { workspace = true }
 prost = { workspace = true }
-semver = "1.0.26"
+semver = "1.0.27"
 tokio = { workspace = true }
 
 [dev-dependencies]
+datafusion = { workspace = true, default-features = false, features = ["sql"] }
+datafusion-functions = { workspace = true }
+datafusion-functions-aggregate = { workspace = true }
+datafusion-functions-aggregate-common = { workspace = true }
+datafusion-functions-window = { workspace = true }
 doc-comment = { workspace = true }
 
 [features]
-integration-tests = []
+integration-tests = [
+    "datafusion-functions",
+    "datafusion-functions-aggregate",
+    "datafusion-functions-table",
+    "datafusion-functions-window",
+]
+tarpaulin_include = [] # Exists only to prevent warnings on stable and still have accurate coverage
diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md
index 48283f4cfdc14..304ebb90f49dd 100644
--- a/datafusion/ffi/README.md
+++ b/datafusion/ffi/README.md
@@ -17,10 +17,10 @@
   under the License.
 -->
 
-# `datafusion-ffi`: Apache DataFusion Foreign Function Interface
+# Apache DataFusion Foreign Function Interface
 
-This crate contains code to allow interoperability of Apache [DataFusion] with
-functions from other libraries and/or [DataFusion] versions using a stable
+This crate contains code to allow interoperability of [Apache DataFusion] with
+functions from other libraries and/or DataFusion versions using a stable
 interface.
 
 One of the limitations of the Rust programming language is that there is no
@@ -28,10 +28,10 @@ stable [Rust ABI] (Application Binary Interface). If a library is compiled with
 one version of the Rust compiler and you attempt to use that library with a
 program compiled by a different Rust compiler, there is no guarantee that you
 can access the data structures. In order to share code between libraries loaded
-at runtime, you need to use Rust's [FFI](Foreign Function Interface (FFI)).
+at runtime, you need to use Rust's [FFI] (Foreign Function Interface (FFI)).
 
-The purpose of this crate is to define interfaces between [DataFusion] libraries
-that will remain stable across different versions of [DataFusion]. This allows
+The purpose of this crate is to define interfaces between DataFusion libraries
+that will remain stable across different versions of DataFusion. This allows
 users to write libraries that can interface between each other at runtime rather
 than require compiling all of the code into a single executable.
 
@@ -46,7 +46,7 @@ See [API Docs] for details and examples.
 Two use cases have been identified for this crate, but they are not intended to
 be all inclusive.
 
-1. `datafusion-python` which will use the FFI to provide external services such
+1. [`datafusion-python`] which will use the FFI to provide external services such
    as a `TableProvider` without needing to re-export the entire `datafusion-python`
    code base. With `datafusion-ffi` these packages do not need `datafusion-python`
    as a dependency at all.
@@ -68,8 +68,8 @@ stable interfaces that closely mirror the Rust native approach. To learn more
 about this approach see the [abi_stable] and [async-ffi] crates.
 
 If you have a library in another language that you wish to interface to
-[DataFusion] the recommendation is to create a Rust wrapper crate to interface
-with your library and then to connect it to [DataFusion] using this crate.
+DataFusion the recommendation is to create a Rust wrapper crate to interface
+with your library and then to connect it to DataFusion using this crate.
 Alternatively, you could use [bindgen] to interface directly to the [FFI] provided
 by this crate, but that is currently not supported.
 
@@ -101,12 +101,104 @@ In this crate we have a variety of structs which closely mimic the behavior of
 their internal counterparts. To see detailed notes about how to use them, see
 the example in `FFI_TableProvider`.
 
-[datafusion]: https://datafusion.apache.org
+## Memory Management
+
+One of the advantages of Rust is the ownership model, which means programmers
+_usually_ do not need to worry about memory management. When interacting with
+foreign code, this is not necessarily true. If you review the structures in
+this crate, you will find that many of them implement the `Drop` trait and
+perform a foreign call.
+
+Suppose we have a `FFI_CatalogProvider`, for example. This struct is safe to
+pass across the FFI boundary, so it may be owned by either the library that
+produces the underlying `CatalogProvider` or by another library that consumes
+it. If we look closer at the `FFI_CatalogProvider`, it has a pointer to
+some private data. That private data is only accessible on the producer's
+side. If you attempt to access it on the consumer's side, you may get
+segmentation faults or other bad behavior. Within that private data is the
+actual `Arc<dyn CatalogProvider`. That `Arc<>` must be freed, but if the
+`FFI_CatalogProvider` is only owned on the consumer's side, we have no way
+to access the private data and free it.
+
+To account for this, most structs in this crate have a `release` method that
+is used to clean up any privately held data. This calls into the producer's
+side, regardless of if it is called on either the local or foreign side.
+Most of the structs in this crate carry atomic reference counts to the
+underlying data, and this is straight forward. Some structs like the
+`FFI_Accumulator` contain an inner `Box<dyn Accumulator>`. The reason for
+this is that we need to be able to mutably access these based on the
+`Accumulator` trait definition. For these we have slightly more complicated
+release code based on whether it is being dropped on the local or foreign side.
+Traits that use a `Box<>` for their underlying data also cannot implement
+`Clone`.
+
+## Library Marker ID
+
+When reviewing the code, many of the structs in this crate contain a call to
+a `library_marker_id`. The purpose of this call is to determine if a library is
+accessing _local_ code through the FFI structs. Consider this example: you have
+a `primary` program that exposes functions to create a schema provider. You
+have a `secondary` library that exposes a function to create a catalog provider
+and the `secondary` library uses the schema provider of the `primary` program.
+From the point of view of the `secondary` library, the schema provider is
+foreign code.
+
+Now when we register the `secondary` library with the `primary` program as a
+catalog provider and we make calls to get a schema, the `secondary` library
+will return a FFI wrapped schema provider back to the `primary` program. In
+this case that schema provider is actually local code to the `primary` program
+except that it is wrapped in the FFI code!
+
+We work around this by the `library_marker_id` calls. What this does is it
+creates a global variable within each library and returns a `usize` address
+of that library. This is guaranteed to be unique for every library that contains
+FFI code. By comparing these `usize` addresses we can determine if a FFI struct
+is local or foreign.
+
+In our example of the schema provider, if you were to make a call in your
+primary program to get the schema provider, it would reach out to the foreign
+catalog provider and send back a `FFI_SchemaProvider` object. By then
+comparing the `library_marker_id` of this object to the `primary` program, we
+determine it is local code. This means it is safe to access the underlying
+private data.
+
+Users of the FFI code should not need to access these function. If you are
+implementing a new FFI struct, then it is recommended that you follow the
+established patterns for converting from FFI struct into the underlying
+traits. Specifically you should use `crate::get_library_marker_id` and in
+your unit tests you should override this with
+`crate::mock_foreign_marker_id` to force your test to create the foreign
+variant of your struct.
+
+## Task Context Provider
+
+Many of the FFI structs in this crate contain a `FFI_TaskContextProvider`. The
+purpose of this struct is to _weakly_ hold a reference to a method to
+access the current `TaskContext`. The reason we need this accessor is because
+we use the `datafusion-proto` crate to serialize and deserialize data across
+the FFI boundary. In particular, we need to serialize and deserialize
+functions using a `TaskContext`, which implements `FunctionRegistry`.
+
+This becomes difficult because we may need to register multiple user defined
+functions, table or catalog providers, etc with a `Session`, and each of these
+will need the `TaskContext` to perform the processing. For this reason we
+cannot simply include the `TaskContext` at the time of registration because
+it would not have knowledge of anything registered afterward.
+
+The `FFI_TaskContextProvider` is built from a trait that provides a method
+to get the current `TaskContext`. `FFI_TaskContextProvider` only holds a
+`Weak` reference to the `TaskContextProvider`, because otherwise we could
+create a circular dependency at runtime. It is imperative that if you use
+these methods that your provider remains valid for the lifetime of the
+calls. The `FFI_TaskContextProvider` is implemented on `SessionContext`
+and it is easy to implement on any struct that implements `Session`.
+
+[apache datafusion]: https://datafusion.apache.org/
 [api docs]: http://docs.rs/datafusion-ffi/latest
 [rust abi]: https://doc.rust-lang.org/reference/abi.html
 [ffi]: https://doc.rust-lang.org/nomicon/ffi.html
 [abi_stable]: https://crates.io/crates/abi_stable
 [async-ffi]: https://crates.io/crates/async-ffi
 [bindgen]: https://crates.io/crates/bindgen
-[datafusion-python]: https://datafusion.apache.org/python/
+[`datafusion-python`]: https://datafusion.apache.org/python/
 [datafusion-contrib]: https://github.com/datafusion-contrib
diff --git a/datafusion/ffi/src/arrow_wrappers.rs b/datafusion/ffi/src/arrow_wrappers.rs
index eb1f34b3d93a0..c83e412310e7f 100644
--- a/datafusion/ffi/src/arrow_wrappers.rs
+++ b/datafusion/ffi/src/arrow_wrappers.rs
@@ -18,11 +18,11 @@
 use std::sync::Arc;
 
 use abi_stable::StableAbi;
-use arrow::{
-    array::{make_array, ArrayRef},
-    datatypes::{Schema, SchemaRef},
-    ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
-};
+use arrow::array::{ArrayRef, make_array};
+use arrow::datatypes::{Schema, SchemaRef};
+use arrow::error::ArrowError;
+use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi, to_ffi};
+use datafusion_common::{DataFusionError, ScalarValue};
 use log::error;
 
 /// This is a wrapper struct around FFI_ArrowSchema simply to indicate
@@ -36,7 +36,9 @@ impl From<SchemaRef> for WrappedSchema {
         let ffi_schema = match FFI_ArrowSchema::try_from(value.as_ref()) {
             Ok(s) => s,
             Err(e) => {
-                error!("Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. {e}");
+                error!(
+                    "Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. {e}"
+                );
                 FFI_ArrowSchema::empty()
             }
         };
@@ -44,16 +46,22 @@ impl From<SchemaRef> for WrappedSchema {
         WrappedSchema(ffi_schema)
     }
 }
+/// Some functions are expected to always succeed, like getting the schema from a TableProvider.
+/// Since going through the FFI always has the potential to fail, we need to catch these errors,
+/// give the user a warning, and return some kind of result. In this case we default to an
+/// empty schema.
+#[cfg(not(tarpaulin_include))]
+fn catch_df_schema_error(e: &ArrowError) -> Schema {
+    error!(
+        "Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {e}"
+    );
+    Schema::empty()
+}
 
 impl From<WrappedSchema> for SchemaRef {
     fn from(value: WrappedSchema) -> Self {
-        let schema = match Schema::try_from(&value.0) {
-            Ok(s) => s,
-            Err(e) => {
-                error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {e}");
-                Schema::empty()
-            }
-        };
+        let schema =
+            Schema::try_from(&value.0).unwrap_or_else(|e| catch_df_schema_error(&e));
         Arc::new(schema)
     }
 }
@@ -71,7 +79,7 @@ pub struct WrappedArray {
 }
 
 impl TryFrom<WrappedArray> for ArrayRef {
-    type Error = arrow::error::ArrowError;
+    type Error = ArrowError;
 
     fn try_from(value: WrappedArray) -> Result<Self, Self::Error> {
         let data = unsafe { from_ffi(value.array, &value.schema.0)? };
@@ -79,3 +87,32 @@ impl TryFrom<WrappedArray> for ArrayRef {
         Ok(make_array(data))
     }
 }
+
+impl TryFrom<&ArrayRef> for WrappedArray {
+    type Error = ArrowError;
+
+    fn try_from(array: &ArrayRef) -> Result<Self, Self::Error> {
+        let (array, schema) = to_ffi(&array.to_data())?;
+        let schema = WrappedSchema(schema);
+
+        Ok(WrappedArray { array, schema })
+    }
+}
+
+impl TryFrom<&ScalarValue> for WrappedArray {
+    type Error = DataFusionError;
+
+    fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
+        let array = value.to_array()?;
+        WrappedArray::try_from(&array).map_err(Into::into)
+    }
+}
+
+impl TryFrom<WrappedArray> for ScalarValue {
+    type Error = DataFusionError;
+
+    fn try_from(value: WrappedArray) -> Result<Self, Self::Error> {
+        let array: ArrayRef = value.try_into()?;
+        ScalarValue::try_from_array(array.as_ref(), 0)
+    }
+}
diff --git a/datafusion/ffi/src/catalog_provider.rs b/datafusion/ffi/src/catalog_provider.rs
index 0886d4749d723..ff588a89a71b3 100644
--- a/datafusion/ffi/src/catalog_provider.rs
+++ b/datafusion/ffi/src/catalog_provider.rs
@@ -15,26 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, ffi::c_void, sync::Arc};
-
-use abi_stable::{
-    std_types::{ROption, RResult, RString, RVec},
-    StableAbi,
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RString, RVec};
+use datafusion_catalog::{CatalogProvider, SchemaProvider};
+use datafusion_common::error::Result;
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
-use datafusion::catalog::{CatalogProvider, SchemaProvider};
 use tokio::runtime::Handle;
 
-use crate::{
-    df_result, rresult_return,
-    schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider},
-};
-
-use datafusion::error::Result;
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider};
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
 
 /// A stable struct for sharing [`CatalogProvider`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_CatalogProvider {
     pub schema_names: unsafe extern "C" fn(provider: &Self) -> RVec<RString>,
 
@@ -43,19 +45,21 @@ pub struct FFI_CatalogProvider {
         name: RString,
     ) -> ROption<FFI_SchemaProvider>,
 
-    pub register_schema:
-        unsafe extern "C" fn(
-            provider: &Self,
-            name: RString,
-            schema: &FFI_SchemaProvider,
-        ) -> RResult<ROption<FFI_SchemaProvider>, RString>,
+    pub register_schema: unsafe extern "C" fn(
+        provider: &Self,
+        name: RString,
+        schema: &FFI_SchemaProvider,
+    )
+        -> FFIResult<ROption<FFI_SchemaProvider>>,
+
+    pub deregister_schema: unsafe extern "C" fn(
+        provider: &Self,
+        name: RString,
+        cascade: bool,
+    )
+        -> FFIResult<ROption<FFI_SchemaProvider>>,
 
-    pub deregister_schema:
-        unsafe extern "C" fn(
-            provider: &Self,
-            name: RString,
-            cascade: bool,
-        ) -> RResult<ROption<FFI_SchemaProvider>, RString>,
+    pub logical_codec: FFI_LogicalExtensionCodec,
 
     /// Used to create a clone on the provider of the execution plan. This should
     /// only need to be called by the receiver of the plan.
@@ -70,6 +74,11 @@ pub struct FFI_CatalogProvider {
     /// Internal data. This is only to be accessed by the provider of the plan.
     /// A [`ForeignCatalogProvider`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_CatalogProvider {}
@@ -82,93 +91,132 @@ struct ProviderPrivateData {
 
 impl FFI_CatalogProvider {
     unsafe fn inner(&self) -> &Arc<dyn CatalogProvider + Send> {
-        let private_data = self.private_data as *const ProviderPrivateData;
-        &(*private_data).provider
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            &(*private_data).provider
+        }
     }
 
     unsafe fn runtime(&self) -> Option<Handle> {
-        let private_data = self.private_data as *const ProviderPrivateData;
-        (*private_data).runtime.clone()
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            (*private_data).runtime.clone()
+        }
     }
 }
 
 unsafe extern "C" fn schema_names_fn_wrapper(
     provider: &FFI_CatalogProvider,
 ) -> RVec<RString> {
-    let names = provider.inner().schema_names();
-    names.into_iter().map(|s| s.into()).collect()
+    unsafe {
+        let names = provider.inner().schema_names();
+        names.into_iter().map(|s| s.into()).collect()
+    }
 }
 
 unsafe extern "C" fn schema_fn_wrapper(
     provider: &FFI_CatalogProvider,
     name: RString,
 ) -> ROption<FFI_SchemaProvider> {
-    let maybe_schema = provider.inner().schema(name.as_str());
-    maybe_schema
-        .map(|schema| FFI_SchemaProvider::new(schema, provider.runtime()))
-        .into()
+    unsafe {
+        let maybe_schema = provider.inner().schema(name.as_str());
+        maybe_schema
+            .map(|schema| {
+                FFI_SchemaProvider::new_with_ffi_codec(
+                    schema,
+                    provider.runtime(),
+                    provider.logical_codec.clone(),
+                )
+            })
+            .into()
+    }
 }
 
 unsafe extern "C" fn register_schema_fn_wrapper(
     provider: &FFI_CatalogProvider,
     name: RString,
     schema: &FFI_SchemaProvider,
-) -> RResult<ROption<FFI_SchemaProvider>, RString> {
-    let runtime = provider.runtime();
-    let provider = provider.inner();
-    let schema = Arc::new(ForeignSchemaProvider::from(schema));
-
-    let returned_schema =
-        rresult_return!(provider.register_schema(name.as_str(), schema))
-            .map(|schema| FFI_SchemaProvider::new(schema, runtime))
-            .into();
-
-    RResult::ROk(returned_schema)
+) -> FFIResult<ROption<FFI_SchemaProvider>> {
+    unsafe {
+        let runtime = provider.runtime();
+        let inner_provider = provider.inner();
+        let schema: Arc<dyn SchemaProvider + Send> = schema.into();
+
+        let returned_schema =
+            rresult_return!(inner_provider.register_schema(name.as_str(), schema))
+                .map(|schema| {
+                    FFI_SchemaProvider::new_with_ffi_codec(
+                        schema,
+                        runtime,
+                        provider.logical_codec.clone(),
+                    )
+                })
+                .into();
+
+        RResult::ROk(returned_schema)
+    }
 }
 
 unsafe extern "C" fn deregister_schema_fn_wrapper(
     provider: &FFI_CatalogProvider,
     name: RString,
     cascade: bool,
-) -> RResult<ROption<FFI_SchemaProvider>, RString> {
-    let runtime = provider.runtime();
-    let provider = provider.inner();
-
-    let maybe_schema =
-        rresult_return!(provider.deregister_schema(name.as_str(), cascade));
-
-    RResult::ROk(
-        maybe_schema
-            .map(|schema| FFI_SchemaProvider::new(schema, runtime))
-            .into(),
-    )
+) -> FFIResult<ROption<FFI_SchemaProvider>> {
+    unsafe {
+        let runtime = provider.runtime();
+        let inner_provider = provider.inner();
+
+        let maybe_schema =
+            rresult_return!(inner_provider.deregister_schema(name.as_str(), cascade));
+
+        RResult::ROk(
+            maybe_schema
+                .map(|schema| {
+                    FFI_SchemaProvider::new_with_ffi_codec(
+                        schema,
+                        runtime,
+                        provider.logical_codec.clone(),
+                    )
+                })
+                .into(),
+        )
+    }
 }
 
 unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_CatalogProvider) {
-    let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!provider.private_data.is_null());
+        let private_data =
+            Box::from_raw(provider.private_data as *mut ProviderPrivateData);
+        drop(private_data);
+        provider.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(
     provider: &FFI_CatalogProvider,
 ) -> FFI_CatalogProvider {
-    let old_private_data = provider.private_data as *const ProviderPrivateData;
-    let runtime = (*old_private_data).runtime.clone();
-
-    let private_data = Box::into_raw(Box::new(ProviderPrivateData {
-        provider: Arc::clone(&(*old_private_data).provider),
-        runtime,
-    })) as *mut c_void;
-
-    FFI_CatalogProvider {
-        schema_names: schema_names_fn_wrapper,
-        schema: schema_fn_wrapper,
-        register_schema: register_schema_fn_wrapper,
-        deregister_schema: deregister_schema_fn_wrapper,
-        clone: clone_fn_wrapper,
-        release: release_fn_wrapper,
-        version: super::version,
-        private_data,
+    unsafe {
+        let old_private_data = provider.private_data as *const ProviderPrivateData;
+        let runtime = (*old_private_data).runtime.clone();
+
+        let private_data = Box::into_raw(Box::new(ProviderPrivateData {
+            provider: Arc::clone(&(*old_private_data).provider),
+            runtime,
+        })) as *mut c_void;
+
+        FFI_CatalogProvider {
+            schema_names: schema_names_fn_wrapper,
+            schema: schema_fn_wrapper,
+            register_schema: register_schema_fn_wrapper,
+            deregister_schema: deregister_schema_fn_wrapper,
+            logical_codec: provider.logical_codec.clone(),
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data,
+            library_marker_id: crate::get_library_marker_id,
+        }
     }
 }
 
@@ -183,7 +231,30 @@ impl FFI_CatalogProvider {
     pub fn new(
         provider: Arc<dyn CatalogProvider + Send>,
         runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
+    ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+        Self::new_with_ffi_codec(provider, runtime, logical_codec)
+    }
+
+    pub fn new_with_ffi_codec(
+        provider: Arc<dyn CatalogProvider + Send>,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
     ) -> Self {
+        if let Some(provider) = provider.as_any().downcast_ref::<ForeignCatalogProvider>()
+        {
+            return provider.0.clone();
+        }
+
         let private_data = Box::new(ProviderPrivateData { provider, runtime });
 
         Self {
@@ -191,10 +262,12 @@ impl FFI_CatalogProvider {
             schema: schema_fn_wrapper,
             register_schema: register_schema_fn_wrapper,
             deregister_schema: deregister_schema_fn_wrapper,
+            logical_codec,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
             version: super::version,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -204,14 +277,19 @@ impl FFI_CatalogProvider {
 /// defined on this struct must only use the stable functions provided in
 /// FFI_CatalogProvider to interact with the foreign table provider.
 #[derive(Debug)]
-pub struct ForeignCatalogProvider(FFI_CatalogProvider);
+pub struct ForeignCatalogProvider(pub(crate) FFI_CatalogProvider);
 
 unsafe impl Send for ForeignCatalogProvider {}
 unsafe impl Sync for ForeignCatalogProvider {}
 
-impl From<&FFI_CatalogProvider> for ForeignCatalogProvider {
+impl From<&FFI_CatalogProvider> for Arc<dyn CatalogProvider + Send> {
     fn from(provider: &FFI_CatalogProvider) -> Self {
-        Self(provider.clone())
+        if (provider.library_marker_id)() == crate::get_library_marker_id() {
+            return Arc::clone(unsafe { provider.inner() });
+        }
+
+        Arc::new(ForeignCatalogProvider(provider.clone()))
+            as Arc<dyn CatalogProvider + Send>
     }
 }
 
@@ -254,7 +332,11 @@ impl CatalogProvider for ForeignCatalogProvider {
         unsafe {
             let schema = match schema.as_any().downcast_ref::<ForeignSchemaProvider>() {
                 Some(s) => &s.0,
-                None => &FFI_SchemaProvider::new(schema, None),
+                None => &FFI_SchemaProvider::new_with_ffi_codec(
+                    schema,
+                    None,
+                    self.0.logical_codec.clone(),
+                ),
             };
             let returned_schema: Option<FFI_SchemaProvider> =
                 df_result!((self.0.register_schema)(&self.0, name.into(), schema))?
@@ -292,15 +374,20 @@ mod tests {
         let prior_schema = Arc::new(MemorySchemaProvider::new());
 
         let catalog = Arc::new(MemoryCatalogProvider::new());
-        assert!(catalog
-            .as_ref()
-            .register_schema("prior_schema", prior_schema)
-            .unwrap()
-            .is_none());
+        assert!(
+            catalog
+                .as_ref()
+                .register_schema("prior_schema", prior_schema)
+                .unwrap()
+                .is_none()
+        );
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
 
-        let ffi_catalog = FFI_CatalogProvider::new(catalog, None);
+        let mut ffi_catalog =
+            FFI_CatalogProvider::new(catalog, None, task_ctx_provider, None);
+        ffi_catalog.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_catalog: ForeignCatalogProvider = (&ffi_catalog).into();
+        let foreign_catalog: Arc<dyn CatalogProvider + Send> = (&ffi_catalog).into();
 
         let prior_schema_names = foreign_catalog.schema_names();
         assert_eq!(prior_schema_names.len(), 1);
@@ -327,7 +414,7 @@ mod tests {
         assert!(returned_schema.is_some());
         assert_eq!(foreign_catalog.schema_names().len(), 1);
 
-        // Retrieve non-existant schema
+        // Retrieve non-existent schema
         let returned_schema = foreign_catalog.schema("prior_schema");
         assert!(returned_schema.is_none());
 
@@ -335,4 +422,32 @@ mod tests {
         let returned_schema = foreign_catalog.schema("second_schema");
         assert!(returned_schema.is_some());
     }
+
+    #[test]
+    fn test_ffi_catalog_provider_local_bypass() {
+        let catalog = Arc::new(MemoryCatalogProvider::new());
+
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+        let mut ffi_catalog =
+            FFI_CatalogProvider::new(catalog, None, task_ctx_provider, None);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_catalog: Arc<dyn CatalogProvider + Send> = (&ffi_catalog).into();
+        assert!(
+            foreign_catalog
+                .as_any()
+                .downcast_ref::<MemoryCatalogProvider>()
+                .is_some()
+        );
+
+        // Verify different library markers generate foreign providers
+        ffi_catalog.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_catalog: Arc<dyn CatalogProvider + Send> = (&ffi_catalog).into();
+        assert!(
+            foreign_catalog
+                .as_any()
+                .downcast_ref::<ForeignCatalogProvider>()
+                .is_some()
+        );
+    }
 }
diff --git a/datafusion/ffi/src/catalog_provider_list.rs b/datafusion/ffi/src/catalog_provider_list.rs
new file mode 100644
index 0000000000000..65574a7ac33de
--- /dev/null
+++ b/datafusion/ffi/src/catalog_provider_list.rs
@@ -0,0 +1,396 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RString, RVec};
+use datafusion_catalog::{CatalogProvider, CatalogProviderList};
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
+};
+use tokio::runtime::Handle;
+
+use crate::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider};
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+
+/// A stable struct for sharing [`CatalogProviderList`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_CatalogProviderList {
+    /// Register a catalog
+    pub register_catalog: unsafe extern "C" fn(
+        &Self,
+        name: RString,
+        catalog: &FFI_CatalogProvider,
+    ) -> ROption<FFI_CatalogProvider>,
+
+    /// List of existing catalogs
+    pub catalog_names: unsafe extern "C" fn(&Self) -> RVec<RString>,
+
+    /// Access a catalog
+    pub catalog:
+        unsafe extern "C" fn(&Self, name: RString) -> ROption<FFI_CatalogProvider>,
+
+    pub logical_codec: FFI_LogicalExtensionCodec,
+
+    /// Used to create a clone on the provider of the execution plan. This should
+    /// only need to be called by the receiver of the plan.
+    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Return the major DataFusion version number of this provider.
+    pub version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// A [`ForeignCatalogProviderList`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_CatalogProviderList {}
+unsafe impl Sync for FFI_CatalogProviderList {}
+
+struct ProviderPrivateData {
+    provider: Arc<dyn CatalogProviderList + Send>,
+    runtime: Option<Handle>,
+}
+
+impl FFI_CatalogProviderList {
+    unsafe fn inner(&self) -> &Arc<dyn CatalogProviderList + Send> {
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            &(*private_data).provider
+        }
+    }
+
+    unsafe fn runtime(&self) -> Option<Handle> {
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            (*private_data).runtime.clone()
+        }
+    }
+}
+
+unsafe extern "C" fn catalog_names_fn_wrapper(
+    provider: &FFI_CatalogProviderList,
+) -> RVec<RString> {
+    unsafe {
+        let names = provider.inner().catalog_names();
+        names.into_iter().map(|s| s.into()).collect()
+    }
+}
+
+unsafe extern "C" fn register_catalog_fn_wrapper(
+    provider: &FFI_CatalogProviderList,
+    name: RString,
+    catalog: &FFI_CatalogProvider,
+) -> ROption<FFI_CatalogProvider> {
+    unsafe {
+        let runtime = provider.runtime();
+        let inner_provider = provider.inner();
+        let catalog: Arc<dyn CatalogProvider + Send> = catalog.into();
+
+        inner_provider
+            .register_catalog(name.into(), catalog)
+            .map(|catalog| {
+                FFI_CatalogProvider::new_with_ffi_codec(
+                    catalog,
+                    runtime,
+                    provider.logical_codec.clone(),
+                )
+            })
+            .into()
+    }
+}
+
+unsafe extern "C" fn catalog_fn_wrapper(
+    provider: &FFI_CatalogProviderList,
+    name: RString,
+) -> ROption<FFI_CatalogProvider> {
+    unsafe {
+        let runtime = provider.runtime();
+        let inner_provider = provider.inner();
+        inner_provider
+            .catalog(name.as_str())
+            .map(|catalog| {
+                FFI_CatalogProvider::new_with_ffi_codec(
+                    catalog,
+                    runtime,
+                    provider.logical_codec.clone(),
+                )
+            })
+            .into()
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_CatalogProviderList) {
+    unsafe {
+        debug_assert!(!provider.private_data.is_null());
+        let private_data =
+            Box::from_raw(provider.private_data as *mut ProviderPrivateData);
+        drop(private_data);
+        provider.private_data = std::ptr::null_mut();
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(
+    provider: &FFI_CatalogProviderList,
+) -> FFI_CatalogProviderList {
+    unsafe {
+        let old_private_data = provider.private_data as *const ProviderPrivateData;
+        let runtime = (*old_private_data).runtime.clone();
+
+        let private_data = Box::into_raw(Box::new(ProviderPrivateData {
+            provider: Arc::clone(&(*old_private_data).provider),
+            runtime,
+        })) as *mut c_void;
+
+        FFI_CatalogProviderList {
+            register_catalog: register_catalog_fn_wrapper,
+            catalog_names: catalog_names_fn_wrapper,
+            catalog: catalog_fn_wrapper,
+            logical_codec: provider.logical_codec.clone(),
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_CatalogProviderList {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl FFI_CatalogProviderList {
+    /// Creates a new [`FFI_CatalogProviderList`].
+    pub fn new(
+        provider: Arc<dyn CatalogProviderList + Send>,
+        runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
+    ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+        Self::new_with_ffi_codec(provider, runtime, logical_codec)
+    }
+    pub fn new_with_ffi_codec(
+        provider: Arc<dyn CatalogProviderList + Send>,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
+    ) -> Self {
+        if let Some(provider) = provider
+            .as_any()
+            .downcast_ref::<ForeignCatalogProviderList>()
+        {
+            return provider.0.clone();
+        }
+
+        let private_data = Box::new(ProviderPrivateData { provider, runtime });
+
+        Self {
+            register_catalog: register_catalog_fn_wrapper,
+            catalog_names: catalog_names_fn_wrapper,
+            catalog: catalog_fn_wrapper,
+            logical_codec,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_CatalogProviderList to interact with the foreign catalog provider list.
+#[derive(Debug)]
+pub struct ForeignCatalogProviderList(FFI_CatalogProviderList);
+
+unsafe impl Send for ForeignCatalogProviderList {}
+unsafe impl Sync for ForeignCatalogProviderList {}
+
+impl From<&FFI_CatalogProviderList> for Arc<dyn CatalogProviderList + Send> {
+    fn from(provider: &FFI_CatalogProviderList) -> Self {
+        if (provider.library_marker_id)() == crate::get_library_marker_id() {
+            return Arc::clone(unsafe { provider.inner() });
+        }
+
+        Arc::new(ForeignCatalogProviderList(provider.clone()))
+            as Arc<dyn CatalogProviderList + Send>
+    }
+}
+
+impl Clone for FFI_CatalogProviderList {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl CatalogProviderList for ForeignCatalogProviderList {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn register_catalog(
+        &self,
+        name: String,
+        catalog: Arc<dyn CatalogProvider>,
+    ) -> Option<Arc<dyn CatalogProvider>> {
+        unsafe {
+            let catalog = match catalog.as_any().downcast_ref::<ForeignCatalogProvider>()
+            {
+                Some(s) => &s.0,
+                None => &FFI_CatalogProvider::new_with_ffi_codec(
+                    catalog,
+                    None,
+                    self.0.logical_codec.clone(),
+                ),
+            };
+
+            (self.0.register_catalog)(&self.0, name.into(), catalog)
+                .map(|s| Arc::new(ForeignCatalogProvider(s)) as Arc<dyn CatalogProvider>)
+                .into()
+        }
+    }
+
+    fn catalog_names(&self) -> Vec<String> {
+        unsafe {
+            (self.0.catalog_names)(&self.0)
+                .into_iter()
+                .map(Into::into)
+                .collect()
+        }
+    }
+
+    fn catalog(&self, name: &str) -> Option<Arc<dyn CatalogProvider>> {
+        unsafe {
+            (self.0.catalog)(&self.0, name.into())
+                .map(|catalog| {
+                    Arc::new(ForeignCatalogProvider(catalog)) as Arc<dyn CatalogProvider>
+                })
+                .into()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datafusion::catalog::{MemoryCatalogProvider, MemoryCatalogProviderList};
+
+    use super::*;
+
+    #[test]
+    fn test_round_trip_ffi_catalog_provider_list() {
+        let prior_catalog = Arc::new(MemoryCatalogProvider::new());
+
+        let catalog_list = Arc::new(MemoryCatalogProviderList::new());
+        assert!(
+            catalog_list
+                .as_ref()
+                .register_catalog("prior_catalog".to_owned(), prior_catalog)
+                .is_none()
+        );
+
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+        let mut ffi_catalog_list =
+            FFI_CatalogProviderList::new(catalog_list, None, task_ctx_provider, None);
+        ffi_catalog_list.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_catalog_list: Arc<dyn CatalogProviderList + Send> =
+            (&ffi_catalog_list).into();
+
+        let prior_catalog_names = foreign_catalog_list.catalog_names();
+        assert_eq!(prior_catalog_names.len(), 1);
+        assert_eq!(prior_catalog_names[0], "prior_catalog");
+
+        // Replace an existing catalog with one of the same name
+        let returned_catalog = foreign_catalog_list.register_catalog(
+            "prior_catalog".to_owned(),
+            Arc::new(MemoryCatalogProvider::new()),
+        );
+        assert!(returned_catalog.is_some());
+        assert_eq!(foreign_catalog_list.catalog_names().len(), 1);
+
+        // Add a new catalog
+        let returned_catalog = foreign_catalog_list.register_catalog(
+            "second_catalog".to_owned(),
+            Arc::new(MemoryCatalogProvider::new()),
+        );
+        assert!(returned_catalog.is_none());
+        assert_eq!(foreign_catalog_list.catalog_names().len(), 2);
+
+        // Retrieve non-existent catalog
+        let returned_catalog = foreign_catalog_list.catalog("non_existent_catalog");
+        assert!(returned_catalog.is_none());
+
+        // Retrieve valid catalog
+        let returned_catalog = foreign_catalog_list.catalog("second_catalog");
+        assert!(returned_catalog.is_some());
+    }
+
+    #[test]
+    fn test_ffi_catalog_provider_list_local_bypass() {
+        let catalog_list = Arc::new(MemoryCatalogProviderList::new());
+
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+        let mut ffi_catalog_list =
+            FFI_CatalogProviderList::new(catalog_list, None, task_ctx_provider, None);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_catalog_list: Arc<dyn CatalogProviderList + Send> =
+            (&ffi_catalog_list).into();
+        assert!(
+            foreign_catalog_list
+                .as_any()
+                .downcast_ref::<MemoryCatalogProviderList>()
+                .is_some()
+        );
+
+        // Verify different library markers generate foreign providers
+        ffi_catalog_list.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_catalog_list: Arc<dyn CatalogProviderList + Send> =
+            (&ffi_catalog_list).into();
+        assert!(
+            foreign_catalog_list
+                .as_any()
+                .downcast_ref::<ForeignCatalogProviderList>()
+                .is_some()
+        );
+    }
+}
diff --git a/datafusion/ffi/src/config/extension_options.rs b/datafusion/ffi/src/config/extension_options.rs
new file mode 100644
index 0000000000000..48fd4e710921a
--- /dev/null
+++ b/datafusion/ffi/src/config/extension_options.rs
@@ -0,0 +1,288 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::collections::HashMap;
+use std::ffi::c_void;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RStr, RString, RVec, Tuple2};
+use datafusion_common::config::{ConfigEntry, ConfigExtension, ExtensionOptions};
+use datafusion_common::{Result, exec_err};
+
+use crate::df_result;
+
+/// A stable struct for sharing [`ExtensionOptions`] across FFI boundaries.
+///
+/// Unlike other FFI structs in this crate, we do not construct a foreign
+/// variant of this object. This is due to the typical method for interacting
+/// with extension options is by creating a local struct of your concrete type.
+/// To support this methodology use the `to_extension` method instead.
+///
+/// When using [`FFI_ExtensionOptions`] with multiple extensions, all extension
+/// values are stored on a single [`FFI_ExtensionOptions`] object. The keys
+/// are stored with the full path prefix to avoid overwriting values when using
+/// multiple extensions.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_ExtensionOptions {
+    /// Return a deep clone of this [`ExtensionOptions`]
+    pub cloned: unsafe extern "C" fn(&Self) -> FFI_ExtensionOptions,
+
+    /// Set the given `key`, `value` pair
+    pub set:
+        unsafe extern "C" fn(&mut Self, key: RStr, value: RStr) -> RResult<(), RString>,
+
+    /// Returns the [`ConfigEntry`] stored in this [`ExtensionOptions`]
+    pub entries: unsafe extern "C" fn(&Self) -> RVec<Tuple2<RString, RString>>,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(&mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the options.
+    pub private_data: *mut c_void,
+}
+
+unsafe impl Send for FFI_ExtensionOptions {}
+unsafe impl Sync for FFI_ExtensionOptions {}
+
+pub struct ExtensionOptionsPrivateData {
+    pub options: HashMap<String, String>,
+}
+
+impl FFI_ExtensionOptions {
+    #[inline]
+    fn inner_mut(&mut self) -> &mut HashMap<String, String> {
+        let private_data = self.private_data as *mut ExtensionOptionsPrivateData;
+        unsafe { &mut (*private_data).options }
+    }
+
+    #[inline]
+    fn inner(&self) -> &HashMap<String, String> {
+        let private_data = self.private_data as *const ExtensionOptionsPrivateData;
+        unsafe { &(*private_data).options }
+    }
+}
+
+unsafe extern "C" fn cloned_fn_wrapper(
+    options: &FFI_ExtensionOptions,
+) -> FFI_ExtensionOptions {
+    options
+        .inner()
+        .iter()
+        .map(|(k, v)| (k.to_owned(), v.to_owned()))
+        .collect::<HashMap<String, String>>()
+        .into()
+}
+
+unsafe extern "C" fn set_fn_wrapper(
+    options: &mut FFI_ExtensionOptions,
+    key: RStr,
+    value: RStr,
+) -> RResult<(), RString> {
+    let _ = options.inner_mut().insert(key.into(), value.into());
+    RResult::ROk(())
+}
+
+unsafe extern "C" fn entries_fn_wrapper(
+    options: &FFI_ExtensionOptions,
+) -> RVec<Tuple2<RString, RString>> {
+    options
+        .inner()
+        .iter()
+        .map(|(key, value)| (key.to_owned().into(), value.to_owned().into()).into())
+        .collect()
+}
+
+unsafe extern "C" fn release_fn_wrapper(options: &mut FFI_ExtensionOptions) {
+    unsafe {
+        debug_assert!(!options.private_data.is_null());
+        let private_data =
+            Box::from_raw(options.private_data as *mut ExtensionOptionsPrivateData);
+        drop(private_data);
+        options.private_data = std::ptr::null_mut();
+    }
+}
+
+impl Default for FFI_ExtensionOptions {
+    fn default() -> Self {
+        HashMap::new().into()
+    }
+}
+
+impl From<HashMap<String, String>> for FFI_ExtensionOptions {
+    fn from(options: HashMap<String, String>) -> Self {
+        let private_data = ExtensionOptionsPrivateData { options };
+
+        Self {
+            cloned: cloned_fn_wrapper,
+            set: set_fn_wrapper,
+            entries: entries_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(Box::new(private_data)) as *mut c_void,
+        }
+    }
+}
+
+impl Drop for FFI_ExtensionOptions {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl Clone for FFI_ExtensionOptions {
+    fn clone(&self) -> Self {
+        unsafe { (self.cloned)(self) }
+    }
+}
+
+impl ConfigExtension for FFI_ExtensionOptions {
+    const PREFIX: &'static str =
+        datafusion_common::config::DATAFUSION_FFI_CONFIG_NAMESPACE;
+}
+
+impl ExtensionOptions for FFI_ExtensionOptions {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn as_any_mut(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn cloned(&self) -> Box<dyn ExtensionOptions> {
+        let ffi_options = unsafe { (self.cloned)(self) };
+        Box::new(ffi_options)
+    }
+
+    fn set(&mut self, key: &str, value: &str) -> Result<()> {
+        if key.split_once('.').is_none() {
+            return exec_err!("Unable to set FFI config value without namespace set");
+        };
+
+        df_result!(unsafe { (self.set)(self, key.into(), value.into()) })
+    }
+
+    fn entries(&self) -> Vec<ConfigEntry> {
+        unsafe {
+            (self.entries)(self)
+                .into_iter()
+                .map(|entry_tuple| ConfigEntry {
+                    key: entry_tuple.0.into(),
+                    value: Some(entry_tuple.1.into()),
+                    description: "ffi_config_options",
+                })
+                .collect()
+        }
+    }
+}
+
+impl FFI_ExtensionOptions {
+    /// Add all of the values in a concrete configuration extension to the
+    /// FFI variant. This is safe to call on either side of the FFI
+    /// boundary.
+    pub fn add_config<C: ConfigExtension>(&mut self, config: &C) -> Result<()> {
+        for entry in config.entries() {
+            if let Some(value) = entry.value {
+                let key = format!("{}.{}", C::PREFIX, entry.key);
+                self.set(key.as_str(), value.as_str())?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Merge another `FFI_ExtensionOptions` configurations into this one.
+    /// This is safe to call on either side of the FFI boundary.
+    pub fn merge(&mut self, other: &FFI_ExtensionOptions) -> Result<()> {
+        for entry in other.entries() {
+            if let Some(value) = entry.value {
+                self.set(entry.key.as_str(), value.as_str())?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Create a concrete extension type from the FFI variant.
+    /// This is safe to call on either side of the FFI boundary.
+    pub fn to_extension<C: ConfigExtension + Default>(&self) -> Result<C> {
+        let mut result = C::default();
+
+        unsafe {
+            for entry in (self.entries)(self) {
+                let key = entry.0.as_str();
+                let value = entry.1.as_str();
+
+                if let Some((prefix, inner_key)) = key.split_once('.')
+                    && prefix == C::PREFIX
+                {
+                    result.set(inner_key, value)?;
+                }
+            }
+        }
+
+        Ok(result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datafusion_common::config::{ConfigExtension, ConfigOptions};
+    use datafusion_common::extensions_options;
+
+    use crate::config::extension_options::FFI_ExtensionOptions;
+
+    // Define a new configuration struct using the `extensions_options` macro
+    extensions_options! {
+       /// My own config options.
+       pub struct MyConfig {
+           /// Should "foo" be replaced by "bar"?
+           pub foo_to_bar: bool, default = true
+
+           /// How many "baz" should be created?
+           pub baz_count: usize, default = 1337
+       }
+    }
+
+    impl ConfigExtension for MyConfig {
+        const PREFIX: &'static str = "my_config";
+    }
+
+    #[test]
+    fn round_trip_ffi_extension_options() {
+        // set up config struct and register extension
+        let mut config = ConfigOptions::default();
+        let mut ffi_options = FFI_ExtensionOptions::default();
+        ffi_options.add_config(&MyConfig::default()).unwrap();
+
+        config.extensions.insert(ffi_options);
+
+        // overwrite config default
+        config.set("my_config.baz_count", "42").unwrap();
+
+        // check config state
+        let returned_ffi_config =
+            config.extensions.get::<FFI_ExtensionOptions>().unwrap();
+        let my_config: MyConfig = returned_ffi_config.to_extension().unwrap();
+
+        // check default value
+        assert!(my_config.foo_to_bar);
+
+        // check overwritten value
+        assert_eq!(my_config.baz_count, 42);
+    }
+}
diff --git a/datafusion/ffi/src/config/mod.rs b/datafusion/ffi/src/config/mod.rs
new file mode 100644
index 0000000000000..850a4dc337336
--- /dev/null
+++ b/datafusion/ffi/src/config/mod.rs
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod extension_options;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RHashMap, RString};
+use datafusion_common::config::{
+    ConfigExtension, ConfigOptions, ExtensionOptions, TableOptions,
+};
+use datafusion_common::{DataFusionError, Result};
+
+use crate::config::extension_options::FFI_ExtensionOptions;
+
+/// A stable struct for sharing [`ConfigOptions`] across FFI boundaries.
+///
+/// Accessing FFI extension options require a slightly different pattern
+/// than local extensions. The trait [`ExtensionOptionsFFIProvider`] can
+/// be used to simplify accessing FFI extensions.
+#[repr(C)]
+#[derive(Debug, Clone, StableAbi)]
+pub struct FFI_ConfigOptions {
+    base_options: RHashMap<RString, RString>,
+
+    extensions: FFI_ExtensionOptions,
+}
+
+impl From<&ConfigOptions> for FFI_ConfigOptions {
+    fn from(options: &ConfigOptions) -> Self {
+        let base_options: RHashMap<RString, RString> = options
+            .entries()
+            .into_iter()
+            .filter_map(|entry| entry.value.map(|value| (entry.key, value)))
+            .map(|(key, value)| (key.into(), value.into()))
+            .collect();
+
+        let mut extensions = FFI_ExtensionOptions::default();
+        for (extension_name, extension) in options.extensions.iter() {
+            for entry in extension.entries().iter() {
+                if let Some(value) = entry.value.as_ref() {
+                    extensions
+                        .set(format!("{extension_name}.{}", entry.key).as_str(), value)
+                        .expect("FFI_ExtensionOptions set should always return Ok");
+                }
+            }
+        }
+
+        Self {
+            base_options,
+            extensions,
+        }
+    }
+}
+
+impl TryFrom<FFI_ConfigOptions> for ConfigOptions {
+    type Error = DataFusionError;
+    fn try_from(ffi_options: FFI_ConfigOptions) -> Result<Self, Self::Error> {
+        let mut options = ConfigOptions::default();
+        options.extensions.insert(ffi_options.extensions);
+
+        for kv_tuple in ffi_options.base_options.iter() {
+            options.set(kv_tuple.0.as_str(), kv_tuple.1.as_str())?;
+        }
+
+        Ok(options)
+    }
+}
+
+pub trait ExtensionOptionsFFIProvider {
+    /// Extract a [`ConfigExtension`]. This method should attempt to first extract
+    /// the extension from the local options when possible. Should that fail, it
+    /// should attempt to extract the FFI options and then convert them to the
+    /// desired [`ConfigExtension`].
+    fn local_or_ffi_extension<C: ConfigExtension + Clone + Default>(&self) -> Option<C>;
+}
+
+impl ExtensionOptionsFFIProvider for ConfigOptions {
+    fn local_or_ffi_extension<C: ConfigExtension + Clone + Default>(&self) -> Option<C> {
+        self.extensions
+            .get::<C>()
+            .map(|v| v.to_owned())
+            .or_else(|| {
+                self.extensions
+                    .get::<FFI_ExtensionOptions>()
+                    .and_then(|ffi_ext| ffi_ext.to_extension().ok())
+            })
+    }
+}
+
+impl ExtensionOptionsFFIProvider for TableOptions {
+    fn local_or_ffi_extension<C: ConfigExtension + Clone + Default>(&self) -> Option<C> {
+        self.extensions
+            .get::<C>()
+            .map(|v| v.to_owned())
+            .or_else(|| {
+                self.extensions
+                    .get::<FFI_ExtensionOptions>()
+                    .and_then(|ffi_ext| ffi_ext.to_extension().ok())
+            })
+    }
+}
+
+/// A stable struct for sharing [`TableOptions`] across FFI boundaries.
+///
+/// Accessing FFI extension options require a slightly different pattern
+/// than local extensions. The trait [`ExtensionOptionsFFIProvider`] can
+/// be used to simplify accessing FFI extensions.
+#[repr(C)]
+#[derive(Debug, Clone, StableAbi)]
+pub struct FFI_TableOptions {
+    base_options: RHashMap<RString, RString>,
+
+    extensions: FFI_ExtensionOptions,
+}
+
+impl From<&TableOptions> for FFI_TableOptions {
+    fn from(options: &TableOptions) -> Self {
+        let base_options: RHashMap<RString, RString> = options
+            .entries()
+            .into_iter()
+            .filter_map(|entry| entry.value.map(|value| (entry.key, value)))
+            .map(|(key, value)| (key.into(), value.into()))
+            .collect();
+
+        let mut extensions = FFI_ExtensionOptions::default();
+        for (extension_name, extension) in options.extensions.iter() {
+            for entry in extension.entries().iter() {
+                if let Some(value) = entry.value.as_ref() {
+                    extensions
+                        .set(format!("{extension_name}.{}", entry.key).as_str(), value)
+                        .expect("FFI_ExtensionOptions set should always return Ok");
+                }
+            }
+        }
+
+        Self {
+            base_options,
+            extensions,
+        }
+    }
+}
+
+impl TryFrom<FFI_TableOptions> for TableOptions {
+    type Error = DataFusionError;
+    fn try_from(ffi_options: FFI_TableOptions) -> Result<Self, Self::Error> {
+        let mut options = TableOptions::default();
+        options.extensions.insert(ffi_options.extensions);
+
+        for kv_tuple in ffi_options.base_options.iter() {
+            options.set(kv_tuple.0.as_str(), kv_tuple.1.as_str())?;
+        }
+
+        Ok(options)
+    }
+}
diff --git a/datafusion/ffi/src/execution/mod.rs b/datafusion/ffi/src/execution/mod.rs
new file mode 100644
index 0000000000000..41107947fff01
--- /dev/null
+++ b/datafusion/ffi/src/execution/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod task_ctx;
+pub mod task_ctx_provider;
+
+pub use task_ctx::FFI_TaskContext;
+pub use task_ctx_provider::FFI_TaskContextProvider;
diff --git a/datafusion/ffi/src/execution/task_ctx.rs b/datafusion/ffi/src/execution/task_ctx.rs
new file mode 100644
index 0000000000000..e0598db0a0170
--- /dev/null
+++ b/datafusion/ffi/src/execution/task_ctx.rs
@@ -0,0 +1,287 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::pmr::ROption;
+use abi_stable::std_types::{RHashMap, RString};
+use datafusion_execution::TaskContext;
+use datafusion_execution::config::SessionConfig;
+use datafusion_execution::runtime_env::RuntimeEnv;
+use datafusion_expr::{
+    AggregateUDF, AggregateUDFImpl, ScalarUDF, ScalarUDFImpl, WindowUDF, WindowUDFImpl,
+};
+
+use crate::session::config::FFI_SessionConfig;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udwf::FFI_WindowUDF;
+
+/// A stable struct for sharing [`TaskContext`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_TaskContext {
+    /// Return the session ID.
+    pub session_id: unsafe extern "C" fn(&Self) -> RString,
+
+    /// Return the task ID.
+    pub task_id: unsafe extern "C" fn(&Self) -> ROption<RString>,
+
+    /// Return the session configuration.
+    pub session_config: unsafe extern "C" fn(&Self) -> FFI_SessionConfig,
+
+    /// Returns a hashmap of names to scalar functions.
+    pub scalar_functions: unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_ScalarUDF>,
+
+    /// Returns a hashmap of names to aggregate functions.
+    pub aggregate_functions:
+        unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_AggregateUDF>,
+
+    /// Returns a hashmap of names to window functions.
+    pub window_functions: unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_WindowUDF>,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// The foreign library should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+struct TaskContextPrivateData {
+    ctx: Arc<TaskContext>,
+}
+
+impl FFI_TaskContext {
+    unsafe fn inner(&self) -> &Arc<TaskContext> {
+        unsafe {
+            let private_data = self.private_data as *const TaskContextPrivateData;
+            &(*private_data).ctx
+        }
+    }
+}
+
+unsafe extern "C" fn session_id_fn_wrapper(ctx: &FFI_TaskContext) -> RString {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.session_id().into()
+    }
+}
+
+unsafe extern "C" fn task_id_fn_wrapper(ctx: &FFI_TaskContext) -> ROption<RString> {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.task_id().map(|s| s.as_str().into()).into()
+    }
+}
+
+unsafe extern "C" fn session_config_fn_wrapper(
+    ctx: &FFI_TaskContext,
+) -> FFI_SessionConfig {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.session_config().into()
+    }
+}
+
+unsafe extern "C" fn scalar_functions_fn_wrapper(
+    ctx: &FFI_TaskContext,
+) -> RHashMap<RString, FFI_ScalarUDF> {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.scalar_functions()
+            .iter()
+            .map(|(name, udf)| (name.to_owned().into(), Arc::clone(udf).into()))
+            .collect()
+    }
+}
+
+unsafe extern "C" fn aggregate_functions_fn_wrapper(
+    ctx: &FFI_TaskContext,
+) -> RHashMap<RString, FFI_AggregateUDF> {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.aggregate_functions()
+            .iter()
+            .map(|(name, udaf)| {
+                (
+                    name.to_owned().into(),
+                    FFI_AggregateUDF::from(Arc::clone(udaf)),
+                )
+            })
+            .collect()
+    }
+}
+
+unsafe extern "C" fn window_functions_fn_wrapper(
+    ctx: &FFI_TaskContext,
+) -> RHashMap<RString, FFI_WindowUDF> {
+    unsafe {
+        let ctx = ctx.inner();
+        ctx.window_functions()
+            .iter()
+            .map(|(name, udf)| {
+                (name.to_owned().into(), FFI_WindowUDF::from(Arc::clone(udf)))
+            })
+            .collect()
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(ctx: &mut FFI_TaskContext) {
+    unsafe {
+        let private_data = Box::from_raw(ctx.private_data as *mut TaskContextPrivateData);
+        drop(private_data);
+    }
+}
+
+impl Drop for FFI_TaskContext {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl From<Arc<TaskContext>> for FFI_TaskContext {
+    fn from(ctx: Arc<TaskContext>) -> Self {
+        let private_data = Box::new(TaskContextPrivateData { ctx });
+
+        FFI_TaskContext {
+            session_id: session_id_fn_wrapper,
+            task_id: task_id_fn_wrapper,
+            session_config: session_config_fn_wrapper,
+            scalar_functions: scalar_functions_fn_wrapper,
+            aggregate_functions: aggregate_functions_fn_wrapper,
+            window_functions: window_functions_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl From<FFI_TaskContext> for Arc<TaskContext> {
+    fn from(ffi_ctx: FFI_TaskContext) -> Self {
+        unsafe {
+            if (ffi_ctx.library_marker_id)() == crate::get_library_marker_id() {
+                return Arc::clone(ffi_ctx.inner());
+            }
+
+            let task_id = (ffi_ctx.task_id)(&ffi_ctx).map(|s| s.to_string()).into();
+            let session_id = (ffi_ctx.session_id)(&ffi_ctx).into();
+            let session_config = (ffi_ctx.session_config)(&ffi_ctx);
+            let session_config =
+                SessionConfig::try_from(&session_config).unwrap_or_default();
+
+            let scalar_functions = (ffi_ctx.scalar_functions)(&ffi_ctx)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udf = <Arc<dyn ScalarUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(ScalarUDF::new_from_shared_impl(udf)),
+                    )
+                })
+                .collect();
+            let aggregate_functions = (ffi_ctx.aggregate_functions)(&ffi_ctx)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udaf = <Arc<dyn AggregateUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(AggregateUDF::new_from_shared_impl(udaf)),
+                    )
+                })
+                .collect();
+            let window_functions = (ffi_ctx.window_functions)(&ffi_ctx)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udwf = <Arc<dyn WindowUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(WindowUDF::new_from_shared_impl(udwf)),
+                    )
+                })
+                .collect();
+
+            let runtime = Arc::new(RuntimeEnv::default());
+
+            Arc::new(TaskContext::new(
+                task_id,
+                session_id,
+                session_config,
+                scalar_functions,
+                aggregate_functions,
+                window_functions,
+                runtime,
+            ))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion::prelude::SessionContext;
+    use datafusion_common::Result;
+    use datafusion_execution::TaskContext;
+
+    use crate::execution::FFI_TaskContext;
+
+    #[test]
+    fn ffi_task_ctx_round_trip() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let original = session_ctx.task_ctx();
+        let mut ffi_task_ctx = FFI_TaskContext::from(Arc::clone(&original));
+        ffi_task_ctx.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_task_ctx: Arc<TaskContext> = ffi_task_ctx.into();
+
+        // TaskContext doesn't implement Eq (nor should it) so check some of the
+        // data is round tripping correctly.
+
+        assert_eq!(
+            original.scalar_functions(),
+            foreign_task_ctx.scalar_functions()
+        );
+        assert_eq!(
+            original.aggregate_functions(),
+            foreign_task_ctx.aggregate_functions()
+        );
+        assert_eq!(
+            original.window_functions(),
+            foreign_task_ctx.window_functions()
+        );
+        assert_eq!(original.task_id(), foreign_task_ctx.task_id());
+        assert_eq!(original.session_id(), foreign_task_ctx.session_id());
+        assert_eq!(
+            format!("{:?}", original.session_config()),
+            format!("{:?}", foreign_task_ctx.session_config())
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/execution/task_ctx_provider.rs b/datafusion/ffi/src/execution/task_ctx_provider.rs
new file mode 100644
index 0000000000000..5d4eaac83975a
--- /dev/null
+++ b/datafusion/ffi/src/execution/task_ctx_provider.rs
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ffi::c_void;
+use std::sync::{Arc, Weak};
+
+use abi_stable::StableAbi;
+use datafusion_common::{DataFusionError, ffi_datafusion_err};
+use datafusion_execution::{TaskContext, TaskContextProvider};
+
+use crate::execution::task_ctx::FFI_TaskContext;
+use crate::util::FFIResult;
+use crate::{df_result, rresult};
+
+/// Struct for accessing the [`TaskContext`]. This method contains a weak
+/// reference, so there are no guarantees that the [`TaskContext`] remains
+/// valid. This is used primarily for protobuf encoding and decoding of
+/// data passed across the FFI boundary. See the crate README for
+/// additional information.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_TaskContextProvider {
+    /// Retrieve the current [`TaskContext`] provided the provider has not
+    /// gone out of scope. This function will return an error if the weakly
+    /// held reference to the underlying [`TaskContextProvider`] is no longer
+    /// available.
+    pub task_ctx: unsafe extern "C" fn(&Self) -> FFIResult<FFI_TaskContext>,
+
+    /// Used to create a clone on the task context accessor. This should
+    /// only need to be called by the receiver of the plan.
+    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// The foreign library should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_TaskContextProvider {}
+unsafe impl Sync for FFI_TaskContextProvider {}
+
+struct TaskContextProviderPrivateData {
+    ctx: Weak<dyn TaskContextProvider>,
+}
+
+impl FFI_TaskContextProvider {
+    unsafe fn inner(&self) -> Option<Arc<TaskContext>> {
+        unsafe {
+            let private_data = self.private_data as *const TaskContextProviderPrivateData;
+            (*private_data).ctx.upgrade().map(|ctx| ctx.task_ctx())
+        }
+    }
+}
+
+unsafe extern "C" fn task_ctx_fn_wrapper(
+    ctx_provider: &FFI_TaskContextProvider,
+) -> FFIResult<FFI_TaskContext> {
+    unsafe {
+        rresult!(
+            ctx_provider
+                .inner()
+                .map(FFI_TaskContext::from)
+                .ok_or_else(|| {
+                    ffi_datafusion_err!(
+                        "TaskContextProvider went out of scope over FFI boundary."
+                    )
+                })
+        )
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(
+    provider: &FFI_TaskContextProvider,
+) -> FFI_TaskContextProvider {
+    unsafe {
+        let private_data = provider.private_data as *const TaskContextProviderPrivateData;
+        let ctx = Weak::clone(&(*private_data).ctx);
+
+        let private_data = Box::new(TaskContextProviderPrivateData { ctx });
+
+        FFI_TaskContextProvider {
+            task_ctx: task_ctx_fn_wrapper,
+            release: release_fn_wrapper,
+            clone: clone_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+unsafe extern "C" fn release_fn_wrapper(ctx: &mut FFI_TaskContextProvider) {
+    unsafe {
+        let private_data =
+            Box::from_raw(ctx.private_data as *mut TaskContextProviderPrivateData);
+        drop(private_data);
+    }
+}
+impl Drop for FFI_TaskContextProvider {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl Clone for FFI_TaskContextProvider {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl From<&Arc<dyn TaskContextProvider>> for FFI_TaskContextProvider {
+    fn from(ctx: &Arc<dyn TaskContextProvider>) -> Self {
+        let ctx = Arc::downgrade(ctx);
+        let private_data = Box::new(TaskContextProviderPrivateData { ctx });
+
+        FFI_TaskContextProvider {
+            task_ctx: task_ctx_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl TryFrom<&FFI_TaskContextProvider> for Arc<TaskContext> {
+    type Error = DataFusionError;
+    fn try_from(ffi_ctx: &FFI_TaskContextProvider) -> Result<Self, Self::Error> {
+        unsafe {
+            if (ffi_ctx.library_marker_id)() == crate::get_library_marker_id() {
+                return ffi_ctx.inner().ok_or_else(|| {
+                    ffi_datafusion_err!(
+                        "TaskContextProvider went out of scope over FFI boundary."
+                    )
+                });
+            }
+
+            df_result!((ffi_ctx.task_ctx)(ffi_ctx)).map(Into::into)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion_common::{DataFusionError, Result};
+    use datafusion_execution::{TaskContext, TaskContextProvider};
+
+    use crate::execution::FFI_TaskContextProvider;
+
+    #[derive(Default)]
+    struct TestCtxProvider {
+        ctx: Arc<TaskContext>,
+    }
+
+    impl TaskContextProvider for TestCtxProvider {
+        fn task_ctx(&self) -> Arc<TaskContext> {
+            Arc::clone(&self.ctx)
+        }
+    }
+
+    #[test]
+    fn ffi_task_context_provider_round_trip() -> Result<()> {
+        let ctx = Arc::new(TestCtxProvider::default()) as Arc<dyn TaskContextProvider>;
+        let mut ffi_ctx_provider: FFI_TaskContextProvider = (&Arc::clone(&ctx)).into();
+        ffi_ctx_provider.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_task_ctx: Arc<TaskContext> = (&ffi_ctx_provider).try_into()?;
+
+        assert_eq!(
+            format!("{foreign_task_ctx:?}"),
+            format!("{:?}", ctx.task_ctx())
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_task_context_provider_clone() -> Result<()> {
+        let ctx = Arc::new(TestCtxProvider::default()) as Arc<dyn TaskContextProvider>;
+        let first_provider: FFI_TaskContextProvider = (&ctx).into();
+
+        let second_provider = first_provider.clone();
+
+        let first_ctx: Arc<TaskContext> = (&first_provider).try_into()?;
+        let second_ctx: Arc<TaskContext> = (&second_provider).try_into()?;
+
+        assert!(Arc::ptr_eq(&first_ctx, &second_ctx));
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_task_context_provider_out_of_scope() {
+        fn create_ffi_out_of_scope() -> FFI_TaskContextProvider {
+            let ctx =
+                Arc::new(TestCtxProvider::default()) as Arc<dyn TaskContextProvider>;
+            (&ctx).into()
+        }
+
+        let provider = create_ffi_out_of_scope();
+        let failed_ctx = <Arc<TaskContext>>::try_from(&provider);
+
+        let Err(DataFusionError::Ffi(_)) = failed_ctx else {
+            panic!("Expected out of scope error")
+        };
+    }
+}
diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs
index 14a0908c47954..eba16d9390787 100644
--- a/datafusion/ffi/src/execution_plan.rs
+++ b/datafusion/ffi/src/execution_plan.rs
@@ -15,29 +15,31 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{ffi::c_void, pin::Pin, sync::Arc};
-
-use abi_stable::{
-    std_types::{RResult, RString, RVec},
-    StableAbi,
-};
-use datafusion::{
-    error::DataFusionError,
-    execution::{SendableRecordBatchStream, TaskContext},
-    physical_plan::{DisplayAs, ExecutionPlan, PlanProperties},
+use std::ffi::c_void;
+use std::pin::Pin;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RString, RVec};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
 };
-use datafusion::{error::Result, physical_plan::DisplayFormatType};
 use tokio::runtime::Handle;
 
-use crate::{
-    df_result, plan_properties::FFI_PlanProperties,
-    record_batch_stream::FFI_RecordBatchStream, rresult,
-};
+use crate::config::FFI_ConfigOptions;
+use crate::execution::FFI_TaskContext;
+use crate::plan_properties::FFI_PlanProperties;
+use crate::record_batch_stream::FFI_RecordBatchStream;
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
 
 /// A stable struct for sharing a [`ExecutionPlan`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_ExecutionPlan {
     /// Return the plan properties
     pub properties: unsafe extern "C" fn(plan: &Self) -> FFI_PlanProperties,
@@ -45,6 +47,9 @@ pub struct FFI_ExecutionPlan {
     /// Return a vector of children plans
     pub children: unsafe extern "C" fn(plan: &Self) -> RVec<FFI_ExecutionPlan>,
 
+    pub with_new_children:
+        unsafe extern "C" fn(plan: &Self, children: RVec<Self>) -> FFIResult<Self>,
+
     /// Return the plan name.
     pub name: unsafe extern "C" fn(plan: &Self) -> RString,
 
@@ -53,7 +58,14 @@ pub struct FFI_ExecutionPlan {
     pub execute: unsafe extern "C" fn(
         plan: &Self,
         partition: usize,
-    ) -> RResult<FFI_RecordBatchStream, RString>,
+        context: FFI_TaskContext,
+    ) -> FFIResult<FFI_RecordBatchStream>,
+
+    pub repartitioned: unsafe extern "C" fn(
+        plan: &Self,
+        target_partitions: usize,
+        config: FFI_ConfigOptions,
+    ) -> FFIResult<ROption<FFI_ExecutionPlan>>,
 
     /// Used to create a clone on the provider of the execution plan. This should
     /// only need to be called by the receiver of the plan.
@@ -65,6 +77,11 @@ pub struct FFI_ExecutionPlan {
     /// Internal data. This is only to be accessed by the provider of the plan.
     /// A [`ForeignExecutionPlan`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_ExecutionPlan {}
@@ -72,73 +89,114 @@ unsafe impl Sync for FFI_ExecutionPlan {}
 
 pub struct ExecutionPlanPrivateData {
     pub plan: Arc<dyn ExecutionPlan>,
-    pub context: Arc<TaskContext>,
     pub runtime: Option<Handle>,
 }
 
+impl FFI_ExecutionPlan {
+    fn inner(&self) -> &Arc<dyn ExecutionPlan> {
+        let private_data = self.private_data as *const ExecutionPlanPrivateData;
+        unsafe { &(*private_data).plan }
+    }
+
+    fn runtime(&self) -> Option<Handle> {
+        let private_data = self.private_data as *const ExecutionPlanPrivateData;
+        unsafe { (*private_data).runtime.clone() }
+    }
+}
+
 unsafe extern "C" fn properties_fn_wrapper(
     plan: &FFI_ExecutionPlan,
 ) -> FFI_PlanProperties {
-    let private_data = plan.private_data as *const ExecutionPlanPrivateData;
-    let plan = &(*private_data).plan;
-
-    plan.properties().into()
+    plan.inner().properties().as_ref().into()
 }
 
 unsafe extern "C" fn children_fn_wrapper(
     plan: &FFI_ExecutionPlan,
 ) -> RVec<FFI_ExecutionPlan> {
-    let private_data = plan.private_data as *const ExecutionPlanPrivateData;
-    let plan = &(*private_data).plan;
-    let ctx = &(*private_data).context;
-    let runtime = &(*private_data).runtime;
+    let runtime = plan.runtime();
+    let plan = plan.inner();
 
     let children: Vec<_> = plan
         .children()
         .into_iter()
-        .map(|child| {
-            FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(ctx), runtime.clone())
-        })
+        .map(|child| FFI_ExecutionPlan::new(Arc::clone(child), runtime.clone()))
         .collect();
 
     children.into()
 }
 
+unsafe extern "C" fn with_new_children_fn_wrapper(
+    plan: &FFI_ExecutionPlan,
+    children: RVec<FFI_ExecutionPlan>,
+) -> FFIResult<FFI_ExecutionPlan> {
+    let runtime = plan.runtime();
+    let plan = Arc::clone(plan.inner());
+    let children = rresult_return!(
+        children
+            .iter()
+            .map(<Arc<dyn ExecutionPlan>>::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+
+    let new_plan = rresult_return!(plan.with_new_children(children));
+
+    RResult::ROk(FFI_ExecutionPlan::new(new_plan, runtime))
+}
+
 unsafe extern "C" fn execute_fn_wrapper(
     plan: &FFI_ExecutionPlan,
     partition: usize,
-) -> RResult<FFI_RecordBatchStream, RString> {
-    let private_data = plan.private_data as *const ExecutionPlanPrivateData;
-    let plan = &(*private_data).plan;
-    let ctx = &(*private_data).context;
-    let runtime = (*private_data).runtime.clone();
-
-    rresult!(plan
-        .execute(partition, Arc::clone(ctx))
-        .map(|rbs| FFI_RecordBatchStream::new(rbs, runtime)))
+    context: FFI_TaskContext,
+) -> FFIResult<FFI_RecordBatchStream> {
+    let ctx = context.into();
+    let runtime = plan.runtime();
+    let plan = plan.inner();
+
+    let _runtime_guard = runtime.as_ref().map(|rt| rt.enter());
+
+    rresult!(
+        plan.execute(partition, ctx)
+            .map(|rbs| FFI_RecordBatchStream::new(rbs, runtime))
+    )
 }
 
-unsafe extern "C" fn name_fn_wrapper(plan: &FFI_ExecutionPlan) -> RString {
-    let private_data = plan.private_data as *const ExecutionPlanPrivateData;
-    let plan = &(*private_data).plan;
+unsafe extern "C" fn repartitioned_fn_wrapper(
+    plan: &FFI_ExecutionPlan,
+    target_partitions: usize,
+    config: FFI_ConfigOptions,
+) -> FFIResult<ROption<FFI_ExecutionPlan>> {
+    let maybe_config: Result<ConfigOptions, DataFusionError> = config.try_into();
+    let config = rresult_return!(maybe_config);
+    let runtime = plan.runtime();
+    let plan = plan.inner();
+
+    rresult!(
+        plan.repartitioned(target_partitions, &config)
+            .map(|maybe_plan| maybe_plan
+                .map(|plan| FFI_ExecutionPlan::new(plan, runtime))
+                .into())
+    )
+}
 
-    plan.name().into()
+unsafe extern "C" fn name_fn_wrapper(plan: &FFI_ExecutionPlan) -> RString {
+    plan.inner().name().into()
 }
 
 unsafe extern "C" fn release_fn_wrapper(plan: &mut FFI_ExecutionPlan) {
-    let private_data = Box::from_raw(plan.private_data as *mut ExecutionPlanPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!plan.private_data.is_null());
+        let private_data =
+            Box::from_raw(plan.private_data as *mut ExecutionPlanPrivateData);
+        drop(private_data);
+        plan.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(plan: &FFI_ExecutionPlan) -> FFI_ExecutionPlan {
-    let private_data = plan.private_data as *const ExecutionPlanPrivateData;
-    let plan_data = &(*private_data);
+    let runtime = plan.runtime();
+    let plan = plan.inner();
 
-    FFI_ExecutionPlan::new(
-        Arc::clone(&plan_data.plan),
-        Arc::clone(&plan_data.context),
-        plan_data.runtime.clone(),
-    )
+    FFI_ExecutionPlan::new(Arc::clone(plan), runtime)
 }
 
 impl Clone for FFI_ExecutionPlan {
@@ -147,27 +205,78 @@ impl Clone for FFI_ExecutionPlan {
     }
 }
 
+/// Helper function to recursively identify any children that do not
+/// have a runtime set but should because they are local to this same
+/// library. This does imply a restriction that all execution plans
+/// in this chain that are within the same library use the same runtime.
+fn pass_runtime_to_children(
+    plan: &Arc<dyn ExecutionPlan>,
+    runtime: &Handle,
+) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+    let mut updated_children = false;
+    let plan_is_foreign = plan.as_any().is::<ForeignExecutionPlan>();
+
+    let children = plan
+        .children()
+        .into_iter()
+        .map(|child| {
+            let child = match pass_runtime_to_children(child, runtime)? {
+                Some(child) => {
+                    updated_children = true;
+                    child
+                }
+                None => Arc::clone(child),
+            };
+
+            // If the parent is foreign and the child is local to this library, then when
+            // we called `children()` above we will get something other than a
+            // `ForeignExecutionPlan`. In this case wrap the plan in a `ForeignExecutionPlan`
+            // because when we call `with_new_children` below it will extract the
+            // FFI plan that does contain the runtime.
+            if plan_is_foreign && !child.as_any().is::<ForeignExecutionPlan>() {
+                updated_children = true;
+                let ffi_child = FFI_ExecutionPlan::new(child, Some(runtime.clone()));
+                let foreign_child = ForeignExecutionPlan::try_from(ffi_child);
+                foreign_child.map(|c| Arc::new(c) as Arc<dyn ExecutionPlan>)
+            } else {
+                Ok(child)
+            }
+        })
+        .collect::<Result<Vec<_>>>()?;
+    if updated_children {
+        Arc::clone(plan).with_new_children(children).map(Some)
+    } else {
+        Ok(None)
+    }
+}
+
 impl FFI_ExecutionPlan {
     /// This function is called on the provider's side.
-    pub fn new(
-        plan: Arc<dyn ExecutionPlan>,
-        context: Arc<TaskContext>,
-        runtime: Option<Handle>,
-    ) -> Self {
-        let private_data = Box::new(ExecutionPlanPrivateData {
-            plan,
-            context,
-            runtime,
-        });
+    pub fn new(mut plan: Arc<dyn ExecutionPlan>, runtime: Option<Handle>) -> Self {
+        // Note to developers: `pass_runtime_to_children` relies on the logic here to
+        // get the underlying FFI plan during calls to `new_with_children`.
+        if let Some(plan) = plan.as_any().downcast_ref::<ForeignExecutionPlan>() {
+            return plan.plan.clone();
+        }
 
+        if let Some(rt) = &runtime
+            && let Ok(Some(p)) = pass_runtime_to_children(&plan, rt)
+        {
+            plan = p;
+        }
+
+        let private_data = Box::new(ExecutionPlanPrivateData { plan, runtime });
         Self {
             properties: properties_fn_wrapper,
             children: children_fn_wrapper,
+            with_new_children: with_new_children_fn_wrapper,
             name: name_fn_wrapper,
             execute: execute_fn_wrapper,
+            repartitioned: repartitioned_fn_wrapper,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -188,7 +297,7 @@ impl Drop for FFI_ExecutionPlan {
 pub struct ForeignExecutionPlan {
     name: String,
     plan: FFI_ExecutionPlan,
-    properties: PlanProperties,
+    properties: Arc<PlanProperties>,
     children: Vec<Arc<dyn ExecutionPlan>>,
 }
 
@@ -205,7 +314,8 @@ impl DisplayAs for ForeignExecutionPlan {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
                 write!(
                     f,
-                    "FFI_ExecutionPlan(number_of_children={})",
+                    "FFI_ExecutionPlan: {}, number_of_children={}",
+                    self.name,
                     self.children.len(),
                 )
             }
@@ -217,26 +327,37 @@ impl DisplayAs for ForeignExecutionPlan {
     }
 }
 
-impl TryFrom<&FFI_ExecutionPlan> for ForeignExecutionPlan {
+impl TryFrom<&FFI_ExecutionPlan> for Arc<dyn ExecutionPlan> {
     type Error = DataFusionError;
 
     fn try_from(plan: &FFI_ExecutionPlan) -> Result<Self, Self::Error> {
+        if (plan.library_marker_id)() == crate::get_library_marker_id() {
+            Ok(Arc::clone(plan.inner()))
+        } else {
+            let plan = ForeignExecutionPlan::try_from(plan.clone())?;
+            Ok(Arc::new(plan))
+        }
+    }
+}
+
+impl TryFrom<FFI_ExecutionPlan> for ForeignExecutionPlan {
+    type Error = DataFusionError;
+    fn try_from(plan: FFI_ExecutionPlan) -> Result<Self, Self::Error> {
         unsafe {
-            let name = (plan.name)(plan).into();
+            let name = (plan.name)(&plan).into();
 
-            let properties: PlanProperties = (plan.properties)(plan).try_into()?;
+            let properties: PlanProperties = (plan.properties)(&plan).try_into()?;
 
-            let children_rvec = (plan.children)(plan);
+            let children_rvec = (plan.children)(&plan);
             let children = children_rvec
                 .iter()
-                .map(ForeignExecutionPlan::try_from)
-                .map(|child| child.map(|c| Arc::new(c) as Arc<dyn ExecutionPlan>))
+                .map(<Arc<dyn ExecutionPlan>>::try_from)
                 .collect::<Result<Vec<_>>>()?;
 
-            Ok(Self {
+            Ok(ForeignExecutionPlan {
                 name,
-                plan: plan.clone(),
-                properties,
+                plan,
+                properties: Arc::new(properties),
                 children,
             })
         }
@@ -252,69 +373,95 @@ impl ExecutionPlan for ForeignExecutionPlan {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.properties
     }
 
     fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        self.children
-            .iter()
-            .map(|p| p as &Arc<dyn ExecutionPlan>)
-            .collect()
+        self.children.iter().collect()
     }
 
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(ForeignExecutionPlan {
-            plan: self.plan.clone(),
-            name: self.name.clone(),
-            children,
-            properties: self.properties.clone(),
-        }))
+        let children = children
+            .into_iter()
+            .map(|child| FFI_ExecutionPlan::new(child, None))
+            .collect::<RVec<_>>();
+        let new_plan =
+            unsafe { df_result!((self.plan.with_new_children)(&self.plan, children))? };
+
+        (&new_plan).try_into()
     }
 
     fn execute(
         &self,
         partition: usize,
-        _context: Arc<TaskContext>,
+        context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
+        let context = FFI_TaskContext::from(context);
         unsafe {
-            df_result!((self.plan.execute)(&self.plan, partition))
+            df_result!((self.plan.execute)(&self.plan, partition, context))
                 .map(|stream| Pin::new(Box::new(stream)) as SendableRecordBatchStream)
         }
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.properties.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
+    fn repartitioned(
+        &self,
+        target_partitions: usize,
+        config: &ConfigOptions,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        let config = config.into();
+        let maybe_plan: Option<FFI_ExecutionPlan> = df_result!(unsafe {
+            (self.plan.repartitioned)(&self.plan, target_partitions, config)
+        })?
+        .into();
+
+        maybe_plan
+            .map(|plan| <Arc<dyn ExecutionPlan>>::try_from(&plan))
+            .transpose()
+    }
 }
 
-#[cfg(test)]
-mod tests {
-    use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion::{
-        physical_plan::{
-            execution_plan::{Boundedness, EmissionType},
-            Partitioning,
-        },
-        prelude::SessionContext,
-    };
+#[cfg(any(test, feature = "integration-tests"))]
+pub mod tests {
+    use datafusion_physical_plan::Partitioning;
+    use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 
     use super::*;
 
     #[derive(Debug)]
     pub struct EmptyExec {
-        props: PlanProperties,
+        props: Arc<PlanProperties>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     }
 
     impl EmptyExec {
         pub fn new(schema: arrow::datatypes::SchemaRef) -> Self {
             Self {
-                props: PlanProperties::new(
-                    datafusion::physical_expr::EquivalenceProperties::new(schema),
+                props: Arc::new(PlanProperties::new(
+                    datafusion_physical_expr::EquivalenceProperties::new(schema),
                     Partitioning::UnknownPartitioning(3),
                     EmissionType::Incremental,
                     Boundedness::Bounded,
-                ),
+                )),
                 children: Vec::default(),
             }
         }
@@ -339,7 +486,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             &self.props
         }
 
@@ -352,7 +499,7 @@ mod tests {
             children: Vec<Arc<dyn ExecutionPlan>>,
         ) -> Result<Arc<dyn ExecutionPlan>> {
             Ok(Arc::new(EmptyExec {
-                props: self.props.clone(),
+                props: Arc::clone(&self.props),
                 children,
             }))
         }
@@ -365,50 +512,68 @@ mod tests {
             unimplemented!()
         }
 
-        fn statistics(&self) -> Result<datafusion::common::Statistics> {
-            unimplemented!()
+        fn apply_expressions(
+            &self,
+            f: &mut dyn FnMut(
+                &dyn datafusion_physical_plan::PhysicalExpr,
+            ) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            // Visit expressions in the output ordering from equivalence properties
+            let mut tnr = TreeNodeRecursion::Continue;
+            if let Some(ordering) = self.props.output_ordering() {
+                for sort_expr in ordering {
+                    tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+                }
+            }
+            Ok(tnr)
         }
     }
 
     #[test]
     fn test_round_trip_ffi_execution_plan() -> Result<()> {
-        let schema =
-            Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
-        let ctx = SessionContext::new();
+        let schema = Arc::new(arrow::datatypes::Schema::new(vec![
+            arrow::datatypes::Field::new("a", arrow::datatypes::DataType::Float32, false),
+        ]));
 
         let original_plan = Arc::new(EmptyExec::new(schema));
         let original_name = original_plan.name().to_string();
 
-        let local_plan = FFI_ExecutionPlan::new(original_plan, ctx.task_ctx(), None);
+        let mut local_plan = FFI_ExecutionPlan::new(original_plan, None);
+        local_plan.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_plan: ForeignExecutionPlan = (&local_plan).try_into()?;
+        let foreign_plan: Arc<dyn ExecutionPlan> = (&local_plan).try_into()?;
 
-        assert!(original_name == foreign_plan.name());
+        assert_eq!(original_name, foreign_plan.name());
 
-        let display = datafusion::physical_plan::display::DisplayableExecutionPlan::new(
-            &foreign_plan,
+        let display = datafusion_physical_plan::display::DisplayableExecutionPlan::new(
+            foreign_plan.as_ref(),
         );
 
         let buf = display.one_line().to_string();
-        assert_eq!(buf.trim(), "FFI_ExecutionPlan(number_of_children=0)");
+        assert_eq!(
+            buf.trim(),
+            "FFI_ExecutionPlan: empty-exec, number_of_children=0"
+        );
 
         Ok(())
     }
 
     #[test]
     fn test_ffi_execution_plan_children() -> Result<()> {
-        let schema =
-            Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
-        let ctx = SessionContext::new();
+        let schema = Arc::new(arrow::datatypes::Schema::new(vec![
+            arrow::datatypes::Field::new("a", arrow::datatypes::DataType::Float32, false),
+        ]));
 
         // Version 1: Adding child to the foreign plan
         let child_plan = Arc::new(EmptyExec::new(Arc::clone(&schema)));
-        let child_local = FFI_ExecutionPlan::new(child_plan, ctx.task_ctx(), None);
-        let child_foreign = Arc::new(ForeignExecutionPlan::try_from(&child_local)?);
+        let mut child_local = FFI_ExecutionPlan::new(child_plan, None);
+        child_local.library_marker_id = crate::mock_foreign_marker_id;
+        let child_foreign = <Arc<dyn ExecutionPlan>>::try_from(&child_local)?;
 
         let parent_plan = Arc::new(EmptyExec::new(Arc::clone(&schema)));
-        let parent_local = FFI_ExecutionPlan::new(parent_plan, ctx.task_ctx(), None);
-        let parent_foreign = Arc::new(ForeignExecutionPlan::try_from(&parent_local)?);
+        let mut parent_local = FFI_ExecutionPlan::new(parent_plan, None);
+        parent_local.library_marker_id = crate::mock_foreign_marker_id;
+        let parent_foreign = <Arc<dyn ExecutionPlan>>::try_from(&parent_local)?;
 
         assert_eq!(parent_foreign.children().len(), 0);
         assert_eq!(child_foreign.children().len(), 0);
@@ -418,16 +583,43 @@ mod tests {
 
         // Version 2: Adding child to the local plan
         let child_plan = Arc::new(EmptyExec::new(Arc::clone(&schema)));
-        let child_local = FFI_ExecutionPlan::new(child_plan, ctx.task_ctx(), None);
-        let child_foreign = Arc::new(ForeignExecutionPlan::try_from(&child_local)?);
+        let mut child_local = FFI_ExecutionPlan::new(child_plan, None);
+        child_local.library_marker_id = crate::mock_foreign_marker_id;
+        let child_foreign = <Arc<dyn ExecutionPlan>>::try_from(&child_local)?;
 
         let parent_plan = Arc::new(EmptyExec::new(Arc::clone(&schema)));
         let parent_plan = parent_plan.with_new_children(vec![child_foreign])?;
-        let parent_local = FFI_ExecutionPlan::new(parent_plan, ctx.task_ctx(), None);
-        let parent_foreign = Arc::new(ForeignExecutionPlan::try_from(&parent_local)?);
+        let mut parent_local = FFI_ExecutionPlan::new(parent_plan, None);
+        parent_local.library_marker_id = crate::mock_foreign_marker_id;
+        let parent_foreign = <Arc<dyn ExecutionPlan>>::try_from(&parent_local)?;
 
         assert_eq!(parent_foreign.children().len(), 1);
 
         Ok(())
     }
+
+    #[test]
+    fn test_ffi_execution_plan_local_bypass() {
+        let schema = Arc::new(arrow::datatypes::Schema::new(vec![
+            arrow::datatypes::Field::new("a", arrow::datatypes::DataType::Float32, false),
+        ]));
+
+        let plan = Arc::new(EmptyExec::new(schema));
+
+        let mut ffi_plan = FFI_ExecutionPlan::new(plan, None);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_plan: Arc<dyn ExecutionPlan> = (&ffi_plan).try_into().unwrap();
+        assert!(foreign_plan.as_any().downcast_ref::<EmptyExec>().is_some());
+
+        // Verify different library markers generate foreign providers
+        ffi_plan.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_plan: Arc<dyn ExecutionPlan> = (&ffi_plan).try_into().unwrap();
+        assert!(
+            foreign_plan
+                .as_any()
+                .downcast_ref::<ForeignExecutionPlan>()
+                .is_some()
+        );
+    }
 }
diff --git a/datafusion/ffi/src/expr/columnar_value.rs b/datafusion/ffi/src/expr/columnar_value.rs
new file mode 100644
index 0000000000000..7ad7645ecb6cf
--- /dev/null
+++ b/datafusion/ffi/src/expr/columnar_value.rs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::StableAbi;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::ColumnarValue;
+
+use crate::arrow_wrappers::WrappedArray;
+
+/// A stable struct for sharing [`ColumnarValue`] across FFI boundaries.
+/// Scalar values are passed as an Arrow array of length 1.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub enum FFI_ColumnarValue {
+    Array(WrappedArray),
+    Scalar(WrappedArray),
+}
+
+impl TryFrom<ColumnarValue> for FFI_ColumnarValue {
+    type Error = DataFusionError;
+    fn try_from(value: ColumnarValue) -> Result<Self, Self::Error> {
+        Ok(match value {
+            ColumnarValue::Array(v) => {
+                FFI_ColumnarValue::Array(WrappedArray::try_from(&v)?)
+            }
+            ColumnarValue::Scalar(v) => {
+                FFI_ColumnarValue::Scalar(WrappedArray::try_from(&v)?)
+            }
+        })
+    }
+}
+
+impl TryFrom<FFI_ColumnarValue> for ColumnarValue {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_ColumnarValue) -> Result<Self, Self::Error> {
+        Ok(match value {
+            FFI_ColumnarValue::Array(v) => ColumnarValue::Array(v.try_into()?),
+            FFI_ColumnarValue::Scalar(v) => {
+                ColumnarValue::Scalar(ScalarValue::try_from(v)?)
+            }
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::create_array;
+    use datafusion_common::{DataFusionError, ScalarValue};
+    use datafusion_expr::ColumnarValue;
+
+    use crate::expr::columnar_value::FFI_ColumnarValue;
+
+    #[test]
+    fn ffi_columnar_value_round_trip() -> Result<(), DataFusionError> {
+        let array = create_array!(Int32, [1, 2, 3, 4, 5]);
+
+        for original in [
+            ColumnarValue::Array(array),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
+        ] {
+            let ffi_variant = FFI_ColumnarValue::try_from(original.clone())?;
+
+            let returned_value = ColumnarValue::try_from(ffi_variant)?;
+
+            assert_eq!(format!("{returned_value:?}"), format!("{original:?}"));
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/expr/distribution.rs b/datafusion/ffi/src/expr/distribution.rs
new file mode 100644
index 0000000000000..b9ebfc2362c7a
--- /dev/null
+++ b/datafusion/ffi/src/expr/distribution.rs
@@ -0,0 +1,210 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::StableAbi;
+use datafusion_common::DataFusionError;
+use datafusion_expr::statistics::{
+    BernoulliDistribution, Distribution, ExponentialDistribution, GaussianDistribution,
+    GenericDistribution, UniformDistribution,
+};
+
+use crate::arrow_wrappers::WrappedArray;
+use crate::expr::interval::FFI_Interval;
+
+/// A stable struct for sharing [`Distribution`] across FFI boundaries.
+/// See ['Distribution'] for the meaning of each variant.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+#[expect(clippy::large_enum_variant)]
+pub enum FFI_Distribution {
+    Uniform(FFI_UniformDistribution),
+    Exponential(FFI_ExponentialDistribution),
+    Gaussian(FFI_GaussianDistribution),
+    Bernoulli(FFI_BernoulliDistribution),
+    Generic(FFI_GenericDistribution),
+}
+
+impl TryFrom<&Distribution> for FFI_Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: &Distribution) -> Result<Self, Self::Error> {
+        match value {
+            Distribution::Uniform(d) => Ok(FFI_Distribution::Uniform(d.try_into()?)),
+            Distribution::Exponential(d) => {
+                Ok(FFI_Distribution::Exponential(d.try_into()?))
+            }
+            Distribution::Gaussian(d) => Ok(FFI_Distribution::Gaussian(d.try_into()?)),
+            Distribution::Bernoulli(d) => Ok(FFI_Distribution::Bernoulli(d.try_into()?)),
+            Distribution::Generic(d) => Ok(FFI_Distribution::Generic(d.try_into()?)),
+        }
+    }
+}
+
+impl TryFrom<FFI_Distribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_Distribution) -> Result<Self, Self::Error> {
+        match value {
+            FFI_Distribution::Uniform(d) => d.try_into(),
+            FFI_Distribution::Exponential(d) => d.try_into(),
+            FFI_Distribution::Gaussian(d) => d.try_into(),
+            FFI_Distribution::Bernoulli(d) => d.try_into(),
+            FFI_Distribution::Generic(d) => d.try_into(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_UniformDistribution {
+    interval: FFI_Interval,
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_ExponentialDistribution {
+    rate: WrappedArray,
+    offset: WrappedArray,
+    positive_tail: bool,
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_GaussianDistribution {
+    mean: WrappedArray,
+    variance: WrappedArray,
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_BernoulliDistribution {
+    p: WrappedArray,
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_GenericDistribution {
+    mean: WrappedArray,
+    median: WrappedArray,
+    variance: WrappedArray,
+    range: FFI_Interval,
+}
+
+impl TryFrom<&UniformDistribution> for FFI_UniformDistribution {
+    type Error = DataFusionError;
+    fn try_from(value: &UniformDistribution) -> Result<Self, Self::Error> {
+        Ok(Self {
+            interval: value.range().try_into()?,
+        })
+    }
+}
+
+impl TryFrom<&ExponentialDistribution> for FFI_ExponentialDistribution {
+    type Error = DataFusionError;
+    fn try_from(value: &ExponentialDistribution) -> Result<Self, Self::Error> {
+        let rate = value.rate().try_into()?;
+        let offset = value.offset().try_into()?;
+
+        Ok(Self {
+            rate,
+            offset,
+            positive_tail: value.positive_tail(),
+        })
+    }
+}
+
+impl TryFrom<&GaussianDistribution> for FFI_GaussianDistribution {
+    type Error = DataFusionError;
+    fn try_from(value: &GaussianDistribution) -> Result<Self, Self::Error> {
+        let mean = value.mean().try_into()?;
+        let variance = value.variance().try_into()?;
+
+        Ok(Self { mean, variance })
+    }
+}
+
+impl TryFrom<&BernoulliDistribution> for FFI_BernoulliDistribution {
+    type Error = DataFusionError;
+    fn try_from(value: &BernoulliDistribution) -> Result<Self, Self::Error> {
+        let p = value.p_value().try_into()?;
+
+        Ok(Self { p })
+    }
+}
+
+impl TryFrom<&GenericDistribution> for FFI_GenericDistribution {
+    type Error = DataFusionError;
+    fn try_from(value: &GenericDistribution) -> Result<Self, Self::Error> {
+        let mean = value.mean().try_into()?;
+        let median = value.median().try_into()?;
+        let variance = value.variance().try_into()?;
+
+        Ok(Self {
+            mean,
+            median,
+            variance,
+            range: value.range().try_into()?,
+        })
+    }
+}
+
+impl TryFrom<FFI_UniformDistribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_UniformDistribution) -> Result<Self, Self::Error> {
+        let interval = value.interval.try_into()?;
+        Distribution::new_uniform(interval)
+    }
+}
+
+impl TryFrom<FFI_ExponentialDistribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_ExponentialDistribution) -> Result<Self, Self::Error> {
+        let rate = value.rate.try_into()?;
+        let offset = value.offset.try_into()?;
+
+        Distribution::new_exponential(rate, offset, value.positive_tail)
+    }
+}
+
+impl TryFrom<FFI_GaussianDistribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_GaussianDistribution) -> Result<Self, Self::Error> {
+        let mean = value.mean.try_into()?;
+        let variance = value.variance.try_into()?;
+
+        Distribution::new_gaussian(mean, variance)
+    }
+}
+
+impl TryFrom<FFI_BernoulliDistribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_BernoulliDistribution) -> Result<Self, Self::Error> {
+        let p = value.p.try_into()?;
+
+        Distribution::new_bernoulli(p)
+    }
+}
+
+impl TryFrom<FFI_GenericDistribution> for Distribution {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_GenericDistribution) -> Result<Self, Self::Error> {
+        let mean = value.mean.try_into()?;
+        let median = value.median.try_into()?;
+        let variance = value.variance.try_into()?;
+        let range = value.range.try_into()?;
+
+        Distribution::new_generic(mean, median, variance, range)
+    }
+}
diff --git a/datafusion/ffi/src/expr/expr_properties.rs b/datafusion/ffi/src/expr/expr_properties.rs
new file mode 100644
index 0000000000000..199a399a6471f
--- /dev/null
+++ b/datafusion/ffi/src/expr/expr_properties.rs
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::StableAbi;
+use arrow_schema::SortOptions;
+use datafusion_common::DataFusionError;
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+
+use crate::expr::interval::FFI_Interval;
+
+/// A stable struct for sharing [`ExprProperties`] across FFI boundaries.
+/// See [`ExprProperties`] for the meaning of each field.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_ExprProperties {
+    sort_properties: FFI_SortProperties,
+    range: FFI_Interval,
+    preserves_lex_ordering: bool,
+}
+
+impl TryFrom<&ExprProperties> for FFI_ExprProperties {
+    type Error = DataFusionError;
+    fn try_from(value: &ExprProperties) -> Result<Self, Self::Error> {
+        let sort_properties = (&value.sort_properties).into();
+        let range = value.range.clone().try_into()?;
+
+        Ok(FFI_ExprProperties {
+            sort_properties,
+            range,
+            preserves_lex_ordering: value.preserves_lex_ordering,
+        })
+    }
+}
+
+impl TryFrom<FFI_ExprProperties> for ExprProperties {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_ExprProperties) -> Result<Self, Self::Error> {
+        let sort_properties = (&value.sort_properties).into();
+        let range = value.range.try_into()?;
+        Ok(ExprProperties {
+            sort_properties,
+            range,
+            preserves_lex_ordering: value.preserves_lex_ordering,
+        })
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub enum FFI_SortProperties {
+    Ordered(FFI_SortOptions),
+    Unordered,
+    Singleton,
+}
+
+impl From<&SortProperties> for FFI_SortProperties {
+    fn from(value: &SortProperties) -> Self {
+        match value {
+            SortProperties::Unordered => FFI_SortProperties::Unordered,
+            SortProperties::Singleton => FFI_SortProperties::Singleton,
+            SortProperties::Ordered(o) => FFI_SortProperties::Ordered(o.into()),
+        }
+    }
+}
+
+impl From<&FFI_SortProperties> for SortProperties {
+    fn from(value: &FFI_SortProperties) -> Self {
+        match value {
+            FFI_SortProperties::Unordered => SortProperties::Unordered,
+            FFI_SortProperties::Singleton => SortProperties::Singleton,
+            FFI_SortProperties::Ordered(o) => SortProperties::Ordered(o.into()),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_SortOptions {
+    pub descending: bool,
+    pub nulls_first: bool,
+}
+
+impl From<&SortOptions> for FFI_SortOptions {
+    fn from(value: &SortOptions) -> Self {
+        Self {
+            descending: value.descending,
+            nulls_first: value.nulls_first,
+        }
+    }
+}
+
+impl From<&FFI_SortOptions> for SortOptions {
+    fn from(value: &FFI_SortOptions) -> Self {
+        Self {
+            descending: value.descending,
+            nulls_first: value.nulls_first,
+        }
+    }
+}
diff --git a/datafusion/ffi/src/expr/interval.rs b/datafusion/ffi/src/expr/interval.rs
new file mode 100644
index 0000000000000..450f3747a57f0
--- /dev/null
+++ b/datafusion/ffi/src/expr/interval.rs
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::StableAbi;
+use datafusion_common::DataFusionError;
+use datafusion_expr::interval_arithmetic::Interval;
+
+use crate::arrow_wrappers::WrappedArray;
+
+/// A stable struct for sharing [`Interval`] across FFI boundaries.
+/// See [`Interval`] for the meaning of each field. Scalar values
+/// are passed as Arrow arrays of length 1.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_Interval {
+    lower: WrappedArray,
+    upper: WrappedArray,
+}
+
+impl TryFrom<&Interval> for FFI_Interval {
+    type Error = DataFusionError;
+    fn try_from(value: &Interval) -> Result<Self, Self::Error> {
+        let upper = value.upper().try_into()?;
+        let lower = value.lower().try_into()?;
+
+        Ok(FFI_Interval { upper, lower })
+    }
+}
+impl TryFrom<Interval> for FFI_Interval {
+    type Error = DataFusionError;
+    fn try_from(value: Interval) -> Result<Self, Self::Error> {
+        FFI_Interval::try_from(&value)
+    }
+}
+
+impl TryFrom<FFI_Interval> for Interval {
+    type Error = DataFusionError;
+    fn try_from(value: FFI_Interval) -> Result<Self, Self::Error> {
+        let upper = value.upper.try_into()?;
+        let lower = value.lower.try_into()?;
+
+        Interval::try_new(lower, upper)
+    }
+}
diff --git a/datafusion/ffi/src/expr/mod.rs b/datafusion/ffi/src/expr/mod.rs
new file mode 100644
index 0000000000000..e11d52a2a1e57
--- /dev/null
+++ b/datafusion/ffi/src/expr/mod.rs
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod columnar_value;
+pub mod distribution;
+pub mod expr_properties;
+pub mod interval;
diff --git a/datafusion/ffi/src/insert_op.rs b/datafusion/ffi/src/insert_op.rs
index 8e8693076cc0e..6471039105e80 100644
--- a/datafusion/ffi/src/insert_op.rs
+++ b/datafusion/ffi/src/insert_op.rs
@@ -16,12 +16,11 @@
 // under the License.
 
 use abi_stable::StableAbi;
-use datafusion::logical_expr::logical_plan::dml::InsertOp;
+use datafusion_expr::logical_plan::dml::InsertOp;
 
 /// FFI safe version of [`InsertOp`].
 #[repr(C)]
 #[derive(StableAbi)]
-#[allow(non_camel_case_types)]
 pub enum FFI_InsertOp {
     Append,
     Overwrite,
diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs
index d877e182a1d89..d7410e8483735 100644
--- a/datafusion/ffi/src/lib.rs
+++ b/datafusion/ffi/src/lib.rs
@@ -19,23 +19,33 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 pub mod arrow_wrappers;
 pub mod catalog_provider;
+pub mod catalog_provider_list;
+pub mod config;
+pub mod execution;
 pub mod execution_plan;
+pub mod expr;
 pub mod insert_op;
+pub mod physical_expr;
 pub mod plan_properties;
+pub mod proto;
 pub mod record_batch_stream;
 pub mod schema_provider;
-pub mod session_config;
+pub mod session;
 pub mod table_provider;
+pub mod table_provider_factory;
 pub mod table_source;
+pub mod udaf;
 pub mod udf;
 pub mod udtf;
+pub mod udwf;
 pub mod util;
 pub mod volatility;
 
@@ -52,5 +62,34 @@ pub extern "C" fn version() -> u64 {
     version.major
 }
 
+static LIBRARY_MARKER: u8 = 0;
+
+/// This utility is used to determine if two FFI structs are within
+/// the same library. It is possible that the interplay between
+/// foreign and local functions calls create one FFI struct that
+/// references another. It is helpful to determine if a foreign
+/// struct in the same library or called from a different one.
+/// If we are in the same library, then we can access the underlying
+/// types directly.
+///
+/// This function works by checking the address of the library
+/// marker. Each library that implements the FFI code will have
+/// a different address for the marker. By checking the marker
+/// address we can determine if a struct is truly foreign or is
+/// actually within the same originating library.
+///
+/// See the crate's `README.md` for additional information.
+pub extern "C" fn get_library_marker_id() -> usize {
+    &LIBRARY_MARKER as *const u8 as usize
+}
+
+/// For unit testing in this crate we need to trick the providers
+/// into thinking we have a foreign call. We do this by overwriting
+/// their `library_marker_id` function to return a different value.
+#[cfg(test)]
+pub(crate) extern "C" fn mock_foreign_marker_id() -> usize {
+    get_library_marker_id() + 1
+}
+
 #[cfg(doctest)]
 doc_comment::doctest!("../README.md", readme_example_test);
diff --git a/datafusion/ffi/src/physical_expr/mod.rs b/datafusion/ffi/src/physical_expr/mod.rs
new file mode 100644
index 0000000000000..189a1e478217e
--- /dev/null
+++ b/datafusion/ffi/src/physical_expr/mod.rs
@@ -0,0 +1,992 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub(crate) mod partitioning;
+pub(crate) mod sort;
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::fmt::{Display, Formatter};
+use std::hash::{DefaultHasher, Hash, Hasher};
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RString, RVec};
+use arrow::array::{ArrayRef, BooleanArray, RecordBatch};
+use arrow::datatypes::SchemaRef;
+use arrow_schema::ffi::FFI_ArrowSchema;
+use arrow_schema::{DataType, Field, FieldRef, Schema};
+use datafusion_common::{Result, ffi_datafusion_err};
+use datafusion_expr::ColumnarValue;
+use datafusion_expr::interval_arithmetic::Interval;
+use datafusion_expr::sort_properties::ExprProperties;
+use datafusion_expr::statistics::Distribution;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::physical_expr::fmt_sql;
+
+use crate::arrow_wrappers::{WrappedArray, WrappedSchema};
+use crate::expr::columnar_value::FFI_ColumnarValue;
+use crate::expr::distribution::FFI_Distribution;
+use crate::expr::expr_properties::FFI_ExprProperties;
+use crate::expr::interval::FFI_Interval;
+use crate::record_batch_stream::{
+    record_batch_to_wrapped_array, wrapped_array_to_record_batch,
+};
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_PhysicalExpr {
+    pub data_type: unsafe extern "C" fn(
+        &Self,
+        input_schema: WrappedSchema,
+    ) -> FFIResult<WrappedSchema>,
+
+    pub nullable:
+        unsafe extern "C" fn(&Self, input_schema: WrappedSchema) -> FFIResult<bool>,
+
+    pub evaluate:
+        unsafe extern "C" fn(&Self, batch: WrappedArray) -> FFIResult<FFI_ColumnarValue>,
+
+    pub return_field: unsafe extern "C" fn(
+        &Self,
+        input_schema: WrappedSchema,
+    ) -> FFIResult<WrappedSchema>,
+
+    pub evaluate_selection: unsafe extern "C" fn(
+        &Self,
+        batch: WrappedArray,
+        selection: WrappedArray,
+    ) -> FFIResult<FFI_ColumnarValue>,
+
+    pub children: unsafe extern "C" fn(&Self) -> RVec<FFI_PhysicalExpr>,
+
+    pub new_with_children:
+        unsafe extern "C" fn(&Self, children: &RVec<FFI_PhysicalExpr>) -> FFIResult<Self>,
+
+    pub evaluate_bounds: unsafe extern "C" fn(
+        &Self,
+        children: RVec<FFI_Interval>,
+    ) -> FFIResult<FFI_Interval>,
+
+    pub propagate_constraints:
+        unsafe extern "C" fn(
+            &Self,
+            interval: FFI_Interval,
+            children: RVec<FFI_Interval>,
+        ) -> FFIResult<ROption<RVec<FFI_Interval>>>,
+
+    pub evaluate_statistics: unsafe extern "C" fn(
+        &Self,
+        children: RVec<FFI_Distribution>,
+    ) -> FFIResult<FFI_Distribution>,
+
+    pub propagate_statistics:
+        unsafe extern "C" fn(
+            &Self,
+            parent: FFI_Distribution,
+            children: RVec<FFI_Distribution>,
+        ) -> FFIResult<ROption<RVec<FFI_Distribution>>>,
+
+    pub get_properties: unsafe extern "C" fn(
+        &Self,
+        children: RVec<FFI_ExprProperties>,
+    ) -> FFIResult<FFI_ExprProperties>,
+
+    pub fmt_sql: unsafe extern "C" fn(&Self) -> FFIResult<RString>,
+
+    pub snapshot: unsafe extern "C" fn(&Self) -> FFIResult<ROption<FFI_PhysicalExpr>>,
+
+    pub snapshot_generation: unsafe extern "C" fn(&Self) -> u64,
+
+    pub is_volatile_node: unsafe extern "C" fn(&Self) -> bool,
+
+    // Display trait
+    pub display: unsafe extern "C" fn(&Self) -> RString,
+
+    // Hash trait
+    pub hash: unsafe extern "C" fn(&Self) -> u64,
+
+    /// Used to create a clone on the provider of the execution plan. This should
+    /// only need to be called by the receiver of the plan.
+    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Return the major DataFusion version number of this provider.
+    pub version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// A [`ForeignPhysicalExpr`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_PhysicalExpr {}
+unsafe impl Sync for FFI_PhysicalExpr {}
+
+impl FFI_PhysicalExpr {
+    fn inner(&self) -> &Arc<dyn PhysicalExpr> {
+        unsafe {
+            let private_data = self.private_data as *const PhysicalExprPrivateData;
+            &(*private_data).expr
+        }
+    }
+}
+
+struct PhysicalExprPrivateData {
+    expr: Arc<dyn PhysicalExpr>,
+}
+
+unsafe extern "C" fn data_type_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    input_schema: WrappedSchema,
+) -> FFIResult<WrappedSchema> {
+    let expr = expr.inner();
+    let schema: SchemaRef = input_schema.into();
+    let data_type = expr
+        .data_type(&schema)
+        .and_then(|dt| FFI_ArrowSchema::try_from(dt).map_err(Into::into))
+        .map(WrappedSchema);
+    rresult!(data_type)
+}
+
+unsafe extern "C" fn nullable_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    input_schema: WrappedSchema,
+) -> FFIResult<bool> {
+    let expr = expr.inner();
+    let schema: SchemaRef = input_schema.into();
+    rresult!(expr.nullable(&schema))
+}
+
+unsafe extern "C" fn evaluate_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    batch: WrappedArray,
+) -> FFIResult<FFI_ColumnarValue> {
+    let batch = rresult_return!(wrapped_array_to_record_batch(batch));
+    rresult!(
+        expr.inner()
+            .evaluate(&batch)
+            .and_then(FFI_ColumnarValue::try_from)
+    )
+}
+
+unsafe extern "C" fn return_field_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    input_schema: WrappedSchema,
+) -> FFIResult<WrappedSchema> {
+    let expr = expr.inner();
+    let schema: SchemaRef = input_schema.into();
+    rresult!(
+        expr.return_field(&schema)
+            .and_then(|f| FFI_ArrowSchema::try_from(&f).map_err(Into::into))
+            .map(WrappedSchema)
+    )
+}
+
+unsafe extern "C" fn evaluate_selection_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    batch: WrappedArray,
+    selection: WrappedArray,
+) -> FFIResult<FFI_ColumnarValue> {
+    let batch = rresult_return!(wrapped_array_to_record_batch(batch));
+    let selection: ArrayRef = rresult_return!(selection.try_into());
+    let selection = rresult_return!(
+        selection
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .ok_or(ffi_datafusion_err!("Unexpected selection array type"))
+    );
+    rresult!(
+        expr.inner()
+            .evaluate_selection(&batch, selection)
+            .and_then(FFI_ColumnarValue::try_from)
+    )
+}
+
+unsafe extern "C" fn children_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+) -> RVec<FFI_PhysicalExpr> {
+    let expr = expr.inner();
+    let children = expr.children();
+    children
+        .into_iter()
+        .map(|child| FFI_PhysicalExpr::from(Arc::clone(child)))
+        .collect()
+}
+
+unsafe extern "C" fn new_with_children_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    children: &RVec<FFI_PhysicalExpr>,
+) -> FFIResult<FFI_PhysicalExpr> {
+    let expr = Arc::clone(expr.inner());
+    let children = children.iter().map(Into::into).collect::<Vec<_>>();
+    rresult!(expr.with_new_children(children).map(FFI_PhysicalExpr::from))
+}
+
+unsafe extern "C" fn evaluate_bounds_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    children: RVec<FFI_Interval>,
+) -> FFIResult<FFI_Interval> {
+    let expr = expr.inner();
+    let children = rresult_return!(
+        children
+            .into_iter()
+            .map(Interval::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+    let children_borrowed = children.iter().collect::<Vec<_>>();
+
+    rresult!(
+        expr.evaluate_bounds(&children_borrowed)
+            .and_then(FFI_Interval::try_from)
+    )
+}
+
+unsafe extern "C" fn propagate_constraints_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    interval: FFI_Interval,
+    children: RVec<FFI_Interval>,
+) -> FFIResult<ROption<RVec<FFI_Interval>>> {
+    let expr = expr.inner();
+    let interval = rresult_return!(Interval::try_from(interval));
+    let children = rresult_return!(
+        children
+            .into_iter()
+            .map(Interval::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+    let children_borrowed = children.iter().collect::<Vec<_>>();
+
+    let result =
+        rresult_return!(expr.propagate_constraints(&interval, &children_borrowed));
+
+    let result = rresult_return!(
+        result
+            .map(|intervals| intervals
+                .into_iter()
+                .map(FFI_Interval::try_from)
+                .collect::<Result<RVec<_>>>())
+            .transpose()
+    );
+
+    RResult::ROk(result.into())
+}
+
+unsafe extern "C" fn evaluate_statistics_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    children: RVec<FFI_Distribution>,
+) -> FFIResult<FFI_Distribution> {
+    let expr = expr.inner();
+    let children = rresult_return!(
+        children
+            .into_iter()
+            .map(Distribution::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+    let children_borrowed = children.iter().collect::<Vec<_>>();
+    rresult!(
+        expr.evaluate_statistics(&children_borrowed)
+            .and_then(|dist| FFI_Distribution::try_from(&dist))
+    )
+}
+
+unsafe extern "C" fn propagate_statistics_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    parent: FFI_Distribution,
+    children: RVec<FFI_Distribution>,
+) -> FFIResult<ROption<RVec<FFI_Distribution>>> {
+    let expr = expr.inner();
+    let parent = rresult_return!(Distribution::try_from(parent));
+    let children = rresult_return!(
+        children
+            .into_iter()
+            .map(Distribution::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+    let children_borrowed = children.iter().collect::<Vec<_>>();
+
+    let result = rresult_return!(expr.propagate_statistics(&parent, &children_borrowed));
+    let result = rresult_return!(
+        result
+            .map(|dists| dists
+                .iter()
+                .map(FFI_Distribution::try_from)
+                .collect::<Result<RVec<_>>>())
+            .transpose()
+    );
+
+    RResult::ROk(result.into())
+}
+
+unsafe extern "C" fn get_properties_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+    children: RVec<FFI_ExprProperties>,
+) -> FFIResult<FFI_ExprProperties> {
+    let expr = expr.inner();
+    let children = rresult_return!(
+        children
+            .into_iter()
+            .map(ExprProperties::try_from)
+            .collect::<Result<Vec<_>>>()
+    );
+    rresult!(
+        expr.get_properties(&children)
+            .and_then(|p| FFI_ExprProperties::try_from(&p))
+    )
+}
+
+unsafe extern "C" fn fmt_sql_fn_wrapper(expr: &FFI_PhysicalExpr) -> FFIResult<RString> {
+    let expr = expr.inner();
+    let result = fmt_sql(expr.as_ref()).to_string();
+    RResult::ROk(result.into())
+}
+
+unsafe extern "C" fn snapshot_fn_wrapper(
+    expr: &FFI_PhysicalExpr,
+) -> FFIResult<ROption<FFI_PhysicalExpr>> {
+    let expr = expr.inner();
+    rresult!(
+        expr.snapshot()
+            .map(|snapshot| snapshot.map(FFI_PhysicalExpr::from).into())
+    )
+}
+
+unsafe extern "C" fn snapshot_generation_fn_wrapper(expr: &FFI_PhysicalExpr) -> u64 {
+    let expr = expr.inner();
+    expr.snapshot_generation()
+}
+
+unsafe extern "C" fn is_volatile_node_fn_wrapper(expr: &FFI_PhysicalExpr) -> bool {
+    let expr = expr.inner();
+    expr.is_volatile_node()
+}
+unsafe extern "C" fn display_fn_wrapper(expr: &FFI_PhysicalExpr) -> RString {
+    let expr = expr.inner();
+    format!("{expr}").into()
+}
+
+unsafe extern "C" fn hash_fn_wrapper(expr: &FFI_PhysicalExpr) -> u64 {
+    let expr = expr.inner();
+    let mut hasher = DefaultHasher::new();
+    expr.hash(&mut hasher);
+    hasher.finish()
+}
+
+unsafe extern "C" fn release_fn_wrapper(expr: &mut FFI_PhysicalExpr) {
+    unsafe {
+        debug_assert!(!expr.private_data.is_null());
+        let private_data =
+            Box::from_raw(expr.private_data as *mut PhysicalExprPrivateData);
+        drop(private_data);
+        expr.private_data = std::ptr::null_mut();
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(expr: &FFI_PhysicalExpr) -> FFI_PhysicalExpr {
+    unsafe {
+        let old_private_data = expr.private_data as *const PhysicalExprPrivateData;
+
+        let private_data = Box::into_raw(Box::new(PhysicalExprPrivateData {
+            expr: Arc::clone(&(*old_private_data).expr),
+        })) as *mut c_void;
+
+        FFI_PhysicalExpr {
+            data_type: data_type_fn_wrapper,
+            nullable: nullable_fn_wrapper,
+            evaluate: evaluate_fn_wrapper,
+            return_field: return_field_fn_wrapper,
+            evaluate_selection: evaluate_selection_fn_wrapper,
+            children: children_fn_wrapper,
+            new_with_children: new_with_children_fn_wrapper,
+            evaluate_bounds: evaluate_bounds_fn_wrapper,
+            propagate_constraints: propagate_constraints_fn_wrapper,
+            evaluate_statistics: evaluate_statistics_fn_wrapper,
+            propagate_statistics: propagate_statistics_fn_wrapper,
+            get_properties: get_properties_fn_wrapper,
+            fmt_sql: fmt_sql_fn_wrapper,
+            snapshot: snapshot_fn_wrapper,
+            snapshot_generation: snapshot_generation_fn_wrapper,
+            is_volatile_node: is_volatile_node_fn_wrapper,
+            display: display_fn_wrapper,
+            hash: hash_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_PhysicalExpr {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl From<Arc<dyn PhysicalExpr>> for FFI_PhysicalExpr {
+    /// Creates a new [`FFI_PhysicalExpr`].
+    fn from(expr: Arc<dyn PhysicalExpr>) -> Self {
+        if let Some(expr) = expr.as_any().downcast_ref::<ForeignPhysicalExpr>() {
+            return expr.expr.clone();
+        }
+
+        let private_data = Box::new(PhysicalExprPrivateData { expr });
+
+        Self {
+            data_type: data_type_fn_wrapper,
+            nullable: nullable_fn_wrapper,
+            evaluate: evaluate_fn_wrapper,
+            return_field: return_field_fn_wrapper,
+            evaluate_selection: evaluate_selection_fn_wrapper,
+            children: children_fn_wrapper,
+            new_with_children: new_with_children_fn_wrapper,
+            evaluate_bounds: evaluate_bounds_fn_wrapper,
+            propagate_constraints: propagate_constraints_fn_wrapper,
+            evaluate_statistics: evaluate_statistics_fn_wrapper,
+            propagate_statistics: propagate_statistics_fn_wrapper,
+            get_properties: get_properties_fn_wrapper,
+            fmt_sql: fmt_sql_fn_wrapper,
+            snapshot: snapshot_fn_wrapper,
+            snapshot_generation: snapshot_generation_fn_wrapper,
+            is_volatile_node: is_volatile_node_fn_wrapper,
+            display: display_fn_wrapper,
+            hash: hash_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_PhysicalExpr to interact with the expression.
+#[derive(Debug)]
+pub struct ForeignPhysicalExpr {
+    expr: FFI_PhysicalExpr,
+    children: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+unsafe impl Send for ForeignPhysicalExpr {}
+unsafe impl Sync for ForeignPhysicalExpr {}
+
+impl From<&FFI_PhysicalExpr> for Arc<dyn PhysicalExpr> {
+    fn from(ffi_expr: &FFI_PhysicalExpr) -> Self {
+        if (ffi_expr.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(ffi_expr.inner())
+        } else {
+            let children = unsafe {
+                (ffi_expr.children)(ffi_expr)
+                    .into_iter()
+                    .map(|expr| <Arc<dyn PhysicalExpr>>::from(&expr))
+                    .collect()
+            };
+
+            Arc::new(ForeignPhysicalExpr {
+                expr: ffi_expr.clone(),
+                children,
+            })
+        }
+    }
+}
+
+impl Clone for FFI_PhysicalExpr {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl PhysicalExpr for ForeignPhysicalExpr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        unsafe {
+            let schema = WrappedSchema::from(Arc::new(input_schema.clone()));
+            df_result!((self.expr.data_type)(&self.expr, schema))
+                .and_then(|d| DataType::try_from(&d.0).map_err(Into::into))
+        }
+    }
+
+    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
+        unsafe {
+            let schema = WrappedSchema::from(Arc::new(input_schema.clone()));
+            df_result!((self.expr.nullable)(&self.expr, schema))
+        }
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        unsafe {
+            let batch = df_result!(record_batch_to_wrapped_array(batch.clone()))?;
+            df_result!((self.expr.evaluate)(&self.expr, batch))
+                .and_then(ColumnarValue::try_from)
+        }
+    }
+
+    fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
+        unsafe {
+            let schema = WrappedSchema::from(Arc::new(input_schema.clone()));
+            let result = df_result!((self.expr.return_field)(&self.expr, schema))?;
+            Field::try_from(&result.0).map(Arc::new).map_err(Into::into)
+        }
+    }
+
+    fn evaluate_selection(
+        &self,
+        batch: &RecordBatch,
+        selection: &BooleanArray,
+    ) -> Result<ColumnarValue> {
+        unsafe {
+            let batch = df_result!(record_batch_to_wrapped_array(batch.clone()))?;
+            // This is not ideal - we are cloning the selection array
+            // This is not terrible since it will be a small array.
+            // The other alternative is to modify the trait signature.
+            let selection: ArrayRef = Arc::new(selection.clone());
+            let selection = WrappedArray::try_from(&selection)?;
+            df_result!((self.expr.evaluate_selection)(&self.expr, batch, selection))
+                .and_then(ColumnarValue::try_from)
+        }
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        self.children.iter().collect()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        unsafe {
+            let children = children.into_iter().map(FFI_PhysicalExpr::from).collect();
+            df_result!(
+                (self.expr.new_with_children)(&self.expr, &children).map(|expr| <Arc<
+                    dyn PhysicalExpr,
+                >>::from(
+                    &expr
+                ))
+            )
+        }
+    }
+
+    fn evaluate_bounds(&self, children: &[&Interval]) -> Result<Interval> {
+        unsafe {
+            let children = children
+                .iter()
+                .map(|interval| FFI_Interval::try_from(*interval))
+                .collect::<Result<RVec<_>>>()?;
+            df_result!((self.expr.evaluate_bounds)(&self.expr, children))
+                .and_then(Interval::try_from)
+        }
+    }
+
+    fn propagate_constraints(
+        &self,
+        interval: &Interval,
+        children: &[&Interval],
+    ) -> Result<Option<Vec<Interval>>> {
+        unsafe {
+            let interval = interval.try_into()?;
+            let children = children
+                .iter()
+                .map(|interval| FFI_Interval::try_from(*interval))
+                .collect::<Result<RVec<_>>>()?;
+            let result = df_result!((self.expr.propagate_constraints)(
+                &self.expr, interval, children
+            ))?;
+
+            let result: Option<_> = result
+                .map(|intervals| {
+                    intervals
+                        .into_iter()
+                        .map(Interval::try_from)
+                        .collect::<Result<Vec<_>>>()
+                })
+                .into();
+            result.transpose()
+        }
+    }
+
+    fn evaluate_statistics(&self, children: &[&Distribution]) -> Result<Distribution> {
+        unsafe {
+            let children = children
+                .iter()
+                .map(|dist| FFI_Distribution::try_from(*dist))
+                .collect::<Result<RVec<_>>>()?;
+
+            let result =
+                df_result!((self.expr.evaluate_statistics)(&self.expr, children))?;
+            Distribution::try_from(result)
+        }
+    }
+
+    fn propagate_statistics(
+        &self,
+        parent: &Distribution,
+        children: &[&Distribution],
+    ) -> Result<Option<Vec<Distribution>>> {
+        unsafe {
+            let parent = FFI_Distribution::try_from(parent)?;
+            let children = children
+                .iter()
+                .map(|dist| FFI_Distribution::try_from(*dist))
+                .collect::<Result<RVec<_>>>()?;
+            let result = df_result!((self.expr.propagate_statistics)(
+                &self.expr, parent, children
+            ))?;
+
+            let result: Option<Result<Vec<Distribution>>> = result
+                .map(|dists| {
+                    dists
+                        .into_iter()
+                        .map(Distribution::try_from)
+                        .collect::<Result<Vec<_>>>()
+                })
+                .into();
+
+            result.transpose()
+        }
+    }
+
+    fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> {
+        unsafe {
+            let children = children
+                .iter()
+                .map(FFI_ExprProperties::try_from)
+                .collect::<Result<RVec<_>>>()?;
+            df_result!((self.expr.get_properties)(&self.expr, children))
+                .and_then(ExprProperties::try_from)
+        }
+    }
+
+    fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        unsafe {
+            match (self.expr.fmt_sql)(&self.expr) {
+                RResult::ROk(sql) => write!(f, "{sql}"),
+                RResult::RErr(_) => Err(std::fmt::Error),
+            }
+        }
+    }
+
+    fn snapshot(&self) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+        unsafe {
+            let result = df_result!((self.expr.snapshot)(&self.expr))?;
+            Ok(result
+                .map(|expr| <Arc<dyn PhysicalExpr>>::from(&expr))
+                .into())
+        }
+    }
+
+    fn snapshot_generation(&self) -> u64 {
+        unsafe { (self.expr.snapshot_generation)(&self.expr) }
+    }
+
+    fn is_volatile_node(&self) -> bool {
+        unsafe { (self.expr.is_volatile_node)(&self.expr) }
+    }
+}
+
+impl Eq for ForeignPhysicalExpr {}
+impl PartialEq for ForeignPhysicalExpr {
+    fn eq(&self, other: &Self) -> bool {
+        // FFI_PhysicalExpr cannot be compared, so identity equality is the best we can do.
+        std::ptr::eq(self, other)
+    }
+}
+impl Hash for ForeignPhysicalExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let value = unsafe { (self.expr.hash)(&self.expr) };
+        value.hash(state)
+    }
+}
+
+impl Display for ForeignPhysicalExpr {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let display = unsafe { (self.expr.display)(&self.expr) };
+        write!(f, "{display}")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::hash::{DefaultHasher, Hash, Hasher};
+    use std::sync::Arc;
+
+    use arrow::array::{BooleanArray, RecordBatch, record_batch};
+    use datafusion_common::tree_node::DynTreeNode;
+    use datafusion_common::{DataFusionError, ScalarValue};
+    use datafusion_expr::interval_arithmetic::Interval;
+    use datafusion_expr::statistics::Distribution;
+    use datafusion_physical_expr::expressions::{Column, NegativeExpr, NotExpr};
+    use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, fmt_sql};
+
+    use crate::physical_expr::FFI_PhysicalExpr;
+
+    fn create_test_expr() -> (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>) {
+        let original = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
+        let mut ffi_expr = FFI_PhysicalExpr::from(Arc::clone(&original));
+        ffi_expr.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_expr: Arc<dyn PhysicalExpr> = (&ffi_expr).into();
+
+        (original, foreign_expr)
+    }
+
+    fn test_record_batch() -> RecordBatch {
+        record_batch!(("a", Int32, [1, 2, 3])).unwrap()
+    }
+
+    #[test]
+    fn ffi_physical_expr_fields() -> Result<(), DataFusionError> {
+        let (original, foreign_expr) = create_test_expr();
+        let schema = test_record_batch().schema();
+
+        // Verify the mock marker worked, otherwise tests to follow are not useful
+        assert_ne!(original.as_ref(), foreign_expr.as_ref());
+
+        assert_eq!(
+            original.return_field(&schema)?,
+            foreign_expr.return_field(&schema)?
+        );
+
+        assert_eq!(
+            original.data_type(&schema)?,
+            foreign_expr.data_type(&schema)?
+        );
+        assert_eq!(original.nullable(&schema)?, foreign_expr.nullable(&schema)?);
+
+        Ok(())
+    }
+    #[test]
+    fn ffi_physical_expr_evaluate() -> Result<(), DataFusionError> {
+        let (original, foreign_expr) = create_test_expr();
+        let rb = test_record_batch();
+
+        assert_eq!(
+            original.evaluate(&rb)?.to_array(3)?.as_ref(),
+            foreign_expr.evaluate(&rb)?.to_array(3)?.as_ref()
+        );
+
+        Ok(())
+    }
+    #[test]
+    fn ffi_physical_expr_selection() -> Result<(), DataFusionError> {
+        let (original, foreign_expr) = create_test_expr();
+        let rb = test_record_batch();
+
+        let selection = BooleanArray::from(vec![true, false, true]);
+
+        assert_eq!(
+            original
+                .evaluate_selection(&rb, &selection)?
+                .to_array(3)?
+                .as_ref(),
+            foreign_expr
+                .evaluate_selection(&rb, &selection)?
+                .to_array(3)?
+                .as_ref()
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_expr_with_children() -> Result<(), DataFusionError> {
+        let (original, _) = create_test_expr();
+        let not_expr =
+            Arc::new(NotExpr::new(Arc::clone(&original))) as Arc<dyn PhysicalExpr>;
+        let mut ffi_not = FFI_PhysicalExpr::from(not_expr);
+        ffi_not.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_not: Arc<dyn PhysicalExpr> = (&ffi_not).into();
+
+        let replacement = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
+        let updated =
+            Arc::clone(&foreign_not).with_new_children(vec![Arc::clone(&replacement)])?;
+        assert_eq!(
+            format!("{updated:?}").as_str(),
+            "NotExpr { arg: Column { name: \"b\", index: 1 } }"
+        );
+
+        let updated = foreign_not
+            .with_new_arc_children(Arc::clone(&foreign_not), vec![replacement])?;
+        assert_eq!(format!("{updated}").as_str(), "NOT b@1");
+
+        Ok(())
+    }
+
+    fn create_test_negative_expr() -> (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>) {
+        let (original, _) = create_test_expr();
+
+        let negative_expr =
+            Arc::new(NegativeExpr::new(Arc::clone(&original))) as Arc<dyn PhysicalExpr>;
+        let mut ffi_neg = FFI_PhysicalExpr::from(Arc::clone(&negative_expr));
+        ffi_neg.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_neg: Arc<dyn PhysicalExpr> = (&ffi_neg).into();
+
+        (negative_expr, foreign_neg)
+    }
+
+    #[test]
+    fn ffi_physical_expr_bounds() -> Result<(), DataFusionError> {
+        let (negative_expr, foreign_neg) = create_test_negative_expr();
+
+        let interval =
+            Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(Some(10)))?;
+        let left = negative_expr.evaluate_bounds(&[&interval])?;
+        let right = foreign_neg.evaluate_bounds(&[&interval])?;
+
+        assert_eq!(left, right);
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_expr_constraints() -> Result<(), DataFusionError> {
+        let (negative_expr, foreign_neg) = create_test_negative_expr();
+
+        let interval =
+            Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(Some(10)))?;
+
+        let child =
+            Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(Some(10)))?;
+        let left = negative_expr.propagate_constraints(&interval, &[&child])?;
+        let right = foreign_neg.propagate_constraints(&interval, &[&child])?;
+
+        assert_eq!(left, right);
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_expr_statistics() -> Result<(), DataFusionError> {
+        let (negative_expr, foreign_neg) = create_test_negative_expr();
+        let interval =
+            Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(Some(10)))?;
+
+        for distribution in [
+            Distribution::new_uniform(interval.clone())?,
+            Distribution::new_exponential(
+                ScalarValue::Int32(Some(10)),
+                ScalarValue::Int32(Some(10)),
+                true,
+            )?,
+            Distribution::new_gaussian(
+                ScalarValue::Int32(Some(10)),
+                ScalarValue::Int32(Some(10)),
+            )?,
+            Distribution::new_generic(
+                ScalarValue::Int32(Some(10)),
+                ScalarValue::Int32(Some(10)),
+                ScalarValue::Int32(Some(10)),
+                interval,
+            )?,
+        ] {
+            let left = negative_expr.evaluate_statistics(&[&distribution])?;
+            let right = foreign_neg.evaluate_statistics(&[&distribution])?;
+
+            assert_eq!(left, right);
+
+            let left =
+                negative_expr.propagate_statistics(&distribution, &[&distribution])?;
+            let right =
+                foreign_neg.propagate_statistics(&distribution, &[&distribution])?;
+
+            assert_eq!(left, right);
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_expr_properties() -> Result<(), DataFusionError> {
+        let (original, foreign_expr) = create_test_expr();
+
+        let left = original.get_properties(&[])?;
+        let right = foreign_expr.get_properties(&[])?;
+
+        assert_eq!(left.sort_properties, right.sort_properties);
+        assert_eq!(left.range, right.range);
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_formatting() {
+        let (original, foreign_expr) = create_test_expr();
+
+        let left = format!("{}", fmt_sql(original.as_ref()));
+        let right = format!("{}", fmt_sql(foreign_expr.as_ref()));
+        assert_eq!(left, right);
+    }
+
+    #[test]
+    fn ffi_physical_expr_snapshots() -> Result<(), DataFusionError> {
+        let (original, foreign_expr) = create_test_expr();
+
+        let left = original.snapshot()?;
+        let right = foreign_expr.snapshot()?;
+        assert_eq!(left, right);
+
+        assert_eq!(
+            original.snapshot_generation(),
+            foreign_expr.snapshot_generation()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_expr_volatility() {
+        let (original, foreign_expr) = create_test_expr();
+        assert_eq!(original.is_volatile_node(), foreign_expr.is_volatile_node());
+    }
+
+    #[test]
+    fn ffi_physical_expr_hash() {
+        let (_, foreign_1) = create_test_expr();
+        let (_, foreign_2) = create_test_expr();
+
+        assert_ne!(&foreign_1, &foreign_2);
+
+        let mut hasher = DefaultHasher::new();
+        foreign_1.as_ref().hash(&mut hasher);
+        let hash_1 = hasher.finish();
+
+        let mut hasher = DefaultHasher::new();
+        foreign_2.as_ref().hash(&mut hasher);
+        let hash_2 = hasher.finish();
+
+        // We cannot compare a local object and a foreign object
+        // so create two foreign objects that *should* be identical
+        // even though they were created differently.
+        assert_eq!(hash_1, hash_2);
+    }
+
+    #[test]
+    fn ffi_physical_expr_display() {
+        let (original, foreign_expr) = create_test_expr();
+        assert_eq!(format!("{original}"), format!("{foreign_expr}"));
+    }
+}
diff --git a/datafusion/ffi/src/physical_expr/partitioning.rs b/datafusion/ffi/src/physical_expr/partitioning.rs
new file mode 100644
index 0000000000000..cda4fd2c97f45
--- /dev/null
+++ b/datafusion/ffi/src/physical_expr/partitioning.rs
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::RVec;
+use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+
+use crate::physical_expr::FFI_PhysicalExpr;
+
+/// A stable struct for sharing [`Partitioning`] across FFI boundaries.
+/// See ['Partitioning'] for the meaning of each variant.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub enum FFI_Partitioning {
+    RoundRobinBatch(usize),
+    Hash(RVec<FFI_PhysicalExpr>, usize),
+    UnknownPartitioning(usize),
+}
+
+impl From<&Partitioning> for FFI_Partitioning {
+    fn from(value: &Partitioning) -> Self {
+        match value {
+            Partitioning::RoundRobinBatch(size) => Self::RoundRobinBatch(*size),
+            Partitioning::Hash(exprs, size) => {
+                let exprs = exprs
+                    .iter()
+                    .map(Arc::clone)
+                    .map(FFI_PhysicalExpr::from)
+                    .collect();
+                Self::Hash(exprs, *size)
+            }
+            Partitioning::UnknownPartitioning(size) => Self::UnknownPartitioning(*size),
+        }
+    }
+}
+
+impl From<&FFI_Partitioning> for Partitioning {
+    fn from(value: &FFI_Partitioning) -> Self {
+        match value {
+            FFI_Partitioning::RoundRobinBatch(size) => {
+                Partitioning::RoundRobinBatch(*size)
+            }
+            FFI_Partitioning::Hash(exprs, size) => {
+                let exprs = exprs.iter().map(<Arc<dyn PhysicalExpr>>::from).collect();
+                Self::Hash(exprs, *size)
+            }
+            FFI_Partitioning::UnknownPartitioning(size) => {
+                Self::UnknownPartitioning(*size)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datafusion_physical_expr::Partitioning;
+    use datafusion_physical_expr::expressions::lit;
+
+    use crate::physical_expr::partitioning::FFI_Partitioning;
+
+    #[test]
+    fn round_trip_ffi_partitioning() {
+        for partitioning in [
+            Partitioning::RoundRobinBatch(10),
+            Partitioning::Hash(vec![lit(1)], 10),
+            Partitioning::UnknownPartitioning(10),
+        ] {
+            let ffi_partitioning: FFI_Partitioning = (&partitioning).into();
+            let returned: Partitioning = (&ffi_partitioning).into();
+
+            if let Partitioning::UnknownPartitioning(return_size) = returned {
+                let Partitioning::UnknownPartitioning(original_size) = partitioning
+                else {
+                    panic!("Expected unknown partitioning")
+                };
+                assert_eq!(return_size, original_size);
+            } else {
+                assert_eq!(partitioning, returned);
+            }
+        }
+    }
+}
diff --git a/datafusion/ffi/src/physical_expr/sort.rs b/datafusion/ffi/src/physical_expr/sort.rs
new file mode 100644
index 0000000000000..fd3339b10555a
--- /dev/null
+++ b/datafusion/ffi/src/physical_expr/sort.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use arrow_schema::SortOptions;
+use datafusion_physical_expr::PhysicalSortExpr;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+
+use crate::expr::expr_properties::FFI_SortOptions;
+use crate::physical_expr::FFI_PhysicalExpr;
+
+/// A stable struct for sharing [`PhysicalSortExpr`] across FFI boundaries.
+/// See [`PhysicalSortExpr`] for the meaning of each field.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_PhysicalSortExpr {
+    expr: FFI_PhysicalExpr,
+    options: FFI_SortOptions,
+}
+
+impl From<&PhysicalSortExpr> for FFI_PhysicalSortExpr {
+    fn from(value: &PhysicalSortExpr) -> Self {
+        let expr = FFI_PhysicalExpr::from(value.clone().expr);
+        let options = FFI_SortOptions::from(&value.options);
+
+        Self { expr, options }
+    }
+}
+
+impl From<&FFI_PhysicalSortExpr> for PhysicalSortExpr {
+    fn from(value: &FFI_PhysicalSortExpr) -> Self {
+        let expr: Arc<dyn PhysicalExpr> = (&value.expr).into();
+        let options = SortOptions::from(&value.options);
+
+        Self { expr, options }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow_schema::SortOptions;
+    use datafusion_physical_expr::expressions::Column;
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
+    use crate::physical_expr::sort::FFI_PhysicalSortExpr;
+
+    #[test]
+    fn ffi_sort_expr_round_trip() {
+        let col_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
+        let expr = PhysicalSortExpr::new(col_expr, SortOptions::default());
+
+        let ffi_expr = FFI_PhysicalSortExpr::from(&expr);
+        let foreign_expr = PhysicalSortExpr::from(&ffi_expr);
+
+        assert_eq!(expr, foreign_expr);
+    }
+}
diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs
index 5c878fa4be79a..d009de3f04b99 100644
--- a/datafusion/ffi/src/plan_properties.rs
+++ b/datafusion/ffi/src/plan_properties.rs
@@ -15,46 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{ffi::c_void, sync::Arc};
-
-use abi_stable::{
-    std_types::{
-        RResult::{self, ROk},
-        RString, RVec,
-    },
-    StableAbi,
-};
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RVec};
 use arrow::datatypes::SchemaRef;
-use datafusion::{
-    error::{DataFusionError, Result},
-    physical_expr::EquivalenceProperties,
-    physical_plan::{
-        execution_plan::{Boundedness, EmissionType},
-        PlanProperties,
-    },
-    prelude::SessionContext,
-};
-use datafusion_proto::{
-    physical_plan::{
-        from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning},
-        to_proto::{serialize_partitioning, serialize_physical_sort_exprs},
-        DefaultPhysicalExtensionCodec,
-    },
-    protobuf::{Partitioning, PhysicalSortExprNodeCollection},
-};
-use prost::Message;
-
-use crate::{arrow_wrappers::WrappedSchema, df_result, rresult_return};
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use datafusion_physical_plan::PlanProperties;
+use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::physical_expr::partitioning::FFI_Partitioning;
+use crate::physical_expr::sort::FFI_PhysicalSortExpr;
 
 /// A stable struct for sharing [`PlanProperties`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_PlanProperties {
-    /// The output partitioning is a [`Partitioning`] protobuf message serialized
-    /// into bytes to pass across the FFI boundary.
-    pub output_partitioning:
-        unsafe extern "C" fn(plan: &Self) -> RResult<RVec<u8>, RString>,
+    /// The output partitioning of the plan.
+    pub output_partitioning: unsafe extern "C" fn(plan: &Self) -> FFI_Partitioning,
 
     /// Return the emission type of the plan.
     pub emission_type: unsafe extern "C" fn(plan: &Self) -> FFI_EmissionType,
@@ -62,9 +44,9 @@ pub struct FFI_PlanProperties {
     /// Indicate boundedness of the plan and its memory requirements.
     pub boundedness: unsafe extern "C" fn(plan: &Self) -> FFI_Boundedness,
 
-    /// The output ordering is a [`PhysicalSortExprNodeCollection`] protobuf message
-    /// serialized into bytes to pass across the FFI boundary.
-    pub output_ordering: unsafe extern "C" fn(plan: &Self) -> RResult<RVec<u8>, RString>,
+    /// The output ordering of the plan.
+    pub output_ordering:
+        unsafe extern "C" fn(plan: &Self) -> ROption<RVec<FFI_PhysicalSortExpr>>,
 
     /// Return the schema of the plan.
     pub schema: unsafe extern "C" fn(plan: &Self) -> WrappedSchema,
@@ -75,77 +57,70 @@ pub struct FFI_PlanProperties {
     /// Internal data. This is only to be accessed by the provider of the plan.
     /// The foreign library should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 struct PlanPropertiesPrivateData {
     props: PlanProperties,
 }
 
+impl FFI_PlanProperties {
+    fn inner(&self) -> &PlanProperties {
+        let private_data = self.private_data as *const PlanPropertiesPrivateData;
+        unsafe { &(*private_data).props }
+    }
+}
+
 unsafe extern "C" fn output_partitioning_fn_wrapper(
     properties: &FFI_PlanProperties,
-) -> RResult<RVec<u8>, RString> {
-    let private_data = properties.private_data as *const PlanPropertiesPrivateData;
-    let props = &(*private_data).props;
-
-    let codec = DefaultPhysicalExtensionCodec {};
-    let partitioning_data =
-        rresult_return!(serialize_partitioning(props.output_partitioning(), &codec));
-    let output_partitioning = partitioning_data.encode_to_vec();
-
-    ROk(output_partitioning.into())
+) -> FFI_Partitioning {
+    properties.inner().output_partitioning().into()
 }
 
 unsafe extern "C" fn emission_type_fn_wrapper(
     properties: &FFI_PlanProperties,
 ) -> FFI_EmissionType {
-    let private_data = properties.private_data as *const PlanPropertiesPrivateData;
-    let props = &(*private_data).props;
-    props.emission_type.into()
+    properties.inner().emission_type.into()
 }
 
 unsafe extern "C" fn boundedness_fn_wrapper(
     properties: &FFI_PlanProperties,
 ) -> FFI_Boundedness {
-    let private_data = properties.private_data as *const PlanPropertiesPrivateData;
-    let props = &(*private_data).props;
-    props.boundedness.into()
+    properties.inner().boundedness.into()
 }
 
 unsafe extern "C" fn output_ordering_fn_wrapper(
     properties: &FFI_PlanProperties,
-) -> RResult<RVec<u8>, RString> {
-    let private_data = properties.private_data as *const PlanPropertiesPrivateData;
-    let props = &(*private_data).props;
-
-    let codec = DefaultPhysicalExtensionCodec {};
-    let output_ordering = match props.output_ordering() {
-        Some(ordering) => {
-            let physical_sort_expr_nodes = rresult_return!(
-                serialize_physical_sort_exprs(ordering.to_owned(), &codec)
-            );
-            let ordering_data = PhysicalSortExprNodeCollection {
-                physical_sort_expr_nodes,
-            };
-
-            ordering_data.encode_to_vec()
-        }
-        None => Vec::default(),
-    };
-    ROk(output_ordering.into())
+) -> ROption<RVec<FFI_PhysicalSortExpr>> {
+    let ordering: Option<RVec<FFI_PhysicalSortExpr>> =
+        properties.inner().output_ordering().map(|lex_ordering| {
+            let vec_ordering: Vec<PhysicalSortExpr> = lex_ordering.clone().into();
+            vec_ordering
+                .iter()
+                .map(FFI_PhysicalSortExpr::from)
+                .collect()
+        });
+
+    ordering.into()
 }
 
 unsafe extern "C" fn schema_fn_wrapper(properties: &FFI_PlanProperties) -> WrappedSchema {
-    let private_data = properties.private_data as *const PlanPropertiesPrivateData;
-    let props = &(*private_data).props;
-
-    let schema: SchemaRef = Arc::clone(props.eq_properties.schema());
+    let schema: SchemaRef = Arc::clone(properties.inner().eq_properties.schema());
     schema.into()
 }
 
 unsafe extern "C" fn release_fn_wrapper(props: &mut FFI_PlanProperties) {
-    let private_data =
-        Box::from_raw(props.private_data as *mut PlanPropertiesPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!props.private_data.is_null());
+        let private_data =
+            Box::from_raw(props.private_data as *mut PlanPropertiesPrivateData);
+        drop(private_data);
+        props.private_data = std::ptr::null_mut();
+    }
 }
 
 impl Drop for FFI_PlanProperties {
@@ -168,6 +143,7 @@ impl From<&PlanProperties> for FFI_PlanProperties {
             schema: schema_fn_wrapper,
             release: release_fn_wrapper,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -176,46 +152,30 @@ impl TryFrom<FFI_PlanProperties> for PlanProperties {
     type Error = DataFusionError;
 
     fn try_from(ffi_props: FFI_PlanProperties) -> Result<Self, Self::Error> {
+        if (ffi_props.library_marker_id)() == crate::get_library_marker_id() {
+            return Ok(ffi_props.inner().clone());
+        }
+
         let ffi_schema = unsafe { (ffi_props.schema)(&ffi_props) };
         let schema = (&ffi_schema.0).try_into()?;
 
-        // TODO Extend FFI to get the registry and codex
-        let default_ctx = SessionContext::new();
-        let codex = DefaultPhysicalExtensionCodec {};
-
-        let ffi_orderings = unsafe { (ffi_props.output_ordering)(&ffi_props) };
-
-        let proto_output_ordering =
-            PhysicalSortExprNodeCollection::decode(df_result!(ffi_orderings)?.as_ref())
-                .map_err(|e| DataFusionError::External(Box::new(e)))?;
-        let orderings = Some(parse_physical_sort_exprs(
-            &proto_output_ordering.physical_sort_expr_nodes,
-            &default_ctx,
-            &schema,
-            &codex,
-        )?);
-
-        let partitioning_vec =
-            unsafe { df_result!((ffi_props.output_partitioning)(&ffi_props))? };
-        let proto_output_partitioning =
-            Partitioning::decode(partitioning_vec.as_ref())
-                .map_err(|e| DataFusionError::External(Box::new(e)))?;
-        let partitioning = parse_protobuf_partitioning(
-            Some(&proto_output_partitioning),
-            &default_ctx,
-            &schema,
-            &codex,
-        )?
-        .ok_or(DataFusionError::Plan(
-            "Unable to deserialize partitioning protobuf in FFI_PlanProperties"
-                .to_string(),
-        ))?;
-
-        let eq_properties = match orderings {
-            Some(ordering) => {
-                EquivalenceProperties::new_with_orderings(Arc::new(schema), &[ordering])
-            }
-            None => EquivalenceProperties::new(Arc::new(schema)),
+        let ffi_orderings: Option<RVec<FFI_PhysicalSortExpr>> =
+            unsafe { (ffi_props.output_ordering)(&ffi_props) }.into();
+        let sort_exprs = ffi_orderings
+            .map(|ordering_vec| {
+                ordering_vec
+                    .iter()
+                    .map(PhysicalSortExpr::from)
+                    .collect::<Vec<_>>()
+            })
+            .unwrap_or_default();
+
+        let partitioning = unsafe { (ffi_props.output_partitioning)(&ffi_props) };
+
+        let eq_properties = if sort_exprs.is_empty() {
+            EquivalenceProperties::new(Arc::new(schema))
+        } else {
+            EquivalenceProperties::new_with_orderings(Arc::new(schema), [sort_exprs])
         };
 
         let emission_type: EmissionType =
@@ -226,7 +186,7 @@ impl TryFrom<FFI_PlanProperties> for PlanProperties {
 
         Ok(PlanProperties::new(
             eq_properties,
-            partitioning,
+            (&partitioning).into(),
             emission_type,
             boundedness,
         ))
@@ -235,7 +195,6 @@ impl TryFrom<FFI_PlanProperties> for PlanProperties {
 
 /// FFI safe version of [`Boundedness`].
 #[repr(C)]
-#[allow(non_camel_case_types)]
 #[derive(Clone, StableAbi)]
 pub enum FFI_Boundedness {
     Bounded,
@@ -270,7 +229,6 @@ impl From<FFI_Boundedness> for Boundedness {
 
 /// FFI safe version of [`EmissionType`].
 #[repr(C)]
-#[allow(non_camel_case_types)]
 #[derive(Clone, StableAbi)]
 pub enum FFI_EmissionType {
     Incremental,
@@ -300,24 +258,34 @@ impl From<FFI_EmissionType> for EmissionType {
 
 #[cfg(test)]
 mod tests {
+    use datafusion::physical_expr::PhysicalSortExpr;
     use datafusion::physical_plan::Partitioning;
 
     use super::*;
 
-    #[test]
-    fn test_round_trip_ffi_plan_properties() -> Result<()> {
+    fn create_test_props() -> Result<PlanProperties> {
         use arrow::datatypes::{DataType, Field, Schema};
         let schema =
             Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
 
-        let original_props = PlanProperties::new(
-            EquivalenceProperties::new(schema),
-            Partitioning::UnknownPartitioning(3),
+        let mut eqp = EquivalenceProperties::new(Arc::clone(&schema));
+        let _ = eqp.reorder([PhysicalSortExpr::new_default(
+            datafusion::physical_plan::expressions::col("a", &schema)?,
+        )]);
+        Ok(PlanProperties::new(
+            eqp,
+            Partitioning::RoundRobinBatch(3),
             EmissionType::Incremental,
             Boundedness::Bounded,
-        );
+        ))
+    }
 
-        let local_props_ptr = FFI_PlanProperties::from(&original_props);
+    #[test]
+    fn test_round_trip_ffi_plan_properties() -> Result<()> {
+        let original_props = create_test_props()?;
+
+        let mut local_props_ptr = FFI_PlanProperties::from(&original_props);
+        local_props_ptr.library_marker_id = crate::mock_foreign_marker_id;
 
         let foreign_props: PlanProperties = local_props_ptr.try_into()?;
 
@@ -325,4 +293,23 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_ffi_plan_properties_local_bypass() -> Result<()> {
+        let props = create_test_props()?;
+
+        let ffi_plan = FFI_PlanProperties::from(&props);
+
+        // Verify local libraries
+        let foreign_plan: PlanProperties = ffi_plan.try_into()?;
+        assert_eq!(format!("{foreign_plan:?}"), format!("{props:?}"));
+
+        // Verify different library markers still can produce identical properties
+        let mut ffi_plan = FFI_PlanProperties::from(&props);
+        ffi_plan.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_plan: PlanProperties = ffi_plan.try_into()?;
+        assert_eq!(format!("{foreign_plan:?}"), format!("{props:?}"));
+
+        Ok(())
+    }
 }
diff --git a/datafusion/ffi/src/proto/logical_extension_codec.rs b/datafusion/ffi/src/proto/logical_extension_codec.rs
new file mode 100644
index 0000000000000..2beeead7039c0
--- /dev/null
+++ b/datafusion/ffi/src/proto/logical_extension_codec.rs
@@ -0,0 +1,730 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RSlice, RStr, RVec};
+use arrow::datatypes::SchemaRef;
+use datafusion_catalog::TableProvider;
+use datafusion_common::error::Result;
+use datafusion_common::{TableReference, not_impl_err};
+use datafusion_datasource::file_format::FileFormatFactory;
+use datafusion_execution::{TaskContext, TaskContextProvider};
+use datafusion_expr::{
+    AggregateUDF, AggregateUDFImpl, Extension, LogicalPlan, ScalarUDF, ScalarUDFImpl,
+    WindowUDF, WindowUDFImpl,
+};
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
+};
+use tokio::runtime::Handle;
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::execution::FFI_TaskContextProvider;
+use crate::table_provider::FFI_TableProvider;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udwf::FFI_WindowUDF;
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
+
+/// A stable struct for sharing [`LogicalExtensionCodec`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_LogicalExtensionCodec {
+    /// Decode bytes into a table provider.
+    try_decode_table_provider: unsafe extern "C" fn(
+        &Self,
+        buf: RSlice<u8>,
+        table_ref: RStr,
+        schema: WrappedSchema,
+    ) -> FFIResult<FFI_TableProvider>,
+
+    /// Encode a table provider into bytes.
+    try_encode_table_provider: unsafe extern "C" fn(
+        &Self,
+        table_ref: RStr,
+        node: FFI_TableProvider,
+    ) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined scalar function.
+    try_decode_udf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_ScalarUDF>,
+
+    /// Encode a user defined scalar function into bytes.
+    try_encode_udf:
+        unsafe extern "C" fn(&Self, node: FFI_ScalarUDF) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined aggregate function.
+    try_decode_udaf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_AggregateUDF>,
+
+    /// Encode a user defined aggregate function into bytes.
+    try_encode_udaf:
+        unsafe extern "C" fn(&Self, node: FFI_AggregateUDF) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined window function.
+    try_decode_udwf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_WindowUDF>,
+
+    /// Encode a user defined window function into bytes.
+    try_encode_udwf:
+        unsafe extern "C" fn(&Self, node: FFI_WindowUDF) -> FFIResult<RVec<u8>>,
+
+    pub task_ctx_provider: FFI_TaskContextProvider,
+
+    /// Used to create a clone on the provider of the execution plan. This should
+    /// only need to be called by the receiver of the plan.
+    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Return the major DataFusion version number of this provider.
+    pub version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// A [`ForeignLogicalExtensionCodec`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_LogicalExtensionCodec {}
+unsafe impl Sync for FFI_LogicalExtensionCodec {}
+
+struct LogicalExtensionCodecPrivateData {
+    codec: Arc<dyn LogicalExtensionCodec>,
+    runtime: Option<Handle>,
+}
+
+impl FFI_LogicalExtensionCodec {
+    fn inner(&self) -> &Arc<dyn LogicalExtensionCodec> {
+        let private_data = self.private_data as *const LogicalExtensionCodecPrivateData;
+        unsafe { &(*private_data).codec }
+    }
+
+    fn runtime(&self) -> &Option<Handle> {
+        let private_data = self.private_data as *const LogicalExtensionCodecPrivateData;
+        unsafe { &(*private_data).runtime }
+    }
+
+    fn task_ctx(&self) -> Result<Arc<TaskContext>> {
+        (&self.task_ctx_provider).try_into()
+    }
+}
+
+unsafe extern "C" fn try_decode_table_provider_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    buf: RSlice<u8>,
+    table_ref: RStr,
+    schema: WrappedSchema,
+) -> FFIResult<FFI_TableProvider> {
+    let ctx = rresult_return!(codec.task_ctx());
+    let runtime = codec.runtime().clone();
+    let codec_inner = codec.inner();
+    let table_ref = TableReference::from(table_ref.as_str());
+    let schema: SchemaRef = schema.into();
+
+    let table_provider = rresult_return!(codec_inner.try_decode_table_provider(
+        buf.as_ref(),
+        &table_ref,
+        schema,
+        ctx.as_ref()
+    ));
+
+    RResult::ROk(FFI_TableProvider::new_with_ffi_codec(
+        table_provider,
+        true,
+        runtime,
+        codec.clone(),
+    ))
+}
+
+unsafe extern "C" fn try_encode_table_provider_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    table_ref: RStr,
+    node: FFI_TableProvider,
+) -> FFIResult<RVec<u8>> {
+    let table_ref = TableReference::from(table_ref.as_str());
+    let table_provider: Arc<dyn TableProvider> = (&node).into();
+    let codec = codec.inner();
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_table_provider(
+        &table_ref,
+        table_provider,
+        &mut bytes
+    ));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_ScalarUDF> {
+    let codec = codec.inner();
+
+    let udf = rresult_return!(codec.try_decode_udf(name.as_str(), buf.as_ref()));
+    let udf = FFI_ScalarUDF::from(udf);
+
+    RResult::ROk(udf)
+}
+
+unsafe extern "C" fn try_encode_udf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    node: FFI_ScalarUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let node: Arc<dyn ScalarUDFImpl> = (&node).into();
+    let node = ScalarUDF::new_from_shared_impl(node);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udf(&node, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udaf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_AggregateUDF> {
+    let codec_inner = codec.inner();
+    let udaf = rresult_return!(codec_inner.try_decode_udaf(name.into(), buf.as_ref()));
+    let udaf = FFI_AggregateUDF::from(udaf);
+
+    RResult::ROk(udaf)
+}
+
+unsafe extern "C" fn try_encode_udaf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    node: FFI_AggregateUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let udaf: Arc<dyn AggregateUDFImpl> = (&node).into();
+    let udaf = AggregateUDF::new_from_shared_impl(udaf);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udaf(&udaf, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udwf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_WindowUDF> {
+    let codec = codec.inner();
+    let udwf = rresult_return!(codec.try_decode_udwf(name.into(), buf.as_ref()));
+    let udwf = FFI_WindowUDF::from(udwf);
+
+    RResult::ROk(udwf)
+}
+
+unsafe extern "C" fn try_encode_udwf_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+    node: FFI_WindowUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let udwf: Arc<dyn WindowUDFImpl> = (&node).into();
+    let udwf = WindowUDF::new_from_shared_impl(udwf);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udwf(&udwf, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_LogicalExtensionCodec) {
+    unsafe {
+        let private_data =
+            Box::from_raw(provider.private_data as *mut LogicalExtensionCodecPrivateData);
+        drop(private_data);
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(
+    codec: &FFI_LogicalExtensionCodec,
+) -> FFI_LogicalExtensionCodec {
+    let old_codec = Arc::clone(codec.inner());
+    let runtime = codec.runtime().clone();
+
+    FFI_LogicalExtensionCodec::new(old_codec, runtime, codec.task_ctx_provider.clone())
+}
+
+impl Drop for FFI_LogicalExtensionCodec {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl FFI_LogicalExtensionCodec {
+    /// Creates a new [`FFI_LogicalExtensionCodec`].
+    pub fn new(
+        codec: Arc<dyn LogicalExtensionCodec + Send>,
+        runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+    ) -> Self {
+        if let Some(codec) = (Arc::clone(&codec) as Arc<dyn Any>)
+            .downcast_ref::<ForeignLogicalExtensionCodec>()
+        {
+            return codec.0.clone();
+        }
+
+        let task_ctx_provider = task_ctx_provider.into();
+        let private_data = Box::new(LogicalExtensionCodecPrivateData { codec, runtime });
+
+        Self {
+            try_decode_table_provider: try_decode_table_provider_fn_wrapper,
+            try_encode_table_provider: try_encode_table_provider_fn_wrapper,
+            try_decode_udf: try_decode_udf_fn_wrapper,
+            try_encode_udf: try_encode_udf_fn_wrapper,
+            try_decode_udaf: try_decode_udaf_fn_wrapper,
+            try_encode_udaf: try_encode_udaf_fn_wrapper,
+            try_decode_udwf: try_decode_udwf_fn_wrapper,
+            try_encode_udwf: try_encode_udwf_fn_wrapper,
+            task_ctx_provider,
+
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: crate::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+
+    pub fn new_default(task_ctx_provider: &Arc<dyn TaskContextProvider>) -> Self {
+        let task_ctx_provider = FFI_TaskContextProvider::from(task_ctx_provider);
+        let codec = Arc::new(DefaultLogicalExtensionCodec {});
+
+        Self::new(codec, None, task_ctx_provider)
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_LogicalExtensionCodec to interact with the foreign table provider.
+#[derive(Debug)]
+pub struct ForeignLogicalExtensionCodec(pub FFI_LogicalExtensionCodec);
+
+unsafe impl Send for ForeignLogicalExtensionCodec {}
+unsafe impl Sync for ForeignLogicalExtensionCodec {}
+
+impl From<&FFI_LogicalExtensionCodec> for Arc<dyn LogicalExtensionCodec> {
+    fn from(provider: &FFI_LogicalExtensionCodec) -> Self {
+        if (provider.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(provider.inner())
+        } else {
+            Arc::new(ForeignLogicalExtensionCodec(provider.clone()))
+        }
+    }
+}
+
+impl Clone for FFI_LogicalExtensionCodec {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl LogicalExtensionCodec for ForeignLogicalExtensionCodec {
+    fn try_decode(
+        &self,
+        _buf: &[u8],
+        _inputs: &[LogicalPlan],
+        _ctx: &TaskContext,
+    ) -> Result<Extension> {
+        not_impl_err!("FFI does not support decode of Extensions")
+    }
+
+    fn try_encode(&self, _node: &Extension, _buf: &mut Vec<u8>) -> Result<()> {
+        not_impl_err!("FFI does not support encode of Extensions")
+    }
+
+    fn try_decode_table_provider(
+        &self,
+        buf: &[u8],
+        table_ref: &TableReference,
+        schema: SchemaRef,
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn TableProvider>> {
+        let table_ref = table_ref.to_string();
+        let schema: WrappedSchema = schema.into();
+
+        let ffi_table_provider = unsafe {
+            df_result!((self.0.try_decode_table_provider)(
+                &self.0,
+                buf.into(),
+                table_ref.as_str().into(),
+                schema
+            ))
+        }?;
+
+        Ok((&ffi_table_provider).into())
+    }
+
+    fn try_encode_table_provider(
+        &self,
+        table_ref: &TableReference,
+        node: Arc<dyn TableProvider>,
+        buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        let table_ref = table_ref.to_string();
+        let node =
+            FFI_TableProvider::new_with_ffi_codec(node, true, None, self.0.clone());
+
+        let bytes = df_result!(unsafe {
+            (self.0.try_encode_table_provider)(&self.0, table_ref.as_str().into(), node)
+        })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+
+    fn try_decode_file_format(
+        &self,
+        _buf: &[u8],
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn FileFormatFactory>> {
+        not_impl_err!("FFI does not support decode_file_format")
+    }
+
+    fn try_encode_file_format(
+        &self,
+        _buf: &mut Vec<u8>,
+        _node: Arc<dyn FileFormatFactory>,
+    ) -> Result<()> {
+        not_impl_err!("FFI does not support encode_file_format")
+    }
+
+    fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
+        let udf = unsafe {
+            df_result!((self.0.try_decode_udf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udf: Arc<dyn ScalarUDFImpl> = (&udf).into();
+
+        Ok(Arc::new(ScalarUDF::new_from_shared_impl(udf)))
+    }
+
+    fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = FFI_ScalarUDF::from(Arc::new(node.clone()));
+        let bytes = df_result!(unsafe { (self.0.try_encode_udf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+
+    fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
+        let udaf = unsafe {
+            df_result!((self.0.try_decode_udaf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udaf: Arc<dyn AggregateUDFImpl> = (&udaf).into();
+
+        Ok(Arc::new(AggregateUDF::new_from_shared_impl(udaf)))
+    }
+
+    fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = Arc::new(node.clone());
+        let node = FFI_AggregateUDF::from(node);
+        let bytes = df_result!(unsafe { (self.0.try_encode_udaf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+
+    fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result<Arc<WindowUDF>> {
+        let udwf = unsafe {
+            df_result!((self.0.try_decode_udwf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udwf: Arc<dyn WindowUDFImpl> = (&udwf).into();
+
+        Ok(Arc::new(WindowUDF::new_from_shared_impl(udwf)))
+    }
+
+    fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = Arc::new(node.clone());
+        let node = FFI_WindowUDF::from(node);
+        let bytes = df_result!(unsafe { (self.0.try_encode_udwf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::record_batch;
+    use arrow_schema::{DataType, Field, Schema, SchemaRef};
+    use datafusion_catalog::{MemTable, TableProvider};
+    use datafusion_common::{Result, TableReference, exec_err};
+    use datafusion_datasource::file_format::FileFormatFactory;
+    use datafusion_execution::TaskContext;
+    use datafusion_expr::ptr_eq::arc_ptr_eq;
+    use datafusion_expr::{AggregateUDF, Extension, LogicalPlan, ScalarUDF, WindowUDF};
+    use datafusion_functions::math::abs::AbsFunc;
+    use datafusion_functions_aggregate::sum::Sum;
+    use datafusion_functions_window::rank::{Rank, RankType};
+    use datafusion_proto::logical_plan::LogicalExtensionCodec;
+    use datafusion_proto::physical_plan::PhysicalExtensionCodec;
+
+    use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+    use crate::proto::physical_extension_codec::tests::TestExtensionCodec;
+
+    fn create_test_table() -> MemTable {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
+        let rb = record_batch!(("a", Int32, [1, 2, 3]))
+            .expect("should be able to create a record batch");
+        MemTable::try_new(schema, vec![vec![rb]])
+            .expect("should be able to create an in memory table")
+    }
+
+    impl LogicalExtensionCodec for TestExtensionCodec {
+        fn try_decode(
+            &self,
+            _buf: &[u8],
+            _inputs: &[LogicalPlan],
+            _ctx: &TaskContext,
+        ) -> Result<Extension> {
+            unimplemented!()
+        }
+
+        fn try_encode(&self, _node: &Extension, _buf: &mut Vec<u8>) -> Result<()> {
+            unimplemented!()
+        }
+
+        fn try_decode_table_provider(
+            &self,
+            buf: &[u8],
+            _table_ref: &TableReference,
+            schema: SchemaRef,
+            _ctx: &TaskContext,
+        ) -> Result<Arc<dyn TableProvider>> {
+            if buf[0] != Self::MAGIC_NUMBER {
+                return exec_err!(
+                    "TestExtensionCodec input buffer does not start with magic number"
+                );
+            }
+
+            if schema != create_test_table().schema() {
+                return exec_err!("Incorrect test table schema");
+            }
+
+            if buf.len() != 2 || buf[1] != Self::MEMTABLE_SERIALIZED {
+                return exec_err!("TestExtensionCodec unable to decode table provider");
+            }
+
+            Ok(Arc::new(create_test_table()) as Arc<dyn TableProvider>)
+        }
+
+        fn try_encode_table_provider(
+            &self,
+            _table_ref: &TableReference,
+            node: Arc<dyn TableProvider>,
+            buf: &mut Vec<u8>,
+        ) -> Result<()> {
+            buf.push(Self::MAGIC_NUMBER);
+
+            if !node.as_any().is::<MemTable>() {
+                return exec_err!("TestExtensionCodec only expects MemTable");
+            };
+
+            if node.schema() != create_test_table().schema() {
+                return exec_err!("Unexpected schema for encoding.");
+            }
+
+            buf.push(Self::MEMTABLE_SERIALIZED);
+
+            Ok(())
+        }
+
+        fn try_decode_file_format(
+            &self,
+            _buf: &[u8],
+            _ctx: &TaskContext,
+        ) -> Result<Arc<dyn FileFormatFactory>> {
+            unimplemented!()
+        }
+
+        fn try_encode_file_format(
+            &self,
+            _buf: &mut Vec<u8>,
+            _node: Arc<dyn FileFormatFactory>,
+        ) -> Result<()> {
+            unimplemented!()
+        }
+
+        fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
+            PhysicalExtensionCodec::try_decode_udf(self, name, buf)
+        }
+
+        fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
+            PhysicalExtensionCodec::try_encode_udf(self, node, buf)
+        }
+
+        fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
+            PhysicalExtensionCodec::try_decode_udaf(self, name, buf)
+        }
+
+        fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
+            PhysicalExtensionCodec::try_encode_udaf(self, node, buf)
+        }
+
+        fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result<Arc<WindowUDF>> {
+            PhysicalExtensionCodec::try_decode_udwf(self, name, buf)
+        }
+
+        fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec<u8>) -> Result<()> {
+            PhysicalExtensionCodec::try_encode_udwf(self, node, buf)
+        }
+    }
+
+    #[test]
+    fn roundtrip_ffi_logical_extension_codec_table_provider() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_LogicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+
+        let table = Arc::new(create_test_table()) as Arc<dyn TableProvider>;
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_table_provider(&"my_table".into(), table, &mut bytes)?;
+
+        let returned_table = foreign_codec.try_decode_table_provider(
+            &bytes,
+            &"my_table".into(),
+            create_test_table().schema(),
+            ctx.task_ctx().as_ref(),
+        )?;
+
+        assert!(returned_table.as_any().is::<MemTable>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_logical_extension_codec_udf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_LogicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(ScalarUDF::from(AbsFunc::new()));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<AbsFunc>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_logical_extension_codec_udaf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_LogicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(AggregateUDF::from(Sum::new()));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udaf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udaf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<Sum>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_logical_extension_codec_udwf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_LogicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(WindowUDF::from(Rank::new(
+            "my_rank".to_owned(),
+            RankType::Basic,
+        )));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udwf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udwf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<Rank>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_logical_extension_codec_local_bypass() {
+        let codec =
+            Arc::new(TestExtensionCodec {}) as Arc<dyn LogicalExtensionCodec + Send>;
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_LogicalExtensionCodec::new(Arc::clone(&codec), None, task_ctx_provider);
+
+        let codec = codec as Arc<dyn LogicalExtensionCodec>;
+        // Verify local libraries can be downcast to their original
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+        assert!(arc_ptr_eq(&foreign_codec, &codec));
+
+        // Verify different library markers generate foreign providers
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn LogicalExtensionCodec> = (&ffi_codec).into();
+        assert!(!arc_ptr_eq(&foreign_codec, &codec));
+    }
+}
diff --git a/datafusion/ffi/src/proto/mod.rs b/datafusion/ffi/src/proto/mod.rs
new file mode 100644
index 0000000000000..ae76027ecb64e
--- /dev/null
+++ b/datafusion/ffi/src/proto/mod.rs
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod logical_extension_codec;
+pub mod physical_extension_codec;
diff --git a/datafusion/ffi/src/proto/physical_extension_codec.rs b/datafusion/ffi/src/proto/physical_extension_codec.rs
new file mode 100644
index 0000000000000..9d5c2e8af3a6c
--- /dev/null
+++ b/datafusion/ffi/src/proto/physical_extension_codec.rs
@@ -0,0 +1,680 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RSlice, RStr, RVec};
+use datafusion_common::error::Result;
+use datafusion_execution::TaskContext;
+use datafusion_expr::{
+    AggregateUDF, AggregateUDFImpl, ScalarUDF, ScalarUDFImpl, WindowUDF, WindowUDFImpl,
+};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_proto::physical_plan::PhysicalExtensionCodec;
+use std::{any::Any, ffi::c_void, sync::Arc};
+use tokio::runtime::Handle;
+
+use crate::execution::FFI_TaskContextProvider;
+use crate::execution_plan::FFI_ExecutionPlan;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udwf::FFI_WindowUDF;
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
+
+/// A stable struct for sharing [`PhysicalExtensionCodec`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_PhysicalExtensionCodec {
+    /// Decode bytes into an execution plan.
+    try_decode: unsafe extern "C" fn(
+        &Self,
+        buf: RSlice<u8>,
+        inputs: RVec<FFI_ExecutionPlan>,
+    ) -> FFIResult<FFI_ExecutionPlan>,
+
+    /// Encode an execution plan into bytes.
+    try_encode:
+        unsafe extern "C" fn(&Self, node: FFI_ExecutionPlan) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined scalar function.
+    try_decode_udf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_ScalarUDF>,
+
+    /// Encode a user defined scalar function into bytes.
+    try_encode_udf:
+        unsafe extern "C" fn(&Self, node: FFI_ScalarUDF) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined aggregate function.
+    try_decode_udaf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_AggregateUDF>,
+
+    /// Encode a user defined aggregate function into bytes.
+    try_encode_udaf:
+        unsafe extern "C" fn(&Self, node: FFI_AggregateUDF) -> FFIResult<RVec<u8>>,
+
+    /// Decode bytes into a user defined window function.
+    try_decode_udwf: unsafe extern "C" fn(
+        &Self,
+        name: RStr,
+        buf: RSlice<u8>,
+    ) -> FFIResult<FFI_WindowUDF>,
+
+    /// Encode a user defined window function into bytes.
+    try_encode_udwf:
+        unsafe extern "C" fn(&Self, node: FFI_WindowUDF) -> FFIResult<RVec<u8>>,
+
+    /// Access the current [`TaskContext`].
+    task_ctx_provider: FFI_TaskContextProvider,
+
+    /// Used to create a clone on the provider of the execution plan. This should
+    /// only need to be called by the receiver of the plan.
+    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Return the major DataFusion version number of this provider.
+    pub version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// A [`ForeignPhysicalExtensionCodec`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_PhysicalExtensionCodec {}
+unsafe impl Sync for FFI_PhysicalExtensionCodec {}
+
+struct PhysicalExtensionCodecPrivateData {
+    codec: Arc<dyn PhysicalExtensionCodec>,
+    runtime: Option<Handle>,
+}
+
+impl FFI_PhysicalExtensionCodec {
+    fn inner(&self) -> &Arc<dyn PhysicalExtensionCodec> {
+        let private_data = self.private_data as *const PhysicalExtensionCodecPrivateData;
+        unsafe { &(*private_data).codec }
+    }
+
+    fn runtime(&self) -> &Option<Handle> {
+        let private_data = self.private_data as *const PhysicalExtensionCodecPrivateData;
+        unsafe { &(*private_data).runtime }
+    }
+}
+
+unsafe extern "C" fn try_decode_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    buf: RSlice<u8>,
+    inputs: RVec<FFI_ExecutionPlan>,
+) -> FFIResult<FFI_ExecutionPlan> {
+    let runtime = codec.runtime().clone();
+    let task_ctx: Arc<TaskContext> =
+        rresult_return!((&codec.task_ctx_provider).try_into());
+    let codec = codec.inner();
+    let inputs = inputs
+        .into_iter()
+        .map(|plan| <Arc<dyn ExecutionPlan>>::try_from(&plan))
+        .collect::<Result<Vec<_>>>();
+    let inputs = rresult_return!(inputs);
+
+    let plan =
+        rresult_return!(codec.try_decode(buf.as_ref(), &inputs, task_ctx.as_ref()));
+
+    RResult::ROk(FFI_ExecutionPlan::new(plan, runtime))
+}
+
+unsafe extern "C" fn try_encode_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    node: FFI_ExecutionPlan,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+
+    let plan: Arc<dyn ExecutionPlan> = rresult_return!((&node).try_into());
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode(plan, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_ScalarUDF> {
+    let codec = codec.inner();
+
+    let udf = rresult_return!(codec.try_decode_udf(name.as_str(), buf.as_ref()));
+    let udf = FFI_ScalarUDF::from(udf);
+
+    RResult::ROk(udf)
+}
+
+unsafe extern "C" fn try_encode_udf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    node: FFI_ScalarUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let node: Arc<dyn ScalarUDFImpl> = (&node).into();
+    let node = ScalarUDF::new_from_shared_impl(node);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udf(&node, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udaf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_AggregateUDF> {
+    let codec_inner = codec.inner();
+    let udaf = rresult_return!(codec_inner.try_decode_udaf(name.into(), buf.as_ref()));
+    let udaf = FFI_AggregateUDF::from(udaf);
+
+    RResult::ROk(udaf)
+}
+
+unsafe extern "C" fn try_encode_udaf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    node: FFI_AggregateUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let udaf: Arc<dyn AggregateUDFImpl> = (&node).into();
+    let udaf = AggregateUDF::new_from_shared_impl(udaf);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udaf(&udaf, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn try_decode_udwf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    name: RStr,
+    buf: RSlice<u8>,
+) -> FFIResult<FFI_WindowUDF> {
+    let codec = codec.inner();
+    let udwf = rresult_return!(codec.try_decode_udwf(name.into(), buf.as_ref()));
+    let udwf = FFI_WindowUDF::from(udwf);
+
+    RResult::ROk(udwf)
+}
+
+unsafe extern "C" fn try_encode_udwf_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+    node: FFI_WindowUDF,
+) -> FFIResult<RVec<u8>> {
+    let codec = codec.inner();
+    let udwf: Arc<dyn WindowUDFImpl> = (&node).into();
+    let udwf = WindowUDF::new_from_shared_impl(udwf);
+
+    let mut bytes = Vec::new();
+    rresult_return!(codec.try_encode_udwf(&udwf, &mut bytes));
+
+    RResult::ROk(bytes.into())
+}
+
+unsafe extern "C" fn release_fn_wrapper(codec: &mut FFI_PhysicalExtensionCodec) {
+    unsafe {
+        let private_data =
+            Box::from_raw(codec.private_data as *mut PhysicalExtensionCodecPrivateData);
+        drop(private_data);
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(
+    codec: &FFI_PhysicalExtensionCodec,
+) -> FFI_PhysicalExtensionCodec {
+    let old_codec = Arc::clone(codec.inner());
+    let runtime = codec.runtime().clone();
+
+    FFI_PhysicalExtensionCodec::new(old_codec, runtime, codec.task_ctx_provider.clone())
+}
+
+impl Drop for FFI_PhysicalExtensionCodec {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl FFI_PhysicalExtensionCodec {
+    /// Creates a new [`FFI_PhysicalExtensionCodec`].
+    pub fn new(
+        codec: Arc<dyn PhysicalExtensionCodec + Send>,
+        runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+    ) -> Self {
+        if let Some(codec) = (Arc::clone(&codec) as Arc<dyn Any>)
+            .downcast_ref::<ForeignPhysicalExtensionCodec>()
+        {
+            return codec.0.clone();
+        }
+
+        let task_ctx_provider = task_ctx_provider.into();
+        let private_data = Box::new(PhysicalExtensionCodecPrivateData { codec, runtime });
+
+        Self {
+            try_decode: try_decode_fn_wrapper,
+            try_encode: try_encode_fn_wrapper,
+            try_decode_udf: try_decode_udf_fn_wrapper,
+            try_encode_udf: try_encode_udf_fn_wrapper,
+            try_decode_udaf: try_decode_udaf_fn_wrapper,
+            try_encode_udaf: try_encode_udaf_fn_wrapper,
+            try_decode_udwf: try_decode_udwf_fn_wrapper,
+            try_encode_udwf: try_encode_udwf_fn_wrapper,
+            task_ctx_provider,
+
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: crate::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_PhysicalExtensionCodec to interact with the foreign table provider.
+#[derive(Debug)]
+pub struct ForeignPhysicalExtensionCodec(pub FFI_PhysicalExtensionCodec);
+
+unsafe impl Send for ForeignPhysicalExtensionCodec {}
+unsafe impl Sync for ForeignPhysicalExtensionCodec {}
+
+impl From<&FFI_PhysicalExtensionCodec> for Arc<dyn PhysicalExtensionCodec> {
+    fn from(codec: &FFI_PhysicalExtensionCodec) -> Self {
+        if (codec.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(codec.inner())
+        } else {
+            Arc::new(ForeignPhysicalExtensionCodec(codec.clone()))
+        }
+    }
+}
+
+impl Clone for FFI_PhysicalExtensionCodec {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl PhysicalExtensionCodec for ForeignPhysicalExtensionCodec {
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+        _ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let inputs = inputs
+            .iter()
+            .map(|plan| FFI_ExecutionPlan::new(Arc::clone(plan), None))
+            .collect();
+
+        let plan =
+            df_result!(unsafe { (self.0.try_decode)(&self.0, buf.into(), inputs) })?;
+        let plan: Arc<dyn ExecutionPlan> = (&plan).try_into()?;
+
+        Ok(plan)
+    }
+
+    fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> {
+        let plan = FFI_ExecutionPlan::new(node, None);
+        let bytes = df_result!(unsafe { (self.0.try_encode)(&self.0, plan) })?;
+
+        buf.extend(bytes);
+        Ok(())
+    }
+
+    fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
+        let udf = unsafe {
+            df_result!((self.0.try_decode_udf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udf: Arc<dyn ScalarUDFImpl> = (&udf).into();
+
+        Ok(Arc::new(ScalarUDF::new_from_shared_impl(udf)))
+    }
+
+    fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = FFI_ScalarUDF::from(Arc::new(node.clone()));
+        let bytes = df_result!(unsafe { (self.0.try_encode_udf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+
+    fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
+        let udaf = unsafe {
+            df_result!((self.0.try_decode_udaf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udaf: Arc<dyn AggregateUDFImpl> = (&udaf).into();
+
+        Ok(Arc::new(AggregateUDF::new_from_shared_impl(udaf)))
+    }
+
+    fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = Arc::new(node.clone());
+        let node = FFI_AggregateUDF::from(node);
+        let bytes = df_result!(unsafe { (self.0.try_encode_udaf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+
+    fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result<Arc<WindowUDF>> {
+        let udwf = unsafe {
+            df_result!((self.0.try_decode_udwf)(&self.0, name.into(), buf.into()))
+        }?;
+        let udwf: Arc<dyn WindowUDFImpl> = (&udwf).into();
+
+        Ok(Arc::new(WindowUDF::new_from_shared_impl(udwf)))
+    }
+
+    fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec<u8>) -> Result<()> {
+        let node = Arc::new(node.clone());
+        let node = FFI_WindowUDF::from(node);
+        let bytes = df_result!(unsafe { (self.0.try_encode_udwf)(&self.0, node) })?;
+
+        buf.extend(bytes);
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use std::sync::Arc;
+
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion_common::{Result, exec_err};
+    use datafusion_execution::TaskContext;
+    use datafusion_expr::ptr_eq::arc_ptr_eq;
+    use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF, WindowUDFImpl};
+    use datafusion_functions::math::abs::AbsFunc;
+    use datafusion_functions_aggregate::sum::Sum;
+    use datafusion_functions_window::rank::{Rank, RankType};
+    use datafusion_physical_plan::ExecutionPlan;
+    use datafusion_proto::physical_plan::PhysicalExtensionCodec;
+
+    use crate::execution_plan::tests::EmptyExec;
+    use crate::proto::physical_extension_codec::FFI_PhysicalExtensionCodec;
+
+    #[derive(Debug)]
+    pub(crate) struct TestExtensionCodec;
+
+    impl TestExtensionCodec {
+        pub(crate) const MAGIC_NUMBER: u8 = 127;
+        pub(crate) const EMPTY_EXEC_SERIALIZED: u8 = 1;
+        pub(crate) const ABS_FUNC_SERIALIZED: u8 = 2;
+        pub(crate) const SUM_UDAF_SERIALIZED: u8 = 3;
+        pub(crate) const RANK_UDWF_SERIALIZED: u8 = 4;
+        pub(crate) const MEMTABLE_SERIALIZED: u8 = 5;
+    }
+
+    impl PhysicalExtensionCodec for TestExtensionCodec {
+        fn try_decode(
+            &self,
+            buf: &[u8],
+            _inputs: &[Arc<dyn ExecutionPlan>],
+            _ctx: &TaskContext,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            if buf[0] != Self::MAGIC_NUMBER {
+                return exec_err!(
+                    "TestExtensionCodec input buffer does not start with magic number"
+                );
+            }
+
+            if buf.len() != 2 || buf[1] != Self::EMPTY_EXEC_SERIALIZED {
+                return exec_err!("TestExtensionCodec unable to decode execution plan");
+            }
+
+            Ok(create_test_exec())
+        }
+
+        fn try_encode(
+            &self,
+            node: Arc<dyn ExecutionPlan>,
+            buf: &mut Vec<u8>,
+        ) -> Result<()> {
+            buf.push(Self::MAGIC_NUMBER);
+
+            let Some(_) = node.as_any().downcast_ref::<EmptyExec>() else {
+                return exec_err!("TestExtensionCodec only expects EmptyExec");
+            };
+
+            buf.push(Self::EMPTY_EXEC_SERIALIZED);
+
+            Ok(())
+        }
+
+        fn try_decode_udf(&self, _name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
+            if buf[0] != Self::MAGIC_NUMBER {
+                return exec_err!(
+                    "TestExtensionCodec input buffer does not start with magic number"
+                );
+            }
+
+            if buf.len() != 2 || buf[1] != Self::ABS_FUNC_SERIALIZED {
+                return exec_err!("TestExtensionCodec unable to decode udf");
+            }
+
+            Ok(Arc::new(ScalarUDF::from(AbsFunc::new())))
+        }
+
+        fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
+            buf.push(Self::MAGIC_NUMBER);
+
+            let udf = node.inner();
+            if !udf.as_any().is::<AbsFunc>() {
+                return exec_err!("TestExtensionCodec only expects Abs UDF");
+            };
+
+            buf.push(Self::ABS_FUNC_SERIALIZED);
+
+            Ok(())
+        }
+
+        fn try_decode_udaf(&self, _name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
+            if buf[0] != Self::MAGIC_NUMBER {
+                return exec_err!(
+                    "TestExtensionCodec input buffer does not start with magic number"
+                );
+            }
+
+            if buf.len() != 2 || buf[1] != Self::SUM_UDAF_SERIALIZED {
+                return exec_err!("TestExtensionCodec unable to decode udaf");
+            }
+
+            Ok(Arc::new(AggregateUDF::from(Sum::new())))
+        }
+
+        fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
+            buf.push(Self::MAGIC_NUMBER);
+
+            let udf = node.inner();
+            let Some(_udf) = udf.as_any().downcast_ref::<Sum>() else {
+                return exec_err!("TestExtensionCodec only expects Sum UDAF");
+            };
+
+            buf.push(Self::SUM_UDAF_SERIALIZED);
+
+            Ok(())
+        }
+
+        fn try_decode_udwf(&self, _name: &str, buf: &[u8]) -> Result<Arc<WindowUDF>> {
+            if buf[0] != Self::MAGIC_NUMBER {
+                return exec_err!(
+                    "TestExtensionCodec input buffer does not start with magic number"
+                );
+            }
+
+            if buf.len() != 2 || buf[1] != Self::RANK_UDWF_SERIALIZED {
+                return exec_err!("TestExtensionCodec unable to decode udwf");
+            }
+
+            Ok(Arc::new(WindowUDF::from(Rank::new(
+                "my_rank".to_owned(),
+                RankType::Basic,
+            ))))
+        }
+
+        fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec<u8>) -> Result<()> {
+            buf.push(Self::MAGIC_NUMBER);
+
+            let udf = node.inner();
+            let Some(udf) = udf.as_any().downcast_ref::<Rank>() else {
+                return exec_err!("TestExtensionCodec only expects Rank UDWF");
+            };
+
+            if udf.name() != "my_rank" {
+                return exec_err!("TestExtensionCodec only expects my_rank UDWF name");
+            }
+
+            buf.push(Self::RANK_UDWF_SERIALIZED);
+
+            Ok(())
+        }
+    }
+
+    fn create_test_exec() -> Arc<dyn ExecutionPlan> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
+        Arc::new(EmptyExec::new(schema)) as Arc<dyn ExecutionPlan>
+    }
+
+    #[test]
+    fn roundtrip_ffi_physical_extension_codec_exec_plan() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_PhysicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+
+        let exec = create_test_exec();
+        let input_execs = [create_test_exec()];
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode(Arc::clone(&exec), &mut bytes)?;
+
+        let returned_exec =
+            foreign_codec.try_decode(&bytes, &input_execs, ctx.task_ctx().as_ref())?;
+
+        assert!(returned_exec.as_any().is::<EmptyExec>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_physical_extension_codec_udf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_PhysicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(ScalarUDF::from(AbsFunc::new()));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<AbsFunc>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_physical_extension_codec_udaf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_PhysicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(AggregateUDF::from(Sum::new()));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udaf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udaf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<Sum>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn roundtrip_ffi_physical_extension_codec_udwf() -> Result<()> {
+        let codec = Arc::new(TestExtensionCodec {});
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_PhysicalExtensionCodec::new(codec, None, task_ctx_provider);
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+
+        let udf = Arc::new(WindowUDF::from(Rank::new(
+            "my_rank".to_owned(),
+            RankType::Basic,
+        )));
+        let mut bytes = Vec::new();
+        foreign_codec.try_encode_udwf(udf.as_ref(), &mut bytes)?;
+
+        let returned_udf = foreign_codec.try_decode_udwf(udf.name(), &bytes)?;
+
+        assert!(returned_udf.inner().as_any().is::<Rank>());
+
+        Ok(())
+    }
+
+    #[test]
+    fn ffi_physical_extension_codec_local_bypass() {
+        let codec =
+            Arc::new(TestExtensionCodec {}) as Arc<dyn PhysicalExtensionCodec + Send>;
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+
+        let mut ffi_codec =
+            FFI_PhysicalExtensionCodec::new(Arc::clone(&codec), None, task_ctx_provider);
+
+        let codec = codec as Arc<dyn PhysicalExtensionCodec>;
+        // Verify local libraries can be downcast to their original
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+        assert!(arc_ptr_eq(&foreign_codec, &codec));
+
+        // Verify different library markers generate foreign providers
+        ffi_codec.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_codec: Arc<dyn PhysicalExtensionCodec> = (&ffi_codec).into();
+        assert!(!arc_ptr_eq(&foreign_codec, &codec));
+    }
+}
diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs
index 939c4050028cb..53078a0e4bbae 100644
--- a/datafusion/ffi/src/record_batch_stream.rs
+++ b/datafusion/ffi/src/record_batch_stream.rs
@@ -15,48 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{ffi::c_void, task::Poll};
-
-use abi_stable::{
-    std_types::{ROption, RResult, RString},
-    StableAbi,
-};
-use arrow::array::{Array, RecordBatch};
-use arrow::{
-    array::{make_array, StructArray},
-    ffi::{from_ffi, to_ffi},
-};
+use std::ffi::c_void;
+use std::task::Poll;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult};
+use arrow::array::{Array, RecordBatch, StructArray, make_array};
+use arrow::ffi::{from_ffi, to_ffi};
 use async_ffi::{ContextExt, FfiContext, FfiPoll};
-use datafusion::error::Result;
-use datafusion::{
-    error::DataFusionError,
-    execution::{RecordBatchStream, SendableRecordBatchStream},
-};
+use datafusion_common::{DataFusionError, Result, ffi_datafusion_err, ffi_err};
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
 use futures::{Stream, TryStreamExt};
 use tokio::runtime::Handle;
 
-use crate::{
-    arrow_wrappers::{WrappedArray, WrappedSchema},
-    rresult,
-};
+use crate::arrow_wrappers::{WrappedArray, WrappedSchema};
+use crate::rresult;
+use crate::util::FFIResult;
 
 /// A stable struct for sharing [`RecordBatchStream`] across FFI boundaries.
 /// We use the async-ffi crate for handling async calls across libraries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_RecordBatchStream {
     /// This mirrors the `poll_next` of [`RecordBatchStream`] but does so
     /// in a FFI safe manner.
-    pub poll_next:
-        unsafe extern "C" fn(
-            stream: &Self,
-            cx: &mut FfiContext,
-        ) -> FfiPoll<ROption<RResult<WrappedArray, RString>>>,
+    pub poll_next: unsafe extern "C" fn(
+        stream: &Self,
+        cx: &mut FfiContext,
+    ) -> FfiPoll<ROption<FFIResult<WrappedArray>>>,
 
     /// Return the schema of the record batch
     pub schema: unsafe extern "C" fn(stream: &Self) -> WrappedSchema,
 
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(arg: &mut Self),
+
     /// Internal data. This is only to be accessed by the provider of the plan.
     /// The foreign library should never attempt to access this data.
     pub private_data: *mut c_void,
@@ -82,6 +75,7 @@ impl FFI_RecordBatchStream {
         FFI_RecordBatchStream {
             poll_next: poll_next_fn_wrapper,
             schema: schema_fn_wrapper,
+            release: release_fn_wrapper,
             private_data,
         }
     }
@@ -90,28 +84,39 @@ impl FFI_RecordBatchStream {
 unsafe impl Send for FFI_RecordBatchStream {}
 
 unsafe extern "C" fn schema_fn_wrapper(stream: &FFI_RecordBatchStream) -> WrappedSchema {
-    let private_data = stream.private_data as *const RecordBatchStreamPrivateData;
-    let stream = &(*private_data).rbs;
+    unsafe {
+        let private_data = stream.private_data as *const RecordBatchStreamPrivateData;
+        let stream = &(*private_data).rbs;
+
+        (*stream).schema().into()
+    }
+}
 
-    (*stream).schema().into()
+unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_RecordBatchStream) {
+    unsafe {
+        debug_assert!(!provider.private_data.is_null());
+        let private_data =
+            Box::from_raw(provider.private_data as *mut RecordBatchStreamPrivateData);
+        drop(private_data);
+        provider.private_data = std::ptr::null_mut();
+    }
 }
 
-fn record_batch_to_wrapped_array(
+pub(crate) fn record_batch_to_wrapped_array(
     record_batch: RecordBatch,
-) -> RResult<WrappedArray, RString> {
+) -> FFIResult<WrappedArray> {
+    let schema = WrappedSchema::from(record_batch.schema());
     let struct_array = StructArray::from(record_batch);
     rresult!(
-        to_ffi(&struct_array.to_data()).map(|(array, schema)| WrappedArray {
-            array,
-            schema: WrappedSchema(schema)
-        })
+        to_ffi(&struct_array.to_data())
+            .map(|(array, _schema)| WrappedArray { array, schema })
     )
 }
 
 // probably want to use pub unsafe fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result<ArrayData> {
 fn maybe_record_batch_to_wrapped_stream(
     record_batch: Option<Result<RecordBatch>>,
-) -> ROption<RResult<WrappedArray, RString>> {
+) -> ROption<FFIResult<WrappedArray>> {
     match record_batch {
         Some(Ok(record_batch)) => {
             ROption::RSome(record_batch_to_wrapped_array(record_batch))
@@ -124,19 +129,21 @@ fn maybe_record_batch_to_wrapped_stream(
 unsafe extern "C" fn poll_next_fn_wrapper(
     stream: &FFI_RecordBatchStream,
     cx: &mut FfiContext,
-) -> FfiPoll<ROption<RResult<WrappedArray, RString>>> {
-    let private_data = stream.private_data as *mut RecordBatchStreamPrivateData;
-    let stream = &mut (*private_data).rbs;
+) -> FfiPoll<ROption<FFIResult<WrappedArray>>> {
+    unsafe {
+        let private_data = stream.private_data as *mut RecordBatchStreamPrivateData;
+        let stream = &mut (*private_data).rbs;
 
-    let _guard = (*private_data).runtime.as_ref().map(|rt| rt.enter());
+        let _guard = (*private_data).runtime.as_ref().map(|rt| rt.enter());
 
-    let poll_result = cx.with_context(|std_cx| {
-        (*stream)
-            .try_poll_next_unpin(std_cx)
-            .map(maybe_record_batch_to_wrapped_stream)
-    });
+        let poll_result = cx.with_context(|std_cx| {
+            (*stream)
+                .try_poll_next_unpin(std_cx)
+                .map(maybe_record_batch_to_wrapped_stream)
+        });
 
-    poll_result.into()
+        poll_result.into()
+    }
 }
 
 impl RecordBatchStream for FFI_RecordBatchStream {
@@ -146,31 +153,31 @@ impl RecordBatchStream for FFI_RecordBatchStream {
     }
 }
 
-fn wrapped_array_to_record_batch(array: WrappedArray) -> Result<RecordBatch> {
+pub(crate) fn wrapped_array_to_record_batch(array: WrappedArray) -> Result<RecordBatch> {
     let array_data =
         unsafe { from_ffi(array.array, &array.schema.0).map_err(DataFusionError::from)? };
+    let schema: arrow::datatypes::SchemaRef = array.schema.into();
     let array = make_array(array_data);
     let struct_array = array
         .as_any()
         .downcast_ref::<StructArray>()
-        .ok_or(DataFusionError::Execution(
-        "Unexpected array type during record batch collection in FFI_RecordBatchStream"
-            .to_string(),
+        .ok_or_else(|| ffi_datafusion_err!(
+        "Unexpected array type during record batch collection in FFI_RecordBatchStream - expected StructArray"
     ))?;
 
-    Ok(struct_array.into())
+    let rb: RecordBatch = struct_array.into();
+
+    rb.with_schema(schema).map_err(Into::into)
 }
 
 fn maybe_wrapped_array_to_record_batch(
-    array: ROption<RResult<WrappedArray, RString>>,
+    array: ROption<FFIResult<WrappedArray>>,
 ) -> Option<Result<RecordBatch>> {
     match array {
         ROption::RSome(RResult::ROk(wrapped_array)) => {
             Some(wrapped_array_to_record_batch(wrapped_array))
         }
-        ROption::RSome(RResult::RErr(e)) => {
-            Some(Err(DataFusionError::Execution(e.to_string())))
-        }
+        ROption::RSome(RResult::RErr(e)) => Some(ffi_err!("{e}")),
         ROption::RNone => None,
     }
 }
@@ -190,9 +197,89 @@ impl Stream for FFI_RecordBatchStream {
                 Poll::Ready(maybe_wrapped_array_to_record_batch(array))
             }
             FfiPoll::Pending => Poll::Pending,
-            FfiPoll::Panicked => Poll::Ready(Some(Err(DataFusionError::Execution(
-                "Error occurred during poll_next on FFI_RecordBatchStream".to_string(),
-            )))),
+            FfiPoll::Panicked => Poll::Ready(Some(ffi_err!(
+                "Panic occurred during poll_next on FFI_RecordBatchStream"
+            ))),
         }
     }
 }
+
+impl Drop for FFI_RecordBatchStream {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::common::record_batch;
+    use datafusion::error::Result;
+    use datafusion::execution::SendableRecordBatchStream;
+    use datafusion::test_util::bounded_stream;
+    use futures::StreamExt;
+
+    use super::{
+        FFI_RecordBatchStream, record_batch_to_wrapped_array,
+        wrapped_array_to_record_batch,
+    };
+    use crate::df_result;
+
+    #[tokio::test]
+    async fn test_round_trip_record_batch_stream() -> Result<()> {
+        let record_batch = record_batch!(
+            ("a", Int32, vec![1, 2, 3]),
+            ("b", Float64, vec![Some(4.0), None, Some(5.0)])
+        )?;
+        let original_rbs = bounded_stream(record_batch.clone(), 1);
+
+        let ffi_rbs: FFI_RecordBatchStream = original_rbs.into();
+        let mut ffi_rbs: SendableRecordBatchStream = Box::pin(ffi_rbs);
+
+        let schema = ffi_rbs.schema();
+        assert_eq!(
+            schema,
+            Arc::new(Schema::new(vec![
+                Field::new("a", DataType::Int32, true),
+                Field::new("b", DataType::Float64, true)
+            ]))
+        );
+
+        let batch = ffi_rbs.next().await;
+        assert!(batch.is_some());
+        assert!(batch.as_ref().unwrap().is_ok());
+        assert_eq!(batch.unwrap().unwrap(), record_batch);
+
+        // There should only be one batch
+        let no_batch = ffi_rbs.next().await;
+        assert!(no_batch.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn round_trip_record_batch_with_metadata() -> Result<()> {
+        let rb = record_batch!(
+            ("a", Int32, vec![1, 2, 3]),
+            ("b", Float64, vec![Some(4.0), None, Some(5.0)])
+        )?;
+
+        let schema = rb
+            .schema()
+            .as_ref()
+            .clone()
+            .with_metadata([("some_key".to_owned(), "some_value".to_owned())].into())
+            .into();
+
+        let rb = rb.with_schema(schema)?;
+
+        let ffi_rb = df_result!(record_batch_to_wrapped_array(rb.clone()))?;
+
+        let round_trip_rb = wrapped_array_to_record_batch(ffi_rb)?;
+
+        assert_eq!(rb, round_trip_rb);
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/schema_provider.rs b/datafusion/ffi/src/schema_provider.rs
index 6e5a590e1a09d..5d1348e2328f7 100644
--- a/datafusion/ffi/src/schema_provider.rs
+++ b/datafusion/ffi/src/schema_provider.rs
@@ -15,31 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, ffi::c_void, sync::Arc};
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
 
-use abi_stable::{
-    std_types::{ROption, RResult, RString, RVec},
-    StableAbi,
-};
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RString, RVec};
 use async_ffi::{FfiFuture, FutureExt};
 use async_trait::async_trait;
-use datafusion::{
-    catalog::{SchemaProvider, TableProvider},
-    error::DataFusionError,
+use datafusion_catalog::{SchemaProvider, TableProvider};
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
 use tokio::runtime::Handle;
 
-use crate::{
-    df_result, rresult_return,
-    table_provider::{FFI_TableProvider, ForeignTableProvider},
-};
-
-use datafusion::error::Result;
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider::{FFI_TableProvider, ForeignTableProvider};
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
 
 /// A stable struct for sharing [`SchemaProvider`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_SchemaProvider {
     pub owner_name: ROption<RString>,
 
@@ -48,25 +47,26 @@ pub struct FFI_SchemaProvider {
     pub table: unsafe extern "C" fn(
         provider: &Self,
         name: RString,
-    ) -> FfiFuture<
-        RResult<ROption<FFI_TableProvider>, RString>,
-    >,
-
-    pub register_table:
-        unsafe extern "C" fn(
-            provider: &Self,
-            name: RString,
-            table: FFI_TableProvider,
-        ) -> RResult<ROption<FFI_TableProvider>, RString>,
-
-    pub deregister_table:
-        unsafe extern "C" fn(
-            provider: &Self,
-            name: RString,
-        ) -> RResult<ROption<FFI_TableProvider>, RString>,
+    )
+        -> FfiFuture<FFIResult<ROption<FFI_TableProvider>>>,
+
+    pub register_table: unsafe extern "C" fn(
+        provider: &Self,
+        name: RString,
+        table: FFI_TableProvider,
+    )
+        -> FFIResult<ROption<FFI_TableProvider>>,
+
+    pub deregister_table: unsafe extern "C" fn(
+        provider: &Self,
+        name: RString,
+    )
+        -> FFIResult<ROption<FFI_TableProvider>>,
 
     pub table_exist: unsafe extern "C" fn(provider: &Self, name: RString) -> bool,
 
+    pub logical_codec: FFI_LogicalExtensionCodec,
+
     /// Used to create a clone on the provider of the execution plan. This should
     /// only need to be called by the receiver of the plan.
     pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
@@ -80,6 +80,11 @@ pub struct FFI_SchemaProvider {
     /// Internal data. This is only to be accessed by the provider of the plan.
     /// A [`ForeignSchemaProvider`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_SchemaProvider {}
@@ -92,105 +97,135 @@ struct ProviderPrivateData {
 
 impl FFI_SchemaProvider {
     unsafe fn inner(&self) -> &Arc<dyn SchemaProvider + Send> {
-        let private_data = self.private_data as *const ProviderPrivateData;
-        &(*private_data).provider
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            &(*private_data).provider
+        }
     }
 
     unsafe fn runtime(&self) -> Option<Handle> {
-        let private_data = self.private_data as *const ProviderPrivateData;
-        (*private_data).runtime.clone()
+        unsafe {
+            let private_data = self.private_data as *const ProviderPrivateData;
+            (*private_data).runtime.clone()
+        }
     }
 }
 
 unsafe extern "C" fn table_names_fn_wrapper(
     provider: &FFI_SchemaProvider,
 ) -> RVec<RString> {
-    let provider = provider.inner();
+    unsafe {
+        let provider = provider.inner();
 
-    let table_names = provider.table_names();
-    table_names.into_iter().map(|s| s.into()).collect()
+        let table_names = provider.table_names();
+        table_names.into_iter().map(|s| s.into()).collect()
+    }
 }
 
 unsafe extern "C" fn table_fn_wrapper(
     provider: &FFI_SchemaProvider,
     name: RString,
-) -> FfiFuture<RResult<ROption<FFI_TableProvider>, RString>> {
-    let runtime = provider.runtime();
-    let provider = Arc::clone(provider.inner());
-
-    async move {
-        let table = rresult_return!(provider.table(name.as_str()).await)
-            .map(|t| FFI_TableProvider::new(t, true, runtime))
-            .into();
-
-        RResult::ROk(table)
+) -> FfiFuture<FFIResult<ROption<FFI_TableProvider>>> {
+    unsafe {
+        let runtime = provider.runtime();
+        let logical_codec = provider.logical_codec.clone();
+        let provider = Arc::clone(provider.inner());
+
+        async move {
+            let table = rresult_return!(provider.table(name.as_str()).await)
+                .map(|t| {
+                    FFI_TableProvider::new_with_ffi_codec(t, true, runtime, logical_codec)
+                })
+                .into();
+
+            RResult::ROk(table)
+        }
+        .into_ffi()
     }
-    .into_ffi()
 }
 
 unsafe extern "C" fn register_table_fn_wrapper(
     provider: &FFI_SchemaProvider,
     name: RString,
     table: FFI_TableProvider,
-) -> RResult<ROption<FFI_TableProvider>, RString> {
-    let runtime = provider.runtime();
-    let provider = provider.inner();
+) -> FFIResult<ROption<FFI_TableProvider>> {
+    unsafe {
+        let runtime = provider.runtime();
+        let logical_codec = provider.logical_codec.clone();
+        let provider = provider.inner();
 
-    let table = Arc::new(ForeignTableProvider(table));
+        let table = Arc::new(ForeignTableProvider(table));
 
-    let returned_table = rresult_return!(provider.register_table(name.into(), table))
-        .map(|t| FFI_TableProvider::new(t, true, runtime));
+        let returned_table = rresult_return!(provider.register_table(name.into(), table))
+            .map(|t| {
+                FFI_TableProvider::new_with_ffi_codec(t, true, runtime, logical_codec)
+            });
 
-    RResult::ROk(returned_table.into())
+        RResult::ROk(returned_table.into())
+    }
 }
 
 unsafe extern "C" fn deregister_table_fn_wrapper(
     provider: &FFI_SchemaProvider,
     name: RString,
-) -> RResult<ROption<FFI_TableProvider>, RString> {
-    let runtime = provider.runtime();
-    let provider = provider.inner();
-
-    let returned_table = rresult_return!(provider.deregister_table(name.as_str()))
-        .map(|t| FFI_TableProvider::new(t, true, runtime));
+) -> FFIResult<ROption<FFI_TableProvider>> {
+    unsafe {
+        let runtime = provider.runtime();
+        let logical_codec = provider.logical_codec.clone();
+        let provider = provider.inner();
+
+        let returned_table = rresult_return!(provider.deregister_table(name.as_str()))
+            .map(|t| {
+                FFI_TableProvider::new_with_ffi_codec(t, true, runtime, logical_codec)
+            });
 
-    RResult::ROk(returned_table.into())
+        RResult::ROk(returned_table.into())
+    }
 }
 
 unsafe extern "C" fn table_exist_fn_wrapper(
     provider: &FFI_SchemaProvider,
     name: RString,
 ) -> bool {
-    provider.inner().table_exist(name.as_str())
+    unsafe { provider.inner().table_exist(name.as_str()) }
 }
 
 unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_SchemaProvider) {
-    let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!provider.private_data.is_null());
+        let private_data =
+            Box::from_raw(provider.private_data as *mut ProviderPrivateData);
+        drop(private_data);
+        provider.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(
     provider: &FFI_SchemaProvider,
 ) -> FFI_SchemaProvider {
-    let old_private_data = provider.private_data as *const ProviderPrivateData;
-    let runtime = (*old_private_data).runtime.clone();
-
-    let private_data = Box::into_raw(Box::new(ProviderPrivateData {
-        provider: Arc::clone(&(*old_private_data).provider),
-        runtime,
-    })) as *mut c_void;
-
-    FFI_SchemaProvider {
-        owner_name: provider.owner_name.clone(),
-        table_names: table_names_fn_wrapper,
-        clone: clone_fn_wrapper,
-        release: release_fn_wrapper,
-        version: super::version,
-        private_data,
-        table: table_fn_wrapper,
-        register_table: register_table_fn_wrapper,
-        deregister_table: deregister_table_fn_wrapper,
-        table_exist: table_exist_fn_wrapper,
+    unsafe {
+        let old_private_data = provider.private_data as *const ProviderPrivateData;
+        let runtime = (*old_private_data).runtime.clone();
+
+        let private_data = Box::into_raw(Box::new(ProviderPrivateData {
+            provider: Arc::clone(&(*old_private_data).provider),
+            runtime,
+        })) as *mut c_void;
+
+        FFI_SchemaProvider {
+            owner_name: provider.owner_name.clone(),
+            table_names: table_names_fn_wrapper,
+            table: table_fn_wrapper,
+            register_table: register_table_fn_wrapper,
+            deregister_table: deregister_table_fn_wrapper,
+            table_exist: table_exist_fn_wrapper,
+            logical_codec: provider.logical_codec.clone(),
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data,
+            library_marker_id: crate::get_library_marker_id,
+        }
     }
 }
 
@@ -205,21 +240,46 @@ impl FFI_SchemaProvider {
     pub fn new(
         provider: Arc<dyn SchemaProvider + Send>,
         runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
     ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+        Self::new_with_ffi_codec(provider, runtime, logical_codec)
+    }
+
+    pub fn new_with_ffi_codec(
+        provider: Arc<dyn SchemaProvider + Send>,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
+    ) -> Self {
+        if let Some(provider) = provider.as_any().downcast_ref::<ForeignSchemaProvider>()
+        {
+            return provider.0.clone();
+        }
+
         let owner_name = provider.owner_name().map(|s| s.into()).into();
         let private_data = Box::new(ProviderPrivateData { provider, runtime });
 
         Self {
             owner_name,
             table_names: table_names_fn_wrapper,
-            clone: clone_fn_wrapper,
-            release: release_fn_wrapper,
-            version: super::version,
-            private_data: Box::into_raw(private_data) as *mut c_void,
             table: table_fn_wrapper,
             register_table: register_table_fn_wrapper,
             deregister_table: deregister_table_fn_wrapper,
             table_exist: table_exist_fn_wrapper,
+            logical_codec,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -234,9 +294,14 @@ pub struct ForeignSchemaProvider(pub FFI_SchemaProvider);
 unsafe impl Send for ForeignSchemaProvider {}
 unsafe impl Sync for ForeignSchemaProvider {}
 
-impl From<&FFI_SchemaProvider> for ForeignSchemaProvider {
+impl From<&FFI_SchemaProvider> for Arc<dyn SchemaProvider + Send> {
     fn from(provider: &FFI_SchemaProvider) -> Self {
-        Self(provider.clone())
+        if (provider.library_marker_id)() == crate::get_library_marker_id() {
+            return Arc::clone(unsafe { provider.inner() });
+        }
+
+        Arc::new(ForeignSchemaProvider(provider.clone()))
+            as Arc<dyn SchemaProvider + Send>
     }
 }
 
@@ -274,9 +339,7 @@ impl SchemaProvider for ForeignSchemaProvider {
             let table: Option<FFI_TableProvider> =
                 df_result!((self.0.table)(&self.0, name.into()).await)?.into();
 
-            let table = table.as_ref().map(|t| {
-                Arc::new(ForeignTableProvider::from(t)) as Arc<dyn TableProvider>
-            });
+            let table = table.as_ref().map(<Arc<dyn TableProvider>>::from);
 
             Ok(table)
         }
@@ -290,7 +353,12 @@ impl SchemaProvider for ForeignSchemaProvider {
         unsafe {
             let ffi_table = match table.as_any().downcast_ref::<ForeignTableProvider>() {
                 Some(t) => t.0.clone(),
-                None => FFI_TableProvider::new(table, true, None),
+                None => FFI_TableProvider::new_with_ffi_codec(
+                    table,
+                    true,
+                    None,
+                    self.0.logical_codec.clone(),
+                ),
             };
 
             let returned_provider: Option<FFI_TableProvider> =
@@ -320,7 +388,8 @@ impl SchemaProvider for ForeignSchemaProvider {
 #[cfg(test)]
 mod tests {
     use arrow::datatypes::Schema;
-    use datafusion::{catalog::MemorySchemaProvider, datasource::empty::EmptyTable};
+    use datafusion::catalog::MemorySchemaProvider;
+    use datafusion::datasource::empty::EmptyTable;
 
     use super::*;
 
@@ -331,15 +400,21 @@ mod tests {
     #[tokio::test]
     async fn test_round_trip_ffi_schema_provider() {
         let schema_provider = Arc::new(MemorySchemaProvider::new());
-        assert!(schema_provider
-            .as_ref()
-            .register_table("prior_table".to_string(), empty_table())
-            .unwrap()
-            .is_none());
+        assert!(
+            schema_provider
+                .as_ref()
+                .register_table("prior_table".to_string(), empty_table())
+                .unwrap()
+                .is_none()
+        );
 
-        let ffi_schema_provider = FFI_SchemaProvider::new(schema_provider, None);
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
 
-        let foreign_schema_provider: ForeignSchemaProvider =
+        let mut ffi_schema_provider =
+            FFI_SchemaProvider::new(schema_provider, None, task_ctx_provider, None);
+        ffi_schema_provider.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_schema_provider: Arc<dyn SchemaProvider + Send> =
             (&ffi_schema_provider).into();
 
         let prior_table_names = foreign_schema_provider.table_names();
@@ -366,7 +441,7 @@ mod tests {
         assert!(returned_schema.is_some());
         assert_eq!(foreign_schema_provider.table_names().len(), 1);
 
-        // Retrieve non-existant table
+        // Retrieve non-existent table
         let returned_schema = foreign_schema_provider
             .table("prior_table")
             .await
@@ -382,4 +457,32 @@ mod tests {
         assert!(returned_schema.is_some());
         assert!(foreign_schema_provider.table_exist("second_table"));
     }
+
+    #[test]
+    fn test_ffi_schema_provider_local_bypass() {
+        let schema_provider = Arc::new(MemorySchemaProvider::new());
+
+        let (_ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+        let mut ffi_schema =
+            FFI_SchemaProvider::new(schema_provider, None, task_ctx_provider, None);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_schema: Arc<dyn SchemaProvider + Send> = (&ffi_schema).into();
+        assert!(
+            foreign_schema
+                .as_any()
+                .downcast_ref::<MemorySchemaProvider>()
+                .is_some()
+        );
+
+        // Verify different library markers generate foreign providers
+        ffi_schema.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_schema: Arc<dyn SchemaProvider + Send> = (&ffi_schema).into();
+        assert!(
+            foreign_schema
+                .as_any()
+                .downcast_ref::<ForeignSchemaProvider>()
+                .is_some()
+        );
+    }
 }
diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session/config.rs
similarity index 50%
rename from datafusion/ffi/src/session_config.rs
rename to datafusion/ffi/src/session/config.rs
index aea03cf94e0af..63f0f20ecc7d5 100644
--- a/datafusion/ffi/src/session_config.rs
+++ b/datafusion/ffi/src/session/config.rs
@@ -15,17 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{
-    collections::HashMap,
-    ffi::{c_char, c_void, CString},
-};
-
-use abi_stable::{
-    std_types::{RHashMap, RString},
-    StableAbi,
-};
-use datafusion::{config::ConfigOptions, error::Result};
-use datafusion::{error::DataFusionError, prelude::SessionConfig};
+use std::ffi::c_void;
+
+use crate::config::FFI_ConfigOptions;
+use abi_stable::StableAbi;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_execution::config::SessionConfig;
 
 /// A stable struct for sharing [`SessionConfig`] across FFI boundaries.
 /// Instead of attempting to expose the entire SessionConfig interface, we
@@ -40,11 +36,9 @@ use datafusion::{error::DataFusionError, prelude::SessionConfig};
 /// value over this version.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_SessionConfig {
-    /// Return a hash map from key to value of the config options represented
-    /// by string values.
-    pub config_options: unsafe extern "C" fn(config: &Self) -> RHashMap<RString, RString>,
+    /// FFI stable configuration options.
+    pub config_options: FFI_ConfigOptions,
 
     /// Used to create a clone on the provider of the execution plan. This should
     /// only need to be called by the receiver of the plan.
@@ -54,80 +48,69 @@ pub struct FFI_SessionConfig {
     pub release: unsafe extern "C" fn(arg: &mut Self),
 
     /// Internal data. This is only to be accessed by the provider of the plan.
-    /// A [`ForeignSessionConfig`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_SessionConfig {}
 unsafe impl Sync for FFI_SessionConfig {}
 
-unsafe extern "C" fn config_options_fn_wrapper(
-    config: &FFI_SessionConfig,
-) -> RHashMap<RString, RString> {
-    let private_data = config.private_data as *mut SessionConfigPrivateData;
-    let config_options = &(*private_data).config;
-
-    let mut options = RHashMap::default();
-    for config_entry in config_options.entries() {
-        if let Some(value) = config_entry.value {
-            options.insert(config_entry.key.into(), value.into());
-        }
+impl FFI_SessionConfig {
+    fn inner(&self) -> &SessionConfig {
+        let private_data = self.private_data as *mut SessionConfigPrivateData;
+        unsafe { &(*private_data).config }
     }
-
-    options
 }
 
 unsafe extern "C" fn release_fn_wrapper(config: &mut FFI_SessionConfig) {
-    let private_data =
-        Box::from_raw(config.private_data as *mut SessionConfigPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!config.private_data.is_null());
+        let private_data =
+            Box::from_raw(config.private_data as *mut SessionConfigPrivateData);
+        drop(private_data);
+        config.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(config: &FFI_SessionConfig) -> FFI_SessionConfig {
-    let old_private_data = config.private_data as *mut SessionConfigPrivateData;
-    let old_config = &(*old_private_data).config;
-
-    let private_data = Box::new(SessionConfigPrivateData {
-        config: old_config.clone(),
-    });
-
-    FFI_SessionConfig {
-        config_options: config_options_fn_wrapper,
-        private_data: Box::into_raw(private_data) as *mut c_void,
-        clone: clone_fn_wrapper,
-        release: release_fn_wrapper,
+    unsafe {
+        let old_private_data = config.private_data as *mut SessionConfigPrivateData;
+        let old_config = (*old_private_data).config.clone();
+
+        let private_data = Box::new(SessionConfigPrivateData { config: old_config });
+
+        FFI_SessionConfig {
+            config_options: config.config_options.clone(),
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            library_marker_id: crate::get_library_marker_id,
+        }
     }
 }
 
 struct SessionConfigPrivateData {
-    pub config: ConfigOptions,
+    pub config: SessionConfig,
 }
 
 impl From<&SessionConfig> for FFI_SessionConfig {
     fn from(session: &SessionConfig) -> Self {
-        let mut config_keys = Vec::new();
-        let mut config_values = Vec::new();
-        for config_entry in session.options().entries() {
-            if let Some(value) = config_entry.value {
-                let key_cstr = CString::new(config_entry.key).unwrap_or_default();
-                let key_ptr = key_cstr.into_raw() as *const c_char;
-                config_keys.push(key_ptr);
-
-                config_values
-                    .push(CString::new(value).unwrap_or_default().into_raw()
-                        as *const c_char);
-            }
-        }
-
         let private_data = Box::new(SessionConfigPrivateData {
-            config: session.options().clone(),
+            config: session.clone(),
         });
 
+        let config_options = FFI_ConfigOptions::from(session.options().as_ref());
+
         Self {
-            config_options: config_options_fn_wrapper,
+            config_options,
             private_data: Box::into_raw(private_data) as *mut c_void,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -144,24 +127,17 @@ impl Drop for FFI_SessionConfig {
     }
 }
 
-/// A wrapper struct for accessing [`SessionConfig`] across a FFI boundary.
-/// The [`SessionConfig`] will be generated from a hash map of the config
-/// options in the provider and will be reconstructed on this side of the
-/// interface.s
-pub struct ForeignSessionConfig(pub SessionConfig);
-
-impl TryFrom<&FFI_SessionConfig> for ForeignSessionConfig {
+impl TryFrom<&FFI_SessionConfig> for SessionConfig {
     type Error = DataFusionError;
 
     fn try_from(config: &FFI_SessionConfig) -> Result<Self, Self::Error> {
-        let config_options = unsafe { (config.config_options)(config) };
+        if (config.library_marker_id)() == crate::get_library_marker_id() {
+            return Ok(config.inner().clone());
+        }
 
-        let mut options_map = HashMap::new();
-        config_options.iter().for_each(|kv_pair| {
-            options_map.insert(kv_pair.0.to_string(), kv_pair.1.to_string());
-        });
+        let config_options = ConfigOptions::try_from(config.config_options.clone())?;
 
-        Ok(Self(SessionConfig::from_string_hash_map(&options_map)?))
+        Ok(SessionConfig::from(config_options))
     }
 }
 
@@ -174,13 +150,15 @@ mod tests {
         let session_config = SessionConfig::new();
         let original_options = session_config.options().entries();
 
-        let ffi_config: FFI_SessionConfig = (&session_config).into();
+        let mut ffi_config: FFI_SessionConfig = (&session_config).into();
+        let _ = ffi_config.clone();
+        ffi_config.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_config: ForeignSessionConfig = (&ffi_config).try_into()?;
+        let foreign_config: SessionConfig = (&ffi_config).try_into()?;
 
-        let returned_options = foreign_config.0.options().entries();
+        let returned_options = foreign_config.options().entries();
 
-        assert!(original_options.len() == returned_options.len());
+        assert_eq!(original_options.len(), returned_options.len());
 
         Ok(())
     }
diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs
new file mode 100644
index 0000000000000..007181356e1b8
--- /dev/null
+++ b/datafusion/ffi/src/session/mod.rs
@@ -0,0 +1,708 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::collections::HashMap;
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RHashMap, RResult, RStr, RString, RVec};
+use arrow_schema::SchemaRef;
+use arrow_schema::ffi::FFI_ArrowSchema;
+use async_ffi::{FfiFuture, FutureExt};
+use async_trait::async_trait;
+use datafusion_common::config::{ConfigFileType, ConfigOptions, TableOptions};
+use datafusion_common::{DFSchema, DataFusionError};
+use datafusion_execution::TaskContext;
+use datafusion_execution::config::SessionConfig;
+use datafusion_execution::runtime_env::RuntimeEnv;
+use datafusion_expr::execution_props::ExecutionProps;
+use datafusion_expr::{
+    AggregateUDF, AggregateUDFImpl, Expr, LogicalPlan, ScalarUDF, ScalarUDFImpl,
+    WindowUDF, WindowUDFImpl,
+};
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes};
+use datafusion_proto::logical_plan::LogicalExtensionCodec;
+use datafusion_proto::logical_plan::from_proto::parse_expr;
+use datafusion_proto::logical_plan::to_proto::serialize_expr;
+use datafusion_proto::protobuf::LogicalExprNode;
+use datafusion_session::Session;
+use prost::Message;
+use tokio::runtime::Handle;
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::execution::FFI_TaskContext;
+use crate::execution_plan::FFI_ExecutionPlan;
+use crate::physical_expr::FFI_PhysicalExpr;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::session::config::FFI_SessionConfig;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udwf::FFI_WindowUDF;
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
+
+pub mod config;
+
+/// A stable struct for sharing [`Session`] across FFI boundaries.
+///
+/// Care must be taken when using this struct. Unlike most of the structs in
+/// this crate, the private data for [`FFI_SessionRef`] contains borrowed data.
+/// The lifetime of the borrow is lost when hidden within the ``*mut c_void``
+/// of the private data. For this reason, it is the user's responsibility to
+/// ensure the lifetime of the [`Session`] remains valid.
+///
+/// The reason for storing `&dyn Session` is because the primary motivation
+/// for implementing this struct is [`crate::table_provider::FFI_TableProvider`]
+/// which has methods that require `&dyn Session`. For usage within this crate
+/// we know the [`Session`] lifetimes are valid.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub(crate) struct FFI_SessionRef {
+    session_id: unsafe extern "C" fn(&Self) -> RStr,
+
+    config: unsafe extern "C" fn(&Self) -> FFI_SessionConfig,
+
+    create_physical_plan: unsafe extern "C" fn(
+        &Self,
+        logical_plan_serialized: RVec<u8>,
+    )
+        -> FfiFuture<FFIResult<FFI_ExecutionPlan>>,
+
+    create_physical_expr: unsafe extern "C" fn(
+        &Self,
+        expr_serialized: RVec<u8>,
+        schema: WrappedSchema,
+    ) -> FFIResult<FFI_PhysicalExpr>,
+
+    scalar_functions: unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_ScalarUDF>,
+
+    aggregate_functions:
+        unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_AggregateUDF>,
+
+    window_functions: unsafe extern "C" fn(&Self) -> RHashMap<RString, FFI_WindowUDF>,
+
+    table_options: unsafe extern "C" fn(&Self) -> RHashMap<RString, RString>,
+
+    default_table_options: unsafe extern "C" fn(&Self) -> RHashMap<RString, RString>,
+
+    task_ctx: unsafe extern "C" fn(&Self) -> FFI_TaskContext,
+
+    logical_codec: FFI_LogicalExtensionCodec,
+
+    /// Used to create a clone on the provider of the registry. This should
+    /// only need to be called by the receiver of the plan.
+    clone: unsafe extern "C" fn(plan: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    release: unsafe extern "C" fn(arg: &mut Self),
+
+    /// Return the major DataFusion version number of this registry.
+    pub version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the plan.
+    /// A [`ForeignSession`] should never attempt to access this data.
+    private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_SessionRef {}
+unsafe impl Sync for FFI_SessionRef {}
+
+struct SessionPrivateData<'a> {
+    session: &'a (dyn Session + Send + Sync),
+    runtime: Option<Handle>,
+}
+
+impl FFI_SessionRef {
+    fn inner(&self) -> &(dyn Session + Send + Sync) {
+        let private_data = self.private_data as *const SessionPrivateData;
+        unsafe { (*private_data).session }
+    }
+
+    unsafe fn runtime(&self) -> &Option<Handle> {
+        unsafe {
+            let private_data = self.private_data as *const SessionPrivateData;
+            &(*private_data).runtime
+        }
+    }
+}
+
+unsafe extern "C" fn session_id_fn_wrapper(session: &FFI_SessionRef) -> RStr<'_> {
+    let session = session.inner();
+    session.session_id().into()
+}
+
+unsafe extern "C" fn config_fn_wrapper(session: &FFI_SessionRef) -> FFI_SessionConfig {
+    let session = session.inner();
+    session.config().into()
+}
+
+unsafe extern "C" fn create_physical_plan_fn_wrapper(
+    session: &FFI_SessionRef,
+    logical_plan_serialized: RVec<u8>,
+) -> FfiFuture<FFIResult<FFI_ExecutionPlan>> {
+    unsafe {
+        let runtime = session.runtime().clone();
+        let session = session.clone();
+        async move {
+            let session = session.inner();
+            let task_ctx = session.task_ctx();
+
+            let logical_plan = rresult_return!(logical_plan_from_bytes(
+                logical_plan_serialized.as_slice(),
+                task_ctx.as_ref(),
+            ));
+
+            let physical_plan = session.create_physical_plan(&logical_plan).await;
+
+            rresult!(physical_plan.map(|plan| FFI_ExecutionPlan::new(plan, runtime)))
+        }
+        .into_ffi()
+    }
+}
+
+unsafe extern "C" fn create_physical_expr_fn_wrapper(
+    session: &FFI_SessionRef,
+    expr_serialized: RVec<u8>,
+    schema: WrappedSchema,
+) -> FFIResult<FFI_PhysicalExpr> {
+    let codec: Arc<dyn LogicalExtensionCodec> = (&session.logical_codec).into();
+    let session = session.inner();
+
+    let logical_expr = LogicalExprNode::decode(expr_serialized.as_slice()).unwrap();
+    let logical_expr =
+        parse_expr(&logical_expr, session.task_ctx().as_ref(), codec.as_ref()).unwrap();
+    let schema: SchemaRef = schema.into();
+    let schema: DFSchema = rresult_return!(schema.try_into());
+
+    let physical_expr =
+        rresult_return!(session.create_physical_expr(logical_expr, &schema));
+
+    RResult::ROk(physical_expr.into())
+}
+
+unsafe extern "C" fn scalar_functions_fn_wrapper(
+    session: &FFI_SessionRef,
+) -> RHashMap<RString, FFI_ScalarUDF> {
+    let session = session.inner();
+    session
+        .scalar_functions()
+        .iter()
+        .map(|(name, udf)| (name.clone().into(), FFI_ScalarUDF::from(Arc::clone(udf))))
+        .collect()
+}
+
+unsafe extern "C" fn aggregate_functions_fn_wrapper(
+    session: &FFI_SessionRef,
+) -> RHashMap<RString, FFI_AggregateUDF> {
+    let session = session.inner();
+    session
+        .aggregate_functions()
+        .iter()
+        .map(|(name, udaf)| {
+            (
+                name.clone().into(),
+                FFI_AggregateUDF::from(Arc::clone(udaf)),
+            )
+        })
+        .collect()
+}
+
+unsafe extern "C" fn window_functions_fn_wrapper(
+    session: &FFI_SessionRef,
+) -> RHashMap<RString, FFI_WindowUDF> {
+    let session = session.inner();
+    session
+        .window_functions()
+        .iter()
+        .map(|(name, udwf)| (name.clone().into(), FFI_WindowUDF::from(Arc::clone(udwf))))
+        .collect()
+}
+
+fn table_options_to_rhash(mut options: TableOptions) -> RHashMap<RString, RString> {
+    // It is important that we mutate options here and set current format
+    // to None so that when we call `entries()` we get ALL format entries.
+    // We will pass current_format as a special case and strip it on the
+    // other side of the boundary.
+    let current_format = options.current_format.take();
+    let mut options: HashMap<RString, RString> = options
+        .entries()
+        .into_iter()
+        .filter_map(|entry| entry.value.map(|v| (entry.key.into(), v.into())))
+        .collect();
+    if let Some(current_format) = current_format {
+        options.insert(
+            "datafusion_ffi.table_current_format".into(),
+            match current_format {
+                ConfigFileType::JSON => "json",
+                ConfigFileType::PARQUET => "parquet",
+                ConfigFileType::CSV => "csv",
+            }
+            .into(),
+        );
+    }
+
+    options.into()
+}
+
+unsafe extern "C" fn table_options_fn_wrapper(
+    session: &FFI_SessionRef,
+) -> RHashMap<RString, RString> {
+    let session = session.inner();
+    let table_options = session.table_options();
+    table_options_to_rhash(table_options.clone())
+}
+
+unsafe extern "C" fn default_table_options_fn_wrapper(
+    session: &FFI_SessionRef,
+) -> RHashMap<RString, RString> {
+    let session = session.inner();
+    let table_options = session.default_table_options();
+
+    table_options_to_rhash(table_options)
+}
+
+unsafe extern "C" fn task_ctx_fn_wrapper(session: &FFI_SessionRef) -> FFI_TaskContext {
+    session.inner().task_ctx().into()
+}
+
+unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_SessionRef) {
+    unsafe {
+        let private_data =
+            Box::from_raw(provider.private_data as *mut SessionPrivateData);
+        drop(private_data);
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(provider: &FFI_SessionRef) -> FFI_SessionRef {
+    unsafe {
+        let old_private_data = provider.private_data as *const SessionPrivateData;
+
+        let private_data = Box::into_raw(Box::new(SessionPrivateData {
+            session: (*old_private_data).session,
+            runtime: (*old_private_data).runtime.clone(),
+        })) as *mut c_void;
+
+        FFI_SessionRef {
+            session_id: session_id_fn_wrapper,
+            config: config_fn_wrapper,
+            create_physical_plan: create_physical_plan_fn_wrapper,
+            create_physical_expr: create_physical_expr_fn_wrapper,
+            scalar_functions: scalar_functions_fn_wrapper,
+            aggregate_functions: aggregate_functions_fn_wrapper,
+            window_functions: window_functions_fn_wrapper,
+            table_options: table_options_fn_wrapper,
+            default_table_options: default_table_options_fn_wrapper,
+            task_ctx: task_ctx_fn_wrapper,
+            logical_codec: provider.logical_codec.clone(),
+
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_SessionRef {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl FFI_SessionRef {
+    /// Creates a new [`FFI_SessionRef`].
+    pub fn new(
+        session: &(dyn Session + Send + Sync),
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
+    ) -> Self {
+        if let Some(session) = session.as_any().downcast_ref::<ForeignSession>() {
+            return session.session.clone();
+        }
+
+        let private_data = Box::new(SessionPrivateData { session, runtime });
+
+        Self {
+            session_id: session_id_fn_wrapper,
+            config: config_fn_wrapper,
+            create_physical_plan: create_physical_plan_fn_wrapper,
+            create_physical_expr: create_physical_expr_fn_wrapper,
+            scalar_functions: scalar_functions_fn_wrapper,
+            aggregate_functions: aggregate_functions_fn_wrapper,
+            window_functions: window_functions_fn_wrapper,
+            table_options: table_options_fn_wrapper,
+            default_table_options: default_table_options_fn_wrapper,
+            task_ctx: task_ctx_fn_wrapper,
+            logical_codec,
+
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_Session to interact with the foreign table provider.
+#[derive(Debug)]
+pub struct ForeignSession {
+    session: FFI_SessionRef,
+    config: SessionConfig,
+    scalar_functions: HashMap<String, Arc<ScalarUDF>>,
+    aggregate_functions: HashMap<String, Arc<AggregateUDF>>,
+    window_functions: HashMap<String, Arc<WindowUDF>>,
+    table_options: TableOptions,
+    runtime_env: Arc<RuntimeEnv>,
+    props: ExecutionProps,
+}
+
+unsafe impl Send for ForeignSession {}
+unsafe impl Sync for ForeignSession {}
+
+impl FFI_SessionRef {
+    pub fn as_local(&self) -> Option<&(dyn Session + Send + Sync)> {
+        if (self.library_marker_id)() == crate::get_library_marker_id() {
+            return Some(self.inner());
+        }
+        None
+    }
+}
+
+impl TryFrom<&FFI_SessionRef> for ForeignSession {
+    type Error = DataFusionError;
+    fn try_from(session: &FFI_SessionRef) -> Result<Self, Self::Error> {
+        unsafe {
+            let table_options =
+                table_options_from_rhashmap((session.table_options)(session));
+
+            let config = (session.config)(session);
+            let config = SessionConfig::try_from(&config)?;
+
+            let scalar_functions = (session.scalar_functions)(session)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udf = <Arc<dyn ScalarUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(ScalarUDF::new_from_shared_impl(udf)),
+                    )
+                })
+                .collect();
+            let aggregate_functions = (session.aggregate_functions)(session)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udaf = <Arc<dyn AggregateUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(AggregateUDF::new_from_shared_impl(udaf)),
+                    )
+                })
+                .collect();
+            let window_functions = (session.window_functions)(session)
+                .into_iter()
+                .map(|kv_pair| {
+                    let udwf = <Arc<dyn WindowUDFImpl>>::from(&kv_pair.1);
+
+                    (
+                        kv_pair.0.into_string(),
+                        Arc::new(WindowUDF::new_from_shared_impl(udwf)),
+                    )
+                })
+                .collect();
+
+            Ok(Self {
+                session: session.clone(),
+                config,
+                table_options,
+                scalar_functions,
+                aggregate_functions,
+                window_functions,
+                runtime_env: Default::default(),
+                props: Default::default(),
+            })
+        }
+    }
+}
+
+impl Clone for FFI_SessionRef {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+fn table_options_from_rhashmap(options: RHashMap<RString, RString>) -> TableOptions {
+    let mut options: HashMap<String, String> = options
+        .into_iter()
+        .map(|kv_pair| (kv_pair.0.into_string(), kv_pair.1.into_string()))
+        .collect();
+    let current_format = options.remove("datafusion_ffi.table_current_format");
+
+    let mut table_options = TableOptions::default();
+    let formats = [
+        ConfigFileType::CSV,
+        ConfigFileType::JSON,
+        ConfigFileType::PARQUET,
+    ];
+    for format in formats {
+        // It is imperative that if new enum variants are added below that they be
+        // included in the formats list above and in the extension check below.
+        let format_name = match &format {
+            ConfigFileType::CSV => "csv",
+            ConfigFileType::PARQUET => "parquet",
+            ConfigFileType::JSON => "json",
+        };
+        let format_options: HashMap<String, String> = options
+            .iter()
+            .filter_map(|(k, v)| {
+                let (prefix, key) = k.split_once(".")?;
+                if prefix == format_name {
+                    Some((format!("format.{key}"), v.to_owned()))
+                } else {
+                    None
+                }
+            })
+            .collect();
+        if !format_options.is_empty() {
+            table_options.current_format = Some(format.clone());
+            table_options
+                .alter_with_string_hash_map(&format_options)
+                .unwrap_or_else(|err| log::warn!("Error parsing table options: {err}"));
+        }
+    }
+
+    let extension_options: HashMap<String, String> = options
+        .iter()
+        .filter_map(|(k, v)| {
+            let (prefix, _) = k.split_once(".")?;
+            if !["json", "parquet", "csv"].contains(&prefix) {
+                Some((k.to_owned(), v.to_owned()))
+            } else {
+                None
+            }
+        })
+        .collect();
+    if !extension_options.is_empty() {
+        table_options
+            .alter_with_string_hash_map(&extension_options)
+            .unwrap_or_else(|err| log::warn!("Error parsing table options: {err}"));
+    }
+
+    table_options.current_format =
+        current_format.and_then(|format| match format.as_str() {
+            "csv" => Some(ConfigFileType::CSV),
+            "parquet" => Some(ConfigFileType::PARQUET),
+            "json" => Some(ConfigFileType::JSON),
+            _ => None,
+        });
+    table_options
+}
+
+#[async_trait]
+impl Session for ForeignSession {
+    fn session_id(&self) -> &str {
+        unsafe { (self.session.session_id)(&self.session).as_str() }
+    }
+
+    fn config(&self) -> &SessionConfig {
+        &self.config
+    }
+
+    fn config_options(&self) -> &ConfigOptions {
+        self.config.options()
+    }
+
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        unsafe {
+            let logical_plan = logical_plan_to_bytes(logical_plan)?;
+            let physical_plan = df_result!(
+                (self.session.create_physical_plan)(
+                    &self.session,
+                    logical_plan.as_ref().into()
+                )
+                .await
+            )?;
+            let physical_plan = <Arc<dyn ExecutionPlan>>::try_from(&physical_plan)?;
+
+            Ok(physical_plan)
+        }
+    }
+
+    fn create_physical_expr(
+        &self,
+        expr: Expr,
+        df_schema: &DFSchema,
+    ) -> datafusion_common::Result<Arc<dyn PhysicalExpr>> {
+        unsafe {
+            let codec: Arc<dyn LogicalExtensionCodec> =
+                (&self.session.logical_codec).into();
+            let logical_expr = serialize_expr(&expr, codec.as_ref())?.encode_to_vec();
+            let schema = WrappedSchema(FFI_ArrowSchema::try_from(df_schema.as_arrow())?);
+
+            let physical_expr = df_result!((self.session.create_physical_expr)(
+                &self.session,
+                logical_expr.into(),
+                schema
+            ))?;
+
+            Ok((&physical_expr).into())
+        }
+    }
+
+    fn scalar_functions(&self) -> &HashMap<String, Arc<ScalarUDF>> {
+        &self.scalar_functions
+    }
+
+    fn aggregate_functions(&self) -> &HashMap<String, Arc<AggregateUDF>> {
+        &self.aggregate_functions
+    }
+
+    fn window_functions(&self) -> &HashMap<String, Arc<WindowUDF>> {
+        &self.window_functions
+    }
+
+    fn runtime_env(&self) -> &Arc<RuntimeEnv> {
+        &self.runtime_env
+    }
+
+    fn execution_props(&self) -> &ExecutionProps {
+        &self.props
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn table_options(&self) -> &TableOptions {
+        &self.table_options
+    }
+
+    fn default_table_options(&self) -> TableOptions {
+        unsafe {
+            table_options_from_rhashmap((self.session.default_table_options)(
+                &self.session,
+            ))
+        }
+    }
+
+    fn table_options_mut(&mut self) -> &mut TableOptions {
+        log::warn!(
+            "Mutating table options is not supported via FFI. Changes will not have an effect."
+        );
+        &mut self.table_options
+    }
+
+    fn task_ctx(&self) -> Arc<TaskContext> {
+        unsafe { (self.session.task_ctx)(&self.session).into() }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion::execution::SessionStateBuilder;
+    use datafusion_common::DataFusionError;
+    use datafusion_expr::col;
+    use datafusion_expr::registry::FunctionRegistry;
+    use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_ffi_session() -> Result<(), DataFusionError> {
+        let (ctx, task_ctx_provider) = crate::util::tests::test_session_and_ctx();
+        let mut table_options = TableOptions::default();
+        table_options.csv.has_header = Some(true);
+        table_options.json.schema_infer_max_rec = Some(10);
+        table_options.parquet.global.coerce_int96 = Some("123456789".into());
+        table_options.current_format = Some(ConfigFileType::JSON);
+
+        let state = SessionStateBuilder::new_from_existing(ctx.state())
+            .with_table_options(table_options)
+            .build();
+
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            Arc::new(DefaultLogicalExtensionCodec {}),
+            None,
+            task_ctx_provider,
+        );
+
+        let local_session = FFI_SessionRef::new(&state, None, logical_codec);
+        let foreign_session = ForeignSession::try_from(&local_session)?;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+        let df_schema = schema.try_into()?;
+        let physical_expr = foreign_session.create_physical_expr(col("a"), &df_schema)?;
+        assert_eq!(
+            format!("{physical_expr:?}"),
+            "Column { name: \"a\", index: 0 }"
+        );
+
+        assert_eq!(foreign_session.session_id(), state.session_id());
+
+        let logical_plan = LogicalPlan::default();
+        let physical_plan = foreign_session.create_physical_plan(&logical_plan).await?;
+        assert_eq!(
+            format!("{physical_plan:?}"),
+            "EmptyExec { schema: Schema { fields: [], metadata: {} }, partitions: 1, cache: PlanProperties { eq_properties: EquivalenceProperties { eq_group: EquivalenceGroup { map: {}, classes: [] }, oeq_class: OrderingEquivalenceClass { orderings: [] }, oeq_cache: OrderingEquivalenceCache { normal_cls: OrderingEquivalenceClass { orderings: [] }, leading_map: {} }, constraints: Constraints { inner: [] }, schema: Schema { fields: [], metadata: {} } }, partitioning: UnknownPartitioning(1), emission_type: Incremental, boundedness: Bounded, evaluation_type: Lazy, scheduling_type: Cooperative, output_ordering: None } }"
+        );
+
+        assert_eq!(
+            format!("{:?}", foreign_session.default_table_options()),
+            format!("{:?}", state.default_table_options())
+        );
+
+        assert_eq!(
+            format!("{:?}", foreign_session.table_options()),
+            format!("{:?}", state.table_options())
+        );
+
+        let local_udfs = state.udfs();
+        for udf in foreign_session.scalar_functions().keys() {
+            assert!(local_udfs.contains(udf));
+        }
+        let local_udafs = state.udafs();
+        for udaf in foreign_session.aggregate_functions().keys() {
+            assert!(local_udafs.contains(udaf));
+        }
+        let local_udwfs = state.udwfs();
+        for udwf in foreign_session.window_functions().keys() {
+            assert!(local_udwfs.contains(udwf));
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs
index 890511997a706..4a89bb025a56d 100644
--- a/datafusion/ffi/src/table_provider.rs
+++ b/datafusion/ffi/src/table_provider.rs
@@ -15,46 +15,39 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{any::Any, ffi::c_void, sync::Arc};
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
 
-use abi_stable::{
-    std_types::{ROption, RResult, RString, RVec},
-    StableAbi,
-};
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RVec};
 use arrow::datatypes::SchemaRef;
 use async_ffi::{FfiFuture, FutureExt};
 use async_trait::async_trait;
-use datafusion::{
-    catalog::{Session, TableProvider},
-    datasource::TableType,
-    error::DataFusionError,
-    execution::{session_state::SessionStateBuilder, TaskContext},
-    logical_expr::{logical_plan::dml::InsertOp, TableProviderFilterPushDown},
-    physical_plan::ExecutionPlan,
-    prelude::{Expr, SessionContext},
-};
-use datafusion_proto::{
-    logical_plan::{
-        from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec,
-    },
-    protobuf::LogicalExprList,
+use datafusion_catalog::{Session, TableProvider};
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_execution::TaskContext;
+use datafusion_expr::dml::InsertOp;
+use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_proto::logical_plan::from_proto::parse_exprs;
+use datafusion_proto::logical_plan::to_proto::serialize_exprs;
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
+use datafusion_proto::protobuf::LogicalExprList;
 use prost::Message;
 use tokio::runtime::Handle;
 
-use crate::{
-    arrow_wrappers::WrappedSchema,
-    df_result, rresult_return,
-    session_config::ForeignSessionConfig,
-    table_source::{FFI_TableProviderFilterPushDown, FFI_TableType},
-};
-
-use super::{
-    execution_plan::{FFI_ExecutionPlan, ForeignExecutionPlan},
-    insert_op::FFI_InsertOp,
-    session_config::FFI_SessionConfig,
-};
-use datafusion::error::Result;
+use super::execution_plan::FFI_ExecutionPlan;
+use super::insert_op::FFI_InsertOp;
+use crate::arrow_wrappers::WrappedSchema;
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::session::{FFI_SessionRef, ForeignSession};
+use crate::table_source::{FFI_TableProviderFilterPushDown, FFI_TableType};
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
 
 /// A stable struct for sharing [`TableProvider`] across FFI boundaries.
 ///
@@ -97,65 +90,69 @@ use datafusion::error::Result;
 /// side of the interface each object refers to.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_TableProvider {
     /// Return the table schema
-    pub schema: unsafe extern "C" fn(provider: &Self) -> WrappedSchema,
+    schema: unsafe extern "C" fn(provider: &Self) -> WrappedSchema,
 
     /// Perform a scan on the table. See [`TableProvider`] for detailed usage information.
     ///
     /// # Arguments
     ///
     /// * `provider` - the table provider
-    /// * `session_config` - session configuration
+    /// * `session` - session
     /// * `projections` - if specified, only a subset of the columns are returned
     /// * `filters_serialized` - filters to apply to the scan, which are a
     ///   [`LogicalExprList`] protobuf message serialized into bytes to pass
     ///   across the FFI boundary.
     /// * `limit` - if specified, limit the number of rows returned
-    pub scan: unsafe extern "C" fn(
+    scan: unsafe extern "C" fn(
         provider: &Self,
-        session_config: &FFI_SessionConfig,
-        projections: RVec<usize>,
+        session: FFI_SessionRef,
+        projections: ROption<RVec<usize>>,
         filters_serialized: RVec<u8>,
         limit: ROption<usize>,
-    ) -> FfiFuture<RResult<FFI_ExecutionPlan, RString>>,
+    ) -> FfiFuture<FFIResult<FFI_ExecutionPlan>>,
 
     /// Return the type of table. See [`TableType`] for options.
-    pub table_type: unsafe extern "C" fn(provider: &Self) -> FFI_TableType,
+    table_type: unsafe extern "C" fn(provider: &Self) -> FFI_TableType,
 
     /// Based upon the input filters, identify which are supported. The filters
     /// are a [`LogicalExprList`] protobuf message serialized into bytes to pass
     /// across the FFI boundary.
-    pub supports_filters_pushdown: Option<
+    supports_filters_pushdown: Option<
         unsafe extern "C" fn(
             provider: &FFI_TableProvider,
             filters_serialized: RVec<u8>,
-        )
-            -> RResult<RVec<FFI_TableProviderFilterPushDown>, RString>,
+        ) -> FFIResult<RVec<FFI_TableProviderFilterPushDown>>,
     >,
 
-    pub insert_into:
-        unsafe extern "C" fn(
-            provider: &Self,
-            session_config: &FFI_SessionConfig,
-            input: &FFI_ExecutionPlan,
-            insert_op: FFI_InsertOp,
-        ) -> FfiFuture<RResult<FFI_ExecutionPlan, RString>>,
+    insert_into: unsafe extern "C" fn(
+        provider: &Self,
+        session: FFI_SessionRef,
+        input: &FFI_ExecutionPlan,
+        insert_op: FFI_InsertOp,
+    ) -> FfiFuture<FFIResult<FFI_ExecutionPlan>>,
+
+    pub logical_codec: FFI_LogicalExtensionCodec,
 
     /// Used to create a clone on the provider of the execution plan. This should
     /// only need to be called by the receiver of the plan.
-    pub clone: unsafe extern "C" fn(plan: &Self) -> Self,
+    clone: unsafe extern "C" fn(plan: &Self) -> Self,
 
     /// Release the memory of the private data when it is no longer being used.
-    pub release: unsafe extern "C" fn(arg: &mut Self),
+    release: unsafe extern "C" fn(arg: &mut Self),
 
     /// Return the major DataFusion version number of this provider.
     pub version: unsafe extern "C" fn() -> u64,
 
     /// Internal data. This is only to be accessed by the provider of the plan.
-    /// A [`ForeignExecutionPlan`] should never attempt to access this data.
-    pub private_data: *mut c_void,
+    /// A [`ForeignTableProvider`] should never attempt to access this data.
+    private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_TableProvider {}
@@ -166,36 +163,41 @@ struct ProviderPrivateData {
     runtime: Option<Handle>,
 }
 
-unsafe extern "C" fn schema_fn_wrapper(provider: &FFI_TableProvider) -> WrappedSchema {
-    let private_data = provider.private_data as *const ProviderPrivateData;
-    let provider = &(*private_data).provider;
+impl FFI_TableProvider {
+    fn inner(&self) -> &Arc<dyn TableProvider + Send> {
+        let private_data = self.private_data as *const ProviderPrivateData;
+        unsafe { &(*private_data).provider }
+    }
+
+    fn runtime(&self) -> &Option<Handle> {
+        let private_data = self.private_data as *const ProviderPrivateData;
+        unsafe { &(*private_data).runtime }
+    }
+}
 
-    provider.schema().into()
+unsafe extern "C" fn schema_fn_wrapper(provider: &FFI_TableProvider) -> WrappedSchema {
+    provider.inner().schema().into()
 }
 
 unsafe extern "C" fn table_type_fn_wrapper(
     provider: &FFI_TableProvider,
 ) -> FFI_TableType {
-    let private_data = provider.private_data as *const ProviderPrivateData;
-    let provider = &(*private_data).provider;
-
-    provider.table_type().into()
+    provider.inner().table_type().into()
 }
 
 fn supports_filters_pushdown_internal(
     provider: &Arc<dyn TableProvider + Send>,
     filters_serialized: &[u8],
+    task_ctx: &Arc<TaskContext>,
+    codec: &dyn LogicalExtensionCodec,
 ) -> Result<RVec<FFI_TableProviderFilterPushDown>> {
-    let default_ctx = SessionContext::new();
-    let codec = DefaultLogicalExtensionCodec {};
-
     let filters = match filters_serialized.is_empty() {
         true => vec![],
         false => {
             let proto_filters = LogicalExprList::decode(filters_serialized)
                 .map_err(|e| DataFusionError::Plan(e.to_string()))?;
 
-            parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)?
+            parse_exprs(proto_filters.expr.iter(), task_ctx.as_ref(), codec)?
         }
     };
     let filters_borrowed: Vec<&Expr> = filters.iter().collect();
@@ -212,119 +214,128 @@ fn supports_filters_pushdown_internal(
 unsafe extern "C" fn supports_filters_pushdown_fn_wrapper(
     provider: &FFI_TableProvider,
     filters_serialized: RVec<u8>,
-) -> RResult<RVec<FFI_TableProviderFilterPushDown>, RString> {
-    let private_data = provider.private_data as *const ProviderPrivateData;
-    let provider = &(*private_data).provider;
-
-    supports_filters_pushdown_internal(provider, &filters_serialized)
-        .map_err(|e| e.to_string().into())
-        .into()
+) -> FFIResult<RVec<FFI_TableProviderFilterPushDown>> {
+    let logical_codec: Arc<dyn LogicalExtensionCodec> = (&provider.logical_codec).into();
+    let task_ctx = rresult_return!(<Arc<TaskContext>>::try_from(
+        &provider.logical_codec.task_ctx_provider
+    ));
+    supports_filters_pushdown_internal(
+        provider.inner(),
+        &filters_serialized,
+        &task_ctx,
+        logical_codec.as_ref(),
+    )
+    .map_err(|e| e.to_string().into())
+    .into()
 }
 
 unsafe extern "C" fn scan_fn_wrapper(
     provider: &FFI_TableProvider,
-    session_config: &FFI_SessionConfig,
-    projections: RVec<usize>,
+    session: FFI_SessionRef,
+    projections: ROption<RVec<usize>>,
     filters_serialized: RVec<u8>,
     limit: ROption<usize>,
-) -> FfiFuture<RResult<FFI_ExecutionPlan, RString>> {
-    let private_data = provider.private_data as *mut ProviderPrivateData;
-    let internal_provider = &(*private_data).provider;
-    let session_config = session_config.clone();
-    let runtime = &(*private_data).runtime;
+) -> FfiFuture<FFIResult<FFI_ExecutionPlan>> {
+    let task_ctx: Result<Arc<TaskContext>, DataFusionError> =
+        (&provider.logical_codec.task_ctx_provider).try_into();
+    let runtime = provider.runtime().clone();
+    let logical_codec: Arc<dyn LogicalExtensionCodec> = (&provider.logical_codec).into();
+    let internal_provider = Arc::clone(provider.inner());
 
     async move {
-        let config = rresult_return!(ForeignSessionConfig::try_from(&session_config));
-        let session = SessionStateBuilder::new()
-            .with_default_features()
-            .with_config(config.0)
-            .build();
-        let ctx = SessionContext::new_with_state(session);
+        let mut foreign_session = None;
+        let session = rresult_return!(
+            session
+                .as_local()
+                .map(Ok::<&(dyn Session + Send + Sync), DataFusionError>)
+                .unwrap_or_else(|| {
+                    foreign_session = Some(ForeignSession::try_from(&session)?);
+                    Ok(foreign_session.as_ref().unwrap())
+                })
+        );
 
+        let task_ctx = rresult_return!(task_ctx);
         let filters = match filters_serialized.is_empty() {
             true => vec![],
             false => {
-                let default_ctx = SessionContext::new();
-                let codec = DefaultLogicalExtensionCodec {};
-
                 let proto_filters =
                     rresult_return!(LogicalExprList::decode(filters_serialized.as_ref()));
 
                 rresult_return!(parse_exprs(
                     proto_filters.expr.iter(),
-                    &default_ctx,
-                    &codec
+                    task_ctx.as_ref(),
+                    logical_codec.as_ref(),
                 ))
             }
         };
 
-        let projections: Vec<_> = projections.into_iter().collect();
+        let projections: Option<Vec<usize>> =
+            projections.into_option().map(|p| p.into_iter().collect());
 
         let plan = rresult_return!(
             internal_provider
-                .scan(&ctx.state(), Some(&projections), &filters, limit.into())
+                .scan(session, projections.as_ref(), &filters, limit.into())
                 .await
         );
 
-        RResult::ROk(FFI_ExecutionPlan::new(
-            plan,
-            ctx.task_ctx(),
-            runtime.clone(),
-        ))
+        RResult::ROk(FFI_ExecutionPlan::new(plan, runtime.clone()))
     }
     .into_ffi()
 }
 
 unsafe extern "C" fn insert_into_fn_wrapper(
     provider: &FFI_TableProvider,
-    session_config: &FFI_SessionConfig,
+    session: FFI_SessionRef,
     input: &FFI_ExecutionPlan,
     insert_op: FFI_InsertOp,
-) -> FfiFuture<RResult<FFI_ExecutionPlan, RString>> {
-    let private_data = provider.private_data as *mut ProviderPrivateData;
-    let internal_provider = &(*private_data).provider;
-    let session_config = session_config.clone();
+) -> FfiFuture<FFIResult<FFI_ExecutionPlan>> {
+    let runtime = provider.runtime().clone();
+    let internal_provider = Arc::clone(provider.inner());
     let input = input.clone();
-    let runtime = &(*private_data).runtime;
 
     async move {
-        let config = rresult_return!(ForeignSessionConfig::try_from(&session_config));
-        let session = SessionStateBuilder::new()
-            .with_default_features()
-            .with_config(config.0)
-            .build();
-        let ctx = SessionContext::new_with_state(session);
+        let mut foreign_session = None;
+        let session = rresult_return!(
+            session
+                .as_local()
+                .map(Ok::<&(dyn Session + Send + Sync), DataFusionError>)
+                .unwrap_or_else(|| {
+                    foreign_session = Some(ForeignSession::try_from(&session)?);
+                    Ok(foreign_session.as_ref().unwrap())
+                })
+        );
 
-        let input = rresult_return!(ForeignExecutionPlan::try_from(&input).map(Arc::new));
+        let input = rresult_return!(<Arc<dyn ExecutionPlan>>::try_from(&input));
 
         let insert_op = InsertOp::from(insert_op);
 
         let plan = rresult_return!(
             internal_provider
-                .insert_into(&ctx.state(), input, insert_op)
+                .insert_into(session, input, insert_op)
                 .await
         );
 
-        RResult::ROk(FFI_ExecutionPlan::new(
-            plan,
-            ctx.task_ctx(),
-            runtime.clone(),
-        ))
+        RResult::ROk(FFI_ExecutionPlan::new(plan, runtime.clone()))
     }
     .into_ffi()
 }
 
 unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_TableProvider) {
-    let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!provider.private_data.is_null());
+        let private_data =
+            Box::from_raw(provider.private_data as *mut ProviderPrivateData);
+        drop(private_data);
+        provider.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(provider: &FFI_TableProvider) -> FFI_TableProvider {
-    let old_private_data = provider.private_data as *const ProviderPrivateData;
-    let runtime = (*old_private_data).runtime.clone();
+    let runtime = provider.runtime().clone();
+    let old_provider = Arc::clone(provider.inner());
 
     let private_data = Box::into_raw(Box::new(ProviderPrivateData {
-        provider: Arc::clone(&(*old_private_data).provider),
+        provider: old_provider,
         runtime,
     })) as *mut c_void;
 
@@ -334,10 +345,12 @@ unsafe extern "C" fn clone_fn_wrapper(provider: &FFI_TableProvider) -> FFI_Table
         table_type: table_type_fn_wrapper,
         supports_filters_pushdown: provider.supports_filters_pushdown,
         insert_into: provider.insert_into,
+        logical_codec: provider.logical_codec.clone(),
         clone: clone_fn_wrapper,
         release: release_fn_wrapper,
         version: super::version,
         private_data,
+        library_marker_id: crate::get_library_marker_id,
     }
 }
 
@@ -353,7 +366,34 @@ impl FFI_TableProvider {
         provider: Arc<dyn TableProvider + Send>,
         can_support_pushdown_filters: bool,
         runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
+    ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+        Self::new_with_ffi_codec(
+            provider,
+            can_support_pushdown_filters,
+            runtime,
+            logical_codec,
+        )
+    }
+
+    pub fn new_with_ffi_codec(
+        provider: Arc<dyn TableProvider + Send>,
+        can_support_pushdown_filters: bool,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
     ) -> Self {
+        if let Some(provider) = provider.as_any().downcast_ref::<ForeignTableProvider>() {
+            return provider.0.clone();
+        }
         let private_data = Box::new(ProviderPrivateData { provider, runtime });
 
         Self {
@@ -365,10 +405,12 @@ impl FFI_TableProvider {
                 false => None,
             },
             insert_into: insert_into_fn_wrapper,
+            logical_codec,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
             version: super::version,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -383,9 +425,13 @@ pub struct ForeignTableProvider(pub FFI_TableProvider);
 unsafe impl Send for ForeignTableProvider {}
 unsafe impl Sync for ForeignTableProvider {}
 
-impl From<&FFI_TableProvider> for ForeignTableProvider {
+impl From<&FFI_TableProvider> for Arc<dyn TableProvider> {
     fn from(provider: &FFI_TableProvider) -> Self {
-        Self(provider.clone())
+        if (provider.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(provider.inner()) as Arc<dyn TableProvider>
+        } else {
+            Arc::new(ForeignTableProvider(provider.clone()))
+        }
     }
 }
 
@@ -417,31 +463,32 @@ impl TableProvider for ForeignTableProvider {
         filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let session_config: FFI_SessionConfig = session.config().into();
+        let session = FFI_SessionRef::new(session, None, self.0.logical_codec.clone());
 
-        let projections: Option<RVec<usize>> =
-            projection.map(|p| p.iter().map(|v| v.to_owned()).collect());
+        let projections: ROption<RVec<usize>> = projection
+            .map(|p| p.iter().map(|v| v.to_owned()).collect())
+            .into();
 
-        let codec = DefaultLogicalExtensionCodec {};
+        let codec: Arc<dyn LogicalExtensionCodec> = (&self.0.logical_codec).into();
         let filter_list = LogicalExprList {
-            expr: serialize_exprs(filters, &codec)?,
+            expr: serialize_exprs(filters, codec.as_ref())?,
         };
         let filters_serialized = filter_list.encode_to_vec().into();
 
         let plan = unsafe {
             let maybe_plan = (self.0.scan)(
                 &self.0,
-                &session_config,
-                projections.unwrap_or_default(),
+                session,
+                projections,
                 filters_serialized,
                 limit.into(),
             )
             .await;
 
-            ForeignExecutionPlan::try_from(&df_result!(maybe_plan)?)?
+            <Arc<dyn ExecutionPlan>>::try_from(&df_result!(maybe_plan)?)?
         };
 
-        Ok(Arc::new(plan))
+        Ok(plan)
     }
 
     /// Tests whether the table provider can make use of a filter expression
@@ -457,14 +504,17 @@ impl TableProvider for ForeignTableProvider {
                     return Ok(vec![
                         TableProviderFilterPushDown::Unsupported;
                         filters.len()
-                    ])
+                    ]);
                 }
             };
 
-            let codec = DefaultLogicalExtensionCodec {};
+            let codec: Arc<dyn LogicalExtensionCodec> = (&self.0.logical_codec).into();
 
             let expr_list = LogicalExprList {
-                expr: serialize_exprs(filters.iter().map(|f| f.to_owned()), &codec)?,
+                expr: serialize_exprs(
+                    filters.iter().map(|f| f.to_owned()),
+                    codec.as_ref(),
+                )?,
             };
             let serialized_filters = expr_list.encode_to_vec();
 
@@ -480,37 +530,36 @@ impl TableProvider for ForeignTableProvider {
         input: Arc<dyn ExecutionPlan>,
         insert_op: InsertOp,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let session_config: FFI_SessionConfig = session.config().into();
+        let session = FFI_SessionRef::new(session, None, self.0.logical_codec.clone());
 
         let rc = Handle::try_current().ok();
-        let input =
-            FFI_ExecutionPlan::new(input, Arc::new(TaskContext::from(session)), rc);
+        let input = FFI_ExecutionPlan::new(input, rc);
         let insert_op: FFI_InsertOp = insert_op.into();
 
         let plan = unsafe {
             let maybe_plan =
-                (self.0.insert_into)(&self.0, &session_config, &input, insert_op).await;
+                (self.0.insert_into)(&self.0, session, &input, insert_op).await;
 
-            ForeignExecutionPlan::try_from(&df_result!(maybe_plan)?)?
+            <Arc<dyn ExecutionPlan>>::try_from(&df_result!(maybe_plan)?)?
         };
 
-        Ok(Arc::new(plan))
+        Ok(plan)
     }
 }
 
 #[cfg(test)]
 mod tests {
     use arrow::datatypes::Schema;
-    use datafusion::prelude::{col, lit};
+    use datafusion::prelude::{SessionContext, col, lit};
+    use datafusion_execution::TaskContextProvider;
 
     use super::*;
 
-    #[tokio::test]
-    async fn test_round_trip_ffi_table_provider_scan() -> Result<()> {
+    fn create_test_table_provider() -> Result<Arc<dyn TableProvider>> {
         use arrow::datatypes::Field;
-        use datafusion::arrow::{
-            array::Float32Array, datatypes::DataType, record_batch::RecordBatch,
-        };
+        use datafusion::arrow::array::Float32Array;
+        use datafusion::arrow::datatypes::DataType;
+        use datafusion::arrow::record_batch::RecordBatch;
         use datafusion::datasource::MemTable;
 
         let schema =
@@ -526,16 +575,26 @@ mod tests {
             vec![Arc::new(Float32Array::from(vec![64.0]))],
         )?;
 
-        let ctx = SessionContext::new();
+        Ok(Arc::new(MemTable::try_new(
+            schema,
+            vec![vec![batch1], vec![batch2]],
+        )?))
+    }
 
-        let provider =
-            Arc::new(MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?);
+    #[tokio::test]
+    async fn test_round_trip_ffi_table_provider_scan() -> Result<()> {
+        let provider = create_test_table_provider()?;
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
 
-        let ffi_provider = FFI_TableProvider::new(provider, true, None);
+        let mut ffi_provider =
+            FFI_TableProvider::new(provider, true, None, task_ctx_provider, None);
+        ffi_provider.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_table_provider: ForeignTableProvider = (&ffi_provider).into();
+        let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_provider).into();
 
-        ctx.register_table("t", Arc::new(foreign_table_provider))?;
+        ctx.register_table("t", foreign_table_provider)?;
 
         let df = ctx.table("t").await?;
 
@@ -549,35 +608,18 @@ mod tests {
 
     #[tokio::test]
     async fn test_round_trip_ffi_table_provider_insert_into() -> Result<()> {
-        use arrow::datatypes::Field;
-        use datafusion::arrow::{
-            array::Float32Array, datatypes::DataType, record_batch::RecordBatch,
-        };
-        use datafusion::datasource::MemTable;
+        let provider = create_test_table_provider()?;
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
 
-        let schema =
-            Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
+        let mut ffi_provider =
+            FFI_TableProvider::new(provider, true, None, task_ctx_provider, None);
+        ffi_provider.library_marker_id = crate::mock_foreign_marker_id;
 
-        // define data in two partitions
-        let batch1 = RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0]))],
-        )?;
-        let batch2 = RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![Arc::new(Float32Array::from(vec![64.0]))],
-        )?;
+        let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_provider).into();
 
-        let ctx = SessionContext::new();
-
-        let provider =
-            Arc::new(MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?);
-
-        let ffi_provider = FFI_TableProvider::new(provider, true, None);
-
-        let foreign_table_provider: ForeignTableProvider = (&ffi_provider).into();
-
-        ctx.register_table("t", Arc::new(foreign_table_provider))?;
+        ctx.register_table("t", foreign_table_provider)?;
 
         let result = ctx
             .sql("INSERT INTO t VALUES (128.0);")
@@ -600,9 +642,9 @@ mod tests {
     #[tokio::test]
     async fn test_aggregation() -> Result<()> {
         use arrow::datatypes::Field;
-        use datafusion::arrow::{
-            array::Float32Array, datatypes::DataType, record_batch::RecordBatch,
-        };
+        use datafusion::arrow::array::Float32Array;
+        use datafusion::arrow::datatypes::DataType;
+        use datafusion::arrow::record_batch::RecordBatch;
         use datafusion::common::assert_batches_eq;
         use datafusion::datasource::MemTable;
 
@@ -615,15 +657,19 @@ mod tests {
             vec![Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0]))],
         )?;
 
-        let ctx = SessionContext::new();
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
 
         let provider = Arc::new(MemTable::try_new(schema, vec![vec![batch1]])?);
 
-        let ffi_provider = FFI_TableProvider::new(provider, true, None);
+        let mut ffi_provider =
+            FFI_TableProvider::new(provider, true, None, task_ctx_provider, None);
+        ffi_provider.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_table_provider: ForeignTableProvider = (&ffi_provider).into();
+        let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_provider).into();
 
-        ctx.register_table("t", Arc::new(foreign_table_provider))?;
+        ctx.register_table("t", foreign_table_provider)?;
 
         let result = ctx
             .sql("SELECT COUNT(*) as cnt FROM t")
@@ -641,4 +687,93 @@ mod tests {
         assert_batches_eq!(expected, &result);
         Ok(())
     }
+
+    #[test]
+    fn test_ffi_table_provider_local_bypass() -> Result<()> {
+        let table_provider = create_test_table_provider()?;
+
+        let ctx = Arc::new(SessionContext::new()) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&ctx);
+        let mut ffi_table =
+            FFI_TableProvider::new(table_provider, false, None, task_ctx_provider, None);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_table: Arc<dyn TableProvider> = (&ffi_table).into();
+        assert!(
+            foreign_table
+                .as_any()
+                .downcast_ref::<datafusion::datasource::MemTable>()
+                .is_some()
+        );
+
+        // Verify different library markers generate foreign providers
+        ffi_table.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_table: Arc<dyn TableProvider> = (&ffi_table).into();
+        assert!(
+            foreign_table
+                .as_any()
+                .downcast_ref::<ForeignTableProvider>()
+                .is_some()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_scan_with_none_projection_returns_all_columns() -> Result<()> {
+        use arrow::datatypes::Field;
+        use datafusion::arrow::array::Float32Array;
+        use datafusion::arrow::datatypes::DataType;
+        use datafusion::arrow::record_batch::RecordBatch;
+        use datafusion::datasource::MemTable;
+        use datafusion::physical_plan::collect;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Float32, false),
+            Field::new("b", DataType::Float32, false),
+            Field::new("c", DataType::Float32, false),
+        ]));
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Float32Array::from(vec![1.0, 2.0])),
+                Arc::new(Float32Array::from(vec![3.0, 4.0])),
+                Arc::new(Float32Array::from(vec![5.0, 6.0])),
+            ],
+        )?;
+
+        let provider =
+            Arc::new(MemTable::try_new(Arc::clone(&schema), vec![vec![batch]])?);
+
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
+
+        // Wrap in FFI and force the foreign path (not local bypass)
+        let mut ffi_provider =
+            FFI_TableProvider::new(provider, true, None, task_ctx_provider, None);
+        ffi_provider.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_provider).into();
+
+        // Call scan with projection=None, meaning "return all columns"
+        let plan = foreign_table_provider
+            .scan(&ctx.state(), None, &[], None)
+            .await?;
+        assert_eq!(
+            plan.schema().fields().len(),
+            3,
+            "scan(projection=None) should return all columns; got {}",
+            plan.schema().fields().len()
+        );
+
+        // Also verify we can execute and get correct data
+        let batches = collect(plan, ctx.task_ctx()).await?;
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_columns(), 3);
+        assert_eq!(batches[0].num_rows(), 2);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/ffi/src/table_provider_factory.rs b/datafusion/ffi/src/table_provider_factory.rs
new file mode 100644
index 0000000000000..15789eeab0421
--- /dev/null
+++ b/datafusion/ffi/src/table_provider_factory.rs
@@ -0,0 +1,429 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{ffi::c_void, sync::Arc};
+
+use abi_stable::{
+    StableAbi,
+    std_types::{RResult, RString, RVec},
+};
+use async_ffi::{FfiFuture, FutureExt};
+use async_trait::async_trait;
+use datafusion_catalog::{Session, TableProvider, TableProviderFactory};
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_execution::TaskContext;
+use datafusion_expr::{CreateExternalTable, DdlStatement, LogicalPlan};
+use datafusion_proto::logical_plan::{
+    AsLogicalPlan, DefaultLogicalExtensionCodec, LogicalExtensionCodec,
+};
+use datafusion_proto::protobuf::LogicalPlanNode;
+use prost::Message;
+use tokio::runtime::Handle;
+
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::session::{FFI_SessionRef, ForeignSession};
+use crate::table_provider::{FFI_TableProvider, ForeignTableProvider};
+use crate::{df_result, rresult_return};
+
+/// A stable struct for sharing [`TableProviderFactory`] across FFI boundaries.
+///
+/// Similar to [`FFI_TableProvider`], this struct uses the FFI-safe pattern where:
+/// - The `FFI_*` struct exposes stable function pointers
+/// - Private data is stored as an opaque pointer
+/// - The `Foreign*` wrapper is used by consumers on the other side of the FFI boundary
+///
+/// [`FFI_TableProvider`]: crate::table_provider::FFI_TableProvider
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_TableProviderFactory {
+    /// Create a TableProvider with the given command.
+    ///
+    /// # Arguments
+    ///
+    /// * `factory` - the table provider factory
+    /// * `session_config` - session configuration
+    /// * `cmd_serialized` - a ['CreateExternalTable`] encoded as a [`LogicalPlanNode`] protobuf message serialized into bytes
+    ///   to pass across the FFI boundary.
+    create: unsafe extern "C" fn(
+        factory: &Self,
+        session: FFI_SessionRef,
+        cmd_serialized: RVec<u8>,
+    ) -> FfiFuture<RResult<FFI_TableProvider, RString>>,
+
+    logical_codec: FFI_LogicalExtensionCodec,
+
+    /// Used to create a clone of the factory. This should only need to be called
+    /// by the receiver of the factory.
+    clone: unsafe extern "C" fn(factory: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    release: unsafe extern "C" fn(factory: &mut Self),
+
+    /// Return the major DataFusion version number of this factory.
+    version: unsafe extern "C" fn() -> u64,
+
+    /// Internal data. This is only to be accessed by the provider of the factory.
+    /// A [`ForeignTableProviderFactory`] should never attempt to access this data.
+    private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_TableProviderFactory {}
+unsafe impl Sync for FFI_TableProviderFactory {}
+
+struct FactoryPrivateData {
+    factory: Arc<dyn TableProviderFactory + Send>,
+    runtime: Option<Handle>,
+}
+
+impl FFI_TableProviderFactory {
+    /// Creates a new [`FFI_TableProvider`].
+    pub fn new(
+        factory: Arc<dyn TableProviderFactory + Send>,
+        runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
+    ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+        Self::new_with_ffi_codec(factory, runtime, logical_codec)
+    }
+
+    pub fn new_with_ffi_codec(
+        factory: Arc<dyn TableProviderFactory + Send>,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
+    ) -> Self {
+        let private_data = Box::new(FactoryPrivateData { factory, runtime });
+
+        Self {
+            create: create_fn_wrapper,
+            logical_codec,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            version: super::version,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+
+    fn inner(&self) -> &Arc<dyn TableProviderFactory + Send> {
+        let private_data = self.private_data as *const FactoryPrivateData;
+        unsafe { &(*private_data).factory }
+    }
+
+    fn runtime(&self) -> &Option<Handle> {
+        let private_data = self.private_data as *const FactoryPrivateData;
+        unsafe { &(*private_data).runtime }
+    }
+
+    fn deserialize_cmd(
+        &self,
+        cmd_serialized: &RVec<u8>,
+    ) -> Result<CreateExternalTable, DataFusionError> {
+        let task_ctx: Arc<TaskContext> =
+            (&self.logical_codec.task_ctx_provider).try_into()?;
+        let logical_codec: Arc<dyn LogicalExtensionCodec> = (&self.logical_codec).into();
+
+        let plan = LogicalPlanNode::decode(cmd_serialized.as_ref())
+            .map_err(|e| DataFusionError::Internal(format!("{e:?}")))?;
+        match plan.try_into_logical_plan(&task_ctx, logical_codec.as_ref())? {
+            LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) => Ok(cmd),
+            _ => Err(DataFusionError::Internal(
+                "Invalid logical plan in FFI_TableProviderFactory.".to_owned(),
+            )),
+        }
+    }
+}
+
+impl Clone for FFI_TableProviderFactory {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl Drop for FFI_TableProviderFactory {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+impl From<&FFI_TableProviderFactory> for Arc<dyn TableProviderFactory> {
+    fn from(factory: &FFI_TableProviderFactory) -> Self {
+        if (factory.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(factory.inner()) as Arc<dyn TableProviderFactory>
+        } else {
+            Arc::new(ForeignTableProviderFactory(factory.clone()))
+        }
+    }
+}
+
+unsafe extern "C" fn create_fn_wrapper(
+    factory: &FFI_TableProviderFactory,
+    session: FFI_SessionRef,
+    cmd_serialized: RVec<u8>,
+) -> FfiFuture<RResult<FFI_TableProvider, RString>> {
+    let factory = factory.clone();
+
+    async move {
+        let provider = rresult_return!(
+            create_fn_wrapper_impl(factory, session, cmd_serialized).await
+        );
+        RResult::ROk(provider)
+    }
+    .into_ffi()
+}
+
+async fn create_fn_wrapper_impl(
+    factory: FFI_TableProviderFactory,
+    session: FFI_SessionRef,
+    cmd_serialized: RVec<u8>,
+) -> Result<FFI_TableProvider, DataFusionError> {
+    let runtime = factory.runtime().clone();
+    let ffi_logical_codec = factory.logical_codec.clone();
+    let internal_factory = Arc::clone(factory.inner());
+    let cmd = factory.deserialize_cmd(&cmd_serialized)?;
+
+    let mut foreign_session = None;
+    let session = session
+        .as_local()
+        .map(Ok::<&(dyn Session + Send + Sync), DataFusionError>)
+        .unwrap_or_else(|| {
+            foreign_session = Some(ForeignSession::try_from(&session)?);
+            Ok(foreign_session.as_ref().unwrap())
+        })?;
+
+    let provider = internal_factory.create(session, &cmd).await?;
+    Ok(FFI_TableProvider::new_with_ffi_codec(
+        provider,
+        true,
+        runtime.clone(),
+        ffi_logical_codec,
+    ))
+}
+
+unsafe extern "C" fn clone_fn_wrapper(
+    factory: &FFI_TableProviderFactory,
+) -> FFI_TableProviderFactory {
+    let runtime = factory.runtime().clone();
+    let old_factory = Arc::clone(factory.inner());
+
+    let private_data = Box::into_raw(Box::new(FactoryPrivateData {
+        factory: old_factory,
+        runtime,
+    })) as *mut c_void;
+
+    FFI_TableProviderFactory {
+        create: create_fn_wrapper,
+        logical_codec: factory.logical_codec.clone(),
+        clone: clone_fn_wrapper,
+        release: release_fn_wrapper,
+        version: super::version,
+        private_data,
+        library_marker_id: crate::get_library_marker_id,
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(factory: &mut FFI_TableProviderFactory) {
+    unsafe {
+        debug_assert!(!factory.private_data.is_null());
+        let private_data = Box::from_raw(factory.private_data as *mut FactoryPrivateData);
+        drop(private_data);
+        factory.private_data = std::ptr::null_mut();
+    }
+}
+
+/// This wrapper struct exists on the receiver side of the FFI interface, so it has
+/// no guarantees about being able to access the data in `private_data`. Any functions
+/// defined on this struct must only use the stable functions provided in
+/// FFI_TableProviderFactory to interact with the foreign table provider factory.
+#[derive(Debug)]
+pub struct ForeignTableProviderFactory(pub FFI_TableProviderFactory);
+
+impl ForeignTableProviderFactory {
+    fn serialize_cmd(
+        &self,
+        cmd: CreateExternalTable,
+    ) -> Result<RVec<u8>, DataFusionError> {
+        let logical_codec: Arc<dyn LogicalExtensionCodec> =
+            (&self.0.logical_codec).into();
+
+        let plan = LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd));
+        let plan: LogicalPlanNode =
+            AsLogicalPlan::try_from_logical_plan(&plan, logical_codec.as_ref())?;
+
+        let mut buf: Vec<u8> = Vec::new();
+        plan.try_encode(&mut buf)?;
+
+        Ok(buf.into())
+    }
+}
+
+unsafe impl Send for ForeignTableProviderFactory {}
+unsafe impl Sync for ForeignTableProviderFactory {}
+
+#[async_trait]
+impl TableProviderFactory for ForeignTableProviderFactory {
+    async fn create(
+        &self,
+        session: &dyn Session,
+        cmd: &CreateExternalTable,
+    ) -> Result<Arc<dyn TableProvider>> {
+        let session = FFI_SessionRef::new(session, None, self.0.logical_codec.clone());
+        let cmd = self.serialize_cmd(cmd.clone())?;
+
+        let provider = unsafe {
+            let maybe_provider = (self.0.create)(&self.0, session, cmd).await;
+
+            let ffi_provider = df_result!(maybe_provider)?;
+            ForeignTableProvider(ffi_provider)
+        };
+
+        Ok(Arc::new(provider))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::datatypes::Schema;
+    use datafusion::prelude::SessionContext;
+    use datafusion_common::{TableReference, ToDFSchema};
+    use datafusion_execution::TaskContextProvider;
+    use std::collections::HashMap;
+
+    use super::*;
+
+    #[derive(Debug)]
+    struct TestTableProviderFactory {}
+
+    #[async_trait]
+    impl TableProviderFactory for TestTableProviderFactory {
+        async fn create(
+            &self,
+            _session: &dyn Session,
+            _cmd: &CreateExternalTable,
+        ) -> Result<Arc<dyn TableProvider>> {
+            use arrow::datatypes::Field;
+            use datafusion::arrow::array::Float32Array;
+            use datafusion::arrow::datatypes::DataType;
+            use datafusion::arrow::record_batch::RecordBatch;
+            use datafusion::datasource::MemTable;
+
+            let schema =
+                Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
+
+            let batch1 = RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0]))],
+            )?;
+            let batch2 = RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![Arc::new(Float32Array::from(vec![64.0]))],
+            )?;
+
+            Ok(Arc::new(MemTable::try_new(
+                schema,
+                vec![vec![batch1], vec![batch2]],
+            )?))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_round_trip_ffi_table_provider_factory() -> Result<()> {
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
+
+        let factory = Arc::new(TestTableProviderFactory {});
+        let mut ffi_factory =
+            FFI_TableProviderFactory::new(factory, None, task_ctx_provider, None);
+        ffi_factory.library_marker_id = crate::mock_foreign_marker_id;
+
+        let factory: Arc<dyn TableProviderFactory> = (&ffi_factory).into();
+
+        let cmd = CreateExternalTable {
+            schema: Schema::empty().to_dfschema_ref()?,
+            name: TableReference::bare("test_table"),
+            location: "test".to_string(),
+            file_type: "test".to_string(),
+            table_partition_cols: vec![],
+            if_not_exists: false,
+            or_replace: false,
+            temporary: false,
+            definition: None,
+            order_exprs: vec![],
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: Default::default(),
+            column_defaults: HashMap::new(),
+        };
+
+        let provider = factory.create(&ctx.state(), &cmd).await?;
+
+        assert_eq!(provider.schema().fields().len(), 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_ffi_table_provider_factory_clone() -> Result<()> {
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
+
+        let factory = Arc::new(TestTableProviderFactory {});
+        let ffi_factory =
+            FFI_TableProviderFactory::new(factory, None, task_ctx_provider, None);
+
+        // Test that we can clone the factory
+        let cloned_factory = ffi_factory.clone();
+        let factory: Arc<dyn TableProviderFactory> = (&cloned_factory).into();
+
+        let cmd = CreateExternalTable {
+            schema: Schema::empty().to_dfschema_ref()?,
+            name: TableReference::bare("cloned_test"),
+            location: "test".to_string(),
+            file_type: "test".to_string(),
+            table_partition_cols: vec![],
+            if_not_exists: false,
+            or_replace: false,
+            temporary: false,
+            definition: None,
+            order_exprs: vec![],
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: Default::default(),
+            column_defaults: HashMap::new(),
+        };
+
+        let provider = factory.create(&ctx.state(), &cmd).await?;
+        assert_eq!(provider.schema().fields().len(), 1);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs
index 418fdf16a564f..2f17d9235a088 100644
--- a/datafusion/ffi/src/table_source.rs
+++ b/datafusion/ffi/src/table_source.rs
@@ -16,12 +16,11 @@
 // under the License.
 
 use abi_stable::StableAbi;
-use datafusion::{datasource::TableType, logical_expr::TableProviderFilterPushDown};
+use datafusion_expr::{TableProviderFilterPushDown, TableType};
 
 /// FFI safe version of [`TableProviderFilterPushDown`].
 #[repr(C)]
 #[derive(StableAbi)]
-#[allow(non_camel_case_types)]
 pub enum FFI_TableProviderFilterPushDown {
     Unsupported,
     Inexact,
@@ -58,7 +57,6 @@ impl From<&TableProviderFilterPushDown> for FFI_TableProviderFilterPushDown {
 
 /// FFI safe version of [`TableType`].
 #[repr(C)]
-#[allow(non_camel_case_types)]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, StableAbi)]
 pub enum FFI_TableType {
     Base,
@@ -88,9 +86,10 @@ impl From<TableType> for FFI_TableType {
 
 #[cfg(test)]
 mod tests {
-    use super::*;
     use datafusion::error::Result;
 
+    use super::*;
+
     fn round_trip_filter_pushdown(pushdown: TableProviderFilterPushDown) -> Result<()> {
         let ffi_pushdown: FFI_TableProviderFilterPushDown = (&pushdown).into();
         let round_trip: TableProviderFilterPushDown = (&ffi_pushdown).into();
diff --git a/datafusion/ffi/src/tests/async_provider.rs b/datafusion/ffi/src/tests/async_provider.rs
index 60434a7dda128..e9fa31a7fc6ed 100644
--- a/datafusion/ffi/src/tests/async_provider.rs
+++ b/datafusion/ffi/src/tests/async_provider.rs
@@ -25,27 +25,28 @@
 //! access the runtime, then you will get a panic when trying to do operations
 //! such as spawning a tokio task.
 
-use std::{any::Any, fmt::Debug, sync::Arc};
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
 
-use crate::table_provider::FFI_TableProvider;
 use arrow::array::RecordBatch;
 use arrow::datatypes::Schema;
 use async_trait::async_trait;
-use datafusion::{
-    catalog::{Session, TableProvider},
-    error::{DataFusionError, Result},
-    execution::RecordBatchStream,
-    physical_expr::EquivalenceProperties,
-    physical_plan::{ExecutionPlan, Partitioning},
-    prelude::Expr,
-};
+use datafusion_catalog::TableProvider;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, exec_err};
+use datafusion_execution::RecordBatchStream;
+use datafusion_expr::Expr;
+use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_session::Session;
 use futures::Stream;
-use tokio::{
-    runtime::Handle,
-    sync::{broadcast, mpsc},
-};
+use tokio::runtime::Handle;
+use tokio::sync::{broadcast, mpsc};
 
 use super::create_record_batch;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider::FFI_TableProvider;
 
 #[derive(Debug)]
 pub struct AsyncTableProvider {
@@ -59,7 +60,7 @@ fn async_table_provider_thread(
     mut shutdown: mpsc::Receiver<bool>,
     mut batch_request: mpsc::Receiver<bool>,
     batch_sender: broadcast::Sender<Option<RecordBatch>>,
-    tokio_rt: mpsc::Sender<Handle>,
+    tokio_rt: &mpsc::Sender<Handle>,
 ) {
     let runtime = Arc::new(
         tokio::runtime::Builder::new_current_thread()
@@ -106,7 +107,7 @@ pub fn start_async_provider() -> (AsyncTableProvider, Handle) {
             shutdown_rx,
             batch_request_rx,
             record_batch_tx,
-            tokio_rt_tx,
+            &tokio_rt_tx,
         )
     }));
 
@@ -134,8 +135,8 @@ impl TableProvider for AsyncTableProvider {
         super::create_test_schema()
     }
 
-    fn table_type(&self) -> datafusion::logical_expr::TableType {
-        datafusion::logical_expr::TableType::Base
+    fn table_type(&self) -> datafusion_expr::TableType {
+        datafusion_expr::TableType::Base
     }
 
     async fn scan(
@@ -162,7 +163,7 @@ impl Drop for AsyncTableProvider {
 
 #[derive(Debug)]
 struct AsyncTestExecutionPlan {
-    properties: datafusion::physical_plan::PlanProperties,
+    properties: Arc<datafusion_physical_plan::PlanProperties>,
     batch_request: mpsc::Sender<bool>,
     batch_receiver: broadcast::Receiver<Option<RecordBatch>>,
 }
@@ -173,12 +174,12 @@ impl AsyncTestExecutionPlan {
         batch_receiver: broadcast::Receiver<Option<RecordBatch>>,
     ) -> Self {
         Self {
-            properties: datafusion::physical_plan::PlanProperties::new(
+            properties: Arc::new(datafusion_physical_plan::PlanProperties::new(
                 EquivalenceProperties::new(super::create_test_schema()),
                 Partitioning::UnknownPartitioning(3),
-                datafusion::physical_plan::execution_plan::EmissionType::Incremental,
-                datafusion::physical_plan::execution_plan::Boundedness::Bounded,
-            ),
+                datafusion_physical_plan::execution_plan::EmissionType::Incremental,
+                datafusion_physical_plan::execution_plan::Boundedness::Bounded,
+            )),
             batch_request,
             batch_receiver,
         }
@@ -194,7 +195,7 @@ impl ExecutionPlan for AsyncTestExecutionPlan {
         self
     }
 
-    fn properties(&self) -> &datafusion::physical_plan::PlanProperties {
+    fn properties(&self) -> &Arc<datafusion_physical_plan::PlanProperties> {
         &self.properties
     }
 
@@ -212,19 +213,35 @@ impl ExecutionPlan for AsyncTestExecutionPlan {
     fn execute(
         &self,
         _partition: usize,
-        _context: Arc<datafusion::execution::TaskContext>,
-    ) -> Result<datafusion::execution::SendableRecordBatchStream> {
+        _context: Arc<datafusion_execution::TaskContext>,
+    ) -> Result<datafusion_execution::SendableRecordBatchStream> {
         Ok(Box::pin(AsyncTestRecordBatchStream {
             batch_request: self.batch_request.clone(),
             batch_receiver: self.batch_receiver.resubscribe(),
         }))
     }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_plan::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in the output ordering from equivalence properties
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = self.properties.output_ordering() {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
 }
 
-impl datafusion::physical_plan::DisplayAs for AsyncTestExecutionPlan {
+impl datafusion_physical_plan::DisplayAs for AsyncTestExecutionPlan {
     fn fmt_as(
         &self,
-        _t: datafusion::physical_plan::DisplayFormatType,
+        _t: datafusion_physical_plan::DisplayFormatType,
         _f: &mut std::fmt::Formatter,
     ) -> std::fmt::Result {
         // Do nothing, just a test
@@ -252,16 +269,16 @@ impl Stream for AsyncTestRecordBatchStream {
     ) -> std::task::Poll<Option<Self::Item>> {
         let mut this = self.as_mut();
 
-        #[allow(clippy::disallowed_methods)]
+        #[expect(clippy::disallowed_methods)]
         tokio::spawn(async move {
             // Nothing to do. We just need to simulate an async
             // task running
         });
 
         if let Err(e) = this.batch_request.try_send(true) {
-            return std::task::Poll::Ready(Some(Err(DataFusionError::Execution(
-                format!("Unable to send batch request, {e}"),
-            ))));
+            return std::task::Poll::Ready(Some(exec_err!(
+                "Failed to send batch request: {e}"
+            )));
         }
 
         match this.batch_receiver.blocking_recv() {
@@ -269,14 +286,21 @@ impl Stream for AsyncTestRecordBatchStream {
                 Some(batch) => std::task::Poll::Ready(Some(Ok(batch))),
                 None => std::task::Poll::Ready(None),
             },
-            Err(e) => std::task::Poll::Ready(Some(Err(DataFusionError::Execution(
-                format!("Unable to receive record batch: {e}"),
-            )))),
+            Err(e) => std::task::Poll::Ready(Some(exec_err!(
+                "Failed to receive record batch: {e}"
+            ))),
         }
     }
 }
 
-pub(crate) fn create_async_table_provider() -> FFI_TableProvider {
+pub(crate) fn create_async_table_provider(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableProvider {
     let (table_provider, tokio_rt) = start_async_provider();
-    FFI_TableProvider::new(Arc::new(table_provider), true, Some(tokio_rt))
+    FFI_TableProvider::new_with_ffi_codec(
+        Arc::new(table_provider),
+        true,
+        Some(tokio_rt),
+        codec,
+    )
 }
diff --git a/datafusion/ffi/src/tests/catalog.rs b/datafusion/ffi/src/tests/catalog.rs
index f4293adb41b94..76d60ee379a7c 100644
--- a/datafusion/ffi/src/tests/catalog.rs
+++ b/datafusion/ffi/src/tests/catalog.rs
@@ -25,20 +25,21 @@
 //! access the runtime, then you will get a panic when trying to do operations
 //! such as spawning a tokio task.
 
-use std::{any::Any, fmt::Debug, sync::Arc};
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
 
-use crate::catalog_provider::FFI_CatalogProvider;
 use arrow::datatypes::Schema;
 use async_trait::async_trait;
-use datafusion::{
-    catalog::{
-        CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider,
-        TableProvider,
-    },
-    common::exec_err,
-    datasource::MemTable,
-    error::{DataFusionError, Result},
+use datafusion_catalog::{
+    CatalogProvider, CatalogProviderList, MemTable, MemoryCatalogProvider,
+    MemoryCatalogProviderList, MemorySchemaProvider, SchemaProvider, TableProvider,
 };
+use datafusion_common::{Result, exec_err};
+
+use crate::catalog_provider::FFI_CatalogProvider;
+use crate::catalog_provider_list::FFI_CatalogProviderList;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
 
 /// This schema provider is intended only for unit tests. It prepopulates with one
 /// table and only allows for tables named sales and purchases.
@@ -49,7 +50,7 @@ pub struct FixedSchemaProvider {
 
 pub fn fruit_table() -> Arc<dyn TableProvider + 'static> {
     use arrow::datatypes::{DataType, Field};
-    use datafusion::common::record_batch;
+    use datafusion_common::record_batch;
 
     let schema = Arc::new(Schema::new(vec![
         Field::new("units", DataType::Int32, true),
@@ -96,10 +97,7 @@ impl SchemaProvider for FixedSchemaProvider {
         self.inner.table_names()
     }
 
-    async fn table(
-        &self,
-        name: &str,
-    ) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError> {
+    async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
         self.inner.table(name).await
     }
 
@@ -162,7 +160,9 @@ impl CatalogProvider for FixedCatalogProvider {
         schema: Arc<dyn SchemaProvider>,
     ) -> Result<Option<Arc<dyn SchemaProvider>>> {
         if !["apple", "banana", "cherry", "date"].contains(&name) {
-            return exec_err!("FixedCatalogProvider only provides four schemas: apple, banana, cherry, date");
+            return exec_err!(
+                "FixedCatalogProvider only provides four schemas: apple, banana, cherry, date"
+            );
         }
 
         self.inner.register_schema(name, schema)
@@ -177,7 +177,65 @@ impl CatalogProvider for FixedCatalogProvider {
     }
 }
 
-pub(crate) extern "C" fn create_catalog_provider() -> FFI_CatalogProvider {
+pub(crate) extern "C" fn create_catalog_provider(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_CatalogProvider {
     let catalog_provider = Arc::new(FixedCatalogProvider::default());
-    FFI_CatalogProvider::new(catalog_provider, None)
+    FFI_CatalogProvider::new_with_ffi_codec(catalog_provider, None, codec)
+}
+
+/// This catalog provider list is intended only for unit tests. It prepopulates with one
+/// catalog and only allows for catalogs named after four colors.
+#[derive(Debug)]
+pub struct FixedCatalogProviderList {
+    inner: MemoryCatalogProviderList,
+}
+
+impl Default for FixedCatalogProviderList {
+    fn default() -> Self {
+        let inner = MemoryCatalogProviderList::new();
+
+        let _ = inner.register_catalog(
+            "blue".to_owned(),
+            Arc::new(FixedCatalogProvider::default()),
+        );
+
+        Self { inner }
+    }
+}
+
+impl CatalogProviderList for FixedCatalogProviderList {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn catalog_names(&self) -> Vec<String> {
+        self.inner.catalog_names()
+    }
+
+    fn catalog(&self, name: &str) -> Option<Arc<dyn CatalogProvider>> {
+        self.inner.catalog(name)
+    }
+
+    fn register_catalog(
+        &self,
+        name: String,
+        catalog: Arc<dyn CatalogProvider>,
+    ) -> Option<Arc<dyn CatalogProvider>> {
+        if !["blue", "red", "green", "yellow"].contains(&name.as_str()) {
+            log::warn!(
+                "FixedCatalogProviderList only provides four catalogs: blue, red, green, yellow"
+            );
+            return None;
+        }
+
+        self.inner.register_catalog(name, catalog)
+    }
+}
+
+pub(crate) extern "C" fn create_catalog_provider_list(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_CatalogProviderList {
+    let catalog_provider_list = Arc::new(FixedCatalogProviderList::default());
+    FFI_CatalogProviderList::new_with_ffi_codec(catalog_provider_list, None, codec)
 }
diff --git a/datafusion/ffi/src/tests/config.rs b/datafusion/ffi/src/tests/config.rs
new file mode 100644
index 0000000000000..46fc9756203e3
--- /dev/null
+++ b/datafusion/ffi/src/tests/config.rs
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::config::ConfigExtension;
+use datafusion_common::extensions_options;
+
+use crate::config::extension_options::FFI_ExtensionOptions;
+
+extensions_options! {
+   pub struct ExternalConfig {
+       /// Should "foo" be replaced by "bar"?
+       pub is_enabled: bool, default = true
+
+       /// Some value to be extracted
+       pub base_number: usize, default = 1000
+   }
+}
+
+impl PartialEq for ExternalConfig {
+    fn eq(&self, other: &Self) -> bool {
+        self.base_number == other.base_number && self.is_enabled == other.is_enabled
+    }
+}
+impl Eq for ExternalConfig {}
+
+impl ConfigExtension for ExternalConfig {
+    const PREFIX: &'static str = "external_config";
+}
+
+pub(crate) extern "C" fn create_extension_options() -> FFI_ExtensionOptions {
+    let mut extensions = FFI_ExtensionOptions::default();
+    extensions
+        .add_config(&ExternalConfig::default())
+        .expect("add_config should be infallible for ExternalConfig");
+
+    extensions
+}
diff --git a/datafusion/ffi/src/tests/mod.rs b/datafusion/ffi/src/tests/mod.rs
index 7a36ee52bdb4b..378d75b9560c7 100644
--- a/datafusion/ffi/src/tests/mod.rs
+++ b/datafusion/ffi/src/tests/mod.rs
@@ -17,31 +17,42 @@
 
 use std::sync::Arc;
 
+use abi_stable::library::{LibraryError, RootModule};
+use abi_stable::prefix_type::PrefixTypeTrait;
+use abi_stable::sabi_types::VersionStrings;
 use abi_stable::{
-    declare_root_module_statics, export_root_module,
-    library::{LibraryError, RootModule},
-    package_version_strings,
-    prefix_type::PrefixTypeTrait,
-    sabi_types::VersionStrings,
-    StableAbi,
+    StableAbi, declare_root_module_statics, export_root_module, package_version_strings,
 };
-use catalog::create_catalog_provider;
-
-use crate::{catalog_provider::FFI_CatalogProvider, udtf::FFI_TableFunction};
-
-use super::{table_provider::FFI_TableProvider, udf::FFI_ScalarUDF};
 use arrow::array::RecordBatch;
+use arrow_schema::{DataType, Field, Schema};
 use async_provider::create_async_table_provider;
-use datafusion::{
-    arrow::datatypes::{DataType, Field, Schema},
-    common::record_batch,
-};
+use catalog::create_catalog_provider;
+use datafusion_common::record_batch;
 use sync_provider::create_sync_table_provider;
-use udf_udaf_udwf::{create_ffi_abs_func, create_ffi_random_func, create_ffi_table_func};
+use udf_udaf_udwf::{
+    create_ffi_abs_func, create_ffi_random_func, create_ffi_rank_func,
+    create_ffi_stddev_func, create_ffi_sum_func, create_ffi_table_func,
+};
+
+use crate::catalog_provider::FFI_CatalogProvider;
+use crate::catalog_provider_list::FFI_CatalogProviderList;
+use crate::config::extension_options::FFI_ExtensionOptions;
+use crate::execution_plan::FFI_ExecutionPlan;
+use crate::execution_plan::tests::EmptyExec;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider::FFI_TableProvider;
+use crate::table_provider_factory::FFI_TableProviderFactory;
+use crate::tests::catalog::create_catalog_provider_list;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udtf::FFI_TableFunction;
+use crate::udwf::FFI_WindowUDF;
 
 mod async_provider;
 pub mod catalog;
+pub mod config;
 mod sync_provider;
+mod table_provider_factory;
 mod udf_udaf_udwf;
 pub mod utils;
 
@@ -53,17 +64,45 @@ pub mod utils;
 /// module.
 pub struct ForeignLibraryModule {
     /// Construct an opinionated catalog provider
-    pub create_catalog: extern "C" fn() -> FFI_CatalogProvider,
+    pub create_catalog:
+        extern "C" fn(codec: FFI_LogicalExtensionCodec) -> FFI_CatalogProvider,
+
+    /// Construct an opinionated catalog provider list
+    pub create_catalog_list:
+        extern "C" fn(codec: FFI_LogicalExtensionCodec) -> FFI_CatalogProviderList,
 
     /// Constructs the table provider
-    pub create_table: extern "C" fn(synchronous: bool) -> FFI_TableProvider,
+    pub create_table: extern "C" fn(
+        synchronous: bool,
+        codec: FFI_LogicalExtensionCodec,
+    ) -> FFI_TableProvider,
+
+    /// Constructs the table provider factory
+    pub create_table_factory:
+        extern "C" fn(codec: FFI_LogicalExtensionCodec) -> FFI_TableProviderFactory,
 
     /// Create a scalar UDF
     pub create_scalar_udf: extern "C" fn() -> FFI_ScalarUDF,
 
     pub create_nullary_udf: extern "C" fn() -> FFI_ScalarUDF,
 
-    pub create_table_function: extern "C" fn() -> FFI_TableFunction,
+    pub create_timezone_udf: extern "C" fn() -> FFI_ScalarUDF,
+
+    pub create_table_function:
+        extern "C" fn(FFI_LogicalExtensionCodec) -> FFI_TableFunction,
+
+    /// Create an aggregate UDAF using sum
+    pub create_sum_udaf: extern "C" fn() -> FFI_AggregateUDF,
+
+    /// Create  grouping UDAF using stddev
+    pub create_stddev_udaf: extern "C" fn() -> FFI_AggregateUDF,
+
+    pub create_rank_udwf: extern "C" fn() -> FFI_WindowUDF,
+
+    /// Create extension options, for either ConfigOptions or TableOptions
+    pub create_extension_options: extern "C" fn() -> FFI_ExtensionOptions,
+
+    pub create_empty_exec: extern "C" fn() -> FFI_ExecutionPlan,
 
     pub version: extern "C" fn() -> u64,
 }
@@ -96,22 +135,48 @@ pub fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch {
 
 /// Here we only wish to create a simple table provider as an example.
 /// We create an in-memory table and convert it to it's FFI counterpart.
-extern "C" fn construct_table_provider(synchronous: bool) -> FFI_TableProvider {
+extern "C" fn construct_table_provider(
+    synchronous: bool,
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableProvider {
     match synchronous {
-        true => create_sync_table_provider(),
-        false => create_async_table_provider(),
+        true => create_sync_table_provider(codec),
+        false => create_async_table_provider(codec),
     }
 }
 
+/// Here we only wish to create a simple table provider as an example.
+/// We create an in-memory table and convert it to it's FFI counterpart.
+extern "C" fn construct_table_provider_factory(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableProviderFactory {
+    table_provider_factory::create(codec)
+}
+
+pub(crate) extern "C" fn create_empty_exec() -> FFI_ExecutionPlan {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
+
+    let plan = Arc::new(EmptyExec::new(schema));
+    FFI_ExecutionPlan::new(plan, None)
+}
+
 #[export_root_module]
 /// This defines the entry point for using the module.
 pub fn get_foreign_library_module() -> ForeignLibraryModuleRef {
     ForeignLibraryModule {
         create_catalog: create_catalog_provider,
+        create_catalog_list: create_catalog_provider_list,
         create_table: construct_table_provider,
+        create_table_factory: construct_table_provider_factory,
         create_scalar_udf: create_ffi_abs_func,
         create_nullary_udf: create_ffi_random_func,
+        create_timezone_udf: udf_udaf_udwf::create_timezone_func,
         create_table_function: create_ffi_table_func,
+        create_sum_udaf: create_ffi_sum_func,
+        create_stddev_udaf: create_ffi_stddev_func,
+        create_rank_udwf: create_ffi_rank_func,
+        create_extension_options: config::create_extension_options,
+        create_empty_exec,
         version: super::version,
     }
     .leak_into_prefix()
diff --git a/datafusion/ffi/src/tests/sync_provider.rs b/datafusion/ffi/src/tests/sync_provider.rs
index ff85e0b15b395..e3cb54fff90eb 100644
--- a/datafusion/ffi/src/tests/sync_provider.rs
+++ b/datafusion/ffi/src/tests/sync_provider.rs
@@ -17,12 +17,15 @@
 
 use std::sync::Arc;
 
-use crate::table_provider::FFI_TableProvider;
-use datafusion::datasource::MemTable;
+use datafusion_catalog::MemTable;
 
 use super::{create_record_batch, create_test_schema};
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider::FFI_TableProvider;
 
-pub(crate) fn create_sync_table_provider() -> FFI_TableProvider {
+pub(crate) fn create_sync_table_provider(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableProvider {
     let schema = create_test_schema();
 
     // It is useful to create these as multiple record batches
@@ -35,5 +38,5 @@ pub(crate) fn create_sync_table_provider() -> FFI_TableProvider {
 
     let table_provider = MemTable::try_new(schema, vec![batches]).unwrap();
 
-    FFI_TableProvider::new(Arc::new(table_provider), true, None)
+    FFI_TableProvider::new_with_ffi_codec(Arc::new(table_provider), true, None, codec)
 }
diff --git a/datafusion/ffi/src/tests/table_provider_factory.rs b/datafusion/ffi/src/tests/table_provider_factory.rs
new file mode 100644
index 0000000000000..29af6aacf6484
--- /dev/null
+++ b/datafusion/ffi/src/tests/table_provider_factory.rs
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use datafusion_catalog::{MemTable, Session, TableProvider, TableProviderFactory};
+use datafusion_common::Result;
+use datafusion_expr::CreateExternalTable;
+
+use super::{create_record_batch, create_test_schema};
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider_factory::FFI_TableProviderFactory;
+
+#[derive(Debug)]
+pub struct TestTableProviderFactory {}
+
+#[async_trait]
+impl TableProviderFactory for TestTableProviderFactory {
+    async fn create(
+        &self,
+        _session: &dyn Session,
+        _cmd: &CreateExternalTable,
+    ) -> Result<Arc<dyn TableProvider>> {
+        let schema = create_test_schema();
+
+        // It is useful to create these as multiple record batches
+        // so that we can demonstrate the FFI stream.
+        let batches = vec![
+            create_record_batch(1, 5),
+            create_record_batch(6, 1),
+            create_record_batch(7, 5),
+        ];
+
+        let table_provider = MemTable::try_new(schema, vec![batches]).unwrap();
+
+        Ok(Arc::new(table_provider))
+    }
+}
+
+pub(crate) fn create(codec: FFI_LogicalExtensionCodec) -> FFI_TableProviderFactory {
+    let factory = TestTableProviderFactory {};
+    FFI_TableProviderFactory::new_with_ffi_codec(Arc::new(factory), None, codec)
+}
diff --git a/datafusion/ffi/src/tests/udf_udaf_udwf.rs b/datafusion/ffi/src/tests/udf_udaf_udwf.rs
index c3cb1bcc35338..b9ab20b115d3a 100644
--- a/datafusion/ffi/src/tests/udf_udaf_udwf.rs
+++ b/datafusion/ffi/src/tests/udf_udaf_udwf.rs
@@ -15,30 +15,139 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::{udf::FFI_ScalarUDF, udtf::FFI_TableFunction};
-use datafusion::{
-    catalog::TableFunctionImpl,
-    functions::math::{abs::AbsFunc, random::RandomFunc},
-    functions_table::generate_series::RangeFunc,
-    logical_expr::ScalarUDF,
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow_schema::DataType;
+use datafusion_catalog::TableFunctionImpl;
+use datafusion_common::ScalarValue;
+use datafusion_expr::{
+    AggregateUDF, ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
+    Volatility, WindowUDF,
 };
+use datafusion_functions::math::abs::AbsFunc;
+use datafusion_functions::math::random::RandomFunc;
+use datafusion_functions_aggregate::stddev::Stddev;
+use datafusion_functions_aggregate::sum::Sum;
+use datafusion_functions_table::generate_series::RangeFunc;
+use datafusion_functions_window::rank::Rank;
 
-use std::sync::Arc;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::udaf::FFI_AggregateUDF;
+use crate::udf::FFI_ScalarUDF;
+use crate::udtf::FFI_TableFunction;
+use crate::udwf::FFI_WindowUDF;
 
 pub(crate) extern "C" fn create_ffi_abs_func() -> FFI_ScalarUDF {
-    let udf: Arc<ScalarUDF> = Arc::new(AbsFunc::new().into());
+    let inner = WrappedAbs(Arc::new(AbsFunc::new().into()));
+    let udf: Arc<ScalarUDF> = Arc::new(inner.into());
 
     udf.into()
 }
 
+#[derive(Debug, Hash, Eq, PartialEq)]
+struct WrappedAbs(Arc<ScalarUDF>);
+
+impl ScalarUDFImpl for WrappedAbs {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "ffi_abs"
+    }
+
+    fn signature(&self) -> &Signature {
+        self.0.signature()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
+        self.0.return_type(arg_types)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        self.0.invoke_with_args(args)
+    }
+}
+
 pub(crate) extern "C" fn create_ffi_random_func() -> FFI_ScalarUDF {
     let udf: Arc<ScalarUDF> = Arc::new(RandomFunc::new().into());
 
     udf.into()
 }
 
-pub(crate) extern "C" fn create_ffi_table_func() -> FFI_TableFunction {
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct TimeZoneUDF {
+    signature: Signature,
+}
+
+impl ScalarUDFImpl for TimeZoneUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "TimeZoneUDF"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(
+        &self,
+        _arg_types: &[DataType],
+    ) -> datafusion_common::Result<DataType> {
+        Ok(DataType::Utf8)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        let tz = args.config_options.execution.time_zone.clone();
+        Ok(ColumnarValue::Scalar(ScalarValue::from(tz)))
+    }
+}
+
+pub(crate) extern "C" fn create_timezone_func() -> FFI_ScalarUDF {
+    let udf: Arc<ScalarUDF> = Arc::new(ScalarUDF::from(TimeZoneUDF {
+        signature: Signature::uniform(1, vec![DataType::Utf8], Volatility::Stable),
+    }));
+
+    udf.into()
+}
+
+pub(crate) extern "C" fn create_ffi_table_func(
+    codec: FFI_LogicalExtensionCodec,
+) -> FFI_TableFunction {
     let udtf: Arc<dyn TableFunctionImpl> = Arc::new(RangeFunc {});
 
-    FFI_TableFunction::new(udtf, None)
+    FFI_TableFunction::new_with_ffi_codec(udtf, None, codec)
+}
+
+pub(crate) extern "C" fn create_ffi_sum_func() -> FFI_AggregateUDF {
+    let udaf: Arc<AggregateUDF> = Arc::new(Sum::new().into());
+
+    udaf.into()
+}
+
+pub(crate) extern "C" fn create_ffi_stddev_func() -> FFI_AggregateUDF {
+    let udaf: Arc<AggregateUDF> = Arc::new(Stddev::new().into());
+
+    udaf.into()
+}
+
+pub(crate) extern "C" fn create_ffi_rank_func() -> FFI_WindowUDF {
+    let udwf: Arc<WindowUDF> = Arc::new(
+        Rank::new(
+            "rank_demo".to_string(),
+            datafusion_functions_window::rank::RankType::Basic,
+        )
+        .into(),
+    );
+
+    udwf.into()
 }
diff --git a/datafusion/ffi/src/tests/utils.rs b/datafusion/ffi/src/tests/utils.rs
index 6465b17d9b60c..9659a51f04b01 100644
--- a/datafusion/ffi/src/tests/utils.rs
+++ b/datafusion/ffi/src/tests/utils.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::tests::ForeignLibraryModuleRef;
-use abi_stable::library::RootModule;
-use datafusion::error::{DataFusionError, Result};
 use std::path::Path;
 
+use abi_stable::library::RootModule;
+use datafusion_common::{DataFusionError, Result};
+
+use crate::tests::ForeignLibraryModuleRef;
+
 /// Compute the path to the library. It would be preferable to simply use
 /// abi_stable::library::development_utils::compute_library_path however
 /// our current CI pipeline has a `ci` profile that we need to use to
diff --git a/datafusion/ffi/src/udaf/accumulator.rs b/datafusion/ffi/src/udaf/accumulator.rs
new file mode 100644
index 0000000000000..125b28598b433
--- /dev/null
+++ b/datafusion/ffi/src/udaf/accumulator.rs
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::ops::Deref;
+use std::ptr::null_mut;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RVec};
+use arrow::array::ArrayRef;
+use arrow::error::ArrowError;
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_common::scalar::ScalarValue;
+use datafusion_expr::Accumulator;
+use prost::Message;
+
+use crate::arrow_wrappers::WrappedArray;
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
+
+/// A stable struct for sharing [`Accumulator`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding function
+/// defined in [`Accumulator`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_Accumulator {
+    pub update_batch: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        values: RVec<WrappedArray>,
+    ) -> FFIResult<()>,
+
+    // Evaluate and return a ScalarValues as protobuf bytes
+    pub evaluate: unsafe extern "C" fn(accumulator: &mut Self) -> FFIResult<RVec<u8>>,
+
+    pub size: unsafe extern "C" fn(accumulator: &Self) -> usize,
+
+    pub state: unsafe extern "C" fn(accumulator: &mut Self) -> FFIResult<RVec<RVec<u8>>>,
+
+    pub merge_batch: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        states: RVec<WrappedArray>,
+    ) -> FFIResult<()>,
+
+    pub retract_batch: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        values: RVec<WrappedArray>,
+    ) -> FFIResult<()>,
+
+    pub supports_retract_batch: bool,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(accumulator: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the accumulator.
+    /// A [`ForeignAccumulator`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_Accumulator {}
+unsafe impl Sync for FFI_Accumulator {}
+
+pub struct AccumulatorPrivateData {
+    pub accumulator: Box<dyn Accumulator>,
+}
+
+impl FFI_Accumulator {
+    #[inline]
+    unsafe fn inner_mut(&mut self) -> &mut Box<dyn Accumulator> {
+        unsafe {
+            let private_data = self.private_data as *mut AccumulatorPrivateData;
+            &mut (*private_data).accumulator
+        }
+    }
+
+    #[inline]
+    unsafe fn inner(&self) -> &dyn Accumulator {
+        unsafe {
+            let private_data = self.private_data as *const AccumulatorPrivateData;
+            (*private_data).accumulator.deref()
+        }
+    }
+}
+
+unsafe extern "C" fn update_batch_fn_wrapper(
+    accumulator: &mut FFI_Accumulator,
+    values: RVec<WrappedArray>,
+) -> FFIResult<()> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let values_arrays = values
+            .into_iter()
+            .map(|v| v.try_into().map_err(DataFusionError::from))
+            .collect::<Result<Vec<ArrayRef>>>();
+        let values_arrays = rresult_return!(values_arrays);
+
+        rresult!(accumulator.update_batch(&values_arrays))
+    }
+}
+
+unsafe extern "C" fn evaluate_fn_wrapper(
+    accumulator: &mut FFI_Accumulator,
+) -> FFIResult<RVec<u8>> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let scalar_result = rresult_return!(accumulator.evaluate());
+        let proto_result: datafusion_proto::protobuf::ScalarValue =
+            rresult_return!((&scalar_result).try_into());
+
+        RResult::ROk(proto_result.encode_to_vec().into())
+    }
+}
+
+unsafe extern "C" fn size_fn_wrapper(accumulator: &FFI_Accumulator) -> usize {
+    unsafe { accumulator.inner().size() }
+}
+
+unsafe extern "C" fn state_fn_wrapper(
+    accumulator: &mut FFI_Accumulator,
+) -> FFIResult<RVec<RVec<u8>>> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let state = rresult_return!(accumulator.state());
+        let state = state
+            .into_iter()
+            .map(|state_val| {
+                datafusion_proto::protobuf::ScalarValue::try_from(&state_val)
+                    .map_err(DataFusionError::from)
+                    .map(|v| RVec::from(v.encode_to_vec()))
+            })
+            .collect::<Result<Vec<_>>>()
+            .map(|state_vec| state_vec.into());
+
+        rresult!(state)
+    }
+}
+
+unsafe extern "C" fn merge_batch_fn_wrapper(
+    accumulator: &mut FFI_Accumulator,
+    states: RVec<WrappedArray>,
+) -> FFIResult<()> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let states = rresult_return!(
+            states
+                .into_iter()
+                .map(|state| ArrayRef::try_from(state).map_err(DataFusionError::from))
+                .collect::<Result<Vec<_>>>()
+        );
+
+        rresult!(accumulator.merge_batch(&states))
+    }
+}
+
+unsafe extern "C" fn retract_batch_fn_wrapper(
+    accumulator: &mut FFI_Accumulator,
+    values: RVec<WrappedArray>,
+) -> FFIResult<()> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let values_arrays = values
+            .into_iter()
+            .map(|v| v.try_into().map_err(DataFusionError::from))
+            .collect::<Result<Vec<ArrayRef>>>();
+        let values_arrays = rresult_return!(values_arrays);
+
+        rresult!(accumulator.retract_batch(&values_arrays))
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(accumulator: &mut FFI_Accumulator) {
+    unsafe {
+        if !accumulator.private_data.is_null() {
+            let private_data =
+                Box::from_raw(accumulator.private_data as *mut AccumulatorPrivateData);
+            drop(private_data);
+            accumulator.private_data = null_mut();
+        }
+    }
+}
+
+impl From<Box<dyn Accumulator>> for FFI_Accumulator {
+    fn from(accumulator: Box<dyn Accumulator>) -> Self {
+        if (accumulator.as_ref() as &dyn Any).is::<ForeignAccumulator>() {
+            let accumulator = (accumulator as Box<dyn Any>)
+                .downcast::<ForeignAccumulator>()
+                .expect("already checked type");
+            return accumulator.accumulator;
+        }
+
+        let supports_retract_batch = accumulator.supports_retract_batch();
+        let private_data = AccumulatorPrivateData { accumulator };
+
+        Self {
+            update_batch: update_batch_fn_wrapper,
+            evaluate: evaluate_fn_wrapper,
+            size: size_fn_wrapper,
+            state: state_fn_wrapper,
+            merge_batch: merge_batch_fn_wrapper,
+            retract_batch: retract_batch_fn_wrapper,
+            supports_retract_batch,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(Box::new(private_data)) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_Accumulator {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+/// This struct is used to access an UDF provided by a foreign
+/// library across a FFI boundary.
+///
+/// The ForeignAccumulator is to be used by the caller of the UDF, so it has
+/// no knowledge or access to the private data. All interaction with the UDF
+/// must occur through the functions defined in FFI_Accumulator.
+#[derive(Debug)]
+pub struct ForeignAccumulator {
+    accumulator: FFI_Accumulator,
+}
+
+impl From<FFI_Accumulator> for Box<dyn Accumulator> {
+    fn from(mut accumulator: FFI_Accumulator) -> Self {
+        if (accumulator.library_marker_id)() == crate::get_library_marker_id() {
+            unsafe {
+                let private_data = Box::from_raw(
+                    accumulator.private_data as *mut AccumulatorPrivateData,
+                );
+                // We must set this to null to avoid a double free
+                accumulator.private_data = null_mut();
+                private_data.accumulator
+            }
+        } else {
+            Box::new(ForeignAccumulator { accumulator })
+        }
+    }
+}
+
+impl Accumulator for ForeignAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
+            df_result!((self.accumulator.update_batch)(
+                &mut self.accumulator,
+                values.into()
+            ))
+        }
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        unsafe {
+            let scalar_bytes =
+                df_result!((self.accumulator.evaluate)(&mut self.accumulator))?;
+
+            let proto_scalar =
+                datafusion_proto::protobuf::ScalarValue::decode(scalar_bytes.as_ref())
+                    .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+            ScalarValue::try_from(&proto_scalar).map_err(DataFusionError::from)
+        }
+    }
+
+    fn size(&self) -> usize {
+        unsafe { (self.accumulator.size)(&self.accumulator) }
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        unsafe {
+            let state_protos =
+                df_result!((self.accumulator.state)(&mut self.accumulator))?;
+
+            state_protos
+                .into_iter()
+                .map(|proto_bytes| {
+                    datafusion_proto::protobuf::ScalarValue::decode(proto_bytes.as_ref())
+                        .map_err(|e| DataFusionError::External(Box::new(e)))
+                        .and_then(|proto_value| {
+                            ScalarValue::try_from(&proto_value)
+                                .map_err(DataFusionError::from)
+                        })
+                })
+                .collect::<Result<Vec<_>>>()
+        }
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        unsafe {
+            let states = states
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
+            df_result!((self.accumulator.merge_batch)(
+                &mut self.accumulator,
+                states.into()
+            ))
+        }
+    }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
+            df_result!((self.accumulator.retract_batch)(
+                &mut self.accumulator,
+                values.into()
+            ))
+        }
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        self.accumulator.supports_retract_batch
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, make_array};
+    use datafusion::common::create_array;
+    use datafusion::error::Result;
+    use datafusion::functions_aggregate::average::AvgAccumulator;
+    use datafusion::logical_expr::Accumulator;
+    use datafusion::scalar::ScalarValue;
+
+    use super::{FFI_Accumulator, ForeignAccumulator};
+
+    #[test]
+    fn test_foreign_avg_accumulator() -> Result<()> {
+        let original_accum = AvgAccumulator::default();
+        let original_size = original_accum.size();
+        let original_supports_retract = original_accum.supports_retract_batch();
+
+        let boxed_accum: Box<dyn Accumulator> = Box::new(original_accum);
+        let mut ffi_accum: FFI_Accumulator = boxed_accum.into();
+        ffi_accum.library_marker_id = crate::mock_foreign_marker_id;
+        let mut foreign_accum: Box<dyn Accumulator> = ffi_accum.into();
+
+        // Send in an array to average. There are 5 values and it should average to 30.0
+        let values = create_array!(Float64, vec![10., 20., 30., 40., 50.]);
+        foreign_accum.update_batch(&[values])?;
+
+        let avg = foreign_accum.evaluate()?;
+        assert_eq!(avg, ScalarValue::Float64(Some(30.0)));
+
+        let state = foreign_accum.state()?;
+        assert_eq!(state.len(), 2);
+        assert_eq!(state[0], ScalarValue::UInt64(Some(5)));
+        assert_eq!(state[1], ScalarValue::Float64(Some(150.0)));
+
+        // To verify merging batches works, create a second state to add in
+        // This should cause our average to go down to 25.0
+        let second_states = vec![
+            make_array(create_array!(UInt64, vec![1]).to_data()),
+            make_array(create_array!(Float64, vec![0.0]).to_data()),
+        ];
+
+        foreign_accum.merge_batch(&second_states)?;
+        let avg = foreign_accum.evaluate()?;
+        assert_eq!(avg, ScalarValue::Float64(Some(25.0)));
+
+        // If we remove a batch that is equivalent to the state we added
+        // we should go back to our original value of 30.0
+        let values = create_array!(Float64, vec![0.0]);
+        foreign_accum.retract_batch(&[values])?;
+        let avg = foreign_accum.evaluate()?;
+        assert_eq!(avg, ScalarValue::Float64(Some(30.0)));
+
+        assert_eq!(original_size, foreign_accum.size());
+        assert_eq!(
+            original_supports_retract,
+            foreign_accum.supports_retract_batch()
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ffi_accumulator_local_bypass() -> Result<()> {
+        let original_accum = AvgAccumulator::default();
+        let boxed_accum: Box<dyn Accumulator> = Box::new(original_accum);
+        let original_size = boxed_accum.size();
+
+        let ffi_accum: FFI_Accumulator = boxed_accum.into();
+
+        // Verify local libraries can be downcast to their original
+        let foreign_accum: Box<dyn Accumulator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn Accumulator
+                as *const AvgAccumulator);
+            assert_eq!(original_size, concrete.size());
+        }
+
+        // Verify different library markers generate foreign accumulator
+        let original_accum = AvgAccumulator::default();
+        let boxed_accum: Box<dyn Accumulator> = Box::new(original_accum);
+        let mut ffi_accum: FFI_Accumulator = boxed_accum.into();
+        ffi_accum.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_accum: Box<dyn Accumulator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn Accumulator
+                as *const ForeignAccumulator);
+            assert_eq!(original_size, concrete.size());
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udaf/accumulator_args.rs b/datafusion/ffi/src/udaf/accumulator_args.rs
new file mode 100644
index 0000000000000..a3359231073c4
--- /dev/null
+++ b/datafusion/ffi/src/udaf/accumulator_args.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RString, RVec};
+use arrow::datatypes::Schema;
+use arrow::ffi::FFI_ArrowSchema;
+use arrow_schema::FieldRef;
+use datafusion_common::error::DataFusionError;
+use datafusion_expr::function::AccumulatorArgs;
+use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::physical_expr::FFI_PhysicalExpr;
+use crate::physical_expr::sort::FFI_PhysicalSortExpr;
+use crate::util::{rvec_wrapped_to_vec_fieldref, vec_fieldref_to_rvec_wrapped};
+
+/// A stable struct for sharing [`AccumulatorArgs`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding field
+/// defined in [`AccumulatorArgs`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_AccumulatorArgs {
+    return_field: WrappedSchema,
+    schema: WrappedSchema,
+    ignore_nulls: bool,
+    order_bys: RVec<FFI_PhysicalSortExpr>,
+    is_reversed: bool,
+    name: RString,
+    is_distinct: bool,
+    exprs: RVec<FFI_PhysicalExpr>,
+    expr_fields: RVec<WrappedSchema>,
+}
+
+impl TryFrom<AccumulatorArgs<'_>> for FFI_AccumulatorArgs {
+    type Error = DataFusionError;
+    fn try_from(args: AccumulatorArgs) -> Result<Self, DataFusionError> {
+        let return_field =
+            WrappedSchema(FFI_ArrowSchema::try_from(args.return_field.as_ref())?);
+        let schema = WrappedSchema(FFI_ArrowSchema::try_from(args.schema)?);
+
+        let order_bys: RVec<_> = args
+            .order_bys
+            .iter()
+            .map(FFI_PhysicalSortExpr::from)
+            .collect();
+
+        let exprs = args
+            .exprs
+            .iter()
+            .map(Arc::clone)
+            .map(FFI_PhysicalExpr::from)
+            .collect();
+
+        let expr_fields = vec_fieldref_to_rvec_wrapped(args.expr_fields)?;
+
+        Ok(Self {
+            return_field,
+            schema,
+            ignore_nulls: args.ignore_nulls,
+            order_bys,
+            is_reversed: args.is_reversed,
+            name: args.name.into(),
+            is_distinct: args.is_distinct,
+            exprs,
+            expr_fields,
+        })
+    }
+}
+
+/// This struct mirrors AccumulatorArgs except that it contains owned data.
+/// It is necessary to create this struct so that we can parse the protobuf
+/// data across the FFI boundary and turn it into owned data that
+/// AccumulatorArgs can then reference.
+pub struct ForeignAccumulatorArgs {
+    pub return_field: FieldRef,
+    pub schema: Schema,
+    pub expr_fields: Vec<FieldRef>,
+    pub ignore_nulls: bool,
+    pub order_bys: Vec<PhysicalSortExpr>,
+    pub is_reversed: bool,
+    pub name: String,
+    pub is_distinct: bool,
+    pub exprs: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+impl TryFrom<FFI_AccumulatorArgs> for ForeignAccumulatorArgs {
+    type Error = DataFusionError;
+
+    fn try_from(value: FFI_AccumulatorArgs) -> Result<Self, Self::Error> {
+        let return_field = Arc::new((&value.return_field.0).try_into()?);
+        let schema = Schema::try_from(&value.schema.0)?;
+
+        let order_bys = value.order_bys.iter().map(PhysicalSortExpr::from).collect();
+
+        let exprs = value
+            .exprs
+            .iter()
+            .map(<Arc<dyn PhysicalExpr>>::from)
+            .collect();
+
+        let expr_fields = rvec_wrapped_to_vec_fieldref(&value.expr_fields)?;
+
+        Ok(Self {
+            return_field,
+            schema,
+            expr_fields,
+            ignore_nulls: value.ignore_nulls,
+            order_bys,
+            is_reversed: value.is_reversed,
+            name: value.name.to_string(),
+            is_distinct: value.is_distinct,
+            exprs,
+        })
+    }
+}
+
+impl<'a> From<&'a ForeignAccumulatorArgs> for AccumulatorArgs<'a> {
+    fn from(value: &'a ForeignAccumulatorArgs) -> Self {
+        Self {
+            return_field: Arc::clone(&value.return_field),
+            schema: &value.schema,
+            expr_fields: &value.expr_fields,
+            ignore_nulls: value.ignore_nulls,
+            order_bys: &value.order_bys,
+            is_reversed: value.is_reversed,
+            name: value.name.as_str(),
+            is_distinct: value.is_distinct,
+            exprs: &value.exprs,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::error::Result;
+    use datafusion::logical_expr::function::AccumulatorArgs;
+    use datafusion::physical_expr::PhysicalSortExpr;
+    use datafusion::physical_plan::expressions::col;
+
+    use super::{FFI_AccumulatorArgs, ForeignAccumulatorArgs};
+
+    #[test]
+    fn test_round_trip_accumulator_args() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let orig_args = AccumulatorArgs {
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            schema: &schema,
+            expr_fields: &[Field::new("a", DataType::Int32, true).into()],
+            ignore_nulls: false,
+            order_bys: &[PhysicalSortExpr::new_default(col("a", &schema)?)],
+            is_reversed: false,
+            name: "round_trip",
+            is_distinct: true,
+            exprs: &[col("a", &schema)?],
+        };
+        let orig_str = format!("{orig_args:?}");
+
+        let ffi_args = FFI_AccumulatorArgs::try_from(orig_args)?;
+        let foreign_args: ForeignAccumulatorArgs = ffi_args.try_into()?;
+        let round_trip_args: AccumulatorArgs = (&foreign_args).into();
+
+        let round_trip_str = format!("{round_trip_args:?}");
+
+        // Since AccumulatorArgs doesn't implement Eq, simply compare
+        // the debug strings.
+        assert_eq!(orig_str, round_trip_str);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udaf/groups_accumulator.rs b/datafusion/ffi/src/udaf/groups_accumulator.rs
new file mode 100644
index 0000000000000..0dc8edbfe5a85
--- /dev/null
+++ b/datafusion/ffi/src/udaf/groups_accumulator.rs
@@ -0,0 +1,587 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::ops::Deref;
+use std::ptr::null_mut;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RVec};
+use arrow::array::{Array, ArrayRef, BooleanArray};
+use arrow::error::ArrowError;
+use arrow::ffi::to_ffi;
+use datafusion_common::error::{DataFusionError, Result};
+use datafusion_expr::{EmitTo, GroupsAccumulator};
+
+use crate::arrow_wrappers::{WrappedArray, WrappedSchema};
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
+
+/// A stable struct for sharing [`GroupsAccumulator`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding function
+/// defined in [`GroupsAccumulator`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_GroupsAccumulator {
+    pub update_batch: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        values: RVec<WrappedArray>,
+        group_indices: RVec<usize>,
+        opt_filter: ROption<WrappedArray>,
+        total_num_groups: usize,
+    ) -> FFIResult<()>,
+
+    // Evaluate and return a ScalarValues as protobuf bytes
+    pub evaluate: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        emit_to: FFI_EmitTo,
+    ) -> FFIResult<WrappedArray>,
+
+    pub size: unsafe extern "C" fn(accumulator: &Self) -> usize,
+
+    pub state: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        emit_to: FFI_EmitTo,
+    ) -> FFIResult<RVec<WrappedArray>>,
+
+    pub merge_batch: unsafe extern "C" fn(
+        accumulator: &mut Self,
+        values: RVec<WrappedArray>,
+        group_indices: RVec<usize>,
+        opt_filter: ROption<WrappedArray>,
+        total_num_groups: usize,
+    ) -> FFIResult<()>,
+
+    pub convert_to_state: unsafe extern "C" fn(
+        accumulator: &Self,
+        values: RVec<WrappedArray>,
+        opt_filter: ROption<WrappedArray>,
+    ) -> FFIResult<RVec<WrappedArray>>,
+
+    pub supports_convert_to_state: bool,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(accumulator: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the accumulator.
+    /// A [`ForeignGroupsAccumulator`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+pub struct GroupsAccumulatorPrivateData {
+    pub accumulator: Box<dyn GroupsAccumulator>,
+}
+
+impl FFI_GroupsAccumulator {
+    #[inline]
+    unsafe fn inner_mut(&mut self) -> &mut Box<dyn GroupsAccumulator> {
+        unsafe {
+            let private_data = self.private_data as *mut GroupsAccumulatorPrivateData;
+            &mut (*private_data).accumulator
+        }
+    }
+
+    #[inline]
+    unsafe fn inner(&self) -> &dyn GroupsAccumulator {
+        unsafe {
+            let private_data = self.private_data as *const GroupsAccumulatorPrivateData;
+            (*private_data).accumulator.deref()
+        }
+    }
+}
+
+fn process_values(values: RVec<WrappedArray>) -> Result<Vec<Arc<dyn Array>>> {
+    values
+        .into_iter()
+        .map(|v| v.try_into().map_err(DataFusionError::from))
+        .collect::<Result<Vec<ArrayRef>>>()
+}
+
+/// Convert C-typed opt_filter into the internal type.
+fn process_opt_filter(opt_filter: ROption<WrappedArray>) -> Result<Option<BooleanArray>> {
+    opt_filter
+        .into_option()
+        .map(|filter| {
+            ArrayRef::try_from(filter)
+                .map_err(DataFusionError::from)
+                .map(|arr| BooleanArray::from(arr.into_data()))
+        })
+        .transpose()
+}
+
+unsafe extern "C" fn update_batch_fn_wrapper(
+    accumulator: &mut FFI_GroupsAccumulator,
+    values: RVec<WrappedArray>,
+    group_indices: RVec<usize>,
+    opt_filter: ROption<WrappedArray>,
+    total_num_groups: usize,
+) -> FFIResult<()> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+        let values = rresult_return!(process_values(values));
+        let group_indices: Vec<usize> = group_indices.into_iter().collect();
+        let opt_filter = rresult_return!(process_opt_filter(opt_filter));
+
+        rresult!(accumulator.update_batch(
+            &values,
+            &group_indices,
+            opt_filter.as_ref(),
+            total_num_groups
+        ))
+    }
+}
+
+unsafe extern "C" fn evaluate_fn_wrapper(
+    accumulator: &mut FFI_GroupsAccumulator,
+    emit_to: FFI_EmitTo,
+) -> FFIResult<WrappedArray> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let result = rresult_return!(accumulator.evaluate(emit_to.into()));
+
+        rresult!(WrappedArray::try_from(&result))
+    }
+}
+
+unsafe extern "C" fn size_fn_wrapper(accumulator: &FFI_GroupsAccumulator) -> usize {
+    unsafe {
+        let accumulator = accumulator.inner();
+        accumulator.size()
+    }
+}
+
+unsafe extern "C" fn state_fn_wrapper(
+    accumulator: &mut FFI_GroupsAccumulator,
+    emit_to: FFI_EmitTo,
+) -> FFIResult<RVec<WrappedArray>> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+
+        let state = rresult_return!(accumulator.state(emit_to.into()));
+        rresult!(
+            state
+                .into_iter()
+                .map(|arr| WrappedArray::try_from(&arr).map_err(DataFusionError::from))
+                .collect::<Result<RVec<_>>>()
+        )
+    }
+}
+
+unsafe extern "C" fn merge_batch_fn_wrapper(
+    accumulator: &mut FFI_GroupsAccumulator,
+    values: RVec<WrappedArray>,
+    group_indices: RVec<usize>,
+    opt_filter: ROption<WrappedArray>,
+    total_num_groups: usize,
+) -> FFIResult<()> {
+    unsafe {
+        let accumulator = accumulator.inner_mut();
+        let values = rresult_return!(process_values(values));
+        let group_indices: Vec<usize> = group_indices.into_iter().collect();
+        let opt_filter = rresult_return!(process_opt_filter(opt_filter));
+
+        rresult!(accumulator.merge_batch(
+            &values,
+            &group_indices,
+            opt_filter.as_ref(),
+            total_num_groups
+        ))
+    }
+}
+
+unsafe extern "C" fn convert_to_state_fn_wrapper(
+    accumulator: &FFI_GroupsAccumulator,
+    values: RVec<WrappedArray>,
+    opt_filter: ROption<WrappedArray>,
+) -> FFIResult<RVec<WrappedArray>> {
+    unsafe {
+        let accumulator = accumulator.inner();
+        let values = rresult_return!(process_values(values));
+        let opt_filter = rresult_return!(process_opt_filter(opt_filter));
+        let state =
+            rresult_return!(accumulator.convert_to_state(&values, opt_filter.as_ref()));
+
+        rresult!(
+            state
+                .iter()
+                .map(|arr| WrappedArray::try_from(arr).map_err(DataFusionError::from))
+                .collect::<Result<RVec<_>>>()
+        )
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(accumulator: &mut FFI_GroupsAccumulator) {
+    unsafe {
+        if !accumulator.private_data.is_null() {
+            let private_data = Box::from_raw(
+                accumulator.private_data as *mut GroupsAccumulatorPrivateData,
+            );
+            drop(private_data);
+            accumulator.private_data = null_mut();
+        }
+    }
+}
+
+impl From<Box<dyn GroupsAccumulator>> for FFI_GroupsAccumulator {
+    fn from(accumulator: Box<dyn GroupsAccumulator>) -> Self {
+        if (accumulator.as_ref() as &dyn Any).is::<ForeignGroupsAccumulator>() {
+            let accumulator = (accumulator as Box<dyn Any>)
+                .downcast::<ForeignGroupsAccumulator>()
+                .expect("already checked type");
+            return accumulator.accumulator;
+        }
+
+        let supports_convert_to_state = accumulator.supports_convert_to_state();
+        let private_data = GroupsAccumulatorPrivateData { accumulator };
+
+        Self {
+            update_batch: update_batch_fn_wrapper,
+            evaluate: evaluate_fn_wrapper,
+            size: size_fn_wrapper,
+            state: state_fn_wrapper,
+            merge_batch: merge_batch_fn_wrapper,
+            convert_to_state: convert_to_state_fn_wrapper,
+            supports_convert_to_state,
+
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(Box::new(private_data)) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_GroupsAccumulator {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+/// This struct is used to access an UDF provided by a foreign
+/// library across a FFI boundary.
+///
+/// The ForeignGroupsAccumulator is to be used by the caller of the UDF, so it has
+/// no knowledge or access to the private data. All interaction with the UDF
+/// must occur through the functions defined in FFI_GroupsAccumulator.
+#[derive(Debug)]
+pub struct ForeignGroupsAccumulator {
+    accumulator: FFI_GroupsAccumulator,
+}
+
+unsafe impl Send for ForeignGroupsAccumulator {}
+unsafe impl Sync for ForeignGroupsAccumulator {}
+
+impl From<FFI_GroupsAccumulator> for Box<dyn GroupsAccumulator> {
+    fn from(mut accumulator: FFI_GroupsAccumulator) -> Self {
+        if (accumulator.library_marker_id)() == crate::get_library_marker_id() {
+            unsafe {
+                let private_data = Box::from_raw(
+                    accumulator.private_data as *mut GroupsAccumulatorPrivateData,
+                );
+                // We must set this to null to avoid a double free
+                accumulator.private_data = null_mut();
+                private_data.accumulator
+            }
+        } else {
+            Box::new(ForeignGroupsAccumulator { accumulator })
+        }
+    }
+}
+
+impl GroupsAccumulator for ForeignGroupsAccumulator {
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
+            let group_indices = group_indices.iter().cloned().collect();
+            let opt_filter = opt_filter
+                .map(|bool_array| to_ffi(&bool_array.to_data()))
+                .transpose()?
+                .map(|(array, schema)| WrappedArray {
+                    array,
+                    schema: WrappedSchema(schema),
+                })
+                .into();
+
+            df_result!((self.accumulator.update_batch)(
+                &mut self.accumulator,
+                values.into(),
+                group_indices,
+                opt_filter,
+                total_num_groups
+            ))
+        }
+    }
+
+    fn size(&self) -> usize {
+        unsafe { (self.accumulator.size)(&self.accumulator) }
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        unsafe {
+            let return_array = df_result!((self.accumulator.evaluate)(
+                &mut self.accumulator,
+                emit_to.into()
+            ))?;
+
+            return_array.try_into().map_err(DataFusionError::from)
+        }
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        unsafe {
+            let returned_arrays = df_result!((self.accumulator.state)(
+                &mut self.accumulator,
+                emit_to.into()
+            ))?;
+
+            returned_arrays
+                .into_iter()
+                .map(|wrapped_array| {
+                    wrapped_array.try_into().map_err(DataFusionError::from)
+                })
+                .collect::<Result<Vec<_>>>()
+        }
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<Vec<_>, ArrowError>>()?;
+            let group_indices = group_indices.iter().cloned().collect();
+            let opt_filter = opt_filter
+                .map(|bool_array| to_ffi(&bool_array.to_data()))
+                .transpose()?
+                .map(|(array, schema)| WrappedArray {
+                    array,
+                    schema: WrappedSchema(schema),
+                })
+                .into();
+
+            df_result!((self.accumulator.merge_batch)(
+                &mut self.accumulator,
+                values.into(),
+                group_indices,
+                opt_filter,
+                total_num_groups
+            ))
+        }
+    }
+
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<RVec<_>, ArrowError>>()?;
+
+            let opt_filter = opt_filter
+                .map(|bool_array| to_ffi(&bool_array.to_data()))
+                .transpose()?
+                .map(|(array, schema)| WrappedArray {
+                    array,
+                    schema: WrappedSchema(schema),
+                })
+                .into();
+
+            let returned_array = df_result!((self.accumulator.convert_to_state)(
+                &self.accumulator,
+                values,
+                opt_filter
+            ))?;
+
+            returned_array
+                .into_iter()
+                .map(|arr| arr.try_into().map_err(DataFusionError::from))
+                .collect()
+        }
+    }
+
+    fn supports_convert_to_state(&self) -> bool {
+        self.accumulator.supports_convert_to_state
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub enum FFI_EmitTo {
+    All,
+    First(usize),
+}
+
+impl From<EmitTo> for FFI_EmitTo {
+    fn from(value: EmitTo) -> Self {
+        match value {
+            EmitTo::All => Self::All,
+            EmitTo::First(v) => Self::First(v),
+        }
+    }
+}
+
+impl From<FFI_EmitTo> for EmitTo {
+    fn from(value: FFI_EmitTo) -> Self {
+        match value {
+            FFI_EmitTo::All => Self::All,
+            FFI_EmitTo::First(v) => Self::First(v),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, BooleanArray, make_array};
+    use datafusion::common::create_array;
+    use datafusion::error::Result;
+    use datafusion::functions_aggregate::stddev::StddevGroupsAccumulator;
+    use datafusion::logical_expr::{EmitTo, GroupsAccumulator};
+    use datafusion_functions_aggregate_common::aggregate::groups_accumulator::bool_op::BooleanGroupsAccumulator;
+    use datafusion_functions_aggregate_common::stats::StatsType;
+
+    use super::{FFI_EmitTo, FFI_GroupsAccumulator, ForeignGroupsAccumulator};
+
+    #[test]
+    fn test_foreign_avg_accumulator() -> Result<()> {
+        let boxed_accum: Box<dyn GroupsAccumulator> =
+            Box::new(BooleanGroupsAccumulator::new(|a, b| a && b, true));
+        let mut ffi_accum: FFI_GroupsAccumulator = boxed_accum.into();
+        ffi_accum.library_marker_id = crate::mock_foreign_marker_id;
+        let mut foreign_accum: Box<dyn GroupsAccumulator> = ffi_accum.into();
+
+        // Send in an array to evaluate. We want a mean of 30 and standard deviation of 4.
+        let values = create_array!(Boolean, vec![true, true, true, false, true, true]);
+        let opt_filter =
+            create_array!(Boolean, vec![true, true, true, true, false, false]);
+        foreign_accum.update_batch(
+            &[values],
+            &[0, 0, 1, 1, 2, 2],
+            Some(opt_filter.as_ref()),
+            3,
+        )?;
+
+        let groups_bool = foreign_accum.evaluate(EmitTo::All)?;
+        let groups_bool = groups_bool.as_any().downcast_ref::<BooleanArray>().unwrap();
+
+        assert_eq!(
+            groups_bool,
+            create_array!(Boolean, vec![Some(true), Some(false), None]).as_ref()
+        );
+
+        let state = foreign_accum.state(EmitTo::All)?;
+        assert_eq!(state.len(), 1);
+
+        // To verify merging batches works, create a second state to add in
+        // This should cause our average to go down to 25.0
+        let second_states =
+            vec![make_array(create_array!(Boolean, vec![false]).to_data())];
+
+        let opt_filter = create_array!(Boolean, vec![true]);
+        foreign_accum.merge_batch(&second_states, &[0], Some(opt_filter.as_ref()), 1)?;
+        let groups_bool = foreign_accum.evaluate(EmitTo::All)?;
+        assert_eq!(groups_bool.len(), 1);
+        assert_eq!(
+            groups_bool.as_ref(),
+            make_array(create_array!(Boolean, vec![false]).to_data()).as_ref()
+        );
+
+        let values = create_array!(Boolean, vec![false]);
+        let opt_filter = create_array!(Boolean, vec![true]);
+        let groups_bool =
+            foreign_accum.convert_to_state(&[values], Some(opt_filter.as_ref()))?;
+
+        assert_eq!(
+            groups_bool[0].as_ref(),
+            make_array(create_array!(Boolean, vec![false]).to_data()).as_ref()
+        );
+
+        Ok(())
+    }
+
+    fn test_emit_to_round_trip(value: EmitTo) -> Result<()> {
+        let ffi_value: FFI_EmitTo = value.into();
+        let round_trip_value: EmitTo = ffi_value.into();
+
+        assert_eq!(value, round_trip_value);
+        Ok(())
+    }
+
+    /// This test ensures all enum values are properly translated
+    #[test]
+    fn test_all_emit_to_round_trip() -> Result<()> {
+        test_emit_to_round_trip(EmitTo::All)?;
+        test_emit_to_round_trip(EmitTo::First(10))?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ffi_groups_accumulator_local_bypass_inner() -> Result<()> {
+        let original_accum = StddevGroupsAccumulator::new(StatsType::Population);
+        let boxed_accum: Box<dyn GroupsAccumulator> = Box::new(original_accum);
+        let original_size = boxed_accum.size();
+
+        let ffi_accum: FFI_GroupsAccumulator = boxed_accum.into();
+
+        // Verify local libraries can be downcast to their original
+        let foreign_accum: Box<dyn GroupsAccumulator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn GroupsAccumulator
+                as *const StddevGroupsAccumulator);
+            assert_eq!(original_size, concrete.size());
+        }
+
+        // Verify different library markers generate foreign accumulator
+        let original_accum = StddevGroupsAccumulator::new(StatsType::Population);
+        let boxed_accum: Box<dyn GroupsAccumulator> = Box::new(original_accum);
+        let mut ffi_accum: FFI_GroupsAccumulator = boxed_accum.into();
+        ffi_accum.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_accum: Box<dyn GroupsAccumulator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn GroupsAccumulator
+                as *const ForeignGroupsAccumulator);
+            assert_eq!(original_size, concrete.size());
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udaf/mod.rs b/datafusion/ffi/src/udaf/mod.rs
new file mode 100644
index 0000000000000..8e791b28b1ad6
--- /dev/null
+++ b/datafusion/ffi/src/udaf/mod.rs
@@ -0,0 +1,877 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ffi::c_void;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RStr, RString, RVec};
+use accumulator::FFI_Accumulator;
+use accumulator_args::{FFI_AccumulatorArgs, ForeignAccumulatorArgs};
+use arrow::datatypes::{DataType, Field};
+use arrow::ffi::FFI_ArrowSchema;
+use arrow_schema::FieldRef;
+use datafusion_common::{DataFusionError, Result, ffi_datafusion_err};
+use datafusion_expr::function::AggregateFunctionSimplification;
+use datafusion_expr::type_coercion::functions::fields_with_udf;
+use datafusion_expr::{
+    Accumulator, AggregateUDF, AggregateUDFImpl, GroupsAccumulator, Signature,
+};
+use datafusion_functions_aggregate_common::accumulator::{
+    AccumulatorArgs, StateFieldsArgs,
+};
+use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
+use datafusion_proto_common::from_proto::parse_proto_fields_to_fields;
+use groups_accumulator::FFI_GroupsAccumulator;
+use prost::{DecodeError, Message};
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::util::{
+    FFIResult, rvec_wrapped_to_vec_datatype, rvec_wrapped_to_vec_fieldref,
+    vec_datatype_to_rvec_wrapped, vec_fieldref_to_rvec_wrapped,
+};
+use crate::volatility::FFI_Volatility;
+use crate::{df_result, rresult, rresult_return};
+
+mod accumulator;
+mod accumulator_args;
+mod groups_accumulator;
+
+/// A stable struct for sharing a [`AggregateUDF`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_AggregateUDF {
+    /// FFI equivalent to the `name` of a [`AggregateUDF`]
+    pub name: RString,
+
+    /// FFI equivalent to the `aliases` of a [`AggregateUDF`]
+    pub aliases: RVec<RString>,
+
+    /// FFI equivalent to the `volatility` of a [`AggregateUDF`]
+    pub volatility: FFI_Volatility,
+
+    /// Determines the return field of the underlying [`AggregateUDF`] based on the
+    /// argument fields.
+    pub return_field: unsafe extern "C" fn(
+        udaf: &Self,
+        arg_fields: RVec<WrappedSchema>,
+    ) -> FFIResult<WrappedSchema>,
+
+    /// FFI equivalent to the `is_nullable` of a [`AggregateUDF`]
+    pub is_nullable: bool,
+
+    /// FFI equivalent to [`AggregateUDF::groups_accumulator_supported`]
+    pub groups_accumulator_supported:
+        unsafe extern "C" fn(udaf: &FFI_AggregateUDF, args: FFI_AccumulatorArgs) -> bool,
+
+    /// FFI equivalent to [`AggregateUDF::accumulator`]
+    pub accumulator: unsafe extern "C" fn(
+        udaf: &FFI_AggregateUDF,
+        args: FFI_AccumulatorArgs,
+    ) -> FFIResult<FFI_Accumulator>,
+
+    /// FFI equivalent to [`AggregateUDF::create_sliding_accumulator`]
+    pub create_sliding_accumulator: unsafe extern "C" fn(
+        udaf: &FFI_AggregateUDF,
+        args: FFI_AccumulatorArgs,
+    )
+        -> FFIResult<FFI_Accumulator>,
+
+    /// FFI equivalent to [`AggregateUDF::state_fields`]
+    pub state_fields: unsafe extern "C" fn(
+        udaf: &FFI_AggregateUDF,
+        name: &RStr,
+        input_fields: RVec<WrappedSchema>,
+        return_field: WrappedSchema,
+        ordering_fields: RVec<RVec<u8>>,
+        is_distinct: bool,
+    ) -> FFIResult<RVec<RVec<u8>>>,
+
+    /// FFI equivalent to [`AggregateUDF::create_groups_accumulator`]
+    pub create_groups_accumulator:
+        unsafe extern "C" fn(
+            udaf: &FFI_AggregateUDF,
+            args: FFI_AccumulatorArgs,
+        ) -> FFIResult<FFI_GroupsAccumulator>,
+
+    /// FFI equivalent to [`AggregateUDF::with_beneficial_ordering`]
+    pub with_beneficial_ordering:
+        unsafe extern "C" fn(
+            udaf: &FFI_AggregateUDF,
+            beneficial_ordering: bool,
+        ) -> FFIResult<ROption<FFI_AggregateUDF>>,
+
+    /// FFI equivalent to [`AggregateUDF::order_sensitivity`]
+    pub order_sensitivity:
+        unsafe extern "C" fn(udaf: &FFI_AggregateUDF) -> FFI_AggregateOrderSensitivity,
+
+    /// Performs type coercion. To simply this interface, all UDFs are treated as having
+    /// user defined signatures, which will in turn call coerce_types to be called. This
+    /// call should be transparent to most users as the internal function performs the
+    /// appropriate calls on the underlying [`AggregateUDF`]
+    pub coerce_types: unsafe extern "C" fn(
+        udf: &Self,
+        arg_types: RVec<WrappedSchema>,
+    ) -> FFIResult<RVec<WrappedSchema>>,
+
+    /// Used to create a clone on the provider of the udaf. This should
+    /// only need to be called by the receiver of the udaf.
+    pub clone: unsafe extern "C" fn(udaf: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(udaf: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the udaf.
+    /// A [`ForeignAggregateUDF`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_AggregateUDF {}
+unsafe impl Sync for FFI_AggregateUDF {}
+
+pub struct AggregateUDFPrivateData {
+    pub udaf: Arc<AggregateUDF>,
+}
+
+impl FFI_AggregateUDF {
+    unsafe fn inner(&self) -> &Arc<AggregateUDF> {
+        unsafe {
+            let private_data = self.private_data as *const AggregateUDFPrivateData;
+            &(*private_data).udaf
+        }
+    }
+}
+
+unsafe extern "C" fn return_field_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    arg_fields: RVec<WrappedSchema>,
+) -> FFIResult<WrappedSchema> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let arg_fields = rresult_return!(rvec_wrapped_to_vec_fieldref(&arg_fields));
+
+        let return_field = udaf
+            .return_field(&arg_fields)
+            .and_then(|v| {
+                FFI_ArrowSchema::try_from(v.as_ref()).map_err(DataFusionError::from)
+            })
+            .map(WrappedSchema);
+
+        rresult!(return_field)
+    }
+}
+
+unsafe extern "C" fn accumulator_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    args: FFI_AccumulatorArgs,
+) -> FFIResult<FFI_Accumulator> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let accumulator_args = &rresult_return!(ForeignAccumulatorArgs::try_from(args));
+
+        rresult!(
+            udaf.accumulator(accumulator_args.into())
+                .map(FFI_Accumulator::from)
+        )
+    }
+}
+
+unsafe extern "C" fn create_sliding_accumulator_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    args: FFI_AccumulatorArgs,
+) -> FFIResult<FFI_Accumulator> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let accumulator_args = &rresult_return!(ForeignAccumulatorArgs::try_from(args));
+
+        rresult!(
+            udaf.create_sliding_accumulator(accumulator_args.into())
+                .map(FFI_Accumulator::from)
+        )
+    }
+}
+
+unsafe extern "C" fn create_groups_accumulator_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    args: FFI_AccumulatorArgs,
+) -> FFIResult<FFI_GroupsAccumulator> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let accumulator_args = &rresult_return!(ForeignAccumulatorArgs::try_from(args));
+
+        rresult!(
+            udaf.create_groups_accumulator(accumulator_args.into())
+                .map(FFI_GroupsAccumulator::from)
+        )
+    }
+}
+
+unsafe extern "C" fn groups_accumulator_supported_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    args: FFI_AccumulatorArgs,
+) -> bool {
+    unsafe {
+        let udaf = udaf.inner();
+
+        ForeignAccumulatorArgs::try_from(args)
+            .map(|a| udaf.groups_accumulator_supported((&a).into()))
+            .unwrap_or_else(|e| {
+                log::warn!("Unable to parse accumulator args. {e}");
+                false
+            })
+    }
+}
+
+unsafe extern "C" fn with_beneficial_ordering_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    beneficial_ordering: bool,
+) -> FFIResult<ROption<FFI_AggregateUDF>> {
+    unsafe {
+        let udaf = udaf.inner().as_ref().clone();
+
+        let result = rresult_return!(udaf.with_beneficial_ordering(beneficial_ordering));
+        let result = rresult_return!(
+            result
+                .map(|func| func.with_beneficial_ordering(beneficial_ordering))
+                .transpose()
+        )
+        .flatten()
+        .map(|func| FFI_AggregateUDF::from(Arc::new(func)));
+
+        RResult::ROk(result.into())
+    }
+}
+
+unsafe extern "C" fn state_fields_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    name: &RStr,
+    input_fields: RVec<WrappedSchema>,
+    return_field: WrappedSchema,
+    ordering_fields: RVec<RVec<u8>>,
+    is_distinct: bool,
+) -> FFIResult<RVec<RVec<u8>>> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let input_fields = &rresult_return!(rvec_wrapped_to_vec_fieldref(&input_fields));
+        let return_field = rresult_return!(Field::try_from(&return_field.0)).into();
+
+        let ordering_fields = &rresult_return!(
+            ordering_fields
+                .into_iter()
+                .map(|field_bytes| datafusion_proto_common::Field::decode(
+                    field_bytes.as_ref()
+                ))
+                .collect::<std::result::Result<Vec<_>, DecodeError>>()
+        );
+
+        let ordering_fields =
+            &rresult_return!(parse_proto_fields_to_fields(ordering_fields))
+                .into_iter()
+                .map(Arc::new)
+                .collect::<Vec<_>>();
+
+        let args = StateFieldsArgs {
+            name: name.as_str(),
+            input_fields,
+            return_field,
+            ordering_fields,
+            is_distinct,
+        };
+
+        let state_fields = rresult_return!(udaf.state_fields(args));
+        let state_fields = rresult_return!(
+            state_fields
+                .iter()
+                .map(|f| f.as_ref())
+                .map(datafusion_proto::protobuf::Field::try_from)
+                .map(|v| v.map_err(DataFusionError::from))
+                .collect::<Result<Vec<_>>>()
+        )
+        .into_iter()
+        .map(|field| field.encode_to_vec().into())
+        .collect();
+
+        RResult::ROk(state_fields)
+    }
+}
+
+unsafe extern "C" fn order_sensitivity_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+) -> FFI_AggregateOrderSensitivity {
+    unsafe { udaf.inner().order_sensitivity().into() }
+}
+
+unsafe extern "C" fn coerce_types_fn_wrapper(
+    udaf: &FFI_AggregateUDF,
+    arg_types: RVec<WrappedSchema>,
+) -> FFIResult<RVec<WrappedSchema>> {
+    unsafe {
+        let udaf = udaf.inner();
+
+        let arg_types = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types));
+
+        let arg_fields = arg_types
+            .iter()
+            .map(|dt| Field::new("f", dt.clone(), true))
+            .map(Arc::new)
+            .collect::<Vec<_>>();
+        let return_types = rresult_return!(fields_with_udf(&arg_fields, udaf.as_ref()))
+            .into_iter()
+            .map(|f| f.data_type().to_owned())
+            .collect::<Vec<_>>();
+
+        rresult!(vec_datatype_to_rvec_wrapped(&return_types))
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(udaf: &mut FFI_AggregateUDF) {
+    unsafe {
+        debug_assert!(!udaf.private_data.is_null());
+        let private_data =
+            Box::from_raw(udaf.private_data as *mut AggregateUDFPrivateData);
+        drop(private_data);
+        udaf.private_data = std::ptr::null_mut();
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(udaf: &FFI_AggregateUDF) -> FFI_AggregateUDF {
+    unsafe { Arc::clone(udaf.inner()).into() }
+}
+
+impl Clone for FFI_AggregateUDF {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl From<Arc<AggregateUDF>> for FFI_AggregateUDF {
+    fn from(udaf: Arc<AggregateUDF>) -> Self {
+        if let Some(udaf) = udaf.inner().as_any().downcast_ref::<ForeignAggregateUDF>() {
+            return udaf.udaf.clone();
+        }
+
+        let name = udaf.name().into();
+        let aliases = udaf.aliases().iter().map(|a| a.to_owned().into()).collect();
+        let is_nullable = udaf.is_nullable();
+        let volatility = udaf.signature().volatility.into();
+
+        let private_data = Box::new(AggregateUDFPrivateData { udaf });
+
+        Self {
+            name,
+            is_nullable,
+            volatility,
+            aliases,
+            return_field: return_field_fn_wrapper,
+            accumulator: accumulator_fn_wrapper,
+            create_sliding_accumulator: create_sliding_accumulator_fn_wrapper,
+            create_groups_accumulator: create_groups_accumulator_fn_wrapper,
+            groups_accumulator_supported: groups_accumulator_supported_fn_wrapper,
+            with_beneficial_ordering: with_beneficial_ordering_fn_wrapper,
+            state_fields: state_fields_fn_wrapper,
+            order_sensitivity: order_sensitivity_fn_wrapper,
+            coerce_types: coerce_types_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_AggregateUDF {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+/// This struct is used to access an UDF provided by a foreign
+/// library across a FFI boundary.
+///
+/// The ForeignAggregateUDF is to be used by the caller of the UDF, so it has
+/// no knowledge or access to the private data. All interaction with the UDF
+/// must occur through the functions defined in FFI_AggregateUDF.
+#[derive(Debug)]
+pub struct ForeignAggregateUDF {
+    signature: Signature,
+    aliases: Vec<String>,
+    udaf: FFI_AggregateUDF,
+}
+
+unsafe impl Send for ForeignAggregateUDF {}
+unsafe impl Sync for ForeignAggregateUDF {}
+
+impl PartialEq for ForeignAggregateUDF {
+    fn eq(&self, other: &Self) -> bool {
+        // FFI_AggregateUDF cannot be compared, so identity equality is the best we can do.
+        std::ptr::eq(self, other)
+    }
+}
+impl Eq for ForeignAggregateUDF {}
+impl Hash for ForeignAggregateUDF {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        std::ptr::hash(self, state)
+    }
+}
+
+impl From<&FFI_AggregateUDF> for Arc<dyn AggregateUDFImpl> {
+    fn from(udaf: &FFI_AggregateUDF) -> Self {
+        if (udaf.library_marker_id)() == crate::get_library_marker_id() {
+            return Arc::clone(unsafe { udaf.inner().inner() });
+        }
+
+        let signature = Signature::user_defined((&udaf.volatility).into());
+        let aliases = udaf.aliases.iter().map(|s| s.to_string()).collect();
+
+        Arc::new(ForeignAggregateUDF {
+            udaf: udaf.clone(),
+            signature,
+            aliases,
+        })
+    }
+}
+
+impl AggregateUDFImpl for ForeignAggregateUDF {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        self.udaf.name.as_str()
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        unimplemented!()
+    }
+
+    fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
+        let arg_fields = vec_fieldref_to_rvec_wrapped(arg_fields)?;
+
+        let result = unsafe { (self.udaf.return_field)(&self.udaf, arg_fields) };
+
+        let result = df_result!(result);
+
+        result.and_then(|r| {
+            Field::try_from(&r.0)
+                .map(Arc::new)
+                .map_err(DataFusionError::from)
+        })
+    }
+
+    fn is_nullable(&self) -> bool {
+        self.udaf.is_nullable
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        let args = acc_args.try_into()?;
+        unsafe {
+            df_result!((self.udaf.accumulator)(&self.udaf, args))
+                .map(<Box<dyn Accumulator>>::from)
+        }
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        unsafe {
+            let name = RStr::from_str(args.name);
+            let input_fields = vec_fieldref_to_rvec_wrapped(args.input_fields)?;
+            let return_field =
+                WrappedSchema(FFI_ArrowSchema::try_from(args.return_field.as_ref())?);
+            let ordering_fields = args
+                .ordering_fields
+                .iter()
+                .map(|f| f.as_ref())
+                .map(datafusion_proto::protobuf::Field::try_from)
+                .map(|v| v.map_err(DataFusionError::from))
+                .collect::<Result<Vec<_>>>()?
+                .into_iter()
+                .map(|proto_field| proto_field.encode_to_vec().into())
+                .collect();
+
+            let fields = df_result!((self.udaf.state_fields)(
+                &self.udaf,
+                &name,
+                input_fields,
+                return_field,
+                ordering_fields,
+                args.is_distinct
+            ))?;
+            let fields = fields
+                .into_iter()
+                .map(|field_bytes| {
+                    datafusion_proto_common::Field::decode(field_bytes.as_ref())
+                        .map_err(|e| ffi_datafusion_err!("{e}"))
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            parse_proto_fields_to_fields(fields.iter())
+                .map(|fields| fields.into_iter().map(Arc::new).collect())
+                .map_err(|e| ffi_datafusion_err!("{e}"))
+        }
+    }
+
+    fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
+        let args = match FFI_AccumulatorArgs::try_from(args) {
+            Ok(v) => v,
+            Err(e) => {
+                log::warn!("Attempting to convert accumulator arguments: {e}");
+                return false;
+            }
+        };
+
+        unsafe { (self.udaf.groups_accumulator_supported)(&self.udaf, args) }
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
+        let args = FFI_AccumulatorArgs::try_from(args)?;
+
+        unsafe {
+            df_result!((self.udaf.create_groups_accumulator)(&self.udaf, args))
+                .map(<Box<dyn GroupsAccumulator>>::from)
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn create_sliding_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn Accumulator>> {
+        let args = args.try_into()?;
+        unsafe {
+            df_result!((self.udaf.create_sliding_accumulator)(&self.udaf, args))
+                .map(<Box<dyn Accumulator>>::from)
+        }
+    }
+
+    fn with_beneficial_ordering(
+        self: Arc<Self>,
+        beneficial_ordering: bool,
+    ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
+        unsafe {
+            let result = df_result!((self.udaf.with_beneficial_ordering)(
+                &self.udaf,
+                beneficial_ordering
+            ))?
+            .into_option();
+
+            let result = result.map(|func| <Arc<dyn AggregateUDFImpl>>::from(&func));
+
+            Ok(result)
+        }
+    }
+
+    fn order_sensitivity(&self) -> AggregateOrderSensitivity {
+        unsafe { (self.udaf.order_sensitivity)(&self.udaf).into() }
+    }
+
+    fn simplify(&self) -> Option<AggregateFunctionSimplification> {
+        None
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        unsafe {
+            let arg_types = vec_datatype_to_rvec_wrapped(arg_types)?;
+            let result_types =
+                df_result!((self.udaf.coerce_types)(&self.udaf, arg_types))?;
+            Ok(rvec_wrapped_to_vec_datatype(&result_types)?)
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub enum FFI_AggregateOrderSensitivity {
+    Insensitive,
+    HardRequirement,
+    SoftRequirement,
+    Beneficial,
+}
+
+impl From<FFI_AggregateOrderSensitivity> for AggregateOrderSensitivity {
+    fn from(value: FFI_AggregateOrderSensitivity) -> Self {
+        match value {
+            FFI_AggregateOrderSensitivity::Insensitive => Self::Insensitive,
+            FFI_AggregateOrderSensitivity::HardRequirement => Self::HardRequirement,
+            FFI_AggregateOrderSensitivity::SoftRequirement => Self::SoftRequirement,
+            FFI_AggregateOrderSensitivity::Beneficial => Self::Beneficial,
+        }
+    }
+}
+
+impl From<AggregateOrderSensitivity> for FFI_AggregateOrderSensitivity {
+    fn from(value: AggregateOrderSensitivity) -> Self {
+        match value {
+            AggregateOrderSensitivity::Insensitive => Self::Insensitive,
+            AggregateOrderSensitivity::HardRequirement => Self::HardRequirement,
+            AggregateOrderSensitivity::SoftRequirement => Self::SoftRequirement,
+            AggregateOrderSensitivity::Beneficial => Self::Beneficial,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::any::Any;
+    use std::collections::HashMap;
+
+    use arrow::datatypes::Schema;
+    use datafusion::common::create_array;
+    use datafusion::functions_aggregate::sum::Sum;
+    use datafusion::physical_expr::PhysicalSortExpr;
+    use datafusion::physical_plan::expressions::col;
+    use datafusion::scalar::ScalarValue;
+
+    use super::*;
+
+    #[derive(Default, Debug, Hash, Eq, PartialEq)]
+    struct SumWithCopiedMetadata {
+        inner: Sum,
+    }
+
+    impl AggregateUDFImpl for SumWithCopiedMetadata {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            self.inner.name()
+        }
+
+        fn signature(&self) -> &Signature {
+            self.inner.signature()
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            unimplemented!()
+        }
+
+        fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
+            // Copy the input field, so any metadata gets returned
+            Ok(Arc::clone(&arg_fields[0]))
+        }
+
+        fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+            self.inner.accumulator(acc_args)
+        }
+    }
+
+    fn create_test_foreign_udaf(
+        original_udaf: impl AggregateUDFImpl + 'static,
+    ) -> Result<AggregateUDF> {
+        let original_udaf = Arc::new(AggregateUDF::from(original_udaf));
+
+        let mut local_udaf: FFI_AggregateUDF = Arc::clone(&original_udaf).into();
+        local_udaf.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_udaf: Arc<dyn AggregateUDFImpl> = (&local_udaf).into();
+        Ok(AggregateUDF::new_from_shared_impl(foreign_udaf))
+    }
+
+    #[test]
+    fn test_round_trip_udaf() -> Result<()> {
+        let original_udaf = Sum::new();
+        let original_name = original_udaf.name().to_owned();
+        let original_udaf = Arc::new(AggregateUDF::from(original_udaf));
+
+        // Convert to FFI format
+        let mut local_udaf: FFI_AggregateUDF = Arc::clone(&original_udaf).into();
+        local_udaf.library_marker_id = crate::mock_foreign_marker_id;
+
+        // Convert back to native format
+        let foreign_udaf: Arc<dyn AggregateUDFImpl> = (&local_udaf).into();
+        let foreign_udaf = AggregateUDF::new_from_shared_impl(foreign_udaf);
+
+        assert_eq!(original_name, foreign_udaf.name());
+        Ok(())
+    }
+
+    #[test]
+    fn test_foreign_udaf_aliases() -> Result<()> {
+        let foreign_udaf =
+            create_test_foreign_udaf(Sum::new())?.with_aliases(["my_function"]);
+
+        let return_field =
+            foreign_udaf
+                .return_field(&[Field::new("a", DataType::Float64, true).into()])?;
+        let return_type = return_field.data_type();
+        assert_eq!(return_type, &DataType::Float64);
+        Ok(())
+    }
+
+    #[test]
+    fn test_foreign_udaf_accumulator() -> Result<()> {
+        let foreign_udaf = create_test_foreign_udaf(Sum::new())?;
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]);
+        let acc_args = AccumulatorArgs {
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            schema: &schema,
+            expr_fields: &[Field::new("a", DataType::Float64, true).into()],
+            ignore_nulls: true,
+            order_bys: &[PhysicalSortExpr::new_default(col("a", &schema)?)],
+            is_reversed: false,
+            name: "round_trip",
+            is_distinct: true,
+            exprs: &[col("a", &schema)?],
+        };
+        let mut accumulator = foreign_udaf.accumulator(acc_args)?;
+        let values = create_array!(Float64, vec![10., 20., 30., 40., 50.]);
+        accumulator.update_batch(&[values])?;
+        let resultant_value = accumulator.evaluate()?;
+        assert_eq!(resultant_value, ScalarValue::Float64(Some(150.)));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_round_trip_udaf_metadata() -> Result<()> {
+        let original_udaf = SumWithCopiedMetadata::default();
+        let original_udaf = Arc::new(AggregateUDF::from(original_udaf));
+
+        // Convert to FFI format
+        let local_udaf: FFI_AggregateUDF = Arc::clone(&original_udaf).into();
+
+        // Convert back to native format
+        let foreign_udaf: Arc<dyn AggregateUDFImpl> = (&local_udaf).into();
+        let foreign_udaf = AggregateUDF::new_from_shared_impl(foreign_udaf);
+
+        let metadata: HashMap<String, String> =
+            [("a_key".to_string(), "a_value".to_string())]
+                .into_iter()
+                .collect();
+        let input_field = Arc::new(
+            Field::new("a", DataType::Float64, false).with_metadata(metadata.clone()),
+        );
+        let return_field = foreign_udaf.return_field(&[input_field])?;
+
+        assert_eq!(&metadata, return_field.metadata());
+        Ok(())
+    }
+
+    #[test]
+    fn test_beneficial_ordering() -> Result<()> {
+        let foreign_udaf = create_test_foreign_udaf(
+            datafusion::functions_aggregate::first_last::FirstValue::new(),
+        )?;
+
+        let foreign_udaf = foreign_udaf.with_beneficial_ordering(true)?.unwrap();
+
+        assert_eq!(
+            foreign_udaf.order_sensitivity(),
+            AggregateOrderSensitivity::Beneficial
+        );
+
+        let a_field = Arc::new(Field::new("a", DataType::Float64, true));
+        let state_fields = foreign_udaf.state_fields(StateFieldsArgs {
+            name: "a",
+            input_fields: &[Field::new("f", DataType::Float64, true).into()],
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            ordering_fields: &[Arc::clone(&a_field)],
+            is_distinct: false,
+        })?;
+
+        assert_eq!(state_fields.len(), 3);
+        assert_eq!(state_fields[1], a_field);
+        Ok(())
+    }
+
+    #[test]
+    fn test_sliding_accumulator() -> Result<()> {
+        let foreign_udaf = create_test_foreign_udaf(Sum::new())?;
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]);
+        // Note: sum distinct is only support Int64 until now
+        let acc_args = AccumulatorArgs {
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            schema: &schema,
+            expr_fields: &[Field::new("a", DataType::Float64, true).into()],
+            ignore_nulls: true,
+            order_bys: &[PhysicalSortExpr::new_default(col("a", &schema)?)],
+            is_reversed: false,
+            name: "round_trip",
+            is_distinct: false,
+            exprs: &[col("a", &schema)?],
+        };
+
+        let mut accumulator = foreign_udaf.create_sliding_accumulator(acc_args)?;
+        let values = create_array!(Float64, vec![10., 20., 30., 40., 50.]);
+        accumulator.update_batch(&[values])?;
+        let resultant_value = accumulator.evaluate()?;
+        assert_eq!(resultant_value, ScalarValue::Float64(Some(150.)));
+
+        Ok(())
+    }
+
+    fn test_round_trip_order_sensitivity(sensitivity: AggregateOrderSensitivity) {
+        let ffi_sensitivity: FFI_AggregateOrderSensitivity = sensitivity.into();
+        let round_trip_sensitivity: AggregateOrderSensitivity = ffi_sensitivity.into();
+
+        assert_eq!(sensitivity, round_trip_sensitivity);
+    }
+
+    #[test]
+    fn test_round_trip_all_order_sensitivities() {
+        test_round_trip_order_sensitivity(AggregateOrderSensitivity::Insensitive);
+        test_round_trip_order_sensitivity(AggregateOrderSensitivity::HardRequirement);
+        test_round_trip_order_sensitivity(AggregateOrderSensitivity::SoftRequirement);
+        test_round_trip_order_sensitivity(AggregateOrderSensitivity::Beneficial);
+    }
+
+    #[test]
+    fn test_ffi_udaf_local_bypass() -> Result<()> {
+        let original_udaf = Sum::new();
+        let original_udaf = Arc::new(AggregateUDF::from(original_udaf));
+
+        let mut ffi_udaf = FFI_AggregateUDF::from(original_udaf);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_udaf: Arc<dyn AggregateUDFImpl> = (&ffi_udaf).into();
+        assert!(foreign_udaf.as_any().downcast_ref::<Sum>().is_some());
+
+        // Verify different library markers generate foreign providers
+        ffi_udaf.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_udaf: Arc<dyn AggregateUDFImpl> = (&ffi_udaf).into();
+        assert!(
+            foreign_udaf
+                .as_any()
+                .downcast_ref::<ForeignAggregateUDF>()
+                .is_some()
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udf/mod.rs b/datafusion/ffi/src/udf/mod.rs
index 303acc783b2e4..0202fd8bcfe65 100644
--- a/datafusion/ffi/src/udf/mod.rs
+++ b/datafusion/ffi/src/udf/mod.rs
@@ -15,45 +15,42 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::{
-    arrow_wrappers::{WrappedArray, WrappedSchema},
-    df_result, rresult, rresult_return,
-    util::{rvec_wrapped_to_vec_datatype, vec_datatype_to_rvec_wrapped},
-    volatility::FFI_Volatility,
-};
-use abi_stable::{
-    std_types::{RResult, RString, RVec},
-    StableAbi,
-};
+use std::ffi::c_void;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RString, RVec};
+use arrow::array::Array;
 use arrow::datatypes::{DataType, Field};
-use arrow::{
-    array::ArrayRef,
-    error::ArrowError,
-    ffi::{from_ffi, to_ffi, FFI_ArrowSchema},
-};
+use arrow::error::ArrowError;
+use arrow::ffi::{FFI_ArrowSchema, from_ffi, to_ffi};
 use arrow_schema::FieldRef;
-use datafusion::logical_expr::ReturnFieldArgs;
-use datafusion::{
-    error::DataFusionError,
-    logical_expr::type_coercion::functions::data_types_with_scalar_udf,
-};
-use datafusion::{
-    error::Result,
-    logical_expr::{
-        ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
-    },
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, Result, internal_err};
+use datafusion_expr::type_coercion::functions::fields_with_udf;
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
+    Signature,
 };
 use return_type_args::{
     FFI_ReturnFieldArgs, ForeignReturnFieldArgs, ForeignReturnFieldArgsOwned,
 };
-use std::{ffi::c_void, sync::Arc};
+
+use crate::arrow_wrappers::{WrappedArray, WrappedSchema};
+use crate::config::FFI_ConfigOptions;
+use crate::expr::columnar_value::FFI_ColumnarValue;
+use crate::util::{
+    FFIResult, rvec_wrapped_to_vec_datatype, vec_datatype_to_rvec_wrapped,
+};
+use crate::volatility::FFI_Volatility;
+use crate::{df_result, rresult, rresult_return};
 
 pub mod return_type_args;
 
 /// A stable struct for sharing a [`ScalarUDF`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_ScalarUDF {
     /// FFI equivalent to the `name` of a [`ScalarUDF`]
     pub name: RString,
@@ -64,43 +61,34 @@ pub struct FFI_ScalarUDF {
     /// FFI equivalent to the `volatility` of a [`ScalarUDF`]
     pub volatility: FFI_Volatility,
 
-    /// Determines the return type of the underlying [`ScalarUDF`] based on the
-    /// argument types.
-    pub return_type: unsafe extern "C" fn(
-        udf: &Self,
-        arg_types: RVec<WrappedSchema>,
-    ) -> RResult<WrappedSchema, RString>,
-
-    /// Determines the return info of the underlying [`ScalarUDF`]. Either this
-    /// or return_type may be implemented on a UDF.
+    /// Determines the return info of the underlying [`ScalarUDF`].
     pub return_field_from_args: unsafe extern "C" fn(
         udf: &Self,
         args: FFI_ReturnFieldArgs,
-    )
-        -> RResult<WrappedSchema, RString>,
+    ) -> FFIResult<WrappedSchema>,
 
     /// Execute the underlying [`ScalarUDF`] and return the result as a `FFI_ArrowArray`
     /// within an AbiStable wrapper.
-    #[allow(clippy::type_complexity)]
     pub invoke_with_args: unsafe extern "C" fn(
         udf: &Self,
         args: RVec<WrappedArray>,
         arg_fields: RVec<WrappedSchema>,
         num_rows: usize,
         return_field: WrappedSchema,
-    ) -> RResult<WrappedArray, RString>,
+        config_options: FFI_ConfigOptions,
+    ) -> FFIResult<FFI_ColumnarValue>,
 
     /// See [`ScalarUDFImpl`] for details on short_circuits
     pub short_circuits: bool,
 
-    /// Performs type coersion. To simply this interface, all UDFs are treated as having
+    /// Performs type coercion. To simply this interface, all UDFs are treated as having
     /// user defined signatures, which will in turn call coerce_types to be called. This
     /// call should be transparent to most users as the internal function performs the
     /// appropriate calls on the underlying [`ScalarUDF`]
     pub coerce_types: unsafe extern "C" fn(
         udf: &Self,
         arg_types: RVec<WrappedSchema>,
-    ) -> RResult<RVec<WrappedSchema>, RString>,
+    ) -> FFIResult<RVec<WrappedSchema>>,
 
     /// Used to create a clone on the provider of the udf. This should
     /// only need to be called by the receiver of the udf.
@@ -112,6 +100,11 @@ pub struct FFI_ScalarUDF {
     /// Internal data. This is only to be accessed by the provider of the udf.
     /// A [`ForeignScalarUDF`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_ScalarUDF {}
@@ -121,34 +114,22 @@ pub struct ScalarUDFPrivateData {
     pub udf: Arc<ScalarUDF>,
 }
 
-unsafe extern "C" fn return_type_fn_wrapper(
-    udf: &FFI_ScalarUDF,
-    arg_types: RVec<WrappedSchema>,
-) -> RResult<WrappedSchema, RString> {
-    let private_data = udf.private_data as *const ScalarUDFPrivateData;
-    let udf = &(*private_data).udf;
-
-    let arg_types = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types));
-
-    let return_type = udf
-        .return_type(&arg_types)
-        .and_then(|v| FFI_ArrowSchema::try_from(v).map_err(DataFusionError::from))
-        .map(WrappedSchema);
-
-    rresult!(return_type)
+impl FFI_ScalarUDF {
+    fn inner(&self) -> &Arc<ScalarUDF> {
+        let private_data = self.private_data as *const ScalarUDFPrivateData;
+        unsafe { &(*private_data).udf }
+    }
 }
 
 unsafe extern "C" fn return_field_from_args_fn_wrapper(
     udf: &FFI_ScalarUDF,
     args: FFI_ReturnFieldArgs,
-) -> RResult<WrappedSchema, RString> {
-    let private_data = udf.private_data as *const ScalarUDFPrivateData;
-    let udf = &(*private_data).udf;
-
+) -> FFIResult<WrappedSchema> {
     let args: ForeignReturnFieldArgsOwned = rresult_return!((&args).try_into());
     let args_ref: ForeignReturnFieldArgs = (&args).into();
 
     let return_type = udf
+        .inner()
         .return_field_from_args((&args_ref).into())
         .and_then(|f| FFI_ArrowSchema::try_from(&f).map_err(DataFusionError::from))
         .map(WrappedSchema);
@@ -159,13 +140,18 @@ unsafe extern "C" fn return_field_from_args_fn_wrapper(
 unsafe extern "C" fn coerce_types_fn_wrapper(
     udf: &FFI_ScalarUDF,
     arg_types: RVec<WrappedSchema>,
-) -> RResult<RVec<WrappedSchema>, RString> {
-    let private_data = udf.private_data as *const ScalarUDFPrivateData;
-    let udf = &(*private_data).udf;
-
+) -> FFIResult<RVec<WrappedSchema>> {
     let arg_types = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types));
 
-    let return_types = rresult_return!(data_types_with_scalar_udf(&arg_types, udf));
+    let arg_fields = arg_types
+        .iter()
+        .map(|dt| Arc::new(Field::new("f", dt.clone(), true)))
+        .collect::<Vec<_>>();
+    let return_types =
+        rresult_return!(fields_with_udf(&arg_fields, udf.inner().as_ref()))
+            .into_iter()
+            .map(|f| f.data_type().to_owned())
+            .collect::<Vec<_>>();
 
     rresult!(vec_datatype_to_rvec_wrapped(&return_types))
 }
@@ -176,60 +162,64 @@ unsafe extern "C" fn invoke_with_args_fn_wrapper(
     arg_fields: RVec<WrappedSchema>,
     number_rows: usize,
     return_field: WrappedSchema,
-) -> RResult<WrappedArray, RString> {
-    let private_data = udf.private_data as *const ScalarUDFPrivateData;
-    let udf = &(*private_data).udf;
-
-    let args = args
-        .into_iter()
-        .map(|arr| {
-            from_ffi(arr.array, &arr.schema.0)
-                .map(|v| ColumnarValue::Array(arrow::array::make_array(v)))
-        })
-        .collect::<std::result::Result<_, _>>();
+    config_options: FFI_ConfigOptions,
+) -> FFIResult<FFI_ColumnarValue> {
+    unsafe {
+        let args = args
+            .into_iter()
+            .map(|arr| {
+                from_ffi(arr.array, &arr.schema.0)
+                    .map(|v| ColumnarValue::Array(arrow::array::make_array(v)))
+            })
+            .collect::<std::result::Result<_, _>>();
 
-    let args = rresult_return!(args);
-    let return_field = rresult_return!(Field::try_from(&return_field.0)).into();
+        let args = rresult_return!(args);
+        let return_field = rresult_return!(Field::try_from(&return_field.0)).into();
 
-    let arg_fields = arg_fields
-        .into_iter()
-        .map(|wrapped_field| {
-            Field::try_from(&wrapped_field.0)
-                .map(Arc::new)
-                .map_err(DataFusionError::from)
-        })
-        .collect::<Result<Vec<FieldRef>>>();
-    let arg_fields = rresult_return!(arg_fields);
-
-    let args = ScalarFunctionArgs {
-        args,
-        arg_fields,
-        number_rows,
-        return_field,
-    };
-
-    let result = rresult_return!(udf
-        .invoke_with_args(args)
-        .and_then(|r| r.to_array(number_rows)));
-
-    let (result_array, result_schema) = rresult_return!(to_ffi(&result.to_data()));
-
-    RResult::ROk(WrappedArray {
-        array: result_array,
-        schema: WrappedSchema(result_schema),
-    })
+        let arg_fields = arg_fields
+            .into_iter()
+            .map(|wrapped_field| {
+                Field::try_from(&wrapped_field.0)
+                    .map(Arc::new)
+                    .map_err(DataFusionError::from)
+            })
+            .collect::<Result<Vec<FieldRef>>>();
+        let arg_fields = rresult_return!(arg_fields);
+        let config_options = rresult_return!(ConfigOptions::try_from(config_options));
+        let config_options = Arc::new(config_options);
+
+        let args = ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows,
+            return_field,
+            config_options,
+        };
+
+        rresult!(
+            udf.inner()
+                .invoke_with_args(args)
+                .and_then(FFI_ColumnarValue::try_from)
+        )
+    }
 }
 
 unsafe extern "C" fn release_fn_wrapper(udf: &mut FFI_ScalarUDF) {
-    let private_data = Box::from_raw(udf.private_data as *mut ScalarUDFPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!udf.private_data.is_null());
+        let private_data = Box::from_raw(udf.private_data as *mut ScalarUDFPrivateData);
+        drop(private_data);
+        udf.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(udf: &FFI_ScalarUDF) -> FFI_ScalarUDF {
-    let private_data = udf.private_data as *const ScalarUDFPrivateData;
-    let udf_data = &(*private_data);
+    unsafe {
+        let private_data = udf.private_data as *const ScalarUDFPrivateData;
+        let udf_data = &(*private_data);
 
-    Arc::clone(&udf_data.udf).into()
+        Arc::clone(&udf_data.udf).into()
+    }
 }
 
 impl Clone for FFI_ScalarUDF {
@@ -240,6 +230,10 @@ impl Clone for FFI_ScalarUDF {
 
 impl From<Arc<ScalarUDF>> for FFI_ScalarUDF {
     fn from(udf: Arc<ScalarUDF>) -> Self {
+        if let Some(udf) = udf.inner().as_any().downcast_ref::<ForeignScalarUDF>() {
+            return udf.udf.clone();
+        }
+
         let name = udf.name().into();
         let aliases = udf.aliases().iter().map(|a| a.to_owned().into()).collect();
         let volatility = udf.signature().volatility.into();
@@ -253,12 +247,12 @@ impl From<Arc<ScalarUDF>> for FFI_ScalarUDF {
             volatility,
             short_circuits,
             invoke_with_args: invoke_with_args_fn_wrapper,
-            return_type: return_type_fn_wrapper,
             return_field_from_args: return_field_from_args_fn_wrapper,
             coerce_types: coerce_types_fn_wrapper,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -286,21 +280,54 @@ pub struct ForeignScalarUDF {
 unsafe impl Send for ForeignScalarUDF {}
 unsafe impl Sync for ForeignScalarUDF {}
 
-impl TryFrom<&FFI_ScalarUDF> for ForeignScalarUDF {
-    type Error = DataFusionError;
-
-    fn try_from(udf: &FFI_ScalarUDF) -> Result<Self, Self::Error> {
-        let name = udf.name.to_owned().into();
-        let signature = Signature::user_defined((&udf.volatility).into());
-
-        let aliases = udf.aliases.iter().map(|s| s.to_string()).collect();
+impl PartialEq for ForeignScalarUDF {
+    fn eq(&self, other: &Self) -> bool {
+        let Self {
+            name,
+            aliases,
+            udf,
+            signature,
+        } = self;
+        name == &other.name
+            && aliases == &other.aliases
+            && std::ptr::eq(udf, &other.udf)
+            && signature == &other.signature
+    }
+}
+impl Eq for ForeignScalarUDF {}
 
-        Ok(Self {
+impl Hash for ForeignScalarUDF {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
             name,
-            udf: udf.clone(),
             aliases,
+            udf,
             signature,
-        })
+        } = self;
+        name.hash(state);
+        aliases.hash(state);
+        std::ptr::hash(udf, state);
+        signature.hash(state);
+    }
+}
+
+impl From<&FFI_ScalarUDF> for Arc<dyn ScalarUDFImpl> {
+    fn from(udf: &FFI_ScalarUDF) -> Self {
+        if (udf.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(udf.inner().inner())
+        } else {
+            let name = udf.name.to_owned().into();
+            let signature = Signature::user_defined((&udf.volatility).into());
+
+            let aliases = udf.aliases.iter().map(|s| s.to_string()).collect();
+
+            Arc::new(ForeignScalarUDF {
+                name,
+                udf: udf.clone(),
+                aliases,
+                signature,
+            })
+        }
     }
 }
 
@@ -317,14 +344,8 @@ impl ScalarUDFImpl for ForeignScalarUDF {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        let arg_types = vec_datatype_to_rvec_wrapped(arg_types)?;
-
-        let result = unsafe { (self.udf.return_type)(&self.udf, arg_types) };
-
-        let result = df_result!(result);
-
-        result.and_then(|r| (&r.0).try_into().map_err(DataFusionError::from))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("ForeignScalarUDF implements return_field_from_args instead.")
     }
 
     fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
@@ -347,6 +368,7 @@ impl ScalarUDFImpl for ForeignScalarUDF {
             arg_fields,
             number_rows,
             return_field,
+            config_options,
         } = invoke_args;
 
         let args = args
@@ -375,6 +397,7 @@ impl ScalarUDFImpl for ForeignScalarUDF {
 
         let return_field = return_field.as_ref().clone();
         let return_field = WrappedSchema(FFI_ArrowSchema::try_from(return_field)?);
+        let config_options = config_options.as_ref().into();
 
         let result = unsafe {
             (self.udf.invoke_with_args)(
@@ -383,13 +406,12 @@ impl ScalarUDFImpl for ForeignScalarUDF {
                 arg_fields,
                 number_rows,
                 return_field,
+                config_options,
             )
         };
 
         let result = df_result!(result)?;
-        let result_array: ArrayRef = result.try_into()?;
-
-        Ok(ColumnarValue::Array(result_array))
+        result.try_into()
     }
 
     fn aliases(&self) -> &[String] {
@@ -418,12 +440,38 @@ mod tests {
         let original_udf = datafusion::functions::math::abs::AbsFunc::new();
         let original_udf = Arc::new(ScalarUDF::from(original_udf));
 
-        let local_udf: FFI_ScalarUDF = Arc::clone(&original_udf).into();
+        let mut local_udf: FFI_ScalarUDF = Arc::clone(&original_udf).into();
+        local_udf.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_udf: ForeignScalarUDF = (&local_udf).try_into()?;
+        let foreign_udf: Arc<dyn ScalarUDFImpl> = (&local_udf).into();
 
         assert_eq!(original_udf.name(), foreign_udf.name());
 
         Ok(())
     }
+
+    #[test]
+    fn test_ffi_udf_local_bypass() -> Result<()> {
+        use datafusion::functions::math::abs::AbsFunc;
+        let original_udf = AbsFunc::new();
+        let original_udf = Arc::new(ScalarUDF::from(original_udf));
+
+        let mut ffi_udf = FFI_ScalarUDF::from(original_udf);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_udf: Arc<dyn ScalarUDFImpl> = (&ffi_udf).into();
+        assert!(foreign_udf.as_any().downcast_ref::<AbsFunc>().is_some());
+
+        // Verify different library markers generate foreign providers
+        ffi_udf.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_udf: Arc<dyn ScalarUDFImpl> = (&ffi_udf).into();
+        assert!(
+            foreign_udf
+                .as_any()
+                .downcast_ref::<ForeignScalarUDF>()
+                .is_some()
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/ffi/src/udf/return_type_args.rs b/datafusion/ffi/src/udf/return_type_args.rs
index c437c9537be6f..8fb015b7ed922 100644
--- a/datafusion/ffi/src/udf/return_type_args.rs
+++ b/datafusion/ffi/src/udf/return_type_args.rs
@@ -15,24 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use abi_stable::{
-    std_types::{ROption, RVec},
-    StableAbi,
-};
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RVec};
 use arrow_schema::FieldRef;
-use datafusion::{
-    common::exec_datafusion_err, error::DataFusionError, logical_expr::ReturnFieldArgs,
-    scalar::ScalarValue,
-};
+use datafusion_common::scalar::ScalarValue;
+use datafusion_common::{DataFusionError, ffi_datafusion_err};
+use datafusion_expr::ReturnFieldArgs;
+use prost::Message;
 
 use crate::arrow_wrappers::WrappedSchema;
 use crate::util::{rvec_wrapped_to_vec_fieldref, vec_fieldref_to_rvec_wrapped};
-use prost::Message;
 
 /// A stable struct for sharing a [`ReturnFieldArgs`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_ReturnFieldArgs {
     arg_fields: RVec<WrappedSchema>,
     scalar_arguments: RVec<ROption<RVec<u8>>>,
@@ -91,7 +87,7 @@ impl TryFrom<&FFI_ReturnFieldArgs> for ForeignReturnFieldArgsOwned {
                 let maybe_arg = maybe_arg.as_ref().map(|arg| {
                     let proto_value =
                         datafusion_proto::protobuf::ScalarValue::decode(arg.as_ref())
-                            .map_err(|err| exec_datafusion_err!("{}", err))?;
+                            .map_err(|err| ffi_datafusion_err!("{}", err))?;
                     let scalar_value: ScalarValue = (&proto_value).try_into()?;
                     Ok(scalar_value)
                 });
diff --git a/datafusion/ffi/src/udtf.rs b/datafusion/ffi/src/udtf.rs
index 08bc4d0cd83b4..35c13c1169c72 100644
--- a/datafusion/ffi/src/udtf.rs
+++ b/datafusion/ffi/src/udtf.rs
@@ -15,43 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{ffi::c_void, sync::Arc};
-
-use abi_stable::{
-    std_types::{RResult, RString, RVec},
-    StableAbi,
-};
-
-use datafusion::error::Result;
-use datafusion::{
-    catalog::{TableFunctionImpl, TableProvider},
-    prelude::{Expr, SessionContext},
-};
-use datafusion_proto::{
-    logical_plan::{
-        from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec,
-    },
-    protobuf::LogicalExprList,
+use std::any::Any;
+use std::ffi::c_void;
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RVec};
+use datafusion_catalog::{TableFunctionImpl, TableProvider};
+use datafusion_common::error::Result;
+use datafusion_execution::TaskContext;
+use datafusion_expr::Expr;
+use datafusion_proto::logical_plan::from_proto::parse_exprs;
+use datafusion_proto::logical_plan::to_proto::serialize_exprs;
+use datafusion_proto::logical_plan::{
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
+use datafusion_proto::protobuf::LogicalExprList;
 use prost::Message;
 use tokio::runtime::Handle;
 
-use crate::{
-    df_result, rresult_return,
-    table_provider::{FFI_TableProvider, ForeignTableProvider},
-};
+use crate::execution::FFI_TaskContextProvider;
+use crate::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use crate::table_provider::FFI_TableProvider;
+use crate::util::FFIResult;
+use crate::{df_result, rresult_return};
 
 /// A stable struct for sharing a [`TableFunctionImpl`] across FFI boundaries.
 #[repr(C)]
 #[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
 pub struct FFI_TableFunction {
     /// Equivalent to the `call` function of the TableFunctionImpl.
     /// The arguments are Expr passed as protobuf encoded bytes.
-    pub call: unsafe extern "C" fn(
-        udtf: &Self,
-        args: RVec<u8>,
-    ) -> RResult<FFI_TableProvider, RString>,
+    pub call:
+        unsafe extern "C" fn(udtf: &Self, args: RVec<u8>) -> FFIResult<FFI_TableProvider>,
+
+    pub logical_codec: FFI_LogicalExtensionCodec,
 
     /// Used to create a clone on the provider of the udtf. This should
     /// only need to be called by the receiver of the udtf.
@@ -63,6 +61,11 @@ pub struct FFI_TableFunction {
     /// Internal data. This is only to be accessed by the provider of the udtf.
     /// A [`ForeignTableFunction`] should never attempt to access this data.
     pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
 }
 
 unsafe impl Send for FFI_TableFunction {}
@@ -88,32 +91,50 @@ impl FFI_TableFunction {
 unsafe extern "C" fn call_fn_wrapper(
     udtf: &FFI_TableFunction,
     args: RVec<u8>,
-) -> RResult<FFI_TableProvider, RString> {
+) -> FFIResult<FFI_TableProvider> {
     let runtime = udtf.runtime();
-    let udtf = udtf.inner();
+    let udtf_inner = udtf.inner();
 
-    let default_ctx = SessionContext::new();
-    let codec = DefaultLogicalExtensionCodec {};
+    let ctx: Arc<TaskContext> =
+        rresult_return!((&udtf.logical_codec.task_ctx_provider).try_into());
+    let codec: Arc<dyn LogicalExtensionCodec> = (&udtf.logical_codec).into();
 
     let proto_filters = rresult_return!(LogicalExprList::decode(args.as_ref()));
 
-    let args =
-        rresult_return!(parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec));
-
-    let table_provider = rresult_return!(udtf.call(&args));
-    RResult::ROk(FFI_TableProvider::new(table_provider, false, runtime))
+    let args = rresult_return!(parse_exprs(
+        proto_filters.expr.iter(),
+        ctx.as_ref(),
+        codec.as_ref()
+    ));
+
+    let table_provider = rresult_return!(udtf_inner.call(&args));
+    RResult::ROk(FFI_TableProvider::new_with_ffi_codec(
+        table_provider,
+        false,
+        runtime,
+        udtf.logical_codec.clone(),
+    ))
 }
 
 unsafe extern "C" fn release_fn_wrapper(udtf: &mut FFI_TableFunction) {
-    let private_data = Box::from_raw(udtf.private_data as *mut TableFunctionPrivateData);
-    drop(private_data);
+    unsafe {
+        debug_assert!(!udtf.private_data.is_null());
+        let private_data =
+            Box::from_raw(udtf.private_data as *mut TableFunctionPrivateData);
+        drop(private_data);
+        udtf.private_data = std::ptr::null_mut();
+    }
 }
 
 unsafe extern "C" fn clone_fn_wrapper(udtf: &FFI_TableFunction) -> FFI_TableFunction {
     let runtime = udtf.runtime();
-    let udtf = udtf.inner();
+    let udtf_inner = udtf.inner();
 
-    FFI_TableFunction::new(Arc::clone(udtf), runtime)
+    FFI_TableFunction::new_with_ffi_codec(
+        Arc::clone(udtf_inner),
+        runtime,
+        udtf.logical_codec.clone(),
+    )
 }
 
 impl Clone for FFI_TableFunction {
@@ -123,30 +144,44 @@ impl Clone for FFI_TableFunction {
 }
 
 impl FFI_TableFunction {
-    pub fn new(udtf: Arc<dyn TableFunctionImpl>, runtime: Option<Handle>) -> Self {
-        let private_data = Box::new(TableFunctionPrivateData { udtf, runtime });
+    pub fn new(
+        udtf: Arc<dyn TableFunctionImpl>,
+        runtime: Option<Handle>,
+        task_ctx_provider: impl Into<FFI_TaskContextProvider>,
+        logical_codec: Option<Arc<dyn LogicalExtensionCodec>>,
+    ) -> Self {
+        let task_ctx_provider = task_ctx_provider.into();
+        let logical_codec =
+            logical_codec.unwrap_or_else(|| Arc::new(DefaultLogicalExtensionCodec {}));
+        let logical_codec = FFI_LogicalExtensionCodec::new(
+            logical_codec,
+            runtime.clone(),
+            task_ctx_provider.clone(),
+        );
+
+        Self::new_with_ffi_codec(udtf, runtime, logical_codec)
+    }
 
-        Self {
-            call: call_fn_wrapper,
-            clone: clone_fn_wrapper,
-            release: release_fn_wrapper,
-            private_data: Box::into_raw(private_data) as *mut c_void,
+    pub fn new_with_ffi_codec(
+        udtf: Arc<dyn TableFunctionImpl>,
+        runtime: Option<Handle>,
+        logical_codec: FFI_LogicalExtensionCodec,
+    ) -> Self {
+        if let Some(udtf) =
+            (Arc::clone(&udtf) as Arc<dyn Any>).downcast_ref::<ForeignTableFunction>()
+        {
+            return udtf.0.clone();
         }
-    }
-}
 
-impl From<Arc<dyn TableFunctionImpl>> for FFI_TableFunction {
-    fn from(udtf: Arc<dyn TableFunctionImpl>) -> Self {
-        let private_data = Box::new(TableFunctionPrivateData {
-            udtf,
-            runtime: None,
-        });
+        let private_data = Box::new(TableFunctionPrivateData { udtf, runtime });
 
         Self {
             call: call_fn_wrapper,
+            logical_codec,
             clone: clone_fn_wrapper,
             release: release_fn_wrapper,
             private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
         }
     }
 }
@@ -169,40 +204,45 @@ pub struct ForeignTableFunction(FFI_TableFunction);
 unsafe impl Send for ForeignTableFunction {}
 unsafe impl Sync for ForeignTableFunction {}
 
-impl From<FFI_TableFunction> for ForeignTableFunction {
+impl From<FFI_TableFunction> for Arc<dyn TableFunctionImpl> {
     fn from(value: FFI_TableFunction) -> Self {
-        Self(value)
+        if (value.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(value.inner())
+        } else {
+            Arc::new(ForeignTableFunction(value))
+        }
     }
 }
 
 impl TableFunctionImpl for ForeignTableFunction {
     fn call(&self, args: &[Expr]) -> Result<Arc<dyn TableProvider>> {
-        let codec = DefaultLogicalExtensionCodec {};
+        let codec: Arc<dyn LogicalExtensionCodec> = (&self.0.logical_codec).into();
         let expr_list = LogicalExprList {
-            expr: serialize_exprs(args, &codec)?,
+            expr: serialize_exprs(args, codec.as_ref())?,
         };
         let filters_serialized = expr_list.encode_to_vec().into();
 
         let table_provider = unsafe { (self.0.call)(&self.0, filters_serialized) };
 
         let table_provider = df_result!(table_provider)?;
-        let table_provider: ForeignTableProvider = (&table_provider).into();
+        let table_provider: Arc<dyn TableProvider> = (&table_provider).into();
 
-        Ok(Arc::new(table_provider))
+        Ok(table_provider)
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use arrow::{
-        array::{
-            record_batch, ArrayRef, Float64Array, RecordBatch, StringArray, UInt64Array,
-        },
-        datatypes::{DataType, Field, Schema},
-    };
-    use datafusion::{
-        catalog::MemTable, common::exec_err, prelude::lit, scalar::ScalarValue,
+    use arrow::array::{
+        ArrayRef, Float64Array, RecordBatch, StringArray, UInt64Array, record_batch,
     };
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::catalog::MemTable;
+    use datafusion::common::exec_err;
+    use datafusion::logical_expr::ptr_eq::arc_ptr_eq;
+    use datafusion::prelude::{SessionContext, lit};
+    use datafusion::scalar::ScalarValue;
+    use datafusion_execution::TaskContextProvider;
 
     use super::*;
 
@@ -214,7 +254,7 @@ mod tests {
             let args = args
                 .iter()
                 .map(|arg| {
-                    if let Expr::Literal(scalar) = arg {
+                    if let Expr::Literal(scalar, _) = arg {
                         Ok(scalar)
                     } else {
                         exec_err!("Expected only literal arguments to table udf")
@@ -287,16 +327,22 @@ mod tests {
     #[tokio::test]
     async fn test_round_trip_udtf() -> Result<()> {
         let original_udtf = Arc::new(TestUDTF {}) as Arc<dyn TableFunctionImpl>;
+        let ctx = Arc::new(SessionContext::default());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
 
-        let local_udtf: FFI_TableFunction =
-            FFI_TableFunction::new(Arc::clone(&original_udtf), None);
+        let mut local_udtf: FFI_TableFunction = FFI_TableFunction::new(
+            Arc::clone(&original_udtf),
+            None,
+            task_ctx_provider,
+            None,
+        );
+        local_udtf.library_marker_id = crate::mock_foreign_marker_id;
 
-        let foreign_udf: ForeignTableFunction = local_udtf.into();
+        let foreign_udf: Arc<dyn TableFunctionImpl> = local_udtf.into();
 
-        let table =
-            foreign_udf.call(&vec![lit(6_u64), lit("one"), lit(2.0), lit(3_u64)])?;
+        let table = foreign_udf.call(&[lit(6_u64), lit("one"), lit(2.0), lit(3_u64)])?;
 
-        let ctx = SessionContext::default();
         let _ = ctx.register_table("test-table", table)?;
 
         let returned_batches = ctx.table("test-table").await?.collect().await?;
@@ -318,4 +364,29 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_ffi_udtf_local_bypass() -> Result<()> {
+        let original_udtf = Arc::new(TestUDTF {}) as Arc<dyn TableFunctionImpl>;
+
+        let ctx = Arc::new(SessionContext::default()) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&ctx);
+        let mut ffi_udtf = FFI_TableFunction::new(
+            Arc::clone(&original_udtf),
+            None,
+            task_ctx_provider,
+            None,
+        );
+
+        // Verify local libraries can be downcast to their original
+        let foreign_udtf: Arc<dyn TableFunctionImpl> = ffi_udtf.clone().into();
+        assert!(arc_ptr_eq(&original_udtf, &foreign_udtf));
+
+        // Verify different library markers generate foreign providers
+        ffi_udtf.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_udtf: Arc<dyn TableFunctionImpl> = ffi_udtf.into();
+        assert!(!arc_ptr_eq(&original_udtf, &foreign_udtf));
+
+        Ok(())
+    }
 }
diff --git a/datafusion/ffi/src/udwf/mod.rs b/datafusion/ffi/src/udwf/mod.rs
new file mode 100644
index 0000000000000..2e4bd0a294fd0
--- /dev/null
+++ b/datafusion/ffi/src/udwf/mod.rs
@@ -0,0 +1,498 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ffi::c_void;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{ROption, RResult, RString, RVec};
+use arrow::compute::SortOptions;
+use arrow::datatypes::{DataType, Schema, SchemaRef};
+use arrow_schema::{Field, FieldRef};
+use datafusion_common::{Result, ffi_err};
+use datafusion_expr::function::WindowUDFFieldArgs;
+use datafusion_expr::type_coercion::functions::fields_with_udf;
+use datafusion_expr::{
+    LimitEffect, PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl,
+};
+use datafusion_physical_expr::PhysicalExpr;
+use partition_evaluator::FFI_PartitionEvaluator;
+use partition_evaluator_args::{
+    FFI_PartitionEvaluatorArgs, ForeignPartitionEvaluatorArgs,
+};
+
+mod partition_evaluator;
+mod partition_evaluator_args;
+mod range;
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::util::{
+    FFIResult, rvec_wrapped_to_vec_datatype, rvec_wrapped_to_vec_fieldref,
+    vec_datatype_to_rvec_wrapped, vec_fieldref_to_rvec_wrapped,
+};
+use crate::volatility::FFI_Volatility;
+use crate::{df_result, rresult, rresult_return};
+
+/// A stable struct for sharing a [`WindowUDF`] across FFI boundaries.
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_WindowUDF {
+    /// FFI equivalent to the `name` of a [`WindowUDF`]
+    pub name: RString,
+
+    /// FFI equivalent to the `aliases` of a [`WindowUDF`]
+    pub aliases: RVec<RString>,
+
+    /// FFI equivalent to the `volatility` of a [`WindowUDF`]
+    pub volatility: FFI_Volatility,
+
+    pub partition_evaluator: unsafe extern "C" fn(
+        udwf: &Self,
+        args: FFI_PartitionEvaluatorArgs,
+    )
+        -> FFIResult<FFI_PartitionEvaluator>,
+
+    pub field: unsafe extern "C" fn(
+        udwf: &Self,
+        input_types: RVec<WrappedSchema>,
+        display_name: RString,
+    ) -> FFIResult<WrappedSchema>,
+
+    /// Performs type coercion. To simply this interface, all UDFs are treated as having
+    /// user defined signatures, which will in turn call coerce_types to be called. This
+    /// call should be transparent to most users as the internal function performs the
+    /// appropriate calls on the underlying [`WindowUDF`]
+    pub coerce_types: unsafe extern "C" fn(
+        udf: &Self,
+        arg_types: RVec<WrappedSchema>,
+    ) -> FFIResult<RVec<WrappedSchema>>,
+
+    pub sort_options: ROption<FFI_SortOptions>,
+
+    /// Used to create a clone on the provider of the udf. This should
+    /// only need to be called by the receiver of the udf.
+    pub clone: unsafe extern "C" fn(udf: &Self) -> Self,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(udf: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the udf.
+    /// A [`ForeignWindowUDF`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_WindowUDF {}
+unsafe impl Sync for FFI_WindowUDF {}
+
+pub struct WindowUDFPrivateData {
+    pub udf: Arc<WindowUDF>,
+}
+
+impl FFI_WindowUDF {
+    unsafe fn inner(&self) -> &Arc<WindowUDF> {
+        unsafe {
+            let private_data = self.private_data as *const WindowUDFPrivateData;
+            &(*private_data).udf
+        }
+    }
+}
+
+unsafe extern "C" fn partition_evaluator_fn_wrapper(
+    udwf: &FFI_WindowUDF,
+    args: FFI_PartitionEvaluatorArgs,
+) -> FFIResult<FFI_PartitionEvaluator> {
+    unsafe {
+        let inner = udwf.inner();
+
+        let args = rresult_return!(ForeignPartitionEvaluatorArgs::try_from(args));
+
+        let evaluator =
+            rresult_return!(inner.partition_evaluator_factory((&args).into()));
+
+        RResult::ROk(evaluator.into())
+    }
+}
+
+unsafe extern "C" fn field_fn_wrapper(
+    udwf: &FFI_WindowUDF,
+    input_fields: RVec<WrappedSchema>,
+    display_name: RString,
+) -> FFIResult<WrappedSchema> {
+    unsafe {
+        let inner = udwf.inner();
+
+        let input_fields = rresult_return!(rvec_wrapped_to_vec_fieldref(&input_fields));
+
+        let field = rresult_return!(inner.field(WindowUDFFieldArgs::new(
+            &input_fields,
+            display_name.as_str()
+        )));
+
+        let schema = Arc::new(Schema::new(vec![field]));
+
+        RResult::ROk(WrappedSchema::from(schema))
+    }
+}
+
+unsafe extern "C" fn coerce_types_fn_wrapper(
+    udwf: &FFI_WindowUDF,
+    arg_types: RVec<WrappedSchema>,
+) -> FFIResult<RVec<WrappedSchema>> {
+    unsafe {
+        let inner = udwf.inner();
+
+        let arg_fields = rresult_return!(rvec_wrapped_to_vec_datatype(&arg_types))
+            .into_iter()
+            .map(|dt| Field::new("f", dt, false))
+            .map(Arc::new)
+            .collect::<Vec<_>>();
+
+        let return_fields = rresult_return!(fields_with_udf(&arg_fields, inner.as_ref()));
+        let return_types = return_fields
+            .into_iter()
+            .map(|f| f.data_type().to_owned())
+            .collect::<Vec<_>>();
+
+        rresult!(vec_datatype_to_rvec_wrapped(&return_types))
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(udwf: &mut FFI_WindowUDF) {
+    unsafe {
+        debug_assert!(!udwf.private_data.is_null());
+        let private_data = Box::from_raw(udwf.private_data as *mut WindowUDFPrivateData);
+        drop(private_data);
+        udwf.private_data = std::ptr::null_mut();
+    }
+}
+
+unsafe extern "C" fn clone_fn_wrapper(udwf: &FFI_WindowUDF) -> FFI_WindowUDF {
+    unsafe {
+        // let private_data = udf.private_data as *const WindowUDFPrivateData;
+        // let udf_data = &(*private_data);
+
+        // let private_data = Box::new(WindowUDFPrivateData {
+        //     udf: Arc::clone(&udf_data.udf),
+        // });
+        let private_data = Box::new(WindowUDFPrivateData {
+            udf: Arc::clone(udwf.inner()),
+        });
+
+        FFI_WindowUDF {
+            name: udwf.name.clone(),
+            aliases: udwf.aliases.clone(),
+            volatility: udwf.volatility.clone(),
+            partition_evaluator: partition_evaluator_fn_wrapper,
+            sort_options: udwf.sort_options.clone(),
+            coerce_types: coerce_types_fn_wrapper,
+            field: field_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Clone for FFI_WindowUDF {
+    fn clone(&self) -> Self {
+        unsafe { (self.clone)(self) }
+    }
+}
+
+impl From<Arc<WindowUDF>> for FFI_WindowUDF {
+    fn from(udf: Arc<WindowUDF>) -> Self {
+        if let Some(udwf) = udf.inner().as_any().downcast_ref::<ForeignWindowUDF>() {
+            return udwf.udf.clone();
+        }
+
+        let name = udf.name().into();
+        let aliases = udf.aliases().iter().map(|a| a.to_owned().into()).collect();
+        let volatility = udf.signature().volatility.into();
+        let sort_options = udf.sort_options().map(|v| (&v).into()).into();
+
+        let private_data = Box::new(WindowUDFPrivateData { udf });
+
+        Self {
+            name,
+            aliases,
+            volatility,
+            partition_evaluator: partition_evaluator_fn_wrapper,
+            sort_options,
+            coerce_types: coerce_types_fn_wrapper,
+            field: field_fn_wrapper,
+            clone: clone_fn_wrapper,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(private_data) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_WindowUDF {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+/// This struct is used to access an UDF provided by a foreign
+/// library across a FFI boundary.
+///
+/// The ForeignWindowUDF is to be used by the caller of the UDF, so it has
+/// no knowledge or access to the private data. All interaction with the UDF
+/// must occur through the functions defined in FFI_WindowUDF.
+#[derive(Debug)]
+pub struct ForeignWindowUDF {
+    name: String,
+    aliases: Vec<String>,
+    udf: FFI_WindowUDF,
+    signature: Signature,
+}
+
+unsafe impl Send for ForeignWindowUDF {}
+unsafe impl Sync for ForeignWindowUDF {}
+
+impl PartialEq for ForeignWindowUDF {
+    fn eq(&self, other: &Self) -> bool {
+        // FFI_WindowUDF cannot be compared, so identity equality is the best we can do.
+        std::ptr::eq(self, other)
+    }
+}
+impl Eq for ForeignWindowUDF {}
+impl Hash for ForeignWindowUDF {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        std::ptr::hash(self, state)
+    }
+}
+
+impl From<&FFI_WindowUDF> for Arc<dyn WindowUDFImpl> {
+    fn from(udf: &FFI_WindowUDF) -> Self {
+        if (udf.library_marker_id)() == crate::get_library_marker_id() {
+            Arc::clone(unsafe { udf.inner().inner() })
+        } else {
+            let name = udf.name.to_owned().into();
+            let signature = Signature::user_defined((&udf.volatility).into());
+
+            let aliases = udf.aliases.iter().map(|s| s.to_string()).collect();
+
+            Arc::new(ForeignWindowUDF {
+                name,
+                udf: udf.clone(),
+                aliases,
+                signature,
+            })
+        }
+    }
+}
+
+impl WindowUDFImpl for ForeignWindowUDF {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        unsafe {
+            let arg_types = vec_datatype_to_rvec_wrapped(arg_types)?;
+            let result_types = df_result!((self.udf.coerce_types)(&self.udf, arg_types))?;
+            Ok(rvec_wrapped_to_vec_datatype(&result_types)?)
+        }
+    }
+
+    fn partition_evaluator(
+        &self,
+        args: datafusion_expr::function::PartitionEvaluatorArgs,
+    ) -> Result<Box<dyn PartitionEvaluator>> {
+        let evaluator = unsafe {
+            let args = FFI_PartitionEvaluatorArgs::try_from(args)?;
+            (self.udf.partition_evaluator)(&self.udf, args)
+        };
+
+        df_result!(evaluator).map(<Box<dyn PartitionEvaluator>>::from)
+    }
+
+    fn field(&self, field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
+        unsafe {
+            let input_types = vec_fieldref_to_rvec_wrapped(field_args.input_fields())?;
+            let schema = df_result!((self.udf.field)(
+                &self.udf,
+                input_types,
+                field_args.name().into()
+            ))?;
+            let schema: SchemaRef = schema.into();
+
+            match schema.fields().is_empty() {
+                true => ffi_err!(
+                    "Unable to retrieve field in WindowUDF via FFI - schema has no fields"
+                ),
+                false => Ok(schema.field(0).to_owned().into()),
+            }
+        }
+    }
+
+    fn sort_options(&self) -> Option<SortOptions> {
+        let options: Option<&FFI_SortOptions> = self.udf.sort_options.as_ref().into();
+        options.map(|s| s.into())
+    }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, StableAbi, Clone)]
+pub struct FFI_SortOptions {
+    pub descending: bool,
+    pub nulls_first: bool,
+}
+
+impl From<&SortOptions> for FFI_SortOptions {
+    fn from(value: &SortOptions) -> Self {
+        Self {
+            descending: value.descending,
+            nulls_first: value.nulls_first,
+        }
+    }
+}
+
+impl From<&FFI_SortOptions> for SortOptions {
+    fn from(value: &FFI_SortOptions) -> Self {
+        Self {
+            descending: value.descending,
+            nulls_first: value.nulls_first,
+        }
+    }
+}
+
+#[cfg(test)]
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::{ArrayRef, create_array};
+    use datafusion::functions_window::lead_lag::{WindowShift, lag_udwf};
+    use datafusion::logical_expr::expr::Sort;
+    use datafusion::logical_expr::{ExprFunctionExt, WindowUDF, WindowUDFImpl, col};
+    use datafusion::prelude::SessionContext;
+
+    use crate::tests::create_record_batch;
+    use crate::udwf::{FFI_WindowUDF, ForeignWindowUDF};
+
+    fn create_test_foreign_udwf(
+        original_udwf: impl WindowUDFImpl + 'static,
+    ) -> datafusion::common::Result<WindowUDF> {
+        let original_udwf = Arc::new(WindowUDF::from(original_udwf));
+
+        let mut local_udwf: FFI_WindowUDF = Arc::clone(&original_udwf).into();
+        local_udwf.library_marker_id = crate::mock_foreign_marker_id;
+
+        let foreign_udwf: Arc<dyn WindowUDFImpl> = (&local_udwf).into();
+        Ok(WindowUDF::new_from_shared_impl(foreign_udwf))
+    }
+
+    #[test]
+    fn test_round_trip_udwf() -> datafusion::common::Result<()> {
+        let original_udwf = lag_udwf();
+        let original_name = original_udwf.name().to_owned();
+
+        // Convert to FFI format
+        let mut local_udwf: FFI_WindowUDF = Arc::clone(&original_udwf).into();
+        local_udwf.library_marker_id = crate::mock_foreign_marker_id;
+
+        // Convert back to native format
+        let foreign_udwf: Arc<dyn WindowUDFImpl> = (&local_udwf).into();
+        let foreign_udwf = WindowUDF::new_from_shared_impl(foreign_udwf);
+
+        assert_eq!(original_name, foreign_udwf.name());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_lag_udwf() -> datafusion::common::Result<()> {
+        let udwf = create_test_foreign_udwf(WindowShift::lag())?;
+
+        let ctx = SessionContext::default();
+        let df = ctx.read_batch(create_record_batch(-5, 5))?;
+
+        let df = df.select(vec![
+            col("a"),
+            udwf.call(vec![col("a")])
+                .order_by(vec![Sort::new(col("a"), true, true)])
+                .build()
+                .unwrap()
+                .alias("lag_a"),
+        ])?;
+
+        df.clone().show().await?;
+
+        let result = df.collect().await?;
+        let expected =
+            create_array!(Int32, [None, Some(-5), Some(-4), Some(-3), Some(-2)])
+                as ArrayRef;
+
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].column(1), &expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ffi_udwf_local_bypass() -> datafusion_common::Result<()> {
+        let original_udwf = Arc::new(WindowUDF::from(WindowShift::lag()));
+
+        let mut ffi_udwf = FFI_WindowUDF::from(original_udwf);
+
+        // Verify local libraries can be downcast to their original
+        let foreign_udwf: Arc<dyn WindowUDFImpl> = (&ffi_udwf).into();
+        assert!(
+            foreign_udwf
+                .as_any()
+                .downcast_ref::<WindowShift>()
+                .is_some()
+        );
+
+        // Verify different library markers generate foreign providers
+        ffi_udwf.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_udwf: Arc<dyn WindowUDFImpl> = (&ffi_udwf).into();
+        assert!(
+            foreign_udwf
+                .as_any()
+                .downcast_ref::<ForeignWindowUDF>()
+                .is_some()
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udwf/partition_evaluator.rs b/datafusion/ffi/src/udwf/partition_evaluator.rs
new file mode 100644
index 0000000000000..6820c6e335dd6
--- /dev/null
+++ b/datafusion/ffi/src/udwf/partition_evaluator.rs
@@ -0,0 +1,416 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ffi::c_void;
+use std::ops::Range;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::{RResult, RVec};
+use arrow::array::ArrayRef;
+use arrow::error::ArrowError;
+use datafusion_common::scalar::ScalarValue;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::PartitionEvaluator;
+use datafusion_expr::window_state::WindowAggState;
+use prost::Message;
+
+use super::range::FFI_Range;
+use crate::arrow_wrappers::WrappedArray;
+use crate::util::FFIResult;
+use crate::{df_result, rresult, rresult_return};
+
+/// A stable struct for sharing [`PartitionEvaluator`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding function
+/// defined in [`PartitionEvaluator`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_PartitionEvaluator {
+    pub evaluate_all: unsafe extern "C" fn(
+        evaluator: &mut Self,
+        values: RVec<WrappedArray>,
+        num_rows: usize,
+    ) -> FFIResult<WrappedArray>,
+
+    pub evaluate: unsafe extern "C" fn(
+        evaluator: &mut Self,
+        values: RVec<WrappedArray>,
+        range: FFI_Range,
+    ) -> FFIResult<RVec<u8>>,
+
+    pub evaluate_all_with_rank: unsafe extern "C" fn(
+        evaluator: &Self,
+        num_rows: usize,
+        ranks_in_partition: RVec<FFI_Range>,
+    ) -> FFIResult<WrappedArray>,
+
+    pub get_range: unsafe extern "C" fn(
+        evaluator: &Self,
+        idx: usize,
+        n_rows: usize,
+    ) -> FFIResult<FFI_Range>,
+
+    pub is_causal: bool,
+
+    pub supports_bounded_execution: bool,
+    pub uses_window_frame: bool,
+    pub include_rank: bool,
+
+    /// Release the memory of the private data when it is no longer being used.
+    pub release: unsafe extern "C" fn(evaluator: &mut Self),
+
+    /// Internal data. This is only to be accessed by the provider of the evaluator.
+    /// A [`ForeignPartitionEvaluator`] should never attempt to access this data.
+    pub private_data: *mut c_void,
+
+    /// Utility to identify when FFI objects are accessed locally through
+    /// the foreign interface. See [`crate::get_library_marker_id`] and
+    /// the crate's `README.md` for more information.
+    pub library_marker_id: extern "C" fn() -> usize,
+}
+
+unsafe impl Send for FFI_PartitionEvaluator {}
+unsafe impl Sync for FFI_PartitionEvaluator {}
+
+pub struct PartitionEvaluatorPrivateData {
+    pub evaluator: Box<dyn PartitionEvaluator>,
+}
+
+impl FFI_PartitionEvaluator {
+    unsafe fn inner_mut(&mut self) -> &mut Box<dyn PartitionEvaluator + 'static> {
+        unsafe {
+            let private_data = self.private_data as *mut PartitionEvaluatorPrivateData;
+            &mut (*private_data).evaluator
+        }
+    }
+
+    unsafe fn inner(&self) -> &(dyn PartitionEvaluator + 'static) {
+        unsafe {
+            let private_data = self.private_data as *mut PartitionEvaluatorPrivateData;
+            (*private_data).evaluator.as_ref()
+        }
+    }
+}
+
+unsafe extern "C" fn evaluate_all_fn_wrapper(
+    evaluator: &mut FFI_PartitionEvaluator,
+    values: RVec<WrappedArray>,
+    num_rows: usize,
+) -> FFIResult<WrappedArray> {
+    unsafe {
+        let inner = evaluator.inner_mut();
+
+        let values_arrays = values
+            .into_iter()
+            .map(|v| v.try_into().map_err(DataFusionError::from))
+            .collect::<Result<Vec<ArrayRef>>>();
+        let values_arrays = rresult_return!(values_arrays);
+
+        let return_array =
+            inner
+                .evaluate_all(&values_arrays, num_rows)
+                .and_then(|array| {
+                    WrappedArray::try_from(&array).map_err(DataFusionError::from)
+                });
+
+        rresult!(return_array)
+    }
+}
+
+unsafe extern "C" fn evaluate_fn_wrapper(
+    evaluator: &mut FFI_PartitionEvaluator,
+    values: RVec<WrappedArray>,
+    range: FFI_Range,
+) -> FFIResult<RVec<u8>> {
+    unsafe {
+        let inner = evaluator.inner_mut();
+
+        let values_arrays = values
+            .into_iter()
+            .map(|v| v.try_into().map_err(DataFusionError::from))
+            .collect::<Result<Vec<ArrayRef>>>();
+        let values_arrays = rresult_return!(values_arrays);
+
+        // let return_array = (inner.evaluate(&values_arrays, &range.into()));
+        // .and_then(|array| WrappedArray::try_from(&array).map_err(DataFusionError::from));
+        let scalar_result =
+            rresult_return!(inner.evaluate(&values_arrays, &range.into()));
+        let proto_result: datafusion_proto::protobuf::ScalarValue =
+            rresult_return!((&scalar_result).try_into());
+
+        RResult::ROk(proto_result.encode_to_vec().into())
+    }
+}
+
+unsafe extern "C" fn evaluate_all_with_rank_fn_wrapper(
+    evaluator: &FFI_PartitionEvaluator,
+    num_rows: usize,
+    ranks_in_partition: RVec<FFI_Range>,
+) -> FFIResult<WrappedArray> {
+    unsafe {
+        let inner = evaluator.inner();
+
+        let ranks_in_partition = ranks_in_partition
+            .into_iter()
+            .map(Range::from)
+            .collect::<Vec<_>>();
+
+        let return_array = inner
+            .evaluate_all_with_rank(num_rows, &ranks_in_partition)
+            .and_then(|array| {
+                WrappedArray::try_from(&array).map_err(DataFusionError::from)
+            });
+
+        rresult!(return_array)
+    }
+}
+
+unsafe extern "C" fn get_range_fn_wrapper(
+    evaluator: &FFI_PartitionEvaluator,
+    idx: usize,
+    n_rows: usize,
+) -> FFIResult<FFI_Range> {
+    unsafe {
+        let inner = evaluator.inner();
+        let range = inner.get_range(idx, n_rows).map(FFI_Range::from);
+
+        rresult!(range)
+    }
+}
+
+unsafe extern "C" fn release_fn_wrapper(evaluator: &mut FFI_PartitionEvaluator) {
+    unsafe {
+        if !evaluator.private_data.is_null() {
+            let private_data = Box::from_raw(
+                evaluator.private_data as *mut PartitionEvaluatorPrivateData,
+            );
+            drop(private_data);
+            evaluator.private_data = std::ptr::null_mut();
+        }
+    }
+}
+
+impl From<Box<dyn PartitionEvaluator>> for FFI_PartitionEvaluator {
+    fn from(evaluator: Box<dyn PartitionEvaluator>) -> Self {
+        if (evaluator.as_ref() as &dyn Any).is::<ForeignPartitionEvaluator>() {
+            let evaluator = (evaluator as Box<dyn Any>)
+                .downcast::<ForeignPartitionEvaluator>()
+                .expect("already checked type");
+            return evaluator.evaluator;
+        }
+
+        let is_causal = evaluator.is_causal();
+        let supports_bounded_execution = evaluator.supports_bounded_execution();
+        let include_rank = evaluator.include_rank();
+        let uses_window_frame = evaluator.uses_window_frame();
+
+        let private_data = PartitionEvaluatorPrivateData { evaluator };
+
+        Self {
+            evaluate: evaluate_fn_wrapper,
+            evaluate_all: evaluate_all_fn_wrapper,
+            evaluate_all_with_rank: evaluate_all_with_rank_fn_wrapper,
+            get_range: get_range_fn_wrapper,
+            is_causal,
+            supports_bounded_execution,
+            include_rank,
+            uses_window_frame,
+            release: release_fn_wrapper,
+            private_data: Box::into_raw(Box::new(private_data)) as *mut c_void,
+            library_marker_id: crate::get_library_marker_id,
+        }
+    }
+}
+
+impl Drop for FFI_PartitionEvaluator {
+    fn drop(&mut self) {
+        unsafe { (self.release)(self) }
+    }
+}
+
+/// This struct is used to access an UDF provided by a foreign
+/// library across a FFI boundary.
+///
+/// The ForeignPartitionEvaluator is to be used by the caller of the UDF, so it has
+/// no knowledge or access to the private data. All interaction with the UDF
+/// must occur through the functions defined in FFI_PartitionEvaluator.
+#[derive(Debug)]
+pub struct ForeignPartitionEvaluator {
+    evaluator: FFI_PartitionEvaluator,
+}
+
+impl From<FFI_PartitionEvaluator> for Box<dyn PartitionEvaluator> {
+    fn from(mut evaluator: FFI_PartitionEvaluator) -> Self {
+        if (evaluator.library_marker_id)() == crate::get_library_marker_id() {
+            unsafe {
+                let private_data = Box::from_raw(
+                    evaluator.private_data as *mut PartitionEvaluatorPrivateData,
+                );
+                // We must set this to null to avoid a double free
+                evaluator.private_data = std::ptr::null_mut();
+                private_data.evaluator
+            }
+        } else {
+            Box::new(ForeignPartitionEvaluator { evaluator })
+        }
+    }
+}
+
+impl PartitionEvaluator for ForeignPartitionEvaluator {
+    fn memoize(&mut self, _state: &mut WindowAggState) -> Result<()> {
+        // Exposing `memoize` increases the surface are of the FFI work
+        // so for now we dot support it.
+        Ok(())
+    }
+
+    fn get_range(&self, idx: usize, n_rows: usize) -> Result<Range<usize>> {
+        let range = unsafe { (self.evaluator.get_range)(&self.evaluator, idx, n_rows) };
+        df_result!(range).map(Range::from)
+    }
+
+    /// Get whether evaluator needs future data for its result (if so returns `false`) or not
+    fn is_causal(&self) -> bool {
+        self.evaluator.is_causal
+    }
+
+    fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result<ArrayRef> {
+        let result = unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<RVec<_>, ArrowError>>()?;
+            (self.evaluator.evaluate_all)(&mut self.evaluator, values, num_rows)
+        };
+
+        let array = df_result!(result)?;
+
+        Ok(array.try_into()?)
+    }
+
+    fn evaluate(
+        &mut self,
+        values: &[ArrayRef],
+        range: &Range<usize>,
+    ) -> Result<ScalarValue> {
+        unsafe {
+            let values = values
+                .iter()
+                .map(WrappedArray::try_from)
+                .collect::<std::result::Result<RVec<_>, ArrowError>>()?;
+
+            let scalar_bytes = df_result!((self.evaluator.evaluate)(
+                &mut self.evaluator,
+                values,
+                range.to_owned().into()
+            ))?;
+
+            let proto_scalar =
+                datafusion_proto::protobuf::ScalarValue::decode(scalar_bytes.as_ref())
+                    .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+            ScalarValue::try_from(&proto_scalar).map_err(DataFusionError::from)
+        }
+    }
+
+    fn evaluate_all_with_rank(
+        &self,
+        num_rows: usize,
+        ranks_in_partition: &[Range<usize>],
+    ) -> Result<ArrayRef> {
+        let result = unsafe {
+            let ranks_in_partition = ranks_in_partition
+                .iter()
+                .map(|rank| FFI_Range::from(rank.to_owned()))
+                .collect();
+            (self.evaluator.evaluate_all_with_rank)(
+                &self.evaluator,
+                num_rows,
+                ranks_in_partition,
+            )
+        };
+
+        let array = df_result!(result)?;
+
+        Ok(array.try_into()?)
+    }
+
+    fn supports_bounded_execution(&self) -> bool {
+        self.evaluator.supports_bounded_execution
+    }
+
+    fn uses_window_frame(&self) -> bool {
+        self.evaluator.uses_window_frame
+    }
+
+    fn include_rank(&self) -> bool {
+        self.evaluator.include_rank
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::ArrayRef;
+    use datafusion::logical_expr::PartitionEvaluator;
+
+    use crate::udwf::partition_evaluator::{
+        FFI_PartitionEvaluator, ForeignPartitionEvaluator,
+    };
+
+    #[derive(Debug)]
+    struct TestPartitionEvaluator {}
+
+    impl PartitionEvaluator for TestPartitionEvaluator {
+        fn evaluate_all(
+            &mut self,
+            values: &[ArrayRef],
+            _num_rows: usize,
+        ) -> datafusion_common::Result<ArrayRef> {
+            Ok(values[0].to_owned())
+        }
+    }
+
+    #[test]
+    fn test_ffi_partition_evaluator_local_bypass_inner() -> datafusion_common::Result<()>
+    {
+        let original_accum = TestPartitionEvaluator {};
+        let boxed_accum: Box<dyn PartitionEvaluator> = Box::new(original_accum);
+
+        let ffi_accum: FFI_PartitionEvaluator = boxed_accum.into();
+
+        // Verify local libraries can be downcast to their original
+        let foreign_accum: Box<dyn PartitionEvaluator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn PartitionEvaluator
+                as *const TestPartitionEvaluator);
+            assert!(!concrete.uses_window_frame());
+        }
+
+        // Verify different library markers generate foreign accumulator
+        let original_accum = TestPartitionEvaluator {};
+        let boxed_accum: Box<dyn PartitionEvaluator> = Box::new(original_accum);
+        let mut ffi_accum: FFI_PartitionEvaluator = boxed_accum.into();
+        ffi_accum.library_marker_id = crate::mock_foreign_marker_id;
+        let foreign_accum: Box<dyn PartitionEvaluator> = ffi_accum.into();
+        unsafe {
+            let concrete = &*(foreign_accum.as_ref() as *const dyn PartitionEvaluator
+                as *const ForeignPartitionEvaluator);
+            assert!(!concrete.uses_window_frame());
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/src/udwf/partition_evaluator_args.rs b/datafusion/ffi/src/udwf/partition_evaluator_args.rs
new file mode 100644
index 0000000000000..ffad1f41ee42d
--- /dev/null
+++ b/datafusion/ffi/src/udwf/partition_evaluator_args.rs
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use abi_stable::StableAbi;
+use abi_stable::std_types::RVec;
+use arrow::error::ArrowError;
+use arrow::ffi::FFI_ArrowSchema;
+use arrow_schema::FieldRef;
+use datafusion_common::{DataFusionError, Result};
+use datafusion_expr::function::PartitionEvaluatorArgs;
+use datafusion_physical_plan::PhysicalExpr;
+
+use crate::arrow_wrappers::WrappedSchema;
+use crate::physical_expr::FFI_PhysicalExpr;
+use crate::util::rvec_wrapped_to_vec_fieldref;
+
+/// A stable struct for sharing [`PartitionEvaluatorArgs`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding function
+/// defined in [`PartitionEvaluatorArgs`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_PartitionEvaluatorArgs {
+    input_exprs: RVec<FFI_PhysicalExpr>,
+    input_fields: RVec<WrappedSchema>,
+    is_reversed: bool,
+    ignore_nulls: bool,
+}
+
+impl TryFrom<PartitionEvaluatorArgs<'_>> for FFI_PartitionEvaluatorArgs {
+    type Error = DataFusionError;
+
+    fn try_from(args: PartitionEvaluatorArgs) -> Result<Self, DataFusionError> {
+        let input_exprs = args
+            .input_exprs()
+            .iter()
+            .map(Arc::clone)
+            .map(FFI_PhysicalExpr::from)
+            .collect();
+
+        let input_fields = args
+            .input_fields()
+            .iter()
+            .map(|input_type| FFI_ArrowSchema::try_from(input_type).map(WrappedSchema))
+            .collect::<Result<Vec<_>, ArrowError>>()?
+            .into();
+
+        Ok(Self {
+            input_exprs,
+            input_fields,
+            is_reversed: args.is_reversed(),
+            ignore_nulls: args.ignore_nulls(),
+        })
+    }
+}
+
+/// This struct mirrors PartitionEvaluatorArgs except that it contains owned data.
+/// It is necessary to create this struct so that we can parse the protobuf
+/// data across the FFI boundary and turn it into owned data that
+/// PartitionEvaluatorArgs can then reference.
+pub struct ForeignPartitionEvaluatorArgs {
+    input_exprs: Vec<Arc<dyn PhysicalExpr>>,
+    input_fields: Vec<FieldRef>,
+    is_reversed: bool,
+    ignore_nulls: bool,
+}
+
+impl TryFrom<FFI_PartitionEvaluatorArgs> for ForeignPartitionEvaluatorArgs {
+    type Error = DataFusionError;
+
+    fn try_from(value: FFI_PartitionEvaluatorArgs) -> Result<Self> {
+        let input_exprs = value.input_exprs.iter().map(Into::into).collect();
+
+        let input_fields = rvec_wrapped_to_vec_fieldref(&value.input_fields)?;
+
+        Ok(Self {
+            input_exprs,
+            input_fields,
+            is_reversed: value.is_reversed,
+            ignore_nulls: value.ignore_nulls,
+        })
+    }
+}
+
+impl<'a> From<&'a ForeignPartitionEvaluatorArgs> for PartitionEvaluatorArgs<'a> {
+    fn from(value: &'a ForeignPartitionEvaluatorArgs) -> Self {
+        PartitionEvaluatorArgs::new(
+            &value.input_exprs,
+            &value.input_fields,
+            value.is_reversed,
+            value.ignore_nulls,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {}
diff --git a/datafusion/ffi/src/udwf/range.rs b/datafusion/ffi/src/udwf/range.rs
new file mode 100644
index 0000000000000..19a908c5e2454
--- /dev/null
+++ b/datafusion/ffi/src/udwf/range.rs
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ops::Range;
+
+use abi_stable::StableAbi;
+
+/// A stable struct for sharing [`Range`] across FFI boundaries.
+/// For an explanation of each field, see the corresponding function
+/// defined in [`Range`].
+#[repr(C)]
+#[derive(Debug, StableAbi)]
+pub struct FFI_Range {
+    pub start: usize,
+    pub end: usize,
+}
+
+impl From<Range<usize>> for FFI_Range {
+    fn from(value: Range<usize>) -> Self {
+        Self {
+            start: value.start,
+            end: value.end,
+        }
+    }
+}
+
+impl From<FFI_Range> for Range<usize> {
+    fn from(value: FFI_Range) -> Self {
+        Self {
+            start: value.start,
+            end: value.end,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_round_trip_ffi_range() {
+        let original = Range { start: 10, end: 30 };
+
+        let ffi_range: FFI_Range = original.clone().into();
+        let round_trip: Range<usize> = ffi_range.into();
+
+        assert_eq!(original, round_trip);
+    }
+}
diff --git a/datafusion/ffi/src/util.rs b/datafusion/ffi/src/util.rs
index 3eb57963b44f8..db6eb0552d2aa 100644
--- a/datafusion/ffi/src/util.rs
+++ b/datafusion/ffi/src/util.rs
@@ -15,28 +15,37 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::arrow_wrappers::WrappedSchema;
-use abi_stable::std_types::RVec;
-use arrow::datatypes::Field;
-use arrow::{datatypes::DataType, ffi::FFI_ArrowSchema};
-use arrow_schema::FieldRef;
 use std::sync::Arc;
 
-/// This macro is a helpful conversion utility to conver from an abi_stable::RResult to a
+use abi_stable::std_types::{RResult, RString, RVec};
+use arrow::datatypes::{DataType, Field};
+use arrow::ffi::FFI_ArrowSchema;
+use arrow_schema::FieldRef;
+
+use crate::arrow_wrappers::WrappedSchema;
+
+/// Convenience type for results passed through the FFI boundary. Since the
+/// `DataFusionError` enum is complex and little value is gained from creating
+/// a FFI safe variant of it, we convert errors to strings when passing results
+/// back. These are converted back and forth using the `df_result`, `rresult`,
+/// and `rresult_return` macros.
+pub type FFIResult<T> = RResult<T, RString>;
+
+/// This macro is a helpful conversion utility to convert from an abi_stable::RResult to a
 /// DataFusion result.
 #[macro_export]
 macro_rules! df_result {
     ( $x:expr ) => {
         match $x {
             abi_stable::std_types::RResult::ROk(v) => Ok(v),
-            abi_stable::std_types::RResult::RErr(e) => {
-                Err(datafusion::error::DataFusionError::Execution(e.to_string()))
+            abi_stable::std_types::RResult::RErr(err) => {
+                datafusion_common::ffi_err!("{err}")
             }
         }
     };
 }
 
-/// This macro is a helpful conversion utility to conver from a DataFusion Result to an abi_stable::RResult
+/// This macro is a helpful conversion utility to convert from a DataFusion Result to an abi_stable::RResult
 #[macro_export]
 macro_rules! rresult {
     ( $x:expr ) => {
@@ -49,7 +58,7 @@ macro_rules! rresult {
     };
 }
 
-/// This macro is a helpful conversion utility to conver from a DataFusion Result to an abi_stable::RResult
+/// This macro is a helpful conversion utility to convert from a DataFusion Result to an abi_stable::RResult
 /// and to also call return when it is an error. Since you cannot use `?` on an RResult, this is designed
 /// to mimic the pattern.
 #[macro_export]
@@ -117,11 +126,27 @@ pub fn rvec_wrapped_to_vec_datatype(
 }
 
 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
+    use std::sync::Arc;
+
     use abi_stable::std_types::{RResult, RString};
     use datafusion::error::DataFusionError;
+    use datafusion::prelude::SessionContext;
+    use datafusion_execution::TaskContextProvider;
+
+    use crate::execution::FFI_TaskContextProvider;
+    use crate::util::FFIResult;
+
+    pub(crate) fn test_session_and_ctx() -> (Arc<SessionContext>, FFI_TaskContextProvider)
+    {
+        let ctx = Arc::new(SessionContext::new());
+        let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+        let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
+
+        (ctx, task_ctx_provider)
+    }
 
-    fn wrap_result(result: Result<String, DataFusionError>) -> RResult<String, RString> {
+    fn wrap_result(result: Result<String, DataFusionError>) -> FFIResult<String> {
         RResult::ROk(rresult_return!(result))
     }
 
@@ -130,9 +155,9 @@ mod tests {
         const VALID_VALUE: &str = "valid_value";
         const ERROR_VALUE: &str = "error_value";
 
-        let ok_r_result: RResult<RString, RString> =
+        let ok_r_result: FFIResult<RString> =
             RResult::ROk(VALID_VALUE.to_string().into());
-        let err_r_result: RResult<RString, RString> =
+        let err_r_result: FFIResult<RString> =
             RResult::RErr(ERROR_VALUE.to_string().into());
 
         let returned_ok_result = df_result!(ok_r_result);
@@ -142,21 +167,23 @@ mod tests {
         let returned_err_result = df_result!(err_r_result);
         assert!(returned_err_result.is_err());
         assert!(
-            returned_err_result.unwrap_err().to_string()
-                == format!("Execution error: {ERROR_VALUE}")
+            returned_err_result.unwrap_err().strip_backtrace()
+                == format!("FFI error: {ERROR_VALUE}")
         );
 
         let ok_result: Result<String, DataFusionError> = Ok(VALID_VALUE.to_string());
         let err_result: Result<String, DataFusionError> =
-            Err(DataFusionError::Execution(ERROR_VALUE.to_string()));
+            datafusion_common::ffi_err!("{ERROR_VALUE}");
 
         let returned_ok_r_result = wrap_result(ok_result);
         assert!(returned_ok_r_result == RResult::ROk(VALID_VALUE.into()));
 
         let returned_err_r_result = wrap_result(err_result);
+        assert!(returned_err_r_result.is_err());
         assert!(
             returned_err_r_result
-                == RResult::RErr(format!("Execution error: {ERROR_VALUE}").into())
+                .unwrap_err()
+                .starts_with(format!("FFI error: {ERROR_VALUE}").as_str())
         );
     }
 }
diff --git a/datafusion/ffi/src/volatility.rs b/datafusion/ffi/src/volatility.rs
index 0aaf68a174cfd..bc714ae59587d 100644
--- a/datafusion/ffi/src/volatility.rs
+++ b/datafusion/ffi/src/volatility.rs
@@ -16,11 +16,10 @@
 // under the License.
 
 use abi_stable::StableAbi;
-use datafusion::logical_expr::Volatility;
+use datafusion_expr::Volatility;
 
 #[repr(C)]
-#[derive(Debug, StableAbi)]
-#[allow(non_camel_case_types)]
+#[derive(Debug, StableAbi, Clone)]
 pub enum FFI_Volatility {
     Immutable,
     Stable,
diff --git a/datafusion/ffi/tests/ffi_catalog.rs b/datafusion/ffi/tests/ffi_catalog.rs
new file mode 100644
index 0000000000000..28bb5f406f53f
--- /dev/null
+++ b/datafusion/ffi/tests/ffi_catalog.rs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod utils;
+
+/// Add an additional module here for convenience to scope this to only
+/// when the feature integration-tests is built
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use std::sync::Arc;
+
+    use datafusion::catalog::{CatalogProvider, CatalogProviderList};
+    use datafusion_common::DataFusionError;
+    use datafusion_ffi::tests::utils::get_module;
+
+    #[tokio::test]
+    async fn test_catalog() -> datafusion_common::Result<()> {
+        let module = get_module()?;
+        let (ctx, codec) = super::utils::ctx_and_codec();
+
+        let ffi_catalog =
+            module
+                .create_catalog()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External catalog provider failed to implement create_catalog"
+                        .to_string(),
+                ))?(codec);
+        let foreign_catalog: Arc<dyn CatalogProvider + Send> = (&ffi_catalog).into();
+
+        let _ = ctx.register_catalog("fruit", foreign_catalog);
+
+        let df = ctx.table("fruit.apple.purchases").await?;
+
+        let results = df.collect().await?;
+
+        assert_eq!(results.len(), 2);
+        let num_rows: usize = results.into_iter().map(|rb| rb.num_rows()).sum();
+        assert_eq!(num_rows, 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_catalog_list() -> datafusion_common::Result<()> {
+        let module = get_module()?;
+        let (ctx, codec) = super::utils::ctx_and_codec();
+
+        let ffi_catalog_list =
+            module
+                .create_catalog_list()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External catalog provider failed to implement create_catalog_list"
+                        .to_string(),
+                ))?(codec);
+        let foreign_catalog_list: Arc<dyn CatalogProviderList + Send> =
+            (&ffi_catalog_list).into();
+
+        ctx.register_catalog_list(foreign_catalog_list);
+
+        let df = ctx.table("blue.apple.purchases").await?;
+
+        let results = df.collect().await?;
+
+        assert_eq!(results.len(), 2);
+        let num_rows: usize = results.into_iter().map(|rb| rb.num_rows()).sum();
+        assert_eq!(num_rows, 5);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/tests/ffi_config.rs b/datafusion/ffi/tests/ffi_config.rs
new file mode 100644
index 0000000000000..ca0a3e31e8de6
--- /dev/null
+++ b/datafusion/ffi/tests/ffi_config.rs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Add an additional module here for convenience to scope this to only
+/// when the feature integration-tests is built
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use datafusion::error::{DataFusionError, Result};
+    use datafusion_common::ScalarValue;
+    use datafusion_common::config::{ConfigOptions, TableOptions};
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_ffi::config::ExtensionOptionsFFIProvider;
+    use datafusion_ffi::tests::config::ExternalConfig;
+    use datafusion_ffi::tests::utils::get_module;
+
+    #[test]
+    fn test_ffi_config_options_extension() -> Result<()> {
+        let module = get_module()?;
+
+        let extension_options =
+            module
+                .create_extension_options()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External test library failed to implement create_extension_options"
+                        .to_string(),
+                ))?();
+
+        let mut config = ConfigOptions::new();
+        config.extensions.insert(extension_options);
+
+        // Verify default values are as expected
+        let returned_config: ExternalConfig = config
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+        assert_eq!(returned_config, ExternalConfig::default());
+
+        config.set("external_config.is_enabled", "false")?;
+        let returned_config: ExternalConfig = config
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+        assert!(!returned_config.is_enabled);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ffi_table_options_extension() -> Result<()> {
+        let module = get_module()?;
+
+        let extension_options =
+            module
+                .create_extension_options()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External test library failed to implement create_extension_options"
+                        .to_string(),
+                ))?();
+
+        let mut table_options = TableOptions::new();
+        table_options.extensions.insert(extension_options);
+
+        // Verify default values are as expected
+        let returned_options: ExternalConfig = table_options
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+
+        assert_eq!(returned_options, ExternalConfig::default());
+
+        table_options.set("external_config.is_enabled", "false")?;
+        let returned_options: ExternalConfig = table_options
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+        assert!(!returned_options.is_enabled);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ffi_session_config_options_extension() -> Result<()> {
+        let module = get_module()?;
+
+        let extension_options =
+            module
+                .create_extension_options()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External test library failed to implement create_extension_options"
+                        .to_string(),
+                ))?();
+
+        let mut config = SessionConfig::new().with_option_extension(extension_options);
+
+        // Verify default values are as expected
+        let returned_config: ExternalConfig = config
+            .options()
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+        assert_eq!(returned_config, ExternalConfig::default());
+
+        config = config.set(
+            "external_config.is_enabled",
+            &ScalarValue::Boolean(Some(false)),
+        );
+        let returned_config: ExternalConfig = config
+            .options()
+            .local_or_ffi_extension()
+            .expect("should have external config extension");
+        assert!(!returned_config.is_enabled);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/tests/ffi_execution_plan.rs b/datafusion/ffi/tests/ffi_execution_plan.rs
new file mode 100644
index 0000000000000..d81f947dc80ed
--- /dev/null
+++ b/datafusion/ffi/tests/ffi_execution_plan.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use arrow::datatypes::Field;
+    use arrow::datatypes::Schema;
+    use arrow_schema::DataType;
+    use datafusion_common::DataFusionError;
+    use datafusion_ffi::execution_plan::FFI_ExecutionPlan;
+    use datafusion_ffi::execution_plan::ForeignExecutionPlan;
+    use datafusion_ffi::execution_plan::{ExecutionPlanPrivateData, tests::EmptyExec};
+    use datafusion_ffi::tests::utils::get_module;
+    use datafusion_physical_plan::ExecutionPlan;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_ffi_execution_plan_new_sets_runtimes_on_children()
+    -> Result<(), DataFusionError> {
+        // We want to test the case where we have two libraries.
+        // Library A will have a foreign plan from Library B, called child_plan.
+        // Library A will add a plan called grandchild_plan under child_plan
+        // Library A will create a plan called parent_plan, that has child_plan
+        // under it. So we should have:
+        // parent_plan (local) -> child_plan (foreign) -> grandchild_plan (local)
+        // Then we want to turn parent_plan into a FFI plan.
+        // Verify that grandchild_plan also gets the same runtime as parent_plan.
+
+        let module = get_module()?;
+
+        fn generate_local_plan() -> Arc<dyn ExecutionPlan> {
+            let schema =
+                Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));
+
+            Arc::new(EmptyExec::new(schema))
+        }
+
+        let child_plan =
+            module
+                .create_empty_exec()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External module failed to implement create_empty_exec".to_string(),
+                ))?();
+        let child_plan: Arc<dyn ExecutionPlan> = (&child_plan)
+            .try_into()
+            .expect("should be able create plan");
+        assert!(child_plan.as_any().is::<ForeignExecutionPlan>());
+
+        let grandchild_plan = generate_local_plan();
+
+        let child_plan = child_plan.with_new_children(vec![grandchild_plan])?;
+
+        unsafe {
+            // Originally the runtime is not set. We go through the unsafe casting
+            // of data here because the `inner()` function is private and this is
+            // only an integration test so we do not want to expose it.
+            let ffi_child = FFI_ExecutionPlan::new(Arc::clone(&child_plan), None);
+            let ffi_grandchild =
+                (ffi_child.children)(&ffi_child).into_iter().next().unwrap();
+
+            let grandchild_private_data =
+                ffi_grandchild.private_data as *const ExecutionPlanPrivateData;
+            assert!((*grandchild_private_data).runtime.is_none());
+        }
+
+        let parent_plan = generate_local_plan().with_new_children(vec![child_plan])?;
+
+        // Adding the grandchild beneath this FFI plan should get the runtime passed down.
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .build()
+            .unwrap();
+        let ffi_parent =
+            FFI_ExecutionPlan::new(parent_plan, Some(runtime.handle().clone()));
+
+        unsafe {
+            let ffi_child = (ffi_parent.children)(&ffi_parent)
+                .into_iter()
+                .next()
+                .unwrap();
+            let ffi_grandchild =
+                (ffi_child.children)(&ffi_child).into_iter().next().unwrap();
+            assert_eq!(
+                (ffi_grandchild.library_marker_id)(),
+                (ffi_parent.library_marker_id)()
+            );
+
+            let grandchild_private_data =
+                ffi_grandchild.private_data as *const ExecutionPlanPrivateData;
+            assert!((*grandchild_private_data).runtime.is_some());
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/tests/ffi_integration.rs b/datafusion/ffi/tests/ffi_integration.rs
index c6df324e9a17c..1be486589b722 100644
--- a/datafusion/ffi/tests/ffi_integration.rs
+++ b/datafusion/ffi/tests/ffi_integration.rs
@@ -15,24 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod utils;
+
 /// Add an additional module here for convenience to scope this to only
-/// when the feature integtation-tests is built
+/// when the feature integration-tests is built
 #[cfg(feature = "integration-tests")]
 mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
 
+    use arrow::datatypes::Schema;
+    use datafusion::catalog::{TableProvider, TableProviderFactory};
     use datafusion::error::{DataFusionError, Result};
-    use datafusion::prelude::SessionContext;
-    use datafusion_ffi::catalog_provider::ForeignCatalogProvider;
-    use datafusion_ffi::table_provider::ForeignTableProvider;
+    use datafusion_common::TableReference;
+    use datafusion_common::ToDFSchema;
+    use datafusion_expr::CreateExternalTable;
     use datafusion_ffi::tests::create_record_batch;
     use datafusion_ffi::tests::utils::get_module;
-    use std::sync::Arc;
 
     /// It is important that this test is in the `tests` directory and not in the
     /// library directory so we can verify we are building a dynamic library and
     /// testing it via a different executable.
     async fn test_table_provider(synchronous: bool) -> Result<()> {
         let table_provider_module = get_module()?;
+        let (ctx, codec) = super::utils::ctx_and_codec();
 
         // By calling the code below, the table provided will be created within
         // the module's code.
@@ -40,23 +46,21 @@ mod tests {
             DataFusionError::NotImplemented(
                 "External table provider failed to implement create_table".to_string(),
             ),
-        )?(synchronous);
+        )?(synchronous, codec);
 
         // In order to access the table provider within this executable, we need to
-        // turn it into a `ForeignTableProvider`.
-        let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into();
-
-        let ctx = SessionContext::new();
+        // turn it into a `TableProvider`.
+        let foreign_table_provider: Arc<dyn TableProvider> = (&ffi_table_provider).into();
 
         // Display the data to show the full cycle works.
-        ctx.register_table("external_table", Arc::new(foreign_table_provider))?;
+        ctx.register_table("external_table", foreign_table_provider)?;
         let df = ctx.table("external_table").await?;
         let results = df.collect().await?;
 
         assert_eq!(results.len(), 3);
-        assert_eq!(results[0], create_record_batch(1, 5));
-        assert_eq!(results[1], create_record_batch(6, 1));
-        assert_eq!(results[2], create_record_batch(7, 5));
+        assert!(results.contains(&create_record_batch(1, 5)));
+        assert!(results.contains(&create_record_batch(6, 1)));
+        assert!(results.contains(&create_record_batch(7, 5)));
 
         Ok(())
     }
@@ -72,27 +76,40 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_catalog() -> Result<()> {
-        let module = get_module()?;
-
-        let ffi_catalog =
-            module
-                .create_catalog()
-                .ok_or(DataFusionError::NotImplemented(
-                    "External catalog provider failed to implement create_catalog"
-                        .to_string(),
-                ))?();
-        let foreign_catalog: ForeignCatalogProvider = (&ffi_catalog).into();
-
-        let ctx = SessionContext::default();
-        let _ = ctx.register_catalog("fruit", Arc::new(foreign_catalog));
-
-        let df = ctx.table("fruit.apple.purchases").await?;
-
-        let results = df.collect().await?;
-
-        assert!(!results.is_empty());
-        assert!(results[0].num_rows() != 0);
+    async fn test_table_provider_factory() -> Result<()> {
+        let table_provider_module = get_module()?;
+        let (ctx, codec) = super::utils::ctx_and_codec();
+
+        let ffi_table_provider_factory = table_provider_module
+            .create_table_factory()
+            .ok_or(DataFusionError::NotImplemented(
+                "External table provider factory failed to implement create".to_string(),
+            ))?(codec);
+
+        let foreign_table_provider_factory: Arc<dyn TableProviderFactory> =
+            (&ffi_table_provider_factory).into();
+
+        let cmd = CreateExternalTable {
+            schema: Schema::empty().to_dfschema_ref()?,
+            name: TableReference::bare("cloned_test"),
+            location: "test".to_string(),
+            file_type: "test".to_string(),
+            table_partition_cols: vec![],
+            if_not_exists: false,
+            or_replace: false,
+            temporary: false,
+            definition: None,
+            order_exprs: vec![],
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: Default::default(),
+            column_defaults: HashMap::new(),
+        };
+
+        let provider = foreign_table_provider_factory
+            .create(&ctx.state(), &cmd)
+            .await?;
+        assert_eq!(provider.schema().fields().len(), 2);
 
         Ok(())
     }
diff --git a/datafusion/ffi/tests/ffi_udaf.rs b/datafusion/ffi/tests/ffi_udaf.rs
new file mode 100644
index 0000000000000..f219979a85062
--- /dev/null
+++ b/datafusion/ffi/tests/ffi_udaf.rs
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Add an additional module here for convenience to scope this to only
+/// when the feature integration-tests is built
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::Float64Array;
+    use datafusion::common::record_batch;
+    use datafusion::error::{DataFusionError, Result};
+    use datafusion::logical_expr::{AggregateUDF, AggregateUDFImpl};
+    use datafusion::prelude::{SessionContext, col};
+    use datafusion_catalog::MemTable;
+    use datafusion_expr::{ScalarUDF, ScalarUDFImpl};
+    use datafusion_ffi::tests::utils::get_module;
+
+    #[tokio::test]
+    async fn test_ffi_udaf() -> Result<()> {
+        let module = get_module()?;
+
+        let ffi_sum_func =
+            module
+                .create_sum_udaf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External table provider failed to implement create_udaf".to_string(),
+                ))?();
+        let foreign_sum_func: Arc<dyn AggregateUDFImpl> = (&ffi_sum_func).into();
+
+        let udaf = AggregateUDF::new_from_shared_impl(foreign_sum_func);
+
+        let ctx = SessionContext::default();
+        let record_batch = record_batch!(
+            ("a", Int32, vec![1, 2, 2, 4, 4, 4, 4]),
+            ("b", Float64, vec![1.0, 2.0, 2.0, 4.0, 4.0, 4.0, 4.0])
+        )
+        .unwrap();
+
+        let df = ctx.read_batch(record_batch)?;
+
+        let df = df
+            .aggregate(
+                vec![col("a")],
+                vec![udaf.call(vec![col("b")]).alias("sum_b")],
+            )?
+            .sort_by(vec![col("a")])?;
+
+        let result = df.collect().await?;
+
+        let expected = record_batch!(
+            ("a", Int32, vec![1, 2, 4]),
+            ("sum_b", Float64, vec![1.0, 4.0, 16.0])
+        )?;
+
+        assert_eq!(result[0], expected);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_ffi_grouping_udaf() -> Result<()> {
+        let module = get_module()?;
+
+        let ffi_stddev_func =
+            module
+                .create_stddev_udaf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External table provider failed to implement create_udaf".to_string(),
+                ))?();
+        let foreign_stddev_func: Arc<dyn AggregateUDFImpl> = (&ffi_stddev_func).into();
+
+        let udaf = AggregateUDF::new_from_shared_impl(foreign_stddev_func);
+
+        let ctx = SessionContext::default();
+        let record_batch = record_batch!(
+            ("a", Int32, vec![1, 2, 2, 4, 4, 4, 4]),
+            (
+                "b",
+                Float64,
+                vec![
+                    1.0,
+                    2.0,
+                    2.0 + 2.0_f64.sqrt(),
+                    4.0,
+                    4.0,
+                    4.0 + 3.0_f64.sqrt(),
+                    4.0 + 3.0_f64.sqrt()
+                ]
+            )
+        )
+        .unwrap();
+
+        let df = ctx.read_batch(record_batch)?;
+
+        let df = df
+            .aggregate(
+                vec![col("a")],
+                vec![udaf.call(vec![col("b")]).alias("stddev_b")],
+            )?
+            .sort_by(vec![col("a")])?;
+
+        let result = df.collect().await?;
+        let result = result[0].column_by_name("stddev_b").unwrap();
+        let result = result
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap()
+            .values();
+
+        assert!(result.first().unwrap().is_nan());
+        assert!(result.get(1).unwrap() - 1.0 < 0.00001);
+        assert!(result.get(2).unwrap() - 1.0 < 0.00001);
+
+        Ok(())
+    }
+
+    /// This test FFI UDFs can be used as inputs to FFI Aggregate UDFs.
+    /// Really this is a test of the Protobuf serialization and deserialization
+    /// using the TaskContextProvider. It can be demonstrated through the
+    /// UDAF accumulator arguments as an end-to-end test.
+    #[tokio::test]
+    async fn udf_as_input_to_udf() -> Result<()> {
+        let module = get_module()?;
+
+        let ffi_abs_func =
+            module
+                .create_scalar_udf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External table provider failed to implement create_scalar_udf"
+                        .to_string(),
+                ))?();
+        let foreign_abs_func: Arc<dyn ScalarUDFImpl> = (&ffi_abs_func).into();
+        let abs_udf = ScalarUDF::new_from_shared_impl(foreign_abs_func);
+
+        let ctx = SessionContext::new();
+        ctx.deregister_udf("abs");
+
+        let ffi_sum_func =
+            module
+                .create_sum_udaf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External table provider failed to implement create_udaf".to_string(),
+                ))?();
+        let foreign_sum_func: Arc<dyn AggregateUDFImpl> = (&ffi_sum_func).into();
+
+        let udaf = AggregateUDF::new_from_shared_impl(foreign_sum_func);
+
+        // We need at least 2 record batches so we get an accumulator
+        let ctx = SessionContext::default();
+        let rb1 = record_batch!(
+            ("a", Int32, vec![1, 2, 2, 4, 4, 4, 4]),
+            ("b", Float64, vec![-1.0, 2.0, -2.0, 4.0, -4.0, -4.0, -4.0])
+        )
+        .unwrap();
+        let rb2 = rb1.clone();
+
+        let table = Arc::new(MemTable::try_new(rb1.schema(), vec![vec![rb1, rb2]])?);
+
+        let df = ctx.read_table(table)?;
+
+        let df = df
+            .aggregate(
+                vec![col("a")],
+                vec![udaf.call(vec![abs_udf.call(vec![col("b")])]).alias("sum_b")],
+            )?
+            .sort_by(vec![col("a")])?;
+
+        df.clone().show().await?;
+
+        let result = df.collect().await?;
+
+        let expected = record_batch!(
+            ("a", Int32, vec![1, 2, 4]),
+            ("sum_b", Float64, vec![2.0, 8.0, 32.0])
+        )?;
+
+        assert_eq!(result[0], expected);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/tests/ffi_udf.rs b/datafusion/ffi/tests/ffi_udf.rs
index bbc23552def43..02dfba599f316 100644
--- a/datafusion/ffi/tests/ffi_udf.rs
+++ b/datafusion/ffi/tests/ffi_udf.rs
@@ -16,19 +16,20 @@
 // under the License.
 
 /// Add an additional module here for convenience to scope this to only
-/// when the feature integtation-tests is built
+/// when the feature integration-tests is built
 #[cfg(feature = "integration-tests")]
 mod tests {
-
+    use arrow::array::{Array, AsArray};
     use arrow::datatypes::DataType;
     use datafusion::common::record_batch;
     use datafusion::error::{DataFusionError, Result};
-    use datafusion::logical_expr::ScalarUDF;
-    use datafusion::prelude::{col, SessionContext};
-
+    use datafusion::logical_expr::{ScalarUDF, ScalarUDFImpl};
+    use datafusion::prelude::{SessionContext, col};
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_expr::lit;
     use datafusion_ffi::tests::create_record_batch;
     use datafusion_ffi::tests::utils::get_module;
-    use datafusion_ffi::udf::ForeignScalarUDF;
+    use std::sync::Arc;
 
     /// This test validates that we can load an external module and use a scalar
     /// udf defined in it via the foreign function interface. In this case we are
@@ -44,9 +45,9 @@ mod tests {
                     "External table provider failed to implement create_scalar_udf"
                         .to_string(),
                 ))?();
-        let foreign_abs_func: ForeignScalarUDF = (&ffi_abs_func).try_into()?;
+        let foreign_abs_func: Arc<dyn ScalarUDFImpl> = (&ffi_abs_func).into();
 
-        let udf: ScalarUDF = foreign_abs_func.into();
+        let udf = ScalarUDF::new_from_shared_impl(foreign_abs_func);
 
         let ctx = SessionContext::default();
         let df = ctx.read_batch(create_record_batch(-5, 5))?;
@@ -82,9 +83,9 @@ mod tests {
                     "External table provider failed to implement create_scalar_udf"
                         .to_string(),
                 ))?();
-        let foreign_abs_func: ForeignScalarUDF = (&ffi_abs_func).try_into()?;
+        let foreign_abs_func: Arc<dyn ScalarUDFImpl> = (&ffi_abs_func).into();
 
-        let udf: ScalarUDF = foreign_abs_func.into();
+        let udf = ScalarUDF::new_from_shared_impl(foreign_abs_func);
 
         let ctx = SessionContext::default();
         let df = ctx.read_batch(create_record_batch(-5, 5))?;
@@ -101,4 +102,46 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_config_on_scalar_udf() -> Result<()> {
+        let module = get_module()?;
+
+        let ffi_udf =
+            module
+                .create_timezone_udf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External module failed to implement create_timezone_udf".to_string(),
+                ))?();
+        let foreign_udf: Arc<dyn ScalarUDFImpl> = (&ffi_udf).into();
+
+        let udf = ScalarUDF::new_from_shared_impl(foreign_udf);
+
+        let ctx = SessionContext::default();
+
+        let df = ctx
+            .read_empty()?
+            .select(vec![udf.call(vec![lit("a")]).alias("a")])?;
+
+        let result = df.collect().await?;
+        assert!(result[0].column(0).as_string::<i32>().is_null(0));
+
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.time_zone = Some("AEST".into());
+
+        let ctx = SessionContext::new_with_config(config);
+
+        let df = ctx
+            .read_empty()?
+            .select(vec![udf.call(vec![lit("a")]).alias("a")])?;
+
+        let result = df.collect().await?;
+
+        assert!(result.len() == 1);
+        assert!(!result[0].column(0).as_string::<i32>().is_null(0));
+        let result = result[0].column(0).as_string::<i32>().value(0);
+        assert_eq!(result, "AEST");
+
+        Ok(())
+    }
 }
diff --git a/datafusion/ffi/tests/ffi_udtf.rs b/datafusion/ffi/tests/ffi_udtf.rs
index 5a46211d3b9c6..ab7818932959c 100644
--- a/datafusion/ffi/tests/ffi_udtf.rs
+++ b/datafusion/ffi/tests/ffi_udtf.rs
@@ -15,19 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod utils;
+
 /// Add an additional module here for convenience to scope this to only
-/// when the feature integtation-tests is built
+/// when the feature integration-tests is built
 #[cfg(feature = "integration-tests")]
 mod tests {
 
     use std::sync::Arc;
 
-    use arrow::array::{create_array, ArrayRef};
+    use arrow::array::{ArrayRef, create_array};
+    use datafusion::catalog::TableFunctionImpl;
     use datafusion::error::{DataFusionError, Result};
-    use datafusion::prelude::SessionContext;
-
     use datafusion_ffi::tests::utils::get_module;
-    use datafusion_ffi::udtf::ForeignTableFunction;
 
     /// This test validates that we can load an external module and use a scalar
     /// udf defined in it via the foreign function interface. In this case we are
@@ -35,19 +35,17 @@ mod tests {
     #[tokio::test]
     async fn test_user_defined_table_function() -> Result<()> {
         let module = get_module()?;
+        let (ctx, codec) = super::utils::ctx_and_codec();
 
         let ffi_table_func = module
             .create_table_function()
             .ok_or(DataFusionError::NotImplemented(
             "External table function provider failed to implement create_table_function"
                 .to_string(),
-        ))?();
-        let foreign_table_func: ForeignTableFunction = ffi_table_func.into();
-
-        let udtf = Arc::new(foreign_table_func);
+        ))?(codec);
+        let foreign_table_func: Arc<dyn TableFunctionImpl> = ffi_table_func.into();
 
-        let ctx = SessionContext::default();
-        ctx.register_udtf("my_range", udtf);
+        ctx.register_udtf("my_range", foreign_table_func);
 
         let result = ctx
             .sql("SELECT * FROM my_range(5)")
diff --git a/datafusion/ffi/tests/ffi_udwf.rs b/datafusion/ffi/tests/ffi_udwf.rs
new file mode 100644
index 0000000000000..c4e889b796008
--- /dev/null
+++ b/datafusion/ffi/tests/ffi_udwf.rs
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Add an additional module here for convenience to scope this to only
+/// when the feature integration-tests is built
+#[cfg(feature = "integration-tests")]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::{ArrayRef, create_array};
+    use datafusion::error::{DataFusionError, Result};
+    use datafusion::logical_expr::expr::Sort;
+    use datafusion::logical_expr::{ExprFunctionExt, WindowUDF, WindowUDFImpl, col};
+    use datafusion::prelude::SessionContext;
+    use datafusion_ffi::tests::create_record_batch;
+    use datafusion_ffi::tests::utils::get_module;
+
+    #[tokio::test]
+    async fn test_rank_udwf() -> Result<()> {
+        let module = get_module()?;
+
+        let ffi_rank_func =
+            module
+                .create_rank_udwf()
+                .ok_or(DataFusionError::NotImplemented(
+                    "External table provider failed to implement create_scalar_udf"
+                        .to_string(),
+                ))?();
+        let foreign_rank_func: Arc<dyn WindowUDFImpl> = (&ffi_rank_func).into();
+
+        let udwf = WindowUDF::new_from_shared_impl(foreign_rank_func);
+
+        let ctx = SessionContext::default();
+        let df = ctx.read_batch(create_record_batch(-5, 5))?;
+
+        let df = df.select(vec![
+            col("a"),
+            udwf.call(vec![])
+                .order_by(vec![Sort::new(col("a"), true, true)])
+                .build()
+                .unwrap()
+                .alias("rank_a"),
+        ])?;
+
+        df.clone().show().await?;
+
+        let result = df.collect().await?;
+        let expected = create_array!(UInt64, [1, 2, 3, 4, 5]) as ArrayRef;
+
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].column(1), &expected);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/ffi/tests/utils/mod.rs b/datafusion/ffi/tests/utils/mod.rs
new file mode 100644
index 0000000000000..acf59de7f3464
--- /dev/null
+++ b/datafusion/ffi/tests/utils/mod.rs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion::prelude::SessionContext;
+use datafusion_execution::TaskContextProvider;
+use datafusion_ffi::execution::FFI_TaskContextProvider;
+use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec;
+use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec;
+
+// Creates a default SessionContext and FFI Logical Extension Codec
+// for use in FFI integration tests.
+//
+// This helper centralizes setup logic and is kept intentionally
+// for upcoming FFI test expansions.
+#[cfg_attr(not(feature = "integration-tests"), expect(dead_code))]
+pub fn ctx_and_codec() -> (Arc<SessionContext>, FFI_LogicalExtensionCodec) {
+    let ctx = Arc::new(SessionContext::default());
+    let task_ctx_provider = Arc::clone(&ctx) as Arc<dyn TaskContextProvider>;
+    let task_ctx_provider = FFI_TaskContextProvider::from(&task_ctx_provider);
+    let codec = FFI_LogicalExtensionCodec::new(
+        Arc::new(DefaultLogicalExtensionCodec {}),
+        None,
+        task_ctx_provider,
+    );
+
+    (ctx, codec)
+}
diff --git a/datafusion/functions-aggregate-common/Cargo.toml b/datafusion/functions-aggregate-common/Cargo.toml
index cf065ca1cb174..1714e1800a4fe 100644
--- a/datafusion/functions-aggregate-common/Cargo.toml
+++ b/datafusion/functions-aggregate-common/Cargo.toml
@@ -19,6 +19,7 @@
 name = "datafusion-functions-aggregate-common"
 description = "Utility functions for implementing aggregate functions for the DataFusion query engine"
 keywords = ["datafusion", "logical", "plan", "expressions"]
+readme = "README.md"
 version = { workspace = true }
 edition = { workspace = true }
 homepage = { workspace = true }
@@ -30,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -37,7 +41,6 @@ workspace = true
 name = "datafusion_functions_aggregate_common"
 
 [dependencies]
-ahash = { workspace = true }
 arrow = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-expr-common = { workspace = true }
diff --git a/datafusion/functions-aggregate-common/README.md b/datafusion/functions-aggregate-common/README.md
new file mode 100644
index 0000000000000..3d52aa722033a
--- /dev/null
+++ b/datafusion/functions-aggregate-common/README.md
@@ -0,0 +1,32 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Aggregate Function Common Library
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate contains common functionality for implementation aggregate and window functions.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-aggregate-common/benches/accumulate.rs b/datafusion/functions-aggregate-common/benches/accumulate.rs
index f422f8a2a7bfd..aceec57df9666 100644
--- a/datafusion/functions-aggregate-common/benches/accumulate.rs
+++ b/datafusion/functions-aggregate-common/benches/accumulate.rs
@@ -15,12 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, BooleanArray, Int64Array};
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate_indices;
 
 fn generate_group_indices(len: usize) -> Vec<usize> {
diff --git a/datafusion/functions-aggregate-common/src/accumulator.rs b/datafusion/functions-aggregate-common/src/accumulator.rs
index 01b16f1b0a8cc..8db0ab4133dc0 100644
--- a/datafusion/functions-aggregate-common/src/accumulator.rs
+++ b/datafusion/functions-aggregate-common/src/accumulator.rs
@@ -19,18 +19,19 @@ use arrow::datatypes::{DataType, FieldRef, Schema};
 use datafusion_common::Result;
 use datafusion_expr_common::accumulator::Accumulator;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use std::sync::Arc;
 
 /// [`AccumulatorArgs`] contains information about how an aggregate
 /// function was called, including the types of its arguments and any optional
 /// ordering expressions.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct AccumulatorArgs<'a> {
     /// The return field of the aggregate function.
     pub return_field: FieldRef,
 
-    /// The schema of the input arguments
+    /// Input schema to the aggregate function. If you need to check data type, nullability
+    /// or metadata of input arguments then you should use `expr_fields` below instead.
     pub schema: &'a Schema,
 
     /// Whether to ignore nulls.
@@ -50,9 +51,7 @@ pub struct AccumulatorArgs<'a> {
     /// ```sql
     /// SELECT FIRST_VALUE(column1 ORDER BY column2) FROM t;
     /// ```
-    ///
-    /// If no `ORDER BY` is specified, `ordering_req` will be empty.
-    pub ordering_req: &'a LexOrdering,
+    pub order_bys: &'a [PhysicalSortExpr],
 
     /// Whether the aggregation is running in reverse order
     pub is_reversed: bool,
@@ -69,6 +68,9 @@ pub struct AccumulatorArgs<'a> {
 
     /// The physical expression of arguments the aggregate function takes.
     pub exprs: &'a [Arc<dyn PhysicalExpr>],
+
+    /// Fields corresponding to each expr (same order & length).
+    pub expr_fields: &'a [FieldRef],
 }
 
 impl AccumulatorArgs<'_> {
diff --git a/datafusion/functions-aggregate-common/src/aggregate.rs b/datafusion/functions-aggregate-common/src/aggregate.rs
index c9cbaa8396fc5..aadce907e7cc3 100644
--- a/datafusion/functions-aggregate-common/src/aggregate.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate.rs
@@ -15,5 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod avg_distinct;
 pub mod count_distinct;
 pub mod groups_accumulator;
+pub mod sum_distinct;
diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs
new file mode 100644
index 0000000000000..56cdaf6618de5
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod decimal;
+mod numeric;
+
+pub use decimal::DecimalDistinctAvgAccumulator;
+pub use numeric::Float64DistinctAvgAccumulator;
diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs
new file mode 100644
index 0000000000000..0a4c1692baa84
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/decimal.rs
@@ -0,0 +1,282 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+    array::{ArrayRef, ArrowNumericType},
+    datatypes::{
+        Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, DecimalType, i256,
+    },
+};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr_common::accumulator::Accumulator;
+use std::fmt::Debug;
+use std::mem::size_of_val;
+
+use crate::aggregate::sum_distinct::DistinctSumAccumulator;
+use crate::utils::DecimalAverager;
+
+/// Generic implementation of `AVG DISTINCT` for Decimal types.
+/// Handles both all Arrow decimal types (32, 64, 128 and 256 bits).
+#[derive(Debug)]
+pub struct DecimalDistinctAvgAccumulator<T: DecimalType + Debug> {
+    sum_accumulator: DistinctSumAccumulator<T>,
+    sum_scale: i8,
+    target_precision: u8,
+    target_scale: i8,
+}
+
+impl<T: DecimalType + Debug> DecimalDistinctAvgAccumulator<T> {
+    pub fn with_decimal_params(
+        sum_scale: i8,
+        target_precision: u8,
+        target_scale: i8,
+    ) -> Self {
+        let data_type = T::TYPE_CONSTRUCTOR(T::MAX_PRECISION, sum_scale);
+
+        Self {
+            sum_accumulator: DistinctSumAccumulator::new(&data_type),
+            sum_scale,
+            target_precision,
+            target_scale,
+        }
+    }
+}
+
+impl<T: DecimalType + ArrowNumericType + Debug> Accumulator
+    for DecimalDistinctAvgAccumulator<T>
+{
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.sum_accumulator.state()
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.update_batch(values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.merge_batch(states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.sum_accumulator.distinct_count() == 0 {
+            return ScalarValue::new_primitive::<T>(
+                None,
+                &T::TYPE_CONSTRUCTOR(self.target_precision, self.target_scale),
+            );
+        }
+
+        let sum_scalar = self.sum_accumulator.evaluate()?;
+
+        match sum_scalar {
+            ScalarValue::Decimal32(Some(sum), _, _) => {
+                let decimal_averager = DecimalAverager::<Decimal32Type>::try_new(
+                    self.sum_scale,
+                    self.target_precision,
+                    self.target_scale,
+                )?;
+                let avg = decimal_averager
+                    .avg(sum, self.sum_accumulator.distinct_count() as i32)?;
+                Ok(ScalarValue::Decimal32(
+                    Some(avg),
+                    self.target_precision,
+                    self.target_scale,
+                ))
+            }
+            ScalarValue::Decimal64(Some(sum), _, _) => {
+                let decimal_averager = DecimalAverager::<Decimal64Type>::try_new(
+                    self.sum_scale,
+                    self.target_precision,
+                    self.target_scale,
+                )?;
+                let avg = decimal_averager
+                    .avg(sum, self.sum_accumulator.distinct_count() as i64)?;
+                Ok(ScalarValue::Decimal64(
+                    Some(avg),
+                    self.target_precision,
+                    self.target_scale,
+                ))
+            }
+            ScalarValue::Decimal128(Some(sum), _, _) => {
+                let decimal_averager = DecimalAverager::<Decimal128Type>::try_new(
+                    self.sum_scale,
+                    self.target_precision,
+                    self.target_scale,
+                )?;
+                let avg = decimal_averager
+                    .avg(sum, self.sum_accumulator.distinct_count() as i128)?;
+                Ok(ScalarValue::Decimal128(
+                    Some(avg),
+                    self.target_precision,
+                    self.target_scale,
+                ))
+            }
+            ScalarValue::Decimal256(Some(sum), _, _) => {
+                let decimal_averager = DecimalAverager::<Decimal256Type>::try_new(
+                    self.sum_scale,
+                    self.target_precision,
+                    self.target_scale,
+                )?;
+                // `distinct_count` returns `u64`, but `avg` expects `i256`
+                // first convert `u64` to `i128`, then convert `i128` to `i256` to avoid overflow
+                let distinct_cnt: i128 = self.sum_accumulator.distinct_count() as i128;
+                let count: i256 = i256::from_i128(distinct_cnt);
+                let avg = decimal_averager.avg(sum, count)?;
+                Ok(ScalarValue::Decimal256(
+                    Some(avg),
+                    self.target_precision,
+                    self.target_scale,
+                ))
+            }
+
+            _ => unreachable!("Unsupported decimal type: {:?}", sum_scalar),
+        }
+    }
+
+    fn size(&self) -> usize {
+        let fixed_size = size_of_val(self);
+
+        // Account for the size of the sum_accumulator with its contained values
+        fixed_size + self.sum_accumulator.size()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{
+        Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+    };
+    use std::sync::Arc;
+
+    #[test]
+    fn test_decimal32_distinct_avg_accumulator() -> Result<()> {
+        let precision = 5_u8;
+        let scale = 2_i8;
+        let array = Decimal32Array::from(vec![
+            Some(10_00),
+            Some(12_50),
+            Some(17_50),
+            Some(20_00),
+            Some(20_00),
+            Some(30_00),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(precision, scale)?;
+
+        let mut accumulator =
+            DecimalDistinctAvgAccumulator::<Decimal32Type>::with_decimal_params(
+                scale, 9, 6,
+            );
+        accumulator.update_batch(&[Arc::new(array)])?;
+
+        let result = accumulator.evaluate()?;
+        let expected_result = ScalarValue::Decimal32(Some(18000000), 9, 6);
+        assert_eq!(result, expected_result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_decimal64_distinct_avg_accumulator() -> Result<()> {
+        let precision = 10_u8;
+        let scale = 4_i8;
+        let array = Decimal64Array::from(vec![
+            Some(100_0000),
+            Some(125_0000),
+            Some(175_0000),
+            Some(200_0000),
+            Some(200_0000),
+            Some(300_0000),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(precision, scale)?;
+
+        let mut accumulator =
+            DecimalDistinctAvgAccumulator::<Decimal64Type>::with_decimal_params(
+                scale, 14, 8,
+            );
+        accumulator.update_batch(&[Arc::new(array)])?;
+
+        let result = accumulator.evaluate()?;
+        let expected_result = ScalarValue::Decimal64(Some(180_00000000), 14, 8);
+        assert_eq!(result, expected_result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_decimal128_distinct_avg_accumulator() -> Result<()> {
+        let precision = 10_u8;
+        let scale = 4_i8;
+        let array = Decimal128Array::from(vec![
+            Some(100_0000),
+            Some(125_0000),
+            Some(175_0000),
+            Some(200_0000),
+            Some(200_0000),
+            Some(300_0000),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(precision, scale)?;
+
+        let mut accumulator =
+            DecimalDistinctAvgAccumulator::<Decimal128Type>::with_decimal_params(
+                scale, 14, 8,
+            );
+        accumulator.update_batch(&[Arc::new(array)])?;
+
+        let result = accumulator.evaluate()?;
+        let expected_result = ScalarValue::Decimal128(Some(180_00000000), 14, 8);
+        assert_eq!(result, expected_result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_decimal256_distinct_avg_accumulator() -> Result<()> {
+        let precision = 50_u8;
+        let scale = 2_i8;
+
+        let array = Decimal256Array::from(vec![
+            Some(i256::from_i128(10_000)),
+            Some(i256::from_i128(12_500)),
+            Some(i256::from_i128(17_500)),
+            Some(i256::from_i128(20_000)),
+            Some(i256::from_i128(20_000)),
+            Some(i256::from_i128(30_000)),
+            None,
+            None,
+        ])
+        .with_precision_and_scale(precision, scale)?;
+
+        let mut accumulator =
+            DecimalDistinctAvgAccumulator::<Decimal256Type>::with_decimal_params(
+                scale, 54, 6,
+            );
+        accumulator.update_batch(&[Arc::new(array)])?;
+
+        let result = accumulator.evaluate()?;
+        let expected_result =
+            ScalarValue::Decimal256(Some(i256::from_i128(180_000000)), 54, 6);
+        assert_eq!(result, expected_result);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/numeric.rs b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/numeric.rs
new file mode 100644
index 0000000000000..bb43acc2614f9
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/avg_distinct/numeric.rs
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr_common::accumulator::Accumulator;
+
+use crate::aggregate::sum_distinct::DistinctSumAccumulator;
+
+/// Specialized implementation of `AVG DISTINCT` for Float64 values, leveraging
+/// the existing DistinctSumAccumulator implementation.
+#[derive(Debug)]
+pub struct Float64DistinctAvgAccumulator {
+    // We use the DistinctSumAccumulator to handle the set of distinct values
+    sum_accumulator: DistinctSumAccumulator<Float64Type>,
+}
+
+impl Default for Float64DistinctAvgAccumulator {
+    fn default() -> Self {
+        Self {
+            sum_accumulator: DistinctSumAccumulator::<Float64Type>::new(
+                &DataType::Float64,
+            ),
+        }
+    }
+}
+
+impl Accumulator for Float64DistinctAvgAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.sum_accumulator.state()
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.update_batch(values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.sum_accumulator.merge_batch(states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        // Get the sum from the DistinctSumAccumulator
+        let sum_result = self.sum_accumulator.evaluate()?;
+
+        // Extract the sum value
+        if let ScalarValue::Float64(Some(sum)) = sum_result {
+            // Get the count of distinct values
+            let count = self.sum_accumulator.distinct_count() as f64;
+            // Calculate average
+            let avg = sum / count;
+            Ok(ScalarValue::Float64(Some(avg)))
+        } else {
+            // If sum is None, return None (null)
+            Ok(ScalarValue::Float64(None))
+        }
+    }
+
+    fn size(&self) -> usize {
+        self.sum_accumulator.size()
+    }
+}
diff --git a/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs b/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
index 7d772f7c649dc..25b40382299b4 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
@@ -16,9 +16,11 @@
 // under the License.
 
 mod bytes;
+mod dict;
 mod native;
 
 pub use bytes::BytesDistinctCountAccumulator;
 pub use bytes::BytesViewDistinctCountAccumulator;
+pub use dict::DictionaryCountAccumulator;
 pub use native::FloatDistinctCountAccumulator;
 pub use native::PrimitiveDistinctCountAccumulator;
diff --git a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs
index e321df61ddc6a..6e0d55bd64372 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs
@@ -18,9 +18,9 @@
 //! [`BytesDistinctCountAccumulator`] for Utf8/LargeUtf8/Binary/LargeBinary values
 
 use arrow::array::{ArrayRef, OffsetSizeTrait};
+use datafusion_common::ScalarValue;
 use datafusion_common::cast::as_list_array;
 use datafusion_common::utils::SingleRowListArrayBuilder;
-use datafusion_common::ScalarValue;
 use datafusion_expr_common::accumulator::Accumulator;
 use datafusion_physical_expr_common::binary_map::{ArrowBytesSet, OutputType};
 use datafusion_physical_expr_common::binary_view_map::ArrowBytesViewSet;
@@ -48,7 +48,9 @@ impl<O: OffsetSizeTrait> Accumulator for BytesDistinctCountAccumulator<O> {
     fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
         let set = self.0.take();
         let arr = set.into_state();
-        Ok(vec![SingleRowListArrayBuilder::new(arr).build_list_scalar()])
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
@@ -107,7 +109,9 @@ impl Accumulator for BytesViewDistinctCountAccumulator {
     fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
         let set = self.0.take();
         let arr = set.into_state();
-        Ok(vec![SingleRowListArrayBuilder::new(arr).build_list_scalar()])
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
diff --git a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/dict.rs b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/dict.rs
new file mode 100644
index 0000000000000..d71aed3debe95
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/dict.rs
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray};
+use arrow::downcast_dictionary_array;
+use datafusion_common::internal_err;
+use datafusion_common::{ScalarValue, arrow_datafusion_err};
+use datafusion_expr_common::accumulator::Accumulator;
+
+#[derive(Debug)]
+pub struct DictionaryCountAccumulator {
+    inner: Box<dyn Accumulator>,
+}
+
+impl DictionaryCountAccumulator {
+    pub fn new(inner: Box<dyn Accumulator>) -> Self {
+        Self { inner }
+    }
+}
+
+impl Accumulator for DictionaryCountAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
+        let values: Vec<_> = values
+            .iter()
+            .map(|dict| {
+                downcast_dictionary_array! {
+                    dict => {
+                        let buff: BooleanArray = dict.occupancy().into();
+                        arrow::compute::filter(
+                            dict.values(),
+                            &buff
+                        ).map_err(|e| arrow_datafusion_err!(e))
+                    },
+                    _ => internal_err!("DictionaryCountAccumulator only supports dictionary arrays")
+                }
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        self.inner.update_batch(values.as_slice())
+    }
+
+    fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
+        self.inner.evaluate()
+    }
+
+    fn size(&self) -> usize {
+        self.inner.size()
+    }
+
+    fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
+        self.inner.state()
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> {
+        self.inner.merge_batch(states)
+    }
+}
diff --git a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
index e8b6588dc0913..e506b4acb1418 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
@@ -26,19 +26,19 @@ use std::hash::Hash;
 use std::mem::size_of_val;
 use std::sync::Arc;
 
-use ahash::RandomState;
-use arrow::array::types::ArrowPrimitiveType;
 use arrow::array::ArrayRef;
 use arrow::array::PrimitiveArray;
+use arrow::array::types::ArrowPrimitiveType;
 use arrow::datatypes::DataType;
+use datafusion_common::hash_utils::RandomState;
 
+use datafusion_common::ScalarValue;
 use datafusion_common::cast::{as_list_array, as_primitive_array};
-use datafusion_common::utils::memory::estimate_memory_size;
 use datafusion_common::utils::SingleRowListArrayBuilder;
-use datafusion_common::ScalarValue;
+use datafusion_common::utils::memory::estimate_memory_size;
 use datafusion_expr_common::accumulator::Accumulator;
 
-use crate::utils::Hashable;
+use crate::utils::GenericDistinctBuffer;
 
 #[derive(Debug)]
 pub struct PrimitiveDistinctCountAccumulator<T>
@@ -73,7 +73,9 @@ where
             PrimitiveArray::<T>::from_iter_values(self.values.iter().cloned())
                 .with_data_type(self.data_type.clone()),
         );
-        Ok(vec![SingleRowListArrayBuilder::new(arr).build_list_scalar()])
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
@@ -124,88 +126,42 @@ where
 }
 
 #[derive(Debug)]
-pub struct FloatDistinctCountAccumulator<T>
-where
-    T: ArrowPrimitiveType + Send,
-{
-    values: HashSet<Hashable<T::Native>, RandomState>,
+pub struct FloatDistinctCountAccumulator<T: ArrowPrimitiveType> {
+    values: GenericDistinctBuffer<T>,
 }
 
-impl<T> FloatDistinctCountAccumulator<T>
-where
-    T: ArrowPrimitiveType + Send,
-{
+impl<T: ArrowPrimitiveType> FloatDistinctCountAccumulator<T> {
     pub fn new() -> Self {
         Self {
-            values: HashSet::default(),
+            values: GenericDistinctBuffer::new(T::DATA_TYPE),
         }
     }
 }
 
-impl<T> Default for FloatDistinctCountAccumulator<T>
-where
-    T: ArrowPrimitiveType + Send,
-{
+impl<T: ArrowPrimitiveType> Default for FloatDistinctCountAccumulator<T> {
     fn default() -> Self {
         Self::new()
     }
 }
 
-impl<T> Accumulator for FloatDistinctCountAccumulator<T>
-where
-    T: ArrowPrimitiveType + Send + Debug,
-{
+impl<T: ArrowPrimitiveType + Debug> Accumulator for FloatDistinctCountAccumulator<T> {
     fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
-        let arr = Arc::new(PrimitiveArray::<T>::from_iter_values(
-            self.values.iter().map(|v| v.0),
-        )) as ArrayRef;
-        Ok(vec![SingleRowListArrayBuilder::new(arr).build_list_scalar()])
+        self.values.state()
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
-        if values.is_empty() {
-            return Ok(());
-        }
-
-        let arr = as_primitive_array::<T>(&values[0])?;
-        arr.iter().for_each(|value| {
-            if let Some(value) = value {
-                self.values.insert(Hashable(value));
-            }
-        });
-
-        Ok(())
+        self.values.update_batch(values)
     }
 
     fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> {
-        if states.is_empty() {
-            return Ok(());
-        }
-        assert_eq!(
-            states.len(),
-            1,
-            "count_distinct states must be single array"
-        );
-
-        let arr = as_list_array(&states[0])?;
-        arr.iter().try_for_each(|maybe_list| {
-            if let Some(list) = maybe_list {
-                let list = as_primitive_array::<T>(&list)?;
-                self.values
-                    .extend(list.values().iter().map(|v| Hashable(*v)));
-            };
-            Ok(())
-        })
+        self.values.merge_batch(states)
     }
 
     fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
-        Ok(ScalarValue::Int64(Some(self.values.len() as i64)))
+        Ok(ScalarValue::Int64(Some(self.values.values.len() as i64)))
     }
 
     fn size(&self) -> usize {
-        let num_elements = self.values.len();
-        let fixed_size = size_of_val(self) + size_of_val(&self.values);
-
-        estimate_memory_size::<T::Native>(num_elements, fixed_size).unwrap()
+        size_of_val(self) + self.values.size()
     }
 }
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
index aa2f5a586e877..ad2a21bb4733c 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
@@ -32,7 +32,7 @@ use arrow::{
     compute::take_arrays,
     datatypes::UInt32Type,
 };
-use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, arrow_datafusion_err};
 use datafusion_expr_common::accumulator::Accumulator;
 use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
 
@@ -80,15 +80,13 @@ use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
 ///  Logical group         Current Min/Max value for that group stored
 ///     number             as a ScalarValue which points to an
 ///                        individually allocated String
-///
-///```
+/// ```
 ///
 /// # Optimizations
 ///
 /// The adapter minimizes the number of calls to [`Accumulator::update_batch`]
 /// by first collecting the input rows for each group into a contiguous array
 /// using [`compute::take`]
-///
 pub struct GroupsAccumulatorAdapter {
     factory: Box<dyn Fn() -> Result<Box<dyn Accumulator>> + Send>,
 
@@ -184,7 +182,6 @@ impl GroupsAccumulatorAdapter {
     /// └─────────┘   └─────────┘   └ ─ ─ ─ ─ ┘                       └─────────┘   └ ─ ─ ─ ─ ┘
     ///
     /// logical group   values      opt_filter           logical group  values       opt_filter
-    ///
     /// ```
     fn invoke_per_accumulator<F>(
         &mut self,
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
index 987ba57f7719e..25f52df61136f 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/accumulate.rs
@@ -20,10 +20,70 @@
 //! [`GroupsAccumulator`]: datafusion_expr_common::groups_accumulator::GroupsAccumulator
 
 use arrow::array::{Array, BooleanArray, BooleanBufferBuilder, PrimitiveArray};
-use arrow::buffer::{BooleanBuffer, NullBuffer};
+use arrow::buffer::NullBuffer;
 use arrow::datatypes::ArrowPrimitiveType;
 
 use datafusion_expr_common::groups_accumulator::EmitTo;
+
+/// If the input has nulls, then the accumulator must potentially
+/// handle each input null value specially (e.g. for `SUM` to mark the
+/// corresponding sum as null)
+///
+/// If there are filters present, `NullState` tracks if it has seen
+/// *any* value for that group (as some values may be filtered
+/// out). Without a filter, the accumulator is only passed groups that
+/// had at least one value to accumulate so they do not need to track
+/// if they have seen values for a particular group.
+#[derive(Debug)]
+pub enum SeenValues {
+    /// All groups seen so far have seen at least one non-null value
+    All {
+        num_values: usize,
+    },
+    // Some groups have not yet seen a non-null value
+    Some {
+        values: BooleanBufferBuilder,
+    },
+}
+
+impl Default for SeenValues {
+    fn default() -> Self {
+        SeenValues::All { num_values: 0 }
+    }
+}
+
+impl SeenValues {
+    /// Return a mutable reference to the `BooleanBufferBuilder` in `SeenValues::Some`.
+    ///
+    /// If `self` is `SeenValues::All`, it is transitioned to `SeenValues::Some`
+    /// by creating a new `BooleanBufferBuilder` where the first `num_values` are true.
+    ///
+    /// The builder is then ensured to have at least `total_num_groups` length,
+    /// with any new entries initialized to false.
+    fn get_builder(&mut self, total_num_groups: usize) -> &mut BooleanBufferBuilder {
+        match self {
+            SeenValues::All { num_values } => {
+                let mut builder = BooleanBufferBuilder::new(total_num_groups);
+                builder.append_n(*num_values, true);
+                if total_num_groups > *num_values {
+                    builder.append_n(total_num_groups - *num_values, false);
+                }
+                *self = SeenValues::Some { values: builder };
+                match self {
+                    SeenValues::Some { values } => values,
+                    _ => unreachable!(),
+                }
+            }
+            SeenValues::Some { values } => {
+                if values.len() < total_num_groups {
+                    values.append_n(total_num_groups - values.len(), false);
+                }
+                values
+            }
+        }
+    }
+}
+
 /// Track the accumulator null state per row: if any values for that
 /// group were null and if any values have been seen at all for that group.
 ///
@@ -53,12 +113,14 @@ use datafusion_expr_common::groups_accumulator::EmitTo;
 pub struct NullState {
     /// Have we seen any non-filtered input values for `group_index`?
     ///
-    /// If `seen_values[i]` is true, have seen at least one non null
+    /// If `seen_values` is `SeenValues::Some(buffer)` and buffer\[i\] is true, have seen at least one non null
     /// value for group `i`
     ///
-    /// If `seen_values[i]` is false, have not seen any values that
+    /// If `seen_values` is `SeenValues::Some(buffer)` and buffer\[i\] is false, have not seen any values that
     /// pass the filter yet for group `i`
-    seen_values: BooleanBufferBuilder,
+    ///
+    /// If `seen_values` is `SeenValues::All`, all groups have seen at least one non null value
+    seen_values: SeenValues,
 }
 
 impl Default for NullState {
@@ -70,14 +132,16 @@ impl Default for NullState {
 impl NullState {
     pub fn new() -> Self {
         Self {
-            seen_values: BooleanBufferBuilder::new(0),
+            seen_values: SeenValues::All { num_values: 0 },
         }
     }
 
     /// return the size of all buffers allocated by this null state, not including self
     pub fn size(&self) -> usize {
-        // capacity is in bits, so convert to bytes
-        self.seen_values.capacity() / 8
+        match &self.seen_values {
+            SeenValues::All { .. } => 0,
+            SeenValues::Some { values } => values.capacity() / 8,
+        }
     }
 
     /// Invokes `value_fn(group_index, value)` for each non null, non
@@ -107,10 +171,17 @@ impl NullState {
         T: ArrowPrimitiveType + Send,
         F: FnMut(usize, T::Native) + Send,
     {
-        // ensure the seen_values is big enough (start everything at
-        // "not seen" valid)
-        let seen_values =
-            initialize_builder(&mut self.seen_values, total_num_groups, false);
+        // skip null handling if no nulls in input or accumulator
+        if let SeenValues::All { num_values } = &mut self.seen_values
+            && opt_filter.is_none()
+            && values.null_count() == 0
+        {
+            accumulate(group_indices, values, None, value_fn);
+            *num_values = total_num_groups;
+            return;
+        }
+
+        let seen_values = self.seen_values.get_builder(total_num_groups);
         accumulate(group_indices, values, opt_filter, |group_index, value| {
             seen_values.set_bit(group_index, true);
             value_fn(group_index, value);
@@ -140,10 +211,21 @@ impl NullState {
         let data = values.values();
         assert_eq!(data.len(), group_indices.len());
 
-        // ensure the seen_values is big enough (start everything at
-        // "not seen" valid)
-        let seen_values =
-            initialize_builder(&mut self.seen_values, total_num_groups, false);
+        // skip null handling if no nulls in input or accumulator
+        if let SeenValues::All { num_values } = &mut self.seen_values
+            && opt_filter.is_none()
+            && values.null_count() == 0
+        {
+            group_indices
+                .iter()
+                .zip(data.iter())
+                .for_each(|(&group_index, new_value)| value_fn(group_index, new_value));
+            *num_values = total_num_groups;
+
+            return;
+        }
+
+        let seen_values = self.seen_values.get_builder(total_num_groups);
 
         // These could be made more performant by iterating in chunks of 64 bits at a time
         match (values.null_count() > 0, opt_filter) {
@@ -195,11 +277,11 @@ impl NullState {
                     .zip(group_indices.iter())
                     .zip(values.iter())
                     .for_each(|((filter_value, &group_index), new_value)| {
-                        if let Some(true) = filter_value {
-                            if let Some(new_value) = new_value {
-                                seen_values.set_bit(group_index, true);
-                                value_fn(group_index, new_value)
-                            }
+                        if let Some(true) = filter_value
+                            && let Some(new_value) = new_value
+                        {
+                            seen_values.set_bit(group_index, true);
+                            value_fn(group_index, new_value)
                         }
                     })
             }
@@ -211,25 +293,39 @@ impl NullState {
     /// for the `emit_to` rows.
     ///
     /// resets the internal state appropriately
-    pub fn build(&mut self, emit_to: EmitTo) -> NullBuffer {
-        let nulls: BooleanBuffer = self.seen_values.finish();
-
-        let nulls = match emit_to {
-            EmitTo::All => nulls,
-            EmitTo::First(n) => {
-                // split off the first N values in seen_values
-                //
-                // TODO make this more efficient rather than two
-                // copies and bitwise manipulation
-                let first_n_null: BooleanBuffer = nulls.iter().take(n).collect();
-                // reset the existing seen buffer
-                for seen in nulls.iter().skip(n) {
-                    self.seen_values.append(seen);
+    pub fn build(&mut self, emit_to: EmitTo) -> Option<NullBuffer> {
+        match emit_to {
+            EmitTo::All => {
+                let old_seen = std::mem::take(&mut self.seen_values);
+                match old_seen {
+                    SeenValues::All { .. } => None,
+                    SeenValues::Some { mut values } => {
+                        Some(NullBuffer::new(values.finish()))
+                    }
                 }
-                first_n_null
             }
-        };
-        NullBuffer::new(nulls)
+            EmitTo::First(n) => match &mut self.seen_values {
+                SeenValues::All { num_values } => {
+                    *num_values = num_values.saturating_sub(n);
+                    None
+                }
+                SeenValues::Some { .. } => {
+                    let mut old_values = match std::mem::take(&mut self.seen_values) {
+                        SeenValues::Some { values } => values,
+                        _ => unreachable!(),
+                    };
+                    let nulls = old_values.finish();
+                    let first_n_null = nulls.slice(0, n);
+                    let remainder = nulls.slice(n, nulls.len() - n);
+                    let mut new_builder = BooleanBufferBuilder::new(remainder.len());
+                    new_builder.append_buffer(&remainder);
+                    self.seen_values = SeenValues::Some {
+                        values: new_builder,
+                    };
+                    Some(NullBuffer::new(first_n_null))
+                }
+            },
+        }
     }
 }
 
@@ -361,10 +457,10 @@ pub fn accumulate<T, F>(
                 .zip(group_indices.iter())
                 .zip(values.iter())
                 .for_each(|((filter_value, &group_index), new_value)| {
-                    if let Some(true) = filter_value {
-                        if let Some(new_value) = new_value {
-                            value_fn(group_index, new_value)
-                        }
+                    if let Some(true) = filter_value
+                        && let Some(new_value) = new_value
+                    {
+                        value_fn(group_index, new_value)
                     }
                 })
         }
@@ -577,28 +673,15 @@ pub fn accumulate_indices<F>(
     }
 }
 
-/// Ensures that `builder` contains a `BooleanBufferBuilder with at
-/// least `total_num_groups`.
-///
-/// All new entries are initialized to `default_value`
-fn initialize_builder(
-    builder: &mut BooleanBufferBuilder,
-    total_num_groups: usize,
-    default_value: bool,
-) -> &mut BooleanBufferBuilder {
-    if builder.len() < total_num_groups {
-        let new_groups = total_num_groups - builder.len();
-        builder.append_n(new_groups, default_value);
-    }
-    builder
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
 
-    use arrow::array::{Int32Array, UInt32Array};
-    use rand::{rngs::ThreadRng, Rng};
+    use arrow::{
+        array::{Int32Array, UInt32Array},
+        buffer::BooleanBuffer,
+    };
+    use rand::{Rng, rngs::ThreadRng};
     use std::collections::HashSet;
 
     #[test]
@@ -694,11 +777,7 @@ mod test {
             let values_with_nulls: Vec<Option<u32>> = (0..num_values)
                 .map(|_| {
                     let is_null = null_pct < rng.random_range(0.0..1.0);
-                    if is_null {
-                        None
-                    } else {
-                        Some(rng.random())
-                    }
+                    if is_null { None } else { Some(rng.random()) }
                 })
                 .collect();
 
@@ -760,7 +839,6 @@ mod test {
 
         /// Calls `NullState::accumulate` and `accumulate_indices` to
         /// ensure it generates the correct values.
-        ///
         fn accumulate_test(
             group_indices: &[usize],
             values: &UInt32Array,
@@ -829,27 +907,38 @@ mod test {
                         .zip(filter.iter())
                         .for_each(|((&group_index, value), is_included)| {
                             // if value passed filter
-                            if let Some(true) = is_included {
-                                if let Some(value) = value {
-                                    mock.saw_value(group_index);
-                                    expected_values.push((group_index, value));
-                                }
+                            if let Some(true) = is_included
+                                && let Some(value) = value
+                            {
+                                mock.saw_value(group_index);
+                                expected_values.push((group_index, value));
                             }
                         });
                 }
             }
 
-            assert_eq!(accumulated_values, expected_values,
-                       "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}");
-            let seen_values = null_state.seen_values.finish_cloned();
-            mock.validate_seen_values(&seen_values);
+            assert_eq!(
+                accumulated_values, expected_values,
+                "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}"
+            );
+
+            match &null_state.seen_values {
+                SeenValues::All { num_values } => {
+                    assert_eq!(*num_values, total_num_groups);
+                }
+                SeenValues::Some { values } => {
+                    let seen_values = values.finish_cloned();
+                    mock.validate_seen_values(&seen_values);
+                }
+            }
 
             // Validate the final buffer (one value per group)
             let expected_null_buffer = mock.expected_null_buffer(total_num_groups);
 
             let null_buffer = null_state.build(EmitTo::All);
-
-            assert_eq!(null_buffer, expected_null_buffer);
+            if let Some(nulls) = &null_buffer {
+                assert_eq!(*nulls, expected_null_buffer);
+            }
         }
 
         // Calls `accumulate_indices`
@@ -900,8 +989,10 @@ mod test {
                 }
             }
 
-            assert_eq!(accumulated_values, expected_values,
-                       "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}");
+            assert_eq!(
+                accumulated_values, expected_values,
+                "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}"
+            );
         }
 
         /// This is effectively a different implementation of
@@ -945,28 +1036,40 @@ mod test {
                         .zip(filter.iter())
                         .for_each(|((&group_index, value), is_included)| {
                             // if value passed filter
-                            if let Some(true) = is_included {
-                                if let Some(value) = value {
-                                    mock.saw_value(group_index);
-                                    expected_values.push((group_index, value));
-                                }
+                            if let Some(true) = is_included
+                                && let Some(value) = value
+                            {
+                                mock.saw_value(group_index);
+                                expected_values.push((group_index, value));
                             }
                         });
                 }
             }
 
-            assert_eq!(accumulated_values, expected_values,
-                       "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}");
+            assert_eq!(
+                accumulated_values, expected_values,
+                "\n\naccumulated_values:{accumulated_values:#?}\n\nexpected_values:{expected_values:#?}"
+            );
 
-            let seen_values = null_state.seen_values.finish_cloned();
-            mock.validate_seen_values(&seen_values);
+            match &null_state.seen_values {
+                SeenValues::All { num_values } => {
+                    assert_eq!(*num_values, total_num_groups);
+                }
+                SeenValues::Some { values } => {
+                    let seen_values = values.finish_cloned();
+                    mock.validate_seen_values(&seen_values);
+                }
+            }
 
             // Validate the final buffer (one value per group)
-            let expected_null_buffer = mock.expected_null_buffer(total_num_groups);
+            let expected_null_buffer = Some(mock.expected_null_buffer(total_num_groups));
 
+            let is_all_seen = matches!(null_state.seen_values, SeenValues::All { .. });
             let null_buffer = null_state.build(EmitTo::All);
 
-            assert_eq!(null_buffer, expected_null_buffer);
+            if !is_all_seen {
+                assert_eq!(null_buffer, expected_null_buffer);
+            }
         }
     }
 
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs
index 149312e5a9c0f..d1d8924a2c3e8 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs
@@ -37,7 +37,7 @@ use super::accumulate::NullState;
 #[derive(Debug)]
 pub struct BooleanGroupsAccumulator<F>
 where
-    F: Fn(bool, bool) -> bool + Send + Sync,
+    F: Fn(bool, bool) -> bool + Send + Sync + 'static,
 {
     /// values per group
     values: BooleanBufferBuilder,
@@ -55,7 +55,7 @@ where
 
 impl<F> BooleanGroupsAccumulator<F>
 where
-    F: Fn(bool, bool) -> bool + Send + Sync,
+    F: Fn(bool, bool) -> bool + Send + Sync + 'static,
 {
     pub fn new(bool_fn: F, identity: bool) -> Self {
         Self {
@@ -69,7 +69,7 @@ where
 
 impl<F> GroupsAccumulator for BooleanGroupsAccumulator<F>
 where
-    F: Fn(bool, bool) -> bool + Send + Sync,
+    F: Fn(bool, bool) -> bool + Send + Sync + 'static,
 {
     fn update_batch(
         &mut self,
@@ -120,7 +120,7 @@ where
         };
 
         let nulls = self.null_state.build(emit_to);
-        let values = BooleanArray::new(values, Some(nulls));
+        let values = BooleanArray::new(values, nulls);
         Ok(Arc::new(values))
     }
 
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs
index c8c7736bba14f..5b56b77e11d3f 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/nulls.rs
@@ -24,7 +24,7 @@ use arrow::array::{
 };
 use arrow::buffer::NullBuffer;
 use arrow::datatypes::DataType;
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{Result, not_impl_err};
 use std::sync::Arc;
 
 /// Sets the validity mask for a `PrimitiveArray` to `nulls`
@@ -44,7 +44,7 @@ pub fn set_nulls<T: ArrowNumericType + Send>(
 /// The `NullBuffer` is
 /// * `true` (representing valid) for values that were `true` in filter
 /// * `false` (representing null) for values that were `false` or `null` in filter
-fn filter_to_nulls(filter: &BooleanArray) -> Option<NullBuffer> {
+pub fn filter_to_nulls(filter: &BooleanArray) -> Option<NullBuffer> {
     let (filter_bools, filter_nulls) = filter.clone().into_parts();
     let filter_bools = NullBuffer::from(filter_bools);
     NullBuffer::union(Some(&filter_bools), filter_nulls.as_ref())
@@ -206,7 +206,7 @@ pub fn set_nulls_dyn(input: &dyn Array, nulls: Option<NullBuffer>) -> Result<Arr
             }
         }
         _ => {
-            return not_impl_err!("Applying nulls {:?}", input.data_type());
+            return not_impl_err!("Applying nulls {}", input.data_type());
         }
     };
     assert_eq!(input.len(), output.len());
diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs
index 078982c983fc7..a81b89e1e46f1 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/prim_op.rs
@@ -23,7 +23,7 @@ use arrow::buffer::NullBuffer;
 use arrow::compute;
 use arrow::datatypes::ArrowPrimitiveType;
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_datafusion_err, DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, internal_datafusion_err};
 use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
 
 use super::accumulate::NullState;
@@ -41,7 +41,7 @@ use super::accumulate::NullState;
 pub struct PrimitiveGroupsAccumulator<T, F>
 where
     T: ArrowPrimitiveType + Send,
-    F: Fn(&mut T::Native, T::Native) + Send + Sync,
+    F: Fn(&mut T::Native, T::Native) + Send + Sync + 'static,
 {
     /// values per group, stored as the native type
     values: Vec<T::Native>,
@@ -62,7 +62,7 @@ where
 impl<T, F> PrimitiveGroupsAccumulator<T, F>
 where
     T: ArrowPrimitiveType + Send,
-    F: Fn(&mut T::Native, T::Native) + Send + Sync,
+    F: Fn(&mut T::Native, T::Native) + Send + Sync + 'static,
 {
     pub fn new(data_type: &DataType, prim_fn: F) -> Self {
         Self {
@@ -84,7 +84,7 @@ where
 impl<T, F> GroupsAccumulator for PrimitiveGroupsAccumulator<T, F>
 where
     T: ArrowPrimitiveType + Send,
-    F: Fn(&mut T::Native, T::Native) + Send + Sync,
+    F: Fn(&mut T::Native, T::Native) + Send + Sync + 'static,
 {
     fn update_batch(
         &mut self,
@@ -106,7 +106,8 @@ where
             opt_filter,
             total_num_groups,
             |group_index, new_value| {
-                let value = &mut self.values[group_index];
+                // SAFETY: group_index is guaranteed to be in bounds
+                let value = unsafe { self.values.get_unchecked_mut(group_index) };
                 (self.prim_fn)(value, new_value);
             },
         );
@@ -117,7 +118,7 @@ where
     fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
         let values = emit_to.take_needed(&mut self.values);
         let nulls = self.null_state.build(emit_to);
-        let values = PrimitiveArray::<T>::new(values.into(), Some(nulls)) // no copy
+        let values = PrimitiveArray::<T>::new(values.into(), nulls) // no copy
             .with_data_type(self.data_type.clone());
         Ok(Arc::new(values))
     }
@@ -142,7 +143,6 @@ where
     /// The state is:
     /// - self.prim_fn for all non null, non filtered values
     /// - null otherwise
-    ///
     fn convert_to_state(
         &self,
         values: &[ArrayRef],
diff --git a/datafusion/functions-aggregate-common/src/aggregate/sum_distinct.rs b/datafusion/functions-aggregate-common/src/aggregate/sum_distinct.rs
new file mode 100644
index 0000000000000..932bfba0bf0dc
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/sum_distinct.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sum distinct accumulator implementations
+
+pub mod numeric;
+
+pub use numeric::DistinctSumAccumulator;
diff --git a/datafusion/functions-aggregate-common/src/aggregate/sum_distinct/numeric.rs b/datafusion/functions-aggregate-common/src/aggregate/sum_distinct/numeric.rs
new file mode 100644
index 0000000000000..e5a23597c44ad
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/aggregate/sum_distinct/numeric.rs
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines the accumulator for `SUM DISTINCT` for primitive numeric types
+
+use std::fmt::Debug;
+use std::mem::size_of_val;
+
+use arrow::array::ArrayRef;
+use arrow::array::ArrowNativeTypeOp;
+use arrow::array::ArrowPrimitiveType;
+use arrow::datatypes::ArrowNativeType;
+use arrow::datatypes::DataType;
+
+use datafusion_common::Result;
+use datafusion_common::ScalarValue;
+use datafusion_expr_common::accumulator::Accumulator;
+
+use crate::utils::GenericDistinctBuffer;
+
+/// Accumulator for computing SUM(DISTINCT expr)
+#[derive(Debug)]
+pub struct DistinctSumAccumulator<T: ArrowPrimitiveType> {
+    values: GenericDistinctBuffer<T>,
+    data_type: DataType,
+}
+
+impl<T: ArrowPrimitiveType> DistinctSumAccumulator<T> {
+    pub fn new(data_type: &DataType) -> Self {
+        Self {
+            values: GenericDistinctBuffer::new(data_type.clone()),
+            data_type: data_type.clone(),
+        }
+    }
+
+    pub fn distinct_count(&self) -> usize {
+        self.values.values.len()
+    }
+}
+
+impl<T: ArrowPrimitiveType + Debug> Accumulator for DistinctSumAccumulator<T> {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.values.state()
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.values.update_batch(values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.values.merge_batch(states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.distinct_count() == 0 {
+            ScalarValue::new_primitive::<T>(None, &self.data_type)
+        } else {
+            let mut acc = T::Native::usize_as(0);
+            for distinct_value in self.values.values.iter() {
+                acc = acc.add_wrapping(distinct_value.0)
+            }
+            ScalarValue::new_primitive::<T>(Some(acc), &self.data_type)
+        }
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + self.values.size()
+    }
+}
diff --git a/datafusion/functions-aggregate-common/src/lib.rs b/datafusion/functions-aggregate-common/src/lib.rs
index da718e7ceefe6..574d160d4214a 100644
--- a/datafusion/functions-aggregate-common/src/lib.rs
+++ b/datafusion/functions-aggregate-common/src/lib.rs
@@ -26,14 +26,17 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 pub mod accumulator;
 pub mod aggregate;
 pub mod merge_arrays;
+pub mod min_max;
+pub mod noop_accumulator;
 pub mod order;
 pub mod stats;
 pub mod tdigest;
diff --git a/datafusion/functions-aggregate-common/src/merge_arrays.rs b/datafusion/functions-aggregate-common/src/merge_arrays.rs
index 0cfea662497e1..fc96931853b9d 100644
--- a/datafusion/functions-aggregate-common/src/merge_arrays.rs
+++ b/datafusion/functions-aggregate-common/src/merge_arrays.rs
@@ -17,7 +17,7 @@
 
 use arrow::compute::SortOptions;
 use datafusion_common::utils::compare_rows;
-use datafusion_common::{exec_err, ScalarValue};
+use datafusion_common::{ScalarValue, exec_err};
 use std::cmp::Ordering;
 use std::collections::{BinaryHeap, VecDeque};
 
@@ -67,6 +67,7 @@ impl<'a> CustomElement<'a> {
 // - When used inside `BinaryHeap` it is a min-heap.
 impl Ord for CustomElement<'_> {
     fn cmp(&self, other: &Self) -> Ordering {
+        // TODO Ord/PartialOrd is not consistent with PartialEq; PartialOrd contract is violated
         // Compares according to custom ordering
         self.ordering(&self.ordering, &other.ordering)
             // Convert max heap to min heap
@@ -86,7 +87,7 @@ impl PartialOrd for CustomElement<'_> {
 
 /// This functions merges `values` array (`&[Vec<ScalarValue>]`) into single array `Vec<ScalarValue>`
 /// Merging done according to ordering values stored inside `ordering_values` (`&[Vec<Vec<ScalarValue>>]`)
-/// Inner `Vec<ScalarValue>` in the `ordering_values` can be thought as ordering information for the
+/// Inner `Vec<ScalarValue>` in the `ordering_values` can be thought as ordering information for
 /// each `ScalarValue` in the `values` array.
 /// Desired ordering specified by `sort_options` argument (Should have same size with inner `Vec<ScalarValue>`
 /// of the `ordering_values` array).
@@ -118,17 +119,27 @@ pub fn merge_ordered_arrays(
     // Defines according to which ordering comparisons should be done.
     sort_options: &[SortOptions],
 ) -> datafusion_common::Result<(Vec<ScalarValue>, Vec<Vec<ScalarValue>>)> {
-    // Keep track the most recent data of each branch, in binary heap data structure.
+    // Keep track of the most recent data of each branch, in a binary heap data structure.
     let mut heap = BinaryHeap::<CustomElement>::new();
 
-    if values.len() != ordering_values.len()
-        || values
-            .iter()
-            .zip(ordering_values.iter())
-            .any(|(vals, ordering_vals)| vals.len() != ordering_vals.len())
+    if values.len() != ordering_values.len() {
+        return exec_err!(
+            "Expects values and ordering_values to have same size but got {} and {}",
+            values.len(),
+            ordering_values.len()
+        );
+    }
+    if let Some((idx, (values, ordering_values))) = values
+        .iter()
+        .zip(ordering_values.iter())
+        .enumerate()
+        .find(|(_, (vals, ordering_vals))| vals.len() != ordering_vals.len())
     {
         return exec_err!(
-            "Expects values arguments and/or ordering_values arguments to have same size"
+            "Expects values elements and ordering_values elements to have same size but got {} and {} at index {}",
+            values.len(),
+            ordering_values.len(),
+            idx
         );
     }
     let n_branch = values.len();
diff --git a/datafusion/functions-aggregate-common/src/min_max.rs b/datafusion/functions-aggregate-common/src/min_max.rs
new file mode 100644
index 0000000000000..27620221cf23c
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/min_max.rs
@@ -0,0 +1,856 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Basic min/max functionality shared across DataFusion aggregate functions
+
+use arrow::array::{
+    ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
+    Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+    DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray,
+    DurationSecondArray, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array,
+    Int8Array, Int16Array, Int32Array, Int64Array, IntervalDayTimeArray,
+    IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray,
+    LargeStringArray, StringArray, StringViewArray, Time32MillisecondArray,
+    Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
+    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+    TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
+};
+use arrow::compute;
+use arrow::datatypes::{DataType, IntervalUnit, TimeUnit};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, downcast_value, internal_err,
+};
+use datafusion_expr_common::accumulator::Accumulator;
+use std::{cmp::Ordering, mem::size_of_val};
+
+// min/max of two non-string scalar values.
+macro_rules! typed_min_max {
+    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident $(, $EXTRA_ARGS:ident)*) => {{
+        ScalarValue::$SCALAR(
+            match ($VALUE, $DELTA) {
+                (None, None) => None,
+                (Some(a), None) => Some(*a),
+                (None, Some(b)) => Some(*b),
+                (Some(a), Some(b)) => Some((*a).$OP(*b)),
+            },
+            $($EXTRA_ARGS.clone()),*
+        )
+    }};
+}
+
+macro_rules! typed_min_max_float {
+    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{
+        ScalarValue::$SCALAR(match ($VALUE, $DELTA) {
+            (None, None) => None,
+            (Some(a), None) => Some(*a),
+            (None, Some(b)) => Some(*b),
+            (Some(a), Some(b)) => match a.total_cmp(b) {
+                choose_min_max!($OP) => Some(*b),
+                _ => Some(*a),
+            },
+        })
+    }};
+}
+
+// min/max of two scalar string values.
+macro_rules! typed_min_max_string {
+    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{
+        ScalarValue::$SCALAR(match ($VALUE, $DELTA) {
+            (None, None) => None,
+            (Some(a), None) => Some(a.clone()),
+            (None, Some(b)) => Some(b.clone()),
+            (Some(a), Some(b)) => Some((a).$OP(b).clone()),
+        })
+    }};
+}
+
+// min/max of two scalar string values with a prefix argument.
+macro_rules! typed_min_max_string_arg {
+    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident, $ARG:expr) => {{
+        ScalarValue::$SCALAR(
+            $ARG,
+            match ($VALUE, $DELTA) {
+                (None, None) => None,
+                (Some(a), None) => Some(a.clone()),
+                (None, Some(b)) => Some(b.clone()),
+                (Some(a), Some(b)) => Some((a).$OP(b).clone()),
+            },
+        )
+    }};
+}
+
+macro_rules! choose_min_max {
+    (min) => {
+        std::cmp::Ordering::Greater
+    };
+    (max) => {
+        std::cmp::Ordering::Less
+    };
+}
+
+macro_rules! interval_min_max {
+    ($OP:tt, $LHS:expr, $RHS:expr) => {{
+        match $LHS.partial_cmp(&$RHS) {
+            Some(choose_min_max!($OP)) => $RHS.clone(),
+            Some(_) => $LHS.clone(),
+            None => {
+                return internal_err!(
+                    "Comparison error while computing interval min/max"
+                );
+            }
+        }
+    }};
+}
+
+macro_rules! min_max_generic {
+    ($VALUE:expr, $DELTA:expr, $OP:ident) => {{
+        if $VALUE.is_null() {
+            let mut delta_copy = $DELTA.clone();
+            // When the new value won we want to compact it to
+            // avoid storing the entire input
+            delta_copy.compact();
+            delta_copy
+        } else if $DELTA.is_null() {
+            $VALUE.clone()
+        } else {
+            match $VALUE.partial_cmp(&$DELTA) {
+                Some(choose_min_max!($OP)) => {
+                    // When the new value won we want to compact it to
+                    // avoid storing the entire input
+                    let mut delta_copy = $DELTA.clone();
+                    delta_copy.compact();
+                    delta_copy
+                }
+                _ => $VALUE.clone(),
+            }
+        }
+    }};
+}
+
+// min/max of two scalar values of the same type
+macro_rules! min_max {
+    ($VALUE:expr, $DELTA:expr, $OP:ident) => {{
+        Ok(match ($VALUE, $DELTA) {
+            (ScalarValue::Null, ScalarValue::Null) => ScalarValue::Null,
+            (
+                lhs @ ScalarValue::Decimal32(lhsv, lhsp, lhss),
+                rhs @ ScalarValue::Decimal32(rhsv, rhsp, rhss)
+            ) => {
+                if lhsp.eq(rhsp) && lhss.eq(rhss) {
+                    typed_min_max!(lhsv, rhsv, Decimal32, $OP, lhsp, lhss)
+                } else {
+                    return internal_err!(
+                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
+                    (lhs, rhs)
+                );
+                }
+            }
+            (
+                lhs @ ScalarValue::Decimal64(lhsv, lhsp, lhss),
+                rhs @ ScalarValue::Decimal64(rhsv, rhsp, rhss)
+            ) => {
+                if lhsp.eq(rhsp) && lhss.eq(rhss) {
+                    typed_min_max!(lhsv, rhsv, Decimal64, $OP, lhsp, lhss)
+                } else {
+                    return internal_err!(
+                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
+                    (lhs, rhs)
+                );
+                }
+            }
+            (
+                lhs @ ScalarValue::Decimal128(lhsv, lhsp, lhss),
+                rhs @ ScalarValue::Decimal128(rhsv, rhsp, rhss)
+            ) => {
+                if lhsp.eq(rhsp) && lhss.eq(rhss) {
+                    typed_min_max!(lhsv, rhsv, Decimal128, $OP, lhsp, lhss)
+                } else {
+                    return internal_err!(
+                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
+                    (lhs, rhs)
+                );
+                }
+            }
+            (
+                lhs @ ScalarValue::Decimal256(lhsv, lhsp, lhss),
+                rhs @ ScalarValue::Decimal256(rhsv, rhsp, rhss)
+            ) => {
+                if lhsp.eq(rhsp) && lhss.eq(rhss) {
+                    typed_min_max!(lhsv, rhsv, Decimal256, $OP, lhsp, lhss)
+                } else {
+                    return internal_err!(
+                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
+                    (lhs, rhs)
+                );
+                }
+            }
+            (ScalarValue::Boolean(lhs), ScalarValue::Boolean(rhs)) => {
+                typed_min_max!(lhs, rhs, Boolean, $OP)
+            }
+            (ScalarValue::Float64(lhs), ScalarValue::Float64(rhs)) => {
+                typed_min_max_float!(lhs, rhs, Float64, $OP)
+            }
+            (ScalarValue::Float32(lhs), ScalarValue::Float32(rhs)) => {
+                typed_min_max_float!(lhs, rhs, Float32, $OP)
+            }
+            (ScalarValue::Float16(lhs), ScalarValue::Float16(rhs)) => {
+                typed_min_max_float!(lhs, rhs, Float16, $OP)
+            }
+            (ScalarValue::UInt64(lhs), ScalarValue::UInt64(rhs)) => {
+                typed_min_max!(lhs, rhs, UInt64, $OP)
+            }
+            (ScalarValue::UInt32(lhs), ScalarValue::UInt32(rhs)) => {
+                typed_min_max!(lhs, rhs, UInt32, $OP)
+            }
+            (ScalarValue::UInt16(lhs), ScalarValue::UInt16(rhs)) => {
+                typed_min_max!(lhs, rhs, UInt16, $OP)
+            }
+            (ScalarValue::UInt8(lhs), ScalarValue::UInt8(rhs)) => {
+                typed_min_max!(lhs, rhs, UInt8, $OP)
+            }
+            (ScalarValue::Int64(lhs), ScalarValue::Int64(rhs)) => {
+                typed_min_max!(lhs, rhs, Int64, $OP)
+            }
+            (ScalarValue::Int32(lhs), ScalarValue::Int32(rhs)) => {
+                typed_min_max!(lhs, rhs, Int32, $OP)
+            }
+            (ScalarValue::Int16(lhs), ScalarValue::Int16(rhs)) => {
+                typed_min_max!(lhs, rhs, Int16, $OP)
+            }
+            (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => {
+                typed_min_max!(lhs, rhs, Int8, $OP)
+            }
+            (ScalarValue::Utf8(lhs), ScalarValue::Utf8(rhs)) => {
+                typed_min_max_string!(lhs, rhs, Utf8, $OP)
+            }
+            (ScalarValue::LargeUtf8(lhs), ScalarValue::LargeUtf8(rhs)) => {
+                typed_min_max_string!(lhs, rhs, LargeUtf8, $OP)
+            }
+            (ScalarValue::Utf8View(lhs), ScalarValue::Utf8View(rhs)) => {
+                typed_min_max_string!(lhs, rhs, Utf8View, $OP)
+            }
+            (ScalarValue::Binary(lhs), ScalarValue::Binary(rhs)) => {
+                typed_min_max_string!(lhs, rhs, Binary, $OP)
+            }
+            (ScalarValue::LargeBinary(lhs), ScalarValue::LargeBinary(rhs)) => {
+                typed_min_max_string!(lhs, rhs, LargeBinary, $OP)
+            }
+            (ScalarValue::FixedSizeBinary(lsize, lhs), ScalarValue::FixedSizeBinary(rsize, rhs)) => {
+                if lsize == rsize {
+                    typed_min_max_string_arg!(lhs, rhs, FixedSizeBinary, $OP, *lsize)
+                }
+                else {
+                    return internal_err!(
+                        "MIN/MAX is not expected to receive FixedSizeBinary of incompatible sizes {:?}",
+                        (lsize, rsize))
+                }
+            }
+            (ScalarValue::BinaryView(lhs), ScalarValue::BinaryView(rhs)) => {
+                typed_min_max_string!(lhs, rhs, BinaryView, $OP)
+            }
+            (ScalarValue::TimestampSecond(lhs, l_tz), ScalarValue::TimestampSecond(rhs, _)) => {
+                typed_min_max!(lhs, rhs, TimestampSecond, $OP, l_tz)
+            }
+            (
+                ScalarValue::TimestampMillisecond(lhs, l_tz),
+                ScalarValue::TimestampMillisecond(rhs, _),
+            ) => {
+                typed_min_max!(lhs, rhs, TimestampMillisecond, $OP, l_tz)
+            }
+            (
+                ScalarValue::TimestampMicrosecond(lhs, l_tz),
+                ScalarValue::TimestampMicrosecond(rhs, _),
+            ) => {
+                typed_min_max!(lhs, rhs, TimestampMicrosecond, $OP, l_tz)
+            }
+            (
+                ScalarValue::TimestampNanosecond(lhs, l_tz),
+                ScalarValue::TimestampNanosecond(rhs, _),
+            ) => {
+                typed_min_max!(lhs, rhs, TimestampNanosecond, $OP, l_tz)
+            }
+            (
+                ScalarValue::Date32(lhs),
+                ScalarValue::Date32(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Date32, $OP)
+            }
+            (
+                ScalarValue::Date64(lhs),
+                ScalarValue::Date64(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Date64, $OP)
+            }
+            (
+                ScalarValue::Time32Second(lhs),
+                ScalarValue::Time32Second(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Time32Second, $OP)
+            }
+            (
+                ScalarValue::Time32Millisecond(lhs),
+                ScalarValue::Time32Millisecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Time32Millisecond, $OP)
+            }
+            (
+                ScalarValue::Time64Microsecond(lhs),
+                ScalarValue::Time64Microsecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Time64Microsecond, $OP)
+            }
+            (
+                ScalarValue::Time64Nanosecond(lhs),
+                ScalarValue::Time64Nanosecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, Time64Nanosecond, $OP)
+            }
+            (
+                ScalarValue::IntervalYearMonth(lhs),
+                ScalarValue::IntervalYearMonth(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, IntervalYearMonth, $OP)
+            }
+            (
+                ScalarValue::IntervalMonthDayNano(lhs),
+                ScalarValue::IntervalMonthDayNano(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, IntervalMonthDayNano, $OP)
+            }
+            (
+                ScalarValue::IntervalDayTime(lhs),
+                ScalarValue::IntervalDayTime(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, IntervalDayTime, $OP)
+            }
+            (
+                ScalarValue::IntervalYearMonth(_),
+                ScalarValue::IntervalMonthDayNano(_),
+            ) | (
+                ScalarValue::IntervalYearMonth(_),
+                ScalarValue::IntervalDayTime(_),
+            ) | (
+                ScalarValue::IntervalMonthDayNano(_),
+                ScalarValue::IntervalDayTime(_),
+            ) | (
+                ScalarValue::IntervalMonthDayNano(_),
+                ScalarValue::IntervalYearMonth(_),
+            ) | (
+                ScalarValue::IntervalDayTime(_),
+                ScalarValue::IntervalYearMonth(_),
+            ) | (
+                ScalarValue::IntervalDayTime(_),
+                ScalarValue::IntervalMonthDayNano(_),
+            ) => {
+                interval_min_max!($OP, $VALUE, $DELTA)
+            }
+                    (
+                ScalarValue::DurationSecond(lhs),
+                ScalarValue::DurationSecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, DurationSecond, $OP)
+            }
+                                (
+                ScalarValue::DurationMillisecond(lhs),
+                ScalarValue::DurationMillisecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, DurationMillisecond, $OP)
+            }
+                                (
+                ScalarValue::DurationMicrosecond(lhs),
+                ScalarValue::DurationMicrosecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, DurationMicrosecond, $OP)
+            }
+                                        (
+                ScalarValue::DurationNanosecond(lhs),
+                ScalarValue::DurationNanosecond(rhs),
+            ) => {
+                typed_min_max!(lhs, rhs, DurationNanosecond, $OP)
+            }
+
+            (
+                lhs @ ScalarValue::Struct(_),
+                rhs @ ScalarValue::Struct(_),
+            ) => {
+                min_max_generic!(lhs, rhs, $OP)
+            }
+
+            (
+                lhs @ ScalarValue::List(_),
+                rhs @ ScalarValue::List(_),
+            ) => {
+                min_max_generic!(lhs, rhs, $OP)
+            }
+
+
+            (
+                lhs @ ScalarValue::LargeList(_),
+                rhs @ ScalarValue::LargeList(_),
+            ) => {
+                min_max_generic!(lhs, rhs, $OP)
+            }
+
+
+            (
+                lhs @ ScalarValue::FixedSizeList(_),
+                rhs @ ScalarValue::FixedSizeList(_),
+            ) => {
+                min_max_generic!(lhs, rhs, $OP)
+            }
+
+            e => {
+                return internal_err!(
+                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
+                    e
+                )
+            }
+        })
+    }};
+}
+
+/// An accumulator to compute the maximum value
+#[derive(Debug, Clone)]
+pub struct MaxAccumulator {
+    max: ScalarValue,
+}
+
+impl MaxAccumulator {
+    /// new max accumulator
+    pub fn try_new(datatype: &DataType) -> Result<Self> {
+        Ok(Self {
+            max: ScalarValue::try_from(datatype)?,
+        })
+    }
+}
+
+impl Accumulator for MaxAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let values = &values[0];
+        let delta = &max_batch(values)?;
+        let new_max: Result<ScalarValue, DataFusionError> =
+            min_max!(&self.max, delta, max);
+        self.max = new_max?;
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.update_batch(states)
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![self.evaluate()?])
+    }
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.max.clone())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) - size_of_val(&self.max) + self.max.size()
+    }
+}
+
+/// An accumulator to compute the minimum value
+#[derive(Debug, Clone)]
+pub struct MinAccumulator {
+    min: ScalarValue,
+}
+
+impl MinAccumulator {
+    /// new min accumulator
+    pub fn try_new(datatype: &DataType) -> Result<Self> {
+        Ok(Self {
+            min: ScalarValue::try_from(datatype)?,
+        })
+    }
+}
+
+impl Accumulator for MinAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![self.evaluate()?])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let values = &values[0];
+        let delta = &min_batch(values)?;
+        let new_min: Result<ScalarValue, DataFusionError> =
+            min_max!(&self.min, delta, min);
+        self.min = new_min?;
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.update_batch(states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.min.clone())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) - size_of_val(&self.min) + self.min.size()
+    }
+}
+
+// Statically-typed version of min/max(array) -> ScalarValue for string types
+macro_rules! typed_min_max_batch_string {
+    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
+        let array = downcast_value!($VALUES, $ARRAYTYPE);
+        let value = compute::$OP(array);
+        let value = value.and_then(|e| Some(e.to_string()));
+        ScalarValue::$SCALAR(value)
+    }};
+}
+
+// Statically-typed version of min/max(array) -> ScalarValue for binary types.
+macro_rules! typed_min_max_batch_binary {
+    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
+        let array = downcast_value!($VALUES, $ARRAYTYPE);
+        let value = compute::$OP(array);
+        let value = value.and_then(|e| Some(e.to_vec()));
+        ScalarValue::$SCALAR(value)
+    }};
+}
+
+// Statically-typed version of min/max(array) -> ScalarValue for non-string types.
+macro_rules! typed_min_max_batch {
+    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident $(, $EXTRA_ARGS:ident)*) => {{
+        let array = downcast_value!($VALUES, $ARRAYTYPE);
+        let value = compute::$OP(array);
+        ScalarValue::$SCALAR(value, $($EXTRA_ARGS.clone()),*)
+    }};
+}
+
+// Statically-typed version of min/max(array) -> ScalarValue  for non-string types.
+// this is a macro to support both operations (min and max).
+macro_rules! min_max_batch {
+    ($VALUES:expr, $OP:ident) => {{
+        match $VALUES.data_type() {
+            DataType::Null => ScalarValue::Null,
+            DataType::Decimal32(precision, scale) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Decimal32Array,
+                    Decimal32,
+                    $OP,
+                    precision,
+                    scale
+                )
+            }
+            DataType::Decimal64(precision, scale) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Decimal64Array,
+                    Decimal64,
+                    $OP,
+                    precision,
+                    scale
+                )
+            }
+            DataType::Decimal128(precision, scale) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Decimal128Array,
+                    Decimal128,
+                    $OP,
+                    precision,
+                    scale
+                )
+            }
+            DataType::Decimal256(precision, scale) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Decimal256Array,
+                    Decimal256,
+                    $OP,
+                    precision,
+                    scale
+                )
+            }
+            // all types that have a natural order
+            DataType::Float64 => {
+                typed_min_max_batch!($VALUES, Float64Array, Float64, $OP)
+            }
+            DataType::Float32 => {
+                typed_min_max_batch!($VALUES, Float32Array, Float32, $OP)
+            }
+            DataType::Float16 => {
+                typed_min_max_batch!($VALUES, Float16Array, Float16, $OP)
+            }
+            DataType::Int64 => typed_min_max_batch!($VALUES, Int64Array, Int64, $OP),
+            DataType::Int32 => typed_min_max_batch!($VALUES, Int32Array, Int32, $OP),
+            DataType::Int16 => typed_min_max_batch!($VALUES, Int16Array, Int16, $OP),
+            DataType::Int8 => typed_min_max_batch!($VALUES, Int8Array, Int8, $OP),
+            DataType::UInt64 => typed_min_max_batch!($VALUES, UInt64Array, UInt64, $OP),
+            DataType::UInt32 => typed_min_max_batch!($VALUES, UInt32Array, UInt32, $OP),
+            DataType::UInt16 => typed_min_max_batch!($VALUES, UInt16Array, UInt16, $OP),
+            DataType::UInt8 => typed_min_max_batch!($VALUES, UInt8Array, UInt8, $OP),
+            DataType::Timestamp(TimeUnit::Second, tz_opt) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    TimestampSecondArray,
+                    TimestampSecond,
+                    $OP,
+                    tz_opt
+                )
+            }
+            DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => typed_min_max_batch!(
+                $VALUES,
+                TimestampMillisecondArray,
+                TimestampMillisecond,
+                $OP,
+                tz_opt
+            ),
+            DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => typed_min_max_batch!(
+                $VALUES,
+                TimestampMicrosecondArray,
+                TimestampMicrosecond,
+                $OP,
+                tz_opt
+            ),
+            DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => typed_min_max_batch!(
+                $VALUES,
+                TimestampNanosecondArray,
+                TimestampNanosecond,
+                $OP,
+                tz_opt
+            ),
+            DataType::Date32 => typed_min_max_batch!($VALUES, Date32Array, Date32, $OP),
+            DataType::Date64 => typed_min_max_batch!($VALUES, Date64Array, Date64, $OP),
+            DataType::Time32(TimeUnit::Second) => {
+                typed_min_max_batch!($VALUES, Time32SecondArray, Time32Second, $OP)
+            }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Time32MillisecondArray,
+                    Time32Millisecond,
+                    $OP
+                )
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Time64MicrosecondArray,
+                    Time64Microsecond,
+                    $OP
+                )
+            }
+            DataType::Time64(TimeUnit::Nanosecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    Time64NanosecondArray,
+                    Time64Nanosecond,
+                    $OP
+                )
+            }
+            DataType::Interval(IntervalUnit::YearMonth) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    IntervalYearMonthArray,
+                    IntervalYearMonth,
+                    $OP
+                )
+            }
+            DataType::Interval(IntervalUnit::DayTime) => {
+                typed_min_max_batch!($VALUES, IntervalDayTimeArray, IntervalDayTime, $OP)
+            }
+            DataType::Interval(IntervalUnit::MonthDayNano) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    IntervalMonthDayNanoArray,
+                    IntervalMonthDayNano,
+                    $OP
+                )
+            }
+            DataType::Duration(TimeUnit::Second) => {
+                typed_min_max_batch!($VALUES, DurationSecondArray, DurationSecond, $OP)
+            }
+            DataType::Duration(TimeUnit::Millisecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    DurationMillisecondArray,
+                    DurationMillisecond,
+                    $OP
+                )
+            }
+            DataType::Duration(TimeUnit::Microsecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    DurationMicrosecondArray,
+                    DurationMicrosecond,
+                    $OP
+                )
+            }
+            DataType::Duration(TimeUnit::Nanosecond) => {
+                typed_min_max_batch!(
+                    $VALUES,
+                    DurationNanosecondArray,
+                    DurationNanosecond,
+                    $OP
+                )
+            }
+            other => {
+                // This should have been handled before
+                return datafusion_common::internal_err!(
+                    "Min/Max accumulator not implemented for type {}",
+                    other
+                );
+            }
+        }
+    }};
+}
+
+/// dynamically-typed min(array) -> ScalarValue
+pub fn min_batch(values: &ArrayRef) -> Result<ScalarValue> {
+    Ok(match values.data_type() {
+        DataType::Utf8 => {
+            typed_min_max_batch_string!(values, StringArray, Utf8, min_string)
+        }
+        DataType::LargeUtf8 => {
+            typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, min_string)
+        }
+        DataType::Utf8View => {
+            typed_min_max_batch_string!(
+                values,
+                StringViewArray,
+                Utf8View,
+                min_string_view
+            )
+        }
+        DataType::Boolean => {
+            typed_min_max_batch!(values, BooleanArray, Boolean, min_boolean)
+        }
+        DataType::Binary => {
+            typed_min_max_batch_binary!(&values, BinaryArray, Binary, min_binary)
+        }
+        DataType::LargeBinary => {
+            typed_min_max_batch_binary!(
+                &values,
+                LargeBinaryArray,
+                LargeBinary,
+                min_binary
+            )
+        }
+        DataType::FixedSizeBinary(size) => {
+            let array = downcast_value!(&values, FixedSizeBinaryArray);
+            let value = compute::min_fixed_size_binary(array);
+            let value = value.map(|e| e.to_vec());
+            ScalarValue::FixedSizeBinary(*size, value)
+        }
+        DataType::BinaryView => {
+            typed_min_max_batch_binary!(
+                &values,
+                BinaryViewArray,
+                BinaryView,
+                min_binary_view
+            )
+        }
+        DataType::Struct(_) => min_max_batch_generic(values, Ordering::Greater)?,
+        DataType::List(_) => min_max_batch_generic(values, Ordering::Greater)?,
+        DataType::LargeList(_) => min_max_batch_generic(values, Ordering::Greater)?,
+        DataType::FixedSizeList(_, _) => {
+            min_max_batch_generic(values, Ordering::Greater)?
+        }
+        DataType::Dictionary(_, _) => {
+            let values = values.as_any_dictionary().values();
+            min_batch(values)?
+        }
+        _ => min_max_batch!(values, min),
+    })
+}
+
+/// Generic min/max implementation for complex types
+fn min_max_batch_generic(array: &ArrayRef, ordering: Ordering) -> Result<ScalarValue> {
+    if array.len() == array.null_count() {
+        return ScalarValue::try_from(array.data_type());
+    }
+    let mut extreme = ScalarValue::try_from_array(array, 0)?;
+    for i in 1..array.len() {
+        let current = ScalarValue::try_from_array(array, i)?;
+        if current.is_null() {
+            continue;
+        }
+        if extreme.is_null() {
+            extreme = current;
+            continue;
+        }
+        let cmp = extreme.try_cmp(&current)?;
+        if cmp == ordering {
+            extreme = current;
+        }
+    }
+
+    Ok(extreme)
+}
+
+/// dynamically-typed max(array) -> ScalarValue
+pub fn max_batch(values: &ArrayRef) -> Result<ScalarValue> {
+    Ok(match values.data_type() {
+        DataType::Utf8 => {
+            typed_min_max_batch_string!(values, StringArray, Utf8, max_string)
+        }
+        DataType::LargeUtf8 => {
+            typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, max_string)
+        }
+        DataType::Utf8View => {
+            typed_min_max_batch_string!(
+                values,
+                StringViewArray,
+                Utf8View,
+                max_string_view
+            )
+        }
+        DataType::Boolean => {
+            typed_min_max_batch!(values, BooleanArray, Boolean, max_boolean)
+        }
+        DataType::Binary => {
+            typed_min_max_batch_binary!(&values, BinaryArray, Binary, max_binary)
+        }
+        DataType::BinaryView => {
+            typed_min_max_batch_binary!(
+                &values,
+                BinaryViewArray,
+                BinaryView,
+                max_binary_view
+            )
+        }
+        DataType::LargeBinary => {
+            typed_min_max_batch_binary!(
+                &values,
+                LargeBinaryArray,
+                LargeBinary,
+                max_binary
+            )
+        }
+        DataType::FixedSizeBinary(size) => {
+            let array = downcast_value!(&values, FixedSizeBinaryArray);
+            let value = compute::max_fixed_size_binary(array);
+            let value = value.map(|e| e.to_vec());
+            ScalarValue::FixedSizeBinary(*size, value)
+        }
+        DataType::Struct(_) => min_max_batch_generic(values, Ordering::Less)?,
+        DataType::List(_) => min_max_batch_generic(values, Ordering::Less)?,
+        DataType::LargeList(_) => min_max_batch_generic(values, Ordering::Less)?,
+        DataType::FixedSizeList(_, _) => min_max_batch_generic(values, Ordering::Less)?,
+        DataType::Dictionary(_, _) => {
+            let values = values.as_any_dictionary().values();
+            max_batch(values)?
+        }
+        _ => min_max_batch!(values, max),
+    })
+}
diff --git a/datafusion/functions-aggregate-common/src/noop_accumulator.rs b/datafusion/functions-aggregate-common/src/noop_accumulator.rs
new file mode 100644
index 0000000000000..e34d58770a69d
--- /dev/null
+++ b/datafusion/functions-aggregate-common/src/noop_accumulator.rs
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr_common::accumulator::Accumulator;
+
+/// [`Accumulator`] that does no work and always returns a fixed value (default
+/// of `NULL` but can be customized).
+///
+/// Useful for aggregate functions that need to handle an input of [`DataType::Null`]
+/// that does no work.
+///
+/// [`DataType::Null`]: arrow::datatypes::DataType::Null
+#[derive(Debug)]
+pub struct NoopAccumulator {
+    evaluate_value: ScalarValue,
+}
+
+impl NoopAccumulator {
+    pub fn new(evaluate_value: ScalarValue) -> Self {
+        Self { evaluate_value }
+    }
+}
+
+impl Default for NoopAccumulator {
+    fn default() -> Self {
+        Self {
+            evaluate_value: ScalarValue::Null,
+        }
+    }
+}
+
+impl Accumulator for NoopAccumulator {
+    fn update_batch(&mut self, _values: &[ArrayRef]) -> Result<()> {
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.evaluate_value.clone())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        // We ensure we return a state field even if unused otherwise we run into
+        // issues with queries like `SELECT agg_fn(NULL) FROM table`
+        Ok(vec![ScalarValue::Null])
+    }
+
+    fn merge_batch(&mut self, _states: &[ArrayRef]) -> Result<()> {
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-aggregate-common/src/order.rs b/datafusion/functions-aggregate-common/src/order.rs
index bfa6e39138f9e..0908396d78341 100644
--- a/datafusion/functions-aggregate-common/src/order.rs
+++ b/datafusion/functions-aggregate-common/src/order.rs
@@ -22,9 +22,20 @@ pub enum AggregateOrderSensitivity {
     /// Ordering at the input is not important for the result of the aggregator.
     Insensitive,
     /// Indicates that the aggregate expression has a hard requirement on ordering.
-    /// The aggregator can not produce a correct result unless its ordering
+    /// The aggregator cannot produce a correct result unless its ordering
     /// requirement is satisfied.
     HardRequirement,
+    /// Indicates that the aggregator is more efficient when the input is ordered
+    /// but can still produce its result correctly regardless of the input ordering.
+    /// This is similar to, but stronger than, [`Self::Beneficial`].
+    ///
+    /// Similarly to [`Self::HardRequirement`], when possible DataFusion will insert
+    /// a `SortExec`, to reorder the input to match the SoftRequirement. However,
+    /// when such a `SortExec` cannot be inserted, (for example, due to conflicting
+    /// [`Self::HardRequirement`] with other ordered aggregates in the query),
+    /// the aggregate function will still execute, without the preferred order, unlike
+    /// with [`Self::HardRequirement`]
+    SoftRequirement,
     /// Indicates that ordering is beneficial for the aggregate expression in terms
     /// of evaluation efficiency. The aggregator can produce its result efficiently
     /// when its required ordering is satisfied; however, it can still produce the
@@ -38,7 +49,7 @@ impl AggregateOrderSensitivity {
     }
 
     pub fn is_beneficial(&self) -> bool {
-        self.eq(&AggregateOrderSensitivity::Beneficial)
+        matches!(self, Self::SoftRequirement | Self::Beneficial)
     }
 
     pub fn hard_requires(&self) -> bool {
diff --git a/datafusion/functions-aggregate-common/src/stats.rs b/datafusion/functions-aggregate-common/src/stats.rs
index bcd004db78319..593b105426be5 100644
--- a/datafusion/functions-aggregate-common/src/stats.rs
+++ b/datafusion/functions-aggregate-common/src/stats.rs
@@ -17,7 +17,7 @@
 
 /// TODO: Move this to functions-aggregate module
 /// Enum used for differentiating population and sample for statistical functions
-#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
 pub enum StatsType {
     /// Population
     Population,
diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs
index 378fc8c42bc66..a7450f0eb52e9 100644
--- a/datafusion/functions-aggregate-common/src/tdigest.rs
+++ b/datafusion/functions-aggregate-common/src/tdigest.rs
@@ -31,9 +31,8 @@
 
 use arrow::datatypes::DataType;
 use arrow::datatypes::Float64Type;
-use datafusion_common::cast::as_primitive_array;
-use datafusion_common::Result;
 use datafusion_common::ScalarValue;
+use datafusion_common::cast::as_primitive_array;
 use std::cmp::Ordering;
 use std::mem::{size_of, size_of_val};
 
@@ -45,57 +44,11 @@ macro_rules! cast_scalar_f64 {
     ($value:expr ) => {
         match &$value {
             ScalarValue::Float64(Some(v)) => *v,
-            v => panic!("invalid type {:?}", v),
-        }
-    };
-}
-
-// Cast a non-null [`ScalarValue::UInt64`] to an [`u64`], or
-// panic.
-macro_rules! cast_scalar_u64 {
-    ($value:expr ) => {
-        match &$value {
-            ScalarValue::UInt64(Some(v)) => *v,
-            v => panic!("invalid type {:?}", v),
-        }
-    };
-}
-
-/// This trait is implemented for each type a [`TDigest`] can operate on,
-/// allowing it to support both numerical rust types (obtained from
-/// `PrimitiveArray` instances), and [`ScalarValue`] instances.
-pub trait TryIntoF64 {
-    /// A fallible conversion of a possibly null `self` into a [`f64`].
-    ///
-    /// If `self` is null, this method must return `Ok(None)`.
-    ///
-    /// If `self` cannot be coerced to the desired type, this method must return
-    /// an `Err` variant.
-    fn try_as_f64(&self) -> Result<Option<f64>>;
-}
-
-/// Generate an infallible conversion from `type` to an [`f64`].
-macro_rules! impl_try_ordered_f64 {
-    ($type:ty) => {
-        impl TryIntoF64 for $type {
-            fn try_as_f64(&self) -> Result<Option<f64>> {
-                Ok(Some(*self as f64))
-            }
+            v => panic!("invalid type {}", v),
         }
     };
 }
 
-impl_try_ordered_f64!(f64);
-impl_try_ordered_f64!(f32);
-impl_try_ordered_f64!(i64);
-impl_try_ordered_f64!(i32);
-impl_try_ordered_f64!(i16);
-impl_try_ordered_f64!(i8);
-impl_try_ordered_f64!(u64);
-impl_try_ordered_f64!(u32);
-impl_try_ordered_f64!(u16);
-impl_try_ordered_f64!(u8);
-
 /// Centroid implementation to the cluster mentioned in the paper.
 #[derive(Debug, PartialEq, Clone)]
 pub struct Centroid {
@@ -103,20 +56,6 @@ pub struct Centroid {
     weight: f64,
 }
 
-impl PartialOrd for Centroid {
-    fn partial_cmp(&self, other: &Centroid) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Eq for Centroid {}
-
-impl Ord for Centroid {
-    fn cmp(&self, other: &Centroid) -> Ordering {
-        self.mean.total_cmp(&other.mean)
-    }
-}
-
 impl Centroid {
     pub fn new(mean: f64, weight: f64) -> Self {
         Centroid { mean, weight }
@@ -139,6 +78,10 @@ impl Centroid {
         self.mean = new_sum / new_weight;
         new_sum
     }
+
+    pub fn cmp_mean(&self, other: &Self) -> Ordering {
+        self.mean.total_cmp(&other.mean)
+    }
 }
 
 impl Default for Centroid {
@@ -156,7 +99,7 @@ pub struct TDigest {
     centroids: Vec<Centroid>,
     max_size: usize,
     sum: f64,
-    count: u64,
+    count: f64,
     max: f64,
     min: f64,
 }
@@ -166,26 +109,27 @@ impl TDigest {
         TDigest {
             centroids: Vec::new(),
             max_size,
-            sum: 0_f64,
-            count: 0,
+            sum: 0.0,
+            count: 0.0,
             max: f64::NAN,
             min: f64::NAN,
         }
     }
 
+    #[expect(clippy::needless_pass_by_value)]
     pub fn new_with_centroid(max_size: usize, centroid: Centroid) -> Self {
         TDigest {
             centroids: vec![centroid.clone()],
             max_size,
             sum: centroid.mean * centroid.weight,
-            count: 1,
+            count: centroid.weight,
             max: centroid.mean,
             min: centroid.mean,
         }
     }
 
     #[inline]
-    pub fn count(&self) -> u64 {
+    pub fn count(&self) -> f64 {
         self.count
     }
 
@@ -215,8 +159,8 @@ impl Default for TDigest {
         TDigest {
             centroids: Vec::new(),
             max_size: 100,
-            sum: 0_f64,
-            count: 0,
+            sum: 0.0,
+            count: 0.0,
             max: f64::NAN,
             min: f64::NAN,
         }
@@ -238,7 +182,11 @@ impl TDigest {
         if lo.is_nan() || hi.is_nan() {
             return v;
         }
-        v.clamp(lo, hi)
+
+        // Handle the case where floating point precision causes min > max.
+        let (min, max) = if lo > hi { (hi, lo) } else { (lo, hi) };
+
+        v.clamp(min, max)
     }
 
     // public for testing in other modules
@@ -257,12 +205,12 @@ impl TDigest {
         }
 
         let mut result = TDigest::new(self.max_size());
-        result.count = self.count() + sorted_values.len() as u64;
+        result.count = self.count() + sorted_values.len() as f64;
 
         let maybe_min = *sorted_values.first().unwrap();
         let maybe_max = *sorted_values.last().unwrap();
 
-        if self.count() > 0 {
+        if self.count() > 0.0 {
             result.min = self.min.min(maybe_min);
             result.max = self.max.max(maybe_max);
         } else {
@@ -274,7 +222,7 @@ impl TDigest {
 
         let mut k_limit: u64 = 1;
         let mut q_limit_times_count =
-            Self::k_to_q(k_limit, self.max_size) * result.count() as f64;
+            Self::k_to_q(k_limit, self.max_size) * result.count();
         k_limit += 1;
 
         let mut iter_centroids = self.centroids.iter().peekable();
@@ -322,7 +270,7 @@ impl TDigest {
 
                 compressed.push(curr.clone());
                 q_limit_times_count =
-                    Self::k_to_q(k_limit, self.max_size) * result.count() as f64;
+                    Self::k_to_q(k_limit, self.max_size) * result.count();
                 k_limit += 1;
                 curr = next;
             }
@@ -331,7 +279,7 @@ impl TDigest {
         result.sum += curr.add(sums_to_merge, weights_to_merge);
         compressed.push(curr);
         compressed.shrink_to_fit();
-        compressed.sort();
+        compressed.sort_by(|a, b| a.cmp_mean(b));
 
         result.centroids = compressed;
         result
@@ -349,7 +297,7 @@ impl TDigest {
         let mut j = middle;
 
         while i < middle && j < last {
-            match centroids[i].cmp(&centroids[j]) {
+            match centroids[i].cmp_mean(&centroids[j]) {
                 Ordering::Less => {
                     result.push(centroids[i].clone());
                     i += 1;
@@ -394,7 +342,7 @@ impl TDigest {
         let mut centroids: Vec<Centroid> = Vec::with_capacity(n_centroids);
         let mut starts: Vec<usize> = Vec::with_capacity(digests.len());
 
-        let mut count = 0;
+        let mut count = 0.0;
         let mut min = f64::INFINITY;
         let mut max = f64::NEG_INFINITY;
 
@@ -403,7 +351,7 @@ impl TDigest {
             starts.push(start);
 
             let curr_count = digest.count();
-            if curr_count > 0 {
+            if curr_count > 0.0 {
                 min = min.min(digest.min);
                 max = max.max(digest.max);
                 count += curr_count;
@@ -414,6 +362,11 @@ impl TDigest {
             }
         }
 
+        // If no centroids were added (all digests had zero count), return default
+        if centroids.is_empty() {
+            return TDigest::default();
+        }
+
         let mut digests_per_block: usize = 1;
         while digests_per_block < starts.len() {
             for i in (0..starts.len()).step_by(digests_per_block * 2) {
@@ -438,7 +391,7 @@ impl TDigest {
         let mut compressed: Vec<Centroid> = Vec::with_capacity(max_size);
 
         let mut k_limit = 1;
-        let mut q_limit_times_count = Self::k_to_q(k_limit, max_size) * count as f64;
+        let mut q_limit_times_count = Self::k_to_q(k_limit, max_size) * count;
 
         let mut iter_centroids = centroids.iter_mut();
         let mut curr = iter_centroids.next().unwrap();
@@ -457,7 +410,7 @@ impl TDigest {
                 sums_to_merge = 0_f64;
                 weights_to_merge = 0_f64;
                 compressed.push(curr.clone());
-                q_limit_times_count = Self::k_to_q(k_limit, max_size) * count as f64;
+                q_limit_times_count = Self::k_to_q(k_limit, max_size) * count;
                 k_limit += 1;
                 curr = centroid;
             }
@@ -466,7 +419,7 @@ impl TDigest {
         result.sum += curr.add(sums_to_merge, weights_to_merge);
         compressed.push(curr.clone());
         compressed.shrink_to_fit();
-        compressed.sort();
+        compressed.sort_by(|a, b| a.cmp_mean(b));
 
         result.count = count;
         result.min = min;
@@ -481,7 +434,7 @@ impl TDigest {
             return 0.0;
         }
 
-        let rank = q * self.count as f64;
+        let rank = q * self.count;
 
         let mut pos: usize;
         let mut t;
@@ -491,7 +444,7 @@ impl TDigest {
             }
 
             pos = 0;
-            t = self.count as f64;
+            t = self.count;
 
             for (k, centroid) in self.centroids.iter().enumerate().rev() {
                 t -= centroid.weight();
@@ -585,7 +538,6 @@ impl TDigest {
     ///                          │└ ─ ─ ─ ┘│
     ///                          │         │
     ///                              ...
-    ///
     /// ```
     ///
     /// The [`TDigest::from_scalar_state()`] method reverses this processes,
@@ -605,7 +557,7 @@ impl TDigest {
         vec![
             ScalarValue::UInt64(Some(self.max_size as u64)),
             ScalarValue::Float64(Some(self.sum)),
-            ScalarValue::UInt64(Some(self.count)),
+            ScalarValue::Float64(Some(self.count)),
             ScalarValue::Float64(Some(self.max)),
             ScalarValue::Float64(Some(self.min)),
             ScalarValue::List(arr),
@@ -653,7 +605,7 @@ impl TDigest {
         Self {
             max_size,
             sum: cast_scalar_f64!(state[1]),
-            count: cast_scalar_u64!(&state[2]),
+            count: cast_scalar_f64!(state[2]),
             max,
             min,
             centroids,
@@ -790,4 +742,22 @@ mod tests {
 
         assert_eq!(t.size(), 96);
     }
+
+    #[test]
+    fn test_identical_values_floating_point_precision() {
+        // Regression test for https://github.com/apache/datafusion/issues/14855
+        // When all values are the same, floating-point arithmetic during centroid
+        // merging can cause slight precision differences between min and max,
+        // which previously caused a panic in clamp().
+
+        let t = TDigest::new(100);
+        let values: Vec<_> = (0..215).map(|_| 15.699999988079073_f64).collect();
+
+        let t = t.merge_unsorted_f64(values);
+
+        // This should not panic
+        let result = t.estimate_quantile(0.99);
+        // The result should be approximately equal to the input value
+        assert!((result - 15.699999988079073).abs() < 1e-10);
+    }
 }
diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs
index 229d9a900105a..256d80a67b1df 100644
--- a/datafusion/functions-aggregate-common/src/utils.rs
+++ b/datafusion/functions-aggregate-common/src/utils.rs
@@ -15,22 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::sync::Arc;
-
-use arrow::array::{ArrayRef, AsArray};
-use arrow::datatypes::{ArrowNativeType, FieldRef};
-use arrow::{
-    array::ArrowNativeTypeOp,
-    compute::SortOptions,
-    datatypes::{
-        DataType, Decimal128Type, DecimalType, Field, TimeUnit, TimestampMicrosecondType,
-        TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
-        ToByteSlice,
-    },
+use arrow::array::{
+    Array, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, PrimitiveArray,
+};
+use arrow::compute::SortOptions;
+use arrow::datatypes::{
+    ArrowNativeType, DataType, DecimalType, Field, FieldRef, ToByteSlice,
+};
+use datafusion_common::cast::{as_list_array, as_primitive_array};
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::utils::SingleRowListArrayBuilder;
+use datafusion_common::utils::memory::estimate_memory_size;
+use datafusion_common::{
+    HashSet, Result, ScalarValue, exec_err, internal_datafusion_err,
 };
-use datafusion_common::{exec_err, DataFusionError, Result};
 use datafusion_expr_common::accumulator::Accumulator;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use std::sync::Arc;
 
 /// Convert scalar values from an accumulator into arrays.
 pub fn get_accum_scalar_values_as_arrays(
@@ -43,57 +44,13 @@ pub fn get_accum_scalar_values_as_arrays(
         .collect()
 }
 
-/// Adjust array type metadata if needed
-///
-/// Since `Decimal128Arrays` created from `Vec<NativeType>` have
-/// default precision and scale, this function adjusts the output to
-/// match `data_type`, if necessary
-#[deprecated(since = "44.0.0", note = "use PrimitiveArray::with_datatype")]
-pub fn adjust_output_array(data_type: &DataType, array: ArrayRef) -> Result<ArrayRef> {
-    let array = match data_type {
-        DataType::Decimal128(p, s) => Arc::new(
-            array
-                .as_primitive::<Decimal128Type>()
-                .clone()
-                .with_precision_and_scale(*p, *s)?,
-        ) as ArrayRef,
-        DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new(
-            array
-                .as_primitive::<TimestampNanosecondType>()
-                .clone()
-                .with_timezone_opt(tz.clone()),
-        ),
-        DataType::Timestamp(TimeUnit::Microsecond, tz) => Arc::new(
-            array
-                .as_primitive::<TimestampMicrosecondType>()
-                .clone()
-                .with_timezone_opt(tz.clone()),
-        ),
-        DataType::Timestamp(TimeUnit::Millisecond, tz) => Arc::new(
-            array
-                .as_primitive::<TimestampMillisecondType>()
-                .clone()
-                .with_timezone_opt(tz.clone()),
-        ),
-        DataType::Timestamp(TimeUnit::Second, tz) => Arc::new(
-            array
-                .as_primitive::<TimestampSecondType>()
-                .clone()
-                .with_timezone_opt(tz.clone()),
-        ),
-        // no adjustment needed for other arrays
-        _ => array,
-    };
-    Ok(array)
-}
-
-/// Construct corresponding fields for lexicographical ordering requirement expression
+/// Construct corresponding fields for the expressions in an ORDER BY clause.
 pub fn ordering_fields(
-    ordering_req: &LexOrdering,
+    order_bys: &[PhysicalSortExpr],
     // Data type of each expression in the ordering requirement
     data_types: &[DataType],
 ) -> Vec<FieldRef> {
-    ordering_req
+    order_bys
         .iter()
         .zip(data_types.iter())
         .map(|(sort_expr, dtype)| {
@@ -146,6 +103,8 @@ pub struct DecimalAverager<T: DecimalType> {
     target_mul: T::Native,
     /// the output precision
     target_precision: u8,
+    /// the output scale
+    target_scale: i8,
 }
 
 impl<T: DecimalType> DecimalAverager<T> {
@@ -163,21 +122,24 @@ impl<T: DecimalType> DecimalAverager<T> {
     ) -> Result<Self> {
         let sum_mul = T::Native::from_usize(10_usize)
             .map(|b| b.pow_wrapping(sum_scale as u32))
-            .ok_or(DataFusionError::Internal(
-                "Failed to compute sum_mul in DecimalAverager".to_string(),
-            ))?;
+            .ok_or_else(|| {
+                internal_datafusion_err!("Failed to compute sum_mul in DecimalAverager")
+            })?;
 
         let target_mul = T::Native::from_usize(10_usize)
             .map(|b| b.pow_wrapping(target_scale as u32))
-            .ok_or(DataFusionError::Internal(
-                "Failed to compute target_mul in DecimalAverager".to_string(),
-            ))?;
+            .ok_or_else(|| {
+                internal_datafusion_err!(
+                    "Failed to compute target_mul in DecimalAverager"
+                )
+            })?;
 
         if target_mul >= sum_mul {
             Ok(Self {
                 sum_mul,
                 target_mul,
                 target_precision,
+                target_scale,
             })
         } else {
             // can't convert the lit decimal to the returned data type
@@ -196,8 +158,11 @@ impl<T: DecimalType> DecimalAverager<T> {
         if let Ok(value) = sum.mul_checked(self.target_mul.div_wrapping(self.sum_mul)) {
             let new_value = value.div_wrapping(count);
 
-            let validate =
-                T::validate_decimal_precision(new_value, self.target_precision);
+            let validate = T::validate_decimal_precision(
+                new_value,
+                self.target_precision,
+                self.target_scale,
+            );
 
             if validate.is_ok() {
                 Ok(new_value)
@@ -210,3 +175,92 @@ impl<T: DecimalType> DecimalAverager<T> {
         }
     }
 }
+
+/// Generic way to collect distinct values for accumulators.
+///
+/// The intermediate state is represented as a List of scalar values updated by
+/// `merge_batch` and a `Vec` of `ArrayRef` that are converted to scalar values
+/// in the final evaluation step so that we avoid expensive conversions and
+/// allocations during `update_batch`.
+pub struct GenericDistinctBuffer<T: ArrowPrimitiveType> {
+    pub values: HashSet<Hashable<T::Native>, RandomState>,
+    data_type: DataType,
+}
+
+impl<T: ArrowPrimitiveType> std::fmt::Debug for GenericDistinctBuffer<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "GenericDistinctBuffer({}, values={})",
+            self.data_type,
+            self.values.len()
+        )
+    }
+}
+
+impl<T: ArrowPrimitiveType> GenericDistinctBuffer<T> {
+    pub fn new(data_type: DataType) -> Self {
+        Self {
+            values: HashSet::default(),
+            data_type,
+        }
+    }
+
+    /// Mirrors [`Accumulator::state`].
+    pub fn state(&self) -> Result<Vec<ScalarValue>> {
+        let arr = Arc::new(
+            PrimitiveArray::<T>::from_iter_values(self.values.iter().map(|v| v.0))
+                // Ideally we'd just use T::DATA_TYPE but this misses things like
+                // decimal scale/precision and timestamp timezones, which need to
+                // match up with Accumulator::state_fields
+                .with_data_type(self.data_type.clone()),
+        );
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
+    }
+
+    /// Mirrors [`Accumulator::update_batch`].
+    pub fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        if values.is_empty() {
+            return Ok(());
+        }
+
+        debug_assert_eq!(
+            values.len(),
+            1,
+            "DistinctValuesBuffer::update_batch expects only a single input array"
+        );
+
+        let arr = as_primitive_array::<T>(&values[0])?;
+        if arr.null_count() > 0 {
+            self.values.extend(arr.iter().flatten().map(Hashable));
+        } else {
+            self.values
+                .extend(arr.values().iter().cloned().map(Hashable));
+        }
+
+        Ok(())
+    }
+
+    /// Mirrors [`Accumulator::merge_batch`].
+    pub fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        if states.is_empty() {
+            return Ok(());
+        }
+
+        let array = as_list_array(&states[0])?;
+        for list in array.iter().flatten() {
+            self.update_batch(&[list])?;
+        }
+
+        Ok(())
+    }
+
+    /// Mirrors [`Accumulator::size`].
+    pub fn size(&self) -> usize {
+        let num_elements = self.values.len();
+        let fixed_size = size_of_val(self) + size_of_val(&self.values);
+        estimate_memory_size::<T::Native>(num_elements, fixed_size).unwrap()
+    }
+}
diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml
index ec6e6b633bb81..07ee89af84a0e 100644
--- a/datafusion/functions-aggregate/Cargo.toml
+++ b/datafusion/functions-aggregate/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -38,7 +41,6 @@ workspace = true
 name = "datafusion_functions_aggregate"
 
 [dependencies]
-ahash = { workspace = true }
 arrow = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-doc = { workspace = true }
@@ -48,9 +50,10 @@ datafusion-functions-aggregate-common = { workspace = true }
 datafusion-macros = { workspace = true }
 datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
+foldhash = "0.2"
 half = { workspace = true }
 log = { workspace = true }
-paste = "1.0.14"
+num-traits = { workspace = true }
 
 [dev-dependencies]
 arrow = { workspace = true, features = ["test_utils"] }
@@ -68,3 +71,11 @@ harness = false
 [[bench]]
 name = "array_agg"
 harness = false
+
+[[bench]]
+harness = false
+name = "min_max_bytes"
+
+[[bench]]
+name = "approx_distinct"
+harness = false
diff --git a/datafusion/functions-aggregate/README.md b/datafusion/functions-aggregate/README.md
index 29b313d2a9037..aa50eaeedae03 100644
--- a/datafusion/functions-aggregate/README.md
+++ b/datafusion/functions-aggregate/README.md
@@ -17,11 +17,16 @@
   under the License.
 -->
 
-# DataFusion Aggregate Function Library
+# Apache DataFusion Aggregate Function Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate contains packages of function that can be used to customize the
-functionality of DataFusion.
+This crate contains implementations of aggregate functions.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-aggregate/benches/approx_distinct.rs b/datafusion/functions-aggregate/benches/approx_distinct.rs
new file mode 100644
index 0000000000000..538103d991f1f
--- /dev/null
+++ b/datafusion/functions-aggregate/benches/approx_distinct.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_expr::function::AccumulatorArgs;
+use datafusion_expr::{Accumulator, AggregateUDFImpl};
+use datafusion_functions_aggregate::approx_distinct::ApproxDistinct;
+use datafusion_physical_expr::expressions::col;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+
+const BATCH_SIZE: usize = 8192;
+const STRING_LENGTH: usize = 20;
+
+fn prepare_accumulator(data_type: DataType) -> Box<dyn Accumulator> {
+    let schema = Arc::new(Schema::new(vec![Field::new("f", data_type, true)]));
+    let expr = col("f", &schema).unwrap();
+    let accumulator_args = AccumulatorArgs {
+        return_field: Field::new("f", DataType::UInt64, true).into(),
+        schema: &schema,
+        expr_fields: &[expr.return_field(&schema).unwrap()],
+        ignore_nulls: false,
+        order_bys: &[],
+        is_reversed: false,
+        name: "approx_distinct(f)",
+        is_distinct: false,
+        exprs: &[expr],
+    };
+    ApproxDistinct::new().accumulator(accumulator_args).unwrap()
+}
+
+/// Creates an Int64Array where values are drawn from `0..n_distinct`.
+fn create_i64_array(n_distinct: usize) -> Int64Array {
+    let mut rng = StdRng::seed_from_u64(42);
+    (0..BATCH_SIZE)
+        .map(|_| Some(rng.random_range(0..n_distinct as i64)))
+        .collect()
+}
+
+/// Creates a pool of `n_distinct` random strings.
+fn create_string_pool(n_distinct: usize) -> Vec<String> {
+    let mut rng = StdRng::seed_from_u64(42);
+    (0..n_distinct)
+        .map(|_| {
+            (0..STRING_LENGTH)
+                .map(|_| rng.random_range(b'a'..=b'z') as char)
+                .collect()
+        })
+        .collect()
+}
+
+/// Creates a StringArray where values are drawn from the given pool.
+fn create_string_array(pool: &[String]) -> StringArray {
+    let mut rng = StdRng::seed_from_u64(99);
+    (0..BATCH_SIZE)
+        .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str()))
+        .collect()
+}
+
+/// Creates a StringViewArray where values are drawn from the given pool.
+fn create_string_view_array(pool: &[String]) -> StringViewArray {
+    let mut rng = StdRng::seed_from_u64(99);
+    (0..BATCH_SIZE)
+        .map(|_| Some(pool[rng.random_range(0..pool.len())].as_str()))
+        .collect()
+}
+
+fn approx_distinct_benchmark(c: &mut Criterion) {
+    for pct in [80, 99] {
+        let n_distinct = BATCH_SIZE * pct / 100;
+
+        // --- Int64 benchmarks ---
+        let values = Arc::new(create_i64_array(n_distinct)) as ArrayRef;
+        c.bench_function(&format!("approx_distinct i64 {pct}% distinct"), |b| {
+            b.iter(|| {
+                let mut accumulator = prepare_accumulator(DataType::Int64);
+                accumulator
+                    .update_batch(std::slice::from_ref(&values))
+                    .unwrap()
+            })
+        });
+
+        let string_pool = create_string_pool(n_distinct);
+
+        // --- Utf8 benchmarks ---
+        let values = Arc::new(create_string_array(&string_pool)) as ArrayRef;
+        c.bench_function(&format!("approx_distinct utf8 {pct}% distinct"), |b| {
+            b.iter(|| {
+                let mut accumulator = prepare_accumulator(DataType::Utf8);
+                accumulator
+                    .update_batch(std::slice::from_ref(&values))
+                    .unwrap()
+            })
+        });
+
+        // --- Utf8View benchmarks ---
+        let values = Arc::new(create_string_view_array(&string_pool)) as ArrayRef;
+        c.bench_function(&format!("approx_distinct utf8view {pct}% distinct"), |b| {
+            b.iter(|| {
+                let mut accumulator = prepare_accumulator(DataType::Utf8View);
+                accumulator
+                    .update_batch(std::slice::from_ref(&values))
+                    .unwrap()
+            })
+        });
+    }
+}
+
+criterion_group!(benches, approx_distinct_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs
index 6dadb12aba864..793c2aac96293 100644
--- a/datafusion/functions-aggregate/benches/array_agg.rs
+++ b/datafusion/functions-aggregate/benches/array_agg.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{
@@ -22,30 +23,31 @@ use arrow::array::{
     PrimitiveArray,
 };
 use arrow::datatypes::{Field, Int64Type};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_expr::Accumulator;
 use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator;
 
 use arrow::buffer::OffsetBuffer;
-use rand::distr::{Distribution, StandardUniform};
-use rand::prelude::StdRng;
 use rand::Rng;
 use rand::SeedableRng;
+use rand::distr::{Distribution, StandardUniform};
+use rand::prelude::StdRng;
 
 /// Returns fixed seedable RNG
 pub fn seedable_rng() -> StdRng {
     StdRng::seed_from_u64(42)
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) {
     let list_item_data_type = values.as_list::<i32>().values().data_type().clone();
     c.bench_function(name, |b| {
         b.iter(|| {
-            #[allow(clippy::unit_arg)]
+            #[expect(clippy::unit_arg)]
             black_box(
                 ArrayAggAccumulator::try_new(&list_item_data_type, false)
                     .unwrap()
-                    .merge_batch(&[values.clone()])
+                    .merge_batch(std::slice::from_ref(&values))
                     .unwrap(),
             )
         })
diff --git a/datafusion/functions-aggregate/benches/count.rs b/datafusion/functions-aggregate/benches/count.rs
index d5abf6b8ac281..48f71858c1204 100644
--- a/datafusion/functions-aggregate/benches/count.rs
+++ b/datafusion/functions-aggregate/benches/count.rs
@@ -15,27 +15,36 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::hint::black_box;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, BooleanArray};
 use arrow::datatypes::{DataType, Field, Int32Type, Schema};
-use arrow::util::bench_util::{create_boolean_array, create_primitive_array};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use datafusion_expr::{function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator};
+use arrow::util::bench_util::{
+    create_boolean_array, create_dict_from_values, create_primitive_array,
+    create_string_array_with_len,
+};
+
+use datafusion_expr::function::AccumulatorArgs;
+use datafusion_expr::{Accumulator, AggregateUDFImpl, GroupsAccumulator};
 use datafusion_functions_aggregate::count::Count;
 use datafusion_physical_expr::expressions::col;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use std::sync::Arc;
 
-fn prepare_accumulator() -> Box<dyn GroupsAccumulator> {
+use criterion::{Criterion, criterion_group, criterion_main};
+
+fn prepare_group_accumulator() -> Box<dyn GroupsAccumulator> {
     let schema = Arc::new(Schema::new(vec![Field::new("f", DataType::Int32, true)]));
+    let expr = col("f", &schema).unwrap();
     let accumulator_args = AccumulatorArgs {
         return_field: Field::new("f", DataType::Int64, true).into(),
         schema: &schema,
+        expr_fields: &[expr.return_field(&schema).unwrap()],
         ignore_nulls: false,
-        ordering_req: &LexOrdering::default(),
+        order_bys: &[],
         is_reversed: false,
         name: "COUNT(f)",
         is_distinct: false,
-        exprs: &[col("f", &schema).unwrap()],
+        exprs: &[expr],
     };
     let count_fn = Count::new();
 
@@ -44,18 +53,42 @@ fn prepare_accumulator() -> Box<dyn GroupsAccumulator> {
         .unwrap()
 }
 
+fn prepare_accumulator() -> Box<dyn Accumulator> {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "f",
+        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+        true,
+    )]));
+    let expr = col("f", &schema).unwrap();
+    let accumulator_args = AccumulatorArgs {
+        return_field: Arc::new(Field::new_list_field(DataType::Int64, true)),
+        schema: &schema,
+        expr_fields: &[expr.return_field(&schema).unwrap()],
+        ignore_nulls: false,
+        order_bys: &[],
+        is_reversed: false,
+        name: "COUNT(f)",
+        is_distinct: true,
+        exprs: &[expr],
+    };
+    let count_fn = Count::new();
+
+    count_fn.accumulator(accumulator_args).unwrap()
+}
+
+#[expect(clippy::needless_pass_by_value)]
 fn convert_to_state_bench(
     c: &mut Criterion,
     name: &str,
     values: ArrayRef,
     opt_filter: Option<&BooleanArray>,
 ) {
-    let accumulator = prepare_accumulator();
+    let accumulator = prepare_group_accumulator();
     c.bench_function(name, |b| {
         b.iter(|| {
             black_box(
                 accumulator
-                    .convert_to_state(&[values.clone()], opt_filter)
+                    .convert_to_state(std::slice::from_ref(&values), opt_filter)
                     .unwrap(),
             )
         })
@@ -89,6 +122,22 @@ fn count_benchmark(c: &mut Criterion) {
         values,
         Some(&filter),
     );
+
+    let arr = create_string_array_with_len::<i32>(20, 0.0, 50);
+    let values =
+        Arc::new(create_dict_from_values::<Int32Type>(200_000, 0.8, &arr)) as ArrayRef;
+
+    let mut accumulator = prepare_accumulator();
+    c.bench_function("count low cardinality dict 20% nulls, no filter", |b| {
+        b.iter(|| {
+            #[expect(clippy::unit_arg)]
+            black_box(
+                accumulator
+                    .update_batch(std::slice::from_ref(&values))
+                    .unwrap(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, count_benchmark);
diff --git a/datafusion/functions-aggregate/benches/min_max_bytes.rs b/datafusion/functions-aggregate/benches/min_max_bytes.rs
new file mode 100644
index 0000000000000..9f4eb0f0c6246
--- /dev/null
+++ b/datafusion/functions-aggregate/benches/min_max_bytes.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// A minimal benchmark of the min_max accumulator for byte-like data types.
+//
+// The benchmark simulates the insertion of NUM_BATCHES batches into an aggregation,
+// where every row belongs to a distinct group. The data generated beforehand to
+// ensure that (mostly) the cost of the update_batch method is measured.
+//
+// The throughput value describes the rows per second that are ingested.
+
+use std::sync::Arc;
+
+use arrow::{
+    array::{ArrayRef, StringArray},
+    datatypes::{DataType, Field, Schema},
+};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use datafusion_expr::{GroupsAccumulator, function::AccumulatorArgs};
+use datafusion_functions_aggregate::min_max;
+use datafusion_physical_expr::expressions::col;
+
+const BATCH_SIZE: usize = 8192;
+
+fn create_max_bytes_accumulator() -> Box<dyn GroupsAccumulator> {
+    let input_schema =
+        Arc::new(Schema::new(vec![Field::new("value", DataType::Utf8, true)]));
+
+    let max = min_max::max_udaf();
+    max.create_groups_accumulator(AccumulatorArgs {
+        return_field: Arc::new(Field::new("value", DataType::Utf8, true)),
+        schema: &input_schema,
+        expr_fields: &[Field::new("value", DataType::Utf8, true).into()],
+        ignore_nulls: true,
+        order_bys: &[],
+        is_reversed: false,
+        name: "max_utf8",
+        is_distinct: true,
+        exprs: &[col("value", &input_schema).unwrap()],
+    })
+    .unwrap()
+}
+
+fn bench_min_max_bytes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("min_max_bytes");
+
+    for num_batches in [10, 20, 50, 100, 150, 200, 300, 400, 500] {
+        let id = BenchmarkId::from_parameter(num_batches);
+        group.throughput(Throughput::Elements((num_batches * BATCH_SIZE) as u64));
+        group.bench_with_input(id, &num_batches, |bencher, num_batches| {
+            bencher.iter_with_large_drop(|| {
+                let mut accumulator = create_max_bytes_accumulator();
+                let mut group_indices = Vec::with_capacity(BATCH_SIZE);
+                let strings: ArrayRef = Arc::new(StringArray::from_iter_values(
+                    (0..BATCH_SIZE).map(|i| i.to_string()),
+                ));
+
+                for batch_idx in 0..*num_batches {
+                    group_indices.clear();
+                    group_indices
+                        .extend((batch_idx * BATCH_SIZE)..(batch_idx + 1) * BATCH_SIZE);
+                    let total_num_groups = (batch_idx + 1) * BATCH_SIZE;
+
+                    accumulator
+                        .update_batch(
+                            &[Arc::clone(&strings)],
+                            &group_indices,
+                            None,
+                            total_num_groups,
+                        )
+                        .unwrap()
+                }
+            });
+        });
+    }
+}
+
+criterion_group!(benches, bench_min_max_bytes);
+criterion_main!(benches);
diff --git a/datafusion/functions-aggregate/benches/sum.rs b/datafusion/functions-aggregate/benches/sum.rs
index 25df78b15f11c..52998179024c1 100644
--- a/datafusion/functions-aggregate/benches/sum.rs
+++ b/datafusion/functions-aggregate/benches/sum.rs
@@ -15,24 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::hint::black_box;
+use std::sync::Arc;
+
 use arrow::array::{ArrayRef, BooleanArray};
 use arrow::datatypes::{DataType, Field, Int64Type, Schema};
 use arrow::util::bench_util::{create_boolean_array, create_primitive_array};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use datafusion_expr::{function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator};
+
+use datafusion_expr::{AggregateUDFImpl, GroupsAccumulator, function::AccumulatorArgs};
 use datafusion_functions_aggregate::sum::Sum;
 use datafusion_physical_expr::expressions::col;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use std::sync::Arc;
+
+use criterion::{Criterion, criterion_group, criterion_main};
 
 fn prepare_accumulator(data_type: &DataType) -> Box<dyn GroupsAccumulator> {
     let field = Field::new("f", data_type.clone(), true).into();
     let schema = Arc::new(Schema::new(vec![Arc::clone(&field)]));
     let accumulator_args = AccumulatorArgs {
-        return_field: field,
+        return_field: Arc::clone(&field),
         schema: &schema,
+        expr_fields: &[field],
         ignore_nulls: false,
-        ordering_req: &LexOrdering::default(),
+        order_bys: &[],
         is_reversed: false,
         name: "SUM(f)",
         is_distinct: false,
@@ -43,6 +47,7 @@ fn prepare_accumulator(data_type: &DataType) -> Box<dyn GroupsAccumulator> {
     sum_fn.create_groups_accumulator(accumulator_args).unwrap()
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn convert_to_state_bench(
     c: &mut Criterion,
     name: &str,
@@ -54,7 +59,7 @@ fn convert_to_state_bench(
         b.iter(|| {
             black_box(
                 accumulator
-                    .convert_to_state(&[values.clone()], opt_filter)
+                    .convert_to_state(std::slice::from_ref(&values), opt_filter)
                     .unwrap(),
             )
         })
diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs
index 0d5dcd5c2085a..9cebd3e8518a0 100644
--- a/datafusion/functions-aggregate/src/approx_distinct.rs
+++ b/datafusion/functions-aggregate/src/approx_distinct.rs
@@ -23,19 +23,24 @@ use arrow::array::{
     GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
 };
 use arrow::datatypes::{
-    ArrowPrimitiveType, FieldRef, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
-    UInt32Type, UInt64Type, UInt8Type,
+    ArrowPrimitiveType, Date32Type, Date64Type, FieldRef, Int8Type, Int16Type, Int32Type,
+    Int64Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
+    TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type,
+    UInt64Type,
 };
 use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
 use datafusion_common::ScalarValue;
 use datafusion_common::{
-    downcast_value, internal_err, not_impl_err, DataFusionError, Result,
+    DataFusionError, Result, downcast_value, internal_datafusion_err, internal_err,
+    not_impl_err,
 };
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
     Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
 };
+use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
 use datafusion_macros::user_doc;
 use std::any::Any;
 use std::fmt::{Debug, Formatter};
@@ -50,26 +55,24 @@ make_udaf_expr_and_func!(
     approx_distinct_udaf
 );
 
-impl<T: Hash> From<&HyperLogLog<T>> for ScalarValue {
+impl<T: Hash + ?Sized> From<&HyperLogLog<T>> for ScalarValue {
     fn from(v: &HyperLogLog<T>) -> ScalarValue {
         let values = v.as_ref().to_vec();
         ScalarValue::Binary(Some(values))
     }
 }
 
-impl<T: Hash> TryFrom<&[u8]> for HyperLogLog<T> {
+impl<T: Hash + ?Sized> TryFrom<&[u8]> for HyperLogLog<T> {
     type Error = DataFusionError;
     fn try_from(v: &[u8]) -> Result<HyperLogLog<T>> {
         let arr: [u8; 16384] = v.try_into().map_err(|_| {
-            DataFusionError::Internal(
-                "Impossibly got invalid binary array from states".into(),
-            )
+            internal_datafusion_err!("Impossibly got invalid binary array from states")
         })?;
         Ok(HyperLogLog::<T>::new_with_registers(arr))
     }
 }
 
-impl<T: Hash> TryFrom<&ScalarValue> for HyperLogLog<T> {
+impl<T: Hash + ?Sized> TryFrom<&ScalarValue> for HyperLogLog<T> {
     type Error = DataFusionError;
     fn try_from(v: &ScalarValue) -> Result<HyperLogLog<T>> {
         if let ScalarValue::Binary(Some(slice)) = v {
@@ -96,7 +99,6 @@ where
     T: ArrowPrimitiveType,
     T::Native: Hash,
 {
-    /// new approx_distinct accumulator
     pub fn new() -> Self {
         Self {
             hll: HyperLogLog::new(),
@@ -109,7 +111,7 @@ struct StringHLLAccumulator<T>
 where
     T: OffsetSizeTrait,
 {
-    hll: HyperLogLog<String>,
+    hll: HyperLogLog<str>,
     phantom_data: PhantomData<T>,
 }
 
@@ -117,7 +119,6 @@ impl<T> StringHLLAccumulator<T>
 where
     T: OffsetSizeTrait,
 {
-    /// new approx_distinct accumulator
     pub fn new() -> Self {
         Self {
             hll: HyperLogLog::new(),
@@ -127,22 +128,14 @@ where
 }
 
 #[derive(Debug)]
-struct StringViewHLLAccumulator<T>
-where
-    T: OffsetSizeTrait,
-{
-    hll: HyperLogLog<String>,
-    phantom_data: PhantomData<T>,
+struct StringViewHLLAccumulator {
+    hll: HyperLogLog<str>,
 }
 
-impl<T> StringViewHLLAccumulator<T>
-where
-    T: OffsetSizeTrait,
-{
+impl StringViewHLLAccumulator {
     pub fn new() -> Self {
         Self {
             hll: HyperLogLog::new(),
-            phantom_data: PhantomData,
         }
     }
 }
@@ -152,7 +145,7 @@ struct BinaryHLLAccumulator<T>
 where
     T: OffsetSizeTrait,
 {
-    hll: HyperLogLog<Vec<u8>>,
+    hll: HyperLogLog<[u8]>,
     phantom_data: PhantomData<T>,
 }
 
@@ -160,7 +153,6 @@ impl<T> BinaryHLLAccumulator<T>
 where
     T: OffsetSizeTrait,
 {
-    /// new approx_distinct accumulator
     pub fn new() -> Self {
         Self {
             hll: HyperLogLog::new(),
@@ -176,8 +168,8 @@ macro_rules! default_accumulator_impl {
             let binary_array = downcast_value!(states[0], BinaryArray);
             for v in binary_array.iter() {
                 let v = v.ok_or_else(|| {
-                    DataFusionError::Internal(
-                        "Impossibly got empty binary array from states".into(),
+                    internal_datafusion_err!(
+                        "Impossibly got empty binary array from states"
                     )
                 })?;
                 let other = v.try_into()?;
@@ -210,23 +202,18 @@ where
         let array: &GenericBinaryArray<T> =
             downcast_value!(values[0], GenericBinaryArray, T);
         // flatten because we would skip nulls
-        self.hll
-            .extend(array.into_iter().flatten().map(|v| v.to_vec()));
+        self.hll.extend(array.into_iter().flatten());
         Ok(())
     }
 
     default_accumulator_impl!();
 }
 
-impl<T> Accumulator for StringViewHLLAccumulator<T>
-where
-    T: OffsetSizeTrait,
-{
+impl Accumulator for StringViewHLLAccumulator {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
         // flatten because we would skip nulls
-        self.hll
-            .extend(array.iter().flatten().map(|s| s.to_string()));
+        self.hll.extend(array.iter().flatten());
         Ok(())
     }
 
@@ -241,8 +228,7 @@ where
         let array: &GenericStringArray<T> =
             downcast_value!(values[0], GenericStringArray, T);
         // flatten because we would skip nulls
-        self.hll
-            .extend(array.into_iter().flatten().map(|i| i.to_string()));
+        self.hll.extend(array.into_iter().flatten());
         Ok(())
     }
 
@@ -293,6 +279,7 @@ impl Default for ApproxDistinct {
 ```"#,
     standard_argument(name = "expression",)
 )]
+#[derive(PartialEq, Eq, Hash)]
 pub struct ApproxDistinct {
     signature: Signature,
 }
@@ -323,16 +310,29 @@ impl AggregateUDFImpl for ApproxDistinct {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![Field::new(
-            format_state_name(args.name, "hll_registers"),
-            DataType::Binary,
-            false,
-        )
-        .into()])
+        if args.input_fields[0].data_type().is_null() {
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, self.name()),
+                    DataType::Null,
+                    true,
+                )
+                .into(),
+            ])
+        } else {
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, "hll_registers"),
+                    DataType::Binary,
+                    false,
+                )
+                .into(),
+            ])
+        }
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
+        let data_type = acc_args.expr_fields[0].data_type();
 
         let accumulator: Box<dyn Accumulator> = match data_type {
             // TODO u8, i8, u16, i16 shall really be done using bitmap, not HLL
@@ -346,15 +346,44 @@ impl AggregateUDFImpl for ApproxDistinct {
             DataType::Int16 => Box::new(NumericHLLAccumulator::<Int16Type>::new()),
             DataType::Int32 => Box::new(NumericHLLAccumulator::<Int32Type>::new()),
             DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
+            DataType::Date32 => Box::new(NumericHLLAccumulator::<Date32Type>::new()),
+            DataType::Date64 => Box::new(NumericHLLAccumulator::<Date64Type>::new()),
+            DataType::Time32(TimeUnit::Second) => {
+                Box::new(NumericHLLAccumulator::<Time32SecondType>::new())
+            }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                Box::new(NumericHLLAccumulator::<Time32MillisecondType>::new())
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                Box::new(NumericHLLAccumulator::<Time64MicrosecondType>::new())
+            }
+            DataType::Time64(TimeUnit::Nanosecond) => {
+                Box::new(NumericHLLAccumulator::<Time64NanosecondType>::new())
+            }
+            DataType::Timestamp(TimeUnit::Second, _) => {
+                Box::new(NumericHLLAccumulator::<TimestampSecondType>::new())
+            }
+            DataType::Timestamp(TimeUnit::Millisecond, _) => {
+                Box::new(NumericHLLAccumulator::<TimestampMillisecondType>::new())
+            }
+            DataType::Timestamp(TimeUnit::Microsecond, _) => {
+                Box::new(NumericHLLAccumulator::<TimestampMicrosecondType>::new())
+            }
+            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
+                Box::new(NumericHLLAccumulator::<TimestampNanosecondType>::new())
+            }
             DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
             DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
-            DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
+            DataType::Utf8View => Box::new(StringViewHLLAccumulator::new()),
             DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
             DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
+            DataType::Null => {
+                Box::new(NoopAccumulator::new(ScalarValue::UInt64(Some(0))))
+            }
             other => {
                 return not_impl_err!(
-                "Support for 'approx_distinct' for data type {other} is not implemented"
-            )
+                    "Support for 'approx_distinct' for data type {other} is not implemented"
+                );
             }
         };
         Ok(accumulator)
diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs
index 0f2e3039ca9f1..2205b009ecb27 100644
--- a/datafusion/functions-aggregate/src/approx_median.rs
+++ b/datafusion/functions-aggregate/src/approx_median.rs
@@ -19,16 +19,18 @@
 
 use arrow::datatypes::DataType::{Float64, UInt64};
 use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::NativeType;
+use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
 use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
-use datafusion_common::{not_impl_err, plan_err, Result};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::type_coercion::aggregates::NUMERICS;
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Coercion, Documentation, Signature, TypeSignature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -57,19 +59,11 @@ make_udaf_expr_and_func!(
 ```"#,
     standard_argument(name = "expression",)
 )]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ApproxMedian {
     signature: Signature,
 }
 
-impl Debug for ApproxMedian {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("ApproxMedian")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for ApproxMedian {
     fn default() -> Self {
         Self::new()
@@ -80,33 +74,55 @@ impl ApproxMedian {
     /// Create a new APPROX_MEDIAN aggregate function
     pub fn new() -> Self {
         Self {
-            signature: Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable),
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Integer,
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                        TypeSignatureClass::Float,
+                        vec![TypeSignatureClass::Decimal],
+                        NativeType::Float64,
+                    )]),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
 
 impl AggregateUDFImpl for ApproxMedian {
-    /// Return a reference to Any that can be used for downcasting
     fn as_any(&self) -> &dyn Any {
         self
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![
-            Field::new(format_state_name(args.name, "max_size"), UInt64, false),
-            Field::new(format_state_name(args.name, "sum"), Float64, false),
-            Field::new(format_state_name(args.name, "count"), UInt64, false),
-            Field::new(format_state_name(args.name, "max"), Float64, false),
-            Field::new(format_state_name(args.name, "min"), Float64, false),
-            Field::new_list(
-                format_state_name(args.name, "centroids"),
-                Field::new_list_field(Float64, true),
-                false,
-            ),
-        ]
-        .into_iter()
-        .map(Arc::new)
-        .collect())
+        if args.input_fields[0].data_type().is_null() {
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, self.name()),
+                    DataType::Null,
+                    true,
+                )
+                .into(),
+            ])
+        } else {
+            Ok(vec![
+                Field::new(format_state_name(args.name, "max_size"), UInt64, false),
+                Field::new(format_state_name(args.name, "sum"), Float64, false),
+                Field::new(format_state_name(args.name, "count"), Float64, false),
+                Field::new(format_state_name(args.name, "max"), Float64, false),
+                Field::new(format_state_name(args.name, "min"), Float64, false),
+                Field::new_list(
+                    format_state_name(args.name, "centroids"),
+                    Field::new_list_field(Float64, true),
+                    false,
+                ),
+            ]
+            .into_iter()
+            .map(Arc::new)
+            .collect())
+        }
     }
 
     fn name(&self) -> &str {
@@ -118,9 +134,6 @@ impl AggregateUDFImpl for ApproxMedian {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("ApproxMedian requires numeric input types");
-        }
         Ok(arg_types[0].clone())
     }
 
@@ -131,10 +144,14 @@ impl AggregateUDFImpl for ApproxMedian {
             );
         }
 
-        Ok(Box::new(ApproxPercentileAccumulator::new(
-            0.5_f64,
-            acc_args.exprs[0].data_type(acc_args.schema)?,
-        )))
+        if acc_args.expr_fields[0].data_type().is_null() {
+            Ok(Box::new(NoopAccumulator::default()))
+        } else {
+            Ok(Box::new(ApproxPercentileAccumulator::new(
+                0.5_f64,
+                acc_args.expr_fields[0].data_type().clone(),
+            )))
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs
index 024c0a823fa9e..392a044d01394 100644
--- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs
+++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs
@@ -16,38 +16,38 @@
 // under the License.
 
 use std::any::Any;
-use std::fmt::{Debug, Formatter};
+use std::fmt::Debug;
 use std::mem::size_of_val;
 use std::sync::Arc;
 
-use arrow::array::{Array, RecordBatch};
+use arrow::array::{Array, Float16Array};
 use arrow::compute::{filter, is_not_null};
 use arrow::datatypes::FieldRef;
 use arrow::{
     array::{
-        ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-        Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+        ArrayRef, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
+        Int64Array, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
     },
-    datatypes::{DataType, Field, Schema},
+    datatypes::{DataType, Field},
 };
 use datafusion_common::{
-    downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
-    Result, ScalarValue,
+    DataFusionError, Result, ScalarValue, downcast_value, internal_err, not_impl_err,
+    plan_err,
 };
 use datafusion_expr::expr::{AggregateFunction, Sort};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, ColumnarValue, Documentation, Expr, Signature,
-    TypeSignature, Volatility,
-};
-use datafusion_functions_aggregate_common::tdigest::{
-    TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
+    Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature,
+    Volatility,
 };
+use datafusion_functions_aggregate_common::tdigest::{DEFAULT_MAX_SIZE, TDigest};
 use datafusion_macros::user_doc;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 
+use crate::utils::{get_scalar_value, validate_percentile_expr};
+
 create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);
 
 /// Computes the approximate percentile continuous of a set of numbers
@@ -69,7 +69,7 @@ pub fn approx_percentile_cont(
         args,
         false,
         None,
-        Some(vec![order_by]),
+        vec![order_by],
         None,
     ))
 }
@@ -77,15 +77,38 @@ pub fn approx_percentile_cont(
 #[user_doc(
     doc_section(label = "Approximate Functions"),
     description = "Returns the approximate percentile of input values using the t-digest algorithm.",
-    syntax_example = "approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)",
+    syntax_example = "approx_percentile_cont(percentile [, centroids]) WITHIN GROUP (ORDER BY expression)",
     sql_example = r#"```sql
+> SELECT approx_percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++------------------------------------------------------------------+
+| approx_percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) |
++------------------------------------------------------------------+
+| 65.0                                                             |
++------------------------------------------------------------------+
 > SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
 +-----------------------------------------------------------------------+
 | approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) |
 +-----------------------------------------------------------------------+
 | 65.0                                                                  |
 +-----------------------------------------------------------------------+
-```"#,
+```
+An alternate syntax is also supported:
+```sql
+> SELECT approx_percentile_cont(column_name, 0.75) FROM table_name;
++-----------------------------------------------+
+| approx_percentile_cont(column_name, 0.75)     |
++-----------------------------------------------+
+| 65.0                                          |
++-----------------------------------------------+
+
+> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
++----------------------------------------------------------+
+| approx_percentile_cont(column_name, 0.75, 100)           |
++----------------------------------------------------------+
+| 65.0                                                     |
++----------------------------------------------------------+
+```
+"#,
     standard_argument(name = "expression",),
     argument(
         name = "percentile",
@@ -96,19 +119,11 @@ pub fn approx_percentile_cont(
         description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory."
     )
 )]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ApproxPercentileCont {
     signature: Signature,
 }
 
-impl Debug for ApproxPercentileCont {
-    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
-        f.debug_struct("ApproxPercentileCont")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for ApproxPercentileCont {
     fn default() -> Self {
         Self::new()
@@ -138,12 +153,13 @@ impl ApproxPercentileCont {
 
     pub(crate) fn create_accumulator(
         &self,
-        args: AccumulatorArgs,
+        args: &AccumulatorArgs,
     ) -> Result<ApproxPercentileAccumulator> {
-        let percentile = validate_input_percentile_expr(&args.exprs[1])?;
+        let percentile =
+            validate_percentile_expr(&args.exprs[1], "APPROX_PERCENTILE_CONT")?;
 
         let is_descending = args
-            .ordering_req
+            .order_bys
             .first()
             .map(|sort_expr| sort_expr.options.descending)
             .unwrap_or(false);
@@ -160,9 +176,9 @@ impl ApproxPercentileCont {
             None
         };
 
-        let data_type = args.exprs[0].data_type(args.schema)?;
+        let data_type = args.expr_fields[0].data_type();
         let accumulator: ApproxPercentileAccumulator = match data_type {
-            t @ (DataType::UInt8
+            DataType::UInt8
             | DataType::UInt16
             | DataType::UInt32
             | DataType::UInt64
@@ -170,19 +186,23 @@ impl ApproxPercentileCont {
             | DataType::Int16
             | DataType::Int32
             | DataType::Int64
+            | DataType::Float16
             | DataType::Float32
-            | DataType::Float64) => {
+            | DataType::Float64 => {
                 if let Some(max_size) = tdigest_max_size {
-                    ApproxPercentileAccumulator::new_with_max_size(percentile, t, max_size)
-                }else{
-                    ApproxPercentileAccumulator::new(percentile, t)
-
+                    ApproxPercentileAccumulator::new_with_max_size(
+                        percentile,
+                        data_type.clone(),
+                        max_size,
+                    )
+                } else {
+                    ApproxPercentileAccumulator::new(percentile, data_type.clone())
                 }
             }
             other => {
                 return not_impl_err!(
                     "Support for 'APPROX_PERCENTILE_CONT' for data type {other} is not implemented"
-                )
+                );
             }
         };
 
@@ -190,45 +210,15 @@ impl ApproxPercentileCont {
     }
 }
 
-fn get_scalar_value(expr: &Arc<dyn PhysicalExpr>) -> Result<ScalarValue> {
-    let empty_schema = Arc::new(Schema::empty());
-    let batch = RecordBatch::new_empty(Arc::clone(&empty_schema));
-    if let ColumnarValue::Scalar(s) = expr.evaluate(&batch)? {
-        Ok(s)
-    } else {
-        internal_err!("Didn't expect ColumnarValue::Array")
-    }
-}
-
-fn validate_input_percentile_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<f64> {
-    let percentile = match get_scalar_value(expr)
-        .map_err(|_| not_impl_datafusion_err!("Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal, got: {expr}"))? {
-        ScalarValue::Float32(Some(value)) => {
-            value as f64
-        }
-        ScalarValue::Float64(Some(value)) => {
-            value
-        }
-        sv => {
-            return not_impl_err!(
-                "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal (got data type {})",
-                sv.data_type()
-            )
-        }
-    };
-
-    // Ensure the percentile is between 0 and 1.
-    if !(0.0..=1.0).contains(&percentile) {
-        return plan_err!(
-            "Percentile value must be between 0.0 and 1.0 inclusive, {percentile} is invalid"
-        );
-    }
-    Ok(percentile)
-}
-
 fn validate_input_max_size_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<usize> {
-    let max_size = match get_scalar_value(expr)
-        .map_err(|_| not_impl_datafusion_err!("Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal, got: {expr}"))? {
+    let scalar_value = get_scalar_value(expr).map_err(|_e| {
+        DataFusionError::Plan(
+            "Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal"
+                .to_string(),
+        )
+    })?;
+
+    let max_size = match scalar_value {
         ScalarValue::UInt8(Some(q)) => q as usize,
         ScalarValue::UInt16(Some(q)) => q as usize,
         ScalarValue::UInt32(Some(q)) => q as usize,
@@ -238,11 +228,11 @@ fn validate_input_max_size_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<usize> {
         ScalarValue::Int16(Some(q)) if q > 0 => q as usize,
         ScalarValue::Int8(Some(q)) if q > 0 => q as usize,
         sv => {
-            return not_impl_err!(
+            return plan_err!(
                 "Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal (got data type {}).",
                 sv.data_type()
-            )
-        },
+            );
+        }
     };
 
     Ok(max_size)
@@ -253,7 +243,6 @@ impl AggregateUDFImpl for ApproxPercentileCont {
         self
     }
 
-    #[allow(rustdoc::private_intra_doc_links)]
     /// See [`TDigest::to_scalar_state()`] for a description of the serialized
     /// state.
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
@@ -270,7 +259,7 @@ impl AggregateUDFImpl for ApproxPercentileCont {
             ),
             Field::new(
                 format_state_name(args.name, "count"),
-                DataType::UInt64,
+                DataType::Float64,
                 false,
             ),
             Field::new(
@@ -304,7 +293,7 @@ impl AggregateUDFImpl for ApproxPercentileCont {
 
     #[inline]
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        Ok(Box::new(self.create_accumulator(acc_args)?))
+        Ok(Box::new(self.create_accumulator(&acc_args)?))
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
@@ -313,17 +302,13 @@ impl AggregateUDFImpl for ApproxPercentileCont {
         }
         if arg_types.len() == 3 && !arg_types[2].is_integer() {
             return plan_err!(
-                "approx_percentile_cont requires integer max_size input types"
+                "approx_percentile_cont requires integer centroids input types"
             );
         }
         Ok(arg_types[0].clone())
     }
 
-    fn supports_null_handling_clause(&self) -> bool {
-        false
-    }
-
-    fn is_ordered_set_aggregate(&self) -> bool {
+    fn supports_within_group_clause(&self) -> bool {
         true
     }
 
@@ -360,94 +345,71 @@ impl ApproxPercentileAccumulator {
         }
     }
 
-    // public for approx_percentile_cont_with_weight
-    pub fn merge_digests(&mut self, digests: &[TDigest]) {
+    // pub(crate) for approx_percentile_cont_with_weight
+    pub(crate) fn max_size(&self) -> usize {
+        self.digest.max_size()
+    }
+
+    // pub(crate) for approx_percentile_cont_with_weight
+    pub(crate) fn merge_digests(&mut self, digests: &[TDigest]) {
         let digests = digests.iter().chain(std::iter::once(&self.digest));
         self.digest = TDigest::merge_digests(digests)
     }
 
-    // public for approx_percentile_cont_with_weight
-    pub fn convert_to_float(values: &ArrayRef) -> Result<Vec<f64>> {
+    // pub(crate) for approx_percentile_cont_with_weight
+    pub(crate) fn convert_to_float(values: &ArrayRef) -> Result<Vec<f64>> {
+        debug_assert!(
+            values.null_count() == 0,
+            "convert_to_float assumes nulls have already been filtered out"
+        );
         match values.data_type() {
             DataType::Float64 => {
                 let array = downcast_value!(values, Float64Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().copied().collect::<Vec<_>>())
             }
             DataType::Float32 => {
                 let array = downcast_value!(values, Float32Array);
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
+            }
+            DataType::Float16 => {
+                let array = downcast_value!(values, Float16Array);
                 Ok(array
                     .values()
                     .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                    .map(|v| v.to_f64())
+                    .collect::<Vec<_>>())
             }
             DataType::Int64 => {
                 let array = downcast_value!(values, Int64Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::Int32 => {
                 let array = downcast_value!(values, Int32Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::Int16 => {
                 let array = downcast_value!(values, Int16Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::Int8 => {
                 let array = downcast_value!(values, Int8Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::UInt64 => {
                 let array = downcast_value!(values, UInt64Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::UInt32 => {
                 let array = downcast_value!(values, UInt32Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::UInt16 => {
                 let array = downcast_value!(values, UInt16Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             DataType::UInt8 => {
                 let array = downcast_value!(values, UInt8Array);
-                Ok(array
-                    .values()
-                    .iter()
-                    .filter_map(|v| v.try_as_f64().transpose())
-                    .collect::<Result<Vec<_>>>()?)
+                Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
             }
             e => internal_err!(
                 "APPROX_PERCENTILE_CONT is not expected to receive the type {e:?}"
@@ -464,7 +426,7 @@ impl Accumulator for ApproxPercentileAccumulator {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         // Remove any nulls before computing the percentile
         let mut values = Arc::clone(&values[0]);
-        if values.nulls().is_some() {
+        if values.null_count() > 0 {
             values = filter(&values, &is_not_null(&values)?)?;
         }
         let sorted_values = &arrow::compute::sort(&values, None)?;
@@ -474,7 +436,7 @@ impl Accumulator for ApproxPercentileAccumulator {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        if self.digest.count() == 0 {
+        if self.digest.count() == 0.0 {
             return ScalarValue::try_from(self.return_type.clone());
         }
         let q = self.digest.estimate_quantile(self.percentile);
@@ -490,9 +452,10 @@ impl Accumulator for ApproxPercentileAccumulator {
             DataType::UInt16 => ScalarValue::UInt16(Some(q as u16)),
             DataType::UInt32 => ScalarValue::UInt32(Some(q as u32)),
             DataType::UInt64 => ScalarValue::UInt64(Some(q as u64)),
+            DataType::Float16 => ScalarValue::Float16(Some(half::f16::from_f64(q))),
             DataType::Float32 => ScalarValue::Float32(Some(q as f32)),
             DataType::Float64 => ScalarValue::Float64(Some(q)),
-            v => unreachable!("unexpected return type {:?}", v),
+            v => unreachable!("unexpected return type {}", v),
         })
     }
 
@@ -550,8 +513,8 @@ mod tests {
             ApproxPercentileAccumulator::new_with_max_size(0.5, DataType::Float64, 100);
 
         accumulator.merge_digests(&[t1]);
-        assert_eq!(accumulator.digest.count(), 50_000);
+        assert_eq!(accumulator.digest.count(), 50_000.0);
         accumulator.merge_digests(&[t2]);
-        assert_eq!(accumulator.digest.count(), 100_000);
+        assert_eq!(accumulator.digest.count(), 100_000.0);
     }
 }
diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
index 5180d45889620..6fd90130e6741 100644
--- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
+++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
@@ -16,40 +16,63 @@
 // under the License.
 
 use std::any::Any;
-use std::fmt::{Debug, Formatter};
+use std::fmt::Debug;
+use std::hash::Hash;
 use std::mem::size_of_val;
 use std::sync::Arc;
 
+use arrow::compute::{and, filter, is_not_null};
 use arrow::datatypes::FieldRef;
 use arrow::{array::ArrayRef, datatypes::DataType};
 use datafusion_common::ScalarValue;
-use datafusion_common::{not_impl_err, plan_err, Result};
-use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::type_coercion::aggregates::NUMERICS;
+use datafusion_common::{Result, not_impl_err, plan_err};
 use datafusion_expr::Volatility::Immutable;
+use datafusion_expr::expr::{AggregateFunction, Sort};
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature,
-};
-use datafusion_functions_aggregate_common::tdigest::{
-    Centroid, TDigest, DEFAULT_MAX_SIZE,
+    Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature,
 };
+use datafusion_functions_aggregate_common::tdigest::{Centroid, TDigest};
 use datafusion_macros::user_doc;
 
 use crate::approx_percentile_cont::{ApproxPercentileAccumulator, ApproxPercentileCont};
 
-make_udaf_expr_and_func!(
+create_func!(
     ApproxPercentileContWithWeight,
-    approx_percentile_cont_with_weight,
-    expression weight percentile,
-    "Computes the approximate percentile continuous with weight of a set of numbers",
     approx_percentile_cont_with_weight_udaf
 );
 
+/// Computes the approximate percentile continuous with weight of a set of numbers
+pub fn approx_percentile_cont_with_weight(
+    order_by: Sort,
+    weight: Expr,
+    percentile: Expr,
+    centroids: Option<Expr>,
+) -> Expr {
+    let expr = order_by.expr.clone();
+
+    let args = if let Some(centroids) = centroids {
+        vec![expr, weight, percentile, centroids]
+    } else {
+        vec![expr, weight, percentile]
+    };
+
+    Expr::AggregateFunction(AggregateFunction::new_udf(
+        approx_percentile_cont_with_weight_udaf(),
+        args,
+        false,
+        None,
+        vec![order_by],
+        None,
+    ))
+}
+
 /// APPROX_PERCENTILE_CONT_WITH_WEIGHT aggregate expression
 #[user_doc(
     doc_section(label = "Approximate Functions"),
     description = "Returns the weighted approximate percentile of input values using the t-digest algorithm.",
-    syntax_example = "approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression)",
+    syntax_example = "approx_percentile_cont_with_weight(weight, percentile [, centroids]) WITHIN GROUP (ORDER BY expression)",
     sql_example = r#"```sql
 > SELECT approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) FROM table_name;
 +---------------------------------------------------------------------------------------------+
@@ -57,6 +80,22 @@ make_udaf_expr_and_func!(
 +---------------------------------------------------------------------------------------------+
 | 78.5                                                                                        |
 +---------------------------------------------------------------------------------------------+
+> SELECT approx_percentile_cont_with_weight(weight_column, 0.90, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++--------------------------------------------------------------------------------------------------+
+| approx_percentile_cont_with_weight(weight_column, 0.90, 100) WITHIN GROUP (ORDER BY column_name) |
++--------------------------------------------------------------------------------------------------+
+| 78.5                                                                                             |
++--------------------------------------------------------------------------------------------------+
+```
+An alternative syntax is also supported:
+
+```sql
+> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name;
++--------------------------------------------------+
+| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
++--------------------------------------------------+
+| 78.5                                             |
++--------------------------------------------------+
 ```"#,
     standard_argument(name = "expression", prefix = "The"),
     argument(
@@ -66,21 +105,18 @@ make_udaf_expr_and_func!(
     argument(
         name = "percentile",
         description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)."
+    ),
+    argument(
+        name = "centroids",
+        description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory."
     )
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct ApproxPercentileContWithWeight {
     signature: Signature,
     approx_percentile_cont: ApproxPercentileCont,
 }
 
-impl Debug for ApproxPercentileContWithWeight {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ApproxPercentileContWithWeight")
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for ApproxPercentileContWithWeight {
     fn default() -> Self {
         Self::new()
@@ -90,21 +126,26 @@ impl Default for ApproxPercentileContWithWeight {
 impl ApproxPercentileContWithWeight {
     /// Create a new [`ApproxPercentileContWithWeight`] aggregate function.
     pub fn new() -> Self {
+        let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1));
+        // Accept any numeric value paired with weight and float64 percentile
+        for num in NUMERICS {
+            variants.push(TypeSignature::Exact(vec![
+                num.clone(),
+                num.clone(),
+                DataType::Float64,
+            ]));
+            // Additionally accept an integer number of centroids for T-Digest
+            for int in INTEGERS {
+                variants.push(TypeSignature::Exact(vec![
+                    num.clone(),
+                    num.clone(),
+                    DataType::Float64,
+                    int.clone(),
+                ]));
+            }
+        }
         Self {
-            signature: Signature::one_of(
-                // Accept any numeric value paired with a float64 percentile
-                NUMERICS
-                    .iter()
-                    .map(|t| {
-                        TypeSignature::Exact(vec![
-                            t.clone(),
-                            t.clone(),
-                            DataType::Float64,
-                        ])
-                    })
-                    .collect(),
-                Immutable,
-            ),
+            signature: Signature::one_of(variants, Immutable),
             approx_percentile_cont: ApproxPercentileCont::new(),
         }
     }
@@ -135,7 +176,14 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight {
             );
         }
         if arg_types[2] != DataType::Float64 {
-            return plan_err!("approx_percentile_cont_with_weight requires float64 percentile input types");
+            return plan_err!(
+                "approx_percentile_cont_with_weight requires float64 percentile input types"
+            );
+        }
+        if arg_types.len() == 4 && !arg_types[3].is_integer() {
+            return plan_err!(
+                "approx_percentile_cont_with_weight requires integer centroids input types"
+            );
         }
         Ok(arg_types[0].clone())
     }
@@ -147,39 +195,63 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight {
             );
         }
 
-        if acc_args.exprs.len() != 3 {
+        if acc_args.exprs.len() != 3 && acc_args.exprs.len() != 4 {
             return plan_err!(
-                "approx_percentile_cont_with_weight requires three arguments: value, weight, percentile"
+                "approx_percentile_cont_with_weight requires three or four arguments: value, weight, percentile[, centroids]"
             );
         }
 
         let sub_args = AccumulatorArgs {
-            exprs: &[
-                Arc::clone(&acc_args.exprs[0]),
-                Arc::clone(&acc_args.exprs[2]),
-            ],
-            ..acc_args
+            exprs: if acc_args.exprs.len() == 4 {
+                &[
+                    Arc::clone(&acc_args.exprs[0]), // value
+                    Arc::clone(&acc_args.exprs[2]), // percentile
+                    Arc::clone(&acc_args.exprs[3]), // centroids
+                ]
+            } else {
+                &[
+                    Arc::clone(&acc_args.exprs[0]), // value
+                    Arc::clone(&acc_args.exprs[2]), // percentile
+                ]
+            },
+            expr_fields: if acc_args.exprs.len() == 4 {
+                &[
+                    Arc::clone(&acc_args.expr_fields[0]), // value
+                    Arc::clone(&acc_args.expr_fields[2]), // percentile
+                    Arc::clone(&acc_args.expr_fields[3]), // centroids
+                ]
+            } else {
+                &[
+                    Arc::clone(&acc_args.expr_fields[0]), // value
+                    Arc::clone(&acc_args.expr_fields[2]), // percentile
+                ]
+            },
+            // Unchanged below; we list each field explicitly in case we ever add more
+            // fields to AccumulatorArgs making it easier to see if changes are also
+            // needed here.
+            return_field: acc_args.return_field,
+            schema: acc_args.schema,
+            ignore_nulls: acc_args.ignore_nulls,
+            order_bys: acc_args.order_bys,
+            is_reversed: acc_args.is_reversed,
+            name: acc_args.name,
+            is_distinct: acc_args.is_distinct,
         };
         let approx_percentile_cont_accumulator =
-            self.approx_percentile_cont.create_accumulator(sub_args)?;
+            self.approx_percentile_cont.create_accumulator(&sub_args)?;
         let accumulator = ApproxPercentileWithWeightAccumulator::new(
             approx_percentile_cont_accumulator,
         );
         Ok(Box::new(accumulator))
     }
 
-    #[allow(rustdoc::private_intra_doc_links)]
     /// See [`TDigest::to_scalar_state()`] for a description of the serialized
     /// state.
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         self.approx_percentile_cont.state_fields(args)
     }
 
-    fn supports_null_handling_clause(&self) -> bool {
-        false
-    }
-
-    fn is_ordered_set_aggregate(&self) -> bool {
+    fn supports_within_group_clause(&self) -> bool {
         true
     }
 
@@ -207,19 +279,41 @@ impl Accumulator for ApproxPercentileWithWeightAccumulator {
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let means = &values[0];
-        let weights = &values[1];
+        let mut means = Arc::clone(&values[0]);
+        let mut weights = Arc::clone(&values[1]);
+        // If nulls are present in either array, need to filter those rows out in both arrays
+        match (means.null_count() > 0, weights.null_count() > 0) {
+            // Both have nulls
+            (true, true) => {
+                let predicate = and(&is_not_null(&means)?, &is_not_null(&weights)?)?;
+                means = filter(&means, &predicate)?;
+                weights = filter(&weights, &predicate)?;
+            }
+            // Only one has nulls
+            (false, true) => {
+                let predicate = &is_not_null(&weights)?;
+                means = filter(&means, predicate)?;
+                weights = filter(&weights, predicate)?;
+            }
+            (true, false) => {
+                let predicate = &is_not_null(&means)?;
+                means = filter(&means, predicate)?;
+                weights = filter(&weights, predicate)?;
+            }
+            // No nulls
+            (false, false) => {}
+        }
         debug_assert_eq!(
             means.len(),
             weights.len(),
             "invalid number of values in means and weights"
         );
-        let means_f64 = ApproxPercentileAccumulator::convert_to_float(means)?;
-        let weights_f64 = ApproxPercentileAccumulator::convert_to_float(weights)?;
+        let means_f64 = ApproxPercentileAccumulator::convert_to_float(&means)?;
+        let weights_f64 = ApproxPercentileAccumulator::convert_to_float(&weights)?;
         let mut digests: Vec<TDigest> = vec![];
         for (mean, weight) in means_f64.iter().zip(weights_f64.iter()) {
             digests.push(TDigest::new_with_centroid(
-                DEFAULT_MAX_SIZE,
+                self.approx_percentile_cont_accumulator.max_size(),
                 Centroid::new(*mean, *weight),
             ))
         }
diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs
index 71278767a83fc..cd4cb9b19ff77 100644
--- a/datafusion/functions-aggregate/src/array_agg.rs
+++ b/datafusion/functions-aggregate/src/array_agg.rs
@@ -17,28 +17,36 @@
 
 //! `ARRAY_AGG` aggregate implementation: [`ArrayAgg`]
 
+use std::cmp::Ordering;
+use std::collections::{HashSet, VecDeque};
+use std::mem::{size_of, size_of_val, take};
+use std::sync::Arc;
+
 use arrow::array::{
-    new_empty_array, Array, ArrayRef, AsArray, BooleanArray, ListArray, StructArray,
+    Array, ArrayRef, AsArray, BooleanArray, ListArray, NullBufferBuilder, StructArray,
+    UInt32Array, new_empty_array,
 };
-use arrow::compute::{filter, SortOptions};
+use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+use arrow::compute::{SortOptions, filter};
 use arrow::datatypes::{DataType, Field, FieldRef, Fields};
 
 use datafusion_common::cast::as_list_array;
-use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder};
-use datafusion_common::{exec_err, ScalarValue};
-use datafusion_common::{internal_err, Result};
+use datafusion_common::utils::{
+    SingleRowListArrayBuilder, compare_rows, get_row_at_idx, take_function_args,
+};
+use datafusion_common::{Result, ScalarValue, assert_eq_or_internal_err, exec_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::utils::format_state_name;
-use datafusion_expr::{Accumulator, Signature, Volatility};
-use datafusion_expr::{AggregateUDFImpl, Documentation};
+use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator, Signature,
+    Volatility,
+};
+use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::filter_to_nulls;
 use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays;
+use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
 use datafusion_functions_aggregate_common::utils::ordering_fields;
 use datafusion_macros::user_doc;
 use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
-use std::cmp::Ordering;
-use std::collections::{HashSet, VecDeque};
-use std::mem::{size_of, size_of_val};
-use std::sync::Arc;
 
 make_udaf_expr_and_func!(
     ArrayAgg,
@@ -71,16 +79,18 @@ This aggregation function can only mix DISTINCT and ORDER BY if the ordering exp
 "#,
     standard_argument(name = "expression",)
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 /// ARRAY_AGG aggregate expression
 pub struct ArrayAgg {
     signature: Signature,
+    is_input_pre_ordered: bool,
 }
 
 impl Default for ArrayAgg {
     fn default() -> Self {
         Self {
             signature: Signature::any(1, Volatility::Immutable),
+            is_input_pre_ordered: false,
         }
     }
 }
@@ -94,10 +104,6 @@ impl AggregateUDFImpl for ArrayAgg {
         "array_agg"
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn signature(&self) -> &Signature {
         &self.signature
     }
@@ -111,22 +117,26 @@ impl AggregateUDFImpl for ArrayAgg {
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         if args.is_distinct {
-            return Ok(vec![Field::new_list(
-                format_state_name(args.name, "distinct_array_agg"),
+            return Ok(vec![
+                Field::new_list(
+                    format_state_name(args.name, "distinct_array_agg"),
+                    // See COMMENTS.md to understand why nullable is set to true
+                    Field::new_list_field(args.input_fields[0].data_type().clone(), true),
+                    true,
+                )
+                .into(),
+            ]);
+        }
+
+        let mut fields = vec![
+            Field::new_list(
+                format_state_name(args.name, "array_agg"),
                 // See COMMENTS.md to understand why nullable is set to true
                 Field::new_list_field(args.input_fields[0].data_type().clone(), true),
                 true,
             )
-            .into()]);
-        }
-
-        let mut fields = vec![Field::new_list(
-            format_state_name(args.name, "array_agg"),
-            // See COMMENTS.md to understand why nullable is set to true
-            Field::new_list_field(args.input_fields[0].data_type().clone(), true),
-            true,
-        )
-        .into()];
+            .into(),
+        ];
 
         if args.ordering_fields.is_empty() {
             return Ok(fields);
@@ -145,10 +155,24 @@ impl AggregateUDFImpl for ArrayAgg {
         Ok(fields)
     }
 
+    fn order_sensitivity(&self) -> AggregateOrderSensitivity {
+        AggregateOrderSensitivity::SoftRequirement
+    }
+
+    fn with_beneficial_ordering(
+        self: Arc<Self>,
+        beneficial_ordering: bool,
+    ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
+        Ok(Some(Arc::new(Self {
+            signature: self.signature.clone(),
+            is_input_pre_ordered: beneficial_ordering,
+        })))
+    }
+
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
-        let ignore_nulls =
-            acc_args.ignore_nulls && acc_args.exprs[0].nullable(acc_args.schema)?;
+        let field = &acc_args.expr_fields[0];
+        let data_type = field.data_type();
+        let ignore_nulls = acc_args.ignore_nulls && field.is_nullable();
 
         if acc_args.is_distinct {
             // Limitation similar to Postgres. The aggregation function can only mix
@@ -165,41 +189,39 @@ impl AggregateUDFImpl for ArrayAgg {
             // ARRAY_AGG(DISTINCT concat(col, '') ORDER BY concat(col, '')) <- Valid
             // ARRAY_AGG(DISTINCT col ORDER BY other_col)                   <- Invalid
             // ARRAY_AGG(DISTINCT col ORDER BY concat(col, ''))             <- Invalid
-            if acc_args.ordering_req.len() > 1 {
-                return exec_err!("In an aggregate with DISTINCT, ORDER BY expressions must appear in argument list");
-            }
-            let mut sort_option: Option<SortOptions> = None;
-            if let Some(order) = acc_args.ordering_req.first() {
-                if !order.expr.eq(&acc_args.exprs[0]) {
-                    return exec_err!("In an aggregate with DISTINCT, ORDER BY expressions must appear in argument list");
+            let sort_option = match acc_args.order_bys {
+                [single] if single.expr.eq(&acc_args.exprs[0]) => Some(single.options),
+                [] => None,
+                _ => {
+                    return exec_err!(
+                        "In an aggregate with DISTINCT, ORDER BY expressions must appear in argument list"
+                    );
                 }
-                sort_option = Some(order.options)
-            }
-
+            };
             return Ok(Box::new(DistinctArrayAggAccumulator::try_new(
-                &data_type,
+                data_type,
                 sort_option,
                 ignore_nulls,
             )?));
         }
 
-        if acc_args.ordering_req.is_empty() {
+        let Some(ordering) = LexOrdering::new(acc_args.order_bys.to_vec()) else {
             return Ok(Box::new(ArrayAggAccumulator::try_new(
-                &data_type,
+                data_type,
                 ignore_nulls,
             )?));
-        }
+        };
 
-        let ordering_dtypes = acc_args
-            .ordering_req
+        let ordering_dtypes = ordering
             .iter()
             .map(|e| e.expr.data_type(acc_args.schema))
             .collect::<Result<Vec<_>>>()?;
 
         OrderSensitiveArrayAggAccumulator::try_new(
-            &data_type,
+            data_type,
             &ordering_dtypes,
-            acc_args.ordering_req.clone(),
+            ordering,
+            self.is_input_pre_ordered,
             acc_args.is_reversed,
             ignore_nulls,
         )
@@ -210,6 +232,27 @@ impl AggregateUDFImpl for ArrayAgg {
         datafusion_expr::ReversedUDAF::Reversed(array_agg_udaf())
     }
 
+    fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
+        !args.is_distinct && args.order_bys.is_empty()
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
+        let field = &args.expr_fields[0];
+        let data_type = field.data_type().clone();
+        let ignore_nulls = args.ignore_nulls && field.is_nullable();
+        Ok(Box::new(ArrayAggGroupsAccumulator::new(
+            data_type,
+            ignore_nulls,
+        )))
+    }
+
+    fn supports_null_handling_clause(&self) -> bool {
+        true
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -301,9 +344,7 @@ impl Accumulator for ArrayAggAccumulator {
             return Ok(());
         }
 
-        if values.len() != 1 {
-            return internal_err!("expects single batch");
-        }
+        assert_eq_or_internal_err!(values.len(), 1, "expects single batch");
 
         let val = &values[0];
         let nulls = if self.ignore_nulls {
@@ -319,7 +360,7 @@ impl Accumulator for ArrayAggAccumulator {
         };
 
         if !val.is_empty() {
-            self.values.push(val);
+            self.values.push(val)
         }
 
         Ok(())
@@ -331,9 +372,7 @@ impl Accumulator for ArrayAggAccumulator {
             return Ok(());
         }
 
-        if states.len() != 1 {
-            return internal_err!("expects single state");
-        }
+        assert_eq_or_internal_err!(states.len(), 1, "expects single state");
 
         let list_arr = as_list_array(&states[0])?;
 
@@ -378,7 +417,18 @@ impl Accumulator for ArrayAggAccumulator {
             + self
                 .values
                 .iter()
-                .map(|arr| arr.get_array_memory_size())
+                // Each ArrayRef might be just a reference to a bigger array, and many
+                // ArrayRefs here might be referencing exactly the same array, so if we
+                // were to call `arr.get_array_memory_size()`, we would be double-counting
+                // the same underlying data many times.
+                //
+                // Instead, we do an approximation by estimating how much memory each
+                // ArrayRef would occupy if its underlying data was fully owned by this
+                // accumulator.
+                //
+                // Note that this is just an estimation, but the reality is that this
+                // accumulator might not own any data.
+                .map(|arr| arr.to_data().get_slice_memory_size().unwrap_or_default())
                 .sum::<usize>()
             + self.datatype.size()
             - size_of_val(&self.datatype)
@@ -386,7 +436,332 @@ impl Accumulator for ArrayAggAccumulator {
 }
 
 #[derive(Debug)]
-struct DistinctArrayAggAccumulator {
+struct ArrayAggGroupsAccumulator {
+    datatype: DataType,
+    ignore_nulls: bool,
+    /// Source arrays — input arrays (from update_batch) or list backing
+    /// arrays (from merge_batch).
+    batches: Vec<ArrayRef>,
+    /// Per-batch list of (group_idx, row_idx) pairs.
+    batch_entries: Vec<Vec<(u32, u32)>>,
+    /// Total number of groups tracked.
+    num_groups: usize,
+}
+
+impl ArrayAggGroupsAccumulator {
+    fn new(datatype: DataType, ignore_nulls: bool) -> Self {
+        Self {
+            datatype,
+            ignore_nulls,
+            batches: Vec::new(),
+            batch_entries: Vec::new(),
+            num_groups: 0,
+        }
+    }
+
+    fn clear_state(&mut self) {
+        // `size()` measures Vec capacity rather than len, so allocate new
+        // buffers instead of using `clear()`.
+        self.batches = Vec::new();
+        self.batch_entries = Vec::new();
+        self.num_groups = 0;
+    }
+
+    fn compact_retained_state(&mut self, emit_groups: usize) -> Result<()> {
+        // EmitTo::First is used to recover from memory pressure. Simply
+        // removing emitted entries in place is not enough because mixed batches
+        // would continue to pin their original Array arrays, even if only a few
+        // retained rows remain.
+        //
+        // Rebuild the retained state from scratch so fully emitted batches are
+        // dropped, mixed batches are compacted to arrays containing only the
+        // surviving rows, and retained metadata is right-sized.
+        let emit_groups = emit_groups as u32;
+        let old_batches = take(&mut self.batches);
+        let old_batch_entries = take(&mut self.batch_entries);
+
+        let mut batches = Vec::new();
+        let mut batch_entries = Vec::new();
+
+        for (batch, entries) in old_batches.into_iter().zip(old_batch_entries) {
+            let retained_len = entries.iter().filter(|(g, _)| *g >= emit_groups).count();
+
+            if retained_len == 0 {
+                continue;
+            }
+
+            if retained_len == entries.len() {
+                // Nothing was emitted from this batch, so we keep the existing
+                // array and only renumber the remaining group IDs so that they
+                // start from 0.
+                let mut retained_entries = entries;
+                for (g, _) in &mut retained_entries {
+                    *g -= emit_groups;
+                }
+                retained_entries.shrink_to_fit();
+                batches.push(batch);
+                batch_entries.push(retained_entries);
+                continue;
+            }
+
+            let mut retained_entries = Vec::with_capacity(retained_len);
+            let mut retained_rows = Vec::with_capacity(retained_len);
+
+            for (g, r) in entries {
+                if g >= emit_groups {
+                    // Compute the new `(group_idx, row_idx)` pair for a
+                    // retained row. `group_idx` is renumbered to start from
+                    // 0, and `row_idx` points into the new dense batch we are
+                    // building.
+                    retained_entries.push((g - emit_groups, retained_rows.len() as u32));
+                    retained_rows.push(r);
+                }
+            }
+
+            debug_assert_eq!(retained_entries.len(), retained_len);
+            debug_assert_eq!(retained_rows.len(), retained_len);
+
+            let batch = if retained_len == batch.len() {
+                batch
+            } else {
+                // Compact mixed batches so retained rows no longer pin the
+                // original array.
+                let retained_rows = UInt32Array::from(retained_rows);
+                arrow::compute::take(batch.as_ref(), &retained_rows, None)?
+            };
+
+            batches.push(batch);
+            batch_entries.push(retained_entries);
+        }
+
+        self.batches = batches;
+        self.batch_entries = batch_entries;
+        self.num_groups -= emit_groups as usize;
+
+        Ok(())
+    }
+}
+
+impl GroupsAccumulator for ArrayAggGroupsAccumulator {
+    /// Store a reference to the input batch, plus a `(group_idx, row_idx)` pair
+    /// for every row.
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 1, "single argument to update_batch");
+        let input = &values[0];
+
+        self.num_groups = self.num_groups.max(total_num_groups);
+
+        let nulls = if self.ignore_nulls {
+            input.logical_nulls()
+        } else {
+            None
+        };
+
+        let mut entries = Vec::new();
+
+        for (row_idx, &group_idx) in group_indices.iter().enumerate() {
+            // Skip filtered rows
+            if let Some(filter) = opt_filter
+                && (filter.is_null(row_idx) || !filter.value(row_idx))
+            {
+                continue;
+            }
+
+            // Skip null values when ignore_nulls is set
+            if let Some(ref nulls) = nulls
+                && nulls.is_null(row_idx)
+            {
+                continue;
+            }
+
+            entries.push((group_idx as u32, row_idx as u32));
+        }
+
+        // We only need to record the batch if it was non-empty.
+        if !entries.is_empty() {
+            self.batches.push(Arc::clone(input));
+            self.batch_entries.push(entries);
+        }
+
+        Ok(())
+    }
+
+    /// Produce a `ListArray` ordered by group index: the list at
+    /// position N contains the aggregated values for group N.
+    ///
+    /// Uses a counting sort to rearrange the stored `(group, row)`
+    /// entries into group order, then calls `interleave` to gather
+    /// the values into a flat array that backs the output `ListArray`.
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        let emit_groups = match emit_to {
+            EmitTo::All => self.num_groups,
+            EmitTo::First(n) => n,
+        };
+
+        // Step 1: Count entries per group. For EmitTo::First(n), only groups
+        // 0..n are counted; the rest are retained to be emitted in the future.
+        let mut counts = vec![0u32; emit_groups];
+        for entries in &self.batch_entries {
+            for &(g, _) in entries {
+                let g = g as usize;
+                if g < emit_groups {
+                    counts[g] += 1;
+                }
+            }
+        }
+
+        // Step 2: Do a prefix sum over the counts and use it to build ListArray
+        // offsets, null buffer, and write positions for the counting sort.
+        let mut offsets = Vec::<i32>::with_capacity(emit_groups + 1);
+        offsets.push(0);
+        let mut nulls_builder = NullBufferBuilder::new(emit_groups);
+        let mut write_positions = Vec::with_capacity(emit_groups);
+        let mut cur_offset = 0u32;
+        for &count in &counts {
+            if count == 0 {
+                nulls_builder.append_null();
+            } else {
+                nulls_builder.append_non_null();
+            }
+            write_positions.push(cur_offset);
+            cur_offset += count;
+            offsets.push(cur_offset as i32);
+        }
+        let total_rows = cur_offset as usize;
+
+        // Step 3: Scatter entries into group order using the counting sort. The
+        // batch index is implicit from the outer loop position.
+        let flat_values = if total_rows == 0 {
+            new_empty_array(&self.datatype)
+        } else {
+            let mut interleave_indices = vec![(0usize, 0usize); total_rows];
+            for (batch_idx, entries) in self.batch_entries.iter().enumerate() {
+                for &(g, r) in entries {
+                    let g = g as usize;
+                    if g < emit_groups {
+                        let wp = write_positions[g] as usize;
+                        interleave_indices[wp] = (batch_idx, r as usize);
+                        write_positions[g] += 1;
+                    }
+                }
+            }
+
+            let sources: Vec<&dyn Array> =
+                self.batches.iter().map(|b| b.as_ref()).collect();
+            arrow::compute::interleave(&sources, &interleave_indices)?
+        };
+
+        // Step 4: Release state for emitted groups.
+        match emit_to {
+            EmitTo::All => self.clear_state(),
+            EmitTo::First(_) => self.compact_retained_state(emit_groups)?,
+        }
+
+        let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets));
+        let field = Arc::new(Field::new_list_field(self.datatype.clone(), true));
+        let result = ListArray::new(field, offsets, flat_values, nulls_builder.finish());
+
+        Ok(Arc::new(result))
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        Ok(vec![self.evaluate(emit_to)?])
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        _opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 1, "one argument to merge_batch");
+        let input_list = values[0].as_list::<i32>();
+
+        self.num_groups = self.num_groups.max(total_num_groups);
+
+        // Push the ListArray's backing values array as a single batch.
+        let list_values = input_list.values();
+        let list_offsets = input_list.offsets();
+
+        let mut entries = Vec::new();
+
+        for (row_idx, &group_idx) in group_indices.iter().enumerate() {
+            if input_list.is_null(row_idx) {
+                continue;
+            }
+            let start = list_offsets[row_idx] as u32;
+            let end = list_offsets[row_idx + 1] as u32;
+            for pos in start..end {
+                entries.push((group_idx as u32, pos));
+            }
+        }
+
+        if !entries.is_empty() {
+            self.batches.push(Arc::clone(list_values));
+            self.batch_entries.push(entries);
+        }
+
+        Ok(())
+    }
+
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        assert_eq!(values.len(), 1, "one argument to convert_to_state");
+
+        let input = &values[0];
+
+        // Each row becomes a 1-element list: offsets are [0, 1, 2, ..., n].
+        let offsets = OffsetBuffer::from_repeated_length(1, input.len());
+
+        // Filtered rows become null list entries, which merge_batch will skip.
+        let filter_nulls = opt_filter.and_then(filter_to_nulls);
+
+        // With ignore_nulls, null values also become null list entries. Without
+        // ignore_nulls, null values stay as [NULL] so merge_batch retains them.
+        let nulls = if self.ignore_nulls {
+            let logical = input.logical_nulls();
+            NullBuffer::union(filter_nulls.as_ref(), logical.as_ref())
+        } else {
+            filter_nulls
+        };
+
+        let field = Arc::new(Field::new_list_field(self.datatype.clone(), true));
+        let list_array = ListArray::new(field, offsets, Arc::clone(input), nulls);
+
+        Ok(vec![Arc::new(list_array)])
+    }
+
+    fn supports_convert_to_state(&self) -> bool {
+        true
+    }
+
+    fn size(&self) -> usize {
+        self.batches
+            .iter()
+            .map(|arr| arr.to_data().get_slice_memory_size().unwrap_or_default())
+            .sum::<usize>()
+            + self.batches.capacity() * size_of::<ArrayRef>()
+            + self
+                .batch_entries
+                .iter()
+                .map(|e| e.capacity() * size_of::<(u32, u32)>())
+                .sum::<usize>()
+            + self.batch_entries.capacity() * size_of::<Vec<(u32, u32)>>()
+    }
+}
+
+#[derive(Debug)]
+pub struct DistinctArrayAggAccumulator {
     values: HashSet<ScalarValue>,
     datatype: DataType,
     sort_options: Option<SortOptions>,
@@ -429,7 +804,8 @@ impl Accumulator for DistinctArrayAggAccumulator {
         if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
             for i in 0..val.len() {
                 if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
-                    self.values.insert(ScalarValue::try_from_array(val, i)?);
+                    self.values
+                        .insert(ScalarValue::try_from_array(val, i)?.compacted());
                 }
             }
         }
@@ -442,9 +818,7 @@ impl Accumulator for DistinctArrayAggAccumulator {
             return Ok(());
         }
 
-        if states.len() != 1 {
-            return internal_err!("expects single state");
-        }
+        assert_eq_or_internal_err!(states.len(), 1, "expects single state");
 
         states[0]
             .as_list::<i32>()
@@ -460,6 +834,7 @@ impl Accumulator for DistinctArrayAggAccumulator {
         }
 
         if let Some(opts) = self.sort_options {
+            let mut delayed_cmp_err = Ok(());
             values.sort_by(|a, b| {
                 if a.is_null() {
                     return match opts.nulls_first {
@@ -474,10 +849,15 @@ impl Accumulator for DistinctArrayAggAccumulator {
                     };
                 }
                 match opts.descending {
-                    true => b.partial_cmp(a).unwrap_or(Ordering::Equal),
-                    false => a.partial_cmp(b).unwrap_or(Ordering::Equal),
+                    true => b.try_cmp(a),
+                    false => a.try_cmp(b),
                 }
+                .unwrap_or_else(|err| {
+                    delayed_cmp_err = Err(err);
+                    Ordering::Equal
+                })
             });
+            delayed_cmp_err?;
         };
 
         let arr = ScalarValue::new_list(&values, &self.datatype, true);
@@ -511,6 +891,8 @@ pub(crate) struct OrderSensitiveArrayAggAccumulator {
     datatypes: Vec<DataType>,
     /// Stores the ordering requirement of the `Accumulator`.
     ordering_req: LexOrdering,
+    /// Whether the input is known to be pre-ordered
+    is_input_pre_ordered: bool,
     /// Whether the aggregation is running in reverse.
     reverse: bool,
     /// Whether the aggregation should ignore null values.
@@ -524,6 +906,7 @@ impl OrderSensitiveArrayAggAccumulator {
         datatype: &DataType,
         ordering_dtypes: &[DataType],
         ordering_req: LexOrdering,
+        is_input_pre_ordered: bool,
         reverse: bool,
         ignore_nulls: bool,
     ) -> Result<Self> {
@@ -534,10 +917,58 @@ impl OrderSensitiveArrayAggAccumulator {
             ordering_values: vec![],
             datatypes,
             ordering_req,
+            is_input_pre_ordered,
             reverse,
             ignore_nulls,
         })
     }
+
+    fn sort(&mut self) {
+        let sort_options = self
+            .ordering_req
+            .iter()
+            .map(|sort_expr| sort_expr.options)
+            .collect::<Vec<_>>();
+        let mut values = take(&mut self.values)
+            .into_iter()
+            .zip(take(&mut self.ordering_values))
+            .collect::<Vec<_>>();
+        let mut delayed_cmp_err = Ok(());
+        values.sort_by(|(_, left_ordering), (_, right_ordering)| {
+            compare_rows(left_ordering, right_ordering, &sort_options).unwrap_or_else(
+                |err| {
+                    delayed_cmp_err = Err(err);
+                    Ordering::Equal
+                },
+            )
+        });
+        (self.values, self.ordering_values) = values.into_iter().unzip();
+    }
+
+    fn evaluate_orderings(&self) -> Result<ScalarValue> {
+        let fields = ordering_fields(&self.ordering_req, &self.datatypes[1..]);
+
+        let column_wise_ordering_values = if self.ordering_values.is_empty() {
+            fields
+                .iter()
+                .map(|f| new_empty_array(f.data_type()))
+                .collect::<Vec<_>>()
+        } else {
+            (0..fields.len())
+                .map(|i| {
+                    let column_values = self.ordering_values.iter().map(|x| x[i].clone());
+                    ScalarValue::iter_to_array(column_values)
+                })
+                .collect::<Result<_>>()?
+        };
+
+        let ordering_array = StructArray::try_new(
+            Fields::from(fields),
+            column_wise_ordering_values,
+            None,
+        )?;
+        Ok(SingleRowListArrayBuilder::new(Arc::new(ordering_array)).build_list_scalar())
+    }
 }
 
 impl Accumulator for OrderSensitiveArrayAggAccumulator {
@@ -558,8 +989,14 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
         if nulls.is_none_or(|nulls| nulls.null_count() < val.len()) {
             for i in 0..val.len() {
                 if nulls.is_none_or(|nulls| nulls.is_valid(i)) {
-                    self.values.push(ScalarValue::try_from_array(val, i)?);
-                    self.ordering_values.push(get_row_at_idx(ord, i)?)
+                    self.values
+                        .push(ScalarValue::try_from_array(val, i)?.compacted());
+                    self.ordering_values.push(
+                        get_row_at_idx(ord, i)?
+                            .into_iter()
+                            .map(|v| v.compacted())
+                            .collect(),
+                    )
                 }
             }
         }
@@ -578,9 +1015,8 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
         // inside `ARRAY_AGG` list, we will receive an `Array` that stores values
         // received from its ordering requirement expression. (This information
         // is necessary for during merging).
-        let [array_agg_values, agg_orderings, ..] = &states else {
-            return exec_err!("State should have two elements");
-        };
+        let [array_agg_values, agg_orderings] =
+            take_function_args("OrderSensitiveArrayAggAccumulator::merge_batch", states)?;
         let Some(agg_orderings) = agg_orderings.as_list_opt::<i32>() else {
             return exec_err!("Expects to receive a list array");
         };
@@ -591,18 +1027,24 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
         let mut partition_ordering_values = vec![];
 
         // Existing values should be merged also.
-        partition_values.push(self.values.clone().into());
-        partition_ordering_values.push(self.ordering_values.clone().into());
+        if !self.is_input_pre_ordered {
+            self.sort();
+        }
+        partition_values.push(take(&mut self.values).into());
+        partition_ordering_values.push(take(&mut self.ordering_values).into());
 
         // Convert array to Scalars to sort them easily. Convert back to array at evaluation.
         let array_agg_res = ScalarValue::convert_array_to_scalar_vec(array_agg_values)?;
-        for v in array_agg_res.into_iter() {
-            partition_values.push(v.into());
+        for maybe_v in array_agg_res.into_iter() {
+            if let Some(v) = maybe_v {
+                partition_values.push(v.into());
+            } else {
+                partition_values.push(vec![].into());
+            }
         }
 
         let orderings = ScalarValue::convert_array_to_scalar_vec(agg_orderings)?;
-
-        for partition_ordering_rows in orderings.into_iter() {
+        for partition_ordering_rows in orderings.into_iter().flatten() {
             // Extract value from struct to ordering_rows for each group/partition
             let ordering_value = partition_ordering_rows.into_iter().map(|ordering_row| {
                     if let ScalarValue::Struct(s) = ordering_row {
@@ -641,6 +1083,10 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
     }
 
     fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        if !self.is_input_pre_ordered {
+            self.sort();
+        }
+
         let mut result = vec![self.evaluate()?];
         result.push(self.evaluate_orderings()?);
 
@@ -648,6 +1094,10 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
+        if !self.is_input_pre_ordered {
+            self.sort();
+        }
+
         if self.values.is_empty() {
             return Ok(ScalarValue::new_null_list(
                 self.datatypes[0].clone(),
@@ -692,41 +1142,16 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
     }
 }
 
-impl OrderSensitiveArrayAggAccumulator {
-    fn evaluate_orderings(&self) -> Result<ScalarValue> {
-        let fields = ordering_fields(self.ordering_req.as_ref(), &self.datatypes[1..]);
-        let num_columns = fields.len();
-
-        let mut column_wise_ordering_values = vec![];
-        for i in 0..num_columns {
-            let column_values = self
-                .ordering_values
-                .iter()
-                .map(|x| x[i].clone())
-                .collect::<Vec<_>>();
-            let array = if column_values.is_empty() {
-                new_empty_array(fields[i].data_type())
-            } else {
-                ScalarValue::iter_to_array(column_values.into_iter())?
-            };
-            column_wise_ordering_values.push(array);
-        }
-
-        let struct_field = Fields::from(fields);
-        let ordering_array =
-            StructArray::try_new(struct_field, column_wise_ordering_values, None)?;
-        Ok(SingleRowListArrayBuilder::new(Arc::new(ordering_array)).build_list_scalar())
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
+    use arrow::array::{ListBuilder, StringBuilder};
     use arrow::datatypes::{FieldRef, Schema};
     use datafusion_common::cast::as_generic_string_array;
     use datafusion_common::internal_err;
+    use datafusion_physical_expr::PhysicalExpr;
     use datafusion_physical_expr::expressions::Column;
-    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
     use std::sync::Arc;
 
     #[test]
@@ -988,10 +1413,59 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn does_not_over_account_memory() -> Result<()> {
+        let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string().build_two()?;
+
+        acc1.update_batch(&[data(["a", "c", "b"])])?;
+        acc2.update_batch(&[data(["b", "c", "a"])])?;
+        acc1 = merge(acc1, acc2)?;
+
+        assert_eq!(acc1.size(), 266);
+
+        Ok(())
+    }
+    #[test]
+    fn does_not_over_account_memory_distinct() -> Result<()> {
+        let (mut acc1, mut acc2) = ArrayAggAccumulatorBuilder::string()
+            .distinct()
+            .build_two()?;
+
+        acc1.update_batch(&[string_list_data([
+            vec!["a", "b", "c"],
+            vec!["d", "e", "f"],
+        ])])?;
+        acc2.update_batch(&[string_list_data([vec!["e", "f", "g"]])])?;
+        acc1 = merge(acc1, acc2)?;
+
+        // without compaction, the size is 16660
+        assert_eq!(acc1.size(), 1660);
+
+        Ok(())
+    }
+
+    #[test]
+    fn does_not_over_account_memory_ordered() -> Result<()> {
+        let mut acc = ArrayAggAccumulatorBuilder::string()
+            .order_by_col("col", SortOptions::new(false, false))
+            .build()?;
+
+        acc.update_batch(&[string_list_data([
+            vec!["a", "b", "c"],
+            vec!["c", "d", "e"],
+            vec!["b", "c", "d"],
+        ])])?;
+
+        // without compaction, the size is 17112
+        assert_eq!(acc.size(), 2224);
+
+        Ok(())
+    }
+
     struct ArrayAggAccumulatorBuilder {
         return_field: FieldRef,
         distinct: bool,
-        ordering: LexOrdering,
+        order_bys: Vec<PhysicalSortExpr>,
         schema: Schema,
     }
 
@@ -1004,7 +1478,7 @@ mod tests {
             Self {
                 return_field: Field::new("f", data_type.clone(), true).into(),
                 distinct: false,
-                ordering: Default::default(),
+                order_bys: vec![],
                 schema: Schema {
                     fields: Fields::from(vec![Field::new(
                         "col",
@@ -1022,26 +1496,30 @@ mod tests {
         }
 
         fn order_by_col(mut self, col: &str, sort_options: SortOptions) -> Self {
-            self.ordering.extend([PhysicalSortExpr::new(
+            let new_order = PhysicalSortExpr::new(
                 Arc::new(
                     Column::new_with_schema(col, &self.schema)
                         .expect("column not available in schema"),
                 ),
                 sort_options,
-            )]);
+            );
+            self.order_bys.push(new_order);
             self
         }
 
         fn build(&self) -> Result<Box<dyn Accumulator>> {
+            let expr = Arc::new(Column::new("col", 0));
+            let expr_field = expr.return_field(&self.schema)?;
             ArrayAgg::default().accumulator(AccumulatorArgs {
                 return_field: Arc::clone(&self.return_field),
                 schema: &self.schema,
+                expr_fields: &[expr_field],
                 ignore_nulls: false,
-                ordering_req: &self.ordering,
+                order_bys: &self.order_bys,
                 is_reversed: false,
                 name: "",
                 is_distinct: self.distinct,
-                exprs: &[Arc::new(Column::new("col", 0))],
+                exprs: &[expr],
             })
         }
 
@@ -1066,6 +1544,15 @@ mod tests {
             .collect()
     }
 
+    fn string_list_data<'a>(data: impl IntoIterator<Item = Vec<&'a str>>) -> ArrayRef {
+        let mut builder = ListBuilder::new(StringBuilder::new());
+        for string_list in data.into_iter() {
+            builder.append_value(string_list.iter().map(Some).collect::<Vec<_>>());
+        }
+
+        Arc::new(builder.finish())
+    }
+
     fn data<T, const N: usize>(list: [T; N]) -> ArrayRef
     where
         ScalarValue: From<T>,
@@ -1086,4 +1573,372 @@ mod tests {
         acc1.merge_batch(&intermediate_state)?;
         Ok(acc1)
     }
+
+    // ---- GroupsAccumulator tests ----
+
+    use arrow::array::Int32Array;
+
+    fn list_array_to_i32_vecs(list: &ListArray) -> Vec<Option<Vec<Option<i32>>>> {
+        (0..list.len())
+            .map(|i| {
+                if list.is_null(i) {
+                    None
+                } else {
+                    let arr = list.value(i);
+                    let vals: Vec<Option<i32>> = arr
+                        .as_any()
+                        .downcast_ref::<Int32Array>()
+                        .unwrap()
+                        .iter()
+                        .collect();
+                    Some(vals)
+                }
+            })
+            .collect()
+    }
+
+    fn eval_i32_lists(
+        acc: &mut ArrayAggGroupsAccumulator,
+        emit_to: EmitTo,
+    ) -> Result<Vec<Option<Vec<Option<i32>>>>> {
+        let result = acc.evaluate(emit_to)?;
+        Ok(list_array_to_i32_vecs(result.as_list::<i32>()))
+    }
+
+    #[test]
+    fn groups_accumulator_multiple_batches() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        // First batch
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        acc.update_batch(&[values], &[0, 1, 0], None, 2)?;
+
+        // Second batch
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![4, 5]));
+        acc.update_batch(&[values], &[1, 0], None, 2)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(1), Some(3), Some(5)]));
+        assert_eq!(vals[1], Some(vec![Some(2), Some(4)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_emit_first() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        acc.update_batch(&[values], &[0, 1, 2], None, 3)?;
+
+        // Emit first 2 groups
+        let vals = eval_i32_lists(&mut acc, EmitTo::First(2))?;
+        assert_eq!(vals.len(), 2);
+        assert_eq!(vals[0], Some(vec![Some(10)]));
+        assert_eq!(vals[1], Some(vec![Some(20)]));
+
+        // Remaining group (was index 2, now shifted to 0)
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals.len(), 1);
+        assert_eq!(vals[0], Some(vec![Some(30)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_emit_first_frees_batches() -> Result<()> {
+        // Batch 0 has rows only for group 0; batch 1 has rows for
+        // both groups. After emitting group 0, batch 0 should be
+        // dropped entirely and batch 1 should be compacted to the
+        // retained row(s).
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let batch0: ArrayRef = Arc::new(Int32Array::from(vec![10, 20]));
+        acc.update_batch(&[batch0], &[0, 0], None, 2)?;
+
+        let batch1: ArrayRef = Arc::new(Int32Array::from(vec![30, 40]));
+        acc.update_batch(&[batch1], &[0, 1], None, 2)?;
+
+        assert_eq!(acc.batches.len(), 2);
+        assert!(!acc.batches[0].is_empty());
+        assert!(!acc.batches[1].is_empty());
+
+        // Emit group 0. Batch 0 is only referenced by group 0, so it
+        // should be removed. Batch 1 is mixed, so it should be compacted
+        // to contain only the retained row for group 1.
+        let vals = eval_i32_lists(&mut acc, EmitTo::First(1))?;
+        assert_eq!(vals[0], Some(vec![Some(10), Some(20), Some(30)]));
+
+        assert_eq!(acc.batches.len(), 1);
+        let retained = acc.batches[0]
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(retained.values(), &[40]);
+        assert_eq!(acc.batch_entries, vec![vec![(0, 0)]]);
+
+        // Emit remaining group 1
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(40)]));
+
+        assert!(acc.batches.is_empty());
+        assert_eq!(acc.size(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_emit_first_compacts_mixed_batches() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let batch: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30, 40]));
+        acc.update_batch(&[batch], &[0, 1, 0, 1], None, 2)?;
+
+        let size_before = acc.size();
+        let vals = eval_i32_lists(&mut acc, EmitTo::First(1))?;
+        assert_eq!(vals[0], Some(vec![Some(10), Some(30)]));
+
+        assert_eq!(acc.num_groups, 1);
+        assert_eq!(acc.batches.len(), 1);
+        let retained = acc.batches[0]
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(retained.values(), &[20, 40]);
+        assert_eq!(acc.batch_entries, vec![vec![(0, 0), (0, 1)]]);
+        assert!(acc.size() < size_before);
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(20), Some(40)]));
+        assert_eq!(acc.size(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_emit_all_releases_capacity() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let batch: ArrayRef = Arc::new(Int32Array::from_iter_values(0..64));
+        acc.update_batch(
+            &[batch],
+            &(0..64).map(|i| i % 4).collect::<Vec<_>>(),
+            None,
+            4,
+        )?;
+
+        assert!(acc.size() > 0);
+        let _ = eval_i32_lists(&mut acc, EmitTo::All)?;
+
+        assert_eq!(acc.size(), 0);
+        assert_eq!(acc.batches.capacity(), 0);
+        assert_eq!(acc.batch_entries.capacity(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_null_groups() -> Result<()> {
+        // Groups that never receive values should produce null
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1]));
+        // Only group 0 gets a value, groups 1 and 2 are empty
+        acc.update_batch(&[values], &[0], None, 3)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals, vec![Some(vec![Some(1)]), None, None]);
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_ignore_nulls() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, true);
+
+        let values: ArrayRef =
+            Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None]));
+        acc.update_batch(&[values], &[0, 0, 1, 1], None, 2)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        // Group 0: only non-null value is 1
+        assert_eq!(vals[0], Some(vec![Some(1)]));
+        // Group 1: only non-null value is 3
+        assert_eq!(vals[1], Some(vec![Some(3)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_opt_filter() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        // Use a mix of false and null to filter out rows — both should
+        // be skipped.
+        let filter = BooleanArray::from(vec![Some(true), None, Some(true), Some(false)]);
+        acc.update_batch(&[values], &[0, 0, 1, 1], Some(&filter), 2)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(1)])); // row 1 filtered (null)
+        assert_eq!(vals[1], Some(vec![Some(3)])); // row 3 filtered (false)
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_state_merge_roundtrip() -> Result<()> {
+        // Accumulator 1: update_batch, then merge, then update_batch again.
+        // Verifies that values appear in chronological insertion order.
+        let mut acc1 = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+        acc1.update_batch(&[values], &[0, 1], None, 2)?;
+
+        // Accumulator 2
+        let mut acc2 = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![3, 4]));
+        acc2.update_batch(&[values], &[0, 1], None, 2)?;
+
+        // Merge acc2's state into acc1
+        let state = acc2.state(EmitTo::All)?;
+        acc1.merge_batch(&state, &[0, 1], None, 2)?;
+
+        // Another update_batch on acc1 after the merge
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![5, 6]));
+        acc1.update_batch(&[values], &[0, 1], None, 2)?;
+
+        // Each group's values in insertion order:
+        // group 0: update(1), merge(3), update(5) → [1, 3, 5]
+        // group 1: update(2), merge(4), update(6) → [2, 4, 6]
+        let vals = eval_i32_lists(&mut acc1, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(1), Some(3), Some(5)]));
+        assert_eq!(vals[1], Some(vec![Some(2), Some(4), Some(6)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_convert_to_state() -> Result<()> {
+        let acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(10), None, Some(30)]));
+        let state = acc.convert_to_state(&[values], None)?;
+
+        assert_eq!(state.len(), 1);
+        let vals = list_array_to_i32_vecs(state[0].as_list::<i32>());
+        assert_eq!(
+            vals,
+            vec![
+                Some(vec![Some(10)]),
+                Some(vec![None]), // null preserved inside list, not promoted
+                Some(vec![Some(30)]),
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_convert_to_state_with_filter() -> Result<()> {
+        let acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30]));
+        let filter = BooleanArray::from(vec![true, false, true]);
+        let state = acc.convert_to_state(&[values], Some(&filter))?;
+
+        let vals = list_array_to_i32_vecs(state[0].as_list::<i32>());
+        assert_eq!(
+            vals,
+            vec![
+                Some(vec![Some(10)]),
+                None, // filtered
+                Some(vec![Some(30)]),
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_convert_to_state_merge_preserves_nulls() -> Result<()> {
+        // Verifies that null values survive the convert_to_state -> merge_batch
+        // round-trip when ignore_nulls is false (default null handling).
+        let acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
+        let state = acc.convert_to_state(&[values], None)?;
+
+        // Feed state into a new accumulator via merge_batch
+        let mut acc2 = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+        acc2.merge_batch(&state, &[0, 0, 1], None, 2)?;
+
+        // Group 0 received rows 0 ([1]) and 1 ([NULL]) → [1, NULL]
+        let vals = eval_i32_lists(&mut acc2, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(1), None]));
+        // Group 1 received row 2 ([3]) → [3]
+        assert_eq!(vals[1], Some(vec![Some(3)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_convert_to_state_merge_ignore_nulls() -> Result<()> {
+        // Verifies that null values are dropped in the convert_to_state ->
+        // merge_batch round-trip when ignore_nulls is true.
+        let acc = ArrayAggGroupsAccumulator::new(DataType::Int32, true);
+
+        let values: ArrayRef =
+            Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None]));
+        let state = acc.convert_to_state(&[values], None)?;
+
+        let list = state[0].as_list::<i32>();
+        // Rows 0 and 2 are valid lists; rows 1 and 3 are null list entries
+        assert!(!list.is_null(0));
+        assert!(list.is_null(1));
+        assert!(!list.is_null(2));
+        assert!(list.is_null(3));
+
+        // Feed state into a new accumulator via merge_batch
+        let mut acc2 = ArrayAggGroupsAccumulator::new(DataType::Int32, true);
+        acc2.merge_batch(&state, &[0, 0, 1, 1], None, 2)?;
+
+        // Group 0: received [1] and null (skipped) → [1]
+        let vals = eval_i32_lists(&mut acc2, EmitTo::All)?;
+        assert_eq!(vals[0], Some(vec![Some(1)]));
+        // Group 1: received [3] and null (skipped) → [3]
+        assert_eq!(vals[1], Some(vec![Some(3)]));
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_all_groups_empty() -> Result<()> {
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, false);
+
+        // Create groups but don't add any values (all filtered out)
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+        let filter = BooleanArray::from(vec![false, false]);
+        acc.update_batch(&[values], &[0, 1], Some(&filter), 2)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals, vec![None, None]);
+
+        Ok(())
+    }
+
+    #[test]
+    fn groups_accumulator_ignore_nulls_all_null_group() -> Result<()> {
+        // When ignore_nulls is true and a group receives only nulls,
+        // it should produce a null output
+        let mut acc = ArrayAggGroupsAccumulator::new(DataType::Int32, true);
+
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![None, Some(1), None]));
+        acc.update_batch(&[values], &[0, 1, 0], None, 2)?;
+
+        let vals = eval_i32_lists(&mut acc, EmitTo::All)?;
+        assert_eq!(vals[0], None); // group 0 got only nulls, all filtered
+        assert_eq!(vals[1], Some(vec![Some(1)])); // group 1 got value 1
+
+        Ok(())
+    }
 }
diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs
index 3c1d33e093b50..1ddb549ae87d5 100644
--- a/datafusion/functions-aggregate/src/average.rs
+++ b/datafusion/functions-aggregate/src/average.rs
@@ -24,27 +24,29 @@ use arrow::array::{
 
 use arrow::compute::sum;
 use arrow::datatypes::{
-    i256, ArrowNativeType, DataType, Decimal128Type, Decimal256Type, DecimalType,
+    ArrowNativeType, DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE,
+    DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION,
+    DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DataType,
+    Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, DecimalType,
     DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
-    DurationSecondType, Field, FieldRef, Float64Type, TimeUnit, UInt64Type,
-};
-use datafusion_common::{
-    exec_err, not_impl_err, utils::take_function_args, Result, ScalarValue,
+    DurationSecondType, Field, FieldRef, Float64Type, TimeUnit, UInt64Type, i256,
 };
+use datafusion_common::types::{NativeType, logical_float64};
+use datafusion_common::{Result, ScalarValue, exec_err, not_impl_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::type_coercion::aggregates::{avg_return_type, coerce_avg_type};
 use datafusion_expr::utils::format_state_name;
-use datafusion_expr::Volatility::Immutable;
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator,
-    ReversedUDAF, Signature,
+    Accumulator, AggregateUDFImpl, Coercion, Documentation, EmitTo, Expr,
+    GroupsAccumulator, ReversedUDAF, Signature, TypeSignature, TypeSignatureClass,
+    Volatility,
+};
+use datafusion_functions_aggregate_common::aggregate::avg_distinct::{
+    DecimalDistinctAvgAccumulator, Float64DistinctAvgAccumulator,
 };
-
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState;
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::{
     filtered_null_mask, set_nulls,
 };
-
 use datafusion_functions_aggregate_common::utils::DecimalAverager;
 use datafusion_macros::user_doc;
 use log::debug;
@@ -61,6 +63,17 @@ make_udaf_expr_and_func!(
     avg_udaf
 );
 
+pub fn avg_distinct(expr: Expr) -> Expr {
+    Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new_udf(
+        avg_udaf(),
+        vec![expr],
+        true,
+        None,
+        vec![],
+        None,
+    ))
+}
+
 #[user_doc(
     doc_section(label = "General Functions"),
     description = "Returns the average of numeric values in the specified column.",
@@ -75,7 +88,7 @@ make_udaf_expr_and_func!(
 ```"#,
     standard_argument(name = "expression",)
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Avg {
     signature: Signature,
     aliases: Vec<String>,
@@ -84,7 +97,24 @@ pub struct Avg {
 impl Avg {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Immutable),
+            // Supported types smallint, int, bigint, real, double precision, decimal, or interval
+            // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Decimal,
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Duration,
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_float64()),
+                        vec![TypeSignatureClass::Integer, TypeSignatureClass::Float],
+                        NativeType::Float64,
+                    )]),
+                ],
+                Volatility::Immutable,
+            ),
             aliases: vec![String::from("mean")],
         }
     }
@@ -110,83 +140,213 @@ impl AggregateUDFImpl for Avg {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        avg_return_type(self.name(), &arg_types[0])
+        match &arg_types[0] {
+            DataType::Decimal32(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL32_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal32(new_precision, new_scale))
+            }
+            DataType::Decimal64(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL64_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal64(new_precision, new_scale))
+            }
+            DataType::Decimal128(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL128_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal128(new_precision, new_scale))
+            }
+            DataType::Decimal256(precision, scale) => {
+                // In the spark, the result type is DECIMAL(min(38,precision+4), min(38,scale+4)).
+                // Ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L66
+                let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 4);
+                let new_scale = DECIMAL256_MAX_SCALE.min(*scale + 4);
+                Ok(DataType::Decimal256(new_precision, new_scale))
+            }
+            DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
+            _ => Ok(DataType::Float64),
+        }
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        if acc_args.is_distinct {
-            return exec_err!("avg(DISTINCT) aggregations are not available");
-        }
+        let data_type = acc_args.expr_fields[0].data_type();
         use DataType::*;
 
-        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
         // instantiate specialized accumulator based for the type
-        match (&data_type, acc_args.return_field.data_type()) {
-            (Float64, Float64) => Ok(Box::<AvgAccumulator>::default()),
-            (
-                Decimal128(sum_precision, sum_scale),
-                Decimal128(target_precision, target_scale),
-            ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal128Type> {
-                sum: None,
-                count: 0,
-                sum_scale: *sum_scale,
-                sum_precision: *sum_precision,
-                target_precision: *target_precision,
-                target_scale: *target_scale,
-            })),
+        if acc_args.is_distinct {
+            match (data_type, acc_args.return_type()) {
+                // Numeric types are converted to Float64 via `coerce_avg_type` during logical plan creation
+                (Float64, _) => Ok(Box::new(Float64DistinctAvgAccumulator::default())),
+
+                (
+                    Decimal32(_, scale),
+                    Decimal32(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal32Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
+                                (
+                    Decimal64(_, scale),
+                    Decimal64(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal64Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
+                (
+                    Decimal128(_, scale),
+                    Decimal128(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal128Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
 
-            (
-                Decimal256(sum_precision, sum_scale),
-                Decimal256(target_precision, target_scale),
-            ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal256Type> {
-                sum: None,
-                count: 0,
-                sum_scale: *sum_scale,
-                sum_precision: *sum_precision,
-                target_precision: *target_precision,
-                target_scale: *target_scale,
-            })),
-
-            (Duration(time_unit), Duration(result_unit)) => {
-                Ok(Box::new(DurationAvgAccumulator {
+                (
+                    Decimal256(_, scale),
+                    Decimal256(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalDistinctAvgAccumulator::<Decimal256Type>::with_decimal_params(
+                    *scale,
+                    *target_precision,
+                    *target_scale,
+                ))),
+
+                (dt, return_type) => exec_err!(
+                    "AVG(DISTINCT) for ({} --> {}) not supported",
+                    dt,
+                    return_type
+                ),
+            }
+        } else {
+            match (&data_type, acc_args.return_type()) {
+                (Float64, Float64) => Ok(Box::<AvgAccumulator>::default()),
+                (
+                    Decimal32(sum_precision, sum_scale),
+                    Decimal32(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal32Type> {
                     sum: None,
                     count: 0,
-                    time_unit: *time_unit,
-                    result_unit: *result_unit,
-                }))
-            }
+                    sum_scale: *sum_scale,
+                    sum_precision: *sum_precision,
+                    target_precision: *target_precision,
+                    target_scale: *target_scale,
+                })),
+                (
+                    Decimal64(sum_precision, sum_scale),
+                    Decimal64(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal64Type> {
+                    sum: None,
+                    count: 0,
+                    sum_scale: *sum_scale,
+                    sum_precision: *sum_precision,
+                    target_precision: *target_precision,
+                    target_scale: *target_scale,
+                })),
+                (
+                    Decimal128(sum_precision, sum_scale),
+                    Decimal128(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal128Type> {
+                    sum: None,
+                    count: 0,
+                    sum_scale: *sum_scale,
+                    sum_precision: *sum_precision,
+                    target_precision: *target_precision,
+                    target_scale: *target_scale,
+                })),
+
+                (
+                    Decimal256(sum_precision, sum_scale),
+                    Decimal256(target_precision, target_scale),
+                ) => Ok(Box::new(DecimalAvgAccumulator::<Decimal256Type> {
+                    sum: None,
+                    count: 0,
+                    sum_scale: *sum_scale,
+                    sum_precision: *sum_precision,
+                    target_precision: *target_precision,
+                    target_scale: *target_scale,
+                })),
+
+                (Duration(time_unit), Duration(result_unit)) => {
+                    Ok(Box::new(DurationAvgAccumulator {
+                        sum: None,
+                        count: 0,
+                        time_unit: *time_unit,
+                        result_unit: *result_unit,
+                    }))
+                }
 
-            _ => exec_err!(
-                "AvgAccumulator for ({} --> {})",
-                &data_type,
-                acc_args.return_field.data_type()
-            ),
+                (dt, return_type) => {
+                    exec_err!("AvgAccumulator for ({} --> {})", dt, return_type)
+                }
+            }
         }
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![
-            Field::new(
-                format_state_name(args.name, "count"),
-                DataType::UInt64,
-                true,
-            ),
-            Field::new(
-                format_state_name(args.name, "sum"),
-                args.input_fields[0].data_type().clone(),
-                true,
-            ),
-        ]
-        .into_iter()
-        .map(Arc::new)
-        .collect())
+        if args.is_distinct {
+            // Decimal accumulator actually uses a different precision during accumulation,
+            // see DecimalDistinctAvgAccumulator::with_decimal_params
+            let dt = match args.input_fields[0].data_type() {
+                DataType::Decimal32(_, scale) => {
+                    DataType::Decimal32(DECIMAL32_MAX_PRECISION, *scale)
+                }
+                DataType::Decimal64(_, scale) => {
+                    DataType::Decimal64(DECIMAL64_MAX_PRECISION, *scale)
+                }
+                DataType::Decimal128(_, scale) => {
+                    DataType::Decimal128(DECIMAL128_MAX_PRECISION, *scale)
+                }
+                DataType::Decimal256(_, scale) => {
+                    DataType::Decimal256(DECIMAL256_MAX_PRECISION, *scale)
+                }
+                _ => args.return_type().clone(),
+            };
+            // Similar to datafusion_functions_aggregate::sum::Sum::state_fields
+            // since the accumulator uses DistinctSumAccumulator internally.
+            Ok(vec![
+                Field::new_list(
+                    format_state_name(args.name, "avg distinct"),
+                    Field::new_list_field(dt, true),
+                    false,
+                )
+                .into(),
+            ])
+        } else {
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, "count"),
+                    DataType::UInt64,
+                    true,
+                ),
+                Field::new(
+                    format_state_name(args.name, "sum"),
+                    args.input_fields[0].data_type().clone(),
+                    true,
+                ),
+            ]
+            .into_iter()
+            .map(Arc::new)
+            .collect())
+        }
     }
 
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         matches!(
             args.return_field.data_type(),
-            DataType::Float64 | DataType::Decimal128(_, _) | DataType::Duration(_)
-        )
+            DataType::Float64
+                | DataType::Decimal32(_, _)
+                | DataType::Decimal64(_, _)
+                | DataType::Decimal128(_, _)
+                | DataType::Decimal256(_, _)
+                | DataType::Duration(_)
+        ) && !args.is_distinct
     }
 
     fn create_groups_accumulator(
@@ -195,16 +355,55 @@ impl AggregateUDFImpl for Avg {
     ) -> Result<Box<dyn GroupsAccumulator>> {
         use DataType::*;
 
-        let data_type = args.exprs[0].data_type(args.schema)?;
+        let data_type = args.expr_fields[0].data_type();
+
         // instantiate specialized accumulator based for the type
-        match (&data_type, args.return_field.data_type()) {
+        match (data_type, args.return_field.data_type()) {
             (Float64, Float64) => {
                 Ok(Box::new(AvgGroupsAccumulator::<Float64Type, _>::new(
-                    &data_type,
+                    data_type,
                     args.return_field.data_type(),
                     |sum: f64, count: u64| Ok(sum / count as f64),
                 )))
             }
+            (
+                Decimal32(_sum_precision, sum_scale),
+                Decimal32(target_precision, target_scale),
+            ) => {
+                let decimal_averager = DecimalAverager::<Decimal32Type>::try_new(
+                    *sum_scale,
+                    *target_precision,
+                    *target_scale,
+                )?;
+
+                let avg_fn =
+                    move |sum: i32, count: u64| decimal_averager.avg(sum, count as i32);
+
+                Ok(Box::new(AvgGroupsAccumulator::<Decimal32Type, _>::new(
+                    data_type,
+                    args.return_field.data_type(),
+                    avg_fn,
+                )))
+            }
+            (
+                Decimal64(_sum_precision, sum_scale),
+                Decimal64(target_precision, target_scale),
+            ) => {
+                let decimal_averager = DecimalAverager::<Decimal64Type>::try_new(
+                    *sum_scale,
+                    *target_precision,
+                    *target_scale,
+                )?;
+
+                let avg_fn =
+                    move |sum: i64, count: u64| decimal_averager.avg(sum, count as i64);
+
+                Ok(Box::new(AvgGroupsAccumulator::<Decimal64Type, _>::new(
+                    data_type,
+                    args.return_field.data_type(),
+                    avg_fn,
+                )))
+            }
             (
                 Decimal128(_sum_precision, sum_scale),
                 Decimal128(target_precision, target_scale),
@@ -219,7 +418,7 @@ impl AggregateUDFImpl for Avg {
                     move |sum: i128, count: u64| decimal_averager.avg(sum, count as i128);
 
                 Ok(Box::new(AvgGroupsAccumulator::<Decimal128Type, _>::new(
-                    &data_type,
+                    data_type,
                     args.return_field.data_type(),
                     avg_fn,
                 )))
@@ -240,7 +439,7 @@ impl AggregateUDFImpl for Avg {
                 };
 
                 Ok(Box::new(AvgGroupsAccumulator::<Decimal256Type, _>::new(
-                    &data_type,
+                    data_type,
                     args.return_field.data_type(),
                     avg_fn,
                 )))
@@ -254,7 +453,7 @@ impl AggregateUDFImpl for Avg {
                         DurationSecondType,
                         _,
                     >::new(
-                        &data_type,
+                        data_type,
                         args.return_type(),
                         avg_fn,
                     ))),
@@ -262,7 +461,7 @@ impl AggregateUDFImpl for Avg {
                         DurationMillisecondType,
                         _,
                     >::new(
-                        &data_type,
+                        data_type,
                         args.return_type(),
                         avg_fn,
                     ))),
@@ -270,7 +469,7 @@ impl AggregateUDFImpl for Avg {
                         DurationMicrosecondType,
                         _,
                     >::new(
-                        &data_type,
+                        data_type,
                         args.return_type(),
                         avg_fn,
                     ))),
@@ -278,7 +477,7 @@ impl AggregateUDFImpl for Avg {
                         DurationNanosecondType,
                         _,
                     >::new(
-                        &data_type,
+                        data_type,
                         args.return_type(),
                         avg_fn,
                     ))),
@@ -301,11 +500,6 @@ impl AggregateUDFImpl for Avg {
         ReversedUDAF::Identical
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        let [args] = take_function_args(self.name(), arg_types)?;
-        coerce_avg_type(self.name(), std::slice::from_ref(args))
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -388,7 +582,7 @@ impl<T: DecimalType + ArrowNumericType + Debug> Accumulator for DecimalAvgAccumu
         self.count += (values.len() - values.null_count()) as u64;
 
         if let Some(x) = sum(values) {
-            let v = self.sum.get_or_insert(T::Native::default());
+            let v = self.sum.get_or_insert_with(T::Native::default);
             self.sum = Some(v.add_wrapping(x));
         }
         Ok(())
@@ -433,7 +627,7 @@ impl<T: DecimalType + ArrowNumericType + Debug> Accumulator for DecimalAvgAccumu
 
         // sums are summed
         if let Some(x) = sum(states[1].as_primitive::<T>()) {
-            let v = self.sum.get_or_insert(T::Native::default());
+            let v = self.sum.get_or_insert_with(T::Native::default);
             self.sum = Some(v.add_wrapping(x));
         }
         Ok(())
@@ -560,7 +754,7 @@ impl Accumulator for DurationAvgAccumulator {
 struct AvgGroupsAccumulator<T, F>
 where
     T: ArrowNumericType + Send,
-    F: Fn(T::Native, u64) -> Result<T::Native> + Send,
+    F: Fn(T::Native, u64) -> Result<T::Native> + Send + 'static,
 {
     /// The type of the internal sum
     sum_data_type: DataType,
@@ -584,11 +778,11 @@ where
 impl<T, F> AvgGroupsAccumulator<T, F>
 where
     T: ArrowNumericType + Send,
-    F: Fn(T::Native, u64) -> Result<T::Native> + Send,
+    F: Fn(T::Native, u64) -> Result<T::Native> + Send + 'static,
 {
     pub fn new(sum_data_type: &DataType, return_data_type: &DataType, avg_fn: F) -> Self {
         debug!(
-            "AvgGroupsAccumulator ({}, sum type: {sum_data_type:?}) --> {return_data_type:?}",
+            "AvgGroupsAccumulator ({}, sum type: {sum_data_type}) --> {return_data_type}",
             std::any::type_name::<T>()
         );
 
@@ -606,7 +800,7 @@ where
 impl<T, F> GroupsAccumulator for AvgGroupsAccumulator<T, F>
 where
     T: ArrowNumericType + Send,
-    F: Fn(T::Native, u64) -> Result<T::Native> + Send,
+    F: Fn(T::Native, u64) -> Result<T::Native> + Send + 'static,
 {
     fn update_batch(
         &mut self,
@@ -627,7 +821,8 @@ where
             opt_filter,
             total_num_groups,
             |group_index, new_value| {
-                let sum = &mut self.sums[group_index];
+                // SAFETY: group_index is guaranteed to be in bounds
+                let sum = unsafe { self.sums.get_unchecked_mut(group_index) };
                 *sum = sum.add_wrapping(new_value);
 
                 self.counts[group_index] += 1;
@@ -642,12 +837,16 @@ where
         let sums = emit_to.take_needed(&mut self.sums);
         let nulls = self.null_state.build(emit_to);
 
-        assert_eq!(nulls.len(), sums.len());
+        if let Some(nulls) = &nulls {
+            assert_eq!(nulls.len(), sums.len());
+        }
         assert_eq!(counts.len(), sums.len());
 
         // don't evaluate averages with null inputs to avoid errors on null values
 
-        let array: PrimitiveArray<T> = if nulls.null_count() > 0 {
+        let array: PrimitiveArray<T> = if let Some(nulls) = &nulls
+            && nulls.null_count() > 0
+        {
             let mut builder = PrimitiveBuilder::<T>::with_capacity(nulls.len())
                 .with_data_type(self.return_data_type.clone());
             let iter = sums.into_iter().zip(counts).zip(nulls.iter());
@@ -666,7 +865,7 @@ where
                 .zip(counts.into_iter())
                 .map(|(sum, count)| (self.avg_fn)(sum, count))
                 .collect::<Result<Vec<_>>>()?;
-            PrimitiveArray::new(averages.into(), Some(nulls)) // no copy
+            PrimitiveArray::new(averages.into(), nulls) // no copy
                 .with_data_type(self.return_data_type.clone())
         };
 
@@ -676,7 +875,6 @@ where
     // return arrays for sums and counts
     fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
         let nulls = self.null_state.build(emit_to);
-        let nulls = Some(nulls);
 
         let counts = emit_to.take_needed(&mut self.counts);
         let counts = UInt64Array::new(counts.into(), nulls.clone()); // zero copy
@@ -710,7 +908,9 @@ where
             opt_filter,
             total_num_groups,
             |group_index, partial_count| {
-                self.counts[group_index] += partial_count;
+                // SAFETY: group_index is guaranteed to be in bounds
+                let count = unsafe { self.counts.get_unchecked_mut(group_index) };
+                *count += partial_count;
             },
         );
 
@@ -722,7 +922,8 @@ where
             opt_filter,
             total_num_groups,
             |group_index, new_value: <T as ArrowPrimitiveType>::Native| {
-                let sum = &mut self.sums[group_index];
+                // SAFETY: group_index is guaranteed to be in bounds
+                let sum = unsafe { self.sums.get_unchecked_mut(group_index) };
                 *sum = sum.add_wrapping(new_value);
             },
         );
diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs
index 4512162ba5d33..48edbd5d4cbfe 100644
--- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs
+++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs
@@ -20,27 +20,28 @@
 use std::any::Any;
 use std::collections::HashSet;
 use std::fmt::{Display, Formatter};
+use std::hash::Hash;
 use std::mem::{size_of, size_of_val};
 
-use ahash::RandomState;
-use arrow::array::{downcast_integer, Array, ArrayRef, AsArray};
+use arrow::array::{Array, ArrayRef, AsArray, downcast_integer};
 use arrow::datatypes::{
-    ArrowNativeType, ArrowNumericType, DataType, Field, FieldRef, Int16Type, Int32Type,
-    Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+    ArrowNativeType, ArrowNumericType, DataType, Field, FieldRef, Int8Type, Int16Type,
+    Int32Type, Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type,
 };
+use datafusion_common::hash_utils::RandomState;
 
 use datafusion_common::cast::as_list_array;
-use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, not_impl_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::type_coercion::aggregates::INTEGERS;
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF,
-    Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Coercion, Documentation, GroupsAccumulator,
+    ReversedUDAF, Signature, TypeSignatureClass, Volatility,
 };
 
-use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL;
+use datafusion_doc::aggregate_doc_sections::DOC_SECTION_GENERAL;
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
+use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
 use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
 use std::sync::LazyLock;
 
@@ -88,6 +89,7 @@ macro_rules! accumulator_helper {
 macro_rules! downcast_bitwise_accumulator {
     ($args:ident, $opr:expr, $is_distinct: expr) => {
         match $args.return_field.data_type() {
+            DataType::Null => Ok(Box::new(NoopAccumulator::default())),
             DataType::Int8 => accumulator_helper!(Int8Type, $opr, $is_distinct),
             DataType::Int16 => accumulator_helper!(Int16Type, $opr, $is_distinct),
             DataType::Int32 => accumulator_helper!(Int32Type, $opr, $is_distinct),
@@ -196,7 +198,7 @@ make_bitwise_udaf_expr_and_func!(
 );
 
 /// The different types of bitwise operations that can be performed.
-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
 enum BitwiseOperationType {
     And,
     Or,
@@ -210,7 +212,7 @@ impl Display for BitwiseOperationType {
 }
 
 /// [BitwiseOperation] struct encapsulates information about a bitwise operation.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct BitwiseOperation {
     signature: Signature,
     /// `operation` indicates the type of bitwise operation to be performed.
@@ -227,7 +229,10 @@ impl BitwiseOperation {
     ) -> Self {
         Self {
             operation: operator,
-            signature: Signature::uniform(1, INTEGERS.to_vec(), Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Integer)],
+                Volatility::Immutable,
+            ),
             func_name,
             documentation,
         }
@@ -248,15 +253,7 @@ impl AggregateUDFImpl for BitwiseOperation {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        let arg_type = &arg_types[0];
-        if !arg_type.is_integer() {
-            return exec_err!(
-                "[return_type] {} not supported for {}",
-                self.name(),
-                arg_type
-            );
-        }
-        Ok(arg_type.clone())
+        Ok(arg_types[0].clone())
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
@@ -264,24 +261,37 @@ impl AggregateUDFImpl for BitwiseOperation {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        if self.operation == BitwiseOperationType::Xor && args.is_distinct {
-            Ok(vec![Field::new_list(
-                format_state_name(
-                    args.name,
-                    format!("{} distinct", self.name()).as_str(),
-                ),
-                // See COMMENTS.md to understand why nullable is set to true
-                Field::new_list_field(args.return_type().clone(), true),
-                false,
-            )
-            .into()])
+        if args.input_fields[0].data_type().is_null() {
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, self.name()),
+                    DataType::Null,
+                    true,
+                )
+                .into(),
+            ])
+        } else if self.operation == BitwiseOperationType::Xor && args.is_distinct {
+            Ok(vec![
+                Field::new_list(
+                    format_state_name(
+                        args.name,
+                        format!("{} distinct", self.name()).as_str(),
+                    ),
+                    // See COMMENTS.md to understand why nullable is set to true
+                    Field::new_list_field(args.return_type().clone(), true),
+                    false,
+                )
+                .into(),
+            ])
         } else {
-            Ok(vec![Field::new(
-                format_state_name(args.name, self.name()),
-                args.return_field.data_type().clone(),
-                true,
-            )
-            .into()])
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, self.name()),
+                    args.return_field.data_type().clone(),
+                    true,
+                )
+                .into(),
+            ])
         }
     }
 
@@ -381,7 +391,7 @@ where
 {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         if let Some(x) = arrow::compute::bit_or(values[0].as_primitive::<T>()) {
-            let v = self.value.get_or_insert(T::Native::usize_as(0));
+            let v = self.value.get_or_insert_with(|| T::Native::usize_as(0));
             *v = *v | x;
         }
         Ok(())
@@ -426,7 +436,7 @@ where
 {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         if let Some(x) = arrow::compute::bit_xor(values[0].as_primitive::<T>()) {
-            let v = self.value.get_or_insert(T::Native::usize_as(0));
+            let v = self.value.get_or_insert_with(|| T::Native::usize_as(0));
             *v = *v ^ x;
         }
         Ok(())
@@ -478,7 +488,7 @@ impl<T: ArrowNumericType> Default for DistinctBitXorAccumulator<T> {
 
 impl<T: ArrowNumericType> Accumulator for DistinctBitXorAccumulator<T>
 where
-    T::Native: std::ops::BitXor<Output = T::Native> + std::hash::Hash + Eq,
+    T::Native: std::ops::BitXor<Output = T::Native> + Hash + Eq,
 {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         if values.is_empty() {
diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs b/datafusion/functions-aggregate/src/bool_and_or.rs
index e5de6d76217fb..77b99cd1ae993 100644
--- a/datafusion/functions-aggregate/src/bool_and_or.rs
+++ b/datafusion/functions-aggregate/src/bool_and_or.rs
@@ -28,10 +28,10 @@ use arrow::datatypes::Field;
 use arrow::datatypes::{DataType, FieldRef};
 
 use datafusion_common::internal_err;
-use datafusion_common::{downcast_value, not_impl_err};
 use datafusion_common::{Result, ScalarValue};
+use datafusion_common::{downcast_value, not_impl_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity};
+use datafusion_expr::utils::{AggregateOrderSensitivity, format_state_name};
 use datafusion_expr::{
     Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF,
     Signature, Volatility,
@@ -106,7 +106,7 @@ make_udaf_expr_and_func!(
     standard_argument(name = "expression", prefix = "The")
 )]
 /// BOOL_AND aggregate expression
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct BoolAnd {
     signature: Signature,
 }
@@ -114,11 +114,7 @@ pub struct BoolAnd {
 impl BoolAnd {
     fn new() -> Self {
         Self {
-            signature: Signature::uniform(
-                1,
-                vec![DataType::Boolean],
-                Volatility::Immutable,
-            ),
+            signature: Signature::exact(vec![DataType::Boolean], Volatility::Immutable),
         }
     }
 }
@@ -151,12 +147,14 @@ impl AggregateUDFImpl for BoolAnd {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![Field::new(
-            format_state_name(args.name, self.name()),
-            DataType::Boolean,
-            true,
-        )
-        .into()])
+        Ok(vec![
+            Field::new(
+                format_state_name(args.name, self.name()),
+                DataType::Boolean,
+                true,
+            )
+            .into(),
+        ])
     }
 
     fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
@@ -179,10 +177,6 @@ impl AggregateUDFImpl for BoolAnd {
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn order_sensitivity(&self) -> AggregateOrderSensitivity {
         AggregateOrderSensitivity::Insensitive
     }
@@ -245,7 +239,7 @@ impl Accumulator for BoolAndAccumulator {
     standard_argument(name = "expression", prefix = "The")
 )]
 /// BOOL_OR aggregate expression
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct BoolOr {
     signature: Signature,
 }
@@ -253,11 +247,7 @@ pub struct BoolOr {
 impl BoolOr {
     fn new() -> Self {
         Self {
-            signature: Signature::uniform(
-                1,
-                vec![DataType::Boolean],
-                Volatility::Immutable,
-            ),
+            signature: Signature::exact(vec![DataType::Boolean], Volatility::Immutable),
         }
     }
 }
@@ -290,12 +280,14 @@ impl AggregateUDFImpl for BoolOr {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![Field::new(
-            format_state_name(args.name, self.name()),
-            DataType::Boolean,
-            true,
-        )
-        .into()])
+        Ok(vec![
+            Field::new(
+                format_state_name(args.name, self.name()),
+                DataType::Boolean,
+                true,
+            )
+            .into(),
+        ])
     }
 
     fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
@@ -319,10 +311,6 @@ impl AggregateUDFImpl for BoolOr {
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn order_sensitivity(&self) -> AggregateOrderSensitivity {
         AggregateOrderSensitivity::Insensitive
     }
diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs
index 0a7345245ca8c..6c76c6e940099 100644
--- a/datafusion/functions-aggregate/src/correlation.rs
+++ b/datafusion/functions-aggregate/src/correlation.rs
@@ -23,10 +23,10 @@ use std::mem::size_of_val;
 use std::sync::Arc;
 
 use arrow::array::{
-    downcast_array, Array, AsArray, BooleanArray, Float64Array, NullBufferBuilder,
-    UInt64Array,
+    Array, AsArray, BooleanArray, Float64Array, NullBufferBuilder, UInt64Array,
+    downcast_array,
 };
-use arrow::compute::{and, filter, is_not_null, kernels::cast};
+use arrow::compute::{and, filter, is_not_null};
 use arrow::datatypes::{FieldRef, Float64Type, UInt64Type};
 use arrow::{
     array::ArrayRef,
@@ -38,12 +38,11 @@ use log::debug;
 
 use crate::covariance::CovarianceAccumulator;
 use crate::stddev::StddevAccumulator;
-use datafusion_common::{plan_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
     function::{AccumulatorArgs, StateFieldsArgs},
-    type_coercion::aggregates::NUMERICS,
     utils::format_state_name,
-    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
 };
 use datafusion_functions_aggregate_common::stats::StatsType;
 use datafusion_macros::user_doc;
@@ -71,7 +70,7 @@ make_udaf_expr_and_func!(
     standard_argument(name = "expression1", prefix = "First"),
     standard_argument(name = "expression2", prefix = "Second")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Correlation {
     signature: Signature,
 }
@@ -83,10 +82,15 @@ impl Default for Correlation {
 }
 
 impl Correlation {
-    /// Create a new COVAR_POP aggregate function
+    /// Create a new CORR aggregate function
     pub fn new() -> Self {
         Self {
-            signature: Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable),
+            signature: Signature::exact(
+                vec![DataType::Float64, DataType::Float64],
+                Volatility::Immutable,
+            )
+            .with_parameter_names(vec!["y".to_string(), "x".to_string()])
+            .expect("valid parameter names for corr"),
         }
     }
 }
@@ -105,11 +109,7 @@ impl AggregateUDFImpl for Correlation {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("Correlation requires numeric input types");
-        }
-
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
         Ok(DataType::Float64)
     }
 
@@ -200,15 +200,28 @@ impl Accumulator for CorrelationAccumulator {
         let stddev1 = self.stddev1.evaluate()?;
         let stddev2 = self.stddev2.evaluate()?;
 
-        if let ScalarValue::Float64(Some(c)) = covar {
-            if let ScalarValue::Float64(Some(s1)) = stddev1 {
-                if let ScalarValue::Float64(Some(s2)) = stddev2 {
-                    if s1 == 0_f64 || s2 == 0_f64 {
-                        return Ok(ScalarValue::Float64(Some(0_f64)));
-                    } else {
-                        return Ok(ScalarValue::Float64(Some(c / s1 / s2)));
-                    }
-                }
+        // First check if we have NaN values by examining the internal state
+        // This handles the case where both inputs are NaN even with count=1
+        let mean1 = self.covar.get_mean1();
+        let mean2 = self.covar.get_mean2();
+
+        // If both means are NaN, then both input columns contain only NaN values
+        if mean1.is_nan() && mean2.is_nan() {
+            return Ok(ScalarValue::Float64(Some(f64::NAN)));
+        }
+        let n = self.covar.get_count();
+        if mean1.is_nan() || mean2.is_nan() || n < 2 {
+            return Ok(ScalarValue::Float64(None));
+        }
+
+        if let ScalarValue::Float64(Some(c)) = covar
+            && let ScalarValue::Float64(Some(s1)) = stddev1
+            && let ScalarValue::Float64(Some(s2)) = stddev2
+        {
+            if s1 == 0_f64 || s2 == 0_f64 {
+                return Ok(ScalarValue::Float64(None));
+            } else {
+                return Ok(ScalarValue::Float64(Some(c / s1 / s2)));
             }
         }
 
@@ -354,7 +367,7 @@ fn accumulate_correlation_states(
 /// where:
 /// n = number of observations
 /// sum_x = sum of x values
-/// sum_y = sum of y values  
+/// sum_y = sum of y values
 /// sum_xy = sum of (x * y)
 /// sum_xx = sum of x^2 values
 /// sum_yy = sum of y^2 values
@@ -375,10 +388,8 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator {
         self.sum_xx.resize(total_num_groups, 0.0);
         self.sum_yy.resize(total_num_groups, 0.0);
 
-        let array_x = &cast(&values[0], &DataType::Float64)?;
-        let array_x = downcast_array::<Float64Array>(array_x);
-        let array_y = &cast(&values[1], &DataType::Float64)?;
-        let array_y = downcast_array::<Float64Array>(array_y);
+        let array_x = downcast_array::<Float64Array>(&values[0]);
+        let array_y = downcast_array::<Float64Array>(&values[1]);
 
         accumulate_multiple(
             group_indices,
@@ -399,6 +410,87 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator {
         Ok(())
     }
 
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        // Drain the state vectors for the groups being emitted
+        let counts = emit_to.take_needed(&mut self.count);
+        let sum_xs = emit_to.take_needed(&mut self.sum_x);
+        let sum_ys = emit_to.take_needed(&mut self.sum_y);
+        let sum_xys = emit_to.take_needed(&mut self.sum_xy);
+        let sum_xxs = emit_to.take_needed(&mut self.sum_xx);
+        let sum_yys = emit_to.take_needed(&mut self.sum_yy);
+
+        let n = counts.len();
+        let mut values = Vec::with_capacity(n);
+        let mut nulls = NullBufferBuilder::new(n);
+
+        // Notes for `Null` handling:
+        // - If the `count` state of a group is 0, no valid records are accumulated
+        //   for this group, so the aggregation result is `Null`.
+        // - Correlation can't be calculated when a group only has 1 record, or when
+        //   the `denominator` state is 0. In these cases, the final aggregation
+        //   result should be `Null` (according to PostgreSQL's behavior).
+        // - However, if any of the accumulated values contain NaN, the result should
+        //   be NaN regardless of the count (even for single-row groups).
+        for i in 0..n {
+            let count = counts[i];
+            let sum_x = sum_xs[i];
+            let sum_y = sum_ys[i];
+            let sum_xy = sum_xys[i];
+            let sum_xx = sum_xxs[i];
+            let sum_yy = sum_yys[i];
+
+            // If BOTH sum_x AND sum_y are NaN, then both input values are NaN → return NaN
+            // If only ONE of them is NaN, then only one input value is NaN → return NULL
+            if sum_x.is_nan() && sum_y.is_nan() {
+                // Both inputs are NaN → return NaN
+                values.push(f64::NAN);
+                nulls.append_non_null();
+                continue;
+            } else if count < 2 || sum_x.is_nan() || sum_y.is_nan() {
+                // Only one input is NaN → return NULL
+                values.push(0.0);
+                nulls.append_null();
+                continue;
+            }
+
+            let mean_x = sum_x / count as f64;
+            let mean_y = sum_y / count as f64;
+
+            let numerator = sum_xy - sum_x * mean_y;
+            let denominator =
+                ((sum_xx - sum_x * mean_x) * (sum_yy - sum_y * mean_y)).sqrt();
+
+            if denominator == 0.0 {
+                values.push(0.0);
+                nulls.append_null();
+            } else {
+                values.push(numerator / denominator);
+                nulls.append_non_null();
+            }
+        }
+
+        Ok(Arc::new(Float64Array::new(values.into(), nulls.finish())))
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        // Drain the state vectors for the groups being emitted
+        let count = emit_to.take_needed(&mut self.count);
+        let sum_x = emit_to.take_needed(&mut self.sum_x);
+        let sum_y = emit_to.take_needed(&mut self.sum_y);
+        let sum_xy = emit_to.take_needed(&mut self.sum_xy);
+        let sum_xx = emit_to.take_needed(&mut self.sum_xx);
+        let sum_yy = emit_to.take_needed(&mut self.sum_yy);
+
+        Ok(vec![
+            Arc::new(UInt64Array::from(count)),
+            Arc::new(Float64Array::from(sum_x)),
+            Arc::new(Float64Array::from(sum_y)),
+            Arc::new(Float64Array::from(sum_xy)),
+            Arc::new(Float64Array::from(sum_xx)),
+            Arc::new(Float64Array::from(sum_yy)),
+        ])
+    }
+
     fn merge_batch(
         &mut self,
         values: &[ArrayRef],
@@ -422,7 +514,10 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator {
         let partial_sum_xx = values[4].as_primitive::<Float64Type>();
         let partial_sum_yy = values[5].as_primitive::<Float64Type>();
 
-        assert!(opt_filter.is_none(), "aggregate filter should be applied in partial stage, there should be no filter in final stage");
+        assert!(
+            opt_filter.is_none(),
+            "aggregate filter should be applied in partial stage, there should be no filter in final stage"
+        );
 
         accumulate_correlation_states(
             group_indices,
@@ -447,82 +542,13 @@ impl GroupsAccumulator for CorrelationGroupsAccumulator {
         Ok(())
     }
 
-    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
-        let n = match emit_to {
-            EmitTo::All => self.count.len(),
-            EmitTo::First(n) => n,
-        };
-
-        let mut values = Vec::with_capacity(n);
-        let mut nulls = NullBufferBuilder::new(n);
-
-        // Notes for `Null` handling:
-        // - If the `count` state of a group is 0, no valid records are accumulated
-        //   for this group, so the aggregation result is `Null`.
-        // - Correlation can't be calculated when a group only has 1 record, or when
-        //   the `denominator` state is 0. In these cases, the final aggregation
-        //   result should be `Null` (according to PostgreSQL's behavior).
-        //
-        // TODO: Old datafusion implementation returns 0.0 for these invalid cases.
-        // Update this to match PostgreSQL's behavior.
-        for i in 0..n {
-            if self.count[i] < 2 {
-                // TODO: Evaluate as `Null` (see notes above)
-                values.push(0.0);
-                nulls.append_null();
-                continue;
-            }
-
-            let count = self.count[i];
-            let sum_x = self.sum_x[i];
-            let sum_y = self.sum_y[i];
-            let sum_xy = self.sum_xy[i];
-            let sum_xx = self.sum_xx[i];
-            let sum_yy = self.sum_yy[i];
-
-            let mean_x = sum_x / count as f64;
-            let mean_y = sum_y / count as f64;
-
-            let numerator = sum_xy - sum_x * mean_y;
-            let denominator =
-                ((sum_xx - sum_x * mean_x) * (sum_yy - sum_y * mean_y)).sqrt();
-
-            if denominator == 0.0 {
-                // TODO: Evaluate as `Null` (see notes above)
-                values.push(0.0);
-                nulls.append_null();
-            } else {
-                values.push(numerator / denominator);
-                nulls.append_non_null();
-            }
-        }
-
-        Ok(Arc::new(Float64Array::new(values.into(), nulls.finish())))
-    }
-
-    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
-        let n = match emit_to {
-            EmitTo::All => self.count.len(),
-            EmitTo::First(n) => n,
-        };
-
-        Ok(vec![
-            Arc::new(UInt64Array::from(self.count[0..n].to_vec())),
-            Arc::new(Float64Array::from(self.sum_x[0..n].to_vec())),
-            Arc::new(Float64Array::from(self.sum_y[0..n].to_vec())),
-            Arc::new(Float64Array::from(self.sum_xy[0..n].to_vec())),
-            Arc::new(Float64Array::from(self.sum_xx[0..n].to_vec())),
-            Arc::new(Float64Array::from(self.sum_yy[0..n].to_vec())),
-        ])
-    }
-
     fn size(&self) -> usize {
-        size_of_val(&self.count)
-            + size_of_val(&self.sum_x)
-            + size_of_val(&self.sum_y)
-            + size_of_val(&self.sum_xy)
-            + size_of_val(&self.sum_xx)
-            + size_of_val(&self.sum_yy)
+        self.count.capacity() * size_of::<u64>()
+            + self.sum_x.capacity() * size_of::<f64>()
+            + self.sum_y.capacity() * size_of::<f64>()
+            + self.sum_xy.capacity() * size_of::<f64>()
+            + self.sum_xx.capacity() * size_of::<f64>()
+            + self.sum_yy.capacity() * size_of::<f64>()
     }
 }
 
diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs
index eccd0cd05187b..67e799d489409 100644
--- a/datafusion/functions-aggregate/src/count.rs
+++ b/datafusion/functions-aggregate/src/count.rs
@@ -15,55 +15,51 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use ahash::RandomState;
-use datafusion_common::stats::Precision;
-use datafusion_expr::expr::WindowFunction;
-use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator;
-use datafusion_macros::user_doc;
-use datafusion_physical_expr::expressions;
-use std::collections::HashSet;
-use std::fmt::Debug;
-use std::mem::{size_of, size_of_val};
-use std::ops::BitAnd;
-use std::sync::Arc;
-
 use arrow::{
-    array::{ArrayRef, AsArray},
+    array::{Array, ArrayRef, AsArray, BooleanArray, Int64Array, PrimitiveArray},
+    buffer::BooleanBuffer,
     compute,
     datatypes::{
         DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Field,
-        Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
-        Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+        FieldRef, Float16Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type,
+        Int64Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
         Time64NanosecondType, TimeUnit, TimestampMicrosecondType,
         TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
-        UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+        UInt8Type, UInt16Type, UInt32Type, UInt64Type,
     },
 };
-
-use arrow::datatypes::FieldRef;
-use arrow::{
-    array::{Array, BooleanArray, Int64Array, PrimitiveArray},
-    buffer::BooleanBuffer,
-};
+use datafusion_common::hash_utils::RandomState;
 use datafusion_common::{
-    downcast_value, internal_err, not_impl_err, Result, ScalarValue,
+    HashMap, Result, ScalarValue, downcast_value, internal_err, not_impl_err,
+    stats::Precision, utils::expr::COUNT_STAR_EXPANSION,
 };
-use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::{
-    function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl,
-    Documentation, EmitTo, GroupsAccumulator, SetMonotonicity, Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Documentation, EmitTo, Expr, GroupsAccumulator,
+    ReversedUDAF, SetMonotonicity, Signature, StatisticsArgs, TypeSignature, Volatility,
+    WindowFunctionDefinition,
+    expr::WindowFunction,
+    function::{AccumulatorArgs, StateFieldsArgs},
+    utils::format_state_name,
 };
-use datafusion_expr::{
-    Expr, ReversedUDAF, StatisticsArgs, TypeSignature, WindowFunctionDefinition,
+use datafusion_functions_aggregate_common::aggregate::{
+    count_distinct::BytesDistinctCountAccumulator,
+    count_distinct::BytesViewDistinctCountAccumulator,
+    count_distinct::DictionaryCountAccumulator,
+    count_distinct::FloatDistinctCountAccumulator,
+    count_distinct::PrimitiveDistinctCountAccumulator,
+    groups_accumulator::accumulate::accumulate_indices,
 };
-use datafusion_functions_aggregate_common::aggregate::count_distinct::{
-    BytesDistinctCountAccumulator, FloatDistinctCountAccumulator,
-    PrimitiveDistinctCountAccumulator,
-};
-use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate_indices;
+use datafusion_macros::user_doc;
+use datafusion_physical_expr::expressions;
 use datafusion_physical_expr_common::binary_map::OutputType;
+use std::{
+    collections::HashSet,
+    fmt::Debug,
+    mem::{size_of, size_of_val},
+    ops::BitAnd,
+    sync::Arc,
+};
 
-use datafusion_common::utils::expr::COUNT_STAR_EXPANSION;
 make_udaf_expr_and_func!(
     Count,
     count,
@@ -78,7 +74,7 @@ pub fn count_distinct(expr: Expr) -> Expr {
         vec![expr],
         true,
         None,
-        None,
+        vec![],
         None,
     ))
 }
@@ -101,7 +97,7 @@ pub fn count_distinct(expr: Expr) -> Expr {
 /// let expr = col(expr.schema_name().to_string());
 /// ```
 pub fn count_all() -> Expr {
-    count(Expr::Literal(COUNT_STAR_EXPANSION)).alias("count(*)")
+    count(Expr::Literal(COUNT_STAR_EXPANSION, None)).alias("count(*)")
 }
 
 /// Creates window aggregation to count all rows.
@@ -117,16 +113,16 @@ pub fn count_all() -> Expr {
 /// // create `count(*)` OVER ... window function expression
 /// let expr = count_all_window();
 /// assert_eq!(
-///   expr.schema_name().to_string(),
-///   "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
+///     expr.schema_name().to_string(),
+///     "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// );
 /// // if you need to refer to this column, use the `schema_name` function
 /// let expr = col(expr.schema_name().to_string());
 /// ```
 pub fn count_all_window() -> Expr {
-    Expr::WindowFunction(WindowFunction::new(
+    Expr::from(WindowFunction::new(
         WindowFunctionDefinition::AggregateUDF(count_udaf()),
-        vec![Expr::Literal(COUNT_STAR_EXPANSION)],
+        vec![Expr::Literal(COUNT_STAR_EXPANSION, None)],
     ))
 }
 
@@ -151,19 +147,11 @@ pub fn count_all_window() -> Expr {
 ```"#,
     standard_argument(name = "expression",)
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct Count {
     signature: Signature,
 }
 
-impl Debug for Count {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("Count")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for Count {
     fn default() -> Self {
         Self::new()
@@ -180,6 +168,107 @@ impl Count {
         }
     }
 }
+fn get_count_accumulator(data_type: &DataType) -> Box<dyn Accumulator> {
+    match data_type {
+        // try and use a specialized accumulator if possible, otherwise fall back to generic accumulator
+        DataType::Int8 => Box::new(PrimitiveDistinctCountAccumulator::<Int8Type>::new(
+            data_type,
+        )),
+        DataType::Int16 => Box::new(PrimitiveDistinctCountAccumulator::<Int16Type>::new(
+            data_type,
+        )),
+        DataType::Int32 => Box::new(PrimitiveDistinctCountAccumulator::<Int32Type>::new(
+            data_type,
+        )),
+        DataType::Int64 => Box::new(PrimitiveDistinctCountAccumulator::<Int64Type>::new(
+            data_type,
+        )),
+        DataType::UInt8 => Box::new(PrimitiveDistinctCountAccumulator::<UInt8Type>::new(
+            data_type,
+        )),
+        DataType::UInt16 => Box::new(
+            PrimitiveDistinctCountAccumulator::<UInt16Type>::new(data_type),
+        ),
+        DataType::UInt32 => Box::new(
+            PrimitiveDistinctCountAccumulator::<UInt32Type>::new(data_type),
+        ),
+        DataType::UInt64 => Box::new(
+            PrimitiveDistinctCountAccumulator::<UInt64Type>::new(data_type),
+        ),
+        DataType::Decimal128(_, _) => Box::new(PrimitiveDistinctCountAccumulator::<
+            Decimal128Type,
+        >::new(data_type)),
+        DataType::Decimal256(_, _) => Box::new(PrimitiveDistinctCountAccumulator::<
+            Decimal256Type,
+        >::new(data_type)),
+
+        DataType::Date32 => Box::new(
+            PrimitiveDistinctCountAccumulator::<Date32Type>::new(data_type),
+        ),
+        DataType::Date64 => Box::new(
+            PrimitiveDistinctCountAccumulator::<Date64Type>::new(data_type),
+        ),
+        DataType::Time32(TimeUnit::Millisecond) => Box::new(
+            PrimitiveDistinctCountAccumulator::<Time32MillisecondType>::new(data_type),
+        ),
+        DataType::Time32(TimeUnit::Second) => Box::new(
+            PrimitiveDistinctCountAccumulator::<Time32SecondType>::new(data_type),
+        ),
+        DataType::Time64(TimeUnit::Microsecond) => Box::new(
+            PrimitiveDistinctCountAccumulator::<Time64MicrosecondType>::new(data_type),
+        ),
+        DataType::Time64(TimeUnit::Nanosecond) => Box::new(
+            PrimitiveDistinctCountAccumulator::<Time64NanosecondType>::new(data_type),
+        ),
+        DataType::Timestamp(TimeUnit::Microsecond, _) => Box::new(
+            PrimitiveDistinctCountAccumulator::<TimestampMicrosecondType>::new(data_type),
+        ),
+        DataType::Timestamp(TimeUnit::Millisecond, _) => Box::new(
+            PrimitiveDistinctCountAccumulator::<TimestampMillisecondType>::new(data_type),
+        ),
+        DataType::Timestamp(TimeUnit::Nanosecond, _) => Box::new(
+            PrimitiveDistinctCountAccumulator::<TimestampNanosecondType>::new(data_type),
+        ),
+        DataType::Timestamp(TimeUnit::Second, _) => Box::new(
+            PrimitiveDistinctCountAccumulator::<TimestampSecondType>::new(data_type),
+        ),
+
+        DataType::Float16 => {
+            Box::new(FloatDistinctCountAccumulator::<Float16Type>::new())
+        }
+        DataType::Float32 => {
+            Box::new(FloatDistinctCountAccumulator::<Float32Type>::new())
+        }
+        DataType::Float64 => {
+            Box::new(FloatDistinctCountAccumulator::<Float64Type>::new())
+        }
+
+        DataType::Utf8 => {
+            Box::new(BytesDistinctCountAccumulator::<i32>::new(OutputType::Utf8))
+        }
+        DataType::Utf8View => {
+            Box::new(BytesViewDistinctCountAccumulator::new(OutputType::Utf8View))
+        }
+        DataType::LargeUtf8 => {
+            Box::new(BytesDistinctCountAccumulator::<i64>::new(OutputType::Utf8))
+        }
+        DataType::Binary => Box::new(BytesDistinctCountAccumulator::<i32>::new(
+            OutputType::Binary,
+        )),
+        DataType::BinaryView => Box::new(BytesViewDistinctCountAccumulator::new(
+            OutputType::BinaryView,
+        )),
+        DataType::LargeBinary => Box::new(BytesDistinctCountAccumulator::<i64>::new(
+            OutputType::Binary,
+        )),
+
+        // Use the generic accumulator based on `ScalarValue` for all other types
+        _ => Box::new(DistinctCountAccumulator {
+            values: HashSet::default(),
+            state_data_type: data_type.clone(),
+        }),
+    }
+}
 
 impl AggregateUDFImpl for Count {
     fn as_any(&self) -> &dyn std::any::Any {
@@ -204,20 +293,29 @@ impl AggregateUDFImpl for Count {
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         if args.is_distinct {
-            Ok(vec![Field::new_list(
-                format_state_name(args.name, "count distinct"),
-                // See COMMENTS.md to understand why nullable is set to true
-                Field::new_list_field(args.input_fields[0].data_type().clone(), true),
-                false,
-            )
-            .into()])
+            let dtype: DataType = match &args.input_fields[0].data_type() {
+                DataType::Dictionary(_, values_type) => (**values_type).clone(),
+                &dtype => dtype.clone(),
+            };
+
+            Ok(vec![
+                Field::new_list(
+                    format_state_name(args.name, "count distinct"),
+                    // See COMMENTS.md to understand why nullable is set to true
+                    Field::new_list_field(dtype, true),
+                    false,
+                )
+                .into(),
+            ])
         } else {
-            Ok(vec![Field::new(
-                format_state_name(args.name, "count"),
-                DataType::Int64,
-                false,
-            )
-            .into()])
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, "count"),
+                    DataType::Int64,
+                    false,
+                )
+                .into(),
+            ])
         }
     }
 
@@ -230,122 +328,17 @@ impl AggregateUDFImpl for Count {
             return not_impl_err!("COUNT DISTINCT with multiple arguments");
         }
 
-        let data_type = &acc_args.exprs[0].data_type(acc_args.schema)?;
-        Ok(match data_type {
-            // try and use a specialized accumulator if possible, otherwise fall back to generic accumulator
-            DataType::Int8 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Int8Type>::new(data_type),
-            ),
-            DataType::Int16 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Int16Type>::new(data_type),
-            ),
-            DataType::Int32 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Int32Type>::new(data_type),
-            ),
-            DataType::Int64 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Int64Type>::new(data_type),
-            ),
-            DataType::UInt8 => Box::new(
-                PrimitiveDistinctCountAccumulator::<UInt8Type>::new(data_type),
-            ),
-            DataType::UInt16 => Box::new(
-                PrimitiveDistinctCountAccumulator::<UInt16Type>::new(data_type),
-            ),
-            DataType::UInt32 => Box::new(
-                PrimitiveDistinctCountAccumulator::<UInt32Type>::new(data_type),
-            ),
-            DataType::UInt64 => Box::new(
-                PrimitiveDistinctCountAccumulator::<UInt64Type>::new(data_type),
-            ),
-            DataType::Decimal128(_, _) => Box::new(PrimitiveDistinctCountAccumulator::<
-                Decimal128Type,
-            >::new(data_type)),
-            DataType::Decimal256(_, _) => Box::new(PrimitiveDistinctCountAccumulator::<
-                Decimal256Type,
-            >::new(data_type)),
-
-            DataType::Date32 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Date32Type>::new(data_type),
-            ),
-            DataType::Date64 => Box::new(
-                PrimitiveDistinctCountAccumulator::<Date64Type>::new(data_type),
-            ),
-            DataType::Time32(TimeUnit::Millisecond) => Box::new(
-                PrimitiveDistinctCountAccumulator::<Time32MillisecondType>::new(
-                    data_type,
-                ),
-            ),
-            DataType::Time32(TimeUnit::Second) => Box::new(
-                PrimitiveDistinctCountAccumulator::<Time32SecondType>::new(data_type),
-            ),
-            DataType::Time64(TimeUnit::Microsecond) => Box::new(
-                PrimitiveDistinctCountAccumulator::<Time64MicrosecondType>::new(
-                    data_type,
-                ),
-            ),
-            DataType::Time64(TimeUnit::Nanosecond) => Box::new(
-                PrimitiveDistinctCountAccumulator::<Time64NanosecondType>::new(data_type),
-            ),
-            DataType::Timestamp(TimeUnit::Microsecond, _) => Box::new(
-                PrimitiveDistinctCountAccumulator::<TimestampMicrosecondType>::new(
-                    data_type,
-                ),
-            ),
-            DataType::Timestamp(TimeUnit::Millisecond, _) => Box::new(
-                PrimitiveDistinctCountAccumulator::<TimestampMillisecondType>::new(
-                    data_type,
-                ),
-            ),
-            DataType::Timestamp(TimeUnit::Nanosecond, _) => Box::new(
-                PrimitiveDistinctCountAccumulator::<TimestampNanosecondType>::new(
-                    data_type,
-                ),
-            ),
-            DataType::Timestamp(TimeUnit::Second, _) => Box::new(
-                PrimitiveDistinctCountAccumulator::<TimestampSecondType>::new(data_type),
-            ),
-
-            DataType::Float16 => {
-                Box::new(FloatDistinctCountAccumulator::<Float16Type>::new())
-            }
-            DataType::Float32 => {
-                Box::new(FloatDistinctCountAccumulator::<Float32Type>::new())
-            }
-            DataType::Float64 => {
-                Box::new(FloatDistinctCountAccumulator::<Float64Type>::new())
-            }
+        let data_type = acc_args.expr_fields[0].data_type();
 
-            DataType::Utf8 => {
-                Box::new(BytesDistinctCountAccumulator::<i32>::new(OutputType::Utf8))
-            }
-            DataType::Utf8View => {
-                Box::new(BytesViewDistinctCountAccumulator::new(OutputType::Utf8View))
-            }
-            DataType::LargeUtf8 => {
-                Box::new(BytesDistinctCountAccumulator::<i64>::new(OutputType::Utf8))
+        Ok(match data_type {
+            DataType::Dictionary(_, values_type) => {
+                let inner = get_count_accumulator(values_type);
+                Box::new(DictionaryCountAccumulator::new(inner))
             }
-            DataType::Binary => Box::new(BytesDistinctCountAccumulator::<i32>::new(
-                OutputType::Binary,
-            )),
-            DataType::BinaryView => Box::new(BytesViewDistinctCountAccumulator::new(
-                OutputType::BinaryView,
-            )),
-            DataType::LargeBinary => Box::new(BytesDistinctCountAccumulator::<i64>::new(
-                OutputType::Binary,
-            )),
-
-            // Use the generic accumulator based on `ScalarValue` for all other types
-            _ => Box::new(DistinctCountAccumulator {
-                values: HashSet::default(),
-                state_data_type: data_type.clone(),
-            }),
+            _ => get_count_accumulator(data_type),
         })
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         // groups accumulator only supports `COUNT(c1)`, not
         // `COUNT(c1, c2)`, etc
@@ -372,32 +365,40 @@ impl AggregateUDFImpl for Count {
     }
 
     fn value_from_stats(&self, statistics_args: &StatisticsArgs) -> Option<ScalarValue> {
+        let [expr] = statistics_args.exprs else {
+            return None;
+        };
+        let col_stats = &statistics_args.statistics.column_statistics;
+
         if statistics_args.is_distinct {
+            // Only column references can be resolved from statistics;
+            // expressions like casts or literals are not supported.
+            let col_expr = expr.as_any().downcast_ref::<expressions::Column>()?;
+            if let Precision::Exact(dc) = col_stats[col_expr.index()].distinct_count {
+                let dc = i64::try_from(dc).ok()?;
+                return Some(ScalarValue::Int64(Some(dc)));
+            }
             return None;
         }
-        if let Precision::Exact(num_rows) = statistics_args.statistics.num_rows {
-            if statistics_args.exprs.len() == 1 {
-                // TODO optimize with exprs other than Column
-                if let Some(col_expr) = statistics_args.exprs[0]
-                    .as_any()
-                    .downcast_ref::<expressions::Column>()
-                {
-                    let current_val = &statistics_args.statistics.column_statistics
-                        [col_expr.index()]
-                    .null_count;
-                    if let &Precision::Exact(val) = current_val {
-                        return Some(ScalarValue::Int64(Some((num_rows - val) as i64)));
-                    }
-                } else if let Some(lit_expr) = statistics_args.exprs[0]
-                    .as_any()
-                    .downcast_ref::<expressions::Literal>()
-                {
-                    if lit_expr.value() == &COUNT_STAR_EXPANSION {
-                        return Some(ScalarValue::Int64(Some(num_rows as i64)));
-                    }
-                }
+
+        let Precision::Exact(num_rows) = statistics_args.statistics.num_rows else {
+            return None;
+        };
+
+        // TODO optimize with exprs other than Column
+        if let Some(col_expr) = expr.as_any().downcast_ref::<expressions::Column>() {
+            if let Precision::Exact(val) = col_stats[col_expr.index()].null_count {
+                let count = i64::try_from(num_rows - val).ok()?;
+                return Some(ScalarValue::Int64(Some(count)));
             }
+        } else if let Some(lit_expr) =
+            expr.as_any().downcast_ref::<expressions::Literal>()
+            && lit_expr.value() == &COUNT_STAR_EXPANSION
+        {
+            let num_rows = i64::try_from(num_rows).ok()?;
+            return Some(ScalarValue::Int64(Some(num_rows)));
         }
+
         None
     }
 
@@ -410,6 +411,98 @@ impl AggregateUDFImpl for Count {
         // the same as new values are seen.
         SetMonotonicity::Increasing
     }
+
+    fn create_sliding_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn Accumulator>> {
+        if args.is_distinct {
+            let acc =
+                SlidingDistinctCountAccumulator::try_new(args.return_field.data_type())?;
+            Ok(Box::new(acc))
+        } else {
+            let acc = CountAccumulator::new();
+            Ok(Box::new(acc))
+        }
+    }
+}
+
+// DistinctCountAccumulator does not support retract_batch and sliding window
+// this is a specialized accumulator for distinct count that supports retract_batch
+// and sliding window.
+#[derive(Debug)]
+pub struct SlidingDistinctCountAccumulator {
+    counts: HashMap<ScalarValue, usize, RandomState>,
+    data_type: DataType,
+}
+
+impl SlidingDistinctCountAccumulator {
+    pub fn try_new(data_type: &DataType) -> Result<Self> {
+        Ok(Self {
+            counts: HashMap::default(),
+            data_type: data_type.clone(),
+        })
+    }
+}
+
+impl Accumulator for SlidingDistinctCountAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        let keys = self.counts.keys().cloned().collect::<Vec<_>>();
+        Ok(vec![ScalarValue::List(ScalarValue::new_list_nullable(
+            keys.as_slice(),
+            &self.data_type,
+        ))])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let arr = &values[0];
+        for i in 0..arr.len() {
+            let v = ScalarValue::try_from_array(arr, i)?;
+            if !v.is_null() {
+                *self.counts.entry(v).or_default() += 1;
+            }
+        }
+        Ok(())
+    }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let arr = &values[0];
+        for i in 0..arr.len() {
+            let v = ScalarValue::try_from_array(arr, i)?;
+            if !v.is_null()
+                && let Some(cnt) = self.counts.get_mut(&v)
+            {
+                *cnt -= 1;
+                if *cnt == 0 {
+                    self.counts.remove(&v);
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        let list_arr = states[0].as_list::<i32>();
+        for inner in list_arr.iter().flatten() {
+            for j in 0..inner.len() {
+                let v = ScalarValue::try_from_array(&*inner, j)?;
+                *self.counts.entry(v).or_default() += 1;
+            }
+        }
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(ScalarValue::Int64(Some(self.counts.len() as i64)))
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        true
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
 }
 
 #[derive(Debug)]
@@ -505,7 +598,9 @@ impl GroupsAccumulator for CountGroupsAccumulator {
             values.logical_nulls().as_ref(),
             opt_filter,
             |group_index| {
-                self.counts[group_index] += 1;
+                // SAFETY: group_index is guaranteed to be in bounds
+                let count = unsafe { self.counts.get_unchecked_mut(group_index) };
+                *count += 1;
             },
         );
 
@@ -711,8 +806,8 @@ impl Accumulator for DistinctCountAccumulator {
         }
 
         (0..arr.len()).try_for_each(|index| {
-            if !arr.is_null(index) {
-                let scalar = ScalarValue::try_from_array(arr, index)?;
+            let scalar = ScalarValue::try_from_array(arr, index)?;
+            if !scalar.is_null() {
                 self.values.insert(scalar);
             }
             Ok(())
@@ -757,8 +852,28 @@ impl Accumulator for DistinctCountAccumulator {
 
 #[cfg(test)]
 mod tests {
+
     use super::*;
-    use arrow::array::NullArray;
+    use arrow::{
+        array::{DictionaryArray, Int32Array, NullArray, StringArray},
+        datatypes::{DataType, Field, Int32Type, Schema},
+    };
+    use datafusion_expr::function::AccumulatorArgs;
+    use datafusion_physical_expr::{PhysicalExpr, expressions::Column};
+    use std::sync::Arc;
+    /// Helper function to create a dictionary array with non-null keys but some null values
+    /// Returns a dictionary array where:
+    /// - keys are [0, 1, 2, 0, 1] (all non-null)
+    /// - values are ["a", null, "c"]
+    /// - so the keys reference: "a", null, "c", "a", null
+    fn create_dictionary_with_null_values() -> Result<DictionaryArray<Int32Type>> {
+        let values = StringArray::from(vec![Some("a"), None, Some("c")]);
+        let keys = Int32Array::from(vec![0, 1, 2, 0, 1]); // references "a", null, "c", "a", null
+        Ok(DictionaryArray::<Int32Type>::try_new(
+            keys,
+            Arc::new(values),
+        )?)
+    }
 
     #[test]
     fn count_accumulator_nulls() -> Result<()> {
@@ -767,4 +882,169 @@ mod tests {
         assert_eq!(accumulator.evaluate()?, ScalarValue::Int64(Some(0)));
         Ok(())
     }
+
+    #[test]
+    fn test_nested_dictionary() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "dict_col",
+            DataType::Dictionary(
+                Box::new(DataType::Int32),
+                Box::new(DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                )),
+            ),
+            true,
+        )]));
+
+        // Using Count UDAF's accumulator
+        let count = Count::new();
+        let expr = Arc::new(Column::new("dict_col", 0));
+        let expr_field = expr.return_field(&schema)?;
+        let args = AccumulatorArgs {
+            schema: &schema,
+            expr_fields: &[expr_field],
+            exprs: &[expr],
+            is_distinct: true,
+            name: "count",
+            ignore_nulls: false,
+            is_reversed: false,
+            return_field: Arc::new(Field::new_list_field(DataType::Int64, true)),
+            order_bys: &[],
+        };
+
+        let inner_dict =
+            DictionaryArray::<Int32Type>::from_iter(["a", "b", "c", "d", "a", "b"]);
+
+        let keys = Int32Array::from(vec![0, 1, 2, 0, 3, 1]);
+        let dict_of_dict =
+            DictionaryArray::<Int32Type>::try_new(keys, Arc::new(inner_dict))?;
+
+        let mut acc = count.accumulator(args)?;
+        acc.update_batch(&[Arc::new(dict_of_dict)])?;
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(4)));
+
+        Ok(())
+    }
+
+    #[test]
+    fn count_distinct_accumulator_dictionary_with_null_values() -> Result<()> {
+        let dict_array = create_dictionary_with_null_values()?;
+
+        // The expected behavior is that count_distinct should count only non-null values
+        // which in this case are "a" and "c" (appearing as 0 and 2 in keys)
+        let mut accumulator = DistinctCountAccumulator {
+            values: HashSet::default(),
+            state_data_type: dict_array.data_type().clone(),
+        };
+
+        accumulator.update_batch(&[Arc::new(dict_array)])?;
+
+        // Should have 2 distinct non-null values ("a" and "c")
+        assert_eq!(accumulator.evaluate()?, ScalarValue::Int64(Some(2)));
+        Ok(())
+    }
+
+    #[test]
+    fn count_accumulator_dictionary_with_null_values() -> Result<()> {
+        let dict_array = create_dictionary_with_null_values()?;
+
+        // The expected behavior is that count should only count non-null values
+        let mut accumulator = CountAccumulator::new();
+
+        accumulator.update_batch(&[Arc::new(dict_array)])?;
+
+        // 5 elements in the array, of which 2 reference null values (the two 1s in the keys)
+        // So we should count 3 non-null values
+        assert_eq!(accumulator.evaluate()?, ScalarValue::Int64(Some(3)));
+        Ok(())
+    }
+
+    #[test]
+    fn count_distinct_accumulator_dictionary_all_null_values() -> Result<()> {
+        // Create a dictionary array that only contains null values
+        let dict_values = StringArray::from(vec![None, Some("abc")]);
+        let dict_indices = Int32Array::from(vec![0; 5]);
+        let dict_array =
+            DictionaryArray::<Int32Type>::try_new(dict_indices, Arc::new(dict_values))?;
+
+        let mut accumulator = DistinctCountAccumulator {
+            values: HashSet::default(),
+            state_data_type: dict_array.data_type().clone(),
+        };
+
+        accumulator.update_batch(&[Arc::new(dict_array)])?;
+
+        // All referenced values are null so count(distinct) should be 0
+        assert_eq!(accumulator.evaluate()?, ScalarValue::Int64(Some(0)));
+        Ok(())
+    }
+
+    #[test]
+    fn sliding_distinct_count_accumulator_basic() -> Result<()> {
+        // Basic update_batch + evaluate functionality
+        let mut acc = SlidingDistinctCountAccumulator::try_new(&DataType::Int32)?;
+        // Create an Int32Array: [1, 2, 2, 3, null]
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(2),
+            Some(3),
+            None,
+        ]));
+        acc.update_batch(&[values])?;
+        // Expect distinct values {1,2,3} → count = 3
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(3)));
+        Ok(())
+    }
+
+    #[test]
+    fn sliding_distinct_count_accumulator_retract() -> Result<()> {
+        // Test that retract_batch properly decrements counts
+        let mut acc = SlidingDistinctCountAccumulator::try_new(&DataType::Utf8)?;
+        // Initial batch: ["a", "b", "a"]
+        let arr1 = Arc::new(StringArray::from(vec![Some("a"), Some("b"), Some("a")]))
+            as ArrayRef;
+        acc.update_batch(&[arr1])?;
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(2))); // {"a","b"}
+
+        // Retract batch: ["a", null, "b"]
+        let arr2 =
+            Arc::new(StringArray::from(vec![Some("a"), None, Some("b")])) as ArrayRef;
+        acc.retract_batch(&[arr2])?;
+        // Before: a→2, b→1; after retract a→1, b→0 → b removed; remaining {"a"}
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(1)));
+        Ok(())
+    }
+
+    #[test]
+    fn sliding_distinct_count_accumulator_merge_states() -> Result<()> {
+        // Test merging multiple accumulator states with merge_batch
+        let mut acc1 = SlidingDistinctCountAccumulator::try_new(&DataType::Int32)?;
+        let mut acc2 = SlidingDistinctCountAccumulator::try_new(&DataType::Int32)?;
+        // acc1 sees [1, 2]
+        acc1.update_batch(&[Arc::new(Int32Array::from(vec![Some(1), Some(2)]))])?;
+        // acc2 sees [2, 3]
+        acc2.update_batch(&[Arc::new(Int32Array::from(vec![Some(2), Some(3)]))])?;
+        // Extract their states as Vec<ScalarValue>
+        let state_sv1 = acc1.state()?;
+        let state_sv2 = acc2.state()?;
+        // Convert ScalarValue states into Vec<ArrayRef>, propagating errors
+        // NOTE we pass `1` because each ScalarValue.to_array produces a 1‑row ListArray
+        let state_arr1: Vec<ArrayRef> = state_sv1
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<_>>()?;
+        let state_arr2: Vec<ArrayRef> = state_sv2
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<_>>()?;
+        // Merge both states into a fresh accumulator
+        let mut merged = SlidingDistinctCountAccumulator::try_new(&DataType::Int32)?;
+        merged.merge_batch(&state_arr1)?;
+        merged.merge_batch(&state_arr2)?;
+        // Expect distinct {1,2,3} → count = 3
+        assert_eq!(merged.evaluate()?, ScalarValue::Int64(Some(3)));
+        Ok(())
+    }
 }
diff --git a/datafusion/functions-aggregate/src/covariance.rs b/datafusion/functions-aggregate/src/covariance.rs
index 9f37a73e5429e..8252cf1b19c4e 100644
--- a/datafusion/functions-aggregate/src/covariance.rs
+++ b/datafusion/functions-aggregate/src/covariance.rs
@@ -17,21 +17,14 @@
 
 //! [`CovarianceSample`]: covariance sample aggregations.
 
-use arrow::datatypes::FieldRef;
-use arrow::{
-    array::{ArrayRef, Float64Array, UInt64Array},
-    compute::kernels::cast,
-    datatypes::{DataType, Field},
-};
-use datafusion_common::{
-    downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result,
-    ScalarValue,
-};
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::cast::{as_float64_array, as_uint64_array};
+use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
     function::{AccumulatorArgs, StateFieldsArgs},
-    type_coercion::aggregates::NUMERICS,
     utils::format_state_name,
-    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
 };
 use datafusion_functions_aggregate_common::stats::StatsType;
 use datafusion_macros::user_doc;
@@ -70,20 +63,12 @@ make_udaf_expr_and_func!(
     standard_argument(name = "expression1", prefix = "First"),
     standard_argument(name = "expression2", prefix = "Second")
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct CovarianceSample {
     signature: Signature,
     aliases: Vec<String>,
 }
 
-impl Debug for CovarianceSample {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("CovarianceSample")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for CovarianceSample {
     fn default() -> Self {
         Self::new()
@@ -94,7 +79,10 @@ impl CovarianceSample {
     pub fn new() -> Self {
         Self {
             aliases: vec![String::from("covar")],
-            signature: Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable),
+            signature: Signature::exact(
+                vec![DataType::Float64, DataType::Float64],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -112,11 +100,7 @@ impl AggregateUDFImpl for CovarianceSample {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("Covariance requires numeric input types");
-        }
-
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
         Ok(DataType::Float64)
     }
 
@@ -165,19 +149,11 @@ impl AggregateUDFImpl for CovarianceSample {
     standard_argument(name = "expression1", prefix = "First"),
     standard_argument(name = "expression2", prefix = "Second")
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct CovariancePopulation {
     signature: Signature,
 }
 
-impl Debug for CovariancePopulation {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("CovariancePopulation")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for CovariancePopulation {
     fn default() -> Self {
         Self::new()
@@ -187,7 +163,10 @@ impl Default for CovariancePopulation {
 impl CovariancePopulation {
     pub fn new() -> Self {
         Self {
-            signature: Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable),
+            signature: Signature::exact(
+                vec![DataType::Float64, DataType::Float64],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -205,11 +184,7 @@ impl AggregateUDFImpl for CovariancePopulation {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("Covariance requires numeric input types");
-        }
-
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
         Ok(DataType::Float64)
     }
 
@@ -303,30 +278,15 @@ impl Accumulator for CovarianceAccumulator {
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values1 = &cast(&values[0], &DataType::Float64)?;
-        let values2 = &cast(&values[1], &DataType::Float64)?;
+        let values1 = as_float64_array(&values[0])?;
+        let values2 = as_float64_array(&values[1])?;
 
-        let mut arr1 = downcast_value!(values1, Float64Array).iter().flatten();
-        let mut arr2 = downcast_value!(values2, Float64Array).iter().flatten();
-
-        for i in 0..values1.len() {
-            let value1 = if values1.is_valid(i) {
-                arr1.next()
-            } else {
-                None
-            };
-            let value2 = if values2.is_valid(i) {
-                arr2.next()
-            } else {
-                None
+        for (value1, value2) in values1.iter().zip(values2) {
+            let (value1, value2) = match (value1, value2) {
+                (Some(a), Some(b)) => (a, b),
+                _ => continue,
             };
 
-            if value1.is_none() || value2.is_none() {
-                continue;
-            }
-
-            let value1 = unwrap_or_internal_err!(value1);
-            let value2 = unwrap_or_internal_err!(value2);
             let new_count = self.count + 1;
             let delta1 = value1 - self.mean1;
             let new_mean1 = delta1 / new_count as f64 + self.mean1;
@@ -344,29 +304,14 @@ impl Accumulator for CovarianceAccumulator {
     }
 
     fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values1 = &cast(&values[0], &DataType::Float64)?;
-        let values2 = &cast(&values[1], &DataType::Float64)?;
-        let mut arr1 = downcast_value!(values1, Float64Array).iter().flatten();
-        let mut arr2 = downcast_value!(values2, Float64Array).iter().flatten();
-
-        for i in 0..values1.len() {
-            let value1 = if values1.is_valid(i) {
-                arr1.next()
-            } else {
-                None
-            };
-            let value2 = if values2.is_valid(i) {
-                arr2.next()
-            } else {
-                None
-            };
-
-            if value1.is_none() || value2.is_none() {
-                continue;
-            }
+        let values1 = as_float64_array(&values[0])?;
+        let values2 = as_float64_array(&values[1])?;
 
-            let value1 = unwrap_or_internal_err!(value1);
-            let value2 = unwrap_or_internal_err!(value2);
+        for (value1, value2) in values1.iter().zip(values2) {
+            let (value1, value2) = match (value1, value2) {
+                (Some(a), Some(b)) => (a, b),
+                _ => continue,
+            };
 
             let new_count = self.count - 1;
             let delta1 = self.mean1 - value1;
@@ -385,10 +330,10 @@ impl Accumulator for CovarianceAccumulator {
     }
 
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        let counts = downcast_value!(states[0], UInt64Array);
-        let means1 = downcast_value!(states[1], Float64Array);
-        let means2 = downcast_value!(states[2], Float64Array);
-        let cs = downcast_value!(states[3], Float64Array);
+        let counts = as_uint64_array(&states[0])?;
+        let means1 = as_float64_array(&states[1])?;
+        let means2 = as_float64_array(&states[2])?;
+        let cs = as_float64_array(&states[3])?;
 
         for i in 0..counts.len() {
             let c = counts.value(i);
diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs
index e8022245dba55..b339479b35e9d 100644
--- a/datafusion/functions-aggregate/src/first_last.rs
+++ b/datafusion/functions-aggregate/src/first_last.rs
@@ -19,6 +19,7 @@
 
 use std::any::Any;
 use std::fmt::Debug;
+use std::hash::Hash;
 use std::mem::size_of_val;
 use std::sync::Arc;
 
@@ -29,23 +30,24 @@ use arrow::array::{
 use arrow::buffer::{BooleanBuffer, NullBuffer};
 use arrow::compute::{self, LexicographicalComparator, SortColumn, SortOptions};
 use arrow::datatypes::{
-    DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Field, FieldRef,
-    Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
-    Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
-    TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
-    TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type,
-    UInt8Type,
+    DataType, Date32Type, Date64Type, Decimal32Type, Decimal64Type, Decimal128Type,
+    Decimal256Type, Field, FieldRef, Float16Type, Float32Type, Float64Type, Int8Type,
+    Int16Type, Int32Type, Int64Type, Time32MillisecondType, Time32SecondType,
+    Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt8Type,
+    UInt16Type, UInt32Type, UInt64Type,
 };
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::utils::{compare_rows, extract_row_at_idx_to_buf, get_row_at_idx};
 use datafusion_common::{
-    arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue,
+    DataFusionError, Result, ScalarValue, arrow_datafusion_err, internal_err,
+    not_impl_err,
 };
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity};
+use datafusion_expr::utils::{AggregateOrderSensitivity, format_state_name};
 use datafusion_expr::{
     Accumulator, AggregateUDFImpl, Documentation, EmitTo, Expr, ExprFunctionExt,
-    GroupsAccumulator, Signature, SortExpr, Volatility,
+    GroupsAccumulator, ReversedUDAF, Signature, SortExpr, Volatility,
 };
 use datafusion_functions_aggregate_common::utils::get_sort_options;
 use datafusion_macros::user_doc;
@@ -55,31 +57,23 @@ create_func!(FirstValue, first_value_udaf);
 create_func!(LastValue, last_value_udaf);
 
 /// Returns the first value in a group of values.
-pub fn first_value(expression: Expr, order_by: Option<Vec<SortExpr>>) -> Expr {
-    if let Some(order_by) = order_by {
-        first_value_udaf()
-            .call(vec![expression])
-            .order_by(order_by)
-            .build()
-            // guaranteed to be `Expr::AggregateFunction`
-            .unwrap()
-    } else {
-        first_value_udaf().call(vec![expression])
-    }
+pub fn first_value(expression: Expr, order_by: Vec<SortExpr>) -> Expr {
+    first_value_udaf()
+        .call(vec![expression])
+        .order_by(order_by)
+        .build()
+        // guaranteed to be `Expr::AggregateFunction`
+        .unwrap()
 }
 
 /// Returns the last value in a group of values.
-pub fn last_value(expression: Expr, order_by: Option<Vec<SortExpr>>) -> Expr {
-    if let Some(order_by) = order_by {
-        last_value_udaf()
-            .call(vec![expression])
-            .order_by(order_by)
-            .build()
-            // guaranteed to be `Expr::AggregateFunction`
-            .unwrap()
-    } else {
-        last_value_udaf().call(vec![expression])
-    }
+pub fn last_value(expression: Expr, order_by: Vec<SortExpr>) -> Expr {
+    last_value_udaf()
+        .call(vec![expression])
+        .order_by(order_by)
+        .build()
+        // guaranteed to be `Expr::AggregateFunction`
+        .unwrap()
 }
 
 #[user_doc(
@@ -96,19 +90,10 @@ pub fn last_value(expression: Expr, order_by: Option<Vec<SortExpr>>) -> Expr {
 ```"#,
     standard_argument(name = "expression",)
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct FirstValue {
     signature: Signature,
-    requirement_satisfied: bool,
-}
-
-impl Debug for FirstValue {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("FirstValue")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .field("accumulator", &"<FUNC>")
-            .finish()
-    }
+    is_input_pre_ordered: bool,
 }
 
 impl Default for FirstValue {
@@ -121,14 +106,9 @@ impl FirstValue {
     pub fn new() -> Self {
         Self {
             signature: Signature::any(1, Volatility::Immutable),
-            requirement_satisfied: false,
+            is_input_pre_ordered: false,
         }
     }
-
-    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
-        self.requirement_satisfied = requirement_satisfied;
-        self
-    }
 }
 
 impl AggregateUDFImpl for FirstValue {
@@ -144,167 +124,191 @@ impl AggregateUDFImpl for FirstValue {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        not_impl_err!("Not called because the return_field_from_args is implemented")
+    }
+
+    fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
+        // Preserve metadata from the first argument field
+        Ok(Arc::new(
+            Field::new(
+                self.name(),
+                arg_fields[0].data_type().clone(),
+                true, // always nullable, there may be no rows
+            )
+            .with_metadata(arg_fields[0].metadata().clone()),
+        ))
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let ordering_dtypes = acc_args
-            .ordering_req
+        let Some(ordering) = LexOrdering::new(acc_args.order_bys.to_vec()) else {
+            return TrivialFirstValueAccumulator::try_new(
+                acc_args.return_field.data_type(),
+                acc_args.ignore_nulls,
+            )
+            .map(|acc| Box::new(acc) as _);
+        };
+        let ordering_dtypes = ordering
             .iter()
             .map(|e| e.expr.data_type(acc_args.schema))
             .collect::<Result<Vec<_>>>()?;
-
-        // When requirement is empty, or it is signalled by outside caller that
-        // the ordering requirement is/will be satisfied.
-        let requirement_satisfied =
-            acc_args.ordering_req.is_empty() || self.requirement_satisfied;
-
-        FirstValueAccumulator::try_new(
+        Ok(Box::new(FirstValueAccumulator::try_new(
             acc_args.return_field.data_type(),
             &ordering_dtypes,
-            acc_args.ordering_req.clone(),
+            ordering,
+            self.is_input_pre_ordered,
             acc_args.ignore_nulls,
-        )
-        .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
+        )?))
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        let mut fields = vec![Field::new(
-            format_state_name(args.name, "first_value"),
-            args.return_type().clone(),
-            true,
-        )
-        .into()];
-        fields.extend(args.ordering_fields.to_vec());
-        fields.push(Field::new("is_set", DataType::Boolean, true).into());
+        let mut fields = vec![
+            Field::new(
+                format_state_name(args.name, "first_value"),
+                args.return_type().clone(),
+                true,
+            )
+            .into(),
+        ];
+        fields.extend(args.ordering_fields.iter().cloned());
+        fields.push(
+            Field::new(
+                format_state_name(args.name, "first_value_is_set"),
+                DataType::Boolean,
+                true,
+            )
+            .into(),
+        );
         Ok(fields)
     }
 
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
-        // TODO: extract to function
         use DataType::*;
-        matches!(
-            args.return_field.data_type(),
-            Int8 | Int16
-                | Int32
-                | Int64
-                | UInt8
-                | UInt16
-                | UInt32
-                | UInt64
-                | Float16
-                | Float32
-                | Float64
-                | Decimal128(_, _)
-                | Decimal256(_, _)
-                | Date32
-                | Date64
-                | Time32(_)
-                | Time64(_)
-                | Timestamp(_, _)
-        )
+        !args.order_bys.is_empty()
+            && matches!(
+                args.return_field.data_type(),
+                Int8 | Int16
+                    | Int32
+                    | Int64
+                    | UInt8
+                    | UInt16
+                    | UInt32
+                    | UInt64
+                    | Float16
+                    | Float32
+                    | Float64
+                    | Decimal32(_, _)
+                    | Decimal64(_, _)
+                    | Decimal128(_, _)
+                    | Decimal256(_, _)
+                    | Date32
+                    | Date64
+                    | Time32(_)
+                    | Time64(_)
+                    | Timestamp(_, _)
+            )
     }
 
     fn create_groups_accumulator(
         &self,
         args: AccumulatorArgs,
     ) -> Result<Box<dyn GroupsAccumulator>> {
-        // TODO: extract to function
-        fn create_accumulator<T>(
-            args: AccumulatorArgs,
-        ) -> Result<Box<dyn GroupsAccumulator>>
-        where
-            T: ArrowPrimitiveType + Send,
-        {
-            let ordering_dtypes = args
-                .ordering_req
+        fn create_accumulator<T: ArrowPrimitiveType + Send>(
+            args: &AccumulatorArgs,
+        ) -> Result<Box<dyn GroupsAccumulator>> {
+            let Some(ordering) = LexOrdering::new(args.order_bys.to_vec()) else {
+                return internal_err!("Groups accumulator must have an ordering.");
+            };
+
+            let ordering_dtypes = ordering
                 .iter()
                 .map(|e| e.expr.data_type(args.schema))
                 .collect::<Result<Vec<_>>>()?;
 
-            Ok(Box::new(FirstPrimitiveGroupsAccumulator::<T>::try_new(
-                args.ordering_req.clone(),
+            FirstPrimitiveGroupsAccumulator::<T>::try_new(
+                ordering,
                 args.ignore_nulls,
                 args.return_field.data_type(),
                 &ordering_dtypes,
                 true,
-            )?))
+            )
+            .map(|acc| Box::new(acc) as _)
         }
 
         match args.return_field.data_type() {
-            DataType::Int8 => create_accumulator::<Int8Type>(args),
-            DataType::Int16 => create_accumulator::<Int16Type>(args),
-            DataType::Int32 => create_accumulator::<Int32Type>(args),
-            DataType::Int64 => create_accumulator::<Int64Type>(args),
-            DataType::UInt8 => create_accumulator::<UInt8Type>(args),
-            DataType::UInt16 => create_accumulator::<UInt16Type>(args),
-            DataType::UInt32 => create_accumulator::<UInt32Type>(args),
-            DataType::UInt64 => create_accumulator::<UInt64Type>(args),
-            DataType::Float16 => create_accumulator::<Float16Type>(args),
-            DataType::Float32 => create_accumulator::<Float32Type>(args),
-            DataType::Float64 => create_accumulator::<Float64Type>(args),
-
-            DataType::Decimal128(_, _) => create_accumulator::<Decimal128Type>(args),
-            DataType::Decimal256(_, _) => create_accumulator::<Decimal256Type>(args),
+            DataType::Int8 => create_accumulator::<Int8Type>(&args),
+            DataType::Int16 => create_accumulator::<Int16Type>(&args),
+            DataType::Int32 => create_accumulator::<Int32Type>(&args),
+            DataType::Int64 => create_accumulator::<Int64Type>(&args),
+            DataType::UInt8 => create_accumulator::<UInt8Type>(&args),
+            DataType::UInt16 => create_accumulator::<UInt16Type>(&args),
+            DataType::UInt32 => create_accumulator::<UInt32Type>(&args),
+            DataType::UInt64 => create_accumulator::<UInt64Type>(&args),
+            DataType::Float16 => create_accumulator::<Float16Type>(&args),
+            DataType::Float32 => create_accumulator::<Float32Type>(&args),
+            DataType::Float64 => create_accumulator::<Float64Type>(&args),
+
+            DataType::Decimal32(_, _) => create_accumulator::<Decimal32Type>(&args),
+            DataType::Decimal64(_, _) => create_accumulator::<Decimal64Type>(&args),
+            DataType::Decimal128(_, _) => create_accumulator::<Decimal128Type>(&args),
+            DataType::Decimal256(_, _) => create_accumulator::<Decimal256Type>(&args),
 
             DataType::Timestamp(TimeUnit::Second, _) => {
-                create_accumulator::<TimestampSecondType>(args)
+                create_accumulator::<TimestampSecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Millisecond, _) => {
-                create_accumulator::<TimestampMillisecondType>(args)
+                create_accumulator::<TimestampMillisecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Microsecond, _) => {
-                create_accumulator::<TimestampMicrosecondType>(args)
+                create_accumulator::<TimestampMicrosecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Nanosecond, _) => {
-                create_accumulator::<TimestampNanosecondType>(args)
+                create_accumulator::<TimestampNanosecondType>(&args)
             }
 
-            DataType::Date32 => create_accumulator::<Date32Type>(args),
-            DataType::Date64 => create_accumulator::<Date64Type>(args),
+            DataType::Date32 => create_accumulator::<Date32Type>(&args),
+            DataType::Date64 => create_accumulator::<Date64Type>(&args),
             DataType::Time32(TimeUnit::Second) => {
-                create_accumulator::<Time32SecondType>(args)
+                create_accumulator::<Time32SecondType>(&args)
             }
             DataType::Time32(TimeUnit::Millisecond) => {
-                create_accumulator::<Time32MillisecondType>(args)
+                create_accumulator::<Time32MillisecondType>(&args)
             }
 
             DataType::Time64(TimeUnit::Microsecond) => {
-                create_accumulator::<Time64MicrosecondType>(args)
+                create_accumulator::<Time64MicrosecondType>(&args)
             }
             DataType::Time64(TimeUnit::Nanosecond) => {
-                create_accumulator::<Time64NanosecondType>(args)
+                create_accumulator::<Time64NanosecondType>(&args)
             }
 
-            _ => {
-                internal_err!(
-                    "GroupsAccumulator not supported for first_value({})",
-                    args.return_field.data_type()
-                )
-            }
+            _ => internal_err!(
+                "GroupsAccumulator not supported for first_value({})",
+                args.return_field.data_type()
+            ),
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn with_beneficial_ordering(
         self: Arc<Self>,
         beneficial_ordering: bool,
     ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
-        Ok(Some(Arc::new(
-            FirstValue::new().with_requirement_satisfied(beneficial_ordering),
-        )))
+        Ok(Some(Arc::new(Self {
+            signature: self.signature.clone(),
+            is_input_pre_ordered: beneficial_ordering,
+        })))
     }
 
     fn order_sensitivity(&self) -> AggregateOrderSensitivity {
         AggregateOrderSensitivity::Beneficial
     }
 
-    fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
-        datafusion_expr::ReversedUDAF::Reversed(last_value_udaf())
+    fn reverse_expr(&self) -> ReversedUDAF {
+        ReversedUDAF::Reversed(last_value_udaf())
+    }
+
+    fn supports_null_handling_clause(&self) -> bool {
+        true
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -350,8 +354,6 @@ where
     pick_first_in_group: bool,
     // derived from `ordering_req`.
     sort_options: Vec<SortOptions>,
-    // Stores whether incoming data already satisfies the ordering requirement.
-    input_requirement_satisfied: bool,
     // Ignore null values.
     ignore_nulls: bool,
     /// The output type
@@ -370,20 +372,17 @@ where
         ordering_dtypes: &[DataType],
         pick_first_in_group: bool,
     ) -> Result<Self> {
-        let requirement_satisfied = ordering_req.is_empty();
-
         let default_orderings = ordering_dtypes
             .iter()
             .map(ScalarValue::try_from)
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Result<_>>()?;
 
-        let sort_options = get_sort_options(ordering_req.as_ref());
+        let sort_options = get_sort_options(&ordering_req);
 
         Ok(Self {
             null_builder: BooleanBufferBuilder::new(0),
             ordering_req,
             sort_options,
-            input_requirement_satisfied: requirement_satisfied,
             ignore_nulls,
             default_orderings,
             data_type: data_type.clone(),
@@ -396,18 +395,6 @@ where
         })
     }
 
-    fn need_update(&self, group_idx: usize) -> bool {
-        if !self.is_sets.get_bit(group_idx) {
-            return true;
-        }
-
-        if self.ignore_nulls && !self.null_builder.get_bit(group_idx) {
-            return true;
-        }
-
-        !self.input_requirement_satisfied
-    }
-
     fn should_update_state(
         &self,
         group_idx: usize,
@@ -573,17 +560,12 @@ where
             let group_idx = *group_idx;
 
             let passed_filter = opt_filter.is_none_or(|x| x.value(idx_in_val));
-
             let is_set = is_set_arr.is_none_or(|x| x.value(idx_in_val));
 
             if !passed_filter || !is_set {
                 continue;
             }
 
-            if !self.need_update(group_idx) {
-                continue;
-            }
-
             if self.ignore_nulls && vals.is_null(idx_in_val) {
                 continue;
             }
@@ -720,7 +702,7 @@ where
 
         let (is_set_arr, val_and_order_cols) = match values.split_last() {
             Some(result) => result,
-            None => return internal_err!("Empty row in FISRT_VALUE"),
+            None => return internal_err!("Empty row in FIRST_VALUE"),
         };
 
         let is_set_arr = as_boolean_array(is_set_arr)?;
@@ -782,19 +764,103 @@ where
         }
     }
 }
+
+/// This accumulator is used when there is no ordering specified for the
+/// `FIRST_VALUE` aggregation. It simply returns the first value it sees
+/// according to the pre-existing ordering of the input data, and provides
+/// a fast path for this case without needing to maintain any ordering state.
+#[derive(Debug)]
+pub struct TrivialFirstValueAccumulator {
+    first: ScalarValue,
+    // Whether we have seen the first value yet.
+    is_set: bool,
+    // Ignore null values.
+    ignore_nulls: bool,
+}
+
+impl TrivialFirstValueAccumulator {
+    /// Creates a new `TrivialFirstValueAccumulator` for the given `data_type`.
+    pub fn try_new(data_type: &DataType, ignore_nulls: bool) -> Result<Self> {
+        ScalarValue::try_from(data_type).map(|first| Self {
+            first,
+            is_set: false,
+            ignore_nulls,
+        })
+    }
+}
+
+impl Accumulator for TrivialFirstValueAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![self.first.clone(), ScalarValue::from(self.is_set)])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        if !self.is_set {
+            // Get first entry according to the pre-existing ordering (0th index):
+            let value = &values[0];
+            let mut first_idx = None;
+            if self.ignore_nulls {
+                // If ignoring nulls, find the first non-null value.
+                for i in 0..value.len() {
+                    if !value.is_null(i) {
+                        first_idx = Some(i);
+                        break;
+                    }
+                }
+            } else if !value.is_empty() {
+                // If not ignoring nulls, return the first value if it exists.
+                first_idx = Some(0);
+            }
+            if let Some(first_idx) = first_idx {
+                let mut row = get_row_at_idx(values, first_idx)?;
+                self.first = row.swap_remove(0);
+                self.first.compact();
+                self.is_set = true;
+            }
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        // FIRST_VALUE(first1, first2, first3, ...)
+        // Second index contains is_set flag.
+        if !self.is_set {
+            let flags = states[1].as_boolean();
+            validate_is_set_flags(flags, "first_value")?;
+
+            let filtered_states =
+                filter_states_according_to_is_set(&states[0..1], flags)?;
+            if let Some(first) = filtered_states.first()
+                && !first.is_empty()
+            {
+                self.first = ScalarValue::try_from_array(first, 0)?;
+                self.is_set = true;
+            }
+        }
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.first.clone())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) - size_of_val(&self.first) + self.first.size()
+    }
+}
+
 #[derive(Debug)]
 pub struct FirstValueAccumulator {
     first: ScalarValue,
-    // At the beginning, `is_set` is false, which means `first` is not seen yet.
-    // Once we see the first value, we set the `is_set` flag and do not update `first` anymore.
+    // Whether we have seen the first value yet.
     is_set: bool,
-    // Stores ordering values, of the aggregator requirement corresponding to first value
-    // of the aggregator. These values are used during merging of multiple partitions.
+    // Stores values of the ordering columns corresponding to the first value.
+    // These values are used during merging of multiple partitions.
     orderings: Vec<ScalarValue>,
     // Stores the applicable ordering requirement.
     ordering_req: LexOrdering,
     // Stores whether incoming data already satisfies the ordering requirement.
-    requirement_satisfied: bool,
+    is_input_pre_ordered: bool,
     // Ignore null values.
     ignore_nulls: bool,
 }
@@ -805,35 +871,29 @@ impl FirstValueAccumulator {
         data_type: &DataType,
         ordering_dtypes: &[DataType],
         ordering_req: LexOrdering,
+        is_input_pre_ordered: bool,
         ignore_nulls: bool,
     ) -> Result<Self> {
         let orderings = ordering_dtypes
             .iter()
             .map(ScalarValue::try_from)
-            .collect::<Result<Vec<_>>>()?;
-        let requirement_satisfied = ordering_req.is_empty();
+            .collect::<Result<_>>()?;
         ScalarValue::try_from(data_type).map(|first| Self {
             first,
             is_set: false,
             orderings,
             ordering_req,
-            requirement_satisfied,
+            is_input_pre_ordered,
             ignore_nulls,
         })
     }
 
-    pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
-        self.requirement_satisfied = requirement_satisfied;
-        self
-    }
-
     // Updates state with the values in the given row.
     fn update_with_new_row(&mut self, mut row: Vec<ScalarValue>) {
         // Ensure any Array based scalars hold have a single value to reduce memory pressure
-        row.iter_mut().for_each(|s| {
+        for s in row.iter_mut() {
             s.compact();
-        });
-
+        }
         self.first = row.remove(0);
         self.orderings = row;
         self.is_set = true;
@@ -843,7 +903,7 @@ impl FirstValueAccumulator {
         let [value, ordering_values @ ..] = values else {
             return internal_err!("Empty row in FIRST_VALUE");
         };
-        if self.requirement_satisfied {
+        if self.is_input_pre_ordered {
             // Get first entry according to the pre-existing ordering (0th index):
             if self.ignore_nulls {
                 // If ignoring nulls, find the first non-null value.
@@ -886,30 +946,24 @@ impl Accumulator for FirstValueAccumulator {
     fn state(&mut self) -> Result<Vec<ScalarValue>> {
         let mut result = vec![self.first.clone()];
         result.extend(self.orderings.iter().cloned());
-        result.push(ScalarValue::Boolean(Some(self.is_set)));
+        result.push(ScalarValue::from(self.is_set));
         Ok(result)
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        if !self.is_set {
-            if let Some(first_idx) = self.get_first_idx(values)? {
-                let row = get_row_at_idx(values, first_idx)?;
+        if let Some(first_idx) = self.get_first_idx(values)? {
+            let row = get_row_at_idx(values, first_idx)?;
+            if !self.is_set
+                || (!self.is_input_pre_ordered
+                    && compare_rows(
+                        &self.orderings,
+                        &row[1..],
+                        &get_sort_options(&self.ordering_req),
+                    )?
+                    .is_gt())
+            {
                 self.update_with_new_row(row);
             }
-        } else if !self.requirement_satisfied {
-            if let Some(first_idx) = self.get_first_idx(values)? {
-                let row = get_row_at_idx(values, first_idx)?;
-                let orderings = &row[1..];
-                if compare_rows(
-                    &self.orderings,
-                    orderings,
-                    &get_sort_options(self.ordering_req.as_ref()),
-                )?
-                .is_gt()
-                {
-                    self.update_with_new_row(row);
-                }
-            }
         }
         Ok(())
     }
@@ -919,13 +973,13 @@ impl Accumulator for FirstValueAccumulator {
         // last index contains is_set flag.
         let is_set_idx = states.len() - 1;
         let flags = states[is_set_idx].as_boolean();
+        validate_is_set_flags(flags, "first_value")?;
+
         let filtered_states =
             filter_states_according_to_is_set(&states[0..is_set_idx], flags)?;
         // 1..is_set_idx range corresponds to ordering section
-        let sort_columns = convert_to_sort_cols(
-            &filtered_states[1..is_set_idx],
-            self.ordering_req.as_ref(),
-        );
+        let sort_columns =
+            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);
 
         let comparator = LexicographicalComparator::try_new(&sort_columns)?;
         let min = (0..filtered_states[0].len()).min_by(|&a, &b| comparator.compare(a, b));
@@ -934,7 +988,7 @@ impl Accumulator for FirstValueAccumulator {
             let mut first_row = get_row_at_idx(&filtered_states, first_idx)?;
             // When collecting orderings, we exclude the is_set flag from the state.
             let first_ordering = &first_row[1..is_set_idx];
-            let sort_options = get_sort_options(self.ordering_req.as_ref());
+            let sort_options = get_sort_options(&self.ordering_req);
             // Either there is no existing value, or there is an earlier version in new data.
             if !self.is_set
                 || compare_rows(&self.orderings, first_ordering, &sort_options)?.is_gt()
@@ -976,19 +1030,10 @@ impl Accumulator for FirstValueAccumulator {
 ```"#,
     standard_argument(name = "expression",)
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct LastValue {
     signature: Signature,
-    requirement_satisfied: bool,
-}
-
-impl Debug for LastValue {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("LastValue")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .field("accumulator", &"<FUNC>")
-            .finish()
-    }
+    is_input_pre_ordered: bool,
 }
 
 impl Default for LastValue {
@@ -1001,14 +1046,9 @@ impl LastValue {
     pub fn new() -> Self {
         Self {
             signature: Signature::any(1, Volatility::Immutable),
-            requirement_satisfied: false,
+            is_input_pre_ordered: false,
         }
     }
-
-    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
-        self.requirement_satisfied = requirement_satisfied;
-        self
-    }
 }
 
 impl AggregateUDFImpl for LastValue {
@@ -1024,67 +1064,84 @@ impl AggregateUDFImpl for LastValue {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        not_impl_err!("Not called because the return_field_from_args is implemented")
+    }
+
+    fn return_field(&self, arg_fields: &[FieldRef]) -> Result<FieldRef> {
+        // Preserve metadata from the first argument field
+        Ok(Arc::new(
+            Field::new(
+                self.name(),
+                arg_fields[0].data_type().clone(),
+                true, // always nullable, there may be no rows
+            )
+            .with_metadata(arg_fields[0].metadata().clone()),
+        ))
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
-        let ordering_dtypes = acc_args
-            .ordering_req
+        let Some(ordering) = LexOrdering::new(acc_args.order_bys.to_vec()) else {
+            return TrivialLastValueAccumulator::try_new(
+                acc_args.return_field.data_type(),
+                acc_args.ignore_nulls,
+            )
+            .map(|acc| Box::new(acc) as _);
+        };
+        let ordering_dtypes = ordering
             .iter()
             .map(|e| e.expr.data_type(acc_args.schema))
             .collect::<Result<Vec<_>>>()?;
-
-        let requirement_satisfied =
-            acc_args.ordering_req.is_empty() || self.requirement_satisfied;
-
-        LastValueAccumulator::try_new(
+        Ok(Box::new(LastValueAccumulator::try_new(
             acc_args.return_field.data_type(),
             &ordering_dtypes,
-            acc_args.ordering_req.clone(),
+            ordering,
+            self.is_input_pre_ordered,
             acc_args.ignore_nulls,
-        )
-        .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
+        )?))
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        let StateFieldsArgs {
-            name,
-            input_fields,
-            return_field: _,
-            ordering_fields,
-            is_distinct: _,
-        } = args;
-        let mut fields = vec![Field::new(
-            format_state_name(name, "last_value"),
-            input_fields[0].data_type().clone(),
-            true,
-        )
-        .into()];
-        fields.extend(ordering_fields.to_vec());
-        fields.push(Field::new("is_set", DataType::Boolean, true).into());
+        let mut fields = vec![
+            Field::new(
+                format_state_name(args.name, "last_value"),
+                args.return_field.data_type().clone(),
+                true,
+            )
+            .into(),
+        ];
+        fields.extend(args.ordering_fields.iter().cloned());
+        fields.push(
+            Field::new(
+                format_state_name(args.name, "last_value_is_set"),
+                DataType::Boolean,
+                true,
+            )
+            .into(),
+        );
         Ok(fields)
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn with_beneficial_ordering(
         self: Arc<Self>,
         beneficial_ordering: bool,
     ) -> Result<Option<Arc<dyn AggregateUDFImpl>>> {
-        Ok(Some(Arc::new(
-            LastValue::new().with_requirement_satisfied(beneficial_ordering),
-        )))
+        Ok(Some(Arc::new(Self {
+            signature: self.signature.clone(),
+            is_input_pre_ordered: beneficial_ordering,
+        })))
     }
 
     fn order_sensitivity(&self) -> AggregateOrderSensitivity {
         AggregateOrderSensitivity::Beneficial
     }
 
-    fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
-        datafusion_expr::ReversedUDAF::Reversed(first_value_udaf())
+    fn reverse_expr(&self) -> ReversedUDAF {
+        ReversedUDAF::Reversed(first_value_udaf())
+    }
+
+    fn supports_null_handling_clause(&self) -> bool {
+        true
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -1093,26 +1150,29 @@ impl AggregateUDFImpl for LastValue {
 
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         use DataType::*;
-        matches!(
-            args.return_field.data_type(),
-            Int8 | Int16
-                | Int32
-                | Int64
-                | UInt8
-                | UInt16
-                | UInt32
-                | UInt64
-                | Float16
-                | Float32
-                | Float64
-                | Decimal128(_, _)
-                | Decimal256(_, _)
-                | Date32
-                | Date64
-                | Time32(_)
-                | Time64(_)
-                | Timestamp(_, _)
-        )
+        !args.order_bys.is_empty()
+            && matches!(
+                args.return_field.data_type(),
+                Int8 | Int16
+                    | Int32
+                    | Int64
+                    | UInt8
+                    | UInt16
+                    | UInt32
+                    | UInt64
+                    | Float16
+                    | Float32
+                    | Float64
+                    | Decimal32(_, _)
+                    | Decimal64(_, _)
+                    | Decimal128(_, _)
+                    | Decimal256(_, _)
+                    | Date32
+                    | Date64
+                    | Time32(_)
+                    | Time64(_)
+                    | Timestamp(_, _)
+            )
     }
 
     fn create_groups_accumulator(
@@ -1120,19 +1180,22 @@ impl AggregateUDFImpl for LastValue {
         args: AccumulatorArgs,
     ) -> Result<Box<dyn GroupsAccumulator>> {
         fn create_accumulator<T>(
-            args: AccumulatorArgs,
+            args: &AccumulatorArgs,
         ) -> Result<Box<dyn GroupsAccumulator>>
         where
             T: ArrowPrimitiveType + Send,
         {
-            let ordering_dtypes = args
-                .ordering_req
+            let Some(ordering) = LexOrdering::new(args.order_bys.to_vec()) else {
+                return internal_err!("Groups accumulator must have an ordering.");
+            };
+
+            let ordering_dtypes = ordering
                 .iter()
                 .map(|e| e.expr.data_type(args.schema))
                 .collect::<Result<Vec<_>>>()?;
 
             Ok(Box::new(FirstPrimitiveGroupsAccumulator::<T>::try_new(
-                args.ordering_req.clone(),
+                ordering,
                 args.ignore_nulls,
                 args.return_field.data_type(),
                 &ordering_dtypes,
@@ -1141,48 +1204,50 @@ impl AggregateUDFImpl for LastValue {
         }
 
         match args.return_field.data_type() {
-            DataType::Int8 => create_accumulator::<Int8Type>(args),
-            DataType::Int16 => create_accumulator::<Int16Type>(args),
-            DataType::Int32 => create_accumulator::<Int32Type>(args),
-            DataType::Int64 => create_accumulator::<Int64Type>(args),
-            DataType::UInt8 => create_accumulator::<UInt8Type>(args),
-            DataType::UInt16 => create_accumulator::<UInt16Type>(args),
-            DataType::UInt32 => create_accumulator::<UInt32Type>(args),
-            DataType::UInt64 => create_accumulator::<UInt64Type>(args),
-            DataType::Float16 => create_accumulator::<Float16Type>(args),
-            DataType::Float32 => create_accumulator::<Float32Type>(args),
-            DataType::Float64 => create_accumulator::<Float64Type>(args),
-
-            DataType::Decimal128(_, _) => create_accumulator::<Decimal128Type>(args),
-            DataType::Decimal256(_, _) => create_accumulator::<Decimal256Type>(args),
+            DataType::Int8 => create_accumulator::<Int8Type>(&args),
+            DataType::Int16 => create_accumulator::<Int16Type>(&args),
+            DataType::Int32 => create_accumulator::<Int32Type>(&args),
+            DataType::Int64 => create_accumulator::<Int64Type>(&args),
+            DataType::UInt8 => create_accumulator::<UInt8Type>(&args),
+            DataType::UInt16 => create_accumulator::<UInt16Type>(&args),
+            DataType::UInt32 => create_accumulator::<UInt32Type>(&args),
+            DataType::UInt64 => create_accumulator::<UInt64Type>(&args),
+            DataType::Float16 => create_accumulator::<Float16Type>(&args),
+            DataType::Float32 => create_accumulator::<Float32Type>(&args),
+            DataType::Float64 => create_accumulator::<Float64Type>(&args),
+
+            DataType::Decimal32(_, _) => create_accumulator::<Decimal32Type>(&args),
+            DataType::Decimal64(_, _) => create_accumulator::<Decimal64Type>(&args),
+            DataType::Decimal128(_, _) => create_accumulator::<Decimal128Type>(&args),
+            DataType::Decimal256(_, _) => create_accumulator::<Decimal256Type>(&args),
 
             DataType::Timestamp(TimeUnit::Second, _) => {
-                create_accumulator::<TimestampSecondType>(args)
+                create_accumulator::<TimestampSecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Millisecond, _) => {
-                create_accumulator::<TimestampMillisecondType>(args)
+                create_accumulator::<TimestampMillisecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Microsecond, _) => {
-                create_accumulator::<TimestampMicrosecondType>(args)
+                create_accumulator::<TimestampMicrosecondType>(&args)
             }
             DataType::Timestamp(TimeUnit::Nanosecond, _) => {
-                create_accumulator::<TimestampNanosecondType>(args)
+                create_accumulator::<TimestampNanosecondType>(&args)
             }
 
-            DataType::Date32 => create_accumulator::<Date32Type>(args),
-            DataType::Date64 => create_accumulator::<Date64Type>(args),
+            DataType::Date32 => create_accumulator::<Date32Type>(&args),
+            DataType::Date64 => create_accumulator::<Date64Type>(&args),
             DataType::Time32(TimeUnit::Second) => {
-                create_accumulator::<Time32SecondType>(args)
+                create_accumulator::<Time32SecondType>(&args)
             }
             DataType::Time32(TimeUnit::Millisecond) => {
-                create_accumulator::<Time32MillisecondType>(args)
+                create_accumulator::<Time32MillisecondType>(&args)
             }
 
             DataType::Time64(TimeUnit::Microsecond) => {
-                create_accumulator::<Time64MicrosecondType>(args)
+                create_accumulator::<Time64MicrosecondType>(&args)
             }
             DataType::Time64(TimeUnit::Nanosecond) => {
-                create_accumulator::<Time64NanosecondType>(args)
+                create_accumulator::<Time64NanosecondType>(&args)
             }
 
             _ => {
@@ -1195,6 +1260,87 @@ impl AggregateUDFImpl for LastValue {
     }
 }
 
+/// This accumulator is used when there is no ordering specified for the
+/// `LAST_VALUE` aggregation. It simply updates the last value it sees
+/// according to the pre-existing ordering of the input data, and provides
+/// a fast path for this case without needing to maintain any ordering state.
+#[derive(Debug)]
+pub struct TrivialLastValueAccumulator {
+    last: ScalarValue,
+    // The `is_set` flag keeps track of whether the last value is finalized.
+    // This information is used to discriminate genuine NULLs and NULLS that
+    // occur due to empty partitions.
+    is_set: bool,
+    // Ignore null values.
+    ignore_nulls: bool,
+}
+
+impl TrivialLastValueAccumulator {
+    /// Creates a new `TrivialLastValueAccumulator` for the given `data_type`.
+    pub fn try_new(data_type: &DataType, ignore_nulls: bool) -> Result<Self> {
+        ScalarValue::try_from(data_type).map(|last| Self {
+            last,
+            is_set: false,
+            ignore_nulls,
+        })
+    }
+}
+
+impl Accumulator for TrivialLastValueAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![self.last.clone(), ScalarValue::from(self.is_set)])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        // Get last entry according to the pre-existing ordering (0th index):
+        let value = &values[0];
+        let mut last_idx = None;
+        if self.ignore_nulls {
+            // If ignoring nulls, find the last non-null value.
+            for i in (0..value.len()).rev() {
+                if !value.is_null(i) {
+                    last_idx = Some(i);
+                    break;
+                }
+            }
+        } else if !value.is_empty() {
+            // If not ignoring nulls, return the last value if it exists.
+            last_idx = Some(value.len() - 1);
+        }
+        if let Some(last_idx) = last_idx {
+            let mut row = get_row_at_idx(values, last_idx)?;
+            self.last = row.swap_remove(0);
+            self.last.compact();
+            self.is_set = true;
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        // LAST_VALUE(last1, last2, last3, ...)
+        // Second index contains is_set flag.
+        let flags = states[1].as_boolean();
+        validate_is_set_flags(flags, "last_value")?;
+
+        let filtered_states = filter_states_according_to_is_set(&states[0..1], flags)?;
+        if let Some(last) = filtered_states.last()
+            && !last.is_empty()
+        {
+            self.last = ScalarValue::try_from_array(last, 0)?;
+            self.is_set = true;
+        }
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.last.clone())
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) - size_of_val(&self.last) + self.last.size()
+    }
+}
+
 #[derive(Debug)]
 struct LastValueAccumulator {
     last: ScalarValue,
@@ -1202,11 +1348,13 @@ struct LastValueAccumulator {
     // This information is used to discriminate genuine NULLs and NULLS that
     // occur due to empty partitions.
     is_set: bool,
+    // Stores values of the ordering columns corresponding to the first value.
+    // These values are used during merging of multiple partitions.
     orderings: Vec<ScalarValue>,
     // Stores the applicable ordering requirement.
     ordering_req: LexOrdering,
     // Stores whether incoming data already satisfies the ordering requirement.
-    requirement_satisfied: bool,
+    is_input_pre_ordered: bool,
     // Ignore null values.
     ignore_nulls: bool,
 }
@@ -1217,19 +1365,19 @@ impl LastValueAccumulator {
         data_type: &DataType,
         ordering_dtypes: &[DataType],
         ordering_req: LexOrdering,
+        is_input_pre_ordered: bool,
         ignore_nulls: bool,
     ) -> Result<Self> {
         let orderings = ordering_dtypes
             .iter()
             .map(ScalarValue::try_from)
-            .collect::<Result<Vec<_>>>()?;
-        let requirement_satisfied = ordering_req.is_empty();
+            .collect::<Result<_>>()?;
         ScalarValue::try_from(data_type).map(|last| Self {
             last,
             is_set: false,
             orderings,
             ordering_req,
-            requirement_satisfied,
+            is_input_pre_ordered,
             ignore_nulls,
         })
     }
@@ -1237,10 +1385,9 @@ impl LastValueAccumulator {
     // Updates state with the values in the given row.
     fn update_with_new_row(&mut self, mut row: Vec<ScalarValue>) {
         // Ensure any Array based scalars hold have a single value to reduce memory pressure
-        row.iter_mut().for_each(|s| {
+        for s in row.iter_mut() {
             s.compact();
-        });
-
+        }
         self.last = row.remove(0);
         self.orderings = row;
         self.is_set = true;
@@ -1250,7 +1397,7 @@ impl LastValueAccumulator {
         let [value, ordering_values @ ..] = values else {
             return internal_err!("Empty row in LAST_VALUE");
         };
-        if self.requirement_satisfied {
+        if self.is_input_pre_ordered {
             // Get last entry according to the order of data:
             if self.ignore_nulls {
                 // If ignoring nulls, find the last non-null value.
@@ -1264,6 +1411,7 @@ impl LastValueAccumulator {
                 return Ok((!value.is_empty()).then_some(value.len() - 1));
             }
         }
+
         let sort_columns = ordering_values
             .iter()
             .zip(self.ordering_req.iter())
@@ -1284,42 +1432,33 @@ impl LastValueAccumulator {
 
         Ok(max_ind)
     }
-
-    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
-        self.requirement_satisfied = requirement_satisfied;
-        self
-    }
 }
 
 impl Accumulator for LastValueAccumulator {
     fn state(&mut self) -> Result<Vec<ScalarValue>> {
         let mut result = vec![self.last.clone()];
         result.extend(self.orderings.clone());
-        result.push(ScalarValue::Boolean(Some(self.is_set)));
+        result.push(ScalarValue::from(self.is_set));
         Ok(result)
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        if !self.is_set || self.requirement_satisfied {
-            if let Some(last_idx) = self.get_last_idx(values)? {
-                let row = get_row_at_idx(values, last_idx)?;
-                self.update_with_new_row(row);
-            }
-        } else if let Some(last_idx) = self.get_last_idx(values)? {
+        if let Some(last_idx) = self.get_last_idx(values)? {
             let row = get_row_at_idx(values, last_idx)?;
             let orderings = &row[1..];
             // Update when there is a more recent entry
-            if compare_rows(
-                &self.orderings,
-                orderings,
-                &get_sort_options(self.ordering_req.as_ref()),
-            )?
-            .is_lt()
+            if !self.is_set
+                || self.is_input_pre_ordered
+                || compare_rows(
+                    &self.orderings,
+                    orderings,
+                    &get_sort_options(&self.ordering_req),
+                )?
+                .is_lt()
             {
                 self.update_with_new_row(row);
             }
         }
-
         Ok(())
     }
 
@@ -1328,13 +1467,13 @@ impl Accumulator for LastValueAccumulator {
         // last index contains is_set flag.
         let is_set_idx = states.len() - 1;
         let flags = states[is_set_idx].as_boolean();
+        validate_is_set_flags(flags, "last_value")?;
+
         let filtered_states =
             filter_states_according_to_is_set(&states[0..is_set_idx], flags)?;
         // 1..is_set_idx range corresponds to ordering section
-        let sort_columns = convert_to_sort_cols(
-            &filtered_states[1..is_set_idx],
-            self.ordering_req.as_ref(),
-        );
+        let sort_columns =
+            convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req);
 
         let comparator = LexicographicalComparator::try_new(&sort_columns)?;
         let max = (0..filtered_states[0].len()).max_by(|&a, &b| comparator.compare(a, b));
@@ -1343,11 +1482,11 @@ impl Accumulator for LastValueAccumulator {
             let mut last_row = get_row_at_idx(&filtered_states, last_idx)?;
             // When collecting orderings, we exclude the is_set flag from the state.
             let last_ordering = &last_row[1..is_set_idx];
-            let sort_options = get_sort_options(self.ordering_req.as_ref());
+            let sort_options = get_sort_options(&self.ordering_req);
             // Either there is no existing value, or there is a newer (latest)
             // version in the new data:
             if !self.is_set
-                || self.requirement_satisfied
+                || self.is_input_pre_ordered
                 || compare_rows(&self.orderings, last_ordering, &sort_options)?.is_lt()
             {
                 // Update with last value in the state. Note that we should exclude the
@@ -1373,6 +1512,16 @@ impl Accumulator for LastValueAccumulator {
     }
 }
 
+/// Validates that `is_set flags` do not contain NULL values.
+fn validate_is_set_flags(flags: &BooleanArray, function_name: &str) -> Result<()> {
+    if flags.null_count() > 0 {
+        return Err(DataFusionError::Internal(format!(
+            "{function_name}: is_set flags contain nulls"
+        )));
+    }
+    Ok(())
+}
+
 /// Filters states according to the `is_set` flag at the last column and returns
 /// the resulting states.
 fn filter_states_according_to_is_set(
@@ -1382,7 +1531,7 @@ fn filter_states_according_to_is_set(
     states
         .iter()
         .map(|state| compute::filter(state, flags).map_err(|e| arrow_datafusion_err!(e)))
-        .collect::<Result<Vec<_>>>()
+        .collect()
 }
 
 /// Combines array refs and their corresponding orderings to construct `SortColumn`s.
@@ -1393,7 +1542,7 @@ fn convert_to_sort_cols(arrs: &[ArrayRef], sort_exprs: &LexOrdering) -> Vec<Sort
             values: Arc::clone(item),
             options: Some(sort_expr.options),
         })
-        .collect::<Vec<_>>()
+        .collect()
 }
 
 #[cfg(test)]
@@ -1401,28 +1550,20 @@ mod tests {
     use std::iter::repeat_with;
 
     use arrow::{
-        array::{Int64Array, ListArray},
+        array::{BooleanArray, Int64Array, ListArray, StringArray},
         compute::SortOptions,
         datatypes::Schema,
     };
-    use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
+    use datafusion_physical_expr::{PhysicalSortExpr, expressions::col};
 
     use super::*;
 
     #[test]
     fn test_first_last_value_value() -> Result<()> {
-        let mut first_accumulator = FirstValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
-        let mut last_accumulator = LastValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut first_accumulator =
+            TrivialFirstValueAccumulator::try_new(&DataType::Int64, false)?;
+        let mut last_accumulator =
+            TrivialLastValueAccumulator::try_new(&DataType::Int64, false)?;
         // first value in the tuple is start of the range (inclusive),
         // second value in the tuple is end of the range (exclusive)
         let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)];
@@ -1459,22 +1600,14 @@ mod tests {
             .collect::<Vec<_>>();
 
         // FirstValueAccumulator
-        let mut first_accumulator = FirstValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut first_accumulator =
+            TrivialFirstValueAccumulator::try_new(&DataType::Int64, false)?;
 
         first_accumulator.update_batch(&[Arc::clone(&arrs[0])])?;
         let state1 = first_accumulator.state()?;
 
-        let mut first_accumulator = FirstValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut first_accumulator =
+            TrivialFirstValueAccumulator::try_new(&DataType::Int64, false)?;
         first_accumulator.update_batch(&[Arc::clone(&arrs[1])])?;
         let state2 = first_accumulator.state()?;
 
@@ -1489,34 +1622,22 @@ mod tests {
             ])?);
         }
 
-        let mut first_accumulator = FirstValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut first_accumulator =
+            TrivialFirstValueAccumulator::try_new(&DataType::Int64, false)?;
         first_accumulator.merge_batch(&states)?;
 
         let merged_state = first_accumulator.state()?;
         assert_eq!(merged_state.len(), state1.len());
 
         // LastValueAccumulator
-        let mut last_accumulator = LastValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut last_accumulator =
+            TrivialLastValueAccumulator::try_new(&DataType::Int64, false)?;
 
         last_accumulator.update_batch(&[Arc::clone(&arrs[0])])?;
         let state1 = last_accumulator.state()?;
 
-        let mut last_accumulator = LastValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut last_accumulator =
+            TrivialLastValueAccumulator::try_new(&DataType::Int64, false)?;
         last_accumulator.update_batch(&[Arc::clone(&arrs[1])])?;
         let state2 = last_accumulator.state()?;
 
@@ -1531,12 +1652,8 @@ mod tests {
             ])?);
         }
 
-        let mut last_accumulator = LastValueAccumulator::try_new(
-            &DataType::Int64,
-            &[],
-            LexOrdering::default(),
-            false,
-        )?;
+        let mut last_accumulator =
+            TrivialLastValueAccumulator::try_new(&DataType::Int64, false)?;
         last_accumulator.merge_batch(&states)?;
 
         let merged_state = last_accumulator.state()?;
@@ -1546,7 +1663,7 @@ mod tests {
     }
 
     #[test]
-    fn test_frist_group_acc() -> Result<()> {
+    fn test_first_group_acc() -> Result<()> {
         let schema = Arc::new(Schema::new(vec![
             Field::new("a", DataType::Int64, true),
             Field::new("b", DataType::Int64, true),
@@ -1555,13 +1672,13 @@ mod tests {
             Field::new("e", DataType::Boolean, true),
         ]));
 
-        let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort_keys = [PhysicalSortExpr {
             expr: col("c", &schema).unwrap(),
             options: SortOptions::default(),
-        }]);
+        }];
 
         let mut group_acc = FirstPrimitiveGroupsAccumulator::<Int64Type>::try_new(
-            sort_key,
+            sort_keys.into(),
             true,
             &DataType::Int64,
             &[DataType::Int64],
@@ -1649,13 +1766,13 @@ mod tests {
             Field::new("e", DataType::Boolean, true),
         ]));
 
-        let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort_keys = [PhysicalSortExpr {
             expr: col("c", &schema).unwrap(),
             options: SortOptions::default(),
-        }]);
+        }];
 
         let mut group_acc = FirstPrimitiveGroupsAccumulator::<Int64Type>::try_new(
-            sort_key,
+            sort_keys.into(),
             true,
             &DataType::Int64,
             &[DataType::Int64],
@@ -1730,13 +1847,13 @@ mod tests {
             Field::new("e", DataType::Boolean, true),
         ]));
 
-        let sort_key = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort_keys = [PhysicalSortExpr {
             expr: col("c", &schema).unwrap(),
             options: SortOptions::default(),
-        }]);
+        }];
 
         let mut group_acc = FirstPrimitiveGroupsAccumulator::<Int64Type>::try_new(
-            sort_key,
+            sort_keys.into(),
             true,
             &DataType::Int64,
             &[DataType::Int64],
@@ -1798,10 +1915,8 @@ mod tests {
     #[test]
     fn test_first_list_acc_size() -> Result<()> {
         fn size_after_batch(values: &[ArrayRef]) -> Result<usize> {
-            let mut first_accumulator = FirstValueAccumulator::try_new(
+            let mut first_accumulator = TrivialFirstValueAccumulator::try_new(
                 &DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
-                &[],
-                LexOrdering::default(),
                 false,
             )?;
 
@@ -1826,10 +1941,8 @@ mod tests {
     #[test]
     fn test_last_list_acc_size() -> Result<()> {
         fn size_after_batch(values: &[ArrayRef]) -> Result<usize> {
-            let mut last_accumulator = LastValueAccumulator::try_new(
+            let mut last_accumulator = TrivialLastValueAccumulator::try_new(
                 &DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
-                &[],
-                LexOrdering::default(),
                 false,
             )?;
 
@@ -1850,4 +1963,98 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_first_value_merge_with_is_set_nulls() -> Result<()> {
+        // Test data with corrupted is_set flag
+        let value = Arc::new(StringArray::from(vec![Some("first_string")])) as ArrayRef;
+        let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef;
+
+        // Test TrivialFirstValueAccumulator
+        let mut trivial_accumulator =
+            TrivialFirstValueAccumulator::try_new(&DataType::Utf8, false)?;
+        let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)];
+        let result = trivial_accumulator.merge_batch(&trivial_states);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("is_set flags contain nulls")
+        );
+
+        // Test FirstValueAccumulator (with ordering)
+        let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]);
+        let ordering_expr = col("ordering", &schema)?;
+        let mut ordered_accumulator = FirstValueAccumulator::try_new(
+            &DataType::Utf8,
+            &[DataType::Int64],
+            LexOrdering::new(vec![PhysicalSortExpr {
+                expr: ordering_expr,
+                options: SortOptions::default(),
+            }])
+            .unwrap(),
+            false,
+            false,
+        )?;
+        let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef;
+        let ordered_states = vec![value, ordering, corrupted_flag];
+        let result = ordered_accumulator.merge_batch(&ordered_states);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("is_set flags contain nulls")
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_last_value_merge_with_is_set_nulls() -> Result<()> {
+        // Test data with corrupted is_set flag
+        let value = Arc::new(StringArray::from(vec![Some("last_string")])) as ArrayRef;
+        let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef;
+
+        // Test TrivialLastValueAccumulator
+        let mut trivial_accumulator =
+            TrivialLastValueAccumulator::try_new(&DataType::Utf8, false)?;
+        let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)];
+        let result = trivial_accumulator.merge_batch(&trivial_states);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("is_set flags contain nulls")
+        );
+
+        // Test LastValueAccumulator (with ordering)
+        let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]);
+        let ordering_expr = col("ordering", &schema)?;
+        let mut ordered_accumulator = LastValueAccumulator::try_new(
+            &DataType::Utf8,
+            &[DataType::Int64],
+            LexOrdering::new(vec![PhysicalSortExpr {
+                expr: ordering_expr,
+                options: SortOptions::default(),
+            }])
+            .unwrap(),
+            false,
+            false,
+        )?;
+        let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef;
+        let ordered_states = vec![value, ordering, corrupted_flag];
+        let result = ordered_accumulator.merge_batch(&ordered_states);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("is_set flags contain nulls")
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/functions-aggregate/src/grouping.rs b/datafusion/functions-aggregate/src/grouping.rs
index 0727cf33036a0..c7af2df4b10fc 100644
--- a/datafusion/functions-aggregate/src/grouping.rs
+++ b/datafusion/functions-aggregate/src/grouping.rs
@@ -18,11 +18,10 @@
 //! Defines physical expressions that can evaluated at runtime during query execution
 
 use std::any::Any;
-use std::fmt;
 
 use arrow::datatypes::Field;
 use arrow::datatypes::{DataType, FieldRef};
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_expr::function::AccumulatorArgs;
 use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::utils::format_state_name;
@@ -60,19 +59,11 @@ make_udaf_expr_and_func!(
         description = "Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function."
     )
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct Grouping {
     signature: Signature,
 }
 
-impl fmt::Debug for Grouping {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Grouping")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for Grouping {
     fn default() -> Self {
         Self::new()
@@ -106,12 +97,14 @@ impl AggregateUDFImpl for Grouping {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        Ok(vec![Field::new(
-            format_state_name(args.name, "grouping"),
-            DataType::Int32,
-            true,
-        )
-        .into()])
+        Ok(vec![
+            Field::new(
+                format_state_name(args.name, "grouping"),
+                DataType::Int32,
+                true,
+            )
+            .into(),
+        ])
     }
 
     fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
diff --git a/datafusion/functions-aggregate/src/hyperloglog.rs b/datafusion/functions-aggregate/src/hyperloglog.rs
index 3074889eab23c..3a467a811176d 100644
--- a/datafusion/functions-aggregate/src/hyperloglog.rs
+++ b/datafusion/functions-aggregate/src/hyperloglog.rs
@@ -34,7 +34,7 @@
 //!
 //! This module also borrows some code structure from [pdatastructs.rs](https://github.com/crepererum/pdatastructs.rs/blob/3997ed50f6b6871c9e53c4c5e0f48f431405fc63/src/hyperloglog.rs).
 
-use ahash::RandomState;
+use std::hash::BuildHasher;
 use std::hash::Hash;
 use std::marker::PhantomData;
 
@@ -61,12 +61,7 @@ where
 /// shared across cluster, this SEED will have to be consistent across all
 /// parties otherwise we might have corruption. So ideally for later this seed
 /// shall be part of the serialized form (or stay unchanged across versions).
-const SEED: RandomState = RandomState::with_seeds(
-    0x885f6cab121d01a3_u64,
-    0x71e4379f2976ad8f_u64,
-    0xbf30173dd28a8816_u64,
-    0x0eaea5d736d733a4_u64,
-);
+const SEED: foldhash::quality::FixedState = foldhash::quality::FixedState::with_seed(0);
 
 impl<T> Default for HyperLogLog<T>
 where
@@ -97,7 +92,7 @@ where
         }
     }
 
-    /// choice of hash function: ahash is already an dependency
+    /// choice of hash function: foldhash is already an dependency
     /// and it fits the requirements of being a 64bit hash with
     /// reasonable performance.
     #[inline]
diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs
index b5bb69f6da9d8..1b9996220d882 100644
--- a/datafusion/functions-aggregate/src/lib.rs
+++ b/datafusion/functions-aggregate/src/lib.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
@@ -81,6 +82,7 @@ pub mod hyperloglog;
 pub mod median;
 pub mod min_max;
 pub mod nth_value;
+pub mod percentile_cont;
 pub mod regr;
 pub mod stddev;
 pub mod string_agg;
@@ -88,6 +90,7 @@ pub mod sum;
 pub mod variance;
 
 pub mod planner;
+mod utils;
 
 use crate::approx_percentile_cont::approx_percentile_cont_udaf;
 use crate::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight_udaf;
@@ -105,6 +108,7 @@ pub mod expr_fn {
     pub use super::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight;
     pub use super::array_agg::array_agg;
     pub use super::average::avg;
+    pub use super::average::avg_distinct;
     pub use super::bit_and_or_xor::bit_and;
     pub use super::bit_and_or_xor::bit_or;
     pub use super::bit_and_or_xor::bit_xor;
@@ -122,6 +126,7 @@ pub mod expr_fn {
     pub use super::min_max::max;
     pub use super::min_max::min;
     pub use super::nth_value::nth_value;
+    pub use super::percentile_cont::percentile_cont;
     pub use super::regr::regr_avgx;
     pub use super::regr::regr_avgy;
     pub use super::regr::regr_count;
@@ -134,6 +139,7 @@ pub mod expr_fn {
     pub use super::stddev::stddev;
     pub use super::stddev::stddev_pop;
     pub use super::sum::sum;
+    pub use super::sum::sum_distinct;
     pub use super::variance::var_pop;
     pub use super::variance::var_sample;
 }
@@ -169,6 +175,7 @@ pub fn all_default_aggregate_functions() -> Vec<Arc<AggregateUDF>> {
         approx_distinct::approx_distinct_udaf(),
         approx_percentile_cont_udaf(),
         approx_percentile_cont_with_weight_udaf(),
+        percentile_cont::percentile_cont_udaf(),
         string_agg::string_agg_udaf(),
         bit_and_or_xor::bit_and_udaf(),
         bit_and_or_xor::bit_or_udaf(),
@@ -205,13 +212,7 @@ mod tests {
     #[test]
     fn test_no_duplicate_name() -> Result<()> {
         let mut names = HashSet::new();
-        let migrated_functions = ["array_agg", "count", "max", "min"];
         for func in all_default_aggregate_functions() {
-            // TODO: remove this
-            // These functions are in intermediate migration state, skip them
-            if migrated_functions.contains(&func.name().to_lowercase().as_str()) {
-                continue;
-            }
             assert!(
                 names.insert(func.name().to_string().to_lowercase()),
                 "duplicate function name: {}",
diff --git a/datafusion/functions-aggregate/src/macros.rs b/datafusion/functions-aggregate/src/macros.rs
index 18f27c3c4ae3b..0c919a1e5ea74 100644
--- a/datafusion/functions-aggregate/src/macros.rs
+++ b/datafusion/functions-aggregate/src/macros.rs
@@ -28,7 +28,7 @@ macro_rules! make_udaf_expr {
                 vec![$($arg),*],
                 false,
                 None,
-                None,
+                vec![],
                 None,
             ))
         }
@@ -52,7 +52,7 @@ macro_rules! make_udaf_expr_and_func {
                 args,
                 false,
                 None,
-                None,
+                vec![],
                 None,
             ))
         }
@@ -67,7 +67,6 @@ macro_rules! create_func {
         create_func!($UDAF, $AGGREGATE_UDF_FN, <$UDAF>::default());
     };
     ($UDAF:ty, $AGGREGATE_UDF_FN:ident, $CREATE:expr) => {
-        paste::paste! {
             #[doc = concat!("AggregateFunction that returns a [`AggregateUDF`](datafusion_expr::AggregateUDF) for [`", stringify!($UDAF), "`]")]
             pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<datafusion_expr::AggregateUDF> {
                 // Singleton instance of [$UDAF], ensures the UDAF is only created once
@@ -76,7 +75,6 @@ macro_rules! create_func {
                         std::sync::Arc::new(datafusion_expr::AggregateUDF::from($CREATE))
                     });
                 std::sync::Arc::clone(&INSTANCE)
-            }
         }
     }
 }
diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs
index bfaea4b2398cc..db769918d1353 100644
--- a/datafusion/functions-aggregate/src/median.rs
+++ b/datafusion/functions-aggregate/src/median.rs
@@ -21,8 +21,8 @@ use std::mem::{size_of, size_of_val};
 use std::sync::Arc;
 
 use arrow::array::{
-    downcast_integer, ArrowNumericType, BooleanArray, ListArray, PrimitiveArray,
-    PrimitiveBuilder,
+    ArrowNumericType, BooleanArray, ListArray, PrimitiveArray, PrimitiveBuilder,
+    downcast_integer,
 };
 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
 use arrow::{
@@ -35,21 +35,25 @@ use arrow::{
 
 use arrow::array::Array;
 use arrow::array::ArrowNativeTypeOp;
-use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, FieldRef};
+use arrow::datatypes::{
+    ArrowNativeType, ArrowPrimitiveType, Decimal32Type, Decimal64Type, FieldRef,
+};
 
 use datafusion_common::{
-    internal_datafusion_err, internal_err, DataFusionError, HashSet, Result, ScalarValue,
+    DataFusionError, Result, ScalarValue, assert_eq_or_internal_err,
+    internal_datafusion_err,
 };
 use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::{
-    function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl,
-    Documentation, Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
+    function::AccumulatorArgs, utils::format_state_name,
 };
 use datafusion_expr::{EmitTo, GroupsAccumulator};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate;
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::filtered_null_mask;
-use datafusion_functions_aggregate_common::utils::Hashable;
+use datafusion_functions_aggregate_common::utils::GenericDistinctBuffer;
 use datafusion_macros::user_doc;
+use std::collections::HashMap;
 
 make_udaf_expr_and_func!(
     Median,
@@ -81,19 +85,11 @@ make_udaf_expr_and_func!(
 /// If using the distinct variation, the memory usage will be similarly high if the
 /// cardinality is high as it stores all distinct values in memory before computing the
 /// result, but if cardinality is low then memory usage will also be lower.
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct Median {
     signature: Signature,
 }
 
-impl Debug for Median {
-    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
-        f.debug_struct("Median")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for Median {
     fn default() -> Self {
         Self::new()
@@ -134,12 +130,14 @@ impl AggregateUDFImpl for Median {
             "median"
         };
 
-        Ok(vec![Field::new(
-            format_state_name(args.name, state_name),
-            DataType::List(Arc::new(field)),
-            true,
-        )
-        .into()])
+        Ok(vec![
+            Field::new(
+                format_state_name(args.name, state_name),
+                DataType::List(Arc::new(field)),
+                true,
+            )
+            .into(),
+        ])
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
@@ -148,7 +146,7 @@ impl AggregateUDFImpl for Median {
                 if acc_args.is_distinct {
                     Ok(Box::new(DistinctMedianAccumulator::<$t> {
                         data_type: $dt.clone(),
-                        distinct_values: HashSet::new(),
+                        distinct_values: GenericDistinctBuffer::new($dt),
                     }))
                 } else {
                     Ok(Box::new(MedianAccumulator::<$t> {
@@ -159,12 +157,14 @@ impl AggregateUDFImpl for Median {
             };
         }
 
-        let dt = acc_args.exprs[0].data_type(acc_args.schema)?;
+        let dt = acc_args.expr_fields[0].data_type().clone();
         downcast_integer! {
             dt => (helper, dt),
             DataType::Float16 => helper!(Float16Type, dt),
             DataType::Float32 => helper!(Float32Type, dt),
             DataType::Float64 => helper!(Float64Type, dt),
+            DataType::Decimal32(_, _) => helper!(Decimal32Type, dt),
+            DataType::Decimal64(_, _) => helper!(Decimal64Type, dt),
             DataType::Decimal128(_, _) => helper!(Decimal128Type, dt),
             DataType::Decimal256(_, _) => helper!(Decimal256Type, dt),
             _ => Err(DataFusionError::NotImplemented(format!(
@@ -184,14 +184,14 @@ impl AggregateUDFImpl for Median {
         args: AccumulatorArgs,
     ) -> Result<Box<dyn GroupsAccumulator>> {
         let num_args = args.exprs.len();
-        if num_args != 1 {
-            return internal_err!(
-                "median should only have 1 arg, but found num args:{}",
-                args.exprs.len()
-            );
-        }
+        assert_eq_or_internal_err!(
+            num_args,
+            1,
+            "median should only have 1 arg, but found num args:{}",
+            num_args
+        );
 
-        let dt = args.exprs[0].data_type(args.schema)?;
+        let dt = args.expr_fields[0].data_type().clone();
 
         macro_rules! helper {
             ($t:ty, $dt:expr) => {
@@ -204,6 +204,8 @@ impl AggregateUDFImpl for Median {
             DataType::Float16 => helper!(Float16Type, dt),
             DataType::Float32 => helper!(Float32Type, dt),
             DataType::Float64 => helper!(Float64Type, dt),
+            DataType::Decimal32(_, _) => helper!(Decimal32Type, dt),
+            DataType::Decimal64(_, _) => helper!(Decimal64Type, dt),
             DataType::Decimal128(_, _) => helper!(Decimal128Type, dt),
             DataType::Decimal256(_, _) => helper!(Decimal256Type, dt),
             _ => Err(DataFusionError::NotImplemented(format!(
@@ -214,10 +216,6 @@ impl AggregateUDFImpl for Median {
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -283,14 +281,51 @@ impl<T: ArrowNumericType> Accumulator for MedianAccumulator<T> {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        let d = std::mem::take(&mut self.all_values);
-        let median = calculate_median::<T>(d);
+        let median = calculate_median::<T>(&mut self.all_values);
         ScalarValue::new_primitive::<T>(median, &self.data_type)
     }
 
     fn size(&self) -> usize {
         size_of_val(self) + self.all_values.capacity() * size_of::<T::Native>()
     }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let mut to_remove: HashMap<ScalarValue, usize> = HashMap::new();
+
+        let arr = &values[0];
+        for i in 0..arr.len() {
+            let v = ScalarValue::try_from_array(arr, i)?;
+            if !v.is_null() {
+                *to_remove.entry(v).or_default() += 1;
+            }
+        }
+
+        let mut i = 0;
+        while i < self.all_values.len() {
+            let k = ScalarValue::new_primitive::<T>(
+                Some(self.all_values[i]),
+                &self.data_type,
+            )?;
+            if let Some(count) = to_remove.get_mut(&k)
+                && *count > 0
+            {
+                self.all_values.swap_remove(i);
+                *count -= 1;
+                if *count == 0 {
+                    to_remove.remove(&k);
+                    if to_remove.is_empty() {
+                        break;
+                    }
+                }
+            }
+            i += 1;
+        }
+        Ok(())
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        true
+    }
 }
 
 /// The median groups accumulator accumulates the raw input values
@@ -299,7 +334,6 @@ impl<T: ArrowNumericType> Accumulator for MedianAccumulator<T> {
 /// of groups before final evaluation.
 /// So values in each group will be stored in a `Vec<T>`, and the total group values
 /// will be actually organized as a `Vec<Vec<T>>`.
-///
 #[derive(Debug)]
 struct MedianGroupsAccumulator<T: ArrowNumericType + Send> {
     data_type: DataType,
@@ -438,8 +472,8 @@ impl<T: ArrowNumericType + Send> GroupsAccumulator for MedianGroupsAccumulator<T
         // Calculate median for each group
         let mut evaluate_result_builder =
             PrimitiveBuilder::<T>::new().with_data_type(self.data_type.clone());
-        for values in emit_group_values {
-            let median = calculate_median::<T>(values);
+        for mut values in emit_group_values {
+            let median = calculate_median::<T>(&mut values);
             evaluate_result_builder.append_option(median);
         }
 
@@ -503,74 +537,34 @@ impl<T: ArrowNumericType + Send> GroupsAccumulator for MedianGroupsAccumulator<T
     }
 }
 
-/// The distinct median accumulator accumulates the raw input values
-/// as `ScalarValue`s
-///
-/// The intermediate state is represented as a List of scalar values updated by
-/// `merge_batch` and a `Vec` of `ArrayRef` that are converted to scalar values
-/// in the final evaluation step so that we avoid expensive conversions and
-/// allocations during `update_batch`.
+#[derive(Debug)]
 struct DistinctMedianAccumulator<T: ArrowNumericType> {
+    distinct_values: GenericDistinctBuffer<T>,
     data_type: DataType,
-    distinct_values: HashSet<Hashable<T::Native>>,
 }
 
-impl<T: ArrowNumericType> Debug for DistinctMedianAccumulator<T> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "DistinctMedianAccumulator({})", self.data_type)
-    }
-}
-
-impl<T: ArrowNumericType> Accumulator for DistinctMedianAccumulator<T> {
+impl<T: ArrowNumericType + Debug> Accumulator for DistinctMedianAccumulator<T> {
     fn state(&mut self) -> Result<Vec<ScalarValue>> {
-        let all_values = self
-            .distinct_values
-            .iter()
-            .map(|x| ScalarValue::new_primitive::<T>(Some(x.0), &self.data_type))
-            .collect::<Result<Vec<_>>>()?;
-
-        let arr = ScalarValue::new_list_nullable(&all_values, &self.data_type);
-        Ok(vec![ScalarValue::List(arr)])
+        self.distinct_values.state()
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        if values.is_empty() {
-            return Ok(());
-        }
-
-        let array = values[0].as_primitive::<T>();
-        match array.nulls().filter(|x| x.null_count() > 0) {
-            Some(n) => {
-                for idx in n.valid_indices() {
-                    self.distinct_values.insert(Hashable(array.value(idx)));
-                }
-            }
-            None => array.values().iter().for_each(|x| {
-                self.distinct_values.insert(Hashable(*x));
-            }),
-        }
-        Ok(())
+        self.distinct_values.update_batch(values)
     }
 
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        let array = states[0].as_list::<i32>();
-        for v in array.iter().flatten() {
-            self.update_batch(&[v])?
-        }
-        Ok(())
+        self.distinct_values.merge_batch(states)
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
-        let d = std::mem::take(&mut self.distinct_values)
-            .into_iter()
-            .map(|v| v.0)
-            .collect::<Vec<_>>();
-        let median = calculate_median::<T>(d);
+        let mut d: Vec<T::Native> =
+            self.distinct_values.values.iter().map(|v| v.0).collect();
+        let median = calculate_median::<T>(&mut d);
         ScalarValue::new_primitive::<T>(median, &self.data_type)
     }
 
     fn size(&self) -> usize {
-        size_of_val(self) + self.distinct_values.capacity() * size_of::<T::Native>()
+        size_of_val(self) + self.distinct_values.size()
     }
 }
 
@@ -589,9 +583,7 @@ where
         .unwrap()
 }
 
-fn calculate_median<T: ArrowNumericType>(
-    mut values: Vec<T::Native>,
-) -> Option<T::Native> {
+fn calculate_median<T: ArrowNumericType>(values: &mut [T::Native]) -> Option<T::Native> {
     let cmp = |x: &T::Native, y: &T::Native| x.compare(*y);
 
     let len = values.len();
@@ -601,9 +593,25 @@ fn calculate_median<T: ArrowNumericType>(
         let (low, high, _) = values.select_nth_unstable_by(len / 2, cmp);
         // Get the maximum of the low (left side after bi-partitioning)
         let left_max = slice_max::<T>(low);
-        let median = left_max
-            .add_wrapping(*high)
-            .div_wrapping(T::Native::usize_as(2));
+        // Calculate median as the average of the two middle values.
+        // Use checked arithmetic to detect overflow and fall back to safe formula.
+        let two = T::Native::usize_as(2);
+        let median = match left_max.add_checked(*high) {
+            Ok(sum) => sum.div_wrapping(two),
+            Err(_) => {
+                // Overflow detected - use safe midpoint formula:
+                // a/2 + b/2 + ((a%2 + b%2) / 2)
+                // This avoids overflow by dividing before adding.
+                let half_left = left_max.div_wrapping(two);
+                let half_right = (*high).div_wrapping(two);
+                let rem_left = left_max.mod_wrapping(two);
+                let rem_right = (*high).mod_wrapping(two);
+                // The sum of remainders (0, 1, or 2 for unsigned; -2 to 2 for signed)
+                // divided by 2 gives the correction factor (0 or 1 for unsigned; -1, 0, or 1 for signed)
+                let correction = rem_left.add_wrapping(rem_right).div_wrapping(two);
+                half_left.add_wrapping(half_right).add_wrapping(correction)
+            }
+        };
         Some(median)
     } else {
         let (_, median, _) = values.select_nth_unstable_by(len / 2, cmp);
diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs
index bb46aa5404617..0eebad9e3a5c3 100644
--- a/datafusion/functions-aggregate/src/min_max.rs
+++ b/datafusion/functions-aggregate/src/min_max.rs
@@ -21,29 +21,15 @@
 mod min_max_bytes;
 mod min_max_struct;
 
-use arrow::array::{
-    ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
-    Date64Array, Decimal128Array, Decimal256Array, DurationMicrosecondArray,
-    DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, Float16Array,
-    Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray,
-    LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
-    Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
-    Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray,
-    TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array,
-    UInt64Array, UInt8Array,
-};
-use arrow::compute;
+use arrow::array::ArrayRef;
 use arrow::datatypes::{
-    DataType, Decimal128Type, Decimal256Type, DurationMicrosecondType,
-    DurationMillisecondType, DurationNanosecondType, DurationSecondType, Float16Type,
-    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit,
-    UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type,
+    DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
+    DurationSecondType, Float16Type, Float32Type, Float64Type, Int8Type, Int16Type,
+    Int32Type, Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type,
 };
 use datafusion_common::stats::Precision;
-use datafusion_common::{
-    downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError, Result,
-};
+use datafusion_common::{ColumnStatistics, Result, exec_err, internal_err};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
 use datafusion_physical_expr::expressions;
 use std::cmp::Ordering;
@@ -60,8 +46,8 @@ use crate::min_max::min_max_bytes::MinMaxBytesAccumulator;
 use crate::min_max::min_max_struct::MinMaxStructAccumulator;
 use datafusion_common::ScalarValue;
 use datafusion_expr::{
-    function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Documentation,
-    SetMonotonicity, Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Documentation, SetMonotonicity, Signature, Volatility,
+    function::AccumulatorArgs,
 };
 use datafusion_expr::{GroupsAccumulator, StatisticsArgs};
 use datafusion_macros::user_doc;
@@ -105,7 +91,7 @@ fn get_min_max_result_type(input_types: &[DataType]) -> Result<Vec<DataType>> {
     standard_argument(name = "expression",)
 )]
 // MAX aggregate UDF
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Max {
     signature: Signature,
 }
@@ -207,10 +193,10 @@ impl FromColumnStatistics for Max {
         &self,
         col_stats: &ColumnStatistics,
     ) -> Option<ScalarValue> {
-        if let Precision::Exact(ref val) = col_stats.max_value {
-            if !val.is_null() {
-                return Some(val.clone());
-            }
+        if let Precision::Exact(ref val) = col_stats.max_value
+            && !val.is_null()
+        {
+            return Some(val.clone());
         }
         None
     }
@@ -239,10 +225,6 @@ impl AggregateUDFImpl for Max {
         )?))
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         use DataType::*;
         matches!(
@@ -257,6 +239,8 @@ impl AggregateUDFImpl for Max {
                 | Float16
                 | Float32
                 | Float64
+                | Decimal32(_, _)
+                | Decimal64(_, _)
                 | Decimal128(_, _)
                 | Decimal256(_, _)
                 | Date32
@@ -338,6 +322,12 @@ impl AggregateUDFImpl for Max {
             Duration(Nanosecond) => {
                 primitive_max_accumulator!(data_type, i64, DurationNanosecondType)
             }
+            Decimal32(_, _) => {
+                primitive_max_accumulator!(data_type, i32, Decimal32Type)
+            }
+            Decimal64(_, _) => {
+                primitive_max_accumulator!(data_type, i64, Decimal64Type)
+            }
             Decimal128(_, _) => {
                 primitive_max_accumulator!(data_type, i128, Decimal128Type)
             }
@@ -393,695 +383,6 @@ impl AggregateUDFImpl for Max {
     }
 }
 
-// Statically-typed version of min/max(array) -> ScalarValue for string types
-macro_rules! typed_min_max_batch_string {
-    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
-        let array = downcast_value!($VALUES, $ARRAYTYPE);
-        let value = compute::$OP(array);
-        let value = value.and_then(|e| Some(e.to_string()));
-        ScalarValue::$SCALAR(value)
-    }};
-}
-// Statically-typed version of min/max(array) -> ScalarValue for binary types.
-macro_rules! typed_min_max_batch_binary {
-    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
-        let array = downcast_value!($VALUES, $ARRAYTYPE);
-        let value = compute::$OP(array);
-        let value = value.and_then(|e| Some(e.to_vec()));
-        ScalarValue::$SCALAR(value)
-    }};
-}
-
-// Statically-typed version of min/max(array) -> ScalarValue for non-string types.
-macro_rules! typed_min_max_batch {
-    ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident $(, $EXTRA_ARGS:ident)*) => {{
-        let array = downcast_value!($VALUES, $ARRAYTYPE);
-        let value = compute::$OP(array);
-        ScalarValue::$SCALAR(value, $($EXTRA_ARGS.clone()),*)
-    }};
-}
-
-// Statically-typed version of min/max(array) -> ScalarValue  for non-string types.
-// this is a macro to support both operations (min and max).
-macro_rules! min_max_batch {
-    ($VALUES:expr, $OP:ident) => {{
-        match $VALUES.data_type() {
-            DataType::Null => ScalarValue::Null,
-            DataType::Decimal128(precision, scale) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    Decimal128Array,
-                    Decimal128,
-                    $OP,
-                    precision,
-                    scale
-                )
-            }
-            DataType::Decimal256(precision, scale) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    Decimal256Array,
-                    Decimal256,
-                    $OP,
-                    precision,
-                    scale
-                )
-            }
-            // all types that have a natural order
-            DataType::Float64 => {
-                typed_min_max_batch!($VALUES, Float64Array, Float64, $OP)
-            }
-            DataType::Float32 => {
-                typed_min_max_batch!($VALUES, Float32Array, Float32, $OP)
-            }
-            DataType::Float16 => {
-                typed_min_max_batch!($VALUES, Float16Array, Float16, $OP)
-            }
-            DataType::Int64 => typed_min_max_batch!($VALUES, Int64Array, Int64, $OP),
-            DataType::Int32 => typed_min_max_batch!($VALUES, Int32Array, Int32, $OP),
-            DataType::Int16 => typed_min_max_batch!($VALUES, Int16Array, Int16, $OP),
-            DataType::Int8 => typed_min_max_batch!($VALUES, Int8Array, Int8, $OP),
-            DataType::UInt64 => typed_min_max_batch!($VALUES, UInt64Array, UInt64, $OP),
-            DataType::UInt32 => typed_min_max_batch!($VALUES, UInt32Array, UInt32, $OP),
-            DataType::UInt16 => typed_min_max_batch!($VALUES, UInt16Array, UInt16, $OP),
-            DataType::UInt8 => typed_min_max_batch!($VALUES, UInt8Array, UInt8, $OP),
-            DataType::Timestamp(TimeUnit::Second, tz_opt) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    TimestampSecondArray,
-                    TimestampSecond,
-                    $OP,
-                    tz_opt
-                )
-            }
-            DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => typed_min_max_batch!(
-                $VALUES,
-                TimestampMillisecondArray,
-                TimestampMillisecond,
-                $OP,
-                tz_opt
-            ),
-            DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => typed_min_max_batch!(
-                $VALUES,
-                TimestampMicrosecondArray,
-                TimestampMicrosecond,
-                $OP,
-                tz_opt
-            ),
-            DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => typed_min_max_batch!(
-                $VALUES,
-                TimestampNanosecondArray,
-                TimestampNanosecond,
-                $OP,
-                tz_opt
-            ),
-            DataType::Date32 => typed_min_max_batch!($VALUES, Date32Array, Date32, $OP),
-            DataType::Date64 => typed_min_max_batch!($VALUES, Date64Array, Date64, $OP),
-            DataType::Time32(TimeUnit::Second) => {
-                typed_min_max_batch!($VALUES, Time32SecondArray, Time32Second, $OP)
-            }
-            DataType::Time32(TimeUnit::Millisecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    Time32MillisecondArray,
-                    Time32Millisecond,
-                    $OP
-                )
-            }
-            DataType::Time64(TimeUnit::Microsecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    Time64MicrosecondArray,
-                    Time64Microsecond,
-                    $OP
-                )
-            }
-            DataType::Time64(TimeUnit::Nanosecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    Time64NanosecondArray,
-                    Time64Nanosecond,
-                    $OP
-                )
-            }
-            DataType::Interval(IntervalUnit::YearMonth) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    IntervalYearMonthArray,
-                    IntervalYearMonth,
-                    $OP
-                )
-            }
-            DataType::Interval(IntervalUnit::DayTime) => {
-                typed_min_max_batch!($VALUES, IntervalDayTimeArray, IntervalDayTime, $OP)
-            }
-            DataType::Interval(IntervalUnit::MonthDayNano) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    IntervalMonthDayNanoArray,
-                    IntervalMonthDayNano,
-                    $OP
-                )
-            }
-            DataType::Duration(TimeUnit::Second) => {
-                typed_min_max_batch!($VALUES, DurationSecondArray, DurationSecond, $OP)
-            }
-            DataType::Duration(TimeUnit::Millisecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    DurationMillisecondArray,
-                    DurationMillisecond,
-                    $OP
-                )
-            }
-            DataType::Duration(TimeUnit::Microsecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    DurationMicrosecondArray,
-                    DurationMicrosecond,
-                    $OP
-                )
-            }
-            DataType::Duration(TimeUnit::Nanosecond) => {
-                typed_min_max_batch!(
-                    $VALUES,
-                    DurationNanosecondArray,
-                    DurationNanosecond,
-                    $OP
-                )
-            }
-            other => {
-                // This should have been handled before
-                return internal_err!(
-                    "Min/Max accumulator not implemented for type {:?}",
-                    other
-                );
-            }
-        }
-    }};
-}
-
-/// dynamically-typed min(array) -> ScalarValue
-fn min_batch(values: &ArrayRef) -> Result<ScalarValue> {
-    Ok(match values.data_type() {
-        DataType::Utf8 => {
-            typed_min_max_batch_string!(values, StringArray, Utf8, min_string)
-        }
-        DataType::LargeUtf8 => {
-            typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, min_string)
-        }
-        DataType::Utf8View => {
-            typed_min_max_batch_string!(
-                values,
-                StringViewArray,
-                Utf8View,
-                min_string_view
-            )
-        }
-        DataType::Boolean => {
-            typed_min_max_batch!(values, BooleanArray, Boolean, min_boolean)
-        }
-        DataType::Binary => {
-            typed_min_max_batch_binary!(&values, BinaryArray, Binary, min_binary)
-        }
-        DataType::LargeBinary => {
-            typed_min_max_batch_binary!(
-                &values,
-                LargeBinaryArray,
-                LargeBinary,
-                min_binary
-            )
-        }
-        DataType::BinaryView => {
-            typed_min_max_batch_binary!(
-                &values,
-                BinaryViewArray,
-                BinaryView,
-                min_binary_view
-            )
-        }
-        DataType::Struct(_) => min_max_batch_generic(values, Ordering::Greater)?,
-        DataType::List(_) => min_max_batch_generic(values, Ordering::Greater)?,
-        DataType::LargeList(_) => min_max_batch_generic(values, Ordering::Greater)?,
-        DataType::FixedSizeList(_, _) => {
-            min_max_batch_generic(values, Ordering::Greater)?
-        }
-        DataType::Dictionary(_, _) => {
-            let values = values.as_any_dictionary().values();
-            min_batch(values)?
-        }
-        _ => min_max_batch!(values, min),
-    })
-}
-
-fn min_max_batch_generic(array: &ArrayRef, ordering: Ordering) -> Result<ScalarValue> {
-    if array.len() == array.null_count() {
-        return ScalarValue::try_from(array.data_type());
-    }
-    let mut extreme = ScalarValue::try_from_array(array, 0)?;
-    for i in 1..array.len() {
-        let current = ScalarValue::try_from_array(array, i)?;
-        if current.is_null() {
-            continue;
-        }
-        if extreme.is_null() {
-            extreme = current;
-            continue;
-        }
-        if let Some(cmp) = extreme.partial_cmp(&current) {
-            if cmp == ordering {
-                extreme = current;
-            }
-        }
-    }
-
-    Ok(extreme)
-}
-
-macro_rules! min_max_generic {
-    ($VALUE:expr, $DELTA:expr, $OP:ident) => {{
-        if $VALUE.is_null() {
-            let mut delta_copy = $DELTA.clone();
-            // When the new value won we want to compact it to
-            // avoid storing the entire input
-            delta_copy.compact();
-            delta_copy
-        } else if $DELTA.is_null() {
-            $VALUE.clone()
-        } else {
-            match $VALUE.partial_cmp(&$DELTA) {
-                Some(choose_min_max!($OP)) => {
-                    // When the new value won we want to compact it to
-                    // avoid storing the entire input
-                    let mut delta_copy = $DELTA.clone();
-                    delta_copy.compact();
-                    delta_copy
-                }
-                _ => $VALUE.clone(),
-            }
-        }
-    }};
-}
-
-/// dynamically-typed max(array) -> ScalarValue
-pub fn max_batch(values: &ArrayRef) -> Result<ScalarValue> {
-    Ok(match values.data_type() {
-        DataType::Utf8 => {
-            typed_min_max_batch_string!(values, StringArray, Utf8, max_string)
-        }
-        DataType::LargeUtf8 => {
-            typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, max_string)
-        }
-        DataType::Utf8View => {
-            typed_min_max_batch_string!(
-                values,
-                StringViewArray,
-                Utf8View,
-                max_string_view
-            )
-        }
-        DataType::Boolean => {
-            typed_min_max_batch!(values, BooleanArray, Boolean, max_boolean)
-        }
-        DataType::Binary => {
-            typed_min_max_batch_binary!(&values, BinaryArray, Binary, max_binary)
-        }
-        DataType::BinaryView => {
-            typed_min_max_batch_binary!(
-                &values,
-                BinaryViewArray,
-                BinaryView,
-                max_binary_view
-            )
-        }
-        DataType::LargeBinary => {
-            typed_min_max_batch_binary!(
-                &values,
-                LargeBinaryArray,
-                LargeBinary,
-                max_binary
-            )
-        }
-        DataType::Struct(_) => min_max_batch_generic(values, Ordering::Less)?,
-        DataType::List(_) => min_max_batch_generic(values, Ordering::Less)?,
-        DataType::LargeList(_) => min_max_batch_generic(values, Ordering::Less)?,
-        DataType::FixedSizeList(_, _) => min_max_batch_generic(values, Ordering::Less)?,
-        DataType::Dictionary(_, _) => {
-            let values = values.as_any_dictionary().values();
-            max_batch(values)?
-        }
-        _ => min_max_batch!(values, max),
-    })
-}
-
-// min/max of two non-string scalar values.
-macro_rules! typed_min_max {
-    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident $(, $EXTRA_ARGS:ident)*) => {{
-        ScalarValue::$SCALAR(
-            match ($VALUE, $DELTA) {
-                (None, None) => None,
-                (Some(a), None) => Some(*a),
-                (None, Some(b)) => Some(*b),
-                (Some(a), Some(b)) => Some((*a).$OP(*b)),
-            },
-            $($EXTRA_ARGS.clone()),*
-        )
-    }};
-}
-macro_rules! typed_min_max_float {
-    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{
-        ScalarValue::$SCALAR(match ($VALUE, $DELTA) {
-            (None, None) => None,
-            (Some(a), None) => Some(*a),
-            (None, Some(b)) => Some(*b),
-            (Some(a), Some(b)) => match a.total_cmp(b) {
-                choose_min_max!($OP) => Some(*b),
-                _ => Some(*a),
-            },
-        })
-    }};
-}
-
-// min/max of two scalar string values.
-macro_rules! typed_min_max_string {
-    ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{
-        ScalarValue::$SCALAR(match ($VALUE, $DELTA) {
-            (None, None) => None,
-            (Some(a), None) => Some(a.clone()),
-            (None, Some(b)) => Some(b.clone()),
-            (Some(a), Some(b)) => Some((a).$OP(b).clone()),
-        })
-    }};
-}
-
-macro_rules! choose_min_max {
-    (min) => {
-        std::cmp::Ordering::Greater
-    };
-    (max) => {
-        std::cmp::Ordering::Less
-    };
-}
-
-macro_rules! interval_min_max {
-    ($OP:tt, $LHS:expr, $RHS:expr) => {{
-        match $LHS.partial_cmp(&$RHS) {
-            Some(choose_min_max!($OP)) => $RHS.clone(),
-            Some(_) => $LHS.clone(),
-            None => {
-                return internal_err!("Comparison error while computing interval min/max")
-            }
-        }
-    }};
-}
-
-// min/max of two scalar values of the same type
-macro_rules! min_max {
-    ($VALUE:expr, $DELTA:expr, $OP:ident) => {{
-        Ok(match ($VALUE, $DELTA) {
-            (ScalarValue::Null, ScalarValue::Null) => ScalarValue::Null,
-            (
-                lhs @ ScalarValue::Decimal128(lhsv, lhsp, lhss),
-                rhs @ ScalarValue::Decimal128(rhsv, rhsp, rhss)
-            ) => {
-                if lhsp.eq(rhsp) && lhss.eq(rhss) {
-                    typed_min_max!(lhsv, rhsv, Decimal128, $OP, lhsp, lhss)
-                } else {
-                    return internal_err!(
-                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
-                    (lhs, rhs)
-                );
-                }
-            }
-            (
-                lhs @ ScalarValue::Decimal256(lhsv, lhsp, lhss),
-                rhs @ ScalarValue::Decimal256(rhsv, rhsp, rhss)
-            ) => {
-                if lhsp.eq(rhsp) && lhss.eq(rhss) {
-                    typed_min_max!(lhsv, rhsv, Decimal256, $OP, lhsp, lhss)
-                } else {
-                    return internal_err!(
-                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
-                    (lhs, rhs)
-                );
-                }
-            }
-            (ScalarValue::Boolean(lhs), ScalarValue::Boolean(rhs)) => {
-                typed_min_max!(lhs, rhs, Boolean, $OP)
-            }
-            (ScalarValue::Float64(lhs), ScalarValue::Float64(rhs)) => {
-                typed_min_max_float!(lhs, rhs, Float64, $OP)
-            }
-            (ScalarValue::Float32(lhs), ScalarValue::Float32(rhs)) => {
-                typed_min_max_float!(lhs, rhs, Float32, $OP)
-            }
-            (ScalarValue::Float16(lhs), ScalarValue::Float16(rhs)) => {
-                typed_min_max_float!(lhs, rhs, Float16, $OP)
-            }
-            (ScalarValue::UInt64(lhs), ScalarValue::UInt64(rhs)) => {
-                typed_min_max!(lhs, rhs, UInt64, $OP)
-            }
-            (ScalarValue::UInt32(lhs), ScalarValue::UInt32(rhs)) => {
-                typed_min_max!(lhs, rhs, UInt32, $OP)
-            }
-            (ScalarValue::UInt16(lhs), ScalarValue::UInt16(rhs)) => {
-                typed_min_max!(lhs, rhs, UInt16, $OP)
-            }
-            (ScalarValue::UInt8(lhs), ScalarValue::UInt8(rhs)) => {
-                typed_min_max!(lhs, rhs, UInt8, $OP)
-            }
-            (ScalarValue::Int64(lhs), ScalarValue::Int64(rhs)) => {
-                typed_min_max!(lhs, rhs, Int64, $OP)
-            }
-            (ScalarValue::Int32(lhs), ScalarValue::Int32(rhs)) => {
-                typed_min_max!(lhs, rhs, Int32, $OP)
-            }
-            (ScalarValue::Int16(lhs), ScalarValue::Int16(rhs)) => {
-                typed_min_max!(lhs, rhs, Int16, $OP)
-            }
-            (ScalarValue::Int8(lhs), ScalarValue::Int8(rhs)) => {
-                typed_min_max!(lhs, rhs, Int8, $OP)
-            }
-            (ScalarValue::Utf8(lhs), ScalarValue::Utf8(rhs)) => {
-                typed_min_max_string!(lhs, rhs, Utf8, $OP)
-            }
-            (ScalarValue::LargeUtf8(lhs), ScalarValue::LargeUtf8(rhs)) => {
-                typed_min_max_string!(lhs, rhs, LargeUtf8, $OP)
-            }
-            (ScalarValue::Utf8View(lhs), ScalarValue::Utf8View(rhs)) => {
-                typed_min_max_string!(lhs, rhs, Utf8View, $OP)
-            }
-            (ScalarValue::Binary(lhs), ScalarValue::Binary(rhs)) => {
-                typed_min_max_string!(lhs, rhs, Binary, $OP)
-            }
-            (ScalarValue::LargeBinary(lhs), ScalarValue::LargeBinary(rhs)) => {
-                typed_min_max_string!(lhs, rhs, LargeBinary, $OP)
-            }
-            (ScalarValue::BinaryView(lhs), ScalarValue::BinaryView(rhs)) => {
-                typed_min_max_string!(lhs, rhs, BinaryView, $OP)
-            }
-            (ScalarValue::TimestampSecond(lhs, l_tz), ScalarValue::TimestampSecond(rhs, _)) => {
-                typed_min_max!(lhs, rhs, TimestampSecond, $OP, l_tz)
-            }
-            (
-                ScalarValue::TimestampMillisecond(lhs, l_tz),
-                ScalarValue::TimestampMillisecond(rhs, _),
-            ) => {
-                typed_min_max!(lhs, rhs, TimestampMillisecond, $OP, l_tz)
-            }
-            (
-                ScalarValue::TimestampMicrosecond(lhs, l_tz),
-                ScalarValue::TimestampMicrosecond(rhs, _),
-            ) => {
-                typed_min_max!(lhs, rhs, TimestampMicrosecond, $OP, l_tz)
-            }
-            (
-                ScalarValue::TimestampNanosecond(lhs, l_tz),
-                ScalarValue::TimestampNanosecond(rhs, _),
-            ) => {
-                typed_min_max!(lhs, rhs, TimestampNanosecond, $OP, l_tz)
-            }
-            (
-                ScalarValue::Date32(lhs),
-                ScalarValue::Date32(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Date32, $OP)
-            }
-            (
-                ScalarValue::Date64(lhs),
-                ScalarValue::Date64(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Date64, $OP)
-            }
-            (
-                ScalarValue::Time32Second(lhs),
-                ScalarValue::Time32Second(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Time32Second, $OP)
-            }
-            (
-                ScalarValue::Time32Millisecond(lhs),
-                ScalarValue::Time32Millisecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Time32Millisecond, $OP)
-            }
-            (
-                ScalarValue::Time64Microsecond(lhs),
-                ScalarValue::Time64Microsecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Time64Microsecond, $OP)
-            }
-            (
-                ScalarValue::Time64Nanosecond(lhs),
-                ScalarValue::Time64Nanosecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, Time64Nanosecond, $OP)
-            }
-            (
-                ScalarValue::IntervalYearMonth(lhs),
-                ScalarValue::IntervalYearMonth(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, IntervalYearMonth, $OP)
-            }
-            (
-                ScalarValue::IntervalMonthDayNano(lhs),
-                ScalarValue::IntervalMonthDayNano(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, IntervalMonthDayNano, $OP)
-            }
-            (
-                ScalarValue::IntervalDayTime(lhs),
-                ScalarValue::IntervalDayTime(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, IntervalDayTime, $OP)
-            }
-            (
-                ScalarValue::IntervalYearMonth(_),
-                ScalarValue::IntervalMonthDayNano(_),
-            ) | (
-                ScalarValue::IntervalYearMonth(_),
-                ScalarValue::IntervalDayTime(_),
-            ) | (
-                ScalarValue::IntervalMonthDayNano(_),
-                ScalarValue::IntervalDayTime(_),
-            ) | (
-                ScalarValue::IntervalMonthDayNano(_),
-                ScalarValue::IntervalYearMonth(_),
-            ) | (
-                ScalarValue::IntervalDayTime(_),
-                ScalarValue::IntervalYearMonth(_),
-            ) | (
-                ScalarValue::IntervalDayTime(_),
-                ScalarValue::IntervalMonthDayNano(_),
-            ) => {
-                interval_min_max!($OP, $VALUE, $DELTA)
-            }
-                    (
-                ScalarValue::DurationSecond(lhs),
-                ScalarValue::DurationSecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, DurationSecond, $OP)
-            }
-                                (
-                ScalarValue::DurationMillisecond(lhs),
-                ScalarValue::DurationMillisecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, DurationMillisecond, $OP)
-            }
-                                (
-                ScalarValue::DurationMicrosecond(lhs),
-                ScalarValue::DurationMicrosecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, DurationMicrosecond, $OP)
-            }
-                                        (
-                ScalarValue::DurationNanosecond(lhs),
-                ScalarValue::DurationNanosecond(rhs),
-            ) => {
-                typed_min_max!(lhs, rhs, DurationNanosecond, $OP)
-            }
-
-            (
-                lhs @ ScalarValue::Struct(_),
-                rhs @ ScalarValue::Struct(_),
-            ) => {
-                min_max_generic!(lhs, rhs, $OP)
-            }
-
-            (
-                lhs @ ScalarValue::List(_),
-                rhs @ ScalarValue::List(_),
-            ) => {
-                min_max_generic!(lhs, rhs, $OP)
-            }
-
-
-            (
-                lhs @ ScalarValue::LargeList(_),
-                rhs @ ScalarValue::LargeList(_),
-            ) => {
-                min_max_generic!(lhs, rhs, $OP)
-            }
-
-
-            (
-                lhs @ ScalarValue::FixedSizeList(_),
-                rhs @ ScalarValue::FixedSizeList(_),
-            ) => {
-                min_max_generic!(lhs, rhs, $OP)
-            }
-
-            e => {
-                return internal_err!(
-                    "MIN/MAX is not expected to receive scalars of incompatible types {:?}",
-                    e
-                )
-            }
-        })
-    }};
-}
-
-/// An accumulator to compute the maximum value
-#[derive(Debug)]
-pub struct MaxAccumulator {
-    max: ScalarValue,
-}
-
-impl MaxAccumulator {
-    /// new max accumulator
-    pub fn try_new(datatype: &DataType) -> Result<Self> {
-        Ok(Self {
-            max: ScalarValue::try_from(datatype)?,
-        })
-    }
-}
-
-impl Accumulator for MaxAccumulator {
-    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values = &values[0];
-        let delta = &max_batch(values)?;
-        let new_max: Result<ScalarValue, DataFusionError> =
-            min_max!(&self.max, delta, max);
-        self.max = new_max?;
-        Ok(())
-    }
-
-    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        self.update_batch(states)
-    }
-
-    fn state(&mut self) -> Result<Vec<ScalarValue>> {
-        Ok(vec![self.evaluate()?])
-    }
-    fn evaluate(&mut self) -> Result<ScalarValue> {
-        Ok(self.max.clone())
-    }
-
-    fn size(&self) -> usize {
-        size_of_val(self) - size_of_val(&self.max) + self.max.size()
-    }
-}
-
 #[derive(Debug)]
 pub struct SlidingMaxAccumulator {
     max: ScalarValue,
@@ -1155,7 +456,7 @@ impl Accumulator for SlidingMaxAccumulator {
 ```"#,
     standard_argument(name = "expression",)
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Min {
     signature: Signature,
 }
@@ -1179,10 +480,10 @@ impl FromColumnStatistics for Min {
         &self,
         col_stats: &ColumnStatistics,
     ) -> Option<ScalarValue> {
-        if let Precision::Exact(ref val) = col_stats.min_value {
-            if !val.is_null() {
-                return Some(val.clone());
-            }
+        if let Precision::Exact(ref val) = col_stats.min_value
+            && !val.is_null()
+        {
+            return Some(val.clone());
         }
         None
     }
@@ -1211,10 +512,6 @@ impl AggregateUDFImpl for Min {
         )?))
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         use DataType::*;
         matches!(
@@ -1229,6 +526,8 @@ impl AggregateUDFImpl for Min {
                 | Float16
                 | Float32
                 | Float64
+                | Decimal32(_, _)
+                | Decimal64(_, _)
                 | Decimal128(_, _)
                 | Decimal256(_, _)
                 | Date32
@@ -1310,6 +609,12 @@ impl AggregateUDFImpl for Min {
             Duration(Nanosecond) => {
                 primitive_min_accumulator!(data_type, i64, DurationNanosecondType)
             }
+            Decimal32(_, _) => {
+                primitive_min_accumulator!(data_type, i32, Decimal32Type)
+            }
+            Decimal64(_, _) => {
+                primitive_min_accumulator!(data_type, i64, Decimal64Type)
+            }
             Decimal128(_, _) => {
                 primitive_min_accumulator!(data_type, i128, Decimal128Type)
             }
@@ -1366,48 +671,6 @@ impl AggregateUDFImpl for Min {
     }
 }
 
-/// An accumulator to compute the minimum value
-#[derive(Debug)]
-pub struct MinAccumulator {
-    min: ScalarValue,
-}
-
-impl MinAccumulator {
-    /// new min accumulator
-    pub fn try_new(datatype: &DataType) -> Result<Self> {
-        Ok(Self {
-            min: ScalarValue::try_from(datatype)?,
-        })
-    }
-}
-
-impl Accumulator for MinAccumulator {
-    fn state(&mut self) -> Result<Vec<ScalarValue>> {
-        Ok(vec![self.evaluate()?])
-    }
-
-    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values = &values[0];
-        let delta = &min_batch(values)?;
-        let new_min: Result<ScalarValue, DataFusionError> =
-            min_max!(&self.min, delta, min);
-        self.min = new_min?;
-        Ok(())
-    }
-
-    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        self.update_batch(states)
-    }
-
-    fn evaluate(&mut self) -> Result<ScalarValue> {
-        Ok(self.min.clone())
-    }
-
-    fn size(&self) -> usize {
-        size_of_val(self) - size_of_val(&self.min) + self.min.size()
-    }
-}
-
 #[derive(Debug)]
 pub struct SlidingMinAccumulator {
     min: ScalarValue,
@@ -1739,13 +1002,22 @@ make_udaf_expr_and_func!(
     min_udaf
 );
 
+// Re-export accumulators from the common module for backwards compatibility
+pub use datafusion_functions_aggregate_common::min_max::{
+    MaxAccumulator, MinAccumulator,
+};
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use arrow::{
-        array::DictionaryArray,
+        array::{
+            DictionaryArray, Float32Array, Int32Array, IntervalDayTimeArray,
+            IntervalMonthDayNanoArray, IntervalYearMonthArray, StringArray,
+        },
         datatypes::{
-            IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType,
+            IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
+            IntervalYearMonthType,
         },
     };
     use std::sync::Arc;
diff --git a/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs b/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
index 05321c2ff52d2..e4ac7eccf5692 100644
--- a/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
+++ b/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
@@ -20,7 +20,8 @@ use arrow::array::{
     LargeBinaryBuilder, LargeStringBuilder, StringBuilder, StringViewBuilder,
 };
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::hash_map::Entry;
+use datafusion_common::{HashMap, Result, internal_err};
 use datafusion_expr::{EmitTo, GroupsAccumulator};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::apply_filter_as_nulls;
 use std::mem::size_of;
@@ -391,14 +392,6 @@ struct MinMaxBytesState {
     total_data_bytes: usize,
 }
 
-#[derive(Debug, Clone, Copy)]
-enum MinMaxLocation<'a> {
-    /// the min/max value is stored in the existing `min_max` array
-    ExistingMinMax,
-    /// the min/max value is stored in the input array at the given index
-    Input(&'a [u8]),
-}
-
 /// Implement the MinMaxBytesAccumulator with a comparison function
 /// for comparing strings
 impl MinMaxBytesState {
@@ -450,7 +443,7 @@ impl MinMaxBytesState {
         // Minimize value copies by calculating the new min/maxes for each group
         // in this batch (either the existing min/max or the new input value)
         // and updating the owned values in `self.min_maxes` at most once
-        let mut locations = vec![MinMaxLocation::ExistingMinMax; total_num_groups];
+        let mut locations = HashMap::<usize, &[u8]>::with_capacity(group_indices.len());
 
         // Figure out the new min value for each group
         for (new_val, group_index) in iter.into_iter().zip(group_indices.iter()) {
@@ -459,32 +452,29 @@ impl MinMaxBytesState {
                 continue; // skip nulls
             };
 
-            let existing_val = match locations[group_index] {
-                // previous input value was the min/max, so compare it
-                MinMaxLocation::Input(existing_val) => existing_val,
-                MinMaxLocation::ExistingMinMax => {
-                    let Some(existing_val) = self.min_max[group_index].as_ref() else {
-                        // no existing min/max, so this is the new min/max
-                        locations[group_index] = MinMaxLocation::Input(new_val);
-                        continue;
-                    };
-                    existing_val.as_ref()
+            match locations.entry(group_index) {
+                Entry::Occupied(mut occupied_entry) => {
+                    if cmp(new_val, occupied_entry.get()) {
+                        occupied_entry.insert(new_val);
+                    }
+                }
+                Entry::Vacant(vacant_entry) => {
+                    if let Some(old_val) = self.min_max[group_index].as_ref() {
+                        if cmp(new_val, old_val) {
+                            vacant_entry.insert(new_val);
+                        }
+                    } else {
+                        vacant_entry.insert(new_val);
+                    }
                 }
             };
-
-            // Compare the new value to the existing value, replacing if necessary
-            if cmp(new_val, existing_val) {
-                locations[group_index] = MinMaxLocation::Input(new_val);
-            }
         }
 
         // Update self.min_max with any new min/max values we found in the input
-        for (group_index, location) in locations.iter().enumerate() {
-            match location {
-                MinMaxLocation::ExistingMinMax => {}
-                MinMaxLocation::Input(new_val) => self.set_value(group_index, new_val),
-            }
+        for (group_index, location) in locations.iter() {
+            self.set_value(*group_index, location);
         }
+
         Ok(())
     }
 
diff --git a/datafusion/functions-aggregate/src/min_max/min_max_struct.rs b/datafusion/functions-aggregate/src/min_max/min_max_struct.rs
index 8038f2f01d90c..796fd586ca5c8 100644
--- a/datafusion/functions-aggregate/src/min_max/min_max_struct.rs
+++ b/datafusion/functions-aggregate/src/min_max/min_max_struct.rs
@@ -24,9 +24,8 @@ use arrow::{
     datatypes::DataType,
 };
 use datafusion_common::{
-    internal_err,
+    Result, internal_err,
     scalar::{copy_array_data, partial_cmp_struct},
-    Result,
 };
 use datafusion_expr::{EmitTo, GroupsAccumulator};
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::apply_filter_as_nulls;
diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs
index 1525b2f991a1f..bc343a1969c09 100644
--- a/datafusion/functions-aggregate/src/nth_value.rs
+++ b/datafusion/functions-aggregate/src/nth_value.rs
@@ -23,16 +23,18 @@ use std::collections::VecDeque;
 use std::mem::{size_of, size_of_val};
 use std::sync::Arc;
 
-use arrow::array::{new_empty_array, ArrayRef, AsArray, StructArray};
+use arrow::array::{ArrayRef, AsArray, StructArray, new_empty_array};
 use arrow::datatypes::{DataType, Field, FieldRef, Fields};
 
-use datafusion_common::utils::{get_row_at_idx, SingleRowListArrayBuilder};
-use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::utils::{SingleRowListArrayBuilder, get_row_at_idx};
+use datafusion_common::{
+    Result, ScalarValue, assert_or_internal_err, exec_err, not_impl_err,
+};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
-    lit, Accumulator, AggregateUDFImpl, Documentation, ExprFunctionExt, ReversedUDAF,
-    Signature, SortExpr, Volatility,
+    Accumulator, AggregateUDFImpl, Documentation, ExprFunctionExt, ReversedUDAF,
+    Signature, SortExpr, Volatility, lit,
 };
 use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays;
 use datafusion_functions_aggregate_common::utils::ordering_fields;
@@ -86,10 +88,10 @@ pub fn nth_value(
         description = "The position (nth) of the value to retrieve, based on the ordering."
     )
 )]
-/// Expression for a `NTH_VALUE(... ORDER BY ..., ...)` aggregation. In a multi
+/// Expression for a `NTH_VALUE(..., ... ORDER BY ...)` aggregation. In a multi
 /// partition setting, partial aggregations are computed for every partition,
 /// and then their results are merged.
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NthValueAgg {
     signature: Signature,
 }
@@ -144,24 +146,25 @@ impl AggregateUDFImpl for NthValueAgg {
                     "{} not supported for n: {}",
                     self.name(),
                     &acc_args.exprs[1]
-                )
+                );
             }
         };
 
-        let ordering_dtypes = acc_args
-            .ordering_req
+        let Some(ordering) = LexOrdering::new(acc_args.order_bys.to_vec()) else {
+            return TrivialNthValueAccumulator::try_new(
+                n,
+                acc_args.return_field.data_type(),
+            )
+            .map(|acc| Box::new(acc) as _);
+        };
+        let ordering_dtypes = ordering
             .iter()
             .map(|e| e.expr.data_type(acc_args.schema))
             .collect::<Result<Vec<_>>>()?;
 
-        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
-        NthValueAccumulator::try_new(
-            n,
-            &data_type,
-            &ordering_dtypes,
-            acc_args.ordering_req.clone(),
-        )
-        .map(|acc| Box::new(acc) as _)
+        let data_type = acc_args.expr_fields[0].data_type();
+        NthValueAccumulator::try_new(n, data_type, &ordering_dtypes, ordering)
+            .map(|acc| Box::new(acc) as _)
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
@@ -182,10 +185,6 @@ impl AggregateUDFImpl for NthValueAgg {
         Ok(fields.into_iter().map(Arc::new).collect())
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn reverse_expr(&self) -> ReversedUDAF {
         ReversedUDAF::Reversed(nth_value_udaf())
     }
@@ -195,6 +194,127 @@ impl AggregateUDFImpl for NthValueAgg {
     }
 }
 
+#[derive(Debug)]
+pub struct TrivialNthValueAccumulator {
+    /// The `N` value.
+    n: i64,
+    /// Stores entries in the `NTH_VALUE` result.
+    values: VecDeque<ScalarValue>,
+    /// Data types of the value.
+    datatype: DataType,
+}
+
+impl TrivialNthValueAccumulator {
+    /// Create a new order-insensitive NTH_VALUE accumulator based on the given
+    /// item data type.
+    pub fn try_new(n: i64, datatype: &DataType) -> Result<Self> {
+        // n cannot be 0
+        assert_or_internal_err!(
+            n != 0,
+            "Nth value indices are 1 based. 0 is invalid index"
+        );
+        Ok(Self {
+            n,
+            values: VecDeque::new(),
+            datatype: datatype.clone(),
+        })
+    }
+
+    /// Updates state, with the `values`. Fetch contains missing number of entries for state to be complete
+    /// None represents all of the new `values` need to be added to the state.
+    fn append_new_data(
+        &mut self,
+        values: &[ArrayRef],
+        fetch: Option<usize>,
+    ) -> Result<()> {
+        let n_row = values[0].len();
+        let n_to_add = if let Some(fetch) = fetch {
+            std::cmp::min(fetch, n_row)
+        } else {
+            n_row
+        };
+        for index in 0..n_to_add {
+            let mut row = get_row_at_idx(values, index)?;
+            self.values.push_back(row.swap_remove(0));
+            // At index 1, we have n index argument, which is constant.
+        }
+        Ok(())
+    }
+}
+
+impl Accumulator for TrivialNthValueAccumulator {
+    /// Updates its state with the `values`. Assumes data in the `values` satisfies the required
+    /// ordering for the accumulator (across consecutive batches, not just batch-wise).
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        if !values.is_empty() {
+            let n_required = self.n.unsigned_abs() as usize;
+            let from_start = self.n > 0;
+            if from_start {
+                // direction is from start
+                let n_remaining = n_required.saturating_sub(self.values.len());
+                self.append_new_data(values, Some(n_remaining))?;
+            } else {
+                // direction is from end
+                self.append_new_data(values, None)?;
+                let start_offset = self.values.len().saturating_sub(n_required);
+                if start_offset > 0 {
+                    self.values.drain(0..start_offset);
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        if !states.is_empty() {
+            // First entry in the state is the aggregation result.
+            let n_required = self.n.unsigned_abs() as usize;
+            let array_agg_res = ScalarValue::convert_array_to_scalar_vec(&states[0])?;
+            for v in array_agg_res.into_iter().flatten() {
+                self.values.extend(v);
+                if self.values.len() > n_required {
+                    // There is enough data collected, can stop merging:
+                    break;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        let mut values_cloned = self.values.clone();
+        let values_slice = values_cloned.make_contiguous();
+        Ok(vec![ScalarValue::List(ScalarValue::new_list_nullable(
+            values_slice,
+            &self.datatype,
+        ))])
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let n_required = self.n.unsigned_abs() as usize;
+        let from_start = self.n > 0;
+        let nth_value_idx = if from_start {
+            // index is from start
+            let forward_idx = n_required - 1;
+            (forward_idx < self.values.len()).then_some(forward_idx)
+        } else {
+            // index is from end
+            self.values.len().checked_sub(n_required)
+        };
+        if let Some(idx) = nth_value_idx {
+            Ok(self.values[idx].clone())
+        } else {
+            ScalarValue::try_from(self.datatype.clone())
+        }
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + ScalarValue::size_of_vec_deque(&self.values)
+            - size_of_val(&self.values)
+            + size_of::<DataType>()
+    }
+}
+
 #[derive(Debug)]
 pub struct NthValueAccumulator {
     /// The `N` value.
@@ -222,10 +342,11 @@ impl NthValueAccumulator {
         ordering_dtypes: &[DataType],
         ordering_req: LexOrdering,
     ) -> Result<Self> {
-        if n == 0 {
-            // n cannot be 0
-            return internal_err!("Nth value indices are 1 based. 0 is invalid index");
-        }
+        // n cannot be 0
+        assert_or_internal_err!(
+            n != 0,
+            "Nth value indices are 1 based. 0 is invalid index"
+        );
         let mut datatypes = vec![datatype.clone()];
         datatypes.extend(ordering_dtypes.iter().cloned());
         Ok(Self {
@@ -236,6 +357,64 @@ impl NthValueAccumulator {
             ordering_req,
         })
     }
+
+    fn evaluate_orderings(&self) -> Result<ScalarValue> {
+        let fields = ordering_fields(&self.ordering_req, &self.datatypes[1..]);
+
+        let mut column_wise_ordering_values = vec![];
+        let num_columns = fields.len();
+        for i in 0..num_columns {
+            let column_values = self
+                .ordering_values
+                .iter()
+                .map(|x| x[i].clone())
+                .collect::<Vec<_>>();
+            let array = if column_values.is_empty() {
+                new_empty_array(fields[i].data_type())
+            } else {
+                ScalarValue::iter_to_array(column_values.into_iter())?
+            };
+            column_wise_ordering_values.push(array);
+        }
+
+        let struct_field = Fields::from(fields);
+        let ordering_array =
+            StructArray::try_new(struct_field, column_wise_ordering_values, None)?;
+
+        Ok(SingleRowListArrayBuilder::new(Arc::new(ordering_array)).build_list_scalar())
+    }
+
+    fn evaluate_values(&self) -> ScalarValue {
+        let mut values_cloned = self.values.clone();
+        let values_slice = values_cloned.make_contiguous();
+        ScalarValue::List(ScalarValue::new_list_nullable(
+            values_slice,
+            &self.datatypes[0],
+        ))
+    }
+
+    /// Updates state, with the `values`. Fetch contains missing number of entries for state to be complete
+    /// None represents all of the new `values` need to be added to the state.
+    fn append_new_data(
+        &mut self,
+        values: &[ArrayRef],
+        fetch: Option<usize>,
+    ) -> Result<()> {
+        let n_row = values[0].len();
+        let n_to_add = if let Some(fetch) = fetch {
+            std::cmp::min(fetch, n_row)
+        } else {
+            n_row
+        };
+        for index in 0..n_to_add {
+            let row = get_row_at_idx(values, index)?;
+            self.values.push_back(row[0].clone());
+            // At index 1, we have n index argument.
+            // Ordering values cover starting from 2nd index to end
+            self.ordering_values.push_back(row[2..].to_vec());
+        }
+        Ok(())
+    }
 }
 
 impl Accumulator for NthValueAccumulator {
@@ -269,91 +448,60 @@ impl Accumulator for NthValueAccumulator {
         if states.is_empty() {
             return Ok(());
         }
-        // First entry in the state is the aggregation result.
-        let array_agg_values = &states[0];
-        let n_required = self.n.unsigned_abs() as usize;
-        if self.ordering_req.is_empty() {
-            let array_agg_res =
-                ScalarValue::convert_array_to_scalar_vec(array_agg_values)?;
-            for v in array_agg_res.into_iter() {
-                self.values.extend(v);
-                if self.values.len() > n_required {
-                    // There is enough data collected can stop merging
-                    break;
-                }
-            }
-        } else if let Some(agg_orderings) = states[1].as_list_opt::<i32>() {
-            // 2nd entry stores values received for ordering requirement columns, for each aggregation value inside NTH_VALUE list.
-            // For each `StructArray` inside NTH_VALUE list, we will receive an `Array` that stores
-            // values received from its ordering requirement expression. (This information is necessary for during merging).
-
-            // Stores NTH_VALUE results coming from each partition
-            let mut partition_values: Vec<VecDeque<ScalarValue>> = vec![];
-            // Stores ordering requirement expression results coming from each partition
-            let mut partition_ordering_values: Vec<VecDeque<Vec<ScalarValue>>> = vec![];
-
-            // Existing values should be merged also.
-            partition_values.push(self.values.clone());
-
-            partition_ordering_values.push(self.ordering_values.clone());
-
-            let array_agg_res =
-                ScalarValue::convert_array_to_scalar_vec(array_agg_values)?;
-
-            for v in array_agg_res.into_iter() {
-                partition_values.push(v.into());
-            }
-
-            let orderings = ScalarValue::convert_array_to_scalar_vec(agg_orderings)?;
-
-            let ordering_values = orderings.into_iter().map(|partition_ordering_rows| {
-                // Extract value from struct to ordering_rows for each group/partition
-                partition_ordering_rows.into_iter().map(|ordering_row| {
-                    if let ScalarValue::Struct(s) = ordering_row {
-                        let mut ordering_columns_per_row = vec![];
-
-                        for column in s.columns() {
-                            let sv = ScalarValue::try_from_array(column, 0)?;
-                            ordering_columns_per_row.push(sv);
-                        }
-
-                        Ok(ordering_columns_per_row)
-                    } else {
-                        exec_err!(
-                            "Expects to receive ScalarValue::Struct(Some(..), _) but got: {:?}",
-                            ordering_row.data_type()
-                        )
-                    }
-                }).collect::<Result<Vec<_>>>()
-            }).collect::<Result<Vec<_>>>()?;
-            for ordering_values in ordering_values.into_iter() {
-                partition_ordering_values.push(ordering_values.into());
-            }
-
-            let sort_options = self
-                .ordering_req
-                .iter()
-                .map(|sort_expr| sort_expr.options)
-                .collect::<Vec<_>>();
-            let (new_values, new_orderings) = merge_ordered_arrays(
-                &mut partition_values,
-                &mut partition_ordering_values,
-                &sort_options,
-            )?;
-            self.values = new_values.into();
-            self.ordering_values = new_orderings.into();
-        } else {
+        // Second entry stores values received for ordering requirement columns
+        // for each aggregation value inside NTH_VALUE list. For each `StructArray`
+        // inside this list, we will receive an `Array` that stores values received
+        // from its ordering requirement expression. This information is necessary
+        // during merging.
+        let Some(agg_orderings) = states[1].as_list_opt::<i32>() else {
             return exec_err!("Expects to receive a list array");
+        };
+
+        // Stores NTH_VALUE results coming from each partition
+        let mut partition_values = vec![self.values.clone()];
+        // First entry in the state is the aggregation result.
+        let array_agg_res = ScalarValue::convert_array_to_scalar_vec(&states[0])?;
+        for v in array_agg_res.into_iter().flatten() {
+            partition_values.push(v.into());
         }
+        // Stores ordering requirement expression results coming from each partition:
+        let mut partition_ordering_values = vec![self.ordering_values.clone()];
+        let orderings = ScalarValue::convert_array_to_scalar_vec(agg_orderings)?;
+        // Extract value from struct to ordering_rows for each group/partition:
+        for partition_ordering_rows in orderings.into_iter().flatten() {
+            let ordering_values = partition_ordering_rows.into_iter().map(|ordering_row| {
+                let ScalarValue::Struct(s_array) = ordering_row else {
+                    return exec_err!(
+                        "Expects to receive ScalarValue::Struct(Some(..), _) but got: {:?}",
+                        ordering_row.data_type()
+                    );
+                };
+                s_array
+                    .columns()
+                    .iter()
+                    .map(|column| ScalarValue::try_from_array(column, 0))
+                    .collect()
+            }).collect::<Result<VecDeque<_>>>()?;
+            partition_ordering_values.push(ordering_values);
+        }
+
+        let sort_options = self
+            .ordering_req
+            .iter()
+            .map(|sort_expr| sort_expr.options)
+            .collect::<Vec<_>>();
+        let (new_values, new_orderings) = merge_ordered_arrays(
+            &mut partition_values,
+            &mut partition_ordering_values,
+            &sort_options,
+        )?;
+        self.values = new_values.into();
+        self.ordering_values = new_orderings.into();
         Ok(())
     }
 
     fn state(&mut self) -> Result<Vec<ScalarValue>> {
-        let mut result = vec![self.evaluate_values()];
-        if !self.ordering_req.is_empty() {
-            result.push(self.evaluate_orderings()?);
-        }
-        Ok(result)
+        Ok(vec![self.evaluate_values(), self.evaluate_orderings()?])
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
@@ -396,63 +544,3 @@ impl Accumulator for NthValueAccumulator {
         total
     }
 }
-
-impl NthValueAccumulator {
-    fn evaluate_orderings(&self) -> Result<ScalarValue> {
-        let fields = ordering_fields(self.ordering_req.as_ref(), &self.datatypes[1..]);
-
-        let mut column_wise_ordering_values = vec![];
-        let num_columns = fields.len();
-        for i in 0..num_columns {
-            let column_values = self
-                .ordering_values
-                .iter()
-                .map(|x| x[i].clone())
-                .collect::<Vec<_>>();
-            let array = if column_values.is_empty() {
-                new_empty_array(fields[i].data_type())
-            } else {
-                ScalarValue::iter_to_array(column_values.into_iter())?
-            };
-            column_wise_ordering_values.push(array);
-        }
-
-        let struct_field = Fields::from(fields);
-        let ordering_array =
-            StructArray::try_new(struct_field, column_wise_ordering_values, None)?;
-
-        Ok(SingleRowListArrayBuilder::new(Arc::new(ordering_array)).build_list_scalar())
-    }
-
-    fn evaluate_values(&self) -> ScalarValue {
-        let mut values_cloned = self.values.clone();
-        let values_slice = values_cloned.make_contiguous();
-        ScalarValue::List(ScalarValue::new_list_nullable(
-            values_slice,
-            &self.datatypes[0],
-        ))
-    }
-
-    /// Updates state, with the `values`. Fetch contains missing number of entries for state to be complete
-    /// None represents all of the new `values` need to be added to the state.
-    fn append_new_data(
-        &mut self,
-        values: &[ArrayRef],
-        fetch: Option<usize>,
-    ) -> Result<()> {
-        let n_row = values[0].len();
-        let n_to_add = if let Some(fetch) = fetch {
-            std::cmp::min(fetch, n_row)
-        } else {
-            n_row
-        };
-        for index in 0..n_to_add {
-            let row = get_row_at_idx(values, index)?;
-            self.values.push_back(row[0].clone());
-            // At index 1, we have n index argument.
-            // Ordering values cover starting from 2nd index to end
-            self.ordering_values.push_back(row[2..].to_vec());
-        }
-        Ok(())
-    }
-}
diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs
new file mode 100644
index 0000000000000..1aa150b56350b
--- /dev/null
+++ b/datafusion/functions-aggregate/src/percentile_cont.rs
@@ -0,0 +1,838 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::mem::{size_of, size_of_val};
+use std::sync::Arc;
+
+use arrow::array::{
+    ArrowNumericType, BooleanArray, ListArray, PrimitiveArray, PrimitiveBuilder,
+};
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow::{
+    array::{Array, ArrayRef, AsArray},
+    datatypes::{DataType, Field, FieldRef, Float16Type, Float32Type, Float64Type},
+};
+
+use num_traits::AsPrimitive;
+
+use arrow::array::ArrowNativeTypeOp;
+use datafusion_common::internal_err;
+use datafusion_common::types::{NativeType, logical_float64};
+use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
+
+use crate::min_max::{max_udaf, min_udaf};
+use datafusion_common::{
+    Result, ScalarValue, internal_datafusion_err, utils::take_function_args,
+};
+use datafusion_expr::utils::format_state_name;
+use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature,
+    TypeSignatureClass, Volatility,
+};
+use datafusion_expr::{EmitTo, GroupsAccumulator};
+use datafusion_expr::{
+    expr::{AggregateFunction, Sort},
+    function::{AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs},
+    simplify::SimplifyContext,
+};
+use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::accumulate;
+use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls::filtered_null_mask;
+use datafusion_functions_aggregate_common::utils::{GenericDistinctBuffer, Hashable};
+use datafusion_macros::user_doc;
+
+use crate::utils::validate_percentile_expr;
+
+/// Precision multiplier for linear interpolation calculations.
+///
+/// This value of 1,000,000 was chosen to balance precision with overflow safety:
+/// - Provides 6 decimal places of precision for the fractional component
+/// - Small enough to avoid overflow when multiplied with typical numeric values
+/// - Sufficient precision for most statistical applications
+///
+/// The interpolation formula: `lower + (upper - lower) * fraction`
+/// is computed as: `lower + ((upper - lower) * (fraction * PRECISION)) / PRECISION`
+/// to avoid floating-point operations on integer types while maintaining precision.
+///
+/// The interpolation arithmetic is performed in f64 and then cast back to the
+/// native type to avoid overflowing Float16 intermediates.
+const INTERPOLATION_PRECISION: f64 = 1_000_000.0;
+
+create_func!(PercentileCont, percentile_cont_udaf);
+
+/// Computes the exact percentile continuous of a set of numbers
+pub fn percentile_cont(order_by: Sort, percentile: Expr) -> Expr {
+    let expr = order_by.expr.clone();
+    let args = vec![expr, percentile];
+
+    Expr::AggregateFunction(AggregateFunction::new_udf(
+        percentile_cont_udaf(),
+        args,
+        false,
+        None,
+        vec![order_by],
+        None,
+    ))
+}
+
+#[user_doc(
+    doc_section(label = "General Functions"),
+    description = "Returns the exact percentile of input values, interpolating between values if needed.",
+    syntax_example = "percentile_cont(percentile) WITHIN GROUP (ORDER BY expression)",
+    sql_example = r#"```sql
+> SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++----------------------------------------------------------+
+| percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) |
++----------------------------------------------------------+
+| 45.5                                                     |
++----------------------------------------------------------+
+```
+
+An alternate syntax is also supported:
+```sql
+> SELECT percentile_cont(column_name, 0.75) FROM table_name;
++---------------------------------------+
+| percentile_cont(column_name, 0.75)    |
++---------------------------------------+
+| 45.5                                  |
++---------------------------------------+
+```"#,
+    standard_argument(name = "expression", prefix = "The"),
+    argument(
+        name = "percentile",
+        description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)."
+    )
+)]
+/// PERCENTILE_CONT aggregate expression. This uses an exact calculation and stores all values
+/// in memory before computing the result. If an approximation is sufficient then
+/// APPROX_PERCENTILE_CONT provides a much more efficient solution.
+///
+/// If using the distinct variation, the memory usage will be similarly high if the
+/// cardinality is high as it stores all distinct values in memory before computing the
+/// result, but if cardinality is low then memory usage will also be lower.
+#[derive(PartialEq, Eq, Hash, Debug)]
+pub struct PercentileCont {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for PercentileCont {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PercentileCont {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Float,
+                        vec![TypeSignatureClass::Numeric],
+                        NativeType::Float64,
+                    ),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_float64()),
+                        vec![TypeSignatureClass::Numeric],
+                        NativeType::Float64,
+                    ),
+                ],
+                Volatility::Immutable,
+            )
+            .with_parameter_names(vec!["expr", "percentile"])
+            .unwrap(),
+            aliases: vec![String::from("quantile_cont")],
+        }
+    }
+}
+
+impl AggregateUDFImpl for PercentileCont {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "percentile_cont"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match &arg_types[0] {
+            DataType::Null => Ok(DataType::Float64),
+            dt => Ok(dt.clone()),
+        }
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        let input_type = args.input_fields[0].data_type().clone();
+        if input_type.is_null() {
+            return Ok(vec![
+                Field::new(
+                    format_state_name(args.name, self.name()),
+                    DataType::Null,
+                    true,
+                )
+                .into(),
+            ]);
+        }
+
+        let field = Field::new_list_field(input_type, true);
+        let state_name = if args.is_distinct {
+            "distinct_percentile_cont"
+        } else {
+            "percentile_cont"
+        };
+
+        Ok(vec![
+            Field::new(
+                format_state_name(args.name, state_name),
+                DataType::List(Arc::new(field)),
+                true,
+            )
+            .into(),
+        ])
+    }
+
+    fn accumulator(&self, args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        let percentile = get_percentile(&args)?;
+
+        let input_dt = args.expr_fields[0].data_type();
+        if input_dt.is_null() {
+            return Ok(Box::new(NoopAccumulator::new(ScalarValue::Float64(None))));
+        }
+
+        if args.is_distinct {
+            match input_dt {
+                DataType::Float16 => Ok(Box::new(DistinctPercentileContAccumulator::<
+                    Float16Type,
+                >::new(percentile))),
+                DataType::Float32 => Ok(Box::new(DistinctPercentileContAccumulator::<
+                    Float32Type,
+                >::new(percentile))),
+                DataType::Float64 => Ok(Box::new(DistinctPercentileContAccumulator::<
+                    Float64Type,
+                >::new(percentile))),
+                dt => internal_err!("Unsupported datatype for percentile cont: {dt}"),
+            }
+        } else {
+            match input_dt {
+                DataType::Float16 => Ok(Box::new(
+                    PercentileContAccumulator::<Float16Type>::new(percentile),
+                )),
+                DataType::Float32 => Ok(Box::new(
+                    PercentileContAccumulator::<Float32Type>::new(percentile),
+                )),
+                DataType::Float64 => Ok(Box::new(
+                    PercentileContAccumulator::<Float64Type>::new(percentile),
+                )),
+                dt => internal_err!("Unsupported datatype for percentile cont: {dt}"),
+            }
+        }
+    }
+
+    fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
+        !args.is_distinct && !args.expr_fields[0].data_type().is_null()
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
+        let percentile = get_percentile(&args)?;
+
+        let input_dt = args.expr_fields[0].data_type();
+        match input_dt {
+            DataType::Float16 => Ok(Box::new(PercentileContGroupsAccumulator::<
+                Float16Type,
+            >::new(percentile))),
+            DataType::Float32 => Ok(Box::new(PercentileContGroupsAccumulator::<
+                Float32Type,
+            >::new(percentile))),
+            DataType::Float64 => Ok(Box::new(PercentileContGroupsAccumulator::<
+                Float64Type,
+            >::new(percentile))),
+            dt => internal_err!("Unsupported datatype for percentile cont: {dt}"),
+        }
+    }
+
+    fn simplify(&self) -> Option<AggregateFunctionSimplification> {
+        Some(Box::new(|aggregate_function, info| {
+            simplify_percentile_cont_aggregate(aggregate_function, info)
+        }))
+    }
+
+    fn supports_within_group_clause(&self) -> bool {
+        true
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+fn get_percentile(args: &AccumulatorArgs) -> Result<f64> {
+    let percentile = validate_percentile_expr(&args.exprs[1], "PERCENTILE_CONT")?;
+
+    let is_descending = args
+        .order_bys
+        .first()
+        .map(|sort_expr| sort_expr.options.descending)
+        .unwrap_or(false);
+
+    let percentile = if is_descending {
+        1.0 - percentile
+    } else {
+        percentile
+    };
+
+    Ok(percentile)
+}
+
+fn simplify_percentile_cont_aggregate(
+    aggregate_function: AggregateFunction,
+    info: &SimplifyContext,
+) -> Result<Expr> {
+    enum PercentileRewriteTarget {
+        Min,
+        Max,
+    }
+
+    let params = &aggregate_function.params;
+    let [value, percentile] = take_function_args("percentile_cont", &params.args)?;
+    //
+    // For simplicity we don't bother with null types (otherwise we'd need to
+    // cast the return type)
+    let input_type = info.get_data_type(value)?;
+    if input_type.is_null() {
+        return Ok(Expr::AggregateFunction(aggregate_function));
+    }
+
+    let is_descending = params
+        .order_by
+        .first()
+        .map(|sort| !sort.asc)
+        .unwrap_or(false);
+
+    let rewrite_target = match percentile {
+        Expr::Literal(ScalarValue::Float64(Some(0.0)), _) => {
+            if is_descending {
+                PercentileRewriteTarget::Max
+            } else {
+                PercentileRewriteTarget::Min
+            }
+        }
+        Expr::Literal(ScalarValue::Float64(Some(1.0)), _) => {
+            if is_descending {
+                PercentileRewriteTarget::Min
+            } else {
+                PercentileRewriteTarget::Max
+            }
+        }
+        _ => return Ok(Expr::AggregateFunction(aggregate_function)),
+    };
+
+    let udaf = match rewrite_target {
+        PercentileRewriteTarget::Min => min_udaf(),
+        PercentileRewriteTarget::Max => max_udaf(),
+    };
+
+    let rewritten = Expr::AggregateFunction(AggregateFunction::new_udf(
+        udaf,
+        vec![value.clone()],
+        params.distinct,
+        params.filter.clone(),
+        vec![],
+        params.null_treatment,
+    ));
+    Ok(rewritten)
+}
+
+/// The percentile_cont accumulator accumulates the raw input values
+/// as native types.
+///
+/// The intermediate state is represented as a List of scalar values updated by
+/// `merge_batch` and a `Vec` of native values that are converted to scalar values
+/// in the final evaluation step so that we avoid expensive conversions and
+/// allocations during `update_batch`.
+#[derive(Debug)]
+struct PercentileContAccumulator<T: ArrowNumericType + Debug> {
+    all_values: Vec<T::Native>,
+    percentile: f64,
+}
+
+impl<T: ArrowNumericType + Debug> PercentileContAccumulator<T> {
+    fn new(percentile: f64) -> Self {
+        Self {
+            all_values: vec![],
+            percentile,
+        }
+    }
+}
+
+impl<T> Accumulator for PercentileContAccumulator<T>
+where
+    T: ArrowNumericType + Debug,
+    T::Native: Copy + AsPrimitive<f64>,
+    f64: AsPrimitive<T::Native>,
+{
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        // Convert `all_values` to `ListArray` and return a single List ScalarValue
+
+        // Build offsets
+        let offsets =
+            OffsetBuffer::new(ScalarBuffer::from(vec![0, self.all_values.len() as i32]));
+
+        // Build inner array
+        let values_array = PrimitiveArray::<T>::new(
+            ScalarBuffer::from(std::mem::take(&mut self.all_values)),
+            None,
+        );
+
+        // Build the result list array
+        let list_array = ListArray::new(
+            Arc::new(Field::new_list_field(T::DATA_TYPE, true)),
+            offsets,
+            Arc::new(values_array),
+            None,
+        );
+
+        Ok(vec![ScalarValue::List(Arc::new(list_array))])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let values = values[0].as_primitive::<T>();
+        self.all_values.reserve(values.len() - values.null_count());
+        self.all_values.extend(values.iter().flatten());
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        let array = states[0].as_list::<i32>();
+        self.update_batch(&[array.value(0)])?;
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let value = calculate_percentile::<T>(&mut self.all_values, self.percentile);
+        ScalarValue::new_primitive::<T>(value, &T::DATA_TYPE)
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + self.all_values.capacity() * size_of::<T::Native>()
+    }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let mut to_remove: HashMap<ScalarValue, usize> = HashMap::new();
+        for i in 0..values[0].len() {
+            let v = ScalarValue::try_from_array(&values[0], i)?;
+            if !v.is_null() {
+                *to_remove.entry(v).or_default() += 1;
+            }
+        }
+
+        let mut i = 0;
+        while i < self.all_values.len() {
+            let k =
+                ScalarValue::new_primitive::<T>(Some(self.all_values[i]), &T::DATA_TYPE)?;
+            if let Some(count) = to_remove.get_mut(&k)
+                && *count > 0
+            {
+                self.all_values.swap_remove(i);
+                *count -= 1;
+                if *count == 0 {
+                    to_remove.remove(&k);
+                    if to_remove.is_empty() {
+                        break;
+                    }
+                }
+            } else {
+                i += 1;
+            }
+        }
+        Ok(())
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        true
+    }
+}
+
+/// The percentile_cont groups accumulator accumulates the raw input values
+///
+/// For calculating the exact percentile of groups, we need to store all values
+/// of groups before final evaluation.
+/// So values in each group will be stored in a `Vec<T>`, and the total group values
+/// will be actually organized as a `Vec<Vec<T>>`.
+#[derive(Debug)]
+struct PercentileContGroupsAccumulator<T: ArrowNumericType + Send> {
+    group_values: Vec<Vec<T::Native>>,
+    percentile: f64,
+}
+
+impl<T: ArrowNumericType + Send> PercentileContGroupsAccumulator<T> {
+    fn new(percentile: f64) -> Self {
+        Self {
+            group_values: vec![],
+            percentile,
+        }
+    }
+}
+
+impl<T> GroupsAccumulator for PercentileContGroupsAccumulator<T>
+where
+    T: ArrowNumericType + Send,
+    T::Native: Copy + AsPrimitive<f64>,
+    f64: AsPrimitive<T::Native>,
+{
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        // For ordered-set aggregates, we only care about the ORDER BY column (first element)
+        // The percentile parameter is already stored in self.percentile
+
+        let values = values[0].as_primitive::<T>();
+
+        // Push the `not nulls + not filtered` row into its group
+        self.group_values.resize(total_num_groups, Vec::new());
+        accumulate(
+            group_indices,
+            values,
+            opt_filter,
+            |group_index, new_value| {
+                self.group_values[group_index].push(new_value);
+            },
+        );
+
+        Ok(())
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        // Since aggregate filter should be applied in partial stage, in final stage there should be no filter
+        _opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 1, "one argument to merge_batch");
+
+        let input_group_values = values[0].as_list::<i32>();
+
+        // Ensure group values big enough
+        self.group_values.resize(total_num_groups, Vec::new());
+
+        // Extend values to related groups
+        group_indices
+            .iter()
+            .zip(input_group_values.iter())
+            .for_each(|(&group_index, values_opt)| {
+                if let Some(values) = values_opt {
+                    let values = values.as_primitive::<T>();
+                    self.group_values[group_index].extend(values.values().iter());
+                }
+            });
+
+        Ok(())
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        // Emit values
+        let emit_group_values = emit_to.take_needed(&mut self.group_values);
+
+        // Build offsets
+        let mut offsets = Vec::with_capacity(self.group_values.len() + 1);
+        offsets.push(0);
+        let mut cur_len = 0_i32;
+        for group_value in &emit_group_values {
+            cur_len += group_value.len() as i32;
+            offsets.push(cur_len);
+        }
+        let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets));
+
+        // Build inner array
+        let flatten_group_values =
+            emit_group_values.into_iter().flatten().collect::<Vec<_>>();
+        let group_values_array =
+            PrimitiveArray::<T>::new(ScalarBuffer::from(flatten_group_values), None);
+
+        // Build the result list array
+        let result_list_array = ListArray::new(
+            Arc::new(Field::new_list_field(T::DATA_TYPE, true)),
+            offsets,
+            Arc::new(group_values_array),
+            None,
+        );
+
+        Ok(vec![Arc::new(result_list_array)])
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        // Emit values
+        let mut emit_group_values = emit_to.take_needed(&mut self.group_values);
+
+        // Calculate percentile for each group
+        let mut evaluate_result_builder =
+            PrimitiveBuilder::<T>::with_capacity(emit_group_values.len());
+        for values in &mut emit_group_values {
+            let value = calculate_percentile::<T>(values.as_mut_slice(), self.percentile);
+            evaluate_result_builder.append_option(value);
+        }
+
+        Ok(Arc::new(evaluate_result_builder.finish()))
+    }
+
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<Vec<ArrayRef>> {
+        assert_eq!(values.len(), 1, "one argument to merge_batch");
+
+        let input_array = values[0].as_primitive::<T>();
+
+        // Directly convert the input array to states, each row will be
+        // seen as a respective group.
+        // For detail, the `input_array` will be converted to a `ListArray`.
+        // And if row is `not null + not filtered`, it will be converted to a list
+        // with only one element; otherwise, this row in `ListArray` will be set
+        // to null.
+
+        // Reuse values buffer in `input_array` to build `values` in `ListArray`
+        let values = PrimitiveArray::<T>::new(input_array.values().clone(), None);
+
+        // `offsets` in `ListArray`, each row as a list element
+        let offset_end = i32::try_from(input_array.len()).map_err(|e| {
+            internal_datafusion_err!(
+                "cast array_len to i32 failed in convert_to_state of group percentile_cont, err:{e:?}"
+            )
+        })?;
+        let offsets = (0..=offset_end).collect::<Vec<_>>();
+        // Safety: The offsets vector is constructed as a sequential range from 0 to input_array.len(),
+        // which guarantees all OffsetBuffer invariants:
+        // 1. Offsets are monotonically increasing (each element is prev + 1)
+        // 2. No offset exceeds the values array length (max offset = input_array.len())
+        // 3. First offset is 0 and last offset equals the total length
+        // Therefore new_unchecked is safe to use here.
+        let offsets = unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(offsets)) };
+
+        // `nulls` for converted `ListArray`
+        let nulls = filtered_null_mask(opt_filter, input_array);
+
+        let converted_list_array = ListArray::new(
+            Arc::new(Field::new_list_field(T::DATA_TYPE, true)),
+            offsets,
+            Arc::new(values),
+            nulls,
+        );
+
+        Ok(vec![Arc::new(converted_list_array)])
+    }
+
+    fn supports_convert_to_state(&self) -> bool {
+        true
+    }
+
+    fn size(&self) -> usize {
+        self.group_values
+            .iter()
+            .map(|values| values.capacity() * size_of::<T::Native>())
+            .sum::<usize>()
+            // account for size of self.group_values too
+            + self.group_values.capacity() * size_of::<Vec<T::Native>>()
+    }
+}
+
+#[derive(Debug)]
+struct DistinctPercentileContAccumulator<T: ArrowNumericType> {
+    distinct_values: GenericDistinctBuffer<T>,
+    percentile: f64,
+}
+
+impl<T: ArrowNumericType + Debug> DistinctPercentileContAccumulator<T> {
+    fn new(percentile: f64) -> Self {
+        Self {
+            distinct_values: GenericDistinctBuffer::new(T::DATA_TYPE),
+            percentile,
+        }
+    }
+}
+
+impl<T> Accumulator for DistinctPercentileContAccumulator<T>
+where
+    T: ArrowNumericType + Debug,
+    T::Native: Copy + AsPrimitive<f64>,
+    f64: AsPrimitive<T::Native>,
+{
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.distinct_values.state()
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.distinct_values.update_batch(values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.distinct_values.merge_batch(states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let mut values: Vec<T::Native> =
+            self.distinct_values.values.iter().map(|v| v.0).collect();
+        let value = calculate_percentile::<T>(&mut values, self.percentile);
+        ScalarValue::new_primitive::<T>(value, &T::DATA_TYPE)
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + self.distinct_values.size()
+    }
+
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        if values.is_empty() {
+            return Ok(());
+        }
+
+        let arr = values[0].as_primitive::<T>();
+        for value in arr.iter().flatten() {
+            self.distinct_values.values.remove(&Hashable(value));
+        }
+        Ok(())
+    }
+
+    fn supports_retract_batch(&self) -> bool {
+        true
+    }
+}
+
+/// Calculate the percentile value for a given set of values.
+/// This function performs an exact calculation by sorting all values.
+///
+/// The percentile is calculated using linear interpolation between closest ranks.
+/// For percentile p and n values:
+/// - If p * (n-1) is an integer, return the value at that position
+/// - Otherwise, interpolate between the two closest values
+///
+/// Note: This function takes a mutable slice and sorts it in place, but does not
+/// consume the data. This is important for window frame queries where evaluate()
+/// may be called multiple times on the same accumulator state.
+fn calculate_percentile<T: ArrowNumericType>(
+    values: &mut [T::Native],
+    percentile: f64,
+) -> Option<T::Native>
+where
+    T::Native: Copy + AsPrimitive<f64>,
+    f64: AsPrimitive<T::Native>,
+{
+    let cmp = |x: &T::Native, y: &T::Native| x.compare(*y);
+
+    let len = values.len();
+    if len == 0 {
+        None
+    } else if len == 1 {
+        Some(values[0])
+    } else if percentile == 0.0 {
+        // Get minimum value
+        Some(
+            *values
+                .iter()
+                .min_by(|a, b| cmp(a, b))
+                .expect("we checked for len > 0 a few lines above"),
+        )
+    } else if percentile == 1.0 {
+        // Get maximum value
+        Some(
+            *values
+                .iter()
+                .max_by(|a, b| cmp(a, b))
+                .expect("we checked for len > 0 a few lines above"),
+        )
+    } else {
+        // Calculate the index using the formula: p * (n - 1)
+        let index = percentile * ((len - 1) as f64);
+        let lower_index = index.floor() as usize;
+        let upper_index = index.ceil() as usize;
+
+        if lower_index == upper_index {
+            // Exact index, return the value at that position
+            let (_, value, _) = values.select_nth_unstable_by(lower_index, cmp);
+            Some(*value)
+        } else {
+            // Need to interpolate between two values
+            // First, partition at lower_index to get the lower value
+            let (_, lower_value, _) = values.select_nth_unstable_by(lower_index, cmp);
+            let lower_value = *lower_value;
+
+            // Then partition at upper_index to get the upper value
+            let (_, upper_value, _) = values.select_nth_unstable_by(upper_index, cmp);
+            let upper_value = *upper_value;
+
+            // Linear interpolation.
+            // We compute a quantized interpolation weight using `INTERPOLATION_PRECISION` because:
+            // 1. Both values come from the input data, so (upper - lower) is bounded by the value range
+            // 2. fraction is between 0 and 1; quantizing it provides stable, predictable results
+            // 3. The result is guaranteed to be between lower_value and upper_value (modulo cast rounding)
+            // 4. Arithmetic is performed in f64 and cast back to avoid overflowing Float16 intermediates
+            let fraction = index - (lower_index as f64);
+            let scaled = (fraction * INTERPOLATION_PRECISION) as usize;
+            let weight = scaled as f64 / INTERPOLATION_PRECISION;
+
+            let lower_f: f64 = lower_value.as_();
+            let upper_f: f64 = upper_value.as_();
+            let interpolated_f = lower_f + (upper_f - lower_f) * weight;
+            Some(interpolated_f.as_())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::calculate_percentile;
+    use half::f16;
+
+    #[test]
+    fn f16_interpolation_does_not_overflow_to_nan() {
+        // Regression test for https://github.com/apache/datafusion/issues/18945
+        // Interpolating between 0 and the max finite f16 value previously overflowed
+        // intermediate f16 computations and produced NaN.
+        let mut values = vec![f16::from_f32(0.0), f16::from_f32(65504.0)];
+        let result =
+            calculate_percentile::<arrow::datatypes::Float16Type>(&mut values, 0.5)
+                .expect("non-empty input");
+        let result_f = result.to_f32();
+        assert!(
+            !result_f.is_nan(),
+            "expected non-NaN result, got {result_f}"
+        );
+        // 0.5 percentile should be close to midpoint
+        assert!(
+            (result_f - 32752.0).abs() < 1.0,
+            "unexpected result {result_f}"
+        );
+    }
+}
diff --git a/datafusion/functions-aggregate/src/planner.rs b/datafusion/functions-aggregate/src/planner.rs
index c8cb841189954..8a6d9b9bb1e9f 100644
--- a/datafusion/functions-aggregate/src/planner.rs
+++ b/datafusion/functions-aggregate/src/planner.rs
@@ -19,11 +19,11 @@
 
 use datafusion_common::Result;
 use datafusion_expr::{
+    Expr,
     expr::{AggregateFunction, AggregateFunctionParams},
     expr_rewriter::NamePreserver,
     planner::{ExprPlanner, PlannerResult, RawAggregateExpr},
     utils::COUNT_STAR_EXPANSION,
-    Expr,
 };
 
 #[derive(Debug)]
@@ -100,7 +100,7 @@ impl ExprPlanner for AggregateFunctionPlanner {
 
             let new_expr = Expr::AggregateFunction(AggregateFunction::new_udf(
                 func,
-                vec![Expr::Literal(COUNT_STAR_EXPANSION)],
+                vec![Expr::Literal(COUNT_STAR_EXPANSION, None)],
                 distinct,
                 filter,
                 order_by,
diff --git a/datafusion/functions-aggregate/src/regr.rs b/datafusion/functions-aggregate/src/regr.rs
index 0f84aa1323f52..7fef8ac981be4 100644
--- a/datafusion/functions-aggregate/src/regr.rs
+++ b/datafusion/functions-aggregate/src/regr.rs
@@ -17,27 +17,19 @@
 
 //! Defines physical expressions that can evaluated at runtime during query execution
 
-use arrow::array::Float64Array;
 use arrow::datatypes::FieldRef;
-use arrow::{
-    array::{ArrayRef, UInt64Array},
-    compute::cast,
-    datatypes::DataType,
-    datatypes::Field,
-};
-use datafusion_common::{
-    downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, HashMap, Result,
-    ScalarValue,
-};
-use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
+use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
+use datafusion_common::cast::{as_float64_array, as_uint64_array};
+use datafusion_common::{HashMap, Result, ScalarValue};
+use datafusion_doc::aggregate_doc_sections::DOC_SECTION_STATISTICAL;
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
-use datafusion_expr::type_coercion::aggregates::NUMERICS;
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
     Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
 };
 use std::any::Any;
 use std::fmt::Debug;
+use std::hash::Hash;
 use std::mem::size_of_val;
 use std::sync::{Arc, LazyLock};
 
@@ -58,25 +50,20 @@ make_regr_udaf_expr_and_func!(regr_sxx, regr_sxx_udaf, RegrType::SXX);
 make_regr_udaf_expr_and_func!(regr_syy, regr_syy_udaf, RegrType::SYY);
 make_regr_udaf_expr_and_func!(regr_sxy, regr_sxy_udaf, RegrType::SXY);
 
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct Regr {
     signature: Signature,
     regr_type: RegrType,
     func_name: &'static str,
 }
 
-impl Debug for Regr {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("regr")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Regr {
     pub fn new(regr_type: RegrType, func_name: &'static str) -> Self {
         Self {
-            signature: Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable),
+            signature: Signature::exact(
+                vec![DataType::Float64, DataType::Float64],
+                Volatility::Immutable,
+            ),
             regr_type,
             func_name,
         }
@@ -84,7 +71,6 @@ impl Regr {
 }
 
 #[derive(Debug, Clone, PartialEq, Hash, Eq)]
-#[allow(clippy::upper_case_acronyms)]
 pub enum RegrType {
     /// Variant for `regr_slope` aggregate expression
     /// Returns the slope of the linear regression line for non-null pairs in aggregate columns.
@@ -143,6 +129,29 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
                     Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k*X + b) using minimal RSS fitting.",
 
                 "regr_slope(expression_y, expression_x)")
+                .with_sql_example(
+                    r#"```sql
+create table weekly_performance(day int, user_signups int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++-----+--------------+
+| day | user_signups |
++-----+--------------+
+| 1   | 60           |
+| 2   | 65           |
+| 3   | 70           |
+| 4   | 75           |
+| 5   | 80           |
++-----+--------------+
+
+SELECT regr_slope(user_signups, day) AS slope FROM weekly_performance;
++--------+
+| slope  |
++--------+
+| 5.0    |
++--------+
+```
+"#
+                )
                 .with_standard_argument("expression_y", Some("Dependent variable"))
                 .with_standard_argument("expression_x", Some("Independent variable"))
                 .build()
@@ -156,6 +165,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
                     this function returns b.",
 
                 "regr_intercept(expression_y, expression_x)")
+                .with_sql_example(
+                    r#"```sql
+create table weekly_performance(week int, productivity_score int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++------+---------------------+
+| week | productivity_score  |
+| ---- | ------------------- |
+| 1    | 60                  |
+| 2    | 65                  |
+| 3    | 70                  |
+| 4    | 75                  |
+| 5    | 80                  |
++------+---------------------+
+
+SELECT regr_intercept(productivity_score, week) AS intercept FROM weekly_performance;
++----------+
+|intercept|
+|intercept |
++----------+
+|  55      |
++----------+
+```
+"#
+                )
                 .with_standard_argument("expression_y", Some("Dependent variable"))
                 .with_standard_argument("expression_x", Some("Independent variable"))
                 .build()
@@ -168,6 +201,29 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
             "Counts the number of non-null paired data points.",
             "regr_count(expression_y, expression_x)",
         )
+        .with_sql_example(
+            r#"```sql
+create table daily_metrics(day int, user_signups int) as values (1,100), (2,120), (3, NULL), (4,110), (5,NULL);
+select * from daily_metrics;
++-----+---------------+
+| day | user_signups  |
+| --- | ------------- |
+| 1   | 100           |
+| 2   | 120           |
+| 3   | NULL          |
+| 4   | 110           |
+| 5   | NULL          |
++-----+---------------+
+
+SELECT regr_count(user_signups, day) AS valid_pairs FROM daily_metrics;
++-------------+
+| valid_pairs |
++-------------+
+| 3           |
++-------------+
+```
+"#
+        )
         .with_standard_argument("expression_y", Some("Dependent variable"))
         .with_standard_argument("expression_x", Some("Independent variable"))
         .build(),
@@ -180,6 +236,29 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
                     "Computes the square of the correlation coefficient between the independent and dependent variables.",
 
                 "regr_r2(expression_y, expression_x)")
+                .with_sql_example(
+                    r#"```sql
+create table weekly_performance(day int ,user_signups int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++-----+--------------+
+| day | user_signups |
++-----+--------------+
+| 1   | 60           |
+| 2   | 65           |
+| 3   | 70           |
+| 4   | 75           |
+| 5   | 80           |
++-----+--------------+
+
+SELECT regr_r2(user_signups, day) AS r_squared FROM weekly_performance;
++---------+
+|r_squared|
++---------+
+| 1.0     |
++---------+
+```
+"#
+                )
                 .with_standard_argument("expression_y", Some("Dependent variable"))
                 .with_standard_argument("expression_x", Some("Independent variable"))
                 .build()
@@ -192,6 +271,29 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
                     "Computes the average of the independent variable (input) expression_x for the non-null paired data points.",
 
                 "regr_avgx(expression_y, expression_x)")
+                .with_sql_example(
+                    r#"```sql
+create table daily_sales(day int, total_sales int) as values (1,100), (2,150), (3,200), (4,NULL), (5,250);
+select * from daily_sales;
++-----+-------------+
+| day | total_sales |
+| --- | ----------- |
+| 1   | 100         |
+| 2   | 150         |
+| 3   | 200         |
+| 4   | NULL        |
+| 5   | 250         |
++-----+-------------+
+
+SELECT regr_avgx(total_sales, day) AS avg_day FROM daily_sales;
++----------+
+| avg_day  |
++----------+
+|   2.75   |
++----------+
+```
+"#
+                )
                 .with_standard_argument("expression_y", Some("Dependent variable"))
                 .with_standard_argument("expression_x", Some("Independent variable"))
                 .build()
@@ -204,6 +306,30 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
                     "Computes the average of the dependent variable (output) expression_y for the non-null paired data points.",
 
                 "regr_avgy(expression_y, expression_x)")
+                .with_sql_example(
+                    r#"```sql
+create table daily_temperature(day int, temperature int) as values (1,30), (2,32), (3, NULL), (4,35), (5,36);
+select * from daily_temperature;
++-----+-------------+
+| day | temperature |
+| --- | ----------- |
+| 1   | 30          |
+| 2   | 32          |
+| 3   | NULL        |
+| 4   | 35          |
+| 5   | 36          |
++-----+-------------+
+
+-- temperature as Dependent Variable(Y), day as Independent Variable(X)
+SELECT regr_avgy(temperature, day) AS avg_temperature FROM daily_temperature;
++-----------------+
+| avg_temperature |
++-----------------+
+| 33.25           |
++-----------------+
+```
+"#
+                )
                 .with_standard_argument("expression_y", Some("Dependent variable"))
                 .with_standard_argument("expression_x", Some("Independent variable"))
                 .build()
@@ -216,6 +342,29 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
             "Computes the sum of squares of the independent variable.",
             "regr_sxx(expression_y, expression_x)",
         )
+        .with_sql_example(
+            r#"```sql
+create table study_hours(student_id int, hours int, test_score int) as values (1,2,55), (2,4,65), (3,6,75), (4,8,85), (5,10,95);
+select * from study_hours;
++------------+-------+------------+
+| student_id | hours | test_score |
++------------+-------+------------+
+| 1          | 2     | 55         |
+| 2          | 4     | 65         |
+| 3          | 6     | 75         |
+| 4          | 8     | 85         |
+| 5          | 10    | 95         |
++------------+-------+------------+
+
+SELECT regr_sxx(test_score, hours) AS sxx FROM study_hours;
++------+
+| sxx  |
++------+
+| 40.0 |
++------+
+```
+"#
+        )
         .with_standard_argument("expression_y", Some("Dependent variable"))
         .with_standard_argument("expression_x", Some("Independent variable"))
         .build(),
@@ -228,6 +377,27 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
             "Computes the sum of squares of the dependent variable.",
             "regr_syy(expression_y, expression_x)",
         )
+        .with_sql_example(
+            r#"```sql
+create table employee_productivity(week int, productivity_score int) as values (1,60), (2,65), (3,70);
+select * from employee_productivity;
++------+--------------------+
+| week | productivity_score |
++------+--------------------+
+| 1    | 60                 |
+| 2    | 65                 |
+| 3    | 70                 |
++------+--------------------+
+
+SELECT regr_syy(productivity_score, week) AS sum_squares_y FROM employee_productivity;
++---------------+
+| sum_squares_y |
++---------------+
+|    50.0       |
++---------------+
+```
+"#
+        )
         .with_standard_argument("expression_y", Some("Dependent variable"))
         .with_standard_argument("expression_x", Some("Independent variable"))
         .build(),
@@ -240,6 +410,27 @@ static DOCUMENTATION: LazyLock<HashMap<RegrType, Documentation>> = LazyLock::new
             "Computes the sum of products of paired data points.",
             "regr_sxy(expression_y, expression_x)",
         )
+        .with_sql_example(
+            r#"```sql
+create table employee_productivity(week int, productivity_score int) as values(1,60), (2,65), (3,70);
+select * from employee_productivity;
++------+--------------------+
+| week | productivity_score |
++------+--------------------+
+| 1    | 60                 |
+| 2    | 65                 |
+| 3    | 70                 |
++------+--------------------+
+
+SELECT regr_sxy(productivity_score, week) AS sum_product_deviations FROM employee_productivity;
++------------------------+
+| sum_product_deviations |
++------------------------+
+|       10.0             |
++------------------------+
+```
+"#
+        )
         .with_standard_argument("expression_y", Some("Dependent variable"))
         .with_standard_argument("expression_x", Some("Independent variable"))
         .build(),
@@ -263,12 +454,8 @@ impl AggregateUDFImpl for Regr {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("Covariance requires numeric input types");
-        }
-
-        if matches!(self.regr_type, RegrType::Count) {
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        if self.regr_type == RegrType::Count {
             Ok(DataType::UInt64)
         } else {
             Ok(DataType::Float64)
@@ -401,32 +588,18 @@ impl Accumulator for RegrAccumulator {
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         // regr_slope(Y, X) calculates k in y = k*x + b
-        let values_y = &cast(&values[0], &DataType::Float64)?;
-        let values_x = &cast(&values[1], &DataType::Float64)?;
-
-        let mut arr_y = downcast_value!(values_y, Float64Array).iter().flatten();
-        let mut arr_x = downcast_value!(values_x, Float64Array).iter().flatten();
+        let values_y = as_float64_array(&values[0])?;
+        let values_x = as_float64_array(&values[1])?;
 
-        for i in 0..values_y.len() {
+        for (value_y, value_x) in values_y.iter().zip(values_x) {
             // skip either x or y is NULL
-            let value_y = if values_y.is_valid(i) {
-                arr_y.next()
-            } else {
-                None
-            };
-            let value_x = if values_x.is_valid(i) {
-                arr_x.next()
-            } else {
-                None
+            let (value_y, value_x) = match (value_y, value_x) {
+                (Some(y), Some(x)) => (y, x),
+                // skip either x or y is NULL
+                _ => continue,
             };
-            if value_y.is_none() || value_x.is_none() {
-                continue;
-            }
 
             // Update states for regr_slope(y,x) [using cov_pop(x,y)/var_pop(x)]
-            let value_y = unwrap_or_internal_err!(value_y);
-            let value_x = unwrap_or_internal_err!(value_x);
-
             self.count += 1;
             let delta_x = value_x - self.mean_x;
             let delta_y = value_y - self.mean_y;
@@ -447,32 +620,18 @@ impl Accumulator for RegrAccumulator {
     }
 
     fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values_y = &cast(&values[0], &DataType::Float64)?;
-        let values_x = &cast(&values[1], &DataType::Float64)?;
+        let values_y = as_float64_array(&values[0])?;
+        let values_x = as_float64_array(&values[1])?;
 
-        let mut arr_y = downcast_value!(values_y, Float64Array).iter().flatten();
-        let mut arr_x = downcast_value!(values_x, Float64Array).iter().flatten();
-
-        for i in 0..values_y.len() {
+        for (value_y, value_x) in values_y.iter().zip(values_x) {
             // skip either x or y is NULL
-            let value_y = if values_y.is_valid(i) {
-                arr_y.next()
-            } else {
-                None
-            };
-            let value_x = if values_x.is_valid(i) {
-                arr_x.next()
-            } else {
-                None
+            let (value_y, value_x) = match (value_y, value_x) {
+                (Some(y), Some(x)) => (y, x),
+                // skip either x or y is NULL
+                _ => continue,
             };
-            if value_y.is_none() || value_x.is_none() {
-                continue;
-            }
 
             // Update states for regr_slope(y,x) [using cov_pop(x,y)/var_pop(x)]
-            let value_y = unwrap_or_internal_err!(value_y);
-            let value_x = unwrap_or_internal_err!(value_x);
-
             if self.count > 1 {
                 self.count -= 1;
                 let delta_x = value_x - self.mean_x;
@@ -498,12 +657,12 @@ impl Accumulator for RegrAccumulator {
     }
 
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        let count_arr = downcast_value!(states[0], UInt64Array);
-        let mean_x_arr = downcast_value!(states[1], Float64Array);
-        let mean_y_arr = downcast_value!(states[2], Float64Array);
-        let m2_x_arr = downcast_value!(states[3], Float64Array);
-        let m2_y_arr = downcast_value!(states[4], Float64Array);
-        let algo_const_arr = downcast_value!(states[5], Float64Array);
+        let count_arr = as_uint64_array(&states[0])?;
+        let mean_x_arr = as_float64_array(&states[1])?;
+        let mean_y_arr = as_float64_array(&states[2])?;
+        let m2_x_arr = as_float64_array(&states[3])?;
+        let m2_y_arr = as_float64_array(&states[4])?;
+        let algo_const_arr = as_float64_array(&states[5])?;
 
         for i in 0..count_arr.len() {
             let count_b = count_arr.value(i);
diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs
index f948df840e73b..6f77e7df92547 100644
--- a/datafusion/functions-aggregate/src/stddev.rs
+++ b/datafusion/functions-aggregate/src/stddev.rs
@@ -18,15 +18,16 @@
 //! Defines physical expressions that can evaluated at runtime during query execution
 
 use std::any::Any;
-use std::fmt::{Debug, Formatter};
+use std::fmt::Debug;
+use std::hash::Hash;
 use std::mem::align_of_val;
 use std::sync::Arc;
 
 use arrow::array::Float64Array;
 use arrow::datatypes::FieldRef;
 use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field};
-use datafusion_common::{internal_err, not_impl_err, Result};
-use datafusion_common::{plan_err, ScalarValue};
+use datafusion_common::ScalarValue;
+use datafusion_common::{Result, internal_err, not_impl_err};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
@@ -61,20 +62,12 @@ make_udaf_expr_and_func!(
     standard_argument(name = "expression",)
 )]
 /// STDDEV and STDDEV_SAMP (standard deviation) aggregate expression
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct Stddev {
     signature: Signature,
     alias: Vec<String>,
 }
 
-impl Debug for Stddev {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Stddev")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for Stddev {
     fn default() -> Self {
         Self::new()
@@ -85,7 +78,7 @@ impl Stddev {
     /// Create a new STDDEV aggregate function
     pub fn new() -> Self {
         Self {
-            signature: Signature::numeric(1, Volatility::Immutable),
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
             alias: vec!["stddev_samp".to_string()],
         }
     }
@@ -178,19 +171,11 @@ make_udaf_expr_and_func!(
     standard_argument(name = "expression",)
 )]
 /// STDDEV_POP population aggregate expression
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct StddevPop {
     signature: Signature,
 }
 
-impl Debug for StddevPop {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("StddevPop")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for StddevPop {
     fn default() -> Self {
         Self::new()
@@ -201,7 +186,7 @@ impl StddevPop {
     /// Create a new STDDEV_POP aggregate function
     pub fn new() -> Self {
         Self {
-            signature: Signature::numeric(1, Volatility::Immutable),
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
         }
     }
 }
@@ -246,11 +231,7 @@ impl AggregateUDFImpl for StddevPop {
         Ok(Box::new(StddevAccumulator::try_new(StatsType::Population)?))
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("StddevPop requires numeric input types");
-        }
-
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
         Ok(DataType::Float64)
     }
 
@@ -315,13 +296,8 @@ impl Accumulator for StddevAccumulator {
     fn evaluate(&mut self) -> Result<ScalarValue> {
         let variance = self.variance.evaluate()?;
         match variance {
-            ScalarValue::Float64(e) => {
-                if e.is_none() {
-                    Ok(ScalarValue::Float64(None))
-                } else {
-                    Ok(ScalarValue::Float64(e.map(|f| f.sqrt())))
-                }
-            }
+            ScalarValue::Float64(None) => Ok(ScalarValue::Float64(None)),
+            ScalarValue::Float64(Some(f)) => Ok(ScalarValue::Float64(Some(f.sqrt()))),
             _ => internal_err!("Variance should be f64"),
         }
     }
@@ -393,7 +369,6 @@ mod tests {
     use datafusion_expr::AggregateUDF;
     use datafusion_functions_aggregate_common::utils::get_accum_scalar_values_as_arrays;
     use datafusion_physical_expr::expressions::col;
-    use datafusion_physical_expr_common::sort_expr::LexOrdering;
     use std::sync::Arc;
 
     #[test]
@@ -441,37 +416,46 @@ mod tests {
         agg2: Arc<AggregateUDF>,
         schema: &Schema,
     ) -> Result<ScalarValue> {
+        let expr = col("a", schema)?;
+        let expr_field = expr.return_field(schema)?;
+
         let args1 = AccumulatorArgs {
             return_field: Field::new("f", DataType::Float64, true).into(),
             schema,
+            expr_fields: &[Arc::clone(&expr_field)],
             ignore_nulls: false,
-            ordering_req: &LexOrdering::default(),
+            order_bys: &[],
             name: "a",
             is_distinct: false,
             is_reversed: false,
-            exprs: &[col("a", schema)?],
+            exprs: &[Arc::clone(&expr)],
         };
 
         let args2 = AccumulatorArgs {
             return_field: Field::new("f", DataType::Float64, true).into(),
             schema,
+            expr_fields: &[expr_field],
             ignore_nulls: false,
-            ordering_req: &LexOrdering::default(),
+            order_bys: &[],
             name: "a",
             is_distinct: false,
             is_reversed: false,
-            exprs: &[col("a", schema)?],
+            exprs: &[expr],
         };
 
         let mut accum1 = agg1.accumulator(args1)?;
         let mut accum2 = agg2.accumulator(args2)?;
 
-        let value1 = vec![col("a", schema)?
-            .evaluate(batch1)
-            .and_then(|v| v.into_array(batch1.num_rows()))?];
-        let value2 = vec![col("a", schema)?
-            .evaluate(batch2)
-            .and_then(|v| v.into_array(batch2.num_rows()))?];
+        let value1 = vec![
+            col("a", schema)?
+                .evaluate(batch1)
+                .and_then(|v| v.into_array(batch1.num_rows()))?,
+        ];
+        let value2 = vec![
+            col("a", schema)?
+                .evaluate(batch2)
+                .and_then(|v| v.into_array(batch2.num_rows()))?,
+        ];
 
         accum1.update_batch(&value1)?;
         accum2.update_batch(&value2)?;
diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs
index 4682e574bfa21..1c10818c091db 100644
--- a/datafusion/functions-aggregate/src/string_agg.rs
+++ b/datafusion/functions-aggregate/src/string_agg.rs
@@ -17,21 +17,28 @@
 
 //! [`StringAgg`] accumulator for the `string_agg` function
 
+use std::any::Any;
+use std::hash::Hash;
+use std::mem::size_of_val;
+
 use crate::array_agg::ArrayAgg;
+
 use arrow::array::ArrayRef;
 use arrow::datatypes::{DataType, Field, FieldRef};
-use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
-use datafusion_common::Result;
-use datafusion_common::{internal_err, not_impl_err, ScalarValue};
+use datafusion_common::cast::{
+    as_generic_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{
+    Result, ScalarValue, internal_datafusion_err, internal_err, not_impl_err,
+};
 use datafusion_expr::function::AccumulatorArgs;
+use datafusion_expr::utils::format_state_name;
 use datafusion_expr::{
     Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, Volatility,
 };
 use datafusion_functions_aggregate_common::accumulator::StateFieldsArgs;
 use datafusion_macros::user_doc;
 use datafusion_physical_expr::expressions::Literal;
-use std::any::Any;
-use std::mem::size_of_val;
 
 make_udaf_expr_and_func!(
     StringAgg,
@@ -80,7 +87,7 @@ This aggregation function can only mix DISTINCT and ORDER BY if the ordering exp
     )
 )]
 /// STRING_AGG aggregate expression
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct StringAgg {
     signature: Signature,
     array_agg: ArrayAgg,
@@ -118,6 +125,8 @@ impl Default for StringAgg {
     }
 }
 
+/// If there is no `distinct` and `order by` required by the `string_agg` call, a
+/// more efficient accumulator `SimpleStringAggAccumulator` will be used.
 impl AggregateUDFImpl for StringAgg {
     fn as_any(&self) -> &dyn Any {
         self
@@ -136,7 +145,23 @@ impl AggregateUDFImpl for StringAgg {
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        self.array_agg.state_fields(args)
+        // See comments in `impl AggregateUDFImpl ...` for more detail
+        let no_order_no_distinct =
+            (args.ordering_fields.is_empty()) && (!args.is_distinct);
+        if no_order_no_distinct {
+            // Case `SimpleStringAggAccumulator`
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, "string_agg"),
+                    DataType::LargeUtf8,
+                    true,
+                )
+                .into(),
+            ])
+        } else {
+            // Case `StringAggAccumulator`
+            self.array_agg.state_fields(args)
+        }
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
@@ -159,21 +184,44 @@ impl AggregateUDFImpl for StringAgg {
             );
         };
 
-        let array_agg_acc = self.array_agg.accumulator(AccumulatorArgs {
-            return_field: Field::new(
-                "f",
-                DataType::new_list(acc_args.return_field.data_type().clone(), true),
-                true,
-            )
-            .into(),
-            exprs: &filter_index(acc_args.exprs, 1),
-            ..acc_args
-        })?;
+        // See comments in `impl AggregateUDFImpl ...` for more detail
+        let no_order_no_distinct =
+            acc_args.order_bys.is_empty() && (!acc_args.is_distinct);
 
-        Ok(Box::new(StringAggAccumulator::new(
-            array_agg_acc,
-            delimiter,
-        )))
+        if no_order_no_distinct {
+            // simple case (more efficient)
+            Ok(Box::new(SimpleStringAggAccumulator::new(delimiter)))
+        } else {
+            // general case
+            let array_agg_acc = self.array_agg.accumulator(AccumulatorArgs {
+                return_field: Field::new(
+                    "f",
+                    DataType::new_list(acc_args.return_field.data_type().clone(), true),
+                    true,
+                )
+                .into(),
+                exprs: &filter_index(acc_args.exprs, 1),
+                expr_fields: &filter_index(acc_args.expr_fields, 1),
+                // Unchanged below; we list each field explicitly in case we ever add more
+                // fields to AccumulatorArgs making it easier to see if changes are also
+                // needed here.
+                schema: acc_args.schema,
+                ignore_nulls: acc_args.ignore_nulls,
+                order_bys: acc_args.order_bys,
+                is_reversed: acc_args.is_reversed,
+                name: acc_args.name,
+                is_distinct: acc_args.is_distinct,
+            })?;
+
+            Ok(Box::new(StringAggAccumulator::new(
+                array_agg_acc,
+                delimiter,
+            )))
+        }
+    }
+
+    fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF {
+        datafusion_expr::ReversedUDAF::Reversed(string_agg_udaf())
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -181,6 +229,7 @@ impl AggregateUDFImpl for StringAgg {
     }
 }
 
+/// StringAgg accumulator for the general case (with order or distinct specified)
 #[derive(Debug)]
 pub(crate) struct StringAggAccumulator {
     array_agg_acc: Box<dyn Accumulator>,
@@ -205,7 +254,10 @@ impl Accumulator for StringAggAccumulator {
         let scalar = self.array_agg_acc.evaluate()?;
 
         let ScalarValue::List(list) = scalar else {
-            return internal_err!("Expected a DataType::List while evaluating underlying ArrayAggAccumulator, but got {}", scalar.data_type());
+            return internal_err!(
+                "Expected a DataType::List while evaluating underlying ArrayAggAccumulator, but got {}",
+                scalar.data_type()
+            );
         };
 
         let string_arr: Vec<_> = match list.value_type() {
@@ -225,7 +277,7 @@ impl Accumulator for StringAggAccumulator {
                 return internal_err!(
                     "Expected elements to of type Utf8 or LargeUtf8, but got {}",
                     list.value_type()
-                )
+                );
             }
         };
 
@@ -263,6 +315,104 @@ fn filter_index<T: Clone>(values: &[T], index: usize) -> Vec<T> {
         .collect::<Vec<_>>()
 }
 
+/// StringAgg accumulator for the simple case (no order or distinct specified)
+/// This accumulator is more efficient than `StringAggAccumulator`
+/// because it accumulates the string directly,
+/// whereas `StringAggAccumulator` uses `ArrayAggAccumulator`.
+#[derive(Debug)]
+pub(crate) struct SimpleStringAggAccumulator {
+    delimiter: String,
+    /// Updated during `update_batch()`. e.g. "foo,bar"
+    accumulated_string: String,
+    has_value: bool,
+}
+
+impl SimpleStringAggAccumulator {
+    pub fn new(delimiter: &str) -> Self {
+        Self {
+            delimiter: delimiter.to_string(),
+            accumulated_string: "".to_string(),
+            has_value: false,
+        }
+    }
+
+    #[inline]
+    fn append_strings<'a, I>(&mut self, iter: I)
+    where
+        I: Iterator<Item = Option<&'a str>>,
+    {
+        for value in iter.flatten() {
+            if self.has_value {
+                self.accumulated_string.push_str(&self.delimiter);
+            }
+
+            self.accumulated_string.push_str(value);
+            self.has_value = true;
+        }
+    }
+}
+
+impl Accumulator for SimpleStringAggAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let string_arr = values.first().ok_or_else(|| {
+            internal_datafusion_err!(
+                "Planner should ensure its first arg is Utf8/Utf8View"
+            )
+        })?;
+
+        match string_arr.data_type() {
+            DataType::Utf8 => {
+                let array = as_string_array(string_arr)?;
+                self.append_strings(array.iter());
+            }
+            DataType::LargeUtf8 => {
+                let array = as_generic_string_array::<i64>(string_arr)?;
+                self.append_strings(array.iter());
+            }
+            DataType::Utf8View => {
+                let array = as_string_view_array(string_arr)?;
+                self.append_strings(array.iter());
+            }
+            other => {
+                return internal_err!(
+                    "Planner should ensure string_agg first argument is Utf8-like, found {other}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.has_value {
+            Ok(ScalarValue::LargeUtf8(Some(
+                self.accumulated_string.clone(),
+            )))
+        } else {
+            Ok(ScalarValue::LargeUtf8(None))
+        }
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + self.delimiter.capacity() + self.accumulated_string.capacity()
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        let result = if self.has_value {
+            ScalarValue::LargeUtf8(Some(std::mem::take(&mut self.accumulated_string)))
+        } else {
+            ScalarValue::LargeUtf8(None)
+        };
+        self.has_value = false;
+
+        Ok(vec![result])
+    }
+
+    fn merge_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.update_batch(values)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -271,7 +421,7 @@ mod tests {
     use arrow::datatypes::{Fields, Schema};
     use datafusion_common::internal_err;
     use datafusion_physical_expr::expressions::Column;
-    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
     use std::sync::Arc;
 
     #[test]
@@ -413,7 +563,7 @@ mod tests {
     struct StringAggAccumulatorBuilder {
         sep: String,
         distinct: bool,
-        ordering: LexOrdering,
+        order_bys: Vec<PhysicalSortExpr>,
         schema: Schema,
     }
 
@@ -422,7 +572,7 @@ mod tests {
             Self {
                 sep: sep.to_string(),
                 distinct: Default::default(),
-                ordering: Default::default(),
+                order_bys: vec![],
                 schema: Schema {
                     fields: Fields::from(vec![Field::new(
                         "col",
@@ -439,7 +589,7 @@ mod tests {
         }
 
         fn order_by_col(mut self, col: &str, sort_options: SortOptions) -> Self {
-            self.ordering.extend([PhysicalSortExpr::new(
+            self.order_bys.extend([PhysicalSortExpr::new(
                 Arc::new(
                     Column::new_with_schema(col, &self.schema)
                         .expect("column not available in schema"),
@@ -453,8 +603,12 @@ mod tests {
             StringAgg::new().accumulator(AccumulatorArgs {
                 return_field: Field::new("f", DataType::LargeUtf8, true).into(),
                 schema: &self.schema,
+                expr_fields: &[
+                    Field::new("col", DataType::LargeUtf8, true).into(),
+                    Field::new("lit", DataType::Utf8, false).into(),
+                ],
                 ignore_nulls: false,
-                ordering_req: &self.ordering,
+                order_bys: &self.order_bys,
                 is_reversed: false,
                 name: "",
                 is_distinct: self.distinct,
diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs
index 37d208ffb03ad..4f638f2cb005c 100644
--- a/datafusion/functions-aggregate/src/sum.rs
+++ b/datafusion/functions-aggregate/src/sum.rs
@@ -17,35 +17,36 @@
 
 //! Defines `SUM` and `SUM DISTINCT` aggregate accumulators
 
-use ahash::RandomState;
-use datafusion_expr::utils::AggregateOrderSensitivity;
-use std::any::Any;
-use std::collections::HashSet;
-use std::mem::{size_of, size_of_val};
-
-use arrow::array::Array;
-use arrow::array::ArrowNativeTypeOp;
-use arrow::array::{ArrowNumericType, AsArray};
-use arrow::datatypes::ArrowPrimitiveType;
-use arrow::datatypes::{ArrowNativeType, FieldRef};
+use arrow::array::{Array, ArrayRef, ArrowNativeTypeOp, ArrowNumericType, AsArray};
+use arrow::datatypes::Field;
 use arrow::datatypes::{
-    DataType, Decimal128Type, Decimal256Type, Float64Type, Int64Type, UInt64Type,
-    DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
+    ArrowNativeType, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION,
+    DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DataType, Decimal32Type,
+    Decimal64Type, Decimal128Type, Decimal256Type, DurationMicrosecondType,
+    DurationMillisecondType, DurationNanosecondType, DurationSecondType, FieldRef,
+    Float64Type, Int64Type, TimeUnit, UInt64Type,
 };
-use arrow::{array::ArrayRef, datatypes::Field};
-use datafusion_common::{
-    exec_err, not_impl_err, utils::take_function_args, Result, ScalarValue,
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::internal_err;
+use datafusion_common::types::{
+    NativeType, logical_float64, logical_int8, logical_int16, logical_int32,
+    logical_int64, logical_uint8, logical_uint16, logical_uint32, logical_uint64,
 };
-use datafusion_expr::function::AccumulatorArgs;
-use datafusion_expr::function::StateFieldsArgs;
-use datafusion_expr::utils::format_state_name;
+use datafusion_common::{HashMap, Result, ScalarValue, exec_err, not_impl_err};
+use datafusion_expr::expr::AggregateFunction;
+use datafusion_expr::expr_fn::cast;
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::utils::{AggregateOrderSensitivity, format_state_name};
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF,
-    SetMonotonicity, Signature, Volatility,
+    Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, GroupsAccumulator,
+    Operator, ReversedUDAF, SetMonotonicity, Signature, TypeSignature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator;
-use datafusion_functions_aggregate_common::utils::Hashable;
+use datafusion_functions_aggregate_common::aggregate::sum_distinct::DistinctSumAccumulator;
 use datafusion_macros::user_doc;
+use std::any::Any;
+use std::mem::size_of_val;
 
 make_udaf_expr_and_func!(
     Sum,
@@ -55,6 +56,17 @@ make_udaf_expr_and_func!(
     sum_udaf
 );
 
+pub fn sum_distinct(expr: Expr) -> Expr {
+    Expr::AggregateFunction(AggregateFunction::new_udf(
+        sum_udaf(),
+        vec![expr],
+        true,
+        None,
+        vec![],
+        None,
+    ))
+}
+
 /// Sum only supports a subset of numeric types, instead relying on type coercion
 ///
 /// This macro is similar to [downcast_primitive](arrow::array::downcast_primitive)
@@ -73,12 +85,39 @@ macro_rules! downcast_sum {
             DataType::Float64 => {
                 $helper!(Float64Type, $args.return_field.data_type().clone())
             }
+            DataType::Decimal32(_, _) => {
+                $helper!(Decimal32Type, $args.return_field.data_type().clone())
+            }
+            DataType::Decimal64(_, _) => {
+                $helper!(Decimal64Type, $args.return_field.data_type().clone())
+            }
             DataType::Decimal128(_, _) => {
                 $helper!(Decimal128Type, $args.return_field.data_type().clone())
             }
             DataType::Decimal256(_, _) => {
                 $helper!(Decimal256Type, $args.return_field.data_type().clone())
             }
+            DataType::Duration(TimeUnit::Second) => {
+                $helper!(DurationSecondType, $args.return_field.data_type().clone())
+            }
+            DataType::Duration(TimeUnit::Millisecond) => {
+                $helper!(
+                    DurationMillisecondType,
+                    $args.return_field.data_type().clone()
+                )
+            }
+            DataType::Duration(TimeUnit::Microsecond) => {
+                $helper!(
+                    DurationMicrosecondType,
+                    $args.return_field.data_type().clone()
+                )
+            }
+            DataType::Duration(TimeUnit::Nanosecond) => {
+                $helper!(
+                    DurationNanosecondType,
+                    $args.return_field.data_type().clone()
+                )
+            }
             _ => {
                 not_impl_err!(
                     "Sum not supported for {}: {}",
@@ -104,7 +143,7 @@ macro_rules! downcast_sum {
 ```"#,
     standard_argument(name = "expression",)
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Sum {
     signature: Signature,
 }
@@ -112,7 +151,45 @@ pub struct Sum {
 impl Sum {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
+            // smallint, int, bigint, real, double precision, decimal, or interval.
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Decimal,
+                    )]),
+                    // Unsigned to u64
+                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_uint64()),
+                        vec![
+                            TypeSignatureClass::Native(logical_uint8()),
+                            TypeSignatureClass::Native(logical_uint16()),
+                            TypeSignatureClass::Native(logical_uint32()),
+                        ],
+                        NativeType::UInt64,
+                    )]),
+                    // Signed to i64
+                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_int64()),
+                        vec![
+                            TypeSignatureClass::Native(logical_int8()),
+                            TypeSignatureClass::Native(logical_int16()),
+                            TypeSignatureClass::Native(logical_int32()),
+                        ],
+                        NativeType::Int64,
+                    )]),
+                    // Floats to f64
+                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_float64()),
+                        vec![TypeSignatureClass::Float],
+                        NativeType::Float64,
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Duration,
+                    )]),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -136,47 +213,30 @@ impl AggregateUDFImpl for Sum {
         &self.signature
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        let [args] = take_function_args(self.name(), arg_types)?;
-
-        // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
-        // smallint, int, bigint, real, double precision, decimal, or interval.
-
-        fn coerced_type(data_type: &DataType) -> Result<DataType> {
-            match data_type {
-                DataType::Dictionary(_, v) => coerced_type(v),
-                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
-                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
-                DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => {
-                    Ok(data_type.clone())
-                }
-                dt if dt.is_signed_integer() => Ok(DataType::Int64),
-                dt if dt.is_unsigned_integer() => Ok(DataType::UInt64),
-                dt if dt.is_floating() => Ok(DataType::Float64),
-                _ => exec_err!("Sum not supported for {}", data_type),
-            }
-        }
-
-        Ok(vec![coerced_type(args)?])
-    }
-
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         match &arg_types[0] {
             DataType::Int64 => Ok(DataType::Int64),
             DataType::UInt64 => Ok(DataType::UInt64),
             DataType::Float64 => Ok(DataType::Float64),
+            // In the spark, the result type is DECIMAL(min(38,precision+10), s)
+            // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+            DataType::Decimal32(precision, scale) => {
+                let new_precision = DECIMAL32_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal32(new_precision, *scale))
+            }
+            DataType::Decimal64(precision, scale) => {
+                let new_precision = DECIMAL64_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal64(new_precision, *scale))
+            }
             DataType::Decimal128(precision, scale) => {
-                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
-                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
                 let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 10);
                 Ok(DataType::Decimal128(new_precision, *scale))
             }
             DataType::Decimal256(precision, scale) => {
-                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
-                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
                 let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 10);
                 Ok(DataType::Decimal256(new_precision, *scale))
             }
+            DataType::Duration(time_unit) => Ok(DataType::Duration(*time_unit)),
             other => {
                 exec_err!("[return_type] SUM not supported for {}", other)
             }
@@ -187,7 +247,7 @@ impl AggregateUDFImpl for Sum {
         if args.is_distinct {
             macro_rules! helper {
                 ($t:ty, $dt:expr) => {
-                    Ok(Box::new(DistinctSumAccumulator::<$t>::try_new(&$dt)?))
+                    Ok(Box::new(DistinctSumAccumulator::<$t>::new(&$dt)))
                 };
             }
             downcast_sum!(args, helper)
@@ -203,27 +263,27 @@ impl AggregateUDFImpl for Sum {
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         if args.is_distinct {
-            Ok(vec![Field::new_list(
-                format_state_name(args.name, "sum distinct"),
-                // See COMMENTS.md to understand why nullable is set to true
-                Field::new_list_field(args.return_type().clone(), true),
-                false,
-            )
-            .into()])
+            Ok(vec![
+                Field::new_list(
+                    format_state_name(args.name, "sum distinct"),
+                    // See COMMENTS.md to understand why nullable is set to true
+                    Field::new_list_field(args.return_type().clone(), true),
+                    false,
+                )
+                .into(),
+            ])
         } else {
-            Ok(vec![Field::new(
-                format_state_name(args.name, "sum"),
-                args.return_type().clone(),
-                true,
-            )
-            .into()])
+            Ok(vec![
+                Field::new(
+                    format_state_name(args.name, "sum"),
+                    args.return_type().clone(),
+                    true,
+                )
+                .into(),
+            ])
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &[]
-    }
-
     fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
         !args.is_distinct
     }
@@ -247,12 +307,23 @@ impl AggregateUDFImpl for Sum {
         &self,
         args: AccumulatorArgs,
     ) -> Result<Box<dyn Accumulator>> {
-        macro_rules! helper {
-            ($t:ty, $dt:expr) => {
-                Ok(Box::new(SlidingSumAccumulator::<$t>::new($dt.clone())))
-            };
+        if args.is_distinct {
+            // distinct path: use our sliding‐window distinct‐sum
+            macro_rules! helper_distinct {
+                ($t:ty, $dt:expr) => {
+                    Ok(Box::new(SlidingDistinctSumAccumulator::try_new(&$dt)?))
+                };
+            }
+            downcast_sum!(args, helper_distinct)
+        } else {
+            // non‐distinct path: existing sliding sum
+            macro_rules! helper {
+                ($t:ty, $dt:expr) => {
+                    Ok(Box::new(SlidingSumAccumulator::<$t>::new($dt.clone())))
+                };
+            }
+            downcast_sum!(args, helper)
         }
-        downcast_sum!(args, helper)
     }
 
     fn reverse_expr(&self) -> ReversedUDAF {
@@ -278,6 +349,47 @@ impl AggregateUDFImpl for Sum {
             _ => SetMonotonicity::NotMonotonic,
         }
     }
+
+    /// Implement ClickBench Q29 specific optimization:
+    /// `SUM(arg + constant)` --> `SUM(arg) + constant * COUNT(arg)`
+    ///
+    /// See background on [`AggregateUDFImpl::simplify_expr_op_literal`]
+    fn simplify_expr_op_literal(
+        &self,
+        agg_function: &AggregateFunction,
+        arg: &Expr,
+        op: Operator,
+        lit: &Expr,
+        // Only support '+' so the order of the args doesn't matter
+        _arg_is_left: bool,
+    ) -> Result<Option<Expr>> {
+        if op != Operator::Plus {
+            return Ok(None);
+        }
+
+        let lit_type = match &lit {
+            Expr::Literal(value, _) => value.data_type(),
+            _ => {
+                return internal_err!(
+                    "Sum::simplify_expr_op_literal got a non literal argument"
+                );
+            }
+        };
+        if lit_type == DataType::Null {
+            return Ok(None);
+        }
+
+        // Build up SUM(arg)
+        let mut sum_agg = agg_function.clone();
+        sum_agg.params.args = vec![arg.clone()];
+        let sum_agg = Expr::AggregateFunction(sum_agg);
+
+        // COUNT(arg) - cast to the correct type
+        let count_agg = cast(crate::count::count(arg.clone()), lit_type);
+
+        // SUM(arg) + lit * COUNT(arg)
+        Ok(Some(sum_agg + (lit.clone() * count_agg)))
+    }
 }
 
 /// This accumulator computes SUM incrementally
@@ -309,7 +421,7 @@ impl<T: ArrowNumericType> Accumulator for SumAccumulator<T> {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
         let values = values[0].as_primitive::<T>();
         if let Some(x) = arrow::compute::sum(values) {
-            let v = self.sum.get_or_insert(T::Native::usize_as(0));
+            let v = self.sum.get_or_insert_with(|| T::Native::usize_as(0));
             *v = v.add_wrapping(x);
         }
         Ok(())
@@ -401,83 +513,106 @@ impl<T: ArrowNumericType> Accumulator for SlidingSumAccumulator<T> {
     }
 }
 
-struct DistinctSumAccumulator<T: ArrowPrimitiveType> {
-    values: HashSet<Hashable<T::Native>, RandomState>,
+/// A sliding‐window accumulator for `SUM(DISTINCT)` over Int64 columns.
+/// Maintains a running sum so that `evaluate()` is O(1).
+#[derive(Debug)]
+pub struct SlidingDistinctSumAccumulator {
+    /// Map each distinct value → its current count in the window
+    counts: HashMap<i64, usize, RandomState>,
+    /// Running sum of all distinct keys currently in the window
+    sum: i64,
+    /// Data type (must be Int64)
     data_type: DataType,
 }
 
-impl<T: ArrowPrimitiveType> std::fmt::Debug for DistinctSumAccumulator<T> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "DistinctSumAccumulator({})", self.data_type)
-    }
-}
-
-impl<T: ArrowPrimitiveType> DistinctSumAccumulator<T> {
+impl SlidingDistinctSumAccumulator {
+    /// Create a new accumulator; only `DataType::Int64` is supported.
     pub fn try_new(data_type: &DataType) -> Result<Self> {
+        // TODO support other numeric types
+        if *data_type != DataType::Int64 {
+            return exec_err!("SlidingDistinctSumAccumulator only supports Int64");
+        }
         Ok(Self {
-            values: HashSet::default(),
+            counts: HashMap::default(),
+            sum: 0,
             data_type: data_type.clone(),
         })
     }
 }
 
-impl<T: ArrowPrimitiveType> Accumulator for DistinctSumAccumulator<T> {
-    fn state(&mut self) -> Result<Vec<ScalarValue>> {
-        // 1. Stores aggregate state in `ScalarValue::List`
-        // 2. Constructs `ScalarValue::List` state from distinct numeric stored in hash set
-        let state_out = {
-            let distinct_values = self
-                .values
-                .iter()
-                .map(|value| {
-                    ScalarValue::new_primitive::<T>(Some(value.0), &self.data_type)
-                })
-                .collect::<Result<Vec<_>>>()?;
-
-            vec![ScalarValue::List(ScalarValue::new_list_nullable(
-                &distinct_values,
-                &self.data_type,
-            ))]
-        };
-        Ok(state_out)
-    }
-
+impl Accumulator for SlidingDistinctSumAccumulator {
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        if values.is_empty() {
-            return Ok(());
-        }
-
-        let array = values[0].as_primitive::<T>();
-        match array.nulls().filter(|x| x.null_count() > 0) {
-            Some(n) => {
-                for idx in n.valid_indices() {
-                    self.values.insert(Hashable(array.value(idx)));
-                }
+        let arr = values[0].as_primitive::<Int64Type>();
+        for &v in arr.values() {
+            let cnt = self.counts.entry(v).or_insert(0);
+            if *cnt == 0 {
+                // first occurrence in window
+                self.sum = self.sum.wrapping_add(v);
             }
-            None => array.values().iter().for_each(|x| {
-                self.values.insert(Hashable(*x));
-            }),
+            *cnt += 1;
         }
         Ok(())
     }
 
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        // O(1) wrap of running sum
+        Ok(ScalarValue::Int64(Some(self.sum)))
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        // Serialize distinct keys for cross-partition merge if needed
+        let keys = self
+            .counts
+            .keys()
+            .cloned()
+            .map(Some)
+            .map(ScalarValue::Int64)
+            .collect::<Vec<_>>();
+        Ok(vec![ScalarValue::List(ScalarValue::new_list_nullable(
+            &keys,
+            &self.data_type,
+        ))])
+    }
+
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        for x in states[0].as_list::<i32>().iter().flatten() {
-            self.update_batch(&[x])?
+        // Merge distinct keys from other partitions
+        let list_arr = states[0].as_list::<i32>();
+        for maybe_inner in list_arr.iter().flatten() {
+            for idx in 0..maybe_inner.len() {
+                if let ScalarValue::Int64(Some(v)) =
+                    ScalarValue::try_from_array(&*maybe_inner, idx)?
+                {
+                    let cnt = self.counts.entry(v).or_insert(0);
+                    if *cnt == 0 {
+                        self.sum = self.sum.wrapping_add(v);
+                    }
+                    *cnt += 1;
+                }
+            }
         }
         Ok(())
     }
 
-    fn evaluate(&mut self) -> Result<ScalarValue> {
-        let mut acc = T::Native::usize_as(0);
-        for distinct_value in self.values.iter() {
-            acc = acc.add_wrapping(distinct_value.0)
+    fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let arr = values[0].as_primitive::<Int64Type>();
+        for &v in arr.values() {
+            if let Some(cnt) = self.counts.get_mut(&v) {
+                *cnt -= 1;
+                if *cnt == 0 {
+                    // last copy leaving window
+                    self.sum = self.sum.wrapping_sub(v);
+                    self.counts.remove(&v);
+                }
+            }
         }
-        let v = (!self.values.is_empty()).then_some(acc);
-        ScalarValue::new_primitive::<T>(v, &self.data_type)
+        Ok(())
     }
 
-    fn size(&self) -> usize {
-        size_of_val(self) + self.values.capacity() * size_of::<T::Native>()
+    fn supports_retract_batch(&self) -> bool {
+        true
     }
 }
diff --git a/datafusion/functions-aggregate/src/utils.rs b/datafusion/functions-aggregate/src/utils.rs
new file mode 100644
index 0000000000000..5e1925fcdbb5d
--- /dev/null
+++ b/datafusion/functions-aggregate/src/utils.rs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::RecordBatch;
+use arrow::datatypes::Schema;
+use datafusion_common::{DataFusionError, Result, ScalarValue, internal_err, plan_err};
+use datafusion_expr::ColumnarValue;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+
+/// Evaluates a physical expression to extract its scalar value.
+///
+/// This is used to extract constant values from expressions (like percentile parameters)
+/// by evaluating them against an empty record batch.
+pub(crate) fn get_scalar_value(expr: &Arc<dyn PhysicalExpr>) -> Result<ScalarValue> {
+    let empty_schema = Arc::new(Schema::empty());
+    let batch = RecordBatch::new_empty(Arc::clone(&empty_schema));
+    if let ColumnarValue::Scalar(s) = expr.evaluate(&batch)? {
+        Ok(s)
+    } else {
+        internal_err!("Didn't expect ColumnarValue::Array")
+    }
+}
+
+/// Validates that a percentile expression is a literal float value between 0.0 and 1.0.
+///
+/// Used by both `percentile_cont` and `approx_percentile_cont` to validate their
+/// percentile parameters.
+pub(crate) fn validate_percentile_expr(
+    expr: &Arc<dyn PhysicalExpr>,
+    fn_name: &str,
+) -> Result<f64> {
+    let scalar_value = get_scalar_value(expr).map_err(|_e| {
+        DataFusionError::Plan(format!(
+            "Percentile value for '{fn_name}' must be a literal"
+        ))
+    })?;
+
+    let percentile = match scalar_value {
+        ScalarValue::Float32(Some(value)) => value as f64,
+        ScalarValue::Float64(Some(value)) => value,
+        sv => {
+            return plan_err!(
+                "Percentile value for '{fn_name}' must be Float32 or Float64 literal (got data type {})",
+                sv.data_type()
+            );
+        }
+    };
+
+    // Ensure the percentile is between 0 and 1.
+    if !(0.0..=1.0).contains(&percentile) {
+        return plan_err!(
+            "Percentile value must be between 0.0 and 1.0 inclusive, {percentile} is invalid"
+        );
+    }
+    Ok(percentile)
+}
diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs
index 586b2dab0ae6b..fb089ba4f9cea 100644
--- a/datafusion/functions-aggregate/src/variance.rs
+++ b/datafusion/functions-aggregate/src/variance.rs
@@ -18,20 +18,21 @@
 //! [`VarianceSample`]: variance sample aggregations.
 //! [`VariancePopulation`]: variance population aggregations.
 
-use arrow::datatypes::FieldRef;
+use arrow::datatypes::{FieldRef, Float64Type};
 use arrow::{
     array::{Array, ArrayRef, BooleanArray, Float64Array, UInt64Array},
     buffer::NullBuffer,
-    compute::kernels::cast,
     datatypes::{DataType, Field},
 };
-use datafusion_common::{downcast_value, not_impl_err, plan_err, Result, ScalarValue};
+use datafusion_common::cast::{as_float64_array, as_uint64_array};
+use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{
-    function::{AccumulatorArgs, StateFieldsArgs},
-    utils::format_state_name,
     Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature,
     Volatility,
+    function::{AccumulatorArgs, StateFieldsArgs},
+    utils::format_state_name,
 };
+use datafusion_functions_aggregate_common::utils::GenericDistinctBuffer;
 use datafusion_functions_aggregate_common::{
     aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType,
 };
@@ -61,20 +62,12 @@ make_udaf_expr_and_func!(
     syntax_example = "var(expression)",
     standard_argument(name = "expression", prefix = "Numeric")
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct VarianceSample {
     signature: Signature,
     aliases: Vec<String>,
 }
 
-impl Debug for VarianceSample {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("VarianceSample")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for VarianceSample {
     fn default() -> Self {
         Self::new()
@@ -85,7 +78,7 @@ impl VarianceSample {
     pub fn new() -> Self {
         Self {
             aliases: vec![String::from("var_sample"), String::from("var_samp")],
-            signature: Signature::numeric(1, Volatility::Immutable),
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
         }
     }
 }
@@ -109,19 +102,35 @@ impl AggregateUDFImpl for VarianceSample {
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
         let name = args.name;
-        Ok(vec![
-            Field::new(format_state_name(name, "count"), DataType::UInt64, true),
-            Field::new(format_state_name(name, "mean"), DataType::Float64, true),
-            Field::new(format_state_name(name, "m2"), DataType::Float64, true),
-        ]
-        .into_iter()
-        .map(Arc::new)
-        .collect())
+        match args.is_distinct {
+            false => Ok(vec![
+                Field::new(format_state_name(name, "count"), DataType::UInt64, true),
+                Field::new(format_state_name(name, "mean"), DataType::Float64, true),
+                Field::new(format_state_name(name, "m2"), DataType::Float64, true),
+            ]
+            .into_iter()
+            .map(Arc::new)
+            .collect()),
+            true => {
+                let field = Field::new_list_field(DataType::Float64, true);
+                let state_name = "distinct_var";
+                Ok(vec![
+                    Field::new(
+                        format_state_name(name, state_name),
+                        DataType::List(Arc::new(field)),
+                        true,
+                    )
+                    .into(),
+                ])
+            }
+        }
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
         if acc_args.is_distinct {
-            return not_impl_err!("VAR(DISTINCT) aggregations are not available");
+            return Ok(Box::new(DistinctVarianceAccumulator::new(
+                StatsType::Sample,
+            )));
         }
 
         Ok(Box::new(VarianceAccumulator::try_new(StatsType::Sample)?))
@@ -153,20 +162,12 @@ impl AggregateUDFImpl for VarianceSample {
     syntax_example = "var_pop(expression)",
     standard_argument(name = "expression", prefix = "Numeric")
 )]
+#[derive(PartialEq, Eq, Hash, Debug)]
 pub struct VariancePopulation {
     signature: Signature,
     aliases: Vec<String>,
 }
 
-impl Debug for VariancePopulation {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("VariancePopulation")
-            .field("name", &self.name())
-            .field("signature", &self.signature)
-            .finish()
-    }
-}
-
 impl Default for VariancePopulation {
     fn default() -> Self {
         Self::new()
@@ -177,7 +178,7 @@ impl VariancePopulation {
     pub fn new() -> Self {
         Self {
             aliases: vec![String::from("var_population")],
-            signature: Signature::numeric(1, Volatility::Immutable),
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
         }
     }
 }
@@ -195,29 +196,43 @@ impl AggregateUDFImpl for VariancePopulation {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if !arg_types[0].is_numeric() {
-            return plan_err!("Variance requires numeric input types");
-        }
-
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
         Ok(DataType::Float64)
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
-        let name = args.name;
-        Ok(vec![
-            Field::new(format_state_name(name, "count"), DataType::UInt64, true),
-            Field::new(format_state_name(name, "mean"), DataType::Float64, true),
-            Field::new(format_state_name(name, "m2"), DataType::Float64, true),
-        ]
-        .into_iter()
-        .map(Arc::new)
-        .collect())
+        match args.is_distinct {
+            false => {
+                let name = args.name;
+                Ok(vec![
+                    Field::new(format_state_name(name, "count"), DataType::UInt64, true),
+                    Field::new(format_state_name(name, "mean"), DataType::Float64, true),
+                    Field::new(format_state_name(name, "m2"), DataType::Float64, true),
+                ]
+                .into_iter()
+                .map(Arc::new)
+                .collect())
+            }
+            true => {
+                let field = Field::new_list_field(DataType::Float64, true);
+                let state_name = "distinct_var";
+                Ok(vec![
+                    Field::new(
+                        format_state_name(args.name, state_name),
+                        DataType::List(Arc::new(field)),
+                        true,
+                    )
+                    .into(),
+                ])
+            }
+        }
     }
 
     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
         if acc_args.is_distinct {
-            return not_impl_err!("VAR_POP(DISTINCT) aggregations are not available");
+            return Ok(Box::new(DistinctVarianceAccumulator::new(
+                StatsType::Population,
+            )));
         }
 
         Ok(Box::new(VarianceAccumulator::try_new(
@@ -241,6 +256,7 @@ impl AggregateUDFImpl for VariancePopulation {
             StatsType::Population,
         )))
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -328,10 +344,8 @@ impl Accumulator for VarianceAccumulator {
     }
 
     fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values = &cast(&values[0], &DataType::Float64)?;
-        let arr = downcast_value!(values, Float64Array).iter().flatten();
-
-        for value in arr {
+        let arr = as_float64_array(&values[0])?;
+        for value in arr.iter().flatten() {
             (self.count, self.mean, self.m2) =
                 update(self.count, self.mean, self.m2, value)
         }
@@ -340,10 +354,8 @@ impl Accumulator for VarianceAccumulator {
     }
 
     fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
-        let values = &cast(&values[0], &DataType::Float64)?;
-        let arr = downcast_value!(values, Float64Array).iter().flatten();
-
-        for value in arr {
+        let arr = as_float64_array(&values[0])?;
+        for value in arr.iter().flatten() {
             let new_count = self.count - 1;
             let delta1 = self.mean - value;
             let new_mean = delta1 / new_count as f64 + self.mean;
@@ -359,9 +371,9 @@ impl Accumulator for VarianceAccumulator {
     }
 
     fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
-        let counts = downcast_value!(states[0], UInt64Array);
-        let means = downcast_value!(states[1], Float64Array);
-        let m2s = downcast_value!(states[2], Float64Array);
+        let counts = as_uint64_array(&states[0])?;
+        let means = as_float64_array(&states[1])?;
+        let m2s = as_float64_array(&states[2])?;
 
         for i in 0..counts.len() {
             let c = counts.value(i);
@@ -496,8 +508,7 @@ impl GroupsAccumulator for VarianceGroupsAccumulator {
         total_num_groups: usize,
     ) -> Result<()> {
         assert_eq!(values.len(), 1, "single argument to update_batch");
-        let values = &cast(&values[0], &DataType::Float64)?;
-        let values = downcast_value!(values, Float64Array);
+        let values = as_float64_array(&values[0])?;
 
         self.resize(total_num_groups);
         accumulate(group_indices, values, opt_filter, |group_index, value| {
@@ -524,9 +535,9 @@ impl GroupsAccumulator for VarianceGroupsAccumulator {
     ) -> Result<()> {
         assert_eq!(values.len(), 3, "two arguments to merge_batch");
         // first batch is counts, second is partial means, third is partial m2s
-        let partial_counts = downcast_value!(values[0], UInt64Array);
-        let partial_means = downcast_value!(values[1], Float64Array);
-        let partial_m2s = downcast_value!(values[2], Float64Array);
+        let partial_counts = as_uint64_array(&values[0])?;
+        let partial_means = as_float64_array(&values[1])?;
+        let partial_m2s = as_float64_array(&values[2])?;
 
         self.resize(total_num_groups);
         Self::merge(
@@ -579,6 +590,71 @@ impl GroupsAccumulator for VarianceGroupsAccumulator {
     }
 }
 
+#[derive(Debug)]
+pub struct DistinctVarianceAccumulator {
+    distinct_values: GenericDistinctBuffer<Float64Type>,
+    stat_type: StatsType,
+}
+
+impl DistinctVarianceAccumulator {
+    pub fn new(stat_type: StatsType) -> Self {
+        Self {
+            distinct_values: GenericDistinctBuffer::<Float64Type>::new(DataType::Float64),
+            stat_type,
+        }
+    }
+}
+
+impl Accumulator for DistinctVarianceAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.distinct_values.update_batch(values)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let values = self
+            .distinct_values
+            .values
+            .iter()
+            .map(|v| v.0)
+            .collect::<Vec<_>>();
+
+        let count = match self.stat_type {
+            StatsType::Sample => {
+                if !values.is_empty() {
+                    values.len() - 1
+                } else {
+                    0
+                }
+            }
+            StatsType::Population => values.len(),
+        };
+
+        let mean = values.iter().sum::<f64>() / values.len() as f64;
+        let m2 = values.iter().map(|x| (x - mean) * (x - mean)).sum::<f64>();
+
+        Ok(ScalarValue::Float64(match values.len() {
+            0 => None,
+            1 => match self.stat_type {
+                StatsType::Population => Some(0.0),
+                StatsType::Sample => None,
+            },
+            _ => Some(m2 / count as f64),
+        }))
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + self.distinct_values.size()
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.distinct_values.state()
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.distinct_values.merge_batch(states)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use datafusion_expr::EmitTo;
diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml
index 9a7b1f460ef54..5fce3e854eb33 100644
--- a/datafusion/functions-nested/Cargo.toml
+++ b/datafusion/functions-nested/Cargo.toml
@@ -31,35 +31,81 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [lib]
 name = "datafusion_functions_nested"
 
+[features]
+default = ["sql"]
+sql = ["datafusion-expr/sql"]
+
 [dependencies]
 arrow = { workspace = true }
 arrow-ord = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-doc = { workspace = true }
 datafusion-execution = { workspace = true }
-datafusion-expr = { workspace = true }
+datafusion-expr = { workspace = true, default-features = false }
+datafusion-expr-common = { workspace = true }
 datafusion-functions = { workspace = true }
 datafusion-functions-aggregate = { workspace = true }
+datafusion-functions-aggregate-common = { workspace = true }
 datafusion-macros = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
+hashbrown = { workspace = true }
 itertools = { workspace = true, features = ["use_std"] }
+itoa = { workspace = true }
 log = { workspace = true }
-paste = "1.0.14"
 
 [dev-dependencies]
 criterion = { workspace = true, features = ["async_tokio"] }
 rand = { workspace = true }
 
+[[bench]]
+harness = false
+name = "array_concat"
+
 [[bench]]
 harness = false
 name = "array_expression"
 
+[[bench]]
+harness = false
+name = "array_has"
+
+[[bench]]
+harness = false
+name = "array_reverse"
+
+[[bench]]
+harness = false
+name = "array_slice"
+
 [[bench]]
 harness = false
 name = "map"
+
+[[bench]]
+harness = false
+name = "array_remove"
+
+[[bench]]
+harness = false
+name = "array_repeat"
+
+[[bench]]
+harness = false
+name = "array_set_ops"
+
+[[bench]]
+harness = false
+name = "array_to_string"
+
+[[bench]]
+harness = false
+name = "array_position"
diff --git a/datafusion/functions-nested/README.md b/datafusion/functions-nested/README.md
index 8a5047c838ab0..6ab456edb1925 100644
--- a/datafusion/functions-nested/README.md
+++ b/datafusion/functions-nested/README.md
@@ -17,11 +17,18 @@
   under the License.
 -->
 
-# DataFusion Nested Type Function Library
+# Apache DataFusion Nested Type Function Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate contains functions for working with arrays, maps and structs, such as `array_append` that work with
-`ListArray`, `LargeListArray` and `FixedListArray` types from the `arrow` crate.
+`ListArray`, `LargeListArray` and `FixedListArray` types from the [`arrow`] crate.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`arrow`]: https://crates.io/crates/arrow
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-nested/benches/array_concat.rs b/datafusion/functions-nested/benches/array_concat.rs
new file mode 100644
index 0000000000000..75dcc88f14737
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_concat.rs
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hint::black_box;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int32Array, ListArray};
+use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{DataType, Field};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+
+use datafusion_functions_nested::concat::array_concat_inner;
+
+const SEED: u64 = 42;
+
+/// Build a `ListArray<i32>` with `num_lists` rows, each containing
+/// `elements_per_list` random i32 values. Every 10th row is null.
+fn make_list_array(
+    rng: &mut StdRng,
+    num_lists: usize,
+    elements_per_list: usize,
+) -> ArrayRef {
+    let total_values = num_lists * elements_per_list;
+    let values: Vec<i32> = (0..total_values).map(|_| rng.random()).collect();
+    let values = Arc::new(Int32Array::from(values));
+
+    let offsets: Vec<i32> = (0..=num_lists)
+        .map(|i| (i * elements_per_list) as i32)
+        .collect();
+    let offsets = OffsetBuffer::new(ScalarBuffer::from(offsets));
+
+    let nulls: Vec<bool> = (0..num_lists).map(|i| i % 10 != 0).collect();
+    let nulls = Some(NullBuffer::from(nulls));
+
+    Arc::new(ListArray::new(
+        Arc::new(Field::new("item", DataType::Int32, false)),
+        offsets,
+        values,
+        nulls,
+    ))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_concat");
+
+    // Benchmark: varying number of rows, 20 elements per list
+    for num_rows in [100, 1000, 10000] {
+        let mut rng = StdRng::seed_from_u64(SEED);
+        let list_a = make_list_array(&mut rng, num_rows, 20);
+        let list_b = make_list_array(&mut rng, num_rows, 20);
+        let args: Vec<ArrayRef> = vec![list_a, list_b];
+
+        group.bench_with_input(BenchmarkId::new("rows", num_rows), &args, |b, args| {
+            b.iter(|| black_box(array_concat_inner(args).unwrap()));
+        });
+    }
+
+    // Benchmark: 1000 rows, varying element counts per list
+    for elements_per_list in [5, 50, 500] {
+        let mut rng = StdRng::seed_from_u64(SEED);
+        let list_a = make_list_array(&mut rng, 1000, elements_per_list);
+        let list_b = make_list_array(&mut rng, 1000, elements_per_list);
+        let args: Vec<ArrayRef> = vec![list_a, list_b];
+
+        group.bench_with_input(
+            BenchmarkId::new("elements_per_list", elements_per_list),
+            &args,
+            |b, args| {
+                b.iter(|| black_box(array_concat_inner(args).unwrap()));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_expression.rs b/datafusion/functions-nested/benches/array_expression.rs
index 0e3ecbc726413..ad9f565f4d643 100644
--- a/datafusion/functions-nested/benches/array_expression.rs
+++ b/datafusion/functions-nested/benches/array_expression.rs
@@ -15,13 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[macro_use]
-extern crate criterion;
-extern crate arrow;
-
-use crate::criterion::Criterion;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_expr::lit;
 use datafusion_functions_nested::expr_fn::{array_replace_all, make_array};
+use std::hint::black_box;
 
 fn criterion_benchmark(c: &mut Criterion) {
     // Construct large arrays for benchmarking
@@ -45,7 +42,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     from_array.clone(),
                     to_array.clone()
                 ),
-                *criterion::black_box(&expected_array)
+                *black_box(&expected_array)
             )
         })
     });
diff --git a/datafusion/functions-nested/benches/array_has.rs b/datafusion/functions-nested/benches/array_has.rs
new file mode 100644
index 0000000000000..f5e66d56c0efe
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_has.rs
@@ -0,0 +1,781 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, ListArray, StringArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::array_has::{ArrayHas, ArrayHasAll, ArrayHasAny};
+use rand::Rng;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 10000;
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+const NEEDLE_SIZE: usize = 3;
+
+// If not explicitly stated, `array` and `array_size` refer to the haystack array.
+fn criterion_benchmark(c: &mut Criterion) {
+    // Test different array sizes
+    let array_sizes = vec![10, 100, 500];
+
+    for &size in &array_sizes {
+        bench_array_has(c, size);
+        bench_array_has_all(c, size);
+        bench_array_has_any(c, size);
+    }
+
+    // Specific benchmarks for string arrays (common use case)
+    bench_array_has_strings(c);
+    bench_array_has_all_strings(c);
+    bench_array_has_any_strings(c);
+
+    // Benchmark for array_has_any with one scalar arg
+    bench_array_has_any_scalar(c);
+}
+
+fn bench_array_has(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_has_i64");
+    let list_array = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("arr", list_array.data_type().clone(), false).into(),
+        Field::new("el", DataType::Int64, false).into(),
+    ];
+
+    // Benchmark: element found
+    let args_found = vec![
+        ColumnarValue::Array(list_array.clone()),
+        ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHas::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: element not found
+    let args_not_found = vec![
+        ColumnarValue::Array(list_array.clone()),
+        ColumnarValue::Scalar(ScalarValue::Int64(Some(-999))),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHas::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+fn bench_array_has_all(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_has_all");
+    let haystack = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+    let list_type = haystack.data_type().clone();
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("haystack", list_type.clone(), false).into(),
+        Field::new("needle", list_type.clone(), false).into(),
+    ];
+
+    // Benchmark: all elements found (small needle)
+    let needle_found = create_int64_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0);
+    let args_found = vec![
+        ColumnarValue::Array(haystack.clone()),
+        ColumnarValue::Array(needle_found),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("all_found_small_needle", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAll::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: not all found (needle contains elements outside haystack range)
+    let needle_missing =
+        create_int64_list_array_with_offset(NUM_ROWS, NEEDLE_SIZE, array_size as i64);
+    let args_missing = vec![
+        ColumnarValue::Array(haystack.clone()),
+        ColumnarValue::Array(needle_missing),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_all_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAll::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_missing.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+const SMALL_ARRAY_SIZE: usize = NEEDLE_SIZE;
+
+fn bench_array_has_any(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_has_any");
+    let first_arr = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+    let list_type = first_arr.data_type().clone();
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("first", list_type.clone(), false).into(),
+        Field::new("second", list_type.clone(), false).into(),
+    ];
+
+    // Benchmark: some elements match
+    let second_match = create_int64_list_array(NUM_ROWS, SMALL_ARRAY_SIZE, 0.0);
+    let args_match = vec![
+        ColumnarValue::Array(first_arr.clone()),
+        ColumnarValue::Array(second_match),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("some_match", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: no match
+    let second_no_match = create_int64_list_array_with_offset(
+        NUM_ROWS,
+        SMALL_ARRAY_SIZE,
+        array_size as i64,
+    );
+    let args_no_match = vec![
+        ColumnarValue::Array(first_arr.clone()),
+        ColumnarValue::Array(second_no_match),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("no_match", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_no_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: scalar second arg, some match
+    let scalar_second_match = create_int64_scalar_list(SMALL_ARRAY_SIZE, 0);
+    let args_scalar_match = vec![
+        ColumnarValue::Array(first_arr.clone()),
+        ColumnarValue::Scalar(scalar_second_match),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("scalar_some_match", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_scalar_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: scalar second arg, no match
+    let scalar_second_no_match =
+        create_int64_scalar_list(SMALL_ARRAY_SIZE, array_size as i64);
+    let args_scalar_no_match = vec![
+        ColumnarValue::Array(first_arr.clone()),
+        ColumnarValue::Scalar(scalar_second_no_match),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("scalar_no_match", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_scalar_no_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+fn bench_array_has_strings(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_has_strings");
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+
+    let sizes = vec![10, 100, 500];
+
+    for &size in &sizes {
+        let list_array = create_string_list_array(NUM_ROWS, size, NULL_DENSITY);
+        let arg_fields: Vec<Arc<Field>> = vec![
+            Field::new("arr", list_array.data_type().clone(), false).into(),
+            Field::new("el", DataType::Utf8, false).into(),
+        ];
+
+        let args_found = vec![
+            ColumnarValue::Array(list_array.clone()),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("value_1".to_string()))),
+        ];
+        group.bench_with_input(BenchmarkId::new("found", size), &size, |b, _| {
+            let udf = ArrayHas::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        let args_not_found = vec![
+            ColumnarValue::Array(list_array.clone()),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("NOTFOUND".to_string()))),
+        ];
+        group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, _| {
+            let udf = ArrayHas::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_array_has_all_strings(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_has_all_strings");
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+
+    let sizes = vec![10, 100, 500];
+
+    for &size in &sizes {
+        let haystack = create_string_list_array(NUM_ROWS, size, NULL_DENSITY);
+        let list_type = haystack.data_type().clone();
+        let arg_fields: Vec<Arc<Field>> = vec![
+            Field::new("haystack", list_type.clone(), false).into(),
+            Field::new("needle", list_type.clone(), false).into(),
+        ];
+
+        let needle_found = create_string_list_array(NUM_ROWS, NEEDLE_SIZE, 0.0);
+        let args_found = vec![
+            ColumnarValue::Array(haystack.clone()),
+            ColumnarValue::Array(needle_found),
+        ];
+        group.bench_with_input(BenchmarkId::new("all_found", size), &size, |b, _| {
+            let udf = ArrayHasAll::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        let needle_missing =
+            create_string_list_array_with_prefix(NUM_ROWS, NEEDLE_SIZE, "missing_");
+        let args_missing = vec![
+            ColumnarValue::Array(haystack.clone()),
+            ColumnarValue::Array(needle_missing),
+        ];
+        group.bench_with_input(BenchmarkId::new("not_all_found", size), &size, |b, _| {
+            let udf = ArrayHasAll::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_missing.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_array_has_any_strings(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_has_any_strings");
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+
+    let sizes = vec![10, 100, 500];
+
+    for &size in &sizes {
+        let first_arr = create_string_list_array(NUM_ROWS, size, NULL_DENSITY);
+        let list_type = first_arr.data_type().clone();
+        let arg_fields: Vec<Arc<Field>> = vec![
+            Field::new("first", list_type.clone(), false).into(),
+            Field::new("second", list_type.clone(), false).into(),
+        ];
+
+        let second_match = create_string_list_array(NUM_ROWS, SMALL_ARRAY_SIZE, 0.0);
+        let args_match = vec![
+            ColumnarValue::Array(first_arr.clone()),
+            ColumnarValue::Array(second_match),
+        ];
+        group.bench_with_input(BenchmarkId::new("some_match", size), &size, |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        let second_no_match =
+            create_string_list_array_with_prefix(NUM_ROWS, SMALL_ARRAY_SIZE, "missing_");
+        let args_no_match = vec![
+            ColumnarValue::Array(first_arr.clone()),
+            ColumnarValue::Array(second_no_match),
+        ];
+        group.bench_with_input(BenchmarkId::new("no_match", size), &size, |b, _| {
+            let udf = ArrayHasAny::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_no_match.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: NUM_ROWS,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        // Benchmark: scalar second arg, some match
+        let scalar_second_match = create_string_scalar_list(SMALL_ARRAY_SIZE, "value_");
+        let args_scalar_match = vec![
+            ColumnarValue::Array(first_arr.clone()),
+            ColumnarValue::Scalar(scalar_second_match),
+        ];
+        group.bench_with_input(
+            BenchmarkId::new("scalar_some_match", size),
+            &size,
+            |b, _| {
+                let udf = ArrayHasAny::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args_scalar_match.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: NUM_ROWS,
+                            return_field: return_field.clone(),
+                            config_options: config_options.clone(),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+
+        // Benchmark: scalar second arg, no match
+        let scalar_second_no_match =
+            create_string_scalar_list(SMALL_ARRAY_SIZE, "missing_");
+        let args_scalar_no_match = vec![
+            ColumnarValue::Array(first_arr.clone()),
+            ColumnarValue::Scalar(scalar_second_no_match),
+        ];
+        group.bench_with_input(
+            BenchmarkId::new("scalar_no_match", size),
+            &size,
+            |b, _| {
+                let udf = ArrayHasAny::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args_scalar_no_match.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: NUM_ROWS,
+                            return_field: return_field.clone(),
+                            config_options: config_options.clone(),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmarks array_has_any with one scalar arg.  Varies the scalar argument
+/// size while keeping the columnar array small (3 elements per row).
+fn bench_array_has_any_scalar(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_has_any_scalar");
+    let config_options = Arc::new(ConfigOptions::default());
+    let return_field: Arc<Field> = Field::new("result", DataType::Boolean, true).into();
+
+    let array_size = 3;
+    let scalar_sizes = vec![1, 10, 100, 1000];
+
+    // i64 benchmarks
+    let first_arr_i64 = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+    let list_type_i64 = first_arr_i64.data_type().clone();
+    let arg_fields_i64: Vec<Arc<Field>> = vec![
+        Field::new("first", list_type_i64.clone(), false).into(),
+        Field::new("second", list_type_i64.clone(), false).into(),
+    ];
+
+    for &scalar_size in &scalar_sizes {
+        let scalar_arg = create_int64_scalar_list(scalar_size, array_size as i64);
+        let args = vec![
+            ColumnarValue::Array(first_arr_i64.clone()),
+            ColumnarValue::Scalar(scalar_arg),
+        ];
+        group.bench_with_input(
+            BenchmarkId::new("i64_no_match", scalar_size),
+            &scalar_size,
+            |b, _| {
+                let udf = ArrayHasAny::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields_i64.clone(),
+                            number_rows: NUM_ROWS,
+                            return_field: return_field.clone(),
+                            config_options: config_options.clone(),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    // String benchmarks
+    let first_arr_str = create_string_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+    let list_type_str = first_arr_str.data_type().clone();
+    let arg_fields_str: Vec<Arc<Field>> = vec![
+        Field::new("first", list_type_str.clone(), false).into(),
+        Field::new("second", list_type_str.clone(), false).into(),
+    ];
+
+    for &scalar_size in &scalar_sizes {
+        let scalar_arg = create_string_scalar_list(scalar_size, "missing_");
+        let args = vec![
+            ColumnarValue::Array(first_arr_str.clone()),
+            ColumnarValue::Scalar(scalar_arg),
+        ];
+        group.bench_with_input(
+            BenchmarkId::new("string_no_match", scalar_size),
+            &scalar_size,
+            |b, _| {
+                let udf = ArrayHasAny::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields_str.clone(),
+                            number_rows: NUM_ROWS,
+                            return_field: return_field.clone(),
+                            config_options: config_options.clone(),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn create_int64_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0..array_size as i64))
+            }
+        })
+        .collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+/// Like `create_int64_list_array` but values are offset so they won't
+/// appear in a standard list array (useful for "not found" benchmarks).
+fn create_int64_list_array_with_offset(
+    num_rows: usize,
+    array_size: usize,
+    offset: i64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED + 1);
+    let values = (0..num_rows * array_size)
+        .map(|_| Some(rng.random_range(0..array_size as i64) + offset))
+        .collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_string_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                let idx = rng.random_range(0..array_size);
+                Some(format!("value_{idx}"))
+            }
+        })
+        .collect::<StringArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Utf8, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+/// Like `create_string_list_array` but values use a different prefix so
+/// they won't appear in a standard string list array.
+fn create_string_list_array_with_prefix(
+    num_rows: usize,
+    array_size: usize,
+    prefix: &str,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED + 1);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            let idx = rng.random_range(0..array_size);
+            Some(format!("{prefix}{idx}"))
+        })
+        .collect::<StringArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Utf8, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+/// Create a `ScalarValue::List` containing a single list of `size` i64 elements,
+/// with values starting at `offset`.
+fn create_int64_scalar_list(size: usize, offset: i64) -> ScalarValue {
+    let values = (0..size as i64)
+        .map(|i| Some(i + offset))
+        .collect::<Int64Array>();
+    let list = ListArray::try_new(
+        Arc::new(Field::new("item", DataType::Int64, true)),
+        OffsetBuffer::new(vec![0, size as i32].into()),
+        Arc::new(values),
+        None,
+    )
+    .unwrap();
+    ScalarValue::List(Arc::new(list))
+}
+
+/// Create a `ScalarValue::List` containing a single list of `size` string elements,
+/// with values like "{prefix}0", "{prefix}1", etc.
+fn create_string_scalar_list(size: usize, prefix: &str) -> ScalarValue {
+    let values = (0..size)
+        .map(|i| Some(format!("{prefix}{i}")))
+        .collect::<StringArray>();
+    let list = ListArray::try_new(
+        Arc::new(Field::new("item", DataType::Utf8, true)),
+        OffsetBuffer::new(vec![0, size as i32].into()),
+        Arc::new(values),
+        None,
+    )
+    .unwrap();
+    ScalarValue::List(Arc::new(list))
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_position.rs b/datafusion/functions-nested/benches/array_position.rs
new file mode 100644
index 0000000000000..c718b2b725640
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_position.rs
@@ -0,0 +1,344 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, ListArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::position::{ArrayPosition, ArrayPositions};
+use rand::Rng;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 10000;
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+const SENTINEL_NEEDLE: i64 = -1;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [10, 100, 500] {
+        bench_array_position(c, size);
+        bench_array_positions(c, size);
+    }
+}
+
+fn bench_array_position(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_position_i64");
+    let haystack_found_once = create_haystack_with_sentinel(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+        0,
+    );
+    let haystack_found_many = create_haystack_with_sentinels(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+    );
+    let haystack_not_found =
+        create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
+    let num_rows = haystack_not_found.len();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("haystack", haystack_not_found.data_type().clone(), false).into(),
+        Field::new("needle", DataType::Int64, false).into(),
+    ];
+    let return_field: Arc<Field> = Field::new("result", DataType::UInt64, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+    let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
+
+    // Benchmark: one match per row.
+    let args_found_once = vec![
+        ColumnarValue::Array(haystack_found_once.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_once", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_once.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: many matches per row.
+    let args_found_many = vec![
+        ColumnarValue::Array(haystack_found_many.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_many", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_many.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    // Benchmark: needle is not found in any row.
+    let args_not_found = vec![
+        ColumnarValue::Array(haystack_not_found.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPosition::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+fn bench_array_positions(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_positions_i64");
+    let haystack_found_once = create_haystack_with_sentinel(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+        0,
+    );
+    let haystack_found_many = create_haystack_with_sentinels(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+    );
+    let haystack_not_found =
+        create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
+    let num_rows = haystack_not_found.len();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("haystack", haystack_not_found.data_type().clone(), false).into(),
+        Field::new("needle", DataType::Int64, false).into(),
+    ];
+    let return_field: Arc<Field> = Field::new(
+        "result",
+        DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, true))),
+        true,
+    )
+    .into();
+    let config_options = Arc::new(ConfigOptions::default());
+    let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
+
+    let args_found_once = vec![
+        ColumnarValue::Array(haystack_found_once.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_once", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_once.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    let args_found_many = vec![
+        ColumnarValue::Array(haystack_found_many.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_many", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_many.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    let args_not_found = vec![
+        ColumnarValue::Array(haystack_not_found.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
+fn create_haystack_without_sentinel(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    create_haystack_from_fn(num_rows, array_size, |_, _, rng| {
+        random_haystack_value(rng, array_size, null_density)
+    })
+}
+
+fn create_haystack_with_sentinel(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+    sentinel: i64,
+    sentinel_index: usize,
+) -> ArrayRef {
+    assert!(sentinel_index < array_size);
+
+    create_haystack_from_fn(num_rows, array_size, |_, col, rng| {
+        if col == sentinel_index {
+            Some(sentinel)
+        } else {
+            random_haystack_value(rng, array_size, null_density)
+        }
+    })
+}
+
+fn create_haystack_with_sentinels(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+    sentinel: i64,
+) -> ArrayRef {
+    create_haystack_from_fn(num_rows, array_size, |_, col, rng| {
+        // Place the sentinel in half the positions to create many matches per row.
+        if col % 2 == 0 {
+            Some(sentinel)
+        } else {
+            random_haystack_value(rng, array_size, null_density)
+        }
+    })
+}
+
+fn create_haystack_from_fn<F>(
+    num_rows: usize,
+    array_size: usize,
+    mut value_at: F,
+) -> ArrayRef
+where
+    F: FnMut(usize, usize, &mut StdRng) -> Option<i64>,
+{
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let mut values = Vec::with_capacity(num_rows * array_size);
+    for row in 0..num_rows {
+        for col in 0..array_size {
+            values.push(value_at(row, col, &mut rng));
+        }
+    }
+    let values = values.into_iter().collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn random_haystack_value(
+    rng: &mut StdRng,
+    array_size: usize,
+    null_density: f64,
+) -> Option<i64> {
+    if rng.random::<f64>() < null_density {
+        None
+    } else {
+        Some(rng.random_range(0..array_size as i64))
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_remove.rs b/datafusion/functions-nested/benches/array_remove.rs
new file mode 100644
index 0000000000000..a494d322392a8
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_remove.rs
@@ -0,0 +1,572 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
+    Float64Array, Int64Array, ListArray, StringArray,
+};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::remove::ArrayRemove;
+use rand::Rng;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 10000;
+const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // Test array_remove with different data types and array sizes
+    // TODO: Add performance tests for nested datatypes
+    bench_array_remove_int64(c);
+    bench_array_remove_f64(c);
+    bench_array_remove_strings(c);
+    bench_array_remove_binary(c);
+    bench_array_remove_boolean(c);
+    bench_array_remove_decimal64(c);
+    bench_array_remove_fixed_size_binary(c);
+}
+
+fn bench_array_remove_int64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_int64");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Int64(Some(1));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Int64, false).into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_f64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_f64");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_f64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Float64(Some(1.0));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Float64, false).into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_strings(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_strings");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_string_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Utf8(Some("value_1".to_string()));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Utf8, false).into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_binary(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_binary");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_binary_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Binary(Some(b"value_1".to_vec()));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Binary, false).into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_boolean(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_boolean");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_boolean_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Boolean(Some(true));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Boolean, false).into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_decimal64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_decimal64");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = create_decimal64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::Decimal128(Some(100_i128), 10, 2);
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::Decimal128(10, 2), false)
+                                    .into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_array_remove_fixed_size_binary(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_fixed_size_binary");
+
+    for &array_size in ARRAY_SIZES {
+        let list_array =
+            create_fixed_size_binary_list_array(NUM_ROWS, array_size, NULL_DENSITY);
+        let element_to_remove = ScalarValue::FixedSizeBinary(16, Some(vec![1u8; 16]));
+        let args = create_args(list_array.clone(), element_to_remove.clone());
+
+        group.bench_with_input(
+            BenchmarkId::new("remove", array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayRemove::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: vec![
+                                Field::new("arr", list_array.data_type().clone(), false)
+                                    .into(),
+                                Field::new("el", DataType::FixedSizeBinary(16), false)
+                                    .into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                list_array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn create_args(list_array: ArrayRef, element: ScalarValue) -> Vec<ColumnarValue> {
+    vec![
+        ColumnarValue::Array(list_array),
+        ColumnarValue::Scalar(element),
+    ]
+}
+
+fn create_int64_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0..array_size as i64))
+            }
+        })
+        .collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_f64_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0.0..array_size as f64))
+            }
+        })
+        .collect::<Float64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Float64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_string_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                let idx = rng.random_range(0..array_size);
+                Some(format!("value_{idx}"))
+            }
+        })
+        .collect::<StringArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Utf8, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_binary_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                let idx = rng.random_range(0..array_size);
+                Some(format!("value_{idx}").into_bytes())
+            }
+        })
+        .collect::<BinaryArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Binary, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_boolean_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random::<bool>())
+            }
+        })
+        .collect::<BooleanArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Boolean, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_decimal64_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0..array_size) as i128 * 100)
+            }
+        })
+        .collect::<Decimal128Array>()
+        .with_precision_and_scale(10, 2)
+        .unwrap();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Decimal128(10, 2), true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_fixed_size_binary_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let mut buffer = Vec::with_capacity(num_rows * array_size * 16);
+    let mut null_buffer = Vec::with_capacity(num_rows * array_size);
+    for _ in 0..num_rows * array_size {
+        if rng.random::<f64>() < null_density {
+            null_buffer.push(false);
+            buffer.extend_from_slice(&[0u8; 16]);
+        } else {
+            null_buffer.push(true);
+            let mut bytes = [0u8; 16];
+            rng.fill(&mut bytes);
+            buffer.extend_from_slice(&bytes);
+        }
+    }
+    let nulls = arrow::buffer::NullBuffer::from_iter(null_buffer.iter().copied());
+    let values = FixedSizeBinaryArray::new(16, buffer.into(), Some(nulls));
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::FixedSizeBinary(16), true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_repeat.rs b/datafusion/functions-nested/benches/array_repeat.rs
new file mode 100644
index 0000000000000..0ce8db00ceb8f
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_repeat.rs
@@ -0,0 +1,476 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, ListArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::repeat::ArrayRepeat;
+use rand::Rng;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: &[usize] = &[100, 1000, 10000];
+const REPEAT_COUNTS: &[u64] = &[5, 50];
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // Test array_repeat with different element types
+    bench_array_repeat_int64(c);
+    bench_array_repeat_string(c);
+    bench_array_repeat_float64(c);
+    bench_array_repeat_boolean(c);
+
+    // Test array_repeat with list element (nested arrays)
+    bench_array_repeat_nested_int64_list(c);
+    bench_array_repeat_nested_string_list(c);
+}
+
+fn bench_array_repeat_int64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_int64");
+
+    for &num_rows in NUM_ROWS {
+        let element_array = create_int64_array(num_rows, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(element_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new("element", DataType::Int64, false).into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        DataType::Int64,
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_repeat_string(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_string");
+
+    for &num_rows in NUM_ROWS {
+        let element_array = create_string_array(num_rows, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(element_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new("element", DataType::Utf8, false).into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        DataType::Utf8,
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_repeat_nested_int64_list(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_nested_int64");
+
+    for &num_rows in NUM_ROWS {
+        let list_array = create_int64_list_array(num_rows, 5, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(list_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new(
+                                        "element",
+                                        list_array.data_type().clone(),
+                                        false,
+                                    )
+                                    .into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        list_array.data_type().clone(),
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_repeat_float64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_float64");
+
+    for &num_rows in NUM_ROWS {
+        let element_array = create_float64_array(num_rows, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(element_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new("element", DataType::Float64, false)
+                                        .into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        DataType::Float64,
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_repeat_boolean(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_boolean");
+
+    for &num_rows in NUM_ROWS {
+        let element_array = create_boolean_array(num_rows, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(element_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new("element", DataType::Boolean, false)
+                                        .into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        DataType::Boolean,
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_repeat_nested_string_list(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_repeat_nested_string");
+
+    for &num_rows in NUM_ROWS {
+        let list_array = create_string_list_array(num_rows, 5, NULL_DENSITY);
+
+        for &repeat_count in REPEAT_COUNTS {
+            let args = vec![
+                ColumnarValue::Array(list_array.clone()),
+                ColumnarValue::Scalar(ScalarValue::from(repeat_count)),
+            ];
+
+            group.bench_with_input(
+                BenchmarkId::new(format!("repeat_{repeat_count}_count"), num_rows),
+                &num_rows,
+                |b, _| {
+                    let udf = ArrayRepeat::new();
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: vec![
+                                    Field::new(
+                                        "element",
+                                        list_array.data_type().clone(),
+                                        false,
+                                    )
+                                    .into(),
+                                    Field::new("count", DataType::UInt64, false).into(),
+                                ],
+                                number_rows: num_rows,
+                                return_field: Field::new(
+                                    "result",
+                                    DataType::List(Arc::new(Field::new_list_field(
+                                        list_array.data_type().clone(),
+                                        true,
+                                    ))),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn create_int64_array(num_rows: usize, null_density: f64) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0..1000))
+            }
+        })
+        .collect::<Int64Array>();
+
+    Arc::new(values)
+}
+
+fn create_string_array(num_rows: usize, null_density: f64) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    use arrow::array::StringArray;
+
+    let values = (0..num_rows)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(format!("value_{}", rng.random_range(0..100)))
+            }
+        })
+        .collect::<StringArray>();
+
+    Arc::new(values)
+}
+
+fn create_int64_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0..1000))
+            }
+        })
+        .collect::<Int64Array>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_float64_array(num_rows: usize, null_density: f64) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random_range(0.0..1000.0))
+            }
+        })
+        .collect::<Float64Array>();
+
+    Arc::new(values)
+}
+
+fn create_boolean_array(num_rows: usize, null_density: f64) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..num_rows)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(rng.random())
+            }
+        })
+        .collect::<BooleanArray>();
+
+    Arc::new(values)
+}
+
+fn create_string_list_array(
+    num_rows: usize,
+    array_size: usize,
+    null_density: f64,
+) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    use arrow::array::StringArray;
+
+    let values = (0..num_rows * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < null_density {
+                None
+            } else {
+                Some(format!("value_{}", rng.random_range(0..100)))
+            }
+        })
+        .collect::<StringArray>();
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Utf8, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_reverse.rs b/datafusion/functions-nested/benches/array_reverse.rs
new file mode 100644
index 0000000000000..0c37296188315
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_reverse.rs
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{hint::black_box, sync::Arc};
+
+use arrow::{
+    array::{ArrayRef, FixedSizeListArray, Int32Array, ListArray, ListViewArray},
+    buffer::{NullBuffer, OffsetBuffer, ScalarBuffer},
+    datatypes::{DataType, Field},
+};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_functions_nested::reverse::array_reverse_inner;
+
+fn array_reverse(array: &ArrayRef) -> ArrayRef {
+    black_box(array_reverse_inner(std::slice::from_ref(array)).unwrap())
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // Create array sizes with step size of 100, starting from 100.
+    let number_of_arrays = 1000;
+    let sizes = (0..number_of_arrays)
+        .map(|i| 100 + i * 100)
+        .collect::<Vec<i32>>();
+
+    // Calculate the total number of values
+    let total_values = sizes.iter().sum::<i32>();
+
+    // Calculate sizes and offsets from array lengths
+    let offsets = sizes
+        .iter()
+        .scan(0, |acc, &x| {
+            let offset = *acc;
+            *acc += x;
+            Some(offset)
+        })
+        .collect::<Vec<i32>>();
+    let offsets = ScalarBuffer::from(offsets);
+    // Set every 10th array to null
+    let nulls = (0..number_of_arrays)
+        .map(|i| i % 10 != 0)
+        .collect::<Vec<bool>>();
+
+    let values = (0..total_values).collect::<Vec<i32>>();
+    let values = Arc::new(Int32Array::from(values));
+
+    // Create ListArray and ListViewArray
+    let nulls_list_array = Some(NullBuffer::from(
+        nulls[..((number_of_arrays as usize) - 1)].to_vec(),
+    ));
+    let list_array: ArrayRef = Arc::new(ListArray::new(
+        Arc::new(Field::new("a", DataType::Int32, false)),
+        OffsetBuffer::new(offsets.clone()),
+        values.clone(),
+        nulls_list_array,
+    ));
+    let nulls_list_view_array = Some(NullBuffer::from(
+        nulls[..(number_of_arrays as usize)].to_vec(),
+    ));
+    let list_view_array: ArrayRef = Arc::new(ListViewArray::new(
+        Arc::new(Field::new("a", DataType::Int32, false)),
+        offsets,
+        ScalarBuffer::from(sizes),
+        values.clone(),
+        nulls_list_view_array,
+    ));
+
+    c.bench_function("array_reverse_list", |b| {
+        b.iter(|| array_reverse(&list_array))
+    });
+
+    c.bench_function("array_reverse_list_view", |b| {
+        b.iter(|| array_reverse(&list_view_array))
+    });
+
+    // Create FixedSizeListArray
+    let array_len = 1000;
+    let num_arrays = 5000;
+    let total_values = num_arrays * array_len;
+    let values = (0..total_values).collect::<Vec<i32>>();
+    let values = Arc::new(Int32Array::from(values));
+    // Set every 10th array to null
+    let nulls = (0..num_arrays).map(|i| i % 10 != 0).collect::<Vec<bool>>();
+    let nulls = Some(NullBuffer::from(nulls));
+    let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new(
+        Arc::new(Field::new("a", DataType::Int32, false)),
+        array_len,
+        values.clone(),
+        nulls.clone(),
+    ));
+    c.bench_function("array_reverse_fixed_size_list", |b| {
+        b.iter(|| array_reverse(&fixed_size_list_array))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_set_ops.rs b/datafusion/functions-nested/benches/array_set_ops.rs
new file mode 100644
index 0000000000000..d43bbdb577d06
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_set_ops.rs
@@ -0,0 +1,389 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, ListArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    criterion_group, criterion_main, {BenchmarkId, Criterion},
+};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::except::ArrayExcept;
+use datafusion_functions_nested::set_ops::{ArrayDistinct, ArrayIntersect, ArrayUnion};
+use rand::SeedableRng;
+use rand::prelude::SliceRandom;
+use rand::rngs::StdRng;
+use std::collections::HashSet;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 1000;
+const ARRAY_SIZES: &[usize] = &[10, 50, 100];
+const SEED: u64 = 42;
+/// Extra rows on each side when building sliced arrays, so the underlying
+/// values buffer is much larger than the visible portion.
+const SLICE_PADDING: usize = 5000;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    bench_array_union(c);
+    bench_array_intersect(c);
+    bench_array_except(c);
+    bench_array_distinct(c);
+    bench_array_union_sliced(c);
+    bench_array_intersect_sliced(c);
+    bench_array_distinct_sliced(c);
+    bench_array_except_sliced(c);
+}
+
+fn invoke_udf(udf: &impl ScalarUDFImpl, array1: &ArrayRef, array2: &ArrayRef) {
+    black_box(
+        udf.invoke_with_args(ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(array1.clone()),
+                ColumnarValue::Array(array2.clone()),
+            ],
+            arg_fields: vec![
+                Field::new("arr1", array1.data_type().clone(), false).into(),
+                Field::new("arr2", array2.data_type().clone(), false).into(),
+            ],
+            number_rows: NUM_ROWS,
+            return_field: Field::new("result", array1.data_type().clone(), false).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        })
+        .unwrap(),
+    );
+}
+
+fn bench_array_union(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_union");
+    let udf = ArrayUnion::new();
+
+    for (overlap_label, overlap_ratio) in &[("high_overlap", 0.8), ("low_overlap", 0.2)] {
+        for &array_size in ARRAY_SIZES {
+            let (array1, array2) =
+                create_arrays_with_overlap(NUM_ROWS, array_size, *overlap_ratio);
+            group.bench_with_input(
+                BenchmarkId::new(*overlap_label, array_size),
+                &array_size,
+                |b, _| b.iter(|| invoke_udf(&udf, &array1, &array2)),
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_intersect(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_intersect");
+    let udf = ArrayIntersect::new();
+
+    for (overlap_label, overlap_ratio) in &[("high_overlap", 0.8), ("low_overlap", 0.2)] {
+        for &array_size in ARRAY_SIZES {
+            let (array1, array2) =
+                create_arrays_with_overlap(NUM_ROWS, array_size, *overlap_ratio);
+            group.bench_with_input(
+                BenchmarkId::new(*overlap_label, array_size),
+                &array_size,
+                |b, _| b.iter(|| invoke_udf(&udf, &array1, &array2)),
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_except(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_except");
+    let udf = ArrayExcept::new();
+
+    for (overlap_label, overlap_ratio) in &[("high_overlap", 0.8), ("low_overlap", 0.2)] {
+        for &array_size in ARRAY_SIZES {
+            let (array1, array2) =
+                create_arrays_with_overlap(NUM_ROWS, array_size, *overlap_ratio);
+            group.bench_with_input(
+                BenchmarkId::new(*overlap_label, array_size),
+                &array_size,
+                |b, _| b.iter(|| invoke_udf(&udf, &array1, &array2)),
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_array_distinct(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_distinct");
+    let udf = ArrayDistinct::new();
+
+    for (duplicate_label, duplicate_ratio) in
+        &[("high_duplicate", 0.8), ("low_duplicate", 0.2)]
+    {
+        for &array_size in ARRAY_SIZES {
+            let array =
+                create_array_with_duplicates(NUM_ROWS, array_size, *duplicate_ratio);
+            group.bench_with_input(
+                BenchmarkId::new(*duplicate_label, array_size),
+                &array_size,
+                |b, _| {
+                    b.iter(|| {
+                        black_box(
+                            udf.invoke_with_args(ScalarFunctionArgs {
+                                args: vec![ColumnarValue::Array(array.clone())],
+                                arg_fields: vec![
+                                    Field::new("arr", array.data_type().clone(), false)
+                                        .into(),
+                                ],
+                                number_rows: NUM_ROWS,
+                                return_field: Field::new(
+                                    "result",
+                                    array.data_type().clone(),
+                                    false,
+                                )
+                                .into(),
+                                config_options: Arc::new(ConfigOptions::default()),
+                            })
+                            .unwrap(),
+                        )
+                    })
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn create_arrays_with_overlap(
+    num_rows: usize,
+    array_size: usize,
+    overlap_ratio: f64,
+) -> (ArrayRef, ArrayRef) {
+    assert!((0.0..=1.0).contains(&overlap_ratio));
+    let overlap_count = ((array_size as f64) * overlap_ratio).round() as usize;
+
+    let mut rng = StdRng::seed_from_u64(SEED);
+
+    let mut values1 = Vec::with_capacity(num_rows * array_size);
+    let mut values2 = Vec::with_capacity(num_rows * array_size);
+
+    for row in 0..num_rows {
+        let base = (row as i64) * (array_size as i64) * 2;
+
+        for i in 0..array_size {
+            values1.push(base + i as i64);
+        }
+
+        let mut positions: Vec<usize> = (0..array_size).collect();
+        positions.shuffle(&mut rng);
+
+        let overlap_positions: HashSet<_> =
+            positions[..overlap_count].iter().copied().collect();
+
+        for i in 0..array_size {
+            if overlap_positions.contains(&i) {
+                values2.push(base + i as i64);
+            } else {
+                values2.push(base + array_size as i64 + i as i64);
+            }
+        }
+    }
+
+    let values1 = Int64Array::from(values1);
+    let values2 = Int64Array::from(values2);
+
+    let field = Arc::new(Field::new("item", DataType::Int64, true));
+
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    let array1 = Arc::new(
+        ListArray::try_new(
+            field.clone(),
+            OffsetBuffer::new(offsets.clone().into()),
+            Arc::new(values1),
+            None,
+        )
+        .unwrap(),
+    );
+
+    let array2 = Arc::new(
+        ListArray::try_new(
+            field,
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values2),
+            None,
+        )
+        .unwrap(),
+    );
+
+    (array1, array2)
+}
+
+fn create_array_with_duplicates(
+    num_rows: usize,
+    array_size: usize,
+    duplicate_ratio: f64,
+) -> ArrayRef {
+    assert!((0.0..=1.0).contains(&duplicate_ratio));
+    let unique_count = ((array_size as f64) * (1.0 - duplicate_ratio)).round() as usize;
+    let duplicate_count = array_size - unique_count;
+
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let mut values = Vec::with_capacity(num_rows * array_size);
+
+    for row in 0..num_rows {
+        let base = (row as i64) * (array_size as i64) * 2;
+
+        // Add unique values first
+        for i in 0..unique_count {
+            values.push(base + i as i64);
+        }
+
+        // Fill the rest with duplicates randomly picked from the unique values
+        let mut unique_indices: Vec<i64> =
+            (0..unique_count).map(|i| base + i as i64).collect();
+        unique_indices.shuffle(&mut rng);
+
+        for i in 0..duplicate_count {
+            values.push(unique_indices[i % unique_count]);
+        }
+    }
+
+    let values = Int64Array::from(values);
+    let field = Arc::new(Field::new("item", DataType::Int64, true));
+
+    let offsets = (0..=num_rows)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            field,
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+/// Slice a pair of arrays to the middle `NUM_ROWS` rows from a larger array.
+fn slice_pair(arrays: &(ArrayRef, ArrayRef)) -> (ArrayRef, ArrayRef) {
+    let a1 = arrays.0.slice(SLICE_PADDING, NUM_ROWS);
+    let a2 = arrays.1.slice(SLICE_PADDING, NUM_ROWS);
+    (a1, a2)
+}
+
+fn bench_array_union_sliced(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_union_sliced");
+    let udf = ArrayUnion::new();
+
+    for &array_size in ARRAY_SIZES {
+        let (a1, a2) = slice_pair(&create_arrays_with_overlap(
+            NUM_ROWS + 2 * SLICE_PADDING,
+            array_size,
+            0.5,
+        ));
+        group.bench_with_input(
+            BenchmarkId::from_parameter(array_size),
+            &array_size,
+            |b, _| b.iter(|| invoke_udf(&udf, &a1, &a2)),
+        );
+    }
+    group.finish();
+}
+
+fn bench_array_intersect_sliced(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_intersect_sliced");
+    let udf = ArrayIntersect::new();
+
+    for &array_size in ARRAY_SIZES {
+        let (a1, a2) = slice_pair(&create_arrays_with_overlap(
+            NUM_ROWS + 2 * SLICE_PADDING,
+            array_size,
+            0.5,
+        ));
+        group.bench_with_input(
+            BenchmarkId::from_parameter(array_size),
+            &array_size,
+            |b, _| b.iter(|| invoke_udf(&udf, &a1, &a2)),
+        );
+    }
+    group.finish();
+}
+
+fn bench_array_except_sliced(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_except_sliced");
+    let udf = ArrayExcept::new();
+
+    for &array_size in ARRAY_SIZES {
+        let (a1, a2) = slice_pair(&create_arrays_with_overlap(
+            NUM_ROWS + 2 * SLICE_PADDING,
+            array_size,
+            0.5,
+        ));
+        group.bench_with_input(
+            BenchmarkId::from_parameter(array_size),
+            &array_size,
+            |b, _| b.iter(|| invoke_udf(&udf, &a1, &a2)),
+        );
+    }
+    group.finish();
+}
+
+fn bench_array_distinct_sliced(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_distinct_sliced");
+    let udf = ArrayDistinct::new();
+
+    for &array_size in ARRAY_SIZES {
+        let array =
+            create_array_with_duplicates(NUM_ROWS + 2 * SLICE_PADDING, array_size, 0.5)
+                .slice(SLICE_PADDING, NUM_ROWS);
+        group.bench_with_input(
+            BenchmarkId::from_parameter(array_size),
+            &array_size,
+            |b, _| {
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: vec![ColumnarValue::Array(array.clone())],
+                            arg_fields: vec![
+                                Field::new("arr", array.data_type().clone(), false)
+                                    .into(),
+                            ],
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new(
+                                "result",
+                                array.data_type().clone(),
+                                false,
+                            )
+                            .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_slice.rs b/datafusion/functions-nested/benches/array_slice.rs
new file mode 100644
index 0000000000000..b95fe47575e53
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_slice.rs
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Int64Array, ListArray, ListViewArray, NullBufferBuilder, PrimitiveArray,
+};
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{DataType, Field, Int64Type};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions_nested::extract::array_slice_udf;
+use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn create_inputs(
+    rng: &mut StdRng,
+    size: usize,
+    child_array_size: usize,
+    null_density: f32,
+) -> (ListArray, ListViewArray) {
+    let mut nulls_builder = NullBufferBuilder::new(size);
+    let mut sizes = Vec::with_capacity(size);
+
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            nulls_builder.append_null();
+        } else {
+            nulls_builder.append_non_null();
+        }
+        sizes.push(rng.random_range(1..child_array_size));
+    }
+    let nulls = nulls_builder.finish();
+
+    let length = sizes.iter().sum();
+    let values: PrimitiveArray<Int64Type> =
+        (0..length).map(|_| Some(rng.random())).collect();
+    let values = Arc::new(values);
+
+    let offsets = OffsetBuffer::from_lengths(sizes.clone());
+    let list_array = ListArray::new(
+        Arc::new(Field::new_list_field(DataType::Int64, true)),
+        offsets.clone(),
+        values.clone(),
+        nulls.clone(),
+    );
+
+    let offsets = ScalarBuffer::from(offsets.slice(0, size - 1));
+    let sizes = ScalarBuffer::from_iter(sizes.into_iter().map(|v| v as i32));
+    let list_view_array = ListViewArray::new(
+        Arc::new(Field::new_list_field(DataType::Int64, true)),
+        offsets,
+        sizes,
+        values,
+        nulls,
+    );
+
+    (list_array, list_view_array)
+}
+
+/// Create `from`, `to`, and `stride` from an array of strides.
+fn random_from_to_stride(
+    rng: &mut StdRng,
+    size: i64,
+    null_density: f32,
+    stride_choices: &[Option<i64>],
+) -> (Option<i64>, Option<i64>, Option<i64>) {
+    let from = if rng.random::<f32>() < null_density {
+        None
+    } else {
+        Some(rng.random_range(1..=size))
+    };
+
+    let to = if rng.random::<f32>() < null_density {
+        None
+    } else {
+        match from {
+            Some(from) => Some(rng.random_range(from..=size)),
+            None => Some(rng.random_range(1..=size)),
+        }
+    };
+
+    let stride = stride_choices.choose(rng).cloned().unwrap_or(None);
+
+    if from.is_none() || to.is_none() || stride.is_none_or(|s| s > 0) {
+        (from, to, stride)
+    } else {
+        // stride < 0, swap from and to
+        (to, from, stride)
+    }
+}
+
+fn array_slice_benchmark(
+    name: &str,
+    input: ColumnarValue,
+    mut args: Vec<ColumnarValue>,
+    c: &mut Criterion,
+    size: usize,
+) {
+    args.insert(0, input);
+
+    let array_slice = array_slice_udf();
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| {
+            <Arc<Field>>::from(Field::new(format!("arg_{idx}"), arg.data_type(), true))
+        })
+        .collect::<Vec<_>>();
+    c.bench_function(name, |b| {
+        b.iter(|| {
+            black_box(
+                array_slice
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new_list_field(args[0].data_type(), true)
+                            .into(),
+                        config_options: Arc::new(ConfigOptions::default()),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let rng = &mut StdRng::seed_from_u64(42);
+    let size = 1_000_000;
+    let child_array_size = 100;
+    let null_density = 0.1;
+
+    let (list_array, list_view_array) =
+        create_inputs(rng, size, child_array_size, null_density);
+
+    let mut array_from = Vec::with_capacity(size);
+    let mut array_to = Vec::with_capacity(size);
+    let mut array_stride = Vec::with_capacity(size);
+    for child_array_size in list_array.offsets().lengths() {
+        let (from, to, stride) = random_from_to_stride(
+            rng,
+            child_array_size as i64,
+            null_density,
+            &[None, Some(-2), Some(-1), Some(1), Some(2)],
+        );
+        array_from.push(from);
+        array_to.push(to);
+        array_stride.push(stride);
+    }
+
+    // input
+    let list_array = ColumnarValue::Array(Arc::new(list_array));
+    let list_view_array = ColumnarValue::Array(Arc::new(list_view_array));
+
+    // args
+    let array_from = ColumnarValue::Array(Arc::new(Int64Array::from(array_from)));
+    let array_to = ColumnarValue::Array(Arc::new(Int64Array::from(array_to)));
+    let array_stride = ColumnarValue::Array(Arc::new(Int64Array::from(array_stride)));
+    let scalar_from = ColumnarValue::Scalar(ScalarValue::from(1i64));
+    let scalar_to = ColumnarValue::Scalar(ScalarValue::from(child_array_size as i64 / 2));
+
+    for input in [list_array, list_view_array] {
+        let input_type = input.data_type().to_string();
+
+        array_slice_benchmark(
+            &format!("array_slice: input {input_type}, array args"),
+            input.clone(),
+            vec![array_from.clone(), array_to.clone(), array_stride.clone()],
+            c,
+            size,
+        );
+
+        array_slice_benchmark(
+            &format!("array_slice: input {input_type}, array args, no stride"),
+            input.clone(),
+            vec![array_from.clone(), array_to.clone()],
+            c,
+            size,
+        );
+
+        array_slice_benchmark(
+            &format!("array_slice: input {input_type}, scalar args, no stride"),
+            input.clone(),
+            vec![scalar_from.clone(), scalar_to.clone()],
+            c,
+            size,
+        );
+
+        for stride in [-2i64, -1i64, 1i64, 2i64] {
+            // swap from and to if stride < 0
+            let (scalar_from, scalar_to) = if stride > 0 {
+                (scalar_from.clone(), scalar_to.clone())
+            } else {
+                (scalar_to.clone(), scalar_from.clone())
+            };
+            let scalar_stride = ColumnarValue::Scalar(ScalarValue::from(stride));
+            array_slice_benchmark(
+                &format!("array_slice: input {input_type}, scalar args, stride={stride}"),
+                input.clone(),
+                vec![scalar_from, scalar_to, scalar_stride],
+                c,
+                size,
+            );
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/array_to_string.rs b/datafusion/functions-nested/benches/array_to_string.rs
new file mode 100644
index 0000000000000..286ed4eeb0003
--- /dev/null
+++ b/datafusion/functions-nested/benches/array_to_string.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::{DataType, Field};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions_nested::string::ArrayToString;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+const NUM_ROWS: usize = 1000;
+const ARRAY_SIZES: &[usize] = &[5, 20, 100];
+const NESTED_ARRAY_SIZE: usize = 3;
+const SEED: u64 = 42;
+const NULL_DENSITY: f64 = 0.1;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    bench_array_to_string(c, "array_to_string_int64", create_int64_list_array);
+    bench_array_to_string(c, "array_to_string_float64", create_float64_list_array);
+    bench_array_to_string(c, "array_to_string_string", create_string_list_array);
+    bench_array_to_string(
+        c,
+        "array_to_string_nested_int64",
+        create_nested_int64_list_array,
+    );
+}
+
+fn bench_array_to_string(
+    c: &mut Criterion,
+    group_name: &str,
+    make_array: impl Fn(usize) -> ArrayRef,
+) {
+    let mut group = c.benchmark_group(group_name);
+
+    for &array_size in ARRAY_SIZES {
+        let list_array = make_array(array_size);
+        let args = vec![
+            ColumnarValue::Array(list_array.clone()),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))),
+        ];
+        let arg_fields = vec![
+            Field::new("array", list_array.data_type().clone(), true).into(),
+            Field::new("delimiter", DataType::Utf8, false).into(),
+        ];
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(array_size),
+            &array_size,
+            |b, _| {
+                let udf = ArrayToString::new();
+                b.iter(|| {
+                    black_box(
+                        udf.invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: NUM_ROWS,
+                            return_field: Field::new("result", DataType::Utf8, true)
+                                .into(),
+                            config_options: Arc::new(ConfigOptions::default()),
+                        })
+                        .unwrap(),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn create_int64_list_array(array_size: usize) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..NUM_ROWS * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < NULL_DENSITY {
+                None
+            } else {
+                Some(rng.random_range(0..1000))
+            }
+        })
+        .collect::<Int64Array>();
+    let offsets = (0..=NUM_ROWS)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_nested_int64_list_array(array_size: usize) -> ArrayRef {
+    let inner = create_int64_list_array(array_size);
+    let inner_rows = NUM_ROWS;
+    let outer_rows = inner_rows / NESTED_ARRAY_SIZE;
+    let offsets = (0..=outer_rows)
+        .map(|i| (i * NESTED_ARRAY_SIZE) as i32)
+        .collect::<Vec<i32>>();
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", inner.data_type().clone(), true)),
+            OffsetBuffer::new(offsets.into()),
+            inner,
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_float64_list_array(array_size: usize) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..NUM_ROWS * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < NULL_DENSITY {
+                None
+            } else {
+                Some(rng.random_range(-1000.0..1000.0))
+            }
+        })
+        .collect::<Float64Array>();
+    let offsets = (0..=NUM_ROWS)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Float64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+fn create_string_list_array(array_size: usize) -> ArrayRef {
+    let mut rng = StdRng::seed_from_u64(SEED);
+    let values = (0..NUM_ROWS * array_size)
+        .map(|_| {
+            if rng.random::<f64>() < NULL_DENSITY {
+                None
+            } else {
+                Some(format!("value_{}", rng.random_range(0..100)))
+            }
+        })
+        .collect::<StringArray>();
+    let offsets = (0..=NUM_ROWS)
+        .map(|i| (i * array_size) as i32)
+        .collect::<Vec<i32>>();
+
+    Arc::new(
+        ListArray::try_new(
+            Arc::new(Field::new("item", DataType::Utf8, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(values),
+            None,
+        )
+        .unwrap(),
+    )
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions-nested/benches/map.rs b/datafusion/functions-nested/benches/map.rs
index a752a47bcbaa5..e50c4659b17cd 100644
--- a/datafusion/functions-nested/benches/map.rs
+++ b/datafusion/functions-nested/benches/map.rs
@@ -15,22 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{Int32Array, ListArray, StringArray};
 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use rand::prelude::ThreadRng;
-use rand::Rng;
-use std::collections::HashSet;
-use std::sync::Arc;
-
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::planner::ExprPlanner;
 use datafusion_expr::{ColumnarValue, Expr, ScalarFunctionArgs};
 use datafusion_functions_nested::map::map_udf;
 use datafusion_functions_nested::planner::NestedFunctionPlanner;
+use rand::Rng;
+use rand::prelude::ThreadRng;
+use std::collections::HashSet;
+use std::hint::black_box;
+use std::sync::Arc;
 
 fn keys(rng: &mut ThreadRng) -> Vec<String> {
     let mut keys = HashSet::with_capacity(1000);
@@ -58,8 +57,11 @@ fn criterion_benchmark(c: &mut Criterion) {
         let values = values(&mut rng);
         let mut buffer = Vec::new();
         for i in 0..1000 {
-            buffer.push(Expr::Literal(ScalarValue::Utf8(Some(keys[i].clone()))));
-            buffer.push(Expr::Literal(ScalarValue::Int32(Some(values[i]))));
+            buffer.push(Expr::Literal(
+                ScalarValue::Utf8(Some(keys[i].clone())),
+                None,
+            ));
+            buffer.push(Expr::Literal(ScalarValue::Int32(Some(values[i])), None));
         }
 
         let planner = NestedFunctionPlanner {};
@@ -95,13 +97,14 @@ fn criterion_benchmark(c: &mut Criterion) {
         let values = ColumnarValue::Scalar(ScalarValue::List(Arc::new(value_list)));
 
         let return_type = map_udf()
-            .return_type(&[DataType::Utf8, DataType::Int32])
+            .return_type(&[keys.data_type(), values.data_type()])
             .expect("should get return type");
         let arg_fields = vec![
             Field::new("a", keys.data_type(), true).into(),
             Field::new("a", values.data_type(), true).into(),
         ];
         let return_field = Field::new("f", return_type, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
 
         b.iter(|| {
             black_box(
@@ -111,6 +114,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: 1,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("map should work on valid values"),
             );
diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs
index 5ef1491313b13..03ba0adde0aec 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -18,19 +18,21 @@
 //! [`ScalarUDFImpl`] definitions for array_has, array_has_all and array_has_any functions.
 
 use arrow::array::{
-    Array, ArrayRef, BooleanArray, Datum, GenericListArray, OffsetSizeTrait, Scalar,
+    Array, ArrayRef, AsArray, BooleanArray, BooleanBufferBuilder, Datum, Scalar,
+    StringArrayType,
 };
 use arrow::buffer::BooleanBuffer;
 use arrow::datatypes::DataType;
 use arrow::row::{RowConverter, Rows, SortField};
-use datafusion_common::cast::as_generic_list_array;
+use datafusion_common::cast::{as_fixed_size_list_array, as_generic_list_array};
 use datafusion_common::utils::string_utils::string_array_to_vec;
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{exec_err, Result, ScalarValue};
-use datafusion_expr::expr::{InList, ScalarFunction};
+use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err};
+use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::simplify::ExprSimplifyResult;
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility, in_list,
 };
 use datafusion_macros::user_doc;
 use datafusion_physical_expr_common::datum::compare_with_eq;
@@ -39,7 +41,9 @@ use itertools::Itertools;
 use crate::make_array::make_array_udf;
 use crate::utils::make_scalar_function;
 
+use hashbrown::HashSet;
 use std::any::Any;
+use std::ops::Range;
 use std::sync::Arc;
 
 // Create static instances of ScalarUDFs for each function
@@ -57,7 +61,7 @@ make_udf_expr_and_func!(ArrayHasAll,
 );
 make_udf_expr_and_func!(ArrayHasAny,
     array_has_any,
-    haystack_array needle_array, // arg names
+    first_array second_array, // arg names
     "returns true if at least one element of the second array appears in the first array; otherwise, it returns false.", // doc
     array_has_any_udf // internal function name
 );
@@ -83,7 +87,7 @@ make_udf_expr_and_func!(ArrayHasAny,
         description = "Scalar or Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayHas {
     signature: Signature,
     aliases: Vec<String>,
@@ -127,54 +131,65 @@ impl ScalarUDFImpl for ArrayHas {
     fn simplify(
         &self,
         mut args: Vec<Expr>,
-        _info: &dyn datafusion_expr::simplify::SimplifyInfo,
+        _info: &datafusion_expr::simplify::SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         let [haystack, needle] = take_function_args(self.name(), &mut args)?;
 
         // if the haystack is a constant list, we can use an inlist expression which is more
         // efficient because the haystack is not varying per-row
-        if let Expr::Literal(ScalarValue::List(array)) = haystack {
-            // TODO: support LargeList
-            // (not supported by `convert_array_to_scalar_vec`)
-            // (FixedSizeList not supported either, but seems to have worked fine when attempting to
-            // build a reproducer)
-
-            assert_eq!(array.len(), 1); // guarantee of ScalarValue
-            if let Ok(scalar_values) =
-                ScalarValue::convert_array_to_scalar_vec(array.as_ref())
-            {
-                assert_eq!(scalar_values.len(), 1);
-                let list = scalar_values
-                    .into_iter()
-                    .flatten()
-                    .map(Expr::Literal)
-                    .collect();
-
-                return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {
-                    expr: Box::new(std::mem::take(needle)),
-                    list,
-                    negated: false,
-                })));
+        match haystack {
+            Expr::Literal(scalar, _) if scalar.is_null() => {
+                return Ok(ExprSimplifyResult::Simplified(Expr::Literal(
+                    ScalarValue::Boolean(None),
+                    None,
+                )));
             }
-        } else if let Expr::ScalarFunction(ScalarFunction { func, args }) = haystack {
-            // make_array has a static set of arguments, so we can pull the arguments out from it
-            if func == &make_array_udf() {
-                return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {
-                    expr: Box::new(std::mem::take(needle)),
-                    list: std::mem::take(args),
-                    negated: false,
-                })));
+            Expr::Literal(
+                // FixedSizeList gets coerced to List
+                scalar @ ScalarValue::List(_) | scalar @ ScalarValue::LargeList(_),
+                _,
+            ) => {
+                if let Ok(scalar_values) =
+                    ScalarValue::convert_array_to_scalar_vec(&scalar.to_array()?)
+                {
+                    assert_eq!(scalar_values.len(), 1);
+                    let list = scalar_values
+                        .into_iter()
+                        .flatten()
+                        .flatten()
+                        .map(|v| Expr::Literal(v, None))
+                        .collect();
+
+                    return Ok(ExprSimplifyResult::Simplified(in_list(
+                        std::mem::take(needle),
+                        list,
+                        false,
+                    )));
+                }
             }
-        }
-
+            Expr::ScalarFunction(ScalarFunction { func, args })
+                if func == &make_array_udf() =>
+            {
+                // make_array has a static set of arguments, so we can pull the arguments out from it
+                return Ok(ExprSimplifyResult::Simplified(in_list(
+                    std::mem::take(needle),
+                    std::mem::take(args),
+                    false,
+                )));
+            }
+            _ => {}
+        };
         Ok(ExprSimplifyResult::Original(args))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?;
+        if first_arg.data_type().is_null() {
+            // Always return null if the first argument is null
+            // i.e. array_has(null, element) -> null
+            return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+        }
+
         match &second_arg {
             ColumnarValue::Array(array_needle) => {
                 // the needle is already an array, convert the haystack to an array of the same length
@@ -218,34 +233,105 @@ fn array_has_inner_for_scalar(
     haystack: &ArrayRef,
     needle: &dyn Datum,
 ) -> Result<ArrayRef> {
-    match haystack.data_type() {
-        DataType::List(_) => array_has_dispatch_for_scalar::<i32>(haystack, needle),
-        DataType::LargeList(_) => array_has_dispatch_for_scalar::<i64>(haystack, needle),
-        _ => exec_err!(
-            "array_has does not support type '{:?}'.",
-            haystack.data_type()
-        ),
-    }
+    let haystack = haystack.as_ref().try_into()?;
+    array_has_dispatch_for_scalar(haystack, needle)
 }
 
 fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> Result<ArrayRef> {
-    match haystack.data_type() {
-        DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, needle),
-        DataType::LargeList(_) => array_has_dispatch_for_array::<i64>(haystack, needle),
-        _ => exec_err!(
-            "array_has does not support type '{:?}'.",
-            haystack.data_type()
-        ),
+    let haystack = haystack.as_ref().try_into()?;
+    array_has_dispatch_for_array(haystack, needle)
+}
+
+#[derive(Copy, Clone)]
+enum ArrayWrapper<'a> {
+    FixedSizeList(&'a arrow::array::FixedSizeListArray),
+    List(&'a arrow::array::GenericListArray<i32>),
+    LargeList(&'a arrow::array::GenericListArray<i64>),
+}
+
+impl<'a> TryFrom<&'a dyn Array> for ArrayWrapper<'a> {
+    type Error = DataFusionError;
+
+    fn try_from(
+        value: &'a dyn Array,
+    ) -> std::result::Result<ArrayWrapper<'a>, Self::Error> {
+        match value.data_type() {
+            DataType::List(_) => {
+                Ok(ArrayWrapper::List(as_generic_list_array::<i32>(value)?))
+            }
+            DataType::LargeList(_) => Ok(ArrayWrapper::LargeList(
+                as_generic_list_array::<i64>(value)?,
+            )),
+            DataType::FixedSizeList(_, _) => Ok(ArrayWrapper::FixedSizeList(
+                as_fixed_size_list_array(value)?,
+            )),
+            _ => exec_err!("array_has does not support type '{}'.", value.data_type()),
+        }
     }
 }
 
-fn array_has_dispatch_for_array<O: OffsetSizeTrait>(
-    haystack: &ArrayRef,
+impl<'a> ArrayWrapper<'a> {
+    fn len(&self) -> usize {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => arr.len(),
+            ArrayWrapper::List(arr) => arr.len(),
+            ArrayWrapper::LargeList(arr) => arr.len(),
+        }
+    }
+
+    fn iter(&self) -> Box<dyn Iterator<Item = Option<ArrayRef>> + 'a> {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => Box::new(arr.iter()),
+            ArrayWrapper::List(arr) => Box::new(arr.iter()),
+            ArrayWrapper::LargeList(arr) => Box::new(arr.iter()),
+        }
+    }
+
+    fn values(&self) -> &ArrayRef {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => arr.values(),
+            ArrayWrapper::List(arr) => arr.values(),
+            ArrayWrapper::LargeList(arr) => arr.values(),
+        }
+    }
+
+    fn value_type(&self) -> DataType {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => arr.value_type(),
+            ArrayWrapper::List(arr) => arr.value_type(),
+            ArrayWrapper::LargeList(arr) => arr.value_type(),
+        }
+    }
+
+    fn offsets(&self) -> Box<dyn Iterator<Item = usize> + 'a> {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => {
+                let value_length = arr.value_length() as usize;
+                Box::new((0..=arr.len()).map(move |i| i * value_length))
+            }
+            ArrayWrapper::List(arr) => {
+                Box::new(arr.offsets().iter().map(|o| (*o) as usize))
+            }
+            ArrayWrapper::LargeList(arr) => {
+                Box::new(arr.offsets().iter().map(|o| (*o) as usize))
+            }
+        }
+    }
+
+    fn nulls(&self) -> Option<&arrow::buffer::NullBuffer> {
+        match self {
+            ArrayWrapper::FixedSizeList(arr) => arr.nulls(),
+            ArrayWrapper::List(arr) => arr.nulls(),
+            ArrayWrapper::LargeList(arr) => arr.nulls(),
+        }
+    }
+}
+
+fn array_has_dispatch_for_array<'a>(
+    haystack: ArrayWrapper<'a>,
     needle: &ArrayRef,
 ) -> Result<ArrayRef> {
-    let haystack = as_generic_list_array::<O>(haystack)?;
     let mut boolean_builder = BooleanArray::builder(haystack.len());
-
     for (i, arr) in haystack.iter().enumerate() {
         if arr.is_none() || needle.is_null(i) {
             boolean_builder.append_null();
@@ -261,66 +347,440 @@ fn array_has_dispatch_for_array<O: OffsetSizeTrait>(
     Ok(Arc::new(boolean_builder.finish()))
 }
 
-fn array_has_dispatch_for_scalar<O: OffsetSizeTrait>(
-    haystack: &ArrayRef,
+fn array_has_dispatch_for_scalar(
+    haystack: ArrayWrapper<'_>,
     needle: &dyn Datum,
 ) -> Result<ArrayRef> {
-    let haystack = as_generic_list_array::<O>(haystack)?;
-    let values = haystack.values();
-    let is_nested = values.data_type().is_nested();
-    let offsets = haystack.value_offsets();
     // If first argument is empty list (second argument is non-null), return false
     // i.e. array_has([], non-null element) -> false
-    if values.is_empty() {
+    if haystack.len() == 0 {
         return Ok(Arc::new(BooleanArray::new(
             BooleanBuffer::new_unset(haystack.len()),
             None,
         )));
     }
-    let eq_array = compare_with_eq(values, needle, is_nested)?;
-    let mut final_contained = vec![None; haystack.len()];
-    for (i, offset) in offsets.windows(2).enumerate() {
-        let start = offset[0].to_usize().unwrap();
-        let end = offset[1].to_usize().unwrap();
-        let length = end - start;
-        // For non-nested list, length is 0 for null
-        if length == 0 {
-            continue;
+
+    // For sliced ListArrays, values() returns the full underlying array but
+    // only elements between the first and last offset are visible.
+    let offsets: Vec<usize> = haystack.offsets().collect();
+    let first_offset = offsets[0];
+    let visible_values = haystack
+        .values()
+        .slice(first_offset, offsets[offsets.len() - 1] - first_offset);
+
+    let is_nested = visible_values.data_type().is_nested();
+    let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
+
+    // When a haystack element is null, `eq()` returns null (not false).
+    // In Arrow, a null BooleanArray entry has validity=0 but an
+    // undefined value bit that may happen to be 1. Since set_indices()
+    // operates on the raw value buffer and ignores validity, we AND the
+    // values with the validity bitmap to clear any undefined bits at
+    // null positions. This ensures set_indices() only yields positions
+    // where the comparison genuinely returned true.
+    let eq_bits = match eq_array.nulls() {
+        Some(nulls) => eq_array.values() & nulls.inner(),
+        None => eq_array.values().clone(),
+    };
+
+    let validity = match &haystack {
+        ArrayWrapper::FixedSizeList(arr) => arr.nulls(),
+        ArrayWrapper::List(arr) => arr.nulls(),
+        ArrayWrapper::LargeList(arr) => arr.nulls(),
+    };
+    let mut matches = eq_bits.set_indices().peekable();
+    let mut result = BooleanBufferBuilder::new(haystack.len());
+    result.append_n(haystack.len(), false);
+
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
+    for (i, window) in offsets.windows(2).enumerate() {
+        let end = window[1] - first_offset;
+
+        let has_match = matches.peek().is_some_and(|&p| p < end);
+
+        // Advance past all match positions in this row's range.
+        while matches.peek().is_some_and(|&p| p < end) {
+            matches.next();
+        }
+
+        if has_match && validity.is_none_or(|v| v.is_valid(i)) {
+            result.set_bit(i, true);
         }
-        let sliced_array = eq_array.slice(start, length);
-        final_contained[i] = Some(sliced_array.true_count() > 0);
     }
 
-    Ok(Arc::new(BooleanArray::from(final_contained)))
+    // A null haystack row always produces a null output, so we can
+    // reuse the haystack's null buffer directly.
+    Ok(Arc::new(BooleanArray::new(
+        result.finish(),
+        validity.cloned(),
+    )))
 }
 
 fn array_has_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        DataType::List(_) => {
-            array_has_all_and_any_dispatch::<i32>(&args[0], &args[1], ComparisonType::All)
+    array_has_all_and_any_inner(args, ComparisonType::All)
+}
+
+/// Number of rows to process at a time when doing batched row conversion.  This
+/// amortizes the row conversion overhead over more rows, but making this too
+/// large can cause cache pressure for large arrays. See
+/// <https://github.com/apache/datafusion/pull/20588> for context.
+const ROW_CONVERSION_CHUNK_SIZE: usize = 512;
+
+// General row comparison for array_has_all and array_has_any
+fn general_array_has_for_all_and_any<'a>(
+    haystack: ArrayWrapper<'a>,
+    needle: ArrayWrapper<'a>,
+    comparison_type: ComparisonType,
+) -> Result<ArrayRef> {
+    let num_rows = haystack.len();
+    let converter = RowConverter::new(vec![SortField::new(haystack.value_type())])?;
+
+    let h_offsets: Vec<usize> = haystack.offsets().collect();
+    let n_offsets: Vec<usize> = needle.offsets().collect();
+
+    let h_nulls = haystack.nulls();
+    let n_nulls = needle.nulls();
+    let mut builder = BooleanArray::builder(num_rows);
+
+    for chunk_start in (0..num_rows).step_by(ROW_CONVERSION_CHUNK_SIZE) {
+        let chunk_end = (chunk_start + ROW_CONVERSION_CHUNK_SIZE).min(num_rows);
+
+        // For efficiency with sliced arrays, only process the visible elements,
+        // not the entire underlying buffer.
+        let h_elem_start = h_offsets[chunk_start];
+        let h_elem_end = h_offsets[chunk_end];
+        let n_elem_start = n_offsets[chunk_start];
+        let n_elem_end = n_offsets[chunk_end];
+
+        let h_vals = haystack
+            .values()
+            .slice(h_elem_start, h_elem_end - h_elem_start);
+        let n_vals = needle
+            .values()
+            .slice(n_elem_start, n_elem_end - n_elem_start);
+
+        let chunk_h_rows = converter.convert_columns(&[h_vals])?;
+        let chunk_n_rows = converter.convert_columns(&[n_vals])?;
+
+        for i in chunk_start..chunk_end {
+            if h_nulls.is_some_and(|n| n.is_null(i))
+                || n_nulls.is_some_and(|n| n.is_null(i))
+            {
+                builder.append_null();
+                continue;
+            }
+            builder.append_value(general_array_has_all_and_any_kernel(
+                &chunk_h_rows,
+                (h_offsets[i] - h_elem_start)..(h_offsets[i + 1] - h_elem_start),
+                &chunk_n_rows,
+                (n_offsets[i] - n_elem_start)..(n_offsets[i + 1] - n_elem_start),
+                comparison_type,
+            ));
         }
-        DataType::LargeList(_) => {
-            array_has_all_and_any_dispatch::<i64>(&args[0], &args[1], ComparisonType::All)
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
+
+// String comparison for array_has_all and array_has_any
+fn array_has_all_and_any_string_internal<'a>(
+    haystack: ArrayWrapper<'a>,
+    needle: ArrayWrapper<'a>,
+    comparison_type: ComparisonType,
+) -> Result<ArrayRef> {
+    let num_rows = haystack.len();
+
+    let h_offsets: Vec<usize> = haystack.offsets().collect();
+    let n_offsets: Vec<usize> = needle.offsets().collect();
+
+    let h_nulls = haystack.nulls();
+    let n_nulls = needle.nulls();
+    let mut builder = BooleanArray::builder(num_rows);
+
+    for chunk_start in (0..num_rows).step_by(ROW_CONVERSION_CHUNK_SIZE) {
+        let chunk_end = (chunk_start + ROW_CONVERSION_CHUNK_SIZE).min(num_rows);
+
+        let h_elem_start = h_offsets[chunk_start];
+        let h_elem_end = h_offsets[chunk_end];
+        let n_elem_start = n_offsets[chunk_start];
+        let n_elem_end = n_offsets[chunk_end];
+
+        let h_vals = haystack
+            .values()
+            .slice(h_elem_start, h_elem_end - h_elem_start);
+        let n_vals = needle
+            .values()
+            .slice(n_elem_start, n_elem_end - n_elem_start);
+
+        let chunk_h_strings = string_array_to_vec(h_vals.as_ref());
+        let chunk_n_strings = string_array_to_vec(n_vals.as_ref());
+
+        for i in chunk_start..chunk_end {
+            if h_nulls.is_some_and(|n| n.is_null(i))
+                || n_nulls.is_some_and(|n| n.is_null(i))
+            {
+                builder.append_null();
+                continue;
+            }
+            let h_start = h_offsets[i] - h_elem_start;
+            let h_end = h_offsets[i + 1] - h_elem_start;
+            let n_start = n_offsets[i] - n_elem_start;
+            let n_end = n_offsets[i + 1] - n_elem_start;
+            builder.append_value(array_has_string_kernel(
+                &chunk_h_strings[h_start..h_end],
+                &chunk_n_strings[n_start..n_end],
+                comparison_type,
+            ));
         }
-        _ => exec_err!(
-            "array_has does not support type '{:?}'.",
-            args[0].data_type()
-        ),
     }
+
+    Ok(Arc::new(builder.finish()))
 }
 
-fn array_has_any_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        DataType::List(_) => {
-            array_has_all_and_any_dispatch::<i32>(&args[0], &args[1], ComparisonType::Any)
+fn array_has_all_and_any_dispatch<'a>(
+    haystack: ArrayWrapper<'a>,
+    needle: ArrayWrapper<'a>,
+    comparison_type: ComparisonType,
+) -> Result<ArrayRef> {
+    if needle.values().is_empty() {
+        let buffer = match comparison_type {
+            ComparisonType::All => BooleanBuffer::new_set(haystack.len()),
+            ComparisonType::Any => BooleanBuffer::new_unset(haystack.len()),
+        };
+        Ok(Arc::new(BooleanArray::from(buffer)))
+    } else {
+        match needle.value_type() {
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+                array_has_all_and_any_string_internal(haystack, needle, comparison_type)
+            }
+            _ => general_array_has_for_all_and_any(haystack, needle, comparison_type),
         }
-        DataType::LargeList(_) => {
-            array_has_all_and_any_dispatch::<i64>(&args[0], &args[1], ComparisonType::Any)
+    }
+}
+
+fn array_has_all_and_any_inner(
+    args: &[ArrayRef],
+    comparison_type: ComparisonType,
+) -> Result<ArrayRef> {
+    let haystack: ArrayWrapper = args[0].as_ref().try_into()?;
+    let needle: ArrayWrapper = args[1].as_ref().try_into()?;
+    array_has_all_and_any_dispatch(haystack, needle, comparison_type)
+}
+
+fn array_has_any_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    array_has_all_and_any_inner(args, ComparisonType::Any)
+}
+
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let offsets: Vec<usize> = scalar_list.offsets().collect();
+    let scalar_values = scalar_list
+        .values()
+        .slice(offsets[0], offsets[1] - offsets[0]);
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))));
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, &scalar_values)
         }
-        _ => exec_err!(
-            "array_has does not support type '{:?}'.",
-            args[0].data_type()
+        _ => array_has_any_with_scalar_general(columnar_arg, &scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let (col_arr, is_scalar_output) = match columnar_arg {
+        ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+        ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+    };
+
+    let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+    let col_values = col_list.values();
+    let col_offsets: Vec<usize> = col_list.offsets().collect();
+    let col_nulls = col_list.nulls();
+
+    let scalar_lookup = ScalarStringLookup::new(scalar_values);
+    let has_null_scalar = scalar_values.null_count() > 0;
+
+    let result = match col_values.data_type() {
+        DataType::Utf8 => array_has_any_string_inner(
+            col_values.as_string::<i32>(),
+            &col_offsets,
+            col_nulls,
+            has_null_scalar,
+            &scalar_lookup,
+        ),
+        DataType::LargeUtf8 => array_has_any_string_inner(
+            col_values.as_string::<i64>(),
+            &col_offsets,
+            col_nulls,
+            has_null_scalar,
+            &scalar_lookup,
+        ),
+        DataType::Utf8View => array_has_any_string_inner(
+            col_values.as_string_view(),
+            &col_offsets,
+            col_nulls,
+            has_null_scalar,
+            &scalar_lookup,
         ),
+        _ => unreachable!("array_has_any_with_scalar_string called with non-string type"),
+    };
+
+    if is_scalar_output {
+        Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?))
+    } else {
+        Ok(ColumnarValue::Array(result))
+    }
+}
+
+/// Pre-computed lookup structure for the scalar string fastpath.
+enum ScalarStringLookup<'a> {
+    /// Large scalar: HashSet for O(1) lookups.
+    Set(HashSet<&'a str>),
+    /// Small scalar: Vec for linear scan.
+    List(Vec<Option<&'a str>>),
+}
+
+impl<'a> ScalarStringLookup<'a> {
+    fn new(scalar_values: &'a ArrayRef) -> Self {
+        let strings = string_array_to_vec(scalar_values.as_ref());
+        if strings.len() > SCALAR_SMALL_THRESHOLD {
+            ScalarStringLookup::Set(strings.into_iter().flatten().collect())
+        } else {
+            ScalarStringLookup::List(strings)
+        }
+    }
+
+    fn contains(&self, value: &str) -> bool {
+        match self {
+            ScalarStringLookup::Set(set) => set.contains(value),
+            ScalarStringLookup::List(list) => list.contains(&Some(value)),
+        }
+    }
+}
+
+/// Inner implementation of the string scalar fast path, generic over string
+/// array type to allow direct element access by index.
+fn array_has_any_string_inner<'a, C: StringArrayType<'a> + Copy>(
+    col_strings: C,
+    col_offsets: &[usize],
+    col_nulls: Option<&arrow::buffer::NullBuffer>,
+    has_null_scalar: bool,
+    scalar_lookup: &ScalarStringLookup<'_>,
+) -> ArrayRef {
+    let num_rows = col_offsets.len() - 1;
+    let mut builder = BooleanArray::builder(num_rows);
+
+    for i in 0..num_rows {
+        if col_nulls.is_some_and(|v| v.is_null(i)) {
+            builder.append_null();
+            continue;
+        }
+        let start = col_offsets[i];
+        let end = col_offsets[i + 1];
+        let found = (start..end).any(|j| {
+            if col_strings.is_null(j) {
+                has_null_scalar
+            } else {
+                scalar_lookup.contains(col_strings.value(j))
+            }
+        });
+        builder.append_value(found);
+    }
+
+    Arc::new(builder.finish())
+}
+
+/// General scalar fast path for `array_has_any`, using RowConverter for
+/// type-erased comparison.
+fn array_has_any_with_scalar_general(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let converter =
+        RowConverter::new(vec![SortField::new(scalar_values.data_type().clone())])?;
+    let scalar_rows = converter.convert_columns(&[Arc::clone(scalar_values)])?;
+
+    let (col_arr, is_scalar_output) = match columnar_arg {
+        ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+        ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+    };
+
+    let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+    let col_rows = converter.convert_columns(&[Arc::clone(col_list.values())])?;
+    let col_offsets: Vec<usize> = col_list.offsets().collect();
+    let col_nulls = col_list.nulls();
+
+    let mut builder = BooleanArray::builder(col_list.len());
+    let num_scalar = scalar_rows.num_rows();
+
+    if num_scalar > SCALAR_SMALL_THRESHOLD {
+        // Large scalar: build HashSet for O(1) lookups
+        let scalar_set: HashSet<Box<[u8]>> = (0..num_scalar)
+            .map(|i| Box::from(scalar_rows.row(i).as_ref()))
+            .collect();
+
+        for i in 0..col_list.len() {
+            if col_nulls.is_some_and(|v| v.is_null(i)) {
+                builder.append_null();
+                continue;
+            }
+            let start = col_offsets[i];
+            let end = col_offsets[i + 1];
+            let found =
+                (start..end).any(|j| scalar_set.contains(col_rows.row(j).as_ref()));
+            builder.append_value(found);
+        }
+    } else {
+        // Small scalar: linear scan avoids HashSet hashing overhead
+        for i in 0..col_list.len() {
+            if col_nulls.is_some_and(|v| v.is_null(i)) {
+                builder.append_null();
+                continue;
+            }
+            let start = col_offsets[i];
+            let end = col_offsets[i + 1];
+            let found = (start..end)
+                .any(|j| (0..num_scalar).any(|k| col_rows.row(j) == scalar_rows.row(k)));
+            builder.append_value(found);
+        }
+    }
+
+    let result: ArrayRef = Arc::new(builder.finish());
+
+    if is_scalar_output {
+        Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?))
+    } else {
+        Ok(ColumnarValue::Array(result))
     }
 }
 
@@ -345,7 +805,7 @@ fn array_has_any_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayHasAll {
     signature: Signature,
     aliases: Vec<String>,
@@ -360,7 +820,7 @@ impl Default for ArrayHasAll {
 impl ArrayHasAll {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(2, Volatility::Immutable),
+            signature: Signature::arrays(2, None, Volatility::Immutable),
             aliases: vec![String::from("list_has_all")],
         }
     }
@@ -382,10 +842,7 @@ impl ScalarUDFImpl for ArrayHasAll {
         Ok(DataType::Boolean)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_has_all_inner)(&args.args)
     }
 
@@ -400,8 +857,8 @@ impl ScalarUDFImpl for ArrayHasAll {
 
 #[user_doc(
     doc_section(label = "Array Functions"),
-    description = "Returns true if any elements exist in both arrays.",
-    syntax_example = "array_has_any(array, sub-array)",
+    description = "Returns true if the arrays have any elements in common.",
+    syntax_example = "array_has_any(array1, array2)",
     sql_example = r#"```sql
 > select array_has_any([1, 2, 3], [3, 4]);
 +------------------------------------------+
@@ -411,15 +868,15 @@ impl ScalarUDFImpl for ArrayHasAll {
 +------------------------------------------+
 ```"#,
     argument(
-        name = "array",
+        name = "array1",
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     ),
     argument(
-        name = "sub-array",
+        name = "array2",
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayHasAny {
     signature: Signature,
     aliases: Vec<String>,
@@ -434,7 +891,7 @@ impl Default for ArrayHasAny {
 impl ArrayHasAny {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(2, Volatility::Immutable),
+            signature: Signature::arrays(2, None, Volatility::Immutable),
             aliases: vec![String::from("list_has_any"), String::from("arrays_overlap")],
         }
     }
@@ -456,11 +913,16 @@ impl ScalarUDFImpl for ArrayHasAny {
         Ok(DataType::Boolean)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        make_scalar_function(array_has_any_inner)(&args.args)
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?;
+
+        // If either argument is scalar, use the fast path.
+        match (&first_arg, &second_arg) {
+            (cv, ColumnarValue::Scalar(scalar)) | (ColumnarValue::Scalar(scalar), cv) => {
+                array_has_any_with_scalar(cv, scalar)
+            }
+            _ => make_scalar_function(array_has_any_inner)(&args.args),
+        }
     }
 
     fn aliases(&self) -> &[String] {
@@ -481,58 +943,9 @@ enum ComparisonType {
     Any,
 }
 
-fn array_has_all_and_any_dispatch<O: OffsetSizeTrait>(
-    haystack: &ArrayRef,
-    needle: &ArrayRef,
-    comparison_type: ComparisonType,
-) -> Result<ArrayRef> {
-    let haystack = as_generic_list_array::<O>(haystack)?;
-    let needle = as_generic_list_array::<O>(needle)?;
-    if needle.values().is_empty() {
-        let buffer = match comparison_type {
-            ComparisonType::All => BooleanBuffer::new_set(haystack.len()),
-            ComparisonType::Any => BooleanBuffer::new_unset(haystack.len()),
-        };
-        return Ok(Arc::new(BooleanArray::from(buffer)));
-    }
-    match needle.data_type() {
-        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
-            array_has_all_and_any_string_internal::<O>(haystack, needle, comparison_type)
-        }
-        _ => general_array_has_for_all_and_any::<O>(haystack, needle, comparison_type),
-    }
-}
-
-// String comparison for array_has_all and array_has_any
-fn array_has_all_and_any_string_internal<O: OffsetSizeTrait>(
-    array: &GenericListArray<O>,
-    needle: &GenericListArray<O>,
-    comparison_type: ComparisonType,
-) -> Result<ArrayRef> {
-    let mut boolean_builder = BooleanArray::builder(array.len());
-    for (arr, sub_arr) in array.iter().zip(needle.iter()) {
-        match (arr, sub_arr) {
-            (Some(arr), Some(sub_arr)) => {
-                let haystack_array = string_array_to_vec(&arr);
-                let needle_array = string_array_to_vec(&sub_arr);
-                boolean_builder.append_value(array_has_string_kernel(
-                    haystack_array,
-                    needle_array,
-                    comparison_type,
-                ));
-            }
-            (_, _) => {
-                boolean_builder.append_null();
-            }
-        }
-    }
-
-    Ok(Arc::new(boolean_builder.finish()))
-}
-
 fn array_has_string_kernel(
-    haystack: Vec<Option<&str>>,
-    needle: Vec<Option<&str>>,
+    haystack: &[Option<&str>],
+    needle: &[Option<&str>],
     comparison_type: ComparisonType,
 ) -> bool {
     match comparison_type {
@@ -547,63 +960,53 @@ fn array_has_string_kernel(
     }
 }
 
-// General row comparison for array_has_all and array_has_any
-fn general_array_has_for_all_and_any<O: OffsetSizeTrait>(
-    haystack: &GenericListArray<O>,
-    needle: &GenericListArray<O>,
-    comparison_type: ComparisonType,
-) -> Result<ArrayRef> {
-    let mut boolean_builder = BooleanArray::builder(haystack.len());
-    let converter = RowConverter::new(vec![SortField::new(haystack.value_type())])?;
-
-    for (arr, sub_arr) in haystack.iter().zip(needle.iter()) {
-        if let (Some(arr), Some(sub_arr)) = (arr, sub_arr) {
-            let arr_values = converter.convert_columns(&[arr])?;
-            let sub_arr_values = converter.convert_columns(&[sub_arr])?;
-            boolean_builder.append_value(general_array_has_all_and_any_kernel(
-                arr_values,
-                sub_arr_values,
-                comparison_type,
-            ));
-        } else {
-            boolean_builder.append_null();
-        }
-    }
-
-    Ok(Arc::new(boolean_builder.finish()))
-}
-
 fn general_array_has_all_and_any_kernel(
-    haystack_rows: Rows,
-    needle_rows: Rows,
+    haystack_rows: &Rows,
+    h_range: Range<usize>,
+    needle_rows: &Rows,
+    mut n_range: Range<usize>,
     comparison_type: ComparisonType,
 ) -> bool {
+    let h_start = h_range.start;
+    let h_end = h_range.end;
+
     match comparison_type {
-        ComparisonType::All => needle_rows.iter().all(|needle_row| {
-            haystack_rows
-                .iter()
-                .any(|haystack_row| haystack_row == needle_row)
+        ComparisonType::All => n_range.all(|ni| {
+            let needle_row = needle_rows.row(ni);
+            (h_start..h_end).any(|hi| haystack_rows.row(hi) == needle_row)
         }),
-        ComparisonType::Any => needle_rows.iter().any(|needle_row| {
-            haystack_rows
-                .iter()
-                .any(|haystack_row| haystack_row == needle_row)
+        ComparisonType::Any => n_range.any(|ni| {
+            let needle_row = needle_rows.row(ni);
+            (h_start..h_end).any(|hi| haystack_rows.row(hi) == needle_row)
         }),
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::create_array;
-    use datafusion_common::utils::SingleRowListArrayBuilder;
+    use std::sync::Arc;
+
+    use arrow::datatypes::Int32Type;
+    use arrow::{
+        array::{
+            Array, ArrayRef, AsArray, FixedSizeListArray, Int32Array, ListArray,
+            create_array,
+        },
+        buffer::OffsetBuffer,
+        datatypes::{DataType, Field},
+    };
+    use datafusion_common::{
+        DataFusionError, ScalarValue, config::ConfigOptions,
+        utils::SingleRowListArrayBuilder,
+    };
     use datafusion_expr::{
-        col, execution_props::ExecutionProps, lit, simplify::ExprSimplifyResult, Expr,
-        ScalarUDFImpl,
+        ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDFImpl, col, lit,
+        simplify::ExprSimplifyResult,
     };
 
     use crate::expr_fn::make_array;
 
-    use super::ArrayHas;
+    use super::{ArrayHas, ArrayHasAll, ArrayHasAny};
 
     #[test]
     fn test_simplify_array_has_to_in_list() {
@@ -614,8 +1017,7 @@ mod tests {
         .build_list_scalar());
         let needle = col("c");
 
-        let props = ExecutionProps::new();
-        let context = datafusion_expr::simplify::SimplifyContext::new(&props);
+        let context = datafusion_expr::simplify::SimplifyContext::default();
 
         let Ok(ExprSimplifyResult::Simplified(Expr::InList(in_list))) =
             ArrayHas::new().simplify(vec![haystack, needle.clone()], &context)
@@ -638,8 +1040,7 @@ mod tests {
         let haystack = make_array(vec![lit(1), lit(2), lit(3)]);
         let needle = col("c");
 
-        let props = ExecutionProps::new();
-        let context = datafusion_expr::simplify::SimplifyContext::new(&props);
+        let context = datafusion_expr::simplify::SimplifyContext::default();
 
         let Ok(ExprSimplifyResult::Simplified(Expr::InList(in_list))) =
             ArrayHas::new().simplify(vec![haystack, needle.clone()], &context)
@@ -657,13 +1058,44 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_simplify_array_has_with_null_to_null() {
+        let haystack = Expr::Literal(ScalarValue::Null, None);
+        let needle = col("c");
+
+        let context = datafusion_expr::simplify::SimplifyContext::default();
+        let Ok(ExprSimplifyResult::Simplified(simplified)) =
+            ArrayHas::new().simplify(vec![haystack, needle], &context)
+        else {
+            panic!("Expected simplified expression");
+        };
+
+        assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None), None));
+    }
+
+    #[test]
+    fn test_simplify_array_has_with_null_list_to_null() {
+        let haystack =
+            ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0], _>([None]);
+        let haystack = Expr::Literal(ScalarValue::List(Arc::new(haystack)), None);
+        let needle = col("c");
+
+        let context = datafusion_expr::simplify::SimplifyContext::default();
+        let Ok(ExprSimplifyResult::Simplified(simplified)) =
+            ArrayHas::new().simplify(vec![haystack, needle], &context)
+        else {
+            panic!("Expected simplified expression");
+        };
+
+        assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None), None));
+    }
+
     #[test]
     fn test_array_has_complex_list_not_simplified() {
         let haystack = col("c1");
         let needle = col("c2");
 
-        let props = ExecutionProps::new();
-        let context = datafusion_expr::simplify::SimplifyContext::new(&props);
+        let context = datafusion_expr::simplify::SimplifyContext::default();
 
         let Ok(ExprSimplifyResult::Original(args)) =
             ArrayHas::new().simplify(vec![haystack, needle.clone()], &context)
@@ -673,4 +1105,223 @@ mod tests {
 
         assert_eq!(args, vec![col("c1"), col("c2")],);
     }
+
+    #[test]
+    fn test_array_has_list_empty_child() -> Result<(), DataFusionError> {
+        let haystack_field = Arc::new(Field::new_list(
+            "haystack",
+            Field::new_list("", Field::new("", DataType::Int32, true), true),
+            true,
+        ));
+
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
+        let haystack = ListArray::new(
+            Field::new_list_field(DataType::Int32, true).into(),
+            OffsetBuffer::new(vec![0, 0].into()),
+            Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef,
+            Some(vec![false].into()),
+        );
+
+        let haystack = ColumnarValue::Array(Arc::new(haystack));
+        let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
+        let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![haystack, needle],
+            arg_fields: vec![haystack_field, needle_field],
+            number_rows: 1,
+            return_field,
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(1)?;
+        let output = output.as_boolean();
+        assert_eq!(output.len(), 1);
+        assert!(output.is_null(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
+        // [[10, 20], [30, 40], [50, 60], [70, 80]]  →  slice(1,2)  →  [[30, 40], [50, 60]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20)]),
+            Some(vec![Some(30), Some(40)]),
+            Some(vec![Some(50), Some(60)]),
+            Some(vec![Some(70), Some(80)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
+
+        // Search for elements that exist only in sliced-away rows:
+        // 10 is in the prefix row, 70 is in the suffix row.
+        let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
+            ArrayHas::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        let output = invoke(10)?.as_boolean().clone();
+        assert!(!output.value(0));
+        assert!(!output.value(1));
+
+        let output = invoke(70)?.as_boolean().clone();
+        assert!(!output.value(0));
+        assert!(!output.value(1));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
+        let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
+        let haystack =
+            ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0], _>([
+                None, None, None,
+            ]);
+
+        let haystack = ColumnarValue::Array(Arc::new(haystack));
+        let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
+        let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![haystack, needle],
+            arg_fields: vec![haystack_field, needle_field],
+            number_rows: 1,
+            return_field,
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(1)?;
+        let output = output.as_boolean();
+        assert_eq!(output.len(), 3);
+        for i in 0..3 {
+            assert!(output.is_null(i));
+        }
+
+        Ok(())
+    }
+
+    /// Invoke a two-argument list UDF with the given arrays and assert the
+    /// boolean output matches `expected`.
+    fn invoke_and_assert(
+        udf: &dyn ScalarUDFImpl,
+        haystack: &ArrayRef,
+        needle: ArrayRef,
+        expected: &[Option<bool>],
+    ) {
+        let num_rows = haystack.len();
+        let list_type = haystack.data_type();
+        let result = udf
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(Arc::clone(haystack)),
+                    ColumnarValue::Array(needle),
+                ],
+                arg_fields: vec![
+                    Arc::new(Field::new("haystack", list_type.clone(), false)),
+                    Arc::new(Field::new("needle", list_type.clone(), false)),
+                ],
+                number_rows: num_rows,
+                return_field: Arc::new(Field::new("return", DataType::Boolean, true)),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+        let output = result.into_array(num_rows).unwrap();
+        assert_eq!(output.as_boolean().iter().collect::<Vec<_>>(), expected);
+    }
+
+    #[test]
+    fn test_sliced_list_offsets() {
+        // Full rows:
+        //   row 0: [1, 2]   (not visible after slicing)
+        //   row 1: [11, 12] (visible row 0)
+        //   row 2: [21, 22] (visible row 1)
+        //   row 3: [31, 32] (not visible after slicing)
+        let field: Arc<Field> = Arc::new(Field::new("item", DataType::Int32, false));
+        let full_values = Arc::new(Int32Array::from(vec![1, 2, 11, 12, 21, 22, 31, 32]));
+        let full_offsets = OffsetBuffer::new(vec![0, 2, 4, 6, 8].into());
+        let full = ListArray::new(Arc::clone(&field), full_offsets, full_values, None);
+        let sliced_haystack: ArrayRef = Arc::new(full.slice(1, 2));
+
+        // array_has_all: needle row 0 = [11], row 1 = [21]
+        let needle_all: ArrayRef = Arc::new(ListArray::new(
+            Arc::clone(&field),
+            OffsetBuffer::new(vec![0, 1, 2].into()),
+            Arc::new(Int32Array::from(vec![11, 21])),
+            None,
+        ));
+        invoke_and_assert(
+            &ArrayHasAll::new(),
+            &sliced_haystack,
+            needle_all,
+            &[Some(true), Some(true)],
+        );
+
+        // array_has_any: needle row 0 = [99, 11], row 1 = [99, 21]
+        let needle_any: ArrayRef = Arc::new(ListArray::new(
+            field,
+            OffsetBuffer::new(vec![0, 2, 4].into()),
+            Arc::new(Int32Array::from(vec![99, 11, 99, 21])),
+            None,
+        ));
+        invoke_and_assert(
+            &ArrayHasAny::new(),
+            &sliced_haystack,
+            needle_any,
+            &[Some(true), Some(true)],
+        );
+    }
+
+    #[test]
+    fn test_sliced_fixed_size_list_offsets() {
+        // Same logical data as test_sliced_list_offsets, but using FixedSizeListArray.
+        let field = Arc::new(Field::new("item", DataType::Int32, false));
+        let full_values = Arc::new(Int32Array::from(vec![1, 2, 11, 12, 21, 22, 31, 32]));
+        let full = FixedSizeListArray::new(Arc::clone(&field), 2, full_values, None);
+        let sliced_haystack: ArrayRef = Arc::new(full.slice(1, 2));
+
+        // array_has_all: needle row 0 = [11, 12], row 1 = [21, 22]
+        let needle_all: ArrayRef = Arc::new(FixedSizeListArray::new(
+            Arc::clone(&field),
+            2,
+            Arc::new(Int32Array::from(vec![11, 12, 21, 22])),
+            None,
+        ));
+        invoke_and_assert(
+            &ArrayHasAll::new(),
+            &sliced_haystack,
+            needle_all,
+            &[Some(true), Some(true)],
+        );
+
+        // array_has_any: needle row 0 = [99, 12], row 1 = [99, 22]
+        let needle_any: ArrayRef = Arc::new(FixedSizeListArray::new(
+            field,
+            2,
+            Arc::new(Int32Array::from(vec![99, 12, 99, 22])),
+            None,
+        ));
+        invoke_and_assert(
+            &ArrayHasAny::new(),
+            &sliced_haystack,
+            needle_any,
+            &[Some(true), Some(true)],
+        );
+    }
 }
diff --git a/datafusion/functions-nested/src/arrays_zip.rs b/datafusion/functions-nested/src/arrays_zip.rs
new file mode 100644
index 0000000000000..e70eb364bcc7e
--- /dev/null
+++ b/datafusion/functions-nested/src/arrays_zip.rs
@@ -0,0 +1,336 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ScalarUDFImpl`] definitions for arrays_zip function.
+
+use crate::utils::make_scalar_function;
+use arrow::array::{
+    Array, ArrayRef, Capacities, ListArray, MutableArrayData, StructArray, new_null_array,
+};
+use arrow::buffer::{NullBuffer, OffsetBuffer};
+use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null};
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::cast::{
+    as_fixed_size_list_array, as_large_list_array, as_list_array,
+};
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Type-erased view of a list column (works for both List and LargeList).
+/// Stores the information needed to iterate rows without re-downcasting.
+struct ListColumnView {
+    /// The flat values array backing this list column.
+    values: ArrayRef,
+    /// Pre-computed per-row start offsets (length = num_rows + 1).
+    offsets: Vec<usize>,
+    /// Pre-computed null bitmap: true means the row is null.
+    is_null: Vec<bool>,
+}
+
+make_udf_expr_and_func!(
+    ArraysZip,
+    arrays_zip,
+    "combines one or multiple arrays into a single array of structs.",
+    arrays_zip_udf
+);
+
+#[user_doc(
+    doc_section(label = "Array Functions"),
+    description = "Returns an array of structs created by combining the elements of each input array at the same index. If the arrays have different lengths, shorter arrays are padded with NULLs.",
+    syntax_example = "arrays_zip(array1[, ..., array_n])",
+    sql_example = r#"```sql
+> select arrays_zip([1, 2, 3]);
++---------------------------------------------------+
+| arrays_zip([1, 2, 3])                             |
++---------------------------------------------------+
+| [{1: 1}, {1: 2}, {1: 3}]                          |
++---------------------------------------------------+
+> select arrays_zip([1, 2], [3, 4, 5]);
++---------------------------------------------------+
+| arrays_zip([1, 2], [3, 4, 5])                     |
++---------------------------------------------------+
+| [{1: 1, 2: 3}, {1: 2, 2: 4}, {1: NULL, 2: 5}]     |
++---------------------------------------------------+
+```"#,
+    argument(name = "array1", description = "First array expression."),
+    argument(
+        name = "array_n",
+        description = "Optional additional array expressions."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArraysZip {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for ArraysZip {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ArraysZip {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec![String::from("list_zip")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArraysZip {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "arrays_zip"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.is_empty() {
+            return exec_err!("arrays_zip requires at least one argument");
+        }
+
+        let mut fields = Vec::with_capacity(arg_types.len());
+        for (i, arg_type) in arg_types.iter().enumerate() {
+            let element_type = match arg_type {
+                List(field) | LargeList(field) | FixedSizeList(field, _) => {
+                    field.data_type().clone()
+                }
+                Null => Null,
+                dt => {
+                    return exec_err!("arrays_zip expects array arguments, got {dt}");
+                }
+            };
+            fields.push(Field::new(format!("{}", i + 1), element_type, true));
+        }
+
+        Ok(List(Arc::new(Field::new_list_field(
+            DataType::Struct(Fields::from(fields)),
+            true,
+        ))))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(arrays_zip_inner)(&args.args)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Core implementation for arrays_zip.
+///
+/// Takes N list arrays and produces a list of structs where each struct
+/// has one field per input array. If arrays within a row have different
+/// lengths, shorter arrays are padded with NULLs.
+/// Supports List, LargeList, and Null input types.
+fn arrays_zip_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.is_empty() {
+        return exec_err!("arrays_zip requires at least one argument");
+    }
+
+    let num_rows = args[0].len();
+
+    // Build a type-erased ListColumnView for each argument.
+    // None means the argument is Null-typed (all nulls, no backing data).
+    let mut views: Vec<Option<ListColumnView>> = Vec::with_capacity(args.len());
+    let mut element_types: Vec<DataType> = Vec::with_capacity(args.len());
+
+    for (i, arg) in args.iter().enumerate() {
+        match arg.data_type() {
+            List(field) => {
+                let arr = as_list_array(arg)?;
+                let raw_offsets = arr.value_offsets();
+                let offsets: Vec<usize> =
+                    raw_offsets.iter().map(|&o| o as usize).collect();
+                let is_null = (0..num_rows).map(|row| arr.is_null(row)).collect();
+                element_types.push(field.data_type().clone());
+                views.push(Some(ListColumnView {
+                    values: Arc::clone(arr.values()),
+                    offsets,
+                    is_null,
+                }));
+            }
+            LargeList(field) => {
+                let arr = as_large_list_array(arg)?;
+                let raw_offsets = arr.value_offsets();
+                let offsets: Vec<usize> =
+                    raw_offsets.iter().map(|&o| o as usize).collect();
+                let is_null = (0..num_rows).map(|row| arr.is_null(row)).collect();
+                element_types.push(field.data_type().clone());
+                views.push(Some(ListColumnView {
+                    values: Arc::clone(arr.values()),
+                    offsets,
+                    is_null,
+                }));
+            }
+            FixedSizeList(field, size) => {
+                let arr = as_fixed_size_list_array(arg)?;
+                let size = *size as usize;
+                let offsets: Vec<usize> = (0..=num_rows).map(|row| row * size).collect();
+                let is_null = (0..num_rows).map(|row| arr.is_null(row)).collect();
+                element_types.push(field.data_type().clone());
+                views.push(Some(ListColumnView {
+                    values: Arc::clone(arr.values()),
+                    offsets,
+                    is_null,
+                }));
+            }
+            Null => {
+                element_types.push(Null);
+                views.push(None);
+            }
+            dt => {
+                return exec_err!("arrays_zip argument {i} expected list type, got {dt}");
+            }
+        }
+    }
+
+    // Collect per-column values data for MutableArrayData builders.
+    let values_data: Vec<_> = views
+        .iter()
+        .map(|v| v.as_ref().map(|view| view.values.to_data()))
+        .collect();
+
+    let struct_fields: Fields = element_types
+        .iter()
+        .enumerate()
+        .map(|(i, dt)| Field::new(format!("{}", i + 1), dt.clone(), true))
+        .collect::<Vec<_>>()
+        .into();
+
+    // Create a MutableArrayData builder per column. For None (Null-typed)
+    // args we only need extend_nulls, so we track them separately.
+    let mut builders: Vec<Option<MutableArrayData>> = values_data
+        .iter()
+        .map(|vd| {
+            vd.as_ref().map(|data| {
+                MutableArrayData::with_capacities(vec![data], true, Capacities::Array(0))
+            })
+        })
+        .collect();
+
+    let mut offsets: Vec<i32> = Vec::with_capacity(num_rows + 1);
+    offsets.push(0);
+    let mut null_mask: Vec<bool> = Vec::with_capacity(num_rows);
+    let mut total_values: usize = 0;
+
+    // Process each row: compute per-array lengths, then copy values
+    // and pad shorter arrays with NULLs.
+    for row_idx in 0..num_rows {
+        let mut max_len: usize = 0;
+        let mut all_null = true;
+
+        for view in views.iter().flatten() {
+            if !view.is_null[row_idx] {
+                all_null = false;
+                let len = view.offsets[row_idx + 1] - view.offsets[row_idx];
+                max_len = max_len.max(len);
+            }
+        }
+
+        if all_null {
+            null_mask.push(true);
+            offsets.push(*offsets.last().unwrap());
+            continue;
+        }
+        null_mask.push(false);
+
+        // Extend each column builder for this row.
+        for (col_idx, view) in views.iter().enumerate() {
+            match view {
+                Some(v) if !v.is_null[row_idx] => {
+                    let start = v.offsets[row_idx];
+                    let end = v.offsets[row_idx + 1];
+                    let len = end - start;
+                    let builder = builders[col_idx].as_mut().unwrap();
+                    builder.extend(0, start, end);
+                    if len < max_len {
+                        builder.extend_nulls(max_len - len);
+                    }
+                }
+                _ => {
+                    // Null list entry or None (Null-typed) arg — all nulls.
+                    if let Some(builder) = builders[col_idx].as_mut() {
+                        builder.extend_nulls(max_len);
+                    }
+                }
+            }
+        }
+
+        total_values += max_len;
+        let last = *offsets.last().unwrap();
+        offsets.push(last + max_len as i32);
+    }
+
+    // Assemble struct columns from builders.
+    let struct_columns: Vec<ArrayRef> = builders
+        .into_iter()
+        .zip(element_types.iter())
+        .map(|(builder, elem_type)| match builder {
+            Some(b) => arrow::array::make_array(b.freeze()),
+            None => new_null_array(
+                if elem_type.is_null() {
+                    &Null
+                } else {
+                    elem_type
+                },
+                total_values,
+            ),
+        })
+        .collect();
+
+    let struct_array = StructArray::try_new(struct_fields, struct_columns, None)?;
+
+    let null_buffer = if null_mask.iter().any(|&v| v) {
+        Some(NullBuffer::from(
+            null_mask.iter().map(|v| !v).collect::<Vec<bool>>(),
+        ))
+    } else {
+        None
+    };
+
+    let result = ListArray::try_new(
+        Arc::new(Field::new_list_field(
+            struct_array.data_type().clone(),
+            true,
+        )),
+        OffsetBuffer::new(offsets.into()),
+        Arc::new(struct_array),
+        null_buffer,
+    )?;
+
+    Ok(Arc::new(result))
+}
diff --git a/datafusion/functions-nested/src/cardinality.rs b/datafusion/functions-nested/src/cardinality.rs
index 98bda81ef25f9..7994315adc0a2 100644
--- a/datafusion/functions-nested/src/cardinality.rs
+++ b/datafusion/functions-nested/src/cardinality.rs
@@ -25,13 +25,13 @@ use arrow::datatypes::{
     DataType,
     DataType::{LargeList, List, Map, Null, UInt64},
 };
+use datafusion_common::Result;
 use datafusion_common::cast::{as_large_list_array, as_list_array, as_map_array};
 use datafusion_common::exec_err;
-use datafusion_common::utils::{take_function_args, ListCoercion};
-use datafusion_common::Result;
+use datafusion_common::utils::{ListCoercion, take_function_args};
 use datafusion_expr::{
     ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -58,7 +58,6 @@ impl Cardinality {
                 ],
                 Volatility::Immutable,
             ),
-            aliases: vec![],
         }
     }
 }
@@ -80,10 +79,9 @@ impl Cardinality {
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Cardinality {
     signature: Signature,
-    aliases: Vec<String>,
 }
 
 impl Default for Cardinality {
@@ -107,27 +105,19 @@ impl ScalarUDFImpl for Cardinality {
         Ok(UInt64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(cardinality_inner)(&args.args)
     }
 
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
-/// Cardinality SQL function
-pub fn cardinality_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn cardinality_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("cardinality", args)?;
     match array.data_type() {
-        Null => Ok(Arc::new(UInt64Array::from_value(0, array.len()))),
+        Null => Ok(Arc::new(UInt64Array::new_null(array.len()))),
         List(_) => {
             let list_array = as_list_array(array)?;
             generic_list_cardinality::<i32>(list_array)
@@ -159,9 +149,14 @@ fn generic_list_cardinality<O: OffsetSizeTrait>(
 ) -> Result<ArrayRef> {
     let result = array
         .iter()
-        .map(|arr| match crate::utils::compute_array_dims(arr)? {
-            Some(vector) => Ok(Some(vector.iter().map(|x| x.unwrap()).product::<u64>())),
-            None => Ok(None),
+        .map(|arr| match arr {
+            Some(arr) if arr.is_empty() => Ok(Some(0u64)),
+            arr => match crate::utils::compute_array_dims(arr)? {
+                Some(vector) => {
+                    Ok(Some(vector.iter().map(|x| x.unwrap()).product::<u64>()))
+                }
+                None => Ok(None),
+            },
         })
         .collect::<Result<UInt64Array>>()?;
     Ok(Arc::new(result) as ArrayRef)
diff --git a/datafusion/functions-nested/src/concat.rs b/datafusion/functions-nested/src/concat.rs
index dd8784d36c48c..dc7fd92f68bab 100644
--- a/datafusion/functions-nested/src/concat.rs
+++ b/datafusion/functions-nested/src/concat.rs
@@ -23,15 +23,15 @@ use std::sync::Arc;
 use crate::make_array::make_array_inner;
 use crate::utils::{align_array_dimensions, check_datatypes, make_scalar_function};
 use arrow::array::{
-    Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, NullArray,
-    NullBufferBuilder, OffsetSizeTrait,
+    Array, ArrayData, ArrayRef, Capacities, GenericListArray, MutableArrayData,
+    OffsetSizeTrait,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
 use arrow::datatypes::{DataType, Field};
+use datafusion_common::Result;
 use datafusion_common::utils::{
-    base_type, coerced_type_with_base_type_only, ListCoercion,
+    ListCoercion, base_type, coerced_type_with_base_type_only,
 };
-use datafusion_common::Result;
 use datafusion_common::{
     cast::as_generic_list_array,
     exec_err, plan_err,
@@ -39,9 +39,11 @@ use datafusion_common::{
 };
 use datafusion_expr::binary::type_union_resolution;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
+use itertools::Itertools;
 
 make_udf_expr_and_func!(
     ArrayAppend,
@@ -69,7 +71,7 @@ make_udf_expr_and_func!(
     ),
     argument(name = "element", description = "Element to append to the array.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayAppend {
     signature: Signature,
     aliases: Vec<String>,
@@ -116,10 +118,7 @@ impl ScalarUDFImpl for ArrayAppend {
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_append_inner)(&args.args)
     }
 
@@ -158,7 +157,7 @@ make_udf_expr_and_func!(
     ),
     argument(name = "element", description = "Element to prepend to the array.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayPrepend {
     signature: Signature,
     aliases: Vec<String>,
@@ -205,10 +204,7 @@ impl ScalarUDFImpl for ArrayPrepend {
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_prepend_inner)(&args.args)
     }
 
@@ -249,7 +245,7 @@ make_udf_expr_and_func!(
         description = "Subsequent array column or literal array to concatenate."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayConcat {
     signature: Signature,
     aliases: Vec<String>,
@@ -296,7 +292,7 @@ impl ScalarUDFImpl for ArrayConcat {
                 DataType::Null | DataType::List(_) | DataType::FixedSizeList(..) => (),
                 DataType::LargeList(_) => large_list = true,
                 arg_type => {
-                    return plan_err!("{} does not support type {arg_type}", self.name())
+                    return plan_err!("{} does not support type {arg_type}", self.name());
                 }
             }
 
@@ -318,16 +314,14 @@ impl ScalarUDFImpl for ArrayConcat {
             }
         } else {
             plan_err!(
-                "Failed to unify argument types of {}: {arg_types:?}",
-                self.name()
+                "Failed to unify argument types of {}: [{}]",
+                self.name(),
+                arg_types.iter().join(", ")
             )
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_concat_inner)(&args.args)
     }
 
@@ -350,8 +344,7 @@ impl ScalarUDFImpl for ArrayConcat {
     }
 }
 
-/// Array_concat/Array_cat SQL function
-pub(crate) fn array_concat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+pub fn array_concat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.is_empty() {
         return exec_err!("array_concat expects at least one argument");
     }
@@ -364,12 +357,23 @@ pub(crate) fn array_concat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::LargeList(_) => large_list = true,
             _ => (),
         }
-
-        all_null = false
+        if arg.null_count() < arg.len() {
+            all_null = false;
+        }
     }
 
     if all_null {
-        Ok(Arc::new(NullArray::new(args[0].len())))
+        // Return a null array with the same type as the first non-null-type argument
+        let return_type = args
+            .iter()
+            .map(|arg| arg.data_type())
+            .find_or_first(|d| !d.is_null())
+            .unwrap(); // Safe because args is non-empty
+
+        Ok(arrow::array::make_array(ArrayData::new_null(
+            return_type,
+            args[0].len(),
+        )))
     } else if large_list {
         concat_internal::<i64>(args)
     } else {
@@ -384,64 +388,70 @@ fn concat_internal<O: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
         .iter()
         .map(|arg| as_generic_list_array::<O>(arg))
         .collect::<Result<Vec<_>>>()?;
-    // Assume number of rows is the same for all arrays
     let row_count = list_arrays[0].len();
 
-    let mut array_lengths = vec![];
-    let mut arrays = vec![];
-    let mut valid = NullBufferBuilder::new(row_count);
-    for i in 0..row_count {
-        let nulls = list_arrays
+    // Extract underlying values ArrayData from each list array for MutableArrayData.
+    let values_data: Vec<ArrayData> =
+        list_arrays.iter().map(|la| la.values().to_data()).collect();
+    let values_data_refs: Vec<&ArrayData> = values_data.iter().collect();
+
+    // Estimate capacity as the sum of all values arrays' lengths.
+    let total_capacity: usize = values_data.iter().map(|d| d.len()).sum();
+
+    let mut mutable = MutableArrayData::with_capacities(
+        values_data_refs,
+        false,
+        Capacities::Array(total_capacity),
+    );
+    let mut offsets: Vec<O> = Vec::with_capacity(row_count + 1);
+    offsets.push(O::zero());
+
+    // Compute the output null buffer: a row is null only if null in ALL input
+    // arrays. This is the bitwise OR of validity bits (valid if valid in ANY
+    // input). If any array has no null buffer (all valid), no output row can be
+    // null.
+    let nulls = list_arrays
+        .iter()
+        .filter_map(|la| la.nulls())
+        .collect::<Vec<_>>();
+    let valid = if nulls.len() == list_arrays.len() {
+        nulls
             .iter()
-            .map(|arr| arr.is_null(i))
-            .collect::<Vec<_>>();
-
-        // If all the arrays are null, the concatenated array is null
-        let is_null = nulls.iter().all(|&x| x);
-        if is_null {
-            array_lengths.push(0);
-            valid.append_null();
-        } else {
-            // Get all the arrays on i-th row
-            let values = list_arrays
-                .iter()
-                .map(|arr| arr.value(i))
-                .collect::<Vec<_>>();
-
-            let elements = values
-                .iter()
-                .map(|a| a.as_ref())
-                .collect::<Vec<&dyn Array>>();
-
-            // Concatenated array on i-th row
-            let concatenated_array = arrow::compute::concat(elements.as_slice())?;
-            array_lengths.push(concatenated_array.len());
-            arrays.push(concatenated_array);
-            valid.append_non_null();
+            .map(|n| n.inner().clone())
+            .reduce(|a, b| &a | &b)
+            .map(NullBuffer::new)
+    } else {
+        None
+    };
+
+    for row_idx in 0..row_count {
+        for (arr_idx, list_array) in list_arrays.iter().enumerate() {
+            if list_array.is_null(row_idx) {
+                continue;
+            }
+            let start = list_array.offsets()[row_idx].to_usize().unwrap();
+            let end = list_array.offsets()[row_idx + 1].to_usize().unwrap();
+            if start < end {
+                mutable.extend(arr_idx, start, end);
+            }
         }
+        offsets.push(O::usize_as(mutable.len()));
     }
-    // Assume all arrays have the same data type
-    let data_type = list_arrays[0].value_type();
 
-    let elements = arrays
-        .iter()
-        .map(|a| a.as_ref())
-        .collect::<Vec<&dyn Array>>();
+    let data_type = list_arrays[0].value_type();
+    let data = mutable.freeze();
 
-    let list_arr = GenericListArray::<O>::new(
+    Ok(Arc::new(GenericListArray::<O>::try_new(
         Arc::new(Field::new_list_field(data_type, true)),
-        OffsetBuffer::from_lengths(array_lengths),
-        Arc::new(arrow::compute::concat(elements.as_slice())?),
-        valid.finish(),
-    );
-
-    Ok(Arc::new(list_arr))
+        OffsetBuffer::new(offsets.into()),
+        arrow::array::make_array(data),
+        valid,
+    )?))
 }
 
 // Kernel functions
 
-/// Array_append SQL function
-pub(crate) fn array_append_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_append_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, values] = take_function_args("array_append", args)?;
     match array.data_type() {
         DataType::Null => make_array_inner(&[Arc::clone(values)]),
@@ -451,8 +461,7 @@ pub(crate) fn array_append_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-/// Array_prepend SQL function
-pub(crate) fn array_prepend_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_prepend_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [values, array] = take_function_args("array_prepend", args)?;
     match array.data_type() {
         DataType::Null => make_array_inner(&[Arc::clone(values)]),
diff --git a/datafusion/functions-nested/src/dimension.rs b/datafusion/functions-nested/src/dimension.rs
index d1e6b1be4cfa6..0f0a5949ebee2 100644
--- a/datafusion/functions-nested/src/dimension.rs
+++ b/datafusion/functions-nested/src/dimension.rs
@@ -28,12 +28,13 @@ use std::any::Any;
 use datafusion_common::cast::{
     as_fixed_size_list_array, as_large_list_array, as_list_array,
 };
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 
 use crate::utils::{compute_array_dims, make_scalar_function};
 use datafusion_common::utils::list_ndims;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use itertools::Itertools;
@@ -64,7 +65,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayDims {
     signature: Signature,
     aliases: Vec<String>,
@@ -101,10 +102,7 @@ impl ScalarUDFImpl for ArrayDims {
         Ok(DataType::new_list(UInt64, true))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_dims_inner)(&args.args)
     }
 
@@ -143,7 +141,7 @@ make_udf_expr_and_func!(
     ),
     argument(name = "element", description = "Array element.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayNdims {
     signature: Signature,
     aliases: Vec<String>,
@@ -173,10 +171,7 @@ impl ScalarUDFImpl for ArrayNdims {
         Ok(UInt64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_ndims_inner)(&args.args)
     }
 
@@ -189,8 +184,7 @@ impl ScalarUDFImpl for ArrayNdims {
     }
 }
 
-/// Array_dims SQL function
-pub fn array_dims_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_dims_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("array_dims", args)?;
     let data: Vec<_> = match array.data_type() {
         List(_) => as_list_array(&array)?
@@ -214,8 +208,7 @@ pub fn array_dims_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     Ok(Arc::new(result))
 }
 
-/// Array_ndims SQL function
-pub fn array_ndims_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_ndims_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("array_ndims", args)?;
 
     fn general_list_ndims(array: &ArrayRef) -> Result<ArrayRef> {
diff --git a/datafusion/functions-nested/src/distance.rs b/datafusion/functions-nested/src/distance.rs
index 3392e194b1760..817947a3e663a 100644
--- a/datafusion/functions-nested/src/distance.rs
+++ b/datafusion/functions-nested/src/distance.rs
@@ -29,14 +29,13 @@ use datafusion_common::cast::{
     as_float32_array, as_float64_array, as_generic_list_array, as_int32_array,
     as_int64_array,
 };
-use datafusion_common::utils::{coerced_type_with_base_type_only, ListCoercion};
-use datafusion_common::{
-    exec_err, internal_datafusion_err, plan_err, utils::take_function_args, Result,
-};
+use datafusion_common::utils::{ListCoercion, coerced_type_with_base_type_only};
+use datafusion_common::{Result, exec_err, plan_err, utils::take_function_args};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
-use datafusion_functions::{downcast_arg, downcast_named_arg};
+use datafusion_functions::downcast_arg;
 use datafusion_macros::user_doc;
 use itertools::Itertools;
 use std::any::Any;
@@ -71,7 +70,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayDistance {
     signature: Signature,
     aliases: Vec<String>,
@@ -127,10 +126,7 @@ impl ScalarUDFImpl for ArrayDistance {
         arg_types.try_collect()
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_distance_inner)(&args.args)
     }
 
@@ -143,7 +139,7 @@ impl ScalarUDFImpl for ArrayDistance {
     }
 }
 
-pub fn array_distance_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_distance_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array1, array2] = take_function_args("array_distance", args)?;
     match (array1.data_type(), array2.data_type()) {
         (List(_), List(_)) => general_array_distance::<i32>(args),
diff --git a/datafusion/functions-nested/src/empty.rs b/datafusion/functions-nested/src/empty.rs
index 67c795886bded..5ae5a05d6c188 100644
--- a/datafusion/functions-nested/src/empty.rs
+++ b/datafusion/functions-nested/src/empty.rs
@@ -25,9 +25,10 @@ use arrow::datatypes::{
     DataType::{Boolean, FixedSizeList, LargeList, List},
 };
 use datafusion_common::cast::as_generic_list_array;
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -58,7 +59,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayEmpty {
     signature: Signature,
     aliases: Vec<String>,
@@ -94,10 +95,7 @@ impl ScalarUDFImpl for ArrayEmpty {
         Ok(Boolean)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_empty_inner)(&args.args)
     }
 
@@ -110,8 +108,7 @@ impl ScalarUDFImpl for ArrayEmpty {
     }
 }
 
-/// Array_empty SQL function
-pub fn array_empty_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_empty_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("array_empty", args)?;
     match array.data_type() {
         List(_) => general_array_empty::<i32>(array),
diff --git a/datafusion/functions-nested/src/except.rs b/datafusion/functions-nested/src/except.rs
index 2385f6d12d43e..7fe2b3e754364 100644
--- a/datafusion/functions-nested/src/except.rs
+++ b/datafusion/functions-nested/src/except.rs
@@ -15,19 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`ScalarUDFImpl`] definitions for array_except function.
+//! [`ScalarUDFImpl`] definition for array_except function.
 
 use crate::utils::{check_datatypes, make_scalar_function};
-use arrow::array::{cast::AsArray, Array, ArrayRef, GenericListArray, OffsetSizeTrait};
-use arrow::buffer::OffsetBuffer;
+use arrow::array::new_null_array;
+use arrow::array::{
+    Array, ArrayRef, GenericListArray, OffsetSizeTrait, UInt32Array, UInt64Array,
+    cast::AsArray,
+};
+use arrow::buffer::{NullBuffer, OffsetBuffer};
+use arrow::compute::take;
 use arrow::datatypes::{DataType, FieldRef};
 use arrow::row::{RowConverter, SortField};
-use datafusion_common::utils::take_function_args;
-use datafusion_common::{internal_err, HashSet, Result};
+use datafusion_common::utils::{ListCoercion, take_function_args};
+use datafusion_common::{HashSet, Result, internal_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
+use itertools::Itertools;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -66,7 +73,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayExcept {
     signature: Signature,
     aliases: Vec<String>,
@@ -81,7 +88,11 @@ impl Default for ArrayExcept {
 impl ArrayExcept {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(2, Volatility::Immutable),
+            signature: Signature::arrays(
+                2,
+                Some(ListCoercion::FixedSizedListToList),
+                Volatility::Immutable,
+            ),
             aliases: vec!["list_except".to_string()],
         }
     }
@@ -100,16 +111,16 @@ impl ScalarUDFImpl for ArrayExcept {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match (&arg_types[0].clone(), &arg_types[1].clone()) {
-            (DataType::Null, _) | (_, DataType::Null) => Ok(arg_types[0].clone()),
+        match (&arg_types[0], &arg_types[1]) {
+            (DataType::Null, DataType::Null) => {
+                Ok(DataType::new_list(DataType::Null, true))
+            }
+            (DataType::Null, dt) | (dt, DataType::Null) => Ok(dt.clone()),
             (dt, _) => Ok(dt.clone()),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_except_inner)(&args.args)
     }
 
@@ -122,12 +133,19 @@ impl ScalarUDFImpl for ArrayExcept {
     }
 }
 
-/// Array_except SQL function
-pub fn array_except_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_except_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array1, array2] = take_function_args("array_except", args)?;
 
+    let len = array1.len();
     match (array1.data_type(), array2.data_type()) {
-        (DataType::Null, _) | (_, DataType::Null) => Ok(array1.to_owned()),
+        (DataType::Null, DataType::Null) => Ok(new_null_array(
+            &DataType::new_list(DataType::Null, true),
+            len,
+        )),
+        (DataType::Null, dt @ DataType::List(_))
+        | (DataType::Null, dt @ DataType::LargeList(_))
+        | (dt @ DataType::List(_), DataType::Null)
+        | (dt @ DataType::LargeList(_), DataType::Null) => Ok(new_null_array(dt, len)),
         (DataType::List(field), DataType::List(_)) => {
             check_datatypes("array_except", &[array1, array2])?;
             let list1 = array1.as_list::<i32>();
@@ -155,43 +173,132 @@ fn general_except<OffsetSize: OffsetSizeTrait>(
 ) -> Result<GenericListArray<OffsetSize>> {
     let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
 
-    let l_values = l.values().to_owned();
-    let r_values = r.values().to_owned();
-    let l_values = converter.convert_columns(&[l_values])?;
-    let r_values = converter.convert_columns(&[r_values])?;
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let l_first = l.offsets()[0].as_usize();
+    let l_len = l.offsets()[l.len()].as_usize() - l_first;
+    let l_values = converter.convert_columns(&[l.values().slice(l_first, l_len)])?;
+
+    let r_first = r.offsets()[0].as_usize();
+    let r_len = r.offsets()[r.len()].as_usize() - r_first;
+    let r_values = converter.convert_columns(&[r.values().slice(r_first, r_len)])?;
 
     let mut offsets = Vec::<OffsetSize>::with_capacity(l.len() + 1);
     offsets.push(OffsetSize::usize_as(0));
 
-    let mut rows = Vec::with_capacity(l_values.num_rows());
+    let mut indices: Vec<usize> = Vec::with_capacity(l_values.num_rows());
     let mut dedup = HashSet::new();
 
-    for (l_w, r_w) in l.offsets().windows(2).zip(r.offsets().windows(2)) {
-        let l_slice = l_w[0].as_usize()..l_w[1].as_usize();
-        let r_slice = r_w[0].as_usize()..r_w[1].as_usize();
-        for i in r_slice {
-            let right_row = r_values.row(i);
+    let nulls = NullBuffer::union(l.nulls(), r.nulls());
+
+    let l_offsets_iter = l.offsets().iter().tuple_windows();
+    let r_offsets_iter = r.offsets().iter().tuple_windows();
+    for (list_index, ((l_start, l_end), (r_start, r_end))) in
+        l_offsets_iter.zip(r_offsets_iter).enumerate()
+    {
+        if nulls
+            .as_ref()
+            .is_some_and(|nulls| nulls.is_null(list_index))
+        {
+            offsets.push(OffsetSize::usize_as(indices.len()));
+            continue;
+        }
+
+        for element_index in r_start.as_usize() - r_first..r_end.as_usize() - r_first {
+            let right_row = r_values.row(element_index);
             dedup.insert(right_row);
         }
-        for i in l_slice {
-            let left_row = l_values.row(i);
+        for element_index in l_start.as_usize() - l_first..l_end.as_usize() - l_first {
+            let left_row = l_values.row(element_index);
             if dedup.insert(left_row) {
-                rows.push(left_row);
+                indices.push(element_index + l_first);
             }
         }
 
-        offsets.push(OffsetSize::usize_as(rows.len()));
+        offsets.push(OffsetSize::usize_as(indices.len()));
         dedup.clear();
     }
 
-    if let Some(values) = converter.convert_rows(rows)?.first() {
-        Ok(GenericListArray::<OffsetSize>::new(
-            field.to_owned(),
-            OffsetBuffer::new(offsets.into()),
-            values.to_owned(),
-            l.nulls().cloned(),
-        ))
+    // Gather distinct left-side values by index.
+    // Use UInt64Array for LargeList to support values arrays exceeding u32::MAX.
+    let values = if indices.is_empty() {
+        arrow::array::new_empty_array(&l.value_type())
+    } else if OffsetSize::IS_LARGE {
+        let indices =
+            UInt64Array::from(indices.into_iter().map(|i| i as u64).collect::<Vec<_>>());
+        take(l.values().as_ref(), &indices, None)?
     } else {
-        internal_err!("array_except failed to convert rows")
+        let indices =
+            UInt32Array::from(indices.into_iter().map(|i| i as u32).collect::<Vec<_>>());
+        take(l.values().as_ref(), &indices, None)?
+    };
+
+    Ok(GenericListArray::<OffsetSize>::new(
+        field.to_owned(),
+        OffsetBuffer::new(offsets.into()),
+        values,
+        nulls,
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::ArrayExcept;
+    use arrow::array::{Array, AsArray, Int32Array, ListArray};
+    use arrow::datatypes::{Field, Int32Type};
+    use datafusion_common::{Result, config::ConfigOptions};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
+
+    #[test]
+    fn test_array_except_sliced_lists() -> Result<()> {
+        // l: [[1,2], [3,4], [5,6], [7,8]]  →  slice(1,2)  →  [[3,4], [5,6]]
+        // r: [[3],   [5],   [6],   [8]]    →  slice(1,2)  →  [[5],   [6]]
+        // except(l, r) should be [[3,4], [5]]
+        let l_full = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            Some(vec![Some(3), Some(4)]),
+            Some(vec![Some(5), Some(6)]),
+            Some(vec![Some(7), Some(8)]),
+        ]);
+        let r_full = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(3)]),
+            Some(vec![Some(5)]),
+            Some(vec![Some(6)]),
+            Some(vec![Some(8)]),
+        ]);
+        let l_sliced = l_full.slice(1, 2);
+        let r_sliced = r_full.slice(1, 2);
+
+        let list_field = Arc::new(Field::new("item", l_sliced.data_type().clone(), true));
+        let return_field =
+            Arc::new(Field::new("return", l_sliced.data_type().clone(), true));
+
+        let result = ArrayExcept::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(l_sliced)),
+                ColumnarValue::Array(Arc::new(r_sliced)),
+            ],
+            arg_fields: vec![Arc::clone(&list_field), Arc::clone(&list_field)],
+            number_rows: 2,
+            return_field,
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(2)?;
+        let output = output.as_list::<i32>();
+
+        // Row 0: [3,4] except [5] = [3,4]
+        let row0 = output.value(0);
+        let row0 = row0.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(row0.values().as_ref(), &[3, 4]);
+
+        // Row 1: [5,6] except [6] = [5]
+        let row1 = output.value(1);
+        let row1 = row1.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(row1.values().as_ref(), &[5]);
+
+        Ok(())
     }
 }
diff --git a/datafusion/functions-nested/src/expr_ext.rs b/datafusion/functions-nested/src/expr_ext.rs
index 4da4a3f583b7c..18c4c5fb59c38 100644
--- a/datafusion/functions-nested/src/expr_ext.rs
+++ b/datafusion/functions-nested/src/expr_ext.rs
@@ -36,8 +36,7 @@ use crate::extract::{array_element, array_slice};
 /// ```
 /// # use datafusion_expr::{lit, col, Expr};
 /// # use datafusion_functions_nested::expr_ext::IndexAccessor;
-/// let expr = col("c1")
-///    .index(lit(3));
+/// let expr = col("c1").index(lit(3));
 /// assert_eq!(expr.schema_name().to_string(), "c1[Int32(3)]");
 /// ```
 pub trait IndexAccessor {
@@ -66,8 +65,7 @@ impl IndexAccessor for Expr {
 /// ```
 /// # use datafusion_expr::{lit, col};
 /// # use datafusion_functions_nested::expr_ext::SliceAccessor;
-/// let expr = col("c1")
-///    .range(lit(2), lit(4));
+/// let expr = col("c1").range(lit(2), lit(4));
 /// assert_eq!(expr.schema_name().to_string(), "c1[Int32(2):Int32(4)]");
 /// ```
 pub trait SliceAccessor {
diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs
index 95bf5a7341d95..71883e7986af5 100644
--- a/datafusion/functions-nested/src/extract.rs
+++ b/datafusion/functions-nested/src/extract.rs
@@ -18,25 +18,29 @@
 //! [`ScalarUDFImpl`] definitions for array_element, array_slice, array_pop_front, array_pop_back, and array_any_value functions.
 
 use arrow::array::{
-    Array, ArrayRef, ArrowNativeTypeOp, Capacities, GenericListArray, Int64Array,
+    Array, ArrayRef, Capacities, GenericListArray, GenericListViewArray, Int64Array,
     MutableArrayData, NullArray, NullBufferBuilder, OffsetSizeTrait,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
 use arrow::datatypes::DataType;
 use arrow::datatypes::{
-    DataType::{FixedSizeList, LargeList, List, Null},
+    DataType::{FixedSizeList, LargeList, LargeListView, List, ListView, Null},
     Field,
 };
-use datafusion_common::cast::as_int64_array;
 use datafusion_common::cast::as_large_list_array;
 use datafusion_common::cast::as_list_array;
+use datafusion_common::cast::{
+    as_int64_array, as_large_list_view_array, as_list_view_array,
+};
+use datafusion_common::internal_err;
 use datafusion_common::utils::ListCoercion;
 use datafusion_common::{
-    exec_err, internal_datafusion_err, plan_err, utils::take_function_args,
-    DataFusionError, Result,
+    Result, exec_datafusion_err, exec_err, internal_datafusion_err, plan_err,
+    utils::take_function_args,
 };
 use datafusion_expr::{
-    ArrayFunctionArgument, ArrayFunctionSignature, Expr, TypeSignature,
+    ArrayFunctionArgument, ArrayFunctionSignature, Expr, ScalarFunctionArgs,
+    TypeSignature,
 };
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
@@ -103,7 +107,7 @@ make_udf_expr_and_func!(
         description = "Index to extract the element from the array."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayElement {
     signature: Signature,
     aliases: Vec<String>,
@@ -169,10 +173,7 @@ impl ScalarUDFImpl for ArrayElement {
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_element_inner)(&args.args)
     }
 
@@ -237,9 +238,7 @@ where
         i64: TryInto<O>,
     {
         let index: O = index.try_into().map_err(|_| {
-            DataFusionError::Execution(format!(
-                "array_element got invalid index: {index}"
-            ))
+            exec_datafusion_err!("array_element got invalid index: {index}")
         })?;
         // 0 ~ len - 1
         let adjusted_zero_index = if index < O::usize_as(0) {
@@ -320,7 +319,7 @@ pub fn array_slice(array: Expr, begin: Expr, end: Expr, stride: Option<Expr>) ->
         description = "Stride of the array slice. The default is 1."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArraySlice {
     signature: Signature,
     aliases: Vec<String>,
@@ -337,7 +336,7 @@ impl ArraySlice {
                             ArrayFunctionArgument::Index,
                             ArrayFunctionArgument::Index,
                         ],
-                        array_coercion: None,
+                        array_coercion: Some(ListCoercion::FixedSizedListToList),
                     }),
                     TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
                         arguments: vec![
@@ -346,7 +345,7 @@ impl ArraySlice {
                             ArrayFunctionArgument::Index,
                             ArrayFunctionArgument::Index,
                         ],
-                        array_coercion: None,
+                        array_coercion: Some(ListCoercion::FixedSizedListToList),
                     }),
                 ],
                 Volatility::Immutable,
@@ -394,10 +393,7 @@ impl ScalarUDFImpl for ArraySlice {
         Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_slice_inner)(&args.args)
     }
 
@@ -451,7 +447,159 @@ fn array_slice_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
             let array = as_large_list_array(&args[0])?;
             general_array_slice::<i64>(array, from_array, to_array, stride)
         }
-        _ => exec_err!("array_slice does not support type: {:?}", array_data_type),
+        ListView(_) => {
+            let array = as_list_view_array(&args[0])?;
+            general_list_view_array_slice::<i32>(array, from_array, to_array, stride)
+        }
+        LargeListView(_) => {
+            let array = as_large_list_view_array(&args[0])?;
+            general_list_view_array_slice::<i64>(array, from_array, to_array, stride)
+        }
+        _ => exec_err!("array_slice does not support type: {}", array_data_type),
+    }
+}
+
+fn adjusted_from_index<O: OffsetSizeTrait>(index: i64, len: O) -> Result<Option<O>>
+where
+    i64: TryInto<O>,
+{
+    // 0 ~ len - 1
+    let adjusted_zero_index = if index < 0 {
+        if let Ok(index) = index.try_into() {
+            // When index < 0 and -index > length, index is clamped to the beginning of the list.
+            // Otherwise, when index < 0, the index is counted from the end of the list.
+            //
+            // Note, we actually test the contrapositive, index < -length, because negating a
+            // negative will panic if the negative is equal to the smallest representable value
+            // while negating a positive is always safe.
+            if index < (O::zero() - O::one()) * len {
+                O::zero()
+            } else {
+                index + len
+            }
+        } else {
+            return exec_err!("array_slice got invalid index: {}", index);
+        }
+    } else {
+        // array_slice(arr, 1, to) is the same as array_slice(arr, 0, to)
+        if let Ok(index) = index.try_into() {
+            std::cmp::max(index - O::usize_as(1), O::usize_as(0))
+        } else {
+            return exec_err!("array_slice got invalid index: {}", index);
+        }
+    };
+
+    if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len {
+        Ok(Some(adjusted_zero_index))
+    } else {
+        // Out of bounds
+        Ok(None)
+    }
+}
+
+fn adjusted_to_index<O: OffsetSizeTrait>(index: i64, len: O) -> Result<Option<O>>
+where
+    i64: TryInto<O>,
+{
+    // 0 ~ len - 1
+    let adjusted_zero_index = if index < 0 {
+        // array_slice in duckdb with negative to_index is python-like, so index itself is exclusive
+        if let Ok(index) = index.try_into() {
+            index + len
+        } else {
+            return exec_err!("array_slice got invalid index: {}", index);
+        }
+    } else {
+        // array_slice(arr, from, len + 1) is the same as array_slice(arr, from, len)
+        if let Ok(index) = index.try_into() {
+            std::cmp::min(index - O::usize_as(1), len - O::usize_as(1))
+        } else {
+            return exec_err!("array_slice got invalid index: {}", index);
+        }
+    };
+
+    if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len {
+        Ok(Some(adjusted_zero_index))
+    } else {
+        // Out of bounds
+        Ok(None)
+    }
+}
+
+/// Internal plan describing how to materialize a single row's slice after
+/// the slice bounds/stride have been normalized. Both list layouts consume
+/// this to drive their copy logic.
+enum SlicePlan<O: OffsetSizeTrait> {
+    /// No values should be produced.
+    Empty,
+    /// A contiguous run starting at `start` (relative to the row) with `len`
+    /// elements can be copied in one go.
+    Contiguous { start: O, len: O },
+    /// Arbitrary positions (already relative to the row) must be copied in
+    /// sequence.
+    Indices(Vec<O>),
+}
+
+/// Produces a [`SlicePlan`] for the given logical slice parameters.
+fn compute_slice_plan<O: OffsetSizeTrait>(
+    len: O,
+    from_raw: i64,
+    to_raw: i64,
+    stride_raw: Option<i64>,
+) -> Result<SlicePlan<O>>
+where
+    i64: TryInto<O>,
+{
+    if len == O::usize_as(0) {
+        return Ok(SlicePlan::Empty);
+    }
+
+    let from_index = adjusted_from_index::<O>(from_raw, len)?;
+    let to_index = adjusted_to_index::<O>(to_raw, len)?;
+
+    let (Some(from), Some(to)) = (from_index, to_index) else {
+        return Ok(SlicePlan::Empty);
+    };
+
+    let stride_value = stride_raw.unwrap_or(1);
+    if stride_value == 0 {
+        return exec_err!(
+            "array_slice got invalid stride: {:?}, it cannot be 0",
+            stride_value
+        );
+    }
+
+    if (from < to && stride_value.is_negative())
+        || (from > to && stride_value.is_positive())
+    {
+        return Ok(SlicePlan::Empty);
+    }
+
+    let stride: O = stride_value.try_into().map_err(|_| {
+        internal_datafusion_err!("array_slice got invalid stride: {}", stride_value)
+    })?;
+
+    if from <= to && stride_value.is_positive() {
+        if stride_value == 1 {
+            let len = to - from + O::usize_as(1);
+            Ok(SlicePlan::Contiguous { start: from, len })
+        } else {
+            let mut indices = Vec::new();
+            let mut index = from;
+            while index <= to {
+                indices.push(index);
+                index += stride;
+            }
+            Ok(SlicePlan::Indices(indices))
+        }
+    } else {
+        let mut indices = Vec::new();
+        let mut index = from;
+        while index >= to {
+            indices.push(index);
+            index += stride;
+        }
+        Ok(SlicePlan::Indices(indices))
     }
 }
 
@@ -474,73 +622,6 @@ where
     // We have the slice syntax compatible with DuckDB v0.8.1.
     // The rule `adjusted_from_index` and `adjusted_to_index` follows the rule of array_slice in duckdb.
 
-    fn adjusted_from_index<O: OffsetSizeTrait>(index: i64, len: O) -> Result<Option<O>>
-    where
-        i64: TryInto<O>,
-    {
-        // 0 ~ len - 1
-        let adjusted_zero_index = if index < 0 {
-            if let Ok(index) = index.try_into() {
-                // When index < 0 and -index > length, index is clamped to the beginning of the list.
-                // Otherwise, when index < 0, the index is counted from the end of the list.
-                //
-                // Note, we actually test the contrapositive, index < -length, because negating a
-                // negative will panic if the negative is equal to the smallest representable value
-                // while negating a positive is always safe.
-                if index < (O::zero() - O::one()) * len {
-                    O::zero()
-                } else {
-                    index + len
-                }
-            } else {
-                return exec_err!("array_slice got invalid index: {}", index);
-            }
-        } else {
-            // array_slice(arr, 1, to) is the same as array_slice(arr, 0, to)
-            if let Ok(index) = index.try_into() {
-                std::cmp::max(index - O::usize_as(1), O::usize_as(0))
-            } else {
-                return exec_err!("array_slice got invalid index: {}", index);
-            }
-        };
-
-        if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len {
-            Ok(Some(adjusted_zero_index))
-        } else {
-            // Out of bounds
-            Ok(None)
-        }
-    }
-
-    fn adjusted_to_index<O: OffsetSizeTrait>(index: i64, len: O) -> Result<Option<O>>
-    where
-        i64: TryInto<O>,
-    {
-        // 0 ~ len - 1
-        let adjusted_zero_index = if index < 0 {
-            // array_slice in duckdb with negative to_index is python-like, so index itself is exclusive
-            if let Ok(index) = index.try_into() {
-                index + len
-            } else {
-                return exec_err!("array_slice got invalid index: {}", index);
-            }
-        } else {
-            // array_slice(arr, from, len + 1) is the same as array_slice(arr, from, len)
-            if let Ok(index) = index.try_into() {
-                std::cmp::min(index - O::usize_as(1), len - O::usize_as(1))
-            } else {
-                return exec_err!("array_slice got invalid index: {}", index);
-            }
-        };
-
-        if O::usize_as(0) <= adjusted_zero_index && adjusted_zero_index < len {
-            Ok(Some(adjusted_zero_index))
-        } else {
-            // Out of bounds
-            Ok(None)
-        }
-    }
-
     let mut offsets = vec![O::usize_as(0)];
     let mut null_builder = NullBufferBuilder::new(array.len());
 
@@ -553,6 +634,7 @@ where
         if array.is_null(row_index)
             || from_array.is_null(row_index)
             || to_array.is_null(row_index)
+            || stride.is_some_and(|s| s.is_null(row_index))
         {
             mutable.extend_nulls(1);
             offsets.push(offsets[row_index] + O::usize_as(1));
@@ -567,72 +649,32 @@ where
             continue;
         }
 
-        let from_index = adjusted_from_index::<O>(from_array.value(row_index), len)?;
-        let to_index = adjusted_to_index::<O>(to_array.value(row_index), len)?;
-
-        if let (Some(from), Some(to)) = (from_index, to_index) {
-            let stride = stride.map(|s| s.value(row_index));
-            // Default stride is 1 if not provided
-            let stride = stride.unwrap_or(1);
-            if stride.is_zero() {
-                return exec_err!(
-                    "array_slice got invalid stride: {:?}, it cannot be 0",
-                    stride
-                );
-            } else if (from < to && stride.is_negative())
-                || (from > to && stride.is_positive())
-            {
-                // return empty array
-                offsets.push(offsets[row_index]);
-                continue;
+        let slice_plan = compute_slice_plan::<O>(
+            len,
+            from_array.value(row_index),
+            to_array.value(row_index),
+            stride.map(|s| s.value(row_index)),
+        )?;
+
+        match slice_plan {
+            SlicePlan::Empty => offsets.push(offsets[row_index]),
+            SlicePlan::Contiguous {
+                start: rel_start,
+                len: slice_len,
+            } => {
+                let start_index = (start + rel_start).to_usize().unwrap();
+                let end_index = (start + rel_start + slice_len).to_usize().unwrap();
+                mutable.extend(0, start_index, end_index);
+                offsets.push(offsets[row_index] + slice_len);
             }
-
-            let stride: O = stride.try_into().map_err(|_| {
-                internal_datafusion_err!("array_slice got invalid stride: {}", stride)
-            })?;
-
-            if from <= to && stride > O::zero() {
-                assert!(start + to <= end);
-                if stride.eq(&O::one()) {
-                    // stride is default to 1
-                    mutable.extend(
-                        0,
-                        (start + from).to_usize().unwrap(),
-                        (start + to + O::usize_as(1)).to_usize().unwrap(),
-                    );
-                    offsets.push(offsets[row_index] + (to - from + O::usize_as(1)));
-                    continue;
-                }
-                let mut index = start + from;
-                let mut cnt = 0;
-                while index <= start + to {
-                    mutable.extend(
-                        0,
-                        index.to_usize().unwrap(),
-                        index.to_usize().unwrap() + 1,
-                    );
-                    index += stride;
-                    cnt += 1;
-                }
-                offsets.push(offsets[row_index] + O::usize_as(cnt));
-            } else {
-                let mut index = start + from;
-                let mut cnt = 0;
-                while index >= start + to {
-                    mutable.extend(
-                        0,
-                        index.to_usize().unwrap(),
-                        index.to_usize().unwrap() + 1,
-                    );
-                    index += stride;
-                    cnt += 1;
+            SlicePlan::Indices(indices) => {
+                let count = indices.len();
+                for rel_index in indices {
+                    let absolute_index = (start + rel_index).to_usize().unwrap();
+                    mutable.extend(0, absolute_index, absolute_index + 1);
                 }
-                // invalid range, return empty array
-                offsets.push(offsets[row_index] + O::usize_as(cnt));
+                offsets.push(offsets[row_index] + O::usize_as(count));
             }
-        } else {
-            // invalid range, return empty array
-            offsets.push(offsets[row_index]);
         }
     }
 
@@ -646,6 +688,107 @@ where
     )?))
 }
 
+fn general_list_view_array_slice<O: OffsetSizeTrait>(
+    array: &GenericListViewArray<O>,
+    from_array: &Int64Array,
+    to_array: &Int64Array,
+    stride: Option<&Int64Array>,
+) -> Result<ArrayRef>
+where
+    i64: TryInto<O>,
+{
+    let values = array.values();
+    let original_data = values.to_data();
+    let capacity = Capacities::Array(original_data.len());
+    let field = match array.data_type() {
+        ListView(field) | LargeListView(field) => Arc::clone(field),
+        other => {
+            return internal_err!("array_slice got unexpected data type: {}", other);
+        }
+    };
+
+    let mut mutable =
+        MutableArrayData::with_capacities(vec![&original_data], true, capacity);
+
+    // We must build `offsets` and `sizes` buffers manually as ListView does not enforce
+    // monotonically increasing offsets.
+    let mut offsets = Vec::with_capacity(array.len());
+    let mut sizes = Vec::with_capacity(array.len());
+    let mut current_offset = O::usize_as(0);
+    let mut null_builder = NullBufferBuilder::new(array.len());
+
+    for row_index in 0..array.len() {
+        // Propagate NULL semantics: any NULL input yields a NULL output slot.
+        if array.is_null(row_index)
+            || from_array.is_null(row_index)
+            || to_array.is_null(row_index)
+            || stride.is_some_and(|s| s.is_null(row_index))
+        {
+            null_builder.append_null();
+            offsets.push(current_offset);
+            sizes.push(O::usize_as(0));
+            continue;
+        }
+        null_builder.append_non_null();
+
+        let len = array.value_size(row_index);
+
+        // Empty arrays always return an empty array.
+        if len == O::usize_as(0) {
+            offsets.push(current_offset);
+            sizes.push(O::usize_as(0));
+            continue;
+        }
+
+        let slice_plan = compute_slice_plan::<O>(
+            len,
+            from_array.value(row_index),
+            to_array.value(row_index),
+            stride.map(|s| s.value(row_index)),
+        )?;
+
+        let start = array.value_offset(row_index);
+        match slice_plan {
+            SlicePlan::Empty => {
+                offsets.push(current_offset);
+                sizes.push(O::usize_as(0));
+            }
+            SlicePlan::Contiguous {
+                start: rel_start,
+                len: slice_len,
+            } => {
+                let start_index = (start + rel_start).to_usize().unwrap();
+                let end_index = (start + rel_start + slice_len).to_usize().unwrap();
+                mutable.extend(0, start_index, end_index);
+                offsets.push(current_offset);
+                sizes.push(slice_len);
+                current_offset += slice_len;
+            }
+            SlicePlan::Indices(indices) => {
+                let count = indices.len();
+                for rel_index in indices {
+                    let absolute_index = (start + rel_index).to_usize().unwrap();
+                    mutable.extend(0, absolute_index, absolute_index + 1);
+                }
+                let length = O::usize_as(count);
+                offsets.push(current_offset);
+                sizes.push(length);
+                current_offset += length;
+            }
+        }
+    }
+
+    let data = mutable.freeze();
+
+    Ok(Arc::new(GenericListViewArray::<O>::try_new(
+        field,
+        ScalarBuffer::from(offsets),
+        ScalarBuffer::from(sizes),
+        arrow::array::make_array(data),
+        null_builder.finish(),
+    )?))
+}
+
 #[user_doc(
     doc_section(label = "Array Functions"),
     description = "Returns the array without the first element.",
@@ -663,7 +806,7 @@ where
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayPopFront {
     signature: Signature,
     aliases: Vec<String>,
@@ -672,15 +815,7 @@ pub(super) struct ArrayPopFront {
 impl ArrayPopFront {
     pub fn new() -> Self {
         Self {
-            signature: Signature {
-                type_signature: TypeSignature::ArraySignature(
-                    ArrayFunctionSignature::Array {
-                        arguments: vec![ArrayFunctionArgument::Array],
-                        array_coercion: Some(ListCoercion::FixedSizedListToList),
-                    },
-                ),
-                volatility: Volatility::Immutable,
-            },
+            signature: Signature::array(Volatility::Immutable),
             aliases: vec![String::from("list_pop_front")],
         }
     }
@@ -702,10 +837,7 @@ impl ScalarUDFImpl for ArrayPopFront {
         Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_pop_front_inner)(&args.args)
     }
 
@@ -730,10 +862,7 @@ fn array_pop_front_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
             let array = as_large_list_array(&args[0])?;
             general_pop_front_list::<i64>(array)
         }
-        _ => exec_err!(
-            "array_pop_front does not support type: {:?}",
-            array_data_type
-        ),
+        _ => exec_err!("array_pop_front does not support type: {}", array_data_type),
     }
 }
 
@@ -770,7 +899,7 @@ where
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayPopBack {
     signature: Signature,
     aliases: Vec<String>,
@@ -779,15 +908,7 @@ pub(super) struct ArrayPopBack {
 impl ArrayPopBack {
     pub fn new() -> Self {
         Self {
-            signature: Signature {
-                type_signature: TypeSignature::ArraySignature(
-                    ArrayFunctionSignature::Array {
-                        arguments: vec![ArrayFunctionArgument::Array],
-                        array_coercion: Some(ListCoercion::FixedSizedListToList),
-                    },
-                ),
-                volatility: Volatility::Immutable,
-            },
+            signature: Signature::array(Volatility::Immutable),
             aliases: vec![String::from("list_pop_back")],
         }
     }
@@ -809,10 +930,7 @@ impl ScalarUDFImpl for ArrayPopBack {
         Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_pop_back_inner)(&args.args)
     }
 
@@ -839,7 +957,7 @@ fn array_pop_back_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
             general_pop_back_list::<i64>(array)
         }
         _ => exec_err!(
-            "array_pop_back does not support type: {:?}",
+            "array_pop_back does not support type: {}",
             array.data_type()
         ),
     }
@@ -878,7 +996,7 @@ where
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayAnyValue {
     signature: Signature,
     aliases: Vec<String>,
@@ -905,19 +1023,16 @@ impl ScalarUDFImpl for ArrayAnyValue {
     }
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         match &arg_types[0] {
-            List(field)
-            | LargeList(field)
-            | FixedSizeList(field, _) => Ok(field.data_type().clone()),
+            List(field) | LargeList(field) | FixedSizeList(field, _) => {
+                Ok(field.data_type().clone())
+            }
             _ => plan_err!(
                 "array_any_value can only accept List, LargeList or FixedSizeList as the argument"
             ),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_any_value_inner)(&args.args)
     }
 
@@ -942,7 +1057,7 @@ fn array_any_value_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
             let array = as_large_list_array(&array)?;
             general_array_any_value::<i64>(array)
         }
-        data_type => exec_err!("array_any_value does not support type: {:?}", data_type),
+        data_type => exec_err!("array_any_value does not support type: {data_type}"),
     }
 }
 
@@ -998,12 +1113,28 @@ where
 
 #[cfg(test)]
 mod tests {
-    use super::array_element_udf;
+    use super::{array_element_udf, general_list_view_array_slice};
+    use arrow::array::{
+        Array, ArrayRef, GenericListViewArray, Int32Array, Int64Array, ListViewArray,
+        cast::AsArray,
+    };
+    use arrow::buffer::ScalarBuffer;
     use arrow::datatypes::{DataType, Field};
-    use datafusion_common::{Column, DFSchema};
+    use datafusion_common::{Column, DFSchema, Result};
     use datafusion_expr::expr::ScalarFunction;
     use datafusion_expr::{Expr, ExprSchemable};
     use std::collections::HashMap;
+    use std::sync::Arc;
+
+    fn list_view_values(array: &GenericListViewArray<i32>) -> Vec<Vec<i32>> {
+        (0..array.len())
+            .map(|i| {
+                let child = array.value(i);
+                let values = child.as_any().downcast_ref::<Int32Array>().unwrap();
+                values.iter().map(|v| v.unwrap()).collect()
+            })
+            .collect()
+    }
 
     // Regression test for https://github.com/apache/datafusion/issues/13755
     #[test]
@@ -1049,4 +1180,164 @@ mod tests {
             fixed_size_list_type
         );
     }
+
+    #[test]
+    fn test_array_slice_list_view_basic() -> Result<()> {
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
+        let offsets = ScalarBuffer::from(vec![0, 3]);
+        let sizes = ScalarBuffer::from(vec![3, 2]);
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let array = ListViewArray::new(field, offsets, sizes, values, None);
+
+        let from = Int64Array::from(vec![2, 1]);
+        let to = Int64Array::from(vec![3, 2]);
+
+        let result = general_list_view_array_slice::<i32>(
+            &array,
+            &from,
+            &to,
+            None::<&Int64Array>,
+        )?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        assert_eq!(list_view_values(result), vec![vec![2, 3], vec![4, 5]]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_slice_list_view_non_monotonic_offsets() -> Result<()> {
+        // First list references the tail of the values buffer, second list references the head.
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
+        let offsets = ScalarBuffer::from(vec![3, 0]);
+        let sizes = ScalarBuffer::from(vec![2, 3]);
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let array = ListViewArray::new(field, offsets, sizes, values, None);
+
+        let from = Int64Array::from(vec![1, 1]);
+        let to = Int64Array::from(vec![2, 2]);
+
+        let result = general_list_view_array_slice::<i32>(
+            &array,
+            &from,
+            &to,
+            None::<&Int64Array>,
+        )?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        assert_eq!(list_view_values(result), vec![vec![4, 5], vec![1, 2]]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_slice_list_view_negative_stride() -> Result<()> {
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
+        let offsets = ScalarBuffer::from(vec![0, 3]);
+        let sizes = ScalarBuffer::from(vec![3, 2]);
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let array = ListViewArray::new(field, offsets, sizes, values, None);
+
+        let from = Int64Array::from(vec![3, 2]);
+        let to = Int64Array::from(vec![1, 1]);
+        let stride = Int64Array::from(vec![-1, -1]);
+
+        let result =
+            general_list_view_array_slice::<i32>(&array, &from, &to, Some(&stride))?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        assert_eq!(list_view_values(result), vec![vec![3, 2, 1], vec![5, 4]]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_slice_list_view_out_of_order() -> Result<()> {
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
+        let offsets = ScalarBuffer::from(vec![3, 1, 0]);
+        let sizes = ScalarBuffer::from(vec![2, 2, 1]);
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let array = ListViewArray::new(field, offsets, sizes, values, None);
+        assert_eq!(
+            list_view_values(&array),
+            vec![vec![4, 5], vec![2, 3], vec![1]]
+        );
+
+        let from = Int64Array::from(vec![2, 2, 2]);
+        let to = Int64Array::from(vec![1, 1, 1]);
+        let stride = Int64Array::from(vec![-1, -1, -1]);
+
+        let result =
+            general_list_view_array_slice::<i32>(&array, &from, &to, Some(&stride))?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        assert_eq!(
+            list_view_values(result),
+            vec![vec![5, 4], vec![3, 2], vec![]]
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_slice_list_view_with_nulls() -> Result<()> {
+        let values: ArrayRef = Arc::new(Int32Array::from(vec![
+            Some(1),
+            None,
+            Some(3),
+            Some(4),
+            Some(5),
+        ]));
+        let offsets = ScalarBuffer::from(vec![0, 2, 5]);
+        let sizes = ScalarBuffer::from(vec![2, 3, 0]);
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let array = ListViewArray::new(field, offsets, sizes, values, None);
+
+        let from = Int64Array::from(vec![1, 1, 1]);
+        let to = Int64Array::from(vec![2, 2, 1]);
+
+        let result = general_list_view_array_slice::<i32>(&array, &from, &to, None)?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        let actual: Vec<Vec<Option<i32>>> = (0..result.len())
+            .map(|i| {
+                result
+                    .value(i)
+                    .as_any()
+                    .downcast_ref::<Int32Array>()
+                    .unwrap()
+                    .iter()
+                    .collect()
+            })
+            .collect();
+
+        assert_eq!(
+            actual,
+            vec![vec![Some(1), None], vec![Some(3), Some(4)], Vec::new(),]
+        );
+
+        // Test with NULL stride - should return NULL for rows with NULL stride
+        let stride_with_null = Int64Array::from(vec![Some(1), None, Some(1)]);
+        let result = general_list_view_array_slice::<i32>(
+            &array,
+            &from,
+            &to,
+            Some(&stride_with_null),
+        )?;
+        let result = result.as_ref().as_list_view::<i32>();
+
+        // First row: stride = 1, should return [1, None]
+        // Second row: stride = NULL, should return NULL
+        // Third row: stride = 1, empty array should return empty
+        assert!(!result.is_null(0)); // First row should not be null
+        assert!(result.is_null(1)); // Second row should be null (stride is NULL)
+        assert!(!result.is_null(2)); // Third row should not be null
+
+        let first_row: Vec<Option<i32>> = result
+            .value(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap()
+            .iter()
+            .collect();
+        assert_eq!(first_row, vec![Some(1), None]);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/functions-nested/src/flatten.rs b/datafusion/functions-nested/src/flatten.rs
index c6fa2831f4f0a..8b301a6d25929 100644
--- a/datafusion/functions-nested/src/flatten.rs
+++ b/datafusion/functions-nested/src/flatten.rs
@@ -25,11 +25,10 @@ use arrow::datatypes::{
     DataType::{FixedSizeList, LargeList, List, Null},
 };
 use datafusion_common::cast::{as_large_list_array, as_list_array};
-use datafusion_common::utils::ListCoercion;
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use datafusion_expr::{
-    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -60,7 +59,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Flatten {
     signature: Signature,
     aliases: Vec<String>,
@@ -75,15 +74,7 @@ impl Default for Flatten {
 impl Flatten {
     pub fn new() -> Self {
         Self {
-            signature: Signature {
-                type_signature: TypeSignature::ArraySignature(
-                    ArrayFunctionSignature::Array {
-                        arguments: vec![ArrayFunctionArgument::Array],
-                        array_coercion: Some(ListCoercion::FixedSizedListToList),
-                    },
-                ),
-                volatility: Volatility::Immutable,
-            },
+            signature: Signature::array(Volatility::Immutable),
             aliases: vec![],
         }
     }
@@ -104,8 +95,9 @@ impl ScalarUDFImpl for Flatten {
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         let data_type = match &arg_types[0] {
-            List(field) | FixedSizeList(field, _) => match field.data_type() {
+            List(field) => match field.data_type() {
                 List(field) | FixedSizeList(field, _) => List(Arc::clone(field)),
+                LargeList(field) => LargeList(Arc::clone(field)),
                 _ => arg_types[0].clone(),
             },
             LargeList(field) => match field.data_type() {
@@ -123,10 +115,7 @@ impl ScalarUDFImpl for Flatten {
         Ok(data_type)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(flatten_inner)(&args.args)
     }
 
@@ -139,8 +128,7 @@ impl ScalarUDFImpl for Flatten {
     }
 }
 
-/// Flatten SQL function
-pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("flatten", args)?;
 
     match array.data_type() {
@@ -153,7 +141,8 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                 List(_) => {
                     let (inner_field, inner_offsets, inner_values, _) =
                         as_list_array(&values)?.clone().into_parts();
-                    let offsets = get_offsets_for_flatten::<i32>(inner_offsets, offsets);
+                    let offsets =
+                        get_offsets_for_flatten::<i32, i32>(inner_offsets, &offsets);
                     let flattened_array = GenericListArray::<i32>::new(
                         inner_field,
                         offsets,
@@ -164,7 +153,17 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                     Ok(Arc::new(flattened_array) as ArrayRef)
                 }
                 LargeList(_) => {
-                    exec_err!("flatten does not support type '{:?}'", array.data_type())?
+                    let (inner_field, inner_offsets, inner_values, _) =
+                        as_large_list_array(&values)?.clone().into_parts();
+                    let offsets =
+                        get_offsets_for_flatten::<i64, i32>(inner_offsets, &offsets);
+                    let flattened_array = GenericListArray::<i64>::new(
+                        inner_field,
+                        offsets,
+                        inner_values,
+                        nulls,
+                    );
+                    Ok(Arc::new(flattened_array) as ArrayRef)
                 }
                 _ => Ok(Arc::clone(array) as ArrayRef),
             }
@@ -178,7 +177,7 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                 List(_) => {
                     let (inner_field, inner_offsets, inner_values, _) =
                         as_list_array(&values)?.clone().into_parts();
-                    let offsets = get_large_offsets_for_flatten(inner_offsets, offsets);
+                    let offsets = get_large_offsets_for_flatten(inner_offsets, &offsets);
                     let flattened_array = GenericListArray::<i64>::new(
                         inner_field,
                         offsets,
@@ -189,9 +188,10 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                     Ok(Arc::new(flattened_array) as ArrayRef)
                 }
                 LargeList(_) => {
-                    let (inner_field, inner_offsets, inner_values, nulls) =
+                    let (inner_field, inner_offsets, inner_values, _) =
                         as_large_list_array(&values)?.clone().into_parts();
-                    let offsets = get_offsets_for_flatten::<i64>(inner_offsets, offsets);
+                    let offsets =
+                        get_offsets_for_flatten::<i64, i64>(inner_offsets, &offsets);
                     let flattened_array = GenericListArray::<i64>::new(
                         inner_field,
                         offsets,
@@ -206,18 +206,18 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         }
         Null => Ok(Arc::clone(array)),
         _ => {
-            exec_err!("flatten does not support type '{:?}'", array.data_type())
+            exec_err!("flatten does not support type '{}'", array.data_type())
         }
     }
 }
 
 // Create new offsets that are equivalent to `flatten` the array.
-fn get_offsets_for_flatten<O: OffsetSizeTrait>(
-    offsets: OffsetBuffer<O>,
-    indexes: OffsetBuffer<O>,
+fn get_offsets_for_flatten<O: OffsetSizeTrait, P: OffsetSizeTrait>(
+    inner_offsets: OffsetBuffer<O>,
+    outer_offsets: &OffsetBuffer<P>,
 ) -> OffsetBuffer<O> {
-    let buffer = offsets.into_inner();
-    let offsets: Vec<O> = indexes
+    let buffer = inner_offsets.into_inner();
+    let offsets: Vec<O> = outer_offsets
         .iter()
         .map(|i| buffer[i.to_usize().unwrap()])
         .collect();
@@ -226,11 +226,11 @@ fn get_offsets_for_flatten<O: OffsetSizeTrait>(
 
 // Create new large offsets that are equivalent to `flatten` the array.
 fn get_large_offsets_for_flatten<O: OffsetSizeTrait, P: OffsetSizeTrait>(
-    offsets: OffsetBuffer<O>,
-    indexes: OffsetBuffer<P>,
+    inner_offsets: OffsetBuffer<O>,
+    outer_offsets: &OffsetBuffer<P>,
 ) -> OffsetBuffer<i64> {
-    let buffer = offsets.into_inner();
-    let offsets: Vec<i64> = indexes
+    let buffer = inner_offsets.into_inner();
+    let offsets: Vec<i64> = outer_offsets
         .iter()
         .map(|i| buffer[i.to_usize().unwrap()].to_i64().unwrap())
         .collect();
diff --git a/datafusion/functions-nested/src/length.rs b/datafusion/functions-nested/src/length.rs
index 0da12684158e4..77584f3ff4c1f 100644
--- a/datafusion/functions-nested/src/length.rs
+++ b/datafusion/functions-nested/src/length.rs
@@ -29,11 +29,12 @@ use arrow::datatypes::{
 use datafusion_common::cast::{
     as_fixed_size_list_array, as_generic_list_array, as_int64_array,
 };
-use datafusion_common::{exec_err, internal_datafusion_err, plan_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
-use datafusion_functions::{downcast_arg, downcast_named_arg};
+use datafusion_functions::downcast_arg;
 use datafusion_macros::user_doc;
 use std::any::Any;
 use std::sync::Arc;
@@ -64,7 +65,7 @@ make_udf_expr_and_func!(
     ),
     argument(name = "dimension", description = "Array dimension.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayLength {
     signature: Signature,
     aliases: Vec<String>,
@@ -79,7 +80,22 @@ impl Default for ArrayLength {
 impl ArrayLength {
     pub fn new() -> Self {
         Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![ArrayFunctionArgument::Array],
+                        array_coercion: None,
+                    }),
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![
+                            ArrayFunctionArgument::Array,
+                            ArrayFunctionArgument::Index,
+                        ],
+                        array_coercion: None,
+                    }),
+                ],
+                Volatility::Immutable,
+            ),
             aliases: vec![String::from("list_length")],
         }
     }
@@ -97,19 +113,11 @@ impl ScalarUDFImpl for ArrayLength {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(match arg_types[0] {
-            List(_) | LargeList(_) | FixedSizeList(_, _) => UInt64,
-            _ => {
-                return plan_err!("The array_length function can only accept List/LargeList/FixedSizeList.");
-            }
-        })
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(UInt64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_length_inner)(&args.args)
     }
 
@@ -139,8 +147,7 @@ macro_rules! array_length_impl {
     }};
 }
 
-/// Array_length SQL function
-pub fn array_length_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_length_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.len() != 1 && args.len() != 2 {
         return exec_err!("array_length expects one or two arguments");
     }
@@ -149,7 +156,7 @@ pub fn array_length_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         List(_) => general_array_length::<i32>(args),
         LargeList(_) => general_array_length::<i64>(args),
         FixedSizeList(_, _) => fixed_size_array_length(args),
-        array_type => exec_err!("array_length does not support type '{array_type:?}'"),
+        array_type => exec_err!("array_length does not support type '{array_type}'"),
     }
 }
 
diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs
index b05b53d2d8eec..99b25ec96454b 100644
--- a/datafusion/functions-nested/src/lib.rs
+++ b/datafusion/functions-nested/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Nested type Functions for [DataFusion].
 //!
@@ -32,12 +33,12 @@
 //! [DataFusion]: https://crates.io/crates/datafusion
 //!
 //! You can register the functions in this crate using the [`register_all`] function.
-//!
 
 #[macro_use]
 pub mod macros;
 
 pub mod array_has;
+pub mod arrays_zip;
 pub mod cardinality;
 pub mod concat;
 pub mod dimension;
@@ -50,10 +51,11 @@ pub mod flatten;
 pub mod length;
 pub mod make_array;
 pub mod map;
+pub mod map_entries;
 pub mod map_extract;
 pub mod map_keys;
 pub mod map_values;
-pub mod max;
+pub mod min_max;
 pub mod planner;
 pub mod position;
 pub mod range;
@@ -78,6 +80,7 @@ pub mod expr_fn {
     pub use super::array_has::array_has;
     pub use super::array_has::array_has_all;
     pub use super::array_has::array_has_any;
+    pub use super::arrays_zip::arrays_zip;
     pub use super::cardinality::cardinality;
     pub use super::concat::array_append;
     pub use super::concat::array_concat;
@@ -95,9 +98,12 @@ pub mod expr_fn {
     pub use super::flatten::flatten;
     pub use super::length::array_length;
     pub use super::make_array::make_array;
+    pub use super::map_entries::map_entries;
     pub use super::map_extract::map_extract;
     pub use super::map_keys::map_keys;
     pub use super::map_values::map_values;
+    pub use super::min_max::array_max;
+    pub use super::min_max::array_min;
     pub use super::position::array_position;
     pub use super::position::array_positions;
     pub use super::range::gen_series;
@@ -146,7 +152,8 @@ pub fn all_default_nested_functions() -> Vec<Arc<ScalarUDF>> {
         length::array_length_udf(),
         distance::array_distance_udf(),
         flatten::flatten_udf(),
-        max::array_max_udf(),
+        min_max::array_max_udf(),
+        min_max::array_min_udf(),
         sort::array_sort_udf(),
         repeat::array_repeat_udf(),
         resize::array_resize_udf(),
@@ -154,6 +161,7 @@ pub fn all_default_nested_functions() -> Vec<Arc<ScalarUDF>> {
         set_ops::array_distinct_udf(),
         set_ops::array_intersect_udf(),
         set_ops::array_union_udf(),
+        arrays_zip::arrays_zip_udf(),
         position::array_position_udf(),
         position::array_positions_udf(),
         remove::array_remove_udf(),
@@ -163,6 +171,7 @@ pub fn all_default_nested_functions() -> Vec<Arc<ScalarUDF>> {
         replace::array_replace_all_udf(),
         replace::array_replace_udf(),
         map::map_udf(),
+        map_entries::map_entries_udf(),
         map_extract::map_extract_udf(),
         map_keys::map_keys_udf(),
         map_values::map_values_udf(),
diff --git a/datafusion/functions-nested/src/macros.rs b/datafusion/functions-nested/src/macros.rs
index cec7f2fd562d6..5f12113150a40 100644
--- a/datafusion/functions-nested/src/macros.rs
+++ b/datafusion/functions-nested/src/macros.rs
@@ -41,11 +41,15 @@
 /// * `arg`: 0 or more named arguments for the function
 /// * `DOC`: documentation string for the function
 /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF`
+/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it
+///   automatically resolves to `$UDF::new()`.
 ///
 /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl
 macro_rules! make_udf_expr_and_func {
-    ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $SCALAR_UDF_FN:ident) => {
-        paste::paste! {
+    ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident) => {
+        make_udf_expr_and_func!($UDF, $EXPR_FN, $($arg)*, $DOC, $SCALAR_UDF_FN, $UDF::new);
+    };
+    ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => {
             // "fluent expr_fn" style function
             #[doc = $DOC]
             pub fn $EXPR_FN($($arg: datafusion_expr::Expr),*) -> datafusion_expr::Expr {
@@ -54,11 +58,12 @@ macro_rules! make_udf_expr_and_func {
                     vec![$($arg),*],
                 ))
             }
-            create_func!($UDF, $SCALAR_UDF_FN);
-        }
+            create_func!($UDF, $SCALAR_UDF_FN, $CTOR);
+    };
+    ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident) => {
+        make_udf_expr_and_func!($UDF, $EXPR_FN, $DOC, $SCALAR_UDF_FN, $UDF::new);
     };
-    ($UDF:ty, $EXPR_FN:ident, $DOC:expr , $SCALAR_UDF_FN:ident) => {
-        paste::paste! {
+    ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => {
             // "fluent expr_fn" style function
             #[doc = $DOC]
             pub fn $EXPR_FN(arg: Vec<datafusion_expr::Expr>) -> datafusion_expr::Expr {
@@ -67,8 +72,7 @@ macro_rules! make_udf_expr_and_func {
                     arg,
                 ))
             }
-            create_func!($UDF, $SCALAR_UDF_FN);
-        }
+            create_func!($UDF, $SCALAR_UDF_FN, $CTOR);
     };
 }
 
@@ -80,11 +84,15 @@ macro_rules! make_udf_expr_and_func {
 /// # Arguments
 /// * `UDF`: name of the [`ScalarUDFImpl`]
 /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF`
+/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it
+///   automatically resolves to `$UDF::new()`.
 ///
 /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl
 macro_rules! create_func {
-    ($UDF:ty, $SCALAR_UDF_FN:ident) => {
-        paste::paste! {
+    ($UDF:ident, $SCALAR_UDF_FN:ident) => {
+        create_func!($UDF, $SCALAR_UDF_FN, $UDF::new);
+    };
+    ($UDF:ident, $SCALAR_UDF_FN:ident, $CTOR:path) => {
             #[doc = concat!("ScalarFunction that returns a [`ScalarUDF`](datafusion_expr::ScalarUDF) for ")]
             #[doc = stringify!($UDF)]
             pub fn $SCALAR_UDF_FN() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
@@ -92,11 +100,10 @@ macro_rules! create_func {
                 static INSTANCE: std::sync::LazyLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
                     std::sync::LazyLock::new(|| {
                         std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
-                            <$UDF>::new(),
+                            $CTOR(),
                         ))
                     });
                 std::sync::Arc::clone(&INSTANCE)
             }
-        }
     };
 }
diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs
index babb039191577..0a02e9507b6fb 100644
--- a/datafusion/functions-nested/src/make_array.rs
+++ b/datafusion/functions-nested/src/make_array.rs
@@ -23,22 +23,23 @@ use std::vec;
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    new_null_array, Array, ArrayData, ArrayRef, Capacities, GenericListArray,
-    MutableArrayData, NullArray, OffsetSizeTrait,
+    Array, ArrayData, ArrayRef, Capacities, GenericListArray, MutableArrayData,
+    NullArray, OffsetSizeTrait, new_null_array,
 };
 use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::DataType;
 use arrow::datatypes::{DataType::Null, Field};
 use datafusion_common::utils::SingleRowListArrayBuilder;
-use datafusion_common::{plan_err, Result};
+use datafusion_common::{Result, plan_err};
 use datafusion_expr::binary::{
     try_type_union_resolution_with_struct, type_union_resolution,
 };
-use datafusion_expr::TypeSignature;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
+use itertools::Itertools as _;
 
 make_udf_expr_and_func!(
     MakeArray,
@@ -64,7 +65,7 @@ make_udf_expr_and_func!(
         description = "Expression to include in the output array. Can be a constant, column, or function, and any combination of arithmetic or string operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct MakeArray {
     signature: Signature,
     aliases: Vec<String>,
@@ -79,10 +80,7 @@ impl Default for MakeArray {
 impl MakeArray {
     pub fn new() -> Self {
         Self {
-            signature: Signature::one_of(
-                vec![TypeSignature::Nullary, TypeSignature::UserDefined],
-                Volatility::Immutable,
-            ),
+            signature: Signature::user_defined(Volatility::Immutable),
             aliases: vec![String::from("make_list")],
         }
     }
@@ -112,10 +110,7 @@ impl ScalarUDFImpl for MakeArray {
         Ok(DataType::new_list(element_type, true))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(make_array_inner)(&args.args)
     }
 
@@ -124,17 +119,10 @@ impl ScalarUDFImpl for MakeArray {
     }
 
     fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if let Ok(unified) = try_type_union_resolution_with_struct(arg_types) {
-            return Ok(unified);
-        }
-
-        if let Some(unified) = type_union_resolution(arg_types) {
-            Ok(vec![unified; arg_types.len()])
+        if arg_types.is_empty() {
+            Ok(vec![])
         } else {
-            plan_err!(
-                "Failed to unify argument types of {}: {arg_types:?}",
-                self.name()
-            )
+            coerce_types_inner(arg_types, self.name())
         }
     }
 
@@ -161,7 +149,7 @@ pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
             SingleRowListArrayBuilder::new(array).build_list_array(),
         ))
     } else {
-        array_array::<i32>(arrays, data_type.clone())
+        array_array::<i32>(arrays, data_type.clone(), Field::LIST_FIELD_DEFAULT_NAME)
     }
 }
 
@@ -205,9 +193,10 @@ pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
 /// └──────────────┘   └──────────────┘        └─────────────────────────────┘
 ///      col1               col2                         output
 /// ```
-fn array_array<O: OffsetSizeTrait>(
+pub fn array_array<O: OffsetSizeTrait>(
     args: &[ArrayRef],
     data_type: DataType,
+    field_name: &str,
 ) -> Result<ArrayRef> {
     // do not accept 0 arguments.
     if args.is_empty() {
@@ -250,9 +239,25 @@ fn array_array<O: OffsetSizeTrait>(
     let data = mutable.freeze();
 
     Ok(Arc::new(GenericListArray::<O>::try_new(
-        Arc::new(Field::new_list_field(data_type, true)),
+        Arc::new(Field::new(field_name, data_type, true)),
         OffsetBuffer::new(offsets.into()),
         arrow::array::make_array(data),
         None,
     )?))
 }
+
+pub fn coerce_types_inner(arg_types: &[DataType], name: &str) -> Result<Vec<DataType>> {
+    if let Ok(unified) = try_type_union_resolution_with_struct(arg_types) {
+        return Ok(unified);
+    }
+
+    if let Some(unified) = type_union_resolution(arg_types) {
+        Ok(vec![unified; arg_types.len()])
+    } else {
+        plan_err!(
+            "Failed to unify argument types of {}: [{}]",
+            name,
+            arg_types.iter().join(", ")
+        )
+    }
+}
diff --git a/datafusion/functions-nested/src/map.rs b/datafusion/functions-nested/src/map.rs
index 828f2e244112b..dad63ee01452b 100644
--- a/datafusion/functions-nested/src/map.rs
+++ b/datafusion/functions-nested/src/map.rs
@@ -25,11 +25,12 @@ use arrow::datatypes::{DataType, Field, SchemaBuilder, ToByteSlice};
 
 use datafusion_common::utils::{fixed_size_list_to_arrays, list_to_arrays};
 use datafusion_common::{
-    exec_err, utils::take_function_args, HashSet, Result, ScalarValue,
+    HashSet, Result, ScalarValue, exec_err, utils::take_function_args,
 };
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -61,11 +62,7 @@ fn make_map_batch(args: &[ColumnarValue]) -> Result<ColumnarValue> {
 
     let can_evaluate_to_const = can_evaluate_to_const(args);
 
-    // check the keys array is unique
     let keys = get_first_array_ref(keys_arg)?;
-    if keys.null_count() > 0 {
-        return exec_err!("map key cannot be null");
-    }
     let key_array = keys.as_ref();
 
     match keys_arg {
@@ -84,22 +81,31 @@ fn make_map_batch(args: &[ColumnarValue]) -> Result<ColumnarValue> {
 
             row_keys
                 .iter()
-                .try_for_each(|key| check_unique_keys(key.as_ref()))?;
+                .try_for_each(|key| validate_map_keys(key.as_ref()))?;
         }
         ColumnarValue::Scalar(_) => {
-            check_unique_keys(key_array)?;
+            validate_map_keys(key_array)?;
         }
     }
 
     let values = get_first_array_ref(values_arg)?;
-    make_map_batch_internal(keys, values, can_evaluate_to_const, keys_arg.data_type())
+
+    make_map_batch_internal(&keys, &values, can_evaluate_to_const, &keys_arg.data_type())
 }
 
-fn check_unique_keys(array: &dyn Array) -> Result<()> {
+/// Validates that map keys are non-null and unique.
+fn validate_map_keys(array: &dyn Array) -> Result<()> {
     let mut seen_keys = HashSet::with_capacity(array.len());
 
     for i in 0..array.len() {
         let key = ScalarValue::try_from_array(array, i)?;
+
+        // Validation 1: Map keys cannot be null
+        if key.is_null() {
+            return exec_err!("map key cannot be null");
+        }
+
+        // Validation 2: Map keys must be unique
         if seen_keys.contains(&key) {
             return exec_err!("map key must be unique, duplicate key found: {}", key);
         }
@@ -114,27 +120,37 @@ fn get_first_array_ref(columnar_value: &ColumnarValue) -> Result<ArrayRef> {
             ScalarValue::List(array) => Ok(array.value(0)),
             ScalarValue::LargeList(array) => Ok(array.value(0)),
             ScalarValue::FixedSizeList(array) => Ok(array.value(0)),
-            _ => exec_err!("Expected array, got {:?}", value),
+            _ => exec_err!("Expected array, got {}", value),
         },
         ColumnarValue::Array(array) => Ok(array.to_owned()),
     }
 }
 
 fn make_map_batch_internal(
-    keys: ArrayRef,
-    values: ArrayRef,
+    keys: &ArrayRef,
+    values: &ArrayRef,
     can_evaluate_to_const: bool,
-    data_type: DataType,
+    data_type: &DataType,
 ) -> Result<ColumnarValue> {
     if keys.len() != values.len() {
         return exec_err!("map requires key and value lists to have the same length");
     }
 
-    if !can_evaluate_to_const {
-        return if let DataType::LargeList(..) = data_type {
-            make_map_array_internal::<i64>(keys, values)
-        } else {
-            make_map_array_internal::<i32>(keys, values)
+    // Use the array path (make_map_array_internal) in these cases:
+    // 1. Not const evaluation (!can_evaluate_to_const) - allows scalar elimination optimization
+    // 2. NULL maps present (keys.null_count() > 0) - fast path doesn't handle NULL list elements
+    if !can_evaluate_to_const || keys.null_count() > 0 {
+        return match data_type {
+            DataType::LargeList(..) => make_map_array_internal::<i64>(keys, values),
+            DataType::List(..) => make_map_array_internal::<i32>(keys, values),
+            DataType::FixedSizeList(..) => {
+                // FixedSizeList doesn't use OffsetSizeTrait, so handle it separately
+                make_map_array_from_fixed_size_list(keys, values)
+            }
+            _ => exec_err!(
+                "Expected List, LargeList, or FixedSizeList, got {:?}",
+                data_type
+            ),
         };
     }
 
@@ -144,8 +160,8 @@ fn make_map_batch_internal(
     let mut entry_offsets_buffer = VecDeque::new();
     entry_offsets_buffer.push_back(0);
 
-    entry_struct_buffer.push_back((Arc::clone(&key_field), Arc::clone(&keys)));
-    entry_struct_buffer.push_back((Arc::clone(&value_field), Arc::clone(&values)));
+    entry_struct_buffer.push_back((Arc::clone(&key_field), Arc::clone(keys)));
+    entry_struct_buffer.push_back((Arc::clone(&value_field), Arc::clone(values)));
     entry_offsets_buffer.push_back(keys.len() as u32);
 
     let entry_struct: Vec<(Arc<Field>, ArrayRef)> = entry_struct_buffer.into();
@@ -221,7 +237,7 @@ SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]);
                         For `make_map`: The list of values to be mapped to the corresponding keys."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct MapFunc {
     signature: Signature,
 }
@@ -273,10 +289,7 @@ impl ScalarUDFImpl for MapFunc {
         ))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_map_batch(&args.args)
     }
 
@@ -353,30 +366,123 @@ fn get_element_type(data_type: &DataType) -> Result<&DataType> {
 /// +-----------+      +-----------+
 /// ```text
 fn make_map_array_internal<O: OffsetSizeTrait>(
-    keys: ArrayRef,
-    values: ArrayRef,
+    keys: &ArrayRef,
+    values: &ArrayRef,
 ) -> Result<ColumnarValue> {
-    let mut offset_buffer = vec![O::zero()];
-    let mut running_offset = O::zero();
+    // Save original data types and array length before list_to_arrays transforms them
+    let keys_data_type = keys.data_type().clone();
+    let values_data_type = values.data_type().clone();
+    let original_len = keys.len(); // This is the number of rows in the input
+
+    // Save the nulls bitmap from the original keys array (before list_to_arrays)
+    // This tells us which MAP values are NULL (not which keys within maps are null)
+    let nulls_bitmap = keys.nulls().cloned();
+
+    let keys = list_to_arrays::<O>(keys);
+    let values = list_to_arrays::<O>(values);
+
+    build_map_array(
+        &keys,
+        &values,
+        &keys_data_type,
+        &values_data_type,
+        original_len,
+        nulls_bitmap,
+    )
+}
 
-    let keys = list_to_arrays::<O>(&keys);
-    let values = list_to_arrays::<O>(&values);
+/// Helper function specifically for FixedSizeList inputs
+/// Similar to make_map_array_internal but uses fixed_size_list_to_arrays instead of list_to_arrays
+fn make_map_array_from_fixed_size_list(
+    keys: &ArrayRef,
+    values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    // Save original data types and array length
+    let keys_data_type = keys.data_type().clone();
+    let values_data_type = values.data_type().clone();
+    let original_len = keys.len();
+
+    // Save the nulls bitmap from the original keys array
+    let nulls_bitmap = keys.nulls().cloned();
+
+    let keys = fixed_size_list_to_arrays(keys);
+    let values = fixed_size_list_to_arrays(values);
+
+    build_map_array(
+        &keys,
+        &values,
+        &keys_data_type,
+        &values_data_type,
+        original_len,
+        nulls_bitmap,
+    )
+}
 
+/// Common logic to build a MapArray from decomposed list arrays
+fn build_map_array(
+    keys: &[ArrayRef],
+    values: &[ArrayRef],
+    keys_data_type: &DataType,
+    values_data_type: &DataType,
+    original_len: usize,
+    nulls_bitmap: Option<arrow::buffer::NullBuffer>,
+) -> Result<ColumnarValue> {
     let mut key_array_vec = vec![];
     let mut value_array_vec = vec![];
     for (k, v) in keys.iter().zip(values.iter()) {
-        running_offset = running_offset.add(O::usize_as(k.len()));
-        offset_buffer.push(running_offset);
         key_array_vec.push(k.as_ref());
         value_array_vec.push(v.as_ref());
     }
 
-    // concatenate all the arrays
-    let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?;
-    if flattened_keys.null_count() > 0 {
-        return exec_err!("keys cannot be null");
+    // Build offset buffer that accounts for NULL maps
+    // For each row, if it's NULL, the offset stays the same (empty range)
+    // If it's not NULL, the offset advances by the number of entries in that map
+    // NOTE: MapArray always requires i32 offsets, regardless of input list type
+    let mut running_offset = 0i32;
+    let mut offset_buffer = vec![running_offset];
+    let mut non_null_idx = 0;
+    for i in 0..original_len {
+        let is_null = nulls_bitmap.as_ref().is_some_and(|nulls| nulls.is_null(i));
+        if !is_null {
+            let entry_count = keys[non_null_idx].len();
+            // Validate that we won't overflow i32 when converting from potentially i64 offsets
+            let entry_count_i32 = i32::try_from(entry_count).map_err(|_| {
+                datafusion_common::DataFusionError::Execution(format!(
+                    "Map offset overflow: entry count {entry_count} at index {i} exceeds i32::MAX",
+                ))
+            })?;
+            running_offset =
+                running_offset.checked_add(entry_count_i32).ok_or_else(|| {
+                    datafusion_common::DataFusionError::Execution(format!(
+                    "Map offset overflow: cumulative offset exceeds i32::MAX at index {i}",
+                ))
+                })?;
+            non_null_idx += 1;
+        }
+        offset_buffer.push(running_offset);
     }
-    let flattened_values = arrow::compute::concat(value_array_vec.as_ref())?;
+
+    // concatenate all the arrays
+    // If key_array_vec is empty, it means all maps were NULL (list elements were NULL).
+    // In this case, we need to create empty arrays with the correct data type.
+    let (flattened_keys, flattened_values) = if key_array_vec.is_empty() {
+        // All maps are NULL - create empty arrays
+        // We need to infer the data type from the original keys/values arrays
+        let key_type = get_element_type(keys_data_type)?;
+        let value_type = get_element_type(values_data_type)?;
+
+        (
+            arrow::array::new_empty_array(key_type),
+            arrow::array::new_empty_array(value_type),
+        )
+    } else {
+        let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?;
+        if flattened_keys.null_count() > 0 {
+            return exec_err!("keys cannot be null");
+        }
+        let flattened_values = arrow::compute::concat(value_array_vec.as_ref())?;
+        (flattened_keys, flattened_values)
+    };
 
     let fields = vec![
         Arc::new(Field::new("key", flattened_keys.data_type().clone(), false)),
@@ -393,7 +499,7 @@ fn make_map_array_internal<O: OffsetSizeTrait>(
         .add_child_data(flattened_values.to_data())
         .build()?;
 
-    let map_data = ArrayData::builder(DataType::Map(
+    let mut map_data_builder = ArrayData::builder(DataType::Map(
         Arc::new(Field::new(
             "entries",
             struct_data.data_type().clone(),
@@ -401,9 +507,241 @@ fn make_map_array_internal<O: OffsetSizeTrait>(
         )),
         false,
     ))
-    .len(keys.len())
+    .len(original_len) // Use the original number of rows, not the filtered count
     .add_child_data(struct_data)
-    .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice()))
-    .build()?;
+    .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice()));
+
+    // Add the nulls bitmap if present (to preserve NULL map values)
+    if let Some(nulls) = nulls_bitmap {
+        map_data_builder = map_data_builder.nulls(Some(nulls));
+    }
+
+    let map_data = map_data_builder.build()?;
     Ok(ColumnarValue::Array(Arc::new(MapArray::from(map_data))))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_make_map_with_null_maps() {
+        // Test that NULL map values (entire map is NULL) are correctly handled
+        // This test directly calls make_map_batch with a List containing NULL elements
+        //
+        // Background: On main branch, the code would fail with "map key cannot be null"
+        // because it couldn't distinguish between:
+        // - NULL map (entire map is NULL) - should be allowed
+        // - null key within a map - should be rejected
+
+        // Build keys array: [['a'], NULL, ['b']]
+        // The middle NULL represents an entire NULL map, not a null key
+        let mut key_builder =
+            arrow::array::ListBuilder::new(arrow::array::StringBuilder::new());
+
+        // First map: ['a']
+        key_builder.values().append_value("a");
+        key_builder.append(true);
+
+        // Second map: NULL (entire map is NULL)
+        key_builder.append(false);
+
+        // Third map: ['b']
+        key_builder.values().append_value("b");
+        key_builder.append(true);
+
+        let keys_array = Arc::new(key_builder.finish());
+
+        // Build values array: [[1], [2], [3]]
+        let mut value_builder =
+            arrow::array::ListBuilder::new(arrow::array::Int32Builder::new());
+
+        value_builder.values().append_value(1);
+        value_builder.append(true);
+
+        value_builder.values().append_value(2);
+        value_builder.append(true);
+
+        value_builder.values().append_value(3);
+        value_builder.append(true);
+
+        let values_array = Arc::new(value_builder.finish());
+
+        // Call make_map_batch - should succeed
+        let result = make_map_batch(&[
+            ColumnarValue::Array(keys_array),
+            ColumnarValue::Array(values_array),
+        ]);
+
+        assert!(result.is_ok(), "Should handle NULL maps correctly");
+
+        // Verify the result
+        let map_array = match result.unwrap() {
+            ColumnarValue::Array(arr) => arr,
+            _ => panic!("Expected Array result"),
+        };
+
+        assert_eq!(map_array.len(), 3, "Should have 3 maps");
+        assert!(!map_array.is_null(0), "First map should not be NULL");
+        assert!(map_array.is_null(1), "Second map should be NULL");
+        assert!(!map_array.is_null(2), "Third map should not be NULL");
+    }
+
+    #[test]
+    fn test_make_map_with_null_key_within_map_should_fail() {
+        // Test that null keys WITHIN a map are properly rejected
+        // This ensures the fix doesn't accidentally allow invalid null keys
+
+        // Build keys array: [['a', NULL, 'b']]
+        // The NULL here is a null key within the map, which is invalid
+        let mut key_builder =
+            arrow::array::ListBuilder::new(arrow::array::StringBuilder::new());
+
+        key_builder.values().append_value("a");
+        key_builder.values().append_null(); // Invalid: null key
+        key_builder.values().append_value("b");
+        key_builder.append(true);
+
+        let keys_array = Arc::new(key_builder.finish());
+
+        // Build values array: [[1, 2, 3]]
+        let mut value_builder =
+            arrow::array::ListBuilder::new(arrow::array::Int32Builder::new());
+
+        value_builder.values().append_value(1);
+        value_builder.values().append_value(2);
+        value_builder.values().append_value(3);
+        value_builder.append(true);
+
+        let values_array = Arc::new(value_builder.finish());
+
+        // Call make_map_batch - should fail
+        let result = make_map_batch(&[
+            ColumnarValue::Array(keys_array),
+            ColumnarValue::Array(values_array),
+        ]);
+
+        assert!(result.is_err(), "Should reject null keys within maps");
+
+        let err_msg = result.unwrap_err().to_string();
+        assert!(
+            err_msg.contains("cannot be null"),
+            "Error should mention null keys, got: {err_msg}"
+        );
+    }
+
+    #[test]
+    fn test_make_map_with_large_list() {
+        // Test that LargeList inputs work correctly with i32 offset conversion
+        // This verifies the fix for the offset buffer type mismatch issue
+
+        // Build keys array as LargeList: [['a', 'b'], ['c']]
+        let mut key_builder =
+            arrow::array::LargeListBuilder::new(arrow::array::StringBuilder::new());
+
+        // First map: ['a', 'b']
+        key_builder.values().append_value("a");
+        key_builder.values().append_value("b");
+        key_builder.append(true);
+
+        // Second map: ['c']
+        key_builder.values().append_value("c");
+        key_builder.append(true);
+
+        let keys_array = Arc::new(key_builder.finish());
+
+        // Build values array as LargeList: [[1, 2], [3]]
+        let mut value_builder =
+            arrow::array::LargeListBuilder::new(arrow::array::Int32Builder::new());
+
+        value_builder.values().append_value(1);
+        value_builder.values().append_value(2);
+        value_builder.append(true);
+
+        value_builder.values().append_value(3);
+        value_builder.append(true);
+
+        let values_array = Arc::new(value_builder.finish());
+
+        // Call make_map_batch - should succeed
+        let result = make_map_batch(&[
+            ColumnarValue::Array(keys_array),
+            ColumnarValue::Array(values_array),
+        ]);
+
+        assert!(
+            result.is_ok(),
+            "Should handle LargeList inputs correctly: {:?}",
+            result.err()
+        );
+
+        // Verify the result
+        let map_array = match result.unwrap() {
+            ColumnarValue::Array(arr) => arr,
+            _ => panic!("Expected Array result"),
+        };
+
+        assert_eq!(map_array.len(), 2, "Should have 2 maps");
+        assert!(!map_array.is_null(0), "First map should not be NULL");
+        assert!(!map_array.is_null(1), "Second map should not be NULL");
+    }
+
+    #[test]
+    fn test_make_map_with_fixed_size_list() {
+        // Test that FixedSizeList inputs work correctly
+        // This verifies the fix for FixedSizeList support in the data type check
+
+        use arrow::array::FixedSizeListBuilder;
+
+        // Build keys array as FixedSizeList(2): [['a', 'b'], ['c', 'd']]
+        let key_values_builder = arrow::array::StringBuilder::new();
+        let mut key_builder = FixedSizeListBuilder::new(key_values_builder, 2);
+
+        // First map: ['a', 'b']
+        key_builder.values().append_value("a");
+        key_builder.values().append_value("b");
+        key_builder.append(true);
+
+        // Second map: ['c', 'd']
+        key_builder.values().append_value("c");
+        key_builder.values().append_value("d");
+        key_builder.append(true);
+
+        let keys_array = Arc::new(key_builder.finish());
+
+        // Build values array as FixedSizeList(2): [[1, 2], [3, 4]]
+        let value_values_builder = arrow::array::Int32Builder::new();
+        let mut value_builder = FixedSizeListBuilder::new(value_values_builder, 2);
+
+        value_builder.values().append_value(1);
+        value_builder.values().append_value(2);
+        value_builder.append(true);
+
+        value_builder.values().append_value(3);
+        value_builder.values().append_value(4);
+        value_builder.append(true);
+
+        let values_array = Arc::new(value_builder.finish());
+
+        // Call make_map_batch - should succeed
+        let result = make_map_batch(&[
+            ColumnarValue::Array(keys_array),
+            ColumnarValue::Array(values_array),
+        ]);
+
+        assert!(
+            result.is_ok(),
+            "Should handle FixedSizeList inputs correctly: {:?}",
+            result.err()
+        );
+
+        // Verify the result
+        let map_array = match result.unwrap() {
+            ColumnarValue::Array(arr) => arr,
+            _ => panic!("Expected Array result"),
+        };
+
+        assert_eq!(map_array.len(), 2, "Should have 2 maps");
+        assert!(!map_array.is_null(0), "First map should not be NULL");
+        assert!(!map_array.is_null(1), "Second map should not be NULL");
+    }
+}
diff --git a/datafusion/functions-nested/src/map_entries.rs b/datafusion/functions-nested/src/map_entries.rs
new file mode 100644
index 0000000000000..daa4d3100f746
--- /dev/null
+++ b/datafusion/functions-nested/src/map_entries.rs
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ScalarUDFImpl`] definitions for map_entries function.
+
+use crate::utils::{get_map_entry_field, make_scalar_function};
+use arrow::array::{Array, ArrayRef, ListArray};
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, cast::as_map_array, exec_err};
+use datafusion_expr::{
+    ArrayFunctionSignature, ColumnarValue, Documentation, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+make_udf_expr_and_func!(
+    MapEntriesFunc,
+    map_entries,
+    map,
+    "Return a list of all entries in the map.",
+    map_entries_udf
+);
+
+#[user_doc(
+    doc_section(label = "Map Functions"),
+    description = "Returns a list of all entries in the map.",
+    syntax_example = "map_entries(map)",
+    sql_example = r#"```sql
+SELECT map_entries(MAP {'a': 1, 'b': NULL, 'c': 3});
+----
+[{'key': a, 'value': 1}, {'key': b, 'value': NULL}, {'key': c, 'value': 3}]
+
+SELECT map_entries(map([100, 5], [42, 43]));
+----
+[{'key': 100, 'value': 42}, {'key': 5, 'value': 43}]
+```"#,
+    argument(
+        name = "map",
+        description = "Map expression. Can be a constant, column, or function, and any combination of map operators."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct MapEntriesFunc {
+    signature: Signature,
+}
+
+impl Default for MapEntriesFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MapEntriesFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::new(
+                TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray),
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for MapEntriesFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "map_entries"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let [map_type] = take_function_args(self.name(), arg_types)?;
+        let map_fields = get_map_entry_field(map_type)?;
+        Ok(DataType::List(Arc::new(Field::new_list_field(
+            DataType::Struct(Fields::from(vec![
+                Field::new(
+                    "key",
+                    map_fields.first().unwrap().data_type().clone(),
+                    false,
+                ),
+                Field::new(
+                    "value",
+                    map_fields.get(1).unwrap().data_type().clone(),
+                    true,
+                ),
+            ])),
+            false,
+        ))))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(map_entries_inner)(&args.args)
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+fn map_entries_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [map_arg] = take_function_args("map_entries", args)?;
+
+    let map_array = match map_arg.data_type() {
+        DataType::Map(_, _) => as_map_array(&map_arg)?,
+        _ => return exec_err!("Argument for map_entries should be a map"),
+    };
+
+    Ok(Arc::new(ListArray::new(
+        Arc::new(Field::new_list_field(
+            DataType::Struct(Fields::from(vec![
+                Field::new("key", map_array.key_type().clone(), false),
+                Field::new("value", map_array.value_type().clone(), true),
+            ])),
+            false,
+        )),
+        map_array.offsets().clone(),
+        Arc::new(map_array.entries().clone()),
+        map_array.nulls().cloned(),
+    )))
+}
diff --git a/datafusion/functions-nested/src/map_extract.rs b/datafusion/functions-nested/src/map_extract.rs
index 55ab8447c54f1..676696301e03f 100644
--- a/datafusion/functions-nested/src/map_extract.rs
+++ b/datafusion/functions-nested/src/map_extract.rs
@@ -19,14 +19,15 @@
 
 use crate::utils::{get_map_entry_field, make_scalar_function};
 use arrow::array::{
-    make_array, Array, ArrayRef, Capacities, ListArray, MapArray, MutableArrayData,
+    Array, ArrayRef, Capacities, ListArray, MapArray, MutableArrayData, make_array,
 };
 use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::{DataType, Field};
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{cast::as_map_array, exec_err, Result};
+use datafusion_common::{Result, cast::as_map_array, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -68,7 +69,7 @@ SELECT map_extract(MAP {'x': 10, 'y': NULL, 'z': 30}, 'y');
         description = "Key to extract from the map. Can be a constant, column, or function, any combination of arithmetic or string operators, or a named expression of the previously listed."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct MapExtract {
     signature: Signature,
     aliases: Vec<String>,
@@ -110,10 +111,7 @@ impl ScalarUDFImpl for MapExtract {
         ))))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(map_extract_inner)(&args.args)
     }
 
diff --git a/datafusion/functions-nested/src/map_keys.rs b/datafusion/functions-nested/src/map_keys.rs
index 0f15c06d86d15..294c01ed02ad6 100644
--- a/datafusion/functions-nested/src/map_keys.rs
+++ b/datafusion/functions-nested/src/map_keys.rs
@@ -21,10 +21,10 @@ use crate::utils::{get_map_entry_field, make_scalar_function};
 use arrow::array::{Array, ArrayRef, ListArray};
 use arrow::datatypes::{DataType, Field};
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{cast::as_map_array, exec_err, Result};
+use datafusion_common::{Result, cast::as_map_array, exec_err};
 use datafusion_expr::{
-    ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
+    ArrayFunctionSignature, ColumnarValue, Documentation, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -56,7 +56,7 @@ SELECT map_keys(map([100, 5], [42, 43]));
         description = "Map expression. Can be a constant, column, or function, and any combination of map operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct MapKeysFunc {
     signature: Signature,
 }
@@ -94,16 +94,14 @@ impl ScalarUDFImpl for MapKeysFunc {
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         let [map_type] = take_function_args(self.name(), arg_types)?;
         let map_fields = get_map_entry_field(map_type)?;
+        // internal array nullability is true to be in sync with DuckDB
         Ok(DataType::List(Arc::new(Field::new_list_field(
             map_fields.first().unwrap().data_type().clone(),
-            false,
+            true,
         ))))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(map_keys_inner)(&args.args)
     }
 
@@ -121,7 +119,8 @@ fn map_keys_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     };
 
     Ok(Arc::new(ListArray::new(
-        Arc::new(Field::new_list_field(map_array.key_type().clone(), false)),
+        // internal array nullability is true to be in sync with DuckDB
+        Arc::new(Field::new_list_field(map_array.key_type().clone(), true)),
         map_array.offsets().clone(),
         Arc::clone(map_array.keys()),
         map_array.nulls().cloned(),
diff --git a/datafusion/functions-nested/src/map_values.rs b/datafusion/functions-nested/src/map_values.rs
index 8247fdd4a74ce..52286bcf75ecf 100644
--- a/datafusion/functions-nested/src/map_values.rs
+++ b/datafusion/functions-nested/src/map_values.rs
@@ -21,10 +21,10 @@ use crate::utils::{get_map_entry_field, make_scalar_function};
 use arrow::array::{Array, ArrayRef, ListArray};
 use arrow::datatypes::{DataType, Field, FieldRef};
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{cast::as_map_array, exec_err, internal_err, Result};
+use datafusion_common::{Result, cast::as_map_array, exec_err, internal_err};
 use datafusion_expr::{
-    ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
+    ArrayFunctionSignature, ColumnarValue, Documentation, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -57,7 +57,7 @@ SELECT map_values(map([100, 5], [42, 43]));
         description = "Map expression. Can be a constant, column, or function, and any combination of map operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(crate) struct MapValuesFunc {
     signature: Signature,
 }
@@ -111,10 +111,7 @@ impl ScalarUDFImpl for MapValuesFunc {
         .into())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(map_values_inner)(&args.args)
     }
 
diff --git a/datafusion/functions-nested/src/max.rs b/datafusion/functions-nested/src/min_max.rs
similarity index 57%
rename from datafusion/functions-nested/src/max.rs
rename to datafusion/functions-nested/src/min_max.rs
index b667a7b426505..e3603b731fd89 100644
--- a/datafusion/functions-nested/src/max.rs
+++ b/datafusion/functions-nested/src/min_max.rs
@@ -20,15 +20,15 @@ use crate::utils::make_scalar_function;
 use arrow::array::{ArrayRef, GenericListArray, OffsetSizeTrait};
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{LargeList, List};
+use datafusion_common::Result;
 use datafusion_common::cast::{as_large_list_array, as_list_array};
 use datafusion_common::utils::take_function_args;
-use datafusion_common::Result;
-use datafusion_common::{exec_err, plan_err, ScalarValue};
+use datafusion_common::{ScalarValue, exec_err, plan_err};
 use datafusion_doc::Documentation;
 use datafusion_expr::{
     ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
 };
-use datafusion_functions_aggregate::min_max;
+use datafusion_functions_aggregate_common::min_max::{max_batch, min_batch};
 use datafusion_macros::user_doc;
 use itertools::Itertools;
 use std::any::Any;
@@ -58,7 +58,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayMax {
     signature: Signature,
     aliases: Vec<String>,
@@ -113,32 +113,105 @@ impl ScalarUDFImpl for ArrayMax {
     }
 }
 
-/// array_max SQL function
-///
-/// There is one argument for array_max as the array.
-/// `array_max(array)`
-///
-/// For example:
-/// > array_max(\[1, 3, 2]) -> 3
-pub fn array_max_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_max_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array] = take_function_args("array_max", args)?;
     match array.data_type() {
-        List(_) => general_array_max(as_list_array(array)?),
-        LargeList(_) => general_array_max(as_large_list_array(array)?),
+        List(_) => array_min_max_helper(as_list_array(array)?, max_batch),
+        LargeList(_) => array_min_max_helper(as_large_list_array(array)?, max_batch),
         arg_type => exec_err!("array_max does not support type: {arg_type}"),
     }
 }
 
-fn general_array_max<O: OffsetSizeTrait>(
+make_udf_expr_and_func!(
+    ArrayMin,
+    array_min,
+    array,
+    "returns the minimum value in the array",
+    array_min_udf
+);
+#[user_doc(
+    doc_section(label = "Array Functions"),
+    description = "Returns the minimum value in the array.",
+    syntax_example = "array_min(array)",
+    sql_example = r#"```sql
+> select array_min([3,1,4,2]);
++-----------------------------------------+
+| array_min(List([3,1,4,2]))              |
++-----------------------------------------+
+| 1                                       |
++-----------------------------------------+
+```"#,
+    argument(
+        name = "array",
+        description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct ArrayMin {
+    signature: Signature,
+}
+
+impl Default for ArrayMin {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ArrayMin {
+    fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayMin {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "array_min"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let [array] = take_function_args(self.name(), arg_types)?;
+        match array {
+            List(field) | LargeList(field) => Ok(field.data_type().clone()),
+            arg_type => plan_err!("{} does not support type {}", self.name(), arg_type),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(array_min_inner)(&args.args)
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+fn array_min_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [array] = take_function_args("array_min", args)?;
+    match array.data_type() {
+        List(_) => array_min_max_helper(as_list_array(array)?, min_batch),
+        LargeList(_) => array_min_max_helper(as_large_list_array(array)?, min_batch),
+        arg_type => exec_err!("array_min does not support type: {arg_type}"),
+    }
+}
+
+fn array_min_max_helper<O: OffsetSizeTrait>(
     array: &GenericListArray<O>,
+    agg_fn: fn(&ArrayRef) -> Result<ScalarValue>,
 ) -> Result<ArrayRef> {
     let null_value = ScalarValue::try_from(array.value_type())?;
     let result_vec: Vec<ScalarValue> = array
         .iter()
-        .map(|arr| {
-            arr.as_ref()
-                .map_or_else(|| Ok(null_value.clone()), min_max::max_batch)
-        })
+        .map(|arr| arr.as_ref().map_or_else(|| Ok(null_value.clone()), agg_fn))
         .try_collect()?;
     ScalarValue::iter_to_array(result_vec)
 }
diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs
index 369eaecb1905f..e96fdb7d4baca 100644
--- a/datafusion/functions-nested/src/planner.rs
+++ b/datafusion/functions-nested/src/planner.rs
@@ -18,15 +18,18 @@
 //! SQL planning extensions like [`NestedFunctionPlanner`] and [`FieldAccessPlanner`]
 
 use arrow::datatypes::DataType;
-use datafusion_common::ExprSchema;
-use datafusion_common::{plan_err, utils::list_ndims, DFSchema, Result};
+use datafusion_common::{DFSchema, Result, plan_err, utils::list_ndims};
+use datafusion_expr::AggregateUDF;
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams};
-use datafusion_expr::AggregateUDF;
+#[cfg(feature = "sql")]
+use datafusion_expr::sqlparser::ast::BinaryOperator;
 use datafusion_expr::{
+    Expr, ExprSchemable, GetFieldAccess,
     planner::{ExprPlanner, PlannerResult, RawBinaryExpr, RawFieldAccessExpr},
-    sqlparser, Expr, ExprSchemable, GetFieldAccess,
 };
+#[cfg(not(feature = "sql"))]
+use datafusion_expr_common::operator::Operator as BinaryOperator;
 use datafusion_functions::core::get_field as get_field_inner;
 use datafusion_functions::expr_fn::get_field;
 use datafusion_functions_aggregate::nth_value::nth_value_udaf;
@@ -34,7 +37,7 @@ use std::sync::Arc;
 
 use crate::map::map_udf;
 use crate::{
-    array_has::{array_has_all, array_has_udf},
+    array_has::array_has_all,
     expr_fn::{array_append, array_concat, array_prepend},
     extract::{array_element, array_slice},
     make_array::make_array,
@@ -51,7 +54,7 @@ impl ExprPlanner for NestedFunctionPlanner {
     ) -> Result<PlannerResult<RawBinaryExpr>> {
         let RawBinaryExpr { op, left, right } = expr;
 
-        if op == sqlparser::ast::BinaryOperator::StringConcat {
+        if op == BinaryOperator::StringConcat {
             let left_type = left.get_type(schema)?;
             let right_type = right.get_type(schema)?;
             let left_list_ndims = list_ndims(&left_type);
@@ -75,18 +78,14 @@ impl ExprPlanner for NestedFunctionPlanner {
             } else if left_list_ndims < right_list_ndims {
                 return Ok(PlannerResult::Planned(array_prepend(left, right)));
             }
-        } else if matches!(
-            op,
-            sqlparser::ast::BinaryOperator::AtArrow
-                | sqlparser::ast::BinaryOperator::ArrowAt
-        ) {
+        } else if matches!(op, BinaryOperator::AtArrow | BinaryOperator::ArrowAt) {
             let left_type = left.get_type(schema)?;
             let right_type = right.get_type(schema)?;
             let left_list_ndims = list_ndims(&left_type);
             let right_list_ndims = list_ndims(&right_type);
             // if both are list
             if left_list_ndims > 0 && right_list_ndims > 0 {
-                if op == sqlparser::ast::BinaryOperator::AtArrow {
+                if op == BinaryOperator::AtArrow {
                     // array1 @> array2 -> array_has_all(array1, array2)
                     return Ok(PlannerResult::Planned(array_has_all(left, right)));
                 } else {
@@ -108,7 +107,7 @@ impl ExprPlanner for NestedFunctionPlanner {
     }
 
     fn plan_make_map(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
-        if args.len() % 2 != 0 {
+        if !args.len().is_multiple_of(2) {
             return plan_err!("make_map requires an even number of arguments");
         }
 
@@ -121,20 +120,6 @@ impl ExprPlanner for NestedFunctionPlanner {
             ScalarFunction::new_udf(map_udf(), vec![keys, values]),
         )))
     }
-
-    fn plan_any(&self, expr: RawBinaryExpr) -> Result<PlannerResult<RawBinaryExpr>> {
-        if expr.op == sqlparser::ast::BinaryOperator::Eq {
-            Ok(PlannerResult::Planned(Expr::ScalarFunction(
-                ScalarFunction::new_udf(
-                    array_has_udf(),
-                    // left and right are reversed here so `needle=any(haystack)` -> `array_has(haystack, needle)`
-                    vec![expr.right, expr.left],
-                ),
-            )))
-        } else {
-            plan_err!("Unsupported AnyOp: '{}', only '=' is supported", expr.op)
-        }
-    }
 }
 
 #[derive(Debug)]
@@ -149,6 +134,9 @@ impl ExprPlanner for FieldAccessPlanner {
 
         match field_access {
             // expr["field"] => get_field(expr, "field")
+            // Nested accesses like expr["a"]["b"] create nested get_field calls,
+            // which are then merged by the SimplifyExpressions optimizer pass via
+            // the GetFieldFunc::simplify() method.
             GetFieldAccess::NamedStructField { name } => {
                 Ok(PlannerResult::Planned(get_field(expr, name)))
             }
@@ -177,9 +165,7 @@ impl ExprPlanner for FieldAccessPlanner {
                         )),
                     )),
                     // special case for map access with
-                    Expr::Column(ref c)
-                        if matches!(schema.data_type(c)?, DataType::Map(_, _)) =>
-                    {
+                    _ if matches!(expr.get_type(schema)?, DataType::Map(_, _)) => {
                         Ok(PlannerResult::Planned(Expr::ScalarFunction(
                             ScalarFunction::new_udf(
                                 get_field_inner(),
diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs
index b186b65407c32..acdeb202f90bc 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -17,13 +17,17 @@
 
 //! [`ScalarUDFImpl`] definitions for array_position and array_positions functions.
 
+use arrow::array::Scalar;
+use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::DataType;
 use arrow::datatypes::{
     DataType::{LargeList, List, UInt64},
     Field,
 };
+use datafusion_common::ScalarValue;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -31,13 +35,13 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    types::UInt64Type, Array, ArrayRef, GenericListArray, ListArray, OffsetSizeTrait,
-    UInt64Array,
+    Array, ArrayRef, GenericListArray, ListArray, OffsetSizeTrait, UInt64Array,
+    types::UInt64Type,
 };
 use datafusion_common::cast::{
     as_generic_list_array, as_int64_array, as_large_list_array, as_list_array,
 };
-use datafusion_common::{exec_err, internal_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use itertools::Itertools;
 
 use crate::utils::{compare_element_to_list, make_scalar_function};
@@ -52,7 +56,7 @@ make_udf_expr_and_func!(
 
 #[user_doc(
     doc_section(label = "Array Functions"),
-    description = "Returns the position of the first occurrence of the specified element in the array.",
+    description = "Returns the position of the first occurrence of the specified element in the array, or NULL if not found. Comparisons are done using `IS DISTINCT FROM` semantics, so NULL is considered to match NULL.",
     syntax_example = "array_position(array, element)\narray_position(array, element, index)",
     sql_example = r#"```sql
 > select array_position([1, 2, 2, 3, 1, 4], 2);
@@ -72,13 +76,13 @@ make_udf_expr_and_func!(
         name = "array",
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     ),
+    argument(name = "element", description = "Element to search for in the array."),
     argument(
-        name = "element",
-        description = "Element to search for position in the array."
-    ),
-    argument(name = "index", description = "Index at which to start searching.")
+        name = "index",
+        description = "Index at which to start searching (1-indexed)."
+    )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayPosition {
     signature: Signature,
     aliases: Vec<String>,
@@ -120,11 +124,11 @@ impl ScalarUDFImpl for ArrayPosition {
         Ok(UInt64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        make_scalar_function(array_position_inner)(&args.args)
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        match try_array_position_scalar(&args.args)? {
+            Some(result) => Ok(result),
+            None => make_scalar_function(array_position_inner)(&args.args),
+        }
     }
 
     fn aliases(&self) -> &[String] {
@@ -136,66 +140,206 @@ impl ScalarUDFImpl for ArrayPosition {
     }
 }
 
-/// Array_position SQL function
-pub fn array_position_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+/// Attempts the scalar-needle fast path for `array_position`.
+fn try_array_position_scalar(args: &[ColumnarValue]) -> Result<Option<ColumnarValue>> {
+    if args.len() < 2 || args.len() > 3 {
+        return exec_err!("array_position expects two or three arguments");
+    }
+
+    // Fallback to the generic code path if the needle is an array
+    let scalar_needle = match &args[1] {
+        ColumnarValue::Scalar(s) => s,
+        ColumnarValue::Array(_) => return Ok(None),
+    };
+
+    // `not_distinct` doesn't support nested types (List, Struct, etc.),
+    // so fall back to the generic code path for those.
+    if scalar_needle.data_type().is_nested() {
+        return Ok(None);
+    }
+
+    // Determine batch length from whichever argument is columnar;
+    // if all inputs are scalar, batch length is 1.
+    let (num_rows, all_inputs_scalar) = match (&args[0], args.get(2)) {
+        (ColumnarValue::Array(a), _) => (a.len(), false),
+        (_, Some(ColumnarValue::Array(a))) => (a.len(), false),
+        _ => (1, true),
+    };
+
+    let needle = scalar_needle.to_array_of_size(1)?;
+    let haystack = args[0].to_array(num_rows)?;
+    let arr_from = resolve_start_from(args.get(2), num_rows)?;
+
+    let result = match haystack.data_type() {
+        List(_) => {
+            let list = as_list_array(&haystack)?;
+            array_position_scalar::<i32>(list, &needle, &arr_from)
+        }
+        LargeList(_) => {
+            let list = as_large_list_array(&haystack)?;
+            array_position_scalar::<i64>(list, &needle, &arr_from)
+        }
+        t => exec_err!("array_position does not support type '{t}'"),
+    }?;
+
+    if all_inputs_scalar {
+        Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?)))
+    } else {
+        Ok(Some(ColumnarValue::Array(result)))
+    }
+}
+
+fn array_position_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.len() < 2 || args.len() > 3 {
         return exec_err!("array_position expects two or three arguments");
     }
     match &args[0].data_type() {
         List(_) => general_position_dispatch::<i32>(args),
         LargeList(_) => general_position_dispatch::<i64>(args),
-        array_type => exec_err!("array_position does not support type '{array_type:?}'."),
+        dt => exec_err!("array_position does not support type '{dt}'"),
+    }
+}
+
+/// Resolves the optional `start_from` argument into a `Vec<i64>` of
+/// 0-indexed starting positions.
+fn resolve_start_from(
+    third_arg: Option<&ColumnarValue>,
+    num_rows: usize,
+) -> Result<Vec<i64>> {
+    match third_arg {
+        None => Ok(vec![0i64; num_rows]),
+        Some(ColumnarValue::Scalar(ScalarValue::Int64(Some(v)))) => {
+            Ok(vec![v - 1; num_rows])
+        }
+        Some(ColumnarValue::Scalar(s)) => {
+            exec_err!("array_position expected Int64 for start_from, got {s}")
+        }
+        Some(ColumnarValue::Array(a)) => {
+            Ok(as_int64_array(a)?.values().iter().map(|&x| x - 1).collect())
+        }
+    }
+}
+
+/// Fast path for `array_position` when the needle is scalar.
+///
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// find per-row first-match positions.
+fn array_position_scalar<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
+    arr_from: &[i64], // 0-indexed
+) -> Result<ArrayRef> {
+    crate::utils::check_datatypes("array_position", &[haystack.values(), needle])?;
+
+    if haystack.len() == 0 {
+        return Ok(Arc::new(UInt64Array::new_null(0)));
+    }
+
+    let needle_datum = Scalar::new(Arc::clone(needle));
+    let validity = haystack.nulls();
+
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let offsets = haystack.offsets();
+    let first_offset = offsets[0].as_usize();
+    let last_offset = offsets[haystack.len()].as_usize();
+    let visible_values = haystack
+        .values()
+        .slice(first_offset, last_offset - first_offset);
+
+    // `not_distinct` treats NULL=NULL as true, matching the semantics of
+    // `array_position`.
+    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &needle_datum)?;
+    let eq_bits = eq_array.values();
+
+    let mut result: Vec<Option<u64>> = Vec::with_capacity(haystack.len());
+    let mut matches = eq_bits.set_indices().peekable();
+
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
+    for i in 0..haystack.len() {
+        let start = offsets[i].as_usize() - first_offset;
+        let end = offsets[i + 1].as_usize() - first_offset;
+
+        if validity.is_some_and(|v| v.is_null(i)) {
+            // Null row -> null output; advance past matches in range
+            while matches.peek().is_some_and(|&p| p < end) {
+                matches.next();
+            }
+            result.push(None);
+            continue;
+        }
+
+        let from = arr_from[i];
+        let row_len = end - start;
+        if !(from >= 0 && (from as usize) <= row_len) {
+            return exec_err!("start_from out of bounds: {}", from + 1);
+        }
+        let search_start = start + from as usize;
+
+        // Advance past matches before search_start
+        while matches.peek().is_some_and(|&p| p < search_start) {
+            matches.next();
+        }
+
+        // First match in [search_start, end)?
+        if matches.peek().is_some_and(|&p| p < end) {
+            let pos = *matches.peek().unwrap();
+            result.push(Some((pos - start + 1) as u64));
+            // Advance past remaining matches in this row
+            while matches.peek().is_some_and(|&p| p < end) {
+                matches.next();
+            }
+        } else {
+            result.push(None);
+        }
     }
+
+    debug_assert_eq!(result.len(), haystack.len());
+    Ok(Arc::new(UInt64Array::from(result)))
 }
+
 fn general_position_dispatch<O: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let list_array = as_generic_list_array::<O>(&args[0])?;
-    let element_array = &args[1];
+    let haystack = as_generic_list_array::<O>(&args[0])?;
+    let needle = &args[1];
 
-    crate::utils::check_datatypes(
-        "array_position",
-        &[list_array.values(), element_array],
-    )?;
+    crate::utils::check_datatypes("array_position", &[haystack.values(), needle])?;
 
     let arr_from = if args.len() == 3 {
         as_int64_array(&args[2])?
             .values()
-            .to_vec()
             .iter()
             .map(|&x| x - 1)
             .collect::<Vec<_>>()
     } else {
-        vec![0; list_array.len()]
+        vec![0; haystack.len()]
     };
 
-    // if `start_from` index is out of bounds, return error
-    for (arr, &from) in list_array.iter().zip(arr_from.iter()) {
-        if let Some(arr) = arr {
-            if from < 0 || from as usize >= arr.len() {
-                return internal_err!("start_from index out of bounds");
-            }
-        } else {
-            // We will get null if we got null in the array, so we don't need to check
+    for (row, &from) in haystack.iter().zip(arr_from.iter()) {
+        if !row.is_none_or(|row| from >= 0 && (from as usize) <= row.len()) {
+            return exec_err!("start_from out of bounds: {}", from + 1);
         }
     }
 
-    generic_position::<O>(list_array, element_array, arr_from)
+    generic_position::<O>(haystack, needle, &arr_from)
 }
 
-fn generic_position<OffsetSize: OffsetSizeTrait>(
-    list_array: &GenericListArray<OffsetSize>,
-    element_array: &ArrayRef,
-    arr_from: Vec<i64>, // 0-indexed
+fn generic_position<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
+    arr_from: &[i64], // 0-indexed
 ) -> Result<ArrayRef> {
-    let mut data = Vec::with_capacity(list_array.len());
+    let mut data = Vec::with_capacity(haystack.len());
 
-    for (row_index, (list_array_row, &from)) in
-        list_array.iter().zip(arr_from.iter()).enumerate()
-    {
+    for (row_index, (row, &from)) in haystack.iter().zip(arr_from.iter()).enumerate() {
         let from = from as usize;
 
-        if let Some(list_array_row) = list_array_row {
-            let eq_array =
-                compare_element_to_list(&list_array_row, element_array, row_index, true)?;
+        if let Some(row) = row {
+            let eq_array = compare_element_to_list(&row, needle, row_index, true)?;
 
             // Collect `true`s in 1-indexed positions
             let index = eq_array
@@ -237,17 +381,20 @@ make_udf_expr_and_func!(
         name = "array",
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     ),
-    argument(
-        name = "element",
-        description = "Element to search for position in the array."
-    )
+    argument(name = "element", description = "Element to search for in the array.")
 )]
-#[derive(Debug)]
-pub(super) struct ArrayPositions {
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArrayPositions {
     signature: Signature,
     aliases: Vec<String>,
 }
 
+impl Default for ArrayPositions {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ArrayPositions {
     pub fn new() -> Self {
         Self {
@@ -273,11 +420,11 @@ impl ScalarUDFImpl for ArrayPositions {
         Ok(List(Arc::new(Field::new_list_field(UInt64, true))))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        make_scalar_function(array_positions_inner)(&args.args)
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        match try_array_positions_scalar(&args.args)? {
+            Some(result) => Ok(result),
+            None => make_scalar_function(array_positions_inner)(&args.args),
+        }
     }
 
     fn aliases(&self) -> &[String] {
@@ -289,37 +436,70 @@ impl ScalarUDFImpl for ArrayPositions {
     }
 }
 
-/// Array_positions SQL function
-pub fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let [array, element] = take_function_args("array_positions", args)?;
+/// Attempts the scalar-needle fast path for `array_positions`.
+fn try_array_positions_scalar(args: &[ColumnarValue]) -> Result<Option<ColumnarValue>> {
+    let [haystack_arg, needle_arg] = take_function_args("array_positions", args)?;
+
+    let scalar_needle = match needle_arg {
+        ColumnarValue::Scalar(s) => s,
+        ColumnarValue::Array(_) => return Ok(None),
+    };
+
+    // `not_distinct` doesn't support nested types (List, Struct, etc.),
+    // so fall back to the per-row path for those.
+    if scalar_needle.data_type().is_nested() {
+        return Ok(None);
+    }
 
-    match &array.data_type() {
+    let (num_rows, all_inputs_scalar) = match haystack_arg {
+        ColumnarValue::Array(a) => (a.len(), false),
+        ColumnarValue::Scalar(_) => (1, true),
+    };
+
+    let needle = scalar_needle.to_array_of_size(1)?;
+    let haystack = haystack_arg.to_array(num_rows)?;
+
+    let result = match haystack.data_type() {
         List(_) => {
-            let arr = as_list_array(&array)?;
-            crate::utils::check_datatypes("array_positions", &[arr.values(), element])?;
-            general_positions::<i32>(arr, element)
+            let list = as_list_array(&haystack)?;
+            array_positions_scalar::<i32>(list, &needle)
         }
         LargeList(_) => {
-            let arr = as_large_list_array(&array)?;
-            crate::utils::check_datatypes("array_positions", &[arr.values(), element])?;
-            general_positions::<i64>(arr, element)
-        }
-        array_type => {
-            exec_err!("array_positions does not support type '{array_type:?}'.")
+            let list = as_large_list_array(&haystack)?;
+            array_positions_scalar::<i64>(list, &needle)
         }
+        t => exec_err!("array_positions does not support type '{t}'"),
+    }?;
+
+    if all_inputs_scalar {
+        Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?)))
+    } else {
+        Ok(Some(ColumnarValue::Array(result)))
     }
 }
 
-fn general_positions<OffsetSize: OffsetSizeTrait>(
-    list_array: &GenericListArray<OffsetSize>,
-    element_array: &ArrayRef,
+fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [haystack, needle] = take_function_args("array_positions", args)?;
+
+    match &haystack.data_type() {
+        List(_) => general_positions::<i32>(as_list_array(&haystack)?, needle),
+        LargeList(_) => general_positions::<i64>(as_large_list_array(&haystack)?, needle),
+        dt => exec_err!("array_positions does not support type '{dt}'"),
+    }
+}
+
+fn general_positions<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
 ) -> Result<ArrayRef> {
-    let mut data = Vec::with_capacity(list_array.len());
+    crate::utils::check_datatypes("array_positions", &[haystack.values(), needle])?;
+    let mut data = Vec::with_capacity(haystack.len());
 
-    for (row_index, list_array_row) in list_array.iter().enumerate() {
-        if let Some(list_array_row) = list_array_row {
-            let eq_array =
-                compare_element_to_list(&list_array_row, element_array, row_index, true)?;
+    for (row_index, row) in haystack.iter().enumerate() {
+        if let Some(row) = row {
+            let eq_array = compare_element_to_list(&row, needle, row_index, true)?;
 
             // Collect `true`s in 1-indexed positions
             let indexes = eq_array
@@ -338,3 +518,243 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
         ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
     ))
 }
+
+/// Fast path for `array_positions` when the needle is scalar.
+///
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// collect all per-row match positions.
+fn array_positions_scalar<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
+) -> Result<ArrayRef> {
+    crate::utils::check_datatypes("array_positions", &[haystack.values(), needle])?;
+
+    let num_rows = haystack.len();
+    if num_rows == 0 {
+        return Ok(Arc::new(ListArray::try_new(
+            Arc::new(Field::new_list_field(UInt64, true)),
+            OffsetBuffer::new_zeroed(1),
+            Arc::new(UInt64Array::from(Vec::<u64>::new())),
+            None,
+        )?));
+    }
+
+    let needle_datum = Scalar::new(Arc::clone(needle));
+    let validity = haystack.nulls();
+
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let offsets = haystack.offsets();
+    let first_offset = offsets[0].as_usize();
+    let last_offset = offsets[num_rows].as_usize();
+    let visible_values = haystack
+        .values()
+        .slice(first_offset, last_offset - first_offset);
+
+    // `not_distinct` treats NULL=NULL as true, matching the semantics of
+    // `array_positions`.
+    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &needle_datum)?;
+    let eq_bits = eq_array.values();
+
+    let num_matches = eq_bits.count_set_bits();
+    let mut positions: Vec<u64> = Vec::with_capacity(num_matches);
+    let mut result_offsets: Vec<i32> = Vec::with_capacity(num_rows + 1);
+    result_offsets.push(0);
+    let mut matches = eq_bits.set_indices().peekable();
+
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
+    for i in 0..num_rows {
+        let start = offsets[i].as_usize() - first_offset;
+        let end = offsets[i + 1].as_usize() - first_offset;
+
+        if validity.is_some_and(|v| v.is_null(i)) {
+            // Null row -> null output; advance past matches in range.
+            while matches.peek().is_some_and(|&p| p < end) {
+                matches.next();
+            }
+            result_offsets.push(positions.len() as i32);
+            continue;
+        }
+
+        // Collect all matches in [start, end).
+        while let Some(pos) = matches.next_if(|&p| p < end) {
+            positions.push((pos - start + 1) as u64);
+        }
+        result_offsets.push(positions.len() as i32);
+    }
+
+    debug_assert_eq!(result_offsets.len(), num_rows + 1);
+    Ok(Arc::new(ListArray::try_new(
+        Arc::new(Field::new_list_field(UInt64, true)),
+        OffsetBuffer::new(result_offsets.into()),
+        Arc::new(UInt64Array::from(positions)),
+        validity.cloned(),
+    )?))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::AsArray;
+    use arrow::datatypes::Int32Type;
+    use datafusion_common::config::ConfigOptions;
+
+    #[test]
+    fn test_array_position_sliced_list() -> Result<()> {
+        // [[10, 20], [30, 40], [50, 60], [70, 80]]  →  slice(1,2)  →  [[30, 40], [50, 60]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20)]),
+            Some(vec![Some(30), Some(40)]),
+            Some(vec![Some(50), Some(60)]),
+            Some(vec![Some(70), Some(80)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new("return", UInt64, true));
+
+        // Search for elements that exist only in sliced-away rows:
+        // 10 is in the prefix row, 70 is in the suffix row.
+        let invoke = |needle: i32| -> Result<ArrayRef> {
+            ArrayPosition::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        let output = invoke(10)?;
+        let output = output.as_primitive::<UInt64Type>();
+        assert!(output.is_null(0));
+        assert!(output.is_null(1));
+
+        let output = invoke(70)?;
+        let output = output.as_primitive::<UInt64Type>();
+        assert!(output.is_null(0));
+        assert!(output.is_null(1));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_positions_sliced_list() -> Result<()> {
+        // [[10, 20, 30], [30, 40, 30], [50, 60, 30], [70, 80, 30]]
+        //   → slice(1,2) → [[30, 40, 30], [50, 60, 30]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20), Some(30)]),
+            Some(vec![Some(30), Some(40), Some(30)]),
+            Some(vec![Some(50), Some(60), Some(30)]),
+            Some(vec![Some(70), Some(80), Some(30)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new(
+            "return",
+            List(Arc::new(Field::new_list_field(UInt64, true))),
+            true,
+        ));
+
+        let invoke = |needle: i32| -> Result<ArrayRef> {
+            ArrayPositions::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        // Needle 30: appears at positions 1,3 in row 0 ([30,40,30])
+        // and position 3 in row 1 ([50,60,30]).
+        let output = invoke(30)?;
+        let output = output.as_list::<i32>();
+        let row0 = output.value(0);
+        let row0 = row0.as_primitive::<UInt64Type>();
+        assert_eq!(row0.values().as_ref(), &[1, 3]);
+        let row1 = output.value(1);
+        let row1 = row1.as_primitive::<UInt64Type>();
+        assert_eq!(row1.values().as_ref(), &[3]);
+
+        // Needle 10: only in the sliced-away prefix row → empty lists.
+        let output = invoke(10)?;
+        let output = output.as_list::<i32>();
+        assert!(output.value(0).is_empty());
+        assert!(output.value(1).is_empty());
+
+        // Needle 70: only in the sliced-away suffix row → empty lists.
+        let output = invoke(70)?;
+        let output = output.as_list::<i32>();
+        assert!(output.value(0).is_empty());
+        assert!(output.value(1).is_empty());
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_positions_sliced_list_with_nulls() -> Result<()> {
+        // [[1, 2], null, [3, 1], [4, 5]]  →  slice(1,2)  →  [null, [3, 1]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), Some(1)]),
+            Some(vec![Some(4), Some(5)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+        let return_field = Arc::new(Field::new(
+            "return",
+            List(Arc::new(Field::new_list_field(UInt64, true))),
+            true,
+        ));
+
+        let output = ArrayPositions::new()
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(Arc::new(sliced)),
+                    ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
+                ],
+                arg_fields: vec![Arc::clone(&haystack_field), Arc::clone(&needle_field)],
+                number_rows: 2,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::new(ConfigOptions::default()),
+            })?
+            .into_array(2)?;
+
+        let output = output.as_list::<i32>();
+        // Row 0 is null (from the sliced null row).
+        assert!(output.is_null(0));
+        // Row 1 is [3, 1] → needle 1 found at position 2.
+        assert!(!output.is_null(1));
+        let row1 = output.value(1);
+        let row1 = row1.as_primitive::<UInt64Type>();
+        assert_eq!(row1.values().as_ref(), &[2]);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs
index 637a78d158ab2..0893a1a40acef 100644
--- a/datafusion/functions-nested/src/range.rs
+++ b/datafusion/functions-nested/src/range.rs
@@ -18,30 +18,39 @@
 //! [`ScalarUDFImpl`] definitions for range and gen_series functions.
 
 use crate::utils::make_scalar_function;
-use arrow::array::{
-    builder::{Date32Builder, TimestampNanosecondBuilder},
-    temporal_conversions::as_datetime_with_timezone,
-    timezone::Tz,
-    types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType as TSNT},
-    Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullArray, NullBufferBuilder,
-    TimestampNanosecondArray,
-};
 use arrow::buffer::OffsetBuffer;
-use arrow::datatypes::{
-    DataType, DataType::*, Field, IntervalUnit::MonthDayNano, TimeUnit::Nanosecond,
+use arrow::datatypes::TimeUnit;
+use arrow::datatypes::{DataType, Field, IntervalUnit::MonthDayNano};
+use arrow::{
+    array::{
+        Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder,
+        builder::{Date32Builder, TimestampNanosecondBuilder},
+        temporal_conversions::as_datetime_with_timezone,
+        timezone::Tz,
+        types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType},
+    },
+    compute::cast,
 };
-use datafusion_common::cast::{
-    as_date32_array, as_int64_array, as_interval_mdn_array, as_timestamp_nanosecond_array,
+use datafusion_common::internal_err;
+use datafusion_common::{
+    Result, exec_datafusion_err, exec_err, not_impl_datafusion_err,
+    utils::take_function_args,
 };
 use datafusion_common::{
-    exec_datafusion_err, exec_err, internal_err, not_impl_datafusion_err,
-    utils::take_function_args, Result,
+    ScalarValue,
+    cast::{
+        as_date32_array, as_int64_array, as_interval_mdn_array,
+        as_timestamp_nanosecond_array,
+    },
+    types::{
+        NativeType, logical_date, logical_int64, logical_interval_mdn, logical_string,
+    },
 };
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
-use itertools::Itertools;
 use std::any::Any;
 use std::cmp::Ordering;
 use std::iter::from_fn;
@@ -53,13 +62,24 @@ make_udf_expr_and_func!(
     range,
     start stop step,
     "create a list of values in the range between start and stop",
-    range_udf
+    range_udf,
+    Range::new
+);
+
+make_udf_expr_and_func!(
+    GenSeries,
+    gen_series,
+    start stop step,
+    "create a list of values in the range between start and stop, include upper bound",
+    gen_series_udf,
+    Range::generate_series
 );
 
 #[user_doc(
     doc_section(label = "Array Functions"),
     description = "Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0.",
-    syntax_example = "range(start, stop, step)",
+    syntax_example = "range(stop)
+range(start, stop[, step])",
     sql_example = r#"```sql
 > select range(2, 10, 3);
 +-----------------------------------+
@@ -69,11 +89,11 @@ make_udf_expr_and_func!(
 +-----------------------------------+
 
 > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH);
-+--------------------------------------------------------------+
-| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) |
-+--------------------------------------------------------------+
++--------------------------------------------------------------------------+
+| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH)          |
++--------------------------------------------------------------------------+
 | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] |
-+--------------------------------------------------------------+
++--------------------------------------------------------------------------+
 ```"#,
     argument(
         name = "start",
@@ -88,115 +108,13 @@ make_udf_expr_and_func!(
         description = "Increase by step (cannot be 0). Steps less than a day are supported only for timestamp ranges."
     )
 )]
-#[derive(Debug)]
-pub struct Range {
-    signature: Signature,
-    aliases: Vec<String>,
-}
-
-impl Default for Range {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-impl Range {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
-        }
-    }
-}
-impl ScalarUDFImpl for Range {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-    fn name(&self) -> &str {
-        "range"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        arg_types
-            .iter()
-            .map(|arg_type| match arg_type {
-                Null => Ok(Null),
-                Int8 => Ok(Int64),
-                Int16 => Ok(Int64),
-                Int32 => Ok(Int64),
-                Int64 => Ok(Int64),
-                UInt8 => Ok(Int64),
-                UInt16 => Ok(Int64),
-                UInt32 => Ok(Int64),
-                UInt64 => Ok(Int64),
-                Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())),
-                Date32 => Ok(Date32),
-                Date64 => Ok(Date32),
-                Utf8 => Ok(Date32),
-                LargeUtf8 => Ok(Date32),
-                Utf8View => Ok(Date32),
-                Interval(_) => Ok(Interval(MonthDayNano)),
-                _ => exec_err!("Unsupported DataType"),
-            })
-            .try_collect()
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types.iter().any(|t| t.is_null()) {
-            Ok(Null)
-        } else {
-            Ok(List(Arc::new(Field::new_list_field(
-                arg_types[0].clone(),
-                true,
-            ))))
-        }
-    }
-
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = &args.args;
-
-        if args.iter().any(|arg| arg.data_type().is_null()) {
-            return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
-        }
-        match args[0].data_type() {
-            Int64 => make_scalar_function(|args| gen_range_inner(args, false))(args),
-            Date32 => make_scalar_function(|args| gen_range_date(args, false))(args),
-            Timestamp(_, _) => {
-                make_scalar_function(|args| gen_range_timestamp(args, false))(args)
-            }
-            dt => {
-                exec_err!("unsupported type for RANGE. Expected Int64, Date32 or Timestamp, got: {dt}")
-            }
-        }
-    }
-
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
-
-make_udf_expr_and_func!(
-    GenSeries,
-    gen_series,
-    start stop step,
-    "create a list of values in the range between start and stop, include upper bound",
-    gen_series_udf
-);
+struct RangeDoc {}
 
 #[user_doc(
     doc_section(label = "Array Functions"),
     description = "Similar to the range function, but it includes the upper bound.",
-    syntax_example = "generate_series(start, stop, step)",
+    syntax_example = "generate_series(stop)
+generate_series(start, stop[, step])",
     sql_example = r#"```sql
 > select generate_series(1,3);
 +------------------------------------+
@@ -218,175 +136,397 @@ make_udf_expr_and_func!(
         description = "Increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges."
     )
 )]
-#[derive(Debug)]
-pub(super) struct GenSeries {
+struct GenerateSeriesDoc {}
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct Range {
     signature: Signature,
-    aliases: Vec<String>,
+    /// `false` for range, `true` for generate_series
+    include_upper_bound: bool,
 }
-impl GenSeries {
+
+impl Default for Range {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Range {
+    fn defined_signature() -> Signature {
+        // We natively only support i64 in our implementation; so ensure we cast other integer
+        // types to it.
+        let integer = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int64,
+        );
+        // We natively only support mdn in our implementation; so ensure we cast other interval
+        // types to it.
+        let interval = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_interval_mdn()),
+            vec![TypeSignatureClass::Interval],
+            NativeType::Interval(MonthDayNano),
+        );
+        // Ideally we'd limit to only Date32 & Timestamp(Nanoseconds) as those are the implementations
+        // we have but that is difficult to do with this current API; we'll cast later on to
+        // handle such types.
+        let date = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_date()),
+            vec![TypeSignatureClass::Native(logical_string())],
+            NativeType::Date,
+        );
+        let timestamp = Coercion::new_exact(TypeSignatureClass::Timestamp);
+        Signature::one_of(
+            vec![
+                // Integer ranges
+                // Stop
+                TypeSignature::Coercible(vec![integer.clone()]),
+                // Start & stop
+                TypeSignature::Coercible(vec![integer.clone(), integer.clone()]),
+                // Start, stop & step
+                TypeSignature::Coercible(vec![integer.clone(), integer.clone(), integer]),
+                // Date range
+                TypeSignature::Coercible(vec![date.clone(), date, interval.clone()]),
+                // Timestamp range
+                TypeSignature::Coercible(vec![timestamp.clone(), timestamp, interval]),
+            ],
+            Volatility::Immutable,
+        )
+    }
+
+    /// Generate `range()` function which excludes upper bound.
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
+            signature: Self::defined_signature(),
+            include_upper_bound: false,
+        }
+    }
+
+    /// Generate `generate_series()` function which includes upper bound.
+    fn generate_series() -> Self {
+        Self {
+            signature: Self::defined_signature(),
+            include_upper_bound: true,
         }
     }
 }
-impl ScalarUDFImpl for GenSeries {
+
+impl ScalarUDFImpl for Range {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn name(&self) -> &str {
-        "generate_series"
+        if self.include_upper_bound {
+            "generate_series"
+        } else {
+            "range"
+        }
     }
 
     fn signature(&self) -> &Signature {
         &self.signature
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        arg_types
-            .iter()
-            .map(|arg_type| match arg_type {
-                Null => Ok(Null),
-                Int8 => Ok(Int64),
-                Int16 => Ok(Int64),
-                Int32 => Ok(Int64),
-                Int64 => Ok(Int64),
-                UInt8 => Ok(Int64),
-                UInt16 => Ok(Int64),
-                UInt32 => Ok(Int64),
-                UInt64 => Ok(Int64),
-                Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())),
-                Date32 => Ok(Date32),
-                Date64 => Ok(Date32),
-                Utf8 => Ok(Date32),
-                LargeUtf8 => Ok(Date32),
-                Utf8View => Ok(Date32),
-                Interval(_) => Ok(Interval(MonthDayNano)),
-                _ => exec_err!("Unsupported DataType"),
-            })
-            .try_collect()
-    }
-
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         if arg_types.iter().any(|t| t.is_null()) {
-            Ok(Null)
-        } else {
-            Ok(List(Arc::new(Field::new_list_field(
+            return Ok(DataType::Null);
+        }
+
+        match (&arg_types[0], arg_types.get(1)) {
+            // In implementation we downcast to Date32 so ensure reflect that here
+            (_, Some(DataType::Date64)) | (DataType::Date64, _) => Ok(DataType::List(
+                Arc::new(Field::new_list_field(DataType::Date32, true)),
+            )),
+            // Ensure we preserve timezone
+            (DataType::Timestamp(_, tz), _) => {
+                Ok(DataType::List(Arc::new(Field::new_list_field(
+                    DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()),
+                    true,
+                ))))
+            }
+            _ => Ok(DataType::List(Arc::new(Field::new_list_field(
                 arg_types[0].clone(),
                 true,
-            ))))
+            )))),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
 
         if args.iter().any(|arg| arg.data_type().is_null()) {
-            return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
+            return Ok(ColumnarValue::Scalar(ScalarValue::Null));
         }
         match args[0].data_type() {
-            Int64 => make_scalar_function(|args| gen_range_inner(args, true))(args),
-            Date32 => make_scalar_function(|args| gen_range_date(args, true))(args),
-            Timestamp(_, _) => {
-                make_scalar_function(|args| gen_range_timestamp(args, true))(args)
+            DataType::Int64 => {
+                make_scalar_function(|args| self.gen_range_inner(args))(args)
+            }
+            DataType::Date32 | DataType::Date64 => {
+                make_scalar_function(|args| self.gen_range_date(args))(args)
+            }
+            DataType::Timestamp(_, _) => {
+                make_scalar_function(|args| self.gen_range_timestamp(args))(args)
             }
             dt => {
-                exec_err!(
-                    "unsupported type for GENERATE_SERIES. Expected Int64, Date32 or Timestamp, got: {}",
-                    dt
+                internal_err!(
+                    "Signature failed to guard unknown input type for {}: {dt}",
+                    self.name()
                 )
             }
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
+        if self.include_upper_bound {
+            GenerateSeriesDoc {}.doc()
+        } else {
+            RangeDoc {}.doc()
+        }
     }
 }
 
-/// Generates an array of integers from start to stop with a given step.
-///
-/// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values.
-/// It returns a `Result<ArrayRef>` representing the resulting ListArray after the operation.
-///
-/// # Arguments
-///
-/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values.
-///
-/// # Examples
-///
-/// gen_range(3) => [0, 1, 2]
-/// gen_range(1, 4) => [1, 2, 3]
-/// gen_range(1, 7, 2) => [1, 3, 5]
-pub(super) fn gen_range_inner(
-    args: &[ArrayRef],
-    include_upper: bool,
-) -> Result<ArrayRef> {
-    let (start_array, stop_array, step_array) = match args.len() {
-        1 => (None, as_int64_array(&args[0])?, None),
-        2 => (
-            Some(as_int64_array(&args[0])?),
-            as_int64_array(&args[1])?,
-            None,
-        ),
-        3 => (
-            Some(as_int64_array(&args[0])?),
-            as_int64_array(&args[1])?,
-            Some(as_int64_array(&args[2])?),
-        ),
-        _ => return exec_err!("gen_range expects 1 to 3 arguments"),
-    };
-
-    let mut values = vec![];
-    let mut offsets = vec![0];
-    let mut valid = NullBufferBuilder::new(stop_array.len());
-    for (idx, stop) in stop_array.iter().enumerate() {
-        match retrieve_range_args(start_array, stop, step_array, idx) {
-            Some((_, _, 0)) => {
-                return exec_err!(
-                    "step can't be 0 for function {}(start [, stop, step])",
-                    if include_upper {
-                        "generate_series"
-                    } else {
-                        "range"
-                    }
-                );
+impl Range {
+    /// Generates an array of integers from start to stop with a given step.
+    ///
+    /// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values.
+    /// It returns a `Result<ArrayRef>` representing the resulting ListArray after the operation.
+    ///
+    /// # Arguments
+    ///
+    /// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values.
+    ///
+    /// # Examples
+    ///
+    /// gen_range(3) => [0, 1, 2]
+    /// gen_range(1, 4) => [1, 2, 3]
+    /// gen_range(1, 7, 2) => [1, 3, 5]
+    fn gen_range_inner(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let (start_array, stop_array, step_array) = match args {
+            [stop_array] => (None, as_int64_array(stop_array)?, None),
+            [start_array, stop_array] => (
+                Some(as_int64_array(start_array)?),
+                as_int64_array(stop_array)?,
+                None,
+            ),
+            [start_array, stop_array, step_array] => (
+                Some(as_int64_array(start_array)?),
+                as_int64_array(stop_array)?,
+                Some(as_int64_array(step_array)?),
+            ),
+            _ => return internal_err!("{} expects 1 to 3 arguments", self.name()),
+        };
+
+        let mut values = vec![];
+        let mut offsets = vec![0];
+        let mut valid = NullBufferBuilder::new(stop_array.len());
+        for (idx, stop) in stop_array.iter().enumerate() {
+            match retrieve_range_args(start_array, stop, step_array, idx) {
+                Some((_, _, 0)) => {
+                    return exec_err!(
+                        "step can't be 0 for function {}(start [, stop, step])",
+                        self.name()
+                    );
+                }
+                Some((start, stop, step)) => {
+                    // Below, we utilize `usize` to represent steps.
+                    // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`.
+                    let step_abs =
+                        usize::try_from(step.unsigned_abs()).map_err(|_| {
+                            not_impl_datafusion_err!("step {} can't fit into usize", step)
+                        })?;
+                    values.extend(
+                        gen_range_iter(start, stop, step < 0, self.include_upper_bound)
+                            .step_by(step_abs),
+                    );
+                    offsets.push(values.len() as i32);
+                    valid.append_non_null();
+                }
+                // If any of the arguments is NULL, append a NULL value to the result.
+                None => {
+                    offsets.push(values.len() as i32);
+                    valid.append_null();
+                }
+            };
+        }
+        let arr = Arc::new(ListArray::try_new(
+            Arc::new(Field::new_list_field(DataType::Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(Int64Array::from(values)),
+            valid.finish(),
+        )?);
+        Ok(arr)
+    }
+
+    fn gen_range_date(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let [start, stop, step] = take_function_args(self.name(), args)?;
+        let step = as_interval_mdn_array(step)?;
+
+        // Signature can only guarantee we get a date type, not specifically
+        // date32 so handle potential cast from date64 here.
+        let start = cast(start, &DataType::Date32)?;
+        let start = as_date32_array(&start)?;
+        let stop = cast(stop, &DataType::Date32)?;
+        let stop = as_date32_array(&stop)?;
+
+        // values are date32s
+        let values_builder = Date32Builder::new();
+        let mut list_builder = ListBuilder::new(values_builder);
+
+        for idx in 0..stop.len() {
+            if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) {
+                list_builder.append_null();
+                continue;
             }
-            Some((start, stop, step)) => {
-                // Below, we utilize `usize` to represent steps.
-                // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`.
-                let step_abs = usize::try_from(step.unsigned_abs()).map_err(|_| {
-                    not_impl_datafusion_err!("step {} can't fit into usize", step)
-                })?;
-                values.extend(
-                    gen_range_iter(start, stop, step < 0, include_upper)
-                        .step_by(step_abs),
-                );
-                offsets.push(values.len() as i32);
-                valid.append_non_null();
+
+            let start = start.value(idx);
+            let stop = stop.value(idx);
+            let step = step.value(idx);
+
+            let (months, days, _) = IntervalMonthDayNanoType::to_parts(step);
+            if months == 0 && days == 0 {
+                return exec_err!("Cannot generate date range less than 1 day.");
             }
-            // If any of the arguments is NULL, append a NULL value to the result.
-            None => {
-                offsets.push(values.len() as i32);
-                valid.append_null();
+
+            let stop = if !self.include_upper_bound {
+                Date32Type::subtract_month_day_nano_opt(stop, step).ok_or_else(|| {
+                    exec_datafusion_err!(
+                        "Cannot generate date range where stop {} - {step:?}) overflows",
+                        date32_to_string(stop)
+                    )
+                })?
+            } else {
+                stop
+            };
+
+            let neg = months < 0 || days < 0;
+            let mut new_date = Some(start);
+
+            let values = from_fn(|| {
+                let Some(current_date) = new_date else {
+                    return None; // previous overflow
+                };
+                if (neg && current_date < stop) || (!neg && current_date > stop) {
+                    None
+                } else {
+                    new_date = Date32Type::add_month_day_nano_opt(current_date, step);
+                    Some(Some(current_date))
+                }
+            });
+
+            list_builder.append_value(values);
+        }
+
+        let arr = Arc::new(list_builder.finish());
+
+        Ok(arr)
+    }
+
+    fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let [start, stop, step] = take_function_args(self.name(), args)?;
+        let step = as_interval_mdn_array(step)?;
+
+        // Signature can only guarantee we get a timestamp type, not specifically
+        // timestamp(ns) so handle potential cast from other timestamps here.
+        fn cast_to_ns(arr: &ArrayRef) -> Result<ArrayRef> {
+            match arr.data_type() {
+                DataType::Timestamp(TimeUnit::Nanosecond, _) => Ok(Arc::clone(arr)),
+                DataType::Timestamp(_, tz) => Ok(cast(
+                    arr,
+                    &DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()),
+                )?),
+                _ => unreachable!(),
             }
-        };
+        }
+        let start = cast_to_ns(start)?;
+        let start = as_timestamp_nanosecond_array(&start)?;
+        let stop = cast_to_ns(stop)?;
+        let stop = as_timestamp_nanosecond_array(&stop)?;
+
+        let start_tz = parse_tz(&start.timezone())?;
+        let stop_tz = parse_tz(&stop.timezone())?;
+
+        // values are timestamps
+        let values_builder = start
+            .timezone()
+            .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| {
+                TimestampNanosecondBuilder::new().with_timezone(start_tz_str)
+            });
+        let mut list_builder = ListBuilder::new(values_builder);
+
+        for idx in 0..start.len() {
+            if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) {
+                list_builder.append_null();
+                continue;
+            }
+
+            let start = start.value(idx);
+            let stop = stop.value(idx);
+            let step = step.value(idx);
+
+            let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step);
+            if months == 0 && days == 0 && ns == 0 {
+                return exec_err!("Interval argument to {} must not be 0", self.name());
+            }
+
+            let neg = TimestampNanosecondType::add_month_day_nano(start, step, start_tz)
+                .ok_or(exec_datafusion_err!(
+                    "Cannot generate timestamp range where start + step overflows"
+                ))?
+                .cmp(&start)
+                == Ordering::Less;
+
+            let stop_dt =
+                as_datetime_with_timezone::<TimestampNanosecondType>(stop, stop_tz)
+                    .ok_or(exec_datafusion_err!(
+                        "Cannot generate timestamp for stop: {}: {:?}",
+                        stop,
+                        stop_tz
+                    ))?;
+
+            let mut current = start;
+            let mut current_dt =
+                as_datetime_with_timezone::<TimestampNanosecondType>(current, start_tz)
+                    .ok_or(exec_datafusion_err!(
+                    "Cannot generate timestamp for start: {}: {:?}",
+                    current,
+                    start_tz
+                ))?;
+
+            let values = from_fn(|| {
+                let generate_series_should_end = self.include_upper_bound
+                    && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt));
+                let range_should_end = !self.include_upper_bound
+                    && ((neg && current_dt <= stop_dt)
+                        || (!neg && current_dt >= stop_dt));
+                if generate_series_should_end || range_should_end {
+                    return None;
+                }
+
+                let prev_current = current;
+
+                if let Some(ts) =
+                    TimestampNanosecondType::add_month_day_nano(current, step, start_tz)
+                {
+                    current = ts;
+                    current_dt = as_datetime_with_timezone::<TimestampNanosecondType>(
+                        current, start_tz,
+                    )?;
+
+                    Some(Some(prev_current))
+                } else {
+                    // we failed to parse the timestamp here so terminate the series
+                    None
+                }
+            });
+
+            list_builder.append_value(values);
+        }
+
+        let arr = Arc::new(list_builder.finish());
+
+        Ok(arr)
     }
-    let arr = Arc::new(ListArray::try_new(
-        Arc::new(Field::new_list_field(Int64, true)),
-        OffsetBuffer::new(offsets.into()),
-        Arc::new(Int64Array::from(values)),
-        valid.finish(),
-    )?);
-    Ok(arr)
 }
 
 /// Get the (start, stop, step) args for the range and generate_series function.
@@ -436,203 +576,17 @@ fn gen_range_iter(
     }
 }
 
-fn gen_range_date(args: &[ArrayRef], include_upper_bound: bool) -> Result<ArrayRef> {
-    let [start, stop, step] = take_function_args("range", args)?;
+fn parse_tz(tz: &Option<&str>) -> Result<Tz> {
+    let tz = tz.unwrap_or_else(|| "+00");
 
-    let (start_array, stop_array, step_array) = (
-        Some(as_date32_array(start)?),
-        as_date32_array(stop)?,
-        Some(as_interval_mdn_array(step)?),
-    );
-
-    // values are date32s
-    let values_builder = Date32Builder::new();
-    let mut list_builder = ListBuilder::new(values_builder);
-
-    for idx in 0..stop_array.len() {
-        if stop_array.is_null(idx) {
-            list_builder.append_null();
-            continue;
-        }
-        let mut stop = stop_array.value(idx);
-
-        let start = if let Some(start_array_values) = start_array {
-            if start_array_values.is_null(idx) {
-                list_builder.append_null();
-                continue;
-            }
-            start_array_values.value(idx)
-        } else {
-            list_builder.append_null();
-            continue;
-        };
-
-        let step = if let Some(step) = step_array {
-            if step.is_null(idx) {
-                list_builder.append_null();
-                continue;
-            }
-            step.value(idx)
-        } else {
-            list_builder.append_null();
-            continue;
-        };
-
-        let (months, days, _) = IntervalMonthDayNanoType::to_parts(step);
-
-        if months == 0 && days == 0 {
-            return exec_err!("Cannot generate date range less than 1 day.");
-        }
-
-        let neg = months < 0 || days < 0;
-        if !include_upper_bound {
-            stop = Date32Type::subtract_month_day_nano(stop, step);
-        }
-        let mut new_date = start;
-
-        let values = from_fn(|| {
-            if (neg && new_date < stop) || (!neg && new_date > stop) {
-                None
-            } else {
-                let current_date = new_date;
-                new_date = Date32Type::add_month_day_nano(new_date, step);
-                Some(Some(current_date))
-            }
-        });
-
-        list_builder.append_value(values);
-    }
-
-    let arr = Arc::new(list_builder.finish());
-
-    Ok(arr)
+    Tz::from_str(tz)
+        .map_err(|op| exec_datafusion_err!("failed to parse timezone {tz}: {:?}", op))
 }
 
-fn gen_range_timestamp(args: &[ArrayRef], include_upper_bound: bool) -> Result<ArrayRef> {
-    let func_name = if include_upper_bound {
-        "GENERATE_SERIES"
+fn date32_to_string(value: i32) -> String {
+    if let Some(d) = Date32Type::to_naive_date_opt(value) {
+        format!("{value} ({d})")
     } else {
-        "RANGE"
-    };
-    let [start, stop, step] = take_function_args(func_name, args)?;
-
-    // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz)
-    let (start_arr, start_tz_opt) = cast_timestamp_arg(start, include_upper_bound)?;
-    let (stop_arr, stop_tz_opt) = cast_timestamp_arg(stop, include_upper_bound)?;
-    let step_arr = as_interval_mdn_array(step)?;
-    let start_tz = parse_tz(start_tz_opt)?;
-    let stop_tz = parse_tz(stop_tz_opt)?;
-
-    // values are timestamps
-    let values_builder = start_tz_opt
-        .clone()
-        .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| {
-            TimestampNanosecondBuilder::new().with_timezone(start_tz_str)
-        });
-    let mut list_builder = ListBuilder::new(values_builder);
-
-    for idx in 0..start_arr.len() {
-        if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) {
-            list_builder.append_null();
-            continue;
-        }
-
-        let start = start_arr.value(idx);
-        let stop = stop_arr.value(idx);
-        let step = step_arr.value(idx);
-
-        let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step);
-        if months == 0 && days == 0 && ns == 0 {
-            return exec_err!(
-                "Interval argument to {} must not be 0",
-                if include_upper_bound {
-                    "GENERATE_SERIES"
-                } else {
-                    "RANGE"
-                }
-            );
-        }
-
-        let neg = TSNT::add_month_day_nano(start, step, start_tz)
-            .ok_or(exec_datafusion_err!(
-                "Cannot generate timestamp range where start + step overflows"
-            ))?
-            .cmp(&start)
-            == Ordering::Less;
-
-        let stop_dt = as_datetime_with_timezone::<TSNT>(stop, stop_tz).ok_or(
-            exec_datafusion_err!(
-                "Cannot generate timestamp for stop: {}: {:?}",
-                stop,
-                stop_tz
-            ),
-        )?;
-
-        let mut current = start;
-        let mut current_dt = as_datetime_with_timezone::<TSNT>(current, start_tz).ok_or(
-            exec_datafusion_err!(
-                "Cannot generate timestamp for start: {}: {:?}",
-                current,
-                start_tz
-            ),
-        )?;
-
-        let values = from_fn(|| {
-            if (include_upper_bound
-                && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt)))
-                || (!include_upper_bound
-                    && ((neg && current_dt <= stop_dt)
-                        || (!neg && current_dt >= stop_dt)))
-            {
-                return None;
-            }
-
-            let prev_current = current;
-
-            if let Some(ts) = TSNT::add_month_day_nano(current, step, start_tz) {
-                current = ts;
-                current_dt = as_datetime_with_timezone::<TSNT>(current, start_tz)?;
-
-                Some(Some(prev_current))
-            } else {
-                // we failed to parse the timestamp here so terminate the series
-                None
-            }
-        });
-
-        list_builder.append_value(values);
-    }
-
-    let arr = Arc::new(list_builder.finish());
-
-    Ok(arr)
-}
-
-fn cast_timestamp_arg(
-    arg: &ArrayRef,
-    include_upper: bool,
-) -> Result<(&TimestampNanosecondArray, &Option<Arc<str>>)> {
-    match arg.data_type() {
-        Timestamp(Nanosecond, tz_opt) => {
-            Ok((as_timestamp_nanosecond_array(arg)?, tz_opt))
-        }
-        _ => {
-            internal_err!(
-                "Unexpected argument type for {} : {}",
-                if include_upper {
-                    "GENERATE_SERIES"
-                } else {
-                    "RANGE"
-                },
-                arg.data_type()
-            )
-        }
+        format!("{value} (unknown date)")
     }
 }
-
-fn parse_tz(tz: &Option<Arc<str>>) -> Result<Tz> {
-    let tz = tz.as_ref().map_or_else(|| "+00", |s| s);
-
-    Tz::from_str(tz)
-        .map_err(|op| exec_datafusion_err!("failed to parse timezone {tz}: {:?}", op))
-}
diff --git a/datafusion/functions-nested/src/remove.rs b/datafusion/functions-nested/src/remove.rs
index 7f5baa18e7693..a65ecba795776 100644
--- a/datafusion/functions-nested/src/remove.rs
+++ b/datafusion/functions-nested/src/remove.rs
@@ -20,15 +20,17 @@
 use crate::utils;
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    cast::AsArray, new_empty_array, Array, ArrayRef, BooleanArray, GenericListArray,
-    OffsetSizeTrait,
+    Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, NullBufferBuilder,
+    OffsetSizeTrait, cast::AsArray, make_array,
 };
 use arrow::buffer::OffsetBuffer;
-use arrow::datatypes::{DataType, Field};
+use arrow::datatypes::{DataType, FieldRef};
 use datafusion_common::cast::as_int64_array;
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::utils::ListCoercion;
+use datafusion_common::{Result, exec_err, internal_err, utils::take_function_args};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -63,7 +65,7 @@ make_udf_expr_and_func!(
         description = "Element to be removed from the array."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayRemove {
     signature: Signature,
     aliases: Vec<String>,
@@ -97,14 +99,18 @@ impl ScalarUDFImpl for ArrayRemove {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
     }
 
-    fn invoke_with_args(
+    fn return_field_from_args(
         &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::clone(&args.arg_fields[0]))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_remove_inner)(&args.args)
     }
 
@@ -147,7 +153,7 @@ make_udf_expr_and_func!(
     ),
     argument(name = "max", description = "Number of first occurrences to remove.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayRemoveN {
     signature: Signature,
     aliases: Vec<String>,
@@ -156,7 +162,17 @@ pub(super) struct ArrayRemoveN {
 impl ArrayRemoveN {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(3, Volatility::Immutable),
+            signature: Signature::new(
+                TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                    arguments: vec![
+                        ArrayFunctionArgument::Array,
+                        ArrayFunctionArgument::Element,
+                        ArrayFunctionArgument::Index,
+                    ],
+                    array_coercion: Some(ListCoercion::FixedSizedListToList),
+                }),
+                Volatility::Immutable,
+            ),
             aliases: vec!["list_remove_n".to_string()],
         }
     }
@@ -175,14 +191,18 @@ impl ScalarUDFImpl for ArrayRemoveN {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
     }
 
-    fn invoke_with_args(
+    fn return_field_from_args(
         &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::clone(&args.arg_fields[0]))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_remove_n_inner)(&args.args)
     }
 
@@ -224,7 +244,7 @@ make_udf_expr_and_func!(
         description = "Element to be removed from the array."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayRemoveAll {
     signature: Signature,
     aliases: Vec<String>,
@@ -252,14 +272,18 @@ impl ScalarUDFImpl for ArrayRemoveAll {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
     }
 
-    fn invoke_with_args(
+    fn return_field_from_args(
         &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::clone(&args.arg_fields[0]))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_remove_all_inner)(&args.args)
     }
 
@@ -272,34 +296,31 @@ impl ScalarUDFImpl for ArrayRemoveAll {
     }
 }
 
-/// Array_remove SQL function
-pub fn array_remove_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_remove_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, element] = take_function_args("array_remove", args)?;
 
     let arr_n = vec![1; array.len()];
-    array_remove_internal(array, element, arr_n)
+    array_remove_internal(array, element, &arr_n)
 }
 
-/// Array_remove_n SQL function
-pub fn array_remove_n_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_remove_n_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, element, max] = take_function_args("array_remove_n", args)?;
 
     let arr_n = as_int64_array(max)?.values().to_vec();
-    array_remove_internal(array, element, arr_n)
+    array_remove_internal(array, element, &arr_n)
 }
 
-/// Array_remove_all SQL function
-pub fn array_remove_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_remove_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, element] = take_function_args("array_remove_all", args)?;
 
     let arr_n = vec![i64::MAX; array.len()];
-    array_remove_internal(array, element, arr_n)
+    array_remove_internal(array, element, &arr_n)
 }
 
 fn array_remove_internal(
     array: &ArrayRef,
     element_array: &ArrayRef,
-    arr_n: Vec<i64>,
+    arr_n: &[i64],
 ) -> Result<ArrayRef> {
     match array.data_type() {
         DataType::List(_) => {
@@ -311,7 +332,7 @@ fn array_remove_internal(
             general_remove::<i64>(list_array, element_array, arr_n)
         }
         array_type => {
-            exec_err!("array_remove_all does not support type '{array_type:?}'.")
+            exec_err!("array_remove_all does not support type '{array_type}'.")
         }
     }
 }
@@ -336,74 +357,520 @@ fn array_remove_internal(
 fn general_remove<OffsetSize: OffsetSizeTrait>(
     list_array: &GenericListArray<OffsetSize>,
     element_array: &ArrayRef,
-    arr_n: Vec<i64>,
+    arr_n: &[i64],
 ) -> Result<ArrayRef> {
-    let data_type = list_array.value_type();
-    let mut new_values = vec![];
+    let list_field = match list_array.data_type() {
+        DataType::List(field) | DataType::LargeList(field) => field,
+        _ => {
+            return exec_err!(
+                "Expected List or LargeList data type, got {:?}",
+                list_array.data_type()
+            );
+        }
+    };
+    let original_data = list_array.values().to_data();
     // Build up the offsets for the final output array
     let mut offsets = Vec::<OffsetSize>::with_capacity(arr_n.len() + 1);
     offsets.push(OffsetSize::zero());
 
-    // n is the number of elements to remove in this row
-    for (row_index, (list_array_row, n)) in
-        list_array.iter().zip(arr_n.iter()).enumerate()
-    {
-        match list_array_row {
-            Some(list_array_row) => {
-                let eq_array = utils::compare_element_to_list(
-                    &list_array_row,
-                    element_array,
-                    row_index,
-                    false,
-                )?;
-
-                // We need to keep at most first n elements as `false`, which represent the elements to remove.
-                let eq_array = if eq_array.false_count() < *n as usize {
-                    eq_array
-                } else {
-                    let mut count = 0;
-                    eq_array
-                        .iter()
-                        .map(|e| {
-                            // Keep first n `false` elements, and reverse other elements to `true`.
-                            if let Some(false) = e {
-                                if count < *n {
-                                    count += 1;
-                                    e
-                                } else {
-                                    Some(true)
-                                }
-                            } else {
-                                e
-                            }
-                        })
-                        .collect::<BooleanArray>()
-                };
-
-                let filtered_array = arrow::compute::filter(&list_array_row, &eq_array)?;
-                offsets.push(
-                    offsets[row_index] + OffsetSize::usize_as(filtered_array.len()),
-                );
-                new_values.push(filtered_array);
-            }
-            None => {
-                // Null element results in a null row (no new offsets)
-                offsets.push(offsets[row_index]);
+    let mut mutable = MutableArrayData::with_capacities(
+        vec![&original_data],
+        false,
+        Capacities::Array(original_data.len()),
+    );
+    let mut valid = NullBufferBuilder::new(list_array.len());
+
+    for (row_index, offset_window) in list_array.offsets().windows(2).enumerate() {
+        if list_array.is_null(row_index) || element_array.is_null(row_index) {
+            offsets.push(offsets[row_index]);
+            valid.append_null();
+            continue;
+        }
+
+        let start = offset_window[0].to_usize().unwrap();
+        let end = offset_window[1].to_usize().unwrap();
+        // n is the number of elements to remove in this row
+        let n = arr_n[row_index];
+
+        // compare each element in the list, `false` means the element matches and should be removed
+        let eq_array = utils::compare_element_to_list(
+            &list_array.value(row_index),
+            element_array,
+            row_index,
+            false,
+        )?;
+
+        let num_to_remove = eq_array.false_count();
+
+        // Fast path: no elements to remove, copy entire row
+        if num_to_remove == 0 {
+            mutable.extend(0, start, end);
+            offsets.push(offsets[row_index] + OffsetSize::usize_as(end - start));
+            valid.append_non_null();
+            continue;
+        }
+
+        // Remove at most `n` matching elements
+        let max_removals = n.min(num_to_remove as i64);
+        let mut removed = 0i64;
+        let mut copied = 0usize;
+        // marks the beginning of a range of elements pending to be copied.
+        let mut pending_batch_to_retain: Option<usize> = None;
+        for (i, keep) in eq_array.iter().enumerate() {
+            if keep == Some(false) && removed < max_removals {
+                // Flush pending batch before skipping this element
+                if let Some(bs) = pending_batch_to_retain {
+                    mutable.extend(0, start + bs, start + i);
+                    copied += i - bs;
+                    pending_batch_to_retain = None;
+                }
+                removed += 1;
+            } else if pending_batch_to_retain.is_none() {
+                pending_batch_to_retain = Some(i);
             }
         }
-    }
 
-    let values = if new_values.is_empty() {
-        new_empty_array(&data_type)
-    } else {
-        let new_values = new_values.iter().map(|x| x.as_ref()).collect::<Vec<_>>();
-        arrow::compute::concat(&new_values)?
-    };
+        // Flush remaining batch
+        if let Some(bs) = pending_batch_to_retain {
+            mutable.extend(0, start + bs, start + eq_array.len());
+            copied += eq_array.len() - bs;
+        }
 
+        offsets.push(offsets[row_index] + OffsetSize::usize_as(copied));
+        valid.append_non_null();
+    }
+
+    let new_values = make_array(mutable.freeze());
     Ok(Arc::new(GenericListArray::<OffsetSize>::try_new(
-        Arc::new(Field::new_list_field(data_type, true)),
+        Arc::clone(list_field),
         OffsetBuffer::new(offsets.into()),
-        values,
-        list_array.nulls().cloned(),
+        new_values,
+        valid.finish(),
     )?))
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::remove::{ArrayRemove, ArrayRemoveAll, ArrayRemoveN};
+    use arrow::array::{
+        Array, ArrayRef, AsArray, GenericListArray, ListArray, OffsetSizeTrait,
+    };
+    use arrow::datatypes::{DataType, Field, Int32Type};
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl};
+    use datafusion_expr_common::columnar_value::ColumnarValue;
+    use std::ops::Deref;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_array_remove_nullability() {
+        for nullability in [true, false] {
+            for item_nullability in [true, false] {
+                let input_field = Arc::new(Field::new(
+                    "num",
+                    DataType::new_list(DataType::Int32, item_nullability),
+                    nullability,
+                ));
+                let args_fields = vec![
+                    Arc::clone(&input_field),
+                    Arc::new(Field::new("a", DataType::Int32, false)),
+                ];
+                let scalar_args = vec![None, Some(&ScalarValue::Int32(Some(1)))];
+
+                let result = ArrayRemove::new()
+                    .return_field_from_args(ReturnFieldArgs {
+                        arg_fields: &args_fields,
+                        scalar_arguments: &scalar_args,
+                    })
+                    .unwrap();
+
+                assert_eq!(result, input_field);
+            }
+        }
+    }
+
+    #[test]
+    fn test_array_remove_n_nullability() {
+        for nullability in [true, false] {
+            for item_nullability in [true, false] {
+                let input_field = Arc::new(Field::new(
+                    "num",
+                    DataType::new_list(DataType::Int32, item_nullability),
+                    nullability,
+                ));
+                let args_fields = vec![
+                    Arc::clone(&input_field),
+                    Arc::new(Field::new("a", DataType::Int32, false)),
+                    Arc::new(Field::new("b", DataType::Int64, false)),
+                ];
+                let scalar_args = vec![
+                    None,
+                    Some(&ScalarValue::Int32(Some(1))),
+                    Some(&ScalarValue::Int64(Some(1))),
+                ];
+
+                let result = ArrayRemoveN::new()
+                    .return_field_from_args(ReturnFieldArgs {
+                        arg_fields: &args_fields,
+                        scalar_arguments: &scalar_args,
+                    })
+                    .unwrap();
+
+                assert_eq!(result, input_field);
+            }
+        }
+    }
+
+    #[test]
+    fn test_array_remove_all_nullability() {
+        for nullability in [true, false] {
+            for item_nullability in [true, false] {
+                let input_field = Arc::new(Field::new(
+                    "num",
+                    DataType::new_list(DataType::Int32, item_nullability),
+                    nullability,
+                ));
+                let result = ArrayRemoveAll::new()
+                    .return_field_from_args(ReturnFieldArgs {
+                        arg_fields: &[Arc::clone(&input_field)],
+                        scalar_arguments: &[None],
+                    })
+                    .unwrap();
+
+                assert_eq!(result, input_field);
+            }
+        }
+    }
+
+    fn ensure_field_nullability<O: OffsetSizeTrait>(
+        field_nullable: bool,
+        list: GenericListArray<O>,
+    ) -> GenericListArray<O> {
+        let (field, offsets, values, nulls) = list.into_parts();
+
+        if field.is_nullable() == field_nullable {
+            return GenericListArray::new(field, offsets, values, nulls);
+        }
+        if !field_nullable {
+            assert_eq!(nulls, None);
+        }
+
+        let field = Arc::new(field.deref().clone().with_nullable(field_nullable));
+
+        GenericListArray::new(field, offsets, values, nulls)
+    }
+
+    #[test]
+    fn test_array_remove_non_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 2, 2, 3, 2, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 2, 55, 63, 2]).iter().copied().map(Some)),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 2, 3, 2, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 55, 63, 2]).iter().copied().map(Some)),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove(input_list, expected_list, element_to_remove);
+    }
+
+    #[test]
+    fn test_array_remove_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![
+                    Some(1),
+                    Some(2),
+                    Some(2),
+                    Some(3),
+                    None,
+                    Some(1),
+                    Some(4),
+                ]),
+                Some(vec![Some(42), Some(2), None, Some(63), Some(2)]),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![Some(1), Some(2), Some(3), None, Some(1), Some(4)]),
+                Some(vec![Some(42), None, Some(63), Some(2)]),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove(input_list, expected_list, element_to_remove);
+    }
+
+    fn assert_array_remove(
+        input_list: ArrayRef,
+        expected_list: GenericListArray<i32>,
+        element_to_remove: ScalarValue,
+    ) {
+        assert_eq!(input_list.data_type(), expected_list.data_type());
+        assert_eq!(expected_list.value_type(), element_to_remove.data_type());
+        let input_list_len = input_list.len();
+        let input_list_data_type = input_list.data_type().clone();
+
+        let udf = ArrayRemove::new();
+        let args_fields = vec![
+            Arc::new(Field::new("num", input_list.data_type().clone(), false)),
+            Arc::new(Field::new(
+                "el",
+                element_to_remove.data_type(),
+                element_to_remove.is_null(),
+            )),
+        ];
+        let scalar_args = vec![None, Some(&element_to_remove)];
+
+        let return_field = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &args_fields,
+                scalar_arguments: &scalar_args,
+            })
+            .unwrap();
+
+        let result = udf
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(input_list),
+                    ColumnarValue::Scalar(element_to_remove),
+                ],
+                arg_fields: args_fields,
+                number_rows: input_list_len,
+                return_field,
+                config_options: Arc::new(Default::default()),
+            })
+            .unwrap();
+
+        assert_eq!(result.data_type(), input_list_data_type);
+        match result {
+            ColumnarValue::Array(array) => {
+                let result_list = array.as_list::<i32>();
+                assert_eq!(result_list, &expected_list);
+            }
+            _ => panic!("Expected ColumnarValue::Array"),
+        }
+    }
+
+    #[test]
+    fn test_array_remove_n_non_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 2, 2, 3, 2, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 2, 55, 63, 2]).iter().copied().map(Some)),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 3, 2, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 55, 63]).iter().copied().map(Some)),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove_n(input_list, expected_list, element_to_remove, 2);
+    }
+
+    #[test]
+    fn test_array_remove_n_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![
+                    Some(1),
+                    Some(2),
+                    Some(2),
+                    Some(3),
+                    None,
+                    Some(1),
+                    Some(4),
+                ]),
+                Some(vec![Some(42), Some(2), None, Some(63), Some(2)]),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![Some(1), Some(3), None, Some(1), Some(4)]),
+                Some(vec![Some(42), None, Some(63)]),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove_n(input_list, expected_list, element_to_remove, 2);
+    }
+
+    fn assert_array_remove_n(
+        input_list: ArrayRef,
+        expected_list: GenericListArray<i32>,
+        element_to_remove: ScalarValue,
+        n: i64,
+    ) {
+        assert_eq!(input_list.data_type(), expected_list.data_type());
+        assert_eq!(expected_list.value_type(), element_to_remove.data_type());
+        let input_list_len = input_list.len();
+        let input_list_data_type = input_list.data_type().clone();
+
+        let count_scalar = ScalarValue::Int64(Some(n));
+
+        let udf = ArrayRemoveN::new();
+        let args_fields = vec![
+            Arc::new(Field::new("num", input_list.data_type().clone(), false)),
+            Arc::new(Field::new(
+                "el",
+                element_to_remove.data_type(),
+                element_to_remove.is_null(),
+            )),
+            Arc::new(Field::new("count", DataType::Int64, false)),
+        ];
+        let scalar_args = vec![None, Some(&element_to_remove), Some(&count_scalar)];
+
+        let return_field = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &args_fields,
+                scalar_arguments: &scalar_args,
+            })
+            .unwrap();
+
+        let result = udf
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(input_list),
+                    ColumnarValue::Scalar(element_to_remove),
+                    ColumnarValue::Scalar(count_scalar),
+                ],
+                arg_fields: args_fields,
+                number_rows: input_list_len,
+                return_field,
+                config_options: Arc::new(Default::default()),
+            })
+            .unwrap();
+
+        assert_eq!(result.data_type(), input_list_data_type);
+        match result {
+            ColumnarValue::Array(array) => {
+                let result_list = array.as_list::<i32>();
+                assert_eq!(result_list, &expected_list);
+            }
+            _ => panic!("Expected ColumnarValue::Array"),
+        }
+    }
+
+    #[test]
+    fn test_array_remove_all_non_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 2, 2, 3, 2, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 2, 55, 63, 2]).iter().copied().map(Some)),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            false,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(([1, 3, 1, 4]).iter().copied().map(Some)),
+                Some(([42, 55, 63]).iter().copied().map(Some)),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove_all(input_list, expected_list, element_to_remove);
+    }
+
+    #[test]
+    fn test_array_remove_all_nullable() {
+        let input_list = Arc::new(ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![
+                    Some(1),
+                    Some(2),
+                    Some(2),
+                    Some(3),
+                    None,
+                    Some(1),
+                    Some(4),
+                ]),
+                Some(vec![Some(42), Some(2), None, Some(63), Some(2)]),
+            ]),
+        ));
+        let expected_list = ensure_field_nullability(
+            true,
+            ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                Some(vec![Some(1), Some(3), None, Some(1), Some(4)]),
+                Some(vec![Some(42), None, Some(63)]),
+            ]),
+        );
+
+        let element_to_remove = ScalarValue::Int32(Some(2));
+
+        assert_array_remove_all(input_list, expected_list, element_to_remove);
+    }
+
+    fn assert_array_remove_all(
+        input_list: ArrayRef,
+        expected_list: GenericListArray<i32>,
+        element_to_remove: ScalarValue,
+    ) {
+        assert_eq!(input_list.data_type(), expected_list.data_type());
+        assert_eq!(expected_list.value_type(), element_to_remove.data_type());
+        let input_list_len = input_list.len();
+        let input_list_data_type = input_list.data_type().clone();
+
+        let udf = ArrayRemoveAll::new();
+        let args_fields = vec![
+            Arc::new(Field::new("num", input_list.data_type().clone(), false)),
+            Arc::new(Field::new(
+                "el",
+                element_to_remove.data_type(),
+                element_to_remove.is_null(),
+            )),
+        ];
+        let scalar_args = vec![None, Some(&element_to_remove)];
+
+        let return_field = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &args_fields,
+                scalar_arguments: &scalar_args,
+            })
+            .unwrap();
+
+        let result = udf
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(input_list),
+                    ColumnarValue::Scalar(element_to_remove),
+                ],
+                arg_fields: args_fields,
+                number_rows: input_list_len,
+                return_field,
+                config_options: Arc::new(Default::default()),
+            })
+            .unwrap();
+
+        assert_eq!(result.data_type(), input_list_data_type);
+        match result {
+            ColumnarValue::Array(array) => {
+                let result_list = array.as_list::<i32>();
+                assert_eq!(result_list, &expected_list);
+            }
+            _ => panic!("Expected ColumnarValue::Array"),
+        }
+    }
+}
diff --git a/datafusion/functions-nested/src/repeat.rs b/datafusion/functions-nested/src/repeat.rs
index 26d67ad3113ff..8b36d0934fdd4 100644
--- a/datafusion/functions-nested/src/repeat.rs
+++ b/datafusion/functions-nested/src/repeat.rs
@@ -19,22 +19,24 @@
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    new_null_array, Array, ArrayRef, Capacities, GenericListArray, ListArray,
-    MutableArrayData, OffsetSizeTrait, UInt64Array,
+    Array, ArrayRef, BooleanBufferBuilder, GenericListArray, Int64Array, OffsetSizeTrait,
+    UInt64Array,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
 use arrow::compute;
-use arrow::compute::cast;
 use arrow::datatypes::DataType;
 use arrow::datatypes::{
     DataType::{LargeList, List},
     Field,
 };
-use datafusion_common::cast::{as_large_list_array, as_list_array, as_uint64_array};
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array};
+use datafusion_common::types::{NativeType, logical_int64};
+use datafusion_common::{DataFusionError, Result};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
+use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
 use datafusion_macros::user_doc;
 use std::any::Any;
 use std::sync::Arc;
@@ -74,7 +76,7 @@ make_udf_expr_and_func!(
         description = "Value of how many times to repeat the element."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayRepeat {
     signature: Signature,
     aliases: Vec<String>,
@@ -89,7 +91,17 @@ impl Default for ArrayRepeat {
 impl ArrayRepeat {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_exact(TypeSignatureClass::Any),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_int64()),
+                        vec![TypeSignatureClass::Integer],
+                        NativeType::Int64,
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
             aliases: vec![String::from("list_repeat")],
         }
     }
@@ -109,16 +121,20 @@ impl ScalarUDFImpl for ArrayRepeat {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(List(Arc::new(Field::new_list_field(
-            arg_types[0].clone(),
-            true,
-        ))))
+        let element_type = &arg_types[0];
+        match element_type {
+            LargeList(_) => Ok(LargeList(Arc::new(Field::new_list_field(
+                element_type.clone(),
+                true,
+            )))),
+            _ => Ok(List(Arc::new(Field::new_list_field(
+                element_type.clone(),
+                true,
+            )))),
+        }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_repeat_inner)(&args.args)
     }
 
@@ -126,40 +142,14 @@ impl ScalarUDFImpl for ArrayRepeat {
         &self.aliases
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        let [first_type, second_type] = take_function_args(self.name(), arg_types)?;
-
-        // Coerce the second argument to Int64/UInt64 if it's a numeric type
-        let second = match second_type {
-            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
-                DataType::Int64
-            }
-            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
-                DataType::UInt64
-            }
-            _ => return exec_err!("count must be an integer type"),
-        };
-
-        Ok(vec![first_type.clone(), second])
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
-/// Array_repeat SQL function
-pub fn array_repeat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_repeat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let element = &args[0];
-    let count_array = &args[1];
-
-    let count_array = match count_array.data_type() {
-        DataType::Int64 => &cast(count_array, &DataType::UInt64)?,
-        DataType::UInt64 => count_array,
-        _ => return exec_err!("count must be an integer type"),
-    };
-
-    let count_array = as_uint64_array(count_array)?;
+    let count_array = as_int64_array(&args[1])?;
 
     match element.data_type() {
         List(_) => {
@@ -188,45 +178,46 @@ pub fn array_repeat_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
 /// ```
 fn general_repeat<O: OffsetSizeTrait>(
     array: &ArrayRef,
-    count_array: &UInt64Array,
+    count_array: &Int64Array,
 ) -> Result<ArrayRef> {
-    let data_type = array.data_type();
-    let mut new_values = vec![];
-
-    let count_vec = count_array
-        .values()
-        .to_vec()
-        .iter()
-        .map(|x| *x as usize)
-        .collect::<Vec<_>>();
-
-    for (row_index, &count) in count_vec.iter().enumerate() {
-        let repeated_array = if array.is_null(row_index) {
-            new_null_array(data_type, count)
-        } else {
-            let original_data = array.to_data();
-            let capacity = Capacities::Array(count);
-            let mut mutable =
-                MutableArrayData::with_capacities(vec![&original_data], false, capacity);
-
-            for _ in 0..count {
-                mutable.extend(0, row_index, row_index + 1);
-            }
-
-            let data = mutable.freeze();
-            arrow::array::make_array(data)
-        };
-        new_values.push(repeated_array);
+    let total_repeated_values: usize = (0..count_array.len())
+        .map(|i| get_count_with_validity(count_array, i))
+        .sum();
+
+    let mut take_indices = Vec::with_capacity(total_repeated_values);
+    let mut offsets = Vec::with_capacity(count_array.len() + 1);
+    offsets.push(O::zero());
+    let mut running_offset = 0usize;
+
+    for idx in 0..count_array.len() {
+        let count = get_count_with_validity(count_array, idx);
+        running_offset = running_offset.checked_add(count).ok_or_else(|| {
+            DataFusionError::Execution(
+                "array_repeat: running_offset overflowed usize".to_string(),
+            )
+        })?;
+        let offset = O::from_usize(running_offset).ok_or_else(|| {
+            DataFusionError::Execution(format!(
+                "array_repeat: offset {running_offset} exceeds the maximum value for offset type"
+            ))
+        })?;
+        offsets.push(offset);
+        take_indices.extend(std::iter::repeat_n(idx as u64, count));
     }
 
-    let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect();
-    let values = compute::concat(&new_values)?;
+    // Build the flattened values
+    let repeated_values = compute::take(
+        array.as_ref(),
+        &UInt64Array::from_iter_values(take_indices),
+        None,
+    )?;
 
+    // Construct final ListArray
     Ok(Arc::new(GenericListArray::<O>::try_new(
-        Arc::new(Field::new_list_field(data_type.to_owned(), true)),
-        OffsetBuffer::from_lengths(count_vec),
-        values,
-        None,
+        Arc::new(Field::new_list_field(array.data_type().to_owned(), true)),
+        OffsetBuffer::new(offsets.into()),
+        repeated_values,
+        count_array.nulls().cloned(),
     )?))
 }
 
@@ -242,58 +233,95 @@ fn general_repeat<O: OffsetSizeTrait>(
 /// ```
 fn general_list_repeat<O: OffsetSizeTrait>(
     list_array: &GenericListArray<O>,
-    count_array: &UInt64Array,
+    count_array: &Int64Array,
 ) -> Result<ArrayRef> {
-    let data_type = list_array.data_type();
-    let value_type = list_array.value_type();
-    let mut new_values = vec![];
-
-    let count_vec = count_array
-        .values()
-        .to_vec()
-        .iter()
-        .map(|x| *x as usize)
-        .collect::<Vec<_>>();
-
-    for (list_array_row, &count) in list_array.iter().zip(count_vec.iter()) {
-        let list_arr = match list_array_row {
-            Some(list_array_row) => {
-                let original_data = list_array_row.to_data();
-                let capacity = Capacities::Array(original_data.len() * count);
-                let mut mutable = MutableArrayData::with_capacities(
-                    vec![&original_data],
-                    false,
-                    capacity,
-                );
-
-                for _ in 0..count {
-                    mutable.extend(0, 0, original_data.len());
-                }
-
-                let data = mutable.freeze();
-                let repeated_array = arrow::array::make_array(data);
-
-                let list_arr = GenericListArray::<O>::try_new(
-                    Arc::new(Field::new_list_field(value_type.clone(), true)),
-                    OffsetBuffer::<O>::from_lengths(vec![original_data.len(); count]),
-                    repeated_array,
-                    None,
-                )?;
-                Arc::new(list_arr) as ArrayRef
+    let list_offsets = list_array.value_offsets();
+
+    // calculate capacities for pre-allocation
+    let mut outer_total = 0usize;
+    let mut inner_total = 0usize;
+    for i in 0..count_array.len() {
+        let count = get_count_with_validity(count_array, i);
+        if count > 0 {
+            outer_total += count;
+            if list_array.is_valid(i) {
+                let len = list_offsets[i + 1].to_usize().unwrap()
+                    - list_offsets[i].to_usize().unwrap();
+                inner_total += len * count;
             }
-            None => new_null_array(data_type, count),
-        };
-        new_values.push(list_arr);
+        }
     }
 
-    let lengths = new_values.iter().map(|a| a.len()).collect::<Vec<_>>();
-    let new_values: Vec<_> = new_values.iter().map(|a| a.as_ref()).collect();
-    let values = compute::concat(&new_values)?;
+    // Build inner structures
+    let mut inner_offsets = Vec::with_capacity(outer_total + 1);
+    let mut take_indices = Vec::with_capacity(inner_total);
+    let mut inner_nulls = BooleanBufferBuilder::new(outer_total);
+    let mut inner_running = 0usize;
+    inner_offsets.push(O::zero());
+
+    for row_idx in 0..count_array.len() {
+        let count = get_count_with_validity(count_array, row_idx);
+        let list_is_valid = list_array.is_valid(row_idx);
+        let start = list_offsets[row_idx].to_usize().unwrap();
+        let end = list_offsets[row_idx + 1].to_usize().unwrap();
+        let row_len = end - start;
+
+        for _ in 0..count {
+            inner_running = inner_running.checked_add(row_len).ok_or_else(|| {
+                DataFusionError::Execution(
+                    "array_repeat: inner offset overflowed usize".to_string(),
+                )
+            })?;
+            let offset = O::from_usize(inner_running).ok_or_else(|| {
+                DataFusionError::Execution(format!(
+                    "array_repeat: offset {inner_running} exceeds the maximum value for offset type"
+                ))
+            })?;
+            inner_offsets.push(offset);
+            inner_nulls.append(list_is_valid);
+            if list_is_valid {
+                take_indices.extend(start as u64..end as u64);
+            }
+        }
+    }
 
-    Ok(Arc::new(ListArray::try_new(
-        Arc::new(Field::new_list_field(data_type.to_owned(), true)),
-        OffsetBuffer::<i32>::from_lengths(lengths),
-        values,
+    // Build inner ListArray
+    let inner_values = compute::take(
+        list_array.values().as_ref(),
+        &UInt64Array::from_iter_values(take_indices),
         None,
+    )?;
+    let inner_list = GenericListArray::<O>::try_new(
+        Arc::new(Field::new_list_field(list_array.value_type().clone(), true)),
+        OffsetBuffer::new(inner_offsets.into()),
+        inner_values,
+        Some(NullBuffer::new(inner_nulls.finish())),
+    )?;
+
+    // Build outer ListArray
+    Ok(Arc::new(GenericListArray::<O>::try_new(
+        Arc::new(Field::new_list_field(
+            list_array.data_type().to_owned(),
+            true,
+        )),
+        OffsetBuffer::<O>::from_lengths(
+            count_array
+                .iter()
+                .map(|c| c.map(|v| if v > 0 { v as usize } else { 0 }).unwrap_or(0)),
+        ),
+        Arc::new(inner_list),
+        count_array.nulls().cloned(),
     )?))
 }
+
+/// Helper function to get count from count_array at given index
+/// Return 0 for null values or non-positive count.
+#[inline]
+fn get_count_with_validity(count_array: &Int64Array, idx: usize) -> usize {
+    if count_array.is_null(idx) {
+        0
+    } else {
+        let c = count_array.value(idx);
+        if c > 0 { c as usize } else { 0 }
+    }
+}
diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs
index 3dbe672c5b028..87533356616fa 100644
--- a/datafusion/functions-nested/src/replace.rs
+++ b/datafusion/functions-nested/src/replace.rs
@@ -18,18 +18,18 @@
 //! [`ScalarUDFImpl`] definitions for array_replace, array_replace_n and array_replace_all functions.
 
 use arrow::array::{
-    new_null_array, Array, ArrayRef, AsArray, Capacities, GenericListArray,
-    MutableArrayData, NullBufferBuilder, OffsetSizeTrait,
+    Array, ArrayRef, AsArray, Capacities, GenericListArray, MutableArrayData,
+    NullBufferBuilder, OffsetSizeTrait, new_null_array,
 };
 use arrow::datatypes::{DataType, Field};
 
 use arrow::buffer::OffsetBuffer;
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::utils::ListCoercion;
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use datafusion_expr::{
     ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -78,7 +78,7 @@ make_udf_expr_and_func!(ArrayReplaceAll,
     argument(name = "from", description = "Initial element."),
     argument(name = "to", description = "Final element.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayReplace {
     signature: Signature,
     aliases: Vec<String>,
@@ -105,6 +105,7 @@ impl ArrayReplace {
                     },
                 ),
                 volatility: Volatility::Immutable,
+                parameter_names: None,
             },
             aliases: vec![String::from("list_replace")],
         }
@@ -128,10 +129,7 @@ impl ScalarUDFImpl for ArrayReplace {
         Ok(args[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_replace_inner)(&args.args)
     }
 
@@ -164,7 +162,7 @@ impl ScalarUDFImpl for ArrayReplace {
     argument(name = "to", description = "Final element."),
     argument(name = "max", description = "Number of first occurrences to replace.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayReplaceN {
     signature: Signature,
     aliases: Vec<String>,
@@ -186,6 +184,7 @@ impl ArrayReplaceN {
                     },
                 ),
                 volatility: Volatility::Immutable,
+                parameter_names: None,
             },
             aliases: vec![String::from("list_replace_n")],
         }
@@ -209,10 +208,7 @@ impl ScalarUDFImpl for ArrayReplaceN {
         Ok(args[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_replace_n_inner)(&args.args)
     }
 
@@ -244,7 +240,7 @@ impl ScalarUDFImpl for ArrayReplaceN {
     argument(name = "from", description = "Initial element."),
     argument(name = "to", description = "Final element.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct ArrayReplaceAll {
     signature: Signature,
     aliases: Vec<String>,
@@ -265,6 +261,7 @@ impl ArrayReplaceAll {
                     },
                 ),
                 volatility: Volatility::Immutable,
+                parameter_names: None,
             },
             aliases: vec![String::from("list_replace_all")],
         }
@@ -288,10 +285,7 @@ impl ScalarUDFImpl for ArrayReplaceAll {
         Ok(args[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_replace_all_inner)(&args.args)
     }
 
@@ -325,7 +319,7 @@ fn general_replace<O: OffsetSizeTrait>(
     list_array: &GenericListArray<O>,
     from_array: &ArrayRef,
     to_array: &ArrayRef,
-    arr_n: Vec<i64>,
+    arr_n: &[i64],
 ) -> Result<ArrayRef> {
     // Build up the offsets for the final output array
     let mut offsets: Vec<O> = vec![O::usize_as(0)];
@@ -415,7 +409,7 @@ fn general_replace<O: OffsetSizeTrait>(
     )?))
 }
 
-pub(crate) fn array_replace_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_replace_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, from, to] = take_function_args("array_replace", args)?;
 
     // replace at most one occurrence for each element
@@ -423,18 +417,18 @@ pub(crate) fn array_replace_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     match array.data_type() {
         DataType::List(_) => {
             let list_array = array.as_list::<i32>();
-            general_replace::<i32>(list_array, from, to, arr_n)
+            general_replace::<i32>(list_array, from, to, &arr_n)
         }
         DataType::LargeList(_) => {
             let list_array = array.as_list::<i64>();
-            general_replace::<i64>(list_array, from, to, arr_n)
+            general_replace::<i64>(list_array, from, to, &arr_n)
         }
         DataType::Null => Ok(new_null_array(array.data_type(), 1)),
-        array_type => exec_err!("array_replace does not support type '{array_type:?}'."),
+        array_type => exec_err!("array_replace does not support type '{array_type}'."),
     }
 }
 
-pub(crate) fn array_replace_n_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_replace_n_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, from, to, max] = take_function_args("array_replace_n", args)?;
 
     // replace the specified number of occurrences
@@ -442,20 +436,20 @@ pub(crate) fn array_replace_n_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     match array.data_type() {
         DataType::List(_) => {
             let list_array = array.as_list::<i32>();
-            general_replace::<i32>(list_array, from, to, arr_n)
+            general_replace::<i32>(list_array, from, to, &arr_n)
         }
         DataType::LargeList(_) => {
             let list_array = array.as_list::<i64>();
-            general_replace::<i64>(list_array, from, to, arr_n)
+            general_replace::<i64>(list_array, from, to, &arr_n)
         }
         DataType::Null => Ok(new_null_array(array.data_type(), 1)),
         array_type => {
-            exec_err!("array_replace_n does not support type '{array_type:?}'.")
+            exec_err!("array_replace_n does not support type '{array_type}'.")
         }
     }
 }
 
-pub(crate) fn array_replace_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_replace_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array, from, to] = take_function_args("array_replace_all", args)?;
 
     // replace all occurrences (up to "i64::MAX")
@@ -463,15 +457,15 @@ pub(crate) fn array_replace_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     match array.data_type() {
         DataType::List(_) => {
             let list_array = array.as_list::<i32>();
-            general_replace::<i32>(list_array, from, to, arr_n)
+            general_replace::<i32>(list_array, from, to, &arr_n)
         }
         DataType::LargeList(_) => {
             let list_array = array.as_list::<i64>();
-            general_replace::<i64>(list_array, from, to, arr_n)
+            general_replace::<i64>(list_array, from, to, &arr_n)
         }
         DataType::Null => Ok(new_null_array(array.data_type(), 1)),
         array_type => {
-            exec_err!("array_replace_all does not support type '{array_type:?}'.")
+            exec_err!("array_replace_all does not support type '{array_type}'.")
         }
     }
 }
diff --git a/datafusion/functions-nested/src/resize.rs b/datafusion/functions-nested/src/resize.rs
index 145d7e80043b8..889b8f5975301 100644
--- a/datafusion/functions-nested/src/resize.rs
+++ b/datafusion/functions-nested/src/resize.rs
@@ -19,22 +19,22 @@
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    new_null_array, Array, ArrayRef, Capacities, GenericListArray, Int64Array,
-    MutableArrayData, NullBufferBuilder, OffsetSizeTrait,
+    Array, ArrayRef, Capacities, GenericListArray, Int64Array, MutableArrayData,
+    NullBufferBuilder, OffsetSizeTrait, new_null_array,
 };
 use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::DataType;
 use arrow::datatypes::{ArrowNativeType, Field};
 use arrow::datatypes::{
-    DataType::{FixedSizeList, LargeList, List},
+    DataType::{LargeList, List},
     FieldRef,
 };
 use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array};
 use datafusion_common::utils::ListCoercion;
-use datafusion_common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_datafusion_err};
 use datafusion_expr::{
     ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -70,7 +70,7 @@ make_udf_expr_and_func!(
         description = "Defines new elements' value or empty if value is not set."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayResize {
     signature: Signature,
     aliases: Vec<String>,
@@ -125,7 +125,7 @@ impl ScalarUDFImpl for ArrayResize {
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         match &arg_types[0] {
-            List(field) | FixedSizeList(field, _) => Ok(List(Arc::clone(field))),
+            List(field) => Ok(List(Arc::clone(field))),
             LargeList(field) => Ok(LargeList(Arc::clone(field))),
             DataType::Null => {
                 Ok(List(Arc::new(Field::new_list_field(DataType::Int64, true))))
@@ -136,10 +136,7 @@ impl ScalarUDFImpl for ArrayResize {
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_resize_inner)(&args.args)
     }
 
@@ -152,8 +149,7 @@ impl ScalarUDFImpl for ArrayResize {
     }
 }
 
-/// array_resize SQL function
-pub(crate) fn array_resize_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_resize_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
     if arg.len() < 2 || arg.len() > 3 {
         return exec_err!("array_resize needs two or three arguments");
     }
@@ -169,7 +165,7 @@ pub(crate) fn array_resize_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
                 return exec_err!(
                     "array_resize does not support type '{:?}'.",
                     array.data_type()
-                )
+                );
             }
         };
         return Ok(new_null_array(&return_type, array.len()));
@@ -191,7 +187,7 @@ pub(crate) fn array_resize_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
             let array = as_large_list_array(&arg[0])?;
             general_list_resize::<i64>(array, new_len, field, new_element)
         }
-        array_type => exec_err!("array_resize does not support type '{array_type:?}'."),
+        array_type => exec_err!("array_resize does not support type '{array_type}'."),
     }
 }
 
diff --git a/datafusion/functions-nested/src/reverse.rs b/datafusion/functions-nested/src/reverse.rs
index 140cd19aeff9c..e1c749c7cd5a4 100644
--- a/datafusion/functions-nested/src/reverse.rs
+++ b/datafusion/functions-nested/src/reverse.rs
@@ -19,17 +19,26 @@
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    Array, ArrayRef, Capacities, GenericListArray, MutableArrayData, OffsetSizeTrait,
+    Array, ArrayRef, FixedSizeListArray, GenericListArray, GenericListViewArray,
+    OffsetSizeTrait, UInt32Array, UInt64Array,
+};
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow::compute::take;
+use arrow::datatypes::DataType::{
+    FixedSizeList, LargeList, LargeListView, List, ListView, Null,
 };
-use arrow::buffer::OffsetBuffer;
-use arrow::datatypes::DataType::{LargeList, List, Null};
 use arrow::datatypes::{DataType, FieldRef};
-use datafusion_common::cast::{as_large_list_array, as_list_array};
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::cast::{
+    as_fixed_size_list_array, as_large_list_array, as_large_list_view_array,
+    as_list_array, as_list_view_array,
+};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
+use itertools::Itertools;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -58,7 +67,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayReverse {
     signature: Signature,
     aliases: Vec<String>,
@@ -73,7 +82,7 @@ impl Default for ArrayReverse {
 impl ArrayReverse {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(1, Volatility::Immutable),
+            signature: Signature::array(Volatility::Immutable),
             aliases: vec!["list_reverse".to_string()],
         }
     }
@@ -96,10 +105,7 @@ impl ScalarUDFImpl for ArrayReverse {
         Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_reverse_inner)(&args.args)
     }
 
@@ -125,53 +131,368 @@ pub fn array_reverse_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
             let array = as_large_list_array(input_array)?;
             general_array_reverse::<i64>(array, field)
         }
+        FixedSizeList(field, _) => {
+            let array = as_fixed_size_list_array(input_array)?;
+            fixed_size_array_reverse(array, field)
+        }
         Null => Ok(Arc::clone(input_array)),
-        array_type => exec_err!("array_reverse does not support type '{array_type:?}'."),
+        ListView(field) => {
+            let array = as_list_view_array(input_array)?;
+            list_view_reverse::<i32>(array, field)
+        }
+        LargeListView(field) => {
+            let array = as_large_list_view_array(input_array)?;
+            list_view_reverse::<i64>(array, field)
+        }
+        array_type => exec_err!("array_reverse does not support type '{array_type}'."),
     }
 }
 
-fn general_array_reverse<O: OffsetSizeTrait + TryFrom<i64>>(
+fn general_array_reverse<O: OffsetSizeTrait>(
     array: &GenericListArray<O>,
     field: &FieldRef,
 ) -> Result<ArrayRef> {
     let values = array.values();
-    let original_data = values.to_data();
-    let capacity = Capacities::Array(original_data.len());
     let mut offsets = vec![O::usize_as(0)];
-    let mut nulls = vec![];
-    let mut mutable =
-        MutableArrayData::with_capacities(vec![&original_data], false, capacity);
+    let mut indices: Vec<O> = Vec::with_capacity(values.len());
 
-    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+    for (row_index, (&start, &end)) in array.offsets().iter().tuple_windows().enumerate()
+    {
         // skip the null value
         if array.is_null(row_index) {
-            nulls.push(false);
-            offsets.push(offsets[row_index] + O::one());
-            mutable.extend(0, 0, 1);
+            offsets.push(offsets[row_index]);
             continue;
-        } else {
-            nulls.push(true);
         }
 
-        let start = offset_window[0];
-        let end = offset_window[1];
-
         let mut index = end - O::one();
-        let mut cnt = 0;
-
         while index >= start {
-            mutable.extend(0, index.to_usize().unwrap(), index.to_usize().unwrap() + 1);
+            indices.push(index);
             index = index - O::one();
-            cnt += 1;
         }
-        offsets.push(offsets[row_index] + O::usize_as(cnt));
+        let size = end - start;
+        offsets.push(offsets[row_index] + size);
     }
 
-    let data = mutable.freeze();
+    // Materialize values from underlying array with take
+    let indices_array: ArrayRef = if O::IS_LARGE {
+        Arc::new(UInt64Array::from(
+            indices
+                .iter()
+                .map(|i| i.as_usize() as u64)
+                .collect::<Vec<_>>(),
+        ))
+    } else {
+        Arc::new(UInt32Array::from(
+            indices
+                .iter()
+                .map(|i| i.as_usize() as u32)
+                .collect::<Vec<_>>(),
+        ))
+    };
+    let values = take(&values, &indices_array, None)?;
     Ok(Arc::new(GenericListArray::<O>::try_new(
         Arc::clone(field),
         OffsetBuffer::<O>::new(offsets.into()),
-        arrow::array::make_array(data),
-        Some(nulls.into()),
+        values,
+        array.nulls().cloned(),
     )?))
 }
+
+/// Reverses a list view array.
+///
+/// Construct indices, sizes and offsets for the reversed array by iterating over
+/// the list view array in the logical order, and reversing the order of the elements.
+/// We end up with a list view array where the elements are in order,
+/// even if the original array had elements out of order.
+fn list_view_reverse<O: OffsetSizeTrait>(
+    array: &GenericListViewArray<O>,
+    field: &FieldRef,
+) -> Result<ArrayRef> {
+    let offsets = array.offsets();
+    let values = array.values();
+    let sizes = array.sizes();
+
+    let mut new_offsets: Vec<O> = Vec::with_capacity(offsets.len());
+    let mut indices: Vec<O> = Vec::with_capacity(values.len());
+    let mut new_sizes = Vec::with_capacity(sizes.len());
+
+    let mut current_offset = O::zero();
+    for (row_index, offset) in offsets.iter().enumerate() {
+        new_offsets.push(current_offset);
+
+        // If this array is null, we set its size to 0 and continue
+        if array.is_null(row_index) {
+            new_sizes.push(O::zero());
+            continue;
+        }
+        let size = sizes[row_index];
+        new_sizes.push(size);
+
+        // Each array is located at [offset, offset + size), collect indices in the reverse order
+        let array_start = *offset;
+        let array_end = array_start + size;
+        let mut idx = array_end - O::one();
+        while idx >= array_start {
+            indices.push(idx);
+            idx = idx - O::one();
+        }
+
+        current_offset += size;
+    }
+
+    // Materialize values from underlying array with take
+    let indices_array: ArrayRef = if O::IS_LARGE {
+        Arc::new(UInt64Array::from(
+            indices
+                .iter()
+                .map(|i| i.as_usize() as u64)
+                .collect::<Vec<_>>(),
+        ))
+    } else {
+        Arc::new(UInt32Array::from(
+            indices
+                .iter()
+                .map(|i| i.as_usize() as u32)
+                .collect::<Vec<_>>(),
+        ))
+    };
+    let values = take(&values, &indices_array, None)?;
+    Ok(Arc::new(GenericListViewArray::<O>::try_new(
+        Arc::clone(field),
+        ScalarBuffer::from(new_offsets),
+        ScalarBuffer::from(new_sizes),
+        values,
+        array.nulls().cloned(),
+    )?))
+}
+
+fn fixed_size_array_reverse(
+    array: &FixedSizeListArray,
+    field: &FieldRef,
+) -> Result<ArrayRef> {
+    let values: &Arc<dyn Array> = array.values();
+
+    // Since each fixed size list in the physical array is the same size and we keep the order
+    // of the fixed size lists, we can reverse the indices for each fixed size list.
+    let mut indices: Vec<u64> = (0..values.len() as u64).collect();
+    for chunk in indices.chunks_mut(array.value_length() as usize) {
+        chunk.reverse();
+    }
+
+    // Materialize values from underlying array with take
+    let indices_array: ArrayRef = Arc::new(UInt64Array::from(indices));
+    let values = take(&values, &indices_array, None)?;
+
+    Ok(Arc::new(FixedSizeListArray::try_new(
+        Arc::clone(field),
+        array.value_length(),
+        values,
+        array.nulls().cloned(),
+    )?))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::reverse::{fixed_size_array_reverse, list_view_reverse};
+    use arrow::{
+        array::{
+            AsArray, FixedSizeListArray, GenericListViewArray, Int32Array,
+            LargeListViewArray, ListViewArray, OffsetSizeTrait,
+        },
+        buffer::{NullBuffer, ScalarBuffer},
+        datatypes::{DataType, Field, Int32Type},
+    };
+    use datafusion_common::Result;
+    use std::sync::Arc;
+
+    fn list_view_values<O: OffsetSizeTrait>(
+        array: &GenericListViewArray<O>,
+    ) -> Vec<Option<Vec<i32>>> {
+        array
+            .iter()
+            .map(|x| x.map(|x| x.as_primitive::<Int32Type>().values().to_vec()))
+            .collect()
+    }
+
+    fn fixed_size_list_values(array: &FixedSizeListArray) -> Vec<Option<Vec<i32>>> {
+        array
+            .iter()
+            .map(|x| x.map(|x| x.as_primitive::<Int32Type>().values().to_vec()))
+            .collect()
+    }
+
+    #[test]
+    fn test_reverse_list_view() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]);
+        let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]);
+        let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]));
+        let nulls = Some(NullBuffer::from(vec![true, true, false, true]));
+        let list_view = ListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i32>());
+        let expected = vec![
+            Some(vec![1]),
+            Some(vec![6, 5, 4, 3, 2]),
+            None,
+            Some(vec![9, 8, 7]),
+        ];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_large_list_view() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![0, 1, 6, 6]);
+        let sizes = ScalarBuffer::from(vec![1, 5, 0, 3]);
+        let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]));
+        let nulls = Some(NullBuffer::from(vec![true, true, false, true]));
+        let list_view = LargeListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i64>());
+        let expected = vec![
+            Some(vec![1]),
+            Some(vec![6, 5, 4, 3, 2]),
+            None,
+            Some(vec![9, 8, 7]),
+        ];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_list_view_out_of_order() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![6, 1, 6, 0]); // out of order
+        let sizes = ScalarBuffer::from(vec![3, 5, 0, 1]);
+        let values = Arc::new(Int32Array::from(vec![
+            1, // fourth array: offset 0, size 1
+            2, 3, 4, 5, 6, // second array: offset 1, size 5
+            // third array: offset 6, size 0 (and null)
+            7, 8, 9, // first array: offset 6, size 3
+        ]));
+        let nulls = Some(NullBuffer::from(vec![true, true, false, true]));
+        let list_view = ListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i32>());
+        let expected = vec![
+            Some(vec![9, 8, 7]),
+            Some(vec![6, 5, 4, 3, 2]),
+            None,
+            Some(vec![1]),
+        ];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_list_view_with_nulls() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![16, 1, 6, 0]); // out of order
+        let sizes = ScalarBuffer::from(vec![3, 5, 10, 1]);
+        let values = Arc::new(Int32Array::from(vec![
+            1, // fourth array: offset 0, size 1
+            2, 3, 4, 5, 6, // second array: offset 1, size 5
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // third array: offset 6, size 10
+            7, 8, 9, // first array: offset 6, size 3
+        ]));
+        let nulls = Some(NullBuffer::from(vec![true, true, false, true]));
+        let list_view = ListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i32>());
+        let expected = vec![
+            Some(vec![9, 8, 7]),
+            Some(vec![6, 5, 4, 3, 2]),
+            None,
+            Some(vec![1]),
+        ];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_list_view_empty() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![]);
+        let sizes = ScalarBuffer::from(vec![]);
+        let empty_array: Vec<i32> = vec![];
+        let values = Arc::new(Int32Array::from(empty_array));
+        let nulls = None;
+        let list_view = ListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i32>());
+        let expected: Vec<Option<Vec<i32>>> = vec![];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_list_view_all_nulls() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let offsets = ScalarBuffer::from(vec![0, 1, 2, 3]);
+        let sizes = ScalarBuffer::from(vec![0, 1, 1, 1]);
+        let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
+        let nulls = Some(NullBuffer::from(vec![false, false, false, false]));
+        let list_view = ListViewArray::new(field, offsets, sizes, values, nulls);
+        let result = list_view_reverse(
+            &list_view,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = list_view_values(result.as_list_view::<i32>());
+        let expected: Vec<Option<Vec<i32>>> = vec![None, None, None, None];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_fixed_size_list() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]));
+        let result = fixed_size_array_reverse(
+            &FixedSizeListArray::new(
+                field,
+                3,
+                values,
+                Some(NullBuffer::from(vec![true, false, true])),
+            ),
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = fixed_size_list_values(result.as_fixed_size_list());
+        let expected = vec![Some(vec![3, 2, 1]), None, Some(vec![9, 8, 7])];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+
+    #[test]
+    fn test_reverse_fixed_size_list_empty() -> Result<()> {
+        let field = Arc::new(Field::new("a", DataType::Int32, false));
+        let empty_array: Vec<i32> = vec![];
+        let values = Arc::new(Int32Array::from(empty_array));
+        let nulls = None;
+        let fixed_size_list = FixedSizeListArray::new(field, 3, values, nulls);
+        let result = fixed_size_array_reverse(
+            &fixed_size_list,
+            &Arc::new(Field::new("test", DataType::Int32, true)),
+        )?;
+        let reversed = fixed_size_list_values(result.as_fixed_size_list());
+        let expected: Vec<Option<Vec<i32>>> = vec![];
+        assert_eq!(expected, reversed);
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-nested/src/set_ops.rs b/datafusion/functions-nested/src/set_ops.rs
index 4f9457aa59c62..4958a1ee7fe1a 100644
--- a/datafusion/functions-nested/src/set_ops.rs
+++ b/datafusion/functions-nested/src/set_ops.rs
@@ -19,26 +19,26 @@
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
-    new_null_array, Array, ArrayRef, GenericListArray, LargeListArray, ListArray,
-    OffsetSizeTrait,
+    Array, ArrayRef, GenericListArray, OffsetSizeTrait, UInt32Array, UInt64Array,
+    new_empty_array, new_null_array,
 };
-use arrow::buffer::OffsetBuffer;
-use arrow::compute;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
+use arrow::compute::{concat, take};
 use arrow::datatypes::DataType::{LargeList, List, Null};
 use arrow::datatypes::{DataType, Field, FieldRef};
 use arrow::row::{RowConverter, SortField};
 use datafusion_common::cast::{as_large_list_array, as_list_array};
 use datafusion_common::utils::ListCoercion;
 use datafusion_common::{
-    exec_err, internal_err, plan_err, utils::take_function_args, Result,
+    Result, assert_eq_or_internal_err, exec_err, internal_err, utils::take_function_args,
 };
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
-use itertools::Itertools;
+use hashbrown::HashSet;
 use std::any::Any;
-use std::collections::HashSet;
 use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
@@ -69,7 +69,7 @@ make_udf_expr_and_func!(
 
 #[user_doc(
     doc_section(label = "Array Functions"),
-    description = "Returns an array of elements that are present in both arrays (all elements from both arrays) with out duplicates.",
+    description = "Returns an array of elements that are present in both arrays (all elements from both arrays) without duplicates.",
     syntax_example = "array_union(array1, array2)",
     sql_example = r#"```sql
 > select array_union([1, 2, 3, 4], [5, 6, 3, 4]);
@@ -94,7 +94,7 @@ make_udf_expr_and_func!(
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayUnion {
     signature: Signature,
     aliases: Vec<String>,
@@ -136,16 +136,12 @@ impl ScalarUDFImpl for ArrayUnion {
         let [array1, array2] = take_function_args(self.name(), arg_types)?;
         match (array1, array2) {
             (Null, Null) => Ok(DataType::new_list(Null, true)),
-            (Null, dt) => Ok(dt.clone()),
-            (dt, Null) => Ok(dt.clone()),
+            (Null, dt) | (dt, Null) => Ok(dt.clone()),
             (dt, _) => Ok(dt.clone()),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_union_inner)(&args.args)
     }
 
@@ -185,12 +181,18 @@ impl ScalarUDFImpl for ArrayUnion {
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
-pub(super) struct ArrayIntersect {
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArrayIntersect {
     signature: Signature,
     aliases: Vec<String>,
 }
 
+impl Default for ArrayIntersect {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ArrayIntersect {
     pub fn new() -> Self {
         Self {
@@ -221,16 +223,12 @@ impl ScalarUDFImpl for ArrayIntersect {
         let [array1, array2] = take_function_args(self.name(), arg_types)?;
         match (array1, array2) {
             (Null, Null) => Ok(DataType::new_list(Null, true)),
-            (Null, dt) => Ok(dt.clone()),
-            (dt, Null) => Ok(dt.clone()),
+            (Null, dt) | (dt, Null) => Ok(dt.clone()),
             (dt, _) => Ok(dt.clone()),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_intersect_inner)(&args.args)
     }
 
@@ -260,8 +258,8 @@ impl ScalarUDFImpl for ArrayIntersect {
         description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
     )
 )]
-#[derive(Debug)]
-pub(super) struct ArrayDistinct {
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ArrayDistinct {
     signature: Signature,
     aliases: Vec<String>,
 }
@@ -275,6 +273,12 @@ impl ArrayDistinct {
     }
 }
 
+impl Default for ArrayDistinct {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ScalarUDFImpl for ArrayDistinct {
     fn as_any(&self) -> &dyn Any {
         self
@@ -289,19 +293,10 @@ impl ScalarUDFImpl for ArrayDistinct {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match &arg_types[0] {
-            List(field) => Ok(DataType::new_list(field.data_type().clone(), true)),
-            LargeList(field) => {
-                Ok(DataType::new_large_list(field.data_type().clone(), true))
-            }
-            arg_type => plan_err!("{} does not support type {arg_type}", self.name()),
-        }
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_distinct_inner)(&args.args)
     }
 
@@ -332,7 +327,7 @@ fn array_distinct_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Copy, Clone)]
 enum SetOp {
     Union,
     Intersect,
@@ -361,70 +356,164 @@ fn generic_set_lists<OffsetSize: OffsetSizeTrait>(
         return general_array_distinct::<OffsetSize>(l, &field);
     }
 
-    if l.value_type() != r.value_type() {
-        return internal_err!("{set_op:?} is not implemented for '{l:?}' and '{r:?}'");
-    }
+    assert_eq_or_internal_err!(
+        l.value_type(),
+        r.value_type(),
+        "{set_op:?} is not implemented for '{l:?}' and '{r:?}'"
+    );
 
-    let mut offsets = vec![OffsetSize::usize_as(0)];
-    let mut new_arrays = vec![];
     let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
-    for (first_arr, second_arr) in l.iter().zip(r.iter()) {
-        let l_values = if let Some(first_arr) = first_arr {
-            converter.convert_columns(&[first_arr])?
-        } else {
-            converter.convert_columns(&[])?
-        };
 
-        let r_values = if let Some(second_arr) = second_arr {
-            converter.convert_columns(&[second_arr])?
-        } else {
-            converter.convert_columns(&[])?
-        };
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let l_first = l.offsets()[0].as_usize();
+    let l_len = l.offsets()[l.len()].as_usize() - l_first;
+    let rows_l = converter.convert_columns(&[l.values().slice(l_first, l_len)])?;
+
+    let r_first = r.offsets()[0].as_usize();
+    let r_len = r.offsets()[r.len()].as_usize() - r_first;
+    let rows_r = converter.convert_columns(&[r.values().slice(r_first, r_len)])?;
+
+    // Combine the *sliced* value arrays so 0-based indices from the row
+    // converter map directly into the concatenated array.
+    let l_values = l.values().slice(l_first, l_len);
+    let r_values = r.values().slice(r_first, r_len);
+    let combined_values = concat(&[l_values.as_ref(), r_values.as_ref()])?;
+    let r_offset = l_len;
+
+    match set_op {
+        SetOp::Union => generic_set_loop::<OffsetSize, true>(
+            l,
+            r,
+            &rows_l,
+            &rows_r,
+            field,
+            &combined_values,
+            r_offset,
+        ),
+        SetOp::Intersect => generic_set_loop::<OffsetSize, false>(
+            l,
+            r,
+            &rows_l,
+            &rows_r,
+            field,
+            &combined_values,
+            r_offset,
+        ),
+    }
+}
 
-        let l_iter = l_values.iter().sorted().dedup();
-        let values_set: HashSet<_> = l_iter.clone().collect();
-        let mut rows = if set_op == SetOp::Union {
-            l_iter.collect()
-        } else {
-            vec![]
-        };
-
-        for r_val in r_values.iter().sorted().dedup() {
-            match set_op {
-                SetOp::Union => {
-                    if !values_set.contains(&r_val) {
-                        rows.push(r_val);
-                    }
+/// Inner loop for set operations, parameterized by const generic to
+/// avoid branching inside the hot loop.
+fn generic_set_loop<OffsetSize: OffsetSizeTrait, const IS_UNION: bool>(
+    l: &GenericListArray<OffsetSize>,
+    r: &GenericListArray<OffsetSize>,
+    rows_l: &arrow::row::Rows,
+    rows_r: &arrow::row::Rows,
+    field: Arc<Field>,
+    combined_values: &ArrayRef,
+    r_offset: usize,
+) -> Result<ArrayRef> {
+    let l_offsets = l.value_offsets();
+    let r_offsets = r.value_offsets();
+    let l_first = l.offsets()[0].as_usize();
+    let r_first = r.offsets()[0].as_usize();
+
+    let mut result_offsets = Vec::with_capacity(l.len() + 1);
+    result_offsets.push(OffsetSize::usize_as(0));
+    let initial_capacity = if IS_UNION {
+        // Union can include all elements from both sides
+        rows_l.num_rows()
+    } else {
+        // Intersect result is bounded by the smaller side
+        rows_l.num_rows().min(rows_r.num_rows())
+    };
+
+    let mut indices: Vec<usize> = Vec::with_capacity(initial_capacity);
+
+    // Reuse hash sets across iterations
+    let mut seen = HashSet::new();
+    let mut lookup_set = HashSet::new();
+    for i in 0..l.len() {
+        let last_offset = *result_offsets.last().unwrap();
+
+        if l.is_null(i) || r.is_null(i) {
+            result_offsets.push(last_offset);
+            continue;
+        }
+
+        let l_start = l_offsets[i].as_usize() - l_first;
+        let l_end = l_offsets[i + 1].as_usize() - l_first;
+        let r_start = r_offsets[i].as_usize() - r_first;
+        let r_end = r_offsets[i + 1].as_usize() - r_first;
+
+        seen.clear();
+
+        if IS_UNION {
+            for idx in l_start..l_end {
+                let row = rows_l.row(idx);
+                if seen.insert(row) {
+                    indices.push(idx);
                 }
-                SetOp::Intersect => {
-                    if values_set.contains(&r_val) {
-                        rows.push(r_val);
-                    }
+            }
+            for idx in r_start..r_end {
+                let row = rows_r.row(idx);
+                if seen.insert(row) {
+                    indices.push(idx + r_offset);
                 }
             }
-        }
-
-        let last_offset = match offsets.last() {
-            Some(offset) => *offset,
-            None => return internal_err!("offsets should not be empty"),
-        };
-
-        offsets.push(last_offset + OffsetSize::usize_as(rows.len()));
-        let arrays = converter.convert_rows(rows)?;
-        let array = match arrays.first() {
-            Some(array) => Arc::clone(array),
-            None => {
-                return internal_err!("{set_op}: failed to get array from rows");
+        } else {
+            let l_len = l_end - l_start;
+            let r_len = r_end - r_start;
+
+            // Select shorter side for lookup, longer side for probing.
+            // Track the probe side's offset into the combined values array.
+            let (lookup_rows, lookup_range, probe_rows, probe_range, probe_offset) =
+                if l_len < r_len {
+                    (rows_l, l_start..l_end, rows_r, r_start..r_end, r_offset)
+                } else {
+                    (rows_r, r_start..r_end, rows_l, l_start..l_end, 0)
+                };
+            lookup_set.clear();
+            lookup_set.reserve(lookup_range.len());
+
+            // Build lookup table
+            for idx in lookup_range {
+                lookup_set.insert(lookup_rows.row(idx));
             }
-        };
-
-        new_arrays.push(array);
-    }
 
-    let offsets = OffsetBuffer::new(offsets.into());
-    let new_arrays_ref: Vec<_> = new_arrays.iter().map(|v| v.as_ref()).collect();
-    let values = compute::concat(&new_arrays_ref)?;
-    let arr = GenericListArray::<OffsetSize>::try_new(field, offsets, values, None)?;
+            // Probe and emit distinct intersected rows
+            for idx in probe_range {
+                let row = probe_rows.row(idx);
+                if lookup_set.contains(&row) && seen.insert(row) {
+                    indices.push(idx + probe_offset);
+                }
+            }
+        }
+        result_offsets.push(last_offset + OffsetSize::usize_as(seen.len()));
+    }
+
+    // Gather distinct values by index from the combined values array.
+    // Use UInt64Array for LargeList to support values arrays exceeding u32::MAX.
+    let final_values = if indices.is_empty() {
+        new_empty_array(&l.value_type())
+    } else if OffsetSize::IS_LARGE {
+        let indices =
+            UInt64Array::from(indices.into_iter().map(|i| i as u64).collect::<Vec<_>>());
+        take(combined_values.as_ref(), &indices, None)?
+    } else {
+        let indices =
+            UInt32Array::from(indices.into_iter().map(|i| i as u32).collect::<Vec<_>>());
+        take(combined_values.as_ref(), &indices, None)?
+    };
+
+    let arr = GenericListArray::<OffsetSize>::try_new(
+        field,
+        OffsetBuffer::new(result_offsets.into()),
+        final_values,
+        NullBuffer::union(l.nulls(), r.nulls()),
+    )?;
     Ok(Arc::new(arr))
 }
 
@@ -433,59 +522,13 @@ fn general_set_op(
     array2: &ArrayRef,
     set_op: SetOp,
 ) -> Result<ArrayRef> {
-    fn empty_array(data_type: &DataType, len: usize, large: bool) -> Result<ArrayRef> {
-        let field = Arc::new(Field::new_list_field(data_type.clone(), true));
-        let values = new_null_array(data_type, len);
-        if large {
-            Ok(Arc::new(LargeListArray::try_new(
-                field,
-                OffsetBuffer::new_zeroed(len),
-                values,
-                None,
-            )?))
-        } else {
-            Ok(Arc::new(ListArray::try_new(
-                field,
-                OffsetBuffer::new_zeroed(len),
-                values,
-                None,
-            )?))
-        }
-    }
-
+    let len = array1.len();
     match (array1.data_type(), array2.data_type()) {
-        (Null, Null) => Ok(Arc::new(ListArray::new_null(
-            Arc::new(Field::new_list_field(Null, true)),
-            array1.len(),
-        ))),
-        (Null, List(field)) => {
-            if set_op == SetOp::Intersect {
-                return empty_array(field.data_type(), array1.len(), false);
-            }
-            let array = as_list_array(&array2)?;
-            general_array_distinct::<i32>(array, field)
-        }
-        (List(field), Null) => {
-            if set_op == SetOp::Intersect {
-                return empty_array(field.data_type(), array1.len(), false);
-            }
-            let array = as_list_array(&array1)?;
-            general_array_distinct::<i32>(array, field)
-        }
-        (Null, LargeList(field)) => {
-            if set_op == SetOp::Intersect {
-                return empty_array(field.data_type(), array1.len(), true);
-            }
-            let array = as_large_list_array(&array2)?;
-            general_array_distinct::<i64>(array, field)
-        }
-        (LargeList(field), Null) => {
-            if set_op == SetOp::Intersect {
-                return empty_array(field.data_type(), array1.len(), true);
-            }
-            let array = as_large_list_array(&array1)?;
-            general_array_distinct::<i64>(array, field)
-        }
+        (Null, Null) => Ok(new_null_array(&DataType::new_list(Null, true), len)),
+        (Null, dt @ List(_))
+        | (Null, dt @ LargeList(_))
+        | (dt @ List(_), Null)
+        | (dt @ LargeList(_), Null) => Ok(new_null_array(dt, len)),
         (List(field), List(_)) => {
             let array1 = as_list_array(&array1)?;
             let array2 = as_list_array(&array2)?;
@@ -504,13 +547,11 @@ fn general_set_op(
     }
 }
 
-/// Array_union SQL function
 fn array_union_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array1, array2] = take_function_args("array_union", args)?;
     general_set_op(array1, array2, SetOp::Union)
 }
 
-/// array_intersect SQL function
 fn array_intersect_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     let [array1, array2] = take_function_args("array_intersect", args)?;
     general_set_op(array1, array2, SetOp::Intersect)
@@ -523,43 +564,237 @@ fn general_array_distinct<OffsetSize: OffsetSizeTrait>(
     if array.is_empty() {
         return Ok(Arc::new(array.clone()) as ArrayRef);
     }
+    let value_offsets = array.value_offsets();
     let dt = array.value_type();
-    let mut offsets = Vec::with_capacity(array.len());
+    let mut offsets = Vec::with_capacity(array.len() + 1);
     offsets.push(OffsetSize::usize_as(0));
-    let mut new_arrays = Vec::with_capacity(array.len());
-    let converter = RowConverter::new(vec![SortField::new(dt)])?;
-    // distinct for each list in ListArray
-    for arr in array.iter() {
-        let last_offset: OffsetSize = offsets.last().copied().unwrap();
-        let Some(arr) = arr else {
-            // Add same offset for null
+
+    let converter = RowConverter::new(vec![SortField::new(dt.clone())])?;
+
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let first_offset = value_offsets[0].as_usize();
+    let visible_len = value_offsets[array.len()].as_usize() - first_offset;
+    let rows =
+        converter.convert_columns(&[array.values().slice(first_offset, visible_len)])?;
+
+    let mut indices: Vec<usize> = Vec::with_capacity(rows.num_rows());
+    let mut seen = HashSet::new();
+    for i in 0..array.len() {
+        let last_offset = *offsets.last().unwrap();
+
+        // Null list entries produce no output; just carry forward the offset.
+        if array.is_null(i) {
             offsets.push(last_offset);
             continue;
-        };
-        let values = converter.convert_columns(&[arr])?;
-        // sort elements in list and remove duplicates
-        let rows = values.iter().sorted().dedup().collect::<Vec<_>>();
-        offsets.push(last_offset + OffsetSize::usize_as(rows.len()));
-        let arrays = converter.convert_rows(rows)?;
-        let array = match arrays.first() {
-            Some(array) => Arc::clone(array),
-            None => {
-                return internal_err!("array_distinct: failed to get array from rows")
+        }
+
+        let start = value_offsets[i].as_usize() - first_offset;
+        let end = value_offsets[i + 1].as_usize() - first_offset;
+        seen.clear();
+        seen.reserve(end - start);
+
+        // Walk the sub-array and keep only the first occurrence of each value.
+        for idx in start..end {
+            let row = rows.row(idx);
+            if seen.insert(row) {
+                indices.push(idx + first_offset);
             }
-        };
-        new_arrays.push(array);
-    }
-    if new_arrays.is_empty() {
-        return Ok(Arc::new(array.clone()) as ArrayRef);
-    }
-    let offsets = OffsetBuffer::new(offsets.into());
-    let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::<Vec<_>>();
-    let values = compute::concat(&new_arrays_ref)?;
+        }
+        offsets.push(last_offset + OffsetSize::usize_as(seen.len()));
+    }
+
+    // Gather distinct values in a single pass, using the computed `indices`.
+    // Indices are absolute positions in array.values() (first_offset was added
+    // back when collecting them), so we can take directly from the full values.
+    // Use UInt64Array for LargeList to support values arrays exceeding u32::MAX.
+    let final_values = if indices.is_empty() {
+        new_empty_array(&dt)
+    } else if OffsetSize::IS_LARGE {
+        let indices =
+            UInt64Array::from(indices.into_iter().map(|i| i as u64).collect::<Vec<_>>());
+        take(array.values().as_ref(), &indices, None)?
+    } else {
+        let indices =
+            UInt32Array::from(indices.into_iter().map(|i| i as u32).collect::<Vec<_>>());
+        take(array.values().as_ref(), &indices, None)?
+    };
+
     Ok(Arc::new(GenericListArray::<OffsetSize>::try_new(
         Arc::clone(field),
-        offsets,
-        values,
+        OffsetBuffer::new(offsets.into()),
+        final_values,
         // Keep the list nulls
         array.nulls().cloned(),
     )?))
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::{
+        array::{Array, AsArray, Int32Array, ListArray},
+        buffer::OffsetBuffer,
+        datatypes::{DataType, Field, Int32Type},
+    };
+    use datafusion_common::{DataFusionError, Result, config::ConfigOptions};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+
+    use crate::set_ops::{ArrayDistinct, ArrayIntersect, ArrayUnion, array_distinct_udf};
+
+    /// Build two sliced ListArrays and return them along with the shared list
+    /// field.
+    ///
+    /// l: [[1,2], [3,4], [5,6], [7,8]]  →  slice(1,2)  →  [[3,4], [5,6]]
+    /// r: [[1,3], [3,5], [5,7], [7,1]]  →  slice(1,2)  →  [[3,5], [5,7]]
+    fn make_sliced_pair() -> (ListArray, ListArray, Arc<Field>) {
+        let l = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            Some(vec![Some(3), Some(4)]),
+            Some(vec![Some(5), Some(6)]),
+            Some(vec![Some(7), Some(8)]),
+        ]);
+        let r = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(3)]),
+            Some(vec![Some(3), Some(5)]),
+            Some(vec![Some(5), Some(7)]),
+            Some(vec![Some(7), Some(1)]),
+        ]);
+        let field = Arc::new(Field::new("item", l.data_type().clone(), true));
+        (l.slice(1, 2), r.slice(1, 2), field)
+    }
+
+    fn collect_i32_list(list: &ListArray) -> Vec<Vec<i32>> {
+        (0..list.len())
+            .map(|i| {
+                let arr = list.value(i);
+                arr.as_any()
+                    .downcast_ref::<Int32Array>()
+                    .unwrap()
+                    .values()
+                    .to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn test_array_union_sliced_lists() -> Result<()> {
+        let (l, r, field) = make_sliced_pair();
+
+        let result = ArrayUnion::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(l)),
+                ColumnarValue::Array(Arc::new(r)),
+            ],
+            arg_fields: vec![Arc::clone(&field), Arc::clone(&field)],
+            number_rows: 2,
+            return_field: Arc::clone(&field),
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(2)?;
+        let output = output.as_list::<i32>();
+        let rows = collect_i32_list(output);
+
+        // Row 0: union([3,4], [3,5]) = [3,4,5]
+        assert_eq!(rows[0], vec![3, 4, 5]);
+        // Row 1: union([5,6], [5,7]) = [5,6,7]
+        assert_eq!(rows[1], vec![5, 6, 7]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_intersect_sliced_lists() -> Result<()> {
+        let (l, r, field) = make_sliced_pair();
+
+        let result = ArrayIntersect::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(l)),
+                ColumnarValue::Array(Arc::new(r)),
+            ],
+            arg_fields: vec![Arc::clone(&field), Arc::clone(&field)],
+            number_rows: 2,
+            return_field: Arc::clone(&field),
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(2)?;
+        let output = output.as_list::<i32>();
+        let rows = collect_i32_list(output);
+
+        // Row 0: intersect([3,4], [3,5]) = [3]
+        assert_eq!(rows[0], vec![3]);
+        // Row 1: intersect([5,6], [5,7]) = [5]
+        assert_eq!(rows[1], vec![5]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_distinct_sliced_list() -> Result<()> {
+        // [[1,1], [3,3,4], [5,5,6], [7,7]]  →  slice(1,2)  →  [[3,3,4], [5,5,6]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(1)]),
+            Some(vec![Some(3), Some(3), Some(4)]),
+            Some(vec![Some(5), Some(5), Some(6)]),
+            Some(vec![Some(7), Some(7)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let field = Arc::new(Field::new("item", sliced.data_type().clone(), true));
+
+        let result = ArrayDistinct::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![ColumnarValue::Array(Arc::new(sliced))],
+            arg_fields: vec![Arc::clone(&field)],
+            number_rows: 2,
+            return_field: field,
+            config_options: Arc::new(ConfigOptions::default()),
+        })?;
+
+        let output = result.into_array(2)?;
+        let output = output.as_list::<i32>();
+        let rows = collect_i32_list(output);
+
+        // Row 0: distinct([3,3,4]) = [3,4]
+        assert_eq!(rows[0], vec![3, 4]);
+        // Row 1: distinct([5,5,6]) = [5,6]
+        assert_eq!(rows[1], vec![5, 6]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_distinct_inner_nullability_result_type_match_return_type()
+    -> Result<(), DataFusionError> {
+        let udf = array_distinct_udf();
+
+        for inner_nullable in [true, false] {
+            let inner_field = Field::new_list_field(DataType::Int32, inner_nullable);
+            let input_field =
+                Field::new_list("input", Arc::new(inner_field.clone()), true);
+
+            // [[1, 1, 2]]
+            let input_array = ListArray::new(
+                inner_field.into(),
+                OffsetBuffer::new(vec![0, 3].into()),
+                Arc::new(Int32Array::new(vec![1, 1, 2].into(), None)),
+                None,
+            );
+
+            let input_array = ColumnarValue::Array(Arc::new(input_array));
+
+            let result = udf.invoke_with_args(ScalarFunctionArgs {
+                args: vec![input_array],
+                arg_fields: vec![input_field.clone().into()],
+                number_rows: 1,
+                return_field: input_field.clone().into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })?;
+
+            assert_eq!(
+                result.data_type(),
+                udf.return_type(&[input_field.data_type().clone()])?
+            );
+        }
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-nested/src/sort.rs b/datafusion/functions-nested/src/sort.rs
index 7b2f41c0541c6..256293169123c 100644
--- a/datafusion/functions-nested/src/sort.rs
+++ b/datafusion/functions-nested/src/sort.rs
@@ -18,17 +18,17 @@
 //! [`ScalarUDFImpl`] definitions for array_sort function.
 
 use crate::utils::make_scalar_function;
-use arrow::array::{new_null_array, Array, ArrayRef, ListArray, NullBufferBuilder};
+use arrow::array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait, new_null_array};
 use arrow::buffer::OffsetBuffer;
 use arrow::compute::SortColumn;
-use arrow::datatypes::{DataType, Field};
+use arrow::datatypes::{DataType, FieldRef};
 use arrow::{compute, compute::SortOptions};
-use datafusion_common::cast::{as_list_array, as_string_array};
+use datafusion_common::cast::{as_large_list_array, as_list_array, as_string_array};
 use datafusion_common::utils::ListCoercion;
-use datafusion_common::{exec_err, plan_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
     ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -74,7 +74,7 @@ make_udf_expr_and_func!(
         description = "Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`)."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArraySort {
     signature: Signature,
     aliases: Vec<String>,
@@ -132,21 +132,10 @@ impl ScalarUDFImpl for ArraySort {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match &arg_types[0] {
-            DataType::Null => Ok(DataType::Null),
-            DataType::List(field) => {
-                Ok(DataType::new_list(field.data_type().clone(), true))
-            }
-            arg_type => {
-                plan_err!("{} does not support type {arg_type}", self.name())
-            }
-        }
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_sort_inner)(&args.args)
     }
 
@@ -159,19 +148,12 @@ impl ScalarUDFImpl for ArraySort {
     }
 }
 
-/// Array_sort SQL function
-pub fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.is_empty() || args.len() > 3 {
         return exec_err!("array_sort expects one to three arguments");
     }
 
-    if args[0].data_type().is_null() {
-        return Ok(Arc::clone(&args[0]));
-    }
-
-    let list_array = as_list_array(&args[0])?;
-    let row_count = list_array.len();
-    if row_count == 0 || list_array.value_type().is_null() {
+    if args[0].is_empty() || args[0].data_type().is_null() {
         return Ok(Arc::clone(&args[0]));
     }
 
@@ -179,7 +161,7 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         return Ok(new_null_array(args[0].data_type(), args[0].len()));
     }
 
-    let sort_option = match args.len() {
+    let sort_options = match args.len() {
         1 => None,
         2 => {
             let sort = as_string_array(&args[1])?.value(0);
@@ -196,16 +178,41 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                 nulls_first: order_nulls_first(nulls_first)?,
             })
         }
-        _ => return exec_err!("array_sort expects 1 to 3 arguments"),
+        // We guard at the top
+        _ => unreachable!(),
     };
 
+    match args[0].data_type() {
+        DataType::List(field) | DataType::LargeList(field)
+            if field.data_type().is_null() =>
+        {
+            Ok(Arc::clone(&args[0]))
+        }
+        DataType::List(field) => {
+            let array = as_list_array(&args[0])?;
+            array_sort_generic(array, Arc::clone(field), sort_options)
+        }
+        DataType::LargeList(field) => {
+            let array = as_large_list_array(&args[0])?;
+            array_sort_generic(array, Arc::clone(field), sort_options)
+        }
+        // Signature should prevent this arm ever occurring
+        _ => exec_err!("array_sort expects list for first argument"),
+    }
+}
+
+fn array_sort_generic<OffsetSize: OffsetSizeTrait>(
+    list_array: &GenericListArray<OffsetSize>,
+    field: FieldRef,
+    sort_options: Option<SortOptions>,
+) -> Result<ArrayRef> {
+    let row_count = list_array.len();
+
     let mut array_lengths = vec![];
     let mut arrays = vec![];
-    let mut valid = NullBufferBuilder::new(row_count);
     for i in 0..row_count {
         if list_array.is_null(i) {
             array_lengths.push(0);
-            valid.append_null();
         } else {
             let arr_ref = list_array.value(i);
 
@@ -216,39 +223,34 @@ pub fn array_sort_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
                 DataType::Struct(_) => {
                     let sort_columns: Vec<SortColumn> = vec![SortColumn {
                         values: Arc::clone(&arr_ref),
-                        options: sort_option,
+                        options: sort_options,
                     }];
                     let indices = compute::lexsort_to_indices(&sort_columns, None)?;
                     compute::take(arr_ref.as_ref(), &indices, None)?
                 }
                 _ => {
                     let arr_ref = arr_ref.as_ref();
-                    compute::sort(arr_ref, sort_option)?
+                    compute::sort(arr_ref, sort_options)?
                 }
             };
             array_lengths.push(sorted_array.len());
             arrays.push(sorted_array);
-            valid.append_non_null();
         }
     }
 
-    // Assume all arrays have the same data type
-    let data_type = list_array.value_type();
-    let buffer = valid.finish();
-
     let elements = arrays
         .iter()
         .map(|a| a.as_ref())
         .collect::<Vec<&dyn Array>>();
 
     let list_arr = if elements.is_empty() {
-        ListArray::new_null(Arc::new(Field::new_list_field(data_type, true)), row_count)
+        GenericListArray::<OffsetSize>::new_null(field, row_count)
     } else {
-        ListArray::new(
-            Arc::new(Field::new_list_field(data_type, true)),
+        GenericListArray::<OffsetSize>::new(
+            field,
             OffsetBuffer::from_lengths(array_lengths),
             Arc::new(compute::concat(elements.as_slice())?),
-            buffer,
+            list_array.nulls().cloned(),
         )
     };
     Ok(Arc::new(list_arr))
diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs
index d60d1a6e4de02..619e43f40bf71 100644
--- a/datafusion/functions-nested/src/string.rs
+++ b/datafusion/functions-nested/src/string.rs
@@ -19,102 +19,42 @@
 
 use arrow::array::{
     Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
-    Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder,
-    OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array,
-    UInt8Array,
+    Int8Array, Int16Array, Int32Array, Int64Array, LargeStringArray, ListBuilder,
+    OffsetSizeTrait, StringArray, StringBuilder, UInt8Array, UInt16Array, UInt32Array,
+    UInt64Array,
 };
 use arrow::datatypes::{DataType, Field};
 
-use datafusion_common::{
-    internal_datafusion_err, not_impl_err, plan_err, DataFusionError, Result,
-};
+use datafusion_common::utils::ListCoercion;
+use datafusion_common::{DataFusionError, Result, not_impl_err};
 
 use std::any::Any;
+use std::fmt::{self, Write};
 
 use crate::utils::make_scalar_function;
 use arrow::array::{
+    GenericStringArray, StringArrayType, StringViewArray,
     builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
     cast::AsArray,
-    GenericStringArray, StringArrayType, StringViewArray,
 };
-use arrow::compute::cast;
+use arrow::compute::{can_cast_types, cast};
 use arrow::datatypes::DataType::{
     Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
 };
-use datafusion_common::cast::{as_large_list_array, as_list_array};
+use datafusion_common::cast::{
+    as_fixed_size_list_array, as_large_list_array, as_list_array,
+};
 use datafusion_common::exec_err;
 use datafusion_common::types::logical_string;
 use datafusion_expr::{
-    Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
+    ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue,
+    Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
     TypeSignatureClass, Volatility,
 };
-use datafusion_functions::{downcast_arg, downcast_named_arg};
+use datafusion_functions::downcast_arg;
 use datafusion_macros::user_doc;
 use std::sync::Arc;
 
-macro_rules! call_array_function {
-    ($DATATYPE:expr, false) => {
-        match $DATATYPE {
-            DataType::Utf8 => array_function!(StringArray),
-            DataType::Utf8View => array_function!(StringViewArray),
-            DataType::LargeUtf8 => array_function!(LargeStringArray),
-            DataType::Boolean => array_function!(BooleanArray),
-            DataType::Float32 => array_function!(Float32Array),
-            DataType::Float64 => array_function!(Float64Array),
-            DataType::Int8 => array_function!(Int8Array),
-            DataType::Int16 => array_function!(Int16Array),
-            DataType::Int32 => array_function!(Int32Array),
-            DataType::Int64 => array_function!(Int64Array),
-            DataType::UInt8 => array_function!(UInt8Array),
-            DataType::UInt16 => array_function!(UInt16Array),
-            DataType::UInt32 => array_function!(UInt32Array),
-            DataType::UInt64 => array_function!(UInt64Array),
-            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
-        }
-    };
-    ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{
-        match $DATATYPE {
-            DataType::List(_) => array_function!(ListArray),
-            DataType::Utf8 => array_function!(StringArray),
-            DataType::Utf8View => array_function!(StringViewArray),
-            DataType::LargeUtf8 => array_function!(LargeStringArray),
-            DataType::Boolean => array_function!(BooleanArray),
-            DataType::Float32 => array_function!(Float32Array),
-            DataType::Float64 => array_function!(Float64Array),
-            DataType::Int8 => array_function!(Int8Array),
-            DataType::Int16 => array_function!(Int16Array),
-            DataType::Int32 => array_function!(Int32Array),
-            DataType::Int64 => array_function!(Int64Array),
-            DataType::UInt8 => array_function!(UInt8Array),
-            DataType::UInt16 => array_function!(UInt16Array),
-            DataType::UInt32 => array_function!(UInt32Array),
-            DataType::UInt64 => array_function!(UInt64Array),
-            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
-        }
-    }};
-}
-
-macro_rules! to_string {
-    ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{
-        let arr = downcast_arg!($ARRAY, $ARRAY_TYPE);
-        for x in arr {
-            match x {
-                Some(x) => {
-                    $ARG.push_str(&x.to_string());
-                    $ARG.push_str($DELIMITER);
-                }
-                None => {
-                    if $WITH_NULL_STRING {
-                        $ARG.push_str($NULL_STRING);
-                        $ARG.push_str($DELIMITER);
-                    }
-                }
-            }
-        }
-        Ok($ARG)
-    }};
-}
-
 // Create static instances of ScalarUDFs for each function
 make_udf_expr_and_func!(
     ArrayToString,
@@ -143,10 +83,10 @@ make_udf_expr_and_func!(
     argument(name = "delimiter", description = "Array element separator."),
     argument(
         name = "null_string",
-        description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior."
+        description = "Optional. String to use for null values in the output. If not provided, nulls will be omitted."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrayToString {
     signature: Signature,
     aliases: Vec<String>,
@@ -161,7 +101,26 @@ impl Default for ArrayToString {
 impl ArrayToString {
     pub fn new() -> Self {
         Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![
+                            ArrayFunctionArgument::Array,
+                            ArrayFunctionArgument::String,
+                            ArrayFunctionArgument::String,
+                        ],
+                        array_coercion: Some(ListCoercion::FixedSizedListToList),
+                    }),
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![
+                            ArrayFunctionArgument::Array,
+                            ArrayFunctionArgument::String,
+                        ],
+                        array_coercion: Some(ListCoercion::FixedSizedListToList),
+                    }),
+                ],
+                Volatility::Immutable,
+            ),
             aliases: vec![
                 String::from("list_to_string"),
                 String::from("array_join"),
@@ -184,19 +143,11 @@ impl ScalarUDFImpl for ArrayToString {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(match arg_types[0] {
-            List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8,
-            _ => {
-                return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList.");
-            }
-        })
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Utf8)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(array_to_string_inner)(&args.args)
     }
 
@@ -242,7 +193,7 @@ make_udf_expr_and_func!(
         description = "Substring values to be replaced with `NULL`."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(super) struct StringToArray {
     signature: Signature,
     aliases: Vec<String>,
@@ -284,22 +235,13 @@ impl ScalarUDFImpl for StringToArray {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(match arg_types[0] {
-            Utf8 | Utf8View | LargeUtf8 => {
-                List(Arc::new(Field::new_list_field(arg_types[0].clone(), true)))
-            }
-            _ => {
-                return plan_err!(
-                    "The string_to_array function can only accept Utf8, Utf8View or LargeUtf8."
-                );
-            }
-        })
+        Ok(List(Arc::new(Field::new_list_field(
+            arg_types[0].clone(),
+            true,
+        ))))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match args[0].data_type() {
             Utf8 | Utf8View => make_scalar_function(string_to_array_inner::<i32>)(args),
@@ -319,8 +261,7 @@ impl ScalarUDFImpl for StringToArray {
     }
 }
 
-/// Array_to_string SQL function
-pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.len() < 2 || args.len() > 3 {
         return exec_err!("array_to_string expects two or three arguments");
     }
@@ -331,172 +272,262 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         Utf8 => args[1].as_string::<i32>().iter().collect(),
         Utf8View => args[1].as_string_view().iter().collect(),
         LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
-        other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
+        other => {
+            return exec_err!(
+                "unsupported type for second argument to array_to_string function as {other:?}"
+            );
+        }
+    };
+
+    let null_strings: Vec<Option<&str>> = if args.len() == 3 {
+        match args[2].data_type() {
+            Utf8 => args[2].as_string::<i32>().iter().collect(),
+            Utf8View => args[2].as_string_view().iter().collect(),
+            LargeUtf8 => args[2].as_string::<i64>().iter().collect(),
+            other => {
+                return exec_err!(
+                    "unsupported type for third argument to array_to_string function as {other:?}"
+                );
+            }
+        }
+    } else {
+        // If `null_strings` is not specified, we treat it as equivalent to
+        // explicitly passing a NULL value for `null_strings` in every row.
+        vec![None; args[0].len()]
     };
 
-    let mut null_string = String::from("");
-    let mut with_null_string = false;
-    if args.len() == 3 {
-        null_string = match args[2].data_type() {
-            Utf8 => args[2].as_string::<i32>().value(0).to_string(),
-            Utf8View => args[2].as_string_view().value(0).to_string(),
-            LargeUtf8 => args[2].as_string::<i64>().value(0).to_string(),
-            other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
+    let string_arr = match arr.data_type() {
+        List(_) => {
+            let list_array = as_list_array(&arr)?;
+            generate_string_array::<i32>(list_array, &delimiters, &null_strings)?
+        }
+        LargeList(_) => {
+            let list_array = as_large_list_array(&arr)?;
+            generate_string_array::<i64>(list_array, &delimiters, &null_strings)?
+        }
+        // Signature guards against this arm
+        _ => return exec_err!("array_to_string expects list as first argument"),
+    };
+
+    Ok(Arc::new(string_arr))
+}
+
+fn generate_string_array<O: OffsetSizeTrait>(
+    list_arr: &GenericListArray<O>,
+    delimiters: &[Option<&str>],
+    null_strings: &[Option<&str>],
+) -> Result<StringArray> {
+    let mut builder = StringBuilder::with_capacity(list_arr.len(), 0);
+
+    for ((arr, &delimiter), &null_string) in list_arr
+        .iter()
+        .zip(delimiters.iter())
+        .zip(null_strings.iter())
+    {
+        let (Some(arr), Some(delimiter)) = (arr, delimiter) else {
+            builder.append_null();
+            continue;
         };
-        with_null_string = true;
-    }
-
-    /// Creates a single string from single element of a ListArray (which is
-    /// itself another Array)
-    fn compute_array_to_string(
-        arg: &mut String,
-        arr: ArrayRef,
-        delimiter: String,
-        null_string: String,
-        with_null_string: bool,
-    ) -> Result<&mut String> {
-        match arr.data_type() {
-            List(..) => {
-                let list_array = as_list_array(&arr)?;
-                for i in 0..list_array.len() {
-                    compute_array_to_string(
-                        arg,
-                        list_array.value(i),
-                        delimiter.clone(),
-                        null_string.clone(),
-                        with_null_string,
-                    )?;
-                }
 
-                Ok(arg)
-            }
-            LargeList(..) => {
-                let list_array = as_large_list_array(&arr)?;
-                for i in 0..list_array.len() {
+        let mut first = true;
+        compute_array_to_string(&mut builder, &arr, delimiter, null_string, &mut first)?;
+        builder.append_value("");
+    }
+
+    Ok(builder.finish())
+}
+
+fn compute_array_to_string(
+    w: &mut impl Write,
+    arr: &ArrayRef,
+    delimiter: &str,
+    null_string: Option<&str>,
+    first: &mut bool,
+) -> Result<()> {
+    // Handle lists by recursing on each list element.
+    macro_rules! handle_list {
+        ($list_array:expr) => {
+            for i in 0..$list_array.len() {
+                if !$list_array.is_null(i) {
                     compute_array_to_string(
-                        arg,
-                        list_array.value(i),
-                        delimiter.clone(),
-                        null_string.clone(),
-                        with_null_string,
+                        w,
+                        &$list_array.value(i),
+                        delimiter,
+                        null_string,
+                        first,
                     )?;
+                } else if let Some(ns) = null_string {
+                    if *first {
+                        *first = false;
+                    } else {
+                        w.write_str(delimiter)?;
+                    }
+                    w.write_str(ns)?;
                 }
+            }
+        };
+    }
 
-                Ok(arg)
+    match arr.data_type() {
+        List(..) => {
+            let list_array = as_list_array(arr)?;
+            handle_list!(list_array);
+            Ok(())
+        }
+        FixedSizeList(..) => {
+            let list_array = as_fixed_size_list_array(arr)?;
+            handle_list!(list_array);
+            Ok(())
+        }
+        LargeList(..) => {
+            let list_array = as_large_list_array(arr)?;
+            handle_list!(list_array);
+            Ok(())
+        }
+        Dictionary(_key_type, value_type) => {
+            // Call cast to unwrap the dictionary. This could be optimized if we wanted
+            // to accept the overhead of extra code
+            let values = cast(arr, value_type.as_ref()).map_err(|e| {
+                DataFusionError::from(e)
+                    .context("Casting dictionary to values in compute_array_to_string")
+            })?;
+            compute_array_to_string(w, &values, delimiter, null_string, first)
+        }
+        Null => Ok(()),
+        data_type => {
+            macro_rules! str_leaf {
+                ($ARRAY_TYPE:ident) => {
+                    write_leaf_to_string(
+                        w,
+                        downcast_arg!(arr, $ARRAY_TYPE),
+                        delimiter,
+                        null_string,
+                        first,
+                        |w, x: &str| w.write_str(x),
+                    )?
+                };
             }
-            Dictionary(_key_type, value_type) => {
-                // Call cast to unwrap the dictionary. This could be optimized if we wanted
-                // to accept the overhead of extra code
-                let values = cast(&arr, value_type.as_ref()).map_err(|e| {
-                    DataFusionError::from(e).context(
-                        "Casting dictionary to values in compute_array_to_string",
-                    )
-                })?;
-                compute_array_to_string(
-                    arg,
-                    values,
-                    delimiter,
-                    null_string,
-                    with_null_string,
-                )
+            macro_rules! bool_leaf {
+                ($ARRAY_TYPE:ident) => {
+                    write_leaf_to_string(
+                        w,
+                        downcast_arg!(arr, $ARRAY_TYPE),
+                        delimiter,
+                        null_string,
+                        first,
+                        |w, x: bool| {
+                            if x {
+                                w.write_str("true")
+                            } else {
+                                w.write_str("false")
+                            }
+                        },
+                    )?
+                };
             }
-            Null => Ok(arg),
-            data_type => {
-                macro_rules! array_function {
-                    ($ARRAY_TYPE:ident) => {
-                        to_string!(
-                            arg,
-                            arr,
-                            &delimiter,
-                            &null_string,
-                            with_null_string,
-                            $ARRAY_TYPE
-                        )
-                    };
-                }
-                call_array_function!(data_type, false)
+            macro_rules! int_leaf {
+                ($ARRAY_TYPE:ident) => {
+                    write_leaf_to_string(
+                        w,
+                        downcast_arg!(arr, $ARRAY_TYPE),
+                        delimiter,
+                        null_string,
+                        first,
+                        |w, x| {
+                            let mut itoa_buf = itoa::Buffer::new();
+                            w.write_str(itoa_buf.format(x))
+                        },
+                    )?
+                };
             }
-        }
-    }
-
-    fn generate_string_array<O: OffsetSizeTrait>(
-        list_arr: &GenericListArray<O>,
-        delimiters: Vec<Option<&str>>,
-        null_string: String,
-        with_null_string: bool,
-    ) -> Result<StringArray> {
-        let mut res: Vec<Option<String>> = Vec::new();
-        for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) {
-            if let (Some(arr), Some(delimiter)) = (arr, delimiter) {
-                let mut arg = String::from("");
-                let s = compute_array_to_string(
-                    &mut arg,
-                    arr,
-                    delimiter.to_string(),
-                    null_string.clone(),
-                    with_null_string,
-                )?
-                .clone();
-
-                if let Some(s) = s.strip_suffix(delimiter) {
-                    res.push(Some(s.to_string()));
-                } else {
-                    res.push(Some(s));
+            macro_rules! float_leaf {
+                ($ARRAY_TYPE:ident) => {
+                    write_leaf_to_string(
+                        w,
+                        downcast_arg!(arr, $ARRAY_TYPE),
+                        delimiter,
+                        null_string,
+                        first,
+                        // TODO: Consider switching to a more efficient
+                        // floating point display library (e.g., ryu). This
+                        // might result in some differences in the output
+                        // format, however.
+                        |w, x| write!(w, "{}", x),
+                    )?
+                };
+            }
+            match data_type {
+                Utf8 => str_leaf!(StringArray),
+                Utf8View => str_leaf!(StringViewArray),
+                LargeUtf8 => str_leaf!(LargeStringArray),
+                DataType::Boolean => bool_leaf!(BooleanArray),
+                DataType::Float32 => float_leaf!(Float32Array),
+                DataType::Float64 => float_leaf!(Float64Array),
+                DataType::Int8 => int_leaf!(Int8Array),
+                DataType::Int16 => int_leaf!(Int16Array),
+                DataType::Int32 => int_leaf!(Int32Array),
+                DataType::Int64 => int_leaf!(Int64Array),
+                DataType::UInt8 => int_leaf!(UInt8Array),
+                DataType::UInt16 => int_leaf!(UInt16Array),
+                DataType::UInt32 => int_leaf!(UInt32Array),
+                DataType::UInt64 => int_leaf!(UInt64Array),
+                data_type if can_cast_types(data_type, &Utf8) => {
+                    let str_arr = cast(arr, &Utf8).map_err(|e| {
+                        DataFusionError::from(e)
+                            .context("Casting to string in array_to_string")
+                    })?;
+                    return compute_array_to_string(
+                        w,
+                        &str_arr,
+                        delimiter,
+                        null_string,
+                        first,
+                    );
+                }
+                data_type => {
+                    return not_impl_err!(
+                        "Unsupported data type in array_to_string: {data_type}"
+                    );
                 }
-            } else {
-                res.push(None);
             }
+            Ok(())
         }
-
-        Ok(StringArray::from(res))
     }
+}
 
-    let arr_type = arr.data_type();
-    let string_arr = match arr_type {
-        List(_) | FixedSizeList(_, _) => {
-            let list_array = as_list_array(&arr)?;
-            generate_string_array::<i32>(
-                list_array,
-                delimiters,
-                null_string,
-                with_null_string,
-            )?
-        }
-        LargeList(_) => {
-            let list_array = as_large_list_array(&arr)?;
-            generate_string_array::<i64>(
-                list_array,
-                delimiters,
-                null_string,
-                with_null_string,
-            )?
+/// Appends the string representation of each element in a leaf (non-list)
+/// array to `w`, separated by `delimiter`. Null elements are rendered
+/// using `null_string` if provided, or skipped otherwise. The `append`
+/// closure controls how each non-null element is written.
+fn write_leaf_to_string<'a, W: Write, A, T>(
+    w: &mut W,
+    arr: &'a A,
+    delimiter: &str,
+    null_string: Option<&str>,
+    first: &mut bool,
+    append: impl Fn(&mut W, T) -> fmt::Result,
+) -> Result<()>
+where
+    &'a A: IntoIterator<Item = Option<T>>,
+{
+    for x in arr {
+        // Skip nulls when no null_string is provided
+        if x.is_none() && null_string.is_none() {
+            continue;
         }
-        _ => {
-            let mut arg = String::from("");
-            let mut res: Vec<Option<String>> = Vec::new();
-            // delimiter length is 1
-            assert_eq!(delimiters.len(), 1);
-            let delimiter = delimiters[0].unwrap();
-            let s = compute_array_to_string(
-                &mut arg,
-                Arc::clone(arr),
-                delimiter.to_string(),
-                null_string,
-                with_null_string,
-            )?
-            .clone();
-
-            if !s.is_empty() {
-                let s = s.strip_suffix(delimiter).unwrap().to_string();
-                res.push(Some(s));
-            } else {
-                res.push(Some(s));
-            }
-            StringArray::from(res)
+
+        if *first {
+            *first = false;
+        } else {
+            w.write_str(delimiter)?;
         }
-    };
 
-    Ok(Arc::new(string_arr))
+        match x {
+            Some(x) => append(w, x)?,
+            None => w.write_str(null_string.unwrap())?,
+        }
+    }
+    Ok(())
 }
 
 /// String_to_array SQL function
@@ -510,26 +541,46 @@ fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayR
     match args[0].data_type() {
         Utf8 => {
             let string_array = args[0].as_string::<T>();
-            let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
-            string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder)
+            let builder = StringBuilder::with_capacity(
+                string_array.len(),
+                string_array.get_buffer_memory_size(),
+            );
+            string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(
+                args,
+                &string_array,
+                builder,
+            )
         }
         Utf8View => {
             let string_array = args[0].as_string_view();
             let builder = StringViewBuilder::with_capacity(string_array.len());
-            string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder)
+            string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(
+                args,
+                &string_array,
+                builder,
+            )
         }
         LargeUtf8 => {
             let string_array = args[0].as_string::<T>();
-            let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
-            string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder)
+            let builder = LargeStringBuilder::with_capacity(
+                string_array.len(),
+                string_array.get_buffer_memory_size(),
+            );
+            string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(
+                args,
+                &string_array,
+                builder,
+            )
         }
-        other =>  exec_err!("unsupported type for first argument to string_to_array function as {other:?}")
+        other => exec_err!(
+            "unsupported type for first argument to string_to_array function as {other:?}"
+        ),
     }
 }
 
 fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>(
     args: &'a [ArrayRef],
-    string_array: StringArrType,
+    string_array: &StringArrType,
     string_builder: StringBuilderType,
 ) -> Result<ArrayRef>
 where
@@ -545,11 +596,13 @@ where
                     &GenericStringArray<i32>,
                     &StringViewArray,
                     StringBuilderType,
-                >(string_array, delimiter_array, None, string_builder)
+                >(string_array, &delimiter_array, None, string_builder)
             } else {
-                string_to_array_inner_3::<StringArrType,
+                string_to_array_inner_3::<
+                    StringArrType,
                     &GenericStringArray<i32>,
-                    StringBuilderType>(args, string_array, delimiter_array, string_builder)
+                    StringBuilderType,
+                >(args, string_array, &delimiter_array, string_builder)
             }
         }
         Utf8View => {
@@ -561,11 +614,13 @@ where
                     &StringViewArray,
                     &StringViewArray,
                     StringBuilderType,
-                >(string_array, delimiter_array, None, string_builder)
+                >(string_array, &delimiter_array, None, string_builder)
             } else {
-                string_to_array_inner_3::<StringArrType,
+                string_to_array_inner_3::<
+                    StringArrType,
                     &StringViewArray,
-                    StringBuilderType>(args, string_array, delimiter_array, string_builder)
+                    StringBuilderType,
+                >(args, string_array, &delimiter_array, string_builder)
             }
         }
         LargeUtf8 => {
@@ -576,21 +631,25 @@ where
                     &GenericStringArray<i64>,
                     &StringViewArray,
                     StringBuilderType,
-                >(string_array, delimiter_array, None, string_builder)
+                >(string_array, &delimiter_array, None, string_builder)
             } else {
-                string_to_array_inner_3::<StringArrType,
+                string_to_array_inner_3::<
+                    StringArrType,
                     &GenericStringArray<i64>,
-                    StringBuilderType>(args, string_array, delimiter_array, string_builder)
+                    StringBuilderType,
+                >(args, string_array, &delimiter_array, string_builder)
             }
         }
-        other =>  exec_err!("unsupported type for second argument to string_to_array function as {other:?}")
+        other => exec_err!(
+            "unsupported type for second argument to string_to_array function as {other:?}"
+        ),
     }
 }
 
 fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>(
     args: &'a [ArrayRef],
-    string_array: StringArrType,
-    delimiter_array: DelimiterArrType,
+    string_array: &StringArrType,
+    delimiter_array: &DelimiterArrType,
     string_builder: StringBuilderType,
 ) -> Result<ArrayRef>
 where
@@ -654,8 +713,8 @@ fn string_to_array_impl<
     NullValueArrType,
     StringBuilderType,
 >(
-    string_array: StringArrType,
-    delimiter_array: DelimiterArrType,
+    string_array: &StringArrType,
+    delimiter_array: &DelimiterArrType,
     null_value_array: Option<NullValueArrType>,
     string_builder: StringBuilderType,
 ) -> Result<ArrayRef>
diff --git a/datafusion/functions-nested/src/utils.rs b/datafusion/functions-nested/src/utils.rs
index ed08a82358748..9f46917a87eb3 100644
--- a/datafusion/functions-nested/src/utils.rs
+++ b/datafusion/functions-nested/src/utils.rs
@@ -22,15 +22,16 @@ use std::sync::Arc;
 use arrow::datatypes::{DataType, Field, Fields};
 
 use arrow::array::{
-    Array, ArrayRef, BooleanArray, GenericListArray, OffsetSizeTrait, Scalar, UInt32Array,
+    Array, ArrayRef, BooleanArray, GenericListArray, OffsetSizeTrait, Scalar,
 };
 use arrow::buffer::OffsetBuffer;
 use datafusion_common::cast::{
     as_fixed_size_list_array, as_large_list_array, as_list_array,
 };
-use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err, plan_err};
 
 use datafusion_expr::ColumnarValue;
+use itertools::Itertools as _;
 
 pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> {
     let data_type = args[0].data_type();
@@ -39,7 +40,10 @@ pub(crate) fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> {
             || arg.data_type().equals_datatype(&DataType::Null)
     }) {
         let types = args.iter().map(|arg| arg.data_type()).collect::<Vec<_>>();
-        return plan_err!("{name} received incompatible types: '{types:?}'.");
+        return plan_err!(
+            "{name} received incompatible types: {}",
+            types.iter().join(", ")
+        );
     }
 
     Ok(())
@@ -157,8 +161,7 @@ pub(crate) fn compare_element_to_list(
         );
     }
 
-    let indices = UInt32Array::from(vec![row_index as u32]);
-    let element_array_row = arrow::compute::take(element_array, &indices, None)?;
+    let element_array_row = element_array.slice(row_index, 1);
 
     // Compute all positions in list_row_array (that is itself an
     // array) that are equal to `from_array_row`
@@ -256,11 +259,11 @@ pub(crate) fn get_map_entry_field(data_type: &DataType) -> Result<&Fields> {
             match field_data_type {
                 DataType::Struct(fields) => Ok(fields),
                 _ => {
-                    internal_err!("Expected a Struct type, got {:?}", field_data_type)
+                    internal_err!("Expected a Struct type, got {}", field_data_type)
                 }
             }
         }
-        _ => internal_err!("Expected a Map type, got {:?}", data_type),
+        _ => internal_err!("Expected a Map type, got {data_type}"),
     }
 }
 
diff --git a/datafusion/functions-table/Cargo.toml b/datafusion/functions-table/Cargo.toml
index 78d59257dd480..4edb640cb2cf2 100644
--- a/datafusion/functions-table/Cargo.toml
+++ b/datafusion/functions-table/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -45,7 +48,6 @@ datafusion-common = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-physical-plan = { workspace = true }
 parking_lot = { workspace = true }
-paste = "1.0.14"
 
 [dev-dependencies]
 arrow = { workspace = true, features = ["test_utils"] }
diff --git a/datafusion/functions-table/README.md b/datafusion/functions-table/README.md
index c4e7a5aff9993..89f589a9584c5 100644
--- a/datafusion/functions-table/README.md
+++ b/datafusion/functions-table/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Table Function Library
+# Apache DataFusion Table Function Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate contains table functions that can be used in DataFusion queries.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs
index ee95567ab73dc..342269fbc2996 100644
--- a/datafusion/functions-table/src/generate_series.rs
+++ b/datafusion/functions-table/src/generate_series.rs
@@ -15,115 +15,435 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::Int64Array;
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::array::timezone::Tz;
+use arrow::array::types::TimestampNanosecondType;
+use arrow::array::{ArrayRef, Int64Array, TimestampNanosecondArray};
+use arrow::datatypes::{
+    DataType, Field, IntervalMonthDayNano, Schema, SchemaRef, TimeUnit,
+};
 use arrow::record_batch::RecordBatch;
 use async_trait::async_trait;
 use datafusion_catalog::Session;
 use datafusion_catalog::TableFunctionImpl;
 use datafusion_catalog::TableProvider;
-use datafusion_common::{plan_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, plan_err};
 use datafusion_expr::{Expr, TableType};
-use datafusion_physical_plan::memory::{LazyBatchGenerator, LazyMemoryExec};
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::memory::{LazyBatchGenerator, LazyMemoryExec};
 use parking_lot::RwLock;
+use std::any::Any;
 use std::fmt;
+use std::str::FromStr;
 use std::sync::Arc;
 
+/// Empty generator that produces no rows - used when series arguments contain null values
+#[derive(Debug, Clone)]
+pub struct Empty {
+    name: &'static str,
+}
+
+impl Empty {
+    pub fn name(&self) -> &'static str {
+        self.name
+    }
+}
+
+impl LazyBatchGenerator for Empty {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        Ok(None)
+    }
+
+    fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>> {
+        Arc::new(RwLock::new(Empty { name: self.name }))
+    }
+}
+
+impl fmt::Display for Empty {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}: empty", self.name)
+    }
+}
+
+/// Trait for values that can be generated in a series
+pub trait SeriesValue: fmt::Debug + Clone + Send + Sync + 'static {
+    type StepType: fmt::Debug + Clone + Send + Sync;
+    type ValueType: fmt::Debug + Clone + Send + Sync;
+
+    /// Check if we've reached the end of the series
+    fn should_stop(&self, end: Self, step: &Self::StepType, include_end: bool) -> bool;
+
+    /// Advance to the next value in the series
+    fn advance(&mut self, step: &Self::StepType) -> Result<()>;
+
+    /// Create an Arrow array from a vector of values
+    fn create_array(&self, values: Vec<Self::ValueType>) -> Result<ArrayRef>;
+
+    /// Convert self to ValueType for array creation
+    fn to_value_type(&self) -> Self::ValueType;
+
+    /// Display the value for debugging
+    fn display_value(&self) -> String;
+}
+
+impl SeriesValue for i64 {
+    type StepType = i64;
+    type ValueType = i64;
+
+    fn should_stop(&self, end: Self, step: &Self::StepType, include_end: bool) -> bool {
+        reach_end_int64(*self, end, *step, include_end)
+    }
+
+    fn advance(&mut self, step: &Self::StepType) -> Result<()> {
+        *self += step;
+        Ok(())
+    }
+
+    fn create_array(&self, values: Vec<Self::ValueType>) -> Result<ArrayRef> {
+        Ok(Arc::new(Int64Array::from(values)))
+    }
+
+    fn to_value_type(&self) -> Self::ValueType {
+        *self
+    }
+
+    fn display_value(&self) -> String {
+        self.to_string()
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct TimestampValue {
+    value: i64,
+    parsed_tz: Option<Tz>,
+    tz_str: Option<Arc<str>>,
+}
+
+impl TimestampValue {
+    pub fn value(&self) -> i64 {
+        self.value
+    }
+
+    pub fn tz_str(&self) -> Option<&Arc<str>> {
+        self.tz_str.as_ref()
+    }
+}
+
+impl SeriesValue for TimestampValue {
+    type StepType = IntervalMonthDayNano;
+    type ValueType = i64;
+
+    fn should_stop(&self, end: Self, step: &Self::StepType, include_end: bool) -> bool {
+        let step_negative = step.months < 0 || step.days < 0 || step.nanoseconds < 0;
+
+        if include_end {
+            if step_negative {
+                self.value < end.value
+            } else {
+                self.value > end.value
+            }
+        } else if step_negative {
+            self.value <= end.value
+        } else {
+            self.value >= end.value
+        }
+    }
+
+    fn advance(&mut self, step: &Self::StepType) -> Result<()> {
+        let tz = self
+            .parsed_tz
+            .unwrap_or_else(|| Tz::from_str("+00:00").unwrap());
+        let Some(next_ts) =
+            TimestampNanosecondType::add_month_day_nano(self.value, *step, tz)
+        else {
+            return plan_err!(
+                "Failed to add interval {:?} to timestamp {}",
+                step,
+                self.value
+            );
+        };
+        self.value = next_ts;
+        Ok(())
+    }
+
+    fn create_array(&self, values: Vec<Self::ValueType>) -> Result<ArrayRef> {
+        let array = TimestampNanosecondArray::from(values);
+
+        // Use timezone from self (now we have access to tz through &self)
+        let array = match self.tz_str.as_ref() {
+            Some(tz_str) => array.with_timezone(Arc::clone(tz_str)),
+            None => array,
+        };
+
+        Ok(Arc::new(array))
+    }
+
+    fn to_value_type(&self) -> Self::ValueType {
+        self.value
+    }
+
+    fn display_value(&self) -> String {
+        self.value.to_string()
+    }
+}
+
 /// Indicates the arguments used for generating a series.
 #[derive(Debug, Clone)]
-enum GenSeriesArgs {
+pub enum GenSeriesArgs {
     /// ContainsNull signifies that at least one argument(start, end, step) was null, thus no series will be generated.
-    ContainsNull {
+    ContainsNull { name: &'static str },
+    /// Int64Args holds the start, end, and step values for generating integer series when all arguments are not null.
+    Int64Args {
+        start: i64,
+        end: i64,
+        step: i64,
+        /// Indicates whether the end value should be included in the series.
         include_end: bool,
         name: &'static str,
     },
-    /// AllNotNullArgs holds the start, end, and step values for generating the series when all arguments are not null.
-    AllNotNullArgs {
+    /// TimestampArgs holds the start, end, and step values for generating timestamp series when all arguments are not null.
+    TimestampArgs {
         start: i64,
         end: i64,
-        step: i64,
+        step: IntervalMonthDayNano,
+        tz: Option<Arc<str>>,
+        /// Indicates whether the end value should be included in the series.
+        include_end: bool,
+        name: &'static str,
+    },
+    /// DateArgs holds the start, end, and step values for generating date series when all arguments are not null.
+    /// Internally, dates are converted to timestamps and use the timestamp logic.
+    DateArgs {
+        start: i64,
+        end: i64,
+        step: IntervalMonthDayNano,
         /// Indicates whether the end value should be included in the series.
         include_end: bool,
         name: &'static str,
     },
 }
 
-/// Table that generates a series of integers from `start`(inclusive) to `end`(inclusive), incrementing by step
+/// Table that generates a series of integers/timestamps from `start`(inclusive) to `end`, incrementing by step
 #[derive(Debug, Clone)]
-struct GenerateSeriesTable {
+pub struct GenerateSeriesTable {
     schema: SchemaRef,
     args: GenSeriesArgs,
 }
 
-/// Table state that generates a series of integers from `start`(inclusive) to `end`(inclusive), incrementing by step
+impl GenerateSeriesTable {
+    pub fn new(schema: SchemaRef, args: GenSeriesArgs) -> Self {
+        Self { schema, args }
+    }
+
+    pub fn as_generator(
+        &self,
+        batch_size: usize,
+    ) -> Result<Arc<RwLock<dyn LazyBatchGenerator>>> {
+        let generator: Arc<RwLock<dyn LazyBatchGenerator>> = match &self.args {
+            GenSeriesArgs::ContainsNull { name } => Arc::new(RwLock::new(Empty { name })),
+            GenSeriesArgs::Int64Args {
+                start,
+                end,
+                step,
+                include_end,
+                name,
+            } => Arc::new(RwLock::new(GenericSeriesState {
+                schema: self.schema(),
+                start: *start,
+                end: *end,
+                step: *step,
+                current: *start,
+                batch_size,
+                include_end: *include_end,
+                name,
+            })),
+            GenSeriesArgs::TimestampArgs {
+                start,
+                end,
+                step,
+                tz,
+                include_end,
+                name,
+            } => {
+                let parsed_tz = tz
+                    .as_ref()
+                    .map(|s| Tz::from_str(s.as_ref()))
+                    .transpose()
+                    .map_err(|e| {
+                        datafusion_common::internal_datafusion_err!(
+                            "Failed to parse timezone: {e}"
+                        )
+                    })?
+                    .unwrap_or_else(|| Tz::from_str("+00:00").unwrap());
+                Arc::new(RwLock::new(GenericSeriesState {
+                    schema: self.schema(),
+                    start: TimestampValue {
+                        value: *start,
+                        parsed_tz: Some(parsed_tz),
+                        tz_str: tz.clone(),
+                    },
+                    end: TimestampValue {
+                        value: *end,
+                        parsed_tz: Some(parsed_tz),
+                        tz_str: tz.clone(),
+                    },
+                    step: *step,
+                    current: TimestampValue {
+                        value: *start,
+                        parsed_tz: Some(parsed_tz),
+                        tz_str: tz.clone(),
+                    },
+                    batch_size,
+                    include_end: *include_end,
+                    name,
+                }))
+            }
+            GenSeriesArgs::DateArgs {
+                start,
+                end,
+                step,
+                include_end,
+                name,
+            } => Arc::new(RwLock::new(GenericSeriesState {
+                schema: self.schema(),
+                start: TimestampValue {
+                    value: *start,
+                    parsed_tz: None,
+                    tz_str: None,
+                },
+                end: TimestampValue {
+                    value: *end,
+                    parsed_tz: None,
+                    tz_str: None,
+                },
+                step: *step,
+                current: TimestampValue {
+                    value: *start,
+                    parsed_tz: None,
+                    tz_str: None,
+                },
+                batch_size,
+                include_end: *include_end,
+                name,
+            })),
+        };
+
+        Ok(generator)
+    }
+}
+
 #[derive(Debug, Clone)]
-struct GenerateSeriesState {
+pub struct GenericSeriesState<T: SeriesValue> {
     schema: SchemaRef,
-    start: i64, // Kept for display
-    end: i64,
-    step: i64,
+    start: T,
+    end: T,
+    step: T::StepType,
     batch_size: usize,
-
-    /// Tracks current position when generating table
-    current: i64,
-    /// Indicates whether the end value should be included in the series.
+    current: T,
     include_end: bool,
     name: &'static str,
 }
 
-impl GenerateSeriesState {
-    fn reach_end(&self, val: i64) -> bool {
-        if self.step > 0 {
-            if self.include_end {
-                return val > self.end;
-            } else {
-                return val >= self.end;
-            }
+impl<T: SeriesValue> GenericSeriesState<T> {
+    pub fn name(&self) -> &'static str {
+        self.name
+    }
+
+    pub fn batch_size(&self) -> usize {
+        self.batch_size
+    }
+
+    pub fn include_end(&self) -> bool {
+        self.include_end
+    }
+
+    pub fn start(&self) -> &T {
+        &self.start
+    }
+
+    pub fn end(&self) -> &T {
+        &self.end
+    }
+
+    pub fn step(&self) -> &T::StepType {
+        &self.step
+    }
+
+    pub fn current(&self) -> &T {
+        &self.current
+    }
+}
+
+impl<T: SeriesValue> LazyBatchGenerator for GenericSeriesState<T> {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        let mut buf = Vec::with_capacity(self.batch_size);
+
+        while buf.len() < self.batch_size
+            && !self
+                .current
+                .should_stop(self.end.clone(), &self.step, self.include_end)
+        {
+            buf.push(self.current.to_value_type());
+            self.current.advance(&self.step)?;
         }
 
-        if self.include_end {
-            val < self.end
-        } else {
-            val <= self.end
+        if buf.is_empty() {
+            return Ok(None);
         }
+
+        let array = self.current.create_array(buf)?;
+        let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![array])?;
+        Ok(Some(batch))
+    }
+
+    fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>> {
+        let mut new = self.clone();
+        new.current = new.start.clone();
+        Arc::new(RwLock::new(new))
     }
 }
 
-/// Detail to display for 'Explain' plan
-impl fmt::Display for GenerateSeriesState {
+impl<T: SeriesValue> fmt::Display for GenericSeriesState<T> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
             "{}: start={}, end={}, batch_size={}",
-            self.name, self.start, self.end, self.batch_size
+            self.name,
+            self.start.display_value(),
+            self.end.display_value(),
+            self.batch_size
         )
     }
 }
 
-impl LazyBatchGenerator for GenerateSeriesState {
-    fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>> {
-        let mut buf = Vec::with_capacity(self.batch_size);
-        while buf.len() < self.batch_size && !self.reach_end(self.current) {
-            buf.push(self.current);
-            self.current += self.step;
-        }
-        let array = Int64Array::from(buf);
-
-        if array.is_empty() {
-            return Ok(None);
-        }
-
-        let batch =
-            RecordBatch::try_new(Arc::clone(&self.schema), vec![Arc::new(array)])?;
+fn reach_end_int64(val: i64, end: i64, step: i64, include_end: bool) -> bool {
+    if step > 0 {
+        if include_end { val > end } else { val >= end }
+    } else if include_end {
+        val < end
+    } else {
+        val <= end
+    }
+}
 
-        Ok(Some(batch))
+fn validate_interval_step(step: IntervalMonthDayNano) -> Result<()> {
+    if step.months == 0 && step.days == 0 && step.nanoseconds == 0 {
+        return plan_err!("Step interval cannot be zero");
     }
+
+    Ok(())
 }
 
 #[async_trait]
 impl TableProvider for GenerateSeriesTable {
-    fn as_any(&self) -> &dyn std::any::Any {
+    fn as_any(&self) -> &dyn Any {
         self
     }
 
@@ -143,44 +463,12 @@ impl TableProvider for GenerateSeriesTable {
         _limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let batch_size = state.config_options().execution.batch_size;
-        let schema = match projection {
-            Some(projection) => Arc::new(self.schema.project(projection)?),
-            None => self.schema(),
-        };
-        let state = match self.args {
-            // if args have null, then return 0 row
-            GenSeriesArgs::ContainsNull { include_end, name } => GenerateSeriesState {
-                schema: self.schema(),
-                start: 0,
-                end: 0,
-                step: 1,
-                current: 1,
-                batch_size,
-                include_end,
-                name,
-            },
-            GenSeriesArgs::AllNotNullArgs {
-                start,
-                end,
-                step,
-                include_end,
-                name,
-            } => GenerateSeriesState {
-                schema: self.schema(),
-                start,
-                end,
-                step,
-                current: start,
-                batch_size,
-                include_end,
-                name,
-            },
-        };
+        let generator = self.as_generator(batch_size)?;
 
-        Ok(Arc::new(LazyMemoryExec::try_new(
-            schema,
-            vec![Arc::new(RwLock::new(state))],
-        )?))
+        Ok(Arc::new(
+            LazyMemoryExec::try_new(self.schema(), vec![generator])?
+                .with_projection(projection.cloned()),
+        ))
     }
 }
 
@@ -196,12 +484,44 @@ impl TableFunctionImpl for GenerateSeriesFuncImpl {
             return plan_err!("{} function requires 1 to 3 arguments", self.name);
         }
 
+        // Determine the data type from the first argument
+        match &exprs[0] {
+            Expr::Literal(
+                // Default to int64 for null
+                ScalarValue::Null | ScalarValue::Int64(_),
+                _,
+            ) => self.call_int64(exprs),
+            Expr::Literal(s, _) if matches!(s.data_type(), DataType::Timestamp(_, _)) => {
+                self.call_timestamp(exprs)
+            }
+            Expr::Literal(s, _) if matches!(s.data_type(), DataType::Date32) => {
+                self.call_date(exprs)
+            }
+            Expr::Literal(scalar, _) => {
+                plan_err!(
+                    "Argument #1 must be an INTEGER, TIMESTAMP, DATE or NULL, got {:?}",
+                    scalar.data_type()
+                )
+            }
+            _ => plan_err!("Arguments must be literals"),
+        }
+    }
+}
+
+impl GenerateSeriesFuncImpl {
+    fn call_int64(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
         let mut normalize_args = Vec::new();
-        for expr in exprs {
+        for (expr_index, expr) in exprs.iter().enumerate() {
             match expr {
-                Expr::Literal(ScalarValue::Null) => {}
-                Expr::Literal(ScalarValue::Int64(Some(n))) => normalize_args.push(*n),
-                _ => return plan_err!("First argument must be an integer literal"),
+                Expr::Literal(ScalarValue::Null, _) => {}
+                Expr::Literal(ScalarValue::Int64(Some(n)), _) => normalize_args.push(*n),
+                other => {
+                    return plan_err!(
+                        "Argument #{} must be an INTEGER or NULL, got {:?}",
+                        expr_index + 1,
+                        other
+                    );
+                }
             };
         }
 
@@ -215,10 +535,7 @@ impl TableFunctionImpl for GenerateSeriesFuncImpl {
             // contain null
             return Ok(Arc::new(GenerateSeriesTable {
                 schema,
-                args: GenSeriesArgs::ContainsNull {
-                    include_end: self.include_end,
-                    name: self.name,
-                },
+                args: GenSeriesArgs::ContainsNull { name: self.name },
             }));
         }
 
@@ -231,24 +548,184 @@ impl TableFunctionImpl for GenerateSeriesFuncImpl {
             }
         };
 
-        if start > end && step > 0 {
-            return plan_err!("start is bigger than end, but increment is positive: cannot generate infinite series");
+        if step == 0 {
+            return plan_err!("Step cannot be zero");
         }
 
-        if start < end && step < 0 {
-            return plan_err!("start is smaller than end, but increment is negative: cannot generate infinite series");
-        }
+        Ok(Arc::new(GenerateSeriesTable {
+            schema,
+            args: GenSeriesArgs::Int64Args {
+                start,
+                end,
+                step,
+                include_end: self.include_end,
+                name: self.name,
+            },
+        }))
+    }
 
-        if step == 0 {
-            return plan_err!("step cannot be zero");
+    fn call_timestamp(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+        if exprs.len() != 3 {
+            return plan_err!(
+                "{} function with timestamps requires exactly 3 arguments",
+                self.name
+            );
         }
 
+        // Parse start timestamp
+        let (start_ts, tz) = match &exprs[0] {
+            Expr::Literal(ScalarValue::TimestampNanosecond(ts, tz), _) => {
+                (*ts, tz.clone())
+            }
+            other => {
+                return plan_err!(
+                    "First argument must be a timestamp or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        // Parse end timestamp
+        let end_ts = match &exprs[1] {
+            Expr::Literal(ScalarValue::Null, _) => None,
+            Expr::Literal(ScalarValue::TimestampNanosecond(ts, _), _) => *ts,
+            other => {
+                return plan_err!(
+                    "Second argument must be a timestamp or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        // Parse step interval
+        let step_interval = match &exprs[2] {
+            Expr::Literal(ScalarValue::Null, _) => None,
+            Expr::Literal(ScalarValue::IntervalMonthDayNano(interval), _) => *interval,
+            other => {
+                return plan_err!(
+                    "Third argument must be an interval or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "value",
+            DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()),
+            false,
+        )]));
+
+        // Check if any argument is null
+        let (Some(start), Some(end), Some(step)) = (start_ts, end_ts, step_interval)
+        else {
+            return Ok(Arc::new(GenerateSeriesTable {
+                schema,
+                args: GenSeriesArgs::ContainsNull { name: self.name },
+            }));
+        };
+
+        // Validate step interval
+        validate_interval_step(step)?;
+
         Ok(Arc::new(GenerateSeriesTable {
             schema,
-            args: GenSeriesArgs::AllNotNullArgs {
+            args: GenSeriesArgs::TimestampArgs {
                 start,
                 end,
                 step,
+                tz,
+                include_end: self.include_end,
+                name: self.name,
+            },
+        }))
+    }
+
+    fn call_date(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
+        if exprs.len() != 3 {
+            return plan_err!(
+                "{} function with dates requires exactly 3 arguments",
+                self.name
+            );
+        }
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "value",
+            DataType::Timestamp(TimeUnit::Nanosecond, None),
+            false,
+        )]));
+
+        // Parse start date
+        let start_date = match &exprs[0] {
+            Expr::Literal(ScalarValue::Date32(Some(date)), _) => *date,
+            Expr::Literal(ScalarValue::Date32(None), _)
+            | Expr::Literal(ScalarValue::Null, _) => {
+                return Ok(Arc::new(GenerateSeriesTable {
+                    schema,
+                    args: GenSeriesArgs::ContainsNull { name: self.name },
+                }));
+            }
+            other => {
+                return plan_err!(
+                    "First argument must be a date or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        // Parse end date
+        let end_date = match &exprs[1] {
+            Expr::Literal(ScalarValue::Date32(Some(date)), _) => *date,
+            Expr::Literal(ScalarValue::Date32(None), _)
+            | Expr::Literal(ScalarValue::Null, _) => {
+                return Ok(Arc::new(GenerateSeriesTable {
+                    schema,
+                    args: GenSeriesArgs::ContainsNull { name: self.name },
+                }));
+            }
+            other => {
+                return plan_err!(
+                    "Second argument must be a date or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        // Parse step interval
+        let step_interval = match &exprs[2] {
+            Expr::Literal(ScalarValue::IntervalMonthDayNano(Some(interval)), _) => {
+                *interval
+            }
+            Expr::Literal(ScalarValue::IntervalMonthDayNano(None), _)
+            | Expr::Literal(ScalarValue::Null, _) => {
+                return Ok(Arc::new(GenerateSeriesTable {
+                    schema,
+                    args: GenSeriesArgs::ContainsNull { name: self.name },
+                }));
+            }
+            other => {
+                return plan_err!(
+                    "Third argument must be an interval or NULL, got {:?}",
+                    other
+                );
+            }
+        };
+
+        // Convert Date32 (days since epoch) to timestamp nanoseconds (nanoseconds since epoch)
+        // Date32 is days since 1970-01-01, so multiply by nanoseconds per day
+        const NANOS_PER_DAY: i64 = 24 * 60 * 60 * 1_000_000_000;
+
+        let start_ts = start_date as i64 * NANOS_PER_DAY;
+        let end_ts = end_date as i64 * NANOS_PER_DAY;
+
+        // Validate step interval
+        validate_interval_step(step_interval)?;
+
+        Ok(Arc::new(GenerateSeriesTable {
+            schema,
+            args: GenSeriesArgs::DateArgs {
+                start: start_ts,
+                end: end_ts,
+                step: step_interval,
                 include_end: self.include_end,
                 name: self.name,
             },
@@ -281,3 +758,40 @@ impl TableFunctionImpl for RangeFunc {
         impl_func.call(exprs)
     }
 }
+
+#[cfg(test)]
+mod generate_series_tests {
+    use std::sync::Arc;
+
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::Result;
+    use datafusion_physical_plan::memory::LazyBatchGenerator;
+
+    use crate::generate_series::GenericSeriesState;
+
+    #[test]
+    fn test_generic_series_state_reset() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
+        let mut state = GenericSeriesState::<i64> {
+            schema,
+            start: 1,
+            end: 5,
+            step: 1,
+            current: 1,
+            batch_size: 8192,
+            include_end: true,
+            name: "test",
+        };
+        let batch = state.generate_next_batch()?.expect("missing batch");
+
+        let state_reset = state.reset_state();
+        let reset_batch = state_reset
+            .write()
+            .generate_next_batch()?
+            .expect("missing reset batch");
+
+        assert_eq!(batch, reset_batch);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/functions-table/src/lib.rs b/datafusion/functions-table/src/lib.rs
index 36fcdc7ede56c..668e964901c04 100644
--- a/datafusion/functions-table/src/lib.rs
+++ b/datafusion/functions-table/src/lib.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
@@ -37,25 +38,27 @@ pub fn all_default_table_functions() -> Vec<Arc<TableFunction>> {
 /// Creates a singleton instance of a table function
 /// - `$module`: A struct implementing `TableFunctionImpl` to create the function from
 /// - `$name`: The name to give to the created function
-///
-/// This is used to ensure creating the list of `TableFunction` only happens once.
+/// - `$func_name`: The name of the function to be called
+///   This is used to ensure creating the list of `TableFunction` only happens once.
 #[macro_export]
 macro_rules! create_udtf_function {
-    ($module:path, $name:expr) => {
-        paste::paste! {
-            pub fn [<$name:lower>]() -> Arc<TableFunction> {
-                static INSTANCE: std::sync::LazyLock<Arc<TableFunction>> =
-                    std::sync::LazyLock::new(|| {
-                        std::sync::Arc::new(TableFunction::new(
-                            $name.to_string(),
-                            Arc::new($module {}),
-                        ))
-                    });
-                std::sync::Arc::clone(&INSTANCE)
-            }
+    ($module:expr, $func_name:ident, $name:expr) => {
+        pub fn $func_name() -> Arc<TableFunction> {
+            static INSTANCE: std::sync::LazyLock<Arc<TableFunction>> =
+                std::sync::LazyLock::new(|| {
+                    std::sync::Arc::new(TableFunction::new(
+                        $name.to_string(),
+                        Arc::new($module),
+                    ))
+                });
+            std::sync::Arc::clone(&INSTANCE)
         }
     };
 }
 
-create_udtf_function!(generate_series::GenerateSeriesFunc, "generate_series");
-create_udtf_function!(generate_series::RangeFunc, "range");
+create_udtf_function!(
+    generate_series::GenerateSeriesFunc {},
+    generate_series,
+    "generate_series"
+);
+create_udtf_function!(generate_series::RangeFunc {}, range, "range");
diff --git a/datafusion/functions-window-common/Cargo.toml b/datafusion/functions-window-common/Cargo.toml
index 466e7bc68b486..6af668c1459e8 100644
--- a/datafusion/functions-window-common/Cargo.toml
+++ b/datafusion/functions-window-common/Cargo.toml
@@ -31,6 +31,9 @@ version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
diff --git a/datafusion/functions-window-common/README.md b/datafusion/functions-window-common/README.md
index de12d25f97319..f2e45880724e0 100644
--- a/datafusion/functions-window-common/README.md
+++ b/datafusion/functions-window-common/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Window Function Common Library
+# Apache DataFusion Window Function Common Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate contains common functions for implementing user-defined window functions.
+This crate contains common functions for implementing window functions.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-window-common/src/expr.rs b/datafusion/functions-window-common/src/expr.rs
index 774cd5182b30b..7ae43906c4553 100644
--- a/datafusion/functions-window-common/src/expr.rs
+++ b/datafusion/functions-window-common/src/expr.rs
@@ -37,9 +37,8 @@ impl<'a> ExpressionArgs<'a> {
     ///
     /// * `input_exprs` - The expressions passed as arguments
     ///   to the user-defined window function.
-    /// * `input_types` - The data types corresponding to the
+    /// * `input_fields` - The fields corresponding to the
     ///   arguments to the user-defined window function.
-    ///
     pub fn new(
         input_exprs: &'a [Arc<dyn PhysicalExpr>],
         input_fields: &'a [FieldRef],
diff --git a/datafusion/functions-window-common/src/field.rs b/datafusion/functions-window-common/src/field.rs
index 8d22efa3bcf44..8e33930ff760b 100644
--- a/datafusion/functions-window-common/src/field.rs
+++ b/datafusion/functions-window-common/src/field.rs
@@ -36,7 +36,6 @@ impl<'a> WindowUDFFieldArgs<'a> {
     ///   arguments to the user-defined window function.
     /// * `function_name` - The qualified schema name of the
     ///   user-defined window function expression.
-    ///
     pub fn new(input_fields: &'a [FieldRef], display_name: &'a str) -> Self {
         WindowUDFFieldArgs {
             input_fields,
diff --git a/datafusion/functions-window-common/src/lib.rs b/datafusion/functions-window-common/src/lib.rs
index 7f668a20a76a6..301f2c34a6c95 100644
--- a/datafusion/functions-window-common/src/lib.rs
+++ b/datafusion/functions-window-common/src/lib.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
diff --git a/datafusion/functions-window-common/src/partition.rs b/datafusion/functions-window-common/src/partition.rs
index 61125e596130b..463419d5f0193 100644
--- a/datafusion/functions-window-common/src/partition.rs
+++ b/datafusion/functions-window-common/src/partition.rs
@@ -42,13 +42,12 @@ impl<'a> PartitionEvaluatorArgs<'a> {
     ///
     /// * `input_exprs` - The expressions passed as arguments
     ///   to the user-defined window function.
-    /// * `input_types` - The data types corresponding to the
+    /// * `input_fields` - The fields corresponding to the
     ///   arguments to the user-defined window function.
     /// * `is_reversed` - Set to `true` if and only if the user-defined
     ///   window function is reversible and is reversed.
     /// * `ignore_nulls` - Set to `true` when `IGNORE NULLS` is
     ///   specified.
-    ///
     pub fn new(
         input_exprs: &'a [Arc<dyn PhysicalExpr>],
         input_fields: &'a [FieldRef],
diff --git a/datafusion/functions-window/Cargo.toml b/datafusion/functions-window/Cargo.toml
index 23ee608a82675..9c4342adae8fd 100644
--- a/datafusion/functions-window/Cargo.toml
+++ b/datafusion/functions-window/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -47,4 +50,11 @@ datafusion-macros = { workspace = true }
 datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 log = { workspace = true }
-paste = "1.0.15"
+
+[dev-dependencies]
+arrow = { workspace = true, features = ["test_utils"] }
+criterion = { workspace = true }
+
+[[bench]]
+name = "nth_value"
+harness = false
diff --git a/datafusion/functions-window/README.md b/datafusion/functions-window/README.md
index 18590983ca473..f2bb9f53f5307 100644
--- a/datafusion/functions-window/README.md
+++ b/datafusion/functions-window/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Window Function Library
+# Apache DataFusion Window Function Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate contains user-defined window functions.
+This crate contains window function definitions.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions-window/benches/nth_value.rs b/datafusion/functions-window/benches/nth_value.rs
new file mode 100644
index 0000000000000..00daf9fa4f9ba
--- /dev/null
+++ b/datafusion/functions-window/benches/nth_value.rs
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hint::black_box;
+use std::ops::Range;
+use std::slice;
+use std::sync::Arc;
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Field, FieldRef, Int64Type};
+use arrow::util::bench_util::create_primitive_array;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_expr::{PartitionEvaluator, WindowUDFImpl};
+use datafusion_functions_window::nth_value::{NthValue, NthValueKind};
+use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+use datafusion_physical_expr::expressions::{Column, Literal};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+
+const ARRAY_SIZE: usize = 8192;
+
+/// Creates a partition evaluator for FIRST_VALUE, LAST_VALUE, or NTH_VALUE
+fn create_evaluator(
+    kind: NthValueKind,
+    ignore_nulls: bool,
+    n: Option<i64>,
+) -> Box<dyn PartitionEvaluator> {
+    let expr = Arc::new(Column::new("c", 0)) as Arc<dyn PhysicalExpr>;
+    let input_field: FieldRef = Field::new("c", DataType::Int64, true).into();
+    let input_fields = vec![input_field];
+
+    let (nth_value, exprs): (NthValue, Vec<Arc<dyn PhysicalExpr>>) = match kind {
+        NthValueKind::First => (NthValue::first(), vec![expr]),
+        NthValueKind::Last => (NthValue::last(), vec![expr]),
+        NthValueKind::Nth => {
+            let n_value =
+                Arc::new(Literal::new(ScalarValue::Int64(n))) as Arc<dyn PhysicalExpr>;
+            (NthValue::nth(), vec![expr, n_value])
+        }
+    };
+
+    let args = PartitionEvaluatorArgs::new(&exprs, &input_fields, false, ignore_nulls);
+    nth_value.partition_evaluator(args).unwrap()
+}
+
+fn bench_nth_value_ignore_nulls(c: &mut Criterion) {
+    let mut group = c.benchmark_group("nth_value_ignore_nulls");
+
+    // Test different null densities
+    let null_densities = [0.0, 0.3, 0.5, 0.8];
+
+    for null_density in null_densities {
+        let values = Arc::new(create_primitive_array::<Int64Type>(
+            ARRAY_SIZE,
+            null_density,
+        )) as ArrayRef;
+        let null_pct = (null_density * 100.0) as u32;
+
+        // FIRST_VALUE with ignore_nulls - expanding window
+        group.bench_function(
+            BenchmarkId::new("first_value_expanding", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator = create_evaluator(NthValueKind::First, true, None);
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let range = Range {
+                            start: 0,
+                            end: i + 1,
+                        };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+
+        // LAST_VALUE with ignore_nulls - expanding window
+        group.bench_function(
+            BenchmarkId::new("last_value_expanding", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator = create_evaluator(NthValueKind::Last, true, None);
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let range = Range {
+                            start: 0,
+                            end: i + 1,
+                        };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+
+        // NTH_VALUE(col, 10) with ignore_nulls - get 10th non-null value
+        group.bench_function(
+            BenchmarkId::new("nth_value_10_expanding", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator =
+                        create_evaluator(NthValueKind::Nth, true, Some(10));
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let range = Range {
+                            start: 0,
+                            end: i + 1,
+                        };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+
+        // NTH_VALUE(col, -10) with ignore_nulls - get 10th from last non-null value
+        group.bench_function(
+            BenchmarkId::new("nth_value_neg10_expanding", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator =
+                        create_evaluator(NthValueKind::Nth, true, Some(-10));
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let range = Range {
+                            start: 0,
+                            end: i + 1,
+                        };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+
+        // Sliding window benchmarks with 100-row window
+        let window_size: usize = 100;
+
+        group.bench_function(
+            BenchmarkId::new("first_value_sliding_100", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator = create_evaluator(NthValueKind::First, true, None);
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let start = i.saturating_sub(window_size - 1);
+                        let range = Range { start, end: i + 1 };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+
+        group.bench_function(
+            BenchmarkId::new("last_value_sliding_100", format!("{null_pct}%_nulls")),
+            |b| {
+                b.iter(|| {
+                    let mut evaluator = create_evaluator(NthValueKind::Last, true, None);
+                    let values_slice = slice::from_ref(&values);
+                    for i in 0..values.len() {
+                        let start = i.saturating_sub(window_size - 1);
+                        let range = Range { start, end: i + 1 };
+                        black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                    }
+                })
+            },
+        );
+    }
+
+    group.finish();
+
+    // Comparison benchmarks: ignore_nulls vs respect_nulls
+    let mut comparison_group = c.benchmark_group("nth_value_nulls_comparison");
+    let values_with_nulls =
+        Arc::new(create_primitive_array::<Int64Type>(ARRAY_SIZE, 0.5)) as ArrayRef;
+
+    // FIRST_VALUE comparison
+    comparison_group.bench_function(
+        BenchmarkId::new("first_value", "ignore_nulls"),
+        |b| {
+            b.iter(|| {
+                let mut evaluator = create_evaluator(NthValueKind::First, true, None);
+                let values_slice = slice::from_ref(&values_with_nulls);
+                for i in 0..values_with_nulls.len() {
+                    let range = Range {
+                        start: 0,
+                        end: i + 1,
+                    };
+                    black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                }
+            })
+        },
+    );
+
+    comparison_group.bench_function(
+        BenchmarkId::new("first_value", "respect_nulls"),
+        |b| {
+            b.iter(|| {
+                let mut evaluator = create_evaluator(NthValueKind::First, false, None);
+                let values_slice = slice::from_ref(&values_with_nulls);
+                for i in 0..values_with_nulls.len() {
+                    let range = Range {
+                        start: 0,
+                        end: i + 1,
+                    };
+                    black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                }
+            })
+        },
+    );
+
+    // NTH_VALUE comparison
+    comparison_group.bench_function(
+        BenchmarkId::new("nth_value_10", "ignore_nulls"),
+        |b| {
+            b.iter(|| {
+                let mut evaluator = create_evaluator(NthValueKind::Nth, true, Some(10));
+                let values_slice = slice::from_ref(&values_with_nulls);
+                for i in 0..values_with_nulls.len() {
+                    let range = Range {
+                        start: 0,
+                        end: i + 1,
+                    };
+                    black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                }
+            })
+        },
+    );
+
+    comparison_group.bench_function(
+        BenchmarkId::new("nth_value_10", "respect_nulls"),
+        |b| {
+            b.iter(|| {
+                let mut evaluator = create_evaluator(NthValueKind::Nth, false, Some(10));
+                let values_slice = slice::from_ref(&values_with_nulls);
+                for i in 0..values_with_nulls.len() {
+                    let range = Range {
+                        start: 0,
+                        end: i + 1,
+                    };
+                    black_box(evaluator.evaluate(values_slice, &range).unwrap());
+                }
+            })
+        },
+    );
+
+    comparison_group.finish();
+}
+
+criterion_group!(benches, bench_nth_value_ignore_nulls);
+criterion_main!(benches);
diff --git a/datafusion/functions-window/src/cume_dist.rs b/datafusion/functions-window/src/cume_dist.rs
index ed8669948188d..8e1cb1b1e6639 100644
--- a/datafusion/functions-window/src/cume_dist.rs
+++ b/datafusion/functions-window/src/cume_dist.rs
@@ -18,16 +18,17 @@
 //! `cume_dist` window function implementation
 
 use arrow::datatypes::FieldRef;
+use datafusion_common::Result;
 use datafusion_common::arrow::array::{ArrayRef, Float64Array};
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_common::arrow::datatypes::Field;
-use datafusion_common::Result;
 use datafusion_expr::{
-    Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use datafusion_macros::user_doc;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use field::WindowUDFFieldArgs;
 use std::any::Any;
 use std::fmt::Debug;
@@ -38,6 +39,7 @@ use std::sync::Arc;
 define_udwf_and_expr!(
     CumeDist,
     cume_dist,
+    cume_dist_udwf,
     "Calculates the cumulative distribution of a value in a group of values."
 );
 
@@ -46,13 +48,13 @@ define_udwf_and_expr!(
     doc_section(label = "Ranking Functions"),
     description = "Relative rank of the current row: (number of rows preceding or peer with the current row) / (total rows).",
     syntax_example = "cume_dist()",
-    sql_example = r#"```sql
-    --Example usage of the cume_dist window function:
-    SELECT salary,
-       cume_dist() OVER (ORDER BY salary) AS cume_dist
-    FROM employees;
-```
+    sql_example = r#"
 ```sql
+-- Example usage of the cume_dist window function:
+SELECT salary,
+    cume_dist() OVER (ORDER BY salary) AS cume_dist
+FROM employees;
+
 +--------+-----------+
 | salary | cume_dist |
 +--------+-----------+
@@ -60,9 +62,10 @@ define_udwf_and_expr!(
 | 50000  | 0.67      |
 | 70000  | 1.00      |
 +--------+-----------+
-```"#
+```
+"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CumeDist {
     signature: Signature,
 }
@@ -109,6 +112,10 @@ impl WindowUDFImpl for CumeDist {
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 #[derive(Debug, Default)]
@@ -161,7 +168,7 @@ mod tests {
     }
 
     #[test]
-    #[allow(clippy::single_range_in_vec_init)]
+    #[expect(clippy::single_range_in_vec_init)]
     fn test_cume_dist() -> Result<()> {
         test_f64_result(0, vec![], vec![])?;
 
diff --git a/datafusion/functions-window/src/lead_lag.rs b/datafusion/functions-window/src/lead_lag.rs
index e2a755371ebc8..fbb84483e23cf 100644
--- a/datafusion/functions-window/src/lead_lag.rs
+++ b/datafusion/functions-window/src/lead_lag.rs
@@ -22,25 +22,28 @@ use arrow::datatypes::FieldRef;
 use datafusion_common::arrow::array::ArrayRef;
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_common::arrow::datatypes::Field;
-use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue};
-use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL;
+use datafusion_common::{DataFusionError, Result, ScalarValue, arrow_datafusion_err};
+use datafusion_doc::window_doc_sections::DOC_SECTION_ANALYTICAL;
 use datafusion_expr::{
-    Documentation, Literal, PartitionEvaluator, ReversedUDWF, Signature, TypeSignature,
-    Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, Literal, PartitionEvaluator, ReversedUDWF, Signature,
+    TypeSignature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::expr::ExpressionArgs;
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+use datafusion_physical_expr::expressions;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use std::any::Any;
 use std::cmp::min;
 use std::collections::VecDeque;
+use std::hash::Hash;
 use std::ops::{Neg, Range};
 use std::sync::{Arc, LazyLock};
 
 get_or_init_udwf!(
     Lag,
     lag,
+    lag_udwf,
     "Returns the row value that precedes the current row by a specified \
     offset within partition. If no such row exists, then returns the \
     default value.",
@@ -49,6 +52,7 @@ get_or_init_udwf!(
 get_or_init_udwf!(
     Lead,
     lead,
+    lead_udwf,
     "Returns the value from a row that follows the current row by a \
     specified offset within the partition. If no such row exists, then \
     returns the default value.",
@@ -93,8 +97,8 @@ pub fn lead(
     lead_udwf().call(vec![arg, shift_offset_lit, default_lit])
 }
 
-#[derive(Debug)]
-enum WindowShiftKind {
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub enum WindowShiftKind {
     Lag,
     Lead,
 }
@@ -119,7 +123,7 @@ impl WindowShiftKind {
 }
 
 /// window shift expression
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct WindowShift {
     signature: Signature,
     kind: WindowShiftKind,
@@ -135,7 +139,13 @@ impl WindowShift {
                     TypeSignature::Any(3),
                 ],
                 Volatility::Immutable,
-            ),
+            )
+            .with_parameter_names(vec![
+                "expr".to_string(),
+                "offset".to_string(),
+                "default".to_string(),
+            ])
+            .expect("valid parameter names for lead/lag"),
             kind,
         }
     }
@@ -147,6 +157,10 @@ impl WindowShift {
     pub fn lead() -> Self {
         Self::new(WindowShiftKind::Lead)
     }
+
+    pub fn kind(&self) -> &WindowShiftKind {
+        &self.kind
+    }
 }
 
 static LAG_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
@@ -158,15 +172,14 @@ static LAG_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
         the value of expression should be retrieved. Defaults to 1.")
         .with_argument("default", "The default value if the offset is \
         not within the partition. Must be of the same type as expression.")
-        .with_sql_example(r#"```sql
-    --Example usage of the lag window function:
-    SELECT employee_id,
-           salary,
-           lag(salary, 1, 0) OVER (ORDER BY employee_id) AS prev_salary
-    FROM employees;
-```
-
+        .with_sql_example(r#"
 ```sql
+-- Example usage of the lag window function:
+SELECT employee_id,
+    salary,
+    lag(salary, 1, 0) OVER (ORDER BY employee_id) AS prev_salary
+FROM employees;
+
 +-------------+--------+-------------+
 | employee_id | salary | prev_salary |
 +-------------+--------+-------------+
@@ -175,7 +188,8 @@ static LAG_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
 | 3           | 70000  | 50000       |
 | 4           | 60000  | 70000       |
 +-------------+--------+-------------+
-```"#)
+```
+"#)
         .build()
 });
 
@@ -194,17 +208,16 @@ static LEAD_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
         forward the value of expression should be retrieved. Defaults to 1.")
         .with_argument("default", "The default value if the offset is \
         not within the partition. Must be of the same type as expression.")
-        .with_sql_example(r#"```sql
--- Example usage of lead() :
+        .with_sql_example(r#"
+```sql
+-- Example usage of lead window function:
 SELECT
     employee_id,
     department,
     salary,
     lead(salary, 1, 0) OVER (PARTITION BY department ORDER BY salary) AS next_salary
 FROM employees;
-```
 
-```sql
 +-------------+-------------+--------+--------------+
 | employee_id | department  | salary | next_salary  |
 +-------------+-------------+--------+--------------+
@@ -214,7 +227,8 @@ FROM employees;
 | 4           | Engineering | 40000  | 60000        |
 | 5           | Engineering | 60000  | 0            |
 +-------------+-------------+--------+--------------+
-```"#)
+```
+"#)
         .build()
 });
 
@@ -252,7 +266,7 @@ impl WindowUDFImpl for WindowShift {
     ) -> Result<Box<dyn PartitionEvaluator>> {
         let shift_offset =
             get_scalar_value_from_args(partition_evaluator_args.input_exprs(), 1)?
-                .map(get_signed_integer)
+                .map(|v| get_signed_integer(&v))
                 .map_or(Ok(None), |v| v.map(Some))
                 .map(|n| self.kind.shift_offset(n))
                 .map(|offset| {
@@ -298,6 +312,26 @@ impl WindowUDFImpl for WindowShift {
             WindowShiftKind::Lead => Some(get_lead_doc()),
         }
     }
+
+    fn limit_effect(&self, args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        if self.kind == WindowShiftKind::Lag {
+            return LimitEffect::None;
+        }
+        match args {
+            [_, expr, ..] => {
+                let Some(lit) = expr.as_any().downcast_ref::<expressions::Literal>()
+                else {
+                    return LimitEffect::Unknown;
+                };
+                let ScalarValue::Int64(Some(amount)) = lit.value() else {
+                    return LimitEffect::Unknown; // we should only get int64 from the parser
+                };
+                LimitEffect::Relative((*amount).max(0) as usize)
+            }
+            [_] => LimitEffect::Relative(1), // default value
+            _ => LimitEffect::Unknown,       // invalid arguments
+        }
+    }
 }
 
 /// When `lead`/`lag` is evaluated on a `NULL` expression we attempt to
@@ -329,10 +363,8 @@ fn parse_expr(
 
     let default_value = get_scalar_value_from_args(input_exprs, 2)?;
     default_value.map_or(Ok(expr), |value| {
-        ScalarValue::try_from(&value.data_type()).map(|v| {
-            Arc::new(datafusion_physical_expr::expressions::Literal::new(v))
-                as Arc<dyn PhysicalExpr>
-        })
+        ScalarValue::try_from(&value.data_type())
+            .map(|v| Arc::new(expressions::Literal::new(v)) as Arc<dyn PhysicalExpr>)
     })
 }
 
@@ -610,7 +642,7 @@ impl PartitionEvaluator for WindowShiftEvaluator {
         // OR
         // - ignore nulls mode and current value is null and is within window bounds
         // .unwrap() is safe here as there is a none check in front
-        #[allow(clippy::unnecessary_unwrap)]
+        #[expect(clippy::unnecessary_unwrap)]
         if !(idx.is_none() || (self.ignore_nulls && array.is_null(idx.unwrap()))) {
             ScalarValue::try_from_array(array, idx.unwrap())
         } else {
diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs
index 10e09542d7c5d..6edfb92744f5b 100644
--- a/datafusion/functions-window/src/lib.rs
+++ b/datafusion/functions-window/src/lib.rs
@@ -15,14 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+// https://github.com/apache/datafusion/issues/18881
 
 //! Window Function packages for [DataFusion].
 //!
@@ -30,14 +32,13 @@
 //! implemented using the extension API.
 //!
 //! [DataFusion]: https://crates.io/crates/datafusion
-//!
 
 use std::sync::Arc;
 
 use log::debug;
 
-use datafusion_expr::registry::FunctionRegistry;
 use datafusion_expr::WindowUDF;
+use datafusion_expr::registry::FunctionRegistry;
 
 #[macro_use]
 pub mod macros;
diff --git a/datafusion/functions-window/src/macros.rs b/datafusion/functions-window/src/macros.rs
index 23414a7a7172a..aeb54356f8966 100644
--- a/datafusion/functions-window/src/macros.rs
+++ b/datafusion/functions-window/src/macros.rs
@@ -30,8 +30,8 @@
 ///
 /// * `$UDWF`: The struct which defines the [`Signature`](datafusion_expr::Signature)
 ///   of the user-defined window function.
-/// * `$OUT_FN_NAME`: The basename to generate a unique function name like
-///   `$OUT_FN_NAME_udwf`.
+/// * `$OUT_FN_NAME`: The expression function name
+///   `UDWF_FN` : The unique function name
 /// * `$DOC`: Doc comments for UDWF.
 /// * (optional) `$CTOR`: Pass a custom constructor. When omitted it
 ///   automatically resolves to `$UDWF::default()`.
@@ -52,12 +52,13 @@
 /// get_or_init_udwf!(
 ///     SimpleUDWF,
 ///     simple,
+///     simple_udwf,
 ///     "Simple user-defined window function doc comment."
 /// );
 /// #
 /// # assert_eq!(simple_udwf().name(), "simple_user_defined_window_function");
 /// #
-/// #  #[derive(Debug)]
+/// #  #[derive(Debug, PartialEq, Eq, Hash)]
 /// #  struct SimpleUDWF {
 /// #      signature: Signature,
 /// #  }
@@ -94,16 +95,15 @@
 /// ```
 #[macro_export]
 macro_rules! get_or_init_udwf {
-    ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => {
-        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $UDWF::default);
+    ($UDWF:ident, $OUT_FN_NAME:ident, $UDWF_FN:ident, $DOC:expr) => {
+        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC, $UDWF::default);
     };
 
-    ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr, $CTOR:path) => {
-        paste::paste! {
+    ($UDWF:ident, $OUT_FN_NAME:ident, $UDWF_FN:ident, $DOC:expr, $CTOR:path) => {
             #[doc = concat!(" Returns a [`WindowUDF`](datafusion_expr::WindowUDF) for [`", stringify!($OUT_FN_NAME), "`].")]
             #[doc = ""]
             #[doc = concat!(" ", $DOC)]
-            pub fn [<$OUT_FN_NAME _udwf>]() -> std::sync::Arc<datafusion_expr::WindowUDF> {
+            pub fn $UDWF_FN() -> std::sync::Arc<datafusion_expr::WindowUDF> {
                 // Singleton instance of UDWF, ensures it is only created once.
                 static INSTANCE: std::sync::LazyLock<std::sync::Arc<datafusion_expr::WindowUDF>> =
                     std::sync::LazyLock::new(|| {
@@ -111,7 +111,6 @@ macro_rules! get_or_init_udwf {
                     });
                 std::sync::Arc::clone(&INSTANCE)
             }
-        }
     };
 }
 
@@ -149,6 +148,7 @@ macro_rules! get_or_init_udwf {
 /// # get_or_init_udwf!(
 /// #     RowNumber,
 /// #     row_number,
+/// #     row_number_udwf,
 /// #     "Returns a unique row number for each row in window partition beginning at 1."
 /// # );
 /// /// Creates `row_number()` API which has zero parameters:
@@ -163,6 +163,7 @@ macro_rules! get_or_init_udwf {
 /// create_udwf_expr!(
 ///     RowNumber,
 ///     row_number,
+///     row_number_udwf,
 ///     "Returns a unique row number for each row in window partition beginning at 1."
 /// );
 /// #
@@ -171,7 +172,7 @@ macro_rules! get_or_init_udwf {
 /// #     "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// # );
 /// #
-/// # #[derive(Debug)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct RowNumber {
 /// #     signature: Signature,
 /// # }
@@ -221,7 +222,7 @@ macro_rules! get_or_init_udwf {
 /// # use datafusion_expr::{col, lit};
 /// # use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 /// #
-/// # get_or_init_udwf!(Lead, lead, "user-defined window function");
+/// # get_or_init_udwf!(Lead, lead,lead_udwf, "user-defined window function");
 /// #
 /// /// Creates `lead(expr, offset, default)` with 3 parameters:
 /// ///
@@ -240,6 +241,7 @@ macro_rules! get_or_init_udwf {
 ///     Lead,
 ///     lead,
 ///     [expr, offset, default],
+///     lead_udwf,
 ///     "Returns a value evaluated at the row that is offset rows after the current row within the partition."
 /// );
 /// #
@@ -250,7 +252,7 @@ macro_rules! get_or_init_udwf {
 /// #     "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// # );
 /// #
-/// # #[derive(Debug)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct Lead {
 /// #     signature: Signature,
 /// # }
@@ -298,21 +300,18 @@ macro_rules! get_or_init_udwf {
 #[macro_export]
 macro_rules! create_udwf_expr {
     // zero arguments
-    ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => {
-        paste::paste! {
+    ($UDWF:ident, $OUT_FN_NAME:ident, $UDWF_FN:ident, $DOC:expr) => {
             #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"]
             #[doc = concat!(" `", stringify!($UDWF), "` user-defined window function.")]
             #[doc = ""]
             #[doc = concat!(" ", $DOC)]
             pub fn $OUT_FN_NAME() -> datafusion_expr::Expr {
-                [<$OUT_FN_NAME _udwf>]().call(vec![])
+                $UDWF_FN().call(vec![])
             }
-       }
     };
 
     // 1 or more arguments
-    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr) => {
-        paste::paste! {
+    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $UDWF_FN:ident, $DOC:expr) => {
             #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"]
             #[doc = concat!(" `", stringify!($UDWF), "` user-defined window function.")]
             #[doc = ""]
@@ -320,10 +319,9 @@ macro_rules! create_udwf_expr {
             pub fn $OUT_FN_NAME(
                 $($PARAM: datafusion_expr::Expr),+
             ) -> datafusion_expr::Expr {
-                [<$OUT_FN_NAME _udwf>]()
+                $UDWF_FN()
                     .call(vec![$($PARAM),+])
             }
-       }
     };
 }
 
@@ -374,12 +372,13 @@ macro_rules! create_udwf_expr {
 /// define_udwf_and_expr!(
 ///     SimpleUDWF,
 ///     simple,
+///     simple_udwf,
 ///     "a simple user-defined window function"
 /// );
 /// #
 /// # assert_eq!(simple_udwf().name(), "simple_user_defined_window_function");
 /// #
-/// #  #[derive(Debug)]
+/// #  #[derive(Debug, PartialEq, Eq, Hash)]
 /// #  struct SimpleUDWF {
 /// #      signature: Signature,
 /// #  }
@@ -437,6 +436,7 @@ macro_rules! create_udwf_expr {
 /// define_udwf_and_expr!(
 ///     RowNumber,
 ///     row_number,
+///     row_number_udwf,
 ///     "Returns a unique row number for each row in window partition beginning at 1.",
 ///     RowNumber::new // <-- custom constructor
 /// );
@@ -446,7 +446,7 @@ macro_rules! create_udwf_expr {
 /// #     "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// # );
 /// #
-/// # #[derive(Debug)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct RowNumber {
 /// #     signature: Signature,
 /// # }
@@ -514,6 +514,7 @@ macro_rules! create_udwf_expr {
 ///     Lead,
 ///     lead,
 ///     [expr, offset, default],        // <- 3 parameters
+///     lead_udwf,
 ///     "user-defined window function"
 /// );
 /// #
@@ -524,7 +525,7 @@ macro_rules! create_udwf_expr {
 /// #     "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// # );
 /// #
-/// # #[derive(Debug)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct Lead {
 /// #     signature: Signature,
 /// # }
@@ -603,6 +604,7 @@ macro_rules! create_udwf_expr {
 ///     Lead,
 ///     lead,
 ///     [expr, offset, default],        // <- 3 parameters
+///     lead_udwf,
 ///     "user-defined window function",
 ///     Lead::new                       // <- Custom constructor
 /// );
@@ -614,7 +616,7 @@ macro_rules! create_udwf_expr {
 /// #     "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"
 /// # );
 /// #
-/// # #[derive(Debug)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct Lead {
 /// #     signature: Signature,
 /// # }
@@ -663,29 +665,29 @@ macro_rules! create_udwf_expr {
 macro_rules! define_udwf_and_expr {
     // Defines UDWF with default constructor
     // Defines expression API with zero parameters
-    ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => {
-        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC);
-        create_udwf_expr!($UDWF, $OUT_FN_NAME, $DOC);
+    ($UDWF:ident, $OUT_FN_NAME:ident, $UDWF_FN:ident, $DOC:expr) => {
+        get_or_init_udwf!($UDWF, $OUT_FN_NAME,$UDWF_FN, $DOC);
+        create_udwf_expr!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC);
     };
 
     // Defines UDWF by passing a custom constructor
     // Defines expression API with zero parameters
-    ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr, $CTOR:path) => {
-        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $CTOR);
-        create_udwf_expr!($UDWF, $OUT_FN_NAME, $DOC);
+    ($UDWF:ident, $OUT_FN_NAME:ident, $UDWF_FN:ident, $DOC:expr, $CTOR:path) => {
+        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC, $CTOR);
+        create_udwf_expr!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC);
     };
 
     // Defines UDWF with default constructor
     // Defines expression API with multiple parameters
-    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr) => {
-        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC);
-        create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $DOC);
+    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+],$UDWF_FN:ident, $DOC:expr) => {
+        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC);
+        create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $UDWF_FN, $DOC);
     };
 
     // Defines UDWF by passing a custom constructor
     // Defines expression API with multiple parameters
-    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr, $CTOR:path) => {
-        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $CTOR);
-        create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $DOC);
+    ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $UDWF_FN:ident, $DOC:expr, $CTOR:path) => {
+        get_or_init_udwf!($UDWF, $OUT_FN_NAME, $UDWF_FN, $DOC, $CTOR);
+        create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $UDWF_FN, $DOC);
     };
 }
diff --git a/datafusion/functions-window/src/nth_value.rs b/datafusion/functions-window/src/nth_value.rs
index 0b83e1ff9f084..12b4146cc2f6f 100644
--- a/datafusion/functions-window/src/nth_value.rs
+++ b/datafusion/functions-window/src/nth_value.rs
@@ -19,64 +19,59 @@
 
 use crate::utils::{get_scalar_value_from_args, get_signed_integer};
 
+use arrow::buffer::NullBuffer;
 use arrow::datatypes::FieldRef;
 use datafusion_common::arrow::array::ArrayRef;
 use datafusion_common::arrow::datatypes::{DataType, Field};
-use datafusion_common::{exec_datafusion_err, exec_err, Result, ScalarValue};
-use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL;
+use datafusion_common::{Result, ScalarValue, exec_datafusion_err, exec_err};
+use datafusion_doc::window_doc_sections::DOC_SECTION_ANALYTICAL;
 use datafusion_expr::window_state::WindowAggState;
 use datafusion_expr::{
-    Documentation, Literal, PartitionEvaluator, ReversedUDWF, Signature, TypeSignature,
-    Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, Literal, PartitionEvaluator, ReversedUDWF, Signature,
+    TypeSignature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use field::WindowUDFFieldArgs;
 use std::any::Any;
 use std::cmp::Ordering;
 use std::fmt::Debug;
+use std::hash::Hash;
 use std::ops::Range;
-use std::sync::LazyLock;
+use std::sync::{Arc, LazyLock};
 
-get_or_init_udwf!(
+define_udwf_and_expr!(
     First,
     first_value,
-    "returns the first value in the window frame",
+    [arg],
+    first_value_udwf,
+    "Returns the first value in the window frame",
     NthValue::first
 );
-get_or_init_udwf!(
+define_udwf_and_expr!(
     Last,
     last_value,
-    "returns the last value in the window frame",
+    [arg],
+    last_value_udwf,
+    "Returns the last value in the window frame",
     NthValue::last
 );
 get_or_init_udwf!(
     NthValue,
     nth_value,
-    "returns the nth value in the window frame",
+    nth_value_udwf,
+    "Returns the nth value in the window frame",
     NthValue::nth
 );
 
-/// Create an expression to represent the `first_value` window function
-///
-pub fn first_value(arg: datafusion_expr::Expr) -> datafusion_expr::Expr {
-    first_value_udwf().call(vec![arg])
-}
-
-/// Create an expression to represent the `last_value` window function
-///
-pub fn last_value(arg: datafusion_expr::Expr) -> datafusion_expr::Expr {
-    last_value_udwf().call(vec![arg])
-}
-
 /// Create an expression to represent the `nth_value` window function
-///
 pub fn nth_value(arg: datafusion_expr::Expr, n: i64) -> datafusion_expr::Expr {
     nth_value_udwf().call(vec![arg, n.lit()])
 }
 
 /// Tag to differentiate special use cases of the NTH_VALUE built-in window function.
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum NthValueKind {
     First,
     Last,
@@ -93,7 +88,7 @@ impl NthValueKind {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NthValue {
     signature: Signature,
     kind: NthValueKind,
@@ -105,7 +100,7 @@ impl NthValue {
         Self {
             signature: Signature::one_of(
                 vec![
-                    TypeSignature::Any(0),
+                    TypeSignature::Nullary,
                     TypeSignature::Any(1),
                     TypeSignature::Any(2),
                 ],
@@ -125,6 +120,10 @@ impl NthValue {
     pub fn nth() -> Self {
         Self::new(NthValueKind::Nth)
     }
+
+    pub fn kind(&self) -> &NthValueKind {
+        &self.kind
+    }
 }
 
 static FIRST_VALUE_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
@@ -135,16 +134,16 @@ static FIRST_VALUE_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
         "first_value(expression)",
     )
     .with_argument("expression", "Expression to operate on")
-        .with_sql_example(r#"```sql
-    --Example usage of the first_value window function:
-    SELECT department,
-           employee_id,
-           salary,
-           first_value(salary) OVER (PARTITION BY department ORDER BY salary DESC) AS top_salary
-    FROM employees;
-```
-
+    .with_sql_example(
+        r#"
 ```sql
+-- Example usage of the first_value window function:
+SELECT department,
+  employee_id,
+  salary,
+  first_value(salary) OVER (PARTITION BY department ORDER BY salary DESC) AS top_salary
+FROM employees;
+
 +-------------+-------------+--------+------------+
 | department  | employee_id | salary | top_salary |
 +-------------+-------------+--------+------------+
@@ -154,7 +153,9 @@ static FIRST_VALUE_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
 | Engineering | 4           | 90000  | 90000      |
 | Engineering | 5           | 80000  | 90000      |
 +-------------+-------------+--------+------------+
-```"#)
+```
+"#,
+    )
     .build()
 });
 
@@ -177,9 +178,7 @@ SELECT department,
        salary,
        last_value(salary) OVER (PARTITION BY department ORDER BY salary) AS running_last_salary
 FROM employees;
-```
 
-```sql
 +-------------+-------------+--------+---------------------+
 | department  | employee_id | salary | running_last_salary |
 +-------------+-------------+--------+---------------------+
@@ -189,7 +188,8 @@ FROM employees;
 | Engineering | 4           | 40000  | 40000               |
 | Engineering | 5           | 60000  | 60000               |
 +-------------+-------------+--------+---------------------+
-```"#)
+```
+"#)
     .build()
 });
 
@@ -213,7 +213,8 @@ static NTH_VALUE_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
         "Integer. Specifies the row number (starting from 1) in the window frame.",
     )
     .with_sql_example(
-        r#"```sql
+        r#"
+```sql
 -- Sample employees table:
 CREATE TABLE employees (id INT, salary INT);
 INSERT INTO employees (id, salary) VALUES
@@ -229,9 +230,7 @@ SELECT nth_value(salary, 2) OVER (
   ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
 ) AS nth_value
 FROM employees;
-```
 
-```text
 +-----------+
 | nth_value |
 +-----------+
@@ -241,7 +240,8 @@ FROM employees;
 | 40000     |
 | 40000     |
 +-----------+
-```"#,
+```
+"#,
     )
     .build()
 });
@@ -272,7 +272,7 @@ impl WindowUDFImpl for NthValue {
             kind: self.kind,
         };
 
-        if !matches!(self.kind, NthValueKind::Nth) {
+        if self.kind != NthValueKind::Nth {
             return Ok(Box::new(NthValueEvaluator {
                 state,
                 ignore_nulls: partition_evaluator_args.ignore_nulls(),
@@ -280,27 +280,30 @@ impl WindowUDFImpl for NthValue {
             }));
         }
 
-        let n =
-            match get_scalar_value_from_args(partition_evaluator_args.input_exprs(), 1)
-                .map_err(|_e| {
-                    exec_datafusion_err!(
-                "Expected a signed integer literal for the second argument of nth_value")
-                })?
-                .map(get_signed_integer)
-            {
-                Some(Ok(n)) => {
-                    if partition_evaluator_args.is_reversed() {
-                        -n
-                    } else {
-                        n
-                    }
-                }
-                _ => {
-                    return exec_err!(
+        let n = match get_scalar_value_from_args(
+            partition_evaluator_args.input_exprs(),
+            1,
+        )
+        .map_err(|_e| {
+            exec_datafusion_err!(
                 "Expected a signed integer literal for the second argument of nth_value"
             )
+        })?
+        .map(|v| get_signed_integer(&v))
+        {
+            Some(Ok(n)) => {
+                if partition_evaluator_args.is_reversed() {
+                    -n
+                } else {
+                    n
                 }
-            };
+            }
+            _ => {
+                return exec_err!(
+                    "Expected a signed integer literal for the second argument of nth_value"
+                );
+            }
+        };
 
         Ok(Box::new(NthValueEvaluator {
             state,
@@ -335,6 +338,10 @@ impl WindowUDFImpl for NthValue {
             NthValueKind::Nth => Some(get_nth_value_doc()),
         }
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::None // NthValue is causal
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -367,6 +374,33 @@ impl PartitionEvaluator for NthValueEvaluator {
     fn memoize(&mut self, state: &mut WindowAggState) -> Result<()> {
         let out = &state.out_col;
         let size = out.len();
+        if self.ignore_nulls {
+            match self.state.kind {
+                // Prune on first non-null output in case of FIRST_VALUE
+                NthValueKind::First => {
+                    if let Some(nulls) = out.nulls() {
+                        if self.state.finalized_result.is_none() {
+                            if let Some(valid_index) = nulls.valid_indices().next() {
+                                let result =
+                                    ScalarValue::try_from_array(out, valid_index)?;
+                                self.state.finalized_result = Some(result);
+                            } else {
+                                // The output is empty or all nulls, ignore
+                            }
+                        }
+                        if state.window_frame_range.start < state.window_frame_range.end {
+                            state.window_frame_range.start =
+                                state.window_frame_range.end - 1;
+                        }
+                        return Ok(());
+                    } else {
+                        // Fall through to the main case because there are no nulls
+                    }
+                }
+                // Do not memoize for other kinds when nulls are ignored
+                NthValueKind::Last | NthValueKind::Nth => return Ok(()),
+            }
+        }
         let mut buffer_size = 1;
         // Decide if we arrived at a final result yet:
         let (is_prunable, is_reverse_direction) = match self.state.kind {
@@ -394,8 +428,7 @@ impl PartitionEvaluator for NthValueEvaluator {
                 }
             }
         };
-        // Do not memoize results when nulls are ignored.
-        if is_prunable && !self.ignore_nulls {
+        if is_prunable {
             if self.state.finalized_result.is_none() && !is_reverse_direction {
                 let result = ScalarValue::try_from_array(out, size - 1)?;
                 self.state.finalized_result = Some(result);
@@ -421,99 +454,90 @@ impl PartitionEvaluator for NthValueEvaluator {
                 // We produce None if the window is empty.
                 return ScalarValue::try_from(arr.data_type());
             }
+            match self.valid_index(arr, range) {
+                Some(index) => ScalarValue::try_from_array(arr, index),
+                None => ScalarValue::try_from(arr.data_type()),
+            }
+        }
+    }
 
-            // If null values exist and need to be ignored, extract the valid indices.
-            let valid_indices = if self.ignore_nulls {
-                // Calculate valid indices, inside the window frame boundaries.
-                let slice = arr.slice(range.start, n_range);
-                match slice.nulls() {
-                    Some(nulls) => {
-                        let valid_indices = nulls
-                            .valid_indices()
-                            .map(|idx| {
-                                // Add offset `range.start` to valid indices, to point correct index in the original arr.
-                                idx + range.start
-                            })
-                            .collect::<Vec<_>>();
-                        if valid_indices.is_empty() {
-                            // If all values are null, return directly.
-                            return ScalarValue::try_from(arr.data_type());
-                        }
-                        Some(valid_indices)
-                    }
-                    None => None,
-                }
-            } else {
-                None
-            };
-            match self.state.kind {
-                NthValueKind::First => {
-                    if let Some(valid_indices) = &valid_indices {
-                        ScalarValue::try_from_array(arr, valid_indices[0])
+    fn supports_bounded_execution(&self) -> bool {
+        true
+    }
+
+    fn uses_window_frame(&self) -> bool {
+        true
+    }
+}
+
+impl NthValueEvaluator {
+    fn valid_index(&self, array: &ArrayRef, range: &Range<usize>) -> Option<usize> {
+        let n_range = range.end - range.start;
+        if self.ignore_nulls {
+            // Calculate valid indices, inside the window frame boundaries.
+            let slice = array.slice(range.start, n_range);
+            if let Some(nulls) = slice.nulls()
+                && nulls.null_count() > 0
+            {
+                return self.valid_index_with_nulls(nulls, range.start);
+            }
+        }
+        // Either no nulls, or nulls are regarded as valid rows
+        match self.state.kind {
+            NthValueKind::First => Some(range.start),
+            NthValueKind::Last => Some(range.end - 1),
+            NthValueKind::Nth => match self.n.cmp(&0) {
+                Ordering::Greater => {
+                    // SQL indices are not 0-based.
+                    let index = (self.n as usize) - 1;
+                    if index >= n_range {
+                        // Outside the range, return NULL:
+                        None
                     } else {
-                        ScalarValue::try_from_array(arr, range.start)
+                        Some(range.start + index)
                     }
                 }
-                NthValueKind::Last => {
-                    if let Some(valid_indices) = &valid_indices {
-                        ScalarValue::try_from_array(
-                            arr,
-                            valid_indices[valid_indices.len() - 1],
-                        )
+                Ordering::Less => {
+                    let reverse_index = (-self.n) as usize;
+                    if n_range < reverse_index {
+                        // Outside the range, return NULL:
+                        None
                     } else {
-                        ScalarValue::try_from_array(arr, range.end - 1)
+                        Some(range.end - reverse_index)
                     }
                 }
-                NthValueKind::Nth => {
-                    match self.n.cmp(&0) {
-                        Ordering::Greater => {
-                            // SQL indices are not 0-based.
-                            let index = (self.n as usize) - 1;
-                            if index >= n_range {
-                                // Outside the range, return NULL:
-                                ScalarValue::try_from(arr.data_type())
-                            } else if let Some(valid_indices) = valid_indices {
-                                if index >= valid_indices.len() {
-                                    return ScalarValue::try_from(arr.data_type());
-                                }
-                                ScalarValue::try_from_array(&arr, valid_indices[index])
-                            } else {
-                                ScalarValue::try_from_array(arr, range.start + index)
-                            }
-                        }
-                        Ordering::Less => {
-                            let reverse_index = (-self.n) as usize;
-                            if n_range < reverse_index {
-                                // Outside the range, return NULL:
-                                ScalarValue::try_from(arr.data_type())
-                            } else if let Some(valid_indices) = valid_indices {
-                                if reverse_index > valid_indices.len() {
-                                    return ScalarValue::try_from(arr.data_type());
-                                }
-                                let new_index =
-                                    valid_indices[valid_indices.len() - reverse_index];
-                                ScalarValue::try_from_array(&arr, new_index)
-                            } else {
-                                ScalarValue::try_from_array(
-                                    arr,
-                                    range.start + n_range - reverse_index,
-                                )
-                            }
+                Ordering::Equal => None,
+            },
+        }
+    }
+
+    fn valid_index_with_nulls(&self, nulls: &NullBuffer, offset: usize) -> Option<usize> {
+        match self.state.kind {
+            NthValueKind::First => nulls.valid_indices().next().map(|idx| idx + offset),
+            NthValueKind::Last => nulls.valid_indices().last().map(|idx| idx + offset),
+            NthValueKind::Nth => {
+                match self.n.cmp(&0) {
+                    Ordering::Greater => {
+                        // SQL indices are not 0-based.
+                        let index = (self.n as usize) - 1;
+                        nulls.valid_indices().nth(index).map(|idx| idx + offset)
+                    }
+                    Ordering::Less => {
+                        let reverse_index = (-self.n) as usize;
+                        let valid_indices_len = nulls.len() - nulls.null_count();
+                        if reverse_index > valid_indices_len {
+                            return None;
                         }
-                        Ordering::Equal => ScalarValue::try_from(arr.data_type()),
+                        nulls
+                            .valid_indices()
+                            .nth(valid_indices_len - reverse_index)
+                            .map(|idx| idx + offset)
                     }
+                    Ordering::Equal => None,
                 }
             }
         }
     }
-
-    fn supports_bounded_execution(&self) -> bool {
-        true
-    }
-
-    fn uses_window_frame(&self) -> bool {
-        true
-    }
 }
 
 #[cfg(test)]
diff --git a/datafusion/functions-window/src/ntile.rs b/datafusion/functions-window/src/ntile.rs
index 6b4c0960e695c..1f9b2344e5c75 100644
--- a/datafusion/functions-window/src/ntile.rs
+++ b/datafusion/functions-window/src/ntile.rs
@@ -23,28 +23,27 @@ use crate::utils::{
 use arrow::datatypes::FieldRef;
 use datafusion_common::arrow::array::{ArrayRef, UInt64Array};
 use datafusion_common::arrow::datatypes::{DataType, Field};
-use datafusion_common::{exec_err, DataFusionError, Result};
+use datafusion_common::{Result, exec_datafusion_err, exec_err};
 use datafusion_expr::{
-    Documentation, Expr, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use datafusion_macros::user_doc;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use field::WindowUDFFieldArgs;
 use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
-get_or_init_udwf!(
+define_udwf_and_expr!(
     Ntile,
     ntile,
-    "integer ranging from 1 to the argument value, dividing the partition as equally as possible"
+    [arg],
+    ntile_udwf,
+    "Integer ranging from 1 to the argument value, dividing the partition as equally as possible."
 );
 
-pub fn ntile(arg: Expr) -> Expr {
-    ntile_udwf().call(vec![arg])
-}
-
 #[user_doc(
     doc_section(label = "Ranking Functions"),
     description = "Integer ranging from 1 to the argument value, dividing the partition as equally as possible",
@@ -53,15 +52,14 @@ pub fn ntile(arg: Expr) -> Expr {
         name = "expression",
         description = "An integer describing the number groups the partition should be split into"
     ),
-    sql_example = r#"```sql
-    --Example usage of the ntile window function:
-    SELECT employee_id,
-           salary,
-           ntile(4) OVER (ORDER BY salary DESC) AS quartile
-    FROM employees;
-```
-
+    sql_example = r#"
 ```sql
+-- Example usage of the ntile window function:
+SELECT employee_id,
+    salary,
+    ntile(4) OVER (ORDER BY salary DESC) AS quartile
+FROM employees;
+
 +-------------+--------+----------+
 | employee_id | salary | quartile |
 +-------------+--------+----------+
@@ -74,9 +72,10 @@ pub fn ntile(arg: Expr) -> Expr {
 | 7           | 40000  | 4        |
 | 8           | 30000  | 4        |
 +-------------+--------+----------+
-```"#
+```
+"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Ntile {
     signature: Signature,
 }
@@ -129,9 +128,7 @@ impl WindowUDFImpl for Ntile {
         let scalar_n =
             get_scalar_value_from_args(partition_evaluator_args.input_exprs(), 0)?
                 .ok_or_else(|| {
-                    DataFusionError::Execution(
-                        "NTILE requires a positive integer".to_string(),
-                    )
+                    exec_datafusion_err!("NTILE requires a positive integer")
                 })?;
 
         if scalar_n.is_null() {
@@ -139,10 +136,10 @@ impl WindowUDFImpl for Ntile {
         }
 
         if scalar_n.is_unsigned() {
-            let n = get_unsigned_integer(scalar_n)?;
+            let n = get_unsigned_integer(&scalar_n)?;
             Ok(Box::new(NtileEvaluator { n }))
         } else {
-            let n: i64 = get_signed_integer(scalar_n)?;
+            let n: i64 = get_signed_integer(&scalar_n)?;
             if n <= 0 {
                 return exec_err!("NTILE requires a positive integer");
             }
@@ -158,6 +155,10 @@ impl WindowUDFImpl for Ntile {
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 #[derive(Debug)]
diff --git a/datafusion/functions-window/src/planner.rs b/datafusion/functions-window/src/planner.rs
index 1ddd8b27c4205..6f4eb2051f047 100644
--- a/datafusion/functions-window/src/planner.rs
+++ b/datafusion/functions-window/src/planner.rs
@@ -19,11 +19,11 @@
 
 use datafusion_common::Result;
 use datafusion_expr::{
+    Expr,
     expr::{WindowFunction, WindowFunctionParams},
     expr_rewriter::NamePreserver,
     planner::{ExprPlanner, PlannerResult, RawWindowExpr},
     utils::COUNT_STAR_EXPANSION,
-    Expr, ExprFunctionExt,
 };
 
 #[derive(Debug)]
@@ -40,23 +40,30 @@ impl ExprPlanner for WindowFunctionPlanner {
             partition_by,
             order_by,
             window_frame,
+            filter,
             null_treatment,
+            distinct,
         } = raw_expr;
 
-        let origin_expr = Expr::WindowFunction(WindowFunction {
+        let origin_expr = Expr::from(WindowFunction {
             fun: func_def,
             params: WindowFunctionParams {
                 args,
                 partition_by,
                 order_by,
                 window_frame,
+                filter,
                 null_treatment,
+                distinct,
             },
         });
 
         let saved_name = NamePreserver::new_for_projection().save(&origin_expr);
 
-        let Expr::WindowFunction(WindowFunction {
+        let Expr::WindowFunction(window_fun) = origin_expr else {
+            unreachable!("")
+        };
+        let WindowFunction {
             fun,
             params:
                 WindowFunctionParams {
@@ -65,18 +72,19 @@ impl ExprPlanner for WindowFunctionPlanner {
                     order_by,
                     window_frame,
                     null_treatment,
+                    distinct,
+                    filter,
                 },
-        }) = origin_expr
-        else {
-            unreachable!("")
-        };
+        } = *window_fun;
         let raw_expr = RawWindowExpr {
             func_def: fun,
             args,
             partition_by,
             order_by,
             window_frame,
+            filter,
             null_treatment,
+            distinct,
         };
 
         // TODO: remove the next line after `Expr::Wildcard` is removed
@@ -92,19 +100,23 @@ impl ExprPlanner for WindowFunctionPlanner {
                 partition_by,
                 order_by,
                 window_frame,
+                filter,
                 null_treatment,
+                distinct,
             } = raw_expr;
 
-            let new_expr = Expr::WindowFunction(WindowFunction::new(
-                func_def,
-                vec![Expr::Literal(COUNT_STAR_EXPANSION)],
-            ))
-            .partition_by(partition_by)
-            .order_by(order_by)
-            .window_frame(window_frame)
-            .null_treatment(null_treatment)
-            .build()?;
-
+            let new_expr = Expr::from(WindowFunction {
+                fun: func_def,
+                params: WindowFunctionParams {
+                    args: vec![Expr::Literal(COUNT_STAR_EXPANSION, None)],
+                    partition_by,
+                    order_by,
+                    window_frame,
+                    filter,
+                    null_treatment,
+                    distinct,
+                },
+            });
             let new_expr = saved_name.restore(new_expr);
 
             return Ok(PlannerResult::Planned(new_expr));
diff --git a/datafusion/functions-window/src/rank.rs b/datafusion/functions-window/src/rank.rs
index 969a957cddd9c..ee8546703b1f9 100644
--- a/datafusion/functions-window/src/rank.rs
+++ b/datafusion/functions-window/src/rank.rs
@@ -18,7 +18,6 @@
 //! Implementation of `rank`, `dense_rank`, and `percent_rank` window functions,
 //! which can be evaluated at runtime during query execution.
 
-use crate::define_udwf_and_expr;
 use arrow::datatypes::FieldRef;
 use datafusion_common::arrow::array::ArrayRef;
 use datafusion_common::arrow::array::{Float64Array, UInt64Array};
@@ -26,16 +25,18 @@ use datafusion_common::arrow::compute::SortOptions;
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_common::arrow::datatypes::Field;
 use datafusion_common::utils::get_row_at_idx;
-use datafusion_common::{exec_err, Result, ScalarValue};
-use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_doc::window_doc_sections::DOC_SECTION_RANKING;
 use datafusion_expr::{
-    Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use field::WindowUDFFieldArgs;
 use std::any::Any;
 use std::fmt::Debug;
+use std::hash::Hash;
 use std::iter;
 use std::ops::Range;
 use std::sync::{Arc, LazyLock};
@@ -43,6 +44,7 @@ use std::sync::{Arc, LazyLock};
 define_udwf_and_expr!(
     Rank,
     rank,
+    rank_udwf,
     "Returns rank of the current row with gaps. Same as `row_number` of its first peer",
     Rank::basic
 );
@@ -50,6 +52,7 @@ define_udwf_and_expr!(
 define_udwf_and_expr!(
     DenseRank,
     dense_rank,
+    dense_rank_udwf,
     "Returns rank of the current row without gaps. This function counts peer groups",
     Rank::dense_rank
 );
@@ -57,12 +60,13 @@ define_udwf_and_expr!(
 define_udwf_and_expr!(
     PercentRank,
     percent_rank,
+    percent_rank_udwf,
     "Returns the relative rank of the current row: (rank - 1) / (total rows - 1)",
     Rank::percent_rank
 );
 
 /// Rank calculates the rank in the window function with order by
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Rank {
     name: String,
     signature: Signature,
@@ -95,7 +99,7 @@ impl Rank {
     }
 }
 
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum RankType {
     Basic,
     Dense,
@@ -110,15 +114,14 @@ static RANK_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
             skips ranks for identical values.",
 
         "rank()")
-        .with_sql_example(r#"```sql
-    --Example usage of the rank window function:
-    SELECT department,
-           salary,
-           rank() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
-    FROM employees;
-```
-
+        .with_sql_example(r#"
 ```sql
+-- Example usage of the rank window function:
+SELECT department,
+    salary,
+    rank() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
+FROM employees;
+
 +-------------+--------+------+
 | department  | salary | rank |
 +-------------+--------+------+
@@ -129,7 +132,8 @@ static RANK_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
 | Engineering | 90000  | 1    |
 | Engineering | 80000  | 2    |
 +-------------+--------+------+
-```"#)
+```
+"#)
         .build()
 });
 
@@ -141,15 +145,14 @@ static DENSE_RANK_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
     Documentation::builder(DOC_SECTION_RANKING, "Returns the rank of the current row without gaps. This function ranks \
             rows in a dense manner, meaning consecutive ranks are assigned even for identical \
             values.", "dense_rank()")
-        .with_sql_example(r#"```sql
-    --Example usage of the dense_rank window function:
-    SELECT department,
-           salary,
-           dense_rank() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
-    FROM employees;
-```
-
+        .with_sql_example(r#"
 ```sql
+-- Example usage of the dense_rank window function:
+SELECT department,
+    salary,
+    dense_rank() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
+FROM employees;
+
 +-------------+--------+------------+
 | department  | salary | dense_rank |
 +-------------+--------+------------+
@@ -172,14 +175,12 @@ static PERCENT_RANK_DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
     Documentation::builder(DOC_SECTION_RANKING, "Returns the percentage rank of the current row within its partition. \
             The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`.", "percent_rank()")
         .with_sql_example(r#"```sql
-    --Example usage of the percent_rank window function:
-    SELECT employee_id,
-           salary,
-           percent_rank() OVER (ORDER BY salary) AS percent_rank
-    FROM employees;
-```
+    -- Example usage of the percent_rank window function:
+SELECT employee_id,
+    salary,
+    percent_rank() OVER (ORDER BY salary) AS percent_rank
+FROM employees;
 
-```sql
 +-------------+--------+---------------+
 | employee_id | salary | percent_rank  |
 +-------------+--------+---------------+
@@ -242,6 +243,14 @@ impl WindowUDFImpl for Rank {
             RankType::Percent => Some(get_percent_rank_doc()),
         }
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        match self.rank_type {
+            RankType::Basic => LimitEffect::None,
+            RankType::Dense => LimitEffect::None,
+            RankType::Percent => LimitEffect::Unknown,
+        }
+    }
 }
 
 /// State for the RANK(rank) built-in window function.
@@ -374,7 +383,7 @@ mod tests {
         test_i32_result(expr, vec![0..2, 2..3, 3..6, 6..7, 7..8], expected)
     }
 
-    #[allow(clippy::single_range_in_vec_init)]
+    #[expect(clippy::single_range_in_vec_init)]
     fn test_without_rank(expr: &Rank, expected: Vec<u64>) -> Result<()> {
         test_i32_result(expr, vec![0..8], expected)
     }
@@ -427,7 +436,7 @@ mod tests {
     }
 
     #[test]
-    #[allow(clippy::single_range_in_vec_init)]
+    #[expect(clippy::single_range_in_vec_init)]
     fn test_percent_rank() -> Result<()> {
         let r = Rank::percent_rank();
 
diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs
index ba8627dd86d79..cd60e51def72e 100644
--- a/datafusion/functions-window/src/row_number.rs
+++ b/datafusion/functions-window/src/row_number.rs
@@ -25,19 +25,22 @@ use datafusion_common::arrow::datatypes::DataType;
 use datafusion_common::arrow::datatypes::Field;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{
-    Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
+    Documentation, LimitEffect, PartitionEvaluator, Signature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use datafusion_macros::user_doc;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use field::WindowUDFFieldArgs;
 use std::any::Any;
 use std::fmt::Debug;
 use std::ops::Range;
+use std::sync::Arc;
 
 define_udwf_and_expr!(
     RowNumber,
     row_number,
+    row_number_udwf,
     "Returns a unique row number for each row in window partition beginning at 1."
 );
 
@@ -46,15 +49,14 @@ define_udwf_and_expr!(
     doc_section(label = "Ranking Functions"),
     description = "Number of the current row within its partition, counting from 1.",
     syntax_example = "row_number()",
-    sql_example = r"```sql
-    --Example usage of the row_number window function:
-    SELECT department,
-           salary,
-           row_number() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
-    FROM employees;
-```
-
+    sql_example = r#"
 ```sql
+-- Example usage of the row_number window function:
+SELECT department,
+  salary,
+  row_number() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
+FROM employees;
+
 +-------------+--------+---------+
 | department  | salary | row_num |
 +-------------+--------+---------+
@@ -65,9 +67,10 @@ define_udwf_and_expr!(
 | Engineering | 90000  | 1       |
 | Engineering | 80000  | 2       |
 +-------------+--------+---------+
-```#"
+```
+"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RowNumber {
     signature: Signature,
 }
@@ -121,6 +124,10 @@ impl WindowUDFImpl for RowNumber {
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::None
+    }
 }
 
 /// State for the `row_number` built-in window function.
@@ -140,7 +147,7 @@ impl PartitionEvaluator for NumRowsEvaluator {
         _values: &[ArrayRef],
         num_rows: usize,
     ) -> Result<ArrayRef> {
-        Ok(std::sync::Arc::new(UInt64Array::from_iter_values(
+        Ok(Arc::new(UInt64Array::from_iter_values(
             1..(num_rows as u64) + 1,
         )))
     }
diff --git a/datafusion/functions-window/src/utils.rs b/datafusion/functions-window/src/utils.rs
index 3f8061dbea3e1..4d0c4d181aefa 100644
--- a/datafusion/functions-window/src/utils.rs
+++ b/datafusion/functions-window/src/utils.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use datafusion_common::arrow::datatypes::DataType;
-use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err};
 use datafusion_physical_expr::expressions::Literal;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use std::sync::Arc;
 
-pub(crate) fn get_signed_integer(value: ScalarValue) -> Result<i64> {
+pub(crate) fn get_signed_integer(value: &ScalarValue) -> Result<i64> {
     if value.is_null() {
         return Ok(0);
     }
@@ -52,7 +52,7 @@ pub(crate) fn get_scalar_value_from_args(
     })
 }
 
-pub(crate) fn get_unsigned_integer(value: ScalarValue) -> Result<u64> {
+pub(crate) fn get_unsigned_integer(value: &ScalarValue) -> Result<u64> {
     if value.is_null() {
         return Ok(0);
     }
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 0c4280babc70a..1940f1378b635 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -31,13 +31,16 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [features]
 crypto_expressions = ["md-5", "sha2", "blake2", "blake3"]
 # enable datetime functions
-datetime_expressions = []
+datetime_expressions = ["chrono-tz"]
 # Enable encoding by default so the doctests work. In general don't automatically enable all packages.
 default = [
     "datetime_expressions",
@@ -68,25 +71,30 @@ base64 = { version = "0.22", optional = true }
 blake2 = { version = "^0.10.2", optional = true }
 blake3 = { version = "1.8", optional = true }
 chrono = { workspace = true }
+chrono-tz = { version = "0.10.4", optional = true }
 datafusion-common = { workspace = true }
 datafusion-doc = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true }
 datafusion-macros = { workspace = true }
-hex = { version = "0.4", optional = true }
+hex = { workspace = true, optional = true }
 itertools = { workspace = true }
 log = { workspace = true }
 md-5 = { version = "^0.10.0", optional = true }
+memchr = { workspace = true }
+num-traits = { workspace = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
-sha2 = { version = "^0.10.9", optional = true }
+sha2 = { workspace = true, optional = true }
 unicode-segmentation = { version = "^1.7.1", optional = true }
-uuid = { version = "1.17", features = ["v4"], optional = true }
+uuid = { workspace = true, features = ["v4"], optional = true }
 
 [dev-dependencies]
 arrow = { workspace = true, features = ["test_utils"] }
 criterion = { workspace = true }
+ctor = { workspace = true }
+env_logger = { workspace = true }
 rand = { workspace = true }
 tokio = { workspace = true, features = ["macros", "rt", "sync"] }
 
@@ -100,6 +108,11 @@ harness = false
 name = "concat"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "concat_ws"
+required-features = ["string_expressions"]
+
 [[bench]]
 harness = false
 name = "to_timestamp"
@@ -120,6 +133,11 @@ harness = false
 name = "gcd"
 required-features = ["math_expressions"]
 
+[[bench]]
+harness = false
+name = "nanvl"
+required-features = ["math_expressions"]
+
 [[bench]]
 harness = false
 name = "uuid"
@@ -174,6 +192,11 @@ harness = false
 name = "signum"
 required-features = ["math_expressions"]
 
+[[bench]]
+harness = false
+name = "atan2"
+required-features = ["math_expressions"]
+
 [[bench]]
 harness = false
 name = "substr_index"
@@ -181,7 +204,7 @@ required-features = ["unicode_expressions"]
 
 [[bench]]
 harness = false
-name = "ltrim"
+name = "trim"
 required-features = ["string_expressions"]
 
 [[bench]]
@@ -204,6 +227,11 @@ harness = false
 name = "repeat"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "replace"
+required-features = ["string_expressions"]
+
 [[bench]]
 harness = false
 name = "random"
@@ -248,3 +276,63 @@ required-features = ["unicode_expressions"]
 harness = false
 name = "find_in_set"
 required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "contains"
+required-features = ["string_expressions"]
+
+[[bench]]
+harness = false
+name = "starts_with"
+required-features = ["string_expressions"]
+
+[[bench]]
+harness = false
+name = "ends_with"
+required-features = ["string_expressions"]
+
+[[bench]]
+harness = false
+name = "regexp_count"
+required-features = ["regex_expressions"]
+
+[[bench]]
+harness = false
+name = "crypto"
+required-features = ["crypto_expressions"]
+
+[[bench]]
+harness = false
+name = "translate"
+required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "levenshtein"
+required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "split_part"
+required-features = ["string_expressions"]
+
+[[bench]]
+harness = false
+name = "left_right"
+required-features = ["unicode_expressions"]
+
+[[bench]]
+harness = false
+name = "factorial"
+required-features = ["math_expressions"]
+
+[[bench]]
+harness = false
+name = "floor_ceil"
+required-features = ["math_expressions"]
+
+[[bench]]
+harness = false
+name = "round"
+required-features = ["math_expressions"]
diff --git a/datafusion/functions/README.md b/datafusion/functions/README.md
index a610d135c0f68..dee1330422727 100644
--- a/datafusion/functions/README.md
+++ b/datafusion/functions/README.md
@@ -17,11 +17,17 @@
   under the License.
 -->
 
-# DataFusion Function Library
+# Apache DataFusion Function Library
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate contains packages of function that can be used to customize the
 functionality of DataFusion.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/functions/benches/ascii.rs b/datafusion/functions/benches/ascii.rs
index 1c7023f4497e6..a2424ed352afc 100644
--- a/datafusion/functions/benches/ascii.rs
+++ b/datafusion/functions/benches/ascii.rs
@@ -15,17 +15,47 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
 mod helper;
 
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use datafusion_expr::ScalarFunctionArgs;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use helper::gen_string_array;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let ascii = datafusion_functions::string::ascii();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Scalar benchmarks (outside loop)
+    c.bench_function("ascii/scalar_utf8", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                "hello".to_string(),
+            )))],
+            arg_fields: vec![Field::new("a", DataType::Utf8, false).into()],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Int32, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+        b.iter(|| black_box(ascii.invoke_with_args(args.clone()).unwrap()))
+    });
+
+    c.bench_function("ascii/scalar_utf8view", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                "hello".to_string(),
+            )))],
+            arg_fields: vec![Field::new("a", DataType::Utf8View, false).into()],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Int32, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+        b.iter(|| black_box(ascii.invoke_with_args(args.clone()).unwrap()))
+    });
 
     // All benches are single batch run with 8192 rows
     const N_ROWS: usize = 8192;
@@ -46,6 +76,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         let arg_fields =
             vec![Field::new("a", args_string_ascii[0].data_type(), true).into()];
         let return_field = Field::new("f", DataType::Utf8, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
 
         c.bench_function(
             format!("ascii/string_ascii_only (null_density={null_density})").as_str(),
@@ -56,6 +87,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: N_ROWS,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -76,6 +108,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: N_ROWS,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -102,6 +135,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: N_ROWS,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -122,6 +156,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: N_ROWS,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
diff --git a/datafusion/functions/benches/atan2.rs b/datafusion/functions/benches/atan2.rs
new file mode 100644
index 0000000000000..f1c9756a0cc08
--- /dev/null
+++ b/datafusion/functions/benches/atan2.rs
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::datatypes::{DataType, Field, Float32Type, Float64Type};
+use arrow::util::bench_util::create_primitive_array;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::math::atan2;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let atan2_fn = atan2();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    for size in [1024, 4096, 8192] {
+        let y_f32 = Arc::new(create_primitive_array::<Float32Type>(size, 0.2));
+        let x_f32 = Arc::new(create_primitive_array::<Float32Type>(size, 0.2));
+        let f32_args = vec![ColumnarValue::Array(y_f32), ColumnarValue::Array(x_f32)];
+        let f32_arg_fields = f32_args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let return_field_f32 = Field::new("f", DataType::Float32, true).into();
+
+        c.bench_function(&format!("atan2 f32 array: {size}"), |b| {
+            b.iter(|| {
+                black_box(
+                    atan2_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: f32_args.clone(),
+                            arg_fields: f32_arg_fields.clone(),
+                            number_rows: size,
+                            return_field: Arc::clone(&return_field_f32),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        let y_f64 = Arc::new(create_primitive_array::<Float64Type>(size, 0.2));
+        let x_f64 = Arc::new(create_primitive_array::<Float64Type>(size, 0.2));
+        let f64_args = vec![ColumnarValue::Array(y_f64), ColumnarValue::Array(x_f64)];
+        let f64_arg_fields = f64_args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let return_field_f64 = Field::new("f", DataType::Float64, true).into();
+
+        c.bench_function(&format!("atan2 f64 array: {size}"), |b| {
+            b.iter(|| {
+                black_box(
+                    atan2_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: f64_args.clone(),
+                            arg_fields: f64_arg_fields.clone(),
+                            number_rows: size,
+                            return_field: Arc::clone(&return_field_f64),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+    }
+
+    let scalar_f32_args = vec![
+        ColumnarValue::Scalar(ScalarValue::Float32(Some(1.0))),
+        ColumnarValue::Scalar(ScalarValue::Float32(Some(2.0))),
+    ];
+    let scalar_f32_arg_fields = vec![
+        Field::new("a", DataType::Float32, false).into(),
+        Field::new("b", DataType::Float32, false).into(),
+    ];
+    let return_field_f32 = Field::new("f", DataType::Float32, false).into();
+
+    c.bench_function("atan2 f32 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                atan2_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f32_args.clone(),
+                        arg_fields: scalar_f32_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_f32),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+
+    let scalar_f64_args = vec![
+        ColumnarValue::Scalar(ScalarValue::Float64(Some(1.0))),
+        ColumnarValue::Scalar(ScalarValue::Float64(Some(2.0))),
+    ];
+    let scalar_f64_arg_fields = vec![
+        Field::new("a", DataType::Float64, false).into(),
+        Field::new("b", DataType::Float64, false).into(),
+    ];
+    let return_field_f64 = Field::new("f", DataType::Float64, false).into();
+
+    c.bench_function("atan2 f64 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                atan2_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f64_args.clone(),
+                        arg_fields: scalar_f64_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_f64),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/character_length.rs b/datafusion/functions/benches/character_length.rs
index b4a9e917f4160..4927627ec2f05 100644
--- a/datafusion/functions/benches/character_length.rs
+++ b/datafusion/functions/benches/character_length.rs
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::ScalarFunctionArgs;
 use helper::gen_string_array;
+use std::hint::black_box;
 use std::sync::Arc;
 
 mod helper;
@@ -30,6 +30,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     let character_length = datafusion_functions::unicode::character_length();
 
     let return_field = Arc::new(Field::new("f", DataType::Utf8, true));
+    let config_options = Arc::new(ConfigOptions::default());
 
     let n_rows = 8192;
     for str_len in [8, 32, 128, 4096] {
@@ -51,6 +52,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -74,6 +76,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -97,6 +100,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -120,6 +124,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
diff --git a/datafusion/functions/benches/chr.rs b/datafusion/functions/benches/chr.rs
index 6a956bb788127..a702dc161ae06 100644
--- a/datafusion/functions/benches/chr.rs
+++ b/datafusion/functions/benches/chr.rs
@@ -15,15 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::{array::PrimitiveArray, datatypes::Int64Type};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string::chr;
 use rand::{Rng, SeedableRng};
+use std::hint::black_box;
 
 use arrow::datatypes::{DataType, Field};
+use datafusion_common::config::ConfigOptions;
 use rand::rngs::StdRng;
 use std::sync::Arc;
 
@@ -33,11 +34,32 @@ pub fn seedable_rng() -> StdRng {
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
-    let cot_fn = chr();
+    let chr_fn = chr();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Scalar benchmarks
+    c.bench_function("chr/scalar", |b| {
+        let args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some(65)))];
+        let arg_fields = vec![Field::new("arg_0", DataType::Int64, true).into()];
+        b.iter(|| {
+            black_box(
+                chr_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+
     let size = 1024;
     let input: PrimitiveArray<Int64Type> = {
         let null_density = 0.2;
-        let mut rng = StdRng::seed_from_u64(42);
+        let mut rng = seedable_rng();
         (0..size)
             .map(|_| {
                 if rng.random::<f32>() < null_density {
@@ -56,15 +78,16 @@ fn criterion_benchmark(c: &mut Criterion) {
         .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
         .collect::<Vec<_>>();
 
-    c.bench_function("chr", |b| {
+    c.bench_function("chr/array", |b| {
         b.iter(|| {
             black_box(
-                cot_fn
+                chr_fn
                     .invoke_with_args(ScalarFunctionArgs {
                         args: args.clone(),
                         arg_fields: arg_fields.clone(),
                         number_rows: size,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     })
                     .unwrap(),
             )
diff --git a/datafusion/functions/benches/concat.rs b/datafusion/functions/benches/concat.rs
index d350c03c497bb..0fb910800e3bc 100644
--- a/datafusion/functions/benches/concat.rs
+++ b/datafusion/functions/benches/concat.rs
@@ -17,14 +17,18 @@
 
 use arrow::array::ArrayRef;
 use arrow::datatypes::{DataType, Field};
-use arrow::util::bench_util::create_string_array_with_len;
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use arrow::util::bench_util::{create_string_array_with_len, create_string_view_array};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string::concat;
+use rand::Rng;
+use rand::distr::Alphanumeric;
+use std::hint::black_box;
 use std::sync::Arc;
 
-fn create_args(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+fn create_array_args(size: usize, str_len: usize) -> Vec<ColumnarValue> {
     let array = Arc::new(create_string_array_with_len::<i32>(size, 0.2, str_len));
     let scalar = ScalarValue::Utf8(Some(", ".to_string()));
     vec![
@@ -34,9 +38,37 @@ fn create_args(size: usize, str_len: usize) -> Vec<ColumnarValue> {
     ]
 }
 
+fn create_array_args_view(size: usize) -> Vec<ColumnarValue> {
+    let array = Arc::new(create_string_view_array(size, 0.2));
+    let scalar = ScalarValue::Utf8(Some(", ".to_string()));
+    vec![
+        ColumnarValue::Array(Arc::clone(&array) as ArrayRef),
+        ColumnarValue::Scalar(scalar),
+        ColumnarValue::Array(array),
+    ]
+}
+
+fn generate_random_string(str_len: usize) -> String {
+    rand::rng()
+        .sample_iter(&Alphanumeric)
+        .take(str_len)
+        .map(char::from)
+        .collect()
+}
+
+fn create_scalar_args(count: usize, str_len: usize) -> Vec<ColumnarValue> {
+    std::iter::repeat_with(|| {
+        let s = generate_random_string(str_len);
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(s)))
+    })
+    .take(count)
+    .collect()
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
+    // Benchmark for array concat
     for size in [1024, 4096, 8192] {
-        let args = create_args(size, 32);
+        let args = create_array_args(size, 32);
         let arg_fields = args
             .iter()
             .enumerate()
@@ -44,18 +76,20 @@ fn criterion_benchmark(c: &mut Criterion) {
                 Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
             .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
 
         let mut group = c.benchmark_group("concat function");
         group.bench_function(BenchmarkId::new("concat", size), |b| {
             b.iter(|| {
                 let args_cloned = args.clone();
-                criterion::black_box(
+                black_box(
                     concat()
                         .invoke_with_args(ScalarFunctionArgs {
                             args: args_cloned,
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Field::new("f", DataType::Utf8, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
@@ -63,6 +97,70 @@ fn criterion_benchmark(c: &mut Criterion) {
         });
         group.finish();
     }
+
+    // Benchmark for StringViewArray concat
+    for size in [1024, 4096, 8192] {
+        let args = create_array_args_view(size);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                // Use Utf8View for array args
+                let dt = if matches!(arg, ColumnarValue::Array(_)) {
+                    DataType::Utf8View
+                } else {
+                    DataType::Utf8 // scalar remains Utf8
+                };
+                Field::new(format!("arg_{idx}"), dt, true).into()
+            })
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
+
+        let mut group = c.benchmark_group("concat function");
+        group.bench_function(BenchmarkId::new("concat_view", size), |b| {
+            b.iter(|| {
+                let args_cloned = args.clone();
+                black_box(
+                    concat()
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: arg_fields.clone(),
+                            number_rows: size,
+                            return_field: Field::new("f", DataType::Utf8View, true)
+                                .into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+        group.finish();
+    }
+
+    // Benchmark for scalar concat
+    let scalar_args = create_scalar_args(10, 100);
+    let scalar_arg_fields = scalar_args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let mut group = c.benchmark_group("concat function");
+    group.bench_function(BenchmarkId::new("concat", "scalar"), |b| {
+        b.iter(|| {
+            let args_cloned = scalar_args.clone();
+            black_box(
+                concat()
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: scalar_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::new(ConfigOptions::default()),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/concat_ws.rs b/datafusion/functions/benches/concat_ws.rs
new file mode 100644
index 0000000000000..97d6d96411d73
--- /dev/null
+++ b/datafusion/functions/benches/concat_ws.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::string::concat_ws;
+use rand::Rng;
+use rand::distr::Alphanumeric;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn create_array_args(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+    let array = Arc::new(create_string_array_with_len::<i32>(size, 0.2, str_len));
+    let scalar = ScalarValue::Utf8(Some(", ".to_string()));
+    vec![
+        ColumnarValue::Scalar(scalar),
+        ColumnarValue::Array(Arc::clone(&array) as ArrayRef),
+        ColumnarValue::Array(array),
+    ]
+}
+
+fn generate_random_string(str_len: usize) -> String {
+    rand::rng()
+        .sample_iter(&Alphanumeric)
+        .take(str_len)
+        .map(char::from)
+        .collect()
+}
+
+fn create_scalar_args(count: usize, str_len: usize) -> Vec<ColumnarValue> {
+    let mut args = Vec::with_capacity(count + 1);
+
+    args.push(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+        ",".to_string(),
+    ))));
+
+    for _ in 0..count {
+        let s = generate_random_string(str_len);
+        args.push(ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))));
+    }
+    args
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // Benchmark for array concat_ws
+    for size in [1024, 4096, 8192] {
+        let args = create_array_args(size, 32);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
+
+        let mut group = c.benchmark_group("concat_ws function");
+        group.bench_function(BenchmarkId::new("concat_ws", size), |b| {
+            b.iter(|| {
+                let args_cloned = args.clone();
+                black_box(
+                    concat_ws()
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: arg_fields.clone(),
+                            number_rows: size,
+                            return_field: Field::new("f", DataType::Utf8, true).into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+        group.finish();
+    }
+
+    // Benchmark for scalar concat_ws
+    let scalar_args = create_scalar_args(10, 100);
+    let scalar_arg_fields = scalar_args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let mut group = c.benchmark_group("concat_ws function");
+    group.bench_function(BenchmarkId::new("concat_ws", "scalar"), |b| {
+        b.iter(|| {
+            let args_cloned = scalar_args.clone();
+            black_box(
+                concat_ws()
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: scalar_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::new(ConfigOptions::default()),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/contains.rs b/datafusion/functions/benches/contains.rs
new file mode 100644
index 0000000000000..6c39f45e14fa6
--- /dev/null
+++ b/datafusion/functions/benches/contains.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use rand::distr::Alphanumeric;
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Generate a StringArray/StringViewArray with random ASCII strings
+fn gen_string_array(
+    n_rows: usize,
+    str_len: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let mut rng = StdRng::seed_from_u64(42);
+    let strings: Vec<Option<String>> = (0..n_rows)
+        .map(|_| {
+            let s: String = (&mut rng)
+                .sample_iter(&Alphanumeric)
+                .take(str_len)
+                .map(char::from)
+                .collect();
+            Some(s)
+        })
+        .collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+/// Generate a scalar search string
+fn gen_scalar_search(search_str: &str, is_string_view: bool) -> ColumnarValue {
+    if is_string_view {
+        ColumnarValue::Scalar(ScalarValue::Utf8View(Some(search_str.to_string())))
+    } else {
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(search_str.to_string())))
+    }
+}
+
+/// Generate an array of search strings (same string repeated)
+fn gen_array_search(
+    search_str: &str,
+    n_rows: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let strings: Vec<Option<String>> =
+        (0..n_rows).map(|_| Some(search_str.to_string())).collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let contains = datafusion_functions::string::contains();
+    let n_rows = 8192;
+    let str_len = 128;
+    let search_str = "xyz"; // A pattern that likely won't be found
+
+    // Benchmark: StringArray with scalar search (the optimized path)
+    let str_array = gen_string_array(n_rows, str_len, false);
+    let scalar_search = gen_scalar_search(search_str, false);
+    let arg_fields = vec![
+        Field::new("a", DataType::Utf8, true).into(),
+        Field::new("b", DataType::Utf8, true).into(),
+    ];
+    let return_field = Field::new("f", DataType::Boolean, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("contains_StringArray_scalar_search", |b| {
+        b.iter(|| {
+            black_box(contains.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), scalar_search.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringArray with array search (for comparison)
+    let array_search = gen_array_search(search_str, n_rows, false);
+    c.bench_function("contains_StringArray_array_search", |b| {
+        b.iter(|| {
+            black_box(contains.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), array_search.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with scalar search (the optimized path)
+    let str_view_array = gen_string_array(n_rows, str_len, true);
+    let scalar_search_view = gen_scalar_search(search_str, true);
+    let arg_fields_view = vec![
+        Field::new("a", DataType::Utf8View, true).into(),
+        Field::new("b", DataType::Utf8View, true).into(),
+    ];
+
+    c.bench_function("contains_StringViewArray_scalar_search", |b| {
+        b.iter(|| {
+            black_box(contains.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), scalar_search_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with array search (for comparison)
+    let array_search_view = gen_array_search(search_str, n_rows, true);
+    c.bench_function("contains_StringViewArray_array_search", |b| {
+        b.iter(|| {
+            black_box(contains.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), array_search_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark different string lengths with scalar search
+    for str_len in [8, 32, 128, 512] {
+        let str_array = gen_string_array(n_rows, str_len, true);
+        let scalar_search = gen_scalar_search(search_str, true);
+        let arg_fields = vec![
+            Field::new("a", DataType::Utf8View, true).into(),
+            Field::new("b", DataType::Utf8View, true).into(),
+        ];
+
+        c.bench_function(
+            &format!("contains_StringViewArray_scalar_strlen_{str_len}"),
+            |b| {
+                b.iter(|| {
+                    black_box(contains.invoke_with_args(ScalarFunctionArgs {
+                        args: vec![str_array.clone(), scalar_search.clone()],
+                        arg_fields: arg_fields.clone(),
+                        number_rows: n_rows,
+                        return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/cot.rs b/datafusion/functions/benches/cot.rs
index a32e0d834672c..16c3fba2175fe 100644
--- a/datafusion/functions/benches/cot.rs
+++ b/datafusion/functions/benches/cot.rs
@@ -15,21 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::{
     datatypes::{Float32Type, Float64Type},
     util::bench_util::create_primitive_array,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::cot;
+use std::hint::black_box;
 
 use arrow::datatypes::{DataType, Field};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let cot_fn = cot();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Array benchmarks - run for different sizes
     for size in [1024, 4096, 8192] {
         let f32_array = Arc::new(create_primitive_array::<Float32Type>(size, 0.2));
         let f32_args = vec![ColumnarValue::Array(f32_array)];
@@ -50,11 +54,13 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Field::new("f", DataType::Float32, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
             })
         });
+
         let f64_array = Arc::new(create_primitive_array::<Float64Type>(size, 0.2));
         let f64_args = vec![ColumnarValue::Array(f64_array)];
         let arg_fields = f64_args
@@ -75,12 +81,54 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
             })
         });
     }
+
+    // Scalar benchmarks - run only once since size doesn't affect scalar performance
+    let scalar_f32_args = vec![ColumnarValue::Scalar(ScalarValue::Float32(Some(1.0)))];
+    let scalar_f32_arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
+    let return_field_f32 = Field::new("f", DataType::Float32, false).into();
+
+    c.bench_function("cot f32 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                cot_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f32_args.clone(),
+                        arg_fields: scalar_f32_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_f32),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+
+    let scalar_f64_args = vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(1.0)))];
+    let scalar_f64_arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+    let return_field_f64 = Field::new("f", DataType::Float64, false).into();
+
+    c.bench_function("cot f64 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                cot_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f64_args.clone(),
+                        arg_fields: scalar_f64_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_f64),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/crypto.rs b/datafusion/functions/benches/crypto.rs
new file mode 100644
index 0000000000000..9a86efbff9ed8
--- /dev/null
+++ b/datafusion/functions/benches/crypto.rs
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::ScalarFunctionArgs;
+use datafusion_expr_common::columnar_value::ColumnarValue;
+use datafusion_functions::crypto;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let crypto = vec![
+        crypto::md5(),
+        crypto::sha224(),
+        crypto::sha256(),
+        crypto::sha384(),
+        crypto::sha512(),
+    ];
+    let config_options = Arc::new(ConfigOptions::default());
+
+    for func in crypto {
+        let size = 1024;
+        let arr_args = vec![ColumnarValue::Array(Arc::new(
+            create_string_array_with_len::<i32>(size, 0.2, 32),
+        ))];
+        c.bench_function(&format!("{}_array", func.name()), |b| {
+            b.iter(|| {
+                let args_cloned = arr_args.clone();
+                black_box(func.invoke_with_args(ScalarFunctionArgs {
+                    args: args_cloned,
+                    arg_fields: vec![Field::new("a", DataType::Utf8, true).into()],
+                    number_rows: size,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        });
+
+        let scalar_args = vec![ColumnarValue::Scalar("test_string".into())];
+        c.bench_function(&format!("{}_scalar", func.name()), |b| {
+            b.iter(|| {
+                let args_cloned = scalar_args.clone();
+                black_box(func.invoke_with_args(ScalarFunctionArgs {
+                    args: args_cloned,
+                    arg_fields: vec![Field::new("a", DataType::Utf8, true).into()],
+                    number_rows: 1,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/date_bin.rs b/datafusion/functions/benches/date_bin.rs
index ac766a002576c..28dee96987261 100644
--- a/datafusion/functions/benches/date_bin.rs
+++ b/datafusion/functions/benches/date_bin.rs
@@ -15,19 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{Array, ArrayRef, TimestampSecondArray};
 use arrow::datatypes::Field;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
-use rand::rngs::ThreadRng;
-use rand::Rng;
-
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::datetime::date_bin;
+use rand::Rng;
+use rand::rngs::ThreadRng;
 
 fn timestamps(rng: &mut ThreadRng) -> TimestampSecondArray {
     let mut seconds = vec![];
@@ -55,6 +54,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             Field::new("a", interval.data_type(), true).into(),
             Field::new("b", timestamps.data_type(), true).into(),
         ];
+        let config_options = Arc::new(ConfigOptions::default());
+
         b.iter(|| {
             black_box(
                 udf.invoke_with_args(ScalarFunctionArgs {
@@ -62,6 +63,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: batch_len,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::clone(&config_options),
                 })
                 .expect("date_bin should work on valid values"),
             )
diff --git a/datafusion/functions/benches/date_trunc.rs b/datafusion/functions/benches/date_trunc.rs
index ad4d0d0fbb796..0668a1cc5085c 100644
--- a/datafusion/functions/benches/date_trunc.rs
+++ b/datafusion/functions/benches/date_trunc.rs
@@ -15,19 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{Array, ArrayRef, TimestampSecondArray};
 use arrow::datatypes::Field;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
-use rand::rngs::ThreadRng;
-use rand::Rng;
-
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs};
 use datafusion_functions::datetime::date_trunc;
+use rand::Rng;
+use rand::rngs::ThreadRng;
 
 fn timestamps(rng: &mut ThreadRng) -> TimestampSecondArray {
     let mut seconds = vec![];
@@ -56,10 +55,15 @@ fn criterion_benchmark(c: &mut Criterion) {
             })
             .collect::<Vec<_>>();
 
-        let return_type = udf
-            .return_type(&args.iter().map(|arg| arg.data_type()).collect::<Vec<_>>())
+        let scalar_arguments = vec![None; arg_fields.len()];
+        let return_field = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &arg_fields,
+                scalar_arguments: &scalar_arguments,
+            })
             .unwrap();
-        let return_field = Arc::new(Field::new("f", return_type, true));
+        let config_options = Arc::new(ConfigOptions::default());
+
         b.iter(|| {
             black_box(
                 udf.invoke_with_args(ScalarFunctionArgs {
@@ -67,6 +71,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: batch_len,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::clone(&config_options),
                 })
                 .expect("date_trunc should work on valid values"),
             )
diff --git a/datafusion/functions/benches/encoding.rs b/datafusion/functions/benches/encoding.rs
index 830e0324766f7..0b8f0c5c51a58 100644
--- a/datafusion/functions/benches/encoding.rs
+++ b/datafusion/functions/benches/encoding.rs
@@ -15,32 +15,37 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::Array;
 use arrow::datatypes::{DataType, Field};
-use arrow::util::bench_util::create_string_array_with_len;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::util::bench_util::create_binary_array;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::encoding;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let decode = encoding::decode();
+    let config_options = Arc::new(ConfigOptions::default());
+
     for size in [1024, 4096, 8192] {
-        let str_array = Arc::new(create_string_array_with_len::<i32>(size, 0.2, 32));
+        let bin_array = Arc::new(create_binary_array::<i32>(size, 0.2));
         c.bench_function(&format!("base64_decode/{size}"), |b| {
             let method = ColumnarValue::Scalar("base64".into());
             let encoded = encoding::encode()
                 .invoke_with_args(ScalarFunctionArgs {
-                    args: vec![ColumnarValue::Array(str_array.clone()), method.clone()],
+                    args: vec![ColumnarValue::Array(bin_array.clone()), method.clone()],
                     arg_fields: vec![
-                        Field::new("a", str_array.data_type().to_owned(), true).into(),
+                        Field::new("a", bin_array.data_type().to_owned(), true).into(),
                         Field::new("b", method.data_type().to_owned(), true).into(),
                     ],
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
+                .unwrap()
+                .cast_to(&DataType::Binary, None)
                 .unwrap();
 
             let arg_fields = vec![
@@ -56,7 +61,8 @@ fn criterion_benchmark(c: &mut Criterion) {
                             args: args.clone(),
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
-                            return_field: Field::new("f", DataType::Utf8, true).into(),
+                            return_field: Field::new("f", DataType::Binary, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
@@ -66,23 +72,26 @@ fn criterion_benchmark(c: &mut Criterion) {
         c.bench_function(&format!("hex_decode/{size}"), |b| {
             let method = ColumnarValue::Scalar("hex".into());
             let arg_fields = vec![
-                Field::new("a", str_array.data_type().to_owned(), true).into(),
+                Field::new("a", bin_array.data_type().to_owned(), true).into(),
                 Field::new("b", method.data_type().to_owned(), true).into(),
             ];
             let encoded = encoding::encode()
                 .invoke_with_args(ScalarFunctionArgs {
-                    args: vec![ColumnarValue::Array(str_array.clone()), method.clone()],
+                    args: vec![ColumnarValue::Array(bin_array.clone()), method.clone()],
                     arg_fields,
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
+                .unwrap()
+                .cast_to(&DataType::Binary, None)
                 .unwrap();
 
             let arg_fields = vec![
                 Field::new("a", encoded.data_type().to_owned(), true).into(),
                 Field::new("b", method.data_type().to_owned(), true).into(),
             ];
-            let return_field = Field::new("f", DataType::Utf8, true).into();
+            let return_field = Field::new("f", DataType::Binary, true).into();
             let args = vec![encoded, method];
 
             b.iter(|| {
@@ -93,6 +102,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
diff --git a/datafusion/functions/benches/ends_with.rs b/datafusion/functions/benches/ends_with.rs
new file mode 100644
index 0000000000000..474e8a1555cf2
--- /dev/null
+++ b/datafusion/functions/benches/ends_with.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use rand::distr::Alphanumeric;
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Generate a StringArray/StringViewArray with random ASCII strings
+fn gen_string_array(
+    n_rows: usize,
+    str_len: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let mut rng = StdRng::seed_from_u64(42);
+    let strings: Vec<Option<String>> = (0..n_rows)
+        .map(|_| {
+            let s: String = (&mut rng)
+                .sample_iter(&Alphanumeric)
+                .take(str_len)
+                .map(char::from)
+                .collect();
+            Some(s)
+        })
+        .collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+/// Generate a scalar suffix string
+fn gen_scalar_suffix(suffix_str: &str, is_string_view: bool) -> ColumnarValue {
+    if is_string_view {
+        ColumnarValue::Scalar(ScalarValue::Utf8View(Some(suffix_str.to_string())))
+    } else {
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(suffix_str.to_string())))
+    }
+}
+
+/// Generate an array of suffix strings (same string repeated)
+fn gen_array_suffix(
+    suffix_str: &str,
+    n_rows: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let strings: Vec<Option<String>> =
+        (0..n_rows).map(|_| Some(suffix_str.to_string())).collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let ends_with = datafusion_functions::string::ends_with();
+    let n_rows = 8192;
+    let str_len = 128;
+    let suffix_str = "xyz"; // A pattern that likely won't match
+
+    // Benchmark: StringArray with scalar suffix (the optimized path)
+    let str_array = gen_string_array(n_rows, str_len, false);
+    let scalar_suffix = gen_scalar_suffix(suffix_str, false);
+    let arg_fields = vec![
+        Field::new("a", DataType::Utf8, true).into(),
+        Field::new("b", DataType::Utf8, true).into(),
+    ];
+    let return_field = Field::new("f", DataType::Boolean, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("ends_with_StringArray_scalar_suffix", |b| {
+        b.iter(|| {
+            black_box(ends_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), scalar_suffix.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringArray with array suffix (for comparison)
+    let array_suffix = gen_array_suffix(suffix_str, n_rows, false);
+    c.bench_function("ends_with_StringArray_array_suffix", |b| {
+        b.iter(|| {
+            black_box(ends_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), array_suffix.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with scalar suffix (the optimized path)
+    let str_view_array = gen_string_array(n_rows, str_len, true);
+    let scalar_suffix_view = gen_scalar_suffix(suffix_str, true);
+    let arg_fields_view = vec![
+        Field::new("a", DataType::Utf8View, true).into(),
+        Field::new("b", DataType::Utf8View, true).into(),
+    ];
+
+    c.bench_function("ends_with_StringViewArray_scalar_suffix", |b| {
+        b.iter(|| {
+            black_box(ends_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), scalar_suffix_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with array suffix (for comparison)
+    let array_suffix_view = gen_array_suffix(suffix_str, n_rows, true);
+    c.bench_function("ends_with_StringViewArray_array_suffix", |b| {
+        b.iter(|| {
+            black_box(ends_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), array_suffix_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark different string lengths with scalar suffix
+    for str_len in [8, 32, 128, 512] {
+        let str_array = gen_string_array(n_rows, str_len, true);
+        let scalar_suffix = gen_scalar_suffix(suffix_str, true);
+        let arg_fields = vec![
+            Field::new("a", DataType::Utf8View, true).into(),
+            Field::new("b", DataType::Utf8View, true).into(),
+        ];
+
+        c.bench_function(
+            &format!("ends_with_StringViewArray_scalar_strlen_{str_len}"),
+            |b| {
+                b.iter(|| {
+                    black_box(ends_with.invoke_with_args(ScalarFunctionArgs {
+                        args: vec![str_array.clone(), scalar_suffix.clone()],
+                        arg_fields: arg_fields.clone(),
+                        number_rows: n_rows,
+                        return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/factorial.rs b/datafusion/functions/benches/factorial.rs
new file mode 100644
index 0000000000000..c441b50c288c3
--- /dev/null
+++ b/datafusion/functions/benches/factorial.rs
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::Int64Array;
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::ScalarFunctionArgs;
+use datafusion_expr_common::columnar_value::ColumnarValue;
+use datafusion_functions::math::factorial;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let factorial = factorial();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    let arr_args = vec![ColumnarValue::Array(Arc::new(Int64Array::from_iter(
+        (0..1024).map(|i| Some(i % 21)),
+    )))];
+    c.bench_function(&format!("{}_array", factorial.name()), |b| {
+        b.iter(|| {
+            let args_cloned = arr_args.clone();
+            black_box(factorial.invoke_with_args(ScalarFunctionArgs {
+                args: args_cloned,
+                arg_fields: vec![Field::new("a", DataType::Utf8, true).into()],
+                number_rows: arr_args.len(),
+                return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    let scalar_args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some(20)))];
+    c.bench_function(&format!("{}_scalar", factorial.name()), |b| {
+        b.iter(|| {
+            let args_cloned = scalar_args.clone();
+            black_box(factorial.invoke_with_args(ScalarFunctionArgs {
+                args: args_cloned,
+                arg_fields: vec![Field::new("a", DataType::Utf8, true).into()],
+                number_rows: 1,
+                return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/find_in_set.rs b/datafusion/functions/benches/find_in_set.rs
index bad540f049e28..9ee20ecd14fdf 100644
--- a/datafusion/functions/benches/find_in_set.rs
+++ b/datafusion/functions/benches/find_in_set.rs
@@ -15,19 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{StringArray, StringViewArray};
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use rand::distr::Alphanumeric;
 use rand::prelude::StdRng;
 use rand::{Rng, SeedableRng};
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -165,6 +165,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: n_rows,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
                 }))
             })
         });
@@ -182,6 +183,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: n_rows,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
                 }))
             })
         });
@@ -203,6 +205,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: n_rows,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
                 }))
             })
         });
@@ -213,6 +216,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             .map(|arg| Field::new("a", arg.data_type().clone(), true).into())
             .collect::<Vec<_>>();
         let return_field = Arc::new(Field::new("f", DataType::Int32, true));
+        let config_options = Arc::new(ConfigOptions::default());
+
         group.bench_function(format!("string_view_len_{str_len}"), |b| {
             b.iter(|| {
                 black_box(find_in_set.invoke_with_args(ScalarFunctionArgs {
@@ -220,6 +225,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: n_rows,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
diff --git a/datafusion/functions/benches/floor_ceil.rs b/datafusion/functions/benches/floor_ceil.rs
new file mode 100644
index 0000000000000..dc095e0152c4d
--- /dev/null
+++ b/datafusion/functions/benches/floor_ceil.rs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field, Float64Type};
+use arrow::util::bench_util::create_primitive_array;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::math::{ceil, floor};
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let floor_fn = floor();
+    let ceil_fn = ceil();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    for size in [1024, 4096, 8192] {
+        let mut group = c.benchmark_group(format!("floor_ceil size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Float64 array benchmark
+        let f64_array = Arc::new(create_primitive_array::<Float64Type>(size, 0.1));
+        let batch_len = f64_array.len();
+        let f64_args = vec![ColumnarValue::Array(f64_array)];
+
+        group.bench_function("floor_f64_array", |b| {
+            b.iter(|| {
+                let args_cloned = f64_args.clone();
+                black_box(
+                    floor_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, true).into(),
+                            ],
+                            number_rows: batch_len,
+                            return_field: Field::new("f", DataType::Float64, true).into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        group.bench_function("ceil_f64_array", |b| {
+            b.iter(|| {
+                let args_cloned = f64_args.clone();
+                black_box(
+                    ceil_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, true).into(),
+                            ],
+                            number_rows: batch_len,
+                            return_field: Field::new("f", DataType::Float64, true).into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        // Scalar benchmark (the optimization we added)
+        let scalar_args = vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(
+            std::f64::consts::PI,
+        )))];
+
+        group.bench_function("floor_f64_scalar", |b| {
+            b.iter(|| {
+                let args_cloned = scalar_args.clone();
+                black_box(
+                    floor_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, false).into(),
+                            ],
+                            number_rows: 1,
+                            return_field: Field::new("f", DataType::Float64, false)
+                                .into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        group.bench_function("ceil_f64_scalar", |b| {
+            b.iter(|| {
+                let args_cloned = scalar_args.clone();
+                black_box(
+                    ceil_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, false).into(),
+                            ],
+                            number_rows: 1,
+                            return_field: Field::new("f", DataType::Float64, false)
+                                .into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/gcd.rs b/datafusion/functions/benches/gcd.rs
index f700d31123a9d..3c72a46e6643d 100644
--- a/datafusion/functions/benches/gcd.rs
+++ b/datafusion/functions/benches/gcd.rs
@@ -15,18 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::Field;
 use arrow::{
     array::{ArrayRef, Int64Array},
     datatypes::DataType,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::gcd;
 use rand::Rng;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn generate_i64_array(n_rows: usize) -> ArrayRef {
@@ -42,6 +42,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     let array_a = ColumnarValue::Array(generate_i64_array(n_rows));
     let array_b = ColumnarValue::Array(generate_i64_array(n_rows));
     let udf = gcd();
+    let config_options = Arc::new(ConfigOptions::default());
 
     c.bench_function("gcd both array", |b| {
         b.iter(|| {
@@ -54,6 +55,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     ],
                     number_rows: 0,
                     return_field: Field::new("f", DataType::Int64, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
                 .expect("date_bin should work on valid values"),
             )
@@ -74,6 +76,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     ],
                     number_rows: 0,
                     return_field: Field::new("f", DataType::Int64, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
                 .expect("date_bin should work on valid values"),
             )
@@ -94,6 +97,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     ],
                     number_rows: 0,
                     return_field: Field::new("f", DataType::Int64, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
                 .expect("date_bin should work on valid values"),
             )
diff --git a/datafusion/functions/benches/helper.rs b/datafusion/functions/benches/helper.rs
index a2b110ae4d63b..d6d6afd48f2ca 100644
--- a/datafusion/functions/benches/helper.rs
+++ b/datafusion/functions/benches/helper.rs
@@ -18,7 +18,7 @@
 use arrow::array::{StringArray, StringViewArray};
 use datafusion_expr::ColumnarValue;
 use rand::distr::Alphanumeric;
-use rand::{rngs::StdRng, Rng, SeedableRng};
+use rand::{Rng, SeedableRng, rngs::StdRng};
 use std::sync::Arc;
 
 /// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
diff --git a/datafusion/functions/benches/initcap.rs b/datafusion/functions/benches/initcap.rs
index f89b11dff8fbe..b5e653e4136a3 100644
--- a/datafusion/functions/benches/initcap.rs
+++ b/datafusion/functions/benches/initcap.rs
@@ -15,17 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
-use arrow::array::OffsetSizeTrait;
+use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray, StringViewBuilder};
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::unicode;
+use std::hint::black_box;
 use std::sync::Arc;
+use std::time::Duration;
 
 fn create_args<O: OffsetSizeTrait>(
     size: usize,
@@ -45,58 +47,161 @@ fn create_args<O: OffsetSizeTrait>(
     }
 }
 
+/// Create a Utf8 array where every value contains non-ASCII Unicode text.
+fn create_unicode_utf8_args(size: usize) -> Vec<ColumnarValue> {
+    let array = Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
+        "ñAnDÚ ÁrBOL ОлЕГ ÍslENsku",
+        size,
+    ))) as ArrayRef;
+    vec![ColumnarValue::Array(array)]
+}
+
+/// Create a Utf8View array where every value contains non-ASCII Unicode text.
+fn create_unicode_utf8view_args(size: usize) -> Vec<ColumnarValue> {
+    let mut builder = StringViewBuilder::with_capacity(size);
+    for _ in 0..size {
+        builder.append_value("ñAnDÚ ÁrBOL ОлЕГ ÍslENsku");
+    }
+    let array = Arc::new(builder.finish()) as ArrayRef;
+    vec![ColumnarValue::Array(array)]
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
     let initcap = unicode::initcap();
-    for size in [1024, 4096] {
-        let args = create_args::<i32>(size, 8, true);
-        let arg_fields = args
-            .iter()
-            .enumerate()
-            .map(|(idx, arg)| {
-                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
-            })
-            .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Array benchmarks: vary both row count and string length
+    for size in [1024, 4096, 8192] {
+        for str_len in [16, 128] {
+            let mut group =
+                c.benchmark_group(format!("initcap size={size} str_len={str_len}"));
+            group.sampling_mode(SamplingMode::Flat);
+            group.sample_size(10);
+            group.measurement_time(Duration::from_secs(10));
+
+            // Utf8
+            let array_args = create_args::<i32>(size, str_len, false);
+            let array_arg_fields = vec![Field::new("arg_0", DataType::Utf8, true).into()];
 
-        c.bench_function(
-            format!("initcap string view shorter than 12 [size={size}]").as_str(),
-            |b| {
+            group.bench_function("array_utf8", |b| {
                 b.iter(|| {
                     black_box(initcap.invoke_with_args(ScalarFunctionArgs {
-                        args: args.clone(),
-                        arg_fields: arg_fields.clone(),
+                        args: array_args.clone(),
+                        arg_fields: array_arg_fields.clone(),
                         number_rows: size,
-                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
-            },
-        );
+            });
+
+            // Utf8View
+            let array_view_args = create_args::<i32>(size, str_len, true);
+            let array_view_arg_fields =
+                vec![Field::new("arg_0", DataType::Utf8View, true).into()];
 
-        let args = create_args::<i32>(size, 16, true);
-        c.bench_function(
-            format!("initcap string view longer than 12 [size={size}]").as_str(),
-            |b| {
+            group.bench_function("array_utf8view", |b| {
                 b.iter(|| {
                     black_box(initcap.invoke_with_args(ScalarFunctionArgs {
-                        args: args.clone(),
-                        arg_fields: arg_fields.clone(),
+                        args: array_view_args.clone(),
+                        arg_fields: array_view_arg_fields.clone(),
                         number_rows: size,
                         return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
-            },
-        );
+            });
 
-        let args = create_args::<i32>(size, 16, false);
-        c.bench_function(format!("initcap string [size={size}]").as_str(), |b| {
+            group.finish();
+        }
+    }
+
+    // Unicode array benchmarks
+    for size in [1024, 4096, 8192] {
+        let mut group = c.benchmark_group(format!("initcap unicode size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        let unicode_args = create_unicode_utf8_args(size);
+        let unicode_arg_fields = vec![Field::new("arg_0", DataType::Utf8, true).into()];
+
+        group.bench_function("array_utf8", |b| {
             b.iter(|| {
                 black_box(initcap.invoke_with_args(ScalarFunctionArgs {
-                    args: args.clone(),
-                    arg_fields: arg_fields.clone(),
+                    args: unicode_args.clone(),
+                    arg_fields: unicode_arg_fields.clone(),
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
+
+        let unicode_view_args = create_unicode_utf8view_args(size);
+        let unicode_view_arg_fields =
+            vec![Field::new("arg_0", DataType::Utf8View, true).into()];
+
+        group.bench_function("array_utf8view", |b| {
+            b.iter(|| {
+                black_box(initcap.invoke_with_args(ScalarFunctionArgs {
+                    args: unicode_view_args.clone(),
+                    arg_fields: unicode_view_arg_fields.clone(),
+                    number_rows: size,
+                    return_field: Field::new("f", DataType::Utf8View, true).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        });
+
+        group.finish();
+    }
+
+    // Scalar benchmarks: independent of array size, run once
+    {
+        let mut group = c.benchmark_group("initcap scalar");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Utf8
+        let scalar_args = vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+            "hello world test string".to_string(),
+        )))];
+        let scalar_arg_fields = vec![Field::new("arg_0", DataType::Utf8, false).into()];
+
+        group.bench_function("scalar_utf8", |b| {
+            b.iter(|| {
+                black_box(initcap.invoke_with_args(ScalarFunctionArgs {
+                    args: scalar_args.clone(),
+                    arg_fields: scalar_arg_fields.clone(),
+                    number_rows: 1,
+                    return_field: Field::new("f", DataType::Utf8, false).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        });
+
+        // Utf8View
+        let scalar_view_args = vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+            "hello world test string".to_string(),
+        )))];
+        let scalar_view_arg_fields =
+            vec![Field::new("arg_0", DataType::Utf8View, false).into()];
+
+        group.bench_function("scalar_utf8view", |b| {
+            b.iter(|| {
+                black_box(initcap.invoke_with_args(ScalarFunctionArgs {
+                    args: scalar_view_args.clone(),
+                    arg_fields: scalar_view_arg_fields.clone(),
+                    number_rows: 1,
+                    return_field: Field::new("f", DataType::Utf8View, false).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        });
+
+        group.finish();
     }
 }
 
diff --git a/datafusion/functions/benches/isnan.rs b/datafusion/functions/benches/isnan.rs
index 49d0a9e326dd7..e353b9d27a0a1 100644
--- a/datafusion/functions/benches/isnan.rs
+++ b/datafusion/functions/benches/isnan.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
 use arrow::{
     datatypes::{Float32Type, Float64Type},
     util::bench_util::create_primitive_array,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::isnan;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
@@ -39,6 +39,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                 Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
             .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
 
         c.bench_function(&format!("isnan f32 array: {size}"), |b| {
             b.iter(|| {
@@ -49,6 +50,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Field::new("f", DataType::Boolean, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
@@ -72,6 +74,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Field::new("f", DataType::Boolean, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
diff --git a/datafusion/functions/benches/iszero.rs b/datafusion/functions/benches/iszero.rs
index 6d1d34c7a8320..c6d0aed4c615c 100644
--- a/datafusion/functions/benches/iszero.rs
+++ b/datafusion/functions/benches/iszero.rs
@@ -15,20 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
 use arrow::{
     datatypes::{Float32Type, Float64Type},
     util::bench_util::create_primitive_array,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::iszero;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let iszero = iszero();
+    let config_options = Arc::new(ConfigOptions::default());
+
     for size in [1024, 4096, 8192] {
         let f32_array = Arc::new(create_primitive_array::<Float32Type>(size, 0.2));
         let batch_len = f32_array.len();
@@ -51,11 +54,13 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: batch_len,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
             })
         });
+
         let f64_array = Arc::new(create_primitive_array::<Float64Type>(size, 0.2));
         let batch_len = f64_array.len();
         let f64_args = vec![ColumnarValue::Array(f64_array)];
@@ -77,12 +82,53 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: batch_len,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
             })
         });
     }
+
+    // Scalar benchmarks - run once since size doesn't affect scalar performance
+    let scalar_f32_args = vec![ColumnarValue::Scalar(ScalarValue::Float32(Some(1.0)))];
+    let scalar_f32_arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
+    let return_field_scalar = Arc::new(Field::new("f", DataType::Boolean, false));
+
+    c.bench_function("iszero f32 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                iszero
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f32_args.clone(),
+                        arg_fields: scalar_f32_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_scalar),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+
+    let scalar_f64_args = vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(1.0)))];
+    let scalar_f64_arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+
+    c.bench_function("iszero f64 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                iszero
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f64_args.clone(),
+                        arg_fields: scalar_f64_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field_scalar),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/left_right.rs b/datafusion/functions/benches/left_right.rs
new file mode 100644
index 0000000000000..59f8d8a75f74c
--- /dev/null
+++ b/datafusion/functions/benches/left_right.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::hint::black_box;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array};
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::{
+    create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::unicode::{left, right};
+
+fn create_args(
+    size: usize,
+    str_len: usize,
+    use_negative: bool,
+    is_string_view: bool,
+) -> Vec<ColumnarValue> {
+    let string_arg = if is_string_view {
+        ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
+            size, 0.1, str_len, true,
+        )))
+    } else {
+        ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
+            size, 0.1, str_len,
+        )))
+    };
+
+    // For negative n, we want to trigger the double-iteration code path
+    let n_values: Vec<i64> = if use_negative {
+        (0..size).map(|i| -((i % 10 + 1) as i64)).collect()
+    } else {
+        (0..size).map(|i| (i % 10 + 1) as i64).collect()
+    };
+    let n_array = Arc::new(Int64Array::from(n_values));
+
+    vec![
+        string_arg,
+        ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
+    ]
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let left_function = left();
+    let right_function = right();
+
+    for function in [left_function, right_function] {
+        for is_string_view in [false, true] {
+            for is_negative in [false, true] {
+                for size in [1024, 4096] {
+                    let function_name = function.name();
+                    let mut group =
+                        c.benchmark_group(format!("{function_name} size={size}"));
+
+                    let bench_name = format!(
+                        "{} {} n",
+                        if is_string_view {
+                            "string_view_array"
+                        } else {
+                            "string_array"
+                        },
+                        if is_negative { "negative" } else { "positive" },
+                    );
+                    let return_type = if is_string_view {
+                        DataType::Utf8View
+                    } else {
+                        DataType::Utf8
+                    };
+
+                    let args = create_args(size, 32, is_negative, is_string_view);
+                    group.bench_function(BenchmarkId::new(bench_name, size), |b| {
+                        let arg_fields = args
+                            .iter()
+                            .enumerate()
+                            .map(|(idx, arg)| {
+                                Field::new(format!("arg_{idx}"), arg.data_type(), true)
+                                    .into()
+                            })
+                            .collect::<Vec<_>>();
+                        let config_options = Arc::new(ConfigOptions::default());
+
+                        b.iter(|| {
+                            black_box(
+                                function
+                                    .invoke_with_args(ScalarFunctionArgs {
+                                        args: args.clone(),
+                                        arg_fields: arg_fields.clone(),
+                                        number_rows: size,
+                                        return_field: Field::new(
+                                            "f",
+                                            return_type.clone(),
+                                            true,
+                                        )
+                                        .into(),
+                                        config_options: Arc::clone(&config_options),
+                                    })
+                                    .expect("should work"),
+                            )
+                        })
+                    });
+
+                    group.finish();
+                }
+            }
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/levenshtein.rs b/datafusion/functions/benches/levenshtein.rs
new file mode 100644
index 0000000000000..08733b245ffb4
--- /dev/null
+++ b/datafusion/functions/benches/levenshtein.rs
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::string;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+    let string1_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    let string2_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    vec![
+        ColumnarValue::Array(string1_array),
+        ColumnarValue::Array(string2_array),
+    ]
+}
+
+fn invoke_levenshtein_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    string::levenshtein().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int32, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("levenshtein size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        for str_len in [8, 32] {
+            let args = create_args::<i32>(size, str_len);
+            group.bench_function(
+                format!("levenshtein_string [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_levenshtein_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/lower.rs b/datafusion/functions/benches/lower.rs
index cdf1529c108c0..6dbc8dcb7d148 100644
--- a/datafusion/functions/benches/lower.rs
+++ b/datafusion/functions/benches/lower.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{ArrayRef, StringArray, StringViewBuilder};
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string;
+use std::hint::black_box;
 use std::sync::Arc;
 
 /// Create an array of args containing a StringArray, where all the values in the
@@ -122,6 +122,8 @@ fn create_args5(
 
 fn criterion_benchmark(c: &mut Criterion) {
     let lower = string::lower();
+    let config_options = Arc::new(ConfigOptions::default());
+
     for size in [1024, 4096, 8192] {
         let args = create_args1(size, 32);
         let arg_fields = args
@@ -140,6 +142,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
@@ -161,6 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
@@ -184,6 +188,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: size,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -217,6 +222,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                                 arg_fields: arg_fields.clone(),
                                 number_rows: size,
                                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                                config_options: Arc::clone(&config_options),
                             }))
                         }),
                     );
@@ -231,6 +237,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                                 arg_fields: arg_fields.clone(),
                                 number_rows: size,
                                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                                config_options: Arc::clone(&config_options),
                             }))
                         }),
                     );
@@ -246,6 +253,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                                 arg_fields: arg_fields.clone(),
                                 number_rows: size,
                                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                                config_options: Arc::clone(&config_options),
                             }))
                         }),
                     );
diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs
deleted file mode 100644
index 7a44f40a689a4..0000000000000
--- a/datafusion/functions/benches/ltrim.rs
+++ /dev/null
@@ -1,251 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-extern crate criterion;
-
-use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
-use arrow::datatypes::{DataType, Field};
-use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
-    Criterion, SamplingMode,
-};
-use datafusion_common::ScalarValue;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
-use datafusion_functions::string;
-use rand::{distr::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
-use std::{fmt, sync::Arc};
-
-#[derive(Clone, Copy)]
-pub enum StringArrayType {
-    Utf8View,
-    Utf8,
-    LargeUtf8,
-}
-
-impl fmt::Display for StringArrayType {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            StringArrayType::Utf8View => f.write_str("string_view"),
-            StringArrayType::Utf8 => f.write_str("string"),
-            StringArrayType::LargeUtf8 => f.write_str("large_string"),
-        }
-    }
-}
-
-/// returns an array of strings, and `characters` as a ScalarValue
-pub fn create_string_array_and_characters(
-    size: usize,
-    characters: &str,
-    trimmed: &str,
-    remaining_len: usize,
-    string_array_type: StringArrayType,
-) -> (ArrayRef, ScalarValue) {
-    let rng = &mut StdRng::seed_from_u64(42);
-
-    // Create `size` rows:
-    //   - 10% rows will be `None`
-    //   - Other 90% will be strings with same `remaining_len` lengths
-    // We will build the string array on it later.
-    let string_iter = (0..size).map(|_| {
-        if rng.random::<f32>() < 0.1 {
-            None
-        } else {
-            let mut value = trimmed.as_bytes().to_vec();
-            let generated = rng.sample_iter(&Alphanumeric).take(remaining_len);
-            value.extend(generated);
-            Some(String::from_utf8(value).unwrap())
-        }
-    });
-
-    // Build the target `string array` and `characters` according to `string_array_type`
-    match string_array_type {
-        StringArrayType::Utf8View => (
-            Arc::new(string_iter.collect::<StringViewArray>()),
-            ScalarValue::Utf8View(Some(characters.to_string())),
-        ),
-        StringArrayType::Utf8 => (
-            Arc::new(string_iter.collect::<StringArray>()),
-            ScalarValue::Utf8(Some(characters.to_string())),
-        ),
-        StringArrayType::LargeUtf8 => (
-            Arc::new(string_iter.collect::<LargeStringArray>()),
-            ScalarValue::LargeUtf8(Some(characters.to_string())),
-        ),
-    }
-}
-
-/// Create args for the ltrim benchmark
-/// Inputs:
-///   - size: rows num of the test array
-///   - characters: the characters we need to trim
-///   - trimmed: the part in the testing string that will be trimmed
-///   - remaining_len: the len of the remaining part of testing string after trimming
-///   - string_array_type: the method used to store the testing strings
-///
-/// Outputs:
-///   - testing string array
-///   - trimmed characters
-///
-fn create_args(
-    size: usize,
-    characters: &str,
-    trimmed: &str,
-    remaining_len: usize,
-    string_array_type: StringArrayType,
-) -> Vec<ColumnarValue> {
-    let (string_array, pattern) = create_string_array_and_characters(
-        size,
-        characters,
-        trimmed,
-        remaining_len,
-        string_array_type,
-    );
-    vec![
-        ColumnarValue::Array(string_array),
-        ColumnarValue::Scalar(pattern),
-    ]
-}
-
-#[allow(clippy::too_many_arguments)]
-fn run_with_string_type<M: Measurement>(
-    group: &mut BenchmarkGroup<'_, M>,
-    ltrim: &ScalarUDF,
-    size: usize,
-    len: usize,
-    characters: &str,
-    trimmed: &str,
-    remaining_len: usize,
-    string_type: StringArrayType,
-) {
-    let args = create_args(size, characters, trimmed, remaining_len, string_type);
-    let arg_fields = args
-        .iter()
-        .enumerate()
-        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
-        .collect::<Vec<_>>();
-    group.bench_function(
-        format!(
-            "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]",
-        ),
-        |b| {
-            b.iter(|| {
-                let args_cloned = args.clone();
-                black_box(ltrim.invoke_with_args(ScalarFunctionArgs {
-                    args: args_cloned,
-                    arg_fields: arg_fields.clone(),
-                    number_rows: size,
-                    return_field: Field::new("f", DataType::Utf8, true).into(),
-                }))
-            })
-        },
-    );
-}
-
-#[allow(clippy::too_many_arguments)]
-fn run_one_group(
-    c: &mut Criterion,
-    group_name: &str,
-    ltrim: &ScalarUDF,
-    string_types: &[StringArrayType],
-    size: usize,
-    len: usize,
-    characters: &str,
-    trimmed: &str,
-    remaining_len: usize,
-) {
-    let mut group = c.benchmark_group(group_name);
-    group.sampling_mode(SamplingMode::Flat);
-    group.sample_size(10);
-
-    for string_type in string_types {
-        run_with_string_type(
-            &mut group,
-            ltrim,
-            size,
-            len,
-            characters,
-            trimmed,
-            remaining_len,
-            *string_type,
-        );
-    }
-
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let ltrim = string::ltrim();
-    let characters = ",!()";
-
-    let string_types = [
-        StringArrayType::Utf8View,
-        StringArrayType::Utf8,
-        StringArrayType::LargeUtf8,
-    ];
-    for size in [1024, 4096, 8192] {
-        // len=12, trimmed_len=4, len_after_ltrim=8
-        let len = 12;
-        let trimmed = characters;
-        let remaining_len = len - trimmed.len();
-        run_one_group(
-            c,
-            "INPUT LEN <= 12",
-            &ltrim,
-            &string_types,
-            size,
-            len,
-            characters,
-            trimmed,
-            remaining_len,
-        );
-
-        // len=64, trimmed_len=4, len_after_ltrim=60
-        let len = 64;
-        let trimmed = characters;
-        let remaining_len = len - trimmed.len();
-        run_one_group(
-            c,
-            "INPUT LEN > 12, OUTPUT LEN > 12",
-            &ltrim,
-            &string_types,
-            size,
-            len,
-            characters,
-            trimmed,
-            remaining_len,
-        );
-
-        // len=64, trimmed_len=56, len_after_ltrim=8
-        let len = 64;
-        let trimmed = characters.repeat(15);
-        let remaining_len = len - trimmed.len();
-        run_one_group(
-            c,
-            "INPUT LEN > 12, OUTPUT LEN <= 12",
-            &ltrim,
-            &string_types,
-            size,
-            len,
-            characters,
-            &trimmed,
-            remaining_len,
-        );
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
diff --git a/datafusion/functions/benches/make_date.rs b/datafusion/functions/benches/make_date.rs
index e1f609fbb35c0..42b5b1019538d 100644
--- a/datafusion/functions/benches/make_date.rs
+++ b/datafusion/functions/benches/make_date.rs
@@ -15,19 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{Array, ArrayRef, Int32Array};
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use rand::rngs::ThreadRng;
-use rand::Rng;
-
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::datetime::make_date;
+use rand::Rng;
+use rand::rngs::ThreadRng;
 
 fn years(rng: &mut ThreadRng) -> Int32Array {
     let mut years = vec![];
@@ -69,6 +68,7 @@ fn criterion_benchmark(c: &mut Criterion) {
             Field::new("a", days.data_type(), true).into(),
         ];
         let return_field = Field::new("f", DataType::Date32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
 
         b.iter(|| {
             black_box(
@@ -78,6 +78,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("make_date should work on valid values"),
             )
@@ -97,6 +98,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             Field::new("a", days.data_type(), true).into(),
         ];
         let return_field = Field::new("f", DataType::Date32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
+
         b.iter(|| {
             black_box(
                 make_date()
@@ -105,6 +108,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("make_date should work on valid values"),
             )
@@ -124,6 +128,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             Field::new("a", days.data_type(), true).into(),
         ];
         let return_field = Field::new("f", DataType::Date32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
+
         b.iter(|| {
             black_box(
                 make_date()
@@ -132,6 +138,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("make_date should work on valid values"),
             )
@@ -148,6 +155,7 @@ fn criterion_benchmark(c: &mut Criterion) {
             Field::new("a", day.data_type(), true).into(),
         ];
         let return_field = Field::new("f", DataType::Date32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
 
         b.iter(|| {
             black_box(
@@ -157,6 +165,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: 1,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("make_date should work on valid values"),
             )
diff --git a/datafusion/functions/benches/nanvl.rs b/datafusion/functions/benches/nanvl.rs
new file mode 100644
index 0000000000000..206eebd81eb81
--- /dev/null
+++ b/datafusion/functions/benches/nanvl.rs
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{ArrayRef, Float32Array, Float64Array};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::math::nanvl;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let nanvl_fn = nanvl();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Scalar benchmarks
+    c.bench_function("nanvl/scalar_f64", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(f64::NAN))),
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(1.0))),
+            ],
+            arg_fields: vec![
+                Field::new("a", DataType::Float64, true).into(),
+                Field::new("b", DataType::Float64, true).into(),
+            ],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+
+        b.iter(|| black_box(nanvl_fn.invoke_with_args(args.clone()).unwrap()))
+    });
+
+    c.bench_function("nanvl/scalar_f32", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Float32(Some(f32::NAN))),
+                ColumnarValue::Scalar(ScalarValue::Float32(Some(1.0))),
+            ],
+            arg_fields: vec![
+                Field::new("a", DataType::Float32, true).into(),
+                Field::new("b", DataType::Float32, true).into(),
+            ],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+
+        b.iter(|| black_box(nanvl_fn.invoke_with_args(args.clone()).unwrap()))
+    });
+
+    // Array benchmarks
+    for size in [1024, 4096, 8192] {
+        let a64: ArrayRef = Arc::new(Float64Array::from(vec![f64::NAN; size]));
+        let b64: ArrayRef = Arc::new(Float64Array::from(vec![1.0; size]));
+        c.bench_function(&format!("nanvl/array_f64/{size}"), |bench| {
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(Arc::clone(&a64)),
+                    ColumnarValue::Array(Arc::clone(&b64)),
+                ],
+                arg_fields: vec![
+                    Field::new("a", DataType::Float64, true).into(),
+                    Field::new("b", DataType::Float64, true).into(),
+                ],
+                number_rows: size,
+                return_field: Field::new("f", DataType::Float64, true).into(),
+                config_options: Arc::clone(&config_options),
+            };
+            bench.iter(|| black_box(nanvl_fn.invoke_with_args(args.clone()).unwrap()))
+        });
+
+        let a32: ArrayRef = Arc::new(Float32Array::from(vec![f32::NAN; size]));
+        let b32: ArrayRef = Arc::new(Float32Array::from(vec![1.0; size]));
+        c.bench_function(&format!("nanvl/array_f32/{size}"), |bench| {
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(Arc::clone(&a32)),
+                    ColumnarValue::Array(Arc::clone(&b32)),
+                ],
+                arg_fields: vec![
+                    Field::new("a", DataType::Float32, true).into(),
+                    Field::new("b", DataType::Float32, true).into(),
+                ],
+                number_rows: size,
+                return_field: Field::new("f", DataType::Float32, true).into(),
+                config_options: Arc::clone(&config_options),
+            };
+            bench.iter(|| black_box(nanvl_fn.invoke_with_args(args.clone()).unwrap()))
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/nullif.rs b/datafusion/functions/benches/nullif.rs
index 4ac977af9d428..f9f063c52d0d4 100644
--- a/datafusion/functions/benches/nullif.rs
+++ b/datafusion/functions/benches/nullif.rs
@@ -15,14 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::create_string_array_with_len;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::core::nullif;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
@@ -40,6 +40,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                 Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
             .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
 
         c.bench_function(&format!("nullif scalar array: {size}"), |b| {
             b.iter(|| {
@@ -50,6 +51,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Field::new("f", DataType::Utf8, true).into(),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
diff --git a/datafusion/functions/benches/pad.rs b/datafusion/functions/benches/pad.rs
index d954ff452ed56..0f856f0fef384 100644
--- a/datafusion/functions/benches/pad.rs
+++ b/datafusion/functions/benches/pad.rs
@@ -15,18 +15,68 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
+use arrow::array::{
+    ArrowPrimitiveType, GenericStringBuilder, OffsetSizeTrait, PrimitiveArray,
+    StringViewBuilder,
+};
 use arrow::datatypes::{DataType, Field, Int64Type};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
-use datafusion_common::DataFusionError;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
-use datafusion_functions::unicode::{lpad, rpad};
-use rand::distr::{Distribution, Uniform};
+use datafusion_functions::unicode;
 use rand::Rng;
+use rand::distr::{Distribution, Uniform};
+use std::hint::black_box;
 use std::sync::Arc;
+use std::time::Duration;
+
+const UNICODE_STRINGS: &[&str] = &[
+    "Ñandú",
+    "Íslensku",
+    "Þjóðarinnar",
+    "Ελληνική",
+    "Иванович",
+    "データフュージョン",
+    "José García",
+    "Ölçü bïrïmï",
+    "Ÿéšṱëṟḏàÿ",
+    "Ährenstraße",
+];
+
+fn create_unicode_string_array<O: OffsetSizeTrait>(
+    size: usize,
+    null_density: f32,
+) -> arrow::array::GenericStringArray<O> {
+    let mut rng = rand::rng();
+    let mut builder = GenericStringBuilder::<O>::new();
+    for i in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
+        }
+    }
+    builder.finish()
+}
+
+fn create_unicode_string_view_array(
+    size: usize,
+    null_density: f32,
+) -> arrow::array::StringViewArray {
+    let mut rng = rand::rng();
+    let mut builder = StringViewBuilder::with_capacity(size);
+    for i in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
+        }
+    }
+    builder.finish()
+}
 
 struct Filter<Dist> {
     dist: Dist,
@@ -65,125 +115,392 @@ where
         .collect()
 }
 
-fn create_args<O: OffsetSizeTrait>(
+/// Create args for pad benchmark with Unicode strings
+fn create_unicode_pad_args(
     size: usize,
-    str_len: usize,
-    force_view_types: bool,
+    target_len: usize,
+    use_string_view: bool,
 ) -> Vec<ColumnarValue> {
-    let length_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.0, str_len));
-
-    if !force_view_types {
-        let string_array =
-            Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
-        let fill_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    let length_array =
+        Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));
 
+    if use_string_view {
+        let string_array = create_unicode_string_view_array(size, 0.1);
+        let fill_array = create_unicode_string_view_array(size, 0.1);
         vec![
-            ColumnarValue::Array(string_array),
-            ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
-            ColumnarValue::Array(fill_array),
+            ColumnarValue::Array(Arc::new(string_array)),
+            ColumnarValue::Array(length_array),
+            ColumnarValue::Array(Arc::new(fill_array)),
         ]
     } else {
-        let string_array =
-            Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
-        let fill_array =
-            Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
-
+        let string_array = create_unicode_string_array::<i32>(size, 0.1);
+        let fill_array = create_unicode_string_array::<i32>(size, 0.1);
         vec![
-            ColumnarValue::Array(string_array),
-            ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
-            ColumnarValue::Array(fill_array),
+            ColumnarValue::Array(Arc::new(string_array)),
+            ColumnarValue::Array(length_array),
+            ColumnarValue::Array(Arc::new(fill_array)),
         ]
     }
 }
 
-fn invoke_pad_with_args(
-    args: Vec<ColumnarValue>,
-    number_rows: usize,
-    left_pad: bool,
-) -> Result<ColumnarValue, DataFusionError> {
-    let arg_fields = args
-        .iter()
-        .enumerate()
-        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
-        .collect::<Vec<_>>();
-
-    let scalar_args = ScalarFunctionArgs {
-        args: args.clone(),
-        arg_fields,
-        number_rows,
-        return_field: Field::new("f", DataType::Utf8, true).into(),
-    };
+/// Create args for pad benchmark
+fn create_pad_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    target_len: usize,
+    use_string_view: bool,
+) -> Vec<ColumnarValue> {
+    let length_array =
+        Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));
 
-    if left_pad {
-        lpad().invoke_with_args(scalar_args)
+    if use_string_view {
+        let string_array = create_string_view_array_with_len(size, 0.1, str_len, false);
+        let fill_array = create_string_view_array_with_len(size, 0.1, str_len, false);
+        vec![
+            ColumnarValue::Array(Arc::new(string_array)),
+            ColumnarValue::Array(length_array),
+            ColumnarValue::Array(Arc::new(fill_array)),
+        ]
     } else {
-        rpad().invoke_with_args(scalar_args)
+        let string_array = create_string_array_with_len::<O>(size, 0.1, str_len);
+        let fill_array = create_string_array_with_len::<O>(size, 0.1, str_len);
+        vec![
+            ColumnarValue::Array(Arc::new(string_array)),
+            ColumnarValue::Array(length_array),
+            ColumnarValue::Array(Arc::new(fill_array)),
+        ]
     }
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
-    for size in [1024, 2048] {
-        let mut group = c.benchmark_group("lpad function");
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("lpad size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
 
-        let args = create_args::<i32>(size, 32, false);
+        // Utf8 type
+        let args = create_pad_args::<i32>(size, 5, 20, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
 
-        group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, true).unwrap(),
-                )
+        group.bench_function(
+            format!("lpad utf8 [size={size}, str_len=5, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type
+        let args = create_pad_args::<i32>(size, 5, 20, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("lpad stringview [size={size}, str_len=5, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // Utf8 type with longer strings
+        let args = create_pad_args::<i32>(size, 20, 50, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("lpad utf8 [size={size}, str_len=20, target=50]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type with longer strings
+        let args = create_pad_args::<i32>(size, 20, 50, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
-
-        let args = create_args::<i64>(size, 32, false);
-        group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, true).unwrap(),
-                )
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("lpad stringview [size={size}, str_len=20, target=50]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // Utf8 type with Unicode strings
+        let args = create_unicode_pad_args(size, 20, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
-
-        let args = create_args::<i32>(size, 32, true);
-        group.bench_function(BenchmarkId::new("stringview type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, true).unwrap(),
-                )
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("lpad utf8 unicode [size={size}, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type with Unicode strings
+        let args = create_unicode_pad_args(size, 20, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("lpad stringview unicode [size={size}, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
 
         group.finish();
+    }
 
-        let mut group = c.benchmark_group("rpad function");
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("rpad size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
 
-        let args = create_args::<i32>(size, 32, false);
-        group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, false).unwrap(),
-                )
+        // Utf8 type
+        let args = create_pad_args::<i32>(size, 5, 20, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
-
-        let args = create_args::<i64>(size, 32, false);
-        group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, false).unwrap(),
-                )
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
+
+        group.bench_function(
+            format!("rpad utf8 [size={size}, str_len=5, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type
+        let args = create_pad_args::<i32>(size, 5, 20, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
-
-        // rpad for stringview type
-        let args = create_args::<i32>(size, 32, true);
-        group.bench_function(BenchmarkId::new("stringview type", size), |b| {
-            b.iter(|| {
-                criterion::black_box(
-                    invoke_pad_with_args(args.clone(), size, false).unwrap(),
-                )
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("rpad stringview [size={size}, str_len=5, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // Utf8 type with longer strings
+        let args = create_pad_args::<i32>(size, 20, 50, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
             })
-        });
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("rpad utf8 [size={size}, str_len=20, target=50]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type with longer strings
+        let args = create_pad_args::<i32>(size, 20, 50, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("rpad stringview [size={size}, str_len=20, target=50]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // Utf8 type with Unicode strings
+        let args = create_unicode_pad_args(size, 20, false);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("rpad utf8 unicode [size={size}, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+
+        // StringView type with Unicode strings
+        let args = create_unicode_pad_args(size, 20, true);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+
+        group.bench_function(
+            format!("rpad stringview unicode [size={size}, target=20]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8View, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
 
         group.finish();
     }
diff --git a/datafusion/functions/benches/random.rs b/datafusion/functions/benches/random.rs
index dc1e280b93b13..71ded120eb515 100644
--- a/datafusion/functions/benches/random.rs
+++ b/datafusion/functions/benches/random.rs
@@ -15,18 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
 use datafusion_functions::math::random::RandomFunc;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let random_func = RandomFunc::new();
-
     let return_field = Field::new("f", DataType::Float64, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+
     // Benchmark to evaluate 1M rows in batch size 8192
     let iterations = 1_000_000 / 8192; // Calculate how many iterations are needed to reach approximately 1M rows
     c.bench_function("random_1M_rows_batch_8192", |b| {
@@ -39,6 +40,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: vec![],
                             number_rows: 8192,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 );
@@ -59,6 +61,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: vec![],
                             number_rows: 128,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 );
diff --git a/datafusion/functions/benches/regexp_count.rs b/datafusion/functions/benches/regexp_count.rs
new file mode 100644
index 0000000000000..bce76c05585b9
--- /dev/null
+++ b/datafusion/functions/benches/regexp_count.rs
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::Int64Array;
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::regex;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    with_start: bool,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    // Use a simple pattern that matches common characters
+    let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some("a".to_string())));
+
+    if with_start {
+        // Test with start position (this is where the optimization matters)
+        let start_array = Arc::new(Int64Array::from(
+            (0..size).map(|i| (i % 10 + 1) as i64).collect::<Vec<_>>(),
+        ));
+        vec![
+            ColumnarValue::Array(string_array),
+            pattern,
+            ColumnarValue::Array(start_array),
+        ]
+    } else {
+        vec![ColumnarValue::Array(string_array), pattern]
+    }
+}
+
+fn invoke_regexp_count_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    regex::regexp_count().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int64, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("regexp_count size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Test without start position (no optimization impact)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, false);
+            group.bench_function(
+                format!("regexp_count_no_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        // Test with start position (optimization should help here)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, true);
+            group.bench_function(
+                format!("regexp_count_with_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs
index c0b50ad62f64a..a46b548236d08 100644
--- a/datafusion/functions/benches/regx.rs
+++ b/datafusion/functions/benches/regx.rs
@@ -15,23 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
+use std::hint::black_box;
+use std::iter;
+use std::sync::Arc;
 
 use arrow::array::builder::StringBuilder;
 use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray, StringViewArray};
 use arrow::compute::cast;
-use arrow::datatypes::DataType;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 use datafusion_functions::regex::regexpcount::regexp_count_func;
-use datafusion_functions::regex::regexplike::regexp_like;
+use datafusion_functions::regex::regexpinstr::regexp_instr_func;
+use datafusion_functions::regex::regexplike::{RegexpLikeFunc, regexp_like};
 use datafusion_functions::regex::regexpmatch::regexp_match;
 use datafusion_functions::regex::regexpreplace::regexp_replace;
+use rand::Rng;
 use rand::distr::Alphanumeric;
 use rand::prelude::IndexedRandom;
 use rand::rngs::ThreadRng;
-use rand::Rng;
-use std::iter;
-use std::sync::Arc;
 fn data(rng: &mut ThreadRng) -> StringArray {
     let mut data: Vec<String> = vec![];
     for _ in 0..1000 {
@@ -71,6 +75,15 @@ fn start(rng: &mut ThreadRng) -> Int64Array {
     Int64Array::from(data)
 }
 
+fn n(rng: &mut ThreadRng) -> Int64Array {
+    let mut data: Vec<i64> = vec![];
+    for _ in 0..1000 {
+        data.push(rng.random_range(1..5));
+    }
+
+    Int64Array::from(data)
+}
+
 fn flags(rng: &mut ThreadRng) -> StringArray {
     let samples = [Some("i".to_string()), Some("im".to_string()), None];
     let mut sb = StringBuilder::new();
@@ -86,7 +99,18 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
     sb.finish()
 }
 
+fn subexp(rng: &mut ThreadRng) -> Int64Array {
+    let mut data: Vec<i64> = vec![];
+    for _ in 0..1000 {
+        data.push(rng.random_range(1..5));
+    }
+
+    Int64Array::from(data)
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
+    let regexp_like_func = RegexpLikeFunc::new();
+    let config_options = Arc::new(ConfigOptions::default());
     c.bench_function("regexp_count_1000 string", |b| {
         let mut rng = rand::rng();
         let data = Arc::new(data(&mut rng)) as ArrayRef;
@@ -127,6 +151,52 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
+    c.bench_function("regexp_instr_1000 string", |b| {
+        let mut rng = rand::rng();
+        let data = Arc::new(data(&mut rng)) as ArrayRef;
+        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
+        let start = Arc::new(start(&mut rng)) as ArrayRef;
+        let n = Arc::new(n(&mut rng)) as ArrayRef;
+        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
+        let subexp = Arc::new(subexp(&mut rng)) as ArrayRef;
+
+        b.iter(|| {
+            black_box(
+                regexp_instr_func(&[
+                    Arc::clone(&data),
+                    Arc::clone(&regex),
+                    Arc::clone(&start),
+                    Arc::clone(&n),
+                    Arc::clone(&flags),
+                    Arc::clone(&subexp),
+                ])
+                .expect("regexp_instr should work on utf8"),
+            )
+        })
+    });
+
+    c.bench_function("regexp_instr_1000 utf8view", |b| {
+        let mut rng = rand::rng();
+        let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
+        let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
+        let start = Arc::new(start(&mut rng)) as ArrayRef;
+        let n = Arc::new(n(&mut rng)) as ArrayRef;
+        let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
+
+        b.iter(|| {
+            black_box(
+                regexp_instr_func(&[
+                    Arc::clone(&data),
+                    Arc::clone(&regex),
+                    Arc::clone(&start),
+                    Arc::clone(&n),
+                    Arc::clone(&flags),
+                ])
+                .expect("regexp_instr should work on utf8view"),
+            )
+        })
+    });
+
     c.bench_function("regexp_like_1000", |b| {
         let mut rng = rand::rng();
         let data = Arc::new(data(&mut rng)) as ArrayRef;
@@ -155,6 +225,32 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
+    let scalar_args = vec![
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some("foobarbequebaz".to_string()))),
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some("(bar)(beque)".to_string()))),
+    ];
+    let scalar_arg_fields = vec![
+        Field::new("arg_0", DataType::Utf8, false).into(),
+        Field::new("arg_1", DataType::Utf8, false).into(),
+    ];
+    let return_field = Field::new("f", DataType::Boolean, true).into();
+
+    c.bench_function("regexp_like scalar utf8", |b| {
+        b.iter(|| {
+            black_box(
+                regexp_like_func
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_args.clone(),
+                        arg_fields: scalar_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .expect("regexp_like scalar should work on valid values"),
+            )
+        })
+    });
+
     c.bench_function("regexp_match_1000", |b| {
         let mut rng = rand::rng();
         let data = Arc::new(data(&mut rng)) as ArrayRef;
@@ -202,11 +298,11 @@ fn criterion_benchmark(c: &mut Criterion) {
 
         b.iter(|| {
             black_box(
-                regexp_replace::<i32, _, _>(
+                regexp_replace::<i32, _>(
                     data.as_string::<i32>(),
                     regex.as_string::<i32>(),
                     replacement.as_string::<i32>(),
-                    Some(&flags),
+                    Some(flags.as_string::<i32>()),
                 )
                 .expect("regexp_replace should work on valid values"),
             )
@@ -217,19 +313,18 @@ fn criterion_benchmark(c: &mut Criterion) {
         let mut rng = rand::rng();
         let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
         let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
-        // flags are not allowed to be utf8view according to the function
-        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
+        let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
         let replacement = Arc::new(StringViewArray::from_iter_values(iter::repeat_n(
             "XX", 1000,
         )));
 
         b.iter(|| {
             black_box(
-                regexp_replace::<i32, _, _>(
+                regexp_replace::<i32, _>(
                     data.as_string_view(),
                     regex.as_string_view(),
-                    &replacement,
-                    Some(&flags),
+                    &*replacement,
+                    Some(flags.as_string_view()),
                 )
                 .expect("regexp_replace should work on valid values"),
             )
diff --git a/datafusion/functions/benches/repeat.rs b/datafusion/functions/benches/repeat.rs
index 175933f5f745f..354812c0d2ea2 100644
--- a/datafusion/functions/benches/repeat.rs
+++ b/datafusion/functions/benches/repeat.rs
@@ -15,17 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
 use datafusion_common::DataFusionError;
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string;
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -66,16 +67,56 @@ fn invoke_repeat_with_args(
         .enumerate()
         .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
         .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
 
     string::repeat().invoke_with_args(ScalarFunctionArgs {
         args,
         arg_fields,
         number_rows: repeat_times as usize,
         return_field: Field::new("f", DataType::Utf8, true).into(),
+        config_options: Arc::clone(&config_options),
     })
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
+    let repeat_fn = string::repeat();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    // Scalar benchmarks (outside loop)
+    c.bench_function("repeat/scalar_utf8", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some("hello".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
+            ],
+            arg_fields: vec![
+                Field::new("a", DataType::Utf8, false).into(),
+                Field::new("b", DataType::Int64, false).into(),
+            ],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+        b.iter(|| black_box(repeat_fn.invoke_with_args(args.clone()).unwrap()))
+    });
+
+    c.bench_function("repeat/scalar_utf8view", |b| {
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("hello".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
+            ],
+            arg_fields: vec![
+                Field::new("a", DataType::Utf8View, false).into(),
+                Field::new("b", DataType::Int64, false).into(),
+            ],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::clone(&config_options),
+        };
+        b.iter(|| black_box(repeat_fn.invoke_with_args(args.clone()).unwrap()))
+    });
+
     for size in [1024, 4096] {
         // REPEAT 3 TIMES
         let repeat_times = 3;
diff --git a/datafusion/functions/benches/replace.rs b/datafusion/functions/benches/replace.rs
new file mode 100644
index 0000000000000..55fbd6ae57af2
--- /dev/null
+++ b/datafusion/functions/benches/replace.rs
@@ -0,0 +1,191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::{
+    create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::string;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    force_view_types: bool,
+    from_len: usize,
+    to_len: usize,
+) -> Vec<ColumnarValue> {
+    if force_view_types {
+        let string_array =
+            Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
+        let from_array = Arc::new(create_string_view_array_with_len(
+            size, 0.1, from_len, false,
+        ));
+        let to_array =
+            Arc::new(create_string_view_array_with_len(size, 0.1, to_len, false));
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(from_array),
+            ColumnarValue::Array(to_array),
+        ]
+    } else {
+        let string_array =
+            Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+        let from_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, from_len));
+        let to_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, to_len));
+
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(from_array),
+            ColumnarValue::Array(to_array),
+        ]
+    }
+}
+
+fn invoke_replace_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    string::replace().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Utf8, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("replace size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // ASCII single character replacement (fast path)
+        let str_len = 32;
+        let args = create_args::<i32>(size, str_len, false, 1, 1);
+        group.bench_function(
+            format!("replace_string_ascii_single [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        // Multi-character strings (general path)
+        let args = create_args::<i32>(size, str_len, true, 3, 5);
+        group.bench_function(
+            format!("replace_string_view [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        let args = create_args::<i32>(size, str_len, false, 3, 5);
+        group.bench_function(
+            format!("replace_string [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        let args = create_args::<i64>(size, str_len, false, 3, 5);
+        group.bench_function(
+            format!("replace_large_string [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        // Larger strings
+        let str_len = 128;
+        let args = create_args::<i32>(size, str_len, false, 1, 1);
+        group.bench_function(
+            format!("replace_string_ascii_single [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        let args = create_args::<i32>(size, str_len, true, 3, 5);
+        group.bench_function(
+            format!("replace_string_view [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        let args = create_args::<i32>(size, str_len, false, 3, 5);
+        group.bench_function(
+            format!("replace_string [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        let args = create_args::<i64>(size, str_len, false, 3, 5);
+        group.bench_function(
+            format!("replace_large_string [size={size}, str_len={str_len}]"),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_replace_with_args(args_cloned, size))
+                })
+            },
+        );
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/reverse.rs b/datafusion/functions/benches/reverse.rs
index 6403660113051..f2e2898bbfe43 100644
--- a/datafusion/functions/benches/reverse.rs
+++ b/datafusion/functions/benches/reverse.rs
@@ -15,17 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
 mod helper;
 
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::ScalarFunctionArgs;
 use helper::gen_string_array;
+use std::hint::black_box;
+use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     // All benches are single batch run with 8192 rows
     let reverse = datafusion_functions::unicode::reverse();
+    let config_options = Arc::new(ConfigOptions::default());
 
     const N_ROWS: usize = 8192;
     const NULL_DENSITY: f32 = 0.1;
@@ -53,6 +56,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ).into()],
                         number_rows: N_ROWS,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -74,6 +78,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ],
                         number_rows: N_ROWS,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -100,6 +105,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ).into()],
                         number_rows: N_ROWS,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -123,6 +129,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ).into()],
                         number_rows: N_ROWS,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
diff --git a/datafusion/functions/benches/round.rs b/datafusion/functions/benches/round.rs
new file mode 100644
index 0000000000000..7010aa3507dbc
--- /dev/null
+++ b/datafusion/functions/benches/round.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field, Float32Type, Float64Type};
+use arrow::util::bench_util::create_primitive_array;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::math::round;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let round_fn = round();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    for size in [1024, 4096, 8192] {
+        let mut group = c.benchmark_group(format!("round size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Float64 array benchmark
+        let f64_array = Arc::new(create_primitive_array::<Float64Type>(size, 0.1));
+        let batch_len = f64_array.len();
+        let f64_args = vec![
+            ColumnarValue::Array(f64_array),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(2))),
+        ];
+
+        group.bench_function("round_f64_array", |b| {
+            b.iter(|| {
+                let args_cloned = f64_args.clone();
+                black_box(
+                    round_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, true).into(),
+                                Field::new("b", DataType::Int32, false).into(),
+                            ],
+                            number_rows: batch_len,
+                            return_field: Field::new("f", DataType::Float64, true).into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        // Float32 array benchmark
+        let f32_array = Arc::new(create_primitive_array::<Float32Type>(size, 0.1));
+        let f32_args = vec![
+            ColumnarValue::Array(f32_array),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(2))),
+        ];
+
+        group.bench_function("round_f32_array", |b| {
+            b.iter(|| {
+                let args_cloned = f32_args.clone();
+                black_box(
+                    round_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float32, true).into(),
+                                Field::new("b", DataType::Int32, false).into(),
+                            ],
+                            number_rows: batch_len,
+                            return_field: Field::new("f", DataType::Float32, true).into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        // Scalar benchmark (the optimization we added)
+        let scalar_f64_args = vec![
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(std::f64::consts::PI))),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(2))),
+        ];
+
+        group.bench_function("round_f64_scalar", |b| {
+            b.iter(|| {
+                let args_cloned = scalar_f64_args.clone();
+                black_box(
+                    round_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float64, false).into(),
+                                Field::new("b", DataType::Int32, false).into(),
+                            ],
+                            number_rows: 1,
+                            return_field: Field::new("f", DataType::Float64, false)
+                                .into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        let scalar_f32_args = vec![
+            ColumnarValue::Scalar(ScalarValue::Float32(Some(std::f32::consts::PI))),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(2))),
+        ];
+
+        group.bench_function("round_f32_scalar", |b| {
+            b.iter(|| {
+                let args_cloned = scalar_f32_args.clone();
+                black_box(
+                    round_fn
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args_cloned,
+                            arg_fields: vec![
+                                Field::new("a", DataType::Float32, false).into(),
+                                Field::new("b", DataType::Int32, false).into(),
+                            ],
+                            number_rows: 1,
+                            return_field: Field::new("f", DataType::Float32, false)
+                                .into(),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/signum.rs b/datafusion/functions/benches/signum.rs
index 10079bcc81c7d..e98d1b2c22ea2 100644
--- a/datafusion/functions/benches/signum.rs
+++ b/datafusion/functions/benches/signum.rs
@@ -15,16 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::DataType;
 use arrow::{
     datatypes::{Field, Float32Type, Float64Type},
     util::bench_util::create_primitive_array,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::signum;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
@@ -41,6 +42,7 @@ fn criterion_benchmark(c: &mut Criterion) {
             })
             .collect::<Vec<_>>();
         let return_field = Field::new("f", DataType::Float32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
 
         c.bench_function(&format!("signum f32 array: {size}"), |b| {
             b.iter(|| {
@@ -51,6 +53,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: batch_len,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
@@ -78,6 +81,52 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: batch_len,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        // Scalar benchmarks (the optimization we added)
+        let scalar_f32_args =
+            vec![ColumnarValue::Scalar(ScalarValue::Float32(Some(-42.5)))];
+        let scalar_f32_arg_fields =
+            vec![Field::new("a", DataType::Float32, false).into()];
+        let return_field_f32 = Field::new("f", DataType::Float32, false).into();
+
+        c.bench_function(&format!("signum f32 scalar: {size}"), |b| {
+            b.iter(|| {
+                black_box(
+                    signum
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: scalar_f32_args.clone(),
+                            arg_fields: scalar_f32_arg_fields.clone(),
+                            number_rows: 1,
+                            return_field: Arc::clone(&return_field_f32),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .unwrap(),
+                )
+            })
+        });
+
+        let scalar_f64_args =
+            vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(-42.5)))];
+        let scalar_f64_arg_fields =
+            vec![Field::new("a", DataType::Float64, false).into()];
+        let return_field_f64 = Field::new("f", DataType::Float64, false).into();
+
+        c.bench_function(&format!("signum f64 scalar: {size}"), |b| {
+            b.iter(|| {
+                black_box(
+                    signum
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: scalar_f64_args.clone(),
+                            arg_fields: scalar_f64_arg_fields.clone(),
+                            number_rows: 1,
+                            return_field: Arc::clone(&return_field_f64),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
diff --git a/datafusion/functions/benches/split_part.rs b/datafusion/functions/benches/split_part.rs
new file mode 100644
index 0000000000000..7ef84a058920e
--- /dev/null
+++ b/datafusion/functions/benches/split_part.rs
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::string::split_part;
+use rand::distr::Alphanumeric;
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+const N_ROWS: usize = 8192;
+
+/// Generate test data for split_part benchmarks
+/// Creates strings with multiple parts separated by the delimiter
+fn gen_split_part_data(
+    n_rows: usize,
+    num_parts: usize, // number of parts in each string (separated by delimiter)
+    part_len: usize,  // length of each part
+    delimiter: &str,  // the delimiter to use
+    use_string_view: bool, // false -> StringArray, true -> StringViewArray
+) -> (ColumnarValue, ColumnarValue) {
+    let mut rng = StdRng::seed_from_u64(42);
+
+    let mut strings: Vec<String> = Vec::with_capacity(n_rows);
+    for _ in 0..n_rows {
+        let mut parts: Vec<String> = Vec::with_capacity(num_parts);
+        for _ in 0..num_parts {
+            let part: String = (&mut rng)
+                .sample_iter(&Alphanumeric)
+                .take(part_len)
+                .map(char::from)
+                .collect();
+            parts.push(part);
+        }
+        strings.push(parts.join(delimiter));
+    }
+
+    let delimiters: Vec<String> = vec![delimiter.to_string(); n_rows];
+
+    if use_string_view {
+        let string_array: StringViewArray = strings.into_iter().map(Some).collect();
+        let delimiter_array: StringViewArray = delimiters.into_iter().map(Some).collect();
+        (
+            ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
+            ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
+        )
+    } else {
+        let string_array: StringArray = strings.into_iter().map(Some).collect();
+        let delimiter_array: StringArray = delimiters.into_iter().map(Some).collect();
+        (
+            ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
+            ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
+        )
+    }
+}
+
+fn gen_positions(n_rows: usize, position: i64) -> ColumnarValue {
+    let positions: Vec<i64> = vec![position; n_rows];
+    ColumnarValue::Array(Arc::new(Int64Array::from(positions)) as ArrayRef)
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let split_part_func = split_part();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    let mut group = c.benchmark_group("split_part");
+
+    // Test different scenarios
+    // Scenario 1: Single-char delimiter, first position (should be fastest with optimization)
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
+        let positions = gen_positions(N_ROWS, 1);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("single_char_delim", "pos_first"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 2: Single-char delimiter, middle position
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
+        let positions = gen_positions(N_ROWS, 5);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("single_char_delim", "pos_middle"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 3: Single-char delimiter, last position
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
+        let positions = gen_positions(N_ROWS, 10);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("single_char_delim", "pos_last"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 4: Single-char delimiter, negative position (last element)
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
+        let positions = gen_positions(N_ROWS, -1);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(
+            BenchmarkId::new("single_char_delim", "pos_negative"),
+            |b| {
+                b.iter(|| {
+                    black_box(
+                        split_part_func
+                            .invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: arg_fields.clone(),
+                                number_rows: N_ROWS,
+                                return_field: Arc::clone(&return_field),
+                                config_options: Arc::clone(&config_options),
+                            })
+                            .expect("split_part should work"),
+                    )
+                })
+            },
+        );
+    }
+
+    // Scenario 5: Multi-char delimiter, first position
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~", false);
+        let positions = gen_positions(N_ROWS, 1);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("multi_char_delim", "pos_first"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 6: Multi-char delimiter, middle position
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~", false);
+        let positions = gen_positions(N_ROWS, 5);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("multi_char_delim", "pos_middle"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 7: StringViewArray, single-char delimiter, first position
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", true);
+        let positions = gen_positions(N_ROWS, 1);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(
+            BenchmarkId::new("string_view_single_char", "pos_first"),
+            |b| {
+                b.iter(|| {
+                    black_box(
+                        split_part_func
+                            .invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: arg_fields.clone(),
+                                number_rows: N_ROWS,
+                                return_field: Arc::clone(&return_field),
+                                config_options: Arc::clone(&config_options),
+                            })
+                            .expect("split_part should work"),
+                    )
+                })
+            },
+        );
+    }
+
+    // Scenario 8: Many parts (20), position near end - shows benefit of early termination
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 20, 8, ".", false);
+        let positions = gen_positions(N_ROWS, 2);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(BenchmarkId::new("many_parts_20", "pos_second"), |b| {
+            b.iter(|| {
+                black_box(
+                    split_part_func
+                        .invoke_with_args(ScalarFunctionArgs {
+                            args: args.clone(),
+                            arg_fields: arg_fields.clone(),
+                            number_rows: N_ROWS,
+                            return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .expect("split_part should work"),
+                )
+            })
+        });
+    }
+
+    // Scenario 9: Long strings with many parts - worst case for old implementation
+    {
+        let (strings, delimiters) = gen_split_part_data(N_ROWS, 50, 16, "/", false);
+        let positions = gen_positions(N_ROWS, 1);
+        let args = vec![strings, delimiters, positions];
+        let arg_fields: Vec<_> = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect();
+        let return_field = Field::new("f", DataType::Utf8, true).into();
+
+        group.bench_function(
+            BenchmarkId::new("long_strings_50_parts", "pos_first"),
+            |b| {
+                b.iter(|| {
+                    black_box(
+                        split_part_func
+                            .invoke_with_args(ScalarFunctionArgs {
+                                args: args.clone(),
+                                arg_fields: arg_fields.clone(),
+                                number_rows: N_ROWS,
+                                return_field: Arc::clone(&return_field),
+                                config_options: Arc::clone(&config_options),
+                            })
+                            .expect("split_part should work"),
+                    )
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/starts_with.rs b/datafusion/functions/benches/starts_with.rs
new file mode 100644
index 0000000000000..17483f0da7a07
--- /dev/null
+++ b/datafusion/functions/benches/starts_with.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use rand::distr::Alphanumeric;
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Generate a StringArray/StringViewArray with random ASCII strings
+fn gen_string_array(
+    n_rows: usize,
+    str_len: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let mut rng = StdRng::seed_from_u64(42);
+    let strings: Vec<Option<String>> = (0..n_rows)
+        .map(|_| {
+            let s: String = (&mut rng)
+                .sample_iter(&Alphanumeric)
+                .take(str_len)
+                .map(char::from)
+                .collect();
+            Some(s)
+        })
+        .collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+/// Generate a scalar prefix string
+fn gen_scalar_prefix(prefix_str: &str, is_string_view: bool) -> ColumnarValue {
+    if is_string_view {
+        ColumnarValue::Scalar(ScalarValue::Utf8View(Some(prefix_str.to_string())))
+    } else {
+        ColumnarValue::Scalar(ScalarValue::Utf8(Some(prefix_str.to_string())))
+    }
+}
+
+/// Generate an array of prefix strings (same string repeated)
+fn gen_array_prefix(
+    prefix_str: &str,
+    n_rows: usize,
+    is_string_view: bool,
+) -> ColumnarValue {
+    let strings: Vec<Option<String>> =
+        (0..n_rows).map(|_| Some(prefix_str.to_string())).collect();
+
+    if is_string_view {
+        ColumnarValue::Array(Arc::new(StringViewArray::from(strings)))
+    } else {
+        ColumnarValue::Array(Arc::new(StringArray::from(strings)))
+    }
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let starts_with = datafusion_functions::string::starts_with();
+    let n_rows = 8192;
+    let str_len = 128;
+    let prefix_str = "xyz"; // A pattern that likely won't match
+
+    // Benchmark: StringArray with scalar prefix (the optimized path)
+    let str_array = gen_string_array(n_rows, str_len, false);
+    let scalar_prefix = gen_scalar_prefix(prefix_str, false);
+    let arg_fields = vec![
+        Field::new("a", DataType::Utf8, true).into(),
+        Field::new("b", DataType::Utf8, true).into(),
+    ];
+    let return_field = Field::new("f", DataType::Boolean, true).into();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("starts_with_StringArray_scalar_prefix", |b| {
+        b.iter(|| {
+            black_box(starts_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), scalar_prefix.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringArray with array prefix (for comparison)
+    let array_prefix = gen_array_prefix(prefix_str, n_rows, false);
+    c.bench_function("starts_with_StringArray_array_prefix", |b| {
+        b.iter(|| {
+            black_box(starts_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_array.clone(), array_prefix.clone()],
+                arg_fields: arg_fields.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with scalar prefix (the optimized path)
+    let str_view_array = gen_string_array(n_rows, str_len, true);
+    let scalar_prefix_view = gen_scalar_prefix(prefix_str, true);
+    let arg_fields_view = vec![
+        Field::new("a", DataType::Utf8View, true).into(),
+        Field::new("b", DataType::Utf8View, true).into(),
+    ];
+
+    c.bench_function("starts_with_StringViewArray_scalar_prefix", |b| {
+        b.iter(|| {
+            black_box(starts_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), scalar_prefix_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark: StringViewArray with array prefix (for comparison)
+    let array_prefix_view = gen_array_prefix(prefix_str, n_rows, true);
+    c.bench_function("starts_with_StringViewArray_array_prefix", |b| {
+        b.iter(|| {
+            black_box(starts_with.invoke_with_args(ScalarFunctionArgs {
+                args: vec![str_view_array.clone(), array_prefix_view.clone()],
+                arg_fields: arg_fields_view.clone(),
+                number_rows: n_rows,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::clone(&config_options),
+            }))
+        })
+    });
+
+    // Benchmark different string lengths with scalar prefix
+    for str_len in [8, 32, 128, 512] {
+        let str_array = gen_string_array(n_rows, str_len, true);
+        let scalar_prefix = gen_scalar_prefix(prefix_str, true);
+        let arg_fields = vec![
+            Field::new("a", DataType::Utf8View, true).into(),
+            Field::new("b", DataType::Utf8View, true).into(),
+        ];
+
+        c.bench_function(
+            &format!("starts_with_StringViewArray_scalar_strlen_{str_len}"),
+            |b| {
+                b.iter(|| {
+                    black_box(starts_with.invoke_with_args(ScalarFunctionArgs {
+                        args: vec![str_array.clone(), scalar_prefix.clone()],
+                        arg_fields: arg_fields.clone(),
+                        number_rows: n_rows,
+                        return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/strpos.rs b/datafusion/functions/benches/strpos.rs
index df32db1182f1f..94ce919c3d801 100644
--- a/datafusion/functions/benches/strpos.rs
+++ b/datafusion/functions/benches/strpos.rs
@@ -15,21 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{StringArray, StringViewArray};
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use rand::distr::Alphanumeric;
 use rand::prelude::StdRng;
 use rand::{Rng, SeedableRng};
+use std::hint::black_box;
 use std::str::Chars;
 use std::sync::Arc;
 
-/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
-/// 4096 rows, each row containing a string with 128 random characters.
-/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
+/// Returns a `Vec<ColumnarValue>` with two elements: a haystack array and a
+/// needle array. Each haystack is a random string of `str_len_chars`
+/// characters. Each needle is a random contiguous substring of its
+/// corresponding haystack (i.e., the needle is always present in the haystack).
+/// Around `null_density` fraction of rows are null and `utf8_density` fraction
+/// contain non-ASCII characters; the remaining rows are ASCII-only.
 fn gen_string_array(
     n_rows: usize,
     str_len_chars: usize,
@@ -114,6 +117,8 @@ fn criterion_benchmark(c: &mut Criterion) {
         let arg_fields =
             vec![Field::new("a", args_string_ascii[0].data_type(), true).into()];
         let return_field = Field::new("f", DataType::Int32, true).into();
+        let config_options = Arc::new(ConfigOptions::default());
+
         c.bench_function(
             &format!("strpos_StringArray_ascii_str_len_{str_len}"),
             |b| {
@@ -123,6 +128,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -140,6 +146,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: arg_fields.clone(),
                     number_rows: n_rows,
                     return_field: Arc::clone(&return_field),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
@@ -158,6 +165,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
@@ -177,6 +185,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                         arg_fields: arg_fields.clone(),
                         number_rows: n_rows,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     }))
                 })
             },
diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs
index 342e18b0d9a2e..37a1e178f5612 100644
--- a/datafusion/functions/benches/substr.rs
+++ b/datafusion/functions/benches/substr.rs
@@ -15,17 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::{
     create_string_array_with_len, create_string_view_array_with_len,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
 use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::unicode;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn create_args_without_count<O: OffsetSizeTrait>(
@@ -97,6 +97,7 @@ fn create_args_with_count<O: OffsetSizeTrait>(
     }
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn invoke_substr_with_args(
     args: Vec<ColumnarValue>,
     number_rows: usize,
@@ -106,12 +107,14 @@ fn invoke_substr_with_args(
         .enumerate()
         .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
         .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
 
     unicode::substr().invoke_with_args(ScalarFunctionArgs {
         args: args.clone(),
         arg_fields,
         number_rows,
         return_field: Field::new("f", DataType::Utf8View, true).into(),
+        config_options: Arc::clone(&config_options),
     })
 }
 
diff --git a/datafusion/functions/benches/substr_index.rs b/datafusion/functions/benches/substr_index.rs
index e772fb38fc400..663e7928bfd95 100644
--- a/datafusion/functions/benches/substr_index.rs
+++ b/datafusion/functions/benches/substr_index.rs
@@ -15,19 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int64Array, StringArray};
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use rand::distr::{Alphanumeric, Uniform};
-use rand::prelude::Distribution;
-use rand::Rng;
-
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::unicode::substr_index;
+use rand::Rng;
+use rand::distr::{Alphanumeric, Uniform};
+use rand::prelude::Distribution;
 
 struct Filter<Dist, Test> {
     dist: Dist,
@@ -49,7 +48,10 @@ where
     }
 }
 
-fn data() -> (StringArray, StringArray, Int64Array) {
+fn data(
+    batch_size: usize,
+    single_char_delimiter: bool,
+) -> (StringArray, StringArray, Int64Array) {
     let dist = Filter {
         dist: Uniform::new(-4, 5),
         test: |x: &i64| x != &0,
@@ -59,19 +61,39 @@ fn data() -> (StringArray, StringArray, Int64Array) {
     let mut delimiters: Vec<String> = vec![];
     let mut counts: Vec<i64> = vec![];
 
-    for _ in 0..1000 {
+    for _ in 0..batch_size {
         let length = rng.random_range(20..50);
-        let text: String = (&mut rng)
+        let base: String = (&mut rng)
             .sample_iter(&Alphanumeric)
             .take(length)
             .map(char::from)
             .collect();
-        let char = rng.random_range(0..text.len());
-        let delimiter = &text.chars().nth(char).unwrap();
+
+        let (string_value, delimiter): (String, String) = if single_char_delimiter {
+            let char_idx = rng.random_range(0..base.chars().count());
+            let delimiter = base.chars().nth(char_idx).unwrap().to_string();
+            (base, delimiter)
+        } else {
+            let long_delimiters = ["|||", "***", "&&&", "###", "@@@", "$$$"];
+            let delimiter =
+                long_delimiters[rng.random_range(0..long_delimiters.len())].to_string();
+
+            let delimiter_count = rng.random_range(1..4);
+            let mut result = String::new();
+
+            for i in 0..delimiter_count {
+                result.push_str(&base);
+                if i < delimiter_count - 1 {
+                    result.push_str(&delimiter);
+                }
+            }
+            (result, delimiter)
+        };
+
         let count = rng.sample(dist.dist.unwrap());
 
-        strings.push(text);
-        delimiters.push(delimiter.to_string());
+        strings.push(string_value);
+        delimiters.push(delimiter);
         counts.push(count);
     }
 
@@ -82,36 +104,63 @@ fn data() -> (StringArray, StringArray, Int64Array) {
     )
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("substr_index_array_array_1000", |b| {
-        let (strings, delimiters, counts) = data();
-        let batch_len = counts.len();
-        let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
-        let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
-        let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
-
-        let args = vec![strings, delimiters, counts];
-        let arg_fields = args
-            .iter()
-            .enumerate()
-            .map(|(idx, arg)| {
-                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
-            })
-            .collect::<Vec<_>>();
-
-        b.iter(|| {
-            black_box(
-                substr_index()
-                    .invoke_with_args(ScalarFunctionArgs {
-                        args: args.clone(),
-                        arg_fields: arg_fields.clone(),
-                        number_rows: batch_len,
-                        return_field: Field::new("f", DataType::Utf8, true).into(),
-                    })
-                    .expect("substr_index should work on valid values"),
-            )
+fn run_benchmark(
+    b: &mut criterion::Bencher,
+    strings: StringArray,
+    delimiters: StringArray,
+    counts: Int64Array,
+    batch_size: usize,
+) {
+    let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
+    let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
+    let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
+
+    let args = vec![strings, delimiters, counts];
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| {
+            Field::new(format!("arg_{idx}"), arg.data_type().clone(), true).into()
         })
-    });
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    b.iter(|| {
+        black_box(
+            substr_index()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: args.clone(),
+                    arg_fields: arg_fields.clone(),
+                    number_rows: batch_size,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                })
+                .expect("substr_index should work on valid values"),
+        )
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("substr_index");
+
+    let batch_sizes = [100, 1000, 10_000];
+
+    for batch_size in batch_sizes {
+        group.bench_function(
+            format!("substr_index_{batch_size}_single_delimiter"),
+            |b| {
+                let (strings, delimiters, counts) = data(batch_size, true);
+                run_benchmark(b, strings, delimiters, counts, batch_size);
+            },
+        );
+
+        group.bench_function(format!("substr_index_{batch_size}_long_delimiter"), |b| {
+            let (strings, delimiters, counts) = data(batch_size, false);
+            run_benchmark(b, strings, delimiters, counts, batch_size);
+        });
+    }
+
+    group.finish();
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs
index d19714ce61664..4d866570b7dd1 100644
--- a/datafusion/functions/benches/to_char.rs
+++ b/datafusion/functions/benches/to_char.rs
@@ -15,25 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, Date32Array, StringArray};
+use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
 use arrow::datatypes::{DataType, Field};
-use chrono::prelude::*;
 use chrono::TimeDelta;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use rand::prelude::IndexedRandom;
-use rand::rngs::ThreadRng;
-use rand::Rng;
-
+use chrono::prelude::*;
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
-use datafusion_common::ScalarValue::TimestampNanosecond;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::datetime::to_char;
+use rand::Rng;
+use rand::prelude::IndexedRandom;
+use rand::rngs::ThreadRng;
 
-fn random_date_in_range(
+fn pick_date_in_range(
     rng: &mut ThreadRng,
     start_date: NaiveDate,
     end_date: NaiveDate,
@@ -43,7 +41,7 @@ fn random_date_in_range(
     start_date + TimeDelta::try_days(random_days).unwrap()
 }
 
-fn data(rng: &mut ThreadRng) -> Date32Array {
+fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array {
     let mut data: Vec<i32> = vec![];
     let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1)
         .unwrap()
@@ -56,7 +54,7 @@ fn data(rng: &mut ThreadRng) -> Date32Array {
         .expect("Date should parse");
     for _ in 0..1000 {
         data.push(
-            random_date_in_range(rng, start_date, end_date).num_days_from_ce()
+            pick_date_in_range(rng, start_date, end_date).num_days_from_ce()
                 - unix_days_from_ce,
         );
     }
@@ -64,29 +62,97 @@ fn data(rng: &mut ThreadRng) -> Date32Array {
     Date32Array::from(data)
 }
 
-fn patterns(rng: &mut ThreadRng) -> StringArray {
-    let samples = [
-        "%Y:%m:%d".to_string(),
-        "%d-%m-%Y".to_string(),
-        "%d%m%Y".to_string(),
-        "%Y%m%d".to_string(),
-        "%Y...%m...%d".to_string(),
-    ];
-    let mut data: Vec<String> = vec![];
+fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
+    let start_date = "1970-01-01"
+        .parse::<NaiveDate>()
+        .expect("Date should parse");
+    let end_date = "2050-12-31"
+        .parse::<NaiveDate>()
+        .expect("Date should parse");
+    let mut data: Vec<i64> = Vec::with_capacity(1000);
+    for _ in 0..1000 {
+        let date = pick_date_in_range(rng, start_date, end_date);
+        let millis = date
+            .and_hms_opt(0, 0, 0)
+            .unwrap()
+            .and_utc()
+            .timestamp_millis();
+        data.push(millis);
+    }
+    Date64Array::from(data)
+}
+
+const DATE_PATTERNS: [&str; 5] =
+    ["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
+
+const DATETIME_PATTERNS: [&str; 8] = [
+    "%Y:%m:%d %H:%M%S",
+    "%Y:%m:%d %_H:%M%S",
+    "%Y:%m:%d %k:%M%S",
+    "%d-%m-%Y %I%P-%M-%S %f",
+    "%d%m%Y %H",
+    "%Y%m%d %M-%S %.3f",
+    "%Y...%m...%d %T%3f",
+    "%c",
+];
+
+fn pick_date_pattern(rng: &mut ThreadRng) -> String {
+    (*DATE_PATTERNS
+        .choose(rng)
+        .expect("Empty list of date patterns"))
+    .to_string()
+}
+
+fn pick_date_time_pattern(rng: &mut ThreadRng) -> String {
+    (*DATETIME_PATTERNS
+        .choose(rng)
+        .expect("Empty list of date time patterns"))
+    .to_string()
+}
+
+fn pick_date_and_date_time_mixed_pattern(rng: &mut ThreadRng) -> String {
+    match rng.random_bool(0.5) {
+        true => pick_date_pattern(rng),
+        false => pick_date_time_pattern(rng),
+    }
+}
+
+fn generate_pattern_array(
+    rng: &mut ThreadRng,
+    pick_fn: impl Fn(&mut ThreadRng) -> String,
+) -> StringArray {
+    let mut data = Vec::with_capacity(1000);
+
     for _ in 0..1000 {
-        data.push(samples.choose(rng).unwrap().to_string());
+        data.push(pick_fn(rng));
     }
 
     StringArray::from(data)
 }
 
+fn generate_date_pattern_array(rng: &mut ThreadRng) -> StringArray {
+    generate_pattern_array(rng, pick_date_pattern)
+}
+
+fn generate_datetime_pattern_array(rng: &mut ThreadRng) -> StringArray {
+    generate_pattern_array(rng, pick_date_time_pattern)
+}
+
+fn generate_mixed_pattern_array(rng: &mut ThreadRng) -> StringArray {
+    generate_pattern_array(rng, pick_date_and_date_time_mixed_pattern)
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("to_char_array_array_1000", |b| {
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("to_char_array_date_only_patterns_1000", |b| {
         let mut rng = rand::rng();
-        let data_arr = data(&mut rng);
+        let data_arr = generate_date32_array(&mut rng);
         let batch_len = data_arr.len();
         let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
-        let patterns = ColumnarValue::Array(Arc::new(patterns(&mut rng)) as ArrayRef);
+        let patterns = ColumnarValue::Array(Arc::new(generate_date_pattern_array(
+            &mut rng,
+        )) as ArrayRef);
 
         b.iter(|| {
             black_box(
@@ -99,19 +165,74 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ],
                         number_rows: batch_len,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_char should work on valid values"),
             )
         })
     });
 
-    c.bench_function("to_char_array_scalar_1000", |b| {
+    c.bench_function("to_char_array_datetime_patterns_1000", |b| {
         let mut rng = rand::rng();
-        let data_arr = data(&mut rng);
+        let data_arr = generate_date64_array(&mut rng);
+        let batch_len = data_arr.len();
+        let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
+        let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
+            &mut rng,
+        )) as ArrayRef);
+
+        b.iter(|| {
+            black_box(
+                to_char()
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: vec![data.clone(), patterns.clone()],
+                        arg_fields: vec![
+                            Field::new("a", data.data_type(), true).into(),
+                            Field::new("b", patterns.data_type(), true).into(),
+                        ],
+                        number_rows: batch_len,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .expect("to_char should work on valid values"),
+            )
+        })
+    });
+
+    c.bench_function("to_char_array_mixed_patterns_1000", |b| {
+        let mut rng = rand::rng();
+        let data_arr = generate_date64_array(&mut rng);
+        let batch_len = data_arr.len();
+        let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
+        let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
+            &mut rng,
+        )) as ArrayRef);
+
+        b.iter(|| {
+            black_box(
+                to_char()
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: vec![data.clone(), patterns.clone()],
+                        arg_fields: vec![
+                            Field::new("a", data.data_type(), true).into(),
+                            Field::new("b", patterns.data_type(), true).into(),
+                        ],
+                        number_rows: batch_len,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .expect("to_char should work on valid values"),
+            )
+        })
+    });
+
+    c.bench_function("to_char_scalar_date_only_pattern_1000", |b| {
+        let mut rng = rand::rng();
+        let data_arr = generate_date32_array(&mut rng);
         let batch_len = data_arr.len();
         let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
         let patterns =
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some("%Y-%m-%d".to_string())));
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
 
         b.iter(|| {
             black_box(
@@ -124,37 +245,34 @@ fn criterion_benchmark(c: &mut Criterion) {
                         ],
                         number_rows: batch_len,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_char should work on valid values"),
             )
         })
     });
 
-    c.bench_function("to_char_scalar_scalar_1000", |b| {
-        let timestamp = "2026-07-08T09:10:11"
-            .parse::<NaiveDateTime>()
-            .unwrap()
-            .with_nanosecond(56789)
-            .unwrap()
-            .and_utc()
-            .timestamp_nanos_opt()
-            .unwrap();
-        let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None));
-        let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
-            "%d-%m-%Y %H:%M:%S".to_string(),
+    c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
+        let mut rng = rand::rng();
+        let data_arr = generate_date64_array(&mut rng);
+        let batch_len = data_arr.len();
+        let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
+        let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+            pick_date_time_pattern(&mut rng),
         )));
 
         b.iter(|| {
             black_box(
                 to_char()
                     .invoke_with_args(ScalarFunctionArgs {
-                        args: vec![data.clone(), pattern.clone()],
+                        args: vec![data.clone(), patterns.clone()],
                         arg_fields: vec![
                             Field::new("a", data.data_type(), true).into(),
-                            Field::new("b", pattern.data_type(), true).into(),
+                            Field::new("b", patterns.data_type(), true).into(),
                         ],
-                        number_rows: 1,
+                        number_rows: batch_len,
                         return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_char should work on valid values"),
             )
diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs
index 4a02b74ca42d1..33f8d9c49e8eb 100644
--- a/datafusion/functions/benches/to_hex.rs
+++ b/datafusion/functions/benches/to_hex.rs
@@ -15,52 +15,139 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use arrow::array::Int64Array;
 use arrow::datatypes::{DataType, Field, Int32Type, Int64Type};
 use arrow::util::bench_util::create_primitive_array;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string;
+use std::hint::black_box;
 use std::sync::Arc;
+use std::time::Duration;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let hex = string::to_hex();
-    let size = 1024;
-    let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.2));
-    let batch_len = i32_array.len();
-    let i32_args = vec![ColumnarValue::Array(i32_array)];
-    c.bench_function(&format!("to_hex i32 array: {size}"), |b| {
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("to_hex/scalar_i32", |b| {
+        let args = vec![ColumnarValue::Scalar(ScalarValue::Int32(Some(2147483647)))];
+        let arg_fields = vec![Field::new("a", DataType::Int32, true).into()];
         b.iter(|| {
-            let args_cloned = i32_args.clone();
             black_box(
                 hex.invoke_with_args(ScalarFunctionArgs {
-                    args: args_cloned,
-                    arg_fields: vec![Field::new("a", DataType::Int32, false).into()],
-                    number_rows: batch_len,
+                    args: args.clone(),
+                    arg_fields: arg_fields.clone(),
+                    number_rows: 1,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
                 .unwrap(),
             )
         })
     });
-    let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.2));
-    let batch_len = i64_array.len();
-    let i64_args = vec![ColumnarValue::Array(i64_array)];
-    c.bench_function(&format!("to_hex i64 array: {size}"), |b| {
+
+    c.bench_function("to_hex/scalar_i64", |b| {
+        let args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some(
+            9223372036854775807,
+        )))];
+        let arg_fields = vec![Field::new("a", DataType::Int64, true).into()];
         b.iter(|| {
-            let args_cloned = i64_args.clone();
             black_box(
                 hex.invoke_with_args(ScalarFunctionArgs {
-                    args: args_cloned,
-                    arg_fields: vec![Field::new("a", DataType::Int64, false).into()],
-                    number_rows: batch_len,
+                    args: args.clone(),
+                    arg_fields: arg_fields.clone(),
+                    number_rows: 1,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 })
                 .unwrap(),
             )
         })
     });
+
+    for size in [1024, 4096, 8192] {
+        let mut group = c.benchmark_group(format!("to_hex size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // i32 array with random values
+        let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.1));
+        let batch_len = i32_array.len();
+        let i32_args = vec![ColumnarValue::Array(i32_array)];
+
+        group.bench_function("i32_random", |b| {
+            b.iter(|| {
+                let args_cloned = i32_args.clone();
+                black_box(
+                    hex.invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: vec![Field::new("a", DataType::Int32, true).into()],
+                        number_rows: batch_len,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        // i64 array with random values (produces longer hex strings)
+        let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.1));
+        let batch_len = i64_array.len();
+        let i64_args = vec![ColumnarValue::Array(i64_array)];
+
+        group.bench_function("i64_random", |b| {
+            b.iter(|| {
+                let args_cloned = i64_args.clone();
+                black_box(
+                    hex.invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: vec![Field::new("a", DataType::Int64, true).into()],
+                        number_rows: batch_len,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        // i64 array with large values (max length hex strings)
+        let i64_large_array = Arc::new(Int64Array::from(
+            (0..size)
+                .map(|i| {
+                    if i % 10 == 0 {
+                        None
+                    } else {
+                        Some(i64::MAX - i as i64)
+                    }
+                })
+                .collect::<Vec<_>>(),
+        ));
+        let batch_len = i64_large_array.len();
+        let i64_large_args = vec![ColumnarValue::Array(i64_large_array)];
+
+        group.bench_function("i64_large_values", |b| {
+            b.iter(|| {
+                let args_cloned = i64_large_args.clone();
+                black_box(
+                    hex.invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: vec![Field::new("a", DataType::Int64, true).into()],
+                        number_rows: batch_len,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+                )
+            })
+        });
+
+        group.finish();
+    }
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs
index d898113484899..90ea145d5d2c0 100644
--- a/datafusion/functions/benches/to_timestamp.rs
+++ b/datafusion/functions/benches/to_timestamp.rs
@@ -15,16 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
+use std::hint::black_box;
 use std::sync::Arc;
 
 use arrow::array::builder::StringBuilder;
 use arrow::array::{Array, ArrayRef, StringArray};
 use arrow::compute::cast;
 use arrow::datatypes::{DataType, Field, TimeUnit};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::datetime::to_timestamp;
 
@@ -113,19 +112,27 @@ fn criterion_benchmark(c: &mut Criterion) {
         Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into();
     let arg_field = Field::new("a", DataType::Utf8, false).into();
     let arg_fields = vec![arg_field];
+    let mut options = ConfigOptions::default();
+    options.execution.time_zone = Some("UTC".into());
+    let config_options = Arc::new(options);
+
+    let to_timestamp_udf = to_timestamp(config_options.as_ref());
+
     c.bench_function("to_timestamp_no_formats_utf8", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let arr_data = data();
         let batch_len = arr_data.len();
         let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef);
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: vec![string_array.clone()],
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
@@ -133,18 +140,20 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     c.bench_function("to_timestamp_no_formats_largeutf8", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let data = cast(&data(), &DataType::LargeUtf8).unwrap();
         let batch_len = data.len();
         let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: vec![string_array.clone()],
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
@@ -152,18 +161,20 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     c.bench_function("to_timestamp_no_formats_utf8view", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let data = cast(&data(), &DataType::Utf8View).unwrap();
         let batch_len = data.len();
         let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: vec![string_array.clone()],
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
@@ -171,6 +182,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     c.bench_function("to_timestamp_with_formats_utf8", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let (inputs, format1, format2, format3) = data_with_formats();
         let batch_len = inputs.len();
 
@@ -190,12 +202,13 @@ fn criterion_benchmark(c: &mut Criterion) {
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: args.clone(),
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
@@ -203,6 +216,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     c.bench_function("to_timestamp_with_formats_largeutf8", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let (inputs, format1, format2, format3) = data_with_formats();
         let batch_len = inputs.len();
 
@@ -230,12 +244,13 @@ fn criterion_benchmark(c: &mut Criterion) {
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: args.clone(),
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
@@ -243,6 +258,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     c.bench_function("to_timestamp_with_formats_utf8view", |b| {
+        let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
         let (inputs, format1, format2, format3) = data_with_formats();
 
         let batch_len = inputs.len();
@@ -271,12 +287,13 @@ fn criterion_benchmark(c: &mut Criterion) {
 
         b.iter(|| {
             black_box(
-                to_timestamp()
+                to_timestamp_udf
                     .invoke_with_args(ScalarFunctionArgs {
                         args: args.clone(),
                         arg_fields: arg_fields.clone(),
                         number_rows: batch_len,
                         return_field: Arc::clone(&return_field),
+                        config_options: Arc::clone(&config_options),
                     })
                     .expect("to_timestamp should work on valid values"),
             )
diff --git a/datafusion/functions/benches/translate.rs b/datafusion/functions/benches/translate.rs
new file mode 100644
index 0000000000000..d0568ba0f5355
--- /dev/null
+++ b/datafusion/functions/benches/translate.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::unicode;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args_array_from_to<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    let from_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 3));
+    let to_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 2));
+
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Array(from_array),
+        ColumnarValue::Array(to_array),
+    ]
+}
+
+fn create_args_scalar_from_to<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Scalar(ScalarValue::from("aeiou")),
+        ColumnarValue::Scalar(ScalarValue::from("AEIOU")),
+    ]
+}
+
+fn invoke_translate_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    unicode::translate().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Utf8, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("translate size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        for str_len in [8, 32, 128, 1024] {
+            let args = create_args_array_from_to::<i32>(size, str_len);
+            group.bench_function(format!("array_from_to [str_len={str_len}]"), |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_translate_with_args(args_cloned, size))
+                })
+            });
+
+            let args = create_args_scalar_from_to::<i32>(size, str_len);
+            group.bench_function(format!("scalar_from_to [str_len={str_len}]"), |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(invoke_translate_with_args(args_cloned, size))
+                })
+            });
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/trim.rs b/datafusion/functions/benches/trim.rs
new file mode 100644
index 0000000000000..21d99592d1820
--- /dev/null
+++ b/datafusion/functions/benches/trim.rs
@@ -0,0 +1,435 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+    BenchmarkGroup, Criterion, SamplingMode, criterion_group, criterion_main,
+    measurement::Measurement,
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
+use datafusion_functions::string;
+use rand::{Rng, SeedableRng, distr::Alphanumeric, rngs::StdRng};
+use std::hint::black_box;
+use std::{fmt, sync::Arc};
+
+#[derive(Clone, Copy)]
+pub enum StringArrayType {
+    Utf8View,
+    Utf8,
+    LargeUtf8,
+}
+
+impl fmt::Display for StringArrayType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            StringArrayType::Utf8View => f.write_str("string_view"),
+            StringArrayType::Utf8 => f.write_str("string"),
+            StringArrayType::LargeUtf8 => f.write_str("large_string"),
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub enum TrimType {
+    Ltrim,
+    Rtrim,
+    Btrim,
+}
+
+impl fmt::Display for TrimType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            TrimType::Ltrim => f.write_str("ltrim"),
+            TrimType::Rtrim => f.write_str("rtrim"),
+            TrimType::Btrim => f.write_str("btrim"),
+        }
+    }
+}
+
+/// Returns an array of strings with trim characters positioned according to trim type,
+/// and `characters` as a ScalarValue.
+///
+/// For ltrim: trim characters are at the start (prefix)
+/// For rtrim: trim characters are at the end (suffix)
+/// For btrim: trim characters are at both start and end
+fn create_string_array_and_characters(
+    size: usize,
+    characters: &str,
+    trimmed: &str,
+    remaining_len: usize,
+    string_array_type: StringArrayType,
+    trim_type: TrimType,
+) -> (ArrayRef, ScalarValue) {
+    let rng = &mut StdRng::seed_from_u64(42);
+
+    // Create `size` rows:
+    //   - 10% rows will be `None`
+    //   - Other 90% will be strings with `remaining_len` content length
+    let string_iter = (0..size).map(|_| {
+        if rng.random::<f32>() < 0.1 {
+            None
+        } else {
+            let content: String = rng
+                .sample_iter(&Alphanumeric)
+                .take(remaining_len)
+                .map(char::from)
+                .collect();
+
+            let value = match trim_type {
+                TrimType::Ltrim => format!("{trimmed}{content}"),
+                TrimType::Rtrim => format!("{content}{trimmed}"),
+                TrimType::Btrim => format!("{trimmed}{content}{trimmed}"),
+            };
+            Some(value)
+        }
+    });
+
+    // Build the target `string array` and `characters` according to `string_array_type`
+    match string_array_type {
+        StringArrayType::Utf8View => (
+            Arc::new(string_iter.collect::<StringViewArray>()),
+            ScalarValue::Utf8View(Some(characters.to_string())),
+        ),
+        StringArrayType::Utf8 => (
+            Arc::new(string_iter.collect::<StringArray>()),
+            ScalarValue::Utf8(Some(characters.to_string())),
+        ),
+        StringArrayType::LargeUtf8 => (
+            Arc::new(string_iter.collect::<LargeStringArray>()),
+            ScalarValue::LargeUtf8(Some(characters.to_string())),
+        ),
+    }
+}
+
+/// Create args for the trim benchmark
+fn create_args(
+    size: usize,
+    characters: &str,
+    trimmed: &str,
+    remaining_len: usize,
+    string_array_type: StringArrayType,
+    trim_type: TrimType,
+) -> Vec<ColumnarValue> {
+    let (string_array, pattern) = create_string_array_and_characters(
+        size,
+        characters,
+        trimmed,
+        remaining_len,
+        string_array_type,
+        trim_type,
+    );
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Scalar(pattern),
+    ]
+}
+
+/// Create args for trim benchmark where space characters are being trimmed
+fn create_space_trim_args(
+    size: usize,
+    pad_len: usize,
+    remaining_len: usize,
+    string_array_type: StringArrayType,
+    trim_type: TrimType,
+) -> Vec<ColumnarValue> {
+    let rng = &mut StdRng::seed_from_u64(42);
+    let spaces = " ".repeat(pad_len);
+
+    let string_iter = (0..size).map(|_| {
+        if rng.random::<f32>() < 0.1 {
+            None
+        } else {
+            let content: String = rng
+                .sample_iter(&Alphanumeric)
+                .take(remaining_len)
+                .map(char::from)
+                .collect();
+
+            let value = match trim_type {
+                TrimType::Ltrim => format!("{spaces}{content}"),
+                TrimType::Rtrim => format!("{content}{spaces}"),
+                TrimType::Btrim => format!("{spaces}{content}{spaces}"),
+            };
+            Some(value)
+        }
+    });
+
+    let string_array: ArrayRef = match string_array_type {
+        StringArrayType::Utf8View => Arc::new(string_iter.collect::<StringViewArray>()),
+        StringArrayType::Utf8 => Arc::new(string_iter.collect::<StringArray>()),
+        StringArrayType::LargeUtf8 => Arc::new(string_iter.collect::<LargeStringArray>()),
+    };
+
+    vec![ColumnarValue::Array(string_array)]
+}
+
+#[expect(clippy::too_many_arguments)]
+fn run_with_string_type<M: Measurement>(
+    group: &mut BenchmarkGroup<'_, M>,
+    trim_func: &ScalarUDF,
+    trim_type: TrimType,
+    size: usize,
+    total_len: usize,
+    characters: &str,
+    trimmed: &str,
+    remaining_len: usize,
+    string_type: StringArrayType,
+) {
+    let args = create_args(
+        size,
+        characters,
+        trimmed,
+        remaining_len,
+        string_type,
+        trim_type,
+    );
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    group.bench_function(
+        format!(
+            "{trim_type} {string_type} [size={size}, len={total_len}, remaining={remaining_len}]",
+        ),
+        |b| {
+            b.iter(|| {
+                let args_cloned = args.clone();
+                black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
+                    args: args_cloned,
+                    arg_fields: arg_fields.clone(),
+                    number_rows: size,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                }))
+            })
+        },
+    );
+}
+
+#[expect(clippy::too_many_arguments)]
+fn run_trim_benchmark(
+    c: &mut Criterion,
+    group_name: &str,
+    trim_func: &ScalarUDF,
+    trim_type: TrimType,
+    string_types: &[StringArrayType],
+    size: usize,
+    total_len: usize,
+    characters: &str,
+    trimmed: &str,
+    remaining_len: usize,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.sampling_mode(SamplingMode::Flat);
+    group.sample_size(10);
+
+    for string_type in string_types {
+        run_with_string_type(
+            &mut group,
+            trim_func,
+            trim_type,
+            size,
+            total_len,
+            characters,
+            trimmed,
+            remaining_len,
+            *string_type,
+        );
+    }
+
+    group.finish();
+}
+
+#[expect(clippy::too_many_arguments)]
+fn run_space_trim_benchmark(
+    c: &mut Criterion,
+    group_name: &str,
+    trim_func: &ScalarUDF,
+    trim_type: TrimType,
+    string_types: &[StringArrayType],
+    size: usize,
+    pad_len: usize,
+    remaining_len: usize,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.sampling_mode(SamplingMode::Flat);
+    group.sample_size(10);
+
+    let total_len = match trim_type {
+        TrimType::Btrim => 2 * pad_len + remaining_len,
+        _ => pad_len + remaining_len,
+    };
+
+    for string_type in string_types {
+        let args =
+            create_space_trim_args(size, pad_len, remaining_len, *string_type, trim_type);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
+            })
+            .collect::<Vec<_>>();
+        let config_options = Arc::new(ConfigOptions::default());
+
+        group.bench_function(
+            format!(
+                "{trim_type} {string_type} [size={size}, len={total_len}, pad={pad_len}]",
+            ),
+            |b| {
+                b.iter(|| {
+                    let args_cloned = args.clone();
+                    black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
+                        args: args_cloned,
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new("f", DataType::Utf8, true).into(),
+                        config_options: Arc::clone(&config_options),
+                    }))
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let ltrim = string::ltrim();
+    let rtrim = string::rtrim();
+    let btrim = string::btrim();
+
+    let characters = ",!()";
+
+    let string_types = [
+        StringArrayType::Utf8View,
+        StringArrayType::Utf8,
+        StringArrayType::LargeUtf8,
+    ];
+
+    let trim_funcs = [
+        (&ltrim, TrimType::Ltrim),
+        (&rtrim, TrimType::Rtrim),
+        (&btrim, TrimType::Btrim),
+    ];
+
+    for size in [4096] {
+        for (trim_func, trim_type) in &trim_funcs {
+            // Scenario 1: Short strings (len <= 12, inline in StringView)
+            // trimmed_len=4, remaining_len=8
+            let total_len = 12;
+            let trimmed = characters;
+            let remaining_len = total_len - trimmed.len();
+            run_trim_benchmark(
+                c,
+                "short strings (len <= 12)",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                total_len,
+                characters,
+                trimmed,
+                remaining_len,
+            );
+
+            // Scenario 2: Long strings, short trim (len > 12, output > 12)
+            // trimmed_len=4, remaining_len=60
+            let total_len = 64;
+            let trimmed = characters;
+            let remaining_len = total_len - trimmed.len();
+            run_trim_benchmark(
+                c,
+                "long strings, short trim",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                total_len,
+                characters,
+                trimmed,
+                remaining_len,
+            );
+
+            // Scenario 3: Long strings, long trim (len > 12, output <= 12)
+            // trimmed_len=56, remaining_len=8
+            let total_len = 64;
+            let trimmed = characters.repeat(14);
+            let remaining_len = total_len - trimmed.len();
+            run_trim_benchmark(
+                c,
+                "long strings, long trim",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                total_len,
+                characters,
+                &trimmed,
+                remaining_len,
+            );
+
+            // Scenario 4: Trim spaces, short strings (len <= 12)
+            // pad_len=4, remaining_len=8
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, short strings (len <= 12)",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                4,
+                8,
+            );
+
+            // Scenario 5: Trim spaces, long strings (len > 12)
+            // pad_len=4, remaining_len=60
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, long strings",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                4,
+                60,
+            );
+
+            // Scenario 6: Trim spaces, long strings, heavy padding
+            // pad_len=56, remaining_len=8
+            run_space_trim_benchmark(
+                c,
+                "trim spaces, heavy padding",
+                trim_func,
+                *trim_type,
+                &string_types,
+                size,
+                56,
+                8,
+            );
+        }
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/trunc.rs b/datafusion/functions/benches/trunc.rs
index 897e21c1e1d94..ffbedcb142c71 100644
--- a/datafusion/functions/benches/trunc.rs
+++ b/datafusion/functions/benches/trunc.rs
@@ -15,26 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::{
     datatypes::{Field, Float32Type, Float64Type},
     util::bench_util::create_primitive_array,
 };
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::math::trunc;
+use std::hint::black_box;
 
 use arrow::datatypes::DataType;
+use datafusion_common::config::ConfigOptions;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let trunc = trunc();
+    let config_options = Arc::new(ConfigOptions::default());
+
     for size in [1024, 4096, 8192] {
         let f32_array = Arc::new(create_primitive_array::<Float32Type>(size, 0.2));
         let f32_args = vec![ColumnarValue::Array(f32_array)];
         let arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
         let return_field = Field::new("f", DataType::Float32, true).into();
+
         c.bench_function(&format!("trunc f32 array: {size}"), |b| {
             b.iter(|| {
                 black_box(
@@ -44,6 +47,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
@@ -62,12 +66,58 @@ fn criterion_benchmark(c: &mut Criterion) {
                             arg_fields: arg_fields.clone(),
                             number_rows: size,
                             return_field: Arc::clone(&return_field),
+                            config_options: Arc::clone(&config_options),
                         })
                         .unwrap(),
                 )
             })
         });
     }
+
+    // Scalar benchmarks - to measure optimized performance
+    let scalar_f64_args = vec![ColumnarValue::Scalar(
+        datafusion_common::ScalarValue::Float64(Some(std::f64::consts::PI)),
+    )];
+    let scalar_arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+    let scalar_return_field = Field::new("f", DataType::Float64, false).into();
+
+    c.bench_function("trunc f64 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                trunc
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f64_args.clone(),
+                        arg_fields: scalar_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&scalar_return_field),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+
+    let scalar_f32_args = vec![ColumnarValue::Scalar(
+        datafusion_common::ScalarValue::Float32(Some(std::f32::consts::PI)),
+    )];
+    let scalar_f32_arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
+    let scalar_f32_return_field = Field::new("f", DataType::Float32, false).into();
+
+    c.bench_function("trunc f32 scalar", |b| {
+        b.iter(|| {
+            black_box(
+                trunc
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: scalar_f32_args.clone(),
+                        arg_fields: scalar_f32_arg_fields.clone(),
+                        number_rows: 1,
+                        return_field: Arc::clone(&scalar_f32_return_field),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/benches/upper.rs b/datafusion/functions/benches/upper.rs
index bf2c4161001e8..3f6fa36b18c13 100644
--- a/datafusion/functions/benches/upper.rs
+++ b/datafusion/functions/benches/upper.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
 use arrow::util::bench_util::create_string_array_with_len;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::string;
+use std::hint::black_box;
 use std::sync::Arc;
 
 /// Create an array of args containing a StringArray, where all the values in the
@@ -35,6 +35,8 @@ fn create_args(size: usize, str_len: usize) -> Vec<ColumnarValue> {
 
 fn criterion_benchmark(c: &mut Criterion) {
     let upper = string::upper();
+    let config_options = Arc::new(ConfigOptions::default());
+
     for size in [1024, 4096, 8192] {
         let args = create_args(size, 32);
         c.bench_function("upper_all_values_are_ascii", |b| {
@@ -45,6 +47,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                     arg_fields: vec![Field::new("a", DataType::Utf8, true).into()],
                     number_rows: size,
                     return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
                 }))
             })
         });
diff --git a/datafusion/functions/benches/uuid.rs b/datafusion/functions/benches/uuid.rs
index 942af122562ab..629fb950dd9ff 100644
--- a/datafusion/functions/benches/uuid.rs
+++ b/datafusion/functions/benches/uuid.rs
@@ -15,15 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate criterion;
-
 use arrow::datatypes::{DataType, Field};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
 use datafusion_expr::ScalarFunctionArgs;
 use datafusion_functions::string;
+use std::hint::black_box;
+use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let uuid = string::uuid();
+    let config_options = Arc::new(ConfigOptions::default());
+
     c.bench_function("uuid", |b| {
         b.iter(|| {
             black_box(uuid.invoke_with_args(ScalarFunctionArgs {
@@ -31,6 +34,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                 arg_fields: vec![],
                 number_rows: 1024,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::clone(&config_options),
             }))
         })
     });
diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs
index 2d769dfa56579..e555081e4132c 100644
--- a/datafusion/functions/src/core/arrow_cast.rs
+++ b/datafusion/functions/src/core/arrow_cast.rs
@@ -20,17 +20,16 @@
 use arrow::datatypes::{DataType, Field, FieldRef};
 use arrow::error::ArrowError;
 use datafusion_common::{
-    arrow_datafusion_err, exec_err, internal_err, Result, ScalarValue,
-};
-use datafusion_common::{
-    exec_datafusion_err, utils::take_function_args, DataFusionError,
+    Result, ScalarValue, arrow_datafusion_err, datatype::DataTypeExt,
+    exec_datafusion_err, exec_err, internal_err, types::logical_string,
+    utils::take_function_args,
 };
 use std::any::Any;
 
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
-    ScalarUDFImpl, Signature, Volatility,
+    Coercion, ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -60,16 +59,26 @@ use datafusion_macros::user_doc;
     description = "Casts a value to a specific Arrow data type.",
     syntax_example = "arrow_cast(expression, datatype)",
     sql_example = r#"```sql
-> select arrow_cast(-5, 'Int8') as a,
+> select
+  arrow_cast(-5,    'Int8') as a,
   arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b,
-  arrow_cast('bar', 'LargeUtf8') as c,
-  arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d
-  ;
-+----+-----+-----+---------------------------+
-| a  | b   | c   | d                         |
-+----+-----+-----+---------------------------+
-| -5 | foo | bar | 2023-01-02T12:53:02+08:00 |
-+----+-----+-----+---------------------------+
+  arrow_cast('bar', 'LargeUtf8') as c;
+
++----+-----+-----+
+| a  | b   | c   |
++----+-----+-----+
+| -5 | foo | bar |
++----+-----+-----+
+
+> select
+  arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d,
+  arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e;
+
++---------------------------+---------------------+
+| d                         | e                   |
++---------------------------+---------------------+
+| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 |
++---------------------------+---------------------+
 ```"#,
     argument(
         name = "expression",
@@ -80,7 +89,7 @@ use datafusion_macros::user_doc;
         description = "[Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`]"
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrowCastFunc {
     signature: Signature,
 }
@@ -94,7 +103,13 @@ impl Default for ArrowCastFunc {
 impl ArrowCastFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(2, Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_exact(TypeSignatureClass::Any),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -145,7 +160,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
     fn simplify(
         &self,
         mut args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         // convert this into a real cast
         let target_type = data_type_from_args(&args)?;
@@ -161,7 +176,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
             // Use an actual cast to get the correct type
             Expr::Cast(datafusion_expr::Cast {
                 expr: Box::new(arg),
-                data_type: target_type,
+                field: target_type.into_nullable_field_ref(),
             })
         };
         // return the newly written argument to DataFusion
@@ -177,7 +192,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
 fn data_type_from_args(args: &[Expr]) -> Result<DataType> {
     let [_, type_arg] = take_function_args("arrow_cast", args)?;
 
-    let Expr::Literal(ScalarValue::Utf8(Some(val))) = type_arg else {
+    let Expr::Literal(ScalarValue::Utf8(Some(val)), _) = type_arg else {
         return exec_err!(
             "arrow_cast requires its second argument to be a constant string, got {:?}",
             type_arg
diff --git a/datafusion/functions/src/core/arrow_metadata.rs b/datafusion/functions/src/core/arrow_metadata.rs
new file mode 100644
index 0000000000000..86a6d8c21e96b
--- /dev/null
+++ b/datafusion/functions/src/core/arrow_metadata.rs
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{MapBuilder, StringBuilder};
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::types::logical_string;
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Other Functions"),
+    description = "Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.",
+    syntax_example = "arrow_metadata(expression[, key])",
+    sql_example = r#"```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
+```"#,
+    argument(
+        name = "expression",
+        description = "The expression to retrieve metadata from. Can be a column or other expression."
+    ),
+    argument(
+        name = "key",
+        description = "Optional. The specific metadata key to retrieve."
+    )
+)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ArrowMetadataFunc {
+    signature: Signature,
+}
+
+impl ArrowMetadataFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Any,
+                    )]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Any),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    ]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl Default for ArrowMetadataFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ScalarUDFImpl for ArrowMetadataFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "arrow_metadata"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.len() == 2 {
+            Ok(DataType::Utf8)
+        } else if arg_types.len() == 1 {
+            Ok(DataType::Map(
+                Arc::new(Field::new(
+                    "entries",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("keys", DataType::Utf8, false),
+                        Field::new("values", DataType::Utf8, true),
+                    ])),
+                    false,
+                )),
+                false,
+            ))
+        } else {
+            internal_err!("arrow_metadata requires 1 or 2 arguments")
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let metadata = args.arg_fields[0].metadata();
+
+        if args.args.len() == 2 {
+            let key = match &args.args[1] {
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(key))) => key,
+                _ => {
+                    return exec_err!(
+                        "Second argument to arrow_metadata must be a string literal key"
+                    );
+                }
+            };
+            let value = metadata.get(key).cloned();
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(value)))
+        } else if args.args.len() == 1 {
+            let mut map_builder =
+                MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
+
+            let mut entries: Vec<_> = metadata.iter().collect();
+            entries.sort_by_key(|(k, _)| *k);
+
+            for (k, v) in entries {
+                map_builder.keys().append_value(k);
+                map_builder.values().append_value(v);
+            }
+            map_builder.append(true)?;
+
+            let map_array = map_builder.finish();
+
+            Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+                &map_array, 0,
+            )?))
+        } else {
+            internal_err!("arrow_metadata requires 1 or 2 arguments")
+        }
+    }
+}
diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs
index 2509ed246ac7c..f34a2abafd793 100644
--- a/datafusion/functions/src/core/arrowtypeof.rs
+++ b/datafusion/functions/src/core/arrowtypeof.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use arrow::datatypes::DataType;
-use datafusion_common::{utils::take_function_args, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, utils::take_function_args};
 use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
@@ -40,7 +40,7 @@ use std::any::Any;
         description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ArrowTypeOfFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs
index 12a4bef247393..359a6f6c9c84c 100644
--- a/datafusion/functions/src/core/coalesce.rs
+++ b/datafusion/functions/src/core/coalesce.rs
@@ -15,14 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{new_null_array, BooleanArray};
-use arrow::compute::kernels::zip::zip;
-use arrow::compute::{and, is_not_null, is_null};
 use arrow::datatypes::{DataType, Field, FieldRef};
-use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_common::{Result, exec_err, internal_err, plan_err};
 use datafusion_expr::binary::try_type_union_resolution;
+use datafusion_expr::conditional_expressions::CaseBuilder;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs,
+    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
 };
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
@@ -46,9 +45,9 @@ use std::any::Any;
         description = "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CoalesceFunc {
-    signature: Signature,
+    pub(super) signature: Signature,
 }
 
 impl Default for CoalesceFunc {
@@ -95,61 +94,45 @@ impl ScalarUDFImpl for CoalesceFunc {
         Ok(Field::new(self.name(), return_type, nullable).into())
     }
 
-    /// coalesce evaluates to the first value which is not NULL
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let args = args.args;
-        // do not accept 0 arguments.
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
         if args.is_empty() {
-            return exec_err!(
-                "coalesce was called with {} arguments. It requires at least 1.",
-                args.len()
-            );
+            return plan_err!("coalesce must have at least one argument");
         }
-
-        let return_type = args[0].data_type();
-        let mut return_array = args.iter().filter_map(|x| match x {
-            ColumnarValue::Array(array) => Some(array.len()),
-            _ => None,
-        });
-
-        if let Some(size) = return_array.next() {
-            // start with nulls as default output
-            let mut current_value = new_null_array(&return_type, size);
-            let mut remainder = BooleanArray::from(vec![true; size]);
-
-            for arg in args {
-                match arg {
-                    ColumnarValue::Array(ref array) => {
-                        let to_apply = and(&remainder, &is_not_null(array.as_ref())?)?;
-                        current_value = zip(&to_apply, array, &current_value)?;
-                        remainder = and(&remainder, &is_null(array)?)?;
-                    }
-                    ColumnarValue::Scalar(value) => {
-                        if value.is_null() {
-                            continue;
-                        } else {
-                            let last_value = value.to_scalar()?;
-                            current_value = zip(&remainder, &last_value, &current_value)?;
-                            break;
-                        }
-                    }
-                }
-                if remainder.iter().all(|x| x == Some(false)) {
-                    break;
-                }
-            }
-            Ok(ColumnarValue::Array(current_value))
-        } else {
-            let result = args
-                .iter()
-                .filter_map(|x| match x {
-                    ColumnarValue::Scalar(s) if !s.is_null() => Some(x.clone()),
-                    _ => None,
-                })
-                .next()
-                .unwrap_or_else(|| args[0].clone());
-            Ok(result)
+        if args.len() == 1 {
+            return Ok(ExprSimplifyResult::Simplified(
+                args.into_iter().next().unwrap(),
+            ));
         }
+
+        let n = args.len();
+        let (init, last_elem) = args.split_at(n - 1);
+        let whens = init
+            .iter()
+            .map(|x| x.clone().is_not_null())
+            .collect::<Vec<_>>();
+        let cases = init.to_vec();
+        Ok(ExprSimplifyResult::Simplified(
+            CaseBuilder::new(None, whens, cases, Some(Box::new(last_elem[0].clone())))
+                .end()?,
+        ))
+    }
+
+    /// coalesce evaluates to the first value which is not NULL
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("coalesce should have been simplified to case")
+    }
+
+    fn conditional_arguments<'a>(
+        &self,
+        args: &'a [Expr],
+    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
+        let eager = vec![&args[0]];
+        let lazy = args[1..].iter().collect();
+        Some((eager, lazy))
     }
 
     fn short_circuits(&self) -> bool {
diff --git a/datafusion/functions/src/core/expr_ext.rs b/datafusion/functions/src/core/expr_ext.rs
index af05f447f1c1e..3b8581995ab37 100644
--- a/datafusion/functions/src/core/expr_ext.rs
+++ b/datafusion/functions/src/core/expr_ext.rs
@@ -39,8 +39,7 @@ use super::expr_fn::get_field;
 /// ```
 /// # use datafusion_expr::{col};
 /// # use datafusion_functions::core::expr_ext::FieldAccessor;
-/// let expr = col("c1")
-///    .field("my_field");
+/// let expr = col("c1").field("my_field");
 /// assert_eq!(expr.schema_name().to_string(), "c1[my_field]");
 /// ```
 pub trait FieldAccessor {
diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs
index de87308ef3c49..d57ba46fb56a9 100644
--- a/datafusion/functions/src/core/getfield.rs
+++ b/datafusion/functions/src/core/getfield.rs
@@ -15,67 +15,79 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::any::Any;
+use std::sync::Arc;
+
 use arrow::array::{
-    make_array, make_comparator, Array, BooleanArray, Capacities, MutableArrayData,
-    Scalar,
+    Array, BooleanArray, Capacities, MutableArrayData, Scalar, make_array,
+    make_comparator,
 };
 use arrow::compute::SortOptions;
 use arrow::datatypes::{DataType, Field, FieldRef};
 use arrow_buffer::NullBuffer;
+
 use datafusion_common::cast::{as_map_array, as_struct_array};
 use datafusion_common::{
-    exec_err, internal_err, plan_datafusion_err, utils::take_function_args, Result,
-    ScalarValue,
+    Result, ScalarValue, exec_err, internal_err, plan_datafusion_err,
 };
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::simplify::ExprSimplifyResult;
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
+    ColumnarValue, Documentation, Expr, ExpressionPlacement, ReturnFieldArgs,
+    ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
 };
-use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
-use std::any::Any;
-use std::sync::Arc;
 
 #[user_doc(
     doc_section(label = "Other Functions"),
     description = r#"Returns a field within a map or a struct with the given key.
+    Supports nested field access by providing multiple field names.
     Note: most users invoke `get_field` indirectly via field access
     syntax such as `my_struct_col['field_name']` which results in a call to
-    `get_field(my_struct_col, 'field_name')`."#,
-    syntax_example = "get_field(expression1, expression2)",
+    `get_field(my_struct_col, 'field_name')`.
+    Nested access like `my_struct['a']['b']` is optimized to a single call:
+    `get_field(my_struct, 'a', 'b')`."#,
+    syntax_example = "get_field(expression, field_name[, field_name2, ...])",
     sql_example = r#"```sql
-> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow');
-> select struct(idx, v) from t as c;
-+-------------------------+
-| struct(c.idx,c.v)       |
-+-------------------------+
-| {c0: data, c1: fusion}  |
-| {c0: apache, c1: arrow} |
-+-------------------------+
-> select get_field((select struct(idx, v) from t), 'c0');
-+-----------------------+
-| struct(t.idx,t.v)[c0] |
-+-----------------------+
-| data                  |
-| apache                |
-+-----------------------+
-> select get_field((select struct(idx, v) from t), 'c1');
-+-----------------------+
-| struct(t.idx,t.v)[c1] |
-+-----------------------+
-| fusion                |
-| arrow                 |
-+-----------------------+
+> -- Access a field from a struct column
+> create table test( struct_col) as values
+    ({name: 'Alice', age: 30}),
+    ({name: 'Bob', age: 25});
+> select struct_col from test;
++-----------------------------+
+| struct_col                  |
++-----------------------------+
+| {name: Alice, age: 30}      |
+| {name: Bob, age: 25}        |
++-----------------------------+
+> select struct_col['name'] as name from test;
++-------+
+| name  |
++-------+
+| Alice |
+| Bob   |
++-------+
+
+> -- Nested field access with multiple arguments
+> create table test(struct_col) as values
+    ({outer: {inner_val: 42}});
+> select struct_col['outer']['inner_val'] as result from test;
++--------+
+| result |
++--------+
+| 42     |
++--------+
 ```"#,
     argument(
-        name = "expression1",
-        description = "The map or struct to retrieve a field for."
+        name = "expression",
+        description = "The map or struct to retrieve a field from."
     ),
     argument(
-        name = "expression2",
-        description = "The field name in the map or struct to retrieve data for. Must evaluate to a string."
+        name = "field_name",
+        description = "The field name(s) to access, in order for nested access. Must evaluate to strings."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct GetFieldFunc {
     signature: Signature,
 }
@@ -86,10 +98,144 @@ impl Default for GetFieldFunc {
     }
 }
 
+/// Process a map array by finding matching keys and extracting corresponding values.
+///
+/// This function handles both simple (scalar) and nested key types by using
+/// appropriate comparison strategies.
+fn process_map_array(
+    array: &dyn Array,
+    key_array: Arc<dyn Array>,
+) -> Result<ColumnarValue> {
+    let map_array = as_map_array(array)?;
+    let keys = if key_array.data_type().is_nested() {
+        let comparator = make_comparator(
+            map_array.keys().as_ref(),
+            key_array.as_ref(),
+            SortOptions::default(),
+        )?;
+        let len = map_array.keys().len().min(key_array.len());
+        let values = (0..len).map(|i| comparator(i, i).is_eq()).collect();
+        let nulls = NullBuffer::union(map_array.keys().nulls(), key_array.nulls());
+        BooleanArray::new(values, nulls)
+    } else {
+        let be_compared = Scalar::new(key_array);
+        arrow::compute::kernels::cmp::eq(&be_compared, map_array.keys())?
+    };
+
+    let original_data = map_array.entries().column(1).to_data();
+    let capacity = Capacities::Array(original_data.len());
+    let mut mutable =
+        MutableArrayData::with_capacities(vec![&original_data], true, capacity);
+
+    for entry in 0..map_array.len() {
+        let start = map_array.value_offsets()[entry] as usize;
+        let end = map_array.value_offsets()[entry + 1] as usize;
+
+        let maybe_matched = keys
+            .slice(start, end - start)
+            .iter()
+            .enumerate()
+            .find(|(_, t)| t.unwrap());
+
+        if maybe_matched.is_none() {
+            mutable.extend_nulls(1);
+            continue;
+        }
+        let (match_offset, _) = maybe_matched.unwrap();
+        mutable.extend(0, start + match_offset, start + match_offset + 1);
+    }
+
+    let data = mutable.freeze();
+    let data = make_array(data);
+    Ok(ColumnarValue::Array(data))
+}
+
+/// Process a map array with a nested key type by iterating through entries
+/// and using a comparator for key matching.
+///
+/// This specialized version is used when the key type is nested (e.g., struct, list).
+fn process_map_with_nested_key(
+    array: &dyn Array,
+    key_array: &dyn Array,
+) -> Result<ColumnarValue> {
+    let map_array = as_map_array(array)?;
+
+    let comparator =
+        make_comparator(map_array.keys().as_ref(), key_array, SortOptions::default())?;
+
+    let original_data = map_array.entries().column(1).to_data();
+    let capacity = Capacities::Array(original_data.len());
+    let mut mutable =
+        MutableArrayData::with_capacities(vec![&original_data], true, capacity);
+
+    for entry in 0..map_array.len() {
+        let start = map_array.value_offsets()[entry] as usize;
+        let end = map_array.value_offsets()[entry + 1] as usize;
+
+        let mut found_match = false;
+        for i in start..end {
+            if comparator(i, 0).is_eq() {
+                mutable.extend(0, i, i + 1);
+                found_match = true;
+                break;
+            }
+        }
+
+        if !found_match {
+            mutable.extend_nulls(1);
+        }
+    }
+
+    let data = mutable.freeze();
+    let data = make_array(data);
+    Ok(ColumnarValue::Array(data))
+}
+
+/// Extract a single field from a struct or map array
+fn extract_single_field(base: ColumnarValue, name: ScalarValue) -> Result<ColumnarValue> {
+    let arrays = ColumnarValue::values_to_arrays(&[base])?;
+    let array = Arc::clone(&arrays[0]);
+
+    let string_value = name.try_as_str().flatten().map(|s| s.to_string());
+
+    match (array.data_type(), name, string_value) {
+        (DataType::Map(_, _), ScalarValue::List(arr), _) => {
+            let key_array: Arc<dyn Array> = arr;
+            process_map_array(&array, key_array)
+        }
+        (DataType::Map(_, _), ScalarValue::Struct(arr), _) => {
+            process_map_array(&array, arr as Arc<dyn Array>)
+        }
+        (DataType::Map(_, _), other, _) => {
+            let data_type = other.data_type();
+            if data_type.is_nested() {
+                process_map_with_nested_key(&array, &other.to_array()?)
+            } else {
+                process_map_array(&array, other.to_array()?)
+            }
+        }
+        (DataType::Struct(_), _, Some(k)) => {
+            let as_struct_array = as_struct_array(&array)?;
+            match as_struct_array.column_by_name(&k) {
+                None => exec_err!("Field {k} not found in struct"),
+                Some(col) => Ok(ColumnarValue::Array(Arc::clone(col))),
+            }
+        }
+        (DataType::Struct(_), name, _) => exec_err!(
+            "get_field is only possible on struct with utf8 indexes. \
+                         Received with {name:?} index"
+        ),
+        (DataType::Null, _, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)),
+        (dt, name, _) => exec_err!(
+            "get_field is only possible on maps or structs. Received {dt} with {name:?} index"
+        ),
+    }
+}
+
 impl GetFieldFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::any(2, Volatility::Immutable),
+            signature: Signature::user_defined(Volatility::Immutable),
         }
     }
 }
@@ -105,24 +251,47 @@ impl ScalarUDFImpl for GetFieldFunc {
     }
 
     fn display_name(&self, args: &[Expr]) -> Result<String> {
-        let [base, field_name] = take_function_args(self.name(), args)?;
+        if args.len() < 2 {
+            return exec_err!(
+                "get_field requires at least 2 arguments, got {}",
+                args.len()
+            );
+        }
 
-        let name = match field_name {
-            Expr::Literal(name) => name,
-            other => &ScalarValue::Utf8(Some(other.schema_name().to_string())),
-        };
+        let base = &args[0];
+        let field_names: Vec<String> = args[1..]
+            .iter()
+            .map(|f| match f {
+                Expr::Literal(name, _) => name.to_string(),
+                other => other.schema_name().to_string(),
+            })
+            .collect();
 
-        Ok(format!("{base}[{name}]"))
+        Ok(format!("{}[{}]", base, field_names.join("][")))
     }
 
     fn schema_name(&self, args: &[Expr]) -> Result<String> {
-        let [base, field_name] = take_function_args(self.name(), args)?;
-        let name = match field_name {
-            Expr::Literal(name) => name,
-            other => &ScalarValue::Utf8(Some(other.schema_name().to_string())),
-        };
+        if args.len() < 2 {
+            return exec_err!(
+                "get_field requires at least 2 arguments, got {}",
+                args.len()
+            );
+        }
+
+        let base = &args[0];
+        let field_names: Vec<String> = args[1..]
+            .iter()
+            .map(|f| match f {
+                Expr::Literal(name, _) => name.to_string(),
+                other => other.schema_name().to_string(),
+            })
+            .collect();
 
-        Ok(format!("{}[{}]", base.schema_name(), name))
+        Ok(format!(
+            "{}[{}]",
+            base.schema_name(),
+            field_names.join("][")
+        ))
     }
 
     fn signature(&self) -> &Signature {
@@ -134,153 +303,357 @@ impl ScalarUDFImpl for GetFieldFunc {
     }
 
     fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
-        // Length check handled in the signature
-        debug_assert_eq!(args.scalar_arguments.len(), 2);
-
-        match (&args.arg_fields[0].data_type(), args.scalar_arguments[1].as_ref()) {
-            (DataType::Map(fields, _), _) => {
-                match fields.data_type() {
-                    DataType::Struct(fields) if fields.len() == 2 => {
-                        // Arrow's MapArray is essentially a ListArray of structs with two columns. They are
-                        // often named "key", and "value", but we don't require any specific naming here;
-                        // instead, we assume that the second column is the "value" column both here and in
-                        // execution.
-                        let value_field = fields.get(1).expect("fields should have exactly two members");
-
-                        Ok(value_field.as_ref().clone().with_nullable(true).into())
-                    },
-                    _ => exec_err!("Map fields must contain a Struct with exactly 2 fields"),
+        // Validate minimum 2 arguments: base expression + at least one field name
+        if args.scalar_arguments.len() < 2 {
+            return exec_err!(
+                "get_field requires at least 2 arguments, got {}",
+                args.scalar_arguments.len()
+            );
+        }
+
+        let mut current_field = Arc::clone(&args.arg_fields[0]);
+
+        // Iterate through each field name (starting from index 1)
+        for (i, sv) in args.scalar_arguments.iter().enumerate().skip(1) {
+            match current_field.data_type() {
+                DataType::Map(map_field, _) => {
+                    match map_field.data_type() {
+                        DataType::Struct(fields) if fields.len() == 2 => {
+                            // Arrow's MapArray is essentially a ListArray of structs with two columns. They are
+                            // often named "key", and "value", but we don't require any specific naming here;
+                            // instead, we assume that the second column is the "value" column both here and in
+                            // execution.
+                            let value_field = fields
+                                .get(1)
+                                .expect("fields should have exactly two members");
+
+                            current_field = Arc::new(
+                                value_field.as_ref().clone().with_nullable(true),
+                            );
+                        }
+                        _ => {
+                            return exec_err!(
+                                "Map fields must contain a Struct with exactly 2 fields"
+                            );
+                        }
+                    }
+                }
+                DataType::Struct(fields) => {
+                    let field_name = sv
+                        .as_ref()
+                        .and_then(|sv| {
+                            sv.try_as_str().flatten().filter(|s| !s.is_empty())
+                        })
+                        .ok_or_else(|| {
+                            datafusion_common::DataFusionError::Execution(
+                                "Field name must be a non-empty string".to_string(),
+                            )
+                        })?;
+
+                    let child_field = fields
+                        .iter()
+                        .find(|f| f.name() == field_name)
+                        .ok_or_else(|| {
+                            plan_datafusion_err!("Field {field_name} not found in struct")
+                        })?;
+
+                    let mut new_field = child_field.as_ref().clone();
+
+                    // If the parent is nullable, then getting the child must be nullable
+                    if current_field.is_nullable() {
+                        new_field = new_field.with_nullable(true);
+                    }
+                    current_field = Arc::new(new_field);
+                }
+                DataType::Null => {
+                    return Ok(Field::new(self.name(), DataType::Null, true).into());
+                }
+                other => {
+                    return exec_err!(
+                        "Cannot access field at argument {}: type {} is not Struct, Map, or Null",
+                        i,
+                        other
+                    );
                 }
             }
-            (DataType::Struct(fields),sv) => {
-                sv.and_then(|sv| sv.try_as_str().flatten().filter(|s| !s.is_empty()))
-                .map_or_else(
-                    || exec_err!("Field name must be a non-empty string"),
-                    |field_name| {
-                    fields.iter().find(|f| f.name() == field_name)
-                    .ok_or(plan_datafusion_err!("Field {field_name} not found in struct"))
-                    .map(|f| {
-                        let mut child_field = f.as_ref().clone();
-
-                        // If the parent is nullable, then getting the child must be nullable,
-                        // so potentially override the return value
-
-                        if args.arg_fields[0].is_nullable() {
-                            child_field = child_field.with_nullable(true);
-                        }
-                        Arc::new(child_field)
-                    })
-                })
-            },
-            (DataType::Null, _) => Ok(Field::new(self.name(), DataType::Null, true).into()),
-            (other, _) => exec_err!("The expression to get an indexed field is only valid for `Struct`, `Map` or `Null` types, got {other}"),
         }
+
+        Ok(current_field)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let [base, field_name] = take_function_args(self.name(), args.args)?;
+        if args.args.len() < 2 {
+            return exec_err!(
+                "get_field requires at least 2 arguments, got {}",
+                args.args.len()
+            );
+        }
+
+        let mut current = args.args[0].clone();
 
-        if base.data_type().is_null() {
+        // Early exit for null base
+        if current.data_type().is_null() {
             return Ok(ColumnarValue::Scalar(ScalarValue::Null));
         }
 
-        let arrays =
-            ColumnarValue::values_to_arrays(&[base.clone(), field_name.clone()])?;
-        let array = Arc::clone(&arrays[0]);
-        let name = match field_name {
-            ColumnarValue::Scalar(name) => name,
-            _ => {
-                return exec_err!(
-                    "get_field function requires the argument field_name to be a string"
-                );
+        // Iterate through each field name
+        for field_name in args.args.iter().skip(1) {
+            let field_name_scalar = match field_name {
+                ColumnarValue::Scalar(name) => name.clone(),
+                _ => {
+                    return exec_err!(
+                        "get_field function requires all field_name arguments to be scalars"
+                    );
+                }
+            };
+
+            current = extract_single_field(current, field_name_scalar)?;
+
+            // Early exit if we hit null
+            if current.data_type().is_null() {
+                return Ok(ColumnarValue::Scalar(ScalarValue::Null));
             }
-        };
+        }
 
-        fn process_map_array(
-            array: Arc<dyn Array>,
-            key_array: Arc<dyn Array>,
-        ) -> Result<ColumnarValue> {
-            let map_array = as_map_array(array.as_ref())?;
-            let keys = if key_array.data_type().is_nested() {
-                let comparator = make_comparator(
-                    map_array.keys().as_ref(),
-                    key_array.as_ref(),
-                    SortOptions::default(),
-                )?;
-                let len = map_array.keys().len().min(key_array.len());
-                let values = (0..len).map(|i| comparator(i, i).is_eq()).collect();
-                let nulls =
-                    NullBuffer::union(map_array.keys().nulls(), key_array.nulls());
-                BooleanArray::new(values, nulls)
-            } else {
-                let be_compared = Scalar::new(key_array);
-                arrow::compute::kernels::cmp::eq(&be_compared, map_array.keys())?
-            };
+        Ok(current)
+    }
 
-            let original_data = map_array.entries().column(1).to_data();
-            let capacity = Capacities::Array(original_data.len());
-            let mut mutable =
-                MutableArrayData::with_capacities(vec![&original_data], true, capacity);
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &datafusion_expr::simplify::SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        // Need at least 2 args (base + field)
+        if args.len() < 2 {
+            return Ok(ExprSimplifyResult::Original(args));
+        }
 
-            for entry in 0..map_array.len() {
-                let start = map_array.value_offsets()[entry] as usize;
-                let end = map_array.value_offsets()[entry + 1] as usize;
+        // Flatten all nested get_field calls in a single pass
+        // Pattern: get_field(get_field(get_field(base, a), b), c) => get_field(base, a, b, c)
 
-                let maybe_matched = keys
-                    .slice(start, end - start)
-                    .iter()
-                    .enumerate()
-                    .find(|(_, t)| t.unwrap());
+        // Collect path arguments from all nested levels
+        let mut path_args_stack = Vec::new();
+        let mut current_expr = &args[0];
 
-                if maybe_matched.is_none() {
-                    mutable.extend_nulls(1);
-                    continue;
-                }
-                let (match_offset, _) = maybe_matched.unwrap();
-                mutable.extend(0, start + match_offset, start + match_offset + 1);
+        // Push the outermost path arguments first
+        path_args_stack.push(&args[1..]);
+
+        // Walk down the chain of nested get_field calls
+        let base_expr = loop {
+            if let Expr::ScalarFunction(ScalarFunction {
+                func,
+                args: inner_args,
+            }) = current_expr
+                && func
+                    .inner()
+                    .as_any()
+                    .downcast_ref::<GetFieldFunc>()
+                    .is_some()
+            {
+                // Store this level's path arguments (all except the first, which is base/nested call)
+                path_args_stack.push(&inner_args[1..]);
+
+                // Move to the next level down
+                current_expr = &inner_args[0];
+                continue;
             }
+            // Not a get_field call, this is the base expression
+            break current_expr;
+        };
 
-            let data = mutable.freeze();
-            let data = make_array(data);
-            Ok(ColumnarValue::Array(data))
+        // If no nested get_field calls were found, return original
+        if path_args_stack.len() == args.len() - 1 {
+            return Ok(ExprSimplifyResult::Original(args));
         }
 
-        match (array.data_type(), name) {
-            (DataType::Map(_, _), ScalarValue::List(arr)) => {
-                let key_array: Arc<dyn Array> = arr;
-                process_map_array(array, key_array)
-            }
-            (DataType::Map(_, _), ScalarValue::Struct(arr)) => {
-                process_map_array(array, arr as Arc<dyn Array>)
-            }
-            (DataType::Map(_, _), other) => {
-                let data_type = other.data_type();
-                if data_type.is_nested() {
-                    exec_err!("unsupported type {:?} for map access", data_type)
-                } else {
-                    process_map_array(array, other.to_array()?)
-                }
-            }
-            (DataType::Struct(_), ScalarValue::Utf8(Some(k))) => {
-                let as_struct_array = as_struct_array(&array)?;
-                match as_struct_array.column_by_name(&k) {
-                    None => exec_err!("get indexed field {k} not found in struct"),
-                    Some(col) => Ok(ColumnarValue::Array(Arc::clone(col))),
-                }
-            }
-            (DataType::Struct(_), name) => exec_err!(
-                "get_field is only possible on struct with utf8 indexes. \
-                             Received with {name:?} index"
-            ),
-            (DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)),
-            (dt, name) => exec_err!(
-                "get_field is only possible on maps with utf8 indexes or struct \
-                                         with utf8 indexes. Received {dt:?} with {name:?} index"
+        // If we found any nested get_field calls, flatten them
+        // Build merged args: [base, ...all_path_args_in_correct_order]
+        let mut merged_args = vec![base_expr.clone()];
+
+        // Add path args in reverse order (innermost to outermost)
+        // Stack is: [outermost_paths, ..., innermost_paths]
+        // We want: [base, innermost_paths, ..., outermost_paths]
+        for path_slice in path_args_stack.iter().rev() {
+            merged_args.extend_from_slice(path_slice);
+        }
+
+        Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
+            ScalarFunction::new_udf(
+                Arc::new(ScalarUDF::new_from_impl(GetFieldFunc::new())),
+                merged_args,
             ),
+        )))
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.len() < 2 {
+            return exec_err!(
+                "get_field requires at least 2 arguments, got {}",
+                arg_types.len()
+            );
         }
+        // Accept types as-is, validation happens in return_field_from_args
+        Ok(arg_types.to_vec())
     }
 
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
+
+    fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+        // get_field can be pushed to leaves if:
+        // 1. The base (first arg) is a column or already placeable at leaves
+        // 2. All field keys (remaining args) are literals
+        if args.is_empty() {
+            return ExpressionPlacement::KeepInPlace;
+        }
+
+        let base_placement = args[0];
+        let base_is_pushable = matches!(
+            base_placement,
+            ExpressionPlacement::Column | ExpressionPlacement::MoveTowardsLeafNodes
+        );
+
+        let all_keys_are_literals = args
+            .iter()
+            .skip(1)
+            .all(|p| *p == ExpressionPlacement::Literal);
+
+        if base_is_pushable && all_keys_are_literals {
+            ExpressionPlacement::MoveTowardsLeafNodes
+        } else {
+            ExpressionPlacement::KeepInPlace
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{ArrayRef, Int32Array, StructArray};
+    use arrow::datatypes::Fields;
+
+    #[test]
+    fn test_get_field_utf8view_key() -> Result<()> {
+        // Create a struct array with fields "a" and "b"
+        let a_values = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+        let b_values = Int32Array::from(vec![Some(10), Some(20), Some(30)]);
+
+        let fields: Fields = vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]
+        .into();
+
+        let struct_array = StructArray::new(
+            fields,
+            vec![
+                Arc::new(a_values) as ArrayRef,
+                Arc::new(b_values) as ArrayRef,
+            ],
+            None,
+        );
+
+        let base = ColumnarValue::Array(Arc::new(struct_array));
+
+        // Use Utf8View key to access field "a"
+        let key = ScalarValue::Utf8View(Some("a".to_string()));
+
+        let result = extract_single_field(base, key)?;
+
+        let result_array = result.into_array(3)?;
+        let expected = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+
+        assert_eq!(result_array.as_ref(), &expected as &dyn Array);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_placement_literal_key() {
+        let func = GetFieldFunc::new();
+
+        // get_field(col, 'literal') -> leaf-pushable (static field access)
+        let args = vec![ExpressionPlacement::Column, ExpressionPlacement::Literal];
+        assert_eq!(
+            func.placement(&args),
+            ExpressionPlacement::MoveTowardsLeafNodes
+        );
+
+        // get_field(col, 'a', 'b') -> leaf-pushable (nested static field access)
+        let args = vec![
+            ExpressionPlacement::Column,
+            ExpressionPlacement::Literal,
+            ExpressionPlacement::Literal,
+        ];
+        assert_eq!(
+            func.placement(&args),
+            ExpressionPlacement::MoveTowardsLeafNodes
+        );
+
+        // get_field(get_field(col, 'a'), 'b') represented as MoveTowardsLeafNodes for base
+        let args = vec![
+            ExpressionPlacement::MoveTowardsLeafNodes,
+            ExpressionPlacement::Literal,
+        ];
+        assert_eq!(
+            func.placement(&args),
+            ExpressionPlacement::MoveTowardsLeafNodes
+        );
+    }
+
+    #[test]
+    fn test_placement_column_key() {
+        let func = GetFieldFunc::new();
+
+        // get_field(col, other_col) -> NOT leaf-pushable (dynamic per-row lookup)
+        let args = vec![ExpressionPlacement::Column, ExpressionPlacement::Column];
+        assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
+
+        // get_field(col, 'a', other_col) -> NOT leaf-pushable (dynamic nested lookup)
+        let args = vec![
+            ExpressionPlacement::Column,
+            ExpressionPlacement::Literal,
+            ExpressionPlacement::Column,
+        ];
+        assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
+    }
+
+    #[test]
+    fn test_placement_root() {
+        let func = GetFieldFunc::new();
+
+        // get_field(root_expr, 'literal') -> NOT leaf-pushable
+        let args = vec![
+            ExpressionPlacement::KeepInPlace,
+            ExpressionPlacement::Literal,
+        ];
+        assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
+
+        // get_field(col, root_expr) -> NOT leaf-pushable
+        let args = vec![
+            ExpressionPlacement::Column,
+            ExpressionPlacement::KeepInPlace,
+        ];
+        assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
+    }
+
+    #[test]
+    fn test_placement_edge_cases() {
+        let func = GetFieldFunc::new();
+
+        // Empty args -> NOT leaf-pushable
+        assert_eq!(func.placement(&[]), ExpressionPlacement::KeepInPlace);
+
+        // Just base, no key -> MoveTowardsLeafNodes (not a valid call but should handle gracefully)
+        let args = vec![ExpressionPlacement::Column];
+        assert_eq!(
+            func.placement(&args),
+            ExpressionPlacement::MoveTowardsLeafNodes
+        );
+
+        // Literal base with literal key -> NOT leaf-pushable (would be constant-folded)
+        let args = vec![ExpressionPlacement::Literal, ExpressionPlacement::Literal];
+        assert_eq!(func.placement(&args), ExpressionPlacement::KeepInPlace);
+    }
 }
diff --git a/datafusion/functions/src/core/greatest.rs b/datafusion/functions/src/core/greatest.rs
index 2d7ad2be3986f..fb7592c6290ad 100644
--- a/datafusion/functions/src/core/greatest.rs
+++ b/datafusion/functions/src/core/greatest.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use crate::core::greatest_least_utils::GreatestLeastOperator;
-use arrow::array::{make_comparator, Array, BooleanArray};
+use arrow::array::{Array, BooleanArray, make_comparator};
 use arrow::buffer::BooleanBuffer;
-use arrow::compute::kernels::cmp;
 use arrow::compute::SortOptions;
+use arrow::compute::kernels::cmp;
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, assert_eq_or_internal_err};
 use datafusion_doc::Documentation;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
@@ -53,7 +53,7 @@ const SORT_OPTIONS: SortOptions = SortOptions {
         description = "Expressions to compare and return the greatest value.. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct GreatestFunc {
     signature: Signature,
 }
@@ -90,11 +90,7 @@ impl GreatestLeastOperator for GreatestFunc {
             SORT_OPTIONS,
         )?;
 
-        if cmp(0, 0).is_ge() {
-            Ok(lhs)
-        } else {
-            Ok(rhs)
-        }
+        if cmp(0, 0).is_ge() { Ok(lhs) } else { Ok(rhs) }
     }
 
     /// Return boolean array where `arr[i] = lhs[i] >= rhs[i]` for all i, where `arr` is the result array
@@ -113,11 +109,11 @@ impl GreatestLeastOperator for GreatestFunc {
 
         let cmp = make_comparator(lhs, rhs, SORT_OPTIONS)?;
 
-        if lhs.len() != rhs.len() {
-            return internal_err!(
-                "All arrays should have the same length for greatest comparison"
-            );
-        }
+        assert_eq_or_internal_err!(
+            lhs.len(),
+            rhs.len(),
+            "All arrays should have the same length for greatest comparison"
+        );
 
         let values = BooleanBuffer::collect_bool(lhs.len(), |i| cmp(i, i).is_ge());
 
diff --git a/datafusion/functions/src/core/greatest_least_utils.rs b/datafusion/functions/src/core/greatest_least_utils.rs
index 46b3645e703a2..5f8b4a51186fe 100644
--- a/datafusion/functions/src/core/greatest_least_utils.rs
+++ b/datafusion/functions/src/core/greatest_least_utils.rs
@@ -18,7 +18,7 @@
 use arrow::array::{Array, ArrayRef, BooleanArray};
 use arrow::compute::kernels::zip::zip;
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, plan_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, assert_or_internal_err, plan_err};
 use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_expr_common::type_coercion::binary::type_union_resolution;
 use std::sync::Arc;
@@ -36,11 +36,11 @@ pub(super) trait GreatestLeastOperator {
 }
 
 fn keep_array<Op: GreatestLeastOperator>(
-    lhs: ArrayRef,
-    rhs: ArrayRef,
+    lhs: &dyn Array,
+    rhs: &dyn Array,
 ) -> Result<ArrayRef> {
     // True for values that we should keep from the left array
-    let keep_lhs = Op::get_indexes_to_keep(lhs.as_ref(), rhs.as_ref())?;
+    let keep_lhs = Op::get_indexes_to_keep(lhs, rhs)?;
 
     let result = zip(&keep_lhs, &lhs, &rhs)?;
 
@@ -50,12 +50,11 @@ fn keep_array<Op: GreatestLeastOperator>(
 pub(super) fn execute_conditional<Op: GreatestLeastOperator>(
     args: &[ColumnarValue],
 ) -> Result<ColumnarValue> {
-    if args.is_empty() {
-        return internal_err!(
-            "{} was called with no arguments. It requires at least 1.",
-            Op::NAME
-        );
-    }
+    assert_or_internal_err!(
+        !args.is_empty(),
+        "{} was called with no arguments. It requires at least 1.",
+        Op::NAME
+    );
 
     // Some engines (e.g. SQL Server) allow greatest/least with single arg, it's a noop
     if args.len() == 1 {
@@ -101,8 +100,8 @@ pub(super) fn execute_conditional<Op: GreatestLeastOperator>(
 
         // Start with the result value
         result = keep_array::<Op>(
-            Arc::clone(first_array),
-            result_scalar.to_array_of_size(first_array.len())?,
+            first_array,
+            &result_scalar.to_array_of_size(first_array.len())?,
         )?;
     } else {
         // If we only have arrays, start with the first array
@@ -111,7 +110,7 @@ pub(super) fn execute_conditional<Op: GreatestLeastOperator>(
     }
 
     for array in arrays_iter {
-        result = keep_array::<Op>(Arc::clone(array), result)?;
+        result = keep_array::<Op>(array, &result)?;
     }
 
     Ok(ColumnarValue::Array(result))
diff --git a/datafusion/functions/src/core/least.rs b/datafusion/functions/src/core/least.rs
index 662dac3e699fb..fc67924888a73 100644
--- a/datafusion/functions/src/core/least.rs
+++ b/datafusion/functions/src/core/least.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use crate::core::greatest_least_utils::GreatestLeastOperator;
-use arrow::array::{make_comparator, Array, BooleanArray};
+use arrow::array::{Array, BooleanArray, make_comparator};
 use arrow::buffer::BooleanBuffer;
-use arrow::compute::kernels::cmp;
 use arrow::compute::SortOptions;
+use arrow::compute::kernels::cmp;
 use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, assert_eq_or_internal_err};
 use datafusion_doc::Documentation;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
@@ -53,7 +53,7 @@ const SORT_OPTIONS: SortOptions = SortOptions {
         description = "Expressions to compare and return the smallest value. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LeastFunc {
     signature: Signature,
 }
@@ -103,11 +103,7 @@ impl GreatestLeastOperator for LeastFunc {
             SORT_OPTIONS,
         )?;
 
-        if cmp(0, 0).is_le() {
-            Ok(lhs)
-        } else {
-            Ok(rhs)
-        }
+        if cmp(0, 0).is_le() { Ok(lhs) } else { Ok(rhs) }
     }
 
     /// Return boolean array where `arr[i] = lhs[i] <= rhs[i]` for all i, where `arr` is the result array
@@ -126,11 +122,11 @@ impl GreatestLeastOperator for LeastFunc {
 
         let cmp = make_comparator(lhs, rhs, SORT_OPTIONS)?;
 
-        if lhs.len() != rhs.len() {
-            return internal_err!(
-                "All arrays should have the same length for least comparison"
-            );
-        }
+        assert_eq_or_internal_err!(
+            lhs.len(),
+            rhs.len(),
+            "All arrays should have the same length for least comparison"
+        );
 
         let values = BooleanBuffer::collect_bool(lhs.len(), |i| cmp(i, i).is_le());
 
diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs
index db080cd628478..a14d563737240 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
 use std::sync::Arc;
 
 pub mod arrow_cast;
+pub mod arrow_metadata;
 pub mod arrowtypeof;
 pub mod coalesce;
 pub mod expr_ext;
@@ -55,6 +56,7 @@ make_udf_function!(least::LeastFunc, least);
 make_udf_function!(union_extract::UnionExtractFun, union_extract);
 make_udf_function!(union_tag::UnionTagFunc, union_tag);
 make_udf_function!(version::VersionFunc, version);
+make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
 
 pub mod expr_fn {
     use datafusion_expr::{Expr, Literal};
@@ -83,6 +85,10 @@ pub mod expr_fn {
         arrow_typeof,
         "Returns the Arrow type of the input expression.",
         arg1
+    ),(
+        arrow_metadata,
+        "Returns the metadata of the input expression",
+        args,
     ),(
         r#struct,
         "Returns a struct with the given arguments",
@@ -110,11 +116,20 @@ pub mod expr_fn {
     ));
 
     #[doc = "Returns the value of the field with the given name from the struct"]
+    #[expect(clippy::needless_pass_by_value)]
     pub fn get_field(arg1: Expr, arg2: impl Literal) -> Expr {
         super::get_field().call(vec![arg1, arg2.lit()])
     }
 
+    #[doc = "Returns the value of nested fields by traversing multiple field names"]
+    pub fn get_field_path(base: Expr, field_names: Vec<Expr>) -> Expr {
+        let mut args = vec![base];
+        args.extend(field_names);
+        super::get_field().call(args)
+    }
+
     #[doc = "Returns the value of the field with the given name from the union when it's selected, or NULL otherwise"]
+    #[expect(clippy::needless_pass_by_value)]
     pub fn union_extract(arg1: Expr, arg2: impl Literal) -> Expr {
         super::union_extract().call(vec![arg1, arg2.lit()])
     }
@@ -125,6 +140,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
     vec![
         nullif(),
         arrow_cast(),
+        arrow_metadata(),
         nvl(),
         nvl2(),
         overlay(),
diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs
index 115f4a8aba225..933151fcbd31e 100644
--- a/datafusion/functions/src/core/named_struct.rs
+++ b/datafusion/functions/src/core/named_struct.rs
@@ -17,7 +17,7 @@
 
 use arrow::array::StructArray;
 use arrow::datatypes::{DataType, Field, FieldRef, Fields};
-use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_common::{Result, exec_err, internal_err};
 use datafusion_expr::{
     ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs,
 };
@@ -58,7 +58,7 @@ a struct type of fields `field_a` and `field_b`:
         description = "Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NamedStructFunc {
     signature: Signature,
 }
@@ -104,7 +104,7 @@ impl ScalarUDFImpl for NamedStructFunc {
             );
         }
 
-        if args.scalar_arguments.len() % 2 != 0 {
+        if !args.scalar_arguments.len().is_multiple_of(2) {
             return exec_err!(
                 "named_struct requires an even number of arguments, got {} instead",
                 args.scalar_arguments.len()
diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs
index ee29714da16b6..a8a512d35a36b 100644
--- a/datafusion/functions/src/core/nullif.rs
+++ b/datafusion/functions/src/core/nullif.rs
@@ -20,7 +20,7 @@ use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs};
 
 use arrow::compute::kernels::cmp::eq;
 use arrow::compute::kernels::nullif::nullif;
-use datafusion_common::{utils::take_function_args, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, utils::take_function_args};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -53,7 +53,7 @@ This can be used to perform the inverse operation of [`coalesce`](#coalesce).",
         description = "Expression to compare to expression1. Can be a constant, column, or function, and any combination of operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NullIfFunc {
     signature: Signature,
 }
@@ -113,7 +113,6 @@ impl ScalarUDFImpl for NullIfFunc {
 /// Implements NULLIF(expr1, expr2)
 /// Args: 0 - left expr is any array
 ///       1 - if the left is equal to this expr2, then the result is NULL, otherwise left value is passed.
-///
 fn nullif_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     let [lhs, rhs] = take_function_args("nullif", args)?;
 
diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs
index 82d367072a256..0b4966d4fbdce 100644
--- a/datafusion/functions/src/core/nvl.rs
+++ b/datafusion/functions/src/core/nvl.rs
@@ -15,21 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::Array;
-use arrow::compute::is_not_null;
-use arrow::compute::kernels::zip::zip;
-use arrow::datatypes::DataType;
-use datafusion_common::{utils::take_function_args, Result};
+use crate::core::coalesce::CoalesceFunc;
+use arrow::datatypes::{DataType, FieldRef};
+use datafusion_common::Result;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    Volatility,
+    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, Volatility,
 };
 use datafusion_macros::user_doc;
-use std::sync::Arc;
 
 #[user_doc(
     doc_section(label = "Conditional Functions"),
-    description = "Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_.",
+    description = "Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_ and _expression2_ is not evaluated. This function can be used to substitute a default value for NULL values.",
     syntax_example = "nvl(expression1, expression2)",
     sql_example = r#"```sql
 > select nvl(null, 'a');
@@ -55,9 +53,9 @@ use std::sync::Arc;
         description = "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NVLFunc {
-    signature: Signature,
+    coalesce: CoalesceFunc,
     aliases: Vec<String>,
 }
 
@@ -90,11 +88,13 @@ impl Default for NVLFunc {
 impl NVLFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::uniform(
-                2,
-                SUPPORTED_NVL_TYPES.to_vec(),
-                Volatility::Immutable,
-            ),
+            coalesce: CoalesceFunc {
+                signature: Signature::uniform(
+                    2,
+                    SUPPORTED_NVL_TYPES.to_vec(),
+                    Volatility::Immutable,
+                ),
+            },
             aliases: vec![String::from("ifnull")],
         }
     }
@@ -110,209 +110,45 @@ impl ScalarUDFImpl for NVLFunc {
     }
 
     fn signature(&self) -> &Signature {
-        &self.signature
+        &self.coalesce.signature
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].clone())
+        self.coalesce.return_type(arg_types)
     }
 
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        nvl_func(&args.args)
-    }
-
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        self.coalesce.return_field_from_args(args)
     }
-}
-
-fn nvl_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let [lhs, rhs] = take_function_args("nvl/ifnull", args)?;
-    let (lhs_array, rhs_array) = match (lhs, rhs) {
-        (ColumnarValue::Array(lhs), ColumnarValue::Scalar(rhs)) => {
-            (Arc::clone(lhs), rhs.to_array_of_size(lhs.len())?)
-        }
-        (ColumnarValue::Array(lhs), ColumnarValue::Array(rhs)) => {
-            (Arc::clone(lhs), Arc::clone(rhs))
-        }
-        (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => {
-            (lhs.to_array_of_size(rhs.len())?, Arc::clone(rhs))
-        }
-        (ColumnarValue::Scalar(lhs), ColumnarValue::Scalar(rhs)) => {
-            let mut current_value = lhs;
-            if lhs.is_null() {
-                current_value = rhs;
-            }
-            return Ok(ColumnarValue::Scalar(current_value.clone()));
-        }
-    };
-    let to_apply = is_not_null(&lhs_array)?;
-    let value = zip(&to_apply, &lhs_array, &rhs_array)?;
-    Ok(ColumnarValue::Array(value))
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::*;
 
-    use super::*;
-    use datafusion_common::ScalarValue;
-
-    #[test]
-    fn nvl_int32() -> Result<()> {
-        let a = Int32Array::from(vec![
-            Some(1),
-            Some(2),
-            None,
-            None,
-            Some(3),
-            None,
-            None,
-            Some(4),
-            Some(5),
-        ]);
-        let a = ColumnarValue::Array(Arc::new(a));
-
-        let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(6i32)));
-
-        let result = nvl_func(&[a, lit_array])?;
-        let result = result.into_array(0).expect("Failed to convert to array");
-
-        let expected = Arc::new(Int32Array::from(vec![
-            Some(1),
-            Some(2),
-            Some(6),
-            Some(6),
-            Some(3),
-            Some(6),
-            Some(6),
-            Some(4),
-            Some(5),
-        ])) as ArrayRef;
-        assert_eq!(expected.as_ref(), result.as_ref());
-        Ok(())
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        self.coalesce.simplify(args, info)
     }
 
-    #[test]
-    // Ensure that arrays with no nulls can also invoke nvl() correctly
-    fn nvl_int32_non_nulls() -> Result<()> {
-        let a = Int32Array::from(vec![1, 3, 10, 7, 8, 1, 2, 4, 5]);
-        let a = ColumnarValue::Array(Arc::new(a));
-
-        let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(20i32)));
-
-        let result = nvl_func(&[a, lit_array])?;
-        let result = result.into_array(0).expect("Failed to convert to array");
-
-        let expected = Arc::new(Int32Array::from(vec![
-            Some(1),
-            Some(3),
-            Some(10),
-            Some(7),
-            Some(8),
-            Some(1),
-            Some(2),
-            Some(4),
-            Some(5),
-        ])) as ArrayRef;
-        assert_eq!(expected.as_ref(), result.as_ref());
-        Ok(())
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        self.coalesce.invoke_with_args(args)
     }
 
-    #[test]
-    fn nvl_boolean() -> Result<()> {
-        let a = BooleanArray::from(vec![Some(true), Some(false), None]);
-        let a = ColumnarValue::Array(Arc::new(a));
-
-        let lit_array = ColumnarValue::Scalar(ScalarValue::Boolean(Some(false)));
-
-        let result = nvl_func(&[a, lit_array])?;
-        let result = result.into_array(0).expect("Failed to convert to array");
-
-        let expected = Arc::new(BooleanArray::from(vec![
-            Some(true),
-            Some(false),
-            Some(false),
-        ])) as ArrayRef;
-
-        assert_eq!(expected.as_ref(), result.as_ref());
-        Ok(())
+    fn conditional_arguments<'a>(
+        &self,
+        args: &'a [Expr],
+    ) -> Option<(Vec<&'a Expr>, Vec<&'a Expr>)> {
+        self.coalesce.conditional_arguments(args)
     }
 
-    #[test]
-    fn nvl_string() -> Result<()> {
-        let a = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
-        let a = ColumnarValue::Array(Arc::new(a));
-
-        let lit_array = ColumnarValue::Scalar(ScalarValue::from("bax"));
-
-        let result = nvl_func(&[a, lit_array])?;
-        let result = result.into_array(0).expect("Failed to convert to array");
-
-        let expected = Arc::new(StringArray::from(vec![
-            Some("foo"),
-            Some("bar"),
-            Some("bax"),
-            Some("baz"),
-        ])) as ArrayRef;
-
-        assert_eq!(expected.as_ref(), result.as_ref());
-        Ok(())
+    fn short_circuits(&self) -> bool {
+        self.coalesce.short_circuits()
     }
 
-    #[test]
-    fn nvl_literal_first() -> Result<()> {
-        let a = Int32Array::from(vec![Some(1), Some(2), None, None, Some(3), Some(4)]);
-        let a = ColumnarValue::Array(Arc::new(a));
-
-        let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32)));
-
-        let result = nvl_func(&[lit_array, a])?;
-        let result = result.into_array(0).expect("Failed to convert to array");
-
-        let expected = Arc::new(Int32Array::from(vec![
-            Some(2),
-            Some(2),
-            Some(2),
-            Some(2),
-            Some(2),
-            Some(2),
-        ])) as ArrayRef;
-        assert_eq!(expected.as_ref(), result.as_ref());
-        Ok(())
+    fn aliases(&self) -> &[String] {
+        &self.aliases
     }
 
-    #[test]
-    fn nvl_scalar() -> Result<()> {
-        let a_null = ColumnarValue::Scalar(ScalarValue::Int32(None));
-        let b_null = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32)));
-
-        let result_null = nvl_func(&[a_null, b_null])?;
-        let result_null = result_null
-            .into_array(1)
-            .expect("Failed to convert to array");
-
-        let expected_null = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef;
-
-        assert_eq!(expected_null.as_ref(), result_null.as_ref());
-
-        let a_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32)));
-        let b_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(1i32)));
-
-        let result_nnull = nvl_func(&[a_nnull, b_nnull])?;
-        let result_nnull = result_nnull
-            .into_array(1)
-            .expect("Failed to convert to array");
-
-        let expected_nnull = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef;
-        assert_eq!(expected_nnull.as_ref(), result_nnull.as_ref());
-
-        Ok(())
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
     }
 }
diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs
index d20b01e29fba8..0b092c44d502b 100644
--- a/datafusion/functions/src/core/nvl2.rs
+++ b/datafusion/functions/src/core/nvl2.rs
@@ -15,17 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::Array;
-use arrow::compute::is_not_null;
-use arrow::compute::kernels::zip::zip;
-use arrow::datatypes::DataType;
-use datafusion_common::{internal_err, utils::take_function_args, Result};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::{Result, internal_err, utils::take_function_args};
 use datafusion_expr::{
-    type_coercion::binary::comparison_coercion, ColumnarValue, Documentation,
-    ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, Volatility,
+    conditional_expressions::CaseBuilder,
+    simplify::{ExprSimplifyResult, SimplifyContext},
+    type_coercion::binary::comparison_coercion,
 };
 use datafusion_macros::user_doc;
-use std::sync::Arc;
 
 #[user_doc(
     doc_section(label = "Conditional Functions"),
@@ -59,7 +58,7 @@ use std::sync::Arc;
         description = "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NVL2Func {
     signature: Signature,
 }
@@ -95,8 +94,37 @@ impl ScalarUDFImpl for NVL2Func {
         Ok(arg_types[1].clone())
     }
 
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        nvl2_func(&args.args)
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable =
+            args.arg_fields[1].is_nullable() || args.arg_fields[2].is_nullable();
+        let return_type = args.arg_fields[1].data_type().clone();
+        Ok(Field::new(self.name(), return_type, nullable).into())
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("nvl2 should have been simplified to case")
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [test, if_non_null, if_null] = take_function_args(self.name(), args)?;
+
+        let expr = CaseBuilder::new(
+            None,
+            vec![test.is_not_null()],
+            vec![if_non_null],
+            Some(Box::new(if_null)),
+        )
+        .end()?;
+
+        Ok(ExprSimplifyResult::Simplified(expr))
+    }
+
+    fn short_circuits(&self) -> bool {
+        true
     }
 
     fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
@@ -113,7 +141,7 @@ impl ScalarUDFImpl for NVL2Func {
                     if let Some(coerced_type) = coerced_type {
                         Ok(coerced_type)
                     } else {
-                        internal_err!("Coercion from {acc:?} to {x:?} failed.")
+                        internal_err!("Coercion from {acc} to {x} failed.")
                     }
                 })?;
         Ok(vec![new_type; arg_types.len()])
@@ -123,42 +151,3 @@ impl ScalarUDFImpl for NVL2Func {
         self.doc()
     }
 }
-
-fn nvl2_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let mut len = 1;
-    let mut is_array = false;
-    for arg in args {
-        if let ColumnarValue::Array(array) = arg {
-            len = array.len();
-            is_array = true;
-            break;
-        }
-    }
-    if is_array {
-        let args = args
-            .iter()
-            .map(|arg| match arg {
-                ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(len),
-                ColumnarValue::Array(array) => Ok(Arc::clone(array)),
-            })
-            .collect::<Result<Vec<_>>>()?;
-        let [tested, if_non_null, if_null] = take_function_args("nvl2", args)?;
-        let to_apply = is_not_null(&tested)?;
-        let value = zip(&to_apply, &if_non_null, &if_null)?;
-        Ok(ColumnarValue::Array(value))
-    } else {
-        let [tested, if_non_null, if_null] = take_function_args("nvl2", args)?;
-        match &tested {
-            ColumnarValue::Array(_) => {
-                internal_err!("except Scalar value, but got Array")
-            }
-            ColumnarValue::Scalar(scalar) => {
-                if scalar.is_null() {
-                    Ok(if_null.clone())
-                } else {
-                    Ok(if_non_null.clone())
-                }
-            }
-        }
-    }
-}
diff --git a/datafusion/functions/src/core/overlay.rs b/datafusion/functions/src/core/overlay.rs
index 0ea5359e9621d..179ad9a460a08 100644
--- a/datafusion/functions/src/core/overlay.rs
+++ b/datafusion/functions/src/core/overlay.rs
@@ -25,7 +25,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::{
     as_generic_string_array, as_int64_array, as_string_view_array,
 };
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
@@ -53,7 +53,7 @@ use datafusion_macros::user_doc;
         description = "The count of characters to be replaced from start position of str. If not specified, will use substr length instead."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct OverlayFunc {
     signature: Signature,
 }
@@ -201,7 +201,7 @@ fn overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-pub fn string_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn string_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args.len() {
         3 => {
             let string_array = as_generic_string_array::<T>(&args[0])?;
@@ -227,7 +227,7 @@ pub fn string_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef>
     }
 }
 
-pub fn string_view_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn string_view_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args.len() {
         3 => {
             let string_array = as_string_view_array(&args[0])?;
diff --git a/datafusion/functions/src/core/planner.rs b/datafusion/functions/src/core/planner.rs
index 227e401156173..4d6b744b3e68b 100644
--- a/datafusion/functions/src/core/planner.rs
+++ b/datafusion/functions/src/core/planner.rs
@@ -20,7 +20,7 @@ use datafusion_common::Result;
 use datafusion_common::{Column, DFSchema, ScalarValue, TableReference};
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawDictionaryExpr};
-use datafusion_expr::{lit, Expr};
+use datafusion_expr::{Expr, lit};
 
 use super::named_struct;
 
diff --git a/datafusion/functions/src/core/struct.rs b/datafusion/functions/src/core/struct.rs
index f068fc18a8b04..352f258643921 100644
--- a/datafusion/functions/src/core/struct.rs
+++ b/datafusion/functions/src/core/struct.rs
@@ -17,7 +17,7 @@
 
 use arrow::array::StructArray;
 use arrow::datatypes::{DataType, Field};
-use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_common::{Result, exec_err, internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
@@ -64,7 +64,7 @@ select struct(a as field_a, b) from t;
         description = "Expression to include in the output struct. Can be a constant, column, or function, any combination of arithmetic or string operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct StructFunc {
     signature: Signature,
     aliases: Vec<String>,
diff --git a/datafusion/functions/src/core/union_extract.rs b/datafusion/functions/src/core/union_extract.rs
index be49f82267121..8d915fb2e2c07 100644
--- a/datafusion/functions/src/core/union_extract.rs
+++ b/datafusion/functions/src/core/union_extract.rs
@@ -20,7 +20,7 @@ use arrow::datatypes::{DataType, Field, FieldRef, UnionFields};
 use datafusion_common::cast::as_union_array;
 use datafusion_common::utils::take_function_args;
 use datafusion_common::{
-    exec_datafusion_err, exec_err, internal_err, Result, ScalarValue,
+    Result, ScalarValue, exec_datafusion_err, exec_err, internal_err,
 };
 use datafusion_doc::Documentation;
 use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs};
@@ -49,7 +49,7 @@ use datafusion_macros::user_doc;
         description = "String expression to operate on. Must be a constant."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct UnionExtractFun {
     signature: Signature,
 }
@@ -117,9 +117,16 @@ impl ScalarUDFImpl for UnionExtractFun {
         let [array, target_name] = take_function_args("union_extract", args.args)?;
 
         let target_name = match target_name {
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some(target_name))) => Ok(target_name),
-            ColumnarValue::Scalar(ScalarValue::Utf8(None)) => exec_err!("union_extract second argument must be a non-null string literal, got a null instead"),
-            _ => exec_err!("union_extract second argument must be a non-null string literal, got {} instead", target_name.data_type()),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(target_name))) => {
+                Ok(target_name)
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(None)) => exec_err!(
+                "union_extract second argument must be a non-null string literal, got a null instead"
+            ),
+            _ => exec_err!(
+                "union_extract second argument must be a non-null string literal, got {} instead",
+                target_name.data_type()
+            ),
         }?;
 
         match array {
@@ -169,10 +176,11 @@ fn find_field<'a>(fields: &'a UnionFields, name: &str) -> Result<(i8, &'a FieldR
 
 #[cfg(test)]
 mod tests {
-
     use arrow::datatypes::{DataType, Field, UnionFields, UnionMode};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
 
     use super::UnionExtractFun;
 
@@ -181,13 +189,14 @@ mod tests {
     fn test_scalar_value() -> Result<()> {
         let fun = UnionExtractFun::new();
 
-        let fields = UnionFields::new(
+        let fields = UnionFields::try_new(
             vec![1, 3],
             vec![
                 Field::new("str", DataType::Utf8, false),
                 Field::new("int", DataType::Int32, false),
             ],
-        );
+        )
+        .unwrap();
 
         let args = vec![
             ColumnarValue::Scalar(ScalarValue::Union(
@@ -207,6 +216,7 @@ mod tests {
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         })?;
 
         assert_scalar(result, ScalarValue::Utf8(None));
@@ -229,6 +239,7 @@ mod tests {
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         })?;
 
         assert_scalar(result, ScalarValue::Utf8(None));
@@ -250,6 +261,7 @@ mod tests {
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         })?;
 
         assert_scalar(result, ScalarValue::new_utf8("42"));
diff --git a/datafusion/functions/src/core/union_tag.rs b/datafusion/functions/src/core/union_tag.rs
index 3a4d96de2bc03..fac5c82691adc 100644
--- a/datafusion/functions/src/core/union_tag.rs
+++ b/datafusion/functions/src/core/union_tag.rs
@@ -18,7 +18,7 @@
 use arrow::array::{Array, AsArray, DictionaryArray, Int8Array, StringArray};
 use arrow::datatypes::DataType;
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{exec_datafusion_err, exec_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_datafusion_err, exec_err};
 use datafusion_doc::Documentation;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
@@ -43,7 +43,7 @@ use std::sync::Arc;
 ```"#,
     standard_argument(name = "union", prefix = "Union")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct UnionTagFunc {
     signature: Signature,
 }
@@ -136,14 +136,14 @@ impl ScalarUDFImpl for UnionTagFunc {
                     })
                     .ok_or_else(|| {
                         exec_datafusion_err!(
-                            "union_tag: union scalar with unknow type_id {value_type_id}"
+                            "union_tag: union scalar with unknown type_id {value_type_id}"
                         )
                     }),
                 None => Ok(ColumnarValue::Scalar(ScalarValue::try_new_null(
                     args.return_field.data_type(),
                 )?)),
             },
-            v => exec_err!("union_tag only support unions, got {:?}", v.data_type()),
+            v => exec_err!("union_tag only support unions, got {}", v.data_type()),
         }
     }
 
@@ -157,6 +157,7 @@ mod tests {
     use super::UnionTagFunc;
     use arrow::datatypes::{DataType, Field, UnionFields, UnionMode};
     use datafusion_common::ScalarValue;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
 
@@ -182,6 +183,7 @@ mod tests {
                 number_rows: 1,
                 return_field: Field::new("res", return_type, true).into(),
                 arg_fields: vec![],
+                config_options: Arc::new(ConfigOptions::default()),
             })
             .unwrap();
 
@@ -204,6 +206,7 @@ mod tests {
                 number_rows: 1,
                 return_field: Field::new("res", return_type, true).into(),
                 arg_fields: vec![],
+                config_options: Arc::new(ConfigOptions::default()),
             })
             .unwrap();
 
diff --git a/datafusion/functions/src/core/version.rs b/datafusion/functions/src/core/version.rs
index b3abe246b4b3f..9c658d5918a5a 100644
--- a/datafusion/functions/src/core/version.rs
+++ b/datafusion/functions/src/core/version.rs
@@ -18,7 +18,7 @@
 //! [`VersionFunc`]: Implementation of the `version` function.
 
 use arrow::datatypes::DataType;
-use datafusion_common::{utils::take_function_args, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, utils::take_function_args};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
@@ -39,7 +39,7 @@ use std::any::Any;
 +--------------------------------------------+
 ```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct VersionFunc {
     signature: Signature,
 }
@@ -53,7 +53,7 @@ impl Default for VersionFunc {
 impl VersionFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::exact(vec![], Volatility::Immutable),
+            signature: Signature::nullary(Volatility::Immutable),
         }
     }
 }
@@ -98,7 +98,10 @@ impl ScalarUDFImpl for VersionFunc {
 mod test {
     use super::*;
     use arrow::datatypes::Field;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::ScalarFunctionArgs;
     use datafusion_expr::ScalarUDF;
+    use std::sync::Arc;
 
     #[tokio::test]
     async fn test_version_udf() {
@@ -109,6 +112,7 @@ mod test {
                 arg_fields: vec![],
                 number_rows: 0,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             })
             .unwrap();
 
diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs
index eaa688c1c3359..abb86b8246fc9 100644
--- a/datafusion/functions/src/crypto/basic.rs
+++ b/datafusion/functions/src/crypto/basic.rs
@@ -17,85 +17,22 @@
 
 //! "crypto" DataFusion functions
 
-use arrow::array::{
-    Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray,
-    OffsetSizeTrait,
-};
-use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray};
+use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, BinaryArrayType};
 use arrow::datatypes::DataType;
 use blake2::{Blake2b512, Blake2s256, Digest};
 use blake3::Hasher as Blake3;
-use datafusion_common::cast::as_binary_array;
 
 use arrow::compute::StringArrayType;
-use datafusion_common::{
-    exec_err, internal_err, plan_err, utils::take_function_args, DataFusionError, Result,
-    ScalarValue,
-};
+use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err, plan_err};
 use datafusion_expr::ColumnarValue;
 use md5::Md5;
 use sha2::{Sha224, Sha256, Sha384, Sha512};
-use std::fmt::{self, Write};
+use std::fmt;
 use std::str::FromStr;
 use std::sync::Arc;
 
-macro_rules! define_digest_function {
-    ($NAME: ident, $METHOD: ident, $DOC: expr) => {
-        #[doc = $DOC]
-        pub fn $NAME(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-            let [data] = take_function_args(&DigestAlgorithm::$METHOD.to_string(), args)?;
-            digest_process(data, DigestAlgorithm::$METHOD)
-        }
-    };
-}
-define_digest_function!(
-    sha224,
-    Sha224,
-    "computes sha224 hash digest of the given input"
-);
-define_digest_function!(
-    sha256,
-    Sha256,
-    "computes sha256 hash digest of the given input"
-);
-define_digest_function!(
-    sha384,
-    Sha384,
-    "computes sha384 hash digest of the given input"
-);
-define_digest_function!(
-    sha512,
-    Sha512,
-    "computes sha512 hash digest of the given input"
-);
-define_digest_function!(
-    blake2b,
-    Blake2b,
-    "computes blake2b hash digest of the given input"
-);
-define_digest_function!(
-    blake2s,
-    Blake2s,
-    "computes blake2s hash digest of the given input"
-);
-define_digest_function!(
-    blake3,
-    Blake3,
-    "computes blake3 hash digest of the given input"
-);
-
-macro_rules! digest_to_scalar {
-    ($METHOD: ident, $INPUT:expr) => {{
-        ScalarValue::Binary($INPUT.as_ref().map(|v| {
-            let mut digest = $METHOD::default();
-            digest.update(v);
-            digest.finalize().as_slice().to_vec()
-        }))
-    }};
-}
-
-#[derive(Debug, Copy, Clone)]
-pub enum DigestAlgorithm {
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub(crate) enum DigestAlgorithm {
     Md5,
     Sha224,
     Sha256,
@@ -106,23 +43,6 @@ pub enum DigestAlgorithm {
     Blake3,
 }
 
-/// Digest computes a binary hash of the given data, accepts Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
-/// Second argument is the algorithm to use.
-/// Standard algorithms are md5, sha1, sha224, sha256, sha384 and sha512.
-pub fn digest(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let [data, digest_algorithm] = take_function_args("digest", args)?;
-    let digest_algorithm = match digest_algorithm {
-        ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
-            Some(Some(method)) => method.parse::<DigestAlgorithm>(),
-            _ => exec_err!("Unsupported data type {scalar:?} for function digest"),
-        },
-        ColumnarValue::Array(_) => {
-            internal_err!("Digest using dynamically decided method is not yet supported")
-        }
-    }?;
-    digest_process(data, digest_algorithm)
-}
-
 impl FromStr for DigestAlgorithm {
     type Err = DataFusionError;
     fn from_str(name: &str) -> Result<DigestAlgorithm> {
@@ -164,76 +84,23 @@ impl fmt::Display for DigestAlgorithm {
     }
 }
 
-/// computes md5 hash digest of the given input
-pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let [data] = take_function_args("md5", args)?;
-    let value = digest_process(data, DigestAlgorithm::Md5)?;
-
-    // md5 requires special handling because of its unique utf8 return type
-    Ok(match value {
-        ColumnarValue::Array(array) => {
-            let binary_array = as_binary_array(&array)?;
-            let string_array: StringArray = binary_array
-                .iter()
-                .map(|opt| opt.map(hex_encode::<_>))
-                .collect();
-            ColumnarValue::Array(Arc::new(string_array))
-        }
-        ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
-            ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
-        }
-        _ => return exec_err!("Impossibly got invalid results from digest"),
-    })
-}
-
-/// this function exists so that we do not need to pull in the crate hex. it is only used by md5
-/// function below
-#[inline]
-fn hex_encode<T: AsRef<[u8]>>(data: T) -> String {
-    let mut s = String::with_capacity(data.as_ref().len() * 2);
-    for b in data.as_ref() {
-        // Writing to a string never errors, so we can unwrap here.
-        write!(&mut s, "{b:02x}").unwrap();
-    }
-    s
-}
-pub fn utf8_or_binary_to_binary_type(
-    arg_type: &DataType,
-    name: &str,
-) -> Result<DataType> {
-    Ok(match arg_type {
-        DataType::Utf8View
-        | DataType::LargeUtf8
-        | DataType::Utf8
-        | DataType::Binary
-        | DataType::BinaryView
-        | DataType::LargeBinary => DataType::Binary,
-        DataType::Null => DataType::Null,
-        _ => {
-            return plan_err!(
-                "The {name:?} function can only accept strings or binary arrays."
-            );
-        }
-    })
-}
 macro_rules! digest_to_array {
     ($METHOD:ident, $INPUT:expr) => {{
         let binary_array: BinaryArray = $INPUT
             .iter()
-            .map(|x| {
-                x.map(|x| {
-                    let mut digest = $METHOD::default();
-                    digest.update(x);
-                    digest.finalize()
-                })
-            })
+            .map(|x| x.map(|x| $METHOD::digest(x)))
             .collect();
         Arc::new(binary_array)
     }};
 }
+
+macro_rules! digest_to_scalar {
+    ($METHOD: ident, $INPUT:expr) => {{ ScalarValue::Binary($INPUT.map(|v| $METHOD::digest(v).as_slice().to_vec())) }};
+}
+
 impl DigestAlgorithm {
     /// digest an optional string to its hash value, null values are returned as is
-    pub fn digest_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
+    fn digest_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
         ColumnarValue::Scalar(match self {
             Self::Md5 => digest_to_scalar!(Md5, value),
             Self::Sha224 => digest_to_scalar!(Sha224, value),
@@ -250,51 +117,9 @@ impl DigestAlgorithm {
         })
     }
 
-    /// digest a binary array to their hash values
-    pub fn digest_binary_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
-    where
-        T: OffsetSizeTrait,
-    {
-        let array = match value.data_type() {
-            DataType::Binary | DataType::LargeBinary => {
-                let v = value.as_binary::<T>();
-                self.digest_binary_array_impl::<&GenericBinaryArray<T>>(v)
-            }
-            DataType::BinaryView => {
-                let v = value.as_binary_view();
-                self.digest_binary_array_impl::<&BinaryViewArray>(v)
-            }
-            other => {
-                return exec_err!("unsupported type for digest_utf_array: {other:?}")
-            }
-        };
-        Ok(ColumnarValue::Array(array))
-    }
-
-    /// digest a string array to their hash values
-    pub fn digest_utf8_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
-    where
-        T: OffsetSizeTrait,
-    {
-        let array = match value.data_type() {
-            DataType::Utf8 | DataType::LargeUtf8 => {
-                let v = value.as_string::<T>();
-                self.digest_utf8_array_impl::<&GenericStringArray<T>>(v)
-            }
-            DataType::Utf8View => {
-                let v = value.as_string_view();
-                self.digest_utf8_array_impl::<&StringViewArray>(v)
-            }
-            other => {
-                return exec_err!("unsupported type for digest_utf_array: {other:?}")
-            }
-        };
-        Ok(ColumnarValue::Array(array))
-    }
-
-    pub fn digest_utf8_array_impl<'a, StringArrType>(
+    fn digest_utf8_array_impl<'a, StringArrType>(
         self,
-        input_value: StringArrType,
+        input_value: &StringArrType,
     ) -> ArrayRef
     where
         StringArrType: StringArrayType<'a>,
@@ -323,9 +148,9 @@ impl DigestAlgorithm {
         }
     }
 
-    pub fn digest_binary_array_impl<'a, BinaryArrType>(
+    fn digest_binary_array_impl<'a, BinaryArrType>(
         self,
-        input_value: BinaryArrType,
+        input_value: &BinaryArrType,
     ) -> ArrayRef
     where
         BinaryArrType: BinaryArrayType<'a>,
@@ -354,26 +179,40 @@ impl DigestAlgorithm {
         }
     }
 }
-pub fn digest_process(
+
+pub(crate) fn digest_process(
     value: &ColumnarValue,
     digest_algorithm: DigestAlgorithm,
 ) -> Result<ColumnarValue> {
     match value {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8View => digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
-            DataType::Utf8 => digest_algorithm.digest_utf8_array::<i32>(a.as_ref()),
-            DataType::LargeUtf8 => digest_algorithm.digest_utf8_array::<i64>(a.as_ref()),
-            DataType::Binary => digest_algorithm.digest_binary_array::<i32>(a.as_ref()),
-            DataType::LargeBinary => {
-                digest_algorithm.digest_binary_array::<i64>(a.as_ref())
-            }
-            DataType::BinaryView => {
-                digest_algorithm.digest_binary_array::<i32>(a.as_ref())
-            }
-            other => exec_err!(
-                "Unsupported data type {other:?} for function {digest_algorithm}"
-            ),
-        },
+        ColumnarValue::Array(a) => {
+            let output = match a.data_type() {
+                DataType::Utf8View => {
+                    digest_algorithm.digest_utf8_array_impl(&a.as_string_view())
+                }
+                DataType::Utf8 => {
+                    digest_algorithm.digest_utf8_array_impl(&a.as_string::<i32>())
+                }
+                DataType::LargeUtf8 => {
+                    digest_algorithm.digest_utf8_array_impl(&a.as_string::<i64>())
+                }
+                DataType::Binary => {
+                    digest_algorithm.digest_binary_array_impl(&a.as_binary::<i32>())
+                }
+                DataType::LargeBinary => {
+                    digest_algorithm.digest_binary_array_impl(&a.as_binary::<i64>())
+                }
+                DataType::BinaryView => {
+                    digest_algorithm.digest_binary_array_impl(&a.as_binary_view())
+                }
+                other => {
+                    return exec_err!(
+                        "Unsupported data type {other:?} for function {digest_algorithm}"
+                    );
+                }
+            };
+            Ok(ColumnarValue::Array(output))
+        }
         ColumnarValue::Scalar(scalar) => {
             match scalar {
                 ScalarValue::Utf8View(a)
diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs
index 2840006169be4..4d3ff06777249 100644
--- a/datafusion/functions/src/crypto/digest.rs
+++ b/datafusion/functions/src/crypto/digest.rs
@@ -15,12 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! "crypto" DataFusion functions
-use super::basic::{digest, utf8_or_binary_to_binary_type};
+use crate::crypto::basic::{DigestAlgorithm, digest_process};
+
 use arrow::datatypes::DataType;
 use datafusion_common::{
+    Result, exec_err, not_impl_err,
     types::{logical_binary, logical_string},
-    Result,
+    utils::take_function_args,
 };
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -36,16 +37,16 @@ use std::any::Any;
     syntax_example = "digest(expression, algorithm)",
     sql_example = r#"```sql
 > select digest('foo', 'sha256');
-+------------------------------------------+
-| digest(Utf8("foo"), Utf8("sha256"))      |
-+------------------------------------------+
-| <binary_hash_result>                     |
-+------------------------------------------+
++------------------------------------------------------------------+
+| digest(Utf8("foo"),Utf8("sha256"))                               |
++------------------------------------------------------------------+
+| 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae |
++------------------------------------------------------------------+
 ```"#,
     standard_argument(name = "expression", prefix = "String"),
     argument(
         name = "algorithm",
-        description = "String expression specifying algorithm to use. Must be one of:       
+        description = "String expression specifying algorithm to use. Must be one of:
     - md5
     - sha224
     - sha256
@@ -56,10 +57,11 @@ use std::any::Any;
     - blake3"
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct DigestFunc {
     signature: Signature,
 }
+
 impl Default for DigestFunc {
     fn default() -> Self {
         Self::new()
@@ -85,6 +87,7 @@ impl DigestFunc {
         }
     }
 }
+
 impl ScalarUDFImpl for DigestFunc {
     fn as_any(&self) -> &dyn Any {
         self
@@ -98,14 +101,35 @@ impl ScalarUDFImpl for DigestFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_or_binary_to_binary_type(&arg_types[0], self.name())
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Binary)
     }
+
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        digest(&args.args)
+        let [data, digest_algorithm] = take_function_args(self.name(), &args.args)?;
+        digest(data, digest_algorithm)
     }
 
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
+
+/// Compute binary hash of the given `data` (String or Binary array), according
+/// to the specified `digest_algorithm`. See [`DigestAlgorithm`] for supported
+/// algorithms.
+fn digest(
+    data: &ColumnarValue,
+    digest_algorithm: &ColumnarValue,
+) -> Result<ColumnarValue> {
+    let digest_algorithm = match digest_algorithm {
+        ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
+            Some(Some(method)) => method.parse::<DigestAlgorithm>(),
+            _ => exec_err!("Unsupported data type {scalar:?} for function digest"),
+        },
+        ColumnarValue::Array(_) => {
+            not_impl_err!("Digest using dynamically decided method is not yet supported")
+        }
+    }?;
+    digest_process(data, digest_algorithm)
+}
diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs
index c1540450029cf..355e3e287ad22 100644
--- a/datafusion/functions/src/crypto/md5.rs
+++ b/datafusion/functions/src/crypto/md5.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! "crypto" DataFusion functions
-use crate::crypto::basic::md5;
-use arrow::datatypes::DataType;
+use arrow::{array::StringViewArray, datatypes::DataType};
 use datafusion_common::{
-    plan_err,
-    types::{logical_binary, logical_string, NativeType},
-    Result,
+    Result, ScalarValue,
+    cast::as_binary_array,
+    internal_err,
+    types::{logical_binary, logical_string},
+    utils::take_function_args,
 };
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -29,7 +29,9 @@ use datafusion_expr::{
 };
 use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
 use datafusion_macros::user_doc;
-use std::any::Any;
+use std::{any::Any, sync::Arc};
+
+use crate::crypto::basic::{DigestAlgorithm, digest_process};
 
 #[user_doc(
     doc_section(label = "Hashing Functions"),
@@ -37,18 +39,19 @@ use std::any::Any;
     syntax_example = "md5(expression)",
     sql_example = r#"```sql
 > select md5('foo');
-+-------------------------------------+
-| md5(Utf8("foo"))                    |
-+-------------------------------------+
-| <md5_checksum_result>               |
-+-------------------------------------+
++----------------------------------+
+| md5(Utf8("foo"))                 |
++----------------------------------+
+| acbd18db4cc2f85cedef654fccc4a4d8 |
++----------------------------------+
 ```"#,
     standard_argument(name = "expression", prefix = "String")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Md5Func {
     signature: Signature,
 }
+
 impl Default for Md5Func {
     fn default() -> Self {
         Self::new()
@@ -60,15 +63,11 @@ impl Md5Func {
         Self {
             signature: Signature::one_of(
                 vec![
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_string())],
-                        NativeType::String,
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
                     )]),
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
                         TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_binary())],
-                        NativeType::Binary,
                     )]),
                 ],
                 Volatility::Immutable,
@@ -76,6 +75,7 @@ impl Md5Func {
         }
     }
 }
+
 impl ScalarUDFImpl for Md5Func {
     fn as_any(&self) -> &dyn Any {
         self
@@ -89,30 +89,10 @@ impl ScalarUDFImpl for Md5Func {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        use DataType::*;
-        Ok(match &arg_types[0] {
-            LargeUtf8 | LargeBinary => Utf8,
-            Utf8View | Utf8 | Binary | BinaryView => Utf8,
-            Null => Null,
-            Dictionary(_, t) => match **t {
-                LargeUtf8 | LargeBinary => Utf8,
-                Utf8 | Binary | BinaryView => Utf8,
-                Null => Null,
-                _ => {
-                    return plan_err!(
-                        "the md5 can only accept strings but got {:?}",
-                        **t
-                    );
-                }
-            },
-            other => {
-                return plan_err!(
-                    "The md5 function can only accept strings. Got {other}"
-                );
-            }
-        })
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8View)
     }
+
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         md5(&args.args)
     }
@@ -121,3 +101,38 @@ impl ScalarUDFImpl for Md5Func {
         self.doc()
     }
 }
+
+/// Hex encoding lookup table for fast byte-to-hex conversion
+const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
+
+/// Fast hex encoding using a lookup table instead of format strings.
+/// This is significantly faster than using `write!("{:02x}")` for each byte.
+#[inline]
+fn hex_encode(data: impl AsRef<[u8]>) -> String {
+    let bytes = data.as_ref();
+    let mut s = String::with_capacity(bytes.len() * 2);
+    for &b in bytes {
+        s.push(HEX_CHARS_LOWER[(b >> 4) as usize] as char);
+        s.push(HEX_CHARS_LOWER[(b & 0x0f) as usize] as char);
+    }
+    s
+}
+
+fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    let [data] = take_function_args("md5", args)?;
+    let value = digest_process(data, DigestAlgorithm::Md5)?;
+
+    // md5 requires special handling because of its unique utf8view return type
+    Ok(match value {
+        ColumnarValue::Array(array) => {
+            let binary_array = as_binary_array(&array)?;
+            let string_array: StringViewArray =
+                binary_array.iter().map(|opt| opt.map(hex_encode)).collect();
+            ColumnarValue::Array(Arc::new(string_array))
+        }
+        ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
+            ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode)))
+        }
+        _ => return internal_err!("Impossibly got invalid results from digest"),
+    })
+}
diff --git a/datafusion/functions/src/crypto/mod.rs b/datafusion/functions/src/crypto/mod.rs
index 62ea3c2e27371..fd15db44c795d 100644
--- a/datafusion/functions/src/crypto/mod.rs
+++ b/datafusion/functions/src/crypto/mod.rs
@@ -23,16 +23,13 @@ use std::sync::Arc;
 pub mod basic;
 pub mod digest;
 pub mod md5;
-pub mod sha224;
-pub mod sha256;
-pub mod sha384;
-pub mod sha512;
+pub mod sha;
 make_udf_function!(digest::DigestFunc, digest);
 make_udf_function!(md5::Md5Func, md5);
-make_udf_function!(sha224::SHA224Func, sha224);
-make_udf_function!(sha256::SHA256Func, sha256);
-make_udf_function!(sha384::SHA384Func, sha384);
-make_udf_function!(sha512::SHA512Func, sha512);
+make_udf_function!(sha::SHAFunc, sha224, sha::SHAFunc::sha224);
+make_udf_function!(sha::SHAFunc, sha256, sha::SHAFunc::sha256);
+make_udf_function!(sha::SHAFunc, sha384, sha::SHAFunc::sha384);
+make_udf_function!(sha::SHAFunc, sha512, sha::SHAFunc::sha512);
 
 pub mod expr_fn {
     export_functions!((
diff --git a/datafusion/functions/src/crypto/sha.rs b/datafusion/functions/src/crypto/sha.rs
new file mode 100644
index 0000000000000..9199cf57c7a2d
--- /dev/null
+++ b/datafusion/functions/src/crypto/sha.rs
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::crypto::basic::{DigestAlgorithm, digest_process};
+
+use arrow::datatypes::DataType;
+use datafusion_common::{
+    Result,
+    types::{logical_binary, logical_string},
+    utils::take_function_args,
+};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, Volatility,
+};
+use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
+use datafusion_macros::user_doc;
+use std::any::Any;
+
+#[user_doc(
+    doc_section(label = "Hashing Functions"),
+    description = "Computes the SHA-224 hash of a binary string.",
+    syntax_example = "sha224(expression)",
+    sql_example = r#"```sql
+> select sha224('foo');
++----------------------------------------------------------+
+| sha224(Utf8("foo"))                                      |
++----------------------------------------------------------+
+| 0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db |
++----------------------------------------------------------+
+```"#,
+    standard_argument(name = "expression", prefix = "String")
+)]
+struct SHA224Doc;
+
+#[user_doc(
+    doc_section(label = "Hashing Functions"),
+    description = "Computes the SHA-256 hash of a binary string.",
+    syntax_example = "sha256(expression)",
+    sql_example = r#"```sql
+> select sha256('foo');
++------------------------------------------------------------------+
+| sha256(Utf8("foo"))                                              |
++------------------------------------------------------------------+
+| 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae |
++------------------------------------------------------------------+
+```"#,
+    standard_argument(name = "expression", prefix = "String")
+)]
+struct SHA256Doc;
+
+#[user_doc(
+    doc_section(label = "Hashing Functions"),
+    description = "Computes the SHA-384 hash of a binary string.",
+    syntax_example = "sha384(expression)",
+    sql_example = r#"```sql
+> select sha384('foo');
++--------------------------------------------------------------------------------------------------+
+| sha384(Utf8("foo"))                                                                              |
++--------------------------------------------------------------------------------------------------+
+| 98c11ffdfdd540676b1a137cb1a22b2a70350c9a44171d6b1180c6be5cbb2ee3f79d532c8a1dd9ef2e8e08e752a3babb |
++--------------------------------------------------------------------------------------------------+
+```"#,
+    standard_argument(name = "expression", prefix = "String")
+)]
+struct SHA384Doc;
+
+#[user_doc(
+    doc_section(label = "Hashing Functions"),
+    description = "Computes the SHA-512 hash of a binary string.",
+    syntax_example = "sha512(expression)",
+    sql_example = r#"```sql
+> select sha512('foo');
++----------------------------------------------------------------------------------------------------------------------------------+
+| sha512(Utf8("foo"))                                                                                                              |
++----------------------------------------------------------------------------------------------------------------------------------+
+| f7fbba6e0636f890e56fbbf3283e524c6fa3204ae298382d624741d0dc6638326e282c41be5e4254d8820772c5518a2c5a8c0c7f7eda19594a7eb539453e1ed7 |
++----------------------------------------------------------------------------------------------------------------------------------+
+```"#,
+    standard_argument(name = "expression", prefix = "String")
+)]
+struct SHA512Doc;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SHAFunc {
+    signature: Signature,
+    name: &'static str,
+    algorithm: DigestAlgorithm,
+}
+
+impl SHAFunc {
+    pub fn sha224() -> Self {
+        Self::new("sha224", DigestAlgorithm::Sha224)
+    }
+
+    pub fn sha256() -> Self {
+        Self::new("sha256", DigestAlgorithm::Sha256)
+    }
+
+    pub fn sha384() -> Self {
+        Self::new("sha384", DigestAlgorithm::Sha384)
+    }
+
+    pub fn sha512() -> Self {
+        Self::new("sha512", DigestAlgorithm::Sha512)
+    }
+
+    fn new(name: &'static str, algorithm: DigestAlgorithm) -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_string()),
+                    )]),
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Native(logical_binary()),
+                    )]),
+                ],
+                Volatility::Immutable,
+            ),
+            name,
+            algorithm,
+        }
+    }
+}
+
+impl ScalarUDFImpl for SHAFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Binary)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [data] = take_function_args(self.name(), args.args)?;
+        digest_process(&data, self.algorithm)
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        match self.algorithm {
+            DigestAlgorithm::Sha224 => SHA224Doc {}.doc(),
+            DigestAlgorithm::Sha256 => SHA256Doc {}.doc(),
+            DigestAlgorithm::Sha384 => SHA384Doc {}.doc(),
+            DigestAlgorithm::Sha512 => SHA512Doc {}.doc(),
+            DigestAlgorithm::Md5
+            | DigestAlgorithm::Blake2s
+            | DigestAlgorithm::Blake2b
+            | DigestAlgorithm::Blake3 => unreachable!(),
+        }
+    }
+}
diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs
deleted file mode 100644
index a64a3ef803197..0000000000000
--- a/datafusion/functions/src/crypto/sha224.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! "crypto" DataFusion functions
-use super::basic::{sha224, utf8_or_binary_to_binary_type};
-use arrow::datatypes::DataType;
-use datafusion_common::{
-    types::{logical_binary, logical_string, NativeType},
-    Result,
-};
-use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
-};
-use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
-use datafusion_macros::user_doc;
-use std::any::Any;
-
-#[user_doc(
-    doc_section(label = "Hashing Functions"),
-    description = "Computes the SHA-224 hash of a binary string.",
-    syntax_example = "sha224(expression)",
-    sql_example = r#"```sql
-> select sha224('foo');
-+------------------------------------------+
-| sha224(Utf8("foo"))                      |
-+------------------------------------------+
-| <sha224_hash_result>                     |
-+------------------------------------------+
-```"#,
-    standard_argument(name = "expression", prefix = "String")
-)]
-#[derive(Debug)]
-pub struct SHA224Func {
-    signature: Signature,
-}
-
-impl Default for SHA224Func {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SHA224Func {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_string())],
-                        NativeType::String,
-                    )]),
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_binary())],
-                        NativeType::Binary,
-                    )]),
-                ],
-                Volatility::Immutable,
-            ),
-        }
-    }
-}
-
-impl ScalarUDFImpl for SHA224Func {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "sha224"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_or_binary_to_binary_type(&arg_types[0], self.name())
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        sha224(&args.args)
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs
deleted file mode 100644
index 94f3ea3b49fa6..0000000000000
--- a/datafusion/functions/src/crypto/sha256.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! "crypto" DataFusion functions
-use super::basic::{sha256, utf8_or_binary_to_binary_type};
-use arrow::datatypes::DataType;
-use datafusion_common::{
-    types::{logical_binary, logical_string, NativeType},
-    Result,
-};
-use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
-};
-use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
-use datafusion_macros::user_doc;
-use std::any::Any;
-
-#[user_doc(
-    doc_section(label = "Hashing Functions"),
-    description = "Computes the SHA-256 hash of a binary string.",
-    syntax_example = "sha256(expression)",
-    sql_example = r#"```sql
-> select sha256('foo');
-+--------------------------------------+
-| sha256(Utf8("foo"))                  |
-+--------------------------------------+
-| <sha256_hash_result>                 |
-+--------------------------------------+
-```"#,
-    standard_argument(name = "expression", prefix = "String")
-)]
-#[derive(Debug)]
-pub struct SHA256Func {
-    signature: Signature,
-}
-impl Default for SHA256Func {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SHA256Func {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_string())],
-                        NativeType::String,
-                    )]),
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_binary())],
-                        NativeType::Binary,
-                    )]),
-                ],
-                Volatility::Immutable,
-            ),
-        }
-    }
-}
-impl ScalarUDFImpl for SHA256Func {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "sha256"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_or_binary_to_binary_type(&arg_types[0], self.name())
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        sha256(&args.args)
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs
deleted file mode 100644
index 023730469c7bd..0000000000000
--- a/datafusion/functions/src/crypto/sha384.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! "crypto" DataFusion functions
-use super::basic::{sha384, utf8_or_binary_to_binary_type};
-use arrow::datatypes::DataType;
-use datafusion_common::{
-    types::{logical_binary, logical_string, NativeType},
-    Result,
-};
-use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
-};
-use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
-use datafusion_macros::user_doc;
-use std::any::Any;
-
-#[user_doc(
-    doc_section(label = "Hashing Functions"),
-    description = "Computes the SHA-384 hash of a binary string.",
-    syntax_example = "sha384(expression)",
-    sql_example = r#"```sql
-> select sha384('foo');
-+-----------------------------------------+
-| sha384(Utf8("foo"))                     |
-+-----------------------------------------+
-| <sha384_hash_result>                    |
-+-----------------------------------------+
-```"#,
-    standard_argument(name = "expression", prefix = "String")
-)]
-#[derive(Debug)]
-pub struct SHA384Func {
-    signature: Signature,
-}
-impl Default for SHA384Func {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SHA384Func {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_string())],
-                        NativeType::String,
-                    )]),
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_binary())],
-                        NativeType::Binary,
-                    )]),
-                ],
-                Volatility::Immutable,
-            ),
-        }
-    }
-}
-impl ScalarUDFImpl for SHA384Func {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "sha384"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_or_binary_to_binary_type(&arg_types[0], self.name())
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        sha384(&args.args)
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs
deleted file mode 100644
index f48737e5751f0..0000000000000
--- a/datafusion/functions/src/crypto/sha512.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! "crypto" DataFusion functions
-use super::basic::{sha512, utf8_or_binary_to_binary_type};
-use arrow::datatypes::DataType;
-use datafusion_common::{
-    types::{logical_binary, logical_string, NativeType},
-    Result,
-};
-use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
-};
-use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
-use datafusion_macros::user_doc;
-use std::any::Any;
-
-#[user_doc(
-    doc_section(label = "Hashing Functions"),
-    description = "Computes the SHA-512 hash of a binary string.",
-    syntax_example = "sha512(expression)",
-    sql_example = r#"```sql
-> select sha512('foo');
-+-------------------------------------------+
-| sha512(Utf8("foo"))                       |
-+-------------------------------------------+
-| <sha512_hash_result>                      |
-+-------------------------------------------+
-```"#,
-    standard_argument(name = "expression", prefix = "String")
-)]
-#[derive(Debug)]
-pub struct SHA512Func {
-    signature: Signature,
-}
-impl Default for SHA512Func {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SHA512Func {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_string())],
-                        NativeType::String,
-                    )]),
-                    TypeSignature::Coercible(vec![Coercion::new_implicit(
-                        TypeSignatureClass::Native(logical_binary()),
-                        vec![TypeSignatureClass::Native(logical_binary())],
-                        NativeType::Binary,
-                    )]),
-                ],
-                Volatility::Immutable,
-            ),
-        }
-    }
-}
-impl ScalarUDFImpl for SHA512Func {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "sha512"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_or_binary_to_binary_type(&arg_types[0], self.name())
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        sha512(&args.args)
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs
index fd9f37d8052c7..2db64beafa9b7 100644
--- a/datafusion/functions/src/datetime/common.rs
+++ b/datafusion/functions/src/datetime/common.rs
@@ -15,30 +15,57 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};
 
+use arrow::array::timezone::Tz;
 use arrow::array::{
     Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
     StringArrayType, StringViewArray,
 };
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
-use chrono::format::{parse, Parsed, StrftimeItems};
+use arrow::compute::DecimalCast;
+use arrow::compute::kernels::cast_utils::string_to_datetime;
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
 use chrono::LocalResult::Single;
+use chrono::format::{Parsed, StrftimeItems, parse};
 use chrono::{DateTime, TimeZone, Utc};
-
 use datafusion_common::cast::as_generic_string_array;
 use datafusion_common::{
-    exec_err, unwrap_or_internal_err, DataFusionError, Result, ScalarType, ScalarValue,
+    DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err,
+    internal_datafusion_err, unwrap_or_internal_err,
 };
 use datafusion_expr::ColumnarValue;
 
 /// Error message if nanosecond conversion request beyond supported interval
 const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804";
 
-/// Calls string_to_timestamp_nanos and converts the error type
-pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
-    string_to_timestamp_nanos(s).map_err(|e| e.into())
+static UTC: LazyLock<Tz> = LazyLock::new(|| "UTC".parse().expect("UTC is always valid"));
+
+/// Converts a string representation of a date‑time into a timestamp expressed in
+/// nanoseconds since the Unix epoch.
+///
+/// This helper is a thin wrapper around the more general `string_to_datetime`
+/// function. It accepts an optional `timezone` which, if `None`, defaults to
+/// Coordinated Universal Time (UTC). The string `s` must contain a valid
+/// date‑time format that can be parsed by the underlying chrono parser.
+///
+/// # Return Value
+///
+/// * `Ok(i64)` – The number of nanoseconds since `1970‑01‑01T00:00:00Z`.
+/// * `Err(DataFusionError)` – If the string cannot be parsed, the parsed
+///   value is out of range (between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804)
+///   or the parsed value does not correspond to an unambiguous time.
+pub(crate) fn string_to_timestamp_nanos_with_timezone(
+    timezone: &Option<Tz>,
+    s: &str,
+) -> Result<i64> {
+    let tz = timezone.as_ref().unwrap_or(&UTC);
+    let dt = string_to_datetime(tz, s)?;
+    let parsed = dt
+        .timestamp_nanos_opt()
+        .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
+
+    Ok(parsed)
 }
 
 /// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View]
@@ -68,13 +95,12 @@ pub(crate) fn validate_data_types(args: &[ColumnarValue], name: &str) -> Result<
 /// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers
 /// relative to the provided `timezone`
 ///
-/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled
-///
-/// * `2023-01-01 040506 America/Los_Angeles`
-///
 /// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error
 /// will be returned
 ///
+/// Note that parsing [IANA timezones] is not supported yet in chrono - <https://github.com/chronotope/chrono/issues/38>
+/// and this implementation only supports named timezones at the end of the string preceded by a space.
+///
 /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
 /// [IANA timezones]: https://www.iana.org/time-zones
 pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
@@ -83,16 +109,60 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
     format: &str,
 ) -> Result<DateTime<T>, DataFusionError> {
     let err = |err_ctx: &str| {
-        DataFusionError::Execution(format!(
+        exec_datafusion_err!(
             "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}"
-        ))
+        )
+    };
+
+    let mut datetime_str = s;
+    let mut format = format;
+
+    // Manually handle the most common case of a named timezone at the end of the timestamp.
+    // Note that %+ handles 'Z' at the end of the string without a space. This code doesn't
+    // handle named timezones with no preceding space since that would require writing a
+    // custom parser (or switching to Jiff)
+    let tz: Option<chrono_tz::Tz> = if format.trim_end().ends_with(" %Z") {
+        // grab the string after the last space as the named timezone
+        if let Some((dt_str, timezone_name)) = datetime_str.trim_end().rsplit_once(' ') {
+            datetime_str = dt_str;
+
+            // attempt to parse the timezone name
+            let result: Result<chrono_tz::Tz, chrono_tz::ParseError> =
+                timezone_name.parse();
+            let Ok(tz) = result else {
+                return Err(err(&result.unwrap_err().to_string()));
+            };
+
+            // successfully parsed the timezone name, remove the ' %Z' from the format
+            format = &format[..format.len() - 3];
+
+            Some(tz)
+        } else {
+            None
+        }
+    } else if format.contains("%Z") {
+        return Err(err(
+            "'%Z' is only supported at the end of the format string preceded by a space",
+        ));
+    } else {
+        None
     };
 
     let mut parsed = Parsed::new();
-    parse(&mut parsed, s, StrftimeItems::new(format)).map_err(|e| err(&e.to_string()))?;
+    parse(&mut parsed, datetime_str, StrftimeItems::new(format))
+        .map_err(|e| err(&e.to_string()))?;
 
-    // attempt to parse the string assuming it has a timezone
-    let dt = parsed.to_datetime();
+    let dt = match tz {
+        Some(tz) => {
+            // A timezone was manually parsed out, convert it to a fixed offset
+            match parsed.to_datetime_with_timezone(&tz) {
+                Ok(dt) => Ok(dt.fixed_offset()),
+                Err(e) => Err(e),
+            }
+        }
+        // default to parse the string assuming it has a timezone
+        None => parsed.to_datetime(),
+    };
 
     if let Err(e) = &dt {
         // no timezone or other failure, try without a timezone
@@ -114,7 +184,7 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
 }
 
 /// Accepts a string with a `chrono` format and converts it to a
-/// nanosecond precision timestamp.
+/// nanosecond precision timestamp relative to the provided `timezone`.
 ///
 /// See [`chrono::format::strftime`] for the full set of supported formats.
 ///
@@ -139,23 +209,22 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
 /// defined by `chrono`.
 ///
 /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
-///
 #[inline]
-pub(crate) fn string_to_timestamp_nanos_formatted(
+pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone(
+    timezone: &Option<Tz>,
     s: &str,
     format: &str,
 ) -> Result<i64, DataFusionError> {
-    string_to_datetime_formatted(&Utc, s, format)?
-        .naive_utc()
-        .and_utc()
+    let dt = string_to_datetime_formatted(timezone.as_ref().unwrap_or(&UTC), s, format)?;
+    let parsed = dt
         .timestamp_nanos_opt()
-        .ok_or_else(|| {
-            DataFusionError::Execution(ERR_NANOSECONDS_NOT_SUPPORTED.to_string())
-        })
+        .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
+
+    Ok(parsed)
 }
 
 /// Accepts a string with a `chrono` format and converts it to a
-/// millisecond precision timestamp.
+/// millisecond precision timestamp relative to the provided `timezone`.
 ///
 /// See [`chrono::format::strftime`] for the full set of supported formats.
 ///
@@ -170,7 +239,6 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
 /// defined by `chrono`.
 ///
 /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
-///
 #[inline]
 pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result<i64> {
     Ok(string_to_datetime_formatted(&Utc, s, format)?
@@ -179,33 +247,33 @@ pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Res
         .timestamp_millis())
 }
 
-pub(crate) fn handle<O, F, S>(
+pub(crate) fn handle<O, F>(
     args: &[ColumnarValue],
     op: F,
     name: &str,
+    dt: &DataType,
 ) -> Result<ColumnarValue>
 where
     O: ArrowPrimitiveType,
-    S: ScalarType<O::Native>,
     F: Fn(&str) -> Result<O::Native>,
 {
     match &args[0] {
         ColumnarValue::Array(a) => match a.data_type() {
             DataType::Utf8View => Ok(ColumnarValue::Array(Arc::new(
                 unary_string_to_primitive_function::<&StringViewArray, O, _>(
-                    a.as_ref().as_string_view(),
+                    &a.as_string_view(),
                     op,
                 )?,
             ))),
             DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new(
                 unary_string_to_primitive_function::<&GenericStringArray<i64>, O, _>(
-                    a.as_ref().as_string::<i64>(),
+                    &a.as_string::<i64>(),
                     op,
                 )?,
             ))),
             DataType::Utf8 => Ok(ColumnarValue::Array(Arc::new(
                 unary_string_to_primitive_function::<&GenericStringArray<i32>, O, _>(
-                    a.as_ref().as_string::<i32>(),
+                    &a.as_string::<i32>(),
                     op,
                 )?,
             ))),
@@ -213,8 +281,13 @@ where
         },
         ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
             Some(a) => {
-                let result = a.as_ref().map(|x| op(x)).transpose()?;
-                Ok(ColumnarValue::Scalar(S::scalar(result)))
+                let result = a
+                    .as_ref()
+                    .map(|x| op(x))
+                    .transpose()?
+                    .and_then(|v| v.to_i64());
+                let s = scalar_value(dt, result)?;
+                Ok(ColumnarValue::Scalar(s))
             }
             _ => exec_err!("Unsupported data type {scalar:?} for function {name}"),
         },
@@ -224,15 +297,15 @@ where
 // Given a function that maps a `&str`, `&str` to an arrow native type,
 // returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue`
 // depending on the `args`'s variant.
-pub(crate) fn handle_multiple<O, F, S, M>(
+pub(crate) fn handle_multiple<O, F, M>(
     args: &[ColumnarValue],
     op: F,
     op2: M,
     name: &str,
+    dt: &DataType,
 ) -> Result<ColumnarValue>
 where
     O: ArrowPrimitiveType,
-    S: ScalarType<O::Native>,
     F: Fn(&str, &str) -> Result<O::Native>,
     M: Fn(O::Native) -> O::Native,
 {
@@ -246,14 +319,24 @@ where
                             DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
                                 // all good
                             }
-                            other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"),
+                            other => {
+                                return exec_err!(
+                                    "Unsupported data type {other:?} for function {name}, arg # {pos}"
+                                );
+                            }
                         },
                         ColumnarValue::Scalar(arg) => {
                             match arg.data_type() {
-                                DataType::Utf8View| DataType::LargeUtf8 | DataType::Utf8 => {
+                                DataType::Utf8View
+                                | DataType::LargeUtf8
+                                | DataType::Utf8 => {
                                     // all good
                                 }
-                                other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"),
+                                other => {
+                                    return exec_err!(
+                                        "Unsupported data type {other:?} for function {name}, arg # {pos}"
+                                    );
+                                }
                             }
                         }
                     }
@@ -283,15 +366,17 @@ where
                         | ScalarValue::Utf8(x),
                     ) = v
                     else {
-                        return exec_err!("Unsupported data type {v:?} for function {name}, arg # {pos}");
+                        return exec_err!(
+                            "Unsupported data type {v:?} for function {name}, arg # {pos}"
+                        );
                     };
 
                     if let Some(s) = x {
                         match op(a, s.as_str()) {
                             Ok(r) => {
-                                ret = Some(Ok(ColumnarValue::Scalar(S::scalar(Some(
-                                    op2(r),
-                                )))));
+                                let result = op2(r).to_i64();
+                                let s = scalar_value(dt, result)?;
+                                ret = Some(Ok(ColumnarValue::Scalar(s)));
                                 break;
                             }
                             Err(e) => ret = Some(Err(e)),
@@ -412,8 +497,8 @@ where
                     }?;
 
                     let r = op(x, v);
-                    if r.is_ok() {
-                        val = Some(Ok(op2(r.unwrap())));
+                    if let Ok(inner) = r {
+                        val = Some(Ok(op2(inner)));
                         break;
                     } else {
                         val = Some(r);
@@ -434,7 +519,7 @@ where
 /// * the number of arguments is not 1 or
 /// * the function `op` errors
 fn unary_string_to_primitive_function<'a, StringArrType, O, F>(
-    array: StringArrType,
+    array: &StringArrType,
     op: F,
 ) -> Result<PrimitiveArray<O>>
 where
@@ -445,3 +530,16 @@ where
     // first map is the iterator, second is for the `Option<_>`
     array.iter().map(|x| x.map(&op).transpose()).collect()
 }
+
+fn scalar_value(dt: &DataType, r: Option<i64>) -> Result<ScalarValue> {
+    match dt {
+        DataType::Date32 => Ok(ScalarValue::Date32(r.and_then(|v| v.to_i32()))),
+        DataType::Timestamp(u, tz) => match u {
+            TimeUnit::Second => Ok(ScalarValue::TimestampSecond(r, tz.clone())),
+            TimeUnit::Millisecond => Ok(ScalarValue::TimestampMillisecond(r, tz.clone())),
+            TimeUnit::Microsecond => Ok(ScalarValue::TimestampMicrosecond(r, tz.clone())),
+            TimeUnit::Nanosecond => Ok(ScalarValue::TimestampNanosecond(r, tz.clone())),
+        },
+        t => Err(internal_datafusion_err!("Unsupported data type: {t:?}")),
+    }
+}
diff --git a/datafusion/functions/src/datetime/current_date.rs b/datafusion/functions/src/datetime/current_date.rs
index 9998e7d3758e0..f0571b94fa8d7 100644
--- a/datafusion/functions/src/datetime/current_date.rs
+++ b/datafusion/functions/src/datetime/current_date.rs
@@ -17,27 +17,31 @@
 
 use std::any::Any;
 
+use arrow::array::timezone::Tz;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Date32;
-use chrono::{Datelike, NaiveDate};
+use chrono::{Datelike, NaiveDate, TimeZone};
 
-use datafusion_common::{internal_err, Result, ScalarValue};
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_common::{Result, ScalarValue, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
     description = r#"
-Returns the current UTC date.
+Returns the current date in the session time zone.
 
 The `current_date()` return value is determined at query time and will return the same date, no matter when in the query plan the function executes.
 "#,
-    syntax_example = "current_date()"
+    syntax_example = r#"current_date()
+    (optional) SET datafusion.execution.time_zone = '+00:00';
+    SELECT current_date();"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CurrentDateFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -81,10 +85,7 @@ impl ScalarUDFImpl for CurrentDateFunc {
         Ok(Date32)
     }
 
-    fn invoke_with_args(
-        &self,
-        _args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         internal_err!(
             "invoke should not be called on a simplified current_date() function"
         )
@@ -96,18 +97,30 @@ impl ScalarUDFImpl for CurrentDateFunc {
 
     fn simplify(
         &self,
-        _args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        let now_ts = info.execution_props().query_execution_start_time;
-        let days = Some(
-            now_ts.num_days_from_ce()
-                - NaiveDate::from_ymd_opt(1970, 1, 1)
-                    .unwrap()
-                    .num_days_from_ce(),
-        );
+        let Some(now_ts) = info.query_execution_start_time() else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        // Get timezone from config and convert to local time
+        let days = info
+            .config_options()
+            .execution
+            .time_zone
+            .as_ref()
+            .and_then(|tz| tz.parse::<Tz>().ok())
+            .map_or_else(
+                || datetime_to_days(&now_ts),
+                |tz| {
+                    let local_now = tz.from_utc_datetime(&now_ts.naive_utc());
+                    datetime_to_days(&local_now)
+                },
+            );
         Ok(ExprSimplifyResult::Simplified(Expr::Literal(
-            ScalarValue::Date32(days),
+            ScalarValue::Date32(Some(days)),
+            None,
         )))
     }
 
@@ -115,3 +128,11 @@ impl ScalarUDFImpl for CurrentDateFunc {
         self.doc()
     }
 }
+
+/// Converts a DateTime to the number of days since Unix epoch (1970-01-01)
+fn datetime_to_days<T: Datelike>(dt: &T) -> i32 {
+    dt.num_days_from_ce()
+        - NaiveDate::from_ymd_opt(1970, 1, 1)
+            .unwrap()
+            .num_days_from_ce()
+}
diff --git a/datafusion/functions/src/datetime/current_time.rs b/datafusion/functions/src/datetime/current_time.rs
index c416d0240b13c..2ac5cf96d0316 100644
--- a/datafusion/functions/src/datetime/current_time.rs
+++ b/datafusion/functions/src/datetime/current_time.rs
@@ -15,28 +15,35 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow::array::timezone::Tz;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Time64;
 use arrow::datatypes::TimeUnit::Nanosecond;
-use std::any::Any;
-
-use datafusion_common::{internal_err, Result, ScalarValue};
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use chrono::TimeZone;
+use chrono::Timelike;
+use datafusion_common::{Result, ScalarValue, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
+use std::any::Any;
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
     description = r#"
-Returns the current UTC time.
+Returns the current time in the session time zone.
 
 The `current_time()` return value is determined at query time and will return the same time, no matter when in the query plan the function executes.
+
+The session time zone can be set using the statement 'SET datafusion.execution.time_zone = desired time zone'. The time zone can be a value like +00:00, 'Europe/London' etc.
 "#,
-    syntax_example = "current_time()"
+    syntax_example = r#"current_time()
+    (optional) SET datafusion.execution.time_zone = '+00:00';
+    SELECT current_time();"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CurrentTimeFunc {
     signature: Signature,
 }
@@ -78,10 +85,7 @@ impl ScalarUDFImpl for CurrentTimeFunc {
         Ok(Time64(Nanosecond))
     }
 
-    fn invoke_with_args(
-        &self,
-        _args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         internal_err!(
             "invoke should not be called on a simplified current_time() function"
         )
@@ -89,13 +93,31 @@ impl ScalarUDFImpl for CurrentTimeFunc {
 
     fn simplify(
         &self,
-        _args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        let now_ts = info.execution_props().query_execution_start_time;
-        let nano = now_ts.timestamp_nanos_opt().map(|ts| ts % 86400000000000);
+        let Some(now_ts) = info.query_execution_start_time() else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        // Try to get timezone from config and convert to local time
+        let nano = info
+            .config_options()
+            .execution
+            .time_zone
+            .as_ref()
+            .and_then(|tz| tz.parse::<Tz>().ok())
+            .map_or_else(
+                || datetime_to_time_nanos(&now_ts),
+                |tz| {
+                    let local_now = tz.from_utc_datetime(&now_ts.naive_utc());
+                    datetime_to_time_nanos(&local_now)
+                },
+            );
+
         Ok(ExprSimplifyResult::Simplified(Expr::Literal(
             ScalarValue::Time64Nanosecond(nano),
+            None,
         )))
     }
 
@@ -103,3 +125,82 @@ impl ScalarUDFImpl for CurrentTimeFunc {
         self.doc()
     }
 }
+
+// Helper function for conversion of datetime to a timestamp.
+fn datetime_to_time_nanos<Tz: TimeZone>(dt: &chrono::DateTime<Tz>) -> Option<i64> {
+    let hour = dt.hour() as i64;
+    let minute = dt.minute() as i64;
+    let second = dt.second() as i64;
+    let nanosecond = dt.nanosecond() as i64;
+    Some((hour * 3600 + minute * 60 + second) * 1_000_000_000 + nanosecond)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::{DateTime, Utc};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::{DFSchema, ScalarValue};
+    use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+    use std::sync::Arc;
+
+    fn set_session_timezone_env(tz: &str, start_time: DateTime<Utc>) -> SimplifyContext {
+        let mut config = ConfigOptions::default();
+        config.execution.time_zone = if tz.is_empty() {
+            None
+        } else {
+            Some(tz.to_string())
+        };
+        let schema = Arc::new(DFSchema::empty());
+        SimplifyContext::default()
+            .with_schema(schema)
+            .with_config_options(Arc::new(config))
+            .with_query_execution_start_time(Some(start_time))
+    }
+
+    #[test]
+    fn test_current_time_timezone_offset() {
+        // Use a fixed start time for consistent testing
+        let start_time = Utc.with_ymd_and_hms(2025, 1, 1, 12, 0, 0).unwrap();
+
+        // Test with UTC+05:00
+        let info_plus_5 = set_session_timezone_env("+05:00", start_time);
+        let result_plus_5 = CurrentTimeFunc::new()
+            .simplify(vec![], &info_plus_5)
+            .unwrap();
+
+        // Test with UTC-05:00
+        let info_minus_5 = set_session_timezone_env("-05:00", start_time);
+        let result_minus_5 = CurrentTimeFunc::new()
+            .simplify(vec![], &info_minus_5)
+            .unwrap();
+
+        // Extract nanoseconds from results
+        let nanos_plus_5 = match result_plus_5 {
+            ExprSimplifyResult::Simplified(Expr::Literal(
+                ScalarValue::Time64Nanosecond(Some(n)),
+                _,
+            )) => n,
+            _ => panic!("Expected Time64Nanosecond literal"),
+        };
+
+        let nanos_minus_5 = match result_minus_5 {
+            ExprSimplifyResult::Simplified(Expr::Literal(
+                ScalarValue::Time64Nanosecond(Some(n)),
+                _,
+            )) => n,
+            _ => panic!("Expected Time64Nanosecond literal"),
+        };
+
+        // Calculate the difference: UTC+05:00 should be 10 hours ahead of UTC-05:00
+        let difference = nanos_plus_5 - nanos_minus_5;
+
+        // 10 hours in nanoseconds
+        let expected_offset = 10i64 * 3600 * 1_000_000_000;
+
+        assert_eq!(
+            difference, expected_offset,
+            "Expected 10-hour offset difference in nanoseconds between UTC+05:00 and UTC-05:00"
+        );
+    }
+}
diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs
index 1c801dfead723..0dca23f46b9d2 100644
--- a/datafusion/functions/src/datetime/date_bin.rs
+++ b/datafusion/functions/src/datetime/date_bin.rs
@@ -24,18 +24,22 @@ use arrow::array::types::{
     TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
     TimestampSecondType,
 };
-use arrow::array::{ArrayRef, PrimitiveArray};
-use arrow::datatypes::DataType::{Null, Timestamp, Utf8};
+use arrow::array::{ArrayRef, AsArray, PrimitiveArray};
+use arrow::datatypes::DataType::{Time32, Time64, Timestamp};
 use arrow::datatypes::IntervalUnit::{DayTime, MonthDayNano};
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::datatypes::{DataType, TimeUnit};
-
+use arrow::datatypes::{
+    DataType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, TimeUnit,
+};
+use arrow::temporal_conversions::NANOSECONDS_IN_DAY;
 use datafusion_common::cast::as_primitive_array;
-use datafusion_common::{exec_err, not_impl_err, plan_err, Result, ScalarValue};
-use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_common::{Result, ScalarValue, exec_err, not_impl_err, plan_err};
 use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TIMEZONE_WILDCARD, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -71,6 +75,17 @@ FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z')  t(time);
 | 2023-01-03T03:00:00 |
 +---------------------+
 2 row(s) fetched.
+
+-- Bin the time into 15 minute intervals starting at 1 min
+>  SELECT date_bin(interval '15 minutes', time, TIME '00:01:00') as bin
+FROM VALUES (TIME '02:18:18'), (TIME '19:00:03')  t(time);
++----------+
+| bin      |
++----------+
+| 02:16:00 |
+| 18:46:00 |
++----------+
+2 row(s) fetched.
 ```"#,
     argument(name = "interval", description = "Bin interval."),
     argument(
@@ -95,7 +110,7 @@ FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z')  t(time);
 "#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct DateBinFunc {
     signature: Signature,
 }
@@ -109,7 +124,7 @@ impl Default for DateBinFunc {
 impl DateBinFunc {
     pub fn new() -> Self {
         let base_sig = |array_type: TimeUnit| {
-            vec![
+            let mut v = vec![
                 Exact(vec![
                     DataType::Interval(MonthDayNano),
                     Timestamp(array_type, None),
@@ -146,7 +161,44 @@ impl DateBinFunc {
                     DataType::Interval(DayTime),
                     Timestamp(array_type, Some(TIMEZONE_WILDCARD.into())),
                 ]),
-            ]
+            ];
+
+            match array_type {
+                Second | Millisecond => {
+                    v.append(&mut vec![
+                        Exact(vec![
+                            DataType::Interval(MonthDayNano),
+                            Time32(array_type),
+                            Time32(array_type),
+                        ]),
+                        Exact(vec![DataType::Interval(MonthDayNano), Time32(array_type)]),
+                        Exact(vec![
+                            DataType::Interval(DayTime),
+                            Time32(array_type),
+                            Time32(array_type),
+                        ]),
+                        Exact(vec![DataType::Interval(DayTime), Time32(array_type)]),
+                    ]);
+                }
+                Microsecond | Nanosecond => {
+                    v.append(&mut vec![
+                        Exact(vec![
+                            DataType::Interval(DayTime),
+                            Time64(array_type),
+                            Time64(array_type),
+                        ]),
+                        Exact(vec![DataType::Interval(DayTime), Time64(array_type)]),
+                        Exact(vec![
+                            DataType::Interval(MonthDayNano),
+                            Time64(array_type),
+                            Time64(array_type),
+                        ]),
+                        Exact(vec![DataType::Interval(MonthDayNano), Time64(array_type)]),
+                    ]);
+                }
+            }
+
+            v
         };
 
         let full_sig = [Nanosecond, Microsecond, Millisecond, Second]
@@ -176,28 +228,39 @@ impl ScalarUDFImpl for DateBinFunc {
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         match &arg_types[1] {
-            Timestamp(Nanosecond, None) | Utf8 | Null => Ok(Timestamp(Nanosecond, None)),
-            Timestamp(Nanosecond, tz_opt) => Ok(Timestamp(Nanosecond, tz_opt.clone())),
-            Timestamp(Microsecond, tz_opt) => Ok(Timestamp(Microsecond, tz_opt.clone())),
-            Timestamp(Millisecond, tz_opt) => Ok(Timestamp(Millisecond, tz_opt.clone())),
-            Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())),
+            Timestamp(tu, tz_opt) => Ok(Timestamp(*tu, tz_opt.clone())),
+            Time32(tu) => Ok(Time32(*tu)),
+            Time64(tu) => Ok(Time64(*tu)),
             _ => plan_err!(
-                "The date_bin function can only accept timestamp as the second arg."
+                "The date_bin function can only accept timestamp or time as the second arg."
             ),
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         if args.len() == 2 {
-            // Default to unix EPOCH
-            let origin = ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                Some(0),
-                Some("+00:00".into()),
-            ));
+            let origin = match args[1].data_type() {
+                Time32(Second) => {
+                    ColumnarValue::Scalar(ScalarValue::Time32Second(Some(0)))
+                }
+                Time32(Millisecond) => {
+                    ColumnarValue::Scalar(ScalarValue::Time32Millisecond(Some(0)))
+                }
+                Time64(Microsecond) => {
+                    ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(0)))
+                }
+                Time64(Nanosecond) => {
+                    ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(Some(0)))
+                }
+                _ => {
+                    // Default to unix EPOCH
+                    ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                        Some(0),
+                        Some("+00:00".into()),
+                    ))
+                }
+            };
             date_bin_impl(&args[0], &args[1], &origin)
         } else if args.len() == 3 {
             date_bin_impl(&args[0], &args[1], &args[2])
@@ -227,6 +290,18 @@ impl ScalarUDFImpl for DateBinFunc {
     }
 }
 
+const NANOS_PER_MICRO: i64 = 1_000;
+const NANOS_PER_MILLI: i64 = 1_000_000;
+const NANOS_PER_SEC: i64 = NANOSECONDS;
+/// Function type for binning timestamps into intervals
+///
+/// Arguments:
+/// * `stride` - Interval width (nanoseconds for time-based, months for month-based)
+/// * `source` - Timestamp to bin (nanoseconds since epoch)
+/// * `origin` - Origin timestamp (nanoseconds since epoch)
+///
+/// Returns: Binned timestamp in nanoseconds, or error if out of range
+type BinFunction = fn(i64, i64, i64) -> Result<i64>;
 enum Interval {
     Nanoseconds(i64),
     Months(i64),
@@ -241,7 +316,7 @@ impl Interval {
     /// `source` is the timestamp being binned
     ///
     /// `origin`  is the time, in nanoseconds, where windows are measured from
-    fn bin_fn(&self) -> (i64, fn(i64, i64, i64) -> i64) {
+    fn bin_fn(&self) -> (i64, BinFunction) {
         match self {
             Interval::Nanoseconds(nanos) => (*nanos, date_bin_nanos_interval),
             Interval::Months(months) => (*months, date_bin_months_interval),
@@ -250,13 +325,13 @@ impl Interval {
 }
 
 // return time in nanoseconds that the source timestamp falls into based on the stride and origin
-fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64 {
+fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> Result<i64> {
     let time_diff = source - origin;
 
     // distance from origin to bin
     let time_delta = compute_distance(time_diff, stride_nanos);
 
-    origin + time_delta
+    Ok(origin + time_delta)
 }
 
 // distance from origin to bin
@@ -272,10 +347,10 @@ fn compute_distance(time_diff: i64, stride: i64) -> i64 {
 }
 
 // return time in nanoseconds that the source timestamp falls into based on the stride and origin
-fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 {
+fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> Result<i64> {
     // convert source and origin to DateTime<Utc>
-    let source_date = to_utc_date_time(source);
-    let origin_date = to_utc_date_time(origin);
+    let source_date = to_utc_date_time(source)?;
+    let origin_date = to_utc_date_time(origin)?;
 
     // calculate the number of months between the source and origin
     let month_diff = (source_date.year() - origin_date.year()) * 12
@@ -286,9 +361,17 @@ fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64
     let month_delta = compute_distance(month_diff as i64, stride_months);
 
     let mut bin_time = if month_delta < 0 {
-        origin_date - Months::new(month_delta.unsigned_abs() as u32)
+        match origin_date
+            .checked_sub_months(Months::new(month_delta.unsigned_abs() as u32))
+        {
+            Some(dt) => dt,
+            None => return exec_err!("DATE_BIN month subtraction out of range"),
+        }
     } else {
-        origin_date + Months::new(month_delta as u32)
+        match origin_date.checked_add_months(Months::new(month_delta as u32)) {
+            Some(dt) => dt,
+            None => return exec_err!("DATE_BIN month addition out of range"),
+        }
     };
 
     // If origin is not midnight of first date of the month, the bin_time may be larger than the source
@@ -296,19 +379,32 @@ fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64
     if bin_time > source_date {
         let month_delta = month_delta - stride_months;
         bin_time = if month_delta < 0 {
-            origin_date - Months::new(month_delta.unsigned_abs() as u32)
+            match origin_date
+                .checked_sub_months(Months::new(month_delta.unsigned_abs() as u32))
+            {
+                Some(dt) => dt,
+                None => return exec_err!("DATE_BIN month subtraction out of range"),
+            }
         } else {
-            origin_date + Months::new(month_delta as u32)
+            match origin_date.checked_add_months(Months::new(month_delta as u32)) {
+                Some(dt) => dt,
+                None => return exec_err!("DATE_BIN month addition out of range"),
+            }
         };
     }
-
-    bin_time.timestamp_nanos_opt().unwrap()
+    match bin_time.timestamp_nanos_opt() {
+        Some(nanos) => Ok(nanos),
+        None => exec_err!("DATE_BIN result timestamp out of range"),
+    }
 }
 
-fn to_utc_date_time(nanos: i64) -> DateTime<Utc> {
-    let secs = nanos / 1_000_000_000;
-    let nsec = (nanos % 1_000_000_000) as u32;
-    DateTime::from_timestamp(secs, nsec).unwrap()
+fn to_utc_date_time(nanos: i64) -> Result<DateTime<Utc>> {
+    let secs = nanos / NANOS_PER_SEC;
+    let nsec = (nanos % NANOS_PER_SEC) as u32;
+    match DateTime::from_timestamp(secs, nsec) {
+        Some(dt) => Ok(dt),
+        None => exec_err!("Invalid timestamp value"),
+    }
 }
 
 // Supported intervals:
@@ -323,6 +419,12 @@ fn date_bin_impl(
     origin: &ColumnarValue,
 ) -> Result<ColumnarValue> {
     let stride = match stride {
+        ColumnarValue::Scalar(s) if s.is_null() => {
+            // NULL stride -> NULL result (standard SQL NULL propagation)
+            return Ok(ColumnarValue::Scalar(ScalarValue::try_from(
+                array.data_type(),
+            )?));
+        }
         ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => {
             let (days, ms) = IntervalDayTimeType::to_parts(*v);
             let nanos = (TimeDelta::try_days(days as i64).unwrap()
@@ -365,23 +467,105 @@ fn date_bin_impl(
         }
         ColumnarValue::Array(_) => {
             return not_impl_err!(
-            "DATE_BIN only supports literal values for the stride argument, not arrays"
-        );
+                "DATE_BIN only supports literal values for the stride argument, not arrays"
+            );
         }
     };
 
-    let origin = match origin {
-        ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => *v,
+    let (origin, is_time) = match origin {
+        ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => {
+            (*v, false)
+        }
+        ColumnarValue::Scalar(ScalarValue::Time32Millisecond(Some(v))) => {
+            match stride {
+                Interval::Months(m) => {
+                    if m > 0 {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+                Interval::Nanoseconds(ns) => {
+                    if ns >= NANOSECONDS_IN_DAY {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+            }
+
+            (*v as i64 * NANOS_PER_MILLI, true)
+        }
+        ColumnarValue::Scalar(ScalarValue::Time32Second(Some(v))) => {
+            match stride {
+                Interval::Months(m) => {
+                    if m > 0 {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+                Interval::Nanoseconds(ns) => {
+                    if ns >= NANOSECONDS_IN_DAY {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+            }
+
+            (*v as i64 * NANOS_PER_SEC, true)
+        }
+        ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(v))) => {
+            match stride {
+                Interval::Months(m) => {
+                    if m > 0 {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+                Interval::Nanoseconds(ns) => {
+                    if ns >= NANOSECONDS_IN_DAY {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+            }
+
+            (*v * NANOS_PER_MICRO, true)
+        }
+        ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(Some(v))) => {
+            match stride {
+                Interval::Months(m) => {
+                    if m > 0 {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+                Interval::Nanoseconds(ns) => {
+                    if ns >= NANOSECONDS_IN_DAY {
+                        return exec_err!(
+                            "DATE_BIN stride for TIME input must be less than 1 day"
+                        );
+                    }
+                }
+            }
+
+            (*v, true)
+        }
         ColumnarValue::Scalar(v) => {
             return exec_err!(
-                "DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got {}",
+                "DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision or a TIME but got {}",
                 v.data_type()
             );
         }
         ColumnarValue::Array(_) => {
             return not_impl_err!(
-            "DATE_BIN only supports literal values for the origin argument, not arrays"
-        );
+                "DATE_BIN only supports literal values for the origin argument, not arrays"
+            );
         }
     };
 
@@ -395,15 +579,18 @@ fn date_bin_impl(
     fn stride_map_fn<T: ArrowTimestampType>(
         origin: i64,
         stride: i64,
-        stride_fn: fn(i64, i64, i64) -> i64,
-    ) -> impl Fn(i64) -> i64 {
+        stride_fn: BinFunction,
+    ) -> impl Fn(i64) -> Result<i64> {
         let scale = match T::UNIT {
             Nanosecond => 1,
-            Microsecond => NANOSECONDS / 1_000_000,
-            Millisecond => NANOSECONDS / 1_000,
+            Microsecond => NANOS_PER_MICRO,
+            Millisecond => NANOS_PER_MILLI,
             Second => NANOSECONDS,
         };
-        move |x: i64| stride_fn(stride, x * scale, origin) / scale
+        move |x: i64| match stride_fn(stride, x * scale, origin) {
+            Ok(result) => Ok(result / scale),
+            Err(e) => Err(e),
+        }
     }
 
     Ok(match array {
@@ -411,7 +598,7 @@ fn date_bin_impl(
             let apply_stride_fn =
                 stride_map_fn::<TimestampNanosecondType>(origin, stride, stride_fn);
             ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                v.map(apply_stride_fn),
+                v.and_then(|val| apply_stride_fn(val).ok()),
                 tz_opt.clone(),
             ))
         }
@@ -419,7 +606,7 @@ fn date_bin_impl(
             let apply_stride_fn =
                 stride_map_fn::<TimestampMicrosecondType>(origin, stride, stride_fn);
             ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
-                v.map(apply_stride_fn),
+                v.and_then(|val| apply_stride_fn(val).ok()),
                 tz_opt.clone(),
             ))
         }
@@ -427,7 +614,7 @@ fn date_bin_impl(
             let apply_stride_fn =
                 stride_map_fn::<TimestampMillisecondType>(origin, stride, stride_fn);
             ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
-                v.map(apply_stride_fn),
+                v.and_then(|val| apply_stride_fn(val).ok()),
                 tz_opt.clone(),
             ))
         }
@@ -435,16 +622,69 @@ fn date_bin_impl(
             let apply_stride_fn =
                 stride_map_fn::<TimestampSecondType>(origin, stride, stride_fn);
             ColumnarValue::Scalar(ScalarValue::TimestampSecond(
-                v.map(apply_stride_fn),
+                v.and_then(|val| apply_stride_fn(val).ok()),
                 tz_opt.clone(),
             ))
         }
-
+        ColumnarValue::Scalar(ScalarValue::Time32Millisecond(v)) => {
+            if !is_time {
+                return exec_err!("DATE_BIN with Time32 source requires Time32 origin");
+            }
+            let result = v.and_then(|x| {
+                match stride_fn(stride, x as i64 * NANOS_PER_MILLI, origin) {
+                    Ok(binned_nanos) => {
+                        let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                        Some((nanos / NANOS_PER_MILLI) as i32)
+                    }
+                    Err(_) => None,
+                }
+            });
+            ColumnarValue::Scalar(ScalarValue::Time32Millisecond(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Time32Second(v)) => {
+            if !is_time {
+                return exec_err!("DATE_BIN with Time32 source requires Time32 origin");
+            }
+            let result = v.and_then(|x| {
+                match stride_fn(stride, x as i64 * NANOS_PER_SEC, origin) {
+                    Ok(binned_nanos) => {
+                        let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                        Some((nanos / NANOS_PER_SEC) as i32)
+                    }
+                    Err(_) => None,
+                }
+            });
+            ColumnarValue::Scalar(ScalarValue::Time32Second(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(v)) => {
+            if !is_time {
+                return exec_err!("DATE_BIN with Time64 source requires Time64 origin");
+            }
+            let result = v.and_then(|x| match stride_fn(stride, x, origin) {
+                Ok(binned_nanos) => Some(binned_nanos % (NANOSECONDS_IN_DAY)),
+                Err(_) => None,
+            });
+            ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(result))
+        }
+        ColumnarValue::Scalar(ScalarValue::Time64Microsecond(v)) => {
+            if !is_time {
+                return exec_err!("DATE_BIN with Time64 source requires Time64 origin");
+            }
+            let result =
+                v.and_then(|x| match stride_fn(stride, x * NANOS_PER_MICRO, origin) {
+                    Ok(binned_nanos) => {
+                        let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                        Some(nanos / NANOS_PER_MICRO)
+                    }
+                    Err(_) => None,
+                });
+            ColumnarValue::Scalar(ScalarValue::Time64Microsecond(result))
+        }
         ColumnarValue::Array(array) => {
             fn transform_array_with_stride<T>(
                 origin: i64,
                 stride: i64,
-                stride_fn: fn(i64, i64, i64) -> i64,
+                stride_fn: BinFunction,
                 array: &ArrayRef,
                 tz_opt: &Option<Arc<str>>,
             ) -> Result<ColumnarValue>
@@ -452,11 +692,22 @@ fn date_bin_impl(
                 T: ArrowTimestampType,
             {
                 let array = as_primitive_array::<T>(array)?;
-                let apply_stride_fn = stride_map_fn::<T>(origin, stride, stride_fn);
-                let array: PrimitiveArray<T> = array
-                    .unary(apply_stride_fn)
-                    .with_timezone_opt(tz_opt.clone());
-
+                let scale = match T::UNIT {
+                    Nanosecond => 1,
+                    Microsecond => NANOS_PER_MICRO,
+                    Millisecond => NANOS_PER_MILLI,
+                    Second => NANOSECONDS,
+                };
+
+                let result: PrimitiveArray<T> = array.try_unary(|val| {
+                    stride_fn(stride, val * scale, origin)
+                        .map(|binned| binned / scale)
+                        .map_err(|e| {
+                            arrow::error::ArrowError::ComputeError(e.to_string())
+                        })
+                })?;
+
+                let array = result.with_timezone_opt(tz_opt.clone());
                 Ok(ColumnarValue::Array(Arc::new(array)))
             }
 
@@ -481,9 +732,86 @@ fn date_bin_impl(
                         origin, stride, stride_fn, array, tz_opt,
                     )?
                 }
+                Time32(Millisecond) => {
+                    if !is_time {
+                        return exec_err!(
+                            "DATE_BIN with Time32 source requires Time32 origin"
+                        );
+                    }
+                    let array = array.as_primitive::<Time32MillisecondType>();
+                    let result: PrimitiveArray<Time32MillisecondType> =
+                        array.try_unary(|x| {
+                            stride_fn(stride, x as i64 * NANOS_PER_MILLI, origin)
+                                .map(|binned_nanos| {
+                                    let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                                    (nanos / NANOS_PER_MILLI) as i32
+                                })
+                                .map_err(|e| {
+                                    arrow::error::ArrowError::ComputeError(e.to_string())
+                                })
+                        })?;
+                    ColumnarValue::Array(Arc::new(result))
+                }
+                Time32(Second) => {
+                    if !is_time {
+                        return exec_err!(
+                            "DATE_BIN with Time32 source requires Time32 origin"
+                        );
+                    }
+                    let array = array.as_primitive::<Time32SecondType>();
+                    let result: PrimitiveArray<Time32SecondType> =
+                        array.try_unary(|x| {
+                            stride_fn(stride, x as i64 * NANOS_PER_SEC, origin)
+                                .map(|binned_nanos| {
+                                    let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                                    (nanos / NANOS_PER_SEC) as i32
+                                })
+                                .map_err(|e| {
+                                    arrow::error::ArrowError::ComputeError(e.to_string())
+                                })
+                        })?;
+                    ColumnarValue::Array(Arc::new(result))
+                }
+                Time64(Microsecond) => {
+                    if !is_time {
+                        return exec_err!(
+                            "DATE_BIN with Time64 source requires Time64 origin"
+                        );
+                    }
+                    let array = array.as_primitive::<Time64MicrosecondType>();
+                    let result: PrimitiveArray<Time64MicrosecondType> =
+                        array.try_unary(|x| {
+                            stride_fn(stride, x * NANOS_PER_MICRO, origin)
+                                .map(|binned_nanos| {
+                                    let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
+                                    nanos / NANOS_PER_MICRO
+                                })
+                                .map_err(|e| {
+                                    arrow::error::ArrowError::ComputeError(e.to_string())
+                                })
+                        })?;
+                    ColumnarValue::Array(Arc::new(result))
+                }
+                Time64(Nanosecond) => {
+                    if !is_time {
+                        return exec_err!(
+                            "DATE_BIN with Time64 source requires Time64 origin"
+                        );
+                    }
+                    let array = array.as_primitive::<Time64NanosecondType>();
+                    let result: PrimitiveArray<Time64NanosecondType> =
+                        array.try_unary(|x| {
+                            stride_fn(stride, x, origin)
+                                .map(|binned_nanos| binned_nanos % (NANOSECONDS_IN_DAY))
+                                .map_err(|e| {
+                                    arrow::error::ArrowError::ComputeError(e.to_string())
+                                })
+                        })?;
+                    ColumnarValue::Array(Arc::new(result))
+                }
                 _ => {
                     return exec_err!(
-                        "DATE_BIN expects source argument to be a TIMESTAMP but got {}",
+                        "DATE_BIN expects source argument to be a TIMESTAMP or TIME but got {}",
                         array.data_type()
                     );
                 }
@@ -491,7 +819,7 @@ fn date_bin_impl(
         }
         _ => {
             return exec_err!(
-                "DATE_BIN expects source argument to be a TIMESTAMP scalar or array"
+                "DATE_BIN expects source argument to be a TIMESTAMP or TIME scalar or array"
             );
         }
     })
@@ -501,7 +829,7 @@ fn date_bin_impl(
 mod tests {
     use std::sync::Arc;
 
-    use crate::datetime::date_bin::{date_bin_nanos_interval, DateBinFunc};
+    use crate::datetime::date_bin::{DateBinFunc, date_bin_nanos_interval};
     use arrow::array::types::TimestampNanosecondType;
     use arrow::array::{Array, IntervalDayTimeArray, TimestampNanosecondArray};
     use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
@@ -509,9 +837,10 @@ mod tests {
 
     use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano};
     use datafusion_common::{DataFusionError, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
     use chrono::TimeDelta;
+    use datafusion_common::config::ConfigOptions;
 
     fn invoke_date_bin_with_args(
         args: Vec<ColumnarValue>,
@@ -523,11 +852,12 @@ mod tests {
             .map(|arg| Field::new("a", arg.data_type(), true).into())
             .collect::<Vec<_>>();
 
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args,
             arg_fields,
             number_rows,
             return_field: Arc::clone(return_field),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         DateBinFunc::new().invoke_with_args(args)
     }
@@ -685,7 +1015,7 @@ mod tests {
         let res = invoke_date_bin_with_args(args, 1, return_field);
         assert_eq!(
             res.err().unwrap().strip_backtrace(),
-            "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(Microsecond, None)"
+            "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision or a TIME but got Timestamp(µs)"
         );
 
         args = vec![
@@ -741,7 +1071,7 @@ mod tests {
 
     #[test]
     fn test_date_bin_timezones() {
-        let cases = vec![
+        let cases = [
             (
                 vec![
                     "2020-09-08T00:00:00Z",
@@ -881,7 +1211,7 @@ mod tests {
 
     #[test]
     fn test_date_bin_single() {
-        let cases = vec![
+        let cases = [
             (
                 (
                     TimeDelta::try_minutes(15),
@@ -933,7 +1263,7 @@ mod tests {
                 let origin1 = string_to_timestamp_nanos(origin).unwrap();
 
                 let expected1 = string_to_timestamp_nanos(expected).unwrap();
-                let result = date_bin_nanos_interval(stride1, source1, origin1);
+                let result = date_bin_nanos_interval(stride1, source1, origin1).unwrap();
                 assert_eq!(result, expected1, "{source} = {expected}");
             })
     }
@@ -961,8 +1291,55 @@ mod tests {
             let source1 = string_to_timestamp_nanos(source).unwrap();
 
             let expected1 = string_to_timestamp_nanos(expected).unwrap();
-            let result = date_bin_nanos_interval(stride1, source1, 0);
+            let result = date_bin_nanos_interval(stride1, source1, 0).unwrap();
             assert_eq!(result, expected1, "{source} = {expected}");
         })
     }
+
+    #[test]
+    fn test_date_bin_out_of_range() {
+        let return_field = &Arc::new(Field::new(
+            "f",
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            true,
+        ));
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1637426858, 0, 0)),
+            ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
+                Some(1040292460),
+                None,
+            )),
+            ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                Some(string_to_timestamp_nanos("1984-01-07 00:00:00").unwrap()),
+                None,
+            )),
+        ];
+
+        let result = invoke_date_bin_with_args(args, 1, return_field);
+        assert!(result.is_ok());
+        if let ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(val, _)) =
+            result.unwrap()
+        {
+            assert!(val.is_none(), "Expected None for out of range operation");
+        }
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1637426858, 0, 0)),
+            ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
+                Some(-1040292460),
+                None,
+            )),
+            ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                Some(string_to_timestamp_nanos("1984-01-07 00:00:00").unwrap()),
+                None,
+            )),
+        ];
+
+        let result = invoke_date_bin_with_args(args, 1, return_field);
+        assert!(result.is_ok());
+        if let ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(val, _)) =
+            result.unwrap()
+        {
+            assert!(val.is_none(), "Expected None for out of range operation");
+        }
+    }
 }
diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs
index 021000dc100b8..7a60bb883a0d2 100644
--- a/datafusion/functions/src/datetime/date_part.rs
+++ b/datafusion/functions/src/datetime/date_part.rs
@@ -19,19 +19,27 @@ use std::any::Any;
 use std::str::FromStr;
 use std::sync::Arc;
 
-use arrow::array::{Array, ArrayRef, Float64Array, Int32Array};
+use arrow::array::timezone::Tz;
+use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, Int64Array};
 use arrow::compute::kernels::cast_utils::IntervalUnit;
-use arrow::compute::{binary, date_part, DatePart};
+use arrow::compute::{DatePart, binary, date_part};
 use arrow::datatypes::DataType::{
     Date32, Date64, Duration, Interval, Time32, Time64, Timestamp,
 };
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
-use datafusion_common::types::{logical_date, NativeType};
+use arrow::datatypes::{
+    ArrowTimestampType, DataType, Date32Type, Date64Type, Field, FieldRef,
+    IntervalUnit as ArrowIntervalUnit, TimeUnit, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
+};
+use chrono::{Datelike, NaiveDate};
+use datafusion_common::types::{NativeType, logical_date};
 
 use datafusion_common::{
+    Result, ScalarValue,
     cast::{
-        as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array,
+        as_date32_array, as_date64_array, as_int32_array, as_interval_dt_array,
+        as_interval_mdn_array, as_interval_ym_array, as_time32_millisecond_array,
         as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array,
         as_timestamp_microsecond_array, as_timestamp_millisecond_array,
         as_timestamp_nanosecond_array, as_timestamp_second_array,
@@ -39,11 +47,12 @@ use datafusion_common::{
     exec_err, internal_err, not_impl_err,
     types::logical_string,
     utils::take_function_args,
-    Result, ScalarValue,
 };
+use datafusion_expr::preimage::PreimageResult;
+use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature,
-    TypeSignature, Volatility,
+    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, Volatility, interval_arithmetic,
 };
 use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
 use datafusion_macros::user_doc;
@@ -56,8 +65,9 @@ use datafusion_macros::user_doc;
     argument(
         name = "part",
         description = r#"Part of the date to return. The following date parts are supported:
-        
+
     - year
+    - isoyear (ISO 8601 week-numbering year)
     - quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in)
     - month
     - week (week of the year)
@@ -68,9 +78,10 @@ use datafusion_macros::user_doc;
     - millisecond
     - microsecond
     - nanosecond
-    - dow (day of the week)
+    - dow (day of the week where Sunday is 0)
     - doy (day of the year)
-    - epoch (seconds since Unix epoch)
+    - epoch (seconds since Unix epoch for timestamps/dates, total seconds for intervals)
+    - isodow (day of the week where Monday is 0)
 "#
     ),
     argument(
@@ -78,7 +89,7 @@ use datafusion_macros::user_doc;
         description = "Time expression to operate on. Can be a constant, column, or function."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct DatePartFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -147,6 +158,7 @@ impl ScalarUDFImpl for DatePartFunc {
 
     fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
         let [field, _] = take_function_args(self.name(), args.scalar_arguments)?;
+        let nullable = args.arg_fields[1].is_nullable();
 
         field
             .and_then(|sv| {
@@ -155,9 +167,12 @@ impl ScalarUDFImpl for DatePartFunc {
                     .filter(|s| !s.is_empty())
                     .map(|part| {
                         if is_epoch(part) {
-                            Field::new(self.name(), DataType::Float64, true)
+                            Field::new(self.name(), DataType::Float64, nullable)
+                        } else if is_nanosecond(part) {
+                            // See notes on [seconds_ns] for rationale
+                            Field::new(self.name(), DataType::Int64, nullable)
                         } else {
-                            Field::new(self.name(), DataType::Int32, true)
+                            Field::new(self.name(), DataType::Int32, nullable)
                         }
                     })
             })
@@ -168,10 +183,7 @@ impl ScalarUDFImpl for DatePartFunc {
             )
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = args.args;
         let [part, array] = take_function_args(self.name(), args)?;
 
@@ -207,16 +219,18 @@ impl ScalarUDFImpl for DatePartFunc {
                 IntervalUnit::Second => seconds_as_i32(array.as_ref(), Second)?,
                 IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?,
                 IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?,
-                IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?,
+                IntervalUnit::Nanosecond => seconds_ns(array.as_ref())?,
                 // century and decade are not supported by `DatePart`, although they are supported in postgres
                 _ => return exec_err!("Date part '{part}' not supported"),
             }
         } else {
             // special cases that can be extracted (in postgres) but are not interval units
             match part_trim.to_lowercase().as_str() {
+                "isoyear" => date_part(array.as_ref(), DatePart::YearISO)?,
                 "qtr" | "quarter" => date_part(array.as_ref(), DatePart::Quarter)?,
                 "doy" => date_part(array.as_ref(), DatePart::DayOfYear)?,
                 "dow" => date_part(array.as_ref(), DatePart::DayOfWeekSunday0)?,
+                "isodow" => date_part(array.as_ref(), DatePart::DayOfWeekMonday0)?,
                 "epoch" => epoch(array.as_ref())?,
                 _ => return exec_err!("Date part '{part}' not supported"),
             }
@@ -229,9 +243,75 @@ impl ScalarUDFImpl for DatePartFunc {
         })
     }
 
+    // Only casting the year is supported since pruning other IntervalUnit is not possible
+    // date_part(col, YEAR) = 2024 => col >= '2024-01-01' and col < '2025-01-01'
+    // But for anything less than YEAR simplifying is not possible without specifying the bigger interval
+    // date_part(col, MONTH) = 1 => col = '2023-01-01' or col = '2024-01-01' or ... or col = '3000-01-01'
+    fn preimage(
+        &self,
+        args: &[Expr],
+        lit_expr: &Expr,
+        info: &SimplifyContext,
+    ) -> Result<PreimageResult> {
+        let [part, col_expr] = take_function_args(self.name(), args)?;
+
+        // Get the interval unit from the part argument
+        let interval_unit = part
+            .as_literal()
+            .and_then(|sv| sv.try_as_str().flatten())
+            .map(part_normalization)
+            .and_then(|s| IntervalUnit::from_str(s).ok());
+
+        // only support extracting year
+        match interval_unit {
+            Some(IntervalUnit::Year) => (),
+            _ => return Ok(PreimageResult::None),
+        }
+
+        // Check if the argument is a literal (e.g. date_part(YEAR, col) = 2024)
+        let Some(argument_literal) = lit_expr.as_literal() else {
+            return Ok(PreimageResult::None);
+        };
+
+        // Extract i32 year from Scalar value
+        let year = match argument_literal {
+            ScalarValue::Int32(Some(y)) => *y,
+            _ => return Ok(PreimageResult::None),
+        };
+
+        // Can only extract year from Date32/64 and Timestamp column
+        let target_type = match info.get_data_type(col_expr)? {
+            Date32 | Date64 | Timestamp(_, _) => &info.get_data_type(col_expr)?,
+            _ => return Ok(PreimageResult::None),
+        };
+
+        // Compute the Interval bounds
+        let Some(start_time) = NaiveDate::from_ymd_opt(year, 1, 1) else {
+            return Ok(PreimageResult::None);
+        };
+        let Some(end_time) = start_time.with_year(year + 1) else {
+            return Ok(PreimageResult::None);
+        };
+
+        // Convert to ScalarValues
+        let (Some(lower), Some(upper)) = (
+            date_to_scalar(start_time, target_type),
+            date_to_scalar(end_time, target_type),
+        ) else {
+            return Ok(PreimageResult::None);
+        };
+        let interval = Box::new(interval_arithmetic::Interval::try_new(lower, upper)?);
+
+        Ok(PreimageResult::Range {
+            expr: col_expr.clone(),
+            interval,
+        })
+    }
+
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -242,6 +322,53 @@ fn is_epoch(part: &str) -> bool {
     matches!(part.to_lowercase().as_str(), "epoch")
 }
 
+fn is_nanosecond(part: &str) -> bool {
+    IntervalUnit::from_str(part_normalization(part))
+        .map(|p| matches!(p, IntervalUnit::Nanosecond))
+        .unwrap_or(false)
+}
+
+fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option<ScalarValue> {
+    Some(match target_type {
+        Date32 => ScalarValue::Date32(Some(Date32Type::from_naive_date(date))),
+        Date64 => ScalarValue::Date64(Some(Date64Type::from_naive_date(date))),
+
+        Timestamp(unit, tz_opt) => {
+            let naive_midnight = date.and_hms_opt(0, 0, 0)?;
+            let tz: Option<Tz> = tz_opt.clone().and_then(|s| s.parse().ok());
+
+            match unit {
+                Second => ScalarValue::TimestampSecond(
+                    TimestampSecondType::from_naive_datetime(naive_midnight, tz.as_ref()),
+                    tz_opt.clone(),
+                ),
+                Millisecond => ScalarValue::TimestampMillisecond(
+                    TimestampMillisecondType::from_naive_datetime(
+                        naive_midnight,
+                        tz.as_ref(),
+                    ),
+                    tz_opt.clone(),
+                ),
+                Microsecond => ScalarValue::TimestampMicrosecond(
+                    TimestampMicrosecondType::from_naive_datetime(
+                        naive_midnight,
+                        tz.as_ref(),
+                    ),
+                    tz_opt.clone(),
+                ),
+                Nanosecond => ScalarValue::TimestampNanosecond(
+                    TimestampNanosecondType::from_naive_datetime(
+                        naive_midnight,
+                        tz.as_ref(),
+                    ),
+                    tz_opt.clone(),
+                ),
+            }
+        }
+        _ => return None,
+    })
+}
+
 // Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error
 fn part_normalization(part: &str) -> &str {
     part.strip_prefix(|c| c == '\'' || c == '\"')
@@ -346,6 +473,11 @@ fn seconds(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
 
 fn epoch(array: &dyn Array) -> Result<ArrayRef> {
     const SECONDS_IN_A_DAY: f64 = 86400_f64;
+    // Note: Month-to-second conversion uses 30 days as an approximation.
+    // This matches PostgreSQL's behavior for interval epoch extraction,
+    // but does not represent exact calendar months (which vary 28-31 days).
+    // See: https://doxygen.postgresql.org/datatype_2timestamp_8h.html
+    const DAYS_PER_MONTH: f64 = 30_f64;
 
     let f: Float64Array = match array.data_type() {
         Timestamp(Second, _) => as_timestamp_second_array(array)?.unary(|x| x as f64),
@@ -370,8 +502,56 @@ fn epoch(array: &dyn Array) -> Result<ArrayRef> {
         Time64(Nanosecond) => {
             as_time64_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64)
         }
-        Interval(_) | Duration(_) => return seconds(array, Second),
+        Interval(ArrowIntervalUnit::YearMonth) => as_interval_ym_array(array)?
+            .unary(|x| x as f64 * DAYS_PER_MONTH * SECONDS_IN_A_DAY),
+        Interval(ArrowIntervalUnit::DayTime) => as_interval_dt_array(array)?.unary(|x| {
+            x.days as f64 * SECONDS_IN_A_DAY + x.milliseconds as f64 / 1_000_f64
+        }),
+        Interval(ArrowIntervalUnit::MonthDayNano) => {
+            as_interval_mdn_array(array)?.unary(|x| {
+                x.months as f64 * DAYS_PER_MONTH * SECONDS_IN_A_DAY
+                    + x.days as f64 * SECONDS_IN_A_DAY
+                    + x.nanoseconds as f64 / 1_000_000_000_f64
+            })
+        }
+        Duration(_) => return seconds(array, Second),
         d => return exec_err!("Cannot convert {d:?} to epoch"),
     };
     Ok(Arc::new(f))
 }
+
+/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the
+/// result to a total number of nanoseconds as an Int64 array.
+///
+/// This returns an Int64 rather than Int32 because  there 1 billion
+/// `nanosecond`s in each second, so representing up to 60 seconds as
+/// nanoseconds can be values up to 60 billion, which does not fit in Int32.
+fn seconds_ns(array: &dyn Array) -> Result<ArrayRef> {
+    let secs = date_part(array, DatePart::Second)?;
+    // This assumes array is primitive and not a dictionary
+    let secs = as_int32_array(secs.as_ref())?;
+    let subsecs = date_part(array, DatePart::Nanosecond)?;
+    let subsecs = as_int32_array(subsecs.as_ref())?;
+
+    // Special case where there are no nulls.
+    if subsecs.null_count() == 0 {
+        let r: Int64Array = binary(secs, subsecs, |secs, subsecs| {
+            (secs as i64) * 1_000_000_000 + (subsecs as i64)
+        })?;
+        Ok(Arc::new(r))
+    } else {
+        // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case
+        // where the number of nanoseconds overflows.
+        let r: Int64Array = secs
+            .iter()
+            .zip(subsecs)
+            .map(|(secs, subsecs)| {
+                secs.map(|secs| {
+                    let subsecs = subsecs.unwrap_or(0);
+                    (secs as i64) * 1_000_000_000 + (subsecs as i64)
+                })
+            })
+            .collect();
+        Ok(Arc::new(r))
+    }
+}
diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs
index 8963ef77a53b9..ef9896cead5a0 100644
--- a/datafusion/functions/src/datetime/date_trunc.rs
+++ b/datafusion/functions/src/datetime/date_trunc.rs
@@ -16,42 +16,135 @@
 // under the License.
 
 use std::any::Any;
+use std::num::NonZeroI64;
 use std::ops::{Add, Sub};
 use std::str::FromStr;
 use std::sync::Arc;
 
 use arrow::array::temporal_conversions::{
-    as_datetime_with_timezone, timestamp_ns_to_datetime,
+    MICROSECONDS, MILLISECONDS, NANOSECONDS, as_datetime_with_timezone,
+    timestamp_ns_to_datetime,
 };
 use arrow::array::timezone::Tz;
 use arrow::array::types::{
-    ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType,
+    ArrowTimestampType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType,
     TimestampNanosecondType, TimestampSecondType,
 };
-use arrow::array::{Array, PrimitiveArray};
-use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View};
+use arrow::array::{Array, ArrayRef, PrimitiveArray};
+use arrow::datatypes::DataType::{self, Time32, Time64, Timestamp};
 use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second};
+use arrow::datatypes::{Field, FieldRef};
 use datafusion_common::cast::as_primitive_array;
-use datafusion_common::{exec_err, plan_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::types::{NativeType, logical_date, logical_string};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err,
+};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
-use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD,
+    ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, Volatility,
 };
+use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
 use datafusion_macros::user_doc;
 
 use chrono::{
     DateTime, Datelike, Duration, LocalResult, NaiveDateTime, Offset, TimeDelta, Timelike,
 };
 
+/// Represents the granularity for date truncation operations
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum DateTruncGranularity {
+    Microsecond,
+    Millisecond,
+    Second,
+    Minute,
+    Hour,
+    Day,
+    Week,
+    Month,
+    Quarter,
+    Year,
+}
+
+impl DateTruncGranularity {
+    /// List of all supported granularity values
+    /// Cannot use HashMap here as it would require lazy_static or once_cell,
+    /// Rust does not support const HashMap yet.
+    const SUPPORTED_GRANULARITIES: &[&str] = &[
+        "microsecond",
+        "millisecond",
+        "second",
+        "minute",
+        "hour",
+        "day",
+        "week",
+        "month",
+        "quarter",
+        "year",
+    ];
+
+    /// Parse a granularity string into a DateTruncGranularity enum
+    fn from_str(s: &str) -> Result<Self> {
+        // Using match for O(1) lookup - compiler optimizes this into a jump table or perfect hash
+        match s.to_lowercase().as_str() {
+            "microsecond" => Ok(Self::Microsecond),
+            "millisecond" => Ok(Self::Millisecond),
+            "second" => Ok(Self::Second),
+            "minute" => Ok(Self::Minute),
+            "hour" => Ok(Self::Hour),
+            "day" => Ok(Self::Day),
+            "week" => Ok(Self::Week),
+            "month" => Ok(Self::Month),
+            "quarter" => Ok(Self::Quarter),
+            "year" => Ok(Self::Year),
+            _ => {
+                let supported = Self::SUPPORTED_GRANULARITIES.join(", ");
+                exec_err!(
+                    "Unsupported date_trunc granularity: '{s}'. Supported values are: {supported}"
+                )
+            }
+        }
+    }
+
+    /// Returns true if this granularity can be handled with simple arithmetic
+    /// (fine granularity: second, minute, millisecond, microsecond)
+    fn is_fine_granularity(&self) -> bool {
+        matches!(
+            self,
+            Self::Second | Self::Minute | Self::Millisecond | Self::Microsecond
+        )
+    }
+
+    /// Returns true if this granularity can be handled with simple arithmetic in UTC
+    /// (hour and day in addition to fine granularities)
+    fn is_fine_granularity_utc(&self) -> bool {
+        self.is_fine_granularity() || matches!(self, Self::Hour | Self::Day)
+    }
+
+    /// Returns true if this granularity is valid for Time types
+    /// Time types don't have date components, so day/week/month/quarter/year are not valid
+    fn valid_for_time(&self) -> bool {
+        matches!(
+            self,
+            Self::Hour
+                | Self::Minute
+                | Self::Second
+                | Self::Millisecond
+                | Self::Microsecond
+        )
+    }
+}
+
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Truncates a timestamp value to a specified precision.",
+    description = "Truncates a timestamp or time value to a specified precision.",
     syntax_example = "date_trunc(precision, expression)",
     argument(
         name = "precision",
         description = r#"Time precision to truncate to. The following precisions are supported:
 
+    For Timestamp types:
     - year / YEAR
     - quarter / QUARTER
     - month / MONTH
@@ -60,14 +153,23 @@ use chrono::{
     - hour / HOUR
     - minute / MINUTE
     - second / SECOND
+    - millisecond / MILLISECOND
+    - microsecond / MICROSECOND
+
+    For Time types (hour, minute, second, millisecond, microsecond only):
+    - hour / HOUR
+    - minute / MINUTE
+    - second / SECOND
+    - millisecond / MILLISECOND
+    - microsecond / MICROSECOND
 "#
     ),
     argument(
         name = "expression",
-        description = "Time expression to operate on. Can be a constant, column, or function."
+        description = "Timestamp or time expression to operate on. Can be a constant, column, or function."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct DateTruncFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -84,45 +186,21 @@ impl DateTruncFunc {
         Self {
             signature: Signature::one_of(
                 vec![
-                    Exact(vec![Utf8, Timestamp(Nanosecond, None)]),
-                    Exact(vec![Utf8View, Timestamp(Nanosecond, None)]),
-                    Exact(vec![
-                        Utf8,
-                        Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![
-                        Utf8View,
-                        Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_implicit(
+                            TypeSignatureClass::Timestamp,
+                            // Allow implicit cast from string and date to timestamp for backward compatibility
+                            vec![
+                                TypeSignatureClass::Native(logical_string()),
+                                TypeSignatureClass::Native(logical_date()),
+                            ],
+                            NativeType::Timestamp(Nanosecond, None),
+                        ),
                     ]),
-                    Exact(vec![Utf8, Timestamp(Microsecond, None)]),
-                    Exact(vec![Utf8View, Timestamp(Microsecond, None)]),
-                    Exact(vec![
-                        Utf8,
-                        Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![
-                        Utf8View,
-                        Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![Utf8, Timestamp(Millisecond, None)]),
-                    Exact(vec![Utf8View, Timestamp(Millisecond, None)]),
-                    Exact(vec![
-                        Utf8,
-                        Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![
-                        Utf8View,
-                        Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![Utf8, Timestamp(Second, None)]),
-                    Exact(vec![Utf8View, Timestamp(Second, None)]),
-                    Exact(vec![
-                        Utf8,
-                        Timestamp(Second, Some(TIMEZONE_WILDCARD.into())),
-                    ]),
-                    Exact(vec![
-                        Utf8View,
-                        Timestamp(Second, Some(TIMEZONE_WILDCARD.into())),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Time),
                     ]),
                 ],
                 Volatility::Immutable,
@@ -145,67 +223,92 @@ impl ScalarUDFImpl for DateTruncFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match &arg_types[1] {
-            Timestamp(Nanosecond, None) | Utf8 | DataType::Date32 | Null => {
-                Ok(Timestamp(Nanosecond, None))
-            }
-            Timestamp(Nanosecond, tz_opt) => Ok(Timestamp(Nanosecond, tz_opt.clone())),
-            Timestamp(Microsecond, tz_opt) => Ok(Timestamp(Microsecond, tz_opt.clone())),
-            Timestamp(Millisecond, tz_opt) => Ok(Timestamp(Millisecond, tz_opt.clone())),
-            Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())),
-            _ => plan_err!(
-                "The date_trunc function can only accept timestamp as the second arg."
-            ),
-        }
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be called instead")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let field = &args.arg_fields[1];
+        let return_type = if field.data_type().is_null() {
+            Timestamp(Nanosecond, None)
+        } else {
+            field.data_type().clone()
+        };
+        Ok(Arc::new(Field::new(
+            self.name(),
+            return_type,
+            field.is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = args.args;
         let (granularity, array) = (&args[0], &args[1]);
 
-        let granularity = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) =
+        let granularity_str = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) =
             granularity
         {
             v.to_lowercase()
         } else if let ColumnarValue::Scalar(ScalarValue::Utf8View(Some(v))) = granularity
+        {
+            v.to_lowercase()
+        } else if let ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(v))) = granularity
         {
             v.to_lowercase()
         } else {
             return exec_err!("Granularity of `date_trunc` must be non-null scalar Utf8");
         };
 
+        let granularity = DateTruncGranularity::from_str(&granularity_str)?;
+
+        // Check upfront if granularity is valid for Time types
+        let is_time_type = matches!(array.data_type(), Time64(_) | Time32(_));
+        if is_time_type && !granularity.valid_for_time() {
+            return exec_err!(
+                "date_trunc does not support '{}' granularity for Time types. Valid values are: hour, minute, second, millisecond, microsecond",
+                granularity_str
+            );
+        }
+
         fn process_array<T: ArrowTimestampType>(
             array: &dyn Array,
-            granularity: String,
+            granularity: DateTruncGranularity,
             tz_opt: &Option<Arc<str>>,
         ) -> Result<ColumnarValue> {
             let parsed_tz = parse_tz(tz_opt)?;
             let array = as_primitive_array::<T>(array)?;
+
+            // fast path for fine granularity
+            // For modern timezones, it's correct to truncate "minute" in this way.
+            // Both datafusion and arrow are ignoring historical timezone's non-minute granularity
+            // bias (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16).
+            // In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic
+            if granularity.is_fine_granularity()
+                || (parsed_tz.is_none() && granularity.is_fine_granularity_utc())
+            {
+                let result = general_date_trunc_array_fine_granularity(
+                    T::UNIT,
+                    array,
+                    granularity,
+                    tz_opt.clone(),
+                )?;
+                return Ok(ColumnarValue::Array(result));
+            }
+
             let array: PrimitiveArray<T> = array
-                .try_unary(|x| {
-                    general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str())
-                })?
+                .try_unary(|x| general_date_trunc(T::UNIT, x, parsed_tz, granularity))?
                 .with_timezone_opt(tz_opt.clone());
             Ok(ColumnarValue::Array(Arc::new(array)))
         }
 
         fn process_scalar<T: ArrowTimestampType>(
             v: &Option<i64>,
-            granularity: String,
+            granularity: DateTruncGranularity,
             tz_opt: &Option<Arc<str>>,
         ) -> Result<ColumnarValue> {
             let parsed_tz = parse_tz(tz_opt)?;
             let value = if let Some(v) = v {
-                Some(general_date_trunc(
-                    T::UNIT,
-                    *v,
-                    parsed_tz,
-                    granularity.as_str(),
-                )?)
+                Some(general_date_trunc(T::UNIT, *v, parsed_tz, granularity)?)
             } else {
                 None
             };
@@ -214,6 +317,10 @@ impl ScalarUDFImpl for DateTruncFunc {
         }
 
         Ok(match array {
+            ColumnarValue::Scalar(ScalarValue::Null) => {
+                // NULL input returns NULL timestamp
+                ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(None, None))
+            }
             ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => {
                 process_scalar::<TimestampNanosecondType>(v, granularity, tz_opt)?
             }
@@ -226,38 +333,77 @@ impl ScalarUDFImpl for DateTruncFunc {
             ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => {
                 process_scalar::<TimestampSecondType>(v, granularity, tz_opt)?
             }
+            ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(v)) => {
+                let truncated = v.map(|val| truncate_time_nanos(val, granularity));
+                ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(truncated))
+            }
+            ColumnarValue::Scalar(ScalarValue::Time64Microsecond(v)) => {
+                let truncated = v.map(|val| truncate_time_micros(val, granularity));
+                ColumnarValue::Scalar(ScalarValue::Time64Microsecond(truncated))
+            }
+            ColumnarValue::Scalar(ScalarValue::Time32Millisecond(v)) => {
+                let truncated = v.map(|val| truncate_time_millis(val, granularity));
+                ColumnarValue::Scalar(ScalarValue::Time32Millisecond(truncated))
+            }
+            ColumnarValue::Scalar(ScalarValue::Time32Second(v)) => {
+                let truncated = v.map(|val| truncate_time_secs(val, granularity));
+                ColumnarValue::Scalar(ScalarValue::Time32Second(truncated))
+            }
             ColumnarValue::Array(array) => {
                 let array_type = array.data_type();
-                if let Timestamp(unit, tz_opt) = array_type {
-                    match unit {
-                        Second => process_array::<TimestampSecondType>(
-                            array,
-                            granularity,
-                            tz_opt,
-                        )?,
-                        Millisecond => process_array::<TimestampMillisecondType>(
-                            array,
-                            granularity,
-                            tz_opt,
-                        )?,
-                        Microsecond => process_array::<TimestampMicrosecondType>(
-                            array,
-                            granularity,
-                            tz_opt,
-                        )?,
-                        Nanosecond => process_array::<TimestampNanosecondType>(
-                            array,
-                            granularity,
-                            tz_opt,
-                        )?,
+                match array_type {
+                    Timestamp(Second, tz_opt) => {
+                        process_array::<TimestampSecondType>(array, granularity, tz_opt)?
+                    }
+                    Timestamp(Millisecond, tz_opt) => process_array::<
+                        TimestampMillisecondType,
+                    >(
+                        array, granularity, tz_opt
+                    )?,
+                    Timestamp(Microsecond, tz_opt) => process_array::<
+                        TimestampMicrosecondType,
+                    >(
+                        array, granularity, tz_opt
+                    )?,
+                    Timestamp(Nanosecond, tz_opt) => process_array::<
+                        TimestampNanosecondType,
+                    >(
+                        array, granularity, tz_opt
+                    )?,
+                    Time64(Nanosecond) => {
+                        let arr = as_primitive_array::<Time64NanosecondType>(array)?;
+                        let result: PrimitiveArray<Time64NanosecondType> =
+                            arr.unary(|v| truncate_time_nanos(v, granularity));
+                        ColumnarValue::Array(Arc::new(result))
+                    }
+                    Time64(Microsecond) => {
+                        let arr = as_primitive_array::<Time64MicrosecondType>(array)?;
+                        let result: PrimitiveArray<Time64MicrosecondType> =
+                            arr.unary(|v| truncate_time_micros(v, granularity));
+                        ColumnarValue::Array(Arc::new(result))
+                    }
+                    Time32(Millisecond) => {
+                        let arr = as_primitive_array::<Time32MillisecondType>(array)?;
+                        let result: PrimitiveArray<Time32MillisecondType> =
+                            arr.unary(|v| truncate_time_millis(v, granularity));
+                        ColumnarValue::Array(Arc::new(result))
+                    }
+                    Time32(Second) => {
+                        let arr = as_primitive_array::<Time32SecondType>(array)?;
+                        let result: PrimitiveArray<Time32SecondType> =
+                            arr.unary(|v| truncate_time_secs(v, granularity));
+                        ColumnarValue::Array(Arc::new(result))
+                    }
+                    _ => {
+                        return exec_err!(
+                            "second argument of `date_trunc` is an unsupported array type: {array_type}"
+                        );
                     }
-                } else {
-                    return exec_err!("second argument of `date_trunc` is an unsupported array type: {array_type}");
                 }
             }
             _ => {
                 return exec_err!(
-                    "second argument of `date_trunc` must be timestamp scalar or array"
+                    "second argument of `date_trunc` must be timestamp, time scalar or array"
                 );
             }
         })
@@ -283,27 +429,100 @@ impl ScalarUDFImpl for DateTruncFunc {
     }
 }
 
-fn _date_trunc_coarse<T>(granularity: &str, value: Option<T>) -> Result<Option<T>>
+const NANOS_PER_MICROSECOND: i64 = NANOSECONDS / MICROSECONDS;
+const NANOS_PER_MILLISECOND: i64 = NANOSECONDS / MILLISECONDS;
+const NANOS_PER_SECOND: i64 = NANOSECONDS;
+const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE;
+
+const MICROS_PER_MILLISECOND: i64 = MICROSECONDS / MILLISECONDS;
+const MICROS_PER_SECOND: i64 = MICROSECONDS;
+const MICROS_PER_MINUTE: i64 = 60 * MICROS_PER_SECOND;
+const MICROS_PER_HOUR: i64 = 60 * MICROS_PER_MINUTE;
+
+const MILLIS_PER_SECOND: i32 = MILLISECONDS as i32;
+const MILLIS_PER_MINUTE: i32 = 60 * MILLIS_PER_SECOND;
+const MILLIS_PER_HOUR: i32 = 60 * MILLIS_PER_MINUTE;
+
+const SECS_PER_MINUTE: i32 = 60;
+const SECS_PER_HOUR: i32 = 60 * SECS_PER_MINUTE;
+
+/// Truncate time in nanoseconds to the specified granularity
+fn truncate_time_nanos(value: i64, granularity: DateTruncGranularity) -> i64 {
+    match granularity {
+        DateTruncGranularity::Hour => value - (value % NANOS_PER_HOUR),
+        DateTruncGranularity::Minute => value - (value % NANOS_PER_MINUTE),
+        DateTruncGranularity::Second => value - (value % NANOS_PER_SECOND),
+        DateTruncGranularity::Millisecond => value - (value % NANOS_PER_MILLISECOND),
+        DateTruncGranularity::Microsecond => value - (value % NANOS_PER_MICROSECOND),
+        // Other granularities are not valid for time - should be caught earlier
+        _ => value,
+    }
+}
+
+/// Truncate time in microseconds to the specified granularity
+fn truncate_time_micros(value: i64, granularity: DateTruncGranularity) -> i64 {
+    match granularity {
+        DateTruncGranularity::Hour => value - (value % MICROS_PER_HOUR),
+        DateTruncGranularity::Minute => value - (value % MICROS_PER_MINUTE),
+        DateTruncGranularity::Second => value - (value % MICROS_PER_SECOND),
+        DateTruncGranularity::Millisecond => value - (value % MICROS_PER_MILLISECOND),
+        DateTruncGranularity::Microsecond => value, // Already at microsecond precision
+        // Other granularities are not valid for time
+        _ => value,
+    }
+}
+
+/// Truncate time in milliseconds to the specified granularity
+fn truncate_time_millis(value: i32, granularity: DateTruncGranularity) -> i32 {
+    match granularity {
+        DateTruncGranularity::Hour => value - (value % MILLIS_PER_HOUR),
+        DateTruncGranularity::Minute => value - (value % MILLIS_PER_MINUTE),
+        DateTruncGranularity::Second => value - (value % MILLIS_PER_SECOND),
+        DateTruncGranularity::Millisecond => value, // Already at millisecond precision
+        DateTruncGranularity::Microsecond => value, // Can't truncate to finer precision
+        // Other granularities are not valid for time
+        _ => value,
+    }
+}
+
+/// Truncate time in seconds to the specified granularity
+fn truncate_time_secs(value: i32, granularity: DateTruncGranularity) -> i32 {
+    match granularity {
+        DateTruncGranularity::Hour => value - (value % SECS_PER_HOUR),
+        DateTruncGranularity::Minute => value - (value % SECS_PER_MINUTE),
+        DateTruncGranularity::Second => value, // Already at second precision
+        DateTruncGranularity::Millisecond => value, // Can't truncate to finer precision
+        DateTruncGranularity::Microsecond => value, // Can't truncate to finer precision
+        // Other granularities are not valid for time
+        _ => value,
+    }
+}
+
+fn _date_trunc_coarse<T>(
+    granularity: DateTruncGranularity,
+    value: Option<T>,
+) -> Result<Option<T>>
 where
     T: Datelike + Timelike + Sub<Duration, Output = T> + Copy,
 {
     let value = match granularity {
-        "millisecond" => value,
-        "microsecond" => value,
-        "second" => value.and_then(|d| d.with_nanosecond(0)),
-        "minute" => value
+        DateTruncGranularity::Millisecond => value,
+        DateTruncGranularity::Microsecond => value,
+        DateTruncGranularity::Second => value.and_then(|d| d.with_nanosecond(0)),
+        DateTruncGranularity::Minute => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0)),
-        "hour" => value
+        DateTruncGranularity::Hour => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0)),
-        "day" => value
+        DateTruncGranularity::Day => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0))
             .and_then(|d| d.with_hour(0)),
-        "week" => value
+        DateTruncGranularity::Week => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0))
@@ -311,29 +530,26 @@ where
             .map(|d| {
                 d - TimeDelta::try_seconds(60 * 60 * 24 * d.weekday() as i64).unwrap()
             }),
-        "month" => value
+        DateTruncGranularity::Month => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0))
             .and_then(|d| d.with_hour(0))
             .and_then(|d| d.with_day0(0)),
-        "quarter" => value
+        DateTruncGranularity::Quarter => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0))
             .and_then(|d| d.with_hour(0))
             .and_then(|d| d.with_day0(0))
             .and_then(|d| d.with_month(quarter_month(&d))),
-        "year" => value
+        DateTruncGranularity::Year => value
             .and_then(|d| d.with_nanosecond(0))
             .and_then(|d| d.with_second(0))
             .and_then(|d| d.with_minute(0))
             .and_then(|d| d.with_hour(0))
             .and_then(|d| d.with_day0(0))
             .and_then(|d| d.with_month0(0)),
-        unsupported => {
-            return exec_err!("Unsupported date_trunc granularity: {unsupported}");
-        }
     };
     Ok(value)
 }
@@ -346,7 +562,7 @@ where
 }
 
 fn _date_trunc_coarse_with_tz(
-    granularity: &str,
+    granularity: DateTruncGranularity,
     value: Option<DateTime<Tz>>,
 ) -> Result<Option<i64>> {
     if let Some(value) = value {
@@ -388,7 +604,7 @@ fn _date_trunc_coarse_with_tz(
 }
 
 fn _date_trunc_coarse_without_tz(
-    granularity: &str,
+    granularity: DateTruncGranularity,
     value: Option<NaiveDateTime>,
 ) -> Result<Option<i64>> {
     let value = _date_trunc_coarse::<NaiveDateTime>(granularity, value)?;
@@ -399,22 +615,23 @@ fn _date_trunc_coarse_without_tz(
 /// epoch, for granularities greater than 1 second, in taking into
 /// account that some granularities are not uniform durations of time
 /// (e.g. months are not always the same lengths, leap seconds, etc)
-fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i64> {
+fn date_trunc_coarse(
+    granularity: DateTruncGranularity,
+    value: i64,
+    tz: Option<Tz>,
+) -> Result<i64> {
     let value = match tz {
         Some(tz) => {
             // Use chrono DateTime<Tz> to clear the various fields because need to clear per timezone,
             // and NaiveDateTime (ISO 8601) has no concept of timezones
             let value = as_datetime_with_timezone::<TimestampNanosecondType>(value, tz)
-                .ok_or(DataFusionError::Execution(format!(
-                "Timestamp {value} out of range"
-            )))?;
+                .ok_or(exec_datafusion_err!("Timestamp {value} out of range"))?;
             _date_trunc_coarse_with_tz(granularity, Some(value))
         }
         None => {
             // Use chrono NaiveDateTime to clear the various fields, if we don't have a timezone.
-            let value = timestamp_ns_to_datetime(value).ok_or_else(|| {
-                DataFusionError::Execution(format!("Timestamp {value} out of range"))
-            })?;
+            let value = timestamp_ns_to_datetime(value)
+                .ok_or_else(|| exec_datafusion_err!("Timestamp {value} out of range"))?;
             _date_trunc_coarse_without_tz(granularity, Some(value))
         }
     }?;
@@ -423,12 +640,66 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i6
     Ok(value.unwrap())
 }
 
+/// Fast path for fine granularities (hour and smaller) that can be handled
+/// with simple arithmetic operations without calendar complexity.
+///
+/// This function is timezone-agnostic and should only be used when:
+/// - No timezone is specified in the input, OR
+/// - The granularity is less than hour as hour can be affected by DST transitions in some cases
+fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>(
+    tu: TimeUnit,
+    array: &PrimitiveArray<T>,
+    granularity: DateTruncGranularity,
+    tz_opt: Option<Arc<str>>,
+) -> Result<ArrayRef> {
+    let unit = match (tu, granularity) {
+        (Second, DateTruncGranularity::Minute) => NonZeroI64::new(60),
+        (Second, DateTruncGranularity::Hour) => NonZeroI64::new(3600),
+        (Second, DateTruncGranularity::Day) => NonZeroI64::new(86400),
+
+        (Millisecond, DateTruncGranularity::Second) => NonZeroI64::new(1_000),
+        (Millisecond, DateTruncGranularity::Minute) => NonZeroI64::new(60_000),
+        (Millisecond, DateTruncGranularity::Hour) => NonZeroI64::new(3_600_000),
+        (Millisecond, DateTruncGranularity::Day) => NonZeroI64::new(86_400_000),
+
+        (Microsecond, DateTruncGranularity::Millisecond) => NonZeroI64::new(1_000),
+        (Microsecond, DateTruncGranularity::Second) => NonZeroI64::new(1_000_000),
+        (Microsecond, DateTruncGranularity::Minute) => NonZeroI64::new(60_000_000),
+        (Microsecond, DateTruncGranularity::Hour) => NonZeroI64::new(3_600_000_000),
+        (Microsecond, DateTruncGranularity::Day) => NonZeroI64::new(86_400_000_000),
+
+        (Nanosecond, DateTruncGranularity::Microsecond) => NonZeroI64::new(1_000),
+        (Nanosecond, DateTruncGranularity::Millisecond) => NonZeroI64::new(1_000_000),
+        (Nanosecond, DateTruncGranularity::Second) => NonZeroI64::new(1_000_000_000),
+        (Nanosecond, DateTruncGranularity::Minute) => NonZeroI64::new(60_000_000_000),
+        (Nanosecond, DateTruncGranularity::Hour) => NonZeroI64::new(3_600_000_000_000),
+        (Nanosecond, DateTruncGranularity::Day) => NonZeroI64::new(86_400_000_000_000),
+        _ => None,
+    };
+
+    if let Some(unit) = unit {
+        let unit = unit.get();
+        let array = PrimitiveArray::<T>::from_iter_values_with_nulls(
+            array
+                .values()
+                .iter()
+                .map(|v| *v - i64::rem_euclid(*v, unit)),
+            array.nulls().cloned(),
+        )
+        .with_timezone_opt(tz_opt);
+        Ok(Arc::new(array))
+    } else {
+        // truncate to the same or smaller unit
+        Ok(Arc::new(array.clone()))
+    }
+}
+
 // truncates a single value with the given timeunit to the specified granularity
 fn general_date_trunc(
     tu: TimeUnit,
     value: i64,
     tz: Option<Tz>,
-    granularity: &str,
+    granularity: DateTruncGranularity,
 ) -> Result<i64, DataFusionError> {
     let scale = match tu {
         Second => 1_000_000_000,
@@ -442,25 +713,29 @@ fn general_date_trunc(
 
     let result = match tu {
         Second => match granularity {
-            "minute" => nano / 1_000_000_000 / 60 * 60,
+            DateTruncGranularity::Minute => nano / 1_000_000_000 / 60 * 60,
             _ => nano / 1_000_000_000,
         },
         Millisecond => match granularity {
-            "minute" => nano / 1_000_000 / 1_000 / 60 * 1_000 * 60,
-            "second" => nano / 1_000_000 / 1_000 * 1_000,
+            DateTruncGranularity::Minute => nano / 1_000_000 / 1_000 / 60 * 1_000 * 60,
+            DateTruncGranularity::Second => nano / 1_000_000 / 1_000 * 1_000,
             _ => nano / 1_000_000,
         },
         Microsecond => match granularity {
-            "minute" => nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000,
-            "second" => nano / 1_000 / 1_000_000 * 1_000_000,
-            "millisecond" => nano / 1_000 / 1_000 * 1_000,
+            DateTruncGranularity::Minute => {
+                nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000
+            }
+            DateTruncGranularity::Second => nano / 1_000 / 1_000_000 * 1_000_000,
+            DateTruncGranularity::Millisecond => nano / 1_000 / 1_000 * 1_000,
             _ => nano / 1_000,
         },
         _ => match granularity {
-            "minute" => nano / 1_000_000_000 / 60 * 1_000_000_000 * 60,
-            "second" => nano / 1_000_000_000 * 1_000_000_000,
-            "millisecond" => nano / 1_000_000 * 1_000_000,
-            "microsecond" => nano / 1_000 * 1_000,
+            DateTruncGranularity::Minute => {
+                nano / 1_000_000_000 / 60 * 1_000_000_000 * 60
+            }
+            DateTruncGranularity::Second => nano / 1_000_000_000 * 1_000_000_000,
+            DateTruncGranularity::Millisecond => nano / 1_000_000 * 1_000_000,
+            DateTruncGranularity::Microsecond => nano / 1_000 * 1_000,
             _ => nano,
         },
     };
@@ -470,9 +745,8 @@ fn general_date_trunc(
 fn parse_tz(tz: &Option<Arc<str>>) -> Result<Option<Tz>> {
     tz.as_ref()
         .map(|tz| {
-            Tz::from_str(tz).map_err(|op| {
-                DataFusionError::Execution(format!("failed on timezone {tz}: {op:?}"))
-            })
+            Tz::from_str(tz)
+                .map_err(|op| exec_datafusion_err!("failed on timezone {tz}: {op:?}"))
         })
         .transpose()
 }
@@ -481,7 +755,9 @@ fn parse_tz(tz: &Option<Arc<str>>) -> Result<Option<Tz>> {
 mod tests {
     use std::sync::Arc;
 
-    use crate::datetime::date_trunc::{date_trunc_coarse, DateTruncFunc};
+    use crate::datetime::date_trunc::{
+        DateTruncFunc, DateTruncGranularity, date_trunc_coarse,
+    };
 
     use arrow::array::cast::as_primitive_array;
     use arrow::array::types::TimestampNanosecondType;
@@ -489,7 +765,8 @@ mod tests {
     use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
     use arrow::datatypes::{DataType, Field, TimeUnit};
     use datafusion_common::ScalarValue;
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
     #[test]
     fn date_trunc_test() {
@@ -581,14 +858,15 @@ mod tests {
         cases.iter().for_each(|(original, granularity, expected)| {
             let left = string_to_timestamp_nanos(original).unwrap();
             let right = string_to_timestamp_nanos(expected).unwrap();
-            let result = date_trunc_coarse(granularity, left, None).unwrap();
+            let granularity_enum = DateTruncGranularity::from_str(granularity).unwrap();
+            let result = date_trunc_coarse(granularity_enum, left, None).unwrap();
             assert_eq!(result, right, "{original} = {expected}");
         });
     }
 
     #[test]
     fn test_date_trunc_timezones() {
-        let cases = vec![
+        let cases = [
             (
                 vec![
                     "2020-09-08T00:00:00Z",
@@ -730,7 +1008,7 @@ mod tests {
                 Field::new("a", DataType::Utf8, false).into(),
                 Field::new("b", input.data_type().clone(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![
                     ColumnarValue::Scalar(ScalarValue::from("day")),
                     ColumnarValue::Array(Arc::new(input)),
@@ -743,6 +1021,7 @@ mod tests {
                     true,
                 )
                 .into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = DateTruncFunc::new().invoke_with_args(args).unwrap();
             if let ColumnarValue::Array(result) = result {
@@ -760,7 +1039,7 @@ mod tests {
 
     #[test]
     fn test_date_trunc_hour_timezones() {
-        let cases = vec![
+        let cases = [
             (
                 vec![
                     "2020-09-08T00:30:00Z",
@@ -884,6 +1163,21 @@ mod tests {
                     "2018-11-04T02:00:00-02",
                 ],
             ),
+            (
+                vec![
+                    "2024-10-26T23:30:00Z",
+                    "2024-10-27T00:30:00Z",
+                    "2024-10-27T01:30:00Z",
+                    "2024-10-27T02:30:00Z",
+                ],
+                Some("Asia/Kathmandu".into()), // UTC+5:45
+                vec![
+                    "2024-10-27T05:00:00+05:45",
+                    "2024-10-27T06:00:00+05:45",
+                    "2024-10-27T07:00:00+05:45",
+                    "2024-10-27T08:00:00+05:45",
+                ],
+            ),
         ];
 
         cases.iter().for_each(|(original, tz_opt, expected)| {
@@ -902,7 +1196,7 @@ mod tests {
                 Field::new("a", DataType::Utf8, false).into(),
                 Field::new("b", input.data_type().clone(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![
                     ColumnarValue::Scalar(ScalarValue::from("hour")),
                     ColumnarValue::Array(Arc::new(input)),
@@ -915,6 +1209,7 @@ mod tests {
                     true,
                 )
                 .into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = DateTruncFunc::new().invoke_with_args(args).unwrap();
             if let ColumnarValue::Array(result) = result {
@@ -929,4 +1224,176 @@ mod tests {
             }
         });
     }
+
+    #[test]
+    fn test_date_trunc_fine_granularity_timezones() {
+        let cases = [
+            // Test "second" granularity
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:42:30.500000Z",
+                    "2020-09-08T13:42:31.999999Z",
+                ],
+                Some("+00".into()),
+                "second",
+                vec![
+                    "2020-09-08T13:42:29.000000Z",
+                    "2020-09-08T13:42:30.000000Z",
+                    "2020-09-08T13:42:31.000000Z",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855+05",
+                    "2020-09-08T13:42:30.500000+05",
+                    "2020-09-08T13:42:31.999999+05",
+                ],
+                Some("+05".into()),
+                "second",
+                vec![
+                    "2020-09-08T13:42:29.000000+05",
+                    "2020-09-08T13:42:30.000000+05",
+                    "2020-09-08T13:42:31.000000+05",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:42:30.500000Z",
+                    "2020-09-08T13:42:31.999999Z",
+                ],
+                Some("Europe/Berlin".into()),
+                "second",
+                vec![
+                    "2020-09-08T13:42:29.000000Z",
+                    "2020-09-08T13:42:30.000000Z",
+                    "2020-09-08T13:42:31.000000Z",
+                ],
+            ),
+            // Test "minute" granularity
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:43:30.500000Z",
+                    "2020-09-08T13:44:31.999999Z",
+                ],
+                Some("+00".into()),
+                "minute",
+                vec![
+                    "2020-09-08T13:42:00.000000Z",
+                    "2020-09-08T13:43:00.000000Z",
+                    "2020-09-08T13:44:00.000000Z",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855+08",
+                    "2020-09-08T13:43:30.500000+08",
+                    "2020-09-08T13:44:31.999999+08",
+                ],
+                Some("+08".into()),
+                "minute",
+                vec![
+                    "2020-09-08T13:42:00.000000+08",
+                    "2020-09-08T13:43:00.000000+08",
+                    "2020-09-08T13:44:00.000000+08",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:43:30.500000Z",
+                    "2020-09-08T13:44:31.999999Z",
+                ],
+                Some("America/Sao_Paulo".into()),
+                "minute",
+                vec![
+                    "2020-09-08T13:42:00.000000Z",
+                    "2020-09-08T13:43:00.000000Z",
+                    "2020-09-08T13:44:00.000000Z",
+                ],
+            ),
+            // Test with None (no timezone)
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:43:30.500000Z",
+                    "2020-09-08T13:44:31.999999Z",
+                ],
+                None,
+                "minute",
+                vec![
+                    "2020-09-08T13:42:00.000000Z",
+                    "2020-09-08T13:43:00.000000Z",
+                    "2020-09-08T13:44:00.000000Z",
+                ],
+            ),
+            // Test millisecond granularity
+            (
+                vec![
+                    "2020-09-08T13:42:29.190855Z",
+                    "2020-09-08T13:42:29.191999Z",
+                    "2020-09-08T13:42:29.192500Z",
+                ],
+                Some("Asia/Kolkata".into()),
+                "millisecond",
+                vec![
+                    "2020-09-08T19:12:29.190000+05:30",
+                    "2020-09-08T19:12:29.191000+05:30",
+                    "2020-09-08T19:12:29.192000+05:30",
+                ],
+            ),
+        ];
+
+        cases
+            .iter()
+            .for_each(|(original, tz_opt, granularity, expected)| {
+                let input = original
+                    .iter()
+                    .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+                    .collect::<TimestampNanosecondArray>()
+                    .with_timezone_opt(tz_opt.clone());
+                let right = expected
+                    .iter()
+                    .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+                    .collect::<TimestampNanosecondArray>()
+                    .with_timezone_opt(tz_opt.clone());
+                let batch_len = input.len();
+                let arg_fields = vec![
+                    Field::new("a", DataType::Utf8, false).into(),
+                    Field::new("b", input.data_type().clone(), false).into(),
+                ];
+                let args = ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Scalar(ScalarValue::from(*granularity)),
+                        ColumnarValue::Array(Arc::new(input)),
+                    ],
+                    arg_fields,
+                    number_rows: batch_len,
+                    return_field: Field::new(
+                        "f",
+                        DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()),
+                        true,
+                    )
+                    .into(),
+                    config_options: Arc::new(ConfigOptions::default()),
+                };
+                let result = DateTruncFunc::new().invoke_with_args(args).unwrap();
+                if let ColumnarValue::Array(result) = result {
+                    assert_eq!(
+                        result.data_type(),
+                        &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()),
+                        "Failed for granularity: {granularity}, timezone: {tz_opt:?}"
+                    );
+                    let left = as_primitive_array::<TimestampNanosecondType>(&result);
+                    assert_eq!(
+                        left, &right,
+                        "Failed for granularity: {granularity}, timezone: {tz_opt:?}"
+                    );
+                } else {
+                    panic!("unexpected column type");
+                }
+            });
+    }
 }
diff --git a/datafusion/functions/src/datetime/from_unixtime.rs b/datafusion/functions/src/datetime/from_unixtime.rs
index c1497040261ca..05007ad1138f3 100644
--- a/datafusion/functions/src/datetime/from_unixtime.rs
+++ b/datafusion/functions/src/datetime/from_unixtime.rs
@@ -21,10 +21,11 @@ use std::sync::Arc;
 use arrow::datatypes::DataType::{Int64, Timestamp, Utf8};
 use arrow::datatypes::TimeUnit::Second;
 use arrow::datatypes::{DataType, Field, FieldRef};
-use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -46,7 +47,7 @@ use datafusion_macros::user_doc;
         description = "Optional timezone to use when converting the integer to a timestamp. If not provided, the default timezone is UTC."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct FromUnixtimeFunc {
     signature: Signature,
 }
@@ -118,10 +119,7 @@ impl ScalarUDFImpl for FromUnixtimeFunc {
         internal_err!("call return_field_from_args instead")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = args.args;
         let len = args.len();
         if len != 1 && len != 2 {
@@ -133,7 +131,7 @@ impl ScalarUDFImpl for FromUnixtimeFunc {
 
         if args[0].data_type() != Int64 {
             return exec_err!(
-                "Unsupported data type {:?} for function from_unixtime",
+                "Unsupported data type {} for function from_unixtime",
                 args[0].data_type()
             );
         }
@@ -145,7 +143,7 @@ impl ScalarUDFImpl for FromUnixtimeFunc {
                     .cast_to(&Timestamp(Second, Some(Arc::from(tz.to_string()))), None),
                 _ => {
                     exec_err!(
-                        "Unsupported data type {:?} for function from_unixtime",
+                        "Unsupported data type {} for function from_unixtime",
                         args[1].data_type()
                     )
                 }
@@ -166,17 +164,19 @@ mod test {
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::ScalarValue;
     use datafusion_common::ScalarValue::Int64;
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
 
     #[test]
     fn test_without_timezone() {
         let arg_field = Arc::new(Field::new("a", DataType::Int64, true));
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args: vec![ColumnarValue::Scalar(Int64(Some(1729900800)))],
             arg_fields: vec![arg_field],
             number_rows: 1,
             return_field: Field::new("f", DataType::Timestamp(Second, None), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = FromUnixtimeFunc::new().invoke_with_args(args).unwrap();
 
@@ -194,7 +194,7 @@ mod test {
             Field::new("a", DataType::Int64, true).into(),
             Field::new("a", DataType::Utf8, true).into(),
         ];
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args: vec![
                 ColumnarValue::Scalar(Int64(Some(1729900800))),
                 ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -209,6 +209,7 @@ mod test {
                 true,
             )
             .into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = FromUnixtimeFunc::new().invoke_with_args(args).unwrap();
 
diff --git a/datafusion/functions/src/datetime/make_date.rs b/datafusion/functions/src/datetime/make_date.rs
index daa9bd83971f9..a816db0921d3a 100644
--- a/datafusion/functions/src/datetime/make_date.rs
+++ b/datafusion/functions/src/datetime/make_date.rs
@@ -21,14 +21,16 @@ use std::sync::Arc;
 use arrow::array::builder::PrimitiveBuilder;
 use arrow::array::cast::AsArray;
 use arrow::array::types::{Date32Type, Int32Type};
-use arrow::array::PrimitiveArray;
+use arrow::array::{Array, PrimitiveArray};
 use arrow::datatypes::DataType;
-use arrow::datatypes::DataType::{Date32, Int32, Int64, UInt32, UInt64, Utf8, Utf8View};
+use arrow::datatypes::DataType::Date32;
 use chrono::prelude::*;
 
-use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue};
+use datafusion_common::types::{NativeType, logical_int32, logical_string};
+use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -51,7 +53,7 @@ use datafusion_macros::user_doc;
 +-----------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/make_date.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "year",
@@ -66,7 +68,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
         description = "Day to use when making the date. Can be a constant, column or function, and any combination of arithmetic operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct MakeDateFunc {
     signature: Signature,
 }
@@ -79,12 +81,16 @@ impl Default for MakeDateFunc {
 
 impl MakeDateFunc {
     pub fn new() -> Self {
+        let int = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![
+                TypeSignatureClass::Integer,
+                TypeSignatureClass::Native(logical_string()),
+            ],
+            NativeType::Int32,
+        );
         Self {
-            signature: Signature::uniform(
-                3,
-                vec![Int32, Int64, UInt32, UInt64, Utf8, Utf8View],
-                Volatility::Immutable,
-            ),
+            signature: Signature::coercible(vec![int; 3], Volatility::Immutable),
         }
     }
 }
@@ -106,84 +112,60 @@ impl ScalarUDFImpl for MakeDateFunc {
         Ok(Date32)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        // first, identify if any of the arguments is an Array. If yes, store its `len`,
-        // as any scalar will need to be converted to an array of len `len`.
-        let args = args.args;
-        let len = args
-            .iter()
-            .fold(Option::<usize>::None, |acc, arg| match arg {
-                ColumnarValue::Scalar(_) => acc,
-                ColumnarValue::Array(a) => Some(a.len()),
-            });
-
-        let [years, months, days] = take_function_args(self.name(), args)?;
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [years, months, days] = take_function_args(self.name(), args.args)?;
 
-        let years = years.cast_to(&Int32, None)?;
-        let months = months.cast_to(&Int32, None)?;
-        let days = days.cast_to(&Int32, None)?;
-
-        let scalar_value_fn = |col: &ColumnarValue| -> Result<i32> {
-            let ColumnarValue::Scalar(s) = col else {
-                return exec_err!("Expected scalar value");
-            };
-            let ScalarValue::Int32(Some(i)) = s else {
-                return exec_err!("Unable to parse date from null/empty value");
-            };
-            Ok(*i)
-        };
-
-        let value = if let Some(array_size) = len {
-            let to_primitive_array_fn =
-                |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
-                    match col {
-                        ColumnarValue::Array(a) => {
-                            a.as_primitive::<Int32Type>().to_owned()
-                        }
-                        _ => {
-                            let v = scalar_value_fn(col).unwrap();
-                            PrimitiveArray::<Int32Type>::from_value(v, array_size)
-                        }
+        match (years, months, days) {
+            (ColumnarValue::Scalar(y), _, _) if y.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+            }
+            (_, ColumnarValue::Scalar(m), _) if m.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+            }
+            (_, _, ColumnarValue::Scalar(d)) if d.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+            }
+            (
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(years))),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(months))),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(days))),
+            ) => {
+                let mut value = 0;
+                make_date_inner(years, months, days, |days: i32| value = days)?;
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(value))))
+            }
+            (years, months, days) => {
+                let len = args.number_rows;
+                let years = years.into_array(len)?;
+                let months = months.into_array(len)?;
+                let days = days.into_array(len)?;
+
+                let years = years.as_primitive::<Int32Type>();
+                let months = months.as_primitive::<Int32Type>();
+                let days = days.as_primitive::<Int32Type>();
+
+                let mut builder: PrimitiveBuilder<Date32Type> =
+                    PrimitiveArray::builder(len);
+
+                for i in 0..len {
+                    // match postgresql behaviour which returns null for any null input
+                    if years.is_null(i) || months.is_null(i) || days.is_null(i) {
+                        builder.append_null();
+                    } else {
+                        make_date_inner(
+                            years.value(i),
+                            months.value(i),
+                            days.value(i),
+                            |days: i32| builder.append_value(days),
+                        )?;
                     }
-                };
+                }
 
-            let years = to_primitive_array_fn(&years);
-            let months = to_primitive_array_fn(&months);
-            let days = to_primitive_array_fn(&days);
-
-            let mut builder: PrimitiveBuilder<Date32Type> =
-                PrimitiveArray::builder(array_size);
-            for i in 0..array_size {
-                make_date_inner(
-                    years.value(i),
-                    months.value(i),
-                    days.value(i),
-                    |days: i32| builder.append_value(days),
-                )?;
+                Ok(ColumnarValue::Array(Arc::new(builder.finish())))
             }
-
-            let arr = builder.finish();
-
-            ColumnarValue::Array(Arc::new(arr))
-        } else {
-            // For scalar only columns the operation is faster without using the PrimitiveArray.
-            // Also, keep the output as scalar since all inputs are scalar.
-            let mut value = 0;
-            make_date_inner(
-                scalar_value_fn(&years)?,
-                scalar_value_fn(&months)?,
-                scalar_value_fn(&days)?,
-                |days: i32| value = days,
-            )?;
-
-            ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
-        };
-
-        Ok(value)
+        }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -197,11 +179,13 @@ fn make_date_inner<F: FnMut(i32)>(
     day: i32,
     mut date_consumer_fn: F,
 ) -> Result<()> {
-    let Ok(m) = u32::try_from(month) else {
-        return exec_err!("Month value '{month:?}' is out of range");
+    let m = match month {
+        1..=12 => month as u32,
+        _ => return exec_err!("Month value '{month:?}' is out of range"),
     };
-    let Ok(d) = u32::try_from(day) else {
-        return exec_err!("Day value '{day:?}' is out of range");
+    let d = match day {
+        1..=31 => day as u32,
+        _ => return exec_err!("Day value '{day:?}' is out of range"),
     };
 
     if let Some(date) = NaiveDate::from_ymd_opt(year, m, d) {
@@ -218,163 +202,3 @@ fn make_date_inner<F: FnMut(i32)>(
         exec_err!("Unable to parse date from {year}, {month}, {day}")
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use crate::datetime::make_date::MakeDateFunc;
-    use arrow::array::{Array, Date32Array, Int32Array, Int64Array, UInt32Array};
-    use arrow::datatypes::{DataType, Field};
-    use datafusion_common::{DataFusionError, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
-    use std::sync::Arc;
-
-    fn invoke_make_date_with_args(
-        args: Vec<ColumnarValue>,
-        number_rows: usize,
-    ) -> Result<ColumnarValue, DataFusionError> {
-        let arg_fields = args
-            .iter()
-            .map(|arg| Field::new("a", arg.data_type(), true).into())
-            .collect::<Vec<_>>();
-        let args = datafusion_expr::ScalarFunctionArgs {
-            args,
-            arg_fields,
-            number_rows,
-            return_field: Field::new("f", DataType::Date32, true).into(),
-        };
-        MakeDateFunc::new().invoke_with_args(args)
-    }
-
-    #[test]
-    fn test_make_date() {
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::Int32(Some(2024))),
-                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
-                ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))),
-            ],
-            1,
-        )
-        .expect("that make_date parsed values without error");
-
-        if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res {
-            assert_eq!(19736, date.unwrap());
-        } else {
-            panic!("Expected a scalar value")
-        }
-
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::Int64(Some(2024))),
-                ColumnarValue::Scalar(ScalarValue::UInt64(Some(1))),
-                ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))),
-            ],
-            1,
-        )
-        .expect("that make_date parsed values without error");
-
-        if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res {
-            assert_eq!(19736, date.unwrap());
-        } else {
-            panic!("Expected a scalar value")
-        }
-
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::Utf8(Some("2024".to_string()))),
-                ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("1".to_string()))),
-                ColumnarValue::Scalar(ScalarValue::Utf8(Some("14".to_string()))),
-            ],
-            1,
-        )
-        .expect("that make_date parsed values without error");
-
-        if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res {
-            assert_eq!(19736, date.unwrap());
-        } else {
-            panic!("Expected a scalar value")
-        }
-
-        let years = Arc::new((2021..2025).map(Some).collect::<Int64Array>());
-        let months = Arc::new((1..5).map(Some).collect::<Int32Array>());
-        let days = Arc::new((11..15).map(Some).collect::<UInt32Array>());
-        let batch_len = years.len();
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Array(years),
-                ColumnarValue::Array(months),
-                ColumnarValue::Array(days),
-            ],
-            batch_len,
-        )
-        .unwrap();
-
-        if let ColumnarValue::Array(array) = res {
-            assert_eq!(array.len(), 4);
-            let mut builder = Date32Array::builder(4);
-            builder.append_value(18_638);
-            builder.append_value(19_035);
-            builder.append_value(19_429);
-            builder.append_value(19_827);
-            assert_eq!(&builder.finish() as &dyn Array, array.as_ref());
-        } else {
-            panic!("Expected a columnar array")
-        }
-
-        //
-        // Fallible test cases
-        //
-
-        // invalid number of arguments
-        let res = invoke_make_date_with_args(
-            vec![ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))],
-            1,
-        );
-        assert_eq!(
-            res.err().unwrap().strip_backtrace(),
-            "Execution error: make_date function requires 3 arguments, got 1"
-        );
-
-        // invalid type
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))),
-                ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)),
-                ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)),
-            ],
-            1,
-        );
-        assert_eq!(
-            res.err().unwrap().strip_backtrace(),
-            "Arrow error: Cast error: Casting from Interval(YearMonth) to Int32 not supported"
-        );
-
-        // overflow of month
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))),
-                ColumnarValue::Scalar(ScalarValue::UInt64(Some(u64::MAX))),
-                ColumnarValue::Scalar(ScalarValue::Int32(Some(22))),
-            ],
-            1,
-        );
-        assert_eq!(
-            res.err().unwrap().strip_backtrace(),
-            "Arrow error: Cast error: Can't cast value 18446744073709551615 to type Int32"
-        );
-
-        // overflow of day
-        let res = invoke_make_date_with_args(
-            vec![
-                ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))),
-                ColumnarValue::Scalar(ScalarValue::Int32(Some(22))),
-                ColumnarValue::Scalar(ScalarValue::UInt32(Some(u32::MAX))),
-            ],
-            1,
-        );
-        assert_eq!(
-            res.err().unwrap().strip_backtrace(),
-            "Arrow error: Cast error: Can't cast value 4294967295 to type Int32"
-        );
-    }
-}
diff --git a/datafusion/functions/src/datetime/make_time.rs b/datafusion/functions/src/datetime/make_time.rs
new file mode 100644
index 0000000000000..72cca1e0fff57
--- /dev/null
+++ b/datafusion/functions/src/datetime/make_time.rs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::builder::PrimitiveBuilder;
+use arrow::array::cast::AsArray;
+use arrow::array::types::Int32Type;
+use arrow::array::{Array, PrimitiveArray};
+use arrow::datatypes::DataType::Time32;
+use arrow::datatypes::{DataType, Time32SecondType, TimeUnit};
+use chrono::prelude::*;
+
+use datafusion_common::types::{NativeType, logical_int32, logical_string};
+use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
+use datafusion_macros::user_doc;
+
+#[user_doc(
+    doc_section(label = "Time and Date Functions"),
+    description = "Make a time from hour/minute/second component parts.",
+    syntax_example = "make_time(hour, minute, second)",
+    sql_example = r#"```sql
+> select make_time(13, 23, 1);
++-------------------------------------------+
+| make_time(Int64(13),Int64(23),Int64(1))   |
++-------------------------------------------+
+| 13:23:01                                  |
++-------------------------------------------+
+> select make_time('23', '01', '31');
++-----------------------------------------------+
+| make_time(Utf8("23"),Utf8("01"),Utf8("31"))   |
++-----------------------------------------------+
+| 23:01:31                                      |
++-----------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
+"#,
+    argument(
+        name = "hour",
+        description = "Hour to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators."
+    ),
+    argument(
+        name = "minute",
+        description = "Minute to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators."
+    ),
+    argument(
+        name = "second",
+        description = "Second to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct MakeTimeFunc {
+    signature: Signature,
+}
+
+impl Default for MakeTimeFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MakeTimeFunc {
+    pub fn new() -> Self {
+        let int = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![
+                TypeSignatureClass::Integer,
+                TypeSignatureClass::Native(logical_string()),
+            ],
+            NativeType::Int32,
+        );
+        Self {
+            signature: Signature::coercible(vec![int; 3], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for MakeTimeFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "make_time"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Time32(TimeUnit::Second))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [hours, minutes, seconds] = take_function_args(self.name(), args.args)?;
+
+        match (hours, minutes, seconds) {
+            (ColumnarValue::Scalar(h), _, _) if h.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Time32Second(None)))
+            }
+            (_, ColumnarValue::Scalar(m), _) if m.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Time32Second(None)))
+            }
+            (_, _, ColumnarValue::Scalar(s)) if s.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Time32Second(None)))
+            }
+            (
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(hours))),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(minutes))),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(seconds))),
+            ) => {
+                let mut value = 0;
+                make_time_inner(hours, minutes, seconds, |seconds: i32| value = seconds)?;
+                Ok(ColumnarValue::Scalar(ScalarValue::Time32Second(Some(
+                    value,
+                ))))
+            }
+            (hours, minutes, seconds) => {
+                let len = args.number_rows;
+                let hours = hours.into_array(len)?;
+                let minutes = minutes.into_array(len)?;
+                let seconds = seconds.into_array(len)?;
+
+                let hours = hours.as_primitive::<Int32Type>();
+                let minutes = minutes.as_primitive::<Int32Type>();
+                let seconds = seconds.as_primitive::<Int32Type>();
+
+                let mut builder: PrimitiveBuilder<Time32SecondType> =
+                    PrimitiveArray::builder(len);
+
+                for i in 0..len {
+                    // match postgresql behaviour which returns null for any null input
+                    if hours.is_null(i) || minutes.is_null(i) || seconds.is_null(i) {
+                        builder.append_null();
+                    } else {
+                        make_time_inner(
+                            hours.value(i),
+                            minutes.value(i),
+                            seconds.value(i),
+                            |seconds: i32| builder.append_value(seconds),
+                        )?;
+                    }
+                }
+
+                Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+            }
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Converts the hour/minute/second fields to an `i32` representing the seconds from
+/// midnight and invokes `time_consumer_fn` with the value
+fn make_time_inner<F: FnMut(i32)>(
+    hour: i32,
+    minute: i32,
+    second: i32,
+    mut time_consumer_fn: F,
+) -> Result<()> {
+    let h = match hour {
+        0..=24 => hour as u32,
+        _ => return exec_err!("Hour value '{hour:?}' is out of range"),
+    };
+    let m = match minute {
+        0..=60 => minute as u32,
+        _ => return exec_err!("Minute value '{minute:?}' is out of range"),
+    };
+    let s = match second {
+        0..=60 => second as u32,
+        _ => return exec_err!("Second value '{second:?}' is out of range"),
+    };
+
+    if let Some(time) = NaiveTime::from_hms_opt(h, m, s) {
+        time_consumer_fn(time.num_seconds_from_midnight() as i32);
+        Ok(())
+    } else {
+        exec_err!("Unable to parse time from {hour}, {minute}, {second}")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::datetime::make_time::MakeTimeFunc;
+    use arrow::array::{Array, Int32Array, Time32SecondArray};
+    use arrow::datatypes::TimeUnit::Second;
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::DataFusionError;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
+
+    fn invoke_make_time_with_args(
+        args: Vec<ColumnarValue>,
+        number_rows: usize,
+    ) -> Result<ColumnarValue, DataFusionError> {
+        let arg_fields = args
+            .iter()
+            .map(|arg| Field::new("a", arg.data_type(), true).into())
+            .collect::<Vec<_>>();
+        let args = ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows,
+            return_field: Field::new("f", DataType::Time32(Second), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        MakeTimeFunc::new().invoke_with_args(args)
+    }
+
+    #[test]
+    fn test_make_time() {
+        let hours = Arc::new((4..8).map(Some).collect::<Int32Array>());
+        let minutes = Arc::new((1..5).map(Some).collect::<Int32Array>());
+        let seconds = Arc::new((11..15).map(Some).collect::<Int32Array>());
+        let batch_len = hours.len();
+        let res = invoke_make_time_with_args(
+            vec![
+                ColumnarValue::Array(hours),
+                ColumnarValue::Array(minutes),
+                ColumnarValue::Array(seconds),
+            ],
+            batch_len,
+        )
+        .unwrap();
+
+        if let ColumnarValue::Array(array) = res {
+            assert_eq!(array.len(), 4);
+
+            let mut builder = Time32SecondArray::builder(4);
+            builder.append_value(14_471);
+            builder.append_value(18_132);
+            builder.append_value(21_793);
+            builder.append_value(25_454);
+            assert_eq!(&builder.finish() as &dyn Array, array.as_ref());
+        } else {
+            panic!("Expected a columnar array")
+        }
+    }
+}
diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs
index dee40215c9ea5..39b9453295df6 100644
--- a/datafusion/functions/src/datetime/mod.rs
+++ b/datafusion/functions/src/datetime/mod.rs
@@ -29,10 +29,13 @@ pub mod date_part;
 pub mod date_trunc;
 pub mod from_unixtime;
 pub mod make_date;
+pub mod make_time;
 pub mod now;
+pub mod planner;
 pub mod to_char;
 pub mod to_date;
 pub mod to_local_time;
+pub mod to_time;
 pub mod to_timestamp;
 pub mod to_unixtime;
 
@@ -43,17 +46,24 @@ make_udf_function!(date_bin::DateBinFunc, date_bin);
 make_udf_function!(date_part::DatePartFunc, date_part);
 make_udf_function!(date_trunc::DateTruncFunc, date_trunc);
 make_udf_function!(make_date::MakeDateFunc, make_date);
+make_udf_function!(make_time::MakeTimeFunc, make_time);
 make_udf_function!(from_unixtime::FromUnixtimeFunc, from_unixtime);
-make_udf_function!(now::NowFunc, now);
 make_udf_function!(to_char::ToCharFunc, to_char);
 make_udf_function!(to_date::ToDateFunc, to_date);
 make_udf_function!(to_local_time::ToLocalTimeFunc, to_local_time);
+make_udf_function!(to_time::ToTimeFunc, to_time);
 make_udf_function!(to_unixtime::ToUnixtimeFunc, to_unixtime);
-make_udf_function!(to_timestamp::ToTimestampFunc, to_timestamp);
-make_udf_function!(to_timestamp::ToTimestampSecondsFunc, to_timestamp_seconds);
-make_udf_function!(to_timestamp::ToTimestampMillisFunc, to_timestamp_millis);
-make_udf_function!(to_timestamp::ToTimestampMicrosFunc, to_timestamp_micros);
-make_udf_function!(to_timestamp::ToTimestampNanosFunc, to_timestamp_nanos);
+make_udf_function_with_config!(to_timestamp::ToTimestampFunc, to_timestamp);
+make_udf_function_with_config!(
+    to_timestamp::ToTimestampSecondsFunc,
+    to_timestamp_seconds
+);
+make_udf_function_with_config!(to_timestamp::ToTimestampMillisFunc, to_timestamp_millis);
+make_udf_function_with_config!(to_timestamp::ToTimestampMicrosFunc, to_timestamp_micros);
+make_udf_function_with_config!(to_timestamp::ToTimestampNanosFunc, to_timestamp_nanos);
+
+// create UDF with config
+make_udf_function_with_config!(now::NowFunc, now);
 
 // we cannot currently use the export_functions macro since it doesn't handle
 // functions with varargs currently
@@ -87,9 +97,14 @@ pub mod expr_fn {
         make_date,
         "make a date from year, month and day component parts",
         year month day
+    ),(
+        make_time,
+        "make a time from hour, minute and second component parts",
+        hour minute second
     ),(
         now,
         "returns the current timestamp in nanoseconds, using the same value for all instances of now() in same statement",
+        @config
     ),
     (
         to_local_time,
@@ -98,28 +113,32 @@ pub mod expr_fn {
     ),
     (
         to_unixtime,
-        "converts a string and optional formats to a Unixtime",
+        "converts a value to seconds since the unix epoch",
         args,
     ),(
-        to_timestamp,
-        "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`",
+        to_time,
+        "converts a string and optional formats to a `Time64(Nanoseconds)`",
         args,
+    ),(
+        to_timestamp,
+        "converts a string and optional formats to a `Timestamp(Nanoseconds, TimeZone)`",
+        @config args,
     ),(
         to_timestamp_seconds,
-        "converts a string and optional formats to a `Timestamp(Seconds, None)`",
-        args,
+        "converts a string and optional formats to a `Timestamp(Seconds, TimeZone)`",
+        @config args,
     ),(
         to_timestamp_millis,
-        "converts a string and optional formats to a `Timestamp(Milliseconds, None)`",
-        args,
+        "converts a string and optional formats to a `Timestamp(Milliseconds, TimeZone)`",
+        @config args,
     ),(
         to_timestamp_micros,
-        "converts a string and optional formats to a `Timestamp(Microseconds, None)`",
-        args,
+        "converts a string and optional formats to a `Timestamp(Microseconds, TimeZone)`",
+        @config args,
     ),(
         to_timestamp_nanos,
-        "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`",
-        args,
+        "converts a string and optional formats to a `Timestamp(Nanoseconds, TimeZone)`",
+        @config args,
     ));
 
     /// Returns a string representation of a date, time, timestamp or duration based
@@ -254,6 +273,8 @@ pub mod expr_fn {
 
 /// Returns all DataFusion functions defined in this package
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
+    use datafusion_common::config::ConfigOptions;
+    let config = ConfigOptions::default();
     vec![
         current_date(),
         current_time(),
@@ -262,15 +283,17 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         date_trunc(),
         from_unixtime(),
         make_date(),
-        now(),
+        make_time(),
+        now(&config),
         to_char(),
         to_date(),
         to_local_time(),
+        to_time(),
         to_unixtime(),
-        to_timestamp(),
-        to_timestamp_seconds(),
-        to_timestamp_millis(),
-        to_timestamp_micros(),
-        to_timestamp_nanos(),
+        to_timestamp(&config),
+        to_timestamp_seconds(&config),
+        to_timestamp_millis(&config),
+        to_timestamp_micros(&config),
+        to_timestamp_nanos(&config),
     ]
 }
diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs
index 30b4d4ca9c76f..bd67e4cfcbba0 100644
--- a/datafusion/functions/src/datetime/now.rs
+++ b/datafusion/functions/src/datetime/now.rs
@@ -19,41 +19,59 @@ use arrow::datatypes::DataType::Timestamp;
 use arrow::datatypes::TimeUnit::Nanosecond;
 use arrow::datatypes::{DataType, Field, FieldRef};
 use std::any::Any;
+use std::sync::Arc;
 
-use datafusion_common::{internal_err, Result, ScalarValue};
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{Result, ScalarValue, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarUDFImpl, Signature,
-    Volatility,
+    ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF,
+    ScalarUDFImpl, Signature, Volatility,
 };
 use datafusion_macros::user_doc;
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
     description = r#"
-Returns the current UTC timestamp.
+Returns the current timestamp in the system configured timezone (None by default).
 
 The `now()` return value is determined at query time and will return the same timestamp, no matter when in the query plan the function executes.
 "#,
     syntax_example = "now()"
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NowFunc {
     signature: Signature,
     aliases: Vec<String>,
+    timezone: Option<Arc<str>>,
 }
 
 impl Default for NowFunc {
     fn default() -> Self {
-        Self::new()
+        Self::new_with_config(&ConfigOptions::default())
     }
 }
 
 impl NowFunc {
+    #[deprecated(since = "50.2.0", note = "use `new_with_config` instead")]
+    /// Deprecated constructor retained for backwards compatibility.
+    ///
+    /// Prefer [`NowFunc::new_with_config`] which allows specifying the
+    /// timezone via [`ConfigOptions`]. This helper now mirrors the
+    /// canonical default offset (None) provided by `ConfigOptions::default()`.
     pub fn new() -> Self {
+        Self::new_with_config(&ConfigOptions::default())
+    }
+
+    pub fn new_with_config(config: &ConfigOptions) -> Self {
         Self {
             signature: Signature::nullary(Volatility::Stable),
             aliases: vec!["current_timestamp".to_string()],
+            timezone: config
+                .execution
+                .time_zone
+                .as_ref()
+                .map(|tz| Arc::from(tz.as_str())),
         }
     }
 }
@@ -77,10 +95,14 @@ impl ScalarUDFImpl for NowFunc {
         &self.signature
     }
 
+    fn with_updated_config(&self, config: &ConfigOptions) -> Option<ScalarUDF> {
+        Some(Self::new_with_config(config).into())
+    }
+
     fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result<FieldRef> {
         Ok(Field::new(
             self.name(),
-            Timestamp(Nanosecond, Some("+00:00".into())),
+            Timestamp(Nanosecond, self.timezone.clone()),
             false,
         )
         .into())
@@ -90,24 +112,25 @@ impl ScalarUDFImpl for NowFunc {
         internal_err!("return_field_from_args should be called instead")
     }
 
-    fn invoke_with_args(
-        &self,
-        _args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         internal_err!("invoke should not be called on a simplified now() function")
     }
 
     fn simplify(
         &self,
-        _args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        let now_ts = info
-            .execution_props()
-            .query_execution_start_time
-            .timestamp_nanos_opt();
+        let Some(now_ts) = info.query_execution_start_time() else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
         Ok(ExprSimplifyResult::Simplified(Expr::Literal(
-            ScalarValue::TimestampNanosecond(now_ts, Some("+00:00".into())),
+            ScalarValue::TimestampNanosecond(
+                now_ts.timestamp_nanos_opt(),
+                self.timezone.clone(),
+            ),
+            None,
         )))
     }
 
@@ -119,3 +142,44 @@ impl ScalarUDFImpl for NowFunc {
         self.doc()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[expect(deprecated)]
+    #[test]
+    fn now_func_default_matches_config() {
+        let default_config = ConfigOptions::default();
+
+        let legacy_now = NowFunc::new();
+        let configured_now = NowFunc::new_with_config(&default_config);
+
+        let empty_fields: [FieldRef; 0] = [];
+        let empty_scalars: [Option<&ScalarValue>; 0] = [];
+
+        let legacy_field = legacy_now
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &empty_fields,
+                scalar_arguments: &empty_scalars,
+            })
+            .expect("legacy now() return field");
+
+        let configured_field = configured_now
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &empty_fields,
+                scalar_arguments: &empty_scalars,
+            })
+            .expect("configured now() return field");
+
+        assert_eq!(legacy_field.as_ref(), configured_field.as_ref());
+
+        let legacy_scalar =
+            ScalarValue::TimestampNanosecond(None, legacy_now.timezone.clone());
+        let configured_scalar =
+            ScalarValue::TimestampNanosecond(None, configured_now.timezone.clone());
+
+        assert_eq!(legacy_scalar, configured_scalar);
+        assert_eq!(None, legacy_now.timezone.as_deref());
+    }
+}
diff --git a/datafusion/sqllogictest/src/engines/postgres_engine/types.rs b/datafusion/functions/src/datetime/planner.rs
similarity index 54%
rename from datafusion/sqllogictest/src/engines/postgres_engine/types.rs
rename to datafusion/functions/src/datetime/planner.rs
index 510462befb086..f2b8ef9d1d310 100644
--- a/datafusion/sqllogictest/src/engines/postgres_engine/types.rs
+++ b/datafusion/functions/src/datetime/planner.rs
@@ -15,31 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use postgres_types::Type;
-use std::fmt::Display;
-use tokio_postgres::types::FromSql;
+//! SQL planning extensions like [`DatetimeFunctionPlanner`]
+use datafusion_expr::Expr;
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::planner::{ExprPlanner, PlannerResult};
 
-pub struct PgRegtype {
-    value: String,
-}
-
-impl<'a> FromSql<'a> for PgRegtype {
-    fn from_sql(
-        _: &Type,
-        buf: &'a [u8],
-    ) -> Result<Self, Box<dyn std::error::Error + Sync + Send>> {
-        let oid = postgres_protocol::types::oid_from_sql(buf)?;
-        let value = Type::from_oid(oid).ok_or("bad type")?.to_string();
-        Ok(PgRegtype { value })
-    }
-
-    fn accepts(ty: &Type) -> bool {
-        matches!(*ty, Type::REGTYPE)
-    }
-}
+#[derive(Default, Debug)]
+pub struct DatetimeFunctionPlanner;
 
-impl Display for PgRegtype {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.value)
+impl ExprPlanner for DatetimeFunctionPlanner {
+    fn plan_extract(
+        &self,
+        args: Vec<Expr>,
+    ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+        Ok(PlannerResult::Planned(Expr::ScalarFunction(
+            ScalarFunction::new_udf(crate::datetime::date_part(), args),
+        )))
     }
 }
diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs
index 3e89242aba263..6caff4f7c7463 100644
--- a/datafusion/functions/src/datetime/to_char.rs
+++ b/datafusion/functions/src/datetime/to_char.rs
@@ -18,20 +18,21 @@
 use std::any::Any;
 use std::sync::Arc;
 
+use arrow::array::builder::StringBuilder;
 use arrow::array::cast::AsArray;
-use arrow::array::{new_null_array, Array, ArrayRef, StringArray};
+use arrow::array::{Array, ArrayRef};
+use arrow::compute::cast;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{
     Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
 };
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::error::ArrowError;
 use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
-
-use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TIMEZONE_WILDCARD, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -48,7 +49,7 @@ use datafusion_macros::user_doc;
 +----------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -63,7 +64,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
         description = "Day to use when making the date. Can be a constant, column or function, and any combination of arithmetic operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToCharFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -135,50 +136,42 @@ impl ScalarUDFImpl for ToCharFunc {
         Ok(Utf8)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = args.args;
         let [date_time, format] = take_function_args(self.name(), &args)?;
 
         match format {
-            ColumnarValue::Scalar(ScalarValue::Utf8(None))
-            | ColumnarValue::Scalar(ScalarValue::Null) => {
-                _to_char_scalar(date_time.clone(), None)
-            }
-            // constant format
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
-                // invoke to_char_scalar with the known string, without converting to array
-                _to_char_scalar(date_time.clone(), Some(format))
+            ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
             }
-            ColumnarValue::Array(_) => _to_char_array(&args),
-            _ => {
-                exec_err!(
-                    "Format for `to_char` must be non-null Utf8, received {:?}",
-                    format.data_type()
-                )
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
+                to_char_scalar(date_time, fmt)
             }
+            ColumnarValue::Array(_) => to_char_array(&args),
+            _ => exec_err!(
+                "Format for `to_char` must be non-null Utf8, received {}",
+                format.data_type()
+            ),
         }
     }
 
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
-fn _build_format_options<'a>(
+fn build_format_options<'a>(
     data_type: &DataType,
-    format: Option<&'a str>,
-) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
-    let Some(format) = format else {
-        return Ok(FormatOptions::new());
-    };
+    format: &'a str,
+) -> Result<FormatOptions<'a>> {
     let format_options = match data_type {
-        Date32 => FormatOptions::new().with_date_format(Some(format)),
+        Date32 => FormatOptions::new()
+            .with_date_format(Some(format))
+            .with_datetime_format(Some(format)),
         Date64 => FormatOptions::new().with_datetime_format(Some(format)),
         Time32(_) => FormatOptions::new().with_time_format(Some(format)),
         Time64(_) => FormatOptions::new().with_time_format(Some(format)),
@@ -193,104 +186,111 @@ fn _build_format_options<'a>(
             },
         ),
         other => {
-            return Err(exec_err!(
+            return exec_err!(
                 "to_char only supports date, time, timestamp and duration data types, received {other:?}"
-            ));
+            );
         }
     };
     Ok(format_options)
 }
 
-/// Special version when arg\[1] is a scalar
-fn _to_char_scalar(
-    expression: ColumnarValue,
-    format: Option<&str>,
-) -> Result<ColumnarValue> {
-    // it's possible that the expression is a scalar however because
-    // of the implementation in arrow-rs we need to convert it to an array
+/// Formats `expression` using a constant `format` string.
+fn to_char_scalar(expression: &ColumnarValue, format: &str) -> Result<ColumnarValue> {
+    // ArrayFormatter requires an array, so scalar expressions must be
+    // converted to a 1-element array first.
     let data_type = &expression.data_type();
     let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
-    let array = expression.into_array(1)?;
+    let array = expression.to_array(1)?;
 
-    if format.is_none() {
-        if is_scalar_expression {
-            return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));
-        } else {
-            return Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())));
-        }
-    }
+    let format_options = build_format_options(data_type, format)?;
+    let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
 
-    let format_options = match _build_format_options(data_type, format) {
-        Ok(value) => value,
-        Err(value) => return value,
-    };
+    // Pad the preallocated capacity a bit because format specifiers often
+    // expand the string (e.g., %Y -> "2026")
+    let fmt_len = format.len() + 10;
+    let mut builder = StringBuilder::with_capacity(array.len(), array.len() * fmt_len);
 
-    let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
-    let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
-        .map(|i| {
-            if array.is_null(i) {
-                Ok(None)
-            } else {
-                formatter.value(i).try_to_string().map(Some)
-            }
-        })
-        .collect();
-
-    if let Ok(formatted) = formatted {
-        if is_scalar_expression {
-            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
-                formatted.first().unwrap().clone(),
-            )))
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            builder.append_null();
         } else {
-            Ok(ColumnarValue::Array(
-                Arc::new(StringArray::from(formatted)) as ArrayRef
-            ))
+            // Write directly into the builder's internal buffer, then
+            // commit the value with append_value("").
+            match formatter.value(i).write(&mut builder) {
+                Ok(()) => builder.append_value(""),
+                // Arrow's Date32 formatter only handles date specifiers
+                // (%Y, %m, %d, ...). Format strings with time specifiers
+                // (%H, %M, %S, ...) cause it to fail. When this happens,
+                // we retry by casting to Date64, whose datetime formatter
+                // handles both date and time specifiers (with zero for
+                // the time components).
+                Err(_) if data_type == &Date32 => {
+                    return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
+                }
+                Err(e) => return Err(e.into()),
+            }
         }
+    }
+
+    let result = builder.finish();
+    if is_scalar_expression {
+        let val = result.is_valid(0).then(|| result.value(0).to_string());
+        Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
     } else {
-        exec_err!("{}", formatted.unwrap_err())
+        Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
     }
 }
 
-fn _to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     let arrays = ColumnarValue::values_to_arrays(args)?;
-    let mut results: Vec<Option<String>> = vec![];
+    let data_array = &arrays[0];
     let format_array = arrays[1].as_string::<i32>();
-    let data_type = arrays[0].data_type();
+    let data_type = data_array.data_type();
 
-    for idx in 0..arrays[0].len() {
-        let format = if format_array.is_null(idx) {
-            None
-        } else {
-            Some(format_array.value(idx))
-        };
-        if format.is_none() {
-            results.push(None);
+    // Arbitrary guess for the length of a typical formatted datetime string
+    let fmt_len = 30;
+    let mut builder =
+        StringBuilder::with_capacity(data_array.len(), data_array.len() * fmt_len);
+    let mut buffer = String::with_capacity(fmt_len);
+
+    for idx in 0..data_array.len() {
+        if format_array.is_null(idx) || data_array.is_null(idx) {
+            builder.append_null();
             continue;
         }
-        let format_options = match _build_format_options(data_type, format) {
-            Ok(value) => value,
-            Err(value) => return value,
-        };
-        // this isn't ideal but this can't use ValueFormatter as it isn't independent
-        // from ArrayFormatter
-        let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?;
-        let result = formatter.value(idx).try_to_string();
-        match result {
-            Ok(value) => results.push(Some(value)),
-            Err(e) => return exec_err!("{}", e),
+
+        let format = format_array.value(idx);
+        let format_options = build_format_options(data_type, format)?;
+        let formatter = ArrayFormatter::try_new(data_array.as_ref(), &format_options)?;
+
+        buffer.clear();
+
+        // We'd prefer to write directly to the StringBuilder's internal buffer,
+        // but the write might fail, and there's no easy way to ensure a partial
+        // write is removed from the buffer. So instead we write to a temporary
+        // buffer and `append_value` on success.
+        match formatter.value(idx).write(&mut buffer) {
+            Ok(()) => builder.append_value(&buffer),
+            // Retry with Date64 (see comment in to_char_scalar).
+            Err(_) if data_type == &Date32 => {
+                buffer.clear();
+                let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
+                let retry_fmt =
+                    ArrayFormatter::try_new(date64_value.as_ref(), &format_options)?;
+                retry_fmt.value(0).write(&mut buffer)?;
+                builder.append_value(&buffer);
+            }
+            Err(e) => return Err(e.into()),
         }
     }
 
+    let result = builder.finish();
     match args[0] {
-        ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from(
-            results,
-        )) as ArrayRef)),
-        ColumnarValue::Scalar(_) => match results.first().unwrap() {
-            Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
-                value.to_string(),
-            )))),
-            None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
-        },
+        ColumnarValue::Scalar(_) => {
+            let val = result.is_valid(0).then(|| result.value(0).to_string());
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+        }
+        ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)),
     }
 }
 
@@ -306,9 +306,49 @@ mod tests {
     use arrow::datatypes::{DataType, Field, TimeUnit};
     use chrono::{NaiveDateTime, Timelike};
     use datafusion_common::ScalarValue;
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
 
+    #[test]
+    fn test_array_array() {
+        let array_array_data = vec![(
+            Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef,
+            StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]),
+            StringArray::from(vec!["2020::09::01", "2020::09::02 00::00::00 000000000"]),
+        )];
+
+        for (value, format, expected) in array_array_data {
+            let batch_len = value.len();
+            let value_data_type = value.data_type().clone();
+            let format_data_type = format.data_type().clone();
+
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(value),
+                    ColumnarValue::Array(Arc::new(format) as ArrayRef),
+                ],
+                arg_fields: vec![
+                    Field::new("a", value_data_type, true).into(),
+                    Field::new("b", format_data_type, true).into(),
+                ],
+                number_rows: batch_len,
+                return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::clone(&Arc::new(ConfigOptions::default())),
+            };
+            let result = ToCharFunc::new()
+                .invoke_with_args(args)
+                .expect("that to_char parsed values without error");
+
+            if let ColumnarValue::Array(result) = result {
+                assert_eq!(result.len(), 2);
+                assert_eq!(&expected as &dyn Array, result.as_ref());
+            } else {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
     #[test]
     fn test_to_char() {
         let date = "2020-01-02T03:04:05"
@@ -328,6 +368,11 @@ mod tests {
                 ScalarValue::Utf8(Some("%Y::%m::%d".to_string())),
                 "2020::09::01".to_string(),
             ),
+            (
+                ScalarValue::Date32(Some(18506)),
+                ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())),
+                "2020::09::01 00::00::00 000000000".to_string(),
+            ),
             (
                 ScalarValue::Date64(Some(date.and_utc().timestamp_millis())),
                 ScalarValue::Utf8(Some("%Y::%m::%d".to_string())),
@@ -389,11 +434,12 @@ mod tests {
                 Field::new("a", value.data_type(), false).into(),
                 Field::new("a", format.data_type(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![ColumnarValue::Scalar(value), ColumnarValue::Scalar(format)],
                 arg_fields,
                 number_rows: 1,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = ToCharFunc::new()
                 .invoke_with_args(args)
@@ -412,6 +458,11 @@ mod tests {
                 StringArray::from(vec!["%Y::%m::%d".to_string()]),
                 "2020::09::01".to_string(),
             ),
+            (
+                ScalarValue::Date32(Some(18506)),
+                StringArray::from(vec!["%Y::%m::%d %S::%M::%H %f".to_string()]),
+                "2020::09::01 00::00::00 000000000".to_string(),
+            ),
             (
                 ScalarValue::Date64(Some(date.and_utc().timestamp_millis())),
                 StringArray::from(vec!["%Y::%m::%d".to_string()]),
@@ -474,7 +525,7 @@ mod tests {
                 Field::new("a", value.data_type(), false).into(),
                 Field::new("a", format.data_type().to_owned(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![
                     ColumnarValue::Scalar(value),
                     ColumnarValue::Array(Arc::new(format) as ArrayRef),
@@ -482,6 +533,7 @@ mod tests {
                 arg_fields,
                 number_rows: batch_len,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = ToCharFunc::new()
                 .invoke_with_args(args)
@@ -500,6 +552,14 @@ mod tests {
                 ScalarValue::Utf8(Some("%Y::%m::%d".to_string())),
                 StringArray::from(vec!["2020::09::01", "2020::09::02"]),
             ),
+            (
+                Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef,
+                ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())),
+                StringArray::from(vec![
+                    "2020::09::01 00::00::00 000000000",
+                    "2020::09::02 00::00::00 000000000",
+                ]),
+            ),
             (
                 Arc::new(Date64Array::from(vec![
                     date.and_utc().timestamp_millis(),
@@ -516,6 +576,25 @@ mod tests {
                 StringArray::from(vec!["%Y::%m::%d", "%d::%m::%Y"]),
                 StringArray::from(vec!["2020::09::01", "02::09::2020"]),
             ),
+            (
+                Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef,
+                StringArray::from(vec![
+                    "%Y::%m::%d %S::%M::%H %f",
+                    "%Y::%m::%d %S::%M::%H %f",
+                ]),
+                StringArray::from(vec![
+                    "2020::09::01 00::00::00 000000000",
+                    "2020::09::02 00::00::00 000000000",
+                ]),
+            ),
+            (
+                Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef,
+                StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]),
+                StringArray::from(vec![
+                    "2020::09::01",
+                    "2020::09::02 00::00::00 000000000",
+                ]),
+            ),
             (
                 Arc::new(Date64Array::from(vec![
                     date.and_utc().timestamp_millis(),
@@ -610,7 +689,7 @@ mod tests {
                 Field::new("a", value.data_type().clone(), false).into(),
                 Field::new("a", format.data_type(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![
                     ColumnarValue::Array(value as ArrayRef),
                     ColumnarValue::Scalar(format),
@@ -618,6 +697,7 @@ mod tests {
                 arg_fields,
                 number_rows: batch_len,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = ToCharFunc::new()
                 .invoke_with_args(args)
@@ -637,7 +717,7 @@ mod tests {
                 Field::new("a", value.data_type().clone(), false).into(),
                 Field::new("a", format.data_type().clone(), false).into(),
             ];
-            let args = datafusion_expr::ScalarFunctionArgs {
+            let args = ScalarFunctionArgs {
                 args: vec![
                     ColumnarValue::Array(value),
                     ColumnarValue::Array(Arc::new(format) as ArrayRef),
@@ -645,6 +725,7 @@ mod tests {
                 arg_fields,
                 number_rows: batch_len,
                 return_field: Field::new("f", DataType::Utf8, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = ToCharFunc::new()
                 .invoke_with_args(args)
@@ -664,11 +745,12 @@ mod tests {
 
         // invalid number of arguments
         let arg_field = Field::new("a", DataType::Int32, true).into();
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args: vec![ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))],
             arg_fields: vec![arg_field],
             number_rows: 1,
             return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = ToCharFunc::new().invoke_with_args(args);
         assert_eq!(
@@ -681,7 +763,7 @@ mod tests {
             Field::new("a", DataType::Utf8, true).into(),
             Field::new("a", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into(),
         ];
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args: vec![
                 ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
                 ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)),
@@ -689,11 +771,12 @@ mod tests {
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = ToCharFunc::new().invoke_with_args(args);
         assert_eq!(
             result.err().unwrap().strip_backtrace(),
-            "Execution error: Format for `to_char` must be non-null Utf8, received Timestamp(Nanosecond, None)"
+            "Execution error: Format for `to_char` must be non-null Utf8, received Timestamp(ns)"
         );
     }
 }
diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs
index c9fd17dbef11f..55801b5db4d54 100644
--- a/datafusion/functions/src/datetime/to_date.rs
+++ b/datafusion/functions/src/datetime/to_date.rs
@@ -16,14 +16,16 @@
 // under the License.
 
 use crate::datetime::common::*;
+use arrow::compute::cast_with_options;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::*;
 use arrow::error::ArrowError::ParseError;
 use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser};
-use datafusion_common::error::DataFusionError;
-use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result};
+use datafusion_common::format::DEFAULT_CAST_OPTIONS;
+use datafusion_common::{Result, arrow_err, exec_err, internal_datafusion_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -31,7 +33,7 @@ use std::any::Any;
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
     description = r"Converts a value to a date (`YYYY-MM-DD`).
-Supports strings, integer and double types as input.
+Supports strings, numeric and timestamp types as input.
 Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided.
 Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`).
 Returns the corresponding date.
@@ -39,7 +41,7 @@ Returns the corresponding date.
 Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`.",
     syntax_example = "to_date('2017-05-31', '%Y-%m-%d')",
     sql_example = r#"```sql
-> select to_date('2023-01-31'); 
+> select to_date('2023-01-31');
 +-------------------------------+
 | to_date(Utf8("2023-01-31")) |
 +-------------------------------+
@@ -53,7 +55,7 @@ Note: `to_date` returns Date32, which represents its values as the number of day
 +---------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     standard_argument(name = "expression", prefix = "String"),
     argument(
@@ -63,7 +65,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
   an error will be returned."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToDateFunc {
     signature: Signature,
 }
@@ -83,7 +85,7 @@ impl ToDateFunc {
 
     fn to_date(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         match args.len() {
-            1 => handle::<Date32Type, _, Date32Type>(
+            1 => handle::<Date32Type, _>(
                 args,
                 |s| match Date32Type::parse(s) {
                     Some(v) => Ok(v),
@@ -93,8 +95,9 @@ impl ToDateFunc {
                     )),
                 },
                 "to_date",
+                &Date32,
             ),
-            2.. => handle_multiple::<Date32Type, _, Date32Type, _>(
+            2.. => handle_multiple::<Date32Type, _, _>(
                 args,
                 |s, format| {
                     string_to_timestamp_millis_formatted(s, format)
@@ -107,6 +110,7 @@ impl ToDateFunc {
                 },
                 |n| n,
                 "to_date",
+                &Date32,
             ),
             0 => exec_err!("Unsupported 0 argument count for function to_date"),
         }
@@ -130,10 +134,7 @@ impl ScalarUDFImpl for ToDateFunc {
         Ok(Date32)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = args.args;
         if args.is_empty() {
             return exec_err!("to_date function requires 1 or more arguments, got 0");
@@ -145,12 +146,45 @@ impl ScalarUDFImpl for ToDateFunc {
         }
 
         match args[0].data_type() {
-            Int32 | Int64 | Null | Float64 | Date32 | Date64 => {
+            Null | Int32 | Int64 | Date32 | Date64 | Timestamp(_, _) => {
                 args[0].cast_to(&Date32, None)
             }
+            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 => {
+                // Arrow cast doesn't support direct casting of these types to date32
+                // as it only supports Int32 and Int64. To work around that limitation,
+                // use cast_with_options to cast to Int32 and then cast the result of
+                // that to Date32.
+                match &args[0] {
+                    ColumnarValue::Array(array) => {
+                        Ok(ColumnarValue::Array(cast_with_options(
+                            &cast_with_options(&array, &Int32, &DEFAULT_CAST_OPTIONS)?,
+                            &Date32,
+                            &DEFAULT_CAST_OPTIONS,
+                        )?))
+                    }
+                    ColumnarValue::Scalar(scalar) => {
+                        let sv =
+                            scalar.cast_to_with_options(&Int32, &DEFAULT_CAST_OPTIONS)?;
+                        Ok(ColumnarValue::Scalar(
+                            sv.cast_to_with_options(&Date32, &DEFAULT_CAST_OPTIONS)?,
+                        ))
+                    }
+                }
+            }
+            Float16
+            | Float32
+            | Float64
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => {
+                // The only way this makes sense is to get the Int64 value of the float
+                // or decimal and then cast that to Date32.
+                args[0].cast_to(&Int64, None)?.cast_to(&Date32, None)
+            }
             Utf8View | LargeUtf8 | Utf8 => self.to_date(&args),
             other => {
-                exec_err!("Unsupported data type {:?} for function to_date", other)
+                exec_err!("Unsupported data type {} for function to_date", other)
             }
         }
     }
@@ -162,15 +196,15 @@ impl ScalarUDFImpl for ToDateFunc {
 
 #[cfg(test)]
 mod tests {
+    use super::ToDateFunc;
     use arrow::array::{Array, Date32Array, GenericStringArray, StringViewArray};
     use arrow::datatypes::{DataType, Field};
     use arrow::{compute::kernels::cast_utils::Parser, datatypes::Date32Type};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{DataFusionError, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
 
-    use super::ToDateFunc;
-
     fn invoke_to_date_with_args(
         args: Vec<ColumnarValue>,
         number_rows: usize,
@@ -180,11 +214,12 @@ mod tests {
             .map(|arg| Field::new("a", arg.data_type(), true).into())
             .collect::<Vec<_>>();
 
-        let args = datafusion_expr::ScalarFunctionArgs {
+        let args = ScalarFunctionArgs {
             args,
             arg_fields,
             number_rows,
             return_field: Field::new("f", DataType::Date32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         ToDateFunc::new().invoke_with_args(args)
     }
@@ -351,7 +386,11 @@ mod tests {
             match to_date_result {
                 Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
                     let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
-                    assert_eq!(date_val, expected, "{}: to_date created wrong value for date '{}' with format string '{}'", tc.name, tc.formatted_date, tc.format_str);
+                    assert_eq!(
+                        date_val, expected,
+                        "{}: to_date created wrong value for date '{}' with format string '{}'",
+                        tc.name, tc.formatted_date, tc.format_str
+                    );
                 }
                 _ => panic!(
                     "Could not convert '{}' with format string '{}'to Date",
@@ -385,7 +424,8 @@ mod tests {
                     builder.append_value(expected.unwrap());
 
                     assert_eq!(
-                        &builder.finish() as &dyn Array, a.as_ref(),
+                        &builder.finish() as &dyn Array,
+                        a.as_ref(),
                         "{}: to_date created wrong value for date '{}' with format string '{}'",
                         tc.name,
                         tc.formatted_date,
diff --git a/datafusion/functions/src/datetime/to_local_time.rs b/datafusion/functions/src/datetime/to_local_time.rs
index b9ebe537d459b..335250b377c74 100644
--- a/datafusion/functions/src/datetime/to_local_time.rs
+++ b/datafusion/functions/src/datetime/to_local_time.rs
@@ -20,7 +20,7 @@ use std::ops::Add;
 use std::sync::Arc;
 
 use arrow::array::timezone::Tz;
-use arrow::array::{Array, ArrayRef, PrimitiveBuilder};
+use arrow::array::{ArrayRef, PrimitiveBuilder};
 use arrow::datatypes::DataType::Timestamp;
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
 use arrow::datatypes::{
@@ -31,10 +31,12 @@ use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc};
 
 use datafusion_common::cast::as_primitive_array;
 use datafusion_common::{
-    exec_err, plan_err, utils::take_function_args, DataFusionError, Result, ScalarValue,
+    Result, ScalarValue, exec_err, internal_datafusion_err, internal_err,
+    utils::take_function_args,
 };
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -68,11 +70,11 @@ use datafusion_macros::user_doc;
 FROM (
   SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time
 );
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
-| time                      | type                                           | to_local_time       | to_local_time_type          |
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
-| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) |
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
++---------------------------+----------------------------------+---------------------+--------------------+
+| time                      | type                             | to_local_time       | to_local_time_type |
++---------------------------+----------------------------------+---------------------+--------------------+
+| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns)      |
++---------------------------+----------------------------------+---------------------+--------------------+
 
 # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather
 # than UTC boundaries
@@ -96,7 +98,7 @@ FROM (
         description = "Time expression to operate on. Can be a constant, column, or function."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToLocalTimeFunc {
     signature: Signature,
 }
@@ -110,133 +112,160 @@ impl Default for ToLocalTimeFunc {
 impl ToLocalTimeFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Timestamp)],
+                Volatility::Immutable,
+            ),
         }
     }
+}
 
-    fn to_local_time(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        let [time_value] = take_function_args(self.name(), args)?;
+impl ScalarUDFImpl for ToLocalTimeFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
 
-        let arg_type = time_value.data_type();
-        match arg_type {
-            Timestamp(_, None) => {
-                // if no timezone specified, just return the input
-                Ok(time_value.clone())
-            }
-            // If has timezone, adjust the underlying time value. The current time value
-            // is stored as i64 in UTC, even though the timezone may not be in UTC. Therefore,
-            // we need to adjust the time value to the local time. See [`adjust_to_local_time`]
-            // for more details.
-            //
-            // Then remove the timezone in return type, i.e. return None
-            Timestamp(_, Some(timezone)) => {
-                let tz: Tz = timezone.parse()?;
-
-                match time_value {
-                    ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                        Some(ts),
-                        Some(_),
-                    )) => {
-                        let adjusted_ts =
-                            adjust_to_local_time::<TimestampNanosecondType>(*ts, tz)?;
-                        Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                            Some(adjusted_ts),
-                            None,
-                        )))
-                    }
-                    ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
-                        Some(ts),
-                        Some(_),
-                    )) => {
-                        let adjusted_ts =
-                            adjust_to_local_time::<TimestampMicrosecondType>(*ts, tz)?;
-                        Ok(ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
-                            Some(adjusted_ts),
-                            None,
-                        )))
-                    }
-                    ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
-                        Some(ts),
-                        Some(_),
-                    )) => {
-                        let adjusted_ts =
-                            adjust_to_local_time::<TimestampMillisecondType>(*ts, tz)?;
-                        Ok(ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
-                            Some(adjusted_ts),
-                            None,
-                        )))
-                    }
-                    ColumnarValue::Scalar(ScalarValue::TimestampSecond(
-                        Some(ts),
-                        Some(_),
-                    )) => {
-                        let adjusted_ts =
-                            adjust_to_local_time::<TimestampSecondType>(*ts, tz)?;
-                        Ok(ColumnarValue::Scalar(ScalarValue::TimestampSecond(
-                            Some(adjusted_ts),
-                            None,
-                        )))
-                    }
-                    ColumnarValue::Array(array) => {
-                        fn transform_array<T: ArrowTimestampType>(
-                            array: &ArrayRef,
-                            tz: Tz,
-                        ) -> Result<ColumnarValue> {
-                            let mut builder = PrimitiveBuilder::<T>::new();
-
-                            let primitive_array = as_primitive_array::<T>(array)?;
-                            for ts_opt in primitive_array.iter() {
-                                match ts_opt {
-                                    None => builder.append_null(),
-                                    Some(ts) => {
-                                        let adjusted_ts: i64 =
-                                            adjust_to_local_time::<T>(ts, tz)?;
-                                        builder.append_value(adjusted_ts)
-                                    }
-                                }
-                            }
-
-                            Ok(ColumnarValue::Array(Arc::new(builder.finish())))
-                        }
-
-                        match array.data_type() {
-                            Timestamp(_, None) => {
-                                // if no timezone specified, just return the input
-                                Ok(time_value.clone())
-                            }
-                            Timestamp(Nanosecond, Some(_)) => {
-                                transform_array::<TimestampNanosecondType>(array, tz)
-                            }
-                            Timestamp(Microsecond, Some(_)) => {
-                                transform_array::<TimestampMicrosecondType>(array, tz)
-                            }
-                            Timestamp(Millisecond, Some(_)) => {
-                                transform_array::<TimestampMillisecondType>(array, tz)
-                            }
-                            Timestamp(Second, Some(_)) => {
-                                transform_array::<TimestampSecondType>(array, tz)
-                            }
-                            _ => {
-                                exec_err!("to_local_time function requires timestamp argument in array, got {:?}", array.data_type())
-                            }
-                        }
-                    }
-                    _ => {
-                        exec_err!(
-                        "to_local_time function requires timestamp argument, got {:?}",
-                        time_value.data_type()
-                    )
-                    }
-                }
-            }
-            _ => {
-                exec_err!(
-                    "to_local_time function requires timestamp argument, got {:?}",
-                    arg_type
-                )
+    fn name(&self) -> &str {
+        "to_local_time"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match &arg_types[0] {
+            DataType::Null => Ok(Timestamp(Nanosecond, None)),
+            Timestamp(timeunit, _) => Ok(Timestamp(*timeunit, None)),
+            dt => internal_err!(
+                "The to_local_time function can only accept timestamp as the arg, got {dt}"
+            ),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [time_value] = take_function_args(self.name(), &args.args)?;
+        to_local_time(time_value)
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+fn transform_array<T: ArrowTimestampType>(
+    array: &ArrayRef,
+    tz: Tz,
+) -> Result<ColumnarValue> {
+    let primitive_array = as_primitive_array::<T>(array)?;
+    let mut builder = PrimitiveBuilder::<T>::with_capacity(primitive_array.len());
+    for ts_opt in primitive_array.iter() {
+        match ts_opt {
+            None => builder.append_null(),
+            Some(ts) => {
+                let adjusted_ts: i64 = adjust_to_local_time::<T>(ts, tz)?;
+                builder.append_value(adjusted_ts)
             }
         }
     }
+
+    Ok(ColumnarValue::Array(Arc::new(builder.finish())))
+}
+
+fn to_local_time(time_value: &ColumnarValue) -> Result<ColumnarValue> {
+    let arg_type = time_value.data_type();
+
+    let tz: Tz = match &arg_type {
+        Timestamp(_, Some(timezone)) => timezone.parse()?,
+        Timestamp(_, None) => {
+            // if no timezone specified, just return the input
+            return Ok(time_value.clone());
+        }
+        DataType::Null => {
+            return Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                None, None,
+            )));
+        }
+        dt => {
+            return internal_err!(
+                "to_local_time function requires timestamp argument, got {dt}"
+            );
+        }
+    };
+
+    // If has timezone, adjust the underlying time value. The current time value
+    // is stored as i64 in UTC, even though the timezone may not be in UTC. Therefore,
+    // we need to adjust the time value to the local time. See [`adjust_to_local_time`]
+    // for more details.
+    //
+    // Then remove the timezone in return type, i.e. return None
+    match time_value {
+        ColumnarValue::Scalar(ScalarValue::TimestampSecond(None, Some(_))) => Ok(
+            ColumnarValue::Scalar(ScalarValue::TimestampSecond(None, None)),
+        ),
+        ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(None, Some(_))) => Ok(
+            ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(None, None)),
+        ),
+        ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(None, Some(_))) => Ok(
+            ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(None, None)),
+        ),
+        ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(None, Some(_))) => Ok(
+            ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(None, None)),
+        ),
+        ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(ts), Some(_))) => {
+            let adjusted_ts = adjust_to_local_time::<TimestampNanosecondType>(*ts, tz)?;
+            Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                Some(adjusted_ts),
+                None,
+            )))
+        }
+        ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(ts), Some(_))) => {
+            let adjusted_ts = adjust_to_local_time::<TimestampMicrosecondType>(*ts, tz)?;
+            Ok(ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
+                Some(adjusted_ts),
+                None,
+            )))
+        }
+        ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(Some(ts), Some(_))) => {
+            let adjusted_ts = adjust_to_local_time::<TimestampMillisecondType>(*ts, tz)?;
+            Ok(ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
+                Some(adjusted_ts),
+                None,
+            )))
+        }
+        ColumnarValue::Scalar(ScalarValue::TimestampSecond(Some(ts), Some(_))) => {
+            let adjusted_ts = adjust_to_local_time::<TimestampSecondType>(*ts, tz)?;
+            Ok(ColumnarValue::Scalar(ScalarValue::TimestampSecond(
+                Some(adjusted_ts),
+                None,
+            )))
+        }
+        ColumnarValue::Array(array)
+            if matches!(array.data_type(), Timestamp(Nanosecond, Some(_))) =>
+        {
+            transform_array::<TimestampNanosecondType>(array, tz)
+        }
+        ColumnarValue::Array(array)
+            if matches!(array.data_type(), Timestamp(Microsecond, Some(_))) =>
+        {
+            transform_array::<TimestampMicrosecondType>(array, tz)
+        }
+        ColumnarValue::Array(array)
+            if matches!(array.data_type(), Timestamp(Millisecond, Some(_))) =>
+        {
+            transform_array::<TimestampMillisecondType>(array, tz)
+        }
+        ColumnarValue::Array(array)
+            if matches!(array.data_type(), Timestamp(Second, Some(_))) =>
+        {
+            transform_array::<TimestampSecondType>(array, tz)
+        }
+        _ => {
+            internal_err!(
+                "to_local_time function requires timestamp argument, got {arg_type}"
+            )
+        }
+    }
 }
 
 /// This function converts a timestamp with a timezone to a timestamp without a timezone.
@@ -292,7 +321,7 @@ impl ToLocalTimeFunc {
 /// ```
 ///
 /// See `test_adjust_to_local_time()` for example
-fn adjust_to_local_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
+pub fn adjust_to_local_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
     fn convert_timestamp<F>(ts: i64, converter: F) -> Result<DateTime<Utc>>
     where
         F: Fn(i64) -> MappedLocalTime<DateTime<Utc>>,
@@ -326,15 +355,15 @@ fn adjust_to_local_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
         // This should not fail under normal circumstances as the
         // maximum possible offset is 26 hours (93,600 seconds)
         TimeDelta::try_seconds(offset_seconds)
-            .ok_or(DataFusionError::Internal("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000".to_string()))?,
+            .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?,
     );
 
     // convert the naive datetime back to i64
     match T::UNIT {
-        Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or(
-            DataFusionError::Internal(
-                "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807".to_string(),
-            ),
+        Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(||
+            internal_datafusion_err!(
+                "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807"
+            )
         ),
         Microsecond => Ok(adjusted_date_time.timestamp_micros()),
         Millisecond => Ok(adjusted_date_time.timestamp_millis()),
@@ -342,79 +371,19 @@ fn adjust_to_local_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
     }
 }
 
-impl ScalarUDFImpl for ToLocalTimeFunc {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "to_local_time"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        let [time_value] = take_function_args(self.name(), arg_types)?;
-
-        match time_value {
-            Timestamp(timeunit, _) => Ok(Timestamp(*timeunit, None)),
-            _ => exec_err!(
-                "The to_local_time function can only accept timestamp as the arg, got {:?}", time_value
-            )
-        }
-    }
-
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let [time_value] = take_function_args(self.name(), args.args)?;
-
-        self.to_local_time(&[time_value.clone()])
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() != 1 {
-            return plan_err!(
-                "to_local_time function requires 1 argument, got {:?}",
-                arg_types.len()
-            );
-        }
-
-        let first_arg = arg_types[0].clone();
-        match &first_arg {
-            Timestamp(Nanosecond, timezone) => {
-                Ok(vec![Timestamp(Nanosecond, timezone.clone())])
-            }
-            Timestamp(Microsecond, timezone) => {
-                Ok(vec![Timestamp(Microsecond, timezone.clone())])
-            }
-            Timestamp(Millisecond, timezone) => {
-                Ok(vec![Timestamp(Millisecond, timezone.clone())])
-            }
-            Timestamp(Second, timezone) => Ok(vec![Timestamp(Second, timezone.clone())]),
-            _ => plan_err!("The to_local_time function can only accept Timestamp as the arg got {first_arg}"),
-        }
-    }
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
 
-    use arrow::array::{types::TimestampNanosecondType, Array, TimestampNanosecondArray};
+    use arrow::array::{Array, TimestampNanosecondArray, types::TimestampNanosecondType};
     use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
     use arrow::datatypes::{DataType, Field, TimeUnit};
     use chrono::NaiveDateTime;
     use datafusion_common::ScalarValue;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
-    use super::{adjust_to_local_time, ToLocalTimeFunc};
+    use super::{ToLocalTimeFunc, adjust_to_local_time};
 
     #[test]
     fn test_adjust_to_local_time() {
@@ -545,6 +514,7 @@ mod tests {
                 arg_fields: vec![arg_field],
                 number_rows: 1,
                 return_field: Field::new("f", expected.data_type(), true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
             })
             .unwrap();
         match res {
@@ -615,6 +585,7 @@ mod tests {
                     true,
                 )
                 .into(),
+                config_options: Arc::new(ConfigOptions::default()),
             };
             let result = ToLocalTimeFunc::new().invoke_with_args(args).unwrap();
             if let ColumnarValue::Array(result) = result {
diff --git a/datafusion/functions/src/datetime/to_time.rs b/datafusion/functions/src/datetime/to_time.rs
new file mode 100644
index 0000000000000..952e68c4888f3
--- /dev/null
+++ b/datafusion/functions/src/datetime/to_time.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::datetime::common::*;
+use arrow::array::builder::PrimitiveBuilder;
+use arrow::array::cast::AsArray;
+use arrow::array::temporal_conversions::time_to_time64ns;
+use arrow::array::types::Time64NanosecondType;
+use arrow::array::{Array, PrimitiveArray, StringArrayType};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::*;
+use chrono::NaiveTime;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Default time formats to try when parsing without an explicit format
+const DEFAULT_TIME_FORMATS: &[&str] = &[
+    "%H:%M:%S%.f", // 12:30:45.123456789
+    "%H:%M:%S",    // 12:30:45
+    "%H:%M",       // 12:30
+];
+
+#[user_doc(
+    doc_section(label = "Time and Date Functions"),
+    description = r"Converts a value to a time (`HH:MM:SS.nnnnnnnnn`).
+Supports strings and timestamps as input.
+Strings are parsed as `HH:MM:SS`, `HH:MM:SS.nnnnnnnnn`, or `HH:MM` if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided.
+Timestamps will have the time portion extracted.
+Returns the corresponding time.
+
+Note: `to_time` returns Time64(Nanosecond), which represents the time of day in nanoseconds since midnight.",
+    syntax_example = "to_time('12:30:45', '%H:%M:%S')",
+    sql_example = r#"```sql
+> select to_time('12:30:45');
++---------------------------+
+| to_time(Utf8("12:30:45")) |
++---------------------------+
+| 12:30:45                  |
++---------------------------+
+> select to_time('12-30-45', '%H-%M-%S');
++--------------------------------------------+
+| to_time(Utf8("12-30-45"),Utf8("%H-%M-%S")) |
++--------------------------------------------+
+| 12:30:45                                   |
++--------------------------------------------+
+> select to_time('2024-01-15 14:30:45'::timestamp);
++--------------------------------------------------+
+| to_time(Utf8("2024-01-15 14:30:45"))             |
++--------------------------------------------------+
+| 14:30:45                                         |
++--------------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
+"#,
+    standard_argument(name = "expression", prefix = "String or Timestamp"),
+    argument(
+        name = "format_n",
+        description = r"Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order
+  they appear with the first successful one being returned. If none of the formats successfully parse the expression
+  an error will be returned."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ToTimeFunc {
+    signature: Signature,
+}
+
+impl Default for ToTimeFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ToTimeFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ToTimeFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "to_time"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Time64(arrow::datatypes::TimeUnit::Nanosecond))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let args = args.args;
+        if args.is_empty() {
+            return exec_err!("to_time function requires 1 or more arguments, got 0");
+        }
+
+        // validate that any args after the first one are Utf8
+        if args.len() > 1 {
+            validate_data_types(&args, "to_time")?;
+        }
+
+        match args[0].data_type() {
+            Utf8View | LargeUtf8 | Utf8 => string_to_time(&args),
+            Null => Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(None))),
+            // Support timestamp input by extracting time portion using Arrow cast
+            Timestamp(_, _) => timestamp_to_time(&args[0]),
+            other => {
+                exec_err!("Unsupported data type {} for function to_time", other)
+            }
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Convert string arguments to time (standalone function, not a method on ToTimeFunc)
+fn string_to_time(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    let formats = collect_formats(args)?;
+
+    match &args[0] {
+        ColumnarValue::Scalar(ScalarValue::Utf8(s))
+        | ColumnarValue::Scalar(ScalarValue::LargeUtf8(s))
+        | ColumnarValue::Scalar(ScalarValue::Utf8View(s)) => {
+            let result = s
+                .as_ref()
+                .map(|s| parse_time_with_formats(s, &formats))
+                .transpose()?;
+            Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(result)))
+        }
+        ColumnarValue::Array(array) => {
+            let result = match array.data_type() {
+                Utf8 => parse_time_array(&array.as_string::<i32>(), &formats)?,
+                LargeUtf8 => parse_time_array(&array.as_string::<i64>(), &formats)?,
+                Utf8View => parse_time_array(&array.as_string_view(), &formats)?,
+                other => return exec_err!("Unsupported type for to_time: {other}"),
+            };
+            Ok(ColumnarValue::Array(Arc::new(result)))
+        }
+        other => exec_err!("Unsupported argument for to_time: {other:?}"),
+    }
+}
+
+/// Collect format strings from arguments, erroring on non-scalar inputs
+fn collect_formats(args: &[ColumnarValue]) -> Result<Vec<&str>> {
+    if args.len() <= 1 {
+        return Ok(DEFAULT_TIME_FORMATS.to_vec());
+    }
+
+    let mut formats = Vec::with_capacity(args.len() - 1);
+    for (i, arg) in args[1..].iter().enumerate() {
+        match arg {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s)))
+            | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s)))
+            | ColumnarValue::Scalar(ScalarValue::Utf8View(Some(s))) => {
+                formats.push(s.as_str());
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(None))
+            | ColumnarValue::Scalar(ScalarValue::LargeUtf8(None))
+            | ColumnarValue::Scalar(ScalarValue::Utf8View(None)) => {
+                // Skip null format strings
+            }
+            ColumnarValue::Array(_) => {
+                return exec_err!(
+                    "to_time format argument {} must be a scalar, not an array",
+                    i + 2 // argument position (1-indexed, +1 for the first arg)
+                );
+            }
+            other => {
+                return exec_err!(
+                    "to_time format argument {} has unsupported type: {:?}",
+                    i + 2,
+                    other.data_type()
+                );
+            }
+        }
+    }
+    Ok(formats)
+}
+
+/// Extract time portion from timestamp using Arrow cast kernel
+fn timestamp_to_time(arg: &ColumnarValue) -> Result<ColumnarValue> {
+    arg.cast_to(&Time64(arrow::datatypes::TimeUnit::Nanosecond), None)
+}
+
+/// Parse time array using the provided formats
+fn parse_time_array<'a, A: StringArrayType<'a>>(
+    array: &A,
+    formats: &[&str],
+) -> Result<PrimitiveArray<Time64NanosecondType>> {
+    let mut builder: PrimitiveBuilder<Time64NanosecondType> =
+        PrimitiveArray::builder(array.len());
+
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            builder.append_null();
+        } else {
+            let s = array.value(i);
+            let nanos = parse_time_with_formats(s, formats)?;
+            builder.append_value(nanos);
+        }
+    }
+
+    Ok(builder.finish())
+}
+
+/// Parse time string using provided formats
+fn parse_time_with_formats(s: &str, formats: &[&str]) -> Result<i64> {
+    for format in formats {
+        if let Ok(time) = NaiveTime::parse_from_str(s, format) {
+            // Use Arrow's time_to_time64ns function instead of custom implementation
+            return Ok(time_to_time64ns(time));
+        }
+    }
+    exec_err!(
+        "Error parsing '{}' as time. Tried formats: {:?}",
+        s,
+        formats
+    )
+}
diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs
index 8b26a1c259505..08839041e5304 100644
--- a/datafusion/functions/src/datetime/to_timestamp.rs
+++ b/datafusion/functions/src/datetime/to_timestamp.rs
@@ -19,24 +19,42 @@ use std::any::Any;
 use std::sync::Arc;
 
 use crate::datetime::common::*;
+use arrow::array::timezone::Tz;
+use arrow::array::{
+    Array, Decimal128Array, Float16Array, Float32Array, Float64Array,
+    TimestampNanosecondArray,
+};
 use arrow::datatypes::DataType::*;
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
 use arrow::datatypes::{
-    ArrowTimestampType, DataType, TimeUnit, TimestampMicrosecondType,
-    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
+    ArrowTimestampType, DataType, TimestampMicrosecondType, TimestampMillisecondType,
+    TimestampNanosecondType, TimestampSecondType,
 };
-use datafusion_common::{exec_err, Result, ScalarType, ScalarValue};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{Result, ScalarType, ScalarValue, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
+    Signature, Volatility,
 };
 use datafusion_macros::user_doc;
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
     description = r#"
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
-
-Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+Note: `to_timestamp` returns `Timestamp(ns, TimeZone)` where the time zone is the session time zone. The supported range
+for integer input is between`-9223372037` and `9223372036`. Supported range for string input is between
+`1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds`
+for the input outside of supported bounds.
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 "#,
     syntax_example = "to_timestamp(expression[, ..., format_n])",
     sql_example = r#"```sql
@@ -53,7 +71,7 @@ Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for in
 | 2023-05-17T03:59:00.123456789                                                                          |
 +--------------------------------------------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -61,17 +79,33 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.",
+    description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
+"#,
     syntax_example = "to_timestamp_seconds(expression[, ..., format_n])",
     sql_example = r#"```sql
 > select to_timestamp_seconds('2023-01-31T09:26:56.123456789-05:00');
@@ -87,7 +121,7 @@ pub struct ToTimestampFunc {
 | 2023-05-17T03:59:00                                                                                            |
 +----------------------------------------------------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -95,17 +129,33 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampSecondsFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. Integers and unsigned integers are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.",
+    description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
+"#,
     syntax_example = "to_timestamp_millis(expression[, ..., format_n])",
     sql_example = r#"```sql
 > select to_timestamp_millis('2023-01-31T09:26:56.123456789-05:00');
@@ -121,7 +171,7 @@ pub struct ToTimestampSecondsFunc {
 | 2023-05-17T03:59:00.123                                                                                       |
 +---------------------------------------------------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -129,17 +179,33 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampMillisFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`) Returns the corresponding timestamp.",
+    description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
+"#,
     syntax_example = "to_timestamp_micros(expression[, ..., format_n])",
     sql_example = r#"```sql
 > select to_timestamp_micros('2023-01-31T09:26:56.123456789-05:00');
@@ -155,7 +221,7 @@ pub struct ToTimestampMillisFunc {
 | 2023-05-17T03:59:00.123456                                                                                    |
 +---------------------------------------------------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -163,17 +229,32 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampMicrosFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.",
+    description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+Strings that parse without a time zone are treated as if they are in the
+session time zone. Integers, unsigned integers, and doubles are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
+"#,
     syntax_example = "to_timestamp_nanos(expression[, ..., format_n])",
     sql_example = r#"```sql
 > select to_timestamp_nanos('2023-01-31T09:26:56.123456789-05:00');
@@ -189,7 +270,7 @@ pub struct ToTimestampMicrosFunc {
 | 2023-05-17T03:59:00.123456789                                                                                |
 +---------------------------------------------------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 "#,
     argument(
         name = "expression",
@@ -197,81 +278,97 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampNanosFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
-impl Default for ToTimestampFunc {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl ToTimestampFunc {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
-        }
-    }
-}
-
-impl Default for ToTimestampSecondsFunc {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl ToTimestampSecondsFunc {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+/// Macro to generate boilerplate constructors and config methods for ToTimestamp* functions.
+/// Generates: Default impl, deprecated new(), new_with_config(), and extracts timezone from ConfigOptions.
+macro_rules! impl_to_timestamp_constructors {
+    ($func:ty) => {
+        impl Default for $func {
+            fn default() -> Self {
+                Self::new_with_config(&ConfigOptions::default())
+            }
         }
-    }
-}
 
-impl Default for ToTimestampMillisFunc {
-    fn default() -> Self {
-        Self::new()
-    }
-}
+        impl $func {
+            #[deprecated(since = "52.0.0", note = "use `new_with_config` instead")]
+            /// Deprecated constructor retained for backwards compatibility.
+            ///
+            /// Prefer `new_with_config` which allows specifying the
+            /// timezone via [`ConfigOptions`]. This helper now mirrors the
+            /// canonical default offset (None) provided by `ConfigOptions::default()`.
+            pub fn new() -> Self {
+                Self::new_with_config(&ConfigOptions::default())
+            }
 
-impl ToTimestampMillisFunc {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+            pub fn new_with_config(config: &ConfigOptions) -> Self {
+                Self {
+                    signature: Signature::variadic_any(Volatility::Immutable),
+                    timezone: config
+                        .execution
+                        .time_zone
+                        .as_ref()
+                        .map(|tz| Arc::from(tz.as_str())),
+                }
+            }
         }
-    }
+    };
 }
 
-impl Default for ToTimestampMicrosFunc {
-    fn default() -> Self {
-        Self::new()
-    }
+impl_to_timestamp_constructors!(ToTimestampFunc);
+impl_to_timestamp_constructors!(ToTimestampSecondsFunc);
+impl_to_timestamp_constructors!(ToTimestampMillisFunc);
+impl_to_timestamp_constructors!(ToTimestampMicrosFunc);
+impl_to_timestamp_constructors!(ToTimestampNanosFunc);
+
+fn decimal_to_nanoseconds(value: i128, scale: i8) -> i64 {
+    let nanos_exponent = 9_i16 - scale as i16;
+    let timestamp_nanos = if nanos_exponent >= 0 {
+        value * 10_i128.pow(nanos_exponent as u32)
+    } else {
+        value / 10_i128.pow(nanos_exponent.unsigned_abs() as u32)
+    };
+    timestamp_nanos as i64
 }
 
-impl ToTimestampMicrosFunc {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+fn decimal128_to_timestamp_nanos(
+    arg: &ColumnarValue,
+    tz: Option<Arc<str>>,
+) -> Result<ColumnarValue> {
+    match arg {
+        ColumnarValue::Scalar(ScalarValue::Decimal128(Some(value), _, scale)) => {
+            let timestamp_nanos = decimal_to_nanoseconds(*value, *scale);
+            Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                Some(timestamp_nanos),
+                tz,
+            )))
         }
-    }
-}
-
-impl Default for ToTimestampNanosFunc {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl ToTimestampNanosFunc {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+        ColumnarValue::Scalar(ScalarValue::Decimal128(None, _, _)) => Ok(
+            ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(None, tz)),
+        ),
+        ColumnarValue::Array(arr) => {
+            let decimal_arr = downcast_arg!(arr, Decimal128Array);
+            let scale = decimal_arr.scale();
+            let result: TimestampNanosecondArray = decimal_arr
+                .iter()
+                .map(|v| v.map(|val| decimal_to_nanoseconds(val, scale)))
+                .collect();
+            let result = result.with_timezone_opt(tz);
+            Ok(ColumnarValue::Array(Arc::new(result)))
         }
+        _ => exec_err!("Invalid Decimal128 value for to_timestamp"),
     }
 }
 
@@ -281,6 +378,15 @@ impl ToTimestampNanosFunc {
 /// The supported range for integer input is between `-9223372037` and `9223372036`.
 /// Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`.
 /// Please use `to_timestamp_seconds` for the input outside of supported bounds.
+/// Macro to generate the with_updated_config method for ToTimestamp* functions.
+macro_rules! impl_with_updated_config {
+    () => {
+        fn with_updated_config(&self, config: &ConfigOptions) -> Option<ScalarUDF> {
+            Some(Self::new_with_config(config).into())
+        }
+    };
+}
+
 impl ScalarUDFImpl for ToTimestampFunc {
     fn as_any(&self) -> &dyn Any {
         self
@@ -294,15 +400,15 @@ impl ScalarUDFImpl for ToTimestampFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(return_type_for(&arg_types[0], Nanosecond))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Timestamp(Nanosecond, self.timezone.clone()))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = args.args;
+    impl_with_updated_config!();
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+
         if args.is_empty() {
             return exec_err!(
                 "to_timestamp function requires 1 or more arguments, got {}",
@@ -315,51 +421,77 @@ impl ScalarUDFImpl for ToTimestampFunc {
             validate_data_types(&args, "to_timestamp")?;
         }
 
+        let tz = self.timezone.clone();
+
         match args[0].data_type() {
-            Int32 | Int64 => args[0]
+            Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 => args[0]
                 .cast_to(&Timestamp(Second, None), None)?
-                .cast_to(&Timestamp(Nanosecond, None), None),
-            Null | Float64 | Timestamp(_, None) => {
-                args[0].cast_to(&Timestamp(Nanosecond, None), None)
-            }
-            Timestamp(_, Some(tz)) => {
-                args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None)
+                .cast_to(&Timestamp(Nanosecond, tz), None),
+            Null | Timestamp(_, _) => args[0].cast_to(&Timestamp(Nanosecond, tz), None),
+            Float16 => match &args[0] {
+                ColumnarValue::Scalar(ScalarValue::Float16(value)) => {
+                    let timestamp_nanos =
+                        value.map(|v| (v.to_f64() * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                        timestamp_nanos,
+                        tz,
+                    )))
+                }
+                ColumnarValue::Array(arr) => {
+                    let f16_arr = downcast_arg!(arr, Float16Array);
+                    let result: TimestampNanosecondArray =
+                        f16_arr.unary(|x| (x.to_f64() * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Array(Arc::new(result.with_timezone_opt(tz))))
+                }
+                _ => exec_err!("Invalid Float16 value for to_timestamp"),
+            },
+            Float32 => match &args[0] {
+                ColumnarValue::Scalar(ScalarValue::Float32(value)) => {
+                    let timestamp_nanos =
+                        value.map(|v| (v as f64 * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                        timestamp_nanos,
+                        tz,
+                    )))
+                }
+                ColumnarValue::Array(arr) => {
+                    let f32_arr = downcast_arg!(arr, Float32Array);
+                    let result: TimestampNanosecondArray =
+                        f32_arr.unary(|x| (x as f64 * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Array(Arc::new(result.with_timezone_opt(tz))))
+                }
+                _ => exec_err!("Invalid Float32 value for to_timestamp"),
+            },
+            Float64 => match &args[0] {
+                ColumnarValue::Scalar(ScalarValue::Float64(value)) => {
+                    let timestamp_nanos = value.map(|v| (v * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                        timestamp_nanos,
+                        tz,
+                    )))
+                }
+                ColumnarValue::Array(arr) => {
+                    let f64_arr = downcast_arg!(arr, Float64Array);
+                    let result: TimestampNanosecondArray =
+                        f64_arr.unary(|x| (x * 1_000_000_000.0) as i64);
+                    Ok(ColumnarValue::Array(Arc::new(result.with_timezone_opt(tz))))
+                }
+                _ => exec_err!("Invalid Float64 value for to_timestamp"),
+            },
+            Decimal32(_, _) | Decimal64(_, _) | Decimal256(_, _) => {
+                let arg = args[0].cast_to(&Decimal128(38, 9), None)?;
+                decimal128_to_timestamp_nanos(&arg, tz)
             }
+            Decimal128(_, _) => decimal128_to_timestamp_nanos(&args[0], tz),
             Utf8View | LargeUtf8 | Utf8 => {
-                to_timestamp_impl::<TimestampNanosecondType>(&args, "to_timestamp")
-            }
-            Decimal128(_, _) => {
-                match &args[0] {
-                    ColumnarValue::Scalar(ScalarValue::Decimal128(
-                        Some(value),
-                        _,
-                        scale,
-                    )) => {
-                        // Convert decimal to seconds and nanoseconds
-                        let scale_factor = 10_i128.pow(*scale as u32);
-                        let seconds = value / scale_factor;
-                        let fraction = value % scale_factor;
-
-                        let nanos = (fraction * 1_000_000_000) / scale_factor;
-
-                        let timestamp_nanos = seconds * 1_000_000_000 + nanos;
-
-                        Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                            Some(timestamp_nanos as i64),
-                            None,
-                        )))
-                    }
-                    _ => exec_err!("Invalid decimal value"),
-                }
+                to_timestamp_impl::<TimestampNanosecondType>(&args, "to_timestamp", &tz)
             }
             other => {
-                exec_err!(
-                    "Unsupported data type {:?} for function to_timestamp",
-                    other
-                )
+                exec_err!("Unsupported data type {other} for function to_timestamp")
             }
         }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -378,15 +510,15 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(return_type_for(&arg_types[0], Second))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Timestamp(Second, self.timezone.clone()))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = args.args;
+    impl_with_updated_config!();
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+
         if args.is_empty() {
             return exec_err!(
                 "to_timestamp_seconds function requires 1 or more arguments, got {}",
@@ -399,22 +531,40 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc {
             validate_data_types(&args, "to_timestamp")?;
         }
 
+        let tz = self.timezone.clone();
+
         match args[0].data_type() {
-            Null | Int32 | Int64 | Timestamp(_, None) | Decimal128(_, _) => {
-                args[0].cast_to(&Timestamp(Second, None), None)
-            }
-            Timestamp(_, Some(tz)) => args[0].cast_to(&Timestamp(Second, Some(tz)), None),
-            Utf8View | LargeUtf8 | Utf8 => {
-                to_timestamp_impl::<TimestampSecondType>(&args, "to_timestamp_seconds")
-            }
+            Null
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Timestamp(_, _)
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => args[0].cast_to(&Timestamp(Second, tz), None),
+            Float16 | Float32 | Float64 => args[0]
+                .cast_to(&Int64, None)?
+                .cast_to(&Timestamp(Second, tz), None),
+            Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::<TimestampSecondType>(
+                &args,
+                "to_timestamp_seconds",
+                &self.timezone,
+            ),
             other => {
                 exec_err!(
-                    "Unsupported data type {:?} for function to_timestamp_seconds",
+                    "Unsupported data type {} for function to_timestamp_seconds",
                     other
                 )
             }
         }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -433,15 +583,15 @@ impl ScalarUDFImpl for ToTimestampMillisFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(return_type_for(&arg_types[0], Millisecond))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Timestamp(Millisecond, self.timezone.clone()))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = args.args;
+    impl_with_updated_config!();
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+
         if args.is_empty() {
             return exec_err!(
                 "to_timestamp_millis function requires 1 or more arguments, got {}",
@@ -455,24 +605,39 @@ impl ScalarUDFImpl for ToTimestampMillisFunc {
         }
 
         match args[0].data_type() {
-            Null | Int32 | Int64 | Timestamp(_, None) => {
-                args[0].cast_to(&Timestamp(Millisecond, None), None)
-            }
-            Timestamp(_, Some(tz)) => {
-                args[0].cast_to(&Timestamp(Millisecond, Some(tz)), None)
+            Null
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Timestamp(_, _)
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => {
+                args[0].cast_to(&Timestamp(Millisecond, self.timezone.clone()), None)
             }
+            Float16 | Float32 | Float64 => args[0]
+                .cast_to(&Int64, None)?
+                .cast_to(&Timestamp(Millisecond, self.timezone.clone()), None),
             Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::<TimestampMillisecondType>(
                 &args,
                 "to_timestamp_millis",
+                &self.timezone,
             ),
             other => {
                 exec_err!(
-                    "Unsupported data type {:?} for function to_timestamp_millis",
+                    "Unsupported data type {} for function to_timestamp_millis",
                     other
                 )
             }
         }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -491,15 +656,15 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(return_type_for(&arg_types[0], Microsecond))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Timestamp(Microsecond, self.timezone.clone()))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = args.args;
+    impl_with_updated_config!();
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+
         if args.is_empty() {
             return exec_err!(
                 "to_timestamp_micros function requires 1 or more arguments, got {}",
@@ -513,24 +678,39 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc {
         }
 
         match args[0].data_type() {
-            Null | Int32 | Int64 | Timestamp(_, None) => {
-                args[0].cast_to(&Timestamp(Microsecond, None), None)
-            }
-            Timestamp(_, Some(tz)) => {
-                args[0].cast_to(&Timestamp(Microsecond, Some(tz)), None)
+            Null
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Timestamp(_, _)
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => {
+                args[0].cast_to(&Timestamp(Microsecond, self.timezone.clone()), None)
             }
+            Float16 | Float32 | Float64 => args[0]
+                .cast_to(&Int64, None)?
+                .cast_to(&Timestamp(Microsecond, self.timezone.clone()), None),
             Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::<TimestampMicrosecondType>(
                 &args,
                 "to_timestamp_micros",
+                &self.timezone,
             ),
             other => {
                 exec_err!(
-                    "Unsupported data type {:?} for function to_timestamp_micros",
+                    "Unsupported data type {} for function to_timestamp_micros",
                     other
                 )
             }
         }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -549,15 +729,15 @@ impl ScalarUDFImpl for ToTimestampNanosFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(return_type_for(&arg_types[0], Nanosecond))
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Timestamp(Nanosecond, self.timezone.clone()))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = args.args;
+    impl_with_updated_config!();
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+
         if args.is_empty() {
             return exec_err!(
                 "to_timestamp_nanos function requires 1 or more arguments, got {}",
@@ -571,40 +751,48 @@ impl ScalarUDFImpl for ToTimestampNanosFunc {
         }
 
         match args[0].data_type() {
-            Null | Int32 | Int64 | Timestamp(_, None) => {
-                args[0].cast_to(&Timestamp(Nanosecond, None), None)
-            }
-            Timestamp(_, Some(tz)) => {
-                args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None)
-            }
-            Utf8View | LargeUtf8 | Utf8 => {
-                to_timestamp_impl::<TimestampNanosecondType>(&args, "to_timestamp_nanos")
+            Null
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Timestamp(_, _)
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => {
+                args[0].cast_to(&Timestamp(Nanosecond, self.timezone.clone()), None)
             }
+            Float16 | Float32 | Float64 => args[0]
+                .cast_to(&Int64, None)?
+                .cast_to(&Timestamp(Nanosecond, self.timezone.clone()), None),
+            Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::<TimestampNanosecondType>(
+                &args,
+                "to_timestamp_nanos",
+                &self.timezone,
+            ),
             other => {
                 exec_err!(
-                    "Unsupported data type {:?} for function to_timestamp_nanos",
+                    "Unsupported data type {} for function to_timestamp_nanos",
                     other
                 )
             }
         }
     }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
-/// Returns the return type for the to_timestamp_* function, preserving
-/// the timezone if it exists.
-fn return_type_for(arg: &DataType, unit: TimeUnit) -> DataType {
-    match arg {
-        Timestamp(_, Some(tz)) => Timestamp(unit, Some(Arc::clone(tz))),
-        _ => Timestamp(unit, None),
-    }
-}
-
 fn to_timestamp_impl<T: ArrowTimestampType + ScalarType<i64>>(
     args: &[ColumnarValue],
     name: &str,
+    timezone: &Option<Arc<str>>,
 ) -> Result<ColumnarValue> {
     let factor = match T::UNIT {
         Second => 1_000_000_000,
@@ -613,17 +801,26 @@ fn to_timestamp_impl<T: ArrowTimestampType + ScalarType<i64>>(
         Nanosecond => 1,
     };
 
+    let tz = match timezone.clone() {
+        Some(tz) => Some(tz.parse::<Tz>()?),
+        None => None,
+    };
+
     match args.len() {
-        1 => handle::<T, _, T>(
+        1 => handle::<T, _>(
             args,
-            |s| string_to_timestamp_nanos_shim(s).map(|n| n / factor),
+            move |s| string_to_timestamp_nanos_with_timezone(&tz, s).map(|n| n / factor),
             name,
+            &Timestamp(T::UNIT, timezone.clone()),
         ),
-        n if n >= 2 => handle_multiple::<T, _, T, _>(
+        n if n >= 2 => handle_multiple::<T, _, _>(
             args,
-            string_to_timestamp_nanos_formatted,
+            move |s, format| {
+                string_to_timestamp_nanos_formatted_with_timezone(&tz, s, format)
+            },
             |n| n / factor,
             name,
+            &Timestamp(T::UNIT, timezone.clone()),
         ),
         _ => exec_err!("Unsupported 0 argument count for function {name}"),
     }
@@ -640,34 +837,110 @@ mod tests {
     };
     use arrow::array::{ArrayRef, Int64Array, StringBuilder};
     use arrow::datatypes::{Field, TimeUnit};
-    use chrono::Utc;
-    use datafusion_common::{assert_contains, DataFusionError, ScalarValue};
-    use datafusion_expr::ScalarFunctionImplementation;
+    use chrono::{DateTime, FixedOffset, Utc};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::{DataFusionError, ScalarValue, assert_contains};
+    use datafusion_expr::{ScalarFunctionArgs, ScalarFunctionImplementation};
 
     use super::*;
 
     fn to_timestamp(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        to_timestamp_impl::<TimestampNanosecondType>(args, "to_timestamp")
+        let timezone: Option<Arc<str>> = Some("UTC".into());
+        to_timestamp_impl::<TimestampNanosecondType>(args, "to_timestamp", &timezone)
     }
 
     /// to_timestamp_millis SQL function
     fn to_timestamp_millis(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        to_timestamp_impl::<TimestampMillisecondType>(args, "to_timestamp_millis")
+        let timezone: Option<Arc<str>> = Some("UTC".into());
+        to_timestamp_impl::<TimestampMillisecondType>(
+            args,
+            "to_timestamp_millis",
+            &timezone,
+        )
     }
 
     /// to_timestamp_micros SQL function
     fn to_timestamp_micros(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        to_timestamp_impl::<TimestampMicrosecondType>(args, "to_timestamp_micros")
+        let timezone: Option<Arc<str>> = Some("UTC".into());
+        to_timestamp_impl::<TimestampMicrosecondType>(
+            args,
+            "to_timestamp_micros",
+            &timezone,
+        )
     }
 
     /// to_timestamp_nanos SQL function
     fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        to_timestamp_impl::<TimestampNanosecondType>(args, "to_timestamp_nanos")
+        let timezone: Option<Arc<str>> = Some("UTC".into());
+        to_timestamp_impl::<TimestampNanosecondType>(
+            args,
+            "to_timestamp_nanos",
+            &timezone,
+        )
     }
 
     /// to_timestamp_seconds SQL function
     fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        to_timestamp_impl::<TimestampSecondType>(args, "to_timestamp_seconds")
+        let timezone: Option<Arc<str>> = Some("UTC".into());
+        to_timestamp_impl::<TimestampSecondType>(args, "to_timestamp_seconds", &timezone)
+    }
+
+    fn udfs_and_timeunit() -> Vec<(Box<dyn ScalarUDFImpl>, TimeUnit)> {
+        let udfs: Vec<(Box<dyn ScalarUDFImpl>, TimeUnit)> = vec![
+            (
+                Box::new(ToTimestampFunc::new_with_config(&ConfigOptions::default())),
+                Nanosecond,
+            ),
+            (
+                Box::new(ToTimestampSecondsFunc::new_with_config(
+                    &ConfigOptions::default(),
+                )),
+                Second,
+            ),
+            (
+                Box::new(ToTimestampMillisFunc::new_with_config(
+                    &ConfigOptions::default(),
+                )),
+                Millisecond,
+            ),
+            (
+                Box::new(ToTimestampMicrosFunc::new_with_config(
+                    &ConfigOptions::default(),
+                )),
+                Microsecond,
+            ),
+            (
+                Box::new(ToTimestampNanosFunc::new_with_config(
+                    &ConfigOptions::default(),
+                )),
+                Nanosecond,
+            ),
+        ];
+        udfs
+    }
+
+    fn validate_expected_error(
+        options: &mut ConfigOptions,
+        args: ScalarFunctionArgs,
+        expected_err: &str,
+    ) {
+        let udfs = udfs_and_timeunit();
+
+        for (udf, _) in udfs {
+            match udf
+                .with_updated_config(options)
+                .unwrap()
+                .invoke_with_args(args.clone())
+            {
+                Ok(_) => panic!("Expected error but got success"),
+                Err(e) => {
+                    assert!(
+                        e.to_string().contains(expected_err),
+                        "Can not find expected error '{expected_err}'. Actual error '{e}'"
+                    );
+                }
+            }
+        }
     }
 
     #[test]
@@ -738,6 +1011,368 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn to_timestamp_respects_execution_timezone() -> Result<()> {
+        let udfs = udfs_and_timeunit();
+
+        let mut options = ConfigOptions::default();
+        options.execution.time_zone = Some("-05:00".to_string());
+
+        let time_zone: Option<Arc<str>> = options
+            .execution
+            .time_zone
+            .as_ref()
+            .map(|tz| Arc::from(tz.as_str()));
+
+        for (udf, time_unit) in udfs {
+            let field = Field::new("arg", Utf8, true).into();
+
+            let args = ScalarFunctionArgs {
+                args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                    "2020-09-08T13:42:29".to_string(),
+                )))],
+                arg_fields: vec![field],
+                number_rows: 1,
+                return_field: Field::new(
+                    "f",
+                    Timestamp(time_unit, Some("-05:00".into())),
+                    true,
+                )
+                .into(),
+                config_options: Arc::new(options.clone()),
+            };
+
+            let result = udf
+                .with_updated_config(&options.clone())
+                .unwrap()
+                .invoke_with_args(args)?;
+            let result = match time_unit {
+                Second => {
+                    let ColumnarValue::Scalar(ScalarValue::TimestampSecond(
+                        Some(value),
+                        tz,
+                    )) = result
+                    else {
+                        panic!("expected scalar timestamp");
+                    };
+
+                    assert_eq!(tz, time_zone);
+
+                    value
+                }
+                Millisecond => {
+                    let ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
+                        Some(value),
+                        tz,
+                    )) = result
+                    else {
+                        panic!("expected scalar timestamp");
+                    };
+
+                    assert_eq!(tz, time_zone);
+
+                    value
+                }
+                Microsecond => {
+                    let ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
+                        Some(value),
+                        tz,
+                    )) = result
+                    else {
+                        panic!("expected scalar timestamp");
+                    };
+
+                    assert_eq!(tz, time_zone);
+
+                    value
+                }
+                Nanosecond => {
+                    let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                        Some(value),
+                        tz,
+                    )) = result
+                    else {
+                        panic!("expected scalar timestamp");
+                    };
+
+                    assert_eq!(tz, time_zone);
+
+                    value
+                }
+            };
+
+            let scale = match time_unit {
+                Second => 1_000_000_000,
+                Millisecond => 1_000_000,
+                Microsecond => 1_000,
+                Nanosecond => 1,
+            };
+
+            let offset = FixedOffset::west_opt(5 * 3600).unwrap();
+            let result = Some(
+                DateTime::<Utc>::from_timestamp_nanos(result * scale)
+                    .with_timezone(&offset)
+                    .to_string(),
+            );
+
+            assert_eq!(result, Some("2020-09-08 13:42:29 -05:00".to_string()));
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn to_timestamp_formats_respects_execution_timezone() -> Result<()> {
+        let udfs = udfs_and_timeunit();
+
+        let mut options = ConfigOptions::default();
+        options.execution.time_zone = Some("-05:00".to_string());
+
+        let time_zone: Option<Arc<str>> = options
+            .execution
+            .time_zone
+            .as_ref()
+            .map(|tz| Arc::from(tz.as_str()));
+
+        let expr_field = Field::new("arg", Utf8, true).into();
+        let format_field: Arc<Field> = Field::new("fmt", Utf8, true).into();
+
+        for (udf, time_unit) in udfs {
+            for (value, format, expected_str) in [
+                (
+                    "2020-09-08 09:42:29 -05:00",
+                    "%Y-%m-%d %H:%M:%S %z",
+                    Some("2020-09-08 09:42:29 -05:00"),
+                ),
+                (
+                    "2020-09-08T13:42:29Z",
+                    "%+",
+                    Some("2020-09-08 08:42:29 -05:00"),
+                ),
+                (
+                    "2020-09-08 13:42:29 UTC",
+                    "%Y-%m-%d %H:%M:%S %Z",
+                    Some("2020-09-08 08:42:29 -05:00"),
+                ),
+                (
+                    "+0000 2024-01-01 12:00:00",
+                    "%z %Y-%m-%d %H:%M:%S",
+                    Some("2024-01-01 07:00:00 -05:00"),
+                ),
+                (
+                    "20200908134229+0100",
+                    "%Y%m%d%H%M%S%z",
+                    Some("2020-09-08 07:42:29 -05:00"),
+                ),
+                (
+                    "2020-09-08+0230 13:42",
+                    "%Y-%m-%d%z %H:%M",
+                    Some("2020-09-08 06:12:00 -05:00"),
+                ),
+            ] {
+                let args = ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Scalar(ScalarValue::Utf8(Some(value.to_string()))),
+                        ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                            format.to_string(),
+                        ))),
+                    ],
+                    arg_fields: vec![Arc::clone(&expr_field), Arc::clone(&format_field)],
+                    number_rows: 1,
+                    return_field: Field::new(
+                        "f",
+                        Timestamp(time_unit, Some("-05:00".into())),
+                        true,
+                    )
+                    .into(),
+                    config_options: Arc::new(options.clone()),
+                };
+                let result = udf
+                    .with_updated_config(&options.clone())
+                    .unwrap()
+                    .invoke_with_args(args)?;
+                let result = match time_unit {
+                    Second => {
+                        let ColumnarValue::Scalar(ScalarValue::TimestampSecond(
+                            Some(value),
+                            tz,
+                        )) = result
+                        else {
+                            panic!("expected scalar timestamp");
+                        };
+
+                        assert_eq!(tz, time_zone);
+
+                        value
+                    }
+                    Millisecond => {
+                        let ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
+                            Some(value),
+                            tz,
+                        )) = result
+                        else {
+                            panic!("expected scalar timestamp");
+                        };
+
+                        assert_eq!(tz, time_zone);
+
+                        value
+                    }
+                    Microsecond => {
+                        let ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
+                            Some(value),
+                            tz,
+                        )) = result
+                        else {
+                            panic!("expected scalar timestamp");
+                        };
+
+                        assert_eq!(tz, time_zone);
+
+                        value
+                    }
+                    Nanosecond => {
+                        let ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
+                            Some(value),
+                            tz,
+                        )) = result
+                        else {
+                            panic!("expected scalar timestamp");
+                        };
+
+                        assert_eq!(tz, time_zone);
+
+                        value
+                    }
+                };
+
+                let scale = match time_unit {
+                    Second => 1_000_000_000,
+                    Millisecond => 1_000_000,
+                    Microsecond => 1_000,
+                    Nanosecond => 1,
+                };
+                let offset = FixedOffset::west_opt(5 * 3600).unwrap();
+                let result = Some(
+                    DateTime::<Utc>::from_timestamp_nanos(result * scale)
+                        .with_timezone(&offset)
+                        .to_string(),
+                );
+
+                assert_eq!(result, expected_str.map(|s| s.to_string()));
+            }
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn to_timestamp_invalid_execution_timezone_behavior() -> Result<()> {
+        let field: Arc<Field> = Field::new("arg", Utf8, true).into();
+        let return_field: Arc<Field> =
+            Field::new("f", Timestamp(Nanosecond, None), true).into();
+
+        let mut options = ConfigOptions::default();
+        options.execution.time_zone = Some("Invalid/Timezone".to_string());
+
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                "2020-09-08T13:42:29Z".to_string(),
+            )))],
+            arg_fields: vec![Arc::clone(&field)],
+            number_rows: 1,
+            return_field: Arc::clone(&return_field),
+            config_options: Arc::new(options.clone()),
+        };
+
+        let expected_err =
+            "Invalid timezone \"Invalid/Timezone\": failed to parse timezone";
+
+        validate_expected_error(&mut options, args, expected_err);
+
+        Ok(())
+    }
+
+    #[test]
+    fn to_timestamp_formats_invalid_execution_timezone_behavior() -> Result<()> {
+        let expr_field: Arc<Field> = Field::new("arg", Utf8, true).into();
+        let format_field: Arc<Field> = Field::new("fmt", Utf8, true).into();
+        let return_field: Arc<Field> =
+            Field::new("f", Timestamp(Nanosecond, None), true).into();
+
+        let mut options = ConfigOptions::default();
+        options.execution.time_zone = Some("Invalid/Timezone".to_string());
+
+        let expected_err =
+            "Invalid timezone \"Invalid/Timezone\": failed to parse timezone";
+
+        let make_args = |value: &str, format: &str| ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(value.to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(format.to_string()))),
+            ],
+            arg_fields: vec![Arc::clone(&expr_field), Arc::clone(&format_field)],
+            number_rows: 1,
+            return_field: Arc::clone(&return_field),
+            config_options: Arc::new(options.clone()),
+        };
+
+        for (value, format, _expected_str) in [
+            (
+                "2020-09-08 09:42:29 -05:00",
+                "%Y-%m-%d %H:%M:%S %z",
+                Some("2020-09-08 09:42:29 -05:00"),
+            ),
+            (
+                "2020-09-08T13:42:29Z",
+                "%+",
+                Some("2020-09-08 08:42:29 -05:00"),
+            ),
+            (
+                "2020-09-08 13:42:29 +0000",
+                "%Y-%m-%d %H:%M:%S %z",
+                Some("2020-09-08 08:42:29 -05:00"),
+            ),
+            (
+                "+0000 2024-01-01 12:00:00",
+                "%z %Y-%m-%d %H:%M:%S",
+                Some("2024-01-01 07:00:00 -05:00"),
+            ),
+            (
+                "20200908134229+0100",
+                "%Y%m%d%H%M%S%z",
+                Some("2020-09-08 07:42:29 -05:00"),
+            ),
+            (
+                "2020-09-08+0230 13:42",
+                "%Y-%m-%d%z %H:%M",
+                Some("2020-09-08 06:12:00 -05:00"),
+            ),
+        ] {
+            let args = make_args(value, format);
+            validate_expected_error(&mut options.clone(), args, expected_err);
+        }
+
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                    "2020-09-08T13:42:29".to_string(),
+                ))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                    "%Y-%m-%dT%H:%M:%S".to_string(),
+                ))),
+            ],
+            arg_fields: vec![Arc::clone(&expr_field), Arc::clone(&format_field)],
+            number_rows: 1,
+            return_field: Arc::clone(&return_field),
+            config_options: Arc::new(options.clone()),
+        };
+
+        validate_expected_error(&mut options.clone(), args, expected_err);
+
+        Ok(())
+    }
+
     #[test]
     fn to_timestamp_invalid_input_type() -> Result<()> {
         // pass the wrong type of input array to to_timestamp and test
@@ -788,7 +1423,7 @@ mod tests {
     }
 
     #[test]
-    fn to_timestamp_with_unparseable_data() -> Result<()> {
+    fn to_timestamp_with_unparsable_data() -> Result<()> {
         let mut date_string_builder = StringBuilder::with_capacity(2, 1024);
 
         date_string_builder.append_null();
@@ -798,8 +1433,7 @@ mod tests {
         let string_array =
             ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef);
 
-        let expected_err =
-            "Arrow error: Parser error: Error parsing timestamp from '2020-09-08 - 13:42:29.19085Z': error parsing time";
+        let expected_err = "Arrow error: Parser error: Error parsing timestamp from '2020-09-08 - 13:42:29.19085Z': error parsing time";
         match to_timestamp(&[string_array]) {
             Ok(_) => panic!("Expected error but got success"),
             Err(e) => {
@@ -823,8 +1457,7 @@ mod tests {
         let string_array =
             ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef);
 
-        let expected_err =
-            "Arrow error: Parser error: Invalid timezone \"ZZ\": failed to parse timezone";
+        let expected_err = "Arrow error: Parser error: Invalid timezone \"ZZ\": failed to parse timezone";
         match to_timestamp(&[string_array]) {
             Ok(_) => panic!("Expected error but got success"),
             Err(e) => {
@@ -861,8 +1494,7 @@ mod tests {
             ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef),
         ];
 
-        let expected_err =
-            "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters";
+        let expected_err = "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters";
         match to_timestamp(&string_array) {
             Ok(_) => panic!("Expected error but got success"),
             Err(e) => {
@@ -910,7 +1542,11 @@ mod tests {
     }
 
     fn parse_timestamp_formatted(s: &str, format: &str) -> Result<i64, DataFusionError> {
-        let result = string_to_timestamp_nanos_formatted(s, format);
+        let result = string_to_timestamp_nanos_formatted_with_timezone(
+            &Some("UTC".parse()?),
+            s,
+            format,
+        );
         if let Err(e) = &result {
             eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}");
         }
@@ -937,10 +1573,12 @@ mod tests {
         ];
 
         for (s, f, ctx) in cases {
-            let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}");
+            let expected = format!(
+                "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"
+            );
             let actual = string_to_datetime_formatted(&Utc, s, f)
                 .unwrap_err()
-                .to_string();
+                .strip_backtrace();
             assert_eq!(actual, expected)
         }
     }
@@ -965,22 +1603,32 @@ mod tests {
         ];
 
         for (s, f, ctx) in cases {
-            let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}");
+            let expected = format!(
+                "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"
+            );
             let actual = string_to_datetime_formatted(&Utc, s, f)
                 .unwrap_err()
-                .to_string();
+                .strip_backtrace();
             assert_eq!(actual, expected)
         }
     }
 
     #[test]
-    fn test_tz() {
+    fn test_no_tz() {
         let udfs: Vec<Box<dyn ScalarUDFImpl>> = vec![
-            Box::new(ToTimestampFunc::new()),
-            Box::new(ToTimestampSecondsFunc::new()),
-            Box::new(ToTimestampMillisFunc::new()),
-            Box::new(ToTimestampNanosFunc::new()),
-            Box::new(ToTimestampSecondsFunc::new()),
+            Box::new(ToTimestampFunc::new_with_config(&ConfigOptions::default())),
+            Box::new(ToTimestampSecondsFunc::new_with_config(
+                &ConfigOptions::default(),
+            )),
+            Box::new(ToTimestampMillisFunc::new_with_config(
+                &ConfigOptions::default(),
+            )),
+            Box::new(ToTimestampNanosFunc::new_with_config(
+                &ConfigOptions::default(),
+            )),
+            Box::new(ToTimestampSecondsFunc::new_with_config(
+                &ConfigOptions::default(),
+            )),
         ];
 
         let mut nanos_builder = TimestampNanosecondArray::builder(2);
@@ -1013,12 +1661,13 @@ mod tests {
             for array in arrays {
                 let rt = udf.return_type(&[array.data_type()]).unwrap();
                 let arg_field = Field::new("arg", array.data_type().clone(), true).into();
-                assert!(matches!(rt, Timestamp(_, Some(_))));
-                let args = datafusion_expr::ScalarFunctionArgs {
+                assert!(matches!(rt, Timestamp(_, None)));
+                let args = ScalarFunctionArgs {
                     args: vec![array.clone()],
                     arg_fields: vec![arg_field],
                     number_rows: 4,
                     return_field: Field::new("f", rt, true).into(),
+                    config_options: Arc::new(ConfigOptions::default()),
                 };
                 let res = udf
                     .invoke_with_args(args)
@@ -1028,7 +1677,7 @@ mod tests {
                     _ => panic!("Expected a columnar array"),
                 };
                 let ty = array.data_type();
-                assert!(matches!(ty, Timestamp(_, Some(_))));
+                assert!(matches!(ty, Timestamp(_, None)));
             }
         }
 
@@ -1063,11 +1712,12 @@ mod tests {
                 let rt = udf.return_type(&[array.data_type()]).unwrap();
                 assert!(matches!(rt, Timestamp(_, None)));
                 let arg_field = Field::new("arg", array.data_type().clone(), true).into();
-                let args = datafusion_expr::ScalarFunctionArgs {
+                let args = ScalarFunctionArgs {
                     args: vec![array.clone()],
                     arg_fields: vec![arg_field],
                     number_rows: 5,
                     return_field: Field::new("f", rt, true).into(),
+                    config_options: Arc::new(ConfigOptions::default()),
                 };
                 let res = udf
                     .invoke_with_args(args)
@@ -1199,4 +1849,23 @@ mod tests {
             assert_contains!(actual, expected);
         }
     }
+
+    #[test]
+    fn test_decimal_to_nanoseconds_negative_scale() {
+        // scale -2: internal value 5 represents 5 * 10^2 = 500 seconds
+        let nanos = decimal_to_nanoseconds(5, -2);
+        assert_eq!(nanos, 500_000_000_000); // 500 seconds in nanoseconds
+
+        // scale -1: internal value 10 represents 10 * 10^1 = 100 seconds
+        let nanos = decimal_to_nanoseconds(10, -1);
+        assert_eq!(nanos, 100_000_000_000);
+
+        // scale 0: internal value 5 represents 5 seconds
+        let nanos = decimal_to_nanoseconds(5, 0);
+        assert_eq!(nanos, 5_000_000_000);
+
+        // scale 3: internal value 1500 represents 1.5 seconds
+        let nanos = decimal_to_nanoseconds(1500, 3);
+        assert_eq!(nanos, 1_500_000_000);
+    }
 }
diff --git a/datafusion/functions/src/datetime/to_unixtime.rs b/datafusion/functions/src/datetime/to_unixtime.rs
index 653ec10851695..ea8103865eff7 100644
--- a/datafusion/functions/src/datetime/to_unixtime.rs
+++ b/datafusion/functions/src/datetime/to_unixtime.rs
@@ -18,16 +18,22 @@
 use super::to_timestamp::ToTimestampSecondsFunc;
 use crate::datetime::common::*;
 use arrow::datatypes::{DataType, TimeUnit};
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to seconds since the unix epoch (`1970-01-01T00:00:00Z`). Supports strings, dates, timestamps and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.",
+    description = r#"
+Converts a value to seconds since the unix epoch (`1970-01-01T00:00:00`).
+Supports strings, dates, timestamps, integer, unsigned integer, and float types as input.
+Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Integers, unsigned integers, and floats are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00`)."#,
     syntax_example = "to_unixtime(expression[, ..., format_n])",
     sql_example = r#"
 ```sql
@@ -54,7 +60,7 @@ use std::any::Any;
         description = "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToUnixtimeFunc {
     signature: Signature,
 }
@@ -90,10 +96,7 @@ impl ScalarUDFImpl for ToUnixtimeFunc {
         Ok(DataType::Int64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let arg_args = &args.args;
         if arg_args.is_empty() {
             return exec_err!("to_unixtime function requires 1 or more arguments, got 0");
@@ -101,24 +104,46 @@ impl ScalarUDFImpl for ToUnixtimeFunc {
 
         // validate that any args after the first one are Utf8
         if arg_args.len() > 1 {
-            validate_data_types(arg_args, "to_unixtime")?;
+            // Format arguments only make sense for string inputs
+            match arg_args[0].data_type() {
+                DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
+                    validate_data_types(arg_args, "to_unixtime")?;
+                }
+                _ => {
+                    return exec_err!(
+                        "to_unixtime function only accepts format arguments with string input, got {} arguments",
+                        arg_args.len()
+                    );
+                }
+            }
         }
 
         match arg_args[0].data_type() {
-            DataType::Int32 | DataType::Int64 | DataType::Null | DataType::Float64 => {
-                arg_args[0].cast_to(&DataType::Int64, None)
-            }
+            DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float16
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Null => arg_args[0].cast_to(&DataType::Int64, None),
             DataType::Date64 | DataType::Date32 => arg_args[0]
                 .cast_to(&DataType::Timestamp(TimeUnit::Second, None), None)?
                 .cast_to(&DataType::Int64, None),
             DataType::Timestamp(_, tz) => arg_args[0]
                 .cast_to(&DataType::Timestamp(TimeUnit::Second, tz), None)?
                 .cast_to(&DataType::Int64, None),
-            DataType::Utf8 => ToTimestampSecondsFunc::new()
-                .invoke_with_args(args)?
-                .cast_to(&DataType::Int64, None),
+            DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
+                ToTimestampSecondsFunc::new_with_config(args.config_options.as_ref())
+                    .invoke_with_args(args)?
+                    .cast_to(&DataType::Int64, None)
+            }
             other => {
-                exec_err!("Unsupported data type {:?} for function to_unixtime", other)
+                exec_err!("Unsupported data type {} for function to_unixtime", other)
             }
         }
     }
diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs
index 9a7b49105743e..4ad67b78178f2 100644
--- a/datafusion/functions/src/encoding/inner.rs
+++ b/datafusion/functions/src/encoding/inner.rs
@@ -19,26 +19,44 @@
 
 use arrow::{
     array::{
-        Array, ArrayRef, BinaryArray, GenericByteArray, OffsetSizeTrait, StringArray,
+        Array, ArrayRef, AsArray, BinaryArrayType, GenericBinaryArray,
+        GenericStringArray, OffsetSizeTrait,
     },
-    datatypes::{ByteArrayType, DataType},
+    datatypes::DataType,
 };
 use arrow_buffer::{Buffer, OffsetBufferBuilder};
-use base64::{engine::general_purpose, Engine as _};
+use base64::{
+    Engine as _,
+    engine::{DecodePaddingMode, GeneralPurpose, GeneralPurposeConfig},
+};
 use datafusion_common::{
-    cast::{as_generic_binary_array, as_generic_string_array},
+    DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err,
     not_impl_err, plan_err,
+    types::{NativeType, logical_string},
     utils::take_function_args,
 };
-use datafusion_common::{exec_err, ScalarValue};
-use datafusion_common::{DataFusionError, Result};
-use datafusion_expr::{ColumnarValue, Documentation};
-use std::sync::Arc;
-use std::{fmt, str::FromStr};
-
-use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
 use datafusion_macros::user_doc;
 use std::any::Any;
+use std::fmt;
+use std::sync::Arc;
+
+// Allow padding characters, but don't require them, and don't generate them.
+const BASE64_ENGINE: GeneralPurpose = GeneralPurpose::new(
+    &base64::alphabet::STANDARD,
+    GeneralPurposeConfig::new()
+        .with_encode_padding(false)
+        .with_decode_padding_mode(DecodePaddingMode::Indifferent),
+);
+
+// Generate padding characters when encoding
+const BASE64_ENGINE_PADDED: GeneralPurpose = GeneralPurpose::new(
+    &base64::alphabet::STANDARD,
+    GeneralPurposeConfig::new().with_encode_padding(true),
+);
 
 #[user_doc(
     doc_section(label = "Binary String Functions"),
@@ -50,11 +68,11 @@ use std::any::Any;
     ),
     argument(
         name = "format",
-        description = "Supported formats are: `base64`, `hex`"
+        description = "Supported formats are: `base64`, `base64pad`, `hex`"
     ),
     related_udf(name = "decode")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct EncodeFunc {
     signature: Signature,
 }
@@ -68,7 +86,17 @@ impl Default for EncodeFunc {
 impl EncodeFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Binary,
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Binary,
+                    ),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -77,6 +105,7 @@ impl ScalarUDFImpl for EncodeFunc {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn name(&self) -> &str {
         "encode"
     }
@@ -86,48 +115,21 @@ impl ScalarUDFImpl for EncodeFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        use DataType::*;
-
-        Ok(match arg_types[0] {
-            Utf8 => Utf8,
-            LargeUtf8 => LargeUtf8,
-            Utf8View => Utf8,
-            Binary => Utf8,
-            LargeBinary => LargeUtf8,
-            Null => Null,
-            _ => {
-                return plan_err!(
-                    "The encode function can only accept Utf8 or Binary or Null."
-                );
-            }
-        })
-    }
-
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        encode(&args.args)
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        let [expression, format] = take_function_args(self.name(), arg_types)?;
-
-        if format != &DataType::Utf8 {
-            return Err(DataFusionError::Plan("2nd argument should be Utf8".into()));
+        match &arg_types[0] {
+            DataType::LargeBinary => Ok(DataType::LargeUtf8),
+            _ => Ok(DataType::Utf8),
         }
+    }
 
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [expression, encoding] = take_function_args("encode", &args.args)?;
+        let encoding = Encoding::try_from(encoding)?;
         match expression {
-            DataType::Utf8 | DataType::Utf8View | DataType::Null => {
-                Ok(vec![DataType::Utf8; 2])
+            _ if expression.data_type().is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
             }
-            DataType::LargeUtf8 => Ok(vec![DataType::LargeUtf8, DataType::Utf8]),
-            DataType::Binary => Ok(vec![DataType::Binary, DataType::Utf8]),
-            DataType::LargeBinary => Ok(vec![DataType::LargeBinary, DataType::Utf8]),
-            _ => plan_err!(
-                "1st argument should be Utf8 or Binary or Null, got {:?}",
-                arg_types[0]
-            ),
+            ColumnarValue::Array(array) => encode_array(array, encoding),
+            ColumnarValue::Scalar(scalar) => encode_scalar(scalar, encoding),
         }
     }
 
@@ -147,7 +149,7 @@ impl ScalarUDFImpl for EncodeFunc {
     argument(name = "format", description = "Same arguments as [encode](#encode)"),
     related_udf(name = "encode")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct DecodeFunc {
     signature: Signature,
 }
@@ -161,7 +163,17 @@ impl Default for DecodeFunc {
 impl DecodeFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Binary,
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Binary,
+                    ),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -170,6 +182,7 @@ impl ScalarUDFImpl for DecodeFunc {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn name(&self) -> &str {
         "decode"
     }
@@ -179,40 +192,21 @@ impl ScalarUDFImpl for DecodeFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].to_owned())
-    }
-
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        decode(&args.args)
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() != 2 {
-            return plan_err!(
-                "{} expects to get 2 arguments, but got {}",
-                self.name(),
-                arg_types.len()
-            );
-        }
-
-        if arg_types[1] != DataType::Utf8 {
-            return plan_err!("2nd argument should be Utf8");
+        match &arg_types[0] {
+            DataType::LargeBinary => Ok(DataType::LargeBinary),
+            _ => Ok(DataType::Binary),
         }
+    }
 
-        match arg_types[0] {
-            DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
-                Ok(vec![DataType::Binary, DataType::Utf8])
-            }
-            DataType::LargeUtf8 | DataType::LargeBinary => {
-                Ok(vec![DataType::LargeBinary, DataType::Utf8])
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [expression, encoding] = take_function_args("decode", &args.args)?;
+        let encoding = Encoding::try_from(encoding)?;
+        match expression {
+            _ if expression.data_type().is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Binary(None)))
             }
-            _ => plan_err!(
-                "1st argument should be Utf8 or Binary or Null, got {:?}",
-                arg_types[0]
-            ),
+            ColumnarValue::Array(array) => decode_array(array, encoding),
+            ColumnarValue::Scalar(scalar) => decode_scalar(scalar, encoding),
         }
     }
 
@@ -221,345 +215,317 @@ impl ScalarUDFImpl for DecodeFunc {
     }
 }
 
-#[derive(Debug, Copy, Clone)]
-enum Encoding {
-    Base64,
-    Hex,
-}
-
-fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarValue> {
+fn encode_scalar(value: &ScalarValue, encoding: Encoding) -> Result<ColumnarValue> {
     match value {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => encoding.encode_utf8_array::<i32>(a.as_ref()),
-            DataType::LargeUtf8 => encoding.encode_utf8_array::<i64>(a.as_ref()),
-            DataType::Utf8View => encoding.encode_utf8_array::<i32>(a.as_ref()),
-            DataType::Binary => encoding.encode_binary_array::<i32>(a.as_ref()),
-            DataType::LargeBinary => encoding.encode_binary_array::<i64>(a.as_ref()),
-            other => exec_err!(
-                "Unsupported data type {other:?} for function encode({encoding})"
-            ),
-        },
-        ColumnarValue::Scalar(scalar) => {
-            match scalar {
-                ScalarValue::Utf8(a) => {
-                    Ok(encoding.encode_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
-                }
-                ScalarValue::LargeUtf8(a) => Ok(encoding
-                    .encode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes()))),
-                ScalarValue::Utf8View(a) => {
-                    Ok(encoding.encode_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
-                }
-                ScalarValue::Binary(a) => Ok(
-                    encoding.encode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
-                ),
-                ScalarValue::LargeBinary(a) => Ok(encoding
-                    .encode_large_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))),
-                other => exec_err!(
-                    "Unsupported data type {other:?} for function encode({encoding})"
-                ),
-            }
+        ScalarValue::Binary(maybe_bytes)
+        | ScalarValue::BinaryView(maybe_bytes)
+        | ScalarValue::FixedSizeBinary(_, maybe_bytes) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
+                maybe_bytes
+                    .as_ref()
+                    .map(|bytes| encoding.encode_bytes(bytes)),
+            )))
+        }
+        ScalarValue::LargeBinary(maybe_bytes) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(
+                maybe_bytes
+                    .as_ref()
+                    .map(|bytes| encoding.encode_bytes(bytes)),
+            )))
         }
+        v => internal_err!("Unexpected value for encode: {v}"),
     }
 }
 
-fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarValue> {
-    match value {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => encoding.decode_utf8_array::<i32>(a.as_ref()),
-            DataType::LargeUtf8 => encoding.decode_utf8_array::<i64>(a.as_ref()),
-            DataType::Utf8View => encoding.decode_utf8_array::<i32>(a.as_ref()),
-            DataType::Binary => encoding.decode_binary_array::<i32>(a.as_ref()),
-            DataType::LargeBinary => encoding.decode_binary_array::<i64>(a.as_ref()),
-            other => exec_err!(
-                "Unsupported data type {other:?} for function decode({encoding})"
-            ),
-        },
-        ColumnarValue::Scalar(scalar) => {
-            match scalar {
-                ScalarValue::Utf8(a) => {
-                    encoding.decode_scalar(a.as_ref().map(|s: &String| s.as_bytes()))
-                }
-                ScalarValue::LargeUtf8(a) => encoding
-                    .decode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes())),
-                ScalarValue::Utf8View(a) => {
-                    encoding.decode_scalar(a.as_ref().map(|s: &String| s.as_bytes()))
-                }
-                ScalarValue::Binary(a) => {
-                    encoding.decode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
-                }
-                ScalarValue::LargeBinary(a) => encoding
-                    .decode_large_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice())),
-                other => exec_err!(
-                    "Unsupported data type {other:?} for function decode({encoding})"
-                ),
-            }
+fn encode_array(array: &ArrayRef, encoding: Encoding) -> Result<ColumnarValue> {
+    let array = match array.data_type() {
+        DataType::Binary => encoding.encode_array::<_, i32>(&array.as_binary::<i32>()),
+        DataType::BinaryView => encoding.encode_array::<_, i32>(&array.as_binary_view()),
+        DataType::LargeBinary => {
+            encoding.encode_array::<_, i64>(&array.as_binary::<i64>())
         }
-    }
+        DataType::FixedSizeBinary(_) => {
+            encoding.encode_array::<_, i32>(&array.as_fixed_size_binary())
+        }
+        dt => {
+            internal_err!("Unexpected data type for encode: {dt}")
+        }
+    };
+    array.map(ColumnarValue::Array)
 }
 
-fn hex_encode(input: &[u8]) -> String {
-    hex::encode(input)
+fn decode_scalar(value: &ScalarValue, encoding: Encoding) -> Result<ColumnarValue> {
+    match value {
+        ScalarValue::Binary(maybe_bytes)
+        | ScalarValue::BinaryView(maybe_bytes)
+        | ScalarValue::FixedSizeBinary(_, maybe_bytes) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Binary(
+                maybe_bytes
+                    .as_ref()
+                    .map(|x| encoding.decode_bytes(x))
+                    .transpose()?,
+            )))
+        }
+        ScalarValue::LargeBinary(maybe_bytes) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::LargeBinary(
+                maybe_bytes
+                    .as_ref()
+                    .map(|x| encoding.decode_bytes(x))
+                    .transpose()?,
+            )))
+        }
+        v => internal_err!("Unexpected value for decode: {v}"),
+    }
 }
 
-fn base64_encode(input: &[u8]) -> String {
-    general_purpose::STANDARD_NO_PAD.encode(input)
+/// Estimate how many bytes are actually represented by the array; in case the
+/// the array slices it's internal buffer, this returns the byte size of that slice
+/// but not the byte size of the entire buffer.
+///
+/// This is an estimation only as it can estimate higher if null slots are non-zero
+/// sized.
+fn estimate_byte_data_size<O: OffsetSizeTrait>(array: &GenericBinaryArray<O>) -> usize {
+    let offsets = array.value_offsets();
+    // Unwraps are safe as should always have 1 element in offset buffer
+    let start = *offsets.first().unwrap();
+    let end = *offsets.last().unwrap();
+    let data_size = end - start;
+    data_size.as_usize()
 }
 
-fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
-    // only write input / 2 bytes to buf
-    let out_len = input.len() / 2;
-    let buf = &mut buf[..out_len];
-    hex::decode_to_slice(input, buf).map_err(|e| {
-        DataFusionError::Internal(format!("Failed to decode from hex: {e}"))
-    })?;
-    Ok(out_len)
+fn decode_array(array: &ArrayRef, encoding: Encoding) -> Result<ColumnarValue> {
+    let array = match array.data_type() {
+        DataType::Binary => {
+            let array = array.as_binary::<i32>();
+            encoding.decode_array::<_, i32>(&array, estimate_byte_data_size(array))
+        }
+        DataType::BinaryView => {
+            let array = array.as_binary_view();
+            // Don't know if there is a more strict upper bound we can infer
+            // for view arrays byte data size.
+            encoding.decode_array::<_, i32>(&array, array.get_buffer_memory_size())
+        }
+        DataType::LargeBinary => {
+            let array = array.as_binary::<i64>();
+            encoding.decode_array::<_, i64>(&array, estimate_byte_data_size(array))
+        }
+        DataType::FixedSizeBinary(size) => {
+            let array = array.as_fixed_size_binary();
+            // TODO: could we be more conservative by accounting for nulls?
+            let estimate = array.len().saturating_mul(*size as usize);
+            encoding.decode_array::<_, i32>(&array, estimate)
+        }
+        dt => {
+            internal_err!("Unexpected data type for decode: {dt}")
+        }
+    };
+    array.map(ColumnarValue::Array)
 }
 
-fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
-    general_purpose::STANDARD_NO_PAD
-        .decode_slice(input, buf)
-        .map_err(|e| {
-            DataFusionError::Internal(format!("Failed to decode from base64: {e}"))
-        })
+#[derive(Debug, Copy, Clone)]
+enum Encoding {
+    Base64,
+    Base64Padded,
+    Hex,
 }
 
-macro_rules! encode_to_array {
-    ($METHOD: ident, $INPUT:expr) => {{
-        let utf8_array: StringArray = $INPUT
-            .iter()
-            .map(|x| x.map(|x| $METHOD(x.as_ref())))
-            .collect();
-        Arc::new(utf8_array)
-    }};
+impl fmt::Display for Encoding {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let name = match self {
+            Self::Base64 => "base64",
+            Self::Base64Padded => "base64pad",
+            Self::Hex => "hex",
+        };
+        write!(f, "{name}")
+    }
 }
 
-fn decode_to_array<F, T: ByteArrayType>(
-    method: F,
-    input: &GenericByteArray<T>,
-    conservative_upper_bound_size: usize,
-) -> Result<ArrayRef>
-where
-    F: Fn(&[u8], &mut [u8]) -> Result<usize>,
-{
-    let mut values = vec![0; conservative_upper_bound_size];
-    let mut offsets = OffsetBufferBuilder::new(input.len());
-    let mut total_bytes_decoded = 0;
-    for v in input {
-        if let Some(v) = v {
-            let cursor = &mut values[total_bytes_decoded..];
-            let decoded = method(v.as_ref(), cursor)?;
-            total_bytes_decoded += decoded;
-            offsets.push_length(decoded);
-        } else {
-            offsets.push_length(0);
+impl TryFrom<&ColumnarValue> for Encoding {
+    type Error = DataFusionError;
+
+    fn try_from(encoding: &ColumnarValue) -> Result<Self> {
+        let encoding = match encoding {
+            ColumnarValue::Scalar(encoding) => match encoding.try_as_str().flatten() {
+                Some(encoding) => encoding,
+                _ => return exec_err!("Encoding must be a non-null string"),
+            },
+            ColumnarValue::Array(_) => {
+                return not_impl_err!(
+                    "Encoding must be a scalar; array specified encoding is not yet supported"
+                );
+            }
+        };
+        match encoding {
+            "base64" => Ok(Self::Base64),
+            "base64pad" => Ok(Self::Base64Padded),
+            "hex" => Ok(Self::Hex),
+            _ => {
+                let options = [Self::Base64, Self::Base64Padded, Self::Hex]
+                    .iter()
+                    .map(|i| i.to_string())
+                    .collect::<Vec<_>>()
+                    .join(", ");
+                plan_err!(
+                    "There is no built-in encoding named '{encoding}', currently supported encodings are: {options}"
+                )
+            }
         }
     }
-    // We reserved an upper bound size for the values buffer, but we only use the actual size
-    values.truncate(total_bytes_decoded);
-    let binary_array = BinaryArray::try_new(
-        offsets.finish(),
-        Buffer::from_vec(values),
-        input.nulls().cloned(),
-    )?;
-    Ok(Arc::new(binary_array))
 }
 
 impl Encoding {
-    fn encode_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
-        ColumnarValue::Scalar(match self {
-            Self::Base64 => ScalarValue::Utf8(
-                value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)),
-            ),
-            Self::Hex => ScalarValue::Utf8(value.map(hex::encode)),
-        })
-    }
-
-    fn encode_large_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
-        ColumnarValue::Scalar(match self {
-            Self::Base64 => ScalarValue::LargeUtf8(
-                value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)),
-            ),
-            Self::Hex => ScalarValue::LargeUtf8(value.map(hex::encode)),
-        })
+    fn encode_bytes(self, value: &[u8]) -> String {
+        match self {
+            Self::Base64 => BASE64_ENGINE.encode(value),
+            Self::Base64Padded => BASE64_ENGINE_PADDED.encode(value),
+            Self::Hex => hex::encode(value),
+        }
     }
 
-    fn encode_binary_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
-    where
-        T: OffsetSizeTrait,
-    {
-        let input_value = as_generic_binary_array::<T>(value)?;
-        let array: ArrayRef = match self {
-            Self::Base64 => encode_to_array!(base64_encode, input_value),
-            Self::Hex => encode_to_array!(hex_encode, input_value),
-        };
-        Ok(ColumnarValue::Array(array))
+    fn decode_bytes(self, value: &[u8]) -> Result<Vec<u8>> {
+        match self {
+            Self::Base64 | Self::Base64Padded => {
+                BASE64_ENGINE.decode(value).map_err(|e| {
+                    exec_datafusion_err!("Failed to decode value using {self}: {e}")
+                })
+            }
+            Self::Hex => hex::decode(value).map_err(|e| {
+                exec_datafusion_err!("Failed to decode value using hex: {e}")
+            }),
+        }
     }
 
-    fn encode_utf8_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
+    // OutputOffset important to ensure Large types output Large arrays
+    fn encode_array<'a, InputBinaryArray, OutputOffset>(
+        self,
+        array: &InputBinaryArray,
+    ) -> Result<ArrayRef>
     where
-        T: OffsetSizeTrait,
+        InputBinaryArray: BinaryArrayType<'a>,
+        OutputOffset: OffsetSizeTrait,
     {
-        let input_value = as_generic_string_array::<T>(value)?;
-        let array: ArrayRef = match self {
-            Self::Base64 => encode_to_array!(base64_encode, input_value),
-            Self::Hex => encode_to_array!(hex_encode, input_value),
-        };
-        Ok(ColumnarValue::Array(array))
-    }
-
-    fn decode_scalar(self, value: Option<&[u8]>) -> Result<ColumnarValue> {
-        let value = match value {
-            Some(value) => value,
-            None => return Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))),
-        };
-
-        let out = match self {
+        match self {
             Self::Base64 => {
-                general_purpose::STANDARD_NO_PAD
-                    .decode(value)
-                    .map_err(|e| {
-                        DataFusionError::Internal(format!(
-                            "Failed to decode value using base64: {e}"
-                        ))
-                    })?
+                let array: GenericStringArray<OutputOffset> = array
+                    .iter()
+                    .map(|x| x.map(|x| BASE64_ENGINE.encode(x)))
+                    .collect();
+                Ok(Arc::new(array))
             }
-            Self::Hex => hex::decode(value).map_err(|e| {
-                DataFusionError::Internal(format!(
-                    "Failed to decode value using hex: {e}"
-                ))
-            })?,
-        };
-
-        Ok(ColumnarValue::Scalar(ScalarValue::Binary(Some(out))))
-    }
-
-    fn decode_large_scalar(self, value: Option<&[u8]>) -> Result<ColumnarValue> {
-        let value = match value {
-            Some(value) => value,
-            None => return Ok(ColumnarValue::Scalar(ScalarValue::LargeBinary(None))),
-        };
-
-        let out = match self {
-            Self::Base64 => {
-                general_purpose::STANDARD_NO_PAD
-                    .decode(value)
-                    .map_err(|e| {
-                        DataFusionError::Internal(format!(
-                            "Failed to decode value using base64: {e}"
-                        ))
-                    })?
+            Self::Base64Padded => {
+                let array: GenericStringArray<OutputOffset> = array
+                    .iter()
+                    .map(|x| x.map(|x| BASE64_ENGINE_PADDED.encode(x)))
+                    .collect();
+                Ok(Arc::new(array))
             }
-            Self::Hex => hex::decode(value).map_err(|e| {
-                DataFusionError::Internal(format!(
-                    "Failed to decode value using hex: {e}"
-                ))
-            })?,
-        };
-
-        Ok(ColumnarValue::Scalar(ScalarValue::LargeBinary(Some(out))))
+            Self::Hex => {
+                let array: GenericStringArray<OutputOffset> =
+                    array.iter().map(|x| x.map(hex::encode)).collect();
+                Ok(Arc::new(array))
+            }
+        }
     }
 
-    fn decode_binary_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
+    // OutputOffset important to ensure Large types output Large arrays
+    fn decode_array<'a, InputBinaryArray, OutputOffset>(
+        self,
+        value: &InputBinaryArray,
+        approx_data_size: usize,
+    ) -> Result<ArrayRef>
     where
-        T: OffsetSizeTrait,
+        InputBinaryArray: BinaryArrayType<'a>,
+        OutputOffset: OffsetSizeTrait,
     {
-        let input_value = as_generic_binary_array::<T>(value)?;
-        let array = self.decode_byte_array(input_value)?;
-        Ok(ColumnarValue::Array(array))
-    }
+        fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
+            // only write input / 2 bytes to buf
+            let out_len = input.len() / 2;
+            let buf = &mut buf[..out_len];
+            hex::decode_to_slice(input, buf)
+                .map_err(|e| exec_datafusion_err!("Failed to decode from hex: {e}"))?;
+            Ok(out_len)
+        }
 
-    fn decode_utf8_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
-    where
-        T: OffsetSizeTrait,
-    {
-        let input_value = as_generic_string_array::<T>(value)?;
-        let array = self.decode_byte_array(input_value)?;
-        Ok(ColumnarValue::Array(array))
-    }
+        fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
+            BASE64_ENGINE
+                .decode_slice(input, buf)
+                .map_err(|e| exec_datafusion_err!("Failed to decode from base64: {e}"))
+        }
 
-    fn decode_byte_array<T: ByteArrayType>(
-        &self,
-        input_value: &GenericByteArray<T>,
-    ) -> Result<ArrayRef> {
         match self {
-            Self::Base64 => {
-                let upper_bound =
-                    base64::decoded_len_estimate(input_value.values().len());
-                decode_to_array(base64_decode, input_value, upper_bound)
+            Self::Base64 | Self::Base64Padded => {
+                let upper_bound = base64::decoded_len_estimate(approx_data_size);
+                delegated_decode::<_, _, OutputOffset>(base64_decode, value, upper_bound)
             }
             Self::Hex => {
                 // Calculate the upper bound for decoded byte size
                 // For hex encoding, each pair of hex characters (2 bytes) represents 1 byte when decoded
                 // So the upper bound is half the length of the input values.
-                let upper_bound = input_value.values().len() / 2;
-                decode_to_array(hex_decode, input_value, upper_bound)
+                let upper_bound = approx_data_size / 2;
+                delegated_decode::<_, _, OutputOffset>(hex_decode, value, upper_bound)
             }
         }
     }
 }
 
-impl fmt::Display for Encoding {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "{}", format!("{self:?}").to_lowercase())
+fn delegated_decode<'a, DecodeFunction, InputBinaryArray, OutputOffset>(
+    decode: DecodeFunction,
+    input: &InputBinaryArray,
+    conservative_upper_bound_size: usize,
+) -> Result<ArrayRef>
+where
+    DecodeFunction: Fn(&[u8], &mut [u8]) -> Result<usize>,
+    InputBinaryArray: BinaryArrayType<'a>,
+    OutputOffset: OffsetSizeTrait,
+{
+    let mut values = vec![0; conservative_upper_bound_size];
+    let mut offsets = OffsetBufferBuilder::new(input.len());
+    let mut total_bytes_decoded = 0;
+    for v in input.iter() {
+        if let Some(v) = v {
+            let cursor = &mut values[total_bytes_decoded..];
+            let decoded = decode(v, cursor)?;
+            total_bytes_decoded += decoded;
+            offsets.push_length(decoded);
+        } else {
+            offsets.push_length(0);
+        }
     }
+    // We reserved an upper bound size for the values buffer, but we only use the actual size
+    values.truncate(total_bytes_decoded);
+    let binary_array = GenericBinaryArray::<OutputOffset>::try_new(
+        offsets.finish(),
+        Buffer::from_vec(values),
+        input.nulls().cloned(),
+    )?;
+    Ok(Arc::new(binary_array))
 }
 
-impl FromStr for Encoding {
-    type Err = DataFusionError;
-    fn from_str(name: &str) -> Result<Encoding> {
-        Ok(match name {
-            "base64" => Self::Base64,
-            "hex" => Self::Hex,
-            _ => {
-                let options = [Self::Base64, Self::Hex]
-                    .iter()
-                    .map(|i| i.to_string())
-                    .collect::<Vec<_>>()
-                    .join(", ");
-                return plan_err!(
-                    "There is no built-in encoding named '{name}', currently supported encodings are: {options}"
-                );
-            }
-        })
+#[cfg(test)]
+mod tests {
+    use arrow::array::BinaryArray;
+    use arrow_buffer::OffsetBuffer;
+
+    use super::*;
+
+    #[test]
+    fn test_estimate_byte_data_size() {
+        // Offsets starting at 0, but don't count entire data buffer size
+        let array = BinaryArray::new(
+            OffsetBuffer::new(vec![0, 5, 10, 15].into()),
+            vec![0; 100].into(),
+            None,
+        );
+        let size = estimate_byte_data_size(&array);
+        assert_eq!(size, 15);
+
+        // Offsets starting at 0, but don't count entire data buffer size
+        let array = BinaryArray::new(
+            OffsetBuffer::new(vec![50, 51, 51, 60, 80, 81].into()),
+            vec![0; 100].into(),
+            Some(vec![true, false, false, true, true].into()),
+        );
+        let size = estimate_byte_data_size(&array);
+        assert_eq!(size, 31);
     }
 }
-
-/// Encodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
-/// Second argument is the encoding to use.
-/// Standard encodings are base64 and hex.
-fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let [expression, format] = take_function_args("encode", args)?;
-
-    let encoding = match format {
-        ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
-            Some(Some(method)) => method.parse::<Encoding>(),
-            _ => not_impl_err!(
-                "Second argument to encode must be non null constant string: Encode using dynamically decided method is not yet supported. Got {scalar:?}"
-            ),
-        },
-        ColumnarValue::Array(_) => not_impl_err!(
-            "Second argument to encode must be a constant: Encode using dynamically decided method is not yet supported"
-        ),
-    }?;
-    encode_process(expression, encoding)
-}
-
-/// Decodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
-/// Second argument is the encoding to use.
-/// Standard encodings are base64 and hex.
-fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    let [expression, format] = take_function_args("decode", args)?;
-
-    let encoding = match format {
-        ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
-            Some(Some(method))=> method.parse::<Encoding>(),
-            _ => not_impl_err!(
-                "Second argument to decode must be a non null constant string: Decode using dynamically decided method is not yet supported. Got {scalar:?}"
-            ),
-        },
-        ColumnarValue::Array(_) => not_impl_err!(
-            "Second argument to decode must be a utf8 constant: Decode using dynamically decided method is not yet supported"
-        ),
-    }?;
-    decode_process(expression, encoding)
-}
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index 51cd5df8060d1..b9ce113efa627 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Function packages for [DataFusion].
 //!
@@ -191,6 +192,13 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
     Ok(())
 }
 
+#[cfg(test)]
+#[ctor::ctor]
+fn init() {
+    // Enable RUST_LOG logging configuration for test
+    let _ = env_logger::try_init();
+}
+
 #[cfg(test)]
 mod tests {
     use crate::all_default_functions;
diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs
index 30ebf8654ea0b..380877b593643 100644
--- a/datafusion/functions/src/macros.rs
+++ b/datafusion/functions/src/macros.rs
@@ -40,15 +40,54 @@
 /// Exported functions accept:
 /// - `Vec<Expr>` argument (single argument followed by a comma)
 /// - Variable number of `Expr` arguments (zero or more arguments, must be without commas)
+/// - Functions that require config (marked with `@config` prefix)
+///
+/// Note on configuration construction paths:
+/// - The convenience wrappers generated for `@config` functions call the inner
+///   constructor with `ConfigOptions::default()`. These wrappers are intended
+///   primarily for programmatic `Expr` construction and convenience usage.
+/// - When functions are registered in a session, DataFusion will call
+///   `with_updated_config()` to create a `ScalarUDF` instance using the session's
+///   actual `ConfigOptions`. This also happens when configuration changes at runtime
+///   (e.g., via `SET` statements). In short: the macro uses the default config for
+///   convenience constructors; the session config is applied when functions are
+///   registered or when configuration is updated.
 #[macro_export]
 macro_rules! export_functions {
     ($(($FUNC:ident, $DOC:expr, $($arg:tt)*)),*) => {
         $(
             // switch to single-function cases below
-            export_functions!(single $FUNC, $DOC, $($arg)*);
+            $crate::export_functions!(single $FUNC, $DOC, $($arg)*);
         )*
     };
 
+    // function that requires config (marked with @config)
+    (single $FUNC:ident, $DOC:expr, @config) => {
+        #[doc = $DOC]
+        pub fn $FUNC() -> datafusion_expr::Expr {
+            use datafusion_common::config::ConfigOptions;
+            super::$FUNC(&ConfigOptions::default()).call(vec![])
+        }
+    };
+
+    // function that requires config and takes a vector argument
+    (single $FUNC:ident, $DOC:expr, @config $arg:ident,) => {
+        #[doc = $DOC]
+        pub fn $FUNC($arg: Vec<datafusion_expr::Expr>) -> datafusion_expr::Expr {
+            use datafusion_common::config::ConfigOptions;
+            super::$FUNC(&ConfigOptions::default()).call($arg)
+        }
+    };
+
+    // function that requires config and variadic arguments
+    (single $FUNC:ident, $DOC:expr, @config $($arg:ident)*) => {
+        #[doc = $DOC]
+        pub fn $FUNC($($arg: datafusion_expr::Expr),*) -> datafusion_expr::Expr {
+            use datafusion_common::config::ConfigOptions;
+            super::$FUNC(&ConfigOptions::default()).call(vec![$($arg),*])
+        }
+    };
+
     // single vector argument (a single argument followed by a comma)
     (single $FUNC:ident, $DOC:expr, $arg:ident,) => {
         #[doc = $DOC]
@@ -67,12 +106,13 @@ macro_rules! export_functions {
 }
 
 /// Creates a singleton `ScalarUDF` of the `$UDF` function and a function
-/// named `$NAME` which returns that singleton.
+/// named `$NAME` which returns that singleton. Optionally use a custom constructor
+/// `$CTOR` which defaults to `$UDF::new()` if not specified.
 ///
 /// This is used to ensure creating the list of `ScalarUDF` only happens once.
 #[macro_export]
 macro_rules! make_udf_function {
-    ($UDF:ty, $NAME:ident) => {
+    ($UDF:ty, $NAME:ident, $CTOR:expr) => {
         #[doc = concat!("Return a [`ScalarUDF`](datafusion_expr::ScalarUDF) implementation of ", stringify!($NAME))]
         pub fn $NAME() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
             // Singleton instance of the function
@@ -80,12 +120,30 @@ macro_rules! make_udf_function {
                 std::sync::Arc<datafusion_expr::ScalarUDF>,
             > = std::sync::LazyLock::new(|| {
                 std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
-                    <$UDF>::new(),
+                    ($CTOR)(),
                 ))
             });
             std::sync::Arc::clone(&INSTANCE)
         }
     };
+    ($UDF:ty, $NAME:ident) => {
+        make_udf_function!($UDF, $NAME, <$UDF>::new);
+    };
+}
+
+/// Creates a singleton `ScalarUDF` of the `$UDF` function and a function
+/// named `$NAME` which returns that singleton. The function takes a
+/// configuration argument of type `$CONFIG_TYPE` to create the UDF.
+#[macro_export]
+macro_rules! make_udf_function_with_config {
+    ($UDF:ty, $NAME:ident) => {
+        #[doc = concat!("Return a [`ScalarUDF`](datafusion_expr::ScalarUDF) implementation of ", stringify!($NAME))]
+        pub fn $NAME(config: &datafusion_common::config::ConfigOptions) -> std::sync::Arc<datafusion_expr::ScalarUDF> {
+            std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
+                <$UDF>::new_with_config(&config),
+            ))
+        }
+    };
 }
 
 /// Macro creates a sub module if the feature is not enabled
@@ -121,7 +179,7 @@ macro_rules! make_stub_package {
 macro_rules! downcast_named_arg {
     ($ARG:expr, $NAME:expr, $ARRAY_TYPE:ident) => {{
         $ARG.as_any().downcast_ref::<$ARRAY_TYPE>().ok_or_else(|| {
-            internal_datafusion_err!(
+            datafusion_common::internal_datafusion_err!(
                 "could not cast {} to {}",
                 $NAME,
                 std::any::type_name::<$ARRAY_TYPE>()
@@ -137,9 +195,7 @@ macro_rules! downcast_named_arg {
 /// $ARRAY_TYPE: the type of array to cast the argument to
 #[macro_export]
 macro_rules! downcast_arg {
-    ($ARG:expr, $ARRAY_TYPE:ident) => {{
-        downcast_named_arg!($ARG, "", $ARRAY_TYPE)
-    }};
+    ($ARG:expr, $ARRAY_TYPE:ident) => {{ $crate::downcast_named_arg!($ARG, "", $ARRAY_TYPE) }};
 }
 
 /// Macro to create a unary math UDF.
@@ -154,7 +210,7 @@ macro_rules! downcast_arg {
 /// $GET_DOC: the function to get the documentation of the UDF
 macro_rules! make_math_unary_udf {
     ($UDF:ident, $NAME:ident, $UNARY_FUNC:ident, $OUTPUT_ORDERING:expr, $EVALUATE_BOUNDS:expr, $GET_DOC:expr) => {
-        make_udf_function!($NAME::$UDF, $NAME);
+        $crate::make_udf_function!($NAME::$UDF, $NAME);
 
         mod $NAME {
             use std::any::Any;
@@ -162,7 +218,7 @@ macro_rules! make_math_unary_udf {
 
             use arrow::array::{ArrayRef, AsArray};
             use arrow::datatypes::{DataType, Float32Type, Float64Type};
-            use datafusion_common::{exec_err, Result};
+            use datafusion_common::{Result, exec_err};
             use datafusion_expr::interval_arithmetic::Interval;
             use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
             use datafusion_expr::{
@@ -170,7 +226,7 @@ macro_rules! make_math_unary_udf {
                 Signature, Volatility,
             };
 
-            #[derive(Debug)]
+            #[derive(Debug, PartialEq, Eq, Hash)]
             pub struct $UDF {
                 signature: Signature,
             }
@@ -241,7 +297,7 @@ macro_rules! make_math_unary_udf {
                             return exec_err!(
                                 "Unsupported data type {other:?} for function {}",
                                 self.name()
-                            )
+                            );
                         }
                     };
 
@@ -268,7 +324,7 @@ macro_rules! make_math_unary_udf {
 /// $GET_DOC: the function to get the documentation of the UDF
 macro_rules! make_math_binary_udf {
     ($UDF:ident, $NAME:ident, $BINARY_FUNC:ident, $OUTPUT_ORDERING:expr, $GET_DOC:expr) => {
-        make_udf_function!($NAME::$UDF, $NAME);
+        $crate::make_udf_function!($NAME::$UDF, $NAME);
 
         mod $NAME {
             use std::any::Any;
@@ -276,15 +332,16 @@ macro_rules! make_math_binary_udf {
 
             use arrow::array::{ArrayRef, AsArray};
             use arrow::datatypes::{DataType, Float32Type, Float64Type};
-            use datafusion_common::{exec_err, Result};
-            use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+            use datafusion_common::utils::take_function_args;
+            use datafusion_common::{Result, ScalarValue, internal_err};
             use datafusion_expr::TypeSignature;
+            use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
             use datafusion_expr::{
                 ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl,
                 Signature, Volatility,
             };
 
-            #[derive(Debug)]
+            #[derive(Debug, PartialEq, Eq, Hash)]
             pub struct $UDF {
                 signature: Signature,
             }
@@ -337,37 +394,76 @@ macro_rules! make_math_binary_udf {
                     &self,
                     args: ScalarFunctionArgs,
                 ) -> Result<ColumnarValue> {
-                    let args = ColumnarValue::values_to_arrays(&args.args)?;
-                    let arr: ArrayRef = match args[0].data_type() {
-                        DataType::Float64 => {
-                            let y = args[0].as_primitive::<Float64Type>();
-                            let x = args[1].as_primitive::<Float64Type>();
-                            let result = arrow::compute::binary::<_, _, _, Float64Type>(
-                                y,
-                                x,
-                                |y, x| f64::$BINARY_FUNC(y, x),
-                            )?;
-                            Arc::new(result) as _
+                    let ScalarFunctionArgs {
+                        args, return_field, ..
+                    } = args;
+                    let return_type = return_field.data_type();
+                    let [y, x] = take_function_args(self.name(), args)?;
+
+                    match (y, x) {
+                        (
+                            ColumnarValue::Scalar(y_scalar),
+                            ColumnarValue::Scalar(x_scalar),
+                        ) => match (&y_scalar, &x_scalar) {
+                            (y, x) if y.is_null() || x.is_null() => {
+                                ColumnarValue::Scalar(ScalarValue::Null)
+                                    .cast_to(return_type, None)
+                            }
+                            (
+                                ScalarValue::Float64(Some(yv)),
+                                ScalarValue::Float64(Some(xv)),
+                            ) => Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(
+                                f64::$BINARY_FUNC(*yv, *xv),
+                            )))),
+                            (
+                                ScalarValue::Float32(Some(yv)),
+                                ScalarValue::Float32(Some(xv)),
+                            ) => Ok(ColumnarValue::Scalar(ScalarValue::Float32(Some(
+                                f32::$BINARY_FUNC(*yv, *xv),
+                            )))),
+                            _ => internal_err!(
+                                "Unexpected scalar types for function {}: {:?}, {:?}",
+                                self.name(),
+                                y_scalar.data_type(),
+                                x_scalar.data_type()
+                            ),
+                        },
+                        (y, x) => {
+                            let args = ColumnarValue::values_to_arrays(&[y, x])?;
+                            let arr: ArrayRef = match args[0].data_type() {
+                                DataType::Float64 => {
+                                    let y = args[0].as_primitive::<Float64Type>();
+                                    let x = args[1].as_primitive::<Float64Type>();
+                                    let result =
+                                        arrow::compute::binary::<_, _, _, Float64Type>(
+                                            y,
+                                            x,
+                                            |y, x| f64::$BINARY_FUNC(y, x),
+                                        )?;
+                                    Arc::new(result) as _
+                                }
+                                DataType::Float32 => {
+                                    let y = args[0].as_primitive::<Float32Type>();
+                                    let x = args[1].as_primitive::<Float32Type>();
+                                    let result =
+                                        arrow::compute::binary::<_, _, _, Float32Type>(
+                                            y,
+                                            x,
+                                            |y, x| f32::$BINARY_FUNC(y, x),
+                                        )?;
+                                    Arc::new(result) as _
+                                }
+                                other => {
+                                    return internal_err!(
+                                        "Unsupported data type {other:?} for function {}",
+                                        self.name()
+                                    );
+                                }
+                            };
+
+                            Ok(ColumnarValue::Array(arr))
                         }
-                        DataType::Float32 => {
-                            let y = args[0].as_primitive::<Float32Type>();
-                            let x = args[1].as_primitive::<Float32Type>();
-                            let result = arrow::compute::binary::<_, _, _, Float32Type>(
-                                y,
-                                x,
-                                |y, x| f32::$BINARY_FUNC(y, x),
-                            )?;
-                            Arc::new(result) as _
-                        }
-                        other => {
-                            return exec_err!(
-                                "Unsupported data type {other:?} for function {}",
-                                self.name()
-                            )
-                        }
-                    };
-
-                    Ok(ColumnarValue::Array(arr))
+                    }
                 }
 
                 fn documentation(&self) -> Option<&Documentation> {
diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs
index 0c686a59016ac..1b5aaf7745a84 100644
--- a/datafusion/functions/src/math/abs.rs
+++ b/datafusion/functions/src/math/abs.rs
@@ -21,14 +21,13 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    ArrayRef, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array,
+    ArrayRef, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+    Float16Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
+    Int64Array,
 };
 use arrow::datatypes::DataType;
 use arrow::error::ArrowError;
-use datafusion_common::{
-    internal_datafusion_err, not_impl_err, utils::take_function_args, Result,
-};
+use datafusion_common::{Result, not_impl_err, utils::take_function_args};
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
@@ -36,9 +35,11 @@ use datafusion_expr::{
     Volatility,
 };
 use datafusion_macros::user_doc;
+use num_traits::sign::Signed;
 
 type MathArrayFunction = fn(&ArrayRef) -> Result<ArrayRef>;
 
+#[macro_export]
 macro_rules! make_abs_function {
     ($ARRAY_TYPE:ident) => {{
         |input: &ArrayRef| {
@@ -49,6 +50,7 @@ macro_rules! make_abs_function {
     }};
 }
 
+#[macro_export]
 macro_rules! make_try_abs_function {
     ($ARRAY_TYPE:ident) => {{
         |input: &ArrayRef| {
@@ -61,13 +63,15 @@ macro_rules! make_try_abs_function {
                         x
                     ))
                 })
-            })?;
+            })
+            .and_then(|v| Ok(v.with_data_type(input.data_type().clone())))?; // maintain decimal's precision and scale
             Ok(Arc::new(res) as ArrayRef)
         }
     }};
 }
 
-macro_rules! make_decimal_abs_function {
+#[macro_export]
+macro_rules! make_wrapping_abs_function {
     ($ARRAY_TYPE:ident) => {{
         |input: &ArrayRef| {
             let array = downcast_named_arg!(&input, "abs arg", $ARRAY_TYPE);
@@ -83,6 +87,7 @@ macro_rules! make_decimal_abs_function {
 /// Return different implementations based on input datatype to reduce branches during execution
 fn create_abs_function(input_data_type: &DataType) -> Result<MathArrayFunction> {
     match input_data_type {
+        DataType::Float16 => Ok(make_abs_function!(Float16Array)),
         DataType::Float32 => Ok(make_abs_function!(Float32Array)),
         DataType::Float64 => Ok(make_abs_function!(Float64Array)),
 
@@ -100,8 +105,10 @@ fn create_abs_function(input_data_type: &DataType) -> Result<MathArrayFunction>
         | DataType::UInt64 => Ok(|input: &ArrayRef| Ok(Arc::clone(input))),
 
         // Decimal types
-        DataType::Decimal128(_, _) => Ok(make_decimal_abs_function!(Decimal128Array)),
-        DataType::Decimal256(_, _) => Ok(make_decimal_abs_function!(Decimal256Array)),
+        DataType::Decimal32(_, _) => Ok(make_wrapping_abs_function!(Decimal32Array)),
+        DataType::Decimal64(_, _) => Ok(make_wrapping_abs_function!(Decimal64Array)),
+        DataType::Decimal128(_, _) => Ok(make_wrapping_abs_function!(Decimal128Array)),
+        DataType::Decimal256(_, _) => Ok(make_wrapping_abs_function!(Decimal256Array)),
 
         other => not_impl_err!("Unsupported data type {other:?} for function abs"),
     }
@@ -110,9 +117,17 @@ fn create_abs_function(input_data_type: &DataType) -> Result<MathArrayFunction>
     doc_section(label = "Math Functions"),
     description = "Returns the absolute value of a number.",
     syntax_example = "abs(numeric_expression)",
+    sql_example = r#"```sql
+> SELECT abs(-5);
++----------+
+| abs(-5)  |
++----------+
+| 5        |
++----------+
+```"#,
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct AbsFunc {
     signature: Signature,
 }
@@ -135,6 +150,7 @@ impl ScalarUDFImpl for AbsFunc {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn name(&self) -> &str {
         "abs"
     }
@@ -144,29 +160,7 @@ impl ScalarUDFImpl for AbsFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match arg_types[0] {
-            DataType::Float32 => Ok(DataType::Float32),
-            DataType::Float64 => Ok(DataType::Float64),
-            DataType::Int8 => Ok(DataType::Int8),
-            DataType::Int16 => Ok(DataType::Int16),
-            DataType::Int32 => Ok(DataType::Int32),
-            DataType::Int64 => Ok(DataType::Int64),
-            DataType::Null => Ok(DataType::Null),
-            DataType::UInt8 => Ok(DataType::UInt8),
-            DataType::UInt16 => Ok(DataType::UInt16),
-            DataType::UInt32 => Ok(DataType::UInt32),
-            DataType::UInt64 => Ok(DataType::UInt64),
-            DataType::Decimal128(precision, scale) => {
-                Ok(DataType::Decimal128(precision, scale))
-            }
-            DataType::Decimal256(precision, scale) => {
-                Ok(DataType::Decimal256(precision, scale))
-            }
-            _ => not_impl_err!(
-                "Unsupported data type {} for function abs",
-                arg_types[0].to_string()
-            ),
-        }
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
@@ -185,9 +179,9 @@ impl ScalarUDFImpl for AbsFunc {
         let range = &arg.range;
         let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-        if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+        if range.gt_eq(&zero_point)? == Interval::TRUE {
             Ok(arg.sort_properties)
-        } else if range.lt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+        } else if range.lt_eq(&zero_point)? == Interval::TRUE {
             Ok(-arg.sort_properties)
         } else {
             Ok(SortProperties::Unordered)
diff --git a/datafusion/functions/src/math/ceil.rs b/datafusion/functions/src/math/ceil.rs
new file mode 100644
index 0000000000000..5961b3cb27fed
--- /dev/null
+++ b/datafusion/functions/src/math/ceil.rs
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, Float32Type,
+    Float64Type,
+};
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::interval_arithmetic::Interval;
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_macros::user_doc;
+
+use super::decimal::{apply_decimal_op, ceil_decimal_value};
+
+#[user_doc(
+    doc_section(label = "Math Functions"),
+    description = "Returns the nearest integer greater than or equal to a number.",
+    syntax_example = "ceil(numeric_expression)",
+    standard_argument(name = "numeric_expression", prefix = "Numeric"),
+    sql_example = r#"```sql
+> SELECT ceil(3.14);
++------------+
+| ceil(3.14) |
++------------+
+| 4.0        |
++------------+
+```"#
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct CeilFunc {
+    signature: Signature,
+}
+
+impl Default for CeilFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl CeilFunc {
+    pub fn new() -> Self {
+        let decimal_sig = Coercion::new_exact(TypeSignatureClass::Decimal);
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![decimal_sig]),
+                    TypeSignature::Uniform(1, vec![DataType::Float64, DataType::Float32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for CeilFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "ceil"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match &arg_types[0] {
+            DataType::Null => Ok(DataType::Float64),
+            other => Ok(other.clone()),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let arg = &args.args[0];
+
+        // Scalar fast path for float types - avoid array conversion overhead entirely
+        if let ColumnarValue::Scalar(scalar) = arg {
+            match scalar {
+                ScalarValue::Float64(v) => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float64(
+                        v.map(f64::ceil),
+                    )));
+                }
+                ScalarValue::Float32(v) => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float32(
+                        v.map(f32::ceil),
+                    )));
+                }
+                ScalarValue::Null => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float64(None)));
+                }
+                // For decimals: convert to array of size 1, process, then extract scalar
+                // This ensures we don't expand the array while reusing overflow validation
+                _ => {}
+            }
+        }
+
+        // Track if input was a scalar to convert back at the end
+        let is_scalar = matches!(arg, ColumnarValue::Scalar(_));
+
+        // Array path (also handles decimal scalars converted to size-1 arrays)
+        let value = arg.to_array(args.number_rows)?;
+
+        let result: ArrayRef = match value.data_type() {
+            DataType::Float64 => Arc::new(
+                value
+                    .as_primitive::<Float64Type>()
+                    .unary::<_, Float64Type>(f64::ceil),
+            ),
+            DataType::Float32 => Arc::new(
+                value
+                    .as_primitive::<Float32Type>()
+                    .unary::<_, Float32Type>(f32::ceil),
+            ),
+            DataType::Null => {
+                return Ok(ColumnarValue::Scalar(ScalarValue::Float64(None)));
+            }
+            DataType::Decimal32(precision, scale) => {
+                apply_decimal_op::<Decimal32Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    ceil_decimal_value,
+                )?
+            }
+            DataType::Decimal64(precision, scale) => {
+                apply_decimal_op::<Decimal64Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    ceil_decimal_value,
+                )?
+            }
+            DataType::Decimal128(precision, scale) => {
+                apply_decimal_op::<Decimal128Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    ceil_decimal_value,
+                )?
+            }
+            DataType::Decimal256(precision, scale) => {
+                apply_decimal_op::<Decimal256Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    ceil_decimal_value,
+                )?
+            }
+            other => {
+                return exec_err!(
+                    "Unsupported data type {other:?} for function {}",
+                    self.name()
+                );
+            }
+        };
+
+        // If input was a scalar, convert result back to scalar
+        if is_scalar {
+            ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar)
+        } else {
+            Ok(ColumnarValue::Array(result))
+        }
+    }
+
+    fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
+        Ok(input[0].sort_properties)
+    }
+
+    fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
+        let data_type = inputs[0].data_type();
+        Interval::make_unbounded(&data_type)
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
diff --git a/datafusion/functions/src/math/cot.rs b/datafusion/functions/src/math/cot.rs
index 4e56212ddbee8..1f67ef713833f 100644
--- a/datafusion/functions/src/math/cot.rs
+++ b/datafusion/functions/src/math/cot.rs
@@ -18,12 +18,12 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, AsArray};
+use arrow::array::AsArray;
 use arrow::datatypes::DataType::{Float32, Float64};
 use arrow::datatypes::{DataType, Float32Type, Float64Type};
 
-use crate::utils::make_scalar_function;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
@@ -32,9 +32,17 @@ use datafusion_macros::user_doc;
     doc_section(label = "Math Functions"),
     description = "Returns the cotangent of a number.",
     syntax_example = r#"cot(numeric_expression)"#,
+    sql_example = r#"```sql
+> SELECT cot(1);
++---------+
+| cot(1)  |
++---------+
+| 0.64209 |
++---------+
+```"#,
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CotFunc {
     signature: Signature,
 }
@@ -88,24 +96,47 @@ impl ScalarUDFImpl for CotFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(cot, vec![])(&args.args)
-    }
-}
+        let return_field = args.return_field;
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return ColumnarValue::Scalar(ScalarValue::Null)
+                        .cast_to(return_field.data_type(), None);
+                }
 
-///cot SQL function
-fn cot(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        Float64 => Ok(Arc::new(
-            args[0]
-                .as_primitive::<Float64Type>()
-                .unary::<_, Float64Type>(|x: f64| compute_cot64(x)),
-        ) as ArrayRef),
-        Float32 => Ok(Arc::new(
-            args[0]
-                .as_primitive::<Float32Type>()
-                .unary::<_, Float32Type>(|x: f32| compute_cot32(x)),
-        ) as ArrayRef),
-        other => exec_err!("Unsupported data type {other:?} for function cot"),
+                match scalar {
+                    ScalarValue::Float64(Some(v)) => Ok(ColumnarValue::Scalar(
+                        ScalarValue::Float64(Some(compute_cot64(v))),
+                    )),
+                    ScalarValue::Float32(Some(v)) => Ok(ColumnarValue::Scalar(
+                        ScalarValue::Float32(Some(compute_cot32(v))),
+                    )),
+                    _ => {
+                        internal_err!(
+                            "Unexpected scalar type for cot: {:?}",
+                            scalar.data_type()
+                        )
+                    }
+                }
+            }
+            ColumnarValue::Array(array) => match array.data_type() {
+                Float64 => Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .as_primitive::<Float64Type>()
+                        .unary::<_, Float64Type>(compute_cot64),
+                ))),
+                Float32 => Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .as_primitive::<Float32Type>()
+                        .unary::<_, Float32Type>(compute_cot32),
+                ))),
+                other => {
+                    internal_err!("Unexpected data type {other:?} for function cot")
+                }
+            },
+        }
     }
 }
 
@@ -121,54 +152,212 @@ fn compute_cot64(x: f64) -> f64 {
 
 #[cfg(test)]
 mod test {
-    use crate::math::cot::cot;
+    use std::sync::Arc;
+
     use arrow::array::{ArrayRef, Float32Array, Float64Array};
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::ScalarValue;
     use datafusion_common::cast::{as_float32_array, as_float64_array};
-    use std::sync::Arc;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+
+    use crate::math::cot::CotFunc;
 
     #[test]
     fn test_cot_f32() {
-        let args: Vec<ArrayRef> =
-            vec![Arc::new(Float32Array::from(vec![12.1, 30.0, 90.0, -30.0]))];
-        let result = cot(&args).expect("failed to initialize function cot");
-        let floats =
-            as_float32_array(&result).expect("failed to initialize function cot");
-
-        let expected = Float32Array::from(vec![
-            -1.986_460_4,
-            -0.156_119_96,
-            -0.501_202_8,
-            0.156_119_96,
-        ]);
-
-        let eps = 1e-6;
-        assert_eq!(floats.len(), 4);
-        assert!((floats.value(0) - expected.value(0)).abs() < eps);
-        assert!((floats.value(1) - expected.value(1)).abs() < eps);
-        assert!((floats.value(2) - expected.value(2)).abs() < eps);
-        assert!((floats.value(3) - expected.value(3)).abs() < eps);
+        let array = Arc::new(Float32Array::from(vec![12.1, 30.0, 90.0, -30.0]));
+        let arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Array(Arc::clone(&array) as ArrayRef)],
+            arg_fields,
+            number_rows: array.len(),
+            return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function cot");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float32_array(&arr)
+                    .expect("failed to convert result to a Float32Array");
+
+                let expected = Float32Array::from(vec![
+                    -1.986_460_4,
+                    -0.156_119_96,
+                    -0.501_202_8,
+                    0.156_119_96,
+                ]);
+
+                let eps = 1e-6;
+                assert_eq!(floats.len(), 4);
+                assert!((floats.value(0) - expected.value(0)).abs() < eps);
+                assert!((floats.value(1) - expected.value(1)).abs() < eps);
+                assert!((floats.value(2) - expected.value(2)).abs() < eps);
+                assert!((floats.value(3) - expected.value(3)).abs() < eps);
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
     }
 
     #[test]
     fn test_cot_f64() {
-        let args: Vec<ArrayRef> =
-            vec![Arc::new(Float64Array::from(vec![12.1, 30.0, 90.0, -30.0]))];
-        let result = cot(&args).expect("failed to initialize function cot");
-        let floats =
-            as_float64_array(&result).expect("failed to initialize function cot");
-
-        let expected = Float64Array::from(vec![
-            -1.986_458_685_881_4,
-            -0.156_119_952_161_6,
-            -0.501_202_783_380_1,
-            0.156_119_952_161_6,
-        ]);
-
-        let eps = 1e-12;
-        assert_eq!(floats.len(), 4);
-        assert!((floats.value(0) - expected.value(0)).abs() < eps);
-        assert!((floats.value(1) - expected.value(1)).abs() < eps);
-        assert!((floats.value(2) - expected.value(2)).abs() < eps);
-        assert!((floats.value(3) - expected.value(3)).abs() < eps);
+        let array = Arc::new(Float64Array::from(vec![12.1, 30.0, 90.0, -30.0]));
+        let arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Array(Arc::clone(&array) as ArrayRef)],
+            arg_fields,
+            number_rows: array.len(),
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function cot");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+
+                let expected = Float64Array::from(vec![
+                    -1.986_458_685_881_4,
+                    -0.156_119_952_161_6,
+                    -0.501_202_783_380_1,
+                    0.156_119_952_161_6,
+                ]);
+
+                let eps = 1e-12;
+                assert_eq!(floats.len(), 4);
+                assert!((floats.value(0) - expected.value(0)).abs() < eps);
+                assert!((floats.value(1) - expected.value(1)).abs() < eps);
+                assert!((floats.value(2) - expected.value(2)).abs() < eps);
+                assert!((floats.value(3) - expected.value(3)).abs() < eps);
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_cot_scalar_f64() {
+        let arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(1.0)))],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, false).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("cot scalar should succeed");
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => {
+                // cot(1.0) = 1/tan(1.0) ≈ 0.6420926159343306
+                let expected = 1.0_f64 / 1.0_f64.tan();
+                assert!((v - expected).abs() < 1e-12);
+            }
+            _ => panic!("Expected Float64 scalar"),
+        }
+    }
+
+    #[test]
+    fn test_cot_scalar_f32() {
+        let arg_fields = vec![Field::new("a", DataType::Float32, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Float32(Some(1.0)))],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float32, false).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("cot scalar should succeed");
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Float32(Some(v))) => {
+                let expected = 1.0_f32 / 1.0_f32.tan();
+                assert!((v - expected).abs() < 1e-6);
+            }
+            _ => panic!("Expected Float32 scalar"),
+        }
+    }
+
+    #[test]
+    fn test_cot_scalar_null() {
+        let arg_fields = vec![Field::new("a", DataType::Float64, true).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Float64(None))],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("cot null should succeed");
+
+        match result {
+            ColumnarValue::Scalar(scalar) => {
+                assert!(scalar.is_null());
+            }
+            _ => panic!("Expected scalar result"),
+        }
+    }
+
+    #[test]
+    fn test_cot_scalar_zero() {
+        let arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(0.0)))],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, false).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("cot zero should succeed");
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => {
+                // cot(0) = 1/tan(0) = infinity
+                assert!(v.is_infinite());
+            }
+            _ => panic!("Expected Float64 scalar"),
+        }
+    }
+
+    #[test]
+    fn test_cot_scalar_pi() {
+        let arg_fields = vec![Field::new("a", DataType::Float64, false).into()];
+        let args = ScalarFunctionArgs {
+            args: vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(
+                std::f64::consts::PI,
+            )))],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, false).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = CotFunc::new()
+            .invoke_with_args(args)
+            .expect("cot pi should succeed");
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => {
+                // cot(PI) = 1/tan(PI) - very large negative number due to floating point
+                let expected = 1.0_f64 / std::f64::consts::PI.tan();
+                assert!((v - expected).abs() < 1e-6);
+            }
+            _ => panic!("Expected Float64 scalar"),
+        }
     }
 }
diff --git a/datafusion/functions/src/math/decimal.rs b/datafusion/functions/src/math/decimal.rs
new file mode 100644
index 0000000000000..abaded4568a93
--- /dev/null
+++ b/datafusion/functions/src/math/decimal.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray, PrimitiveArray};
+use arrow::datatypes::{ArrowNativeTypeOp, DecimalType};
+use arrow::error::ArrowError;
+use arrow_buffer::ArrowNativeType;
+use datafusion_common::{DataFusionError, Result};
+
+pub(super) fn apply_decimal_op<T, F>(
+    array: &ArrayRef,
+    precision: u8,
+    scale: i8,
+    fn_name: &str,
+    op: F,
+) -> Result<ArrayRef>
+where
+    T: DecimalType,
+    T::Native: ArrowNativeType + ArrowNativeTypeOp,
+    F: Fn(T::Native, T::Native) -> T::Native,
+{
+    if scale <= 0 {
+        return Ok(Arc::clone(array));
+    }
+
+    let factor = decimal_scale_factor::<T>(scale, fn_name)?;
+    let decimal = array.as_primitive::<T>();
+    let data_type = array.data_type().clone();
+
+    let result: PrimitiveArray<T> = decimal.try_unary(|value| {
+        let new_value = op(value, factor);
+        T::validate_decimal_precision(new_value, precision, scale).map_err(|_| {
+            ArrowError::ComputeError(format!("Decimal overflow while applying {fn_name}"))
+        })?;
+        Ok::<_, ArrowError>(new_value)
+    })?;
+
+    let result = result.with_data_type(data_type);
+
+    Ok(Arc::new(result))
+}
+
+fn decimal_scale_factor<T>(scale: i8, fn_name: &str) -> Result<T::Native>
+where
+    T: DecimalType,
+    T::Native: ArrowNativeType + ArrowNativeTypeOp,
+{
+    let base = <T::Native as ArrowNativeType>::from_usize(10).ok_or_else(|| {
+        DataFusionError::Execution(format!(
+            "Cannot get 10_{} from usize: {:?}",
+            std::any::type_name::<T::Native>(),
+            10_usize
+        ))
+    })?;
+
+    base.pow_checked(scale as u32).map_err(|_| {
+        DataFusionError::Execution(format!("Decimal overflow while applying {fn_name}"))
+    })
+}
+
+pub(super) fn ceil_decimal_value<T>(value: T, factor: T) -> T
+where
+    T: ArrowNativeTypeOp + std::ops::Rem<Output = T>,
+{
+    let remainder = value % factor;
+
+    if remainder == T::ZERO {
+        return value;
+    }
+
+    if value >= T::ZERO {
+        let increment = factor.sub_wrapping(remainder);
+        value.add_wrapping(increment)
+    } else {
+        value.sub_wrapping(remainder)
+    }
+}
+
+pub(super) fn floor_decimal_value<T>(value: T, factor: T) -> T
+where
+    T: ArrowNativeTypeOp + std::ops::Rem<Output = T>,
+{
+    let remainder = value % factor;
+
+    if remainder == T::ZERO {
+        return value;
+    }
+
+    if value >= T::ZERO {
+        value.sub_wrapping(remainder)
+    } else {
+        let adjustment = factor.add_wrapping(remainder);
+        value.sub_wrapping(adjustment)
+    }
+}
diff --git a/datafusion/functions/src/math/factorial.rs b/datafusion/functions/src/math/factorial.rs
index c2ac21b78f212..c1dd802140c04 100644
--- a/datafusion/functions/src/math/factorial.rs
+++ b/datafusion/functions/src/math/factorial.rs
@@ -15,19 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::{
-    array::{ArrayRef, Int64Array},
-    error::ArrowError,
-};
+use arrow::array::{ArrayRef, AsArray, Int64Array};
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Int64;
+use arrow::datatypes::{DataType, Int64Type};
 
-use crate::utils::make_scalar_function;
 use datafusion_common::{
-    arrow_datafusion_err, exec_err, internal_datafusion_err, DataFusionError, Result,
+    Result, ScalarValue, exec_err, internal_err, utils::take_function_args,
 };
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -39,9 +35,17 @@ use datafusion_macros::user_doc;
     doc_section(label = "Math Functions"),
     description = "Factorial. Returns 1 if value is less than 2.",
     syntax_example = "factorial(numeric_expression)",
+    sql_example = r#"```sql
+> SELECT factorial(5);
++---------------+
+| factorial(5)  |
++---------------+
+| 120           |
++---------------+
+```"#,
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct FactorialFunc {
     signature: Signature,
 }
@@ -78,7 +82,39 @@ impl ScalarUDFImpl for FactorialFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(factorial, vec![])(&args.args)
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Int64(None)));
+                }
+
+                match scalar {
+                    ScalarValue::Int64(Some(v)) => {
+                        let result = compute_factorial(v)?;
+                        Ok(ColumnarValue::Scalar(ScalarValue::Int64(Some(result))))
+                    }
+                    _ => {
+                        internal_err!(
+                            "Unexpected data type {:?} for function factorial",
+                            scalar.data_type()
+                        )
+                    }
+                }
+            }
+            ColumnarValue::Array(array) => match array.data_type() {
+                Int64 => {
+                    let result: Int64Array = array
+                        .as_primitive::<Int64Type>()
+                        .try_unary(compute_factorial)?;
+                    Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
+                }
+                other => {
+                    internal_err!("Unexpected data type {other:?} for function factorial")
+                }
+            },
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -86,50 +122,36 @@ impl ScalarUDFImpl for FactorialFunc {
     }
 }
 
-/// Factorial SQL function
-fn factorial(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        Int64 => {
-            let arg = downcast_named_arg!((&args[0]), "value", Int64Array);
-            Ok(arg
-                .iter()
-                .map(|a| match a {
-                    Some(a) => (2..=a)
-                        .try_fold(1i64, i64::checked_mul)
-                        .ok_or_else(|| {
-                            arrow_datafusion_err!(ArrowError::ComputeError(format!(
-                                "Overflow happened on FACTORIAL({a})"
-                            )))
-                        })
-                        .map(Some),
-                    _ => Ok(None),
-                })
-                .collect::<Result<Int64Array>>()
-                .map(Arc::new)? as ArrayRef)
-        }
-        other => exec_err!("Unsupported data type {other:?} for function factorial."),
-    }
-}
-
-#[cfg(test)]
-mod test {
-
-    use datafusion_common::cast::as_int64_array;
-
-    use super::*;
-
-    #[test]
-    fn test_factorial_i64() {
-        let args: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![0, 1, 2, 4])), // input
-        ];
-
-        let result = factorial(&args).expect("failed to initialize function factorial");
-        let ints =
-            as_int64_array(&result).expect("failed to initialize function factorial");
-
-        let expected = Int64Array::from(vec![1, 1, 2, 24]);
-
-        assert_eq!(ints, &expected);
+const FACTORIALS: [i64; 21] = [
+    1,
+    1,
+    2,
+    6,
+    24,
+    120,
+    720,
+    5040,
+    40320,
+    362880,
+    3628800,
+    39916800,
+    479001600,
+    6227020800,
+    87178291200,
+    1307674368000,
+    20922789888000,
+    355687428096000,
+    6402373705728000,
+    121645100408832000,
+    2432902008176640000,
+]; // if return type changes, this constant needs to be updated accordingly
+
+fn compute_factorial(n: i64) -> Result<i64> {
+    if n < 0 {
+        Ok(1)
+    } else if n < FACTORIALS.len() as i64 {
+        Ok(FACTORIALS[n as usize])
+    } else {
+        exec_err!("Overflow happened on FACTORIAL({n})")
     }
 }
diff --git a/datafusion/functions/src/math/floor.rs b/datafusion/functions/src/math/floor.rs
new file mode 100644
index 0000000000000..d4f25716ff7ee
--- /dev/null
+++ b/datafusion/functions/src/math/floor.rs
@@ -0,0 +1,689 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray};
+use arrow::compute::{DecimalCast, rescale_decimal};
+use arrow::datatypes::{
+    ArrowNativeTypeOp, DataType, Decimal32Type, Decimal64Type, Decimal128Type,
+    Decimal256Type, DecimalType, Float32Type, Float64Type,
+};
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::interval_arithmetic::Interval;
+use datafusion_expr::preimage::PreimageResult;
+use datafusion_expr::simplify::SimplifyContext;
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_macros::user_doc;
+use num_traits::{CheckedAdd, Float, One};
+
+use super::decimal::{apply_decimal_op, floor_decimal_value};
+
+#[user_doc(
+    doc_section(label = "Math Functions"),
+    description = "Returns the nearest integer less than or equal to a number.",
+    syntax_example = "floor(numeric_expression)",
+    standard_argument(name = "numeric_expression", prefix = "Numeric"),
+    sql_example = r#"```sql
+> SELECT floor(3.14);
++-------------+
+| floor(3.14) |
++-------------+
+| 3.0         |
++-------------+
+```"#
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct FloorFunc {
+    signature: Signature,
+}
+
+impl Default for FloorFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl FloorFunc {
+    pub fn new() -> Self {
+        let decimal_sig = Coercion::new_exact(TypeSignatureClass::Decimal);
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![decimal_sig]),
+                    TypeSignature::Uniform(1, vec![DataType::Float64, DataType::Float32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+// ============ Macro for preimage bounds ============
+/// Generates the code to call the appropriate bounds function and wrap results.
+macro_rules! preimage_bounds {
+    // Float types: call float_preimage_bounds and wrap in ScalarValue
+    (float: $variant:ident, $value:expr) => {
+        float_preimage_bounds($value).map(|(lo, hi)| {
+            (
+                ScalarValue::$variant(Some(lo)),
+                ScalarValue::$variant(Some(hi)),
+            )
+        })
+    };
+
+    // Integer types: call int_preimage_bounds and wrap in ScalarValue
+    (int: $variant:ident, $value:expr) => {
+        int_preimage_bounds($value).map(|(lo, hi)| {
+            (
+                ScalarValue::$variant(Some(lo)),
+                ScalarValue::$variant(Some(hi)),
+            )
+        })
+    };
+
+    // Decimal types: call decimal_preimage_bounds with precision/scale and wrap in ScalarValue
+    (decimal: $variant:ident, $decimal_type:ty, $value:expr, $precision:expr, $scale:expr) => {
+        decimal_preimage_bounds::<$decimal_type>($value, $precision, $scale).map(
+            |(lo, hi)| {
+                (
+                    ScalarValue::$variant(Some(lo), $precision, $scale),
+                    ScalarValue::$variant(Some(hi), $precision, $scale),
+                )
+            },
+        )
+    };
+}
+
+impl ScalarUDFImpl for FloorFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "floor"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match &arg_types[0] {
+            DataType::Null => Ok(DataType::Float64),
+            other => Ok(other.clone()),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let arg = &args.args[0];
+
+        // Scalar fast path for float types - avoid array conversion overhead entirely
+        if let ColumnarValue::Scalar(scalar) = arg {
+            match scalar {
+                ScalarValue::Float64(v) => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float64(
+                        v.map(f64::floor),
+                    )));
+                }
+                ScalarValue::Float32(v) => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float32(
+                        v.map(f32::floor),
+                    )));
+                }
+                ScalarValue::Null => {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Float64(None)));
+                }
+                // For decimals: convert to array of size 1, process, then extract scalar
+                // This ensures we don't expand the array while reusing overflow validation
+                _ => {}
+            }
+        }
+
+        // Track if input was a scalar to convert back at the end
+        let is_scalar = matches!(arg, ColumnarValue::Scalar(_));
+
+        // Array path (also handles decimal scalars converted to size-1 arrays)
+        let value = arg.to_array(args.number_rows)?;
+
+        let result: ArrayRef = match value.data_type() {
+            DataType::Float64 => Arc::new(
+                value
+                    .as_primitive::<Float64Type>()
+                    .unary::<_, Float64Type>(f64::floor),
+            ),
+            DataType::Float32 => Arc::new(
+                value
+                    .as_primitive::<Float32Type>()
+                    .unary::<_, Float32Type>(f32::floor),
+            ),
+            DataType::Null => {
+                return Ok(ColumnarValue::Scalar(ScalarValue::Float64(None)));
+            }
+            DataType::Decimal32(precision, scale) => {
+                apply_decimal_op::<Decimal32Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    floor_decimal_value,
+                )?
+            }
+            DataType::Decimal64(precision, scale) => {
+                apply_decimal_op::<Decimal64Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    floor_decimal_value,
+                )?
+            }
+            DataType::Decimal128(precision, scale) => {
+                apply_decimal_op::<Decimal128Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    floor_decimal_value,
+                )?
+            }
+            DataType::Decimal256(precision, scale) => {
+                apply_decimal_op::<Decimal256Type, _>(
+                    &value,
+                    *precision,
+                    *scale,
+                    self.name(),
+                    floor_decimal_value,
+                )?
+            }
+            other => {
+                return exec_err!(
+                    "Unsupported data type {other:?} for function {}",
+                    self.name()
+                );
+            }
+        };
+
+        // If input was a scalar, convert result back to scalar
+        if is_scalar {
+            ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar)
+        } else {
+            Ok(ColumnarValue::Array(result))
+        }
+    }
+
+    fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
+        Ok(input[0].sort_properties)
+    }
+
+    fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
+        let data_type = inputs[0].data_type();
+        Interval::make_unbounded(&data_type)
+    }
+
+    /// Compute the preimage for floor function.
+    ///
+    /// For `floor(x) = N`, the preimage is `x >= N AND x < N + 1`
+    /// because floor(x) = N for all x in [N, N+1).
+    ///
+    /// This enables predicate pushdown optimizations, transforming:
+    /// `floor(col) = 100` into `col >= 100 AND col < 101`
+    fn preimage(
+        &self,
+        args: &[Expr],
+        lit_expr: &Expr,
+        _info: &SimplifyContext,
+    ) -> Result<PreimageResult> {
+        // floor takes exactly one argument and we do not expect to reach here with multiple arguments.
+        debug_assert!(args.len() == 1, "floor() takes exactly one argument");
+
+        let arg = args[0].clone();
+
+        // Extract the literal value being compared to
+        let Expr::Literal(lit_value, _) = lit_expr else {
+            return Ok(PreimageResult::None);
+        };
+
+        // Compute lower bound (N) and upper bound (N + 1) using helper functions
+        let Some((lower, upper)) = (match lit_value {
+            // Floating-point types
+            ScalarValue::Float64(Some(n)) => preimage_bounds!(float: Float64, *n),
+            ScalarValue::Float32(Some(n)) => preimage_bounds!(float: Float32, *n),
+
+            // Integer types (not reachable from SQL/SLT: floor() only accepts Float64/Float32/Decimal,
+            // so the RHS literal is always coerced to one of those before preimage runs; kept for
+            // programmatic use and unit tests)
+            ScalarValue::Int8(Some(n)) => preimage_bounds!(int: Int8, *n),
+            ScalarValue::Int16(Some(n)) => preimage_bounds!(int: Int16, *n),
+            ScalarValue::Int32(Some(n)) => preimage_bounds!(int: Int32, *n),
+            ScalarValue::Int64(Some(n)) => preimage_bounds!(int: Int64, *n),
+
+            // Decimal types
+            // DECIMAL(precision, scale) where precision ≤ 38 -> Decimal128(precision, scale)
+            // DECIMAL(precision, scale) where precision > 38 -> Decimal256(precision, scale)
+            // Decimal32 and Decimal64 are unreachable from SQL/SLT.
+            ScalarValue::Decimal32(Some(n), precision, scale) => {
+                preimage_bounds!(decimal: Decimal32, Decimal32Type, *n, *precision, *scale)
+            }
+            ScalarValue::Decimal64(Some(n), precision, scale) => {
+                preimage_bounds!(decimal: Decimal64, Decimal64Type, *n, *precision, *scale)
+            }
+            ScalarValue::Decimal128(Some(n), precision, scale) => {
+                preimage_bounds!(decimal: Decimal128, Decimal128Type, *n, *precision, *scale)
+            }
+            ScalarValue::Decimal256(Some(n), precision, scale) => {
+                preimage_bounds!(decimal: Decimal256, Decimal256Type, *n, *precision, *scale)
+            }
+
+            // Unsupported types
+            _ => None,
+        }) else {
+            return Ok(PreimageResult::None);
+        };
+
+        Ok(PreimageResult::Range {
+            expr: arg,
+            interval: Box::new(Interval::try_new(lower, upper)?),
+        })
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+// ============ Helper functions for preimage bounds ============
+
+/// Compute preimage bounds for floor function on floating-point types.
+/// For floor(x) = n, the preimage is [n, n+1).
+/// Returns None if:
+/// - The value is non-finite (infinity, NaN)
+/// - The value is not an integer (floor always returns integers, so floor(x) = 1.3 has no solution)
+/// - Adding 1 would lose precision at extreme values
+fn float_preimage_bounds<F: Float>(n: F) -> Option<(F, F)> {
+    let one = F::one();
+    // Check for non-finite values (infinity, NaN)
+    if !n.is_finite() {
+        return None;
+    }
+    // floor always returns an integer, so if n has a fractional part, there's no solution
+    if n.fract() != F::zero() {
+        return None;
+    }
+    // Check for precision loss at extreme values
+    if n + one <= n {
+        return None;
+    }
+    Some((n, n + one))
+}
+
+/// Compute preimage bounds for floor function on integer types.
+/// For floor(x) = n, the preimage is [n, n+1).
+/// Returns None if adding 1 would overflow.
+fn int_preimage_bounds<I: CheckedAdd + One + Copy>(n: I) -> Option<(I, I)> {
+    let upper = n.checked_add(&I::one())?;
+    Some((n, upper))
+}
+
+/// Compute preimage bounds for floor function on decimal types.
+/// For floor(x) = n, the preimage is [n, n+1).
+/// Returns None if:
+/// - The value has a fractional part (floor always returns integers)
+/// - Adding 1 would overflow
+fn decimal_preimage_bounds<D: DecimalType>(
+    value: D::Native,
+    precision: u8,
+    scale: i8,
+) -> Option<(D::Native, D::Native)>
+where
+    D::Native: DecimalCast + ArrowNativeTypeOp + std::ops::Rem<Output = D::Native>,
+{
+    // Use rescale_decimal to compute "1" at target scale (avoids manual pow)
+    // Convert integer 1 (scale=0) to the target scale
+    let one_scaled: D::Native = rescale_decimal::<D, D>(
+        D::Native::ONE, // value = 1
+        1,              // input_precision = 1
+        0,              // input_scale = 0 (integer)
+        precision,      // output_precision
+        scale,          // output_scale
+    )?;
+
+    // floor always returns an integer, so if value has a fractional part, there's no solution
+    // Check: value % one_scaled != 0 means fractional part exists
+    if scale > 0 && value % one_scaled != D::Native::ZERO {
+        return None;
+    }
+
+    // Compute upper bound using checked addition
+    // Before preimage stage, the internal i128/i256(value) is validated based on the precision and scale.
+    // MAX_DECIMAL128_FOR_EACH_PRECISION and MAX_DECIMAL256_FOR_EACH_PRECISION are used to validate the internal i128/i256.
+    // Any invalid i128/i256 will not reach here.
+    // Therefore, the add_checked will always succeed if tested via SQL/SLT path.
+    let upper = value.add_checked(one_scaled).ok()?;
+
+    Some((value, upper))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_buffer::i256;
+    use datafusion_expr::col;
+
+    /// Helper to test valid preimage cases that should return a Range
+    fn assert_preimage_range(
+        input: ScalarValue,
+        expected_lower: ScalarValue,
+        expected_upper: ScalarValue,
+    ) {
+        let floor_func = FloorFunc::new();
+        let args = vec![col("x")];
+        let lit_expr = Expr::Literal(input.clone(), None);
+        let info = SimplifyContext::default();
+
+        let result = floor_func.preimage(&args, &lit_expr, &info).unwrap();
+
+        match result {
+            PreimageResult::Range { expr, interval } => {
+                assert_eq!(expr, col("x"));
+                assert_eq!(interval.lower().clone(), expected_lower);
+                assert_eq!(interval.upper().clone(), expected_upper);
+            }
+            PreimageResult::None => {
+                panic!("Expected Range, got None for input {input:?}")
+            }
+        }
+    }
+
+    /// Helper to test cases that should return None
+    fn assert_preimage_none(input: ScalarValue) {
+        let floor_func = FloorFunc::new();
+        let args = vec![col("x")];
+        let lit_expr = Expr::Literal(input.clone(), None);
+        let info = SimplifyContext::default();
+
+        let result = floor_func.preimage(&args, &lit_expr, &info).unwrap();
+        assert!(
+            matches!(result, PreimageResult::None),
+            "Expected None for input {input:?}"
+        );
+    }
+
+    #[test]
+    fn test_floor_preimage_valid_cases() {
+        // Float64
+        assert_preimage_range(
+            ScalarValue::Float64(Some(100.0)),
+            ScalarValue::Float64(Some(100.0)),
+            ScalarValue::Float64(Some(101.0)),
+        );
+        // Float32
+        assert_preimage_range(
+            ScalarValue::Float32(Some(50.0)),
+            ScalarValue::Float32(Some(50.0)),
+            ScalarValue::Float32(Some(51.0)),
+        );
+        // Int64
+        assert_preimage_range(
+            ScalarValue::Int64(Some(42)),
+            ScalarValue::Int64(Some(42)),
+            ScalarValue::Int64(Some(43)),
+        );
+        // Int32
+        assert_preimage_range(
+            ScalarValue::Int32(Some(100)),
+            ScalarValue::Int32(Some(100)),
+            ScalarValue::Int32(Some(101)),
+        );
+        // Negative values
+        assert_preimage_range(
+            ScalarValue::Float64(Some(-5.0)),
+            ScalarValue::Float64(Some(-5.0)),
+            ScalarValue::Float64(Some(-4.0)),
+        );
+        // Zero
+        assert_preimage_range(
+            ScalarValue::Float64(Some(0.0)),
+            ScalarValue::Float64(Some(0.0)),
+            ScalarValue::Float64(Some(1.0)),
+        );
+    }
+
+    #[test]
+    fn test_floor_preimage_non_integer_float() {
+        // floor(x) = 1.3 has NO SOLUTION because floor always returns an integer
+        // Therefore preimage should return None for non-integer literals
+        assert_preimage_none(ScalarValue::Float64(Some(1.3)));
+        assert_preimage_none(ScalarValue::Float64(Some(-2.5)));
+        assert_preimage_none(ScalarValue::Float32(Some(3.7)));
+    }
+
+    #[test]
+    fn test_floor_preimage_integer_overflow() {
+        // All integer types at MAX value should return None
+        assert_preimage_none(ScalarValue::Int64(Some(i64::MAX)));
+        assert_preimage_none(ScalarValue::Int32(Some(i32::MAX)));
+        assert_preimage_none(ScalarValue::Int16(Some(i16::MAX)));
+        assert_preimage_none(ScalarValue::Int8(Some(i8::MAX)));
+    }
+
+    #[test]
+    fn test_floor_preimage_float_edge_cases() {
+        // Float64 edge cases
+        assert_preimage_none(ScalarValue::Float64(Some(f64::INFINITY)));
+        assert_preimage_none(ScalarValue::Float64(Some(f64::NEG_INFINITY)));
+        assert_preimage_none(ScalarValue::Float64(Some(f64::NAN)));
+        assert_preimage_none(ScalarValue::Float64(Some(f64::MAX))); // precision loss
+
+        // Float32 edge cases
+        assert_preimage_none(ScalarValue::Float32(Some(f32::INFINITY)));
+        assert_preimage_none(ScalarValue::Float32(Some(f32::NEG_INFINITY)));
+        assert_preimage_none(ScalarValue::Float32(Some(f32::NAN)));
+        assert_preimage_none(ScalarValue::Float32(Some(f32::MAX))); // precision loss
+    }
+
+    #[test]
+    fn test_floor_preimage_null_values() {
+        assert_preimage_none(ScalarValue::Float64(None));
+        assert_preimage_none(ScalarValue::Float32(None));
+        assert_preimage_none(ScalarValue::Int64(None));
+    }
+
+    // ============ Decimal32 Tests (mirrors float/int tests) ============
+
+    #[test]
+    fn test_floor_preimage_decimal_valid_cases() {
+        // ===== Decimal32 =====
+        // Positive integer decimal: 100.00 (scale=2, so raw=10000)
+        // floor(x) = 100.00 -> x in [100.00, 101.00)
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(10000), 9, 2),
+            ScalarValue::Decimal32(Some(10000), 9, 2), // 100.00
+            ScalarValue::Decimal32(Some(10100), 9, 2), // 101.00
+        );
+
+        // Smaller positive: 50.00
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(5000), 9, 2),
+            ScalarValue::Decimal32(Some(5000), 9, 2), // 50.00
+            ScalarValue::Decimal32(Some(5100), 9, 2), // 51.00
+        );
+
+        // Negative integer decimal: -5.00
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(-500), 9, 2),
+            ScalarValue::Decimal32(Some(-500), 9, 2), // -5.00
+            ScalarValue::Decimal32(Some(-400), 9, 2), // -4.00
+        );
+
+        // Zero: 0.00
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(0), 9, 2),
+            ScalarValue::Decimal32(Some(0), 9, 2), // 0.00
+            ScalarValue::Decimal32(Some(100), 9, 2), // 1.00
+        );
+
+        // Scale 0 (pure integer): 42
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(42), 9, 0),
+            ScalarValue::Decimal32(Some(42), 9, 0),
+            ScalarValue::Decimal32(Some(43), 9, 0),
+        );
+
+        // ===== Decimal64 =====
+        assert_preimage_range(
+            ScalarValue::Decimal64(Some(10000), 18, 2),
+            ScalarValue::Decimal64(Some(10000), 18, 2), // 100.00
+            ScalarValue::Decimal64(Some(10100), 18, 2), // 101.00
+        );
+
+        // Negative
+        assert_preimage_range(
+            ScalarValue::Decimal64(Some(-500), 18, 2),
+            ScalarValue::Decimal64(Some(-500), 18, 2), // -5.00
+            ScalarValue::Decimal64(Some(-400), 18, 2), // -4.00
+        );
+
+        // Zero
+        assert_preimage_range(
+            ScalarValue::Decimal64(Some(0), 18, 2),
+            ScalarValue::Decimal64(Some(0), 18, 2),
+            ScalarValue::Decimal64(Some(100), 18, 2),
+        );
+
+        // ===== Decimal128 =====
+        assert_preimage_range(
+            ScalarValue::Decimal128(Some(10000), 38, 2),
+            ScalarValue::Decimal128(Some(10000), 38, 2), // 100.00
+            ScalarValue::Decimal128(Some(10100), 38, 2), // 101.00
+        );
+
+        // Negative
+        assert_preimage_range(
+            ScalarValue::Decimal128(Some(-500), 38, 2),
+            ScalarValue::Decimal128(Some(-500), 38, 2), // -5.00
+            ScalarValue::Decimal128(Some(-400), 38, 2), // -4.00
+        );
+
+        // Zero
+        assert_preimage_range(
+            ScalarValue::Decimal128(Some(0), 38, 2),
+            ScalarValue::Decimal128(Some(0), 38, 2),
+            ScalarValue::Decimal128(Some(100), 38, 2),
+        );
+
+        // ===== Decimal256 =====
+        assert_preimage_range(
+            ScalarValue::Decimal256(Some(i256::from(10000)), 76, 2),
+            ScalarValue::Decimal256(Some(i256::from(10000)), 76, 2), // 100.00
+            ScalarValue::Decimal256(Some(i256::from(10100)), 76, 2), // 101.00
+        );
+
+        // Negative
+        assert_preimage_range(
+            ScalarValue::Decimal256(Some(i256::from(-500)), 76, 2),
+            ScalarValue::Decimal256(Some(i256::from(-500)), 76, 2), // -5.00
+            ScalarValue::Decimal256(Some(i256::from(-400)), 76, 2), // -4.00
+        );
+
+        // Zero
+        assert_preimage_range(
+            ScalarValue::Decimal256(Some(i256::ZERO), 76, 2),
+            ScalarValue::Decimal256(Some(i256::ZERO), 76, 2),
+            ScalarValue::Decimal256(Some(i256::from(100)), 76, 2),
+        );
+    }
+
+    #[test]
+    fn test_floor_preimage_decimal_non_integer() {
+        // floor(x) = 1.30 has NO SOLUTION because floor always returns an integer
+        // Therefore preimage should return None for non-integer decimals
+
+        // Decimal32
+        assert_preimage_none(ScalarValue::Decimal32(Some(130), 9, 2)); // 1.30
+        assert_preimage_none(ScalarValue::Decimal32(Some(-250), 9, 2)); // -2.50
+        assert_preimage_none(ScalarValue::Decimal32(Some(370), 9, 2)); // 3.70
+        assert_preimage_none(ScalarValue::Decimal32(Some(1), 9, 2)); // 0.01
+
+        // Decimal64
+        assert_preimage_none(ScalarValue::Decimal64(Some(130), 18, 2)); // 1.30
+        assert_preimage_none(ScalarValue::Decimal64(Some(-250), 18, 2)); // -2.50
+
+        // Decimal128
+        assert_preimage_none(ScalarValue::Decimal128(Some(130), 38, 2)); // 1.30
+        assert_preimage_none(ScalarValue::Decimal128(Some(-250), 38, 2)); // -2.50
+
+        // Decimal256
+        assert_preimage_none(ScalarValue::Decimal256(Some(i256::from(130)), 76, 2)); // 1.30
+        assert_preimage_none(ScalarValue::Decimal256(Some(i256::from(-250)), 76, 2)); // -2.50
+
+        // Decimal32: i32::MAX - 50
+        // This return None because the value is not an integer, not because it is out of range.
+        assert_preimage_none(ScalarValue::Decimal32(Some(i32::MAX - 50), 10, 2));
+
+        // Decimal64: i64::MAX - 50
+        // This return None because the value is not an integer, not because it is out of range.
+        assert_preimage_none(ScalarValue::Decimal64(Some(i64::MAX - 50), 19, 2));
+    }
+
+    #[test]
+    fn test_floor_preimage_decimal_overflow() {
+        // Test near MAX where adding scale_factor would overflow
+
+        // Decimal32: i32::MAX
+        assert_preimage_none(ScalarValue::Decimal32(Some(i32::MAX), 10, 0));
+
+        // Decimal64: i64::MAX
+        assert_preimage_none(ScalarValue::Decimal64(Some(i64::MAX), 19, 0));
+    }
+
+    #[test]
+    fn test_floor_preimage_decimal_edge_cases() {
+        // ===== Decimal32 =====
+        // Large value that doesn't overflow
+        // Decimal(9,2) max value is 9,999,999.99 (stored as 999,999,999)
+        // Use a large value that fits Decimal(9,2) and is divisible by 100
+        let safe_max_aligned_32 = 999_999_900; // 9,999,999.00
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(safe_max_aligned_32), 9, 2),
+            ScalarValue::Decimal32(Some(safe_max_aligned_32), 9, 2),
+            ScalarValue::Decimal32(Some(safe_max_aligned_32 + 100), 9, 2),
+        );
+
+        // Negative edge: use a large negative value that fits Decimal(9,2)
+        // Decimal(9,2) min value is -9,999,999.99 (stored as -999,999,999)
+        let min_aligned_32 = -999_999_900; // -9,999,999.00
+        assert_preimage_range(
+            ScalarValue::Decimal32(Some(min_aligned_32), 9, 2),
+            ScalarValue::Decimal32(Some(min_aligned_32), 9, 2),
+            ScalarValue::Decimal32(Some(min_aligned_32 + 100), 9, 2),
+        );
+    }
+
+    #[test]
+    fn test_floor_preimage_decimal_null() {
+        assert_preimage_none(ScalarValue::Decimal32(None, 9, 2));
+        assert_preimage_none(ScalarValue::Decimal64(None, 18, 2));
+        assert_preimage_none(ScalarValue::Decimal128(None, 38, 2));
+        assert_preimage_none(ScalarValue::Decimal256(None, 76, 2));
+    }
+}
diff --git a/datafusion/functions/src/math/gcd.rs b/datafusion/functions/src/math/gcd.rs
index 7fe253b4afbc0..1f6a353a85ee3 100644
--- a/datafusion/functions/src/math/gcd.rs
+++ b/datafusion/functions/src/math/gcd.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{new_null_array, ArrayRef, AsArray, Int64Array, PrimitiveArray};
+use arrow::array::{ArrayRef, AsArray, Int64Array, PrimitiveArray};
 use arrow::compute::try_binary;
 use arrow::datatypes::{DataType, Int64Type};
 use arrow::error::ArrowError;
@@ -23,7 +23,7 @@ use std::any::Any;
 use std::mem::swap;
 use std::sync::Arc;
 
-use datafusion_common::{exec_err, internal_datafusion_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_datafusion_err};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
@@ -34,10 +34,18 @@ use datafusion_macros::user_doc;
     doc_section(label = "Math Functions"),
     description = "Returns the greatest common divisor of `expression_x` and `expression_y`. Returns 0 if both inputs are zero.",
     syntax_example = "gcd(expression_x, expression_y)",
+    sql_example = r#"```sql
+> SELECT gcd(48, 18);
++------------+
+| gcd(48,18) |
++------------+
+| 6          |
++------------+
+```"#,
     standard_argument(name = "expression_x", prefix = "First numeric"),
     standard_argument(name = "expression_y", prefix = "Second numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct GcdFunc {
     signature: Signature,
 }
@@ -86,20 +94,23 @@ impl ScalarUDFImpl for GcdFunc {
             [ColumnarValue::Array(a), ColumnarValue::Array(b)] => {
                 compute_gcd_for_arrays(&a, &b)
             }
-            [ColumnarValue::Scalar(ScalarValue::Int64(a)), ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
-                match (a, b) {
-                    (Some(a), Some(b)) => Ok(ColumnarValue::Scalar(ScalarValue::Int64(
-                        Some(compute_gcd(a, b)?),
-                    ))),
-                    _ => Ok(ColumnarValue::Scalar(ScalarValue::Int64(None))),
-                }
-            }
-            [ColumnarValue::Array(a), ColumnarValue::Scalar(ScalarValue::Int64(b))] => {
-                compute_gcd_with_scalar(&a, b)
-            }
-            [ColumnarValue::Scalar(ScalarValue::Int64(a)), ColumnarValue::Array(b)] => {
-                compute_gcd_with_scalar(&b, a)
-            }
+            [
+                ColumnarValue::Scalar(ScalarValue::Int64(a)),
+                ColumnarValue::Scalar(ScalarValue::Int64(b)),
+            ] => match (a, b) {
+                (Some(a), Some(b)) => Ok(ColumnarValue::Scalar(ScalarValue::Int64(
+                    Some(compute_gcd(a, b)?),
+                ))),
+                _ => Ok(ColumnarValue::Scalar(ScalarValue::Int64(None))),
+            },
+            [
+                ColumnarValue::Array(a),
+                ColumnarValue::Scalar(ScalarValue::Int64(b)),
+            ] => compute_gcd_with_scalar(&a, b),
+            [
+                ColumnarValue::Scalar(ScalarValue::Int64(a)),
+                ColumnarValue::Array(b),
+            ] => compute_gcd_with_scalar(&b, a),
             _ => exec_err!("Unsupported argument types for function gcd"),
         }
     }
@@ -133,10 +144,7 @@ fn compute_gcd_with_scalar(arr: &ArrayRef, scalar: Option<i64>) -> Result<Column
 
             result.map(|arr| ColumnarValue::Array(Arc::new(arr) as ArrayRef))
         }
-        None => Ok(ColumnarValue::Array(new_null_array(
-            &DataType::Int64,
-            arr.len(),
-        ))),
+        None => Ok(ColumnarValue::Scalar(ScalarValue::Int64(None))),
     }
 }
 
diff --git a/datafusion/functions/src/math/iszero.rs b/datafusion/functions/src/math/iszero.rs
index bc12dfb7898e8..aa93d797eb7b3 100644
--- a/datafusion/functions/src/math/iszero.rs
+++ b/datafusion/functions/src/math/iszero.rs
@@ -18,27 +18,41 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, AsArray, BooleanArray};
-use arrow::datatypes::DataType::{Boolean, Float32, Float64};
-use arrow::datatypes::{DataType, Float32Type, Float64Type};
+use arrow::array::{ArrowNativeTypeOp, AsArray, BooleanArray};
+use arrow::datatypes::DataType::{
+    Boolean, Decimal32, Decimal64, Decimal128, Decimal256, Float16, Float32, Float64,
+    Int8, Int16, Int32, Int64, Null, UInt8, UInt16, UInt32, UInt64,
+};
+use arrow::datatypes::{
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, Float16Type,
+    Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type,
+    UInt16Type, UInt32Type, UInt64Type,
+};
 
-use datafusion_common::{exec_err, Result};
-use datafusion_expr::TypeSignature::Exact;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
+use datafusion_expr::{Coercion, TypeSignatureClass};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
 };
 use datafusion_macros::user_doc;
 
-use crate::utils::make_scalar_function;
-
 #[user_doc(
     doc_section(label = "Math Functions"),
     description = "Returns true if a given number is +0.0 or -0.0 otherwise returns false.",
     syntax_example = "iszero(numeric_expression)",
+    sql_example = r#"```sql
+> SELECT iszero(0);
++------------+
+| iszero(0)  |
++------------+
+| true       |
++------------+
+```"#,
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct IsZeroFunc {
     signature: Signature,
 }
@@ -51,12 +65,10 @@ impl Default for IsZeroFunc {
 
 impl IsZeroFunc {
     pub fn new() -> Self {
-        use DataType::*;
+        // Accept any numeric type (ints, uints, floats, decimals) without implicit casts.
+        let numeric = Coercion::new_exact(TypeSignatureClass::Numeric);
         Self {
-            signature: Signature::one_of(
-                vec![Exact(vec![Float32]), Exact(vec![Float64])],
-                Volatility::Immutable,
-            ),
+            signature: Signature::coercible(vec![numeric], Volatility::Immutable),
         }
     }
 }
@@ -79,70 +91,155 @@ impl ScalarUDFImpl for IsZeroFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(iszero, vec![])(&args.args)
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+                }
+
+                match scalar {
+                    ScalarValue::Float64(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0.0))))
+                    }
+                    ScalarValue::Float32(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0.0))))
+                    }
+                    ScalarValue::Float16(Some(v)) => Ok(ColumnarValue::Scalar(
+                        ScalarValue::Boolean(Some(v.is_zero())),
+                    )),
+
+                    ScalarValue::Int8(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Int16(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Int32(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Int64(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::UInt8(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::UInt16(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::UInt32(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::UInt64(Some(v)) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+
+                    ScalarValue::Decimal32(Some(v), ..) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Decimal64(Some(v), ..) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Decimal128(Some(v), ..) => {
+                        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(v == 0))))
+                    }
+                    ScalarValue::Decimal256(Some(v), ..) => Ok(ColumnarValue::Scalar(
+                        ScalarValue::Boolean(Some(v.is_zero())),
+                    )),
+
+                    _ => {
+                        internal_err!(
+                            "Unexpected scalar type for iszero: {:?}",
+                            scalar.data_type()
+                        )
+                    }
+                }
+            }
+            ColumnarValue::Array(array) => match array.data_type() {
+                Null => Ok(ColumnarValue::Array(Arc::new(BooleanArray::new_null(
+                    array.len(),
+                )))),
+
+                Float64 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Float64Type>(),
+                    |x| x == 0.0,
+                )))),
+                Float32 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Float32Type>(),
+                    |x| x == 0.0,
+                )))),
+                Float16 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Float16Type>(),
+                    |x| x.is_zero(),
+                )))),
+
+                Int8 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Int8Type>(),
+                    |x| x == 0,
+                )))),
+                Int16 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Int16Type>(),
+                    |x| x == 0,
+                )))),
+                Int32 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Int32Type>(),
+                    |x| x == 0,
+                )))),
+                Int64 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<Int64Type>(),
+                    |x| x == 0,
+                )))),
+                UInt8 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<UInt8Type>(),
+                    |x| x == 0,
+                )))),
+                UInt16 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<UInt16Type>(),
+                    |x| x == 0,
+                )))),
+                UInt32 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<UInt32Type>(),
+                    |x| x == 0,
+                )))),
+                UInt64 => Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                    array.as_primitive::<UInt64Type>(),
+                    |x| x == 0,
+                )))),
+
+                Decimal32(_, _) => {
+                    Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal32Type>(),
+                        |x| x == 0,
+                    ))))
+                }
+                Decimal64(_, _) => {
+                    Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal64Type>(),
+                        |x| x == 0,
+                    ))))
+                }
+                Decimal128(_, _) => {
+                    Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal128Type>(),
+                        |x| x == 0,
+                    ))))
+                }
+                Decimal256(_, _) => {
+                    Ok(ColumnarValue::Array(Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal256Type>(),
+                        |x| x.is_zero(),
+                    ))))
+                }
+
+                other => {
+                    internal_err!("Unexpected data type {other:?} for function iszero")
+                }
+            },
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
-
-/// Iszero SQL function
-pub fn iszero(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        Float64 => Ok(Arc::new(BooleanArray::from_unary(
-            args[0].as_primitive::<Float64Type>(),
-            |x| x == 0.0,
-        )) as ArrayRef),
-
-        Float32 => Ok(Arc::new(BooleanArray::from_unary(
-            args[0].as_primitive::<Float32Type>(),
-            |x| x == 0.0,
-        )) as ArrayRef),
-
-        other => exec_err!("Unsupported data type {other:?} for function iszero"),
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::sync::Arc;
-
-    use arrow::array::{ArrayRef, Float32Array, Float64Array};
-
-    use datafusion_common::cast::as_boolean_array;
-
-    use crate::math::iszero::iszero;
-
-    #[test]
-    fn test_iszero_f64() {
-        let args: Vec<ArrayRef> =
-            vec![Arc::new(Float64Array::from(vec![1.0, 0.0, 3.0, -0.0]))];
-
-        let result = iszero(&args).expect("failed to initialize function iszero");
-        let booleans =
-            as_boolean_array(&result).expect("failed to initialize function iszero");
-
-        assert_eq!(booleans.len(), 4);
-        assert!(!booleans.value(0));
-        assert!(booleans.value(1));
-        assert!(!booleans.value(2));
-        assert!(booleans.value(3));
-    }
-
-    #[test]
-    fn test_iszero_f32() {
-        let args: Vec<ArrayRef> =
-            vec![Arc::new(Float32Array::from(vec![1.0, 0.0, 3.0, -0.0]))];
-
-        let result = iszero(&args).expect("failed to initialize function iszero");
-        let booleans =
-            as_boolean_array(&result).expect("failed to initialize function iszero");
-
-        assert_eq!(booleans.len(), 4);
-        assert!(!booleans.value(0));
-        assert!(booleans.value(1));
-        assert!(!booleans.value(2));
-        assert!(booleans.value(3));
-    }
-}
diff --git a/datafusion/functions/src/math/lcm.rs b/datafusion/functions/src/math/lcm.rs
index fc6bf9461f283..58934e137beca 100644
--- a/datafusion/functions/src/math/lcm.rs
+++ b/datafusion/functions/src/math/lcm.rs
@@ -23,9 +23,7 @@ use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Int64;
 
 use arrow::error::ArrowError;
-use datafusion_common::{
-    arrow_datafusion_err, exec_err, internal_datafusion_err, DataFusionError, Result,
-};
+use datafusion_common::{Result, arrow_datafusion_err, exec_err};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
@@ -39,10 +37,18 @@ use crate::utils::make_scalar_function;
     doc_section(label = "Math Functions"),
     description = "Returns the least common multiple of `expression_x` and `expression_y`. Returns 0 if either input is zero.",
     syntax_example = "lcm(expression_x, expression_y)",
+    sql_example = r#"```sql
+> SELECT lcm(4, 5);
++----------+
+| lcm(4,5) |
++----------+
+| 20       |
++----------+
+```"#,
     standard_argument(name = "expression_x", prefix = "First numeric"),
     standard_argument(name = "expression_y", prefix = "Second numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LcmFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs
index ee52c035ac81d..3db24d67e8494 100644
--- a/datafusion/functions/src/math/log.rs
+++ b/datafusion/functions/src/math/log.rs
@@ -18,34 +18,49 @@
 //! Math function: `log()`.
 
 use std::any::Any;
-use std::sync::Arc;
 
 use super::power::PowerFunc;
 
-use arrow::array::{ArrayRef, AsArray};
-use arrow::datatypes::{DataType, Float32Type, Float64Type};
+use crate::utils::calculate_binary_math;
+use arrow::array::{Array, ArrayRef};
+use arrow::datatypes::{
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, Float16Type,
+    Float32Type, Float64Type,
+};
+use arrow::error::ArrowError;
+use arrow_buffer::i256;
+use datafusion_common::types::NativeType;
 use datafusion_common::{
-    exec_err, internal_err, plan_datafusion_err, plan_err, Result, ScalarValue,
+    Result, ScalarValue, exec_err, internal_err, plan_datafusion_err, plan_err,
 };
 use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
-    lit, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF,
-    TypeSignature::*,
+    Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF,
+    TypeSignature, TypeSignatureClass, lit,
 };
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
+use num_traits::{Float, ToPrimitive};
 
 #[user_doc(
     doc_section(label = "Math Functions"),
     description = "Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.",
     syntax_example = r#"log(base, numeric_expression)
 log(numeric_expression)"#,
+    sql_example = r#"```sql
+> SELECT log(10);
++---------+
+| log(10) |
++---------+
+| 1.0     |
++---------+
+```"#,
     standard_argument(name = "base", prefix = "Base numeric"),
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LogFunc {
     signature: Signature,
 }
@@ -58,14 +73,28 @@ impl Default for LogFunc {
 
 impl LogFunc {
     pub fn new() -> Self {
-        use DataType::*;
+        // Converts decimals & integers to float64, accepting other floats as is
+        let as_float = Coercion::new_implicit(
+            TypeSignatureClass::Float,
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
         Self {
             signature: Signature::one_of(
+                // Ensure decimals have precedence over floats since we have
+                // a native decimal implementation for log
                 vec![
-                    Exact(vec![Float32]),
-                    Exact(vec![Float64]),
-                    Exact(vec![Float32, Float32]),
-                    Exact(vec![Float64, Float64]),
+                    // log(value)
+                    TypeSignature::Coercible(vec![Coercion::new_exact(
+                        TypeSignatureClass::Decimal,
+                    )]),
+                    TypeSignature::Coercible(vec![as_float.clone()]),
+                    // log(base, value)
+                    TypeSignature::Coercible(vec![
+                        as_float.clone(),
+                        Coercion::new_exact(TypeSignatureClass::Decimal),
+                    ]),
+                    TypeSignature::Coercible(vec![as_float.clone(), as_float.clone()]),
                 ],
                 Volatility::Immutable,
             ),
@@ -73,6 +102,91 @@ impl LogFunc {
     }
 }
 
+/// Checks if the base is valid for the efficient integer logarithm algorithm.
+#[inline]
+fn is_valid_integer_base(base: f64) -> bool {
+    base.trunc() == base && base >= 2.0 && base <= u32::MAX as f64
+}
+
+/// Calculate logarithm for Decimal32 values.
+/// For integer bases >= 2 with zero scale, return an exact integer log when the
+/// value is a perfect power of the base. Otherwise falls back to f64 computation.
+fn log_decimal32(value: i32, scale: i8, base: f64) -> Result<f64, ArrowError> {
+    if scale == 0
+        && is_valid_integer_base(base)
+        && let Ok(unscaled) = u32::try_from(value)
+        && unscaled > 0
+    {
+        let base_u32 = base as u32;
+        let int_log = unscaled.ilog(base_u32);
+        if base_u32.checked_pow(int_log) == Some(unscaled) {
+            return Ok(int_log as f64);
+        }
+    }
+    decimal_to_f64(value, scale).map(|v| v.log(base))
+}
+
+/// Calculate logarithm for Decimal64 values.
+/// For integer bases >= 2 with zero scale, return an exact integer log when the
+/// value is a perfect power of the base. Otherwise falls back to f64 computation.
+fn log_decimal64(value: i64, scale: i8, base: f64) -> Result<f64, ArrowError> {
+    if scale == 0
+        && is_valid_integer_base(base)
+        && let Ok(unscaled) = u64::try_from(value)
+        && unscaled > 0
+    {
+        let base_u64 = base as u64;
+        let int_log = unscaled.ilog(base_u64);
+        if base_u64.checked_pow(int_log) == Some(unscaled) {
+            return Ok(int_log as f64);
+        }
+    }
+    decimal_to_f64(value, scale).map(|v| v.log(base))
+}
+
+/// Calculate logarithm for Decimal128 values.
+/// For integer bases >= 2 with zero scale, return an exact integer log when the
+/// value is a perfect power of the base. Otherwise falls back to f64 computation.
+fn log_decimal128(value: i128, scale: i8, base: f64) -> Result<f64, ArrowError> {
+    if scale == 0
+        && is_valid_integer_base(base)
+        && let Ok(unscaled) = u128::try_from(value)
+        && unscaled > 0
+    {
+        let base_u128 = base as u128;
+        let int_log = unscaled.ilog(base_u128);
+        if base_u128.checked_pow(int_log) == Some(unscaled) {
+            return Ok(int_log as f64);
+        }
+    }
+    decimal_to_f64(value, scale).map(|v| v.log(base))
+}
+
+/// Convert a scaled decimal value to f64.
+#[inline]
+fn decimal_to_f64<T: ToPrimitive + Copy>(value: T, scale: i8) -> Result<f64, ArrowError> {
+    let value_f64 = value.to_f64().ok_or_else(|| {
+        ArrowError::ComputeError("Cannot convert value to f64".to_string())
+    })?;
+    let scale_factor = 10f64.powi(scale as i32);
+    Ok(value_f64 / scale_factor)
+}
+
+fn log_decimal256(value: i256, scale: i8, base: f64) -> Result<f64, ArrowError> {
+    // Try to convert to i128 for the optimized path
+    match value.to_i128() {
+        Some(v) => log_decimal128(v, scale, base),
+        None => {
+            // For very large Decimal256 values, use f64 computation
+            let value_f64 = value.to_f64().ok_or_else(|| {
+                ArrowError::ComputeError(format!("Cannot convert {value} to f64"))
+            })?;
+            let scale_factor = 10f64.powi(scale as i32);
+            Ok((value_f64 / scale_factor).log(base))
+        }
+    }
+}
+
 impl ScalarUDFImpl for LogFunc {
     fn as_any(&self) -> &dyn Any {
         self
@@ -86,7 +200,9 @@ impl ScalarUDFImpl for LogFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match &arg_types[0] {
+        // Check last argument (value)
+        match &arg_types.last().ok_or(plan_datafusion_err!("No args"))? {
+            DataType::Float16 => Ok(DataType::Float16),
             DataType::Float32 => Ok(DataType::Float32),
             _ => Ok(DataType::Float64),
         }
@@ -119,63 +235,78 @@ impl ScalarUDFImpl for LogFunc {
 
     // Support overloaded log(base, x) and log(x) which defaults to log(10, x)
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let args = ColumnarValue::values_to_arrays(&args.args)?;
+        if args.arg_fields.iter().any(|a| a.data_type().is_null()) {
+            return ColumnarValue::Scalar(ScalarValue::Null)
+                .cast_to(args.return_type(), None);
+        }
 
-        let mut base = ColumnarValue::Scalar(ScalarValue::Float32(Some(10.0)));
+        let (base, value) = if args.args.len() == 2 {
+            (args.args[0].clone(), &args.args[1])
+        } else {
+            // no base specified, default to 10
+            (
+                ColumnarValue::Scalar(ScalarValue::new_ten(args.return_type())?),
+                &args.args[0],
+            )
+        };
+        let value = value.to_array(args.number_rows)?;
 
-        let mut x = &args[0];
-        if args.len() == 2 {
-            x = &args[1];
-            base = ColumnarValue::Array(Arc::clone(&args[0]));
-        }
-        // note in f64::log params order is different than in sql. e.g in sql log(base, x) == f64::log(x, base)
-        let arr: ArrayRef = match args[0].data_type() {
-            DataType::Float64 => match base {
-                ColumnarValue::Scalar(ScalarValue::Float32(Some(base))) => {
-                    Arc::new(x.as_primitive::<Float64Type>().unary::<_, Float64Type>(
-                        |value: f64| f64::log(value, base as f64),
-                    ))
-                }
-                ColumnarValue::Array(base) => {
-                    let x = x.as_primitive::<Float64Type>();
-                    let base = base.as_primitive::<Float64Type>();
-                    let result = arrow::compute::binary::<_, _, _, Float64Type>(
-                        x,
-                        base,
-                        f64::log,
-                    )?;
-                    Arc::new(result) as _
-                }
-                _ => {
-                    return exec_err!("log function requires a scalar or array for base")
-                }
-            },
-
-            DataType::Float32 => match base {
-                ColumnarValue::Scalar(ScalarValue::Float32(Some(base))) => Arc::new(
-                    x.as_primitive::<Float32Type>()
-                        .unary::<_, Float32Type>(|value: f32| f32::log(value, base)),
-                ),
-                ColumnarValue::Array(base) => {
-                    let x = x.as_primitive::<Float32Type>();
-                    let base = base.as_primitive::<Float32Type>();
-                    let result = arrow::compute::binary::<_, _, _, Float32Type>(
-                        x,
-                        base,
-                        f32::log,
-                    )?;
-                    Arc::new(result) as _
-                }
-                _ => {
-                    return exec_err!("log function requires a scalar or array for base")
-                }
-            },
+        let output: ArrayRef = match value.data_type() {
+            DataType::Float16 => {
+                calculate_binary_math::<Float16Type, Float16Type, Float16Type, _>(
+                    &value,
+                    &base,
+                    |value, base| Ok(value.log(base)),
+                )?
+            }
+            DataType::Float32 => {
+                calculate_binary_math::<Float32Type, Float32Type, Float32Type, _>(
+                    &value,
+                    &base,
+                    |value, base| Ok(value.log(base)),
+                )?
+            }
+            DataType::Float64 => {
+                calculate_binary_math::<Float64Type, Float64Type, Float64Type, _>(
+                    &value,
+                    &base,
+                    |value, base| Ok(value.log(base)),
+                )?
+            }
+            DataType::Decimal32(_, scale) => {
+                calculate_binary_math::<Decimal32Type, Float64Type, Float64Type, _>(
+                    &value,
+                    &base,
+                    |value, base| log_decimal32(value, *scale, base),
+                )?
+            }
+            DataType::Decimal64(_, scale) => {
+                calculate_binary_math::<Decimal64Type, Float64Type, Float64Type, _>(
+                    &value,
+                    &base,
+                    |value, base| log_decimal64(value, *scale, base),
+                )?
+            }
+            DataType::Decimal128(_, scale) => {
+                calculate_binary_math::<Decimal128Type, Float64Type, Float64Type, _>(
+                    &value,
+                    &base,
+                    |value, base| log_decimal128(value, *scale, base),
+                )?
+            }
+            DataType::Decimal256(_, scale) => {
+                calculate_binary_math::<Decimal256Type, Float64Type, Float64Type, _>(
+                    &value,
+                    &base,
+                    |value, base| log_decimal256(value, *scale, base),
+                )?
+            }
             other => {
-                return exec_err!("Unsupported data type {other:?} for function log")
+                return exec_err!("Unsupported data type {other:?} for function log");
             }
         };
 
-        Ok(ColumnarValue::Array(arr))
+        Ok(ColumnarValue::Array(output))
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -189,19 +320,43 @@ impl ScalarUDFImpl for LogFunc {
     fn simplify(
         &self,
         mut args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
+        let mut arg_types = args
+            .iter()
+            .map(|arg| info.get_data_type(arg))
+            .collect::<Result<Vec<_>>>()?;
+        let return_type = self.return_type(&arg_types)?;
+
+        // Null propagation
+        if arg_types.iter().any(|dt| dt.is_null()) {
+            return Ok(ExprSimplifyResult::Simplified(lit(
+                ScalarValue::Null.cast_to(&return_type)?
+            )));
+        }
+
         // Args are either
         // log(number)
         // log(base, number)
         let num_args = args.len();
-        if num_args > 2 {
+        if num_args != 1 && num_args != 2 {
             return plan_err!("Expected log to have 1 or 2 arguments, got {num_args}");
         }
-        let number = args.pop().ok_or_else(|| {
-            plan_datafusion_err!("Expected log to have 1 or 2 arguments, got 0")
-        })?;
-        let number_datatype = info.get_data_type(&number)?;
+
+        match arg_types.last().unwrap() {
+            DataType::Decimal32(_, scale)
+            | DataType::Decimal64(_, scale)
+            | DataType::Decimal128(_, scale)
+            | DataType::Decimal256(_, scale)
+                if *scale < 0 =>
+            {
+                return Ok(ExprSimplifyResult::Original(args));
+            }
+            _ => (),
+        };
+
+        let number = args.pop().unwrap();
+        let number_datatype = arg_types.pop().unwrap();
         // default to base 10
         let base = if let Some(base) = args.pop() {
             base
@@ -210,7 +365,9 @@ impl ScalarUDFImpl for LogFunc {
         };
 
         match number {
-            Expr::Literal(value) if value == ScalarValue::new_one(&number_datatype)? => {
+            Expr::Literal(value, _)
+                if value == ScalarValue::new_one(&number_datatype)? =>
+            {
                 Ok(ExprSimplifyResult::Simplified(lit(ScalarValue::new_zero(
                     &info.get_data_type(&base)?,
                 )?)))
@@ -233,7 +390,7 @@ impl ScalarUDFImpl for LogFunc {
                         _ => {
                             return internal_err!(
                                 "Unexpected number of arguments in log::simplify"
-                            )
+                            );
                         }
                     };
                     Ok(ExprSimplifyResult::Original(args))
@@ -250,49 +407,64 @@ fn is_pow(func: &ScalarUDF) -> bool {
 
 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
+    use std::sync::Arc;
 
     use super::*;
 
-    use arrow::array::{Float32Array, Float64Array, Int64Array};
+    use arrow::array::{
+        Date32Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array,
+    };
     use arrow::compute::SortOptions;
-    use arrow::datatypes::Field;
+    use arrow::datatypes::{DECIMAL256_MAX_PRECISION, Field};
     use datafusion_common::cast::{as_float32_array, as_float64_array};
-    use datafusion_common::DFSchema;
-    use datafusion_expr::execution_props::ExecutionProps;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::simplify::SimplifyContext;
 
     #[test]
-    #[should_panic]
+    fn test_log_decimal_native() {
+        let value = 10_i128.pow(35);
+        let expected = (value as f64).log2();
+        let actual = log_decimal128(value, 0, 2.0).unwrap();
+        assert!((actual - expected).abs() < 1e-10);
+    }
+
+    #[test]
     fn test_log_invalid_base_type() {
         let arg_fields = vec![
-            Field::new("a", DataType::Float64, false).into(),
-            Field::new("a", DataType::Int64, false).into(),
+            Field::new("b", DataType::Date32, false).into(),
+            Field::new("n", DataType::Float64, false).into(),
         ];
         let args = ScalarFunctionArgs {
             args: vec![
+                ColumnarValue::Array(Arc::new(Date32Array::from(vec![5, 10, 15, 20]))), // base
                 ColumnarValue::Array(Arc::new(Float64Array::from(vec![
                     10.0, 100.0, 1000.0, 10000.0,
                 ]))), // num
-                ColumnarValue::Array(Arc::new(Int64Array::from(vec![5, 10, 15, 20]))),
             ],
             arg_fields,
             number_rows: 4,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
-        let _ = LogFunc::new().invoke_with_args(args);
+        let result = LogFunc::new().invoke_with_args(args);
+        assert!(result.is_err());
+        assert_eq!(
+            result.unwrap_err().to_string().lines().next().unwrap(),
+            "Arrow error: Cast error: Casting from Date32 to Float64 not supported"
+        );
     }
 
     #[test]
     fn test_log_invalid_value() {
-        let arg_field = Field::new("a", DataType::Int64, false).into();
+        let arg_field = Field::new("a", DataType::Date32, false).into();
         let args = ScalarFunctionArgs {
             args: vec![
-                ColumnarValue::Array(Arc::new(Int64Array::from(vec![10]))), // num
+                ColumnarValue::Array(Arc::new(Date32Array::from(vec![10]))), // num
             ],
             arg_fields: vec![arg_field],
             number_rows: 1,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = LogFunc::new().invoke_with_args(args);
@@ -309,6 +481,7 @@ mod tests {
             arg_fields: vec![arg_field],
             number_rows: 1,
             return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -338,6 +511,7 @@ mod tests {
             arg_fields: vec![arg_field],
             number_rows: 1,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -365,12 +539,13 @@ mod tests {
         ];
         let args = ScalarFunctionArgs {
             args: vec![
-                ColumnarValue::Scalar(ScalarValue::Float32(Some(2.0))), // num
+                ColumnarValue::Scalar(ScalarValue::Float32(Some(2.0))), // base
                 ColumnarValue::Scalar(ScalarValue::Float32(Some(32.0))), // num
             ],
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -398,12 +573,13 @@ mod tests {
         ];
         let args = ScalarFunctionArgs {
             args: vec![
-                ColumnarValue::Scalar(ScalarValue::Float64(Some(2.0))), // num
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(2.0))), // base
                 ColumnarValue::Scalar(ScalarValue::Float64(Some(64.0))), // num
             ],
             arg_fields,
             number_rows: 1,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -435,6 +611,7 @@ mod tests {
             arg_fields: vec![arg_field],
             number_rows: 4,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -469,6 +646,7 @@ mod tests {
             arg_fields: vec![arg_field],
             number_rows: 4,
             return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -500,15 +678,16 @@ mod tests {
         let args = ScalarFunctionArgs {
             args: vec![
                 ColumnarValue::Array(Arc::new(Float64Array::from(vec![
-                    2.0, 2.0, 3.0, 5.0,
+                    2.0, 2.0, 3.0, 5.0, 5.0,
                 ]))), // base
                 ColumnarValue::Array(Arc::new(Float64Array::from(vec![
-                    8.0, 4.0, 81.0, 625.0,
+                    8.0, 4.0, 81.0, 625.0, -123.0,
                 ]))), // num
             ],
             arg_fields,
-            number_rows: 4,
+            number_rows: 5,
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -519,11 +698,12 @@ mod tests {
                 let floats = as_float64_array(&arr)
                     .expect("failed to convert result to a Float64Array");
 
-                assert_eq!(floats.len(), 4);
+                assert_eq!(floats.len(), 5);
                 assert!((floats.value(0) - 3.0).abs() < 1e-10);
                 assert!((floats.value(1) - 2.0).abs() < 1e-10);
                 assert!((floats.value(2) - 4.0).abs() < 1e-10);
                 assert!((floats.value(3) - 4.0).abs() < 1e-10);
+                assert!(floats.value(4).is_nan());
             }
             ColumnarValue::Scalar(_) => {
                 panic!("Expected an array value")
@@ -549,6 +729,7 @@ mod tests {
             arg_fields,
             number_rows: 4,
             return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = LogFunc::new()
             .invoke_with_args(args)
@@ -573,10 +754,7 @@ mod tests {
     #[test]
     // Test log() simplification errors
     fn test_log_simplify_errors() {
-        let props = ExecutionProps::new();
-        let schema =
-            Arc::new(DFSchema::new_with_metadata(vec![], HashMap::new()).unwrap());
-        let context = SimplifyContext::new(&props).with_schema(schema);
+        let context = SimplifyContext::default();
         // Expect 0 args to error
         let _ = LogFunc::new().simplify(vec![], &context).unwrap_err();
         // Expect 3 args to error
@@ -588,10 +766,7 @@ mod tests {
     #[test]
     // Test that non-simplifiable log() expressions are unchanged after simplification
     fn test_log_simplify_original() {
-        let props = ExecutionProps::new();
-        let schema =
-            Arc::new(DFSchema::new_with_metadata(vec![], HashMap::new()).unwrap());
-        let context = SimplifyContext::new(&props).with_schema(schema);
+        let context = SimplifyContext::default();
         // One argument with no simplifications
         let result = LogFunc::new().simplify(vec![lit(2)], &context).unwrap();
         let ExprSimplifyResult::Original(args) = result else {
@@ -614,7 +789,7 @@ mod tests {
     #[test]
     fn test_log_output_ordering() {
         // [Unordered, Ascending, Descending, Literal]
-        let orders = vec![
+        let orders = [
             ExprProperties::new_unknown(),
             ExprProperties::new_unknown().with_order(SortProperties::Ordered(
                 SortOptions {
@@ -635,7 +810,7 @@ mod tests {
 
         // Test log(num)
         for order in orders.iter().cloned() {
-            let result = log.output_ordering(&[order.clone()]).unwrap();
+            let result = log.output_ordering(std::slice::from_ref(&order)).unwrap();
             assert_eq!(result, order.sort_properties);
         }
 
@@ -649,7 +824,7 @@ mod tests {
                 results.push(result);
             }
         }
-        let expected = vec![
+        let expected = [
             // base: Unordered
             SortProperties::Unordered,
             SortProperties::Unordered,
@@ -718,4 +893,317 @@ mod tests {
             SortProperties::Unordered
         );
     }
+
+    #[test]
+    fn test_log_scalar_decimal128_unary() {
+        let arg_field = Field::new("a", DataType::Decimal128(38, 0), false).into();
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Decimal128(Some(10), 38, 0)), // num
+            ],
+            arg_fields: vec![arg_field],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Decimal128(38, 0), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function log");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Decimal128Array");
+                assert_eq!(floats.len(), 1);
+                assert!((floats.value(0) - 1.0).abs() < 1e-10);
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_scalar_decimal128() {
+        let arg_fields = vec![
+            Field::new("b", DataType::Float64, false).into(),
+            Field::new("x", DataType::Decimal128(38, 0), false).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(2.0))), // base
+                ColumnarValue::Scalar(ScalarValue::Decimal128(Some(64), 38, 0)), // num
+            ],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function log");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+
+                assert_eq!(floats.len(), 1);
+                assert!((floats.value(0) - 6.0).abs() < 1e-10);
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal128_unary() {
+        let arg_field = Field::new("a", DataType::Decimal128(38, 0), false).into();
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(
+                    Decimal128Array::from(vec![10, 100, 1000, 10000, 12600, -123])
+                        .with_precision_and_scale(38, 0)
+                        .unwrap(),
+                )), // num
+            ],
+            arg_fields: vec![arg_field],
+            number_rows: 6,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function log");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+
+                assert_eq!(floats.len(), 6);
+                assert!((floats.value(0) - 1.0).abs() < 1e-10);
+                assert!((floats.value(1) - 2.0).abs() < 1e-10);
+                assert!((floats.value(2) - 3.0).abs() < 1e-10);
+                assert!((floats.value(3) - 4.0).abs() < 1e-10);
+                let expected = 12600_f64.log(10.0);
+                assert!((floats.value(4) - expected).abs() < 1e-10);
+                assert!(floats.value(5).is_nan());
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal128_base_decimal() {
+        // Base stays 2 despite scaling
+        for base in [
+            ScalarValue::Decimal128(Some(i128::from(2)), 38, 0),
+            ScalarValue::Decimal128(Some(i128::from(2000)), 38, 3),
+        ] {
+            let arg_fields = vec![
+                Field::new("b", DataType::Decimal128(38, 0), false).into(),
+                Field::new("x", DataType::Decimal128(38, 0), false).into(),
+            ];
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Scalar(base), // base
+                    ColumnarValue::Scalar(ScalarValue::Decimal128(Some(64), 38, 0)), // num
+                ],
+                arg_fields,
+                number_rows: 1,
+                return_field: Field::new("f", DataType::Float64, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            };
+            let result = LogFunc::new()
+                .invoke_with_args(args)
+                .expect("failed to initialize function log");
+
+            match result {
+                ColumnarValue::Array(arr) => {
+                    let floats = as_float64_array(&arr)
+                        .expect("failed to convert result to a Float64Array");
+
+                    assert_eq!(floats.len(), 1);
+                    assert!((floats.value(0) - 6.0).abs() < 1e-10);
+                }
+                ColumnarValue::Scalar(_) => {
+                    panic!("Expected an array value")
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal128_value_scale() {
+        // Value stays 1000 despite scaling
+        for value in [
+            ScalarValue::Decimal128(Some(i128::from(1000)), 38, 0),
+            ScalarValue::Decimal128(Some(i128::from(10000)), 38, 1),
+            ScalarValue::Decimal128(Some(i128::from(1000000)), 38, 3),
+        ] {
+            let arg_fields = vec![
+                Field::new("b", DataType::Decimal128(38, 0), false).into(),
+                Field::new("x", DataType::Decimal128(38, 0), false).into(),
+            ];
+            let args = ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Scalar(value), // base
+                ],
+                arg_fields,
+                number_rows: 1,
+                return_field: Field::new("f", DataType::Float64, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            };
+            let result = LogFunc::new()
+                .invoke_with_args(args)
+                .expect("failed to initialize function log");
+
+            match result {
+                ColumnarValue::Array(arr) => {
+                    let floats = as_float64_array(&arr)
+                        .expect("failed to convert result to a Float64Array");
+
+                    assert_eq!(floats.len(), 1);
+                    assert!((floats.value(0) - 3.0).abs() < 1e-10);
+                }
+                ColumnarValue::Scalar(_) => {
+                    panic!("Expected an array value")
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal256_unary() {
+        let arg_field = Field::new(
+            "a",
+            DataType::Decimal256(DECIMAL256_MAX_PRECISION, 0),
+            false,
+        )
+        .into();
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(
+                    Decimal256Array::from(vec![
+                        Some(i256::from(10)),
+                        Some(i256::from(100)),
+                        Some(i256::from(1000)),
+                        Some(i256::from(10000)),
+                        Some(i256::from(12600)),
+                        // Slightly lower than i128 max - can calculate
+                        Some(i256::from_i128(i128::MAX) - i256::from(1000)),
+                        // Give NaN for incorrect inputs, as in f64::log
+                        Some(i256::from(-123)),
+                    ])
+                    .with_precision_and_scale(DECIMAL256_MAX_PRECISION, 0)
+                    .unwrap(),
+                )), // num
+            ],
+            arg_fields: vec![arg_field],
+            number_rows: 7,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("failed to initialize function log");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+
+                assert_eq!(floats.len(), 7);
+                assert!((floats.value(0) - 1.0).abs() < 1e-10);
+                assert!((floats.value(1) - 2.0).abs() < 1e-10);
+                assert!((floats.value(2) - 3.0).abs() < 1e-10);
+                assert!((floats.value(3) - 4.0).abs() < 1e-10);
+                let expected = 12600_f64.log(10.0);
+                assert!((floats.value(4) - expected).abs() < 1e-10);
+                let expected = ((i128::MAX - 1000) as f64).log(10.0);
+                assert!((floats.value(5) - expected).abs() < 1e-10);
+                assert!(floats.value(6).is_nan());
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal128_invalid_base() {
+        // Invalid base (-2.0) should return NaN, matching f64::log behavior
+        let arg_fields = vec![
+            Field::new("b", DataType::Float64, false).into(),
+            Field::new("x", DataType::Decimal128(38, 0), false).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(-2.0))), // base
+                ColumnarValue::Scalar(ScalarValue::Decimal128(Some(64), 38, 0)), // num
+            ],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("should not error on invalid base");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+                assert_eq!(floats.len(), 1);
+                assert!(floats.value(0).is_nan());
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
+
+    #[test]
+    fn test_log_decimal256_large() {
+        // Large Decimal256 values that don't fit in i128 now use f64 fallback
+        let arg_field = Field::new("a", DataType::Decimal256(38, 0), false).into();
+        let args = ScalarFunctionArgs {
+            args: vec![
+                ColumnarValue::Array(Arc::new(Decimal256Array::from(vec![
+                    // Slightly larger than i128
+                    Some(i256::from_i128(i128::MAX) + i256::from(1000)),
+                ]))), // num
+            ],
+            arg_fields: vec![arg_field],
+            number_rows: 1,
+            return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let result = LogFunc::new()
+            .invoke_with_args(args)
+            .expect("should handle large Decimal256 via f64 fallback");
+
+        match result {
+            ColumnarValue::Array(arr) => {
+                let floats = as_float64_array(&arr)
+                    .expect("failed to convert result to a Float64Array");
+                assert_eq!(floats.len(), 1);
+                // The f64 fallback may lose some precision for very large numbers,
+                // but we verify we get a reasonable positive result (not NaN/infinity)
+                let log_result = floats.value(0);
+                assert!(
+                    log_result.is_finite() && log_result > 0.0,
+                    "Expected positive finite log result, got {log_result}"
+                );
+            }
+            ColumnarValue::Scalar(_) => {
+                panic!("Expected an array value")
+            }
+        }
+    }
 }
diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs
index 4eb337a30110e..610e773d68fd0 100644
--- a/datafusion/functions/src/math/mod.rs
+++ b/datafusion/functions/src/math/mod.rs
@@ -23,8 +23,11 @@ use std::sync::Arc;
 
 pub mod abs;
 pub mod bounds;
+pub mod ceil;
 pub mod cot;
+mod decimal;
 pub mod factorial;
+pub mod floor;
 pub mod gcd;
 pub mod iszero;
 pub mod lcm;
@@ -104,14 +107,7 @@ make_math_unary_udf!(
     super::bounds::unbounded_bounds,
     super::get_cbrt_doc
 );
-make_math_unary_udf!(
-    CeilFunc,
-    ceil,
-    ceil,
-    super::ceil_order,
-    super::bounds::unbounded_bounds,
-    super::get_ceil_doc
-);
+make_udf_function!(ceil::CeilFunc, ceil);
 make_math_unary_udf!(
     CosFunc,
     cos,
@@ -146,14 +142,7 @@ make_math_unary_udf!(
     super::get_exp_doc
 );
 make_udf_function!(factorial::FactorialFunc, factorial);
-make_math_unary_udf!(
-    FloorFunc,
-    floor,
-    floor,
-    super::floor_order,
-    super::bounds::unbounded_bounds,
-    super::get_floor_doc
-);
+make_udf_function!(floor::FloorFunc, floor);
 make_udf_function!(log::LogFunc, log);
 make_udf_function!(gcd::GcdFunc, gcd);
 make_udf_function!(nans::IsNanFunc, isnan);
diff --git a/datafusion/functions/src/math/monotonicity.rs b/datafusion/functions/src/math/monotonicity.rs
index baa3147f6258d..f672ff9804546 100644
--- a/datafusion/functions/src/math/monotonicity.rs
+++ b/datafusion/functions/src/math/monotonicity.rs
@@ -17,11 +17,11 @@
 
 use std::sync::LazyLock;
 
-use datafusion_common::{exec_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_doc::scalar_doc_sections::DOC_SECTION_MATH;
+use datafusion_expr::Documentation;
 use datafusion_expr::interval_arithmetic::Interval;
-use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
-use datafusion_expr::Documentation;
 
 /// Non-increasing on the interval \[−1, 1\], undefined otherwise.
 pub fn acos_order(input: &[ExprProperties]) -> Result<SortProperties> {
@@ -31,7 +31,7 @@ pub fn acos_order(input: &[ExprProperties]) -> Result<SortProperties> {
     let valid_domain =
         Interval::make_symmetric_unit_interval(&range.lower().data_type())?;
 
-    if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE {
+    if valid_domain.contains(range)? == Interval::TRUE {
         Ok(-arg.sort_properties)
     } else {
         exec_err!("Input range of ACOS contains out-of-domain values")
@@ -45,6 +45,16 @@ static DOCUMENTATION_ACOS: LazyLock<Documentation> = LazyLock::new(|| {
         "acos(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT acos(1);
++----------+
+| acos(1)  |
++----------+
+| 0.0      |
++----------+
+```"#,
+    )
     .build()
 });
 
@@ -62,22 +72,31 @@ pub fn acosh_order(input: &[ExprProperties]) -> Result<SortProperties> {
         ScalarValue::try_from(&range.upper().data_type())?,
     )?;
 
-    if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE {
+    if valid_domain.contains(range)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of ACOSH contains out-of-domain values")
     }
 }
 
-static DOCUMENTATION_ACOSH: LazyLock<Documentation> = LazyLock::new(|| {
-    Documentation::builder(
+static DOCUMENTATION_ACOSH: LazyLock<Documentation> =
+    LazyLock::new(|| {
+        Documentation::builder(
         DOC_SECTION_MATH,
         "Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number.",
         "acosh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(r#"```sql
+> SELECT acosh(2);
++------------+
+| acosh(2)   |
++------------+
+| 1.31696    |
++------------+
+```"#)
     .build()
-});
+    });
 
 pub fn get_acosh_doc() -> &'static Documentation {
     &DOCUMENTATION_ACOSH
@@ -91,7 +110,7 @@ pub fn asin_order(input: &[ExprProperties]) -> Result<SortProperties> {
     let valid_domain =
         Interval::make_symmetric_unit_interval(&range.lower().data_type())?;
 
-    if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE {
+    if valid_domain.contains(range)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of ASIN contains out-of-domain values")
@@ -105,6 +124,16 @@ static DOCUMENTATION_ASIN: LazyLock<Documentation> = LazyLock::new(|| {
         "asin(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT asin(0.5);
++------------+
+| asin(0.5)  |
++------------+
+| 0.5235988  |
++------------+
+```"#,
+    )
     .build()
 });
 
@@ -124,6 +153,16 @@ static DOCUMENTATION_ASINH: LazyLock<Documentation> = LazyLock::new(|| {
         "asinh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#" ```sql 
+> SELECT asinh(1);
++------------+
+| asinh(1)   |
++------------+
+| 0.8813736  |
++------------+
+```"#,
+    )
     .build()
 });
 
@@ -143,6 +182,16 @@ static DOCUMENTATION_ATAN: LazyLock<Documentation> = LazyLock::new(|| {
         "atan(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+    > SELECT atan(1);
++-----------+
+| atan(1)   |
++-----------+
+| 0.7853982 |
++-----------+
+```"#,
+    )
     .build()
 });
 
@@ -158,22 +207,31 @@ pub fn atanh_order(input: &[ExprProperties]) -> Result<SortProperties> {
     let valid_domain =
         Interval::make_symmetric_unit_interval(&range.lower().data_type())?;
 
-    if valid_domain.contains(range)? == Interval::CERTAINLY_TRUE {
+    if valid_domain.contains(range)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of ATANH contains out-of-domain values")
     }
 }
 
-static DOCUMENTATION_ATANH: LazyLock<Documentation> = LazyLock::new(|| {
-    Documentation::builder(
+static DOCUMENTATION_ATANH: LazyLock<Documentation> =
+    LazyLock::new(|| {
+        Documentation::builder(
         DOC_SECTION_MATH,
         "Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number.",
         "atanh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(r#"```sql
+    > SELECT atanh(0.5);
++-------------+
+| atanh(0.5)  |
++-------------+
+| 0.5493061   |
++-------------+
+```"#)
     .build()
-});
+    });
 
 pub fn get_atanh_doc() -> &'static Documentation {
     &DOCUMENTATION_ATANH
@@ -185,8 +243,9 @@ pub fn atan2_order(_input: &[ExprProperties]) -> Result<SortProperties> {
     Ok(SortProperties::Unordered)
 }
 
-static DOCUMENTATION_ATANH2: LazyLock<Documentation> = LazyLock::new(|| {
-    Documentation::builder(
+static DOCUMENTATION_ATANH2: LazyLock<Documentation> =
+    LazyLock::new(|| {
+        Documentation::builder(
         DOC_SECTION_MATH,
         "Returns the arc tangent or inverse tangent of `expression_y / expression_x`.",
         "atan2(expression_y, expression_x)",
@@ -201,8 +260,16 @@ Can be a constant, column, or function, and any combination of arithmetic operat
         r#"Second numeric expression to operate on.
 Can be a constant, column, or function, and any combination of arithmetic operators."#,
     )
+    .with_sql_example(r#"```sql
+> SELECT atan2(1, 1);
++------------+
+| atan2(1,1) |
++------------+
+| 0.7853982  |
++------------+
+```"#)
     .build()
-});
+    });
 
 pub fn get_atan2_doc() -> &'static Documentation {
     &DOCUMENTATION_ATANH2
@@ -220,6 +287,16 @@ static DOCUMENTATION_CBRT: LazyLock<Documentation> = LazyLock::new(|| {
         "cbrt(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT cbrt(27);
++-----------+
+| cbrt(27)  |
++-----------+
+| 3.0       |
++-----------+
+```"#,
+    )
     .build()
 });
 
@@ -232,20 +309,6 @@ pub fn ceil_order(input: &[ExprProperties]) -> Result<SortProperties> {
     Ok(input[0].sort_properties)
 }
 
-static DOCUMENTATION_CEIL: LazyLock<Documentation> = LazyLock::new(|| {
-    Documentation::builder(
-        DOC_SECTION_MATH,
-        "Returns the nearest integer greater than or equal to a number.",
-        "ceil(numeric_expression)",
-    )
-    .with_standard_argument("numeric_expression", Some("Numeric"))
-    .build()
-});
-
-pub fn get_ceil_doc() -> &'static Documentation {
-    &DOCUMENTATION_CEIL
-}
-
 /// Non-increasing on \[0, π\] and then non-decreasing on \[π, 2π\].
 /// This pattern repeats periodically with a period of 2π.
 // TODO: Implement ordering rule of the ATAN2 function.
@@ -260,6 +323,16 @@ static DOCUMENTATION_COS: LazyLock<Documentation> = LazyLock::new(|| {
         "cos(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT cos(0);
++--------+
+| cos(0) |
++--------+
+| 1.0    |
++--------+
+```"#,
+    )
     .build()
 });
 
@@ -274,9 +347,9 @@ pub fn cosh_order(input: &[ExprProperties]) -> Result<SortProperties> {
 
     let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-    if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    if range.gt_eq(&zero_point)? == Interval::TRUE {
         Ok(arg.sort_properties)
-    } else if range.lt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    } else if range.lt_eq(&zero_point)? == Interval::TRUE {
         Ok(-arg.sort_properties)
     } else {
         Ok(SortProperties::Unordered)
@@ -290,6 +363,16 @@ static DOCUMENTATION_COSH: LazyLock<Documentation> = LazyLock::new(|| {
         "cosh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT cosh(1);
++-----------+
+| cosh(1)   |
++-----------+
+| 1.5430806 |
++-----------+
+```"#,
+    )
     .build()
 });
 
@@ -309,6 +392,16 @@ static DOCUMENTATION_DEGREES: LazyLock<Documentation> = LazyLock::new(|| {
         "degrees(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+    > SELECT degrees(pi());
++------------+
+| degrees(0) |
++------------+
+| 180.0      |
++------------+
+```"#,
+    )
     .build()
 });
 
@@ -328,6 +421,16 @@ static DOCUMENTATION_EXP: LazyLock<Documentation> = LazyLock::new(|| {
         "exp(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT exp(1);
++---------+
+| exp(1)  |
++---------+
+| 2.71828 |
++---------+
+```"#,
+    )
     .build()
 });
 
@@ -340,20 +443,6 @@ pub fn floor_order(input: &[ExprProperties]) -> Result<SortProperties> {
     Ok(input[0].sort_properties)
 }
 
-static DOCUMENTATION_FLOOR: LazyLock<Documentation> = LazyLock::new(|| {
-    Documentation::builder(
-        DOC_SECTION_MATH,
-        "Returns the nearest integer less than or equal to a number.",
-        "floor(numeric_expression)",
-    )
-    .with_standard_argument("numeric_expression", Some("Numeric"))
-    .build()
-});
-
-pub fn get_floor_doc() -> &'static Documentation {
-    &DOCUMENTATION_FLOOR
-}
-
 /// Non-decreasing for x ≥ 0, undefined otherwise.
 pub fn ln_order(input: &[ExprProperties]) -> Result<SortProperties> {
     let arg = &input[0];
@@ -361,7 +450,7 @@ pub fn ln_order(input: &[ExprProperties]) -> Result<SortProperties> {
 
     let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-    if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    if range.gt_eq(&zero_point)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of LN contains out-of-domain values")
@@ -375,6 +464,16 @@ static DOCUMENTATION_LN: LazyLock<Documentation> = LazyLock::new(|| {
         "ln(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT ln(2.71828);
++-------------+
+| ln(2.71828) |
++-------------+
+| 1.0         |
++-------------+
+```"#,
+    )
     .build()
 });
 
@@ -389,7 +488,7 @@ pub fn log2_order(input: &[ExprProperties]) -> Result<SortProperties> {
 
     let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-    if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    if range.gt_eq(&zero_point)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of LOG2 contains out-of-domain values")
@@ -403,6 +502,16 @@ static DOCUMENTATION_LOG2: LazyLock<Documentation> = LazyLock::new(|| {
         "log2(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT log2(8);
++-----------+
+| log2(8)   |
++-----------+
+| 3.0       |
++-----------+
+```"#,
+    )
     .build()
 });
 
@@ -417,7 +526,7 @@ pub fn log10_order(input: &[ExprProperties]) -> Result<SortProperties> {
 
     let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-    if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    if range.gt_eq(&zero_point)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of LOG10 contains out-of-domain values")
@@ -431,6 +540,16 @@ static DOCUMENTATION_LOG10: LazyLock<Documentation> = LazyLock::new(|| {
         "log10(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT log10(100);
++-------------+
+| log10(100)  |
++-------------+
+| 2.0         |
++-------------+
+```"#,
+    )
     .build()
 });
 
@@ -443,18 +562,28 @@ pub fn radians_order(input: &[ExprProperties]) -> Result<SortProperties> {
     Ok(input[0].sort_properties)
 }
 
-static DOCUMENTATION_RADIONS: LazyLock<Documentation> = LazyLock::new(|| {
+static DOCUMENTATION_RADIANS: LazyLock<Documentation> = LazyLock::new(|| {
     Documentation::builder(
         DOC_SECTION_MATH,
         "Converts degrees to radians.",
         "radians(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT radians(180);
++----------------+
+| radians(180)   |
++----------------+
+| 3.14159265359  |
++----------------+
+```"#,
+    )
     .build()
 });
 
 pub fn get_radians_doc() -> &'static Documentation {
-    &DOCUMENTATION_RADIONS
+    &DOCUMENTATION_RADIANS
 }
 
 /// Non-decreasing on \[0, π\] and then non-increasing on \[π, 2π\].
@@ -471,6 +600,16 @@ static DOCUMENTATION_SIN: LazyLock<Documentation> = LazyLock::new(|| {
         "sin(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT sin(0);
++----------+
+| sin(0)   |
++----------+
+| 0.0      |
++----------+
+```"#,
+    )
     .build()
 });
 
@@ -490,6 +629,16 @@ static DOCUMENTATION_SINH: LazyLock<Documentation> = LazyLock::new(|| {
         "sinh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT sinh(1);
++-----------+
+| sinh(1)   |
++-----------+
+| 1.1752012 |
++-----------+
+```"#,
+    )
     .build()
 });
 
@@ -504,7 +653,7 @@ pub fn sqrt_order(input: &[ExprProperties]) -> Result<SortProperties> {
 
     let zero_point = Interval::make_zero(&range.lower().data_type())?;
 
-    if range.gt_eq(&zero_point)? == Interval::CERTAINLY_TRUE {
+    if range.gt_eq(&zero_point)? == Interval::TRUE {
         Ok(arg.sort_properties)
     } else {
         exec_err!("Input range of SQRT contains out-of-domain values")
@@ -539,6 +688,16 @@ static DOCUMENTATION_TAN: LazyLock<Documentation> = LazyLock::new(|| {
         "tan(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+> SELECT tan(pi()/4);
++--------------+
+| tan(PI()/4)  |
++--------------+
+| 1.0          |
++--------------+
+```"#,
+    )
     .build()
 });
 
@@ -558,6 +717,16 @@ static DOCUMENTATION_TANH: LazyLock<Documentation> = LazyLock::new(|| {
         "tanh(numeric_expression)",
     )
     .with_standard_argument("numeric_expression", Some("Numeric"))
+    .with_sql_example(
+        r#"```sql
+  > SELECT tanh(20);
+  +----------+
+  | tanh(20) |
+  +----------+
+  | 1.0      |
+  +----------+
+  ```"#,
+    )
     .build()
 });
 
diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs
index 34a5c2a1c16bb..632eafe1e009a 100644
--- a/datafusion/functions/src/math/nans.rs
+++ b/datafusion/functions/src/math/nans.rs
@@ -17,12 +17,21 @@
 
 //! Math function: `isnan()`.
 
-use arrow::datatypes::{DataType, Float32Type, Float64Type};
-use datafusion_common::{exec_err, Result};
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, TypeSignature};
-
 use arrow::array::{ArrayRef, AsArray, BooleanArray};
-use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
+use arrow::datatypes::DataType::{
+    Decimal32, Decimal64, Decimal128, Decimal256, Float16, Float32, Float64, Int8, Int16,
+    Int32, Int64, Null, UInt8, UInt16, UInt32, UInt64,
+};
+use arrow::datatypes::{
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, Float16Type,
+    Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type,
+    UInt16Type, UInt32Type, UInt64Type,
+};
+use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
 use datafusion_macros::user_doc;
 use std::any::Any;
 use std::sync::Arc;
@@ -31,9 +40,17 @@ use std::sync::Arc;
     doc_section(label = "Math Functions"),
     description = "Returns true if a given number is +NaN or -NaN otherwise returns false.",
     syntax_example = "isnan(numeric_expression)",
+    sql_example = r#"```sql
+> SELECT isnan(1);
++----------+
+| isnan(1) |
++----------+
+| false    |
++----------+
+```"#,
     standard_argument(name = "numeric_expression", prefix = "Numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct IsNanFunc {
     signature: Signature,
 }
@@ -46,15 +63,10 @@ impl Default for IsNanFunc {
 
 impl IsNanFunc {
     pub fn new() -> Self {
-        use DataType::*;
+        // Accept any numeric type (ints, uints, floats, decimals) without implicit casts.
+        let numeric = Coercion::new_exact(TypeSignatureClass::Numeric);
         Self {
-            signature: Signature::one_of(
-                vec![
-                    TypeSignature::Exact(vec![Float32]),
-                    TypeSignature::Exact(vec![Float64]),
-                ],
-                Volatility::Immutable,
-            ),
+            signature: Signature::coercible(vec![numeric], Volatility::Immutable),
         }
     }
 }
@@ -76,26 +88,123 @@ impl ScalarUDFImpl for IsNanFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let args = ColumnarValue::values_to_arrays(&args.args)?;
-
-        let arr: ArrayRef = match args[0].data_type() {
-            DataType::Float64 => Arc::new(BooleanArray::from_unary(
-                args[0].as_primitive::<Float64Type>(),
-                f64::is_nan,
-            )) as ArrayRef,
-
-            DataType::Float32 => Arc::new(BooleanArray::from_unary(
-                args[0].as_primitive::<Float32Type>(),
-                f32::is_nan,
-            )) as ArrayRef,
-            other => {
-                return exec_err!(
-                    "Unsupported data type {other:?} for function {}",
-                    self.name()
-                )
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+                }
+
+                let result = match scalar {
+                    ScalarValue::Float64(Some(v)) => Some(v.is_nan()),
+                    ScalarValue::Float32(Some(v)) => Some(v.is_nan()),
+                    ScalarValue::Float16(Some(v)) => Some(v.is_nan()),
+
+                    // Non-float numeric inputs are never NaN
+                    ScalarValue::Int8(_)
+                    | ScalarValue::Int16(_)
+                    | ScalarValue::Int32(_)
+                    | ScalarValue::Int64(_)
+                    | ScalarValue::UInt8(_)
+                    | ScalarValue::UInt16(_)
+                    | ScalarValue::UInt32(_)
+                    | ScalarValue::UInt64(_)
+                    | ScalarValue::Decimal32(_, _, _)
+                    | ScalarValue::Decimal64(_, _, _)
+                    | ScalarValue::Decimal128(_, _, _)
+                    | ScalarValue::Decimal256(_, _, _) => Some(false),
+
+                    other => {
+                        return exec_err!(
+                            "Unsupported data type {other:?} for function {}",
+                            self.name()
+                        );
+                    }
+                };
+
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(result)))
             }
-        };
-        Ok(ColumnarValue::Array(arr))
+            ColumnarValue::Array(array) => {
+                // NOTE: BooleanArray::from_unary preserves nulls.
+                let arr: ArrayRef = match array.data_type() {
+                    Null => Arc::new(BooleanArray::new_null(array.len())) as ArrayRef,
+
+                    Float64 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Float64Type>(),
+                        f64::is_nan,
+                    )) as ArrayRef,
+                    Float32 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Float32Type>(),
+                        f32::is_nan,
+                    )) as ArrayRef,
+                    Float16 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Float16Type>(),
+                        |x| x.is_nan(),
+                    )) as ArrayRef,
+
+                    // Non-float numeric arrays are never NaN
+                    Decimal32(_, _) => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal32Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Decimal64(_, _) => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal64Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Decimal128(_, _) => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal128Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Decimal256(_, _) => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Decimal256Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+
+                    Int8 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Int8Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Int16 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Int16Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Int32 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Int32Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    Int64 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<Int64Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    UInt8 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<UInt8Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    UInt16 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<UInt16Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    UInt32 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<UInt32Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+                    UInt64 => Arc::new(BooleanArray::from_unary(
+                        array.as_primitive::<UInt64Type>(),
+                        |_| false,
+                    )) as ArrayRef,
+
+                    other => {
+                        return exec_err!(
+                            "Unsupported data type {other:?} for function {}",
+                            self.name()
+                        );
+                    }
+                };
+
+                Ok(ColumnarValue::Array(arr))
+            }
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
diff --git a/datafusion/functions/src/math/nanvl.rs b/datafusion/functions/src/math/nanvl.rs
index 9effb82896ee0..2bdc3fbbc64ac 100644
--- a/datafusion/functions/src/math/nanvl.rs
+++ b/datafusion/functions/src/math/nanvl.rs
@@ -18,12 +18,10 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use crate::utils::make_scalar_function;
-
-use arrow::array::{ArrayRef, AsArray, Float32Array, Float64Array};
-use arrow::datatypes::DataType::{Float32, Float64};
-use arrow::datatypes::{DataType, Float32Type, Float64Type};
-use datafusion_common::{exec_err, DataFusionError, Result};
+use arrow::array::{ArrayRef, AsArray, Float16Array, Float32Array, Float64Array};
+use arrow::datatypes::DataType::{Float16, Float32, Float64};
+use arrow::datatypes::{DataType, Float16Type, Float32Type, Float64Type};
+use datafusion_common::{Result, ScalarValue, exec_err, utils::take_function_args};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -36,6 +34,14 @@ use datafusion_macros::user_doc;
     description = r#"Returns the first argument if it's not _NaN_.
 Returns the second argument otherwise."#,
     syntax_example = "nanvl(expression_x, expression_y)",
+    sql_example = r#"```sql
+> SELECT nanvl(0, 5);
++------------+
+| nanvl(0,5) |
++------------+
+| 0          |
++------------+
+```"#,
     argument(
         name = "expression_x",
         description = "Numeric expression to return if it's not _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators."
@@ -45,7 +51,7 @@ Returns the second argument otherwise."#,
         description = "Numeric expression to return if the first expression is _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct NanvlFunc {
     signature: Signature,
 }
@@ -58,10 +64,13 @@ impl Default for NanvlFunc {
 
 impl NanvlFunc {
     pub fn new() -> Self {
-        use DataType::*;
         Self {
             signature: Signature::one_of(
-                vec![Exact(vec![Float32, Float32]), Exact(vec![Float64, Float64])],
+                vec![
+                    Exact(vec![Float16, Float16]),
+                    Exact(vec![Float32, Float32]),
+                    Exact(vec![Float64, Float64]),
+                ],
                 Volatility::Immutable,
             ),
         }
@@ -83,13 +92,31 @@ impl ScalarUDFImpl for NanvlFunc {
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         match &arg_types[0] {
+            Float16 => Ok(Float16),
             Float32 => Ok(Float32),
             _ => Ok(Float64),
         }
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(nanvl, vec![])(&args.args)
+        let [x, y] = take_function_args(self.name(), args.args)?;
+
+        match (x, y) {
+            (ColumnarValue::Scalar(ScalarValue::Float16(Some(v))), y) if v.is_nan() => {
+                Ok(y)
+            }
+            (ColumnarValue::Scalar(ScalarValue::Float32(Some(v))), y) if v.is_nan() => {
+                Ok(y)
+            }
+            (ColumnarValue::Scalar(ScalarValue::Float64(Some(v))), y) if v.is_nan() => {
+                Ok(y)
+            }
+            (x @ ColumnarValue::Scalar(_), _) => Ok(x),
+            (x, y) => {
+                let args = ColumnarValue::values_to_arrays(&[x, y])?;
+                Ok(ColumnarValue::Array(nanvl(&args)?))
+            }
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -98,37 +125,49 @@ impl ScalarUDFImpl for NanvlFunc {
 }
 
 /// Nanvl SQL function
+///
+/// - x is NaN -> output is y (which may itself be NULL)
+/// - otherwise -> output is x (which may itself be NULL)
 fn nanvl(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         Float64 => {
-            let compute_nanvl = |x: f64, y: f64| {
-                if x.is_nan() {
-                    y
-                } else {
-                    x
-                }
-            };
-
-            let x = args[0].as_primitive() as &Float64Array;
-            let y = args[1].as_primitive() as &Float64Array;
-            arrow::compute::binary::<_, _, _, Float64Type>(x, y, compute_nanvl)
-                .map(|res| Arc::new(res) as _)
-                .map_err(DataFusionError::from)
+            let x = args[0].as_primitive::<Float64Type>();
+            let y = args[1].as_primitive::<Float64Type>();
+            let result: Float64Array = x
+                .iter()
+                .zip(y.iter())
+                .map(|(x_value, y_value)| match x_value {
+                    Some(x_value) if x_value.is_nan() => y_value,
+                    _ => x_value,
+                })
+                .collect();
+            Ok(Arc::new(result) as ArrayRef)
         }
         Float32 => {
-            let compute_nanvl = |x: f32, y: f32| {
-                if x.is_nan() {
-                    y
-                } else {
-                    x
-                }
-            };
-
-            let x = args[0].as_primitive() as &Float32Array;
-            let y = args[1].as_primitive() as &Float32Array;
-            arrow::compute::binary::<_, _, _, Float32Type>(x, y, compute_nanvl)
-                .map(|res| Arc::new(res) as _)
-                .map_err(DataFusionError::from)
+            let x = args[0].as_primitive::<Float32Type>();
+            let y = args[1].as_primitive::<Float32Type>();
+            let result: Float32Array = x
+                .iter()
+                .zip(y.iter())
+                .map(|(x_value, y_value)| match x_value {
+                    Some(x_value) if x_value.is_nan() => y_value,
+                    _ => x_value,
+                })
+                .collect();
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        Float16 => {
+            let x = args[0].as_primitive::<Float16Type>();
+            let y = args[1].as_primitive::<Float16Type>();
+            let result: Float16Array = x
+                .iter()
+                .zip(y.iter())
+                .map(|(x_value, y_value)| match x_value {
+                    Some(x_value) if x_value.is_nan() => y_value,
+                    _ => x_value,
+                })
+                .collect();
+            Ok(Arc::new(result) as ArrayRef)
         }
         other => exec_err!("Unsupported data type {other:?} for function nanvl"),
     }
@@ -146,8 +185,8 @@ mod test {
     #[test]
     fn test_nanvl_f64() {
         let args: Vec<ArrayRef> = vec![
-            Arc::new(Float64Array::from(vec![1.0, f64::NAN, 3.0, f64::NAN])), // y
-            Arc::new(Float64Array::from(vec![5.0, 6.0, f64::NAN, f64::NAN])), // x
+            Arc::new(Float64Array::from(vec![1.0, f64::NAN, 3.0, f64::NAN])), // x
+            Arc::new(Float64Array::from(vec![5.0, 6.0, f64::NAN, f64::NAN])), // y
         ];
 
         let result = nanvl(&args).expect("failed to initialize function nanvl");
@@ -164,8 +203,8 @@ mod test {
     #[test]
     fn test_nanvl_f32() {
         let args: Vec<ArrayRef> = vec![
-            Arc::new(Float32Array::from(vec![1.0, f32::NAN, 3.0, f32::NAN])), // y
-            Arc::new(Float32Array::from(vec![5.0, 6.0, f32::NAN, f32::NAN])), // x
+            Arc::new(Float32Array::from(vec![1.0, f32::NAN, 3.0, f32::NAN])), // x
+            Arc::new(Float32Array::from(vec![5.0, 6.0, f32::NAN, f32::NAN])), // y
         ];
 
         let result = nanvl(&args).expect("failed to initialize function nanvl");
diff --git a/datafusion/functions/src/math/pi.rs b/datafusion/functions/src/math/pi.rs
index 5339a9b14a283..574928a09705f 100644
--- a/datafusion/functions/src/math/pi.rs
+++ b/datafusion/functions/src/math/pi.rs
@@ -19,7 +19,7 @@ use std::any::Any;
 
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Float64;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, assert_or_internal_err};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -32,7 +32,7 @@ use datafusion_macros::user_doc;
     description = "Returns an approximate value of π.",
     syntax_example = "pi()"
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct PiFunc {
     signature: Signature,
 }
@@ -69,9 +69,11 @@ impl ScalarUDFImpl for PiFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        if !args.args.is_empty() {
-            return internal_err!("{} function does not accept arguments", self.name());
-        }
+        assert_or_internal_err!(
+            args.args.is_empty(),
+            "{} function does not accept arguments",
+            self.name()
+        );
         Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(
             std::f64::consts::PI,
         ))))
diff --git a/datafusion/functions/src/math/power.rs b/datafusion/functions/src/math/power.rs
index bd1ae7c316c1a..489c59aa3d6fa 100644
--- a/datafusion/functions/src/math/power.rs
+++ b/datafusion/functions/src/math/power.rs
@@ -17,32 +17,45 @@
 
 //! Math function: `power()`.
 use std::any::Any;
-use std::sync::Arc;
 
 use super::log::LogFunc;
 
-use arrow::array::{ArrayRef, AsArray, Int64Array};
-use arrow::datatypes::{ArrowNativeTypeOp, DataType, Float64Type};
-use datafusion_common::{
-    arrow_datafusion_err, exec_datafusion_err, exec_err, internal_datafusion_err,
-    plan_datafusion_err, DataFusionError, Result, ScalarValue,
+use crate::utils::{calculate_binary_decimal_math, calculate_binary_math};
+use arrow::array::{Array, ArrayRef};
+use arrow::datatypes::i256;
+use arrow::datatypes::{
+    ArrowNativeType, ArrowNativeTypeOp, DataType, Decimal32Type, Decimal64Type,
+    Decimal128Type, Decimal256Type, Float64Type, Int64Type,
 };
+use arrow::error::ArrowError;
+use datafusion_common::types::{NativeType, logical_float64, logical_int64};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::{
-    ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF, TypeSignature,
+    Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDF,
+    ScalarUDFImpl, Signature, TypeSignature, TypeSignatureClass, Volatility, lit,
 };
-use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
+use num_traits::{NumCast, ToPrimitive};
 
 #[user_doc(
     doc_section(label = "Math Functions"),
     description = "Returns a base expression raised to the power of an exponent.",
     syntax_example = "power(base, exponent)",
+    sql_example = r#"```sql
+> SELECT power(2, 3);
++-------------+
+| power(2,3)  |
++-------------+
+| 8           |
++-------------+
+```"#,
     standard_argument(name = "base", prefix = "Numeric"),
     standard_argument(name = "exponent", prefix = "Exponent numeric")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct PowerFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -56,12 +69,23 @@ impl Default for PowerFunc {
 
 impl PowerFunc {
     pub fn new() -> Self {
-        use DataType::*;
+        let integer = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int64,
+        );
+        let decimal = Coercion::new_exact(TypeSignatureClass::Decimal);
+        let float = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
         Self {
             signature: Signature::one_of(
                 vec![
-                    TypeSignature::Exact(vec![Int64, Int64]),
-                    TypeSignature::Exact(vec![Float64, Float64]),
+                    TypeSignature::Coercible(vec![decimal.clone(), integer]),
+                    TypeSignature::Coercible(vec![decimal.clone(), float.clone()]),
+                    TypeSignature::Coercible(vec![float; 2]),
                 ],
                 Volatility::Immutable,
             ),
@@ -70,10 +94,308 @@ impl PowerFunc {
     }
 }
 
+/// Binary function to calculate a math power to integer exponent
+/// for scaled integer types.
+///
+/// Formula
+/// The power for a scaled integer `b` is
+///
+/// ```text
+/// (b * 10^(-s)) ^ e
+/// ```
+/// However, the result should be scaled back from scale 0 to scale `s`,
+/// which is done by multiplying by `10^s`.
+/// At the end, the formula is:
+///
+/// ```text
+///   b^e * 10^(-s * e) * 10^s = b^e / 10^(s * (e-1))
+/// ```
+/// Example of 2.5 ^ 4 = 39:
+///   2.5 is represented as 25 with scale 1
+///   The unscaled result is 25^4 = 390625
+///   Scale it back to 1: 390625 / 10^4 = 39
+fn pow_decimal_int<T>(base: T, scale: i8, exp: i64) -> Result<T, ArrowError>
+where
+    T: ArrowNativeType + ArrowNativeTypeOp + ToPrimitive + NumCast + Copy,
+{
+    // Negative exponent: fall back to float computation
+    if exp < 0 {
+        return pow_decimal_float(base, scale, exp as f64);
+    }
+
+    let exp: u32 = exp.try_into().map_err(|_| {
+        ArrowError::ArithmeticOverflow(format!("Unsupported exp value: {exp}"))
+    })?;
+    // Handle edge case for exp == 0
+    // If scale < 0, 10^scale (e.g., 10^-2 = 0.01) becomes 0 in integer arithmetic.
+    if exp == 0 {
+        return if scale >= 0 {
+            T::usize_as(10).pow_checked(scale as u32).map_err(|_| {
+                ArrowError::ArithmeticOverflow(format!(
+                    "Cannot make unscale factor for {scale} and {exp}"
+                ))
+            })
+        } else {
+            Ok(T::ZERO)
+        };
+    }
+    let powered: T = base.pow_checked(exp).map_err(|_| {
+        ArrowError::ArithmeticOverflow(format!("Cannot raise base {base:?} to exp {exp}"))
+    })?;
+
+    // Calculate the scale adjustment: s * (e - 1)
+    // We use i64 to prevent overflow during the intermediate multiplication
+    let mul_exp = (scale as i64).wrapping_mul(exp as i64 - 1);
+
+    if mul_exp == 0 {
+        return Ok(powered);
+    }
+
+    // If mul_exp is positive, we divide (standard case).
+    // If mul_exp is negative, we multiply (negative scale case).
+    if mul_exp > 0 {
+        let div_factor: T =
+            T::usize_as(10).pow_checked(mul_exp as u32).map_err(|_| {
+                ArrowError::ArithmeticOverflow(format!(
+                    "Cannot make div factor for {scale} and {exp}"
+                ))
+            })?;
+        powered.div_checked(div_factor)
+    } else {
+        // mul_exp is negative, so we multiply by 10^(-mul_exp)
+        let abs_exp = mul_exp.checked_neg().ok_or_else(|| {
+            ArrowError::ArithmeticOverflow(
+                "Overflow while negating scale exponent".to_string(),
+            )
+        })?;
+        let mul_factor: T =
+            T::usize_as(10).pow_checked(abs_exp as u32).map_err(|_| {
+                ArrowError::ArithmeticOverflow(format!(
+                    "Cannot make mul factor for {scale} and {exp}"
+                ))
+            })?;
+        powered.mul_checked(mul_factor)
+    }
+}
+
+/// Binary function to calculate a math power to float exponent
+/// for scaled integer types.
+fn pow_decimal_float<T>(base: T, scale: i8, exp: f64) -> Result<T, ArrowError>
+where
+    T: ArrowNativeType + ArrowNativeTypeOp + ToPrimitive + NumCast + Copy,
+{
+    if exp.is_finite() && exp.trunc() == exp && exp >= 0f64 && exp < u32::MAX as f64 {
+        return pow_decimal_int(base, scale, exp as i64);
+    }
+
+    if !exp.is_finite() {
+        return Err(ArrowError::ComputeError(format!(
+            "Cannot use non-finite exp: {exp}"
+        )));
+    }
+
+    pow_decimal_float_fallback(base, scale, exp)
+}
+
+/// Compute the f64 power result and scale it back.
+/// Returns the rounded i128 result for conversion to target type.
+#[inline]
+fn compute_pow_f64_result(
+    base_f64: f64,
+    scale: i8,
+    exp: f64,
+) -> Result<i128, ArrowError> {
+    let result_f64 = base_f64.powf(exp);
+
+    if !result_f64.is_finite() {
+        return Err(ArrowError::ArithmeticOverflow(format!(
+            "Result of {base_f64}^{exp} is not finite"
+        )));
+    }
+
+    let scale_factor = 10f64.powi(scale as i32);
+    let result_scaled = result_f64 * scale_factor;
+    let result_rounded = result_scaled.round();
+
+    if result_rounded.abs() > i128::MAX as f64 {
+        return Err(ArrowError::ArithmeticOverflow(format!(
+            "Result {result_rounded} is too large for the target decimal type"
+        )));
+    }
+
+    Ok(result_rounded as i128)
+}
+
+/// Convert i128 result to target decimal native type using NumCast.
+/// Returns error if value overflows the target type.
+#[inline]
+fn decimal_from_i128<T>(value: i128) -> Result<T, ArrowError>
+where
+    T: NumCast,
+{
+    NumCast::from(value).ok_or_else(|| {
+        ArrowError::ArithmeticOverflow(format!(
+            "Value {value} is too large for the target decimal type"
+        ))
+    })
+}
+
+/// Fallback implementation using f64 for negative or non-integer exponents.
+/// This handles cases that cannot be computed using integer arithmetic.
+fn pow_decimal_float_fallback<T>(base: T, scale: i8, exp: f64) -> Result<T, ArrowError>
+where
+    T: ToPrimitive + NumCast + Copy,
+{
+    if scale < 0 {
+        return Err(ArrowError::NotYetImplemented(format!(
+            "Negative scale is not yet supported: {scale}"
+        )));
+    }
+
+    let scale_factor = 10f64.powi(scale as i32);
+    let base_f64 = base.to_f64().ok_or_else(|| {
+        ArrowError::ComputeError("Cannot convert base to f64".to_string())
+    })? / scale_factor;
+
+    let result_i128 = compute_pow_f64_result(base_f64, scale, exp)?;
+
+    decimal_from_i128(result_i128)
+}
+
+/// Decimal256 specialized float exponent version.
+fn pow_decimal256_float(base: i256, scale: i8, exp: f64) -> Result<i256, ArrowError> {
+    if exp.is_finite() && exp.trunc() == exp && exp >= 0f64 && exp < u32::MAX as f64 {
+        return pow_decimal256_int(base, scale, exp as i64);
+    }
+
+    if !exp.is_finite() {
+        return Err(ArrowError::ComputeError(format!(
+            "Cannot use non-finite exp: {exp}"
+        )));
+    }
+
+    pow_decimal256_float_fallback(base, scale, exp)
+}
+
+/// Decimal256 specialized integer exponent version.
+fn pow_decimal256_int(base: i256, scale: i8, exp: i64) -> Result<i256, ArrowError> {
+    if exp < 0 {
+        return pow_decimal256_float(base, scale, exp as f64);
+    }
+
+    let exp: u32 = exp.try_into().map_err(|_| {
+        ArrowError::ArithmeticOverflow(format!("Unsupported exp value: {exp}"))
+    })?;
+
+    if exp == 0 {
+        return if scale >= 0 {
+            i256::from_i128(10).pow_checked(scale as u32).map_err(|_| {
+                ArrowError::ArithmeticOverflow(format!(
+                    "Cannot make unscale factor for {scale} and {exp}"
+                ))
+            })
+        } else {
+            Ok(i256::from_i128(0))
+        };
+    }
+
+    let powered: i256 = base.pow_checked(exp).map_err(|_| {
+        ArrowError::ArithmeticOverflow(format!("Cannot raise base {base:?} to exp {exp}"))
+    })?;
+
+    let mul_exp = (scale as i64).wrapping_mul(exp as i64 - 1);
+
+    if mul_exp == 0 {
+        return Ok(powered);
+    }
+
+    if mul_exp > 0 {
+        let div_factor: i256 =
+            i256::from_i128(10)
+                .pow_checked(mul_exp as u32)
+                .map_err(|_| {
+                    ArrowError::ArithmeticOverflow(format!(
+                        "Cannot make div factor for {scale} and {exp}"
+                    ))
+                })?;
+        powered.div_checked(div_factor)
+    } else {
+        let abs_exp = mul_exp.checked_neg().ok_or_else(|| {
+            ArrowError::ArithmeticOverflow(
+                "Overflow while negating scale exponent".to_string(),
+            )
+        })?;
+        let mul_factor: i256 =
+            i256::from_i128(10)
+                .pow_checked(abs_exp as u32)
+                .map_err(|_| {
+                    ArrowError::ArithmeticOverflow(format!(
+                        "Cannot make mul factor for {scale} and {exp}"
+                    ))
+                })?;
+        powered.mul_checked(mul_factor)
+    }
+}
+
+/// Fallback implementation for Decimal256.
+fn pow_decimal256_float_fallback(
+    base: i256,
+    scale: i8,
+    exp: f64,
+) -> Result<i256, ArrowError> {
+    if scale < 0 {
+        return Err(ArrowError::NotYetImplemented(format!(
+            "Negative scale is not yet supported: {scale}"
+        )));
+    }
+
+    let scale_factor = 10f64.powi(scale as i32);
+    let base_f64 = base.to_f64().ok_or_else(|| {
+        ArrowError::ComputeError("Cannot convert base to f64".to_string())
+    })? / scale_factor;
+
+    let result_i128 = compute_pow_f64_result(base_f64, scale, exp)?;
+
+    // i256 can be constructed from i128 directly
+    Ok(i256::from_i128(result_i128))
+}
+
+/// Fallback implementation for decimal power when exponent is an array.
+/// Casts decimal to float64, computes power, and casts back to original decimal type.
+/// This is used for performance when exponent varies per-row.
+fn pow_decimal_with_float_fallback(
+    base: &ArrayRef,
+    exponent: &ColumnarValue,
+    num_rows: usize,
+) -> Result<ColumnarValue> {
+    use arrow::compute::cast;
+
+    let original_type = base.data_type().clone();
+    let base_f64 = cast(base.as_ref(), &DataType::Float64)?;
+
+    let exp_f64 = match exponent {
+        ColumnarValue::Array(arr) => cast(arr.as_ref(), &DataType::Float64)?,
+        ColumnarValue::Scalar(scalar) => {
+            let scalar_f64 = scalar.cast_to(&DataType::Float64)?;
+            scalar_f64.to_array_of_size(num_rows)?
+        }
+    };
+
+    let result_f64 = calculate_binary_math::<Float64Type, Float64Type, Float64Type, _>(
+        &base_f64,
+        &ColumnarValue::Array(exp_f64),
+        |b, e| Ok(f64::powf(b, e)),
+    )?;
+
+    let result = cast(result_f64.as_ref(), &original_type)?;
+    Ok(ColumnarValue::Array(result))
+}
+
 impl ScalarUDFImpl for PowerFunc {
     fn as_any(&self) -> &dyn Any {
         self
     }
+
     fn name(&self) -> &str {
         "power"
     }
@@ -83,9 +405,10 @@ impl ScalarUDFImpl for PowerFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match arg_types[0] {
-            DataType::Int64 => Ok(DataType::Int64),
-            _ => Ok(DataType::Float64),
+        if arg_types[0].is_null() {
+            Ok(DataType::Float64)
+        } else {
+            Ok(arg_types[0].clone())
         }
     }
 
@@ -94,74 +417,175 @@ impl ScalarUDFImpl for PowerFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let args = ColumnarValue::values_to_arrays(&args.args)?;
-
-        let arr: ArrayRef = match args[0].data_type() {
-            DataType::Float64 => {
-                let bases = args[0].as_primitive::<Float64Type>();
-                let exponents = args[1].as_primitive::<Float64Type>();
-                let result = arrow::compute::binary::<_, _, _, Float64Type>(
-                    bases,
-                    exponents,
-                    f64::powf,
-                )?;
-                Arc::new(result) as _
+        let [base, exponent] = take_function_args(self.name(), &args.args)?;
+
+        // For decimal types, only use native decimal
+        // operations when we have a scalar exponent. When the exponent is an array,
+        // fall back to float computation for better performance.
+        let use_float_fallback = matches!(
+            base.data_type(),
+            DataType::Decimal32(_, _)
+                | DataType::Decimal64(_, _)
+                | DataType::Decimal128(_, _)
+                | DataType::Decimal256(_, _)
+        ) && matches!(exponent, ColumnarValue::Array(_));
+
+        let base = base.to_array(args.number_rows)?;
+
+        // If decimal with array exponent, cast to float and compute
+        if use_float_fallback {
+            return pow_decimal_with_float_fallback(&base, exponent, args.number_rows);
+        }
+
+        let arr: ArrayRef = match (base.data_type(), exponent.data_type()) {
+            (DataType::Float64, DataType::Float64) => {
+                calculate_binary_math::<Float64Type, Float64Type, Float64Type, _>(
+                    &base,
+                    exponent,
+                    |b, e| Ok(f64::powf(b, e)),
+                )?
             }
-            DataType::Int64 => {
-                let bases = downcast_named_arg!(&args[0], "base", Int64Array);
-                let exponents = downcast_named_arg!(&args[1], "exponent", Int64Array);
-                bases
-                    .iter()
-                    .zip(exponents.iter())
-                    .map(|(base, exp)| match (base, exp) {
-                        (Some(base), Some(exp)) => Ok(Some(base.pow_checked(
-                            exp.try_into().map_err(|_| {
-                                exec_datafusion_err!(
-                                    "Can't use negative exponents: {exp} in integer computation, please use Float."
-                                )
-                            })?,
-                        ).map_err(|e| arrow_datafusion_err!(e))?)),
-                        _ => Ok(None),
-                    })
-                    .collect::<Result<Int64Array>>()
-                    .map(Arc::new)? as _
+            (DataType::Decimal32(precision, scale), DataType::Int64) => {
+                calculate_binary_decimal_math::<Decimal32Type, Int64Type, Decimal32Type, _>(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_int(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
             }
-
-            other => {
-                return exec_err!(
-                    "Unsupported data type {other:?} for function {}",
-                    self.name()
-                )
+            (DataType::Decimal32(precision, scale), DataType::Float64) => {
+                calculate_binary_decimal_math::<
+                    Decimal32Type,
+                    Float64Type,
+                    Decimal32Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_float(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal64(precision, scale), DataType::Int64) => {
+                calculate_binary_decimal_math::<Decimal64Type, Int64Type, Decimal64Type, _>(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_int(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal64(precision, scale), DataType::Float64) => {
+                calculate_binary_decimal_math::<
+                    Decimal64Type,
+                    Float64Type,
+                    Decimal64Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_float(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal128(precision, scale), DataType::Int64) => {
+                calculate_binary_decimal_math::<
+                    Decimal128Type,
+                    Int64Type,
+                    Decimal128Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_int(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal128(precision, scale), DataType::Float64) => {
+                calculate_binary_decimal_math::<
+                    Decimal128Type,
+                    Float64Type,
+                    Decimal128Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal_float(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal256(precision, scale), DataType::Int64) => {
+                calculate_binary_decimal_math::<
+                    Decimal256Type,
+                    Int64Type,
+                    Decimal256Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal256_int(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (DataType::Decimal256(precision, scale), DataType::Float64) => {
+                calculate_binary_decimal_math::<
+                    Decimal256Type,
+                    Float64Type,
+                    Decimal256Type,
+                    _,
+                >(
+                    &base,
+                    exponent,
+                    |b, e| pow_decimal256_float(b, *scale, e),
+                    *precision,
+                    *scale,
+                )?
+            }
+            (base_type, exp_type) => {
+                return internal_err!(
+                    "Unsupported data types for base {base_type:?} and exponent {exp_type:?} for power"
+                );
             }
         };
-
         Ok(ColumnarValue::Array(arr))
     }
 
     /// Simplify the `power` function by the relevant rules:
-    /// 1. Power(a, 0) ===> 0
+    /// 1. Power(a, 0) ===> 1
     /// 2. Power(a, 1) ===> a
     /// 3. Power(a, Log(a, b)) ===> b
     fn simplify(
         &self,
-        mut args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        let exponent = args.pop().ok_or_else(|| {
-            plan_datafusion_err!("Expected power to have 2 arguments, got 0")
-        })?;
-        let base = args.pop().ok_or_else(|| {
-            plan_datafusion_err!("Expected power to have 2 arguments, got 1")
-        })?;
-
+        let [base, exponent] = take_function_args("power", args)?;
+        let base_type = info.get_data_type(&base)?;
         let exponent_type = info.get_data_type(&exponent)?;
+
+        // Null propagation
+        if base_type.is_null() || exponent_type.is_null() {
+            let return_type = self.return_type(&[base_type, exponent_type])?;
+            return Ok(ExprSimplifyResult::Simplified(lit(
+                ScalarValue::Null.cast_to(&return_type)?
+            )));
+        }
+
         match exponent {
-            Expr::Literal(value) if value == ScalarValue::new_zero(&exponent_type)? => {
-                Ok(ExprSimplifyResult::Simplified(Expr::Literal(
-                    ScalarValue::new_one(&info.get_data_type(&base)?)?,
-                )))
+            Expr::Literal(value, _)
+                if value == ScalarValue::new_zero(&exponent_type)? =>
+            {
+                Ok(ExprSimplifyResult::Simplified(lit(ScalarValue::new_one(
+                    &base_type,
+                )?)))
             }
-            Expr::Literal(value) if value == ScalarValue::new_one(&exponent_type)? => {
+            Expr::Literal(value, _) if value == ScalarValue::new_one(&exponent_type)? => {
                 Ok(ExprSimplifyResult::Simplified(base))
             }
             Expr::ScalarFunction(ScalarFunction { func, mut args })
@@ -186,84 +610,58 @@ fn is_log(func: &ScalarUDF) -> bool {
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::Float64Array;
-    use arrow::datatypes::Field;
-    use datafusion_common::cast::{as_float64_array, as_int64_array};
-
     use super::*;
 
     #[test]
-    fn test_power_f64() {
-        let arg_fields = vec![
-            Field::new("a", DataType::Float64, true).into(),
-            Field::new("a", DataType::Float64, true).into(),
-        ];
-        let args = ScalarFunctionArgs {
-            args: vec![
-                ColumnarValue::Array(Arc::new(Float64Array::from(vec![
-                    2.0, 2.0, 3.0, 5.0,
-                ]))), // base
-                ColumnarValue::Array(Arc::new(Float64Array::from(vec![
-                    3.0, 2.0, 4.0, 4.0,
-                ]))), // exponent
-            ],
-            arg_fields,
-            number_rows: 4,
-            return_field: Field::new("f", DataType::Float64, true).into(),
-        };
-        let result = PowerFunc::new()
-            .invoke_with_args(args)
-            .expect("failed to initialize function power");
-
-        match result {
-            ColumnarValue::Array(arr) => {
-                let floats = as_float64_array(&arr)
-                    .expect("failed to convert result to a Float64Array");
-                assert_eq!(floats.len(), 4);
-                assert_eq!(floats.value(0), 8.0);
-                assert_eq!(floats.value(1), 4.0);
-                assert_eq!(floats.value(2), 81.0);
-                assert_eq!(floats.value(3), 625.0);
-            }
-            ColumnarValue::Scalar(_) => {
-                panic!("Expected an array value")
-            }
-        }
+    fn test_pow_decimal128_helper() {
+        // Expression: 2.5 ^ 4 = 39.0625
+        assert_eq!(pow_decimal_int(25i128, 1, 4).unwrap(), 390i128);
+        assert_eq!(pow_decimal_int(2500i128, 3, 4).unwrap(), 39062i128);
+        assert_eq!(pow_decimal_int(25000i128, 4, 4).unwrap(), 390625i128);
+
+        // Expression: 25 ^ 4 = 390625
+        assert_eq!(pow_decimal_int(25i128, 0, 4).unwrap(), 390625i128);
+
+        // Expressions for edge cases
+        assert_eq!(pow_decimal_int(25i128, 1, 1).unwrap(), 25i128);
+        assert_eq!(pow_decimal_int(25i128, 0, 1).unwrap(), 25i128);
+        assert_eq!(pow_decimal_int(25i128, 0, 0).unwrap(), 1i128);
+        assert_eq!(pow_decimal_int(25i128, 1, 0).unwrap(), 10i128);
+
+        assert_eq!(pow_decimal_int(25i128, -1, 4).unwrap(), 390625000i128);
     }
 
     #[test]
-    fn test_power_i64() {
-        let arg_fields = vec![
-            Field::new("a", DataType::Int64, true).into(),
-            Field::new("a", DataType::Int64, true).into(),
-        ];
-        let args = ScalarFunctionArgs {
-            args: vec![
-                ColumnarValue::Array(Arc::new(Int64Array::from(vec![2, 2, 3, 5]))), // base
-                ColumnarValue::Array(Arc::new(Int64Array::from(vec![3, 2, 4, 4]))), // exponent
-            ],
-            arg_fields,
-            number_rows: 4,
-            return_field: Field::new("f", DataType::Int64, true).into(),
-        };
-        let result = PowerFunc::new()
-            .invoke_with_args(args)
-            .expect("failed to initialize function power");
-
-        match result {
-            ColumnarValue::Array(arr) => {
-                let ints = as_int64_array(&arr)
-                    .expect("failed to convert result to a Int64Array");
-
-                assert_eq!(ints.len(), 4);
-                assert_eq!(ints.value(0), 8);
-                assert_eq!(ints.value(1), 4);
-                assert_eq!(ints.value(2), 81);
-                assert_eq!(ints.value(3), 625);
-            }
-            ColumnarValue::Scalar(_) => {
-                panic!("Expected an array value")
-            }
-        }
+    fn test_pow_decimal_float_fallback() {
+        // Test negative exponent: 4^(-1) = 0.25
+        // 4 with scale 2 = 400, result should be 25 (0.25 with scale 2)
+        let result: i128 = pow_decimal_float(400i128, 2, -1.0).unwrap();
+        assert_eq!(result, 25);
+
+        // Test non-integer exponent: 4^0.5 = 2
+        // 4 with scale 2 = 400, result should be 200 (2.0 with scale 2)
+        let result: i128 = pow_decimal_float(400i128, 2, 0.5).unwrap();
+        assert_eq!(result, 200);
+
+        // Test 8^(1/3) = 2 (cube root)
+        // 8 with scale 1 = 80, result should be 20 (2.0 with scale 1)
+        let result: i128 = pow_decimal_float(80i128, 1, 1.0 / 3.0).unwrap();
+        assert_eq!(result, 20);
+
+        // Test negative base with integer exponent still works
+        // (-2)^3 = -8
+        // -2 with scale 1 = -20, result should be -80 (-8.0 with scale 1)
+        let result: i128 = pow_decimal_float(-20i128, 1, 3.0).unwrap();
+        assert_eq!(result, -80);
+
+        // Test positive integer exponent goes through fast path
+        // 2.5^4 = 39.0625
+        // 25 with scale 1, result should be 390 (39.0 with scale 1) - truncated
+        let result: i128 = pow_decimal_float(25i128, 1, 4.0).unwrap();
+        assert_eq!(result, 390); // Uses integer path
+
+        // Test non-finite exponent returns error
+        assert!(pow_decimal_float(100i128, 2, f64::NAN).is_err());
+        assert!(pow_decimal_float(100i128, 2, f64::INFINITY).is_err());
     }
 }
diff --git a/datafusion/functions/src/math/random.rs b/datafusion/functions/src/math/random.rs
index 92b6ed1895edd..78932873b485b 100644
--- a/datafusion/functions/src/math/random.rs
+++ b/datafusion/functions/src/math/random.rs
@@ -21,9 +21,9 @@ use std::sync::Arc;
 use arrow::array::Float64Array;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Float64;
-use rand::{rng, Rng};
+use rand::{Rng, rng};
 
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, assert_or_internal_err};
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
@@ -32,9 +32,17 @@ use datafusion_macros::user_doc;
     doc_section(label = "Math Functions"),
     description = r#"Returns a random float value in the range [0, 1).
 The random seed is unique to each row."#,
-    syntax_example = "random()"
+    syntax_example = "random()",
+    sql_example = r#"```sql
+> SELECT random();
++------------------+
+| random()         |
++------------------+
+| 0.7389238902938  |
++------------------+
+```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RandomFunc {
     signature: Signature,
 }
@@ -71,9 +79,11 @@ impl ScalarUDFImpl for RandomFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        if !args.args.is_empty() {
-            return internal_err!("{} function does not accept arguments", self.name());
-        }
+        assert_or_internal_err!(
+            args.args.is_empty(),
+            "{} function does not accept arguments",
+            self.name()
+        );
         let mut rng = rng();
         let mut values = vec![0.0; args.number_rows];
         // Equivalent to set each element with rng.random_range(0.0..1.0), but more efficient
diff --git a/datafusion/functions/src/math/round.rs b/datafusion/functions/src/math/round.rs
index fc87b7e63a62f..07cddf9341f27 100644
--- a/datafusion/functions/src/math/round.rs
+++ b/datafusion/functions/src/math/round.rs
@@ -16,22 +16,139 @@
 // under the License.
 
 use std::any::Any;
-use std::sync::Arc;
 
-use crate::utils::make_scalar_function;
+use crate::utils::{calculate_binary_decimal_math, calculate_binary_math};
 
-use arrow::array::{ArrayRef, AsArray, PrimitiveArray};
-use arrow::compute::{cast_with_options, CastOptions};
-use arrow::datatypes::DataType::{Float32, Float64, Int32};
-use arrow::datatypes::{DataType, Float32Type, Float64Type, Int32Type};
-use datafusion_common::{exec_datafusion_err, exec_err, Result, ScalarValue};
+use arrow::array::ArrayRef;
+use arrow::datatypes::DataType::{
+    Decimal32, Decimal64, Decimal128, Decimal256, Float32, Float64,
+};
+use arrow::datatypes::{
+    ArrowNativeTypeOp, DataType, Decimal32Type, Decimal64Type, Decimal128Type,
+    Decimal256Type, DecimalType, Float32Type, Float64Type, Int32Type,
+};
+use arrow::datatypes::{Field, FieldRef};
+use arrow::error::ArrowError;
+use datafusion_common::types::{
+    NativeType, logical_float32, logical_float64, logical_int32,
+};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
-use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    Volatility,
+    Coercion, ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
+use std::sync::Arc;
+
+fn output_scale_for_decimal(precision: u8, input_scale: i8, decimal_places: i32) -> i8 {
+    // `decimal_places` controls the maximum output scale, but scale cannot exceed the input scale.
+    //
+    // For negative-scale decimals, allow further scale reduction to match negative `decimal_places`
+    // (e.g. scale -2 rounded to -3 becomes scale -3). This preserves fixed precision by
+    // representing the rounded result at a coarser scale.
+    if input_scale < 0 {
+        // Decimal scales must be within [-precision, precision] and fit in i8. For negative-scale
+        // decimals, allow rounding to move the output scale further negative, but cap it at
+        // `-precision` (beyond that, the rounded result is always 0).
+        let min_scale = -i32::from(precision);
+        let new_scale = i32::from(input_scale).min(decimal_places).max(min_scale);
+        return new_scale as i8;
+    }
+
+    // The `min` ensures the result is always within i8 range because `input_scale` is i8.
+    let decimal_places = decimal_places.max(0);
+    i32::from(input_scale).min(decimal_places) as i8
+}
+
+fn normalize_decimal_places_for_decimal(
+    decimal_places: i32,
+    precision: u8,
+    scale: i8,
+) -> Option<i32> {
+    if decimal_places >= 0 {
+        return Some(decimal_places);
+    }
+
+    // For fixed precision decimals, the absolute value is strictly less than 10^(precision - scale).
+    // If the rounding position is beyond that (abs(decimal_places) > precision - scale), the
+    // rounded result is always 0, and we can avoid overflow in intermediate 10^n computations.
+    let max_rounding_pow10 = i64::from(precision) - i64::from(scale);
+    if max_rounding_pow10 <= 0 {
+        return None;
+    }
+
+    let abs_decimal_places = i64::from(decimal_places.unsigned_abs());
+    (abs_decimal_places <= max_rounding_pow10).then_some(decimal_places)
+}
+
+fn validate_decimal_precision<T: DecimalType>(
+    value: T::Native,
+    precision: u8,
+    scale: i8,
+) -> Result<T::Native, ArrowError> {
+    T::validate_decimal_precision(value, precision, scale).map_err(|e| {
+        ArrowError::ComputeError(format!(
+            "Decimal overflow: rounded value exceeds precision {precision}: {e}"
+        ))
+    })?;
+    Ok(value)
+}
+
+fn calculate_new_precision_scale<T: DecimalType>(
+    precision: u8,
+    scale: i8,
+    decimal_places: Option<i32>,
+) -> Result<DataType> {
+    if let Some(decimal_places) = decimal_places {
+        let new_scale = output_scale_for_decimal(precision, scale, decimal_places);
+
+        // When rounding an integer decimal (scale == 0) to a negative `decimal_places`, a carry can
+        // add an extra digit to the integer part (e.g. 99 -> 100 when rounding to -1). This can
+        // only happen when the rounding position is within the existing precision.
+        let abs_decimal_places = decimal_places.unsigned_abs();
+        let new_precision = if scale == 0
+            && decimal_places < 0
+            && abs_decimal_places <= u32::from(precision)
+        {
+            precision.saturating_add(1).min(T::MAX_PRECISION)
+        } else {
+            precision
+        };
+        Ok(T::TYPE_CONSTRUCTOR(new_precision, new_scale))
+    } else {
+        let new_precision = precision.saturating_add(1).min(T::MAX_PRECISION);
+        Ok(T::TYPE_CONSTRUCTOR(new_precision, scale))
+    }
+}
+
+fn decimal_places_from_scalar(scalar: &ScalarValue) -> Result<i32> {
+    let out_of_range = |value: String| {
+        datafusion_common::DataFusionError::Execution(format!(
+            "round decimal_places {value} is out of supported i32 range"
+        ))
+    };
+    match scalar {
+        ScalarValue::Int8(Some(v)) => Ok(i32::from(*v)),
+        ScalarValue::Int16(Some(v)) => Ok(i32::from(*v)),
+        ScalarValue::Int32(Some(v)) => Ok(*v),
+        ScalarValue::Int64(Some(v)) => {
+            i32::try_from(*v).map_err(|_| out_of_range(v.to_string()))
+        }
+        ScalarValue::UInt8(Some(v)) => Ok(i32::from(*v)),
+        ScalarValue::UInt16(Some(v)) => Ok(i32::from(*v)),
+        ScalarValue::UInt32(Some(v)) => {
+            i32::try_from(*v).map_err(|_| out_of_range(v.to_string()))
+        }
+        ScalarValue::UInt64(Some(v)) => {
+            i32::try_from(*v).map_err(|_| out_of_range(v.to_string()))
+        }
+        other => exec_err!(
+            "Unexpected datatype for decimal_places: {}",
+            other.data_type()
+        ),
+    }
+}
 
 #[user_doc(
     doc_section(label = "Math Functions"),
@@ -41,9 +158,17 @@ use datafusion_macros::user_doc;
     argument(
         name = "decimal_places",
         description = "Optional. The number of decimal places to round to. Defaults to 0."
-    )
+    ),
+    sql_example = r#"```sql
+> SELECT round(3.14159);
++--------------+
+| round(3.14159)|
++--------------+
+| 3.0          |
++--------------+
+```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RoundFunc {
     signature: Signature,
 }
@@ -56,14 +181,33 @@ impl Default for RoundFunc {
 
 impl RoundFunc {
     pub fn new() -> Self {
-        use DataType::*;
+        let decimal = Coercion::new_exact(TypeSignatureClass::Decimal);
+        let decimal_places = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int32,
+        );
+        let float32 = Coercion::new_exact(TypeSignatureClass::Native(logical_float32()));
+        let float64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
         Self {
             signature: Signature::one_of(
                 vec![
-                    Exact(vec![Float64, Int64]),
-                    Exact(vec![Float32, Int64]),
-                    Exact(vec![Float64]),
-                    Exact(vec![Float32]),
+                    TypeSignature::Coercible(vec![
+                        decimal.clone(),
+                        decimal_places.clone(),
+                    ]),
+                    TypeSignature::Coercible(vec![decimal]),
+                    TypeSignature::Coercible(vec![
+                        float32.clone(),
+                        decimal_places.clone(),
+                    ]),
+                    TypeSignature::Coercible(vec![float32]),
+                    TypeSignature::Coercible(vec![float64.clone(), decimal_places]),
+                    TypeSignature::Coercible(vec![float64]),
                 ],
                 Volatility::Immutable,
             ),
@@ -84,15 +228,218 @@ impl ScalarUDFImpl for RoundFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        match arg_types[0] {
-            Float32 => Ok(Float32),
-            _ => Ok(Float64),
-        }
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let input_field = &args.arg_fields[0];
+        let input_type = input_field.data_type();
+
+        // If decimal_places is a scalar literal, we can incorporate it into the output type
+        // (scale reduction). Otherwise, keep the input scale as we can't pick a per-row scale.
+        //
+        // Note: `scalar_arguments` contains the original literal values (pre-coercion), so
+        // integer literals may appear as Int64 even though the signature coerces them to Int32.
+        let decimal_places: Option<i32> = match args.scalar_arguments.get(1) {
+            None => Some(0),    // No dp argument means default to 0
+            Some(None) => None, // dp is not a literal (e.g. column)
+            Some(Some(scalar)) if scalar.is_null() => Some(0), // null dp => default to 0
+            Some(Some(scalar)) => Some(decimal_places_from_scalar(scalar)?),
+        };
+
+        // Calculate return type based on input type
+        // For decimals: reduce scale to decimal_places (reclaims precision for integer part)
+        // This matches Spark/DuckDB behavior where ROUND adjusts the scale
+        // BUT only if dp is a scalar literal - otherwise keep original scale and add
+        // extra precision to accommodate potential carry-over.
+        let return_type =
+            match input_type {
+                Float32 => Float32,
+                Decimal32(precision, scale) => calculate_new_precision_scale::<
+                    Decimal32Type,
+                >(
+                    *precision, *scale, decimal_places
+                )?,
+                Decimal64(precision, scale) => calculate_new_precision_scale::<
+                    Decimal64Type,
+                >(
+                    *precision, *scale, decimal_places
+                )?,
+                Decimal128(precision, scale) => calculate_new_precision_scale::<
+                    Decimal128Type,
+                >(
+                    *precision, *scale, decimal_places
+                )?,
+                Decimal256(precision, scale) => calculate_new_precision_scale::<
+                    Decimal256Type,
+                >(
+                    *precision, *scale, decimal_places
+                )?,
+                _ => Float64,
+            };
+
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), return_type, nullable)))
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("use return_field_from_args instead")
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(round, vec![])(&args.args)
+        if args.arg_fields.iter().any(|a| a.data_type().is_null()) {
+            return ColumnarValue::Scalar(ScalarValue::Null)
+                .cast_to(args.return_type(), None);
+        }
+
+        let default_decimal_places = ColumnarValue::Scalar(ScalarValue::Int32(Some(0)));
+        let decimal_places = if args.args.len() == 2 {
+            &args.args[1]
+        } else {
+            &default_decimal_places
+        };
+
+        if let (ColumnarValue::Scalar(value_scalar), ColumnarValue::Scalar(dp_scalar)) =
+            (&args.args[0], decimal_places)
+        {
+            if value_scalar.is_null() || dp_scalar.is_null() {
+                return ColumnarValue::Scalar(ScalarValue::Null)
+                    .cast_to(args.return_type(), None);
+            }
+
+            let dp = if let ScalarValue::Int32(Some(dp)) = dp_scalar {
+                *dp
+            } else {
+                return internal_err!(
+                    "Unexpected datatype for decimal_places: {}",
+                    dp_scalar.data_type()
+                );
+            };
+
+            match (value_scalar, args.return_type()) {
+                (ScalarValue::Float32(Some(v)), _) => {
+                    let rounded = round_float(*v, dp)?;
+                    Ok(ColumnarValue::Scalar(ScalarValue::from(rounded)))
+                }
+                (ScalarValue::Float64(Some(v)), _) => {
+                    let rounded = round_float(*v, dp)?;
+                    Ok(ColumnarValue::Scalar(ScalarValue::from(rounded)))
+                }
+                (
+                    ScalarValue::Decimal32(Some(v), in_precision, scale),
+                    Decimal32(out_precision, out_scale),
+                ) => {
+                    let rounded =
+                        round_decimal_or_zero(*v, *in_precision, *scale, *out_scale, dp)?;
+                    let rounded = if *out_precision == Decimal32Type::MAX_PRECISION
+                        && *scale == 0
+                        && dp < 0
+                    {
+                        // With scale == 0 and negative dp, rounding can carry into an additional
+                        // digit (e.g. 99 -> 100). If we're already at max precision we can't widen
+                        // the type, so validate and error rather than producing an invalid decimal.
+                        validate_decimal_precision::<Decimal32Type>(
+                            rounded,
+                            *out_precision,
+                            *out_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }?;
+                    let scalar =
+                        ScalarValue::Decimal32(Some(rounded), *out_precision, *out_scale);
+                    Ok(ColumnarValue::Scalar(scalar))
+                }
+                (
+                    ScalarValue::Decimal64(Some(v), in_precision, scale),
+                    Decimal64(out_precision, out_scale),
+                ) => {
+                    let rounded =
+                        round_decimal_or_zero(*v, *in_precision, *scale, *out_scale, dp)?;
+                    let rounded = if *out_precision == Decimal64Type::MAX_PRECISION
+                        && *scale == 0
+                        && dp < 0
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal64Type>(
+                            rounded,
+                            *out_precision,
+                            *out_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }?;
+                    let scalar =
+                        ScalarValue::Decimal64(Some(rounded), *out_precision, *out_scale);
+                    Ok(ColumnarValue::Scalar(scalar))
+                }
+                (
+                    ScalarValue::Decimal128(Some(v), in_precision, scale),
+                    Decimal128(out_precision, out_scale),
+                ) => {
+                    let rounded =
+                        round_decimal_or_zero(*v, *in_precision, *scale, *out_scale, dp)?;
+                    let rounded = if *out_precision == Decimal128Type::MAX_PRECISION
+                        && *scale == 0
+                        && dp < 0
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal128Type>(
+                            rounded,
+                            *out_precision,
+                            *out_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }?;
+                    let scalar = ScalarValue::Decimal128(
+                        Some(rounded),
+                        *out_precision,
+                        *out_scale,
+                    );
+                    Ok(ColumnarValue::Scalar(scalar))
+                }
+                (
+                    ScalarValue::Decimal256(Some(v), in_precision, scale),
+                    Decimal256(out_precision, out_scale),
+                ) => {
+                    let rounded =
+                        round_decimal_or_zero(*v, *in_precision, *scale, *out_scale, dp)?;
+                    let rounded = if *out_precision == Decimal256Type::MAX_PRECISION
+                        && *scale == 0
+                        && dp < 0
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal256Type>(
+                            rounded,
+                            *out_precision,
+                            *out_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }?;
+                    let scalar = ScalarValue::Decimal256(
+                        Some(rounded),
+                        *out_precision,
+                        *out_scale,
+                    );
+                    Ok(ColumnarValue::Scalar(scalar))
+                }
+                (ScalarValue::Null, _) => ColumnarValue::Scalar(ScalarValue::Null)
+                    .cast_to(args.return_type(), None),
+                (value_scalar, return_type) => {
+                    internal_err!(
+                        "Unexpected datatype for round(value, decimal_places): value {}, return type {}",
+                        value_scalar.data_type(),
+                        return_type
+                    )
+                }
+            }
+        } else {
+            round_columnar(
+                &args.args[0],
+                decimal_places,
+                args.number_rows,
+                args.return_type(),
+            )
+        }
     }
 
     fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
@@ -115,107 +462,270 @@ impl ScalarUDFImpl for RoundFunc {
     }
 }
 
-/// Round SQL function
-pub fn round(args: &[ArrayRef]) -> Result<ArrayRef> {
-    if args.len() != 1 && args.len() != 2 {
-        return exec_err!(
-            "round function requires one or two arguments, got {}",
-            args.len()
-        );
+fn round_columnar(
+    value: &ColumnarValue,
+    decimal_places: &ColumnarValue,
+    number_rows: usize,
+    return_type: &DataType,
+) -> Result<ColumnarValue> {
+    let value_array = value.to_array(number_rows)?;
+    let both_scalars = matches!(value, ColumnarValue::Scalar(_))
+        && matches!(decimal_places, ColumnarValue::Scalar(_));
+    let decimal_places_is_array = matches!(decimal_places, ColumnarValue::Array(_));
+
+    let arr: ArrayRef = match (value_array.data_type(), return_type) {
+        (Float64, _) => {
+            let result = calculate_binary_math::<Float64Type, Int32Type, Float64Type, _>(
+                value_array.as_ref(),
+                decimal_places,
+                round_float::<f64>,
+            )?;
+            result as _
+        }
+        (Float32, _) => {
+            let result = calculate_binary_math::<Float32Type, Int32Type, Float32Type, _>(
+                value_array.as_ref(),
+                decimal_places,
+                round_float::<f32>,
+            )?;
+            result as _
+        }
+        (Decimal32(input_precision, scale), Decimal32(precision, new_scale)) => {
+            // reduce scale to reclaim integer precision
+            let result = calculate_binary_decimal_math::<
+                Decimal32Type,
+                Int32Type,
+                Decimal32Type,
+                _,
+            >(
+                value_array.as_ref(),
+                decimal_places,
+                |v, dp| {
+                    let rounded = round_decimal_or_zero(
+                        v,
+                        *input_precision,
+                        *scale,
+                        *new_scale,
+                        dp,
+                    )?;
+                    if *precision == Decimal32Type::MAX_PRECISION
+                        && (decimal_places_is_array || (*scale == 0 && dp < 0))
+                    {
+                        // If we're already at max precision, we can't widen the result type. For
+                        // dp arrays, or for scale == 0 with negative dp, rounding can overflow the
+                        // fixed-precision type. Validate per-row and return an error instead of
+                        // producing an invalid decimal that Arrow may display incorrectly.
+                        validate_decimal_precision::<Decimal32Type>(
+                            rounded, *precision, *new_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }
+                },
+                *precision,
+                *new_scale,
+            )?;
+            result as _
+        }
+        (Decimal64(input_precision, scale), Decimal64(precision, new_scale)) => {
+            let result = calculate_binary_decimal_math::<
+                Decimal64Type,
+                Int32Type,
+                Decimal64Type,
+                _,
+            >(
+                value_array.as_ref(),
+                decimal_places,
+                |v, dp| {
+                    let rounded = round_decimal_or_zero(
+                        v,
+                        *input_precision,
+                        *scale,
+                        *new_scale,
+                        dp,
+                    )?;
+                    if *precision == Decimal64Type::MAX_PRECISION
+                        && (decimal_places_is_array || (*scale == 0 && dp < 0))
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal64Type>(
+                            rounded, *precision, *new_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }
+                },
+                *precision,
+                *new_scale,
+            )?;
+            result as _
+        }
+        (Decimal128(input_precision, scale), Decimal128(precision, new_scale)) => {
+            let result = calculate_binary_decimal_math::<
+                Decimal128Type,
+                Int32Type,
+                Decimal128Type,
+                _,
+            >(
+                value_array.as_ref(),
+                decimal_places,
+                |v, dp| {
+                    let rounded = round_decimal_or_zero(
+                        v,
+                        *input_precision,
+                        *scale,
+                        *new_scale,
+                        dp,
+                    )?;
+                    if *precision == Decimal128Type::MAX_PRECISION
+                        && (decimal_places_is_array || (*scale == 0 && dp < 0))
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal128Type>(
+                            rounded, *precision, *new_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }
+                },
+                *precision,
+                *new_scale,
+            )?;
+            result as _
+        }
+        (Decimal256(input_precision, scale), Decimal256(precision, new_scale)) => {
+            let result = calculate_binary_decimal_math::<
+                Decimal256Type,
+                Int32Type,
+                Decimal256Type,
+                _,
+            >(
+                value_array.as_ref(),
+                decimal_places,
+                |v, dp| {
+                    let rounded = round_decimal_or_zero(
+                        v,
+                        *input_precision,
+                        *scale,
+                        *new_scale,
+                        dp,
+                    )?;
+                    if *precision == Decimal256Type::MAX_PRECISION
+                        && (decimal_places_is_array || (*scale == 0 && dp < 0))
+                    {
+                        // See Decimal32 branch for details.
+                        validate_decimal_precision::<Decimal256Type>(
+                            rounded, *precision, *new_scale,
+                        )
+                    } else {
+                        Ok(rounded)
+                    }
+                },
+                *precision,
+                *new_scale,
+            )?;
+            result as _
+        }
+        (other, _) => exec_err!("Unsupported data type {other:?} for function round")?,
+    };
+
+    if both_scalars {
+        ScalarValue::try_from_array(&arr, 0).map(ColumnarValue::Scalar)
+    } else {
+        Ok(ColumnarValue::Array(arr))
     }
+}
 
-    let mut decimal_places = ColumnarValue::Scalar(ScalarValue::Int64(Some(0)));
+fn round_float<T>(value: T, decimal_places: i32) -> Result<T, ArrowError>
+where
+    T: num_traits::Float,
+{
+    let factor = T::from(10_f64.powi(decimal_places)).ok_or_else(|| {
+        ArrowError::ComputeError(format!(
+            "Invalid value for decimal places: {decimal_places}"
+        ))
+    })?;
+    Ok((value * factor).round() / factor)
+}
 
-    if args.len() == 2 {
-        decimal_places = ColumnarValue::Array(Arc::clone(&args[1]));
+fn round_decimal<V: ArrowNativeTypeOp>(
+    value: V,
+    input_scale: i8,
+    output_scale: i8,
+    decimal_places: i32,
+) -> Result<V, ArrowError> {
+    let diff = i64::from(input_scale) - i64::from(decimal_places);
+    if diff <= 0 {
+        return Ok(value);
     }
 
-    match args[0].data_type() {
-        Float64 => match decimal_places {
-            ColumnarValue::Scalar(ScalarValue::Int64(Some(decimal_places))) => {
-                let decimal_places: i32 = decimal_places.try_into().map_err(|e| {
-                    exec_datafusion_err!(
-                        "Invalid value for decimal places: {decimal_places}: {e}"
-                    )
-                })?;
-
-                let result = args[0]
-                    .as_primitive::<Float64Type>()
-                    .unary::<_, Float64Type>(|value: f64| {
-                        (value * 10.0_f64.powi(decimal_places)).round()
-                            / 10.0_f64.powi(decimal_places)
-                    });
-                Ok(Arc::new(result) as _)
-            }
-            ColumnarValue::Array(decimal_places) => {
-                let options = CastOptions {
-                    safe: false, // raise error if the cast is not possible
-                    ..Default::default()
-                };
-                let decimal_places = cast_with_options(&decimal_places, &Int32, &options)
-                    .map_err(|e| {
-                        exec_datafusion_err!("Invalid values for decimal places: {e}")
-                    })?;
-
-                let values = args[0].as_primitive::<Float64Type>();
-                let decimal_places = decimal_places.as_primitive::<Int32Type>();
-                let result = arrow::compute::binary::<_, _, _, Float64Type>(
-                    values,
-                    decimal_places,
-                    |value, decimal_places| {
-                        (value * 10.0_f64.powi(decimal_places)).round()
-                            / 10.0_f64.powi(decimal_places)
-                    },
-                )?;
-                Ok(Arc::new(result) as _)
-            }
-            _ => {
-                exec_err!("round function requires a scalar or array for decimal_places")
-            }
-        },
+    debug_assert!(diff <= i64::from(u32::MAX));
+    let diff = diff as u32;
+
+    let one = V::ONE;
+    let two = V::from_usize(2).ok_or_else(|| {
+        ArrowError::ComputeError("Internal error: could not create constant 2".into())
+    })?;
+    let ten = V::from_usize(10).ok_or_else(|| {
+        ArrowError::ComputeError("Internal error: could not create constant 10".into())
+    })?;
+
+    let factor = ten.pow_checked(diff).map_err(|_| {
+        ArrowError::ComputeError(format!(
+            "Overflow while rounding decimal with scale {input_scale} and decimal places {decimal_places}"
+        ))
+    })?;
+
+    let mut quotient = value.div_wrapping(factor);
+    let remainder = value.mod_wrapping(factor);
+
+    // `factor` is an even number (10^n, n > 0), so `factor / 2` is the tie threshold
+    let threshold = factor.div_wrapping(two);
+    if remainder >= threshold {
+        quotient = quotient.add_checked(one).map_err(|_| {
+            ArrowError::ComputeError("Overflow while rounding decimal".into())
+        })?;
+    } else if remainder <= threshold.neg_wrapping() {
+        quotient = quotient.sub_checked(one).map_err(|_| {
+            ArrowError::ComputeError("Overflow while rounding decimal".into())
+        })?;
+    }
 
-        Float32 => match decimal_places {
-            ColumnarValue::Scalar(ScalarValue::Int64(Some(decimal_places))) => {
-                let decimal_places: i32 = decimal_places.try_into().map_err(|e| {
-                    exec_datafusion_err!(
-                        "Invalid value for decimal places: {decimal_places}: {e}"
-                    )
-                })?;
-                let result = args[0]
-                    .as_primitive::<Float32Type>()
-                    .unary::<_, Float32Type>(|value: f32| {
-                        (value * 10.0_f32.powi(decimal_places)).round()
-                            / 10.0_f32.powi(decimal_places)
-                    });
-                Ok(Arc::new(result) as _)
-            }
-            ColumnarValue::Array(_) => {
-                let ColumnarValue::Array(decimal_places) =
-                    decimal_places.cast_to(&Int32, None).map_err(|e| {
-                        exec_datafusion_err!("Invalid values for decimal places: {e}")
-                    })?
-                else {
-                    panic!("Unexpected result of ColumnarValue::Array.cast")
-                };
-
-                let values = args[0].as_primitive::<Float32Type>();
-                let decimal_places = decimal_places.as_primitive::<Int32Type>();
-                let result: PrimitiveArray<Float32Type> = arrow::compute::binary(
-                    values,
-                    decimal_places,
-                    |value, decimal_places| {
-                        (value * 10.0_f32.powi(decimal_places)).round()
-                            / 10.0_f32.powi(decimal_places)
-                    },
-                )?;
-                Ok(Arc::new(result) as _)
-            }
-            _ => {
-                exec_err!("round function requires a scalar or array for decimal_places")
-            }
-        },
+    // `quotient` is the rounded value at scale `decimal_places`. Rescale to the desired
+    // `output_scale` (which is always >= `decimal_places` in cases where diff > 0).
+    let scale_shift = i64::from(output_scale) - i64::from(decimal_places);
+    if scale_shift == 0 {
+        return Ok(quotient);
+    }
+
+    debug_assert!(scale_shift > 0);
+    debug_assert!(scale_shift <= i64::from(u32::MAX));
+    let scale_shift = scale_shift as u32;
+    let shift_factor = ten.pow_checked(scale_shift).map_err(|_| {
+        ArrowError::ComputeError(format!(
+            "Overflow while rounding decimal with scale {input_scale} and decimal places {decimal_places}"
+        ))
+    })?;
+    quotient
+        .mul_checked(shift_factor)
+        .map_err(|_| ArrowError::ComputeError("Overflow while rounding decimal".into()))
+}
 
-        other => exec_err!("Unsupported data type {other:?} for function round"),
+fn round_decimal_or_zero<V: ArrowNativeTypeOp>(
+    value: V,
+    precision: u8,
+    input_scale: i8,
+    output_scale: i8,
+    decimal_places: i32,
+) -> Result<V, ArrowError> {
+    if let Some(dp) =
+        normalize_decimal_places_for_decimal(decimal_places, precision, input_scale)
+    {
+        round_decimal(value, input_scale, output_scale, dp)
+    } else {
+        V::from_usize(0).ok_or_else(|| {
+            ArrowError::ComputeError("Internal error: could not create constant 0".into())
+        })
     }
 }
 
@@ -223,11 +733,33 @@ pub fn round(args: &[ArrayRef]) -> Result<ArrayRef> {
 mod test {
     use std::sync::Arc;
 
-    use crate::math::round::round;
-
     use arrow::array::{ArrayRef, Float32Array, Float64Array, Int64Array};
-    use datafusion_common::cast::{as_float32_array, as_float64_array};
     use datafusion_common::DataFusionError;
+    use datafusion_common::ScalarValue;
+    use datafusion_common::cast::{as_float32_array, as_float64_array};
+    use datafusion_expr::ColumnarValue;
+
+    fn round_arrays(
+        value: ArrayRef,
+        decimal_places: Option<ArrayRef>,
+    ) -> Result<ArrayRef, DataFusionError> {
+        let number_rows = value.len();
+        // NOTE: For decimal inputs, the actual ROUND return type can differ from the
+        // input type (scale reduction for literal `decimal_places`). These unit tests
+        // only exercise Float32/Float64 behavior.
+        let return_type = value.data_type().clone();
+        let value = ColumnarValue::Array(value);
+        let decimal_places = decimal_places
+            .map(ColumnarValue::Array)
+            .unwrap_or_else(|| ColumnarValue::Scalar(ScalarValue::Int32(Some(0))));
+
+        let result =
+            super::round_columnar(&value, &decimal_places, number_rows, &return_type)?;
+        match result {
+            ColumnarValue::Array(array) => Ok(array),
+            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(1),
+        }
+    }
 
     #[test]
     fn test_round_f32() {
@@ -236,7 +768,8 @@ mod test {
             Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4, 5, -1, -2, -3, -4])), // decimal_places
         ];
 
-        let result = round(&args).expect("failed to initialize function round");
+        let result = round_arrays(Arc::clone(&args[0]), Some(Arc::clone(&args[1])))
+            .expect("failed to initialize function round");
         let floats =
             as_float32_array(&result).expect("failed to initialize function round");
 
@@ -254,7 +787,8 @@ mod test {
             Arc::new(Int64Array::from(vec![0, 1, 2, 3, 4, 5, -1, -2, -3, -4])), // decimal_places
         ];
 
-        let result = round(&args).expect("failed to initialize function round");
+        let result = round_arrays(Arc::clone(&args[0]), Some(Arc::clone(&args[1])))
+            .expect("failed to initialize function round");
         let floats =
             as_float64_array(&result).expect("failed to initialize function round");
 
@@ -271,7 +805,8 @@ mod test {
             Arc::new(Float32Array::from(vec![125.2345, 12.345, 1.234, 0.1234])), // input
         ];
 
-        let result = round(&args).expect("failed to initialize function round");
+        let result = round_arrays(Arc::clone(&args[0]), None)
+            .expect("failed to initialize function round");
         let floats =
             as_float32_array(&result).expect("failed to initialize function round");
 
@@ -286,7 +821,8 @@ mod test {
             Arc::new(Float64Array::from(vec![125.2345, 12.345, 1.234, 0.1234])), // input
         ];
 
-        let result = round(&args).expect("failed to initialize function round");
+        let result = round_arrays(Arc::clone(&args[0]), None)
+            .expect("failed to initialize function round");
         let floats =
             as_float64_array(&result).expect("failed to initialize function round");
 
@@ -302,9 +838,12 @@ mod test {
             Arc::new(Int64Array::from(vec![2147483648])), // decimal_places
         ];
 
-        let result = round(&args);
+        let result = round_arrays(Arc::clone(&args[0]), Some(Arc::clone(&args[1])));
 
         assert!(result.is_err());
-        assert!(matches!(result, Err(DataFusionError::Execution { .. })));
+        assert!(matches!(
+            result,
+            Err(DataFusionError::ArrowError(_, _)) | Err(DataFusionError::Execution(_))
+        ));
     }
 }
diff --git a/datafusion/functions/src/math/signum.rs b/datafusion/functions/src/math/signum.rs
index ec6ef5a78c6a7..8a3769a12f294 100644
--- a/datafusion/functions/src/math/signum.rs
+++ b/datafusion/functions/src/math/signum.rs
@@ -18,11 +18,12 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, AsArray};
+use arrow::array::AsArray;
 use arrow::datatypes::DataType::{Float32, Float64};
 use arrow::datatypes::{DataType, Float32Type, Float64Type};
 
-use datafusion_common::{exec_err, Result};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -30,17 +31,23 @@ use datafusion_expr::{
 };
 use datafusion_macros::user_doc;
 
-use crate::utils::make_scalar_function;
-
 #[user_doc(
     doc_section(label = "Math Functions"),
     description = r#"Returns the sign of a number.
 Negative numbers return `-1`.
 Zero and positive numbers return `1`."#,
     syntax_example = "signum(numeric_expression)",
-    standard_argument(name = "numeric_expression", prefix = "Numeric")
+    standard_argument(name = "numeric_expression", prefix = "Numeric"),
+    sql_example = r#"```sql
+> SELECT signum(-42);
++-------------+
+| signum(-42) |
++-------------+
+| -1          |
++-------------+
+```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SignumFunc {
     signature: Signature,
 }
@@ -90,7 +97,53 @@ impl ScalarUDFImpl for SignumFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(signum, vec![])(&args.args)
+        let return_type = args.return_type().clone();
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return ColumnarValue::Scalar(ScalarValue::Null)
+                        .cast_to(&return_type, None);
+                }
+
+                match scalar {
+                    ScalarValue::Float64(Some(v)) => {
+                        let result = if v == 0.0 { 0.0 } else { v.signum() };
+                        Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(result))))
+                    }
+                    ScalarValue::Float32(Some(v)) => {
+                        let result = if v == 0.0 { 0.0 } else { v.signum() };
+                        Ok(ColumnarValue::Scalar(ScalarValue::Float32(Some(result))))
+                    }
+                    _ => {
+                        internal_err!(
+                            "Unexpected scalar type for signum: {:?}",
+                            scalar.data_type()
+                        )
+                    }
+                }
+            }
+            ColumnarValue::Array(array) => match array.data_type() {
+                Float64 => Ok(ColumnarValue::Array(Arc::new(
+                    array.as_primitive::<Float64Type>().unary::<_, Float64Type>(
+                        |x: f64| {
+                            if x == 0.0 { 0.0 } else { x.signum() }
+                        },
+                    ),
+                ))),
+                Float32 => Ok(ColumnarValue::Array(Arc::new(
+                    array.as_primitive::<Float32Type>().unary::<_, Float32Type>(
+                        |x: f32| {
+                            if x == 0.0 { 0.0 } else { x.signum() }
+                        },
+                    ),
+                ))),
+                other => {
+                    internal_err!("Unsupported data type {other:?} for function signum")
+                }
+            },
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -98,41 +151,6 @@ impl ScalarUDFImpl for SignumFunc {
     }
 }
 
-/// signum SQL function
-pub fn signum(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        Float64 => Ok(Arc::new(
-            args[0]
-                .as_primitive::<Float64Type>()
-                .unary::<_, Float64Type>(
-                    |x: f64| {
-                        if x == 0_f64 {
-                            0_f64
-                        } else {
-                            x.signum()
-                        }
-                    },
-                ),
-        ) as ArrayRef),
-
-        Float32 => Ok(Arc::new(
-            args[0]
-                .as_primitive::<Float32Type>()
-                .unary::<_, Float32Type>(
-                    |x: f32| {
-                        if x == 0_f32 {
-                            0_f32
-                        } else {
-                            x.signum()
-                        }
-                    },
-                ),
-        ) as ArrayRef),
-
-        other => exec_err!("Unsupported data type {other:?} for function signum"),
-    }
-}
-
 #[cfg(test)]
 mod test {
     use std::sync::Arc;
@@ -140,6 +158,7 @@ mod test {
     use arrow::array::{ArrayRef, Float32Array, Float64Array};
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::cast::{as_float32_array, as_float64_array};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
     use crate::math::signum::SignumFunc;
@@ -163,6 +182,7 @@ mod test {
             arg_fields,
             number_rows: array.len(),
             return_field: Field::new("f", DataType::Float32, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = SignumFunc::new()
             .invoke_with_args(args)
@@ -209,6 +229,7 @@ mod test {
             arg_fields,
             number_rows: array.len(),
             return_field: Field::new("f", DataType::Float64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
         let result = SignumFunc::new()
             .invoke_with_args(args)
diff --git a/datafusion/functions/src/math/trunc.rs b/datafusion/functions/src/math/trunc.rs
index 2ac291204a0bc..ecdad22e8af11 100644
--- a/datafusion/functions/src/math/trunc.rs
+++ b/datafusion/functions/src/math/trunc.rs
@@ -24,9 +24,9 @@ use arrow::array::{ArrayRef, AsArray, PrimitiveArray};
 use arrow::datatypes::DataType::{Float32, Float64};
 use arrow::datatypes::{DataType, Float32Type, Float64Type, Int64Type};
 use datafusion_common::ScalarValue::Int64;
-use datafusion_common::{exec_err, Result};
-use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_common::{Result, ScalarValue, exec_err};
 use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     Volatility,
@@ -45,9 +45,18 @@ use datafusion_macros::user_doc;
   `decimal_places` is a positive integer, truncates digits to the
   right of the decimal point. If `decimal_places` is a negative
   integer, replaces digits to the left of the decimal point with `0`."#
-    )
+    ),
+    sql_example = r#"
+  ```sql
+  > SELECT trunc(42.738);
+  +----------------+
+  | trunc(42.738)  |
+  +----------------+
+  | 42             |
+  +----------------+
+  ```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct TruncFunc {
     signature: Signature,
 }
@@ -101,7 +110,50 @@ impl ScalarUDFImpl for TruncFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(trunc, vec![])(&args.args)
+        // Extract precision from second argument (default 0)
+        let precision = match args.args.get(1) {
+            Some(ColumnarValue::Scalar(Int64(Some(p)))) => Some(*p),
+            Some(ColumnarValue::Scalar(Int64(None))) => None, // null precision
+            Some(ColumnarValue::Array(_)) => {
+                // Precision is an array - use array path
+                return make_scalar_function(trunc, vec![])(&args.args);
+            }
+            None => Some(0), // default precision
+            Some(cv) => {
+                return exec_err!(
+                    "trunc function requires precision to be Int64, got {:?}",
+                    cv.data_type()
+                );
+            }
+        };
+
+        // Scalar fast path using tuple matching for (value, precision)
+        match (&args.args[0], precision) {
+            // Null cases
+            (ColumnarValue::Scalar(sv), _) if sv.is_null() => {
+                ColumnarValue::Scalar(ScalarValue::Null).cast_to(args.return_type(), None)
+            }
+            (_, None) => {
+                ColumnarValue::Scalar(ScalarValue::Null).cast_to(args.return_type(), None)
+            }
+            // Scalar cases
+            (ColumnarValue::Scalar(ScalarValue::Float64(Some(v))), Some(p)) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Float64(Some(if p == 0 {
+                    v.trunc()
+                } else {
+                    compute_truncate64(*v, p)
+                }))),
+            ),
+            (ColumnarValue::Scalar(ScalarValue::Float32(Some(v))), Some(p)) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Float32(Some(if p == 0 {
+                    v.trunc()
+                } else {
+                    compute_truncate32(*v, p)
+                }))),
+            ),
+            // Array path for everything else
+            _ => make_scalar_function(trunc, vec![])(&args.args),
+        }
     }
 
     fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
@@ -149,11 +201,7 @@ fn trunc(args: &[ArrayRef]) -> Result<ArrayRef> {
                     args[0]
                         .as_primitive::<Float64Type>()
                         .unary::<_, Float64Type>(|x: f64| {
-                            if x == 0_f64 {
-                                0_f64
-                            } else {
-                                x.trunc()
-                            }
+                            if x == 0_f64 { 0_f64 } else { x.trunc() }
                         }),
                 ) as ArrayRef)
             }
@@ -175,11 +223,7 @@ fn trunc(args: &[ArrayRef]) -> Result<ArrayRef> {
                     args[0]
                         .as_primitive::<Float32Type>()
                         .unary::<_, Float32Type>(|x: f32| {
-                            if x == 0_f32 {
-                                0_f32
-                            } else {
-                                x.trunc()
-                            }
+                            if x == 0_f32 { 0_f32 } else { x.trunc() }
                         }),
                 ) as ArrayRef)
             }
@@ -201,12 +245,12 @@ fn trunc(args: &[ArrayRef]) -> Result<ArrayRef> {
 
 fn compute_truncate32(x: f32, y: i64) -> f32 {
     let factor = 10.0_f32.powi(y as i32);
-    (x * factor).round() / factor
+    (x * factor).trunc() / factor
 }
 
 fn compute_truncate64(x: f64, y: i64) -> f64 {
     let factor = 10.0_f64.powi(y as i32);
-    (x * factor).round() / factor
+    (x * factor).trunc() / factor
 }
 
 #[cfg(test)]
@@ -237,9 +281,9 @@ mod test {
 
         assert_eq!(floats.len(), 5);
         assert_eq!(floats.value(0), 15.0);
-        assert_eq!(floats.value(1), 1_234.268);
+        assert_eq!(floats.value(1), 1_234.267);
         assert_eq!(floats.value(2), 1_233.12);
-        assert_eq!(floats.value(3), 3.312_98);
+        assert_eq!(floats.value(3), 3.312_97);
         assert_eq!(floats.value(4), -21.123_4);
     }
 
@@ -262,9 +306,9 @@ mod test {
 
         assert_eq!(floats.len(), 5);
         assert_eq!(floats.value(0), 5.0);
-        assert_eq!(floats.value(1), 234.268);
+        assert_eq!(floats.value(1), 234.267);
         assert_eq!(floats.value(2), 123.12);
-        assert_eq!(floats.value(3), 123.312_98);
+        assert_eq!(floats.value(3), 123.312_97);
         assert_eq!(floats.value(4), -321.123_1);
     }
 
diff --git a/datafusion/functions/src/planner.rs b/datafusion/functions/src/planner.rs
index 93edec7ece307..9854326945e95 100644
--- a/datafusion/functions/src/planner.rs
+++ b/datafusion/functions/src/planner.rs
@@ -19,14 +19,19 @@
 
 use datafusion_common::Result;
 use datafusion_expr::{
+    Expr,
     expr::ScalarFunction,
     planner::{ExprPlanner, PlannerResult},
-    Expr,
 };
 
+#[deprecated(
+    since = "50.0.0",
+    note = "Use UnicodeFunctionPlanner and DateTimeFunctionPlanner instead"
+)]
 #[derive(Default, Debug)]
 pub struct UserDefinedFunctionPlanner;
 
+#[expect(deprecated)]
 impl ExprPlanner for UserDefinedFunctionPlanner {
     #[cfg(feature = "datetime_expressions")]
     fn plan_extract(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs
index 13fbc049af582..75cc5d9514cbd 100644
--- a/datafusion/functions/src/regex/mod.rs
+++ b/datafusion/functions/src/regex/mod.rs
@@ -17,15 +17,20 @@
 
 //! "regex" DataFusion functions
 
+use arrow::error::ArrowError;
+use regex::Regex;
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
 use std::sync::Arc;
-
 pub mod regexpcount;
+pub mod regexpinstr;
 pub mod regexplike;
 pub mod regexpmatch;
 pub mod regexpreplace;
 
 // create UDFs
 make_udf_function!(regexpcount::RegexpCountFunc, regexp_count);
+make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr);
 make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match);
 make_udf_function!(regexplike::RegexpLikeFunc, regexp_like);
 make_udf_function!(regexpreplace::RegexpReplaceFunc, regexp_replace);
@@ -60,7 +65,35 @@ pub mod expr_fn {
         super::regexp_match().call(args)
     }
 
-    /// Returns true if a has at least one match in a string, false otherwise.
+    /// Returns index of regular expression matches in a string.
+    pub fn regexp_instr(
+        values: Expr,
+        regex: Expr,
+        start: Option<Expr>,
+        n: Option<Expr>,
+        endoption: Option<Expr>,
+        flags: Option<Expr>,
+        subexpr: Option<Expr>,
+    ) -> Expr {
+        let mut args = vec![values, regex];
+        if let Some(start) = start {
+            args.push(start);
+        };
+        if let Some(n) = n {
+            args.push(n);
+        };
+        if let Some(endoption) = endoption {
+            args.push(endoption);
+        };
+        if let Some(flags) = flags {
+            args.push(flags);
+        };
+        if let Some(subexpr) = subexpr {
+            args.push(subexpr);
+        };
+        super::regexp_instr().call(args)
+    }
+    /// Returns true if a regex has at least one match in a string, false otherwise.
     pub fn regexp_like(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
         let mut args = vec![values, regex];
         if let Some(flags) = flags {
@@ -89,7 +122,45 @@ pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
     vec![
         regexp_count(),
         regexp_match(),
+        regexp_instr(),
         regexp_like(),
         regexp_replace(),
     ]
 }
+
+pub fn compile_and_cache_regex<'strings, 'cache>(
+    regex: &'strings str,
+    flags: Option<&'strings str>,
+    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
+) -> Result<&'cache Regex, ArrowError>
+where
+    'strings: 'cache,
+{
+    let result = match regex_cache.entry((regex, flags)) {
+        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
+        Entry::Vacant(vacant_entry) => {
+            let compiled = compile_regex(regex, flags)?;
+            vacant_entry.insert(compiled)
+        }
+    };
+    Ok(result)
+}
+
+pub fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
+    let pattern = match flags {
+        None | Some("") => regex.to_string(),
+        Some(flags) => {
+            if flags.contains("g") {
+                return Err(ArrowError::ComputeError(
+                    "regexp_count()/regexp_instr() does not support the global flag"
+                        .to_string(),
+                ));
+            }
+            format!("(?{flags}){regex}")
+        }
+    };
+
+    Regex::new(&pattern).map_err(|_| {
+        ArrowError::ComputeError(format!("Regular expression did not compile: {pattern}"))
+    })
+}
diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs
index 52ab3d489ee31..30257e62cc68f 100644
--- a/datafusion/functions/src/regex/regexpcount.rs
+++ b/datafusion/functions/src/regex/regexpcount.rs
@@ -15,21 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::regex::{compile_and_cache_regex, compile_regex};
 use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, StringArrayType};
 use arrow::datatypes::{DataType, Int64Type};
 use arrow::datatypes::{
     DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
 };
 use arrow::error::ArrowError;
-use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature::Exact,
-    TypeSignature::Uniform, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature::Exact, TypeSignature::Uniform, Volatility,
 };
 use datafusion_macros::user_doc;
 use itertools::izip;
 use regex::Regex;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Arc;
 
@@ -61,7 +61,7 @@ use std::sync::Arc;
   - **U**: swap the meaning of x* and x*?"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RegexpCountFunc {
     signature: Signature,
 }
@@ -108,10 +108,7 @@ impl ScalarUDFImpl for RegexpCountFunc {
         Ok(Int64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
 
         let len = args
@@ -146,7 +143,9 @@ impl ScalarUDFImpl for RegexpCountFunc {
 pub fn regexp_count_func(args: &[ArrayRef]) -> Result<ArrayRef> {
     let args_len = args.len();
     if !(2..=4).contains(&args_len) {
-        return exec_err!("regexp_count was called with {args_len} arguments. It requires at least 2 and at most 4.");
+        return exec_err!(
+            "regexp_count was called with {args_len} arguments. It requires at least 2 and at most 4."
+        );
     }
 
     let values = &args[0];
@@ -183,7 +182,7 @@ pub fn regexp_count_func(args: &[ArrayRef]) -> Result<ArrayRef> {
 ///
 /// # Errors
 /// Returns an error if the input arrays have mismatched lengths or if the regular expression fails to compile.
-pub fn regexp_count(
+fn regexp_count(
     values: &dyn Array,
     regex_array: &dyn Datum,
     start_array: Option<&dyn Datum>,
@@ -201,8 +200,8 @@ pub fn regexp_count(
 
     match (values.data_type(), regex_array.data_type(), flags_array) {
         (Utf8, Utf8, None) => regexp_count_inner(
-            values.as_string::<i32>(),
-            regex_array.as_string::<i32>(),
+            &values.as_string::<i32>(),
+            &regex_array.as_string::<i32>(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
@@ -210,17 +209,17 @@ pub fn regexp_count(
             is_flags_scalar,
         ),
         (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => regexp_count_inner(
-            values.as_string::<i32>(),
-            regex_array.as_string::<i32>(),
+            &values.as_string::<i32>(),
+            &regex_array.as_string::<i32>(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
-            Some(flags_array.as_string::<i32>()),
+            Some(&flags_array.as_string::<i32>()),
             is_flags_scalar,
         ),
         (LargeUtf8, LargeUtf8, None) => regexp_count_inner(
-            values.as_string::<i64>(),
-            regex_array.as_string::<i64>(),
+            &values.as_string::<i64>(),
+            &regex_array.as_string::<i64>(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
@@ -228,17 +227,17 @@ pub fn regexp_count(
             is_flags_scalar,
         ),
         (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() == LargeUtf8 => regexp_count_inner(
-            values.as_string::<i64>(),
-            regex_array.as_string::<i64>(),
+            &values.as_string::<i64>(),
+            &regex_array.as_string::<i64>(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
-            Some(flags_array.as_string::<i64>()),
+            Some(&flags_array.as_string::<i64>()),
             is_flags_scalar,
         ),
         (Utf8View, Utf8View, None) => regexp_count_inner(
-            values.as_string_view(),
-            regex_array.as_string_view(),
+            &values.as_string_view(),
+            &regex_array.as_string_view(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
@@ -246,12 +245,12 @@ pub fn regexp_count(
             is_flags_scalar,
         ),
         (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == Utf8View => regexp_count_inner(
-            values.as_string_view(),
-            regex_array.as_string_view(),
+            &values.as_string_view(),
+            &regex_array.as_string_view(),
             is_regex_scalar,
             start_array.map(|start| start.as_primitive::<Int64Type>()),
             is_start_scalar,
-            Some(flags_array.as_string_view()),
+            Some(&flags_array.as_string_view()),
             is_flags_scalar,
         ),
         _ => Err(ArrowError::ComputeError(
@@ -260,13 +259,13 @@ pub fn regexp_count(
     }
 }
 
-pub fn regexp_count_inner<'a, S>(
-    values: S,
-    regex_array: S,
+fn regexp_count_inner<'a, S>(
+    values: &S,
+    regex_array: &S,
     is_regex_scalar: bool,
     start_array: Option<&Int64Array>,
     is_start_scalar: bool,
-    flags_array: Option<S>,
+    flags_array: Option<&S>,
     is_flags_scalar: bool,
 ) -> Result<ArrayRef, ArrowError>
 where
@@ -306,7 +305,7 @@ where
         (true, true, true) => {
             let regex = match regex_scalar {
                 None | Some("") => {
-                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])))
+                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
                 }
                 Some(regex) => regex,
             };
@@ -323,7 +322,7 @@ where
         (true, true, false) => {
             let regex = match regex_scalar {
                 None | Some("") => {
-                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])))
+                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
                 }
                 Some(regex) => regex,
             };
@@ -352,7 +351,7 @@ where
         (true, false, true) => {
             let regex = match regex_scalar {
                 None | Some("") => {
-                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])))
+                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
                 }
                 Some(regex) => regex,
             };
@@ -372,7 +371,7 @@ where
         (true, false, false) => {
             let regex = match regex_scalar {
                 None | Some("") => {
-                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])))
+                    return Ok(Arc::new(Int64Array::from(vec![0; values.len()])));
                 }
                 Some(regex) => regex,
             };
@@ -550,42 +549,6 @@ where
     }
 }
 
-fn compile_and_cache_regex<'strings, 'cache>(
-    regex: &'strings str,
-    flags: Option<&'strings str>,
-    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
-) -> Result<&'cache Regex, ArrowError>
-where
-    'strings: 'cache,
-{
-    let result = match regex_cache.entry((regex, flags)) {
-        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
-        Entry::Vacant(vacant_entry) => {
-            let compiled = compile_regex(regex, flags)?;
-            vacant_entry.insert(compiled)
-        }
-    };
-    Ok(result)
-}
-
-fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
-    let pattern = match flags {
-        None | Some("") => regex.to_string(),
-        Some(flags) => {
-            if flags.contains("g") {
-                return Err(ArrowError::ComputeError(
-                    "regexp_count() does not support global flag".to_string(),
-                ));
-            }
-            format!("(?{flags}){regex}")
-        }
-    };
-
-    Regex::new(&pattern).map_err(|_| {
-        ArrowError::ComputeError(format!("Regular expression did not compile: {pattern}"))
-    })
-}
-
 fn count_matches(
     value: Option<&str>,
     pattern: &Regex,
@@ -603,8 +566,16 @@ fn count_matches(
             ));
         }
 
-        let find_slice = value.chars().skip(start as usize - 1).collect::<String>();
-        let count = pattern.find_iter(find_slice.as_str()).count();
+        // Find the byte offset for the start position (1-based character index)
+        let byte_offset = value
+            .char_indices()
+            .nth((start as usize).saturating_sub(1))
+            .map(|(idx, _)| idx)
+            .unwrap_or(value.len());
+
+        // Use string slicing instead of collecting chars into a new String
+        let find_slice = &value[byte_offset..];
+        let count = pattern.find_iter(find_slice).count();
         Ok(count as i64)
     } else {
         let count = pattern.find_iter(value).count();
@@ -617,7 +588,7 @@ mod tests {
     use super::*;
     use arrow::array::{GenericStringArray, StringViewArray};
     use arrow::datatypes::Field;
-    use datafusion_expr::ScalarFunctionArgs;
+    use datafusion_common::config::ConfigOptions;
 
     #[test]
     fn test_regexp_count() {
@@ -662,6 +633,7 @@ mod tests {
             arg_fields,
             number_rows: args.len(),
             return_field: Field::new("f", Int64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         })
     }
 
@@ -876,9 +848,9 @@ mod tests {
         values.iter().enumerate().for_each(|(pos, &v)| {
             // utf8
             let v_sv = ScalarValue::Utf8(Some(v.to_string()));
-            let regex_sv = ScalarValue::Utf8(regex.get(pos).map(|s| s.to_string()));
+            let regex_sv = ScalarValue::Utf8(regex.get(pos).map(|s| (*s).to_string()));
             let start_sv = ScalarValue::Int64(Some(start));
-            let flags_sv = ScalarValue::Utf8(flags.get(pos).map(|f| f.to_string()));
+            let flags_sv = ScalarValue::Utf8(flags.get(pos).map(|f| (*f).to_string()));
             let expected = expected.get(pos).cloned();
             let re = regexp_count_with_scalar_values(&[
                 v_sv,
@@ -895,8 +867,10 @@ mod tests {
 
             // largeutf8
             let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
-            let regex_sv = ScalarValue::LargeUtf8(regex.get(pos).map(|s| s.to_string()));
-            let flags_sv = ScalarValue::LargeUtf8(flags.get(pos).map(|f| f.to_string()));
+            let regex_sv =
+                ScalarValue::LargeUtf8(regex.get(pos).map(|s| (*s).to_string()));
+            let flags_sv =
+                ScalarValue::LargeUtf8(flags.get(pos).map(|f| (*f).to_string()));
             let re = regexp_count_with_scalar_values(&[
                 v_sv,
                 regex_sv,
@@ -912,8 +886,10 @@ mod tests {
 
             // utf8view
             let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
-            let regex_sv = ScalarValue::Utf8View(regex.get(pos).map(|s| s.to_string()));
-            let flags_sv = ScalarValue::Utf8View(flags.get(pos).map(|f| f.to_string()));
+            let regex_sv =
+                ScalarValue::Utf8View(regex.get(pos).map(|s| (*s).to_string()));
+            let flags_sv =
+                ScalarValue::Utf8View(flags.get(pos).map(|f| (*f).to_string()));
             let re = regexp_count_with_scalar_values(&[
                 v_sv,
                 regex_sv,
diff --git a/datafusion/functions/src/regex/regexpinstr.rs b/datafusion/functions/src/regex/regexpinstr.rs
new file mode 100644
index 0000000000000..608310ce6ee0c
--- /dev/null
+++ b/datafusion/functions/src/regex/regexpinstr.rs
@@ -0,0 +1,821 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, AsArray, Datum, Int64Array, PrimitiveArray, StringArrayType,
+};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature::Exact, TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags[, subexpr]]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) to return the position for. Defaults to 0, which returns the position of the entire match."
+    )
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8, Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.to_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_instr_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!(
+            "regexp_instr was called with {args_len} arguments. It requires at least 2 and at most 6."
+        );
+    }
+
+    let values = &args[0];
+    match values.data_type() {
+        Utf8 | LargeUtf8 | Utf8View => (),
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function regexp_instr"
+            );
+        }
+    }
+
+    regexp_instr(
+        values,
+        &args[1],
+        if args_len > 2 { Some(&args[2]) } else { None },
+        if args_len > 3 { Some(&args[3]) } else { None },
+        if args_len > 4 { Some(&args[4]) } else { None },
+        if args_len > 5 { Some(&args[5]) } else { None },
+    )
+    .map_err(|e| e.into())
+}
+
+/// `arrow-rs` style implementation of `regexp_instr` function.
+/// This function `regexp_instr` is responsible for returning the index of a regular expression pattern
+/// within a string array. It supports optional start positions and flags for case insensitivity.
+///
+/// The function accepts a variable number of arguments:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The array of regular expression patterns to search for.
+/// - `start_array` (optional): The array of start positions for the search.
+/// - `nth_array` (optional): The array of start nth for the search.
+/// - `endoption_array` (optional): The array of endoption positions for the search.
+/// - `flags_array` (optional): The array of flags to modify the search behavior (e.g., case insensitivity).
+/// - `subexpr_array` (optional): The array of subexpr positions for the search.
+///
+/// The function handles different combinations of scalar and array inputs for the regex patterns, start positions,
+/// and flags. It uses a cache to store compiled regular expressions for efficiency.
+///
+/// # Errors
+/// Returns an error if the input arrays have mismatched lengths or if the regular expression fails to compile.
+fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    let (regex_array, _) = regex_array.get();
+    let start_array = start_array.map(|start| {
+        let (start, _) = start.get();
+        start
+    });
+    let nth_array = nth_array.map(|nth| {
+        let (nth, _) = nth.get();
+        nth
+    });
+    let flags_array = flags_array.map(|flags| {
+        let (flags, _) = flags.get();
+        flags
+    });
+    let subexpr_array = subexpr_array.map(|subexpr| {
+        let (subexpr, _) = subexpr.get();
+        subexpr
+    });
+
+    match (values.data_type(), regex_array.data_type(), flags_array) {
+        (Utf8, Utf8, None) => regexp_instr_inner(
+            &values.as_string::<i32>(),
+            &regex_array.as_string::<i32>(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            None,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => regexp_instr_inner(
+            &values.as_string::<i32>(),
+            &regex_array.as_string::<i32>(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            Some(flags_array.as_string::<i32>()),
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        (LargeUtf8, LargeUtf8, None) => regexp_instr_inner(
+            &values.as_string::<i64>(),
+            &regex_array.as_string::<i64>(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            None,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() == LargeUtf8 => regexp_instr_inner(
+            &values.as_string::<i64>(),
+            &regex_array.as_string::<i64>(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            Some(flags_array.as_string::<i64>()),
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        (Utf8View, Utf8View, None) => regexp_instr_inner(
+            &values.as_string_view(),
+            &regex_array.as_string_view(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            None,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == Utf8View => regexp_instr_inner(
+            &values.as_string_view(),
+            &regex_array.as_string_view(),
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            Some(flags_array.as_string_view()),
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, LargeUtf8, or Utf8View and the data types of the values, regex_array, and flags_array to match".to_string(),
+        )),
+    }
+}
+
+fn regexp_instr_inner<'a, S>(
+    values: &S,
+    regex_array: &S,
+    start_array: Option<&Int64Array>,
+    nth_array: Option<&Int64Array>,
+    flags_array: Option<S>,
+    subexp_array: Option<&Int64Array>,
+) -> Result<ArrayRef, ArrowError>
+where
+    S: StringArrayType<'a>,
+{
+    let len = values.len();
+
+    let default_start_array = PrimitiveArray::<Int64Type>::from(vec![1; len]);
+    let start_array = start_array.unwrap_or(&default_start_array);
+    let start_input: Vec<i64> = (0..start_array.len())
+        .map(|i| start_array.value(i)) // handle nulls as 0
+        .collect();
+
+    let default_nth_array = PrimitiveArray::<Int64Type>::from(vec![1; len]);
+    let nth_array = nth_array.unwrap_or(&default_nth_array);
+    let nth_input: Vec<i64> = (0..nth_array.len())
+        .map(|i| nth_array.value(i)) // handle nulls as 0
+        .collect();
+
+    let flags_input = match flags_array {
+        Some(flags) => flags.iter().collect(),
+        None => vec![None; len],
+    };
+
+    let default_subexp_array = PrimitiveArray::<Int64Type>::from(vec![0; len]);
+    let subexp_array = subexp_array.unwrap_or(&default_subexp_array);
+    let subexp_input: Vec<i64> = (0..subexp_array.len())
+        .map(|i| subexp_array.value(i)) // handle nulls as 0
+        .collect();
+
+    let mut regex_cache = HashMap::new();
+
+    let result: Result<Vec<Option<i64>>, ArrowError> = izip!(
+        values.iter(),
+        regex_array.iter(),
+        start_input.iter(),
+        nth_input.iter(),
+        flags_input.iter(),
+        subexp_input.iter()
+    )
+    .map(|(value, regex, start, nth, flags, subexp)| match regex {
+        None => Ok(None),
+        Some("") => Ok(Some(0)),
+        Some(regex) => get_index(
+            value,
+            regex,
+            *start,
+            *nth,
+            *subexp,
+            *flags,
+            &mut regex_cache,
+        ),
+    })
+    .collect();
+    Ok(Arc::new(Int64Array::from(result?)))
+}
+
+fn handle_subexp(
+    pattern: &Regex,
+    search_slice: &str,
+    subexpr: i64,
+    value: &str,
+    byte_start_offset: usize,
+) -> Result<Option<i64>, ArrowError> {
+    if let Some(captures) = pattern.captures(search_slice)
+        && let Some(matched) = captures.get(subexpr as usize)
+    {
+        // Convert byte offset relative to search_slice back to 1-based character offset
+        // relative to the original `value` string.
+        let start_char_offset =
+            value[..byte_start_offset + matched.start()].chars().count() as i64 + 1;
+        return Ok(Some(start_char_offset));
+    }
+    Ok(Some(0)) // Return 0 if the subexpression was not found
+}
+
+fn get_nth_match(
+    pattern: &Regex,
+    search_slice: &str,
+    n: i64,
+    byte_start_offset: usize,
+    value: &str,
+) -> Result<Option<i64>, ArrowError> {
+    if let Some(mat) = pattern.find_iter(search_slice).nth((n - 1) as usize) {
+        // Convert byte offset relative to search_slice back to 1-based character offset
+        // relative to the original `value` string.
+        let match_start_byte_offset = byte_start_offset + mat.start();
+        let match_start_char_offset =
+            value[..match_start_byte_offset].chars().count() as i64 + 1;
+        Ok(Some(match_start_char_offset))
+    } else {
+        Ok(Some(0)) // Return 0 if the N-th match was not found
+    }
+}
+fn get_index<'strings, 'cache>(
+    value: Option<&str>,
+    pattern: &'strings str,
+    start: i64,
+    n: i64,
+    subexpr: i64,
+    flags: Option<&'strings str>,
+    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
+) -> Result<Option<i64>, ArrowError>
+where
+    'strings: 'cache,
+{
+    let value = match value {
+        None => return Ok(None),
+        Some("") => return Ok(Some(0)),
+        Some(value) => value,
+    };
+    let pattern: &Regex = compile_and_cache_regex(pattern, flags, regex_cache)?;
+    // println!("get_index: value = {}, pattern = {}, start = {}, n = {}, subexpr = {}, flags = {:?}", value, pattern, start, n, subexpr, flags);
+    if start < 1 {
+        return Err(ArrowError::ComputeError(
+            "regexp_instr() requires start to be 1-based".to_string(),
+        ));
+    }
+
+    if n < 1 {
+        return Err(ArrowError::ComputeError(
+            "N must be 1 or greater".to_string(),
+        ));
+    }
+
+    // --- Simplified byte_start_offset calculation ---
+    let total_chars = value.chars().count() as i64;
+    let byte_start_offset: usize = if start > total_chars {
+        // If start is beyond the total characters, it means we start searching
+        // after the string effectively. No matches possible.
+        return Ok(Some(0));
+    } else {
+        // Get the byte offset for the (start - 1)-th character (0-based)
+        value
+            .char_indices()
+            .nth((start - 1) as usize)
+            .map(|(idx, _)| idx)
+            .unwrap_or(0) // Should not happen if start is valid and <= total_chars
+    };
+    // --- End simplified calculation ---
+
+    let search_slice = &value[byte_start_offset..];
+
+    // Handle subexpression capturing first, as it takes precedence
+    if subexpr > 0 {
+        return handle_subexp(pattern, search_slice, subexpr, value, byte_start_offset);
+    }
+
+    // Use nth to get the N-th match (n is 1-based, nth is 0-based)
+    get_nth_match(pattern, search_slice, n, byte_start_offset, value)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::Int64Array;
+    use arrow::array::{GenericStringArray, StringViewArray};
+    use arrow::datatypes::Field;
+    use datafusion_common::config::ConfigOptions;
+    #[test]
+    fn test_regexp_instr() {
+        test_case_sensitive_regexp_instr_nulls();
+        test_case_sensitive_regexp_instr_scalar();
+        test_case_sensitive_regexp_instr_scalar_start();
+        test_case_sensitive_regexp_instr_scalar_nth();
+        test_case_sensitive_regexp_instr_scalar_subexp();
+
+        test_case_sensitive_regexp_instr_array::<GenericStringArray<i32>>();
+        test_case_sensitive_regexp_instr_array::<GenericStringArray<i64>>();
+        test_case_sensitive_regexp_instr_array::<StringViewArray>();
+
+        test_case_sensitive_regexp_instr_array_start::<GenericStringArray<i32>>();
+        test_case_sensitive_regexp_instr_array_start::<GenericStringArray<i64>>();
+        test_case_sensitive_regexp_instr_array_start::<StringViewArray>();
+
+        test_case_sensitive_regexp_instr_array_nth::<GenericStringArray<i32>>();
+        test_case_sensitive_regexp_instr_array_nth::<GenericStringArray<i64>>();
+        test_case_sensitive_regexp_instr_array_nth::<StringViewArray>();
+    }
+
+    fn regexp_instr_with_scalar_values(args: &[ScalarValue]) -> Result<ColumnarValue> {
+        let args_values: Vec<ColumnarValue> = args
+            .iter()
+            .map(|sv| ColumnarValue::Scalar(sv.clone()))
+            .collect();
+
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, a)| {
+                Arc::new(Field::new(format!("arg_{idx}"), a.data_type(), true))
+            })
+            .collect::<Vec<_>>();
+
+        RegexpInstrFunc::new().invoke_with_args(ScalarFunctionArgs {
+            args: args_values,
+            arg_fields,
+            number_rows: args.len(),
+            return_field: Arc::new(Field::new("f", Int64, true)),
+            config_options: Arc::new(ConfigOptions::default()),
+        })
+    }
+
+    fn test_case_sensitive_regexp_instr_nulls() {
+        let v = "";
+        let r = "";
+        let expected = 0;
+        let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
+        let re = regexp_instr_with_scalar_values(&[v.to_string().into(), regex_sv]);
+        // let res_exp = re.unwrap();
+        match re {
+            Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                assert_eq!(v, Some(expected), "regexp_instr scalar test failed");
+            }
+            _ => panic!("Unexpected result"),
+        }
+    }
+    fn test_case_sensitive_regexp_instr_scalar() {
+        let values = [
+            "hello world",
+            "abcdefg",
+            "xyz123xyz",
+            "no match here",
+            "abc",
+            "ДатаФусион数据融合📊🔥",
+        ];
+        let regex = ["o", "d", "123", "z", "gg", "📊"];
+
+        let expected: Vec<i64> = vec![5, 4, 4, 0, 0, 15];
+
+        izip!(values.iter(), regex.iter())
+            .enumerate()
+            .for_each(|(pos, (&v, &r))| {
+                // utf8
+                let v_sv = ScalarValue::Utf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
+                let expected = expected.get(pos).cloned();
+                let re = regexp_instr_with_scalar_values(&[v_sv, regex_sv]);
+                // let res_exp = re.unwrap();
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // largeutf8
+                let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::LargeUtf8(Some(r.to_string()));
+                let re = regexp_instr_with_scalar_values(&[v_sv, regex_sv]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // utf8view
+                let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8View(Some(r.to_string()));
+                let re = regexp_instr_with_scalar_values(&[v_sv, regex_sv]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+            });
+    }
+
+    fn test_case_sensitive_regexp_instr_scalar_start() {
+        let values = ["abcabcabc", "abcabcabc", ""];
+        let regex = ["abc", "abc", "gg"];
+        let start = [4, 5, 5];
+        let expected: Vec<i64> = vec![4, 7, 0];
+
+        izip!(values.iter(), regex.iter(), start.iter())
+            .enumerate()
+            .for_each(|(pos, (&v, &r, &s))| {
+                // utf8
+                let v_sv = ScalarValue::Utf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let expected = expected.get(pos).cloned();
+                let re =
+                    regexp_instr_with_scalar_values(&[v_sv, regex_sv, start_sv.clone()]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // largeutf8
+                let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::LargeUtf8(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let re =
+                    regexp_instr_with_scalar_values(&[v_sv, regex_sv, start_sv.clone()]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // utf8view
+                let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8View(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let re =
+                    regexp_instr_with_scalar_values(&[v_sv, regex_sv, start_sv.clone()]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+            });
+    }
+
+    fn test_case_sensitive_regexp_instr_scalar_nth() {
+        let values = ["abcabcabc", "abcabcabc", "abcabcabc", "abcabcabc"];
+        let regex = ["abc", "abc", "abc", "abc"];
+        let start = [1, 1, 1, 1];
+        let nth = [1, 2, 3, 4];
+        let expected: Vec<i64> = vec![1, 4, 7, 0];
+
+        izip!(values.iter(), regex.iter(), start.iter(), nth.iter())
+            .enumerate()
+            .for_each(|(pos, (&v, &r, &s, &n))| {
+                // utf8
+                let v_sv = ScalarValue::Utf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let nth_sv = ScalarValue::Int64(Some(n));
+                let expected = expected.get(pos).cloned();
+                let re = regexp_instr_with_scalar_values(&[
+                    v_sv,
+                    regex_sv,
+                    start_sv.clone(),
+                    nth_sv.clone(),
+                ]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // largeutf8
+                let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
+                let regex_sv = ScalarValue::LargeUtf8(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let nth_sv = ScalarValue::Int64(Some(n));
+                let re = regexp_instr_with_scalar_values(&[
+                    v_sv,
+                    regex_sv,
+                    start_sv.clone(),
+                    nth_sv.clone(),
+                ]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+
+                // utf8view
+                let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
+                let regex_sv = ScalarValue::Utf8View(Some(r.to_string()));
+                let start_sv = ScalarValue::Int64(Some(s));
+                let nth_sv = ScalarValue::Int64(Some(n));
+                let re = regexp_instr_with_scalar_values(&[
+                    v_sv,
+                    regex_sv,
+                    start_sv.clone(),
+                    nth_sv.clone(),
+                ]);
+                match re {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                        assert_eq!(v, expected, "regexp_instr scalar test failed");
+                    }
+                    _ => panic!("Unexpected result"),
+                }
+            });
+    }
+
+    fn test_case_sensitive_regexp_instr_scalar_subexp() {
+        let values = ["12 abc def ghi 34"];
+        let regex = ["(abc) (def) (ghi)"];
+        let start = [1];
+        let nth = [1];
+        let flags = ["i"];
+        let subexps = [2];
+        let expected: Vec<i64> = vec![8];
+
+        izip!(
+            values.iter(),
+            regex.iter(),
+            start.iter(),
+            nth.iter(),
+            flags.iter(),
+            subexps.iter()
+        )
+        .enumerate()
+        .for_each(|(pos, (&v, &r, &s, &n, &flag, &subexp))| {
+            // utf8
+            let v_sv = ScalarValue::Utf8(Some(v.to_string()));
+            let regex_sv = ScalarValue::Utf8(Some(r.to_string()));
+            let start_sv = ScalarValue::Int64(Some(s));
+            let nth_sv = ScalarValue::Int64(Some(n));
+            let flags_sv = ScalarValue::Utf8(Some(flag.to_string()));
+            let subexp_sv = ScalarValue::Int64(Some(subexp));
+            let expected = expected.get(pos).cloned();
+            let re = regexp_instr_with_scalar_values(&[
+                v_sv,
+                regex_sv,
+                start_sv.clone(),
+                nth_sv.clone(),
+                flags_sv,
+                subexp_sv.clone(),
+            ]);
+            match re {
+                Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                    assert_eq!(v, expected, "regexp_instr scalar test failed");
+                }
+                _ => panic!("Unexpected result"),
+            }
+
+            // largeutf8
+            let v_sv = ScalarValue::LargeUtf8(Some(v.to_string()));
+            let regex_sv = ScalarValue::LargeUtf8(Some(r.to_string()));
+            let start_sv = ScalarValue::Int64(Some(s));
+            let nth_sv = ScalarValue::Int64(Some(n));
+            let flags_sv = ScalarValue::LargeUtf8(Some(flag.to_string()));
+            let subexp_sv = ScalarValue::Int64(Some(subexp));
+            let re = regexp_instr_with_scalar_values(&[
+                v_sv,
+                regex_sv,
+                start_sv.clone(),
+                nth_sv.clone(),
+                flags_sv,
+                subexp_sv.clone(),
+            ]);
+            match re {
+                Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                    assert_eq!(v, expected, "regexp_instr scalar test failed");
+                }
+                _ => panic!("Unexpected result"),
+            }
+
+            // utf8view
+            let v_sv = ScalarValue::Utf8View(Some(v.to_string()));
+            let regex_sv = ScalarValue::Utf8View(Some(r.to_string()));
+            let start_sv = ScalarValue::Int64(Some(s));
+            let nth_sv = ScalarValue::Int64(Some(n));
+            let flags_sv = ScalarValue::Utf8View(Some(flag.to_string()));
+            let subexp_sv = ScalarValue::Int64(Some(subexp));
+            let re = regexp_instr_with_scalar_values(&[
+                v_sv,
+                regex_sv,
+                start_sv.clone(),
+                nth_sv.clone(),
+                flags_sv,
+                subexp_sv.clone(),
+            ]);
+            match re {
+                Ok(ColumnarValue::Scalar(ScalarValue::Int64(v))) => {
+                    assert_eq!(v, expected, "regexp_instr scalar test failed");
+                }
+                _ => panic!("Unexpected result"),
+            }
+        });
+    }
+
+    fn test_case_sensitive_regexp_instr_array<A>()
+    where
+        A: From<Vec<&'static str>> + Array + 'static,
+    {
+        let values = A::from(vec![
+            "hello world",
+            "abcdefg",
+            "xyz123xyz",
+            "no match here",
+            "",
+        ]);
+        let regex = A::from(vec!["o", "d", "123", "z", "gg"]);
+
+        let expected = Int64Array::from(vec![5, 4, 4, 0, 0]);
+        let re = regexp_instr_func(&[Arc::new(values), Arc::new(regex)]).unwrap();
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    fn test_case_sensitive_regexp_instr_array_start<A>()
+    where
+        A: From<Vec<&'static str>> + Array + 'static,
+    {
+        let values = A::from(vec!["abcabcabc", "abcabcabc", ""]);
+        let regex = A::from(vec!["abc", "abc", "gg"]);
+        let start = Int64Array::from(vec![4, 5, 5]);
+        let expected = Int64Array::from(vec![4, 7, 0]);
+
+        let re = regexp_instr_func(&[Arc::new(values), Arc::new(regex), Arc::new(start)])
+            .unwrap();
+        assert_eq!(re.as_ref(), &expected);
+    }
+
+    fn test_case_sensitive_regexp_instr_array_nth<A>()
+    where
+        A: From<Vec<&'static str>> + Array + 'static,
+    {
+        let values = A::from(vec!["abcabcabc", "abcabcabc", "abcabcabc", "abcabcabc"]);
+        let regex = A::from(vec!["abc", "abc", "abc", "abc"]);
+        let start = Int64Array::from(vec![1, 1, 1, 1]);
+        let nth = Int64Array::from(vec![1, 2, 3, 4]);
+        let expected = Int64Array::from(vec![1, 4, 7, 0]);
+
+        let re = regexp_instr_func(&[
+            Arc::new(values),
+            Arc::new(regex),
+            Arc::new(start),
+            Arc::new(nth),
+        ])
+        .unwrap();
+        assert_eq!(re.as_ref(), &expected);
+    }
+}
diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
index 2080bb9fe818f..68c81eaa57e2b 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -17,21 +17,24 @@
 
 //! Regex expressions
 
-use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
+use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, GenericStringArray};
 use arrow::compute::kernels::regexp;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
 use datafusion_common::types::logical_string;
 use datafusion_common::{
-    arrow_datafusion_err, exec_err, internal_err, plan_err, DataFusionError, Result,
-    ScalarValue,
+    Result, ScalarValue, arrow_datafusion_err, exec_err, internal_err, plan_err,
 };
 use datafusion_expr::{
-    Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
-    TypeSignatureClass, Volatility,
+    Coercion, ColumnarValue, Documentation, Expr, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility, binary_expr, cast,
 };
 use datafusion_macros::user_doc;
 
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr_common::operator::Operator;
+use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
+use regex::Regex;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -53,7 +56,7 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
 | true                                             |
 +--------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 "#,
     standard_argument(name = "str", prefix = "String"),
     standard_argument(name = "regexp", prefix = "Regular"),
@@ -67,7 +70,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
   - **U**: swap the meaning of x* and x*?"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RegexpLikeFunc {
     signature: Signature,
 }
@@ -123,41 +126,120 @@ impl ScalarUDFImpl for RegexpLikeFunc {
         })
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
-
-        let len = args
-            .iter()
-            .fold(Option::<usize>::None, |acc, arg| match arg {
-                ColumnarValue::Scalar(_) => acc,
-                ColumnarValue::Array(a) => Some(a.len()),
-            });
-
-        let is_scalar = len.is_none();
-        let inferred_length = len.unwrap_or(1);
-        let args = args
-            .iter()
-            .map(|arg| arg.to_array(inferred_length))
-            .collect::<Result<Vec<_>>>()?;
-
-        let result = regexp_like(&args);
-        if is_scalar {
-            // If all inputs are scalar, keeps output as scalar
-            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
-            result.map(ColumnarValue::Scalar)
-        } else {
-            result.map(ColumnarValue::Array)
+        match args.as_slice() {
+            [ColumnarValue::Scalar(value), ColumnarValue::Scalar(pattern)] => {
+                let value = scalar_string(value)?;
+                let pattern = scalar_string(pattern)?;
+                regexp_like_scalar(value, pattern, None)
+            }
+            [
+                ColumnarValue::Scalar(value),
+                ColumnarValue::Scalar(pattern),
+                ColumnarValue::Scalar(flags),
+            ] => {
+                let value = scalar_string(value)?;
+                let pattern = scalar_string(pattern)?;
+                let flags = scalar_string(flags)?;
+                regexp_like_scalar(value, pattern, flags)
+            }
+            [ColumnarValue::Array(values), ColumnarValue::Scalar(pattern)] => {
+                let pattern = scalar_string(pattern)?;
+                let array = regexp_like_array_scalar(values, pattern, None)?;
+                Ok(ColumnarValue::Array(array))
+            }
+            [
+                ColumnarValue::Array(values),
+                ColumnarValue::Scalar(pattern),
+                ColumnarValue::Scalar(flags),
+            ] => {
+                let flags = scalar_string(flags)?;
+                if flags.is_some_and(|flagz| flagz.contains('g')) {
+                    plan_err!("regexp_like() does not support the \"global\" option")
+                } else {
+                    let pattern = scalar_string(pattern)?;
+                    let array = regexp_like_array_scalar(values, pattern, flags)?;
+                    Ok(ColumnarValue::Array(array))
+                }
+            }
+            _ => {
+                let args = ColumnarValue::values_to_arrays(args)?;
+                regexp_like(&args).map(ColumnarValue::Array)
+            }
         }
     }
 
+    fn simplify(
+        &self,
+        mut args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        // Try to simplify regexp_like usage to one of the builtin operators since those have
+        // optimized code paths for the case where the regular expression pattern is a scalar.
+        // Additionally, the expression simplification optimization pass will attempt to further
+        // simplify regular expression patterns used in operator expressions.
+        let Some(op) = derive_operator(&args) else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        let string_type = info.get_data_type(&args[0])?;
+        let regexp_type = info.get_data_type(&args[1])?;
+        let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type);
+        let Ok((coerced_string_type, coerced_regexp_type)) =
+            binary_type_coercer.get_input_types()
+        else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        // regexp_like(str, regexp [, flags])
+        let regexp = args.swap_remove(1);
+        let string = args.swap_remove(0);
+
+        Ok(ExprSimplifyResult::Simplified(binary_expr(
+            if string_type != coerced_string_type {
+                cast(string, coerced_string_type)
+            } else {
+                string
+            },
+            op,
+            if regexp_type != coerced_regexp_type {
+                cast(regexp, coerced_regexp_type)
+            } else {
+                regexp
+            },
+        )))
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
+fn derive_operator(args: &[Expr]) -> Option<Operator> {
+    match args.len() {
+        // regexp_like(str, regexp, flags)
+        3 => {
+            match &args[2] {
+                Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
+                    match flags.as_str() {
+                        "i" => Some(Operator::RegexIMatch),
+                        "" => Some(Operator::RegexMatch),
+                        // Any flags besides 'i' have no operator equivalent
+                        _ => None,
+                    }
+                }
+                // `flags` is not a literal, so we can't derive the correct operator statically
+                _ => None,
+            }
+        }
+        // regexp_like(str, regexp)
+        2 => Some(Operator::RegexMatch),
+        // Should never happen, but just in case
+        _ => None,
+    }
+}
+
 /// Tests a string using a regular expression returning true if at
 /// least one match, false otherwise.
 ///
@@ -208,43 +290,125 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
                 Utf8 => args[2].as_string::<i32>(),
                 LargeUtf8 => {
                     let large_string_array = args[2].as_string::<i64>();
-                    let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
-                        if large_string_array.is_null(i) {
-                            None
-                        } else {
-                            Some(large_string_array.value(i))
-                        }
-                    })
-                    .collect();
+                    let string_vec: Vec<Option<&str>> = (0..large_string_array.len())
+                        .map(|i| {
+                            if large_string_array.is_null(i) {
+                                None
+                            } else {
+                                Some(large_string_array.value(i))
+                            }
+                        })
+                        .collect();
 
                     &GenericStringArray::<i32>::from(string_vec)
-                },
+                }
                 _ => {
                     let string_view_array = args[2].as_string_view();
-                    let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
-                        if string_view_array.is_null(i) {
-                            None
-                        } else {
-                            Some(string_view_array.value(i).to_string())
-                        }
-                    })
-                    .collect();
+                    let string_vec: Vec<Option<String>> = (0..string_view_array.len())
+                        .map(|i| {
+                            if string_view_array.is_null(i) {
+                                None
+                            } else {
+                                Some(string_view_array.value(i).to_string())
+                            }
+                        })
+                        .collect();
                     &GenericStringArray::<i32>::from(string_vec)
-                },
+                }
             };
 
-            if flags.iter().any(|s| s == Some("g")) {
+            if flags
+                .iter()
+                .any(|s| s.is_some_and(|flagz| flagz.contains('g')))
+            {
                 return plan_err!("regexp_like() does not support the \"global\" option");
             }
 
             handle_regexp_like(&args[0], &args[1], Some(flags))
-        },
+        }
         other => exec_err!(
             "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
         ),
     }
 }
 
+fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
+    match value.try_as_str() {
+        Some(v) => Ok(v),
+        None => internal_err!(
+            "Unsupported data type {:?} for function `regexp_like`",
+            value.data_type()
+        ),
+    }
+}
+
+fn regexp_like_array_scalar(
+    values: &ArrayRef,
+    pattern: Option<&str>,
+    flags: Option<&str>,
+) -> Result<ArrayRef> {
+    use DataType::*;
+
+    let Some(pattern) = pattern else {
+        return Ok(Arc::new(BooleanArray::new_null(values.len())));
+    };
+    let array = match values.data_type() {
+        Utf8 => {
+            let array = values.as_string::<i32>();
+            regexp::regexp_is_match_scalar(array, pattern, flags)?
+        }
+        Utf8View => {
+            let array = values.as_string_view();
+            regexp::regexp_is_match_scalar(array, pattern, flags)?
+        }
+        LargeUtf8 => {
+            let array = values.as_string::<i64>();
+            regexp::regexp_is_match_scalar(array, pattern, flags)?
+        }
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function `regexp_like`"
+            );
+        }
+    };
+
+    Ok(Arc::new(array))
+}
+
+fn regexp_like_scalar(
+    value: Option<&str>,
+    pattern: Option<&str>,
+    flags: Option<&str>,
+) -> Result<ColumnarValue> {
+    if flags.is_some_and(|flagz| flagz.contains('g')) {
+        return plan_err!("regexp_like() does not support the \"global\" option");
+    }
+
+    if value.is_none() || pattern.is_none() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    let value = value.unwrap();
+    let pattern = pattern.unwrap();
+    let pattern = match flags {
+        Some(flagz) => format!("(?{flagz}){pattern}"),
+        None => pattern.to_string(),
+    };
+
+    let result = if pattern.is_empty() {
+        true
+    } else {
+        let re = Regex::new(pattern.as_str()).map_err(|e| {
+            datafusion_common::DataFusionError::Execution(format!(
+                "Regular expression did not compile: {e:?}"
+            ))
+        })?;
+        re.is_match(value)
+    };
+
+    Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(result))))
+}
+
 fn handle_regexp_like(
     values: &ArrayRef,
     patterns: &ArrayRef,
@@ -287,7 +451,7 @@ fn handle_regexp_like(
                 .map_err(|e| arrow_datafusion_err!(e))?
         }
         (Utf8, LargeUtf8) => {
-            let value = values.as_string_view();
+            let value = values.as_string::<i32>();
             let pattern = patterns.as_string::<i64>();
 
             regexp::regexp_is_match(value, pattern, flags)
@@ -317,7 +481,7 @@ fn handle_regexp_like(
         other => {
             return internal_err!(
                 "Unsupported data type {other:?} for function `regexp_like`"
-            )
+            );
         }
     };
 
@@ -330,8 +494,37 @@ mod tests {
 
     use arrow::array::StringArray;
     use arrow::array::{BooleanBuilder, StringViewArray};
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+
+    use crate::regex::regexplike::{RegexpLikeFunc, regexp_like};
 
-    use crate::regex::regexplike::regexp_like;
+    fn invoke_regexp_like(args: Vec<ColumnarValue>) -> Result<ColumnarValue> {
+        let number_rows = args
+            .iter()
+            .find_map(|arg| match arg {
+                ColumnarValue::Array(array) => Some(array.len()),
+                _ => None,
+            })
+            .unwrap_or(1);
+        let arg_fields = args
+            .iter()
+            .enumerate()
+            .map(|(idx, arg)| {
+                Arc::new(Field::new(format!("arg_{idx}"), arg.data_type(), true))
+            })
+            .collect::<Vec<_>>();
+
+        RegexpLikeFunc::new().invoke_with_args(ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows,
+            return_field: Arc::new(Field::new("f", DataType::Boolean, true)),
+            config_options: Arc::new(ConfigOptions::default()),
+        })
+    }
 
     #[test]
     fn test_case_sensitive_regexp_like_utf8() {
@@ -430,4 +623,66 @@ mod tests {
             "Error during planning: regexp_like() does not support the \"global\" option"
         );
     }
+
+    #[test]
+    fn test_regexp_like_scalar_invoke() {
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("foobarbequebaz".to_string()))),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("(bar)(beque)".to_string()))),
+        ];
+        let result = invoke_regexp_like(args).unwrap();
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Boolean(Some(true))) => {}
+            other => panic!("Unexpected result {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_regexp_like_array_scalar_invoke() {
+        let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
+        let args = vec![
+            ColumnarValue::Array(values),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
+        ];
+        let result = invoke_regexp_like(args).unwrap();
+        let mut expected_builder = BooleanBuilder::new();
+        expected_builder.append_value(true);
+        expected_builder.append_value(false);
+        let expected = expected_builder.finish();
+        match result {
+            ColumnarValue::Array(array) => {
+                assert_eq!(array.as_ref(), &expected);
+            }
+            other => panic!("Unexpected result {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_regexp_like_scalar_flags_with_global() {
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("ig".to_string()))),
+        ];
+        let err = invoke_regexp_like(args).expect_err("global flag should be rejected");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Error during planning: regexp_like() does not support the \"global\" option"
+        );
+    }
+
+    #[test]
+    fn test_regexp_like_array_scalar_flags_with_global() {
+        let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
+        let args = vec![
+            ColumnarValue::Array(values),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("ig".to_string()))),
+        ];
+        let err = invoke_regexp_like(args).expect_err("global flag should be rejected");
+        assert_eq!(
+            err.strip_backtrace(),
+            "Error during planning: regexp_like() does not support the \"global\" option"
+        );
+    }
 }
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index 1119e66398d1d..1a96e095267e2 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -20,11 +20,11 @@ use arrow::array::{Array, ArrayRef, AsArray};
 use arrow::compute::kernels::regexp;
 use arrow::datatypes::DataType;
 use arrow::datatypes::Field;
-use datafusion_common::exec_err;
+use datafusion_common::Result;
 use datafusion_common::ScalarValue;
+use datafusion_common::exec_err;
 use datafusion_common::{arrow_datafusion_err, plan_err};
-use datafusion_common::{DataFusionError, Result};
-use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
+use datafusion_expr::{ColumnarValue, Documentation, ScalarFunctionArgs, TypeSignature};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -48,7 +48,7 @@ use std::sync::Arc;
             | [B]                                               |
             +---------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 "#,
     standard_argument(name = "str", prefix = "String"),
     argument(
@@ -66,7 +66,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
   - **U**: swap the meaning of x* and x*?"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RegexpMatchFunc {
     signature: Signature,
 }
@@ -119,10 +119,7 @@ impl ScalarUDFImpl for RegexpMatchFunc {
         })
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         let len = args
             .iter()
@@ -155,29 +152,35 @@ impl ScalarUDFImpl for RegexpMatchFunc {
 
 pub fn regexp_match(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args.len() {
-        2 => {
-            regexp::regexp_match(&args[0], &args[1], None)
-                .map_err(|e| arrow_datafusion_err!(e))
-        }
+        2 => regexp::regexp_match(&args[0], &args[1], None)
+            .map_err(|e| arrow_datafusion_err!(e)),
         3 => {
             match args[2].data_type() {
                 DataType::Utf8View => {
                     if args[2].as_string_view().iter().any(|s| s == Some("g")) {
-                        return plan_err!("regexp_match() does not support the \"global\" option");
+                        return plan_err!(
+                            "regexp_match() does not support the \"global\" option"
+                        );
                     }
                 }
                 DataType::Utf8 => {
                     if args[2].as_string::<i32>().iter().any(|s| s == Some("g")) {
-                        return plan_err!("regexp_match() does not support the \"global\" option");
+                        return plan_err!(
+                            "regexp_match() does not support the \"global\" option"
+                        );
                     }
                 }
                 DataType::LargeUtf8 => {
                     if args[2].as_string::<i64>().iter().any(|s| s == Some("g")) {
-                        return plan_err!("regexp_match() does not support the \"global\" option");
+                        return plan_err!(
+                            "regexp_match() does not support the \"global\" option"
+                        );
                     }
                 }
                 e => {
-                    return plan_err!("regexp_match was called with unexpected data type {e:?}");
+                    return plan_err!(
+                        "regexp_match was called with unexpected data type {e:?}"
+                    );
                 }
             }
 
@@ -254,6 +257,9 @@ mod tests {
             regexp_match(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                 .expect_err("unsupported flag should have failed");
 
-        assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option");
+        assert_eq!(
+            re_err.strip_backtrace(),
+            "Error during planning: regexp_match() does not support the \"global\" option"
+        );
     }
 }
diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index 3a83564ff11fe..4e8ffa3385446 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -20,21 +20,25 @@ use arrow::array::ArrayDataBuilder;
 use arrow::array::BufferBuilder;
 use arrow::array::GenericStringArray;
 use arrow::array::StringViewBuilder;
-use arrow::array::{new_null_array, ArrayIter, AsArray};
 use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
 use arrow::array::{ArrayAccessor, StringViewArray};
+use arrow::array::{ArrayIter, AsArray, new_null_array};
 use arrow::datatypes::DataType;
-use datafusion_common::cast::as_string_view_array;
+use datafusion_common::ScalarValue;
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
 use datafusion_common::exec_err;
 use datafusion_common::plan_err;
-use datafusion_common::ScalarValue;
 use datafusion_common::{
-    cast::as_generic_string_array, internal_err, DataFusionError, Result,
+    DataFusionError, Result, cast::as_generic_string_array, internal_err,
 };
-use datafusion_expr::function::Hint;
 use datafusion_expr::ColumnarValue;
 use datafusion_expr::TypeSignature;
-use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
+use datafusion_expr::function::Hint;
+use datafusion_expr::{
+    Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
 use datafusion_macros::user_doc;
 use regex::Regex;
 use std::any::Any;
@@ -59,7 +63,7 @@ SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
 | aAbBac                                                            |
 +-------------------------------------------------------------------+
 ```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 "#,
     standard_argument(name = "str", prefix = "String"),
     argument(
@@ -74,7 +78,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
     argument(
         name = "flags",
         description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
-- **g**: (global) Search globally and don't return after the first match        
+- **g**: (global) Search globally and don't return after the first match
 - **i**: case-insensitive: letters match both upper and lower case
 - **m**: multi-line mode: ^ and $ match begin/end of line
 - **s**: allow . to match \n
@@ -82,7 +86,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
 - **U**: swap the meaning of x* and x*?"#
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RegexpReplaceFunc {
     signature: Signature,
 }
@@ -95,13 +99,12 @@ impl Default for RegexpReplaceFunc {
 impl RegexpReplaceFunc {
     pub fn new() -> Self {
         use DataType::*;
+        use TypeSignature::*;
         Self {
             signature: Signature::one_of(
                 vec![
-                    TypeSignature::Exact(vec![Utf8, Utf8, Utf8]),
-                    TypeSignature::Exact(vec![Utf8View, Utf8, Utf8]),
-                    TypeSignature::Exact(vec![Utf8, Utf8, Utf8, Utf8]),
-                    TypeSignature::Exact(vec![Utf8View, Utf8, Utf8, Utf8]),
+                    Uniform(3, vec![Utf8View, LargeUtf8, Utf8]),
+                    Uniform(4, vec![Utf8View, LargeUtf8, Utf8]),
                 ],
                 Volatility::Immutable,
             ),
@@ -148,10 +151,7 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
         })
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
 
         let len = args
@@ -188,13 +188,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
     }
 }
 
-/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
+/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1})
 /// used by regexp_replace
+/// Handles both single backslash (\1) and double backslash (\\1) which can occur
+/// when SQL strings with escaped backslashes are passed through
+///
+/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
+/// substitutes the entire match. This is consistent with POSIX behavior where
+/// \0 (or &) refers to the entire matched string.
 fn regex_replace_posix_groups(replacement: &str) -> String {
     static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
-        LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
+        LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
     CAPTURE_GROUPS_RE_LOCK
-        .replace_all(replacement, "$${$2}")
+        .replace_all(replacement, "$${$1}")
         .into_owned()
 }
 
@@ -238,15 +244,14 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
 /// # Ok(())
 /// # }
 /// ```
-pub fn regexp_replace<'a, T: OffsetSizeTrait, V, B>(
-    string_array: V,
-    pattern_array: B,
-    replacement_array: B,
-    flags: Option<&ArrayRef>,
+pub fn regexp_replace<'a, T: OffsetSizeTrait, U>(
+    string_array: U,
+    pattern_array: U,
+    replacement_array: U,
+    flags_array: Option<U>,
 ) -> Result<ArrayRef>
 where
-    V: ArrayAccessor<Item = &'a str>,
-    B: ArrayAccessor<Item = &'a str>,
+    U: ArrayAccessor<Item = &'a str>,
 {
     // Default implementation for regexp_replace, assumes all args are arrays
     // and args is a sequence of 3 or 4 elements.
@@ -260,7 +265,7 @@ where
     let pattern_array_iter = ArrayIter::new(pattern_array);
     let replacement_array_iter = ArrayIter::new(replacement_array);
 
-    match flags {
+    match flags_array {
         None => {
             let result_iter = string_array_iter
                 .zip(pattern_array_iter)
@@ -307,13 +312,13 @@ where
                 }
             }
         }
-        Some(flags) => {
-            let flags_array = as_generic_string_array::<T>(flags)?;
+        Some(flags_array) => {
+            let flags_array_iter = ArrayIter::new(flags_array);
 
             let result_iter = string_array_iter
                 .zip(pattern_array_iter)
                 .zip(replacement_array_iter)
-                .zip(flags_array.iter())
+                .zip(flags_array_iter)
                 .map(|(((string, pattern), replacement), flags)| {
                     match (string, pattern, replacement, flags) {
                         (Some(string), Some(pattern), Some(replacement), Some(flags)) => {
@@ -382,28 +387,37 @@ where
     }
 }
 
-fn _regexp_replace_early_abort<T: ArrayAccessor>(
-    input_array: T,
-    sz: usize,
-) -> Result<ArrayRef> {
-    // Mimicking the existing behavior of regexp_replace, if any of the scalar arguments
-    // are actually null, then the result will be an array of the same size as the first argument with all nulls.
-    //
-    // Also acts like an early abort mechanism when the input array is empty.
-    Ok(new_null_array(input_array.data_type(), sz))
-}
-
 /// Get the first argument from the given string array.
 ///
 /// Note: If the array is empty or the first argument is null,
-/// then calls the given early abort function.
+/// then aborts early.
 macro_rules! fetch_string_arg {
-    ($ARG:expr, $NAME:expr, $T:ident, $EARLY_ABORT:ident, $ARRAY_SIZE:expr) => {{
-        let array = as_generic_string_array::<$T>($ARG)?;
-        if array.len() == 0 || array.is_null(0) {
-            return $EARLY_ABORT(array, $ARRAY_SIZE);
-        } else {
-            array.value(0)
+    ($ARG:expr, $NAME:expr, $ARRAY_SIZE:expr) => {{
+        let string_array_type = ($ARG).data_type();
+        match string_array_type {
+            dt if $ARG.len() == 0 || $ARG.is_null(0) => {
+                // Mimicking the existing behavior of regexp_replace, if any of the scalar arguments
+                // are actually null, then the result will be an array of the same size as the first argument with all nulls.
+                //
+                // Also acts like an early abort mechanism when the input array is empty.
+                return Ok(new_null_array(dt, $ARRAY_SIZE));
+            }
+            DataType::Utf8 => {
+                let array = as_string_array($ARG)?;
+                array.value(0)
+            }
+            DataType::LargeUtf8 => {
+                let array = as_large_string_array($ARG)?;
+                array.value(0)
+            }
+            DataType::Utf8View => {
+                let array = as_string_view_array($ARG)?;
+                array.value(0)
+            }
+            _ => unreachable!(
+                "Invalid data type for regexp_replace: {}",
+                string_array_type
+            ),
         }
     }};
 }
@@ -417,27 +431,15 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
     args: &[ArrayRef],
 ) -> Result<ArrayRef> {
     let array_size = args[0].len();
-    let pattern = fetch_string_arg!(
-        &args[1],
-        "pattern",
-        i32,
-        _regexp_replace_early_abort,
-        array_size
-    );
-    let replacement = fetch_string_arg!(
-        &args[2],
-        "replacement",
-        i32,
-        _regexp_replace_early_abort,
-        array_size
-    );
+    let pattern = fetch_string_arg!(&args[1], "pattern", array_size);
+    let replacement = fetch_string_arg!(&args[2], "replacement", array_size);
     let flags = match args.len() {
         3 => None,
-        4 => Some(fetch_string_arg!(&args[3], "flags", i32, _regexp_replace_early_abort, array_size)),
+        4 => Some(fetch_string_arg!(&args[3], "flags", array_size)),
         other => {
             return exec_err!(
                 "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4."
-            )
+            );
         }
     };
 
@@ -518,7 +520,7 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
 
 /// Determine which implementation of the regexp_replace to use based
 /// on the given set of arguments.
-pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
+fn specialize_regexp_replace<T: OffsetSizeTrait>(
     args: &[ColumnarValue],
 ) -> Result<ArrayRef> {
     // This will serve as a dispatch table where we can
@@ -590,38 +592,61 @@ pub fn specialize_regexp_replace<T: OffsetSizeTrait>(
                 .map(|arg| arg.to_array(inferred_length))
                 .collect::<Result<Vec<_>>>()?;
 
-            match args[0].data_type() {
-                DataType::Utf8View => {
-                    let string_array = args[0].as_string_view();
+            match (
+                args[0].data_type(),
+                args[1].data_type(),
+                args[2].data_type(),
+                args.get(3).map(|a| a.data_type()),
+            ) {
+                (
+                    DataType::Utf8,
+                    DataType::Utf8,
+                    DataType::Utf8,
+                    Some(DataType::Utf8) | None,
+                ) => {
+                    let string_array = args[0].as_string::<i32>();
                     let pattern_array = args[1].as_string::<i32>();
                     let replacement_array = args[2].as_string::<i32>();
-                    regexp_replace::<i32, _, _>(
+                    let flags_array = args.get(3).map(|a| a.as_string::<i32>());
+                    regexp_replace::<i32, _>(
                         string_array,
                         pattern_array,
                         replacement_array,
-                        args.get(3),
+                        flags_array,
                     )
                 }
-                DataType::Utf8 => {
-                    let string_array = args[0].as_string::<i32>();
-                    let pattern_array = args[1].as_string::<i32>();
-                    let replacement_array = args[2].as_string::<i32>();
-                    regexp_replace::<i32, _, _>(
+                (
+                    DataType::Utf8View,
+                    DataType::Utf8View,
+                    DataType::Utf8View,
+                    Some(DataType::Utf8View) | None,
+                ) => {
+                    let string_array = args[0].as_string_view();
+                    let pattern_array = args[1].as_string_view();
+                    let replacement_array = args[2].as_string_view();
+                    let flags_array = args.get(3).map(|a| a.as_string_view());
+                    regexp_replace::<i32, _>(
                         string_array,
                         pattern_array,
                         replacement_array,
-                        args.get(3),
+                        flags_array,
                     )
                 }
-                DataType::LargeUtf8 => {
+                (
+                    DataType::LargeUtf8,
+                    DataType::LargeUtf8,
+                    DataType::LargeUtf8,
+                    Some(DataType::LargeUtf8) | None,
+                ) => {
                     let string_array = args[0].as_string::<i64>();
                     let pattern_array = args[1].as_string::<i64>();
                     let replacement_array = args[2].as_string::<i64>();
-                    regexp_replace::<i64, _, _>(
+                    let flags_array = args.get(3).map(|a| a.as_string::<i64>());
+                    regexp_replace::<i64, _>(
                         string_array,
                         pattern_array,
                         replacement_array,
-                        args.get(3),
+                        flags_array,
                     )
                 }
                 other => {
@@ -639,6 +664,42 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn test_regex_replace_posix_groups() {
+        // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
+        assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
+        assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
+        assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
+        assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
+
+        // Test double backslash (from SQL escaped strings like '\\1')
+        assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
+        assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
+        assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
+
+        // Test 3 or 4 backslashes before digits to document expected behavior
+        assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
+        assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
+        assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
+
+        // Test that a lone backslash is NOT replaced (requires at least one digit)
+        assert_eq!(regex_replace_posix_groups(r"\"), r"\");
+        assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
+
+        // Test that backslash followed by non-digit is preserved
+        assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
+        assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
+
+        // Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
+        // replacement syntax substitutes the entire match. This is consistent
+        // with POSIX behavior where \0 (or &) refers to the entire matched string.
+        assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
+        assert_eq!(
+            regex_replace_posix_groups(r"prefix\0suffix"),
+            "prefix${0}suffix"
+        );
+    }
+
     macro_rules! static_pattern_regexp_replace {
         ($name:ident, $T:ty, $O:ty) => {
             #[test]
@@ -650,8 +711,8 @@ mod tests {
                     vec!["afooc", "acd", "afoocd1234567890123", "123456789012afooc"];
 
                 let values = <$T>::from(values);
-                let patterns = StringArray::from(patterns);
-                let replacements = StringArray::from(replacement);
+                let patterns = <$T>::from(patterns);
+                let replacements = <$T>::from(replacement);
                 let expected = <$T>::from(expected);
 
                 let re = _regexp_replace_static_pattern_replace::<$O>(&[
diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs
index 63c987906b0f7..bfd035ed3c0db 100644
--- a/datafusion/functions/src/string/ascii.rs
+++ b/datafusion/functions/src/string/ascii.rs
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::utils::make_scalar_function;
 use arrow::array::{ArrayRef, AsArray, Int32Array, StringArrayType};
 use arrow::datatypes::DataType;
 use arrow::error::ArrowError;
 use datafusion_common::types::logical_string;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, TypeSignatureClass};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
 use datafusion_expr_common::signature::Coercion;
@@ -30,7 +30,7 @@ use std::sync::Arc;
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Returns the Unicode character code of the first character in a string.",
+    description = "Returns the first Unicode scalar value of a string.",
     syntax_example = "ascii(str)",
     sql_example = r#"```sql
 > select ascii('abc');
@@ -49,7 +49,7 @@ use std::sync::Arc;
     standard_argument(name = "str", prefix = "String"),
     related_udf(name = "chr")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct AsciiFunc {
     signature: Signature,
 }
@@ -87,13 +87,35 @@ impl ScalarUDFImpl for AsciiFunc {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        use DataType::*;
-
-        Ok(Int32)
+        Ok(DataType::Int32)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(ascii, vec![])(&args.args)
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(scalar) => {
+                if scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Int32(None)));
+                }
+
+                match scalar {
+                    ScalarValue::Utf8(Some(s))
+                    | ScalarValue::LargeUtf8(Some(s))
+                    | ScalarValue::Utf8View(Some(s)) => {
+                        let result = s.chars().next().map_or(0, |c| c as i32);
+                        Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some(result))))
+                    }
+                    _ => {
+                        internal_err!(
+                            "Unexpected data type {:?} for function ascii",
+                            scalar.data_type()
+                        )
+                    }
+                }
+            }
+            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(ascii(&[array])?)),
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -101,7 +123,7 @@ impl ScalarUDFImpl for AsciiFunc {
     }
 }
 
-fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
+fn calculate_ascii<'a, V>(array: &V) -> Result<ArrayRef, ArrowError>
 where
     V: StringArrayType<'a, Item = &'a str>,
 {
@@ -126,15 +148,15 @@ pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            Ok(calculate_ascii(string_array)?)
+            Ok(calculate_ascii(&string_array)?)
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            Ok(calculate_ascii(string_array)?)
+            Ok(calculate_ascii(&string_array)?)
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
-            Ok(calculate_ascii(string_array)?)
+            Ok(calculate_ascii(&string_array)?)
         }
         _ => internal_err!("Unsupported data type"),
     }
@@ -186,6 +208,8 @@ mod tests {
         test_ascii!(Some(String::from("a")), Ok(Some(97)));
         test_ascii!(Some(String::from("")), Ok(Some(0)));
         test_ascii!(Some(String::from("🚀")), Ok(Some(128640)));
+        test_ascii!(Some(String::from("\n")), Ok(Some(10)));
+        test_ascii!(Some(String::from("\t")), Ok(Some(9)));
         test_ascii!(None, Ok(None));
         Ok(())
     }
diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs
index f8740aa4178b4..1578331e57f89 100644
--- a/datafusion/functions/src/string/bit_length.rs
+++ b/datafusion/functions/src/string/bit_length.rs
@@ -45,7 +45,7 @@ use datafusion_macros::user_doc;
     related_udf(name = "length"),
     related_udf(name = "octet_length")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct BitLengthFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs
index 2f1711c9962ad..beea527f6d0b5 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -20,7 +20,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
 use arrow::array::{ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::function::Hint;
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -30,7 +30,7 @@ use datafusion_macros::user_doc;
 use std::any::Any;
 use std::sync::Arc;
 
-/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed.
+/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, spaces are removed.
 /// btrim('xyxtrimyyx', 'xyz') = 'trim'
 fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -40,12 +40,12 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     } else {
         args.to_owned()
     };
-    general_trim::<T>(&args, TrimType::Both, use_string_view)
+    general_trim::<T, TrimBoth>(&args, use_string_view)
 }
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.",
+    description = "Trims the specified trim string from the start and end of a string. If no trim string is provided, all spaces are removed from the start and end of the input string.",
     syntax_example = "btrim(str[, trim_str])",
     sql_example = r#"```sql
 > select btrim('__datafusion____', '_');
@@ -58,14 +58,14 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = r"String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._"
+        description = r"String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is a space._"
     ),
     alternative_syntax = "trim(BOTH trim_str FROM str)",
     alternative_syntax = "trim(trim_str FROM str)",
     related_udf(name = "ltrim"),
     related_udf(name = "rtrim")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct BTrimFunc {
     signature: Signature,
     aliases: Vec<String>,
diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs
index a811de7fccf06..2f432c838e010 100644
--- a/datafusion/functions/src/string/chr.rs
+++ b/datafusion/functions/src/string/chr.rs
@@ -18,24 +18,21 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::ArrayRef;
-use arrow::array::GenericStringBuilder;
+use arrow::array::{ArrayRef, GenericStringBuilder, Int64Array};
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Int64;
 use arrow::datatypes::DataType::Utf8;
 
-use crate::utils::make_scalar_function;
 use datafusion_common::cast::as_int64_array;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
 
-/// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character.
+/// Returns the character with the given code.
 /// chr(65) = 'A'
-pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let integer_array = as_int64_array(&args[0])?;
-
+fn chr_array(integer_array: &Int64Array) -> Result<ArrayRef> {
     let mut builder = GenericStringBuilder::<i32>::with_capacity(
         integer_array.len(),
         // 1 byte per character, assuming that is the common case
@@ -47,35 +44,25 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
     for integer in integer_array {
         match integer {
             Some(integer) => {
-                if integer == 0 {
-                    return exec_err!("null character not permitted.");
-                } else {
-                    match core::char::from_u32(integer as u32) {
-                        Some(c) => {
-                            builder.append_value(c.encode_utf8(&mut buf));
-                        }
-                        None => {
-                            return exec_err!(
-                                "requested character too large for encoding."
-                            );
-                        }
-                    }
+                if let Ok(u) = u32::try_from(integer)
+                    && let Some(c) = core::char::from_u32(u)
+                {
+                    builder.append_value(c.encode_utf8(&mut buf));
+                    continue;
                 }
+
+                return exec_err!("invalid Unicode scalar value: {integer}");
             }
-            None => {
-                builder.append_null();
-            }
+            None => builder.append_null(),
         }
     }
 
-    let result = builder.finish();
-
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Returns the character with the specified ASCII or Unicode code value.",
+    description = "Returns a string containing the character with the specified Unicode scalar value.",
     syntax_example = "chr(expression)",
     sql_example = r#"```sql
 > select chr(128640);
@@ -88,7 +75,7 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "expression", prefix = "String"),
     related_udf(name = "ascii")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ChrFunc {
     signature: Signature,
 }
@@ -125,10 +112,186 @@ impl ScalarUDFImpl for ChrFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(chr, vec![])(&args.args)
+        let [arg] = take_function_args(self.name(), args.args)?;
+
+        match arg {
+            ColumnarValue::Scalar(ScalarValue::Int64(Some(code_point))) => {
+                if let Ok(u) = u32::try_from(code_point)
+                    && let Some(c) = core::char::from_u32(u)
+                {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                        c.to_string(),
+                    ))))
+                } else {
+                    exec_err!("invalid Unicode scalar value: {code_point}")
+                }
+            }
+            ColumnarValue::Scalar(ScalarValue::Int64(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+            }
+            ColumnarValue::Array(array) => {
+                let integer_array = as_int64_array(&array)?;
+                Ok(ColumnarValue::Array(chr_array(integer_array)?))
+            }
+            other => internal_err!(
+                "Unexpected data type {:?} for function chr",
+                other.data_type()
+            ),
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use arrow::array::{Array, Int64Array, StringArray};
+    use arrow::datatypes::Field;
+    use datafusion_common::assert_contains;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+
+    fn invoke_chr(arg: ColumnarValue, number_rows: usize) -> Result<ColumnarValue> {
+        ChrFunc::new().invoke_with_args(ScalarFunctionArgs {
+            args: vec![arg],
+            arg_fields: vec![Field::new("a", Int64, true).into()],
+            number_rows,
+            return_field: Field::new("f", Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        })
+    }
+
+    #[test]
+    fn test_chr_normal() {
+        let input = Arc::new(Int64Array::from(vec![
+            Some(0),        // \u{0000}
+            Some(65),       // A
+            Some(66),       // B
+            Some(67),       // C
+            Some(128640),   // 🚀
+            Some(8364),     // €
+            Some(945),      // α
+            None,           // NULL
+            Some(32),       // space
+            Some(10),       // newline
+            Some(9),        // tab
+            Some(0x10FFFF), // 0x10FFFF, the largest Unicode code point
+        ]));
+
+        let result = invoke_chr(ColumnarValue::Array(input), 12).unwrap();
+        let ColumnarValue::Array(arr) = result else {
+            panic!("Expected array");
+        };
+        let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap();
+
+        let expected = [
+            "\u{0000}",
+            "A",
+            "B",
+            "C",
+            "🚀",
+            "€",
+            "α",
+            "",
+            " ",
+            "\n",
+            "\t",
+            "\u{10ffff}",
+        ];
+
+        assert_eq!(string_array.len(), expected.len());
+        for (i, e) in expected.iter().enumerate() {
+            assert_eq!(string_array.value(i), *e);
+        }
+    }
+
+    #[test]
+    fn test_chr_error() {
+        let input = Arc::new(Int64Array::from(vec![i64::MAX]));
+        let result = invoke_chr(ColumnarValue::Array(input), 1);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: 9223372036854775807"
+        );
+
+        let input = Arc::new(Int64Array::from(vec![0x10FFFF + 1]));
+        let result = invoke_chr(ColumnarValue::Array(input), 1);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: 1114112"
+        );
+
+        let input = Arc::new(Int64Array::from(vec![0xD800 + 1]));
+        let result = invoke_chr(ColumnarValue::Array(input), 1);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: 55297"
+        );
+
+        let input = Arc::new(Int64Array::from(vec![i64::MIN + 2i64]));
+        let result = invoke_chr(ColumnarValue::Array(input), 1);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: -9223372036854775806"
+        );
+
+        let input = Arc::new(Int64Array::from(vec![-1]));
+        let result = invoke_chr(ColumnarValue::Array(input), 1);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: -1"
+        );
+
+        let input = Arc::new(Int64Array::from(vec![65, -1, 66]));
+        let result = invoke_chr(ColumnarValue::Array(input), 3);
+        assert!(result.is_err());
+        assert_contains!(
+            result.err().unwrap().to_string(),
+            "invalid Unicode scalar value: -1"
+        );
+    }
+
+    #[test]
+    fn test_chr_empty() {
+        let input = Arc::new(Int64Array::from(Vec::<i64>::new()));
+        let result = invoke_chr(ColumnarValue::Array(input), 0).unwrap();
+        let ColumnarValue::Array(arr) = result else {
+            panic!("Expected array");
+        };
+        let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap();
+        assert_eq!(string_array.len(), 0);
+    }
+
+    #[test]
+    fn test_chr_scalar() {
+        let result =
+            invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(Some(65))), 1).unwrap();
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
+                assert_eq!(s, "A");
+            }
+            other => panic!("Unexpected result: {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_chr_scalar_null() {
+        let result =
+            invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(None)), 1).unwrap();
+
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {}
+            other => panic!("Unexpected result: {other:?}"),
+        }
+    }
+}
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 5e0567eafea2e..cc97c1b2c1957 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -17,175 +17,210 @@
 
 //! Common utilities for implementing string functions
 
-use std::fmt::{Display, Formatter};
 use std::sync::Arc;
 
 use crate::strings::make_and_append_view;
 use arrow::array::{
-    new_null_array, Array, ArrayRef, GenericStringArray, GenericStringBuilder,
-    NullBufferBuilder, OffsetSizeTrait, StringBuilder, StringViewArray,
+    Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder,
+    OffsetSizeTrait, StringViewArray, StringViewBuilder, new_null_array,
 };
 use arrow::buffer::{Buffer, ScalarBuffer};
 use arrow::datatypes::DataType;
-use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::Result;
-use datafusion_common::{exec_err, ScalarValue};
+use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
+use datafusion_common::{ScalarValue, exec_err};
 use datafusion_expr::ColumnarValue;
 
-pub(crate) enum TrimType {
-    Left,
-    Right,
-    Both,
+/// Trait for trim operations, allowing compile-time dispatch instead of runtime matching.
+///
+/// Each implementation performs its specific trim operation and returns
+/// (trimmed_str, start_offset) where start_offset is the byte offset
+/// from the beginning of the input string where the trimmed result starts.
+pub(crate) trait Trimmer {
+    fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32);
+
+    /// Optimized trim for a single ASCII byte.
+    /// Uses byte-level scanning instead of char-level iteration.
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32);
+}
+
+/// Returns the number of leading bytes matching `byte`
+#[inline]
+fn leading_bytes(bytes: &[u8], byte: u8) -> usize {
+    bytes.iter().take_while(|&&b| b == byte).count()
+}
+
+/// Returns the number of trailing bytes matching `byte`
+#[inline]
+fn trailing_bytes(bytes: &[u8], byte: u8) -> usize {
+    bytes.iter().rev().take_while(|&&b| b == byte).count()
 }
 
-impl Display for TrimType {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            TrimType::Left => write!(f, "ltrim"),
-            TrimType::Right => write!(f, "rtrim"),
-            TrimType::Both => write!(f, "btrim"),
+/// Left trim - removes leading characters
+pub(crate) struct TrimLeft;
+
+impl Trimmer for TrimLeft {
+    #[inline]
+    fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
         }
+        let trimmed = input.trim_start_matches(pattern);
+        let offset = (input.len() - trimmed.len()) as u32;
+        (trimmed, offset)
+    }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let start = leading_bytes(input.as_bytes(), byte);
+        (&input[start..], start as u32)
     }
 }
 
-pub(crate) fn general_trim<T: OffsetSizeTrait>(
+/// Right trim - removes trailing characters
+pub(crate) struct TrimRight;
+
+impl Trimmer for TrimRight {
+    #[inline]
+    fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
+        }
+        let trimmed = input.trim_end_matches(pattern);
+        (trimmed, 0)
+    }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let bytes = input.as_bytes();
+        let end = bytes.len() - trailing_bytes(bytes, byte);
+        (&input[..end], 0)
+    }
+}
+
+/// Both trim - removes both leading and trailing characters
+pub(crate) struct TrimBoth;
+
+impl Trimmer for TrimBoth {
+    #[inline]
+    fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
+        if pattern.len() == 1 && pattern[0].is_ascii() {
+            return Self::trim_ascii_char(input, pattern[0] as u8);
+        }
+        let left_trimmed = input.trim_start_matches(pattern);
+        let offset = (input.len() - left_trimmed.len()) as u32;
+        let trimmed = left_trimmed.trim_end_matches(pattern);
+        (trimmed, offset)
+    }
+
+    #[inline]
+    fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
+        let bytes = input.as_bytes();
+        let start = leading_bytes(bytes, byte);
+        let end = bytes.len() - trailing_bytes(&bytes[start..], byte);
+        (&input[start..end], start as u32)
+    }
+}
+
+pub(crate) fn general_trim<T: OffsetSizeTrait, Tr: Trimmer>(
     args: &[ArrayRef],
-    trim_type: TrimType,
     use_string_view: bool,
 ) -> Result<ArrayRef> {
-    let func = match trim_type {
-        TrimType::Left => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            let ltrimmed_str =
-                str::trim_start_matches::<&[char]>(input, pattern.as_ref());
-            // `ltrimmed_str` is actually `input`[start_offset..],
-            // so `start_offset` = len(`input`) - len(`ltrimmed_str`)
-            let start_offset = input.len() - ltrimmed_str.len();
-
-            (ltrimmed_str, start_offset as u32)
-        },
-        TrimType::Right => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            let rtrimmed_str = str::trim_end_matches::<&[char]>(input, pattern.as_ref());
-
-            // `ltrimmed_str` is actually `input`[0..new_len], so `start_offset` is 0
-            (rtrimmed_str, 0)
-        },
-        TrimType::Both => |input, pattern: &str| {
-            let pattern = pattern.chars().collect::<Vec<char>>();
-            let ltrimmed_str =
-                str::trim_start_matches::<&[char]>(input, pattern.as_ref());
-            // `btrimmed_str` can be got by rtrim(ltrim(`input`)),
-            // so its `start_offset` should be same as ltrim situation above
-            let start_offset = input.len() - ltrimmed_str.len();
-            let btrimmed_str =
-                str::trim_end_matches::<&[char]>(ltrimmed_str, pattern.as_ref());
-
-            (btrimmed_str, start_offset as u32)
-        },
-    };
-
     if use_string_view {
-        string_view_trim(func, args)
+        string_view_trim::<Tr>(args)
     } else {
-        string_trim::<T>(func, args)
+        string_trim::<T, Tr>(args)
     }
 }
 
 /// Applies the trim function to the given string view array(s)
 /// and returns a new string view array with the trimmed values.
 ///
-/// # `trim_func`: The function to apply to each string view.
-///
-/// ## Arguments
-/// - The original string
-/// - the pattern to trim
-///
-/// ## Returns
-///  - trimmed str (must be a substring of the first argument)
-///  - start offset, needed in `string_view_trim`
-///
-/// ## Examples
-///
-/// For `ltrim`:
-/// - `fn("  abc", " ") -> ("abc", 2)`
-/// - `fn("abd", " ") -> ("abd", 0)`
-///
-/// For `btrim`:
-/// - `fn("  abc  ", " ") -> ("abc", 2)`
-/// - `fn("abd", " ") -> ("abd", 0)`
-// removing 'a will cause compiler complaining lifetime of `func`
-fn string_view_trim<'a>(
-    trim_func: fn(&'a str, &'a str) -> (&'a str, u32),
-    args: &'a [ArrayRef],
-) -> Result<ArrayRef> {
+/// Pre-computes the pattern characters once for scalar patterns to avoid
+/// repeated allocations per row.
+fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_view_array = as_string_view_array(&args[0])?;
     let mut views_buf = Vec::with_capacity(string_view_array.len());
     let mut null_builder = NullBufferBuilder::new(string_view_array.len());
 
     match args.len() {
         1 => {
-            let array_iter = string_view_array.iter();
-            let views_iter = string_view_array.views().iter();
-            for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
-                trim_and_append_str(
-                    src_str_opt,
-                    Some(" "),
-                    trim_func,
-                    &mut views_buf,
-                    &mut null_builder,
-                    raw_view,
-                );
+            // Trim spaces by default
+            for (src_str_opt, raw_view) in string_view_array
+                .iter()
+                .zip(string_view_array.views().iter())
+            {
+                if let Some(src_str) = src_str_opt {
+                    let (trimmed, offset) = Tr::trim_ascii_char(src_str, b' ');
+                    make_and_append_view(
+                        &mut views_buf,
+                        &mut null_builder,
+                        raw_view,
+                        trimmed,
+                        offset,
+                    );
+                } else {
+                    null_builder.append_null();
+                    views_buf.push(0);
+                }
             }
         }
         2 => {
             let characters_array = as_string_view_array(&args[1])?;
 
             if characters_array.len() == 1 {
-                // Only one `trim characters` exist
+                // Scalar pattern - pre-compute pattern chars once
                 if characters_array.is_null(0) {
                     return Ok(new_null_array(
-                        // The schema is expecting utf8 as null
                         &DataType::Utf8View,
                         string_view_array.len(),
                     ));
                 }
 
-                let characters = characters_array.value(0);
-                let array_iter = string_view_array.iter();
-                let views_iter = string_view_array.views().iter();
-                for (src_str_opt, raw_view) in array_iter.zip(views_iter) {
-                    trim_and_append_str(
+                let pattern: Vec<char> = characters_array.value(0).chars().collect();
+                for (src_str_opt, raw_view) in string_view_array
+                    .iter()
+                    .zip(string_view_array.views().iter())
+                {
+                    trim_and_append_view::<Tr>(
                         src_str_opt,
-                        Some(characters),
-                        trim_func,
+                        &pattern,
                         &mut views_buf,
                         &mut null_builder,
                         raw_view,
                     );
                 }
             } else {
-                // A specific `trim characters` for a row in the string view array
-                let characters_iter = characters_array.iter();
-                let array_iter = string_view_array.iter();
-                let views_iter = string_view_array.views().iter();
-                for ((src_str_opt, raw_view), characters_opt) in
-                    array_iter.zip(views_iter).zip(characters_iter)
+                // Per-row pattern - must compute pattern chars for each row
+                let mut pattern: Vec<char> = Vec::new();
+                for ((src_str_opt, raw_view), characters_opt) in string_view_array
+                    .iter()
+                    .zip(string_view_array.views().iter())
+                    .zip(characters_array.iter())
                 {
-                    trim_and_append_str(
-                        src_str_opt,
-                        characters_opt,
-                        trim_func,
-                        &mut views_buf,
-                        &mut null_builder,
-                        raw_view,
-                    );
+                    if let (Some(src_str), Some(characters)) =
+                        (src_str_opt, characters_opt)
+                    {
+                        pattern.clear();
+                        pattern.extend(characters.chars());
+                        let (trimmed, offset) = Tr::trim(src_str, &pattern);
+                        make_and_append_view(
+                            &mut views_buf,
+                            &mut null_builder,
+                            raw_view,
+                            trimmed,
+                            offset,
+                        );
+                    } else {
+                        null_builder.append_null();
+                        views_buf.push(0);
+                    }
                 }
             }
         }
         other => {
             return exec_err!(
-            "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2."
+                "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2."
             );
         }
     }
@@ -210,33 +245,23 @@ fn string_view_trim<'a>(
 /// Trims the given string and appends the trimmed string to the views buffer
 /// and the null buffer.
 ///
-/// Calls `trim_func` on the string value in `original_view`, for non_null
-/// values and appends the updated view to the views buffer / null_builder.
-///
 /// Arguments
 /// - `src_str_opt`: The original string value (represented by the view)
-/// - `trim_characters_opt`: The characters to trim from the string
-/// - `trim_func`: The function to apply to the string (see [`string_view_trim`] for details)
+/// - `pattern`: Pre-computed character pattern to trim
 /// - `views_buf`: The buffer to append the updated views to
 /// - `null_builder`: The buffer to append the null values to
 /// - `original_view`: The original view value (that contains src_str_opt)
-fn trim_and_append_str<'a>(
-    src_str_opt: Option<&'a str>,
-    trim_characters_opt: Option<&'a str>,
-    trim_func: fn(&'a str, &'a str) -> (&'a str, u32),
+#[inline]
+fn trim_and_append_view<Tr: Trimmer>(
+    src_str_opt: Option<&str>,
+    pattern: &[char],
     views_buf: &mut Vec<u128>,
     null_builder: &mut NullBufferBuilder,
     original_view: &u128,
 ) {
-    if let (Some(src_str), Some(characters)) = (src_str_opt, trim_characters_opt) {
-        let (trim_str, start_offset) = trim_func(src_str, characters);
-        make_and_append_view(
-            views_buf,
-            null_builder,
-            original_view,
-            trim_str,
-            start_offset,
-        );
+    if let Some(src_str) = src_str_opt {
+        let (trimmed, offset) = Tr::trim(src_str, pattern);
+        make_and_append_view(views_buf, null_builder, original_view, trimmed, offset);
     } else {
         null_builder.append_null();
         views_buf.push(0);
@@ -246,18 +271,17 @@ fn trim_and_append_str<'a>(
 /// Applies the trim function to the given string array(s)
 /// and returns a new string array with the trimmed values.
 ///
-/// See [`string_view_trim`] for details on `func`
-fn string_trim<'a, T: OffsetSizeTrait>(
-    func: fn(&'a str, &'a str) -> (&'a str, u32),
-    args: &'a [ArrayRef],
-) -> Result<ArrayRef> {
+/// Pre-computes the pattern characters once for scalar patterns to avoid
+/// repeated allocations per row.
+fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_array = as_generic_string_array::<T>(&args[0])?;
 
     match args.len() {
         1 => {
+            // Trim spaces by default
             let result = string_array
                 .iter()
-                .map(|string| string.map(|string: &str| func(string, " ").0))
+                .map(|string| string.map(|s| Tr::trim_ascii_char(s, b' ').0))
                 .collect::<GenericStringArray<T>>();
 
             Ok(Arc::new(result) as ArrayRef)
@@ -266,6 +290,7 @@ fn string_trim<'a, T: OffsetSizeTrait>(
             let characters_array = as_generic_string_array::<T>(&args[1])?;
 
             if characters_array.len() == 1 {
+                // Scalar pattern - pre-compute pattern chars once
                 if characters_array.is_null(0) {
                     return Ok(new_null_array(
                         string_array.data_type(),
@@ -273,19 +298,25 @@ fn string_trim<'a, T: OffsetSizeTrait>(
                     ));
                 }
 
-                let characters = characters_array.value(0);
+                let pattern: Vec<char> = characters_array.value(0).chars().collect();
                 let result = string_array
                     .iter()
-                    .map(|item| item.map(|string| func(string, characters).0))
+                    .map(|item| item.map(|s| Tr::trim(s, &pattern).0))
                     .collect::<GenericStringArray<T>>();
                 return Ok(Arc::new(result) as ArrayRef);
             }
 
+            // Per-row pattern - must compute pattern chars for each row
+            let mut pattern: Vec<char> = Vec::new();
             let result = string_array
                 .iter()
                 .zip(characters_array.iter())
                 .map(|(string, characters)| match (string, characters) {
-                    (Some(string), Some(characters)) => Some(func(string, characters).0),
+                    (Some(s), Some(c)) => {
+                        pattern.clear();
+                        pattern.extend(c.chars());
+                        Some(Tr::trim(s, &pattern).0)
+                    }
                     _ => None,
                 })
                 .collect::<GenericStringArray<T>>();
@@ -294,7 +325,7 @@ fn string_trim<'a, T: OffsetSizeTrait>(
         }
         other => {
             exec_err!(
-            "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2."
+                "Function TRIM was called with {other} arguments. It requires at least 1 and at most 2."
             )
         }
     }
@@ -327,10 +358,8 @@ where
             >(array, op)?)),
             DataType::Utf8View => {
                 let string_array = as_string_view_array(array)?;
-                let mut string_builder = StringBuilder::with_capacity(
-                    string_array.len(),
-                    string_array.get_array_memory_size(),
-                );
+                let mut string_builder =
+                    StringViewBuilder::with_capacity(string_array.len());
 
                 for str in string_array.iter() {
                     if let Some(str) = str {
@@ -355,7 +384,7 @@ where
             }
             ScalarValue::Utf8View(a) => {
                 let result = a.as_ref().map(|x| op(x));
-                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(result)))
             }
             other => exec_err!("Unsupported data type {other:?} for function {name}"),
         },
diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs
index 773c316422b70..c65b990f42397 100644
--- a/datafusion/functions/src/string/concat.rs
+++ b/datafusion/functions/src/string/concat.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{as_largestring_array, Array};
+use arrow::array::{Array, as_largestring_array};
 use arrow::datatypes::DataType;
 use datafusion_expr::sort_properties::ExprProperties;
 use std::any::Any;
@@ -26,10 +26,10 @@ use crate::strings::{
     ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
 };
 use datafusion_common::cast::{as_string_array, as_string_view_array};
-use datafusion_common::{internal_err, plan_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, internal_err, plan_err};
 use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
-use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{ColumnarValue, Documentation, Expr, Volatility, lit};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
 
@@ -52,7 +52,7 @@ use datafusion_macros::user_doc;
     ),
     related_udf(name = "concat_ws")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ConcatFunc {
     signature: Signature,
 }
@@ -88,19 +88,18 @@ impl ScalarUDFImpl for ConcatFunc {
         &self.signature
     }
 
+    /// Match the return type to the input types to avoid unnecessary casts. On
+    /// mixed inputs, prefer Utf8View; prefer LargeUtf8 over Utf8 to avoid
+    /// potential overflow on LargeUtf8 input.
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         use DataType::*;
-        let mut dt = &Utf8;
-        arg_types.iter().for_each(|data_type| {
-            if data_type == &Utf8View {
-                dt = data_type;
-            }
-            if data_type == &LargeUtf8 && dt != &Utf8View {
-                dt = data_type;
-            }
-        });
-
-        Ok(dt.to_owned())
+        if arg_types.contains(&Utf8View) {
+            Ok(Utf8View)
+        } else if arg_types.contains(&LargeUtf8) {
+            Ok(LargeUtf8)
+        } else {
+            Ok(Utf8)
+        }
     }
 
     /// Concatenates the text representations of all the arguments. NULL arguments are ignored.
@@ -108,43 +107,38 @@ impl ScalarUDFImpl for ConcatFunc {
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let ScalarFunctionArgs { args, .. } = args;
 
-        let mut return_datatype = DataType::Utf8;
-        args.iter().for_each(|col| {
-            if col.data_type() == DataType::Utf8View {
-                return_datatype = col.data_type();
-            }
-            if col.data_type() == DataType::LargeUtf8
-                && return_datatype != DataType::Utf8View
-            {
-                return_datatype = col.data_type();
-            }
-        });
+        let return_datatype = if args.iter().any(|c| c.data_type() == DataType::Utf8View)
+        {
+            DataType::Utf8View
+        } else if args.iter().any(|c| c.data_type() == DataType::LargeUtf8) {
+            DataType::LargeUtf8
+        } else {
+            DataType::Utf8
+        };
 
-        let array_len = args
-            .iter()
-            .filter_map(|x| match x {
-                ColumnarValue::Array(array) => Some(array.len()),
-                _ => None,
-            })
-            .next();
+        let array_len = args.iter().find_map(|x| match x {
+            ColumnarValue::Array(array) => Some(array.len()),
+            _ => None,
+        });
 
         // Scalar
         if array_len.is_none() {
-            let mut result = String::new();
-            for arg in args {
+            let mut values = Vec::with_capacity(args.len());
+            for arg in &args {
                 let ColumnarValue::Scalar(scalar) = arg else {
                     return internal_err!("concat expected scalar value, got {arg:?}");
                 };
 
                 match scalar.try_as_str() {
-                    Some(Some(v)) => result.push_str(v),
+                    Some(Some(v)) => values.push(v),
                     Some(None) => {} // null literal
                     None => plan_err!(
-                        "Concat function does not support scalar type {:?}",
+                        "Concat function does not support scalar type {}",
                         scalar
                     )?,
                 }
             }
+            let result = values.concat();
 
             return match return_datatype {
                 DataType::Utf8View => {
@@ -189,7 +183,7 @@ impl ScalarUDFImpl for ConcatFunc {
                                 ColumnarValueRef::NonNullableArray(string_array)
                             };
                             columns.push(column);
-                        },
+                        }
                         DataType::LargeUtf8 => {
                             let string_array = as_largestring_array(array);
 
@@ -197,23 +191,29 @@ impl ScalarUDFImpl for ConcatFunc {
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableLargeStringArray(string_array)
                             } else {
-                                ColumnarValueRef::NonNullableLargeStringArray(string_array)
+                                ColumnarValueRef::NonNullableLargeStringArray(
+                                    string_array,
+                                )
                             };
                             columns.push(column);
-                        },
+                        }
                         DataType::Utf8View => {
                             let string_array = as_string_view_array(array)?;
 
-                            data_size += string_array.len();
+                            // This is an estimate; in particular, it will
+                            // undercount arrays of short strings (<= 12 bytes).
+                            data_size += string_array.total_buffer_bytes_used();
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableStringViewArray(string_array)
                             } else {
                                 ColumnarValueRef::NonNullableStringViewArray(string_array)
                             };
                             columns.push(column);
-                        },
+                        }
                         other => {
-                            return plan_err!("Input was {other} which is not a supported datatype for concat function")
+                            return plan_err!(
+                                "Input was {other} which is not a supported datatype for concat function"
+                            );
                         }
                     };
                 }
@@ -243,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc {
                     builder.append_offset();
                 }
 
-                let string_array = builder.finish();
+                let string_array = builder.finish(None);
                 Ok(ColumnarValue::Array(Arc::new(string_array)))
             }
             DataType::LargeUtf8 => {
@@ -273,7 +273,7 @@ impl ScalarUDFImpl for ConcatFunc {
     fn simplify(
         &self,
         args: Vec<Expr>,
-        _info: &dyn SimplifyInfo,
+        _info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         simplify_concat(args)
     }
@@ -287,7 +287,7 @@ impl ScalarUDFImpl for ConcatFunc {
     }
 }
 
-pub fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
+pub(crate) fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
     let mut new_args = Vec::with_capacity(args.len());
     let mut contiguous_scalar = "".to_string();
 
@@ -295,7 +295,7 @@ pub fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
         let data_types: Vec<_> = args
             .iter()
             .filter_map(|expr| match expr {
-                Expr::Literal(l) => Some(l.data_type()),
+                Expr::Literal(l, _) => Some(l.data_type()),
                 _ => None,
             })
             .collect();
@@ -304,28 +304,27 @@ pub fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
 
     for arg in args.clone() {
         match arg {
-            Expr::Literal(ScalarValue::Utf8(None)) => {}
-            Expr::Literal(ScalarValue::LargeUtf8(None)) => {
-            }
-            Expr::Literal(ScalarValue::Utf8View(None)) => { }
+            Expr::Literal(ScalarValue::Utf8(None), _) => {}
+            Expr::Literal(ScalarValue::LargeUtf8(None), _) => {}
+            Expr::Literal(ScalarValue::Utf8View(None), _) => {}
 
             // filter out `null` args
             // All literals have been converted to Utf8 or LargeUtf8 in type_coercion.
             // Concatenate it with the `contiguous_scalar`.
-            Expr::Literal(ScalarValue::Utf8(Some(v))) => {
+            Expr::Literal(ScalarValue::Utf8(Some(v)), _) => {
                 contiguous_scalar += &v;
             }
-            Expr::Literal(ScalarValue::LargeUtf8(Some(v))) => {
+            Expr::Literal(ScalarValue::LargeUtf8(Some(v)), _) => {
                 contiguous_scalar += &v;
             }
-            Expr::Literal(ScalarValue::Utf8View(Some(v))) => {
+            Expr::Literal(ScalarValue::Utf8View(Some(v)), _) => {
                 contiguous_scalar += &v;
             }
 
-            Expr::Literal(x) => {
+            Expr::Literal(x, _) => {
                 return internal_err!(
                     "The scalar {x} should be casted to string type during the type coercion."
-                )
+                );
             }
             // If the arg is not a literal, we should first push the current `contiguous_scalar`
             // to the `new_args` (if it is not empty) and reset it to empty string.
@@ -334,8 +333,10 @@ pub fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
                 if !contiguous_scalar.is_empty() {
                     match return_type {
                         DataType::Utf8 => new_args.push(lit(contiguous_scalar)),
-                        DataType::LargeUtf8 => new_args.push(lit(ScalarValue::LargeUtf8(Some(contiguous_scalar)))),
-                        DataType::Utf8View => new_args.push(lit(ScalarValue::Utf8View(Some(contiguous_scalar)))),
+                        DataType::LargeUtf8 => new_args
+                            .push(lit(ScalarValue::LargeUtf8(Some(contiguous_scalar)))),
+                        DataType::Utf8View => new_args
+                            .push(lit(ScalarValue::Utf8View(Some(contiguous_scalar)))),
                         _ => unreachable!(),
                     }
                     contiguous_scalar = "".to_string();
@@ -374,10 +375,11 @@ pub fn simplify_concat(args: Vec<Expr>) -> Result<ExprSimplifyResult> {
 mod tests {
     use super::*;
     use crate::utils::test::test_function;
+    use DataType::*;
     use arrow::array::{Array, LargeStringArray, StringViewArray};
     use arrow::array::{ArrayRef, StringArray};
     use arrow::datatypes::Field;
-    use DataType::*;
+    use datafusion_common::config::ConfigOptions;
 
     #[test]
     fn test_functions() -> Result<()> {
@@ -485,6 +487,7 @@ mod tests {
             arg_fields,
             number_rows: 3,
             return_field: Field::new("f", Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = ConcatFunc::new().invoke_with_args(args)?;
diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs
index 2a2f9429f8fc3..80e11d286ed87 100644
--- a/datafusion/functions/src/string/concat_ws.rs
+++ b/datafusion/functions/src/string/concat_ws.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{as_largestring_array, Array, StringArray};
+use arrow::array::Array;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -24,12 +24,16 @@ use arrow::datatypes::DataType;
 use crate::string::concat;
 use crate::string::concat::simplify_concat;
 use crate::string::concat_ws;
-use crate::strings::{ColumnarValueRef, StringArrayBuilder};
-use datafusion_common::cast::{as_string_array, as_string_view_array};
-use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue};
+use crate::strings::{
+    ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
+};
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err, plan_err};
 use datafusion_expr::expr::ScalarFunction;
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
-use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{ColumnarValue, Documentation, Expr, Volatility, lit};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
 
@@ -59,7 +63,7 @@ use datafusion_macros::user_doc;
     ),
     related_udf(name = "concat")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ConcatWsFunc {
     signature: Signature,
 }
@@ -95,17 +99,27 @@ impl ScalarUDFImpl for ConcatWsFunc {
         &self.signature
     }
 
-    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+    /// Match the return type to the input types to avoid unnecessary casts. On
+    /// mixed inputs, prefer Utf8View; prefer LargeUtf8 over Utf8 to avoid
+    /// potential overflow on LargeUtf8 input.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         use DataType::*;
-        Ok(Utf8)
+        if arg_types.contains(&Utf8View) {
+            Ok(Utf8View)
+        } else if arg_types.contains(&LargeUtf8) {
+            Ok(LargeUtf8)
+        } else {
+            Ok(Utf8)
+        }
     }
 
-    /// Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored.
+    /// Concatenates all but the first argument, with separators. The first
+    /// argument is used as the separator string, and should not be NULL. Other
+    /// NULL arguments are ignored.
     /// concat_ws(',', 'abcde', 2, NULL, 22) = 'abcde,2,22'
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let ScalarFunctionArgs { args, .. } = args;
 
-        // do not accept 0 arguments.
         if args.len() < 2 {
             return exec_err!(
                 "concat_ws was called with {} arguments. It requires at least 2.",
@@ -113,68 +127,67 @@ impl ScalarUDFImpl for ConcatWsFunc {
             );
         }
 
-        let array_len = args
-            .iter()
-            .filter_map(|x| match x {
-                ColumnarValue::Array(array) => Some(array.len()),
-                _ => None,
-            })
-            .next();
+        let return_datatype = if args.iter().any(|c| c.data_type() == DataType::Utf8View)
+        {
+            DataType::Utf8View
+        } else if args.iter().any(|c| c.data_type() == DataType::LargeUtf8) {
+            DataType::LargeUtf8
+        } else {
+            DataType::Utf8
+        };
+
+        let array_len = args.iter().find_map(|x| match x {
+            ColumnarValue::Array(array) => Some(array.len()),
+            _ => None,
+        });
 
         // Scalar
         if array_len.is_none() {
             let ColumnarValue::Scalar(scalar) = &args[0] else {
-                // loop above checks for all args being scalar
                 unreachable!()
             };
             let sep = match scalar.try_as_str() {
                 Some(Some(s)) => s,
                 Some(None) => {
                     // null literal string
-                    return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));
+                    return match return_datatype {
+                        DataType::Utf8View => {
+                            Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(None)))
+                        }
+                        DataType::LargeUtf8 => {
+                            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+                        }
+                        _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
+                    };
                 }
                 None => return internal_err!("Expected string literal, got {scalar:?}"),
             };
 
-            let mut result = String::new();
-            // iterator over Option<str>
-            let iter = &mut args[1..].iter().map(|arg| {
+            let mut values = Vec::with_capacity(args.len() - 1);
+            for arg in &args[1..] {
                 let ColumnarValue::Scalar(scalar) = arg else {
-                    // loop above checks for all args being scalar
                     unreachable!()
                 };
-                scalar.try_as_str()
-            });
-
-            // append first non null arg
-            for scalar in iter.by_ref() {
-                match scalar {
-                    Some(Some(s)) => {
-                        result.push_str(s);
-                        break;
-                    }
-                    Some(None) => {} // null literal string
-                    None => {
-                        return internal_err!("Expected string literal, got {scalar:?}")
-                    }
-                }
-            }
 
-            // handle subsequent non null args
-            for scalar in iter.by_ref() {
-                match scalar {
-                    Some(Some(s)) => {
-                        result.push_str(sep);
-                        result.push_str(s);
-                    }
+                match scalar.try_as_str() {
+                    Some(Some(v)) => values.push(v),
                     Some(None) => {} // null literal string
                     None => {
-                        return internal_err!("Expected string literal, got {scalar:?}")
+                        return internal_err!("Expected string literal, got {scalar:?}");
                     }
                 }
             }
+            let result = values.join(sep);
 
-            return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result))));
+            return match return_datatype {
+                DataType::Utf8View => {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result))))
+                }
+                DataType::LargeUtf8 => {
+                    Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(result))))
+                }
+                _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result)))),
+            };
         }
 
         // Array
@@ -183,23 +196,61 @@ impl ScalarUDFImpl for ConcatWsFunc {
 
         // parse sep
         let sep = match &args[0] {
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
-                data_size += s.len() * len * (args.len() - 2); // estimate
-                ColumnarValueRef::Scalar(s.as_bytes())
-            }
-            ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
-                return Ok(ColumnarValue::Array(Arc::new(StringArray::new_null(len))));
-            }
-            ColumnarValue::Array(array) => {
-                let string_array = as_string_array(array)?;
-                data_size += string_array.values().len() * (args.len() - 2); // estimate
-                if array.is_nullable() {
-                    ColumnarValueRef::NullableArray(string_array)
-                } else {
-                    ColumnarValueRef::NonNullableArray(string_array)
+            ColumnarValue::Scalar(scalar) => match scalar.try_as_str() {
+                Some(Some(s)) => {
+                    data_size += s.len() * len * (args.len() - 2); // estimate
+                    ColumnarValueRef::Scalar(s.as_bytes())
                 }
-            }
-            _ => unreachable!("concat ws"),
+                Some(None) => {
+                    return match return_datatype {
+                        DataType::Utf8View => {
+                            Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(None)))
+                        }
+                        DataType::LargeUtf8 => {
+                            Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+                        }
+                        _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
+                    };
+                }
+                None => {
+                    return internal_err!("Expected string separator, got {scalar:?}");
+                }
+            },
+            ColumnarValue::Array(array) => match array.data_type() {
+                DataType::Utf8 => {
+                    let string_array = as_string_array(array)?;
+                    data_size += string_array.values().len() * (args.len() - 2);
+                    if array.is_nullable() {
+                        ColumnarValueRef::NullableArray(string_array)
+                    } else {
+                        ColumnarValueRef::NonNullableArray(string_array)
+                    }
+                }
+                DataType::LargeUtf8 => {
+                    let string_array = as_large_string_array(array)?;
+                    data_size += string_array.values().len() * (args.len() - 2);
+                    if array.is_nullable() {
+                        ColumnarValueRef::NullableLargeStringArray(string_array)
+                    } else {
+                        ColumnarValueRef::NonNullableLargeStringArray(string_array)
+                    }
+                }
+                DataType::Utf8View => {
+                    let string_array = as_string_view_array(array)?;
+                    data_size +=
+                        string_array.total_buffer_bytes_used() * (args.len() - 2);
+                    if array.is_nullable() {
+                        ColumnarValueRef::NullableStringViewArray(string_array)
+                    } else {
+                        ColumnarValueRef::NonNullableStringViewArray(string_array)
+                    }
+                }
+                other => {
+                    return plan_err!(
+                        "Input was {other} which is not a supported datatype for concat_ws separator"
+                    );
+                }
+            },
         };
 
         let mut columns = Vec::with_capacity(args.len() - 1);
@@ -225,31 +276,37 @@ impl ScalarUDFImpl for ConcatWsFunc {
                                 ColumnarValueRef::NonNullableArray(string_array)
                             };
                             columns.push(column);
-                        },
+                        }
                         DataType::LargeUtf8 => {
-                            let string_array = as_largestring_array(array);
+                            let string_array = as_large_string_array(array)?;
 
                             data_size += string_array.values().len();
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableLargeStringArray(string_array)
                             } else {
-                                ColumnarValueRef::NonNullableLargeStringArray(string_array)
+                                ColumnarValueRef::NonNullableLargeStringArray(
+                                    string_array,
+                                )
                             };
                             columns.push(column);
-                        },
+                        }
                         DataType::Utf8View => {
                             let string_array = as_string_view_array(array)?;
 
-                            data_size += string_array.data_buffers().iter().map(|buf| buf.len()).sum::<usize>();
+                            // This is an estimate; in particular, it will
+                            // undercount arrays of short strings (<= 12 bytes).
+                            data_size += string_array.total_buffer_bytes_used();
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableStringViewArray(string_array)
                             } else {
                                 ColumnarValueRef::NonNullableStringViewArray(string_array)
                             };
                             columns.push(column);
-                        },
+                        }
                         other => {
-                            return plan_err!("Input was {other} which is not a supported datatype for concat_ws function.")
+                            return plan_err!(
+                                "Input was {other} which is not a supported datatype for concat_ws function."
+                            );
                         }
                     };
                 }
@@ -257,32 +314,71 @@ impl ScalarUDFImpl for ConcatWsFunc {
             }
         }
 
-        let mut builder = StringArrayBuilder::with_capacity(len, data_size);
-        for i in 0..len {
-            if !sep.is_valid(i) {
-                builder.append_offset();
-                continue;
+        match return_datatype {
+            DataType::Utf8View => {
+                let mut builder = StringViewArrayBuilder::with_capacity(len, data_size);
+                for i in 0..len {
+                    if !sep.is_valid(i) {
+                        builder.append_offset();
+                        continue;
+                    }
+                    let mut first = true;
+                    for column in &columns {
+                        if column.is_valid(i) {
+                            if !first {
+                                builder.write::<false>(&sep, i);
+                            }
+                            builder.write::<false>(column, i);
+                            first = false;
+                        }
+                    }
+                    builder.append_offset();
+                }
+                Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls()))))
             }
-
-            let mut iter = columns.iter();
-            for column in iter.by_ref() {
-                if column.is_valid(i) {
-                    builder.write::<false>(column, i);
-                    break;
+            DataType::LargeUtf8 => {
+                let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
+                for i in 0..len {
+                    if !sep.is_valid(i) {
+                        builder.append_offset();
+                        continue;
+                    }
+                    let mut first = true;
+                    for column in &columns {
+                        if column.is_valid(i) {
+                            if !first {
+                                builder.write::<false>(&sep, i);
+                            }
+                            builder.write::<false>(column, i);
+                            first = false;
+                        }
+                    }
+                    builder.append_offset();
                 }
+                Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls()))))
             }
-
-            for column in iter {
-                if column.is_valid(i) {
-                    builder.write::<false>(&sep, i);
-                    builder.write::<false>(column, i);
+            _ => {
+                let mut builder = StringArrayBuilder::with_capacity(len, data_size);
+                for i in 0..len {
+                    if !sep.is_valid(i) {
+                        builder.append_offset();
+                        continue;
+                    }
+                    let mut first = true;
+                    for column in &columns {
+                        if column.is_valid(i) {
+                            if !first {
+                                builder.write::<false>(&sep, i);
+                            }
+                            builder.write::<false>(column, i);
+                            first = false;
+                        }
+                    }
+                    builder.append_offset();
                 }
+                Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls()))))
             }
-
-            builder.append_offset();
         }
-
-        Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls()))))
     }
 
     /// Simply the `concat_ws` function by
@@ -293,7 +389,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
     fn simplify(
         &self,
         args: Vec<Expr>,
-        _info: &dyn SimplifyInfo,
+        _info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
         match &args[..] {
             [delimiter, vals @ ..] => simplify_concat_ws(delimiter, vals),
@@ -307,15 +403,31 @@ impl ScalarUDFImpl for ConcatWsFunc {
 }
 
 fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result<ExprSimplifyResult> {
+    // Preserve the delimiter's string type for any new literals produced
+    // during simplification.
+    let delimiter_type = match delimiter {
+        Expr::Literal(v, _) => v.data_type(),
+        _ => DataType::Utf8,
+    };
+
+    let typed_lit = |s: String| -> Expr {
+        match delimiter_type {
+            DataType::LargeUtf8 => lit(ScalarValue::LargeUtf8(Some(s))),
+            DataType::Utf8View => lit(ScalarValue::Utf8View(Some(s))),
+            _ => lit(s),
+        }
+    };
+
     match delimiter {
         Expr::Literal(
             ScalarValue::Utf8(delimiter)
             | ScalarValue::LargeUtf8(delimiter)
             | ScalarValue::Utf8View(delimiter),
+            _,
         ) => {
             match delimiter {
-                // when the delimiter is an empty string,
-                // we can use `concat` to replace `concat_ws`
+                // When the delimiter is the empty string, replace `concat_ws`
+                // with `concat`
                 Some(delimiter) if delimiter.is_empty() => {
                     match simplify_concat(args.to_vec())? {
                         ExprSimplifyResult::Original(_) => {
@@ -331,29 +443,41 @@ fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result<ExprSimplifyRes
                 }
                 Some(delimiter) => {
                     let mut new_args = Vec::with_capacity(args.len());
-                    new_args.push(lit(delimiter));
+                    new_args.push(typed_lit(delimiter.to_string()));
                     let mut contiguous_scalar = None;
                     for arg in args {
                         match arg {
                             // filter out null args
-                            Expr::Literal(ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::Utf8View(None)) => {}
-                            Expr::Literal(ScalarValue::Utf8(Some(v)) | ScalarValue::LargeUtf8(Some(v)) | ScalarValue::Utf8View(Some(v))) => {
-                                match contiguous_scalar {
-                                    None => contiguous_scalar = Some(v.to_string()),
-                                    Some(mut pre) => {
-                                        pre += delimiter;
-                                        pre += v;
-                                        contiguous_scalar = Some(pre)
-                                    }
+                            Expr::Literal(
+                                ScalarValue::Utf8(None)
+                                | ScalarValue::LargeUtf8(None)
+                                | ScalarValue::Utf8View(None),
+                                _,
+                            ) => {}
+                            Expr::Literal(
+                                ScalarValue::Utf8(Some(v))
+                                | ScalarValue::LargeUtf8(Some(v))
+                                | ScalarValue::Utf8View(Some(v)),
+                                _,
+                            ) => match contiguous_scalar {
+                                None => contiguous_scalar = Some(v.to_string()),
+                                Some(mut pre) => {
+                                    pre += delimiter;
+                                    pre += v;
+                                    contiguous_scalar = Some(pre)
                                 }
+                            },
+                            Expr::Literal(s, _) => {
+                                return internal_err!(
+                                    "The scalar {s} should be casted to string type during the type coercion."
+                                );
                             }
-                            Expr::Literal(s) => return internal_err!("The scalar {s} should be casted to string type during the type coercion."),
                             // If the arg is not a literal, we should first push the current `contiguous_scalar`
                             // to the `new_args` and reset it to None.
                             // Then pushing this arg to the `new_args`.
                             arg => {
                                 if let Some(val) = contiguous_scalar {
-                                    new_args.push(lit(val));
+                                    new_args.push(typed_lit(val));
                                 }
                                 new_args.push(arg.clone());
                                 contiguous_scalar = None;
@@ -361,7 +485,7 @@ fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result<ExprSimplifyRes
                         }
                     }
                     if let Some(val) = contiguous_scalar {
-                        new_args.push(lit(val));
+                        new_args.push(typed_lit(val));
                     }
 
                     Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
@@ -371,13 +495,21 @@ fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result<ExprSimplifyRes
                         },
                     )))
                 }
-                // if the delimiter is null, then the value of the whole expression is null.
-                None => Ok(ExprSimplifyResult::Simplified(Expr::Literal(
-                    ScalarValue::Utf8(None),
-                ))),
+                // If the delimiter is null, then the value of the whole expression is null.
+                None => {
+                    let null_scalar = match delimiter_type {
+                        DataType::LargeUtf8 => ScalarValue::LargeUtf8(None),
+                        DataType::Utf8View => ScalarValue::Utf8View(None),
+                        _ => ScalarValue::Utf8(None),
+                    };
+                    Ok(ExprSimplifyResult::Simplified(Expr::Literal(
+                        null_scalar,
+                        None,
+                    )))
+                }
             }
         }
-        Expr::Literal(d) => internal_err!(
+        Expr::Literal(d, _) => internal_err!(
             "The scalar {d} should be casted to string type during the type coercion."
         ),
         _ => {
@@ -394,7 +526,7 @@ fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result<ExprSimplifyRes
 
 fn is_null(expr: &Expr) -> bool {
     match expr {
-        Expr::Literal(v) => v.is_null(),
+        Expr::Literal(v, _) => v.is_null(),
         _ => false,
     }
 }
@@ -404,11 +536,12 @@ mod tests {
     use std::sync::Arc;
 
     use crate::string::concat_ws::ConcatWsFunc;
-    use arrow::array::{Array, ArrayRef, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
+    use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
     use arrow::datatypes::Field;
     use datafusion_common::Result;
     use datafusion_common::ScalarValue;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
     use crate::utils::test::test_function;
@@ -491,6 +624,7 @@ mod tests {
             arg_fields,
             number_rows: 3,
             return_field: Field::new("f", Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = ConcatWsFunc::new().invoke_with_args(args)?;
@@ -527,6 +661,7 @@ mod tests {
             arg_fields,
             number_rows: 3,
             return_field: Field::new("f", Utf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = ConcatWsFunc::new().invoke_with_args(args)?;
@@ -542,4 +677,265 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn concat_ws_utf8view_scalar_separator() -> Result<()> {
+        let c0 = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(",".to_string())));
+        let c1 =
+            ColumnarValue::Array(Arc::new(StringArray::from(vec!["foo", "bar", "baz"])));
+        let c2 = ColumnarValue::Array(Arc::new(StringArray::from(vec![
+            Some("x"),
+            None,
+            Some("z"),
+        ])));
+
+        let arg_fields = vec![
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8, true).into(),
+            Field::new("a", Utf8, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 3,
+            return_field: Field::new("f", Utf8View, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        let expected =
+            Arc::new(StringViewArray::from(vec!["foo,x", "bar", "baz,z"])) as ArrayRef;
+        match &result {
+            ColumnarValue::Array(array) => {
+                assert_eq!(&expected, array);
+            }
+            _ => panic!("Expected array result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn concat_ws_largeutf8_scalar_separator() -> Result<()> {
+        let c0 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(",".to_string())));
+        let c1 =
+            ColumnarValue::Array(Arc::new(StringArray::from(vec!["foo", "bar", "baz"])));
+        let c2 = ColumnarValue::Array(Arc::new(StringArray::from(vec![
+            Some("x"),
+            None,
+            Some("z"),
+        ])));
+
+        let arg_fields = vec![
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", Utf8, true).into(),
+            Field::new("a", Utf8, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 3,
+            return_field: Field::new("f", LargeUtf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        let expected =
+            Arc::new(LargeStringArray::from(vec!["foo,x", "bar", "baz,z"])) as ArrayRef;
+        match &result {
+            ColumnarValue::Array(array) => {
+                assert_eq!(&expected, array);
+            }
+            _ => panic!("Expected array result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn concat_ws_utf8view_nullable_separator() -> Result<()> {
+        let c0 = ColumnarValue::Array(Arc::new(StringViewArray::from(vec![
+            Some(","),
+            None,
+            Some("+"),
+        ])));
+        let c1 = ColumnarValue::Array(Arc::new(StringViewArray::from(vec![
+            "foo", "bar", "baz",
+        ])));
+        let c2 = ColumnarValue::Array(Arc::new(StringViewArray::from(vec![
+            Some("x"),
+            Some("y"),
+            Some("z"),
+        ])));
+
+        let arg_fields = vec![
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8View, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 3,
+            return_field: Field::new("f", Utf8View, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        let expected = Arc::new(StringViewArray::from(vec![
+            Some("foo,x"),
+            None,
+            Some("baz+z"),
+        ])) as ArrayRef;
+        match &result {
+            ColumnarValue::Array(array) => {
+                assert_eq!(&expected, array);
+            }
+            _ => panic!("Expected array result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn concat_ws_largeutf8_arrays() -> Result<()> {
+        let c0 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(",".to_string())));
+        let c1 = ColumnarValue::Array(Arc::new(LargeStringArray::from(vec![
+            "foo", "bar", "baz",
+        ])));
+        let c2 = ColumnarValue::Array(Arc::new(LargeStringArray::from(vec![
+            Some("x"),
+            None,
+            Some("z"),
+        ])));
+
+        let arg_fields = vec![
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", LargeUtf8, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 3,
+            return_field: Field::new("f", LargeUtf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        let expected =
+            Arc::new(LargeStringArray::from(vec!["foo,x", "bar", "baz,z"])) as ArrayRef;
+        match &result {
+            ColumnarValue::Array(array) => {
+                assert_eq!(&expected, array);
+            }
+            _ => panic!("Expected array result"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn concat_ws_utf8view_null_separator() -> Result<()> {
+        // All-scalar path: null Utf8View separator should return Utf8View(None)
+        let c0 = ColumnarValue::Scalar(ScalarValue::Utf8View(None));
+        let c1 = ColumnarValue::Scalar(ScalarValue::Utf8View(Some("aa".to_string())));
+        let c2 = ColumnarValue::Scalar(ScalarValue::Utf8View(Some("bb".to_string())));
+
+        let arg_fields = vec![
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8View, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", Utf8View, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Utf8View(None)) => {}
+            other => panic!("Expected Utf8View(None), got {other:?}"),
+        }
+
+        // Array path: null Utf8View scalar separator with array args
+        let c0 = ColumnarValue::Scalar(ScalarValue::Utf8View(None));
+        let c1 =
+            ColumnarValue::Array(Arc::new(StringViewArray::from(vec!["foo", "bar"])));
+
+        let arg_fields = vec![
+            Field::new("a", Utf8View, true).into(),
+            Field::new("a", Utf8View, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1],
+            arg_fields,
+            number_rows: 2,
+            return_field: Field::new("f", Utf8View, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        match result {
+            ColumnarValue::Scalar(ScalarValue::Utf8View(None)) => {}
+            other => panic!("Expected Utf8View(None), got {other:?}"),
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn concat_ws_largeutf8_null_separator() -> Result<()> {
+        // All-scalar path: null LargeUtf8 separator should return LargeUtf8(None)
+        let c0 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(None));
+        let c1 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("aa".to_string())));
+        let c2 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("bb".to_string())));
+
+        let arg_fields = vec![
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", LargeUtf8, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1, c2],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", LargeUtf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        match result {
+            ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {}
+            other => panic!("Expected LargeUtf8(None), got {other:?}"),
+        }
+
+        // Array path: null LargeUtf8 scalar separator with array args
+        let c0 = ColumnarValue::Scalar(ScalarValue::LargeUtf8(None));
+        let c1 =
+            ColumnarValue::Array(Arc::new(LargeStringArray::from(vec!["foo", "bar"])));
+
+        let arg_fields = vec![
+            Field::new("a", LargeUtf8, true).into(),
+            Field::new("a", LargeUtf8, true).into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![c0, c1],
+            arg_fields,
+            number_rows: 2,
+            return_field: Field::new("f", LargeUtf8, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+
+        let result = ConcatWsFunc::new().invoke_with_args(args)?;
+        match result {
+            ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {}
+            other => panic!("Expected LargeUtf8(None), got {other:?}"),
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs
index b74be15466265..b7ec95be444cc 100644
--- a/datafusion/functions/src/string/contains.rs
+++ b/datafusion/functions/src/string/contains.rs
@@ -15,13 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::utils::make_scalar_function;
-use arrow::array::{Array, ArrayRef, AsArray};
+use arrow::array::{Array, ArrayRef, Scalar};
 use arrow::compute::contains as arrow_contains;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View};
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, DataFusionError, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::binary::{binary_to_string_coercion, string_coercion};
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -46,7 +45,7 @@ use std::sync::Arc;
     standard_argument(name = "str", prefix = "String"),
     argument(name = "search_str", description = "The string to search for in str.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ContainsFunc {
     signature: Signature,
 }
@@ -89,7 +88,7 @@ impl ScalarUDFImpl for ContainsFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(contains, vec![])(&args.args)
+        contains(args.args.as_slice())
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -97,50 +96,78 @@ impl ScalarUDFImpl for ContainsFunc {
     }
 }
 
+fn to_array(value: &ColumnarValue) -> Result<(ArrayRef, bool)> {
+    match value {
+        ColumnarValue::Array(array) => Ok((Arc::clone(array), false)),
+        ColumnarValue::Scalar(scalar) => Ok((scalar.to_array()?, true)),
+    }
+}
+
+/// Helper to call arrow_contains with proper Datum handling.
+/// When an argument is marked as scalar, we wrap it in `Scalar` to tell arrow's
+/// kernel to use the optimized single-value code path instead of iterating.
+fn call_arrow_contains(
+    haystack: &ArrayRef,
+    haystack_is_scalar: bool,
+    needle: &ArrayRef,
+    needle_is_scalar: bool,
+) -> Result<ColumnarValue> {
+    // Arrow's Datum trait is implemented for ArrayRef, Arc<dyn Array>, and Scalar<T>
+    // We pass ArrayRef directly when not scalar, or wrap in Scalar when it is
+    let result = match (haystack_is_scalar, needle_is_scalar) {
+        (false, false) => arrow_contains(haystack, needle)?,
+        (false, true) => arrow_contains(haystack, &Scalar::new(Arc::clone(needle)))?,
+        (true, false) => arrow_contains(&Scalar::new(Arc::clone(haystack)), needle)?,
+        (true, true) => arrow_contains(
+            &Scalar::new(Arc::clone(haystack)),
+            &Scalar::new(Arc::clone(needle)),
+        )?,
+    };
+
+    // If both inputs were scalar, return a scalar result
+    if haystack_is_scalar && needle_is_scalar {
+        let scalar = datafusion_common::ScalarValue::try_from_array(&result, 0)?;
+        Ok(ColumnarValue::Scalar(scalar))
+    } else {
+        Ok(ColumnarValue::Array(Arc::new(result)))
+    }
+}
+
 /// use `arrow::compute::contains` to do the calculation for contains
-fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
+fn contains(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    let (haystack, haystack_is_scalar) = to_array(&args[0])?;
+    let (needle, needle_is_scalar) = to_array(&args[1])?;
+
     if let Some(coercion_data_type) =
-        string_coercion(args[0].data_type(), args[1].data_type()).or_else(|| {
-            binary_to_string_coercion(args[0].data_type(), args[1].data_type())
+        string_coercion(haystack.data_type(), needle.data_type()).or_else(|| {
+            binary_to_string_coercion(haystack.data_type(), needle.data_type())
         })
     {
-        let arg0 = if args[0].data_type() == &coercion_data_type {
-            Arc::clone(&args[0])
+        let haystack = if haystack.data_type() == &coercion_data_type {
+            haystack
         } else {
-            arrow::compute::kernels::cast::cast(&args[0], &coercion_data_type)?
+            arrow::compute::kernels::cast::cast(&haystack, &coercion_data_type)?
         };
-        let arg1 = if args[1].data_type() == &coercion_data_type {
-            Arc::clone(&args[1])
+        let needle = if needle.data_type() == &coercion_data_type {
+            needle
         } else {
-            arrow::compute::kernels::cast::cast(&args[1], &coercion_data_type)?
+            arrow::compute::kernels::cast::cast(&needle, &coercion_data_type)?
         };
 
         match coercion_data_type {
-            Utf8View => {
-                let mod_str = arg0.as_string_view();
-                let match_str = arg1.as_string_view();
-                let res = arrow_contains(mod_str, match_str)?;
-                Ok(Arc::new(res) as ArrayRef)
-            }
-            Utf8 => {
-                let mod_str = arg0.as_string::<i32>();
-                let match_str = arg1.as_string::<i32>();
-                let res = arrow_contains(mod_str, match_str)?;
-                Ok(Arc::new(res) as ArrayRef)
-            }
-            LargeUtf8 => {
-                let mod_str = arg0.as_string::<i64>();
-                let match_str = arg1.as_string::<i64>();
-                let res = arrow_contains(mod_str, match_str)?;
-                Ok(Arc::new(res) as ArrayRef)
-            }
+            Utf8View | Utf8 | LargeUtf8 => call_arrow_contains(
+                &haystack,
+                haystack_is_scalar,
+                &needle,
+                needle_is_scalar,
+            ),
             other => {
                 exec_err!("Unsupported data type {other:?} for function `contains`.")
             }
         }
     } else {
         exec_err!(
-            "Unsupported data type {:?}, {:?} for function `contains`.",
+            "Unsupported data type {}, {:?} for function `contains`.",
             args[0].data_type(),
             args[1].data_type()
         )
@@ -154,6 +181,7 @@ mod test {
     use arrow::array::{BooleanArray, StringArray};
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::ScalarValue;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
 
@@ -175,6 +203,7 @@ mod test {
             arg_fields,
             number_rows: 2,
             return_field: Field::new("f", DataType::Boolean, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let actual = udf.invoke_with_args(args).unwrap();
@@ -191,8 +220,11 @@ mod test {
     #[test]
     fn test_contains_api() {
         let expr = contains(
-            Expr::Literal(ScalarValue::Utf8(Some("the quick brown fox".to_string()))),
-            Expr::Literal(ScalarValue::Utf8(Some("row".to_string()))),
+            Expr::Literal(
+                ScalarValue::Utf8(Some("the quick brown fox".to_string())),
+                None,
+            ),
+            Expr::Literal(ScalarValue::Utf8(Some("row".to_string())), None),
         );
         assert_eq!(
             expr.to_string(),
diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs
index eafc310236ee3..f65349a83799d 100644
--- a/datafusion/functions/src/string/ends_with.rs
+++ b/datafusion/functions/src/string/ends_with.rs
@@ -18,12 +18,13 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::ArrayRef;
+use arrow::array::{ArrayRef, Scalar};
+use arrow::compute::kernels::comparison::ends_with as arrow_ends_with;
 use arrow::datatypes::DataType;
 
-use crate::utils::make_scalar_function;
 use datafusion_common::types::logical_string;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err};
 use datafusion_expr::binary::{binary_to_string_coercion, string_coercion};
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -52,7 +53,7 @@ use datafusion_macros::user_doc;
     standard_argument(name = "str", prefix = "String"),
     argument(name = "substr", description = "Substring to test for.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct EndsWithFunc {
     signature: Signature,
 }
@@ -95,12 +96,70 @@ impl ScalarUDFImpl for EndsWithFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        match args.args[0].data_type() {
-            DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => {
-                make_scalar_function(ends_with, vec![])(&args.args)
+        let [str_arg, suffix_arg] = take_function_args(self.name(), &args.args)?;
+
+        // Determine the common type for coercion
+        let coercion_type = string_coercion(
+            &str_arg.data_type(),
+            &suffix_arg.data_type(),
+        )
+        .or_else(|| {
+            binary_to_string_coercion(&str_arg.data_type(), &suffix_arg.data_type())
+        });
+
+        let Some(coercion_type) = coercion_type else {
+            return exec_err!(
+                "Unsupported data types {:?}, {:?} for function `ends_with`.",
+                str_arg.data_type(),
+                suffix_arg.data_type()
+            );
+        };
+
+        // Helper to cast an array if needed
+        let maybe_cast = |arr: &ArrayRef, target: &DataType| -> Result<ArrayRef> {
+            if arr.data_type() == target {
+                Ok(Arc::clone(arr))
+            } else {
+                Ok(arrow::compute::kernels::cast::cast(arr, target)?)
+            }
+        };
+
+        match (str_arg, suffix_arg) {
+            // Both scalars - just compute directly
+            (ColumnarValue::Scalar(str_scalar), ColumnarValue::Scalar(suffix_scalar)) => {
+                let str_arr = str_scalar.to_array_of_size(1)?;
+                let suffix_arr = suffix_scalar.to_array_of_size(1)?;
+                let str_arr = maybe_cast(&str_arr, &coercion_type)?;
+                let suffix_arr = maybe_cast(&suffix_arr, &coercion_type)?;
+                let result = arrow_ends_with(&str_arr, &suffix_arr)?;
+                Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+                    &result, 0,
+                )?))
+            }
+            // String is array, suffix is scalar - use Scalar wrapper for optimization
+            (ColumnarValue::Array(str_arr), ColumnarValue::Scalar(suffix_scalar)) => {
+                let str_arr = maybe_cast(str_arr, &coercion_type)?;
+                let suffix_arr = suffix_scalar.to_array_of_size(1)?;
+                let suffix_arr = maybe_cast(&suffix_arr, &coercion_type)?;
+                let suffix_scalar = Scalar::new(suffix_arr);
+                let result = arrow_ends_with(&str_arr, &suffix_scalar)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
             }
-            other => {
-                internal_err!("Unsupported data type {other:?} for function ends_with. Expected Utf8, LargeUtf8 or Utf8View")?
+            // String is scalar, suffix is array - use Scalar wrapper for string
+            (ColumnarValue::Scalar(str_scalar), ColumnarValue::Array(suffix_arr)) => {
+                let str_arr = str_scalar.to_array_of_size(1)?;
+                let str_arr = maybe_cast(&str_arr, &coercion_type)?;
+                let str_scalar = Scalar::new(str_arr);
+                let suffix_arr = maybe_cast(suffix_arr, &coercion_type)?;
+                let result = arrow_ends_with(&str_scalar, &suffix_arr)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            // Both arrays - pass directly
+            (ColumnarValue::Array(str_arr), ColumnarValue::Array(suffix_arr)) => {
+                let str_arr = maybe_cast(str_arr, &coercion_type)?;
+                let suffix_arr = maybe_cast(suffix_arr, &coercion_type)?;
+                let result = arrow_ends_with(&str_arr, &suffix_arr)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
             }
         }
     }
@@ -110,47 +169,24 @@ impl ScalarUDFImpl for EndsWithFunc {
     }
 }
 
-/// Returns true if string ends with suffix.
-/// ends_with('alphabet', 'abet') = 't'
-fn ends_with(args: &[ArrayRef]) -> Result<ArrayRef> {
-    if let Some(coercion_data_type) =
-        string_coercion(args[0].data_type(), args[1].data_type()).or_else(|| {
-            binary_to_string_coercion(args[0].data_type(), args[1].data_type())
-        })
-    {
-        let arg0 = if args[0].data_type() == &coercion_data_type {
-            Arc::clone(&args[0])
-        } else {
-            arrow::compute::kernels::cast::cast(&args[0], &coercion_data_type)?
-        };
-        let arg1 = if args[1].data_type() == &coercion_data_type {
-            Arc::clone(&args[1])
-        } else {
-            arrow::compute::kernels::cast::cast(&args[1], &coercion_data_type)?
-        };
-        let result = arrow::compute::kernels::comparison::ends_with(&arg0, &arg1)?;
-        Ok(Arc::new(result) as ArrayRef)
-    } else {
-        internal_err!(
-            "Unsupported data types for ends_with. Expected Utf8, LargeUtf8 or Utf8View"
-        )
-    }
-}
-
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, BooleanArray};
+    use arrow::array::{Array, BooleanArray, StringArray};
     use arrow::datatypes::DataType::Boolean;
+    use arrow::datatypes::{DataType, Field};
+    use std::sync::Arc;
 
     use datafusion_common::Result;
     use datafusion_common::ScalarValue;
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
 
     use crate::string::ends_with::EndsWithFunc;
     use crate::utils::test::test_function;
 
     #[test]
-    fn test_functions() -> Result<()> {
+    fn test_scalar_scalar() -> Result<()> {
+        // Test Scalar + Scalar combinations
         test_function!(
             EndsWithFunc::new(),
             vec![
@@ -196,6 +232,186 @@ mod tests {
             BooleanArray
         );
 
+        // Test with LargeUtf8
+        test_function!(
+            EndsWithFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(
+                    "alphabet".to_string()
+                ))),
+                ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("bet".to_string()))),
+            ],
+            Ok(Some(true)),
+            bool,
+            Boolean,
+            BooleanArray
+        );
+
+        // Test with Utf8View
+        test_function!(
+            EndsWithFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                    "alphabet".to_string()
+                ))),
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("bet".to_string()))),
+            ],
+            Ok(Some(true)),
+            bool,
+            Boolean,
+            BooleanArray
+        );
+
         Ok(())
     }
+
+    #[test]
+    fn test_array_scalar() -> Result<()> {
+        // Test Array + Scalar (the optimized path)
+        let array = ColumnarValue::Array(Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("alphabet"),
+            Some("beta"),
+            None,
+        ])));
+        let scalar = ColumnarValue::Scalar(ScalarValue::Utf8(Some("bet".to_string())));
+
+        let args = vec![array, scalar];
+        test_function!(
+            EndsWithFunc::new(),
+            args,
+            Ok(Some(true)), // First element result: "alphabet" ends with "bet"
+            bool,
+            Boolean,
+            BooleanArray
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_scalar_full_result() {
+        // Test Array + Scalar and verify all results
+        let func = EndsWithFunc::new();
+        let array = Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("alphabet"),
+            Some("beta"),
+            None,
+        ]));
+        let args = vec![
+            ColumnarValue::Array(array),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("bet".to_string()))),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" ends with "bet"
+        assert!(bool_array.value(1)); // "alphabet" ends with "bet"
+        assert!(!bool_array.value(2)); // "beta" does not end with "bet"
+        assert!(bool_array.is_null(3)); // null input -> null output
+    }
+
+    #[test]
+    fn test_scalar_array() {
+        // Test Scalar + Array
+        let func = EndsWithFunc::new();
+        let suffixes = Arc::new(StringArray::from(vec![
+            Some("bet"),
+            Some("alph"),
+            Some("phabet"),
+            None,
+        ]));
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("alphabet".to_string()))),
+            ColumnarValue::Array(suffixes),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" ends with "bet"
+        assert!(!bool_array.value(1)); // "alphabet" does not end with "alph"
+        assert!(bool_array.value(2)); // "alphabet" ends with "phabet"
+        assert!(bool_array.is_null(3)); // null suffix -> null output
+    }
+
+    #[test]
+    fn test_array_array() {
+        // Test Array + Array
+        let func = EndsWithFunc::new();
+        let strings = Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("rust"),
+            Some("datafusion"),
+            None,
+        ]));
+        let suffixes = Arc::new(StringArray::from(vec![
+            Some("bet"),
+            Some("st"),
+            Some("hello"),
+            Some("test"),
+        ]));
+        let args = vec![
+            ColumnarValue::Array(strings),
+            ColumnarValue::Array(suffixes),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" ends with "bet"
+        assert!(bool_array.value(1)); // "rust" ends with "st"
+        assert!(!bool_array.value(2)); // "datafusion" does not end with "hello"
+        assert!(bool_array.is_null(3)); // null string -> null output
+    }
 }
diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs
index a1a486c7d3cf4..4fedd0d3ebad0 100644
--- a/datafusion/functions/src/string/levenshtein.rs
+++ b/datafusion/functions/src/string/levenshtein.rs
@@ -26,7 +26,7 @@ use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::types::logical_string;
 use datafusion_common::utils::datafusion_strsim;
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::type_coercion::binary::{
     binary_to_string_coercion, string_coercion,
 };
@@ -57,7 +57,7 @@ use datafusion_macros::user_doc;
         description = "String expression to compute Levenshtein distance with str1."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LevenshteinFunc {
     signature: Signature,
 }
@@ -101,7 +101,9 @@ impl ScalarUDFImpl for LevenshteinFunc {
         {
             utf8_to_int_type(&coercion_data_type, "levenshtein")
         } else {
-            exec_err!("Unsupported data types for levenshtein. Expected Utf8, LargeUtf8 or Utf8View")
+            exec_err!(
+                "Unsupported data types for levenshtein. Expected Utf8, LargeUtf8 or Utf8View"
+            )
         }
     }
 
@@ -149,12 +151,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::Utf8View => {
                 let str1_array = as_string_view_array(&str1)?;
                 let str2_array = as_string_view_array(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i32)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i32)
                         }
                         _ => None,
                     })
@@ -164,12 +172,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::Utf8 => {
                 let str1_array = as_generic_string_array::<T>(&str1)?;
                 let str2_array = as_generic_string_array::<T>(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i32)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i32)
                         }
                         _ => None,
                     })
@@ -179,12 +193,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::LargeUtf8 => {
                 let str1_array = as_generic_string_array::<T>(&str1)?;
                 let str2_array = as_generic_string_array::<T>(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i64)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i64)
                         }
                         _ => None,
                     })
@@ -198,7 +218,9 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             }
         }
     } else {
-        exec_err!("Unsupported data types for levenshtein. Expected Utf8, LargeUtf8 or Utf8View")
+        exec_err!(
+            "Unsupported data types for levenshtein. Expected Utf8, LargeUtf8 or Utf8View"
+        )
     }
 }
 
diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs
index 536c29a7cb253..d91e4595c58ac 100644
--- a/datafusion/functions/src/string/lower.rs
+++ b/datafusion/functions/src/string/lower.rs
@@ -19,9 +19,8 @@ use arrow::datatypes::DataType;
 use std::any::Any;
 
 use crate::string::common::to_lower;
-use crate::utils::utf8_to_str_type;
-use datafusion_common::types::logical_string;
 use datafusion_common::Result;
+use datafusion_common::types::logical_string;
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     TypeSignatureClass, Volatility,
@@ -44,7 +43,7 @@ use datafusion_macros::user_doc;
     related_udf(name = "initcap"),
     related_udf(name = "upper")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LowerFunc {
     signature: Signature,
 }
@@ -82,7 +81,7 @@ impl ScalarUDFImpl for LowerFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "lower")
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
@@ -97,9 +96,9 @@ impl ScalarUDFImpl for LowerFunc {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::{Array, ArrayRef, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
     use arrow::datatypes::Field;
+    use datafusion_common::config::ConfigOptions;
     use std::sync::Arc;
 
     fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> {
@@ -110,7 +109,8 @@ mod tests {
             number_rows: input.len(),
             args: vec![ColumnarValue::Array(input)],
             arg_fields,
-            return_field: Field::new("f", Utf8, true).into(),
+            return_field: Field::new("f", expected.data_type().clone(), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = match func.invoke_with_args(args)? {
@@ -195,4 +195,21 @@ mod tests {
 
         to_lower(input, expected)
     }
+
+    #[test]
+    fn lower_utf8view() -> Result<()> {
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("ARROW"),
+            None,
+            Some("TSCHÜSS"),
+        ])) as ArrayRef;
+
+        let expected = Arc::new(StringViewArray::from(vec![
+            Some("arrow"),
+            None,
+            Some("tschüss"),
+        ])) as ArrayRef;
+
+        to_lower(input, expected)
+    }
 }
diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs
index 65849202efc66..f84b273b8d6b2 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -21,9 +21,9 @@ use std::any::Any;
 use std::sync::Arc;
 
 use crate::string::common::*;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::make_scalar_function;
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::function::Hint;
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -31,7 +31,7 @@ use datafusion_expr::{
 };
 use datafusion_macros::user_doc;
 
-/// Returns the longest string  with leading characters removed. If the characters are not specified, whitespace is removed.
+/// Returns the longest string with leading characters removed. If the characters are not specified, spaces are removed.
 /// ltrim('zzzytest', 'xyz') = 'test'
 fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -41,12 +41,12 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     } else {
         args.to_owned()
     };
-    general_trim::<T>(&args, TrimType::Left, use_string_view)
+    general_trim::<T, TrimLeft>(&args, use_string_view)
 }
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string.",
+    description = "Trims the specified trim string from the beginning of a string. If no trim string is provided, spaces are removed from the start of the input string.",
     syntax_example = "ltrim(str[, trim_str])",
     sql_example = r#"```sql
 > select ltrim('  datafusion  ');
@@ -65,13 +65,13 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = r"String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._"
+        description = r"String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is a space._"
     ),
     alternative_syntax = "trim(LEADING trim_str FROM str)",
     related_udf(name = "btrim"),
     related_udf(name = "rtrim")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LtrimFunc {
     signature: Signature,
 }
@@ -115,11 +115,7 @@ impl ScalarUDFImpl for LtrimFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types[0] == DataType::Utf8View {
-            Ok(DataType::Utf8View)
-        } else {
-            utf8_to_str_type(&arg_types[0], "ltrim")
-        }
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs
index 17ea2726b071e..3732897f3d372 100644
--- a/datafusion/functions/src/string/octet_length.rs
+++ b/datafusion/functions/src/string/octet_length.rs
@@ -45,7 +45,7 @@ use datafusion_macros::user_doc;
     related_udf(name = "bit_length"),
     related_udf(name = "length")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct OctetLengthFunc {
     signature: Signature,
 }
@@ -119,7 +119,7 @@ mod tests {
     use arrow::datatypes::DataType::Int32;
 
     use datafusion_common::ScalarValue;
-    use datafusion_common::{exec_err, Result};
+    use datafusion_common::{Result, exec_err};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
     use crate::string::octet_length::OctetLengthFunc;
diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs
index 2d36cb8356a00..65f320c4f9f13 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -18,16 +18,17 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::utf8_to_str_type;
 use arrow::array::{
-    ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
+    Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
 use datafusion_common::cast::as_int64_array;
-use datafusion_common::types::{logical_int64, logical_string, NativeType};
-use datafusion_common::{exec_err, DataFusionError, Result};
+use datafusion_common::types::{NativeType, logical_int64, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err, internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
@@ -51,7 +52,7 @@ use datafusion_macros::user_doc;
         description = "Number of times to repeat the input string."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RepeatFunc {
     signature: Signature,
 }
@@ -99,7 +100,63 @@ impl ScalarUDFImpl for RepeatFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(repeat, vec![])(&args.args)
+        let return_type = args.return_field.data_type().clone();
+        let [string_arg, count_arg] = take_function_args(self.name(), args.args)?;
+
+        // Early return if either argument is a scalar null
+        if let ColumnarValue::Scalar(s) = &string_arg
+            && s.is_null()
+        {
+            return Ok(ColumnarValue::Scalar(ScalarValue::try_from(&return_type)?));
+        }
+        if let ColumnarValue::Scalar(c) = &count_arg
+            && c.is_null()
+        {
+            return Ok(ColumnarValue::Scalar(ScalarValue::try_from(&return_type)?));
+        }
+
+        match (&string_arg, &count_arg) {
+            (
+                ColumnarValue::Scalar(string_scalar),
+                ColumnarValue::Scalar(count_scalar),
+            ) => {
+                let count = match count_scalar {
+                    ScalarValue::Int64(Some(n)) => *n,
+                    _ => {
+                        return internal_err!(
+                            "Unexpected data type {:?} for repeat count",
+                            count_scalar.data_type()
+                        );
+                    }
+                };
+
+                let result = match string_scalar {
+                    ScalarValue::Utf8(Some(s)) | ScalarValue::Utf8View(Some(s)) => {
+                        ScalarValue::Utf8(Some(compute_repeat(
+                            s,
+                            count,
+                            i32::MAX as usize,
+                        )?))
+                    }
+                    ScalarValue::LargeUtf8(Some(s)) => ScalarValue::LargeUtf8(Some(
+                        compute_repeat(s, count, i64::MAX as usize)?,
+                    )),
+                    _ => {
+                        return internal_err!(
+                            "Unexpected data type {:?} for function repeat",
+                            string_scalar.data_type()
+                        );
+                    }
+                };
+
+                Ok(ColumnarValue::Scalar(result))
+            }
+            _ => {
+                let string_array = string_arg.to_array(args.number_rows)?;
+                let count_array = count_arg.to_array(args.number_rows)?;
+                Ok(ColumnarValue::Array(repeat(&string_array, &count_array)?))
+            }
+        }
     }
 
     fn documentation(&self) -> Option<&Documentation> {
@@ -107,31 +164,48 @@ impl ScalarUDFImpl for RepeatFunc {
     }
 }
 
+/// Computes repeat for a single string value with max size check
+#[inline]
+fn compute_repeat(s: &str, count: i64, max_size: usize) -> Result<String> {
+    if count <= 0 {
+        return Ok(String::new());
+    }
+    let result_len = s.len().saturating_mul(count as usize);
+    if result_len > max_size {
+        return exec_err!(
+            "string size overflow on repeat, max size is {}, but got {}",
+            max_size,
+            result_len
+        );
+    }
+    Ok(s.repeat(count as usize))
+}
+
 /// Repeats string the specified number of times.
 /// repeat('Pg', 4) = 'PgPgPgPg'
-fn repeat(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let number_array = as_int64_array(&args[1])?;
-    match args[0].data_type() {
+fn repeat(string_array: &ArrayRef, count_array: &ArrayRef) -> Result<ArrayRef> {
+    let number_array = as_int64_array(count_array)?;
+    match string_array.data_type() {
         Utf8View => {
-            let string_view_array = args[0].as_string_view();
+            let string_view_array = string_array.as_string_view();
             repeat_impl::<i32, &StringViewArray>(
-                string_view_array,
+                &string_view_array,
                 number_array,
                 i32::MAX as usize,
             )
         }
         Utf8 => {
-            let string_array = args[0].as_string::<i32>();
+            let string_arr = string_array.as_string::<i32>();
             repeat_impl::<i32, &GenericStringArray<i32>>(
-                string_array,
+                &string_arr,
                 number_array,
                 i32::MAX as usize,
             )
         }
         LargeUtf8 => {
-            let string_array = args[0].as_string::<i64>();
+            let string_arr = string_array.as_string::<i64>();
             repeat_impl::<i64, &GenericStringArray<i64>>(
-                string_array,
+                &string_arr,
                 number_array,
                 i64::MAX as usize,
             )
@@ -144,15 +218,16 @@ fn repeat(args: &[ArrayRef]) -> Result<ArrayRef> {
 }
 
 fn repeat_impl<'a, T, S>(
-    string_array: S,
+    string_array: &S,
     number_array: &Int64Array,
     max_str_len: usize,
 ) -> Result<ArrayRef>
 where
     T: OffsetSizeTrait,
-    S: StringArrayType<'a>,
+    S: StringArrayType<'a> + 'a,
 {
     let mut total_capacity = 0;
+    let mut max_item_capacity = 0;
     string_array.iter().zip(number_array.iter()).try_for_each(
         |(string, number)| -> Result<(), DataFusionError> {
             match (string, number) {
@@ -166,6 +241,7 @@ where
                         );
                     }
                     total_capacity += item_capacity;
+                    max_item_capacity = max_item_capacity.max(item_capacity);
                 }
                 _ => (),
             }
@@ -176,21 +252,58 @@ where
     let mut builder =
         GenericStringBuilder::<T>::with_capacity(string_array.len(), total_capacity);
 
-    string_array.iter().zip(number_array.iter()).try_for_each(
-        |(string, number)| -> Result<(), DataFusionError> {
+    // Reusable buffer to avoid allocations in string.repeat()
+    let mut buffer = Vec::<u8>::with_capacity(max_item_capacity);
+
+    // Helper function to repeat a string into a buffer using doubling strategy
+    // count must be > 0
+    #[inline]
+    fn repeat_to_buffer(buffer: &mut Vec<u8>, string: &str, count: usize) {
+        buffer.clear();
+        if !string.is_empty() {
+            let src = string.as_bytes();
+            // Initial copy
+            buffer.extend_from_slice(src);
+            // Doubling strategy: copy what we have so far until we reach the target
+            while buffer.len() < src.len() * count {
+                let copy_len = buffer.len().min(src.len() * count - buffer.len());
+                // SAFETY: we're copying valid UTF-8 bytes that we already verified
+                buffer.extend_from_within(..copy_len);
+            }
+        }
+    }
+
+    // Fast path: no nulls in either array
+    if string_array.null_count() == 0 && number_array.null_count() == 0 {
+        for i in 0..string_array.len() {
+            // SAFETY: i is within bounds (0..len) and null_count() == 0 guarantees valid value
+            let string = unsafe { string_array.value_unchecked(i) };
+            let count = number_array.value(i);
+            if count > 0 {
+                repeat_to_buffer(&mut buffer, string, count as usize);
+                // SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
+                builder.append_value(unsafe { std::str::from_utf8_unchecked(&buffer) });
+            } else {
+                builder.append_value("");
+            }
+        }
+    } else {
+        // Slow path: handle nulls
+        for (string, number) in string_array.iter().zip(number_array.iter()) {
             match (string, number) {
-                (Some(string), Some(number)) if number >= 0 => {
-                    builder.append_value(string.repeat(number as usize));
+                (Some(string), Some(count)) if count > 0 => {
+                    repeat_to_buffer(&mut buffer, string, count as usize);
+                    // SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
+                    builder
+                        .append_value(unsafe { std::str::from_utf8_unchecked(&buffer) });
                 }
                 (Some(_), Some(_)) => builder.append_value(""),
                 _ => builder.append_null(),
             }
-            Ok(())
-        },
-    )?;
-    let array = builder.finish();
+        }
+    }
 
-    Ok(Arc::new(array) as ArrayRef)
+    Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
 #[cfg(test)]
@@ -199,7 +312,7 @@ mod tests {
     use arrow::datatypes::DataType::Utf8;
 
     use datafusion_common::ScalarValue;
-    use datafusion_common::{exec_err, Result};
+    use datafusion_common::{Result, exec_err};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
     use crate::string::repeat::RepeatFunc;
diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs
index de70215c49c77..458b86d0c6fb0 100644
--- a/datafusion/functions/src/string/replace.rs
+++ b/datafusion/functions/src/string/replace.rs
@@ -18,13 +18,13 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
+use arrow::array::{ArrayRef, GenericStringBuilder, OffsetSizeTrait};
 use arrow::datatypes::DataType;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::type_coercion::binary::{
     binary_to_string_coercion, string_coercion,
 };
@@ -52,7 +52,7 @@ use datafusion_macros::user_doc;
     ),
     standard_argument(name = "replacement", prefix = "Replacement substring")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ReplaceFunc {
     signature: Signature,
 }
@@ -101,7 +101,9 @@ impl ScalarUDFImpl for ReplaceFunc {
         {
             utf8_to_str_type(&coercion_data_type, "replace")
         } else {
-            exec_err!("Unsupported data types for replace. Expected Utf8, LargeUtf8 or Utf8View")
+            exec_err!(
+                "Unsupported data types for replace. Expected Utf8, LargeUtf8 or Utf8View"
+            )
         }
     }
 
@@ -145,7 +147,7 @@ impl ScalarUDFImpl for ReplaceFunc {
             }
         } else {
             exec_err!(
-                "Unsupported data type {:?}, {:?}, {:?} for function replace.",
+                "Unsupported data type {}, {:?}, {:?} for function replace.",
                 data_types[0],
                 data_types[1],
                 data_types[2]
@@ -163,17 +165,25 @@ fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> {
     let from_array = as_string_view_array(&args[1])?;
     let to_array = as_string_view_array(&args[2])?;
 
-    let result = string_array
+    let mut builder = GenericStringBuilder::<i32>::new();
+    let mut buffer = String::new();
+
+    for ((string, from), to) in string_array
         .iter()
         .zip(from_array.iter())
         .zip(to_array.iter())
-        .map(|((string, from), to)| match (string, from, to) {
-            (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)),
-            _ => None,
-        })
-        .collect::<StringArray>();
+    {
+        match (string, from, to) {
+            (Some(string), Some(from), Some(to)) => {
+                buffer.clear();
+                replace_into_string(&mut buffer, string, from, to);
+                builder.append_value(&buffer);
+            }
+            _ => builder.append_null(),
+        }
+    }
 
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
 /// Replaces all occurrences in string of substring from with substring to.
@@ -183,17 +193,66 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let from_array = as_generic_string_array::<T>(&args[1])?;
     let to_array = as_generic_string_array::<T>(&args[2])?;
 
-    let result = string_array
+    let mut builder = GenericStringBuilder::<T>::new();
+    let mut buffer = String::new();
+
+    for ((string, from), to) in string_array
         .iter()
         .zip(from_array.iter())
         .zip(to_array.iter())
-        .map(|((string, from), to)| match (string, from, to) {
-            (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)),
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
+    {
+        match (string, from, to) {
+            (Some(string), Some(from), Some(to)) => {
+                buffer.clear();
+                replace_into_string(&mut buffer, string, from, to);
+                builder.append_value(&buffer);
+            }
+            _ => builder.append_null(),
+        }
+    }
 
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(Arc::new(builder.finish()) as ArrayRef)
+}
+
+/// Helper function to perform string replacement into a reusable String buffer
+#[inline]
+fn replace_into_string(buffer: &mut String, string: &str, from: &str, to: &str) {
+    if from.is_empty() {
+        // When from is empty, insert 'to' at the beginning, between each character, and at the end
+        // This matches the behavior of str::replace()
+        buffer.push_str(to);
+        for ch in string.chars() {
+            buffer.push(ch);
+            buffer.push_str(to);
+        }
+        return;
+    }
+
+    // Fast path for replacing a single ASCII character with another single ASCII character.
+    // Extends the buffer's underlying Vec<u8> directly, for performance.
+    if let ([from_byte], [to_byte]) = (from.as_bytes(), to.as_bytes())
+        && from_byte.is_ascii()
+        && to_byte.is_ascii()
+    {
+        // SAFETY: Replacing an ASCII byte with another ASCII byte preserves UTF-8 validity.
+        unsafe {
+            buffer.as_mut_vec().extend(
+                string
+                    .as_bytes()
+                    .iter()
+                    .map(|&b| if b == *from_byte { *to_byte } else { b }),
+            );
+        }
+        return;
+    }
+
+    let mut last_end = 0;
+    for (start, _part) in string.match_indices(from) {
+        buffer.push_str(&string[last_end..start]);
+        buffer.push_str(to);
+        last_end = start + from.len();
+    }
+    buffer.push_str(&string[last_end..]);
 }
 
 #[cfg(test)]
diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs
index bb33274978daf..5659d0acfd97c 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -21,9 +21,9 @@ use std::any::Any;
 use std::sync::Arc;
 
 use crate::string::common::*;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::make_scalar_function;
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::function::Hint;
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
@@ -31,7 +31,7 @@ use datafusion_expr::{
 };
 use datafusion_macros::user_doc;
 
-/// Returns the longest string  with trailing characters removed. If the characters are not specified, whitespace is removed.
+/// Returns the longest string with trailing characters removed. If the characters are not specified, spaces are removed.
 /// rtrim('testxxzx', 'xyz') = 'test'
 fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -41,12 +41,12 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     } else {
         args.to_owned()
     };
-    general_trim::<T>(&args, TrimType::Right, use_string_view)
+    general_trim::<T, TrimRight>(&args, use_string_view)
 }
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Trims the specified trim string from the end of a string. If no trim string is provided, all whitespace is removed from the end of the input string.",
+    description = "Trims the specified trim string from the end of a string. If no trim string is provided, all spaces are removed from the end of the input string.",
     syntax_example = "rtrim(str[, trim_str])",
     alternative_syntax = "trim(TRAILING trim_str FROM str)",
     sql_example = r#"```sql
@@ -66,12 +66,12 @@ fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(
         name = "trim_str",
-        description = "String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._"
+        description = "String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is a space._"
     ),
     related_udf(name = "btrim"),
     related_udf(name = "ltrim")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RtrimFunc {
     signature: Signature,
 }
@@ -115,11 +115,7 @@ impl ScalarUDFImpl for RtrimFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types[0] == DataType::Utf8View {
-            Ok(DataType::Utf8View)
-        } else {
-            utf8_to_str_type(&arg_types[0], "rtrim")
-        }
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs
index 724d9c278cca5..0bd197818e4e2 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -22,10 +22,13 @@ use arrow::array::{
 };
 use arrow::array::{AsArray, GenericStringBuilder};
 use arrow::datatypes::DataType;
-use datafusion_common::cast::as_int64_array;
 use datafusion_common::ScalarValue;
-use datafusion_common::{exec_err, DataFusionError, Result};
-use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility};
+use datafusion_common::cast::as_int64_array;
+use datafusion_common::types::{NativeType, logical_int64, logical_string};
+use datafusion_common::{DataFusionError, Result, exec_datafusion_err, exec_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, TypeSignatureClass, Volatility,
+};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -45,9 +48,12 @@ use std::sync::Arc;
 ```"#,
     standard_argument(name = "str", prefix = "String"),
     argument(name = "delimiter", description = "String or character to split on."),
-    argument(name = "pos", description = "Position of the part to return.")
+    argument(
+        name = "pos",
+        description = "Position of the part to return (counting from 1). Negative values count backward from the end of the string."
+    )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SplitPartFunc {
     signature: Signature,
 }
@@ -60,19 +66,16 @@ impl Default for SplitPartFunc {
 
 impl SplitPartFunc {
     pub fn new() -> Self {
-        use DataType::*;
         Self {
-            signature: Signature::one_of(
+            signature: Signature::coercible(
                 vec![
-                    TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]),
-                    TypeSignature::Exact(vec![Utf8View, Utf8, Int64]),
-                    TypeSignature::Exact(vec![Utf8View, LargeUtf8, Int64]),
-                    TypeSignature::Exact(vec![Utf8, Utf8View, Int64]),
-                    TypeSignature::Exact(vec![Utf8, Utf8, Int64]),
-                    TypeSignature::Exact(vec![LargeUtf8, Utf8View, Int64]),
-                    TypeSignature::Exact(vec![LargeUtf8, Utf8, Int64]),
-                    TypeSignature::Exact(vec![Utf8, LargeUtf8, Int64]),
-                    TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_int64()),
+                        vec![TypeSignatureClass::Integer],
+                        NativeType::Int64,
+                    ),
                 ],
                 Volatility::Immutable,
             ),
@@ -123,64 +126,64 @@ impl ScalarUDFImpl for SplitPartFunc {
         let result = match (args[0].data_type(), args[1].data_type()) {
             (DataType::Utf8View, DataType::Utf8View) => {
                 split_part_impl::<&StringViewArray, &StringViewArray, i32>(
-                    args[0].as_string_view(),
-                    args[1].as_string_view(),
+                    &args[0].as_string_view(),
+                    &args[1].as_string_view(),
                     n_array,
                 )
             }
             (DataType::Utf8View, DataType::Utf8) => {
                 split_part_impl::<&StringViewArray, &GenericStringArray<i32>, i32>(
-                    args[0].as_string_view(),
-                    args[1].as_string::<i32>(),
+                    &args[0].as_string_view(),
+                    &args[1].as_string::<i32>(),
                     n_array,
                 )
             }
             (DataType::Utf8View, DataType::LargeUtf8) => {
                 split_part_impl::<&StringViewArray, &GenericStringArray<i64>, i32>(
-                    args[0].as_string_view(),
-                    args[1].as_string::<i64>(),
+                    &args[0].as_string_view(),
+                    &args[1].as_string::<i64>(),
                     n_array,
                 )
             }
             (DataType::Utf8, DataType::Utf8View) => {
                 split_part_impl::<&GenericStringArray<i32>, &StringViewArray, i32>(
-                    args[0].as_string::<i32>(),
-                    args[1].as_string_view(),
+                    &args[0].as_string::<i32>(),
+                    &args[1].as_string_view(),
                     n_array,
                 )
             }
             (DataType::LargeUtf8, DataType::Utf8View) => {
                 split_part_impl::<&GenericStringArray<i64>, &StringViewArray, i64>(
-                    args[0].as_string::<i64>(),
-                    args[1].as_string_view(),
+                    &args[0].as_string::<i64>(),
+                    &args[1].as_string_view(),
                     n_array,
                 )
             }
             (DataType::Utf8, DataType::Utf8) => {
                 split_part_impl::<&GenericStringArray<i32>, &GenericStringArray<i32>, i32>(
-                    args[0].as_string::<i32>(),
-                    args[1].as_string::<i32>(),
+                    &args[0].as_string::<i32>(),
+                    &args[1].as_string::<i32>(),
                     n_array,
                 )
             }
             (DataType::LargeUtf8, DataType::LargeUtf8) => {
                 split_part_impl::<&GenericStringArray<i64>, &GenericStringArray<i64>, i64>(
-                    args[0].as_string::<i64>(),
-                    args[1].as_string::<i64>(),
+                    &args[0].as_string::<i64>(),
+                    &args[1].as_string::<i64>(),
                     n_array,
                 )
             }
             (DataType::Utf8, DataType::LargeUtf8) => {
                 split_part_impl::<&GenericStringArray<i32>, &GenericStringArray<i64>, i32>(
-                    args[0].as_string::<i32>(),
-                    args[1].as_string::<i64>(),
+                    &args[0].as_string::<i32>(),
+                    &args[1].as_string::<i64>(),
                     n_array,
                 )
             }
             (DataType::LargeUtf8, DataType::Utf8) => {
                 split_part_impl::<&GenericStringArray<i64>, &GenericStringArray<i32>, i64>(
-                    args[0].as_string::<i64>(),
-                    args[1].as_string::<i32>(),
+                    &args[0].as_string::<i64>(),
+                    &args[1].as_string::<i32>(),
                     n_array,
                 )
             }
@@ -200,10 +203,9 @@ impl ScalarUDFImpl for SplitPartFunc {
     }
 }
 
-/// impl
-pub fn split_part_impl<'a, StringArrType, DelimiterArrType, StringArrayLen>(
-    string_array: StringArrType,
-    delimiter_array: DelimiterArrType,
+fn split_part_impl<'a, StringArrType, DelimiterArrType, StringArrayLen>(
+    string_array: &StringArrType,
+    delimiter_array: &DelimiterArrType,
     n_array: &Int64Array,
 ) -> Result<ArrayRef>
 where
@@ -220,22 +222,47 @@ where
         .try_for_each(|((string, delimiter), n)| -> Result<(), DataFusionError> {
             match (string, delimiter, n) {
                 (Some(string), Some(delimiter), Some(n)) => {
-                    let split_string: Vec<&str> = string.split(delimiter).collect();
-                    let len = split_string.len();
+                    let result = match n.cmp(&0) {
+                        std::cmp::Ordering::Greater => {
+                            // Positive index: use nth() to avoid collecting all parts
+                            // This stops iteration as soon as we find the nth element
+                            let idx: usize = (n - 1).try_into().map_err(|_| {
+                                exec_datafusion_err!(
+                                    "split_part index {n} exceeds maximum supported value"
+                                )
+                            })?;
 
-                    let index = match n.cmp(&0) {
-                        std::cmp::Ordering::Less => len as i64 + n,
+                            if delimiter.is_empty() {
+                                // Match PostgreSQL split_part behavior for empty delimiter:
+                                // treat the input as a single field ("ab" -> ["ab"]),
+                                // rather than Rust's split("") result (["", "a", "b", ""]).
+                                (n == 1).then_some(string)
+                            } else {
+                                string.split(delimiter).nth(idx)
+                            }
+                        }
+                        std::cmp::Ordering::Less => {
+                            // Negative index: use rsplit().nth() to efficiently get from the end
+                            // rsplit iterates in reverse, so -1 means first from rsplit (index 0)
+                            let idx: usize = (n.unsigned_abs() - 1).try_into().map_err(|_| {
+                                exec_datafusion_err!(
+                                    "split_part index {n} exceeds minimum supported value"
+                                )
+                            })?;
+                            if delimiter.is_empty() {
+                                // Match PostgreSQL split_part behavior for empty delimiter:
+                                // treat the input as a single field ("ab" -> ["ab"]),
+                                // rather than Rust's split("") result (["", "a", "b", ""]).
+                                (n == -1).then_some(string)
+                            } else {
+                                string.rsplit(delimiter).nth(idx)
+                            }
+                        }
                         std::cmp::Ordering::Equal => {
                             return exec_err!("field position must not be zero");
                         }
-                        std::cmp::Ordering::Greater => n - 1,
-                    } as usize;
-
-                    if index < len {
-                        builder.append_value(split_string[index]);
-                    } else {
-                        builder.append_value("");
-                    }
+                    };
+                    builder.append_value(result.unwrap_or(""));
                 }
                 _ => builder.append_null(),
             }
@@ -251,7 +278,7 @@ mod tests {
     use arrow::datatypes::DataType::Utf8;
 
     use datafusion_common::ScalarValue;
-    use datafusion_common::{exec_err, Result};
+    use datafusion_common::{Result, exec_err};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
     use crate::string::split_part::SplitPartFunc;
@@ -315,6 +342,131 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(
+                    "abc~@~def~@~ghi"
+                )))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("~@~")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(i64::MIN))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Edge cases with delimiters
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
+
+        // Edge cases with delimiters with negative n
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
+            ],
+            Ok(Some("a,b")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            SplitPartFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
 
         Ok(())
     }
diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs
index a59d7080a5804..e50bd9f657669 100644
--- a/datafusion/functions/src/string/starts_with.rs
+++ b/datafusion/functions/src/string/starts_with.rs
@@ -18,47 +18,22 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow::array::ArrayRef;
+use arrow::array::{ArrayRef, Scalar};
+use arrow::compute::kernels::comparison::starts_with as arrow_starts_with;
 use arrow::datatypes::DataType;
-use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_common::types::logical_string;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
 use datafusion_expr::type_coercion::binary::{
     binary_to_string_coercion, string_coercion,
 };
-
-use crate::utils::make_scalar_function;
-use datafusion_common::types::logical_string;
-use datafusion_common::{internal_err, Result, ScalarValue};
 use datafusion_expr::{
-    cast, Coercion, ColumnarValue, Documentation, Expr, Like, ScalarFunctionArgs,
-    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
+    Coercion, ColumnarValue, Documentation, Expr, Like, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, cast,
 };
 use datafusion_macros::user_doc;
 
-/// Returns true if string starts with prefix.
-/// starts_with('alphabet', 'alph') = 't'
-fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> {
-    if let Some(coercion_data_type) =
-        string_coercion(args[0].data_type(), args[1].data_type()).or_else(|| {
-            binary_to_string_coercion(args[0].data_type(), args[1].data_type())
-        })
-    {
-        let arg0 = if args[0].data_type() == &coercion_data_type {
-            Arc::clone(&args[0])
-        } else {
-            arrow::compute::kernels::cast::cast(&args[0], &coercion_data_type)?
-        };
-        let arg1 = if args[1].data_type() == &coercion_data_type {
-            Arc::clone(&args[1])
-        } else {
-            arrow::compute::kernels::cast::cast(&args[1], &coercion_data_type)?
-        };
-        let result = arrow::compute::kernels::comparison::starts_with(&arg0, &arg1)?;
-        Ok(Arc::new(result) as ArrayRef)
-    } else {
-        internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")
-    }
-}
-
 #[user_doc(
     doc_section(label = "String Functions"),
     description = "Tests if a string starts with a substring.",
@@ -74,7 +49,7 @@ fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> {
     standard_argument(name = "str", prefix = "String"),
     argument(name = "substr", description = "Substring to test for.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct StartsWithFunc {
     signature: Signature,
 }
@@ -117,32 +92,95 @@ impl ScalarUDFImpl for StartsWithFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        match args.args[0].data_type() {
-            DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => {
-                make_scalar_function(starts_with, vec![])(&args.args)
+        let [str_arg, prefix_arg] = take_function_args(self.name(), &args.args)?;
+
+        // Determine the common type for coercion
+        let coercion_type = string_coercion(
+            &str_arg.data_type(),
+            &prefix_arg.data_type(),
+        )
+        .or_else(|| {
+            binary_to_string_coercion(&str_arg.data_type(), &prefix_arg.data_type())
+        });
+
+        let Some(coercion_type) = coercion_type else {
+            return exec_err!(
+                "Unsupported data types {:?}, {:?} for function `starts_with`.",
+                str_arg.data_type(),
+                prefix_arg.data_type()
+            );
+        };
+
+        // Helper to cast an array if needed
+        let maybe_cast = |arr: &ArrayRef, target: &DataType| -> Result<ArrayRef> {
+            if arr.data_type() == target {
+                Ok(Arc::clone(arr))
+            } else {
+                Ok(arrow::compute::kernels::cast::cast(arr, target)?)
+            }
+        };
+
+        match (str_arg, prefix_arg) {
+            // Both scalars - just compute directly
+            (ColumnarValue::Scalar(str_scalar), ColumnarValue::Scalar(prefix_scalar)) => {
+                let str_arr = str_scalar.to_array_of_size(1)?;
+                let prefix_arr = prefix_scalar.to_array_of_size(1)?;
+                let str_arr = maybe_cast(&str_arr, &coercion_type)?;
+                let prefix_arr = maybe_cast(&prefix_arr, &coercion_type)?;
+                let result = arrow_starts_with(&str_arr, &prefix_arr)?;
+                Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+                    &result, 0,
+                )?))
+            }
+            // String is array, prefix is scalar - use Scalar wrapper for optimization
+            (ColumnarValue::Array(str_arr), ColumnarValue::Scalar(prefix_scalar)) => {
+                let str_arr = maybe_cast(str_arr, &coercion_type)?;
+                let prefix_arr = prefix_scalar.to_array_of_size(1)?;
+                let prefix_arr = maybe_cast(&prefix_arr, &coercion_type)?;
+                let prefix_scalar = Scalar::new(prefix_arr);
+                let result = arrow_starts_with(&str_arr, &prefix_scalar)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            // String is scalar, prefix is array - use Scalar wrapper for string
+            (ColumnarValue::Scalar(str_scalar), ColumnarValue::Array(prefix_arr)) => {
+                let str_arr = str_scalar.to_array_of_size(1)?;
+                let str_arr = maybe_cast(&str_arr, &coercion_type)?;
+                let str_scalar = Scalar::new(str_arr);
+                let prefix_arr = maybe_cast(prefix_arr, &coercion_type)?;
+                let result = arrow_starts_with(&str_scalar, &prefix_arr)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            // Both arrays - pass directly
+            (ColumnarValue::Array(str_arr), ColumnarValue::Array(prefix_arr)) => {
+                let str_arr = maybe_cast(str_arr, &coercion_type)?;
+                let prefix_arr = maybe_cast(prefix_arr, &coercion_type)?;
+                let result = arrow_starts_with(&str_arr, &prefix_arr)?;
+                Ok(ColumnarValue::Array(Arc::new(result)))
             }
-            _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?,
         }
     }
 
     fn simplify(
         &self,
         args: Vec<Expr>,
-        info: &dyn SimplifyInfo,
+        info: &SimplifyContext,
     ) -> Result<ExprSimplifyResult> {
-        if let Expr::Literal(scalar_value) = &args[1] {
+        if let Expr::Literal(scalar_value, _) = &args[1] {
             // Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping
-            // Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%'
-            //   1. 'ja%'         (input pattern)
-            //   2. 'ja\%'        (escape special char '%')
-            //   3. 'ja\%%'       (add suffix for starts_with)
+            // Escapes pattern characters: starts_with(col, 'j\_a%') -> col LIKE 'j\\\_a\%%'
+            //   1. 'j\_a%'         (input pattern)
+            //   2. 'j\\\_a\%'       (escape special chars '%', '_' and '\')
+            //   3. 'j\\\_a\%%'      (add unescaped % suffix for starts_with)
             let like_expr = match scalar_value {
                 ScalarValue::Utf8(Some(pattern))
                 | ScalarValue::LargeUtf8(Some(pattern))
                 | ScalarValue::Utf8View(Some(pattern)) => {
-                    let escaped_pattern = pattern.replace("%", "\\%");
+                    let escaped_pattern = pattern
+                        .replace("\\", "\\\\")
+                        .replace("%", "\\%")
+                        .replace("_", "\\_");
                     let like_pattern = format!("{escaped_pattern}%");
-                    Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))
+                    Expr::Literal(ScalarValue::Utf8(Some(like_pattern)), None)
                 }
                 _ => return Ok(ExprSimplifyResult::Original(args)),
             };
@@ -188,16 +226,19 @@ impl ScalarUDFImpl for StartsWithFunc {
 #[cfg(test)]
 mod tests {
     use crate::utils::test::test_function;
-    use arrow::array::{Array, BooleanArray};
+    use arrow::array::{Array, BooleanArray, StringArray};
     use arrow::datatypes::DataType::Boolean;
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
 
     use super::*;
 
     #[test]
-    fn test_functions() -> Result<()> {
-        // Generate test cases for starts_with
+    fn test_scalar_scalar() -> Result<()> {
+        // Test Scalar + Scalar combinations
         let test_cases = vec![
             (Some("alphabet"), Some("alph"), Some(true)),
             (Some("alphabet"), Some("bet"), Some(false)),
@@ -241,4 +282,154 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_array_scalar() -> Result<()> {
+        // Test Array + Scalar (the optimized path)
+        let array = ColumnarValue::Array(Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("alphabet"),
+            Some("beta"),
+            None,
+        ])));
+        let scalar = ColumnarValue::Scalar(ScalarValue::Utf8(Some("alph".to_string())));
+
+        let args = vec![array, scalar];
+        test_function!(
+            StartsWithFunc::new(),
+            args,
+            Ok(Some(true)), // First element result
+            bool,
+            Boolean,
+            BooleanArray
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_scalar_full_result() {
+        // Test Array + Scalar and verify all results
+        let func = StartsWithFunc::new();
+        let array = Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("alphabet"),
+            Some("beta"),
+            None,
+        ]));
+        let args = vec![
+            ColumnarValue::Array(array),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("alph".to_string()))),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" starts with "alph"
+        assert!(bool_array.value(1)); // "alphabet" starts with "alph"
+        assert!(!bool_array.value(2)); // "beta" does not start with "alph"
+        assert!(bool_array.is_null(3)); // null input -> null output
+    }
+
+    #[test]
+    fn test_scalar_array() {
+        // Test Scalar + Array
+        let func = StartsWithFunc::new();
+        let prefixes = Arc::new(StringArray::from(vec![
+            Some("alph"),
+            Some("bet"),
+            Some("alpha"),
+            None,
+        ]));
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some("alphabet".to_string()))),
+            ColumnarValue::Array(prefixes),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" starts with "alph"
+        assert!(!bool_array.value(1)); // "alphabet" does not start with "bet"
+        assert!(bool_array.value(2)); // "alphabet" starts with "alpha"
+        assert!(bool_array.is_null(3)); // null prefix -> null output
+    }
+
+    #[test]
+    fn test_array_array() {
+        // Test Array + Array
+        let func = StartsWithFunc::new();
+        let strings = Arc::new(StringArray::from(vec![
+            Some("alphabet"),
+            Some("rust"),
+            Some("datafusion"),
+            None,
+        ]));
+        let prefixes = Arc::new(StringArray::from(vec![
+            Some("alph"),
+            Some("ru"),
+            Some("hello"),
+            Some("test"),
+        ]));
+        let args = vec![
+            ColumnarValue::Array(strings),
+            ColumnarValue::Array(prefixes),
+        ];
+
+        let result = func
+            .invoke_with_args(ScalarFunctionArgs {
+                args,
+                arg_fields: vec![
+                    Field::new("a", DataType::Utf8, true).into(),
+                    Field::new("b", DataType::Utf8, true).into(),
+                ],
+                number_rows: 4,
+                return_field: Field::new("f", Boolean, true).into(),
+                config_options: Arc::new(ConfigOptions::default()),
+            })
+            .unwrap();
+
+        let result_array = result.into_array(4).unwrap();
+        let bool_array = result_array
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+
+        assert!(bool_array.value(0)); // "alphabet" starts with "alph"
+        assert!(bool_array.value(1)); // "rust" starts with "ru"
+        assert!(!bool_array.value(2)); // "datafusion" does not start with "hello"
+        assert!(bool_array.is_null(3)); // null string -> null output
+    }
 }
diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs
index a3a1acfcf1f05..ed8ce07b876d5 100644
--- a/datafusion/functions/src/string/to_hex.rs
+++ b/datafusion/functions/src/string/to_hex.rs
@@ -16,59 +16,166 @@
 // under the License.
 
 use std::any::Any;
-use std::fmt::Write;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, GenericStringBuilder, OffsetSizeTrait};
+use arrow::array::{Array, ArrayRef, StringArray};
+use arrow::buffer::{Buffer, OffsetBuffer};
 use arrow::datatypes::{
-    ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type,
+    ArrowNativeType, ArrowPrimitiveType, DataType, Int8Type, Int16Type, Int32Type,
+    Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type,
 };
-
-use crate::utils::make_scalar_function;
 use datafusion_common::cast::as_primitive_array;
-use datafusion_common::Result;
-use datafusion_common::{exec_err, plan_err};
-
-use datafusion_expr::{ColumnarValue, Documentation};
-use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
 use datafusion_macros::user_doc;
 
+/// Hex lookup table for fast conversion
+const HEX_CHARS: &[u8; 16] = b"0123456789abcdef";
+
 /// Converts the number to its equivalent hexadecimal representation.
 /// to_hex(2147483647) = '7fffffff'
-pub fn to_hex<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
+fn to_hex_array<T: ArrowPrimitiveType>(array: &ArrayRef) -> Result<ArrayRef>
 where
-    T::Native: OffsetSizeTrait,
+    T::Native: ToHex,
 {
-    let integer_array = as_primitive_array::<T>(&args[0])?;
+    let integer_array = as_primitive_array::<T>(array)?;
+    let len = integer_array.len();
 
-    let mut result = GenericStringBuilder::<i32>::with_capacity(
-        integer_array.len(),
-        // * 8 to convert to bits, / 4 bits per hex char
-        integer_array.len() * (T::Native::get_byte_width() * 8 / 4),
-    );
+    // Max hex string length: 16 chars for u64/i64
+    let max_hex_len = T::Native::get_byte_width() * 2;
 
-    for integer in integer_array {
-        if let Some(value) = integer {
-            if let Some(value_usize) = value.to_usize() {
-                write!(result, "{value_usize:x}")?;
-            } else if let Some(value_isize) = value.to_isize() {
-                write!(result, "{value_isize:x}")?;
-            } else {
-                return exec_err!(
-                    "Unsupported data type {integer:?} for function to_hex"
-                );
-            }
-            result.append_value("");
-        } else {
-            result.append_null();
-        }
+    // Pre-allocate buffers - avoid the builder API overhead
+    let mut offsets: Vec<i32> = Vec::with_capacity(len + 1);
+    let mut values: Vec<u8> = Vec::with_capacity(len * max_hex_len);
+
+    // Reusable buffer for hex conversion
+    let mut hex_buffer = [0u8; 16];
+
+    // Start with offset 0
+    offsets.push(0);
+
+    // Process all values directly (including null slots - we write empty strings for nulls)
+    // The null bitmap will mark which entries are actually null
+    for value in integer_array.values() {
+        let hex_len = value.write_hex_to_buffer(&mut hex_buffer);
+        values.extend_from_slice(&hex_buffer[16 - hex_len..]);
+        offsets.push(values.len() as i32);
     }
 
-    let result = result.finish();
+    // Copy null bitmap from input (nulls pass through unchanged)
+    let nulls = integer_array.nulls().cloned();
+
+    // SAFETY: offsets are valid (monotonically increasing, last value equals values.len())
+    // and values contains valid UTF-8 (only ASCII hex digits)
+    let offsets =
+        unsafe { OffsetBuffer::new_unchecked(Buffer::from_vec(offsets).into()) };
+    let result = StringArray::new(offsets, Buffer::from_vec(values), nulls);
 
     Ok(Arc::new(result) as ArrayRef)
 }
 
+#[inline]
+fn to_hex_scalar<T: ToHex>(value: T) -> String {
+    let mut hex_buffer = [0u8; 16];
+    let hex_len = value.write_hex_to_buffer(&mut hex_buffer);
+    // SAFETY: hex_buffer is ASCII hex digits
+    unsafe { std::str::from_utf8_unchecked(&hex_buffer[16 - hex_len..]).to_string() }
+}
+
+/// Trait for converting integer types to hexadecimal in a buffer
+trait ToHex: ArrowNativeType {
+    /// Write hex representation to buffer and return the number of hex digits written.
+    /// The hex digits are written right-aligned in the buffer (starting from position 16 - len).
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize;
+}
+
+/// Write unsigned value to hex buffer and return the number of digits written.
+/// Digits are written right-aligned in the buffer.
+#[inline]
+fn write_unsigned_hex_to_buffer(value: u64, buffer: &mut [u8; 16]) -> usize {
+    if value == 0 {
+        buffer[15] = b'0';
+        return 1;
+    }
+
+    // Write hex digits from right to left
+    let mut pos = 16;
+    let mut v = value;
+    while v > 0 {
+        pos -= 1;
+        buffer[pos] = HEX_CHARS[(v & 0xf) as usize];
+        v >>= 4;
+    }
+
+    16 - pos
+}
+
+/// Write signed value to hex buffer (two's complement for negative) and return digit count
+#[inline]
+fn write_signed_hex_to_buffer(value: i64, buffer: &mut [u8; 16]) -> usize {
+    // For negative values, use two's complement representation (same as casting to u64)
+    write_unsigned_hex_to_buffer(value as u64, buffer)
+}
+
+impl ToHex for i8 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_signed_hex_to_buffer(self as i64, buffer)
+    }
+}
+
+impl ToHex for i16 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_signed_hex_to_buffer(self as i64, buffer)
+    }
+}
+
+impl ToHex for i32 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_signed_hex_to_buffer(self as i64, buffer)
+    }
+}
+
+impl ToHex for i64 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_signed_hex_to_buffer(self, buffer)
+    }
+}
+
+impl ToHex for u8 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_unsigned_hex_to_buffer(self as u64, buffer)
+    }
+}
+
+impl ToHex for u16 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_unsigned_hex_to_buffer(self as u64, buffer)
+    }
+}
+
+impl ToHex for u32 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_unsigned_hex_to_buffer(self as u64, buffer)
+    }
+}
+
+impl ToHex for u64 {
+    #[inline]
+    fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
+        write_unsigned_hex_to_buffer(self, buffer)
+    }
+}
+
 #[user_doc(
     doc_section(label = "String Functions"),
     description = "Converts an integer to a hexadecimal string.",
@@ -83,7 +190,7 @@ where
 ```"#,
     standard_argument(name = "int", prefix = "Integer")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToHexFunc {
     signature: Signature,
 }
@@ -96,9 +203,11 @@ impl Default for ToHexFunc {
 
 impl ToHexFunc {
     pub fn new() -> Self {
-        use DataType::*;
         Self {
-            signature: Signature::uniform(1, vec![Int64], Volatility::Immutable),
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Integer)],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -116,26 +225,76 @@ impl ScalarUDFImpl for ToHexFunc {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        use DataType::*;
-
-        Ok(match arg_types[0] {
-            Int8 | Int16 | Int32 | Int64 => Utf8,
-            _ => {
-                return plan_err!("The to_hex function can only accept integers.");
-            }
-        })
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        match args.args[0].data_type() {
-            DataType::Int32 => {
-                make_scalar_function(to_hex::<Int32Type>, vec![])(&args.args)
-            }
-            DataType::Int64 => {
-                make_scalar_function(to_hex::<Int64Type>, vec![])(&args.args)
+        let arg = &args.args[0];
+
+        match arg {
+            ColumnarValue::Scalar(ScalarValue::Int64(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::UInt64(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::UInt32(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::Int16(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::UInt16(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::Int8(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::UInt8(Some(v))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))),
+            ),
+
+            // NULL scalars
+            ColumnarValue::Scalar(s) if s.is_null() => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
             }
-            other => exec_err!("Unsupported data type {other:?} for function to_hex"),
+
+            ColumnarValue::Array(array) => match array.data_type() {
+                DataType::Int64 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<Int64Type>(array)?))
+                }
+                DataType::UInt64 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<UInt64Type>(array)?))
+                }
+                DataType::Int32 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<Int32Type>(array)?))
+                }
+                DataType::UInt32 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<UInt32Type>(array)?))
+                }
+                DataType::Int16 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<Int16Type>(array)?))
+                }
+                DataType::UInt16 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<UInt16Type>(array)?))
+                }
+                DataType::Int8 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<Int8Type>(array)?))
+                }
+                DataType::UInt8 => {
+                    Ok(ColumnarValue::Array(to_hex_array::<UInt8Type>(array)?))
+                }
+                other => exec_err!("Unsupported data type {other:?} for function to_hex"),
+            },
+
+            other => internal_err!(
+                "Unexpected argument type {:?} for function to_hex",
+                other.data_type()
+            ),
         }
     }
 
@@ -146,48 +305,92 @@ impl ScalarUDFImpl for ToHexFunc {
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Int32Array, StringArray};
-
+    use arrow::array::{
+        Int8Array, Int16Array, Int32Array, Int64Array, StringArray, UInt8Array,
+        UInt16Array, UInt32Array, UInt64Array,
+    };
     use datafusion_common::cast::as_string_array;
 
     use super::*;
 
-    #[test]
-    // Test to_hex function for zero
-    fn to_hex_zero() -> Result<()> {
-        let array = vec![0].into_iter().collect::<Int32Array>();
-        let array_ref = Arc::new(array);
-        let hex_value_arc = to_hex::<Int32Type>(&[array_ref])?;
-        let hex_value = as_string_array(&hex_value_arc)?;
-        let expected = StringArray::from(vec![Some("0")]);
-        assert_eq!(&expected, hex_value);
+    macro_rules! test_to_hex_type {
+        // Default test with standard input/output
+        ($name:ident, $arrow_type:ty, $array_type:ty) => {
+            test_to_hex_type!(
+                $name,
+                $arrow_type,
+                $array_type,
+                vec![Some(100), Some(0), None],
+                vec![Some("64"), Some("0"), None]
+            );
+        };
 
-        Ok(())
-    }
+        // Custom test with custom input/output (eg: positive number)
+        ($name:ident, $arrow_type:ty, $array_type:ty, $input:expr, $expected:expr) => {
+            #[test]
+            fn $name() -> Result<()> {
+                let input = $input;
+                let expected = $expected;
 
-    #[test]
-    // Test to_hex function for positive number
-    fn to_hex_positive_number() -> Result<()> {
-        let array = vec![100].into_iter().collect::<Int32Array>();
-        let array_ref = Arc::new(array);
-        let hex_value_arc = to_hex::<Int32Type>(&[array_ref])?;
-        let hex_value = as_string_array(&hex_value_arc)?;
-        let expected = StringArray::from(vec![Some("64")]);
-        assert_eq!(&expected, hex_value);
+                let array = <$array_type>::from(input);
+                let array_ref: ArrayRef = Arc::new(array);
+                let hex_result = to_hex_array::<$arrow_type>(&array_ref)?;
+                let hex_array = as_string_array(&hex_result)?;
+                let expected_array = StringArray::from(expected);
 
-        Ok(())
+                assert_eq!(&expected_array, hex_array);
+                Ok(())
+            }
+        };
     }
 
-    #[test]
-    // Test to_hex function for negative number
-    fn to_hex_negative_number() -> Result<()> {
-        let array = vec![-1].into_iter().collect::<Int32Array>();
-        let array_ref = Arc::new(array);
-        let hex_value_arc = to_hex::<Int32Type>(&[array_ref])?;
-        let hex_value = as_string_array(&hex_value_arc)?;
-        let expected = StringArray::from(vec![Some("ffffffffffffffff")]);
-        assert_eq!(&expected, hex_value);
+    test_to_hex_type!(
+        to_hex_int8,
+        Int8Type,
+        Int8Array,
+        vec![Some(100), Some(0), None, Some(-1)],
+        vec![Some("64"), Some("0"), None, Some("ffffffffffffffff")]
+    );
+    test_to_hex_type!(
+        to_hex_int16,
+        Int16Type,
+        Int16Array,
+        vec![Some(100), Some(0), None, Some(-1)],
+        vec![Some("64"), Some("0"), None, Some("ffffffffffffffff")]
+    );
+    test_to_hex_type!(
+        to_hex_int32,
+        Int32Type,
+        Int32Array,
+        vec![Some(100), Some(0), None, Some(-1)],
+        vec![Some("64"), Some("0"), None, Some("ffffffffffffffff")]
+    );
+    test_to_hex_type!(
+        to_hex_int64,
+        Int64Type,
+        Int64Array,
+        vec![Some(100), Some(0), None, Some(-1)],
+        vec![Some("64"), Some("0"), None, Some("ffffffffffffffff")]
+    );
+
+    test_to_hex_type!(to_hex_uint8, UInt8Type, UInt8Array);
+    test_to_hex_type!(to_hex_uint16, UInt16Type, UInt16Array);
+    test_to_hex_type!(to_hex_uint32, UInt32Type, UInt32Array);
+    test_to_hex_type!(to_hex_uint64, UInt64Type, UInt64Array);
 
-        Ok(())
-    }
+    test_to_hex_type!(
+        to_hex_large_signed,
+        Int64Type,
+        Int64Array,
+        vec![Some(i64::MAX), Some(i64::MIN)],
+        vec![Some("7fffffffffffffff"), Some("8000000000000000")]
+    );
+
+    test_to_hex_type!(
+        to_hex_large_unsigned,
+        UInt64Type,
+        UInt64Array,
+        vec![Some(u64::MAX), Some(u64::MIN)],
+        vec![Some("ffffffffffffffff"), Some("0")]
+    );
 }
diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs
index 882fb45eda4af..80375f58c87be 100644
--- a/datafusion/functions/src/string/upper.rs
+++ b/datafusion/functions/src/string/upper.rs
@@ -16,10 +16,9 @@
 // under the License.
 
 use crate::string::common::to_upper;
-use crate::utils::utf8_to_str_type;
 use arrow::datatypes::DataType;
-use datafusion_common::types::logical_string;
 use datafusion_common::Result;
+use datafusion_common::types::logical_string;
 use datafusion_expr::{
     Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
     TypeSignatureClass, Volatility,
@@ -43,7 +42,7 @@ use std::any::Any;
     related_udf(name = "initcap"),
     related_udf(name = "lower")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct UpperFunc {
     signature: Signature,
 }
@@ -81,7 +80,7 @@ impl ScalarUDFImpl for UpperFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "upper")
+        Ok(arg_types[0].clone())
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
@@ -96,9 +95,9 @@ impl ScalarUDFImpl for UpperFunc {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::{Array, ArrayRef, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, ArrayRef, StringArray, StringViewArray};
     use arrow::datatypes::Field;
+    use datafusion_common::config::ConfigOptions;
     use std::sync::Arc;
 
     fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
@@ -109,7 +108,8 @@ mod tests {
             number_rows: input.len(),
             args: vec![ColumnarValue::Array(input)],
             arg_fields: vec![arg_field],
-            return_field: Field::new("f", Utf8, true).into(),
+            return_field: Field::new("f", expected.data_type().clone(), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
         };
 
         let result = match func.invoke_with_args(args)? {
@@ -194,4 +194,21 @@ mod tests {
 
         to_upper(input, expected)
     }
+
+    #[test]
+    fn upper_utf8view() -> Result<()> {
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("arrow"),
+            None,
+            Some("tschüß"),
+        ])) as ArrayRef;
+
+        let expected = Arc::new(StringViewArray::from(vec![
+            Some("ARROW"),
+            None,
+            Some("TSCHÜSS"),
+        ])) as ArrayRef;
+
+        to_upper(input, expected)
+    }
 }
diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs
index 29415a9b20805..3a99412f5ed29 100644
--- a/datafusion/functions/src/string/uuid.rs
+++ b/datafusion/functions/src/string/uuid.rs
@@ -24,14 +24,14 @@ use arrow::datatypes::DataType::Utf8;
 use rand::Rng;
 use uuid::Uuid;
 
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, assert_or_internal_err};
 use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_macros::user_doc;
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Returns [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_(random)) string value which is unique per row.",
+    description = "Returns [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_%28random%29) string value which is unique per row.",
     syntax_example = "uuid()",
     sql_example = r#"```sql
 > select uuid();
@@ -42,7 +42,7 @@ use datafusion_macros::user_doc;
 +--------------------------------------+
 ```"#
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct UuidFunc {
     signature: Signature,
 }
@@ -56,7 +56,7 @@ impl Default for UuidFunc {
 impl UuidFunc {
     pub fn new() -> Self {
         Self {
-            signature: Signature::exact(vec![], Volatility::Volatile),
+            signature: Signature::nullary(Volatility::Volatile),
         }
     }
 }
@@ -81,9 +81,11 @@ impl ScalarUDFImpl for UuidFunc {
     /// Prints random (v4) uuid values per row
     /// uuid() = 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        if !args.args.is_empty() {
-            return internal_err!("{} function does not accept arguments", self.name());
-        }
+        assert_or_internal_err!(
+            args.args.is_empty(),
+            "{} function does not accept arguments",
+            self.name()
+        );
 
         // Generate random u128 values
         let mut rng = rand::rng();
diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
index 6299b353d57a9..8e25a45cf62dd 100644
--- a/datafusion/functions/src/strings.rs
+++ b/datafusion/functions/src/strings.rs
@@ -18,47 +18,12 @@
 use std::mem::size_of;
 
 use arrow::array::{
-    make_view, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ByteView,
-    GenericStringArray, LargeStringArray, NullBufferBuilder, OffsetSizeTrait,
-    StringArray, StringViewArray, StringViewBuilder,
+    Array, ArrayAccessor, ArrayDataBuilder, ByteView, LargeStringArray,
+    NullBufferBuilder, StringArray, StringViewArray, StringViewBuilder, make_view,
 };
 use arrow::buffer::{MutableBuffer, NullBuffer};
 use arrow::datatypes::DataType;
 
-/// Abstracts iteration over different types of string arrays.
-#[deprecated(since = "45.0.0", note = "Use arrow::array::StringArrayType instead")]
-pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
-    /// Return an [`ArrayIter`]  over the values of the array.
-    ///
-    /// This iterator iterates returns `Option<&str>` for each item in the array.
-    fn iter(&self) -> ArrayIter<Self>;
-
-    /// Check if the array is ASCII only.
-    fn is_ascii(&self) -> bool;
-}
-
-#[allow(deprecated)]
-impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
-    fn iter(&self) -> ArrayIter<Self> {
-        GenericStringArray::<T>::iter(self)
-    }
-
-    fn is_ascii(&self) -> bool {
-        GenericStringArray::<T>::is_ascii(self)
-    }
-}
-
-#[allow(deprecated)]
-impl<'a> StringArrayType<'a> for &'a StringViewArray {
-    fn iter(&self) -> ArrayIter<Self> {
-        StringViewArray::iter(self)
-    }
-
-    fn is_ascii(&self) -> bool {
-        StringViewArray::is_ascii(self)
-    }
-}
-
 /// Optimized version of the StringBuilder in Arrow that:
 /// 1. Precalculating the expected length of the result, avoiding reallocations.
 /// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
@@ -187,47 +152,46 @@ impl StringViewArrayBuilder {
             }
             ColumnarValueRef::NullableArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NullableLargeStringArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NullableStringViewArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NonNullableArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
             ColumnarValueRef::NonNullableLargeStringArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
             ColumnarValueRef::NonNullableStringViewArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
         }
     }
 
     pub fn append_offset(&mut self) {
         self.builder.append_value(&self.block);
-        self.block = String::new();
+        self.block.clear();
     }
 
-    pub fn finish(mut self) -> StringViewArray {
-        self.builder.finish()
+    pub fn finish(mut self, null_buffer: Option<NullBuffer>) -> StringViewArray {
+        let array = self.builder.finish();
+        match null_buffer {
+            Some(nulls) => {
+                let array_data = array.into_data().into_builder().nulls(Some(nulls));
+                // SAFETY: the underlying data is valid; we are only adding a null buffer
+                StringViewArray::from(unsafe { array_data.build_unchecked() })
+            }
+            None => array,
+        }
     }
 }
 
diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs
index 4ee5995f0a6b8..052baf32df857 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -23,7 +23,8 @@ use arrow::array::{
 use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
 use datafusion_common::Result;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
@@ -45,7 +46,7 @@ use std::sync::Arc;
     related_udf(name = "bit_length"),
     related_udf(name = "octet_length")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct CharacterLengthFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -88,10 +89,7 @@ impl ScalarUDFImpl for CharacterLengthFunc {
         utf8_to_int_type(&arg_types[0], "character_length")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(character_length, vec![])(&args.args)
     }
 
@@ -111,21 +109,21 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
-            character_length_general::<Int32Type, _>(string_array)
+            character_length_general::<Int32Type, _>(&string_array)
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            character_length_general::<Int64Type, _>(string_array)
+            character_length_general::<Int64Type, _>(&string_array)
         }
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
-            character_length_general::<Int32Type, _>(string_array)
+            character_length_general::<Int32Type, _>(&string_array)
         }
         _ => unreachable!("CharacterLengthFunc"),
     }
 }
 
-fn character_length_general<'a, T, V>(array: V) -> Result<ArrayRef>
+fn character_length_general<'a, T, V>(array: &V) -> Result<ArrayRef>
 where
     T: ArrowPrimitiveType,
     T::Native: OffsetSizeTrait,
@@ -136,56 +134,37 @@ where
     // string is ASCII only is relatively cheap.
     // If strings are ASCII only, count bytes instead.
     let is_array_ascii_only = array.is_ascii();
-    let array = if array.null_count() == 0 {
+    let nulls = array.nulls().cloned();
+    let array = {
         if is_array_ascii_only {
             let values: Vec<_> = (0..array.len())
                 .map(|i| {
-                    let value = array.value(i);
+                    // Safety: we are iterating with array.len() so the index is always valid
+                    let value = unsafe { array.value_unchecked(i) };
                     T::Native::usize_as(value.len())
                 })
                 .collect();
-            PrimitiveArray::<T>::new(values.into(), None)
+            PrimitiveArray::<T>::new(values.into(), nulls)
         } else {
             let values: Vec<_> = (0..array.len())
                 .map(|i| {
-                    let value = array.value(i);
-                    if value.is_ascii() {
-                        T::Native::usize_as(value.len())
+                    // Safety: we are iterating with array.len() so the index is always valid
+                    if array.is_null(i) {
+                        T::default_value()
                     } else {
-                        T::Native::usize_as(value.chars().count())
+                        let value = unsafe { array.value_unchecked(i) };
+                        if value.is_empty() {
+                            T::default_value()
+                        } else if value.is_ascii() {
+                            T::Native::usize_as(value.len())
+                        } else {
+                            T::Native::usize_as(value.chars().count())
+                        }
                     }
                 })
                 .collect();
-            PrimitiveArray::<T>::new(values.into(), None)
+            PrimitiveArray::<T>::new(values.into(), nulls)
         }
-    } else if is_array_ascii_only {
-        let values: Vec<_> = (0..array.len())
-            .map(|i| {
-                if array.is_null(i) {
-                    T::default_value()
-                } else {
-                    let value = array.value(i);
-                    T::Native::usize_as(value.len())
-                }
-            })
-            .collect();
-        PrimitiveArray::<T>::new(values.into(), array.nulls().cloned())
-    } else {
-        let values: Vec<_> = (0..array.len())
-            .map(|i| {
-                if array.is_null(i) {
-                    T::default_value()
-                } else {
-                    let value = array.value(i);
-                    if value.is_ascii() {
-                        T::Native::usize_as(value.len())
-                    } else {
-                        T::Native::usize_as(value.chars().count())
-                    }
-                }
-            })
-            .collect();
-        PrimitiveArray::<T>::new(values.into(), array.nulls().cloned())
     };
 
     Ok(Arc::new(array))
@@ -246,7 +225,9 @@ mod tests {
         #[cfg(not(feature = "unicode_expressions"))]
         test_function!(
             CharacterLengthFunc::new(),
-            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))],
+            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+                String::from("josé")
+            )))],
             internal_err!(
                 "function character_length requires compilation with feature flag: unicode_expressions."
             ),
diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
new file mode 100644
index 0000000000000..93f0c7900961e
--- /dev/null
+++ b/datafusion/functions/src/unicode/common.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Common utilities for implementing unicode functions
+
+use arrow::array::{
+    Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array,
+    OffsetSizeTrait, StringViewArray, make_view,
+};
+use arrow::datatypes::DataType;
+use arrow_buffer::{NullBuffer, ScalarBuffer};
+use datafusion_common::cast::{
+    as_generic_string_array, as_int64_array, as_string_view_array,
+};
+use datafusion_common::exec_err;
+use std::cmp::Ordering;
+use std::ops::Range;
+use std::sync::Arc;
+
+/// A trait for `left` and `right` byte slicing operations
+pub(crate) trait LeftRightSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize>;
+}
+
+pub(crate) struct LeftSlicer {}
+
+impl LeftRightSlicer for LeftSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize> {
+        0..left_right_byte_length(string, n)
+    }
+}
+
+pub(crate) struct RightSlicer {}
+
+impl LeftRightSlicer for RightSlicer {
+    fn slice(string: &str, n: i64) -> Range<usize> {
+        if n == 0 {
+            // Return nothing for `n=0`
+            0..0
+        } else if n == i64::MIN {
+            // Special case for i64::MIN overflow
+            0..0
+        } else {
+            left_right_byte_length(string, -n)..string.len()
+        }
+    }
+}
+
+/// Calculate the byte length of the substring of `n` chars from string `string`
+#[inline]
+fn left_right_byte_length(string: &str, n: i64) -> usize {
+    match n.cmp(&0) {
+        Ordering::Less => string
+            .char_indices()
+            .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
+            .map(|(index, _)| index)
+            .unwrap_or(0),
+        Ordering::Equal => 0,
+        Ordering::Greater => string
+            .char_indices()
+            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
+            .map(|(index, _)| index)
+            .unwrap_or(string.len()),
+    }
+}
+
+/// General implementation for `left` and `right` functions
+pub(crate) fn general_left_right<F: LeftRightSlicer>(
+    args: &[ArrayRef],
+) -> datafusion_common::Result<ArrayRef> {
+    let n_array = as_int64_array(&args[1])?;
+
+    match args[0].data_type() {
+        DataType::Utf8 => {
+            let string_array = as_generic_string_array::<i32>(&args[0])?;
+            general_left_right_array::<i32, _, F>(string_array, n_array)
+        }
+        DataType::LargeUtf8 => {
+            let string_array = as_generic_string_array::<i64>(&args[0])?;
+            general_left_right_array::<i64, _, F>(string_array, n_array)
+        }
+        DataType::Utf8View => {
+            let string_view_array = as_string_view_array(&args[0])?;
+            general_left_right_view::<F>(string_view_array, n_array)
+        }
+        _ => exec_err!("Not supported"),
+    }
+}
+
+/// `general_left_right` implementation for strings
+fn general_left_right_array<
+    'a,
+    T: OffsetSizeTrait,
+    V: ArrayAccessor<Item = &'a str>,
+    F: LeftRightSlicer,
+>(
+    string_array: V,
+    n_array: &Int64Array,
+) -> datafusion_common::Result<ArrayRef> {
+    let iter = ArrayIter::new(string_array);
+    let result = iter
+        .zip(n_array.iter())
+        .map(|(string, n)| match (string, n) {
+            (Some(string), Some(n)) => {
+                let range = F::slice(string, n);
+                // Extract a given range from a byte-indexed slice
+                Some(&string[range])
+            }
+            _ => None,
+        })
+        .collect::<GenericStringArray<T>>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// `general_left_right` implementation for StringViewArray
+fn general_left_right_view<F: LeftRightSlicer>(
+    string_view_array: &StringViewArray,
+    n_array: &Int64Array,
+) -> datafusion_common::Result<ArrayRef> {
+    let len = n_array.len();
+
+    let views = string_view_array.views();
+    // Every string in StringViewArray has one corresponding view in `views`
+    debug_assert!(views.len() == string_view_array.len());
+
+    // Compose null buffer at once
+    let string_nulls = string_view_array.nulls();
+    let n_nulls = n_array.nulls();
+    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
+
+    let new_views = (0..len)
+        .map(|idx| {
+            let view = views[idx];
+
+            let is_valid = match &new_nulls {
+                Some(nulls_buf) => nulls_buf.is_valid(idx),
+                None => true,
+            };
+
+            if is_valid {
+                let string: &str = string_view_array.value(idx);
+                let n = n_array.value(idx);
+
+                // Input string comes from StringViewArray, so it should fit in 32-bit length
+                let range = F::slice(string, n);
+                let result_bytes = &string.as_bytes()[range.clone()];
+
+                let byte_view = ByteView::from(view);
+                // New offset starts at 0 for left, and at `range.start` for right,
+                // which is encoded in the given range
+                let new_offset = byte_view.offset + (range.start as u32);
+                // Reuse buffer
+                make_view(result_bytes, byte_view.buffer_index, new_offset)
+            } else {
+                // For nulls, keep the original view
+                view
+            }
+        })
+        .collect::<Vec<u128>>();
+
+    // Buffers are unchanged
+    let result = StringViewArray::try_new(
+        ScalarBuffer::from(new_views),
+        Vec::from(string_view_array.data_buffers()),
+        new_nulls,
+    )?;
+    Ok(Arc::new(result) as ArrayRef)
+}
diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs
index 8b00c7be1ccf8..0cf20584a6bcc 100644
--- a/datafusion/functions/src/unicode/find_in_set.rs
+++ b/datafusion/functions/src/unicode/find_in_set.rs
@@ -19,14 +19,14 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    new_null_array, ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray,
-    OffsetSizeTrait, PrimitiveArray,
+    ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait,
+    PrimitiveArray,
 };
 use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
 
 use crate::utils::utf8_to_int_type;
 use datafusion_common::{
-    exec_err, internal_err, utils::take_function_args, Result, ScalarValue,
+    Result, ScalarValue, exec_err, internal_err, utils::take_function_args,
 };
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
@@ -53,7 +53,7 @@ use datafusion_macros::user_doc;
         description = "A string list is a string composed of substrings separated by , characters."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct FindInSetFunc {
     signature: Signature,
 }
@@ -98,9 +98,8 @@ impl ScalarUDFImpl for FindInSetFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let ScalarFunctionArgs { args, .. } = args;
-
-        let [string, str_list] = take_function_args(self.name(), args)?;
+        let return_field = args.return_field;
+        let [string, str_list] = take_function_args(self.name(), args.args)?;
 
         match (string, str_list) {
             // both inputs are scalars
@@ -139,9 +138,11 @@ impl ScalarUDFImpl for FindInSetFunc {
                     | ScalarValue::LargeUtf8(str_list_literal),
                 ),
             ) => {
-                let result_array = match str_list_literal {
+                match str_list_literal {
                     // find_in_set(column_a, null) = null
-                    None => new_null_array(str_array.data_type(), str_array.len()),
+                    None => Ok(ColumnarValue::Scalar(ScalarValue::try_new_null(
+                        return_field.data_type(),
+                    )?)),
                     Some(str_list_literal) => {
                         let str_list = str_list_literal.split(',').collect::<Vec<&str>>();
                         let result = match str_array.data_type() {
@@ -149,31 +150,32 @@ impl ScalarUDFImpl for FindInSetFunc {
                                 let string_array = str_array.as_string::<i32>();
                                 find_in_set_right_literal::<Int32Type, _>(
                                     string_array,
-                                    str_list,
+                                    &str_list,
                                 )
                             }
                             DataType::LargeUtf8 => {
                                 let string_array = str_array.as_string::<i64>();
                                 find_in_set_right_literal::<Int64Type, _>(
                                     string_array,
-                                    str_list,
+                                    &str_list,
                                 )
                             }
                             DataType::Utf8View => {
                                 let string_array = str_array.as_string_view();
                                 find_in_set_right_literal::<Int32Type, _>(
                                     string_array,
-                                    str_list,
+                                    &str_list,
                                 )
                             }
                             other => {
-                                exec_err!("Unsupported data type {other:?} for function find_in_set")
+                                exec_err!(
+                                    "Unsupported data type {other:?} for function find_in_set"
+                                )
                             }
                         };
-                        Arc::new(result?)
+                        Ok(ColumnarValue::Array(Arc::new(result?)))
                     }
-                };
-                Ok(ColumnarValue::Array(result_array))
+                }
             }
 
             // `string` is scalar, `str_list` is an array
@@ -185,38 +187,45 @@ impl ScalarUDFImpl for FindInSetFunc {
                 ),
                 ColumnarValue::Array(str_list_array),
             ) => {
-                let res = match string_literal {
+                match string_literal {
                     // find_in_set(null, column_b) = null
-                    None => {
-                        new_null_array(str_list_array.data_type(), str_list_array.len())
-                    }
+                    None => Ok(ColumnarValue::Scalar(ScalarValue::try_new_null(
+                        return_field.data_type(),
+                    )?)),
                     Some(string) => {
                         let result = match str_list_array.data_type() {
                             DataType::Utf8 => {
                                 let str_list = str_list_array.as_string::<i32>();
-                                find_in_set_left_literal::<Int32Type, _>(string, str_list)
+                                find_in_set_left_literal::<Int32Type, _>(
+                                    &string, str_list,
+                                )
                             }
                             DataType::LargeUtf8 => {
                                 let str_list = str_list_array.as_string::<i64>();
-                                find_in_set_left_literal::<Int64Type, _>(string, str_list)
+                                find_in_set_left_literal::<Int64Type, _>(
+                                    &string, str_list,
+                                )
                             }
                             DataType::Utf8View => {
                                 let str_list = str_list_array.as_string_view();
-                                find_in_set_left_literal::<Int32Type, _>(string, str_list)
+                                find_in_set_left_literal::<Int32Type, _>(
+                                    &string, str_list,
+                                )
                             }
                             other => {
-                                exec_err!("Unsupported data type {other:?} for function find_in_set")
+                                exec_err!(
+                                    "Unsupported data type {other:?} for function find_in_set"
+                                )
                             }
                         };
-                        Arc::new(result?)
+                        Ok(ColumnarValue::Array(Arc::new(result?)))
                     }
-                };
-                Ok(ColumnarValue::Array(res))
+                }
             }
 
             // both inputs are arrays
             (ColumnarValue::Array(base_array), ColumnarValue::Array(exp_array)) => {
-                let res = find_in_set(base_array, exp_array)?;
+                let res = find_in_set(&base_array, &exp_array)?;
 
                 Ok(ColumnarValue::Array(res))
             }
@@ -234,7 +243,7 @@ impl ScalarUDFImpl for FindInSetFunc {
 /// Returns a value in the range of 1 to N if the string `str` is in the string list `strlist`
 /// consisting of N substrings. A string list is a string composed of substrings separated by `,`
 /// characters.
-fn find_in_set(str: ArrayRef, str_list: ArrayRef) -> Result<ArrayRef> {
+fn find_in_set(str: &ArrayRef, str_list: &ArrayRef) -> Result<ArrayRef> {
     match str.data_type() {
         DataType::Utf8 => {
             let string_array = str.as_string::<i32>();
@@ -257,10 +266,7 @@ fn find_in_set(str: ArrayRef, str_list: ArrayRef) -> Result<ArrayRef> {
     }
 }
 
-pub fn find_in_set_general<'a, T, V>(
-    string_array: V,
-    str_list_array: V,
-) -> Result<ArrayRef>
+fn find_in_set_general<'a, T, V>(string_array: V, str_list_array: V) -> Result<ArrayRef>
 where
     T: ArrowPrimitiveType,
     T::Native: OffsetSizeTrait,
@@ -289,10 +295,7 @@ where
     Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
-fn find_in_set_left_literal<'a, T, V>(
-    string: String,
-    str_list_array: V,
-) -> Result<ArrayRef>
+fn find_in_set_left_literal<'a, T, V>(string: &str, str_list_array: V) -> Result<ArrayRef>
 where
     T: ArrowPrimitiveType,
     T::Native: OffsetSizeTrait,
@@ -318,7 +321,7 @@ where
 
 fn find_in_set_right_literal<'a, T, V>(
     string_array: V,
-    str_list: Vec<&str>,
+    str_list: &[&str],
 ) -> Result<ArrayRef>
 where
     T: ArrowPrimitiveType,
@@ -349,6 +352,7 @@ mod tests {
     use crate::utils::test::test_function;
     use arrow::array::{Array, Int32Array, StringArray};
     use arrow::datatypes::{DataType::Int32, Field};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
     use std::sync::Arc;
@@ -483,6 +487,7 @@ mod tests {
                     arg_fields,
                     number_rows: cardinality,
                     return_field: Field::new("f", return_type, true).into(),
+                    config_options: Arc::new(ConfigOptions::default()),
                 });
                 assert!(result.is_ok());
 
diff --git a/datafusion/functions/src/unicode/initcap.rs b/datafusion/functions/src/unicode/initcap.rs
index c9b0cb77b0969..d1f618436355e 100644
--- a/datafusion/functions/src/unicode/initcap.rs
+++ b/datafusion/functions/src/unicode/initcap.rs
@@ -19,17 +19,19 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
+    Array, ArrayRef, GenericStringArray, GenericStringBuilder, OffsetSizeTrait,
+    StringViewBuilder,
 };
+use arrow::buffer::{Buffer, OffsetBuffer};
 use arrow::datatypes::DataType;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, ScalarValue, exec_err};
 use datafusion_expr::{
-    Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignatureClass,
-    Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -50,7 +52,7 @@ use datafusion_macros::user_doc;
     related_udf(name = "lower"),
     related_udf(name = "upper")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct InitcapFunc {
     signature: Signature,
 }
@@ -95,10 +97,40 @@ impl ScalarUDFImpl for InitcapFunc {
         }
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let arg = &args.args[0];
+
+        // Scalar fast path - handle directly without array conversion
+        if let ColumnarValue::Scalar(scalar) = arg {
+            return match scalar {
+                ScalarValue::Utf8(None)
+                | ScalarValue::LargeUtf8(None)
+                | ScalarValue::Utf8View(None) => Ok(arg.clone()),
+                ScalarValue::Utf8(Some(s)) => {
+                    let mut result = String::new();
+                    initcap_string(s, &mut result);
+                    Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(result))))
+                }
+                ScalarValue::LargeUtf8(Some(s)) => {
+                    let mut result = String::new();
+                    initcap_string(s, &mut result);
+                    Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(result))))
+                }
+                ScalarValue::Utf8View(Some(s)) => {
+                    let mut result = String::new();
+                    initcap_string(s, &mut result);
+                    Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(result))))
+                }
+                other => {
+                    exec_err!(
+                        "Unsupported data type {:?} for function `initcap`",
+                        other.data_type()
+                    )
+                }
+            };
+        }
+
+        // Array path
         let args = &args.args;
         match args[0].data_type() {
             DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
@@ -115,8 +147,8 @@ impl ScalarUDFImpl for InitcapFunc {
     }
 }
 
-/// Converts the first letter of each word to upper case and the rest to lower
-/// case. Words are sequences of alphanumeric characters separated by
+/// Converts the first letter of each word to uppercase and the rest to
+/// lowercase. Words are sequences of alphanumeric characters separated by
 /// non-alphanumeric characters.
 ///
 /// Example:
@@ -126,15 +158,20 @@ impl ScalarUDFImpl for InitcapFunc {
 fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_array = as_generic_string_array::<T>(&args[0])?;
 
+    if string_array.is_ascii() {
+        return Ok(initcap_ascii_array(string_array));
+    }
+
     let mut builder = GenericStringBuilder::<T>::with_capacity(
         string_array.len(),
         string_array.value_data().len(),
     );
 
+    let mut container = String::new();
     string_array.iter().for_each(|str| match str {
         Some(s) => {
-            let initcap_str = initcap_string(s);
-            builder.append_value(initcap_str);
+            initcap_string(s, &mut container);
+            builder.append_value(&container);
         }
         None => builder.append_null(),
     });
@@ -142,15 +179,71 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
+/// Fast path for `Utf8` or `LargeUtf8` arrays that are ASCII-only. We can use a
+/// single pass over the buffer and operate directly on bytes.
+fn initcap_ascii_array<T: OffsetSizeTrait>(
+    string_array: &GenericStringArray<T>,
+) -> ArrayRef {
+    let offsets = string_array.offsets();
+    let src = string_array.value_data();
+    let first_offset = offsets.first().unwrap().as_usize();
+    let last_offset = offsets.last().unwrap().as_usize();
+
+    // For sliced arrays, only convert the visible bytes, not the entire input
+    // buffer.
+    let mut out = Vec::with_capacity(last_offset - first_offset);
+
+    for window in offsets.windows(2) {
+        let start = window[0].as_usize();
+        let end = window[1].as_usize();
+
+        let mut prev_is_alnum = false;
+        for &b in &src[start..end] {
+            let converted = if prev_is_alnum {
+                b.to_ascii_lowercase()
+            } else {
+                b.to_ascii_uppercase()
+            };
+            out.push(converted);
+            prev_is_alnum = b.is_ascii_alphanumeric();
+        }
+    }
+
+    let values = Buffer::from_vec(out);
+    let out_offsets = if first_offset == 0 {
+        offsets.clone()
+    } else {
+        // For sliced arrays, we need to rebase the offsets to reflect that the
+        // output only contains the bytes in the visible slice.
+        let rebased_offsets = offsets
+            .iter()
+            .map(|offset| T::usize_as(offset.as_usize() - first_offset))
+            .collect::<Vec<_>>();
+        OffsetBuffer::<T>::new(rebased_offsets.into())
+    };
+
+    // SAFETY: ASCII case conversion preserves byte length, so the original
+    // string boundaries are preserved. `out_offsets` is either identical to
+    // the input offsets or a rebased version relative to the compacted values
+    // buffer.
+    Arc::new(unsafe {
+        GenericStringArray::<T>::new_unchecked(
+            out_offsets,
+            values,
+            string_array.nulls().cloned(),
+        )
+    })
+}
+
 fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
     let string_view_array = as_string_view_array(&args[0])?;
-
     let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
+    let mut container = String::new();
 
     string_view_array.iter().for_each(|str| match str {
         Some(s) => {
-            let initcap_str = initcap_string(s);
-            builder.append_value(initcap_str);
+            initcap_string(s, &mut container);
+            builder.append_value(&container);
         }
         None => builder.append_null(),
     });
@@ -158,41 +251,43 @@ fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
     Ok(Arc::new(builder.finish()) as ArrayRef)
 }
 
-fn initcap_string(input: &str) -> String {
-    let mut result = String::with_capacity(input.len());
+fn initcap_string(input: &str, container: &mut String) {
+    container.clear();
     let mut prev_is_alphanumeric = false;
 
     if input.is_ascii() {
-        for c in input.chars() {
+        container.reserve(input.len());
+        // SAFETY: each byte is ASCII, so the result is valid UTF-8.
+        let out = unsafe { container.as_mut_vec() };
+        for &b in input.as_bytes() {
             if prev_is_alphanumeric {
-                result.push(c.to_ascii_lowercase());
+                out.push(b.to_ascii_lowercase());
             } else {
-                result.push(c.to_ascii_uppercase());
-            };
-            prev_is_alphanumeric = c.is_ascii_alphanumeric();
+                out.push(b.to_ascii_uppercase());
+            }
+            prev_is_alphanumeric = b.is_ascii_alphanumeric();
         }
     } else {
         for c in input.chars() {
             if prev_is_alphanumeric {
-                result.extend(c.to_lowercase());
+                container.extend(c.to_lowercase());
             } else {
-                result.extend(c.to_uppercase());
+                container.extend(c.to_uppercase());
             }
             prev_is_alphanumeric = c.is_alphanumeric();
         }
     }
-
-    result
 }
 
 #[cfg(test)]
 mod tests {
     use crate::unicode::initcap::InitcapFunc;
     use crate::utils::test::test_function;
-    use arrow::array::{Array, StringArray, StringViewArray};
+    use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
     use arrow::datatypes::DataType::{Utf8, Utf8View};
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+    use std::sync::Arc;
 
     #[test]
     fn test_functions() -> Result<()> {
@@ -296,4 +391,114 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_initcap_ascii_array() -> Result<()> {
+        let array = StringArray::from(vec![
+            Some("hello world"),
+            None,
+            Some("foo-bar_baz/baX"),
+            Some(""),
+            Some("123 abc 456DEF"),
+            Some("ALL CAPS"),
+            Some("already correct"),
+        ]);
+        let args: Vec<ArrayRef> = vec![Arc::new(array)];
+        let result = super::initcap::<i32>(&args)?;
+        let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result.len(), 7);
+        assert_eq!(result.value(0), "Hello World");
+        assert!(result.is_null(1));
+        assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+        assert_eq!(result.value(3), "");
+        assert_eq!(result.value(4), "123 Abc 456def");
+        assert_eq!(result.value(5), "All Caps");
+        assert_eq!(result.value(6), "Already Correct");
+        Ok(())
+    }
+
+    #[test]
+    fn test_initcap_ascii_large_array() -> Result<()> {
+        let array = LargeStringArray::from(vec![
+            Some("hello world"),
+            None,
+            Some("foo-bar_baz/baX"),
+            Some(""),
+            Some("123 abc 456DEF"),
+            Some("ALL CAPS"),
+            Some("already correct"),
+        ]);
+        let args: Vec<ArrayRef> = vec![Arc::new(array)];
+        let result = super::initcap::<i64>(&args)?;
+        let result = result.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        assert_eq!(result.len(), 7);
+        assert_eq!(result.value(0), "Hello World");
+        assert!(result.is_null(1));
+        assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+        assert_eq!(result.value(3), "");
+        assert_eq!(result.value(4), "123 Abc 456def");
+        assert_eq!(result.value(5), "All Caps");
+        assert_eq!(result.value(6), "Already Correct");
+        Ok(())
+    }
+
+    /// Test that initcap works correctly on a sliced ASCII StringArray.
+    #[test]
+    fn test_initcap_sliced_ascii_array() -> Result<()> {
+        let array = StringArray::from(vec![
+            Some("hello world"),
+            Some("foo bar"),
+            Some("baz qux"),
+        ]);
+        // Slice to get only the last two elements. The resulting array's
+        // offsets are [11, 18, 25] (non-zero start), but value_data still
+        // contains the full original buffer.
+        let sliced = array.slice(1, 2);
+        let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+        let result = super::initcap::<i32>(&args)?;
+        let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.value(0), "Foo Bar");
+        assert_eq!(result.value(1), "Baz Qux");
+
+        // The output values buffer should be compact
+        assert_eq!(*result.offsets().first().unwrap(), 0);
+        assert_eq!(
+            result.value_data().len(),
+            *result.offsets().last().unwrap() as usize
+        );
+        Ok(())
+    }
+
+    /// Test that initcap works correctly on a sliced ASCII LargeStringArray.
+    #[test]
+    fn test_initcap_sliced_ascii_large_array() -> Result<()> {
+        let array = LargeStringArray::from(vec![
+            Some("hello world"),
+            Some("foo bar"),
+            Some("baz qux"),
+        ]);
+        // Slice to get only the last two elements. The resulting array's
+        // offsets are [11, 18, 25] (non-zero start), but value_data still
+        // contains the full original buffer.
+        let sliced = array.slice(1, 2);
+        let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+        let result = super::initcap::<i64>(&args)?;
+        let result = result.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.value(0), "Foo Bar");
+        assert_eq!(result.value(1), "Baz Qux");
+
+        // The output values buffer should be compact
+        assert_eq!(*result.offsets().first().unwrap(), 0);
+        assert_eq!(
+            result.value_data().len(),
+            *result.offsets().last().unwrap() as usize
+        );
+        Ok(())
+    }
 }
diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs
index f99f0de67ebb2..4ff38e52ce62a 100644
--- a/datafusion/functions/src/unicode/left.rs
+++ b/datafusion/functions/src/unicode/left.rs
@@ -16,24 +16,16 @@
 // under the License.
 
 use std::any::Any;
-use std::cmp::Ordering;
-use std::sync::Arc;
 
-use arrow::array::{
-    Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array,
-    OffsetSizeTrait,
-};
+use crate::unicode::common::{LeftSlicer, general_left_right};
+use crate::utils::make_scalar_function;
 use arrow::datatypes::DataType;
-
-use crate::utils::{make_scalar_function, utf8_to_str_type};
-use datafusion_common::cast::{
-    as_generic_string_array, as_int64_array, as_string_view_array,
-};
-use datafusion_common::exec_err;
 use datafusion_common::Result;
+use datafusion_common::exec_err;
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -53,7 +45,7 @@ use datafusion_macros::user_doc;
     argument(name = "n", description = "Number of characters to return."),
     related_udf(name = "right")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LeftFunc {
     signature: Signature,
 }
@@ -94,22 +86,23 @@ impl ScalarUDFImpl for LeftFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "left")
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
+    /// left('abcde', 2) = 'ab'
+    /// left('abcde', -2) = 'abc'
+    /// The implementation uses UTF-8 code points as characters
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match args[0].data_type() {
-            DataType::Utf8 | DataType::Utf8View => {
-                make_scalar_function(left::<i32>, vec![])(args)
+            DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => {
+                make_scalar_function(general_left_right::<LeftSlicer>, vec![])(args)
             }
-            DataType::LargeUtf8 => make_scalar_function(left::<i64>, vec![])(args),
             other => exec_err!(
-                "Unsupported data type {other:?} for function left,\
-                expected Utf8View, Utf8 or LargeUtf8."
+                "Unsupported data type {other:?} for function {},\
+                expected Utf8View, Utf8 or LargeUtf8.",
+                self.name()
             ),
         }
     }
@@ -119,54 +112,10 @@ impl ScalarUDFImpl for LeftFunc {
     }
 }
 
-/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
-/// left('abcde', 2) = 'ab'
-/// The implementation uses UTF-8 code points as characters
-pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let n_array = as_int64_array(&args[1])?;
-
-    if args[0].data_type() == &DataType::Utf8View {
-        let string_array = as_string_view_array(&args[0])?;
-        left_impl::<T, _>(string_array, n_array)
-    } else {
-        let string_array = as_generic_string_array::<T>(&args[0])?;
-        left_impl::<T, _>(string_array, n_array)
-    }
-}
-
-fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
-    string_array: V,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let iter = ArrayIter::new(string_array);
-    let result = iter
-        .zip(n_array.iter())
-        .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => match n.cmp(&0) {
-                Ordering::Less => {
-                    let len = string.chars().count() as i64;
-                    Some(if n.abs() < len {
-                        string.chars().take((len + n) as usize).collect::<String>()
-                    } else {
-                        "".to_string()
-                    })
-                }
-                Ordering::Equal => Some("".to_string()),
-                Ordering::Greater => {
-                    Some(string.chars().take(n as usize).collect::<String>())
-                }
-            },
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, StringArray, StringViewArray};
+    use arrow::datatypes::DataType::{Utf8, Utf8View};
 
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -209,6 +158,17 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            LeftFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
         test_function!(
             LeftFunc::new(),
             vec![
@@ -290,6 +250,74 @@ mod tests {
             StringArray
         );
 
+        // StringView cases
+        test_function!(
+            LeftFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("ab")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            LeftFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("abcde")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            LeftFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            LeftFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                    "joséésoj".to_string()
+                ))),
+                ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+            ],
+            Ok(Some("joséé")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+
+        // Unicode indexing case
+        let input = "joé楽s𐀀so↓j";
+        for n in 1..=input.chars().count() {
+            let expected = input
+                .chars()
+                .take(input.chars().count() - n)
+                .collect::<String>();
+            test_function!(
+                LeftFunc::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::from(input)),
+                    ColumnarValue::Scalar(ScalarValue::from(-(n as i64))),
+                ],
+                Ok(Some(expected.as_str())),
+                &str,
+                Utf8,
+                StringArray
+            );
+        }
+
         Ok(())
     }
 }
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index ea57dbd2bed51..3dc70c56a40ed 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -19,20 +19,21 @@ use std::any::Any;
 use std::fmt::Write;
 use std::sync::Arc;
 
+use DataType::{LargeUtf8, Utf8, Utf8View};
 use arrow::array::{
     Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
 use unicode_segmentation::UnicodeSegmentation;
-use DataType::{LargeUtf8, Utf8, Utf8View};
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -49,14 +50,17 @@ use datafusion_macros::user_doc;
 +---------------------------------------------+
 ```"#,
     standard_argument(name = "str", prefix = "String"),
-    argument(name = "n", description = "String length to pad to."),
+    argument(
+        name = "n",
+        description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)."
+    ),
     argument(
         name = "padding_str",
         description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
     ),
     related_udf(name = "rpad")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct LPadFunc {
     signature: Signature,
 }
@@ -109,10 +113,7 @@ impl ScalarUDFImpl for LPadFunc {
         utf8_to_str_type(&arg_types[0], "lpad")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match args[0].data_type() {
             Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(args),
@@ -129,7 +130,7 @@ impl ScalarUDFImpl for LPadFunc {
 /// Extends the string to length 'length' by prepending the characters fill (a space by default).
 /// If the string is already longer than length then it is truncated (on the right).
 /// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.len() <= 1 || args.len() > 3 {
         return exec_err!(
             "lpad was called with {} arguments. It requires at least 2 and at most 3.",
@@ -141,7 +142,7 @@ pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
 
     match (args.len(), args[0].data_type()) {
         (2, Utf8View) => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, T>(
-            args[0].as_string_view(),
+            &args[0].as_string_view(),
             length_array,
             None,
         ),
@@ -149,14 +150,14 @@ pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             &GenericStringArray<T>,
             &GenericStringArray<T>,
             T,
-        >(args[0].as_string::<T>(), length_array, None),
+        >(&args[0].as_string::<T>(), length_array, None),
         (3, Utf8View) => lpad_with_replace::<&StringViewArray, T>(
-            args[0].as_string_view(),
+            &args[0].as_string_view(),
             length_array,
             &args[2],
         ),
         (3, Utf8 | LargeUtf8) => lpad_with_replace::<&GenericStringArray<T>, T>(
-            args[0].as_string::<T>(),
+            &args[0].as_string::<T>(),
             length_array,
             &args[2],
         ),
@@ -165,7 +166,7 @@ pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
 }
 
 fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
-    string_array: V,
+    string_array: &V,
     length_array: &Int64Array,
     fill_array: &'a ArrayRef,
 ) -> Result<ArrayRef>
@@ -195,7 +196,7 @@ where
 }
 
 fn lpad_impl<'a, V, V2, T>(
-    string_array: V,
+    string_array: &V,
     length_array: &Int64Array,
     fill_array: Option<V2>,
 ) -> Result<ArrayRef>
@@ -204,11 +205,17 @@ where
     V2: StringArrayType<'a>,
     T: OffsetSizeTrait,
 {
-    let array = if fill_array.is_none() {
+    let array = if let Some(fill_array) = fill_array {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
+        let mut graphemes_buf = Vec::new();
+        let mut fill_chars_buf = Vec::new();
 
-        for (string, length) in string_array.iter().zip(length_array.iter()) {
-            if let (Some(string), Some(length)) = (string, length) {
+        for ((string, length), fill) in string_array
+            .iter()
+            .zip(length_array.iter())
+            .zip(fill_array.iter())
+        {
+            if let (Some(string), Some(length), Some(fill)) = (string, length, fill) {
                 if length > i32::MAX as i64 {
                     return exec_err!("lpad requested length {length} too large");
                 }
@@ -219,13 +226,47 @@ where
                     continue;
                 }
 
-                let graphemes = string.graphemes(true).collect::<Vec<&str>>();
-                if length < graphemes.len() {
-                    builder.append_value(graphemes[..length].concat());
+                if string.is_ascii() && fill.is_ascii() {
+                    // ASCII fast path: byte length == character length,
+                    // so we skip expensive grapheme segmentation.
+                    let str_len = string.len();
+                    if length < str_len {
+                        builder.append_value(&string[..length]);
+                    } else if fill.is_empty() {
+                        builder.append_value(string);
+                    } else {
+                        let pad_len = length - str_len;
+                        let fill_len = fill.len();
+                        let full_reps = pad_len / fill_len;
+                        let remainder = pad_len % fill_len;
+                        for _ in 0..full_reps {
+                            builder.write_str(fill)?;
+                        }
+                        if remainder > 0 {
+                            builder.write_str(&fill[..remainder])?;
+                        }
+                        builder.append_value(string);
+                    }
                 } else {
-                    builder.write_str(" ".repeat(length - graphemes.len()).as_str())?;
-                    builder.write_str(string)?;
-                    builder.append_value("");
+                    // Reuse buffers by clearing and refilling
+                    graphemes_buf.clear();
+                    graphemes_buf.extend(string.graphemes(true));
+
+                    fill_chars_buf.clear();
+                    fill_chars_buf.extend(fill.chars());
+
+                    if length < graphemes_buf.len() {
+                        builder.append_value(graphemes_buf[..length].concat());
+                    } else if fill_chars_buf.is_empty() {
+                        builder.append_value(string);
+                    } else {
+                        for l in 0..length - graphemes_buf.len() {
+                            let c =
+                                *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
+                            builder.write_char(c)?;
+                        }
+                        builder.append_value(string);
+                    }
                 }
             } else {
                 builder.append_null();
@@ -235,13 +276,10 @@ where
         builder.finish()
     } else {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
+        let mut graphemes_buf = Vec::new();
 
-        for ((string, length), fill) in string_array
-            .iter()
-            .zip(length_array.iter())
-            .zip(fill_array.unwrap().iter())
-        {
-            if let (Some(string), Some(length), Some(fill)) = (string, length, fill) {
+        for (string, length) in string_array.iter().zip(length_array.iter()) {
+            if let (Some(string), Some(length)) = (string, length) {
                 if length > i32::MAX as i64 {
                     return exec_err!("lpad requested length {length} too large");
                 }
@@ -252,20 +290,30 @@ where
                     continue;
                 }
 
-                let graphemes = string.graphemes(true).collect::<Vec<&str>>();
-                let fill_chars = fill.chars().collect::<Vec<char>>();
-
-                if length < graphemes.len() {
-                    builder.append_value(graphemes[..length].concat());
-                } else if fill_chars.is_empty() {
-                    builder.append_value(string);
+                if string.is_ascii() {
+                    // ASCII fast path: byte length == character length
+                    let str_len = string.len();
+                    if length < str_len {
+                        builder.append_value(&string[..length]);
+                    } else {
+                        for _ in 0..(length - str_len) {
+                            builder.write_str(" ")?;
+                        }
+                        builder.append_value(string);
+                    }
                 } else {
-                    for l in 0..length - graphemes.len() {
-                        let c = *fill_chars.get(l % fill_chars.len()).unwrap();
-                        builder.write_char(c)?;
+                    // Reuse buffer by clearing and refilling
+                    graphemes_buf.clear();
+                    graphemes_buf.extend(string.graphemes(true));
+
+                    if length < graphemes_buf.len() {
+                        builder.append_value(graphemes_buf[..length].concat());
+                    } else {
+                        for _ in 0..(length - graphemes_buf.len()) {
+                            builder.write_str(" ")?;
+                        }
+                        builder.append_value(string);
                     }
-                    builder.write_str(string)?;
-                    builder.append_value("");
                 }
             } else {
                 builder.append_null();
@@ -512,6 +560,17 @@ mod tests {
             None,
             Ok(None)
         );
+        test_lpad!(
+            Some("hello".into()),
+            ScalarValue::Int64(Some(2i64)),
+            Ok(Some("he"))
+        );
+        test_lpad!(
+            Some("hi".into()),
+            ScalarValue::Int64(Some(6i64)),
+            Some("xy".into()),
+            Ok(Some("xyxyhi"))
+        );
         test_lpad!(
             Some("josé".into()),
             ScalarValue::Int64(Some(10i64)),
@@ -526,9 +585,13 @@ mod tests {
         );
 
         #[cfg(not(feature = "unicode_expressions"))]
-        test_lpad!(Some("josé".into()), ScalarValue::Int64(Some(5i64)), internal_err!(
+        test_lpad!(
+            Some("josé".into()),
+            ScalarValue::Int64(Some(5i64)),
+            internal_err!(
                 "function lpad requires compilation with feature flag: unicode_expressions."
-        ));
+            )
+        );
 
         Ok(())
     }
diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs
index 3c5cde3789ea2..7250b3915fb5c 100644
--- a/datafusion/functions/src/unicode/mod.rs
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -22,10 +22,12 @@ use std::sync::Arc;
 use datafusion_expr::ScalarUDF;
 
 pub mod character_length;
+pub mod common;
 pub mod find_in_set;
 pub mod initcap;
 pub mod left;
 pub mod lpad;
+pub mod planner;
 pub mod reverse;
 pub mod right;
 pub mod rpad;
diff --git a/datafusion/functions/src/unicode/planner.rs b/datafusion/functions/src/unicode/planner.rs
new file mode 100644
index 0000000000000..38c82486416a6
--- /dev/null
+++ b/datafusion/functions/src/unicode/planner.rs
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! SQL planning extensions like [`UnicodeFunctionPlanner`]
+
+use datafusion_expr::Expr;
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::planner::{ExprPlanner, PlannerResult};
+
+#[derive(Default, Debug)]
+pub struct UnicodeFunctionPlanner;
+
+impl ExprPlanner for UnicodeFunctionPlanner {
+    fn plan_position(
+        &self,
+        args: Vec<Expr>,
+    ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+        Ok(PlannerResult::Planned(Expr::ScalarFunction(
+            ScalarFunction::new_udf(crate::unicode::strpos(), args),
+        )))
+    }
+
+    fn plan_substring(
+        &self,
+        args: Vec<Expr>,
+    ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+        Ok(PlannerResult::Planned(Expr::ScalarFunction(
+            ScalarFunction::new_udf(crate::unicode::substr(), args),
+        )))
+    }
+}
diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs
index 311e9e81a8be9..8f265d8659a9b 100644
--- a/datafusion/functions/src/unicode/reverse.rs
+++ b/datafusion/functions/src/unicode/reverse.rs
@@ -18,17 +18,19 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use crate::utils::{make_scalar_function, utf8_to_str_type};
+use crate::utils::make_scalar_function;
+use DataType::{LargeUtf8, Utf8, Utf8View};
 use arrow::array::{
-    Array, ArrayRef, AsArray, GenericStringBuilder, OffsetSizeTrait, StringArrayType,
+    Array, ArrayRef, AsArray, LargeStringBuilder, StringArrayType, StringBuilder,
+    StringLikeArrayBuilder, StringViewBuilder,
 };
 use arrow::datatypes::DataType;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
-use DataType::{LargeUtf8, Utf8, Utf8View};
 
 #[user_doc(
     doc_section(label = "String Functions"),
@@ -44,7 +46,7 @@ use DataType::{LargeUtf8, Utf8, Utf8View};
 ```"#,
     standard_argument(name = "str", prefix = "String")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ReverseFunc {
     signature: Signature,
 }
@@ -82,17 +84,13 @@ impl ScalarUDFImpl for ReverseFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "reverse")
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match args[0].data_type() {
-            Utf8 | Utf8View => make_scalar_function(reverse::<i32>, vec![])(args),
-            LargeUtf8 => make_scalar_function(reverse::<i64>, vec![])(args),
+            Utf8 | Utf8View | LargeUtf8 => make_scalar_function(reverse, vec![])(args),
             other => {
                 exec_err!("Unsupported data type {other:?} for function reverse")
             }
@@ -106,21 +104,39 @@ impl ScalarUDFImpl for ReverseFunc {
 
 /// Reverses the order of the characters in the string `reverse('abcde') = 'edcba'`.
 /// The implementation uses UTF-8 code points as characters
-pub fn reverse<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    if args[0].data_type() == &Utf8View {
-        reverse_impl::<T, _>(args[0].as_string_view())
-    } else {
-        reverse_impl::<T, _>(args[0].as_string::<T>())
+fn reverse(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let len = args[0].len();
+
+    match args[0].data_type() {
+        Utf8 => reverse_impl(
+            &args[0].as_string::<i32>(),
+            StringBuilder::with_capacity(len, 1024),
+        ),
+        Utf8View => reverse_impl(
+            &args[0].as_string_view(),
+            StringViewBuilder::with_capacity(len),
+        ),
+        LargeUtf8 => reverse_impl(
+            &args[0].as_string::<i64>(),
+            LargeStringBuilder::with_capacity(len, 1024),
+        ),
+        _ => unreachable!(
+            "Reverse can only be applied to Utf8View, Utf8 and LargeUtf8 types"
+        ),
     }
 }
 
-fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>(
-    string_array: V,
-) -> Result<ArrayRef> {
-    let mut builder = GenericStringBuilder::<T>::with_capacity(string_array.len(), 1024);
-
+fn reverse_impl<'a, StringArrType, StringBuilderType>(
+    string_array: &StringArrType,
+    mut array_builder: StringBuilderType,
+) -> Result<ArrayRef>
+where
+    StringArrType: StringArrayType<'a>,
+    StringBuilderType: StringLikeArrayBuilder,
+{
     let mut string_buf = String::new();
     let mut byte_buf = Vec::<u8>::new();
+
     for string in string_array.iter() {
         if let Some(s) = string {
             if s.is_ascii() {
@@ -129,25 +145,25 @@ fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>(
                 byte_buf.reverse();
                 // SAFETY: Since the original string was ASCII, reversing the bytes still results in valid UTF-8.
                 let reversed = unsafe { std::str::from_utf8_unchecked(&byte_buf) };
-                builder.append_value(reversed);
+                array_builder.append_value(reversed);
                 byte_buf.clear();
             } else {
                 string_buf.extend(s.chars().rev());
-                builder.append_value(&string_buf);
+                array_builder.append_value(&string_buf);
                 string_buf.clear();
             }
         } else {
-            builder.append_null();
+            array_builder.append_null();
         }
     }
 
-    Ok(Arc::new(builder.finish()) as ArrayRef)
+    Ok(Arc::new(array_builder.finish()) as ArrayRef)
 }
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, LargeStringArray, StringArray};
-    use arrow::datatypes::DataType::{LargeUtf8, Utf8};
+    use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray};
+    use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
 
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -180,8 +196,8 @@ mod tests {
                 vec![ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
                 $EXPECTED,
                 &str,
-                Utf8,
-                StringArray
+                Utf8View,
+                StringViewArray
             );
         };
     }
diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs
index 1ceaf69983311..bbb777066186a 100644
--- a/datafusion/functions/src/unicode/right.rs
+++ b/datafusion/functions/src/unicode/right.rs
@@ -16,24 +16,16 @@
 // under the License.
 
 use std::any::Any;
-use std::cmp::{max, Ordering};
-use std::sync::Arc;
 
-use arrow::array::{
-    Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array,
-    OffsetSizeTrait,
-};
+use crate::unicode::common::{RightSlicer, general_left_right};
+use crate::utils::make_scalar_function;
 use arrow::datatypes::DataType;
-
-use crate::utils::{make_scalar_function, utf8_to_str_type};
-use datafusion_common::cast::{
-    as_generic_string_array, as_int64_array, as_string_view_array,
-};
-use datafusion_common::exec_err;
 use datafusion_common::Result;
+use datafusion_common::exec_err;
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -53,7 +45,7 @@ use datafusion_macros::user_doc;
     argument(name = "n", description = "Number of characters to return."),
     related_udf(name = "left")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RightFunc {
     signature: Signature,
 }
@@ -94,22 +86,23 @@ impl ScalarUDFImpl for RightFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "right")
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    /// Returns right n characters in the string, or when n is negative, returns all but first |n| characters.
+    /// right('abcde', 2) = 'de'
+    /// right('abcde', -2) = 'cde'
+    /// The implementation uses UTF-8 code points as characters
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match args[0].data_type() {
-            DataType::Utf8 | DataType::Utf8View => {
-                make_scalar_function(right::<i32>, vec![])(args)
+            DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => {
+                make_scalar_function(general_left_right::<RightSlicer>, vec![])(args)
             }
-            DataType::LargeUtf8 => make_scalar_function(right::<i64>, vec![])(args),
             other => exec_err!(
-                "Unsupported data type {other:?} for function right,\
-            expected Utf8View, Utf8 or LargeUtf8."
+                "Unsupported data type {other:?} for function {},\
+                expected Utf8View, Utf8 or LargeUtf8.",
+                self.name()
             ),
         }
     }
@@ -119,58 +112,10 @@ impl ScalarUDFImpl for RightFunc {
     }
 }
 
-/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters.
-/// right('abcde', 2) = 'de'
-/// The implementation uses UTF-8 code points as characters
-pub fn right<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let n_array = as_int64_array(&args[1])?;
-    if args[0].data_type() == &DataType::Utf8View {
-        // string_view_right(args)
-        let string_array = as_string_view_array(&args[0])?;
-        right_impl::<T, _>(&mut string_array.iter(), n_array)
-    } else {
-        // string_right::<T>(args)
-        let string_array = &as_generic_string_array::<T>(&args[0])?;
-        right_impl::<T, _>(&mut string_array.iter(), n_array)
-    }
-}
-
-// Currently the return type can only be Utf8 or LargeUtf8, to reach fully support, we need
-// to edit the `get_optimal_return_type` in utils.rs to make the udfs be able to return Utf8View
-// See https://github.com/apache/datafusion/issues/11790#issuecomment-2283777166
-fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
-    string_array_iter: &mut ArrayIter<V>,
-    n_array: &Int64Array,
-) -> Result<ArrayRef> {
-    let result = string_array_iter
-        .zip(n_array.iter())
-        .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => match n.cmp(&0) {
-                Ordering::Less => Some(
-                    string
-                        .chars()
-                        .skip(n.unsigned_abs() as usize)
-                        .collect::<String>(),
-                ),
-                Ordering::Equal => Some("".to_string()),
-                Ordering::Greater => Some(
-                    string
-                        .chars()
-                        .skip(max(string.chars().count() as i64 - n, 0) as usize)
-                        .collect::<String>(),
-                ),
-            },
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, StringArray, StringViewArray};
+    use arrow::datatypes::DataType::{Utf8, Utf8View};
 
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -213,6 +158,17 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            RightFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("abcde")),
+                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8,
+            StringArray
+        );
         test_function!(
             RightFunc::new(),
             vec![
@@ -260,10 +216,10 @@ mod tests {
         test_function!(
             RightFunc::new(),
             vec![
-                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from("joséérend")),
                 ColumnarValue::Scalar(ScalarValue::from(5i64)),
             ],
-            Ok(Some("éésoj")),
+            Ok(Some("érend")),
             &str,
             Utf8,
             StringArray
@@ -271,10 +227,10 @@ mod tests {
         test_function!(
             RightFunc::new(),
             vec![
-                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
+                ColumnarValue::Scalar(ScalarValue::from("joséérend")),
                 ColumnarValue::Scalar(ScalarValue::from(-3i64)),
             ],
-            Ok(Some("éésoj")),
+            Ok(Some("éérend")),
             &str,
             Utf8,
             StringArray
@@ -294,6 +250,71 @@ mod tests {
             StringArray
         );
 
+        // StringView cases
+        test_function!(
+            RightFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("de")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            RightFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("abcde")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            RightFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::from(200i64)),
+            ],
+            Ok(Some("")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        test_function!(
+            RightFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                    "joséérend".to_string()
+                ))),
+                ColumnarValue::Scalar(ScalarValue::from(-3i64)),
+            ],
+            Ok(Some("éérend")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+
+        // Unicode indexing case
+        let input = "joé楽s𐀀so↓j";
+        for n in 1..=input.chars().count() {
+            let expected = input.chars().skip(n).collect::<String>();
+            test_function!(
+                RightFunc::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::from(input)),
+                    ColumnarValue::Scalar(ScalarValue::from(-(n as i64))),
+                ],
+                Ok(Some(expected.as_str())),
+                &str,
+                Utf8,
+                StringArray
+            );
+        }
+
         Ok(())
     }
 }
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index c68c4d329c74d..5d8ed688655c4 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -16,24 +16,25 @@
 // under the License.
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
+use DataType::{LargeUtf8, Utf8, Utf8View};
 use arrow::array::{
     ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
-use datafusion_common::cast::as_int64_array;
 use datafusion_common::DataFusionError;
-use datafusion_common::{exec_err, Result};
+use datafusion_common::cast::as_int64_array;
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
 use std::fmt::Write;
 use std::sync::Arc;
 use unicode_segmentation::UnicodeSegmentation;
-use DataType::{LargeUtf8, Utf8, Utf8View};
 
 #[user_doc(
     doc_section(label = "String Functions"),
@@ -48,14 +49,17 @@ use DataType::{LargeUtf8, Utf8, Utf8View};
 +-----------------------------------------------+
 ```"#,
     standard_argument(name = "str", prefix = "String"),
-    argument(name = "n", description = "String length to pad to."),
+    argument(
+        name = "n",
+        description = "String length to pad to. If the input string is longer than this length, it is truncated."
+    ),
     argument(
         name = "padding_str",
         description = "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
     ),
     related_udf(name = "lpad")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct RPadFunc {
     signature: Signature,
 }
@@ -108,10 +112,7 @@ impl ScalarUDFImpl for RPadFunc {
         utf8_to_str_type(&arg_types[0], "rpad")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         let args = &args.args;
         match (
             args.len(),
@@ -145,7 +146,7 @@ impl ScalarUDFImpl for RPadFunc {
     }
 }
 
-pub fn rpad<StringArrayLen: OffsetSizeTrait, FillArrayLen: OffsetSizeTrait>(
+fn rpad<StringArrayLen: OffsetSizeTrait, FillArrayLen: OffsetSizeTrait>(
     args: &[ArrayRef],
 ) -> Result<ArrayRef> {
     if args.len() < 2 || args.len() > 3 {
@@ -163,21 +164,21 @@ pub fn rpad<StringArrayLen: OffsetSizeTrait, FillArrayLen: OffsetSizeTrait>(
     ) {
         (2, Utf8View, _) => {
             rpad_impl::<&StringViewArray, &StringViewArray, StringArrayLen>(
-                args[0].as_string_view(),
+                &args[0].as_string_view(),
                 length_array,
                 None,
             )
         }
         (3, Utf8View, Some(Utf8View)) => {
             rpad_impl::<&StringViewArray, &StringViewArray, StringArrayLen>(
-                args[0].as_string_view(),
+                &args[0].as_string_view(),
                 length_array,
                 Some(args[2].as_string_view()),
             )
         }
         (3, Utf8View, Some(Utf8 | LargeUtf8)) => {
             rpad_impl::<&StringViewArray, &GenericStringArray<FillArrayLen>, StringArrayLen>(
-                args[0].as_string_view(),
+                &args[0].as_string_view(),
                 length_array,
                 Some(args[2].as_string::<FillArrayLen>()),
             )
@@ -187,7 +188,7 @@ pub fn rpad<StringArrayLen: OffsetSizeTrait, FillArrayLen: OffsetSizeTrait>(
             &StringViewArray,
             StringArrayLen,
         >(
-            args[0].as_string::<StringArrayLen>(),
+            &args[0].as_string::<StringArrayLen>(),
             length_array,
             Some(args[2].as_string_view()),
         ),
@@ -196,17 +197,18 @@ pub fn rpad<StringArrayLen: OffsetSizeTrait, FillArrayLen: OffsetSizeTrait>(
             &GenericStringArray<FillArrayLen>,
             StringArrayLen,
         >(
-            args[0].as_string::<StringArrayLen>(),
+            &args[0].as_string::<StringArrayLen>(),
             length_array,
             args.get(2).map(|arg| arg.as_string::<FillArrayLen>()),
         ),
     }
 }
 
-/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated.
+/// Extends the string to length 'length' by appending the characters fill (a space by default).
+/// If the string is already longer than length then it is truncated (on the right).
 /// rpad('hi', 5, 'xy') = 'hixyx'
-pub fn rpad_impl<'a, StringArrType, FillArrType, StringArrayLen>(
-    string_array: StringArrType,
+fn rpad_impl<'a, StringArrType, FillArrType, StringArrayLen>(
+    string_array: &StringArrType,
     length_array: &Int64Array,
     fill_array: Option<FillArrType>,
 ) -> Result<ArrayRef>
@@ -216,6 +218,8 @@ where
     StringArrayLen: OffsetSizeTrait,
 {
     let mut builder: GenericStringBuilder<StringArrayLen> = GenericStringBuilder::new();
+    let mut graphemes_buf = Vec::new();
+    let mut fill_chars_buf = Vec::new();
 
     match fill_array {
         None => {
@@ -232,16 +236,31 @@ where
                             let length = if length < 0 { 0 } else { length as usize };
                             if length == 0 {
                                 builder.append_value("");
+                            } else if string.is_ascii() {
+                                // ASCII fast path: byte length == character length
+                                let str_len = string.len();
+                                if length < str_len {
+                                    builder.append_value(&string[..length]);
+                                } else {
+                                    builder.write_str(string)?;
+                                    for _ in 0..(length - str_len) {
+                                        builder.write_str(" ")?;
+                                    }
+                                    builder.append_value("");
+                                }
                             } else {
-                                let graphemes =
-                                    string.graphemes(true).collect::<Vec<&str>>();
-                                if length < graphemes.len() {
-                                    builder.append_value(graphemes[..length].concat());
+                                // Reuse buffer by clearing and refilling
+                                graphemes_buf.clear();
+                                graphemes_buf.extend(string.graphemes(true));
+
+                                if length < graphemes_buf.len() {
+                                    builder
+                                        .append_value(graphemes_buf[..length].concat());
                                 } else {
                                     builder.write_str(string)?;
-                                    builder.write_str(
-                                        &" ".repeat(length - graphemes.len()),
-                                    )?;
+                                    for _ in 0..(length - graphemes_buf.len()) {
+                                        builder.write_str(" ")?;
+                                    }
                                     builder.append_value("");
                                 }
                             }
@@ -268,20 +287,52 @@ where
                                     );
                                 }
                                 let length = if length < 0 { 0 } else { length as usize };
-                                let graphemes =
-                                    string.graphemes(true).collect::<Vec<&str>>();
-
-                                if length < graphemes.len() {
-                                    builder.append_value(graphemes[..length].concat());
-                                } else if fill.is_empty() {
-                                    builder.append_value(string);
+                                if string.is_ascii() && fill.is_ascii() {
+                                    // ASCII fast path: byte length == character length,
+                                    // so we skip expensive grapheme segmentation.
+                                    let str_len = string.len();
+                                    if length < str_len {
+                                        builder.append_value(&string[..length]);
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        let pad_len = length - str_len;
+                                        let fill_len = fill.len();
+                                        let full_reps = pad_len / fill_len;
+                                        let remainder = pad_len % fill_len;
+                                        builder.write_str(string)?;
+                                        for _ in 0..full_reps {
+                                            builder.write_str(fill)?;
+                                        }
+                                        if remainder > 0 {
+                                            builder.write_str(&fill[..remainder])?;
+                                        }
+                                        builder.append_value("");
+                                    }
                                 } else {
-                                    builder.write_str(string)?;
-                                    fill.chars()
-                                        .cycle()
-                                        .take(length - graphemes.len())
-                                        .for_each(|ch| builder.write_char(ch).unwrap());
-                                    builder.append_value("");
+                                    // Reuse buffer by clearing and refilling
+                                    graphemes_buf.clear();
+                                    graphemes_buf.extend(string.graphemes(true));
+
+                                    if length < graphemes_buf.len() {
+                                        builder.append_value(
+                                            graphemes_buf[..length].concat(),
+                                        );
+                                    } else if fill.is_empty() {
+                                        builder.append_value(string);
+                                    } else {
+                                        builder.write_str(string)?;
+                                        // Reuse fill_chars_buf by clearing and refilling
+                                        fill_chars_buf.clear();
+                                        fill_chars_buf.extend(fill.chars());
+                                        for l in 0..length - graphemes_buf.len() {
+                                            let c = *fill_chars_buf
+                                                .get(l % fill_chars_buf.len())
+                                                .unwrap();
+                                            builder.write_char(c)?;
+                                        }
+                                        builder.append_value("");
+                                    }
                                 }
                             }
                             _ => builder.append_null(),
@@ -447,6 +498,29 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            RPadFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("hello")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+            ],
+            Ok(Some("he")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        test_function!(
+            RPadFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("hi")),
+                ColumnarValue::Scalar(ScalarValue::from(6i64)),
+                ColumnarValue::Scalar(ScalarValue::from("xy")),
+            ],
+            Ok(Some("hixyxy")),
+            &str,
+            Utf8,
+            StringArray
+        );
         test_function!(
             RPadFunc::new(),
             vec![
diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs
index 1c81b46ec78ea..ee854c3c9d1f4 100644
--- a/datafusion/functions/src/unicode/strpos.rs
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -26,12 +26,13 @@ use arrow::datatypes::{
     ArrowNativeType, DataType, Field, FieldRef, Int32Type, Int64Type,
 };
 use datafusion_common::types::logical_string;
-use datafusion_common::{exec_err, internal_err, Result};
+use datafusion_common::{Result, exec_err, internal_err};
 use datafusion_expr::{
-    Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignatureClass,
-    Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
+use memchr::memchr;
 
 #[user_doc(
     doc_section(label = "String Functions"),
@@ -49,7 +50,7 @@ use datafusion_macros::user_doc;
     standard_argument(name = "str", prefix = "String"),
     argument(name = "substr", description = "Substring expression to search for.")
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct StrposFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -109,10 +110,7 @@ impl ScalarUDFImpl for StrposFunc {
         )
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(strpos, vec![])(&args.args)
     }
 
@@ -130,47 +128,47 @@ fn strpos(args: &[ArrayRef]) -> Result<ArrayRef> {
         (DataType::Utf8, DataType::Utf8) => {
             let string_array = args[0].as_string::<i32>();
             let substring_array = args[1].as_string::<i32>();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
         (DataType::Utf8, DataType::Utf8View) => {
             let string_array = args[0].as_string::<i32>();
             let substring_array = args[1].as_string_view();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
         (DataType::Utf8, DataType::LargeUtf8) => {
             let string_array = args[0].as_string::<i32>();
             let substring_array = args[1].as_string::<i64>();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
         (DataType::LargeUtf8, DataType::Utf8) => {
             let string_array = args[0].as_string::<i64>();
             let substring_array = args[1].as_string::<i32>();
-            calculate_strpos::<_, _, Int64Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int64Type>(&string_array, &substring_array)
         }
         (DataType::LargeUtf8, DataType::Utf8View) => {
             let string_array = args[0].as_string::<i64>();
             let substring_array = args[1].as_string_view();
-            calculate_strpos::<_, _, Int64Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int64Type>(&string_array, &substring_array)
         }
         (DataType::LargeUtf8, DataType::LargeUtf8) => {
             let string_array = args[0].as_string::<i64>();
             let substring_array = args[1].as_string::<i64>();
-            calculate_strpos::<_, _, Int64Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int64Type>(&string_array, &substring_array)
         }
         (DataType::Utf8View, DataType::Utf8View) => {
             let string_array = args[0].as_string_view();
             let substring_array = args[1].as_string_view();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
         (DataType::Utf8View, DataType::Utf8) => {
             let string_array = args[0].as_string_view();
             let substring_array = args[1].as_string::<i32>();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
         (DataType::Utf8View, DataType::LargeUtf8) => {
             let string_array = args[0].as_string_view();
             let substring_array = args[1].as_string::<i64>();
-            calculate_strpos::<_, _, Int32Type>(string_array, substring_array)
+            calculate_strpos::<_, _, Int32Type>(&string_array, &substring_array)
         }
 
         other => {
@@ -179,12 +177,37 @@ fn strpos(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
+/// Find `needle` in `haystack` using `memchr` to quickly skip to positions
+/// where the first byte matches, then verify the remaining bytes. Using
+/// string::find is slower because it has significant per-call overhead that
+/// `memchr` does not, and strpos is often invoked many times on short inputs.
+/// Returns a 1-based position, or 0 if not found.
+/// Both inputs must be ASCII-only.
+fn find_ascii_substring(haystack: &[u8], needle: &[u8]) -> usize {
+    let needle_len = needle.len();
+    let first_byte = needle[0];
+    let mut offset = 0;
+
+    while let Some(pos) = memchr(first_byte, &haystack[offset..]) {
+        let start = offset + pos;
+        if start + needle_len > haystack.len() {
+            return 0;
+        }
+        if haystack[start..start + needle_len] == *needle {
+            return start + 1;
+        }
+        offset = start + 1;
+    }
+
+    0
+}
+
 /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)
 /// strpos('high', 'ig') = 2
 /// The implementation uses UTF-8 code points as characters
 fn calculate_strpos<'a, V1, V2, T: ArrowPrimitiveType>(
-    string_array: V1,
-    substring_array: V2,
+    string_array: &V1,
+    substring_array: &V2,
 ) -> Result<ArrayRef>
 where
     V1: StringArrayType<'a, Item = &'a str>,
@@ -198,31 +221,42 @@ where
         .zip(substring_iter)
         .map(|(string, substring)| match (string, substring) {
             (Some(string), Some(substring)) => {
-                // If only ASCII characters are present, we can use the slide window method to find
-                // the sub vector in the main vector. This is faster than string.find() method.
+                if substring.is_empty() {
+                    return T::Native::from_usize(1);
+                }
+
+                let substring_bytes = substring.as_bytes();
+                let string_bytes = string.as_bytes();
+
+                if substring_bytes.len() > string_bytes.len() {
+                    return T::Native::from_usize(0);
+                }
+
                 if ascii_only {
-                    // If the substring is empty, the result is 1.
-                    if substring.is_empty() {
-                        T::Native::from_usize(1)
-                    } else {
-                        T::Native::from_usize(
-                            string
-                                .as_bytes()
-                                .windows(substring.len())
-                                .position(|w| w == substring.as_bytes())
-                                .map(|x| x + 1)
-                                .unwrap_or(0),
-                        )
-                    }
+                    T::Native::from_usize(find_ascii_substring(
+                        string_bytes,
+                        substring_bytes,
+                    ))
                 } else {
-                    // The `find` method returns the byte index of the substring.
-                    // We count the number of chars up to that byte index.
-                    T::Native::from_usize(
-                        string
-                            .find(substring)
-                            .map(|x| string[..x].chars().count() + 1)
-                            .unwrap_or(0),
-                    )
+                    // For non-ASCII, use a single-pass search that tracks both
+                    // byte position and character position simultaneously
+                    let mut char_pos = 0;
+                    for (byte_idx, _) in string.char_indices() {
+                        char_pos += 1;
+                        if byte_idx + substring_bytes.len() <= string_bytes.len() {
+                            // SAFETY: We just checked that byte_idx + substring_bytes.len() <= string_bytes.len()
+                            let slice = unsafe {
+                                string_bytes.get_unchecked(
+                                    byte_idx..byte_idx + substring_bytes.len(),
+                                )
+                            };
+                            if slice == substring_bytes {
+                                return T::Native::from_usize(char_pos);
+                            }
+                        }
+                    }
+
+                    T::Native::from_usize(0)
                 }
             }
             _ => None,
diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs
index 4dcbea4807f44..737730cf88f7d 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -27,9 +27,13 @@ use arrow::array::{
 use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::DataType;
 use datafusion_common::cast::as_int64_array;
-use datafusion_common::{exec_err, plan_err, Result};
+use datafusion_common::types::{
+    NativeType, logical_int32, logical_int64, logical_string,
+};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -44,7 +48,7 @@ use datafusion_macros::user_doc;
 | substr(Utf8("datafusion"),Int64(5),Int64(3)) |
 +----------------------------------------------+
 | fus                                          |
-+----------------------------------------------+ 
++----------------------------------------------+
 ```"#,
     standard_argument(name = "str", prefix = "String"),
     argument(
@@ -56,7 +60,7 @@ use datafusion_macros::user_doc;
         description = "Number of characters to extract. If not specified, returns the rest of the string after the start position."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SubstrFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -70,8 +74,30 @@ impl Default for SubstrFunc {
 
 impl SubstrFunc {
     pub fn new() -> Self {
+        let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
+        let int64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Native(logical_int32())],
+            NativeType::Int64,
+        );
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![string.clone(), int64.clone()]),
+                    TypeSignature::Coercible(vec![
+                        string.clone(),
+                        int64.clone(),
+                        int64.clone(),
+                    ]),
+                ],
+                Volatility::Immutable,
+            )
+            .with_parameter_names(vec![
+                "str".to_string(),
+                "start_pos".to_string(),
+                "length".to_string(),
+            ])
+            .expect("valid parameter names"),
             aliases: vec![String::from("substring")],
         }
     }
@@ -95,10 +121,7 @@ impl ScalarUDFImpl for SubstrFunc {
         Ok(DataType::Utf8View)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(substr, vec![])(&args.args)
     }
 
@@ -106,72 +129,6 @@ impl ScalarUDFImpl for SubstrFunc {
         &self.aliases
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() < 2 || arg_types.len() > 3 {
-            return plan_err!(
-                "The {} function requires 2 or 3 arguments, but got {}.",
-                self.name(),
-                arg_types.len()
-            );
-        }
-        let first_data_type = match &arg_types[0] {
-            DataType::Null => Ok(DataType::Utf8),
-            DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8 => Ok(arg_types[0].clone()),
-            DataType::Dictionary(key_type, value_type) => {
-                if key_type.is_integer() {
-                    match value_type.as_ref() {
-                        DataType::Null => Ok(DataType::Utf8),
-                        DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8 => Ok(*value_type.clone()),
-                        _ => plan_err!(
-                                "The first argument of the {} function can only be a string, but got {:?}.",
-                                self.name(),
-                                arg_types[0]
-                        ),
-                    }
-                } else {
-                    plan_err!(
-                        "The first argument of the {} function can only be a string, but got {:?}.",
-                        self.name(),
-                        arg_types[0]
-                    )
-                }
-            }
-            _ => plan_err!(
-                "The first argument of the {} function can only be a string, but got {:?}.",
-                self.name(),
-                arg_types[0]
-            )
-        }?;
-
-        if ![DataType::Int64, DataType::Int32, DataType::Null].contains(&arg_types[1]) {
-            return plan_err!(
-                "The second argument of the {} function can only be an integer, but got {:?}.",
-                self.name(),
-                arg_types[1]
-            );
-        }
-
-        if arg_types.len() == 3
-            && ![DataType::Int64, DataType::Int32, DataType::Null].contains(&arg_types[2])
-        {
-            return plan_err!(
-                "The third argument of the {} function can only be an integer, but got {:?}.",
-                self.name(),
-                arg_types[2]
-            );
-        }
-
-        if arg_types.len() == 2 {
-            Ok(vec![first_data_type.to_owned(), DataType::Int64])
-        } else {
-            Ok(vec![
-                first_data_type.to_owned(),
-                DataType::Int64,
-                DataType::Int64,
-            ])
-        }
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
@@ -181,7 +138,7 @@ impl ScalarUDFImpl for SubstrFunc {
 /// substr('alphabet', 3) = 'phabet'
 /// substr('alphabet', 3, 2) = 'ph'
 /// The implementation uses UTF-8 code points as characters
-pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
@@ -216,7 +173,7 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
 // `get_true_start_end('Hi🌏', 1, None) -> (0, 6)`
 // `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)`
 // `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)`
-fn get_true_start_end(
+pub fn get_true_start_end(
     input: &str,
     start: i64,
     count: Option<u64>,
@@ -225,7 +182,10 @@ fn get_true_start_end(
     let start = start.checked_sub(1).unwrap_or(start);
 
     let end = match count {
-        Some(count) => start + count as i64,
+        Some(count) => {
+            let count_i64 = i64::try_from(count).unwrap_or(i64::MAX);
+            start.saturating_add(count_i64)
+        }
         None => input.len() as i64,
     };
     let count_to_end = count.is_some();
@@ -275,7 +235,7 @@ fn get_true_start_end(
 // string, such as `substr(long_str_with_1k_chars, 1, 32)`.
 // In such case the overhead of ASCII-validation may not be worth it, so
 // skip the validation for short prefix for now.
-fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
+pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
     string_array: &V,
     start: &Int64Array,
     count: Option<&Int64Array>,
@@ -287,7 +247,7 @@ fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
 
             // HACK: can be simplified if function has specialized
             // implementation for `ScalarValue` (implement without `make_scalar_function()`)
-            let avg_prefix_len = start
+            let total_prefix_len = start
                 .iter()
                 .zip(count.iter())
                 .take(n_sample)
@@ -295,11 +255,11 @@ fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
                     let start = start.unwrap_or(0);
                     let count = count.unwrap_or(0);
                     // To get substring, need to decode from 0 to start+count instead of start to start+count
-                    start + count
+                    start.saturating_add(count)
                 })
-                .sum::<i64>();
+                .fold(0i64, |acc, val| acc.saturating_add(val));
 
-            avg_prefix_len as f64 / n_sample as f64 <= short_prefix_threshold
+            (total_prefix_len as f64 / n_sample as f64) <= short_prefix_threshold
         }
         None => false,
     };
@@ -404,7 +364,7 @@ fn string_view_substr(
         other => {
             return exec_err!(
                 "substr was called with {other} arguments. It requires 2 or 3."
-            )
+            );
         }
     }
 
@@ -510,7 +470,7 @@ mod tests {
     use arrow::array::{Array, StringViewArray};
     use arrow::datatypes::DataType::Utf8View;
 
-    use datafusion_common::{exec_err, Result, ScalarValue};
+    use datafusion_common::{Result, ScalarValue, exec_err};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
 
     use crate::unicode::substr::SubstrFunc;
@@ -850,7 +810,7 @@ mod tests {
             SubstrFunc::new(),
             vec![
                 ColumnarValue::Scalar(ScalarValue::from("abc")),
-                ColumnarValue::Scalar(ScalarValue::from(-9223372036854775808i64)),
+                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
             ],
             Ok(Some("abc")),
             &str,
@@ -861,7 +821,7 @@ mod tests {
             SubstrFunc::new(),
             vec![
                 ColumnarValue::Scalar(ScalarValue::from("overflow")),
-                ColumnarValue::Scalar(ScalarValue::from(-9223372036854775808i64)),
+                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
                 ColumnarValue::Scalar(ScalarValue::from(1i64)),
             ],
             exec_err!("negative overflow when calculating skip value"),
@@ -869,6 +829,18 @@ mod tests {
             Utf8View,
             StringViewArray
         );
+        test_function!(
+            SubstrFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("large count")),
+                ColumnarValue::Scalar(ScalarValue::from(2i64)),
+                ColumnarValue::Scalar(ScalarValue::from(i64::MAX)),
+            ],
+            Ok(Some("arge count")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
 
         Ok(())
     }
diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs
index 9a18b5d23c5ee..11f6a7a295c0d 100644
--- a/datafusion/functions/src/unicode/substrindex.rs
+++ b/datafusion/functions/src/unicode/substrindex.rs
@@ -19,16 +19,17 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{
-    ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait,
-    PrimitiveArray, StringBuilder,
+    ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray,
+    GenericStringBuilder, OffsetSizeTrait, PrimitiveArray,
 };
 use arrow::datatypes::{DataType, Int32Type, Int64Type};
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
-use datafusion_common::{exec_err, utils::take_function_args, Result};
+use datafusion_common::{Result, exec_err, utils::take_function_args};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -62,7 +63,7 @@ If count is negative, everything to the right of the final delimiter (counting f
         description = "The number of times to search for the delimiter. Can be either a positive or negative number."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SubstrIndexFunc {
     signature: Signature,
     aliases: Vec<String>,
@@ -108,10 +109,7 @@ impl ScalarUDFImpl for SubstrIndexFunc {
         utf8_to_str_type(&arg_types[0], "substr_index")
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         make_scalar_function(substr_index, vec![])(&args.args)
     }
 
@@ -169,7 +167,7 @@ fn substr_index(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-pub fn substr_index_general<
+fn substr_index_general<
     'a,
     T: ArrowPrimitiveType,
     V: ArrayAccessor<Item = &'a str>,
@@ -182,7 +180,8 @@ pub fn substr_index_general<
 where
     T::Native: OffsetSizeTrait,
 {
-    let mut builder = StringBuilder::new();
+    let num_rows = string_array.len();
+    let mut builder = GenericStringBuilder::<T::Native>::with_capacity(num_rows, 0);
     let string_iter = ArrayIter::new(string_array);
     let delimiter_array_iter = ArrayIter::new(delimiter_array);
     let count_array_iter = ArrayIter::new(count_array);
@@ -198,31 +197,49 @@ where
                 }
 
                 let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX);
-                let length = if n > 0 {
-                    let split = string.split(delimiter);
-                    split
-                        .take(occurrences)
-                        .map(|s| s.len() + delimiter.len())
-                        .sum::<usize>()
-                        - delimiter.len()
-                } else {
-                    let split = string.rsplit(delimiter);
-                    split
-                        .take(occurrences)
-                        .map(|s| s.len() + delimiter.len())
-                        .sum::<usize>()
-                        - delimiter.len()
-                };
-                if n > 0 {
-                    match string.get(..length) {
-                        Some(substring) => builder.append_value(substring),
-                        None => builder.append_null(),
+                let result_idx = if delimiter.len() == 1 {
+                    // Fast path: use byte-level search for single-character delimiters
+                    let d_byte = delimiter.as_bytes()[0];
+                    let bytes = string.as_bytes();
+
+                    if n > 0 {
+                        bytes
+                            .iter()
+                            .enumerate()
+                            .filter(|&(_, &b)| b == d_byte)
+                            .nth(occurrences - 1)
+                            .map(|(idx, _)| idx)
+                    } else {
+                        bytes
+                            .iter()
+                            .enumerate()
+                            .rev()
+                            .filter(|&(_, &b)| b == d_byte)
+                            .nth(occurrences - 1)
+                            .map(|(idx, _)| idx + 1)
                     }
+                } else if n > 0 {
+                    // Multi-byte path: forward search for n-th occurrence
+                    string
+                        .match_indices(delimiter)
+                        .nth(occurrences - 1)
+                        .map(|(idx, _)| idx)
                 } else {
-                    match string.get(string.len().saturating_sub(length)..) {
-                        Some(substring) => builder.append_value(substring),
-                        None => builder.append_null(),
+                    // Multi-byte path: backward search for n-th occurrence from the right
+                    string
+                        .rmatch_indices(delimiter)
+                        .nth(occurrences - 1)
+                        .map(|(idx, _)| idx + delimiter.len())
+                };
+                match result_idx {
+                    Some(idx) => {
+                        if n > 0 {
+                            builder.append_value(&string[..idx]);
+                        } else {
+                            builder.append_value(&string[idx..]);
+                        }
                     }
+                    None => builder.append_value(string),
                 }
             }
             _ => builder.append_null(),
@@ -328,7 +345,6 @@ mod tests {
             Utf8,
             StringArray
         );
-
         Ok(())
     }
 }
diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs
index 8b4894643a7a3..b212242dce8e9 100644
--- a/datafusion/functions/src/unicode/translate.rs
+++ b/datafusion/functions/src/unicode/translate.rs
@@ -16,27 +16,28 @@
 // under the License.
 
 use std::any::Any;
-use std::sync::Arc;
 
 use arrow::array::{
-    ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait,
+    ArrayAccessor, ArrayIter, ArrayRef, AsArray, LargeStringBuilder, StringBuilder,
+    StringLikeArrayBuilder, StringViewBuilder,
 };
 use arrow::datatypes::DataType;
 use datafusion_common::HashMap;
 use unicode_segmentation::UnicodeSegmentation;
 
-use crate::utils::{make_scalar_function, utf8_to_str_type};
-use datafusion_common::{exec_err, Result};
+use crate::utils::make_scalar_function;
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
 #[user_doc(
     doc_section(label = "String Functions"),
-    description = "Translates characters in a string to specified translation characters.",
-    syntax_example = "translate(str, chars, translation)",
+    description = "Performs character-wise substitution based on a mapping.",
+    syntax_example = "translate(str, from, to)",
     sql_example = r#"```sql
 > select translate('twice', 'wic', 'her');
 +--------------------------------------------------+
@@ -46,13 +47,13 @@ use datafusion_macros::user_doc;
 +--------------------------------------------------+
 ```"#,
     standard_argument(name = "str", prefix = "String"),
-    argument(name = "chars", description = "Characters to translate."),
+    argument(name = "from", description = "The characters to be replaced."),
     argument(
-        name = "translation",
-        description = "Translation characters. Translation characters replace only characters at the same position in the **chars** string."
+        name = "to",
+        description = "The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed. If a character appears more than once in **from**, the first occurrence determines the mapping."
     )
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct TranslateFunc {
     signature: Signature,
 }
@@ -71,6 +72,7 @@ impl TranslateFunc {
                 vec![
                     Exact(vec![Utf8View, Utf8, Utf8]),
                     Exact(vec![Utf8, Utf8, Utf8]),
+                    Exact(vec![LargeUtf8, Utf8, Utf8]),
                 ],
                 Volatility::Immutable,
             ),
@@ -92,13 +94,74 @@ impl ScalarUDFImpl for TranslateFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        utf8_to_str_type(&arg_types[0], "translate")
+        Ok(arg_types[0].clone())
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // When from and to are scalars, pre-build the translation map once
+        if let (Some(from_str), Some(to_str)) = (
+            try_as_scalar_str(&args.args[1]),
+            try_as_scalar_str(&args.args[2]),
+        ) {
+            let to_graphemes: Vec<&str> = to_str.graphemes(true).collect();
+
+            let mut from_map: HashMap<&str, usize> = HashMap::new();
+            for (index, c) in from_str.graphemes(true).enumerate() {
+                // Ignore characters that already exist in from_map
+                from_map.entry(c).or_insert(index);
+            }
+
+            let ascii_table = build_ascii_translate_table(from_str, to_str);
+
+            let string_array = args.args[0].to_array_of_size(args.number_rows)?;
+            let len = string_array.len();
+
+            let result = match string_array.data_type() {
+                DataType::Utf8View => {
+                    let arr = string_array.as_string_view();
+                    let builder = StringViewBuilder::with_capacity(len);
+                    translate_with_map(
+                        arr,
+                        &from_map,
+                        &to_graphemes,
+                        ascii_table.as_ref(),
+                        builder,
+                    )
+                }
+                DataType::Utf8 => {
+                    let arr = string_array.as_string::<i32>();
+                    let builder =
+                        StringBuilder::with_capacity(len, arr.value_data().len());
+                    translate_with_map(
+                        arr,
+                        &from_map,
+                        &to_graphemes,
+                        ascii_table.as_ref(),
+                        builder,
+                    )
+                }
+                DataType::LargeUtf8 => {
+                    let arr = string_array.as_string::<i64>();
+                    let builder =
+                        LargeStringBuilder::with_capacity(len, arr.value_data().len());
+                    translate_with_map(
+                        arr,
+                        &from_map,
+                        &to_graphemes,
+                        ascii_table.as_ref(),
+                        builder,
+                    )
+                }
+                other => {
+                    return exec_err!(
+                        "Unsupported data type {other:?} for function translate"
+                    );
+                }
+            }?;
+
+            return Ok(ColumnarValue::Array(result));
+        }
+
         make_scalar_function(invoke_translate, vec![])(&args.args)
     }
 
@@ -107,25 +170,39 @@ impl ScalarUDFImpl for TranslateFunc {
     }
 }
 
+/// If `cv` is a non-null scalar string, return its value.
+fn try_as_scalar_str(cv: &ColumnarValue) -> Option<&str> {
+    match cv {
+        ColumnarValue::Scalar(s) => s.try_as_str().flatten(),
+        _ => None,
+    }
+}
+
 fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let len = args[0].len();
     match args[0].data_type() {
         DataType::Utf8View => {
             let string_array = args[0].as_string_view();
             let from_array = args[1].as_string::<i32>();
             let to_array = args[2].as_string::<i32>();
-            translate::<i32, _, _>(string_array, from_array, to_array)
+            let builder = StringViewBuilder::with_capacity(len);
+            translate(string_array, from_array, to_array, builder)
         }
         DataType::Utf8 => {
             let string_array = args[0].as_string::<i32>();
             let from_array = args[1].as_string::<i32>();
             let to_array = args[2].as_string::<i32>();
-            translate::<i32, _, _>(string_array, from_array, to_array)
+            let builder =
+                StringBuilder::with_capacity(len, string_array.value_data().len());
+            translate(string_array, from_array, to_array, builder)
         }
         DataType::LargeUtf8 => {
             let string_array = args[0].as_string::<i64>();
-            let from_array = args[1].as_string::<i64>();
-            let to_array = args[2].as_string::<i64>();
-            translate::<i64, _, _>(string_array, from_array, to_array)
+            let from_array = args[1].as_string::<i32>();
+            let to_array = args[2].as_string::<i32>();
+            let builder =
+                LargeStringBuilder::with_capacity(len, string_array.value_data().len());
+            translate(string_array, from_array, to_array, builder)
         }
         other => {
             exec_err!("Unsupported data type {other:?} for function translate")
@@ -135,59 +212,169 @@ fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
 
 /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted.
 /// translate('12345', '143', 'ax') = 'a2x5'
-fn translate<'a, T: OffsetSizeTrait, V, B>(
+fn translate<'a, V, B, O>(
     string_array: V,
     from_array: B,
     to_array: B,
+    mut builder: O,
 ) -> Result<ArrayRef>
 where
     V: ArrayAccessor<Item = &'a str>,
     B: ArrayAccessor<Item = &'a str>,
+    O: StringLikeArrayBuilder,
 {
     let string_array_iter = ArrayIter::new(string_array);
     let from_array_iter = ArrayIter::new(from_array);
     let to_array_iter = ArrayIter::new(to_array);
 
-    let result = string_array_iter
-        .zip(from_array_iter)
-        .zip(to_array_iter)
-        .map(|((string, from), to)| match (string, from, to) {
+    // Reusable buffers to avoid allocating for each row
+    let mut from_map: HashMap<&str, usize> = HashMap::new();
+    let mut from_graphemes: Vec<&str> = Vec::new();
+    let mut to_graphemes: Vec<&str> = Vec::new();
+    let mut string_graphemes: Vec<&str> = Vec::new();
+    let mut result_graphemes: Vec<&str> = Vec::new();
+
+    for ((string, from), to) in string_array_iter.zip(from_array_iter).zip(to_array_iter)
+    {
+        match (string, from, to) {
             (Some(string), Some(from), Some(to)) => {
-                // create a hashmap of [char, index] to change from O(n) to O(1) for from list
-                let from_map: HashMap<&str, usize> = from
-                    .graphemes(true)
-                    .collect::<Vec<&str>>()
-                    .iter()
-                    .enumerate()
-                    .map(|(index, c)| (c.to_owned(), index))
-                    .collect();
-
-                let to = to.graphemes(true).collect::<Vec<&str>>();
-
-                Some(
-                    string
-                        .graphemes(true)
-                        .collect::<Vec<&str>>()
-                        .iter()
-                        .flat_map(|c| match from_map.get(*c) {
-                            Some(n) => to.get(*n).copied(),
-                            None => Some(*c),
-                        })
-                        .collect::<Vec<&str>>()
-                        .concat(),
-                )
+                // Clear and reuse buffers
+                from_map.clear();
+                from_graphemes.clear();
+                to_graphemes.clear();
+                string_graphemes.clear();
+                result_graphemes.clear();
+
+                // Build from_map using reusable buffer
+                from_graphemes.extend(from.graphemes(true));
+                for (index, c) in from_graphemes.iter().enumerate() {
+                    // Ignore characters that already exist in from_map
+                    from_map.entry(*c).or_insert(index);
+                }
+
+                // Build to_graphemes
+                to_graphemes.extend(to.graphemes(true));
+
+                // Process string and build result
+                string_graphemes.extend(string.graphemes(true));
+                for c in &string_graphemes {
+                    match from_map.get(*c) {
+                        Some(n) => {
+                            if let Some(replacement) = to_graphemes.get(*n) {
+                                result_graphemes.push(*replacement);
+                            }
+                        }
+                        None => result_graphemes.push(*c),
+                    }
+                }
+
+                builder.append_value(&result_graphemes.concat());
             }
-            _ => None,
-        })
-        .collect::<GenericStringArray<T>>();
+            _ => builder.append_null(),
+        }
+    }
 
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(builder.finish())
+}
+
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character).  Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed.  Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+    if !from.is_ascii() || !to.is_ascii() {
+        return None;
+    }
+    let mut table = [0u8; 128];
+    for i in 0..128u8 {
+        table[i as usize] = i;
+    }
+    let to_bytes = to.as_bytes();
+    let mut seen = [false; 128];
+    for (i, from_byte) in from.bytes().enumerate() {
+        let idx = from_byte as usize;
+        if !seen[idx] {
+            seen[idx] = true;
+            if i < to_bytes.len() {
+                table[idx] = to_bytes[i];
+            } else {
+                table[idx] = ASCII_DELETE;
+            }
+        }
+    }
+    Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
+/// translation map instead of rebuilding it for every row.  When an ASCII byte
+/// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
+/// inputs fallback to using the map.
+fn translate_with_map<'a, V, O>(
+    string_array: V,
+    from_map: &HashMap<&str, usize>,
+    to_graphemes: &[&str],
+    ascii_table: Option<&[u8; 128]>,
+    mut builder: O,
+) -> Result<ArrayRef>
+where
+    V: ArrayAccessor<Item = &'a str>,
+    O: StringLikeArrayBuilder,
+{
+    let mut result_graphemes: Vec<&str> = Vec::new();
+    let mut ascii_buf: Vec<u8> = Vec::new();
+
+    for string in ArrayIter::new(string_array) {
+        match string {
+            Some(s) => {
+                // Fast path: byte-level table lookup for ASCII strings
+                if let Some(table) = ascii_table
+                    && s.is_ascii()
+                {
+                    ascii_buf.clear();
+                    for &b in s.as_bytes() {
+                        let mapped = table[b as usize];
+                        if mapped != ASCII_DELETE {
+                            ascii_buf.push(mapped);
+                        }
+                    }
+                    // SAFETY: all bytes are ASCII, hence valid UTF-8.
+                    builder.append_value(unsafe {
+                        std::str::from_utf8_unchecked(&ascii_buf)
+                    });
+                } else {
+                    // Slow path: grapheme-based translation
+                    result_graphemes.clear();
+
+                    for c in s.graphemes(true) {
+                        match from_map.get(c) {
+                            Some(n) => {
+                                if let Some(replacement) = to_graphemes.get(*n) {
+                                    result_graphemes.push(*replacement);
+                                }
+                            }
+                            None => result_graphemes.push(c),
+                        }
+                    }
+
+                    builder.append_value(&result_graphemes.concat());
+                }
+            }
+            None => builder.append_null(),
+        }
+    }
+
+    Ok(builder.finish())
 }
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::{Array, StringArray};
-    use arrow::datatypes::DataType::Utf8;
+    use arrow::array::{Array, StringArray, StringViewArray};
+    use arrow::datatypes::DataType::{Utf8, Utf8View};
 
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
@@ -245,6 +432,18 @@ mod tests {
             Utf8,
             StringArray
         );
+        test_function!(
+            TranslateFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("abcabc")),
+                ColumnarValue::Scalar(ScalarValue::from("aa")),
+                ColumnarValue::Scalar(ScalarValue::from("de"))
+            ],
+            Ok(Some("dbcdbc")),
+            &str,
+            Utf8,
+            StringArray
+        );
         test_function!(
             TranslateFunc::new(),
             vec![
@@ -257,6 +456,60 @@ mod tests {
             Utf8,
             StringArray
         );
+        // Non-ASCII input with ASCII scalar from/to: exercises the
+        // grapheme fallback within translate_with_map.
+        test_function!(
+            TranslateFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::from("café")),
+                ColumnarValue::Scalar(ScalarValue::from("ae")),
+                ColumnarValue::Scalar(ScalarValue::from("AE"))
+            ],
+            Ok(Some("cAfé")),
+            &str,
+            Utf8,
+            StringArray
+        );
+        // Utf8View input should produce Utf8View output
+        test_function!(
+            TranslateFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("12345".into()))),
+                ColumnarValue::Scalar(ScalarValue::from("143")),
+                ColumnarValue::Scalar(ScalarValue::from("ax"))
+            ],
+            Ok(Some("a2x5")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        // Null Utf8View input
+        test_function!(
+            TranslateFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(None)),
+                ColumnarValue::Scalar(ScalarValue::from("143")),
+                ColumnarValue::Scalar(ScalarValue::from("ax"))
+            ],
+            Ok(None),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+        // Non-ASCII Utf8View input
+        test_function!(
+            TranslateFunc::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8View(Some("é2íñ5".into()))),
+                ColumnarValue::Scalar(ScalarValue::from("éñí")),
+                ColumnarValue::Scalar(ScalarValue::from("óü"))
+            ],
+            Ok(Some("ó2ü5")),
+            &str,
+            Utf8View,
+            StringViewArray
+        );
+
         #[cfg(not(feature = "unicode_expressions"))]
         test_function!(
             TranslateFunc::new(),
diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs
index 583ff48bff39d..b9bde1454994c 100644
--- a/datafusion/functions/src/utils.rs
+++ b/datafusion/functions/src/utils.rs
@@ -15,12 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::ArrayRef;
-use arrow::datatypes::DataType;
-
-use datafusion_common::{Result, ScalarValue};
-use datafusion_expr::function::Hint;
+use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray};
+use arrow::compute::try_binary;
+use arrow::datatypes::{DataType, DecimalType};
+use arrow::error::ArrowError;
+use datafusion_common::{DataFusionError, Result, ScalarValue};
 use datafusion_expr::ColumnarValue;
+use datafusion_expr::function::Hint;
+use std::sync::Arc;
 
 /// Creates a function to identify the optimal return type of a string function given
 /// the type of its first argument.
@@ -120,6 +122,133 @@ where
     }
 }
 
+/// Computes a binary math function for input arrays using a specified function.
+/// Generic types:
+/// - `L`: Left array primitive type
+/// - `R`: Right array primitive type
+/// - `O`: Output array primitive type
+/// - `F`: Functor computing `fun(l: L, r: R) -> Result<OutputType>`
+pub fn calculate_binary_math<L, R, O, F>(
+    left: &dyn Array,
+    right: &ColumnarValue,
+    fun: F,
+) -> Result<Arc<PrimitiveArray<O>>>
+where
+    L: ArrowPrimitiveType,
+    R: ArrowPrimitiveType,
+    O: ArrowPrimitiveType,
+    F: Fn(L::Native, R::Native) -> Result<O::Native, ArrowError>,
+    R::Native: TryFrom<ScalarValue>,
+{
+    let left = left.as_primitive::<L>();
+    let right = right.cast_to(&R::DATA_TYPE, None)?;
+    let result = match right {
+        ColumnarValue::Scalar(scalar) => {
+            if scalar.is_null() {
+                // Null scalar is castable to any numeric, creating a non-null expression.
+                // Provide null array explicitly to make result null
+                PrimitiveArray::<O>::new_null(left.len())
+            } else {
+                let right = R::Native::try_from(scalar.clone()).map_err(|_| {
+                    DataFusionError::NotImplemented(format!(
+                        "Cannot convert scalar value {} to {}",
+                        &scalar,
+                        R::DATA_TYPE
+                    ))
+                })?;
+                left.try_unary::<_, O, _>(|lvalue| fun(lvalue, right))?
+            }
+        }
+        ColumnarValue::Array(right) => {
+            let right = right.as_primitive::<R>();
+            try_binary::<_, _, _, O>(left, right, &fun)?
+        }
+    };
+    Ok(Arc::new(result) as _)
+}
+
+/// Computes a binary math function for input arrays using a specified function
+/// and apply rescaling to given precision and scale.
+/// Generic types:
+/// - `L`: Left array decimal type
+/// - `R`: Right array primitive type
+/// - `O`: Output array decimal type
+/// - `F`: Functor computing `fun(l: L, r: R) -> Result<OutputType>`
+pub fn calculate_binary_decimal_math<L, R, O, F>(
+    left: &dyn Array,
+    right: &ColumnarValue,
+    fun: F,
+    precision: u8,
+    scale: i8,
+) -> Result<Arc<PrimitiveArray<O>>>
+where
+    L: DecimalType,
+    R: ArrowPrimitiveType,
+    O: DecimalType,
+    F: Fn(L::Native, R::Native) -> Result<O::Native, ArrowError>,
+    R::Native: TryFrom<ScalarValue>,
+{
+    let result_array = calculate_binary_math::<L, R, O, F>(left, right, fun)?;
+    Ok(Arc::new(
+        result_array
+            .as_ref()
+            .clone()
+            .with_precision_and_scale(precision, scale)?,
+    ))
+}
+
+/// Converts Decimal128 components (value and scale) to an unscaled i128
+pub fn decimal128_to_i128(value: i128, scale: i8) -> Result<i128, ArrowError> {
+    if scale < 0 {
+        Err(ArrowError::ComputeError(
+            "Negative scale is not supported".into(),
+        ))
+    } else if scale == 0 {
+        Ok(value)
+    } else {
+        match i128::from(10).checked_pow(scale as u32) {
+            Some(divisor) => Ok(value / divisor),
+            None => Err(ArrowError::ComputeError(format!(
+                "Cannot get a power of {scale}"
+            ))),
+        }
+    }
+}
+
+pub fn decimal32_to_i32(value: i32, scale: i8) -> Result<i32, ArrowError> {
+    if scale < 0 {
+        Err(ArrowError::ComputeError(
+            "Negative scale is not supported".into(),
+        ))
+    } else if scale == 0 {
+        Ok(value)
+    } else {
+        match 10_i32.checked_pow(scale as u32) {
+            Some(divisor) => Ok(value / divisor),
+            None => Err(ArrowError::ComputeError(format!(
+                "Cannot get a power of {scale}"
+            ))),
+        }
+    }
+}
+
+pub fn decimal64_to_i64(value: i64, scale: i8) -> Result<i64, ArrowError> {
+    if scale < 0 {
+        Err(ArrowError::ComputeError(
+            "Negative scale is not supported".into(),
+        ))
+    } else if scale == 0 {
+        Ok(value)
+    } else {
+        match i64::from(10).checked_pow(scale as u32) {
+            Some(divisor) => Ok(value / divisor),
+            None => Err(ArrowError::ComputeError(format!(
+                "Cannot get a power of {scale}"
+            ))),
+        }
+    }
+}
+
 #[cfg(test)]
 pub mod test {
     /// $FUNC ScalarUDFImpl to test
@@ -128,19 +257,20 @@ pub mod test {
     /// $EXPECTED_TYPE is the expected value type
     /// $EXPECTED_DATA_TYPE is the expected result type
     /// $ARRAY_TYPE is the column type after function applied
+    /// $CONFIG_OPTIONS config options to pass to function
     macro_rules! test_function {
-        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
-            let expected: Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
-            let func = $FUNC;
-
-            let data_array = $ARGS.iter().map(|arg| arg.data_type()).collect::<Vec<_>>();
-            let cardinality = $ARGS
-                .iter()
-                .fold(Option::<usize>::None, |acc, arg| match arg {
-                    ColumnarValue::Scalar(_) => acc,
-                    ColumnarValue::Array(a) => Some(a.len()),
-                })
-                .unwrap_or(1);
+    ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident, $CONFIG_OPTIONS:expr) => {
+        let expected: Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
+        let func = $FUNC;
+
+        let data_array = $ARGS.iter().map(|arg| arg.data_type()).collect::<Vec<_>>();
+        let cardinality = $ARGS
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            })
+            .unwrap_or(1);
 
             let scalar_arguments = $ARGS.iter().map(|arg| match arg {
                 ColumnarValue::Scalar(scalar) => Some(scalar.clone()),
@@ -155,67 +285,108 @@ pub mod test {
 
             let field_array = data_array.into_iter().zip(nullables).enumerate()
                 .map(|(idx, (data_type, nullable))| arrow::datatypes::Field::new(format!("field_{idx}"), data_type, nullable))
-                .map(std::sync::Arc::new)
-                .collect::<Vec<_>>();
+            .map(std::sync::Arc::new)
+            .collect::<Vec<_>>();
 
-            let return_field = func.return_field_from_args(datafusion_expr::ReturnFieldArgs {
-                arg_fields: &field_array,
-                scalar_arguments: &scalar_arguments_refs,
-            });
+        let return_field = func.return_field_from_args(datafusion_expr::ReturnFieldArgs {
+            arg_fields: &field_array,
+            scalar_arguments: &scalar_arguments_refs,
+        });
             let arg_fields = $ARGS.iter()
-                .enumerate()
+            .enumerate()
                 .map(|(idx, arg)| arrow::datatypes::Field::new(format!("f_{idx}"), arg.data_type(), true).into())
-                .collect::<Vec<_>>();
+            .collect::<Vec<_>>();
 
-            match expected {
-                Ok(expected) => {
-                    assert_eq!(return_field.is_ok(), true);
-                    let return_field = return_field.unwrap();
-                    let return_type = return_field.data_type();
-                    assert_eq!(return_type, &$EXPECTED_DATA_TYPE);
+        match expected {
+            Ok(expected) => {
+                assert_eq!(return_field.is_ok(), true);
+                let return_field = return_field.unwrap();
+                let return_type = return_field.data_type();
+                assert_eq!(return_type, &$EXPECTED_DATA_TYPE);
 
-                    let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, arg_fields, number_rows: cardinality, return_field});
+                    let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
+                    args: $ARGS,
+                    arg_fields,
+                    number_rows: cardinality,
+                    return_field,
+                        config_options: $CONFIG_OPTIONS
+                });
                     assert_eq!(result.is_ok(), true, "function returned an error: {}", result.unwrap_err());
 
                     let result = result.unwrap().to_array(cardinality).expect("Failed to convert to array");
                     let result = result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to type");
-                    assert_eq!(result.data_type(), &$EXPECTED_DATA_TYPE);
+                assert_eq!(result.data_type(), &$EXPECTED_DATA_TYPE);
 
-                    // value is correct
-                    match expected {
-                        Some(v) => assert_eq!(result.value(0), v),
-                        None => assert!(result.is_null(0)),
-                    };
-                }
-                Err(expected_error) => {
-                    if return_field.is_err() {
-                        match return_field {
-                            Ok(_) => assert!(false, "expected error"),
-                            Err(error) => { datafusion_common::assert_contains!(expected_error.strip_backtrace(), error.strip_backtrace()); }
-                        }
-                    }
-                    else {
-                        let return_field = return_field.unwrap();
-
-                        // invoke is expected error - cannot use .expect_err() due to Debug not being implemented
-                        match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, arg_fields, number_rows: cardinality, return_field}) {
-                            Ok(_) => assert!(false, "expected error"),
-                            Err(error) => {
-                                assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace()));
-                            }
+                // value is correct
+                match expected {
+                    Some(v) => assert_eq!(result.value(0), v),
+                    None => assert!(result.is_null(0)),
+                };
+            }
+            Err(expected_error) => {
+                if let Ok(return_field) = return_field {
+                    // invoke is expected error - cannot use .expect_err() due to Debug not being implemented
+                    match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs {
+                        args: $ARGS,
+                        arg_fields,
+                        number_rows: cardinality,
+                        return_field,
+                        config_options: $CONFIG_OPTIONS,
+                    }) {
+                        Ok(_) => assert!(false, "expected error"),
+                        Err(error) => {
+                            assert!(expected_error
+                                .strip_backtrace()
+                                .starts_with(&error.strip_backtrace()));
                         }
                     }
+                } else if let Err(error) = return_field {
+                    datafusion_common::assert_contains!(
+                        expected_error.strip_backtrace(),
+                        error.strip_backtrace()
+                    );
                 }
-            };
+            }
+        };
+    };
+
+        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
+            test_function!(
+                $FUNC,
+                $ARGS,
+                $EXPECTED,
+                $EXPECTED_TYPE,
+                $EXPECTED_DATA_TYPE,
+                $ARRAY_TYPE,
+                std::sync::Arc::new(datafusion_common::config::ConfigOptions::default())
+            )
         };
     }
 
-    use arrow::datatypes::DataType;
-    #[allow(unused_imports)]
+    use arrow::{
+        array::Int32Array,
+        datatypes::{DataType, Int32Type},
+    };
+    use itertools::Either;
     pub(crate) use test_function;
 
     use super::*;
 
+    #[test]
+    fn test_calculate_binary_math_scalar_null() {
+        let left = Int32Array::from(vec![1, 2]);
+        let right = ColumnarValue::Scalar(ScalarValue::Int32(None));
+        let result = calculate_binary_math::<Int32Type, Int32Type, Int32Type, _>(
+            &left,
+            &right,
+            |x, y| Ok(x + y),
+        )
+        .unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.null_count(), 2);
+    }
+
     #[test]
     fn string_to_int_type() {
         let v = utf8_to_int_type(&DataType::Utf8, "test").unwrap();
@@ -227,4 +398,133 @@ pub mod test {
         let v = utf8_to_int_type(&DataType::LargeUtf8, "test").unwrap();
         assert_eq!(v, DataType::Int64);
     }
+
+    #[test]
+    fn test_decimal128_to_i128() {
+        let cases = [
+            (123, 0, Some(123)),
+            (1230, 1, Some(123)),
+            (123000, 3, Some(123)),
+            (1, 0, Some(1)),
+            (123, -3, None),
+            (123, i8::MAX, None),
+            (i128::MAX, 0, Some(i128::MAX)),
+            (i128::MAX, 3, Some(i128::MAX / 1000)),
+        ];
+
+        for (value, scale, expected) in cases {
+            match decimal128_to_i128(value, scale) {
+                Ok(actual) => {
+                    assert_eq!(
+                        actual,
+                        expected.expect("Got value but expected none"),
+                        "{value} and {scale} vs {expected:?}"
+                    );
+                }
+                Err(_) => assert!(expected.is_none()),
+            }
+        }
+    }
+
+    #[test]
+    fn test_decimal32_to_i32() {
+        let cases: [(i32, i8, Either<i32, String>); _] = [
+            (123, 0, Either::Left(123)),
+            (1230, 1, Either::Left(123)),
+            (123000, 3, Either::Left(123)),
+            (1234567, 2, Either::Left(12345)),
+            (-1234567, 2, Either::Left(-12345)),
+            (1, 0, Either::Left(1)),
+            (
+                123,
+                -3,
+                Either::Right("Negative scale is not supported".into()),
+            ),
+            (
+                123,
+                i8::MAX,
+                Either::Right("Cannot get a power of 127".into()),
+            ),
+            (999999999, 0, Either::Left(999999999)),
+            (999999999, 3, Either::Left(999999)),
+        ];
+
+        for (value, scale, expected) in cases {
+            match decimal32_to_i32(value, scale) {
+                Ok(actual) => {
+                    let expected_value =
+                        expected.left().expect("Got value but expected none");
+                    assert_eq!(
+                        actual, expected_value,
+                        "{value} and {scale} vs {expected_value:?}"
+                    );
+                }
+                Err(ArrowError::ComputeError(msg)) => {
+                    assert_eq!(
+                        msg,
+                        expected.right().expect("Got error but expected value")
+                    );
+                }
+                Err(_) => {
+                    assert!(expected.is_right())
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_decimal64_to_i64() {
+        let cases: [(i64, i8, Either<i64, String>); _] = [
+            (123, 0, Either::Left(123)),
+            (1234567890, 2, Either::Left(12345678)),
+            (-1234567890, 2, Either::Left(-12345678)),
+            (
+                123,
+                -3,
+                Either::Right("Negative scale is not supported".into()),
+            ),
+            (
+                123,
+                i8::MAX,
+                Either::Right("Cannot get a power of 127".into()),
+            ),
+            (
+                999999999999999999i64,
+                0,
+                Either::Left(999999999999999999i64),
+            ),
+            (
+                999999999999999999i64,
+                3,
+                Either::Left(999999999999999999i64 / 1000),
+            ),
+            (
+                -999999999999999999i64,
+                3,
+                Either::Left(-999999999999999999i64 / 1000),
+            ),
+        ];
+
+        for (value, scale, expected) in cases {
+            match decimal64_to_i64(value, scale) {
+                Ok(actual) => {
+                    let expected_value =
+                        expected.left().expect("Got value but expected none");
+                    assert_eq!(
+                        actual, expected_value,
+                        "{value} and {scale} vs {expected_value:?}"
+                    );
+                }
+                Err(ArrowError::ComputeError(msg)) => {
+                    assert_eq!(
+                        msg,
+                        expected.right().expect("Got error but expected value")
+                    );
+                }
+                Err(_) => {
+                    assert!(expected.is_right())
+                }
+            }
+        }
+    }
 }
diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml
index c6532aa046810..91f1dde62aaac 100644
--- a/datafusion/macros/Cargo.toml
+++ b/datafusion/macros/Cargo.toml
@@ -19,6 +19,7 @@
 name = "datafusion-macros"
 description = "Procedural macros for DataFusion query engine"
 keywords = ["datafusion", "query", "sql"]
+readme = "README.md"
 version = { workspace = true }
 edition = { workspace = true }
 homepage = { workspace = true }
@@ -30,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -40,6 +44,6 @@ path = "src/user_doc.rs"
 proc-macro = true
 
 [dependencies]
-datafusion-expr = { workspace = true }
-quote = "1.0.40"
-syn = { version = "2.0.100", features = ["full"] }
+datafusion-doc = { workspace = true }
+quote = "1.0.44"
+syn = { version = "2.0.117", features = ["full"] }
diff --git a/datafusion/macros/README.md b/datafusion/macros/README.md
new file mode 100644
index 0000000000000..c45bba1423fc2
--- /dev/null
+++ b/datafusion/macros/README.md
@@ -0,0 +1,30 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Macros
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate contains common macros used in DataFusion
+
+Most projects should use the [`datafusion`] crate directly.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/macros/src/user_doc.rs b/datafusion/macros/src/user_doc.rs
index 31cf9bb1b7504..ce9e7d55ef103 100644
--- a/datafusion/macros/src/user_doc.rs
+++ b/datafusion/macros/src/user_doc.rs
@@ -19,13 +19,13 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 
 extern crate proc_macro;
-use datafusion_expr::scalar_doc_sections::doc_sections_const;
+use datafusion_doc::scalar_doc_sections::doc_sections_const;
 use proc_macro::TokenStream;
 use quote::quote;
-use syn::{parse_macro_input, DeriveInput, LitStr};
+use syn::{DeriveInput, LitStr, parse_macro_input};
 
 /// This procedural macro is intended to parse a rust custom attribute and create user documentation
 /// from it by constructing a `DocumentBuilder()` automatically. The `Documentation` can be
@@ -61,7 +61,6 @@ use syn::{parse_macro_input, DeriveInput, LitStr};
 /// }
 /// ```
 /// will generate the following code
-///
 /// ```ignore
 /// pub struct ToDateFunc {
 ///     signature: Signature,
diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml
index 60358d20e2a1a..e3a6733532324 100644
--- a/datafusion/optimizer/Cargo.toml
+++ b/datafusion/optimizer/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -40,18 +43,27 @@ name = "datafusion_optimizer"
 [features]
 recursive_protection = ["dep:recursive"]
 
+# Note -- please DO NOT add a dependency here to any of the datafusion-functions
+# crates. While it is tempting to try and add an optimizer pass that uses
+# datafusion-functions Doing so makes it harder for downstream crates to
+# provide their own function library and smaller install footprint.
+#
+# If you want to add special handling for a specific function, use the methods
+# on the ScalarUDFImpl or AggregateUDFImpl traits (or add a new method to those
+# traits).
 [dependencies]
 arrow = { workspace = true }
 chrono = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-expr = { workspace = true }
+datafusion-expr-common = { workspace = true }
 datafusion-physical-expr = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true }
 log = { workspace = true }
 recursive = { workspace = true, optional = true }
 regex = { workspace = true }
-regex-syntax = "0.8.0"
+regex-syntax = "0.8.9"
 
 [dev-dependencies]
 async-trait = { workspace = true }
diff --git a/datafusion/optimizer/README.md b/datafusion/optimizer/README.md
index 61bc1cd70145b..a95ec4828b35e 100644
--- a/datafusion/optimizer/README.md
+++ b/datafusion/optimizer/README.md
@@ -17,6 +17,18 @@
   under the License.
 -->
 
-Please see [Query Optimizer] in the Library User Guide
+# Apache DataFusion Optimizer
 
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate contains the DataFusion logical optimizer.
+Please see [Query Optimizer] in the Library User Guide for more information.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
 [query optimizer]: https://datafusion.apache.org/library-user-guide/query-optimizer.html
diff --git a/datafusion/optimizer/benches/projection_unnecessary.rs b/datafusion/optimizer/benches/projection_unnecessary.rs
index ee7889eb33213..2082ed6a37515 100644
--- a/datafusion/optimizer/benches/projection_unnecessary.rs
+++ b/datafusion/optimizer/benches/projection_unnecessary.rs
@@ -16,11 +16,12 @@
 // under the License.
 
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ToDFSchema;
 use datafusion_common::{Column, TableReference};
-use datafusion_expr::{logical_plan::LogicalPlan, projection_schema, Expr};
+use datafusion_expr::{Expr, logical_plan::LogicalPlan, projection_schema};
 use datafusion_optimizer::optimize_projections::is_projection_unnecessary;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn is_projection_unnecessary_old(
@@ -30,7 +31,7 @@ fn is_projection_unnecessary_old(
     // First check if all expressions are trivial (cheaper operation than `projection_schema`)
     if !proj_exprs
         .iter()
-        .all(|expr| matches!(expr, Expr::Column(_) | Expr::Literal(_)))
+        .all(|expr| matches!(expr, Expr::Column(_) | Expr::Literal(_, _)))
     {
         return Ok(false);
     }
diff --git a/datafusion/optimizer/src/analyzer/function_rewrite.rs b/datafusion/optimizer/src/analyzer/function_rewrite.rs
index c6bf14ebce2e3..9faa60d939fe3 100644
--- a/datafusion/optimizer/src/analyzer/function_rewrite.rs
+++ b/datafusion/optimizer/src/analyzer/function_rewrite.rs
@@ -23,9 +23,9 @@ use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{DFSchema, Result};
 
 use crate::utils::NamePreserver;
+use datafusion_expr::LogicalPlan;
 use datafusion_expr::expr_rewriter::FunctionRewrite;
 use datafusion_expr::utils::merge_schema;
-use datafusion_expr::LogicalPlan;
 use std::sync::Arc;
 
 /// Analyzer rule that invokes [`FunctionRewrite`]s on expressions
diff --git a/datafusion/optimizer/src/analyzer/mod.rs b/datafusion/optimizer/src/analyzer/mod.rs
index 2517e3c3a4006..ddb3b828f01dd 100644
--- a/datafusion/optimizer/src/analyzer/mod.rs
+++ b/datafusion/optimizer/src/analyzer/mod.rs
@@ -22,9 +22,9 @@ use std::sync::Arc;
 
 use log::debug;
 
+use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::instant::Instant;
-use datafusion_common::Result;
 use datafusion_expr::expr_rewriter::FunctionRewrite;
 use datafusion_expr::{InvariantLevel, LogicalPlan};
 
@@ -38,14 +38,6 @@ pub mod function_rewrite;
 pub mod resolve_grouping_function;
 pub mod type_coercion;
 
-pub mod subquery {
-    #[deprecated(
-        since = "44.0.0",
-        note = "please use `datafusion_expr::check_subquery_expr` instead"
-    )]
-    pub use datafusion_expr::check_subquery_expr;
-}
-
 /// [`AnalyzerRule`]s transform [`LogicalPlan`]s in some way to make
 /// the plan valid prior to the rest of the DataFusion optimization process.
 ///
diff --git a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs
index f8a8185636090..747c54e2cd26d 100644
--- a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs
+++ b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs
@@ -28,14 +28,14 @@ use arrow::datatypes::DataType;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{
-    internal_datafusion_err, plan_err, Column, DFSchemaRef, Result, ScalarValue,
+    Column, DFSchema, Result, ScalarValue, internal_datafusion_err, plan_err,
 };
 use datafusion_expr::expr::{AggregateFunction, Alias};
 use datafusion_expr::logical_plan::LogicalPlan;
 use datafusion_expr::utils::grouping_set_to_exprlist;
 use datafusion_expr::{
-    bitwise_and, bitwise_or, bitwise_shift_left, bitwise_shift_right, cast, Aggregate,
-    Expr, Projection,
+    Aggregate, Expr, Projection, bitwise_and, bitwise_or, bitwise_shift_left,
+    bitwise_shift_right, cast,
 };
 use itertools::Itertools;
 
@@ -74,7 +74,7 @@ fn group_expr_to_bitmap_index(group_expr: &[Expr]) -> Result<HashMap<&Expr, usiz
 
 fn replace_grouping_exprs(
     input: Arc<LogicalPlan>,
-    schema: DFSchemaRef,
+    schema: &DFSchema,
     group_expr: Vec<Expr>,
     aggr_expr: Vec<Expr>,
 ) -> Result<LogicalPlan> {
@@ -139,7 +139,7 @@ fn analyze_internal(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
             schema,
             ..
         }) if contains_grouping_function(&aggr_expr) => Ok(Transformed::yes(
-            replace_grouping_exprs(input, schema, group_expr, aggr_expr)?,
+            replace_grouping_exprs(input, schema.as_ref(), group_expr, aggr_expr)?,
         )),
         _ => Ok(Transformed::no(plan)),
     })?;
@@ -150,7 +150,7 @@ fn analyze_internal(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
 fn is_grouping_function(expr: &Expr) -> bool {
     // TODO: Do something better than name here should grouping be a built
     // in expression?
-    matches!(expr, Expr::AggregateFunction(AggregateFunction { ref func, .. }) if func.name() == "grouping")
+    matches!(expr, Expr::AggregateFunction(AggregateFunction { func, .. }) if func.name() == "grouping")
 }
 
 fn contains_grouping_function(exprs: &[Expr]) -> bool {
@@ -189,19 +189,19 @@ fn grouping_function_on_id(
     // Postgres allows grouping function for group by without grouping sets, the result is then
     // always 0
     if !is_grouping_set {
-        return Ok(Expr::Literal(ScalarValue::from(0i32)));
+        return Ok(Expr::Literal(ScalarValue::from(0i32), None));
     }
 
     let group_by_expr_count = group_by_expr.len();
     let literal = |value: usize| {
         if group_by_expr_count < 8 {
-            Expr::Literal(ScalarValue::from(value as u8))
+            Expr::Literal(ScalarValue::from(value as u8), None)
         } else if group_by_expr_count < 16 {
-            Expr::Literal(ScalarValue::from(value as u16))
+            Expr::Literal(ScalarValue::from(value as u16), None)
         } else if group_by_expr_count < 32 {
-            Expr::Literal(ScalarValue::from(value as u32))
+            Expr::Literal(ScalarValue::from(value as u32), None)
         } else {
-            Expr::Literal(ScalarValue::from(value as u64))
+            Expr::Literal(ScalarValue::from(value as u64), None)
         }
     };
 
diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs
index c8246ecebd543..efc9984acb9b0 100644
--- a/datafusion/optimizer/src/analyzer/type_coercion.rs
+++ b/datafusion/optimizer/src/analyzer/type_coercion.rs
@@ -17,42 +17,41 @@
 
 //! Optimizer rule for type validation and coercion
 
-use std::sync::Arc;
-
+use arrow::compute::can_cast_types;
 use datafusion_expr::binary::BinaryTypeCoercer;
-use itertools::izip;
-
-use arrow::datatypes::{DataType, Field, IntervalUnit, Schema};
+use itertools::{Itertools as _, izip};
+use std::sync::Arc;
 
 use crate::analyzer::AnalyzerRule;
 use crate::utils::NamePreserver;
+
+use arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit};
+use arrow::temporal_conversions::SECONDS_IN_DAY;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
 use datafusion_common::{
-    exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column,
-    DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference,
+    Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference,
+    exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err,
+    plan_err,
 };
 use datafusion_expr::expr::{
     self, AggregateFunctionParams, Alias, Between, BinaryExpr, Case, Exists, InList,
-    InSubquery, Like, ScalarFunction, Sort, WindowFunction,
+    InSubquery, Like, ScalarFunction, SetComparison, Sort, WindowFunction,
 };
 use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema;
 use datafusion_expr::expr_schema::cast_subquery;
 use datafusion_expr::logical_plan::Subquery;
 use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion};
-use datafusion_expr::type_coercion::functions::{
-    data_types_with_scalar_udf, fields_with_aggregate_udf,
-};
+use datafusion_expr::type_coercion::functions::{UDFCoercionExt, fields_with_udf};
+use datafusion_expr::type_coercion::is_datetime;
 use datafusion_expr::type_coercion::other::{
     get_coerce_type_for_case_expression, get_coerce_type_for_list,
 };
-use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8};
 use datafusion_expr::utils::merge_schema;
 use datafusion_expr::{
-    is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not,
-    AggregateUDF, Expr, ExprFunctionExt, ExprSchemable, Join, Limit, LogicalPlan,
-    Operator, Projection, ScalarUDF, Union, WindowFrame, WindowFrameBound,
-    WindowFrameUnits,
+    Cast, Expr, ExprSchemable, Join, Limit, LogicalPlan, Operator, Projection, Union,
+    WindowFrame, WindowFrameBound, WindowFrameUnits, is_false, is_not_false, is_not_true,
+    is_not_unknown, is_true, is_unknown, lit, not,
 };
 
 /// Performs type coercion by determining the schema
@@ -253,7 +252,7 @@ impl<'a> TypeCoercionRewriter<'a> {
             if dt.is_integer() || dt.is_null() {
                 expr.cast_to(&DataType::Int64, schema)
             } else {
-                plan_err!("Expected {expr_name} to be an integer or null, but got {dt:?}")
+                plan_err!("Expected {expr_name} to be an integer or null, but got {dt}")
             }
         }
 
@@ -290,17 +289,150 @@ impl<'a> TypeCoercionRewriter<'a> {
         right: Expr,
         right_schema: &DFSchema,
     ) -> Result<(Expr, Expr)> {
-        let (left_type, right_type) = BinaryTypeCoercer::new(
-            &left.get_type(left_schema)?,
+        let left_data_type = left.get_type(left_schema)?;
+        let right_data_type = right.get_type(right_schema)?;
+        let (left_type, right_type) =
+            BinaryTypeCoercer::new(&left_data_type, &op, &right_data_type)
+                .get_input_types()?;
+        let left_cast_ok = can_cast_types(&left_data_type, &left_type);
+        let right_cast_ok = can_cast_types(&right_data_type, &right_type);
+
+        // handle special cases for
+        // * Date +/- int => Date
+        // * Date + time => Timestamp
+        let left_expr = if !left_cast_ok {
+            Self::coerce_date_time_math_op(
+                left,
+                &op,
+                &left_data_type,
+                &left_type,
+                &right_type,
+            )?
+        } else {
+            left.cast_to(&left_type, left_schema)?
+        };
+
+        let right_expr = if !right_cast_ok {
+            Self::coerce_date_time_math_op(
+                right,
+                &op,
+                &right_data_type,
+                &right_type,
+                &left_type,
+            )?
+        } else {
+            right.cast_to(&right_type, right_schema)?
+        };
+
+        Ok((left_expr, right_expr))
+    }
+
+    fn coerce_date_time_math_op(
+        expr: Expr,
+        op: &Operator,
+        left_current_type: &DataType,
+        left_target_type: &DataType,
+        right_target_type: &DataType,
+    ) -> Result<Expr, DataFusionError> {
+        use DataType::*;
+
+        fn cast(expr: Expr, target_type: DataType) -> Expr {
+            Expr::Cast(Cast::new(Box::new(expr), target_type))
+        }
+
+        fn time_to_nanos(
+            expr: Expr,
+            expr_type: &DataType,
+        ) -> Result<Expr, DataFusionError> {
+            let expr = match expr_type {
+                Time32(TimeUnit::Second) => {
+                    cast(cast(expr, Int32), Int64)
+                        * lit(ScalarValue::Int64(Some(1_000_000_000)))
+                }
+                Time32(TimeUnit::Millisecond) => {
+                    cast(cast(expr, Int32), Int64)
+                        * lit(ScalarValue::Int64(Some(1_000_000)))
+                }
+                Time64(TimeUnit::Microsecond) => {
+                    cast(expr, Int64) * lit(ScalarValue::Int64(Some(1_000)))
+                }
+                Time64(TimeUnit::Nanosecond) => cast(expr, Int64),
+                t => return internal_err!("Unexpected time data type {t}"),
+            };
+
+            Ok(expr)
+        }
+
+        let e = match (
             &op,
-            &right.get_type(right_schema)?,
-        )
-        .get_input_types()?;
+            &left_current_type,
+            &left_target_type,
+            &right_target_type,
+        ) {
+            // int +/- date => date
+            (
+                Operator::Plus | Operator::Minus,
+                Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64,
+                Interval(IntervalUnit::MonthDayNano),
+                Date32 | Date64,
+            ) => {
+                // cast to i64 first
+                let expr = match *left_current_type {
+                    Int64 => expr,
+                    _ => cast(expr, Int64),
+                };
+                // next, multiply by 86400 to get seconds
+                let expr = expr * lit(ScalarValue::from(SECONDS_IN_DAY));
+                // cast to duration
+                let expr = cast(expr, Duration(TimeUnit::Second));
+                // finally cast to interval
+                cast(expr, Interval(IntervalUnit::MonthDayNano))
+            }
+            // These might seem to be a bit convoluted, however for arrow to do date + time arithmetic
+            // date must be cast to Timestamp(Nanosecond) and time cast to Duration(Nanosecond)
+            // (they must be the same timeunit).
+            //
+            // For Time32/64 we first need to cast to an Int64, convert that to nanoseconds based
+            // on the time unit, then cast that to duration.
+            //
+            // Time + date -> timestamp or
+            (
+                Operator::Plus | Operator::Minus,
+                Time32(_) | Time64(_),
+                Duration(TimeUnit::Nanosecond),
+                Timestamp(TimeUnit::Nanosecond, None),
+            ) => {
+                // cast to int64, convert to nanoseconds
+                let expr = time_to_nanos(expr, left_current_type)?;
+                // cast to duration
+                cast(expr, Duration(TimeUnit::Nanosecond))
+            }
+            // Similar to above, for arrow to do time - time we need to convert to an interval.
+            // To do that we first need to cast to an Int64, convert that to nanoseconds based
+            // on the time unit, then cast that to duration, then finally cast to an interval.
+            //
+            // Time - time -> timestamp
+            (
+                Operator::Plus | Operator::Minus,
+                Time32(_) | Time64(_),
+                Interval(IntervalUnit::MonthDayNano),
+                Interval(IntervalUnit::MonthDayNano),
+            ) => {
+                // cast to int64, convert to nanoseconds
+                let expr = time_to_nanos(expr, left_current_type)?;
+                // cast to duration
+                let expr = cast(expr, Duration(TimeUnit::Nanosecond));
+                // finally cast to interval
+                cast(expr, Interval(IntervalUnit::MonthDayNano))
+            }
+            _ => {
+                return plan_err!(
+                    "Cannot automatically convert {left_current_type} to {left_target_type}"
+                );
+            }
+        };
 
-        Ok((
-            left.cast_to(&left_type, left_schema)?,
-            right.cast_to(&right_type, right_schema)?,
-        ))
+        Ok(e)
     }
 }
 
@@ -352,9 +484,10 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                 .data;
                 let expr_type = expr.get_type(self.schema)?;
                 let subquery_type = new_plan.schema().field(0).data_type();
-                let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(plan_datafusion_err!(
-                        "expr type {expr_type:?} can't cast to {subquery_type:?} in InSubquery"
-                    ),
+                let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(
+                    plan_datafusion_err!(
+                    "expr type {expr_type} can't cast to {subquery_type} in InSubquery"
+                ),
                 )?;
                 let new_subquery = Subquery {
                     subquery: Arc::new(new_plan),
@@ -367,6 +500,43 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                     negated,
                 ))))
             }
+            Expr::SetComparison(SetComparison {
+                expr,
+                subquery,
+                op,
+                quantifier,
+            }) => {
+                let new_plan = analyze_internal(
+                    self.schema,
+                    Arc::unwrap_or_clone(subquery.subquery),
+                )?
+                .data;
+                let expr_type = expr.get_type(self.schema)?;
+                let subquery_type = new_plan.schema().field(0).data_type();
+                if (expr_type.is_numeric() && subquery_type.is_string())
+                    || (subquery_type.is_numeric() && expr_type.is_string())
+                {
+                    return plan_err!(
+                        "expr type {expr_type} can't cast to {subquery_type} in SetComparison"
+                    );
+                }
+                let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(
+                    plan_datafusion_err!(
+                        "expr type {expr_type} can't cast to {subquery_type} in SetComparison"
+                    ),
+                )?;
+                let new_subquery = Subquery {
+                    subquery: Arc::new(new_plan),
+                    outer_ref_columns: subquery.outer_ref_columns,
+                    spans: subquery.spans,
+                };
+                Ok(Transformed::yes(Expr::SetComparison(SetComparison::new(
+                    Box::new(expr.cast_to(&common_type, self.schema)?),
+                    cast_subquery(new_subquery, &common_type)?,
+                    op,
+                    quantifier,
+                ))))
+            }
             Expr::Not(expr) => Ok(Transformed::yes(not(get_casted_expr_for_bool_op(
                 *expr,
                 self.schema,
@@ -440,23 +610,23 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                 let low_type = low.get_type(self.schema)?;
                 let low_coerced_type = comparison_coercion(&expr_type, &low_type)
                     .ok_or_else(|| {
-                        DataFusionError::Internal(format!(
+                        internal_datafusion_err!(
                             "Failed to coerce types {expr_type} and {low_type} in BETWEEN expression"
-                        ))
+                        )
                     })?;
                 let high_type = high.get_type(self.schema)?;
                 let high_coerced_type = comparison_coercion(&expr_type, &high_type)
                     .ok_or_else(|| {
-                        DataFusionError::Internal(format!(
+                        internal_datafusion_err!(
                             "Failed to coerce types {expr_type} and {high_type} in BETWEEN expression"
-                        ))
+                        )
                     })?;
                 let coercion_type =
                     comparison_coercion(&low_coerced_type, &high_coerced_type)
                         .ok_or_else(|| {
-                            DataFusionError::Internal(format!(
+                            internal_datafusion_err!(
                                 "Failed to coerce types {expr_type} and {high_type} in BETWEEN expression"
-                            ))
+                            )
                         })?;
                 Ok(Transformed::yes(Expr::Between(Between::new(
                     Box::new(expr.cast_to(&coercion_type, self.schema)?),
@@ -479,7 +649,8 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                     get_coerce_type_for_list(&expr_data_type, &list_data_types);
                 match result_type {
                     None => plan_err!(
-                        "Can not find compatible types to compare {expr_data_type:?} with {list_data_types:?}"
+                        "Can not find compatible types to compare {expr_data_type} with [{}]",
+                        list_data_types.iter().join(", ")
                     ),
                     Some(coerced_type) => {
                         // find the coerced type
@@ -490,9 +661,9 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                                 list_expr.cast_to(&coerced_type, self.schema)
                             })
                             .collect::<Result<Vec<_>>>()?;
-                        Ok(Transformed::yes(Expr::InList(InList ::new(
-                             Box::new(cast_expr),
-                             cast_list_expr,
+                        Ok(Transformed::yes(Expr::InList(InList::new(
+                            Box::new(cast_expr),
+                            cast_list_expr,
                             negated,
                         ))))
                     }
@@ -503,11 +674,8 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                 Ok(Transformed::yes(Expr::Case(case)))
             }
             Expr::ScalarFunction(ScalarFunction { func, args }) => {
-                let new_expr = coerce_arguments_for_signature_with_scalar_udf(
-                    args,
-                    self.schema,
-                    &func,
-                )?;
+                let new_expr =
+                    coerce_arguments_for_signature(args, self.schema, func.as_ref())?;
                 Ok(Transformed::yes(Expr::ScalarFunction(
                     ScalarFunction::new_udf(func, new_expr),
                 )))
@@ -523,11 +691,8 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                         null_treatment,
                     },
             }) => {
-                let new_expr = coerce_arguments_for_signature_with_aggregate_udf(
-                    args,
-                    self.schema,
-                    &func,
-                )?;
+                let new_expr =
+                    coerce_arguments_for_signature(args, self.schema, func.as_ref())?;
                 Ok(Transformed::yes(Expr::AggregateFunction(
                     expr::AggregateFunction::new_udf(
                         func,
@@ -539,46 +704,52 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> {
                     ),
                 )))
             }
-            Expr::WindowFunction(WindowFunction {
-                fun,
-                params:
-                    expr::WindowFunctionParams {
-                        args,
-                        partition_by,
-                        order_by,
-                        window_frame,
-                        null_treatment,
-                    },
-            }) => {
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction {
+                    fun,
+                    params:
+                        expr::WindowFunctionParams {
+                            args,
+                            partition_by,
+                            order_by,
+                            window_frame,
+                            filter,
+                            null_treatment,
+                            distinct,
+                        },
+                } = *window_fun;
                 let window_frame =
                     coerce_window_frame(window_frame, self.schema, &order_by)?;
 
                 let args = match &fun {
                     expr::WindowFunctionDefinition::AggregateUDF(udf) => {
-                        coerce_arguments_for_signature_with_aggregate_udf(
-                            args,
-                            self.schema,
-                            udf,
-                        )?
+                        coerce_arguments_for_signature(args, self.schema, udf.as_ref())?
+                    }
+                    expr::WindowFunctionDefinition::WindowUDF(udf) => {
+                        coerce_arguments_for_signature(args, self.schema, udf.as_ref())?
                     }
-                    _ => args,
                 };
 
-                Ok(Transformed::yes(
-                    Expr::WindowFunction(WindowFunction::new(fun, args))
-                        .partition_by(partition_by)
-                        .order_by(order_by)
-                        .window_frame(window_frame)
-                        .null_treatment(null_treatment)
-                        .build()?,
-                ))
+                let new_expr = Expr::from(WindowFunction {
+                    fun,
+                    params: expr::WindowFunctionParams {
+                        args,
+                        partition_by,
+                        order_by,
+                        window_frame,
+                        filter,
+                        null_treatment,
+                        distinct,
+                    },
+                });
+                Ok(Transformed::yes(new_expr))
             }
             // TODO: remove the next line after `Expr::Wildcard` is removed
             #[expect(deprecated)]
             Expr::Alias(_)
             | Expr::Column(_)
             | Expr::ScalarVariable(_, _)
-            | Expr::Literal(_)
+            | Expr::Literal(_, _)
             | Expr::SimilarTo(_)
             | Expr::IsNotNull(_)
             | Expr::IsNull(_)
@@ -678,7 +849,7 @@ fn coerce_scalar_range_aware(
         // If type coercion fails, check if the largest type in family works:
         if let Some(largest_type) = get_widest_type_in_family(target_type) {
             coerce_scalar(largest_type, value).map_or_else(
-                |_| exec_err!("Cannot cast {value:?} to {target_type:?}"),
+                |_| exec_err!("Cannot cast {value} to {target_type}"),
                 |_| ScalarValue::try_from(target_type),
             )
         } else {
@@ -717,12 +888,15 @@ fn coerce_frame_bound(
 
 fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> {
     if col_type.is_numeric()
-        || is_utf8_or_utf8view_or_large_utf8(col_type)
-        || matches!(col_type, DataType::List(_))
-        || matches!(col_type, DataType::LargeList(_))
-        || matches!(col_type, DataType::FixedSizeList(_, _))
-        || matches!(col_type, DataType::Null)
-        || matches!(col_type, DataType::Boolean)
+        || col_type.is_string()
+        || col_type.is_null()
+        || matches!(
+            col_type,
+            DataType::List(_)
+                | DataType::LargeList(_)
+                | DataType::FixedSizeList(_, _)
+                | DataType::Boolean
+        )
     {
         Ok(col_type.clone())
     } else if is_datetime(col_type) {
@@ -730,7 +904,7 @@ fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> {
     } else if let DataType::Dictionary(_, value_type) = col_type {
         extract_window_frame_target_type(value_type)
     } else {
-        return internal_err!("Cannot run range queries on datatype: {col_type:?}");
+        internal_err!("Cannot run range queries on datatype: {col_type}")
     }
 }
 
@@ -775,48 +949,17 @@ fn get_casted_expr_for_bool_op(expr: Expr, schema: &DFSchema) -> Result<Expr> {
 /// `signature`, if possible.
 ///
 /// See the module level documentation for more detail on coercion.
-fn coerce_arguments_for_signature_with_scalar_udf(
+fn coerce_arguments_for_signature<F: UDFCoercionExt>(
     expressions: Vec<Expr>,
     schema: &DFSchema,
-    func: &ScalarUDF,
+    func: &F,
 ) -> Result<Vec<Expr>> {
-    if expressions.is_empty() {
-        return Ok(expressions);
-    }
-
-    let current_types = expressions
-        .iter()
-        .map(|e| e.get_type(schema))
-        .collect::<Result<Vec<_>>>()?;
-
-    let new_types = data_types_with_scalar_udf(&current_types, func)?;
-
-    expressions
-        .into_iter()
-        .enumerate()
-        .map(|(i, expr)| expr.cast_to(&new_types[i], schema))
-        .collect()
-}
-
-/// Returns `expressions` coerced to types compatible with
-/// `signature`, if possible.
-///
-/// See the module level documentation for more detail on coercion.
-fn coerce_arguments_for_signature_with_aggregate_udf(
-    expressions: Vec<Expr>,
-    schema: &DFSchema,
-    func: &AggregateUDF,
-) -> Result<Vec<Expr>> {
-    if expressions.is_empty() {
-        return Ok(expressions);
-    }
-
     let current_fields = expressions
         .iter()
         .map(|e| e.to_field(schema).map(|(_, f)| f))
         .collect::<Result<Vec<_>>>()?;
 
-    let new_types = fields_with_aggregate_udf(&current_fields, func)?
+    let coerced_types = fields_with_udf(&current_fields, func)?
         .into_iter()
         .map(|f| f.data_type().clone())
         .collect::<Vec<_>>();
@@ -824,7 +967,7 @@ fn coerce_arguments_for_signature_with_aggregate_udf(
     expressions
         .into_iter()
         .enumerate()
-        .map(|(i, expr)| expr.cast_to(&new_types[i], schema))
+        .map(|(i, expr)| expr.cast_to(&coerced_types[i], schema))
         .collect()
 }
 
@@ -889,8 +1032,9 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result<Case> {
                 get_coerce_type_for_case_expression(&when_types, Some(case_type));
             coerced_type.ok_or_else(|| {
                 plan_datafusion_err!(
-                    "Failed to coerce case ({case_type:?}) and when ({when_types:?}) \
-                     to common types in CASE WHEN expression"
+                    "Failed to coerce case ({case_type}) and when ({}) \
+                     to common types in CASE WHEN expression",
+                    when_types.iter().join(", ")
                 )
             })
         })
@@ -898,10 +1042,19 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result<Case> {
     let then_else_coerce_type =
         get_coerce_type_for_case_expression(&then_types, else_type.as_ref()).ok_or_else(
             || {
-                plan_datafusion_err!(
-                    "Failed to coerce then ({then_types:?}) and else ({else_type:?}) \
-                     to common types in CASE WHEN expression"
-                )
+                if let Some(else_type) = else_type {
+                    plan_datafusion_err!(
+                        "Failed to coerce then ({}) and else ({else_type}) \
+                         to common types in CASE WHEN expression",
+                        then_types.iter().join(", ")
+                    )
+                } else {
+                    plan_datafusion_err!(
+                        "Failed to coerce then ({}) and else (None) \
+                         to common types in CASE WHEN expression",
+                        then_types.iter().join(", ")
+                    )
+                }
             },
         )?;
 
@@ -943,6 +1096,43 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result<Case> {
 ///
 /// This method presumes that the wildcard expansion is unneeded, or has already
 /// been applied.
+///
+/// ## Schema and Field Handling in Union Coercion
+///
+/// **Processing order**: The function starts with the base schema (first input) and then
+/// processes remaining inputs sequentially, with later inputs taking precedence in merging.
+///
+/// **Schema-level metadata merging**: Later schemas take precedence for duplicate keys.
+///
+/// **Field-level metadata merging**: Later fields take precedence for duplicate metadata keys.
+///
+/// **Type coercion precedence**: The coerced type is determined by iteratively applying
+/// `comparison_coercion()` between the accumulated type and each new input's type. The
+/// result depends on type coercion rules, not input order.
+///
+/// **Nullability merging**: Nullability is accumulated using logical OR (`||`).
+/// Once any input field is nullable, the result field becomes nullable permanently.
+/// Later inputs can make a field nullable but cannot make it non-nullable.
+///
+/// **Field precedence**: Field names come from the first (base) schema, but the field properties
+/// (nullability and field-level metadata) have later schemas taking precedence.
+///
+/// **Example**:
+/// ```sql
+/// SELECT a, b FROM table1  -- a: Int32, metadata {"source": "t1"}, nullable=false
+/// UNION
+/// SELECT a, b FROM table2  -- a: Int64, metadata {"source": "t2"}, nullable=true
+/// UNION
+/// SELECT a, b FROM table3  -- a: Int32, metadata {"encoding": "utf8"}, nullable=false
+/// -- Result:
+/// -- a: Int64 (from type coercion), nullable=true (from table2),
+/// -- metadata: {"source": "t2", "encoding": "utf8"} (later inputs take precedence)
+/// ```
+///
+/// **Precedence Summary**:
+/// - **Datatypes**: Determined by `comparison_coercion()` rules, not input order
+/// - **Nullability**: Later inputs can add nullability but cannot remove it (logical OR)
+/// - **Metadata**: Later inputs take precedence for same keys (HashMap::extend semantics)
 pub fn coerce_union_schema(inputs: &[Arc<LogicalPlan>]) -> Result<DFSchema> {
     coerce_union_schema_with_schema(&inputs[1..], inputs[0].schema())
 }
@@ -1063,10 +1253,10 @@ mod test {
     use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, TimeUnit};
     use insta::assert_snapshot;
 
+    use crate::analyzer::Analyzer;
     use crate::analyzer::type_coercion::{
-        coerce_case_expression, TypeCoercion, TypeCoercionRewriter,
+        TypeCoercion, TypeCoercionRewriter, coerce_case_expression,
     };
-    use crate::analyzer::Analyzer;
     use crate::assert_analyzed_plan_with_config_eq_snapshot;
     use datafusion_common::config::ConfigOptions;
     use datafusion_common::tree_node::{TransformedResult, TreeNode};
@@ -1075,10 +1265,10 @@ mod test {
     use datafusion_expr::logical_plan::{EmptyRelation, Projection, Sort};
     use datafusion_expr::test::function_stub::avg_udaf;
     use datafusion_expr::{
-        cast, col, create_udaf, is_true, lit, AccumulatorFactoryFunction, AggregateUDF,
-        BinaryExpr, Case, ColumnarValue, Expr, ExprSchemable, Filter, LogicalPlan,
-        Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
-        SimpleAggregateUDF, Subquery, Union, Volatility,
+        AccumulatorFactoryFunction, AggregateUDF, BinaryExpr, Case, ColumnarValue, Expr,
+        ExprSchemable, Filter, LogicalPlan, Operator, ScalarFunctionArgs, ScalarUDF,
+        ScalarUDFImpl, Signature, SimpleAggregateUDF, Subquery, Union, Volatility, cast,
+        col, create_udaf, is_true, lit,
     };
     use datafusion_functions_aggregate::average::AvgAccumulator;
     use datafusion_sql::TableReference;
@@ -1174,7 +1364,7 @@ mod test {
             plan,
             @r"
         Projection: a < CAST(UInt32(2) AS Float64)
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1217,8 +1407,8 @@ mod test {
         Projection: a
           Union
             Projection: CAST(datafusion.test.foo.a AS Int64) AS a
-              EmptyRelation
-            EmptyRelation
+              EmptyRelation: rows=0
+            EmptyRelation: rows=0
         "
         )
     }
@@ -1240,7 +1430,7 @@ mod test {
             plan.clone(),
             @r"
         Projection: a
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1249,8 +1439,8 @@ mod test {
             true,
             plan.clone(),
             @r"
-        Projection: CAST(a AS LargeUtf8)
-          EmptyRelation
+        Projection: CAST(a AS LargeUtf8) AS a
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1267,7 +1457,7 @@ mod test {
             bool_plan.clone(),
             @r#"
         Projection: a < CAST(Utf8("foo") AS Utf8View)
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )?;
 
@@ -1276,7 +1466,7 @@ mod test {
             plan.clone(),
             @r"
         Projection: a
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1285,8 +1475,8 @@ mod test {
             true,
             plan.clone(),
             @r"
-        Projection: CAST(a AS LargeUtf8)
-          EmptyRelation
+        Projection: CAST(a AS LargeUtf8) AS a
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1306,7 +1496,7 @@ mod test {
             @r"
         Sort: a ASC NULLS FIRST
           Projection: a
-            EmptyRelation
+            EmptyRelation: rows=0
         "
         )?;
 
@@ -1315,10 +1505,10 @@ mod test {
             true,
             sort_plan.clone(),
             @r"
-        Projection: CAST(a AS LargeUtf8)
+        Projection: CAST(a AS LargeUtf8) AS a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
 
@@ -1336,7 +1526,7 @@ mod test {
         Projection: a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
         // Plan B: coerce requested: Utf8View => LargeUtf8 only on outermost
@@ -1344,10 +1534,10 @@ mod test {
             true,
             plan.clone(),
             @r"
-        Projection: CAST(a AS LargeUtf8)
+        Projection: CAST(a AS LargeUtf8) AS a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
 
@@ -1371,7 +1561,7 @@ mod test {
             plan.clone(),
             @r"
         Projection: a
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1380,8 +1570,8 @@ mod test {
             true,
             plan.clone(),
             @r"
-        Projection: CAST(a AS LargeBinary)
-          EmptyRelation
+        Projection: CAST(a AS LargeBinary) AS a
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1399,7 +1589,7 @@ mod test {
             bool_plan.clone(),
             @r#"
         Projection: a < CAST(Binary("8,1,8,1") AS BinaryView)
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )?;
 
@@ -1409,7 +1599,7 @@ mod test {
             bool_plan.clone(),
             @r#"
         Projection: a < CAST(Binary("8,1,8,1") AS BinaryView)
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )?;
 
@@ -1429,7 +1619,7 @@ mod test {
             @r"
         Sort: a ASC NULLS FIRST
           Projection: a
-            EmptyRelation
+            EmptyRelation: rows=0
         "
         )?;
         // Plan C: coerce requested: BinaryView => LargeBinary
@@ -1437,10 +1627,10 @@ mod test {
             true,
             sort_plan.clone(),
             @r"
-        Projection: CAST(a AS LargeBinary)
+        Projection: CAST(a AS LargeBinary) AS a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
 
@@ -1459,7 +1649,7 @@ mod test {
         Projection: a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
 
@@ -1468,10 +1658,10 @@ mod test {
             true,
             plan.clone(),
             @r"
-        Projection: CAST(a AS LargeBinary)
+        Projection: CAST(a AS LargeBinary) AS a
           Sort: a ASC NULLS FIRST
             Projection: a
-              EmptyRelation
+              EmptyRelation: rows=0
         "
         )?;
 
@@ -1492,12 +1682,12 @@ mod test {
             plan,
             @r"
         Projection: a < CAST(UInt32(2) AS Float64) OR a < CAST(UInt32(2) AS Float64)
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct TestScalarUDF {
         signature: Signature,
     }
@@ -1538,7 +1728,7 @@ mod test {
             plan,
             @r"
         Projection: TestScalarUDF(CAST(Int32(123) AS Float32))
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1575,7 +1765,7 @@ mod test {
             plan,
             @r"
         Projection: TestScalarUDF(CAST(Int64(10) AS Float32))
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1596,7 +1786,7 @@ mod test {
             vec![lit(10i64)],
             false,
             None,
-            None,
+            vec![],
             None,
         ));
         let plan = LogicalPlan::Projection(Projection::try_new(vec![udaf], empty)?);
@@ -1605,7 +1795,7 @@ mod test {
             plan,
             @r"
         Projection: MY_AVG(CAST(Int64(10) AS Float64))
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1631,13 +1821,13 @@ mod test {
             vec![lit("10")],
             false,
             None,
-            None,
+            vec![],
             None,
         ));
 
         let err = Projection::try_new(vec![udaf], empty).err().unwrap();
         assert!(
-            err.strip_backtrace().starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'MY_AVG' function: coercion from [Utf8] to the signature Uniform(1, [Float64]) failed")
+            err.strip_backtrace().starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'MY_AVG' function: coercion from Utf8 to the signature Uniform(1, [Float64]) failed")
         );
         Ok(())
     }
@@ -1650,7 +1840,7 @@ mod test {
             vec![lit(12f64)],
             false,
             None,
-            None,
+            vec![],
             None,
         ));
         let plan = LogicalPlan::Projection(Projection::try_new(vec![agg_expr], empty)?);
@@ -1659,7 +1849,7 @@ mod test {
             plan,
             @r"
         Projection: avg(Float64(12))
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1669,7 +1859,7 @@ mod test {
             vec![cast(col("a"), DataType::Float64)],
             false,
             None,
-            None,
+            vec![],
             None,
         ));
         let plan = LogicalPlan::Projection(Projection::try_new(vec![agg_expr], empty)?);
@@ -1678,7 +1868,7 @@ mod test {
             plan,
             @r"
         Projection: avg(CAST(a AS Float64))
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1691,14 +1881,14 @@ mod test {
             vec![lit("1")],
             false,
             None,
-            None,
+            vec![],
             None,
         ));
         let err = Projection::try_new(vec![agg_expr], empty)
             .err()
             .unwrap()
             .strip_backtrace();
-        assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from [Utf8] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed"));
+        assert!(err.starts_with("Error during planning: Failed to coerce arguments to satisfy a call to 'avg' function: coercion from Utf8 to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) failed"));
         Ok(())
     }
 
@@ -1714,7 +1904,7 @@ mod test {
             plan,
             @r#"
         Projection: CAST(Utf8("1998-03-18") AS Date32) + IntervalDayTime("IntervalDayTime { days: 123, milliseconds: 456 }")
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -1726,10 +1916,10 @@ mod test {
         let empty = empty_with_type(DataType::Int64);
         let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?);
         assert_analyzed_plan_eq!(
-            plan, 
+            plan,
             @r"
         Projection: a IN ([CAST(Int32(1) AS Int64), CAST(Int8(4) AS Int64), Int64(8)])
-          EmptyRelation
+          EmptyRelation: rows=0
         ")?;
 
         // a in (1,4,8), a is decimal
@@ -1743,10 +1933,10 @@ mod test {
         }));
         let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?);
         assert_analyzed_plan_eq!(
-            plan, 
+            plan,
             @r"
         Projection: CAST(a AS Decimal128(24, 4)) IN ([CAST(Int32(1) AS Decimal128(24, 4)), CAST(Int8(4) AS Decimal128(24, 4)), CAST(Int64(8) AS Decimal128(24, 4))])
-          EmptyRelation
+          EmptyRelation: rows=0
         ")
     }
 
@@ -1765,7 +1955,7 @@ mod test {
             plan,
             @r#"
         Filter: CAST(a AS Date32) BETWEEN CAST(Utf8("2002-05-08") AS Date32) AND CAST(Utf8("2002-05-08") AS Date32) + IntervalYearMonth("1")
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -1786,7 +1976,7 @@ mod test {
             plan,
             @r#"
         Filter: CAST(a AS Date32) BETWEEN CAST(Utf8("2002-05-08") AS Date32) + IntervalYearMonth("1") AND CAST(Utf8("2002-12-08") AS Date32)
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -1801,7 +1991,7 @@ mod test {
             plan,
             @r"
         Filter: CAST(NULL AS Int64) BETWEEN CAST(NULL AS Int64) AND Int64(2)
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1818,7 +2008,7 @@ mod test {
             plan,
             @r"
         Projection: a IS TRUE
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1826,7 +2016,7 @@ mod test {
         let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?);
         assert_type_coercion_error(
             plan,
-            "Cannot infer common argument type for comparison operation Int64 IS DISTINCT FROM Boolean"
+            "Cannot infer common argument type for comparison operation Int64 IS DISTINCT FROM Boolean",
         )?;
 
         // is not true
@@ -1838,7 +2028,7 @@ mod test {
             plan,
             @r"
         Projection: a IS NOT TRUE
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1851,7 +2041,7 @@ mod test {
             plan,
             @r"
         Projection: a IS FALSE
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1864,7 +2054,7 @@ mod test {
             plan,
             @r"
         Projection: a IS NOT FALSE
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -1882,7 +2072,7 @@ mod test {
             plan,
             @r#"
         Projection: a LIKE Utf8("abc")
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )?;
 
@@ -1896,7 +2086,7 @@ mod test {
             plan,
             @r"
         Projection: a LIKE CAST(NULL AS Utf8)
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1921,7 +2111,7 @@ mod test {
             plan,
             @r#"
         Projection: a ILIKE Utf8("abc")
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )?;
 
@@ -1935,7 +2125,7 @@ mod test {
             plan,
             @r"
         Projection: a ILIKE CAST(NULL AS Utf8)
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1964,7 +2154,7 @@ mod test {
             plan,
             @r"
         Projection: a IS UNKNOWN
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )?;
 
@@ -1972,7 +2162,7 @@ mod test {
         let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?);
         assert_type_coercion_error(
             plan,
-            "Cannot infer common argument type for comparison operation Utf8 IS DISTINCT FROM Boolean"
+            "Cannot infer common argument type for comparison operation Utf8 IS DISTINCT FROM Boolean",
         )?;
 
         // is not unknown
@@ -1984,7 +2174,7 @@ mod test {
             plan,
             @r"
         Projection: a IS NOT UNKNOWN
-          EmptyRelation
+          EmptyRelation: rows=0
         "
         )
     }
@@ -2005,7 +2195,7 @@ mod test {
             plan,
             @r#"
         Projection: TestScalarUDF(a, Utf8("b"), CAST(Boolean(true) AS Utf8), CAST(Boolean(false) AS Utf8), CAST(Int32(13) AS Utf8))
-          EmptyRelation
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -2061,8 +2251,8 @@ mod test {
         assert_analyzed_plan_eq!(
             plan,
             @r#"
-        Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(Nanosecond, None))
-          EmptyRelation
+        Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(ns))
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -2187,7 +2377,7 @@ mod test {
         let err = coerce_case_expression(case, &schema).unwrap_err();
         assert_snapshot!(
             err.strip_backtrace(),
-            @"Error during planning: Failed to coerce case (Interval(MonthDayNano)) and when ([Float32, Binary, Utf8]) to common types in CASE WHEN expression"
+            @"Error during planning: Failed to coerce case (Interval(MonthDayNano)) and when (Float32, Binary, Utf8) to common types in CASE WHEN expression"
         );
 
         let case = Case {
@@ -2202,7 +2392,7 @@ mod test {
         let err = coerce_case_expression(case, &schema).unwrap_err();
         assert_snapshot!(
             err.strip_backtrace(),
-            @"Error during planning: Failed to coerce then ([Date32, Float32, Binary]) and else (Some(Timestamp(Nanosecond, None))) to common types in CASE WHEN expression"
+            @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(ns)) to common types in CASE WHEN expression"
         );
 
         Ok(())
@@ -2400,17 +2590,17 @@ mod test {
         let map_type_entries = DataType::Map(Arc::new(fields), false);
 
         let fields = Field::new("key_value", DataType::Struct(struct_fields), false);
-        let may_type_cutsom = DataType::Map(Arc::new(fields), false);
+        let may_type_custom = DataType::Map(Arc::new(fields), false);
 
-        let expr = col("a").eq(cast(col("a"), may_type_cutsom));
+        let expr = col("a").eq(cast(col("a"), may_type_custom));
         let empty = empty_with_type(map_type_entries);
         let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?);
 
         assert_analyzed_plan_eq!(
             plan,
             @r#"
-        Projection: a = CAST(CAST(a AS Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false)) AS Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false))
-          EmptyRelation
+        Projection: a = CAST(CAST(a AS Map("key_value": non-null Struct("key": non-null Utf8, "value": Float64), unsorted)) AS Map("entries": non-null Struct("key": non-null Utf8, "value": Float64), unsorted))
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -2432,8 +2622,8 @@ mod test {
         assert_analyzed_plan_eq!(
             plan,
             @r#"
-        Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(Nanosecond, None))
-          EmptyRelation
+        Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(ns))
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -2457,8 +2647,8 @@ mod test {
         assert_analyzed_plan_eq!(
             plan,
             @r#"
-        Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) - CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None))
-          EmptyRelation
+        Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) - CAST(Utf8("1998-03-18") AS Timestamp(ns))
+          EmptyRelation: rows=0
         "#
         )
     }
@@ -2486,8 +2676,8 @@ mod test {
         Filter: a IN (<subquery>)
           Subquery:
             Projection: CAST(a AS Int64)
-              EmptyRelation
-          EmptyRelation
+              EmptyRelation: rows=0
+          EmptyRelation: rows=0
         "
         )
     }
@@ -2514,8 +2704,8 @@ mod test {
             @r"
         Filter: CAST(a AS Int64) IN (<subquery>)
           Subquery:
-            EmptyRelation
-          EmptyRelation
+            EmptyRelation: rows=0
+          EmptyRelation: rows=0
         "
         )
     }
@@ -2543,8 +2733,8 @@ mod test {
         Filter: CAST(a AS Decimal128(13, 8)) IN (<subquery>)
           Subquery:
             Projection: CAST(a AS Decimal128(13, 8))
-              EmptyRelation
-          EmptyRelation
+              EmptyRelation: rows=0
+          EmptyRelation: rows=0
         "
         )
     }
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 6a49e5d22087f..2096c42770315 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -27,14 +27,16 @@ use crate::optimizer::ApplyOrder;
 use crate::utils::NamePreserver;
 use datafusion_common::alias::AliasGenerator;
 
-use datafusion_common::cse::{CSEController, FoundCommonNodes, CSE};
+use datafusion_common::cse::{CSE, CSEController, FoundCommonNodes};
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_common::{qualified_name, Column, DFSchema, DFSchemaRef, Result};
+use datafusion_common::{Column, DFSchema, DFSchemaRef, Result, qualified_name};
 use datafusion_expr::expr::{Alias, ScalarFunction};
 use datafusion_expr::logical_plan::{
     Aggregate, Filter, LogicalPlan, Projection, Sort, Window,
 };
-use datafusion_expr::{col, BinaryExpr, Case, Expr, Operator, SortExpr};
+use datafusion_expr::{
+    BinaryExpr, Case, Expr, ExpressionPlacement, Operator, SortExpr, col,
+};
 
 const CSE_PREFIX: &str = "__common_expr";
 
@@ -316,6 +318,19 @@ impl CommonSubexprEliminate {
                     } => {
                         let rewritten_aggr_expr = new_exprs_list.pop().unwrap();
                         let new_aggr_expr = original_exprs_list.pop().unwrap();
+                        let saved_names = if let Some(aggr_expr) = aggr_expr {
+                            let name_preserver = NamePreserver::new_for_projection();
+                            aggr_expr
+                                .iter()
+                                .map(|expr| Some(name_preserver.save(expr)))
+                                .collect::<Vec<_>>()
+                        } else {
+                            new_aggr_expr
+                                .clone()
+                                .into_iter()
+                                .map(|_| None)
+                                .collect::<Vec<_>>()
+                        };
 
                         let mut agg_exprs = common_exprs
                             .into_iter()
@@ -326,10 +341,19 @@ impl CommonSubexprEliminate {
                         for expr in &new_group_expr {
                             extract_expressions(expr, &mut proj_exprs)
                         }
-                        for (expr_rewritten, expr_orig) in
-                            rewritten_aggr_expr.into_iter().zip(new_aggr_expr)
+                        for ((expr_rewritten, expr_orig), saved_name) in
+                            rewritten_aggr_expr
+                                .into_iter()
+                                .zip(new_aggr_expr)
+                                .zip(saved_names)
                         {
                             if expr_rewritten == expr_orig {
+                                let expr_rewritten = if let Some(saved_name) = saved_name
+                                {
+                                    saved_name.restore(expr_rewritten)
+                                } else {
+                                    expr_rewritten
+                                };
                                 if let Expr::Alias(Alias { expr, name, .. }) =
                                     expr_rewritten
                                 {
@@ -630,10 +654,8 @@ impl CSEController for ExprCSEController<'_> {
             // In case of `ScalarFunction`s we don't know which children are surely
             // executed so start visiting all children conditionally and stop the
             // recursion with `TreeNodeRecursion::Jump`.
-            Expr::ScalarFunction(ScalarFunction { func, args })
-                if func.short_circuits() =>
-            {
-                Some((vec![], args.iter().collect()))
+            Expr::ScalarFunction(ScalarFunction { func, args }) => {
+                func.conditional_arguments(args)
             }
 
             // In case of `And` and `Or` the first child is surely executed, but we
@@ -678,10 +700,27 @@ impl CSEController for ExprCSEController<'_> {
     }
 
     fn is_ignored(&self, node: &Expr) -> bool {
+        // MoveTowardsLeafNodes expressions (e.g. get_field) are cheap struct
+        // field accesses that the ExtractLeafExpressions / PushDownLeafProjections
+        // rules deliberately duplicate when needed (one copy for a filter
+        // predicate, another for an output column). CSE deduplicating them
+        // creates intermediate projections that fight with those rules,
+        // causing optimizer instability — ExtractLeafExpressions will undo
+        // the dedup, creating an infinite loop that runs until the iteration
+        // limit is hit. Skip them.
+        if node.placement() == ExpressionPlacement::MoveTowardsLeafNodes {
+            return true;
+        }
+
         // TODO: remove the next line after `Expr::Wildcard` is removed
         #[expect(deprecated)]
         let is_normal_minus_aggregates = matches!(
             node,
+            // TODO: there's an argument for removing `Literal` from here,
+            // maybe using `Expr::placemement().should_push_to_leaves()` instead
+            // so that we extract common literals and don't broadcast them to num_batch_rows multiple times.
+            // However that currently breaks things like `percentile_cont()` which expect literal arguments
+            // (and would instead be getting `col(__common_expr_n)`).
             Expr::Literal(..)
                 | Expr::Column(..)
                 | Expr::ScalarVariable(..)
@@ -794,17 +833,18 @@ mod test {
     use std::iter;
 
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_expr::logical_plan::{table_scan, JoinType};
+    use datafusion_expr::logical_plan::{JoinType, table_scan};
     use datafusion_expr::{
-        grouping_set, is_null, not, AccumulatorFactoryFunction, AggregateUDF,
-        ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
-        SimpleAggregateUDF, Volatility,
+        AccumulatorFactoryFunction, AggregateUDF, ColumnarValue, ScalarFunctionArgs,
+        ScalarUDF, ScalarUDFImpl, Signature, SimpleAggregateUDF, Volatility,
+        grouping_set, is_null, not,
     };
     use datafusion_expr::{lit, logical_plan::builder::LogicalPlanBuilder};
 
     use super::*;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::optimizer::OptimizerContext;
+    use crate::test::udfs::leaf_udf_expr;
     use crate::test::*;
     use datafusion_expr::test::function_stub::{avg, sum};
 
@@ -909,7 +949,7 @@ mod test {
                 vec![inner],
                 false,
                 None,
-                None,
+                vec![],
                 None,
             ))
         };
@@ -1646,7 +1686,7 @@ mod test {
         Ok(())
     }
 
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     pub struct TestUdf {
         signature: Signature,
     }
@@ -1773,7 +1813,7 @@ mod test {
         ScalarUDF::new_from_impl(RandomStub::new())
     }
 
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct RandomStub {
         signature: Signature,
     }
@@ -1806,4 +1846,56 @@ mod test {
             panic!("dummy - not implemented")
         }
     }
+
+    /// Identical MoveTowardsLeafNodes expressions should NOT be deduplicated
+    /// by CSE — they are cheap (e.g. struct field access) and the extraction
+    /// rules deliberately duplicate them. Deduplicating causes optimizer
+    /// instability where one optimizer rule will undo the work of another,
+    /// resulting in an infinite optimization loop until the
+    /// we hit the max iteration limit and then give up.
+    #[test]
+    fn test_leaf_expression_not_extracted() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let leaf = leaf_udf_expr(col("a"));
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![leaf.clone().alias("c1"), leaf.alias("c2")])?
+            .build()?;
+
+        // Plan should be unchanged — no __common_expr introduced
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Projection: leaf_udf(test.a) AS c1, leaf_udf(test.a) AS c2
+          TableScan: test
+        "
+        )
+    }
+
+    /// When a MoveTowardsLeafNodes expression appears as a sub-expression of
+    /// a larger expression that IS duplicated, only the outer expression gets
+    /// deduplicated; the leaf sub-expression stays inline.
+    #[test]
+    fn test_leaf_subexpression_not_extracted() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        // leaf_udf(a) + b appears twice — the outer `+` is a common
+        // sub-expression, but leaf_udf(a) by itself is MoveTowardsLeafNodes
+        // and should not be extracted separately.
+        let common = leaf_udf_expr(col("a")) + col("b");
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![common.clone().alias("c1"), common.alias("c2")])?
+            .build()?;
+
+        // The whole `leaf_udf(a) + b` gets deduplicated as __common_expr_1,
+        // but leaf_udf(a) alone is NOT pulled out.
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Projection: __common_expr_1 AS c1, __common_expr_1 AS c2
+          Projection: leaf_udf(test.a) + test.b AS __common_expr_1, test.a, test.b, test.c
+            TableScan: test
+        "
+        )
+    }
 }
diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs
index 1378b53fa73f0..52d777f874fa8 100644
--- a/datafusion/optimizer/src/decorrelate.rs
+++ b/datafusion/optimizer/src/decorrelate.rs
@@ -26,17 +26,16 @@ use crate::simplify_expressions::ExprSimplifier;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
 };
-use datafusion_common::{plan_err, Column, DFSchemaRef, HashMap, Result, ScalarValue};
+use datafusion_common::{Column, DFSchemaRef, HashMap, Result, ScalarValue, plan_err};
 use datafusion_expr::expr::Alias;
 use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::utils::{
     collect_subquery_cols, conjunction, find_join_exprs, split_conjunction,
 };
 use datafusion_expr::{
-    expr, lit, BinaryExpr, Cast, EmptyRelation, Expr, FetchType, LogicalPlan,
-    LogicalPlanBuilder, Operator,
+    BinaryExpr, Cast, EmptyRelation, Expr, FetchType, LogicalPlan, LogicalPlanBuilder,
+    Operator, expr, lit,
 };
-use datafusion_physical_expr::execution_props::ExecutionProps;
 
 /// This struct rewrite the sub query plan by pull up the correlated
 /// expressions(contains outer reference columns) from the inner subquery's
@@ -494,9 +493,12 @@ fn agg_exprs_evaluation_result_on_empty_batch(
                 let new_expr = match expr {
                     Expr::AggregateFunction(expr::AggregateFunction { func, .. }) => {
                         if func.name() == "count" {
-                            Transformed::yes(Expr::Literal(ScalarValue::Int64(Some(0))))
+                            Transformed::yes(Expr::Literal(
+                                ScalarValue::Int64(Some(0)),
+                                None,
+                            ))
                         } else {
-                            Transformed::yes(Expr::Literal(ScalarValue::Null))
+                            Transformed::yes(Expr::Literal(ScalarValue::Null, None))
                         }
                     }
                     _ => Transformed::no(expr),
@@ -506,8 +508,7 @@ fn agg_exprs_evaluation_result_on_empty_batch(
             .data()?;
 
         let result_expr = result_expr.unalias();
-        let props = ExecutionProps::new();
-        let info = SimplifyContext::new(&props).with_schema(Arc::clone(schema));
+        let info = SimplifyContext::default().with_schema(Arc::clone(schema));
         let simplifier = ExprSimplifier::new(info);
         let result_expr = simplifier.simplify(result_expr)?;
         expr_result_map_for_count_bug.insert(e.schema_name().to_string(), result_expr);
@@ -540,8 +541,7 @@ fn proj_exprs_evaluation_result_on_empty_batch(
             .data()?;
 
         if result_expr.ne(expr) {
-            let props = ExecutionProps::new();
-            let info = SimplifyContext::new(&props).with_schema(Arc::clone(schema));
+            let info = SimplifyContext::default().with_schema(Arc::clone(schema));
             let simplifier = ExprSimplifier::new(info);
             let result_expr = simplifier.simplify(result_expr)?;
             let expr_name = match expr {
@@ -581,16 +581,15 @@ fn filter_exprs_evaluation_result_on_empty_batch(
         .data()?;
 
     let pull_up_expr = if result_expr.ne(filter_expr) {
-        let props = ExecutionProps::new();
-        let info = SimplifyContext::new(&props).with_schema(schema);
+        let info = SimplifyContext::default().with_schema(schema);
         let simplifier = ExprSimplifier::new(info);
         let result_expr = simplifier.simplify(result_expr)?;
         match &result_expr {
             // evaluate to false or null on empty batch, no need to pull up
-            Expr::Literal(ScalarValue::Null)
-            | Expr::Literal(ScalarValue::Boolean(Some(false))) => None,
+            Expr::Literal(ScalarValue::Null, _)
+            | Expr::Literal(ScalarValue::Boolean(Some(false)), _) => None,
             // evaluate to true on empty batch, need to pull up the expr
-            Expr::Literal(ScalarValue::Boolean(Some(true))) => {
+            Expr::Literal(ScalarValue::Boolean(Some(true)), _) => {
                 for (name, exprs) in input_expr_result_map_for_count_bug {
                     expr_result_map_for_count_bug.insert(name.clone(), exprs.clone());
                 }
@@ -605,7 +604,7 @@ fn filter_exprs_evaluation_result_on_empty_batch(
                             Box::new(result_expr.clone()),
                             Box::new(input_expr.clone()),
                         )],
-                        else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))),
+                        else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null, None))),
                     });
                     let expr_key = new_expr.schema_name().to_string();
                     expr_result_map_for_count_bug.insert(expr_key, new_expr);
diff --git a/datafusion/optimizer/src/decorrelate_lateral_join.rs b/datafusion/optimizer/src/decorrelate_lateral_join.rs
index 7d2072ad1ce99..a8c751ff46288 100644
--- a/datafusion/optimizer/src/decorrelate_lateral_join.rs
+++ b/datafusion/optimizer/src/decorrelate_lateral_join.rs
@@ -22,12 +22,12 @@ use std::collections::BTreeSet;
 use crate::decorrelate::PullUpCorrelatedExpr;
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_expr::{lit, Join};
+use datafusion_expr::{Join, lit};
 
+use datafusion_common::Result;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
-use datafusion_common::Result;
 use datafusion_expr::logical_plan::JoinType;
 use datafusion_expr::utils::conjunction;
 use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};
@@ -37,7 +37,7 @@ use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};
 pub struct DecorrelateLateralJoin {}
 
 impl DecorrelateLateralJoin {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self::default()
     }
diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
index a72657bf689d8..281d2d73481d8 100644
--- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
+++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs
@@ -27,14 +27,17 @@ use crate::{OptimizerConfig, OptimizerRule};
 
 use datafusion_common::alias::AliasGenerator;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{internal_err, plan_err, Column, Result};
+use datafusion_common::{
+    Column, DFSchemaRef, ExprSchema, NullEquality, Result, assert_or_internal_err,
+    plan_err,
+};
 use datafusion_expr::expr::{Exists, InSubquery};
 use datafusion_expr::expr_rewriter::create_col_from_scalar_expr;
 use datafusion_expr::logical_plan::{JoinType, Subquery};
-use datafusion_expr::utils::{conjunction, split_conjunction_owned};
+use datafusion_expr::utils::{conjunction, expr_to_columns, split_conjunction_owned};
 use datafusion_expr::{
-    exists, in_subquery, lit, not, not_exists, not_in_subquery, BinaryExpr, Expr, Filter,
-    LogicalPlan, LogicalPlanBuilder, Operator,
+    BinaryExpr, Expr, Filter, LogicalPlan, LogicalPlanBuilder, Operator, exists,
+    in_subquery, lit, not, not_exists, not_in_subquery,
 };
 
 use log::debug;
@@ -44,7 +47,7 @@ use log::debug;
 pub struct DecorrelatePredicateSubquery {}
 
 impl DecorrelatePredicateSubquery {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self::default()
     }
@@ -79,11 +82,10 @@ impl OptimizerRule for DecorrelatePredicateSubquery {
                 .into_iter()
                 .partition(has_subquery);
 
-        if with_subqueries.is_empty() {
-            return internal_err!(
-                "can not find expected subqueries in DecorrelatePredicateSubquery"
-            );
-        }
+        assert_or_internal_err!(
+            !with_subqueries.is_empty(),
+            "can not find expected subqueries in DecorrelatePredicateSubquery"
+        );
 
         // iterate through all exists clauses in predicate, turning each into a join
         let mut cur_input = Arc::unwrap_or_clone(filter.input);
@@ -136,7 +138,7 @@ fn rewrite_inner_subqueries(
         Expr::Exists(Exists {
             subquery: Subquery { subquery, .. },
             negated,
-        }) => match mark_join(&cur_input, Arc::clone(&subquery), None, negated, alias)? {
+        }) => match mark_join(&cur_input, &subquery, None, negated, alias)? {
             Some((plan, exists_expr)) => {
                 cur_input = plan;
                 Ok(Transformed::yes(exists_expr))
@@ -154,13 +156,7 @@ fn rewrite_inner_subqueries(
                 .map_or(plan_err!("single expression required."), |output_expr| {
                     Ok(Expr::eq(*expr.clone(), output_expr))
                 })?;
-            match mark_join(
-                &cur_input,
-                Arc::clone(&subquery),
-                Some(in_predicate),
-                negated,
-                alias,
-            )? {
+            match mark_join(&cur_input, &subquery, Some(&in_predicate), negated, alias)? {
                 Some((plan, exists_expr)) => {
                     cur_input = plan;
                     Ok(Transformed::yes(exists_expr))
@@ -275,7 +271,13 @@ fn build_join_top(
     };
     let subquery = query_info.query.subquery.as_ref();
     let subquery_alias = alias.next("__correlated_sq");
-    build_join(left, subquery, in_predicate_opt, join_type, subquery_alias)
+    build_join(
+        left,
+        subquery,
+        in_predicate_opt.as_ref(),
+        join_type,
+        subquery_alias,
+    )
 }
 
 /// This is used to handle the case when the subquery is embedded in a more complex boolean
@@ -295,8 +297,8 @@ fn build_join_top(
 ///           TableScan: t2
 fn mark_join(
     left: &LogicalPlan,
-    subquery: Arc<LogicalPlan>,
-    in_predicate_opt: Option<Expr>,
+    subquery: &LogicalPlan,
+    in_predicate_opt: Option<&Expr>,
     negated: bool,
     alias_generator: &Arc<AliasGenerator>,
 ) -> Result<Option<(LogicalPlan, Expr)>> {
@@ -306,20 +308,53 @@ fn mark_join(
     let exists_expr = if negated { !exists_col } else { exists_col };
 
     Ok(
-        build_join(left, &subquery, in_predicate_opt, JoinType::LeftMark, alias)?
+        build_join(left, subquery, in_predicate_opt, JoinType::LeftMark, alias)?
             .map(|plan| (plan, exists_expr)),
     )
 }
 
+/// Check if join keys in the join filter may contain NULL values
+///
+/// Returns true if any join key column is nullable on either side.
+/// This is used to optimize null-aware anti joins: if all join keys are non-nullable,
+/// we can use a regular anti join instead of the more expensive null-aware variant.
+fn join_keys_may_be_null(
+    join_filter: &Expr,
+    left_schema: &DFSchemaRef,
+    right_schema: &DFSchemaRef,
+) -> Result<bool> {
+    // Extract columns from the join filter
+    let mut columns = std::collections::HashSet::new();
+    expr_to_columns(join_filter, &mut columns)?;
+
+    // Check if any column is nullable
+    for col in columns {
+        // Check in left schema
+        if let Ok(field) = left_schema.field_from_column(&col)
+            && field.as_ref().is_nullable()
+        {
+            return Ok(true);
+        }
+        // Check in right schema
+        if let Ok(field) = right_schema.field_from_column(&col)
+            && field.as_ref().is_nullable()
+        {
+            return Ok(true);
+        }
+    }
+
+    Ok(false)
+}
+
 fn build_join(
     left: &LogicalPlan,
     subquery: &LogicalPlan,
-    in_predicate_opt: Option<Expr>,
+    in_predicate_opt: Option<&Expr>,
     join_type: JoinType,
     alias: String,
 ) -> Result<Option<LogicalPlan>> {
     let mut pull_up = PullUpCorrelatedExpr::new()
-        .with_in_predicate_opt(in_predicate_opt.clone())
+        .with_in_predicate_opt(in_predicate_opt.cloned())
         .with_exists_sub_query(in_predicate_opt.is_none());
 
     let new_plan = subquery.clone().rewrite(&mut pull_up).data()?;
@@ -342,7 +377,7 @@ fn build_join(
             replace_qualified_name(filter, &all_correlated_cols, &alias).map(Some)
         })?;
 
-    let join_filter = match (join_filter_opt, in_predicate_opt) {
+    let join_filter = match (join_filter_opt, in_predicate_opt.cloned()) {
         (
             Some(join_filter),
             Some(Expr::BinaryExpr(BinaryExpr {
@@ -365,16 +400,89 @@ fn build_join(
             })),
         ) => {
             let right_col = create_col_from_scalar_expr(right.deref(), alias)?;
-            let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col));
-            in_predicate
+
+            Expr::eq(left.deref().clone(), Expr::Column(right_col))
         }
         (None, None) => lit(true),
         _ => return Ok(None),
     };
+
+    if matches!(join_type, JoinType::LeftMark | JoinType::RightMark) {
+        let right_schema = sub_query_alias.schema();
+
+        // Gather all columns needed for the join filter + predicates
+        let mut needed = std::collections::HashSet::new();
+        expr_to_columns(&join_filter, &mut needed)?;
+        if let Some(in_pred) = in_predicate_opt {
+            expr_to_columns(in_pred, &mut needed)?;
+        }
+
+        // Keep only columns that actually belong to the RIGHT child, and sort by their
+        // position in the right schema for deterministic order.
+        let mut right_cols_idx_and_col: Vec<(usize, Column)> = needed
+            .into_iter()
+            .filter_map(|c| right_schema.index_of_column(&c).ok().map(|idx| (idx, c)))
+            .collect();
+
+        right_cols_idx_and_col.sort_by_key(|(idx, _)| *idx);
+
+        let right_proj_exprs: Vec<Expr> = right_cols_idx_and_col
+            .into_iter()
+            .map(|(_, c)| Expr::Column(c))
+            .collect();
+
+        let right_projected = if !right_proj_exprs.is_empty() {
+            LogicalPlanBuilder::from(sub_query_alias.clone())
+                .project(right_proj_exprs)?
+                .build()?
+        } else {
+            // Degenerate case: no right columns referenced by the predicate(s)
+            sub_query_alias.clone()
+        };
+
+        // Mark joins don't use null-aware semantics (they use three-valued logic with mark column)
+        let new_plan = LogicalPlanBuilder::from(left.clone())
+            .join_on(right_projected, join_type, Some(join_filter))?
+            .build()?;
+
+        debug!(
+            "predicate subquery optimized:\n{}",
+            new_plan.display_indent()
+        );
+
+        return Ok(Some(new_plan));
+    }
+
+    // Determine if this should be a null-aware anti join
+    // Null-aware semantics are only needed for NOT IN subqueries, not NOT EXISTS:
+    // - NOT IN: Uses three-valued logic, requires null-aware handling
+    // - NOT EXISTS: Uses two-valued logic, regular anti join is correct
+    // We can distinguish them: NOT IN has in_predicate_opt, NOT EXISTS does not
+    //
+    // Additionally, if the join keys are non-nullable on both sides, we don't need
+    // null-aware semantics because NULLs cannot exist in the data.
+    let null_aware = join_type == JoinType::LeftAnti
+        && in_predicate_opt.is_some()
+        && join_keys_may_be_null(&join_filter, left.schema(), sub_query_alias.schema())?;
+
     // join our sub query into the main plan
-    let new_plan = LogicalPlanBuilder::from(left.clone())
-        .join_on(sub_query_alias, join_type, Some(join_filter))?
-        .build()?;
+    let new_plan = if null_aware {
+        // Use join_detailed_with_options to set null_aware flag
+        LogicalPlanBuilder::from(left.clone())
+            .join_detailed_with_options(
+                sub_query_alias,
+                join_type,
+                (Vec::<Column>::new(), Vec::<Column>::new()), // No equijoin keys, filter-based join
+                Some(join_filter),
+                NullEquality::NullEqualsNothing,
+                true, // null_aware
+            )?
+            .build()?
+    } else {
+        LogicalPlanBuilder::from(left.clone())
+            .join_on(sub_query_alias, join_type, Some(join_filter))?
+            .build()?
+    };
     debug!(
         "predicate subquery optimized:\n{}",
         new_plan.display_indent()
@@ -569,7 +677,7 @@ mod tests {
 
         assert_optimized_plan_equal!(
                 plan,
-                @r###"
+                @r"
         Projection: customer.c_custkey [c_custkey:Int64]
           LeftSemi Join:  Filter: customer.c_custkey = __correlated_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8]
             LeftSemi Join:  Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]
@@ -580,7 +688,7 @@ mod tests {
             SubqueryAlias: __correlated_sq_2 [o_custkey:Int64]
               Projection: orders.o_custkey [o_custkey:Int64]
                 TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-        "###    
+        "    
         )
     }
 
@@ -1927,14 +2035,14 @@ mod tests {
 
         assert_optimized_plan_equal!(
             plan,
-            @r#"
+            @r"
         Projection: test.b [b:UInt32]
           LeftSemi Join:  Filter: Boolean(true) [a:UInt32, b:UInt32, c:UInt32]
             TableScan: test [a:UInt32, b:UInt32, c:UInt32]
             SubqueryAlias: __correlated_sq_1 [arr:Int32;N]
               Unnest: lists[sq.arr|depth=1] structs[] [arr:Int32;N]
-                TableScan: sq [arr:List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]
-        "#
+                TableScan: sq [arr:List(Int32);N]
+        "
         )
     }
 
@@ -1962,14 +2070,14 @@ mod tests {
 
         assert_optimized_plan_equal!(
             plan,
-            @r#"
+            @r"
         Projection: test.b [b:UInt32]
           LeftSemi Join:  Filter: __correlated_sq_1.a = test.b [a:UInt32, b:UInt32, c:UInt32]
             TableScan: test [a:UInt32, b:UInt32, c:UInt32]
             SubqueryAlias: __correlated_sq_1 [a:UInt32;N]
               Unnest: lists[sq.a|depth=1] structs[] [a:UInt32;N]
-                TableScan: sq [a:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]
-        "#
+                TableScan: sq [a:List(UInt32);N]
+        "
         )
     }
 
diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs
index d465faf0c5e83..3cb0516a6d296 100644
--- a/datafusion/optimizer/src/eliminate_cross_join.rs
+++ b/datafusion/optimizer/src/eliminate_cross_join.rs
@@ -21,19 +21,19 @@ use std::sync::Arc;
 
 use crate::join_key_set::JoinKeySet;
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_common::Result;
+use datafusion_common::{NullEquality, Result};
 use datafusion_expr::expr::{BinaryExpr, Expr};
 use datafusion_expr::logical_plan::{
     Filter, Join, JoinConstraint, JoinType, LogicalPlan, Projection,
 };
 use datafusion_expr::utils::{can_hash, find_valid_equijoin_key_pair};
-use datafusion_expr::{and, build_join_schema, ExprSchemable, Operator};
+use datafusion_expr::{ExprSchemable, Operator, and, build_join_schema};
 
 #[derive(Default, Debug)]
 pub struct EliminateCrossJoin;
 
 impl EliminateCrossJoin {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -89,6 +89,7 @@ impl OptimizerRule for EliminateCrossJoin {
         let mut possible_join_keys = JoinKeySet::new();
         let mut all_inputs: Vec<LogicalPlan> = vec![];
         let mut all_filters: Vec<Expr> = vec![];
+        let mut null_equality = NullEquality::NullEqualsNothing;
 
         let parent_predicate = if let LogicalPlan::Filter(filter) = plan {
             // if input isn't a join that can potentially be rewritten
@@ -113,6 +114,12 @@ impl OptimizerRule for EliminateCrossJoin {
             let Filter {
                 input, predicate, ..
             } = filter;
+
+            // Extract null_equality setting from the input join
+            if let LogicalPlan::Join(join) = input.as_ref() {
+                null_equality = join.null_equality;
+            }
+
             flatten_join_inputs(
                 Arc::unwrap_or_clone(input),
                 &mut possible_join_keys,
@@ -122,26 +129,30 @@ impl OptimizerRule for EliminateCrossJoin {
 
             extract_possible_join_keys(&predicate, &mut possible_join_keys);
             Some(predicate)
-        } else if matches!(
-            plan,
-            LogicalPlan::Join(Join {
-                join_type: JoinType::Inner,
-                ..
-            })
-        ) {
-            if !can_flatten_join_inputs(&plan) {
-                return Ok(Transformed::no(plan));
-            }
-            flatten_join_inputs(
-                plan,
-                &mut possible_join_keys,
-                &mut all_inputs,
-                &mut all_filters,
-            )?;
-            None
         } else {
-            // recursively try to rewrite children
-            return rewrite_children(self, plan, config);
+            match plan {
+                LogicalPlan::Join(Join {
+                    join_type: JoinType::Inner,
+                    null_equality: original_null_equality,
+                    ..
+                }) => {
+                    if !can_flatten_join_inputs(&plan) {
+                        return Ok(Transformed::no(plan));
+                    }
+                    flatten_join_inputs(
+                        plan,
+                        &mut possible_join_keys,
+                        &mut all_inputs,
+                        &mut all_filters,
+                    )?;
+                    null_equality = original_null_equality;
+                    None
+                }
+                _ => {
+                    // recursively try to rewrite children
+                    return rewrite_children(self, plan, config);
+                }
+            }
         };
 
         // Join keys are handled locally:
@@ -153,6 +164,7 @@ impl OptimizerRule for EliminateCrossJoin {
                 &mut all_inputs,
                 &possible_join_keys,
                 &mut all_join_keys,
+                null_equality,
             )?;
         }
 
@@ -264,10 +276,9 @@ fn can_flatten_join_inputs(plan: &LogicalPlan) -> bool {
             join_type: JoinType::Inner,
             ..
         }) = child
+            && !can_flatten_join_inputs(child)
         {
-            if !can_flatten_join_inputs(child) {
-                return false;
-            }
+            return false;
         }
     }
     true
@@ -290,6 +301,7 @@ fn find_inner_join(
     rights: &mut Vec<LogicalPlan>,
     possible_join_keys: &JoinKeySet,
     all_join_keys: &mut JoinKeySet,
+    null_equality: NullEquality,
 ) -> Result<LogicalPlan> {
     for (i, right_input) in rights.iter().enumerate() {
         let mut join_keys = vec![];
@@ -303,10 +315,10 @@ fn find_inner_join(
             )?;
 
             // Save join keys
-            if let Some((valid_l, valid_r)) = key_pair {
-                if can_hash(&valid_l.get_type(left_input.schema())?) {
-                    join_keys.push((valid_l, valid_r));
-                }
+            if let Some((valid_l, valid_r)) = key_pair
+                && can_hash(&valid_l.get_type(left_input.schema())?)
+            {
+                join_keys.push((valid_l, valid_r));
             }
         }
 
@@ -328,7 +340,8 @@ fn find_inner_join(
                 on: join_keys,
                 filter: None,
                 schema: join_schema,
-                null_equals_null: false,
+                null_equality,
+                null_aware: false,
             }));
         }
     }
@@ -350,7 +363,8 @@ fn find_inner_join(
         filter: None,
         join_type: JoinType::Inner,
         join_constraint: JoinConstraint::On,
-        null_equals_null: false,
+        null_equality,
+        null_aware: false,
     }))
 }
 
@@ -436,9 +450,9 @@ mod tests {
     use crate::test::*;
 
     use datafusion_expr::{
+        Operator::{And, Or},
         binary_expr, col, lit,
         logical_plan::builder::LogicalPlanBuilder,
-        Operator::{And, Or},
     };
     use insta::assert_snapshot;
 
@@ -510,7 +524,7 @@ mod tests {
             plan,
             @ r"
         Filter: t1.a = t2.a OR t2.b = t1.a [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-          Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+          Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
             TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
         "
@@ -596,7 +610,7 @@ mod tests {
             plan,
             @ r"
         Filter: t1.a = t2.a AND t2.c < UInt32(15) OR t1.b = t2.b AND t2.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-          Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+          Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
             TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
         "
@@ -622,7 +636,7 @@ mod tests {
             plan,
             @ r"
         Filter: t1.a = t2.a AND t2.c < UInt32(15) OR t1.a = t2.a OR t2.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-          Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+          Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
             TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
         "
@@ -844,7 +858,7 @@ mod tests {
             plan,
             @ r"
         Filter: t3.a = t1.a AND t4.c < UInt32(15) OR t3.a = t1.a OR t4.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-          Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+          Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             Filter: t2.c < UInt32(15) OR t2.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
               Inner Join: t1.a = t2.a [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
@@ -924,7 +938,7 @@ mod tests {
                 TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
             Filter: t3.a = t4.a AND t4.c < UInt32(15) OR t3.a = t4.a AND t3.c = UInt32(688) OR t3.a = t4.a OR t3.b = t4.b [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-              Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+              Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t3 [a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t4 [a:UInt32, b:UInt32, c:UInt32]
         "
@@ -998,7 +1012,7 @@ mod tests {
         Filter: t4.c < UInt32(15) OR t4.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
           Inner Join: t1.a = t3.a [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             Filter: t1.a = t2.a OR t2.c < UInt32(15) OR t1.a = t2.a AND t2.c = UInt32(688) [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-              Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+              Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
                 TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
             Filter: t4.c < UInt32(15) OR t3.c = UInt32(688) OR t3.b = t4.b [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
@@ -1234,7 +1248,7 @@ mod tests {
             plan,
             @ r"
         Filter: t1.a + UInt32(100) = t2.a * UInt32(2) OR t2.b = t1.a [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
-          Cross Join:  [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
+          Cross Join: [a:UInt32, b:UInt32, c:UInt32, a:UInt32, b:UInt32, c:UInt32]
             TableScan: t1 [a:UInt32, b:UInt32, c:UInt32]
             TableScan: t2 [a:UInt32, b:UInt32, c:UInt32]
         "
@@ -1333,4 +1347,70 @@ mod tests {
         "
         )
     }
+
+    #[test]
+    fn preserve_null_equality_setting() -> Result<()> {
+        let t1 = test_table_scan_with_name("t1")?;
+        let t2 = test_table_scan_with_name("t2")?;
+
+        // Create an inner join with NullEquality::NullEqualsNull
+        let join_schema = Arc::new(build_join_schema(
+            t1.schema(),
+            t2.schema(),
+            &JoinType::Inner,
+        )?);
+
+        let inner_join = LogicalPlan::Join(Join {
+            left: Arc::new(t1),
+            right: Arc::new(t2),
+            join_type: JoinType::Inner,
+            join_constraint: JoinConstraint::On,
+            on: vec![],
+            filter: None,
+            schema: join_schema,
+            null_equality: NullEquality::NullEqualsNull, // Test preservation
+            null_aware: false,
+        });
+
+        // Apply filter that can create join conditions
+        let plan = LogicalPlanBuilder::from(inner_join)
+            .filter(binary_expr(
+                col("t1.a").eq(col("t2.a")),
+                And,
+                col("t2.c").lt(lit(20u32)),
+            ))?
+            .build()?;
+
+        let rule = EliminateCrossJoin::new();
+        let optimized_plan = rule.rewrite(plan, &OptimizerContext::new())?.data;
+
+        // Verify that null_equality is preserved in the optimized plan
+        fn check_null_equality_preserved(plan: &LogicalPlan) -> bool {
+            match plan {
+                LogicalPlan::Join(join) => {
+                    // All joins in the optimized plan should preserve null equality
+                    if join.null_equality == NullEquality::NullEqualsNothing {
+                        return false;
+                    }
+                    // Recursively check child plans
+                    plan.inputs()
+                        .iter()
+                        .all(|input| check_null_equality_preserved(input))
+                }
+                _ => {
+                    // Recursively check child plans for non-join nodes
+                    plan.inputs()
+                        .iter()
+                        .all(|input| check_null_equality_preserved(input))
+                }
+            }
+        }
+
+        assert!(
+            check_null_equality_preserved(&optimized_plan),
+            "null_equality setting should be preserved after optimization"
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/optimizer/src/eliminate_duplicated_expr.rs b/datafusion/optimizer/src/eliminate_duplicated_expr.rs
index a6651df938a70..113c92c2c8e99 100644
--- a/datafusion/optimizer/src/eliminate_duplicated_expr.rs
+++ b/datafusion/optimizer/src/eliminate_duplicated_expr.rs
@@ -19,8 +19,8 @@
 
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::tree_node::Transformed;
 use datafusion_common::Result;
+use datafusion_common::tree_node::Transformed;
 use datafusion_expr::logical_plan::LogicalPlan;
 use datafusion_expr::{Aggregate, Expr, Sort, SortExpr};
 use std::hash::{Hash, Hasher};
@@ -32,7 +32,7 @@ use indexmap::IndexSet;
 pub struct EliminateDuplicatedExpr;
 
 impl EliminateDuplicatedExpr {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -118,9 +118,9 @@ impl OptimizerRule for EliminateDuplicatedExpr {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::test::*;
-    use crate::OptimizerContext;
     use datafusion_expr::{col, logical_plan::builder::LogicalPlanBuilder};
     use std::sync::Arc;
 
diff --git a/datafusion/optimizer/src/eliminate_filter.rs b/datafusion/optimizer/src/eliminate_filter.rs
index 452df6e8331f8..8be5fb0857a9e 100644
--- a/datafusion/optimizer/src/eliminate_filter.rs
+++ b/datafusion/optimizer/src/eliminate_filter.rs
@@ -34,7 +34,7 @@ use crate::{OptimizerConfig, OptimizerRule};
 pub struct EliminateFilter;
 
 impl EliminateFilter {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -60,7 +60,7 @@ impl OptimizerRule for EliminateFilter {
     ) -> Result<Transformed<LogicalPlan>> {
         match plan {
             LogicalPlan::Filter(Filter {
-                predicate: Expr::Literal(ScalarValue::Boolean(v)),
+                predicate: Expr::Literal(ScalarValue::Boolean(v), _),
                 input,
                 ..
             }) => match v {
@@ -81,10 +81,10 @@ impl OptimizerRule for EliminateFilter {
 mod tests {
     use std::sync::Arc;
 
-    use crate::assert_optimized_plan_eq_snapshot;
     use crate::OptimizerContext;
+    use crate::assert_optimized_plan_eq_snapshot;
     use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{col, lit, logical_plan::builder::LogicalPlanBuilder, Expr};
+    use datafusion_expr::{Expr, col, lit, logical_plan::builder::LogicalPlanBuilder};
 
     use crate::eliminate_filter::EliminateFilter;
     use crate::test::*;
@@ -117,12 +117,12 @@ mod tests {
             .build()?;
 
         // No aggregate / scan / limit
-        assert_optimized_plan_equal!(plan, @"EmptyRelation")
+        assert_optimized_plan_equal!(plan, @"EmptyRelation: rows=0")
     }
 
     #[test]
     fn filter_null() -> Result<()> {
-        let filter_expr = Expr::Literal(ScalarValue::Boolean(None));
+        let filter_expr = Expr::Literal(ScalarValue::Boolean(None), None);
 
         let table_scan = test_table_scan().unwrap();
         let plan = LogicalPlanBuilder::from(table_scan)
@@ -131,7 +131,7 @@ mod tests {
             .build()?;
 
         // No aggregate / scan / limit
-        assert_optimized_plan_equal!(plan, @"EmptyRelation")
+        assert_optimized_plan_equal!(plan, @"EmptyRelation: rows=0")
     }
 
     #[test]
@@ -151,7 +151,7 @@ mod tests {
         // Left side is removed
         assert_optimized_plan_equal!(plan, @r"
         Union
-          EmptyRelation
+          EmptyRelation: rows=0
           Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b)]]
             TableScan: test
         ")
@@ -217,7 +217,7 @@ mod tests {
         // Filter is removed
         assert_optimized_plan_equal!(plan, @r"
         Projection: test.a
-          EmptyRelation
+          EmptyRelation: rows=0
         ")
     }
 }
diff --git a/datafusion/optimizer/src/eliminate_group_by_constant.rs b/datafusion/optimizer/src/eliminate_group_by_constant.rs
index 604f083b37090..6f5ca59e31113 100644
--- a/datafusion/optimizer/src/eliminate_group_by_constant.rs
+++ b/datafusion/optimizer/src/eliminate_group_by_constant.rs
@@ -15,12 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`EliminateGroupByConstant`] removes constant expressions from `GROUP BY` clause
+//! [`EliminateGroupByConstant`] removes constant and functionally redundant
+//! expressions from `GROUP BY` clause
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
-use datafusion_common::tree_node::Transformed;
+use std::collections::HashSet;
+
 use datafusion_common::Result;
+use datafusion_common::tree_node::Transformed;
 use datafusion_expr::{Aggregate, Expr, LogicalPlan, LogicalPlanBuilder, Volatility};
 
 /// Optimizer rule that removes constant expressions from `GROUP BY` clause
@@ -47,25 +50,30 @@ impl OptimizerRule for EliminateGroupByConstant {
     ) -> Result<Transformed<LogicalPlan>> {
         match plan {
             LogicalPlan::Aggregate(aggregate) => {
-                let (const_group_expr, nonconst_group_expr): (Vec<_>, Vec<_>) = aggregate
+                // Collect bare column references in GROUP BY
+                let group_by_columns: HashSet<&datafusion_common::Column> = aggregate
                     .group_expr
                     .iter()
-                    .partition(|expr| is_constant_expression(expr));
-
-                // If no constant expressions found (nothing to optimize) or
-                // constant expression is the only expression in aggregate,
-                // optimization is skipped
-                if const_group_expr.is_empty()
-                    || (!const_group_expr.is_empty()
-                        && nonconst_group_expr.is_empty()
-                        && aggregate.aggr_expr.is_empty())
+                    .filter_map(|expr| match expr {
+                        Expr::Column(c) => Some(c),
+                        _ => None,
+                    })
+                    .collect();
+
+                let (redundant, required): (Vec<_>, Vec<_>) = aggregate
+                    .group_expr
+                    .iter()
+                    .partition(|expr| is_redundant_group_expr(expr, &group_by_columns));
+
+                if redundant.is_empty()
+                    || (required.is_empty() && aggregate.aggr_expr.is_empty())
                 {
                     return Ok(Transformed::no(LogicalPlan::Aggregate(aggregate)));
                 }
 
                 let simplified_aggregate = LogicalPlan::Aggregate(Aggregate::try_new(
                     aggregate.input,
-                    nonconst_group_expr.into_iter().cloned().collect(),
+                    required.into_iter().cloned().collect(),
                     aggregate.aggr_expr.clone(),
                 )?);
 
@@ -91,23 +99,47 @@ impl OptimizerRule for EliminateGroupByConstant {
     }
 }
 
-/// Checks if expression is constant, and can be eliminated from group by.
-///
-/// Intended to be used only within this rule, helper function, which heavily
-/// relies on `SimplifyExpressions` result.
-fn is_constant_expression(expr: &Expr) -> bool {
+/// Checks if a GROUP BY expression is redundant (can be removed without
+/// changing grouping semantics). An expression is redundant if it is a
+/// deterministic function of constants and columns already present as bare
+/// column references in the GROUP BY.
+fn is_redundant_group_expr(
+    expr: &Expr,
+    group_by_columns: &HashSet<&datafusion_common::Column>,
+) -> bool {
+    // Bare column references are never redundant - they define the grouping
+    if matches!(expr, Expr::Column(_)) {
+        return false;
+    }
+    is_deterministic_of(expr, group_by_columns)
+}
+
+/// Returns true if `expr` is a deterministic expression whose only column
+/// references are contained in `known_columns`.
+fn is_deterministic_of(
+    expr: &Expr,
+    known_columns: &HashSet<&datafusion_common::Column>,
+) -> bool {
     match expr {
-        Expr::Alias(e) => is_constant_expression(&e.expr),
+        Expr::Alias(e) => is_deterministic_of(&e.expr, known_columns),
+        Expr::Column(c) => known_columns.contains(c),
+        Expr::Literal(_, _) => true,
         Expr::BinaryExpr(e) => {
-            is_constant_expression(&e.left) && is_constant_expression(&e.right)
+            is_deterministic_of(&e.left, known_columns)
+                && is_deterministic_of(&e.right, known_columns)
         }
-        Expr::Literal(_) => true,
         Expr::ScalarFunction(e) => {
             matches!(
                 e.func.signature().volatility,
                 Volatility::Immutable | Volatility::Stable
-            ) && e.args.iter().all(is_constant_expression)
+            ) && e
+                .args
+                .iter()
+                .all(|arg| is_deterministic_of(arg, known_columns))
         }
+        Expr::Cast(e) => is_deterministic_of(&e.expr, known_columns),
+        Expr::TryCast(e) => is_deterministic_of(&e.expr, known_columns),
+        Expr::Negative(e) => is_deterministic_of(e, known_columns),
         _ => false,
     }
 }
@@ -115,16 +147,16 @@ fn is_constant_expression(expr: &Expr) -> bool {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::test::*;
-    use crate::OptimizerContext;
 
     use arrow::datatypes::DataType;
     use datafusion_common::Result;
     use datafusion_expr::expr::ScalarFunction;
     use datafusion_expr::{
-        col, lit, ColumnarValue, LogicalPlanBuilder, ScalarFunctionArgs, ScalarUDF,
-        ScalarUDFImpl, Signature, TypeSignature,
+        ColumnarValue, LogicalPlanBuilder, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
+        Signature, TypeSignature, col, lit,
     };
 
     use datafusion_functions_aggregate::expr_fn::count;
@@ -147,7 +179,7 @@ mod tests {
         }};
     }
 
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct ScalarUDFMock {
         signature: Signature,
     }
@@ -268,6 +300,43 @@ mod tests {
         ")
     }
 
+    #[test]
+    fn test_eliminate_deterministic_expr_of_group_by_column() -> Result<()> {
+        let scan = test_table_scan()?;
+        // GROUP BY a, a - 1, a - 2, a - 3  ->  GROUP BY a
+        let plan = LogicalPlanBuilder::from(scan)
+            .aggregate(
+                vec![
+                    col("a"),
+                    col("a") - lit(1u32),
+                    col("a") - lit(2u32),
+                    col("a") - lit(3u32),
+                ],
+                vec![count(col("c"))],
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Projection: test.a, test.a - UInt32(1), test.a - UInt32(2), test.a - UInt32(3), count(test.c)
+          Aggregate: groupBy=[[test.a]], aggr=[[count(test.c)]]
+            TableScan: test
+        ")
+    }
+
+    #[test]
+    fn test_no_eliminate_independent_columns() -> Result<()> {
+        // GROUP BY a, b - 1 should NOT eliminate b - 1 (b is not a group by column)
+        let scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(scan)
+            .aggregate(vec![col("a"), col("b") - lit(1u32)], vec![count(col("c"))])?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Aggregate: groupBy=[[test.a, test.b - UInt32(1)]], aggr=[[count(test.c)]]
+          TableScan: test
+        ")
+    }
+
     #[test]
     fn test_no_op_volatile_scalar_fn_with_constant_arg() -> Result<()> {
         let udf = ScalarUDF::new_from_impl(ScalarUDFMock::new_with_volatility(
diff --git a/datafusion/optimizer/src/eliminate_join.rs b/datafusion/optimizer/src/eliminate_join.rs
index 2aad889b2fcbe..885910c1e4182 100644
--- a/datafusion/optimizer/src/eliminate_join.rs
+++ b/datafusion/optimizer/src/eliminate_join.rs
@@ -22,8 +22,8 @@ use datafusion_common::tree_node::Transformed;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::JoinType::Inner;
 use datafusion_expr::{
-    logical_plan::{EmptyRelation, LogicalPlan},
     Expr,
+    logical_plan::{EmptyRelation, LogicalPlan},
 };
 
 /// Eliminates joins when join condition is false.
@@ -54,7 +54,7 @@ impl OptimizerRule for EliminateJoin {
         match plan {
             LogicalPlan::Join(join) if join.join_type == Inner && join.on.is_empty() => {
                 match join.filter {
-                    Some(Expr::Literal(ScalarValue::Boolean(Some(false)))) => Ok(
+                    Some(Expr::Literal(ScalarValue::Boolean(Some(false)), _)) => Ok(
                         Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation {
                             produce_one_row: false,
                             schema: join.schema,
@@ -74,9 +74,9 @@ impl OptimizerRule for EliminateJoin {
 
 #[cfg(test)]
 mod tests {
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::eliminate_join::EliminateJoin;
-    use crate::OptimizerContext;
     use datafusion_common::Result;
     use datafusion_expr::JoinType::Inner;
     use datafusion_expr::{lit, logical_plan::builder::LogicalPlanBuilder};
@@ -108,6 +108,6 @@ mod tests {
             )?
             .build()?;
 
-        assert_optimized_plan_equal!(plan, @"EmptyRelation")
+        assert_optimized_plan_equal!(plan, @"EmptyRelation: rows=0")
     }
 }
diff --git a/datafusion/optimizer/src/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs
index 2007e0c820454..e9ba535c96b97 100644
--- a/datafusion/optimizer/src/eliminate_limit.rs
+++ b/datafusion/optimizer/src/eliminate_limit.rs
@@ -18,8 +18,8 @@
 //! [`EliminateLimit`] eliminates `LIMIT` when possible
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::tree_node::Transformed;
 use datafusion_common::Result;
+use datafusion_common::tree_node::Transformed;
 use datafusion_expr::logical_plan::{EmptyRelation, FetchType, LogicalPlan, SkipType};
 use std::sync::Arc;
 
@@ -34,7 +34,7 @@ use std::sync::Arc;
 pub struct EliminateLimit;
 
 impl EliminateLimit {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -77,7 +77,7 @@ impl OptimizerRule for EliminateLimit {
                 } else if matches!(limit.get_skip_type()?, SkipType::Literal(0)) {
                     // If fetch is `None` and skip is 0, then Limit takes no effect and
                     // we can remove it. Its input also can be Limit, so we should apply again.
-                    #[allow(clippy::used_underscore_binding)]
+                    #[expect(clippy::used_underscore_binding)]
                     return self.rewrite(Arc::unwrap_or_clone(limit.input), _config);
                 }
                 Ok(Transformed::no(LogicalPlan::Limit(limit)))
@@ -90,12 +90,12 @@ impl OptimizerRule for EliminateLimit {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::test::*;
     use crate::OptimizerContext;
+    use crate::test::*;
     use datafusion_common::Column;
     use datafusion_expr::{
         col,
-        logical_plan::{builder::LogicalPlanBuilder, JoinType},
+        logical_plan::{JoinType, builder::LogicalPlanBuilder},
     };
     use std::sync::Arc;
 
@@ -148,7 +148,7 @@ mod tests {
         // No aggregate / scan / limit
         assert_optimized_plan_equal!(
             plan,
-            @ r"EmptyRelation"
+            @ "EmptyRelation: rows=0"
         )
     }
 
@@ -169,7 +169,7 @@ mod tests {
             plan,
             @ r"
         Union
-          EmptyRelation
+          EmptyRelation: rows=0
           Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b)]]
             TableScan: test
         "
@@ -188,7 +188,7 @@ mod tests {
         // No aggregate / scan / limit
         assert_optimized_plan_eq_with_pushdown!(
             plan,
-            @ "EmptyRelation"
+            @ "EmptyRelation: rows=0"
         )
     }
 
diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs
deleted file mode 100644
index 3e027811420c4..0000000000000
--- a/datafusion/optimizer/src/eliminate_one_union.rs
+++ /dev/null
@@ -1,121 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! [`EliminateOneUnion`]  eliminates single element `Union`
-
-use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::{tree_node::Transformed, Result};
-use datafusion_expr::logical_plan::{LogicalPlan, Union};
-use std::sync::Arc;
-
-use crate::optimizer::ApplyOrder;
-
-#[derive(Default, Debug)]
-/// An optimization rule that eliminates union with one element.
-pub struct EliminateOneUnion;
-
-impl EliminateOneUnion {
-    #[allow(missing_docs)]
-    pub fn new() -> Self {
-        Self {}
-    }
-}
-
-impl OptimizerRule for EliminateOneUnion {
-    fn name(&self) -> &str {
-        "eliminate_one_union"
-    }
-
-    fn supports_rewrite(&self) -> bool {
-        true
-    }
-
-    fn rewrite(
-        &self,
-        plan: LogicalPlan,
-        _config: &dyn OptimizerConfig,
-    ) -> Result<Transformed<LogicalPlan>> {
-        match plan {
-            LogicalPlan::Union(Union { mut inputs, .. }) if inputs.len() == 1 => Ok(
-                Transformed::yes(Arc::unwrap_or_clone(inputs.pop().unwrap())),
-            ),
-            _ => Ok(Transformed::no(plan)),
-        }
-    }
-
-    fn apply_order(&self) -> Option<ApplyOrder> {
-        Some(ApplyOrder::TopDown)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::test::*;
-    use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::ToDFSchema;
-    use datafusion_expr::{
-        expr_rewriter::coerce_plan_expr_for_schema, logical_plan::table_scan,
-    };
-    use std::sync::Arc;
-
-    fn schema() -> Schema {
-        Schema::new(vec![
-            Field::new("id", DataType::Int32, false),
-            Field::new("key", DataType::Utf8, false),
-            Field::new("value", DataType::Int32, false),
-        ])
-    }
-
-    fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> {
-        assert_optimized_plan_with_rules(
-            vec![Arc::new(EliminateOneUnion::new())],
-            plan,
-            expected,
-            true,
-        )
-    }
-
-    #[test]
-    fn eliminate_nothing() -> Result<()> {
-        let plan_builder = table_scan(Some("table"), &schema(), None)?;
-
-        let plan = plan_builder.clone().union(plan_builder.build()?)?.build()?;
-
-        let expected = "\
-        Union\
-        \n  TableScan: table\
-        \n  TableScan: table";
-        assert_optimized_plan_equal(plan, expected)
-    }
-
-    #[test]
-    fn eliminate_one_union() -> Result<()> {
-        let table_plan = coerce_plan_expr_for_schema(
-            table_scan(Some("table"), &schema(), None)?.build()?,
-            &schema().to_dfschema()?,
-        )?;
-        let schema = Arc::clone(table_plan.schema());
-        let single_union_plan = LogicalPlan::Union(Union {
-            inputs: vec![Arc::new(table_plan)],
-            schema,
-        });
-
-        let expected = "TableScan: table";
-        assert_optimized_plan_equal(single_union_plan, expected)
-    }
-}
diff --git a/datafusion/optimizer/src/eliminate_outer_join.rs b/datafusion/optimizer/src/eliminate_outer_join.rs
index 621086e4a28a9..5c47d6b7c566e 100644
--- a/datafusion/optimizer/src/eliminate_outer_join.rs
+++ b/datafusion/optimizer/src/eliminate_outer_join.rs
@@ -52,7 +52,7 @@ use std::sync::Arc;
 pub struct EliminateOuterJoin;
 
 impl EliminateOuterJoin {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -118,7 +118,8 @@ impl OptimizerRule for EliminateOuterJoin {
                         on: join.on.clone(),
                         filter: join.filter.clone(),
                         schema: Arc::clone(&join.schema),
-                        null_equals_null: join.null_equals_null,
+                        null_equality: join.null_equality,
+                        null_aware: join.null_aware,
                     }));
                     Filter::try_new(filter.predicate, new_join)
                         .map(|f| Transformed::yes(LogicalPlan::Filter(f)))
@@ -289,8 +290,8 @@ fn extract_non_nullable_columns(
                 false,
             )
         }
-        Expr::Cast(Cast { expr, data_type: _ })
-        | Expr::TryCast(TryCast { expr, data_type: _ }) => extract_non_nullable_columns(
+        Expr::Cast(Cast { expr, field: _ })
+        | Expr::TryCast(TryCast { expr, field: _ }) => extract_non_nullable_columns(
             expr,
             non_nullable_cols,
             left_schema,
@@ -304,15 +305,15 @@ fn extract_non_nullable_columns(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::test::*;
-    use crate::OptimizerContext;
     use arrow::datatypes::DataType;
     use datafusion_expr::{
+        Operator::{And, Or},
         binary_expr, cast, col, lit,
         logical_plan::builder::LogicalPlanBuilder,
         try_cast,
-        Operator::{And, Or},
     };
 
     macro_rules! assert_optimized_plan_equal {
diff --git a/datafusion/optimizer/src/extract_equijoin_predicate.rs b/datafusion/optimizer/src/extract_equijoin_predicate.rs
index a07b50ade5b8d..0a50761e8a9f7 100644
--- a/datafusion/optimizer/src/extract_equijoin_predicate.rs
+++ b/datafusion/optimizer/src/extract_equijoin_predicate.rs
@@ -19,8 +19,8 @@
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 use datafusion_common::tree_node::Transformed;
-use datafusion_common::DFSchema;
-use datafusion_common::Result;
+use datafusion_common::{DFSchema, assert_or_internal_err};
+use datafusion_common::{NullEquality, Result};
 use datafusion_expr::utils::split_conjunction_owned;
 use datafusion_expr::utils::{can_hash, find_valid_equijoin_key_pair};
 use datafusion_expr::{BinaryExpr, Expr, ExprSchemable, Join, LogicalPlan, Operator};
@@ -42,7 +42,7 @@ type EquijoinPredicate = (Expr, Expr);
 pub struct ExtractEquijoinPredicate;
 
 impl ExtractEquijoinPredicate {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -75,13 +75,54 @@ impl OptimizerRule for ExtractEquijoinPredicate {
                 join_type,
                 join_constraint,
                 schema,
-                null_equals_null,
+                null_equality,
+                null_aware,
             }) => {
                 let left_schema = left.schema();
                 let right_schema = right.schema();
                 let (equijoin_predicates, non_equijoin_expr) =
                     split_eq_and_noneq_join_predicate(expr, left_schema, right_schema)?;
 
+                // Equi-join operators like HashJoin support a special behavior
+                // that evaluates `NULL = NULL` as true instead of NULL. Therefore,
+                // we transform `t1.c1 IS NOT DISTINCT FROM t2.c1` into an equi-join
+                // and set the `NullEquality` configuration in the join operator.
+                // This allows certain queries to use Hash Join instead of
+                // Nested Loop Join, resulting in better performance.
+                //
+                // Only convert when there are NO equijoin predicates, to be conservative.
+                if on.is_empty()
+                    && equijoin_predicates.is_empty()
+                    && non_equijoin_expr.is_some()
+                {
+                    // SAFETY: checked in the outer `if`
+                    let expr = non_equijoin_expr.clone().unwrap();
+                    let (equijoin_predicates, non_equijoin_expr) =
+                        split_is_not_distinct_from_and_other_join_predicate(
+                            expr,
+                            left_schema,
+                            right_schema,
+                        )?;
+
+                    if !equijoin_predicates.is_empty() {
+                        on.extend(equijoin_predicates);
+
+                        return Ok(Transformed::yes(LogicalPlan::Join(Join {
+                            left,
+                            right,
+                            on,
+                            filter: non_equijoin_expr,
+                            join_type,
+                            join_constraint,
+                            schema,
+                            // According to `is not distinct from`'s semantics, it's
+                            // safe to override it
+                            null_equality: NullEquality::NullEqualsNull,
+                            null_aware,
+                        })));
+                    }
+                }
+
                 if !equijoin_predicates.is_empty() {
                     on.extend(equijoin_predicates);
                     Ok(Transformed::yes(LogicalPlan::Join(Join {
@@ -92,7 +133,8 @@ impl OptimizerRule for ExtractEquijoinPredicate {
                         join_type,
                         join_constraint,
                         schema,
-                        null_equals_null,
+                        null_equality,
+                        null_aware,
                     })))
                 } else {
                     Ok(Transformed::no(LogicalPlan::Join(Join {
@@ -103,7 +145,8 @@ impl OptimizerRule for ExtractEquijoinPredicate {
                         join_type,
                         join_constraint,
                         schema,
-                        null_equals_null,
+                        null_equality,
+                        null_aware,
                     })))
                 }
             }
@@ -112,22 +155,97 @@ impl OptimizerRule for ExtractEquijoinPredicate {
     }
 }
 
+/// Splits an ANDed filter expression into equijoin predicates and remaining filters.
+/// Returns all equijoin predicates and the remaining filters combined with AND.
+///
+/// # Example
+///
+/// For the expression `a.id = b.id AND a.x > 10 AND b.x > b.id`, this function will extract `a.id = b.id` as an equijoin predicate.
+///
+/// It first splits the ANDed sub-expressions:
+/// - expr1: a.id = b.id
+/// - expr2: a.x > 10
+/// - expr3: b.x > b.id
+///
+/// Then, it filters out the equijoin predicates and collects the non-equality expressions.
+/// The equijoin condition is:
+/// - It is an equality expression like `lhs == rhs`
+/// - All column references in `lhs` are from the left schema, and all in `rhs` are from the right schema
+///
+/// According to the above rule, `expr1` is the equijoin predicate, while `expr2` and `expr3` are not.
+/// The function returns Ok(\[expr1\], Some(expr2 AND expr3))
 fn split_eq_and_noneq_join_predicate(
     filter: Expr,
     left_schema: &DFSchema,
     right_schema: &DFSchema,
 ) -> Result<(Vec<EquijoinPredicate>, Option<Expr>)> {
+    split_op_and_other_join_predicates(filter, left_schema, right_schema, Operator::Eq)
+}
+
+/// See `split_eq_and_noneq_join_predicate`'s comment for the idea. This function
+/// is splitting out `is not distinct from` expressions instead of equal exprs.
+/// The `is not distinct from` exprs will be return as `EquijoinPredicate`.
+///
+/// # Example
+/// - Input: `a.id IS NOT DISTINCT FROM b.id AND a.x > 10 AND b.x > b.id`
+/// - Output from this splitter: `Ok([a.id, b.id], Some((a.x > 10) AND (b.x > b.id)))`
+///
+/// # Note
+/// Caller should be cautious -- `is not distinct from` is not equivalent to an
+/// equal expression; the caller is responsible for correctly setting the
+/// `nulls equals nulls` property in the join operator (if it supports it) to
+/// make the transformation valid.
+///
+/// For the above example: in downstream, a valid plan that uses the extracted
+/// equijoin keys should look like:
+///
+/// HashJoin
+/// - on: `a.id = b.id` (equality)
+/// - join_filter: `(a.x > 10) AND (b.x > b.id)`
+/// - nulls_equals_null: `true`
+///
+/// This reflects that `IS NOT DISTINCT FROM` treats `NULL = NULL` as true and
+/// thus requires setting `NullEquality::NullEqualsNull` in the join operator to
+/// preserve semantics while enabling an equi-join implementation (e.g., HashJoin).
+fn split_is_not_distinct_from_and_other_join_predicate(
+    filter: Expr,
+    left_schema: &DFSchema,
+    right_schema: &DFSchema,
+) -> Result<(Vec<EquijoinPredicate>, Option<Expr>)> {
+    split_op_and_other_join_predicates(
+        filter,
+        left_schema,
+        right_schema,
+        Operator::IsNotDistinctFrom,
+    )
+}
+
+/// See comments in `split_eq_and_noneq_join_predicate` for details.
+fn split_op_and_other_join_predicates(
+    filter: Expr,
+    left_schema: &DFSchema,
+    right_schema: &DFSchema,
+    operator: Operator,
+) -> Result<(Vec<EquijoinPredicate>, Option<Expr>)> {
+    assert_or_internal_err!(
+        matches!(operator, Operator::Eq | Operator::IsNotDistinctFrom),
+        "split_op_and_other_join_predicates only supports 'Eq' or 'IsNotDistinctFrom' operators, \
+        but received: {:?}",
+        operator
+    );
+
     let exprs = split_conjunction_owned(filter);
 
+    // Treat 'is not distinct from' comparison as join key in equal joins
     let mut accum_join_keys: Vec<(Expr, Expr)> = vec![];
     let mut accum_filters: Vec<Expr> = vec![];
     for expr in exprs {
         match expr {
             Expr::BinaryExpr(BinaryExpr {
                 ref left,
-                op: Operator::Eq,
+                ref op,
                 ref right,
-            }) => {
+            }) if *op == operator => {
                 let join_key_pair =
                     find_valid_equijoin_key_pair(left, right, left_schema, right_schema)?;
 
@@ -159,7 +277,7 @@ mod tests {
     use crate::test::*;
     use arrow::datatypes::DataType;
     use datafusion_expr::{
-        col, lit, logical_plan::builder::LogicalPlanBuilder, JoinType,
+        JoinType, col, lit, logical_plan::builder::LogicalPlanBuilder,
     };
     use std::sync::Arc;
 
diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs
new file mode 100644
index 0000000000000..922ea7933781e
--- /dev/null
+++ b/datafusion/optimizer/src/extract_leaf_expressions.rs
@@ -0,0 +1,3053 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Two-pass optimizer pipeline that pushes cheap expressions (like struct field
+//! access `user['status']`) closer to data sources, enabling early data reduction
+//! and source-level optimizations (e.g., Parquet column pruning). See
+//! [`ExtractLeafExpressions`] (pass 1) and [`PushDownLeafProjections`] (pass 2).
+
+use indexmap::{IndexMap, IndexSet};
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use datafusion_common::alias::AliasGenerator;
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
+use datafusion_common::{Column, DFSchema, Result, qualified_name};
+use datafusion_expr::logical_plan::LogicalPlan;
+use datafusion_expr::{Expr, ExpressionPlacement, Projection};
+
+use crate::optimizer::ApplyOrder;
+use crate::push_down_filter::replace_cols_by_name;
+use crate::utils::has_all_column_refs;
+use crate::{OptimizerConfig, OptimizerRule};
+
+/// Prefix for aliases generated by the extraction optimizer passes.
+///
+/// This prefix is **reserved for internal optimizer use**. User-defined aliases
+/// starting with this prefix may be misidentified as optimizer-generated
+/// extraction aliases, leading to unexpected behavior. Do not use this prefix
+/// in user queries.
+const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted";
+
+/// Returns `true` if any sub-expression in `exprs` has
+/// [`ExpressionPlacement::MoveTowardsLeafNodes`] placement.
+///
+/// This is a lightweight pre-check that short-circuits as soon as one
+/// extractable expression is found, avoiding the expensive allocations
+/// (column HashSets, extractors, expression rewrites) that the full
+/// extraction pipeline requires.
+fn has_extractable_expr(exprs: &[Expr]) -> bool {
+    exprs.iter().any(|expr| {
+        expr.exists(|e| Ok(e.placement() == ExpressionPlacement::MoveTowardsLeafNodes))
+            .unwrap_or(false)
+    })
+}
+
+/// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes
+/// into **extraction projections** (pass 1 of 2).
+///
+/// This handles Filter, Sort, Limit, Aggregate, and Join nodes. For Projection
+/// nodes, extraction and pushdown are handled by [`PushDownLeafProjections`].
+///
+/// # Key Concepts
+///
+/// **Extraction projection**: a projection inserted *below* a node that
+/// pre-computes a cheap expression and exposes it under an alias
+/// (`__datafusion_extracted_N`). The parent node then references the alias
+/// instead of the original expression.
+///
+/// **Recovery projection**: a projection inserted *above* a node to restore
+/// the original output schema when extraction changes it.
+/// Schema-preserving nodes (Filter, Sort, Limit) gain extra columns from
+/// the extraction projection that bubble up; the recovery projection selects
+/// only the original columns to hide the extras.
+///
+/// # Example
+///
+/// Given a filter with a struct field access:
+///
+/// ```text
+/// Filter: user['status'] = 'active'
+///   TableScan: t [id, user]
+/// ```
+///
+/// This rule:
+/// 1. Inserts an **extraction projection** below the filter:
+/// 2. Adds a **recovery projection** above to hide the extra column:
+///
+/// ```text
+/// Projection: id, user                                                        <-- recovery projection
+///   Filter: __datafusion_extracted_1 = 'active'
+///     Projection: user['status'] AS __datafusion_extracted_1, id, user         <-- extraction projection
+///       TableScan: t [id, user]
+/// ```
+///
+/// **Important:** The `PushDownFilter` rule is aware of projections created by this rule
+/// and will not push filters through them. It uses `ExpressionPlacement` to detect
+/// `MoveTowardsLeafNodes` expressions and skip filter pushdown past them.
+#[derive(Default, Debug)]
+pub struct ExtractLeafExpressions {}
+
+impl ExtractLeafExpressions {
+    /// Create a new [`ExtractLeafExpressions`]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for ExtractLeafExpressions {
+    fn name(&self) -> &str {
+        "extract_leaf_expressions"
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
+        if !config.options().optimizer.enable_leaf_expression_pushdown {
+            return Ok(Transformed::no(plan));
+        }
+        let alias_generator = config.alias_generator();
+
+        // Advance the alias generator past any user-provided __datafusion_extracted_N
+        // aliases to prevent collisions when generating new extraction aliases.
+        advance_generator_past_existing(&plan, alias_generator)?;
+
+        plan.transform_down_with_subqueries(|plan| {
+            extract_from_plan(plan, alias_generator)
+        })
+    }
+}
+
+/// Scans the current plan node's expressions for pre-existing
+/// `__datafusion_extracted_N` aliases and advances the generator
+/// counter past them to avoid collisions with user-provided aliases.
+fn advance_generator_past_existing(
+    plan: &LogicalPlan,
+    alias_generator: &AliasGenerator,
+) -> Result<()> {
+    plan.apply(|plan| {
+        plan.expressions().iter().try_for_each(|expr| {
+            expr.apply(|e| {
+                if let Expr::Alias(alias) = e
+                    && let Some(id) = alias
+                        .name
+                        .strip_prefix(EXTRACTED_EXPR_PREFIX)
+                        .and_then(|s| s.strip_prefix('_'))
+                        .and_then(|s| s.parse().ok())
+                {
+                    alias_generator.update_min_id(id);
+                }
+                Ok(TreeNodeRecursion::Continue)
+            })?;
+            Ok::<(), datafusion_common::error::DataFusionError>(())
+        })?;
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .map(|_| ())
+}
+
+/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node.
+///
+/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes
+/// like Join, each extracted sub-expression is routed to the correct input
+/// by checking which input's schema contains all of the expression's column
+/// references.
+fn extract_from_plan(
+    plan: LogicalPlan,
+    alias_generator: &Arc<AliasGenerator>,
+) -> Result<Transformed<LogicalPlan>> {
+    // Only extract from plan types whose output schema is predictable after
+    // expression rewriting.  Nodes like Window derive column names from
+    // their expressions, so rewriting `get_field` inside a window function
+    // changes the output schema and breaks the recovery projection.
+    if !matches!(
+        &plan,
+        LogicalPlan::Aggregate(_)
+            | LogicalPlan::Filter(_)
+            | LogicalPlan::Sort(_)
+            | LogicalPlan::Limit(_)
+            | LogicalPlan::Join(_)
+    ) {
+        return Ok(Transformed::no(plan));
+    }
+
+    let inputs = plan.inputs();
+    if inputs.is_empty() {
+        return Ok(Transformed::no(plan));
+    }
+
+    // Fast pre-check: skip all allocations if no extractable expressions exist
+    if !has_extractable_expr(&plan.expressions()) {
+        return Ok(Transformed::no(plan));
+    }
+
+    // Save original output schema before any transformation
+    let original_schema = Arc::clone(plan.schema());
+
+    // Build per-input schemas from borrowed inputs (before plan is consumed
+    // by map_expressions). We only need schemas and column sets for routing;
+    // the actual inputs are cloned later only if extraction succeeds.
+    let input_schemas: Vec<Arc<DFSchema>> =
+        inputs.iter().map(|i| Arc::clone(i.schema())).collect();
+
+    // Build per-input extractors
+    let mut extractors: Vec<LeafExpressionExtractor> = input_schemas
+        .iter()
+        .map(|schema| LeafExpressionExtractor::new(schema.as_ref(), alias_generator))
+        .collect();
+
+    // Build per-input column sets for routing expressions to the correct input
+    let input_column_sets: Vec<std::collections::HashSet<Column>> = input_schemas
+        .iter()
+        .map(|schema| schema_columns(schema.as_ref()))
+        .collect();
+
+    // Transform expressions via map_expressions with routing
+    let transformed = plan.map_expressions(|expr| {
+        routing_extract(expr, &mut extractors, &input_column_sets)
+    })?;
+
+    // If no expressions were rewritten, nothing was extracted
+    if !transformed.transformed {
+        return Ok(transformed);
+    }
+
+    // Clone inputs now that we know extraction succeeded. Wrap in Arc
+    // upfront since build_extraction_projection expects &Arc<LogicalPlan>.
+    let owned_inputs: Vec<Arc<LogicalPlan>> = transformed
+        .data
+        .inputs()
+        .into_iter()
+        .map(|i| Arc::new(i.clone()))
+        .collect();
+
+    // Build per-input extraction projections (None means no extractions for that input)
+    let new_inputs: Vec<LogicalPlan> = owned_inputs
+        .into_iter()
+        .zip(extractors.iter())
+        .map(|(input_arc, extractor)| {
+            match extractor.build_extraction_projection(&input_arc)? {
+                Some(plan) => Ok(plan),
+                // No extractions for this input — recover the LogicalPlan
+                // without cloning (refcount is 1 since build returned None).
+                None => {
+                    Ok(Arc::try_unwrap(input_arc).unwrap_or_else(|arc| (*arc).clone()))
+                }
+            }
+        })
+        .collect::<Result<Vec<_>>>()?;
+
+    // Rebuild the plan keeping its rewritten expressions but replacing
+    // inputs with the new extraction projections.
+    let new_plan = transformed
+        .data
+        .with_new_exprs(transformed.data.expressions(), new_inputs)?;
+
+    // Add recovery projection if the output schema changed
+    let recovered = build_recovery_projection(original_schema.as_ref(), new_plan)?;
+
+    Ok(Transformed::yes(recovered))
+}
+
+/// Given an expression, returns the index of the input whose columns fully
+/// cover the expression's column references.
+/// Returns `None` if the expression references columns from multiple inputs
+/// or if multiple inputs match (ambiguous, e.g. unqualified columns present
+/// in both sides of a join).
+fn find_owning_input(
+    expr: &Expr,
+    input_column_sets: &[std::collections::HashSet<Column>],
+) -> Option<usize> {
+    let mut found = None;
+    for (idx, cols) in input_column_sets.iter().enumerate() {
+        if has_all_column_refs(expr, cols) {
+            if found.is_some() {
+                // Ambiguous — multiple inputs match
+                return None;
+            }
+            found = Some(idx);
+        }
+    }
+    found
+}
+
+/// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes`
+/// sub-expressions and routing each to the correct per-input extractor.
+fn routing_extract(
+    expr: Expr,
+    extractors: &mut [LeafExpressionExtractor],
+    input_column_sets: &[std::collections::HashSet<Column>],
+) -> Result<Transformed<Expr>> {
+    expr.transform_down(|e| {
+        // Skip expressions already aliased with extracted expression pattern
+        if let Expr::Alias(alias) = &e
+            && alias.name.starts_with(EXTRACTED_EXPR_PREFIX)
+        {
+            return Ok(Transformed {
+                data: e,
+                transformed: false,
+                tnr: TreeNodeRecursion::Jump,
+            });
+        }
+
+        // Don't extract Alias nodes directly — preserve the alias and let
+        // transform_down recurse into the inner expression
+        if matches!(&e, Expr::Alias(_)) {
+            return Ok(Transformed::no(e));
+        }
+
+        match e.placement() {
+            ExpressionPlacement::MoveTowardsLeafNodes => {
+                if let Some(idx) = find_owning_input(&e, input_column_sets) {
+                    let col_ref = extractors[idx].add_extracted(e)?;
+                    Ok(Transformed::yes(col_ref))
+                } else {
+                    // References columns from multiple inputs — cannot extract
+                    Ok(Transformed::no(e))
+                }
+            }
+            ExpressionPlacement::Column => {
+                // Track columns that the parent node references so the
+                // extraction projection includes them as pass-through.
+                // Without this, the extraction projection would only
+                // contain __datafusion_extracted_N aliases, and the parent couldn't
+                // resolve its other column references.
+                if let Expr::Column(col) = &e
+                    && let Some(idx) = find_owning_input(&e, input_column_sets)
+                {
+                    extractors[idx].columns_needed.insert(col.clone());
+                }
+                Ok(Transformed::no(e))
+            }
+            _ => Ok(Transformed::no(e)),
+        }
+    })
+}
+
+/// Returns all columns in the schema (both qualified and unqualified forms)
+fn schema_columns(schema: &DFSchema) -> std::collections::HashSet<Column> {
+    schema
+        .iter()
+        .flat_map(|(qualifier, field)| {
+            [
+                Column::new(qualifier.cloned(), field.name()),
+                Column::new_unqualified(field.name()),
+            ]
+        })
+        .collect()
+}
+
+/// Rewrites extraction pairs and column references from one qualifier
+/// space to another.
+///
+/// Builds a replacement map by zipping `from_schema` (whose qualifiers
+/// currently appear in `pairs` / `columns`) with `to_schema` (the
+/// qualifiers we want), then applies `replace_cols_by_name`.
+///
+/// Used for SubqueryAlias (alias-space -> input-space) and Union
+/// (union output-space -> per-branch input-space).
+fn remap_pairs_and_columns(
+    pairs: &[(Expr, String)],
+    columns: &IndexSet<Column>,
+    from_schema: &DFSchema,
+    to_schema: &DFSchema,
+) -> Result<ExtractionTarget> {
+    let mut replace_map = HashMap::new();
+    for ((from_q, from_f), (to_q, to_f)) in from_schema.iter().zip(to_schema.iter()) {
+        replace_map.insert(
+            qualified_name(from_q, from_f.name()),
+            Expr::Column(Column::new(to_q.cloned(), to_f.name())),
+        );
+    }
+    let remapped_pairs: Vec<(Expr, String)> = pairs
+        .iter()
+        .map(|(expr, alias)| {
+            Ok((
+                replace_cols_by_name(expr.clone(), &replace_map)?,
+                alias.clone(),
+            ))
+        })
+        .collect::<Result<_>>()?;
+    let remapped_columns: IndexSet<Column> = columns
+        .iter()
+        .filter_map(|col| {
+            let rewritten =
+                replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?;
+            if let Expr::Column(c) = rewritten {
+                Some(c)
+            } else {
+                Some(col.clone())
+            }
+        })
+        .collect();
+    Ok(ExtractionTarget {
+        pairs: remapped_pairs,
+        columns: remapped_columns,
+    })
+}
+
+// =============================================================================
+// Helper Types & Functions for Extraction Targeting
+// =============================================================================
+
+/// A bundle of extraction pairs (expression + alias) and standalone columns
+/// that need to be pushed through a plan node.
+struct ExtractionTarget {
+    /// Extracted expressions paired with their generated aliases.
+    pairs: Vec<(Expr, String)>,
+    /// Standalone column references needed by the parent node.
+    columns: IndexSet<Column>,
+}
+
+/// Build a replacement map from a projection: output_column_name -> underlying_expr.
+///
+/// This is used to resolve column references through a renaming projection.
+/// For example, if a projection has `user AS x`, this maps `x` -> `col("user")`.
+fn build_projection_replace_map(projection: &Projection) -> HashMap<String, Expr> {
+    projection
+        .schema
+        .iter()
+        .zip(projection.expr.iter())
+        .map(|((qualifier, field), expr)| {
+            let key = Column::from((qualifier, field)).flat_name();
+            (key, expr.clone().unalias())
+        })
+        .collect()
+}
+
+/// Build a recovery projection to restore the original output schema.
+///
+/// After extraction, a node's output schema may differ from the original:
+///
+/// - **Schema-preserving nodes** (Filter/Sort/Limit): the extraction projection
+///   below adds extra `__datafusion_extracted_N` columns that bubble up through
+///   the node. Recovery selects only the original columns to hide the extras.
+///   ```text
+///   Original schema: [id, user]
+///   After extraction: [__datafusion_extracted_1, id, user]   ← extra column leaked through
+///   Recovery: SELECT id, user FROM ...                       ← hides __datafusion_extracted_1
+///   ```
+///
+/// - **Schema-defining nodes** (Aggregate): same number of columns but names
+///   may differ because extracted aliases replaced the original expressions.
+///   Recovery maps positionally, aliasing where names changed.
+///   ```text
+///   Original: [SUM(user['balance'])]
+///   After:    [SUM(__datafusion_extracted_1)]                ← name changed
+///   Recovery: SUM(__datafusion_extracted_1) AS "SUM(user['balance'])"
+///   ```
+///
+/// - **Schemas identical** → no recovery projection needed.
+fn build_recovery_projection(
+    original_schema: &DFSchema,
+    input: LogicalPlan,
+) -> Result<LogicalPlan> {
+    let new_schema = input.schema();
+    let orig_len = original_schema.fields().len();
+    let new_len = new_schema.fields().len();
+
+    if orig_len == new_len {
+        // Same number of fields — check if schemas are identical
+        let schemas_match = original_schema.iter().zip(new_schema.iter()).all(
+            |((orig_q, orig_f), (new_q, new_f))| {
+                orig_f.name() == new_f.name() && orig_q == new_q
+            },
+        );
+        if schemas_match {
+            return Ok(input);
+        }
+
+        // Schema-defining nodes (Aggregate, Join): names may differ at some
+        // positions because extracted aliases replaced the original expressions.
+        // Map positionally, aliasing where the name changed.
+        //
+        // Invariant: `with_new_exprs` on all supported node types (Aggregate,
+        // Filter, Sort, Limit, Join) preserves column order, so positional
+        // mapping is safe here.
+        debug_assert!(
+            orig_len == new_len,
+            "build_recovery_projection: positional mapping requires same field count, \
+             got original={orig_len} vs new={new_len}"
+        );
+        let mut proj_exprs = Vec::with_capacity(orig_len);
+        for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() {
+            let (new_qualifier, new_field) = new_schema.qualified_field(i);
+            if orig_field.name() == new_field.name() && orig_qualifier == new_qualifier {
+                proj_exprs.push(Expr::from((orig_qualifier, orig_field)));
+            } else {
+                let new_col = Expr::Column(Column::from((new_qualifier, new_field)));
+                proj_exprs.push(
+                    new_col.alias_qualified(orig_qualifier.cloned(), orig_field.name()),
+                );
+            }
+        }
+        let projection = Projection::try_new(proj_exprs, Arc::new(input))?;
+        Ok(LogicalPlan::Projection(projection))
+    } else {
+        // Schema-preserving nodes: new schema has extra extraction columns.
+        // Original columns still exist by name; select them to hide extras.
+        let col_exprs: Vec<Expr> = original_schema.iter().map(Expr::from).collect();
+        let projection = Projection::try_new(col_exprs, Arc::new(input))?;
+        Ok(LogicalPlan::Projection(projection))
+    }
+}
+
+/// Collects `MoveTowardsLeafNodes` sub-expressions found during expression
+/// tree traversal and can build an extraction projection from them.
+///
+/// # Example
+///
+/// Given `Filter: user['status'] = 'active' AND user['name'] IS NOT NULL`:
+/// - `add_extracted(user['status'])` → stores it, returns `col("__datafusion_extracted_1")`
+/// - `add_extracted(user['name'])`   → stores it, returns `col("__datafusion_extracted_2")`
+/// - `build_extraction_projection()` produces:
+///   `Projection: user['status'] AS __datafusion_extracted_1, user['name'] AS __datafusion_extracted_2, <all input columns>`
+struct LeafExpressionExtractor<'a> {
+    /// Extracted expressions: maps expression -> alias
+    extracted: IndexMap<Expr, String>,
+    /// Columns referenced by extracted expressions or the parent node,
+    /// included as pass-through in the extraction projection.
+    columns_needed: IndexSet<Column>,
+    /// Input schema
+    input_schema: &'a DFSchema,
+    /// Alias generator
+    alias_generator: &'a Arc<AliasGenerator>,
+}
+
+impl<'a> LeafExpressionExtractor<'a> {
+    fn new(input_schema: &'a DFSchema, alias_generator: &'a Arc<AliasGenerator>) -> Self {
+        Self {
+            extracted: IndexMap::new(),
+            columns_needed: IndexSet::new(),
+            input_schema,
+            alias_generator,
+        }
+    }
+
+    /// Adds an expression to extracted set, returns column reference.
+    fn add_extracted(&mut self, expr: Expr) -> Result<Expr> {
+        // Deduplication: reuse existing alias if same expression
+        if let Some(alias) = self.extracted.get(&expr) {
+            return Ok(Expr::Column(Column::new_unqualified(alias)));
+        }
+
+        // Track columns referenced by this expression
+        for col in expr.column_refs() {
+            self.columns_needed.insert(col.clone());
+        }
+
+        // Generate unique alias
+        let alias = self.alias_generator.next(EXTRACTED_EXPR_PREFIX);
+        self.extracted.insert(expr, alias.clone());
+
+        Ok(Expr::Column(Column::new_unqualified(&alias)))
+    }
+
+    /// Builds an extraction projection above the given input, or merges into
+    /// it if the input is already a projection. Delegates to
+    /// [`build_extraction_projection_impl`].
+    ///
+    /// Returns `None` if there are no extractions.
+    fn build_extraction_projection(
+        &self,
+        input: &Arc<LogicalPlan>,
+    ) -> Result<Option<LogicalPlan>> {
+        if self.extracted.is_empty() {
+            return Ok(None);
+        }
+        let pairs: Vec<(Expr, String)> = self
+            .extracted
+            .iter()
+            .map(|(e, a)| (e.clone(), a.clone()))
+            .collect();
+        let proj = build_extraction_projection_impl(
+            &pairs,
+            &self.columns_needed,
+            input,
+            self.input_schema,
+        )?;
+        Ok(Some(LogicalPlan::Projection(proj)))
+    }
+}
+
+/// Build an extraction projection above the target node (shared by both passes).
+///
+/// If the target is an existing projection, merges into it. This requires
+/// resolving column references through the projection's rename mapping:
+/// if the projection has `user AS u`, and an extracted expression references
+/// `u['name']`, we must rewrite it to `user['name']` since the merged
+/// projection reads from the same input as the original.
+///
+/// Deduplicates by resolved expression equality and adds pass-through
+/// columns as needed. Otherwise builds a fresh projection with extracted
+/// expressions + ALL input schema columns.
+fn build_extraction_projection_impl(
+    extracted_exprs: &[(Expr, String)],
+    columns_needed: &IndexSet<Column>,
+    target: &Arc<LogicalPlan>,
+    target_schema: &DFSchema,
+) -> Result<Projection> {
+    if let LogicalPlan::Projection(existing) = target.as_ref() {
+        // Merge into existing projection
+        let mut proj_exprs = existing.expr.clone();
+
+        // Build a map of existing expressions (by Expr equality) to their aliases
+        let existing_extractions: IndexMap<Expr, String> = existing
+            .expr
+            .iter()
+            .filter_map(|e| {
+                if let Expr::Alias(alias) = e
+                    && alias.name.starts_with(EXTRACTED_EXPR_PREFIX)
+                {
+                    return Some((*alias.expr.clone(), alias.name.clone()));
+                }
+                None
+            })
+            .collect();
+
+        // Resolve column references through the projection's rename mapping
+        let replace_map = build_projection_replace_map(existing);
+
+        // Add new extracted expressions, resolving column refs through the projection
+        for (expr, alias) in extracted_exprs {
+            let resolved = replace_cols_by_name(expr.clone().alias(alias), &replace_map)?;
+            let resolved_inner = if let Expr::Alias(a) = &resolved {
+                a.expr.as_ref()
+            } else {
+                &resolved
+            };
+            if let Some(existing_alias) = existing_extractions.get(resolved_inner) {
+                // Same expression already extracted under a different alias —
+                // add the expression with the new alias so both names are
+                // available in the output. We can't reference the existing alias
+                // as a column within the same projection, so we duplicate the
+                // computation.
+                if existing_alias != alias {
+                    proj_exprs.push(resolved);
+                }
+            } else {
+                proj_exprs.push(resolved);
+            }
+        }
+
+        // Add any new pass-through columns that aren't already in the projection.
+        // We check against existing.input.schema() (the projection's source) rather
+        // than target_schema (the projection's output) because columns produced
+        // by alias expressions (e.g., CSE's __common_expr_N) exist in the output but
+        // not the input, and cannot be added as pass-through Column references.
+        let existing_cols: IndexSet<Column> = existing
+            .expr
+            .iter()
+            .filter_map(|e| {
+                if let Expr::Column(c) = e {
+                    Some(c.clone())
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        let input_schema = existing.input.schema();
+        for col in columns_needed {
+            let col_expr = Expr::Column(col.clone());
+            let resolved = replace_cols_by_name(col_expr, &replace_map)?;
+            if let Expr::Column(resolved_col) = &resolved
+                && !existing_cols.contains(resolved_col)
+                && input_schema.has_column(resolved_col)
+            {
+                proj_exprs.push(Expr::Column(resolved_col.clone()));
+            }
+            // If resolved to non-column expr, it's already computed by existing projection
+        }
+
+        Projection::try_new(proj_exprs, Arc::clone(&existing.input))
+    } else {
+        // Build new projection with extracted expressions + all input columns
+        let mut proj_exprs = Vec::new();
+        for (expr, alias) in extracted_exprs {
+            proj_exprs.push(expr.clone().alias(alias));
+        }
+        for (qualifier, field) in target_schema.iter() {
+            proj_exprs.push(Expr::from((qualifier, field)));
+        }
+        Projection::try_new(proj_exprs, Arc::clone(target))
+    }
+}
+
+// =============================================================================
+// Pass 2: PushDownLeafProjections
+// =============================================================================
+
+/// Pushes extraction projections down through schema-preserving nodes towards
+/// leaf nodes (pass 2 of 2, after [`ExtractLeafExpressions`]).
+///
+/// Handles two types of projections:
+/// - **Pure extraction projections** (all `__datafusion_extracted` aliases + columns):
+///   pushes through Filter/Sort/Limit, merges into existing projections, or routes
+///   into multi-input node inputs (Join, SubqueryAlias, etc.)
+/// - **Mixed projections** (user projections containing `MoveTowardsLeafNodes`
+///   sub-expressions): splits into a recovery projection + extraction projection,
+///   then pushes the extraction projection down.
+///
+/// # Example: Pushing through a Filter
+///
+/// After pass 1, the extraction projection sits directly below the filter:
+/// ```text
+/// Projection: id, user                                                              <-- recovery
+///   Filter: __datafusion_extracted_1 = 'active'
+///     Projection: user['status'] AS __datafusion_extracted_1, id, user               <-- extraction
+///       TableScan: t [id, user]
+/// ```
+///
+/// Pass 2 pushes the extraction projection through the recovery and filter,
+/// and a subsequent `OptimizeProjections` pass removes the (now-redundant)
+/// recovery projection:
+/// ```text
+/// Filter: __datafusion_extracted_1 = 'active'
+///   Projection: user['status'] AS __datafusion_extracted_1, id, user                 <-- extraction (pushed down)
+///     TableScan: t [id, user]
+/// ```
+#[derive(Default, Debug)]
+pub struct PushDownLeafProjections {}
+
+impl PushDownLeafProjections {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for PushDownLeafProjections {
+    fn name(&self) -> &str {
+        "push_down_leaf_projections"
+    }
+
+    fn apply_order(&self) -> Option<ApplyOrder> {
+        Some(ApplyOrder::TopDown)
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
+        if !config.options().optimizer.enable_leaf_expression_pushdown {
+            return Ok(Transformed::no(plan));
+        }
+        let alias_generator = config.alias_generator();
+        match try_push_input(&plan, alias_generator)? {
+            Some(new_plan) => Ok(Transformed::yes(new_plan)),
+            None => Ok(Transformed::no(plan)),
+        }
+    }
+}
+
+/// Attempts to push a projection's extractable expressions further down.
+///
+/// Returns `Some(new_subtree)` if the projection was pushed down or merged,
+/// `None` if there is nothing to push or the projection sits above a barrier.
+fn try_push_input(
+    input: &LogicalPlan,
+    alias_generator: &Arc<AliasGenerator>,
+) -> Result<Option<LogicalPlan>> {
+    let LogicalPlan::Projection(proj) = input else {
+        return Ok(None);
+    };
+    split_and_push_projection(proj, alias_generator)
+}
+
+/// Splits a projection into extractable pieces, pushes them towards leaf
+/// nodes, and adds a recovery projection if needed.
+///
+/// Handles both:
+/// - **Pure extraction projections** (all `__datafusion_extracted` aliases + columns)
+/// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions)
+///
+/// Returns `Some(new_subtree)` if extractions were pushed down,
+/// `None` if there is nothing to extract or push.
+///
+/// # Example: Mixed Projection
+///
+/// ```text
+/// Input plan:
+///   Projection: user['name'] IS NOT NULL AS has_name, id
+///     Filter: ...
+///       TableScan
+///
+/// Phase 1 (Split):
+///   extraction_pairs: [(user['name'], "__datafusion_extracted_1")]
+///   recovery_exprs:   [__datafusion_extracted_1 IS NOT NULL AS has_name, id]
+///
+/// Phase 2 (Push):
+///   Push extraction projection through Filter toward TableScan
+///
+/// Phase 3 (Recovery):
+///   Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, id       <-- recovery
+///     Filter: ...
+///       Projection: user['name'] AS __datafusion_extracted_1, id           <-- extraction (pushed)
+///         TableScan
+/// ```
+fn split_and_push_projection(
+    proj: &Projection,
+    alias_generator: &Arc<AliasGenerator>,
+) -> Result<Option<LogicalPlan>> {
+    // Fast pre-check: skip if there are no pre-existing extracted aliases
+    // and no new extractable expressions.
+    let has_existing_extracted = proj.expr.iter().any(|e| {
+        matches!(e, Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX))
+    });
+    if !has_existing_extracted && !has_extractable_expr(&proj.expr) {
+        return Ok(None);
+    }
+
+    let input = &proj.input;
+    let input_schema = input.schema();
+
+    // ── Phase 1: Split ──────────────────────────────────────────────────
+    // For each projection expression, collect extraction pairs and build
+    // recovery expressions.
+    //
+    // Pre-existing `__datafusion_extracted` aliases are inserted into the
+    // extractor's `IndexMap` with the **full** `Expr::Alias(…)` as the key,
+    // so the alias name participates in equality. This prevents collisions
+    // when CSE rewrites produce the same inner expression under different
+    // alias names (e.g. `__common_expr_4 AS __datafusion_extracted_1` and
+    // `__common_expr_4 AS __datafusion_extracted_3`). New extractions from
+    // `routing_extract` use bare (non-Alias) keys and get normal dedup.
+    //
+    // When building the final `extraction_pairs`, the Alias wrapper is
+    // stripped so consumers see the usual `(inner_expr, alias_name)` tuples.
+
+    let mut extractors = vec![LeafExpressionExtractor::new(
+        input_schema.as_ref(),
+        alias_generator,
+    )];
+    let input_column_sets = vec![schema_columns(input_schema.as_ref())];
+
+    let original_schema = proj.schema.as_ref();
+    let mut recovery_exprs: Vec<Expr> = Vec::with_capacity(proj.expr.len());
+    let mut needs_recovery = false;
+    let mut has_new_extractions = false;
+    let mut proj_exprs_captured: usize = 0;
+    // Track standalone column expressions (Case B) to detect column refs
+    // from extracted aliases (Case A) that aren't also standalone expressions.
+    let mut standalone_columns: IndexSet<Column> = IndexSet::new();
+
+    for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) {
+        if let Expr::Alias(alias) = expr
+            && alias.name.starts_with(EXTRACTED_EXPR_PREFIX)
+        {
+            // Insert the full Alias expression as the key so that
+            // distinct alias names don't collide in the IndexMap.
+            let alias_name = alias.name.clone();
+
+            for col_ref in alias.expr.column_refs() {
+                extractors[0].columns_needed.insert(col_ref.clone());
+            }
+
+            extractors[0]
+                .extracted
+                .insert(expr.clone(), alias_name.clone());
+            recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name)));
+            proj_exprs_captured += 1;
+        } else if let Expr::Column(col) = expr {
+            // Plain column pass-through — track it in the extractor
+            extractors[0].columns_needed.insert(col.clone());
+            standalone_columns.insert(col.clone());
+            recovery_exprs.push(expr.clone());
+            proj_exprs_captured += 1;
+        } else {
+            // Everything else: run through routing_extract
+            let transformed =
+                routing_extract(expr.clone(), &mut extractors, &input_column_sets)?;
+            if transformed.transformed {
+                has_new_extractions = true;
+            }
+            let transformed_expr = transformed.data;
+
+            // Build recovery expression, aliasing back to original name if needed
+            let original_name = field.name();
+            let needs_alias = if let Expr::Column(col) = &transformed_expr {
+                col.name.as_str() != original_name
+            } else {
+                let expr_name = transformed_expr.schema_name().to_string();
+                original_name != &expr_name
+            };
+            let recovery_expr = if needs_alias {
+                needs_recovery = true;
+                transformed_expr
+                    .clone()
+                    .alias_qualified(qualifier.cloned(), original_name)
+            } else {
+                transformed_expr.clone()
+            };
+
+            // If the expression was transformed (i.e., has extracted sub-parts),
+            // it differs from what the pushed projection outputs → needs recovery.
+            // Also, any non-column, non-__datafusion_extracted expression needs recovery
+            // because the pushed extraction projection won't output it directly.
+            if transformed.transformed || !matches!(expr, Expr::Column(_)) {
+                needs_recovery = true;
+            }
+
+            recovery_exprs.push(recovery_expr);
+        }
+    }
+
+    // Build extraction_pairs, stripping the Alias wrapper from pre-existing
+    // entries (they used the full Alias as the map key to avoid dedup).
+    let extractor = &extractors[0];
+    let extraction_pairs: Vec<(Expr, String)> = extractor
+        .extracted
+        .iter()
+        .map(|(e, a)| match e {
+            Expr::Alias(alias) => (*alias.expr.clone(), a.clone()),
+            _ => (e.clone(), a.clone()),
+        })
+        .collect();
+    let columns_needed = &extractor.columns_needed;
+
+    // If no extractions found, nothing to do
+    if extraction_pairs.is_empty() {
+        return Ok(None);
+    }
+
+    // If columns_needed has entries that aren't standalone projection columns
+    // (i.e., they came from column refs inside extracted aliases), a merge
+    // into an inner projection will widen the schema with those extra columns,
+    // requiring a recovery projection to restore the original schema.
+    if columns_needed
+        .iter()
+        .any(|c| !standalone_columns.contains(c))
+    {
+        needs_recovery = true;
+    }
+
+    // ── Phase 2: Push down ──────────────────────────────────────────────
+    let proj_input = Arc::clone(&proj.input);
+    let pushed = push_extraction_pairs(
+        &extraction_pairs,
+        columns_needed,
+        proj,
+        &proj_input,
+        alias_generator,
+        proj_exprs_captured,
+    )?;
+
+    // ── Phase 3: Recovery ───────────────────────────────────────────────
+    // Determine the base plan: either the pushed result or an in-place extraction.
+    let base_plan = match pushed {
+        Some(plan) => plan,
+        None => {
+            if !has_new_extractions {
+                // Only pre-existing __datafusion_extracted aliases and columns, no new
+                // extractions from routing_extract. The original projection is
+                // already an extraction projection that couldn't be pushed
+                // further. Return None.
+                return Ok(None);
+            }
+            // Build extraction projection in-place (couldn't push down)
+            let input_arc = Arc::clone(input);
+            let extraction = build_extraction_projection_impl(
+                &extraction_pairs,
+                columns_needed,
+                &input_arc,
+                input_schema.as_ref(),
+            )?;
+            LogicalPlan::Projection(extraction)
+        }
+    };
+
+    // Wrap with recovery projection if the output schema changed
+    if needs_recovery {
+        let recovery = LogicalPlan::Projection(Projection::try_new(
+            recovery_exprs,
+            Arc::new(base_plan),
+        )?);
+        Ok(Some(recovery))
+    } else {
+        Ok(Some(base_plan))
+    }
+}
+
+/// Returns true if the plan is a Projection where ALL expressions are either
+/// `Alias(EXTRACTED_EXPR_PREFIX, ...)` or `Column`, with at least one extraction.
+/// Such projections can safely be pushed further without re-extraction.
+fn is_pure_extraction_projection(plan: &LogicalPlan) -> bool {
+    let LogicalPlan::Projection(proj) = plan else {
+        return false;
+    };
+    let mut has_extraction = false;
+    for expr in &proj.expr {
+        match expr {
+            Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX) => {
+                has_extraction = true;
+            }
+            Expr::Column(_) => {}
+            _ => return false,
+        }
+    }
+    has_extraction
+}
+
+/// Pushes extraction pairs down through the projection's input node,
+/// dispatching to the appropriate handler based on the input node type.
+fn push_extraction_pairs(
+    pairs: &[(Expr, String)],
+    columns_needed: &IndexSet<Column>,
+    proj: &Projection,
+    proj_input: &Arc<LogicalPlan>,
+    alias_generator: &Arc<AliasGenerator>,
+    proj_exprs_captured: usize,
+) -> Result<Option<LogicalPlan>> {
+    match proj_input.as_ref() {
+        // Merge into existing projection, then try to push the result further down.
+        // Only merge when every expression in the outer projection is fully
+        // captured as either an extraction pair (Case A: __datafusion_extracted
+        // alias) or a plain column (Case B). Uncaptured expressions (e.g.
+        // `col AS __common_expr_1` from CSE, or complex expressions with
+        // extracted sub-parts) would be lost during the merge.
+        LogicalPlan::Projection(_) if proj_exprs_captured == proj.expr.len() => {
+            let target_schema = Arc::clone(proj_input.schema());
+            let merged = build_extraction_projection_impl(
+                pairs,
+                columns_needed,
+                proj_input,
+                target_schema.as_ref(),
+            )?;
+            let merged_plan = LogicalPlan::Projection(merged);
+
+            // After merging, try to push the result further down, but ONLY
+            // if the merged result is still a pure extraction projection
+            // (all __datafusion_extracted aliases + columns). If the merge inherited
+            // bare MoveTowardsLeafNodes expressions from the inner projection,
+            // pushing would re-extract them into new aliases and fail when
+            // the (None, true) fallback can't find the original aliases.
+            // This handles: Extraction → Recovery(cols) → Filter → ... → TableScan
+            // by pushing through the recovery projection AND the filter in one pass.
+            if is_pure_extraction_projection(&merged_plan)
+                && let Some(pushed) = try_push_input(&merged_plan, alias_generator)?
+            {
+                return Ok(Some(pushed));
+            }
+            Ok(Some(merged_plan))
+        }
+        // Generic: handles Filter/Sort/Limit (via recursion),
+        // SubqueryAlias (with qualifier remap in try_push_into_inputs),
+        // Join, and anything else.
+        // Safely bails out for nodes that don't pass through extracted
+        // columns (Aggregate, Window) via the output schema check.
+        _ => try_push_into_inputs(
+            pairs,
+            columns_needed,
+            proj_input.as_ref(),
+            alias_generator,
+        ),
+    }
+}
+
+/// Routes extraction pairs and columns to the appropriate inputs.
+///
+/// - **Union**: broadcasts to every input via [`remap_pairs_and_columns`].
+/// - **Other nodes**: routes each expression to the one input that owns
+///   all of its column references (via [`find_owning_input`]).
+///
+/// Returns `None` if any expression can't be routed or no input has pairs.
+fn route_to_inputs(
+    pairs: &[(Expr, String)],
+    columns: &IndexSet<Column>,
+    node: &LogicalPlan,
+    input_column_sets: &[std::collections::HashSet<Column>],
+    input_schemas: &[Arc<DFSchema>],
+) -> Result<Option<Vec<ExtractionTarget>>> {
+    let num_inputs = input_schemas.len();
+    let mut per_input: Vec<ExtractionTarget> = (0..num_inputs)
+        .map(|_| ExtractionTarget {
+            pairs: vec![],
+            columns: IndexSet::new(),
+        })
+        .collect();
+
+    if matches!(node, LogicalPlan::Union(_)) {
+        // Union output schema and each input schema have the same fields by
+        // index but may differ in qualifiers (e.g. output `s` vs input
+        // `simple_struct.s`). Remap pairs/columns to each input's space.
+        let union_schema = node.schema();
+        for (idx, input_schema) in input_schemas.iter().enumerate() {
+            per_input[idx] =
+                remap_pairs_and_columns(pairs, columns, union_schema, input_schema)?;
+        }
+    } else {
+        for (expr, alias) in pairs {
+            match find_owning_input(expr, input_column_sets) {
+                Some(idx) => per_input[idx].pairs.push((expr.clone(), alias.clone())),
+                None => return Ok(None), // Cross-input expression — bail out
+            }
+        }
+        for col in columns {
+            let col_expr = Expr::Column(col.clone());
+            match find_owning_input(&col_expr, input_column_sets) {
+                Some(idx) => {
+                    per_input[idx].columns.insert(col.clone());
+                }
+                None => return Ok(None), // Ambiguous column — bail out
+            }
+        }
+    }
+
+    // Check at least one input has extractions to push
+    if per_input.iter().all(|t| t.pairs.is_empty()) {
+        return Ok(None);
+    }
+
+    Ok(Some(per_input))
+}
+
+/// Pushes extraction expressions into a node's inputs by routing each
+/// expression to the input that owns all of its column references.
+///
+/// Works for any number of inputs (1, 2, …N). For single-input nodes,
+/// all expressions trivially route to that input. For multi-input nodes
+/// (Join, etc.), each expression is routed to the side that owns its columns.
+///
+/// Returns `Some(new_node)` if all expressions could be routed AND the
+/// rebuilt node's output schema contains all extracted aliases.
+/// Returns `None` if any expression references columns from multiple inputs
+/// or the node doesn't pass through the extracted columns.
+///
+/// # Example: Join with expressions from both sides
+///
+/// ```text
+/// Extraction projection above a Join:
+///   Projection: left.user['name'] AS __datafusion_extracted_1, right.order['total'] AS __datafusion_extracted_2, ...
+///     Join: left.id = right.user_id
+///       TableScan: left [id, user]
+///       TableScan: right [user_id, order]
+///
+/// After routing each expression to its owning input:
+///   Join: left.id = right.user_id
+///     Projection: user['name'] AS __datafusion_extracted_1, id, user              <-- left-side extraction
+///       TableScan: left [id, user]
+///     Projection: order['total'] AS __datafusion_extracted_2, user_id, order      <-- right-side extraction
+///       TableScan: right [user_id, order]
+/// ```
+fn try_push_into_inputs(
+    pairs: &[(Expr, String)],
+    columns_needed: &IndexSet<Column>,
+    node: &LogicalPlan,
+    alias_generator: &Arc<AliasGenerator>,
+) -> Result<Option<LogicalPlan>> {
+    let inputs = node.inputs();
+    if inputs.is_empty() {
+        return Ok(None);
+    }
+
+    // SubqueryAlias remaps qualifiers between input and output.
+    // Rewrite pairs/columns from alias-space to input-space before routing.
+    let remapped = if let LogicalPlan::SubqueryAlias(sa) = node {
+        remap_pairs_and_columns(pairs, columns_needed, &sa.schema, sa.input.schema())?
+    } else {
+        ExtractionTarget {
+            pairs: pairs.to_vec(),
+            columns: columns_needed.clone(),
+        }
+    };
+    let pairs = &remapped.pairs[..];
+    let columns_needed = &remapped.columns;
+
+    // Build per-input schemas and column sets for routing
+    let input_schemas: Vec<Arc<DFSchema>> =
+        inputs.iter().map(|i| Arc::clone(i.schema())).collect();
+    let input_column_sets: Vec<std::collections::HashSet<Column>> =
+        input_schemas.iter().map(|s| schema_columns(s)).collect();
+
+    // Route pairs and columns to the appropriate inputs
+    let per_input = match route_to_inputs(
+        pairs,
+        columns_needed,
+        node,
+        &input_column_sets,
+        &input_schemas,
+    )? {
+        Some(routed) => routed,
+        None => return Ok(None),
+    };
+
+    let num_inputs = inputs.len();
+
+    // Build per-input extraction projections and push them as far as possible
+    // immediately. This is critical because map_children preserves cached schemas,
+    // so if the TopDown pass later pushes a child further (changing its output
+    // schema), the parent node's schema becomes stale.
+    let mut new_inputs: Vec<LogicalPlan> = Vec::with_capacity(num_inputs);
+    for (idx, input) in inputs.into_iter().enumerate() {
+        if per_input[idx].pairs.is_empty() {
+            new_inputs.push(input.clone());
+        } else {
+            let input_arc = Arc::new(input.clone());
+            let target_schema = Arc::clone(input.schema());
+            let proj = build_extraction_projection_impl(
+                &per_input[idx].pairs,
+                &per_input[idx].columns,
+                &input_arc,
+                target_schema.as_ref(),
+            )?;
+            // Verify all requested aliases appear in the projection's output.
+            // A merge may deduplicate if the same expression already exists
+            // under a different alias, leaving the requested alias missing.
+            let proj_schema = proj.schema.as_ref();
+            for (_expr, alias) in &per_input[idx].pairs {
+                if !proj_schema.fields().iter().any(|f| f.name() == alias) {
+                    return Ok(None);
+                }
+            }
+            let proj_plan = LogicalPlan::Projection(proj);
+            // Try to push the extraction projection further down within
+            // this input (e.g., through Filter → existing extraction projection).
+            // This ensures the input's output schema is stable and won't change
+            // when the TopDown pass later visits children.
+            match try_push_input(&proj_plan, alias_generator)? {
+                Some(pushed) => new_inputs.push(pushed),
+                None => new_inputs.push(proj_plan),
+            }
+        }
+    }
+
+    // Rebuild the node with new inputs
+    let new_node = node.with_new_exprs(node.expressions(), new_inputs)?;
+
+    // Safety check: verify all extracted aliases appear in the rebuilt
+    // node's output schema. Nodes like Aggregate define their own output
+    // and won't pass through extracted columns — bail out for those.
+    let output_schema = new_node.schema();
+    for (_expr, alias) in pairs {
+        if !output_schema.fields().iter().any(|f| f.name() == alias) {
+            return Ok(None);
+        }
+    }
+
+    Ok(Some(new_node))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use crate::optimize_projections::OptimizeProjections;
+    use crate::test::udfs::PlacementTestUDF;
+    use crate::test::*;
+    use crate::{Optimizer, OptimizerContext};
+    use datafusion_common::Result;
+    use datafusion_expr::expr::ScalarFunction;
+    use datafusion_expr::{Expr, ExpressionPlacement};
+    use datafusion_expr::{
+        ScalarUDF, col, lit, logical_plan::builder::LogicalPlanBuilder,
+    };
+
+    fn leaf_udf(expr: Expr, name: &str) -> Expr {
+        Expr::ScalarFunction(ScalarFunction::new_udf(
+            Arc::new(ScalarUDF::new_from_impl(
+                PlacementTestUDF::new()
+                    .with_placement(ExpressionPlacement::MoveTowardsLeafNodes),
+            )),
+            vec![expr, lit(name)],
+        ))
+    }
+
+    // =========================================================================
+    // Combined optimization stage formatter
+    // =========================================================================
+
+    /// Runs all 4 optimization stages and returns a single formatted string.
+    /// Stages that produce the same plan as the previous stage show
+    /// "(same as <previous>)" to reduce noise.
+    ///
+    /// Stages:
+    /// 1. **Original** - OptimizeProjections only (baseline)
+    /// 2. **After Extraction** - + ExtractLeafExpressions
+    /// 3. **After Pushdown** - + PushDownLeafProjections
+    /// 4. **Optimized** - + final OptimizeProjections
+    fn format_optimization_stages(plan: &LogicalPlan) -> Result<String> {
+        let run = |rules: Vec<Arc<dyn OptimizerRule + Send + Sync>>| -> Result<String> {
+            let ctx = OptimizerContext::new().with_max_passes(1);
+            let optimizer = Optimizer::with_rules(rules);
+            let optimized = optimizer.optimize(plan.clone(), &ctx, |_, _| {})?;
+            Ok(format!("{optimized}"))
+        };
+
+        let original = run(vec![Arc::new(OptimizeProjections::new())])?;
+
+        let after_extract = run(vec![
+            Arc::new(OptimizeProjections::new()),
+            Arc::new(ExtractLeafExpressions::new()),
+        ])?;
+
+        let after_pushdown = run(vec![
+            Arc::new(OptimizeProjections::new()),
+            Arc::new(ExtractLeafExpressions::new()),
+            Arc::new(PushDownLeafProjections::new()),
+        ])?;
+
+        let optimized = run(vec![
+            Arc::new(OptimizeProjections::new()),
+            Arc::new(ExtractLeafExpressions::new()),
+            Arc::new(PushDownLeafProjections::new()),
+            Arc::new(OptimizeProjections::new()),
+        ])?;
+
+        let mut out = format!("## Original Plan\n{original}");
+
+        out.push_str("\n\n## After Extraction\n");
+        if after_extract == original {
+            out.push_str("(same as original)");
+        } else {
+            out.push_str(&after_extract);
+        }
+
+        out.push_str("\n\n## After Pushdown\n");
+        if after_pushdown == after_extract {
+            out.push_str("(same as after extraction)");
+        } else {
+            out.push_str(&after_pushdown);
+        }
+
+        out.push_str("\n\n## Optimized\n");
+        if optimized == after_pushdown {
+            out.push_str("(same as after pushdown)");
+        } else {
+            out.push_str(&optimized);
+        }
+
+        Ok(out)
+    }
+
+    /// Assert all optimization stages for a plan in a single insta snapshot.
+    macro_rules! assert_stages {
+        ($plan:expr, @ $expected:literal $(,)?) => {{
+            let result = format_optimization_stages(&$plan)?;
+            insta::assert_snapshot!(result, @ $expected);
+            Ok::<(), datafusion_common::DataFusionError>(())
+        }};
+    }
+
+    #[test]
+    fn test_extract_from_filter() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(leaf_udf(col("user"), "status").eq(lit("active")))?
+            .select(vec![
+                table_scan
+                    .schema()
+                    .index_of_column_by_name(None, "id")
+                    .unwrap(),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id
+          Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id
+          Projection: test.id, test.user
+            Filter: __datafusion_extracted_1 = Utf8("active")
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+                TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        Projection: test.id
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id
+              TableScan: test projection=[id, user]
+        "#)
+    }
+
+    #[test]
+    fn test_no_extraction_for_column() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(col("a").eq(lit(1)))?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        Filter: test.a = Int32(1)
+          TableScan: test projection=[a, b, c]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    #[test]
+    fn test_extract_from_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name"))
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name"))
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name"))
+          TableScan: test projection=[user]
+        "#)
+    }
+
+    #[test]
+    fn test_extract_from_projection_with_subexpression() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![
+                leaf_udf(col("user"), "name")
+                    .is_not_null()
+                    .alias("has_name"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 IS NOT NULL AS has_name
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name
+          TableScan: test projection=[user]
+        "#)
+    }
+
+    #[test]
+    fn test_projection_no_extraction_for_column() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a"), col("b")])?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        TableScan: test projection=[a, b]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    #[test]
+    fn test_filter_with_deduplication() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let field_access = leaf_udf(col("user"), "name");
+        // Filter with the same expression used twice
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                field_access
+                    .clone()
+                    .is_not_null()
+                    .and(field_access.is_null()),
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.user, Utf8("name")) IS NOT NULL AND leaf_udf(test.user, Utf8("name")) IS NULL
+          TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user
+          Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS NULL
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    #[test]
+    fn test_already_leaf_expression_in_filter() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "name").eq(lit("test")))?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.user, Utf8("name")) = Utf8("test")
+          TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user
+          Filter: __datafusion_extracted_1 = Utf8("test")
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    #[test]
+    fn test_extract_from_aggregate_group_by() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![leaf_udf(col("user"), "status")], vec![count(lit(1))])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Aggregate: groupBy=[[leaf_udf(test.user, Utf8("status"))]], aggr=[[COUNT(Int32(1))]]
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1))
+          Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]]
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1))
+          Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]]
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    #[test]
+    fn test_extract_from_aggregate_args() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(
+                vec![col("user")],
+                vec![count(leaf_udf(col("user"), "value"))],
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Aggregate: groupBy=[[test.user]], aggr=[[COUNT(leaf_udf(test.user, Utf8("value")))]]
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(leaf_udf(test.user,Utf8("value")))
+          Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]]
+            Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    #[test]
+    fn test_projection_with_filter_combined() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "status").eq(lit("active")))?
+            .project(vec![leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name"))
+          Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: leaf_udf(test.user, Utf8("name"))
+          Projection: test.user
+            Filter: __datafusion_extracted_1 = Utf8("active")
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user
+                TableScan: test projection=[user]
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    #[test]
+    fn test_projection_preserves_alias() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![leaf_udf(col("user"), "name").alias("username")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name")) AS username
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS username
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name")) AS username
+          TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Test: Projection with different field than Filter
+    /// SELECT id, s['label'] FROM t WHERE s['value'] > 150
+    /// Both s['label'] and s['value'] should be in a single extraction projection.
+    #[test]
+    fn test_projection_different_field_from_filter() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "value").gt(lit(150)))?
+            .project(vec![col("user"), leaf_udf(col("user"), "label")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.user, leaf_udf(test.user, Utf8("label"))
+          Filter: leaf_udf(test.user, Utf8("value")) > Int32(150)
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: test.user, leaf_udf(test.user, Utf8("label"))
+          Projection: test.user
+            Filter: __datafusion_extracted_1 > Int32(150)
+              Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user
+                TableScan: test projection=[user]
+
+        ## After Pushdown
+        Projection: test.user, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("label"))
+          Filter: __datafusion_extracted_1 > Int32(150)
+            Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("label")) AS __datafusion_extracted_2
+              TableScan: test projection=[user]
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    #[test]
+    fn test_projection_deduplication() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let field = leaf_udf(col("user"), "name");
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![field.clone(), field.clone().alias("name2")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2
+          TableScan: test projection=[user]
+        "#)
+    }
+
+    // =========================================================================
+    // Additional tests for code coverage
+    // =========================================================================
+
+    /// Extractions push through Sort nodes to reach the TableScan.
+    #[test]
+    fn test_extract_through_sort() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .sort(vec![col("user").sort(true, true)])?
+            .project(vec![leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name"))
+          Sort: test.user ASC NULLS FIRST
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name"))
+          Sort: test.user ASC NULLS FIRST
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Extractions push through Limit nodes to reach the TableScan.
+    #[test]
+    fn test_extract_through_limit() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .limit(0, Some(10))?
+            .project(vec![leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name"))
+          Limit: skip=0, fetch=10
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name"))
+          Limit: skip=0, fetch=10
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name"))
+          Limit: skip=0, fetch=10
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Aliased aggregate functions like count(...).alias("cnt") are handled.
+    #[test]
+    fn test_extract_from_aliased_aggregate() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(
+                vec![col("user")],
+                vec![count(leaf_udf(col("user"), "value")).alias("cnt")],
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Aggregate: groupBy=[[test.user]], aggr=[[COUNT(leaf_udf(test.user, Utf8("value"))) AS cnt]]
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]]
+          Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Aggregates with no MoveTowardsLeafNodes expressions return unchanged.
+    #[test]
+    fn test_aggregate_no_extraction() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("a")], vec![count(col("b"))])?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        Aggregate: groupBy=[[test.a]], aggr=[[COUNT(test.b)]]
+          TableScan: test projection=[a, b]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    /// Projections containing extracted expression aliases are skipped (already extracted).
+    #[test]
+    fn test_skip_extracted_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![
+                leaf_udf(col("user"), "name").alias("__datafusion_extracted_manual"),
+                col("user"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_manual, test.user
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Multiple extractions merge into a single extracted expression projection.
+    #[test]
+    fn test_merge_into_existing_extracted_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "status").eq(lit("active")))?
+            .filter(leaf_udf(col("user"), "name").is_not_null())?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.user, Utf8("name")) IS NOT NULL
+          Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user
+          Filter: __datafusion_extracted_1 IS NOT NULL
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user
+              Projection: test.id, test.user
+                Filter: __datafusion_extracted_2 = Utf8("active")
+                  Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user
+                    TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, test.user
+          Filter: __datafusion_extracted_1 IS NOT NULL
+            Filter: __datafusion_extracted_2 = Utf8("active")
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+                TableScan: test projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, test.user
+          Filter: __datafusion_extracted_1 IS NOT NULL
+            Projection: test.id, test.user, __datafusion_extracted_1
+              Filter: __datafusion_extracted_2 = Utf8("active")
+                Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+                  TableScan: test projection=[id, user]
+        "#)
+    }
+
+    /// Extractions push through passthrough projections (columns only).
+    #[test]
+    fn test_extract_through_passthrough_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("user")])?
+            .project(vec![leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name"))
+          TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name"))
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+            TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name"))
+          TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Projections with aliased columns (nothing to extract) return unchanged.
+    #[test]
+    fn test_projection_early_return_no_extraction() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a").alias("x"), col("b")])?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        Projection: test.a AS x, test.b
+          TableScan: test projection=[a, b]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    /// Projections with arithmetic expressions but no MoveTowardsLeafNodes return unchanged.
+    #[test]
+    fn test_projection_with_arithmetic_no_extraction() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![(col("a") + col("b")).alias("sum")])?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        Projection: test.a + test.b AS sum
+          TableScan: test projection=[a, b]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    /// Aggregate extractions merge into existing extracted projection created by Filter.
+    #[test]
+    fn test_aggregate_merge_into_extracted_projection() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "status").eq(lit("active")))?
+            .aggregate(vec![leaf_udf(col("user"), "name")], vec![count(lit(1))])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Aggregate: groupBy=[[leaf_udf(test.user, Utf8("name"))]], aggr=[[COUNT(Int32(1))]]
+          Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1))
+          Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]]
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+              Projection: test.user
+                Filter: __datafusion_extracted_2 = Utf8("active")
+                  Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user
+                    TableScan: test projection=[user]
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1))
+          Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]]
+            Filter: __datafusion_extracted_2 = Utf8("active")
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+                TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1))
+          Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]]
+            Projection: __datafusion_extracted_1
+              Filter: __datafusion_extracted_2 = Utf8("active")
+                Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+                  TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Projection containing a MoveTowardsLeafNodes sub-expression above an
+    /// Aggregate. Aggregate blocks pushdown, so the (None, true) recovery
+    /// fallback path fires: in-place extraction + recovery projection.
+    #[test]
+    fn test_projection_with_leaf_expr_above_aggregate() -> Result<()> {
+        use datafusion_expr::test::function_stub::count;
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("user")], vec![count(lit(1))])?
+            .project(vec![
+                leaf_udf(col("user"), "name")
+                    .is_not_null()
+                    .alias("has_name"),
+                col("COUNT(Int32(1))"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1))
+          Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]]
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, COUNT(Int32(1))
+          Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user, COUNT(Int32(1))
+            Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]]
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1))
+          Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]]
+            TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Merging adds new pass-through columns not in the existing extracted projection.
+    #[test]
+    fn test_merge_with_new_columns() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("a"), "x").eq(lit(1)))?
+            .filter(leaf_udf(col("b"), "y").eq(lit(2)))?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.b, Utf8("y")) = Int32(2)
+          Filter: leaf_udf(test.a, Utf8("x")) = Int32(1)
+            TableScan: test projection=[a, b, c]
+
+        ## After Extraction
+        Projection: test.a, test.b, test.c
+          Filter: __datafusion_extracted_1 = Int32(2)
+            Projection: leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c
+              Projection: test.a, test.b, test.c
+                Filter: __datafusion_extracted_2 = Int32(1)
+                  Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c
+                    TableScan: test projection=[a, b, c]
+
+        ## After Pushdown
+        Projection: test.a, test.b, test.c
+          Filter: __datafusion_extracted_1 = Int32(2)
+            Filter: __datafusion_extracted_2 = Int32(1)
+              Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1
+                TableScan: test projection=[a, b, c]
+
+        ## Optimized
+        Projection: test.a, test.b, test.c
+          Filter: __datafusion_extracted_1 = Int32(2)
+            Projection: test.a, test.b, test.c, __datafusion_extracted_1
+              Filter: __datafusion_extracted_2 = Int32(1)
+                Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1
+                  TableScan: test projection=[a, b, c]
+        "#)
+    }
+
+    // =========================================================================
+    // Join extraction tests
+    // =========================================================================
+
+    /// Create a second table scan with struct field for join tests
+    fn test_table_scan_with_struct_named(name: &str) -> Result<LogicalPlan> {
+        use arrow::datatypes::Schema;
+        let schema = Schema::new(test_table_scan_with_struct_fields());
+        datafusion_expr::logical_plan::table_scan(Some(name), &schema, None)?.build()
+    }
+
+    /// Extraction from equijoin keys (`on` expressions).
+    #[test]
+    fn test_extract_from_join_on() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_with_expr_keys(
+                right,
+                JoinType::Inner,
+                (
+                    vec![leaf_udf(col("user"), "id")],
+                    vec![leaf_udf(col("user"), "id")],
+                ),
+                None,
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Inner Join: leaf_udf(test.user, Utf8("id")) = leaf_udf(right.user, Utf8("id"))
+          TableScan: test projection=[id, user]
+          TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2
+            Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user
+              TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Extraction from non-equi join filter.
+    #[test]
+    fn test_extract_from_join_filter() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_on(
+                right,
+                JoinType::Inner,
+                vec![
+                    col("test.user").eq(col("right.user")),
+                    leaf_udf(col("test.user"), "status").eq(lit("active")),
+                ],
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Inner Join:  Filter: test.user = right.user AND leaf_udf(test.user, Utf8("status")) = Utf8("active")
+          TableScan: test projection=[id, user]
+          TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Inner Join:  Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+            TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Extraction from both left and right sides of a join.
+    #[test]
+    fn test_extract_from_join_both_sides() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_on(
+                right,
+                JoinType::Inner,
+                vec![
+                    col("test.user").eq(col("right.user")),
+                    leaf_udf(col("test.user"), "status").eq(lit("active")),
+                    leaf_udf(col("right.user"), "role").eq(lit("admin")),
+                ],
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Inner Join:  Filter: test.user = right.user AND leaf_udf(test.user, Utf8("status")) = Utf8("active") AND leaf_udf(right.user, Utf8("role")) = Utf8("admin")
+          TableScan: test projection=[id, user]
+          TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Inner Join:  Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user
+              TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// Join with no MoveTowardsLeafNodes expressions returns unchanged.
+    #[test]
+    fn test_extract_from_join_no_extraction() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan()?;
+        let right = test_table_scan_with_name("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join(right, JoinType::Inner, (vec!["a"], vec!["a"]), None)?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        Inner Join: test.a = right.a
+          TableScan: test projection=[a, b, c]
+          TableScan: right projection=[a, b, c]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    /// Join followed by filter with extraction.
+    #[test]
+    fn test_extract_from_filter_above_join() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_with_expr_keys(
+                right,
+                JoinType::Inner,
+                (
+                    vec![leaf_udf(col("user"), "id")],
+                    vec![leaf_udf(col("user"), "id")],
+                ),
+                None,
+            )?
+            .filter(leaf_udf(col("test.user"), "status").eq(lit("active")))?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+          Inner Join: leaf_udf(test.user, Utf8("id")) = leaf_udf(right.user, Utf8("id"))
+            TableScan: test projection=[id, user]
+            TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user
+              Projection: test.id, test.user, right.id, right.user
+                Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3
+                  Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user
+                    TableScan: test projection=[id, user]
+                  Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user
+                    TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, test.user, right.id, right.user
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3
+              Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1
+                TableScan: test projection=[id, user]
+              Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user
+                TableScan: right projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, test.user, right.id, right.user
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user
+              Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3
+                Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1
+                  TableScan: test projection=[id, user]
+                Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user
+                  TableScan: right projection=[id, user]
+        "#)
+    }
+
+    /// Extraction projection (get_field in SELECT) above a Join pushes into
+    /// the correct input side.
+    #[test]
+    fn test_extract_projection_above_join() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join(right, JoinType::Inner, (vec!["id"], vec!["id"]), None)?
+            .project(vec![
+                leaf_udf(col("test.user"), "status"),
+                leaf_udf(col("right.user"), "role"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(test.user, Utf8("status")), leaf_udf(right.user, Utf8("role"))
+          Inner Join: test.id = right.id
+            TableScan: test projection=[id, user]
+            TableScan: right projection=[id, user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role"))
+          Inner Join: test.id = right.id
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user
+              TableScan: right projection=[id, user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role"))
+          Inner Join: test.id = right.id
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id
+              TableScan: right projection=[id, user]
+        "#)
+    }
+
+    /// Join where both sides have same-named columns: a qualified reference
+    /// to the right side must be routed to the right input, not the left.
+    #[test]
+    fn test_extract_from_join_qualified_right_side() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        // Filter references right.user explicitly — must route to right side
+        let plan = LogicalPlanBuilder::from(left)
+            .join_on(
+                right,
+                JoinType::Inner,
+                vec![
+                    col("test.id").eq(col("right.id")),
+                    leaf_udf(col("right.user"), "status").eq(lit("active")),
+                ],
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Inner Join:  Filter: test.id = right.id AND leaf_udf(right.user, Utf8("status")) = Utf8("active")
+          TableScan: test projection=[id, user]
+          TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Inner Join:  Filter: test.id = right.id AND __datafusion_extracted_1 = Utf8("active")
+            TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user
+              TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    /// When both inputs contain the same unqualified column, an unqualified
+    /// column reference is ambiguous and `find_owning_input` must return
+    /// `None` rather than always returning 0 (the left side).
+    #[test]
+    fn test_find_owning_input_ambiguous_unqualified_column() {
+        use std::collections::HashSet;
+
+        // Simulate schema_columns output for two sides of a join where both
+        // have a "user" column — each set contains the qualified and
+        // unqualified form.
+        let left_cols: HashSet<Column> = [
+            Column::new(Some("test"), "user"),
+            Column::new_unqualified("user"),
+        ]
+        .into_iter()
+        .collect();
+
+        let right_cols: HashSet<Column> = [
+            Column::new(Some("right"), "user"),
+            Column::new_unqualified("user"),
+        ]
+        .into_iter()
+        .collect();
+
+        let input_column_sets = vec![left_cols, right_cols];
+
+        // Unqualified "user" matches both sets — must return None (ambiguous)
+        let unqualified = Expr::Column(Column::new_unqualified("user"));
+        assert_eq!(find_owning_input(&unqualified, &input_column_sets), None);
+
+        // Qualified "right.user" matches only the right set — must return Some(1)
+        let qualified_right = Expr::Column(Column::new(Some("right"), "user"));
+        assert_eq!(
+            find_owning_input(&qualified_right, &input_column_sets),
+            Some(1)
+        );
+
+        // Qualified "test.user" matches only the left set — must return Some(0)
+        let qualified_left = Expr::Column(Column::new(Some("test"), "user"));
+        assert_eq!(
+            find_owning_input(&qualified_left, &input_column_sets),
+            Some(0)
+        );
+    }
+
+    /// Two leaf_udf expressions from different sides of a Join in a Filter.
+    /// Each is routed to its respective input side independently.
+    #[test]
+    fn test_extract_from_join_cross_input_expression() -> Result<()> {
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_on(
+                right,
+                datafusion_expr::JoinType::Inner,
+                vec![col("test.id").eq(col("right.id"))],
+            )?
+            .filter(
+                leaf_udf(col("test.user"), "status")
+                    .eq(leaf_udf(col("right.user"), "status")),
+            )?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(test.user, Utf8("status")) = leaf_udf(right.user, Utf8("status"))
+          Inner Join:  Filter: test.id = right.id
+            TableScan: test projection=[id, user]
+            TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, test.user, right.id, right.user
+          Filter: __datafusion_extracted_1 = __datafusion_extracted_2
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, right.id, right.user
+              Inner Join:  Filter: test.id = right.id
+                TableScan: test projection=[id, user]
+                TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, test.user, right.id, right.user
+          Filter: __datafusion_extracted_1 = __datafusion_extracted_2
+            Inner Join:  Filter: test.id = right.id
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+                TableScan: test projection=[id, user]
+              Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user
+                TableScan: right projection=[id, user]
+
+        ## Optimized
+        (same as after pushdown)
+        "#)
+    }
+
+    // =========================================================================
+    // Column-rename through intermediate node tests
+    // =========================================================================
+
+    /// Projection with leaf expr above Filter above renaming Projection.
+    #[test]
+    fn test_extract_through_filter_with_column_rename() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("user").alias("x")])?
+            .filter(col("x").is_not_null())?
+            .project(vec![leaf_udf(col("x"), "a")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(x, Utf8("a"))
+          Filter: x IS NOT NULL
+            Projection: test.user AS x
+              TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a"))
+          Filter: x IS NOT NULL
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a"))
+          Filter: x IS NOT NULL
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Same as above but with a partial extraction (leaf + arithmetic).
+    #[test]
+    fn test_extract_partial_through_filter_with_column_rename() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("user").alias("x")])?
+            .filter(col("x").is_not_null())?
+            .project(vec![leaf_udf(col("x"), "a").is_not_null()])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(x, Utf8("a")) IS NOT NULL
+          Filter: x IS NOT NULL
+            Projection: test.user AS x
+              TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL
+          Filter: x IS NOT NULL
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL
+          Filter: x IS NOT NULL
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Tests merge_into_extracted_projection path through a renaming projection.
+    #[test]
+    fn test_extract_from_filter_above_renaming_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("user").alias("x")])?
+            .filter(leaf_udf(col("x"), "a").eq(lit("active")))?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Filter: leaf_udf(x, Utf8("a")) = Utf8("active")
+          Projection: test.user AS x
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: x
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        Projection: x
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    // =========================================================================
+    // SubqueryAlias extraction tests
+    // =========================================================================
+
+    /// Extraction projection pushes through SubqueryAlias.
+    #[test]
+    fn test_extract_through_subquery_alias() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .alias("sub")?
+            .project(vec![leaf_udf(col("sub.user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(sub.user, Utf8("name"))
+          SubqueryAlias: sub
+            TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name"))
+          SubqueryAlias: sub
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+              TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name"))
+          SubqueryAlias: sub
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+              TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Extraction projection pushes through SubqueryAlias + Filter.
+    #[test]
+    fn test_extract_through_subquery_alias_with_filter() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .alias("sub")?
+            .filter(leaf_udf(col("sub.user"), "status").eq(lit("active")))?
+            .project(vec![leaf_udf(col("sub.user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(sub.user, Utf8("name"))
+          Filter: leaf_udf(sub.user, Utf8("status")) = Utf8("active")
+            SubqueryAlias: sub
+              TableScan: test projection=[user]
+
+        ## After Extraction
+        Projection: leaf_udf(sub.user, Utf8("name"))
+          Projection: sub.user
+            Filter: __datafusion_extracted_1 = Utf8("active")
+              Projection: leaf_udf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user
+                SubqueryAlias: sub
+                  TableScan: test projection=[user]
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            SubqueryAlias: sub
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user
+                TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            SubqueryAlias: sub
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2
+                TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Two layers of SubqueryAlias: extraction pushes through both.
+    #[test]
+    fn test_extract_through_nested_subquery_alias() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .alias("inner_sub")?
+            .alias("outer_sub")?
+            .project(vec![leaf_udf(col("outer_sub.user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: leaf_udf(outer_sub.user, Utf8("name"))
+          SubqueryAlias: outer_sub
+            SubqueryAlias: inner_sub
+              TableScan: test projection=[user]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name"))
+          SubqueryAlias: outer_sub
+            SubqueryAlias: inner_sub
+              Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user
+                TableScan: test projection=[user]
+
+        ## Optimized
+        Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name"))
+          SubqueryAlias: outer_sub
+            SubqueryAlias: inner_sub
+              Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1
+                TableScan: test projection=[user]
+        "#)
+    }
+
+    /// Plain columns through SubqueryAlias -- no extraction needed.
+    #[test]
+    fn test_subquery_alias_no_extraction() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .alias("sub")?
+            .project(vec![col("sub.a"), col("sub.b")])?
+            .build()?;
+
+        assert_stages!(plan, @"
+        ## Original Plan
+        SubqueryAlias: sub
+          TableScan: test projection=[a, b]
+
+        ## After Extraction
+        (same as original)
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        (same as after pushdown)
+        ")
+    }
+
+    /// Two UDFs with the same `name()` but different concrete types should NOT be
+    /// deduplicated -- they are semantically different expressions that happen to
+    /// collide on `schema_name()`.
+    #[test]
+    fn test_different_udfs_same_schema_name_not_deduplicated() -> Result<()> {
+        let udf_a = Arc::new(ScalarUDF::new_from_impl(
+            PlacementTestUDF::new()
+                .with_placement(ExpressionPlacement::MoveTowardsLeafNodes)
+                .with_id(1),
+        ));
+        let udf_b = Arc::new(ScalarUDF::new_from_impl(
+            PlacementTestUDF::new()
+                .with_placement(ExpressionPlacement::MoveTowardsLeafNodes)
+                .with_id(2),
+        ));
+
+        let expr_a = Expr::ScalarFunction(ScalarFunction::new_udf(
+            udf_a,
+            vec![col("user"), lit("field")],
+        ));
+        let expr_b = Expr::ScalarFunction(ScalarFunction::new_udf(
+            udf_b,
+            vec![col("user"), lit("field")],
+        ));
+
+        // Verify preconditions: same schema_name but different Expr
+        assert_eq!(
+            expr_a.schema_name().to_string(),
+            expr_b.schema_name().to_string(),
+            "Both expressions should have the same schema_name"
+        );
+        assert_ne!(
+            expr_a, expr_b,
+            "Expressions should NOT be equal (different UDF instances)"
+        );
+
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(expr_a.clone().eq(lit("a")).and(expr_b.clone().eq(lit("b"))))?
+            .select(vec![
+                table_scan
+                    .schema()
+                    .index_of_column_by_name(None, "id")
+                    .unwrap(),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id
+          Filter: leaf_udf(test.user, Utf8("field")) = Utf8("a") AND leaf_udf(test.user, Utf8("field")) = Utf8("b")
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id
+          Projection: test.id, test.user
+            Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b")
+              Projection: leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user
+                TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        (same as after extraction)
+
+        ## Optimized
+        Projection: test.id
+          Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b")
+            Projection: leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id
+              TableScan: test projection=[id, user]
+        "#)
+    }
+
+    // =========================================================================
+    // Filter pushdown interaction tests
+    // =========================================================================
+
+    /// Extraction pushdown through a filter that already had its own
+    /// `leaf_udf` extracted.
+    #[test]
+    fn test_extraction_pushdown_through_filter_with_extracted_predicate() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "status").eq(lit("active")))?
+            .project(vec![col("id"), leaf_udf(col("user"), "name")])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id, leaf_udf(test.user, Utf8("name"))
+          Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active")
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, leaf_udf(test.user, Utf8("name"))
+          Projection: test.id, test.user
+            Filter: __datafusion_extracted_1 = Utf8("active")
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+                TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2
+              TableScan: test projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name"))
+          Filter: __datafusion_extracted_1 = Utf8("active")
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2
+              TableScan: test projection=[id, user]
+        "#)
+    }
+
+    /// Same expression in filter predicate and projection output.
+    #[test]
+    fn test_extraction_pushdown_same_expr_in_filter_and_projection() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let field_expr = leaf_udf(col("user"), "status");
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(field_expr.clone().gt(lit(5)))?
+            .project(vec![col("id"), field_expr])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id, leaf_udf(test.user, Utf8("status"))
+          Filter: leaf_udf(test.user, Utf8("status")) > Int32(5)
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, leaf_udf(test.user, Utf8("status"))
+          Projection: test.id, test.user
+            Filter: __datafusion_extracted_1 > Int32(5)
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+                TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status"))
+          Filter: __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2
+              TableScan: test projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status"))
+          Filter: __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2
+              TableScan: test projection=[id, user]
+        "#)
+    }
+
+    /// Left join with a `leaf_udf` filter on the right side AND
+    /// the projection also selects `leaf_udf` from the right side.
+    #[test]
+    fn test_left_join_with_filter_and_projection_extraction() -> Result<()> {
+        use datafusion_expr::JoinType;
+
+        let left = test_table_scan_with_struct()?;
+        let right = test_table_scan_with_struct_named("right")?;
+
+        let plan = LogicalPlanBuilder::from(left)
+            .join_on(
+                right,
+                JoinType::Left,
+                vec![
+                    col("test.id").eq(col("right.id")),
+                    leaf_udf(col("right.user"), "status").gt(lit(5)),
+                ],
+            )?
+            .project(vec![
+                col("test.id"),
+                leaf_udf(col("test.user"), "name"),
+                leaf_udf(col("right.user"), "status"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(right.user, Utf8("status"))
+          Left Join:  Filter: test.id = right.id AND leaf_udf(right.user, Utf8("status")) > Int32(5)
+            TableScan: test projection=[id, user]
+            TableScan: right projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(right.user, Utf8("status"))
+          Projection: test.id, test.user, right.id, right.user
+            Left Join:  Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5)
+              TableScan: test projection=[id, user]
+              Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user
+                TableScan: right projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status"))
+          Left Join:  Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_3
+              TableScan: right projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status"))
+          Left Join:  Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id
+              TableScan: test projection=[id, user]
+            Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_3
+              TableScan: right projection=[id, user]
+        "#)
+    }
+
+    /// Extraction projection pushed through a filter whose predicate
+    /// references a different extracted expression.
+    #[test]
+    fn test_pure_extraction_proj_push_through_filter() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(leaf_udf(col("user"), "status").gt(lit(5)))?
+            .project(vec![
+                col("id"),
+                leaf_udf(col("user"), "name"),
+                leaf_udf(col("user"), "status"),
+            ])?
+            .build()?;
+
+        assert_stages!(plan, @r#"
+        ## Original Plan
+        Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("status"))
+          Filter: leaf_udf(test.user, Utf8("status")) > Int32(5)
+            TableScan: test projection=[id, user]
+
+        ## After Extraction
+        Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("status"))
+          Projection: test.id, test.user
+            Filter: __datafusion_extracted_1 > Int32(5)
+              Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user
+                TableScan: test projection=[id, user]
+
+        ## After Pushdown
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status"))
+          Filter: __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3
+              TableScan: test projection=[id, user]
+
+        ## Optimized
+        Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status"))
+          Filter: __datafusion_extracted_1 > Int32(5)
+            Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3
+              TableScan: test projection=[id, user]
+        "#)
+    }
+
+    /// When an extraction projection's __extracted alias references a column
+    /// (e.g. `user`) that is NOT a standalone expression in the projection,
+    /// the merge into the inner projection should still succeed.
+    #[test]
+    fn test_merge_extraction_into_projection_with_column_ref_inflation() -> Result<()> {
+        let table_scan = test_table_scan_with_struct()?;
+
+        // Inner projection (simulates a trimmed projection)
+        let inner = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("user"), col("id")])?
+            .build()?;
+
+        // Outer projection: __extracted alias + id (but NOT user as standalone).
+        // The alias references `user` internally, inflating columns_needed.
+        let plan = LogicalPlanBuilder::from(inner)
+            .project(vec![
+                leaf_udf(col("user"), "status")
+                    .alias(format!("{EXTRACTED_EXPR_PREFIX}_1")),
+                col("id"),
+            ])?
+            .build()?;
+
+        // Run only PushDownLeafProjections
+        let ctx = OptimizerContext::new().with_max_passes(1);
+        let optimizer =
+            Optimizer::with_rules(vec![Arc::new(PushDownLeafProjections::new())]);
+        let result = optimizer.optimize(plan, &ctx, |_, _| {})?;
+
+        // With the fix: merge succeeds → extraction merged into inner projection.
+        // Without the fix: merge rejected → two separate projections remain.
+        insta::assert_snapshot!(format!("{result}"), @r#"
+        Projection: __datafusion_extracted_1, test.id
+          Projection: test.user, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1
+            TableScan: test
+        "#);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/optimizer/src/filter_null_join_keys.rs b/datafusion/optimizer/src/filter_null_join_keys.rs
index 14a424b32687f..c8f419d3e543e 100644
--- a/datafusion/optimizer/src/filter_null_join_keys.rs
+++ b/datafusion/optimizer/src/filter_null_join_keys.rs
@@ -21,9 +21,9 @@ use crate::optimizer::ApplyOrder;
 use crate::push_down_filter::on_lr_is_preserved;
 use crate::{OptimizerConfig, OptimizerRule};
 use datafusion_common::tree_node::Transformed;
-use datafusion_common::Result;
+use datafusion_common::{NullEquality, Result};
 use datafusion_expr::utils::conjunction;
-use datafusion_expr::{logical_plan::Filter, Expr, ExprSchemable, LogicalPlan};
+use datafusion_expr::{Expr, ExprSchemable, LogicalPlan, logical_plan::Filter};
 use std::sync::Arc;
 
 /// The FilterNullJoinKeys rule will identify joins with equi-join conditions
@@ -51,7 +51,8 @@ impl OptimizerRule for FilterNullJoinKeys {
         }
         match plan {
             LogicalPlan::Join(mut join)
-                if !join.on.is_empty() && !join.null_equals_null =>
+                if !join.on.is_empty()
+                    && join.null_equality == NullEquality::NullEqualsNothing =>
             {
                 let (left_preserved, right_preserved) =
                     on_lr_is_preserved(join.join_type);
@@ -107,12 +108,12 @@ fn create_not_null_predicate(filters: Vec<Expr>) -> Expr {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::assert_optimized_plan_eq_snapshot;
     use crate::OptimizerContext;
+    use crate::assert_optimized_plan_eq_snapshot;
     use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::Column;
     use datafusion_expr::logical_plan::table_scan;
-    use datafusion_expr::{col, lit, JoinType, LogicalPlanBuilder};
+    use datafusion_expr::{JoinType, LogicalPlanBuilder, col, lit};
 
     macro_rules! assert_optimized_plan_equal {
         (
diff --git a/datafusion/optimizer/src/join_key_set.rs b/datafusion/optimizer/src/join_key_set.rs
index 0a97173b30966..de795c0aeacfa 100644
--- a/datafusion/optimizer/src/join_key_set.rs
+++ b/datafusion/optimizer/src/join_key_set.rs
@@ -157,7 +157,7 @@ impl Equivalent<(Expr, Expr)> for ExprPair<'_> {
 #[cfg(test)]
 mod test {
     use crate::join_key_set::JoinKeySet;
-    use datafusion_expr::{col, Expr};
+    use datafusion_expr::{Expr, col};
 
     #[test]
     fn test_insert() {
diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs
index 280010e3d92c0..e610091824092 100644
--- a/datafusion/optimizer/src/lib.rs
+++ b/datafusion/optimizer/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! # DataFusion Optimizer
 //!
@@ -48,17 +49,24 @@ pub mod eliminate_filter;
 pub mod eliminate_group_by_constant;
 pub mod eliminate_join;
 pub mod eliminate_limit;
-pub mod eliminate_nested_union;
-pub mod eliminate_one_union;
+#[deprecated(since = "52.0.0", note = "Please use OptimizeUnions instead")]
+pub mod eliminate_nested_union {
+    use crate::optimize_unions::OptimizeUnions;
+    #[deprecated(since = "52.0.0", note = "Please use OptimizeUnions instead")]
+    pub type EliminateNestedUnion = OptimizeUnions;
+}
 pub mod eliminate_outer_join;
 pub mod extract_equijoin_predicate;
+pub mod extract_leaf_expressions;
 pub mod filter_null_join_keys;
 pub mod optimize_projections;
+pub mod optimize_unions;
 pub mod optimizer;
 pub mod propagate_empty_relation;
 pub mod push_down_filter;
 pub mod push_down_limit;
 pub mod replace_distinct_aggregate;
+pub mod rewrite_set_comparison;
 pub mod scalar_subquery_to_join;
 pub mod simplify_expressions;
 pub mod single_distinct_to_groupby;
diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs
index 926315eb86297..93df300bb50b4 100644
--- a/datafusion/optimizer/src/optimize_projections/mod.rs
+++ b/datafusion/optimizer/src/optimize_projections/mod.rs
@@ -25,13 +25,13 @@ use std::collections::HashSet;
 use std::sync::Arc;
 
 use datafusion_common::{
-    get_required_group_by_exprs_indices, internal_datafusion_err, internal_err, Column,
-    HashMap, JoinType, Result,
+    Column, DFSchema, HashMap, JoinType, Result, assert_eq_or_internal_err,
+    get_required_group_by_exprs_indices, internal_datafusion_err, internal_err,
 };
 use datafusion_expr::expr::Alias;
-use datafusion_expr::Unnest;
 use datafusion_expr::{
-    logical_plan::LogicalPlan, Aggregate, Distinct, Expr, Projection, TableScan, Window,
+    Aggregate, Distinct, EmptyRelation, Expr, Projection, TableScan, Unnest, Window,
+    logical_plan::LogicalPlan,
 };
 
 use crate::optimize_projections::required_indices::RequiredIndices;
@@ -55,11 +55,29 @@ use datafusion_common::tree_node::{
 /// The rule analyzes the input logical plan, determines the necessary column
 /// indices, and then removes any unnecessary columns. It also removes any
 /// unnecessary projections from the plan tree.
+///
+/// ## Schema, Field Properties, and Metadata Handling
+///
+/// The `OptimizeProjections` rule preserves schema and field metadata in most optimization scenarios:
+///
+/// **Schema-level metadata preservation by plan type**:
+/// - **Window and Aggregate plans**: Schema metadata is preserved
+/// - **Projection plans**: Schema metadata is preserved per [`projection_schema`](datafusion_expr::logical_plan::projection_schema).
+/// - **Other logical plans**: Schema metadata is preserved unless [`LogicalPlan::recompute_schema`]
+///   is called on plan types that drop metadata
+///
+/// **Field-level properties and metadata**: Individual field properties are preserved when fields
+/// are retained in the optimized plan, determined by [`exprlist_to_fields`](datafusion_expr::utils::exprlist_to_fields)
+/// and [`ExprSchemable::to_field`](datafusion_expr::expr_schema::ExprSchemable::to_field).
+///
+/// **Field precedence**: When the same field appears multiple times, the optimizer
+/// maintains one occurrence and removes duplicates (refer to `RequiredIndices::compact()`),
+/// preserving the properties and metadata of that occurrence.
 #[derive(Default, Debug)]
 pub struct OptimizeProjections {}
 
 impl OptimizeProjections {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -120,7 +138,7 @@ fn optimize_projections(
         LogicalPlan::Projection(proj) => {
             return merge_consecutive_projections(proj)?.transform_data(|proj| {
                 rewrite_projection_given_requirements(proj, config, &indices)
-            })
+            });
         }
         LogicalPlan::Aggregate(aggregate) => {
             // Split parent requirements to GROUP BY and aggregate sections:
@@ -153,23 +171,16 @@ fn optimize_projections(
 
             // Only use the absolutely necessary aggregate expressions required
             // by the parent:
-            let mut new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr);
-
-            // Aggregations always need at least one aggregate expression.
-            // With a nested count, we don't require any column as input, but
-            // still need to create a correct aggregate, which may be optimized
-            // out later. As an example, consider the following query:
-            //
-            // SELECT count(*) FROM (SELECT count(*) FROM [...])
-            //
-            // which always returns 1.
-            if new_aggr_expr.is_empty()
-                && new_group_bys.is_empty()
-                && !aggregate.aggr_expr.is_empty()
-            {
-                // take the old, first aggregate expression
-                new_aggr_expr = aggregate.aggr_expr;
-                new_aggr_expr.resize_with(1, || unreachable!());
+            let new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr);
+
+            if new_group_bys.is_empty() && new_aggr_expr.is_empty() {
+                // Global aggregation with no aggregate functions always produces 1 row and no columns.
+                return Ok(Transformed::yes(LogicalPlan::EmptyRelation(
+                    EmptyRelation {
+                        produce_one_row: true,
+                        schema: Arc::new(DFSchema::empty()),
+                    },
+                )));
             }
 
             let all_exprs_iter = new_group_bys.iter().chain(new_aggr_expr.iter());
@@ -257,15 +268,10 @@ fn optimize_projections(
                 Some(projection) => indices.into_mapped_indices(|idx| projection[idx]),
                 None => indices.into_inner(),
             };
-            return TableScan::try_new(
-                table_name,
-                source,
-                Some(projection),
-                filters,
-                fetch,
-            )
-            .map(LogicalPlan::TableScan)
-            .map(Transformed::yes);
+            let new_scan =
+                TableScan::try_new(table_name, source, Some(projection), filters, fetch)?;
+
+            return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan)));
         }
         // Other node types are handled below
         _ => {}
@@ -330,11 +336,14 @@ fn optimize_projections(
                 return Ok(Transformed::no(plan));
             };
             let children = extension.node.inputs();
-            if children.len() != necessary_children_indices.len() {
-                return internal_err!("Inconsistent length between children and necessary children indices. \
-                Make sure `.necessary_children_exprs` implementation of the `UserDefinedLogicalNode` is \
-                consistent with actual children length for the node.");
-            }
+            assert_eq_or_internal_err!(
+                children.len(),
+                necessary_children_indices.len(),
+                "Inconsistent length between children and necessary children indices. \
+                Make sure `.necessary_children_exprs` implementation of the \
+                `UserDefinedLogicalNode` is consistent with actual children length \
+                for the node."
+            );
             children
                 .into_iter()
                 .zip(necessary_children_indices)
@@ -345,12 +354,35 @@ fn optimize_projections(
                 .collect::<Result<Vec<_>>>()?
         }
         LogicalPlan::EmptyRelation(_)
-        | LogicalPlan::RecursiveQuery(_)
         | LogicalPlan::Values(_)
         | LogicalPlan::DescribeTable(_) => {
             // These operators have no inputs, so stop the optimization process.
             return Ok(Transformed::no(plan));
         }
+        LogicalPlan::RecursiveQuery(recursive) => {
+            // Only allow subqueries that reference the current CTE; nested subqueries are not yet
+            // supported for projection pushdown for simplicity.
+            // TODO: be able to do projection pushdown on recursive CTEs with subqueries
+            if plan_contains_other_subqueries(
+                recursive.static_term.as_ref(),
+                &recursive.name,
+            ) || plan_contains_other_subqueries(
+                recursive.recursive_term.as_ref(),
+                &recursive.name,
+            ) {
+                return Ok(Transformed::no(plan));
+            }
+
+            plan.inputs()
+                .into_iter()
+                .map(|input| {
+                    indices
+                        .clone()
+                        .with_projection_beneficial()
+                        .with_plan_exprs(&plan, input.schema())
+                })
+                .collect::<Result<Vec<_>>>()?
+        }
         LogicalPlan::Join(join) => {
             let left_len = join.left.schema().fields().len();
             let (left_req_indices, right_req_indices) =
@@ -376,22 +408,33 @@ fn optimize_projections(
             );
         }
         LogicalPlan::Unnest(Unnest {
-            dependency_indices, ..
+            input,
+            dependency_indices,
+            ..
         }) => {
-            vec![RequiredIndices::new_from_indices(
-                dependency_indices.clone(),
-            )]
+            // at least provide the indices for the exec-columns as a starting point
+            let required_indices =
+                RequiredIndices::new().with_plan_exprs(&plan, input.schema())?;
+
+            // Add additional required indices from the parent
+            let mut additional_necessary_child_indices = Vec::new();
+            indices.indices().iter().for_each(|idx| {
+                if let Some(index) = dependency_indices.get(*idx) {
+                    additional_necessary_child_indices.push(*index);
+                }
+            });
+            vec![required_indices.append(&additional_necessary_child_indices)]
         }
     };
 
     // Required indices are currently ordered (child0, child1, ...)
     // but the loop pops off the last element, so we need to reverse the order
     child_required_indices.reverse();
-    if child_required_indices.len() != plan.inputs().len() {
-        return internal_err!(
-            "OptimizeProjection: child_required_indices length mismatch with plan inputs"
-        );
-    }
+    assert_eq_or_internal_err!(
+        child_required_indices.len(),
+        plan.inputs().len(),
+        "OptimizeProjection: child_required_indices length mismatch with plan inputs"
+    );
 
     // Rewrite children of the plan
     let transformed_plan = plan.map_children(|child| {
@@ -431,6 +474,18 @@ fn optimize_projections(
 /// appear more than once in its input fields. This can act as a caching mechanism
 /// for non-trivial computations.
 ///
+/// ## Metadata Handling During Projection Merging
+///
+/// **Alias metadata preservation**: When merging projections, alias metadata from both
+/// the current and previous projections is carefully preserved. The presence of metadata
+/// precludes alias trimming.
+///
+/// **Schema, Fields, and metadata**: If a projection is rewritten, the schema and metadata
+/// are preserved. Individual field properties and metadata flows through expression rewriting
+/// and are preserved when fields are referenced in the merged projection.
+/// Refer to [`projection_schema`](datafusion_expr::logical_plan::projection_schema)
+/// for more details.
+///
 /// # Parameters
 ///
 /// * `proj` - A reference to the `Projection` to be merged.
@@ -470,15 +525,14 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project
     expr.iter()
         .for_each(|expr| expr.add_column_ref_counts(&mut column_referral_map));
 
-    // If an expression is non-trivial and appears more than once, do not merge
+    // If an expression is non-trivial (KeepInPlace) and appears more than once, do not merge
     // them as consecutive projections will benefit from a compute-once approach.
     // For details, see: https://github.com/apache/datafusion/issues/8296
     if column_referral_map.into_iter().any(|(col, usage)| {
         usage > 1
-            && !is_expr_trivial(
-                &prev_projection.expr
-                    [prev_projection.schema.index_of_column(col).unwrap()],
-            )
+            && !prev_projection.expr[prev_projection.schema.index_of_column(col).unwrap()]
+                .placement()
+                .should_push_to_leaves()
     }) {
         // no change
         return Projection::try_new_with_schema(expr, input, schema).map(Transformed::no);
@@ -505,7 +559,19 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project
                 metadata,
             }) => rewrite_expr(*expr, &prev_projection).map(|result| {
                 result.update_data(|expr| {
-                    Expr::Alias(Alias::new(expr, relation, name).with_metadata(metadata))
+                    // After substitution, the inner expression may now have the
+                    // same schema_name as the alias (e.g. when an extraction
+                    // alias like `__extracted_1 AS f(x)` is resolved back to
+                    // `f(x)`). Wrapping in a redundant self-alias causes a
+                    // cosmetic `f(x) AS f(x)` due to Display vs schema_name
+                    // formatting differences. Drop the alias when it matches.
+                    if metadata.is_none() && expr.schema_name().to_string() == name {
+                        expr
+                    } else {
+                        Expr::Alias(
+                            Alias::new(expr, relation, name).with_metadata(metadata),
+                        )
+                    }
                 })
             }),
             e => rewrite_expr(e, &prev_projection),
@@ -531,11 +597,6 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project
     }
 }
 
-// Check whether `expr` is trivial; i.e. it doesn't imply any computation.
-fn is_expr_trivial(expr: &Expr) -> bool {
-    matches!(expr, Expr::Column(_) | Expr::Literal(_))
-}
-
 /// Rewrites a projection expression using the projection before it (i.e. its input)
 /// This is a subroutine to the `merge_consecutive_projections` function.
 ///
@@ -554,7 +615,8 @@ fn is_expr_trivial(expr: &Expr) -> bool {
 /// - `Err(error)`: An error occurred during the function call.
 ///
 /// # Notes
-/// This rewrite also removes any unnecessary layers of aliasing.
+/// This rewrite also removes any unnecessary layers of aliasing. "Unnecessary" is
+/// defined as not contributing new information, such as metadata.
 ///
 /// Without trimming, we can end up with unnecessary indirections inside expressions
 /// during projection merges.
@@ -583,8 +645,18 @@ fn is_expr_trivial(expr: &Expr) -> bool {
 fn rewrite_expr(expr: Expr, input: &Projection) -> Result<Transformed<Expr>> {
     expr.transform_up(|expr| {
         match expr {
-            //  remove any intermediate aliases
-            Expr::Alias(alias) => Ok(Transformed::yes(*alias.expr)),
+            //  remove any intermediate aliases if they do not carry metadata
+            Expr::Alias(alias) => {
+                match alias
+                    .metadata
+                    .as_ref()
+                    .map(|h| h.is_empty())
+                    .unwrap_or(true)
+                {
+                    true => Ok(Transformed::yes(*alias.expr)),
+                    false => Ok(Transformed::no(Expr::Alias(alias))),
+                }
+            }
             Expr::Column(col) => {
                 // Find index of column:
                 let idx = input.schema.index_of_column(&col)?;
@@ -662,10 +734,10 @@ fn outer_columns_helper_multi<'a, 'b>(
 /// Depending on the join type, it divides the requirement indices into those
 /// that apply to the left child and those that apply to the right child.
 ///
-/// - For `INNER`, `LEFT`, `RIGHT` and `FULL` joins, the requirements are split
-///   between left and right children. The right child indices are adjusted to
-///   point to valid positions within the right child by subtracting the length
-///   of the left child.
+/// - For `INNER`, `LEFT`, `RIGHT`, `FULL`, `LEFTMARK`, and `RIGHTMARK` joins,
+///   the requirements are split between left and right children. The right
+///   child indices are adjusted to point to valid positions within the right
+///   child by subtracting the length of the left child.
 ///
 /// - For `LEFT ANTI`, `LEFT SEMI`, `RIGHT SEMI` and `RIGHT ANTI` joins, all
 ///   requirements are re-routed to either the left child or the right child
@@ -694,7 +766,8 @@ fn split_join_requirements(
         | JoinType::Left
         | JoinType::Right
         | JoinType::Full
-        | JoinType::LeftMark => {
+        | JoinType::LeftMark
+        | JoinType::RightMark => {
             // Decrease right side indices by `left_len` so that they point to valid
             // positions within the right child:
             indices.split_off(left_len)
@@ -804,6 +877,64 @@ pub fn is_projection_unnecessary(
     ))
 }
 
+/// Returns true if the plan subtree contains any subqueries that are not the
+/// CTE reference itself. This treats any non-CTE [`LogicalPlan::SubqueryAlias`]
+/// node (including aliased relations) as a blocker, along with expression-level
+/// subqueries like scalar, EXISTS, or IN. These cases prevent projection
+/// pushdown for now because we cannot safely reason about their column usage.
+fn plan_contains_other_subqueries(plan: &LogicalPlan, cte_name: &str) -> bool {
+    if let LogicalPlan::SubqueryAlias(alias) = plan
+        && alias.alias.table() != cte_name
+        && !subquery_alias_targets_recursive_cte(alias.input.as_ref(), cte_name)
+    {
+        return true;
+    }
+
+    let mut found = false;
+    plan.apply_expressions(|expr| {
+        if expr_contains_subquery(expr) {
+            found = true;
+            Ok(TreeNodeRecursion::Stop)
+        } else {
+            Ok(TreeNodeRecursion::Continue)
+        }
+    })
+    .expect("expression traversal never fails");
+    if found {
+        return true;
+    }
+
+    plan.inputs()
+        .into_iter()
+        .any(|child| plan_contains_other_subqueries(child, cte_name))
+}
+
+fn expr_contains_subquery(expr: &Expr) -> bool {
+    expr.exists(|e| match e {
+        Expr::ScalarSubquery(_) | Expr::Exists(_) | Expr::InSubquery(_) => Ok(true),
+        _ => Ok(false),
+    })
+    // Safe unwrap since we are doing a simple boolean check
+    .unwrap()
+}
+
+fn subquery_alias_targets_recursive_cte(plan: &LogicalPlan, cte_name: &str) -> bool {
+    match plan {
+        LogicalPlan::TableScan(scan) => scan.table_name.table() == cte_name,
+        LogicalPlan::SubqueryAlias(alias) => {
+            subquery_alias_targets_recursive_cte(alias.input.as_ref(), cte_name)
+        }
+        _ => {
+            let inputs = plan.inputs();
+            if inputs.len() == 1 {
+                subquery_alias_targets_recursive_cte(inputs[0], cte_name)
+            } else {
+                false
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::cmp::Ordering;
@@ -826,14 +957,15 @@ mod tests {
     };
     use datafusion_expr::ExprFunctionExt;
     use datafusion_expr::{
-        binary_expr, build_join_schema,
+        BinaryExpr, Expr, Extension, Like, LogicalPlan, Operator, Projection,
+        UserDefinedLogicalNodeCore, WindowFunctionDefinition, binary_expr,
+        build_join_schema,
         builder::table_scan_with_filters,
         col,
         expr::{self, Cast},
         lit,
         logical_plan::{builder::LogicalPlanBuilder, table_scan},
-        not, try_cast, when, BinaryExpr, Expr, Extension, Like, LogicalPlan, Operator,
-        Projection, UserDefinedLogicalNodeCore, WindowFunctionDefinition,
+        not, try_cast, when,
     };
     use insta::assert_snapshot;
 
@@ -887,6 +1019,8 @@ mod tests {
                 Some(Ordering::Equal) => self.input.partial_cmp(&other.input),
                 cmp => cmp,
             }
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
         }
     }
 
@@ -974,6 +1108,8 @@ mod tests {
                 }
                 cmp => cmp,
             }
+            // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+            .filter(|cmp| *cmp != Ordering::Equal || self == other)
         }
     }
 
@@ -1124,9 +1260,7 @@ mod tests {
             plan,
             @r"
         Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]]
-          Projection:
-            Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]]
-              TableScan: ?table? projection=[]
+          EmptyRelation: rows=1
         "
         )
     }
@@ -1816,7 +1950,7 @@ mod tests {
         let table2_scan = scan_empty(Some("test2"), &schema, None)?.build()?;
 
         let plan = LogicalPlanBuilder::from(table_scan)
-            .join_using(table2_scan, JoinType::Left, vec!["a"])?
+            .join_using(table2_scan, JoinType::Left, vec!["a".into()])?
             .project(vec![col("a"), col("b")])?
             .build()?;
 
@@ -2144,7 +2278,7 @@ mod tests {
     fn test_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let max1 = Expr::WindowFunction(expr::WindowFunction::new(
+        let max1 = Expr::from(expr::WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("test.a")],
         ))
@@ -2152,7 +2286,7 @@ mod tests {
         .build()
         .unwrap();
 
-        let max2 = Expr::WindowFunction(expr::WindowFunction::new(
+        let max2 = Expr::from(expr::WindowFunction::new(
             WindowFunctionDefinition::AggregateUDF(max_udaf()),
             vec![col("test.b")],
         ));
diff --git a/datafusion/optimizer/src/eliminate_nested_union.rs b/datafusion/optimizer/src/optimize_unions.rs
similarity index 79%
rename from datafusion/optimizer/src/eliminate_nested_union.rs
rename to datafusion/optimizer/src/optimize_unions.rs
index f8f93727cd9ba..900757b9a0607 100644
--- a/datafusion/optimizer/src/eliminate_nested_union.rs
+++ b/datafusion/optimizer/src/optimize_unions.rs
@@ -15,30 +15,32 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`EliminateNestedUnion`]: flattens nested `Union` to a single `Union`
+//! [`OptimizeUnions`]: removes `Union` nodes in the logical plan.
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
-use datafusion_common::tree_node::Transformed;
 use datafusion_common::Result;
+use datafusion_common::tree_node::Transformed;
 use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema;
-use datafusion_expr::{Distinct, LogicalPlan, Union};
+use datafusion_expr::{Distinct, LogicalPlan, Projection, Union};
 use itertools::Itertools;
 use std::sync::Arc;
 
 #[derive(Default, Debug)]
-/// An optimization rule that replaces nested unions with a single union.
-pub struct EliminateNestedUnion;
+/// An optimization rule that
+/// 1. replaces nested unions with a single union.
+/// 2. removes unions with a single input.
+pub struct OptimizeUnions;
 
-impl EliminateNestedUnion {
-    #[allow(missing_docs)]
+impl OptimizeUnions {
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
 }
 
-impl OptimizerRule for EliminateNestedUnion {
+impl OptimizerRule for OptimizeUnions {
     fn name(&self) -> &str {
-        "eliminate_nested_union"
+        "optimize_unions"
     }
 
     fn apply_order(&self) -> Option<ApplyOrder> {
@@ -55,6 +57,9 @@ impl OptimizerRule for EliminateNestedUnion {
         _config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>> {
         match plan {
+            LogicalPlan::Union(Union { mut inputs, .. }) if inputs.len() == 1 => Ok(
+                Transformed::yes(Arc::unwrap_or_clone(inputs.pop().unwrap())),
+            ),
             LogicalPlan::Union(Union { inputs, schema }) => {
                 let inputs = inputs
                     .into_iter()
@@ -100,6 +105,38 @@ fn extract_plans_from_union(plan: Arc<LogicalPlan>) -> Vec<LogicalPlan> {
             .into_iter()
             .map(Arc::unwrap_or_clone)
             .collect::<Vec<_>>(),
+        // While unnesting, unwrap a Projection whose input is a nested Union,
+        // flatten the inner Union, and push the same Projection down onto
+        // each of the nested Union’s children.
+        //
+        // Example:
+        //   Union { Projection { Union { plan1, plan2 } }, plan3 }
+        //     => Union { Projection { plan1 }, Projection { plan2 }, plan3 }
+        LogicalPlan::Projection(Projection {
+            expr,
+            input,
+            schema,
+            ..
+        }) => match Arc::unwrap_or_clone(input) {
+            LogicalPlan::Union(Union { inputs, .. }) => inputs
+                .into_iter()
+                .map(Arc::unwrap_or_clone)
+                .map(|plan| {
+                    LogicalPlan::Projection(
+                        Projection::try_new_with_schema(
+                            expr.clone(),
+                            Arc::new(plan),
+                            Arc::clone(&schema),
+                        )
+                        .unwrap(),
+                    )
+                })
+                .collect::<Vec<_>>(),
+
+            plan => vec![LogicalPlan::Projection(
+                Projection::try_new_with_schema(expr, Arc::new(plan), schema).unwrap(),
+            )],
+        },
         plan => vec![plan],
     }
 }
@@ -114,10 +151,10 @@ fn extract_plan_from_distinct(plan: Arc<LogicalPlan>) -> Arc<LogicalPlan> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::analyzer::type_coercion::TypeCoercion;
+    use crate::OptimizerContext;
     use crate::analyzer::Analyzer;
+    use crate::analyzer::type_coercion::TypeCoercion;
     use crate::assert_optimized_plan_eq_snapshot;
-    use crate::OptimizerContext;
     use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{col, logical_plan::table_scan};
@@ -139,7 +176,7 @@ mod tests {
             let analyzed_plan = Analyzer::with_rules(vec![Arc::new(TypeCoercion::new())])
                 .execute_and_check($plan, &options, |_, _| {})?;
             let optimizer_ctx = OptimizerContext::new().with_max_passes(1);
-            let rules: Vec<Arc<dyn crate::OptimizerRule + Send + Sync>> = vec![Arc::new(EliminateNestedUnion::new())];
+            let rules: Vec<Arc<dyn crate::OptimizerRule + Send + Sync>> = vec![Arc::new(OptimizeUnions::new())];
             assert_optimized_plan_eq_snapshot!(
                 optimizer_ctx,
                 rules,
@@ -326,6 +363,27 @@ mod tests {
         ")
     }
 
+    #[test]
+    fn eliminate_nested_union_in_projection() -> Result<()> {
+        let plan_builder = table_scan(Some("table"), &schema(), None)?;
+
+        let plan = plan_builder
+            .clone()
+            .union(plan_builder.clone().build()?)?
+            .project(vec![col("id").alias("table_id"), col("key"), col("value")])?
+            .union(plan_builder.build()?)?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Union
+          Projection: id AS table_id, key, value
+            TableScan: table
+          Projection: id AS table_id, key, value
+            TableScan: table
+          TableScan: table
+        ")
+    }
+
     #[test]
     fn eliminate_nested_union_with_type_cast_projection() -> Result<()> {
         let table_1 = table_scan(
@@ -420,4 +478,26 @@ mod tests {
               TableScan: table_1
         ")
     }
+
+    #[test]
+    fn eliminate_one_union() -> Result<()> {
+        let plan = table_scan(Some("table"), &schema(), None)?.build()?;
+        let schema = Arc::clone(plan.schema());
+        // note it is not possible to create a single input union via
+        // LogicalPlanBuilder so create it manually here
+        let plan = LogicalPlan::Union(Union {
+            inputs: vec![Arc::new(plan)],
+            schema,
+        });
+
+        // Note we can't use the same assert_optimized_plan_equal as creating a
+        // single input union is not possible via LogicalPlanBuilder and other passes
+        // throw errors / don't handle the schema correctly.
+        assert_optimized_plan_eq_snapshot!(
+            OptimizerContext::new().with_max_passes(1),
+            vec![Arc::new(OptimizeUnions::new())],
+            plan,
+            @"TableScan: table"
+        )
+    }
 }
diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs
index 4d2c2c7c79cd5..bdea6a83072cd 100644
--- a/datafusion/optimizer/src/optimizer.rs
+++ b/datafusion/optimizer/src/optimizer.rs
@@ -22,14 +22,14 @@ use std::sync::Arc;
 
 use chrono::{DateTime, Utc};
 use datafusion_expr::registry::FunctionRegistry;
-use datafusion_expr::{assert_expected_schema, InvariantLevel};
+use datafusion_expr::{InvariantLevel, assert_expected_schema};
 use log::{debug, warn};
 
 use datafusion_common::alias::AliasGenerator;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::instant::Instant;
 use datafusion_common::tree_node::{Transformed, TreeNodeRewriter};
-use datafusion_common::{internal_err, DFSchema, DataFusionError, HashSet, Result};
+use datafusion_common::{DFSchema, DataFusionError, HashSet, Result, internal_err};
 use datafusion_expr::logical_plan::LogicalPlan;
 
 use crate::common_subexpr_eliminate::CommonSubexprEliminate;
@@ -41,28 +41,29 @@ use crate::eliminate_filter::EliminateFilter;
 use crate::eliminate_group_by_constant::EliminateGroupByConstant;
 use crate::eliminate_join::EliminateJoin;
 use crate::eliminate_limit::EliminateLimit;
-use crate::eliminate_nested_union::EliminateNestedUnion;
-use crate::eliminate_one_union::EliminateOneUnion;
 use crate::eliminate_outer_join::EliminateOuterJoin;
 use crate::extract_equijoin_predicate::ExtractEquijoinPredicate;
+use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections};
 use crate::filter_null_join_keys::FilterNullJoinKeys;
 use crate::optimize_projections::OptimizeProjections;
+use crate::optimize_unions::OptimizeUnions;
 use crate::plan_signature::LogicalPlanSignature;
 use crate::propagate_empty_relation::PropagateEmptyRelation;
 use crate::push_down_filter::PushDownFilter;
 use crate::push_down_limit::PushDownLimit;
 use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate;
+use crate::rewrite_set_comparison::RewriteSetComparison;
 use crate::scalar_subquery_to_join::ScalarSubqueryToJoin;
 use crate::simplify_expressions::SimplifyExpressions;
 use crate::single_distinct_to_groupby::SingleDistinctToGroupBy;
 use crate::utils::log_plan;
 
-/// `OptimizerRule`s transforms one [`LogicalPlan`] into another which
-/// computes the same results, but in a potentially more efficient
-/// way. If there are no suitable transformations for the input plan,
-/// the optimizer should simply return it unmodified.
+/// Transforms one [`LogicalPlan`] into another which computes the same results,
+/// but in a potentially more efficient way.
 ///
-/// To change the semantics of a `LogicalPlan`, see [`AnalyzerRule`]
+/// See notes on [`Self::rewrite`] for details on how to implement an `OptimizerRule`.
+///
+/// To change the semantics of a `LogicalPlan`, see [`AnalyzerRule`].
 ///
 /// Use [`SessionState::add_optimizer_rule`] to register additional
 /// `OptimizerRule`s.
@@ -87,8 +88,40 @@ pub trait OptimizerRule: Debug {
         true
     }
 
-    /// Try to rewrite `plan` to an optimized form, returning `Transformed::yes`
-    /// if the plan was rewritten and `Transformed::no` if it was not.
+    /// Try to rewrite `plan` to an optimized form, returning [`Transformed::yes`]
+    /// if the plan was rewritten and [`Transformed::no`] if it was not.
+    ///
+    /// # Notes for implementations:
+    ///
+    /// ## Return the same plan if no changes were made
+    ///
+    /// If there are no suitable transformations for the input plan,
+    /// the optimizer should simply return it unmodified.
+    ///
+    /// The optimizer will call `rewrite` several times until a fixed point is
+    /// reached, so it is important that `rewrite` return [`Transformed::no`] if
+    /// the output is the same.
+    ///
+    /// ## Matching on functions
+    ///
+    /// The rule should avoid function-specific transformations, and instead use
+    /// methods on [`ScalarUDFImpl`] and [`AggregateUDFImpl`]. Specifically, the
+    /// rule should not check function names as functions can be overridden, and
+    /// may not have the same semantics as the functions provided with
+    /// DataFusion.
+    ///
+    /// For example, if a rule rewrites a function based on the check
+    /// `func.name() == "sum"`, it may rewrite the plan incorrectly if the
+    /// registered `sum` function has different semantics (for example, the
+    /// `sum` function from the `datafusion-spark` crate).
+    ///
+    /// There are still several cases that rely on function name checking in
+    /// the rules included with DataFusion. Please see [#18643] for more details
+    /// and to help remove these cases.
+    ///
+    /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl
+    /// [`AggregateUDFImpl`]: datafusion_expr::ScalarUDFImpl
+    /// [#18643]: https://github.com/apache/datafusion/issues/18643
     fn rewrite(
         &self,
         _plan: LogicalPlan,
@@ -101,13 +134,14 @@ pub trait OptimizerRule: Debug {
 /// Options to control the DataFusion Optimizer.
 pub trait OptimizerConfig {
     /// Return the time at which the query execution started. This
-    /// time is used as the value for now()
-    fn query_execution_start_time(&self) -> DateTime<Utc>;
+    /// time is used as the value for `now()`. If `None`, time-dependent
+    /// functions like `now()` will not be simplified during optimization.
+    fn query_execution_start_time(&self) -> Option<DateTime<Utc>>;
 
     /// Return alias generator used to generate unique aliases for subqueries
     fn alias_generator(&self) -> &Arc<AliasGenerator>;
 
-    fn options(&self) -> &ConfigOptions;
+    fn options(&self) -> Arc<ConfigOptions>;
 
     fn function_registry(&self) -> Option<&dyn FunctionRegistry> {
         None
@@ -119,13 +153,14 @@ pub trait OptimizerConfig {
 #[derive(Debug)]
 pub struct OptimizerContext {
     /// Query execution start time that can be used to rewrite
-    /// expressions such as `now()` to use a literal value instead
-    query_execution_start_time: DateTime<Utc>,
+    /// expressions such as `now()` to use a literal value instead.
+    /// If `None`, time-dependent functions will not be simplified.
+    query_execution_start_time: Option<DateTime<Utc>>,
 
     /// Alias generator used to generate unique aliases for subqueries
     alias_generator: Arc<AliasGenerator>,
 
-    options: ConfigOptions,
+    options: Arc<ConfigOptions>,
 }
 
 impl OptimizerContext {
@@ -134,8 +169,13 @@ impl OptimizerContext {
         let mut options = ConfigOptions::default();
         options.optimizer.filter_null_join_keys = true;
 
+        Self::new_with_config_options(Arc::new(options))
+    }
+
+    /// Create a optimizer config with provided [ConfigOptions].
+    pub fn new_with_config_options(options: Arc<ConfigOptions>) -> Self {
         Self {
-            query_execution_start_time: Utc::now(),
+            query_execution_start_time: Some(Utc::now()),
             alias_generator: Arc::new(AliasGenerator::new()),
             options,
         }
@@ -143,30 +183,38 @@ impl OptimizerContext {
 
     /// Specify whether to enable the filter_null_keys rule
     pub fn filter_null_keys(mut self, filter_null_keys: bool) -> Self {
-        self.options.optimizer.filter_null_join_keys = filter_null_keys;
+        Arc::make_mut(&mut self.options)
+            .optimizer
+            .filter_null_join_keys = filter_null_keys;
         self
     }
 
-    /// Specify whether the optimizer should skip rules that produce
-    /// errors, or fail the query
+    /// Set the query execution start time
     pub fn with_query_execution_start_time(
         mut self,
-        query_execution_tart_time: DateTime<Utc>,
+        query_execution_start_time: DateTime<Utc>,
     ) -> Self {
-        self.query_execution_start_time = query_execution_tart_time;
+        self.query_execution_start_time = Some(query_execution_start_time);
+        self
+    }
+
+    /// Clear the query execution start time. When `None`, time-dependent
+    /// functions like `now()` will not be simplified during optimization.
+    pub fn without_query_execution_start_time(mut self) -> Self {
+        self.query_execution_start_time = None;
         self
     }
 
     /// Specify whether the optimizer should skip rules that produce
     /// errors, or fail the query
     pub fn with_skip_failing_rules(mut self, b: bool) -> Self {
-        self.options.optimizer.skip_failed_rules = b;
+        Arc::make_mut(&mut self.options).optimizer.skip_failed_rules = b;
         self
     }
 
     /// Specify how many times to attempt to optimize the plan
     pub fn with_max_passes(mut self, v: u8) -> Self {
-        self.options.optimizer.max_passes = v as usize;
+        Arc::make_mut(&mut self.options).optimizer.max_passes = v as usize;
         self
     }
 }
@@ -179,7 +227,7 @@ impl Default for OptimizerContext {
 }
 
 impl OptimizerConfig for OptimizerContext {
-    fn query_execution_start_time(&self) -> DateTime<Utc> {
+    fn query_execution_start_time(&self) -> Option<DateTime<Utc>> {
         self.query_execution_start_time
     }
 
@@ -187,8 +235,8 @@ impl OptimizerConfig for OptimizerContext {
         &self.alias_generator
     }
 
-    fn options(&self) -> &ConfigOptions {
-        &self.options
+    fn options(&self) -> Arc<ConfigOptions> {
+        Arc::clone(&self.options)
     }
 }
 
@@ -220,8 +268,18 @@ impl Default for Optimizer {
 impl Optimizer {
     /// Create a new optimizer using the recommended list of rules
     pub fn new() -> Self {
+        // NOTEs:
+        // - The order of rules in this list is important, as it determines the
+        //   order in which they are applied.
+        // - Adding a new rule here is expensive as it will be applied to all
+        //   queries, and will likely increase the optimization time. Please extend
+        //   existing rules when possible, rather than adding a new rule.
+        //   If you do add a new rule considering having aggressive no-op paths
+        //   (e.g. if the plan doesn't contain any of the nodes you are looking for
+        //    return `Transformed::no`; only works if you control the traversal).
         let rules: Vec<Arc<dyn OptimizerRule + Sync + Send>> = vec![
-            Arc::new(EliminateNestedUnion::new()),
+            Arc::new(RewriteSetComparison::new()),
+            Arc::new(OptimizeUnions::new()),
             Arc::new(SimplifyExpressions::new()),
             Arc::new(ReplaceDistinctWithAggregate::new()),
             Arc::new(EliminateJoin::new()),
@@ -234,8 +292,6 @@ impl Optimizer {
             Arc::new(EliminateCrossJoin::new()),
             Arc::new(EliminateLimit::new()),
             Arc::new(PropagateEmptyRelation::new()),
-            // Must be after PropagateEmptyRelation
-            Arc::new(EliminateOneUnion::new()),
             Arc::new(FilterNullJoinKeys::default()),
             Arc::new(EliminateOuterJoin::new()),
             // Filters can't be pushed down past Limits, we should do PushDownFilter after PushDownLimit
@@ -246,6 +302,8 @@ impl Optimizer {
             // that might benefit from the following rules
             Arc::new(EliminateGroupByConstant::new()),
             Arc::new(CommonSubexprEliminate::new()),
+            Arc::new(ExtractLeafExpressions::new()),
+            Arc::new(PushDownLeafProjections::new()),
             Arc::new(OptimizeProjections::new()),
         ];
 
@@ -284,9 +342,7 @@ impl TreeNodeRewriter for Rewriter<'_> {
 
     fn f_down(&mut self, node: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
         if self.apply_order == ApplyOrder::TopDown {
-            {
-                self.rule.rewrite(node, self.config)
-            }
+            self.rule.rewrite(node, self.config)
         } else {
             Ok(Transformed::no(node))
         }
@@ -294,9 +350,7 @@ impl TreeNodeRewriter for Rewriter<'_> {
 
     fn f_up(&mut self, node: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
         if self.apply_order == ApplyOrder::BottomUp {
-            {
-                self.rule.rewrite(node, self.config)
-            }
+            self.rule.rewrite(node, self.config)
         } else {
             Ok(Transformed::no(node))
         }
@@ -460,10 +514,10 @@ mod tests {
 
     use datafusion_common::tree_node::Transformed;
     use datafusion_common::{
-        assert_contains, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result,
+        DFSchema, DFSchemaRef, DataFusionError, Result, assert_contains, plan_err,
     };
     use datafusion_expr::logical_plan::EmptyRelation;
-    use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection};
+    use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, Projection, col, lit};
 
     use crate::optimizer::Optimizer;
     use crate::test::test_table_scan;
diff --git a/datafusion/optimizer/src/plan_signature.rs b/datafusion/optimizer/src/plan_signature.rs
index 73e6b418272a9..6f46d7b663342 100644
--- a/datafusion/optimizer/src/plan_signature.rs
+++ b/datafusion/optimizer/src/plan_signature.rs
@@ -89,7 +89,7 @@ mod tests {
     use std::sync::Arc;
 
     use datafusion_common::{DFSchema, Result};
-    use datafusion_expr::{lit, LogicalPlan};
+    use datafusion_expr::{LogicalPlan, lit};
 
     use crate::plan_signature::get_node_number;
 
diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs
index 4fb9e117e2afc..b59295b1d717a 100644
--- a/datafusion/optimizer/src/propagate_empty_relation.rs
+++ b/datafusion/optimizer/src/propagate_empty_relation.rs
@@ -19,9 +19,9 @@
 
 use std::sync::Arc;
 
-use datafusion_common::tree_node::Transformed;
 use datafusion_common::JoinType;
-use datafusion_common::{plan_err, Result};
+use datafusion_common::tree_node::Transformed;
+use datafusion_common::{Result, plan_err};
 use datafusion_expr::logical_plan::LogicalPlan;
 use datafusion_expr::{EmptyRelation, Projection, Union};
 
@@ -33,7 +33,7 @@ use crate::{OptimizerConfig, OptimizerRule};
 pub struct PropagateEmptyRelation;
 
 impl PropagateEmptyRelation {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -140,10 +140,10 @@ impl OptimizerRule for PropagateEmptyRelation {
                 }
             }
             LogicalPlan::Aggregate(ref agg) => {
-                if !agg.group_expr.is_empty() {
-                    if let Some(empty_plan) = empty_child(&plan)? {
-                        return Ok(Transformed::yes(empty_plan));
-                    }
+                if !agg.group_expr.is_empty()
+                    && let Some(empty_plan) = empty_child(&plan)?
+                {
+                    return Ok(Transformed::yes(empty_plan));
                 }
                 Ok(Transformed::no(LogicalPlan::Aggregate(agg.clone())))
             }
@@ -239,17 +239,17 @@ mod tests {
     use datafusion_common::{Column, DFSchema, JoinType};
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::{
-        binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, Operator,
+        Operator, binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder,
     };
 
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::eliminate_filter::EliminateFilter;
-    use crate::eliminate_nested_union::EliminateNestedUnion;
+    use crate::optimize_unions::OptimizeUnions;
     use crate::test::{
         assert_optimized_plan_with_rules, test_table_scan, test_table_scan_fields,
         test_table_scan_with_name,
     };
-    use crate::OptimizerContext;
 
     use super::*;
 
@@ -277,7 +277,7 @@ mod tests {
         assert_optimized_plan_with_rules(
             vec![
                 Arc::new(EliminateFilter::new()),
-                Arc::new(EliminateNestedUnion::new()),
+                Arc::new(OptimizeUnions::new()),
                 Arc::new(PropagateEmptyRelation::new()),
             ],
             plan,
@@ -294,7 +294,7 @@ mod tests {
             .project(vec![binary_expr(lit(1), Operator::Plus, lit(1))])?
             .build()?;
 
-        assert_optimized_plan_equal!(plan, @"EmptyRelation")
+        assert_optimized_plan_equal!(plan, @"EmptyRelation: rows=0")
     }
 
     #[test]
@@ -316,7 +316,7 @@ mod tests {
             .filter(col("a").lt_eq(lit(1i64)))?
             .build()?;
 
-        let expected = "EmptyRelation";
+        let expected = "EmptyRelation: rows=0";
         assert_together_optimized_plan(plan, expected, true)
     }
 
@@ -379,7 +379,7 @@ mod tests {
             .union(four)?
             .build()?;
 
-        let expected = "EmptyRelation";
+        let expected = "EmptyRelation: rows=0";
         assert_together_optimized_plan(plan, expected, true)
     }
 
@@ -434,7 +434,7 @@ mod tests {
             .filter(col("a").lt_eq(lit(1i64)))?
             .build()?;
 
-        let expected = "EmptyRelation";
+        let expected = "EmptyRelation: rows=0";
         assert_together_optimized_plan(plan, expected, true)
     }
 
@@ -474,7 +474,7 @@ mod tests {
             )?
             .build()?;
 
-        let expected = "EmptyRelation";
+        let expected = "EmptyRelation: rows=0";
         assert_together_optimized_plan(plan, expected, eq)
     }
 
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index bbf0b0dd810e7..03a7a0b864177 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -20,14 +20,18 @@
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
+use arrow::datatypes::DataType;
 use indexmap::IndexSet;
 use itertools::Itertools;
+use log::{Level, debug, log_enabled};
 
+use datafusion_common::instant::Instant;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
 use datafusion_common::{
-    internal_err, plan_err, qualified_name, Column, DFSchema, Result,
+    Column, DFSchema, Result, assert_eq_or_internal_err, assert_or_internal_err,
+    internal_err, plan_err, qualified_name,
 };
 use datafusion_expr::expr::WindowFunction;
 use datafusion_expr::expr_rewriter::replace_col;
@@ -36,12 +40,14 @@ use datafusion_expr::utils::{
     conjunction, expr_to_columns, split_conjunction, split_conjunction_owned,
 };
 use datafusion_expr::{
-    and, or, BinaryExpr, Expr, Filter, Operator, Projection, TableProviderFilterPushDown,
+    BinaryExpr, Expr, Filter, Operator, Projection, TableProviderFilterPushDown, and, or,
 };
 
 use crate::optimizer::ApplyOrder;
+use crate::simplify_expressions::simplify_predicates;
 use crate::utils::{has_all_column_refs, is_restrict_null_predicate};
 use crate::{OptimizerConfig, OptimizerRule};
+use datafusion_expr::ExpressionPlacement;
 
 /// Optimizer rule for pushing (moving) filter expressions down in a plan so
 /// they are applied as early as possible.
@@ -168,30 +174,13 @@ pub(crate) fn lr_is_preserved(join_type: JoinType) -> (bool, bool) {
         JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => (true, false),
         // No columns from the left side of the join can be referenced in output
         // predicates for semi/anti joins, so whether we specify t/f doesn't matter.
-        JoinType::RightSemi | JoinType::RightAnti => (false, true),
+        JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => (false, true),
     }
 }
 
-/// For a given JOIN type, determine whether each input of the join is preserved
-/// for the join condition (`ON` clause filters).
-///
-/// It is only correct to push filters below a join for preserved inputs.
-///
-/// # Return Value
-/// A tuple of booleans - (left_preserved, right_preserved).
-///
-/// See [`lr_is_preserved`] for a definition of "preserved".
+/// See [`JoinType::on_lr_is_preserved`] for details.
 pub(crate) fn on_lr_is_preserved(join_type: JoinType) -> (bool, bool) {
-    match join_type {
-        JoinType::Inner => (true, true),
-        JoinType::Left => (false, true),
-        JoinType::Right => (true, false),
-        JoinType::Full => (false, false),
-        JoinType::LeftSemi | JoinType::RightSemi => (true, true),
-        JoinType::LeftAnti => (false, true),
-        JoinType::RightAnti => (true, false),
-        JoinType::LeftMark => (false, true),
-    }
+    join_type.on_lr_is_preserved()
 }
 
 /// Evaluates the columns referenced in the given expression to see if they refer
@@ -254,11 +243,12 @@ fn can_evaluate_as_join_condition(predicate: &Expr) -> Result<bool> {
     let mut is_evaluate = true;
     predicate.apply(|expr| match expr {
         Expr::Column(_)
-        | Expr::Literal(_)
+        | Expr::Literal(_, _)
         | Expr::Placeholder(_)
         | Expr::ScalarVariable(_, _) => Ok(TreeNodeRecursion::Jump),
         Expr::Exists { .. }
         | Expr::InSubquery(_)
+        | Expr::SetComparison(_)
         | Expr::ScalarSubquery(_)
         | Expr::OuterReferenceColumn(_, _)
         | Expr::Unnest(_) => {
@@ -450,11 +440,11 @@ fn push_down_all_join(
         }
     }
 
-    // For infer predicates, if they can not push through join, just drop them
+    // Push predicates inferred from the join expression
     for predicate in inferred_join_predicates {
-        if left_preserved && checker.is_left_only(&predicate) {
+        if checker.is_left_only(&predicate) {
             left_push.push(predicate);
-        } else if right_preserved && checker.is_right_only(&predicate) {
+        } else if checker.is_right_only(&predicate) {
             right_push.push(predicate);
         }
     }
@@ -537,8 +527,19 @@ fn push_down_join(
         .map_or_else(Vec::new, |filter| split_conjunction_owned(filter.clone()));
 
     // Are there any new join predicates that can be inferred from the filter expressions?
-    let inferred_join_predicates =
-        infer_join_predicates(&join, &predicates, &on_filters)?;
+    let inferred_join_predicates = with_debug_timing("infer_join_predicates", || {
+        infer_join_predicates(&join, &predicates, &on_filters)
+    })?;
+
+    if log_enabled!(Level::Debug) {
+        debug!(
+            "push_down_filter: join_type={:?}, parent_predicates={}, on_filters={}, inferred_join_predicates={}",
+            join.join_type,
+            predicates.len(),
+            on_filters.len(),
+            inferred_join_predicates.len()
+        );
+    }
 
     if on_filters.is_empty()
         && predicates.is_empty()
@@ -559,7 +560,6 @@ fn push_down_join(
 ///
 /// * `on_filters` filters from the join ON clause that have not already been
 ///   identified as join predicates
-///
 fn infer_join_predicates(
     join: &Join,
     predicates: &[Expr],
@@ -613,7 +613,7 @@ impl InferredPredicates {
     fn new(join_type: JoinType) -> Self {
         Self {
             predicates: vec![],
-            is_inner_join: matches!(join_type, JoinType::Inner),
+            is_inner_join: join_type == JoinType::Inner,
         }
     }
 
@@ -646,7 +646,6 @@ impl InferredPredicates {
 /// * `predicates` the pushed down predicates
 ///
 /// * `inferred_predicates` the inferred results
-///
 fn infer_join_predicates_from_predicates(
     join_col_keys: &[(&Column, &Column)],
     predicates: &[Expr],
@@ -670,7 +669,6 @@ fn infer_join_predicates_from_predicates(
 ///   identified as join predicates
 ///
 /// * `inferred_predicates` the inferred results
-///
 fn infer_join_predicates_from_on_filters(
     join_col_keys: &[(&Column, &Column)],
     join_type: JoinType,
@@ -691,7 +689,7 @@ fn infer_join_predicates_from_on_filters(
                 inferred_predicates,
             )
         }
-        JoinType::Right | JoinType::RightSemi => {
+        JoinType::Right | JoinType::RightSemi | JoinType::RightMark => {
             infer_join_predicates_impl::<false, true>(
                 join_col_keys,
                 on_filters,
@@ -716,7 +714,6 @@ fn infer_join_predicates_from_on_filters(
 ///
 /// * `ENABLE_RIGHT_TO_LEFT` indicates that the left table related predicate can
 ///   be inferred from the right table related predicate
-///
 fn infer_join_predicates_impl<
     const ENABLE_LEFT_TO_RIGHT: bool,
     const ENABLE_RIGHT_TO_LEFT: bool,
@@ -766,8 +763,9 @@ impl OptimizerRule for PushDownFilter {
     fn rewrite(
         &self,
         plan: LogicalPlan,
-        _config: &dyn OptimizerConfig,
+        config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>> {
+        let _ = config.options();
         if let LogicalPlan::Join(join) = plan {
             return push_down_join(join, None);
         };
@@ -778,6 +776,26 @@ impl OptimizerRule for PushDownFilter {
             return Ok(Transformed::no(plan));
         };
 
+        let predicate = split_conjunction_owned(filter.predicate.clone());
+        let old_predicate_len = predicate.len();
+        let new_predicates =
+            with_debug_timing("simplify_predicates", || simplify_predicates(predicate))?;
+        if log_enabled!(Level::Debug) {
+            debug!(
+                "push_down_filter: simplify_predicates old_count={}, new_count={}",
+                old_predicate_len,
+                new_predicates.len()
+            );
+        }
+        if old_predicate_len != new_predicates.len() {
+            let Some(new_predicate) = conjunction(new_predicates) else {
+                // new_predicates is empty - remove the filter entirely
+                // Return the child plan without the filter
+                return Ok(Transformed::yes(Arc::unwrap_or_clone(filter.input)));
+            };
+            filter.predicate = new_predicate;
+        }
+
         match Arc::unwrap_or_clone(filter.input) {
             LogicalPlan::Filter(child_filter) => {
                 let parents_predicates = split_conjunction_owned(filter.predicate);
@@ -799,8 +817,7 @@ impl OptimizerRule for PushDownFilter {
                     new_predicate,
                     child_filter.input,
                 )?);
-                #[allow(clippy::used_underscore_binding)]
-                self.rewrite(new_filter, _config)
+                self.rewrite(new_filter, config)
             }
             LogicalPlan::Repartition(repartition) => {
                 let new_filter =
@@ -861,14 +878,37 @@ impl OptimizerRule for PushDownFilter {
                 let predicates = split_conjunction_owned(filter.predicate.clone());
                 let mut non_unnest_predicates = vec![];
                 let mut unnest_predicates = vec![];
+                let mut unnest_struct_columns = vec![];
+
+                for idx in &unnest.struct_type_columns {
+                    let (sub_qualifier, field) =
+                        unnest.input.schema().qualified_field(*idx);
+                    let field_name = field.name().clone();
+
+                    if let DataType::Struct(children) = field.data_type() {
+                        for child in children {
+                            let child_name = child.name().clone();
+                            unnest_struct_columns.push(Column::new(
+                                sub_qualifier.cloned(),
+                                format!("{field_name}.{child_name}"),
+                            ));
+                        }
+                    }
+                }
+
                 for predicate in predicates {
                     // collect all the Expr::Column in predicate recursively
                     let mut accum: HashSet<Column> = HashSet::new();
                     expr_to_columns(&predicate, &mut accum)?;
 
-                    if unnest.list_type_columns.iter().any(|(_, unnest_list)| {
-                        accum.contains(&unnest_list.output_column)
-                    }) {
+                    let contains_list_columns =
+                        unnest.list_type_columns.iter().any(|(_, unnest_list)| {
+                            accum.contains(&unnest_list.output_column)
+                        });
+                    let contains_struct_columns =
+                        unnest_struct_columns.iter().any(|c| accum.contains(c));
+
+                    if contains_list_columns || contains_struct_columns {
                         unnest_predicates.push(predicate);
                     } else {
                         non_unnest_predicates.push(predicate);
@@ -940,8 +980,11 @@ impl OptimizerRule for PushDownFilter {
                 let group_expr_columns = agg
                     .group_expr
                     .iter()
-                    .map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string())))
-                    .collect::<Result<HashSet<_>>>()?;
+                    .map(|e| {
+                        let (relation, name) = e.qualified_name();
+                        Column::new(relation, name)
+                    })
+                    .collect::<HashSet<_>>();
 
                 let predicates = split_conjunction_owned(filter.predicate);
 
@@ -1009,7 +1052,10 @@ impl OptimizerRule for PushDownFilter {
                     func.params
                         .partition_by
                         .iter()
-                        .map(|c| Column::from_qualified_name(c.schema_name().to_string()))
+                        .map(|c| {
+                            let (relation, name) = c.qualified_name();
+                            Column::new(relation, name)
+                        })
                         .collect::<HashSet<_>>()
                 };
                 let potential_partition_keys = window
@@ -1095,12 +1141,13 @@ impl OptimizerRule for PushDownFilter {
                 let supported_filters = scan
                     .source
                     .supports_filters_pushdown(non_volatile_filters.as_slice())?;
-                if non_volatile_filters.len() != supported_filters.len() {
-                    return internal_err!(
-                        "Vec returned length: {} from supports_filters_pushdown is not the same size as the filters passed, which length is: {}",
-                        supported_filters.len(),
-                        non_volatile_filters.len());
-                }
+                assert_eq_or_internal_err!(
+                    non_volatile_filters.len(),
+                    supported_filters.len(),
+                    "Vec returned length: {} from supports_filters_pushdown is not the same size as the filters passed, which length is: {}",
+                    supported_filters.len(),
+                    non_volatile_filters.len()
+                );
 
                 // Compose scan filters from non-volatile filters of `Exact` or `Inexact` pushdown type
                 let zip = non_volatile_filters.into_iter().zip(supported_filters);
@@ -1252,10 +1299,13 @@ fn rewrite_projection(
     predicates: Vec<Expr>,
     mut projection: Projection,
 ) -> Result<(Transformed<LogicalPlan>, Option<Expr>)> {
-    // A projection is filter-commutable if it do not contain volatile predicates or contain volatile
-    // predicates that are not used in the filter. However, we should re-writes all predicate expressions.
-    // collect projection.
-    let (volatile_map, non_volatile_map): (HashMap<_, _>, HashMap<_, _>) = projection
+    // Partition projection expressions into non-pushable vs pushable.
+    // Non-pushable expressions are volatile (must not be duplicated) or
+    // MoveTowardsLeafNodes (cheap expressions like get_field where re-inlining
+    // into a filter causes optimizer instability — ExtractLeafExpressions will
+    // undo the push-down, creating an infinite loop that runs until the
+    // iteration limit is hit).
+    let (non_pushable_map, pushable_map): (HashMap<_, _>, HashMap<_, _>) = projection
         .schema
         .iter()
         .zip(projection.expr.iter())
@@ -1265,12 +1315,15 @@ fn rewrite_projection(
 
             (qualified_name(qualifier, field.name()), expr)
         })
-        .partition(|(_, value)| value.is_volatile());
+        .partition(|(_, value)| {
+            value.is_volatile()
+                || value.placement() == ExpressionPlacement::MoveTowardsLeafNodes
+        });
 
     let mut push_predicates = vec![];
     let mut keep_predicates = vec![];
     for expr in predicates {
-        if contain(&expr, &volatile_map) {
+        if contain(&expr, &non_pushable_map) {
             keep_predicates.push(expr);
         } else {
             push_predicates.push(expr);
@@ -1282,7 +1335,7 @@ fn rewrite_projection(
             // re-write all filters based on this projection
             // E.g. in `Filter: b\n  Projection: a > 1 as b`, we can swap them, but the filter must be "a > 1"
             let new_filter = LogicalPlan::Filter(Filter::try_new(
-                replace_cols_by_name(expr, &non_volatile_map)?,
+                replace_cols_by_name(expr, &pushable_map)?,
                 std::mem::take(&mut projection.input),
             )?);
 
@@ -1293,7 +1346,10 @@ fn rewrite_projection(
                 conjunction(keep_predicates),
             ))
         }
-        None => Ok((Transformed::no(LogicalPlan::Projection(projection)), None)),
+        None => Ok((
+            Transformed::no(LogicalPlan::Projection(projection)),
+            conjunction(keep_predicates),
+        )),
     }
 }
 
@@ -1330,20 +1386,34 @@ fn insert_below(
     })?;
 
     // make sure we did the actual replacement
-    if new_child.is_some() {
-        return internal_err!("node had no  inputs");
-    }
+    assert_or_internal_err!(new_child.is_none(), "node had no inputs");
 
     Ok(transformed_plan)
 }
 
 impl PushDownFilter {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
 }
 
+fn with_debug_timing<T, F>(label: &'static str, f: F) -> Result<T>
+where
+    F: FnOnce() -> Result<T>,
+{
+    if !log_enabled!(Level::Debug) {
+        return f();
+    }
+    let start = Instant::now();
+    let result = f();
+    debug!(
+        "push_down_filter_timing: section={label}, elapsed_us={}",
+        start.elapsed().as_micros()
+    );
+    result
+}
+
 /// replaces columns by its name on the projection.
 pub fn replace_cols_by_name(
     e: Expr,
@@ -1395,17 +1465,18 @@ mod tests {
     use datafusion_expr::expr::{ScalarFunction, WindowFunction};
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::{
-        col, in_list, in_subquery, lit, ColumnarValue, ExprFunctionExt, Extension,
-        LogicalPlanBuilder, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
-        TableSource, TableType, UserDefinedLogicalNodeCore, Volatility,
-        WindowFunctionDefinition,
+        ColumnarValue, ExprFunctionExt, Extension, LogicalPlanBuilder,
+        ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, TableSource, TableType,
+        UserDefinedLogicalNodeCore, Volatility, WindowFunctionDefinition, col, in_list,
+        in_subquery, lit,
     };
 
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::optimizer::Optimizer;
     use crate::simplify_expressions::SimplifyExpressions;
+    use crate::test::udfs::leaf_udf_expr;
     use crate::test::*;
-    use crate::OptimizerContext;
     use datafusion_expr::test::function_stub::sum;
     use insta::assert_snapshot;
 
@@ -1529,6 +1600,30 @@ mod tests {
         )
     }
 
+    /// verifies that filters with unusual column names are pushed down through aggregate operators
+    #[test]
+    fn filter_move_agg_special() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("$a", DataType::UInt32, false),
+            Field::new("$b", DataType::UInt32, false),
+            Field::new("$c", DataType::UInt32, false),
+        ]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("$a")], vec![sum(col("$b")).alias("total_salary")])?
+            .filter(col("$a").gt(lit(10i64)))?
+            .build()?;
+        // filter of key aggregation is commutative
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Aggregate: groupBy=[[test.$a]], aggr=[[sum(test.$b) AS total_salary]]
+          TableScan: test, full_filters=[test.$a > Int64(10)]
+        "
+        )
+    }
+
     #[test]
     fn filter_complex_group_by() -> Result<()> {
         let table_scan = test_table_scan()?;
@@ -1584,7 +1679,7 @@ mod tests {
     fn filter_move_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window = Expr::WindowFunction(WindowFunction::new(
+        let window = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1609,13 +1704,48 @@ mod tests {
         )
     }
 
+    /// verifies that filters with unusual identifier names are pushed down through window functions
+    #[test]
+    fn filter_window_special_identifier() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("$a", DataType::UInt32, false),
+            Field::new("$b", DataType::UInt32, false),
+            Field::new("$c", DataType::UInt32, false),
+        ]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let window = Expr::from(WindowFunction::new(
+            WindowFunctionDefinition::WindowUDF(
+                datafusion_functions_window::rank::rank_udwf(),
+            ),
+            vec![],
+        ))
+        .partition_by(vec![col("$a"), col("$b")])
+        .order_by(vec![col("$c").sort(true, true)])
+        .build()
+        .unwrap();
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .window(vec![window])?
+            .filter(col("$b").gt(lit(10i64)))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        WindowAggr: windowExpr=[[rank() PARTITION BY [test.$a, test.$b] ORDER BY [test.$c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: test, full_filters=[test.$b > Int64(10)]
+        "
+        )
+    }
+
     /// verifies that when partitioning by 'a' and 'b', and filtering by 'a' and 'b', both 'a' and
     /// 'b' are pushed
     #[test]
     fn filter_move_complex_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window = Expr::WindowFunction(WindowFunction::new(
+        let window = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1645,7 +1775,7 @@ mod tests {
     fn filter_move_partial_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window = Expr::WindowFunction(WindowFunction::new(
+        let window = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1677,7 +1807,7 @@ mod tests {
     fn filter_expression_keep_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window = Expr::WindowFunction(WindowFunction::new(
+        let window = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1710,7 +1840,7 @@ mod tests {
     fn filter_order_keep_window() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window = Expr::WindowFunction(WindowFunction::new(
+        let window = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1742,7 +1872,7 @@ mod tests {
     fn filter_multiple_windows_common_partitions() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window1 = Expr::WindowFunction(WindowFunction::new(
+        let window1 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1753,7 +1883,7 @@ mod tests {
         .build()
         .unwrap();
 
-        let window2 = Expr::WindowFunction(WindowFunction::new(
+        let window2 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1784,7 +1914,7 @@ mod tests {
     fn filter_multiple_windows_disjoint_partitions() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let window1 = Expr::WindowFunction(WindowFunction::new(
+        let window1 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1795,7 +1925,7 @@ mod tests {
         .build()
         .unwrap();
 
-        let window2 = Expr::WindowFunction(WindowFunction::new(
+        let window2 = Expr::from(WindowFunction::new(
             WindowFunctionDefinition::WindowUDF(
                 datafusion_functions_window::rank::rank_udwf(),
             ),
@@ -1928,7 +2058,10 @@ mod tests {
     // Manual implementation needed because of `schema` field. Comparison excludes this field.
     impl PartialOrd for NoopPlan {
         fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-            self.input.partial_cmp(&other.input)
+            self.input
+                .partial_cmp(&other.input)
+                // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+                .filter(|cmp| *cmp != Ordering::Equal || self == other)
         }
     }
 
@@ -2229,7 +2362,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.d
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.d, test1.e, test1.f
@@ -2259,7 +2392,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.a
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.a, test1.b, test1.c
@@ -2618,8 +2751,7 @@ mod tests {
         )
     }
 
-    /// post-left-join predicate on a column common to both sides is only pushed to the left side
-    /// i.e. - not duplicated to the right side
+    /// post-left-join predicate on a column common to both sides is pushed to both sides
     #[test]
     fn filter_using_left_join_on_common() -> Result<()> {
         let table_scan = test_table_scan()?;
@@ -2647,20 +2779,19 @@ mod tests {
               TableScan: test2
         ",
         );
-        // filter sent to left side of the join, not the right
+        // filter sent to left side of the join and to the right
         assert_optimized_plan_equal!(
             plan,
             @r"
         Left Join: Using test.a = test2.a
           TableScan: test, full_filters=[test.a <= Int64(1)]
           Projection: test2.a
-            TableScan: test2
+            TableScan: test2, full_filters=[test2.a <= Int64(1)]
         "
         )
     }
 
-    /// post-right-join predicate on a column common to both sides is only pushed to the right side
-    /// i.e. - not duplicated to the left side.
+    /// post-right-join predicate on a column common to both sides is pushed to both sides
     #[test]
     fn filter_using_right_join_on_common() -> Result<()> {
         let table_scan = test_table_scan()?;
@@ -2688,12 +2819,12 @@ mod tests {
               TableScan: test2
         ",
         );
-        // filter sent to right side of join, not duplicated to the left
+        // filter sent to right side of join, sent to the left as well
         assert_optimized_plan_equal!(
             plan,
             @r"
         Right Join: Using test.a = test2.a
-          TableScan: test
+          TableScan: test, full_filters=[test.a <= Int64(1)]
           Projection: test2.a
             TableScan: test2, full_filters=[test2.a <= Int64(1)]
         "
@@ -2875,7 +3006,7 @@ mod tests {
           Projection: test.a, test.b, test.c
             TableScan: test
           Projection: test2.a, test2.b, test2.c
-            TableScan: test2, full_filters=[test2.c > UInt32(4)]
+            TableScan: test2, full_filters=[test2.a > UInt32(1), test2.c > UInt32(4)]
         "
         )
     }
@@ -3013,9 +3144,7 @@ mod tests {
         let table_scan = LogicalPlan::TableScan(TableScan {
             table_name: "test".into(),
             filters,
-            projected_schema: Arc::new(DFSchema::try_from(
-                (*test_provider.schema()).clone(),
-            )?),
+            projected_schema: Arc::new(DFSchema::try_from(test_provider.schema())?),
             projection,
             source: Arc::new(test_provider),
             fetch: None,
@@ -3385,7 +3514,7 @@ mod tests {
               Projection: b.a
                 SubqueryAlias: b
                   Projection: Int64(0) AS a
-                    EmptyRelation
+                    EmptyRelation: rows=1
         ",
         );
         // Ensure that the predicate without any columns (0 = 1) is
@@ -3399,7 +3528,7 @@ mod tests {
               SubqueryAlias: b
                 Projection: Int64(0) AS a
                   Filter: Int64(0) = Int64(1)
-                    EmptyRelation
+                    EmptyRelation: rows=1
         "
         )
     }
@@ -3818,7 +3947,7 @@ mod tests {
         )
     }
 
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct TestScalarUDF {
         signature: Signature,
     }
@@ -4122,4 +4251,68 @@ mod tests {
         "
         )
     }
+
+    /// Test that filters are NOT pushed through MoveTowardsLeafNodes projections.
+    /// These are cheap expressions (like get_field) where re-inlining into a filter
+    /// has no benefit and causes optimizer instability — ExtractLeafExpressions will
+    /// undo the push-down, creating an infinite loop that runs until the iteration
+    /// limit is hit.
+    #[test]
+    fn filter_not_pushed_through_move_towards_leaves_projection() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        // Create a projection with a MoveTowardsLeafNodes expression
+        let proj = LogicalPlanBuilder::from(table_scan)
+            .project(vec![
+                leaf_udf_expr(col("a")).alias("val"),
+                col("b"),
+                col("c"),
+            ])?
+            .build()?;
+
+        // Put a filter on the MoveTowardsLeafNodes column
+        let plan = LogicalPlanBuilder::from(proj)
+            .filter(col("val").gt(lit(150i64)))?
+            .build()?;
+
+        // Filter should NOT be pushed through — val maps to a MoveTowardsLeafNodes expr
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Filter: val > Int64(150)
+          Projection: leaf_udf(test.a) AS val, test.b, test.c
+            TableScan: test
+        "
+        )
+    }
+
+    /// Test mixed predicates: Column predicate pushed, MoveTowardsLeafNodes kept.
+    #[test]
+    fn filter_mixed_predicates_partial_push() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        // Create a projection with both MoveTowardsLeafNodes and Column expressions
+        let proj = LogicalPlanBuilder::from(table_scan)
+            .project(vec![
+                leaf_udf_expr(col("a")).alias("val"),
+                col("b"),
+                col("c"),
+            ])?
+            .build()?;
+
+        // Filter with both: val > 150 (MoveTowardsLeafNodes) AND b > 5 (Column)
+        let plan = LogicalPlanBuilder::from(proj)
+            .filter(col("val").gt(lit(150i64)).and(col("b").gt(lit(5i64))))?
+            .build()?;
+
+        // val > 150 should be kept above, b > 5 should be pushed through
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Filter: val > Int64(150)
+          Projection: leaf_udf(test.a) AS val, test.b, test.c
+            TableScan: test, full_filters=[test.b > Int64(5)]
+        "
+        )
+    }
 }
diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs
index ec042dd350ca1..755e192e340d9 100644
--- a/datafusion/optimizer/src/push_down_limit.rs
+++ b/datafusion/optimizer/src/push_down_limit.rs
@@ -23,20 +23,19 @@ use std::sync::Arc;
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
+use datafusion_common::Result;
 use datafusion_common::tree_node::Transformed;
 use datafusion_common::utils::combine_limit;
-use datafusion_common::Result;
 use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan};
-use datafusion_expr::{lit, FetchType, SkipType};
+use datafusion_expr::{FetchType, SkipType, lit};
 
 /// Optimization rule that tries to push down `LIMIT`.
-///
 //. It will push down through projection, limits (taking the smaller limit)
 #[derive(Default, Debug)]
 pub struct PushDownLimit {}
 
 impl PushDownLimit {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -51,8 +50,9 @@ impl OptimizerRule for PushDownLimit {
     fn rewrite(
         &self,
         plan: LogicalPlan,
-        _config: &dyn OptimizerConfig,
+        config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>> {
+        let _ = config.options();
         let LogicalPlan::Limit(mut limit) = plan else {
             return Ok(Transformed::no(plan));
         };
@@ -82,8 +82,7 @@ impl OptimizerRule for PushDownLimit {
             });
 
             // recursively reapply the rule on the new plan
-            #[allow(clippy::used_underscore_binding)]
-            return self.rewrite(plan, _config);
+            return self.rewrite(plan, config);
         }
 
         // no fetch to push, so return the original plan
@@ -282,8 +281,8 @@ mod test {
     use crate::OptimizerContext;
     use datafusion_common::DFSchemaRef;
     use datafusion_expr::{
-        col, exists, logical_plan::builder::LogicalPlanBuilder, Expr, Extension,
-        UserDefinedLogicalNodeCore,
+        Expr, Extension, UserDefinedLogicalNodeCore, col, exists,
+        logical_plan::builder::LogicalPlanBuilder,
     };
     use datafusion_functions_aggregate::expr_fn::max;
 
@@ -312,7 +311,10 @@ mod test {
     // Manual implementation needed because of `schema` field. Comparison excludes this field.
     impl PartialOrd for NoopPlan {
         fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-            self.input.partial_cmp(&other.input)
+            self.input
+                .partial_cmp(&other.input)
+                // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+                .filter(|cmp| *cmp != Ordering::Equal || self == other)
         }
     }
 
@@ -365,7 +367,10 @@ mod test {
     // Manual implementation needed because of `schema` field. Comparison excludes this field.
     impl PartialOrd for NoLimitNoopPlan {
         fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-            self.input.partial_cmp(&other.input)
+            self.input
+                .partial_cmp(&other.input)
+                // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+                .filter(|cmp| *cmp != Ordering::Equal || self == other)
         }
     }
 
@@ -1039,7 +1044,7 @@ mod test {
             plan,
             @r"
         Limit: skip=0, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=1000
               TableScan: test, fetch=1000
             Limit: skip=0, fetch=1000
@@ -1062,7 +1067,7 @@ mod test {
             plan,
             @r"
         Limit: skip=1000, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=2000
               TableScan: test, fetch=2000
             Limit: skip=0, fetch=2000
diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs
index 2383787fa0e8a..f24ca1f119b61 100644
--- a/datafusion/optimizer/src/replace_distinct_aggregate.rs
+++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs
@@ -25,8 +25,8 @@ use datafusion_common::tree_node::Transformed;
 use datafusion_common::{Column, Result};
 use datafusion_expr::expr_rewriter::normalize_cols;
 use datafusion_expr::utils::expand_wildcard;
-use datafusion_expr::{col, ExprFunctionExt, LogicalPlanBuilder};
 use datafusion_expr::{Aggregate, Distinct, DistinctOn, Expr, LogicalPlan};
+use datafusion_expr::{ExprFunctionExt, Limit, LogicalPlanBuilder, col, lit};
 
 /// Optimizer that replaces logical [[Distinct]] with a logical [[Aggregate]]
 ///
@@ -54,11 +54,22 @@ use datafusion_expr::{Aggregate, Distinct, DistinctOn, Expr, LogicalPlan};
 /// )
 /// ORDER BY a DESC
 /// ```
+///
+/// In case there are no columns, the [[Distinct]] is replaced by a [[Limit]]
+///
+/// ```text
+/// SELECT DISTINCT * FROM empty_table
+/// ```
+///
+/// Into
+/// ```text
+/// SELECT * FROM empty_table LIMIT 1
+/// ```
 #[derive(Default, Debug)]
 pub struct ReplaceDistinctWithAggregate {}
 
 impl ReplaceDistinctWithAggregate {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -78,6 +89,16 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
             LogicalPlan::Distinct(Distinct::All(input)) => {
                 let group_expr = expand_wildcard(input.schema(), &input, None)?;
 
+                if group_expr.is_empty() {
+                    // Special case: there are no columns to group by, so we can't replace it by a group by
+                    // however, we can replace it by LIMIT 1 because there is either no output or a single empty row
+                    return Ok(Transformed::yes(LogicalPlan::Limit(Limit {
+                        skip: None,
+                        fetch: Some(Box::new(lit(1i64))),
+                        input,
+                    })));
+                }
+
                 let field_count = input.schema().fields().len();
                 for dep in input.schema().functional_dependencies().iter() {
                     // If distinct is exactly the same with a previous GROUP BY, we can
@@ -184,15 +205,17 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::replace_distinct_aggregate::ReplaceDistinctWithAggregate;
     use crate::test::*;
+    use arrow::datatypes::{Fields, Schema};
+    use std::sync::Arc;
 
     use crate::OptimizerContext;
     use datafusion_common::Result;
-    use datafusion_expr::{col, logical_plan::builder::LogicalPlanBuilder, Expr};
+    use datafusion_expr::{
+        Expr, col, logical_plan::builder::LogicalPlanBuilder, table_scan,
+    };
     use datafusion_functions_aggregate::sum::sum;
 
     macro_rules! assert_optimized_plan_equal {
@@ -274,4 +297,16 @@ mod tests {
               TableScan: test
         ")
     }
+
+    #[test]
+    fn use_limit_1_when_no_columns() -> Result<()> {
+        let plan = table_scan(Some("test"), &Schema::new(Fields::empty()), None)?
+            .distinct()?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Limit: skip=0, fetch=1
+          TableScan: test
+        ")
+    }
 }
diff --git a/datafusion/optimizer/src/rewrite_set_comparison.rs b/datafusion/optimizer/src/rewrite_set_comparison.rs
new file mode 100644
index 0000000000000..c8c35b518743a
--- /dev/null
+++ b/datafusion/optimizer/src/rewrite_set_comparison.rs
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Optimizer rule rewriting `SetComparison` subqueries (e.g. `= ANY`,
+//! `> ALL`) into boolean expressions built from `EXISTS` subqueries
+//! that capture SQL three-valued logic.
+
+use crate::{OptimizerConfig, OptimizerRule};
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_common::{Column, DFSchema, ExprSchema, Result, ScalarValue, plan_err};
+use datafusion_expr::expr::{self, Exists, SetComparison, SetQuantifier};
+use datafusion_expr::logical_plan::Subquery;
+use datafusion_expr::logical_plan::builder::LogicalPlanBuilder;
+use datafusion_expr::{Expr, LogicalPlan, lit};
+use std::sync::Arc;
+
+use datafusion_expr::utils::merge_schema;
+
+/// Rewrite `SetComparison` expressions to scalar subqueries that return the
+/// correct boolean value (including SQL NULL semantics). After this rule
+/// runs, later rules such as `ScalarSubqueryToJoin` can decorrelate and
+/// remove the remaining subquery.
+#[derive(Debug, Default)]
+pub struct RewriteSetComparison;
+
+impl RewriteSetComparison {
+    /// Create a new `RewriteSetComparison` optimizer rule.
+    pub fn new() -> Self {
+        Self
+    }
+
+    fn rewrite_plan(&self, plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
+        let schema = merge_schema(&plan.inputs());
+        plan.map_expressions(|expr| {
+            expr.transform_up(|expr| rewrite_set_comparison(expr, &schema))
+        })
+    }
+}
+
+impl OptimizerRule for RewriteSetComparison {
+    fn name(&self) -> &str {
+        "rewrite_set_comparison"
+    }
+
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
+        plan.transform_up_with_subqueries(|plan| self.rewrite_plan(plan))
+    }
+}
+
+fn rewrite_set_comparison(
+    expr: Expr,
+    outer_schema: &DFSchema,
+) -> Result<Transformed<Expr>> {
+    match expr {
+        Expr::SetComparison(set_comparison) => {
+            let rewritten = build_set_comparison_subquery(set_comparison, outer_schema)?;
+            Ok(Transformed::yes(rewritten))
+        }
+        _ => Ok(Transformed::no(expr)),
+    }
+}
+
+fn build_set_comparison_subquery(
+    set_comparison: SetComparison,
+    outer_schema: &DFSchema,
+) -> Result<Expr> {
+    let SetComparison {
+        expr,
+        subquery,
+        op,
+        quantifier,
+    } = set_comparison;
+
+    let left_expr = to_outer_reference(*expr, outer_schema)?;
+    let subquery_schema = subquery.subquery.schema();
+    if subquery_schema.fields().is_empty() {
+        return plan_err!("single expression required.");
+    }
+    // avoid `head_output_expr` for aggr/window plan, it will gives group-by expr if exists
+    let right_expr = Expr::Column(Column::from(subquery_schema.qualified_field(0)));
+
+    let comparison = Expr::BinaryExpr(expr::BinaryExpr::new(
+        Box::new(left_expr),
+        op,
+        Box::new(right_expr),
+    ));
+
+    let true_exists =
+        exists_subquery(&subquery, Expr::IsTrue(Box::new(comparison.clone())))?;
+    let null_exists =
+        exists_subquery(&subquery, Expr::IsNull(Box::new(comparison.clone())))?;
+
+    let result_expr = match quantifier {
+        SetQuantifier::Any => Expr::Case(expr::Case {
+            expr: None,
+            when_then_expr: vec![
+                (Box::new(true_exists), Box::new(lit(true))),
+                (
+                    Box::new(null_exists),
+                    Box::new(Expr::Literal(ScalarValue::Boolean(None), None)),
+                ),
+            ],
+            else_expr: Some(Box::new(lit(false))),
+        }),
+        SetQuantifier::All => {
+            let false_exists =
+                exists_subquery(&subquery, Expr::IsFalse(Box::new(comparison.clone())))?;
+            Expr::Case(expr::Case {
+                expr: None,
+                when_then_expr: vec![
+                    (Box::new(false_exists), Box::new(lit(false))),
+                    (
+                        Box::new(null_exists),
+                        Box::new(Expr::Literal(ScalarValue::Boolean(None), None)),
+                    ),
+                ],
+                else_expr: Some(Box::new(lit(true))),
+            })
+        }
+    };
+
+    Ok(result_expr)
+}
+
+fn exists_subquery(subquery: &Subquery, filter: Expr) -> Result<Expr> {
+    let plan = LogicalPlanBuilder::from(subquery.subquery.as_ref().clone())
+        .filter(filter)?
+        .build()?;
+    let outer_ref_columns = plan.all_out_ref_exprs();
+    Ok(Expr::Exists(Exists {
+        subquery: Subquery {
+            subquery: Arc::new(plan),
+            outer_ref_columns,
+            spans: subquery.spans.clone(),
+        },
+        negated: false,
+    }))
+}
+
+fn to_outer_reference(expr: Expr, outer_schema: &DFSchema) -> Result<Expr> {
+    expr.transform_up(|expr| match expr {
+        Expr::Column(col) => {
+            let field = outer_schema.field_from_column(&col)?;
+            Ok(Transformed::yes(Expr::OuterReferenceColumn(
+                Arc::clone(field),
+                col,
+            )))
+        }
+        Expr::OuterReferenceColumn(_, _) => Ok(Transformed::no(expr)),
+        _ => Ok(Transformed::no(expr)),
+    })
+    .map(|t| t.data)
+}
diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs
index 897e07cb987ed..975c234b38836 100644
--- a/datafusion/optimizer/src/scalar_subquery_to_join.rs
+++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs
@@ -30,18 +30,20 @@ use datafusion_common::alias::AliasGenerator;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
 };
-use datafusion_common::{internal_err, plan_err, Column, Result, ScalarValue};
+use datafusion_common::{Column, Result, ScalarValue, assert_or_internal_err, plan_err};
 use datafusion_expr::expr_rewriter::create_col_from_scalar_expr;
 use datafusion_expr::logical_plan::{JoinType, Subquery};
 use datafusion_expr::utils::conjunction;
-use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder};
+use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder, expr};
 
 /// Optimizer rule for rewriting subquery filters to joins
+/// and places additional projection on top of the filter, to preserve
+/// original schema.
 #[derive(Default, Debug)]
 pub struct ScalarSubqueryToJoin {}
 
 impl ScalarSubqueryToJoin {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self::default()
     }
@@ -92,9 +94,10 @@ impl OptimizerRule for ScalarSubqueryToJoin {
                     config.alias_generator(),
                 )?;
 
-                if subqueries.is_empty() {
-                    return internal_err!("Expected subqueries not found in filter");
-                }
+                assert_or_internal_err!(
+                    !subqueries.is_empty(),
+                    "Expected subqueries not found in filter"
+                );
 
                 // iterate through all subqueries in predicate, turning each into a left join
                 let mut cur_input = filter.input.as_ref().clone();
@@ -123,8 +126,13 @@ impl OptimizerRule for ScalarSubqueryToJoin {
                         return Ok(Transformed::no(LogicalPlan::Filter(filter)));
                     }
                 }
+
+                // Preserve original schema as new Join might have more fields than what Filter & parents expect.
+                let projection =
+                    filter.input.schema().columns().into_iter().map(Expr::from);
                 let new_plan = LogicalPlanBuilder::from(cur_input)
                     .filter(rewrite_expr)?
+                    .project(projection)?
                     .build()?;
                 Ok(Transformed::yes(new_plan))
             }
@@ -147,9 +155,10 @@ impl OptimizerRule for ScalarSubqueryToJoin {
                     all_subqueries.extend(subqueries);
                     expr_to_rewrite_expr_map.insert(expr, rewrite_exprs);
                 }
-                if all_subqueries.is_empty() {
-                    return internal_err!("Expected subqueries not found in projection");
-                }
+                assert_or_internal_err!(
+                    !all_subqueries.is_empty(),
+                    "Expected subqueries not found in projection"
+                );
                 // iterate through all subqueries in predicate, turning each into a left join
                 let mut cur_input = projection.input.as_ref().clone();
                 for (subquery, alias) in all_subqueries {
@@ -157,29 +166,25 @@ impl OptimizerRule for ScalarSubqueryToJoin {
                         build_join(&subquery, &cur_input, &alias)?
                     {
                         cur_input = optimized_subquery;
-                        if !expr_check_map.is_empty() {
-                            if let Some(expr) = subquery_to_expr_map.get(&subquery) {
-                                if let Some(rewrite_expr) =
-                                    expr_to_rewrite_expr_map.get(expr)
-                                {
-                                    let new_expr = rewrite_expr
-                                        .clone()
-                                        .transform_up(|expr| {
-                                            // replace column references with entry in map, if it exists
-                                            if let Some(map_expr) =
-                                                expr.try_as_col().and_then(|col| {
-                                                    expr_check_map.get(&col.name)
-                                                })
-                                            {
-                                                Ok(Transformed::yes(map_expr.clone()))
-                                            } else {
-                                                Ok(Transformed::no(expr))
-                                            }
-                                        })
-                                        .data()?;
-                                    expr_to_rewrite_expr_map.insert(expr, new_expr);
-                                }
-                            }
+                        if !expr_check_map.is_empty()
+                            && let Some(expr) = subquery_to_expr_map.get(&subquery)
+                            && let Some(rewrite_expr) = expr_to_rewrite_expr_map.get(expr)
+                        {
+                            let new_expr = rewrite_expr
+                                .clone()
+                                .transform_up(|expr| {
+                                    // replace column references with entry in map, if it exists
+                                    if let Some(map_expr) = expr
+                                        .try_as_col()
+                                        .and_then(|col| expr_check_map.get(&col.name))
+                                    {
+                                        Ok(Transformed::yes(map_expr.clone()))
+                                    } else {
+                                        Ok(Transformed::no(expr))
+                                    }
+                                })
+                                .data()?;
+                            expr_to_rewrite_expr_map.insert(expr, new_expr);
                         }
                     } else {
                         // if we can't handle all of the subqueries then bail for now
@@ -335,7 +340,7 @@ fn build_join(
                     .join_on(
                         sub_query_alias,
                         JoinType::Left,
-                        vec![Expr::Literal(ScalarValue::Boolean(Some(true)))],
+                        vec![Expr::Literal(ScalarValue::Boolean(Some(true)), None)],
                     )?
                     .build()?
             }
@@ -365,7 +370,7 @@ fn build_join(
                         ),
                         (
                             Box::new(Expr::Not(Box::new(filter.clone()))),
-                            Box::new(Expr::Literal(ScalarValue::Null)),
+                            Box::new(Expr::Literal(ScalarValue::Null, None)),
                         ),
                     ],
                     else_expr: Some(Box::new(Expr::Column(Column::new_unqualified(
@@ -408,7 +413,7 @@ mod tests {
     use datafusion_expr::test::function_stub::sum;
 
     use crate::assert_optimized_plan_eq_display_indent_snapshot;
-    use datafusion_expr::{col, lit, out_ref_col, scalar_subquery, Between};
+    use datafusion_expr::{Between, col, lit, out_ref_col, scalar_subquery};
     use datafusion_functions_aggregate::min_max::{max, min};
 
     macro_rules! assert_optimized_plan_equal {
@@ -452,18 +457,19 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: Int32(1) < __scalar_sq_1.max(orders.o_custkey) AND Int32(1) < __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: __scalar_sq_2.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              Left Join:  Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-                TableScan: customer [c_custkey:Int64, c_name:Utf8]
-                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: Int32(1) < __scalar_sq_1.max(orders.o_custkey) AND Int32(1) < __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: __scalar_sq_2.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                Left Join:  Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                  TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                  SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                      Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
                   Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
                     Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
                       TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-              SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -506,19 +512,21 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_acctbal < __scalar_sq_1.sum(orders.o_totalprice) [c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: sum(orders.o_totalprice), orders.o_custkey, __always_true [sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[sum(orders.o_totalprice)]] [o_custkey:Int64, __always_true:Boolean, sum(orders.o_totalprice):Float64;N]
-                    Filter: orders.o_totalprice < __scalar_sq_2.sum(lineitem.l_extendedprice) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N, __always_true:Boolean;N]
-                      Left Join:  Filter: __scalar_sq_2.l_orderkey = orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N, __always_true:Boolean;N]
-                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-                        SubqueryAlias: __scalar_sq_2 [sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64, __always_true:Boolean]
-                          Projection: sum(lineitem.l_extendedprice), lineitem.l_orderkey, __always_true [sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64, __always_true:Boolean]
-                            Aggregate: groupBy=[[lineitem.l_orderkey, Boolean(true) AS __always_true]], aggr=[[sum(lineitem.l_extendedprice)]] [l_orderkey:Int64, __always_true:Boolean, sum(lineitem.l_extendedprice):Float64;N]
-                              TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_acctbal < __scalar_sq_1.sum(orders.o_totalprice) [c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, sum(orders.o_totalprice):Float64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: sum(orders.o_totalprice), orders.o_custkey, __always_true [sum(orders.o_totalprice):Float64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[sum(orders.o_totalprice)]] [o_custkey:Int64, __always_true:Boolean, sum(orders.o_totalprice):Float64;N]
+                      Projection: orders.o_orderkey, orders.o_custkey, orders.o_orderstatus, orders.o_totalprice [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                        Filter: orders.o_totalprice < __scalar_sq_2.sum(lineitem.l_extendedprice) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N, __always_true:Boolean;N]
+                          Left Join:  Filter: __scalar_sq_2.l_orderkey = orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N, __always_true:Boolean;N]
+                            TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                            SubqueryAlias: __scalar_sq_2 [sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64, __always_true:Boolean]
+                              Projection: sum(lineitem.l_extendedprice), lineitem.l_orderkey, __always_true [sum(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64, __always_true:Boolean]
+                                Aggregate: groupBy=[[lineitem.l_orderkey, Boolean(true) AS __always_true]], aggr=[[sum(lineitem.l_extendedprice)]] [l_orderkey:Int64, __always_true:Boolean, sum(lineitem.l_extendedprice):Float64;N]
+                                  TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]
         "
         )
     }
@@ -547,14 +555,15 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                      Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -583,13 +592,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-            Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
-                Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
-                  Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+                  Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
+                    Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -614,14 +624,15 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-            Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
-                Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
-                  Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
-                    Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+                  Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
+                    Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
+                      Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -774,13 +785,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) + Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey) + Int32(1), orders.o_custkey, __always_true [max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) + Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey) + Int32(1), orders.o_custkey, __always_true [max(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -877,13 +889,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey >= __scalar_sq_1.max(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey >= __scalar_sq_1.max(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -914,13 +927,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -952,13 +966,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -983,13 +998,14 @@ mod tests {
             plan,
             @r"
         Projection: test.c [c:UInt32]
-          Filter: test.c < __scalar_sq_1.min(sq.c) [a:UInt32, b:UInt32, c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
-            Left Join:  Filter: test.a = __scalar_sq_1.a [a:UInt32, b:UInt32, c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
-              TableScan: test [a:UInt32, b:UInt32, c:UInt32]
-              SubqueryAlias: __scalar_sq_1 [min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
-                Projection: min(sq.c), sq.a, __always_true [min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
-                  Aggregate: groupBy=[[sq.a, Boolean(true) AS __always_true]], aggr=[[min(sq.c)]] [a:UInt32, __always_true:Boolean, min(sq.c):UInt32;N]
-                    TableScan: sq [a:UInt32, b:UInt32, c:UInt32]
+          Projection: test.a, test.b, test.c [a:UInt32, b:UInt32, c:UInt32]
+            Filter: test.c < __scalar_sq_1.min(sq.c) [a:UInt32, b:UInt32, c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
+              Left Join:  Filter: test.a = __scalar_sq_1.a [a:UInt32, b:UInt32, c:UInt32, min(sq.c):UInt32;N, a:UInt32;N, __always_true:Boolean;N]
+                TableScan: test [a:UInt32, b:UInt32, c:UInt32]
+                SubqueryAlias: __scalar_sq_1 [min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
+                  Projection: min(sq.c), sq.a, __always_true [min(sq.c):UInt32;N, a:UInt32, __always_true:Boolean]
+                    Aggregate: groupBy=[[sq.a, Boolean(true) AS __always_true]], aggr=[[min(sq.c)]] [a:UInt32, __always_true:Boolean, min(sq.c):UInt32;N]
+                      TableScan: sq [a:UInt32, b:UInt32, c:UInt32]
         "
         )
     }
@@ -1013,13 +1029,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey < __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-            Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
-                Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
-                  Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey < __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+                  Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
+                    Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -1042,13 +1059,14 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-            Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
-              TableScan: customer [c_custkey:Int64, c_name:Utf8]
-              SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
-                Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
-                  Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey = __scalar_sq_1.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, max(orders.o_custkey):Int64;N]
+                TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                SubqueryAlias: __scalar_sq_1 [max(orders.o_custkey):Int64;N]
+                  Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
+                    Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
+                      TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -1092,18 +1110,19 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey BETWEEN __scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-            Left Join:  Filter: customer.c_custkey = __scalar_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-              Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
-                TableScan: customer [c_custkey:Int64, c_name:Utf8]
-                SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Projection: min(orders.o_custkey), orders.o_custkey, __always_true [min(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[min(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, min(orders.o_custkey):Int64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey BETWEEN __scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+              Left Join:  Filter: customer.c_custkey = __scalar_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N, max(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                Left Join:  Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, o_custkey:Int64;N, __always_true:Boolean;N]
+                  TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                  SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Projection: min(orders.o_custkey), orders.o_custkey, __always_true [min(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                      Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[min(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, min(orders.o_custkey):Int64;N]
+                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                  Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
+                    Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
                       TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-              SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                Projection: max(orders.o_custkey), orders.o_custkey, __always_true [max(orders.o_custkey):Int64;N, o_custkey:Int64, __always_true:Boolean]
-                  Aggregate: groupBy=[[orders.o_custkey, Boolean(true) AS __always_true]], aggr=[[max(orders.o_custkey)]] [o_custkey:Int64, __always_true:Boolean, max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
@@ -1139,18 +1158,19 @@ mod tests {
             plan,
             @r"
         Projection: customer.c_custkey [c_custkey:Int64]
-          Filter: customer.c_custkey BETWEEN __scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
-            Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
-              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N]
-                TableScan: customer [c_custkey:Int64, c_name:Utf8]
-                SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N]
-                  Projection: min(orders.o_custkey) [min(orders.o_custkey):Int64;N]
-                    Aggregate: groupBy=[[]], aggr=[[min(orders.o_custkey)]] [min(orders.o_custkey):Int64;N]
+          Projection: customer.c_custkey, customer.c_name [c_custkey:Int64, c_name:Utf8]
+            Filter: customer.c_custkey BETWEEN __scalar_sq_1.min(orders.o_custkey) AND __scalar_sq_2.max(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
+              Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N, max(orders.o_custkey):Int64;N]
+                Left Join:  Filter: Boolean(true) [c_custkey:Int64, c_name:Utf8, min(orders.o_custkey):Int64;N]
+                  TableScan: customer [c_custkey:Int64, c_name:Utf8]
+                  SubqueryAlias: __scalar_sq_1 [min(orders.o_custkey):Int64;N]
+                    Projection: min(orders.o_custkey) [min(orders.o_custkey):Int64;N]
+                      Aggregate: groupBy=[[]], aggr=[[min(orders.o_custkey)]] [min(orders.o_custkey):Int64;N]
+                        TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
+                SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N]
+                  Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
+                    Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
                       TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
-              SubqueryAlias: __scalar_sq_2 [max(orders.o_custkey):Int64;N]
-                Projection: max(orders.o_custkey) [max(orders.o_custkey):Int64;N]
-                  Aggregate: groupBy=[[]], aggr=[[max(orders.o_custkey)]] [max(orders.o_custkey):Int64;N]
-                    TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]
         "
         )
     }
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 4e4e3d316c268..28fcdf1dede0b 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -17,47 +17,54 @@
 
 //! Expression simplification API
 
-use std::borrow::Cow;
-use std::collections::HashSet;
-use std::ops::Not;
-
 use arrow::{
-    array::{new_null_array, AsArray},
+    array::{Array, AsArray, new_null_array},
     datatypes::{DataType, Field, Schema},
     record_batch::RecordBatch,
 };
+use std::borrow::Cow;
+use std::collections::HashSet;
+use std::ops::Not;
+use std::sync::Arc;
 
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::nested_struct::has_one_of_more_common_fields;
 use datafusion_common::{
+    DFSchema, DataFusionError, Result, ScalarValue, exec_datafusion_err, internal_err,
+};
+use datafusion_common::{
+    HashMap,
     cast::{as_large_list_array, as_list_array},
+    metadata::FieldMetadata,
     tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter},
 };
-use datafusion_common::{internal_err, DFSchema, DataFusionError, Result, ScalarValue};
 use datafusion_expr::{
-    and, binary::BinaryTypeCoercer, lit, or, BinaryExpr, Case, ColumnarValue, Expr, Like,
-    Operator, Volatility, WindowFunctionDefinition,
+    BinaryExpr, Case, ColumnarValue, Expr, ExprSchemable, Like, Operator, Volatility,
+    and, binary::BinaryTypeCoercer, lit, or, preimage::PreimageResult,
 };
+use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult};
 use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval};
 use datafusion_expr::{
-    expr::{InList, InSubquery, WindowFunction},
+    expr::{InList, InSubquery},
     utils::{iter_conjunction, iter_conjunction_owned},
 };
-use datafusion_expr::{simplify::ExprSimplifyResult, Cast, TryCast};
 use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps};
 
 use super::inlist_simplifier::ShortenInListSimplifier;
 use super::utils::*;
-use crate::simplify_expressions::guarantees::GuaranteeRewriter;
+use crate::simplify_expressions::SimplifyContext;
 use crate::simplify_expressions::regex::simplify_regex_expr;
 use crate::simplify_expressions::unwrap_cast::{
     is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary,
     is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist,
     unwrap_cast_in_comparison_for_binary,
 };
-use crate::simplify_expressions::SimplifyInfo;
 use crate::{
     analyzer::type_coercion::TypeCoercionRewriter,
-    simplify_expressions::unwrap_cast::try_cast_literal_to_type,
+    simplify_expressions::udf_preimage::rewrite_with_preimage,
 };
+use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map;
+use datafusion_expr_common::casts::try_cast_literal_to_type;
 use indexmap::IndexSet;
 use regex::Regex;
 
@@ -68,23 +75,19 @@ use regex::Regex;
 ///
 /// For example:
 /// ```
-/// use arrow::datatypes::{Schema, Field, DataType};
-/// use datafusion_expr::{col, lit};
+/// use arrow::datatypes::{DataType, Field, Schema};
 /// use datafusion_common::{DataFusionError, ToDFSchema};
-/// use datafusion_expr::execution_props::ExecutionProps;
 /// use datafusion_expr::simplify::SimplifyContext;
+/// use datafusion_expr::{col, lit};
 /// use datafusion_optimizer::simplify_expressions::ExprSimplifier;
 ///
 /// // Create the schema
-/// let schema = Schema::new(vec![
-///     Field::new("i", DataType::Int64, false),
-///   ])
-///   .to_dfschema_ref().unwrap();
+/// let schema = Schema::new(vec![Field::new("i", DataType::Int64, false)])
+///     .to_dfschema_ref()
+///     .unwrap();
 ///
 /// // Create the simplifier
-/// let props = ExecutionProps::new();
-/// let context = SimplifyContext::new(&props)
-///    .with_schema(schema);
+/// let context = SimplifyContext::default().with_schema(schema);
 /// let simplifier = ExprSimplifier::new(context);
 ///
 /// // Use the simplifier
@@ -96,8 +99,8 @@ use regex::Regex;
 /// let simplified = simplifier.simplify(expr).unwrap();
 /// assert_eq!(simplified, col("b").lt(lit(2)));
 /// ```
-pub struct ExprSimplifier<S> {
-    info: S,
+pub struct ExprSimplifier {
+    info: SimplifyContext,
     /// Guarantees about the values of columns. This is provided by the user
     /// in [ExprSimplifier::with_guarantees()].
     guarantees: Vec<(Expr, NullableInterval)>,
@@ -111,13 +114,12 @@ pub struct ExprSimplifier<S> {
 pub const THRESHOLD_INLINE_INLIST: usize = 3;
 pub const DEFAULT_MAX_SIMPLIFIER_CYCLES: u32 = 3;
 
-impl<S: SimplifyInfo> ExprSimplifier<S> {
-    /// Create a new `ExprSimplifier` with the given `info` such as an
-    /// instance of [`SimplifyContext`]. See
-    /// [`simplify`](Self::simplify) for an example.
+impl ExprSimplifier {
+    /// Create a new `ExprSimplifier` with the given [`SimplifyContext`].
+    /// See [`simplify`](Self::simplify) for an example.
     ///
     /// [`SimplifyContext`]: datafusion_expr::simplify::SimplifyContext
-    pub fn new(info: S) -> Self {
+    pub fn new(info: SimplifyContext) -> Self {
         Self {
             info,
             guarantees: vec![],
@@ -142,40 +144,21 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// `b > 2`
     ///
     /// ```
-    /// use arrow::datatypes::DataType;
-    /// use datafusion_expr::{col, lit, Expr};
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_common::{DFSchema, ToDFSchema};
     /// use datafusion_common::Result;
-    /// use datafusion_expr::execution_props::ExecutionProps;
     /// use datafusion_expr::simplify::SimplifyContext;
-    /// use datafusion_expr::simplify::SimplifyInfo;
+    /// use datafusion_expr::{col, lit, Expr};
     /// use datafusion_optimizer::simplify_expressions::ExprSimplifier;
-    /// use datafusion_common::DFSchema;
     /// use std::sync::Arc;
     ///
-    /// /// Simple implementation that provides `Simplifier` the information it needs
-    /// /// See SimplifyContext for a structure that does this.
-    /// #[derive(Default)]
-    /// struct Info {
-    ///   execution_props: ExecutionProps,
-    /// };
-    ///
-    /// impl SimplifyInfo for Info {
-    ///   fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
-    ///     Ok(false)
-    ///   }
-    ///   fn nullable(&self, expr: &Expr) -> Result<bool> {
-    ///     Ok(true)
-    ///   }
-    ///   fn execution_props(&self) -> &ExecutionProps {
-    ///     &self.execution_props
-    ///   }
-    ///   fn get_data_type(&self, expr: &Expr) -> Result<DataType> {
-    ///     Ok(DataType::Int32)
-    ///   }
-    /// }
-    ///
+    /// // Create a schema and SimplifyContext
+    /// let schema = Schema::new(vec![Field::new("b", DataType::Int32, true)])
+    ///     .to_dfschema_ref()
+    ///     .unwrap();
     /// // Create the simplifier
-    /// let simplifier = ExprSimplifier::new(Info::default());
+    /// let context = SimplifyContext::default().with_schema(schema);
+    /// let simplifier = ExprSimplifier::new(context);
     ///
     /// // b < 2
     /// let b_lt_2 = col("b").gt(lit(2));
@@ -197,12 +180,11 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// optimizations.
     ///
     /// See [Self::simplify] for details and usage examples.
-    ///
     #[deprecated(
         since = "48.0.0",
         note = "Use `simplify_with_cycle_count_transformed` instead"
     )]
-    #[allow(unused_mut)]
+    #[expect(unused_mut)]
     pub fn simplify_with_cycle_count(&self, mut expr: Expr) -> Result<(Expr, u32)> {
         let (transformed, cycle_count) =
             self.simplify_with_cycle_count_transformed(expr)?;
@@ -221,15 +203,16 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// - The number of simplification cycles that were performed
     ///
     /// See [Self::simplify] for details and usage examples.
-    ///
     pub fn simplify_with_cycle_count_transformed(
         &self,
         mut expr: Expr,
     ) -> Result<(Transformed<Expr>, u32)> {
         let mut simplifier = Simplifier::new(&self.info);
-        let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?;
+        let config_options = Some(Arc::clone(self.info.config_options()));
+        let mut const_evaluator = ConstEvaluator::try_new(config_options)?;
         let mut shorten_in_list_simplifier = ShortenInListSimplifier::new();
-        let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees);
+        let guarantees_map: HashMap<&Expr, &NullableInterval> =
+            self.guarantees.iter().map(|(k, v)| (k, v)).collect();
 
         if self.canonicalize {
             expr = expr.rewrite(&mut Canonicalizer::new()).data()?
@@ -246,7 +229,9 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
             } = expr
                 .rewrite(&mut const_evaluator)?
                 .transform_data(|expr| expr.rewrite(&mut simplifier))?
-                .transform_data(|expr| expr.rewrite(&mut guarantee_rewriter))?;
+                .transform_data(|expr| {
+                    rewrite_with_guarantees_map(expr, &guarantees_map)
+                })?;
             expr = data;
             num_cycles += 1;
             // Track if any transformation occurred
@@ -285,24 +270,22 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     ///
     /// ```rust
     /// use arrow::datatypes::{DataType, Field, Schema};
-    /// use datafusion_expr::{col, lit, Expr};
-    /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
     /// use datafusion_common::{Result, ScalarValue, ToDFSchema};
-    /// use datafusion_expr::execution_props::ExecutionProps;
+    /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
     /// use datafusion_expr::simplify::SimplifyContext;
+    /// use datafusion_expr::{col, lit, Expr};
     /// use datafusion_optimizer::simplify_expressions::ExprSimplifier;
     ///
     /// let schema = Schema::new(vec![
-    ///   Field::new("x", DataType::Int64, false),
-    ///   Field::new("y", DataType::UInt32, false),
-    ///   Field::new("z", DataType::Int64, false),
-    ///   ])
-    ///   .to_dfschema_ref().unwrap();
+    ///     Field::new("x", DataType::Int64, false),
+    ///     Field::new("y", DataType::UInt32, false),
+    ///     Field::new("z", DataType::Int64, false),
+    /// ])
+    /// .to_dfschema_ref()
+    /// .unwrap();
     ///
     /// // Create the simplifier
-    /// let props = ExecutionProps::new();
-    /// let context = SimplifyContext::new(&props)
-    ///    .with_schema(schema);
+    /// let context = SimplifyContext::default().with_schema(schema);
     ///
     /// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5)
     /// let expr_x = col("x").gt_eq(lit(3_i64));
@@ -311,15 +294,18 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// let expr = expr_x.and(expr_y).and(expr_z.clone());
     ///
     /// let guarantees = vec![
-    ///    // x ∈ [3, 5]
-    ///    (
-    ///        col("x"),
-    ///        NullableInterval::NotNull {
-    ///            values: Interval::make(Some(3_i64), Some(5_i64)).unwrap()
-    ///        }
-    ///    ),
-    ///    // y = 3
-    ///    (col("y"), NullableInterval::from(ScalarValue::UInt32(Some(3)))),
+    ///     // x ∈ [3, 5]
+    ///     (
+    ///         col("x"),
+    ///         NullableInterval::NotNull {
+    ///             values: Interval::make(Some(3_i64), Some(5_i64)).unwrap(),
+    ///         },
+    ///     ),
+    ///     // y = 3
+    ///     (
+    ///         col("y"),
+    ///         NullableInterval::from(ScalarValue::UInt32(Some(3))),
+    ///     ),
     /// ];
     /// let simplifier = ExprSimplifier::new(context).with_guarantees(guarantees);
     /// let output = simplifier.simplify(expr).unwrap();
@@ -344,24 +330,22 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     ///
     /// ```rust
     /// use arrow::datatypes::{DataType, Field, Schema};
-    /// use datafusion_expr::{col, lit, Expr};
-    /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
     /// use datafusion_common::{Result, ScalarValue, ToDFSchema};
-    /// use datafusion_expr::execution_props::ExecutionProps;
+    /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
     /// use datafusion_expr::simplify::SimplifyContext;
+    /// use datafusion_expr::{col, lit, Expr};
     /// use datafusion_optimizer::simplify_expressions::ExprSimplifier;
     ///
     /// let schema = Schema::new(vec![
-    ///   Field::new("a", DataType::Int64, false),
-    ///   Field::new("b", DataType::Int64, false),
-    ///   Field::new("c", DataType::Int64, false),
-    ///   ])
-    ///   .to_dfschema_ref().unwrap();
+    ///     Field::new("a", DataType::Int64, false),
+    ///     Field::new("b", DataType::Int64, false),
+    ///     Field::new("c", DataType::Int64, false),
+    /// ])
+    /// .to_dfschema_ref()
+    /// .unwrap();
     ///
     /// // Create the simplifier
-    /// let props = ExecutionProps::new();
-    /// let context = SimplifyContext::new(&props)
-    ///    .with_schema(schema);
+    /// let context = SimplifyContext::default().with_schema(schema);
     /// let simplifier = ExprSimplifier::new(context);
     ///
     /// // Expression: a = c AND 1 = b
@@ -375,9 +359,9 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     ///
     /// // If canonicalization is disabled, the expression is not changed
     /// let non_canonicalized = simplifier
-    ///   .with_canonicalize(false)
-    ///   .simplify(expr.clone())
-    ///   .unwrap();
+    ///     .with_canonicalize(false)
+    ///     .simplify(expr.clone())
+    ///     .unwrap();
     ///
     /// assert_eq!(non_canonicalized, expr);
     /// ```
@@ -406,7 +390,6 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// use arrow::datatypes::{DataType, Field, Schema};
     /// use datafusion_expr::{col, lit, Expr};
     /// use datafusion_common::{Result, ScalarValue, ToDFSchema};
-    /// use datafusion_expr::execution_props::ExecutionProps;
     /// use datafusion_expr::simplify::SimplifyContext;
     /// use datafusion_optimizer::simplify_expressions::ExprSimplifier;
     ///
@@ -416,9 +399,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     ///   .to_dfschema_ref().unwrap();
     ///
     /// // Create the simplifier
-    /// let props = ExecutionProps::new();
-    /// let context = SimplifyContext::new(&props)
-    ///    .with_schema(schema);
+    /// let context = SimplifyContext::default().with_schema(schema);
     /// let simplifier = ExprSimplifier::new(context);
     ///
     /// // Expression: a IS NOT NULL
@@ -436,7 +417,6 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
     /// assert_eq!(simplified_expr.data, lit(true));
     /// // Only 1 cycle was executed
     /// assert_eq!(count, 1);
-    ///
     /// ```
     pub fn with_max_cycles(mut self, max_simplifier_cycles: u32) -> Self {
         self.max_simplifier_cycles = max_simplifier_cycles;
@@ -477,7 +457,7 @@ impl TreeNodeRewriter for Canonicalizer {
                 })))
             }
             // <literal> <op> <col>
-            (Expr::Literal(_a), Expr::Column(_b), Some(swapped_op)) => {
+            (Expr::Literal(_a, _), Expr::Column(_b), Some(swapped_op)) => {
                 Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr {
                     left: right,
                     op: swapped_op,
@@ -493,12 +473,11 @@ impl TreeNodeRewriter for Canonicalizer {
     }
 }
 
-#[allow(rustdoc::private_intra_doc_links)]
 /// Partially evaluate `Expr`s so constant subtrees are evaluated at plan time.
 ///
 /// Note it does not handle algebraic rewrites such as `(a or false)`
 /// --> `a`, which is handled by [`Simplifier`]
-struct ConstEvaluator<'a> {
+struct ConstEvaluator {
     /// `can_evaluate` is used during the depth-first-search of the
     /// `Expr` tree to track if any siblings (or their descendants) were
     /// non evaluatable (e.g. had a column reference or volatile
@@ -512,25 +491,28 @@ struct ConstEvaluator<'a> {
     /// means there were no non evaluatable siblings (or their
     /// descendants) so this `Expr` can be evaluated
     can_evaluate: Vec<bool>,
-
-    execution_props: &'a ExecutionProps,
+    /// Execution properties needed to call [`create_physical_expr`].
+    /// `ConstEvaluator` only evaluates expressions without column references
+    /// (i.e. constant expressions) and doesn't use the variable binding features
+    /// of `ExecutionProps` (we explicitly filter out [`Expr::ScalarVariable`]).
+    /// The `config_options` are passed from the session to allow scalar functions
+    /// to access configuration like timezone.
+    execution_props: ExecutionProps,
     input_schema: DFSchema,
     input_batch: RecordBatch,
 }
 
-#[allow(dead_code)]
 /// The simplify result of ConstEvaluator
-#[allow(clippy::large_enum_variant)]
 enum ConstSimplifyResult {
     // Expr was simplified and contains the new expression
-    Simplified(ScalarValue),
+    Simplified(ScalarValue, Option<FieldMetadata>),
     // Expr was not simplified and original value is returned
-    NotSimplified(ScalarValue),
+    NotSimplified(ScalarValue, Option<FieldMetadata>),
     // Evaluation encountered an error, contains the original expression
     SimplifyRuntimeError(DataFusionError, Expr),
 }
 
-impl TreeNodeRewriter for ConstEvaluator<'_> {
+impl TreeNodeRewriter for ConstEvaluator {
     type Node = Expr;
 
     fn f_down(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
@@ -567,13 +549,23 @@ impl TreeNodeRewriter for ConstEvaluator<'_> {
             // any error is countered during simplification, return the original
             // so that normal evaluation can occur
             Some(true) => match self.evaluate_to_scalar(expr) {
-                ConstSimplifyResult::Simplified(s) => {
-                    Ok(Transformed::yes(Expr::Literal(s)))
+                ConstSimplifyResult::Simplified(s, m) => {
+                    Ok(Transformed::yes(Expr::Literal(s, m)))
                 }
-                ConstSimplifyResult::NotSimplified(s) => {
-                    Ok(Transformed::no(Expr::Literal(s)))
+                ConstSimplifyResult::NotSimplified(s, m) => {
+                    Ok(Transformed::no(Expr::Literal(s, m)))
                 }
-                ConstSimplifyResult::SimplifyRuntimeError(_, expr) => {
+                ConstSimplifyResult::SimplifyRuntimeError(err, expr) => {
+                    // For CAST expressions with literal inputs, propagate the error at plan time rather than deferring to execution time.
+                    // This provides clearer error messages and fails fast.
+                    if let Expr::Cast(Cast { ref expr, .. })
+                    | Expr::TryCast(TryCast { ref expr, .. }) = expr
+                        && matches!(expr.as_ref(), Expr::Literal(_, _))
+                    {
+                        return Err(err);
+                    }
+                    // For other expressions (like CASE, COALESCE), preserve the original
+                    // to allow short-circuit evaluation at execution time
                     Ok(Transformed::yes(expr))
                 }
             },
@@ -583,19 +575,32 @@ impl TreeNodeRewriter for ConstEvaluator<'_> {
     }
 }
 
-impl<'a> ConstEvaluator<'a> {
-    /// Create a new `ConstantEvaluator`. Session constants (such as
-    /// the time for `now()` are taken from the passed
-    /// `execution_props`.
-    pub fn try_new(execution_props: &'a ExecutionProps) -> Result<Self> {
+impl ConstEvaluator {
+    /// Create a new `ConstantEvaluator`.
+    ///
+    /// Note: `ConstEvaluator` filters out expressions with scalar variables
+    /// (like `$var`) and volatile functions, so it creates its own default
+    /// `ExecutionProps` internally. The filtered expressions will be evaluated
+    /// at runtime where proper variable bindings are available.
+    ///
+    /// The `config_options` parameter is used to pass session configuration
+    /// (like timezone) to scalar functions during constant evaluation.
+    pub fn try_new(config_options: Option<Arc<ConfigOptions>>) -> Result<Self> {
         // The dummy column name is unused and doesn't matter as only
         // expressions without column references can be evaluated
         static DUMMY_COL_NAME: &str = ".";
-        let schema = Schema::new(vec![Field::new(DUMMY_COL_NAME, DataType::Null, true)]);
-        let input_schema = DFSchema::try_from(schema.clone())?;
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            DUMMY_COL_NAME,
+            DataType::Null,
+            true,
+        )]));
+        let input_schema = DFSchema::try_from(Arc::clone(&schema))?;
         // Need a single "input" row to produce a single output row
         let col = new_null_array(&DataType::Null, 1);
-        let input_batch = RecordBatch::try_new(std::sync::Arc::new(schema), vec![col])?;
+        let input_batch = RecordBatch::try_new(schema, vec![col])?;
+
+        let mut execution_props = ExecutionProps::new();
+        execution_props.config_options = config_options;
 
         Ok(Self {
             can_evaluate: vec![],
@@ -632,6 +637,7 @@ impl<'a> ConstEvaluator<'a> {
             | Expr::OuterReferenceColumn(_, _)
             | Expr::Exists { .. }
             | Expr::InSubquery(_)
+            | Expr::SetComparison(_)
             | Expr::ScalarSubquery(_)
             | Expr::WindowFunction { .. }
             | Expr::GroupingSet(_)
@@ -640,7 +646,35 @@ impl<'a> ConstEvaluator<'a> {
             Expr::ScalarFunction(ScalarFunction { func, .. }) => {
                 Self::volatility_ok(func.signature().volatility)
             }
-            Expr::Literal(_)
+            Expr::Cast(Cast { expr, field }) | Expr::TryCast(TryCast { expr, field }) => {
+                if let (
+                    Ok(DataType::Struct(source_fields)),
+                    DataType::Struct(target_fields),
+                ) = (expr.get_type(&DFSchema::empty()), field.data_type())
+                {
+                    // Don't const-fold struct casts with different field counts
+                    if source_fields.len() != target_fields.len() {
+                        return false;
+                    }
+
+                    // Skip const-folding when there is no field name overlap
+                    if !has_one_of_more_common_fields(&source_fields, target_fields) {
+                        return false;
+                    }
+
+                    // Don't const-fold struct casts with empty (0-row) literals
+                    // The simplifier uses a 1-row input batch, which causes dimension mismatches
+                    // when evaluating 0-row struct literals
+                    if let Expr::Literal(ScalarValue::Struct(struct_array), _) =
+                        expr.as_ref()
+                        && struct_array.len() == 0
+                    {
+                        return false;
+                    }
+                }
+                true
+            }
+            Expr::Literal(_, _)
             | Expr::Alias(..)
             | Expr::Unnest(_)
             | Expr::BinaryExpr { .. }
@@ -658,23 +692,34 @@ impl<'a> ConstEvaluator<'a> {
             | Expr::Like { .. }
             | Expr::SimilarTo { .. }
             | Expr::Case(_)
-            | Expr::Cast { .. }
-            | Expr::TryCast { .. }
             | Expr::InList { .. } => true,
         }
     }
 
     /// Internal helper to evaluates an Expr
     pub(crate) fn evaluate_to_scalar(&mut self, expr: Expr) -> ConstSimplifyResult {
-        if let Expr::Literal(s) = expr {
-            return ConstSimplifyResult::NotSimplified(s);
+        if let Expr::Literal(s, m) = expr {
+            return ConstSimplifyResult::NotSimplified(s, m);
         }
 
-        let phys_expr =
-            match create_physical_expr(&expr, &self.input_schema, self.execution_props) {
-                Ok(e) => e,
-                Err(err) => return ConstSimplifyResult::SimplifyRuntimeError(err, expr),
-            };
+        let phys_expr = match create_physical_expr(
+            &expr,
+            &self.input_schema,
+            &self.execution_props,
+        ) {
+            Ok(e) => e,
+            Err(err) => return ConstSimplifyResult::SimplifyRuntimeError(err, expr),
+        };
+        let metadata = phys_expr
+            .return_field(self.input_batch.schema_ref())
+            .ok()
+            .and_then(|f| {
+                let m = f.metadata();
+                match m.is_empty() {
+                    true => None,
+                    false => Some(FieldMetadata::from(m)),
+                }
+            });
         let col_val = match phys_expr.evaluate(&self.input_batch) {
             Ok(v) => v,
             Err(err) => return ConstSimplifyResult::SimplifyRuntimeError(err, expr),
@@ -683,49 +728,31 @@ impl<'a> ConstEvaluator<'a> {
             ColumnarValue::Array(a) => {
                 if a.len() != 1 {
                     ConstSimplifyResult::SimplifyRuntimeError(
-                        DataFusionError::Execution(format!("Could not evaluate the expression, found a result of length {}", a.len())),
+                        exec_datafusion_err!(
+                            "Could not evaluate the expression, found a result of length {}",
+                            a.len()
+                        ),
                         expr,
                     )
                 } else if as_list_array(&a).is_ok() {
-                    ConstSimplifyResult::Simplified(ScalarValue::List(
-                        a.as_list::<i32>().to_owned().into(),
-                    ))
+                    ConstSimplifyResult::Simplified(
+                        ScalarValue::List(a.as_list::<i32>().to_owned().into()),
+                        metadata,
+                    )
                 } else if as_large_list_array(&a).is_ok() {
-                    ConstSimplifyResult::Simplified(ScalarValue::LargeList(
-                        a.as_list::<i64>().to_owned().into(),
-                    ))
+                    ConstSimplifyResult::Simplified(
+                        ScalarValue::LargeList(a.as_list::<i64>().to_owned().into()),
+                        metadata,
+                    )
                 } else {
                     // Non-ListArray
                     match ScalarValue::try_from_array(&a, 0) {
-                        Ok(s) => {
-                            // TODO: support the optimization for `Map` type after support impl hash for it
-                            if matches!(&s, ScalarValue::Map(_)) {
-                                ConstSimplifyResult::SimplifyRuntimeError(
-                                    DataFusionError::NotImplemented("Const evaluate for Map type is still not supported".to_string()),
-                                    expr,
-                                )
-                            } else {
-                                ConstSimplifyResult::Simplified(s)
-                            }
-                        }
+                        Ok(s) => ConstSimplifyResult::Simplified(s, metadata),
                         Err(err) => ConstSimplifyResult::SimplifyRuntimeError(err, expr),
                     }
                 }
             }
-            ColumnarValue::Scalar(s) => {
-                // TODO: support the optimization for `Map` type after support impl hash for it
-                if matches!(&s, ScalarValue::Map(_)) {
-                    ConstSimplifyResult::SimplifyRuntimeError(
-                        DataFusionError::NotImplemented(
-                            "Const evaluate for Map type is still not supported"
-                                .to_string(),
-                        ),
-                        expr,
-                    )
-                } else {
-                    ConstSimplifyResult::Simplified(s)
-                }
-            }
+            ColumnarValue::Scalar(s) => ConstSimplifyResult::Simplified(s, metadata),
         }
     }
 }
@@ -739,17 +766,17 @@ impl<'a> ConstEvaluator<'a> {
 /// * `false = true` and `true = false` to `false`
 /// * `!!expr` to `expr`
 /// * `expr = null` and `expr != null` to `null`
-struct Simplifier<'a, S> {
-    info: &'a S,
+struct Simplifier<'a> {
+    info: &'a SimplifyContext,
 }
 
-impl<'a, S> Simplifier<'a, S> {
-    pub fn new(info: &'a S) -> Self {
+impl<'a> Simplifier<'a> {
+    pub fn new(info: &'a SimplifyContext) -> Self {
         Self { info }
     }
 }
 
-impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
+impl TreeNodeRewriter for Simplifier<'_> {
     type Node = Expr;
 
     /// rewrite the expression simplifying any constant expressions
@@ -762,6 +789,29 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
 
         let info = self.info;
         Ok(match expr {
+            // `value op NULL` -> `NULL`
+            // `NULL op value` -> `NULL`
+            // except for few operators that can return non-null value even when one of the operands is NULL
+            ref expr @ Expr::BinaryExpr(BinaryExpr {
+                ref left,
+                ref op,
+                ref right,
+            }) if op.returns_null_on_null()
+                && (is_null(left.as_ref()) || is_null(right.as_ref())) =>
+            {
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::try_new_null(&info.get_data_type(expr)?)?,
+                    None,
+                ))
+            }
+
+            // `NULL {AND, OR} NULL` -> `NULL`
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: And | Or,
+                right,
+            }) if is_null(&left) && is_null(&right) => Transformed::yes(lit_bool_null()),
+
             //
             // Rules for Eq
             //
@@ -1021,9 +1071,27 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                         right: left_right,
                     }))
                 } else {
-                    return internal_err!("can_reduce_to_equal_statement should only be called with a BinaryExpr");
+                    return internal_err!(
+                        "can_reduce_to_equal_statement should only be called with a BinaryExpr"
+                    );
                 }
             }
+            // A = L1 AND A != L2 --> A = L1 (when L1 != L2)
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: And,
+                right,
+            }) if is_eq_and_ne_with_different_literal(&left, &right) => {
+                Transformed::yes(*left)
+            }
+            // A != L2 AND A = L1 --> A = L1 (when L1 != L2)
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: And,
+                right,
+            }) if is_eq_and_ne_with_different_literal(&right, &left) => {
+                Transformed::yes(*right)
+            }
 
             //
             // Rules for Multiply
@@ -1037,14 +1105,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             }) if is_one(&right) => {
                 simplify_right_is_one_case(info, left, &Multiply, &right)?
             }
-            // A * null --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Multiply,
-                right,
-            }) if is_null(&right) => {
-                simplify_right_is_null_case(info, &left, &Multiply, right)?
-            }
             // 1 * A --> A
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1054,14 +1114,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 // 1 * A is equivalent to A * 1
                 simplify_right_is_one_case(info, right, &Multiply, &left)?
             }
-            // null * A --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Multiply,
-                right,
-            }) if is_null(&left) => {
-                simplify_right_is_null_case(info, &right, &Multiply, left)?
-            }
 
             // A * 0 --> 0 (if A is not null and not floating, since NAN * 0 -> NAN)
             Expr::BinaryExpr(BinaryExpr {
@@ -1098,37 +1150,11 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             }) if is_one(&right) => {
                 simplify_right_is_one_case(info, left, &Divide, &right)?
             }
-            // A / null --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Divide,
-                right,
-            }) if is_null(&right) => {
-                simplify_right_is_null_case(info, &left, &Divide, right)?
-            }
-            // null / A --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Divide,
-                right,
-            }) if is_null(&left) => simplify_null_div_other_case(info, left, &right)?,
 
             //
             // Rules for Modulo
             //
 
-            // A % null --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: Modulo,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-            // null % A --> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Modulo,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
             // A % 1 --> 0 (if A is not nullable and not floating, since NAN % 1 --> NAN)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1138,29 +1164,16 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 && !info.get_data_type(&left)?.is_floating()
                 && is_one(&right) =>
             {
-                Transformed::yes(Expr::Literal(ScalarValue::new_zero(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_zero(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             //
             // Rules for BitwiseAnd
             //
 
-            // A & null -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: BitwiseAnd,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-
-            // null & A -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: BitwiseAnd,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
-
             // A & 0 -> 0 (if A not nullable)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1181,9 +1194,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseAnd,
                 right,
             }) if is_negative_of(&left, &right) && !info.nullable(&right)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_zero(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_zero(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // A & !A -> 0 (if A not nullable)
@@ -1192,9 +1206,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseAnd,
                 right,
             }) if is_negative_of(&right, &left) && !info.nullable(&left)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_zero(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_zero(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // (..A..) & A --> (..A..)
@@ -1233,20 +1248,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // Rules for BitwiseOr
             //
 
-            // A | null -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: BitwiseOr,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-
-            // null | A -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: BitwiseOr,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
-
             // A | 0 -> A (even if A is null)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1267,9 +1268,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseOr,
                 right,
             }) if is_negative_of(&left, &right) && !info.nullable(&right)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_negative_one(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_negative_one(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // A | !A -> -1 (if A not nullable)
@@ -1278,9 +1280,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseOr,
                 right,
             }) if is_negative_of(&right, &left) && !info.nullable(&left)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_negative_one(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_negative_one(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // (..A..) | A --> (..A..)
@@ -1319,20 +1322,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // Rules for BitwiseXor
             //
 
-            // A ^ null -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: BitwiseXor,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-
-            // null ^ A -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: BitwiseXor,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
-
             // A ^ 0 -> A (if A not nullable)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1353,9 +1342,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseXor,
                 right,
             }) if is_negative_of(&left, &right) && !info.nullable(&right)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_negative_one(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_negative_one(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // A ^ !A -> -1 (if A not nullable)
@@ -1364,9 +1354,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 op: BitwiseXor,
                 right,
             }) if is_negative_of(&right, &left) && !info.nullable(&left)? => {
-                Transformed::yes(Expr::Literal(ScalarValue::new_negative_one(
-                    &info.get_data_type(&left)?,
-                )?))
+                Transformed::yes(Expr::Literal(
+                    ScalarValue::new_negative_one(&info.get_data_type(&left)?)?,
+                    None,
+                ))
             }
 
             // (..A..) ^ A --> (the expression without A, if number of A is odd, otherwise one A)
@@ -1377,7 +1368,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             }) if expr_contains(&left, &right, BitwiseXor) => {
                 let expr = delete_xor_in_complex_expr(&left, &right, false);
                 Transformed::yes(if expr == *right {
-                    Expr::Literal(ScalarValue::new_zero(&info.get_data_type(&right)?)?)
+                    Expr::Literal(
+                        ScalarValue::new_zero(&info.get_data_type(&right)?)?,
+                        None,
+                    )
                 } else {
                     expr
                 })
@@ -1391,7 +1385,10 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             }) if expr_contains(&right, &left, BitwiseXor) => {
                 let expr = delete_xor_in_complex_expr(&right, &left, true);
                 Transformed::yes(if expr == *left {
-                    Expr::Literal(ScalarValue::new_zero(&info.get_data_type(&left)?)?)
+                    Expr::Literal(
+                        ScalarValue::new_zero(&info.get_data_type(&left)?)?,
+                        None,
+                    )
                 } else {
                     expr
                 })
@@ -1401,20 +1398,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // Rules for BitwiseShiftRight
             //
 
-            // A >> null -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: BitwiseShiftRight,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-
-            // null >> A -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: BitwiseShiftRight,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
-
             // A >> 0 -> A (even if A is null)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1426,20 +1409,6 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // Rules for BitwiseShiftRight
             //
 
-            // A << null -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left: _,
-                op: BitwiseShiftLeft,
-                right,
-            }) if is_null(&right) => Transformed::yes(*right),
-
-            // null << A -> null
-            Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: BitwiseShiftLeft,
-                right: _,
-            }) if is_null(&left) => Transformed::yes(*left),
-
             // A << 0 -> A (even if A is null)
             Expr::BinaryExpr(BinaryExpr {
                 left,
@@ -1461,6 +1430,89 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // Rules for Case
             //
 
+            // Inline a comparison to a literal with the case statement into the `THEN` clauses.
+            // which can enable further simplifications
+            // CASE WHEN X THEN "a" WHEN Y THEN "b" ... END = "a" --> CASE WHEN X THEN "a" = "a" WHEN Y THEN "b" = "a" END
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: op @ (Eq | NotEq),
+                right,
+            }) if is_case_with_literal_outputs(&left) && is_lit(&right) => {
+                let case = into_case(*left)?;
+                Transformed::yes(Expr::Case(Case {
+                    expr: None,
+                    when_then_expr: case
+                        .when_then_expr
+                        .into_iter()
+                        .map(|(when, then)| {
+                            (
+                                when,
+                                Box::new(Expr::BinaryExpr(BinaryExpr {
+                                    left: then,
+                                    op,
+                                    right: right.clone(),
+                                })),
+                            )
+                        })
+                        .collect(),
+                    else_expr: case.else_expr.map(|els| {
+                        Box::new(Expr::BinaryExpr(BinaryExpr {
+                            left: els,
+                            op,
+                            right,
+                        }))
+                    }),
+                }))
+            }
+
+            // CASE WHEN true THEN A ... END --> A
+            // CASE WHEN X THEN A WHEN TRUE THEN B ... END --> CASE WHEN X THEN A ELSE B END
+            // CASE WHEN false THEN A END --> NULL
+            // CASE WHEN false THEN A ELSE B END --> B
+            // CASE WHEN X THEN A WHEN false THEN B END --> CASE WHEN X THEN A ELSE B END
+            Expr::Case(Case {
+                expr: None,
+                when_then_expr,
+                mut else_expr,
+            }) if when_then_expr
+                .iter()
+                .any(|(when, _)| is_true(when.as_ref()) || is_false(when.as_ref())) =>
+            {
+                let out_type = info.get_data_type(&when_then_expr[0].1)?;
+                let mut new_when_then_expr = Vec::with_capacity(when_then_expr.len());
+
+                for (when, then) in when_then_expr.into_iter() {
+                    if is_true(when.as_ref()) {
+                        // Skip adding the rest of the when-then expressions after WHEN true
+                        // CASE WHEN X THEN A WHEN TRUE THEN B ... END --> CASE WHEN X THEN A ELSE B END
+                        else_expr = Some(then);
+                        break;
+                    } else if !is_false(when.as_ref()) {
+                        new_when_then_expr.push((when, then));
+                    }
+                    // else: skip WHEN false cases
+                }
+
+                // Exclude CASE statement altogether if there are no when-then expressions left
+                if new_when_then_expr.is_empty() {
+                    // CASE WHEN false THEN A ELSE B END --> B
+                    if let Some(else_expr) = else_expr {
+                        return Ok(Transformed::yes(*else_expr));
+                    // CASE WHEN false THEN A END --> NULL
+                    } else {
+                        let null =
+                            Expr::Literal(ScalarValue::try_new_null(&out_type)?, None);
+                        return Ok(Transformed::yes(null));
+                    }
+                }
+
+                Transformed::yes(Expr::Case(Case {
+                    expr: None,
+                    when_then_expr: new_when_then_expr,
+                    else_expr,
+                }))
+            }
+
             // CASE
             //   WHEN X THEN A
             //   WHEN Y THEN B
@@ -1477,7 +1529,11 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 when_then_expr,
                 else_expr,
             }) if !when_then_expr.is_empty()
-                && when_then_expr.len() < 3 // The rewrite is O(n²) so limit to small number
+                // The rewrite is O(n²) in general so limit to small number of when-thens that can be true
+                && (when_then_expr.len() < 3 // small number of input whens
+                    // or all thens are literal bools and a small number of them are true
+                    || (when_then_expr.iter().all(|(_, then)| is_bool_lit(then))
+                        && when_then_expr.iter().filter(|(_, then)| is_true(then)).count() < 3))
                 && info.is_boolean_type(&when_then_expr[0].1)? =>
             {
                 // String disjunction of all the when predicates encountered so far. Not nullable.
@@ -1501,6 +1557,55 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 // Do a first pass at simplification
                 out_expr.rewrite(self)?
             }
+            // CASE
+            //   WHEN X THEN true
+            //   WHEN Y THEN true
+            //   WHEN Z THEN false
+            //   ...
+            //   ELSE true
+            // END
+            //
+            // --->
+            //
+            // NOT(CASE
+            //   WHEN X THEN false
+            //   WHEN Y THEN false
+            //   WHEN Z THEN true
+            //   ...
+            //   ELSE false
+            // END)
+            //
+            // Note: the rationale for this rewrite is that the case can then be further
+            // simplified into a small number of ANDs and ORs
+            Expr::Case(Case {
+                expr: None,
+                when_then_expr,
+                else_expr,
+            }) if !when_then_expr.is_empty()
+                && when_then_expr
+                    .iter()
+                    .all(|(_, then)| is_bool_lit(then)) // all thens are literal bools
+                // This simplification is only helpful if we end up with a small number of true thens
+                && when_then_expr
+                    .iter()
+                    .filter(|(_, then)| is_false(then))
+                    .count()
+                    < 3
+                && else_expr.as_deref().is_none_or(is_bool_lit) =>
+            {
+                Transformed::yes(
+                    Expr::Case(Case {
+                        expr: None,
+                        when_then_expr: when_then_expr
+                            .into_iter()
+                            .map(|(when, then)| (when, Box::new(Expr::Not(then))))
+                            .collect(),
+                        else_expr: else_expr
+                            .map(|else_expr| Box::new(Expr::Not(else_expr))),
+                    })
+                    .not(),
+                )
+            }
             Expr::ScalarFunction(ScalarFunction { func: udf, args }) => {
                 match udf.simplify(args, info)? {
                     ExprSimplifyResult::Original(args) => {
@@ -1523,12 +1628,9 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 (_, expr) => Transformed::no(expr),
             },
 
-            Expr::WindowFunction(WindowFunction {
-                fun: WindowFunctionDefinition::WindowUDF(ref udwf),
-                ..
-            }) => match (udwf.simplify(), expr) {
+            Expr::WindowFunction(ref window_fun) => match (window_fun.simplify(), expr) {
                 (Some(simplify_function), Expr::WindowFunction(wf)) => {
-                    Transformed::yes(simplify_function(wf, info)?)
+                    Transformed::yes(simplify_function(*wf, info)?)
                 }
                 (_, expr) => Transformed::no(expr),
             },
@@ -1557,17 +1659,19 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 left,
                 op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
                 right,
-            }) => Transformed::yes(simplify_regex_expr(left, op, right)?),
+            }) => simplify_regex_expr(left, op, right)?,
 
             // Rules for Like
             Expr::Like(like) => {
                 // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291
                 let escape_char = like.escape_char.unwrap_or('\\');
-                match as_string_scalar(&like.pattern) {
-                    Some((data_type, pattern_str)) => {
+
+                match StringScalar::try_from_expr(&like.pattern) {
+                    Some(string_scalar) => {
+                        let pattern_str = string_scalar.as_str();
                         match pattern_str {
                             None => return Ok(Transformed::yes(lit_bool_null())),
-                            Some(pattern_str) if pattern_str == "%" => {
+                            Some("%") => {
                                 // exp LIKE '%' is
                                 //   - when exp is not NULL, it's true
                                 //   - when exp is NULL, it's NULL
@@ -1599,10 +1703,9 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                                     .replace_all(pattern_str, "%")
                                     .to_string();
                                 Transformed::yes(Expr::Like(Like {
-                                    pattern: Box::new(to_string_scalar(
-                                        data_type,
-                                        Some(simplified_pattern),
-                                    )),
+                                    pattern: Box::new(
+                                        string_scalar.to_expr(&simplified_pattern),
+                                    ),
                                     ..like
                                 }))
                             }
@@ -1642,20 +1745,20 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             // expr IN () --> false
             // expr NOT IN () --> true
             Expr::InList(InList {
-                expr,
+                expr: _,
                 list,
                 negated,
-            }) if list.is_empty() && *expr != Expr::Literal(ScalarValue::Null) => {
-                Transformed::yes(lit(negated))
-            }
+            }) if list.is_empty() => Transformed::yes(lit(negated)),
 
             // null in (x, y, z) --> null
             // null not in (x, y, z) --> null
             Expr::InList(InList {
                 expr,
-                list: _,
+                list,
                 negated: _,
-            }) if is_null(expr.as_ref()) => Transformed::yes(lit_bool_null()),
+            }) if is_null(expr.as_ref()) && !list.is_empty() => {
+                Transformed::yes(lit_bool_null())
+            }
 
             // expr IN ((subquery)) -> expr IN (subquery), see ##5529
             Expr::InList(InList {
@@ -1828,7 +1931,7 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                     info, &left, op, &right,
                 ) && op.supports_propagation() =>
             {
-                unwrap_cast_in_comparison_for_binary(info, left, right, op)?
+                unwrap_cast_in_comparison_for_binary(info, *left, *right, op)?
             }
             // literal op try_cast/cast(expr as data_type)
             // -->
@@ -1841,8 +1944,8 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
             {
                 unwrap_cast_in_comparison_for_binary(
                     info,
-                    right,
-                    left,
+                    *right,
+                    *left,
                     op.swap().unwrap(),
                 )?
             }
@@ -1871,12 +1974,12 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                     .into_iter()
                     .map(|right| {
                         match right {
-                            Expr::Literal(right_lit_value) => {
+                            Expr::Literal(right_lit_value, _) => {
                                 // if the right_lit_value can be casted to the type of internal_left_expr
                                 // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal
                                 let Some(value) = try_cast_literal_to_type(&right_lit_value, &expr_type) else {
                                     internal_err!(
-                                        "Can't cast the list expr {:?} to type {:?}",
+                                        "Can't cast the list expr {:?} to type {}",
                                         right_lit_value, &expr_type
                                     )?
                                 };
@@ -1897,27 +2000,180 @@ impl<S: SimplifyInfo> TreeNodeRewriter for Simplifier<'_, S> {
                 }))
             }
 
+            // =======================================
+            // preimage_in_comparison
+            // =======================================
+            //
+            // For case:
+            // date_part('YEAR', expr) op literal
+            //
+            // For details see datafusion_expr::ScalarUDFImpl::preimage
+            Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
+                use datafusion_expr::Operator::*;
+                let is_preimage_op = matches!(
+                    op,
+                    Eq | NotEq
+                        | Lt
+                        | LtEq
+                        | Gt
+                        | GtEq
+                        | IsDistinctFrom
+                        | IsNotDistinctFrom
+                );
+                if !is_preimage_op || is_null(&right) {
+                    return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
+                        left,
+                        op,
+                        right,
+                    })));
+                }
+
+                if let PreimageResult::Range { interval, expr } =
+                    get_preimage(left.as_ref(), right.as_ref(), info)?
+                {
+                    rewrite_with_preimage(*interval, op, expr)?
+                } else if let Some(swapped) = op.swap() {
+                    if let PreimageResult::Range { interval, expr } =
+                        get_preimage(right.as_ref(), left.as_ref(), info)?
+                    {
+                        rewrite_with_preimage(*interval, swapped, expr)?
+                    } else {
+                        Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
+                    }
+                } else {
+                    Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
+                }
+            }
+            // For case:
+            // date_part('YEAR', expr) IN (literal1, literal2, ...)
+            Expr::InList(InList {
+                expr,
+                list,
+                negated,
+            }) => {
+                if list.len() > THRESHOLD_INLINE_INLIST || list.iter().any(is_null) {
+                    return Ok(Transformed::no(Expr::InList(InList {
+                        expr,
+                        list,
+                        negated,
+                    })));
+                }
+
+                let (op, combiner): (Operator, fn(Expr, Expr) -> Expr) =
+                    if negated { (NotEq, and) } else { (Eq, or) };
+
+                let mut rewritten: Option<Expr> = None;
+                for item in &list {
+                    let PreimageResult::Range { interval, expr } =
+                        get_preimage(expr.as_ref(), item, info)?
+                    else {
+                        return Ok(Transformed::no(Expr::InList(InList {
+                            expr,
+                            list,
+                            negated,
+                        })));
+                    };
+
+                    let range_expr = rewrite_with_preimage(*interval, op, expr)?.data;
+                    rewritten = Some(match rewritten {
+                        None => range_expr,
+                        Some(acc) => combiner(acc, range_expr),
+                    });
+                }
+
+                if let Some(rewritten) = rewritten {
+                    Transformed::yes(rewritten)
+                } else {
+                    Transformed::no(Expr::InList(InList {
+                        expr,
+                        list,
+                        negated,
+                    }))
+                }
+            }
+
             // no additional rewrites possible
             expr => Transformed::no(expr),
         })
     }
 }
 
-fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
+fn get_preimage(
+    left_expr: &Expr,
+    right_expr: &Expr,
+    info: &SimplifyContext,
+) -> Result<PreimageResult> {
+    let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else {
+        return Ok(PreimageResult::None);
+    };
+    if !is_literal_or_literal_cast(right_expr) {
+        return Ok(PreimageResult::None);
+    }
+    if func.signature().volatility != Volatility::Immutable {
+        return Ok(PreimageResult::None);
+    }
+    func.preimage(args, right_expr, info)
+}
+
+fn is_literal_or_literal_cast(expr: &Expr) -> bool {
     match expr {
-        Expr::Literal(ScalarValue::Utf8(s)) => Some((DataType::Utf8, s)),
-        Expr::Literal(ScalarValue::LargeUtf8(s)) => Some((DataType::LargeUtf8, s)),
-        Expr::Literal(ScalarValue::Utf8View(s)) => Some((DataType::Utf8View, s)),
-        _ => None,
+        Expr::Literal(_, _) => true,
+        Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)),
+        Expr::TryCast(TryCast { expr, .. }) => {
+            matches!(expr.as_ref(), Expr::Literal(_, _))
+        }
+        _ => false,
     }
 }
 
-fn to_string_scalar(data_type: DataType, value: Option<String>) -> Expr {
-    match data_type {
-        DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value)),
-        DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value)),
-        DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value)),
-        _ => unreachable!(),
+/// Helper for working with string scalar values (Utf8, LargeUtf8, Utf8View)
+pub(crate) enum StringScalar<'a> {
+    Utf8(&'a ScalarValue),
+    LargeUtf8(&'a ScalarValue),
+    Utf8View(&'a ScalarValue),
+}
+
+impl<'a> StringScalar<'a> {
+    /// Create a `StringScalar` view from an `Expr` if it is a supported string literal.
+    /// Returns `None` if the expression is not a string literal.
+    pub(crate) fn try_from_expr(expr: &'a Expr) -> Option<Self> {
+        match expr {
+            Expr::Literal(scalar, _) => Self::try_from_scalar(scalar),
+            _ => None,
+        }
+    }
+
+    /// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type.
+    /// Returns `None` if the scalar value is not a supported string type.
+    fn try_from_scalar(scalar: &'a ScalarValue) -> Option<Self> {
+        match scalar {
+            ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)),
+            ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)),
+            ScalarValue::Utf8View(_) => Some(Self::Utf8View(scalar)),
+            _ => None,
+        }
+    }
+
+    /// Returns the underlying string slice.
+    pub(crate) fn as_str(&self) -> Option<&'a str> {
+        match self {
+            Self::Utf8(scalar) | Self::LargeUtf8(scalar) | Self::Utf8View(scalar) => {
+                scalar.try_as_str().flatten()
+            }
+        }
+    }
+
+    /// Build a new `Expr` of the same string type with the given value.
+    pub(crate) fn to_expr(&self, val: &str) -> Expr {
+        match self {
+            Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None),
+            Self::LargeUtf8(_) => {
+                Expr::Literal(ScalarValue::LargeUtf8(Some(val.to_owned())), None)
+            }
+            Self::Utf8View(_) => {
+                Expr::Literal(ScalarValue::Utf8View(Some(val.to_owned())), None)
+            }
+        }
     }
 }
 
@@ -1957,17 +2213,17 @@ fn are_inlist_and_eq(left: &Expr, right: &Expr) -> bool {
 }
 
 /// Try to convert an expression to an in-list expression
-fn as_inlist(expr: &Expr) -> Option<Cow<InList>> {
+fn as_inlist(expr: &'_ Expr) -> Option<Cow<'_, InList>> {
     match expr {
         Expr::InList(inlist) => Some(Cow::Borrowed(inlist)),
         Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::Eq => {
             match (left.as_ref(), right.as_ref()) {
-                (Expr::Column(_), Expr::Literal(_)) => Some(Cow::Owned(InList {
+                (Expr::Column(_), Expr::Literal(_, _)) => Some(Cow::Owned(InList {
                     expr: left.clone(),
                     list: vec![*right.clone()],
                     negated: false,
                 })),
-                (Expr::Literal(_), Expr::Column(_)) => Some(Cow::Owned(InList {
+                (Expr::Literal(_, _), Expr::Column(_)) => Some(Cow::Owned(InList {
                     expr: right.clone(),
                     list: vec![*left.clone()],
                     negated: false,
@@ -1987,12 +2243,12 @@ fn to_inlist(expr: Expr) -> Option<InList> {
             op: Operator::Eq,
             right,
         }) => match (left.as_ref(), right.as_ref()) {
-            (Expr::Column(_), Expr::Literal(_)) => Some(InList {
+            (Expr::Column(_), Expr::Literal(_, _)) => Some(InList {
                 expr: left,
                 list: vec![*right],
                 negated: false,
             }),
-            (Expr::Literal(_), Expr::Column(_)) => Some(InList {
+            (Expr::Literal(_, _), Expr::Column(_)) => Some(InList {
                 expr: right,
                 list: vec![*left],
                 negated: false,
@@ -2052,7 +2308,7 @@ fn inlist_except(mut l1: InList, l2: &InList) -> Result<Expr> {
 }
 
 /// Returns expression testing a boolean `expr` for being exactly `true` (not `false` or NULL).
-fn is_exactly_true(expr: Expr, info: &impl SimplifyInfo) -> Result<Expr> {
+fn is_exactly_true(expr: Expr, info: &SimplifyContext) -> Result<Expr> {
     if !info.nullable(&expr)? {
         Ok(expr)
     } else {
@@ -2068,8 +2324,8 @@ fn is_exactly_true(expr: Expr, info: &impl SimplifyInfo) -> Result<Expr> {
 // A / 1 -> A
 //
 // Move this function body out of the large match branch avoid stack overflow
-fn simplify_right_is_one_case<S: SimplifyInfo>(
-    info: &S,
+fn simplify_right_is_one_case(
+    info: &SimplifyContext,
     left: Box<Expr>,
     op: &Operator,
     right: &Expr,
@@ -2090,66 +2346,18 @@ fn simplify_right_is_one_case<S: SimplifyInfo>(
     }
 }
 
-// A * null -> null
-// A / null -> null
-//
-// Move this function body out of the large match branch avoid stack overflow
-fn simplify_right_is_null_case<S: SimplifyInfo>(
-    info: &S,
-    left: &Expr,
-    op: &Operator,
-    right: Box<Expr>,
-) -> Result<Transformed<Expr>> {
-    // Check if resulting type would be different due to coercion
-    let left_type = info.get_data_type(left)?;
-    let right_type = info.get_data_type(&right)?;
-    match BinaryTypeCoercer::new(&left_type, op, &right_type).get_result_type() {
-        Ok(result_type) => {
-            // Only cast if the types differ
-            if right_type != result_type {
-                Ok(Transformed::yes(Expr::Cast(Cast::new(right, result_type))))
-            } else {
-                Ok(Transformed::yes(*right))
-            }
-        }
-        Err(_) => Ok(Transformed::yes(*right)),
-    }
-}
-
-// null / A --> null
-//
-// Move this function body out of the large match branch avoid stack overflow
-fn simplify_null_div_other_case<S: SimplifyInfo>(
-    info: &S,
-    left: Box<Expr>,
-    right: &Expr,
-) -> Result<Transformed<Expr>> {
-    // Check if resulting type would be different due to coercion
-    let left_type = info.get_data_type(&left)?;
-    let right_type = info.get_data_type(right)?;
-    match BinaryTypeCoercer::new(&left_type, &Operator::Divide, &right_type)
-        .get_result_type()
-    {
-        Ok(result_type) => {
-            // Only cast if the types differ
-            if left_type != result_type {
-                Ok(Transformed::yes(Expr::Cast(Cast::new(left, result_type))))
-            } else {
-                Ok(Transformed::yes(*left))
-            }
-        }
-        Err(_) => Ok(Transformed::yes(*left)),
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::simplify_expressions::SimplifyContext;
     use crate::test::test_table_scan_with_name;
-    use arrow::datatypes::FieldRef;
-    use datafusion_common::{assert_contains, DFSchemaRef, ToDFSchema};
+    use arrow::{
+        array::{Int32Array, StructArray},
+        datatypes::{FieldRef, Fields},
+    };
+    use datafusion_common::{DFSchemaRef, ToDFSchema, assert_contains};
     use datafusion_expr::{
+        expr::WindowFunction,
         function::{
             AccumulatorArgs, AggregateFunctionSimplification,
             WindowFunctionSimplification,
@@ -2159,6 +2367,9 @@ mod tests {
     };
     use datafusion_functions_window_common::field::WindowUDFFieldArgs;
     use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
+    use datafusion_physical_expr::PhysicalExpr;
+    use std::hash::Hash;
+    use std::sync::LazyLock;
     use std::{
         collections::HashMap,
         ops::{BitAnd, BitOr, BitXor},
@@ -2170,9 +2381,8 @@ mod tests {
     // ------------------------------
     #[test]
     fn api_basic() {
-        let props = ExecutionProps::new();
         let simplifier =
-            ExprSimplifier::new(SimplifyContext::new(&props).with_schema(test_schema()));
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
 
         let expr = lit(1) + lit(2);
         let expected = lit(3);
@@ -2182,9 +2392,8 @@ mod tests {
     #[test]
     fn basic_coercion() {
         let schema = test_schema();
-        let props = ExecutionProps::new();
         let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&props).with_schema(Arc::clone(&schema)),
+            SimplifyContext::default().with_schema(Arc::clone(&schema)),
         );
 
         // Note expr type is int32 (not int64)
@@ -2199,19 +2408,21 @@ mod tests {
     }
 
     fn test_schema() -> DFSchemaRef {
-        Schema::new(vec![
-            Field::new("i", DataType::Int64, false),
-            Field::new("b", DataType::Boolean, true),
-        ])
-        .to_dfschema_ref()
-        .unwrap()
+        static TEST_SCHEMA: LazyLock<DFSchemaRef> = LazyLock::new(|| {
+            Schema::new(vec![
+                Field::new("i", DataType::Int64, false),
+                Field::new("b", DataType::Boolean, true),
+            ])
+            .to_dfschema_ref()
+            .unwrap()
+        });
+        Arc::clone(&TEST_SCHEMA)
     }
 
     #[test]
     fn simplify_and_constant_prop() {
-        let props = ExecutionProps::new();
         let simplifier =
-            ExprSimplifier::new(SimplifyContext::new(&props).with_schema(test_schema()));
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
 
         // should be able to simplify to false
         // (i * (1 - 2)) > 0
@@ -2222,9 +2433,8 @@ mod tests {
 
     #[test]
     fn simplify_and_constant_prop_with_case() {
-        let props = ExecutionProps::new();
         let simplifier =
-            ExprSimplifier::new(SimplifyContext::new(&props).with_schema(test_schema()));
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
 
         //   CASE
         //     WHEN i>5 AND false THEN i > 5
@@ -2388,8 +2598,29 @@ mod tests {
         let expr_b = col("c2_non_null").not().and(col("c2_non_null"));
         let expected = lit(false);
 
-        assert_eq!(simplify(expr_a), expected);
-        assert_eq!(simplify(expr_b), expected);
+        assert_eq!(simplify(expr_a), expected);
+        assert_eq!(simplify(expr_b), expected);
+    }
+
+    #[test]
+    fn test_simplify_eq_and_neq_with_different_literals() {
+        // A = 1 AND A != 0 --> A = 1 (when 1 != 0)
+        let expr = col("c2").eq(lit(1)).and(col("c2").not_eq(lit(0)));
+        let expected = col("c2").eq(lit(1));
+        assert_eq!(simplify(expr), expected);
+
+        // A != 0 AND A = 1 --> A = 1 (when 1 != 0)
+        let expr = col("c2").not_eq(lit(0)).and(col("c2").eq(lit(1)));
+        let expected = col("c2").eq(lit(1));
+        assert_eq!(simplify(expr), expected);
+
+        // Should NOT simplify when literals are the same (A = 1 AND A != 1)
+        // This is a contradiction but handled by other rules
+        let expr = col("c2").eq(lit(1)).and(col("c2").not_eq(lit(1)));
+        // Should not be simplified by this rule (left unchanged or handled elsewhere)
+        let result = simplify(expr.clone());
+        // The expression should not have been simplified
+        assert_eq!(result, expr);
     }
 
     #[test]
@@ -2410,15 +2641,15 @@ mod tests {
 
     #[test]
     fn test_simplify_multiply_by_null() {
-        let null = Expr::Literal(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         // A * null --> null
         {
-            let expr = col("c2") * null.clone();
+            let expr = col("c3") * null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null * A --> null
         {
-            let expr = null.clone() * col("c2");
+            let expr = null.clone() * col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
@@ -2474,14 +2705,14 @@ mod tests {
     #[test]
     fn test_simplify_divide_null() {
         // A / null --> null
-        let null = lit(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         {
-            let expr = col("c1") / null.clone();
+            let expr = col("c3") / null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null / A --> null
         {
-            let expr = null.clone() / col("c1");
+            let expr = null.clone() / col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
@@ -2497,15 +2728,15 @@ mod tests {
 
     #[test]
     fn test_simplify_modulo_by_null() {
-        let null = lit(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         // A % null --> null
         {
-            let expr = col("c2") % null.clone();
+            let expr = col("c3") % null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null % A --> null
         {
-            let expr = null.clone() % col("c2");
+            let expr = null.clone() % col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
@@ -2551,45 +2782,45 @@ mod tests {
 
     #[test]
     fn test_simplify_bitwise_xor_by_null() {
-        let null = lit(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         // A ^ null --> null
         {
-            let expr = col("c2") ^ null.clone();
+            let expr = col("c3") ^ null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null ^ A --> null
         {
-            let expr = null.clone() ^ col("c2");
+            let expr = null.clone() ^ col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
 
     #[test]
     fn test_simplify_bitwise_shift_right_by_null() {
-        let null = lit(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         // A >> null --> null
         {
-            let expr = col("c2") >> null.clone();
+            let expr = col("c3") >> null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null >> A --> null
         {
-            let expr = null.clone() >> col("c2");
+            let expr = null.clone() >> col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
 
     #[test]
     fn test_simplify_bitwise_shift_left_by_null() {
-        let null = lit(ScalarValue::Null);
+        let null = lit(ScalarValue::Int64(None));
         // A << null --> null
         {
-            let expr = col("c2") << null.clone();
+            let expr = col("c3") << null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null << A --> null
         {
-            let expr = null.clone() << col("c2");
+            let expr = null.clone() << col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
@@ -2656,15 +2887,15 @@ mod tests {
 
     #[test]
     fn test_simplify_bitwise_and_by_null() {
-        let null = lit(ScalarValue::Null);
+        let null = Expr::Literal(ScalarValue::Int64(None), None);
         // A & null --> null
         {
-            let expr = col("c2") & null.clone();
+            let expr = col("c3") & null.clone();
             assert_eq!(simplify(expr), null);
         }
         // null & A --> null
         {
-            let expr = null.clone() & col("c2");
+            let expr = null.clone() & col("c3");
             assert_eq!(simplify(expr), null);
         }
     }
@@ -3338,18 +3569,15 @@ mod tests {
 
     fn try_simplify(expr: Expr) -> Result<Expr> {
         let schema = expr_test_schema();
-        let execution_props = ExecutionProps::new();
-        let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&execution_props).with_schema(schema),
-        );
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(schema));
         simplifier.simplify(expr)
     }
 
     fn coerce(expr: Expr) -> Expr {
         let schema = expr_test_schema();
-        let execution_props = ExecutionProps::new();
         let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&execution_props).with_schema(Arc::clone(&schema)),
+            SimplifyContext::default().with_schema(Arc::clone(&schema)),
         );
         simplifier.coerce(expr, schema.as_ref()).unwrap()
     }
@@ -3360,10 +3588,8 @@ mod tests {
 
     fn try_simplify_with_cycle_count(expr: Expr) -> Result<(Expr, u32)> {
         let schema = expr_test_schema();
-        let execution_props = ExecutionProps::new();
-        let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&execution_props).with_schema(schema),
-        );
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(schema));
         let (expr, count) = simplifier.simplify_with_cycle_count_transformed(expr)?;
         Ok((expr.data, count))
     }
@@ -3377,33 +3603,34 @@ mod tests {
         guarantees: Vec<(Expr, NullableInterval)>,
     ) -> Expr {
         let schema = expr_test_schema();
-        let execution_props = ExecutionProps::new();
-        let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&execution_props).with_schema(schema),
-        )
-        .with_guarantees(guarantees);
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(schema))
+                .with_guarantees(guarantees);
         simplifier.simplify(expr).unwrap()
     }
 
     fn expr_test_schema() -> DFSchemaRef {
-        Arc::new(
-            DFSchema::from_unqualified_fields(
-                vec![
-                    Field::new("c1", DataType::Utf8, true),
-                    Field::new("c2", DataType::Boolean, true),
-                    Field::new("c3", DataType::Int64, true),
-                    Field::new("c4", DataType::UInt32, true),
-                    Field::new("c1_non_null", DataType::Utf8, false),
-                    Field::new("c2_non_null", DataType::Boolean, false),
-                    Field::new("c3_non_null", DataType::Int64, false),
-                    Field::new("c4_non_null", DataType::UInt32, false),
-                    Field::new("c5", DataType::FixedSizeBinary(3), true),
-                ]
-                .into(),
-                HashMap::new(),
+        static EXPR_TEST_SCHEMA: LazyLock<DFSchemaRef> = LazyLock::new(|| {
+            Arc::new(
+                DFSchema::from_unqualified_fields(
+                    vec![
+                        Field::new("c1", DataType::Utf8, true),
+                        Field::new("c2", DataType::Boolean, true),
+                        Field::new("c3", DataType::Int64, true),
+                        Field::new("c4", DataType::UInt32, true),
+                        Field::new("c1_non_null", DataType::Utf8, false),
+                        Field::new("c2_non_null", DataType::Boolean, false),
+                        Field::new("c3_non_null", DataType::Int64, false),
+                        Field::new("c4_non_null", DataType::UInt32, false),
+                        Field::new("c5", DataType::FixedSizeBinary(3), true),
+                    ]
+                    .into(),
+                    HashMap::new(),
+                )
+                .unwrap(),
             )
-            .unwrap(),
-        )
+        });
+        Arc::clone(&EXPR_TEST_SCHEMA)
     }
 
     #[test]
@@ -3548,6 +3775,142 @@ mod tests {
         );
     }
 
+    #[test]
+    fn simplify_literal_case_equality() {
+        // CASE WHEN c2 != false THEN "ok" ELSE "not_ok"
+        let simple_case = Expr::Case(Case::new(
+            None,
+            vec![(
+                Box::new(col("c2_non_null").not_eq(lit(false))),
+                Box::new(lit("ok")),
+            )],
+            Some(Box::new(lit("not_ok"))),
+        ));
+
+        // CASE WHEN c2 != false THEN "ok" ELSE "not_ok" == "ok"
+        // -->
+        // CASE WHEN c2 != false THEN "ok" == "ok" ELSE "not_ok" == "ok"
+        // -->
+        // CASE WHEN c2 != false THEN true ELSE false
+        // -->
+        // c2
+        assert_eq!(
+            simplify(binary_expr(simple_case.clone(), Operator::Eq, lit("ok"),)),
+            col("c2_non_null"),
+        );
+
+        // CASE WHEN c2 != false THEN "ok" ELSE "not_ok" != "ok"
+        // -->
+        // NOT(CASE WHEN c2 != false THEN "ok" == "ok" ELSE "not_ok" == "ok")
+        // -->
+        // NOT(CASE WHEN c2 != false THEN true ELSE false)
+        // -->
+        // NOT(c2)
+        assert_eq!(
+            simplify(binary_expr(simple_case, Operator::NotEq, lit("ok"),)),
+            not(col("c2_non_null")),
+        );
+
+        let complex_case = Expr::Case(Case::new(
+            None,
+            vec![
+                (
+                    Box::new(col("c1").eq(lit("inboxed"))),
+                    Box::new(lit("pending")),
+                ),
+                (
+                    Box::new(col("c1").eq(lit("scheduled"))),
+                    Box::new(lit("pending")),
+                ),
+                (
+                    Box::new(col("c1").eq(lit("completed"))),
+                    Box::new(lit("completed")),
+                ),
+                (
+                    Box::new(col("c1").eq(lit("paused"))),
+                    Box::new(lit("paused")),
+                ),
+                (Box::new(col("c2")), Box::new(lit("running"))),
+                (
+                    Box::new(col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0)))),
+                    Box::new(lit("backing-off")),
+                ),
+            ],
+            Some(Box::new(lit("ready"))),
+        ));
+
+        assert_eq!(
+            simplify(binary_expr(
+                complex_case.clone(),
+                Operator::Eq,
+                lit("completed"),
+            )),
+            not_distinct_from(col("c1").eq(lit("completed")), lit(true)).and(
+                distinct_from(col("c1").eq(lit("inboxed")), lit(true))
+                    .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true)))
+            )
+        );
+
+        assert_eq!(
+            simplify(binary_expr(
+                complex_case.clone(),
+                Operator::NotEq,
+                lit("completed"),
+            )),
+            distinct_from(col("c1").eq(lit("completed")), lit(true))
+                .or(not_distinct_from(col("c1").eq(lit("inboxed")), lit(true))
+                    .or(not_distinct_from(col("c1").eq(lit("scheduled")), lit(true))))
+        );
+
+        assert_eq!(
+            simplify(binary_expr(
+                complex_case.clone(),
+                Operator::Eq,
+                lit("running"),
+            )),
+            not_distinct_from(col("c2"), lit(true)).and(
+                distinct_from(col("c1").eq(lit("inboxed")), lit(true))
+                    .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true)))
+                    .and(distinct_from(col("c1").eq(lit("completed")), lit(true)))
+                    .and(distinct_from(col("c1").eq(lit("paused")), lit(true)))
+            )
+        );
+
+        assert_eq!(
+            simplify(binary_expr(
+                complex_case.clone(),
+                Operator::Eq,
+                lit("ready"),
+            )),
+            distinct_from(col("c1").eq(lit("inboxed")), lit(true))
+                .and(distinct_from(col("c1").eq(lit("scheduled")), lit(true)))
+                .and(distinct_from(col("c1").eq(lit("completed")), lit(true)))
+                .and(distinct_from(col("c1").eq(lit("paused")), lit(true)))
+                .and(distinct_from(col("c2"), lit(true)))
+                .and(distinct_from(
+                    col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0))),
+                    lit(true)
+                ))
+        );
+
+        assert_eq!(
+            simplify(binary_expr(
+                complex_case.clone(),
+                Operator::NotEq,
+                lit("ready"),
+            )),
+            not_distinct_from(col("c1").eq(lit("inboxed")), lit(true))
+                .or(not_distinct_from(col("c1").eq(lit("scheduled")), lit(true)))
+                .or(not_distinct_from(col("c1").eq(lit("completed")), lit(true)))
+                .or(not_distinct_from(col("c1").eq(lit("paused")), lit(true)))
+                .or(not_distinct_from(col("c2"), lit(true)))
+                .or(not_distinct_from(
+                    col("c1").eq(lit("invoked")).and(col("c3").gt(lit(0))),
+                    lit(true)
+                ))
+        );
+    }
+
     #[test]
     fn simplify_expr_case_when_then_else() {
         // CASE WHEN c2 != false THEN "ok" == "not_ok" ELSE c2 == true
@@ -3667,6 +4030,200 @@ mod tests {
         );
     }
 
+    #[test]
+    fn simplify_expr_case_when_first_true() {
+        // CASE WHEN true THEN 1 ELSE c1 END --> 1
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![(Box::new(lit(true)), Box::new(lit(1)),)],
+                Some(Box::new(col("c1"))),
+            ))),
+            lit(1)
+        );
+
+        // CASE WHEN true THEN col('a') ELSE col('b') END --> col('a')
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![(Box::new(lit(true)), Box::new(lit("a")),)],
+                Some(Box::new(lit("b"))),
+            ))),
+            lit("a")
+        );
+
+        // CASE WHEN true THEN col('a') WHEN col('x') > 5 THEN col('b') ELSE col('c') END --> col('a')
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(lit(true)), Box::new(lit("a"))),
+                    (Box::new(lit("x").gt(lit(5))), Box::new(lit("b"))),
+                ],
+                Some(Box::new(lit("c"))),
+            ))),
+            lit("a")
+        );
+
+        // CASE WHEN true THEN col('a') END --> col('a') (no else clause)
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![(Box::new(lit(true)), Box::new(lit("a")),)],
+                None,
+            ))),
+            lit("a")
+        );
+
+        // Negative test: CASE WHEN c2 THEN 1 ELSE 2 END should not be simplified
+        let expr = Expr::Case(Case::new(
+            None,
+            vec![(Box::new(col("c2")), Box::new(lit(1)))],
+            Some(Box::new(lit(2))),
+        ));
+        assert_eq!(simplify(expr.clone()), expr);
+
+        // Negative test: CASE WHEN false THEN 1 ELSE 2 END should not use this rule
+        let expr = Expr::Case(Case::new(
+            None,
+            vec![(Box::new(lit(false)), Box::new(lit(1)))],
+            Some(Box::new(lit(2))),
+        ));
+        assert_ne!(simplify(expr), lit(1));
+
+        // Negative test: CASE WHEN col('c1') > 5 THEN 1 ELSE 2 END should not be simplified
+        let expr = Expr::Case(Case::new(
+            None,
+            vec![(Box::new(col("c1").gt(lit(5))), Box::new(lit(1)))],
+            Some(Box::new(lit(2))),
+        ));
+        assert_eq!(simplify(expr.clone()), expr);
+    }
+
+    #[test]
+    fn simplify_expr_case_when_any_true() {
+        // CASE WHEN c3 > 0 THEN 'a' WHEN true THEN 'b' ELSE 'c' END --> CASE WHEN c3 > 0 THEN 'a' ELSE 'b' END
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").gt(lit(0))), Box::new(lit("a"))),
+                    (Box::new(lit(true)), Box::new(lit("b"))),
+                ],
+                Some(Box::new(lit("c"))),
+            ))),
+            Expr::Case(Case::new(
+                None,
+                vec![(Box::new(col("c3").gt(lit(0))), Box::new(lit("a")))],
+                Some(Box::new(lit("b"))),
+            ))
+        );
+
+        // CASE WHEN c3 > 0 THEN 'a' WHEN c4 < 0 THEN 'b' WHEN true THEN 'c' WHEN c3 = 0 THEN 'd' ELSE 'e' END
+        // --> CASE WHEN c3 > 0 THEN 'a' WHEN c4 < 0 THEN 'b' ELSE 'c' END
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").gt(lit(0))), Box::new(lit("a"))),
+                    (Box::new(col("c4").lt(lit(0))), Box::new(lit("b"))),
+                    (Box::new(lit(true)), Box::new(lit("c"))),
+                    (Box::new(col("c3").eq(lit(0))), Box::new(lit("d"))),
+                ],
+                Some(Box::new(lit("e"))),
+            ))),
+            Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").gt(lit(0))), Box::new(lit("a"))),
+                    (Box::new(col("c4").lt(lit(0))), Box::new(lit("b"))),
+                ],
+                Some(Box::new(lit("c"))),
+            ))
+        );
+
+        // CASE WHEN c3 > 0 THEN 1 WHEN c4 < 0 THEN 2 WHEN true THEN 3 END (no else)
+        // --> CASE WHEN c3 > 0 THEN 1 WHEN c4 < 0 THEN 2 ELSE 3 END
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").gt(lit(0))), Box::new(lit(1))),
+                    (Box::new(col("c4").lt(lit(0))), Box::new(lit(2))),
+                    (Box::new(lit(true)), Box::new(lit(3))),
+                ],
+                None,
+            ))),
+            Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").gt(lit(0))), Box::new(lit(1))),
+                    (Box::new(col("c4").lt(lit(0))), Box::new(lit(2))),
+                ],
+                Some(Box::new(lit(3))),
+            ))
+        );
+
+        // Negative test: CASE WHEN c3 > 0 THEN c3 WHEN c4 < 0 THEN 2 ELSE 3 END should not be simplified
+        let expr = Expr::Case(Case::new(
+            None,
+            vec![
+                (Box::new(col("c3").gt(lit(0))), Box::new(col("c3"))),
+                (Box::new(col("c4").lt(lit(0))), Box::new(lit(2))),
+            ],
+            Some(Box::new(lit(3))),
+        ));
+        assert_eq!(simplify(expr.clone()), expr);
+    }
+
+    #[test]
+    fn simplify_expr_case_when_any_false() {
+        // CASE WHEN false THEN 'a' END --> NULL
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![(Box::new(lit(false)), Box::new(lit("a")))],
+                None,
+            ))),
+            Expr::Literal(ScalarValue::Utf8(None), None)
+        );
+
+        // CASE WHEN false THEN 2 ELSE 1 END --> 1
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![(Box::new(lit(false)), Box::new(lit(2)))],
+                Some(Box::new(lit(1))),
+            ))),
+            lit(1),
+        );
+
+        // CASE WHEN c3 < 10 THEN 'b' WHEN false then c3 ELSE c4 END --> CASE WHEN c3 < 10 THEN b ELSE c4 END
+        assert_eq!(
+            simplify(Expr::Case(Case::new(
+                None,
+                vec![
+                    (Box::new(col("c3").lt(lit(10))), Box::new(lit("b"))),
+                    (Box::new(lit(false)), Box::new(col("c3"))),
+                ],
+                Some(Box::new(col("c4"))),
+            ))),
+            Expr::Case(Case::new(
+                None,
+                vec![(Box::new(col("c3").lt(lit(10))), Box::new(lit("b")))],
+                Some(Box::new(col("c4"))),
+            ))
+        );
+
+        // Negative test: CASE WHEN c3 = 4 THEN 1 ELSE 2 END should not be simplified
+        let expr = Expr::Case(Case::new(
+            None,
+            vec![(Box::new(col("c3").eq(lit(4))), Box::new(lit(1)))],
+            Some(Box::new(lit(2))),
+        ));
+        assert_eq!(simplify(expr.clone()), expr);
+    }
+
     fn distinct_from(left: impl Into<Expr>, right: impl Into<Expr>) -> Expr {
         Expr::BinaryExpr(BinaryExpr {
             left: Box::new(left.into()),
@@ -3926,6 +4483,55 @@ mod tests {
         assert_eq!(simplify(expr.clone()), expr);
     }
 
+    #[test]
+    fn simplify_null_in_empty_inlist() {
+        // `NULL::boolean IN ()` == `NULL::boolean IN (SELECT foo FROM empty)` == false
+        let expr = in_list(lit_bool_null(), vec![], false);
+        assert_eq!(simplify(expr), lit(false));
+
+        // `NULL::boolean NOT IN ()` == `NULL::boolean NOT IN (SELECT foo FROM empty)` == true
+        let expr = in_list(lit_bool_null(), vec![], true);
+        assert_eq!(simplify(expr), lit(true));
+
+        // `NULL IN ()` == `NULL IN (SELECT foo FROM empty)` == false
+        let null_null = || Expr::Literal(ScalarValue::Null, None);
+        let expr = in_list(null_null(), vec![], false);
+        assert_eq!(simplify(expr), lit(false));
+
+        // `NULL NOT IN ()` == `NULL NOT IN (SELECT foo FROM empty)` == true
+        let expr = in_list(null_null(), vec![], true);
+        assert_eq!(simplify(expr), lit(true));
+    }
+
+    #[test]
+    fn just_simplifier_simplify_null_in_empty_inlist() {
+        let simplify = |expr: Expr| -> Expr {
+            let schema = expr_test_schema();
+            let info = SimplifyContext::default().with_schema(schema);
+            let simplifier = &mut Simplifier::new(&info);
+            expr.rewrite(simplifier)
+                .expect("Failed to simplify expression")
+                .data
+        };
+
+        // `NULL::boolean IN ()` == `NULL::boolean IN (SELECT foo FROM empty)` == false
+        let expr = in_list(lit_bool_null(), vec![], false);
+        assert_eq!(simplify(expr), lit(false));
+
+        // `NULL::boolean NOT IN ()` == `NULL::boolean NOT IN (SELECT foo FROM empty)` == true
+        let expr = in_list(lit_bool_null(), vec![], true);
+        assert_eq!(simplify(expr), lit(true));
+
+        // `NULL IN ()` == `NULL IN (SELECT foo FROM empty)` == false
+        let null_null = || Expr::Literal(ScalarValue::Null, None);
+        let expr = in_list(null_null(), vec![], false);
+        assert_eq!(simplify(expr), lit(false));
+
+        // `NULL NOT IN ()` == `NULL NOT IN (SELECT foo FROM empty)` == true
+        let expr = in_list(null_null(), vec![], true);
+        assert_eq!(simplify(expr), lit(true));
+    }
+
     #[test]
     fn simplify_large_or() {
         let expr = (0..5)
@@ -4252,22 +4858,24 @@ mod tests {
     }
 
     fn boolean_test_schema() -> DFSchemaRef {
-        Schema::new(vec![
-            Field::new("A", DataType::Boolean, false),
-            Field::new("B", DataType::Boolean, false),
-            Field::new("C", DataType::Boolean, false),
-            Field::new("D", DataType::Boolean, false),
-        ])
-        .to_dfschema_ref()
-        .unwrap()
+        static BOOLEAN_TEST_SCHEMA: LazyLock<DFSchemaRef> = LazyLock::new(|| {
+            Schema::new(vec![
+                Field::new("A", DataType::Boolean, false),
+                Field::new("B", DataType::Boolean, false),
+                Field::new("C", DataType::Boolean, false),
+                Field::new("D", DataType::Boolean, false),
+            ])
+            .to_dfschema_ref()
+            .unwrap()
+        });
+        Arc::clone(&BOOLEAN_TEST_SCHEMA)
     }
 
     #[test]
     fn simplify_common_factor_conjunction_in_disjunction() {
-        let props = ExecutionProps::new();
         let schema = boolean_test_schema();
         let simplifier =
-            ExprSimplifier::new(SimplifyContext::new(&props).with_schema(schema));
+            ExprSimplifier::new(SimplifyContext::default().with_schema(schema));
 
         let a = || col("A");
         let b = || col("B");
@@ -4300,7 +4908,7 @@ mod tests {
                 vec![],
                 false,
                 None,
-                None,
+                vec![],
                 None,
             ));
 
@@ -4314,7 +4922,7 @@ mod tests {
                 vec![],
                 false,
                 None,
-                None,
+                vec![],
                 None,
             ));
 
@@ -4324,7 +4932,7 @@ mod tests {
 
     /// A Mock UDAF which defines `simplify` to be used in tests
     /// related to UDAF simplification
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct SimplifyMockUdaf {
         simplify: bool,
     }
@@ -4389,8 +4997,7 @@ mod tests {
         let udwf = WindowFunctionDefinition::WindowUDF(
             WindowUDF::new_from_impl(SimplifyMockUdwf::new_with_simplify()).into(),
         );
-        let window_function_expr =
-            Expr::WindowFunction(WindowFunction::new(udwf, vec![]));
+        let window_function_expr = Expr::from(WindowFunction::new(udwf, vec![]));
 
         let expected = col("result_column");
         assert_eq!(simplify(window_function_expr), expected);
@@ -4398,8 +5005,7 @@ mod tests {
         let udwf = WindowFunctionDefinition::WindowUDF(
             WindowUDF::new_from_impl(SimplifyMockUdwf::new_without_simplify()).into(),
         );
-        let window_function_expr =
-            Expr::WindowFunction(WindowFunction::new(udwf, vec![]));
+        let window_function_expr = Expr::from(WindowFunction::new(udwf, vec![]));
 
         let expected = window_function_expr.clone();
         assert_eq!(simplify(window_function_expr), expected);
@@ -4407,7 +5013,7 @@ mod tests {
 
     /// A Mock UDWF which defines `simplify` to be used in tests
     /// related to UDWF simplification
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct SimplifyMockUdwf {
         simplify: bool,
     }
@@ -4454,8 +5060,12 @@ mod tests {
         fn field(&self, _field_args: WindowUDFFieldArgs) -> Result<FieldRef> {
             unimplemented!("not needed for tests")
         }
+
+        fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+            LimitEffect::Unknown
+        }
     }
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct VolatileUdf {
         signature: Signature,
     }
@@ -4547,13 +5157,62 @@ mod tests {
         // The simplifier removes the cast.
         assert_eq!(
             simplify(coerced),
-            col("c5").eq(Expr::Literal(ScalarValue::FixedSizeBinary(
-                3,
-                Some(bytes.to_vec()),
-            )))
+            col("c5").eq(Expr::Literal(
+                ScalarValue::FixedSizeBinary(3, Some(bytes.to_vec()),),
+                None
+            ))
         );
     }
 
+    #[test]
+    fn simplify_cast_literal() {
+        // Test that CAST(literal) expressions are evaluated at plan time
+
+        // CAST(123 AS Int64) should become 123i64
+        let expr = Expr::Cast(Cast::new(Box::new(lit(123i32)), DataType::Int64));
+        let expected = lit(123i64);
+        assert_eq!(simplify(expr), expected);
+
+        // CAST(1761630189642 AS Timestamp(Nanosecond, Some("+00:00")))
+        // Integer to timestamp cast
+        let expr = Expr::Cast(Cast::new(
+            Box::new(lit(1761630189642i64)),
+            DataType::Timestamp(
+                arrow::datatypes::TimeUnit::Nanosecond,
+                Some("+00:00".into()),
+            ),
+        ));
+        // Should evaluate to a timestamp literal
+        let result = simplify(expr);
+        match result {
+            Expr::Literal(ScalarValue::TimestampNanosecond(Some(val), tz), _) => {
+                assert_eq!(val, 1761630189642i64);
+                assert_eq!(tz.as_deref(), Some("+00:00"));
+            }
+            other => panic!("Expected TimestampNanosecond literal, got: {other:?}"),
+        }
+
+        // Test CAST of invalid string to timestamp - should return an error at plan time
+        // This represents the case from the issue: CAST(Utf8("1761630189642") AS Timestamp)
+        // "1761630189642" is NOT a valid timestamp string format
+        let expr = Expr::Cast(Cast::new(
+            Box::new(lit("1761630189642")),
+            DataType::Timestamp(
+                arrow::datatypes::TimeUnit::Nanosecond,
+                Some("+00:00".into()),
+            ),
+        ));
+
+        // The simplification should now fail with an error at plan time
+        let schema = test_schema();
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(schema));
+        let result = simplifier.simplify(expr);
+        assert!(result.is_err(), "Expected error for invalid cast");
+        let err_msg = result.unwrap_err().to_string();
+        assert_contains!(err_msg, "Error parsing timestamp");
+    }
+
     fn if_not_null(expr: Expr, then: bool) -> Expr {
         Expr::Case(Case {
             expr: Some(expr.is_not_null().into()),
@@ -4561,4 +5220,156 @@ mod tests {
             else_expr: None,
         })
     }
+
+    // --------------------------------
+    // --- Struct Cast Tests -----
+    // --------------------------------
+
+    /// Helper to create a `Struct` literal cast expression from `source_fields` and `target_fields`.
+    fn make_struct_cast_expr(source_fields: Fields, target_fields: Fields) -> Expr {
+        // Create 1-row struct array (not 0-row) so it can be evaluated by simplifier
+        let arrays: Vec<Arc<dyn Array>> = vec![
+            Arc::new(Int32Array::from(vec![Some(1)])),
+            Arc::new(Int32Array::from(vec![Some(2)])),
+        ];
+        let struct_array = StructArray::try_new(source_fields, arrays, None).unwrap();
+
+        Expr::Cast(Cast::new(
+            Box::new(Expr::Literal(
+                ScalarValue::Struct(Arc::new(struct_array)),
+                None,
+            )),
+            DataType::Struct(target_fields),
+        ))
+    }
+
+    #[test]
+    fn test_struct_cast_different_field_counts_not_foldable() {
+        // Test that struct casts with different field counts are NOT marked as foldable
+        // When field counts differ, const-folding should not be attempted
+
+        let source_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        let target_fields = Fields::from(vec![
+            Arc::new(Field::new("x", DataType::Int32, true)),
+            Arc::new(Field::new("y", DataType::Int32, true)),
+            Arc::new(Field::new("z", DataType::Int32, true)),
+        ]);
+
+        let expr = make_struct_cast_expr(source_fields, target_fields);
+
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
+
+        // The cast should remain unchanged since field counts differ
+        let result = simplifier.simplify(expr.clone()).unwrap();
+        // Ensure const-folding was not attempted (the expression remains exactly the same)
+        assert_eq!(
+            result, expr,
+            "Struct cast with different field counts should remain unchanged (no const-folding)"
+        );
+    }
+
+    #[test]
+    fn test_struct_cast_same_field_count_foldable() {
+        // Test that struct casts with same field counts can be considered for const-folding
+
+        let source_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        let target_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        let expr = make_struct_cast_expr(source_fields, target_fields);
+
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
+
+        // The cast should be simplified
+        let result = simplifier.simplify(expr.clone()).unwrap();
+        // Struct casts with same field count should be const-folded to a literal
+        assert!(matches!(result, Expr::Literal(_, _)));
+        // Ensure the simplifier made a change (not identical to original)
+        assert_ne!(
+            result, expr,
+            "Struct cast with same field count should be simplified (not identical to input)"
+        );
+    }
+
+    #[test]
+    fn test_struct_cast_different_names_same_count() {
+        // Test struct cast with same field count but different names
+        // Field count matches; simplification should be skipped because names do not overlap
+
+        let source_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        let target_fields = Fields::from(vec![
+            Arc::new(Field::new("x", DataType::Int32, true)),
+            Arc::new(Field::new("y", DataType::Int32, true)),
+        ]);
+
+        let expr = make_struct_cast_expr(source_fields, target_fields);
+
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
+
+        // The cast should remain unchanged because there is no name overlap
+        let result = simplifier.simplify(expr.clone()).unwrap();
+        assert_eq!(
+            result, expr,
+            "Struct cast with different names but same field count should not be simplified"
+        );
+    }
+
+    #[test]
+    fn test_struct_cast_empty_array_not_foldable() {
+        // Test that struct casts with 0-row (empty) struct arrays are NOT const-folded
+        // The simplifier uses a 1-row input batch, which causes dimension mismatches
+        // when evaluating 0-row struct literals
+
+        let source_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        let target_fields = Fields::from(vec![
+            Arc::new(Field::new("a", DataType::Int32, true)),
+            Arc::new(Field::new("b", DataType::Int32, true)),
+        ]);
+
+        // Create a 0-row (empty) struct array
+        let arrays: Vec<Arc<dyn Array>> = vec![
+            Arc::new(Int32Array::new(vec![].into(), None)),
+            Arc::new(Int32Array::new(vec![].into(), None)),
+        ];
+        let struct_array = StructArray::try_new(source_fields, arrays, None).unwrap();
+
+        let expr = Expr::Cast(Cast::new(
+            Box::new(Expr::Literal(
+                ScalarValue::Struct(Arc::new(struct_array)),
+                None,
+            )),
+            DataType::Struct(target_fields),
+        ));
+
+        let simplifier =
+            ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
+
+        // The cast should remain unchanged since the struct array is empty (0-row)
+        let result = simplifier.simplify(expr.clone()).unwrap();
+        assert_eq!(
+            result, expr,
+            "Struct cast with empty (0-row) array should remain unchanged"
+        );
+    }
 }
diff --git a/datafusion/optimizer/src/simplify_expressions/guarantees.rs b/datafusion/optimizer/src/simplify_expressions/guarantees.rs
deleted file mode 100644
index 2c11632ad6d26..0000000000000
--- a/datafusion/optimizer/src/simplify_expressions/guarantees.rs
+++ /dev/null
@@ -1,476 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Simplifier implementation for [`ExprSimplifier::with_guarantees()`]
-//!
-//! [`ExprSimplifier::with_guarantees()`]: crate::simplify_expressions::expr_simplifier::ExprSimplifier::with_guarantees
-
-use std::{borrow::Cow, collections::HashMap};
-
-use datafusion_common::tree_node::{Transformed, TreeNodeRewriter};
-use datafusion_common::{DataFusionError, Result};
-use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
-use datafusion_expr::{expr::InList, lit, Between, BinaryExpr, Expr};
-
-/// Rewrite expressions to incorporate guarantees.
-///
-/// Guarantees are a mapping from an expression (which currently is always a
-/// column reference) to a [NullableInterval]. The interval represents the known
-/// possible values of the column. Using these known values, expressions are
-/// rewritten so they can be simplified using `ConstEvaluator` and `Simplifier`.
-///
-/// For example, if we know that a column is not null and has values in the
-/// range [1, 10), we can rewrite `x IS NULL` to `false` or `x < 10` to `true`.
-///
-/// See a full example in [`ExprSimplifier::with_guarantees()`].
-///
-/// [`ExprSimplifier::with_guarantees()`]: crate::simplify_expressions::expr_simplifier::ExprSimplifier::with_guarantees
-pub struct GuaranteeRewriter<'a> {
-    guarantees: HashMap<&'a Expr, &'a NullableInterval>,
-}
-
-impl<'a> GuaranteeRewriter<'a> {
-    pub fn new(
-        guarantees: impl IntoIterator<Item = &'a (Expr, NullableInterval)>,
-    ) -> Self {
-        Self {
-            // TODO: Clippy wants the "map" call removed, but doing so generates
-            //       a compilation error. Remove the clippy directive once this
-            //       issue is fixed.
-            #[allow(clippy::map_identity)]
-            guarantees: guarantees.into_iter().map(|(k, v)| (k, v)).collect(),
-        }
-    }
-}
-
-impl TreeNodeRewriter for GuaranteeRewriter<'_> {
-    type Node = Expr;
-
-    fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
-        if self.guarantees.is_empty() {
-            return Ok(Transformed::no(expr));
-        }
-
-        match &expr {
-            Expr::IsNull(inner) => match self.guarantees.get(inner.as_ref()) {
-                Some(NullableInterval::Null { .. }) => Ok(Transformed::yes(lit(true))),
-                Some(NullableInterval::NotNull { .. }) => {
-                    Ok(Transformed::yes(lit(false)))
-                }
-                _ => Ok(Transformed::no(expr)),
-            },
-            Expr::IsNotNull(inner) => match self.guarantees.get(inner.as_ref()) {
-                Some(NullableInterval::Null { .. }) => Ok(Transformed::yes(lit(false))),
-                Some(NullableInterval::NotNull { .. }) => Ok(Transformed::yes(lit(true))),
-                _ => Ok(Transformed::no(expr)),
-            },
-            Expr::Between(Between {
-                expr: inner,
-                negated,
-                low,
-                high,
-            }) => {
-                if let (Some(interval), Expr::Literal(low), Expr::Literal(high)) = (
-                    self.guarantees.get(inner.as_ref()),
-                    low.as_ref(),
-                    high.as_ref(),
-                ) {
-                    let expr_interval = NullableInterval::NotNull {
-                        values: Interval::try_new(low.clone(), high.clone())?,
-                    };
-
-                    let contains = expr_interval.contains(*interval)?;
-
-                    if contains.is_certainly_true() {
-                        Ok(Transformed::yes(lit(!negated)))
-                    } else if contains.is_certainly_false() {
-                        Ok(Transformed::yes(lit(*negated)))
-                    } else {
-                        Ok(Transformed::no(expr))
-                    }
-                } else {
-                    Ok(Transformed::no(expr))
-                }
-            }
-
-            Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
-                // The left or right side of expression might either have a guarantee
-                // or be a literal. Either way, we can resolve them to a NullableInterval.
-                let left_interval = self
-                    .guarantees
-                    .get(left.as_ref())
-                    .map(|interval| Cow::Borrowed(*interval))
-                    .or_else(|| {
-                        if let Expr::Literal(value) = left.as_ref() {
-                            Some(Cow::Owned(value.clone().into()))
-                        } else {
-                            None
-                        }
-                    });
-                let right_interval = self
-                    .guarantees
-                    .get(right.as_ref())
-                    .map(|interval| Cow::Borrowed(*interval))
-                    .or_else(|| {
-                        if let Expr::Literal(value) = right.as_ref() {
-                            Some(Cow::Owned(value.clone().into()))
-                        } else {
-                            None
-                        }
-                    });
-
-                match (left_interval, right_interval) {
-                    (Some(left_interval), Some(right_interval)) => {
-                        let result =
-                            left_interval.apply_operator(op, right_interval.as_ref())?;
-                        if result.is_certainly_true() {
-                            Ok(Transformed::yes(lit(true)))
-                        } else if result.is_certainly_false() {
-                            Ok(Transformed::yes(lit(false)))
-                        } else {
-                            Ok(Transformed::no(expr))
-                        }
-                    }
-                    _ => Ok(Transformed::no(expr)),
-                }
-            }
-
-            // Columns (if interval is collapsed to a single value)
-            Expr::Column(_) => {
-                if let Some(interval) = self.guarantees.get(&expr) {
-                    Ok(Transformed::yes(interval.single_value().map_or(expr, lit)))
-                } else {
-                    Ok(Transformed::no(expr))
-                }
-            }
-
-            Expr::InList(InList {
-                expr: inner,
-                list,
-                negated,
-            }) => {
-                if let Some(interval) = self.guarantees.get(inner.as_ref()) {
-                    // Can remove items from the list that don't match the guarantee
-                    let new_list: Vec<Expr> = list
-                        .iter()
-                        .filter_map(|expr| {
-                            if let Expr::Literal(item) = expr {
-                                match interval
-                                    .contains(NullableInterval::from(item.clone()))
-                                {
-                                    // If we know for certain the value isn't in the column's interval,
-                                    // we can skip checking it.
-                                    Ok(interval) if interval.is_certainly_false() => None,
-                                    Ok(_) => Some(Ok(expr.clone())),
-                                    Err(e) => Some(Err(e)),
-                                }
-                            } else {
-                                Some(Ok(expr.clone()))
-                            }
-                        })
-                        .collect::<Result<_, DataFusionError>>()?;
-
-                    Ok(Transformed::yes(Expr::InList(InList {
-                        expr: inner.clone(),
-                        list: new_list,
-                        negated: *negated,
-                    })))
-                } else {
-                    Ok(Transformed::no(expr))
-                }
-            }
-
-            _ => Ok(Transformed::no(expr)),
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use arrow::datatypes::DataType;
-    use datafusion_common::tree_node::{TransformedResult, TreeNode};
-    use datafusion_common::ScalarValue;
-    use datafusion_expr::{col, Operator};
-
-    #[test]
-    fn test_null_handling() {
-        // IsNull / IsNotNull can be rewritten to true / false
-        let guarantees = vec![
-            // Note: AlwaysNull case handled by test_column_single_value test,
-            // since it's a special case of a column with a single value.
-            (
-                col("x"),
-                NullableInterval::NotNull {
-                    values: Interval::make_unbounded(&DataType::Boolean).unwrap(),
-                },
-            ),
-        ];
-        let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
-        // x IS NULL => guaranteed false
-        let expr = col("x").is_null();
-        let output = expr.rewrite(&mut rewriter).data().unwrap();
-        assert_eq!(output, lit(false));
-
-        // x IS NOT NULL => guaranteed true
-        let expr = col("x").is_not_null();
-        let output = expr.rewrite(&mut rewriter).data().unwrap();
-        assert_eq!(output, lit(true));
-    }
-
-    fn validate_simplified_cases<T>(rewriter: &mut GuaranteeRewriter, cases: &[(Expr, T)])
-    where
-        ScalarValue: From<T>,
-        T: Clone,
-    {
-        for (expr, expected_value) in cases {
-            let output = expr.clone().rewrite(rewriter).data().unwrap();
-            let expected = lit(ScalarValue::from(expected_value.clone()));
-            assert_eq!(
-                output, expected,
-                "{expr} simplified to {output}, but expected {expected}"
-            );
-        }
-    }
-
-    fn validate_unchanged_cases(rewriter: &mut GuaranteeRewriter, cases: &[Expr]) {
-        for expr in cases {
-            let output = expr.clone().rewrite(rewriter).data().unwrap();
-            assert_eq!(
-                &output, expr,
-                "{expr} was simplified to {output}, but expected it to be unchanged"
-            );
-        }
-    }
-
-    #[test]
-    fn test_inequalities_non_null_unbounded() {
-        let guarantees = vec![
-            // y ∈ [2021-01-01, ∞) (not null)
-            (
-                col("x"),
-                NullableInterval::NotNull {
-                    values: Interval::try_new(
-                        ScalarValue::Date32(Some(18628)),
-                        ScalarValue::Date32(None),
-                    )
-                    .unwrap(),
-                },
-            ),
-        ];
-        let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
-        // (original_expr, expected_simplification)
-        let simplified_cases = &[
-            (col("x").lt(lit(ScalarValue::Date32(Some(18628)))), false),
-            (col("x").lt_eq(lit(ScalarValue::Date32(Some(17000)))), false),
-            (col("x").gt(lit(ScalarValue::Date32(Some(18627)))), true),
-            (col("x").gt_eq(lit(ScalarValue::Date32(Some(18628)))), true),
-            (col("x").eq(lit(ScalarValue::Date32(Some(17000)))), false),
-            (col("x").not_eq(lit(ScalarValue::Date32(Some(17000)))), true),
-            (
-                col("x").between(
-                    lit(ScalarValue::Date32(Some(16000))),
-                    lit(ScalarValue::Date32(Some(17000))),
-                ),
-                false,
-            ),
-            (
-                col("x").not_between(
-                    lit(ScalarValue::Date32(Some(16000))),
-                    lit(ScalarValue::Date32(Some(17000))),
-                ),
-                true,
-            ),
-            (
-                Expr::BinaryExpr(BinaryExpr {
-                    left: Box::new(col("x")),
-                    op: Operator::IsDistinctFrom,
-                    right: Box::new(lit(ScalarValue::Null)),
-                }),
-                true,
-            ),
-            (
-                Expr::BinaryExpr(BinaryExpr {
-                    left: Box::new(col("x")),
-                    op: Operator::IsDistinctFrom,
-                    right: Box::new(lit(ScalarValue::Date32(Some(17000)))),
-                }),
-                true,
-            ),
-        ];
-
-        validate_simplified_cases(&mut rewriter, simplified_cases);
-
-        let unchanged_cases = &[
-            col("x").lt(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").lt_eq(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").gt(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").gt_eq(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").eq(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").not_eq(lit(ScalarValue::Date32(Some(19000)))),
-            col("x").between(
-                lit(ScalarValue::Date32(Some(18000))),
-                lit(ScalarValue::Date32(Some(19000))),
-            ),
-            col("x").not_between(
-                lit(ScalarValue::Date32(Some(18000))),
-                lit(ScalarValue::Date32(Some(19000))),
-            ),
-        ];
-
-        validate_unchanged_cases(&mut rewriter, unchanged_cases);
-    }
-
-    #[test]
-    fn test_inequalities_maybe_null() {
-        let guarantees = vec![
-            // x ∈ ("abc", "def"]? (maybe null)
-            (
-                col("x"),
-                NullableInterval::MaybeNull {
-                    values: Interval::try_new(
-                        ScalarValue::from("abc"),
-                        ScalarValue::from("def"),
-                    )
-                    .unwrap(),
-                },
-            ),
-        ];
-        let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
-        // (original_expr, expected_simplification)
-        let simplified_cases = &[
-            (
-                Expr::BinaryExpr(BinaryExpr {
-                    left: Box::new(col("x")),
-                    op: Operator::IsDistinctFrom,
-                    right: Box::new(lit("z")),
-                }),
-                true,
-            ),
-            (
-                Expr::BinaryExpr(BinaryExpr {
-                    left: Box::new(col("x")),
-                    op: Operator::IsNotDistinctFrom,
-                    right: Box::new(lit("z")),
-                }),
-                false,
-            ),
-        ];
-
-        validate_simplified_cases(&mut rewriter, simplified_cases);
-
-        let unchanged_cases = &[
-            col("x").lt(lit("z")),
-            col("x").lt_eq(lit("z")),
-            col("x").gt(lit("a")),
-            col("x").gt_eq(lit("a")),
-            col("x").eq(lit("abc")),
-            col("x").not_eq(lit("a")),
-            col("x").between(lit("a"), lit("z")),
-            col("x").not_between(lit("a"), lit("z")),
-            Expr::BinaryExpr(BinaryExpr {
-                left: Box::new(col("x")),
-                op: Operator::IsDistinctFrom,
-                right: Box::new(lit(ScalarValue::Null)),
-            }),
-        ];
-
-        validate_unchanged_cases(&mut rewriter, unchanged_cases);
-    }
-
-    #[test]
-    fn test_column_single_value() {
-        let scalars = [
-            ScalarValue::Null,
-            ScalarValue::Int32(Some(1)),
-            ScalarValue::Boolean(Some(true)),
-            ScalarValue::Boolean(None),
-            ScalarValue::from("abc"),
-            ScalarValue::LargeUtf8(Some("def".to_string())),
-            ScalarValue::Date32(Some(18628)),
-            ScalarValue::Date32(None),
-            ScalarValue::Decimal128(Some(1000), 19, 2),
-        ];
-
-        for scalar in scalars {
-            let guarantees = vec![(col("x"), NullableInterval::from(scalar.clone()))];
-            let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
-            let output = col("x").rewrite(&mut rewriter).data().unwrap();
-            assert_eq!(output, Expr::Literal(scalar.clone()));
-        }
-    }
-
-    #[test]
-    fn test_in_list() {
-        let guarantees = vec![
-            // x ∈ [1, 10] (not null)
-            (
-                col("x"),
-                NullableInterval::NotNull {
-                    values: Interval::try_new(
-                        ScalarValue::Int32(Some(1)),
-                        ScalarValue::Int32(Some(10)),
-                    )
-                    .unwrap(),
-                },
-            ),
-        ];
-        let mut rewriter = GuaranteeRewriter::new(guarantees.iter());
-
-        // These cases should be simplified so the list doesn't contain any
-        // values the guarantee says are outside the range.
-        // (column_name, starting_list, negated, expected_list)
-        let cases = &[
-            // x IN (9, 11) => x IN (9)
-            ("x", vec![9, 11], false, vec![9]),
-            // x IN (10, 2) => x IN (10, 2)
-            ("x", vec![10, 2], false, vec![10, 2]),
-            // x NOT IN (9, 11) => x NOT IN (9)
-            ("x", vec![9, 11], true, vec![9]),
-            // x NOT IN (0, 22) => x NOT IN ()
-            ("x", vec![0, 22], true, vec![]),
-        ];
-
-        for (column_name, starting_list, negated, expected_list) in cases {
-            let expr = col(*column_name).in_list(
-                starting_list
-                    .iter()
-                    .map(|v| lit(ScalarValue::Int32(Some(*v))))
-                    .collect(),
-                *negated,
-            );
-            let output = expr.clone().rewrite(&mut rewriter).data().unwrap();
-            let expected_list = expected_list
-                .iter()
-                .map(|v| lit(ScalarValue::Int32(Some(*v))))
-                .collect();
-            assert_eq!(
-                output,
-                Expr::InList(InList {
-                    expr: Box::new(col(*column_name)),
-                    list: expected_list,
-                    negated: *negated,
-                })
-            );
-        }
-    }
-}
diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs
index c8638eb723955..17112d4f0ae24 100644
--- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs
@@ -19,10 +19,10 @@
 
 use super::THRESHOLD_INLINE_INLIST;
 
-use datafusion_common::tree_node::{Transformed, TreeNodeRewriter};
 use datafusion_common::Result;
-use datafusion_expr::expr::InList;
+use datafusion_common::tree_node::{Transformed, TreeNodeRewriter};
 use datafusion_expr::Expr;
+use datafusion_expr::expr::InList;
 
 pub(super) struct ShortenInListSimplifier {}
 
@@ -39,56 +39,54 @@ impl TreeNodeRewriter for ShortenInListSimplifier {
         // if expr is a single column reference:
         // expr IN (A, B, ...) --> (expr = A) OR (expr = B) OR (expr = C)
         if let Expr::InList(InList {
-            expr,
-            list,
+            ref expr,
+            ref list,
             negated,
-        }) = expr.clone()
+        }) = expr
+            && !list.is_empty()
+            && (
+                // For lists with only 1 value we allow more complex expressions to be simplified
+                // e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1'
+                // for more than one we avoid repeating this potentially expensive
+                // expressions
+                list.len() == 1
+                    || list.len() <= THRESHOLD_INLINE_INLIST
+                        && expr.try_as_col().is_some()
+            )
         {
-            if !list.is_empty()
-                && (
-                    // For lists with only 1 value we allow more complex expressions to be simplified
-                    // e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1'
-                    // for more than one we avoid repeating this potentially expensive
-                    // expressions
-                    list.len() == 1
-                        || list.len() <= THRESHOLD_INLINE_INLIST
-                            && expr.try_as_col().is_some()
-                )
-            {
-                let first_val = list[0].clone();
-                if negated {
-                    return Ok(Transformed::yes(list.into_iter().skip(1).fold(
-                        (*expr.clone()).not_eq(first_val),
-                        |acc, y| {
-                            // Note that `A and B and C and D` is a left-deep tree structure
-                            // as such we want to maintain this structure as much as possible
-                            // to avoid reordering the expression during each optimization
-                            // pass.
-                            //
-                            // Left-deep tree structure for `A and B and C and D`:
-                            // ```
-                            //        &
-                            //       / \
-                            //      &   D
-                            //     / \
-                            //    &   C
-                            //   / \
-                            //  A   B
-                            // ```
-                            //
-                            // The code below maintain the left-deep tree structure.
-                            acc.and((*expr.clone()).not_eq(y))
-                        },
-                    )));
-                } else {
-                    return Ok(Transformed::yes(list.into_iter().skip(1).fold(
-                        (*expr.clone()).eq(first_val),
-                        |acc, y| {
-                            // Same reasoning as above
-                            acc.or((*expr.clone()).eq(y))
-                        },
-                    )));
-                }
+            let first_val = list[0].clone();
+            if negated {
+                return Ok(Transformed::yes(list.iter().skip(1).cloned().fold(
+                    (*expr.clone()).not_eq(first_val),
+                    |acc, y| {
+                        // Note that `A and B and C and D` is a left-deep tree structure
+                        // as such we want to maintain this structure as much as possible
+                        // to avoid reordering the expression during each optimization
+                        // pass.
+                        //
+                        // Left-deep tree structure for `A and B and C and D`:
+                        // ```
+                        //        &
+                        //       / \
+                        //      &   D
+                        //     / \
+                        //    &   C
+                        //   / \
+                        //  A   B
+                        // ```
+                        //
+                        // The code below maintain the left-deep tree structure.
+                        acc.and((*expr.clone()).not_eq(y))
+                    },
+                )));
+            } else {
+                return Ok(Transformed::yes(list.iter().skip(1).cloned().fold(
+                    (*expr.clone()).eq(first_val),
+                    |acc, y| {
+                        // Same reasoning as above
+                        acc.or((*expr.clone()).eq(y))
+                    },
+                )));
             }
         }
 
diff --git a/datafusion/optimizer/src/simplify_expressions/linear_aggregates.rs b/datafusion/optimizer/src/simplify_expressions/linear_aggregates.rs
new file mode 100644
index 0000000000000..21389cf326c24
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/linear_aggregates.rs
@@ -0,0 +1,229 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Simplification to refactor multiple aggregate functions to use the same aggregate function
+
+use datafusion_common::HashMap;
+use datafusion_expr::expr::AggregateFunctionParams;
+use datafusion_expr::{BinaryExpr, Expr};
+use datafusion_expr_common::operator::Operator;
+
+/// Threshold of the number of aggregates that share similar arguments before
+/// triggering rewrite.
+///
+/// There is a threshold because the canonical SUM rewrite described in
+/// [`AggregateUDFImpl::simplify_expr_op_literal`] actually results in more
+/// aggregates (2) for each original aggregate. It is important that CSE then
+/// eliminate them.
+///
+/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal
+const DUPLICATE_THRESHOLD: usize = 2;
+
+/// Rewrites multiple aggregate expressions that have a common linear component
+/// into multiple aggregate expressions that share that common component.
+///
+/// For example, rewrites patterns such as
+/// * `SUM(x + 1), SUM(x + 2), ...`
+///
+/// Into
+/// * `SUM(x) + 1 * COUNT(x), SUM(x) + 2 * COUNT(x), ...`
+///
+/// See the background [`AggregateUDFImpl::simplify_expr_op_literal`] for details.
+///
+/// Returns `true` if any of the arguments are rewritten (modified), `false`
+/// otherwise.
+///
+/// ## Design goals:
+/// 1. Keep the aggregate specific logic out of the optimizer (can't depend directly on SUM)
+/// 2. Optimize for the case that this rewrite will not apply (it almost never does)
+///
+/// [`AggregateUDFImpl::simplify_expr_op_literal`]: datafusion_expr::AggregateUDFImpl::simplify_expr_op_literal
+pub(super) fn rewrite_multiple_linear_aggregates(
+    agg_expr: &mut [Expr],
+) -> datafusion_common::Result<bool> {
+    // map <expr>: count of expressions that have a common argument
+    let mut common_args = HashMap::new();
+
+    // First pass -- figure out any aggregates that can be split and have common
+    // expressions.
+    for agg in agg_expr.iter() {
+        let Expr::AggregateFunction(agg_function) = agg else {
+            continue;
+        };
+
+        let Some(arg) = candidate_linear_param(&agg_function.params) else {
+            continue;
+        };
+
+        let Some(expr_literal) = ExprLiteral::try_new(arg) else {
+            continue;
+        };
+
+        let counter = common_args.entry(expr_literal.expr()).or_insert(0);
+        *counter += 1;
+    }
+
+    // (agg_index, new_expr)
+    let mut new_aggs = vec![];
+
+    // Second pass, actually rewrite any aggregates that have a common
+    // expression and enough duplicates.
+    for (idx, agg) in agg_expr.iter().enumerate() {
+        let Expr::AggregateFunction(agg_function) = agg else {
+            continue;
+        };
+
+        let Some(arg) = candidate_linear_param(&agg_function.params) else {
+            continue;
+        };
+
+        let Some(expr_literal) = ExprLiteral::try_new(arg) else {
+            continue;
+        };
+
+        // Not enough common expressions to make it worth rewriting
+        if common_args.get(expr_literal.expr()).unwrap_or(&0) < &DUPLICATE_THRESHOLD {
+            continue;
+        }
+
+        if let Some(new_agg_function) = agg_function.func.simplify_expr_op_literal(
+            agg_function,
+            expr_literal.expr(),
+            expr_literal.op(),
+            expr_literal.lit(),
+            expr_literal.arg_is_left(),
+        )? {
+            new_aggs.push((idx, new_agg_function));
+        }
+    }
+
+    if new_aggs.is_empty() {
+        return Ok(false);
+    }
+
+    // Otherwise replace the aggregate expressions
+    drop(common_args); // release borrow
+    for (idx, new_agg) in new_aggs {
+        let orig_name = agg_expr[idx].name_for_alias()?;
+        agg_expr[idx] = new_agg.alias_if_changed(orig_name)?
+    }
+
+    Ok(true)
+}
+
+/// Returns Some(&Expr) with the single argument if this is a suitable candidate
+/// for the  linear rewrite
+fn candidate_linear_param(params: &AggregateFunctionParams) -> Option<&Expr> {
+    // Explicitly destructure to ensure we check all relevant fields
+    let AggregateFunctionParams {
+        args,
+        distinct,
+        filter,
+        order_by,
+        null_treatment,
+    } = params;
+
+    // Disqualify anything "non standard"
+    if *distinct
+        || filter.is_some()
+        || !order_by.is_empty()
+        || null_treatment.is_some()
+        || args.len() != 1
+    {
+        return None;
+    }
+    let arg = args.first()?;
+    if arg.is_volatile() {
+        return None;
+    };
+    Some(arg)
+}
+
+/// A view into a [`Expr::BinaryExpr`]  that is arbitrary expression and a
+/// literal
+///
+/// This is an enum to distinguish the direction of the operator arguments
+#[derive(Debug, Clone)]
+pub enum ExprLiteral<'a> {
+    /// if the expression is `<arg> <op> <lit>`
+    ArgOpLit {
+        arg: &'a Expr,
+        op: Operator,
+        lit: &'a Expr,
+    },
+    /// if the expression is `<lit> <op> <arg>`
+    LitOpArg {
+        lit: &'a Expr,
+        op: Operator,
+        arg: &'a Expr,
+    },
+}
+
+impl<'a> ExprLiteral<'a> {
+    /// Try and split the Expr into its parts
+    fn try_new(expr: &'a Expr) -> Option<Self> {
+        match expr {
+            // <lit> <op> <expr>
+            Expr::BinaryExpr(BinaryExpr { left, op, right })
+                if matches!(left.as_ref(), Expr::Literal(..)) =>
+            {
+                Some(Self::LitOpArg {
+                    arg: right,
+                    lit: left,
+                    op: *op,
+                })
+            }
+
+            // <expr> + <lit>
+            Expr::BinaryExpr(BinaryExpr { left, op, right })
+                if matches!(right.as_ref(), Expr::Literal(..)) =>
+            {
+                Some(Self::ArgOpLit {
+                    arg: left,
+                    lit: right,
+                    op: *op,
+                })
+            }
+            _ => None,
+        }
+    }
+
+    fn expr(&self) -> &'a Expr {
+        match self {
+            Self::ArgOpLit { arg, .. } => arg,
+            Self::LitOpArg { arg, .. } => arg,
+        }
+    }
+
+    fn lit(&self) -> &'a Expr {
+        match self {
+            Self::ArgOpLit { lit, .. } => lit,
+            Self::LitOpArg { lit, .. } => lit,
+        }
+    }
+
+    fn op(&self) -> Operator {
+        match self {
+            Self::ArgOpLit { op, .. } => *op,
+            Self::LitOpArg { op, .. } => *op,
+        }
+    }
+
+    fn arg_is_left(&self) -> bool {
+        matches!(self, Self::ArgOpLit { .. })
+    }
+}
diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs
index 5fbee02e3909e..89c79d3fb4203 100644
--- a/datafusion/optimizer/src/simplify_expressions/mod.rs
+++ b/datafusion/optimizer/src/simplify_expressions/mod.rs
@@ -19,18 +19,22 @@
 //! [`ExprSimplifier`] simplifies individual `Expr`s.
 
 pub mod expr_simplifier;
-mod guarantees;
 mod inlist_simplifier;
+mod linear_aggregates;
 mod regex;
 pub mod simplify_exprs;
+pub mod simplify_literal;
+mod simplify_predicates;
+mod udf_preimage;
 mod unwrap_cast;
 mod utils;
 
 // backwards compatibility
-pub use datafusion_expr::simplify::{SimplifyContext, SimplifyInfo};
+pub use datafusion_expr::simplify::SimplifyContext;
 
 pub use expr_simplifier::*;
 pub use simplify_exprs::*;
+pub use simplify_predicates::simplify_predicates;
 
 // Export for test in datafusion/core/tests/optimizer_integration.rs
-pub use guarantees::GuaranteeRewriter;
+pub use datafusion_expr::expr_rewriter::GuaranteeRewriter;
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
index ec6485bf4b443..b341c328e992a 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -15,10 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use datafusion_common::tree_node::Transformed;
 use datafusion_common::{DataFusionError, Result, ScalarValue};
-use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator};
+use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
 use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
 
+use crate::simplify_expressions::expr_simplifier::StringScalar;
+
 /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
 const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
 
@@ -36,59 +39,76 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
 /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
 /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND`
 /// - `EQ .*` to NotNull
-/// - `NE .*` means IS EMPTY
+/// - `NE .*` to col IS NULL AND Boolean(NULL) (false for any string, or NULL if col is NULL)
 ///
 /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
 pub fn simplify_regex_expr(
     left: Box<Expr>,
     op: Operator,
     right: Box<Expr>,
-) -> Result<Expr> {
-    let mode = OperatorMode::new(&op);
+) -> Result<Transformed<Expr>> {
+    // Check if the right operand is a supported string literal
+    let Some(string_scalar) = StringScalar::try_from_expr(right.as_ref()) else {
+        return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
+            left,
+            op,
+            right,
+        })));
+    };
+    let pattern = string_scalar.as_str();
+    let Some(pattern) = pattern else {
+        return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
+            left,
+            op,
+            right,
+        })));
+    };
 
-    if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() {
-        // Handle the special case for ".*" pattern
-        if pattern == ANY_CHAR_REGEX_PATTERN {
-            let new_expr = if mode.not {
-                // not empty
-                let empty_lit = Box::new(lit(""));
-                Expr::BinaryExpr(BinaryExpr {
-                    left,
-                    op: Operator::Eq,
-                    right: empty_lit,
-                })
-            } else {
-                // not null
-                left.is_not_null()
-            };
-            return Ok(new_expr);
-        }
+    let mode = OperatorMode::new(&op);
+    // Handle the special case for ".*" pattern
+    if pattern == ANY_CHAR_REGEX_PATTERN {
+        let new_expr = if mode.not {
+            let null_bool = lit(ScalarValue::Boolean(None));
+            Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(left.is_null()),
+                op: Operator::And,
+                right: Box::new(null_bool),
+            })
+        } else {
+            // not null
+            left.is_not_null()
+        };
+        return Ok(Transformed::yes(new_expr));
+    }
 
-        match regex_syntax::Parser::new().parse(pattern) {
-            Ok(hir) => {
-                let kind = hir.kind();
-                if let HirKind::Alternation(alts) = kind {
-                    if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION {
-                        if let Some(expr) = lower_alt(&mode, &left, alts) {
-                            return Ok(expr);
-                        }
-                    }
-                } else if let Some(expr) = lower_simple(&mode, &left, &hir) {
-                    return Ok(expr);
+    match regex_syntax::Parser::new().parse(pattern) {
+        Ok(hir) => {
+            let kind = hir.kind();
+            if let HirKind::Alternation(alts) = kind {
+                if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
+                    && let Some(expr) = lower_alt(&mode, &left, alts, &string_scalar)
+                {
+                    return Ok(Transformed::yes(expr));
                 }
+            } else if let Some(expr) = lower_simple(&mode, &left, &hir, &string_scalar) {
+                return Ok(Transformed::yes(expr));
             }
-            Err(e) => {
-                // error out early since the execution may fail anyways
-                return Err(DataFusionError::Context(
-                    "Invalid regex".to_owned(),
-                    Box::new(DataFusionError::External(Box::new(e))),
-                ));
-            }
+        }
+        Err(e) => {
+            // error out early since the execution may fail anyways
+            return Err(DataFusionError::Context(
+                "Invalid regex".to_owned(),
+                Box::new(DataFusionError::External(Box::new(e))),
+            ));
         }
     }
 
     // Leave untouched if optimization didn't work
-    Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
+    Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
+        left,
+        op,
+        right,
+    })))
 }
 
 #[derive(Debug)]
@@ -117,11 +137,11 @@ impl OperatorMode {
     }
 
     /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
-    fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
+    fn expr(&self, expr: Box<Expr>, pattern: Box<Expr>) -> Expr {
         let like = Like {
             negated: self.not,
             expr,
-            pattern: Box::new(Expr::Literal(ScalarValue::from(pattern))),
+            pattern,
             escape_char: None,
             case_insensitive: self.i,
         };
@@ -287,11 +307,11 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
             let mut literals = Vec::with_capacity(alters.len());
             for hir in alters {
                 let mut is_safe = false;
-                if let HirKind::Literal(l) = hir.kind() {
-                    if let Some(safe_literal) = str_from_literal(l).map(lit) {
-                        literals.push(safe_literal);
-                        is_safe = true;
-                    }
+                if let HirKind::Literal(l) = hir.kind()
+                    && let Some(safe_literal) = str_from_literal(l).map(lit)
+                {
+                    literals.push(safe_literal);
+                    is_safe = true;
                 }
 
                 if !is_safe {
@@ -311,14 +331,24 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
 }
 
 /// Tries to lower (transform) a simple regex pattern to a LIKE expression.
-fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
+fn lower_simple(
+    mode: &OperatorMode,
+    left: &Expr,
+    hir: &Hir,
+    string_scalar: &StringScalar,
+) -> Option<Expr> {
     match hir.kind() {
         HirKind::Empty => {
-            return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
+            return Some(
+                mode.expr(Box::new(left.clone()), Box::new(string_scalar.to_expr("%"))),
+            );
         }
         HirKind::Literal(l) => {
             let s = like_str_from_literal(l)?;
-            return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
+            return Some(mode.expr(
+                Box::new(left.clone()),
+                Box::new(string_scalar.to_expr(&format!("%{s}%"))),
+            ));
         }
         HirKind::Concat(inner) if is_anchored_literal(inner) => {
             return anchored_literal_to_expr(inner).map(|right| {
@@ -333,7 +363,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
             if let Some(pattern) = partial_anchored_literal_to_like(inner)
                 .or_else(|| collect_concat_to_like_string(inner))
             {
-                return Some(mode.expr(Box::new(left.clone()), pattern));
+                return Some(mode.expr(
+                    Box::new(left.clone()),
+                    Box::new(string_scalar.to_expr(&pattern)),
+                ));
             }
         }
         _ => {}
@@ -344,11 +377,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
 /// Calls [`lower_simple`] for each alternative and combine the results with `or` or `and`
 /// based on [`OperatorMode`]. Any fail attempt to lower an alternative will makes this
 /// function to return `None`.
-fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
+fn lower_alt(
+    mode: &OperatorMode,
+    left: &Expr,
+    alts: &[Hir],
+    string_scalar: &StringScalar,
+) -> Option<Expr> {
     let mut accu: Option<Expr> = None;
 
     for part in alts {
-        if let Some(expr) = lower_simple(mode, left, part) {
+        if let Some(expr) = lower_simple(mode, left, part, string_scalar) {
             accu = match accu {
                 Some(accu) => {
                     if mode.not {
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
index ccf90893e17e2..29ee59342273e 100644
--- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
@@ -20,19 +20,20 @@
 use std::sync::Arc;
 
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result};
-use datafusion_expr::execution_props::ExecutionProps;
-use datafusion_expr::logical_plan::LogicalPlan;
-use datafusion_expr::simplify::SimplifyContext;
-use datafusion_expr::utils::merge_schema;
+use datafusion_common::{Column, DFSchema, DFSchemaRef, DataFusionError, Result};
 use datafusion_expr::Expr;
+use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection};
+use datafusion_expr::simplify::SimplifyContext;
+use datafusion_expr::utils::{
+    columnize_expr, find_aggregate_exprs, grouping_set_to_exprlist, merge_schema,
+};
 
+use super::ExprSimplifier;
 use crate::optimizer::ApplyOrder;
+use crate::simplify_expressions::linear_aggregates::rewrite_multiple_linear_aggregates;
 use crate::utils::NamePreserver;
 use crate::{OptimizerConfig, OptimizerRule};
 
-use super::ExprSimplifier;
-
 /// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting
 /// [`Expr`]`s evaluating constants and applying algebraic
 /// simplifications
@@ -67,16 +68,14 @@ impl OptimizerRule for SimplifyExpressions {
         plan: LogicalPlan,
         config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>, DataFusionError> {
-        let mut execution_props = ExecutionProps::new();
-        execution_props.query_execution_start_time = config.query_execution_start_time();
-        Self::optimize_internal(plan, &execution_props)
+        Self::optimize_internal(plan, config)
     }
 }
 
 impl SimplifyExpressions {
     fn optimize_internal(
         plan: LogicalPlan,
-        execution_props: &ExecutionProps,
+        config: &dyn OptimizerConfig,
     ) -> Result<Transformed<LogicalPlan>> {
         let schema = if !plan.inputs().is_empty() {
             DFSchemaRef::new(merge_schema(&plan.inputs()))
@@ -99,7 +98,10 @@ impl SimplifyExpressions {
             Arc::new(DFSchema::empty())
         };
 
-        let info = SimplifyContext::new(execution_props).with_schema(schema);
+        let info = SimplifyContext::default()
+            .with_schema(schema)
+            .with_config_options(config.options())
+            .with_query_execution_start_time(config.query_execution_start_time());
 
         // Inputs have already been rewritten (due to bottom-up traversal handled by Optimizer)
         // Just need to rewrite our own expressions
@@ -137,17 +139,110 @@ impl SimplifyExpressions {
             } else {
                 rewrite_expr(expr)
             }
-        })
+        })?
+        .transform_data(rewrite_aggregate_non_aggregate_aggr_expr)
     }
 }
 
 impl SimplifyExpressions {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
 }
 
+/// Ensures that `LogicalPlan::Aggregate` is well formed after rewrites
+/// by potentially introducing an extra `Projection`.
+///
+/// Also applies the [`rewrite_multiple_linear_aggregates`] special case
+///
+/// # Rationale:
+///
+/// [`LogicalPlan::Aggregate`] requires agg expressions to be (possibly aliased)
+/// [`Expr::AggregateFunction`]. Some UDAF simplifiers may return other [`Expr`]
+/// variants.
+///
+/// # Operation
+///
+/// Rewrites things like this (note that `exp1` is not an aggregate):
+/// * `Aggregate(group_expr, aggr_expr=[exp1 + agg(exp2)])`
+///
+/// into:
+/// * `Projection(exp1 + _X)`
+/// * `  Aggregate(group_expr, aggr_expr=[agg(exp2) AS _X])`
+fn rewrite_aggregate_non_aggregate_aggr_expr(
+    plan: LogicalPlan,
+) -> Result<Transformed<LogicalPlan>> {
+    let LogicalPlan::Aggregate(Aggregate {
+        input,
+        group_expr,
+        mut aggr_expr,
+        schema,
+        ..
+    }) = plan
+    else {
+        return Ok(Transformed::no(plan));
+    };
+
+    let rewrote_aggs = rewrite_multiple_linear_aggregates(&mut aggr_expr)?;
+
+    // Ensure that all Aggregate arguments are AggregateExpr
+    if aggr_expr.iter().all(is_top_level_aggregate_expr) {
+        let new_plan = LogicalPlan::Aggregate(Aggregate::try_new_with_schema(
+            input, group_expr, aggr_expr, schema,
+        )?);
+        return if !rewrote_aggs {
+            Ok(Transformed::no(new_plan))
+        } else {
+            Ok(Transformed::yes(new_plan))
+        };
+    }
+
+    // Otherwise we need to add a Projection above Aggregate to calculate
+    // the final output expressions.
+
+    let inner_aggr_expr = find_aggregate_exprs(aggr_expr.iter());
+    let inner_aggregate = LogicalPlan::Aggregate(Aggregate::try_new(
+        Arc::clone(&input),
+        group_expr.clone(),
+        inner_aggr_expr,
+    )?);
+    let inner_aggregate = Arc::new(inner_aggregate);
+
+    let mut projection_exprs = aggregate_output_exprs(&group_expr)?;
+    projection_exprs.extend(aggr_expr);
+    let projection_exprs = projection_exprs
+        .into_iter()
+        .map(|expr| columnize_expr(expr, inner_aggregate.as_ref()))
+        .collect::<Result<Vec<_>>>()?;
+
+    Ok(Transformed::yes(LogicalPlan::Projection(
+        Projection::try_new(projection_exprs, inner_aggregate)?,
+    )))
+}
+
+fn is_top_level_aggregate_expr(expr: &Expr) -> bool {
+    matches!(
+        expr.clone().unalias_nested().data,
+        Expr::AggregateFunction(_)
+    )
+}
+
+fn aggregate_output_exprs(group_expr: &[Expr]) -> Result<Vec<Expr>> {
+    let mut output_exprs = grouping_set_to_exprlist(group_expr)?
+        .into_iter()
+        .cloned()
+        .collect::<Vec<_>>();
+
+    if matches!(group_expr, [Expr::GroupingSet(_)]) {
+        output_exprs.push(Expr::Column(Column::from_name(
+            Aggregate::INTERNAL_GROUPING_ID,
+        )));
+    }
+
+    Ok(output_exprs)
+}
+
 #[cfg(test)]
 mod tests {
     use std::ops::Not;
@@ -155,14 +250,15 @@ mod tests {
     use arrow::datatypes::{DataType, Field, Schema};
     use chrono::{DateTime, Utc};
 
+    use datafusion_common::ScalarValue;
     use datafusion_expr::logical_plan::builder::table_scan_with_filters;
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::*;
-    use datafusion_functions_aggregate::expr_fn::{max, min};
+    use datafusion_functions_aggregate::expr_fn::{max, min, sum};
 
+    use crate::OptimizerContext;
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::test::{assert_fields_eq, test_table_scan_with_name};
-    use crate::OptimizerContext;
 
     use super::*;
 
@@ -218,7 +314,7 @@ mod tests {
 
         assert_optimized_plan_equal!(
             table_scan,
-            @ r"TableScan: test projection=[a], full_filters=[Boolean(true)]"
+            @ "TableScan: test projection=[a], full_filters=[Boolean(true)]"
         )
     }
 
@@ -251,13 +347,59 @@ mod tests {
         assert_optimized_plan_equal!(
             plan,
             @ r"
-            Filter: test.b > Int32(1)
-              Projection: test.a
-                TableScan: test
-            "
+        Filter: test.b > Int32(1)
+          Projection: test.a
+            TableScan: test
+        "
         )
     }
 
+    #[test]
+    fn test_simplify_udaf_to_non_aggregate_expr() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?
+            .build()
+            .expect("building scan");
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(Vec::<Expr>::new(), vec![sum(col("a") + lit(2i64))])?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Aggregate: groupBy=[[]], aggr=[[sum(test.a + Int64(2))]]
+          TableScan: test
+        "
+        )?;
+        Ok(())
+    }
+
+    #[test]
+    fn test_simplify_common_sum_arg() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?
+            .build()
+            .expect("building scan");
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(
+                Vec::<Expr>::new(),
+                vec![sum(col("a") + lit(2i64)), sum(col("a") + lit(3i64))],
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Projection: sum(test.a) + Int64(2) * CAST(count(test.a) AS Int64) AS sum(test.a + Int64(2)), sum(test.a) + Int64(3) * CAST(count(test.a) AS Int64) AS sum(test.a + Int64(3))
+          Aggregate: groupBy=[[]], aggr=[[sum(test.a), count(test.a)]]
+            TableScan: test
+        "
+        )?;
+        Ok(())
+    }
+
     #[test]
     fn test_simplify_optimized_plan_with_or() -> Result<()> {
         let table_scan = test_table_scan();
@@ -269,10 +411,10 @@ mod tests {
         assert_optimized_plan_equal!(
             plan,
             @ r"
-            Filter: test.b > Int32(1)
-              Projection: test.a
-                TableScan: test
-            "
+        Filter: test.b > Int32(1)
+          Projection: test.a
+            TableScan: test
+        "
         )
     }
 
@@ -491,8 +633,7 @@ mod tests {
             .build()?;
 
         let actual = get_optimized_plan_formatted(plan, &time);
-        let expected =
-            "Projection: NOT test.a AS Boolean(true) OR Boolean(false) != test.a\
+        let expected = "Projection: NOT test.a AS Boolean(true) OR Boolean(false) != test.a\
                         \n  TableScan: test";
 
         assert_eq!(expected, actual);
@@ -871,7 +1012,7 @@ mod tests {
         ]);
         let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
 
-        // Test `= ".*"` transforms to true (except for empty strings)
+        // Test `~ ".*"` transforms to true for any non-NULL string
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))?
             .build()?;
@@ -884,22 +1025,22 @@ mod tests {
         "
         )?;
 
-        // Test `!= ".*"` transforms to checking if the column is empty
+        // Test `!~ ".*"` preserves NULL semantics while remaining false for non-NULL strings
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
             .build()?;
 
         assert_optimized_plan_equal!(
             plan,
-            @ r#"
-        Filter: test.a = Utf8("")
+            @ r"
+        Filter: test.a IS NULL AND Boolean(NULL)
           TableScan: test
-        "#
+        "
         )?;
 
         // Test case-insensitive versions
 
-        // Test `=~ ".*"` (case-insensitive) transforms to true (except for empty strings)
+        // Test `~* ".*"` transforms to true for any non-NULL string
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))?
             .build()?;
@@ -912,17 +1053,199 @@ mod tests {
         "
         )?;
 
-        // Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty
+        // Test NULL `!~ ".*"` transforms to Boolean(NULL)
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(binary_expr(
+                lit(ScalarValue::Utf8(None)),
+                Operator::RegexNotMatch,
+                lit(".*"),
+            ))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: Boolean(NULL)
+          TableScan: test
+        "
+        )?;
+
+        // Test `!~* ".*"` preserves NULL semantics while remaining false for non-NULL strings
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))?
             .build()?;
 
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: test.a IS NULL AND Boolean(NULL)
+          TableScan: test
+        "
+        )?;
+
+        // Test NULL `!~* ".*"` transforms to Boolean(NULL)
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(binary_expr(
+                lit(ScalarValue::Utf8(None)),
+                Operator::RegexNotIMatch,
+                lit(".*"),
+            ))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: Boolean(NULL)
+          TableScan: test
+        "
+        )
+    }
+
+    #[test]
+    fn simplify_not_in_list() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(col("a").in_list(vec![lit("a"), lit("b")], false).not())?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r#"
+        Filter: test.a != Utf8("a") AND test.a != Utf8("b")
+          TableScan: test
+        "#
+        )
+    }
+
+    #[test]
+    fn simplify_not_not_in_list() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                col("a")
+                    .in_list(vec![lit("a"), lit("b")], false)
+                    .not()
+                    .not(),
+            )?
+            .build()?;
+
         assert_optimized_plan_equal!(
             plan,
             @ r#"
-        Filter: test.a = Utf8("")
+        Filter: test.a = Utf8("a") OR test.a = Utf8("b")
           TableScan: test
         "#
         )
     }
+
+    #[test]
+    fn simplify_not_exists() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+        let table_scan2 =
+            datafusion_expr::table_scan(Some("test2"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                exists(Arc::new(LogicalPlanBuilder::from(table_scan2).build()?)).not(),
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: NOT EXISTS (<subquery>)
+          Subquery:
+            TableScan: test2
+          TableScan: test
+        "
+        )
+    }
+
+    #[test]
+    fn simplify_not_not_exists() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+        let table_scan2 =
+            datafusion_expr::table_scan(Some("test2"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                exists(Arc::new(LogicalPlanBuilder::from(table_scan2).build()?))
+                    .not()
+                    .not(),
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: EXISTS (<subquery>)
+          Subquery:
+            TableScan: test2
+          TableScan: test
+        "
+        )
+    }
+
+    #[test]
+    fn simplify_not_in_subquery() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+        let table_scan2 =
+            datafusion_expr::table_scan(Some("test2"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                in_subquery(
+                    col("a"),
+                    Arc::new(LogicalPlanBuilder::from(table_scan2).build()?),
+                )
+                .not(),
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: test.a NOT IN (<subquery>)
+          Subquery:
+            TableScan: test2
+          TableScan: test
+        "
+        )
+    }
+
+    #[test]
+    fn simplify_not_not_in_subquery() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+        let table_scan2 =
+            datafusion_expr::table_scan(Some("test2"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .filter(
+                in_subquery(
+                    col("a"),
+                    Arc::new(LogicalPlanBuilder::from(table_scan2).build()?),
+                )
+                .not()
+                .not(),
+            )?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: test.a IN (<subquery>)
+          Subquery:
+            TableScan: test2
+          TableScan: test
+        "
+        )
+    }
 }
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_literal.rs b/datafusion/optimizer/src/simplify_expressions/simplify_literal.rs
new file mode 100644
index 0000000000000..b77240fc5343a
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_literal.rs
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Parses and simplifies an expression to a literal of a given type.
+//!
+//! This module provides functionality to parse and simplify static expressions
+//! used in SQL constructs like `FROM TABLE SAMPLE (10 + 50 * 2)`. If they are required
+//! in a planning (not an execution) phase, they need to be reduced to literals of a given type.
+
+use crate::simplify_expressions::ExprSimplifier;
+use arrow::datatypes::ArrowPrimitiveType;
+use datafusion_common::{
+    DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, plan_datafusion_err,
+    plan_err,
+};
+use datafusion_expr::Expr;
+use datafusion_expr::simplify::SimplifyContext;
+use std::sync::Arc;
+
+/// Parse and simplifies an expression to a numeric literal,
+/// corresponding to an arrow primitive type `T` (for example, Float64Type).
+///
+/// This function simplifies and coerces the expression, then extracts the underlying
+/// native type using `TryFrom<ScalarValue>`.
+///
+/// # Example
+/// ```ignore
+/// let value: f64 = parse_literal::<Float64Type>(expr)?;
+/// ```
+pub fn parse_literal<T>(expr: &Expr) -> Result<T::Native>
+where
+    T: ArrowPrimitiveType,
+    T::Native: TryFrom<ScalarValue, Error = DataFusionError>,
+{
+    // Empty schema is sufficient because it parses only literal expressions
+    let schema = DFSchemaRef::new(DFSchema::empty());
+
+    log::debug!("Parsing expr {:?} to type {}", expr, T::DATA_TYPE);
+
+    let simplifier =
+        ExprSimplifier::new(SimplifyContext::default().with_schema(Arc::clone(&schema)));
+
+    // Simplify and coerce expression in case of constant arithmetic operations (e.g., 10 + 5)
+    let simplified_expr: Expr = simplifier
+        .simplify(expr.clone())
+        .map_err(|err| plan_datafusion_err!("Cannot simplify {expr:?}: {err}"))?;
+    let coerced_expr: Expr = simplifier.coerce(simplified_expr, schema.as_ref())?;
+    log::debug!("Coerced expression: {:?}", &coerced_expr);
+
+    match coerced_expr {
+        Expr::Literal(scalar_value, _) => {
+            // It is a literal - proceed to the underlying value
+            // Cast to the target type if needed
+            let casted_scalar = scalar_value.cast_to(&T::DATA_TYPE)?;
+
+            // Extract the native type
+            T::Native::try_from(casted_scalar).map_err(|err| {
+                plan_datafusion_err!(
+                    "Cannot extract {} from scalar value: {err}",
+                    std::any::type_name::<T>()
+                )
+            })
+        }
+        actual => {
+            plan_err!(
+                "Cannot extract literal from coerced {actual:?} expression given {expr:?} expression"
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{Float64Type, Int64Type};
+    use datafusion_expr::{BinaryExpr, lit};
+    use datafusion_expr_common::operator::Operator;
+
+    #[test]
+    fn test_parse_sql_float_literal() {
+        let test_cases = vec![
+            (Expr::Literal(ScalarValue::Float64(Some(0.0)), None), 0.0),
+            (Expr::Literal(ScalarValue::Float64(Some(1.0)), None), 1.0),
+            (
+                Expr::BinaryExpr(BinaryExpr::new(
+                    Box::new(lit(50.0)),
+                    Operator::Minus,
+                    Box::new(lit(10.0)),
+                )),
+                40.0,
+            ),
+            (
+                Expr::Literal(ScalarValue::Utf8(Some("1e2".into())), None),
+                100.0,
+            ),
+            (
+                Expr::Literal(ScalarValue::Utf8(Some("2.5e-1".into())), None),
+                0.25,
+            ),
+        ];
+
+        for (expr, expected) in test_cases {
+            let result: Result<f64> = parse_literal::<Float64Type>(&expr);
+
+            match result {
+                Ok(value) => {
+                    assert!(
+                        (value - expected).abs() < 1e-10,
+                        "For expression '{expr}': expected {expected}, got {value}",
+                    );
+                }
+                Err(e) => panic!("Failed to parse expression '{expr}': {e}"),
+            }
+        }
+    }
+
+    #[test]
+    fn test_parse_sql_integer_literal() {
+        let expr = Expr::BinaryExpr(BinaryExpr::new(
+            Box::new(lit(2)),
+            Operator::Plus,
+            Box::new(lit(4)),
+        ));
+
+        let result: Result<i64> = parse_literal::<Int64Type>(&expr);
+
+        match result {
+            Ok(value) => {
+                assert_eq!(6, value);
+            }
+            Err(e) => panic!("Failed to parse expression: {e}"),
+        }
+    }
+}
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs b/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs
new file mode 100644
index 0000000000000..e811ce7313102
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs
@@ -0,0 +1,341 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Simplifies predicates by reducing redundant or overlapping conditions.
+//!
+//! This module provides functionality to optimize logical predicates used in query planning
+//! by eliminating redundant conditions, thus reducing the number of predicates to evaluate.
+//! Unlike the simplifier in `simplify_expressions/simplify_exprs.rs`, which focuses on
+//! general expression simplification (e.g., constant folding and algebraic simplifications),
+//! this module specifically targets predicate optimization by handling containment relationships.
+//! For example, it can simplify `x > 5 AND x > 6` to just `x > 6`, as the latter condition
+//! encompasses the former, resulting in fewer checks during query execution.
+
+use datafusion_common::{Column, Result, ScalarValue};
+use datafusion_expr::{BinaryExpr, Expr, Operator};
+use std::collections::BTreeMap;
+
+/// Simplifies a list of predicates by removing redundancies.
+///
+/// This function takes a vector of predicate expressions and groups them by the column they reference.
+/// Predicates that reference a single column and are comparison operations (e.g., >, >=, <, <=, =)
+/// are analyzed to remove redundant conditions. For instance, `x > 5 AND x > 6` is simplified to
+/// `x > 6`. Other predicates that do not fit this pattern are retained as-is.
+///
+/// # Arguments
+/// * `predicates` - A vector of `Expr` representing the predicates to simplify.
+///
+/// # Returns
+/// A `Result` containing a vector of simplified `Expr` predicates.
+pub fn simplify_predicates(predicates: Vec<Expr>) -> Result<Vec<Expr>> {
+    // Early return for simple cases
+    if predicates.len() <= 1 {
+        return Ok(predicates);
+    }
+
+    // Group predicates by their column reference
+    let mut column_predicates: BTreeMap<Column, Vec<Expr>> = BTreeMap::new();
+    let mut other_predicates = Vec::new();
+
+    for pred in predicates {
+        match &pred {
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op:
+                    Operator::Gt
+                    | Operator::GtEq
+                    | Operator::Lt
+                    | Operator::LtEq
+                    | Operator::Eq,
+                right,
+            }) => {
+                let left_col = extract_column_from_expr(left);
+                let right_col = extract_column_from_expr(right);
+                if let (Some(col), Some(_)) = (&left_col, right.as_literal()) {
+                    column_predicates.entry(col.clone()).or_default().push(pred);
+                } else if let (Some(_), Some(col)) = (left.as_literal(), &right_col) {
+                    column_predicates.entry(col.clone()).or_default().push(pred);
+                } else {
+                    other_predicates.push(pred);
+                }
+            }
+            _ => other_predicates.push(pred),
+        }
+    }
+
+    // Process each column's predicates to remove redundancies
+    let mut result = other_predicates;
+    for (_, preds) in column_predicates {
+        let simplified = simplify_column_predicates(preds)?;
+        result.extend(simplified);
+    }
+
+    Ok(result)
+}
+
+/// Simplifies predicates related to a single column.
+///
+/// This function processes a list of predicates that all reference the same column and
+/// simplifies them based on their operators. It groups predicates into greater-than (>, >=),
+/// less-than (<, <=), and equality (=) categories, then selects the most restrictive condition
+/// in each category to reduce redundancy. For example, among `x > 5` and `x > 6`, only `x > 6`
+/// is retained as it is more restrictive.
+///
+/// # Arguments
+/// * `predicates` - A vector of `Expr` representing predicates for a single column.
+///
+/// # Returns
+/// A `Result` containing a vector of simplified `Expr` predicates for the column.
+fn simplify_column_predicates(predicates: Vec<Expr>) -> Result<Vec<Expr>> {
+    if predicates.len() <= 1 {
+        return Ok(predicates);
+    }
+
+    // Group by operator type, but combining similar operators
+    let mut greater_predicates = Vec::new(); // Combines > and >=
+    let mut less_predicates = Vec::new(); // Combines < and <=
+    let mut eq_predicates = Vec::new();
+
+    for pred in predicates {
+        match &pred {
+            Expr::BinaryExpr(BinaryExpr { left: _, op, right }) => {
+                match (op, right.as_literal().is_some()) {
+                    (Operator::Gt, true)
+                    | (Operator::Lt, false)
+                    | (Operator::GtEq, true)
+                    | (Operator::LtEq, false) => greater_predicates.push(pred),
+                    (Operator::Lt, true)
+                    | (Operator::Gt, false)
+                    | (Operator::LtEq, true)
+                    | (Operator::GtEq, false) => less_predicates.push(pred),
+                    (Operator::Eq, _) => eq_predicates.push(pred),
+                    _ => unreachable!("Unexpected operator: {}", op),
+                }
+            }
+            _ => unreachable!("Unexpected predicate {}", pred.to_string()),
+        }
+    }
+
+    let mut result = Vec::new();
+
+    if !eq_predicates.is_empty() {
+        // If there are many equality predicates, we can only keep one if they are all the same
+        if eq_predicates.len() == 1
+            || eq_predicates.iter().all(|e| e == &eq_predicates[0])
+        {
+            result.push(eq_predicates.pop().unwrap());
+        } else {
+            // If they are not the same, add a false predicate
+            result.push(Expr::Literal(ScalarValue::Boolean(Some(false)), None));
+        }
+    }
+
+    // Handle all greater-than-style predicates (keep the most restrictive - highest value)
+    if !greater_predicates.is_empty() {
+        if let Some(most_restrictive) =
+            find_most_restrictive_predicate(&greater_predicates, true)?
+        {
+            result.push(most_restrictive);
+        } else {
+            result.extend(greater_predicates);
+        }
+    }
+
+    // Handle all less-than-style predicates (keep the most restrictive - lowest value)
+    if !less_predicates.is_empty() {
+        if let Some(most_restrictive) =
+            find_most_restrictive_predicate(&less_predicates, false)?
+        {
+            result.push(most_restrictive);
+        } else {
+            result.extend(less_predicates);
+        }
+    }
+
+    Ok(result)
+}
+
+/// Finds the most restrictive predicate from a list based on literal values.
+///
+/// This function iterates through a list of predicates to identify the most restrictive one
+/// by comparing their literal values. For greater-than predicates, the highest value is most
+/// restrictive, while for less-than predicates, the lowest value is most restrictive.
+///
+/// # Arguments
+/// * `predicates` - A slice of `Expr` representing predicates to compare.
+/// * `find_greater` - A boolean indicating whether to find the highest value (true for >, >=)
+///   or the lowest value (false for <, <=).
+///
+/// # Returns
+/// A `Result` containing an `Option<Expr>` with the most restrictive predicate, if any.
+fn find_most_restrictive_predicate(
+    predicates: &[Expr],
+    find_greater: bool,
+) -> Result<Option<Expr>> {
+    if predicates.is_empty() {
+        return Ok(None);
+    }
+
+    let mut most_restrictive_idx = 0;
+    let mut best_value: Option<&ScalarValue> = None;
+
+    for (idx, pred) in predicates.iter().enumerate() {
+        if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = pred {
+            // Extract the literal value based on which side has it
+            let scalar_value = match (right.as_literal(), left.as_literal()) {
+                (Some(scalar), _) => Some(scalar),
+                (_, Some(scalar)) => Some(scalar),
+                _ => None,
+            };
+
+            if let Some(scalar) = scalar_value {
+                if let Some(current_best) = best_value {
+                    let comparison = scalar.try_cmp(current_best)?;
+                    let is_better = if find_greater {
+                        comparison == std::cmp::Ordering::Greater
+                            || (comparison == std::cmp::Ordering::Equal
+                                && op == &Operator::Gt)
+                    } else {
+                        comparison == std::cmp::Ordering::Less
+                            || (comparison == std::cmp::Ordering::Equal
+                                && op == &Operator::Lt)
+                    };
+
+                    if is_better {
+                        best_value = Some(scalar);
+                        most_restrictive_idx = idx;
+                    }
+                } else {
+                    best_value = Some(scalar);
+                    most_restrictive_idx = idx;
+                }
+            }
+        }
+    }
+
+    Ok(Some(predicates[most_restrictive_idx].clone()))
+}
+
+/// Extracts a column reference from an expression, if present.
+///
+/// This function checks if the given expression is a column reference or contains one,
+/// such as within a cast operation. It returns the `Column` if found.
+///
+/// # Arguments
+/// * `expr` - A reference to an `Expr` to inspect for a column reference.
+///
+/// # Returns
+/// An `Option<Column>` containing the column reference if found, otherwise `None`.
+fn extract_column_from_expr(expr: &Expr) -> Option<Column> {
+    match expr {
+        Expr::Column(col) => Some(col.clone()),
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::DataType;
+    use datafusion_expr::{cast, col, lit};
+
+    #[test]
+    fn test_simplify_predicates_with_cast() {
+        // Test that predicates on cast expressions are not grouped with predicates on the raw column
+        // a < 5 AND CAST(a AS varchar) < 'abc' AND a < 6
+        // Should simplify to:
+        // a < 5 AND CAST(a AS varchar) < 'abc'
+
+        let predicates = vec![
+            col("a").lt(lit(5i32)),
+            cast(col("a"), DataType::Utf8).lt(lit("abc")),
+            col("a").lt(lit(6i32)),
+        ];
+
+        let result = simplify_predicates(predicates).unwrap();
+
+        // Should have 2 predicates: a < 5 and CAST(a AS varchar) < 'abc'
+        assert_eq!(result.len(), 2);
+
+        // Check that the cast predicate is preserved
+        let has_cast_predicate = result.iter().any(|p| {
+            matches!(p, Expr::BinaryExpr(BinaryExpr { 
+                left, 
+                op: Operator::Lt, 
+                right 
+            }) if matches!(left.as_ref(), Expr::Cast(_)) && right == &Box::new(lit("abc")))
+        });
+        assert!(has_cast_predicate, "Cast predicate should be preserved");
+
+        // Check that we have the more restrictive column predicate (a < 5)
+        let has_column_predicate = result.iter().any(|p| {
+            matches!(p, Expr::BinaryExpr(BinaryExpr { 
+                left, 
+                op: Operator::Lt, 
+                right 
+            }) if left == &Box::new(col("a")) && right == &Box::new(lit(5i32)))
+        });
+        assert!(has_column_predicate, "Should have a < 5 predicate");
+    }
+
+    #[test]
+    fn test_extract_column_ignores_cast() {
+        // Test that extract_column_from_expr does not extract columns from cast expressions
+        let cast_expr = cast(col("a"), DataType::Utf8);
+        assert_eq!(extract_column_from_expr(&cast_expr), None);
+
+        // Test that it still extracts from direct column references
+        let col_expr = col("a");
+        assert_eq!(extract_column_from_expr(&col_expr), Some(Column::from("a")));
+    }
+
+    #[test]
+    fn test_simplify_predicates_direct_columns_only() {
+        // Test that only predicates on direct columns are simplified together
+        let predicates = vec![
+            col("a").lt(lit(5i32)),
+            col("a").lt(lit(3i32)),
+            col("b").gt(lit(10i32)),
+            col("b").gt(lit(20i32)),
+        ];
+
+        let result = simplify_predicates(predicates).unwrap();
+
+        // Should have 2 predicates: a < 3 and b > 20 (most restrictive for each column)
+        assert_eq!(result.len(), 2);
+
+        // Check for a < 3
+        let has_a_predicate = result.iter().any(|p| {
+            matches!(p, Expr::BinaryExpr(BinaryExpr { 
+                left, 
+                op: Operator::Lt, 
+                right 
+            }) if left == &Box::new(col("a")) && right == &Box::new(lit(3i32)))
+        });
+        assert!(has_a_predicate, "Should have a < 3 predicate");
+
+        // Check for b > 20
+        let has_b_predicate = result.iter().any(|p| {
+            matches!(p, Expr::BinaryExpr(BinaryExpr { 
+                left, 
+                op: Operator::Gt, 
+                right 
+            }) if left == &Box::new(col("b")) && right == &Box::new(lit(20i32)))
+        });
+        assert!(has_b_predicate, "Should have b > 20 predicate");
+    }
+}
diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs
new file mode 100644
index 0000000000000..da2716d13cb47
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs
@@ -0,0 +1,404 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::{Result, internal_err, tree_node::Transformed};
+use datafusion_expr::{Expr, Operator, and, lit, or};
+use datafusion_expr_common::interval_arithmetic::Interval;
+
+/// Rewrites a binary expression using its "preimage"
+///
+/// Specifically it rewrites expressions of the form `<expr> OP x` (e.g. `<expr> =
+/// x`) where `<expr>` is known to have a pre-image (aka the entire single
+/// range for which it is valid) and `x` is not `NULL`
+///
+/// For details see [`datafusion_expr::ScalarUDFImpl::preimage`]
+pub(super) fn rewrite_with_preimage(
+    preimage_interval: Interval,
+    op: Operator,
+    expr: Expr,
+) -> Result<Transformed<Expr>> {
+    let (lower, upper) = preimage_interval.into_bounds();
+    let (lower, upper) = (lit(lower), lit(upper));
+
+    let rewritten_expr = match op {
+        // <expr> < x   ==>  <expr> < lower
+        Operator::Lt => expr.lt(lower),
+        // <expr> >= x  ==>  <expr> >= lower
+        Operator::GtEq => expr.gt_eq(lower),
+        // <expr> > x ==> <expr> >= upper
+        Operator::Gt => expr.gt_eq(upper),
+        // <expr> <= x ==> <expr> < upper
+        Operator::LtEq => expr.lt(upper),
+        // <expr> = x ==> (<expr> >= lower) and (<expr> < upper)
+        Operator::Eq => and(expr.clone().gt_eq(lower), expr.lt(upper)),
+        // <expr> != x ==> (<expr> < lower) or (<expr> >= upper)
+        Operator::NotEq => or(expr.clone().lt(lower), expr.gt_eq(upper)),
+        // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper))
+        // but since x is always not NULL => (<expr> is not NULL) and (<expr> >= lower) and (<expr> < upper)
+        Operator::IsNotDistinctFrom => expr
+            .clone()
+            .is_not_null()
+            .and(expr.clone().gt_eq(lower))
+            .and(expr.lt(upper)),
+        // <expr> is distinct from x ==> (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL and x is not NULL) or (<expr> is not NULL and x is NULL)
+        // but given that x is always not NULL => (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL)
+        Operator::IsDistinctFrom => expr
+            .clone()
+            .lt(lower)
+            .or(expr.clone().gt_eq(upper))
+            .or(expr.is_null()),
+        _ => return internal_err!("Expect comparison operators"),
+    };
+    Ok(Transformed::yes(rewritten_expr))
+}
+
+#[cfg(test)]
+mod test {
+    use std::any::Any;
+    use std::sync::Arc;
+
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue};
+    use datafusion_expr::{
+        ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
+        Signature, Volatility, and, binary_expr, col, lit, or, preimage::PreimageResult,
+        simplify::SimplifyContext,
+    };
+
+    use super::Interval;
+    use crate::simplify_expressions::ExprSimplifier;
+
+    fn is_distinct_from(left: Expr, right: Expr) -> Expr {
+        binary_expr(left, Operator::IsDistinctFrom, right)
+    }
+
+    fn is_not_distinct_from(left: Expr, right: Expr) -> Expr {
+        binary_expr(left, Operator::IsNotDistinctFrom, right)
+    }
+
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct PreimageUdf {
+        /// Defaults to an exact signature with one Int32 argument and Immutable volatility
+        signature: Signature,
+        /// If true, returns a preimage; otherwise, returns None
+        enabled: bool,
+    }
+
+    impl PreimageUdf {
+        fn new() -> Self {
+            Self {
+                signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable),
+                enabled: true,
+            }
+        }
+
+        /// Set the enabled flag
+        fn with_enabled(mut self, enabled: bool) -> Self {
+            self.enabled = enabled;
+            self
+        }
+
+        /// Set the volatility
+        fn with_volatility(mut self, volatility: Volatility) -> Self {
+            self.signature.volatility = volatility;
+            self
+        }
+    }
+
+    impl ScalarUDFImpl for PreimageUdf {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "preimage_func"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            Ok(DataType::Int32)
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some(500))))
+        }
+
+        fn preimage(
+            &self,
+            args: &[Expr],
+            lit_expr: &Expr,
+            _info: &SimplifyContext,
+        ) -> Result<PreimageResult> {
+            if !self.enabled {
+                return Ok(PreimageResult::None);
+            }
+            if args.len() != 1 {
+                return Ok(PreimageResult::None);
+            }
+
+            let expr = args.first().cloned().expect("Should be column expression");
+            match lit_expr {
+                Expr::Literal(ScalarValue::Int32(Some(500)), _) => {
+                    Ok(PreimageResult::Range {
+                        expr,
+                        interval: Box::new(Interval::try_new(
+                            ScalarValue::Int32(Some(100)),
+                            ScalarValue::Int32(Some(200)),
+                        )?),
+                    })
+                }
+                Expr::Literal(ScalarValue::Int32(Some(600)), _) => {
+                    Ok(PreimageResult::Range {
+                        expr,
+                        interval: Box::new(Interval::try_new(
+                            ScalarValue::Int32(Some(300)),
+                            ScalarValue::Int32(Some(400)),
+                        )?),
+                    })
+                }
+                _ => Ok(PreimageResult::None),
+            }
+        }
+    }
+
+    fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr {
+        let simplify_context = SimplifyContext::default().with_schema(Arc::clone(schema));
+        ExprSimplifier::new(simplify_context)
+            .simplify(expr)
+            .unwrap()
+    }
+
+    fn preimage_udf_expr() -> Expr {
+        ScalarUDF::new_from_impl(PreimageUdf::new()).call(vec![col("x")])
+    }
+
+    fn non_immutable_udf_expr() -> Expr {
+        ScalarUDF::new_from_impl(PreimageUdf::new().with_volatility(Volatility::Volatile))
+            .call(vec![col("x")])
+    }
+
+    fn no_preimage_udf_expr() -> Expr {
+        ScalarUDF::new_from_impl(PreimageUdf::new().with_enabled(false))
+            .call(vec![col("x")])
+    }
+
+    fn test_schema() -> DFSchemaRef {
+        Arc::new(
+            DFSchema::from_unqualified_fields(
+                vec![Field::new("x", DataType::Int32, true)].into(),
+                Default::default(),
+            )
+            .unwrap(),
+        )
+    }
+
+    fn test_schema_xy() -> DFSchemaRef {
+        Arc::new(
+            DFSchema::from_unqualified_fields(
+                vec![
+                    Field::new("x", DataType::Int32, false),
+                    Field::new("y", DataType::Int32, false),
+                ]
+                .into(),
+                Default::default(),
+            )
+            .unwrap(),
+        )
+    }
+
+    #[test]
+    fn test_preimage_eq_rewrite() {
+        // Equality rewrite when preimage and column expression are available.
+        let schema = test_schema();
+        let expr = preimage_udf_expr().eq(lit(500));
+        let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200)));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_noteq_rewrite() {
+        // Inequality rewrite expands to disjoint ranges.
+        let schema = test_schema();
+        let expr = preimage_udf_expr().not_eq(lit(500));
+        let expected = col("x").lt(lit(100)).or(col("x").gt_eq(lit(200)));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_eq_rewrite_swapped() {
+        // Equality rewrite works when the literal appears on the left.
+        let schema = test_schema();
+        let expr = lit(500).eq(preimage_udf_expr());
+        let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200)));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_lt_rewrite() {
+        // Less-than comparison rewrites to the lower bound.
+        let schema = test_schema();
+        let expr = preimage_udf_expr().lt(lit(500));
+        let expected = col("x").lt(lit(100));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_lteq_rewrite() {
+        // Less-than-or-equal comparison rewrites to the upper bound.
+        let schema = test_schema();
+        let expr = preimage_udf_expr().lt_eq(lit(500));
+        let expected = col("x").lt(lit(200));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_gt_rewrite() {
+        // Greater-than comparison rewrites to the upper bound (inclusive).
+        let schema = test_schema();
+        let expr = preimage_udf_expr().gt(lit(500));
+        let expected = col("x").gt_eq(lit(200));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_gteq_rewrite() {
+        // Greater-than-or-equal comparison rewrites to the lower bound.
+        let schema = test_schema();
+        let expr = preimage_udf_expr().gt_eq(lit(500));
+        let expected = col("x").gt_eq(lit(100));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_is_not_distinct_from_rewrite() {
+        // IS NOT DISTINCT FROM rewrites to equality plus expression not-null check
+        // for non-null literal RHS.
+        let schema = test_schema();
+        let expr = is_not_distinct_from(preimage_udf_expr(), lit(500));
+        let expected = col("x")
+            .is_not_null()
+            .and(col("x").gt_eq(lit(100)))
+            .and(col("x").lt(lit(200)));
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_is_distinct_from_rewrite() {
+        // IS DISTINCT FROM adds an explicit NULL branch for the column.
+        let schema = test_schema();
+        let expr = is_distinct_from(preimage_udf_expr(), lit(500));
+        let expected = col("x")
+            .lt(lit(100))
+            .or(col("x").gt_eq(lit(200)))
+            .or(col("x").is_null());
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_in_list_rewrite() {
+        let schema = test_schema();
+        let expr = preimage_udf_expr().in_list(vec![lit(500), lit(600)], false);
+        let expected = or(
+            and(col("x").gt_eq(lit(100)), col("x").lt(lit(200))),
+            and(col("x").gt_eq(lit(300)), col("x").lt(lit(400))),
+        );
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_not_in_list_rewrite() {
+        let schema = test_schema();
+        let expr = preimage_udf_expr().in_list(vec![lit(500), lit(600)], true);
+        let expected = and(
+            or(col("x").lt(lit(100)), col("x").gt_eq(lit(200))),
+            or(col("x").lt(lit(300)), col("x").gt_eq(lit(400))),
+        );
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_in_list_long_list_no_rewrite() {
+        let schema = test_schema();
+        let expr = preimage_udf_expr().in_list((1..100).map(lit).collect(), false);
+
+        assert_eq!(optimize_test(expr.clone(), &schema), expr);
+    }
+
+    #[test]
+    fn test_preimage_non_literal_rhs_no_rewrite() {
+        // Non-literal RHS should not be rewritten.
+        let schema = test_schema_xy();
+        let expr = preimage_udf_expr().eq(col("y"));
+        let expected = expr.clone();
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_null_literal_no_rewrite_distinct_ops() {
+        // NULL literal RHS should not be rewritten for DISTINCTness operators:
+        // - `expr IS DISTINCT FROM NULL`  <=> `NOT (expr IS NULL)`
+        // - `expr IS NOT DISTINCT FROM NULL` <=> `expr IS NULL`
+        //
+        // For normal comparisons (=, !=, <, <=, >, >=), `expr OP NULL` evaluates to NULL
+        // under SQL tri-state logic, and DataFusion's simplifier constant-folds it.
+        // https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/struct.PruningPredicate.html#boolean-tri-state-logic
+
+        let schema = test_schema();
+
+        let expr = is_distinct_from(preimage_udf_expr(), lit(ScalarValue::Int32(None)));
+        assert_eq!(optimize_test(expr.clone(), &schema), expr);
+
+        let expr =
+            is_not_distinct_from(preimage_udf_expr(), lit(ScalarValue::Int32(None)));
+        assert_eq!(optimize_test(expr.clone(), &schema), expr);
+    }
+
+    #[test]
+    fn test_preimage_non_immutable_no_rewrite() {
+        // Non-immutable UDFs should not participate in preimage rewrites.
+        let schema = test_schema();
+        let expr = non_immutable_udf_expr().eq(lit(500));
+        let expected = expr.clone();
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+
+    #[test]
+    fn test_preimage_no_preimage_no_rewrite() {
+        // If the UDF provides no preimage, the expression should remain unchanged.
+        let schema = test_schema();
+        let expr = no_preimage_udf_expr().eq(lit(500));
+        let expected = expr.clone();
+
+        assert_eq!(optimize_test(expr, &schema), expected);
+    }
+}
diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
index 37116018cdca5..acf0f32ab2234 100644
--- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
+++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
@@ -53,30 +53,24 @@
 //! ```text
 //! c1 > INT32(10)
 //! ```
-//!
-
-use std::cmp::Ordering;
 
-use arrow::datatypes::{
-    DataType, TimeUnit, MAX_DECIMAL128_FOR_EACH_PRECISION,
-    MIN_DECIMAL128_FOR_EACH_PRECISION,
-};
-use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS};
-use datafusion_common::{internal_err, tree_node::Transformed};
+use arrow::datatypes::DataType;
 use datafusion_common::{Result, ScalarValue};
-use datafusion_expr::{lit, BinaryExpr};
-use datafusion_expr::{simplify::SimplifyInfo, Cast, Expr, Operator, TryCast};
-
-pub(super) fn unwrap_cast_in_comparison_for_binary<S: SimplifyInfo>(
-    info: &S,
-    cast_expr: Box<Expr>,
-    literal: Box<Expr>,
+use datafusion_common::{internal_err, tree_node::Transformed};
+use datafusion_expr::{BinaryExpr, lit};
+use datafusion_expr::{Cast, Expr, Operator, TryCast, simplify::SimplifyContext};
+use datafusion_expr_common::casts::{is_supported_type, try_cast_literal_to_type};
+
+pub(super) fn unwrap_cast_in_comparison_for_binary(
+    info: &SimplifyContext,
+    cast_expr: Expr,
+    literal: Expr,
     op: Operator,
 ) -> Result<Transformed<Expr>> {
-    match (*cast_expr, *literal) {
+    match (cast_expr, literal) {
         (
             Expr::TryCast(TryCast { expr, .. }) | Expr::Cast(Cast { expr, .. }),
-            Expr::Literal(lit_value),
+            Expr::Literal(lit_value, _),
         ) => {
             let Ok(expr_type) = info.get_data_type(&expr) else {
                 return internal_err!("Can't get the data type of the expr {:?}", &expr);
@@ -95,7 +89,7 @@ pub(super) fn unwrap_cast_in_comparison_for_binary<S: SimplifyInfo>(
             // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal
             let Some(value) = try_cast_literal_to_type(&lit_value, &expr_type) else {
                 return internal_err!(
-                    "Can't cast the literal expr {:?} to type {:?}",
+                    "Can't cast the literal expr {:?} to type {}",
                     &lit_value,
                     &expr_type
                 );
@@ -110,10 +104,8 @@ pub(super) fn unwrap_cast_in_comparison_for_binary<S: SimplifyInfo>(
     }
 }
 
-pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary<
-    S: SimplifyInfo,
->(
-    info: &S,
+pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary(
+    info: &SimplifyContext,
     expr: &Expr,
     op: Operator,
     literal: &Expr,
@@ -126,7 +118,7 @@ pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary<
             | Expr::Cast(Cast {
                 expr: left_expr, ..
             }),
-            Expr::Literal(lit_val),
+            Expr::Literal(lit_val, _),
         ) => {
             let Ok(expr_type) = info.get_data_type(left_expr) else {
                 return false;
@@ -148,10 +140,8 @@ pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary<
     }
 }
 
-pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
-    S: SimplifyInfo,
->(
-    info: &S,
+pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist(
+    info: &SimplifyContext,
     expr: &Expr,
     list: &[Expr],
 ) -> bool {
@@ -183,7 +173,7 @@ pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
         }
 
         match right {
-            Expr::Literal(lit_val)
+            Expr::Literal(lit_val, _)
                 if try_cast_literal_to_type(lit_val, &expr_type).is_some() => {}
             _ => return false,
         }
@@ -192,49 +182,6 @@ pub(super) fn is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist<
     true
 }
 
-/// Returns true if unwrap_cast_in_comparison supports this data type
-fn is_supported_type(data_type: &DataType) -> bool {
-    is_supported_numeric_type(data_type)
-        || is_supported_string_type(data_type)
-        || is_supported_dictionary_type(data_type)
-        || is_supported_binary_type(data_type)
-}
-
-/// Returns true if unwrap_cast_in_comparison support this numeric type
-fn is_supported_numeric_type(data_type: &DataType) -> bool {
-    matches!(
-        data_type,
-        DataType::UInt8
-            | DataType::UInt16
-            | DataType::UInt32
-            | DataType::UInt64
-            | DataType::Int8
-            | DataType::Int16
-            | DataType::Int32
-            | DataType::Int64
-            | DataType::Decimal128(_, _)
-            | DataType::Timestamp(_, _)
-    )
-}
-
-/// Returns true if unwrap_cast_in_comparison supports casting this value as a string
-fn is_supported_string_type(data_type: &DataType) -> bool {
-    matches!(
-        data_type,
-        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
-    )
-}
-
-/// Returns true if unwrap_cast_in_comparison supports casting this value as a dictionary
-fn is_supported_dictionary_type(data_type: &DataType) -> bool {
-    matches!(data_type,
-                    DataType::Dictionary(_, inner) if is_supported_type(inner))
-}
-
-fn is_supported_binary_type(data_type: &DataType) -> bool {
-    matches!(data_type, DataType::Binary | DataType::FixedSizeBinary(_))
-}
-
 ///// Tries to move a cast from an expression (such as column) to the literal other side of a comparison operator./
 ///
 /// Specifically, rewrites
@@ -281,246 +228,6 @@ fn cast_literal_to_type_with_op(
     }
 }
 
-/// Convert a literal value from one data type to another
-pub(super) fn try_cast_literal_to_type(
-    lit_value: &ScalarValue,
-    target_type: &DataType,
-) -> Option<ScalarValue> {
-    let lit_data_type = lit_value.data_type();
-    if !is_supported_type(&lit_data_type) || !is_supported_type(target_type) {
-        return None;
-    }
-    if lit_value.is_null() {
-        // null value can be cast to any type of null value
-        return ScalarValue::try_from(target_type).ok();
-    }
-    try_cast_numeric_literal(lit_value, target_type)
-        .or_else(|| try_cast_string_literal(lit_value, target_type))
-        .or_else(|| try_cast_dictionary(lit_value, target_type))
-        .or_else(|| try_cast_binary(lit_value, target_type))
-}
-
-/// Convert a numeric value from one numeric data type to another
-fn try_cast_numeric_literal(
-    lit_value: &ScalarValue,
-    target_type: &DataType,
-) -> Option<ScalarValue> {
-    let lit_data_type = lit_value.data_type();
-    if !is_supported_numeric_type(&lit_data_type)
-        || !is_supported_numeric_type(target_type)
-    {
-        return None;
-    }
-
-    let mul = match target_type {
-        DataType::UInt8
-        | DataType::UInt16
-        | DataType::UInt32
-        | DataType::UInt64
-        | DataType::Int8
-        | DataType::Int16
-        | DataType::Int32
-        | DataType::Int64 => 1_i128,
-        DataType::Timestamp(_, _) => 1_i128,
-        DataType::Decimal128(_, scale) => 10_i128.pow(*scale as u32),
-        _ => return None,
-    };
-    let (target_min, target_max) = match target_type {
-        DataType::UInt8 => (u8::MIN as i128, u8::MAX as i128),
-        DataType::UInt16 => (u16::MIN as i128, u16::MAX as i128),
-        DataType::UInt32 => (u32::MIN as i128, u32::MAX as i128),
-        DataType::UInt64 => (u64::MIN as i128, u64::MAX as i128),
-        DataType::Int8 => (i8::MIN as i128, i8::MAX as i128),
-        DataType::Int16 => (i16::MIN as i128, i16::MAX as i128),
-        DataType::Int32 => (i32::MIN as i128, i32::MAX as i128),
-        DataType::Int64 => (i64::MIN as i128, i64::MAX as i128),
-        DataType::Timestamp(_, _) => (i64::MIN as i128, i64::MAX as i128),
-        DataType::Decimal128(precision, _) => (
-            // Different precision for decimal128 can store different range of value.
-            // For example, the precision is 3, the max of value is `999` and the min
-            // value is `-999`
-            MIN_DECIMAL128_FOR_EACH_PRECISION[*precision as usize],
-            MAX_DECIMAL128_FOR_EACH_PRECISION[*precision as usize],
-        ),
-        _ => return None,
-    };
-    let lit_value_target_type = match lit_value {
-        ScalarValue::Int8(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::Int16(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::Int32(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::Int64(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::UInt8(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::UInt16(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::UInt32(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::UInt64(Some(v)) => (*v as i128).checked_mul(mul),
-        ScalarValue::TimestampSecond(Some(v), _) => (*v as i128).checked_mul(mul),
-        ScalarValue::TimestampMillisecond(Some(v), _) => (*v as i128).checked_mul(mul),
-        ScalarValue::TimestampMicrosecond(Some(v), _) => (*v as i128).checked_mul(mul),
-        ScalarValue::TimestampNanosecond(Some(v), _) => (*v as i128).checked_mul(mul),
-        ScalarValue::Decimal128(Some(v), _, scale) => {
-            let lit_scale_mul = 10_i128.pow(*scale as u32);
-            if mul >= lit_scale_mul {
-                // Example:
-                // lit is decimal(123,3,2)
-                // target type is decimal(5,3)
-                // the lit can be converted to the decimal(1230,5,3)
-                (*v).checked_mul(mul / lit_scale_mul)
-            } else if (*v) % (lit_scale_mul / mul) == 0 {
-                // Example:
-                // lit is decimal(123000,10,3)
-                // target type is int32: the lit can be converted to INT32(123)
-                // target type is decimal(10,2): the lit can be converted to decimal(12300,10,2)
-                Some(*v / (lit_scale_mul / mul))
-            } else {
-                // can't convert the lit decimal to the target data type
-                None
-            }
-        }
-        _ => None,
-    };
-
-    match lit_value_target_type {
-        None => None,
-        Some(value) => {
-            if value >= target_min && value <= target_max {
-                // the value casted from lit to the target type is in the range of target type.
-                // return the target type of scalar value
-                let result_scalar = match target_type {
-                    DataType::Int8 => ScalarValue::Int8(Some(value as i8)),
-                    DataType::Int16 => ScalarValue::Int16(Some(value as i16)),
-                    DataType::Int32 => ScalarValue::Int32(Some(value as i32)),
-                    DataType::Int64 => ScalarValue::Int64(Some(value as i64)),
-                    DataType::UInt8 => ScalarValue::UInt8(Some(value as u8)),
-                    DataType::UInt16 => ScalarValue::UInt16(Some(value as u16)),
-                    DataType::UInt32 => ScalarValue::UInt32(Some(value as u32)),
-                    DataType::UInt64 => ScalarValue::UInt64(Some(value as u64)),
-                    DataType::Timestamp(TimeUnit::Second, tz) => {
-                        let value = cast_between_timestamp(
-                            &lit_data_type,
-                            &DataType::Timestamp(TimeUnit::Second, tz.clone()),
-                            value,
-                        );
-                        ScalarValue::TimestampSecond(value, tz.clone())
-                    }
-                    DataType::Timestamp(TimeUnit::Millisecond, tz) => {
-                        let value = cast_between_timestamp(
-                            &lit_data_type,
-                            &DataType::Timestamp(TimeUnit::Millisecond, tz.clone()),
-                            value,
-                        );
-                        ScalarValue::TimestampMillisecond(value, tz.clone())
-                    }
-                    DataType::Timestamp(TimeUnit::Microsecond, tz) => {
-                        let value = cast_between_timestamp(
-                            &lit_data_type,
-                            &DataType::Timestamp(TimeUnit::Microsecond, tz.clone()),
-                            value,
-                        );
-                        ScalarValue::TimestampMicrosecond(value, tz.clone())
-                    }
-                    DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
-                        let value = cast_between_timestamp(
-                            &lit_data_type,
-                            &DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()),
-                            value,
-                        );
-                        ScalarValue::TimestampNanosecond(value, tz.clone())
-                    }
-                    DataType::Decimal128(p, s) => {
-                        ScalarValue::Decimal128(Some(value), *p, *s)
-                    }
-                    _ => {
-                        return None;
-                    }
-                };
-                Some(result_scalar)
-            } else {
-                None
-            }
-        }
-    }
-}
-
-fn try_cast_string_literal(
-    lit_value: &ScalarValue,
-    target_type: &DataType,
-) -> Option<ScalarValue> {
-    let string_value = lit_value.try_as_str()?.map(|s| s.to_string());
-    let scalar_value = match target_type {
-        DataType::Utf8 => ScalarValue::Utf8(string_value),
-        DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value),
-        DataType::Utf8View => ScalarValue::Utf8View(string_value),
-        _ => return None,
-    };
-    Some(scalar_value)
-}
-
-/// Attempt to cast to/from a dictionary type by wrapping/unwrapping the dictionary
-fn try_cast_dictionary(
-    lit_value: &ScalarValue,
-    target_type: &DataType,
-) -> Option<ScalarValue> {
-    let lit_value_type = lit_value.data_type();
-    let result_scalar = match (lit_value, target_type) {
-        // Unwrap dictionary when inner type matches target type
-        (ScalarValue::Dictionary(_, inner_value), _)
-            if inner_value.data_type() == *target_type =>
-        {
-            (**inner_value).clone()
-        }
-        // Wrap type when target type is dictionary
-        (_, DataType::Dictionary(index_type, inner_type))
-            if **inner_type == lit_value_type =>
-        {
-            ScalarValue::Dictionary(index_type.clone(), Box::new(lit_value.clone()))
-        }
-        _ => {
-            return None;
-        }
-    };
-    Some(result_scalar)
-}
-
-/// Cast a timestamp value from one unit to another
-fn cast_between_timestamp(from: &DataType, to: &DataType, value: i128) -> Option<i64> {
-    let value = value as i64;
-    let from_scale = match from {
-        DataType::Timestamp(TimeUnit::Second, _) => 1,
-        DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS,
-        DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS,
-        DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS,
-        _ => return Some(value),
-    };
-
-    let to_scale = match to {
-        DataType::Timestamp(TimeUnit::Second, _) => 1,
-        DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS,
-        DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS,
-        DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS,
-        _ => return Some(value),
-    };
-
-    match from_scale.cmp(&to_scale) {
-        Ordering::Less => value.checked_mul(to_scale / from_scale),
-        Ordering::Greater => Some(value / (from_scale / to_scale)),
-        Ordering::Equal => Some(value),
-    }
-}
-
-fn try_cast_binary(
-    lit_value: &ScalarValue,
-    target_type: &DataType,
-) -> Option<ScalarValue> {
-    match (lit_value, target_type) {
-        (ScalarValue::Binary(Some(v)), DataType::FixedSizeBinary(n))
-            if v.len() == *n as usize =>
-        {
-            Some(ScalarValue::FixedSizeBinary(*n, Some(v.clone())))
-        }
-        _ => None,
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -528,10 +235,8 @@ mod tests {
     use std::sync::Arc;
 
     use crate::simplify_expressions::ExprSimplifier;
-    use arrow::compute::{cast_with_options, CastOptions};
-    use arrow::datatypes::Field;
+    use arrow::datatypes::{Field, TimeUnit};
     use datafusion_common::{DFSchema, DFSchemaRef};
-    use datafusion_expr::execution_props::ExecutionProps;
     use datafusion_expr::simplify::SimplifyContext;
     use datafusion_expr::{cast, col, in_list, try_cast};
 
@@ -586,9 +291,9 @@ mod tests {
         let expected = col("c2").eq(lit(16i64));
         assert_eq!(optimize_test(c2_eq_lit, &schema), expected);
 
-        // cast(c1, INT64) < INT64(NULL) => INT32(c1) < INT32(NULL)
+        // cast(c1, INT64) < INT64(NULL) => NULL
         let c1_lt_lit_null = cast(col("c1"), DataType::Int64).lt(null_i64());
-        let expected = col("c1").lt(null_i32());
+        let expected = null_bool();
         assert_eq!(optimize_test(c1_lt_lit_null, &schema), expected);
 
         // cast(INT8(NULL), INT32) < INT32(12) => INT8(NULL) < INT8(12) => BOOL(NULL)
@@ -606,9 +311,9 @@ mod tests {
         let expected = col("c1").not_eq(lit(123i32));
         assert_eq!(optimize_test(expr_input, &schema), expected);
 
-        // cast(c1, UTF8) = NULL => c1 = NULL
+        // cast(c1, UTF8) = NULL => NULL
         let expr_input = cast(col("c1"), DataType::Utf8).eq(lit(ScalarValue::Utf8(None)));
-        let expected = col("c1").eq(lit(ScalarValue::Int32(None)));
+        let expected = null_bool();
         assert_eq!(optimize_test(expr_input, &schema), expected);
     }
 
@@ -711,7 +416,7 @@ mod tests {
 
         // c3 < INT64(NULL)
         let c1_lt_lit_null = cast(col("c3"), DataType::Int64).lt(null_i64());
-        let expected = col("c3").lt(null_decimal(18, 2));
+        let expected = null_bool();
         assert_eq!(optimize_test(c1_lt_lit_null, &schema), expected);
 
         // decimal to decimal
@@ -882,9 +587,8 @@ mod tests {
     }
 
     fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr {
-        let props = ExecutionProps::new();
         let simplifier = ExprSimplifier::new(
-            SimplifyContext::new(&props).with_schema(Arc::clone(schema)),
+            SimplifyContext::default().with_schema(Arc::clone(schema)),
         );
 
         simplifier.simplify(expr).unwrap()
@@ -942,10 +646,6 @@ mod tests {
         lit(ScalarValue::TimestampNanosecond(Some(ts), utc))
     }
 
-    fn null_decimal(precision: u8, scale: i8) -> Expr {
-        lit(ScalarValue::Decimal128(None, precision, scale))
-    }
-
     fn timestamp_nano_none_type() -> DataType {
         DataType::Timestamp(TimeUnit::Nanosecond, None)
     }
@@ -960,523 +660,4 @@ mod tests {
     fn dictionary_tag_type() -> DataType {
         DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
     }
-
-    #[test]
-    fn test_try_cast_to_type_nulls() {
-        // test that nulls can be cast to/from all integer types
-        let scalars = vec![
-            ScalarValue::Int8(None),
-            ScalarValue::Int16(None),
-            ScalarValue::Int32(None),
-            ScalarValue::Int64(None),
-            ScalarValue::UInt8(None),
-            ScalarValue::UInt16(None),
-            ScalarValue::UInt32(None),
-            ScalarValue::UInt64(None),
-            ScalarValue::Decimal128(None, 3, 0),
-            ScalarValue::Decimal128(None, 8, 2),
-            ScalarValue::Utf8(None),
-            ScalarValue::LargeUtf8(None),
-        ];
-
-        for s1 in &scalars {
-            for s2 in &scalars {
-                let expected_value = ExpectedCast::Value(s2.clone());
-
-                expect_cast(s1.clone(), s2.data_type(), expected_value);
-            }
-        }
-    }
-
-    #[test]
-    fn test_try_cast_to_type_int_in_range() {
-        // test values that can be cast to/from all integer types
-        let scalars = vec![
-            ScalarValue::Int8(Some(123)),
-            ScalarValue::Int16(Some(123)),
-            ScalarValue::Int32(Some(123)),
-            ScalarValue::Int64(Some(123)),
-            ScalarValue::UInt8(Some(123)),
-            ScalarValue::UInt16(Some(123)),
-            ScalarValue::UInt32(Some(123)),
-            ScalarValue::UInt64(Some(123)),
-            ScalarValue::Decimal128(Some(123), 3, 0),
-            ScalarValue::Decimal128(Some(12300), 8, 2),
-        ];
-
-        for s1 in &scalars {
-            for s2 in &scalars {
-                let expected_value = ExpectedCast::Value(s2.clone());
-
-                expect_cast(s1.clone(), s2.data_type(), expected_value);
-            }
-        }
-
-        let max_i32 = ScalarValue::Int32(Some(i32::MAX));
-        expect_cast(
-            max_i32,
-            DataType::UInt64,
-            ExpectedCast::Value(ScalarValue::UInt64(Some(i32::MAX as u64))),
-        );
-
-        let min_i32 = ScalarValue::Int32(Some(i32::MIN));
-        expect_cast(
-            min_i32,
-            DataType::Int64,
-            ExpectedCast::Value(ScalarValue::Int64(Some(i32::MIN as i64))),
-        );
-
-        let max_i64 = ScalarValue::Int64(Some(i64::MAX));
-        expect_cast(
-            max_i64,
-            DataType::UInt64,
-            ExpectedCast::Value(ScalarValue::UInt64(Some(i64::MAX as u64))),
-        );
-    }
-
-    #[test]
-    fn test_try_cast_to_type_int_out_of_range() {
-        let min_i32 = ScalarValue::Int32(Some(i32::MIN));
-        let min_i64 = ScalarValue::Int64(Some(i64::MIN));
-        let max_i64 = ScalarValue::Int64(Some(i64::MAX));
-        let max_u64 = ScalarValue::UInt64(Some(u64::MAX));
-
-        expect_cast(max_i64.clone(), DataType::Int8, ExpectedCast::NoValue);
-
-        expect_cast(max_i64.clone(), DataType::Int16, ExpectedCast::NoValue);
-
-        expect_cast(max_i64, DataType::Int32, ExpectedCast::NoValue);
-
-        expect_cast(max_u64, DataType::Int64, ExpectedCast::NoValue);
-
-        expect_cast(min_i64, DataType::UInt64, ExpectedCast::NoValue);
-
-        expect_cast(min_i32, DataType::UInt64, ExpectedCast::NoValue);
-
-        // decimal out of range
-        expect_cast(
-            ScalarValue::Decimal128(Some(99999999999999999999999999999999999900), 38, 0),
-            DataType::Int64,
-            ExpectedCast::NoValue,
-        );
-
-        expect_cast(
-            ScalarValue::Decimal128(Some(-9999999999999999999999999999999999), 37, 1),
-            DataType::Int64,
-            ExpectedCast::NoValue,
-        );
-    }
-
-    #[test]
-    fn test_try_decimal_cast_in_range() {
-        expect_cast(
-            ScalarValue::Decimal128(Some(12300), 5, 2),
-            DataType::Decimal128(3, 0),
-            ExpectedCast::Value(ScalarValue::Decimal128(Some(123), 3, 0)),
-        );
-
-        expect_cast(
-            ScalarValue::Decimal128(Some(12300), 5, 2),
-            DataType::Decimal128(8, 0),
-            ExpectedCast::Value(ScalarValue::Decimal128(Some(123), 8, 0)),
-        );
-
-        expect_cast(
-            ScalarValue::Decimal128(Some(12300), 5, 2),
-            DataType::Decimal128(8, 5),
-            ExpectedCast::Value(ScalarValue::Decimal128(Some(12300000), 8, 5)),
-        );
-    }
-
-    #[test]
-    fn test_try_decimal_cast_out_of_range() {
-        // decimal would lose precision
-        expect_cast(
-            ScalarValue::Decimal128(Some(12345), 5, 2),
-            DataType::Decimal128(3, 0),
-            ExpectedCast::NoValue,
-        );
-
-        // decimal would lose precision
-        expect_cast(
-            ScalarValue::Decimal128(Some(12300), 5, 2),
-            DataType::Decimal128(2, 0),
-            ExpectedCast::NoValue,
-        );
-    }
-
-    #[test]
-    fn test_try_cast_to_type_timestamps() {
-        for time_unit in [
-            TimeUnit::Second,
-            TimeUnit::Millisecond,
-            TimeUnit::Microsecond,
-            TimeUnit::Nanosecond,
-        ] {
-            let utc = Some("+00:00".into());
-            // No timezone, utc timezone
-            let (lit_tz_none, lit_tz_utc) = match time_unit {
-                TimeUnit::Second => (
-                    ScalarValue::TimestampSecond(Some(12345), None),
-                    ScalarValue::TimestampSecond(Some(12345), utc),
-                ),
-
-                TimeUnit::Millisecond => (
-                    ScalarValue::TimestampMillisecond(Some(12345), None),
-                    ScalarValue::TimestampMillisecond(Some(12345), utc),
-                ),
-
-                TimeUnit::Microsecond => (
-                    ScalarValue::TimestampMicrosecond(Some(12345), None),
-                    ScalarValue::TimestampMicrosecond(Some(12345), utc),
-                ),
-
-                TimeUnit::Nanosecond => (
-                    ScalarValue::TimestampNanosecond(Some(12345), None),
-                    ScalarValue::TimestampNanosecond(Some(12345), utc),
-                ),
-            };
-
-            // DataFusion ignores timezones for comparisons of ScalarValue
-            // so double check it here
-            assert_eq!(lit_tz_none, lit_tz_utc);
-
-            // e.g. DataType::Timestamp(_, None)
-            let dt_tz_none = lit_tz_none.data_type();
-
-            // e.g. DataType::Timestamp(_, Some(utc))
-            let dt_tz_utc = lit_tz_utc.data_type();
-
-            // None <--> None
-            expect_cast(
-                lit_tz_none.clone(),
-                dt_tz_none.clone(),
-                ExpectedCast::Value(lit_tz_none.clone()),
-            );
-
-            // None <--> Utc
-            expect_cast(
-                lit_tz_none.clone(),
-                dt_tz_utc.clone(),
-                ExpectedCast::Value(lit_tz_utc.clone()),
-            );
-
-            // Utc <--> None
-            expect_cast(
-                lit_tz_utc.clone(),
-                dt_tz_none.clone(),
-                ExpectedCast::Value(lit_tz_none.clone()),
-            );
-
-            // Utc <--> Utc
-            expect_cast(
-                lit_tz_utc.clone(),
-                dt_tz_utc.clone(),
-                ExpectedCast::Value(lit_tz_utc.clone()),
-            );
-
-            // timestamp to int64
-            expect_cast(
-                lit_tz_utc.clone(),
-                DataType::Int64,
-                ExpectedCast::Value(ScalarValue::Int64(Some(12345))),
-            );
-
-            // int64 to timestamp
-            expect_cast(
-                ScalarValue::Int64(Some(12345)),
-                dt_tz_none.clone(),
-                ExpectedCast::Value(lit_tz_none.clone()),
-            );
-
-            // int64 to timestamp
-            expect_cast(
-                ScalarValue::Int64(Some(12345)),
-                dt_tz_utc.clone(),
-                ExpectedCast::Value(lit_tz_utc.clone()),
-            );
-
-            // timestamp to string (not supported yet)
-            expect_cast(
-                lit_tz_utc.clone(),
-                DataType::LargeUtf8,
-                ExpectedCast::NoValue,
-            );
-        }
-    }
-
-    #[test]
-    fn test_try_cast_to_type_unsupported() {
-        // int64 to list
-        expect_cast(
-            ScalarValue::Int64(Some(12345)),
-            DataType::List(Arc::new(Field::new("f", DataType::Int32, true))),
-            ExpectedCast::NoValue,
-        );
-    }
-
-    #[derive(Debug, Clone)]
-    enum ExpectedCast {
-        /// test successfully cast value and it is as specified
-        Value(ScalarValue),
-        /// test returned OK, but could not cast the value
-        NoValue,
-    }
-
-    /// Runs try_cast_literal_to_type with the specified inputs and
-    /// ensure it computes the expected output, and ensures the
-    /// casting is consistent with the Arrow kernels
-    fn expect_cast(
-        literal: ScalarValue,
-        target_type: DataType,
-        expected_result: ExpectedCast,
-    ) {
-        let actual_value = try_cast_literal_to_type(&literal, &target_type);
-
-        println!("expect_cast: ");
-        println!("  {literal:?} --> {target_type:?}");
-        println!("  expected_result: {expected_result:?}");
-        println!("  actual_result:   {actual_value:?}");
-
-        match expected_result {
-            ExpectedCast::Value(expected_value) => {
-                let actual_value =
-                    actual_value.expect("Expected cast value but got None");
-
-                assert_eq!(actual_value, expected_value);
-
-                // Verify that calling the arrow
-                // cast kernel yields the same results
-                // input array
-                let literal_array = literal
-                    .to_array_of_size(1)
-                    .expect("Failed to convert to array of size");
-                let expected_array = expected_value
-                    .to_array_of_size(1)
-                    .expect("Failed to convert to array of size");
-                let cast_array = cast_with_options(
-                    &literal_array,
-                    &target_type,
-                    &CastOptions::default(),
-                )
-                .expect("Expected to be cast array with arrow cast kernel");
-
-                assert_eq!(
-                    &expected_array, &cast_array,
-                    "Result of casting {literal:?} with arrow was\n {cast_array:#?}\nbut expected\n{expected_array:#?}"
-                );
-
-                // Verify that for timestamp types the timezones are the same
-                // (ScalarValue::cmp doesn't account for timezones);
-                if let (
-                    DataType::Timestamp(left_unit, left_tz),
-                    DataType::Timestamp(right_unit, right_tz),
-                ) = (actual_value.data_type(), expected_value.data_type())
-                {
-                    assert_eq!(left_unit, right_unit);
-                    assert_eq!(left_tz, right_tz);
-                }
-            }
-            ExpectedCast::NoValue => {
-                assert!(
-                    actual_value.is_none(),
-                    "Expected no cast value, but got {actual_value:?}"
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn test_try_cast_literal_to_timestamp() {
-        // same timestamp
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampNanosecond(Some(123456), None),
-            &DataType::Timestamp(TimeUnit::Nanosecond, None),
-        )
-        .unwrap();
-
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampNanosecond(Some(123456), None)
-        );
-
-        // TimestampNanosecond to TimestampMicrosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampNanosecond(Some(123456), None),
-            &DataType::Timestamp(TimeUnit::Microsecond, None),
-        )
-        .unwrap();
-
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampMicrosecond(Some(123), None)
-        );
-
-        // TimestampNanosecond to TimestampMillisecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampNanosecond(Some(123456), None),
-            &DataType::Timestamp(TimeUnit::Millisecond, None),
-        )
-        .unwrap();
-
-        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(Some(0), None));
-
-        // TimestampNanosecond to TimestampSecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampNanosecond(Some(123456), None),
-            &DataType::Timestamp(TimeUnit::Second, None),
-        )
-        .unwrap();
-
-        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(0), None));
-
-        // TimestampMicrosecond to TimestampNanosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMicrosecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Nanosecond, None),
-        )
-        .unwrap();
-
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampNanosecond(Some(123000), None)
-        );
-
-        // TimestampMicrosecond to TimestampMillisecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMicrosecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Millisecond, None),
-        )
-        .unwrap();
-
-        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(Some(0), None));
-
-        // TimestampMicrosecond to TimestampSecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMicrosecond(Some(123456789), None),
-            &DataType::Timestamp(TimeUnit::Second, None),
-        )
-        .unwrap();
-        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123), None));
-
-        // TimestampMillisecond to TimestampNanosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMillisecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Nanosecond, None),
-        )
-        .unwrap();
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampNanosecond(Some(123000000), None)
-        );
-
-        // TimestampMillisecond to TimestampMicrosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMillisecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Microsecond, None),
-        )
-        .unwrap();
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampMicrosecond(Some(123000), None)
-        );
-        // TimestampMillisecond to TimestampSecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampMillisecond(Some(123456789), None),
-            &DataType::Timestamp(TimeUnit::Second, None),
-        )
-        .unwrap();
-        assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123456), None));
-
-        // TimestampSecond to TimestampNanosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampSecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Nanosecond, None),
-        )
-        .unwrap();
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampNanosecond(Some(123000000000), None)
-        );
-
-        // TimestampSecond to TimestampMicrosecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampSecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Microsecond, None),
-        )
-        .unwrap();
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampMicrosecond(Some(123000000), None)
-        );
-
-        // TimestampSecond to TimestampMillisecond
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampSecond(Some(123), None),
-            &DataType::Timestamp(TimeUnit::Millisecond, None),
-        )
-        .unwrap();
-        assert_eq!(
-            new_scalar,
-            ScalarValue::TimestampMillisecond(Some(123000), None)
-        );
-
-        // overflow
-        let new_scalar = try_cast_literal_to_type(
-            &ScalarValue::TimestampSecond(Some(i64::MAX), None),
-            &DataType::Timestamp(TimeUnit::Millisecond, None),
-        )
-        .unwrap();
-        assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(None, None));
-    }
-
-    #[test]
-    fn test_try_cast_to_string_type() {
-        let scalars = vec![
-            ScalarValue::from("string"),
-            ScalarValue::LargeUtf8(Some("string".to_owned())),
-        ];
-
-        for s1 in &scalars {
-            for s2 in &scalars {
-                let expected_value = ExpectedCast::Value(s2.clone());
-
-                expect_cast(s1.clone(), s2.data_type(), expected_value);
-            }
-        }
-    }
-    #[test]
-    fn test_try_cast_to_dictionary_type() {
-        fn dictionary_type(t: DataType) -> DataType {
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(t))
-        }
-        fn dictionary_value(value: ScalarValue) -> ScalarValue {
-            ScalarValue::Dictionary(Box::new(DataType::Int32), Box::new(value))
-        }
-        let scalars = vec![
-            ScalarValue::from("string"),
-            ScalarValue::LargeUtf8(Some("string".to_owned())),
-        ];
-        for s in &scalars {
-            expect_cast(
-                s.clone(),
-                dictionary_type(s.data_type()),
-                ExpectedCast::Value(dictionary_value(s.clone())),
-            );
-            expect_cast(
-                dictionary_value(s.clone()),
-                s.data_type(),
-                ExpectedCast::Value(s.clone()),
-            )
-        }
-    }
-
-    #[test]
-    fn try_cast_to_fixed_size_binary() {
-        expect_cast(
-            ScalarValue::Binary(Some(vec![1, 2, 3])),
-            DataType::FixedSizeBinary(3),
-            ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1, 2, 3]))),
-        )
-    }
 }
diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs
index cf182175e48ee..b0908b47602f7 100644
--- a/datafusion/optimizer/src/simplify_expressions/utils.rs
+++ b/datafusion/optimizer/src/simplify_expressions/utils.rs
@@ -17,11 +17,12 @@
 
 //! Utility functions for expression simplification
 
-use datafusion_common::{internal_err, Result, ScalarValue};
+use arrow::datatypes::i256;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::{
+    Case, Expr, Like, Operator,
     expr::{Between, BinaryExpr, InList},
     expr_fn::{and, bitwise_and, bitwise_or, or},
-    Expr, Like, Operator,
 };
 
 pub static POWS_OF_TEN: [i128; 38] = [
@@ -139,47 +140,59 @@ pub fn delete_xor_in_complex_expr(expr: &Expr, needle: &Expr, is_left: bool) ->
 
 pub fn is_zero(s: &Expr) -> bool {
     match s {
-        Expr::Literal(ScalarValue::Int8(Some(0)))
-        | Expr::Literal(ScalarValue::Int16(Some(0)))
-        | Expr::Literal(ScalarValue::Int32(Some(0)))
-        | Expr::Literal(ScalarValue::Int64(Some(0)))
-        | Expr::Literal(ScalarValue::UInt8(Some(0)))
-        | Expr::Literal(ScalarValue::UInt16(Some(0)))
-        | Expr::Literal(ScalarValue::UInt32(Some(0)))
-        | Expr::Literal(ScalarValue::UInt64(Some(0))) => true,
-        Expr::Literal(ScalarValue::Float32(Some(v))) if *v == 0. => true,
-        Expr::Literal(ScalarValue::Float64(Some(v))) if *v == 0. => true,
-        Expr::Literal(ScalarValue::Decimal128(Some(v), _p, _s)) if *v == 0 => true,
+        Expr::Literal(ScalarValue::Int8(Some(0)), _)
+        | Expr::Literal(ScalarValue::Int16(Some(0)), _)
+        | Expr::Literal(ScalarValue::Int32(Some(0)), _)
+        | Expr::Literal(ScalarValue::Int64(Some(0)), _)
+        | Expr::Literal(ScalarValue::UInt8(Some(0)), _)
+        | Expr::Literal(ScalarValue::UInt16(Some(0)), _)
+        | Expr::Literal(ScalarValue::UInt32(Some(0)), _)
+        | Expr::Literal(ScalarValue::UInt64(Some(0)), _) => true,
+        Expr::Literal(ScalarValue::Float32(Some(v)), _) if *v == 0. => true,
+        Expr::Literal(ScalarValue::Float64(Some(v)), _) if *v == 0. => true,
+        Expr::Literal(ScalarValue::Decimal128(Some(v), _p, _s), _) if *v == 0 => true,
+        Expr::Literal(ScalarValue::Decimal256(Some(v), _p, _s), _)
+            if *v == i256::ZERO =>
+        {
+            true
+        }
         _ => false,
     }
 }
 
 pub fn is_one(s: &Expr) -> bool {
     match s {
-        Expr::Literal(ScalarValue::Int8(Some(1)))
-        | Expr::Literal(ScalarValue::Int16(Some(1)))
-        | Expr::Literal(ScalarValue::Int32(Some(1)))
-        | Expr::Literal(ScalarValue::Int64(Some(1)))
-        | Expr::Literal(ScalarValue::UInt8(Some(1)))
-        | Expr::Literal(ScalarValue::UInt16(Some(1)))
-        | Expr::Literal(ScalarValue::UInt32(Some(1)))
-        | Expr::Literal(ScalarValue::UInt64(Some(1))) => true,
-        Expr::Literal(ScalarValue::Float32(Some(v))) if *v == 1. => true,
-        Expr::Literal(ScalarValue::Float64(Some(v))) if *v == 1. => true,
-        Expr::Literal(ScalarValue::Decimal128(Some(v), _p, s)) => {
+        Expr::Literal(ScalarValue::Int8(Some(1)), _)
+        | Expr::Literal(ScalarValue::Int16(Some(1)), _)
+        | Expr::Literal(ScalarValue::Int32(Some(1)), _)
+        | Expr::Literal(ScalarValue::Int64(Some(1)), _)
+        | Expr::Literal(ScalarValue::UInt8(Some(1)), _)
+        | Expr::Literal(ScalarValue::UInt16(Some(1)), _)
+        | Expr::Literal(ScalarValue::UInt32(Some(1)), _)
+        | Expr::Literal(ScalarValue::UInt64(Some(1)), _) => true,
+        Expr::Literal(ScalarValue::Float32(Some(v)), _) if *v == 1. => true,
+        Expr::Literal(ScalarValue::Float64(Some(v)), _) if *v == 1. => true,
+        Expr::Literal(ScalarValue::Decimal128(Some(v), _p, s), _) => {
             *s >= 0
                 && POWS_OF_TEN
                     .get(*s as usize)
                     .map(|x| x == v)
                     .unwrap_or_default()
         }
+        Expr::Literal(ScalarValue::Decimal256(Some(v), _p, s), _) => {
+            *s >= 0
+                && match i256::from(10).checked_pow(*s as u32) {
+                    Some(res) => res == *v,
+                    None => false,
+                }
+        }
         _ => false,
     }
 }
 
 pub fn is_true(expr: &Expr) -> bool {
     match expr {
-        Expr::Literal(ScalarValue::Boolean(Some(v))) => *v,
+        Expr::Literal(ScalarValue::Boolean(Some(v)), _) => *v,
         _ => false,
     }
 }
@@ -187,24 +200,24 @@ pub fn is_true(expr: &Expr) -> bool {
 /// returns true if expr is a
 /// `Expr::Literal(ScalarValue::Boolean(v))` , false otherwise
 pub fn is_bool_lit(expr: &Expr) -> bool {
-    matches!(expr, Expr::Literal(ScalarValue::Boolean(_)))
+    matches!(expr, Expr::Literal(ScalarValue::Boolean(_), _))
 }
 
 /// Return a literal NULL value of Boolean data type
 pub fn lit_bool_null() -> Expr {
-    Expr::Literal(ScalarValue::Boolean(None))
+    Expr::Literal(ScalarValue::Boolean(None), None)
 }
 
 pub fn is_null(expr: &Expr) -> bool {
     match expr {
-        Expr::Literal(v) => v.is_null(),
+        Expr::Literal(v, _) => v.is_null(),
         _ => false,
     }
 }
 
 pub fn is_false(expr: &Expr) -> bool {
     match expr {
-        Expr::Literal(ScalarValue::Boolean(Some(v))) => !(*v),
+        Expr::Literal(ScalarValue::Boolean(Some(v)), _) => !(*v),
         _ => false,
     }
 }
@@ -247,11 +260,84 @@ pub fn is_negative_of(not_expr: &Expr, expr: &Expr) -> bool {
 /// `Expr::Literal(ScalarValue::Boolean(v))`.
 pub fn as_bool_lit(expr: &Expr) -> Result<Option<bool>> {
     match expr {
-        Expr::Literal(ScalarValue::Boolean(v)) => Ok(*v),
+        Expr::Literal(ScalarValue::Boolean(v), _) => Ok(*v),
         _ => internal_err!("Expected boolean literal, got {expr:?}"),
     }
 }
 
+pub fn is_case_with_literal_outputs(expr: &Expr) -> bool {
+    match expr {
+        Expr::Case(Case {
+            expr: None,
+            when_then_expr,
+            else_expr,
+        }) => {
+            when_then_expr.iter().all(|(_, then)| is_lit(then))
+                && else_expr.as_deref().is_none_or(is_lit)
+        }
+        _ => false,
+    }
+}
+
+pub fn into_case(expr: Expr) -> Result<Case> {
+    match expr {
+        Expr::Case(case) => Ok(case),
+        _ => internal_err!("Expected case, got {expr:?}"),
+    }
+}
+
+pub fn is_lit(expr: &Expr) -> bool {
+    matches!(expr, Expr::Literal(_, _))
+}
+
+/// Checks if `eq_expr` is `A = L1` and `ne_expr` is `A != L2` where L1 != L2.
+/// This pattern can be simplified to just `A = L1` since if A equals L1
+/// and L1 is different from L2, then A is automatically not equal to L2.
+pub fn is_eq_and_ne_with_different_literal(eq_expr: &Expr, ne_expr: &Expr) -> bool {
+    fn extract_var_and_literal(expr: &Expr) -> Option<(&Expr, &Expr)> {
+        match expr {
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::Eq,
+                right,
+            })
+            | Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::NotEq,
+                right,
+            }) => match (left.as_ref(), right.as_ref()) {
+                (Expr::Literal(_, _), var) => Some((var, left)),
+                (var, Expr::Literal(_, _)) => Some((var, right)),
+                _ => None,
+            },
+            _ => None,
+        }
+    }
+    match (eq_expr, ne_expr) {
+        (
+            Expr::BinaryExpr(BinaryExpr {
+                op: Operator::Eq, ..
+            }),
+            Expr::BinaryExpr(BinaryExpr {
+                op: Operator::NotEq,
+                ..
+            }),
+        ) => {
+            // Check if both compare the same expression against different literals
+            if let (Some((var1, lit1)), Some((var2, lit2))) = (
+                extract_var_and_literal(eq_expr),
+                extract_var_and_literal(ne_expr),
+            ) && var1 == var2
+                && lit1 != lit2
+            {
+                return true;
+            }
+            false
+        }
+        _ => false,
+    }
+}
+
 /// negate a Not clause
 /// input is the clause to be negated.(args of Not clause)
 /// For BinaryExpr, use the negation of op instead.
@@ -365,3 +451,78 @@ pub fn distribute_negation(expr: Expr) -> Expr {
         _ => Expr::Negative(Box::new(expr)),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{is_one, is_zero};
+    use arrow::datatypes::i256;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::lit;
+
+    #[test]
+    fn test_is_zero() {
+        assert!(is_zero(&lit(ScalarValue::Int8(Some(0)))));
+        assert!(is_zero(&lit(ScalarValue::Float32(Some(0.0)))));
+        assert!(is_zero(&lit(ScalarValue::Decimal128(
+            Some(i128::from(0)),
+            9,
+            0
+        ))));
+        assert!(is_zero(&lit(ScalarValue::Decimal128(
+            Some(i128::from(0)),
+            9,
+            5
+        ))));
+        assert!(is_zero(&lit(ScalarValue::Decimal256(
+            Some(i256::ZERO),
+            9,
+            0
+        ))));
+        assert!(is_zero(&lit(ScalarValue::Decimal256(
+            Some(i256::ZERO),
+            9,
+            5
+        ))));
+    }
+
+    #[test]
+    fn test_is_one() {
+        assert!(is_one(&lit(ScalarValue::Int8(Some(1)))));
+        assert!(is_one(&lit(ScalarValue::Float32(Some(1.0)))));
+        assert!(is_one(&lit(ScalarValue::Decimal128(
+            Some(i128::from(1)),
+            9,
+            0
+        ))));
+        assert!(is_one(&lit(ScalarValue::Decimal128(
+            Some(i128::from(10)),
+            9,
+            1
+        ))));
+        assert!(is_one(&lit(ScalarValue::Decimal128(
+            Some(i128::from(100)),
+            9,
+            2
+        ))));
+        assert!(is_one(&lit(ScalarValue::Decimal256(
+            Some(i256::from(1)),
+            9,
+            0
+        ))));
+        assert!(is_one(&lit(ScalarValue::Decimal256(
+            Some(i256::from(10)),
+            9,
+            1
+        ))));
+        assert!(is_one(&lit(ScalarValue::Decimal256(
+            Some(i256::from(100)),
+            9,
+            2
+        ))));
+        assert!(!is_one(&lit(ScalarValue::Decimal256(
+            Some(i256::from(100)),
+            9,
+            -1
+        ))));
+    }
+}
diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs
index 50783a214342e..00c8fab228117 100644
--- a/datafusion/optimizer/src/single_distinct_to_groupby.rs
+++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs
@@ -23,15 +23,14 @@ use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
 use datafusion_common::{
-    internal_err, tree_node::Transformed, DataFusionError, HashSet, Result,
+    DataFusionError, HashSet, Result, assert_eq_or_internal_err, tree_node::Transformed,
 };
 use datafusion_expr::builder::project;
 use datafusion_expr::expr::AggregateFunctionParams;
 use datafusion_expr::{
-    col,
+    Expr, col,
     expr::AggregateFunction,
     logical_plan::{Aggregate, LogicalPlan},
-    Expr,
 };
 
 /// single distinct to group by optimizer rule
@@ -56,7 +55,7 @@ pub struct SingleDistinctToGroupBy {}
 const SINGLE_DISTINCT_ALIAS: &str = "alias1";
 
 impl SingleDistinctToGroupBy {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -79,7 +78,7 @@ fn is_single_distinct_agg(aggr_expr: &[Expr]) -> Result<bool> {
                 },
         }) = expr
         {
-            if filter.is_some() || order_by.is_some() {
+            if filter.is_some() || !order_by.is_empty() {
                 return Ok(false);
             }
             aggregate_count += 1;
@@ -183,15 +182,25 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                     .map(|aggr_expr| match aggr_expr {
                         Expr::AggregateFunction(AggregateFunction {
                             func,
-                            params: AggregateFunctionParams { mut args, distinct, .. }
+                            params:
+                                AggregateFunctionParams {
+                                    mut args,
+                                    distinct,
+                                    filter,
+                                    order_by,
+                                    null_treatment,
+                                },
                         }) => {
                             if distinct {
-                                if args.len() != 1 {
-                                    return internal_err!("DISTINCT aggregate should have exactly one argument");
-                                }
+                                assert_eq_or_internal_err!(
+                                    args.len(),
+                                    1,
+                                    "DISTINCT aggregate should have exactly one argument"
+                                );
                                 let arg = args.swap_remove(0);
 
-                                if group_fields_set.insert(arg.schema_name().to_string()) {
+                                if group_fields_set.insert(arg.schema_name().to_string())
+                                {
                                     inner_group_exprs
                                         .push(arg.alias(SINGLE_DISTINCT_ALIAS));
                                 }
@@ -199,9 +208,9 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                                     func,
                                     vec![col(SINGLE_DISTINCT_ALIAS)],
                                     false, // intentional to remove distinct here
-                                    None,
-                                    None,
-                                    None,
+                                    filter,
+                                    order_by,
+                                    null_treatment,
                                 )))
                                 // if the aggregate function is not distinct, we need to rewrite it like two phase aggregation
                             } else {
@@ -212,9 +221,9 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                                         Arc::clone(&func),
                                         args,
                                         false,
-                                        None,
-                                        None,
-                                        None,
+                                        filter,
+                                        order_by,
+                                        null_treatment,
                                     ))
                                     .alias(&alias_str),
                                 );
@@ -223,7 +232,7 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                                     vec![col(&alias_str)],
                                     false,
                                     None,
-                                    None,
+                                    vec![],
                                     None,
                                 )))
                             }
@@ -282,8 +291,8 @@ mod tests {
     use super::*;
     use crate::assert_optimized_plan_eq_display_indent_snapshot;
     use crate::test::*;
-    use datafusion_expr::expr::GroupingSet;
     use datafusion_expr::ExprFunctionExt;
+    use datafusion_expr::expr::GroupingSet;
     use datafusion_expr::{lit, logical_plan::builder::LogicalPlanBuilder};
     use datafusion_functions_aggregate::count::count_udaf;
     use datafusion_functions_aggregate::expr_fn::{count, count_distinct, max, min, sum};
@@ -296,7 +305,7 @@ mod tests {
             vec![expr],
             true,
             None,
-            None,
+            vec![],
             None,
         ))
     }
@@ -627,7 +636,7 @@ mod tests {
             vec![col("a")],
             false,
             Some(Box::new(col("a").gt(lit(5)))),
-            None,
+            vec![],
             None,
         ));
         let plan = LogicalPlanBuilder::from(table_scan)
@@ -678,7 +687,7 @@ mod tests {
             vec![col("a")],
             false,
             None,
-            Some(vec![col("a").sort(true, false)]),
+            vec![col("a").sort(true, false)],
             None,
         ));
         let plan = LogicalPlanBuilder::from(table_scan)
diff --git a/datafusion/optimizer/src/test/mod.rs b/datafusion/optimizer/src/test/mod.rs
index 6e0b734bb9280..2915e77be2e12 100644
--- a/datafusion/optimizer/src/test/mod.rs
+++ b/datafusion/optimizer/src/test/mod.rs
@@ -20,10 +20,11 @@ use crate::optimizer::Optimizer;
 use crate::{OptimizerContext, OptimizerRule};
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{assert_contains, Result};
-use datafusion_expr::{logical_plan::table_scan, LogicalPlan, LogicalPlanBuilder};
+use datafusion_common::{Result, assert_contains};
+use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, logical_plan::table_scan};
 use std::sync::Arc;
 
+pub mod udfs;
 pub mod user_defined;
 
 pub fn test_table_scan_fields() -> Vec<Field> {
@@ -34,6 +35,28 @@ pub fn test_table_scan_fields() -> Vec<Field> {
     ]
 }
 
+pub fn test_table_scan_with_struct_fields() -> Vec<Field> {
+    vec![
+        Field::new("id", DataType::UInt32, false),
+        Field::new(
+            "user",
+            DataType::Struct(
+                vec![
+                    Field::new("name", DataType::Utf8, true),
+                    Field::new("status", DataType::Utf8, true),
+                ]
+                .into(),
+            ),
+            true,
+        ),
+    ]
+}
+
+pub fn test_table_scan_with_struct() -> Result<LogicalPlan> {
+    let schema = Schema::new(test_table_scan_with_struct_fields());
+    table_scan(Some("test"), &schema, None)?.build()
+}
+
 /// some tests share a common table with different names
 pub fn test_table_scan_with_name(name: &str) -> Result<LogicalPlan> {
     let schema = Schema::new(test_table_scan_fields());
diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs
new file mode 100644
index 0000000000000..9164603dba3d5
--- /dev/null
+++ b/datafusion/optimizer/src/test/udfs.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use arrow::datatypes::DataType;
+use datafusion_common::Result;
+use datafusion_expr::{
+    ColumnarValue, Expr, ExpressionPlacement, ScalarFunctionArgs, ScalarUDF,
+    ScalarUDFImpl, Signature, TypeSignature,
+};
+
+/// A configurable test UDF for optimizer tests.
+/// Defaults to `MoveTowardsLeafNodes` placement. Use `with_placement()` to override.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct PlacementTestUDF {
+    signature: Signature,
+    placement: ExpressionPlacement,
+    id: usize,
+}
+
+impl Default for PlacementTestUDF {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PlacementTestUDF {
+    pub fn new() -> Self {
+        Self {
+            // Accept any one or two arguments and return UInt32 for testing purposes.
+            // The actual types don't matter since this UDF is not intended for execution.
+            signature: Signature::new(
+                TypeSignature::OneOf(vec![TypeSignature::Any(1), TypeSignature::Any(2)]),
+                datafusion_expr::Volatility::Immutable,
+            ),
+            placement: ExpressionPlacement::MoveTowardsLeafNodes,
+            id: 0,
+        }
+    }
+
+    /// Set the expression placement for this UDF, which is used by optimizer rules to determine where in the plan the expression should be placed.
+    /// This also resets the name of the UDF to a default based on the placement.
+    pub fn with_placement(mut self, placement: ExpressionPlacement) -> Self {
+        self.placement = placement;
+        self
+    }
+
+    /// Set the id of the UDF.
+    /// This is an arbitrary made up field to allow creating multiple distinct UDFs with the same placement.
+    pub fn with_id(mut self, id: usize) -> Self {
+        self.id = id;
+        self
+    }
+}
+
+impl ScalarUDFImpl for PlacementTestUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        match self.placement {
+            ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf",
+            ExpressionPlacement::KeepInPlace => "keep_in_place_udf",
+            ExpressionPlacement::Column => "column_udf",
+            ExpressionPlacement::Literal => "literal_udf",
+        }
+    }
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::UInt32)
+    }
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        panic!("PlacementTestUDF: not intended for execution")
+    }
+    fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement {
+        self.placement
+    }
+}
+
+/// Create a `leaf_udf(arg)` expression with `MoveTowardsLeafNodes` placement.
+pub fn leaf_udf_expr(arg: Expr) -> Expr {
+    let udf = ScalarUDF::new_from_impl(
+        PlacementTestUDF::new().with_placement(ExpressionPlacement::MoveTowardsLeafNodes),
+    );
+    udf.call(vec![arg])
+}
diff --git a/datafusion/optimizer/src/test/user_defined.rs b/datafusion/optimizer/src/test/user_defined.rs
index a39f90b5da5db..878ce274d5ed6 100644
--- a/datafusion/optimizer/src/test/user_defined.rs
+++ b/datafusion/optimizer/src/test/user_defined.rs
@@ -19,8 +19,8 @@
 
 use datafusion_common::DFSchemaRef;
 use datafusion_expr::{
-    logical_plan::{Extension, UserDefinedLogicalNodeCore},
     Expr, LogicalPlan,
+    logical_plan::{Extension, UserDefinedLogicalNodeCore},
 };
 use std::{
     fmt::{self, Debug},
diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs
index 135f37dd9883b..7e038d2392022 100644
--- a/datafusion/optimizer/src/utils.rs
+++ b/datafusion/optimizer/src/utils.rs
@@ -20,14 +20,14 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 
 use crate::analyzer::type_coercion::TypeCoercionRewriter;
-use arrow::array::{new_null_array, Array, RecordBatch};
+use arrow::array::{Array, RecordBatch, new_null_array};
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::tree_node::{TransformedResult, TreeNode};
 use datafusion_common::{Column, DFSchema, Result, ScalarValue};
 use datafusion_expr::execution_props::ExecutionProps;
 use datafusion_expr::expr_rewriter::replace_col;
-use datafusion_expr::{logical_plan::LogicalPlan, ColumnarValue, Expr};
+use datafusion_expr::{ColumnarValue, Expr, logical_plan::LogicalPlan};
 use datafusion_physical_expr::create_physical_expr;
 use log::{debug, trace};
 use std::sync::Arc;
@@ -124,10 +124,14 @@ fn evaluate_expr_with_null_column<'a>(
     null_columns: impl IntoIterator<Item = &'a Column>,
 ) -> Result<ColumnarValue> {
     static DUMMY_COL_NAME: &str = "?";
-    let schema = Schema::new(vec![Field::new(DUMMY_COL_NAME, DataType::Null, true)]);
-    let input_schema = DFSchema::try_from(schema.clone())?;
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        DUMMY_COL_NAME,
+        DataType::Null,
+        true,
+    )]));
+    let input_schema = DFSchema::try_from(Arc::clone(&schema))?;
     let column = new_null_array(&DataType::Null, 1);
-    let input_batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![column])?;
+    let input_batch = RecordBatch::try_new(schema, vec![column])?;
     let execution_props = ExecutionProps::default();
     let null_column = Column::from_name(DUMMY_COL_NAME);
 
@@ -150,7 +154,7 @@ fn coerce(expr: Expr, schema: &DFSchema) -> Result<Expr> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use datafusion_expr::{binary_expr, case, col, in_list, is_null, lit, Operator};
+    use datafusion_expr::{Operator, binary_expr, case, col, in_list, is_null, lit};
 
     #[test]
     fn expr_is_restrict_null_predicate() -> Result<()> {
@@ -163,7 +167,11 @@ mod tests {
             (Expr::IsNotNull(Box::new(col("a"))), true),
             // a = NULL
             (
-                binary_expr(col("a"), Operator::Eq, Expr::Literal(ScalarValue::Null)),
+                binary_expr(
+                    col("a"),
+                    Operator::Eq,
+                    Expr::Literal(ScalarValue::Null, None),
+                ),
                 true,
             ),
             // a > 8
@@ -226,12 +234,16 @@ mod tests {
             ),
             // a IN (NULL)
             (
-                in_list(col("a"), vec![Expr::Literal(ScalarValue::Null)], false),
+                in_list(
+                    col("a"),
+                    vec![Expr::Literal(ScalarValue::Null, None)],
+                    false,
+                ),
                 true,
             ),
             // a NOT IN (NULL)
             (
-                in_list(col("a"), vec![Expr::Literal(ScalarValue::Null)], true),
+                in_list(col("a"), vec![Expr::Literal(ScalarValue::Null, None)], true),
                 true,
             ),
         ];
diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs
index 95a9db6c8abd2..fd4991c24413f 100644
--- a/datafusion/optimizer/tests/optimizer_integration.rs
+++ b/datafusion/optimizer/tests/optimizer_integration.rs
@@ -22,7 +22,7 @@ use std::sync::Arc;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
 
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{plan_err, Result, TableReference};
+use datafusion_common::{Result, TableReference, plan_err};
 use datafusion_expr::planner::ExprPlanner;
 use datafusion_expr::test::function_stub::sum_udaf;
 use datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF};
@@ -46,6 +46,48 @@ fn init() {
     let _ = env_logger::try_init();
 }
 
+#[test]
+fn recursive_cte_with_nested_subquery() -> Result<()> {
+    // Covers bailout path in `plan_contains_other_subqueries`, ensuring nested subqueries
+    // within recursive CTE branches prevent projection pushdown.
+    let sql = r#"
+        WITH RECURSIVE numbers(id, level) AS (
+            SELECT sub.id, sub.level FROM (
+                SELECT col_int32 AS id, 1 AS level FROM test
+            ) sub
+            UNION ALL
+            SELECT t.col_int32, numbers.level + 1
+            FROM test t
+            JOIN numbers ON t.col_int32 = numbers.id + 1
+        )
+        SELECT id, level FROM numbers
+    "#;
+
+    let plan = test_sql(sql)?;
+
+    assert_snapshot!(
+        format!("{plan}"),
+        @r"
+    SubqueryAlias: numbers
+      Projection: sub.id AS id, sub.level AS level
+        RecursiveQuery: is_distinct=false
+          Projection: sub.id, sub.level
+            SubqueryAlias: sub
+              Projection: test.col_int32 AS id, Int64(1) AS level
+                TableScan: test
+          Projection: t.col_int32, numbers.level + Int64(1)
+            Inner Join: CAST(t.col_int32 AS Int64) = CAST(numbers.id AS Int64) + Int64(1)
+              SubqueryAlias: t
+                Filter: CAST(test.col_int32 AS Int64) IS NOT NULL
+                  TableScan: test
+              Filter: CAST(numbers.id AS Int64) + Int64(1) IS NOT NULL
+                TableScan: numbers
+    "
+    );
+
+    Ok(())
+}
+
 #[test]
 fn case_when() -> Result<()> {
     let sql = "SELECT CASE WHEN col_int32 > 0 THEN 1 ELSE 0 END FROM test";
@@ -53,10 +95,10 @@ fn case_when() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-Projection: CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END
-  TableScan: test projection=[col_int32]
-"#
+    @r"
+    Projection: CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END
+      TableScan: test projection=[col_int32]
+    "
     );
 
     let sql = "SELECT CASE WHEN col_uint32 > 0 THEN 1 ELSE 0 END FROM test";
@@ -64,10 +106,10 @@ Projection: CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END
 
     assert_snapshot!(
         format!("{plan}"),
-    @r#"
+    @r"
     Projection: CASE WHEN test.col_uint32 > UInt32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_uint32 > Int64(0) THEN Int64(1) ELSE Int64(0) END
       TableScan: test projection=[col_uint32]
-    "#
+    "
     );
     Ok(())
 }
@@ -107,11 +149,11 @@ fn case_when_aggregate() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Projection: test.col_utf8, sum(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END) AS n
-          Aggregate: groupBy=[[test.col_utf8]], aggr=[[sum(CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END) AS sum(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END)]]
-            TableScan: test projection=[col_int32, col_utf8]
-        "#
+    @r"
+    Projection: test.col_utf8, sum(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END) AS n
+      Aggregate: groupBy=[[test.col_utf8]], aggr=[[sum(CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END) AS sum(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END)]]
+        TableScan: test projection=[col_int32, col_utf8]
+    "
     );
     Ok(())
 }
@@ -123,11 +165,11 @@ fn unsigned_target_type() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
+    @r"
     Projection: test.col_utf8
       Filter: test.col_uint32 > UInt32(0)
         TableScan: test projection=[col_uint32, col_utf8]
-    "#
+    "
     );
     Ok(())
 }
@@ -140,10 +182,10 @@ fn distribute_by() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Repartition: DistributeBy(test.col_utf8)
-          TableScan: test projection=[col_int32, col_utf8]
-        "#
+    @r"
+    Repartition: DistributeBy(test.col_utf8)
+      TableScan: test projection=[col_int32, col_utf8]
+    "
     );
     Ok(())
 }
@@ -158,16 +200,16 @@ fn semi_join_with_join_filter() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Projection: test.col_utf8
-          LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32
+    @r"
+    Projection: test.col_utf8
+      LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32
+        Filter: test.col_int32 IS NOT NULL
+          TableScan: test projection=[col_int32, col_uint32, col_utf8]
+        SubqueryAlias: __correlated_sq_1
+          SubqueryAlias: t2
             Filter: test.col_int32 IS NOT NULL
-              TableScan: test projection=[col_int32, col_uint32, col_utf8]
-            SubqueryAlias: __correlated_sq_1
-              SubqueryAlias: t2
-                Filter: test.col_int32 IS NOT NULL
-                  TableScan: test projection=[col_int32, col_uint32]
-        "#
+              TableScan: test projection=[col_int32, col_uint32]
+    "
     );
     Ok(())
 }
@@ -182,15 +224,15 @@ fn anti_join_with_join_filter() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-Projection: test.col_utf8
-  LeftAnti Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32
-    TableScan: test projection=[col_int32, col_uint32, col_utf8]
-    SubqueryAlias: __correlated_sq_1
-      SubqueryAlias: t2
-        Filter: test.col_int32 IS NOT NULL
-          TableScan: test projection=[col_int32, col_uint32]
-"#
+    @r"
+    Projection: test.col_utf8
+      LeftAnti Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32
+        TableScan: test projection=[col_int32, col_uint32, col_utf8]
+        SubqueryAlias: __correlated_sq_1
+          SubqueryAlias: t2
+            Filter: test.col_int32 IS NOT NULL
+              TableScan: test projection=[col_int32, col_uint32]
+    "
     );
     Ok(())
 }
@@ -203,16 +245,16 @@ fn where_exists_distinct() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32
-  Filter: test.col_int32 IS NOT NULL
-    TableScan: test projection=[col_int32]
-  SubqueryAlias: __correlated_sq_1
-    Aggregate: groupBy=[[t2.col_int32]], aggr=[[]]
-      SubqueryAlias: t2
-        Filter: test.col_int32 IS NOT NULL
-          TableScan: test projection=[col_int32]
-"#
+    @r"
+    LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32
+      Filter: test.col_int32 IS NOT NULL
+        TableScan: test projection=[col_int32]
+      SubqueryAlias: __correlated_sq_1
+        Aggregate: groupBy=[[t2.col_int32]], aggr=[[]]
+          SubqueryAlias: t2
+            Filter: test.col_int32 IS NOT NULL
+              TableScan: test projection=[col_int32]
+    "
 
     );
     Ok(())
@@ -227,15 +269,17 @@ fn intersect() -> Result<()> {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8
-  Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]
-    LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8
-      Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]
-        TableScan: test projection=[col_int32, col_utf8]
+    @r"
+    LeftSemi Join: left.col_int32 = test.col_int32, left.col_utf8 = test.col_utf8
+      Aggregate: groupBy=[[left.col_int32, left.col_utf8]], aggr=[[]]
+        LeftSemi Join: left.col_int32 = right.col_int32, left.col_utf8 = right.col_utf8
+          Aggregate: groupBy=[[left.col_int32, left.col_utf8]], aggr=[[]]
+            SubqueryAlias: left
+              TableScan: test projection=[col_int32, col_utf8]
+          SubqueryAlias: right
+            TableScan: test projection=[col_int32, col_utf8]
       TableScan: test projection=[col_int32, col_utf8]
-  TableScan: test projection=[col_int32, col_utf8]
-"#
+    "
     );
     Ok(())
 }
@@ -249,11 +293,11 @@ fn between_date32_plus_interval() -> Result<()> {
     assert_snapshot!(
     format!("{plan}"),
     @r#"
-Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
-  Projection:
-    Filter: test.col_date32 >= Date32("1998-03-18") AND test.col_date32 <= Date32("1998-06-16")
-      TableScan: test projection=[col_date32]
-"#
+    Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+      Projection:
+        Filter: test.col_date32 >= Date32("1998-03-18") AND test.col_date32 <= Date32("1998-06-16")
+          TableScan: test projection=[col_date32]
+    "#
     );
     Ok(())
 }
@@ -267,11 +311,11 @@ fn between_date64_plus_interval() -> Result<()> {
     assert_snapshot!(
     format!("{plan}"),
     @r#"
-        Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
-          Projection:
-            Filter: test.col_date64 >= Date64("1998-03-18") AND test.col_date64 <= Date64("1998-06-16")
-              TableScan: test projection=[col_date64]
-        "#
+    Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+      Projection:
+        Filter: test.col_date64 >= Date64("1998-03-18") AND test.col_date64 <= Date64("1998-06-16")
+          TableScan: test projection=[col_date64]
+    "#
     );
     Ok(())
 }
@@ -284,9 +328,7 @@ fn propagate_empty_relation() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        EmptyRelation
-        "#
+    @"EmptyRelation: rows=0"
     );
 }
 
@@ -297,16 +339,16 @@ fn join_keys_in_subquery_alias() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Inner Join: a.col_int32 = b.key
-          SubqueryAlias: a
-            Filter: test.col_int32 IS NOT NULL
-              TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
-          SubqueryAlias: b
-            Projection: test.col_int32 AS key
-              Filter: test.col_int32 IS NOT NULL
-                TableScan: test projection=[col_int32]
-        "#
+    @r"
+    Inner Join: a.col_int32 = b.key
+      SubqueryAlias: a
+        Filter: test.col_int32 IS NOT NULL
+          TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
+      SubqueryAlias: b
+        Projection: test.col_int32 AS key
+          Filter: test.col_int32 IS NOT NULL
+            TableScan: test projection=[col_int32]
+    "
     );
 }
 
@@ -317,20 +359,20 @@ fn join_keys_in_subquery_alias_1() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Inner Join: a.col_int32 = b.key
-          SubqueryAlias: a
+    @r"
+    Inner Join: a.col_int32 = b.key
+      SubqueryAlias: a
+        Filter: test.col_int32 IS NOT NULL
+          TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
+      SubqueryAlias: b
+        Projection: test.col_int32 AS key
+          Inner Join: test.col_int32 = c.col_int32
             Filter: test.col_int32 IS NOT NULL
-              TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
-          SubqueryAlias: b
-            Projection: test.col_int32 AS key
-              Inner Join: test.col_int32 = c.col_int32
-                Filter: test.col_int32 IS NOT NULL
-                  TableScan: test projection=[col_int32]
-                SubqueryAlias: c
-                  Filter: test.col_int32 IS NOT NULL
-                    TableScan: test projection=[col_int32]
-        "#
+              TableScan: test projection=[col_int32]
+            SubqueryAlias: c
+              Filter: test.col_int32 IS NOT NULL
+                TableScan: test projection=[col_int32]
+    "
     );
 }
 
@@ -341,12 +383,12 @@ fn push_down_filter_groupby_expr_contains_alias() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Projection: test.col_int32 + test.col_uint32 AS c, count(Int64(1)) AS count(*)
-          Aggregate: groupBy=[[CAST(test.col_int32 AS Int64) + CAST(test.col_uint32 AS Int64)]], aggr=[[count(Int64(1))]]
-            Filter: CAST(test.col_int32 AS Int64) + CAST(test.col_uint32 AS Int64) > Int64(3)
-              TableScan: test projection=[col_int32, col_uint32]
-        "#
+    @r"
+    Projection: test.col_int32 + test.col_uint32 AS c, count(Int64(1)) AS count(*)
+      Aggregate: groupBy=[[CAST(test.col_int32 AS Int64) + CAST(test.col_uint32 AS Int64)]], aggr=[[count(Int64(1))]]
+        Filter: CAST(test.col_int32 AS Int64) + CAST(test.col_uint32 AS Int64) > Int64(3)
+          TableScan: test projection=[col_int32, col_uint32]
+    "
     );
 }
 
@@ -358,14 +400,14 @@ fn test_same_name_but_not_ambiguous() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        LeftSemi Join: t1.col_int32 = t2.col_int32
-          Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]
-            SubqueryAlias: t1
-              TableScan: test projection=[col_int32]
-          SubqueryAlias: t2
-            TableScan: test projection=[col_int32]
-        "#
+    @r"
+    LeftSemi Join: t1.col_int32 = t2.col_int32
+      Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]
+        SubqueryAlias: t1
+          TableScan: test projection=[col_int32]
+      SubqueryAlias: t2
+        TableScan: test projection=[col_int32]
+    "
     );
 }
 
@@ -380,10 +422,10 @@ fn eliminate_nested_filters() {
 
     assert_snapshot!(
           format!("{plan}"),
-          @r#"
-Filter: test.col_int32 > Int32(0)
-  TableScan: test projection=[col_int32]
-  "#
+          @r"
+    Filter: test.col_int32 > Int32(0)
+      TableScan: test projection=[col_int32]
+    "
     );
 }
 
@@ -398,11 +440,11 @@ fn eliminate_redundant_null_check_on_count() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Projection: test.col_int32, count(Int64(1)) AS count(*) AS c
-          Aggregate: groupBy=[[test.col_int32]], aggr=[[count(Int64(1))]]
-            TableScan: test projection=[col_int32]
-        "#
+    @r"
+    Projection: test.col_int32, count(Int64(1)) AS count(*) AS c
+      Aggregate: groupBy=[[test.col_int32]], aggr=[[count(Int64(1))]]
+        TableScan: test projection=[col_int32]
+    "
     );
 }
 
@@ -426,13 +468,13 @@ fn test_propagate_empty_relation_inner_join_and_unions() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-Union
-  TableScan: test projection=[col_int32]
-  TableScan: test projection=[col_int32]
-  Filter: test.col_int32 < Int32(0)
-    TableScan: test projection=[col_int32]
-  "#);
+    @r"
+    Union
+      TableScan: test projection=[col_int32]
+      TableScan: test projection=[col_int32]
+      Filter: test.col_int32 < Int32(0)
+        TableScan: test projection=[col_int32]
+    ");
 }
 
 #[test]
@@ -443,10 +485,10 @@ fn select_wildcard_with_repeated_column_but_is_aliased() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        Projection: test.col_int32, test.col_uint32, test.col_utf8, test.col_date32, test.col_date64, test.col_ts_nano_none, test.col_ts_nano_utc, test.col_int32 AS col_32
-          TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
-        "#
+    @r"
+    Projection: test.col_int32, test.col_uint32, test.col_utf8, test.col_date32, test.col_date64, test.col_ts_nano_none, test.col_ts_nano_utc, test.col_int32 AS col_32
+      TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
+    "
     );
 }
 
@@ -467,17 +509,133 @@ fn select_correlated_predicate_subquery_with_uppercase_ident() {
 
     assert_snapshot!(
     format!("{plan}"),
-    @r#"
-        LeftSemi Join: test.col_int32 = __correlated_sq_1.COL_INT32
-          Filter: test.col_int32 IS NOT NULL
-            TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
-          SubqueryAlias: __correlated_sq_1
-            SubqueryAlias: T1
-              Projection: test.col_int32 AS COL_INT32
-                Filter: test.col_int32 IS NOT NULL
-                  TableScan: test projection=[col_int32]
-        "#
+    @r"
+    LeftSemi Join: test.col_int32 = __correlated_sq_1.COL_INT32
+      Filter: test.col_int32 IS NOT NULL
+        TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]
+      SubqueryAlias: __correlated_sq_1
+        SubqueryAlias: T1
+          Projection: test.col_int32 AS COL_INT32
+            Filter: test.col_int32 IS NOT NULL
+              TableScan: test projection=[col_int32]
+    "
+    );
+}
+
+#[test]
+fn recursive_cte_projection_pushdown() -> Result<()> {
+    // Test that projection pushdown works with recursive CTEs by ensuring
+    // only the required columns are projected from the base table, even when
+    // the CTE definition includes unused columns
+    let sql = "WITH RECURSIVE nodes AS (\
+        SELECT col_int32 AS id, col_utf8 AS name, col_uint32 AS extra FROM test \
+        UNION ALL \
+        SELECT id + 1, name, extra FROM nodes WHERE id < 3\
+    ) SELECT id FROM nodes";
+    let plan = test_sql(sql)?;
+
+    // The optimizer successfully performs projection pushdown by only selecting the needed
+    // columns from the base table and recursive table, eliminating unused columns
+    assert_snapshot!(
+        format!("{plan}"),
+        @r"
+    SubqueryAlias: nodes
+      RecursiveQuery: is_distinct=false
+        Projection: test.col_int32 AS id
+          TableScan: test projection=[col_int32]
+        Projection: CAST(CAST(nodes.id AS Int64) + Int64(1) AS Int32)
+          Filter: nodes.id < Int32(3)
+            TableScan: nodes projection=[id]
+    "
     );
+    Ok(())
+}
+
+#[test]
+fn recursive_cte_with_aliased_self_reference() -> Result<()> {
+    let sql = "WITH RECURSIVE nodes AS (\
+        SELECT col_int32 AS id, col_utf8 AS name FROM test \
+        UNION ALL \
+        SELECT child.id + 1, child.name FROM nodes AS child WHERE child.id < 3\
+    ) SELECT id FROM nodes";
+    let plan = test_sql(sql)?;
+
+    assert_snapshot!(
+        format!("{plan}"),
+        @r"
+    SubqueryAlias: nodes
+      RecursiveQuery: is_distinct=false
+        Projection: test.col_int32 AS id
+          TableScan: test projection=[col_int32]
+        Projection: CAST(CAST(child.id AS Int64) + Int64(1) AS Int32)
+          SubqueryAlias: child
+            Filter: nodes.id < Int32(3)
+              TableScan: nodes projection=[id]
+    ",
+    );
+    Ok(())
+}
+
+#[test]
+fn recursive_cte_with_unused_columns() -> Result<()> {
+    // Test projection pushdown with a recursive CTE where the base case
+    // includes columns that are never used in the recursive part or final result
+    let sql = "WITH RECURSIVE series AS (\
+        SELECT 1 AS n, col_utf8, col_uint32, col_date32 FROM test WHERE col_int32 = 1 \
+        UNION ALL \
+        SELECT n + 1, col_utf8, col_uint32, col_date32 FROM series WHERE n < 3\
+    ) SELECT n FROM series";
+    let plan = test_sql(sql)?;
+
+    // The optimizer successfully performs projection pushdown by eliminating unused columns
+    // even when they're defined in the CTE but not actually needed
+    assert_snapshot!(
+        format!("{plan}"),
+        @r"
+    SubqueryAlias: series
+      RecursiveQuery: is_distinct=false
+        Projection: Int64(1) AS n
+          Filter: test.col_int32 = Int32(1)
+            TableScan: test projection=[col_int32]
+        Projection: series.n + Int64(1)
+          Filter: series.n < Int64(3)
+            TableScan: series projection=[n]
+    "
+    );
+    Ok(())
+}
+
+#[test]
+/// Asserts the minimal plan shape once projection pushdown succeeds for a recursive CTE.
+/// Unlike the previous two tests that retain extra columns in either the base or recursive
+/// branches, this baseline shows the optimizer trimming everything down to the single
+/// column required by the final projection.
+fn recursive_cte_projection_pushdown_baseline() -> Result<()> {
+    // Test case that truly demonstrates projection pushdown working:
+    // The base case only selects needed columns
+    let sql = "WITH RECURSIVE countdown AS (\
+        SELECT col_int32 AS n FROM test WHERE col_int32 = 5 \
+        UNION ALL \
+        SELECT n - 1 FROM countdown WHERE n > 1\
+    ) SELECT n FROM countdown";
+    let plan = test_sql(sql)?;
+
+    // This demonstrates optimal projection pushdown where only col_int32 is projected from the base table,
+    // and only the needed column is selected from the recursive table
+    assert_snapshot!(
+        format!("{plan}"),
+        @r"
+    SubqueryAlias: countdown
+      RecursiveQuery: is_distinct=false
+        Projection: test.col_int32 AS n
+          Filter: test.col_int32 = Int32(5)
+            TableScan: test projection=[col_int32]
+        Projection: CAST(CAST(countdown.n AS Int64) - Int64(1) AS Int32)
+          Filter: countdown.n > Int32(1)
+            TableScan: countdown projection=[n]
+    "
+    );
+    Ok(())
 }
 
 fn test_sql(sql: &str) -> Result<LogicalPlan> {
@@ -526,7 +684,7 @@ fn test_sql(sql: &str) -> Result<LogicalPlan> {
     let analyzer = Analyzer::new();
     let optimizer = Optimizer::new();
     // analyze and optimize the logical plan
-    let plan = analyzer.execute_and_check(plan, config.options(), |_, _| {})?;
+    let plan = analyzer.execute_and_check(plan, &config.options(), |_, _| {})?;
     optimizer.optimize(plan, &config, observe)
 }
 
@@ -589,6 +747,14 @@ impl ContextProvider for MyContextProvider {
         None
     }
 
+    fn create_cte_work_table(
+        &self,
+        _name: &str,
+        schema: SchemaRef,
+    ) -> Result<Arc<dyn TableSource>> {
+        Ok(Arc::new(MyTableSource { schema }))
+    }
+
     fn options(&self) -> &ConfigOptions {
         &self.options
     }
diff --git a/datafusion/physical-expr-adapter/Cargo.toml b/datafusion/physical-expr-adapter/Cargo.toml
new file mode 100644
index 0000000000000..453c8bdaacb4a
--- /dev/null
+++ b/datafusion/physical-expr-adapter/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "datafusion-physical-expr-adapter"
+description = "Physical expression schema adaptation utilities for DataFusion"
+keywords = ["datafusion", "query", "sql"]
+readme = "README.md"
+version = { workspace = true }
+edition = { workspace = true }
+homepage = { workspace = true }
+repository = { workspace = true }
+license = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+
+[lib]
+name = "datafusion_physical_expr_adapter"
+path = "src/lib.rs"
+
+[dependencies]
+arrow = { workspace = true }
+datafusion-common = { workspace = true }
+datafusion-expr = { workspace = true }
+datafusion-functions = { workspace = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+itertools = { workspace = true }
+
+[lints]
+workspace = true
+
+[dev-dependencies]
diff --git a/datafusion/physical-expr-adapter/LICENSE.txt b/datafusion/physical-expr-adapter/LICENSE.txt
new file mode 120000
index 0000000000000..1ef648f64b34f
--- /dev/null
+++ b/datafusion/physical-expr-adapter/LICENSE.txt
@@ -0,0 +1 @@
+../../LICENSE.txt
\ No newline at end of file
diff --git a/datafusion/physical-expr-adapter/NOTICE.txt b/datafusion/physical-expr-adapter/NOTICE.txt
new file mode 120000
index 0000000000000..fb051c92b10b2
--- /dev/null
+++ b/datafusion/physical-expr-adapter/NOTICE.txt
@@ -0,0 +1 @@
+../../NOTICE.txt
\ No newline at end of file
diff --git a/datafusion/physical-expr-adapter/README.md b/datafusion/physical-expr-adapter/README.md
new file mode 100644
index 0000000000000..02bc144c16f34
--- /dev/null
+++ b/datafusion/physical-expr-adapter/README.md
@@ -0,0 +1,38 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Physical Expression Adapter
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate provides utilities for adapting physical expressions to different schemas in DataFusion.
+
+It handles schema differences in file scans by rewriting expressions to match the physical schema,
+including type casting, missing columns, and partition values.
+
+For detailed documentation, see the [`PhysicalExprAdapter`] trait documentation.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
+[`physicalexpradapter`]: https://docs.rs/datafusion/latest/datafusion/physical_expr_adapter/trait.PhysicalExprAdapter.html
diff --git a/datafusion/physical-expr-adapter/src/lib.rs b/datafusion/physical-expr-adapter/src/lib.rs
new file mode 100644
index 0000000000000..ea4db19ee110e
--- /dev/null
+++ b/datafusion/physical-expr-adapter/src/lib.rs
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
+#![doc(
+    html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
+    html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
+)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
+
+//! Physical expression schema adaptation utilities for DataFusion
+
+pub mod schema_rewriter;
+
+pub use schema_rewriter::{
+    BatchAdapter, BatchAdapterFactory, DefaultPhysicalExprAdapter,
+    DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
+    replace_columns_with_literals,
+};
diff --git a/datafusion/physical-expr-adapter/src/schema_rewriter.rs b/datafusion/physical-expr-adapter/src/schema_rewriter.rs
new file mode 100644
index 0000000000000..a2a45cbdfe7aa
--- /dev/null
+++ b/datafusion/physical-expr-adapter/src/schema_rewriter.rs
@@ -0,0 +1,1621 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Physical expression schema rewriting utilities: [`PhysicalExprAdapter`],
+//! [`PhysicalExprAdapterFactory`], default implementations,
+//! and [`replace_columns_with_literals`].
+
+use std::borrow::Borrow;
+use std::collections::HashMap;
+use std::hash::Hash;
+use std::sync::Arc;
+
+use arrow::array::RecordBatch;
+use arrow::compute::can_cast_types;
+use arrow::datatypes::{DataType, Field, FieldRef, SchemaRef};
+use datafusion_common::{
+    Result, ScalarValue, exec_err,
+    metadata::FieldMetadata,
+    nested_struct::validate_struct_compatibility,
+    tree_node::{Transformed, TransformedResult, TreeNode},
+};
+use datafusion_functions::core::getfield::GetFieldFunc;
+use datafusion_physical_expr::PhysicalExprSimplifier;
+use datafusion_physical_expr::expressions::CastColumnExpr;
+use datafusion_physical_expr::projection::{ProjectionExprs, Projector};
+use datafusion_physical_expr::{
+    ScalarFunctionExpr,
+    expressions::{self, Column},
+};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use itertools::Itertools;
+
+/// Replace column references in the given physical expression with literal values.
+///
+/// Some use cases for this include:
+/// - Partition column pruning: When scanning partitioned data, partition column references
+///   can be replaced with their literal values for the specific partition being scanned.
+/// - Constant folding: In some cases, columns that can be proven to be constant
+///   from statistical analysis may be replaced with their literal values to optimize expression evaluation.
+/// - Filling in non-null default values: in a custom [`PhysicalExprAdapter`] implementation,
+///   column references can be replaced with default literal values instead of nulls.
+///
+/// # Arguments
+/// - `expr`: The physical expression in which to replace column references.
+/// - `replacements`: A mapping from column names to their corresponding literal `ScalarValue`s.
+///   Accepts various HashMap types including `HashMap<&str, &ScalarValue>`,
+///   `HashMap<String, ScalarValue>`, `HashMap<String, &ScalarValue>`, etc.
+///
+/// # Returns
+/// - `Result<Arc<dyn PhysicalExpr>>`: The rewritten physical expression with columns replaced by literals.
+pub fn replace_columns_with_literals<K, V>(
+    expr: Arc<dyn PhysicalExpr>,
+    replacements: &HashMap<K, V>,
+) -> Result<Arc<dyn PhysicalExpr>>
+where
+    K: Borrow<str> + Eq + Hash,
+    V: Borrow<ScalarValue>,
+{
+    expr.transform_down(|expr| {
+        if let Some(column) = expr.as_any().downcast_ref::<Column>()
+            && let Some(replacement_value) = replacements.get(column.name())
+        {
+            return Ok(Transformed::yes(expressions::lit(
+                replacement_value.borrow().clone(),
+            )));
+        }
+        Ok(Transformed::no(expr))
+    })
+    .data()
+}
+
+/// Trait for adapting [`PhysicalExpr`] expressions to match a target schema.
+///
+/// This is used in file scans to rewrite expressions so that they can be
+/// evaluated against the physical schema of the file being scanned. It allows
+/// for handling differences between logical and physical schemas, such as type
+/// mismatches or missing columns common in [Schema evolution] scenarios.
+///
+/// [Schema evolution]: https://www.dremio.com/wiki/schema-evolution/
+///
+/// ## Default Implementations
+///
+/// The default implementation [`DefaultPhysicalExprAdapter`]  handles common
+/// cases.
+///
+/// ## Custom Implementations
+///
+/// You can create a custom implementation of this trait to handle specific rewriting logic.
+/// For example, to fill in missing columns with default values instead of nulls:
+///
+/// ```rust
+/// use datafusion_physical_expr_adapter::{PhysicalExprAdapter, PhysicalExprAdapterFactory};
+/// use arrow::datatypes::{Schema, Field, DataType, FieldRef, SchemaRef};
+/// use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+/// use datafusion_common::{Result, ScalarValue, tree_node::{Transformed, TransformedResult, TreeNode}};
+/// use datafusion_physical_expr::expressions::{self, Column};
+/// use std::sync::Arc;
+///
+/// #[derive(Debug)]
+/// pub struct CustomPhysicalExprAdapter {
+///     logical_file_schema: SchemaRef,
+///     physical_file_schema: SchemaRef,
+/// }
+///
+/// impl PhysicalExprAdapter for CustomPhysicalExprAdapter {
+///     fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+///         expr.transform(|expr| {
+///             if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+///                 // Check if the column exists in the physical schema
+///                 if self.physical_file_schema.index_of(column.name()).is_err() {
+///                     // If the column is missing, fill it with a default value instead of null
+///                     // The default value could be stored in the table schema's column metadata for example.
+///                     let default_value = ScalarValue::Int32(Some(0));
+///                     return Ok(Transformed::yes(expressions::lit(default_value)));
+///                 }
+///             }
+///             // If the column exists, return it as is
+///             Ok(Transformed::no(expr))
+///         }).data()
+///     }
+/// }
+///
+/// #[derive(Debug)]
+/// pub struct CustomPhysicalExprAdapterFactory;
+///
+/// impl PhysicalExprAdapterFactory for CustomPhysicalExprAdapterFactory {
+///     fn create(
+///         &self,
+///         logical_file_schema: SchemaRef,
+///         physical_file_schema: SchemaRef,
+///     ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+///         Ok(Arc::new(CustomPhysicalExprAdapter {
+///             logical_file_schema,
+///             physical_file_schema,
+///         }))
+///     }
+/// }
+/// ```
+pub trait PhysicalExprAdapter: Send + Sync + std::fmt::Debug {
+    /// Rewrite a physical expression to match the target schema.
+    ///
+    /// This method should return a transformed expression that matches the target schema.
+    ///
+    /// Arguments:
+    /// - `expr`: The physical expression to rewrite.
+    /// - `logical_file_schema`: The logical schema of the table being queried, excluding any partition columns.
+    /// - `physical_file_schema`: The physical schema of the file being scanned.
+    /// - `partition_values`: Optional partition values to use for rewriting partition column references.
+    ///   These are handled as if they were columns appended onto the logical file schema.
+    ///
+    /// Returns:
+    /// - `Arc<dyn PhysicalExpr>`: The rewritten physical expression that can be evaluated against the physical schema.
+    ///
+    /// See Also:
+    /// - [`replace_columns_with_literals`]: for replacing partition column references with their literal values.
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>>;
+}
+
+/// Creates instances of [`PhysicalExprAdapter`] for given logical and physical schemas.
+///
+/// See [`DefaultPhysicalExprAdapterFactory`] for the default implementation.
+pub trait PhysicalExprAdapterFactory: Send + Sync + std::fmt::Debug {
+    /// Create a new instance of the physical expression adapter.
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>>;
+}
+
+#[derive(Debug, Clone)]
+pub struct DefaultPhysicalExprAdapterFactory;
+
+impl PhysicalExprAdapterFactory for DefaultPhysicalExprAdapterFactory {
+    fn create(
+        &self,
+        logical_file_schema: SchemaRef,
+        physical_file_schema: SchemaRef,
+    ) -> Result<Arc<dyn PhysicalExprAdapter>> {
+        Ok(Arc::new(DefaultPhysicalExprAdapter {
+            logical_file_schema,
+            physical_file_schema,
+        }))
+    }
+}
+
+/// Default implementation of [`PhysicalExprAdapter`] for rewriting physical
+/// expressions to match different schemas.
+///
+/// ## Overview
+///
+///  [`DefaultPhysicalExprAdapter`] rewrites physical expressions to match
+///  different schemas, including:
+///
+/// - **Type casting**: When logical and physical schemas have different types, expressions are
+///   automatically wrapped with cast operations. For example, `lit(ScalarValue::Int32(123)) = int64_column`
+///   gets rewritten to `lit(ScalarValue::Int32(123)) = cast(int64_column, 'Int32')`.
+///   Note that this does not attempt to simplify such expressions - that is done by shared simplifiers.
+///
+/// - **Missing columns**: When a column exists in the logical schema but not in the physical schema,
+///   references to it are replaced with null literals.
+///
+/// - **Struct field access**: Expressions like `struct_column.field_that_is_missing_in_schema` are
+///   rewritten to `null` when the field doesn't exist in the physical schema.
+///
+/// - **Default column values**: Partition column references can be replaced with their literal values
+///   when scanning specific partitions. See [`replace_columns_with_literals`] for more details.
+///
+/// # Example
+///
+/// ```rust
+/// # use datafusion_physical_expr_adapter::{DefaultPhysicalExprAdapterFactory, PhysicalExprAdapterFactory};
+/// # use arrow::datatypes::Schema;
+/// # use std::sync::Arc;
+/// #
+/// # fn example(
+/// #     predicate: std::sync::Arc<dyn datafusion_physical_expr_common::physical_expr::PhysicalExpr>,
+/// #     physical_file_schema: &Schema,
+/// #     logical_file_schema: &Schema,
+/// # ) -> datafusion_common::Result<()> {
+/// let factory = DefaultPhysicalExprAdapterFactory;
+/// let adapter =
+///     factory.create(Arc::new(logical_file_schema.clone()), Arc::new(physical_file_schema.clone()))?;
+/// let adapted_predicate = adapter.rewrite(predicate)?;
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Debug, Clone)]
+pub struct DefaultPhysicalExprAdapter {
+    logical_file_schema: SchemaRef,
+    physical_file_schema: SchemaRef,
+}
+
+impl DefaultPhysicalExprAdapter {
+    /// Create a new instance of the default physical expression adapter.
+    ///
+    /// This adapter rewrites expressions to match the physical schema of the file being scanned,
+    /// handling type mismatches and missing columns by filling them with default values.
+    pub fn new(logical_file_schema: SchemaRef, physical_file_schema: SchemaRef) -> Self {
+        Self {
+            logical_file_schema,
+            physical_file_schema,
+        }
+    }
+}
+
+impl PhysicalExprAdapter for DefaultPhysicalExprAdapter {
+    fn rewrite(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        let rewriter = DefaultPhysicalExprAdapterRewriter {
+            logical_file_schema: Arc::clone(&self.logical_file_schema),
+            physical_file_schema: Arc::clone(&self.physical_file_schema),
+        };
+        expr.transform(|expr| rewriter.rewrite_expr(Arc::clone(&expr)))
+            .data()
+    }
+}
+
+struct DefaultPhysicalExprAdapterRewriter {
+    logical_file_schema: SchemaRef,
+    physical_file_schema: SchemaRef,
+}
+
+impl DefaultPhysicalExprAdapterRewriter {
+    fn rewrite_expr(
+        &self,
+        expr: Arc<dyn PhysicalExpr>,
+    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        if let Some(transformed) = self.try_rewrite_struct_field_access(&expr)? {
+            return Ok(Transformed::yes(transformed));
+        }
+
+        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+            return self.rewrite_column(Arc::clone(&expr), column);
+        }
+
+        Ok(Transformed::no(expr))
+    }
+
+    /// Attempt to rewrite struct field access expressions to return null if the field does not exist in the physical schema.
+    /// Note that this does *not* handle nested struct fields, only top-level struct field access.
+    /// See <https://github.com/apache/datafusion/issues/17114> for more details.
+    fn try_rewrite_struct_field_access(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+        let get_field_expr =
+            match ScalarFunctionExpr::try_downcast_func::<GetFieldFunc>(expr.as_ref()) {
+                Some(expr) => expr,
+                None => return Ok(None),
+            };
+
+        let source_expr = match get_field_expr.args().first() {
+            Some(expr) => expr,
+            None => return Ok(None),
+        };
+
+        let field_name_expr = match get_field_expr.args().get(1) {
+            Some(expr) => expr,
+            None => return Ok(None),
+        };
+
+        let lit = match field_name_expr
+            .as_any()
+            .downcast_ref::<expressions::Literal>()
+        {
+            Some(lit) => lit,
+            None => return Ok(None),
+        };
+
+        let field_name = match lit.value().try_as_str().flatten() {
+            Some(name) => name,
+            None => return Ok(None),
+        };
+
+        let column = match source_expr.as_any().downcast_ref::<Column>() {
+            Some(column) => column,
+            None => return Ok(None),
+        };
+
+        let physical_field =
+            match self.physical_file_schema.field_with_name(column.name()) {
+                Ok(field) => field,
+                Err(_) => return Ok(None),
+            };
+
+        let physical_struct_fields = match physical_field.data_type() {
+            DataType::Struct(fields) => fields,
+            _ => return Ok(None),
+        };
+
+        if physical_struct_fields
+            .iter()
+            .any(|f| f.name() == field_name)
+        {
+            return Ok(None);
+        }
+
+        let logical_field = match self.logical_file_schema.field_with_name(column.name())
+        {
+            Ok(field) => field,
+            Err(_) => return Ok(None),
+        };
+
+        let logical_struct_fields = match logical_field.data_type() {
+            DataType::Struct(fields) => fields,
+            _ => return Ok(None),
+        };
+
+        let logical_struct_field = match logical_struct_fields
+            .iter()
+            .find(|f| f.name() == field_name)
+        {
+            Some(field) => field,
+            None => return Ok(None),
+        };
+
+        let null_value = ScalarValue::Null.cast_to(logical_struct_field.data_type())?;
+        Ok(Some(Arc::new(expressions::Literal::new_with_metadata(
+            null_value,
+            Some(FieldMetadata::from(logical_struct_field.as_ref())),
+        ))))
+    }
+
+    fn rewrite_column(
+        &self,
+        expr: Arc<dyn PhysicalExpr>,
+        column: &Column,
+    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        // Get the logical field for this column if it exists in the logical schema
+        let logical_field = match self.logical_file_schema.field_with_name(column.name())
+        {
+            Ok(field) => field,
+            Err(e) => {
+                // This can be hit if a custom rewrite injected a reference to a column that doesn't exist in the logical schema.
+                // For example, a pre-computed column that is kept only in the physical schema.
+                // If the column exists in the physical schema, we can still use it.
+                if let Ok(physical_field) =
+                    self.physical_file_schema.field_with_name(column.name())
+                {
+                    // If the column exists in the physical schema, we can use it in place of the logical column.
+                    // This is nice to users because if they do a rewrite that results in something like `physical_int32_col = 123u64`
+                    // we'll at least handle the casts for them.
+                    physical_field
+                } else {
+                    // A completely unknown column that doesn't exist in either schema!
+                    // This should probably never be hit unless something upstream broke, but nonetheless it's better
+                    // for us to return a handleable error than to panic / do something unexpected.
+                    return Err(e.into());
+                }
+            }
+        };
+
+        let Some((resolved_column, physical_field)) =
+            self.resolve_physical_column(column)?
+        else {
+            if !logical_field.is_nullable() {
+                return exec_err!(
+                    "Non-nullable column '{}' is missing from the physical schema",
+                    column.name()
+                );
+            }
+            // If the column is missing from the physical schema fill it in with nulls.
+            // For a different behavior, provide a custom `PhysicalExprAdapter` implementation.
+            let null_value = ScalarValue::Null.cast_to(logical_field.data_type())?;
+            return Ok(Transformed::yes(Arc::new(
+                expressions::Literal::new_with_metadata(
+                    null_value,
+                    Some(FieldMetadata::from(logical_field)),
+                ),
+            )));
+        };
+
+        if resolved_column.index() == column.index()
+            && logical_field == physical_field.as_ref()
+        {
+            return Ok(Transformed::no(expr));
+        }
+
+        if logical_field == physical_field.as_ref() {
+            // If the fields match (including metadata/nullability), we can use the column as is
+            return Ok(Transformed::yes(Arc::new(resolved_column)));
+        }
+
+        // We need a cast expression whenever the logical and physical fields differ,
+        // whether that difference is only metadata/nullability or also data type.
+        // TODO: add optimization to move the cast from the column to literal expressions in the case of `col = 123`
+        // since that's much cheaper to evalaute.
+        // See https://github.com/apache/datafusion/issues/15780#issuecomment-2824716928
+        self.create_cast_column_expr(resolved_column, physical_field, logical_field)
+    }
+
+    /// Resolves a logical column to the corresponding physical column and field.
+    fn resolve_physical_column(
+        &self,
+        column: &Column,
+    ) -> Result<Option<(Column, FieldRef)>> {
+        // The physical schema adaptation step intentionally resolves columns by **name first**
+        // rather than trusting the incoming index. This mirrors what the old refactoring
+        // did before `resolve_physical_column()` was extracted: the planner might hand us a
+        // `Column` whose `index` field is stale (e.g. after projection/rename rewrites), so
+        // resolving by name ensures we match the correct physical slot. Once we know the
+        // proper index we rebuild the `Column` with `new_with_schema` so callers can rely
+        // on `column.index()` later without having to re-query the schema.
+        let Ok(physical_column_index) = self.physical_file_schema.index_of(column.name())
+        else {
+            return Ok(None);
+        };
+
+        let column = if column.index() == physical_column_index {
+            column.clone()
+        } else {
+            Column::new_with_schema(column.name(), self.physical_file_schema.as_ref())?
+        };
+
+        Ok(Some((
+            column,
+            Arc::new(
+                self.physical_file_schema
+                    .field(physical_column_index)
+                    .clone(),
+            ),
+        )))
+    }
+
+    /// Validates type compatibility and creates a CastColumnExpr if needed.
+    ///
+    /// Checks whether the physical field can be cast to the logical field type,
+    /// handling both struct and scalar types. Returns a CastColumnExpr with the
+    /// appropriate configuration.
+    fn create_cast_column_expr(
+        &self,
+        column: Column,
+        physical_field: FieldRef,
+        logical_field: &Field,
+    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        // For struct types, use validate_struct_compatibility which handles:
+        // - Missing fields in source (filled with nulls)
+        // - Extra fields in source (ignored)
+        // - Recursive validation of nested structs
+        // For non-struct types, use Arrow's can_cast_types
+        match (physical_field.data_type(), logical_field.data_type()) {
+            (DataType::Struct(physical_fields), DataType::Struct(logical_fields)) => {
+                validate_struct_compatibility(
+                    physical_fields.as_ref(),
+                    logical_fields.as_ref(),
+                )?;
+            }
+            _ => {
+                let is_compatible =
+                    can_cast_types(physical_field.data_type(), logical_field.data_type());
+                if !is_compatible {
+                    return exec_err!(
+                        "Cannot cast column '{}' from '{}' (physical data type) to '{}' (logical data type)",
+                        column.name(),
+                        physical_field.data_type(),
+                        logical_field.data_type()
+                    );
+                }
+            }
+        }
+
+        let cast_expr = Arc::new(CastColumnExpr::new(
+            Arc::new(column),
+            physical_field,
+            Arc::new(logical_field.clone()),
+            None,
+        ));
+
+        Ok(Transformed::yes(cast_expr))
+    }
+}
+
+/// Factory for creating [`BatchAdapter`] instances to adapt record batches
+/// to a target schema.
+///
+/// This binds a target schema and allows creating adapters for different source schemas.
+/// It handles:
+/// - **Column reordering**: Columns are reordered to match the target schema
+/// - **Type casting**: Automatic type conversion (e.g., Int32 to Int64)
+/// - **Missing columns**: Nullable columns missing from source are filled with nulls
+/// - **Struct field adaptation**: Nested struct fields are recursively adapted
+///
+/// ## Examples
+///
+/// ```rust
+/// use arrow::array::{Int32Array, Int64Array, StringArray, RecordBatch};
+/// use arrow::datatypes::{DataType, Field, Schema};
+/// use datafusion_physical_expr_adapter::BatchAdapterFactory;
+/// use std::sync::Arc;
+///
+/// // Target schema has different column order and types
+/// let target_schema = Arc::new(Schema::new(vec![
+///     Field::new("name", DataType::Utf8, true),
+///     Field::new("id", DataType::Int64, false),    // Int64 in target
+///     Field::new("score", DataType::Float64, true), // Missing from source
+/// ]));
+///
+/// // Source schema has different column order and Int32 for id
+/// let source_schema = Arc::new(Schema::new(vec![
+///     Field::new("id", DataType::Int32, false),    // Int32 in source
+///     Field::new("name", DataType::Utf8, true),
+///     // Note: 'score' column is missing from source
+/// ]));
+///
+/// // Create factory with target schema
+/// let factory = BatchAdapterFactory::new(Arc::clone(&target_schema));
+///
+/// // Create adapter for this specific source schema
+/// let adapter = factory.make_adapter(&source_schema).unwrap();
+///
+/// // Create a source batch
+/// let source_batch = RecordBatch::try_new(
+///     source_schema,
+///     vec![
+///         Arc::new(Int32Array::from(vec![1, 2, 3])),
+///         Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol"])),
+///     ],
+/// ).unwrap();
+///
+/// // Adapt the batch to match target schema
+/// let adapted = adapter.adapt_batch(&source_batch).unwrap();
+///
+/// assert_eq!(adapted.num_columns(), 3);
+/// assert_eq!(adapted.column(0).data_type(), &DataType::Utf8);   // name
+/// assert_eq!(adapted.column(1).data_type(), &DataType::Int64);  // id (cast from Int32)
+/// assert_eq!(adapted.column(2).data_type(), &DataType::Float64); // score (filled with nulls)
+/// ```
+#[derive(Debug)]
+pub struct BatchAdapterFactory {
+    target_schema: SchemaRef,
+    expr_adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+}
+
+impl BatchAdapterFactory {
+    /// Create a new [`BatchAdapterFactory`] with the given target schema.
+    pub fn new(target_schema: SchemaRef) -> Self {
+        let expr_adapter_factory = Arc::new(DefaultPhysicalExprAdapterFactory);
+        Self {
+            target_schema,
+            expr_adapter_factory,
+        }
+    }
+
+    /// Set a custom [`PhysicalExprAdapterFactory`] to use when adapting expressions.
+    ///
+    /// Use this to customize behavior when adapting batches, e.g. to fill in missing values
+    /// with defaults instead of nulls.
+    ///
+    /// See [`PhysicalExprAdapter`] for more details.
+    pub fn with_adapter_factory(
+        self,
+        factory: Arc<dyn PhysicalExprAdapterFactory>,
+    ) -> Self {
+        Self {
+            expr_adapter_factory: factory,
+            ..self
+        }
+    }
+
+    /// Create a new [`BatchAdapter`] for the given source schema.
+    ///
+    /// Batches fed into this [`BatchAdapter`] *must* conform to the source schema,
+    /// no validation is performed at runtime to minimize overheads.
+    pub fn make_adapter(&self, source_schema: &SchemaRef) -> Result<BatchAdapter> {
+        let expr_adapter = self
+            .expr_adapter_factory
+            .create(Arc::clone(&self.target_schema), Arc::clone(source_schema))?;
+
+        let simplifier = PhysicalExprSimplifier::new(&self.target_schema);
+
+        let projection = ProjectionExprs::from_indices(
+            &(0..self.target_schema.fields().len()).collect_vec(),
+            &self.target_schema,
+        );
+
+        let adapted = projection
+            .try_map_exprs(|e| simplifier.simplify(expr_adapter.rewrite(e)?))?;
+        let projector = adapted.make_projector(source_schema)?;
+
+        Ok(BatchAdapter { projector })
+    }
+}
+
+/// Adapter for transforming record batches to match a target schema.
+///
+/// Create instances via [`BatchAdapterFactory`].
+///
+/// ## Performance
+///
+/// The adapter pre-computes the projection expressions during creation,
+/// so the [`adapt_batch`](BatchAdapter::adapt_batch) call is efficient and suitable
+/// for use in hot paths like streaming file scans.
+#[derive(Debug)]
+pub struct BatchAdapter {
+    projector: Projector,
+}
+
+impl BatchAdapter {
+    /// Adapt the given record batch to match the target schema.
+    ///
+    /// The input batch *must* conform to the source schema used when
+    /// creating this adapter.
+    pub fn adapt_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        self.projector.project_batch(batch)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{
+        BooleanArray, Int32Array, Int64Array, RecordBatch, RecordBatchOptions,
+        StringArray, StringViewArray, StructArray,
+    };
+    use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+    use datafusion_common::{Result, ScalarValue, assert_contains, record_batch};
+    use datafusion_expr::Operator;
+    use datafusion_physical_expr::expressions::{Column, Literal, col, lit};
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use itertools::Itertools;
+    use std::sync::Arc;
+
+    fn create_test_schema() -> (Schema, Schema) {
+        let physical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, true),
+        ]);
+
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int64, false), // Different type
+            Field::new("b", DataType::Utf8, true),
+            Field::new("c", DataType::Float64, true), // Missing from physical
+        ]);
+
+        (physical_schema, logical_schema)
+    }
+
+    #[test]
+    fn test_rewrite_column_with_type_cast() {
+        let (physical_schema, logical_schema) = create_test_schema();
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("a", 0));
+
+        let result = adapter.rewrite(column_expr).unwrap();
+
+        // Should be wrapped in a cast expression
+        assert!(result.as_any().downcast_ref::<CastColumnExpr>().is_some());
+    }
+
+    #[test]
+    fn test_rewrite_column_with_metadata_or_nullability_mismatch() -> Result<()> {
+        use std::collections::HashMap;
+
+        let physical_schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let logical_schema =
+            Schema::new(vec![Field::new("a", DataType::Int64, false).with_metadata(
+                HashMap::from([("logical_meta".to_string(), "1".to_string())]),
+            )]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema.clone()))
+            .unwrap();
+
+        let result = adapter.rewrite(Arc::new(Column::new("a", 0)))?;
+        let cast = result
+            .as_any()
+            .downcast_ref::<CastColumnExpr>()
+            .expect("Expected CastColumnExpr");
+
+        assert_eq!(cast.target_field().data_type(), &DataType::Int64);
+        assert!(!cast.target_field().is_nullable());
+        assert_eq!(
+            cast.target_field()
+                .metadata()
+                .get("logical_meta")
+                .map(String::as_str),
+            Some("1")
+        );
+
+        // Ensure the expression reports the logical nullability regardless of input schema
+        assert!(!result.nullable(physical_schema.as_ref())?);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_rewrite_multi_column_expr_with_type_cast() {
+        let (physical_schema, logical_schema) = create_test_schema();
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+
+        // Create a complex expression: (a + 5) OR (c > 0.0) that tests the recursive case of the rewriter
+        let column_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
+        let column_c = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
+        let expr = expressions::BinaryExpr::new(
+            Arc::clone(&column_a),
+            Operator::Plus,
+            Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
+        );
+        let expr = expressions::BinaryExpr::new(
+            Arc::new(expr),
+            Operator::Or,
+            Arc::new(expressions::BinaryExpr::new(
+                Arc::clone(&column_c),
+                Operator::Gt,
+                Arc::new(Literal::new(ScalarValue::Float64(Some(0.0)))),
+            )),
+        );
+
+        let result = adapter.rewrite(Arc::new(expr)).unwrap();
+        println!("Rewritten expression: {result}");
+
+        let expected = expressions::BinaryExpr::new(
+            Arc::new(CastColumnExpr::new(
+                Arc::new(Column::new("a", 0)),
+                Arc::new(Field::new("a", DataType::Int32, false)),
+                Arc::new(Field::new("a", DataType::Int64, false)),
+                None,
+            )),
+            Operator::Plus,
+            Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
+        );
+        let expected = Arc::new(expressions::BinaryExpr::new(
+            Arc::new(expected),
+            Operator::Or,
+            Arc::new(expressions::BinaryExpr::new(
+                lit(ScalarValue::Float64(None)), // c is missing, so it becomes null
+                Operator::Gt,
+                Arc::new(Literal::new(ScalarValue::Float64(Some(0.0)))),
+            )),
+        )) as Arc<dyn PhysicalExpr>;
+
+        assert_eq!(
+            result.to_string(),
+            expected.to_string(),
+            "The rewritten expression did not match the expected output"
+        );
+    }
+
+    #[test]
+    fn test_rewrite_struct_column_incompatible() {
+        let physical_schema = Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(vec![Field::new("field1", DataType::Binary, true)].into()),
+            true,
+        )]);
+
+        let logical_schema = Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(vec![Field::new("field1", DataType::Int32, true)].into()),
+            true,
+        )]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("data", 0));
+
+        let error_msg = adapter.rewrite(column_expr).unwrap_err().to_string();
+        // validate_struct_compatibility provides more specific error about which field can't be cast
+        assert_contains!(
+            error_msg,
+            "Cannot cast struct field 'field1' from type Binary to type Int32"
+        );
+    }
+
+    #[test]
+    fn test_rewrite_struct_compatible_cast() {
+        let physical_schema = Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(
+                vec![
+                    Field::new("id", DataType::Int32, false),
+                    Field::new("name", DataType::Utf8, true),
+                ]
+                .into(),
+            ),
+            false,
+        )]);
+
+        let logical_schema = Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(
+                vec![
+                    Field::new("id", DataType::Int64, false),
+                    Field::new("name", DataType::Utf8View, true),
+                ]
+                .into(),
+            ),
+            false,
+        )]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("data", 0));
+
+        let result = adapter.rewrite(column_expr).unwrap();
+
+        let physical_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]
+        .into();
+        let physical_field = Arc::new(Field::new(
+            "data",
+            DataType::Struct(physical_struct_fields),
+            false,
+        ));
+
+        let logical_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8View, true),
+        ]
+        .into();
+        let logical_field = Arc::new(Field::new(
+            "data",
+            DataType::Struct(logical_struct_fields),
+            false,
+        ));
+
+        let expected = Arc::new(CastColumnExpr::new(
+            Arc::new(Column::new("data", 0)),
+            physical_field,
+            logical_field,
+            None,
+        )) as Arc<dyn PhysicalExpr>;
+
+        assert_eq!(result.to_string(), expected.to_string());
+    }
+
+    #[test]
+    fn test_rewrite_missing_column() -> Result<()> {
+        let (physical_schema, logical_schema) = create_test_schema();
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("c", 2));
+
+        let result = adapter.rewrite(column_expr)?;
+
+        // Should be replaced with a literal null
+        if let Some(literal) = result.as_any().downcast_ref::<Literal>() {
+            assert_eq!(*literal.value(), ScalarValue::Float64(None));
+        } else {
+            panic!("Expected literal expression");
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_rewrite_missing_column_propagates_metadata() -> Result<()> {
+        use std::collections::HashMap;
+
+        let physical_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, true).with_metadata(HashMap::from([(
+                "logical_meta".to_string(),
+                "1".to_string(),
+            )])),
+        ]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema.clone()))
+            .unwrap();
+
+        let result = adapter.rewrite(Arc::new(Column::new("b", 1)))?;
+        let literal = result
+            .as_any()
+            .downcast_ref::<Literal>()
+            .expect("Expected literal expression");
+
+        assert_eq!(
+            literal
+                .return_field(physical_schema.as_ref())?
+                .metadata()
+                .get("logical_meta")
+                .map(String::as_str),
+            Some("1")
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_rewrite_missing_column_non_nullable_error() {
+        let physical_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Utf8, false), // Missing and non-nullable
+        ]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("b", 1));
+
+        let error_msg = adapter.rewrite(column_expr).unwrap_err().to_string();
+        assert_contains!(error_msg, "Non-nullable column 'b' is missing");
+    }
+
+    #[test]
+    fn test_rewrite_missing_column_nullable() {
+        let physical_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Utf8, true), // Missing but nullable
+        ]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("b", 1));
+
+        let result = adapter.rewrite(column_expr).unwrap();
+
+        let expected =
+            Arc::new(Literal::new(ScalarValue::Utf8(None))) as Arc<dyn PhysicalExpr>;
+
+        assert_eq!(result.to_string(), expected.to_string());
+    }
+
+    #[test]
+    fn test_replace_columns_with_literals() -> Result<()> {
+        let partition_value = ScalarValue::Utf8(Some("test_value".to_string()));
+        let replacements = HashMap::from([("partition_col", &partition_value)]);
+
+        let column_expr =
+            Arc::new(Column::new("partition_col", 0)) as Arc<dyn PhysicalExpr>;
+        let result = replace_columns_with_literals(column_expr, &replacements)?;
+
+        // Should be replaced with the partition value
+        let literal = result
+            .as_any()
+            .downcast_ref::<Literal>()
+            .expect("Expected literal expression");
+        assert_eq!(*literal.value(), partition_value);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_replace_columns_with_literals_no_match() -> Result<()> {
+        let value = ScalarValue::Utf8(Some("test_value".to_string()));
+        let replacements = HashMap::from([("other_col", &value)]);
+
+        let column_expr =
+            Arc::new(Column::new("partition_col", 0)) as Arc<dyn PhysicalExpr>;
+        let result = replace_columns_with_literals(column_expr, &replacements)?;
+
+        assert!(result.as_any().downcast_ref::<Column>().is_some());
+        Ok(())
+    }
+
+    #[test]
+    fn test_replace_columns_with_literals_nested_expr() -> Result<()> {
+        let value_a = ScalarValue::Int64(Some(10));
+        let value_b = ScalarValue::Int64(Some(20));
+        let replacements = HashMap::from([("a", &value_a), ("b", &value_b)]);
+
+        let expr = Arc::new(expressions::BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Plus,
+            Arc::new(Column::new("b", 1)),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let result = replace_columns_with_literals(expr, &replacements)?;
+        assert_eq!(result.to_string(), "10 + 20");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_rewrite_no_change_needed() -> Result<()> {
+        let (physical_schema, logical_schema) = create_test_schema();
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
+
+        let result = adapter.rewrite(Arc::clone(&column_expr))?;
+
+        // Should be the same expression (no transformation needed)
+        // We compare the underlying pointer through the trait object
+        assert!(std::ptr::eq(
+            column_expr.as_ref() as *const dyn PhysicalExpr,
+            result.as_ref() as *const dyn PhysicalExpr
+        ));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_non_nullable_missing_column_error() {
+        let physical_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false), // Non-nullable missing column
+        ]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+        let column_expr = Arc::new(Column::new("b", 1));
+
+        let result = adapter.rewrite(column_expr);
+        assert!(result.is_err());
+        assert_contains!(
+            result.unwrap_err().to_string(),
+            "Non-nullable column 'b' is missing from the physical schema"
+        );
+    }
+
+    /// Helper function to project expressions onto a RecordBatch
+    fn batch_project(
+        expr: Vec<Arc<dyn PhysicalExpr>>,
+        batch: &RecordBatch,
+        schema: SchemaRef,
+    ) -> Result<RecordBatch> {
+        let arrays = expr
+            .iter()
+            .map(|expr| {
+                expr.evaluate(batch)
+                    .and_then(|v| v.into_array(batch.num_rows()))
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        if arrays.is_empty() {
+            let options =
+                RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
+            RecordBatch::try_new_with_options(Arc::clone(&schema), arrays, &options)
+                .map_err(Into::into)
+        } else {
+            RecordBatch::try_new(Arc::clone(&schema), arrays).map_err(Into::into)
+        }
+    }
+
+    /// Example showing how we can use the `DefaultPhysicalExprAdapter` to adapt RecordBatches during a scan
+    /// to apply projections, type conversions and handling of missing columns all at once.
+    #[test]
+    fn test_adapt_batches() {
+        let physical_batch = record_batch!(
+            ("a", Int32, vec![Some(1), None, Some(3)]),
+            ("extra", Utf8, vec![Some("x"), Some("y"), None])
+        )
+        .unwrap();
+
+        let physical_schema = physical_batch.schema();
+
+        let logical_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, true), // Different type
+            Field::new("b", DataType::Utf8, true),  // Missing from physical
+        ]));
+
+        let projection = vec![
+            col("b", &logical_schema).unwrap(),
+            col("a", &logical_schema).unwrap(),
+        ];
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::clone(&logical_schema), Arc::clone(&physical_schema))
+            .unwrap();
+
+        let adapted_projection = projection
+            .into_iter()
+            .map(|expr| adapter.rewrite(expr).unwrap())
+            .collect_vec();
+
+        let adapted_schema = Arc::new(Schema::new(
+            adapted_projection
+                .iter()
+                .map(|expr| expr.return_field(&physical_schema).unwrap())
+                .collect_vec(),
+        ));
+
+        let res = batch_project(
+            adapted_projection,
+            &physical_batch,
+            Arc::clone(&adapted_schema),
+        )
+        .unwrap();
+
+        assert_eq!(res.num_columns(), 2);
+        assert_eq!(res.column(0).data_type(), &DataType::Utf8);
+        assert_eq!(res.column(1).data_type(), &DataType::Int64);
+        assert_eq!(
+            res.column(0)
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .unwrap()
+                .iter()
+                .collect_vec(),
+            vec![None, None, None]
+        );
+        assert_eq!(
+            res.column(1)
+                .as_any()
+                .downcast_ref::<Int64Array>()
+                .unwrap()
+                .iter()
+                .collect_vec(),
+            vec![Some(1), None, Some(3)]
+        );
+    }
+
+    /// Test that struct columns are properly adapted including:
+    /// - Type casting of subfields (Int32 -> Int64, Utf8 -> Utf8View)
+    /// - Missing fields in logical schema are filled with nulls
+    #[test]
+    fn test_adapt_struct_batches() {
+        // Physical struct: {id: Int32, name: Utf8}
+        let physical_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]
+        .into();
+
+        let struct_array = StructArray::new(
+            physical_struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])) as _,
+                Arc::new(StringArray::from(vec![
+                    Some("alice"),
+                    None,
+                    Some("charlie"),
+                ])) as _,
+            ],
+            None,
+        );
+
+        let physical_schema = Arc::new(Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(physical_struct_fields),
+            false,
+        )]));
+
+        let physical_batch = RecordBatch::try_new(
+            Arc::clone(&physical_schema),
+            vec![Arc::new(struct_array)],
+        )
+        .unwrap();
+
+        // Logical struct: {id: Int64, name: Utf8View, extra: Boolean}
+        // - id: cast from Int32 to Int64
+        // - name: cast from Utf8 to Utf8View
+        // - extra: missing from physical, should be filled with nulls
+        let logical_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8View, true),
+            Field::new("extra", DataType::Boolean, true), // New field, not in physical
+        ]
+        .into();
+
+        let logical_schema = Arc::new(Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(logical_struct_fields),
+            false,
+        )]));
+
+        let projection = vec![col("data", &logical_schema).unwrap()];
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::clone(&logical_schema), Arc::clone(&physical_schema))
+            .unwrap();
+
+        let adapted_projection = projection
+            .into_iter()
+            .map(|expr| adapter.rewrite(expr).unwrap())
+            .collect_vec();
+
+        let adapted_schema = Arc::new(Schema::new(
+            adapted_projection
+                .iter()
+                .map(|expr| expr.return_field(&physical_schema).unwrap())
+                .collect_vec(),
+        ));
+
+        let res = batch_project(
+            adapted_projection,
+            &physical_batch,
+            Arc::clone(&adapted_schema),
+        )
+        .unwrap();
+
+        assert_eq!(res.num_columns(), 1);
+
+        let result_struct = res
+            .column(0)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+
+        // Verify id field is cast to Int64
+        let id_col = result_struct.column_by_name("id").unwrap();
+        assert_eq!(id_col.data_type(), &DataType::Int64);
+        let id_values = id_col.as_any().downcast_ref::<Int64Array>().unwrap();
+        assert_eq!(
+            id_values.iter().collect_vec(),
+            vec![Some(1), Some(2), Some(3)]
+        );
+
+        // Verify name field is cast to Utf8View
+        let name_col = result_struct.column_by_name("name").unwrap();
+        assert_eq!(name_col.data_type(), &DataType::Utf8View);
+        let name_values = name_col.as_any().downcast_ref::<StringViewArray>().unwrap();
+        assert_eq!(
+            name_values.iter().collect_vec(),
+            vec![Some("alice"), None, Some("charlie")]
+        );
+
+        // Verify extra field (missing from physical) is filled with nulls
+        let extra_col = result_struct.column_by_name("extra").unwrap();
+        assert_eq!(extra_col.data_type(), &DataType::Boolean);
+        let extra_values = extra_col.as_any().downcast_ref::<BooleanArray>().unwrap();
+        assert_eq!(extra_values.iter().collect_vec(), vec![None, None, None]);
+    }
+
+    #[test]
+    fn test_try_rewrite_struct_field_access() {
+        // Test the core logic of try_rewrite_struct_field_access
+        let physical_schema = Schema::new(vec![Field::new(
+            "struct_col",
+            DataType::Struct(
+                vec![Field::new("existing_field", DataType::Int32, true)].into(),
+            ),
+            true,
+        )]);
+
+        let logical_schema = Schema::new(vec![Field::new(
+            "struct_col",
+            DataType::Struct(
+                vec![
+                    Field::new("existing_field", DataType::Int32, true),
+                    Field::new("missing_field", DataType::Utf8, true),
+                ]
+                .into(),
+            ),
+            true,
+        )]);
+
+        let rewriter = DefaultPhysicalExprAdapterRewriter {
+            logical_file_schema: Arc::new(logical_schema),
+            physical_file_schema: Arc::new(physical_schema),
+        };
+
+        // Test that when a field exists in physical schema, it returns None
+        let column = Arc::new(Column::new("struct_col", 0)) as Arc<dyn PhysicalExpr>;
+        let result = rewriter.try_rewrite_struct_field_access(&column).unwrap();
+        assert!(result.is_none());
+
+        // The actual test for the get_field expression would require creating a proper ScalarFunctionExpr
+        // with ScalarUDF, which is complex to set up in a unit test. The integration tests in
+        // datafusion/core/tests/parquet/schema_adapter.rs provide better coverage for this functionality.
+    }
+
+    // ============================================================================
+    // BatchAdapterFactory and BatchAdapter tests
+    // ============================================================================
+
+    #[test]
+    fn test_batch_adapter_factory_basic() {
+        // Target schema
+        let target_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Utf8, true),
+        ]));
+
+        // Source schema with different column order and type
+        let source_schema = Arc::new(Schema::new(vec![
+            Field::new("b", DataType::Utf8, true),
+            Field::new("a", DataType::Int32, false), // Int32 -> Int64
+        ]));
+
+        let factory = BatchAdapterFactory::new(Arc::clone(&target_schema));
+        let adapter = factory.make_adapter(&source_schema).unwrap();
+
+        // Create source batch
+        let source_batch = RecordBatch::try_new(
+            Arc::clone(&source_schema),
+            vec![
+                Arc::new(StringArray::from(vec![Some("hello"), None, Some("world")])),
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+            ],
+        )
+        .unwrap();
+
+        let adapted = adapter.adapt_batch(&source_batch).unwrap();
+
+        // Verify schema matches target
+        assert_eq!(adapted.num_columns(), 2);
+        assert_eq!(adapted.schema().field(0).name(), "a");
+        assert_eq!(adapted.schema().field(0).data_type(), &DataType::Int64);
+        assert_eq!(adapted.schema().field(1).name(), "b");
+        assert_eq!(adapted.schema().field(1).data_type(), &DataType::Utf8);
+
+        // Verify data
+        let col_a = adapted
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        assert_eq!(col_a.iter().collect_vec(), vec![Some(1), Some(2), Some(3)]);
+
+        let col_b = adapted
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!(
+            col_b.iter().collect_vec(),
+            vec![Some("hello"), None, Some("world")]
+        );
+    }
+
+    #[test]
+    fn test_batch_adapter_factory_missing_column() {
+        // Target schema with a column missing from source
+        let target_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, true), // exists in source
+            Field::new("c", DataType::Float64, true), // missing from source
+        ]));
+
+        let source_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, true),
+        ]));
+
+        let factory = BatchAdapterFactory::new(Arc::clone(&target_schema));
+        let adapter = factory.make_adapter(&source_schema).unwrap();
+
+        let source_batch = RecordBatch::try_new(
+            Arc::clone(&source_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2])),
+                Arc::new(StringArray::from(vec!["x", "y"])),
+            ],
+        )
+        .unwrap();
+
+        let adapted = adapter.adapt_batch(&source_batch).unwrap();
+
+        assert_eq!(adapted.num_columns(), 3);
+
+        // Missing column should be filled with nulls
+        let col_c = adapted.column(2);
+        assert_eq!(col_c.data_type(), &DataType::Float64);
+        assert_eq!(col_c.null_count(), 2); // All nulls
+    }
+
+    #[test]
+    fn test_batch_adapter_factory_with_struct() {
+        // Target has struct with Int64 id
+        let target_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("name", DataType::Utf8, true),
+        ]
+        .into();
+        let target_schema = Arc::new(Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(target_struct_fields),
+            false,
+        )]));
+
+        // Source has struct with Int32 id
+        let source_struct_fields: Fields = vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]
+        .into();
+        let source_schema = Arc::new(Schema::new(vec![Field::new(
+            "data",
+            DataType::Struct(source_struct_fields.clone()),
+            false,
+        )]));
+
+        let struct_array = StructArray::new(
+            source_struct_fields,
+            vec![
+                Arc::new(Int32Array::from(vec![10, 20])) as _,
+                Arc::new(StringArray::from(vec!["a", "b"])) as _,
+            ],
+            None,
+        );
+
+        let source_batch = RecordBatch::try_new(
+            Arc::clone(&source_schema),
+            vec![Arc::new(struct_array)],
+        )
+        .unwrap();
+
+        let factory = BatchAdapterFactory::new(Arc::clone(&target_schema));
+        let adapter = factory.make_adapter(&source_schema).unwrap();
+        let adapted = adapter.adapt_batch(&source_batch).unwrap();
+
+        let result_struct = adapted
+            .column(0)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+
+        // Verify id was cast to Int64
+        let id_col = result_struct.column_by_name("id").unwrap();
+        assert_eq!(id_col.data_type(), &DataType::Int64);
+        let id_values = id_col.as_any().downcast_ref::<Int64Array>().unwrap();
+        assert_eq!(id_values.iter().collect_vec(), vec![Some(10), Some(20)]);
+    }
+
+    #[test]
+    fn test_batch_adapter_factory_identity() {
+        // When source and target schemas are identical, should pass through efficiently
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, true),
+        ]));
+
+        let factory = BatchAdapterFactory::new(Arc::clone(&schema));
+        let adapter = factory.make_adapter(&schema).unwrap();
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StringArray::from(vec!["a", "b", "c"])),
+            ],
+        )
+        .unwrap();
+
+        let adapted = adapter.adapt_batch(&batch).unwrap();
+
+        assert_eq!(adapted.num_columns(), 2);
+        assert_eq!(adapted.schema().field(0).data_type(), &DataType::Int32);
+        assert_eq!(adapted.schema().field(1).data_type(), &DataType::Utf8);
+    }
+
+    #[test]
+    fn test_batch_adapter_factory_reuse() {
+        // Factory can create multiple adapters for different source schemas
+        let target_schema = Arc::new(Schema::new(vec![
+            Field::new("x", DataType::Int64, false),
+            Field::new("y", DataType::Utf8, true),
+        ]));
+
+        let factory = BatchAdapterFactory::new(Arc::clone(&target_schema));
+
+        // First source schema
+        let source1 = Arc::new(Schema::new(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, true),
+        ]));
+        let adapter1 = factory.make_adapter(&source1).unwrap();
+
+        // Second source schema (different order)
+        let source2 = Arc::new(Schema::new(vec![
+            Field::new("y", DataType::Utf8, true),
+            Field::new("x", DataType::Int64, false),
+        ]));
+        let adapter2 = factory.make_adapter(&source2).unwrap();
+
+        // Both should work correctly
+        assert!(format!("{adapter1:?}").contains("BatchAdapter"));
+        assert!(format!("{adapter2:?}").contains("BatchAdapter"));
+    }
+
+    #[test]
+    fn test_rewrite_column_index_and_type_mismatch() {
+        let physical_schema = Schema::new(vec![
+            Field::new("b", DataType::Utf8, true),
+            Field::new("a", DataType::Int32, false), // Index 1
+        ]);
+
+        let logical_schema = Schema::new(vec![
+            Field::new("a", DataType::Int64, false), // Index 0, Different Type
+            Field::new("b", DataType::Utf8, true),
+        ]);
+
+        let factory = DefaultPhysicalExprAdapterFactory;
+        let adapter = factory
+            .create(Arc::new(logical_schema), Arc::new(physical_schema))
+            .unwrap();
+
+        // Logical column "a" is at index 0
+        let column_expr = Arc::new(Column::new("a", 0));
+
+        let result = adapter.rewrite(column_expr).unwrap();
+
+        // Should be a CastColumnExpr
+        let cast_expr = result
+            .as_any()
+            .downcast_ref::<CastColumnExpr>()
+            .expect("Expected CastColumnExpr");
+
+        // Verify the inner column points to the correct physical index (1)
+        let inner_col = cast_expr
+            .expr()
+            .as_any()
+            .downcast_ref::<Column>()
+            .expect("Expected inner Column");
+        assert_eq!(inner_col.name(), "a");
+        assert_eq!(inner_col.index(), 1); // Physical index is 1
+
+        // Verify cast types
+        assert_eq!(
+            cast_expr.data_type(&Schema::empty()).unwrap(),
+            DataType::Int64
+        );
+    }
+
+    #[test]
+    fn test_create_cast_column_expr_uses_name_lookup_not_column_index() {
+        // Physical schema has column `a` at index 1; index 0 is an incompatible type.
+        let physical_schema = Arc::new(Schema::new(vec![
+            Field::new("b", DataType::Binary, true),
+            Field::new("a", DataType::Int32, false),
+        ]));
+
+        let logical_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Binary, true),
+        ]));
+
+        let rewriter = DefaultPhysicalExprAdapterRewriter {
+            logical_file_schema: Arc::clone(&logical_schema),
+            physical_file_schema: Arc::clone(&physical_schema),
+        };
+
+        // Deliberately provide the wrong index for column `a`.
+        // Regression: this must still resolve against physical field `a` by name.
+        let transformed = rewriter
+            .create_cast_column_expr(
+                Column::new("a", 0),
+                Arc::new(physical_schema.field_with_name("a").unwrap().clone()),
+                logical_schema.field_with_name("a").unwrap(),
+            )
+            .unwrap();
+
+        let cast_expr = transformed
+            .data
+            .as_any()
+            .downcast_ref::<CastColumnExpr>()
+            .expect("Expected CastColumnExpr");
+
+        assert_eq!(cast_expr.input_field().name(), "a");
+        assert_eq!(cast_expr.input_field().data_type(), &DataType::Int32);
+        assert_eq!(cast_expr.target_field().data_type(), &DataType::Int64);
+    }
+}
diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml
index a5a12b5527b7d..119be362348d5 100644
--- a/datafusion/physical-expr-common/Cargo.toml
+++ b/datafusion/physical-expr-common/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -38,9 +41,19 @@ workspace = true
 name = "datafusion_physical_expr_common"
 
 [dependencies]
-ahash = { workspace = true }
 arrow = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
+chrono = { workspace = true }
+datafusion-common = { workspace = true }
 datafusion-expr-common = { workspace = true }
 hashbrown = { workspace = true }
+indexmap = { workspace = true }
 itertools = { workspace = true }
+parking_lot = { workspace = true }
+
+[dev-dependencies]
+criterion = { workspace = true }
+rand = { workspace = true }
+
+[[bench]]
+harness = false
+name = "compare_nested"
diff --git a/datafusion/physical-expr-common/README.md b/datafusion/physical-expr-common/README.md
index 7a1eff77d3b4f..c318e7468183f 100644
--- a/datafusion/physical-expr-common/README.md
+++ b/datafusion/physical-expr-common/README.md
@@ -17,11 +17,19 @@
   under the License.
 -->
 
-# DataFusion Core Physical Expressions
+# Apache DataFusion Core Physical Expressions
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides shared APIs for implementing
-physical expressions such as `PhysicalExpr` and `PhysicalSortExpr`.
+physical expressions such as [`PhysicalExpr`] and [`PhysicalSortExpr`].
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
+[`physicalexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/trait.PhysicalExpr.html
+[`physicalsortexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/struct.PhysicalSortExpr.html
diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs
new file mode 100644
index 0000000000000..56c122fef9420
--- /dev/null
+++ b/datafusion/physical-expr-common/benches/compare_nested.rs
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray};
+use arrow::datatypes::{DataType, Field, Fields};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_expr_common::operator::Operator;
+use datafusion_physical_expr_common::datum::compare_op_for_nested;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Build a StructArray with fields {x: Int32, y: Utf8}.
+fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef {
+    let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::<i32>())).collect();
+
+    let strings: StringArray = (0..num_rows)
+        .map(|_| {
+            let s: String = (0..12)
+                .map(|_| rng.random_range(b'a'..=b'z') as char)
+                .collect();
+            Some(s)
+        })
+        .collect();
+
+    let fields = Fields::from(vec![
+        Field::new("x", DataType::Int32, false),
+        Field::new("y", DataType::Utf8, false),
+    ]);
+
+    Arc::new(
+        StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None)
+            .unwrap(),
+    )
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let num_rows = 8192;
+    let mut rng = StdRng::seed_from_u64(42);
+
+    let lhs = make_struct_array(num_rows, &mut rng);
+    let rhs_array = make_struct_array(num_rows, &mut rng);
+    let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng));
+
+    c.bench_function("compare_nested array_array", |b| {
+        b.iter(|| {
+            black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap())
+        })
+    });
+
+    c.bench_function("compare_nested array_scalar", |b| {
+        b.iter(|| {
+            black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap())
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/physical-expr-common/src/binary_map.rs b/datafusion/physical-expr-common/src/binary_map.rs
index b37d9a7773eeb..ad184d6500d56 100644
--- a/datafusion/physical-expr-common/src/binary_map.rs
+++ b/datafusion/physical-expr-common/src/binary_map.rs
@@ -18,15 +18,15 @@
 //! [`ArrowBytesMap`] and [`ArrowBytesSet`] for storing maps/sets of values from
 //! StringArray / LargeStringArray / BinaryArray / LargeBinaryArray.
 
-use ahash::RandomState;
 use arrow::array::{
-    cast::AsArray,
-    types::{ByteArrayType, GenericBinaryType, GenericStringType},
     Array, ArrayRef, BufferBuilder, GenericBinaryArray, GenericStringArray,
     NullBufferBuilder, OffsetSizeTrait,
+    cast::AsArray,
+    types::{ByteArrayType, GenericBinaryType, GenericStringType},
 };
 use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
 use arrow::datatypes::DataType;
+use datafusion_common::hash_utils::RandomState;
 use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
 use std::any::type_name;
@@ -250,7 +250,7 @@ where
             map_size: 0,
             buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY),
             offsets: vec![O::default()], // first offset is always 0
-            random_state: RandomState::new(),
+            random_state: RandomState::default(),
             hashes_buffer: vec![],
             null: None,
         }
@@ -349,7 +349,7 @@ where
         let batch_hashes = &mut self.hashes_buffer;
         batch_hashes.clear();
         batch_hashes.resize(values.len(), 0);
-        create_hashes(&[Arc::clone(values)], &self.random_state, batch_hashes)
+        create_hashes([values], &self.random_state, batch_hashes)
             // hash is supported for all types and create_hashes only
             // returns errors for unsupported types
             .unwrap();
@@ -389,7 +389,7 @@ where
                 // is value is already present in the set?
                 let entry = self.map.find_mut(hash, |header| {
                     // compare value if hashes match
-                    if header.len != value_len {
+                    if header.hash != hash || header.len != value_len {
                         return false;
                     }
                     // value is stored inline so no need to consult buffer
@@ -427,7 +427,7 @@ where
                 // Check if the value is already present in the set
                 let entry = self.map.find_mut(hash, |header| {
                     // compare value if hashes match
-                    if header.len != value_len {
+                    if header.hash != hash {
                         return false;
                     }
                     // Need to compare the bytes in the buffer
diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs
index 7ce943030a453..89a97e18bebf0 100644
--- a/datafusion/physical-expr-common/src/binary_view_map.rs
+++ b/datafusion/physical-expr-common/src/binary_view_map.rs
@@ -17,19 +17,19 @@
 
 //! [`ArrowBytesViewMap`] and [`ArrowBytesViewSet`] for storing maps/sets of values from
 //! `StringViewArray`/`BinaryViewArray`.
-//! Much of the code is from `binary_map.rs`, but with simpler implementation because we directly use the
-//! [`GenericByteViewBuilder`].
-use ahash::RandomState;
+use crate::binary_map::OutputType;
+use arrow::array::NullBufferBuilder;
 use arrow::array::cast::AsArray;
-use arrow::array::{Array, ArrayBuilder, ArrayRef, GenericByteViewBuilder};
+use arrow::array::{Array, ArrayRef, BinaryViewArray, ByteView, make_view};
+use arrow::buffer::{Buffer, ScalarBuffer};
 use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType};
+use datafusion_common::hash_utils::RandomState;
 use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
 use std::fmt::Debug;
+use std::mem::size_of;
 use std::sync::Arc;
 
-use crate::binary_map::OutputType;
-
 /// HashSet optimized for storing string or binary values that can produce that
 /// the final set as a `GenericBinaryViewArray` with minimal copies.
 #[derive(Debug)]
@@ -114,6 +114,9 @@ impl ArrowBytesViewSet {
 /// This map is used by the special `COUNT DISTINCT` aggregate function to
 /// store the distinct values, and by the `GROUP BY` operator to store
 /// group values when they are a single string array.
+/// Max size of the in-progress buffer before flushing to completed buffers
+const BYTE_VIEW_MAX_BLOCK_SIZE: usize = 2 * 1024 * 1024;
+
 pub struct ArrowBytesViewMap<V>
 where
     V: Debug + PartialEq + Eq + Clone + Copy + Default,
@@ -125,8 +128,15 @@ where
     /// Total size of the map in bytes
     map_size: usize,
 
-    /// Builder for output array
-    builder: GenericByteViewBuilder<BinaryViewType>,
+    /// Views for all stored values (in insertion order)
+    views: Vec<u128>,
+    /// In-progress buffer for out-of-line string data
+    in_progress: Vec<u8>,
+    /// Completed buffers containing string data
+    completed: Vec<Buffer>,
+    /// Tracks null values (true = null)
+    nulls: NullBufferBuilder,
+
     /// random state used to generate hashes
     random_state: RandomState,
     /// buffer that stores hash values (reused across batches to save allocations)
@@ -149,8 +159,11 @@ where
             output_type,
             map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY),
             map_size: 0,
-            builder: GenericByteViewBuilder::new(),
-            random_state: RandomState::new(),
+            views: Vec::new(),
+            in_progress: Vec::new(),
+            completed: Vec::new(),
+            nulls: NullBufferBuilder::new(0),
+            random_state: RandomState::default(),
             hashes_buffer: vec![],
             null: None,
         }
@@ -243,7 +256,7 @@ where
         let batch_hashes = &mut self.hashes_buffer;
         batch_hashes.clear();
         batch_hashes.resize(values.len(), 0);
-        create_hashes(&[Arc::clone(values)], &self.random_state, batch_hashes)
+        create_hashes([values], &self.random_state, batch_hashes)
             // hash is supported for all types and create_hashes only
             // returns errors for unsupported types
             .unwrap();
@@ -251,53 +264,92 @@ where
         // step 2: insert each value into the set, if not already present
         let values = values.as_byte_view::<B>();
 
+        // Get raw views buffer for direct comparison
+        let input_views = values.views();
+
         // Ensure lengths are equivalent
-        assert_eq!(values.len(), batch_hashes.len());
+        assert_eq!(values.len(), self.hashes_buffer.len());
+
+        for i in 0..values.len() {
+            let view_u128 = input_views[i];
+            let hash = self.hashes_buffer[i];
 
-        for (value, &hash) in values.iter().zip(batch_hashes.iter()) {
-            // handle null value
-            let Some(value) = value else {
+            // handle null value via validity bitmap check
+            if values.is_null(i) {
                 let payload = if let Some(&(payload, _offset)) = self.null.as_ref() {
                     payload
                 } else {
                     let payload = make_payload_fn(None);
-                    let null_index = self.builder.len();
-                    self.builder.append_null();
+                    let null_index = self.views.len();
+                    self.views.push(0);
+                    self.nulls.append_null();
                     self.null = Some((payload, null_index));
                     payload
                 };
                 observe_payload_fn(payload);
                 continue;
-            };
-
-            // get the value as bytes
-            let value: &[u8] = value.as_ref();
+            }
 
-            let entry = self.map.find_mut(hash, |header| {
-                let v = self.builder.get_value(header.view_idx);
+            // Extract length from the view (first 4 bytes of u128 in little-endian)
+            let len = view_u128 as u32;
 
-                if v.len() != value.len() {
-                    return false;
-                }
+            // Check if value already exists
+            let maybe_payload = {
+                // Borrow completed and in_progress for comparison
+                let completed = &self.completed;
+                let in_progress = &self.in_progress;
 
-                v == value
-            });
+                self.map
+                    .find(hash, |header| {
+                        if header.hash != hash {
+                            return false;
+                        }
+
+                        // Fast path: inline strings can be compared directly
+                        if len <= 12 {
+                            return header.view == view_u128;
+                        }
+
+                        // For larger strings: first compare the 4-byte prefix
+                        let stored_prefix = (header.view >> 32) as u32;
+                        let input_prefix = (view_u128 >> 32) as u32;
+                        if stored_prefix != input_prefix {
+                            return false;
+                        }
+
+                        // Prefix matched - compare full bytes
+                        let byte_view = ByteView::from(header.view);
+                        let stored_len = byte_view.length as usize;
+                        let buffer_index = byte_view.buffer_index as usize;
+                        let offset = byte_view.offset as usize;
+
+                        let stored_value = if buffer_index < completed.len() {
+                            &completed[buffer_index].as_slice()
+                                [offset..offset + stored_len]
+                        } else {
+                            &in_progress[offset..offset + stored_len]
+                        };
+                        let input_value: &[u8] = values.value(i).as_ref();
+                        stored_value == input_value
+                    })
+                    .map(|entry| entry.payload)
+            };
 
-            let payload = if let Some(entry) = entry {
-                entry.payload
+            let payload = if let Some(payload) = maybe_payload {
+                payload
             } else {
-                // no existing value, make a new one.
+                // no existing value, make a new one
+                let value: &[u8] = values.value(i).as_ref();
                 let payload = make_payload_fn(Some(value));
 
-                let inner_view_idx = self.builder.len();
+                // Create view pointing to our buffers
+                let new_view = self.append_value(value);
                 let new_header = Entry {
-                    view_idx: inner_view_idx,
+                    view: new_view,
                     hash,
                     payload,
                 };
 
-                self.builder.append_value(value);
-
                 self.map
                     .insert_accounted(new_header, |h| h.hash, &mut self.map_size);
                 payload
@@ -312,29 +364,58 @@ where
     ///
     /// The values are guaranteed to be returned in the same order in which
     /// they were first seen.
-    pub fn into_state(self) -> ArrayRef {
-        let mut builder = self.builder;
-        match self.output_type {
-            OutputType::BinaryView => {
-                let array = builder.finish();
+    pub fn into_state(mut self) -> ArrayRef {
+        // Flush any remaining in-progress buffer
+        if !self.in_progress.is_empty() {
+            let flushed = std::mem::take(&mut self.in_progress);
+            self.completed.push(Buffer::from_vec(flushed));
+        }
 
-                Arc::new(array)
-            }
+        // Build null buffer if we have any nulls
+        let null_buffer = self.nulls.finish();
+
+        let views = ScalarBuffer::from(self.views);
+        let array =
+            unsafe { BinaryViewArray::new_unchecked(views, self.completed, null_buffer) };
+
+        match self.output_type {
+            OutputType::BinaryView => Arc::new(array),
             OutputType::Utf8View => {
-                // SAFETY:
-                // we asserted the input arrays were all the correct type and
-                // thus since all the values that went in were valid (e.g. utf8)
-                // so are all the values that come out
-                let array = builder.finish();
+                // SAFETY: all input was valid utf8
                 let array = unsafe { array.to_string_view_unchecked() };
                 Arc::new(array)
             }
-            _ => {
-                unreachable!("Utf8/Binary should use `ArrowBytesMap`")
-            }
+            _ => unreachable!("Utf8/Binary should use `ArrowBytesMap`"),
         }
     }
 
+    /// Append a value to our buffers and return the view pointing to it
+    fn append_value(&mut self, value: &[u8]) -> u128 {
+        let len = value.len();
+        let view = if len <= 12 {
+            make_view(value, 0, 0)
+        } else {
+            // Ensure buffer is big enough
+            if self.in_progress.len() + len > BYTE_VIEW_MAX_BLOCK_SIZE {
+                let flushed = std::mem::replace(
+                    &mut self.in_progress,
+                    Vec::with_capacity(BYTE_VIEW_MAX_BLOCK_SIZE),
+                );
+                self.completed.push(Buffer::from_vec(flushed));
+            }
+
+            let buffer_index = self.completed.len() as u32;
+            let offset = self.in_progress.len() as u32;
+            self.in_progress.extend_from_slice(value);
+
+            make_view(value, buffer_index, offset)
+        };
+
+        self.views.push(view);
+        self.nulls.append_non_null();
+        view
+    }
+
     /// Total number of entries (including null, if present)
     pub fn len(&self) -> usize {
         self.non_null_len() + self.null.map(|_| 1).unwrap_or(0)
@@ -353,8 +434,16 @@ where
     /// Return the total size, in bytes, of memory used to store the data in
     /// this set, not including `self`
     pub fn size(&self) -> usize {
+        let views_size = self.views.len() * size_of::<u128>();
+        let in_progress_size = self.in_progress.capacity();
+        let completed_size: usize = self.completed.iter().map(|b| b.len()).sum();
+        let nulls_size = self.nulls.allocated_size();
+
         self.map_size
-            + self.builder.allocated_size()
+            + views_size
+            + in_progress_size
+            + completed_size
+            + nulls_size
             + self.hashes_buffer.allocated_size()
     }
 }
@@ -367,7 +456,8 @@ where
         f.debug_struct("ArrowBytesMap")
             .field("map", &"<map>")
             .field("map_size", &self.map_size)
-            .field("view_builder", &self.builder)
+            .field("views_len", &self.views.len())
+            .field("completed_buffers", &self.completed.len())
             .field("random_state", &self.random_state)
             .field("hashes_buffer", &self.hashes_buffer)
             .finish()
@@ -375,13 +465,20 @@ where
 }
 
 /// Entry in the hash table -- see [`ArrowBytesViewMap`] for more details
+///
+/// Stores the view pointing to our internal buffers, eliminating the need
+/// for a separate builder index. For inline strings (<=12 bytes), the view
+/// contains the entire value. For out-of-line strings, the view contains
+/// buffer_index and offset pointing directly to our storage.
 #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
 struct Entry<V>
 where
     V: Debug + PartialEq + Eq + Clone + Copy + Default,
 {
-    /// The idx into the views array
-    view_idx: usize,
+    /// The u128 view pointing to our internal buffers. For inline strings,
+    /// this contains the complete value. For larger strings, this contains
+    /// the buffer_index/offset into our completed/in_progress buffers.
+    view: u128,
 
     hash: u64,
 
diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs
index 233deff758c7b..bd5790507f662 100644
--- a/datafusion/physical-expr-common/src/datum.rs
+++ b/datafusion/physical-expr-common/src/datum.rs
@@ -16,13 +16,15 @@
 // under the License.
 
 use arrow::array::BooleanArray;
-use arrow::array::{make_comparator, ArrayRef, Datum};
-use arrow::buffer::NullBuffer;
-use arrow::compute::SortOptions;
+use arrow::array::{ArrayRef, Datum, make_comparator};
+use arrow::buffer::{BooleanBuffer, NullBuffer};
+use arrow::compute::kernels::cmp::{
+    distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct,
+};
+use arrow::compute::{SortOptions, ilike, like, nilike, nlike};
 use arrow::error::ArrowError;
-use datafusion_common::DataFusionError;
-use datafusion_common::{arrow_datafusion_err, internal_err};
 use datafusion_common::{Result, ScalarValue};
+use datafusion_common::{arrow_datafusion_err, assert_or_internal_err, internal_err};
 use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_expr_common::operator::Operator;
 use std::sync::Arc;
@@ -53,39 +55,67 @@ pub fn apply(
     }
 }
 
-/// Applies a binary [`Datum`] comparison kernel `f` to `lhs` and `rhs`
+/// Applies a binary [`Datum`] comparison operator `op` to `lhs` and `rhs`
 pub fn apply_cmp(
+    op: Operator,
     lhs: &ColumnarValue,
     rhs: &ColumnarValue,
-    f: impl Fn(&dyn Datum, &dyn Datum) -> Result<BooleanArray, ArrowError>,
 ) -> Result<ColumnarValue> {
-    apply(lhs, rhs, |l, r| Ok(Arc::new(f(l, r)?)))
+    if lhs.data_type().is_nested() {
+        apply_cmp_for_nested(op, lhs, rhs)
+    } else {
+        let f = match op {
+            Operator::Eq => eq,
+            Operator::NotEq => neq,
+            Operator::Lt => lt,
+            Operator::LtEq => lt_eq,
+            Operator::Gt => gt,
+            Operator::GtEq => gt_eq,
+            Operator::IsDistinctFrom => distinct,
+            Operator::IsNotDistinctFrom => not_distinct,
+
+            Operator::LikeMatch => like,
+            Operator::ILikeMatch => ilike,
+            Operator::NotLikeMatch => nlike,
+            Operator::NotILikeMatch => nilike,
+
+            _ => {
+                return internal_err!("Invalid compare operator: {}", op);
+            }
+        };
+
+        apply(lhs, rhs, |l, r| Ok(Arc::new(f(l, r)?)))
+    }
 }
 
-/// Applies a binary [`Datum`] comparison kernel `f` to `lhs` and `rhs` for nested type like
+/// Applies a binary [`Datum`] comparison operator `op` to `lhs` and `rhs` for nested type like
 /// List, FixedSizeList, LargeList, Struct, Union, Map, or a dictionary of a nested type
 pub fn apply_cmp_for_nested(
     op: Operator,
     lhs: &ColumnarValue,
     rhs: &ColumnarValue,
 ) -> Result<ColumnarValue> {
-    if matches!(
-        op,
-        Operator::Eq
-            | Operator::NotEq
-            | Operator::Lt
-            | Operator::Gt
-            | Operator::LtEq
-            | Operator::GtEq
-            | Operator::IsDistinctFrom
-            | Operator::IsNotDistinctFrom
-    ) {
-        apply(lhs, rhs, |l, r| {
-            Ok(Arc::new(compare_op_for_nested(op, l, r)?))
-        })
-    } else {
-        internal_err!("invalid operator for nested")
-    }
+    let left_data_type = lhs.data_type();
+    let right_data_type = rhs.data_type();
+
+    assert_or_internal_err!(
+        matches!(
+            op,
+            Operator::Eq
+                | Operator::NotEq
+                | Operator::Lt
+                | Operator::Gt
+                | Operator::LtEq
+                | Operator::GtEq
+                | Operator::IsDistinctFrom
+                | Operator::IsNotDistinctFrom
+        ) && left_data_type.equals_datatype(&right_data_type),
+        "invalid operator or data type mismatch for nested data, op {op} left {left_data_type}, right {right_data_type}",
+    );
+
+    apply(lhs, rhs, |l, r| {
+        Ok(Arc::new(compare_op_for_nested(op, l, r)?))
+    })
 }
 
 /// Compare with eq with either nested or non-nested
@@ -97,7 +127,7 @@ pub fn compare_with_eq(
     if is_nested {
         compare_op_for_nested(Operator::Eq, lhs, rhs)
     } else {
-        arrow::compute::kernels::cmp::eq(lhs, rhs).map_err(|e| arrow_datafusion_err!(e))
+        eq(lhs, rhs).map_err(|e| arrow_datafusion_err!(e))
     }
 }
 
@@ -112,9 +142,7 @@ pub fn compare_op_for_nested(
     let l_len = l.len();
     let r_len = r.len();
 
-    if l_len != r_len && !is_l_scalar && !is_r_scalar {
-        return internal_err!("len mismatch");
-    }
+    assert_or_internal_err!(l_len == r_len || is_l_scalar || is_r_scalar, "len mismatch");
 
     let len = match is_l_scalar {
         true => r_len,
@@ -143,9 +171,9 @@ pub fn compare_op_for_nested(
     };
 
     let values = match (is_l_scalar, is_r_scalar) {
-        (false, false) => (0..len).map(|i| cmp_with_op(i, i)).collect(),
-        (true, false) => (0..len).map(|i| cmp_with_op(0, i)).collect(),
-        (false, true) => (0..len).map(|i| cmp_with_op(i, 0)).collect(),
+        (false, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, i)),
+        (true, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(0, i)),
+        (false, true) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, 0)),
         (true, true) => std::iter::once(cmp_with_op(0, 0)).collect(),
     };
 
@@ -154,9 +182,26 @@ pub fn compare_op_for_nested(
     if matches!(op, Operator::IsDistinctFrom | Operator::IsNotDistinctFrom) {
         Ok(BooleanArray::new(values, None))
     } else {
-        // If one of the side is NULL, we returns NULL
+        // If one of the side is NULL, we return NULL
         // i.e. NULL eq NULL -> NULL
-        let nulls = NullBuffer::union(l.nulls(), r.nulls());
+        // For nested comparisons, we need to ensure the null buffer matches the result length
+        let nulls = match (is_l_scalar, is_r_scalar) {
+            (false, false) | (true, true) => NullBuffer::union(l.nulls(), r.nulls()),
+            (true, false) => {
+                // When left is null-scalar and right is array, expand left nulls to match result length
+                match l.nulls().filter(|nulls| nulls.is_null(0)) {
+                    Some(_) => Some(NullBuffer::new_null(len)), // Left scalar is null
+                    None => r.nulls().cloned(),                 // Left scalar is non-null
+                }
+            }
+            (false, true) => {
+                // When right is null-scalar and left is array, expand right nulls to match result length
+                match r.nulls().filter(|nulls| nulls.is_null(0)) {
+                    Some(_) => Some(NullBuffer::new_null(len)), // Right scalar is null
+                    None => l.nulls().cloned(), // Right scalar is non-null
+                }
+            }
+        };
         Ok(BooleanArray::new(values, nulls))
     }
 }
diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs
index 86d4487f4c126..b6eaacdca2505 100644
--- a/datafusion/physical-expr-common/src/lib.rs
+++ b/datafusion/physical-expr-common/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Physical Expr Common packages for [DataFusion]
 //! This package contains high level PhysicalExpr trait
@@ -32,6 +33,7 @@
 pub mod binary_map;
 pub mod binary_view_map;
 pub mod datum;
+pub mod metrics;
 pub mod physical_expr;
 pub mod sort_expr;
 pub mod tree_node;
diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-expr-common/src/metrics/baseline.rs
similarity index 70%
rename from datafusion/physical-plan/src/metrics/baseline.rs
rename to datafusion/physical-expr-common/src/metrics/baseline.rs
index a4a83b84b6555..0de8e26494931 100644
--- a/datafusion/physical-plan/src/metrics/baseline.rs
+++ b/datafusion/physical-expr-common/src/metrics/baseline.rs
@@ -20,16 +20,18 @@
 use std::task::Poll;
 
 use arrow::record_batch::RecordBatch;
+use datafusion_common::{Result, utils::memory::get_record_batch_memory_size};
 
 use super::{Count, ExecutionPlanMetricsSet, MetricBuilder, Time, Timestamp};
-use datafusion_common::Result;
 
 /// Helper for creating and tracking common "baseline" metrics for
 /// each operator
 ///
 /// Example:
 /// ```
-/// use datafusion_physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet};
+/// use datafusion_physical_expr_common::metrics::{
+///     BaselineMetrics, ExecutionPlanMetricsSet,
+/// };
 /// let metrics = ExecutionPlanMetricsSet::new();
 ///
 /// let partition = 2;
@@ -45,7 +47,7 @@ use datafusion_common::Result;
 /// ```
 #[derive(Debug, Clone)]
 pub struct BaselineMetrics {
-    /// end_time is set when `ExecutionMetrics::done()` is called
+    /// end_time is set when `BaselineMetrics::done()` is called
     end_time: Timestamp,
 
     /// amount of time the operator was actively trying to use the CPU
@@ -53,6 +55,19 @@ pub struct BaselineMetrics {
 
     /// output rows: the total output rows
     output_rows: Count,
+
+    /// Memory usage of all output batches.
+    ///
+    /// Note: This value may be overestimated. If multiple output `RecordBatch`
+    /// instances share underlying memory buffers, their sizes will be counted
+    /// multiple times.
+    /// Issue: <https://github.com/apache/datafusion/issues/16841>
+    output_bytes: Count,
+
+    /// output batches: the total output batch count
+    output_batches: Count,
+    // Remember to update `docs/source/user-guide/metrics.md` when updating comments
+    // or adding new metrics
 }
 
 impl BaselineMetrics {
@@ -62,9 +77,21 @@ impl BaselineMetrics {
         start_time.record();
 
         Self {
-            end_time: MetricBuilder::new(metrics).end_timestamp(partition),
-            elapsed_compute: MetricBuilder::new(metrics).elapsed_compute(partition),
-            output_rows: MetricBuilder::new(metrics).output_rows(partition),
+            end_time: MetricBuilder::new(metrics)
+                .with_type(super::MetricType::SUMMARY)
+                .end_timestamp(partition),
+            elapsed_compute: MetricBuilder::new(metrics)
+                .with_type(super::MetricType::SUMMARY)
+                .elapsed_compute(partition),
+            output_rows: MetricBuilder::new(metrics)
+                .with_type(super::MetricType::SUMMARY)
+                .output_rows(partition),
+            output_bytes: MetricBuilder::new(metrics)
+                .with_type(super::MetricType::SUMMARY)
+                .output_bytes(partition),
+            output_batches: MetricBuilder::new(metrics)
+                .with_type(super::MetricType::DEV)
+                .output_batches(partition),
         }
     }
 
@@ -78,6 +105,8 @@ impl BaselineMetrics {
             end_time: Default::default(),
             elapsed_compute: self.elapsed_compute.clone(),
             output_rows: Default::default(),
+            output_bytes: Default::default(),
+            output_batches: Default::default(),
         }
     }
 
@@ -91,6 +120,11 @@ impl BaselineMetrics {
         &self.output_rows
     }
 
+    /// return the metric for the total number of output batches produced
+    pub fn output_batches(&self) -> &Count {
+        &self.output_batches
+    }
+
     /// Records the fact that this operator's execution is complete
     /// (recording the `end_time` metric).
     ///
@@ -117,9 +151,10 @@ impl BaselineMetrics {
         }
     }
 
-    /// Process a poll result of a stream producing output for an
-    /// operator, recording the output rows and stream done time and
-    /// returning the same poll result
+    /// Process a poll result of a stream producing output for an operator.
+    ///
+    /// Note: this method only updates `output_rows` and `end_time` metrics.
+    /// Remember to update `elapsed_compute` and other metrics manually.
     pub fn record_poll(
         &self,
         poll: Poll<Option<Result<RecordBatch>>>,
@@ -150,7 +185,7 @@ pub struct SpillMetrics {
     /// count of spills during the execution of the operator
     pub spill_file_count: Count,
 
-    /// total spilled bytes during the execution of the operator
+    /// total bytes actually written to disk during the execution of the operator
     pub spilled_bytes: Count,
 
     /// total spilled rows during the execution of the operator
@@ -168,6 +203,23 @@ impl SpillMetrics {
     }
 }
 
+/// Metrics for tracking batch splitting activity
+#[derive(Debug, Clone)]
+pub struct SplitMetrics {
+    /// Number of times an input [`RecordBatch`] was split
+    pub batches_split: Count,
+}
+
+impl SplitMetrics {
+    /// Create a new [`SplitMetrics`]
+    pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
+        Self {
+            batches_split: MetricBuilder::new(metrics)
+                .counter("batches_split", partition),
+        }
+    }
+}
+
 /// Trait for things that produce output rows as a result of execution.
 pub trait RecordOutput {
     /// Record that some number of output rows have been produced
@@ -187,6 +239,9 @@ impl RecordOutput for usize {
 impl RecordOutput for RecordBatch {
     fn record_output(self, bm: &BaselineMetrics) -> Self {
         bm.record_output(self.num_rows());
+        let n_bytes = get_record_batch_memory_size(&self);
+        bm.output_bytes.add(n_bytes);
+        bm.output_batches.add(1);
         self
     }
 }
@@ -194,6 +249,9 @@ impl RecordOutput for RecordBatch {
 impl RecordOutput for &RecordBatch {
     fn record_output(self, bm: &BaselineMetrics) -> Self {
         bm.record_output(self.num_rows());
+        let n_bytes = get_record_batch_memory_size(self);
+        bm.output_bytes.add(n_bytes);
+        bm.output_batches.add(1);
         self
     }
 }
diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-expr-common/src/metrics/builder.rs
similarity index 69%
rename from datafusion/physical-plan/src/metrics/builder.rs
rename to datafusion/physical-expr-common/src/metrics/builder.rs
index dbda0a310ce52..4fa938f69ed36 100644
--- a/datafusion/physical-plan/src/metrics/builder.rs
+++ b/datafusion/physical-expr-common/src/metrics/builder.rs
@@ -19,6 +19,11 @@
 
 use std::{borrow::Cow, sync::Arc};
 
+use crate::metrics::{
+    MetricType,
+    value::{PruningMetrics, RatioMergeStrategy, RatioMetrics},
+};
+
 use super::{
     Count, ExecutionPlanMetricsSet, Gauge, Label, Metric, MetricValue, Time, Timestamp,
 };
@@ -29,19 +34,18 @@ use super::{
 /// case of constant strings
 ///
 /// ```rust
-///  use datafusion_physical_plan::metrics::*;
-///
-///  let metrics = ExecutionPlanMetricsSet::new();
-///  let partition = 1;
+/// use datafusion_physical_expr_common::metrics::*;
 ///
-///  // Create the standard output_rows metric
-///  let output_rows = MetricBuilder::new(&metrics).output_rows(partition);
+/// let metrics = ExecutionPlanMetricsSet::new();
+/// let partition = 1;
 ///
-///  // Create a operator specific counter with some labels
-///  let num_bytes = MetricBuilder::new(&metrics)
-///    .with_new_label("filename", "my_awesome_file.parquet")
-///    .counter("num_bytes", partition);
+/// // Create the standard output_rows metric
+/// let output_rows = MetricBuilder::new(&metrics).output_rows(partition);
 ///
+/// // Create a operator specific counter with some labels
+/// let num_bytes = MetricBuilder::new(&metrics)
+///     .with_new_label("filename", "my_awesome_file.parquet")
+///     .counter("num_bytes", partition);
 /// ```
 pub struct MetricBuilder<'a> {
     /// Location that the metric created by this builder will be added do
@@ -52,15 +56,23 @@ pub struct MetricBuilder<'a> {
 
     /// arbitrary name=value pairs identifying this metric
     labels: Vec<Label>,
+
+    /// The type controlling the verbosity/category for this builder
+    /// See comments in [`MetricType`] for details
+    metric_type: MetricType,
 }
 
 impl<'a> MetricBuilder<'a> {
     /// Create a new `MetricBuilder` that will register the result of `build()` with the `metrics`
+    ///
+    /// `self.metric_type` controls when such metric is displayed. See comments in
+    /// [`MetricType`] for details.
     pub fn new(metrics: &'a ExecutionPlanMetricsSet) -> Self {
         Self {
             metrics,
             partition: None,
             labels: vec![],
+            metric_type: MetricType::DEV,
         }
     }
 
@@ -70,6 +82,12 @@ impl<'a> MetricBuilder<'a> {
         self
     }
 
+    /// Set the metric type to the metric being constructed
+    pub fn with_type(mut self, metric_type: MetricType) -> Self {
+        self.metric_type = metric_type;
+        self
+    }
+
     /// Add a label to the metric being constructed
     pub fn with_new_label(
         self,
@@ -92,8 +110,11 @@ impl<'a> MetricBuilder<'a> {
             labels,
             partition,
             metrics,
+            metric_type,
         } = self;
-        let metric = Arc::new(Metric::new_with_labels(value, partition, labels));
+        let metric = Arc::new(
+            Metric::new_with_labels(value, partition, labels).with_type(metric_type),
+        );
         metrics.register(metric);
     }
 
@@ -132,6 +153,22 @@ impl<'a> MetricBuilder<'a> {
         count
     }
 
+    /// Consume self and create a new counter for recording total output bytes
+    pub fn output_bytes(self, partition: usize) -> Count {
+        let count = Count::new();
+        self.with_partition(partition)
+            .build(MetricValue::OutputBytes(count.clone()));
+        count
+    }
+
+    /// Consume self and create a new counter for recording total output batches
+    pub fn output_batches(self, partition: usize) -> Count {
+        let count = Count::new();
+        self.with_partition(partition)
+            .build(MetricValue::OutputBatches(count.clone()));
+        count
+    }
+
     /// Consume self and create a new gauge for reporting current memory usage
     pub fn mem_used(self, partition: usize) -> Gauge {
         let gauge = Gauge::new();
@@ -223,4 +260,44 @@ impl<'a> MetricBuilder<'a> {
             .build(MetricValue::EndTimestamp(timestamp.clone()));
         timestamp
     }
+
+    /// Consumes self and creates a new `PruningMetrics`
+    pub fn pruning_metrics(
+        self,
+        name: impl Into<Cow<'static, str>>,
+        partition: usize,
+    ) -> PruningMetrics {
+        let pruning_metrics = PruningMetrics::new();
+        self.with_partition(partition)
+            .build(MetricValue::PruningMetrics {
+                name: name.into(),
+                // inner values will be `Arc::clone()`
+                pruning_metrics: pruning_metrics.clone(),
+            });
+        pruning_metrics
+    }
+
+    /// Consumes self and creates a new [`RatioMetrics`]
+    pub fn ratio_metrics(
+        self,
+        name: impl Into<Cow<'static, str>>,
+        partition: usize,
+    ) -> RatioMetrics {
+        self.ratio_metrics_with_strategy(name, partition, RatioMergeStrategy::default())
+    }
+
+    /// Consumes self and creates a new [`RatioMetrics`] with a specific merge strategy
+    pub fn ratio_metrics_with_strategy(
+        self,
+        name: impl Into<Cow<'static, str>>,
+        partition: usize,
+        merge_strategy: RatioMergeStrategy,
+    ) -> RatioMetrics {
+        let ratio_metrics = RatioMetrics::new().with_merge_strategy(merge_strategy);
+        self.with_partition(partition).build(MetricValue::Ratio {
+            name: name.into(),
+            ratio_metrics: ratio_metrics.clone(),
+        });
+        ratio_metrics
+    }
 }
diff --git a/datafusion/physical-expr-common/src/metrics/custom.rs b/datafusion/physical-expr-common/src/metrics/custom.rs
new file mode 100644
index 0000000000000..0bd7ba1b10a25
--- /dev/null
+++ b/datafusion/physical-expr-common/src/metrics/custom.rs
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Custom metric value type.
+
+use std::{any::Any, fmt::Debug, fmt::Display, sync::Arc};
+
+/// A trait for implementing custom metric values.
+///
+/// This trait enables defining application- or operator-specific metric types
+/// that can be aggregated and displayed alongside standard metrics. These
+/// custom metrics integrate with [`MetricValue::Custom`] and support
+/// aggregation logic, introspection, and optional numeric representation.
+///
+/// # Requirements
+/// Implementations of `CustomMetricValue` must satisfy the following:
+///
+/// 1. [`Self::aggregate`]: Defines how two metric values are combined
+/// 2. [`Self::new_empty`]: Returns a new, zero-value instance for accumulation
+/// 3. [`Self::as_any`]: Enables dynamic downcasting for type-specific operations
+/// 4. [`Self::as_usize`]: Optionally maps the value to a `usize` (for sorting, display, etc.)
+/// 5. [`Self::is_eq`]: Implements comparison between two values, this isn't reusing the std
+///    PartialEq trait because this trait is used dynamically in the context of
+///    [`MetricValue::Custom`]
+///
+/// # Examples
+/// ```
+/// # use std::sync::Arc;
+/// # use std::fmt::{Debug, Display};
+/// # use std::any::Any;
+/// # use std::sync::atomic::{AtomicUsize, Ordering};
+///
+/// # use datafusion_physical_expr_common::metrics::CustomMetricValue;
+///
+/// #[derive(Debug, Default)]
+/// struct MyCounter {
+///     count: AtomicUsize,
+/// }
+///
+/// impl Display for MyCounter {
+///     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+///         write!(f, "count: {}", self.count.load(Ordering::Relaxed))
+///     }
+/// }
+///
+/// impl CustomMetricValue for MyCounter {
+///     fn new_empty(&self) -> Arc<dyn CustomMetricValue> {
+///         Arc::new(Self::default())
+///     }
+///
+///     fn aggregate(&self, other: Arc<dyn CustomMetricValue>) {
+///         let other = other.as_any().downcast_ref::<Self>().unwrap();
+///         self.count
+///             .fetch_add(other.count.load(Ordering::Relaxed), Ordering::Relaxed);
+///     }
+///
+///     fn as_any(&self) -> &dyn Any {
+///         self
+///     }
+///
+///     fn as_usize(&self) -> usize {
+///         self.count.load(Ordering::Relaxed)
+///     }
+///
+///     fn is_eq(&self, other: &Arc<dyn CustomMetricValue>) -> bool {
+///         let Some(other) = other.as_any().downcast_ref::<Self>() else {
+///             return false;
+///         };
+///
+///         self.count.load(Ordering::Relaxed) == other.count.load(Ordering::Relaxed)
+///     }
+/// }
+/// ```
+///
+/// [`MetricValue::Custom`]: super::MetricValue::Custom
+pub trait CustomMetricValue: Display + Debug + Send + Sync {
+    /// Returns a new, zero-initialized version of this metric value.
+    ///
+    /// This value is used during metric aggregation to accumulate results.
+    fn new_empty(&self) -> Arc<dyn CustomMetricValue>;
+
+    /// Merges another metric value into this one.
+    ///
+    /// The type of `other` could be of a different custom type as long as it's aggregatable into self.
+    fn aggregate(&self, other: Arc<dyn CustomMetricValue + 'static>);
+
+    /// Returns this value as a [`Any`] to support dynamic downcasting.
+    fn as_any(&self) -> &dyn Any;
+
+    /// Optionally returns a numeric representation of the value, if meaningful.
+    /// Otherwise will default to zero.
+    ///
+    /// This is used for sorting and summarizing metrics.
+    fn as_usize(&self) -> usize {
+        0
+    }
+
+    /// Compares this value with another custom value.
+    fn is_eq(&self, other: &Arc<dyn CustomMetricValue>) -> bool;
+}
diff --git a/datafusion/physical-expr-common/src/metrics/expression.rs b/datafusion/physical-expr-common/src/metrics/expression.rs
new file mode 100644
index 0000000000000..4a092b0d1b522
--- /dev/null
+++ b/datafusion/physical-expr-common/src/metrics/expression.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metrics helpers for expression evaluation.
+
+use super::{ExecutionPlanMetricsSet, MetricBuilder, MetricType, ScopedTimerGuard, Time};
+
+/// Tracks evaluation time for a sequence of expressions.
+///
+/// # Example
+/// Given SQL query:
+///     EXPLAIN ANALYZE
+///     SELECT a+1, pow(a,2)
+///     FROM generate_series(1, 1000000) as t1(a)
+///
+/// This struct holds two time metrics for the projection expressions
+/// `a+1` and `pow(a,2)`, respectively.
+///
+/// The output reads:
+/// `ProjectionExec: expr=[a@0 + 1 as t1.a + Int64(1), power(CAST(a@0 AS Float64), 2) as pow(t1.a,Int64(2))], metrics=[... expr_0_eval_time=9.23ms, expr_1_eval_time=32.35ms...]`
+#[derive(Debug, Clone)]
+pub struct ExpressionEvaluatorMetrics {
+    expression_times: Vec<Time>,
+}
+
+impl ExpressionEvaluatorMetrics {
+    /// Create metrics for a collection of expressions.
+    ///
+    /// # Args
+    /// - metrics: see `MetricBuilder` for details.
+    /// - partition: see `MetricBuilder` for details.
+    /// - expression_labels: unique identifier for each metric, so that the metric
+    ///   can get aggregated across multiple partitions. It is not the name showed
+    ///   in the `EXPLAIN ANALYZE`, the metric name will be `expr_{idx}_eval_time`
+    ///   according to the expression order.
+    pub fn new<T>(
+        metrics: &ExecutionPlanMetricsSet,
+        partition: usize,
+        expression_labels: impl IntoIterator<Item = T>,
+    ) -> Self
+    where
+        T: Into<String>,
+    {
+        let expression_times = expression_labels
+            .into_iter()
+            .enumerate()
+            .map(|(idx, label)| {
+                MetricBuilder::new(metrics)
+                    .with_new_label("expr", label.into())
+                    .with_type(MetricType::DEV)
+                    // Existing PhysicalExpr formatter is a bit verbose, so use simple name
+                    .subset_time(format!("expr_{idx}_eval_time"), partition)
+            })
+            .collect();
+
+        Self { expression_times }
+    }
+
+    /// Returns a timer guard for the expression at `index`, if present.
+    #[inline]
+    pub fn scoped_timer(&self, index: usize) -> Option<ScopedTimerGuard<'_>> {
+        self.expression_times.get(index).map(Time::timer)
+    }
+
+    /// The number of tracked expressions.
+    pub fn len(&self) -> usize {
+        self.expression_times.len()
+    }
+
+    /// True when no expressions are tracked.
+    pub fn is_empty(&self) -> bool {
+        self.expression_times.is_empty()
+    }
+}
diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-expr-common/src/metrics/mod.rs
similarity index 84%
rename from datafusion/physical-plan/src/metrics/mod.rs
rename to datafusion/physical-expr-common/src/metrics/mod.rs
index 2ac7ac1299a0a..18dafa41276d9 100644
--- a/datafusion/physical-plan/src/metrics/mod.rs
+++ b/datafusion/physical-expr-common/src/metrics/mod.rs
@@ -19,8 +19,11 @@
 
 mod baseline;
 mod builder;
+mod custom;
+mod expression;
 mod value;
 
+use datafusion_common::HashMap;
 use parking_lot::Mutex;
 use std::{
     borrow::Cow,
@@ -28,42 +31,42 @@ use std::{
     sync::Arc,
 };
 
-use datafusion_common::HashMap;
-
 // public exports
-pub use baseline::{BaselineMetrics, RecordOutput, SpillMetrics};
+
+pub use baseline::{BaselineMetrics, RecordOutput, SpillMetrics, SplitMetrics};
 pub use builder::MetricBuilder;
-pub use value::{Count, Gauge, MetricValue, ScopedTimerGuard, Time, Timestamp};
+pub use custom::CustomMetricValue;
+pub use expression::ExpressionEvaluatorMetrics;
+pub use value::{
+    Count, Gauge, MetricValue, PruningMetrics, RatioMergeStrategy, RatioMetrics,
+    ScopedTimerGuard, Time, Timestamp,
+};
 
-/// Something that tracks a value of interest (metric) of a DataFusion
-/// [`ExecutionPlan`] execution.
+/// Something that tracks a value of interest (metric) during execution.
 ///
 /// Typically [`Metric`]s are not created directly, but instead
 /// are created using [`MetricBuilder`] or methods on
 /// [`ExecutionPlanMetricsSet`].
 ///
 /// ```
-///  use datafusion_physical_plan::metrics::*;
+/// use datafusion_physical_expr_common::metrics::*;
 ///
-///  let metrics = ExecutionPlanMetricsSet::new();
-///  assert!(metrics.clone_inner().output_rows().is_none());
+/// let metrics = ExecutionPlanMetricsSet::new();
+/// assert!(metrics.clone_inner().output_rows().is_none());
 ///
-///  // Create a counter to increment using the MetricBuilder
-///  let partition = 1;
-///  let output_rows = MetricBuilder::new(&metrics)
-///      .output_rows(partition);
+/// // Create a counter to increment using the MetricBuilder
+/// let partition = 1;
+/// let output_rows = MetricBuilder::new(&metrics).output_rows(partition);
 ///
-///  // Counter can be incremented
-///  output_rows.add(13);
+/// // Counter can be incremented
+/// output_rows.add(13);
 ///
-///  // The value can be retrieved directly:
-///  assert_eq!(output_rows.value(), 13);
+/// // The value can be retrieved directly:
+/// assert_eq!(output_rows.value(), 13);
 ///
-///  // As well as from the metrics set
-///  assert_eq!(metrics.clone_inner().output_rows(), Some(13));
+/// // As well as from the metrics set
+/// assert_eq!(metrics.clone_inner().output_rows(), Some(13));
 /// ```
-///
-/// [`ExecutionPlan`]: super::ExecutionPlan
 
 #[derive(Debug)]
 pub struct Metric {
@@ -76,6 +79,29 @@ pub struct Metric {
     /// To which partition of an operators output did this metric
     /// apply? If `None` then means all partitions.
     partition: Option<usize>,
+
+    metric_type: MetricType,
+}
+
+/// Categorizes metrics so the display layer can choose the desired verbosity.
+///
+/// # How is it used:
+/// The `datafusion.explain.analyze_level` configuration controls which category is shown.
+/// - When set to `dev`, all metrics with type `MetricType::Summary` or `MetricType::DEV`
+///   will be shown.
+/// - When set to `summary`, only metrics with type `MetricType::Summary` are shown.
+///
+/// # Difference from `EXPLAIN ANALYZE VERBOSE`:  
+/// The `VERBOSE` keyword controls whether per-partition metrics are shown (when specified),  
+/// or aggregated metrics are displayed (when omitted).  
+/// In contrast, the `analyze_level` configuration determines which categories or
+/// levels of metrics are displayed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum MetricType {
+    /// Common metrics for high-level insights (answering which operator is slow)
+    SUMMARY,
+    /// For deep operator-level introspection for developers
+    DEV,
 }
 
 impl Display for Metric {
@@ -120,6 +146,7 @@ impl Metric {
             value,
             labels: vec![],
             partition,
+            metric_type: MetricType::DEV,
         }
     }
 
@@ -134,9 +161,16 @@ impl Metric {
             value,
             labels,
             partition,
+            metric_type: MetricType::DEV,
         }
     }
 
+    /// Set the type for this metric. Defaults to [`MetricType::DEV`]
+    pub fn with_type(mut self, metric_type: MetricType) -> Self {
+        self.metric_type = metric_type;
+        self
+    }
+
     /// Add a new label to this metric
     pub fn with_label(mut self, label: Label) -> Self {
         self.labels.push(label);
@@ -162,11 +196,14 @@ impl Metric {
     pub fn partition(&self) -> Option<usize> {
         self.partition
     }
+
+    /// Return the metric type (verbosity level) associated with this metric
+    pub fn metric_type(&self) -> MetricType {
+        self.metric_type
+    }
 }
 
-/// A snapshot of the metrics for a particular ([`ExecutionPlan`]).
-///
-/// [`ExecutionPlan`]: super::ExecutionPlan
+/// A snapshot of the metrics for a particular execution plan.
 #[derive(Default, Debug, Clone)]
 pub struct MetricsSet {
     metrics: Vec<Arc<Metric>>,
@@ -258,11 +295,16 @@ impl MetricsSet {
             MetricValue::ElapsedCompute(_) => false,
             MetricValue::SpillCount(_) => false,
             MetricValue::SpilledBytes(_) => false,
+            MetricValue::OutputBytes(_) => false,
+            MetricValue::OutputBatches(_) => false,
             MetricValue::SpilledRows(_) => false,
             MetricValue::CurrentMemoryUsage(_) => false,
             MetricValue::Gauge { name, .. } => name == metric_name,
             MetricValue::StartTimestamp(_) => false,
             MetricValue::EndTimestamp(_) => false,
+            MetricValue::PruningMetrics { name, .. } => name == metric_name,
+            MetricValue::Ratio { name, .. } => name == metric_name,
+            MetricValue::Custom { .. } => false,
         })
     }
 
@@ -283,7 +325,8 @@ impl MetricsSet {
                 .or_insert_with(|| {
                     // accumulate with no partition
                     let partition = None;
-                    let mut accum = Metric::new(metric.value().new_empty(), partition);
+                    let mut accum = Metric::new(metric.value().new_empty(), partition)
+                        .with_type(metric.metric_type());
                     accum.value_mut().aggregate(metric.value());
                     accum
                 });
@@ -321,6 +364,21 @@ impl MetricsSet {
 
         Self { metrics }
     }
+
+    /// Returns a new derived `MetricsSet` containing only metrics whose
+    /// [`MetricType`] appears in `allowed`.
+    pub fn filter_by_metric_types(self, allowed: &[MetricType]) -> Self {
+        if allowed.is_empty() {
+            return Self { metrics: vec![] };
+        }
+
+        let metrics = self
+            .metrics
+            .into_iter()
+            .filter(|metric| allowed.contains(&metric.metric_type()))
+            .collect::<Vec<_>>();
+        Self { metrics }
+    }
 }
 
 impl Display for MetricsSet {
@@ -340,17 +398,14 @@ impl Display for MetricsSet {
     }
 }
 
-/// A set of [`Metric`]s for an individual "operator" (e.g. `&dyn
-/// ExecutionPlan`).
+/// A set of [`Metric`]s for an individual operator.
 ///
-/// This structure is intended as a convenience for [`ExecutionPlan`]
+/// This structure is intended as a convenience for execution plan
 /// implementations so they can generate different streams for multiple
 /// partitions but easily report them together.
 ///
 /// Each `clone()` of this structure will add metrics to the same
 /// underlying metrics set
-///
-/// [`ExecutionPlan`]: super::ExecutionPlan
 #[derive(Default, Debug, Clone)]
 pub struct ExecutionPlanMetricsSet {
     inner: Arc<Mutex<MetricsSet>>,
@@ -384,7 +439,7 @@ impl ExecutionPlanMetricsSet {
 /// "tags" in
 /// [InfluxDB](https://docs.influxdata.com/influxdb/v1.8/write_protocols/line_protocol_tutorial/)
 /// , "attributes" in [open
-/// telemetry]<https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/datamodel.md>,
+/// telemetry]<https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md>,
 /// etc.
 ///
 /// As the name and value are expected to mostly be constant strings,
@@ -681,9 +736,15 @@ mod tests {
             n.join(", ")
         }
 
-        assert_eq!("end_timestamp, start_timestamp, elapsed_compute, the_second_counter, the_counter, the_third_counter, the_time, output_rows", metric_names(&metrics));
+        assert_eq!(
+            "end_timestamp, start_timestamp, elapsed_compute, the_second_counter, the_counter, the_third_counter, the_time, output_rows",
+            metric_names(&metrics)
+        );
 
         let metrics = metrics.sorted_for_display();
-        assert_eq!("output_rows, elapsed_compute, the_counter, the_second_counter, the_third_counter, the_time, start_timestamp, end_timestamp", metric_names(&metrics));
+        assert_eq!(
+            "output_rows, elapsed_compute, the_counter, the_second_counter, the_third_counter, the_time, start_timestamp, end_timestamp",
+            metric_names(&metrics)
+        );
     }
 }
diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs
new file mode 100644
index 0000000000000..d9e93aa361c12
--- /dev/null
+++ b/datafusion/physical-expr-common/src/metrics/value.rs
@@ -0,0 +1,1516 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Value representation of metrics
+
+use super::CustomMetricValue;
+use chrono::{DateTime, Utc};
+use datafusion_common::{
+    human_readable_count, human_readable_duration, human_readable_size, instant::Instant,
+};
+use parking_lot::Mutex;
+use std::{
+    borrow::{Borrow, Cow},
+    fmt::{Debug, Display},
+    sync::{
+        Arc,
+        atomic::{AtomicUsize, Ordering},
+    },
+    time::Duration,
+};
+
+/// A counter to record things such as number of input or output rows
+///
+/// Note `clone`ing counters update the same underlying metrics
+#[derive(Debug, Clone)]
+pub struct Count {
+    /// value of the metric counter
+    value: Arc<AtomicUsize>,
+}
+
+impl PartialEq for Count {
+    fn eq(&self, other: &Self) -> bool {
+        self.value().eq(&other.value())
+    }
+}
+
+impl Display for Count {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", human_readable_count(self.value()))
+    }
+}
+
+impl Default for Count {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Count {
+    /// create a new counter
+    pub fn new() -> Self {
+        Self {
+            value: Arc::new(AtomicUsize::new(0)),
+        }
+    }
+
+    /// Add `n` to the metric's value
+    pub fn add(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.value.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Get the current value
+    pub fn value(&self) -> usize {
+        self.value.load(Ordering::Relaxed)
+    }
+}
+
+/// A gauge is the simplest metrics type. It just returns a value.
+/// For example, you can easily expose current memory consumption with a gauge.
+///
+/// Note `clone`ing gauge update the same underlying metrics
+#[derive(Debug, Clone)]
+pub struct Gauge {
+    /// value of the metric gauge
+    value: Arc<AtomicUsize>,
+}
+
+impl PartialEq for Gauge {
+    fn eq(&self, other: &Self) -> bool {
+        self.value().eq(&other.value())
+    }
+}
+
+impl Display for Gauge {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", self.value())
+    }
+}
+
+impl Default for Gauge {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Gauge {
+    /// create a new gauge
+    pub fn new() -> Self {
+        Self {
+            value: Arc::new(AtomicUsize::new(0)),
+        }
+    }
+
+    /// Add `n` to the metric's value
+    pub fn add(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.value.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Sub `n` from the metric's value
+    pub fn sub(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.value.fetch_sub(n, Ordering::Relaxed);
+    }
+
+    /// Set metric's value to maximum of `n` and current value
+    pub fn set_max(&self, n: usize) {
+        self.value.fetch_max(n, Ordering::Relaxed);
+    }
+
+    /// Set the metric's value to `n` and return the previous value
+    pub fn set(&self, n: usize) -> usize {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.value.swap(n, Ordering::Relaxed)
+    }
+
+    /// Get the current value
+    pub fn value(&self) -> usize {
+        self.value.load(Ordering::Relaxed)
+    }
+}
+
+/// Measure a potentially non contiguous duration of time
+#[derive(Debug, Clone)]
+pub struct Time {
+    /// elapsed time, in nanoseconds
+    nanos: Arc<AtomicUsize>,
+}
+
+impl Default for Time {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PartialEq for Time {
+    fn eq(&self, other: &Self) -> bool {
+        self.value().eq(&other.value())
+    }
+}
+
+impl Display for Time {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", human_readable_duration(self.value() as u64))
+    }
+}
+
+impl Time {
+    /// Create a new [`Time`] wrapper suitable for recording elapsed
+    /// times for operations.
+    pub fn new() -> Self {
+        Self {
+            nanos: Arc::new(AtomicUsize::new(0)),
+        }
+    }
+
+    /// Add elapsed nanoseconds since `start`to self
+    pub fn add_elapsed(&self, start: Instant) {
+        self.add_duration(start.elapsed());
+    }
+
+    /// Add duration of time to self
+    ///
+    /// Note: this will always increment the recorded time by at least 1 nanosecond
+    /// to distinguish between the scenario of no values recorded, in which
+    /// case the value will be 0, and no measurable amount of time having passed,
+    /// in which case the value will be small but not 0.
+    ///
+    /// This is based on the assumption that the timing logic in most cases is likely
+    /// to take at least a nanosecond, and so this is reasonable mechanism to avoid
+    /// ambiguity, especially on systems with low-resolution monotonic clocks
+    pub fn add_duration(&self, duration: Duration) {
+        let more_nanos = duration.as_nanos() as usize;
+        self.nanos.fetch_add(more_nanos.max(1), Ordering::Relaxed);
+    }
+
+    /// Add the number of nanoseconds of other `Time` to self
+    pub fn add(&self, other: &Time) {
+        self.add_duration(Duration::from_nanos(other.value() as u64))
+    }
+
+    /// return a scoped guard that adds the amount of time elapsed
+    /// between its creation and its drop or call to `stop` to the
+    /// underlying metric.
+    pub fn timer(&self) -> ScopedTimerGuard<'_> {
+        ScopedTimerGuard {
+            inner: self,
+            start: Some(Instant::now()),
+        }
+    }
+
+    /// Get the number of nanoseconds record by this Time metric
+    pub fn value(&self) -> usize {
+        self.nanos.load(Ordering::Relaxed)
+    }
+
+    /// Return a scoped guard that adds the amount of time elapsed between the
+    /// given instant and its drop (or the call to `stop`) to the underlying metric
+    pub fn timer_with(&self, now: Instant) -> ScopedTimerGuard<'_> {
+        ScopedTimerGuard {
+            inner: self,
+            start: Some(now),
+        }
+    }
+}
+
+/// Stores a single timestamp, stored as the number of nanoseconds
+/// elapsed from Jan 1, 1970 UTC
+#[derive(Debug, Clone)]
+pub struct Timestamp {
+    /// Time thing started
+    timestamp: Arc<Mutex<Option<DateTime<Utc>>>>,
+}
+
+impl Default for Timestamp {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Timestamp {
+    /// Create a new timestamp and sets its value to 0
+    pub fn new() -> Self {
+        Self {
+            timestamp: Arc::new(Mutex::new(None)),
+        }
+    }
+
+    /// Sets the timestamps value to the current time
+    pub fn record(&self) {
+        self.set(Utc::now())
+    }
+
+    /// Sets the timestamps value to a specified time
+    pub fn set(&self, now: DateTime<Utc>) {
+        *self.timestamp.lock() = Some(now);
+    }
+
+    /// return the timestamps value at the last time `record()` was
+    /// called.
+    ///
+    /// Returns `None` if `record()` has not been called
+    pub fn value(&self) -> Option<DateTime<Utc>> {
+        *self.timestamp.lock()
+    }
+
+    /// sets the value of this timestamp to the minimum of this and other
+    pub fn update_to_min(&self, other: &Timestamp) {
+        let min = match (self.value(), other.value()) {
+            (None, None) => None,
+            (Some(v), None) => Some(v),
+            (None, Some(v)) => Some(v),
+            (Some(v1), Some(v2)) => Some(if v1 < v2 { v1 } else { v2 }),
+        };
+
+        *self.timestamp.lock() = min;
+    }
+
+    /// sets the value of this timestamp to the maximum of this and other
+    pub fn update_to_max(&self, other: &Timestamp) {
+        let max = match (self.value(), other.value()) {
+            (None, None) => None,
+            (Some(v), None) => Some(v),
+            (None, Some(v)) => Some(v),
+            (Some(v1), Some(v2)) => Some(if v1 < v2 { v2 } else { v1 }),
+        };
+
+        *self.timestamp.lock() = max;
+    }
+}
+
+impl PartialEq for Timestamp {
+    fn eq(&self, other: &Self) -> bool {
+        self.value().eq(&other.value())
+    }
+}
+
+impl Display for Timestamp {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self.value() {
+            None => write!(f, "NONE"),
+            Some(v) => {
+                write!(f, "{v}")
+            }
+        }
+    }
+}
+
+/// RAAI structure that adds all time between its construction and
+/// destruction to the CPU time or the first call to `stop` whichever
+/// comes first
+pub struct ScopedTimerGuard<'a> {
+    inner: &'a Time,
+    start: Option<Instant>,
+}
+
+impl ScopedTimerGuard<'_> {
+    /// Stop the timer timing and record the time taken
+    pub fn stop(&mut self) {
+        if let Some(start) = self.start.take() {
+            self.inner.add_elapsed(start)
+        }
+    }
+
+    /// Restarts the timer recording from the current time
+    pub fn restart(&mut self) {
+        self.start = Some(Instant::now())
+    }
+
+    /// Stop the timer, record the time taken and consume self
+    pub fn done(mut self) {
+        self.stop()
+    }
+
+    /// Stop the timer timing and record the time taken since the given endpoint.
+    pub fn stop_with(&mut self, end_time: Instant) {
+        if let Some(start) = self.start.take() {
+            let elapsed = end_time - start;
+            self.inner.add_duration(elapsed)
+        }
+    }
+
+    /// Stop the timer, record the time taken since `end_time` endpoint, and
+    /// consume self.
+    pub fn done_with(mut self, end_time: Instant) {
+        self.stop_with(end_time)
+    }
+}
+
+impl Drop for ScopedTimerGuard<'_> {
+    fn drop(&mut self) {
+        self.stop()
+    }
+}
+
+/// Counters tracking pruning metrics
+///
+/// For example, a file scanner initially is planned to scan 10 files, but skipped
+/// 8 of them using statistics, the pruning metrics would look like: 10 total -> 2 matched
+///
+/// Note `clone`ing update the same underlying metrics
+#[derive(Debug, Clone)]
+pub struct PruningMetrics {
+    pruned: Arc<AtomicUsize>,
+    matched: Arc<AtomicUsize>,
+    fully_matched: Arc<AtomicUsize>,
+}
+
+impl Display for PruningMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let matched = self.matched.load(Ordering::Relaxed);
+        let total = self.pruned.load(Ordering::Relaxed) + matched;
+        let fully_matched = self.fully_matched.load(Ordering::Relaxed);
+
+        if fully_matched != 0 {
+            write!(
+                f,
+                "{} total → {} matched -> {} fully matched",
+                human_readable_count(total),
+                human_readable_count(matched),
+                human_readable_count(fully_matched)
+            )
+        } else {
+            write!(
+                f,
+                "{} total → {} matched",
+                human_readable_count(total),
+                human_readable_count(matched)
+            )
+        }
+    }
+}
+
+impl Default for PruningMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PruningMetrics {
+    /// create a new PruningMetrics
+    pub fn new() -> Self {
+        Self {
+            pruned: Arc::new(AtomicUsize::new(0)),
+            matched: Arc::new(AtomicUsize::new(0)),
+            fully_matched: Arc::new(AtomicUsize::new(0)),
+        }
+    }
+
+    /// Add `n` to the metric's pruned value
+    pub fn add_pruned(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.pruned.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Add `n` to the metric's matched value
+    pub fn add_matched(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.matched.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Add `n` to the metric's fully matched value
+    pub fn add_fully_matched(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.fully_matched.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Subtract `n` to the metric's matched value.
+    pub fn subtract_matched(&self, n: usize) {
+        // relaxed ordering for operations on `value` poses no issues
+        // we're purely using atomic ops with no associated memory ops
+        self.matched.fetch_sub(n, Ordering::Relaxed);
+    }
+
+    /// Number of items pruned
+    pub fn pruned(&self) -> usize {
+        self.pruned.load(Ordering::Relaxed)
+    }
+
+    /// Number of items matched (not pruned)
+    pub fn matched(&self) -> usize {
+        self.matched.load(Ordering::Relaxed)
+    }
+
+    /// Number of items fully matched
+    pub fn fully_matched(&self) -> usize {
+        self.fully_matched.load(Ordering::Relaxed)
+    }
+}
+
+/// Counters tracking ratio metrics (e.g. matched vs total)
+///
+/// The counters are thread-safe and shared across clones.
+#[derive(Debug, Clone, Default)]
+pub struct RatioMetrics {
+    part: Arc<AtomicUsize>,
+    total: Arc<AtomicUsize>,
+    merge_strategy: RatioMergeStrategy,
+}
+
+#[derive(Debug, Clone, Default)]
+pub enum RatioMergeStrategy {
+    #[default]
+    AddPartAddTotal,
+    AddPartSetTotal,
+    SetPartAddTotal,
+}
+
+impl RatioMetrics {
+    /// Create a new [`RatioMetrics`]
+    pub fn new() -> Self {
+        Self {
+            part: Arc::new(AtomicUsize::new(0)),
+            total: Arc::new(AtomicUsize::new(0)),
+            merge_strategy: RatioMergeStrategy::AddPartAddTotal,
+        }
+    }
+
+    pub fn with_merge_strategy(mut self, merge_strategy: RatioMergeStrategy) -> Self {
+        self.merge_strategy = merge_strategy;
+        self
+    }
+
+    /// Add `n` to the numerator (`part`) value
+    pub fn add_part(&self, n: usize) {
+        self.part.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Add `n` to the denominator (`total`) value
+    pub fn add_total(&self, n: usize) {
+        self.total.fetch_add(n, Ordering::Relaxed);
+    }
+
+    /// Set the numerator (`part`) value to `n`, overwriting any existing value
+    pub fn set_part(&self, n: usize) {
+        self.part.store(n, Ordering::Relaxed);
+    }
+
+    /// Set the denominator (`total`) value to `n`, overwriting any existing value
+    pub fn set_total(&self, n: usize) {
+        self.total.store(n, Ordering::Relaxed);
+    }
+
+    /// Merge the value from `other` into `self`
+    pub fn merge(&self, other: &Self) {
+        match self.merge_strategy {
+            RatioMergeStrategy::AddPartAddTotal => {
+                self.add_part(other.part());
+                self.add_total(other.total());
+            }
+            RatioMergeStrategy::AddPartSetTotal => {
+                self.add_part(other.part());
+                self.set_total(other.total());
+            }
+            RatioMergeStrategy::SetPartAddTotal => {
+                self.set_part(other.part());
+                self.add_total(other.total());
+            }
+        }
+    }
+
+    /// Return the numerator (`part`) value
+    pub fn part(&self) -> usize {
+        self.part.load(Ordering::Relaxed)
+    }
+
+    /// Return the denominator (`total`) value
+    pub fn total(&self) -> usize {
+        self.total.load(Ordering::Relaxed)
+    }
+}
+
+impl PartialEq for RatioMetrics {
+    fn eq(&self, other: &Self) -> bool {
+        self.part() == other.part() && self.total() == other.total()
+    }
+}
+
+/// Format a float number with `digits` most significant numbers.
+///
+/// fmt_significant(12.5) -> "12"
+/// fmt_significant(0.0543) -> "0.054"
+/// fmt_significant(0.000123) -> "0.00012"
+fn fmt_significant(mut x: f64, digits: usize) -> String {
+    if x == 0.0 {
+        return "0".to_string();
+    }
+
+    let exp = x.abs().log10().floor(); // exponent of first significant digit
+    let scale = 10f64.powf(-(exp - (digits as f64 - 1.0)));
+    x = (x * scale).round() / scale; // round to N significant digits
+    format!("{x}")
+}
+
+impl Display for RatioMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let part = self.part();
+        let total = self.total();
+
+        if total == 0 {
+            if part == 0 {
+                write!(f, "N/A (0/0)")
+            } else {
+                write!(f, "N/A ({}/0)", human_readable_count(part))
+            }
+        } else {
+            let percentage = (part as f64 / total as f64) * 100.0;
+
+            write!(
+                f,
+                "{}% ({}/{})",
+                fmt_significant(percentage, 2),
+                human_readable_count(part),
+                human_readable_count(total)
+            )
+        }
+    }
+}
+
+/// Possible values for a [super::Metric].
+///
+/// Among other differences, the metric types have different ways to
+/// logically interpret their underlying values and some metrics are
+/// so common they are given special treatment.
+#[derive(Debug, Clone)]
+pub enum MetricValue {
+    /// Number of output rows produced: "output_rows" metric
+    OutputRows(Count),
+    /// Elapsed Compute Time: the wall clock time spent in "cpu
+    /// intensive" work.
+    ///
+    /// This measurement represents, roughly:
+    /// ```
+    /// use std::time::Instant;
+    /// let start = Instant::now();
+    /// // ...CPU intensive work here...
+    /// let elapsed_compute = (Instant::now() - start).as_nanos();
+    /// ```
+    ///
+    /// Note 1: Does *not* include time other operators spend
+    /// computing input.
+    ///
+    /// Note 2: *Does* includes time when the thread could have made
+    /// progress but the OS did not schedule it (e.g. due to CPU
+    /// contention), thus making this value different than the
+    /// classical definition of "cpu_time", which is the time reported
+    /// from `clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)`.
+    ElapsedCompute(Time),
+    /// Number of spills produced: "spill_count" metric
+    SpillCount(Count),
+    /// Total size of spilled bytes produced: "spilled_bytes" metric
+    SpilledBytes(Count),
+    /// Total size of output bytes produced: "output_bytes" metric
+    OutputBytes(Count),
+    /// Total number of output batches produced: "output_batches" metric
+    OutputBatches(Count),
+    /// Total size of spilled rows produced: "spilled_rows" metric
+    SpilledRows(Count),
+    /// Current memory used
+    CurrentMemoryUsage(Gauge),
+    /// Operator defined count.
+    Count {
+        /// The provided name of this metric
+        name: Cow<'static, str>,
+        /// The value of the metric
+        count: Count,
+    },
+    /// Operator defined gauge.
+    Gauge {
+        /// The provided name of this metric
+        name: Cow<'static, str>,
+        /// The value of the metric
+        gauge: Gauge,
+    },
+    /// Operator defined time
+    Time {
+        /// The provided name of this metric
+        name: Cow<'static, str>,
+        /// The value of the metric
+        time: Time,
+    },
+    /// The time at which execution started
+    StartTimestamp(Timestamp),
+    /// The time at which execution ended
+    EndTimestamp(Timestamp),
+    /// Metrics related to scan pruning
+    PruningMetrics {
+        name: Cow<'static, str>,
+        pruning_metrics: PruningMetrics,
+    },
+    /// Metrics that should be displayed as ratio like (42%)
+    Ratio {
+        name: Cow<'static, str>,
+        ratio_metrics: RatioMetrics,
+    },
+    Custom {
+        /// The provided name of this metric
+        name: Cow<'static, str>,
+        /// A custom implementation of the metric value.
+        value: Arc<dyn CustomMetricValue>,
+    },
+}
+
+// Manually implement PartialEq for `MetricValue` because it contains CustomMetricValue in its
+// definition which is a dyn trait. This wouldn't allow us to just derive PartialEq.
+impl PartialEq for MetricValue {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (MetricValue::OutputRows(count), MetricValue::OutputRows(other)) => {
+                count == other
+            }
+            (MetricValue::ElapsedCompute(time), MetricValue::ElapsedCompute(other)) => {
+                time == other
+            }
+            (MetricValue::SpillCount(count), MetricValue::SpillCount(other)) => {
+                count == other
+            }
+            (MetricValue::SpilledBytes(count), MetricValue::SpilledBytes(other)) => {
+                count == other
+            }
+            (MetricValue::OutputBytes(count), MetricValue::OutputBytes(other)) => {
+                count == other
+            }
+            (MetricValue::OutputBatches(count), MetricValue::OutputBatches(other)) => {
+                count == other
+            }
+            (MetricValue::SpilledRows(count), MetricValue::SpilledRows(other)) => {
+                count == other
+            }
+            (
+                MetricValue::CurrentMemoryUsage(gauge),
+                MetricValue::CurrentMemoryUsage(other),
+            ) => gauge == other,
+            (
+                MetricValue::Count { name, count },
+                MetricValue::Count {
+                    name: other_name,
+                    count: other_count,
+                },
+            ) => name == other_name && count == other_count,
+            (
+                MetricValue::Gauge { name, gauge },
+                MetricValue::Gauge {
+                    name: other_name,
+                    gauge: other_gauge,
+                },
+            ) => name == other_name && gauge == other_gauge,
+            (
+                MetricValue::Time { name, time },
+                MetricValue::Time {
+                    name: other_name,
+                    time: other_time,
+                },
+            ) => name == other_name && time == other_time,
+
+            (
+                MetricValue::StartTimestamp(timestamp),
+                MetricValue::StartTimestamp(other),
+            ) => timestamp == other,
+            (MetricValue::EndTimestamp(timestamp), MetricValue::EndTimestamp(other)) => {
+                timestamp == other
+            }
+            (
+                MetricValue::PruningMetrics {
+                    name,
+                    pruning_metrics,
+                },
+                MetricValue::PruningMetrics {
+                    name: other_name,
+                    pruning_metrics: other_pruning_metrics,
+                },
+            ) => {
+                name == other_name
+                    && pruning_metrics.pruned() == other_pruning_metrics.pruned()
+                    && pruning_metrics.matched() == other_pruning_metrics.matched()
+            }
+            (
+                MetricValue::Ratio {
+                    name,
+                    ratio_metrics,
+                },
+                MetricValue::Ratio {
+                    name: other_name,
+                    ratio_metrics: other_ratio_metrics,
+                },
+            ) => name == other_name && ratio_metrics == other_ratio_metrics,
+            (
+                MetricValue::Custom { name, value },
+                MetricValue::Custom {
+                    name: other_name,
+                    value: other_value,
+                },
+            ) => name == other_name && value.is_eq(other_value),
+            // Default case when the two sides do not have the same type.
+            _ => false,
+        }
+    }
+}
+
+impl MetricValue {
+    /// Return the name of this SQL metric
+    pub fn name(&self) -> &str {
+        match self {
+            Self::OutputRows(_) => "output_rows",
+            Self::SpillCount(_) => "spill_count",
+            Self::SpilledBytes(_) => "spilled_bytes",
+            Self::OutputBytes(_) => "output_bytes",
+            Self::OutputBatches(_) => "output_batches",
+            Self::SpilledRows(_) => "spilled_rows",
+            Self::CurrentMemoryUsage(_) => "mem_used",
+            Self::ElapsedCompute(_) => "elapsed_compute",
+            Self::Count { name, .. } => name.borrow(),
+            Self::Gauge { name, .. } => name.borrow(),
+            Self::Time { name, .. } => name.borrow(),
+            Self::StartTimestamp(_) => "start_timestamp",
+            Self::EndTimestamp(_) => "end_timestamp",
+            Self::PruningMetrics { name, .. } => name.borrow(),
+            Self::Ratio { name, .. } => name.borrow(),
+            Self::Custom { name, .. } => name.borrow(),
+        }
+    }
+
+    /// Return the value of the metric as a usize value, used to aggregate metric
+    /// value across partitions.
+    pub fn as_usize(&self) -> usize {
+        match self {
+            Self::OutputRows(count) => count.value(),
+            Self::SpillCount(count) => count.value(),
+            Self::SpilledBytes(bytes) => bytes.value(),
+            Self::OutputBytes(bytes) => bytes.value(),
+            Self::OutputBatches(count) => count.value(),
+            Self::SpilledRows(count) => count.value(),
+            Self::CurrentMemoryUsage(used) => used.value(),
+            Self::ElapsedCompute(time) => time.value(),
+            Self::Count { count, .. } => count.value(),
+            Self::Gauge { gauge, .. } => gauge.value(),
+            Self::Time { time, .. } => time.value(),
+            Self::StartTimestamp(timestamp) => timestamp
+                .value()
+                .and_then(|ts| ts.timestamp_nanos_opt())
+                .map(|nanos| nanos as usize)
+                .unwrap_or(0),
+            Self::EndTimestamp(timestamp) => timestamp
+                .value()
+                .and_then(|ts| ts.timestamp_nanos_opt())
+                .map(|nanos| nanos as usize)
+                .unwrap_or(0),
+            // This function is a utility for aggregating metrics, for complex metric
+            // like `PruningMetrics`, this function is not supposed to get called.
+            // Metrics aggregation for them are implemented inside `MetricsSet` directly.
+            Self::PruningMetrics { .. } => 0,
+            // Should not be used. See comments in `PruningMetrics` for details.
+            Self::Ratio { .. } => 0,
+            Self::Custom { value, .. } => value.as_usize(),
+        }
+    }
+
+    /// create a new MetricValue with the same type as `self` suitable
+    /// for accumulating
+    pub fn new_empty(&self) -> Self {
+        match self {
+            Self::OutputRows(_) => Self::OutputRows(Count::new()),
+            Self::SpillCount(_) => Self::SpillCount(Count::new()),
+            Self::SpilledBytes(_) => Self::SpilledBytes(Count::new()),
+            Self::OutputBytes(_) => Self::OutputBytes(Count::new()),
+            Self::OutputBatches(_) => Self::OutputBatches(Count::new()),
+            Self::SpilledRows(_) => Self::SpilledRows(Count::new()),
+            Self::CurrentMemoryUsage(_) => Self::CurrentMemoryUsage(Gauge::new()),
+            Self::ElapsedCompute(_) => Self::ElapsedCompute(Time::new()),
+            Self::Count { name, .. } => Self::Count {
+                name: name.clone(),
+                count: Count::new(),
+            },
+            Self::Gauge { name, .. } => Self::Gauge {
+                name: name.clone(),
+                gauge: Gauge::new(),
+            },
+            Self::Time { name, .. } => Self::Time {
+                name: name.clone(),
+                time: Time::new(),
+            },
+            Self::StartTimestamp(_) => Self::StartTimestamp(Timestamp::new()),
+            Self::EndTimestamp(_) => Self::EndTimestamp(Timestamp::new()),
+            Self::PruningMetrics { name, .. } => Self::PruningMetrics {
+                name: name.clone(),
+                pruning_metrics: PruningMetrics::new(),
+            },
+            Self::Ratio {
+                name,
+                ratio_metrics,
+            } => {
+                let merge_strategy = ratio_metrics.merge_strategy.clone();
+                Self::Ratio {
+                    name: name.clone(),
+                    ratio_metrics: RatioMetrics::new()
+                        .with_merge_strategy(merge_strategy),
+                }
+            }
+            Self::Custom { name, value } => Self::Custom {
+                name: name.clone(),
+                value: value.new_empty(),
+            },
+        }
+    }
+
+    /// Aggregates the value of other to `self`. panic's if the types
+    /// are mismatched or aggregating does not make sense for this
+    /// value
+    ///
+    /// Note this is purposely marked `mut` (even though atomics are
+    /// used) so Rust's type system can be used to ensure the
+    /// appropriate API access. `MetricValues` should be modified
+    /// using the original [`Count`] or [`Time`] they were created
+    /// from.
+    pub fn aggregate(&mut self, other: &Self) {
+        match (self, other) {
+            (Self::OutputRows(count), Self::OutputRows(other_count))
+            | (Self::SpillCount(count), Self::SpillCount(other_count))
+            | (Self::SpilledBytes(count), Self::SpilledBytes(other_count))
+            | (Self::OutputBytes(count), Self::OutputBytes(other_count))
+            | (Self::OutputBatches(count), Self::OutputBatches(other_count))
+            | (Self::SpilledRows(count), Self::SpilledRows(other_count))
+            | (
+                Self::Count { count, .. },
+                Self::Count {
+                    count: other_count, ..
+                },
+            ) => count.add(other_count.value()),
+            (Self::CurrentMemoryUsage(gauge), Self::CurrentMemoryUsage(other_gauge))
+            | (
+                Self::Gauge { gauge, .. },
+                Self::Gauge {
+                    gauge: other_gauge, ..
+                },
+            ) => gauge.add(other_gauge.value()),
+            (Self::ElapsedCompute(time), Self::ElapsedCompute(other_time))
+            | (
+                Self::Time { time, .. },
+                Self::Time {
+                    time: other_time, ..
+                },
+            ) => time.add(other_time),
+            // timestamps are aggregated by min/max
+            (Self::StartTimestamp(timestamp), Self::StartTimestamp(other_timestamp)) => {
+                timestamp.update_to_min(other_timestamp);
+            }
+            // timestamps are aggregated by min/max
+            (Self::EndTimestamp(timestamp), Self::EndTimestamp(other_timestamp)) => {
+                timestamp.update_to_max(other_timestamp);
+            }
+            (
+                Self::PruningMetrics {
+                    pruning_metrics, ..
+                },
+                Self::PruningMetrics {
+                    pruning_metrics: other_pruning_metrics,
+                    ..
+                },
+            ) => {
+                let pruned = other_pruning_metrics.pruned.load(Ordering::Relaxed);
+                let matched = other_pruning_metrics.matched.load(Ordering::Relaxed);
+                let fully_matched =
+                    other_pruning_metrics.fully_matched.load(Ordering::Relaxed);
+                pruning_metrics.add_pruned(pruned);
+                pruning_metrics.add_matched(matched);
+                pruning_metrics.add_fully_matched(fully_matched);
+            }
+            (
+                Self::Ratio { ratio_metrics, .. },
+                Self::Ratio {
+                    ratio_metrics: other_ratio_metrics,
+                    ..
+                },
+            ) => {
+                ratio_metrics.merge(other_ratio_metrics);
+            }
+            (
+                Self::Custom { value, .. },
+                Self::Custom {
+                    value: other_value, ..
+                },
+            ) => {
+                value.aggregate(Arc::clone(other_value));
+            }
+            m @ (_, _) => {
+                panic!(
+                    "Mismatched metric types. Can not aggregate {:?} with value {:?}",
+                    m.0, m.1
+                )
+            }
+        }
+    }
+
+    /// Returns a number by which to sort metrics by display. Lower
+    /// numbers are "more useful" (and displayed first)
+    pub fn display_sort_key(&self) -> u8 {
+        match self {
+            // `BaselineMetrics` that is common for most operators
+            Self::OutputRows(_) => 0,
+            Self::ElapsedCompute(_) => 1,
+            Self::OutputBytes(_) => 2,
+            Self::OutputBatches(_) => 3,
+            // Other metrics
+            Self::PruningMetrics { name, .. } => match name.as_ref() {
+                // The following metrics belong to `DataSourceExec` with a Parquet data source.
+                // They are displayed in a specific order that reflects the actual pruning process,
+                // from coarse-grained to fine-grained pruning levels.
+                //
+                // You may update these metrics as long as their relative order remains unchanged.
+                //
+                // Reference PR: <https://github.com/apache/datafusion/pull/18379>
+                "files_ranges_pruned_statistics" => 4,
+                "row_groups_pruned_statistics" => 5,
+                "row_groups_pruned_bloom_filter" => 6,
+                "page_index_pages_pruned" => 7,
+                "page_index_rows_pruned" => 8,
+                _ => 9,
+            },
+            Self::SpillCount(_) => 10,
+            Self::SpilledBytes(_) => 11,
+            Self::SpilledRows(_) => 12,
+            Self::CurrentMemoryUsage(_) => 13,
+            Self::Count { .. } => 14,
+            Self::Gauge { .. } => 15,
+            Self::Time { .. } => 16,
+            Self::Ratio { .. } => 17,
+            Self::StartTimestamp(_) => 18, // show timestamps last
+            Self::EndTimestamp(_) => 19,
+            Self::Custom { .. } => 20,
+        }
+    }
+
+    /// returns true if this metric has a timestamp value
+    pub fn is_timestamp(&self) -> bool {
+        matches!(self, Self::StartTimestamp(_) | Self::EndTimestamp(_))
+    }
+}
+
+impl Display for MetricValue {
+    /// Prints the value of this metric
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::OutputRows(count)
+            | Self::OutputBatches(count)
+            | Self::SpillCount(count)
+            | Self::SpilledRows(count)
+            | Self::Count { count, .. } => {
+                write!(f, "{count}")
+            }
+            Self::SpilledBytes(count) | Self::OutputBytes(count) => {
+                let readable_count = human_readable_size(count.value());
+                write!(f, "{readable_count}")
+            }
+            Self::CurrentMemoryUsage(gauge) => {
+                // CurrentMemoryUsage is in bytes, format like SpilledBytes
+                let readable_size = human_readable_size(gauge.value());
+                write!(f, "{readable_size}")
+            }
+            Self::Gauge { gauge, .. } => {
+                // Generic gauge metrics - format with human-readable count
+                write!(f, "{}", human_readable_count(gauge.value()))
+            }
+            Self::ElapsedCompute(time) | Self::Time { time, .. } => {
+                // distinguish between no time recorded and very small
+                // amount of time recorded
+                if time.value() > 0 {
+                    write!(f, "{time}")
+                } else {
+                    write!(f, "NOT RECORDED")
+                }
+            }
+            Self::StartTimestamp(timestamp) | Self::EndTimestamp(timestamp) => {
+                write!(f, "{timestamp}")
+            }
+            Self::PruningMetrics {
+                pruning_metrics, ..
+            } => {
+                write!(f, "{pruning_metrics}")
+            }
+            Self::Ratio { ratio_metrics, .. } => write!(f, "{ratio_metrics}"),
+            Self::Custom { value, .. } => {
+                write!(f, "{value}")
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::any::Any;
+
+    use chrono::TimeZone;
+    use datafusion_common::units::MB;
+
+    use super::*;
+
+    #[derive(Debug, Default)]
+    pub struct CustomCounter {
+        count: AtomicUsize,
+    }
+
+    impl Display for CustomCounter {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "count: {}", self.count.load(Ordering::Relaxed))
+        }
+    }
+
+    impl CustomMetricValue for CustomCounter {
+        fn new_empty(&self) -> Arc<dyn CustomMetricValue> {
+            Arc::new(CustomCounter::default())
+        }
+
+        fn aggregate(&self, other: Arc<dyn CustomMetricValue + 'static>) {
+            let other = other.as_any().downcast_ref::<Self>().unwrap();
+            self.count
+                .fetch_add(other.count.load(Ordering::Relaxed), Ordering::Relaxed);
+        }
+
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn is_eq(&self, other: &Arc<dyn CustomMetricValue>) -> bool {
+            let Some(other) = other.as_any().downcast_ref::<Self>() else {
+                return false;
+            };
+
+            self.count.load(Ordering::Relaxed) == other.count.load(Ordering::Relaxed)
+        }
+    }
+
+    fn new_custom_counter(name: &'static str, value: usize) -> MetricValue {
+        let custom_counter = CustomCounter::default();
+        custom_counter.count.fetch_add(value, Ordering::Relaxed);
+
+        MetricValue::Custom {
+            name: Cow::Borrowed(name),
+            value: Arc::new(custom_counter),
+        }
+    }
+
+    #[test]
+    fn test_custom_metric_with_mismatching_names() {
+        let mut custom_val = new_custom_counter("Hi", 1);
+        let other_custom_val = new_custom_counter("Hello", 1);
+
+        // Not equal since the name differs.
+        assert!(other_custom_val != custom_val);
+
+        // Should work even though the name differs
+        custom_val.aggregate(&other_custom_val);
+
+        let expected_val = new_custom_counter("Hi", 2);
+        assert!(expected_val == custom_val);
+    }
+
+    #[test]
+    fn test_custom_metric() {
+        let mut custom_val = new_custom_counter("hi", 11);
+        let other_custom_val = new_custom_counter("hi", 20);
+
+        custom_val.aggregate(&other_custom_val);
+
+        assert!(custom_val != other_custom_val);
+
+        if let MetricValue::Custom { value, .. } = custom_val {
+            let counter = value
+                .as_any()
+                .downcast_ref::<CustomCounter>()
+                .expect("Expected CustomCounter");
+            assert_eq!(counter.count.load(Ordering::Relaxed), 31);
+        } else {
+            panic!("Unexpected value");
+        }
+    }
+
+    #[test]
+    fn test_display_custom_metric() {
+        let custom_val = new_custom_counter("hi", 11);
+        assert_eq!(custom_val.to_string(), "count: 11");
+    }
+
+    #[test]
+    fn test_display_output_rows() {
+        let count = Count::new();
+        let values = vec![
+            MetricValue::OutputRows(count.clone()),
+            MetricValue::Count {
+                name: "my_counter".into(),
+                count: count.clone(),
+            },
+        ];
+
+        for value in &values {
+            assert_eq!("0", value.to_string(), "value {value:?}");
+        }
+
+        count.add(42);
+        for value in &values {
+            assert_eq!("42", value.to_string(), "value {value:?}");
+        }
+    }
+
+    #[test]
+    fn test_display_spilled_bytes() {
+        let count = Count::new();
+        let spilled_byte = MetricValue::SpilledBytes(count.clone());
+
+        assert_eq!("0.0 B", spilled_byte.to_string());
+
+        count.add((100 * MB) as usize);
+        assert_eq!("100.0 MB", spilled_byte.to_string());
+
+        count.add((0.5 * MB as f64) as usize);
+        assert_eq!("100.5 MB", spilled_byte.to_string());
+    }
+
+    #[test]
+    fn test_display_time() {
+        let time = Time::new();
+        let values = vec![
+            MetricValue::ElapsedCompute(time.clone()),
+            MetricValue::Time {
+                name: "my_time".into(),
+                time: time.clone(),
+            },
+        ];
+
+        // if time is not set, it should not be reported as zero
+        for value in &values {
+            assert_eq!("NOT RECORDED", value.to_string(), "value {value:?}");
+        }
+
+        time.add_duration(Duration::from_nanos(1042));
+        for value in &values {
+            assert_eq!("1.04µs", value.to_string(), "value {value:?}");
+        }
+    }
+
+    #[test]
+    fn test_display_ratio() {
+        let ratio_metrics = RatioMetrics::new();
+        let ratio = MetricValue::Ratio {
+            name: Cow::Borrowed("ratio_metric"),
+            ratio_metrics: ratio_metrics.clone(),
+        };
+
+        assert_eq!("N/A (0/0)", ratio.to_string());
+
+        ratio_metrics.add_part(10);
+        assert_eq!("N/A (10/0)", ratio.to_string());
+
+        ratio_metrics.add_total(40);
+        assert_eq!("25% (10/40)", ratio.to_string());
+
+        let tiny_ratio_metrics = RatioMetrics::new();
+        let tiny_ratio = MetricValue::Ratio {
+            name: Cow::Borrowed("tiny_ratio_metric"),
+            ratio_metrics: tiny_ratio_metrics.clone(),
+        };
+        tiny_ratio_metrics.add_part(1);
+        tiny_ratio_metrics.add_total(3000);
+        assert_eq!("0.033% (1/3.00 K)", tiny_ratio.to_string());
+    }
+
+    #[test]
+    fn test_ratio_set_methods() {
+        let ratio_metrics = RatioMetrics::new();
+
+        // Ensure set methods don't increment
+        ratio_metrics.set_part(10);
+        ratio_metrics.set_part(10);
+        ratio_metrics.set_total(40);
+        ratio_metrics.set_total(40);
+        assert_eq!("25% (10/40)", ratio_metrics.to_string());
+
+        let ratio_metrics = RatioMetrics::new();
+
+        // Calling set should change the value
+        ratio_metrics.set_part(10);
+        ratio_metrics.set_part(30);
+        ratio_metrics.set_total(40);
+        ratio_metrics.set_total(50);
+        assert_eq!("60% (30/50)", ratio_metrics.to_string());
+    }
+
+    #[test]
+    fn test_ratio_merge_strategy() {
+        // Test AddPartSetTotal strategy
+        let ratio_metrics1 =
+            RatioMetrics::new().with_merge_strategy(RatioMergeStrategy::AddPartSetTotal);
+
+        ratio_metrics1.set_part(10);
+        ratio_metrics1.set_total(40);
+        assert_eq!("25% (10/40)", ratio_metrics1.to_string());
+        let ratio_metrics2 =
+            RatioMetrics::new().with_merge_strategy(RatioMergeStrategy::AddPartSetTotal);
+        ratio_metrics2.set_part(20);
+        ratio_metrics2.set_total(40);
+        assert_eq!("50% (20/40)", ratio_metrics2.to_string());
+
+        ratio_metrics1.merge(&ratio_metrics2);
+        assert_eq!("75% (30/40)", ratio_metrics1.to_string());
+
+        // Test SetPartAddTotal strategy
+        let ratio_metrics1 =
+            RatioMetrics::new().with_merge_strategy(RatioMergeStrategy::SetPartAddTotal);
+        ratio_metrics1.set_part(20);
+        ratio_metrics1.set_total(50);
+        let ratio_metrics2 = RatioMetrics::new();
+        ratio_metrics2.set_part(20);
+        ratio_metrics2.set_total(50);
+        ratio_metrics1.merge(&ratio_metrics2);
+        assert_eq!("20% (20/100)", ratio_metrics1.to_string());
+
+        // Test AddPartAddTotal strategy (default)
+        let ratio_metrics1 = RatioMetrics::new();
+        ratio_metrics1.set_part(20);
+        ratio_metrics1.set_total(50);
+        let ratio_metrics2 = RatioMetrics::new();
+        ratio_metrics2.set_part(20);
+        ratio_metrics2.set_total(50);
+        ratio_metrics1.merge(&ratio_metrics2);
+        assert_eq!("40% (40/100)", ratio_metrics1.to_string());
+    }
+
+    #[test]
+    fn test_display_timestamp() {
+        let timestamp = Timestamp::new();
+        let values = vec![
+            MetricValue::StartTimestamp(timestamp.clone()),
+            MetricValue::EndTimestamp(timestamp.clone()),
+        ];
+
+        // if time is not set, it should not be reported as zero
+        for value in &values {
+            assert_eq!("NONE", value.to_string(), "value {value:?}");
+        }
+
+        timestamp.set(Utc.timestamp_nanos(1431648000000000));
+        for value in &values {
+            assert_eq!(
+                "1970-01-17 13:40:48 UTC",
+                value.to_string(),
+                "value {value:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_timer_with_custom_instant() {
+        let time = Time::new();
+        let start_time = Instant::now();
+
+        // Sleep a bit to ensure some time passes
+        std::thread::sleep(Duration::from_millis(1));
+
+        // Create timer with the earlier start time
+        let mut timer = time.timer_with(start_time);
+
+        // Sleep a bit more
+        std::thread::sleep(Duration::from_millis(1));
+
+        // Stop the timer
+        timer.stop();
+
+        // The recorded time should be at least 20ms (both sleeps)
+        assert!(
+            time.value() >= 2_000_000,
+            "Expected at least 2ms, got {} ns",
+            time.value()
+        );
+    }
+
+    #[test]
+    fn test_stop_with_custom_endpoint() {
+        let time = Time::new();
+        let start = Instant::now();
+        let mut timer = time.timer_with(start);
+
+        // Simulate exactly 10ms passing
+        let end = start + Duration::from_millis(10);
+
+        // Stop with custom endpoint
+        timer.stop_with(end);
+
+        // Should record exactly 10ms (10_000_000 nanoseconds)
+        // Allow for small variations due to timer resolution
+        let recorded = time.value();
+        assert!(
+            (10_000_000..=10_100_000).contains(&recorded),
+            "Expected ~10ms, got {recorded} ns"
+        );
+
+        // Calling stop_with again should not add more time
+        timer.stop_with(end);
+        assert_eq!(
+            recorded,
+            time.value(),
+            "Time should not change after second stop"
+        );
+    }
+
+    #[test]
+    fn test_done_with_custom_endpoint() {
+        let time = Time::new();
+        let start = Instant::now();
+
+        // Create a new scope for the timer
+        {
+            let timer = time.timer_with(start);
+
+            // Simulate 50ms passing
+            let end = start + Duration::from_millis(5);
+
+            // Call done_with to stop and consume the timer
+            timer.done_with(end);
+
+            // Timer is consumed, can't use it anymore
+        }
+
+        // Should record exactly 5ms
+        let recorded = time.value();
+        assert!(
+            (5_000_000..=5_100_000).contains(&recorded),
+            "Expected ~5ms, got {recorded} ns",
+        );
+
+        // Test that done_with prevents drop from recording time again
+        {
+            let timer2 = time.timer_with(start);
+            let end2 = start + Duration::from_millis(5);
+            timer2.done_with(end2);
+            // drop happens here but should not record additional time
+        }
+
+        // Should have added only 5ms more
+        let new_recorded = time.value();
+        assert!(
+            (10_000_000..=10_100_000).contains(&new_recorded),
+            "Expected ~10ms total, got {new_recorded} ns",
+        );
+    }
+
+    #[test]
+    fn test_human_readable_metric_formatting() {
+        // Test Count formatting with various sizes
+        let small_count = Count::new();
+        small_count.add(42);
+        assert_eq!(
+            MetricValue::OutputRows(small_count.clone()).to_string(),
+            "42"
+        );
+
+        let thousand_count = Count::new();
+        thousand_count.add(10_100);
+        assert_eq!(
+            MetricValue::OutputRows(thousand_count.clone()).to_string(),
+            "10.10 K"
+        );
+
+        let million_count = Count::new();
+        million_count.add(1_532_000);
+        assert_eq!(
+            MetricValue::SpilledRows(million_count.clone()).to_string(),
+            "1.53 M"
+        );
+
+        let billion_count = Count::new();
+        billion_count.add(2_500_000_000);
+        assert_eq!(
+            MetricValue::OutputBatches(billion_count.clone()).to_string(),
+            "2.50 B"
+        );
+
+        // Test Time formatting with various durations
+        let micros_time = Time::new();
+        micros_time.add_duration(Duration::from_nanos(1_234));
+        assert_eq!(
+            MetricValue::ElapsedCompute(micros_time.clone()).to_string(),
+            "1.23µs"
+        );
+
+        let millis_time = Time::new();
+        millis_time.add_duration(Duration::from_nanos(11_295_377));
+        assert_eq!(
+            MetricValue::ElapsedCompute(millis_time.clone()).to_string(),
+            "11.30ms"
+        );
+
+        let seconds_time = Time::new();
+        seconds_time.add_duration(Duration::from_nanos(1_234_567_890));
+        assert_eq!(
+            MetricValue::ElapsedCompute(seconds_time.clone()).to_string(),
+            "1.23s"
+        );
+
+        // Test CurrentMemoryUsage formatting (should use size, not count)
+        let mem_gauge = Gauge::new();
+        mem_gauge.add(100 * MB as usize);
+        assert_eq!(
+            MetricValue::CurrentMemoryUsage(mem_gauge.clone()).to_string(),
+            "100.0 MB"
+        );
+
+        // Test custom Gauge formatting (should use count)
+        let custom_gauge = Gauge::new();
+        custom_gauge.add(50_000);
+        assert_eq!(
+            MetricValue::Gauge {
+                name: "custom".into(),
+                gauge: custom_gauge.clone()
+            }
+            .to_string(),
+            "50.00 K"
+        );
+
+        // Test PruningMetrics formatting
+        let pruning = PruningMetrics::new();
+        pruning.add_matched(500_000);
+        pruning.add_pruned(500_000);
+        assert_eq!(
+            MetricValue::PruningMetrics {
+                name: "test_pruning".into(),
+                pruning_metrics: pruning.clone()
+            }
+            .to_string(),
+            "1.00 M total → 500.0 K matched"
+        );
+
+        // Test RatioMetrics formatting
+        let ratio = RatioMetrics::new();
+        ratio.add_part(250_000);
+        ratio.add_total(1_000_000);
+        assert_eq!(
+            MetricValue::Ratio {
+                name: "test_ratio".into(),
+                ratio_metrics: ratio.clone()
+            }
+            .to_string(),
+            "25% (250.0 K/1.00 M)"
+        );
+    }
+}
diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs
index 7be132fa61238..7107b0a9004d3 100644
--- a/datafusion/physical-expr-common/src/physical_expr.rs
+++ b/datafusion/physical-expr-common/src/physical_expr.rs
@@ -23,14 +23,19 @@ use std::sync::Arc;
 
 use crate::utils::scatter;
 
-use arrow::array::BooleanArray;
+use arrow::array::{ArrayRef, BooleanArray, new_empty_array};
 use arrow::compute::filter_record_batch;
 use arrow::datatypes::{DataType, Field, FieldRef, Schema};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
+use datafusion_common::{
+    Result, ScalarValue, assert_eq_or_internal_err, exec_err, not_impl_err,
+};
 use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_expr_common::interval_arithmetic::Interval;
+use datafusion_expr_common::placement::ExpressionPlacement;
 use datafusion_expr_common::sort_properties::ExprProperties;
 use datafusion_expr_common::statistics::Distribution;
 
@@ -66,7 +71,7 @@ pub type PhysicalExprRef = Arc<dyn PhysicalExpr>;
 /// [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html
 /// [`create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html
 /// [`Column`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/expressions/struct.Column.html
-pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
+pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash {
     /// Returns the physical expression as [`Any`] so that it can be
     /// downcast to a specific implementation.
     fn as_any(&self) -> &dyn Any;
@@ -88,24 +93,72 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
             self.nullable(input_schema)?,
         )))
     }
-    /// Evaluate an expression against a RecordBatch after first applying a
-    /// validity array
+    /// Evaluate an expression against a RecordBatch after first applying a validity array
+    ///
+    /// # Errors
+    ///
+    /// Returns an `Err` if the expression could not be evaluated or if the length of the
+    /// `selection` validity array and the number of row in `batch` is not equal.
     fn evaluate_selection(
         &self,
         batch: &RecordBatch,
         selection: &BooleanArray,
     ) -> Result<ColumnarValue> {
-        let tmp_batch = filter_record_batch(batch, selection)?;
+        let row_count = batch.num_rows();
+        if row_count != selection.len() {
+            return exec_err!(
+                "Selection array length does not match batch row count: {} != {row_count}",
+                selection.len()
+            );
+        }
 
-        let tmp_result = self.evaluate(&tmp_batch)?;
+        let selection_count = selection.true_count();
 
-        if batch.num_rows() == tmp_batch.num_rows() {
-            // All values from the `selection` filter are true.
-            Ok(tmp_result)
-        } else if let ColumnarValue::Array(a) = tmp_result {
-            scatter(selection, a.as_ref()).map(ColumnarValue::Array)
+        // First, check if we can avoid filtering altogether.
+        if selection_count == row_count {
+            // All values from the `selection` filter are true and match the input batch.
+            // No need to perform any filtering.
+            return self.evaluate(batch);
+        }
+
+        // Next, prepare the result array for each 'true' row in the selection vector.
+        let filtered_result = if selection_count == 0 {
+            // Do not call `evaluate` when the selection is empty.
+            // `evaluate_selection` is used to conditionally evaluate expressions.
+            // When the expression in question is fallible, evaluating it with an empty
+            // record batch may trigger a runtime error (e.g. division by zero).
+            //
+            // Instead, create an empty array matching the expected return type.
+            let datatype = self.data_type(batch.schema_ref().as_ref())?;
+            ColumnarValue::Array(new_empty_array(&datatype))
         } else {
-            Ok(tmp_result)
+            // If we reach this point, there's no other option than to filter the batch.
+            // This is a fairly costly operation since it requires creating partial copies
+            // (worst case of length `row_count - 1`) of all the arrays in the record batch.
+            // The resulting `filtered_batch` will contain `selection_count` rows.
+            let filtered_batch = filter_record_batch(batch, selection)?;
+            self.evaluate(&filtered_batch)?
+        };
+
+        // Finally, scatter the filtered result array so that the indices match the input rows again.
+        match &filtered_result {
+            ColumnarValue::Array(a) => {
+                scatter(selection, a.as_ref()).map(ColumnarValue::Array)
+            }
+            ColumnarValue::Scalar(ScalarValue::Boolean(value)) => {
+                // When the scalar is true or false, skip the scatter process
+                if let Some(v) = value {
+                    if *v {
+                        Ok(ColumnarValue::from(Arc::new(selection.clone()) as ArrayRef))
+                    } else {
+                        Ok(filtered_result)
+                    }
+                } else {
+                    let array = BooleanArray::from(vec![None; row_count]);
+                    scatter(selection, &array).map(ColumnarValue::Array)
+                }
+            }
+            ColumnarValue::Scalar(_) => Ok(filtered_result),
         }
     }
 
@@ -200,9 +253,9 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
         let output_interval = self.evaluate_bounds(children_ranges_refs.as_slice())?;
         let dt = output_interval.data_type();
         if dt.eq(&DataType::Boolean) {
-            let p = if output_interval.eq(&Interval::CERTAINLY_TRUE) {
+            let p = if output_interval.eq(&Interval::TRUE) {
                 ScalarValue::new_one(&dt)
-            } else if output_interval.eq(&Interval::CERTAINLY_FALSE) {
+            } else if output_interval.eq(&Interval::FALSE) {
                 ScalarValue::new_zero(&dt)
             } else {
                 ScalarValue::try_from(&dt)
@@ -262,9 +315,9 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
                     Ok((*child).clone())
                 } else if new_interval.data_type().eq(&DataType::Boolean) {
                     let dt = old_interval.data_type();
-                    let p = if new_interval.eq(&Interval::CERTAINLY_TRUE) {
+                    let p = if new_interval.eq(&Interval::TRUE) {
                         ScalarValue::new_one(&dt)
-                    } else if new_interval.eq(&Interval::CERTAINLY_FALSE) {
+                    } else if new_interval.eq(&Interval::FALSE) {
                         ScalarValue::new_zero(&dt)
                     } else {
                         unreachable!("Given that we have a range reduction for a boolean interval, we should have certainty")
@@ -294,7 +347,6 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
     /// representation.
     ///
     /// See the [`fmt_sql`] function for an example of printing `PhysicalExpr`s as SQL.
-    ///
     fn fmt_sql(&self, f: &mut Formatter<'_>) -> fmt::Result;
 
     /// Take a snapshot of this `PhysicalExpr`, if it is dynamic.
@@ -335,7 +387,7 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
     ///
     /// Systems that implement remote execution of plans, e.g. serialize a portion of the query plan
     /// and send it across the wire to a remote executor may want to call this method after
-    /// every batch on the source side and brodcast / update the current snaphot to the remote executor.
+    /// every batch on the source side and broadcast / update the current snapshot to the remote executor.
     ///
     /// Note for implementers: this method should *not* handle recursion.
     /// Recursion is handled in [`snapshot_physical_expr`].
@@ -345,42 +397,65 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + DynEq + DynHash {
         // This is a safe default behavior.
         Ok(None)
     }
-}
 
-/// [`PhysicalExpr`] can't be constrained by [`Eq`] directly because it must remain object
-/// safe. To ease implementation, blanket implementation is provided for [`Eq`] types.
-pub trait DynEq {
-    fn dyn_eq(&self, other: &dyn Any) -> bool;
-}
+    /// Returns the generation of this `PhysicalExpr` for snapshotting purposes.
+    /// The generation is an arbitrary u64 that can be used to track changes
+    /// in the state of the `PhysicalExpr` over time without having to do an exhaustive comparison.
+    /// This is useful to avoid unnecessary computation or serialization if there are no changes to the expression.
+    /// In particular, dynamic expressions that may change over time; this allows cheap checks for changes.
+    /// Static expressions that do not change over time should return 0, as does the default implementation.
+    /// You should not call this method directly as it does not handle recursion.
+    /// Instead use [`snapshot_generation`] to handle recursion and capture the
+    /// full state of the `PhysicalExpr`.
+    fn snapshot_generation(&self) -> u64 {
+        // By default, we return 0 to indicate that this PhysicalExpr does not
+        // have any dynamic references or state.
+        // Since the recursive algorithm XORs the generations of all children the overall
+        // generation will be 0 if no children have a non-zero generation, meaning that
+        // static expressions will always return 0.
+        0
+    }
+
+    /// Returns true if the expression node is volatile, i.e. whether it can return
+    /// different results when evaluated multiple times with the same input.
+    ///
+    /// Note: unlike [`is_volatile`], this function does not consider inputs:
+    /// - `random()` returns `true`,
+    /// - `a + random()` returns `false` (because the operation `+` itself is not volatile.)
+    ///
+    /// The default to this function was set to `false` when it was created
+    /// to avoid imposing API churn on implementers, but this is not a safe default in general.
+    /// It is highly recommended that volatile expressions implement this method and return `true`.
+    /// This default may be removed in the future if it causes problems or we decide to
+    /// eat the cost of the breaking change and require all implementers to make a choice.
+    fn is_volatile_node(&self) -> bool {
+        false
+    }
 
-impl<T: Eq + Any> DynEq for T {
-    fn dyn_eq(&self, other: &dyn Any) -> bool {
-        other.downcast_ref::<Self>() == Some(self)
+    /// Returns placement information for this expression.
+    ///
+    /// This is used by optimizers to make decisions about expression placement,
+    /// such as whether to push expressions down through projections.
+    ///
+    /// The default implementation returns [`ExpressionPlacement::KeepInPlace`].
+    fn placement(&self) -> ExpressionPlacement {
+        ExpressionPlacement::KeepInPlace
     }
 }
 
+#[deprecated(
+    since = "50.0.0",
+    note = "Use `datafusion_expr_common::dyn_eq` instead"
+)]
+pub use datafusion_expr_common::dyn_eq::{DynEq, DynHash};
+
 impl PartialEq for dyn PhysicalExpr {
     fn eq(&self, other: &Self) -> bool {
         self.dyn_eq(other.as_any())
     }
 }
-
 impl Eq for dyn PhysicalExpr {}
 
-/// [`PhysicalExpr`] can't be constrained by [`Hash`] directly because it must remain
-/// object safe. To ease implementation blanket implementation is provided for [`Hash`]
-/// types.
-pub trait DynHash {
-    fn dyn_hash(&self, _state: &mut dyn Hasher);
-}
-
-impl<T: Hash + Any> DynHash for T {
-    fn dyn_hash(&self, mut state: &mut dyn Hasher) {
-        self.type_id().hash(&mut state);
-        self.hash(&mut state)
-    }
-}
-
 impl Hash for dyn PhysicalExpr {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.dyn_hash(state);
@@ -394,9 +469,13 @@ pub fn with_new_children_if_necessary(
     children: Vec<Arc<dyn PhysicalExpr>>,
 ) -> Result<Arc<dyn PhysicalExpr>> {
     let old_children = expr.children();
-    if children.len() != old_children.len() {
-        internal_err!("PhysicalExpr: Wrong number of children")
-    } else if children.is_empty()
+    assert_eq_or_internal_err!(
+        children.len(),
+        old_children.len(),
+        "PhysicalExpr: Wrong number of children"
+    );
+
+    if children.is_empty()
         || children
             .iter()
             .zip(old_children.iter())
@@ -408,21 +487,6 @@ pub fn with_new_children_if_necessary(
     }
 }
 
-#[deprecated(since = "44.0.0")]
-pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
-    if any.is::<Arc<dyn PhysicalExpr>>() {
-        any.downcast_ref::<Arc<dyn PhysicalExpr>>()
-            .unwrap()
-            .as_any()
-    } else if any.is::<Box<dyn PhysicalExpr>>() {
-        any.downcast_ref::<Box<dyn PhysicalExpr>>()
-            .unwrap()
-            .as_any()
-    } else {
-        any
-    }
-}
-
 /// Returns [`Display`] able a list of [`PhysicalExpr`]
 ///
 /// Example output: `[a + 1, b]`
@@ -463,7 +527,7 @@ where
 ///
 /// # Example
 /// ```
-/// # // The boiler plate needed to create a `PhysicalExpr` for the example
+/// # // The boilerplate needed to create a `PhysicalExpr` for the example
 /// # use std::any::Any;
 /// use std::collections::HashMap;
 /// # use std::fmt::Formatter;
@@ -473,7 +537,7 @@ where
 /// # use datafusion_common::Result;
 /// # use datafusion_expr_common::columnar_value::ColumnarValue;
 /// # use datafusion_physical_expr_common::physical_expr::{fmt_sql, DynEq, PhysicalExpr};
-/// # #[derive(Debug, Hash, PartialOrd, PartialEq)]
+/// # #[derive(Debug, PartialEq, Eq, Hash)]
 /// # struct MyExpr {}
 /// # impl PhysicalExpr for MyExpr {fn as_any(&self) -> &dyn Any { unimplemented!() }
 /// # fn data_type(&self, input_schema: &Schema) -> Result<DataType> { unimplemented!() }
@@ -485,7 +549,6 @@ where
 /// # fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "CASE a > b THEN 1 ELSE 0 END") }
 /// # }
 /// # impl std::fmt::Display for MyExpr {fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { unimplemented!() } }
-/// # impl DynEq for MyExpr {fn dyn_eq(&self, other: &dyn Any) -> bool { unimplemented!() } }
 /// # fn make_physical_expr() -> Arc<dyn PhysicalExpr> { Arc::new(MyExpr{}) }
 /// let expr: Arc<dyn PhysicalExpr> = make_physical_expr();
 /// // wrap the expression in `sql_fmt` which can be used with
@@ -522,12 +585,30 @@ pub fn fmt_sql(expr: &dyn PhysicalExpr) -> impl Display + '_ {
 ///
 /// # Returns
 ///
-/// Returns an `Option<Arc<dyn PhysicalExpr>>` which is the snapshot of the
-/// `PhysicalExpr` if it is dynamic. If the `PhysicalExpr` does not have
-/// any dynamic references or state, it returns `None`.
+/// Returns a snapshot of the `PhysicalExpr` if it is dynamic, otherwise
+/// returns itself.
 pub fn snapshot_physical_expr(
     expr: Arc<dyn PhysicalExpr>,
 ) -> Result<Arc<dyn PhysicalExpr>> {
+    snapshot_physical_expr_opt(expr).data()
+}
+
+/// Take a snapshot of the given `PhysicalExpr` if it is dynamic.
+///
+/// Take a snapshot of this `PhysicalExpr` if it is dynamic.
+/// This is used to capture the current state of `PhysicalExpr`s that may contain
+/// dynamic references to other operators in order to serialize it over the wire
+/// or treat it via downcast matching.
+///
+/// See the documentation of [`PhysicalExpr::snapshot`] for more details.
+///
+/// # Returns
+///
+/// Returns a `[`Transformed`] indicating whether a snapshot was taken,
+/// along with the resulting `PhysicalExpr`.
+pub fn snapshot_physical_expr_opt(
+    expr: Arc<dyn PhysicalExpr>,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
     expr.transform_up(|e| {
         if let Some(snapshot) = e.snapshot()? {
             Ok(Transformed::yes(snapshot))
@@ -535,5 +616,244 @@ pub fn snapshot_physical_expr(
             Ok(Transformed::no(Arc::clone(&e)))
         }
     })
-    .data()
+}
+
+/// Check the generation of this `PhysicalExpr`.
+/// Dynamic `PhysicalExpr`s may have a generation that is incremented
+/// every time the state of the `PhysicalExpr` changes.
+/// If the generation changes that means this `PhysicalExpr` or one of its children
+/// has changed since the last time it was evaluated.
+///
+/// This algorithm will not produce collisions as long as the structure of the
+/// `PhysicalExpr` does not change and no `PhysicalExpr` decrements its own generation.
+pub fn snapshot_generation(expr: &Arc<dyn PhysicalExpr>) -> u64 {
+    let mut generation = 0u64;
+    expr.apply(|e| {
+        // Add the current generation of the `PhysicalExpr` to our global generation.
+        generation = generation.wrapping_add(e.snapshot_generation());
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .expect("this traversal is infallible");
+
+    generation
+}
+
+/// Check if the given `PhysicalExpr` is dynamic.
+/// Internally this calls [`snapshot_generation`] to check if the generation is non-zero,
+/// any dynamic `PhysicalExpr` should have a non-zero generation.
+pub fn is_dynamic_physical_expr(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    // If the generation is non-zero, then this `PhysicalExpr` is dynamic.
+    snapshot_generation(expr) != 0
+}
+
+/// Returns true if the expression is volatile, i.e. whether it can return different
+/// results when evaluated multiple times with the same input.
+///
+/// For example the function call `RANDOM()` is volatile as each call will
+/// return a different value.
+///
+/// This method recursively checks if any sub-expression is volatile, for example
+/// `1 + RANDOM()` will return `true`.
+pub fn is_volatile(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    if expr.is_volatile_node() {
+        return true;
+    }
+    let mut is_volatile = false;
+    expr.apply(|e| {
+        if e.is_volatile_node() {
+            is_volatile = true;
+            Ok(TreeNodeRecursion::Stop)
+        } else {
+            Ok(TreeNodeRecursion::Continue)
+        }
+    })
+    .expect("infallible closure should not fail");
+    is_volatile
+}
+
+#[cfg(test)]
+mod test {
+    use crate::physical_expr::PhysicalExpr;
+    use arrow::array::{Array, BooleanArray, Int64Array, RecordBatch};
+    use arrow::datatypes::{DataType, Schema};
+    use datafusion_expr_common::columnar_value::ColumnarValue;
+    use std::fmt::{Display, Formatter};
+    use std::sync::Arc;
+
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct TestExpr {}
+
+    impl PhysicalExpr for TestExpr {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn data_type(&self, _schema: &Schema) -> datafusion_common::Result<DataType> {
+            Ok(DataType::Int64)
+        }
+
+        fn nullable(&self, _schema: &Schema) -> datafusion_common::Result<bool> {
+            Ok(false)
+        }
+
+        fn evaluate(
+            &self,
+            batch: &RecordBatch,
+        ) -> datafusion_common::Result<ColumnarValue> {
+            let data = vec![1; batch.num_rows()];
+            Ok(ColumnarValue::Array(Arc::new(Int64Array::from(data))))
+        }
+
+        fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+            vec![]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            _children: Vec<Arc<dyn PhysicalExpr>>,
+        ) -> datafusion_common::Result<Arc<dyn PhysicalExpr>> {
+            Ok(Arc::new(Self {}))
+        }
+
+        fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+            f.write_str("TestExpr")
+        }
+    }
+
+    impl Display for TestExpr {
+        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+            self.fmt_sql(f)
+        }
+    }
+
+    macro_rules! assert_arrays_eq {
+        ($EXPECTED: expr, $ACTUAL: expr, $MESSAGE: expr) => {
+            let expected = $EXPECTED.to_array(1).unwrap();
+            let actual = $ACTUAL;
+
+            let actual_array = actual.to_array(expected.len()).unwrap();
+            let actual_ref = actual_array.as_ref();
+            let expected_ref = expected.as_ref();
+            assert!(
+                actual_ref == expected_ref,
+                "{}: expected: {:?}, actual: {:?}",
+                $MESSAGE,
+                $EXPECTED,
+                actual_ref
+            );
+        };
+    }
+
+    fn test_evaluate_selection(
+        batch: &RecordBatch,
+        selection: &BooleanArray,
+        expected: &ColumnarValue,
+    ) {
+        let expr = TestExpr {};
+
+        // First check that the `evaluate_selection` is the expected one
+        let selection_result = expr.evaluate_selection(batch, selection).unwrap();
+        assert_eq!(
+            expected.to_array(1).unwrap().len(),
+            selection_result.to_array(1).unwrap().len(),
+            "evaluate_selection should output row count should match input record batch"
+        );
+        assert_arrays_eq!(
+            expected,
+            &selection_result,
+            "evaluate_selection returned unexpected value"
+        );
+
+        // If we're selecting all rows, the result should be the same as calling `evaluate`
+        // with the full record batch.
+        if (0..batch.num_rows())
+            .all(|row_idx| row_idx < selection.len() && selection.value(row_idx))
+        {
+            let empty_result = expr.evaluate(batch).unwrap();
+
+            assert_arrays_eq!(
+                empty_result,
+                &selection_result,
+                "evaluate_selection does not match unfiltered evaluate result"
+            );
+        }
+    }
+
+    fn test_evaluate_selection_error(batch: &RecordBatch, selection: &BooleanArray) {
+        let expr = TestExpr {};
+
+        // First check that the `evaluate_selection` is the expected one
+        let selection_result = expr.evaluate_selection(batch, selection);
+        assert!(selection_result.is_err(), "evaluate_selection should fail");
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_empty_record_batch() {
+        test_evaluate_selection(
+            &RecordBatch::new_empty(Arc::new(Schema::empty())),
+            &BooleanArray::from(vec![false; 0]),
+            &ColumnarValue::Array(Arc::new(Int64Array::new_null(0))),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_empty_record_batch_with_larger_false_selection() {
+        test_evaluate_selection_error(
+            &RecordBatch::new_empty(Arc::new(Schema::empty())),
+            &BooleanArray::from(vec![false; 10]),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_empty_record_batch_with_larger_true_selection() {
+        test_evaluate_selection_error(
+            &RecordBatch::new_empty(Arc::new(Schema::empty())),
+            &BooleanArray::from(vec![true; 10]),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_non_empty_record_batch() {
+        test_evaluate_selection(
+            &unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 10) },
+            &BooleanArray::from(vec![true; 10]),
+            &ColumnarValue::Array(Arc::new(Int64Array::from(vec![1; 10]))),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_non_empty_record_batch_with_larger_false_selection()
+     {
+        test_evaluate_selection_error(
+            &unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 10) },
+            &BooleanArray::from(vec![false; 20]),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_non_empty_record_batch_with_larger_true_selection()
+     {
+        test_evaluate_selection_error(
+            &unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 10) },
+            &BooleanArray::from(vec![true; 20]),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_non_empty_record_batch_with_smaller_false_selection()
+     {
+        test_evaluate_selection_error(
+            &unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 10) },
+            &BooleanArray::from(vec![false; 5]),
+        );
+    }
+
+    #[test]
+    pub fn test_evaluate_selection_with_non_empty_record_batch_with_smaller_true_selection()
+     {
+        test_evaluate_selection_error(
+            &unsafe { RecordBatch::new_unchecked(Arc::new(Schema::empty()), vec![], 10) },
+            &BooleanArray::from(vec![true; 5]),
+        );
+    }
 }
diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs
index 2572e8679484f..fa961981c0488 100644
--- a/datafusion/physical-expr-common/src/sort_expr.rs
+++ b/datafusion/physical-expr-common/src/sort_expr.rs
@@ -17,21 +17,21 @@
 
 //! Sort expressions
 
-use crate::physical_expr::{fmt_sql, PhysicalExpr};
-use std::fmt;
-use std::fmt::{Display, Formatter};
+use std::cmp::Ordering;
+use std::fmt::{self, Display, Formatter};
 use std::hash::{Hash, Hasher};
-use std::ops::{Deref, Index, Range, RangeFrom, RangeTo};
-use std::sync::{Arc, LazyLock};
+use std::ops::{Deref, DerefMut};
+use std::sync::Arc;
 use std::vec::IntoIter;
 
+use crate::physical_expr::{PhysicalExpr, fmt_sql};
+
 use arrow::compute::kernels::sort::{SortColumn, SortOptions};
 use arrow::datatypes::Schema;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::Result;
+use datafusion_common::{HashSet, Result};
 use datafusion_expr_common::columnar_value::ColumnarValue;
-use itertools::Itertools;
-
+use indexmap::IndexSet;
 /// Represents Sort operation for a column in a RecordBatch
 ///
 /// Example:
@@ -77,7 +77,7 @@ use itertools::Itertools;
 ///   .nulls_last();
 /// assert_eq!(sort_expr.to_string(), "a DESC NULLS LAST");
 /// ```
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Eq)]
 pub struct PhysicalSortExpr {
     /// Physical expression representing the column to sort
     pub expr: Arc<dyn PhysicalExpr>,
@@ -96,6 +96,15 @@ impl PhysicalSortExpr {
         Self::new(expr, SortOptions::default())
     }
 
+    /// Reverses the sort expression. For instance, `[a ASC NULLS LAST]` turns
+    /// into `[a DESC NULLS FIRST]`. Such reversals are useful in planning, e.g.
+    /// when constructing equivalent window expressions.
+    pub fn reverse(&self) -> Self {
+        let mut result = self.clone();
+        result.options = !result.options;
+        result
+    }
+
     /// Set the sort sort options to ASC
     pub fn asc(mut self) -> Self {
         self.options.descending = false;
@@ -129,23 +138,58 @@ impl PhysicalSortExpr {
             to_str(&self.options)
         )
     }
-}
 
-/// Access the PhysicalSortExpr as a PhysicalExpr
-impl AsRef<dyn PhysicalExpr> for PhysicalSortExpr {
-    fn as_ref(&self) -> &(dyn PhysicalExpr + 'static) {
-        self.expr.as_ref()
+    /// Evaluates the sort expression into a `SortColumn` that can be passed
+    /// into the arrow sort kernel.
+    pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result<SortColumn> {
+        let array_to_sort = match self.expr.evaluate(batch)? {
+            ColumnarValue::Array(array) => array,
+            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(batch.num_rows())?,
+        };
+        Ok(SortColumn {
+            values: array_to_sort,
+            options: Some(self.options),
+        })
+    }
+
+    /// Checks whether this sort expression satisfies the given `requirement`.
+    /// If sort options are unspecified in `requirement`, only expressions are
+    /// compared for inequality. See [`options_compatible`] for details on
+    /// how sort options compare with one another.
+    pub fn satisfy(
+        &self,
+        requirement: &PhysicalSortRequirement,
+        schema: &Schema,
+    ) -> bool {
+        self.expr.eq(&requirement.expr)
+            && requirement.options.is_none_or(|opts| {
+                options_compatible(
+                    &self.options,
+                    &opts,
+                    self.expr.nullable(schema).unwrap_or(true),
+                )
+            })
+    }
+
+    /// Checks whether this sort expression satisfies the given `sort_expr`.
+    /// See [`options_compatible`] for details on how sort options compare with
+    /// one another.
+    pub fn satisfy_expr(&self, sort_expr: &Self, schema: &Schema) -> bool {
+        self.expr.eq(&sort_expr.expr)
+            && options_compatible(
+                &self.options,
+                &sort_expr.options,
+                self.expr.nullable(schema).unwrap_or(true),
+            )
     }
 }
 
 impl PartialEq for PhysicalSortExpr {
-    fn eq(&self, other: &PhysicalSortExpr) -> bool {
+    fn eq(&self, other: &Self) -> bool {
         self.options == other.options && self.expr.eq(&other.expr)
     }
 }
 
-impl Eq for PhysicalSortExpr {}
-
 impl Hash for PhysicalSortExpr {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.expr.hash(state);
@@ -159,38 +203,20 @@ impl Display for PhysicalSortExpr {
     }
 }
 
-impl PhysicalSortExpr {
-    /// evaluate the sort expression into SortColumn that can be passed into arrow sort kernel
-    pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result<SortColumn> {
-        let value_to_sort = self.expr.evaluate(batch)?;
-        let array_to_sort = match value_to_sort {
-            ColumnarValue::Array(array) => array,
-            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(batch.num_rows())?,
-        };
-        Ok(SortColumn {
-            values: array_to_sort,
-            options: Some(self.options),
-        })
-    }
-
-    /// Checks whether this sort expression satisfies the given `requirement`.
-    /// If sort options are unspecified in `requirement`, only expressions are
-    /// compared for inequality.
-    pub fn satisfy(
-        &self,
-        requirement: &PhysicalSortRequirement,
-        schema: &Schema,
-    ) -> bool {
+/// Returns whether the given two [`SortOptions`] are compatible. Here,
+/// compatibility means that they are either exactly equal, or they differ only
+/// in whether NULL values come in first/last, which is immaterial because the
+/// column in question is not nullable (specified by the `nullable` parameter).
+pub fn options_compatible(
+    options_lhs: &SortOptions,
+    options_rhs: &SortOptions,
+    nullable: bool,
+) -> bool {
+    if nullable {
+        options_lhs == options_rhs
+    } else {
         // If the column is not nullable, NULLS FIRST/LAST is not important.
-        let nullable = self.expr.nullable(schema).unwrap_or(true);
-        self.expr.eq(&requirement.expr)
-            && if nullable {
-                requirement.options.is_none_or(|opts| self.options == opts)
-            } else {
-                requirement
-                    .options
-                    .is_none_or(|opts| self.options.descending == opts.descending)
-            }
+        options_lhs.descending == options_rhs.descending
     }
 }
 
@@ -222,28 +248,8 @@ pub struct PhysicalSortRequirement {
     pub options: Option<SortOptions>,
 }
 
-impl From<PhysicalSortRequirement> for PhysicalSortExpr {
-    /// If options is `None`, the default sort options `ASC, NULLS LAST` is used.
-    ///
-    /// The default is picked to be consistent with
-    /// PostgreSQL: <https://www.postgresql.org/docs/current/queries-order.html>
-    fn from(value: PhysicalSortRequirement) -> Self {
-        let options = value.options.unwrap_or(SortOptions {
-            descending: false,
-            nulls_first: false,
-        });
-        PhysicalSortExpr::new(value.expr, options)
-    }
-}
-
-impl From<PhysicalSortExpr> for PhysicalSortRequirement {
-    fn from(value: PhysicalSortExpr) -> Self {
-        PhysicalSortRequirement::new(value.expr, Some(value.options))
-    }
-}
-
 impl PartialEq for PhysicalSortRequirement {
-    fn eq(&self, other: &PhysicalSortRequirement) -> bool {
+    fn eq(&self, other: &Self) -> bool {
         self.options == other.options && self.expr.eq(&other.expr)
     }
 }
@@ -293,37 +299,16 @@ impl PhysicalSortRequirement {
         Self { expr, options }
     }
 
-    /// Replace the required expression for this requirement with the new one
-    pub fn with_expr(mut self, expr: Arc<dyn PhysicalExpr>) -> Self {
-        self.expr = expr;
-        self
-    }
-
     /// Returns whether this requirement is equal or more specific than `other`.
-    pub fn compatible(&self, other: &PhysicalSortRequirement) -> bool {
+    pub fn compatible(&self, other: &Self) -> bool {
         self.expr.eq(&other.expr)
             && other
                 .options
                 .is_none_or(|other_opts| self.options == Some(other_opts))
     }
-
-    #[deprecated(since = "43.0.0", note = "use  LexRequirement::from_lex_ordering")]
-    pub fn from_sort_exprs<'a>(
-        ordering: impl IntoIterator<Item = &'a PhysicalSortExpr>,
-    ) -> LexRequirement {
-        let ordering = ordering.into_iter().cloned().collect();
-        LexRequirement::from_lex_ordering(ordering)
-    }
-    #[deprecated(since = "43.0.0", note = "use  LexOrdering::from_lex_requirement")]
-    pub fn to_sort_exprs(
-        requirements: impl IntoIterator<Item = PhysicalSortRequirement>,
-    ) -> LexOrdering {
-        let requirements = requirements.into_iter().collect();
-        LexOrdering::from_lex_requirement(requirements)
-    }
 }
 
-/// Returns the SQL string representation of the given [SortOptions] object.
+/// Returns the SQL string representation of the given [`SortOptions`] object.
 #[inline]
 fn to_str(options: &SortOptions) -> &str {
     match (options.descending, options.nulls_first) {
@@ -334,162 +319,214 @@ fn to_str(options: &SortOptions) -> &str {
     }
 }
 
-///`LexOrdering` contains a `Vec<PhysicalSortExpr>`, which represents
-/// a lexicographical ordering.
+// Cross-conversion utilities between `PhysicalSortExpr` and `PhysicalSortRequirement`
+impl From<PhysicalSortExpr> for PhysicalSortRequirement {
+    fn from(value: PhysicalSortExpr) -> Self {
+        Self::new(value.expr, Some(value.options))
+    }
+}
+
+impl From<PhysicalSortRequirement> for PhysicalSortExpr {
+    /// The default sort options `ASC, NULLS LAST` when the requirement does
+    /// not specify sort options. This default is consistent with PostgreSQL.
+    ///
+    /// Reference: <https://www.postgresql.org/docs/current/queries-order.html>
+    fn from(value: PhysicalSortRequirement) -> Self {
+        let options = value
+            .options
+            .unwrap_or_else(|| SortOptions::new(false, false));
+        Self::new(value.expr, options)
+    }
+}
+
+/// This object represents a lexicographical ordering and contains a vector
+/// of `PhysicalSortExpr` objects.
 ///
-/// For example, `vec![a ASC, b DESC]` represents a lexicographical ordering
+/// For example, a `vec![a ASC, b DESC]` represents a lexicographical ordering
 /// that first sorts by column `a` in ascending order, then by column `b` in
 /// descending order.
-#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
+///
+/// # Invariants
+///
+/// The following always hold true for a `LexOrdering`:
+///
+/// 1. It is non-degenerate, meaning it contains at least one element.
+/// 2. It is duplicate-free, meaning it does not contain multiple entries for
+///    the same column.
+#[derive(Clone, Debug)]
 pub struct LexOrdering {
-    inner: Vec<PhysicalSortExpr>,
-}
-
-impl AsRef<LexOrdering> for LexOrdering {
-    fn as_ref(&self) -> &LexOrdering {
-        self
-    }
+    /// Vector of sort expressions representing the lexicographical ordering.
+    exprs: Vec<PhysicalSortExpr>,
+    /// Set of expressions in the lexicographical ordering, used to ensure
+    /// that the ordering is duplicate-free. Note that the elements in this
+    /// set are the same underlying physical expressions as in `exprs`.
+    set: IndexSet<Arc<dyn PhysicalExpr>>,
 }
 
 impl LexOrdering {
-    /// Creates a new [`LexOrdering`] from a vector
-    pub fn new(inner: Vec<PhysicalSortExpr>) -> Self {
-        Self { inner }
-    }
-
-    /// Return an empty LexOrdering (no expressions)
-    pub fn empty() -> &'static LexOrdering {
-        static EMPTY_ORDER: LazyLock<LexOrdering> = LazyLock::new(LexOrdering::default);
-        &EMPTY_ORDER
-    }
-
-    /// Returns the number of elements that can be stored in the LexOrdering
-    /// without reallocating.
-    pub fn capacity(&self) -> usize {
-        self.inner.capacity()
-    }
-
-    /// Clears the LexOrdering, removing all elements.
-    pub fn clear(&mut self) {
-        self.inner.clear()
-    }
-
-    /// Takes ownership of the actual vector of `PhysicalSortExpr`s in the LexOrdering.
-    pub fn take_exprs(self) -> Vec<PhysicalSortExpr> {
-        self.inner
-    }
-
-    /// Returns `true` if the LexOrdering contains `expr`
-    pub fn contains(&self, expr: &PhysicalSortExpr) -> bool {
-        self.inner.contains(expr)
-    }
-
-    /// Add all elements from `iter` to the LexOrdering.
-    pub fn extend<I: IntoIterator<Item = PhysicalSortExpr>>(&mut self, iter: I) {
-        self.inner.extend(iter)
-    }
-
-    /// Remove all elements from the LexOrdering where `f` evaluates to `false`.
-    pub fn retain<F>(&mut self, f: F)
-    where
-        F: FnMut(&PhysicalSortExpr) -> bool,
-    {
-        self.inner.retain(f)
-    }
-
-    /// Returns `true` if the LexOrdering contains no elements.
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    /// Returns an iterator over each `&PhysicalSortExpr` in the LexOrdering.
-    pub fn iter(&self) -> core::slice::Iter<PhysicalSortExpr> {
-        self.inner.iter()
+    /// Creates a new [`LexOrdering`] from the given vector of sort expressions.
+    /// If the vector is empty, returns `None`.
+    pub fn new(exprs: impl IntoIterator<Item = PhysicalSortExpr>) -> Option<Self> {
+        let exprs = exprs.into_iter();
+        let mut candidate = Self {
+            // not valid yet; valid publicly-returned instance must be non-empty
+            exprs: Vec::new(),
+            set: IndexSet::new(),
+        };
+        for expr in exprs {
+            candidate.push(expr);
+        }
+        if candidate.exprs.is_empty() {
+            None
+        } else {
+            Some(candidate)
+        }
     }
 
-    /// Returns the number of elements in the LexOrdering.
-    pub fn len(&self) -> usize {
-        self.inner.len()
+    /// Appends an element to the back of the `LexOrdering`.
+    pub fn push(&mut self, sort_expr: PhysicalSortExpr) {
+        if self.set.insert(Arc::clone(&sort_expr.expr)) {
+            self.exprs.push(sort_expr);
+        }
     }
 
-    /// Removes the last element from the LexOrdering and returns it, or `None` if it is empty.
-    pub fn pop(&mut self) -> Option<PhysicalSortExpr> {
-        self.inner.pop()
+    /// Add all elements from `iter` to the `LexOrdering`.
+    pub fn extend(&mut self, sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>) {
+        for sort_expr in sort_exprs {
+            self.push(sort_expr);
+        }
     }
 
-    /// Appends an element to the back of the LexOrdering.
-    pub fn push(&mut self, physical_sort_expr: PhysicalSortExpr) {
-        self.inner.push(physical_sort_expr)
+    /// Returns the leading `PhysicalSortExpr` of the `LexOrdering`. Note that
+    /// this function does not return an `Option`, as a `LexOrdering` is always
+    /// non-degenerate (i.e. it contains at least one element).
+    pub fn first(&self) -> &PhysicalSortExpr {
+        // Can safely `unwrap` because `LexOrdering` is non-degenerate:
+        self.exprs.first().unwrap()
     }
 
-    /// Truncates the LexOrdering, keeping only the first `len` elements.
-    pub fn truncate(&mut self, len: usize) {
-        self.inner.truncate(len)
+    /// Returns the number of elements that can be stored in the `LexOrdering`
+    /// without reallocating.
+    pub fn capacity(&self) -> usize {
+        self.exprs.capacity()
     }
 
-    /// Merge the contents of `other` into `self`, removing duplicates.
-    pub fn merge(mut self, other: LexOrdering) -> Self {
-        self.inner = self.inner.into_iter().chain(other).unique().collect();
-        self
+    /// Truncates the `LexOrdering`, keeping only the first `len` elements.
+    /// Returns `true` if truncation made a change, `false` otherwise. Negative
+    /// cases happen in two scenarios: (1) When `len` is greater than or equal
+    /// to the number of expressions inside this `LexOrdering`, making truncation
+    /// a no-op, or (2) when `len` is `0`, making truncation impossible.
+    pub fn truncate(&mut self, len: usize) -> bool {
+        if len == 0 || len >= self.exprs.len() {
+            return false;
+        }
+        for PhysicalSortExpr { expr, .. } in self.exprs[len..].iter() {
+            self.set.swap_remove(expr);
+        }
+        self.exprs.truncate(len);
+        true
     }
 
-    /// Converts a `LexRequirement` into a `LexOrdering`.
+    /// Check if reversing this ordering would satisfy another ordering requirement.
     ///
-    /// This function converts [`PhysicalSortRequirement`] to [`PhysicalSortExpr`]
-    /// for each entry in the input.
+    /// This supports **prefix matching**: if this ordering is `[A DESC, B ASC]`
+    /// and `other` is `[A ASC]`, reversing this gives `[A ASC, B DESC]`, which
+    /// satisfies `other` since `[A ASC]` is a prefix.
     ///
-    /// If the required ordering is `None` for an entry in `requirement`, the
-    /// default ordering `ASC, NULLS LAST` is used (see
-    /// [`PhysicalSortExpr::from`]).
-    pub fn from_lex_requirement(requirement: LexRequirement) -> LexOrdering {
-        requirement
-            .into_iter()
-            .map(PhysicalSortExpr::from)
-            .collect()
-    }
-
-    /// Collapse a `LexOrdering` into a new duplicate-free `LexOrdering` based on expression.
+    /// # Arguments
+    /// * `other` - The ordering requirement to check against
     ///
-    /// This function filters  duplicate entries that have same physical
-    /// expression inside, ignoring [`SortOptions`]. For example:
+    /// # Returns
+    /// `true` if reversing this ordering would satisfy `other`
     ///
-    /// `vec![a ASC, a DESC]` collapses to `vec![a ASC]`.
-    pub fn collapse(self) -> Self {
-        let mut output = LexOrdering::default();
-        for item in self {
-            if !output.iter().any(|req| req.expr.eq(&item.expr)) {
-                output.push(item);
-            }
+    /// # Example
+    /// ```text
+    /// self:  [number DESC, letter ASC]
+    /// other: [number ASC]
+    /// After reversing self: [number ASC, letter DESC]  ✓ Prefix match!
+    /// ```
+    pub fn is_reverse(&self, other: &LexOrdering) -> bool {
+        let self_exprs = self.as_ref();
+        let other_exprs = other.as_ref();
+
+        if other_exprs.len() > self_exprs.len() {
+            return false;
         }
-        output
-    }
 
-    /// Transforms each `PhysicalSortExpr` in the `LexOrdering`
-    /// in place using the provided closure `f`.
-    pub fn transform<F>(&mut self, f: F)
-    where
-        F: FnMut(&mut PhysicalSortExpr),
-    {
-        self.inner.iter_mut().for_each(f);
+        other_exprs.iter().zip(self_exprs.iter()).all(|(req, cur)| {
+            req.expr.eq(&cur.expr) && is_reversed_sort_options(&req.options, &cur.options)
+        })
     }
-}
 
-impl From<Vec<PhysicalSortExpr>> for LexOrdering {
-    fn from(value: Vec<PhysicalSortExpr>) -> Self {
-        Self::new(value)
-    }
-}
+    /// Returns the sort options for the given expression if one is defined in this `LexOrdering`.
+    pub fn get_sort_options(&self, expr: &dyn PhysicalExpr) -> Option<SortOptions> {
+        for e in self {
+            if e.expr.as_ref().dyn_eq(expr) {
+                return Some(e.options);
+            }
+        }
 
-impl From<LexRequirement> for LexOrdering {
-    fn from(value: LexRequirement) -> Self {
-        Self::from_lex_requirement(value)
+        None
     }
 }
 
-/// Convert a `LexOrdering` into a `Arc[<PhysicalSortExpr>]` for fast copies
-impl From<LexOrdering> for Arc<[PhysicalSortExpr]> {
-    fn from(value: LexOrdering) -> Self {
-        value.inner.into()
+/// Check if two SortOptions represent reversed orderings.
+///
+/// Returns `true` if both `descending` and `nulls_first` are opposite.
+///
+/// # Example
+/// ```
+/// use arrow::compute::SortOptions;
+/// # use datafusion_physical_expr_common::sort_expr::is_reversed_sort_options;
+///
+/// let asc_nulls_last = SortOptions {
+///     descending: false,
+///     nulls_first: false,
+/// };
+/// let desc_nulls_first = SortOptions {
+///     descending: true,
+///     nulls_first: true,
+/// };
+///
+/// assert!(is_reversed_sort_options(&asc_nulls_last, &desc_nulls_first));
+/// assert!(is_reversed_sort_options(&desc_nulls_first, &asc_nulls_last));
+/// ```
+pub fn is_reversed_sort_options(lhs: &SortOptions, rhs: &SortOptions) -> bool {
+    lhs.descending != rhs.descending && lhs.nulls_first != rhs.nulls_first
+}
+
+impl PartialEq for LexOrdering {
+    fn eq(&self, other: &Self) -> bool {
+        let Self {
+            exprs,
+            set: _, // derived from `exprs`
+        } = self;
+        // PartialEq must be consistent with PartialOrd
+        exprs == &other.exprs
+    }
+}
+impl Eq for LexOrdering {}
+impl PartialOrd for LexOrdering {
+    /// There is a partial ordering among `LexOrdering` objects. For example, the
+    /// ordering `[a ASC]` is coarser (less) than ordering `[a ASC, b ASC]`.
+    /// If two orderings do not share a prefix, they are incomparable.
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        // PartialEq must be consistent with PartialOrd
+        self.exprs
+            .iter()
+            .zip(other.exprs.iter())
+            .all(|(lhs, rhs)| lhs == rhs)
+            .then(|| self.len().cmp(&other.len()))
+    }
+}
+
+impl<const N: usize> From<[PhysicalSortExpr; N]> for LexOrdering {
+    fn from(value: [PhysicalSortExpr; N]) -> Self {
+        // TODO: Replace this assertion with a condition on the generic parameter
+        //       when Rust supports it.
+        assert!(N > 0);
+        Self::new(value)
+            .expect("A LexOrdering from non-empty array must be non-degenerate")
     }
 }
 
@@ -497,14 +534,14 @@ impl Deref for LexOrdering {
     type Target = [PhysicalSortExpr];
 
     fn deref(&self) -> &Self::Target {
-        self.inner.as_slice()
+        self.exprs.as_slice()
     }
 }
 
 impl Display for LexOrdering {
     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
         let mut first = true;
-        for sort_expr in &self.inner {
+        for sort_expr in &self.exprs {
             if first {
                 first = false;
             } else {
@@ -516,162 +553,296 @@ impl Display for LexOrdering {
     }
 }
 
-impl FromIterator<PhysicalSortExpr> for LexOrdering {
-    fn from_iter<T: IntoIterator<Item = PhysicalSortExpr>>(iter: T) -> Self {
-        let mut lex_ordering = LexOrdering::default();
-
-        for i in iter {
-            lex_ordering.push(i);
-        }
+impl IntoIterator for LexOrdering {
+    type Item = PhysicalSortExpr;
+    type IntoIter = IntoIter<Self::Item>;
 
-        lex_ordering
+    fn into_iter(self) -> Self::IntoIter {
+        self.exprs.into_iter()
     }
 }
 
-impl Index<usize> for LexOrdering {
-    type Output = PhysicalSortExpr;
+impl<'a> IntoIterator for &'a LexOrdering {
+    type Item = &'a PhysicalSortExpr;
+    type IntoIter = std::slice::Iter<'a, PhysicalSortExpr>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.exprs.iter()
+    }
+}
 
-    fn index(&self, index: usize) -> &Self::Output {
-        &self.inner[index]
+impl From<LexOrdering> for Vec<PhysicalSortExpr> {
+    fn from(ordering: LexOrdering) -> Self {
+        ordering.exprs
     }
 }
 
-impl Index<Range<usize>> for LexOrdering {
-    type Output = [PhysicalSortExpr];
+/// This object represents a lexicographical ordering requirement and contains
+/// a vector of `PhysicalSortRequirement` objects.
+///
+/// For example, a `vec![a Some(ASC), b None]` represents a lexicographical
+/// requirement that firsts imposes an ordering by column `a` in ascending
+/// order, then by column `b` in *any* (ascending or descending) order. The
+/// ordering is non-degenerate, meaning it contains at least one element, and
+/// it is duplicate-free, meaning it does not contain multiple entries for the
+/// same column.
+///
+/// Note that a `LexRequirement` need not enforce the uniqueness of its sort
+/// expressions after construction like a `LexOrdering` does, because it provides
+/// no mutation methods. If such methods become necessary, we will need to
+/// enforce uniqueness like the latter object.
+#[derive(Debug, Clone, PartialEq)]
+pub struct LexRequirement {
+    reqs: Vec<PhysicalSortRequirement>,
+}
+
+impl LexRequirement {
+    /// Creates a new [`LexRequirement`] from the given vector of sort expressions.
+    /// If the vector is empty, returns `None`.
+    pub fn new(reqs: impl IntoIterator<Item = PhysicalSortRequirement>) -> Option<Self> {
+        let (non_empty, requirements) = Self::construct(reqs);
+        non_empty.then_some(requirements)
+    }
+
+    /// Returns the leading `PhysicalSortRequirement` of the `LexRequirement`.
+    /// Note that this function does not return an `Option`, as a `LexRequirement`
+    /// is always non-degenerate (i.e. it contains at least one element).
+    pub fn first(&self) -> &PhysicalSortRequirement {
+        // Can safely `unwrap` because `LexRequirement` is non-degenerate:
+        self.reqs.first().unwrap()
+    }
+
+    /// Constructs a new `LexRequirement` from the given sort requirements w/o
+    /// enforcing non-degeneracy. This function is used internally and is not
+    /// meant (or safe) for external use.
+    fn construct(
+        reqs: impl IntoIterator<Item = PhysicalSortRequirement>,
+    ) -> (bool, Self) {
+        let mut set = HashSet::new();
+        let reqs = reqs
+            .into_iter()
+            .filter_map(|r| set.insert(Arc::clone(&r.expr)).then_some(r))
+            .collect();
+        (!set.is_empty(), Self { reqs })
+    }
+}
 
-    fn index(&self, range: Range<usize>) -> &Self::Output {
-        &self.inner[range]
+impl<const N: usize> From<[PhysicalSortRequirement; N]> for LexRequirement {
+    fn from(value: [PhysicalSortRequirement; N]) -> Self {
+        // TODO: Replace this assertion with a condition on the generic parameter
+        //       when Rust supports it.
+        assert!(N > 0);
+        let (non_empty, requirement) = Self::construct(value);
+        debug_assert!(non_empty);
+        requirement
     }
 }
 
-impl Index<RangeFrom<usize>> for LexOrdering {
-    type Output = [PhysicalSortExpr];
+impl Deref for LexRequirement {
+    type Target = [PhysicalSortRequirement];
 
-    fn index(&self, range_from: RangeFrom<usize>) -> &Self::Output {
-        &self.inner[range_from]
+    fn deref(&self) -> &Self::Target {
+        self.reqs.as_slice()
     }
 }
 
-impl Index<RangeTo<usize>> for LexOrdering {
-    type Output = [PhysicalSortExpr];
+impl IntoIterator for LexRequirement {
+    type Item = PhysicalSortRequirement;
+    type IntoIter = IntoIter<Self::Item>;
 
-    fn index(&self, range_to: RangeTo<usize>) -> &Self::Output {
-        &self.inner[range_to]
+    fn into_iter(self) -> Self::IntoIter {
+        self.reqs.into_iter()
     }
 }
 
-impl IntoIterator for LexOrdering {
-    type Item = PhysicalSortExpr;
-    type IntoIter = IntoIter<PhysicalSortExpr>;
+impl<'a> IntoIterator for &'a LexRequirement {
+    type Item = &'a PhysicalSortRequirement;
+    type IntoIter = std::slice::Iter<'a, PhysicalSortRequirement>;
 
     fn into_iter(self) -> Self::IntoIter {
-        self.inner.into_iter()
+        self.reqs.iter()
+    }
+}
+
+impl From<LexRequirement> for Vec<PhysicalSortRequirement> {
+    fn from(requirement: LexRequirement) -> Self {
+        requirement.reqs
     }
 }
 
-///`LexOrderingRef` is an alias for the type &`[PhysicalSortExpr]`, which represents
-/// a reference to a lexicographical ordering.
-#[deprecated(since = "43.0.0", note = "use &LexOrdering instead")]
-pub type LexOrderingRef<'a> = &'a [PhysicalSortExpr];
+// Cross-conversion utilities between `LexOrdering` and `LexRequirement`
+impl From<LexOrdering> for LexRequirement {
+    fn from(value: LexOrdering) -> Self {
+        // Can construct directly as `value` is non-degenerate:
+        let (non_empty, requirements) =
+            Self::construct(value.into_iter().map(Into::into));
+        debug_assert!(non_empty);
+        requirements
+    }
+}
 
-///`LexRequirement` is an struct containing a `Vec<PhysicalSortRequirement>`, which
-/// represents a lexicographical ordering requirement.
-#[derive(Debug, Default, Clone, PartialEq)]
-pub struct LexRequirement {
-    pub inner: Vec<PhysicalSortRequirement>,
+impl From<LexRequirement> for LexOrdering {
+    fn from(value: LexRequirement) -> Self {
+        // Can construct directly as `value` is non-degenerate
+        Self::new(value.into_iter().map(Into::into))
+            .expect("A LexOrdering from LexRequirement must be non-degenerate")
+    }
 }
 
-impl LexRequirement {
-    pub fn new(inner: Vec<PhysicalSortRequirement>) -> Self {
-        Self { inner }
+/// Represents a plan's input ordering requirements. Vector elements represent
+/// alternative ordering requirements in the order of preference. The list of
+/// alternatives can be either hard or soft, depending on whether the operator
+/// can work without an input ordering.
+///
+/// # Invariants
+///
+/// The following always hold true for a `OrderingRequirements`:
+///
+/// 1. It is non-degenerate, meaning it contains at least one ordering. The
+///    absence of an input ordering requirement is represented by a `None` value
+///    in `ExecutionPlan` APIs, which return an `Option<OrderingRequirements>`.
+#[derive(Debug, Clone, PartialEq)]
+pub enum OrderingRequirements {
+    /// The operator is not able to work without one of these requirements.
+    Hard(Vec<LexRequirement>),
+    /// The operator can benefit from these input orderings when available,
+    /// but can still work in the absence of any input ordering.
+    Soft(Vec<LexRequirement>),
+}
+
+impl OrderingRequirements {
+    /// Creates a new instance from the given alternatives. If an empty list of
+    /// alternatives are given, returns `None`.
+    pub fn new_alternatives(
+        alternatives: impl IntoIterator<Item = LexRequirement>,
+        soft: bool,
+    ) -> Option<Self> {
+        let alternatives = alternatives.into_iter().collect::<Vec<_>>();
+        (!alternatives.is_empty()).then(|| {
+            if soft {
+                Self::Soft(alternatives)
+            } else {
+                Self::Hard(alternatives)
+            }
+        })
     }
 
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
+    /// Creates a new instance with a single hard requirement.
+    pub fn new(requirement: LexRequirement) -> Self {
+        Self::Hard(vec![requirement])
     }
 
-    pub fn iter(&self) -> impl Iterator<Item = &PhysicalSortRequirement> {
-        self.inner.iter()
+    /// Creates a new instance with a single soft requirement.
+    pub fn new_soft(requirement: LexRequirement) -> Self {
+        Self::Soft(vec![requirement])
     }
 
-    pub fn push(&mut self, physical_sort_requirement: PhysicalSortRequirement) {
-        self.inner.push(physical_sort_requirement)
+    /// Adds an alternative requirement to the list of alternatives.
+    pub fn add_alternative(&mut self, requirement: LexRequirement) {
+        match self {
+            Self::Hard(alts) | Self::Soft(alts) => alts.push(requirement),
+        }
     }
 
-    /// Create a new [`LexRequirement`] from a [`LexOrdering`]
-    ///
-    /// Returns [`LexRequirement`] that requires the exact
-    /// sort of the [`PhysicalSortExpr`]s in `ordering`
-    pub fn from_lex_ordering(ordering: LexOrdering) -> Self {
-        Self::new(
-            ordering
-                .into_iter()
-                .map(PhysicalSortRequirement::from)
-                .collect(),
-        )
+    /// Returns the first (i.e. most preferred) `LexRequirement` among
+    /// alternative requirements.
+    pub fn into_single(self) -> LexRequirement {
+        match self {
+            Self::Hard(mut alts) | Self::Soft(mut alts) => alts.swap_remove(0),
+        }
     }
 
-    /// Constructs a duplicate-free `LexOrderingReq` by filtering out
-    /// duplicate entries that have same physical expression inside.
-    ///
-    /// For example, `vec![a Some(ASC), a Some(DESC)]` collapses to `vec![a
-    /// Some(ASC)]`.
-    pub fn collapse(self) -> Self {
-        let mut output = Vec::<PhysicalSortRequirement>::new();
-        for item in self {
-            if !output.iter().any(|req| req.expr.eq(&item.expr)) {
-                output.push(item);
-            }
+    /// Returns a reference to the first (i.e. most preferred) `LexRequirement`
+    /// among alternative requirements.
+    pub fn first(&self) -> &LexRequirement {
+        match self {
+            Self::Hard(alts) | Self::Soft(alts) => &alts[0],
         }
-        LexRequirement::new(output)
     }
-}
 
-impl From<LexOrdering> for LexRequirement {
-    fn from(value: LexOrdering) -> Self {
-        Self::from_lex_ordering(value)
+    /// Returns all alternatives as a vector of `LexRequirement` objects and a
+    /// boolean value indicating softness/hardness of the requirements.
+    pub fn into_alternatives(self) -> (Vec<LexRequirement>, bool) {
+        match self {
+            Self::Hard(alts) => (alts, false),
+            Self::Soft(alts) => (alts, true),
+        }
     }
 }
 
-impl Deref for LexRequirement {
-    type Target = [PhysicalSortRequirement];
+impl From<LexRequirement> for OrderingRequirements {
+    fn from(requirement: LexRequirement) -> Self {
+        Self::new(requirement)
+    }
+}
 
-    fn deref(&self) -> &Self::Target {
-        self.inner.as_slice()
+impl From<LexOrdering> for OrderingRequirements {
+    fn from(ordering: LexOrdering) -> Self {
+        Self::new(ordering.into())
     }
 }
 
-impl FromIterator<PhysicalSortRequirement> for LexRequirement {
-    fn from_iter<T: IntoIterator<Item = PhysicalSortRequirement>>(iter: T) -> Self {
-        let mut lex_requirement = LexRequirement::new(vec![]);
+impl Deref for OrderingRequirements {
+    type Target = [LexRequirement];
 
-        for i in iter {
-            lex_requirement.inner.push(i);
+    fn deref(&self) -> &Self::Target {
+        match &self {
+            Self::Hard(alts) | Self::Soft(alts) => alts.as_slice(),
         }
-
-        lex_requirement
     }
 }
 
-impl IntoIterator for LexRequirement {
-    type Item = PhysicalSortRequirement;
-    type IntoIter = IntoIter<Self::Item>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.inner.into_iter()
+impl DerefMut for OrderingRequirements {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match self {
+            Self::Hard(alts) | Self::Soft(alts) => alts.as_mut_slice(),
+        }
     }
 }
 
-impl<'a> IntoIterator for &'a LexOrdering {
-    type Item = &'a PhysicalSortExpr;
-    type IntoIter = std::slice::Iter<'a, PhysicalSortExpr>;
+#[cfg(test)]
+mod tests {
+    use super::*;
 
-    fn into_iter(self) -> Self::IntoIter {
-        self.inner.iter()
+    #[test]
+    fn test_is_reversed_sort_options() {
+        // Test basic reversal: ASC NULLS LAST ↔ DESC NULLS FIRST
+        let asc_nulls_last = SortOptions {
+            descending: false,
+            nulls_first: false,
+        };
+        let desc_nulls_first = SortOptions {
+            descending: true,
+            nulls_first: true,
+        };
+        assert!(is_reversed_sort_options(&asc_nulls_last, &desc_nulls_first));
+        assert!(is_reversed_sort_options(&desc_nulls_first, &asc_nulls_last));
+
+        // Test another reversal: ASC NULLS FIRST ↔ DESC NULLS LAST
+        let asc_nulls_first = SortOptions {
+            descending: false,
+            nulls_first: true,
+        };
+        let desc_nulls_last = SortOptions {
+            descending: true,
+            nulls_first: false,
+        };
+        assert!(is_reversed_sort_options(&asc_nulls_first, &desc_nulls_last));
+        assert!(is_reversed_sort_options(&desc_nulls_last, &asc_nulls_first));
+
+        // Test non-reversal: same options
+        assert!(!is_reversed_sort_options(&asc_nulls_last, &asc_nulls_last));
+        assert!(!is_reversed_sort_options(
+            &desc_nulls_first,
+            &desc_nulls_first
+        ));
+
+        // Test non-reversal: only descending differs
+        assert!(!is_reversed_sort_options(&asc_nulls_last, &desc_nulls_last));
+        assert!(!is_reversed_sort_options(&desc_nulls_last, &asc_nulls_last));
+
+        // Test non-reversal: only nulls_first differs
+        assert!(!is_reversed_sort_options(&asc_nulls_last, &asc_nulls_first));
+        assert!(!is_reversed_sort_options(&asc_nulls_first, &asc_nulls_last));
     }
 }
-
-///`LexRequirementRef` is an alias for the type &`[PhysicalSortRequirement]`, which
-/// represents a reference to a lexicographical ordering requirement.
-/// #[deprecated(since = "43.0.0", note = "use &LexRequirement instead")]
-pub type LexRequirementRef<'a> = &'a [PhysicalSortRequirement];
diff --git a/datafusion/physical-expr-common/src/tree_node.rs b/datafusion/physical-expr-common/src/tree_node.rs
index c37e67575bf00..6c7d04a22535f 100644
--- a/datafusion/physical-expr-common/src/tree_node.rs
+++ b/datafusion/physical-expr-common/src/tree_node.rs
@@ -20,10 +20,10 @@
 use std::fmt::{self, Display, Formatter};
 use std::sync::Arc;
 
-use crate::physical_expr::{with_new_children_if_necessary, PhysicalExpr};
+use crate::physical_expr::{PhysicalExpr, with_new_children_if_necessary};
 
-use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode};
 use datafusion_common::Result;
+use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode};
 
 impl DynTreeNode for dyn PhysicalExpr {
     fn arc_children(&self) -> Vec<&Arc<Self>> {
diff --git a/datafusion/physical-expr-common/src/utils.rs b/datafusion/physical-expr-common/src/utils.rs
index 114007bfa6afb..278294a9bf2ad 100644
--- a/datafusion/physical-expr-common/src/utils.rs
+++ b/datafusion/physical-expr-common/src/utils.rs
@@ -17,16 +17,16 @@
 
 use std::sync::Arc;
 
-use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData};
-use arrow::compute::{and_kleene, is_not_null, SlicesIterator};
+use crate::metrics::ExpressionEvaluatorMetrics;
+use crate::physical_expr::PhysicalExpr;
+use crate::tree_node::ExprContext;
 
+use arrow::array::{Array, ArrayRef, BooleanArray, MutableArrayData, make_array};
+use arrow::compute::{SlicesIterator, and_kleene, is_not_null};
+use arrow::record_batch::RecordBatch;
 use datafusion_common::Result;
 use datafusion_expr_common::sort_properties::ExprProperties;
 
-use crate::physical_expr::PhysicalExpr;
-use crate::sort_expr::{LexOrdering, PhysicalSortExpr};
-use crate::tree_node::ExprContext;
-
 /// Represents a [`PhysicalExpr`] node with associated properties (order and
 /// range) in a context where properties are tracked.
 pub type ExprPropertiesNode = ExprContext<ExprProperties>;
@@ -93,14 +93,38 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
     Ok(make_array(data))
 }
 
-/// Reverses the ORDER BY expression, which is useful during equivalent window
-/// expression construction. For instance, 'ORDER BY a ASC, NULLS LAST' turns into
-/// 'ORDER BY a DESC, NULLS FIRST'.
-pub fn reverse_order_bys(order_bys: &LexOrdering) -> LexOrdering {
-    order_bys
-        .iter()
-        .map(|e| PhysicalSortExpr::new(Arc::clone(&e.expr), !e.options))
-        .collect()
+/// Evaluates expressions against a record batch.
+/// This will convert the resulting ColumnarValues to ArrayRefs,
+/// duplicating any ScalarValues that may have been returned,
+/// and validating that the returned arrays all have the same
+/// number of rows as the input batch.
+#[inline]
+pub fn evaluate_expressions_to_arrays<'a>(
+    exprs: impl IntoIterator<Item = &'a Arc<dyn PhysicalExpr>>,
+    batch: &RecordBatch,
+) -> Result<Vec<ArrayRef>> {
+    evaluate_expressions_to_arrays_with_metrics(exprs, batch, None)
+}
+
+/// Same as [`evaluate_expressions_to_arrays`] but records optional per-expression metrics.
+///
+/// For metrics tracking, see [`ExpressionEvaluatorMetrics`] for details.
+#[inline]
+pub fn evaluate_expressions_to_arrays_with_metrics<'a>(
+    exprs: impl IntoIterator<Item = &'a Arc<dyn PhysicalExpr>>,
+    batch: &RecordBatch,
+    metrics: Option<&ExpressionEvaluatorMetrics>,
+) -> Result<Vec<ArrayRef>> {
+    let num_rows = batch.num_rows();
+    exprs
+        .into_iter()
+        .enumerate()
+        .map(|(idx, e)| {
+            let _timer = metrics.and_then(|m| m.scoped_timer(idx));
+            e.evaluate(batch)
+                .and_then(|col| col.into_array_of_size(num_rows))
+        })
+        .collect::<Result<Vec<ArrayRef>>>()
 }
 
 #[cfg(test)]
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 2cce585b7f150..3a3224545ce4e 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -31,27 +31,33 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [lib]
 name = "datafusion_physical_expr"
 
+[features]
+recursive_protection = ["dep:recursive"]
+
 [dependencies]
-ahash = { workspace = true }
 arrow = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
+datafusion-common = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true }
 datafusion-functions-aggregate-common = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
-half = { workspace = true }
 hashbrown = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true, features = ["use_std"] }
-log = { workspace = true }
-paste = "^1.0"
-petgraph = "0.8.1"
+parking_lot = { workspace = true }
+petgraph = "0.8.3"
+recursive = { workspace = true, optional = true }
+tokio = { workspace = true }
+half = { workspace = true }
 
 [dev-dependencies]
 arrow = { workspace = true, features = ["test_utils"] }
@@ -76,3 +82,10 @@ name = "is_null"
 [[bench]]
 harness = false
 name = "binary_op"
+
+[[bench]]
+harness = false
+name = "simplify"
+
+[package.metadata.cargo-machete]
+ignored = ["half"]
diff --git a/datafusion/physical-expr/README.md b/datafusion/physical-expr/README.md
index 424256c77e7e2..4c79223b09b8c 100644
--- a/datafusion/physical-expr/README.md
+++ b/datafusion/physical-expr/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Physical Expressions
+# Apache DataFusion Physical Expressions
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that provides data types and utilities for physical expressions.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/physical-expr/benches/binary_op.rs b/datafusion/physical-expr/benches/binary_op.rs
index 5b0f700fdb8aa..99fc40fa1c91b 100644
--- a/datafusion/physical-expr/benches/binary_op.rs
+++ b/datafusion/physical-expr/benches/binary_op.rs
@@ -20,13 +20,14 @@ use arrow::{
     datatypes::{DataType, Field, Schema},
 };
 use arrow::{array::StringArray, record_batch::RecordBatch};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use datafusion_expr::{and, binary_expr, col, lit, or, Operator};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_expr::{Operator, and, binary_expr, col, lit, or};
 use datafusion_physical_expr::{
+    PhysicalExpr,
     expressions::{BinaryExpr, Column},
     planner::logical2physical,
-    PhysicalExpr,
 };
+use std::hint::black_box;
 use std::sync::Arc;
 
 /// Generates BooleanArrays with different true/false distributions for benchmarking.
@@ -285,6 +286,7 @@ fn generate_test_strings(num_rows: usize) -> (Vec<String>, Vec<String>) {
 /// Creates record batches with boolean arrays that test different short-circuit scenarios.
 /// When TEST_ALL_FALSE = true: creates data for AND operator benchmarks (needs early false exit)
 /// When TEST_ALL_FALSE = false: creates data for OR operator benchmarks (needs early true exit)
+#[expect(clippy::needless_pass_by_value)]
 fn create_record_batch<const TEST_ALL_FALSE: bool>(
     schema: Arc<Schema>,
     b_values: &[String],
diff --git a/datafusion/physical-expr/benches/case_when.rs b/datafusion/physical-expr/benches/case_when.rs
index 5a88604716d21..33931a2ba98e4 100644
--- a/datafusion/physical-expr/benches/case_when.rs
+++ b/datafusion/physical-expr/benches/case_when.rs
@@ -15,110 +15,587 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::builder::{Int32Builder, StringBuilder};
-use arrow::datatypes::{DataType, Field, Schema};
+use arrow::array::{Array, ArrayRef, Int32Array, Int32Builder, StringArray};
+use arrow::datatypes::{ArrowNativeTypeOp, Field, Schema};
 use arrow::record_batch::RecordBatch;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::util::test_util::seedable_rng;
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
 use datafusion_expr::Operator;
-use datafusion_physical_expr::expressions::{BinaryExpr, CaseExpr, Column, Literal};
+use datafusion_physical_expr::expressions::{BinaryExpr, case, col, lit};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use itertools::Itertools;
+use rand::distr::Alphanumeric;
+use rand::distr::uniform::SampleUniform;
+use rand::rngs::StdRng;
+use rand::{Rng, RngCore};
+use std::fmt::{Display, Formatter};
+use std::hint::black_box;
+use std::ops::Range;
 use std::sync::Arc;
 
-fn make_col(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
-    Arc::new(Column::new(name, index))
+fn make_x_cmp_y(
+    x: &Arc<dyn PhysicalExpr>,
+    op: Operator,
+    y: i32,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(Arc::clone(x), op, lit(y)))
 }
 
-fn make_lit_i32(n: i32) -> Arc<dyn PhysicalExpr> {
-    Arc::new(Literal::new(ScalarValue::Int32(Some(n))))
-}
+/// Create a record batch with the given number of rows and columns.
+/// Columns are named `c<i>` where `i` is the column index.
+///
+/// The minimum value for `column_count` is `3`.
+/// `c1` contains incrementing int32 values
+/// `c2` contains int32 values in blocks of 1000 that increment by 1000
+/// `c3` contains int32 values with one null inserted every 9 rows
+/// `c4` to `cn`, is present, contain unspecified int32 values
+fn make_batch(row_count: usize, column_count: usize) -> RecordBatch {
+    assert!(column_count >= 3);
+
+    let mut c2 = Int32Builder::new();
+    let mut c3 = Int32Builder::new();
+    for i in 0..row_count {
+        c2.append_value(i as i32 / 1000 * 1000);
 
-fn criterion_benchmark(c: &mut Criterion) {
-    // create input data
-    let mut c1 = Int32Builder::new();
-    let mut c2 = StringBuilder::new();
-    let mut c3 = StringBuilder::new();
-    for i in 0..1000 {
-        c1.append_value(i);
-        if i % 7 == 0 {
-            c2.append_null();
-        } else {
-            c2.append_value(format!("string {i}"));
-        }
         if i % 9 == 0 {
             c3.append_null();
         } else {
-            c3.append_value(format!("other string {i}"));
+            c3.append_value(i as i32);
         }
     }
-    let c1 = Arc::new(c1.finish());
+    let c1 = Arc::new(Int32Array::from_iter_values(0..row_count as i32));
     let c2 = Arc::new(c2.finish());
     let c3 = Arc::new(c3.finish());
-    let schema = Schema::new(vec![
-        Field::new("c1", DataType::Int32, true),
-        Field::new("c2", DataType::Utf8, true),
-        Field::new("c3", DataType::Utf8, true),
-    ]);
-    let batch = RecordBatch::try_new(Arc::new(schema), vec![c1, c2, c3]).unwrap();
-
-    // use same predicate for all benchmarks
-    let predicate = Arc::new(BinaryExpr::new(
-        make_col("c1", 0),
-        Operator::LtEq,
-        make_lit_i32(500),
-    ));
-
-    // CASE WHEN c1 <= 500 THEN 1 ELSE 0 END
-    c.bench_function("case_when: scalar or scalar", |b| {
+    let mut columns: Vec<ArrayRef> = vec![c1, c2, c3];
+    for _ in 3..column_count {
+        columns.push(Arc::new(Int32Array::from_iter_values(0..row_count as i32)));
+    }
+
+    let fields = columns
+        .iter()
+        .enumerate()
+        .map(|(i, c)| {
+            Field::new(
+                format!("c{}", i + 1),
+                c.data_type().clone(),
+                c.is_nullable(),
+            )
+        })
+        .collect::<Vec<_>>();
+
+    let schema = Arc::new(Schema::new(fields));
+    RecordBatch::try_new(Arc::clone(&schema), columns).unwrap()
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    run_benchmarks(c, &make_batch(8192, 3));
+    run_benchmarks(c, &make_batch(8192, 50));
+    run_benchmarks(c, &make_batch(8192, 100));
+
+    benchmark_lookup_table_case_when(c, 8192);
+    benchmark_divide_by_zero_protection(c, 8192);
+}
+
+fn run_benchmarks(c: &mut Criterion, batch: &RecordBatch) {
+    let c1 = col("c1", &batch.schema()).unwrap();
+    let c2 = col("c2", &batch.schema()).unwrap();
+    let c3 = col("c3", &batch.schema()).unwrap();
+
+    // No expression, when/then/else, literal values
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN 1 ELSE 0 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), lit(1))],
+                    Some(lit(0)),
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // No expression, when/then/else, column reference values
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN c2 ELSE c3 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), Arc::clone(&c2))],
+                    Some(Arc::clone(&c3)),
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // No expression, when/then, implicit else
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN c2 [ELSE NULL] END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), Arc::clone(&c2))],
+                    None,
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // With expression, two when/then branches
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE c1 WHEN 1 THEN c2 WHEN 2 THEN c3 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    Some(Arc::clone(&c1)),
+                    vec![(lit(1), Arc::clone(&c2)), (lit(2), Arc::clone(&c3))],
+                    None,
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // Many when/then branches where all are effectively reachable
+    c.bench_function(format!("case_when {}x{}: CASE WHEN c1 == 0 THEN 0 WHEN c1 == 1 THEN 1 ... WHEN c1 == n THEN n ELSE n + 1 END", batch.num_rows(), batch.num_columns()).as_str(), |b| {
+        let when_thens = (0..batch.num_rows() as i32).map(|i| (make_x_cmp_y(&c1, Operator::Eq, i), lit(i))).collect();
         let expr = Arc::new(
-            CaseExpr::try_new(
+            case(
                 None,
-                vec![(predicate.clone(), make_lit_i32(1))],
-                Some(make_lit_i32(0)),
+                when_thens,
+                Some(lit(batch.num_rows() as i32))
             )
-            .unwrap(),
+                .unwrap(),
         );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+        b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
     });
 
-    // CASE WHEN c1 <= 500 THEN c2 [ELSE NULL] END
-    c.bench_function("case_when: column or null", |b| {
+    // Many when/then branches where all but the first few are effectively unreachable
+    c.bench_function(format!("case_when {}x{}: CASE WHEN c1 < 0 THEN 0 WHEN c1 < 1000 THEN 1 ... WHEN c1 < n * 1000 THEN n ELSE n + 1 END", batch.num_rows(), batch.num_columns()).as_str(), |b| {
+        let when_thens = (0..batch.num_rows() as i32).map(|i| (make_x_cmp_y(&c1, Operator::Lt, i * 1000), lit(i))).collect();
         let expr = Arc::new(
-            CaseExpr::try_new(None, vec![(predicate.clone(), make_col("c2", 1))], None)
+            case(
+                None,
+                when_thens,
+                Some(lit(batch.num_rows() as i32))
+            )
                 .unwrap(),
         );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+        b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
     });
 
-    // CASE WHEN c1 <= 500 THEN c2 ELSE c3 END
-    c.bench_function("case_when: expr or expr", |b| {
+    // Many when/then branches where all are effectively reachable
+    c.bench_function(format!("case_when {}x{}: CASE c1 WHEN 0 THEN 0 WHEN 1 THEN 1 ... WHEN n THEN n ELSE n + 1 END", batch.num_rows(), batch.num_columns()).as_str(), |b| {
+        let when_thens = (0..batch.num_rows() as i32).map(|i| (lit(i), lit(i))).collect();
         let expr = Arc::new(
-            CaseExpr::try_new(
-                None,
-                vec![(predicate.clone(), make_col("c2", 1))],
-                Some(make_col("c3", 2)),
+            case(
+                Some(Arc::clone(&c1)),
+                when_thens,
+                Some(lit(batch.num_rows() as i32))
             )
-            .unwrap(),
+                .unwrap(),
         );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+        b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
     });
 
-    // CASE c1 WHEN 1 THEN c2 WHEN 2 THEN c3 END
-    c.bench_function("case_when: CASE expr", |b| {
+    // Many when/then branches where all but the first few are effectively unreachable
+    c.bench_function(format!("case_when {}x{}: CASE c2 WHEN 0 THEN 0 WHEN 1000 THEN 1 ... WHEN n * 1000 THEN n ELSE n + 1 END", batch.num_rows(), batch.num_columns()).as_str(), |b| {
+        let when_thens = (0..batch.num_rows() as i32).map(|i| (lit(i * 1000), lit(i))).collect();
         let expr = Arc::new(
-            CaseExpr::try_new(
-                Some(make_col("c1", 0)),
-                vec![
-                    (make_lit_i32(1), make_col("c2", 1)),
-                    (make_lit_i32(2), make_col("c3", 2)),
-                ],
-                None,
+            case(
+                Some(Arc::clone(&c2)),
+                when_thens,
+                Some(lit(batch.num_rows() as i32))
             )
-            .unwrap(),
+                .unwrap(),
         );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+        b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
     });
 }
 
+struct Options<T> {
+    number_of_rows: usize,
+    range_of_values: Vec<T>,
+    in_range_probability: f32,
+    null_probability: f32,
+}
+
+fn generate_other_primitive_value<T: ArrowNativeTypeOp + SampleUniform>(
+    rng: &mut impl RngCore,
+    exclude: &[T],
+) -> T {
+    let mut value;
+    let retry_limit = 100;
+    for _ in 0..retry_limit {
+        value = rng.random_range(T::MIN_TOTAL_ORDER..=T::MAX_TOTAL_ORDER);
+        if !exclude.contains(&value) {
+            return value;
+        }
+    }
+
+    panic!("Could not generate out of range value after {retry_limit} attempts");
+}
+
+fn create_random_string_generator(
+    length: Range<usize>,
+) -> impl Fn(&mut dyn RngCore, &[String]) -> String {
+    assert!(length.end > length.start);
+
+    move |rng, exclude| {
+        let retry_limit = 100;
+        for _ in 0..retry_limit {
+            let length = rng.random_range(length.clone());
+            let value: String = rng
+                .sample_iter(Alphanumeric)
+                .take(length)
+                .map(char::from)
+                .collect();
+
+            if !exclude.contains(&value) {
+                return value;
+            }
+        }
+
+        panic!("Could not generate out of range value after {retry_limit} attempts");
+    }
+}
+
+/// Create column with the provided number of rows
+/// `in_range_percentage` is the percentage of values that should be inside the specified range
+/// `null_percentage` is the percentage of null values
+/// The rest of the values will be outside the specified range
+fn generate_values_for_lookup<T, A>(
+    options: &Options<T>,
+    generate_other_value: impl Fn(&mut StdRng, &[T]) -> T,
+) -> A
+where
+    T: Clone,
+    A: FromIterator<Option<T>>,
+{
+    // Create a value with specified range most of the time, but also some nulls and the rest is generic
+
+    assert!(
+        options.in_range_probability + options.null_probability <= 1.0,
+        "Percentages must sum to 1.0 or less"
+    );
+
+    let rng = &mut seedable_rng();
+
+    let in_range_probability = 0.0..options.in_range_probability;
+    let null_range_probability =
+        in_range_probability.start..in_range_probability.start + options.null_probability;
+    let out_range_probability = null_range_probability.end..1.0;
+
+    (0..options.number_of_rows)
+        .map(|_| {
+            let roll: f32 = rng.random();
+
+            match roll {
+                v if out_range_probability.contains(&v) => {
+                    let index = rng.random_range(0..options.range_of_values.len());
+                    // Generate value in range
+                    Some(options.range_of_values[index].clone())
+                }
+                v if null_range_probability.contains(&v) => None,
+                _ => {
+                    // Generate value out of range
+                    Some(generate_other_value(rng, &options.range_of_values))
+                }
+            }
+        })
+        .collect::<A>()
+}
+
+fn benchmark_lookup_table_case_when(c: &mut Criterion, batch_size: usize) {
+    #[derive(Clone, Copy, Debug)]
+    struct CaseWhenLookupInput {
+        batch_size: usize,
+
+        in_range_probability: f32,
+        null_probability: f32,
+    }
+
+    impl Display for CaseWhenLookupInput {
+        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+            write!(
+                f,
+                "case_when {} rows: in_range: {}, nulls: {}",
+                self.batch_size, self.in_range_probability, self.null_probability,
+            )
+        }
+    }
+
+    let mut case_when_lookup = c.benchmark_group("lookup_table_case_when");
+
+    for in_range_probability in [0.1, 0.5, 0.9, 1.0] {
+        for null_probability in [0.0, 0.1, 0.5] {
+            if in_range_probability + null_probability > 1.0 {
+                continue;
+            }
+
+            let input = CaseWhenLookupInput {
+                batch_size,
+                in_range_probability,
+                null_probability,
+            };
+
+            let when_thens_primitive_to_string = vec![
+                (1, "something"),
+                (2, "very"),
+                (3, "interesting"),
+                (4, "is"),
+                (5, "going"),
+                (6, "to"),
+                (7, "happen"),
+                (30, "in"),
+                (31, "datafusion"),
+                (90, "when"),
+                (91, "you"),
+                (92, "find"),
+                (93, "it"),
+                (120, "let"),
+                (240, "me"),
+                (241, "know"),
+                (244, "please"),
+                (246, "thank"),
+                (250, "you"),
+                (252, "!"),
+            ];
+            let when_thens_string_to_primitive = when_thens_primitive_to_string
+                .iter()
+                .map(|&(key, value)| (value, key))
+                .collect_vec();
+
+            for num_entries in [5, 10, 20] {
+                for (name, values_range) in [
+                    ("all equally true", 0..num_entries),
+                    // Test when early termination is beneficial
+                    ("only first 2 are true", 0..2),
+                ] {
+                    let when_thens_primitive_to_string =
+                        when_thens_primitive_to_string[values_range.clone()].to_vec();
+
+                    let when_thens_string_to_primitive =
+                        when_thens_string_to_primitive[values_range].to_vec();
+
+                    case_when_lookup.bench_with_input(
+                        BenchmarkId::new(
+                            format!(
+                                "case when i32 -> utf8, {num_entries} entries, {name}"
+                            ),
+                            input,
+                        ),
+                        &input,
+                        |b, input| {
+                            let array: Int32Array = generate_values_for_lookup(
+                                &Options::<i32> {
+                                    number_of_rows: batch_size,
+                                    range_of_values: when_thens_primitive_to_string
+                                        .iter()
+                                        .map(|(key, _)| *key)
+                                        .collect(),
+                                    in_range_probability: input.in_range_probability,
+                                    null_probability: input.null_probability,
+                                },
+                                |rng, exclude| {
+                                    generate_other_primitive_value::<i32>(rng, exclude)
+                                },
+                            );
+                            let batch = RecordBatch::try_new(
+                                Arc::new(Schema::new(vec![Field::new(
+                                    "col1",
+                                    array.data_type().clone(),
+                                    true,
+                                )])),
+                                vec![Arc::new(array)],
+                            )
+                            .unwrap();
+
+                            let when_thens = when_thens_primitive_to_string
+                                .iter()
+                                .map(|&(key, value)| (lit(key), lit(value)))
+                                .collect();
+
+                            let expr = Arc::new(
+                                case(
+                                    Some(col("col1", batch.schema_ref()).unwrap()),
+                                    when_thens,
+                                    Some(lit("whatever")),
+                                )
+                                .unwrap(),
+                            );
+
+                            b.iter(|| {
+                                black_box(expr.evaluate(black_box(&batch)).unwrap())
+                            })
+                        },
+                    );
+
+                    case_when_lookup.bench_with_input(
+                        BenchmarkId::new(
+                            format!(
+                                "case when utf8 -> i32, {num_entries} entries, {name}"
+                            ),
+                            input,
+                        ),
+                        &input,
+                        |b, input| {
+                            let array: StringArray = generate_values_for_lookup(
+                                &Options::<String> {
+                                    number_of_rows: batch_size,
+                                    range_of_values: when_thens_string_to_primitive
+                                        .iter()
+                                        .map(|(key, _)| (*key).to_string())
+                                        .collect(),
+                                    in_range_probability: input.in_range_probability,
+                                    null_probability: input.null_probability,
+                                },
+                                |rng, exclude| {
+                                    create_random_string_generator(3..10)(rng, exclude)
+                                },
+                            );
+                            let batch = RecordBatch::try_new(
+                                Arc::new(Schema::new(vec![Field::new(
+                                    "col1",
+                                    array.data_type().clone(),
+                                    true,
+                                )])),
+                                vec![Arc::new(array)],
+                            )
+                            .unwrap();
+
+                            let when_thens = when_thens_string_to_primitive
+                                .iter()
+                                .map(|&(key, value)| (lit(key), lit(value)))
+                                .collect();
+
+                            let expr = Arc::new(
+                                case(
+                                    Some(col("col1", batch.schema_ref()).unwrap()),
+                                    when_thens,
+                                    Some(lit(1000)),
+                                )
+                                .unwrap(),
+                            );
+
+                            b.iter(|| {
+                                black_box(expr.evaluate(black_box(&batch)).unwrap())
+                            })
+                        },
+                    );
+                }
+            }
+        }
+    }
+}
+
+fn benchmark_divide_by_zero_protection(c: &mut Criterion, batch_size: usize) {
+    let mut group = c.benchmark_group("divide_by_zero_protection");
+
+    for zero_percentage in [0.0, 0.1, 0.5, 0.9] {
+        let rng = &mut seedable_rng();
+
+        let numerator: Int32Array =
+            (0..batch_size).map(|_| Some(rng.random::<i32>())).collect();
+
+        let divisor_values: Vec<Option<i32>> = (0..batch_size)
+            .map(|_| {
+                let roll: f32 = rng.random();
+                if roll < zero_percentage {
+                    Some(0)
+                } else {
+                    let mut val = rng.random::<i32>();
+                    while val == 0 {
+                        val = rng.random::<i32>();
+                    }
+                    Some(val)
+                }
+            })
+            .collect();
+
+        let divisor: Int32Array = divisor_values.iter().cloned().collect();
+        let divisor_copy: Int32Array = divisor_values.iter().cloned().collect();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("numerator", numerator.data_type().clone(), true),
+            Field::new("divisor", divisor.data_type().clone(), true),
+            Field::new("divisor_copy", divisor_copy.data_type().clone(), true),
+        ]));
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(numerator),
+                Arc::new(divisor),
+                Arc::new(divisor_copy),
+            ],
+        )
+        .unwrap();
+
+        let numerator_col = col("numerator", &batch.schema()).unwrap();
+        let divisor_col = col("divisor", &batch.schema()).unwrap();
+
+        // DivideByZeroProtection: WHEN condition checks `divisor_col > 0` and division
+        // uses `divisor_col` as divisor. Since the checked column matches the divisor,
+        // this triggers the DivideByZeroProtection optimization.
+        group.bench_function(
+            format!(
+                "{} rows, {}% zeros: DivideByZeroProtection",
+                batch_size,
+                (zero_percentage * 100.0) as i32
+            ),
+            |b| {
+                let when = Arc::new(BinaryExpr::new(
+                    Arc::clone(&divisor_col),
+                    Operator::NotEq,
+                    lit(0i32),
+                ));
+                let then = Arc::new(BinaryExpr::new(
+                    Arc::clone(&numerator_col),
+                    Operator::Divide,
+                    Arc::clone(&divisor_col),
+                ));
+                let else_null: Arc<dyn PhysicalExpr> = lit(ScalarValue::Int32(None));
+                let expr =
+                    Arc::new(case(None, vec![(when, then)], Some(else_null)).unwrap());
+
+                b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+            },
+        );
+    }
+
+    group.finish();
+}
+
 criterion_group!(benches, criterion_benchmark);
 criterion_main!(benches);
diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs
index e91e8d1f137c1..021d8259cdfdf 100644
--- a/datafusion/physical-expr/benches/in_list.rs
+++ b/datafusion/physical-expr/benches/in_list.rs
@@ -15,16 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray};
+use arrow::array::{
+    Array, ArrayRef, Float32Array, Int16Array, Int32Array, StringArray, StringViewArray,
+    TimestampNanosecondArray, UInt8Array,
+};
 use arrow::datatypes::{Field, Schema};
 use arrow::record_batch::RecordBatch;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_expr::expressions::{col, in_list, lit};
 use rand::distr::Alphanumeric;
 use rand::prelude::*;
+use std::any::TypeId;
+use std::hint::black_box;
 use std::sync::Arc;
+use std::time::Duration;
 
+/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch.
 fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) {
     let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]);
     let exprs = exprs.iter().map(|s| lit(s.clone())).collect();
@@ -36,78 +44,401 @@ fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValu
     });
 }
 
+/// Generates a random alphanumeric string of the specified length.
 fn random_string(rng: &mut StdRng, len: usize) -> String {
     let value = rng.sample_iter(&Alphanumeric).take(len).collect();
     String::from_utf8(value).unwrap()
 }
 
-fn do_benches(
+const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100];
+const LIST_WITH_COLUMNS_LENGTHS: [usize; 3] = [3, 8, 28];
+const NULL_PERCENTS: [f64; 2] = [0., 0.2];
+const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0];
+const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
+const ARRAY_LENGTH: usize = 8192;
+
+/// Mixed string lengths for realistic benchmarks.
+/// ~50% short (≤12 bytes), ~50% long (>12 bytes).
+const MIXED_STRING_LENGTHS: &[usize] = &[3, 6, 9, 12, 16, 20, 25, 30];
+
+/// Returns a friendly type name for the array type.
+fn array_type_name<A: 'static>() -> &'static str {
+    let id = TypeId::of::<A>();
+    if id == TypeId::of::<StringArray>() {
+        "Utf8"
+    } else if id == TypeId::of::<StringViewArray>() {
+        "Utf8View"
+    } else if id == TypeId::of::<Float32Array>() {
+        "Float32"
+    } else if id == TypeId::of::<Int16Array>() {
+        "Int16"
+    } else if id == TypeId::of::<Int32Array>() {
+        "Int32"
+    } else if id == TypeId::of::<TimestampNanosecondArray>() {
+        "TimestampNs"
+    } else if id == TypeId::of::<UInt8Array>() {
+        "UInt8"
+    } else {
+        "Unknown"
+    }
+}
+
+/// Builds a benchmark name from array type, list size, and null percentage.
+fn bench_name<A: 'static>(in_list_length: usize, null_percent: f64) -> String {
+    format!(
+        "in_list/{}/list={in_list_length}/nulls={}%",
+        array_type_name::<A>(),
+        (null_percent * 100.0) as u32
+    )
+}
+
+/// Runs in_list benchmarks for a string array type across all list-size × null-ratio × string-length combinations.
+fn bench_string_type<A>(
     c: &mut Criterion,
-    array_length: usize,
-    in_list_length: usize,
-    null_percent: f64,
-) {
-    let mut rng = StdRng::seed_from_u64(120320);
-    for string_length in [5, 10, 20] {
-        let values: StringArray = (0..array_length)
-            .map(|_| {
-                rng.random_bool(null_percent)
-                    .then(|| random_string(&mut rng, string_length))
-            })
-            .collect();
-
-        let in_list: Vec<_> = (0..in_list_length)
-            .map(|_| ScalarValue::from(random_string(&mut rng, string_length)))
-            .collect();
-
-        do_bench(
-            c,
-            &format!(
-                "in_list_utf8({string_length}) ({array_length}, {null_percent}) IN ({in_list_length}, 0)"
-            ),
-            Arc::new(values),
-            &in_list,
-        )
+    rng: &mut StdRng,
+    make_scalar: fn(String) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<String>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for null_percent in NULL_PERCENTS {
+            for string_length in STRING_LENGTHS {
+                let values: A = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        rng.random_bool(1.0 - null_percent)
+                            .then(|| random_string(rng, string_length))
+                    })
+                    .collect();
+
+                let in_list: Vec<_> = (0..in_list_length)
+                    .map(|_| make_scalar(random_string(rng, string_length)))
+                    .collect();
+
+                do_bench(
+                    c,
+                    &format!(
+                        "{}/str={string_length}",
+                        bench_name::<A>(in_list_length, null_percent)
+                    ),
+                    Arc::new(values),
+                    &in_list,
+                )
+            }
+        }
     }
+}
 
-    let values: Float32Array = (0..array_length)
-        .map(|_| rng.random_bool(null_percent).then(|| rng.random()))
-        .collect();
+/// Runs in_list benchmarks for a numeric array type across all list-size × null-ratio combinations.
+fn bench_numeric_type<T, A>(
+    c: &mut Criterion,
+    rng: &mut StdRng,
+    mut gen_value: impl FnMut(&mut StdRng) -> T,
+    make_scalar: fn(T) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<T>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for null_percent in NULL_PERCENTS {
+            let values: A = (0..ARRAY_LENGTH)
+                .map(|_| rng.random_bool(1.0 - null_percent).then(|| gen_value(rng)))
+                .collect();
 
-    let in_list: Vec<_> = (0..in_list_length)
-        .map(|_| ScalarValue::Float32(Some(rng.random())))
-        .collect();
+            let in_list: Vec<_> = (0..in_list_length)
+                .map(|_| make_scalar(gen_value(rng)))
+                .collect();
 
-    do_bench(
-        c,
-        &format!("in_list_f32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
-        Arc::new(values),
-        &in_list,
-    );
+            do_bench(
+                c,
+                &bench_name::<A>(in_list_length, null_percent),
+                Arc::new(values),
+                &in_list,
+            );
+        }
+    }
+}
+
+/// Generates a random string with a length chosen from MIXED_STRING_LENGTHS.
+fn random_mixed_length_string(rng: &mut StdRng) -> String {
+    let len = *MIXED_STRING_LENGTHS.choose(rng).unwrap();
+    random_string(rng, len)
+}
+
+/// Benchmarks realistic mixed-length IN list scenario.
+///
+/// Tests with:
+/// - Mixed short (≤12 bytes) and long (>12 bytes) strings in the IN list
+/// - Varying prefixes (fully random strings)
+/// - Configurable match rate (% of values that are in the IN list)
+/// - Various IN list sizes (3, 8, 28, 100)
+fn bench_realistic_mixed_strings<A>(
+    c: &mut Criterion,
+    rng: &mut StdRng,
+    make_scalar: fn(String) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<String>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for match_percent in [0.0, 0.25, 0.75] {
+            for null_percent in NULL_PERCENTS {
+                // Generate IN list with mixed-length random strings
+                let in_list_strings: Vec<String> = (0..in_list_length)
+                    .map(|_| random_mixed_length_string(rng))
+                    .collect();
+
+                let in_list: Vec<_> = in_list_strings
+                    .iter()
+                    .map(|s| make_scalar(s.clone()))
+                    .collect();
+
+                // Generate values array with controlled match rate
+                let values: A = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        if !rng.random_bool(1.0 - null_percent) {
+                            None
+                        } else if rng.random_bool(match_percent) {
+                            // Pick from IN list (will match)
+                            Some(in_list_strings.choose(rng).unwrap().clone())
+                        } else {
+                            // Generate new random string (unlikely to match)
+                            Some(random_mixed_length_string(rng))
+                        }
+                    })
+                    .collect();
 
-    let values: Int32Array = (0..array_length)
-        .map(|_| rng.random_bool(null_percent).then(|| rng.random()))
+                do_bench(
+                    c,
+                    &format!(
+                        "in_list/{}/mixed/list={}/match={}%/nulls={}%",
+                        array_type_name::<A>(),
+                        in_list_length,
+                        (match_percent * 100.0) as u32,
+                        (null_percent * 100.0) as u32
+                    ),
+                    Arc::new(values),
+                    &in_list,
+                );
+            }
+        }
+    }
+}
+
+/// Benchmarks the column-reference evaluation path (no static filter) by including
+/// a column reference in the IN list, which prevents static filter creation.
+///
+/// This simulates SQL like:
+/// ```sql
+/// CREATE TABLE t (a INT, b0 INT, b1 INT, b2 INT);
+/// SELECT * FROM t WHERE a IN (b0, b1, b2);
+/// ```
+///
+/// - `values`: the "needle" column (`a`)
+/// - `list_cols`: the "haystack" columns (`b0`, `b1`, …)
+fn do_bench_with_columns(
+    c: &mut Criterion,
+    name: &str,
+    values: ArrayRef,
+    list_cols: &[ArrayRef],
+) {
+    let mut fields = vec![Field::new("a", values.data_type().clone(), true)];
+    let mut columns: Vec<ArrayRef> = vec![values];
+
+    // Build list expressions: column refs (forces non-constant evaluation path)
+    let schema_fields: Vec<Field> = list_cols
+        .iter()
+        .enumerate()
+        .map(|(i, col_arr)| {
+            let name = format!("b{i}");
+            fields.push(Field::new(&name, col_arr.data_type().clone(), true));
+            columns.push(Arc::clone(col_arr));
+            Field::new(&name, col_arr.data_type().clone(), true)
+        })
         .collect();
 
-    let in_list: Vec<_> = (0..in_list_length)
-        .map(|_| ScalarValue::Int32(Some(rng.random())))
+    let schema = Schema::new(fields);
+    let list_exprs: Vec<Arc<dyn PhysicalExpr>> = schema_fields
+        .iter()
+        .map(|f| col(f.name(), &schema).unwrap())
         .collect();
 
-    do_bench(
-        c,
-        &format!("in_list_i32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
-        Arc::new(values),
-        &in_list,
-    )
+    let expr = in_list(col("a", &schema).unwrap(), list_exprs, &false, &schema).unwrap();
+    let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap();
+
+    c.bench_function(name, |b| {
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
-    for in_list_length in [1, 3, 10, 100] {
-        for null_percent in [0., 0.2] {
-            do_benches(c, 1024, in_list_length, null_percent)
+/// Benchmarks the IN list path with column references for Int32 arrays.
+///
+/// Equivalent SQL:
+/// ```sql
+/// CREATE TABLE t (a INT, b0 INT, b1 INT, ...);
+/// SELECT * FROM t WHERE a IN (b0, b1, ...);
+/// ```
+fn bench_with_columns_int32(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(42);
+
+    for list_size in LIST_WITH_COLUMNS_LENGTHS {
+        for match_percent in MATCH_PERCENTS {
+            for null_percent in NULL_PERCENTS {
+                // Generate the "needle" column
+                let values: Int32Array = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        rng.random_bool(1.0 - null_percent)
+                            .then(|| rng.random_range(0..1000))
+                    })
+                    .collect();
+
+                // Generate list columns with controlled match rate
+                let list_cols: Vec<ArrayRef> = (0..list_size)
+                    .map(|_| {
+                        let col: Int32Array = (0..ARRAY_LENGTH)
+                            .map(|row| {
+                                if rng.random_bool(1.0 - null_percent) {
+                                    if rng.random_bool(match_percent) {
+                                        // Copy from values to create a match
+                                        if values.is_null(row) {
+                                            Some(rng.random_range(0..1000))
+                                        } else {
+                                            Some(values.value(row))
+                                        }
+                                    } else {
+                                        // Random value (unlikely to match)
+                                        Some(rng.random_range(1000..2000))
+                                    }
+                                } else {
+                                    None
+                                }
+                            })
+                            .collect();
+                        Arc::new(col) as ArrayRef
+                    })
+                    .collect();
+
+                do_bench_with_columns(
+                    c,
+                    &format!(
+                        "in_list_cols/Int32/list={}/match={}%/nulls={}%",
+                        list_size,
+                        (match_percent * 100.0) as u32,
+                        (null_percent * 100.0) as u32
+                    ),
+                    Arc::new(values),
+                    &list_cols,
+                );
+            }
+        }
+    }
+}
+
+/// Benchmarks the IN list path with column references for Utf8 arrays.
+///
+/// Equivalent SQL:
+/// ```sql
+/// CREATE TABLE t (a VARCHAR, b0 VARCHAR, b1 VARCHAR, ...);
+/// SELECT * FROM t WHERE a IN (b0, b1, ...);
+/// ```
+fn bench_with_columns_utf8(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(99);
+
+    for list_size in LIST_WITH_COLUMNS_LENGTHS {
+        for match_percent in MATCH_PERCENTS {
+            // Generate the "needle" column
+            let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH)
+                .map(|_| rng.random_bool(0.8).then(|| random_string(&mut rng, 12)))
+                .collect();
+            let values: StringArray =
+                value_strings.iter().map(|s| s.as_deref()).collect();
+
+            // Generate list columns with controlled match rate
+            let list_cols: Vec<ArrayRef> = (0..list_size)
+                .map(|_| {
+                    let col: StringArray = (0..ARRAY_LENGTH)
+                        .map(|row| {
+                            if rng.random_bool(match_percent) {
+                                // Copy from values to create a match
+                                value_strings[row].as_deref()
+                            } else {
+                                Some("no_match_value_xyz")
+                            }
+                        })
+                        .collect();
+                    Arc::new(col) as ArrayRef
+                })
+                .collect();
+
+            do_bench_with_columns(
+                c,
+                &format!(
+                    "in_list_cols/Utf8/list={}/match={}%",
+                    list_size,
+                    (match_percent * 100.0) as u32,
+                ),
+                Arc::new(values),
+                &list_cols,
+            );
         }
     }
 }
 
-criterion_group!(benches, criterion_benchmark);
+/// Entry point: registers in_list benchmarks for string and numeric array types.
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut rng = StdRng::seed_from_u64(120320);
+
+    // Benchmarks for string array types (Utf8, Utf8View)
+    bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
+    bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
+
+    // Realistic mixed-length string benchmarks (TPC-H style)
+    bench_realistic_mixed_strings::<StringArray>(c, &mut rng, |s| {
+        ScalarValue::Utf8(Some(s))
+    });
+    bench_realistic_mixed_strings::<StringViewArray>(c, &mut rng, |s| {
+        ScalarValue::Utf8View(Some(s))
+    });
+
+    // Benchmarks for numeric types
+    bench_numeric_type::<u8, UInt8Array>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::UInt8(Some(v)),
+    );
+    bench_numeric_type::<i16, Int16Array>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::Int16(Some(v)),
+    );
+    bench_numeric_type::<f32, Float32Array>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::Float32(Some(v)),
+    );
+    bench_numeric_type::<i32, Int32Array>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::Int32(Some(v)),
+    );
+    bench_numeric_type::<i64, TimestampNanosecondArray>(
+        c,
+        &mut rng,
+        |rng| rng.random(),
+        |v| ScalarValue::TimestampNanosecond(Some(v), None),
+    );
+
+    // Column-reference path benchmarks (non-constant list expressions)
+    bench_with_columns_int32(c);
+    bench_with_columns_utf8(c);
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default()
+        .warm_up_time(Duration::from_millis(100))
+        .measurement_time(Duration::from_millis(500));
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
diff --git a/datafusion/physical-expr/benches/is_null.rs b/datafusion/physical-expr/benches/is_null.rs
index ce6ad6eac2c71..0637ade1b3eec 100644
--- a/datafusion/physical-expr/benches/is_null.rs
+++ b/datafusion/physical-expr/benches/is_null.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{builder::Int32Builder, RecordBatch};
+use arrow::array::{RecordBatch, builder::Int32Builder};
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_physical_expr::expressions::{Column, IsNotNullExpr, IsNullExpr};
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {
diff --git a/datafusion/physical-expr/benches/simplify.rs b/datafusion/physical-expr/benches/simplify.rs
new file mode 100644
index 0000000000000..cc00c710004e8
--- /dev/null
+++ b/datafusion/physical-expr/benches/simplify.rs
@@ -0,0 +1,299 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This is an attempt at reproducing some predicates generated by TPC-DS query #76,
+//! and trying to figure out how long it takes to simplify them.
+
+use arrow::datatypes::{DataType, Field, Schema};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::simplifier::PhysicalExprSimplifier;
+use std::hint::black_box;
+use std::sync::Arc;
+
+use datafusion_common::ScalarValue;
+use datafusion_expr::Operator;
+
+use datafusion_physical_expr::expressions::{
+    BinaryExpr, CaseExpr, Column, IsNullExpr, Literal,
+};
+
+fn catalog_sales_schema() -> Schema {
+    Schema::new(vec![
+        Field::new("cs_sold_date_sk", DataType::Int64, true), // 0
+        Field::new("cs_sold_time_sk", DataType::Int64, true), // 1
+        Field::new("cs_ship_date_sk", DataType::Int64, true), // 2
+        Field::new("cs_bill_customer_sk", DataType::Int64, true), // 3
+        Field::new("cs_bill_cdemo_sk", DataType::Int64, true), // 4
+        Field::new("cs_bill_hdemo_sk", DataType::Int64, true), // 5
+        Field::new("cs_bill_addr_sk", DataType::Int64, true), // 6
+        Field::new("cs_ship_customer_sk", DataType::Int64, true), // 7
+        Field::new("cs_ship_cdemo_sk", DataType::Int64, true), // 8
+        Field::new("cs_ship_hdemo_sk", DataType::Int64, true), // 9
+        Field::new("cs_ship_addr_sk", DataType::Int64, true), // 10
+        Field::new("cs_call_center_sk", DataType::Int64, true), // 11
+        Field::new("cs_catalog_page_sk", DataType::Int64, true), // 12
+        Field::new("cs_ship_mode_sk", DataType::Int64, true), // 13
+        Field::new("cs_warehouse_sk", DataType::Int64, true), // 14
+        Field::new("cs_item_sk", DataType::Int64, true),      // 15
+        Field::new("cs_promo_sk", DataType::Int64, true),     // 16
+        Field::new("cs_order_number", DataType::Int64, true), // 17
+        Field::new("cs_quantity", DataType::Int64, true),     // 18
+        Field::new("cs_wholesale_cost", DataType::Decimal128(7, 2), true),
+        Field::new("cs_list_price", DataType::Decimal128(7, 2), true),
+        Field::new("cs_sales_price", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_discount_amt", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_sales_price", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_wholesale_cost", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_list_price", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_tax", DataType::Decimal128(7, 2), true),
+        Field::new("cs_coupon_amt", DataType::Decimal128(7, 2), true),
+        Field::new("cs_ext_ship_cost", DataType::Decimal128(7, 2), true),
+        Field::new("cs_net_paid", DataType::Decimal128(7, 2), true),
+        Field::new("cs_net_paid_inc_tax", DataType::Decimal128(7, 2), true),
+        Field::new("cs_net_paid_inc_ship", DataType::Decimal128(7, 2), true),
+        Field::new("cs_net_paid_inc_ship_tax", DataType::Decimal128(7, 2), true),
+        Field::new("cs_net_profit", DataType::Decimal128(7, 2), true),
+    ])
+}
+
+fn web_sales_schema() -> Schema {
+    Schema::new(vec![
+        Field::new("ws_sold_date_sk", DataType::Int64, true),
+        Field::new("ws_sold_time_sk", DataType::Int64, true),
+        Field::new("ws_ship_date_sk", DataType::Int64, true),
+        Field::new("ws_item_sk", DataType::Int64, true),
+        Field::new("ws_bill_customer_sk", DataType::Int64, true),
+        Field::new("ws_bill_cdemo_sk", DataType::Int64, true),
+        Field::new("ws_bill_hdemo_sk", DataType::Int64, true),
+        Field::new("ws_bill_addr_sk", DataType::Int64, true),
+        Field::new("ws_ship_customer_sk", DataType::Int64, true),
+        Field::new("ws_ship_cdemo_sk", DataType::Int64, true),
+        Field::new("ws_ship_hdemo_sk", DataType::Int64, true),
+        Field::new("ws_ship_addr_sk", DataType::Int64, true),
+        Field::new("ws_web_page_sk", DataType::Int64, true),
+        Field::new("ws_web_site_sk", DataType::Int64, true),
+        Field::new("ws_ship_mode_sk", DataType::Int64, true),
+        Field::new("ws_warehouse_sk", DataType::Int64, true),
+        Field::new("ws_promo_sk", DataType::Int64, true),
+        Field::new("ws_order_number", DataType::Int64, true),
+        Field::new("ws_quantity", DataType::Int64, true),
+        Field::new("ws_wholesale_cost", DataType::Decimal128(7, 2), true),
+        Field::new("ws_list_price", DataType::Decimal128(7, 2), true),
+        Field::new("ws_sales_price", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_discount_amt", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_sales_price", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_wholesale_cost", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_list_price", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_tax", DataType::Decimal128(7, 2), true),
+        Field::new("ws_coupon_amt", DataType::Decimal128(7, 2), true),
+        Field::new("ws_ext_ship_cost", DataType::Decimal128(7, 2), true),
+        Field::new("ws_net_paid", DataType::Decimal128(7, 2), true),
+        Field::new("ws_net_paid_inc_tax", DataType::Decimal128(7, 2), true),
+        Field::new("ws_net_paid_inc_ship", DataType::Decimal128(7, 2), true),
+        Field::new("ws_net_paid_inc_ship_tax", DataType::Decimal128(7, 2), true),
+        Field::new("ws_net_profit", DataType::Decimal128(7, 2), true),
+    ])
+}
+
+// Helper to create a literal
+fn lit_i64(val: i64) -> Arc<dyn PhysicalExpr> {
+    Arc::new(Literal::new(ScalarValue::Int64(Some(val))))
+}
+
+fn lit_i32(val: i32) -> Arc<dyn PhysicalExpr> {
+    Arc::new(Literal::new(ScalarValue::Int32(Some(val))))
+}
+
+fn lit_bool(val: bool) -> Arc<dyn PhysicalExpr> {
+    Arc::new(Literal::new(ScalarValue::Boolean(Some(val))))
+}
+
+// Helper to create binary expressions
+fn and(
+    left: Arc<dyn PhysicalExpr>,
+    right: Arc<dyn PhysicalExpr>,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(left, Operator::And, right))
+}
+
+fn gte(
+    left: Arc<dyn PhysicalExpr>,
+    right: Arc<dyn PhysicalExpr>,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(left, Operator::GtEq, right))
+}
+
+fn lte(
+    left: Arc<dyn PhysicalExpr>,
+    right: Arc<dyn PhysicalExpr>,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(left, Operator::LtEq, right))
+}
+
+fn modulo(
+    left: Arc<dyn PhysicalExpr>,
+    right: Arc<dyn PhysicalExpr>,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(left, Operator::Modulo, right))
+}
+
+fn eq(
+    left: Arc<dyn PhysicalExpr>,
+    right: Arc<dyn PhysicalExpr>,
+) -> Arc<dyn PhysicalExpr> {
+    Arc::new(BinaryExpr::new(left, Operator::Eq, right))
+}
+
+/// Build a predicate similar to TPC-DS q76 catalog_sales filter.
+/// Uses placeholder columns instead of hash expressions.
+pub fn catalog_sales_predicate(num_partitions: usize) -> Arc<dyn PhysicalExpr> {
+    let cs_sold_date_sk: Arc<dyn PhysicalExpr> =
+        Arc::new(Column::new("cs_sold_date_sk", 0));
+    let cs_ship_addr_sk: Arc<dyn PhysicalExpr> =
+        Arc::new(Column::new("cs_ship_addr_sk", 10));
+    let cs_item_sk: Arc<dyn PhysicalExpr> = Arc::new(Column::new("cs_item_sk", 15));
+
+    // Use a simple modulo expression as placeholder for hash
+    let item_hash_mod = modulo(cs_item_sk.clone(), lit_i64(num_partitions as i64));
+    let date_hash_mod = modulo(cs_sold_date_sk.clone(), lit_i64(num_partitions as i64));
+
+    // cs_ship_addr_sk IS NULL
+    let is_null_expr: Arc<dyn PhysicalExpr> = Arc::new(IsNullExpr::new(cs_ship_addr_sk));
+
+    // Build item_sk CASE expression with num_partitions branches
+    let item_when_then: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> = (0
+        ..num_partitions)
+        .map(|partition| {
+            let when_expr = eq(item_hash_mod.clone(), lit_i32(partition as i32));
+            let then_expr = and(
+                gte(cs_item_sk.clone(), lit_i64(partition as i64)),
+                lte(cs_item_sk.clone(), lit_i64(18000)),
+            );
+            (when_expr, then_expr)
+        })
+        .collect();
+
+    let item_case_expr: Arc<dyn PhysicalExpr> =
+        Arc::new(CaseExpr::try_new(None, item_when_then, Some(lit_bool(false))).unwrap());
+
+    // Build sold_date_sk CASE expression with num_partitions branches
+    let date_when_then: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> = (0
+        ..num_partitions)
+        .map(|partition| {
+            let when_expr = eq(date_hash_mod.clone(), lit_i32(partition as i32));
+            let then_expr = and(
+                gte(cs_sold_date_sk.clone(), lit_i64(2415000 + partition as i64)),
+                lte(cs_sold_date_sk.clone(), lit_i64(2488070)),
+            );
+            (when_expr, then_expr)
+        })
+        .collect();
+
+    let date_case_expr: Arc<dyn PhysicalExpr> =
+        Arc::new(CaseExpr::try_new(None, date_when_then, Some(lit_bool(false))).unwrap());
+
+    // Final: is_null AND item_case AND date_case
+    and(and(is_null_expr, item_case_expr), date_case_expr)
+}
+/// Build a predicate similar to TPC-DS q76 web_sales filter.
+/// Uses placeholder columns instead of hash expressions.
+fn web_sales_predicate(num_partitions: usize) -> Arc<dyn PhysicalExpr> {
+    let ws_sold_date_sk: Arc<dyn PhysicalExpr> =
+        Arc::new(Column::new("ws_sold_date_sk", 0));
+    let ws_item_sk: Arc<dyn PhysicalExpr> = Arc::new(Column::new("ws_item_sk", 3));
+    let ws_ship_customer_sk: Arc<dyn PhysicalExpr> =
+        Arc::new(Column::new("ws_ship_customer_sk", 8));
+
+    // Use simple modulo expression as placeholder for hash
+    let item_hash_mod = modulo(ws_item_sk.clone(), lit_i64(num_partitions as i64));
+    let date_hash_mod = modulo(ws_sold_date_sk.clone(), lit_i64(num_partitions as i64));
+
+    // ws_ship_customer_sk IS NULL
+    let is_null_expr: Arc<dyn PhysicalExpr> =
+        Arc::new(IsNullExpr::new(ws_ship_customer_sk));
+
+    // Build item_sk CASE expression with num_partitions branches
+    let item_when_then: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> = (0
+        ..num_partitions)
+        .map(|partition| {
+            let when_expr = eq(item_hash_mod.clone(), lit_i32(partition as i32));
+            let then_expr = and(
+                gte(ws_item_sk.clone(), lit_i64(partition as i64)),
+                lte(ws_item_sk.clone(), lit_i64(18000)),
+            );
+            (when_expr, then_expr)
+        })
+        .collect();
+
+    let item_case_expr: Arc<dyn PhysicalExpr> =
+        Arc::new(CaseExpr::try_new(None, item_when_then, Some(lit_bool(false))).unwrap());
+
+    // Build sold_date_sk CASE expression with num_partitions branches
+    let date_when_then: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> = (0
+        ..num_partitions)
+        .map(|partition| {
+            let when_expr = eq(date_hash_mod.clone(), lit_i32(partition as i32));
+            let then_expr = and(
+                gte(ws_sold_date_sk.clone(), lit_i64(2415000 + partition as i64)),
+                lte(ws_sold_date_sk.clone(), lit_i64(2488070)),
+            );
+            (when_expr, then_expr)
+        })
+        .collect();
+
+    let date_case_expr: Arc<dyn PhysicalExpr> =
+        Arc::new(CaseExpr::try_new(None, date_when_then, Some(lit_bool(false))).unwrap());
+
+    and(and(is_null_expr, item_case_expr), date_case_expr)
+}
+
+/// Measures how long `PhysicalExprSimplifier::simplify` takes for a given expression.
+fn bench_simplify(
+    c: &mut Criterion,
+    name: &str,
+    schema: &Schema,
+    expr: &Arc<dyn PhysicalExpr>,
+) {
+    let simplifier = PhysicalExprSimplifier::new(schema);
+    c.bench_function(name, |b| {
+        b.iter(|| black_box(simplifier.simplify(black_box(Arc::clone(expr))).unwrap()))
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let cs_schema = catalog_sales_schema();
+    let ws_schema = web_sales_schema();
+
+    for num_partitions in [16, 128] {
+        bench_simplify(
+            c,
+            &format!("tpc-ds/q76/cs/{num_partitions}"),
+            &cs_schema,
+            &catalog_sales_predicate(num_partitions),
+        );
+        bench_simplify(
+            c,
+            &format!("tpc-ds/q76/ws/{num_partitions}"),
+            &ws_schema,
+            &web_sales_predicate(num_partitions),
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs
index be04b9c6b8ea8..d031ad7d85750 100644
--- a/datafusion/physical-expr/src/aggregate.rs
+++ b/datafusion/physical-expr/src/aggregate.rs
@@ -16,22 +16,21 @@
 // under the License.
 
 pub(crate) mod groups_accumulator {
-    #[allow(unused_imports)]
+    #[expect(unused_imports)]
     pub(crate) mod accumulate {
         pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState;
     }
     pub use datafusion_functions_aggregate_common::aggregate::groups_accumulator::{
-        accumulate::NullState, GroupsAccumulatorAdapter,
+        GroupsAccumulatorAdapter, accumulate::NullState,
     };
 }
 pub(crate) mod stats {
     pub use datafusion_functions_aggregate_common::stats::StatsType;
 }
 pub mod utils {
-    #[allow(deprecated)] // allow adjust_output_array
     pub use datafusion_functions_aggregate_common::utils::{
-        adjust_output_array, get_accum_scalar_values_as_arrays, get_sort_options,
-        ordering_fields, DecimalAverager, Hashable,
+        DecimalAverager, Hashable, get_accum_scalar_values_as_arrays, get_sort_options,
+        ordering_fields,
     };
 }
 
@@ -42,7 +41,9 @@ use crate::expressions::Column;
 
 use arrow::compute::SortOptions;
 use arrow::datatypes::{DataType, FieldRef, Schema, SchemaRef};
-use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::{
+    Result, ScalarValue, assert_or_internal_err, internal_err, not_impl_err,
+};
 use datafusion_expr::{AggregateUDF, ReversedUDAF, SetMonotonicity};
 use datafusion_expr_common::accumulator::Accumulator;
 use datafusion_expr_common::groups_accumulator::GroupsAccumulator;
@@ -52,8 +53,7 @@ use datafusion_functions_aggregate_common::accumulator::{
 };
 use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
-use datafusion_physical_expr_common::utils::reverse_order_bys;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
 /// Builder for physical [`AggregateFunctionExpr`]
 ///
@@ -70,7 +70,7 @@ pub struct AggregateExprBuilder {
     /// Arrow Schema for the aggregate function
     schema: SchemaRef,
     /// The physical order by expressions
-    ordering_req: LexOrdering,
+    order_bys: Vec<PhysicalSortExpr>,
     /// Whether to ignore null values
     ignore_nulls: bool,
     /// Whether is distinct aggregate function
@@ -87,7 +87,7 @@ impl AggregateExprBuilder {
             alias: None,
             human_display: String::default(),
             schema: Arc::new(Schema::empty()),
-            ordering_req: LexOrdering::default(),
+            order_bys: vec![],
             ignore_nulls: false,
             is_distinct: false,
             is_reversed: false,
@@ -112,7 +112,7 @@ impl AggregateExprBuilder {
     /// # use datafusion_expr::{AggregateUDFImpl, AggregateUDF, Accumulator, function::{AccumulatorArgs, StateFieldsArgs}};
     /// # use arrow::datatypes::Field;
     /// #
-    /// # #[derive(Debug, Clone)]
+    /// # #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     /// # struct FirstValueUdf {
     /// #     signature: Signature,
     /// # }
@@ -128,25 +128,28 @@ impl AggregateExprBuilder {
     /// # impl AggregateUDFImpl for FirstValueUdf {
     /// #     fn as_any(&self) -> &dyn Any {
     /// #         unimplemented!()
-    /// # }
+    /// #     }
+    /// #
     /// #     fn name(&self) -> &str {
     /// #         unimplemented!()
-    /// }
+    /// #     }
+    /// #
     /// #     fn signature(&self) -> &Signature {
     /// #         unimplemented!()
-    /// # }
+    /// #     }
+    /// #
     /// #     fn return_type(&self, args: &[DataType]) -> Result<DataType> {
     /// #         unimplemented!()
     /// #     }
-    /// #     
+    /// #
     /// #     fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
     /// #         unimplemented!()
     /// #         }
-    /// #     
+    /// #
     /// #     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
     /// #         unimplemented!()
     /// #     }
-    /// #     
+    /// #
     /// #     fn documentation(&self) -> Option<&Documentation> {
     /// #         unimplemented!()
     /// #     }
@@ -169,16 +172,16 @@ impl AggregateExprBuilder {
     ///     }];
     ///
     ///     let first_value = AggregateUDF::from(FirstValueUdf::new());
-    ///     
+    ///
     ///     let aggregate_expr = AggregateExprBuilder::new(
     ///         Arc::new(first_value),
     ///         args
     ///     )
-    ///     .order_by(order_by.into())
+    ///     .order_by(order_by)
     ///     .alias("first_a_by_x")
     ///     .ignore_nulls()
     ///     .build()?;
-    ///     
+    ///
     ///     Ok(())
     /// }
     /// ```
@@ -192,26 +195,19 @@ impl AggregateExprBuilder {
             alias,
             human_display,
             schema,
-            ordering_req,
+            order_bys,
             ignore_nulls,
             is_distinct,
             is_reversed,
         } = self;
-        if args.is_empty() {
-            return internal_err!("args should not be empty");
-        }
-
-        let mut ordering_fields = vec![];
+        assert_or_internal_err!(!args.is_empty(), "args should not be empty");
 
-        if !ordering_req.is_empty() {
-            let ordering_types = ordering_req
-                .iter()
-                .map(|e| e.expr.data_type(&schema))
-                .collect::<Result<Vec<_>>>()?;
+        let ordering_types = order_bys
+            .iter()
+            .map(|e| e.expr.data_type(&schema))
+            .collect::<Result<Vec<_>>>()?;
 
-            ordering_fields =
-                utils::ordering_fields(ordering_req.as_ref(), &ordering_types);
-        }
+        let ordering_fields = utils::ordering_fields(&order_bys, &ordering_types);
 
         let input_exprs_fields = args
             .iter()
@@ -230,19 +226,25 @@ impl AggregateExprBuilder {
             None => {
                 return internal_err!(
                     "AggregateExprBuilder::alias must be provided prior to calling build"
-                )
+                );
             }
             Some(alias) => alias,
         };
 
+        let arg_fields = args
+            .iter()
+            .map(|e| e.return_field(schema.as_ref()))
+            .collect::<Result<Vec<_>>>()?;
+
         Ok(AggregateFunctionExpr {
             fun: Arc::unwrap_or_clone(fun),
             args,
+            arg_fields,
             return_field,
             name,
             human_display,
             schema: Arc::unwrap_or_clone(schema),
-            ordering_req,
+            order_bys,
             ignore_nulls,
             ordering_fields,
             is_distinct,
@@ -267,8 +269,8 @@ impl AggregateExprBuilder {
         self
     }
 
-    pub fn order_by(mut self, order_by: LexOrdering) -> Self {
-        self.ordering_req = order_by;
+    pub fn order_by(mut self, order_bys: Vec<PhysicalSortExpr>) -> Self {
+        self.order_bys = order_bys;
         self
     }
 
@@ -310,6 +312,8 @@ impl AggregateExprBuilder {
 pub struct AggregateFunctionExpr {
     fun: AggregateUDF,
     args: Vec<Arc<dyn PhysicalExpr>>,
+    /// Fields corresponding to args (same order & length)
+    arg_fields: Vec<FieldRef>,
     /// Output / return field of this aggregate
     return_field: FieldRef,
     /// Output column name that this expression creates
@@ -318,7 +322,7 @@ pub struct AggregateFunctionExpr {
     human_display: String,
     schema: Schema,
     // The physical order by expressions
-    ordering_req: LexOrdering,
+    order_bys: Vec<PhysicalSortExpr>,
     // Whether to ignore null values
     ignore_nulls: bool,
     // fields used for order sensitive aggregation functions
@@ -387,8 +391,9 @@ impl AggregateFunctionExpr {
         let acc_args = AccumulatorArgs {
             return_field: Arc::clone(&self.return_field),
             schema: &self.schema,
+            expr_fields: &self.arg_fields,
             ignore_nulls: self.ignore_nulls,
-            ordering_req: self.ordering_req.as_ref(),
+            order_bys: self.order_bys.as_ref(),
             is_distinct: self.is_distinct,
             name: &self.name,
             is_reversed: self.is_reversed,
@@ -411,31 +416,24 @@ impl AggregateFunctionExpr {
         self.fun.state_fields(args)
     }
 
-    /// Order by requirements for the aggregate function
-    /// By default it is `None` (there is no requirement)
-    /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)` should implement this
-    pub fn order_bys(&self) -> Option<&LexOrdering> {
-        if self.ordering_req.is_empty() {
-            return None;
-        }
-
-        if !self.order_sensitivity().is_insensitive() {
-            return Some(self.ordering_req.as_ref());
+    /// Returns the ORDER BY expressions for the aggregate function.
+    pub fn order_bys(&self) -> &[PhysicalSortExpr] {
+        if self.order_sensitivity().is_insensitive() {
+            &[]
+        } else {
+            &self.order_bys
         }
-
-        None
     }
 
     /// Indicates whether aggregator can produce the correct result with any
     /// arbitrary input ordering. By default, we assume that aggregate expressions
     /// are order insensitive.
     pub fn order_sensitivity(&self) -> AggregateOrderSensitivity {
-        if !self.ordering_req.is_empty() {
-            // If there is requirement, use the sensitivity of the implementation
-            self.fun.order_sensitivity()
-        } else {
-            // If no requirement, aggregator is order insensitive
+        if self.order_bys.is_empty() {
             AggregateOrderSensitivity::Insensitive
+        } else {
+            // If there is an ORDER BY clause, use the sensitivity of the implementation:
+            self.fun.order_sensitivity()
         }
     }
 
@@ -463,7 +461,7 @@ impl AggregateFunctionExpr {
         };
 
         AggregateExprBuilder::new(Arc::new(updated_fn), self.args.to_vec())
-            .order_by(self.ordering_req.clone())
+            .order_by(self.order_bys.clone())
             .schema(Arc::new(self.schema.clone()))
             .alias(self.name().to_string())
             .with_ignore_nulls(self.ignore_nulls)
@@ -478,8 +476,9 @@ impl AggregateFunctionExpr {
         let args = AccumulatorArgs {
             return_field: Arc::clone(&self.return_field),
             schema: &self.schema,
+            expr_fields: &self.arg_fields,
             ignore_nulls: self.ignore_nulls,
-            ordering_req: self.ordering_req.as_ref(),
+            order_bys: self.order_bys.as_ref(),
             is_distinct: self.is_distinct,
             name: &self.name,
             is_reversed: self.is_reversed,
@@ -547,8 +546,9 @@ impl AggregateFunctionExpr {
         let args = AccumulatorArgs {
             return_field: Arc::clone(&self.return_field),
             schema: &self.schema,
+            expr_fields: &self.arg_fields,
             ignore_nulls: self.ignore_nulls,
-            ordering_req: self.ordering_req.as_ref(),
+            order_bys: self.order_bys.as_ref(),
             is_distinct: self.is_distinct,
             name: &self.name,
             is_reversed: self.is_reversed,
@@ -566,8 +566,9 @@ impl AggregateFunctionExpr {
         let args = AccumulatorArgs {
             return_field: Arc::clone(&self.return_field),
             schema: &self.schema,
+            expr_fields: &self.arg_fields,
             ignore_nulls: self.ignore_nulls,
-            ordering_req: self.ordering_req.as_ref(),
+            order_bys: self.order_bys.as_ref(),
             is_distinct: self.is_distinct,
             name: &self.name,
             is_reversed: self.is_reversed,
@@ -585,18 +586,16 @@ impl AggregateFunctionExpr {
             ReversedUDAF::NotSupported => None,
             ReversedUDAF::Identical => Some(self.clone()),
             ReversedUDAF::Reversed(reverse_udf) => {
-                let reverse_ordering_req = reverse_order_bys(self.ordering_req.as_ref());
                 let mut name = self.name().to_string();
                 // If the function is changed, we need to reverse order_by clause as well
                 // i.e. First(a order by b asc null first) -> Last(a order by b desc null last)
-                if self.fun().name() == reverse_udf.name() {
-                } else {
+                if self.fun().name() != reverse_udf.name() {
                     replace_order_by_clause(&mut name);
                 }
                 replace_fn_name_clause(&mut name, self.fun.name(), reverse_udf.name());
 
                 AggregateExprBuilder::new(reverse_udf, self.args.to_vec())
-                    .order_by(reverse_ordering_req)
+                    .order_by(self.order_bys.iter().map(|e| e.reverse()).collect())
                     .schema(Arc::new(self.schema.clone()))
                     .alias(name)
                     .with_ignore_nulls(self.ignore_nulls)
@@ -612,14 +611,11 @@ impl AggregateFunctionExpr {
     /// These expressions are  (1)function arguments, (2) order by expressions.
     pub fn all_expressions(&self) -> AggregatePhysicalExpressions {
         let args = self.expressions();
-        let order_bys = self
+        let order_by_exprs = self
             .order_bys()
-            .cloned()
-            .unwrap_or_else(LexOrdering::default);
-        let order_by_exprs = order_bys
             .iter()
             .map(|sort_expr| Arc::clone(&sort_expr.expr))
-            .collect::<Vec<_>>();
+            .collect();
         AggregatePhysicalExpressions {
             args,
             order_by_exprs,
@@ -631,10 +627,45 @@ impl AggregateFunctionExpr {
     /// Returns `Some(Arc<dyn AggregateExpr>)` if re-write is supported, otherwise returns `None`.
     pub fn with_new_expressions(
         &self,
-        _args: Vec<Arc<dyn PhysicalExpr>>,
-        _order_by_exprs: Vec<Arc<dyn PhysicalExpr>>,
+        args: Vec<Arc<dyn PhysicalExpr>>,
+        order_by_exprs: Vec<Arc<dyn PhysicalExpr>>,
     ) -> Option<AggregateFunctionExpr> {
-        None
+        if args.len() != self.args.len()
+            || (self.order_sensitivity() != AggregateOrderSensitivity::Insensitive
+                && order_by_exprs.len() != self.order_bys.len())
+        {
+            return None;
+        }
+
+        let new_order_bys = self
+            .order_bys
+            .iter()
+            .zip(order_by_exprs)
+            .map(|(req, new_expr)| PhysicalSortExpr {
+                expr: new_expr,
+                options: req.options,
+            })
+            .collect();
+
+        Some(AggregateFunctionExpr {
+            fun: self.fun.clone(),
+            args,
+            // TODO: need to align arg_fields here with new args
+            //       https://github.com/apache/datafusion/issues/18149
+            arg_fields: self.arg_fields.clone(),
+            return_field: Arc::clone(&self.return_field),
+            name: self.name.clone(),
+            // TODO: Human name should be updated after re-write to not mislead
+            human_display: self.human_display.clone(),
+            schema: self.schema.clone(),
+            order_bys: new_order_bys,
+            ignore_nulls: self.ignore_nulls,
+            ordering_fields: self.ordering_fields.clone(),
+            is_distinct: self.is_distinct,
+            is_reversed: false,
+            input_fields: self.input_fields.clone(),
+            is_nullable: self.is_nullable,
+        })
     }
 
     /// If this function is max, return (output_field, true)
@@ -708,18 +739,18 @@ fn replace_order_by_clause(order_by: &mut String) {
         (" ASC NULLS LAST]", " DESC NULLS FIRST]"),
     ];
 
-    if let Some(start) = order_by.find("ORDER BY [") {
-        if let Some(end) = order_by[start..].find(']') {
-            let order_by_start = start + 9;
-            let order_by_end = start + end;
-
-            let column_order = &order_by[order_by_start..=order_by_end];
-            for (suffix, replacement) in suffixes {
-                if column_order.ends_with(suffix) {
-                    let new_order = column_order.replace(suffix, replacement);
-                    order_by.replace_range(order_by_start..=order_by_end, &new_order);
-                    break;
-                }
+    if let Some(start) = order_by.find("ORDER BY [")
+        && let Some(end) = order_by[start..].find(']')
+    {
+        let order_by_start = start + 9;
+        let order_by_end = start + end;
+
+        let column_order = &order_by[order_by_start..=order_by_end];
+        for (suffix, replacement) in suffixes {
+            if column_order.ends_with(suffix) {
+                let new_order = column_order.replace(suffix, replacement);
+                order_by.replace_range(order_by_start..=order_by_end, &new_order);
+                break;
             }
         }
     }
diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs
index 1d59dab8fd6dd..11a60afc90a10 100644
--- a/datafusion/physical-expr/src/analysis.rs
+++ b/datafusion/physical-expr/src/analysis.rs
@@ -20,17 +20,18 @@
 use std::fmt::Debug;
 use std::sync::Arc;
 
+use crate::PhysicalExpr;
 use crate::expressions::Column;
 use crate::intervals::cp_solver::{ExprIntervalGraph, PropagationResult};
 use crate::utils::collect_columns;
-use crate::PhysicalExpr;
 
 use arrow::datatypes::Schema;
 use datafusion_common::stats::Precision;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, ColumnStatistics, Result, ScalarValue,
+    ColumnStatistics, Result, ScalarValue, assert_or_internal_err,
+    internal_datafusion_err, internal_err,
 };
-use datafusion_expr::interval_arithmetic::{cardinality_ratio, Interval};
+use datafusion_expr::interval_arithmetic::{Interval, cardinality_ratio};
 
 /// The shared context used during the analysis of an expression. Includes
 /// the boundaries for all known columns.
@@ -170,27 +171,24 @@ pub fn analyze(
         .iter()
         .all(|bound| bound.interval.is_none())
     {
-        if initial_boundaries
-            .iter()
-            .any(|bound| bound.distinct_count != Precision::Exact(0))
-        {
-            return internal_err!(
-                "ExprBoundaries has a non-zero distinct count although it represents an empty table"
-            );
-        }
-        if context.selectivity != Some(0.0) {
-            return internal_err!(
-                "AnalysisContext has a non-zero selectivity although it represents an empty table"
-            );
-        }
+        assert_or_internal_err!(
+            !initial_boundaries
+                .iter()
+                .any(|bound| bound.distinct_count != Precision::Exact(0)),
+            "ExprBoundaries has a non-zero distinct count although it represents an empty table"
+        );
+        assert_or_internal_err!(
+            context.selectivity.unwrap_or(0.0) == 0.0,
+            "AnalysisContext has a non-zero selectivity although it represents an empty table"
+        );
         Ok(context)
     } else if initial_boundaries
         .iter()
         .any(|bound| bound.interval.is_none())
     {
         internal_err!(
-                "AnalysisContext is an inconsistent state. Some columns represent empty table while others don't"
-            )
+            "AnalysisContext is an inconsistent state. Some columns represent empty table while others don't"
+        )
     } else {
         let mut target_boundaries = context.boundaries;
         let mut graph = ExprIntervalGraph::try_new(Arc::clone(expr), schema)?;
@@ -203,22 +201,19 @@ pub fn analyze(
         let target_expr_and_indices = graph.gather_node_indices(columns.as_slice());
 
         for (expr, index) in &target_expr_and_indices {
-            if let Some(column) = expr.as_any().downcast_ref::<Column>() {
-                if let Some(bound) =
+            if let Some(column) = expr.as_any().downcast_ref::<Column>()
+                && let Some(bound) =
                     target_boundaries.iter().find(|b| b.column == *column)
-                {
-                    // Now, it's safe to unwrap
-                    target_indices_and_boundaries
-                        .push((*index, bound.interval.as_ref().unwrap().clone()));
-                }
+            {
+                // Now, it's safe to unwrap
+                target_indices_and_boundaries
+                    .push((*index, bound.interval.as_ref().unwrap().clone()));
             }
         }
 
-        match graph
-            .update_ranges(&mut target_indices_and_boundaries, Interval::CERTAINLY_TRUE)?
-        {
+        match graph.update_ranges(&mut target_indices_and_boundaries, Interval::TRUE)? {
             PropagationResult::Success => {
-                shrink_boundaries(graph, target_boundaries, target_expr_and_indices)
+                shrink_boundaries(&graph, target_boundaries, &target_expr_and_indices)
             }
             PropagationResult::Infeasible => {
                 // If the propagation result is infeasible, set intervals to None
@@ -239,27 +234,27 @@ pub fn analyze(
 /// Following this, it constructs and returns a new `AnalysisContext` with the
 /// updated parameters.
 fn shrink_boundaries(
-    graph: ExprIntervalGraph,
+    graph: &ExprIntervalGraph,
     mut target_boundaries: Vec<ExprBoundaries>,
-    target_expr_and_indices: Vec<(Arc<dyn PhysicalExpr>, usize)>,
+    target_expr_and_indices: &[(Arc<dyn PhysicalExpr>, usize)],
 ) -> Result<AnalysisContext> {
     let initial_boundaries = target_boundaries.clone();
     target_expr_and_indices.iter().for_each(|(expr, i)| {
-        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
-            if let Some(bound) = target_boundaries
+        if let Some(column) = expr.as_any().downcast_ref::<Column>()
+            && let Some(bound) = target_boundaries
                 .iter_mut()
                 .find(|bound| bound.column.eq(column))
-            {
-                bound.interval = Some(graph.get_interval(*i));
-            };
-        }
+        {
+            bound.interval = Some(graph.get_interval(*i));
+        };
     });
 
     let selectivity = calculate_selectivity(&target_boundaries, &initial_boundaries)?;
 
-    if !(0.0..=1.0).contains(&selectivity) {
-        return internal_err!("Selectivity is out of limit: {}", selectivity);
-    }
+    assert_or_internal_err!(
+        (0.0..=1.0).contains(&selectivity),
+        "Selectivity is out of limit: {selectivity}",
+    );
 
     Ok(AnalysisContext::new(target_boundaries).with_selectivity(selectivity))
 }
@@ -287,8 +282,8 @@ fn calculate_selectivity(
             }
             (None, Some(_)) => {
                 return internal_err!(
-                "Initial boundary cannot be None while having a Some() target boundary"
-            );
+                    "Initial boundary cannot be None while having a Some() target boundary"
+                );
             }
             _ => return Ok(0.0),
         }
@@ -302,14 +297,14 @@ mod tests {
     use std::sync::Arc;
 
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::{assert_contains, DFSchema};
+    use datafusion_common::{DFSchema, assert_contains};
     use datafusion_expr::{
-        col, execution_props::ExecutionProps, interval_arithmetic::Interval, lit, Expr,
+        Expr, col, execution_props::ExecutionProps, interval_arithmetic::Interval, lit,
     };
 
-    use crate::{create_physical_expr, AnalysisContext};
+    use crate::{AnalysisContext, create_physical_expr};
 
-    use super::{analyze, ExprBoundaries};
+    use super::{ExprBoundaries, analyze};
 
     fn make_field(name: &str, data_type: DataType) -> Field {
         let nullable = false;
@@ -376,7 +371,9 @@ mod tests {
             )
             .unwrap();
             let Some(actual) = &analysis_result.boundaries[0].interval else {
-                panic!("The analysis result should contain non-empty intervals for all columns");
+                panic!(
+                    "The analysis result should contain non-empty intervals for all columns"
+                );
             };
             let expected = Interval::make(lower, upper).unwrap();
             assert_eq!(
diff --git a/datafusion/physical-expr/src/async_scalar_function.rs b/datafusion/physical-expr/src/async_scalar_function.rs
new file mode 100644
index 0000000000000..2f50a17afc39a
--- /dev/null
+++ b/datafusion/physical-expr/src/async_scalar_function.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ScalarFunctionExpr;
+use arrow::array::RecordBatch;
+use arrow::compute::concat;
+use arrow::datatypes::{DataType, Field, FieldRef, Schema};
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{internal_err, not_impl_err};
+use datafusion_expr::ScalarFunctionArgs;
+use datafusion_expr::async_udf::AsyncScalarUDF;
+use datafusion_expr_common::columnar_value::ColumnarValue;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use std::any::Any;
+use std::fmt::Display;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+/// Wrapper around a scalar function that can be evaluated asynchronously
+#[derive(Debug, Clone, Eq)]
+pub struct AsyncFuncExpr {
+    /// The name of the output column this function will generate
+    pub name: String,
+    /// The actual function (always `ScalarFunctionExpr`)
+    pub func: Arc<dyn PhysicalExpr>,
+    /// The field that this function will return
+    return_field: FieldRef,
+}
+
+impl Display for AsyncFuncExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "async_expr(name={}, expr={})", self.name, self.func)
+    }
+}
+
+impl PartialEq for AsyncFuncExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name && self.func == Arc::clone(&other.func)
+    }
+}
+
+impl Hash for AsyncFuncExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.name.hash(state);
+        self.func.as_ref().hash(state);
+    }
+}
+
+impl AsyncFuncExpr {
+    /// create a new AsyncFuncExpr
+    pub fn try_new(
+        name: impl Into<String>,
+        func: Arc<dyn PhysicalExpr>,
+        schema: &Schema,
+    ) -> Result<Self> {
+        let Some(_) = func.as_any().downcast_ref::<ScalarFunctionExpr>() else {
+            return internal_err!(
+                "unexpected function type, expected ScalarFunctionExpr, got: {:?}",
+                func
+            );
+        };
+
+        let return_field = func.return_field(schema)?;
+        Ok(Self {
+            name: name.into(),
+            func,
+            return_field,
+        })
+    }
+
+    /// return the name of the output column
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Return the output field generated by evaluating this function
+    pub fn field(&self, input_schema: &Schema) -> Result<Field> {
+        Ok(Field::new(
+            &self.name,
+            self.func.data_type(input_schema)?,
+            self.func.nullable(input_schema)?,
+        ))
+    }
+
+    /// Return the ideal batch size for this function
+    pub fn ideal_batch_size(&self) -> Result<Option<usize>> {
+        if let Some(expr) = self.func.as_any().downcast_ref::<ScalarFunctionExpr>()
+            && let Some(udf) =
+                expr.fun().inner().as_any().downcast_ref::<AsyncScalarUDF>()
+        {
+            return Ok(udf.ideal_batch_size());
+        }
+        not_impl_err!("Can't get ideal_batch_size from {:?}", self.func)
+    }
+
+    /// This (async) function is called for each record batch to evaluate the LLM expressions
+    ///
+    /// The output is the output of evaluating the async expression and the input record batch
+    pub async fn invoke_with_args(
+        &self,
+        batch: &RecordBatch,
+        config_options: Arc<ConfigOptions>,
+    ) -> Result<ColumnarValue> {
+        let Some(scalar_function_expr) =
+            self.func.as_any().downcast_ref::<ScalarFunctionExpr>()
+        else {
+            return internal_err!(
+                "unexpected function type, expected ScalarFunctionExpr, got: {:?}",
+                self.func
+            );
+        };
+
+        let Some(async_udf) = scalar_function_expr
+            .fun()
+            .inner()
+            .as_any()
+            .downcast_ref::<AsyncScalarUDF>()
+        else {
+            return not_impl_err!(
+                "Don't know how to evaluate async function: {:?}",
+                scalar_function_expr
+            );
+        };
+
+        let arg_fields = scalar_function_expr
+            .args()
+            .iter()
+            .map(|e| e.return_field(batch.schema_ref()))
+            .collect::<Result<Vec<_>>>()?;
+
+        let mut result_batches = vec![];
+        if let Some(ideal_batch_size) = self.ideal_batch_size()? {
+            let mut remainder = batch.clone();
+            while remainder.num_rows() > 0 {
+                let size = if ideal_batch_size > remainder.num_rows() {
+                    remainder.num_rows()
+                } else {
+                    ideal_batch_size
+                };
+
+                let current_batch = remainder.slice(0, size); // get next 10 rows
+                remainder = remainder.slice(size, remainder.num_rows() - size);
+                let args = scalar_function_expr
+                    .args()
+                    .iter()
+                    .map(|e| e.evaluate(&current_batch))
+                    .collect::<Result<Vec<_>>>()?;
+                result_batches.push(
+                    async_udf
+                        .invoke_async_with_args(ScalarFunctionArgs {
+                            args,
+                            arg_fields: arg_fields.clone(),
+                            number_rows: current_batch.num_rows(),
+                            return_field: Arc::clone(&self.return_field),
+                            config_options: Arc::clone(&config_options),
+                        })
+                        .await?,
+                );
+            }
+        } else {
+            let args = scalar_function_expr
+                .args()
+                .iter()
+                .map(|e| e.evaluate(batch))
+                .collect::<Result<Vec<_>>>()?;
+
+            result_batches.push(
+                async_udf
+                    .invoke_async_with_args(ScalarFunctionArgs {
+                        args: args.to_vec(),
+                        arg_fields,
+                        number_rows: batch.num_rows(),
+                        return_field: Arc::clone(&self.return_field),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .await?,
+            );
+        }
+
+        let datas = result_batches
+            .into_iter()
+            .map(|cv| match cv {
+                ColumnarValue::Array(arr) => Ok(arr),
+                ColumnarValue::Scalar(scalar) => Ok(scalar.to_array_of_size(1)?),
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        // Get references to the arrays as dyn Array to call concat
+        let dyn_arrays = datas
+            .iter()
+            .map(|arr| arr as &dyn arrow::array::Array)
+            .collect::<Vec<_>>();
+        let result_array = concat(&dyn_arrays)?;
+        Ok(ColumnarValue::Array(result_array))
+    }
+}
+
+impl PhysicalExpr for AsyncFuncExpr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        self.func.data_type(input_schema)
+    }
+
+    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
+        self.func.nullable(input_schema)
+    }
+
+    fn evaluate(&self, _batch: &RecordBatch) -> Result<ColumnarValue> {
+        // TODO: implement this for scalar value input
+        not_impl_err!("AsyncFuncExpr.evaluate")
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        self.func.children()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        let new_func = Arc::clone(&self.func).with_new_children(children)?;
+        Ok(Arc::new(AsyncFuncExpr {
+            name: self.name.clone(),
+            func: new_func,
+            return_field: Arc::clone(&self.return_field),
+        }))
+    }
+
+    fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.func)
+    }
+}
diff --git a/datafusion/physical-expr/src/equivalence/class.rs b/datafusion/physical-expr/src/equivalence/class.rs
index 98b1299a2ec6a..78478fc13ed4f 100644
--- a/datafusion/physical-expr/src/equivalence/class.rs
+++ b/datafusion/physical-expr/src/equivalence/class.rs
@@ -15,30 +15,61 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use super::{add_offset_to_expr, ProjectionMapping};
-use crate::{
-    expressions::Column, LexOrdering, LexRequirement, PhysicalExpr, PhysicalExprRef,
-    PhysicalSortExpr, PhysicalSortRequirement,
-};
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{JoinType, ScalarValue};
-use datafusion_physical_expr_common::physical_expr::format_physical_expr_list;
 use std::fmt::Display;
+use std::ops::Deref;
 use std::sync::Arc;
 use std::vec::IntoIter;
 
+use super::ProjectionMapping;
+use crate::expressions::Literal;
+use crate::physical_expr::add_offset_to_expr;
+use crate::projection::ProjectionTargets;
+use crate::{PhysicalExpr, PhysicalExprRef, PhysicalSortExpr, PhysicalSortRequirement};
+
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::{JoinType, Result, ScalarValue};
+use datafusion_physical_expr_common::physical_expr::format_physical_expr_list;
+
 use indexmap::{IndexMap, IndexSet};
 
-/// A structure representing a expression known to be constant in a physical execution plan.
+/// Represents whether a constant expression's value is uniform or varies across
+/// partitions. Has two variants:
+/// - `Heterogeneous`: The constant expression may have different values for
+///   different partitions.
+/// - `Uniform(Option<ScalarValue>)`: The constant expression has the same value
+///   across all partitions, or is `None` if the value is unknown.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub enum AcrossPartitions {
+    #[default]
+    Heterogeneous,
+    Uniform(Option<ScalarValue>),
+}
+
+impl Display for AcrossPartitions {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            AcrossPartitions::Heterogeneous => write!(f, "(heterogeneous)"),
+            AcrossPartitions::Uniform(value) => {
+                if let Some(val) = value {
+                    write!(f, "(uniform: {val})")
+                } else {
+                    write!(f, "(uniform: unknown)")
+                }
+            }
+        }
+    }
+}
+
+/// A structure representing a expression known to be constant in a physical
+/// execution plan.
 ///
-/// The `ConstExpr` struct encapsulates an expression that is constant during the execution
-/// of a query. For example if a predicate like `A = 5` applied earlier in the plan `A` would
-/// be known constant
+/// The `ConstExpr` struct encapsulates an expression that is constant during
+/// the execution of a query. For example if a filter like `A = 5` appears
+/// earlier in the plan, `A` would become a constant in subsequent operations.
 ///
 /// # Fields
 ///
 /// - `expr`: Constant expression for a node in the physical plan.
-///
 /// - `across_partitions`: A boolean flag indicating whether the constant
 ///   expression is the same across partitions. If set to `true`, the constant
 ///   expression has same value for all partitions. If set to `false`, the
@@ -50,108 +81,37 @@ use indexmap::{IndexMap, IndexSet};
 /// # use datafusion_physical_expr::ConstExpr;
 /// # use datafusion_physical_expr::expressions::lit;
 /// let col = lit(5);
-/// // Create a constant expression from a physical expression ref
-/// let const_expr = ConstExpr::from(&col);
-/// // create a constant expression from a physical expression
+/// // Create a constant expression from a physical expression:
 /// let const_expr = ConstExpr::from(col);
 /// ```
-// TODO: Consider refactoring the `across_partitions` and `value` fields into an enum:
-//
-// ```
-// enum PartitionValues {
-//     Uniform(Option<ScalarValue>),           // Same value across all partitions
-//     Heterogeneous(Vec<Option<ScalarValue>>) // Different values per partition
-// }
-// ```
-//
-// This would provide more flexible representation of partition values.
-// Note: This is a breaking change for the equivalence API and should be
-// addressed in a separate issue/PR.
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug)]
 pub struct ConstExpr {
-    /// The  expression that is known to be constant (e.g. a `Column`)
-    expr: Arc<dyn PhysicalExpr>,
-    /// Does the constant have the same value across all partitions? See
-    /// struct docs for more details
-    across_partitions: AcrossPartitions,
-}
-
-#[derive(PartialEq, Clone, Debug)]
-/// Represents whether a constant expression's value is uniform or varies across partitions.
-///
-/// The `AcrossPartitions` enum is used to describe the nature of a constant expression
-/// in a physical execution plan:
-///
-/// - `Heterogeneous`: The constant expression may have different values for different partitions.
-/// - `Uniform(Option<ScalarValue>)`: The constant expression has the same value across all partitions,
-///   or is `None` if the value is not specified.
-pub enum AcrossPartitions {
-    Heterogeneous,
-    Uniform(Option<ScalarValue>),
-}
-
-impl Default for AcrossPartitions {
-    fn default() -> Self {
-        Self::Heterogeneous
-    }
-}
-
-impl PartialEq for ConstExpr {
-    fn eq(&self, other: &Self) -> bool {
-        self.across_partitions == other.across_partitions && self.expr.eq(&other.expr)
-    }
+    /// The expression that is known to be constant (e.g. a `Column`).
+    pub expr: Arc<dyn PhysicalExpr>,
+    /// Indicates whether the constant have the same value across all partitions.
+    pub across_partitions: AcrossPartitions,
 }
+// TODO: The `ConstExpr` definition above can be in an inconsistent state where
+//       `expr` is a literal but `across_partitions` is not `Uniform`. Consider
+//       a refactor to ensure that `ConstExpr` is always in a consistent state
+//       (either by changing type definition, or by API constraints).
 
 impl ConstExpr {
-    /// Create a new constant expression from a physical expression.
+    /// Create a new constant expression from a physical expression, specifying
+    /// whether the constant expression is the same across partitions.
     ///
-    /// Note you can also use `ConstExpr::from` to create a constant expression
-    /// from a reference as well
-    pub fn new(expr: Arc<dyn PhysicalExpr>) -> Self {
-        Self {
-            expr,
-            // By default, assume constant expressions are not same across partitions.
-            across_partitions: Default::default(),
+    /// Note that you can also use `ConstExpr::from` to create a constant
+    /// expression from just a physical expression, with the *safe* assumption
+    /// of heterogenous values across partitions unless the expression is a
+    /// literal.
+    pub fn new(expr: Arc<dyn PhysicalExpr>, across_partitions: AcrossPartitions) -> Self {
+        let mut result = ConstExpr::from(expr);
+        // Override the across partitions specification if the expression is not
+        // a literal.
+        if result.across_partitions == AcrossPartitions::Heterogeneous {
+            result.across_partitions = across_partitions;
         }
-    }
-
-    /// Set the `across_partitions` flag
-    ///
-    /// See struct docs for more details
-    pub fn with_across_partitions(mut self, across_partitions: AcrossPartitions) -> Self {
-        self.across_partitions = across_partitions;
-        self
-    }
-
-    /// Is the  expression the same across all partitions?
-    ///
-    /// See struct docs for more details
-    pub fn across_partitions(&self) -> AcrossPartitions {
-        self.across_partitions.clone()
-    }
-
-    pub fn expr(&self) -> &Arc<dyn PhysicalExpr> {
-        &self.expr
-    }
-
-    pub fn owned_expr(self) -> Arc<dyn PhysicalExpr> {
-        self.expr
-    }
-
-    pub fn map<F>(&self, f: F) -> Option<Self>
-    where
-        F: Fn(&Arc<dyn PhysicalExpr>) -> Option<Arc<dyn PhysicalExpr>>,
-    {
-        let maybe_expr = f(&self.expr);
-        maybe_expr.map(|expr| Self {
-            expr,
-            across_partitions: self.across_partitions.clone(),
-        })
-    }
-
-    /// Returns true if this constant expression is equal to the given expression
-    pub fn eq_expr(&self, other: impl AsRef<dyn PhysicalExpr>) -> bool {
-        self.expr.as_ref() == other.as_ref()
+        result
     }
 
     /// Returns a [`Display`]able list of `ConstExpr`.
@@ -175,47 +135,36 @@ impl ConstExpr {
     }
 }
 
+impl PartialEq for ConstExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.across_partitions == other.across_partitions && self.expr.eq(&other.expr)
+    }
+}
+
 impl Display for ConstExpr {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.expr)?;
-        match &self.across_partitions {
-            AcrossPartitions::Heterogeneous => {
-                write!(f, "(heterogeneous)")?;
-            }
-            AcrossPartitions::Uniform(value) => {
-                if let Some(val) = value {
-                    write!(f, "(uniform: {val})")?;
-                } else {
-                    write!(f, "(uniform: unknown)")?;
-                }
-            }
-        }
-        Ok(())
+        write!(f, "{}", self.across_partitions)
     }
 }
 
 impl From<Arc<dyn PhysicalExpr>> for ConstExpr {
     fn from(expr: Arc<dyn PhysicalExpr>) -> Self {
-        Self::new(expr)
-    }
-}
-
-impl From<&Arc<dyn PhysicalExpr>> for ConstExpr {
-    fn from(expr: &Arc<dyn PhysicalExpr>) -> Self {
-        Self::new(Arc::clone(expr))
+        // By default, assume constant expressions are not same across partitions.
+        // However, if we have a literal, it will have a single value that is the
+        // same across all partitions.
+        let across = if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
+            AcrossPartitions::Uniform(Some(lit.value().clone()))
+        } else {
+            AcrossPartitions::Heterogeneous
+        };
+        Self {
+            expr,
+            across_partitions: across,
+        }
     }
 }
 
-/// Checks whether `expr` is among in the `const_exprs`.
-pub fn const_exprs_contains(
-    const_exprs: &[ConstExpr],
-    expr: &Arc<dyn PhysicalExpr>,
-) -> bool {
-    const_exprs
-        .iter()
-        .any(|const_expr| const_expr.expr.eq(expr))
-}
-
 /// An `EquivalenceClass` is a set of [`Arc<dyn PhysicalExpr>`]s that are known
 /// to have the same value for all tuples in a relation. These are generated by
 /// equality predicates (e.g. `a = b`), typically equi-join conditions and
@@ -223,259 +172,361 @@ pub fn const_exprs_contains(
 ///
 /// Two `EquivalenceClass`es are equal if they contains the same expressions in
 /// without any ordering.
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
 pub struct EquivalenceClass {
-    /// The expressions in this equivalence class. The order doesn't
-    /// matter for equivalence purposes
-    ///
-    exprs: IndexSet<Arc<dyn PhysicalExpr>>,
-}
-
-impl PartialEq for EquivalenceClass {
-    /// Returns true if other is equal in the sense
-    /// of bags (multi-sets), disregarding their orderings.
-    fn eq(&self, other: &Self) -> bool {
-        self.exprs.eq(&other.exprs)
-    }
+    /// The expressions in this equivalence class. The order doesn't matter for
+    /// equivalence purposes.
+    pub(crate) exprs: IndexSet<Arc<dyn PhysicalExpr>>,
+    /// Indicates whether the expressions in this equivalence class have a
+    /// constant value. A `Some` value indicates constant-ness.
+    pub(crate) constant: Option<AcrossPartitions>,
 }
 
 impl EquivalenceClass {
-    /// Create a new empty equivalence class
-    pub fn new_empty() -> Self {
-        Self {
-            exprs: IndexSet::new(),
+    // Create a new equivalence class from a pre-existing collection.
+    pub fn new(exprs: impl IntoIterator<Item = Arc<dyn PhysicalExpr>>) -> Self {
+        let mut class = Self::default();
+        for expr in exprs {
+            class.push(expr);
         }
-    }
-
-    // Create a new equivalence class from a pre-existing `Vec`
-    pub fn new(exprs: Vec<Arc<dyn PhysicalExpr>>) -> Self {
-        Self {
-            exprs: exprs.into_iter().collect(),
-        }
-    }
-
-    /// Return the inner vector of expressions
-    pub fn into_vec(self) -> Vec<Arc<dyn PhysicalExpr>> {
-        self.exprs.into_iter().collect()
+        class
     }
 
     /// Return the "canonical" expression for this class (the first element)
-    /// if any
-    fn canonical_expr(&self) -> Option<Arc<dyn PhysicalExpr>> {
-        self.exprs.iter().next().cloned()
+    /// if non-empty.
+    pub fn canonical_expr(&self) -> Option<&Arc<dyn PhysicalExpr>> {
+        self.exprs.iter().next()
     }
 
     /// Insert the expression into this class, meaning it is known to be equal to
-    /// all other expressions in this class
+    /// all other expressions in this class.
     pub fn push(&mut self, expr: Arc<dyn PhysicalExpr>) {
+        if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
+            let expr_across = AcrossPartitions::Uniform(Some(lit.value().clone()));
+            if let Some(across) = self.constant.as_mut() {
+                // TODO: Return an error if constant values do not agree.
+                if *across == AcrossPartitions::Heterogeneous {
+                    *across = expr_across;
+                }
+            } else {
+                self.constant = Some(expr_across);
+            }
+        }
         self.exprs.insert(expr);
     }
 
-    /// Inserts all the expressions from other into this class
+    /// Inserts all the expressions from other into this class.
     pub fn extend(&mut self, other: Self) {
-        for expr in other.exprs {
-            // use push so entries are deduplicated
-            self.push(expr);
+        self.exprs.extend(other.exprs);
+        match (&self.constant, &other.constant) {
+            (Some(across), Some(_)) => {
+                // TODO: Return an error if constant values do not agree.
+                if across == &AcrossPartitions::Heterogeneous {
+                    self.constant = other.constant;
+                }
+            }
+            (None, Some(_)) => self.constant = other.constant,
+            (_, None) => {}
         }
     }
 
-    /// Returns true if this equivalence class contains t expression
-    pub fn contains(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
-        self.exprs.contains(expr)
-    }
-
-    /// Returns true if this equivalence class has any entries in common with `other`
+    /// Returns whether this equivalence class has any entries in common with
+    /// `other`.
     pub fn contains_any(&self, other: &Self) -> bool {
-        self.exprs.iter().any(|e| other.contains(e))
+        self.exprs.intersection(&other.exprs).next().is_some()
     }
 
-    /// return the number of items in this class
-    pub fn len(&self) -> usize {
-        self.exprs.len()
+    /// Returns whether this equivalence class is trivial, meaning that it is
+    /// either empty, or contains a single expression that is not a constant.
+    /// Such classes are not useful, and can be removed from equivalence groups.
+    pub fn is_trivial(&self) -> bool {
+        self.exprs.is_empty() || (self.exprs.len() == 1 && self.constant.is_none())
     }
 
-    /// return true if this class is empty
-    pub fn is_empty(&self) -> bool {
-        self.exprs.is_empty()
-    }
-
-    /// Iterate over all elements in this class, in some arbitrary order
-    pub fn iter(&self) -> impl Iterator<Item = &Arc<dyn PhysicalExpr>> {
-        self.exprs.iter()
-    }
-
-    /// Return a new equivalence class that have the specified offset added to
-    /// each expression (used when schemas are appended such as in joins)
-    pub fn with_offset(&self, offset: usize) -> Self {
-        let new_exprs = self
+    /// Adds the given offset to all columns in the expressions inside this
+    /// class. This is used when schemas are appended, e.g. in joins.
+    pub fn try_with_offset(&self, offset: isize) -> Result<Self> {
+        let mut cls = Self::default();
+        for expr_result in self
             .exprs
             .iter()
             .cloned()
             .map(|e| add_offset_to_expr(e, offset))
-            .collect();
-        Self::new(new_exprs)
+        {
+            cls.push(expr_result?);
+        }
+        Ok(cls)
+    }
+}
+
+impl Deref for EquivalenceClass {
+    type Target = IndexSet<Arc<dyn PhysicalExpr>>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.exprs
+    }
+}
+
+impl IntoIterator for EquivalenceClass {
+    type Item = Arc<dyn PhysicalExpr>;
+    type IntoIter = <IndexSet<Self::Item> as IntoIterator>::IntoIter;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.exprs.into_iter()
     }
 }
 
 impl Display for EquivalenceClass {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "[{}]", format_physical_expr_list(&self.exprs))
+        write!(f, "{{")?;
+        write!(f, "members: {}", format_physical_expr_list(&self.exprs))?;
+        if let Some(across) = &self.constant {
+            write!(f, ", constant: {across}")?;
+        }
+        write!(f, "}}")
+    }
+}
+
+impl From<EquivalenceClass> for Vec<Arc<dyn PhysicalExpr>> {
+    fn from(cls: EquivalenceClass) -> Self {
+        cls.exprs.into_iter().collect()
     }
 }
 
-/// A collection of distinct `EquivalenceClass`es
-#[derive(Debug, Clone)]
+type AugmentedMapping<'a> = IndexMap<
+    &'a Arc<dyn PhysicalExpr>,
+    (&'a ProjectionTargets, Option<&'a EquivalenceClass>),
+>;
+
+/// A collection of distinct `EquivalenceClass`es. This object supports fast
+/// lookups of expressions and their equivalence classes.
+#[derive(Clone, Debug, Default)]
 pub struct EquivalenceGroup {
+    /// A mapping from expressions to their equivalence class key.
+    map: IndexMap<Arc<dyn PhysicalExpr>, usize>,
+    /// The equivalence classes in this group.
     classes: Vec<EquivalenceClass>,
 }
 
 impl EquivalenceGroup {
-    /// Creates an empty equivalence group.
-    pub fn empty() -> Self {
-        Self { classes: vec![] }
-    }
-
     /// Creates an equivalence group from the given equivalence classes.
-    pub fn new(classes: Vec<EquivalenceClass>) -> Self {
-        let mut result = Self { classes };
-        result.remove_redundant_entries();
-        result
-    }
-
-    /// Returns how many equivalence classes there are in this group.
-    pub fn len(&self) -> usize {
-        self.classes.len()
+    pub fn new(classes: impl IntoIterator<Item = EquivalenceClass>) -> Self {
+        classes.into_iter().collect::<Vec<_>>().into()
     }
 
-    /// Checks whether this equivalence group is empty.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+    /// Adds `expr` as a constant expression to this equivalence group.
+    pub fn add_constant(&mut self, const_expr: ConstExpr) {
+        // If the expression is already in an equivalence class, we should
+        // adjust the constant-ness of the class if necessary:
+        if let Some(idx) = self.map.get(&const_expr.expr) {
+            let cls = &mut self.classes[*idx];
+            if let Some(across) = cls.constant.as_mut() {
+                // TODO: Return an error if constant values do not agree.
+                if *across == AcrossPartitions::Heterogeneous {
+                    *across = const_expr.across_partitions;
+                }
+            } else {
+                cls.constant = Some(const_expr.across_partitions);
+            }
+            return;
+        }
+        // If the expression is not in any equivalence class, but has the same
+        // constant value with some class, add it to that class:
+        if let AcrossPartitions::Uniform(_) = &const_expr.across_partitions {
+            for (idx, cls) in self.classes.iter_mut().enumerate() {
+                if cls
+                    .constant
+                    .as_ref()
+                    .is_some_and(|across| const_expr.across_partitions.eq(across))
+                {
+                    self.map.insert(Arc::clone(&const_expr.expr), idx);
+                    cls.push(const_expr.expr);
+                    return;
+                }
+            }
+        }
+        // Otherwise, create a new class with the expression as the only member:
+        let mut new_class = EquivalenceClass::new(std::iter::once(const_expr.expr));
+        if new_class.constant.is_none() {
+            new_class.constant = Some(const_expr.across_partitions);
+        }
+        Self::update_lookup_table(&mut self.map, &new_class, self.classes.len());
+        self.classes.push(new_class);
     }
 
-    /// Returns an iterator over the equivalence classes in this group.
-    pub fn iter(&self) -> impl Iterator<Item = &EquivalenceClass> {
-        self.classes.iter()
+    /// Removes constant expressions that may change across partitions.
+    /// This method should be used when merging data from different partitions.
+    /// Returns whether any change was made to the equivalence group.
+    pub fn clear_per_partition_constants(&mut self) -> bool {
+        let (mut idx, mut change) = (0, false);
+        while idx < self.classes.len() {
+            let cls = &mut self.classes[idx];
+            if let Some(AcrossPartitions::Heterogeneous) = cls.constant {
+                change = true;
+                if cls.len() == 1 {
+                    // If this class becomes trivial, remove it entirely:
+                    self.remove_class_at_idx(idx);
+                    continue;
+                } else {
+                    cls.constant = None;
+                }
+            }
+            idx += 1;
+        }
+        change
     }
 
-    /// Adds the equality `left` = `right` to this equivalence group.
-    /// New equality conditions often arise after steps like `Filter(a = b)`,
-    /// `Alias(a, a as b)` etc.
+    /// Adds the equality `left` = `right` to this equivalence group. New
+    /// equality conditions often arise after steps like `Filter(a = b)`,
+    /// `Alias(a, a as b)` etc. Returns whether the given equality defines
+    /// a new equivalence class.
     pub fn add_equal_conditions(
         &mut self,
-        left: &Arc<dyn PhysicalExpr>,
-        right: &Arc<dyn PhysicalExpr>,
-    ) {
-        let mut first_class = None;
-        let mut second_class = None;
-        for (idx, cls) in self.classes.iter().enumerate() {
-            if cls.contains(left) {
-                first_class = Some(idx);
-            }
-            if cls.contains(right) {
-                second_class = Some(idx);
-            }
-        }
+        left: Arc<dyn PhysicalExpr>,
+        right: Arc<dyn PhysicalExpr>,
+    ) -> bool {
+        let first_class = self.map.get(&left).copied();
+        let second_class = self.map.get(&right).copied();
         match (first_class, second_class) {
             (Some(mut first_idx), Some(mut second_idx)) => {
                 // If the given left and right sides belong to different classes,
                 // we should unify/bridge these classes.
-                if first_idx != second_idx {
-                    // By convention, make sure `second_idx` is larger than `first_idx`.
-                    if first_idx > second_idx {
-                        (first_idx, second_idx) = (second_idx, first_idx);
+                match first_idx.cmp(&second_idx) {
+                    // The equality is already known, return and signal this:
+                    std::cmp::Ordering::Equal => return false,
+                    // Swap indices to ensure `first_idx` is the lesser index.
+                    std::cmp::Ordering::Greater => {
+                        std::mem::swap(&mut first_idx, &mut second_idx);
                     }
-                    // Remove the class at `second_idx` and merge its values with
-                    // the class at `first_idx`. The convention above makes sure
-                    // that `first_idx` is still valid after removing `second_idx`.
-                    let other_class = self.classes.swap_remove(second_idx);
-                    self.classes[first_idx].extend(other_class);
+                    _ => {}
                 }
+                // Remove the class at `second_idx` and merge its values with
+                // the class at `first_idx`. The convention above makes sure
+                // that `first_idx` is still valid after removing `second_idx`.
+                let other_class = self.remove_class_at_idx(second_idx);
+                // Update the lookup table for the second class:
+                Self::update_lookup_table(&mut self.map, &other_class, first_idx);
+                self.classes[first_idx].extend(other_class);
             }
             (Some(group_idx), None) => {
                 // Right side is new, extend left side's class:
-                self.classes[group_idx].push(Arc::clone(right));
+                self.map.insert(Arc::clone(&right), group_idx);
+                self.classes[group_idx].push(right);
             }
             (None, Some(group_idx)) => {
                 // Left side is new, extend right side's class:
-                self.classes[group_idx].push(Arc::clone(left));
+                self.map.insert(Arc::clone(&left), group_idx);
+                self.classes[group_idx].push(left);
             }
             (None, None) => {
                 // None of the expressions is among existing classes.
                 // Create a new equivalence class and extend the group.
-                self.classes.push(EquivalenceClass::new(vec![
-                    Arc::clone(left),
-                    Arc::clone(right),
-                ]));
+                let class = EquivalenceClass::new([left, right]);
+                Self::update_lookup_table(&mut self.map, &class, self.classes.len());
+                self.classes.push(class);
+                return true;
             }
         }
+        false
     }
 
-    /// Removes redundant entries from this group.
-    fn remove_redundant_entries(&mut self) {
-        // Remove duplicate entries from each equivalence class:
-        self.classes.retain_mut(|cls| {
-            // Keep groups that have at least two entries as singleton class is
-            // meaningless (i.e. it contains no non-trivial information):
-            cls.len() > 1
-        });
-        // Unify/bridge groups that have common expressions:
-        self.bridge_classes()
+    /// Removes the equivalence class at the given index from this group.
+    fn remove_class_at_idx(&mut self, idx: usize) -> EquivalenceClass {
+        // Remove the class at the given index:
+        let cls = self.classes.swap_remove(idx);
+        // Remove its entries from the lookup table:
+        for expr in cls.iter() {
+            self.map.swap_remove(expr);
+        }
+        // Update the lookup table for the moved class:
+        if idx < self.classes.len() {
+            Self::update_lookup_table(&mut self.map, &self.classes[idx], idx);
+        }
+        cls
+    }
+
+    /// Updates the entry in lookup table for the given equivalence class with
+    /// the given index.
+    fn update_lookup_table(
+        map: &mut IndexMap<Arc<dyn PhysicalExpr>, usize>,
+        cls: &EquivalenceClass,
+        idx: usize,
+    ) {
+        for expr in cls.iter() {
+            map.insert(Arc::clone(expr), idx);
+        }
+    }
+
+    /// Removes redundant entries from this group. Returns whether any change
+    /// was made to the equivalence group.
+    fn remove_redundant_entries(&mut self) -> bool {
+        // First, remove trivial equivalence classes:
+        let mut change = false;
+        for idx in (0..self.classes.len()).rev() {
+            if self.classes[idx].is_trivial() {
+                self.remove_class_at_idx(idx);
+                change = true;
+            }
+        }
+        // Then, unify/bridge groups that have common expressions:
+        self.bridge_classes() || change
     }
 
     /// This utility function unifies/bridges classes that have common expressions.
     /// For example, assume that we have [`EquivalenceClass`]es `[a, b]` and `[b, c]`.
     /// Since both classes contain `b`, columns `a`, `b` and `c` are actually all
     /// equal and belong to one class. This utility converts merges such classes.
-    fn bridge_classes(&mut self) {
-        let mut idx = 0;
-        while idx < self.classes.len() {
-            let mut next_idx = idx + 1;
-            let start_size = self.classes[idx].len();
-            while next_idx < self.classes.len() {
-                if self.classes[idx].contains_any(&self.classes[next_idx]) {
-                    let extension = self.classes.swap_remove(next_idx);
+    /// Returns whether any change was made to the equivalence group.
+    fn bridge_classes(&mut self) -> bool {
+        let (mut idx, mut change) = (0, false);
+        'scan: while idx < self.classes.len() {
+            for other_idx in (idx + 1..self.classes.len()).rev() {
+                if self.classes[idx].contains_any(&self.classes[other_idx]) {
+                    let extension = self.remove_class_at_idx(other_idx);
+                    Self::update_lookup_table(&mut self.map, &extension, idx);
                     self.classes[idx].extend(extension);
-                } else {
-                    next_idx += 1;
+                    change = true;
+                    continue 'scan;
                 }
             }
-            if self.classes[idx].len() > start_size {
-                continue;
-            }
             idx += 1;
         }
+        change
     }
 
     /// Extends this equivalence group with the `other` equivalence group.
-    pub fn extend(&mut self, other: Self) {
+    /// Returns whether any equivalence classes were unified/bridged as a
+    /// result of the extension process.
+    pub fn extend(&mut self, other: Self) -> bool {
+        for (idx, cls) in other.classes.iter().enumerate() {
+            // Update the lookup table for the new class:
+            Self::update_lookup_table(&mut self.map, cls, idx);
+        }
         self.classes.extend(other.classes);
-        self.remove_redundant_entries();
+        self.bridge_classes()
     }
 
-    /// Normalizes the given physical expression according to this group.
-    /// The expression is replaced with the first expression in the equivalence
-    /// class it matches with (if any).
+    /// Normalizes the given physical expression according to this group. The
+    /// expression is replaced with the first (canonical) expression in the
+    /// equivalence class it matches with (if any).
     pub fn normalize_expr(&self, expr: Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
         expr.transform(|expr| {
-            for cls in self.iter() {
-                if cls.contains(&expr) {
-                    // The unwrap below is safe because the guard above ensures
-                    // that the class is not empty.
-                    return Ok(Transformed::yes(cls.canonical_expr().unwrap()));
-                }
-            }
-            Ok(Transformed::no(expr))
+            let cls = self.get_equivalence_class(&expr);
+            let Some(canonical) = cls.and_then(|cls| cls.canonical_expr()) else {
+                return Ok(Transformed::no(expr));
+            };
+            Ok(Transformed::yes(Arc::clone(canonical)))
         })
         .data()
         .unwrap()
         // The unwrap above is safe because the closure always returns `Ok`.
     }
 
-    /// Normalizes the given sort expression according to this group.
-    /// The underlying physical expression is replaced with the first expression
-    /// in the equivalence class it matches with (if any). If the underlying
-    /// expression does not belong to any equivalence class in this group, returns
-    /// the sort expression as is.
+    /// Normalizes the given sort expression according to this group. The
+    /// underlying physical expression is replaced with the first expression in
+    /// the equivalence class it matches with (if any). If the underlying
+    /// expression does not belong to any equivalence class in this group,
+    /// returns the sort expression as is.
     pub fn normalize_sort_expr(
         &self,
         mut sort_expr: PhysicalSortExpr,
@@ -484,11 +535,29 @@ impl EquivalenceGroup {
         sort_expr
     }
 
-    /// Normalizes the given sort requirement according to this group.
-    /// The underlying physical expression is replaced with the first expression
-    /// in the equivalence class it matches with (if any). If the underlying
-    /// expression does not belong to any equivalence class in this group, returns
-    /// the given sort requirement as is.
+    /// Normalizes the given sort expressions (i.e. `sort_exprs`) by:
+    /// - Replacing sections that belong to some equivalence class in the
+    ///   with the first entry in the matching equivalence class.
+    /// - Removing expressions that have a constant value.
+    ///
+    /// If columns `a` and `b` are known to be equal, `d` is known to be a
+    /// constant, and `sort_exprs` is `[b ASC, d DESC, c ASC, a ASC]`, this
+    /// function would return `[a ASC, c ASC, a ASC]`.
+    pub fn normalize_sort_exprs<'a>(
+        &'a self,
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr> + 'a,
+    ) -> impl Iterator<Item = PhysicalSortExpr> + 'a {
+        sort_exprs
+            .into_iter()
+            .map(|sort_expr| self.normalize_sort_expr(sort_expr))
+            .filter(|sort_expr| self.is_expr_constant(&sort_expr.expr).is_none())
+    }
+
+    /// Normalizes the given sort requirement according to this group. The
+    /// underlying physical expression is replaced with the first expression in
+    /// the equivalence class it matches with (if any). If the underlying
+    /// expression does not belong to any equivalence class in this group,
+    /// returns the given sort requirement as is.
     pub fn normalize_sort_requirement(
         &self,
         mut sort_requirement: PhysicalSortRequirement,
@@ -497,44 +566,81 @@ impl EquivalenceGroup {
         sort_requirement
     }
 
-    /// This function applies the `normalize_expr` function for all expressions
-    /// in `exprs` and returns the corresponding normalized physical expressions.
-    pub fn normalize_exprs(
-        &self,
-        exprs: impl IntoIterator<Item = Arc<dyn PhysicalExpr>>,
-    ) -> Vec<Arc<dyn PhysicalExpr>> {
-        exprs
+    /// Normalizes the given sort requirements (i.e. `sort_reqs`) by:
+    /// - Replacing sections that belong to some equivalence class in the
+    ///   with the first entry in the matching equivalence class.
+    /// - Removing expressions that have a constant value.
+    ///
+    /// If columns `a` and `b` are known to be equal, `d` is known to be a
+    /// constant, and `sort_reqs` is `[b ASC, d DESC, c ASC, a ASC]`, this
+    /// function would return `[a ASC, c ASC, a ASC]`.
+    pub fn normalize_sort_requirements<'a>(
+        &'a self,
+        sort_reqs: impl IntoIterator<Item = PhysicalSortRequirement> + 'a,
+    ) -> impl Iterator<Item = PhysicalSortRequirement> + 'a {
+        sort_reqs
             .into_iter()
-            .map(|expr| self.normalize_expr(expr))
-            .collect()
+            .map(|req| self.normalize_sort_requirement(req))
+            .filter(|req| self.is_expr_constant(&req.expr).is_none())
     }
 
-    /// This function applies the `normalize_sort_expr` function for all sort
-    /// expressions in `sort_exprs` and returns the corresponding normalized
-    /// sort expressions.
-    pub fn normalize_sort_exprs(&self, sort_exprs: &LexOrdering) -> LexOrdering {
-        // Convert sort expressions to sort requirements:
-        let sort_reqs = LexRequirement::from(sort_exprs.clone());
-        // Normalize the requirements:
-        let normalized_sort_reqs = self.normalize_sort_requirements(&sort_reqs);
-        // Convert sort requirements back to sort expressions:
-        LexOrdering::from(normalized_sort_reqs)
+    /// Perform an indirect projection of `expr` by consulting the equivalence
+    /// classes.
+    fn project_expr_indirect(
+        aug_mapping: &AugmentedMapping,
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> Option<Arc<dyn PhysicalExpr>> {
+        // Literals don't need to be projected
+        if expr.as_any().downcast_ref::<Literal>().is_some() {
+            return Some(Arc::clone(expr));
+        }
+
+        // The given expression is not inside the mapping, so we try to project
+        // indirectly using equivalence classes.
+        for (targets, eq_class) in aug_mapping.values() {
+            // If we match an equivalent expression to a source expression in
+            // the mapping, then we can project. For example, if we have the
+            // mapping `(a as a1, a + c)` and the equivalence `a == b`,
+            // expression `b` projects to `a1`.
+            if eq_class.as_ref().is_some_and(|cls| cls.contains(expr)) {
+                let (target, _) = targets.first();
+                return Some(Arc::clone(target));
+            }
+        }
+        // Project a non-leaf expression by projecting its children.
+        let children = expr.children();
+        if children.is_empty() {
+            // A leaf expression should be inside the mapping.
+            return None;
+        }
+        children
+            .into_iter()
+            .map(|child| {
+                // First, we try to project children with an exact match. If
+                // we are unable to do this, we consult equivalence classes.
+                if let Some((targets, _)) = aug_mapping.get(child) {
+                    // If we match the source, we can project directly:
+                    let (target, _) = targets.first();
+                    Some(Arc::clone(target))
+                } else {
+                    Self::project_expr_indirect(aug_mapping, child)
+                }
+            })
+            .collect::<Option<Vec<_>>>()
+            .map(|children| Arc::clone(expr).with_new_children(children).unwrap())
     }
 
-    /// This function applies the `normalize_sort_requirement` function for all
-    /// requirements in `sort_reqs` and returns the corresponding normalized
-    /// sort requirements.
-    pub fn normalize_sort_requirements(
-        &self,
-        sort_reqs: &LexRequirement,
-    ) -> LexRequirement {
-        LexRequirement::new(
-            sort_reqs
-                .iter()
-                .map(|sort_req| self.normalize_sort_requirement(sort_req.clone()))
-                .collect(),
-        )
-        .collapse()
+    fn augment_projection_mapping<'a>(
+        &'a self,
+        mapping: &'a ProjectionMapping,
+    ) -> AugmentedMapping<'a> {
+        mapping
+            .iter()
+            .map(|(k, v)| {
+                let eq_class = self.get_equivalence_class(k);
+                (k, (v, eq_class))
+            })
+            .collect()
     }
 
     /// Projects `expr` according to the given projection mapping.
@@ -544,81 +650,118 @@ impl EquivalenceGroup {
         mapping: &ProjectionMapping,
         expr: &Arc<dyn PhysicalExpr>,
     ) -> Option<Arc<dyn PhysicalExpr>> {
-        // First, we try to project expressions with an exact match. If we are
-        // unable to do this, we consult equivalence classes.
-        if let Some(target) = mapping.target_expr(expr) {
+        if let Some(targets) = mapping.get(expr) {
             // If we match the source, we can project directly:
-            return Some(target);
+            let (target, _) = targets.first();
+            Some(Arc::clone(target))
         } else {
-            // If the given expression is not inside the mapping, try to project
-            // expressions considering the equivalence classes.
-            for (source, target) in mapping.iter() {
-                // If we match an equivalent expression to `source`, then we can
-                // project. For example, if we have the mapping `(a as a1, a + c)`
-                // and the equivalence class `(a, b)`, expression `b` projects to `a1`.
-                if self
-                    .get_equivalence_class(source)
-                    .is_some_and(|group| group.contains(expr))
-                {
-                    return Some(Arc::clone(target));
-                }
-            }
-        }
-        // Project a non-leaf expression by projecting its children.
-        let children = expr.children();
-        if children.is_empty() {
-            // Leaf expression should be inside mapping.
-            return None;
+            let aug_mapping = self.augment_projection_mapping(mapping);
+            Self::project_expr_indirect(&aug_mapping, expr)
         }
-        children
-            .into_iter()
-            .map(|child| self.project_expr(mapping, child))
-            .collect::<Option<Vec<_>>>()
-            .map(|children| Arc::clone(expr).with_new_children(children).unwrap())
+    }
+
+    /// Projects `expressions` according to the given projection mapping.
+    /// This function is similar to [`Self::project_expr`], but projects multiple
+    /// expressions at once more efficiently than calling `project_expr` for each
+    /// expression.
+    pub fn project_expressions<'a>(
+        &'a self,
+        mapping: &'a ProjectionMapping,
+        expressions: impl IntoIterator<Item = &'a Arc<dyn PhysicalExpr>> + 'a,
+    ) -> impl Iterator<Item = Option<Arc<dyn PhysicalExpr>>> + 'a {
+        let mut aug_mapping = None;
+        expressions.into_iter().map(move |expr| {
+            if let Some(targets) = mapping.get(expr) {
+                // If we match the source, we can project directly:
+                let (target, _) = targets.first();
+                Some(Arc::clone(target))
+            } else {
+                let aug_mapping = aug_mapping
+                    .get_or_insert_with(|| self.augment_projection_mapping(mapping));
+                Self::project_expr_indirect(aug_mapping, expr)
+            }
+        })
     }
 
     /// Projects this equivalence group according to the given projection mapping.
     pub fn project(&self, mapping: &ProjectionMapping) -> Self {
-        let projected_classes = self.iter().filter_map(|cls| {
-            let new_class = cls
-                .iter()
-                .filter_map(|expr| self.project_expr(mapping, expr))
-                .collect::<Vec<_>>();
-            (new_class.len() > 1).then_some(EquivalenceClass::new(new_class))
+        let projected_classes = self.iter().map(|cls| {
+            let new_exprs = self.project_expressions(mapping, cls.iter());
+            EquivalenceClass::new(new_exprs.flatten())
         });
 
         // The key is the source expression, and the value is the equivalence
         // class that contains the corresponding target expression.
-        let mut new_classes: IndexMap<_, _> = IndexMap::new();
-        for (source, target) in mapping.iter() {
+        let mut new_constants = vec![];
+        let mut new_classes = IndexMap::<_, EquivalenceClass>::new();
+        for (source, targets) in mapping.iter() {
             // We need to find equivalent projected expressions. For example,
             // consider a table with columns `[a, b, c]` with `a` == `b`, and
             // projection `[a + c, b + c]`. To conclude that `a + c == b + c`,
             // we first normalize all source expressions in the mapping, then
             // merge all equivalent expressions into the classes.
             let normalized_expr = self.normalize_expr(Arc::clone(source));
-            new_classes
-                .entry(normalized_expr)
-                .or_insert_with(EquivalenceClass::new_empty)
-                .push(Arc::clone(target));
+            let cls = new_classes.entry(normalized_expr).or_default();
+            for (target, _) in targets.iter() {
+                cls.push(Arc::clone(target));
+            }
+            // Save new constants arising from the projection:
+            if let Some(across) = self.is_expr_constant(source) {
+                for (target, _) in targets.iter() {
+                    let const_expr = ConstExpr::new(Arc::clone(target), across.clone());
+                    new_constants.push(const_expr);
+                }
+            }
         }
-        // Only add equivalence classes with at least two members as singleton
-        // equivalence classes are meaningless.
-        let new_classes = new_classes
-            .into_iter()
-            .filter_map(|(_, cls)| (cls.len() > 1).then_some(cls));
 
-        let classes = projected_classes.chain(new_classes).collect();
-        Self::new(classes)
+        // Union projected classes with new classes to make up the result:
+        let classes = projected_classes
+            .chain(new_classes.into_values())
+            .filter(|cls| !cls.is_trivial());
+        let mut result = Self::new(classes);
+        // Add new constants arising from the projection to the equivalence group:
+        for constant in new_constants {
+            result.add_constant(constant);
+        }
+        result
+    }
+
+    /// Returns a `Some` value if the expression is constant according to
+    /// equivalence group, and `None` otherwise. The `Some` variant contains
+    /// an `AcrossPartitions` value indicating whether the expression is
+    /// constant across partitions, and its actual value (if available).
+    pub fn is_expr_constant(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> Option<AcrossPartitions> {
+        if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
+            return Some(AcrossPartitions::Uniform(Some(lit.value().clone())));
+        }
+        if let Some(cls) = self.get_equivalence_class(expr)
+            && cls.constant.is_some()
+        {
+            return cls.constant.clone();
+        }
+        // TODO: This function should be able to return values of non-literal
+        //       complex constants as well; e.g. it should return `8` for the
+        //       expression `3 + 5`, not an unknown `heterogenous` value.
+        let children = expr.children();
+        if children.is_empty() {
+            return None;
+        }
+        for child in children {
+            self.is_expr_constant(child)?;
+        }
+        Some(AcrossPartitions::Heterogeneous)
     }
 
     /// Returns the equivalence class containing `expr`. If no equivalence class
     /// contains `expr`, returns `None`.
-    fn get_equivalence_class(
+    pub fn get_equivalence_class(
         &self,
         expr: &Arc<dyn PhysicalExpr>,
     ) -> Option<&EquivalenceClass> {
-        self.iter().find(|cls| cls.contains(expr))
+        self.map.get(expr).map(|idx| &self.classes[*idx])
     }
 
     /// Combine equivalence groups of the given join children.
@@ -628,18 +771,16 @@ impl EquivalenceGroup {
         join_type: &JoinType,
         left_size: usize,
         on: &[(PhysicalExprRef, PhysicalExprRef)],
-    ) -> Self {
-        match join_type {
+    ) -> Result<Self> {
+        let group = match join_type {
             JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => {
                 let mut result = Self::new(
-                    self.iter()
-                        .cloned()
-                        .chain(
-                            right_equivalences
-                                .iter()
-                                .map(|cls| cls.with_offset(left_size)),
-                        )
-                        .collect(),
+                    self.iter().cloned().chain(
+                        right_equivalences
+                            .iter()
+                            .map(|cls| cls.try_with_offset(left_size as _))
+                            .collect::<Result<Vec<_>>>()?,
+                    ),
                 );
                 // In we have an inner join, expressions in the "on" condition
                 // are equal in the resulting table.
@@ -647,36 +788,25 @@ impl EquivalenceGroup {
                     for (lhs, rhs) in on.iter() {
                         let new_lhs = Arc::clone(lhs);
                         // Rewrite rhs to point to the right side of the join:
-                        let new_rhs = Arc::clone(rhs)
-                            .transform(|expr| {
-                                if let Some(column) =
-                                    expr.as_any().downcast_ref::<Column>()
-                                {
-                                    let new_column = Arc::new(Column::new(
-                                        column.name(),
-                                        column.index() + left_size,
-                                    ))
-                                        as _;
-                                    return Ok(Transformed::yes(new_column));
-                                }
-
-                                Ok(Transformed::no(expr))
-                            })
-                            .data()
-                            .unwrap();
-                        result.add_equal_conditions(&new_lhs, &new_rhs);
+                        let new_rhs =
+                            add_offset_to_expr(Arc::clone(rhs), left_size as _)?;
+                        result.add_equal_conditions(new_lhs, new_rhs);
                     }
                 }
                 result
             }
             JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => self.clone(),
-            JoinType::RightSemi | JoinType::RightAnti => right_equivalences.clone(),
-        }
+            JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+                right_equivalences.clone()
+            }
+        };
+        Ok(group)
     }
 
-    /// Checks if two expressions are equal either directly or through equivalence classes.
-    /// For complex expressions (e.g. a + b), checks that the expression trees are structurally
-    /// identical and their leaf nodes are equivalent either directly or through equivalence classes.
+    /// Checks if two expressions are equal directly or through equivalence
+    /// classes. For complex expressions (e.g. `a + b`), checks that the
+    /// expression trees are structurally identical and their leaf nodes are
+    /// equivalent either directly or through equivalence classes.
     pub fn exprs_equal(
         &self,
         left: &Arc<dyn PhysicalExpr>,
@@ -689,15 +819,15 @@ impl EquivalenceGroup {
 
         // Check if expressions are equivalent through equivalence classes
         // We need to check both directions since expressions might be in different classes
-        if let Some(left_class) = self.get_equivalence_class(left) {
-            if left_class.contains(right) {
-                return true;
-            }
+        if let Some(left_class) = self.get_equivalence_class(left)
+            && left_class.contains(right)
+        {
+            return true;
         }
-        if let Some(right_class) = self.get_equivalence_class(right) {
-            if right_class.contains(left) {
-                return true;
-            }
+        if let Some(right_class) = self.get_equivalence_class(right)
+            && right_class.contains(left)
+        {
+            return true;
         }
 
         // For non-leaf nodes, check structural equality
@@ -726,16 +856,19 @@ impl EquivalenceGroup {
             .zip(right_children)
             .all(|(left_child, right_child)| self.exprs_equal(left_child, right_child))
     }
+}
 
-    /// Return the inner classes of this equivalence group.
-    pub fn into_inner(self) -> Vec<EquivalenceClass> {
-        self.classes
+impl Deref for EquivalenceGroup {
+    type Target = [EquivalenceClass];
+
+    fn deref(&self) -> &Self::Target {
+        &self.classes
     }
 }
 
 impl IntoIterator for EquivalenceGroup {
     type Item = EquivalenceClass;
-    type IntoIter = IntoIter<EquivalenceClass>;
+    type IntoIter = IntoIter<Self::Item>;
 
     fn into_iter(self) -> Self::IntoIter {
         self.classes.into_iter()
@@ -756,11 +889,28 @@ impl Display for EquivalenceGroup {
     }
 }
 
+impl From<Vec<EquivalenceClass>> for EquivalenceGroup {
+    fn from(classes: Vec<EquivalenceClass>) -> Self {
+        let mut result = Self {
+            map: classes
+                .iter()
+                .enumerate()
+                .flat_map(|(idx, cls)| {
+                    cls.iter().map(move |expr| (Arc::clone(expr), idx))
+                })
+                .collect(),
+            classes,
+        };
+        result.remove_redundant_entries();
+        result
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::equivalence::tests::create_test_params;
-    use crate::expressions::{binary, col, lit, BinaryExpr, Literal};
+    use crate::expressions::{BinaryExpr, Column, Literal, binary, col, lit};
     use arrow::datatypes::{DataType, Field, Schema};
 
     use datafusion_common::{Result, ScalarValue};
@@ -786,16 +936,25 @@ mod tests {
         for (entries, expected) in test_cases {
             let entries = entries
                 .into_iter()
-                .map(|entry| entry.into_iter().map(lit).collect::<Vec<_>>())
+                .map(|entry| {
+                    entry.into_iter().map(|idx| {
+                        let c = Column::new(format!("col_{idx}").as_str(), idx);
+                        Arc::new(c) as _
+                    })
+                })
                 .map(EquivalenceClass::new)
                 .collect::<Vec<_>>();
             let expected = expected
                 .into_iter()
-                .map(|entry| entry.into_iter().map(lit).collect::<Vec<_>>())
+                .map(|entry| {
+                    entry.into_iter().map(|idx| {
+                        let c = Column::new(format!("col_{idx}").as_str(), idx);
+                        Arc::new(c) as _
+                    })
+                })
                 .map(EquivalenceClass::new)
                 .collect::<Vec<_>>();
-            let mut eq_groups = EquivalenceGroup::new(entries.clone());
-            eq_groups.bridge_classes();
+            let eq_groups: EquivalenceGroup = entries.clone().into();
             let eq_groups = eq_groups.classes;
             let err_msg = format!(
                 "error in test entries: {entries:?}, expected: {expected:?}, actual:{eq_groups:?}"
@@ -810,58 +969,45 @@ mod tests {
 
     #[test]
     fn test_remove_redundant_entries_eq_group() -> Result<()> {
+        let c = |idx| Arc::new(Column::new(format!("col_{idx}").as_str(), idx)) as _;
         let entries = [
-            EquivalenceClass::new(vec![lit(1), lit(1), lit(2)]),
-            // This group is meaningless should be removed
-            EquivalenceClass::new(vec![lit(3), lit(3)]),
-            EquivalenceClass::new(vec![lit(4), lit(5), lit(6)]),
+            EquivalenceClass::new([c(1), c(1), lit(20)]),
+            EquivalenceClass::new([lit(30), lit(30)]),
+            EquivalenceClass::new([c(2), c(3), c(4)]),
         ];
         // Given equivalences classes are not in succinct form.
         // Expected form is the most plain representation that is functionally same.
         let expected = [
-            EquivalenceClass::new(vec![lit(1), lit(2)]),
-            EquivalenceClass::new(vec![lit(4), lit(5), lit(6)]),
+            EquivalenceClass::new([c(1), lit(20)]),
+            EquivalenceClass::new([lit(30)]),
+            EquivalenceClass::new([c(2), c(3), c(4)]),
         ];
-        let mut eq_groups = EquivalenceGroup::new(entries.to_vec());
-        eq_groups.remove_redundant_entries();
-
-        let eq_groups = eq_groups.classes;
-        assert_eq!(eq_groups.len(), expected.len());
-        assert_eq!(eq_groups.len(), 2);
-
-        assert_eq!(eq_groups[0], expected[0]);
-        assert_eq!(eq_groups[1], expected[1]);
+        let eq_groups = EquivalenceGroup::new(entries);
+        assert_eq!(eq_groups.classes, expected);
         Ok(())
     }
 
     #[test]
     fn test_schema_normalize_expr_with_equivalence() -> Result<()> {
-        let col_a = &Column::new("a", 0);
-        let col_b = &Column::new("b", 1);
-        let col_c = &Column::new("c", 2);
+        let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
+        let col_b = Arc::new(Column::new("b", 1)) as _;
+        let col_c = Arc::new(Column::new("c", 2)) as _;
         // Assume that column a and c are aliases.
-        let (_test_schema, eq_properties) = create_test_params()?;
-
-        let col_a_expr = Arc::new(col_a.clone()) as Arc<dyn PhysicalExpr>;
-        let col_b_expr = Arc::new(col_b.clone()) as Arc<dyn PhysicalExpr>;
-        let col_c_expr = Arc::new(col_c.clone()) as Arc<dyn PhysicalExpr>;
-        // Test cases for equivalence normalization,
-        // First entry in the tuple is argument, second entry is expected result after normalization.
+        let (_, eq_properties) = create_test_params()?;
+        // Test cases for equivalence normalization. First entry in the tuple is
+        // the argument, second entry is expected result after normalization.
         let expressions = vec![
             // Normalized version of the column a and c should go to a
             // (by convention all the expressions inside equivalence class are mapped to the first entry
             // in this case a is the first entry in the equivalence class.)
-            (&col_a_expr, &col_a_expr),
-            (&col_c_expr, &col_a_expr),
+            (Arc::clone(&col_a), Arc::clone(&col_a)),
+            (col_c, col_a),
             // Cannot normalize column b
-            (&col_b_expr, &col_b_expr),
+            (Arc::clone(&col_b), Arc::clone(&col_b)),
         ];
         let eq_group = eq_properties.eq_group();
         for (expr, expected_eq) in expressions {
-            assert!(
-                expected_eq.eq(&eq_group.normalize_expr(Arc::clone(expr))),
-                "error in test: expr: {expr:?}"
-            );
+            assert!(expected_eq.eq(&eq_group.normalize_expr(expr)));
         }
 
         Ok(())
@@ -869,21 +1015,15 @@ mod tests {
 
     #[test]
     fn test_contains_any() {
-        let lit_true = Arc::new(Literal::new(ScalarValue::Boolean(Some(true))))
-            as Arc<dyn PhysicalExpr>;
-        let lit_false = Arc::new(Literal::new(ScalarValue::Boolean(Some(false))))
-            as Arc<dyn PhysicalExpr>;
-        let lit2 =
-            Arc::new(Literal::new(ScalarValue::Int32(Some(2)))) as Arc<dyn PhysicalExpr>;
-        let lit1 =
-            Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>;
-        let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
-
-        let cls1 =
-            EquivalenceClass::new(vec![Arc::clone(&lit_true), Arc::clone(&lit_false)]);
-        let cls2 =
-            EquivalenceClass::new(vec![Arc::clone(&lit_true), Arc::clone(&col_b_expr)]);
-        let cls3 = EquivalenceClass::new(vec![Arc::clone(&lit2), Arc::clone(&lit1)]);
+        let lit_true = Arc::new(Literal::new(ScalarValue::from(true))) as _;
+        let lit_false = Arc::new(Literal::new(ScalarValue::from(false))) as _;
+        let col_a_expr = Arc::new(Column::new("a", 0)) as _;
+        let col_b_expr = Arc::new(Column::new("b", 1)) as _;
+        let col_c_expr = Arc::new(Column::new("c", 2)) as _;
+
+        let cls1 = EquivalenceClass::new([Arc::clone(&lit_true), col_a_expr]);
+        let cls2 = EquivalenceClass::new([lit_true, col_b_expr]);
+        let cls3 = EquivalenceClass::new([col_c_expr, lit_false]);
 
         // lit_true is common
         assert!(cls1.contains_any(&cls2));
@@ -902,21 +1042,19 @@ mod tests {
         }
 
         // Create test columns
-        let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
-        let col_b = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
-        let col_x = Arc::new(Column::new("x", 2)) as Arc<dyn PhysicalExpr>;
-        let col_y = Arc::new(Column::new("y", 3)) as Arc<dyn PhysicalExpr>;
+        let col_a = Arc::new(Column::new("a", 0)) as _;
+        let col_b = Arc::new(Column::new("b", 1)) as _;
+        let col_x = Arc::new(Column::new("x", 2)) as _;
+        let col_y = Arc::new(Column::new("y", 3)) as _;
 
         // Create test literals
-        let lit_1 =
-            Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>;
-        let lit_2 =
-            Arc::new(Literal::new(ScalarValue::Int32(Some(2)))) as Arc<dyn PhysicalExpr>;
+        let lit_1 = Arc::new(Literal::new(ScalarValue::from(1))) as _;
+        let lit_2 = Arc::new(Literal::new(ScalarValue::from(2))) as _;
 
         // Create equivalence group with classes (a = x) and (b = y)
-        let eq_group = EquivalenceGroup::new(vec![
-            EquivalenceClass::new(vec![Arc::clone(&col_a), Arc::clone(&col_x)]),
-            EquivalenceClass::new(vec![Arc::clone(&col_b), Arc::clone(&col_y)]),
+        let eq_group = EquivalenceGroup::new([
+            EquivalenceClass::new([Arc::clone(&col_a), Arc::clone(&col_x)]),
+            EquivalenceClass::new([Arc::clone(&col_b), Arc::clone(&col_y)]),
         ]);
 
         let test_cases = vec![
@@ -944,8 +1082,7 @@ mod tests {
                 left: Arc::clone(&col_a),
                 right: Arc::clone(&col_b),
                 expected: false,
-                description:
-                    "Columns in different equivalence classes should not be equal",
+                description: "Columns in different equivalence classes should not be equal",
             },
             // Literal tests
             TestCase {
@@ -966,42 +1103,40 @@ mod tests {
                     Arc::clone(&col_a),
                     Operator::Plus,
                     Arc::clone(&col_b),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 right: Arc::new(BinaryExpr::new(
                     Arc::clone(&col_x),
                     Operator::Plus,
                     Arc::clone(&col_y),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 expected: true,
-                description:
-                    "Binary expressions with equivalent operands should be equal",
+                description: "Binary expressions with equivalent operands should be equal",
             },
             TestCase {
                 left: Arc::new(BinaryExpr::new(
                     Arc::clone(&col_a),
                     Operator::Plus,
                     Arc::clone(&col_b),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 right: Arc::new(BinaryExpr::new(
                     Arc::clone(&col_x),
                     Operator::Plus,
                     Arc::clone(&col_a),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 expected: false,
-                description:
-                    "Binary expressions with non-equivalent operands should not be equal",
+                description: "Binary expressions with non-equivalent operands should not be equal",
             },
             TestCase {
                 left: Arc::new(BinaryExpr::new(
                     Arc::clone(&col_a),
                     Operator::Plus,
                     Arc::clone(&lit_1),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 right: Arc::new(BinaryExpr::new(
                     Arc::clone(&col_x),
                     Operator::Plus,
                     Arc::clone(&lit_1),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 expected: true,
                 description: "Binary expressions with equivalent column and same literal should be equal",
             },
@@ -1014,7 +1149,7 @@ mod tests {
                     )),
                     Operator::Multiply,
                     Arc::clone(&lit_1),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 right: Arc::new(BinaryExpr::new(
                     Arc::new(BinaryExpr::new(
                         Arc::clone(&col_x),
@@ -1023,7 +1158,7 @@ mod tests {
                     )),
                     Operator::Multiply,
                     Arc::clone(&lit_1),
-                )) as Arc<dyn PhysicalExpr>,
+                )) as _,
                 expected: true,
                 description: "Nested binary expressions with equivalent operands should be equal",
             },
@@ -1057,36 +1192,36 @@ mod tests {
             Field::new("b", DataType::Int32, false),
             Field::new("c", DataType::Int32, false),
         ]));
-        let mut group = EquivalenceGroup::empty();
-        group.add_equal_conditions(&col("a", &schema)?, &col("b", &schema)?);
+        let mut group = EquivalenceGroup::default();
+        group.add_equal_conditions(col("a", &schema)?, col("b", &schema)?);
 
         let projected_schema = Arc::new(Schema::new(vec![
             Field::new("a+c", DataType::Int32, false),
             Field::new("b+c", DataType::Int32, false),
         ]));
 
-        let mapping = ProjectionMapping {
-            map: vec![
-                (
-                    binary(
-                        col("a", &schema)?,
-                        Operator::Plus,
-                        col("c", &schema)?,
-                        &schema,
-                    )?,
-                    col("a+c", &projected_schema)?,
-                ),
-                (
-                    binary(
-                        col("b", &schema)?,
-                        Operator::Plus,
-                        col("c", &schema)?,
-                        &schema,
-                    )?,
-                    col("b+c", &projected_schema)?,
-                ),
-            ],
-        };
+        let mapping = [
+            (
+                binary(
+                    col("a", &schema)?,
+                    Operator::Plus,
+                    col("c", &schema)?,
+                    &schema,
+                )?,
+                vec![(col("a+c", &projected_schema)?, 0)].into(),
+            ),
+            (
+                binary(
+                    col("b", &schema)?,
+                    Operator::Plus,
+                    col("c", &schema)?,
+                    &schema,
+                )?,
+                vec![(col("b+c", &projected_schema)?, 1)].into(),
+            ),
+        ]
+        .into_iter()
+        .collect::<ProjectionMapping>();
 
         let projected = group.project(&mapping);
 
diff --git a/datafusion/physical-expr/src/equivalence/mod.rs b/datafusion/physical-expr/src/equivalence/mod.rs
index ef98b48122658..0d6699c7101fe 100644
--- a/datafusion/physical-expr/src/equivalence/mod.rs
+++ b/datafusion/physical-expr/src/equivalence/mod.rs
@@ -15,69 +15,55 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::borrow::Borrow;
 use std::sync::Arc;
 
-use crate::expressions::Column;
-use crate::{LexRequirement, PhysicalExpr};
+use crate::PhysicalExpr;
 
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use arrow::compute::SortOptions;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
 
 mod class;
 mod ordering;
-mod projection;
 mod properties;
 
 pub use class::{AcrossPartitions, ConstExpr, EquivalenceClass, EquivalenceGroup};
 pub use ordering::OrderingEquivalenceClass;
-pub use projection::ProjectionMapping;
+// Re-export for backwards compatibility, we recommend importing from
+// datafusion_physical_expr::projection instead
+pub use crate::projection::{ProjectionMapping, project_ordering, project_orderings};
 pub use properties::{
-    calculate_union, join_equivalence_properties, EquivalenceProperties,
+    EquivalenceProperties, calculate_union, join_equivalence_properties,
 };
 
-/// This function constructs a duplicate-free `LexOrderingReq` by filtering out
-/// duplicate entries that have same physical expression inside. For example,
-/// `vec![a Some(ASC), a Some(DESC)]` collapses to `vec![a Some(ASC)]`.
-///
-/// It will also filter out entries that are ordered if the next entry is;
-/// for instance, `vec![floor(a) Some(ASC), a Some(ASC)]` will be collapsed to
-/// `vec![a Some(ASC)]`.
-#[deprecated(since = "45.0.0", note = "Use LexRequirement::collapse")]
-pub fn collapse_lex_req(input: LexRequirement) -> LexRequirement {
-    input.collapse()
+// Convert each tuple to a `PhysicalSortExpr` and construct a vector.
+pub fn convert_to_sort_exprs<T: Borrow<Arc<dyn PhysicalExpr>>>(
+    args: &[(T, SortOptions)],
+) -> Vec<PhysicalSortExpr> {
+    args.iter()
+        .map(|(expr, options)| PhysicalSortExpr::new(Arc::clone(expr.borrow()), *options))
+        .collect()
 }
 
-/// Adds the `offset` value to `Column` indices inside `expr`. This function is
-/// generally used during the update of the right table schema in join operations.
-pub fn add_offset_to_expr(
-    expr: Arc<dyn PhysicalExpr>,
-    offset: usize,
-) -> Arc<dyn PhysicalExpr> {
-    expr.transform_down(|e| match e.as_any().downcast_ref::<Column>() {
-        Some(col) => Ok(Transformed::yes(Arc::new(Column::new(
-            col.name(),
-            offset + col.index(),
-        )))),
-        None => Ok(Transformed::no(e)),
-    })
-    .data()
-    .unwrap()
-    // Note that we can safely unwrap here since our transform always returns
-    // an `Ok` value.
+// Convert each vector of tuples to a `LexOrdering`.
+pub fn convert_to_orderings<T: Borrow<Arc<dyn PhysicalExpr>>>(
+    args: &[Vec<(T, SortOptions)>],
+) -> Vec<LexOrdering> {
+    args.iter()
+        .filter_map(|sort_exprs| LexOrdering::new(convert_to_sort_exprs(sort_exprs)))
+        .collect()
 }
 
 #[cfg(test)]
 mod tests {
-
     use super::*;
-    use crate::expressions::col;
-    use crate::PhysicalSortExpr;
+    use crate::expressions::{Column, col};
+    use crate::{LexRequirement, PhysicalSortExpr};
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-    use datafusion_common::{plan_datafusion_err, Result};
-    use datafusion_physical_expr_common::sort_expr::{
-        LexOrdering, PhysicalSortRequirement,
-    };
+    use datafusion_common::Result;
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortRequirement;
 
     /// Converts a string to a physical sort expression
     ///
@@ -110,37 +96,6 @@ mod tests {
         sort_expr
     }
 
-    pub fn output_schema(
-        mapping: &ProjectionMapping,
-        input_schema: &Arc<Schema>,
-    ) -> Result<SchemaRef> {
-        // Calculate output schema
-        let fields: Result<Vec<Field>> = mapping
-            .iter()
-            .map(|(source, target)| {
-                let name = target
-                    .as_any()
-                    .downcast_ref::<Column>()
-                    .ok_or_else(|| plan_datafusion_err!("Expects to have column"))?
-                    .name();
-                let field = Field::new(
-                    name,
-                    source.data_type(input_schema)?,
-                    source.nullable(input_schema)?,
-                );
-
-                Ok(field)
-            })
-            .collect();
-
-        let output_schema = Arc::new(Schema::new_with_metadata(
-            fields?,
-            input_schema.metadata().clone(),
-        ));
-
-        Ok(output_schema)
-    }
-
     // Generate a schema which consists of 8 columns (a, b, c, d, e, f, g, h)
     pub fn create_test_schema() -> Result<SchemaRef> {
         let a = Field::new("a", DataType::Int32, true);
@@ -163,15 +118,15 @@ mod tests {
     /// Column [a=c] (e.g they are aliases).
     pub fn create_test_params() -> Result<(SchemaRef, EquivalenceProperties)> {
         let test_schema = create_test_schema()?;
-        let col_a = &col("a", &test_schema)?;
-        let col_b = &col("b", &test_schema)?;
-        let col_c = &col("c", &test_schema)?;
-        let col_d = &col("d", &test_schema)?;
-        let col_e = &col("e", &test_schema)?;
-        let col_f = &col("f", &test_schema)?;
-        let col_g = &col("g", &test_schema)?;
+        let col_a = col("a", &test_schema)?;
+        let col_b = col("b", &test_schema)?;
+        let col_c = col("c", &test_schema)?;
+        let col_d = col("d", &test_schema)?;
+        let col_e = col("e", &test_schema)?;
+        let col_f = col("f", &test_schema)?;
+        let col_g = col("g", &test_schema)?;
         let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
-        eq_properties.add_equal_conditions(col_a, col_c)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_a), Arc::clone(&col_c))?;
 
         let option_asc = SortOptions {
             descending: false,
@@ -194,68 +149,19 @@ mod tests {
             ],
         ];
         let orderings = convert_to_orderings(&orderings);
-        eq_properties.add_new_orderings(orderings);
+        eq_properties.add_orderings(orderings);
         Ok((test_schema, eq_properties))
     }
 
-    // Convert each tuple to PhysicalSortRequirement
+    // Convert each tuple to a `PhysicalSortRequirement` and construct a
+    // a `LexRequirement` from them.
     pub fn convert_to_sort_reqs(
-        in_data: &[(&Arc<dyn PhysicalExpr>, Option<SortOptions>)],
+        args: &[(&Arc<dyn PhysicalExpr>, Option<SortOptions>)],
     ) -> LexRequirement {
-        in_data
-            .iter()
-            .map(|(expr, options)| {
-                PhysicalSortRequirement::new(Arc::clone(*expr), *options)
-            })
-            .collect()
-    }
-
-    // Convert each tuple to PhysicalSortExpr
-    pub fn convert_to_sort_exprs(
-        in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
-    ) -> LexOrdering {
-        in_data
-            .iter()
-            .map(|(expr, options)| PhysicalSortExpr {
-                expr: Arc::clone(*expr),
-                options: *options,
-            })
-            .collect()
-    }
-
-    // Convert each inner tuple to PhysicalSortExpr
-    pub fn convert_to_orderings(
-        orderings: &[Vec<(&Arc<dyn PhysicalExpr>, SortOptions)>],
-    ) -> Vec<LexOrdering> {
-        orderings
-            .iter()
-            .map(|sort_exprs| convert_to_sort_exprs(sort_exprs))
-            .collect()
-    }
-
-    // Convert each tuple to PhysicalSortExpr
-    pub fn convert_to_sort_exprs_owned(
-        in_data: &[(Arc<dyn PhysicalExpr>, SortOptions)],
-    ) -> LexOrdering {
-        LexOrdering::new(
-            in_data
-                .iter()
-                .map(|(expr, options)| PhysicalSortExpr {
-                    expr: Arc::clone(expr),
-                    options: *options,
-                })
-                .collect(),
-        )
-    }
-
-    // Convert each inner tuple to PhysicalSortExpr
-    pub fn convert_to_orderings_owned(
-        orderings: &[Vec<(Arc<dyn PhysicalExpr>, SortOptions)>],
-    ) -> Vec<LexOrdering> {
-        orderings
-            .iter()
-            .map(|sort_exprs| convert_to_sort_exprs_owned(sort_exprs))
-            .collect()
+        let exprs = args.iter().map(|(expr, options)| {
+            PhysicalSortRequirement::new(Arc::clone(*expr), *options)
+        });
+        LexRequirement::new(exprs).unwrap()
     }
 
     #[test]
@@ -269,49 +175,49 @@ mod tests {
         ]));
 
         let mut eq_properties = EquivalenceProperties::new(schema);
-        let col_a_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
-        let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
-        let col_c_expr = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
-        let col_x_expr = Arc::new(Column::new("x", 3)) as Arc<dyn PhysicalExpr>;
-        let col_y_expr = Arc::new(Column::new("y", 4)) as Arc<dyn PhysicalExpr>;
+        let col_a = Arc::new(Column::new("a", 0)) as _;
+        let col_b = Arc::new(Column::new("b", 1)) as _;
+        let col_c = Arc::new(Column::new("c", 2)) as _;
+        let col_x = Arc::new(Column::new("x", 3)) as _;
+        let col_y = Arc::new(Column::new("y", 4)) as _;
 
         // a and b are aliases
-        eq_properties.add_equal_conditions(&col_a_expr, &col_b_expr)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_a), Arc::clone(&col_b))?;
         assert_eq!(eq_properties.eq_group().len(), 1);
 
         // This new entry is redundant, size shouldn't increase
-        eq_properties.add_equal_conditions(&col_b_expr, &col_a_expr)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_a))?;
         assert_eq!(eq_properties.eq_group().len(), 1);
         let eq_groups = eq_properties.eq_group().iter().next().unwrap();
         assert_eq!(eq_groups.len(), 2);
-        assert!(eq_groups.contains(&col_a_expr));
-        assert!(eq_groups.contains(&col_b_expr));
+        assert!(eq_groups.contains(&col_a));
+        assert!(eq_groups.contains(&col_b));
 
         // b and c are aliases. Existing equivalence class should expand,
         // however there shouldn't be any new equivalence class
-        eq_properties.add_equal_conditions(&col_b_expr, &col_c_expr)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_c))?;
         assert_eq!(eq_properties.eq_group().len(), 1);
         let eq_groups = eq_properties.eq_group().iter().next().unwrap();
         assert_eq!(eq_groups.len(), 3);
-        assert!(eq_groups.contains(&col_a_expr));
-        assert!(eq_groups.contains(&col_b_expr));
-        assert!(eq_groups.contains(&col_c_expr));
+        assert!(eq_groups.contains(&col_a));
+        assert!(eq_groups.contains(&col_b));
+        assert!(eq_groups.contains(&col_c));
 
         // This is a new set of equality. Hence equivalent class count should be 2.
-        eq_properties.add_equal_conditions(&col_x_expr, &col_y_expr)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_x), Arc::clone(&col_y))?;
         assert_eq!(eq_properties.eq_group().len(), 2);
 
         // This equality bridges distinct equality sets.
         // Hence equivalent class count should decrease from 2 to 1.
-        eq_properties.add_equal_conditions(&col_x_expr, &col_a_expr)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_x), Arc::clone(&col_a))?;
         assert_eq!(eq_properties.eq_group().len(), 1);
         let eq_groups = eq_properties.eq_group().iter().next().unwrap();
         assert_eq!(eq_groups.len(), 5);
-        assert!(eq_groups.contains(&col_a_expr));
-        assert!(eq_groups.contains(&col_b_expr));
-        assert!(eq_groups.contains(&col_c_expr));
-        assert!(eq_groups.contains(&col_x_expr));
-        assert!(eq_groups.contains(&col_y_expr));
+        assert!(eq_groups.contains(&col_a));
+        assert!(eq_groups.contains(&col_b));
+        assert!(eq_groups.contains(&col_c));
+        assert!(eq_groups.contains(&col_x));
+        assert!(eq_groups.contains(&col_y));
 
         Ok(())
     }
diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs
index 819f8905bda51..2ce8a8d246fe7 100644
--- a/datafusion/physical-expr/src/equivalence/ordering.rs
+++ b/datafusion/physical-expr/src/equivalence/ordering.rs
@@ -16,115 +16,83 @@
 // under the License.
 
 use std::fmt::Display;
-use std::hash::Hash;
+use std::ops::Deref;
 use std::sync::Arc;
 use std::vec::IntoIter;
 
-use crate::equivalence::add_offset_to_expr;
-use crate::{LexOrdering, PhysicalExpr};
+use crate::expressions::with_new_schema;
+use crate::{LexOrdering, PhysicalExpr, add_offset_to_physical_sort_exprs};
 
 use arrow::compute::SortOptions;
-use datafusion_common::HashSet;
+use arrow::datatypes::SchemaRef;
+use datafusion_common::{HashSet, Result};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
-/// An `OrderingEquivalenceClass` object keeps track of different alternative
-/// orderings than can describe a schema. For example, consider the following table:
+/// An `OrderingEquivalenceClass` keeps track of distinct alternative orderings
+/// than can describe a table. For example, consider the following table:
 ///
 /// ```text
-/// |a|b|c|d|
-/// |1|4|3|1|
-/// |2|3|3|2|
-/// |3|1|2|2|
-/// |3|2|1|3|
+/// ┌───┬───┬───┬───┐
+/// │ a │ b │ c │ d │
+/// ├───┼───┼───┼───┤
+/// │ 1 │ 4 │ 3 │ 1 │
+/// │ 2 │ 3 │ 3 │ 2 │
+/// │ 3 │ 1 │ 2 │ 2 │
+/// │ 3 │ 2 │ 1 │ 3 │
+/// └───┴───┴───┴───┘
 /// ```
 ///
-/// Here, both `vec![a ASC, b ASC]` and `vec![c DESC, d ASC]` describe the table
+/// Here, both `[a ASC, b ASC]` and `[c DESC, d ASC]` describe the table
 /// ordering. In this case, we say that these orderings are equivalent.
-#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)]
+///
+/// An `OrderingEquivalenceClass` is a set of such equivalent orderings, which
+/// is represented by a vector of `LexOrdering`s. The set does not store any
+/// redundant information by enforcing the invariant that no suffix of an
+/// ordering in the equivalence class is a prefix of another ordering in the
+/// equivalence class. The set can be empty, which means that there are no
+/// orderings that describe the table.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
 pub struct OrderingEquivalenceClass {
     orderings: Vec<LexOrdering>,
 }
 
 impl OrderingEquivalenceClass {
-    /// Creates new empty ordering equivalence class.
-    pub fn empty() -> Self {
-        Default::default()
-    }
-
     /// Clears (empties) this ordering equivalence class.
     pub fn clear(&mut self) {
         self.orderings.clear();
     }
 
-    /// Creates new ordering equivalence class from the given orderings
-    ///
-    /// Any redundant entries are removed
-    pub fn new(orderings: Vec<LexOrdering>) -> Self {
-        let mut result = Self { orderings };
+    /// Creates a new ordering equivalence class from the given orderings
+    /// and removes any redundant entries (if given).
+    pub fn new(
+        orderings: impl IntoIterator<Item = impl IntoIterator<Item = PhysicalSortExpr>>,
+    ) -> Self {
+        let mut result = Self {
+            orderings: orderings.into_iter().filter_map(LexOrdering::new).collect(),
+        };
         result.remove_redundant_entries();
         result
     }
 
-    /// Converts this OrderingEquivalenceClass to a vector of orderings.
-    pub fn into_inner(self) -> Vec<LexOrdering> {
-        self.orderings
-    }
-
-    /// Checks whether `ordering` is a member of this equivalence class.
-    pub fn contains(&self, ordering: &LexOrdering) -> bool {
-        self.orderings.contains(ordering)
-    }
-
-    /// Adds `ordering` to this equivalence class.
-    #[allow(dead_code)]
-    #[deprecated(
-        since = "45.0.0",
-        note = "use OrderingEquivalenceClass::add_new_ordering instead"
-    )]
-    fn push(&mut self, ordering: LexOrdering) {
-        self.add_new_ordering(ordering)
-    }
-
-    /// Checks whether this ordering equivalence class is empty.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns an iterator over the equivalent orderings in this class.
-    ///
-    /// Note this class also implements [`IntoIterator`] to return an iterator
-    /// over owned [`LexOrdering`]s.
-    pub fn iter(&self) -> impl Iterator<Item = &LexOrdering> {
-        self.orderings.iter()
-    }
-
-    /// Returns how many equivalent orderings there are in this class.
-    pub fn len(&self) -> usize {
-        self.orderings.len()
-    }
-
-    /// Extend this ordering equivalence class with the `other` class.
-    pub fn extend(&mut self, other: Self) {
-        self.orderings.extend(other.orderings);
+    /// Extend this ordering equivalence class with the given orderings.
+    pub fn extend(&mut self, orderings: impl IntoIterator<Item = LexOrdering>) {
+        self.orderings.extend(orderings);
         // Make sure that there are no redundant orderings:
         self.remove_redundant_entries();
     }
 
-    /// Adds new orderings into this ordering equivalence class
-    pub fn add_new_orderings(
+    /// Adds new orderings into this ordering equivalence class.
+    pub fn add_orderings(
         &mut self,
-        orderings: impl IntoIterator<Item = LexOrdering>,
+        sort_exprs: impl IntoIterator<Item = impl IntoIterator<Item = PhysicalSortExpr>>,
     ) {
-        self.orderings.extend(orderings);
+        self.orderings
+            .extend(sort_exprs.into_iter().filter_map(LexOrdering::new));
         // Make sure that there are no redundant orderings:
         self.remove_redundant_entries();
     }
 
-    /// Adds a single ordering to the existing ordering equivalence class.
-    pub fn add_new_ordering(&mut self, ordering: LexOrdering) {
-        self.add_new_orderings([ordering]);
-    }
-
-    /// Removes redundant orderings from this equivalence class.
+    /// Removes redundant orderings from this ordering equivalence class.
     ///
     /// For instance, if we already have the ordering `[a ASC, b ASC, c DESC]`,
     /// then there is no need to keep ordering `[a ASC, b ASC]` in the state.
@@ -133,82 +101,72 @@ impl OrderingEquivalenceClass {
         while work {
             work = false;
             let mut idx = 0;
-            while idx < self.orderings.len() {
+            'outer: while idx < self.orderings.len() {
                 let mut ordering_idx = idx + 1;
-                let mut removal = self.orderings[idx].is_empty();
                 while ordering_idx < self.orderings.len() {
-                    work |= self.resolve_overlap(idx, ordering_idx);
-                    if self.orderings[idx].is_empty() {
-                        removal = true;
-                        break;
+                    if let Some(remove) = self.resolve_overlap(idx, ordering_idx) {
+                        work = true;
+                        if remove {
+                            self.orderings.swap_remove(idx);
+                            continue 'outer;
+                        }
                     }
-                    work |= self.resolve_overlap(ordering_idx, idx);
-                    if self.orderings[ordering_idx].is_empty() {
-                        self.orderings.swap_remove(ordering_idx);
-                    } else {
-                        ordering_idx += 1;
+                    if let Some(remove) = self.resolve_overlap(ordering_idx, idx) {
+                        work = true;
+                        if remove {
+                            self.orderings.swap_remove(ordering_idx);
+                            continue;
+                        }
                     }
+                    ordering_idx += 1;
                 }
-                if removal {
-                    self.orderings.swap_remove(idx);
-                } else {
-                    idx += 1;
-                }
+                idx += 1;
             }
         }
     }
 
     /// Trims `orderings[idx]` if some suffix of it overlaps with a prefix of
-    /// `orderings[pre_idx]`. Returns `true` if there is any overlap, `false` otherwise.
+    /// `orderings[pre_idx]`. If there is any overlap, returns a `Some(true)`
+    /// if any trimming took place, and `Some(false)` otherwise. If there is
+    /// no overlap, returns `None`.
     ///
     /// For example, if `orderings[idx]` is `[a ASC, b ASC, c DESC]` and
     /// `orderings[pre_idx]` is `[b ASC, c DESC]`, then the function will trim
     /// `orderings[idx]` to `[a ASC]`.
-    fn resolve_overlap(&mut self, idx: usize, pre_idx: usize) -> bool {
+    fn resolve_overlap(&mut self, idx: usize, pre_idx: usize) -> Option<bool> {
         let length = self.orderings[idx].len();
         let other_length = self.orderings[pre_idx].len();
         for overlap in 1..=length.min(other_length) {
             if self.orderings[idx][length - overlap..]
                 == self.orderings[pre_idx][..overlap]
             {
-                self.orderings[idx].truncate(length - overlap);
-                return true;
+                return Some(!self.orderings[idx].truncate(length - overlap));
             }
         }
-        false
+        None
     }
 
     /// Returns the concatenation of all the orderings. This enables merge
     /// operations to preserve all equivalent orderings simultaneously.
     pub fn output_ordering(&self) -> Option<LexOrdering> {
-        let output_ordering = self
-            .orderings
-            .iter()
-            .flatten()
-            .cloned()
-            .collect::<LexOrdering>()
-            .collapse();
-        (!output_ordering.is_empty()).then_some(output_ordering)
+        self.orderings.iter().cloned().reduce(|mut cat, o| {
+            cat.extend(o);
+            cat
+        })
     }
 
-    // Append orderings in `other` to all existing orderings in this equivalence
-    // class.
+    // Append orderings in `other` to all existing orderings in this ordering
+    // equivalence class.
     pub fn join_suffix(mut self, other: &Self) -> Self {
         let n_ordering = self.orderings.len();
-        // Replicate entries before cross product
+        // Replicate entries before cross product:
         let n_cross = std::cmp::max(n_ordering, other.len() * n_ordering);
-        self.orderings = self
-            .orderings
-            .iter()
-            .cloned()
-            .cycle()
-            .take(n_cross)
-            .collect();
-        // Suffix orderings of other to the current orderings.
+        self.orderings = self.orderings.into_iter().cycle().take(n_cross).collect();
+        // Append sort expressions of `other` to the current orderings:
         for (outer_idx, ordering) in other.iter().enumerate() {
-            for idx in 0..n_ordering {
-                // Calculate cross product index
-                let idx = outer_idx * n_ordering + idx;
+            let base = outer_idx * n_ordering;
+            // Use the cross product index:
+            for idx in base..(base + n_ordering) {
                 self.orderings[idx].extend(ordering.iter().cloned());
             }
         }
@@ -217,12 +175,40 @@ impl OrderingEquivalenceClass {
 
     /// Adds `offset` value to the index of each expression inside this
     /// ordering equivalence class.
-    pub fn add_offset(&mut self, offset: usize) {
-        for ordering in self.orderings.iter_mut() {
-            ordering.transform(|sort_expr| {
-                sort_expr.expr = add_offset_to_expr(Arc::clone(&sort_expr.expr), offset);
-            })
+    pub fn add_offset(&mut self, offset: isize) -> Result<()> {
+        let orderings = std::mem::take(&mut self.orderings);
+        for ordering_result in orderings
+            .into_iter()
+            .map(|o| add_offset_to_physical_sort_exprs(o, offset))
+        {
+            self.orderings.extend(LexOrdering::new(ordering_result?));
         }
+        Ok(())
+    }
+
+    /// Transforms this `OrderingEquivalenceClass` by mapping columns in the
+    /// original schema to columns in the new schema by index. The new schema
+    /// and the original schema needs to be aligned; i.e. they should have the
+    /// same number of columns, and fields at the same index have the same type
+    /// in both schemas.
+    pub fn with_new_schema(mut self, schema: &SchemaRef) -> Result<Self> {
+        self.orderings = self
+            .orderings
+            .into_iter()
+            .map(|ordering| {
+                ordering
+                    .into_iter()
+                    .map(|mut sort_expr| {
+                        sort_expr.expr = with_new_schema(sort_expr.expr, schema)?;
+                        Ok(sort_expr)
+                    })
+                    .collect::<Result<Vec<_>>>()
+                    // The following `unwrap` is safe because the vector will always
+                    // be non-empty.
+                    .map(|v| LexOrdering::new(v).unwrap())
+            })
+            .collect::<Result<_>>()?;
+        Ok(self)
     }
 
     /// Gets sort options associated with this expression if it is a leading
@@ -257,31 +243,6 @@ impl OrderingEquivalenceClass {
     /// added as a constant during `ordering_satisfy_requirement()` iterations
     /// after the corresponding prefix requirement is satisfied.
     ///
-    /// ### Example Scenarios
-    ///
-    /// In these scenarios, we assume that all expressions share the same sort
-    /// properties.
-    ///
-    /// #### Case 1: Sort Requirement `[a, c]`
-    ///
-    /// **Existing Orderings:** `[[a, b, c], [a, d]]`, **Constants:** `[]`
-    /// 1. `ordering_satisfy_single()` returns `true` because the requirement
-    ///    `a` is satisfied by `[a, b, c].first()`.
-    /// 2. `a` is added as a constant for the next iteration.
-    /// 3. The normalized orderings become `[[b, c], [d]]`.
-    /// 4. `ordering_satisfy_single()` returns `false` for `c`, as neither
-    ///    `[b, c]` nor `[d]` satisfies `c`.
-    ///
-    /// #### Case 2: Sort Requirement `[a, d]`
-    ///
-    /// **Existing Orderings:** `[[a, b, c], [a, d]]`, **Constants:** `[]`
-    /// 1. `ordering_satisfy_single()` returns `true` because the requirement
-    ///    `a` is satisfied by `[a, b, c].first()`.
-    /// 2. `a` is added as a constant for the next iteration.
-    /// 3. The normalized orderings become `[[b, c], [d]]`.
-    /// 4. `ordering_satisfy_single()` returns `true` for `d`, as `[d]` satisfies
-    ///    `d`.
-    ///
     /// ### Future Improvements
     ///
     /// This function may become unnecessary if any of the following improvements
@@ -296,15 +257,14 @@ impl OrderingEquivalenceClass {
         ];
 
         for ordering in self.iter() {
-            if let Some(leading_ordering) = ordering.first() {
-                if leading_ordering.expr.eq(expr) {
-                    let opt = (
-                        leading_ordering.options.descending,
-                        leading_ordering.options.nulls_first,
-                    );
-                    constantness_defining_pairs[0].remove(&opt);
-                    constantness_defining_pairs[1].remove(&opt);
-                }
+            let leading_ordering = ordering.first();
+            if leading_ordering.expr.eq(expr) {
+                let opt = (
+                    leading_ordering.options.descending,
+                    leading_ordering.options.nulls_first,
+                );
+                constantness_defining_pairs[0].remove(&opt);
+                constantness_defining_pairs[1].remove(&opt);
             }
         }
 
@@ -314,10 +274,26 @@ impl OrderingEquivalenceClass {
     }
 }
 
-/// Convert the `OrderingEquivalenceClass` into an iterator of LexOrderings
+impl Deref for OrderingEquivalenceClass {
+    type Target = [LexOrdering];
+
+    fn deref(&self) -> &Self::Target {
+        self.orderings.as_slice()
+    }
+}
+
+impl From<Vec<LexOrdering>> for OrderingEquivalenceClass {
+    fn from(orderings: Vec<LexOrdering>) -> Self {
+        let mut result = Self { orderings };
+        result.remove_redundant_entries();
+        result
+    }
+}
+
+/// Convert the `OrderingEquivalenceClass` into an iterator of `LexOrdering`s.
 impl IntoIterator for OrderingEquivalenceClass {
     type Item = LexOrdering;
-    type IntoIter = IntoIter<LexOrdering>;
+    type IntoIter = IntoIter<Self::Item>;
 
     fn into_iter(self) -> Self::IntoIter {
         self.orderings.into_iter()
@@ -334,8 +310,13 @@ impl Display for OrderingEquivalenceClass {
         for ordering in iter {
             write!(f, ", [{ordering}]")?;
         }
-        write!(f, "]")?;
-        Ok(())
+        write!(f, "]")
+    }
+}
+
+impl From<OrderingEquivalenceClass> for Vec<LexOrdering> {
+    fn from(oeq_class: OrderingEquivalenceClass) -> Self {
+        oeq_class.orderings
     }
 }
 
@@ -343,14 +324,12 @@ impl Display for OrderingEquivalenceClass {
 mod tests {
     use std::sync::Arc;
 
-    use crate::equivalence::tests::{
-        convert_to_orderings, convert_to_sort_exprs, create_test_schema,
-    };
+    use crate::equivalence::tests::create_test_schema;
     use crate::equivalence::{
         EquivalenceClass, EquivalenceGroup, EquivalenceProperties,
-        OrderingEquivalenceClass,
+        OrderingEquivalenceClass, convert_to_orderings, convert_to_sort_exprs,
     };
-    use crate::expressions::{col, BinaryExpr, Column};
+    use crate::expressions::{BinaryExpr, Column, col};
     use crate::utils::tests::TestScalarUDF;
     use crate::{
         AcrossPartitions, ConstExpr, PhysicalExpr, PhysicalExprRef, PhysicalSortExpr,
@@ -360,8 +339,8 @@ mod tests {
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::Result;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::{Operator, ScalarUDF};
-    use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
     #[test]
     fn test_ordering_satisfy() -> Result<()> {
@@ -369,11 +348,11 @@ mod tests {
             Field::new("a", DataType::Int64, true),
             Field::new("b", DataType::Int64, true),
         ]));
-        let crude = LexOrdering::new(vec![PhysicalSortExpr {
+        let crude = vec![PhysicalSortExpr {
             expr: Arc::new(Column::new("a", 0)),
             options: SortOptions::default(),
-        }]);
-        let finer = LexOrdering::new(vec![
+        }];
+        let finer = vec![
             PhysicalSortExpr {
                 expr: Arc::new(Column::new("a", 0)),
                 options: SortOptions::default(),
@@ -382,20 +361,18 @@ mod tests {
                 expr: Arc::new(Column::new("b", 1)),
                 options: SortOptions::default(),
             },
-        ]);
+        ];
         // finer ordering satisfies, crude ordering should return true
         let eq_properties_finer = EquivalenceProperties::new_with_orderings(
             Arc::clone(&input_schema),
-            &[finer.clone()],
+            [finer.clone()],
         );
-        assert!(eq_properties_finer.ordering_satisfy(crude.as_ref()));
+        assert!(eq_properties_finer.ordering_satisfy(crude.clone())?);
 
         // Crude ordering doesn't satisfy finer ordering. should return false
-        let eq_properties_crude = EquivalenceProperties::new_with_orderings(
-            Arc::clone(&input_schema),
-            &[crude.clone()],
-        );
-        assert!(!eq_properties_crude.ordering_satisfy(finer.as_ref()));
+        let eq_properties_crude =
+            EquivalenceProperties::new_with_orderings(Arc::clone(&input_schema), [crude]);
+        assert!(!eq_properties_crude.ordering_satisfy(finer)?);
         Ok(())
     }
 
@@ -414,16 +391,19 @@ mod tests {
             Arc::clone(&test_fun),
             vec![Arc::clone(col_a)],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?) as PhysicalExprRef;
         let floor_f = Arc::new(ScalarFunctionExpr::try_new(
             Arc::clone(&test_fun),
             vec![Arc::clone(col_f)],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?) as PhysicalExprRef;
         let exp_a = Arc::new(ScalarFunctionExpr::try_new(
             Arc::clone(&test_fun),
             vec![Arc::clone(col_a)],
             &test_schema,
+            Arc::new(ConfigOptions::default()),
         )?) as PhysicalExprRef;
 
         let a_plus_b = Arc::new(BinaryExpr::new(
@@ -659,33 +639,25 @@ mod tests {
         ];
 
         for (orderings, eq_group, constants, reqs, expected) in test_cases {
-            let err_msg =
-                format!("error in test orderings: {orderings:?}, eq_group: {eq_group:?}, constants: {constants:?}, reqs: {reqs:?}, expected: {expected:?}");
+            let err_msg = format!(
+                "error in test orderings: {orderings:?}, eq_group: {eq_group:?}, constants: {constants:?}, reqs: {reqs:?}, expected: {expected:?}"
+            );
             let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
             let orderings = convert_to_orderings(&orderings);
-            eq_properties.add_new_orderings(orderings);
-            let eq_group = eq_group
+            eq_properties.add_orderings(orderings);
+            let classes = eq_group
                 .into_iter()
-                .map(|eq_class| {
-                    let eq_classes = eq_class.into_iter().cloned().collect::<Vec<_>>();
-                    EquivalenceClass::new(eq_classes)
-                })
-                .collect::<Vec<_>>();
-            let eq_group = EquivalenceGroup::new(eq_group);
-            eq_properties.add_equivalence_group(eq_group);
+                .map(|eq_class| EquivalenceClass::new(eq_class.into_iter().cloned()));
+            let eq_group = EquivalenceGroup::new(classes);
+            eq_properties.add_equivalence_group(eq_group)?;
 
             let constants = constants.into_iter().map(|expr| {
-                ConstExpr::from(expr)
-                    .with_across_partitions(AcrossPartitions::Uniform(None))
+                ConstExpr::new(Arc::clone(expr), AcrossPartitions::Uniform(None))
             });
-            eq_properties = eq_properties.with_constants(constants);
+            eq_properties.add_constants(constants)?;
 
             let reqs = convert_to_sort_exprs(&reqs);
-            assert_eq!(
-                eq_properties.ordering_satisfy(reqs.as_ref()),
-                expected,
-                "{err_msg}"
-            );
+            assert_eq!(eq_properties.ordering_satisfy(reqs)?, expected, "{err_msg}");
         }
 
         Ok(())
@@ -706,7 +678,7 @@ mod tests {
         };
         // a=c (e.g they are aliases).
         let mut eq_properties = EquivalenceProperties::new(test_schema);
-        eq_properties.add_equal_conditions(col_a, col_c)?;
+        eq_properties.add_equal_conditions(Arc::clone(col_a), Arc::clone(col_c))?;
 
         let orderings = vec![
             vec![(col_a, options)],
@@ -716,7 +688,7 @@ mod tests {
         let orderings = convert_to_orderings(&orderings);
 
         // Column [a ASC], [e ASC], [d ASC, f ASC] are all valid orderings for the schema.
-        eq_properties.add_new_orderings(orderings);
+        eq_properties.add_orderings(orderings);
 
         // First entry in the tuple is required ordering, second entry is the expected flag
         // that indicates whether this required ordering is satisfied.
@@ -740,11 +712,7 @@ mod tests {
             let err_msg =
                 format!("error in test reqs: {reqs:?}, expected: {expected:?}",);
             let reqs = convert_to_sort_exprs(&reqs);
-            assert_eq!(
-                eq_properties.ordering_satisfy(reqs.as_ref()),
-                expected,
-                "{err_msg}"
-            );
+            assert_eq!(eq_properties.ordering_satisfy(reqs)?, expected, "{err_msg}");
         }
 
         Ok(())
@@ -854,7 +822,7 @@ mod tests {
             // ------- TEST CASE 5 ---------
             // Empty ordering
             (
-                vec![vec![]],
+                vec![],
                 // No ordering in the state (empty ordering is ignored).
                 vec![],
             ),
@@ -973,8 +941,7 @@ mod tests {
         for (orderings, expected) in test_cases {
             let orderings = convert_to_orderings(&orderings);
             let expected = convert_to_orderings(&expected);
-            let actual = OrderingEquivalenceClass::new(orderings.clone());
-            let actual = actual.orderings;
+            let actual = OrderingEquivalenceClass::from(orderings.clone());
             let err_msg = format!(
                 "orderings: {orderings:?}, expected: {expected:?}, actual :{actual:?}"
             );
diff --git a/datafusion/physical-expr/src/equivalence/projection.rs b/datafusion/physical-expr/src/equivalence/projection.rs
deleted file mode 100644
index efbb50bc40e1a..0000000000000
--- a/datafusion/physical-expr/src/equivalence/projection.rs
+++ /dev/null
@@ -1,980 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-
-use crate::expressions::Column;
-use crate::PhysicalExpr;
-
-use arrow::datatypes::SchemaRef;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{internal_err, Result};
-
-/// Stores the mapping between source expressions and target expressions for a
-/// projection.
-#[derive(Debug, Clone)]
-pub struct ProjectionMapping {
-    /// Mapping between source expressions and target expressions.
-    /// Vector indices correspond to the indices after projection.
-    pub map: Vec<(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)>,
-}
-
-impl ProjectionMapping {
-    /// Constructs the mapping between a projection's input and output
-    /// expressions.
-    ///
-    /// For example, given the input projection expressions (`a + b`, `c + d`)
-    /// and an output schema with two columns `"c + d"` and `"a + b"`, the
-    /// projection mapping would be:
-    ///
-    /// ```text
-    ///  [0]: (c + d, col("c + d"))
-    ///  [1]: (a + b, col("a + b"))
-    /// ```
-    ///
-    /// where `col("c + d")` means the column named `"c + d"`.
-    pub fn try_new(
-        expr: &[(Arc<dyn PhysicalExpr>, String)],
-        input_schema: &SchemaRef,
-    ) -> Result<Self> {
-        // Construct a map from the input expressions to the output expression of the projection:
-        expr.iter()
-            .enumerate()
-            .map(|(expr_idx, (expression, name))| {
-                let target_expr = Arc::new(Column::new(name, expr_idx)) as _;
-                Arc::clone(expression)
-                    .transform_down(|e| match e.as_any().downcast_ref::<Column>() {
-                        Some(col) => {
-                            // Sometimes, an expression and its name in the input_schema
-                            // doesn't match. This can cause problems, so we make sure
-                            // that the expression name matches with the name in `input_schema`.
-                            // Conceptually, `source_expr` and `expression` should be the same.
-                            let idx = col.index();
-                            let matching_input_field = input_schema.field(idx);
-                            if col.name() != matching_input_field.name() {
-                                return internal_err!("Input field name {} does not match with the projection expression {}",
-                                matching_input_field.name(),col.name())
-                            }
-                            let matching_input_column =
-                                Column::new(matching_input_field.name(), idx);
-                            Ok(Transformed::yes(Arc::new(matching_input_column)))
-                        }
-                        None => Ok(Transformed::no(e)),
-                    })
-                    .data()
-                    .map(|source_expr| (source_expr, target_expr))
-            })
-            .collect::<Result<Vec<_>>>()
-            .map(|map| Self { map })
-    }
-
-    /// Constructs a subset mapping using the provided indices.
-    ///
-    /// This is used when the output is a subset of the input without any
-    /// other transformations. The indices are for columns in the schema.
-    pub fn from_indices(indices: &[usize], schema: &SchemaRef) -> Result<Self> {
-        let projection_exprs = project_index_to_exprs(indices, schema);
-        ProjectionMapping::try_new(&projection_exprs, schema)
-    }
-
-    /// Iterate over pairs of (source, target) expressions
-    pub fn iter(
-        &self,
-    ) -> impl Iterator<Item = &(Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>)> + '_ {
-        self.map.iter()
-    }
-
-    /// This function returns the target expression for a given source expression.
-    ///
-    /// # Arguments
-    ///
-    /// * `expr` - Source physical expression.
-    ///
-    /// # Returns
-    ///
-    /// An `Option` containing the target for the given source expression,
-    /// where a `None` value means that `expr` is not inside the mapping.
-    pub fn target_expr(
-        &self,
-        expr: &Arc<dyn PhysicalExpr>,
-    ) -> Option<Arc<dyn PhysicalExpr>> {
-        self.map
-            .iter()
-            .find(|(source, _)| source.eq(expr))
-            .map(|(_, target)| Arc::clone(target))
-    }
-}
-
-fn project_index_to_exprs(
-    projection_index: &[usize],
-    schema: &SchemaRef,
-) -> Vec<(Arc<dyn PhysicalExpr>, String)> {
-    projection_index
-        .iter()
-        .map(|index| {
-            let field = schema.field(*index);
-            (
-                Arc::new(Column::new(field.name(), *index)) as Arc<dyn PhysicalExpr>,
-                field.name().to_owned(),
-            )
-        })
-        .collect::<Vec<_>>()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::equivalence::tests::{
-        convert_to_orderings, convert_to_orderings_owned, output_schema,
-    };
-    use crate::equivalence::EquivalenceProperties;
-    use crate::expressions::{col, BinaryExpr};
-    use crate::utils::tests::TestScalarUDF;
-    use crate::{PhysicalExprRef, ScalarFunctionExpr};
-
-    use arrow::compute::SortOptions;
-    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
-    use datafusion_expr::{Operator, ScalarUDF};
-
-    #[test]
-    fn project_orderings() -> Result<()> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Int32, true),
-            Field::new("c", DataType::Int32, true),
-            Field::new("d", DataType::Int32, true),
-            Field::new("e", DataType::Int32, true),
-            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
-        ]));
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let col_c = &col("c", &schema)?;
-        let col_d = &col("d", &schema)?;
-        let col_e = &col("e", &schema)?;
-        let col_ts = &col("ts", &schema)?;
-        let a_plus_b = Arc::new(BinaryExpr::new(
-            Arc::clone(col_a),
-            Operator::Plus,
-            Arc::clone(col_b),
-        )) as Arc<dyn PhysicalExpr>;
-        let b_plus_d = Arc::new(BinaryExpr::new(
-            Arc::clone(col_b),
-            Operator::Plus,
-            Arc::clone(col_d),
-        )) as Arc<dyn PhysicalExpr>;
-        let b_plus_e = Arc::new(BinaryExpr::new(
-            Arc::clone(col_b),
-            Operator::Plus,
-            Arc::clone(col_e),
-        )) as Arc<dyn PhysicalExpr>;
-        let c_plus_d = Arc::new(BinaryExpr::new(
-            Arc::clone(col_c),
-            Operator::Plus,
-            Arc::clone(col_d),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let option_asc = SortOptions {
-            descending: false,
-            nulls_first: false,
-        };
-        let option_desc = SortOptions {
-            descending: true,
-            nulls_first: true,
-        };
-
-        let test_cases = vec![
-            // ---------- TEST CASE 1 ------------
-            (
-                // orderings
-                vec![
-                    // [b ASC]
-                    vec![(col_b, option_asc)],
-                ],
-                // projection exprs
-                vec![(col_b, "b_new".to_string()), (col_a, "a_new".to_string())],
-                // expected
-                vec![
-                    // [b_new ASC]
-                    vec![("b_new", option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 2 ------------
-            (
-                // orderings
-                vec![
-                    // empty ordering
-                ],
-                // projection exprs
-                vec![(col_c, "c_new".to_string()), (col_b, "b_new".to_string())],
-                // expected
-                vec![
-                    // no ordering at the output
-                ],
-            ),
-            // ---------- TEST CASE 3 ------------
-            (
-                // orderings
-                vec![
-                    // [ts ASC]
-                    vec![(col_ts, option_asc)],
-                ],
-                // projection exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_ts, "ts_new".to_string()),
-                ],
-                // expected
-                vec![
-                    // [ts_new ASC]
-                    vec![("ts_new", option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 4 ------------
-            (
-                // orderings
-                vec![
-                    // [a ASC, ts ASC]
-                    vec![(col_a, option_asc), (col_ts, option_asc)],
-                    // [b ASC, ts ASC]
-                    vec![(col_b, option_asc), (col_ts, option_asc)],
-                ],
-                // projection exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_ts, "ts_new".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, ts_new ASC]
-                    vec![("a_new", option_asc), ("ts_new", option_asc)],
-                    // [b_new ASC, ts_new ASC]
-                    vec![("b_new", option_asc), ("ts_new", option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 5 ------------
-            (
-                // orderings
-                vec![
-                    // [a + b ASC]
-                    vec![(&a_plus_b, option_asc)],
-                ],
-                // projection exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (&a_plus_b, "a+b".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a + b ASC]
-                    vec![("a+b", option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 6 ------------
-            (
-                // orderings
-                vec![
-                    // [a + b ASC, c ASC]
-                    vec![(&a_plus_b, option_asc), (col_c, option_asc)],
-                ],
-                // projection exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_c, "c_new".to_string()),
-                    (&a_plus_b, "a+b".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a + b ASC, c_new ASC]
-                    vec![("a+b", option_asc), ("c_new", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 7 ----------
-            (
-                vec![
-                    // [a ASC, b ASC, c ASC]
-                    vec![(col_a, option_asc), (col_b, option_asc)],
-                    // [a ASC, d ASC]
-                    vec![(col_a, option_asc), (col_d, option_asc)],
-                ],
-                // b as b_new, a as a_new, d as d_new b+d
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_d, "d_new".to_string()),
-                    (&b_plus_d, "b+d".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, b_new ASC]
-                    vec![("a_new", option_asc), ("b_new", option_asc)],
-                    // [a_new ASC, d_new ASC]
-                    vec![("a_new", option_asc), ("d_new", option_asc)],
-                    // [a_new ASC, b+d ASC]
-                    vec![("a_new", option_asc), ("b+d", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 8 ----------
-            (
-                // orderings
-                vec![
-                    // [b+d ASC]
-                    vec![(&b_plus_d, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_d, "d_new".to_string()),
-                    (&b_plus_d, "b+d".to_string()),
-                ],
-                // expected
-                vec![
-                    // [b+d ASC]
-                    vec![("b+d", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 9 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, d ASC, b ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_d, option_asc),
-                        (col_b, option_asc),
-                    ],
-                    // [c ASC]
-                    vec![(col_c, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_d, "d_new".to_string()),
-                    (col_c, "c_new".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, d_new ASC, b_new ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("d_new", option_asc),
-                        ("b_new", option_asc),
-                    ],
-                    // [c_new ASC],
-                    vec![("c_new", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 10 ----------
-            (
-                vec![
-                    // [a ASC, b ASC, c ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_b, option_asc),
-                        (col_c, option_asc),
-                    ],
-                    // [a ASC, d ASC]
-                    vec![(col_a, option_asc), (col_d, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_c, "c_new".to_string()),
-                    (&c_plus_d, "c+d".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, b_new ASC, c_new ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("b_new", option_asc),
-                        ("c_new", option_asc),
-                    ],
-                    // [a_new ASC, b_new ASC, c+d ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("b_new", option_asc),
-                        ("c+d", option_asc),
-                    ],
-                ],
-            ),
-            // ------- TEST CASE 11 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, b ASC]
-                    vec![(col_a, option_asc), (col_b, option_asc)],
-                    // [a ASC, d ASC]
-                    vec![(col_a, option_asc), (col_d, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (&b_plus_d, "b+d".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, b_new ASC]
-                    vec![("a_new", option_asc), ("b_new", option_asc)],
-                    // [a_new ASC, b + d ASC]
-                    vec![("a_new", option_asc), ("b+d", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 12 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, b ASC, c ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_b, option_asc),
-                        (col_c, option_asc),
-                    ],
-                ],
-                // proj exprs
-                vec![(col_c, "c_new".to_string()), (col_a, "a_new".to_string())],
-                // expected
-                vec![
-                    // [a_new ASC]
-                    vec![("a_new", option_asc)],
-                ],
-            ),
-            // ------- TEST CASE 13 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, b ASC, c ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_b, option_asc),
-                        (col_c, option_asc),
-                    ],
-                    // [a ASC, a + b ASC, c ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (&a_plus_b, option_asc),
-                        (col_c, option_asc),
-                    ],
-                ],
-                // proj exprs
-                vec![
-                    (col_c, "c_new".to_string()),
-                    (col_b, "b_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (&a_plus_b, "a+b".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, b_new ASC, c_new ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("b_new", option_asc),
-                        ("c_new", option_asc),
-                    ],
-                    // [a_new ASC, a+b ASC, c_new ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("a+b", option_asc),
-                        ("c_new", option_asc),
-                    ],
-                ],
-            ),
-            // ------- TEST CASE 14 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, b ASC]
-                    vec![(col_a, option_asc), (col_b, option_asc)],
-                    // [c ASC, b ASC]
-                    vec![(col_c, option_asc), (col_b, option_asc)],
-                    // [d ASC, e ASC]
-                    vec![(col_d, option_asc), (col_e, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_c, "c_new".to_string()),
-                    (col_d, "d_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (&b_plus_e, "b+e".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, d_new ASC, b+e ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("d_new", option_asc),
-                        ("b+e", option_asc),
-                    ],
-                    // [d_new ASC, a_new ASC, b+e ASC]
-                    vec![
-                        ("d_new", option_asc),
-                        ("a_new", option_asc),
-                        ("b+e", option_asc),
-                    ],
-                    // [c_new ASC, d_new ASC, b+e ASC]
-                    vec![
-                        ("c_new", option_asc),
-                        ("d_new", option_asc),
-                        ("b+e", option_asc),
-                    ],
-                    // [d_new ASC, c_new ASC, b+e ASC]
-                    vec![
-                        ("d_new", option_asc),
-                        ("c_new", option_asc),
-                        ("b+e", option_asc),
-                    ],
-                ],
-            ),
-            // ------- TEST CASE 15 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, c ASC, b ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_c, option_asc),
-                        (col_b, option_asc),
-                    ],
-                ],
-                // proj exprs
-                vec![
-                    (col_c, "c_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (&a_plus_b, "a+b".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, d_new ASC, b+e ASC]
-                    vec![
-                        ("a_new", option_asc),
-                        ("c_new", option_asc),
-                        ("a+b", option_asc),
-                    ],
-                ],
-            ),
-            // ------- TEST CASE 16 ----------
-            (
-                // orderings
-                vec![
-                    // [a ASC, b ASC]
-                    vec![(col_a, option_asc), (col_b, option_asc)],
-                    // [c ASC, b DESC]
-                    vec![(col_c, option_asc), (col_b, option_desc)],
-                    // [e ASC]
-                    vec![(col_e, option_asc)],
-                ],
-                // proj exprs
-                vec![
-                    (col_c, "c_new".to_string()),
-                    (col_a, "a_new".to_string()),
-                    (col_b, "b_new".to_string()),
-                    (&b_plus_e, "b+e".to_string()),
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, b_new ASC]
-                    vec![("a_new", option_asc), ("b_new", option_asc)],
-                    // [a_new ASC, b_new ASC]
-                    vec![("a_new", option_asc), ("b+e", option_asc)],
-                    // [c_new ASC, b_new DESC]
-                    vec![("c_new", option_asc), ("b_new", option_desc)],
-                ],
-            ),
-        ];
-
-        for (idx, (orderings, proj_exprs, expected)) in test_cases.into_iter().enumerate()
-        {
-            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
-
-            let orderings = convert_to_orderings(&orderings);
-            eq_properties.add_new_orderings(orderings);
-
-            let proj_exprs = proj_exprs
-                .into_iter()
-                .map(|(expr, name)| (Arc::clone(expr), name))
-                .collect::<Vec<_>>();
-            let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &schema)?;
-            let output_schema = output_schema(&projection_mapping, &schema)?;
-
-            let expected = expected
-                .into_iter()
-                .map(|ordering| {
-                    ordering
-                        .into_iter()
-                        .map(|(name, options)| {
-                            (col(name, &output_schema).unwrap(), options)
-                        })
-                        .collect::<Vec<_>>()
-                })
-                .collect::<Vec<_>>();
-            let expected = convert_to_orderings_owned(&expected);
-
-            let projected_eq = eq_properties.project(&projection_mapping, output_schema);
-            let orderings = projected_eq.oeq_class();
-
-            let err_msg = format!(
-                "test_idx: {idx:?}, actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
-            );
-
-            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
-            for expected_ordering in &expected {
-                assert!(orderings.contains(expected_ordering), "{}", err_msg)
-            }
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn project_orderings2() -> Result<()> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Int32, true),
-            Field::new("c", DataType::Int32, true),
-            Field::new("d", DataType::Int32, true),
-            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
-        ]));
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let col_c = &col("c", &schema)?;
-        let col_ts = &col("ts", &schema)?;
-        let a_plus_b = Arc::new(BinaryExpr::new(
-            Arc::clone(col_a),
-            Operator::Plus,
-            Arc::clone(col_b),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let test_fun = Arc::new(ScalarUDF::new_from_impl(TestScalarUDF::new()));
-
-        let round_c = Arc::new(ScalarFunctionExpr::try_new(
-            test_fun,
-            vec![Arc::clone(col_c)],
-            &schema,
-        )?) as PhysicalExprRef;
-
-        let option_asc = SortOptions {
-            descending: false,
-            nulls_first: false,
-        };
-
-        let proj_exprs = vec![
-            (col_b, "b_new".to_string()),
-            (col_a, "a_new".to_string()),
-            (col_c, "c_new".to_string()),
-            (&round_c, "round_c_res".to_string()),
-        ];
-        let proj_exprs = proj_exprs
-            .into_iter()
-            .map(|(expr, name)| (Arc::clone(expr), name))
-            .collect::<Vec<_>>();
-        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &schema)?;
-        let output_schema = output_schema(&projection_mapping, &schema)?;
-
-        let col_a_new = &col("a_new", &output_schema)?;
-        let col_b_new = &col("b_new", &output_schema)?;
-        let col_c_new = &col("c_new", &output_schema)?;
-        let col_round_c_res = &col("round_c_res", &output_schema)?;
-        let a_new_plus_b_new = Arc::new(BinaryExpr::new(
-            Arc::clone(col_a_new),
-            Operator::Plus,
-            Arc::clone(col_b_new),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let test_cases = vec![
-            // ---------- TEST CASE 1 ------------
-            (
-                // orderings
-                vec![
-                    // [a ASC]
-                    vec![(col_a, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [b_new ASC]
-                    vec![(col_a_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 2 ------------
-            (
-                // orderings
-                vec![
-                    // [a+b ASC]
-                    vec![(&a_plus_b, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [b_new ASC]
-                    vec![(&a_new_plus_b_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 3 ------------
-            (
-                // orderings
-                vec![
-                    // [a ASC, ts ASC]
-                    vec![(col_a, option_asc), (col_ts, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, date_bin_res ASC]
-                    vec![(col_a_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 4 ------------
-            (
-                // orderings
-                vec![
-                    // [a ASC, ts ASC, b ASC]
-                    vec![
-                        (col_a, option_asc),
-                        (col_ts, option_asc),
-                        (col_b, option_asc),
-                    ],
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, date_bin_res ASC]
-                    vec![(col_a_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 5 ------------
-            (
-                // orderings
-                vec![
-                    // [a ASC, c ASC]
-                    vec![(col_a, option_asc), (col_c, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [a_new ASC, round_c_res ASC, c_new ASC]
-                    vec![(col_a_new, option_asc), (col_round_c_res, option_asc)],
-                    // [a_new ASC, c_new ASC]
-                    vec![(col_a_new, option_asc), (col_c_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 6 ------------
-            (
-                // orderings
-                vec![
-                    // [c ASC, b ASC]
-                    vec![(col_c, option_asc), (col_b, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [round_c_res ASC]
-                    vec![(col_round_c_res, option_asc)],
-                    // [c_new ASC, b_new ASC]
-                    vec![(col_c_new, option_asc), (col_b_new, option_asc)],
-                ],
-            ),
-            // ---------- TEST CASE 7 ------------
-            (
-                // orderings
-                vec![
-                    // [a+b ASC, c ASC]
-                    vec![(&a_plus_b, option_asc), (col_c, option_asc)],
-                ],
-                // expected
-                vec![
-                    // [a+b ASC, round(c) ASC, c_new ASC]
-                    vec![
-                        (&a_new_plus_b_new, option_asc),
-                        (col_round_c_res, option_asc),
-                    ],
-                    // [a+b ASC, c_new ASC]
-                    vec![(&a_new_plus_b_new, option_asc), (col_c_new, option_asc)],
-                ],
-            ),
-        ];
-
-        for (idx, (orderings, expected)) in test_cases.iter().enumerate() {
-            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
-
-            let orderings = convert_to_orderings(orderings);
-            eq_properties.add_new_orderings(orderings);
-
-            let expected = convert_to_orderings(expected);
-
-            let projected_eq =
-                eq_properties.project(&projection_mapping, Arc::clone(&output_schema));
-            let orderings = projected_eq.oeq_class();
-
-            let err_msg = format!(
-                "test idx: {idx:?}, actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
-            );
-
-            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
-            for expected_ordering in &expected {
-                assert!(orderings.contains(expected_ordering), "{}", err_msg)
-            }
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn project_orderings3() -> Result<()> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Int32, true),
-            Field::new("c", DataType::Int32, true),
-            Field::new("d", DataType::Int32, true),
-            Field::new("e", DataType::Int32, true),
-            Field::new("f", DataType::Int32, true),
-        ]));
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let col_c = &col("c", &schema)?;
-        let col_d = &col("d", &schema)?;
-        let col_e = &col("e", &schema)?;
-        let col_f = &col("f", &schema)?;
-        let a_plus_b = Arc::new(BinaryExpr::new(
-            Arc::clone(col_a),
-            Operator::Plus,
-            Arc::clone(col_b),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let option_asc = SortOptions {
-            descending: false,
-            nulls_first: false,
-        };
-
-        let proj_exprs = vec![
-            (col_c, "c_new".to_string()),
-            (col_d, "d_new".to_string()),
-            (&a_plus_b, "a+b".to_string()),
-        ];
-        let proj_exprs = proj_exprs
-            .into_iter()
-            .map(|(expr, name)| (Arc::clone(expr), name))
-            .collect::<Vec<_>>();
-        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &schema)?;
-        let output_schema = output_schema(&projection_mapping, &schema)?;
-
-        let col_a_plus_b_new = &col("a+b", &output_schema)?;
-        let col_c_new = &col("c_new", &output_schema)?;
-        let col_d_new = &col("d_new", &output_schema)?;
-
-        let test_cases = vec![
-            // ---------- TEST CASE 1 ------------
-            (
-                // orderings
-                vec![
-                    // [d ASC, b ASC]
-                    vec![(col_d, option_asc), (col_b, option_asc)],
-                    // [c ASC, a ASC]
-                    vec![(col_c, option_asc), (col_a, option_asc)],
-                ],
-                // equal conditions
-                vec![],
-                // expected
-                vec![
-                    // [d_new ASC, c_new ASC, a+b ASC]
-                    vec![
-                        (col_d_new, option_asc),
-                        (col_c_new, option_asc),
-                        (col_a_plus_b_new, option_asc),
-                    ],
-                    // [c_new ASC, d_new ASC, a+b ASC]
-                    vec![
-                        (col_c_new, option_asc),
-                        (col_d_new, option_asc),
-                        (col_a_plus_b_new, option_asc),
-                    ],
-                ],
-            ),
-            // ---------- TEST CASE 2 ------------
-            (
-                // orderings
-                vec![
-                    // [d ASC, b ASC]
-                    vec![(col_d, option_asc), (col_b, option_asc)],
-                    // [c ASC, e ASC], Please note that a=e
-                    vec![(col_c, option_asc), (col_e, option_asc)],
-                ],
-                // equal conditions
-                vec![(col_e, col_a)],
-                // expected
-                vec![
-                    // [d_new ASC, c_new ASC, a+b ASC]
-                    vec![
-                        (col_d_new, option_asc),
-                        (col_c_new, option_asc),
-                        (col_a_plus_b_new, option_asc),
-                    ],
-                    // [c_new ASC, d_new ASC, a+b ASC]
-                    vec![
-                        (col_c_new, option_asc),
-                        (col_d_new, option_asc),
-                        (col_a_plus_b_new, option_asc),
-                    ],
-                ],
-            ),
-            // ---------- TEST CASE 3 ------------
-            (
-                // orderings
-                vec![
-                    // [d ASC, b ASC]
-                    vec![(col_d, option_asc), (col_b, option_asc)],
-                    // [c ASC, e ASC], Please note that a=f
-                    vec![(col_c, option_asc), (col_e, option_asc)],
-                ],
-                // equal conditions
-                vec![(col_a, col_f)],
-                // expected
-                vec![
-                    // [d_new ASC]
-                    vec![(col_d_new, option_asc)],
-                    // [c_new ASC]
-                    vec![(col_c_new, option_asc)],
-                ],
-            ),
-        ];
-        for (orderings, equal_columns, expected) in test_cases {
-            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
-            for (lhs, rhs) in equal_columns {
-                eq_properties.add_equal_conditions(lhs, rhs)?;
-            }
-
-            let orderings = convert_to_orderings(&orderings);
-            eq_properties.add_new_orderings(orderings);
-
-            let expected = convert_to_orderings(&expected);
-
-            let projected_eq =
-                eq_properties.project(&projection_mapping, Arc::clone(&output_schema));
-            let orderings = projected_eq.oeq_class();
-
-            let err_msg = format!(
-                "actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
-            );
-
-            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
-            for expected_ordering in &expected {
-                assert!(orderings.contains(expected_ordering), "{}", err_msg)
-            }
-        }
-
-        Ok(())
-    }
-}
diff --git a/datafusion/physical-expr/src/equivalence/properties/dependency.rs b/datafusion/physical-expr/src/equivalence/properties/dependency.rs
index fa52ae8686f76..a42a4abf8059e 100644
--- a/datafusion/physical-expr/src/equivalence/properties/dependency.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/dependency.rs
@@ -16,28 +16,28 @@
 // under the License.
 
 use std::fmt::{self, Display};
+use std::ops::{Deref, DerefMut};
 use std::sync::Arc;
 
+use super::expr_refers;
 use crate::{LexOrdering, PhysicalSortExpr};
+
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use indexmap::IndexSet;
 
-use indexmap::IndexMap;
+use indexmap::{IndexMap, IndexSet};
 use itertools::Itertools;
 
-use super::{expr_refers, ExprWrapper};
-
 // A list of sort expressions that can be calculated from a known set of
 /// dependencies.
 #[derive(Debug, Default, Clone, PartialEq, Eq)]
 pub struct Dependencies {
-    inner: IndexSet<PhysicalSortExpr>,
+    sort_exprs: IndexSet<PhysicalSortExpr>,
 }
 
 impl Display for Dependencies {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "[")?;
-        let mut iter = self.inner.iter();
+        let mut iter = self.sort_exprs.iter();
         if let Some(dep) = iter.next() {
             write!(f, "{dep}")?;
         }
@@ -49,38 +49,34 @@ impl Display for Dependencies {
 }
 
 impl Dependencies {
-    /// Create a new empty `Dependencies` instance.
-    fn new() -> Self {
+    // Creates a new `Dependencies` instance from the given sort expressions.
+    pub fn new(sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>) -> Self {
         Self {
-            inner: IndexSet::new(),
+            sort_exprs: sort_exprs.into_iter().collect(),
         }
     }
+}
 
-    /// Create a new `Dependencies` from an iterator of `PhysicalSortExpr`.
-    pub fn new_from_iter(iter: impl IntoIterator<Item = PhysicalSortExpr>) -> Self {
-        Self {
-            inner: iter.into_iter().collect(),
-        }
-    }
+impl Deref for Dependencies {
+    type Target = IndexSet<PhysicalSortExpr>;
 
-    /// Insert a new dependency into the set.
-    pub fn insert(&mut self, sort_expr: PhysicalSortExpr) {
-        self.inner.insert(sort_expr);
+    fn deref(&self) -> &Self::Target {
+        &self.sort_exprs
     }
+}
 
-    /// Iterator over  dependencies in the set
-    pub fn iter(&self) -> impl Iterator<Item = &PhysicalSortExpr> + Clone {
-        self.inner.iter()
+impl DerefMut for Dependencies {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.sort_exprs
     }
+}
 
-    /// Return the inner set of dependencies
-    pub fn into_inner(self) -> IndexSet<PhysicalSortExpr> {
-        self.inner
-    }
+impl IntoIterator for Dependencies {
+    type Item = PhysicalSortExpr;
+    type IntoIter = <IndexSet<PhysicalSortExpr> as IntoIterator>::IntoIter;
 
-    /// Returns true if there are no dependencies
-    fn is_empty(&self) -> bool {
-        self.inner.is_empty()
+    fn into_iter(self) -> Self::IntoIter {
+        self.sort_exprs.into_iter()
     }
 }
 
@@ -133,26 +129,25 @@ impl<'a> DependencyEnumerator<'a> {
         let node = dependency_map
             .get(referred_sort_expr)
             .expect("`referred_sort_expr` should be inside `dependency_map`");
-        // Since we work on intermediate nodes, we are sure `val.target_sort_expr`
-        // exists.
-        let target_sort_expr = node.target_sort_expr.as_ref().unwrap();
+        // Since we work on intermediate nodes, we are sure `node.target` exists.
+        let target = node.target.as_ref().unwrap();
         // An empty dependency means the referred_sort_expr represents a global ordering.
         // Return its projected version, which is the target_expression.
         if node.dependencies.is_empty() {
-            return vec![LexOrdering::new(vec![target_sort_expr.clone()])];
+            return vec![[target.clone()].into()];
         };
 
         node.dependencies
             .iter()
             .flat_map(|dep| {
-                let mut orderings = if self.insert(target_sort_expr, dep) {
+                let mut orderings = if self.insert(target, dep) {
                     self.construct_orderings(dep, dependency_map)
                 } else {
                     vec![]
                 };
 
                 for ordering in orderings.iter_mut() {
-                    ordering.push(target_sort_expr.clone())
+                    ordering.push(target.clone());
                 }
                 orderings
             })
@@ -178,70 +173,55 @@ impl<'a> DependencyEnumerator<'a> {
 /// # Note on IndexMap Rationale
 ///
 /// Using `IndexMap` (which preserves insert order) to ensure consistent results
-/// across different executions for the same query. We could have used
-/// `HashSet`, `HashMap` in place of them without any loss of functionality.
+/// across different executions for the same query. We could have used `HashSet`
+/// and `HashMap` instead without any loss of functionality.
 ///
 /// As an example, if existing orderings are
 /// 1. `[a ASC, b ASC]`
-/// 2. `[c ASC]` for
+/// 2. `[c ASC]`
 ///
 /// Then both the following output orderings are valid
 /// 1. `[a ASC, b ASC, c ASC]`
 /// 2. `[c ASC, a ASC, b ASC]`
 ///
-/// (this are both valid as they are concatenated versions of the alternative
-/// orderings). When using `HashSet`, `HashMap` it is not guaranteed to generate
-/// consistent result, among the possible 2 results in the example above.
-#[derive(Debug)]
+/// These are both valid as they are concatenated versions of the alternative
+/// orderings. Had we used `HashSet`/`HashMap`, we couldn't guarantee to generate
+/// the same result among the possible two results in the example above.
+#[derive(Debug, Default)]
 pub struct DependencyMap {
-    inner: IndexMap<PhysicalSortExpr, DependencyNode>,
+    map: IndexMap<PhysicalSortExpr, DependencyNode>,
 }
 
 impl DependencyMap {
-    pub fn new() -> Self {
-        Self {
-            inner: IndexMap::new(),
-        }
-    }
-
-    /// Insert a new dependency `sort_expr` --> `dependency` into the map.
-    ///
-    /// If `target_sort_expr` is none, a new entry is created with empty dependencies.
+    /// Insert a new dependency of `sort_expr` (i.e. `dependency`) into the map
+    /// along with its target sort expression.
     pub fn insert(
         &mut self,
-        sort_expr: &PhysicalSortExpr,
-        target_sort_expr: Option<&PhysicalSortExpr>,
-        dependency: Option<&PhysicalSortExpr>,
+        sort_expr: PhysicalSortExpr,
+        target_sort_expr: Option<PhysicalSortExpr>,
+        dependency: Option<PhysicalSortExpr>,
     ) {
-        self.inner
-            .entry(sort_expr.clone())
-            .or_insert_with(|| DependencyNode {
-                target_sort_expr: target_sort_expr.cloned(),
-                dependencies: Dependencies::new(),
-            })
-            .insert_dependency(dependency)
-    }
-
-    /// Iterator over (sort_expr, DependencyNode) pairs
-    pub fn iter(&self) -> impl Iterator<Item = (&PhysicalSortExpr, &DependencyNode)> {
-        self.inner.iter()
+        let entry = self.map.entry(sort_expr);
+        let node = entry.or_insert_with(|| DependencyNode {
+            target: target_sort_expr,
+            dependencies: Dependencies::default(),
+        });
+        node.dependencies.extend(dependency);
     }
+}
 
-    /// iterator over all sort exprs
-    pub fn sort_exprs(&self) -> impl Iterator<Item = &PhysicalSortExpr> {
-        self.inner.keys()
-    }
+impl Deref for DependencyMap {
+    type Target = IndexMap<PhysicalSortExpr, DependencyNode>;
 
-    /// Return the dependency node for the given sort expression, if any
-    pub fn get(&self, sort_expr: &PhysicalSortExpr) -> Option<&DependencyNode> {
-        self.inner.get(sort_expr)
+    fn deref(&self) -> &Self::Target {
+        &self.map
     }
 }
 
 impl Display for DependencyMap {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "DependencyMap: {{")?;
-        for (sort_expr, node) in self.inner.iter() {
+        for (sort_expr, node) in self.map.iter() {
             writeln!(f, "  {sort_expr} --> {node}")?;
         }
         writeln!(f, "}}")
@@ -256,29 +236,20 @@ impl Display for DependencyMap {
 ///
 /// # Fields
 ///
-/// - `target_sort_expr`: An optional `PhysicalSortExpr` representing the target
-///   sort expression associated with the node. It is `None` if the sort expression
+/// - `target`: An optional `PhysicalSortExpr` representing the target sort
+///   expression associated with the node. It is `None` if the sort expression
 ///   cannot be projected.
 /// - `dependencies`: A [`Dependencies`] containing dependencies on other sort
 ///   expressions that are referred to by the target sort expression.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct DependencyNode {
-    pub target_sort_expr: Option<PhysicalSortExpr>,
-    pub dependencies: Dependencies,
-}
-
-impl DependencyNode {
-    /// Insert dependency to the state (if exists).
-    fn insert_dependency(&mut self, dependency: Option<&PhysicalSortExpr>) {
-        if let Some(dep) = dependency {
-            self.dependencies.insert(dep.clone());
-        }
-    }
+    pub(crate) target: Option<PhysicalSortExpr>,
+    pub(crate) dependencies: Dependencies,
 }
 
 impl Display for DependencyNode {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if let Some(target) = &self.target_sort_expr {
+        if let Some(target) = &self.target {
             write!(f, "(target: {target}, ")?;
         } else {
             write!(f, "(")?;
@@ -307,12 +278,12 @@ pub fn referred_dependencies(
     source: &Arc<dyn PhysicalExpr>,
 ) -> Vec<Dependencies> {
     // Associate `PhysicalExpr`s with `PhysicalSortExpr`s that contain them:
-    let mut expr_to_sort_exprs = IndexMap::<ExprWrapper, Dependencies>::new();
+    let mut expr_to_sort_exprs = IndexMap::<_, Dependencies>::new();
     for sort_expr in dependency_map
-        .sort_exprs()
+        .keys()
         .filter(|sort_expr| expr_refers(source, &sort_expr.expr))
     {
-        let key = ExprWrapper(Arc::clone(&sort_expr.expr));
+        let key = Arc::clone(&sort_expr.expr);
         expr_to_sort_exprs
             .entry(key)
             .or_default()
@@ -322,16 +293,10 @@ pub fn referred_dependencies(
     // Generate all valid dependencies for the source. For example, if the source
     // is `a + b` and the map is `[a -> (a ASC, a DESC), b -> (b ASC)]`, we get
     // `vec![HashSet(a ASC, b ASC), HashSet(a DESC, b ASC)]`.
-    let dependencies = expr_to_sort_exprs
+    expr_to_sort_exprs
         .into_values()
-        .map(Dependencies::into_inner)
-        .collect::<Vec<_>>();
-    dependencies
-        .iter()
         .multi_cartesian_product()
-        .map(|referred_deps| {
-            Dependencies::new_from_iter(referred_deps.into_iter().cloned())
-        })
+        .map(Dependencies::new)
         .collect()
 }
 
@@ -378,46 +343,39 @@ pub fn construct_prefix_orderings(
 /// # Parameters
 ///
 /// * `dependencies` - Set of relevant expressions.
-/// * `dependency_map` - Map of dependencies for expressions that may appear in `dependencies`
+/// * `dependency_map` - Map of dependencies for expressions that may appear in
+///   `dependencies`.
 ///
 /// # Returns
 ///
-/// A vector of lexical orderings (`Vec<LexOrdering>`) representing all valid orderings
-/// based on the given dependencies.
+/// A vector of lexical orderings (`Vec<LexOrdering>`) representing all valid
+/// orderings based on the given dependencies.
 pub fn generate_dependency_orderings(
     dependencies: &Dependencies,
     dependency_map: &DependencyMap,
 ) -> Vec<LexOrdering> {
     // Construct all the valid prefix orderings for each expression appearing
-    // in the projection:
-    let relevant_prefixes = dependencies
+    // in the projection. Note that if relevant prefixes are empty, there is no
+    // dependency, meaning that dependent is a leading ordering.
+    dependencies
         .iter()
-        .flat_map(|dep| {
+        .filter_map(|dep| {
             let prefixes = construct_prefix_orderings(dep, dependency_map);
             (!prefixes.is_empty()).then_some(prefixes)
         })
-        .collect::<Vec<_>>();
-
-    // No dependency, dependent is a leading ordering.
-    if relevant_prefixes.is_empty() {
-        // Return an empty ordering:
-        return vec![LexOrdering::default()];
-    }
-
-    relevant_prefixes
-        .into_iter()
+        // Generate all possible valid orderings:
         .multi_cartesian_product()
         .flat_map(|prefix_orderings| {
+            let length = prefix_orderings.len();
             prefix_orderings
-                .iter()
-                .permutations(prefix_orderings.len())
-                .map(|prefixes| {
-                    prefixes
-                        .into_iter()
-                        .flat_map(|ordering| ordering.clone())
-                        .collect()
+                .into_iter()
+                .permutations(length)
+                .filter_map(|prefixes| {
+                    prefixes.into_iter().reduce(|mut acc, ordering| {
+                        acc.extend(ordering);
+                        acc
+                    })
                 })
-                .collect::<Vec<_>>()
         })
         .collect()
 }
@@ -429,21 +387,24 @@ mod tests {
 
     use super::*;
     use crate::equivalence::tests::{
-        convert_to_sort_exprs, convert_to_sort_reqs, create_test_params,
-        create_test_schema, output_schema, parse_sort_expr,
+        convert_to_sort_reqs, create_test_params, create_test_schema, parse_sort_expr,
     };
-    use crate::equivalence::ProjectionMapping;
-    use crate::expressions::{col, BinaryExpr, CastExpr, Column};
+    use crate::equivalence::{ProjectionMapping, convert_to_sort_exprs};
+    use crate::expressions::{BinaryExpr, CastColumnExpr, CastExpr, Column, col};
+    use crate::projection::tests::output_schema;
     use crate::{ConstExpr, EquivalenceProperties, ScalarFunctionExpr};
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+    use datafusion_common::config::ConfigOptions;
     use datafusion_common::{Constraint, Constraints, Result};
-    use datafusion_expr::sort_properties::SortProperties;
     use datafusion_expr::Operator;
-
+    use datafusion_expr::sort_properties::SortProperties;
     use datafusion_functions::string::concat;
     use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_expr_common::sort_expr::{
+        LexRequirement, PhysicalSortRequirement,
+    };
 
     #[test]
     fn project_equivalence_properties_test() -> Result<()> {
@@ -463,7 +424,7 @@ mod tests {
             (Arc::clone(&col_a), "a3".to_string()),
             (Arc::clone(&col_a), "a4".to_string()),
         ];
-        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;
+        let projection_mapping = ProjectionMapping::try_new(proj_exprs, &input_schema)?;
 
         let out_schema = output_schema(&projection_mapping, &input_schema)?;
         // a as a1, a as a2, a as a3, a as a3
@@ -473,7 +434,7 @@ mod tests {
             (Arc::clone(&col_a), "a3".to_string()),
             (Arc::clone(&col_a), "a4".to_string()),
         ];
-        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;
+        let projection_mapping = ProjectionMapping::try_new(proj_exprs, &input_schema)?;
 
         // a as a1, a as a2, a as a3, a as a3
         let col_a1 = &col("a1", &out_schema)?;
@@ -506,20 +467,20 @@ mod tests {
 
         let mut input_properties = EquivalenceProperties::new(Arc::clone(&input_schema));
         // add equivalent ordering [a, b, c, d]
-        input_properties.add_new_ordering(LexOrdering::new(vec![
+        input_properties.add_ordering([
             parse_sort_expr("a", &input_schema),
             parse_sort_expr("b", &input_schema),
             parse_sort_expr("c", &input_schema),
             parse_sort_expr("d", &input_schema),
-        ]));
+        ]);
 
         // add equivalent ordering [a, c, b, d]
-        input_properties.add_new_ordering(LexOrdering::new(vec![
+        input_properties.add_ordering([
             parse_sort_expr("a", &input_schema),
             parse_sort_expr("c", &input_schema),
             parse_sort_expr("b", &input_schema), // NB b and c are swapped
             parse_sort_expr("d", &input_schema),
-        ]));
+        ]);
 
         // simply project all the columns in order
         let proj_exprs = vec![
@@ -528,7 +489,7 @@ mod tests {
             (col("c", &input_schema)?, "c".to_string()),
             (col("d", &input_schema)?, "d".to_string()),
         ];
-        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;
+        let projection_mapping = ProjectionMapping::try_new(proj_exprs, &input_schema)?;
         let out_properties = input_properties.project(&projection_mapping, input_schema);
 
         assert_eq!(
@@ -541,8 +502,6 @@ mod tests {
 
     #[test]
     fn test_normalize_ordering_equivalence_classes() -> Result<()> {
-        let sort_options = SortOptions::default();
-
         let schema = Schema::new(vec![
             Field::new("a", DataType::Int32, true),
             Field::new("b", DataType::Int32, true),
@@ -553,35 +512,19 @@ mod tests {
         let col_c_expr = col("c", &schema)?;
         let mut eq_properties = EquivalenceProperties::new(Arc::new(schema.clone()));
 
-        eq_properties.add_equal_conditions(&col_a_expr, &col_c_expr)?;
-        let others = vec![
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(&col_b_expr),
-                options: sort_options,
-            }]),
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(&col_c_expr),
-                options: sort_options,
-            }]),
-        ];
-        eq_properties.add_new_orderings(others);
+        eq_properties.add_equal_conditions(col_a_expr, Arc::clone(&col_c_expr))?;
+        eq_properties.add_orderings([
+            vec![PhysicalSortExpr::new_default(Arc::clone(&col_b_expr))],
+            vec![PhysicalSortExpr::new_default(Arc::clone(&col_c_expr))],
+        ]);
 
         let mut expected_eqs = EquivalenceProperties::new(Arc::new(schema));
-        expected_eqs.add_new_orderings([
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(&col_b_expr),
-                options: sort_options,
-            }]),
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(&col_c_expr),
-                options: sort_options,
-            }]),
+        expected_eqs.add_orderings([
+            vec![PhysicalSortExpr::new_default(col_b_expr)],
+            vec![PhysicalSortExpr::new_default(col_c_expr)],
         ]);
 
-        let oeq_class = eq_properties.oeq_class().clone();
-        let expected = expected_eqs.oeq_class();
-        assert!(oeq_class.eq(expected));
-
+        assert!(eq_properties.oeq_class().eq(expected_eqs.oeq_class()));
         Ok(())
     }
 
@@ -594,34 +537,22 @@ mod tests {
             Field::new("a", DataType::Int32, true),
             Field::new("b", DataType::Int32, true),
         ]);
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let required_columns = [Arc::clone(col_b), Arc::clone(col_a)];
+        let col_a = col("a", &schema)?;
+        let col_b = col("b", &schema)?;
+        let required_columns = [Arc::clone(&col_b), Arc::clone(&col_a)];
         let mut eq_properties = EquivalenceProperties::new(Arc::new(schema));
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("b", 1)),
-                options: sort_options_not,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("a", 0)),
-                options: sort_options,
-            },
-        ])]);
-        let (result, idxs) = eq_properties.find_longest_permutation(&required_columns);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new(Arc::new(Column::new("b", 1)), sort_options_not),
+            PhysicalSortExpr::new(Arc::new(Column::new("a", 0)), sort_options),
+        ]);
+        let (result, idxs) = eq_properties.find_longest_permutation(&required_columns)?;
         assert_eq!(idxs, vec![0, 1]);
         assert_eq!(
             result,
-            LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::clone(col_b),
-                    options: sort_options_not
-                },
-                PhysicalSortExpr {
-                    expr: Arc::clone(col_a),
-                    options: sort_options
-                }
-            ])
+            vec![
+                PhysicalSortExpr::new(col_b, sort_options_not),
+                PhysicalSortExpr::new(col_a, sort_options),
+            ]
         );
 
         let schema = Schema::new(vec![
@@ -629,40 +560,28 @@ mod tests {
             Field::new("b", DataType::Int32, true),
             Field::new("c", DataType::Int32, true),
         ]);
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let required_columns = [Arc::clone(col_b), Arc::clone(col_a)];
+        let col_a = col("a", &schema)?;
+        let col_b = col("b", &schema)?;
+        let required_columns = [Arc::clone(&col_b), Arc::clone(&col_a)];
         let mut eq_properties = EquivalenceProperties::new(Arc::new(schema));
-        eq_properties.add_new_orderings([
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::new(Column::new("c", 2)),
-                options: sort_options,
-            }]),
-            LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("b", 1)),
-                    options: sort_options_not,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("a", 0)),
-                    options: sort_options,
-                },
-            ]),
+        eq_properties.add_orderings([
+            vec![PhysicalSortExpr::new(
+                Arc::new(Column::new("c", 2)),
+                sort_options,
+            )],
+            vec![
+                PhysicalSortExpr::new(Arc::new(Column::new("b", 1)), sort_options_not),
+                PhysicalSortExpr::new(Arc::new(Column::new("a", 0)), sort_options),
+            ],
         ]);
-        let (result, idxs) = eq_properties.find_longest_permutation(&required_columns);
+        let (result, idxs) = eq_properties.find_longest_permutation(&required_columns)?;
         assert_eq!(idxs, vec![0, 1]);
         assert_eq!(
             result,
-            LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::clone(col_b),
-                    options: sort_options_not
-                },
-                PhysicalSortExpr {
-                    expr: Arc::clone(col_a),
-                    options: sort_options
-                }
-            ])
+            vec![
+                PhysicalSortExpr::new(col_b, sort_options_not),
+                PhysicalSortExpr::new(col_a, sort_options),
+            ]
         );
 
         let required_columns = [
@@ -677,21 +596,12 @@ mod tests {
         let mut eq_properties = EquivalenceProperties::new(Arc::new(schema));
 
         // not satisfied orders
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("b", 1)),
-                options: sort_options_not,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("c", 2)),
-                options: sort_options,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("a", 0)),
-                options: sort_options,
-            },
-        ])]);
-        let (_, idxs) = eq_properties.find_longest_permutation(&required_columns);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new(Arc::new(Column::new("b", 1)), sort_options_not),
+            PhysicalSortExpr::new(Arc::new(Column::new("c", 2)), sort_options),
+            PhysicalSortExpr::new(Arc::new(Column::new("a", 0)), sort_options),
+        ]);
+        let (_, idxs) = eq_properties.find_longest_permutation(&required_columns)?;
         assert_eq!(idxs, vec![0]);
 
         Ok(())
@@ -707,49 +617,35 @@ mod tests {
         ]);
 
         let mut eq_properties = EquivalenceProperties::new(Arc::new(schema.clone()));
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let col_c = &col("c", &schema)?;
-        let col_d = &col("d", &schema)?;
+        let col_a = col("a", &schema)?;
+        let col_b = col("b", &schema)?;
+        let col_c = col("c", &schema)?;
+        let col_d = col("d", &schema)?;
         let option_asc = SortOptions {
             descending: false,
             nulls_first: false,
         };
         // b=a (e.g they are aliases)
-        eq_properties.add_equal_conditions(col_b, col_a)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_a))?;
         // [b ASC], [d ASC]
-        eq_properties.add_new_orderings(vec![
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(col_b),
-                options: option_asc,
-            }]),
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(col_d),
-                options: option_asc,
-            }]),
+        eq_properties.add_orderings([
+            vec![PhysicalSortExpr::new(Arc::clone(&col_b), option_asc)],
+            vec![PhysicalSortExpr::new(Arc::clone(&col_d), option_asc)],
         ]);
 
         let test_cases = vec![
             // d + b
             (
-                Arc::new(BinaryExpr::new(
-                    Arc::clone(col_d),
-                    Operator::Plus,
-                    Arc::clone(col_b),
-                )) as Arc<dyn PhysicalExpr>,
+                Arc::new(BinaryExpr::new(col_d, Operator::Plus, Arc::clone(&col_b))) as _,
                 SortProperties::Ordered(option_asc),
             ),
             // b
-            (Arc::clone(col_b), SortProperties::Ordered(option_asc)),
+            (col_b, SortProperties::Ordered(option_asc)),
             // a
-            (Arc::clone(col_a), SortProperties::Ordered(option_asc)),
+            (Arc::clone(&col_a), SortProperties::Ordered(option_asc)),
             // a + c
             (
-                Arc::new(BinaryExpr::new(
-                    Arc::clone(col_a),
-                    Operator::Plus,
-                    Arc::clone(col_c),
-                )),
+                Arc::new(BinaryExpr::new(col_a, Operator::Plus, col_c)),
                 SortProperties::Unordered,
             ),
         ];
@@ -757,7 +653,7 @@ mod tests {
             let leading_orderings = eq_properties
                 .oeq_class()
                 .iter()
-                .flat_map(|ordering| ordering.first().cloned())
+                .map(|ordering| ordering.first().clone())
                 .collect::<Vec<_>>();
             let expr_props = eq_properties.get_expr_properties(Arc::clone(&expr));
             let err_msg = format!(
@@ -790,7 +686,7 @@ mod tests {
             Arc::clone(col_a),
             Operator::Plus,
             Arc::clone(col_d),
-        )) as Arc<dyn PhysicalExpr>;
+        )) as _;
 
         let option_asc = SortOptions {
             descending: false,
@@ -801,16 +697,10 @@ mod tests {
             nulls_first: true,
         };
         // [d ASC, h DESC] also satisfies schema.
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(col_d),
-                options: option_asc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(col_h),
-                options: option_desc,
-            },
-        ])]);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new(Arc::clone(col_d), option_asc),
+            PhysicalSortExpr::new(Arc::clone(col_h), option_desc),
+        ]);
         let test_cases = vec![
             // TEST CASE 1
             (vec![col_a], vec![(col_a, option_asc)]),
@@ -878,7 +768,7 @@ mod tests {
         for (exprs, expected) in test_cases {
             let exprs = exprs.into_iter().cloned().collect::<Vec<_>>();
             let expected = convert_to_sort_exprs(&expected);
-            let (actual, _) = eq_properties.find_longest_permutation(&exprs);
+            let (actual, _) = eq_properties.find_longest_permutation(&exprs)?;
             assert_eq!(actual, expected);
         }
 
@@ -896,7 +786,7 @@ mod tests {
         let col_h = &col("h", &test_schema)?;
 
         // Add column h as constant
-        eq_properties = eq_properties.with_constants(vec![ConstExpr::from(col_h)]);
+        eq_properties.add_constants(vec![ConstExpr::from(Arc::clone(col_h))])?;
 
         let test_cases = vec![
             // TEST CASE 1
@@ -907,72 +797,13 @@ mod tests {
         for (exprs, expected) in test_cases {
             let exprs = exprs.into_iter().cloned().collect::<Vec<_>>();
             let expected = convert_to_sort_exprs(&expected);
-            let (actual, _) = eq_properties.find_longest_permutation(&exprs);
+            let (actual, _) = eq_properties.find_longest_permutation(&exprs)?;
             assert_eq!(actual, expected);
         }
 
         Ok(())
     }
 
-    #[test]
-    fn test_get_finer() -> Result<()> {
-        let schema = create_test_schema()?;
-        let col_a = &col("a", &schema)?;
-        let col_b = &col("b", &schema)?;
-        let col_c = &col("c", &schema)?;
-        let eq_properties = EquivalenceProperties::new(schema);
-        let option_asc = SortOptions {
-            descending: false,
-            nulls_first: false,
-        };
-        let option_desc = SortOptions {
-            descending: true,
-            nulls_first: true,
-        };
-        // First entry, and second entry are the physical sort requirement that are argument for get_finer_requirement.
-        // Third entry is the expected result.
-        let tests_cases = vec![
-            // Get finer requirement between [a Some(ASC)] and [a None, b Some(ASC)]
-            // result should be [a Some(ASC), b Some(ASC)]
-            (
-                vec![(col_a, Some(option_asc))],
-                vec![(col_a, None), (col_b, Some(option_asc))],
-                Some(vec![(col_a, Some(option_asc)), (col_b, Some(option_asc))]),
-            ),
-            // Get finer requirement between [a Some(ASC), b Some(ASC), c Some(ASC)] and [a Some(ASC), b Some(ASC)]
-            // result should be [a Some(ASC), b Some(ASC), c Some(ASC)]
-            (
-                vec![
-                    (col_a, Some(option_asc)),
-                    (col_b, Some(option_asc)),
-                    (col_c, Some(option_asc)),
-                ],
-                vec![(col_a, Some(option_asc)), (col_b, Some(option_asc))],
-                Some(vec![
-                    (col_a, Some(option_asc)),
-                    (col_b, Some(option_asc)),
-                    (col_c, Some(option_asc)),
-                ]),
-            ),
-            // Get finer requirement between [a Some(ASC), b Some(ASC)] and [a Some(ASC), b Some(DESC)]
-            // result should be None
-            (
-                vec![(col_a, Some(option_asc)), (col_b, Some(option_asc))],
-                vec![(col_a, Some(option_asc)), (col_b, Some(option_desc))],
-                None,
-            ),
-        ];
-        for (lhs, rhs, expected) in tests_cases {
-            let lhs = convert_to_sort_reqs(&lhs);
-            let rhs = convert_to_sort_reqs(&rhs);
-            let expected = expected.map(|expected| convert_to_sort_reqs(&expected));
-            let finer = eq_properties.get_finer_requirement(&lhs, &rhs);
-            assert_eq!(finer, expected)
-        }
-
-        Ok(())
-    }
-
     #[test]
     fn test_normalize_sort_reqs() -> Result<()> {
         // Schema satisfies following properties
@@ -1040,7 +871,7 @@ mod tests {
             let expected_normalized = convert_to_sort_reqs(&expected_normalized);
 
             assert_eq!(
-                eq_properties.normalize_sort_requirements(&req),
+                eq_properties.normalize_sort_requirements(req).unwrap(),
                 expected_normalized
             );
         }
@@ -1073,8 +904,9 @@ mod tests {
         for (reqs, expected) in test_cases.into_iter() {
             let reqs = convert_to_sort_reqs(&reqs);
             let expected = convert_to_sort_reqs(&expected);
-
-            let normalized = eq_properties.normalize_sort_requirements(&reqs);
+            let normalized = eq_properties
+                .normalize_sort_requirements(reqs.clone())
+                .unwrap();
             assert!(
                 expected.eq(&normalized),
                 "error in test: reqs: {reqs:?}, expected: {expected:?}, normalized: {normalized:?}"
@@ -1091,21 +923,12 @@ mod tests {
             Field::new("b", DataType::Utf8, true),
             Field::new("c", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
         ]));
-        let base_properties = EquivalenceProperties::new(Arc::clone(&schema))
-            .with_reorder(LexOrdering::new(
-                ["a", "b", "c"]
-                    .into_iter()
-                    .map(|c| {
-                        col(c, schema.as_ref()).map(|expr| PhysicalSortExpr {
-                            expr,
-                            options: SortOptions {
-                                descending: false,
-                                nulls_first: true,
-                            },
-                        })
-                    })
-                    .collect::<Result<Vec<_>>>()?,
-            ));
+        let mut base_properties = EquivalenceProperties::new(Arc::clone(&schema));
+        base_properties.reorder(
+            ["a", "b", "c"]
+                .into_iter()
+                .map(|c| PhysicalSortExpr::new_default(col(c, schema.as_ref()).unwrap())),
+        )?;
 
         struct TestCase {
             name: &'static str,
@@ -1118,17 +941,14 @@ mod tests {
         let col_a = col("a", schema.as_ref())?;
         let col_b = col("b", schema.as_ref())?;
         let col_c = col("c", schema.as_ref())?;
-        let cast_c = Arc::new(CastExpr::new(col_c, DataType::Date32, None));
+        let cast_c = Arc::new(CastExpr::new(col_c, DataType::Date32, None)) as _;
 
         let cases = vec![
             TestCase {
                 name: "(a, b, c) -> (c)",
                 // b is constant, so it should be removed from the sort order
                 constants: vec![Arc::clone(&col_b)],
-                equal_conditions: vec![[
-                    Arc::clone(&cast_c) as Arc<dyn PhysicalExpr>,
-                    Arc::clone(&col_a),
-                ]],
+                equal_conditions: vec![[Arc::clone(&cast_c), Arc::clone(&col_a)]],
                 sort_columns: &["c"],
                 should_satisfy_ordering: true,
             },
@@ -1138,10 +958,7 @@ mod tests {
                 name: "(a, b, c) -> (c)",
                 // b is constant, so it should be removed from the sort order
                 constants: vec![col_b],
-                equal_conditions: vec![[
-                    Arc::clone(&col_a),
-                    Arc::clone(&cast_c) as Arc<dyn PhysicalExpr>,
-                ]],
+                equal_conditions: vec![[Arc::clone(&col_a), Arc::clone(&cast_c)]],
                 sort_columns: &["c"],
                 should_satisfy_ordering: true,
             },
@@ -1150,10 +967,7 @@ mod tests {
                 // b is not constant anymore
                 constants: vec![],
                 // a and c are still compatible, but this is irrelevant since the original ordering is (a, b, c)
-                equal_conditions: vec![[
-                    Arc::clone(&cast_c) as Arc<dyn PhysicalExpr>,
-                    Arc::clone(&col_a),
-                ]],
+                equal_conditions: vec![[Arc::clone(&cast_c), Arc::clone(&col_a)]],
                 sort_columns: &["c"],
                 should_satisfy_ordering: false,
             },
@@ -1167,19 +981,21 @@ mod tests {
                 // Equal conditions before constants
                 {
                     let mut properties = base_properties.clone();
-                    for [left, right] in &case.equal_conditions {
+                    for [left, right] in case.equal_conditions.clone() {
                         properties.add_equal_conditions(left, right)?
                     }
-                    properties.with_constants(
+                    properties.add_constants(
                         case.constants.iter().cloned().map(ConstExpr::from),
-                    )
+                    )?;
+                    properties
                 },
                 // Constants before equal conditions
                 {
-                    let mut properties = base_properties.clone().with_constants(
+                    let mut properties = base_properties.clone();
+                    properties.add_constants(
                         case.constants.iter().cloned().map(ConstExpr::from),
-                    );
-                    for [left, right] in &case.equal_conditions {
+                    )?;
+                    for [left, right] in case.equal_conditions {
                         properties.add_equal_conditions(left, right)?
                     }
                     properties
@@ -1188,16 +1004,11 @@ mod tests {
                 let sort = case
                     .sort_columns
                     .iter()
-                    .map(|&name| {
-                        col(name, &schema).map(|col| PhysicalSortExpr {
-                            expr: col,
-                            options: SortOptions::default(),
-                        })
-                    })
-                    .collect::<Result<LexOrdering>>()?;
+                    .map(|&name| col(name, &schema).map(PhysicalSortExpr::new_default))
+                    .collect::<Result<Vec<_>>>()?;
 
                 assert_eq!(
-                    properties.ordering_satisfy(sort.as_ref()),
+                    properties.ordering_satisfy(sort)?,
                     case.should_satisfy_ordering,
                     "failed test '{}'",
                     case.name
@@ -1208,6 +1019,44 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_eliminate_redundant_monotonic_sorts_cast_column_expr() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Date32, true),
+            Field::new("b", DataType::Utf8, true),
+            Field::new("c", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
+        ]));
+        let mut properties = EquivalenceProperties::new(Arc::clone(&schema));
+        properties.reorder(
+            ["a", "b", "c"]
+                .into_iter()
+                .map(|c| PhysicalSortExpr::new_default(col(c, schema.as_ref()).unwrap())),
+        )?;
+
+        let col_a = col("a", schema.as_ref())?;
+        let col_b = col("b", schema.as_ref())?;
+        let col_c = col("c", schema.as_ref())?;
+
+        let cast_c = Arc::new(CastColumnExpr::new(
+            Arc::clone(&col_c),
+            Arc::new(Field::new(
+                "c",
+                DataType::Timestamp(TimeUnit::Nanosecond, None),
+                true,
+            )),
+            Arc::new(Field::new("c", DataType::Date32, true)),
+            None,
+        )) as Arc<dyn PhysicalExpr>;
+
+        properties.add_equal_conditions(cast_c, Arc::clone(&col_a))?;
+        properties.add_constants(std::iter::once(ConstExpr::from(col_b)))?;
+
+        let required = vec![PhysicalSortExpr::new_default(col("c", &schema)?)];
+        assert!(properties.ordering_satisfy(required)?);
+
+        Ok(())
+    }
+
     #[test]
     fn test_ordering_equivalence_with_lex_monotonic_concat() -> Result<()> {
         let schema = Arc::new(Schema::new(vec![
@@ -1225,30 +1074,29 @@ mod tests {
             concat(),
             vec![Arc::clone(&col_a), Arc::clone(&col_b)],
             Field::new("f", DataType::Utf8, true).into(),
+            Arc::new(ConfigOptions::default()),
         ));
 
         // Assume existing ordering is [c ASC, a ASC, b ASC]
         let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
 
-        eq_properties.add_new_ordering(LexOrdering::from(vec![
+        eq_properties.add_ordering([
             PhysicalSortExpr::new_default(Arc::clone(&col_c)).asc(),
             PhysicalSortExpr::new_default(Arc::clone(&col_a)).asc(),
             PhysicalSortExpr::new_default(Arc::clone(&col_b)).asc(),
-        ]));
+        ]);
 
         // Add equality condition c = concat(a, b)
-        eq_properties.add_equal_conditions(&col_c, &a_concat_b)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_c), a_concat_b)?;
 
         let orderings = eq_properties.oeq_class();
 
-        let expected_ordering1 =
-            LexOrdering::from(vec![
-                PhysicalSortExpr::new_default(Arc::clone(&col_c)).asc()
-            ]);
-        let expected_ordering2 = LexOrdering::from(vec![
-            PhysicalSortExpr::new_default(Arc::clone(&col_a)).asc(),
-            PhysicalSortExpr::new_default(Arc::clone(&col_b)).asc(),
-        ]);
+        let expected_ordering1 = [PhysicalSortExpr::new_default(col_c).asc()].into();
+        let expected_ordering2 = [
+            PhysicalSortExpr::new_default(col_a).asc(),
+            PhysicalSortExpr::new_default(col_b).asc(),
+        ]
+        .into();
 
         // The ordering should be [c ASC] and [a ASC, b ASC]
         assert_eq!(orderings.len(), 2);
@@ -1270,25 +1118,26 @@ mod tests {
         let col_b = col("b", &schema)?;
         let col_c = col("c", &schema)?;
 
-        let a_times_b: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+        let a_times_b = Arc::new(BinaryExpr::new(
             Arc::clone(&col_a),
             Operator::Multiply,
             Arc::clone(&col_b),
-        ));
+        )) as _;
 
         // Assume existing ordering is [c ASC, a ASC, b ASC]
         let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
 
-        let initial_ordering = LexOrdering::from(vec![
+        let initial_ordering: LexOrdering = [
             PhysicalSortExpr::new_default(Arc::clone(&col_c)).asc(),
-            PhysicalSortExpr::new_default(Arc::clone(&col_a)).asc(),
-            PhysicalSortExpr::new_default(Arc::clone(&col_b)).asc(),
-        ]);
+            PhysicalSortExpr::new_default(col_a).asc(),
+            PhysicalSortExpr::new_default(col_b).asc(),
+        ]
+        .into();
 
-        eq_properties.add_new_ordering(initial_ordering.clone());
+        eq_properties.add_ordering(initial_ordering.clone());
 
         // Add equality condition c = a * b
-        eq_properties.add_equal_conditions(&col_c, &a_times_b)?;
+        eq_properties.add_equal_conditions(col_c, a_times_b)?;
 
         let orderings = eq_properties.oeq_class();
 
@@ -1311,37 +1160,36 @@ mod tests {
         let col_b = col("b", &schema)?;
         let col_c = col("c", &schema)?;
 
-        let a_concat_b: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
+        let a_concat_b = Arc::new(ScalarFunctionExpr::new(
             "concat",
             concat(),
             vec![Arc::clone(&col_a), Arc::clone(&col_b)],
             Field::new("f", DataType::Utf8, true).into(),
-        ));
+            Arc::new(ConfigOptions::default()),
+        )) as _;
 
         // Assume existing ordering is [concat(a, b) ASC, a ASC, b ASC]
         let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
 
-        eq_properties.add_new_ordering(LexOrdering::from(vec![
+        eq_properties.add_ordering([
             PhysicalSortExpr::new_default(Arc::clone(&a_concat_b)).asc(),
             PhysicalSortExpr::new_default(Arc::clone(&col_a)).asc(),
             PhysicalSortExpr::new_default(Arc::clone(&col_b)).asc(),
-        ]));
+        ]);
 
         // Add equality condition c = concat(a, b)
-        eq_properties.add_equal_conditions(&col_c, &a_concat_b)?;
+        eq_properties.add_equal_conditions(col_c, Arc::clone(&a_concat_b))?;
 
         let orderings = eq_properties.oeq_class();
 
-        let expected_ordering1 = LexOrdering::from(vec![PhysicalSortExpr::new_default(
-            Arc::clone(&a_concat_b),
-        )
-        .asc()]);
-        let expected_ordering2 = LexOrdering::from(vec![
-            PhysicalSortExpr::new_default(Arc::clone(&col_a)).asc(),
-            PhysicalSortExpr::new_default(Arc::clone(&col_b)).asc(),
-        ]);
+        let expected_ordering1 = [PhysicalSortExpr::new_default(a_concat_b).asc()].into();
+        let expected_ordering2 = [
+            PhysicalSortExpr::new_default(col_a).asc(),
+            PhysicalSortExpr::new_default(col_b).asc(),
+        ]
+        .into();
 
-        // The ordering should be [concat(a, b) ASC] and [a ASC, b ASC]
+        // The ordering should be [c ASC] and [a ASC, b ASC]
         assert_eq!(orderings.len(), 2);
         assert!(orderings.contains(&expected_ordering1));
         assert!(orderings.contains(&expected_ordering2));
@@ -1349,6 +1197,35 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_requirements_compatible() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+        ]));
+        let col_a = col("a", &schema)?;
+        let col_b = col("b", &schema)?;
+        let col_c = col("c", &schema)?;
+
+        let eq_properties = EquivalenceProperties::new(schema);
+        let lex_a: LexRequirement =
+            [PhysicalSortRequirement::new(Arc::clone(&col_a), None)].into();
+        let lex_a_b: LexRequirement = [
+            PhysicalSortRequirement::new(col_a, None),
+            PhysicalSortRequirement::new(col_b, None),
+        ]
+        .into();
+        let lex_c = [PhysicalSortRequirement::new(col_c, None)].into();
+
+        assert!(eq_properties.requirements_compatible(lex_a.clone(), lex_a.clone()));
+        assert!(!eq_properties.requirements_compatible(lex_a.clone(), lex_a_b.clone()));
+        assert!(eq_properties.requirements_compatible(lex_a_b, lex_a.clone()));
+        assert!(!eq_properties.requirements_compatible(lex_c, lex_a));
+
+        Ok(())
+    }
+
     #[test]
     fn test_with_reorder_constant_filtering() -> Result<()> {
         let schema = create_test_schema()?;
@@ -1357,26 +1234,21 @@ mod tests {
         // Setup constant columns
         let col_a = col("a", &schema)?;
         let col_b = col("b", &schema)?;
-        eq_properties = eq_properties.with_constants([ConstExpr::from(&col_a)]);
+        eq_properties.add_constants([ConstExpr::from(Arc::clone(&col_a))])?;
 
-        let sort_exprs = LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_a),
-                options: SortOptions::default(),
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_b),
-                options: SortOptions::default(),
-            },
-        ]);
+        let sort_exprs = vec![
+            PhysicalSortExpr::new_default(Arc::clone(&col_a)),
+            PhysicalSortExpr::new_default(Arc::clone(&col_b)),
+        ];
 
-        let result = eq_properties.with_reorder(sort_exprs);
+        let change = eq_properties.reorder(sort_exprs)?;
+        assert!(change);
 
-        // Should only contain b since a is constant
-        assert_eq!(result.oeq_class().len(), 1);
-        let ordering = result.oeq_class().iter().next().unwrap();
-        assert_eq!(ordering.len(), 1);
-        assert!(ordering[0].expr.eq(&col_b));
+        assert_eq!(eq_properties.oeq_class().len(), 1);
+        let ordering = eq_properties.oeq_class().iter().next().unwrap();
+        assert_eq!(ordering.len(), 2);
+        assert!(ordering[0].expr.eq(&col_a));
+        assert!(ordering[1].expr.eq(&col_b));
 
         Ok(())
     }
@@ -1397,32 +1269,21 @@ mod tests {
         };
 
         // Initial ordering: [a ASC, b DESC, c ASC]
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_a),
-                options: asc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_b),
-                options: desc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_c),
-                options: asc,
-            },
-        ])]);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new(Arc::clone(&col_a), asc),
+            PhysicalSortExpr::new(Arc::clone(&col_b), desc),
+            PhysicalSortExpr::new(Arc::clone(&col_c), asc),
+        ]);
 
         // New ordering: [a ASC]
-        let new_order = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: Arc::clone(&col_a),
-            options: asc,
-        }]);
+        let new_order = vec![PhysicalSortExpr::new(Arc::clone(&col_a), asc)];
 
-        let result = eq_properties.with_reorder(new_order);
+        let change = eq_properties.reorder(new_order)?;
+        assert!(!change);
 
         // Should only contain [a ASC, b DESC, c ASC]
-        assert_eq!(result.oeq_class().len(), 1);
-        let ordering = result.oeq_class().iter().next().unwrap();
+        assert_eq!(eq_properties.oeq_class().len(), 1);
+        let ordering = eq_properties.oeq_class().iter().next().unwrap();
         assert_eq!(ordering.len(), 3);
         assert!(ordering[0].expr.eq(&col_a));
         assert!(ordering[0].options.eq(&asc));
@@ -1444,37 +1305,28 @@ mod tests {
         let col_c = col("c", &schema)?;
 
         // Make a and b equivalent
-        eq_properties.add_equal_conditions(&col_a, &col_b)?;
-
-        let asc = SortOptions::default();
+        eq_properties.add_equal_conditions(Arc::clone(&col_a), Arc::clone(&col_b))?;
 
         // Initial ordering: [a ASC, c ASC]
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_a),
-                options: asc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_c),
-                options: asc,
-            },
-        ])]);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new_default(Arc::clone(&col_a)),
+            PhysicalSortExpr::new_default(Arc::clone(&col_c)),
+        ]);
 
         // New ordering: [b ASC]
-        let new_order = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: Arc::clone(&col_b),
-            options: asc,
-        }]);
+        let new_order = vec![PhysicalSortExpr::new_default(Arc::clone(&col_b))];
 
-        let result = eq_properties.with_reorder(new_order);
+        let change = eq_properties.reorder(new_order)?;
 
-        // Should only contain [b ASC, c ASC]
-        assert_eq!(result.oeq_class().len(), 1);
+        assert!(!change);
+        // Should only contain [a/b ASC, c ASC]
+        assert_eq!(eq_properties.oeq_class().len(), 1);
 
         // Verify orderings
-        let ordering = result.oeq_class().iter().next().unwrap();
+        let asc = SortOptions::default();
+        let ordering = eq_properties.oeq_class().iter().next().unwrap();
         assert_eq!(ordering.len(), 2);
-        assert!(ordering[0].expr.eq(&col_b));
+        assert!(ordering[0].expr.eq(&col_a) || ordering[0].expr.eq(&col_b));
         assert!(ordering[0].options.eq(&asc));
         assert!(ordering[1].expr.eq(&col_c));
         assert!(ordering[1].options.eq(&asc));
@@ -1497,29 +1349,21 @@ mod tests {
         };
 
         // Initial ordering: [a ASC, b DESC]
-        eq_properties.add_new_orderings([LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_a),
-                options: asc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_b),
-                options: desc,
-            },
-        ])]);
+        eq_properties.add_ordering([
+            PhysicalSortExpr::new(Arc::clone(&col_a), asc),
+            PhysicalSortExpr::new(Arc::clone(&col_b), desc),
+        ]);
 
         // New ordering: [a DESC]
-        let new_order = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: Arc::clone(&col_a),
-            options: desc,
-        }]);
+        let new_order = vec![PhysicalSortExpr::new(Arc::clone(&col_a), desc)];
 
-        let result = eq_properties.with_reorder(new_order.clone());
+        let change = eq_properties.reorder(new_order.clone())?;
 
+        assert!(change);
         // Should only contain the new ordering since options don't match
-        assert_eq!(result.oeq_class().len(), 1);
-        let ordering = result.oeq_class().iter().next().unwrap();
-        assert_eq!(ordering, &new_order);
+        assert_eq!(eq_properties.oeq_class().len(), 1);
+        let ordering = eq_properties.oeq_class().iter().next().unwrap();
+        assert_eq!(ordering.to_vec(), new_order);
 
         Ok(())
     }
@@ -1535,62 +1379,32 @@ mod tests {
         let col_d = col("d", &schema)?;
         let col_e = col("e", &schema)?;
 
-        let asc = SortOptions::default();
-
         // Constants: c is constant
-        eq_properties = eq_properties.with_constants([ConstExpr::from(&col_c)]);
+        eq_properties.add_constants([ConstExpr::from(Arc::clone(&col_c))])?;
 
         // Equality: b = d
-        eq_properties.add_equal_conditions(&col_b, &col_d)?;
+        eq_properties.add_equal_conditions(Arc::clone(&col_b), Arc::clone(&col_d))?;
 
         // Orderings: [d ASC, a ASC], [e ASC]
-        eq_properties.add_new_orderings([
-            LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::clone(&col_d),
-                    options: asc,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::clone(&col_a),
-                    options: asc,
-                },
-            ]),
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: Arc::clone(&col_e),
-                options: asc,
-            }]),
+        eq_properties.add_orderings([
+            vec![
+                PhysicalSortExpr::new_default(Arc::clone(&col_d)),
+                PhysicalSortExpr::new_default(Arc::clone(&col_a)),
+            ],
+            vec![PhysicalSortExpr::new_default(Arc::clone(&col_e))],
         ]);
 
-        // Initial ordering: [b ASC, c ASC]
-        let new_order = LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_b),
-                options: asc,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(&col_c),
-                options: asc,
-            },
-        ]);
-
-        let result = eq_properties.with_reorder(new_order);
-
-        // Should preserve the original [d ASC, a ASC] ordering
-        assert_eq!(result.oeq_class().len(), 1);
-        let ordering = result.oeq_class().iter().next().unwrap();
-        assert_eq!(ordering.len(), 2);
-
-        // First expression should be either b or d (they're equivalent)
-        assert!(
-            ordering[0].expr.eq(&col_b) || ordering[0].expr.eq(&col_d),
-            "Expected b or d as first expression, got {:?}",
-            ordering[0].expr
-        );
-        assert!(ordering[0].options.eq(&asc));
+        // New ordering: [b ASC, c ASC]
+        let new_order = vec![
+            PhysicalSortExpr::new_default(Arc::clone(&col_b)),
+            PhysicalSortExpr::new_default(Arc::clone(&col_c)),
+        ];
 
-        // Second expression should be a
-        assert!(ordering[1].expr.eq(&col_a));
-        assert!(ordering[1].options.eq(&asc));
+        let old_orderings = eq_properties.oeq_class().clone();
+        let change = eq_properties.reorder(new_order)?;
+        // Original orderings should be preserved:
+        assert!(!change);
+        assert_eq!(eq_properties.oeq_class, old_orderings);
 
         Ok(())
     }
@@ -1691,75 +1505,62 @@ mod tests {
         {
             let mut eq_properties = EquivalenceProperties::new(Arc::clone(schema));
 
-            // Convert base ordering
-            let base_ordering = LexOrdering::new(
-                base_order
-                    .iter()
-                    .map(|col_name| PhysicalSortExpr {
-                        expr: col(col_name, schema).unwrap(),
-                        options: SortOptions::default(),
-                    })
-                    .collect(),
-            );
-
             // Convert string column names to orderings
-            let satisfied_orderings: Vec<LexOrdering> = satisfied_orders
+            let satisfied_orderings: Vec<_> = satisfied_orders
                 .iter()
                 .map(|cols| {
-                    LexOrdering::new(
-                        cols.iter()
-                            .map(|col_name| PhysicalSortExpr {
-                                expr: col(col_name, schema).unwrap(),
-                                options: SortOptions::default(),
-                            })
-                            .collect(),
-                    )
+                    cols.iter()
+                        .map(|col_name| {
+                            PhysicalSortExpr::new_default(col(col_name, schema).unwrap())
+                        })
+                        .collect::<Vec<_>>()
                 })
                 .collect();
 
-            let unsatisfied_orderings: Vec<LexOrdering> = unsatisfied_orders
+            let unsatisfied_orderings: Vec<_> = unsatisfied_orders
                 .iter()
                 .map(|cols| {
-                    LexOrdering::new(
-                        cols.iter()
-                            .map(|col_name| PhysicalSortExpr {
-                                expr: col(col_name, schema).unwrap(),
-                                options: SortOptions::default(),
-                            })
-                            .collect(),
-                    )
+                    cols.iter()
+                        .map(|col_name| {
+                            PhysicalSortExpr::new_default(col(col_name, schema).unwrap())
+                        })
+                        .collect::<Vec<_>>()
                 })
                 .collect();
 
             // Test that orderings are not satisfied before adding constraints
-            for ordering in &satisfied_orderings {
-                assert!(
-                    !eq_properties.ordering_satisfy(ordering),
-                    "{name}: ordering {ordering:?} should not be satisfied before adding constraints"
+            for ordering in satisfied_orderings.clone() {
+                let err_msg = format!(
+                    "{name}: ordering {ordering:?} should not be satisfied before adding constraints",
                 );
+                assert!(!eq_properties.ordering_satisfy(ordering)?, "{err_msg}");
             }
 
             // Add base ordering
-            eq_properties.add_new_ordering(base_ordering);
+            let base_ordering = base_order.iter().map(|col_name| PhysicalSortExpr {
+                expr: col(col_name, schema).unwrap(),
+                options: SortOptions::default(),
+            });
+            eq_properties.add_ordering(base_ordering);
 
             // Add constraints
             eq_properties =
                 eq_properties.with_constraints(Constraints::new_unverified(constraints));
 
             // Test that expected orderings are now satisfied
-            for ordering in &satisfied_orderings {
-                assert!(
-                    eq_properties.ordering_satisfy(ordering),
-                    "{name}: ordering {ordering:?} should be satisfied after adding constraints"
+            for ordering in satisfied_orderings {
+                let err_msg = format!(
+                    "{name}: ordering {ordering:?} should be satisfied after adding constraints",
                 );
+                assert!(eq_properties.ordering_satisfy(ordering)?, "{err_msg}");
             }
 
             // Test that unsatisfied orderings remain unsatisfied
-            for ordering in &unsatisfied_orderings {
-                assert!(
-                    !eq_properties.ordering_satisfy(ordering),
-                    "{name}: ordering {ordering:?} should not be satisfied after adding constraints"
+            for ordering in unsatisfied_orderings {
+                let err_msg = format!(
+                    "{name}: ordering {ordering:?} should not be satisfied after adding constraints",
                 );
+                assert!(!eq_properties.ordering_satisfy(ordering)?, "{err_msg}");
             }
         }
 
diff --git a/datafusion/physical-expr/src/equivalence/properties/joins.rs b/datafusion/physical-expr/src/equivalence/properties/joins.rs
index 344cf54a57a8a..aa3e1c5675d53 100644
--- a/datafusion/physical-expr/src/equivalence/properties/joins.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/joins.rs
@@ -15,11 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::{equivalence::OrderingEquivalenceClass, PhysicalExprRef};
-use arrow::datatypes::SchemaRef;
-use datafusion_common::{JoinSide, JoinType};
-
 use super::EquivalenceProperties;
+use crate::{PhysicalExprRef, equivalence::OrderingEquivalenceClass};
+
+use arrow::datatypes::SchemaRef;
+use datafusion_common::{JoinSide, JoinType, Result};
 
 /// Calculate ordering equivalence properties for the given join operation.
 pub fn join_equivalence_properties(
@@ -30,7 +30,7 @@ pub fn join_equivalence_properties(
     maintains_input_order: &[bool],
     probe_side: Option<JoinSide>,
     on: &[(PhysicalExprRef, PhysicalExprRef)],
-) -> EquivalenceProperties {
+) -> Result<EquivalenceProperties> {
     let left_size = left.schema.fields.len();
     let mut result = EquivalenceProperties::new(join_schema);
     result.add_equivalence_group(left.eq_group().join(
@@ -38,15 +38,13 @@ pub fn join_equivalence_properties(
         join_type,
         left_size,
         on,
-    ));
+    )?)?;
 
     let EquivalenceProperties {
-        constants: left_constants,
         oeq_class: left_oeq_class,
         ..
     } = left;
     let EquivalenceProperties {
-        constants: right_constants,
         oeq_class: mut right_oeq_class,
         ..
     } = right;
@@ -54,12 +52,14 @@ pub fn join_equivalence_properties(
         [true, false] => {
             // In this special case, right side ordering can be prefixed with
             // the left side ordering.
-            if let (Some(JoinSide::Left), JoinType::Inner) = (probe_side, join_type) {
+            if matches!(join_type, JoinType::Inner | JoinType::Left)
+                && probe_side == Some(JoinSide::Left)
+            {
                 updated_right_ordering_equivalence_class(
                     &mut right_oeq_class,
                     join_type,
                     left_size,
-                );
+                )?;
 
                 // Right side ordering equivalence properties should be prepended
                 // with those of the left side while constructing output ordering
@@ -70,9 +70,9 @@ pub fn join_equivalence_properties(
                 // then we should add `a ASC, b ASC` to the ordering equivalences
                 // of the join output.
                 let out_oeq_class = left_oeq_class.join_suffix(&right_oeq_class);
-                result.add_ordering_equivalence_class(out_oeq_class);
+                result.add_orderings(out_oeq_class);
             } else {
-                result.add_ordering_equivalence_class(left_oeq_class);
+                result.add_orderings(left_oeq_class);
             }
         }
         [false, true] => {
@@ -80,10 +80,12 @@ pub fn join_equivalence_properties(
                 &mut right_oeq_class,
                 join_type,
                 left_size,
-            );
+            )?;
             // In this special case, left side ordering can be prefixed with
             // the right side ordering.
-            if let (Some(JoinSide::Right), JoinType::Inner) = (probe_side, join_type) {
+            if matches!(join_type, JoinType::Inner | JoinType::Right)
+                && probe_side == Some(JoinSide::Right)
+            {
                 // Left side ordering equivalence properties should be prepended
                 // with those of the right side while constructing output ordering
                 // equivalence properties since stream side is the right side.
@@ -93,25 +95,16 @@ pub fn join_equivalence_properties(
                 // then we should add `b ASC, a ASC` to the ordering equivalences
                 // of the join output.
                 let out_oeq_class = right_oeq_class.join_suffix(&left_oeq_class);
-                result.add_ordering_equivalence_class(out_oeq_class);
+                result.add_orderings(out_oeq_class);
             } else {
-                result.add_ordering_equivalence_class(right_oeq_class);
+                result.add_orderings(right_oeq_class);
             }
         }
         [false, false] => {}
         [true, true] => unreachable!("Cannot maintain ordering of both sides"),
         _ => unreachable!("Join operators can not have more than two children"),
     }
-    match join_type {
-        JoinType::LeftAnti | JoinType::LeftSemi => {
-            result = result.with_constants(left_constants);
-        }
-        JoinType::RightAnti | JoinType::RightSemi => {
-            result = result.with_constants(right_constants);
-        }
-        _ => {}
-    }
-    result
+    Ok(result)
 }
 
 /// In the context of a join, update the right side `OrderingEquivalenceClass`
@@ -125,28 +118,29 @@ pub fn updated_right_ordering_equivalence_class(
     right_oeq_class: &mut OrderingEquivalenceClass,
     join_type: &JoinType,
     left_size: usize,
-) {
+) -> Result<()> {
     if matches!(
         join_type,
         JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right
     ) {
-        right_oeq_class.add_offset(left_size);
+        right_oeq_class.add_offset(left_size as _)?;
     }
+    Ok(())
 }
 
 #[cfg(test)]
 mod tests {
-
     use std::sync::Arc;
 
     use super::*;
-    use crate::equivalence::add_offset_to_expr;
-    use crate::equivalence::tests::{convert_to_orderings, create_test_schema};
+    use crate::equivalence::convert_to_orderings;
+    use crate::equivalence::tests::create_test_schema;
     use crate::expressions::col;
-    use datafusion_common::Result;
+    use crate::physical_expr::add_offset_to_expr;
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Fields, Schema};
+    use datafusion_common::Result;
 
     #[test]
     fn test_join_equivalence_properties() -> Result<()> {
@@ -154,9 +148,9 @@ mod tests {
         let col_a = &col("a", &schema)?;
         let col_b = &col("b", &schema)?;
         let col_c = &col("c", &schema)?;
-        let offset = schema.fields.len();
-        let col_a2 = &add_offset_to_expr(Arc::clone(col_a), offset);
-        let col_b2 = &add_offset_to_expr(Arc::clone(col_b), offset);
+        let offset = schema.fields.len() as _;
+        let col_a2 = &add_offset_to_expr(Arc::clone(col_a), offset)?;
+        let col_b2 = &add_offset_to_expr(Arc::clone(col_b), offset)?;
         let option_asc = SortOptions {
             descending: false,
             nulls_first: false,
@@ -205,8 +199,8 @@ mod tests {
             let left_orderings = convert_to_orderings(&left_orderings);
             let right_orderings = convert_to_orderings(&right_orderings);
             let expected = convert_to_orderings(&expected);
-            left_eq_properties.add_new_orderings(left_orderings);
-            right_eq_properties.add_new_orderings(right_orderings);
+            left_eq_properties.add_orderings(left_orderings);
+            right_eq_properties.add_orderings(right_orderings);
             let join_eq = join_equivalence_properties(
                 left_eq_properties,
                 right_eq_properties,
@@ -215,7 +209,7 @@ mod tests {
                 &[true, false],
                 Some(JoinSide::Left),
                 &[],
-            );
+            )?;
             let err_msg =
                 format!("expected: {:?}, actual:{:?}", expected, &join_eq.oeq_class);
             assert_eq!(join_eq.oeq_class.len(), expected.len(), "{err_msg}");
@@ -253,7 +247,7 @@ mod tests {
         ];
         let orderings = convert_to_orderings(&orderings);
         // Right child ordering equivalences
-        let mut right_oeq_class = OrderingEquivalenceClass::new(orderings);
+        let mut right_oeq_class = OrderingEquivalenceClass::from(orderings);
 
         let left_columns_len = 4;
 
@@ -264,24 +258,24 @@ mod tests {
 
         // Join Schema
         let schema = Schema::new(fields);
-        let col_a = &col("a", &schema)?;
-        let col_d = &col("d", &schema)?;
-        let col_x = &col("x", &schema)?;
-        let col_y = &col("y", &schema)?;
-        let col_z = &col("z", &schema)?;
-        let col_w = &col("w", &schema)?;
+        let col_a = col("a", &schema)?;
+        let col_d = col("d", &schema)?;
+        let col_x = col("x", &schema)?;
+        let col_y = col("y", &schema)?;
+        let col_z = col("z", &schema)?;
+        let col_w = col("w", &schema)?;
 
         let mut join_eq_properties = EquivalenceProperties::new(Arc::new(schema));
         // a=x and d=w
-        join_eq_properties.add_equal_conditions(col_a, col_x)?;
-        join_eq_properties.add_equal_conditions(col_d, col_w)?;
+        join_eq_properties.add_equal_conditions(col_a, Arc::clone(&col_x))?;
+        join_eq_properties.add_equal_conditions(col_d, Arc::clone(&col_w))?;
 
         updated_right_ordering_equivalence_class(
             &mut right_oeq_class,
             &join_type,
             left_columns_len,
-        );
-        join_eq_properties.add_ordering_equivalence_class(right_oeq_class);
+        )?;
+        join_eq_properties.add_orderings(right_oeq_class);
         let result = join_eq_properties.oeq_class().clone();
 
         // [x ASC, y ASC], [z ASC, w ASC]
@@ -290,7 +284,7 @@ mod tests {
             vec![(col_z, option_asc), (col_w, option_asc)],
         ];
         let orderings = convert_to_orderings(&orderings);
-        let expected = OrderingEquivalenceClass::new(orderings);
+        let expected = OrderingEquivalenceClass::from(orderings);
 
         assert_eq!(result, expected);
 
diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs b/datafusion/physical-expr/src/equivalence/properties/mod.rs
index 8f6391bc0b5ef..1ca4ead0335de 100644
--- a/datafusion/physical-expr/src/equivalence/properties/mod.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs
@@ -19,47 +19,43 @@ mod dependency; // Submodule containing DependencyMap and Dependencies
 mod joins; // Submodule containing join_equivalence_properties
 mod union; // Submodule containing calculate_union
 
-use dependency::{
-    construct_prefix_orderings, generate_dependency_orderings, referred_dependencies,
-    Dependencies, DependencyMap,
-};
 pub use joins::*;
 pub use union::*;
 
-use std::fmt::Display;
-use std::hash::{Hash, Hasher};
+use std::fmt::{self, Display};
+use std::mem;
 use std::sync::Arc;
-use std::{fmt, mem};
 
-use crate::equivalence::class::{const_exprs_contains, AcrossPartitions};
+use self::dependency::{
+    Dependencies, DependencyMap, construct_prefix_orderings,
+    generate_dependency_orderings, referred_dependencies,
+};
 use crate::equivalence::{
-    EquivalenceClass, EquivalenceGroup, OrderingEquivalenceClass, ProjectionMapping,
+    AcrossPartitions, EquivalenceGroup, OrderingEquivalenceClass, ProjectionMapping,
 };
-use crate::expressions::{with_new_schema, CastExpr, Column, Literal};
+use crate::expressions::{CastColumnExpr, CastExpr, Column, Literal, with_new_schema};
 use crate::{
-    physical_exprs_contains, ConstExpr, LexOrdering, LexRequirement, PhysicalExpr,
-    PhysicalSortExpr, PhysicalSortRequirement,
+    ConstExpr, LexOrdering, LexRequirement, PhysicalExpr, PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
-use arrow::compute::SortOptions;
-use arrow::datatypes::SchemaRef;
+use arrow::datatypes::{DataType, SchemaRef};
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{plan_err, Constraint, Constraints, HashMap, Result};
+use datafusion_common::{Constraint, Constraints, HashMap, Result, plan_err};
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_physical_expr_common::sort_expr::options_compatible;
 use datafusion_physical_expr_common::utils::ExprPropertiesNode;
 
 use indexmap::IndexSet;
 use itertools::Itertools;
 
-/// `EquivalenceProperties` stores information about the output
-/// of a plan node, that can be used to optimize the plan.
-///
-/// Currently, it keeps track of:
-/// - Sort expressions (orderings)
-/// - Equivalent expressions: expressions that are known to have same value.
-/// - Constants expressions: expressions that are known to contain a single
-///   constant value.
+/// `EquivalenceProperties` stores information about the output of a plan node
+/// that can be used to optimize the plan. Currently, it keeps track of:
+/// - Sort expressions (orderings),
+/// - Equivalent expressions; i.e. expressions known to have the same value.
+/// - Constants expressions; i.e. expressions known to contain a single constant
+///   value.
 ///
 /// Please see the [Using Ordering for Better Plans] blog for more details.
 ///
@@ -81,8 +77,8 @@ use itertools::Itertools;
 /// ```
 ///
 /// In this case, both `a ASC` and `b DESC` can describe the table ordering.
-/// `EquivalenceProperties`, tracks these different valid sort expressions and
-/// treat `a ASC` and `b DESC` on an equal footing. For example if the query
+/// `EquivalenceProperties` tracks these different valid sort expressions and
+/// treat `a ASC` and `b DESC` on an equal footing. For example, if the query
 /// specifies the output sorted by EITHER `a ASC` or `b DESC`, the sort can be
 /// avoided.
 ///
@@ -101,12 +97,11 @@ use itertools::Itertools;
 /// └---┴---┘
 /// ```
 ///
-/// In this case,  columns `a` and `b` always have the same value, which can of
-/// such equivalences inside this object. With this information, Datafusion can
-/// optimize operations such as. For example, if the partition requirement is
-/// `Hash(a)` and output partitioning is `Hash(b)`, then DataFusion avoids
-/// repartitioning the data as the existing partitioning satisfies the
-/// requirement.
+/// In this case,  columns `a` and `b` always have the same value. With this
+/// information, Datafusion can optimize various operations. For example, if
+/// the partition requirement is `Hash(a)` and output partitioning is
+/// `Hash(b)`, then DataFusion avoids repartitioning the data as the existing
+/// partitioning satisfies the requirement.
 ///
 /// # Code Example
 /// ```
@@ -125,57 +120,154 @@ use itertools::Itertools;
 /// # let col_c = col("c", &schema).unwrap();
 /// // This object represents data that is sorted by a ASC, c DESC
 /// // with a single constant value of b
-/// let mut eq_properties = EquivalenceProperties::new(schema)
-///   .with_constants(vec![ConstExpr::from(col_b)]);
-/// eq_properties.add_new_ordering(LexOrdering::new(vec![
-///   PhysicalSortExpr::new_default(col_a).asc(),
-///   PhysicalSortExpr::new_default(col_c).desc(),
-/// ]));
+/// let mut eq_properties = EquivalenceProperties::new(schema);
+/// eq_properties.add_constants(vec![ConstExpr::from(col_b)]);
+/// eq_properties.add_ordering([
+///     PhysicalSortExpr::new_default(col_a).asc(),
+///     PhysicalSortExpr::new_default(col_c).desc(),
+/// ]);
 ///
-/// assert_eq!(eq_properties.to_string(), "order: [[a@0 ASC, c@2 DESC]], const: [b@1(heterogeneous)]")
+/// assert_eq!(
+///     eq_properties.to_string(),
+///     "order: [[a@0 ASC, c@2 DESC]], eq: [{members: [b@1], constant: (heterogeneous)}]"
+/// );
 /// ```
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug)]
 pub struct EquivalenceProperties {
-    /// Distinct equivalence classes (exprs known to have the same expressions)
+    /// Distinct equivalence classes (i.e. expressions with the same value).
     eq_group: EquivalenceGroup,
-    /// Equivalent sort expressions
+    /// Equivalent sort expressions (i.e. those define the same ordering).
     oeq_class: OrderingEquivalenceClass,
-    /// Expressions whose values are constant
-    ///
-    /// TODO: We do not need to track constants separately, they can be tracked
-    ///       inside `eq_group` as `Literal` expressions.
-    constants: Vec<ConstExpr>,
-    /// Table constraints
+    /// Cache storing equivalent sort expressions in normal form (i.e. without
+    /// constants/duplicates and in standard form) and a map associating leading
+    /// terms with full sort expressions.
+    oeq_cache: OrderingEquivalenceCache,
+    /// Table constraints that factor in equivalence calculations.
     constraints: Constraints,
     /// Schema associated with this object.
     schema: SchemaRef,
 }
 
+/// This object serves as a cache for storing equivalent sort expressions
+/// in normal form, and a map associating leading sort expressions with
+/// full lexicographical orderings. With this information, DataFusion can
+/// efficiently determine whether a given ordering is satisfied by the
+/// existing orderings, and discover new orderings based on the existing
+/// equivalence properties.
+#[derive(Clone, Debug, Default)]
+struct OrderingEquivalenceCache {
+    /// Equivalent sort expressions in normal form.
+    normal_cls: OrderingEquivalenceClass,
+    /// Map associating leading sort expressions with full lexicographical
+    /// orderings. Values are indices into `normal_cls`.
+    leading_map: HashMap<Arc<dyn PhysicalExpr>, Vec<usize>>,
+}
+
+impl OrderingEquivalenceCache {
+    /// Creates a new `OrderingEquivalenceCache` object with the given
+    /// equivalent orderings, which should be in normal form.
+    pub fn new(
+        orderings: impl IntoIterator<Item = impl IntoIterator<Item = PhysicalSortExpr>>,
+    ) -> Self {
+        let mut cache = Self {
+            normal_cls: OrderingEquivalenceClass::new(orderings),
+            leading_map: HashMap::new(),
+        };
+        cache.update_map();
+        cache
+    }
+
+    /// Updates/reconstructs the leading expression map according to the normal
+    /// ordering equivalence class within.
+    pub fn update_map(&mut self) {
+        self.leading_map.clear();
+        for (idx, ordering) in self.normal_cls.iter().enumerate() {
+            let expr = Arc::clone(&ordering.first().expr);
+            self.leading_map.entry(expr).or_default().push(idx);
+        }
+    }
+
+    /// Clears the cache, removing all orderings and leading expressions.
+    pub fn clear(&mut self) {
+        self.normal_cls.clear();
+        self.leading_map.clear();
+    }
+}
+
 impl EquivalenceProperties {
+    /// Helper used by the ordering equivalence rule when considering whether a
+    /// cast-bearing expression can replace an existing sort key without invalidating
+    /// the ordering.
+    ///
+    /// This function handles *both* `CastExpr` (generic cast) and
+    /// `CastColumnExpr` (field-aware cast) because the planner may introduce either
+    /// form during rewrite steps; the core logic is the same in both cases.  The
+    /// substitution is only allowed when the cast wraps **the very same child
+    /// expression** that the original sort used (an exact-child-match invariant),
+    /// and the casted type must be a widening/order-preserving conversion
+    /// `CastExpr::check_bigger_cast(...)` ensures.  Without those restrictions the
+    /// existing sort order could be violated (e.g. a narrowing cast could collapse
+    /// distinct values together).
+    fn substitute_cast_like_ordering(
+        r_expr: Arc<dyn PhysicalExpr>,
+        sort_expr: &PhysicalSortExpr,
+        expr_type: &DataType,
+    ) -> Option<PhysicalSortExpr> {
+        let (child_expr, cast_type) = if let Some(cast_expr) =
+            r_expr.as_any().downcast_ref::<CastExpr>()
+        {
+            (cast_expr.expr(), cast_expr.cast_type())
+        } else if let Some(cast_expr) = r_expr.as_any().downcast_ref::<CastColumnExpr>() {
+            (cast_expr.expr(), cast_expr.target_field().data_type())
+        } else {
+            return None;
+        };
+
+        (child_expr.eq(&sort_expr.expr)
+            && CastExpr::check_bigger_cast(cast_type, expr_type))
+        .then(|| PhysicalSortExpr::new(r_expr, sort_expr.options))
+    }
+
     /// Creates an empty `EquivalenceProperties` object.
     pub fn new(schema: SchemaRef) -> Self {
         Self {
-            eq_group: EquivalenceGroup::empty(),
-            oeq_class: OrderingEquivalenceClass::empty(),
-            constants: vec![],
-            constraints: Constraints::empty(),
+            eq_group: EquivalenceGroup::default(),
+            oeq_class: OrderingEquivalenceClass::default(),
+            oeq_cache: OrderingEquivalenceCache::default(),
+            constraints: Constraints::default(),
             schema,
         }
     }
 
     /// Adds constraints to the properties.
-    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+    pub fn set_constraints(&mut self, constraints: Constraints) {
         self.constraints = constraints;
+    }
+
+    /// Adds constraints to the properties.
+    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+        self.set_constraints(constraints);
         self
     }
 
     /// Creates a new `EquivalenceProperties` object with the given orderings.
-    pub fn new_with_orderings(schema: SchemaRef, orderings: &[LexOrdering]) -> Self {
+    pub fn new_with_orderings(
+        schema: SchemaRef,
+        orderings: impl IntoIterator<Item = impl IntoIterator<Item = PhysicalSortExpr>>,
+    ) -> Self {
+        let eq_group = EquivalenceGroup::default();
+        let oeq_class = OrderingEquivalenceClass::new(orderings);
+        // Here, we can avoid performing a full normalization, and get by with
+        // only removing constants because the equivalence group is empty.
+        let normal_orderings = oeq_class.iter().cloned().map(|o| {
+            o.into_iter()
+                .filter(|sort_expr| eq_group.is_expr_constant(&sort_expr.expr).is_none())
+        });
         Self {
-            eq_group: EquivalenceGroup::empty(),
-            oeq_class: OrderingEquivalenceClass::new(orderings.to_vec()),
-            constants: vec![],
-            constraints: Constraints::empty(),
+            oeq_cache: OrderingEquivalenceCache::new(normal_orderings),
+            oeq_class,
+            eq_group,
+            constraints: Constraints::default(),
             schema,
         }
     }
@@ -190,91 +282,131 @@ impl EquivalenceProperties {
         &self.oeq_class
     }
 
-    /// Return the inner OrderingEquivalenceClass, consuming self
-    pub fn into_oeq_class(self) -> OrderingEquivalenceClass {
-        self.oeq_class
-    }
-
     /// Returns a reference to the equivalence group within.
     pub fn eq_group(&self) -> &EquivalenceGroup {
         &self.eq_group
     }
 
-    /// Returns a reference to the constant expressions
-    pub fn constants(&self) -> &[ConstExpr] {
-        &self.constants
-    }
-
+    /// Returns a reference to the constraints within.
     pub fn constraints(&self) -> &Constraints {
         &self.constraints
     }
 
-    /// Returns the output ordering of the properties.
-    pub fn output_ordering(&self) -> Option<LexOrdering> {
-        let constants = self.constants();
-        let mut output_ordering = self.oeq_class().output_ordering().unwrap_or_default();
-        // Prune out constant expressions
-        output_ordering
-            .retain(|sort_expr| !const_exprs_contains(constants, &sort_expr.expr));
-        (!output_ordering.is_empty()).then_some(output_ordering)
+    /// Returns all the known constants expressions.
+    pub fn constants(&self) -> Vec<ConstExpr> {
+        self.eq_group
+            .iter()
+            .flat_map(|c| {
+                c.iter().filter_map(|expr| {
+                    c.constant
+                        .as_ref()
+                        .map(|across| ConstExpr::new(Arc::clone(expr), across.clone()))
+                })
+            })
+            .collect()
     }
 
-    /// Returns the normalized version of the ordering equivalence class within.
-    /// Normalization removes constants and duplicates as well as standardizing
-    /// expressions according to the equivalence group within.
-    pub fn normalized_oeq_class(&self) -> OrderingEquivalenceClass {
-        OrderingEquivalenceClass::new(
-            self.oeq_class
-                .iter()
-                .map(|ordering| self.normalize_sort_exprs(ordering))
-                .collect(),
-        )
+    /// Returns the output ordering of the properties.
+    pub fn output_ordering(&self) -> Option<LexOrdering> {
+        let concat = self.oeq_class.iter().flat_map(|o| o.iter().cloned());
+        self.normalize_sort_exprs(concat)
     }
 
     /// Extends this `EquivalenceProperties` with the `other` object.
-    pub fn extend(mut self, other: Self) -> Self {
-        self.eq_group.extend(other.eq_group);
-        self.oeq_class.extend(other.oeq_class);
-        self.with_constants(other.constants)
+    pub fn extend(mut self, other: Self) -> Result<Self> {
+        self.constraints.extend(other.constraints);
+        self.add_equivalence_group(other.eq_group)?;
+        self.add_orderings(other.oeq_class);
+        Ok(self)
     }
 
     /// Clears (empties) the ordering equivalence class within this object.
     /// Call this method when existing orderings are invalidated.
     pub fn clear_orderings(&mut self) {
         self.oeq_class.clear();
+        self.oeq_cache.clear();
     }
 
     /// Removes constant expressions that may change across partitions.
-    /// This method should be used when data from different partitions are merged.
+    /// This method should be used when merging data from different partitions.
     pub fn clear_per_partition_constants(&mut self) {
-        self.constants.retain(|item| {
-            matches!(item.across_partitions(), AcrossPartitions::Uniform(_))
-        })
-    }
-
-    /// Extends this `EquivalenceProperties` by adding the orderings inside the
-    /// ordering equivalence class `other`.
-    pub fn add_ordering_equivalence_class(&mut self, other: OrderingEquivalenceClass) {
-        self.oeq_class.extend(other);
+        if self.eq_group.clear_per_partition_constants() {
+            // Renormalize orderings if the equivalence group changes:
+            let normal_orderings = self
+                .oeq_class
+                .iter()
+                .cloned()
+                .map(|o| self.eq_group.normalize_sort_exprs(o));
+            self.oeq_cache = OrderingEquivalenceCache::new(normal_orderings);
+        }
     }
 
     /// Adds new orderings into the existing ordering equivalence class.
-    pub fn add_new_orderings(
+    pub fn add_orderings(
         &mut self,
-        orderings: impl IntoIterator<Item = LexOrdering>,
+        orderings: impl IntoIterator<Item = impl IntoIterator<Item = PhysicalSortExpr>>,
     ) {
-        self.oeq_class.add_new_orderings(orderings);
+        let orderings: Vec<_> =
+            orderings.into_iter().filter_map(LexOrdering::new).collect();
+        let normal_orderings: Vec<_> = orderings
+            .iter()
+            .cloned()
+            .filter_map(|o| self.normalize_sort_exprs(o))
+            .collect();
+        if !normal_orderings.is_empty() {
+            self.oeq_class.extend(orderings);
+            // Normalize given orderings to update the cache:
+            self.oeq_cache.normal_cls.extend(normal_orderings);
+            // TODO: If no ordering is found to be redundant during extension, we
+            //       can use a shortcut algorithm to update the leading map.
+            self.oeq_cache.update_map();
+        }
     }
 
     /// Adds a single ordering to the existing ordering equivalence class.
-    pub fn add_new_ordering(&mut self, ordering: LexOrdering) {
-        self.add_new_orderings([ordering]);
+    pub fn add_ordering(&mut self, ordering: impl IntoIterator<Item = PhysicalSortExpr>) {
+        self.add_orderings(std::iter::once(ordering));
+    }
+
+    fn update_oeq_cache(&mut self) -> Result<()> {
+        // Renormalize orderings if the equivalence group changes:
+        let normal_cls = mem::take(&mut self.oeq_cache.normal_cls);
+        let normal_orderings = normal_cls
+            .into_iter()
+            .map(|o| self.eq_group.normalize_sort_exprs(o));
+        self.oeq_cache.normal_cls = OrderingEquivalenceClass::new(normal_orderings);
+        self.oeq_cache.update_map();
+        // Discover any new orderings based on the new equivalence classes:
+        let leading_exprs: Vec<_> = self.oeq_cache.leading_map.keys().cloned().collect();
+        for expr in leading_exprs {
+            self.discover_new_orderings(expr)?;
+        }
+        Ok(())
     }
 
     /// Incorporates the given equivalence group to into the existing
     /// equivalence group within.
-    pub fn add_equivalence_group(&mut self, other_eq_group: EquivalenceGroup) {
-        self.eq_group.extend(other_eq_group);
+    pub fn add_equivalence_group(
+        &mut self,
+        other_eq_group: EquivalenceGroup,
+    ) -> Result<()> {
+        if !other_eq_group.is_empty() {
+            self.eq_group.extend(other_eq_group);
+            self.update_oeq_cache()?;
+        }
+        Ok(())
+    }
+
+    /// Returns the ordering equivalence class within in normal form.
+    /// Normalization standardizes expressions according to the equivalence
+    /// group within, and removes constants/duplicates.
+    pub fn normalized_oeq_class(&self) -> OrderingEquivalenceClass {
+        self.oeq_class
+            .iter()
+            .cloned()
+            .filter_map(|ordering| self.normalize_sort_exprs(ordering))
+            .collect::<Vec<_>>()
+            .into()
     }
 
     /// Adds a new equality condition into the existing equivalence group.
@@ -282,290 +414,229 @@ impl EquivalenceProperties {
     /// equivalence class to the equivalence group.
     pub fn add_equal_conditions(
         &mut self,
-        left: &Arc<dyn PhysicalExpr>,
-        right: &Arc<dyn PhysicalExpr>,
+        left: Arc<dyn PhysicalExpr>,
+        right: Arc<dyn PhysicalExpr>,
     ) -> Result<()> {
-        // Discover new constants in light of new the equality:
-        if self.is_expr_constant(left) {
-            // Left expression is constant, add right as constant
-            if !const_exprs_contains(&self.constants, right) {
-                let const_expr = ConstExpr::from(right)
-                    .with_across_partitions(self.get_expr_constant_value(left));
-                self.constants.push(const_expr);
-            }
-        } else if self.is_expr_constant(right) {
-            // Right expression is constant, add left as constant
-            if !const_exprs_contains(&self.constants, left) {
-                let const_expr = ConstExpr::from(left)
-                    .with_across_partitions(self.get_expr_constant_value(right));
-                self.constants.push(const_expr);
-            }
+        // Add equal expressions to the state:
+        if self.eq_group.add_equal_conditions(left, right) {
+            self.update_oeq_cache()?;
         }
-
-        // Add equal expressions to the state
-        self.eq_group.add_equal_conditions(left, right);
-
-        // Discover any new orderings
-        self.discover_new_orderings(left)?;
+        self.update_oeq_cache()?;
         Ok(())
     }
 
     /// Track/register physical expressions with constant values.
-    #[deprecated(since = "43.0.0", note = "Use [`with_constants`] instead")]
-    pub fn add_constants(self, constants: impl IntoIterator<Item = ConstExpr>) -> Self {
-        self.with_constants(constants)
-    }
-
-    /// Remove the specified constant
-    pub fn remove_constant(mut self, c: &ConstExpr) -> Self {
-        self.constants.retain(|existing| existing != c);
-        self
-    }
-
-    /// Track/register physical expressions with constant values.
-    pub fn with_constants(
-        mut self,
+    pub fn add_constants(
+        &mut self,
         constants: impl IntoIterator<Item = ConstExpr>,
-    ) -> Self {
-        let normalized_constants = constants
-            .into_iter()
-            .filter_map(|c| {
-                let across_partitions = c.across_partitions();
-                let expr = c.owned_expr();
-                let normalized_expr = self.eq_group.normalize_expr(expr);
-
-                if const_exprs_contains(&self.constants, &normalized_expr) {
-                    return None;
-                }
-
-                let const_expr = ConstExpr::from(normalized_expr)
-                    .with_across_partitions(across_partitions);
-
-                Some(const_expr)
+    ) -> Result<()> {
+        // Add the new constant to the equivalence group:
+        for constant in constants {
+            self.eq_group.add_constant(constant);
+        }
+        // Renormalize the orderings after adding new constants by removing
+        // the constants from existing orderings:
+        let normal_cls = mem::take(&mut self.oeq_cache.normal_cls);
+        let normal_orderings = normal_cls.into_iter().map(|ordering| {
+            ordering.into_iter().filter(|sort_expr| {
+                self.eq_group.is_expr_constant(&sort_expr.expr).is_none()
             })
-            .collect::<Vec<_>>();
-
-        // Add all new normalized constants
-        self.constants.extend(normalized_constants);
-
-        // Discover any new orderings based on the constants
-        for ordering in self.normalized_oeq_class().iter() {
-            if let Err(e) = self.discover_new_orderings(&ordering[0].expr) {
-                log::debug!("error discovering new orderings: {e}");
-            }
+        });
+        self.oeq_cache.normal_cls = OrderingEquivalenceClass::new(normal_orderings);
+        self.oeq_cache.update_map();
+        // Discover any new orderings based on the constants:
+        let leading_exprs: Vec<_> = self.oeq_cache.leading_map.keys().cloned().collect();
+        for expr in leading_exprs {
+            self.discover_new_orderings(expr)?;
         }
-
-        self
+        Ok(())
     }
 
-    // Discover new valid orderings in light of a new equality.
-    // Accepts a single argument (`expr`) which is used to determine
-    // which orderings should be updated.
-    // When constants or equivalence classes are changed, there may be new orderings
-    // that can be discovered with the new equivalence properties.
-    // For a discussion, see: https://github.com/apache/datafusion/issues/9812
-    fn discover_new_orderings(&mut self, expr: &Arc<dyn PhysicalExpr>) -> Result<()> {
-        let normalized_expr = self.eq_group().normalize_expr(Arc::clone(expr));
+    /// Discover new valid orderings in light of a new equality. Accepts a single
+    /// argument (`expr`) which is used to determine the orderings to update.
+    /// When constants or equivalence classes change, there may be new orderings
+    /// that can be discovered with the new equivalence properties.
+    /// For a discussion, see: <https://github.com/apache/datafusion/issues/9812>
+    fn discover_new_orderings(
+        &mut self,
+        normal_expr: Arc<dyn PhysicalExpr>,
+    ) -> Result<()> {
+        let Some(ordering_idxs) = self.oeq_cache.leading_map.get(&normal_expr) else {
+            return Ok(());
+        };
         let eq_class = self
             .eq_group
-            .iter()
-            .find_map(|class| {
-                class
-                    .contains(&normalized_expr)
-                    .then(|| class.clone().into_vec())
-            })
-            .unwrap_or_else(|| vec![Arc::clone(&normalized_expr)]);
-
-        let mut new_orderings: Vec<LexOrdering> = vec![];
-        for ordering in self.normalized_oeq_class().iter() {
-            if !ordering[0].expr.eq(&normalized_expr) {
-                continue;
-            }
+            .get_equivalence_class(&normal_expr)
+            .map_or_else(|| vec![normal_expr], |class| class.clone().into());
 
+        let mut new_orderings = vec![];
+        for idx in ordering_idxs {
+            let ordering = &self.oeq_cache.normal_cls[*idx];
             let leading_ordering_options = ordering[0].options;
 
-            for equivalent_expr in &eq_class {
+            'exprs: for equivalent_expr in &eq_class {
                 let children = equivalent_expr.children();
                 if children.is_empty() {
                     continue;
                 }
-
-                // Check if all children match the next expressions in the ordering
-                let mut all_children_match = true;
+                // Check if all children match the next expressions in the ordering:
                 let mut child_properties = vec![];
-
-                // Build properties for each child based on the next expressions
-                for (i, child) in children.iter().enumerate() {
-                    if let Some(next) = ordering.get(i + 1) {
-                        if !child.as_ref().eq(next.expr.as_ref()) {
-                            all_children_match = false;
-                            break;
-                        }
-                        child_properties.push(ExprProperties {
-                            sort_properties: SortProperties::Ordered(next.options),
-                            range: Interval::make_unbounded(
-                                &child.data_type(&self.schema)?,
-                            )?,
-                            preserves_lex_ordering: true,
-                        });
-                    } else {
-                        all_children_match = false;
-                        break;
+                // Build properties for each child based on the next expression:
+                for (i, child) in children.into_iter().enumerate() {
+                    let Some(next) = ordering.get(i + 1) else {
+                        break 'exprs;
+                    };
+                    if !next.expr.eq(child) {
+                        break 'exprs;
                     }
+                    let data_type = child.data_type(&self.schema)?;
+                    child_properties.push(ExprProperties {
+                        sort_properties: SortProperties::Ordered(next.options),
+                        range: Interval::make_unbounded(&data_type)?,
+                        preserves_lex_ordering: true,
+                    });
                 }
-
-                if all_children_match {
-                    // Check if the expression is monotonic in all arguments
-                    if let Ok(expr_properties) =
-                        equivalent_expr.get_properties(&child_properties)
-                    {
-                        if expr_properties.preserves_lex_ordering
-                            && SortProperties::Ordered(leading_ordering_options)
-                                == expr_properties.sort_properties
-                        {
-                            // Assume existing ordering is [c ASC, a ASC, b ASC]
-                            // When equality c = f(a,b) is given, if we know that given ordering `[a ASC, b ASC]`,
-                            // ordering `[f(a,b) ASC]` is valid, then we can deduce that ordering `[a ASC, b ASC]` is also valid.
-                            // Hence, ordering `[a ASC, b ASC]` can be added to the state as a valid ordering.
-                            // (e.g. existing ordering where leading ordering is removed)
-                            new_orderings.push(LexOrdering::new(ordering[1..].to_vec()));
-                            break;
-                        }
-                    }
+                // Check if the expression is monotonic in all arguments:
+                let expr_properties =
+                    equivalent_expr.get_properties(&child_properties)?;
+                if expr_properties.preserves_lex_ordering
+                    && expr_properties.sort_properties
+                        == SortProperties::Ordered(leading_ordering_options)
+                {
+                    // Assume that `[c ASC, a ASC, b ASC]` is among existing
+                    // orderings. If equality `c = f(a, b)` is given, ordering
+                    // `[a ASC, b ASC]` implies the ordering `[c ASC]`. Thus,
+                    // ordering `[a ASC, b ASC]` is also a valid ordering.
+                    new_orderings.push(ordering[1..].to_vec());
+                    break;
                 }
             }
         }
 
-        self.oeq_class.add_new_orderings(new_orderings);
-        Ok(())
-    }
-
-    /// Updates the ordering equivalence group within assuming that the table
-    /// is re-sorted according to the argument `sort_exprs`. Note that constants
-    /// and equivalence classes are unchanged as they are unaffected by a re-sort.
-    /// If the given ordering is already satisfied, the function does nothing.
-    pub fn with_reorder(mut self, sort_exprs: LexOrdering) -> Self {
-        // Filter out constant expressions as they don't affect ordering
-        let filtered_exprs = LexOrdering::new(
-            sort_exprs
-                .into_iter()
-                .filter(|expr| !self.is_expr_constant(&expr.expr))
-                .collect(),
-        );
-
-        if filtered_exprs.is_empty() {
-            return self;
-        }
-
-        let mut new_orderings = vec![filtered_exprs.clone()];
-
-        // Preserve valid suffixes from existing orderings
-        let oeq_class = mem::take(&mut self.oeq_class);
-        for existing in oeq_class {
-            if self.is_prefix_of(&filtered_exprs, &existing) {
-                let mut extended = filtered_exprs.clone();
-                extended.extend(existing.into_iter().skip(filtered_exprs.len()));
-                new_orderings.push(extended);
-            }
+        if !new_orderings.is_empty() {
+            self.add_orderings(new_orderings);
         }
-
-        self.oeq_class = OrderingEquivalenceClass::new(new_orderings);
-        self
+        Ok(())
     }
 
-    /// Checks if the new ordering matches a prefix of the existing ordering
-    /// (considering expression equivalences)
-    fn is_prefix_of(&self, new_order: &LexOrdering, existing: &LexOrdering) -> bool {
-        // Check if new order is longer than existing - can't be a prefix
-        if new_order.len() > existing.len() {
-            return false;
+    /// Updates the ordering equivalence class within assuming that the table
+    /// is re-sorted according to the argument `ordering`, and returns whether
+    /// this operation resulted in any change. Note that equivalence classes
+    /// (and constants) do not change as they are unaffected by a re-sort. If
+    /// the given ordering is already satisfied, the function does nothing.
+    pub fn reorder(
+        &mut self,
+        ordering: impl IntoIterator<Item = PhysicalSortExpr>,
+    ) -> Result<bool> {
+        let (ordering, ordering_tee) = ordering.into_iter().tee();
+        // First, standardize the given ordering:
+        let Some(normal_ordering) = self.normalize_sort_exprs(ordering) else {
+            // If the ordering vanishes after normalization, it is satisfied:
+            return Ok(false);
+        };
+        if normal_ordering.len() != self.common_sort_prefix_length(&normal_ordering)? {
+            // If the ordering is unsatisfied, replace existing orderings:
+            self.clear_orderings();
+            self.add_ordering(ordering_tee);
+            return Ok(true);
         }
-
-        // Check if new order matches existing prefix (considering equivalences)
-        new_order.iter().zip(existing).all(|(new, existing)| {
-            self.eq_group.exprs_equal(&new.expr, &existing.expr)
-                && new.options == existing.options
-        })
+        Ok(false)
     }
 
     /// Normalizes the given sort expressions (i.e. `sort_exprs`) using the
-    /// equivalence group and the ordering equivalence class within.
-    ///
-    /// Assume that `self.eq_group` states column `a` and `b` are aliases.
-    /// Also assume that `self.oeq_class` states orderings `d ASC` and `a ASC, c ASC`
-    /// are equivalent (in the sense that both describe the ordering of the table).
-    /// If the `sort_exprs` argument were `vec![b ASC, c ASC, a ASC]`, then this
-    /// function would return `vec![a ASC, c ASC]`. Internally, it would first
-    /// normalize to `vec![a ASC, c ASC, a ASC]` and end up with the final result
-    /// after deduplication.
-    fn normalize_sort_exprs(&self, sort_exprs: &LexOrdering) -> LexOrdering {
-        // Convert sort expressions to sort requirements:
-        let sort_reqs = LexRequirement::from(sort_exprs.clone());
-        // Normalize the requirements:
-        let normalized_sort_reqs = self.normalize_sort_requirements(&sort_reqs);
-        // Convert sort requirements back to sort expressions:
-        LexOrdering::from(normalized_sort_reqs)
+    /// equivalence group within. Returns a `LexOrdering` instance if the
+    /// expressions define a proper lexicographical ordering. For more details,
+    /// see [`EquivalenceGroup::normalize_sort_exprs`].
+    pub fn normalize_sort_exprs(
+        &self,
+        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    ) -> Option<LexOrdering> {
+        LexOrdering::new(self.eq_group.normalize_sort_exprs(sort_exprs))
     }
 
     /// Normalizes the given sort requirements (i.e. `sort_reqs`) using the
-    /// equivalence group and the ordering equivalence class within. It works by:
-    /// - Removing expressions that have a constant value from the given requirement.
-    /// - Replacing sections that belong to some equivalence class in the equivalence
-    ///   group with the first entry in the matching equivalence class.
-    ///
-    /// Assume that `self.eq_group` states column `a` and `b` are aliases.
-    /// Also assume that `self.oeq_class` states orderings `d ASC` and `a ASC, c ASC`
-    /// are equivalent (in the sense that both describe the ordering of the table).
-    /// If the `sort_reqs` argument were `vec![b ASC, c ASC, a ASC]`, then this
-    /// function would return `vec![a ASC, c ASC]`. Internally, it would first
-    /// normalize to `vec![a ASC, c ASC, a ASC]` and end up with the final result
-    /// after deduplication.
-    fn normalize_sort_requirements(&self, sort_reqs: &LexRequirement) -> LexRequirement {
-        let normalized_sort_reqs = self.eq_group.normalize_sort_requirements(sort_reqs);
-        let mut constant_exprs = vec![];
-        constant_exprs.extend(
-            self.constants
-                .iter()
-                .map(|const_expr| Arc::clone(const_expr.expr())),
-        );
-        let constants_normalized = self.eq_group.normalize_exprs(constant_exprs);
-        // Prune redundant sections in the requirement:
-        normalized_sort_reqs
-            .iter()
-            .filter(|&order| !physical_exprs_contains(&constants_normalized, &order.expr))
-            .cloned()
-            .collect::<LexRequirement>()
-            .collapse()
+    /// equivalence group within. Returns a `LexRequirement` instance if the
+    /// expressions define a proper lexicographical requirement. For more
+    /// details, see [`EquivalenceGroup::normalize_sort_exprs`].
+    pub fn normalize_sort_requirements(
+        &self,
+        sort_reqs: impl IntoIterator<Item = PhysicalSortRequirement>,
+    ) -> Option<LexRequirement> {
+        LexRequirement::new(self.eq_group.normalize_sort_requirements(sort_reqs))
     }
 
-    /// Checks whether the given ordering is satisfied by any of the existing
-    /// orderings.
-    pub fn ordering_satisfy(&self, given: &LexOrdering) -> bool {
-        // Convert the given sort expressions to sort requirements:
-        let sort_requirements = LexRequirement::from(given.clone());
-        self.ordering_satisfy_requirement(&sort_requirements)
+    /// Iteratively checks whether the given ordering is satisfied by any of
+    /// the existing orderings. See [`Self::ordering_satisfy_requirement`] for
+    /// more details and examples.
+    pub fn ordering_satisfy(
+        &self,
+        given: impl IntoIterator<Item = PhysicalSortExpr>,
+    ) -> Result<bool> {
+        // First, standardize the given ordering:
+        let Some(normal_ordering) = self.normalize_sort_exprs(given) else {
+            // If the ordering vanishes after normalization, it is satisfied:
+            return Ok(true);
+        };
+        Ok(normal_ordering.len() == self.common_sort_prefix_length(&normal_ordering)?)
     }
 
-    /// Returns the number of consecutive requirements (starting from the left)
-    /// that are satisfied by the plan ordering.
-    fn compute_common_sort_prefix_length(
+    /// Iteratively checks whether the given sort requirement is satisfied by
+    /// any of the existing orderings.
+    ///
+    /// ### Example Scenarios
+    ///
+    /// In these scenarios, assume that all expressions share the same sort
+    /// properties.
+    ///
+    /// #### Case 1: Sort Requirement `[a, c]`
+    ///
+    /// **Existing orderings:** `[[a, b, c], [a, d]]`, **constants:** `[]`
+    /// 1. The function first checks the leading requirement `a`, which is
+    ///    satisfied by `[a, b, c].first()`.
+    /// 2. `a` is added as a constant for the next iteration.
+    /// 3. Normal orderings become `[[b, c], [d]]`.
+    /// 4. The function fails for `c` in the second iteration, as neither
+    ///    `[b, c]` nor `[d]` satisfies `c`.
+    ///
+    /// #### Case 2: Sort Requirement `[a, d]`
+    ///
+    /// **Existing orderings:** `[[a, b, c], [a, d]]`, **constants:** `[]`
+    /// 1. The function first checks the leading requirement `a`, which is
+    ///    satisfied by `[a, b, c].first()`.
+    /// 2. `a` is added as a constant for the next iteration.
+    /// 3. Normal orderings become `[[b, c], [d]]`.
+    /// 4. The function returns `true` as `[d]` satisfies `d`.
+    pub fn ordering_satisfy_requirement(
         &self,
-        normalized_reqs: &LexRequirement,
-    ) -> usize {
-        // Check whether given ordering is satisfied by constraints first
-        if self.satisfied_by_constraints(normalized_reqs) {
-            // If the constraints satisfy all requirements, return the full normalized requirements length
-            return normalized_reqs.len();
+        given: impl IntoIterator<Item = PhysicalSortRequirement>,
+    ) -> Result<bool> {
+        // First, standardize the given requirement:
+        let Some(normal_reqs) = self.normalize_sort_requirements(given) else {
+            // If the requirement vanishes after normalization, it is satisfied:
+            return Ok(true);
+        };
+        // Then, check whether given requirement is satisfied by constraints:
+        if self.satisfied_by_constraints(&normal_reqs) {
+            return Ok(true);
         }
-
+        let schema = self.schema();
         let mut eq_properties = self.clone();
-
-        for (i, normalized_req) in normalized_reqs.iter().enumerate() {
-            // Check whether given ordering is satisfied
-            if !eq_properties.ordering_satisfy_single(normalized_req) {
-                // As soon as one requirement is not satisfied, return
-                // how many we've satisfied so far
-                return i;
+        for element in normal_reqs {
+            // Check whether given requirement is satisfied:
+            let ExprProperties {
+                sort_properties, ..
+            } = eq_properties.get_expr_properties(Arc::clone(&element.expr));
+            let satisfy = match sort_properties {
+                SortProperties::Ordered(options) => element.options.is_none_or(|opts| {
+                    let nullable = element.expr.nullable(schema).unwrap_or(true);
+                    options_compatible(&options, &opts, nullable)
+                }),
+                // Singleton expressions satisfy any requirement.
+                SortProperties::Singleton => true,
+                SortProperties::Unordered => false,
+            };
+            if !satisfy {
+                return Ok(false);
             }
             // Treat satisfied keys as constants in subsequent iterations. We
             // can do this because the "next" key only matters in a lexicographical
@@ -579,288 +650,253 @@ impl EquivalenceProperties {
             // From the analysis above, we know that `[a ASC]` is satisfied. Then,
             // we add column `a` as constant to the algorithm state. This enables us
             // to deduce that `(b + c) ASC` is satisfied, given `a` is constant.
-            eq_properties = eq_properties.with_constants(std::iter::once(
-                ConstExpr::from(Arc::clone(&normalized_req.expr)),
-            ));
+            let const_expr = ConstExpr::from(element.expr);
+            eq_properties.add_constants(std::iter::once(const_expr))?;
         }
+        Ok(true)
+    }
 
-        // All requirements are satisfied.
-        normalized_reqs.len()
+    /// Returns the number of consecutive sort expressions (starting from the
+    /// left) that are satisfied by the existing ordering.
+    fn common_sort_prefix_length(&self, normal_ordering: &LexOrdering) -> Result<usize> {
+        let full_length = normal_ordering.len();
+        // Check whether the given ordering is satisfied by constraints:
+        if self.satisfied_by_constraints_ordering(normal_ordering) {
+            // If constraints satisfy all sort expressions, return the full
+            // length:
+            return Ok(full_length);
+        }
+        let schema = self.schema();
+        let mut eq_properties = self.clone();
+        for (idx, element) in normal_ordering.into_iter().enumerate() {
+            // Check whether given ordering is satisfied:
+            let ExprProperties {
+                sort_properties, ..
+            } = eq_properties.get_expr_properties(Arc::clone(&element.expr));
+            let satisfy = match sort_properties {
+                SortProperties::Ordered(options) => options_compatible(
+                    &options,
+                    &element.options,
+                    element.expr.nullable(schema).unwrap_or(true),
+                ),
+                // Singleton expressions satisfy any ordering.
+                SortProperties::Singleton => true,
+                SortProperties::Unordered => false,
+            };
+            if !satisfy {
+                // As soon as one sort expression is unsatisfied, return how
+                // many we've satisfied so far:
+                return Ok(idx);
+            }
+            // Treat satisfied keys as constants in subsequent iterations. We
+            // can do this because the "next" key only matters in a lexicographical
+            // ordering when the keys to its left have the same values.
+            //
+            // Note that these expressions are not properly "constants". This is just
+            // an implementation strategy confined to this function.
+            //
+            // For example, assume that the requirement is `[a ASC, (b + c) ASC]`,
+            // and existing equivalent orderings are `[a ASC, b ASC]` and `[c ASC]`.
+            // From the analysis above, we know that `[a ASC]` is satisfied. Then,
+            // we add column `a` as constant to the algorithm state. This enables us
+            // to deduce that `(b + c) ASC` is satisfied, given `a` is constant.
+            let const_expr = ConstExpr::from(Arc::clone(&element.expr));
+            eq_properties.add_constants(std::iter::once(const_expr))?
+        }
+        // All sort expressions are satisfied, return full length:
+        Ok(full_length)
     }
 
-    /// Determines the longest prefix of `reqs` that is satisfied by the existing ordering.
-    /// Returns that prefix as a new `LexRequirement`, and a boolean indicating if all the requirements are satisfied.
+    /// Determines the longest normal prefix of `ordering` satisfied by the
+    /// existing ordering. Returns that prefix as a new `LexOrdering`, and a
+    /// boolean indicating whether all the sort expressions are satisfied.
     pub fn extract_common_sort_prefix(
         &self,
-        reqs: &LexRequirement,
-    ) -> (LexRequirement, bool) {
-        // First, standardize the given requirement:
-        let normalized_reqs = self.normalize_sort_requirements(reqs);
-
-        let prefix_len = self.compute_common_sort_prefix_length(&normalized_reqs);
-        (
-            LexRequirement::new(normalized_reqs[..prefix_len].to_vec()),
-            prefix_len == normalized_reqs.len(),
-        )
-    }
-
-    /// Checks whether the given sort requirements are satisfied by any of the
-    /// existing orderings.
-    pub fn ordering_satisfy_requirement(&self, reqs: &LexRequirement) -> bool {
-        self.extract_common_sort_prefix(reqs).1
+        ordering: LexOrdering,
+    ) -> Result<(Vec<PhysicalSortExpr>, bool)> {
+        // First, standardize the given ordering:
+        let Some(normal_ordering) = self.normalize_sort_exprs(ordering) else {
+            // If the ordering vanishes after normalization, it is satisfied:
+            return Ok((vec![], true));
+        };
+        let prefix_len = self.common_sort_prefix_length(&normal_ordering)?;
+        let flag = prefix_len == normal_ordering.len();
+        let mut sort_exprs: Vec<_> = normal_ordering.into();
+        if !flag {
+            sort_exprs.truncate(prefix_len);
+        }
+        Ok((sort_exprs, flag))
     }
 
-    /// Checks if the sort requirements are satisfied by any of the table constraints (primary key or unique).
-    /// Returns true if any constraint fully satisfies the requirements.
-    fn satisfied_by_constraints(
+    /// Checks if the sort expressions are satisfied by any of the table
+    /// constraints (primary key or unique). Returns true if any constraint
+    /// fully satisfies the expressions (i.e. constraint indices form a valid
+    /// prefix of an existing ordering that matches the expressions). For
+    /// unique constraints, also verifies nullable columns.
+    fn satisfied_by_constraints_ordering(
         &self,
-        normalized_reqs: &[PhysicalSortRequirement],
+        normal_exprs: &[PhysicalSortExpr],
     ) -> bool {
         self.constraints.iter().any(|constraint| match constraint {
-            Constraint::PrimaryKey(indices) | Constraint::Unique(indices) => self
-                .satisfied_by_constraint(
-                    normalized_reqs,
-                    indices,
-                    matches!(constraint, Constraint::Unique(_)),
-                ),
-        })
-    }
-
-    /// Checks if sort requirements are satisfied by a constraint (primary key or unique).
-    /// Returns true if the constraint indices form a valid prefix of an existing ordering
-    /// that matches the requirements. For unique constraints, also verifies nullable columns.
-    fn satisfied_by_constraint(
-        &self,
-        normalized_reqs: &[PhysicalSortRequirement],
-        indices: &[usize],
-        check_null: bool,
-    ) -> bool {
-        // Requirements must contain indices
-        if indices.len() > normalized_reqs.len() {
-            return false;
-        }
-
-        // Iterate over all orderings
-        self.oeq_class.iter().any(|ordering| {
-            if indices.len() > ordering.len() {
-                return false;
-            }
-
-            // Build a map of column positions in the ordering
-            let mut col_positions = HashMap::with_capacity(ordering.len());
-            for (pos, req) in ordering.iter().enumerate() {
-                if let Some(col) = req.expr.as_any().downcast_ref::<Column>() {
-                    col_positions.insert(
-                        col.index(),
-                        (pos, col.nullable(&self.schema).unwrap_or(true)),
-                    );
-                }
-            }
-
-            // Check if all constraint indices appear in valid positions
-            if !indices.iter().all(|&idx| {
-                col_positions
-                    .get(&idx)
-                    .map(|&(pos, nullable)| {
-                        // For unique constraints, verify column is not nullable if it's first/last
-                        !check_null
-                            || (pos != 0 && pos != ordering.len() - 1)
-                            || !nullable
+            Constraint::PrimaryKey(indices) | Constraint::Unique(indices) => {
+                let check_null = matches!(constraint, Constraint::Unique(_));
+                let normalized_size = normal_exprs.len();
+                indices.len() <= normalized_size
+                    && self.oeq_class.iter().any(|ordering| {
+                        let length = ordering.len();
+                        if indices.len() > length || normalized_size < length {
+                            return false;
+                        }
+                        // Build a map of column positions in the ordering:
+                        let mut col_positions = HashMap::with_capacity(length);
+                        for (pos, req) in ordering.iter().enumerate() {
+                            if let Some(col) = req.expr.as_any().downcast_ref::<Column>()
+                            {
+                                let nullable = col.nullable(&self.schema).unwrap_or(true);
+                                col_positions.insert(col.index(), (pos, nullable));
+                            }
+                        }
+                        // Check if all constraint indices appear in valid positions:
+                        if !indices.iter().all(|idx| {
+                            col_positions.get(idx).is_some_and(|&(pos, nullable)| {
+                                // For unique constraints, verify column is not nullable if it's first/last:
+                                !check_null
+                                    || !nullable
+                                    || (pos != 0 && pos != length - 1)
+                            })
+                        }) {
+                            return false;
+                        }
+                        // Check if this ordering matches the prefix:
+                        normal_exprs.iter().zip(ordering).all(|(given, existing)| {
+                            existing.satisfy_expr(given, &self.schema)
+                        })
                     })
-                    .unwrap_or(false)
-            }) {
-                return false;
             }
-
-            // Check if this ordering matches requirements prefix
-            let ordering_len = ordering.len();
-            normalized_reqs.len() >= ordering_len
-                && normalized_reqs[..ordering_len].iter().zip(ordering).all(
-                    |(req, existing)| {
-                        req.expr.eq(&existing.expr)
-                            && req
-                                .options
-                                .is_none_or(|req_opts| req_opts == existing.options)
-                    },
-                )
         })
     }
 
-    /// Determines whether the ordering specified by the given sort requirement
-    /// is satisfied based on the orderings within, equivalence classes, and
-    /// constant expressions.
-    ///
-    /// # Parameters
-    ///
-    /// - `req`: A reference to a `PhysicalSortRequirement` for which the ordering
-    ///   satisfaction check will be done.
-    ///
-    /// # Returns
-    ///
-    /// Returns `true` if the specified ordering is satisfied, `false` otherwise.
-    fn ordering_satisfy_single(&self, req: &PhysicalSortRequirement) -> bool {
-        let ExprProperties {
-            sort_properties, ..
-        } = self.get_expr_properties(Arc::clone(&req.expr));
-        match sort_properties {
-            SortProperties::Ordered(options) => {
-                let sort_expr = PhysicalSortExpr {
-                    expr: Arc::clone(&req.expr),
-                    options,
-                };
-                sort_expr.satisfy(req, self.schema())
+    /// Checks if the sort requirements are satisfied by any of the table
+    /// constraints (primary key or unique). Returns true if any constraint
+    /// fully satisfies the requirements (i.e. constraint indices form a valid
+    /// prefix of an existing ordering that matches the requirements). For
+    /// unique constraints, also verifies nullable columns.
+    fn satisfied_by_constraints(&self, normal_reqs: &[PhysicalSortRequirement]) -> bool {
+        self.constraints.iter().any(|constraint| match constraint {
+            Constraint::PrimaryKey(indices) | Constraint::Unique(indices) => {
+                let check_null = matches!(constraint, Constraint::Unique(_));
+                let normalized_size = normal_reqs.len();
+                indices.len() <= normalized_size
+                    && self.oeq_class.iter().any(|ordering| {
+                        let length = ordering.len();
+                        if indices.len() > length || normalized_size < length {
+                            return false;
+                        }
+                        // Build a map of column positions in the ordering:
+                        let mut col_positions = HashMap::with_capacity(length);
+                        for (pos, req) in ordering.iter().enumerate() {
+                            if let Some(col) = req.expr.as_any().downcast_ref::<Column>()
+                            {
+                                let nullable = col.nullable(&self.schema).unwrap_or(true);
+                                col_positions.insert(col.index(), (pos, nullable));
+                            }
+                        }
+                        // Check if all constraint indices appear in valid positions:
+                        if !indices.iter().all(|idx| {
+                            col_positions.get(idx).is_some_and(|&(pos, nullable)| {
+                                // For unique constraints, verify column is not nullable if it's first/last:
+                                !check_null
+                                    || !nullable
+                                    || (pos != 0 && pos != length - 1)
+                            })
+                        }) {
+                            return false;
+                        }
+                        // Check if this ordering matches the prefix:
+                        normal_reqs.iter().zip(ordering).all(|(given, existing)| {
+                            existing.satisfy(given, &self.schema)
+                        })
+                    })
             }
-            // Singleton expressions satisfies any ordering.
-            SortProperties::Singleton => true,
-            SortProperties::Unordered => false,
-        }
+        })
     }
 
     /// Checks whether the `given` sort requirements are equal or more specific
     /// than the `reference` sort requirements.
     pub fn requirements_compatible(
         &self,
-        given: &LexRequirement,
-        reference: &LexRequirement,
+        given: LexRequirement,
+        reference: LexRequirement,
     ) -> bool {
-        let normalized_given = self.normalize_sort_requirements(given);
-        let normalized_reference = self.normalize_sort_requirements(reference);
-
-        (normalized_reference.len() <= normalized_given.len())
-            && normalized_reference
+        let Some(normal_given) = self.normalize_sort_requirements(given) else {
+            return true;
+        };
+        let Some(normal_reference) = self.normalize_sort_requirements(reference) else {
+            return true;
+        };
+
+        (normal_reference.len() <= normal_given.len())
+            && normal_reference
                 .into_iter()
-                .zip(normalized_given)
+                .zip(normal_given)
                 .all(|(reference, given)| given.compatible(&reference))
     }
 
-    /// Returns the finer ordering among the orderings `lhs` and `rhs`, breaking
-    /// any ties by choosing `lhs`.
-    ///
-    /// The finer ordering is the ordering that satisfies both of the orderings.
-    /// If the orderings are incomparable, returns `None`.
-    ///
-    /// For example, the finer ordering among `[a ASC]` and `[a ASC, b ASC]` is
-    /// the latter.
-    pub fn get_finer_ordering(
-        &self,
-        lhs: &LexOrdering,
-        rhs: &LexOrdering,
-    ) -> Option<LexOrdering> {
-        // Convert the given sort expressions to sort requirements:
-        let lhs = LexRequirement::from(lhs.clone());
-        let rhs = LexRequirement::from(rhs.clone());
-        let finer = self.get_finer_requirement(&lhs, &rhs);
-        // Convert the chosen sort requirements back to sort expressions:
-        finer.map(LexOrdering::from)
-    }
-
-    /// Returns the finer ordering among the requirements `lhs` and `rhs`,
-    /// breaking any ties by choosing `lhs`.
+    /// Modify existing orderings by substituting sort expressions with appropriate
+    /// targets from the projection mapping. We substitute a sort expression when
+    /// its physical expression has a one-to-one functional relationship with a
+    /// target expression in the mapping.
     ///
-    /// The finer requirements are the ones that satisfy both of the given
-    /// requirements. If the requirements are incomparable, returns `None`.
+    /// After substitution, we may generate more than one `LexOrdering` for each
+    /// existing equivalent ordering. For example, `[a ASC, b ASC]` will turn
+    /// into `[CAST(a) ASC, b ASC]` and `[a ASC, b ASC]` when applying projection
+    /// expressions `a, b, CAST(a)`.
     ///
-    /// For example, the finer requirements among `[a ASC]` and `[a ASC, b ASC]`
-    /// is the latter.
-    pub fn get_finer_requirement(
-        &self,
-        req1: &LexRequirement,
-        req2: &LexRequirement,
-    ) -> Option<LexRequirement> {
-        let mut lhs = self.normalize_sort_requirements(req1);
-        let mut rhs = self.normalize_sort_requirements(req2);
-        lhs.inner
-            .iter_mut()
-            .zip(rhs.inner.iter_mut())
-            .all(|(lhs, rhs)| {
-                lhs.expr.eq(&rhs.expr)
-                    && match (lhs.options, rhs.options) {
-                        (Some(lhs_opt), Some(rhs_opt)) => lhs_opt == rhs_opt,
-                        (Some(options), None) => {
-                            rhs.options = Some(options);
-                            true
-                        }
-                        (None, Some(options)) => {
-                            lhs.options = Some(options);
-                            true
-                        }
-                        (None, None) => true,
-                    }
-            })
-            .then_some(if lhs.len() >= rhs.len() { lhs } else { rhs })
-    }
-
-    /// we substitute the ordering according to input expression type, this is a simplified version
-    /// In this case, we just substitute when the expression satisfy the following condition:
-    /// I. just have one column and is a CAST expression
-    /// TODO: Add one-to-ones analysis for monotonic ScalarFunctions.
-    /// TODO: we could precompute all the scenario that is computable, for example: atan(x + 1000) should also be substituted if
-    ///  x is DESC or ASC
-    /// After substitution, we may generate more than 1 `LexOrdering`. As an example,
-    /// `[a ASC, b ASC]` will turn into `[a ASC, b ASC], [CAST(a) ASC, b ASC]` when projection expressions `a, b, CAST(a)` is applied.
-    pub fn substitute_ordering_component(
-        &self,
+    /// TODO: Handle all scenarios that allow substitution; e.g. when `x` is
+    ///       sorted, `atan(x + 1000)` should also be substituted. For now, we
+    ///       only consider single-column `CAST` expressions.
+    fn substitute_oeq_class(
+        schema: &SchemaRef,
         mapping: &ProjectionMapping,
-        sort_expr: &LexOrdering,
-    ) -> Result<Vec<LexOrdering>> {
-        let new_orderings = sort_expr
-            .iter()
-            .map(|sort_expr| {
-                let referring_exprs: Vec<_> = mapping
-                    .iter()
-                    .map(|(source, _target)| source)
-                    .filter(|source| expr_refers(source, &sort_expr.expr))
-                    .cloned()
-                    .collect();
-                let mut res = LexOrdering::new(vec![sort_expr.clone()]);
-                // TODO: Add one-to-ones analysis for ScalarFunctions.
-                for r_expr in referring_exprs {
-                    // we check whether this expression is substitutable or not
-                    if let Some(cast_expr) = r_expr.as_any().downcast_ref::<CastExpr>() {
-                        // we need to know whether the Cast Expr matches or not
-                        let expr_type = sort_expr.expr.data_type(&self.schema)?;
-                        if cast_expr.expr.eq(&sort_expr.expr)
-                            && cast_expr.is_bigger_cast(expr_type)
-                        {
-                            res.push(PhysicalSortExpr {
-                                expr: Arc::clone(&r_expr),
-                                options: sort_expr.options,
-                            });
+        oeq_class: OrderingEquivalenceClass,
+    ) -> OrderingEquivalenceClass {
+        let new_orderings = oeq_class.into_iter().flat_map(|order| {
+            // Modify/expand existing orderings by substituting sort
+            // expressions with appropriate targets from the mapping:
+            order
+                .into_iter()
+                .map(|sort_expr| {
+                    let referring_exprs = mapping
+                        .iter()
+                        .map(|(source, _target)| source)
+                        .filter(|source| expr_refers(source, &sort_expr.expr))
+                        .cloned();
+                    let mut result = vec![];
+                    // The sort expression comes from this schema, so the
+                    // following call to `unwrap` is safe.
+                    let expr_type = sort_expr.expr.data_type(schema).unwrap();
+                    // TODO: Add one-to-one analysis for ScalarFunctions.
+                    for r_expr in referring_exprs {
+                        if let Some(substituted) = Self::substitute_cast_like_ordering(
+                            r_expr, &sort_expr, &expr_type,
+                        ) {
+                            result.push(substituted);
                         }
                     }
-                }
-                Ok(res)
-            })
-            .collect::<Result<Vec<_>>>()?;
-        // Generate all valid orderings, given substituted expressions.
-        let res = new_orderings
-            .into_iter()
-            .multi_cartesian_product()
-            .map(LexOrdering::new)
-            .collect::<Vec<_>>();
-        Ok(res)
+                    result.push(sort_expr);
+                    result
+                })
+                // Generate all valid orderings given substituted expressions:
+                .multi_cartesian_product()
+        });
+        OrderingEquivalenceClass::new(new_orderings)
     }
 
-    /// In projection, supposed we have a input function 'A DESC B DESC' and the output shares the same expression
-    /// with A and B, we could surely use the ordering of the original ordering, However, if the A has been changed,
-    /// for example, A-> Cast(A, Int64) or any other form, it is invalid if we continue using the original ordering
-    /// Since it would cause bug in dependency constructions, we should substitute the input order in order to get correct
-    /// dependency map, happen in issue 8838: <https://github.com/apache/datafusion/issues/8838>
-    pub fn substitute_oeq_class(&mut self, mapping: &ProjectionMapping) -> Result<()> {
-        let new_order = self
-            .oeq_class
-            .iter()
-            .map(|order| self.substitute_ordering_component(mapping, order))
-            .collect::<Result<Vec<_>>>()?;
-        let new_order = new_order.into_iter().flatten().collect();
-        self.oeq_class = OrderingEquivalenceClass::new(new_order);
-        Ok(())
-    }
-    /// Projects argument `expr` according to `projection_mapping`, taking
-    /// equivalences into account.
+    /// Projects argument `expr` according to the projection described by
+    /// `mapping`, taking equivalences into account.
     ///
     /// For example, assume that columns `a` and `c` are always equal, and that
-    /// `projection_mapping` encodes following mapping:
+    /// the projection described by `mapping` encodes the following:
     ///
     /// ```text
     /// a -> a1
@@ -868,13 +904,25 @@ impl EquivalenceProperties {
     /// ```
     ///
     /// Then, this function projects `a + b` to `Some(a1 + b1)`, `c + b` to
-    /// `Some(a1 + b1)` and `d` to `None`, meaning that it  cannot be projected.
+    /// `Some(a1 + b1)` and `d` to `None`, meaning that it is not projectable.
     pub fn project_expr(
         &self,
         expr: &Arc<dyn PhysicalExpr>,
-        projection_mapping: &ProjectionMapping,
+        mapping: &ProjectionMapping,
     ) -> Option<Arc<dyn PhysicalExpr>> {
-        self.eq_group.project_expr(projection_mapping, expr)
+        self.eq_group.project_expr(mapping, expr)
+    }
+
+    /// Projects the given `expressions` according to the projection described
+    /// by `mapping`, taking equivalences into account. This function is similar
+    /// to [`Self::project_expr`], but projects multiple expressions at once
+    /// more efficiently than calling `project_expr` for each expression.
+    pub fn project_expressions<'a>(
+        &'a self,
+        expressions: impl IntoIterator<Item = &'a Arc<dyn PhysicalExpr>> + 'a,
+        mapping: &'a ProjectionMapping,
+    ) -> impl Iterator<Item = Option<Arc<dyn PhysicalExpr>>> + 'a {
+        self.eq_group.project_expressions(mapping, expressions)
     }
 
     /// Constructs a dependency map based on existing orderings referred to in
@@ -906,71 +954,85 @@ impl EquivalenceProperties {
     /// b ASC: Node {Some(b_new ASC), HashSet{a ASC}}
     /// c ASC: Node {None, HashSet{a ASC}}
     /// ```
-    fn construct_dependency_map(&self, mapping: &ProjectionMapping) -> DependencyMap {
-        let mut dependency_map = DependencyMap::new();
-        for ordering in self.normalized_oeq_class().iter() {
-            for (idx, sort_expr) in ordering.iter().enumerate() {
-                let target_sort_expr =
-                    self.project_expr(&sort_expr.expr, mapping).map(|expr| {
-                        PhysicalSortExpr {
-                            expr,
-                            options: sort_expr.options,
-                        }
-                    });
-                let is_projected = target_sort_expr.is_some();
-                if is_projected
-                    || mapping
-                        .iter()
-                        .any(|(source, _)| expr_refers(source, &sort_expr.expr))
-                {
-                    // Previous ordering is a dependency. Note that there is no,
-                    // dependency for a leading ordering (i.e. the first sort
-                    // expression).
-                    let dependency = idx.checked_sub(1).map(|a| &ordering[a]);
-                    // Add sort expressions that can be projected or referred to
-                    // by any of the projection expressions to the dependency map:
-                    dependency_map.insert(
-                        sort_expr,
-                        target_sort_expr.as_ref(),
-                        dependency,
-                    );
-                }
-                if !is_projected {
-                    // If we can not project, stop constructing the dependency
-                    // map as remaining dependencies will be invalid after projection.
+    fn construct_dependency_map(
+        &self,
+        oeq_class: OrderingEquivalenceClass,
+        mapping: &ProjectionMapping,
+    ) -> DependencyMap {
+        let mut map = DependencyMap::default();
+        for ordering in oeq_class.into_iter() {
+            // Previous expression is a dependency. Note that there is no
+            // dependency for the leading expression.
+            if !self.insert_to_dependency_map(
+                mapping,
+                ordering[0].clone(),
+                None,
+                &mut map,
+            ) {
+                continue;
+            }
+            for (dependency, sort_expr) in ordering.into_iter().tuple_windows() {
+                if !self.insert_to_dependency_map(
+                    mapping,
+                    sort_expr,
+                    Some(dependency),
+                    &mut map,
+                ) {
+                    // If we can't project, stop constructing the dependency map
+                    // as remaining dependencies will be invalid post projection.
                     break;
                 }
             }
         }
-        dependency_map
+        map
     }
 
-    /// Returns a new `ProjectionMapping` where source expressions are normalized.
-    ///
-    /// This normalization ensures that source expressions are transformed into a
-    /// consistent representation. This is beneficial for algorithms that rely on
-    /// exact equalities, as it allows for more precise and reliable comparisons.
+    /// Projects the sort expression according to the projection mapping and
+    /// inserts it into the dependency map with the given dependency. Returns
+    /// a boolean flag indicating whether the given expression is projectable.
+    fn insert_to_dependency_map(
+        &self,
+        mapping: &ProjectionMapping,
+        sort_expr: PhysicalSortExpr,
+        dependency: Option<PhysicalSortExpr>,
+        map: &mut DependencyMap,
+    ) -> bool {
+        let target_sort_expr = self
+            .project_expr(&sort_expr.expr, mapping)
+            .map(|expr| PhysicalSortExpr::new(expr, sort_expr.options));
+        let projectable = target_sort_expr.is_some();
+        if projectable
+            || mapping
+                .iter()
+                .any(|(source, _)| expr_refers(source, &sort_expr.expr))
+        {
+            // Add sort expressions that can be projected or referred to
+            // by any of the projection expressions to the dependency map:
+            map.insert(sort_expr, target_sort_expr, dependency);
+        }
+        projectable
+    }
+
+    /// Returns a new `ProjectionMapping` where source expressions are in normal
+    /// form. Normalization ensures that source expressions are transformed into
+    /// a consistent representation, which is beneficial for algorithms that rely
+    /// on exact equalities, as it allows for more precise and reliable comparisons.
     ///
     /// # Parameters
     ///
-    /// - `mapping`: A reference to the original `ProjectionMapping` to be normalized.
+    /// - `mapping`: A reference to the original `ProjectionMapping` to normalize.
     ///
     /// # Returns
     ///
-    /// A new `ProjectionMapping` with normalized source expressions.
-    fn normalized_mapping(&self, mapping: &ProjectionMapping) -> ProjectionMapping {
-        // Construct the mapping where source expressions are normalized. In this way
-        // In the algorithms below we can work on exact equalities
-        ProjectionMapping {
-            map: mapping
-                .iter()
-                .map(|(source, target)| {
-                    let normalized_source =
-                        self.eq_group.normalize_expr(Arc::clone(source));
-                    (normalized_source, Arc::clone(target))
-                })
-                .collect(),
-        }
+    /// A new `ProjectionMapping` with source expressions in normal form.
+    fn normalize_mapping(&self, mapping: &ProjectionMapping) -> ProjectionMapping {
+        mapping
+            .iter()
+            .map(|(source, target)| {
+                let normal_source = self.eq_group.normalize_expr(Arc::clone(source));
+                (normal_source, target.clone())
+            })
+            .collect()
     }
 
     /// Computes projected orderings based on a given projection mapping.
@@ -984,42 +1046,55 @@ impl EquivalenceProperties {
     ///
     /// - `mapping`: A reference to the `ProjectionMapping` that defines the
     ///   relationship between source and target expressions.
+    /// - `oeq_class`: The `OrderingEquivalenceClass` containing the orderings
+    ///   to project.
     ///
     /// # Returns
     ///
-    /// A vector of `LexOrdering` containing all valid orderings after projection.
-    fn projected_orderings(&self, mapping: &ProjectionMapping) -> Vec<LexOrdering> {
-        let mapping = self.normalized_mapping(mapping);
-
+    /// A vector of all valid (but not in normal form) orderings after projection.
+    fn projected_orderings(
+        &self,
+        mapping: &ProjectionMapping,
+        mut oeq_class: OrderingEquivalenceClass,
+    ) -> Vec<LexOrdering> {
+        // Normalize source expressions in the mapping:
+        let mapping = self.normalize_mapping(mapping);
         // Get dependency map for existing orderings:
-        let dependency_map = self.construct_dependency_map(&mapping);
-        let orderings = mapping.iter().flat_map(|(source, target)| {
+        oeq_class = Self::substitute_oeq_class(&self.schema, &mapping, oeq_class);
+        let dependency_map = self.construct_dependency_map(oeq_class, &mapping);
+        let orderings = mapping.iter().flat_map(|(source, targets)| {
             referred_dependencies(&dependency_map, source)
                 .into_iter()
-                .filter_map(|relevant_deps| {
-                    if let Ok(SortProperties::Ordered(options)) =
-                        get_expr_properties(source, &relevant_deps, &self.schema)
-                            .map(|prop| prop.sort_properties)
-                    {
-                        Some((options, relevant_deps))
+                .filter_map(|deps| {
+                    let ep = get_expr_properties(source, &deps, &self.schema);
+                    let sort_properties = ep.map(|prop| prop.sort_properties);
+                    if let Ok(SortProperties::Ordered(options)) = sort_properties {
+                        Some((options, deps))
                     } else {
-                        // Do not consider unordered cases
+                        // Do not consider unordered cases.
                         None
                     }
                 })
                 .flat_map(|(options, relevant_deps)| {
-                    let sort_expr = PhysicalSortExpr {
-                        expr: Arc::clone(target),
-                        options,
-                    };
-                    // Generate dependent orderings (i.e. prefixes for `sort_expr`):
-                    let mut dependency_orderings =
+                    // Generate dependent orderings (i.e. prefixes for targets):
+                    let dependency_orderings =
                         generate_dependency_orderings(&relevant_deps, &dependency_map);
-                    // Append `sort_expr` to the dependent orderings:
-                    for ordering in dependency_orderings.iter_mut() {
-                        ordering.push(sort_expr.clone());
+                    let sort_exprs = targets.iter().map(|(target, _)| {
+                        PhysicalSortExpr::new(Arc::clone(target), options)
+                    });
+                    if dependency_orderings.is_empty() {
+                        sort_exprs.map(|sort_expr| [sort_expr].into()).collect()
+                    } else {
+                        sort_exprs
+                            .flat_map(|sort_expr| {
+                                let mut result = dependency_orderings.clone();
+                                for ordering in result.iter_mut() {
+                                    ordering.push(sort_expr.clone());
+                                }
+                                result
+                            })
+                            .collect::<Vec<_>>()
                     }
-                    dependency_orderings
                 })
         });
 
@@ -1033,116 +1108,67 @@ impl EquivalenceProperties {
             if prefixes.is_empty() {
                 // If prefix is empty, there is no dependency. Insert
                 // empty ordering:
-                prefixes = vec![LexOrdering::default()];
-            }
-            // Append current ordering on top its dependencies:
-            for ordering in prefixes.iter_mut() {
-                if let Some(target) = &node.target_sort_expr {
-                    ordering.push(target.clone())
+                if let Some(target) = &node.target {
+                    prefixes.push([target.clone()].into());
+                }
+            } else {
+                // Append current ordering on top its dependencies:
+                for ordering in prefixes.iter_mut() {
+                    if let Some(target) = &node.target {
+                        ordering.push(target.clone());
+                    }
                 }
             }
             prefixes
         });
 
         // Simplify each ordering by removing redundant sections:
-        orderings
-            .chain(projected_orderings)
-            .map(|lex_ordering| lex_ordering.collapse())
-            .collect()
-    }
-
-    /// Projects constants based on the provided `ProjectionMapping`.
-    ///
-    /// This function takes a `ProjectionMapping` and identifies/projects
-    /// constants based on the existing constants and the mapping. It ensures
-    /// that constants are appropriately propagated through the projection.
-    ///
-    /// # Parameters
-    ///
-    /// - `mapping`: A reference to a `ProjectionMapping` representing the
-    ///   mapping of source expressions to target expressions in the projection.
-    ///
-    /// # Returns
-    ///
-    /// Returns a `Vec<Arc<dyn PhysicalExpr>>` containing the projected constants.
-    fn projected_constants(&self, mapping: &ProjectionMapping) -> Vec<ConstExpr> {
-        // First, project existing constants. For example, assume that `a + b`
-        // is known to be constant. If the projection were `a as a_new`, `b as b_new`,
-        // then we would project constant `a + b` as `a_new + b_new`.
-        let mut projected_constants = self
-            .constants
-            .iter()
-            .flat_map(|const_expr| {
-                const_expr
-                    .map(|expr| self.eq_group.project_expr(mapping, expr))
-                    .map(|projected_expr| {
-                        projected_expr
-                            .with_across_partitions(const_expr.across_partitions())
-                    })
-            })
-            .collect::<Vec<_>>();
-
-        // Add projection expressions that are known to be constant:
-        for (source, target) in mapping.iter() {
-            if self.is_expr_constant(source)
-                && !const_exprs_contains(&projected_constants, target)
-            {
-                if self.is_expr_constant_across_partitions(source) {
-                    projected_constants.push(
-                        ConstExpr::from(target)
-                            .with_across_partitions(self.get_expr_constant_value(source)),
-                    )
-                } else {
-                    projected_constants.push(
-                        ConstExpr::from(target)
-                            .with_across_partitions(AcrossPartitions::Heterogeneous),
-                    )
-                }
-            }
-        }
-        projected_constants
+        orderings.chain(projected_orderings).collect()
     }
 
     /// Projects constraints according to the given projection mapping.
     ///
-    /// This function takes a projection mapping and extracts the column indices of the target columns.
-    /// It then projects the constraints to only include relationships between
-    /// columns that exist in the projected output.
+    /// This function takes a projection mapping and extracts column indices of
+    /// target columns. It then projects the constraints to only include
+    /// relationships between columns that exist in the projected output.
     ///
-    /// # Arguments
+    /// # Parameters
     ///
-    /// * `mapping` - A reference to `ProjectionMapping` that defines how expressions are mapped
-    ///   in the projection operation
+    /// * `mapping` - A reference to the `ProjectionMapping` that defines the
+    ///   projection operation.
     ///
     /// # Returns
     ///
-    /// Returns a new `Constraints` object containing only the constraints
-    /// that are valid for the projected columns.
+    /// Returns an optional `Constraints` object containing only the constraints
+    /// that are valid for the projected columns (if any exists).
     fn projected_constraints(&self, mapping: &ProjectionMapping) -> Option<Constraints> {
         let indices = mapping
             .iter()
-            .filter_map(|(_, target)| target.as_any().downcast_ref::<Column>())
-            .map(|col| col.index())
+            .flat_map(|(_, targets)| {
+                targets.iter().flat_map(|(target, _)| {
+                    target.as_any().downcast_ref::<Column>().map(|c| c.index())
+                })
+            })
             .collect::<Vec<_>>();
-        debug_assert_eq!(mapping.map.len(), indices.len());
         self.constraints.project(&indices)
     }
 
-    /// Projects the equivalences within according to `mapping`
-    /// and `output_schema`.
+    /// Projects the equivalences within according to `mapping` and
+    /// `output_schema`.
     pub fn project(&self, mapping: &ProjectionMapping, output_schema: SchemaRef) -> Self {
         let eq_group = self.eq_group.project(mapping);
-        let oeq_class = OrderingEquivalenceClass::new(self.projected_orderings(mapping));
-        let constants = self.projected_constants(mapping);
-        let constraints = self
-            .projected_constraints(mapping)
-            .unwrap_or_else(Constraints::empty);
+        let orderings =
+            self.projected_orderings(mapping, self.oeq_cache.normal_cls.clone());
+        let normal_orderings = orderings
+            .iter()
+            .cloned()
+            .map(|o| eq_group.normalize_sort_exprs(o));
         Self {
+            oeq_cache: OrderingEquivalenceCache::new(normal_orderings),
+            oeq_class: OrderingEquivalenceClass::new(orderings),
+            constraints: self.projected_constraints(mapping).unwrap_or_default(),
             schema: output_schema,
             eq_group,
-            oeq_class,
-            constants,
-            constraints,
         }
     }
 
@@ -1159,7 +1185,7 @@ impl EquivalenceProperties {
     pub fn find_longest_permutation(
         &self,
         exprs: &[Arc<dyn PhysicalExpr>],
-    ) -> (LexOrdering, Vec<usize>) {
+    ) -> Result<(Vec<PhysicalSortExpr>, Vec<usize>)> {
         let mut eq_properties = self.clone();
         let mut result = vec![];
         // The algorithm is as follows:
@@ -1172,32 +1198,23 @@ impl EquivalenceProperties {
         // This algorithm should reach a fixed point in at most `exprs.len()`
         // iterations.
         let mut search_indices = (0..exprs.len()).collect::<IndexSet<_>>();
-        for _idx in 0..exprs.len() {
+        for _ in 0..exprs.len() {
             // Get ordered expressions with their indices.
             let ordered_exprs = search_indices
                 .iter()
-                .flat_map(|&idx| {
+                .filter_map(|&idx| {
                     let ExprProperties {
                         sort_properties, ..
                     } = eq_properties.get_expr_properties(Arc::clone(&exprs[idx]));
                     match sort_properties {
-                        SortProperties::Ordered(options) => Some((
-                            PhysicalSortExpr {
-                                expr: Arc::clone(&exprs[idx]),
-                                options,
-                            },
-                            idx,
-                        )),
+                        SortProperties::Ordered(options) => {
+                            let expr = Arc::clone(&exprs[idx]);
+                            Some((PhysicalSortExpr::new(expr, options), idx))
+                        }
                         SortProperties::Singleton => {
-                            // Assign default ordering to constant expressions
-                            let options = SortOptions::default();
-                            Some((
-                                PhysicalSortExpr {
-                                    expr: Arc::clone(&exprs[idx]),
-                                    options,
-                                },
-                                idx,
-                            ))
+                            // Assign default ordering to constant expressions:
+                            let expr = Arc::clone(&exprs[idx]);
+                            Some((PhysicalSortExpr::new_default(expr), idx))
                         }
                         SortProperties::Unordered => None,
                     }
@@ -1215,67 +1232,20 @@ impl EquivalenceProperties {
             // Note that these expressions are not properly "constants". This is just
             // an implementation strategy confined to this function.
             for (PhysicalSortExpr { expr, .. }, idx) in &ordered_exprs {
-                eq_properties =
-                    eq_properties.with_constants(std::iter::once(ConstExpr::from(expr)));
+                let const_expr = ConstExpr::from(Arc::clone(expr));
+                eq_properties.add_constants(std::iter::once(const_expr))?;
                 search_indices.shift_remove(idx);
             }
             // Add new ordered section to the state.
             result.extend(ordered_exprs);
         }
-        let (left, right) = result.into_iter().unzip();
-        (LexOrdering::new(left), right)
-    }
-
-    /// This function determines whether the provided expression is constant
-    /// based on the known constants.
-    ///
-    /// # Parameters
-    ///
-    /// - `expr`: A reference to a `Arc<dyn PhysicalExpr>` representing the
-    ///   expression to be checked.
-    ///
-    /// # Returns
-    ///
-    /// Returns `true` if the expression is constant according to equivalence
-    /// group, `false` otherwise.
-    pub fn is_expr_constant(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
-        // As an example, assume that we know columns `a` and `b` are constant.
-        // Then, `a`, `b` and `a + b` will all return `true` whereas `c` will
-        // return `false`.
-        let const_exprs = self
-            .constants
-            .iter()
-            .map(|const_expr| Arc::clone(const_expr.expr()));
-        let normalized_constants = self.eq_group.normalize_exprs(const_exprs);
-        let normalized_expr = self.eq_group.normalize_expr(Arc::clone(expr));
-        is_constant_recurse(&normalized_constants, &normalized_expr)
-    }
-
-    /// This function determines whether the provided expression is constant
-    /// across partitions based on the known constants.
-    ///
-    /// # Parameters
-    ///
-    /// - `expr`: A reference to a `Arc<dyn PhysicalExpr>` representing the
-    ///   expression to be checked.
-    ///
-    /// # Returns
-    ///
-    /// Returns `true` if the expression is constant across all partitions according
-    /// to equivalence group, `false` otherwise
-    #[deprecated(
-        since = "45.0.0",
-        note = "Use [`is_expr_constant_across_partitions`] instead"
-    )]
-    pub fn is_expr_constant_accross_partitions(
-        &self,
-        expr: &Arc<dyn PhysicalExpr>,
-    ) -> bool {
-        self.is_expr_constant_across_partitions(expr)
+        Ok(result.into_iter().unzip())
     }
 
     /// This function determines whether the provided expression is constant
-    /// across partitions based on the known constants.
+    /// based on the known constants. For example, if columns `a` and `b` are
+    /// constant, then expressions `a`, `b` and `a + b` will all return `true`
+    /// whereas expression `c` will return `false`.
     ///
     /// # Parameters
     ///
@@ -1284,64 +1254,15 @@ impl EquivalenceProperties {
     ///
     /// # Returns
     ///
-    /// Returns `true` if the expression is constant across all partitions according
-    /// to equivalence group, `false` otherwise.
-    pub fn is_expr_constant_across_partitions(
-        &self,
-        expr: &Arc<dyn PhysicalExpr>,
-    ) -> bool {
-        // As an example, assume that we know columns `a` and `b` are constant.
-        // Then, `a`, `b` and `a + b` will all return `true` whereas `c` will
-        // return `false`.
-        let const_exprs = self
-            .constants
-            .iter()
-            .filter_map(|const_expr| {
-                if matches!(
-                    const_expr.across_partitions(),
-                    AcrossPartitions::Uniform { .. }
-                ) {
-                    Some(Arc::clone(const_expr.expr()))
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-        let normalized_constants = self.eq_group.normalize_exprs(const_exprs);
-        let normalized_expr = self.eq_group.normalize_expr(Arc::clone(expr));
-        is_constant_recurse(&normalized_constants, &normalized_expr)
-    }
-
-    /// Retrieves the constant value of a given physical expression, if it exists.
-    ///
-    /// Normalizes the input expression and checks if it matches any known constants
-    /// in the current context. Returns whether the expression has a uniform value,
-    /// varies across partitions, or is not constant.
-    ///
-    /// # Parameters
-    /// - `expr`: A reference to the physical expression to evaluate.
-    ///
-    /// # Returns
-    /// - `AcrossPartitions::Uniform(value)`: If the expression has the same value across partitions.
-    /// - `AcrossPartitions::Heterogeneous`: If the expression varies across partitions.
-    /// - `None`: If the expression is not recognized as constant.
-    pub fn get_expr_constant_value(
+    /// Returns a `Some` value if the expression is constant according to
+    /// equivalence group, and `None` otherwise. The `Some` variant contains
+    /// an `AcrossPartitions` value indicating whether the expression is
+    /// constant across partitions, and its actual value (if available).
+    pub fn is_expr_constant(
         &self,
         expr: &Arc<dyn PhysicalExpr>,
-    ) -> AcrossPartitions {
-        let normalized_expr = self.eq_group.normalize_expr(Arc::clone(expr));
-
-        if let Some(lit) = normalized_expr.as_any().downcast_ref::<Literal>() {
-            return AcrossPartitions::Uniform(Some(lit.value().clone()));
-        }
-
-        for const_expr in self.constants.iter() {
-            if normalized_expr.eq(const_expr.expr()) {
-                return const_expr.across_partitions();
-            }
-        }
-
-        AcrossPartitions::Heterogeneous
+    ) -> Option<AcrossPartitions> {
+        self.eq_group.is_expr_constant(expr)
     }
 
     /// Retrieves the properties for a given physical expression.
@@ -1367,10 +1288,9 @@ impl EquivalenceProperties {
             .unwrap_or_else(|_| ExprProperties::new_unknown())
     }
 
-    /// Transforms this `EquivalenceProperties` into a new `EquivalenceProperties`
-    /// by mapping columns in the original schema to columns in the new schema
-    /// by index.
-    pub fn with_new_schema(self, schema: SchemaRef) -> Result<Self> {
+    /// Transforms this `EquivalenceProperties` by mapping columns in the
+    /// original schema to columns in the new schema by index.
+    pub fn with_new_schema(mut self, schema: SchemaRef) -> Result<Self> {
         // The new schema and the original schema is aligned when they have the
         // same number of columns, and fields at the same index have the same
         // type in both schemas.
@@ -1385,54 +1305,49 @@ impl EquivalenceProperties {
             // Rewriting equivalence properties in terms of new schema is not
             // safe when schemas are not aligned:
             return plan_err!(
-                "Cannot rewrite old_schema:{:?} with new schema: {:?}",
+                "Schemas have to be aligned to rewrite equivalences:\n Old schema: {}\n New schema: {}",
                 self.schema,
                 schema
             );
         }
-        // Rewrite constants according to new schema:
-        let new_constants = self
-            .constants
-            .into_iter()
-            .map(|const_expr| {
-                let across_partitions = const_expr.across_partitions();
-                let new_const_expr = with_new_schema(const_expr.owned_expr(), &schema)?;
-                Ok(ConstExpr::new(new_const_expr)
-                    .with_across_partitions(across_partitions))
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        // Rewrite orderings according to new schema:
-        let mut new_orderings = vec![];
-        for ordering in self.oeq_class {
-            let new_ordering = ordering
-                .into_iter()
-                .map(|mut sort_expr| {
-                    sort_expr.expr = with_new_schema(sort_expr.expr, &schema)?;
-                    Ok(sort_expr)
-                })
-                .collect::<Result<_>>()?;
-            new_orderings.push(new_ordering);
-        }
 
         // Rewrite equivalence classes according to the new schema:
         let mut eq_classes = vec![];
-        for eq_class in self.eq_group {
-            let new_eq_exprs = eq_class
-                .into_vec()
+        for mut eq_class in self.eq_group {
+            // Rewrite the expressions in the equivalence class:
+            eq_class.exprs = eq_class
+                .exprs
                 .into_iter()
                 .map(|expr| with_new_schema(expr, &schema))
                 .collect::<Result<_>>()?;
-            eq_classes.push(EquivalenceClass::new(new_eq_exprs));
+            // Rewrite the constant value (if available and known):
+            let data_type = eq_class
+                .canonical_expr()
+                .map(|e| e.data_type(&schema))
+                .transpose()?;
+            if let (Some(data_type), Some(AcrossPartitions::Uniform(Some(value)))) =
+                (data_type, &mut eq_class.constant)
+            {
+                *value = value.cast_to(&data_type)?;
+            }
+            eq_classes.push(eq_class);
         }
+        self.eq_group = eq_classes.into();
+
+        // Rewrite orderings according to new schema:
+        self.oeq_class = self.oeq_class.with_new_schema(&schema)?;
+        self.oeq_cache.normal_cls = self.oeq_cache.normal_cls.with_new_schema(&schema)?;
+
+        // Update the schema:
+        self.schema = schema;
 
-        // Construct the resulting equivalence properties:
-        let mut result = EquivalenceProperties::new(schema);
-        result.constants = new_constants;
-        result.add_new_orderings(new_orderings);
-        result.add_equivalence_group(EquivalenceGroup::new(eq_classes));
+        Ok(self)
+    }
+}
 
-        Ok(result)
+impl From<EquivalenceProperties> for OrderingEquivalenceClass {
+    fn from(eq_properties: EquivalenceProperties) -> Self {
+        eq_properties.oeq_class
     }
 }
 
@@ -1440,24 +1355,21 @@ impl EquivalenceProperties {
 ///
 /// Format:
 /// ```text
-/// order: [[a ASC, b ASC], [a ASC, c ASC]], eq: [[a = b], [a = c]], const: [a = 1]
+/// order: [[b@1 ASC NULLS LAST]], eq: [{members: [a@0], constant: (heterogeneous)}]
 /// ```
 impl Display for EquivalenceProperties {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if self.eq_group.is_empty()
-            && self.oeq_class.is_empty()
-            && self.constants.is_empty()
-        {
-            return write!(f, "No properties");
-        }
-        if !self.oeq_class.is_empty() {
+        let empty_eq_group = self.eq_group.is_empty();
+        let empty_oeq_class = self.oeq_class.is_empty();
+        if empty_oeq_class && empty_eq_group {
+            write!(f, "No properties")?;
+        } else if !empty_oeq_class {
             write!(f, "order: {}", self.oeq_class)?;
-        }
-        if !self.eq_group.is_empty() {
-            write!(f, ", eq: {}", self.eq_group)?;
-        }
-        if !self.constants.is_empty() {
-            write!(f, ", const: [{}]", ConstExpr::format_list(&self.constants))?;
+            if !empty_eq_group {
+                write!(f, ", eq: {}", self.eq_group)?;
+            }
+        } else {
+            write!(f, "eq: {}", self.eq_group)?;
         }
         Ok(())
     }
@@ -1501,45 +1413,20 @@ fn update_properties(
             Interval::make_unbounded(&node.expr.data_type(eq_properties.schema())?)?
     }
     // Now, check what we know about orderings:
-    let normalized_expr = eq_properties
+    let normal_expr = eq_properties
         .eq_group
         .normalize_expr(Arc::clone(&node.expr));
-    let oeq_class = eq_properties.normalized_oeq_class();
-    if eq_properties.is_expr_constant(&normalized_expr)
-        || oeq_class.is_expr_partial_const(&normalized_expr)
+    let oeq_class = &eq_properties.oeq_cache.normal_cls;
+    if eq_properties.is_expr_constant(&normal_expr).is_some()
+        || oeq_class.is_expr_partial_const(&normal_expr)
     {
         node.data.sort_properties = SortProperties::Singleton;
-    } else if let Some(options) = oeq_class.get_options(&normalized_expr) {
+    } else if let Some(options) = oeq_class.get_options(&normal_expr) {
         node.data.sort_properties = SortProperties::Ordered(options);
     }
     Ok(Transformed::yes(node))
 }
 
-/// This function determines whether the provided expression is constant
-/// based on the known constants.
-///
-/// # Parameters
-///
-/// - `constants`: A `&[Arc<dyn PhysicalExpr>]` containing expressions known to
-///   be a constant.
-/// - `expr`: A reference to a `Arc<dyn PhysicalExpr>` representing the expression
-///   to check.
-///
-/// # Returns
-///
-/// Returns `true` if the expression is constant according to equivalence
-/// group, `false` otherwise.
-fn is_constant_recurse(
-    constants: &[Arc<dyn PhysicalExpr>],
-    expr: &Arc<dyn PhysicalExpr>,
-) -> bool {
-    if physical_exprs_contains(constants, expr) || expr.as_any().is::<Literal>() {
-        return true;
-    }
-    let children = expr.children();
-    !children.is_empty() && children.iter().all(|c| is_constant_recurse(constants, c))
-}
-
 /// This function examines whether a referring expression directly refers to a
 /// given referred expression or if any of its children in the expression tree
 /// refer to the specified expression.
@@ -1614,59 +1501,3 @@ fn get_expr_properties(
         expr.get_properties(&child_states)
     }
 }
-
-/// Wrapper struct for `Arc<dyn PhysicalExpr>` to use them as keys in a hash map.
-#[derive(Debug, Clone)]
-struct ExprWrapper(Arc<dyn PhysicalExpr>);
-
-impl PartialEq<Self> for ExprWrapper {
-    fn eq(&self, other: &Self) -> bool {
-        self.0.eq(&other.0)
-    }
-}
-
-impl Eq for ExprWrapper {}
-
-impl Hash for ExprWrapper {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        self.0.hash(state);
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use super::*;
-    use crate::expressions::{col, BinaryExpr};
-
-    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
-    use datafusion_expr::Operator;
-
-    #[test]
-    fn test_expr_consists_of_constants() -> Result<()> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Int32, true),
-            Field::new("c", DataType::Int32, true),
-            Field::new("d", DataType::Int32, true),
-            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
-        ]));
-        let col_a = col("a", &schema)?;
-        let col_b = col("b", &schema)?;
-        let col_d = col("d", &schema)?;
-        let b_plus_d = Arc::new(BinaryExpr::new(
-            Arc::clone(&col_b),
-            Operator::Plus,
-            Arc::clone(&col_d),
-        )) as Arc<dyn PhysicalExpr>;
-
-        let constants = vec![Arc::clone(&col_a), Arc::clone(&col_b)];
-        let expr = Arc::clone(&b_plus_d);
-        assert!(!is_constant_recurse(&constants, &expr));
-
-        let constants = vec![Arc::clone(&col_a), Arc::clone(&col_b), Arc::clone(&col_d)];
-        let expr = Arc::clone(&b_plus_d);
-        assert!(is_constant_recurse(&constants, &expr));
-        Ok(())
-    }
-}
diff --git a/datafusion/physical-expr/src/equivalence/properties/union.rs b/datafusion/physical-expr/src/equivalence/properties/union.rs
index 64ef9278e248b..d77129472a8ba 100644
--- a/datafusion/physical-expr/src/equivalence/properties/union.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/union.rs
@@ -15,28 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion_common::{internal_err, Result};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use std::iter::Peekable;
 use std::sync::Arc;
 
+use super::EquivalenceProperties;
 use crate::equivalence::class::AcrossPartitions;
-use crate::ConstExpr;
+use crate::{ConstExpr, PhysicalSortExpr};
 
-use super::EquivalenceProperties;
-use crate::PhysicalSortExpr;
 use arrow::datatypes::SchemaRef;
-use std::slice::Iter;
+use datafusion_common::{Result, internal_err};
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
-/// Calculates the union (in the sense of `UnionExec`) `EquivalenceProperties`
-/// of  `lhs` and `rhs` according to the schema of `lhs`.
+/// Computes the union (in the sense of `UnionExec`) `EquivalenceProperties`
+/// of `lhs` and `rhs` according to the schema of `lhs`.
 ///
-/// Rules: The UnionExec does not interleave its inputs: instead it passes each
-/// input partition from the children as its own output.
+/// Rules: The `UnionExec` does not interleave its inputs, instead it passes
+/// each input partition from the children as its own output.
 ///
 /// Since the output equivalence properties are properties that are true for
 /// *all* output partitions, that is the same as being true for all *input*
-/// partitions
+/// partitions.
 fn calculate_union_binary(
     lhs: EquivalenceProperties,
     mut rhs: EquivalenceProperties,
@@ -48,28 +46,21 @@ fn calculate_union_binary(
 
     // First, calculate valid constants for the union. An expression is constant
     // at the output of the union if it is constant in both sides with matching values.
+    let rhs_constants = rhs.constants();
     let constants = lhs
         .constants()
-        .iter()
+        .into_iter()
         .filter_map(|lhs_const| {
             // Find matching constant expression in RHS
-            rhs.constants()
+            rhs_constants
                 .iter()
-                .find(|rhs_const| rhs_const.expr().eq(lhs_const.expr()))
+                .find(|rhs_const| rhs_const.expr.eq(&lhs_const.expr))
                 .map(|rhs_const| {
-                    let mut const_expr = ConstExpr::new(Arc::clone(lhs_const.expr()));
-
-                    // If both sides have matching constant values, preserve the value and set across_partitions=true
-                    if let (
-                        AcrossPartitions::Uniform(Some(lhs_val)),
-                        AcrossPartitions::Uniform(Some(rhs_val)),
-                    ) = (lhs_const.across_partitions(), rhs_const.across_partitions())
-                    {
-                        if lhs_val == rhs_val {
-                            const_expr = const_expr.with_across_partitions(
-                                AcrossPartitions::Uniform(Some(lhs_val)),
-                            )
-                        }
+                    let mut const_expr = lhs_const.clone();
+                    // If both sides have matching constant values, preserve it.
+                    // Otherwise, set fall back to heterogeneous values.
+                    if lhs_const.across_partitions != rhs_const.across_partitions {
+                        const_expr.across_partitions = AcrossPartitions::Heterogeneous;
                     }
                     const_expr
                 })
@@ -79,14 +70,13 @@ fn calculate_union_binary(
     // Next, calculate valid orderings for the union by searching for prefixes
     // in both sides.
     let mut orderings = UnionEquivalentOrderingBuilder::new();
-    orderings.add_satisfied_orderings(lhs.normalized_oeq_class(), lhs.constants(), &rhs);
-    orderings.add_satisfied_orderings(rhs.normalized_oeq_class(), rhs.constants(), &lhs);
+    orderings.add_satisfied_orderings(&lhs, &rhs)?;
+    orderings.add_satisfied_orderings(&rhs, &lhs)?;
     let orderings = orderings.build();
 
-    let mut eq_properties =
-        EquivalenceProperties::new(lhs.schema).with_constants(constants);
-
-    eq_properties.add_new_orderings(orderings);
+    let mut eq_properties = EquivalenceProperties::new(lhs.schema);
+    eq_properties.add_constants(constants)?;
+    eq_properties.add_orderings(orderings);
     Ok(eq_properties)
 }
 
@@ -137,135 +127,139 @@ impl UnionEquivalentOrderingBuilder {
         Self { orderings: vec![] }
     }
 
-    /// Add all orderings from `orderings` that satisfy `properties`,
-    /// potentially augmented with`constants`.
+    /// Add all orderings from `source` that satisfy `properties`,
+    /// potentially augmented with the constants in `source`.
     ///
-    /// Note: any column that is known to be constant can be inserted into the
-    /// ordering without changing its meaning
+    /// Note: Any column that is known to be constant can be inserted into the
+    /// ordering without changing its meaning.
     ///
     /// For example:
-    /// * `orderings` contains `[a ASC, c ASC]` and `constants` contains `b`
-    /// * `properties` has required ordering `[a ASC, b ASC]`
+    /// * Orderings in `source` contains `[a ASC, c ASC]` and constants contains
+    ///   `b`,
+    /// * `properties` has the ordering `[a ASC, b ASC]`.
     ///
     /// Then this will add `[a ASC, b ASC]` to the `orderings` list (as `a` was
     /// in the sort order and `b` was a constant).
     fn add_satisfied_orderings(
         &mut self,
-        orderings: impl IntoIterator<Item = LexOrdering>,
-        constants: &[ConstExpr],
+        source: &EquivalenceProperties,
         properties: &EquivalenceProperties,
-    ) {
-        for mut ordering in orderings.into_iter() {
+    ) -> Result<()> {
+        let constants = source.constants();
+        let properties_constants = properties.constants();
+        for mut ordering in source.oeq_cache.normal_cls.clone() {
             // Progressively shorten the ordering to search for a satisfied prefix:
             loop {
-                match self.try_add_ordering(ordering, constants, properties) {
+                ordering = match self.try_add_ordering(
+                    ordering,
+                    &constants,
+                    properties,
+                    &properties_constants,
+                )? {
                     AddedOrdering::Yes => break,
-                    AddedOrdering::No(o) => {
-                        ordering = o;
-                        ordering.pop();
+                    AddedOrdering::No(ordering) => {
+                        let mut sort_exprs: Vec<_> = ordering.into();
+                        sort_exprs.pop();
+                        if let Some(ordering) = LexOrdering::new(sort_exprs) {
+                            ordering
+                        } else {
+                            break;
+                        }
                     }
                 }
             }
         }
+        Ok(())
     }
 
-    /// Adds `ordering`, potentially augmented with constants, if it satisfies
-    /// the target `properties` properties.
-    ///
-    /// Returns
+    /// Adds `ordering`, potentially augmented with `constants`, if it satisfies
+    /// the given `properties`.
     ///
-    /// * [`AddedOrdering::Yes`] if the ordering was added (either directly or
-    ///   augmented), or was empty.
+    /// # Returns
     ///
-    /// * [`AddedOrdering::No`] if the ordering was not added
+    /// An [`AddedOrdering::Yes`] instance if the ordering was added (either
+    /// directly or augmented), or was empty. An [`AddedOrdering::No`] instance
+    /// otherwise.
     fn try_add_ordering(
         &mut self,
         ordering: LexOrdering,
         constants: &[ConstExpr],
         properties: &EquivalenceProperties,
-    ) -> AddedOrdering {
-        if ordering.is_empty() {
-            AddedOrdering::Yes
-        } else if properties.ordering_satisfy(ordering.as_ref()) {
+        properties_constants: &[ConstExpr],
+    ) -> Result<AddedOrdering> {
+        if properties.ordering_satisfy(ordering.clone())? {
             // If the ordering satisfies the target properties, no need to
             // augment it with constants.
             self.orderings.push(ordering);
-            AddedOrdering::Yes
+            Ok(AddedOrdering::Yes)
+        } else if self.try_find_augmented_ordering(
+            &ordering,
+            constants,
+            properties,
+            properties_constants,
+        ) {
+            // Augmented with constants to match the properties.
+            Ok(AddedOrdering::Yes)
         } else {
-            // Did not satisfy target properties, try and augment with constants
-            //  to match the properties
-            if self.try_find_augmented_ordering(&ordering, constants, properties) {
-                AddedOrdering::Yes
-            } else {
-                AddedOrdering::No(ordering)
-            }
+            Ok(AddedOrdering::No(ordering))
         }
     }
 
     /// Attempts to add `constants` to `ordering` to satisfy the properties.
-    ///
-    /// returns true if any orderings were added, false otherwise
+    /// Returns `true` if augmentation took place, `false` otherwise.
     fn try_find_augmented_ordering(
         &mut self,
         ordering: &LexOrdering,
         constants: &[ConstExpr],
         properties: &EquivalenceProperties,
+        properties_constants: &[ConstExpr],
     ) -> bool {
-        // can't augment if there is nothing to augment with
-        if constants.is_empty() {
-            return false;
-        }
-        let start_num_orderings = self.orderings.len();
-
-        // for each equivalent ordering in properties, try and augment
-        // `ordering` it with the constants to match
-        for existing_ordering in properties.oeq_class.iter() {
-            if let Some(augmented_ordering) = self.augment_ordering(
-                ordering,
-                constants,
-                existing_ordering,
-                &properties.constants,
-            ) {
-                if !augmented_ordering.is_empty() {
-                    assert!(properties.ordering_satisfy(augmented_ordering.as_ref()));
+        let mut result = false;
+        // Can only augment if there are constants.
+        if !constants.is_empty() {
+            // For each equivalent ordering in properties, try and augment
+            // `ordering` with the constants to match `existing_ordering`:
+            for existing_ordering in properties.oeq_class.iter() {
+                if let Some(augmented_ordering) = Self::augment_ordering(
+                    ordering,
+                    constants,
+                    existing_ordering,
+                    properties_constants,
+                ) {
                     self.orderings.push(augmented_ordering);
+                    result = true;
                 }
             }
         }
-
-        self.orderings.len() > start_num_orderings
+        result
     }
 
-    /// Attempts to augment the ordering with constants to match the
-    /// `existing_ordering`
-    ///
-    /// Returns Some(ordering) if an augmented ordering was found, None otherwise
+    /// Attempts to augment the ordering with constants to match `existing_ordering`.
+    /// Returns `Some(ordering)` if an augmented ordering was found, `None` otherwise.
     fn augment_ordering(
-        &mut self,
         ordering: &LexOrdering,
         constants: &[ConstExpr],
         existing_ordering: &LexOrdering,
         existing_constants: &[ConstExpr],
     ) -> Option<LexOrdering> {
-        let mut augmented_ordering = LexOrdering::default();
-        let mut sort_expr_iter = ordering.iter().peekable();
-        let mut existing_sort_expr_iter = existing_ordering.iter().peekable();
-
-        // walk in parallel down the two orderings, trying to match them up
-        while sort_expr_iter.peek().is_some() || existing_sort_expr_iter.peek().is_some()
-        {
-            // If the next expressions are equal, add the next match
-            // otherwise try and match with a constant
+        let mut augmented_ordering = vec![];
+        let mut sort_exprs = ordering.iter().peekable();
+        let mut existing_sort_exprs = existing_ordering.iter().peekable();
+
+        // Walk in parallel down the two orderings, trying to match them up:
+        while sort_exprs.peek().is_some() || existing_sort_exprs.peek().is_some() {
+            // If the next expressions are equal, add the next match. Otherwise,
+            // try and match with a constant.
             if let Some(expr) =
-                advance_if_match(&mut sort_expr_iter, &mut existing_sort_expr_iter)
+                advance_if_match(&mut sort_exprs, &mut existing_sort_exprs)
             {
                 augmented_ordering.push(expr);
             } else if let Some(expr) =
-                advance_if_matches_constant(&mut sort_expr_iter, existing_constants)
+                advance_if_matches_constant(&mut sort_exprs, existing_constants)
             {
                 augmented_ordering.push(expr);
             } else if let Some(expr) =
-                advance_if_matches_constant(&mut existing_sort_expr_iter, constants)
+                advance_if_matches_constant(&mut existing_sort_exprs, constants)
             {
                 augmented_ordering.push(expr);
             } else {
@@ -274,7 +268,7 @@ impl UnionEquivalentOrderingBuilder {
             }
         }
 
-        Some(augmented_ordering)
+        LexOrdering::new(augmented_ordering)
     }
 
     fn build(self) -> Vec<LexOrdering> {
@@ -282,47 +276,38 @@ impl UnionEquivalentOrderingBuilder {
     }
 }
 
-/// Advances two iterators in parallel
-///
-/// If the next expressions are equal, the iterators are advanced and returns
-/// the matched expression .
-///
-/// Otherwise, the iterators are left unchanged and return `None`
-fn advance_if_match(
-    iter1: &mut Peekable<Iter<PhysicalSortExpr>>,
-    iter2: &mut Peekable<Iter<PhysicalSortExpr>>,
+/// Advances two iterators in parallel if the next expressions are equal.
+/// Otherwise, the iterators are left unchanged and returns `None`.
+fn advance_if_match<'a>(
+    iter1: &mut Peekable<impl Iterator<Item = &'a PhysicalSortExpr>>,
+    iter2: &mut Peekable<impl Iterator<Item = &'a PhysicalSortExpr>>,
 ) -> Option<PhysicalSortExpr> {
-    if matches!((iter1.peek(), iter2.peek()), (Some(expr1), Some(expr2)) if expr1.eq(expr2))
-    {
-        iter1.next().unwrap();
+    let (expr1, expr2) = (iter1.peek()?, iter2.peek()?);
+    if expr1.eq(expr2) {
+        iter1.next();
         iter2.next().cloned()
     } else {
         None
     }
 }
 
-/// Advances the iterator with a constant
-///
-/// If the next expression  matches one of the constants, advances the iterator
-/// returning the matched expression
-///
-/// Otherwise, the iterator is left unchanged and returns `None`
-fn advance_if_matches_constant(
-    iter: &mut Peekable<Iter<PhysicalSortExpr>>,
+/// Advances the iterator with a constant if the next expression matches one of
+/// the constants. Otherwise, the iterator is left unchanged and returns `None`.
+fn advance_if_matches_constant<'a>(
+    iter: &mut Peekable<impl Iterator<Item = &'a PhysicalSortExpr>>,
     constants: &[ConstExpr],
 ) -> Option<PhysicalSortExpr> {
     let expr = iter.peek()?;
-    let const_expr = constants.iter().find(|c| c.eq_expr(expr))?;
-    let found_expr = PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options);
+    let const_expr = constants.iter().find(|c| expr.expr.eq(&c.expr))?;
+    let found_expr = PhysicalSortExpr::new(Arc::clone(&const_expr.expr), expr.options);
     iter.next();
     Some(found_expr)
 }
 
 #[cfg(test)]
 mod tests {
-
     use super::*;
-    use crate::equivalence::class::const_exprs_contains;
+    use crate::PhysicalExpr;
     use crate::equivalence::tests::{create_test_schema, parse_sort_expr};
     use crate::expressions::col;
 
@@ -331,85 +316,95 @@ mod tests {
 
     use itertools::Itertools;
 
+    /// Checks whether `expr` is among in the `const_exprs`.
+    fn const_exprs_contains(
+        const_exprs: &[ConstExpr],
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> bool {
+        const_exprs
+            .iter()
+            .any(|const_expr| const_expr.expr.eq(expr))
+    }
+
     #[test]
-    fn test_union_equivalence_properties_multi_children_1() {
+    fn test_union_equivalence_properties_multi_children_1() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         let schema3 = append_fields(&schema, "2");
         UnionEquivalenceTest::new(&schema)
             // Children 1
-            .with_child_sort(vec![vec!["a", "b", "c"]], &schema)
+            .with_child_sort(vec![vec!["a", "b", "c"]], &schema)?
             // Children 2
-            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)
+            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)?
             // Children 3
-            .with_child_sort(vec![vec!["a2", "b2"]], &schema3)
-            .with_expected_sort(vec![vec!["a", "b"]])
+            .with_child_sort(vec![vec!["a2", "b2"]], &schema3)?
+            .with_expected_sort(vec![vec!["a", "b"]])?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_multi_children_2() {
+    fn test_union_equivalence_properties_multi_children_2() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         let schema3 = append_fields(&schema, "2");
         UnionEquivalenceTest::new(&schema)
             // Children 1
-            .with_child_sort(vec![vec!["a", "b", "c"]], &schema)
+            .with_child_sort(vec![vec!["a", "b", "c"]], &schema)?
             // Children 2
-            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)
+            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)?
             // Children 3
-            .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3)
-            .with_expected_sort(vec![vec!["a", "b", "c"]])
+            .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3)?
+            .with_expected_sort(vec![vec!["a", "b", "c"]])?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_multi_children_3() {
+    fn test_union_equivalence_properties_multi_children_3() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         let schema3 = append_fields(&schema, "2");
         UnionEquivalenceTest::new(&schema)
             // Children 1
-            .with_child_sort(vec![vec!["a", "b"]], &schema)
+            .with_child_sort(vec![vec!["a", "b"]], &schema)?
             // Children 2
-            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)
+            .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2)?
             // Children 3
-            .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3)
-            .with_expected_sort(vec![vec!["a", "b"]])
+            .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3)?
+            .with_expected_sort(vec![vec!["a", "b"]])?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_multi_children_4() {
+    fn test_union_equivalence_properties_multi_children_4() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         let schema3 = append_fields(&schema, "2");
         UnionEquivalenceTest::new(&schema)
             // Children 1
-            .with_child_sort(vec![vec!["a", "b"]], &schema)
+            .with_child_sort(vec![vec!["a", "b"]], &schema)?
             // Children 2
-            .with_child_sort(vec![vec!["a1", "b1"]], &schema2)
+            .with_child_sort(vec![vec!["a1", "b1"]], &schema2)?
             // Children 3
-            .with_child_sort(vec![vec!["b2", "c2"]], &schema3)
-            .with_expected_sort(vec![])
+            .with_child_sort(vec![vec!["b2", "c2"]], &schema3)?
+            .with_expected_sort(vec![])?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_multi_children_5() {
+    fn test_union_equivalence_properties_multi_children_5() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         UnionEquivalenceTest::new(&schema)
             // Children 1
-            .with_child_sort(vec![vec!["a", "b"], vec!["c"]], &schema)
+            .with_child_sort(vec![vec!["a", "b"], vec!["c"]], &schema)?
             // Children 2
-            .with_child_sort(vec![vec!["a1", "b1"], vec!["c1"]], &schema2)
-            .with_expected_sort(vec![vec!["a", "b"], vec!["c"]])
+            .with_child_sort(vec![vec!["a1", "b1"], vec!["c1"]], &schema2)?
+            .with_expected_sort(vec![vec!["a", "b"], vec!["c"]])?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_common_constants() {
+    fn test_union_equivalence_properties_constants_common_constants() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -417,23 +412,23 @@ mod tests {
                 vec![vec!["a"]],
                 vec!["b", "c"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child: [b ASC], const [a, c]
                 vec![vec!["b"]],
                 vec!["a", "c"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union expected orderings: [[a ASC], [b ASC]], const [c]
                 vec![vec!["a"], vec!["b"]],
                 vec!["c"],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_prefix() {
+    fn test_union_equivalence_properties_constants_prefix() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -441,23 +436,23 @@ mod tests {
                 vec![vec!["a"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child: [a ASC, b ASC], const []
                 vec![vec!["a", "b"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [a ASC], const []
                 vec![vec!["a"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_asc_desc_mismatch() {
+    fn test_union_equivalence_properties_constants_asc_desc_mismatch() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -465,23 +460,23 @@ mod tests {
                 vec![vec!["a"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [a DESC], const []
                 vec![vec!["a DESC"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union doesn't have any ordering or constant
                 vec![],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_different_schemas() {
+    fn test_union_equivalence_properties_constants_different_schemas() -> Result<()> {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         UnionEquivalenceTest::new(&schema)
@@ -490,13 +485,13 @@ mod tests {
                 vec![vec!["a"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [a1 ASC, b1 ASC], const []
                 vec![vec!["a1", "b1"]],
                 vec![],
                 &schema2,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [a ASC]
                 //
@@ -504,12 +499,12 @@ mod tests {
                 // corresponding schemas.
                 vec![vec!["a"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_fill_gaps() {
+    fn test_union_equivalence_properties_constants_fill_gaps() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -517,13 +512,13 @@ mod tests {
                 vec![vec!["a", "c"]],
                 vec!["b"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [b ASC, c ASC], const [a]
                 vec![vec!["b", "c"]],
                 vec!["a"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [
                 //   [a ASC, b ASC, c ASC],
@@ -531,12 +526,12 @@ mod tests {
                 // ], const []
                 vec![vec!["a", "b", "c"], vec!["b", "a", "c"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_no_fill_gaps() {
+    fn test_union_equivalence_properties_constants_no_fill_gaps() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -544,23 +539,23 @@ mod tests {
                 vec![vec!["a", "c"]],
                 vec!["d"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [b ASC, c ASC], const [a]
                 vec![vec!["b", "c"]],
                 vec!["a"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [[a]] (only a is constant)
                 vec![vec!["a"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_fill_some_gaps() {
+    fn test_union_equivalence_properties_constants_fill_some_gaps() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -568,23 +563,24 @@ mod tests {
                 vec![vec!["c"]],
                 vec!["a", "b"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [a DESC, b], const []
                 vec![vec!["a DESC", "b"]],
                 vec![],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [[a, b]] (can fill in the a/b with constants)
                 vec![vec!["a DESC", "b"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_fill_gaps_non_symmetric() {
+    fn test_union_equivalence_properties_constants_fill_gaps_non_symmetric() -> Result<()>
+    {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -592,13 +588,13 @@ mod tests {
                 vec![vec!["a", "c"]],
                 vec!["b"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [b ASC, c ASC], const [a]
                 vec![vec!["b DESC", "c"]],
                 vec!["a"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [
                 //   [a ASC, b ASC, c ASC],
@@ -606,12 +602,12 @@ mod tests {
                 // ], const []
                 vec![vec!["a", "b DESC", "c"], vec!["b DESC", "a", "c"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_gap_fill_symmetric() {
+    fn test_union_equivalence_properties_constants_gap_fill_symmetric() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -619,25 +615,25 @@ mod tests {
                 vec![vec!["a", "b", "d"]],
                 vec!["c"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child: [a ASC, c ASC, d ASC], const [b]
                 vec![vec!["a", "c", "d"]],
                 vec!["b"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings:
                 // [a, b, c, d]
                 // [a, c, b, d]
                 vec![vec!["a", "c", "b", "d"], vec!["a", "b", "c", "d"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_gap_fill_and_common() {
+    fn test_union_equivalence_properties_constants_gap_fill_and_common() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -645,24 +641,24 @@ mod tests {
                 vec![vec!["a DESC", "d"]],
                 vec!["b", "c"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child: [a DESC, c ASC, d ASC], const [b]
                 vec![vec!["a DESC", "c", "d"]],
                 vec!["b"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings:
                 // [a DESC, c, d]  [b]
                 vec![vec!["a DESC", "c", "d"]],
                 vec!["b"],
-            )
+            )?
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_middle_desc() {
+    fn test_union_equivalence_properties_constants_middle_desc() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -672,20 +668,20 @@ mod tests {
                 vec![vec!["a", "b DESC", "d"]],
                 vec!["c"],
                 &schema,
-            )
+            )?
             .with_child_sort_and_const_exprs(
                 // Second child: [a ASC, c ASC, d ASC], const [b]
                 vec![vec!["a", "c", "d"]],
                 vec!["b"],
                 &schema,
-            )
+            )?
             .with_expected_sort_and_const_exprs(
                 // Union orderings:
                 // [a, b, d] (c constant)
                 // [a, c, d] (b constant)
                 vec![vec!["a", "c", "b DESC", "d"], vec!["a", "b DESC", "c", "d"]],
                 vec![],
-            )
+            )?
             .run()
     }
 
@@ -718,10 +714,10 @@ mod tests {
             mut self,
             orderings: Vec<Vec<&str>>,
             schema: &SchemaRef,
-        ) -> Self {
-            let properties = self.make_props(orderings, vec![], schema);
+        ) -> Result<Self> {
+            let properties = self.make_props(orderings, vec![], schema)?;
             self.child_properties.push(properties);
-            self
+            Ok(self)
         }
 
         /// Add a union input with the specified orderings and constant
@@ -734,19 +730,19 @@ mod tests {
             orderings: Vec<Vec<&str>>,
             constants: Vec<&str>,
             schema: &SchemaRef,
-        ) -> Self {
-            let properties = self.make_props(orderings, constants, schema);
+        ) -> Result<Self> {
+            let properties = self.make_props(orderings, constants, schema)?;
             self.child_properties.push(properties);
-            self
+            Ok(self)
         }
 
         /// Set the expected output sort order for the union of the children
         ///
         /// See [`Self::make_props`] for the format of the strings in `orderings`
-        fn with_expected_sort(mut self, orderings: Vec<Vec<&str>>) -> Self {
-            let properties = self.make_props(orderings, vec![], &self.output_schema);
+        fn with_expected_sort(mut self, orderings: Vec<Vec<&str>>) -> Result<Self> {
+            let properties = self.make_props(orderings, vec![], &self.output_schema)?;
             self.expected_properties = Some(properties);
-            self
+            Ok(self)
         }
 
         /// Set the expected output sort order and constant expressions for the
@@ -758,15 +754,16 @@ mod tests {
             mut self,
             orderings: Vec<Vec<&str>>,
             constants: Vec<&str>,
-        ) -> Self {
-            let properties = self.make_props(orderings, constants, &self.output_schema);
+        ) -> Result<Self> {
+            let properties =
+                self.make_props(orderings, constants, &self.output_schema)?;
             self.expected_properties = Some(properties);
-            self
+            Ok(self)
         }
 
         /// compute the union's output equivalence properties from the child
         /// properties, and compare them to the expected properties
-        fn run(self) {
+        fn run(self) -> Result<()> {
             let Self {
                 output_schema,
                 child_properties,
@@ -798,6 +795,7 @@ mod tests {
                     ),
                 );
             }
+            Ok(())
         }
 
         fn assert_eq_properties_same(
@@ -808,9 +806,9 @@ mod tests {
             // Check whether constants are same
             let lhs_constants = lhs.constants();
             let rhs_constants = rhs.constants();
-            for rhs_constant in rhs_constants {
+            for rhs_constant in &rhs_constants {
                 assert!(
-                    const_exprs_contains(lhs_constants, rhs_constant.expr()),
+                    const_exprs_contains(&lhs_constants, &rhs_constant.expr),
                     "{err_msg}\nlhs: {lhs}\nrhs: {rhs}"
                 );
             }
@@ -845,24 +843,19 @@ mod tests {
             orderings: Vec<Vec<&str>>,
             constants: Vec<&str>,
             schema: &SchemaRef,
-        ) -> EquivalenceProperties {
-            let orderings = orderings
-                .iter()
-                .map(|ordering| {
-                    ordering
-                        .iter()
-                        .map(|name| parse_sort_expr(name, schema))
-                        .collect::<LexOrdering>()
-                })
-                .collect::<Vec<_>>();
+        ) -> Result<EquivalenceProperties> {
+            let orderings = orderings.iter().map(|ordering| {
+                ordering.iter().map(|name| parse_sort_expr(name, schema))
+            });
 
             let constants = constants
                 .iter()
-                .map(|col_name| ConstExpr::new(col(col_name, schema).unwrap()))
-                .collect::<Vec<_>>();
+                .map(|col_name| ConstExpr::from(col(col_name, schema).unwrap()));
 
-            EquivalenceProperties::new_with_orderings(Arc::clone(schema), &orderings)
-                .with_constants(constants)
+            let mut props =
+                EquivalenceProperties::new_with_orderings(Arc::clone(schema), orderings);
+            props.add_constants(constants)?;
+            Ok(props)
         }
     }
 
@@ -877,25 +870,29 @@ mod tests {
         let literal_10 = ScalarValue::Int32(Some(10));
 
         // Create first input with a=10
-        let const_expr1 = ConstExpr::new(Arc::clone(&col_a))
-            .with_across_partitions(AcrossPartitions::Uniform(Some(literal_10.clone())));
-        let input1 = EquivalenceProperties::new(Arc::clone(&schema))
-            .with_constants(vec![const_expr1]);
+        let const_expr1 = ConstExpr::new(
+            Arc::clone(&col_a),
+            AcrossPartitions::Uniform(Some(literal_10.clone())),
+        );
+        let mut input1 = EquivalenceProperties::new(Arc::clone(&schema));
+        input1.add_constants(vec![const_expr1])?;
 
         // Create second input with a=10
-        let const_expr2 = ConstExpr::new(Arc::clone(&col_a))
-            .with_across_partitions(AcrossPartitions::Uniform(Some(literal_10.clone())));
-        let input2 = EquivalenceProperties::new(Arc::clone(&schema))
-            .with_constants(vec![const_expr2]);
+        let const_expr2 = ConstExpr::new(
+            Arc::clone(&col_a),
+            AcrossPartitions::Uniform(Some(literal_10.clone())),
+        );
+        let mut input2 = EquivalenceProperties::new(Arc::clone(&schema));
+        input2.add_constants(vec![const_expr2])?;
 
         // Calculate union properties
         let union_props = calculate_union(vec![input1, input2], schema)?;
 
         // Verify column 'a' remains constant with value 10
         let const_a = &union_props.constants()[0];
-        assert!(const_a.expr().eq(&col_a));
+        assert!(const_a.expr.eq(&col_a));
         assert_eq!(
-            const_a.across_partitions(),
+            const_a.across_partitions,
             AcrossPartitions::Uniform(Some(literal_10))
         );
 
@@ -924,4 +921,63 @@ mod tests {
                 .collect::<Vec<_>>(),
         ))
     }
+
+    #[test]
+    fn test_constants_share_values() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("const_1", DataType::Utf8, false),
+            Field::new("const_2", DataType::Utf8, false),
+        ]));
+
+        let col_const_1 = col("const_1", &schema)?;
+        let col_const_2 = col("const_2", &schema)?;
+
+        let literal_foo = ScalarValue::Utf8(Some("foo".to_owned()));
+        let literal_bar = ScalarValue::Utf8(Some("bar".to_owned()));
+
+        let const_expr_1_foo = ConstExpr::new(
+            Arc::clone(&col_const_1),
+            AcrossPartitions::Uniform(Some(literal_foo.clone())),
+        );
+        let const_expr_2_foo = ConstExpr::new(
+            Arc::clone(&col_const_2),
+            AcrossPartitions::Uniform(Some(literal_foo.clone())),
+        );
+        let const_expr_2_bar = ConstExpr::new(
+            Arc::clone(&col_const_2),
+            AcrossPartitions::Uniform(Some(literal_bar.clone())),
+        );
+
+        let mut input1 = EquivalenceProperties::new(Arc::clone(&schema));
+        let mut input2 = EquivalenceProperties::new(Arc::clone(&schema));
+
+        // | Input | Const_1 | Const_2 |
+        // | ----- | ------- | ------- |
+        // |     1 | foo     | foo     |
+        // |     2 | foo     | bar     |
+        input1.add_constants(vec![const_expr_1_foo.clone(), const_expr_2_foo.clone()])?;
+        input2.add_constants(vec![const_expr_1_foo.clone(), const_expr_2_bar.clone()])?;
+
+        // Calculate union properties
+        let union_props = calculate_union(vec![input1, input2], schema)?;
+
+        // This should result in:
+        //   const_1 = Uniform("foo")
+        //   const_2 = Heterogeneous
+        assert_eq!(union_props.constants().len(), 2);
+        let union_const_1 = &union_props.constants()[0];
+        assert!(union_const_1.expr.eq(&col_const_1));
+        assert_eq!(
+            union_const_1.across_partitions,
+            AcrossPartitions::Uniform(Some(literal_foo)),
+        );
+        let union_const_2 = &union_props.constants()[1];
+        assert!(union_const_2.expr.eq(&col_const_2));
+        assert_eq!(
+            union_const_2.across_partitions,
+            AcrossPartitions::Heterogeneous,
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index 798e68a459ce6..02628b405ec6c 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -17,39 +17,36 @@
 
 mod kernels;
 
-use crate::expressions::binary::kernels::concat_elements_utf8view;
-use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison};
 use crate::PhysicalExpr;
+use crate::intervals::cp_solver::{propagate_arithmetic, propagate_comparison};
 use std::hash::Hash;
 use std::{any::Any, sync::Arc};
 
 use arrow::array::*;
-use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene};
-use arrow::compute::kernels::cmp::*;
-use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar};
+use arrow::compute::kernels::boolean::{and_kleene, or_kleene};
 use arrow::compute::kernels::concat_elements::concat_elements_utf8;
-use arrow::compute::{
-    cast, filter_record_batch, ilike, like, nilike, nlike, SlicesIterator,
-};
+use arrow::compute::{SlicesIterator, cast, filter_record_batch};
 use arrow::datatypes::*;
 use arrow::error::ArrowError;
 use datafusion_common::cast::as_boolean_array;
-use datafusion_common::{internal_err, not_impl_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, internal_err, not_impl_err};
+
 use datafusion_expr::binary::BinaryTypeCoercer;
-use datafusion_expr::interval_arithmetic::{apply_operator, Interval};
+use datafusion_expr::interval_arithmetic::{Interval, apply_operator};
 use datafusion_expr::sort_properties::ExprProperties;
 use datafusion_expr::statistics::Distribution::{Bernoulli, Gaussian};
 use datafusion_expr::statistics::{
-    combine_bernoullis, combine_gaussians, create_bernoulli_from_comparison,
-    new_generic_from_binary_op, Distribution,
+    Distribution, combine_bernoullis, combine_gaussians,
+    create_bernoulli_from_comparison, new_generic_from_binary_op,
 };
 use datafusion_expr::{ColumnarValue, Operator};
-use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested};
+use datafusion_physical_expr_common::datum::{apply, apply_cmp};
 
 use kernels::{
     bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar,
     bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn,
     bitwise_shift_right_dyn_scalar, bitwise_xor_dyn, bitwise_xor_dyn_scalar,
+    concat_elements_utf8view, regex_match_dyn, regex_match_dyn_scalar,
 };
 
 /// Binary expression
@@ -160,181 +157,98 @@ fn boolean_op(
     left: &dyn Array,
     right: &dyn Array,
     op: impl FnOnce(&BooleanArray, &BooleanArray) -> Result<BooleanArray, ArrowError>,
-) -> Result<Arc<(dyn Array + 'static)>, ArrowError> {
+) -> Result<Arc<dyn Array + 'static>, ArrowError> {
     let ll = as_boolean_array(left).expect("boolean_op failed to downcast left array");
     let rr = as_boolean_array(right).expect("boolean_op failed to downcast right array");
     op(ll, rr).map(|t| Arc::new(t) as _)
 }
 
-macro_rules! binary_string_array_flag_op {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
-        match $LEFT.data_type() {
-            DataType::Utf8 => {
-                compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
-            },
-            DataType::Utf8View => {
-                compute_utf8view_flag_op!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG)
-            }
-            DataType::LargeUtf8 => {
-                compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
-            },
-            other => internal_err!(
-                "Data type {:?} not supported for binary_string_array_flag_op operation '{}' on string array",
-                other, stringify!($OP)
-            ),
-        }
-    }};
+/// Returns true if both operands are Date types (Date32 or Date64)
+/// Used to detect Date - Date operations which should return Int64 (days difference)
+fn is_date_minus_date(lhs: &DataType, rhs: &DataType) -> bool {
+    matches!(
+        (lhs, rhs),
+        (DataType::Date32, DataType::Date32) | (DataType::Date64, DataType::Date64)
+    )
 }
 
-/// Invoke a compute kernel on a pair of binary data arrays with flags
-macro_rules! compute_utf8_flag_op {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
-        let ll = $LEFT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8_flag_op failed to downcast array");
-        let rr = $RIGHT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8_flag_op failed to downcast array");
-
-        let flag = if $FLAG {
-            Some($ARRAYTYPE::from(vec!["i"; ll.len()]))
-        } else {
-            None
-        };
-        let mut array = $OP(ll, rr, flag.as_ref())?;
-        if $NOT {
-            array = not(&array).unwrap();
-        }
-        Ok(Arc::new(array))
-    }};
-}
+/// Computes the difference between two dates and returns the result as Int64 (days)
+/// This aligns with PostgreSQL, DuckDB, and MySQL behavior where date - date returns an integer
+///
+/// Implementation: Uses Arrow's sub_wrapping to get Duration, then converts to Int64 days
+fn apply_date_subtraction(
+    lhs: &ColumnarValue,
+    rhs: &ColumnarValue,
+) -> Result<ColumnarValue> {
+    use arrow::compute::kernels::numeric::sub_wrapping;
 
-/// Invoke a compute kernel on a pair of binary data arrays with flags
-macro_rules! compute_utf8view_flag_op {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
-        let ll = $LEFT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8view_flag_op failed to downcast array");
-        let rr = $RIGHT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8view_flag_op failed to downcast array");
+    // Use Arrow's sub_wrapping to compute the Duration result
+    let duration_result = apply(lhs, rhs, sub_wrapping)?;
 
-        let flag = if $FLAG {
-            Some($ARRAYTYPE::from(vec!["i"; ll.len()]))
-        } else {
-            None
-        };
-        let mut array = $OP(ll, rr, flag.as_ref())?;
-        if $NOT {
-            array = not(&array).unwrap();
+    // Convert Duration to Int64 (days)
+    match duration_result {
+        ColumnarValue::Array(array) => {
+            let int64_array = duration_to_days(&array)?;
+            Ok(ColumnarValue::Array(int64_array))
         }
-        Ok(Arc::new(array))
-    }};
-}
-
-macro_rules! binary_string_array_flag_op_scalar {
-    ($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
-        // This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value,
-        // the query can be optimized in such a way that operands will be dicts, so we need to support it here
-        let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
-            DataType::Utf8 => {
-                compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
-            },
-            DataType::Utf8View => {
-                compute_utf8view_flag_op_scalar!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG)
-            }
-            DataType::LargeUtf8 => {
-                compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
-            },
-            DataType::Dictionary(_, _) => {
-                let values = $LEFT.as_any_dictionary().values();
-
-                match values.data_type() {
-                    DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
-                    DataType::Utf8View => compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT, $FLAG),
-                    DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG),
-                    other => internal_err!(
-                        "Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array",
-                        other, stringify!($OP)
-                    ),
-                }.map(
-                    // downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before
-                    |evaluated_values| downcast_dictionary_array! {
-                        $LEFT => {
-                            let unpacked_dict = evaluated_values.take_iter($LEFT.keys().iter().map(|opt| opt.map(|v| v as _))).collect::<BooleanArray>();
-                            Arc::new(unpacked_dict) as _
-                        },
-                        _ => unreachable!(),
-                    }
-                )
-            },
-            other => internal_err!(
-                "Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array",
-                other, stringify!($OP)
-            ),
-        };
-        Some(result)
-    }};
-}
-
-/// Invoke a compute kernel on a data array and a scalar value with flag
-macro_rules! compute_utf8_flag_op_scalar {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
-        let ll = $LEFT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8_flag_op_scalar failed to downcast array");
-
-        let string_value = match $RIGHT.try_as_str() {
-            Some(Some(string_value)) => string_value,
-            // null literal or non string
-            _ => return internal_err!(
-                        "compute_utf8_flag_op_scalar failed to cast literal value {} for operation '{}'",
-                        $RIGHT, stringify!($OP)
-                    )
-        };
-
-        let flag = $FLAG.then_some("i");
-        let mut array =
-            paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
-        if $NOT {
-            array = not(&array).unwrap();
+        ColumnarValue::Scalar(scalar) => {
+            // Convert scalar Duration to Int64 days
+            let array = scalar.to_array_of_size(1)?;
+            let int64_array = duration_to_days(&array)?;
+            let int64_scalar = ScalarValue::try_from_array(int64_array.as_ref(), 0)?;
+            Ok(ColumnarValue::Scalar(int64_scalar))
         }
-
-        Ok(Arc::new(array))
-    }};
+    }
 }
 
-/// Invoke a compute kernel on a data array and a scalar value with flag
-macro_rules! compute_utf8view_flag_op_scalar {
-    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
-        let ll = $LEFT
-            .as_any()
-            .downcast_ref::<$ARRAYTYPE>()
-            .expect("compute_utf8view_flag_op_scalar failed to downcast array");
-
-        let string_value = match $RIGHT.try_as_str() {
-            Some(Some(string_value)) => string_value,
-            // null literal or non string
-            _ => return internal_err!(
-                        "compute_utf8view_flag_op_scalar failed to cast literal value {} for operation '{}'",
-                        $RIGHT, stringify!($OP)
-                    )
-        };
+/// Converts a Duration array to Int64 days
+/// Handles different Duration time units (Second, Millisecond, Microsecond, Nanosecond)
+fn duration_to_days(array: &ArrayRef) -> Result<ArrayRef> {
+    use datafusion_common::cast::{
+        as_duration_microsecond_array, as_duration_millisecond_array,
+        as_duration_nanosecond_array, as_duration_second_array,
+    };
 
-        let flag = $FLAG.then_some("i");
-        let mut array =
-            paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
-        if $NOT {
-            array = not(&array).unwrap();
+    const SECONDS_PER_DAY: i64 = 86_400;
+    const MILLIS_PER_DAY: i64 = 86_400_000;
+    const MICROS_PER_DAY: i64 = 86_400_000_000;
+    const NANOS_PER_DAY: i64 = 86_400_000_000_000;
+
+    match array.data_type() {
+        DataType::Duration(TimeUnit::Second) => {
+            let duration_array = as_duration_second_array(array)?;
+            let result: Int64Array = duration_array
+                .iter()
+                .map(|v| v.map(|val| val / SECONDS_PER_DAY))
+                .collect();
+            Ok(Arc::new(result))
         }
-
-        Ok(Arc::new(array))
-    }};
+        DataType::Duration(TimeUnit::Millisecond) => {
+            let duration_array = as_duration_millisecond_array(array)?;
+            let result: Int64Array = duration_array
+                .iter()
+                .map(|v| v.map(|val| val / MILLIS_PER_DAY))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Duration(TimeUnit::Microsecond) => {
+            let duration_array = as_duration_microsecond_array(array)?;
+            let result: Int64Array = duration_array
+                .iter()
+                .map(|v| v.map(|val| val / MICROS_PER_DAY))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Duration(TimeUnit::Nanosecond) => {
+            let duration_array = as_duration_nanosecond_array(array)?;
+            let result: Int64Array = duration_array
+                .iter()
+                .map(|v| v.map(|val| val / NANOS_PER_DAY))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        other => internal_err!("duration_to_days expected Duration type, got: {}", other),
+    }
 }
 
 impl PhysicalExpr for BinaryExpr {
@@ -375,7 +289,44 @@ impl PhysicalExpr for BinaryExpr {
                 // as it takes into account cases where the selection contains null values.
                 let batch = filter_record_batch(batch, selection)?;
                 let right_ret = self.right.evaluate(&batch)?;
-                return pre_selection_scatter(selection, right_ret);
+
+                match &right_ret {
+                    ColumnarValue::Array(array) => {
+                        // When the array on the right is all true or all false, skip the scatter process
+                        let boolean_array = array.as_boolean();
+                        let true_count = boolean_array.true_count();
+                        let length = boolean_array.len();
+                        if true_count == length {
+                            return Ok(lhs);
+                        } else if true_count == 0 && boolean_array.null_count() == 0 {
+                            // If the right-hand array is returned at this point,the lengths will be inconsistent;
+                            // returning a scalar can avoid this issue
+                            return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(
+                                Some(false),
+                            )));
+                        }
+
+                        return pre_selection_scatter(selection, Some(boolean_array));
+                    }
+                    ColumnarValue::Scalar(scalar) => {
+                        if let ScalarValue::Boolean(v) = scalar {
+                            // When the scalar is true or false, skip the scatter process
+                            if let Some(v) = v {
+                                if *v {
+                                    return Ok(lhs);
+                                } else {
+                                    return Ok(right_ret);
+                                }
+                            } else {
+                                return pre_selection_scatter(selection, None);
+                            }
+                        } else {
+                            return internal_err!(
+                                "Expected boolean scalar value, found: {right_ret:?}"
+                            );
+                        }
+                    }
+                }
             }
         }
 
@@ -386,52 +337,49 @@ impl PhysicalExpr for BinaryExpr {
         let schema = batch.schema();
         let input_schema = schema.as_ref();
 
-        if left_data_type.is_nested() {
-            if right_data_type != left_data_type {
-                return internal_err!("type mismatch");
-            }
-            return apply_cmp_for_nested(self.op, &lhs, &rhs);
-        }
-
         match self.op {
             Operator::Plus if self.fail_on_overflow => return apply(&lhs, &rhs, add),
             Operator::Plus => return apply(&lhs, &rhs, add_wrapping),
+            // Special case: Date - Date returns Int64 (days difference)
+            // This aligns with PostgreSQL, DuckDB, and MySQL behavior
+            Operator::Minus if is_date_minus_date(&left_data_type, &right_data_type) => {
+                return apply_date_subtraction(&lhs, &rhs);
+            }
             Operator::Minus if self.fail_on_overflow => return apply(&lhs, &rhs, sub),
             Operator::Minus => return apply(&lhs, &rhs, sub_wrapping),
             Operator::Multiply if self.fail_on_overflow => return apply(&lhs, &rhs, mul),
             Operator::Multiply => return apply(&lhs, &rhs, mul_wrapping),
             Operator::Divide => return apply(&lhs, &rhs, div),
             Operator::Modulo => return apply(&lhs, &rhs, rem),
-            Operator::Eq => return apply_cmp(&lhs, &rhs, eq),
-            Operator::NotEq => return apply_cmp(&lhs, &rhs, neq),
-            Operator::Lt => return apply_cmp(&lhs, &rhs, lt),
-            Operator::Gt => return apply_cmp(&lhs, &rhs, gt),
-            Operator::LtEq => return apply_cmp(&lhs, &rhs, lt_eq),
-            Operator::GtEq => return apply_cmp(&lhs, &rhs, gt_eq),
-            Operator::IsDistinctFrom => return apply_cmp(&lhs, &rhs, distinct),
-            Operator::IsNotDistinctFrom => return apply_cmp(&lhs, &rhs, not_distinct),
-            Operator::LikeMatch => return apply_cmp(&lhs, &rhs, like),
-            Operator::ILikeMatch => return apply_cmp(&lhs, &rhs, ilike),
-            Operator::NotLikeMatch => return apply_cmp(&lhs, &rhs, nlike),
-            Operator::NotILikeMatch => return apply_cmp(&lhs, &rhs, nilike),
+
+            Operator::Eq
+            | Operator::NotEq
+            | Operator::Lt
+            | Operator::Gt
+            | Operator::LtEq
+            | Operator::GtEq
+            | Operator::IsDistinctFrom
+            | Operator::IsNotDistinctFrom
+            | Operator::LikeMatch
+            | Operator::ILikeMatch
+            | Operator::NotLikeMatch
+            | Operator::NotILikeMatch => {
+                return apply_cmp(self.op, &lhs, &rhs);
+            }
             _ => {}
         }
 
         let result_type = self.data_type(input_schema)?;
 
         // If the left-hand side is an array and the right-hand side is a non-null scalar, try the optimized kernel.
-        if let (ColumnarValue::Array(array), ColumnarValue::Scalar(ref scalar)) =
-            (&lhs, &rhs)
+        if let (ColumnarValue::Array(array), ColumnarValue::Scalar(scalar)) = (&lhs, &rhs)
+            && !scalar.is_null()
+            && let Some(result_array) =
+                self.evaluate_array_scalar(array, scalar.clone())?
         {
-            if !scalar.is_null() {
-                if let Some(result_array) =
-                    self.evaluate_array_scalar(array, scalar.clone())?
-                {
-                    let final_array = result_array
-                        .and_then(|a| to_result_type_array(&self.op, a, &result_type));
-                    return final_array.map(ColumnarValue::Array);
-                }
-            }
+            let final_array = result_array
+                .and_then(|a| to_result_type_array(&self.op, a, &result_type));
+            return final_array.map(ColumnarValue::Array);
         }
 
         // if both arrays or both literals - extract arrays and continue execution
@@ -475,33 +423,27 @@ impl PhysicalExpr for BinaryExpr {
         let right_interval = children[1];
 
         if self.op.eq(&Operator::And) {
-            if interval.eq(&Interval::CERTAINLY_TRUE) {
+            if interval.eq(&Interval::TRUE) {
                 // A certainly true logical conjunction can only derive from possibly
                 // true operands. Otherwise, we prove infeasibility.
-                Ok((!left_interval.eq(&Interval::CERTAINLY_FALSE)
-                    && !right_interval.eq(&Interval::CERTAINLY_FALSE))
-                .then(|| vec![Interval::CERTAINLY_TRUE, Interval::CERTAINLY_TRUE]))
-            } else if interval.eq(&Interval::CERTAINLY_FALSE) {
+                Ok((!left_interval.eq(&Interval::FALSE)
+                    && !right_interval.eq(&Interval::FALSE))
+                .then(|| vec![Interval::TRUE, Interval::TRUE]))
+            } else if interval.eq(&Interval::FALSE) {
                 // If the logical conjunction is certainly false, one of the
                 // operands must be false. However, it's not always possible to
                 // determine which operand is false, leading to different scenarios.
 
                 // If one operand is certainly true and the other one is uncertain,
                 // then the latter must be certainly false.
-                if left_interval.eq(&Interval::CERTAINLY_TRUE)
-                    && right_interval.eq(&Interval::UNCERTAIN)
+                if left_interval.eq(&Interval::TRUE)
+                    && right_interval.eq(&Interval::TRUE_OR_FALSE)
                 {
-                    Ok(Some(vec![
-                        Interval::CERTAINLY_TRUE,
-                        Interval::CERTAINLY_FALSE,
-                    ]))
-                } else if right_interval.eq(&Interval::CERTAINLY_TRUE)
-                    && left_interval.eq(&Interval::UNCERTAIN)
+                    Ok(Some(vec![Interval::TRUE, Interval::FALSE]))
+                } else if right_interval.eq(&Interval::TRUE)
+                    && left_interval.eq(&Interval::TRUE_OR_FALSE)
                 {
-                    Ok(Some(vec![
-                        Interval::CERTAINLY_FALSE,
-                        Interval::CERTAINLY_TRUE,
-                    ]))
+                    Ok(Some(vec![Interval::FALSE, Interval::TRUE]))
                 }
                 // If both children are uncertain, or if one is certainly false,
                 // we cannot conclusively refine their intervals. In this case,
@@ -515,33 +457,27 @@ impl PhysicalExpr for BinaryExpr {
                 Ok(Some(vec![]))
             }
         } else if self.op.eq(&Operator::Or) {
-            if interval.eq(&Interval::CERTAINLY_FALSE) {
+            if interval.eq(&Interval::FALSE) {
                 // A certainly false logical disjunction can only derive from certainly
                 // false operands. Otherwise, we prove infeasibility.
-                Ok((!left_interval.eq(&Interval::CERTAINLY_TRUE)
-                    && !right_interval.eq(&Interval::CERTAINLY_TRUE))
-                .then(|| vec![Interval::CERTAINLY_FALSE, Interval::CERTAINLY_FALSE]))
-            } else if interval.eq(&Interval::CERTAINLY_TRUE) {
+                Ok((!left_interval.eq(&Interval::TRUE)
+                    && !right_interval.eq(&Interval::TRUE))
+                .then(|| vec![Interval::FALSE, Interval::FALSE]))
+            } else if interval.eq(&Interval::TRUE) {
                 // If the logical disjunction is certainly true, one of the
                 // operands must be true. However, it's not always possible to
                 // determine which operand is true, leading to different scenarios.
 
                 // If one operand is certainly false and the other one is uncertain,
                 // then the latter must be certainly true.
-                if left_interval.eq(&Interval::CERTAINLY_FALSE)
-                    && right_interval.eq(&Interval::UNCERTAIN)
+                if left_interval.eq(&Interval::FALSE)
+                    && right_interval.eq(&Interval::TRUE_OR_FALSE)
                 {
-                    Ok(Some(vec![
-                        Interval::CERTAINLY_FALSE,
-                        Interval::CERTAINLY_TRUE,
-                    ]))
-                } else if right_interval.eq(&Interval::CERTAINLY_FALSE)
-                    && left_interval.eq(&Interval::UNCERTAIN)
+                    Ok(Some(vec![Interval::FALSE, Interval::TRUE]))
+                } else if right_interval.eq(&Interval::FALSE)
+                    && left_interval.eq(&Interval::TRUE_OR_FALSE)
                 {
-                    Ok(Some(vec![
-                        Interval::CERTAINLY_TRUE,
-                        Interval::CERTAINLY_FALSE,
-                    ]))
+                    Ok(Some(vec![Interval::TRUE, Interval::FALSE]))
                 }
                 // If both children are uncertain, or if one is certainly true,
                 // we cannot conclusively refine their intervals. In this case,
@@ -574,10 +510,10 @@ impl PhysicalExpr for BinaryExpr {
             // We might be able to construct the output statistics more accurately,
             // without falling back to an unknown distribution, if we are dealing
             // with Gaussian distributions and numerical operations.
-            if let (Gaussian(left), Gaussian(right)) = (left, right) {
-                if let Some(result) = combine_gaussians(&self.op, left, right)? {
-                    return Ok(Gaussian(result));
-                }
+            if let (Gaussian(left), Gaussian(right)) = (left, right)
+                && let Some(result) = combine_gaussians(&self.op, left, right)?
+            {
+                return Ok(Gaussian(result));
             }
         } else if self.op.is_logic_operator() {
             // If we are dealing with logical operators, we expect (and can only
@@ -694,8 +630,8 @@ fn to_result_type_array(
                     Ok(cast(&array, result_type)?)
                 } else {
                     internal_err!(
-                            "Incompatible Dictionary value type {value_type:?} with result type {result_type:?} of Binary operator {op:?}"
-                        )
+                        "Incompatible Dictionary value type {value_type} with result type {result_type} of Binary operator {op:?}"
+                    )
                 }
             }
             _ => Ok(array),
@@ -715,34 +651,10 @@ impl BinaryExpr {
     ) -> Result<Option<Result<ArrayRef>>> {
         use Operator::*;
         let scalar_result = match &self.op {
-            RegexMatch => binary_string_array_flag_op_scalar!(
-                array,
-                scalar,
-                regexp_is_match,
-                false,
-                false
-            ),
-            RegexIMatch => binary_string_array_flag_op_scalar!(
-                array,
-                scalar,
-                regexp_is_match,
-                false,
-                true
-            ),
-            RegexNotMatch => binary_string_array_flag_op_scalar!(
-                array,
-                scalar,
-                regexp_is_match,
-                true,
-                false
-            ),
-            RegexNotIMatch => binary_string_array_flag_op_scalar!(
-                array,
-                scalar,
-                regexp_is_match,
-                true,
-                true
-            ),
+            RegexMatch => regex_match_dyn_scalar(array, &scalar, false, false),
+            RegexIMatch => regex_match_dyn_scalar(array, &scalar, false, true),
+            RegexNotMatch => regex_match_dyn_scalar(array, &scalar, true, false),
+            RegexNotIMatch => regex_match_dyn_scalar(array, &scalar, true, true),
             BitwiseAnd => bitwise_and_dyn_scalar(array, scalar),
             BitwiseOr => bitwise_or_dyn_scalar(array, scalar),
             BitwiseXor => bitwise_xor_dyn_scalar(array, scalar),
@@ -791,27 +703,19 @@ impl BinaryExpr {
                     )
                 }
             }
-            RegexMatch => {
-                binary_string_array_flag_op!(left, right, regexp_is_match, false, false)
-            }
-            RegexIMatch => {
-                binary_string_array_flag_op!(left, right, regexp_is_match, false, true)
-            }
-            RegexNotMatch => {
-                binary_string_array_flag_op!(left, right, regexp_is_match, true, false)
-            }
-            RegexNotIMatch => {
-                binary_string_array_flag_op!(left, right, regexp_is_match, true, true)
-            }
+            RegexMatch => regex_match_dyn(&left, &right, false, false),
+            RegexIMatch => regex_match_dyn(&left, &right, false, true),
+            RegexNotMatch => regex_match_dyn(&left, &right, true, false),
+            RegexNotIMatch => regex_match_dyn(&left, &right, true, true),
             BitwiseAnd => bitwise_and_dyn(left, right),
             BitwiseOr => bitwise_or_dyn(left, right),
             BitwiseXor => bitwise_xor_dyn(left, right),
             BitwiseShiftRight => bitwise_shift_right_dyn(left, right),
             BitwiseShiftLeft => bitwise_shift_left_dyn(left, right),
-            StringConcat => concat_elements(left, right),
+            StringConcat => concat_elements(&left, &right),
             AtArrow | ArrowAt | Arrow | LongArrow | HashArrow | HashLongArrow | AtAt
             | HashMinus | AtQuestion | Question | QuestionAnd | QuestionPipe
-            | IntegerDivide => {
+            | IntegerDivide | Colon => {
                 not_impl_err!(
                     "Binary operator '{:?}' is not supported in the physical expr",
                     self.op
@@ -974,13 +878,8 @@ fn check_short_circuit<'a>(
 /// However, this is difficult to achieve under the immutable constraints of [`Arc`] and [`BooleanArray`].
 fn pre_selection_scatter(
     left_result: &BooleanArray,
-    right_result: ColumnarValue,
+    right_result: Option<&BooleanArray>,
 ) -> Result<ColumnarValue> {
-    let right_boolean_array = match &right_result {
-        ColumnarValue::Array(array) => array.as_boolean(),
-        ColumnarValue::Scalar(_) => return Ok(right_result),
-    };
-
     let result_len = left_result.len();
 
     let mut result_array_builder = BooleanArray::builder(result_len);
@@ -990,22 +889,39 @@ fn pre_selection_scatter(
 
     // keep track of how much is filled
     let mut last_end = 0;
-    SlicesIterator::new(left_result).for_each(|(start, end)| {
-        // the gap needs to be filled with false
-        if start > last_end {
-            result_array_builder.append_n(start - last_end, false);
+    // reduce if condition in for_each
+    match right_result {
+        Some(right_result) => {
+            SlicesIterator::new(left_result).for_each(|(start, end)| {
+                // the gap needs to be filled with false
+                if start > last_end {
+                    result_array_builder.append_n(start - last_end, false);
+                }
+
+                // copy values from right array for this slice
+                let len = end - start;
+                right_result
+                    .slice(right_array_pos, len)
+                    .iter()
+                    .for_each(|v| result_array_builder.append_option(v));
+
+                right_array_pos += len;
+                last_end = end;
+            });
         }
+        None => SlicesIterator::new(left_result).for_each(|(start, end)| {
+            // the gap needs to be filled with false
+            if start > last_end {
+                result_array_builder.append_n(start - last_end, false);
+            }
 
-        // copy values from right array for this slice
-        let len = end - start;
-        right_boolean_array
-            .slice(right_array_pos, len)
-            .iter()
-            .for_each(|v| result_array_builder.append_option(v));
+            // append nulls for this slice derictly
+            let len = end - start;
+            result_array_builder.append_nulls(len);
 
-        right_array_pos += len;
-        last_end = end;
-    });
+            last_end = end;
+        }),
+    }
 
     // Fill any remaining positions with false
     if last_end < result_len {
@@ -1016,7 +932,7 @@ fn pre_selection_scatter(
     Ok(ColumnarValue::Array(Arc::new(boolean_result)))
 }
 
-fn concat_elements(left: Arc<dyn Array>, right: Arc<dyn Array>) -> Result<ArrayRef> {
+fn concat_elements(left: &ArrayRef, right: &ArrayRef) -> Result<ArrayRef> {
     Ok(match left.data_type() {
         DataType::Utf8 => Arc::new(concat_elements_utf8(
             left.as_string::<i32>(),
@@ -1069,7 +985,7 @@ pub fn similar_to(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::expressions::{col, lit, try_cast, Column, Literal};
+    use crate::expressions::{Column, Literal, col, lit, try_cast};
     use datafusion_expr::lit as expr_lit;
 
     use datafusion_common::plan_datafusion_err;
@@ -1192,7 +1108,8 @@ mod tests {
             ]);
             let a = $A_ARRAY::from($A_VEC);
             let b = $B_ARRAY::from($B_VEC);
-            let (lhs, rhs) = BinaryTypeCoercer::new(&$A_TYPE, &$OP, &$B_TYPE).get_input_types()?;
+            let (lhs, rhs) =
+                BinaryTypeCoercer::new(&$A_TYPE, &$OP, &$B_TYPE).get_input_types()?;
 
             let left = try_cast(col("a", &schema)?, &schema, lhs)?;
             let right = try_cast(col("b", &schema)?, &schema, rhs)?;
@@ -1208,7 +1125,10 @@ mod tests {
             assert_eq!(expression.data_type(&schema)?, $C_TYPE);
 
             // compute
-            let result = expression.evaluate(&batch)?.into_array(batch.num_rows()).expect("Failed to convert to array");
+            let result = expression
+                .evaluate(&batch)?
+                .into_array(batch.num_rows())
+                .expect("Failed to convert to array");
 
             // verify that the array's data_type is correct
             assert_eq!(*result.data_type(), $C_TYPE);
@@ -1222,8 +1142,7 @@ mod tests {
             for (i, x) in $VEC.iter().enumerate() {
                 let v = result.value(i);
                 assert_eq!(
-                    v,
-                    *x,
+                    v, *x,
                     "Unexpected output at position {i}:\n\nActual:\n{v}\n\nExpected:\n{x}"
                 );
             }
@@ -4600,11 +4519,13 @@ mod tests {
 
         // evaluate expression
         let result = expr.evaluate(&batch);
-        assert!(result
-            .err()
-            .unwrap()
-            .to_string()
-            .contains("Overflow happened on: 2147483647 + 1"));
+        assert!(
+            result
+                .err()
+                .unwrap()
+                .to_string()
+                .contains("Overflow happened on: 2147483647 + 1")
+        );
         Ok(())
     }
 
@@ -4629,11 +4550,13 @@ mod tests {
 
         // evaluate expression
         let result = expr.evaluate(&batch);
-        assert!(result
-            .err()
-            .unwrap()
-            .to_string()
-            .contains("Overflow happened on: -2147483648 - 1"));
+        assert!(
+            result
+                .err()
+                .unwrap()
+                .to_string()
+                .contains("Overflow happened on: -2147483648 - 1")
+        );
         Ok(())
     }
 
@@ -4658,11 +4581,13 @@ mod tests {
 
         // evaluate expression
         let result = expr.evaluate(&batch);
-        assert!(result
-            .err()
-            .unwrap()
-            .to_string()
-            .contains("Overflow happened on: 2147483647 * 2"));
+        assert!(
+            result
+                .err()
+                .unwrap()
+                .to_string()
+                .contains("Overflow happened on: 2147483647 * 2")
+        );
         Ok(())
     }
 
@@ -4971,9 +4896,10 @@ mod tests {
             let child_refs = child_view.iter().collect::<Vec<_>>();
             for op in &ops {
                 let expr = binary_expr(Arc::clone(&a), *op, Arc::clone(&b), schema)?;
-                assert!(expr
-                    .propagate_statistics(&parent, child_refs.as_slice())?
-                    .is_some());
+                assert!(
+                    expr.propagate_statistics(&parent, child_refs.as_slice())?
+                        .is_some()
+                );
             }
         }
 
@@ -5211,7 +5137,6 @@ mod tests {
     /// 4. Test single true at first position
     /// 5. Test single true at last position
     /// 6. Test nulls in right array
-    /// 7. Test scalar right handling
     #[test]
     fn test_pre_selection_scatter() {
         fn create_bool_array(bools: Vec<bool>) -> BooleanArray {
@@ -5222,11 +5147,9 @@ mod tests {
             // Left: [T, F, T, F, T]
             // Right: [F, T, F] (values for 3 true positions)
             let left = create_bool_array(vec![true, false, true, false, true]);
-            let right = ColumnarValue::Array(Arc::new(create_bool_array(vec![
-                false, true, false,
-            ])));
+            let right = create_bool_array(vec![false, true, false]);
 
-            let result = pre_selection_scatter(&left, right).unwrap();
+            let result = pre_selection_scatter(&left, Some(&right)).unwrap();
             let result_arr = result.into_array(left.len()).unwrap();
 
             let expected = create_bool_array(vec![false, false, true, false, false]);
@@ -5238,11 +5161,9 @@ mod tests {
             // Right: [T, F, F, T, F]
             let left =
                 create_bool_array(vec![false, true, true, false, true, true, true]);
-            let right = ColumnarValue::Array(Arc::new(create_bool_array(vec![
-                true, false, false, true, false,
-            ])));
+            let right = create_bool_array(vec![true, false, false, true, false]);
 
-            let result = pre_selection_scatter(&left, right).unwrap();
+            let result = pre_selection_scatter(&left, Some(&right)).unwrap();
             let result_arr = result.into_array(left.len()).unwrap();
 
             let expected =
@@ -5254,9 +5175,9 @@ mod tests {
             // Left: [T, F, F]
             // Right: [F]
             let left = create_bool_array(vec![true, false, false]);
-            let right = ColumnarValue::Array(Arc::new(create_bool_array(vec![false])));
+            let right = create_bool_array(vec![false]);
 
-            let result = pre_selection_scatter(&left, right).unwrap();
+            let result = pre_selection_scatter(&left, Some(&right)).unwrap();
             let result_arr = result.into_array(left.len()).unwrap();
 
             let expected = create_bool_array(vec![false, false, false]);
@@ -5267,9 +5188,9 @@ mod tests {
             // Left: [F, F, T]
             // Right: [F]
             let left = create_bool_array(vec![false, false, true]);
-            let right = ColumnarValue::Array(Arc::new(create_bool_array(vec![false])));
+            let right = create_bool_array(vec![false]);
 
-            let result = pre_selection_scatter(&left, right).unwrap();
+            let result = pre_selection_scatter(&left, Some(&right)).unwrap();
             let result_arr = result.into_array(left.len()).unwrap();
 
             let expected = create_bool_array(vec![false, false, false]);
@@ -5280,10 +5201,9 @@ mod tests {
             // Left: [F, T, F, T]
             // Right: [None, Some(false)] (with null at first position)
             let left = create_bool_array(vec![false, true, false, true]);
-            let right_arr = BooleanArray::from(vec![None, Some(false)]);
-            let right = ColumnarValue::Array(Arc::new(right_arr));
+            let right = BooleanArray::from(vec![None, Some(false)]);
 
-            let result = pre_selection_scatter(&left, right).unwrap();
+            let result = pre_selection_scatter(&left, Some(&right)).unwrap();
             let result_arr = result.into_array(left.len()).unwrap();
 
             let expected = BooleanArray::from(vec![
@@ -5294,16 +5214,30 @@ mod tests {
             ]);
             assert_eq!(&expected, result_arr.as_boolean());
         }
-        // Test scalar right handling
-        {
-            // Left: [T, F, T]
-            // Right: Scalar true
-            let left = create_bool_array(vec![true, false, true]);
-            let right = ColumnarValue::Scalar(ScalarValue::Boolean(Some(true)));
+    }
 
-            let result = pre_selection_scatter(&left, right).unwrap();
-            assert!(matches!(result, ColumnarValue::Scalar(_)));
-        }
+    #[test]
+    fn test_and_true_preselection_returns_lhs() {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("c", DataType::Boolean, false)]));
+        let c_array = Arc::new(BooleanArray::from(vec![false, true, false, false, false]))
+            as ArrayRef;
+        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::clone(&c_array)])
+            .unwrap();
+
+        let expr = logical2physical(&logical_col("c").and(expr_lit(true)), &schema);
+
+        let result = expr.evaluate(&batch).unwrap();
+        let ColumnarValue::Array(result_arr) = result else {
+            panic!("Expected ColumnarValue::Array");
+        };
+
+        let expected: Vec<_> = c_array.as_boolean().iter().collect();
+        let actual: Vec<_> = result_arr.as_boolean().iter().collect();
+        assert_eq!(
+            expected, actual,
+            "AND with TRUE must equal LHS even with PreSelection"
+        );
     }
 
     #[test]
@@ -5399,4 +5333,65 @@ mod tests {
             Interval::make(Some(false), Some(false)).unwrap()
         );
     }
+
+    #[test]
+    fn test_evaluate_nested_type() {
+        let batch_schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "a",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                true,
+            ),
+            Field::new(
+                "b",
+                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
+                true,
+            ),
+        ]));
+
+        let mut list_builder_a = ListBuilder::new(Int32Builder::new());
+
+        list_builder_a.append_value([Some(1)]);
+        list_builder_a.append_value([Some(2)]);
+        list_builder_a.append_value([]);
+        list_builder_a.append_value([None]);
+
+        let list_array_a: ArrayRef = Arc::new(list_builder_a.finish());
+
+        let mut list_builder_b = ListBuilder::new(Int32Builder::new());
+
+        list_builder_b.append_value([Some(1)]);
+        list_builder_b.append_value([Some(2)]);
+        list_builder_b.append_value([]);
+        list_builder_b.append_value([None]);
+
+        let list_array_b: ArrayRef = Arc::new(list_builder_b.finish());
+
+        let batch =
+            RecordBatch::try_new(batch_schema, vec![list_array_a, list_array_b]).unwrap();
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "a",
+                DataType::List(Arc::new(Field::new("foo", DataType::Int32, true))),
+                true,
+            ),
+            Field::new(
+                "b",
+                DataType::List(Arc::new(Field::new("bar", DataType::Int32, true))),
+                true,
+            ),
+        ]));
+
+        let a = Arc::new(Column::new("a", 0)) as _;
+        let b = Arc::new(Column::new("b", 1)) as _;
+
+        let eq_expr =
+            binary_expr(Arc::clone(&a), Operator::Eq, Arc::clone(&b), &schema).unwrap();
+
+        let eq_result = eq_expr.evaluate(&batch).unwrap();
+        let expected =
+            BooleanArray::from_iter(vec![Some(true), Some(true), Some(true), Some(true)]);
+        assert_eq!(eq_result.into_array(4).unwrap().as_boolean(), &expected);
+    }
 }
diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs
index ae26f3e842418..39e3a6f16b5cf 100644
--- a/datafusion/physical-expr/src/expressions/binary/kernels.rs
+++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs
@@ -23,15 +23,17 @@ use arrow::compute::kernels::bitwise::{
     bitwise_shift_left_scalar, bitwise_shift_right, bitwise_shift_right_scalar,
     bitwise_xor, bitwise_xor_scalar,
 };
+use arrow::compute::kernels::boolean::not;
+use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar};
 use arrow::datatypes::DataType;
-use datafusion_common::plan_err;
+use arrow::error::ArrowError;
 use datafusion_common::{Result, ScalarValue};
+use datafusion_common::{internal_err, plan_err};
 
-use arrow::error::ArrowError;
 use std::sync::Arc;
 
 /// Downcasts $LEFT and $RIGHT to $ARRAY_TYPE and then calls $KERNEL($LEFT, $RIGHT)
-macro_rules! call_bitwise_kernel {
+macro_rules! call_kernel {
     ($LEFT:expr, $RIGHT:expr, $KERNEL:expr, $ARRAY_TYPE:ident) => {{
         let left = $LEFT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
         let right = $RIGHT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
@@ -42,36 +44,36 @@ macro_rules! call_bitwise_kernel {
 
 /// Creates a $FUNC(left: ArrayRef, right: ArrayRef) that
 /// downcasts left / right to the appropriate integral type and calls the kernel
-macro_rules! create_dyn_kernel {
+macro_rules! create_left_integral_dyn_kernel {
     ($FUNC:ident, $KERNEL:ident) => {
         pub(crate) fn $FUNC(left: ArrayRef, right: ArrayRef) -> Result<ArrayRef> {
             match &left.data_type() {
                 DataType::Int8 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, Int8Array)
+                    call_kernel!(left, right, $KERNEL, Int8Array)
                 }
                 DataType::Int16 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, Int16Array)
+                    call_kernel!(left, right, $KERNEL, Int16Array)
                 }
                 DataType::Int32 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, Int32Array)
+                    call_kernel!(left, right, $KERNEL, Int32Array)
                 }
                 DataType::Int64 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, Int64Array)
+                    call_kernel!(left, right, $KERNEL, Int64Array)
                 }
                 DataType::UInt8 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, UInt8Array)
+                    call_kernel!(left, right, $KERNEL, UInt8Array)
                 }
                 DataType::UInt16 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, UInt16Array)
+                    call_kernel!(left, right, $KERNEL, UInt16Array)
                 }
                 DataType::UInt32 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, UInt32Array)
+                    call_kernel!(left, right, $KERNEL, UInt32Array)
                 }
                 DataType::UInt64 => {
-                    call_bitwise_kernel!(left, right, $KERNEL, UInt64Array)
+                    call_kernel!(left, right, $KERNEL, UInt64Array)
                 }
                 other => plan_err!(
-                    "Data type {:?} not supported for binary operation '{}' on dyn arrays",
+                    "Data type {} not supported for binary operation '{}' on dyn arrays",
                     other,
                     stringify!($KERNEL)
                 ),
@@ -80,14 +82,14 @@ macro_rules! create_dyn_kernel {
     };
 }
 
-create_dyn_kernel!(bitwise_or_dyn, bitwise_or);
-create_dyn_kernel!(bitwise_xor_dyn, bitwise_xor);
-create_dyn_kernel!(bitwise_and_dyn, bitwise_and);
-create_dyn_kernel!(bitwise_shift_right_dyn, bitwise_shift_right);
-create_dyn_kernel!(bitwise_shift_left_dyn, bitwise_shift_left);
+create_left_integral_dyn_kernel!(bitwise_or_dyn, bitwise_or);
+create_left_integral_dyn_kernel!(bitwise_xor_dyn, bitwise_xor);
+create_left_integral_dyn_kernel!(bitwise_and_dyn, bitwise_and);
+create_left_integral_dyn_kernel!(bitwise_shift_right_dyn, bitwise_shift_right);
+create_left_integral_dyn_kernel!(bitwise_shift_left_dyn, bitwise_shift_left);
 
 /// Downcasts $LEFT as $ARRAY_TYPE and $RIGHT as TYPE and calls $KERNEL($LEFT, $RIGHT)
-macro_rules! call_bitwise_scalar_kernel {
+macro_rules! call_scalar_kernel {
     ($LEFT:expr, $RIGHT:expr, $KERNEL:ident, $ARRAY_TYPE:ident, $TYPE:ty) => {{
         let len = $LEFT.len();
         let array = $LEFT.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
@@ -104,20 +106,39 @@ macro_rules! call_bitwise_scalar_kernel {
 
 /// Creates a $FUNC(left: ArrayRef, right: ScalarValue) that
 /// downcasts left / right to the appropriate integral type and calls the kernel
-macro_rules! create_dyn_scalar_kernel {
+macro_rules! create_left_integral_dyn_scalar_kernel {
     ($FUNC:ident, $KERNEL:ident) => {
-        pub(crate) fn $FUNC(array: &dyn Array, scalar: ScalarValue) -> Option<Result<ArrayRef>> {
+        pub(crate) fn $FUNC(
+            array: &dyn Array,
+            scalar: ScalarValue,
+        ) -> Option<Result<ArrayRef>> {
             let result = match array.data_type() {
-                DataType::Int8 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int8Array, i8),
-                DataType::Int16 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int16Array, i16),
-                DataType::Int32 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int32Array, i32),
-                DataType::Int64 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, Int64Array, i64),
-                DataType::UInt8 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt8Array, u8),
-                DataType::UInt16 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt16Array, u16),
-                DataType::UInt32 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt32Array, u32),
-                DataType::UInt64 => call_bitwise_scalar_kernel!(array, scalar, $KERNEL, UInt64Array, u64),
+                DataType::Int8 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, Int8Array, i8)
+                }
+                DataType::Int16 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, Int16Array, i16)
+                }
+                DataType::Int32 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, Int32Array, i32)
+                }
+                DataType::Int64 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, Int64Array, i64)
+                }
+                DataType::UInt8 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, UInt8Array, u8)
+                }
+                DataType::UInt16 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, UInt16Array, u16)
+                }
+                DataType::UInt32 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, UInt32Array, u32)
+                }
+                DataType::UInt64 => {
+                    call_scalar_kernel!(array, scalar, $KERNEL, UInt64Array, u64)
+                }
                 other => plan_err!(
-                    "Data type {:?} not supported for binary operation '{}' on dyn arrays",
+                    "Data type {} not supported for binary operation '{}' on dyn arrays",
                     other,
                     stringify!($KERNEL)
                 ),
@@ -127,22 +148,36 @@ macro_rules! create_dyn_scalar_kernel {
     };
 }
 
-create_dyn_scalar_kernel!(bitwise_and_dyn_scalar, bitwise_and_scalar);
-create_dyn_scalar_kernel!(bitwise_or_dyn_scalar, bitwise_or_scalar);
-create_dyn_scalar_kernel!(bitwise_xor_dyn_scalar, bitwise_xor_scalar);
-create_dyn_scalar_kernel!(bitwise_shift_right_dyn_scalar, bitwise_shift_right_scalar);
-create_dyn_scalar_kernel!(bitwise_shift_left_dyn_scalar, bitwise_shift_left_scalar);
+create_left_integral_dyn_scalar_kernel!(bitwise_and_dyn_scalar, bitwise_and_scalar);
+create_left_integral_dyn_scalar_kernel!(bitwise_or_dyn_scalar, bitwise_or_scalar);
+create_left_integral_dyn_scalar_kernel!(bitwise_xor_dyn_scalar, bitwise_xor_scalar);
+create_left_integral_dyn_scalar_kernel!(
+    bitwise_shift_right_dyn_scalar,
+    bitwise_shift_right_scalar
+);
+create_left_integral_dyn_scalar_kernel!(
+    bitwise_shift_left_dyn_scalar,
+    bitwise_shift_left_scalar
+);
 
+/// Concatenates two `StringViewArray`s element-wise.  
+/// If either element is `Null`, the result element is also `Null`.
+///
+/// # Errors
+/// - Returns an error if the input arrays have different lengths.  
+/// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length.
 pub fn concat_elements_utf8view(
     left: &StringViewArray,
     right: &StringViewArray,
 ) -> std::result::Result<StringViewArray, ArrowError> {
-    let capacity = left
-        .data_buffers()
-        .iter()
-        .zip(right.data_buffers().iter())
-        .map(|(b1, b2)| b1.len() + b2.len())
-        .sum();
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(format!(
+            "Arrays must have the same length: {} != {}",
+            left.len(),
+            right.len()
+        )));
+    }
+    let capacity = left.len();
     let mut result = StringViewBuilder::with_capacity(capacity);
 
     // Avoid reallocations by writing to a reused buffer (note we
@@ -156,7 +191,7 @@ pub fn concat_elements_utf8view(
             buffer.clear();
             write!(&mut buffer, "{left}{right}")
                 .expect("writing into string buffer failed");
-            result.append_value(&buffer);
+            result.try_append_value(&buffer)?;
         } else {
             // at least one of the values is null, so the output is also null
             result.append_null()
@@ -164,3 +199,125 @@ pub fn concat_elements_utf8view(
     }
     Ok(result.finish())
 }
+
+/// Invoke a compute kernel on a pair of binary data arrays with flags
+macro_rules! regexp_is_match_flag {
+    ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
+        let ll = $LEFT
+            .as_any()
+            .downcast_ref::<$ARRAYTYPE>()
+            .expect("failed to downcast array");
+        let rr = $RIGHT
+            .as_any()
+            .downcast_ref::<$ARRAYTYPE>()
+            .expect("failed to downcast array");
+
+        let flag = if $FLAG {
+            Some($ARRAYTYPE::from(vec!["i"; ll.len()]))
+        } else {
+            None
+        };
+        let mut array = regexp_is_match(ll, rr, flag.as_ref())?;
+        if $NOT {
+            array = not(&array).unwrap();
+        }
+        Ok(Arc::new(array))
+    }};
+}
+
+pub(crate) fn regex_match_dyn(
+    left: &ArrayRef,
+    right: &ArrayRef,
+    not_match: bool,
+    flag: bool,
+) -> Result<ArrayRef> {
+    match left.data_type() {
+        DataType::Utf8 => {
+            regexp_is_match_flag!(left, right, StringArray, not_match, flag)
+        }
+        DataType::Utf8View => {
+            regexp_is_match_flag!(left, right, StringViewArray, not_match, flag)
+        }
+        DataType::LargeUtf8 => {
+            regexp_is_match_flag!(left, right, LargeStringArray, not_match, flag)
+        }
+        other => internal_err!(
+            "Data type {} not supported for regex_match_dyn on string array",
+            other
+        ),
+    }
+}
+
+/// Invoke a compute kernel on a data array and a scalar value with flag
+macro_rules! regexp_is_match_flag_scalar {
+    ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
+        let ll = $LEFT
+            .as_any()
+            .downcast_ref::<$ARRAYTYPE>()
+            .expect("failed to downcast array");
+
+        if let Some(Some(string_value)) = $RIGHT.try_as_str() {
+            let flag = $FLAG.then_some("i");
+            match regexp_is_match_scalar(ll, &string_value, flag) {
+                Ok(mut array) => {
+                    if $NOT {
+                        array = not(&array).unwrap();
+                    }
+                    Ok(Arc::new(array))
+                }
+                Err(e) => internal_err!("failed to call 'regex_match_dyn_scalar' {}", e),
+            }
+        } else {
+            internal_err!(
+                "failed to cast literal value {} for operation 'regex_match_dyn_scalar'",
+                $RIGHT
+            )
+        }
+    }};
+}
+
+pub(crate) fn regex_match_dyn_scalar(
+    left: &dyn Array,
+    right: &ScalarValue,
+    not_match: bool,
+    flag: bool,
+) -> Option<Result<ArrayRef>> {
+    let result: Result<ArrayRef> = match left.data_type() {
+        DataType::Utf8 => {
+            regexp_is_match_flag_scalar!(left, right, StringArray, not_match, flag)
+        }
+        DataType::Utf8View => {
+            regexp_is_match_flag_scalar!(left, right, StringViewArray, not_match, flag)
+        }
+        DataType::LargeUtf8 => {
+            regexp_is_match_flag_scalar!(left, right, LargeStringArray, not_match, flag)
+        }
+        DataType::Dictionary(_, _) => {
+            let values = left.as_any_dictionary().values();
+
+            match values.data_type() {
+                DataType::Utf8 => regexp_is_match_flag_scalar!(values, right, StringArray, not_match, flag),
+                DataType::Utf8View => regexp_is_match_flag_scalar!(values, right, StringViewArray, not_match, flag),
+                DataType::LargeUtf8 => regexp_is_match_flag_scalar!(values, right, LargeStringArray, not_match, flag),
+                other => internal_err!(
+                    "Data type {} not supported as a dictionary value type for operation 'regex_match_dyn_scalar' on string array",
+                    other
+                ),
+            }.map(
+                // downcast_dictionary_array duplicates code per possible key type, so we aim to do all prep work before
+                |evaluated_values| downcast_dictionary_array! {
+                    left => {
+                        let unpacked_dict = evaluated_values.take_iter(left.keys().iter().map(|opt| opt.map(|v| v as _))).collect::<BooleanArray>();
+                        Arc::new(unpacked_dict) as ArrayRef
+                    },
+                    _ => unreachable!(),
+                }
+            )
+        }
+        other => internal_err!(
+            "Data type {} not supported for operation 'regex_match_dyn_scalar' on string array",
+            other
+        ),
+    };
+    Some(result)
+}
diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs
index 1a74e78f1075f..f1d867dddf369 100644
--- a/datafusion/physical-expr/src/expressions/case.rs
+++ b/datafusion/physical-expr/src/expressions/case.rs
@@ -15,27 +15,38 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::expressions::try_cast;
-use crate::PhysicalExpr;
-use std::borrow::Cow;
-use std::hash::Hash;
-use std::{any::Any, sync::Arc};
+mod literal_lookup_table;
 
+use super::{Column, Literal};
+use crate::PhysicalExpr;
+use crate::expressions::{lit, try_cast};
 use arrow::array::*;
 use arrow::compute::kernels::zip::zip;
-use arrow::compute::{and, and_not, is_null, not, nullif, or, prep_null_mask_filter};
-use arrow::datatypes::{DataType, Schema};
+use arrow::compute::{
+    FilterBuilder, FilterPredicate, is_not_null, not, nullif, prep_null_mask_filter,
+};
+use arrow::datatypes::{DataType, Schema, UInt32Type, UnionMode};
+use arrow::error::ArrowError;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::{
-    exec_err, internal_datafusion_err, internal_err, DataFusionError, Result, ScalarValue,
+    DataFusionError, Result, ScalarValue, assert_or_internal_err, exec_err,
+    internal_datafusion_err, internal_err,
 };
 use datafusion_expr::ColumnarValue;
+use indexmap::{IndexMap, IndexSet};
+use std::borrow::Cow;
+use std::hash::Hash;
+use std::{any::Any, sync::Arc};
 
-use super::{Column, Literal};
+use crate::expressions::case::literal_lookup_table::LiteralLookupTable;
+use arrow::compute::kernels::merge::{MergeIndex, merge, merge_n};
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
 use datafusion_physical_expr_common::datum::compare_with_eq;
+use datafusion_physical_expr_common::utils::scatter;
 use itertools::Itertools;
+use std::fmt::{Debug, Formatter};
 
-type WhenThen = (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>);
+pub(super) type WhenThen = (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>);
 
 #[derive(Debug, Hash, PartialEq, Eq)]
 enum EvalMethod {
@@ -43,18 +54,18 @@ enum EvalMethod {
     ///      [WHEN ...]
     ///      [ELSE result]
     /// END
-    NoExpression,
+    NoExpression(ProjectedCaseBody),
     /// CASE expression
     ///     WHEN value THEN result
     ///     [WHEN ...]
     ///     [ELSE result]
     /// END
-    WithExpression,
+    WithExpression(ProjectedCaseBody),
     /// This is a specialization for a specific use case where we can take a fast path
     /// for expressions that are infallible and can be cheaply computed for the entire
     /// record batch rather than just for the rows where the predicate is true.
     ///
-    /// CASE WHEN condition THEN column [ELSE NULL] END
+    /// CASE WHEN condition THEN infallible_expression [ELSE NULL] END
     InfallibleExprOrNull,
     /// This is a specialization for a specific use case where we can take a fast path
     /// if there is just one when/then pair and both the `then` and `else` expressions
@@ -62,10 +73,165 @@ enum EvalMethod {
     /// CASE WHEN condition THEN literal ELSE literal END
     ScalarOrScalar,
     /// This is a specialization for a specific use case where we can take a fast path
-    /// if there is just one when/then pair and both the `then` and `else` are expressions
+    /// if there is just one when/then pair, the `then` is an expression, and `else` is either
+    /// an expression, literal NULL or absent.
     ///
-    /// CASE WHEN condition THEN expression ELSE expression END
-    ExpressionOrExpression,
+    /// In contrast to [`EvalMethod::InfallibleExprOrNull`], this specialization can handle fallible
+    /// `then` expressions.
+    ///
+    /// CASE WHEN condition THEN expression [ELSE expression] END
+    ExpressionOrExpression(ProjectedCaseBody),
+
+    /// This is a specialization for [`EvalMethod::WithExpression`] when the value and results are literals
+    ///
+    /// See [`LiteralLookupTable`] for more details
+    WithExprScalarLookupTable(LiteralLookupTable),
+}
+
+/// Implementing hash so we can use `derive` on [`EvalMethod`].
+///
+/// not implementing actual [`Hash`] as it is not dyn compatible so we cannot implement it for
+/// `dyn` [`literal_lookup_table::WhenLiteralIndexMap`].
+///
+/// So implementing empty hash is still valid as the data is derived from `PhysicalExpr` s which are already hashed
+impl Hash for LiteralLookupTable {
+    fn hash<H: std::hash::Hasher>(&self, _state: &mut H) {}
+}
+
+/// Implementing Equal so we can use `derive` on [`EvalMethod`].
+///
+/// not implementing actual [`PartialEq`] as it is not dyn compatible so we cannot implement it for
+/// `dyn` [`literal_lookup_table::WhenLiteralIndexMap`].
+///
+/// So we always return true as the data is derived from `PhysicalExpr` s which are already compared
+impl PartialEq for LiteralLookupTable {
+    fn eq(&self, _other: &Self) -> bool {
+        true
+    }
+}
+
+impl Eq for LiteralLookupTable {}
+
+/// The body of a CASE expression which consists of an optional base expression, the "when/then"
+/// branches and an optional "else" branch.
+#[derive(Debug, Hash, PartialEq, Eq)]
+struct CaseBody {
+    /// Optional base expression that can be compared to literal values in the "when" expressions
+    expr: Option<Arc<dyn PhysicalExpr>>,
+    /// One or more when/then expressions
+    when_then_expr: Vec<WhenThen>,
+    /// Optional "else" expression
+    else_expr: Option<Arc<dyn PhysicalExpr>>,
+}
+
+impl CaseBody {
+    /// Derives a [ProjectedCaseBody] from this [CaseBody].
+    fn project(&self) -> Result<ProjectedCaseBody> {
+        // Determine the set of columns that are used in all the expressions of the case body.
+        let mut used_column_indices = IndexSet::<usize>::new();
+        let mut collect_column_indices = |expr: &Arc<dyn PhysicalExpr>| {
+            expr.apply(|expr| {
+                if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                    used_column_indices.insert(column.index());
+                }
+                Ok(TreeNodeRecursion::Continue)
+            })
+            .expect("Closure cannot fail");
+        };
+
+        if let Some(e) = &self.expr {
+            collect_column_indices(e);
+        }
+        self.when_then_expr.iter().for_each(|(w, t)| {
+            collect_column_indices(w);
+            collect_column_indices(t);
+        });
+        if let Some(e) = &self.else_expr {
+            collect_column_indices(e);
+        }
+
+        // Construct a mapping from the original column index to the projected column index.
+        let column_index_map = used_column_indices
+            .iter()
+            .enumerate()
+            .map(|(projected, original)| (*original, projected))
+            .collect::<IndexMap<usize, usize>>();
+
+        // Construct the projected body by rewriting each expression from the original body
+        // using the column index mapping.
+        let project = |expr: &Arc<dyn PhysicalExpr>| -> Result<Arc<dyn PhysicalExpr>> {
+            Arc::clone(expr)
+                .transform_down(|e| {
+                    if let Some(column) = e.as_any().downcast_ref::<Column>() {
+                        let original = column.index();
+                        let projected = *column_index_map.get(&original).unwrap();
+                        if projected != original {
+                            return Ok(Transformed::yes(Arc::new(Column::new(
+                                column.name(),
+                                projected,
+                            ))));
+                        }
+                    }
+                    Ok(Transformed::no(e))
+                })
+                .map(|t| t.data)
+        };
+
+        let projected_body = CaseBody {
+            expr: self.expr.as_ref().map(project).transpose()?,
+            when_then_expr: self
+                .when_then_expr
+                .iter()
+                .map(|(e, t)| Ok((project(e)?, project(t)?)))
+                .collect::<Result<Vec<_>>>()?,
+            else_expr: self.else_expr.as_ref().map(project).transpose()?,
+        };
+
+        // Construct the projection vector
+        let projection = column_index_map
+            .iter()
+            .sorted_by_key(|(_, v)| **v)
+            .map(|(k, _)| *k)
+            .collect::<Vec<_>>();
+
+        Ok(ProjectedCaseBody {
+            projection,
+            body: projected_body,
+        })
+    }
+}
+
+/// A derived case body that can be used to evaluate a case expression after projecting
+/// record batches using a projection vector.
+///
+/// This is used to avoid filtering columns that are not used in the
+/// input `RecordBatch` when progressively evaluating a `CASE` expression's
+/// remainder batches. Filtering these columns is wasteful since for a record
+/// batch of `n` rows, filtering requires at worst a copy of `n - 1` values
+/// per array. If these filtered values will never be accessed, the time spent
+/// producing them is better avoided.
+///
+/// For example, if we are evaluating the following case expression that
+/// only references columns B and D:
+///
+/// ```sql
+/// SELECT CASE WHEN B > 10 THEN D ELSE NULL END FROM (VALUES (...)) T(A, B, C, D)
+/// ```
+///
+/// Of the 4 input columns `[A, B, C, D]`, the `CASE` expression only access `B` and `D`.
+/// Filtering `A` and `C` would be unnecessary and wasteful.
+///
+/// If we only retain columns `B` and `D` using `RecordBatch::project` and the projection vector
+/// `[1, 3]`, the indices of these two columns will change to `[0, 1]`. To evaluate the
+/// case expression, it will need to be rewritten from `CASE WHEN B@1 > 10 THEN D@3 ELSE NULL END`
+/// to `CASE WHEN B@0 > 10 THEN D@1 ELSE NULL END`.
+///
+/// The projection vector and the rewritten expression (which only differs from the original in
+/// column reference indices) are held in a `ProjectedCaseBody`.
+#[derive(Debug, Hash, PartialEq, Eq)]
+struct ProjectedCaseBody {
+    projection: Vec<usize>,
+    body: CaseBody,
 }
 
 /// The CASE expression is similar to a series of nested if/else and there are two forms that
@@ -87,26 +253,22 @@ enum EvalMethod {
 /// END
 #[derive(Debug, Hash, PartialEq, Eq)]
 pub struct CaseExpr {
-    /// Optional base expression that can be compared to literal values in the "when" expressions
-    expr: Option<Arc<dyn PhysicalExpr>>,
-    /// One or more when/then expressions
-    when_then_expr: Vec<WhenThen>,
-    /// Optional "else" expression
-    else_expr: Option<Arc<dyn PhysicalExpr>>,
+    /// The case expression body
+    body: CaseBody,
     /// Evaluation method to use
     eval_method: EvalMethod,
 }
 
 impl std::fmt::Display for CaseExpr {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
         write!(f, "CASE ")?;
-        if let Some(e) = &self.expr {
+        if let Some(e) = &self.body.expr {
             write!(f, "{e} ")?;
         }
-        for (w, t) in &self.when_then_expr {
+        for (w, t) in &self.body.when_then_expr {
             write!(f, "WHEN {w} THEN {t} ")?;
         }
-        if let Some(e) = &self.else_expr {
+        if let Some(e) = &self.body.else_expr {
             write!(f, "ELSE {e} ")?;
         }
         write!(f, "END")
@@ -122,6 +284,333 @@ fn is_cheap_and_infallible(expr: &Arc<dyn PhysicalExpr>) -> bool {
     expr.as_any().is::<Column>()
 }
 
+/// Creates a [FilterPredicate] from a boolean array.
+fn create_filter(predicate: &BooleanArray, optimize: bool) -> FilterPredicate {
+    let mut filter_builder = FilterBuilder::new(predicate);
+    if optimize {
+        // Always optimize the filter since we use them multiple times.
+        filter_builder = filter_builder.optimize();
+    }
+    filter_builder.build()
+}
+
+fn multiple_arrays(data_type: &DataType) -> bool {
+    match data_type {
+        DataType::Struct(fields) => {
+            fields.len() > 1
+                || fields.len() == 1 && multiple_arrays(fields[0].data_type())
+        }
+        DataType::Union(fields, UnionMode::Sparse) => !fields.is_empty(),
+        _ => false,
+    }
+}
+
+// This should be removed when https://github.com/apache/arrow-rs/pull/8693
+// is merged and becomes available.
+fn filter_record_batch(
+    record_batch: &RecordBatch,
+    filter: &FilterPredicate,
+) -> std::result::Result<RecordBatch, ArrowError> {
+    let filtered_columns = record_batch
+        .columns()
+        .iter()
+        .map(|a| filter_array(a, filter))
+        .collect::<std::result::Result<Vec<_>, _>>()?;
+    // SAFETY: since we start from a valid RecordBatch, there's no need to revalidate the schema
+    // since the set of columns has not changed.
+    // The input column arrays all had the same length (since they're coming from a valid RecordBatch)
+    // and the filtering them with the same filter will produces a new set of arrays with identical
+    // lengths.
+    unsafe {
+        Ok(RecordBatch::new_unchecked(
+            record_batch.schema(),
+            filtered_columns,
+            filter.count(),
+        ))
+    }
+}
+
+// This function exists purely to be able to use the same call style
+// for `filter_record_batch` and `filter_array` at the point of use.
+// When https://github.com/apache/arrow-rs/pull/8693 is available, replace
+// both with method calls on `FilterPredicate`.
+#[inline(always)]
+fn filter_array(
+    array: &dyn Array,
+    filter: &FilterPredicate,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    filter.filter(array)
+}
+
+/// An index into the partial results array that's more compact than `usize`.
+///
+/// `u32::MAX` is reserved as a special 'none' value. This is used instead of
+/// `Option` to keep the array of indices as compact as possible.
+#[derive(Copy, Clone, PartialEq, Eq)]
+struct PartialResultIndex {
+    index: u32,
+}
+
+const NONE_VALUE: u32 = u32::MAX;
+
+impl PartialResultIndex {
+    /// Returns the 'none' placeholder value.
+    fn none() -> Self {
+        Self { index: NONE_VALUE }
+    }
+
+    fn zero() -> Self {
+        Self { index: 0 }
+    }
+
+    /// Creates a new partial result index.
+    ///
+    /// If the provided value is greater than or equal to `u32::MAX`
+    /// an error will be returned.
+    fn try_new(index: usize) -> Result<Self> {
+        let Ok(index) = u32::try_from(index) else {
+            return internal_err!("Partial result index exceeds limit");
+        };
+
+        assert_or_internal_err!(
+            index != NONE_VALUE,
+            "Partial result index exceeds limit"
+        );
+
+        Ok(Self { index })
+    }
+
+    /// Determines if this index is the 'none' placeholder value or not.
+    fn is_none(&self) -> bool {
+        self.index == NONE_VALUE
+    }
+}
+
+impl MergeIndex for PartialResultIndex {
+    /// Returns `Some(index)` if this value is not the 'none' placeholder, `None` otherwise.
+    fn index(&self) -> Option<usize> {
+        if self.is_none() {
+            None
+        } else {
+            Some(self.index as usize)
+        }
+    }
+}
+
+impl Debug for PartialResultIndex {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        if self.is_none() {
+            write!(f, "null")
+        } else {
+            write!(f, "{}", self.index)
+        }
+    }
+}
+
+enum ResultState {
+    /// The final result is an array containing only null values.
+    Empty,
+    /// The final result needs to be computed by merging the data in `arrays`.
+    Partial {
+        // A `Vec` of partial results that should be merged.
+        // `partial_result_indices` contains indexes into this vec.
+        arrays: Vec<ArrayRef>,
+        // Indicates per result row from which array in `partial_results` a value should be taken.
+        indices: Vec<PartialResultIndex>,
+    },
+    /// A single branch matched all input rows. When creating the final result, no further merging
+    /// of partial results is necessary.
+    Complete(ColumnarValue),
+}
+
+/// A builder for constructing result arrays for CASE expressions.
+///
+/// Rather than building a monolithic array containing all results, it maintains a set of
+/// partial result arrays and a mapping that indicates for each row which partial array
+/// contains the result value for that row.
+///
+/// On finish(), the builder will merge all partial results into a single array if necessary.
+/// If all rows evaluated to the same array, that array can be returned directly without
+/// any merging overhead.
+struct ResultBuilder {
+    data_type: DataType,
+    /// The number of rows in the final result.
+    row_count: usize,
+    state: ResultState,
+}
+
+impl ResultBuilder {
+    /// Creates a new ResultBuilder that will produce arrays of the given data type.
+    ///
+    /// The `row_count` parameter indicates the number of rows in the final result.
+    fn new(data_type: &DataType, row_count: usize) -> Self {
+        Self {
+            data_type: data_type.clone(),
+            row_count,
+            state: ResultState::Empty,
+        }
+    }
+
+    /// Adds a result for one branch of the case expression.
+    ///
+    /// `row_indices` should be a [UInt32Array] containing [RecordBatch] relative row indices
+    /// for which `value` contains result values.
+    ///
+    /// If `value` is a scalar, the scalar value will be used as the value for each row in `row_indices`.
+    ///
+    /// If `value` is an array, the values from the array and the indices from `row_indices` will be
+    /// processed pairwise. The lengths of `value` and `row_indices` must match.
+    ///
+    /// The diagram below shows a situation where a when expression matched rows 1 and 4 of the
+    /// record batch. The then expression produced the value array `[A, D]`.
+    /// After adding this result, the result array will have been added to `partial arrays` and
+    /// `partial indices` will have been updated at indexes `1` and `4`.
+    ///
+    /// ```text
+    ///  ┌─────────┐     ┌─────────┐┌───────────┐                            ┌─────────┐┌───────────┐
+    ///  │    C    │     │ 0: None ││┌ 0 ──────┐│                            │ 0: None ││┌ 0 ──────┐│
+    ///  ├─────────┤     ├─────────┤││    A    ││                            ├─────────┤││    A    ││
+    ///  │    D    │     │ 1: None ││└─────────┘│                            │ 1:  2   ││└─────────┘│
+    ///  └─────────┘     ├─────────┤│┌ 1 ──────┐│   add_branch_result(       ├─────────┤│┌ 1 ──────┐│
+    ///   matching       │ 2:  0   │││    B    ││     row indices,           │ 2:  0   │││    B    ││
+    /// 'then' values    ├─────────┤│└─────────┘│     value                  ├─────────┤│└─────────┘│
+    ///                  │ 3: None ││           │   )                        │ 3: None ││┌ 2 ──────┐│
+    ///  ┌─────────┐     ├─────────┤│           │ ─────────────────────────▶ ├─────────┤││    C    ││
+    ///  │    1    │     │ 4: None ││           │                            │ 4:  2   ││├─────────┤│
+    ///  ├─────────┤     ├─────────┤│           │                            ├─────────┤││    D    ││
+    ///  │    4    │     │ 5:  1   ││           │                            │ 5:  1   ││└─────────┘│
+    ///  └─────────┘     └─────────┘└───────────┘                            └─────────┘└───────────┘
+    /// row indices        partial     partial                                 partial     partial
+    ///                    indices     arrays                                  indices     arrays
+    /// ```
+    fn add_branch_result(
+        &mut self,
+        row_indices: &ArrayRef,
+        value: ColumnarValue,
+    ) -> Result<()> {
+        match value {
+            ColumnarValue::Array(a) => {
+                if a.len() != row_indices.len() {
+                    internal_err!("Array length must match row indices length")
+                } else if row_indices.len() == self.row_count {
+                    self.set_complete_result(ColumnarValue::Array(a))
+                } else {
+                    self.add_partial_result(row_indices, a)
+                }
+            }
+            ColumnarValue::Scalar(s) => {
+                if row_indices.len() == self.row_count {
+                    self.set_complete_result(ColumnarValue::Scalar(s))
+                } else {
+                    self.add_partial_result(
+                        row_indices,
+                        s.to_array_of_size(row_indices.len())?,
+                    )
+                }
+            }
+        }
+    }
+
+    /// Adds a partial result array.
+    ///
+    /// This method adds the given array data as a partial result and updates the index mapping
+    /// to indicate that the specified rows should take their values from this array.
+    /// The partial results will be merged into a single array when finish() is called.
+    fn add_partial_result(
+        &mut self,
+        row_indices: &ArrayRef,
+        row_values: ArrayRef,
+    ) -> Result<()> {
+        assert_or_internal_err!(
+            row_indices.null_count() == 0,
+            "Row indices must not contain nulls"
+        );
+
+        match &mut self.state {
+            ResultState::Empty => {
+                let array_index = PartialResultIndex::zero();
+                let mut indices = vec![PartialResultIndex::none(); self.row_count];
+                for row_ix in row_indices.as_primitive::<UInt32Type>().values().iter() {
+                    indices[*row_ix as usize] = array_index;
+                }
+
+                self.state = ResultState::Partial {
+                    arrays: vec![row_values],
+                    indices,
+                };
+
+                Ok(())
+            }
+            ResultState::Partial { arrays, indices } => {
+                let array_index = PartialResultIndex::try_new(arrays.len())?;
+
+                arrays.push(row_values);
+
+                for row_ix in row_indices.as_primitive::<UInt32Type>().values().iter() {
+                    // This is check is only active for debug config because the callers of this method,
+                    // `case_when_with_expr` and `case_when_no_expr`, already ensure that
+                    // they only calculate a value for each row at most once.
+                    #[cfg(debug_assertions)]
+                    assert_or_internal_err!(
+                        indices[*row_ix as usize].is_none(),
+                        "Duplicate value for row {}",
+                        *row_ix
+                    );
+
+                    indices[*row_ix as usize] = array_index;
+                }
+                Ok(())
+            }
+            ResultState::Complete(_) => internal_err!(
+                "Cannot add a partial result when complete result is already set"
+            ),
+        }
+    }
+
+    /// Sets a result that applies to all rows.
+    ///
+    /// This is an optimization for cases where all rows evaluate to the same result.
+    /// When a complete result is set, the builder will return it directly from finish()
+    /// without any merging overhead.
+    fn set_complete_result(&mut self, value: ColumnarValue) -> Result<()> {
+        match &self.state {
+            ResultState::Empty => {
+                self.state = ResultState::Complete(value);
+                Ok(())
+            }
+            ResultState::Partial { .. } => {
+                internal_err!(
+                    "Cannot set a complete result when there are already partial results"
+                )
+            }
+            ResultState::Complete(_) => internal_err!("Complete result already set"),
+        }
+    }
+
+    /// Finishes building the result and returns the final array.
+    fn finish(self) -> Result<ColumnarValue> {
+        match self.state {
+            ResultState::Empty => {
+                // No complete result and no partial results.
+                // This can happen for case expressions with no else branch where no rows
+                // matched.
+                Ok(ColumnarValue::Scalar(ScalarValue::try_new_null(
+                    &self.data_type,
+                )?))
+            }
+            ResultState::Partial { arrays, indices } => {
+                // Merge partial results into a single array.
+                let array_refs = arrays.iter().map(|a| a.as_ref()).collect::<Vec<_>>();
+                Ok(ColumnarValue::Array(merge_n(&array_refs, &indices)?))
+            }
+            ResultState::Complete(v) => {
+                // If we have a complete result, we can just return it.
+                Ok(v)
+            }
+        }
+    }
+}
+
 impl CaseExpr {
     /// Create a new CASE WHEN expression
     pub fn try_new(
@@ -140,198 +629,425 @@ impl CaseExpr {
         };
 
         if when_then_expr.is_empty() {
-            exec_err!("There must be at least one WHEN clause")
-        } else {
-            let eval_method = if expr.is_some() {
-                EvalMethod::WithExpression
-            } else if when_then_expr.len() == 1
-                && is_cheap_and_infallible(&(when_then_expr[0].1))
-                && else_expr.is_none()
+            return exec_err!("There must be at least one WHEN clause");
+        }
+
+        let body = CaseBody {
+            expr,
+            when_then_expr,
+            else_expr,
+        };
+
+        let eval_method = Self::find_best_eval_method(&body)?;
+
+        Ok(Self { body, eval_method })
+    }
+
+    fn find_best_eval_method(body: &CaseBody) -> Result<EvalMethod> {
+        if body.expr.is_some() {
+            if let Some(mapping) = LiteralLookupTable::maybe_new(body) {
+                return Ok(EvalMethod::WithExprScalarLookupTable(mapping));
+            }
+
+            return Ok(EvalMethod::WithExpression(body.project()?));
+        }
+
+        Ok(
+            if body.when_then_expr.len() == 1
+                && is_cheap_and_infallible(&(body.when_then_expr[0].1))
+                && body.else_expr.is_none()
             {
                 EvalMethod::InfallibleExprOrNull
-            } else if when_then_expr.len() == 1
-                && when_then_expr[0].1.as_any().is::<Literal>()
-                && else_expr.is_some()
-                && else_expr.as_ref().unwrap().as_any().is::<Literal>()
+            } else if body.when_then_expr.len() == 1
+                && body.when_then_expr[0].1.as_any().is::<Literal>()
+                && body.else_expr.is_some()
+                && body.else_expr.as_ref().unwrap().as_any().is::<Literal>()
             {
                 EvalMethod::ScalarOrScalar
-            } else if when_then_expr.len() == 1
-                && is_cheap_and_infallible(&(when_then_expr[0].1))
-                && else_expr.as_ref().is_some_and(is_cheap_and_infallible)
-            {
-                EvalMethod::ExpressionOrExpression
+            } else if body.when_then_expr.len() == 1 {
+                EvalMethod::ExpressionOrExpression(body.project()?)
             } else {
-                EvalMethod::NoExpression
-            };
-
-            Ok(Self {
-                expr,
-                when_then_expr,
-                else_expr,
-                eval_method,
-            })
-        }
+                EvalMethod::NoExpression(body.project()?)
+            },
+        )
     }
 
     /// Optional base expression that can be compared to literal values in the "when" expressions
     pub fn expr(&self) -> Option<&Arc<dyn PhysicalExpr>> {
-        self.expr.as_ref()
+        self.body.expr.as_ref()
     }
 
     /// One or more when/then expressions
     pub fn when_then_expr(&self) -> &[WhenThen] {
-        &self.when_then_expr
+        &self.body.when_then_expr
     }
 
     /// Optional "else" expression
     pub fn else_expr(&self) -> Option<&Arc<dyn PhysicalExpr>> {
-        self.else_expr.as_ref()
+        self.body.else_expr.as_ref()
     }
 }
 
-impl CaseExpr {
-    /// This function evaluates the form of CASE that matches an expression to fixed values.
-    ///
-    /// CASE expression
-    ///     WHEN value THEN result
-    ///     [WHEN ...]
-    ///     [ELSE result]
-    /// END
-    fn case_when_with_expr(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        let return_type = self.data_type(&batch.schema())?;
-        let expr = self.expr.as_ref().unwrap();
-        let base_value = expr.evaluate(batch)?;
-        let base_value = base_value.into_array(batch.num_rows())?;
-        let base_nulls = is_null(base_value.as_ref())?;
-
-        // start with nulls as default output
-        let mut current_value = new_null_array(&return_type, batch.num_rows());
-        // We only consider non-null values while comparing with whens
-        let mut remainder = not(&base_nulls)?;
+impl CaseBody {
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        // since all then results have the same data type, we can choose any one as the
+        // return data type except for the null.
+        let mut data_type = DataType::Null;
         for i in 0..self.when_then_expr.len() {
-            let when_value = self.when_then_expr[i]
-                .0
-                .evaluate_selection(batch, &remainder)?;
-            let when_value = when_value.into_array(batch.num_rows())?;
-            // build boolean array representing which rows match the "when" value
-            let when_match = compare_with_eq(
-                &when_value,
-                &base_value,
-                // The types of case and when expressions will be coerced to match.
-                // We only need to check if the base_value is nested.
-                base_value.data_type().is_nested(),
-            )?;
-            // Treat nulls as false
-            let when_match = match when_match.null_count() {
-                0 => Cow::Borrowed(&when_match),
-                _ => Cow::Owned(prep_null_mask_filter(&when_match)),
-            };
-            // Make sure we only consider rows that have not been matched yet
-            let when_match = and(&when_match, &remainder)?;
-
-            // When no rows available for when clause, skip then clause
-            if when_match.true_count() == 0 {
-                continue;
+            data_type = self.when_then_expr[i].1.data_type(input_schema)?;
+            if !data_type.equals_datatype(&DataType::Null) {
+                break;
             }
+        }
+        // if all then results are null, we use data type of else expr instead if possible.
+        if data_type.equals_datatype(&DataType::Null)
+            && let Some(e) = &self.else_expr
+        {
+            data_type = e.data_type(input_schema)?;
+        }
 
-            let then_value = self.when_then_expr[i]
-                .1
-                .evaluate_selection(batch, &when_match)?;
+        Ok(data_type)
+    }
 
-            current_value = match then_value {
-                ColumnarValue::Scalar(ScalarValue::Null) => {
-                    nullif(current_value.as_ref(), &when_match)?
+    /// See [CaseExpr::case_when_with_expr].
+    fn case_when_with_expr(
+        &self,
+        batch: &RecordBatch,
+        return_type: &DataType,
+    ) -> Result<ColumnarValue> {
+        let mut result_builder = ResultBuilder::new(return_type, batch.num_rows());
+
+        // `remainder_rows` contains the indices of the rows that need to be evaluated
+        let mut remainder_rows: ArrayRef =
+            Arc::new(UInt32Array::from_iter_values(0..batch.num_rows() as u32));
+        // `remainder_batch` contains the rows themselves that need to be evaluated
+        let mut remainder_batch = Cow::Borrowed(batch);
+
+        // evaluate the base expression
+        let mut base_values = self
+            .expr
+            .as_ref()
+            .unwrap()
+            .evaluate(batch)?
+            .into_array(batch.num_rows())?;
+
+        // Fill in a result value already for rows where the base expression value is null
+        // Since each when expression is tested against the base expression using the equality
+        // operator, null base values can never match any when expression. `x = NULL` is falsy,
+        // for all possible values of `x`.
+        let base_null_count = base_values.logical_null_count();
+        if base_null_count > 0 {
+            // Use `is_not_null` since this is a cheap clone of the null buffer from 'base_value'.
+            // We already checked there are nulls, so we can be sure a new buffer will not be
+            // created.
+            let base_not_nulls = is_not_null(base_values.as_ref())?;
+            let base_all_null = base_null_count == remainder_batch.num_rows();
+
+            // If there is an else expression, use that as the default value for the null rows
+            // Otherwise the default `null` value from the result builder will be used.
+            if let Some(e) = &self.else_expr {
+                let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?;
+
+                if base_all_null {
+                    // All base values were null, so no need to filter
+                    let nulls_value = expr.evaluate(&remainder_batch)?;
+                    result_builder.add_branch_result(&remainder_rows, nulls_value)?;
+                } else {
+                    // Filter out the null rows and evaluate the else expression for those
+                    let nulls_filter = create_filter(&not(&base_not_nulls)?, true);
+                    let nulls_batch =
+                        filter_record_batch(&remainder_batch, &nulls_filter)?;
+                    let nulls_rows = filter_array(&remainder_rows, &nulls_filter)?;
+                    let nulls_value = expr.evaluate(&nulls_batch)?;
+                    result_builder.add_branch_result(&nulls_rows, nulls_value)?;
                 }
-                ColumnarValue::Scalar(then_value) => {
-                    zip(&when_match, &then_value.to_scalar()?, &current_value)?
+            }
+
+            // All base values are null, so we can return early
+            if base_all_null {
+                return result_builder.finish();
+            }
+
+            // Remove the null rows from the remainder batch
+            let not_null_filter = create_filter(&base_not_nulls, true);
+            remainder_batch =
+                Cow::Owned(filter_record_batch(&remainder_batch, &not_null_filter)?);
+            remainder_rows = filter_array(&remainder_rows, &not_null_filter)?;
+            base_values = filter_array(&base_values, &not_null_filter)?;
+        }
+
+        // The types of case and when expressions will be coerced to match.
+        // We only need to check if the base_value is nested.
+        let base_value_is_nested = base_values.data_type().is_nested();
+
+        for i in 0..self.when_then_expr.len() {
+            // Evaluate the 'when' predicate for the remainder batch
+            // This results in a boolean array with the same length as the remaining number of rows
+            let when_expr = &self.when_then_expr[i].0;
+            let when_value = match when_expr.evaluate(&remainder_batch)? {
+                ColumnarValue::Array(a) => {
+                    compare_with_eq(&a, &base_values, base_value_is_nested)
                 }
-                ColumnarValue::Array(then_value) => {
-                    zip(&when_match, &then_value, &current_value)?
+                ColumnarValue::Scalar(s) => {
+                    compare_with_eq(&s.to_scalar()?, &base_values, base_value_is_nested)
                 }
-            };
+            }?;
+
+            // `true_count` ignores `true` values where the validity bit is not set, so there's
+            // no need to call `prep_null_mask_filter`.
+            let when_true_count = when_value.true_count();
+
+            // If the 'when' predicate did not match any rows, continue to the next branch immediately
+            if when_true_count == 0 {
+                continue;
+            }
+
+            // If the 'when' predicate matched all remaining rows, there is no need to filter
+            if when_true_count == remainder_batch.num_rows() {
+                let then_expression = &self.when_then_expr[i].1;
+                let then_value = then_expression.evaluate(&remainder_batch)?;
+                result_builder.add_branch_result(&remainder_rows, then_value)?;
+                return result_builder.finish();
+            }
+
+            // Filter the remainder batch based on the 'when' value
+            // This results in a batch containing only the rows that need to be evaluated
+            // for the current branch
+            // Still no need to call `prep_null_mask_filter` since `create_filter` will already do
+            // this unconditionally.
+            let then_filter = create_filter(&when_value, true);
+            let then_batch = filter_record_batch(&remainder_batch, &then_filter)?;
+            let then_rows = filter_array(&remainder_rows, &then_filter)?;
+
+            let then_expression = &self.when_then_expr[i].1;
+            let then_value = then_expression.evaluate(&then_batch)?;
+            result_builder.add_branch_result(&then_rows, then_value)?;
+
+            // If this is the last 'when' branch and there is no 'else' expression, there's no
+            // point in calculating the remaining rows.
+            if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 {
+                return result_builder.finish();
+            }
 
-            remainder = and_not(&remainder, &when_match)?;
+            // Prepare the next when branch (or the else branch)
+            let next_selection = match when_value.null_count() {
+                0 => not(&when_value),
+                _ => {
+                    // `prep_null_mask_filter` is required to ensure the not operation treats nulls
+                    // as false
+                    not(&prep_null_mask_filter(&when_value))
+                }
+            }?;
+            let next_filter = create_filter(&next_selection, true);
+            remainder_batch =
+                Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?);
+            remainder_rows = filter_array(&remainder_rows, &next_filter)?;
+            base_values = filter_array(&base_values, &next_filter)?;
         }
 
-        if let Some(e) = self.else_expr() {
+        // If we reached this point, some rows were left unmatched.
+        // Check if those need to be evaluated using the 'else' expression.
+        if let Some(e) = &self.else_expr {
             // keep `else_expr`'s data type and return type consistent
             let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?;
-            // null and unmatched tuples should be assigned else value
-            remainder = or(&base_nulls, &remainder)?;
-            let else_ = expr
-                .evaluate_selection(batch, &remainder)?
-                .into_array(batch.num_rows())?;
-            current_value = zip(&remainder, &else_, &current_value)?;
+            let else_value = expr.evaluate(&remainder_batch)?;
+            result_builder.add_branch_result(&remainder_rows, else_value)?;
         }
 
-        Ok(ColumnarValue::Array(current_value))
+        result_builder.finish()
     }
 
-    /// This function evaluates the form of CASE where each WHEN expression is a boolean
-    /// expression.
-    ///
-    /// CASE WHEN condition THEN result
-    ///      [WHEN ...]
-    ///      [ELSE result]
-    /// END
-    fn case_when_no_expr(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        let return_type = self.data_type(&batch.schema())?;
+    /// See [CaseExpr::case_when_no_expr].
+    fn case_when_no_expr(
+        &self,
+        batch: &RecordBatch,
+        return_type: &DataType,
+    ) -> Result<ColumnarValue> {
+        let mut result_builder = ResultBuilder::new(return_type, batch.num_rows());
+
+        // `remainder_rows` contains the indices of the rows that need to be evaluated
+        let mut remainder_rows: ArrayRef =
+            Arc::new(UInt32Array::from_iter(0..batch.num_rows() as u32));
+        // `remainder_batch` contains the rows themselves that need to be evaluated
+        let mut remainder_batch = Cow::Borrowed(batch);
 
-        // start with nulls as default output
-        let mut current_value = new_null_array(&return_type, batch.num_rows());
-        let mut remainder = BooleanArray::from(vec![true; batch.num_rows()]);
         for i in 0..self.when_then_expr.len() {
-            let when_value = self.when_then_expr[i]
-                .0
-                .evaluate_selection(batch, &remainder)?;
-            let when_value = when_value.into_array(batch.num_rows())?;
+            // Evaluate the 'when' predicate for the remainder batch
+            // This results in a boolean array with the same length as the remaining number of rows
+            let when_predicate = &self.when_then_expr[i].0;
+            let when_value = when_predicate
+                .evaluate(&remainder_batch)?
+                .into_array(remainder_batch.num_rows())?;
             let when_value = as_boolean_array(&when_value).map_err(|_| {
                 internal_datafusion_err!("WHEN expression did not return a BooleanArray")
             })?;
-            // Treat 'NULL' as false value
-            let when_value = match when_value.null_count() {
-                0 => Cow::Borrowed(when_value),
-                _ => Cow::Owned(prep_null_mask_filter(when_value)),
-            };
-            // Make sure we only consider rows that have not been matched yet
-            let when_value = and(&when_value, &remainder)?;
-
-            // When no rows available for when clause, skip then clause
-            if when_value.true_count() == 0 {
+
+            // `true_count` ignores `true` values where the validity bit is not set, so there's
+            // no need to call `prep_null_mask_filter`.
+            let when_true_count = when_value.true_count();
+
+            // If the 'when' predicate did not match any rows, continue to the next branch immediately
+            if when_true_count == 0 {
                 continue;
             }
 
-            let then_value = self.when_then_expr[i]
-                .1
-                .evaluate_selection(batch, &when_value)?;
+            // If the 'when' predicate matched all remaining rows, there is no need to filter
+            if when_true_count == remainder_batch.num_rows() {
+                let then_expression = &self.when_then_expr[i].1;
+                let then_value = then_expression.evaluate(&remainder_batch)?;
+                result_builder.add_branch_result(&remainder_rows, then_value)?;
+                return result_builder.finish();
+            }
 
-            current_value = match then_value {
-                ColumnarValue::Scalar(ScalarValue::Null) => {
-                    nullif(current_value.as_ref(), &when_value)?
-                }
-                ColumnarValue::Scalar(then_value) => {
-                    zip(&when_value, &then_value.to_scalar()?, &current_value)?
-                }
-                ColumnarValue::Array(then_value) => {
-                    zip(&when_value, &then_value, &current_value)?
-                }
-            };
+            // Filter the remainder batch based on the 'when' value
+            // This results in a batch containing only the rows that need to be evaluated
+            // for the current branch
+            // Still no need to call `prep_null_mask_filter` since `create_filter` will already do
+            // this unconditionally.
+            let then_filter = create_filter(when_value, true);
+            let then_batch = filter_record_batch(&remainder_batch, &then_filter)?;
+            let then_rows = filter_array(&remainder_rows, &then_filter)?;
+
+            let then_expression = &self.when_then_expr[i].1;
+            let then_value = then_expression.evaluate(&then_batch)?;
+            result_builder.add_branch_result(&then_rows, then_value)?;
+
+            // If this is the last 'when' branch and there is no 'else' expression, there's no
+            // point in calculating the remaining rows.
+            if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 {
+                return result_builder.finish();
+            }
 
-            // Succeed tuples should be filtered out for short-circuit evaluation,
-            // null values for the current when expr should be kept
-            remainder = and_not(&remainder, &when_value)?;
+            // Prepare the next when branch (or the else branch)
+            let next_selection = match when_value.null_count() {
+                0 => not(when_value),
+                _ => {
+                    // `prep_null_mask_filter` is required to ensure the not operation treats nulls
+                    // as false
+                    not(&prep_null_mask_filter(when_value))
+                }
+            }?;
+            let next_filter = create_filter(&next_selection, true);
+            remainder_batch =
+                Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?);
+            remainder_rows = filter_array(&remainder_rows, &next_filter)?;
         }
 
-        if let Some(e) = self.else_expr() {
+        // If we reached this point, some rows were left unmatched.
+        // Check if those need to be evaluated using the 'else' expression.
+        if let Some(e) = &self.else_expr {
             // keep `else_expr`'s data type and return type consistent
             let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?;
-            let else_ = expr
-                .evaluate_selection(batch, &remainder)?
-                .into_array(batch.num_rows())?;
-            current_value = zip(&remainder, &else_, &current_value)?;
+            let else_value = expr.evaluate(&remainder_batch)?;
+            result_builder.add_branch_result(&remainder_rows, else_value)?;
         }
 
-        Ok(ColumnarValue::Array(current_value))
+        result_builder.finish()
+    }
+
+    /// See [CaseExpr::expr_or_expr].
+    fn expr_or_expr(
+        &self,
+        batch: &RecordBatch,
+        when_value: &BooleanArray,
+    ) -> Result<ColumnarValue> {
+        let when_value = match when_value.null_count() {
+            0 => Cow::Borrowed(when_value),
+            _ => {
+                // `prep_null_mask_filter` is required to ensure null is treated as false
+                Cow::Owned(prep_null_mask_filter(when_value))
+            }
+        };
+
+        let optimize_filter = batch.num_columns() > 1
+            || (batch.num_columns() == 1 && multiple_arrays(batch.column(0).data_type()));
+
+        let when_filter = create_filter(&when_value, optimize_filter);
+        let then_batch = filter_record_batch(batch, &when_filter)?;
+        let then_value = self.when_then_expr[0].1.evaluate(&then_batch)?;
+
+        match &self.else_expr {
+            None => {
+                let then_array = then_value.to_array(when_value.true_count())?;
+                scatter(&when_value, then_array.as_ref()).map(ColumnarValue::Array)
+            }
+            Some(else_expr) => {
+                let else_selection = not(&when_value)?;
+                let else_filter = create_filter(&else_selection, optimize_filter);
+                let else_batch = filter_record_batch(batch, &else_filter)?;
+
+                // keep `else_expr`'s data type and return type consistent
+                let return_type = self.data_type(&batch.schema())?;
+                let else_expr =
+                    try_cast(Arc::clone(else_expr), &batch.schema(), return_type.clone())
+                        .unwrap_or_else(|_| Arc::clone(else_expr));
+
+                let else_value = else_expr.evaluate(&else_batch)?;
+
+                Ok(ColumnarValue::Array(match (then_value, else_value) {
+                    (ColumnarValue::Array(t), ColumnarValue::Array(e)) => {
+                        merge(&when_value, &t, &e)
+                    }
+                    (ColumnarValue::Scalar(t), ColumnarValue::Array(e)) => {
+                        merge(&when_value, &t.to_scalar()?, &e)
+                    }
+                    (ColumnarValue::Array(t), ColumnarValue::Scalar(e)) => {
+                        merge(&when_value, &t, &e.to_scalar()?)
+                    }
+                    (ColumnarValue::Scalar(t), ColumnarValue::Scalar(e)) => {
+                        merge(&when_value, &t.to_scalar()?, &e.to_scalar()?)
+                    }
+                }?))
+            }
+        }
+    }
+}
+
+impl CaseExpr {
+    /// This function evaluates the form of CASE that matches an expression to fixed values.
+    ///
+    /// CASE expression
+    ///     WHEN value THEN result
+    ///     [WHEN ...]
+    ///     [ELSE result]
+    /// END
+    fn case_when_with_expr(
+        &self,
+        batch: &RecordBatch,
+        projected: &ProjectedCaseBody,
+    ) -> Result<ColumnarValue> {
+        let return_type = self.data_type(&batch.schema())?;
+        if projected.projection.len() < batch.num_columns() {
+            let projected_batch = batch.project(&projected.projection)?;
+            projected
+                .body
+                .case_when_with_expr(&projected_batch, &return_type)
+        } else {
+            self.body.case_when_with_expr(batch, &return_type)
+        }
+    }
+
+    /// This function evaluates the form of CASE where each WHEN expression is a boolean
+    /// expression.
+    ///
+    /// CASE WHEN condition THEN result
+    ///      [WHEN ...]
+    ///      [ELSE result]
+    /// END
+    fn case_when_no_expr(
+        &self,
+        batch: &RecordBatch,
+        projected: &ProjectedCaseBody,
+    ) -> Result<ColumnarValue> {
+        let return_type = self.data_type(&batch.schema())?;
+        if projected.projection.len() < batch.num_columns() {
+            let projected_batch = batch.project(&projected.projection)?;
+            projected
+                .body
+                .case_when_no_expr(&projected_batch, &return_type)
+        } else {
+            self.body.case_when_no_expr(batch, &return_type)
+        }
     }
 
     /// This function evaluates the specialized case of:
@@ -344,8 +1060,8 @@ impl CaseExpr {
     /// that are infallible because the expression will be evaluated for all
     /// rows in the input batch.
     fn case_column_or_null(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        let when_expr = &self.when_then_expr[0].0;
-        let then_expr = &self.when_then_expr[0].1;
+        let when_expr = &self.body.when_then_expr[0].0;
+        let then_expr = &self.body.when_then_expr[0].1;
 
         match when_expr.evaluate(batch)? {
             // WHEN true --> column
@@ -385,7 +1101,7 @@ impl CaseExpr {
         let return_type = self.data_type(&batch.schema())?;
 
         // evaluate when expression
-        let when_value = self.when_then_expr[0].0.evaluate(batch)?;
+        let when_value = self.body.when_then_expr[0].0.evaluate(batch)?;
         let when_value = when_value.into_array(batch.num_rows())?;
         let when_value = as_boolean_array(&when_value).map_err(|_| {
             internal_datafusion_err!("WHEN expression did not return a BooleanArray")
@@ -398,10 +1114,10 @@ impl CaseExpr {
         };
 
         // evaluate then_value
-        let then_value = self.when_then_expr[0].1.evaluate(batch)?;
+        let then_value = self.body.when_then_expr[0].1.evaluate(batch)?;
         let then_value = Scalar::new(then_value.into_array(1)?);
 
-        let Some(e) = self.else_expr() else {
+        let Some(e) = &self.body.else_expr else {
             return internal_err!("expression did not evaluate to an array");
         };
         // keep `else_expr`'s data type and return type consistent
@@ -410,12 +1126,17 @@ impl CaseExpr {
         Ok(ColumnarValue::Array(zip(&when_value, &then_value, &else_)?))
     }
 
-    fn expr_or_expr(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        let return_type = self.data_type(&batch.schema())?;
-
-        // evalute when condition on batch
-        let when_value = self.when_then_expr[0].0.evaluate(batch)?;
-        let when_value = when_value.into_array(batch.num_rows())?;
+    fn expr_or_expr(
+        &self,
+        batch: &RecordBatch,
+        projected: &ProjectedCaseBody,
+    ) -> Result<ColumnarValue> {
+        // evaluate when condition on batch
+        let when_value = self.body.when_then_expr[0].0.evaluate(batch)?;
+        // `num_rows == 1` is intentional to avoid expanding scalars.
+        // If the `when_value` is effectively a scalar, the 'all true' and 'all false' checks
+        // below will avoid incorrectly using the scalar as a merge/zip mask.
+        let when_value = when_value.into_array(1)?;
         let when_value = as_boolean_array(&when_value).map_err(|e| {
             DataFusionError::Context(
                 "WHEN expression did not return a BooleanArray".to_string(),
@@ -423,28 +1144,52 @@ impl CaseExpr {
             )
         })?;
 
-        // Treat 'NULL' as false value
-        let when_value = match when_value.null_count() {
-            0 => Cow::Borrowed(when_value),
-            _ => Cow::Owned(prep_null_mask_filter(when_value)),
-        };
+        let true_count = when_value.true_count();
+        if true_count == when_value.len() {
+            // All input rows are true, just call the 'then' expression
+            self.body.when_then_expr[0].1.evaluate(batch)
+        } else if true_count == 0 {
+            // All input rows are false/null, just call the 'else' expression
+            match &self.body.else_expr {
+                Some(else_expr) => else_expr.evaluate(batch),
+                None => {
+                    let return_type = self.data_type(&batch.schema())?;
+                    Ok(ColumnarValue::Scalar(ScalarValue::try_new_null(
+                        &return_type,
+                    )?))
+                }
+            }
+        } else if projected.projection.len() < batch.num_columns() {
+            // The case expressions do not use all the columns of the input batch.
+            // Project first to reduce time spent filtering.
+            let projected_batch = batch.project(&projected.projection)?;
+            projected.body.expr_or_expr(&projected_batch, when_value)
+        } else {
+            // All columns are used in the case expressions, so there is no need to project.
+            self.body.expr_or_expr(batch, when_value)
+        }
+    }
 
-        let then_value = self.when_then_expr[0]
-            .1
-            .evaluate_selection(batch, &when_value)?
-            .into_array(batch.num_rows())?;
+    fn with_lookup_table(
+        &self,
+        batch: &RecordBatch,
+        lookup_table: &LiteralLookupTable,
+    ) -> Result<ColumnarValue> {
+        let expr = self.body.expr.as_ref().unwrap();
+        let evaluated_expression = expr.evaluate(batch)?;
 
-        // evaluate else expression on the values not covered by when_value
-        let remainder = not(&when_value)?;
-        let e = self.else_expr.as_ref().unwrap();
-        // keep `else_expr`'s data type and return type consistent
-        let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())
-            .unwrap_or_else(|_| Arc::clone(e));
-        let else_ = expr
-            .evaluate_selection(batch, &remainder)?
-            .into_array(batch.num_rows())?;
+        let is_scalar = matches!(evaluated_expression, ColumnarValue::Scalar(_));
+        let evaluated_expression = evaluated_expression.to_array(1)?;
 
-        Ok(ColumnarValue::Array(zip(&remainder, &else_, &then_value)?))
+        let values = lookup_table.map_keys_to_values(&evaluated_expression)?;
+
+        let result = if is_scalar {
+            ColumnarValue::Scalar(ScalarValue::try_from_array(values.as_ref(), 0)?)
+        } else {
+            ColumnarValue::Array(values)
+        };
+
+        Ok(result)
     }
 }
 
@@ -455,35 +1200,66 @@ impl PhysicalExpr for CaseExpr {
     }
 
     fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
-        // since all then results have the same data type, we can choose any one as the
-        // return data type except for the null.
-        let mut data_type = DataType::Null;
-        for i in 0..self.when_then_expr.len() {
-            data_type = self.when_then_expr[i].1.data_type(input_schema)?;
-            if !data_type.equals_datatype(&DataType::Null) {
-                break;
-            }
-        }
-        // if all then results are null, we use data type of else expr instead if possible.
-        if data_type.equals_datatype(&DataType::Null) {
-            if let Some(e) = &self.else_expr {
-                data_type = e.data_type(input_schema)?;
-            }
-        }
-
-        Ok(data_type)
+        self.body.data_type(input_schema)
     }
 
     fn nullable(&self, input_schema: &Schema) -> Result<bool> {
-        // this expression is nullable if any of the input expressions are nullable
-        let then_nullable = self
+        let nullable_then = self
+            .body
             .when_then_expr
             .iter()
-            .map(|(_, t)| t.nullable(input_schema))
-            .collect::<Result<Vec<_>>>()?;
-        if then_nullable.contains(&true) {
-            Ok(true)
-        } else if let Some(e) = &self.else_expr {
+            .filter_map(|(w, t)| {
+                let is_nullable = match t.nullable(input_schema) {
+                    // Pass on error determining nullability verbatim
+                    Err(e) => return Some(Err(e)),
+                    Ok(n) => n,
+                };
+
+                // Branches with a then expression that is not nullable do not impact the
+                // nullability of the case expression.
+                if !is_nullable {
+                    return None;
+                }
+
+                // For case-with-expression assume all 'then' expressions are reachable
+                if self.body.expr.is_some() {
+                    return Some(Ok(()));
+                }
+
+                // For branches with a nullable 'then' expression, try to determine
+                // if the 'then' expression is ever reachable in the situation where
+                // it would evaluate to null.
+
+                // Replace the `then` expression with `NULL` in the `when` expression
+                let with_null = match replace_with_null(w, t.as_ref(), input_schema) {
+                    Err(e) => return Some(Err(e)),
+                    Ok(e) => e,
+                };
+
+                // Try to const evaluate the modified `when` expression.
+                let predicate_result = match evaluate_predicate(&with_null) {
+                    Err(e) => return Some(Err(e)),
+                    Ok(b) => b,
+                };
+
+                match predicate_result {
+                    // Evaluation was inconclusive or true, so the 'then' expression is reachable
+                    None | Some(true) => Some(Ok(())),
+                    // Evaluation proves the branch will never be taken.
+                    // The most common pattern for this is `WHEN x IS NOT NULL THEN x`.
+                    Some(false) => None,
+                }
+            })
+            .next();
+
+        if let Some(nullable_then) = nullable_then {
+            // There is at least one reachable nullable 'then' expression, so the case
+            // expression itself is nullable.
+            // Use `Result::map` to propagate the error from `nullable_then` if there is one.
+            nullable_then.map(|_| true)
+        } else if let Some(e) = &self.body.else_expr {
+            // There are no reachable nullable 'then' expressions, so all we still need to
+            // check is the 'else' expression's nullability.
             e.nullable(input_schema)
         } else {
             // CASE produces NULL if there is no `else` expr
@@ -493,37 +1269,40 @@ impl PhysicalExpr for CaseExpr {
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        match self.eval_method {
-            EvalMethod::WithExpression => {
+        match &self.eval_method {
+            EvalMethod::WithExpression(p) => {
                 // this use case evaluates "expr" and then compares the values with the "when"
                 // values
-                self.case_when_with_expr(batch)
+                self.case_when_with_expr(batch, p)
             }
-            EvalMethod::NoExpression => {
+            EvalMethod::NoExpression(p) => {
                 // The "when" conditions all evaluate to boolean in this use case and can be
                 // arbitrary expressions
-                self.case_when_no_expr(batch)
+                self.case_when_no_expr(batch, p)
             }
             EvalMethod::InfallibleExprOrNull => {
                 // Specialization for CASE WHEN expr THEN column [ELSE NULL] END
                 self.case_column_or_null(batch)
             }
             EvalMethod::ScalarOrScalar => self.scalar_or_scalar(batch),
-            EvalMethod::ExpressionOrExpression => self.expr_or_expr(batch),
+            EvalMethod::ExpressionOrExpression(p) => self.expr_or_expr(batch, p),
+            EvalMethod::WithExprScalarLookupTable(lookup_table) => {
+                self.with_lookup_table(batch, lookup_table)
+            }
         }
     }
 
     fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
         let mut children = vec![];
-        if let Some(expr) = &self.expr {
+        if let Some(expr) = &self.body.expr {
             children.push(expr)
         }
-        self.when_then_expr.iter().for_each(|(cond, value)| {
+        self.body.when_then_expr.iter().for_each(|(cond, value)| {
             children.push(cond);
             children.push(value);
         });
 
-        if let Some(else_expr) = &self.else_expr {
+        if let Some(else_expr) = &self.body.else_expr {
             children.push(else_expr)
         }
         children
@@ -538,7 +1317,7 @@ impl PhysicalExpr for CaseExpr {
             internal_err!("CaseExpr: Wrong number of children")
         } else {
             let (expr, when_then_expr, else_expr) =
-                match (self.expr().is_some(), self.else_expr().is_some()) {
+                match (self.expr().is_some(), self.body.else_expr.is_some()) {
                     (true, true) => (
                         Some(&children[0]),
                         &children[1..children.len() - 1],
@@ -562,14 +1341,14 @@ impl PhysicalExpr for CaseExpr {
         }
     }
 
-    fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(f, "CASE ")?;
-        if let Some(e) = &self.expr {
+        if let Some(e) = &self.body.expr {
             e.fmt_sql(f)?;
             write!(f, " ")?;
         }
 
-        for (w, t) in &self.when_then_expr {
+        for (w, t) in &self.body.when_then_expr {
             write!(f, "WHEN ")?;
             w.fmt_sql(f)?;
             write!(f, " THEN ")?;
@@ -577,7 +1356,7 @@ impl PhysicalExpr for CaseExpr {
             write!(f, " ")?;
         }
 
-        if let Some(e) = &self.else_expr {
+        if let Some(e) = &self.body.else_expr {
             write!(f, "ELSE ")?;
             e.fmt_sql(f)?;
             write!(f, " ")?;
@@ -586,6 +1365,51 @@ impl PhysicalExpr for CaseExpr {
     }
 }
 
+/// Attempts to const evaluate the given `predicate`.
+/// Returns:
+/// - `Some(true)` if the predicate evaluates to a truthy value.
+/// - `Some(false)` if the predicate evaluates to a falsy value.
+/// - `None` if the predicate could not be evaluated.
+fn evaluate_predicate(predicate: &Arc<dyn PhysicalExpr>) -> Result<Option<bool>> {
+    // Create a dummy record with no columns and one row
+    let batch = RecordBatch::try_new_with_options(
+        Arc::new(Schema::empty()),
+        vec![],
+        &RecordBatchOptions::new().with_row_count(Some(1)),
+    )?;
+
+    // Evaluate the predicate and interpret the result as a boolean
+    let result = match predicate.evaluate(&batch) {
+        // An error during evaluation means we couldn't const evaluate the predicate, so return `None`
+        Err(_) => None,
+        Ok(ColumnarValue::Array(array)) => Some(
+            ScalarValue::try_from_array(array.as_ref(), 0)?
+                .cast_to(&DataType::Boolean)?,
+        ),
+        Ok(ColumnarValue::Scalar(scalar)) => Some(scalar.cast_to(&DataType::Boolean)?),
+    };
+    Ok(result.map(|v| matches!(v, ScalarValue::Boolean(Some(true)))))
+}
+
+fn replace_with_null(
+    expr: &Arc<dyn PhysicalExpr>,
+    expr_to_replace: &dyn PhysicalExpr,
+    input_schema: &Schema,
+) -> Result<Arc<dyn PhysicalExpr>, DataFusionError> {
+    let with_null = Arc::clone(expr)
+        .transform_down(|e| {
+            if e.as_ref().dyn_eq(expr_to_replace) {
+                let data_type = e.data_type(input_schema)?;
+                let null_literal = lit(ScalarValue::try_new_null(&data_type)?);
+                Ok(Transformed::yes(null_literal))
+            } else {
+                Ok(Transformed::no(e))
+            }
+        })?
+        .data;
+    Ok(with_null)
+}
+
 /// Create a CASE expression
 pub fn case(
     expr: Option<Arc<dyn PhysicalExpr>>,
@@ -599,7 +1423,8 @@ pub fn case(
 mod tests {
     use super::*;
 
-    use crate::expressions::{binary, cast, col, lit, BinaryExpr};
+    use crate::expressions;
+    use crate::expressions::{BinaryExpr, binary, cast, col, is_not_null, lit};
     use arrow::buffer::Buffer;
     use arrow::datatypes::DataType::Float64;
     use arrow::datatypes::Field;
@@ -607,8 +1432,9 @@ mod tests {
     use datafusion_common::plan_err;
     use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
     use datafusion_expr::type_coercion::binary::comparison_coercion;
-    use datafusion_expr::Operator;
+    use datafusion_expr_common::operator::Operator;
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
+    use half::f16;
 
     #[test]
     fn case_with_expr() -> Result<()> {
@@ -640,6 +1466,164 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn case_with_expr_dictionary() -> Result<()> {
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)),
+            true,
+        )]);
+        let keys = UInt8Array::from(vec![0u8, 1u8, 2u8, 3u8]);
+        let values = StringArray::from(vec![Some("foo"), Some("baz"), None, Some("bar")]);
+        let dictionary = DictionaryArray::new(keys, Arc::new(values));
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dictionary)])?;
+
+        let schema = batch.schema();
+
+        // CASE a WHEN 'foo' THEN 123 WHEN 'bar' THEN 456 END
+        let when1 = lit("foo");
+        let then1 = lit(123i32);
+        let when2 = lit("bar");
+        let then2 = lit(456i32);
+
+        let expr = generate_case_when_with_type_coercion(
+            Some(col("a", &schema)?),
+            vec![(when1, then1), (when2, then2)],
+            None,
+            schema.as_ref(),
+        )?;
+        let result = expr
+            .evaluate(&batch)?
+            .into_array(batch.num_rows())
+            .expect("Failed to convert to array");
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![Some(123), None, None, Some(456)]);
+
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
+    // Make sure we are not failing when got literal in case when but input is dictionary encoded
+    #[test]
+    fn case_with_expr_primitive_dictionary() -> Result<()> {
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt64)),
+            true,
+        )]);
+        let keys = UInt8Array::from(vec![0u8, 1u8, 2u8, 3u8]);
+        let values = UInt64Array::from(vec![Some(10), Some(20), None, Some(30)]);
+        let dictionary = DictionaryArray::new(keys, Arc::new(values));
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dictionary)])?;
+
+        let schema = batch.schema();
+
+        // CASE a WHEN 10 THEN 123 WHEN 30 THEN 456 END
+        let when1 = lit(10_u64);
+        let then1 = lit(123_i32);
+        let when2 = lit(30_u64);
+        let then2 = lit(456_i32);
+
+        let expr = generate_case_when_with_type_coercion(
+            Some(col("a", &schema)?),
+            vec![(when1, then1), (when2, then2)],
+            None,
+            schema.as_ref(),
+        )?;
+        let result = expr
+            .evaluate(&batch)?
+            .into_array(batch.num_rows())
+            .expect("Failed to convert to array");
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![Some(123), None, None, Some(456)]);
+
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
+    // Make sure we are not failing when got literal in case when but input is dictionary encoded
+    #[test]
+    fn case_with_expr_boolean_dictionary() -> Result<()> {
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Boolean)),
+            true,
+        )]);
+        let keys = UInt8Array::from(vec![0u8, 1u8, 2u8, 3u8]);
+        let values = BooleanArray::from(vec![Some(true), Some(false), None, Some(true)]);
+        let dictionary = DictionaryArray::new(keys, Arc::new(values));
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dictionary)])?;
+
+        let schema = batch.schema();
+
+        // CASE a WHEN true THEN 123 WHEN false THEN 456 END
+        let when1 = lit(true);
+        let then1 = lit(123i32);
+        let when2 = lit(false);
+        let then2 = lit(456i32);
+
+        let expr = generate_case_when_with_type_coercion(
+            Some(col("a", &schema)?),
+            vec![(when1, then1), (when2, then2)],
+            None,
+            schema.as_ref(),
+        )?;
+        let result = expr
+            .evaluate(&batch)?
+            .into_array(batch.num_rows())
+            .expect("Failed to convert to array");
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![Some(123), Some(456), None, Some(123)]);
+
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn case_with_expr_all_null_dictionary() -> Result<()> {
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)),
+            true,
+        )]);
+        let keys = UInt8Array::from(vec![2u8, 2u8, 2u8, 2u8]);
+        let values = StringArray::from(vec![Some("foo"), Some("baz"), None, Some("bar")]);
+        let dictionary = DictionaryArray::new(keys, Arc::new(values));
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dictionary)])?;
+
+        let schema = batch.schema();
+
+        // CASE a WHEN 'foo' THEN 123 WHEN 'bar' THEN 456 END
+        let when1 = lit("foo");
+        let then1 = lit(123i32);
+        let when2 = lit("bar");
+        let then2 = lit(456i32);
+
+        let expr = generate_case_when_with_type_coercion(
+            Some(col("a", &schema)?),
+            vec![(when1, then1), (when2, then2)],
+            None,
+            schema.as_ref(),
+        )?;
+        let result = expr
+            .evaluate(&batch)?
+            .into_array(batch.num_rows())
+            .expect("Failed to convert to array");
+        let result = as_int32_array(&result)?;
+
+        let expected = &Int32Array::from(vec![None, None, None, None]);
+
+        assert_eq!(expected, result);
+
+        Ok(())
+    }
+
     #[test]
     fn case_with_expr_else() -> Result<()> {
         let batch = case_test_batch()?;
@@ -1068,7 +2052,6 @@ mod tests {
         .into_iter()
         .collect();
 
-        //let valid_array = vec![true, false, false, true, false, tru
         let null_buffer = Buffer::from([0b00101001u8]);
         let load4 = load4
             .into_data()
@@ -1296,7 +2279,7 @@ mod tests {
             make_lit_i32(250),
         ));
         let expr = CaseExpr::try_new(None, vec![(predicate, make_col("c2", 1))], None)?;
-        assert!(matches!(expr.eval_method, EvalMethod::InfallibleExprOrNull));
+        assert_eq!(expr.eval_method, EvalMethod::InfallibleExprOrNull);
         match expr.evaluate(&batch)? {
             ColumnarValue::Array(array) => {
                 assert_eq!(1000, array.len());
@@ -1322,7 +2305,7 @@ mod tests {
         let expr = CaseExpr::try_new(None, vec![(when, then)], Some(else_expr))?;
         assert!(matches!(
             expr.eval_method,
-            EvalMethod::ExpressionOrExpression
+            EvalMethod::ExpressionOrExpression(_)
         ));
         let result = expr
             .evaluate(&batch)?
@@ -1434,4 +2417,679 @@ mod tests {
 
         Ok(())
     }
+
+    fn when_then_else(
+        when: &Arc<dyn PhysicalExpr>,
+        then: &Arc<dyn PhysicalExpr>,
+        els: &Arc<dyn PhysicalExpr>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        let case = CaseExpr::try_new(
+            None,
+            vec![(Arc::clone(when), Arc::clone(then))],
+            Some(Arc::clone(els)),
+        )?;
+        Ok(Arc::new(case))
+    }
+
+    #[test]
+    fn test_case_expression_nullability_with_nullable_column() -> Result<()> {
+        case_expression_nullability(true)
+    }
+
+    #[test]
+    fn test_case_expression_nullability_with_not_nullable_column() -> Result<()> {
+        case_expression_nullability(false)
+    }
+
+    fn case_expression_nullability(col_is_nullable: bool) -> Result<()> {
+        let schema =
+            Schema::new(vec![Field::new("foo", DataType::Int32, col_is_nullable)]);
+
+        let foo = col("foo", &schema)?;
+        let foo_is_not_null = is_not_null(Arc::clone(&foo))?;
+        let foo_is_null = expressions::is_null(Arc::clone(&foo))?;
+        let not_foo_is_null = expressions::not(Arc::clone(&foo_is_null))?;
+        let zero = lit(0);
+        let foo_eq_zero =
+            binary(Arc::clone(&foo), Operator::Eq, Arc::clone(&zero), &schema)?;
+
+        assert_not_nullable(when_then_else(&foo_is_not_null, &foo, &zero)?, &schema);
+        assert_not_nullable(when_then_else(&not_foo_is_null, &foo, &zero)?, &schema);
+        assert_not_nullable(when_then_else(&foo_eq_zero, &foo, &zero)?, &schema);
+
+        assert_not_nullable(
+            when_then_else(
+                &binary(
+                    Arc::clone(&foo_is_not_null),
+                    Operator::And,
+                    Arc::clone(&foo_eq_zero),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+        );
+
+        assert_not_nullable(
+            when_then_else(
+                &binary(
+                    Arc::clone(&foo_eq_zero),
+                    Operator::And,
+                    Arc::clone(&foo_is_not_null),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+        );
+
+        assert_not_nullable(
+            when_then_else(
+                &binary(
+                    Arc::clone(&foo_is_not_null),
+                    Operator::Or,
+                    Arc::clone(&foo_eq_zero),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+        );
+
+        assert_not_nullable(
+            when_then_else(
+                &binary(
+                    Arc::clone(&foo_eq_zero),
+                    Operator::Or,
+                    Arc::clone(&foo_is_not_null),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+        );
+
+        assert_nullability(
+            when_then_else(
+                &binary(
+                    Arc::clone(&foo_is_null),
+                    Operator::Or,
+                    Arc::clone(&foo_eq_zero),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+            col_is_nullable,
+        );
+
+        assert_nullability(
+            when_then_else(
+                &binary(
+                    binary(Arc::clone(&foo), Operator::Eq, Arc::clone(&zero), &schema)?,
+                    Operator::Or,
+                    Arc::clone(&foo_is_null),
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+            col_is_nullable,
+        );
+
+        assert_not_nullable(
+            when_then_else(
+                &binary(
+                    binary(
+                        binary(
+                            Arc::clone(&foo),
+                            Operator::Eq,
+                            Arc::clone(&zero),
+                            &schema,
+                        )?,
+                        Operator::And,
+                        Arc::clone(&foo_is_not_null),
+                        &schema,
+                    )?,
+                    Operator::Or,
+                    binary(
+                        binary(
+                            Arc::clone(&foo),
+                            Operator::Eq,
+                            Arc::clone(&foo),
+                            &schema,
+                        )?,
+                        Operator::And,
+                        Arc::clone(&foo_is_not_null),
+                        &schema,
+                    )?,
+                    &schema,
+                )?,
+                &foo,
+                &zero,
+            )?,
+            &schema,
+        );
+
+        Ok(())
+    }
+
+    fn assert_not_nullable(expr: Arc<dyn PhysicalExpr>, schema: &Schema) {
+        assert!(!expr.nullable(schema).unwrap());
+    }
+
+    fn assert_nullable(expr: Arc<dyn PhysicalExpr>, schema: &Schema) {
+        assert!(expr.nullable(schema).unwrap());
+    }
+
+    fn assert_nullability(expr: Arc<dyn PhysicalExpr>, schema: &Schema, nullable: bool) {
+        if nullable {
+            assert_nullable(expr, schema);
+        } else {
+            assert_not_nullable(expr, schema);
+        }
+    }
+
+    // Test Lookup evaluation
+
+    fn test_case_when_literal_lookup(
+        values: ArrayRef,
+        lookup_map: &[(ScalarValue, ScalarValue)],
+        else_value: Option<ScalarValue>,
+        expected: ArrayRef,
+    ) {
+        // Create lookup
+        // CASE <expr>
+        // WHEN <when_constant_1> THEN <then_constant_1>
+        // WHEN <when_constant_2> THEN <then_constant_2>
+        // [ ELSE <else_constant> ]
+
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            values.data_type().clone(),
+            values.is_nullable(),
+        )]);
+        let schema = Arc::new(schema);
+
+        let batch = RecordBatch::try_new(schema, vec![values])
+            .expect("failed to create RecordBatch");
+
+        let schema = batch.schema_ref();
+        let case = col("a", schema).expect("failed to create col");
+
+        let when_then = lookup_map
+            .iter()
+            .map(|(when, then)| {
+                (
+                    Arc::new(Literal::new(when.clone())) as _,
+                    Arc::new(Literal::new(then.clone())) as _,
+                )
+            })
+            .collect::<Vec<WhenThen>>();
+
+        let else_expr = else_value.map(|else_value| {
+            Arc::new(Literal::new(else_value)) as Arc<dyn PhysicalExpr>
+        });
+        let expr = CaseExpr::try_new(Some(case), when_then, else_expr)
+            .expect("failed to create case");
+
+        // Assert that we are testing what we intend to assert
+        assert!(
+            matches!(
+                expr.eval_method,
+                EvalMethod::WithExprScalarLookupTable { .. }
+            ),
+            "we should use the expected eval method"
+        );
+
+        let actual = expr
+            .evaluate(&batch)
+            .expect("failed to evaluate case")
+            .into_array(batch.num_rows())
+            .expect("Failed to convert to array");
+
+        assert_eq!(
+            actual.data_type(),
+            expected.data_type(),
+            "Data type mismatch"
+        );
+
+        assert_eq!(
+            actual.as_ref(),
+            expected.as_ref(),
+            "actual (left) does not match expected (right)"
+        );
+    }
+
+    fn create_lookup<When, Then>(
+        when_then_pairs: impl IntoIterator<Item = (When, Then)>,
+    ) -> Vec<(ScalarValue, ScalarValue)>
+    where
+        ScalarValue: From<When>,
+        ScalarValue: From<Then>,
+    {
+        when_then_pairs
+            .into_iter()
+            .map(|(when, then)| (ScalarValue::from(when), ScalarValue::from(then)))
+            .collect()
+    }
+
+    fn create_input_and_expected<Input, Expected, InputFromItem, ExpectedFromItem>(
+        input_and_expected_pairs: impl IntoIterator<Item = (InputFromItem, ExpectedFromItem)>,
+    ) -> (Input, Expected)
+    where
+        Input: Array + From<Vec<InputFromItem>>,
+        Expected: Array + From<Vec<ExpectedFromItem>>,
+    {
+        let (input_items, expected_items): (Vec<InputFromItem>, Vec<ExpectedFromItem>) =
+            input_and_expected_pairs.into_iter().unzip();
+
+        (Input::from(input_items), Expected::from(expected_items))
+    }
+
+    fn test_lookup_eval_with_and_without_else(
+        lookup_map: &[(ScalarValue, ScalarValue)],
+        input_values: ArrayRef,
+        expected: StringArray,
+    ) {
+        // Testing without ELSE should fallback to None
+        test_case_when_literal_lookup(
+            Arc::clone(&input_values),
+            lookup_map,
+            None,
+            Arc::new(expected.clone()),
+        );
+
+        // Testing with Else
+        let else_value = "___fallback___";
+
+        // Changing each expected None to be fallback
+        let expected_with_else = expected
+            .iter()
+            .map(|item| item.unwrap_or(else_value))
+            .map(Some)
+            .collect::<StringArray>();
+
+        // Test case
+        test_case_when_literal_lookup(
+            input_values,
+            lookup_map,
+            Some(ScalarValue::Utf8(Some(else_value.to_string()))),
+            Arc::new(expected_with_else),
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_int32_to_string() {
+        let lookup_map = create_lookup([
+            (Some(4), Some("four")),
+            (Some(2), Some("two")),
+            (Some(3), Some("three")),
+            (Some(1), Some("one")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Int32Array, StringArray, _, _>([
+                (1, Some("one")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (3, Some("three")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+                (5, None), // No match in WHEN
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_none_case_should_never_match() {
+        let lookup_map = create_lookup([
+            (Some(4), Some("four")),
+            (None, Some("none")),
+            (Some(2), Some("two")),
+            (Some(1), Some("one")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Int32Array, StringArray, _, _>([
+                (Some(1), Some("one")),
+                (Some(5), None), // No match in WHEN
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (Some(2), Some("two")),
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (Some(2), Some("two")),
+                (Some(5), None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_int32_to_string_with_duplicate_cases() {
+        let lookup_map = create_lookup([
+            (Some(4), Some("four")),
+            (Some(4), Some("no 4")),
+            (Some(2), Some("two")),
+            (Some(2), Some("no 2")),
+            (Some(3), Some("three")),
+            (Some(3), Some("no 3")),
+            (Some(2), Some("no 2")),
+            (Some(4), Some("no 4")),
+            (Some(2), Some("no 2")),
+            (Some(3), Some("no 3")),
+            (Some(4), Some("no 4")),
+            (Some(2), Some("no 2")),
+            (Some(3), Some("no 3")),
+            (Some(3), Some("no 3")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Int32Array, StringArray, _, _>([
+                (1, None), // No match in WHEN
+                (2, Some("two")),
+                (3, Some("three")),
+                (3, Some("three")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+                (5, None), // No match in WHEN
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_f32_to_string_with_special_values_and_duplicate_cases()
+     {
+        let lookup_map = create_lookup([
+            (Some(4.0), Some("four point zero")),
+            (Some(f32::NAN), Some("NaN")),
+            (Some(3.2), Some("three point two")),
+            // Duplicate case to make sure it is not used
+            (Some(f32::NAN), Some("should not use this NaN branch")),
+            (Some(f32::INFINITY), Some("Infinity")),
+            (Some(0.0), Some("zero")),
+            // Duplicate case to make sure it is not used
+            (
+                Some(f32::INFINITY),
+                Some("should not use this Infinity branch"),
+            ),
+            (Some(1.1), Some("one point one")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Float32Array, StringArray, _, _>([
+                (1.1, Some("one point one")),
+                (f32::NAN, Some("NaN")),
+                (3.2, Some("three point two")),
+                (3.2, Some("three point two")),
+                (0.0, Some("zero")),
+                (f32::INFINITY, Some("Infinity")),
+                (3.2, Some("three point two")),
+                (f32::NEG_INFINITY, None), // No match in WHEN
+                (f32::NEG_INFINITY, None), // No match in WHEN
+                (3.2, Some("three point two")),
+                (-0.0, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_f16_to_string_with_special_values() {
+        let lookup_map = create_lookup([
+            (
+                ScalarValue::Float16(Some(f16::from_f32(3.2))),
+                Some("3 dot 2"),
+            ),
+            (ScalarValue::Float16(Some(f16::NAN)), Some("NaN")),
+            (
+                ScalarValue::Float16(Some(f16::from_f32(17.4))),
+                Some("17 dot 4"),
+            ),
+            (ScalarValue::Float16(Some(f16::INFINITY)), Some("Infinity")),
+            (ScalarValue::Float16(Some(f16::ZERO)), Some("zero")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Float16Array, StringArray, _, _>([
+                (f16::from_f32(3.2), Some("3 dot 2")),
+                (f16::NAN, Some("NaN")),
+                (f16::from_f32(17.4), Some("17 dot 4")),
+                (f16::from_f32(17.4), Some("17 dot 4")),
+                (f16::INFINITY, Some("Infinity")),
+                (f16::from_f32(17.4), Some("17 dot 4")),
+                (f16::NEG_INFINITY, None), // No match in WHEN
+                (f16::NEG_INFINITY, None), // No match in WHEN
+                (f16::from_f32(17.4), Some("17 dot 4")),
+                (f16::NEG_ZERO, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_f32_to_string_with_special_values() {
+        let lookup_map = create_lookup([
+            (3.2, Some("3 dot 2")),
+            (f32::NAN, Some("NaN")),
+            (17.4, Some("17 dot 4")),
+            (f32::INFINITY, Some("Infinity")),
+            (f32::ZERO, Some("zero")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Float32Array, StringArray, _, _>([
+                (3.2, Some("3 dot 2")),
+                (f32::NAN, Some("NaN")),
+                (17.4, Some("17 dot 4")),
+                (17.4, Some("17 dot 4")),
+                (f32::INFINITY, Some("Infinity")),
+                (17.4, Some("17 dot 4")),
+                (f32::NEG_INFINITY, None), // No match in WHEN
+                (f32::NEG_INFINITY, None), // No match in WHEN
+                (17.4, Some("17 dot 4")),
+                (-0.0, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_case_when_literal_lookup_f64_to_string_with_special_values() {
+        let lookup_map = create_lookup([
+            (3.2, Some("3 dot 2")),
+            (f64::NAN, Some("NaN")),
+            (17.4, Some("17 dot 4")),
+            (f64::INFINITY, Some("Infinity")),
+            (f64::ZERO, Some("zero")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Float64Array, StringArray, _, _>([
+                (3.2, Some("3 dot 2")),
+                (f64::NAN, Some("NaN")),
+                (17.4, Some("17 dot 4")),
+                (17.4, Some("17 dot 4")),
+                (f64::INFINITY, Some("Infinity")),
+                (17.4, Some("17 dot 4")),
+                (f64::NEG_INFINITY, None), // No match in WHEN
+                (f64::NEG_INFINITY, None), // No match in WHEN
+                (17.4, Some("17 dot 4")),
+                (-0.0, None), // No match in WHEN
+            ]);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    // Test that we don't lose the decimal precision and scale info
+    #[test]
+    fn test_decimal_with_non_default_precision_and_scale() {
+        let lookup_map = create_lookup([
+            (ScalarValue::Decimal32(Some(4), 3, 2), Some("four")),
+            (ScalarValue::Decimal32(Some(2), 3, 2), Some("two")),
+            (ScalarValue::Decimal32(Some(3), 3, 2), Some("three")),
+            (ScalarValue::Decimal32(Some(1), 3, 2), Some("one")),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<Decimal32Array, StringArray, _, _>([
+                (1, Some("one")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (3, Some("three")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+                (5, None), // No match in WHEN
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+            ]);
+
+        let input_values = input_values
+            .with_precision_and_scale(3, 2)
+            .expect("must be able to set precision and scale");
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    // Test that we don't lose the timezone info
+    #[test]
+    fn test_timestamp_with_non_default_timezone() {
+        let timezone: Option<Arc<str>> = Some("-10:00".into());
+        let lookup_map = create_lookup([
+            (
+                ScalarValue::TimestampMillisecond(Some(4), timezone.clone()),
+                Some("four"),
+            ),
+            (
+                ScalarValue::TimestampMillisecond(Some(2), timezone.clone()),
+                Some("two"),
+            ),
+            (
+                ScalarValue::TimestampMillisecond(Some(3), timezone.clone()),
+                Some("three"),
+            ),
+            (
+                ScalarValue::TimestampMillisecond(Some(1), timezone.clone()),
+                Some("one"),
+            ),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<TimestampMillisecondArray, StringArray, _, _>([
+                (1, Some("one")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (3, Some("three")),
+                (2, Some("two")),
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+                (5, None), // No match in WHEN
+                (3, Some("three")),
+                (5, None), // No match in WHEN
+            ]);
+
+        let input_values = input_values.with_timezone_opt(timezone);
+
+        test_lookup_eval_with_and_without_else(
+            &lookup_map,
+            Arc::new(input_values),
+            expected,
+        );
+    }
+
+    #[test]
+    fn test_with_strings_to_int32() {
+        let lookup_map = create_lookup([
+            (Some("why"), Some(42)),
+            (Some("what"), Some(22)),
+            (Some("when"), Some(17)),
+        ]);
+
+        let (input_values, expected) =
+            create_input_and_expected::<StringArray, Int32Array, _, _>([
+                (Some("why"), Some(42)),
+                (Some("5"), None), // No match in WHEN
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (Some("what"), Some(22)),
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (None, None), // None cases are never match in CASE <expr> WHEN <value> syntax
+                (Some("what"), Some(22)),
+                (Some("5"), None), // No match in WHEN
+            ]);
+
+        let input_values = Arc::new(input_values) as ArrayRef;
+
+        // Testing without ELSE should fallback to None
+        test_case_when_literal_lookup(
+            Arc::clone(&input_values),
+            &lookup_map,
+            None,
+            Arc::new(expected.clone()),
+        );
+
+        // Testing with Else
+        let else_value = 101;
+
+        // Changing each expected None to be fallback
+        let expected_with_else = expected
+            .iter()
+            .map(|item| item.unwrap_or(else_value))
+            .map(Some)
+            .collect::<Int32Array>();
+
+        // Test case
+        test_case_when_literal_lookup(
+            input_values,
+            &lookup_map,
+            Some(ScalarValue::Int32(Some(else_value))),
+            Arc::new(expected_with_else),
+        );
+    }
 }
diff --git a/datafusion/physical-expr/src/expressions/case/literal_lookup_table/boolean_lookup_table.rs b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/boolean_lookup_table.rs
new file mode 100644
index 0000000000000..15b3d04955b2e
--- /dev/null
+++ b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/boolean_lookup_table.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::expressions::case::literal_lookup_table::WhenLiteralIndexMap;
+use arrow::array::{Array, ArrayRef, AsArray, BooleanArray};
+use arrow::datatypes::DataType;
+use datafusion_common::{ScalarValue, internal_err};
+
+#[derive(Clone, Debug)]
+pub(super) struct BooleanIndexMap {
+    true_index: Option<u32>,
+    false_index: Option<u32>,
+}
+
+impl BooleanIndexMap {
+    /// Try creating a new lookup table from the given literals and else index
+    /// The index of each literal in the vector is used as the mapped value in the lookup table.
+    ///
+    /// `literals` are guaranteed to be unique and non-nullable
+    pub(super) fn try_new(
+        unique_non_null_literals: Vec<ScalarValue>,
+    ) -> datafusion_common::Result<Self> {
+        let mut true_index: Option<u32> = None;
+        let mut false_index: Option<u32> = None;
+
+        for (index, literal) in unique_non_null_literals.into_iter().enumerate() {
+            match literal {
+                ScalarValue::Boolean(Some(true)) => {
+                    if true_index.is_some() {
+                        return internal_err!(
+                            "Duplicate true literal found in literals for BooleanIndexMap"
+                        );
+                    }
+                    true_index = Some(index as u32);
+                }
+                ScalarValue::Boolean(Some(false)) => {
+                    if false_index.is_some() {
+                        return internal_err!(
+                            "Duplicate false literal found in literals for BooleanIndexMap"
+                        );
+                    }
+                    false_index = Some(index as u32);
+                }
+                ScalarValue::Boolean(None) => {
+                    return internal_err!(
+                        "Null literal found in non-null literals for BooleanIndexMap"
+                    );
+                }
+                _ => {
+                    return internal_err!(
+                        "Non-boolean literal found in literals for BooleanIndexMap"
+                    );
+                }
+            }
+        }
+
+        Ok(Self {
+            true_index,
+            false_index,
+        })
+    }
+
+    fn map_boolean_array_to_when_indices(
+        &self,
+        array: &BooleanArray,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>> {
+        let true_index = self.true_index.unwrap_or(else_index);
+        let false_index = self.false_index.unwrap_or(else_index);
+
+        Ok(array
+            .into_iter()
+            .map(|value| match value {
+                Some(true) => true_index,
+                Some(false) => false_index,
+                None => else_index,
+            })
+            .collect::<Vec<u32>>())
+    }
+}
+
+impl WhenLiteralIndexMap for BooleanIndexMap {
+    fn map_to_when_indices(
+        &self,
+        array: &ArrayRef,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>> {
+        match array.data_type() {
+            DataType::Boolean => {
+                self.map_boolean_array_to_when_indices(array.as_boolean(), else_index)
+            }
+            // We support dictionary boolean array as we create the lookup table in `CaseWhen` expression
+            // creation when we don't know the schema, so we may receive dictionary encoded boolean arrays at execution time.
+            DataType::Dictionary(_, value_type)
+                if value_type.as_ref() == &DataType::Boolean =>
+            {
+                // Since it is not common to have dictionary encoded boolean arrays
+                // at all than it is ok to do the cast here to simplify the implementation.
+                let converted = arrow::compute::cast(array.as_ref(), &DataType::Boolean)?;
+                self.map_boolean_array_to_when_indices(converted.as_boolean(), else_index)
+            }
+            _ => internal_err!(
+                "Expected boolean array for BooleanIndexMap, got {:?}",
+                array.data_type()
+            ),
+        }
+    }
+}
diff --git a/datafusion/physical-expr/src/expressions/case/literal_lookup_table/bytes_like_lookup_table.rs b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/bytes_like_lookup_table.rs
new file mode 100644
index 0000000000000..e5cf3f84fd919
--- /dev/null
+++ b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/bytes_like_lookup_table.rs
@@ -0,0 +1,223 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::expressions::case::literal_lookup_table::WhenLiteralIndexMap;
+use arrow::array::{
+    Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, DictionaryArray,
+    FixedSizeBinaryArray, LargeBinaryArray, LargeStringArray, StringArray,
+    StringViewArray, downcast_integer,
+};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, BinaryViewType, DataType, StringViewType,
+};
+use datafusion_common::{HashMap, ScalarValue, internal_err, plan_datafusion_err};
+use std::fmt::Debug;
+
+/// Map from byte-like literal values to their first occurrence index
+///
+/// This is a wrapper for handling different kinds of literal maps
+#[derive(Clone, Debug)]
+pub(super) struct BytesLikeIndexMap {
+    /// Map from non-null literal value the first occurrence index in the literals
+    map: HashMap<Vec<u8>, u32>,
+}
+
+impl BytesLikeIndexMap {
+    /// Try creating a new lookup table from the given literals and else index
+    /// The index of each literal in the vector is used as the mapped value in the lookup table.
+    ///
+    /// `literals` are guaranteed to be unique and non-nullable
+    pub(super) fn try_new(
+        unique_non_null_literals: Vec<ScalarValue>,
+    ) -> datafusion_common::Result<Self> {
+        let input = ScalarValue::iter_to_array(unique_non_null_literals)?;
+
+        // Literals are guaranteed to not contain nulls
+        if input.logical_null_count() > 0 {
+            return internal_err!("Literal values for WHEN clauses cannot contain nulls");
+        }
+
+        let map: HashMap<Vec<u8>, u32> = try_get_bytes_iterator(&input)?
+            // Flattening Option<&[u8]> to &[u8] as literals cannot contain nulls
+            .flatten()
+            .enumerate()
+            .map(|(map_index, value)| (value.to_vec(), map_index as u32))
+            // Because literals are unique we can collect directly, and we can avoid only inserting the first occurrence
+            .collect();
+
+        Ok(Self { map })
+    }
+}
+
+impl WhenLiteralIndexMap for BytesLikeIndexMap {
+    fn map_to_when_indices(
+        &self,
+        array: &ArrayRef,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>> {
+        let indices = try_get_bytes_iterator(array)?
+            .map(|value| match value {
+                Some(value) => self.map.get(value).copied().unwrap_or(else_index),
+                None => else_index,
+            })
+            .collect::<Vec<u32>>();
+
+        Ok(indices)
+    }
+}
+
+fn try_get_bytes_iterator(
+    array: &ArrayRef,
+) -> datafusion_common::Result<Box<dyn Iterator<Item = Option<&[u8]>> + '_>> {
+    Ok(match array.data_type() {
+        DataType::Utf8 => Box::new(array.as_string::<i32>().into_iter().map(|item| {
+            item.map(|v| {
+                let bytes: &[u8] = v.as_ref();
+
+                bytes
+            })
+        })),
+
+        DataType::LargeUtf8 => {
+            Box::new(array.as_string::<i64>().into_iter().map(|item| {
+                item.map(|v| {
+                    let bytes: &[u8] = v.as_ref();
+
+                    bytes
+                })
+            }))
+        }
+
+        DataType::Binary => Box::new(array.as_binary::<i32>().into_iter()),
+
+        DataType::LargeBinary => Box::new(array.as_binary::<i64>().into_iter()),
+
+        DataType::FixedSizeBinary(_) => Box::new(array.as_binary::<i64>().into_iter()),
+
+        DataType::Utf8View => Box::new(
+            array
+                .as_byte_view::<StringViewType>()
+                .into_iter()
+                .map(|item| {
+                    item.map(|v| {
+                        let bytes: &[u8] = v.as_ref();
+
+                        bytes
+                    })
+                }),
+        ),
+        DataType::BinaryView => {
+            Box::new(array.as_byte_view::<BinaryViewType>().into_iter())
+        }
+
+        DataType::Dictionary(key, _) => {
+            macro_rules! downcast_dictionary_array_helper {
+                ($t:ty) => {{ get_bytes_iterator_for_dictionary(array.as_dictionary::<$t>())? }};
+            }
+
+            downcast_integer! {
+                key.as_ref() => (downcast_dictionary_array_helper),
+                k => unreachable!("unsupported dictionary key type: {}", k)
+            }
+        }
+        t => {
+            return Err(plan_datafusion_err!(
+                "Unsupported data type for bytes lookup table: {}",
+                t
+            ));
+        }
+    })
+}
+
+fn get_bytes_iterator_for_dictionary<K: ArrowDictionaryKeyType + Send + Sync>(
+    array: &DictionaryArray<K>,
+) -> datafusion_common::Result<Box<dyn Iterator<Item = Option<&[u8]>> + '_>> {
+    Ok(match array.values().data_type() {
+        DataType::Utf8 => Box::new(
+            array
+                .downcast_dict::<StringArray>()
+                .unwrap()
+                .into_iter()
+                .map(|item| {
+                    item.map(|v| {
+                        let bytes: &[u8] = v.as_ref();
+
+                        bytes
+                    })
+                }),
+        ),
+
+        DataType::LargeUtf8 => Box::new(
+            array
+                .downcast_dict::<LargeStringArray>()
+                .unwrap()
+                .into_iter()
+                .map(|item| {
+                    item.map(|v| {
+                        let bytes: &[u8] = v.as_ref();
+
+                        bytes
+                    })
+                }),
+        ),
+
+        DataType::Binary => {
+            Box::new(array.downcast_dict::<BinaryArray>().unwrap().into_iter())
+        }
+
+        DataType::LargeBinary => Box::new(
+            array
+                .downcast_dict::<LargeBinaryArray>()
+                .unwrap()
+                .into_iter(),
+        ),
+
+        DataType::FixedSizeBinary(_) => Box::new(
+            array
+                .downcast_dict::<FixedSizeBinaryArray>()
+                .unwrap()
+                .into_iter(),
+        ),
+
+        DataType::Utf8View => Box::new(
+            array
+                .downcast_dict::<StringViewArray>()
+                .unwrap()
+                .into_iter()
+                .map(|item| {
+                    item.map(|v| {
+                        let bytes: &[u8] = v.as_ref();
+
+                        bytes
+                    })
+                }),
+        ),
+        DataType::BinaryView => Box::new(
+            array
+                .downcast_dict::<BinaryViewArray>()
+                .unwrap()
+                .into_iter(),
+        ),
+
+        t => {
+            return Err(plan_datafusion_err!(
+                "Unsupported data type for lookup table dictionary value: {}",
+                t
+            ));
+        }
+    })
+}
diff --git a/datafusion/physical-expr/src/expressions/case/literal_lookup_table/mod.rs b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/mod.rs
new file mode 100644
index 0000000000000..67b045f9988f8
--- /dev/null
+++ b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/mod.rs
@@ -0,0 +1,327 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod boolean_lookup_table;
+mod bytes_like_lookup_table;
+mod primitive_lookup_table;
+
+use crate::expressions::Literal;
+use crate::expressions::case::CaseBody;
+use crate::expressions::case::literal_lookup_table::boolean_lookup_table::BooleanIndexMap;
+use crate::expressions::case::literal_lookup_table::bytes_like_lookup_table::BytesLikeIndexMap;
+use crate::expressions::case::literal_lookup_table::primitive_lookup_table::PrimitiveIndexMap;
+use arrow::array::{Array, ArrayRef, UInt32Array, downcast_primitive};
+use arrow::datatypes::DataType;
+use datafusion_common::{ScalarValue, arrow_datafusion_err, plan_datafusion_err};
+use indexmap::IndexMap;
+use std::fmt::Debug;
+
+/// Optimization for CASE expressions with literal WHEN and THEN clauses
+///
+/// for this form:
+/// ```sql
+/// CASE <expr_a>
+///     WHEN <literal_a> THEN <literal_e>
+///     WHEN <literal_b> THEN <literal_f>
+///     WHEN <literal_c> THEN <literal_g>
+///     WHEN <literal_d> THEN <literal_h>
+///     ELSE <optional-fallback_literal>
+/// END
+/// ```
+///
+/// # Improvement idea
+/// TODO - we should think of unwrapping the `IN` expressions into multiple equality comparisons
+/// so it will use this optimization as well, e.g.
+/// ```sql
+/// -- Before
+/// CASE
+///     WHEN (<expr_a> = <literal_a>) THEN <literal_e>
+///     WHEN (<expr_a> in (<literal_b>, <literal_c>) THEN <literal_f>
+///     WHEN (<expr_a> = <literal_d>) THEN <literal_g>
+/// ELSE <optional-fallback_literal>
+///
+/// -- After
+/// CASE
+///     WHEN (<expr_a> = <literal_a>) THEN <literal_e>
+///     WHEN (<expr_a> = <literal_b>) THEN <literal_f>
+///     WHEN (<expr_a> = <literal_c>) THEN <literal_g>
+///     WHEN (<expr_a> = <literal_d>) THEN <literal_h>
+///     ELSE <optional-fallback_literal>
+/// END
+/// ```
+///
+#[derive(Debug)]
+pub(in super::super) struct LiteralLookupTable {
+    /// The lookup table to use for evaluating the CASE expression
+    lookup: Box<dyn WhenLiteralIndexMap>,
+
+    else_index: u32,
+
+    /// [`ArrayRef`] where `array[i] = then_literals[i]`
+    /// the last value in the array is the else_expr
+    ///
+    /// This will be used to take from based on the indices returned by the lookup table to build the final output
+    then_and_else_values: ArrayRef,
+}
+
+impl LiteralLookupTable {
+    pub(in super::super) fn maybe_new(body: &CaseBody) -> Option<Self> {
+        // We can't use the optimization if we don't have any when then pairs
+        if body.when_then_expr.is_empty() {
+            return None;
+        }
+
+        // If we only have 1 than this optimization is not useful
+        if body.when_then_expr.len() == 1 {
+            return None;
+        }
+
+        // Try to downcast all the WHEN/THEN expressions to literals
+        let when_then_exprs_maybe_literals = body
+            .when_then_expr
+            .iter()
+            .map(|(when, then)| {
+                let when_maybe_literal = when.as_any().downcast_ref::<Literal>();
+                let then_maybe_literal = then.as_any().downcast_ref::<Literal>();
+
+                when_maybe_literal.zip(then_maybe_literal)
+            })
+            .collect::<Vec<_>>();
+
+        // If not all the WHEN/THEN expressions are literals we cannot use this optimization
+        if when_then_exprs_maybe_literals.contains(&None) {
+            return None;
+        }
+
+        let when_then_exprs_scalars = when_then_exprs_maybe_literals
+            .into_iter()
+            // Unwrap the options as we have already checked there is no None
+            .flatten()
+            .map(|(when_lit, then_lit)| {
+                (when_lit.value().clone(), then_lit.value().clone())
+            })
+            // Only keep non-null WHEN literals
+            // as they cannot be matched - case NULL WHEN NULL THEN ... ELSE ... END always goes to ELSE
+            .filter(|(when_lit, _)| !when_lit.is_null())
+            .collect::<Vec<_>>();
+
+        if when_then_exprs_scalars.is_empty() {
+            // All WHEN literals were nulls, so cannot use optimization
+            //
+            // instead, another optimization would be to go straight to the ELSE clause
+            return None;
+        }
+
+        // Keep only the first occurrence of each when literal (as the first match is used)
+        // and remove nulls (as they cannot be matched - case NULL WHEN NULL THEN ... ELSE ... END always goes to ELSE)
+        let (when, then): (Vec<ScalarValue>, Vec<ScalarValue>) = {
+            let mut map = IndexMap::with_capacity(body.when_then_expr.len());
+
+            for (when, then) in when_then_exprs_scalars.into_iter() {
+                // Don't overwrite existing entries as we want to keep the first occurrence
+                if !map.contains_key(&when) {
+                    map.insert(when, then);
+                }
+            }
+
+            map.into_iter().unzip()
+        };
+
+        let else_value: ScalarValue = if let Some(else_expr) = &body.else_expr {
+            let literal = else_expr.as_any().downcast_ref::<Literal>()?;
+
+            literal.value().clone()
+        } else {
+            let Ok(null_scalar) = ScalarValue::try_new_null(&then[0].data_type()) else {
+                return None;
+            };
+
+            null_scalar
+        };
+
+        {
+            let when_data_type = when[0].data_type();
+
+            // If not all the WHEN literals are the same data type we cannot use this optimization
+            if when.iter().any(|l| l.data_type() != when_data_type) {
+                return None;
+            }
+        }
+
+        {
+            let data_type = then[0].data_type();
+
+            // If not all the then and the else literals are the same data type we cannot use this optimization
+            if then.iter().any(|l| l.data_type() != data_type) {
+                return None;
+            }
+
+            if else_value.data_type() != data_type {
+                return None;
+            }
+        }
+
+        let then_and_else_values = ScalarValue::iter_to_array(
+            then.iter()
+                // The else is in the end
+                .chain(std::iter::once(&else_value))
+                .cloned(),
+        )
+        .ok()?;
+        // The else expression is in the end
+        let else_index = then_and_else_values.len() as u32 - 1;
+
+        let lookup = try_creating_lookup_table(when).ok()?;
+
+        Some(Self {
+            lookup,
+            then_and_else_values,
+            else_index,
+        })
+    }
+
+    pub(in super::super) fn map_keys_to_values(
+        &self,
+        keys_array: &ArrayRef,
+    ) -> datafusion_common::Result<ArrayRef> {
+        let take_indices = self
+            .lookup
+            .map_to_when_indices(keys_array, self.else_index)?;
+
+        // Zero-copy conversion
+        let take_indices = UInt32Array::from(take_indices);
+
+        // An optimize version would depend on the type of the values_to_take_from
+        // For example, if the type is view we can just keep pointing to the same value (similar to dictionary)
+        // if the type is dictionary we can just use the indices as is (or cast them to the key type) and create a new dictionary array
+        let output =
+            arrow::compute::take(&self.then_and_else_values, &take_indices, None)
+                .map_err(|e| arrow_datafusion_err!(e))?;
+
+        Ok(output)
+    }
+}
+
+/// Map values that match the WHEN literal to the index of their corresponding WHEN clause
+///
+/// For example, for this CASE expression:
+///
+/// ```sql
+/// CASE <expr_a>
+///     WHEN <literal_a> THEN <result_e>
+///     WHEN <literal_b> THEN <result_f>
+///     WHEN <literal_c> THEN <result_g>
+///     WHEN <literal_d> THEN <result_h>
+///     ELSE <fallback_result>
+/// END
+/// ```
+///
+/// this will map <literal_a> to 0, <literal_b> to 1, <literal_c> to 2, <literal_d> to 3
+pub(super) trait WhenLiteralIndexMap: Debug + Send + Sync {
+    /// Given an array of values, returns a vector of WHEN clause indices corresponding to each value in the provided array.
+    ///
+    /// For example, for this CASE expression:
+    ///
+    /// ```sql
+    /// CASE <expr_a>
+    ///     WHEN <literal_a> THEN <result_e>
+    ///     WHEN <literal_b> THEN <result_f>
+    ///     WHEN <literal_c> THEN <result_g>
+    ///     WHEN <literal_d> THEN <result_h>
+    ///     ELSE <fallback_result>
+    /// END
+    /// ```
+    ///
+    /// the array will be the evaluated values of `<expr_a>`
+    /// and if that array is:
+    /// - `[<literal_a>, <literal_c>, <literal_x>, <literal_b>, <literal_a>]`
+    ///
+    /// the returned vector will be:
+    /// - `[0, 2, else_index, 1, 0]`
+    ///
+    fn map_to_when_indices(
+        &self,
+        array: &ArrayRef,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>>;
+}
+
+fn try_creating_lookup_table(
+    unique_non_null_literals: Vec<ScalarValue>,
+) -> datafusion_common::Result<Box<dyn WhenLiteralIndexMap>> {
+    assert_ne!(
+        unique_non_null_literals.len(),
+        0,
+        "Must have at least one literal"
+    );
+    match unique_non_null_literals[0].data_type() {
+        DataType::Boolean => {
+            let lookup_table = BooleanIndexMap::try_new(unique_non_null_literals)?;
+            Ok(Box::new(lookup_table))
+        }
+
+        data_type if data_type.is_primitive() => {
+            macro_rules! create_matching_map {
+                ($t:ty) => {{
+                    let lookup_table =
+                        PrimitiveIndexMap::<$t>::try_new(unique_non_null_literals)?;
+                    Ok(Box::new(lookup_table))
+                }};
+            }
+
+            downcast_primitive! {
+                data_type => (create_matching_map),
+                _ => Err(plan_datafusion_err!(
+                    "Unsupported field type for primitive: {:?}",
+                    data_type
+                )),
+            }
+        }
+
+        DataType::Utf8
+        | DataType::LargeUtf8
+        | DataType::Binary
+        | DataType::LargeBinary
+        | DataType::FixedSizeBinary(_)
+        | DataType::Utf8View
+        | DataType::BinaryView => {
+            let lookup_table = BytesLikeIndexMap::try_new(unique_non_null_literals)?;
+            Ok(Box::new(lookup_table))
+        }
+
+        DataType::Dictionary(_key, value)
+            if matches!(
+                value.as_ref(),
+                DataType::Utf8
+                    | DataType::LargeUtf8
+                    | DataType::Binary
+                    | DataType::LargeBinary
+                    | DataType::FixedSizeBinary(_)
+                    | DataType::Utf8View
+                    | DataType::BinaryView
+            ) =>
+        {
+            let lookup_table = BytesLikeIndexMap::try_new(unique_non_null_literals)?;
+            Ok(Box::new(lookup_table))
+        }
+
+        _ => Err(plan_datafusion_err!(
+            "Unsupported data type for lookup table: {}",
+            unique_non_null_literals[0].data_type()
+        )),
+    }
+}
diff --git a/datafusion/physical-expr/src/expressions/case/literal_lookup_table/primitive_lookup_table.rs b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/primitive_lookup_table.rs
new file mode 100644
index 0000000000000..36d282c2a402b
--- /dev/null
+++ b/datafusion/physical-expr/src/expressions/case/literal_lookup_table/primitive_lookup_table.rs
@@ -0,0 +1,229 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::expressions::case::literal_lookup_table::WhenLiteralIndexMap;
+use arrow::array::{
+    Array, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, AsArray, PrimitiveArray,
+};
+use arrow::datatypes::{DataType, IntervalDayTime, IntervalMonthDayNano, i256};
+use datafusion_common::{HashMap, ScalarValue, internal_err};
+use half::f16;
+use std::fmt::Debug;
+use std::hash::Hash;
+
+#[derive(Clone)]
+pub(super) struct PrimitiveIndexMap<T>
+where
+    T: ArrowPrimitiveType,
+    T::Native: ToHashableKey,
+{
+    data_type: DataType,
+    /// Literal value to map index
+    ///
+    /// If searching this map becomes a bottleneck consider using linear map implementations for small hashmaps
+    map: HashMap<<T::Native as ToHashableKey>::HashableKey, u32>,
+}
+
+impl<T> Debug for PrimitiveIndexMap<T>
+where
+    T: ArrowPrimitiveType,
+    T::Native: ToHashableKey,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PrimitiveIndexMap")
+            .field("map", &self.map)
+            .finish()
+    }
+}
+
+impl<T> PrimitiveIndexMap<T>
+where
+    T: ArrowPrimitiveType,
+    T::Native: ToHashableKey,
+{
+    /// Try creating a new lookup table from the given literals and else index.
+    /// The index of each literal in the vector is used as the mapped value in the lookup table.
+    ///
+    /// `literals` are guaranteed to be unique and non-nullable
+    pub(super) fn try_new(
+        unique_non_null_literals: Vec<ScalarValue>,
+    ) -> datafusion_common::Result<Self> {
+        let input = ScalarValue::iter_to_array(unique_non_null_literals)?;
+
+        // Literals are guaranteed to not contain nulls
+        if input.null_count() > 0 {
+            return internal_err!("Literal values for WHEN clauses cannot contain nulls");
+        }
+
+        let map = input
+            .as_primitive::<T>()
+            .values()
+            .iter()
+            .enumerate()
+            // Because literals are unique we can collect directly, and we can avoid only inserting the first occurrence
+            .map(|(map_index, value)| (value.into_hashable_key(), map_index as u32))
+            .collect();
+
+        Ok(Self {
+            map,
+            data_type: input.data_type().clone(),
+        })
+    }
+
+    fn map_primitive_array_to_when_indices(
+        &self,
+        array: &PrimitiveArray<T>,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>> {
+        let indices = array
+            .into_iter()
+            .map(|value| match value {
+                Some(value) => self
+                    .map
+                    .get(&value.into_hashable_key())
+                    .copied()
+                    .unwrap_or(else_index),
+
+                None => else_index,
+            })
+            .collect::<Vec<u32>>();
+
+        Ok(indices)
+    }
+}
+
+impl<T> WhenLiteralIndexMap for PrimitiveIndexMap<T>
+where
+    T: ArrowPrimitiveType,
+    T::Native: ToHashableKey,
+{
+    fn map_to_when_indices(
+        &self,
+        array: &ArrayRef,
+        else_index: u32,
+    ) -> datafusion_common::Result<Vec<u32>> {
+        match array.data_type() {
+            dt if dt == &self.data_type => {
+                let primitive_array = array.as_primitive::<T>();
+
+                self.map_primitive_array_to_when_indices(primitive_array, else_index)
+            }
+            // We support dictionary primitive array as we create the lookup table in `CaseWhen` expression
+            // creation when we don't know the schema, so we may receive dictionary encoded primitive arrays at execution time.
+            DataType::Dictionary(_, value_type)
+                if value_type.as_ref() == &self.data_type =>
+            {
+                // Cast here to simplify the implementation.
+                let converted = arrow::compute::cast(array.as_ref(), &self.data_type)?;
+                self.map_primitive_array_to_when_indices(
+                    converted.as_primitive::<T>(),
+                    else_index,
+                )
+            }
+            _ => internal_err!(
+                "PrimitiveIndexMap expected array of type {:?} but got {:?}",
+                self.data_type,
+                array.data_type()
+            ),
+        }
+    }
+}
+
+// TODO - We need to port it to arrow so that it can be reused in other places
+
+/// Trait that help convert a value to a key that is hashable and equatable
+/// This is needed as some types like f16/f32/f64 do not implement Hash/Eq directly
+pub(super) trait ToHashableKey: ArrowNativeTypeOp {
+    /// The type that is hashable and equatable
+    /// It must be an Arrow native type but it NOT GUARANTEED to be the same as Self
+    /// this is just a helper trait so you can reuse the same code for all arrow native types
+    type HashableKey: Hash + Eq + Debug + Clone + Copy + Send + Sync;
+
+    /// Converts self to a hashable key
+    /// the result of this value can be used as the key in hash maps/sets
+    fn into_hashable_key(self) -> Self::HashableKey;
+}
+
+macro_rules! impl_to_hashable_key {
+    (@single_already_hashable | $t:ty) => {
+        impl ToHashableKey for $t {
+            type HashableKey = $t;
+
+            #[inline]
+            fn into_hashable_key(self) -> Self::HashableKey {
+                self
+            }
+        }
+    };
+    (@already_hashable | $($t:ty),+ $(,)?) => {
+        $(
+            impl_to_hashable_key!(@single_already_hashable | $t);
+        )+
+    };
+    (@float | $t:ty => $hashable:ty) => {
+        impl ToHashableKey for $t {
+            type HashableKey = $hashable;
+
+            #[inline]
+            fn into_hashable_key(self) -> Self::HashableKey {
+                self.to_bits()
+            }
+        }
+    };
+}
+
+impl_to_hashable_key!(@already_hashable | i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, IntervalDayTime, IntervalMonthDayNano);
+impl_to_hashable_key!(@float | f16 => u16);
+impl_to_hashable_key!(@float | f32 => u32);
+impl_to_hashable_key!(@float | f64 => u64);
+
+#[cfg(test)]
+mod tests {
+    use super::ToHashableKey;
+    use arrow::array::downcast_primitive;
+
+    // This test ensure that all arrow primitive types implement ToHashableKey
+    // otherwise the code will not compile
+    #[test]
+    fn should_implement_to_hashable_key_for_all_primitives() {
+        #[derive(Debug, Default)]
+        struct ExampleSet<T>
+        where
+            T: arrow::datatypes::ArrowPrimitiveType,
+            T::Native: ToHashableKey,
+        {
+            _map: std::collections::HashSet<<T::Native as ToHashableKey>::HashableKey>,
+        }
+
+        macro_rules! create_matching_set {
+            ($t:ty) => {{
+                let _lookup_table = ExampleSet::<$t> {
+                    _map: Default::default(),
+                };
+
+                return;
+            }};
+        }
+
+        let data_type = arrow::datatypes::DataType::Float16;
+
+        downcast_primitive! {
+            data_type => (create_matching_set),
+            _ => panic!("not implemented for {data_type}"),
+        }
+    }
+}
diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs
index 7e345e60271fd..5a80daf663a36 100644
--- a/datafusion/physical-expr/src/expressions/cast.rs
+++ b/datafusion/physical-expr/src/expressions/cast.rs
@@ -22,11 +22,13 @@ use std::sync::Arc;
 
 use crate::physical_expr::PhysicalExpr;
 
-use arrow::compute::{can_cast_types, CastOptions};
+use arrow::compute::{CastOptions, can_cast_types};
 use arrow::datatypes::{DataType, DataType::*, FieldRef, Schema};
 use arrow::record_batch::RecordBatch;
+use datafusion_common::datatype::DataTypeExt;
 use datafusion_common::format::DEFAULT_FORMAT_OPTIONS;
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::nested_struct::validate_struct_compatibility;
+use datafusion_common::{Result, not_impl_err};
 use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_expr_common::interval_arithmetic::Interval;
 use datafusion_expr_common::sort_properties::ExprProperties;
@@ -41,13 +43,29 @@ const DEFAULT_SAFE_CAST_OPTIONS: CastOptions<'static> = CastOptions {
     format_options: DEFAULT_FORMAT_OPTIONS,
 };
 
+/// Check if struct-to-struct casting is allowed by validating field compatibility.
+///
+/// This function applies the same validation rules as execution time to ensure
+/// planning-time validation matches runtime validation, enabling fail-fast behavior
+/// instead of deferring errors to execution.
+fn can_cast_struct_types(source: &DataType, target: &DataType) -> bool {
+    match (source, target) {
+        (Struct(source_fields), Struct(target_fields)) => {
+            // Apply the same struct compatibility rules as at execution time.
+            // This ensures planning-time validation matches execution-time validation.
+            validate_struct_compatibility(source_fields, target_fields).is_ok()
+        }
+        _ => false,
+    }
+}
+
 /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast
 #[derive(Debug, Clone, Eq)]
 pub struct CastExpr {
     /// The expression to cast
     pub expr: Arc<dyn PhysicalExpr>,
-    /// The data type to cast to
-    cast_type: DataType,
+    /// Field metadata describing the desired output after casting
+    target_field: FieldRef,
     /// Cast options
     cast_options: CastOptions<'static>,
 }
@@ -56,7 +74,7 @@ pub struct CastExpr {
 impl PartialEq for CastExpr {
     fn eq(&self, other: &Self) -> bool {
         self.expr.eq(&other.expr)
-            && self.cast_type.eq(&other.cast_type)
+            && self.target_field.eq(&other.target_field)
             && self.cast_options.eq(&other.cast_options)
     }
 }
@@ -64,21 +82,55 @@ impl PartialEq for CastExpr {
 impl Hash for CastExpr {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
         self.expr.hash(state);
-        self.cast_type.hash(state);
+        self.target_field.hash(state);
         self.cast_options.hash(state);
     }
 }
 
 impl CastExpr {
-    /// Create a new CastExpr
+    /// Create a new `CastExpr` using only a `DataType`.
+    ///
+    /// This constructor is provided for compatibility with existing call sites
+    /// that only know the target type.  It synthesizes a ``Field`` with the
+    /// given type (**nullable by default**) and no name metadata.  Callers that
+    /// already have a `FieldRef` (for example, coming from schema inference or a
+    /// resolved column) should prefer [`CastExpr::new_with_target_field`], which
+    /// preserves the field's name, nullability, and other metadata.  In other
+    /// words:
+    ///
+    /// * use `new()` when only a `DataType` is available and you want the legacy
+    ///   semantics of a type-only cast
+    /// * use `new_with_target_field()` when you need explicit field
+    ///   metadata/name/nullability preserved
     pub fn new(
         expr: Arc<dyn PhysicalExpr>,
         cast_type: DataType,
         cast_options: Option<CastOptions<'static>>,
+    ) -> Self {
+        Self::new_with_target_field(
+            expr,
+            cast_type.into_nullable_field_ref(),
+            cast_options,
+        )
+    }
+
+    /// Create a new `CastExpr` with an explicit target `FieldRef`.
+    ///
+    /// The provided `target_field` is used verbatim for the expression's
+    /// return schema, so the field's name, nullability, and other metadata are
+    /// preserved.  This is the preferred constructor when the caller already
+    /// has field information (for example, during logical-to-physical planning).
+    ///
+    /// See [`CastExpr::new`] for the compatibility constructor that only accepts
+    /// a `DataType`.
+    pub fn new_with_target_field(
+        expr: Arc<dyn PhysicalExpr>,
+        target_field: FieldRef,
+        cast_options: Option<CastOptions<'static>>,
     ) -> Self {
         Self {
             expr,
-            cast_type,
+            target_field,
             cast_options: cast_options.unwrap_or(DEFAULT_CAST_OPTIONS),
         }
     }
@@ -90,19 +142,48 @@ impl CastExpr {
 
     /// The data type to cast to
     pub fn cast_type(&self) -> &DataType {
-        &self.cast_type
+        self.target_field.data_type()
+    }
+
+    /// Field metadata describing the output column after casting.
+    pub fn target_field(&self) -> &FieldRef {
+        &self.target_field
     }
 
     /// The cast options
     pub fn cast_options(&self) -> &CastOptions<'static> {
         &self.cast_options
     }
-    pub fn is_bigger_cast(&self, src: DataType) -> bool {
-        if src == self.cast_type {
+
+    fn is_default_target_field(&self) -> bool {
+        self.target_field.name().is_empty()
+            && self.target_field.is_nullable()
+            && self.target_field.metadata().is_empty()
+    }
+
+    fn resolved_target_field(&self, input_schema: &Schema) -> Result<FieldRef> {
+        if self.is_default_target_field() {
+            self.expr.return_field(input_schema).map(|field| {
+                Arc::new(
+                    field
+                        .as_ref()
+                        .clone()
+                        .with_data_type(self.cast_type().clone()),
+                )
+            })
+        } else {
+            Ok(Arc::clone(&self.target_field))
+        }
+    }
+
+    /// Check if casting from the specified source type to the target type is a
+    /// widening cast (e.g. from `Int8` to `Int16`).
+    pub fn check_bigger_cast(cast_type: &DataType, src: &DataType) -> bool {
+        if cast_type.eq(src) {
             return true;
         }
         matches!(
-            (src, &self.cast_type),
+            (src, cast_type),
             (Int8, Int16 | Int32 | Int64)
                 | (Int16, Int32 | Int64)
                 | (Int32, Int64)
@@ -117,11 +198,37 @@ impl CastExpr {
                 | (Utf8, LargeUtf8)
         )
     }
+
+    /// Check if the cast is a widening cast (e.g. from `Int8` to `Int16`).
+    pub fn is_bigger_cast(&self, src: &DataType) -> bool {
+        Self::check_bigger_cast(self.cast_type(), src)
+    }
+}
+
+pub(crate) fn is_order_preserving_cast_family(
+    source_type: &DataType,
+    target_type: &DataType,
+) -> bool {
+    (source_type.is_numeric() || *source_type == Boolean) && target_type.is_numeric()
+        || source_type.is_temporal() && target_type.is_temporal()
+        || source_type.eq(target_type)
+}
+
+pub(crate) fn cast_expr_properties(
+    child: &ExprProperties,
+    target_type: &DataType,
+) -> Result<ExprProperties> {
+    let unbounded = Interval::make_unbounded(target_type)?;
+    if is_order_preserving_cast_family(&child.range.data_type(), target_type) {
+        Ok(child.clone().with_range(unbounded))
+    } else {
+        Ok(ExprProperties::new_unknown().with_range(unbounded))
+    }
 }
 
 impl fmt::Display for CastExpr {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "CAST({} AS {:?})", self.expr, self.cast_type)
+        write!(f, "CAST({} AS {})", self.expr, self.cast_type())
     }
 }
 
@@ -132,26 +239,27 @@ impl PhysicalExpr for CastExpr {
     }
 
     fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
-        Ok(self.cast_type.clone())
+        Ok(self.cast_type().clone())
     }
 
     fn nullable(&self, input_schema: &Schema) -> Result<bool> {
-        self.expr.nullable(input_schema)
+        // A cast is nullable if **either** the child is nullable or the
+        // target field allows nulls.  This conservative rule prevents
+        // optimizers from assuming a non-null result when a null input could
+        // still propagate.  `return_field()` continues to expose the exact
+        // target metadata separately.
+        let child_nullable = self.expr.nullable(input_schema)?;
+        let target_nullable = self.resolved_target_field(input_schema)?.is_nullable();
+        Ok(child_nullable || target_nullable)
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
         let value = self.expr.evaluate(batch)?;
-        value.cast_to(&self.cast_type, Some(&self.cast_options))
+        value.cast_to(self.cast_type(), Some(&self.cast_options))
     }
 
     fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
-        Ok(self
-            .expr
-            .return_field(input_schema)?
-            .as_ref()
-            .clone()
-            .with_data_type(self.cast_type.clone())
-            .into())
+        self.resolved_target_field(input_schema)
     }
 
     fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
@@ -162,16 +270,16 @@ impl PhysicalExpr for CastExpr {
         self: Arc<Self>,
         children: Vec<Arc<dyn PhysicalExpr>>,
     ) -> Result<Arc<dyn PhysicalExpr>> {
-        Ok(Arc::new(CastExpr::new(
+        Ok(Arc::new(CastExpr::new_with_target_field(
             Arc::clone(&children[0]),
-            self.cast_type.clone(),
+            Arc::clone(&self.target_field),
             Some(self.cast_options.clone()),
         )))
     }
 
     fn evaluate_bounds(&self, children: &[&Interval]) -> Result<Interval> {
         // Cast current node's interval to the right type:
-        children[0].cast_to(&self.cast_type, &self.cast_options)
+        children[0].cast_to(self.cast_type(), &self.cast_options)
     }
 
     fn propagate_constraints(
@@ -183,32 +291,20 @@ impl PhysicalExpr for CastExpr {
         // Get child's datatype:
         let cast_type = child_interval.data_type();
         Ok(Some(vec![
-            interval.cast_to(&cast_type, &DEFAULT_SAFE_CAST_OPTIONS)?
+            interval.cast_to(&cast_type, &DEFAULT_SAFE_CAST_OPTIONS)?,
         ]))
     }
 
     /// A [`CastExpr`] preserves the ordering of its child if the cast is done
     /// under the same datatype family.
     fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> {
-        let source_datatype = children[0].range.data_type();
-        let target_type = &self.cast_type;
-
-        let unbounded = Interval::make_unbounded(target_type)?;
-        if (source_datatype.is_numeric() || source_datatype == Boolean)
-            && target_type.is_numeric()
-            || source_datatype.is_temporal() && target_type.is_temporal()
-            || source_datatype.eq(target_type)
-        {
-            Ok(children[0].clone().with_range(unbounded))
-        } else {
-            Ok(ExprProperties::new_unknown().with_range(unbounded))
-        }
+        cast_expr_properties(&children[0], self.cast_type())
     }
 
     fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "CAST(")?;
         self.expr.fmt_sql(f)?;
-        write!(f, " AS {:?}", self.cast_type)?;
+        write!(f, " AS {:?}", self.cast_type())?;
 
         write!(f, ")")
     }
@@ -227,10 +323,20 @@ pub fn cast_with_options(
     let expr_type = expr.data_type(input_schema)?;
     if expr_type == cast_type {
         Ok(Arc::clone(&expr))
+    } else if matches!((&expr_type, &cast_type), (Struct(_), Struct(_))) {
+        if can_cast_struct_types(&expr_type, &cast_type) {
+            // Allow struct-to-struct casts that pass name-based compatibility validation.
+            // This validation is applied at planning time (now) to fail fast, rather than
+            // deferring errors to execution time. The name-based casting logic will be
+            // executed at runtime via ColumnarValue::cast_to.
+            Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options)))
+        } else {
+            not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}")
+        }
     } else if can_cast_types(&expr_type, &cast_type) {
         Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options)))
     } else {
-        not_impl_err!("Unsupported CAST from {expr_type:?} to {cast_type:?}")
+        not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}")
     }
 }
 
@@ -254,14 +360,15 @@ mod tests {
 
     use arrow::{
         array::{
-            Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array,
-            Int64Array, Int8Array, StringArray, Time64NanosecondArray,
+            Array, Decimal128Array, Float32Array, Float64Array, Int8Array, Int16Array,
+            Int32Array, Int64Array, StringArray, Time64NanosecondArray,
             TimestampNanosecondArray, UInt32Array,
         },
         datatypes::*,
     };
-    use datafusion_common::assert_contains;
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
+    use insta::assert_snapshot;
+    use std::collections::HashMap;
 
     // runs an end-to-end test of physical type cast
     // 1. construct a record batch with a column "a" of type A
@@ -281,10 +388,7 @@ mod tests {
                 cast_with_options(col("a", &schema)?, &schema, $TYPE, $CAST_OPTIONS)?;
 
             // verify that its display is correct
-            assert_eq!(
-                format!("CAST(a@0 AS {:?})", $TYPE),
-                format!("{}", expression)
-            );
+            assert_eq!(format!("CAST(a@0 AS {})", $TYPE), format!("{}", expression));
 
             // verify that the expression's type is correct
             assert_eq!(expression.data_type(&schema)?, $TYPE);
@@ -308,7 +412,7 @@ mod tests {
             for (i, x) in $VEC.iter().enumerate() {
                 match x {
                     Some(x) => assert_eq!(result.value(i), *x),
-                    None => assert!(!result.is_valid(i)),
+                    None => assert!(result.is_null(i)),
                 }
             }
         }};
@@ -333,10 +437,7 @@ mod tests {
                 cast_with_options(col("a", &schema)?, &schema, $TYPE, $CAST_OPTIONS)?;
 
             // verify that its display is correct
-            assert_eq!(
-                format!("CAST(a@0 AS {:?})", $TYPE),
-                format!("{}", expression)
-            );
+            assert_eq!(format!("CAST(a@0 AS {})", $TYPE), format!("{}", expression));
 
             // verify that the expression's type is correct
             assert_eq!(expression.data_type(&schema)?, $TYPE);
@@ -363,7 +464,7 @@ mod tests {
             for (i, x) in $VEC.iter().enumerate() {
                 match x {
                     Some(x) => assert_eq!(result.value(i), *x),
-                    None => assert!(!result.is_valid(i)),
+                    None => assert!(result.is_null(i)),
                 }
             }
         }};
@@ -436,12 +537,9 @@ mod tests {
         )?;
         let expression =
             cast_with_options(col("a", &schema)?, &schema, Decimal128(6, 2), None)?;
-        let e = expression.evaluate(&batch).unwrap_err(); // panics on OK
-        assert_contains!(
-            e.to_string(),
-            "Arrow error: Invalid argument error: 12345679 is too large to store in a Decimal128 of precision 6. Max is 999999"
-        );
-
+        let e = expression.evaluate(&batch).unwrap_err().strip_backtrace(); // panics on OK
+        assert_snapshot!(e, @"Arrow error: Invalid argument error: 123456.79 is too large to store in a Decimal128 of precision 6. Max is 9999.99");
+        // safe cast should return null
         let expression_safe = cast_with_options(
             col("a", &schema)?,
             &schema,
@@ -741,6 +839,9 @@ mod tests {
         Ok(())
     }
 
+    // Tests for timestamp timezone casting have been moved to timestamps.slt
+    // See the "Casting between timestamp with and without timezone" section
+
     #[test]
     fn invalid_cast() {
         // Ensure a useful error happens at plan time if invalid casts are used
@@ -766,14 +867,115 @@ mod tests {
         match result {
             Ok(_) => panic!("expected error"),
             Err(e) => {
-                assert!(e
-                    .to_string()
-                    .contains("Cannot cast string '9.1' to value of Int32 type"))
+                assert!(
+                    e.to_string()
+                        .contains("Cannot cast string '9.1' to value of Int32 type")
+                )
             }
         }
         Ok(())
     }
 
+    #[test]
+    fn field_aware_cast_return_field_preserves_target_metadata() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", Int32, false)]);
+        let expr = CastExpr::new_with_target_field(
+            col("a", &schema)?,
+            Arc::new(Field::new("cast_target", Int64, true).with_metadata(
+                HashMap::from([("target_meta".to_string(), "1".to_string())]),
+            )),
+            None,
+        );
+
+        let field = expr.return_field(&schema)?;
+
+        assert_eq!(field.name(), "cast_target");
+        assert_eq!(field.data_type(), &Int64);
+        assert!(field.is_nullable());
+        assert_eq!(
+            field.metadata().get("target_meta").map(String::as_str),
+            Some("1")
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn field_aware_cast_nullable_prefers_child_nullability() -> Result<()> {
+        // When the child expression is nullable the cast must be treated as
+        // nullable even if the explicitly supplied target field is marked
+        // non-nullable.  return_field() still reflects the target metadata.
+        let schema = Schema::new(vec![Field::new("a", Int32, true)]);
+        let expr = CastExpr::new_with_target_field(
+            col("a", &schema)?,
+            Arc::new(Field::new("cast_target", Int64, false)),
+            None,
+        );
+
+        assert!(expr.nullable(&schema)?);
+        assert!(!expr.return_field(&schema)?.is_nullable());
+
+        Ok(())
+    }
+
+    #[test]
+    fn type_only_cast_preserves_legacy_field_name_and_nullability() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", Int32, false)]);
+        let expr = CastExpr::new(col("a", &schema)?, Int64, None);
+
+        let field = expr.return_field(&schema)?;
+
+        assert_eq!(field.name(), "a");
+        assert_eq!(field.data_type(), &Int64);
+        assert!(!field.is_nullable());
+        assert!(!expr.nullable(&schema)?);
+
+        Ok(())
+    }
+
+    #[test]
+    fn field_aware_cast_nullable_child_nonnullable_targets_nullable() -> Result<()> {
+        // child is non-nullable but the target field is marked nullable; the
+        // nullable() result should still be true because the field allows nulls.
+        let schema = Schema::new(vec![Field::new("a", Int32, false)]);
+        let expr = CastExpr::new_with_target_field(
+            col("a", &schema)?,
+            Arc::new(Field::new("cast_target", Int64, true)),
+            None,
+        );
+
+        assert!(expr.nullable(&schema)?);
+        assert!(expr.return_field(&schema)?.is_nullable());
+
+        Ok(())
+    }
+
+    #[test]
+    fn struct_cast_validation_uses_nested_target_fields() -> Result<()> {
+        let source_type = Struct(Fields::from(vec![
+            Arc::new(Field::new("x", Int32, true)),
+            Arc::new(Field::new("y", Utf8, true)),
+        ]));
+        let schema = Schema::new(vec![Field::new("a", source_type.clone(), true)]);
+
+        let valid_target = Struct(Fields::from(vec![
+            Arc::new(Field::new("y", Utf8, true)),
+            Arc::new(Field::new("x", Int64, true)),
+        ]));
+        cast_with_options(col("a", &schema)?, &schema, valid_target, None)?;
+
+        let invalid_target = Struct(Fields::from(vec![
+            Arc::new(Field::new("y", Utf8, true)),
+            Arc::new(Field::new("missing", Int64, false)),
+        ]));
+        let err = cast_with_options(col("a", &schema)?, &schema, invalid_target, None)
+            .expect_err("missing required struct field should fail");
+
+        assert!(err.to_string().contains("Unsupported CAST"));
+
+        Ok(())
+    }
+
     #[test]
     #[ignore] // TODO: https://github.com/apache/datafusion/issues/5396
     fn test_cast_decimal() -> Result<()> {
diff --git a/datafusion/physical-expr/src/expressions/cast_column.rs b/datafusion/physical-expr/src/expressions/cast_column.rs
new file mode 100644
index 0000000000000..a99953abdb5cb
--- /dev/null
+++ b/datafusion/physical-expr/src/expressions/cast_column.rs
@@ -0,0 +1,417 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Physical expression for struct-aware casting of columns.
+
+use super::cast::cast_expr_properties;
+use crate::physical_expr::PhysicalExpr;
+use arrow::{
+    compute::CastOptions,
+    datatypes::{DataType, FieldRef, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion_common::{
+    Result, ScalarValue, format::DEFAULT_CAST_OPTIONS, nested_struct::cast_column,
+};
+use datafusion_expr_common::columnar_value::ColumnarValue;
+use datafusion_expr_common::sort_properties::ExprProperties;
+use std::{
+    any::Any,
+    fmt::{self, Display},
+    hash::Hash,
+    sync::Arc,
+};
+/// A physical expression that applies [`cast_column`] to its input.
+///
+/// [`CastColumnExpr`] extends the regular [`CastExpr`](super::CastExpr) by
+/// retaining schema metadata for both the input and output fields. This allows
+/// the evaluator to perform struct-aware casts that honour nested field
+/// ordering, preserve nullability, and fill missing fields with null values.
+///
+/// This expression is intended for schema rewriting scenarios where the
+/// planner already resolved the input column but needs to adapt its physical
+/// representation to a new [`arrow::datatypes::Field`]. It mirrors the behaviour of the
+/// [`datafusion_common::nested_struct::cast_column`] helper while integrating
+/// with the `PhysicalExpr` trait so it can participate in the execution plan
+/// like any other column expression.
+#[derive(Debug, Clone, Eq)]
+pub struct CastColumnExpr {
+    /// The physical expression producing the value to cast.
+    expr: Arc<dyn PhysicalExpr>,
+    /// The logical field of the input column.
+    input_field: FieldRef,
+    /// The field metadata describing the desired output column.
+    target_field: FieldRef,
+    /// Options forwarded to [`cast_column`].
+    cast_options: CastOptions<'static>,
+}
+
+// Manually derive `PartialEq`/`Hash` as `Arc<dyn PhysicalExpr>` does not
+// implement these traits by default for the trait object.
+impl PartialEq for CastColumnExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.expr.eq(&other.expr)
+            && self.input_field.eq(&other.input_field)
+            && self.target_field.eq(&other.target_field)
+            && self.cast_options.eq(&other.cast_options)
+    }
+}
+
+impl Hash for CastColumnExpr {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.expr.hash(state);
+        self.input_field.hash(state);
+        self.target_field.hash(state);
+        self.cast_options.hash(state);
+    }
+}
+
+impl CastColumnExpr {
+    /// Create a new [`CastColumnExpr`].
+    pub fn new(
+        expr: Arc<dyn PhysicalExpr>,
+        input_field: FieldRef,
+        target_field: FieldRef,
+        cast_options: Option<CastOptions<'static>>,
+    ) -> Self {
+        Self {
+            expr,
+            input_field,
+            target_field,
+            cast_options: cast_options.unwrap_or(DEFAULT_CAST_OPTIONS),
+        }
+    }
+
+    /// The expression that produces the value to be cast.
+    pub fn expr(&self) -> &Arc<dyn PhysicalExpr> {
+        &self.expr
+    }
+
+    /// Field metadata describing the resolved input column.
+    pub fn input_field(&self) -> &FieldRef {
+        &self.input_field
+    }
+
+    /// Field metadata describing the output column after casting.
+    pub fn target_field(&self) -> &FieldRef {
+        &self.target_field
+    }
+}
+
+impl Display for CastColumnExpr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "CAST_COLUMN({} AS {})",
+            self.expr,
+            self.target_field.data_type()
+        )
+    }
+}
+
+impl PhysicalExpr for CastColumnExpr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
+        Ok(self.target_field.data_type().clone())
+    }
+
+    fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+        Ok(self.target_field.is_nullable())
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        let value = self.expr.evaluate(batch)?;
+        match value {
+            ColumnarValue::Array(array) => {
+                let casted =
+                    cast_column(&array, self.target_field.as_ref(), &self.cast_options)?;
+                Ok(ColumnarValue::Array(casted))
+            }
+            ColumnarValue::Scalar(scalar) => {
+                let as_array = scalar.to_array_of_size(1)?;
+                let casted = cast_column(
+                    &as_array,
+                    self.target_field.as_ref(),
+                    &self.cast_options,
+                )?;
+                let result = ScalarValue::try_from_array(casted.as_ref(), 0)?;
+                Ok(ColumnarValue::Scalar(result))
+            }
+        }
+    }
+
+    fn return_field(&self, _input_schema: &Schema) -> Result<FieldRef> {
+        Ok(Arc::clone(&self.target_field))
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        vec![&self.expr]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        assert_eq!(children.len(), 1);
+        let child = children.pop().expect("CastColumnExpr child");
+        Ok(Arc::new(Self::new(
+            child,
+            Arc::clone(&self.input_field),
+            Arc::clone(&self.target_field),
+            Some(self.cast_options.clone()),
+        )))
+    }
+
+    /// A [`CastColumnExpr`] preserves the ordering of its child if the cast is done
+    /// under the same datatype family.
+    fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> {
+        cast_expr_properties(&children[0], self.target_field.data_type())
+    }
+
+    fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        Display::fmt(self, f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use crate::expressions::{Column, Literal};
+    use arrow::{
+        array::{Array, ArrayRef, BooleanArray, Int32Array, StringArray, StructArray},
+        datatypes::{DataType, Field, Fields, SchemaRef},
+    };
+    use datafusion_common::{
+        Result as DFResult, ScalarValue,
+        cast::{as_int64_array, as_string_array, as_struct_array, as_uint8_array},
+    };
+
+    fn make_schema(field: &Field) -> SchemaRef {
+        Arc::new(Schema::new(vec![field.clone()]))
+    }
+
+    fn make_struct_array(fields: Fields, arrays: Vec<ArrayRef>) -> StructArray {
+        StructArray::new(fields, arrays, None)
+    }
+
+    #[test]
+    fn cast_primitive_array() -> DFResult<()> {
+        let input_field = Field::new("a", DataType::Int32, true);
+        let target_field = Field::new("a", DataType::Int64, true);
+        let schema = make_schema(&input_field);
+
+        let values = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
+        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![values])?;
+
+        let column = Arc::new(Column::new_with_schema("a", schema.as_ref())?);
+        let expr = CastColumnExpr::new(
+            column,
+            Arc::new(input_field.clone()),
+            Arc::new(target_field.clone()),
+            None,
+        );
+
+        let result = expr.evaluate(&batch)?;
+        let ColumnarValue::Array(array) = result else {
+            panic!("expected array");
+        };
+        let casted = as_int64_array(array.as_ref())?;
+        assert_eq!(casted.value(0), 1);
+        assert!(casted.is_null(1));
+        assert_eq!(casted.value(2), 3);
+        Ok(())
+    }
+
+    #[test]
+    fn cast_struct_array_missing_child() -> DFResult<()> {
+        let source_a = Field::new("a", DataType::Int32, true);
+        let source_b = Field::new("b", DataType::Utf8, true);
+        let input_field = Field::new(
+            "s",
+            DataType::Struct(
+                vec![Arc::new(source_a.clone()), Arc::new(source_b.clone())].into(),
+            ),
+            true,
+        );
+        let target_a = Field::new("a", DataType::Int64, true);
+        let target_c = Field::new("c", DataType::Utf8, true);
+        let target_field = Field::new(
+            "s",
+            DataType::Struct(
+                vec![Arc::new(target_a.clone()), Arc::new(target_c.clone())].into(),
+            ),
+            true,
+        );
+
+        let schema = make_schema(&input_field);
+        let struct_array = make_struct_array(
+            vec![Arc::new(source_a.clone()), Arc::new(source_b.clone())].into(),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(1), None])) as ArrayRef,
+                Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")]))
+                    as ArrayRef,
+            ],
+        );
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(struct_array) as Arc<_>],
+        )?;
+
+        let column = Arc::new(Column::new_with_schema("s", schema.as_ref())?);
+        let expr = CastColumnExpr::new(
+            column,
+            Arc::new(input_field.clone()),
+            Arc::new(target_field.clone()),
+            None,
+        );
+
+        let result = expr.evaluate(&batch)?;
+        let ColumnarValue::Array(array) = result else {
+            panic!("expected array");
+        };
+        let struct_array = as_struct_array(array.as_ref())?;
+        let cast_a = as_int64_array(struct_array.column_by_name("a").unwrap().as_ref())?;
+        assert_eq!(cast_a.value(0), 1);
+        assert!(cast_a.is_null(1));
+
+        let cast_c = as_string_array(struct_array.column_by_name("c").unwrap().as_ref())?;
+        assert!(cast_c.is_null(0));
+        assert!(cast_c.is_null(1));
+        Ok(())
+    }
+
+    #[test]
+    fn cast_nested_struct_array() -> DFResult<()> {
+        let inner_source = Field::new(
+            "inner",
+            DataType::Struct(
+                vec![Arc::new(Field::new("x", DataType::Int32, true))].into(),
+            ),
+            true,
+        );
+        let outer_field = Field::new(
+            "root",
+            DataType::Struct(vec![Arc::new(inner_source.clone())].into()),
+            true,
+        );
+
+        let inner_target = Field::new(
+            "inner",
+            DataType::Struct(
+                vec![
+                    Arc::new(Field::new("x", DataType::Int64, true)),
+                    Arc::new(Field::new("y", DataType::Boolean, true)),
+                ]
+                .into(),
+            ),
+            true,
+        );
+        let target_field = Field::new(
+            "root",
+            DataType::Struct(vec![Arc::new(inner_target.clone())].into()),
+            true,
+        );
+
+        let schema = make_schema(&outer_field);
+
+        let inner_struct = make_struct_array(
+            vec![Arc::new(Field::new("x", DataType::Int32, true))].into(),
+            vec![Arc::new(Int32Array::from(vec![Some(7), None])) as ArrayRef],
+        );
+        let outer_struct = make_struct_array(
+            vec![Arc::new(inner_source.clone())].into(),
+            vec![Arc::new(inner_struct) as ArrayRef],
+        );
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(outer_struct) as ArrayRef],
+        )?;
+
+        let column = Arc::new(Column::new_with_schema("root", schema.as_ref())?);
+        let expr = CastColumnExpr::new(
+            column,
+            Arc::new(outer_field.clone()),
+            Arc::new(target_field.clone()),
+            None,
+        );
+
+        let result = expr.evaluate(&batch)?;
+        let ColumnarValue::Array(array) = result else {
+            panic!("expected array");
+        };
+        let struct_array = as_struct_array(array.as_ref())?;
+        let inner =
+            as_struct_array(struct_array.column_by_name("inner").unwrap().as_ref())?;
+        let x = as_int64_array(inner.column_by_name("x").unwrap().as_ref())?;
+        assert_eq!(x.value(0), 7);
+        assert!(x.is_null(1));
+        let y = inner.column_by_name("y").unwrap();
+        let y = y
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .expect("boolean array");
+        assert!(y.is_null(0));
+        assert!(y.is_null(1));
+        Ok(())
+    }
+
+    #[test]
+    fn cast_struct_scalar() -> DFResult<()> {
+        let source_field = Field::new("a", DataType::Int32, true);
+        let input_field = Field::new(
+            "s",
+            DataType::Struct(vec![Arc::new(source_field.clone())].into()),
+            true,
+        );
+        let target_field = Field::new(
+            "s",
+            DataType::Struct(
+                vec![Arc::new(Field::new("a", DataType::UInt8, true))].into(),
+            ),
+            true,
+        );
+
+        let schema = make_schema(&input_field);
+        let scalar_struct = StructArray::new(
+            vec![Arc::new(source_field.clone())].into(),
+            vec![Arc::new(Int32Array::from(vec![Some(9)])) as ArrayRef],
+            None,
+        );
+        let literal =
+            Arc::new(Literal::new(ScalarValue::Struct(Arc::new(scalar_struct))));
+        let expr = CastColumnExpr::new(
+            literal,
+            Arc::new(input_field.clone()),
+            Arc::new(target_field.clone()),
+            None,
+        );
+
+        let batch = RecordBatch::new_empty(Arc::clone(&schema));
+        let result = expr.evaluate(&batch)?;
+        let ColumnarValue::Scalar(ScalarValue::Struct(array)) = result else {
+            panic!("expected struct scalar");
+        };
+        let casted = array.column_by_name("a").unwrap();
+        let casted = as_uint8_array(casted.as_ref())?;
+        assert_eq!(casted.value(0), 9);
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs
index 5a11783a87e90..cf844790a002e 100644
--- a/datafusion/physical-expr/src/expressions/column.rs
+++ b/datafusion/physical-expr/src/expressions/column.rs
@@ -28,8 +28,9 @@ use arrow::{
     record_batch::RecordBatch,
 };
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_common::{internal_err, plan_err, Result};
+use datafusion_common::{Result, internal_err, plan_err};
 use datafusion_expr::ColumnarValue;
+use datafusion_expr_common::placement::ExpressionPlacement;
 
 /// Represents the column at a given index in a RecordBatch
 ///
@@ -49,9 +50,9 @@ use datafusion_expr::ColumnarValue;
 /// # use arrow::datatypes::{DataType, Field, Schema};
 /// // Schema with columns a, b, c
 /// let schema = Schema::new(vec![
-///    Field::new("a", DataType::Int32, false),
-///    Field::new("b", DataType::Int32, false),
-///    Field::new("c", DataType::Int32, false),
+///     Field::new("a", DataType::Int32, false),
+///     Field::new("b", DataType::Int32, false),
+///     Field::new("c", DataType::Int32, false),
 /// ]);
 ///
 /// // reference to column b is index 1
@@ -146,6 +147,10 @@ impl PhysicalExpr for Column {
     fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.name)
     }
+
+    fn placement(&self) -> ExpressionPlacement {
+        ExpressionPlacement::Column
+    }
 }
 
 impl Column {
@@ -158,7 +163,11 @@ impl Column {
                 self.name,
                 self.index,
                 input_schema.fields.len(),
-                input_schema.fields().iter().map(|f| f.name()).collect::<Vec<_>>()
+                input_schema
+                    .fields()
+                    .iter()
+                    .map(|f| f.name())
+                    .collect::<Vec<_>>()
             )
         }
     }
@@ -204,7 +213,6 @@ mod test {
     use arrow::array::StringArray;
     use arrow::datatypes::{DataType, Field, Schema};
     use arrow::record_batch::RecordBatch;
-    use datafusion_common::Result;
 
     use std::sync::Arc;
 
@@ -214,8 +222,9 @@ mod test {
         let col = Column::new("id", 9);
         let error = col.data_type(&schema).expect_err("error").strip_backtrace();
         assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
-            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
-            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error))
+             but input schema only has 1 columns: [\"foo\"].\nThis issue was likely caused by a bug \
+             in DataFusion's code. Please help us to resolve this by filing a bug report \
+             in our issue tracker: https://github.com/apache/datafusion/issues".starts_with(&error))
     }
 
     #[test]
@@ -224,20 +233,21 @@ mod test {
         let col = Column::new("id", 9);
         let error = col.nullable(&schema).expect_err("error").strip_backtrace();
         assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
-            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
-            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error))
+             but input schema only has 1 columns: [\"foo\"].\nThis issue was likely caused by a bug \
+             in DataFusion's code. Please help us to resolve this by filing a bug report \
+             in our issue tracker: https://github.com/apache/datafusion/issues".starts_with(&error));
     }
 
     #[test]
-    fn out_of_bounds_evaluate() -> Result<()> {
+    fn out_of_bounds_evaluate() {
         let schema = Schema::new(vec![Field::new("foo", DataType::Utf8, true)]);
         let data: StringArray = vec!["data"].into();
-        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)])?;
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap();
         let col = Column::new("id", 9);
         let error = col.evaluate(&batch).expect_err("error").strip_backtrace();
         assert!("Internal error: PhysicalExpr Column references column 'id' at index 9 (zero-based) \
-            but input schema only has 1 columns: [\"foo\"].\nThis was likely caused by a bug in \
-            DataFusion's code and we would welcome that you file an bug report in our issue tracker".starts_with(&error));
-        Ok(())
+             but input schema only has 1 columns: [\"foo\"].\nThis issue was likely caused by a bug \
+             in DataFusion's code. Please help us to resolve this by filing a bug report \
+             in our issue tracker: https://github.com/apache/datafusion/issues".starts_with(&error));
     }
 }
diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs
index 9785203a70208..d285f8b377eca 100644
--- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs
+++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs
@@ -15,23 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{
-    any::Any,
-    fmt::Display,
-    hash::Hash,
-    sync::{Arc, RwLock},
-};
+use parking_lot::RwLock;
+use std::{any::Any, fmt::Display, hash::Hash, sync::Arc};
+use tokio::sync::watch;
 
 use crate::PhysicalExpr;
 use arrow::datatypes::{DataType, Schema};
 use datafusion_common::{
-    tree_node::{Transformed, TransformedResult, TreeNode},
     Result,
+    tree_node::{Transformed, TransformedResult, TreeNode},
 };
 use datafusion_expr::ColumnarValue;
-use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash};
+use datafusion_physical_expr_common::physical_expr::DynHash;
+
+/// State of a dynamic filter, tracking both updates and completion.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum FilterState {
+    /// Filter is in progress and may receive more updates.
+    InProgress { generation: u64 },
+    /// Filter is complete and will not receive further updates.
+    Complete { generation: u64 },
+}
+
+impl FilterState {
+    fn generation(&self) -> u64 {
+        match self {
+            FilterState::InProgress { generation }
+            | FilterState::Complete { generation } => *generation,
+        }
+    }
+}
 
 /// A dynamic [`PhysicalExpr`] that can be updated by anyone with a reference to it.
+///
+/// Any `ExecutionPlan` that uses this expression and holds a reference to it internally should probably also
+/// implement `ExecutionPlan::reset_state` to remain compatible with recursive queries and other situations where
+/// the same `ExecutionPlan` is reused with different data.
+///
+/// For more background, please also see the [Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries blog]
+///
+/// [Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries blog]: https://datafusion.apache.org/blog/2025/09/10/dynamic-filters
 #[derive(Debug)]
 pub struct DynamicFilterPhysicalExpr {
     /// The original children of this PhysicalExpr, if any.
@@ -43,7 +66,9 @@ pub struct DynamicFilterPhysicalExpr {
     /// so that when we update `current()` in subsequent iterations we can re-apply the replacements.
     remapped_children: Option<Vec<Arc<dyn PhysicalExpr>>>,
     /// The source of dynamic filters.
-    inner: Arc<RwLock<Arc<dyn PhysicalExpr>>>,
+    inner: Arc<RwLock<Inner>>,
+    /// Broadcasts filter state (updates and completion) to all waiters.
+    state_watch: watch::Sender<FilterState>,
     /// For testing purposes track the data type and nullability to make sure they don't change.
     /// If they do, there's a bug in the implementation.
     /// But this can have overhead in production, so it's only included in our tests.
@@ -51,10 +76,42 @@ pub struct DynamicFilterPhysicalExpr {
     nullable: Arc<RwLock<Option<bool>>>,
 }
 
+#[derive(Debug)]
+struct Inner {
+    /// A counter that gets incremented every time the expression is updated so that we can track changes cheaply.
+    /// This is used for [`PhysicalExpr::snapshot_generation`] to have a cheap check for changes.
+    generation: u64,
+    expr: Arc<dyn PhysicalExpr>,
+    /// Flag for quick synchronous check if filter is complete.
+    /// This is redundant with the watch channel state, but allows us to return immediately
+    /// from `wait_complete()` without subscribing if already complete.
+    is_complete: bool,
+}
+
+impl Inner {
+    fn new(expr: Arc<dyn PhysicalExpr>) -> Self {
+        Self {
+            // Start with generation 1 which gives us a different result for [`PhysicalExpr::generation`] than the default 0.
+            // This is not currently used anywhere but it seems useful to have this simple distinction.
+            generation: 1,
+            expr,
+            is_complete: false,
+        }
+    }
+
+    /// Clone the inner expression.
+    fn expr(&self) -> &Arc<dyn PhysicalExpr> {
+        &self.expr
+    }
+}
+
 impl Hash for DynamicFilterPhysicalExpr {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        let inner = self.current().expect("Failed to get current expression");
-        inner.dyn_hash(state);
+        // Use pointer identity of the inner Arc for stable hashing.
+        // This is stable across update() calls and consistent with Eq.
+        // See issue #19641 for details on why content-based hashing violates
+        // the Hash/Eq contract when the underlying expression can change.
+        Arc::as_ptr(&self.inner).hash(state);
         self.children.dyn_hash(state);
         self.remapped_children.dyn_hash(state);
     }
@@ -62,11 +119,13 @@ impl Hash for DynamicFilterPhysicalExpr {
 
 impl PartialEq for DynamicFilterPhysicalExpr {
     fn eq(&self, other: &Self) -> bool {
-        let inner = self.current().expect("Failed to get current expression");
-        let our_children = self.remapped_children.as_ref().unwrap_or(&self.children);
-        let other_children = other.remapped_children.as_ref().unwrap_or(&other.children);
-        let other = other.current().expect("Failed to get current expression");
-        inner.dyn_eq(other.as_any()) && our_children == other_children
+        // Two dynamic filters are equal if they share the same inner source
+        // AND have the same children configuration.
+        // This is consistent with Hash using Arc::as_ptr.
+        // See issue #19641 for details on the Hash/Eq contract violation fix.
+        Arc::ptr_eq(&self.inner, &other.inner)
+            && self.children == other.children
+            && self.remapped_children == other.remapped_children
     }
 }
 
@@ -74,8 +133,7 @@ impl Eq for DynamicFilterPhysicalExpr {}
 
 impl Display for DynamicFilterPhysicalExpr {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.current().expect("Failed to get current expression");
-        write!(f, "DynamicFilterPhysicalExpr [ {inner} ]")
+        self.render(f, |expr, f| write!(f, "{expr}"))
     }
 }
 
@@ -102,16 +160,21 @@ impl DynamicFilterPhysicalExpr {
     /// do not change* since those will be used to determine what columns need to read or projected
     /// when evaluating the expression.
     ///
+    /// Any `ExecutionPlan` that uses this expression and holds a reference to it internally should probably also
+    /// implement `ExecutionPlan::reset_state` to remain compatible with recursive queries and other situations where
+    /// the same `ExecutionPlan` is reused with different data.
+    ///
     /// [`collect_columns`]: crate::utils::collect_columns
-    #[allow(dead_code)] // Only used in tests for now
     pub fn new(
         children: Vec<Arc<dyn PhysicalExpr>>,
         inner: Arc<dyn PhysicalExpr>,
     ) -> Self {
+        let (state_watch, _) = watch::channel(FilterState::InProgress { generation: 1 });
         Self {
             children,
             remapped_children: None, // Initially no remapped children
-            inner: Arc::new(RwLock::new(inner)),
+            inner: Arc::new(RwLock::new(Inner::new(inner))),
+            state_watch,
             data_type: Arc::new(RwLock::new(None)),
             nullable: Arc::new(RwLock::new(None)),
         }
@@ -146,37 +209,26 @@ impl DynamicFilterPhysicalExpr {
         }
     }
 
+    /// Get the current generation of the expression.
+    fn current_generation(&self) -> u64 {
+        self.inner.read().generation
+    }
+
     /// Get the current expression.
     /// This will return the current expression with any children
     /// remapped to match calls to [`PhysicalExpr::with_new_children`].
     pub fn current(&self) -> Result<Arc<dyn PhysicalExpr>> {
-        let inner = self
-            .inner
-            .read()
-            .map_err(|_| {
-                datafusion_common::DataFusionError::Execution(
-                    "Failed to acquire read lock for inner".to_string(),
-                )
-            })?
-            .clone();
-        let inner =
-            Self::remap_children(&self.children, self.remapped_children.as_ref(), inner)?;
-        Ok(inner)
-    }
-
-    /// Update the current expression.
+        let expr = Arc::clone(self.inner.read().expr());
+        Self::remap_children(&self.children, self.remapped_children.as_ref(), expr)
+    }
+
+    /// Update the current expression and notify all waiters.
     /// Any children of this expression must be a subset of the original children
     /// passed to the constructor.
     /// This should be called e.g.:
     /// - When we've computed the probe side's hash table in a HashJoinExec
     /// - After every batch is processed if we update the TopK heap in a SortExec using a TopK approach.
-    #[allow(dead_code)] // Only used in tests for now
     pub fn update(&self, new_expr: Arc<dyn PhysicalExpr>) -> Result<()> {
-        let mut current = self.inner.write().map_err(|_| {
-            datafusion_common::DataFusionError::Execution(
-                "Failed to acquire write lock for inner".to_string(),
-            )
-        })?;
         // Remap the children of the new expression to match the original children
         // We still do this again in `current()` but doing it preventively here
         // reduces the work needed in some cases if `current()` is called multiple times
@@ -186,9 +238,114 @@ impl DynamicFilterPhysicalExpr {
             self.remapped_children.as_ref(),
             new_expr,
         )?;
-        *current = new_expr;
+
+        // Load the current inner, increment generation, and store the new one
+        let mut current = self.inner.write();
+        let new_generation = current.generation + 1;
+        *current = Inner {
+            generation: new_generation,
+            expr: new_expr,
+            is_complete: current.is_complete,
+        };
+        drop(current); // Release the lock before broadcasting
+
+        // Broadcast the new state to all waiters
+        let _ = self.state_watch.send(FilterState::InProgress {
+            generation: new_generation,
+        });
         Ok(())
     }
+
+    /// Mark this dynamic filter as complete and broadcast to all waiters.
+    ///
+    /// This signals that all expected updates have been received.
+    /// Waiters using [`Self::wait_complete`] will be notified.
+    pub fn mark_complete(&self) {
+        let mut current = self.inner.write();
+        let current_generation = current.generation;
+        current.is_complete = true;
+        drop(current);
+
+        // Broadcast completion to all waiters
+        let _ = self.state_watch.send(FilterState::Complete {
+            generation: current_generation,
+        });
+    }
+
+    /// Wait asynchronously for any update to this filter.
+    ///
+    /// This method will return when [`Self::update`] is called and the generation increases.
+    /// It does not guarantee that the filter is complete.
+    ///
+    /// Producers (e.g.) HashJoinExec may never update the expression or mark it as completed if there are no consumers.
+    /// If you call this method on a dynamic filter created by such a producer and there are no consumers registered this method would wait indefinitely.
+    /// This should not happen under normal operation and would indicate a programming error either in your producer or in DataFusion if the producer is a built in node.
+    pub async fn wait_update(&self) {
+        let mut rx = self.state_watch.subscribe();
+        // Get the current generation
+        let current_gen = rx.borrow_and_update().generation();
+
+        // Wait until generation increases
+        let _ = rx.wait_for(|state| state.generation() > current_gen).await;
+    }
+
+    /// Wait asynchronously until this dynamic filter is marked as complete.
+    ///
+    /// This method returns immediately if the filter is already complete.
+    /// Otherwise, it waits until [`Self::mark_complete`] is called.
+    ///
+    /// Unlike [`Self::wait_update`], this method guarantees that when it returns,
+    /// the filter is fully complete with no more updates expected.
+    ///
+    /// Producers (e.g.) HashJoinExec may never update the expression or mark it as completed if there are no consumers.
+    /// If you call this method on a dynamic filter created by such a producer and there are no consumers registered this method would wait indefinitely.
+    /// This should not happen under normal operation and would indicate a programming error either in your producer or in DataFusion if the producer is a built in node.
+    pub async fn wait_complete(&self) {
+        if self.inner.read().is_complete {
+            return;
+        }
+
+        let mut rx = self.state_watch.subscribe();
+        let _ = rx
+            .wait_for(|state| matches!(state, FilterState::Complete { .. }))
+            .await;
+    }
+
+    /// Check if this dynamic filter is being actively used by any consumers.
+    ///
+    /// Returns `true` if there are references beyond the producer (e.g., the HashJoinExec
+    /// that created the filter). This is useful to avoid computing expensive filter
+    /// expressions when no consumer will actually use them.
+    ///
+    /// # Implementation Details
+    ///
+    /// We check both Arc counts to handle two cases:
+    /// - Transformed filters (via `with_new_children`) share the inner Arc (inner count > 1)
+    /// - Direct clones (via `Arc::clone`) increment the outer count (outer count > 1)
+    pub fn is_used(self: &Arc<Self>) -> bool {
+        // Strong count > 1 means at least one consumer is holding a reference beyond the producer.
+        Arc::strong_count(self) > 1 || Arc::strong_count(&self.inner) > 1
+    }
+
+    fn render(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        render_expr: impl FnOnce(
+            Arc<dyn PhysicalExpr>,
+            &mut std::fmt::Formatter<'_>,
+        ) -> std::fmt::Result,
+    ) -> std::fmt::Result {
+        let inner = self.current().map_err(|_| std::fmt::Error)?;
+        let current_generation = self.current_generation();
+        write!(f, "DynamicFilter [ ")?;
+        if current_generation == 1 {
+            write!(f, "empty")?;
+        } else {
+            render_expr(inner, f)?;
+        }
+
+        write!(f, " ]")
+    }
 }
 
 impl PhysicalExpr for DynamicFilterPhysicalExpr {
@@ -212,6 +369,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr {
             children: self.children.clone(),
             remapped_children: Some(children),
             inner: Arc::clone(&self.inner),
+            state_watch: self.state_watch.clone(),
             data_type: Arc::clone(&self.data_type),
             nullable: Arc::clone(&self.nullable),
         }))
@@ -223,10 +381,8 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr {
         {
             use datafusion_common::internal_err;
             // Check if the data type has changed.
-            let mut data_type_lock = self
-                .data_type
-                .write()
-                .expect("Failed to acquire write lock for data_type");
+            let mut data_type_lock = self.data_type.write();
+
             if let Some(existing) = &*data_type_lock {
                 if existing != &res {
                     // If the data type has changed, we have a bug.
@@ -248,10 +404,7 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr {
         {
             use datafusion_common::internal_err;
             // Check if the nullability has changed.
-            let mut nullable_lock = self
-                .nullable
-                .write()
-                .expect("Failed to acquire write lock for nullable");
+            let mut nullable_lock = self.nullable.write();
             if let Some(existing) = *nullable_lock {
                 if existing != res {
                     // If the nullability has changed, we have a bug.
@@ -283,21 +436,25 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr {
     }
 
     fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.current().map_err(|_| std::fmt::Error)?;
-        inner.fmt_sql(f)
+        self.render(f, |expr, f| expr.fmt_sql(f))
     }
 
     fn snapshot(&self) -> Result<Option<Arc<dyn PhysicalExpr>>> {
         // Return the current expression as a snapshot.
         Ok(Some(self.current()?))
     }
+
+    fn snapshot_generation(&self) -> u64 {
+        // Return the current generation of the expression.
+        self.inner.read().generation
+    }
 }
 
 #[cfg(test)]
 mod test {
     use crate::{
-        expressions::{col, lit, BinaryExpr},
-        utils::reassign_predicate_columns,
+        expressions::{BinaryExpr, col, lit},
+        utils::reassign_expr_columns,
     };
     use arrow::{
         array::RecordBatch,
@@ -335,22 +492,20 @@ mod test {
         ]));
         // Each ParquetExec calls `with_new_children` on the DynamicFilterPhysicalExpr
         // and remaps the children to the file schema.
-        let dynamic_filter_1 = reassign_predicate_columns(
+        let dynamic_filter_1 = reassign_expr_columns(
             Arc::clone(&dynamic_filter) as Arc<dyn PhysicalExpr>,
             &filter_schema_1,
-            false,
         )
         .unwrap();
         let snap = dynamic_filter_1.snapshot().unwrap().unwrap();
-        insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42) }, fail_on_overflow: false }"#);
-        let dynamic_filter_2 = reassign_predicate_columns(
+        insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#);
+        let dynamic_filter_2 = reassign_expr_columns(
             Arc::clone(&dynamic_filter) as Arc<dyn PhysicalExpr>,
             &filter_schema_2,
-            false,
         )
         .unwrap();
         let snap = dynamic_filter_2.snapshot().unwrap().unwrap();
-        insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42) }, fail_on_overflow: false }"#);
+        insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#);
         // Both filters allow evaluating the same expression
         let batch_1 = RecordBatch::try_new(
             Arc::clone(&filter_schema_1),
@@ -471,4 +626,245 @@ mod test {
             "Expected err when evaluate is called after changing the expression."
         );
     }
+
+    #[tokio::test]
+    async fn test_wait_complete_already_complete() {
+        let dynamic_filter = Arc::new(DynamicFilterPhysicalExpr::new(
+            vec![],
+            lit(42) as Arc<dyn PhysicalExpr>,
+        ));
+
+        // Mark as complete immediately
+        dynamic_filter.mark_complete();
+
+        // wait_complete should return immediately
+        dynamic_filter.wait_complete().await;
+    }
+
+    #[test]
+    fn test_with_new_children_independence() {
+        // Create a schema with columns a, b, c, d
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+            Field::new("d", DataType::Int32, false),
+        ]));
+
+        // Create expression col(a) + col(b)
+        let col_a = col("a", &schema).unwrap();
+        let col_b = col("b", &schema).unwrap();
+        let col_c = col("c", &schema).unwrap();
+        let col_d = col("d", &schema).unwrap();
+
+        let expr = Arc::new(BinaryExpr::new(
+            Arc::clone(&col_a),
+            datafusion_expr::Operator::Plus,
+            Arc::clone(&col_b),
+        ));
+
+        // Create DynamicFilterPhysicalExpr with children [col_a, col_b]
+        let dynamic_filter = Arc::new(DynamicFilterPhysicalExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            expr as Arc<dyn PhysicalExpr>,
+        ));
+
+        // Clone the Arc (two references to the same DynamicFilterPhysicalExpr)
+        let clone_1 = Arc::clone(&dynamic_filter);
+        let clone_2 = Arc::clone(&dynamic_filter);
+
+        // Call with_new_children with different children on each clone
+        // clone_1: replace [a, b] with [b, c] -> expression becomes b + c
+        let remapped_1 = clone_1
+            .with_new_children(vec![Arc::clone(&col_b), Arc::clone(&col_c)])
+            .unwrap();
+
+        // clone_2: replace [a, b] with [b, d] -> expression becomes b + d
+        let remapped_2 = clone_2
+            .with_new_children(vec![Arc::clone(&col_b), Arc::clone(&col_d)])
+            .unwrap();
+
+        // Create a RecordBatch with columns a=1,2,3  b=10,20,30  c=100,200,300  d=1000,2000,3000
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])), // a
+                Arc::new(arrow::array::Int32Array::from(vec![10, 20, 30])), // b
+                Arc::new(arrow::array::Int32Array::from(vec![100, 200, 300])), // c
+                Arc::new(arrow::array::Int32Array::from(vec![1000, 2000, 3000])), // d
+            ],
+        )
+        .unwrap();
+
+        // Evaluate both remapped expressions
+        let result_1 = remapped_1.evaluate(&batch).unwrap();
+        let result_2 = remapped_2.evaluate(&batch).unwrap();
+
+        // Extract arrays from results
+        let ColumnarValue::Array(arr_1) = result_1 else {
+            panic!("Expected ColumnarValue::Array for result_1");
+        };
+        let ColumnarValue::Array(arr_2) = result_2 else {
+            panic!("Expected ColumnarValue::Array for result_2");
+        };
+
+        // Verify result_1 = b + c = [110, 220, 330]
+        let expected_1: Arc<dyn arrow::array::Array> =
+            Arc::new(arrow::array::Int32Array::from(vec![110, 220, 330]));
+        assert!(
+            arr_1.eq(&expected_1),
+            "Expected b + c = [110, 220, 330], got {arr_1:?}",
+        );
+
+        // Verify result_2 = b + d = [1010, 2020, 3030]
+        let expected_2: Arc<dyn arrow::array::Array> =
+            Arc::new(arrow::array::Int32Array::from(vec![1010, 2020, 3030]));
+        assert!(
+            arr_2.eq(&expected_2),
+            "Expected b + d = [1010, 2020, 3030], got {arr_2:?}",
+        );
+    }
+
+    #[test]
+    fn test_is_used() {
+        let filter = Arc::new(DynamicFilterPhysicalExpr::new(
+            vec![],
+            lit(true) as Arc<dyn PhysicalExpr>,
+        ));
+
+        // Initially, only one reference to the inner Arc exists
+        assert!(
+            !filter.is_used(),
+            "Filter should not be used with only one inner reference"
+        );
+
+        // Simulate a consumer created via transformation (what happens during filter pushdown).
+        // When filters are pushed down and transformed via reassign_expr_columns/transform_down,
+        // with_new_children() is called which creates a new outer Arc but clones the inner Arc.
+        let consumer1_expr = Arc::clone(&filter).with_new_children(vec![]).unwrap();
+        let _consumer1 = consumer1_expr
+            .as_any()
+            .downcast_ref::<DynamicFilterPhysicalExpr>()
+            .expect("Should be DynamicFilterPhysicalExpr");
+
+        // Now the inner Arc is shared (inner_count = 2)
+        assert!(
+            filter.is_used(),
+            "Filter should be used when inner Arc is shared with transformed consumer"
+        );
+
+        // Create another transformed consumer
+        let consumer2_expr = Arc::clone(&filter).with_new_children(vec![]).unwrap();
+        let _consumer2 = consumer2_expr
+            .as_any()
+            .downcast_ref::<DynamicFilterPhysicalExpr>()
+            .expect("Should be DynamicFilterPhysicalExpr");
+
+        assert!(
+            filter.is_used(),
+            "Filter should still be used with multiple consumers"
+        );
+    }
+
+    /// Test that verifies the Hash/Eq contract is now satisfied (issue #19641 fix).
+    ///
+    /// After the fix, Hash uses Arc::as_ptr(&self.inner) which is stable across
+    /// update() calls, fixing the HashMap key instability issue.
+    #[test]
+    fn test_hash_stable_after_update() {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        // Create filter with initial value
+        let filter =
+            DynamicFilterPhysicalExpr::new(vec![], lit(true) as Arc<dyn PhysicalExpr>);
+
+        // Compute hash BEFORE update
+        let mut hasher_before = DefaultHasher::new();
+        filter.hash(&mut hasher_before);
+        let hash_before = hasher_before.finish();
+
+        // Update changes the underlying expression
+        filter
+            .update(lit(false) as Arc<dyn PhysicalExpr>)
+            .expect("Update should succeed");
+
+        // Compute hash AFTER update
+        let mut hasher_after = DefaultHasher::new();
+        filter.hash(&mut hasher_after);
+        let hash_after = hasher_after.finish();
+
+        // FIXED: Hash should now be STABLE after update() because we use
+        // Arc::as_ptr for identity-based hashing instead of expression content.
+        assert_eq!(
+            hash_before, hash_after,
+            "Hash should be stable after update() - fix for issue #19641"
+        );
+
+        // Self-equality should still hold
+        assert!(filter.eq(&filter), "Self-equality should hold");
+    }
+
+    /// Test that verifies separate DynamicFilterPhysicalExpr instances
+    /// with the same expression are NOT equal (identity-based comparison).
+    #[test]
+    fn test_identity_based_equality() {
+        // Create two separate filters with identical initial expressions
+        let filter1 =
+            DynamicFilterPhysicalExpr::new(vec![], lit(true) as Arc<dyn PhysicalExpr>);
+        let filter2 =
+            DynamicFilterPhysicalExpr::new(vec![], lit(true) as Arc<dyn PhysicalExpr>);
+
+        // Different instances should NOT be equal even with same expression
+        // because they have independent inner Arcs (different update lifecycles)
+        assert!(
+            !filter1.eq(&filter2),
+            "Different instances should not be equal (identity-based)"
+        );
+
+        // Self-equality should hold
+        assert!(filter1.eq(&filter1), "Self-equality should hold");
+    }
+
+    /// Test that hash is stable for the same filter instance.
+    /// After the fix, hash uses Arc::as_ptr which is pointer-based.
+    #[test]
+    fn test_hash_stable_for_same_instance() {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        let filter =
+            DynamicFilterPhysicalExpr::new(vec![], lit(true) as Arc<dyn PhysicalExpr>);
+
+        // Compute hash twice for the same instance
+        let hash1 = {
+            let mut h = DefaultHasher::new();
+            filter.hash(&mut h);
+            h.finish()
+        };
+        let hash2 = {
+            let mut h = DefaultHasher::new();
+            filter.hash(&mut h);
+            h.finish()
+        };
+
+        assert_eq!(hash1, hash2, "Same instance should have stable hash");
+
+        // Update the expression
+        filter
+            .update(lit(false) as Arc<dyn PhysicalExpr>)
+            .expect("Update should succeed");
+
+        // Hash should STILL be the same (identity-based)
+        let hash3 = {
+            let mut h = DefaultHasher::new();
+            filter.hash(&mut h);
+            h.finish()
+        };
+
+        assert_eq!(
+            hash1, hash3,
+            "Hash should be stable after update (identity-based)"
+        );
+    }
 }
diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs
index 469f7bbee3173..ca89a3ab1ef43 100644
--- a/datafusion/physical-expr/src/expressions/in_list.rs
+++ b/datafusion/physical-expr/src/expressions/in_list.rs
@@ -22,37 +22,41 @@ use std::fmt::Debug;
 use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use crate::physical_expr::physical_exprs_bag_equal;
 use crate::PhysicalExpr;
+use crate::physical_expr::physical_exprs_bag_equal;
 
-use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::array::*;
-use arrow::buffer::BooleanBuffer;
+use arrow::buffer::{BooleanBuffer, NullBuffer};
 use arrow::compute::kernels::boolean::{not, or_kleene};
-use arrow::compute::take;
+use arrow::compute::kernels::cmp::eq as arrow_eq;
+use arrow::compute::{SortOptions, take};
 use arrow::datatypes::*;
 use arrow::util::bit_iterator::BitIndexIterator;
-use arrow::{downcast_dictionary_array, downcast_primitive_array};
-use datafusion_common::cast::{
-    as_boolean_array, as_generic_binary_array, as_string_array,
-};
-use datafusion_common::hash_utils::HashValue;
+use datafusion_common::hash_utils::with_hashes;
 use datafusion_common::{
-    exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue,
+    DFSchema, HashSet, Result, ScalarValue, assert_or_internal_err, exec_datafusion_err,
+    exec_err,
 };
-use datafusion_expr::ColumnarValue;
-use datafusion_physical_expr_common::datum::compare_with_eq;
+use datafusion_expr::{ColumnarValue, expr_vec_fmt};
 
-use ahash::RandomState;
 use datafusion_common::HashMap;
+use datafusion_common::hash_utils::RandomState;
 use hashbrown::hash_map::RawEntryMut;
 
+/// Trait for InList static filters
+trait StaticFilter {
+    fn null_count(&self) -> usize;
+
+    /// Checks if values in `v` are contained in the filter
+    fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray>;
+}
+
 /// InList
 pub struct InListExpr {
     expr: Arc<dyn PhysicalExpr>,
     list: Vec<Arc<dyn PhysicalExpr>>,
     negated: bool,
-    static_filter: Option<Arc<dyn Set>>,
+    static_filter: Option<Arc<dyn StaticFilter + Send + Sync>>,
 }
 
 impl Debug for InListExpr {
@@ -65,13 +69,10 @@ impl Debug for InListExpr {
     }
 }
 
-/// A type-erased container of array elements
-pub trait Set: Send + Sync {
-    fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray>;
-    fn has_nulls(&self) -> bool;
-}
-
-struct ArrayHashSet {
+/// Static filter for InList that stores the array and hash set for O(1) lookups
+#[derive(Debug, Clone)]
+struct ArrayStaticFilter {
+    in_array: ArrayRef,
     state: RandomState,
     /// Used to provide a lookup from value to in list index
     ///
@@ -80,135 +81,480 @@ struct ArrayHashSet {
     map: HashMap<usize, (), ()>,
 }
 
-struct ArraySet<T> {
-    array: T,
-    hash_set: ArrayHashSet,
-}
-
-impl<T> ArraySet<T>
-where
-    T: Array + From<ArrayData>,
-{
-    fn new(array: &T, hash_set: ArrayHashSet) -> Self {
-        Self {
-            array: downcast_array(array),
-            hash_set,
-        }
+impl StaticFilter for ArrayStaticFilter {
+    fn null_count(&self) -> usize {
+        self.in_array.null_count()
     }
-}
 
-impl<T> Set for ArraySet<T>
-where
-    T: Array + 'static,
-    for<'a> &'a T: ArrayAccessor,
-    for<'a> <&'a T as ArrayAccessor>::Item: IsEqual,
-{
+    /// Checks if values in `v` are contained in the `in_array` using this hash set for lookup.
     fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
+        // Null type comparisons always return null (SQL three-valued logic)
+        if v.data_type() == &DataType::Null
+            || self.in_array.data_type() == &DataType::Null
+        {
+            let nulls = NullBuffer::new_null(v.len());
+            return Ok(BooleanArray::new(
+                BooleanBuffer::new_unset(v.len()),
+                Some(nulls),
+            ));
+        }
+
+        // Unwrap dictionary-encoded needles when the value type matches
+        // in_array, evaluating against the dictionary values and mapping
+        // back via keys.
         downcast_dictionary_array! {
             v => {
-                let values_contains = self.contains(v.values().as_ref(), negated)?;
-                let result = take(&values_contains, v.keys(), None)?;
-                return Ok(downcast_array(result.as_ref()))
+                // Only unwrap when the haystack (in_array) type matches
+                // the dictionary value type
+                if v.values().data_type() == self.in_array.data_type() {
+                    let values_contains = self.contains(v.values().as_ref(), negated)?;
+                    let result = take(&values_contains, v.keys(), None)?;
+                    return Ok(downcast_array(result.as_ref()));
+                }
             }
             _ => {}
         }
 
-        let v = v.as_any().downcast_ref::<T>().unwrap();
-        let in_array = &self.array;
-        let has_nulls = in_array.null_count() != 0;
+        let needle_nulls = v.logical_nulls();
+        let needle_nulls = needle_nulls.as_ref();
+        let haystack_has_nulls = self.in_array.null_count() != 0;
+
+        with_hashes([v], &self.state, |hashes| {
+            let cmp = make_comparator(v, &self.in_array, SortOptions::default())?;
+            Ok((0..v.len())
+                .map(|i| {
+                    // SQL three-valued logic: null IN (...) is always null
+                    if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) {
+                        return None;
+                    }
 
-        Ok(ArrayIter::new(v)
-            .map(|v| {
-                v.and_then(|v| {
-                    let hash = v.hash_one(&self.hash_set.state);
+                    let hash = hashes[i];
                     let contains = self
-                        .hash_set
                         .map
                         .raw_entry()
-                        .from_hash(hash, |idx| in_array.value(*idx).is_equal(&v))
+                        .from_hash(hash, |idx| cmp(i, *idx).is_eq())
                         .is_some();
 
                     match contains {
                         true => Some(!negated),
-                        false if has_nulls => None,
+                        false if haystack_has_nulls => None,
                         false => Some(negated),
                     }
                 })
-            })
-            .collect())
+                .collect())
+        })
     }
+}
 
-    fn has_nulls(&self) -> bool {
-        self.array.null_count() != 0
+/// Returns true if Arrow's vectorized `eq` kernel supports this data type.
+///
+/// Supported: primitives, boolean, strings (Utf8/LargeUtf8/Utf8View),
+/// binary (Binary/LargeBinary/BinaryView/FixedSizeBinary), Null, and
+/// Dictionary-encoded variants of the above.
+/// Unsupported: nested types (Struct, List, Map, Union) and RunEndEncoded.
+fn supports_arrow_eq(dt: &DataType) -> bool {
+    use DataType::*;
+    match dt {
+        Boolean | Binary | LargeBinary | BinaryView | FixedSizeBinary(_) => true,
+        Dictionary(_, v) => supports_arrow_eq(v.as_ref()),
+        _ => dt.is_primitive() || dt.is_null() || dt.is_string(),
     }
 }
 
-/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there
-/// are nulls present or there are more than the configured number of
-/// elements.
-///
-/// Note: This is split into a separate function as higher-rank trait bounds currently
-/// cause type inference to misbehave
-fn make_hash_set<T>(array: T) -> ArrayHashSet
-where
-    T: ArrayAccessor,
-    T::Item: IsEqual,
-{
-    let state = RandomState::new();
-    let mut map: HashMap<usize, (), ()> =
-        HashMap::with_capacity_and_hasher(array.len(), ());
-
-    let insert_value = |idx| {
-        let value = array.value(idx);
-        let hash = value.hash_one(&state);
-        if let RawEntryMut::Vacant(v) = map
-            .raw_entry_mut()
-            .from_hash(hash, |x| array.value(*x).is_equal(&value))
-        {
-            v.insert_with_hasher(hash, idx, (), |x| array.value(*x).hash_one(&state));
+fn instantiate_static_filter(
+    in_array: ArrayRef,
+) -> Result<Arc<dyn StaticFilter + Send + Sync>> {
+    match in_array.data_type() {
+        // Integer primitive types
+        DataType::Int8 => Ok(Arc::new(Int8StaticFilter::try_new(&in_array)?)),
+        DataType::Int16 => Ok(Arc::new(Int16StaticFilter::try_new(&in_array)?)),
+        DataType::Int32 => Ok(Arc::new(Int32StaticFilter::try_new(&in_array)?)),
+        DataType::Int64 => Ok(Arc::new(Int64StaticFilter::try_new(&in_array)?)),
+        DataType::UInt8 => Ok(Arc::new(UInt8StaticFilter::try_new(&in_array)?)),
+        DataType::UInt16 => Ok(Arc::new(UInt16StaticFilter::try_new(&in_array)?)),
+        DataType::UInt32 => Ok(Arc::new(UInt32StaticFilter::try_new(&in_array)?)),
+        DataType::UInt64 => Ok(Arc::new(UInt64StaticFilter::try_new(&in_array)?)),
+        // Float primitive types (use ordered wrappers for Hash/Eq)
+        DataType::Float32 => Ok(Arc::new(Float32StaticFilter::try_new(&in_array)?)),
+        DataType::Float64 => Ok(Arc::new(Float64StaticFilter::try_new(&in_array)?)),
+        _ => {
+            /* fall through to generic implementation for unsupported types (Struct, etc.) */
+            Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?))
         }
-    };
+    }
+}
 
-    match array.nulls() {
-        Some(nulls) => {
-            BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
-                .for_each(insert_value)
+impl ArrayStaticFilter {
+    /// Computes a [`StaticFilter`] for the provided [`Array`] if there
+    /// are nulls present or there are more than the configured number of
+    /// elements.
+    ///
+    /// Note: This is split into a separate function as higher-rank trait bounds currently
+    /// cause type inference to misbehave
+    fn try_new(in_array: ArrayRef) -> Result<ArrayStaticFilter> {
+        // Null type has no natural order - return empty hash set
+        if in_array.data_type() == &DataType::Null {
+            return Ok(ArrayStaticFilter {
+                in_array,
+                state: RandomState::default(),
+                map: HashMap::with_hasher(()),
+            });
         }
-        None => (0..array.len()).for_each(insert_value),
+
+        let state = RandomState::default();
+        let mut map: HashMap<usize, (), ()> = HashMap::with_hasher(());
+
+        with_hashes([&in_array], &state, |hashes| -> Result<()> {
+            let cmp = make_comparator(&in_array, &in_array, SortOptions::default())?;
+
+            let insert_value = |idx| {
+                let hash = hashes[idx];
+                if let RawEntryMut::Vacant(v) = map
+                    .raw_entry_mut()
+                    .from_hash(hash, |x| cmp(*x, idx).is_eq())
+                {
+                    v.insert_with_hasher(hash, idx, (), |x| hashes[*x]);
+                }
+            };
+
+            match in_array.nulls() {
+                Some(nulls) => {
+                    BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
+                        .for_each(insert_value)
+                }
+                None => (0..in_array.len()).for_each(insert_value),
+            }
+
+            Ok(())
+        })?;
+
+        Ok(Self {
+            in_array,
+            state,
+            map,
+        })
+    }
+}
+
+/// Wrapper for f32 that implements Hash and Eq using bit comparison.
+/// This treats NaN values as equal to each other when they have the same bit pattern.
+#[derive(Clone, Copy)]
+struct OrderedFloat32(f32);
+
+impl Hash for OrderedFloat32 {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.to_ne_bytes().hash(state);
+    }
+}
+
+impl PartialEq for OrderedFloat32 {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.to_bits() == other.0.to_bits()
+    }
+}
+
+impl Eq for OrderedFloat32 {}
+
+impl From<f32> for OrderedFloat32 {
+    fn from(v: f32) -> Self {
+        Self(v)
+    }
+}
+
+/// Wrapper for f64 that implements Hash and Eq using bit comparison.
+/// This treats NaN values as equal to each other when they have the same bit pattern.
+#[derive(Clone, Copy)]
+struct OrderedFloat64(f64);
+
+impl Hash for OrderedFloat64 {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.to_ne_bytes().hash(state);
+    }
+}
+
+impl PartialEq for OrderedFloat64 {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.to_bits() == other.0.to_bits()
     }
+}
+
+impl Eq for OrderedFloat64 {}
 
-    ArrayHashSet { state, map }
+impl From<f64> for OrderedFloat64 {
+    fn from(v: f64) -> Self {
+        Self(v)
+    }
 }
 
-/// Creates a `Box<dyn Set>` for the given list of `IN` expressions and `batch`
-fn make_set(array: &dyn Array) -> Result<Arc<dyn Set>> {
-    Ok(downcast_primitive_array! {
-        array => Arc::new(ArraySet::new(array, make_hash_set(array))),
-        DataType::Boolean => {
-            let array = as_boolean_array(array)?;
-            Arc::new(ArraySet::new(array, make_hash_set(array)))
-        },
-        DataType::Utf8 => {
-            let array = as_string_array(array)?;
-            Arc::new(ArraySet::new(array, make_hash_set(array)))
+// Macro to generate specialized StaticFilter implementations for primitive types
+macro_rules! primitive_static_filter {
+    ($Name:ident, $ArrowType:ty) => {
+        struct $Name {
+            null_count: usize,
+            values: HashSet<<$ArrowType as ArrowPrimitiveType>::Native>,
+        }
+
+        impl $Name {
+            fn try_new(in_array: &ArrayRef) -> Result<Self> {
+                let in_array = in_array
+                    .as_primitive_opt::<$ArrowType>()
+                    .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
+
+                let mut values = HashSet::with_capacity(in_array.len());
+                let null_count = in_array.null_count();
+
+                for v in in_array.iter().flatten() {
+                    values.insert(v);
+                }
+
+                Ok(Self { null_count, values })
+            }
+        }
+
+        impl StaticFilter for $Name {
+            fn null_count(&self) -> usize {
+                self.null_count
+            }
+
+            fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
+                // Handle dictionary arrays by recursing on the values
+                downcast_dictionary_array! {
+                    v => {
+                        let values_contains = self.contains(v.values().as_ref(), negated)?;
+                        let result = take(&values_contains, v.keys(), None)?;
+                        return Ok(downcast_array(result.as_ref()))
+                    }
+                    _ => {}
+                }
+
+                let v = v
+                    .as_primitive_opt::<$ArrowType>()
+                    .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
+
+                let haystack_has_nulls = self.null_count > 0;
+
+                let needle_values = v.values();
+                let needle_nulls = v.nulls();
+                let needle_has_nulls = v.null_count() > 0;
+
+                // Truth table for `value [NOT] IN (set)` with SQL three-valued logic:
+                // ("-" means the value doesn't affect the result)
+                //
+                // | needle_null | haystack_null | negated | in set? | result |
+                // |-------------|---------------|---------|---------|--------|
+                // | true        | -             | false   | -       | null   |
+                // | true        | -             | true    | -       | null   |
+                // | false       | true          | false   | yes     | true   |
+                // | false       | true          | false   | no      | null   |
+                // | false       | true          | true    | yes     | false  |
+                // | false       | true          | true    | no      | null   |
+                // | false       | false         | false   | yes     | true   |
+                // | false       | false         | false   | no      | false  |
+                // | false       | false         | true    | yes     | false  |
+                // | false       | false         | true    | no      | true   |
+
+                // Compute the "contains" result using collect_bool (fast batched approach)
+                // This ignores nulls - we handle them separately
+                let contains_buffer = if negated {
+                    BooleanBuffer::collect_bool(needle_values.len(), |i| {
+                        !self.values.contains(&needle_values[i])
+                    })
+                } else {
+                    BooleanBuffer::collect_bool(needle_values.len(), |i| {
+                        self.values.contains(&needle_values[i])
+                    })
+                };
+
+                // Compute the null mask
+                // Output is null when:
+                // 1. needle value is null, OR
+                // 2. needle value is not in set AND haystack has nulls
+                let result_nulls = match (needle_has_nulls, haystack_has_nulls) {
+                    (false, false) => {
+                        // No nulls anywhere
+                        None
+                    }
+                    (true, false) => {
+                        // Only needle has nulls - just use needle's null mask
+                        needle_nulls.cloned()
+                    }
+                    (false, true) => {
+                        // Only haystack has nulls - result is null when value not in set
+                        // Valid (not null) when original "in set" is true
+                        // For NOT IN: contains_buffer = !original, so validity = !contains_buffer
+                        let validity = if negated {
+                            !&contains_buffer
+                        } else {
+                            contains_buffer.clone()
+                        };
+                        Some(NullBuffer::new(validity))
+                    }
+                    (true, true) => {
+                        // Both have nulls - combine needle nulls with haystack-induced nulls
+                        let needle_validity = needle_nulls.map(|n| n.inner().clone())
+                            .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len()));
+
+                        // Valid when original "in set" is true (see above)
+                        let haystack_validity = if negated {
+                            !&contains_buffer
+                        } else {
+                            contains_buffer.clone()
+                        };
+
+                        // Combined validity: valid only where both are valid
+                        let combined_validity = &needle_validity & &haystack_validity;
+                        Some(NullBuffer::new(combined_validity))
+                    }
+                };
+
+                Ok(BooleanArray::new(contains_buffer, result_nulls))
+            }
         }
-        DataType::LargeUtf8 => {
-            let array = as_largestring_array(array);
-            Arc::new(ArraySet::new(array, make_hash_set(array)))
+    };
+}
+
+// Generate specialized filters for all integer primitive types
+primitive_static_filter!(Int8StaticFilter, Int8Type);
+primitive_static_filter!(Int16StaticFilter, Int16Type);
+primitive_static_filter!(Int32StaticFilter, Int32Type);
+primitive_static_filter!(Int64StaticFilter, Int64Type);
+primitive_static_filter!(UInt8StaticFilter, UInt8Type);
+primitive_static_filter!(UInt16StaticFilter, UInt16Type);
+primitive_static_filter!(UInt32StaticFilter, UInt32Type);
+primitive_static_filter!(UInt64StaticFilter, UInt64Type);
+
+// Macro to generate specialized StaticFilter implementations for float types
+// Floats require a wrapper type (OrderedFloat*) to implement Hash/Eq due to NaN semantics
+macro_rules! float_static_filter {
+    ($Name:ident, $ArrowType:ty, $OrderedType:ty) => {
+        struct $Name {
+            null_count: usize,
+            values: HashSet<$OrderedType>,
         }
-        DataType::Binary => {
-            let array = as_generic_binary_array::<i32>(array)?;
-            Arc::new(ArraySet::new(array, make_hash_set(array)))
+
+        impl $Name {
+            fn try_new(in_array: &ArrayRef) -> Result<Self> {
+                let in_array = in_array
+                    .as_primitive_opt::<$ArrowType>()
+                    .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
+
+                let mut values = HashSet::with_capacity(in_array.len());
+                let null_count = in_array.null_count();
+
+                for v in in_array.iter().flatten() {
+                    values.insert(<$OrderedType>::from(v));
+                }
+
+                Ok(Self { null_count, values })
+            }
         }
-        DataType::LargeBinary => {
-            let array = as_generic_binary_array::<i64>(array)?;
-            Arc::new(ArraySet::new(array, make_hash_set(array)))
+
+        impl StaticFilter for $Name {
+            fn null_count(&self) -> usize {
+                self.null_count
+            }
+
+            fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
+                // Handle dictionary arrays by recursing on the values
+                downcast_dictionary_array! {
+                    v => {
+                        let values_contains = self.contains(v.values().as_ref(), negated)?;
+                        let result = take(&values_contains, v.keys(), None)?;
+                        return Ok(downcast_array(result.as_ref()))
+                    }
+                    _ => {}
+                }
+
+                let v = v
+                    .as_primitive_opt::<$ArrowType>()
+                    .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
+
+                let haystack_has_nulls = self.null_count > 0;
+
+                let needle_values = v.values();
+                let needle_nulls = v.nulls();
+                let needle_has_nulls = v.null_count() > 0;
+
+                // Truth table for `value [NOT] IN (set)` with SQL three-valued logic:
+                // ("-" means the value doesn't affect the result)
+                //
+                // | needle_null | haystack_null | negated | in set? | result |
+                // |-------------|---------------|---------|---------|--------|
+                // | true        | -             | false   | -       | null   |
+                // | true        | -             | true    | -       | null   |
+                // | false       | true          | false   | yes     | true   |
+                // | false       | true          | false   | no      | null   |
+                // | false       | true          | true    | yes     | false  |
+                // | false       | true          | true    | no      | null   |
+                // | false       | false         | false   | yes     | true   |
+                // | false       | false         | false   | no      | false  |
+                // | false       | false         | true    | yes     | false  |
+                // | false       | false         | true    | no      | true   |
+
+                // Compute the "contains" result using collect_bool (fast batched approach)
+                // This ignores nulls - we handle them separately
+                let contains_buffer = if negated {
+                    BooleanBuffer::collect_bool(needle_values.len(), |i| {
+                        !self.values.contains(&<$OrderedType>::from(needle_values[i]))
+                    })
+                } else {
+                    BooleanBuffer::collect_bool(needle_values.len(), |i| {
+                        self.values.contains(&<$OrderedType>::from(needle_values[i]))
+                    })
+                };
+
+                // Compute the null mask
+                // Output is null when:
+                // 1. needle value is null, OR
+                // 2. needle value is not in set AND haystack has nulls
+                let result_nulls = match (needle_has_nulls, haystack_has_nulls) {
+                    (false, false) => {
+                        // No nulls anywhere
+                        None
+                    }
+                    (true, false) => {
+                        // Only needle has nulls - just use needle's null mask
+                        needle_nulls.cloned()
+                    }
+                    (false, true) => {
+                        // Only haystack has nulls - result is null when value not in set
+                        // Valid (not null) when original "in set" is true
+                        // For NOT IN: contains_buffer = !original, so validity = !contains_buffer
+                        let validity = if negated {
+                            !&contains_buffer
+                        } else {
+                            contains_buffer.clone()
+                        };
+                        Some(NullBuffer::new(validity))
+                    }
+                    (true, true) => {
+                        // Both have nulls - combine needle nulls with haystack-induced nulls
+                        let needle_validity = needle_nulls.map(|n| n.inner().clone())
+                            .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len()));
+
+                        // Valid when original "in set" is true (see above)
+                        let haystack_validity = if negated {
+                            !&contains_buffer
+                        } else {
+                            contains_buffer.clone()
+                        };
+
+                        // Combined validity: valid only where both are valid
+                        let combined_validity = &needle_validity & &haystack_validity;
+                        Some(NullBuffer::new(combined_validity))
+                    }
+                };
+
+                Ok(BooleanArray::new(contains_buffer, result_nulls))
+            }
         }
-        DataType::Dictionary(_, _) => unreachable!("dictionary should have been flattened"),
-        d => return not_impl_err!("DataType::{d} not supported in InList")
-    })
+    };
 }
 
+// Generate specialized filters for float types using ordered wrappers
+float_static_filter!(Float32StaticFilter, Float32Type, OrderedFloat32);
+float_static_filter!(Float64StaticFilter, Float64Type, OrderedFloat64);
+
 /// Evaluates the list of expressions into an array, flattening any dictionaries
 fn evaluate_list(
     list: &[Arc<dyn PhysicalExpr>],
@@ -231,56 +577,37 @@ fn evaluate_list(
     ScalarValue::iter_to_array(scalars)
 }
 
-fn try_cast_static_filter_to_set(
+/// Try to evaluate a list of expressions as constants.
+///
+/// Returns:
+/// - `Ok(Some(ArrayRef))` if all expressions are constants (can be evaluated on an empty RecordBatch)
+/// - `Ok(None)` if the list contains non-constant expressions
+/// - `Err(...)` only for actual errors (not for non-constant expressions)
+///
+/// This is used to detect when a list contains only literals, casts of literals,
+/// or other constant expressions.
+fn try_evaluate_constant_list(
     list: &[Arc<dyn PhysicalExpr>],
     schema: &Schema,
-) -> Result<Arc<dyn Set>> {
+) -> Result<Option<ArrayRef>> {
     let batch = RecordBatch::new_empty(Arc::new(schema.clone()));
-    make_set(evaluate_list(list, &batch)?.as_ref())
-}
-
-/// Custom equality check function which is used with [`ArrayHashSet`] for existence check.
-trait IsEqual: HashValue {
-    fn is_equal(&self, other: &Self) -> bool;
-}
-
-impl<T: IsEqual + ?Sized> IsEqual for &T {
-    fn is_equal(&self, other: &Self) -> bool {
-        T::is_equal(self, other)
+    match evaluate_list(list, &batch) {
+        Ok(array) => Ok(Some(array)),
+        Err(_) => {
+            // Non-constant expressions can't be evaluated on an empty batch
+            // This is not an error, just means we can't use a static filter
+            Ok(None)
+        }
     }
 }
 
-macro_rules! is_equal {
-    ($($t:ty),+) => {
-        $(impl IsEqual for $t {
-            fn is_equal(&self, other: &Self) -> bool {
-                self == other
-            }
-        })*
-    };
-}
-is_equal!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
-is_equal!(bool, str, [u8]);
-is_equal!(IntervalDayTime, IntervalMonthDayNano);
-
-macro_rules! is_equal_float {
-    ($($t:ty),+) => {
-        $(impl IsEqual for $t {
-            fn is_equal(&self, other: &Self) -> bool {
-                self.to_bits() == other.to_bits()
-            }
-        })*
-    };
-}
-is_equal_float!(half::f16, f32, f64);
-
 impl InListExpr {
     /// Create a new InList expression
-    pub fn new(
+    fn new(
         expr: Arc<dyn PhysicalExpr>,
         list: Vec<Arc<dyn PhysicalExpr>>,
         negated: bool,
-        static_filter: Option<Arc<dyn Set>>,
+        static_filter: Option<Arc<dyn StaticFilter + Send + Sync>>,
     ) -> Self {
         Self {
             expr,
@@ -300,24 +627,99 @@ impl InListExpr {
         &self.list
     }
 
+    pub fn is_empty(&self) -> bool {
+        self.list.is_empty()
+    }
+
+    pub fn len(&self) -> usize {
+        self.list.len()
+    }
+
     /// Is this negated e.g. NOT IN LIST
     pub fn negated(&self) -> bool {
         self.negated
     }
-}
 
+    /// Create a new InList expression directly from an array, bypassing expression evaluation.
+    ///
+    /// This is more efficient than `in_list()` when you already have the list as an array,
+    /// as it avoids the conversion: `ArrayRef -> Vec<PhysicalExpr> -> ArrayRef -> StaticFilter`.
+    /// Instead it goes directly: `ArrayRef -> StaticFilter`.
+    ///
+    /// The `list` field will be empty when using this constructor, as the array is stored
+    /// directly in the static filter.
+    ///
+    /// This does not make the expression any more performant at runtime, but it does make it slightly
+    /// cheaper to build.
+    pub fn try_new_from_array(
+        expr: Arc<dyn PhysicalExpr>,
+        array: ArrayRef,
+        negated: bool,
+    ) -> Result<Self> {
+        let list = (0..array.len())
+            .map(|i| {
+                let scalar = ScalarValue::try_from_array(array.as_ref(), i)?;
+                Ok(crate::expressions::lit(scalar) as Arc<dyn PhysicalExpr>)
+            })
+            .collect::<Result<Vec<_>>>()?;
+        Ok(Self::new(
+            expr,
+            list,
+            negated,
+            Some(instantiate_static_filter(array)?),
+        ))
+    }
+
+    /// Create a new InList expression, using a static filter when possible.
+    ///
+    /// This validates data types and attempts to create a static filter for constant
+    /// list expressions. Uses specialized StaticFilter implementations for better
+    /// performance (e.g., Int32StaticFilter for Int32).
+    ///
+    /// Returns an error if data types don't match. If the list contains non-constant
+    /// expressions, falls back to dynamic evaluation at runtime.
+    pub fn try_new(
+        expr: Arc<dyn PhysicalExpr>,
+        list: Vec<Arc<dyn PhysicalExpr>>,
+        negated: bool,
+        schema: &Schema,
+    ) -> Result<Self> {
+        // Check the data types match
+        let expr_data_type = expr.data_type(schema)?;
+        for list_expr in list.iter() {
+            let list_expr_data_type = list_expr.data_type(schema)?;
+            assert_or_internal_err!(
+                DFSchema::datatype_is_logically_equal(
+                    &expr_data_type,
+                    &list_expr_data_type
+                ),
+                "The data type inlist should be same, the value type is {expr_data_type}, one of list expr type is {list_expr_data_type}"
+            );
+        }
+
+        // Try to create a static filter if all list expressions are constants
+        let static_filter = match try_evaluate_constant_list(&list, schema)? {
+            Some(in_array) => Some(instantiate_static_filter(in_array)?),
+            None => None, // Non-constant expressions, fall back to dynamic evaluation
+        };
+
+        Ok(Self::new(expr, list, negated, static_filter))
+    }
+}
 impl std::fmt::Display for InListExpr {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let list = expr_vec_fmt!(self.list);
+
         if self.negated {
             if self.static_filter.is_some() {
-                write!(f, "{} NOT IN (SET) ({:?})", self.expr, self.list)
+                write!(f, "{} NOT IN (SET) ([{list}])", self.expr)
             } else {
-                write!(f, "{} NOT IN ({:?})", self.expr, self.list)
+                write!(f, "{} NOT IN ([{list}])", self.expr)
             }
         } else if self.static_filter.is_some() {
-            write!(f, "Use {} IN (SET) ({:?})", self.expr, self.list)
+            write!(f, "{} IN (SET) ([{list}])", self.expr)
         } else {
-            write!(f, "{} IN ({:?})", self.expr, self.list)
+            write!(f, "{} IN ([{list}])", self.expr)
         }
     }
 }
@@ -338,7 +740,7 @@ impl PhysicalExpr for InListExpr {
         }
 
         if let Some(static_filter) = &self.static_filter {
-            Ok(static_filter.has_nulls())
+            Ok(static_filter.null_count() > 0)
         } else {
             for expr in &self.list {
                 if expr.nullable(input_schema)? {
@@ -353,35 +755,125 @@ impl PhysicalExpr for InListExpr {
         let num_rows = batch.num_rows();
         let value = self.expr.evaluate(batch)?;
         let r = match &self.static_filter {
-            Some(f) => f.contains(value.into_array(num_rows)?.as_ref(), self.negated)?,
+            Some(filter) => {
+                match value {
+                    ColumnarValue::Array(array) => {
+                        filter.contains(&array, self.negated)?
+                    }
+                    ColumnarValue::Scalar(scalar) => {
+                        if scalar.is_null() {
+                            // SQL three-valued logic: null IN (...) is always null
+                            // The code below would handle this correctly but this is a faster path
+                            let nulls = NullBuffer::new_null(num_rows);
+                            return Ok(ColumnarValue::Array(Arc::new(
+                                BooleanArray::new(
+                                    BooleanBuffer::new_unset(num_rows),
+                                    Some(nulls),
+                                ),
+                            )));
+                        }
+                        // Use a 1 row array to avoid code duplication/branching
+                        // Since all we do is compute hash and lookup this should be efficient enough
+                        let array = scalar.to_array()?;
+                        let result_array =
+                            filter.contains(array.as_ref(), self.negated)?;
+                        // Broadcast the single result to all rows
+                        // Must check is_null() to preserve NULL values (SQL three-valued logic)
+                        if result_array.is_null(0) {
+                            let nulls = NullBuffer::new_null(num_rows);
+                            BooleanArray::new(
+                                BooleanBuffer::new_unset(num_rows),
+                                Some(nulls),
+                            )
+                        } else if result_array.value(0) {
+                            BooleanArray::new(BooleanBuffer::new_set(num_rows), None)
+                        } else {
+                            BooleanArray::new(BooleanBuffer::new_unset(num_rows), None)
+                        }
+                    }
+                }
+            }
             None => {
+                // No static filter: iterate through each expression, compare, and OR results.
+                // Use Arrow's vectorized eq kernel for types it supports (primitive,
+                // boolean, string, binary, dictionary), falling back to row-by-row
+                // comparator for unsupported types (nested, RunEndEncoded, etc.).
                 let value = value.into_array(num_rows)?;
-                let is_nested = value.data_type().is_nested();
-                let found = self.list.iter().map(|expr| expr.evaluate(batch)).try_fold(
-                    BooleanArray::new(BooleanBuffer::new_unset(num_rows), None),
-                    |result, expr| -> Result<BooleanArray> {
-                        let rhs = compare_with_eq(
-                            &value,
-                            &expr?.into_array(num_rows)?,
-                            is_nested,
-                        )?;
-                        Ok(or_kleene(&result, &rhs)?)
-                    },
-                )?;
-
-                if self.negated {
-                    not(&found)?
+                let lhs_supports_arrow_eq = supports_arrow_eq(value.data_type());
+
+                // Helper: compare value against a single list expression
+                let compare_one = |expr: &Arc<dyn PhysicalExpr>| -> Result<BooleanArray> {
+                    match expr.evaluate(batch)? {
+                        ColumnarValue::Array(array) => {
+                            if lhs_supports_arrow_eq
+                                && supports_arrow_eq(array.data_type())
+                            {
+                                Ok(arrow_eq(&value, &array)?)
+                            } else {
+                                let cmp = make_comparator(
+                                    value.as_ref(),
+                                    array.as_ref(),
+                                    SortOptions::default(),
+                                )?;
+                                let buffer = BooleanBuffer::collect_bool(num_rows, |i| {
+                                    cmp(i, i).is_eq()
+                                });
+                                let nulls =
+                                    NullBuffer::union(value.nulls(), array.nulls());
+                                Ok(BooleanArray::new(buffer, nulls))
+                            }
+                        }
+                        ColumnarValue::Scalar(scalar) => {
+                            // Check if scalar is null once, before the loop
+                            if scalar.is_null() {
+                                // If scalar is null, all comparisons return null
+                                Ok(BooleanArray::from(vec![None; num_rows]))
+                            } else if lhs_supports_arrow_eq {
+                                let scalar_datum = scalar.to_scalar()?;
+                                Ok(arrow_eq(&value, &scalar_datum)?)
+                            } else {
+                                // Convert scalar to 1-element array
+                                let array = scalar.to_array()?;
+                                let cmp = make_comparator(
+                                    value.as_ref(),
+                                    array.as_ref(),
+                                    SortOptions::default(),
+                                )?;
+                                // Compare each row of value with the single scalar element
+                                let buffer = BooleanBuffer::collect_bool(num_rows, |i| {
+                                    cmp(i, 0).is_eq()
+                                });
+                                Ok(BooleanArray::new(buffer, value.nulls().cloned()))
+                            }
+                        }
+                    }
+                };
+
+                // Evaluate first expression directly to avoid a redundant
+                // or_kleene with an all-false accumulator.
+                let mut found = if let Some(first) = self.list.first() {
+                    compare_one(first)?
                 } else {
-                    found
+                    BooleanArray::new(BooleanBuffer::new_unset(num_rows), None)
+                };
+
+                for expr in self.list.iter().skip(1) {
+                    // Short-circuit: if every non-null row is already true,
+                    // no further list items can change the result.
+                    if found.null_count() == 0 && found.true_count() == num_rows {
+                        break;
+                    }
+                    found = or_kleene(&found, &compare_one(expr)?)?;
                 }
+
+                if self.negated { not(&found)? } else { found }
             }
         };
         Ok(ColumnarValue::Array(Arc::new(r)))
     }
 
     fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
-        let mut children = vec![];
-        children.push(&self.expr);
+        let mut children = vec![&self.expr];
         children.extend(&self.list);
         children
     }
@@ -395,7 +887,7 @@ impl PhysicalExpr for InListExpr {
             Arc::clone(&children[0]),
             children[1..].to_vec(),
             self.negated,
-            self.static_filter.clone(),
+            self.static_filter.as_ref().map(Arc::clone),
         )))
     }
 
@@ -430,8 +922,8 @@ impl Hash for InListExpr {
     fn hash<H: Hasher>(&self, state: &mut H) {
         self.expr.hash(state);
         self.negated.hash(state);
-        self.list.hash(state);
         // Add `self.static_filter` when hash is available
+        self.list.hash(state);
     }
 }
 
@@ -442,34 +934,20 @@ pub fn in_list(
     negated: &bool,
     schema: &Schema,
 ) -> Result<Arc<dyn PhysicalExpr>> {
-    // check the data type
-    let expr_data_type = expr.data_type(schema)?;
-    for list_expr in list.iter() {
-        let list_expr_data_type = list_expr.data_type(schema)?;
-        if !DFSchema::datatype_is_logically_equal(&expr_data_type, &list_expr_data_type) {
-            return internal_err!(
-                "The data type inlist should be same, the value type is {expr_data_type}, one of list expr type is {list_expr_data_type}"
-            );
-        }
-    }
-    let static_filter = try_cast_static_filter_to_set(&list, schema).ok();
-    Ok(Arc::new(InListExpr::new(
-        expr,
-        list,
-        *negated,
-        static_filter,
-    )))
+    Ok(Arc::new(InListExpr::try_new(expr, list, *negated, schema)?))
 }
 
 #[cfg(test)]
 mod tests {
-
     use super::*;
-    use crate::expressions;
     use crate::expressions::{col, lit, try_cast};
+    use arrow::buffer::NullBuffer;
+    use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano, i256};
     use datafusion_common::plan_err;
     use datafusion_expr::type_coercion::binary::comparison_coercion;
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
+    use insta::assert_snapshot;
+    use itertools::Itertools;
 
     type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
 
@@ -488,7 +966,8 @@ mod tests {
         let result_type = get_coerce_type(expr_type, &list_types);
         match result_type {
             None => plan_err!(
-                "Can not find compatible types to compare {expr_type:?} with {list_types:?}"
+                "Can not find compatible types to compare {expr_type} with [{}]",
+                list_types.iter().join(", ")
             ),
             Some(data_type) => {
                 // find the coerced type
@@ -514,7 +993,18 @@ mod tests {
             })
     }
 
-    // applies the in_list expr to an input batch and list
+    /// Test helper macro that evaluates an IN LIST expression with automatic type casting.
+    ///
+    /// # Parameters
+    /// - `$BATCH`: The `RecordBatch` containing the input data to evaluate against
+    /// - `$LIST`: A `Vec<Arc<dyn PhysicalExpr>>` of literal expressions representing the IN list values
+    /// - `$NEGATED`: A `&bool` indicating whether this is a NOT IN operation (true) or IN operation (false)
+    /// - `$EXPECTED`: A `Vec<Option<bool>>` representing the expected boolean results for each row
+    /// - `$COL`: An `Arc<dyn PhysicalExpr>` representing the column expression to evaluate
+    /// - `$SCHEMA`: A `&Schema` reference for the input batch
+    ///
+    /// This macro first applies type casting to the column and list expressions to ensure
+    /// type compatibility, then delegates to `in_list_raw!` to perform the evaluation and assertion.
     macro_rules! in_list {
         ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{
             let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?;
@@ -529,220 +1019,453 @@ mod tests {
         }};
     }
 
-    // applies the in_list expr to an input batch and list without cast
+    /// Test helper macro that evaluates an IN LIST expression without automatic type casting.
+    ///
+    /// # Parameters
+    /// - `$BATCH`: The `RecordBatch` containing the input data to evaluate against
+    /// - `$LIST`: A `Vec<Arc<dyn PhysicalExpr>>` of literal expressions representing the IN list values
+    /// - `$NEGATED`: A `&bool` indicating whether this is a NOT IN operation (true) or IN operation (false)
+    /// - `$EXPECTED`: A `Vec<Option<bool>>` representing the expected boolean results for each row
+    /// - `$COL`: An `Arc<dyn PhysicalExpr>` representing the column expression to evaluate
+    /// - `$SCHEMA`: A `&Schema` reference for the input batch
+    ///
+    /// This macro creates an IN LIST expression, evaluates it against the batch, converts the result
+    /// to a `BooleanArray`, and asserts that it matches the expected output. Use this when the column
+    /// and list expressions are already the correct types and don't require casting.
     macro_rules! in_list_raw {
         ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{
-            let expr = in_list($COL, $LIST, $NEGATED, $SCHEMA).unwrap();
+            let col_expr = $COL;
+            let expr = in_list(Arc::clone(&col_expr), $LIST, $NEGATED, $SCHEMA).unwrap();
             let result = expr
                 .evaluate(&$BATCH)?
                 .into_array($BATCH.num_rows())
                 .expect("Failed to convert to array");
-            let result =
-                as_boolean_array(&result).expect("failed to downcast to BooleanArray");
+            let result = as_boolean_array(&result);
             let expected = &BooleanArray::from($EXPECTED);
-            assert_eq!(expected, result);
+            assert_eq!(
+                expected,
+                result,
+                "Failed for: {}\n{}: {:?}",
+                fmt_sql(expr.as_ref()),
+                fmt_sql(col_expr.as_ref()),
+                col_expr
+                    .evaluate(&$BATCH)?
+                    .into_array($BATCH.num_rows())
+                    .unwrap()
+            );
         }};
     }
 
+    /// Test case for primitive types following the standard IN LIST pattern.
+    ///
+    /// Each test case represents a data type with:
+    /// - `value_in`: A value that appears in both the test array and the IN list (matches → true)
+    /// - `value_not_in`: A value that appears in the test array but NOT in the IN list (doesn't match → false)
+    /// - `other_list_values`: Additional values in the IN list besides `value_in`
+    /// - `null_value`: Optional null scalar value for NULL handling tests. When None, tests
+    ///   without nulls are run, exercising the `(false, false)` and `(false, true)` branches.
+    struct InListPrimitiveTestCase {
+        name: &'static str,
+        value_in: ScalarValue,
+        value_not_in: ScalarValue,
+        other_list_values: Vec<ScalarValue>,
+        null_value: Option<ScalarValue>,
+    }
+
+    /// Generic test data struct for primitive types.
+    ///
+    /// Holds test values needed for IN LIST tests, allowing the data
+    /// to be declared explicitly and reused across multiple types.
+    #[derive(Clone)]
+    struct PrimitiveTestCaseData<T> {
+        value_in: T,
+        value_not_in: T,
+        other_list_values: Vec<T>,
+    }
+
+    /// Helper to create test cases for any primitive type using generic data.
+    ///
+    /// Uses TryInto for flexible type conversion, allowing test data to be
+    /// declared in any convertible type (e.g., i32 for all integer types).
+    /// Creates a test case WITH null support (for null handling tests).
+    fn primitive_test_case<T, D, F>(
+        name: &'static str,
+        constructor: F,
+        data: PrimitiveTestCaseData<D>,
+    ) -> InListPrimitiveTestCase
+    where
+        D: TryInto<T> + Clone,
+        <D as TryInto<T>>::Error: Debug,
+        F: Fn(Option<T>) -> ScalarValue,
+        T: Clone,
+    {
+        InListPrimitiveTestCase {
+            name,
+            value_in: constructor(Some(data.value_in.try_into().unwrap())),
+            value_not_in: constructor(Some(data.value_not_in.try_into().unwrap())),
+            other_list_values: data
+                .other_list_values
+                .into_iter()
+                .map(|v| constructor(Some(v.try_into().unwrap())))
+                .collect(),
+            null_value: Some(constructor(None)),
+        }
+    }
+
+    /// Helper to create test cases WITHOUT null support.
+    /// These test cases exercise the `(false, true)` branch (no nulls, negated).
+    fn primitive_test_case_no_nulls<T, D, F>(
+        name: &'static str,
+        constructor: F,
+        data: PrimitiveTestCaseData<D>,
+    ) -> InListPrimitiveTestCase
+    where
+        D: TryInto<T> + Clone,
+        <D as TryInto<T>>::Error: Debug,
+        F: Fn(Option<T>) -> ScalarValue,
+        T: Clone,
+    {
+        InListPrimitiveTestCase {
+            name,
+            value_in: constructor(Some(data.value_in.try_into().unwrap())),
+            value_not_in: constructor(Some(data.value_not_in.try_into().unwrap())),
+            other_list_values: data
+                .other_list_values
+                .into_iter()
+                .map(|v| constructor(Some(v.try_into().unwrap())))
+                .collect(),
+            null_value: None,
+        }
+    }
+
+    /// Runs test cases for multiple types, providing detailed SQL error messages on failure.
+    ///
+    /// For each test case, runs IN LIST scenarios based on whether null_value is Some or None:
+    /// - With null_value (Some): 4 tests including null handling
+    /// - Without null_value (None): 2 tests exercising the no-nulls paths
+    fn run_test_cases(test_cases: Vec<InListPrimitiveTestCase>) -> Result<()> {
+        for test_case in test_cases {
+            let test_name = test_case.name;
+
+            // Get the data type from the scalar value
+            let data_type = test_case.value_in.data_type();
+
+            // Build the base list: [value_in, ...other_list_values]
+            let build_base_list = || -> Vec<Arc<dyn PhysicalExpr>> {
+                let mut list = vec![lit(test_case.value_in.clone())];
+                list.extend(test_case.other_list_values.iter().map(|v| lit(v.clone())));
+                list
+            };
+
+            match &test_case.null_value {
+                Some(null_val) => {
+                    // Tests WITH nulls in the needle array
+                    let schema =
+                        Schema::new(vec![Field::new("a", data_type.clone(), true)]);
+
+                    // Create array from scalar values: [value_in, value_not_in, None]
+                    let array = ScalarValue::iter_to_array(vec![
+                        test_case.value_in.clone(),
+                        test_case.value_not_in.clone(),
+                        null_val.clone(),
+                    ])?;
+
+                    let col_a = col("a", &schema)?;
+                    let batch = RecordBatch::try_new(
+                        Arc::new(schema.clone()),
+                        vec![Arc::clone(&array)],
+                    )?;
+
+                    // Test 1: a IN (list) → [true, false, null]
+                    let list = build_base_list();
+                    in_list!(
+                        batch,
+                        list,
+                        &false,
+                        vec![Some(true), Some(false), None],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+
+                    // Test 2: a NOT IN (list) → [false, true, null]
+                    let list = build_base_list();
+                    in_list!(
+                        batch,
+                        list,
+                        &true,
+                        vec![Some(false), Some(true), None],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+
+                    // Test 3: a IN (list, NULL) → [true, null, null]
+                    let mut list = build_base_list();
+                    list.push(lit(null_val.clone()));
+                    in_list!(
+                        batch,
+                        list,
+                        &false,
+                        vec![Some(true), None, None],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+
+                    // Test 4: a NOT IN (list, NULL) → [false, null, null]
+                    let mut list = build_base_list();
+                    list.push(lit(null_val.clone()));
+                    in_list!(
+                        batch,
+                        list,
+                        &true,
+                        vec![Some(false), None, None],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+                }
+                None => {
+                    // Tests WITHOUT nulls - exercises the (false, false) and (false, true) branches
+                    let schema =
+                        Schema::new(vec![Field::new("a", data_type.clone(), false)]);
+
+                    // Create array from scalar values: [value_in, value_not_in] (no NULL)
+                    let array = ScalarValue::iter_to_array(vec![
+                        test_case.value_in.clone(),
+                        test_case.value_not_in.clone(),
+                    ])?;
+
+                    let col_a = col("a", &schema)?;
+                    let batch = RecordBatch::try_new(
+                        Arc::new(schema.clone()),
+                        vec![Arc::clone(&array)],
+                    )?;
+
+                    // Test 1: a IN (list) → [true, false] - exercises (false, false) branch
+                    let list = build_base_list();
+                    in_list!(
+                        batch,
+                        list,
+                        &false,
+                        vec![Some(true), Some(false)],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+
+                    // Test 2: a NOT IN (list) → [false, true] - exercises (false, true) branch
+                    let list = build_base_list();
+                    in_list!(
+                        batch,
+                        list,
+                        &true,
+                        vec![Some(false), Some(true)],
+                        Arc::clone(&col_a),
+                        &schema
+                    );
+
+                    eprintln!(
+                        "Test '{test_name}': exercised (false, true) branch (no nulls, negated)",
+                    );
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Test IN LIST for all integer types (Int8/16/32/64, UInt8/16/32/64).
+    ///
+    /// Test data: 0 (in list), 2 (not in list), [1, 3, 5] (other list values)
     #[test]
-    fn in_list_utf8() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
-        let a = StringArray::from(vec![Some("a"), Some("d"), None]);
+    fn in_list_int_types() -> Result<()> {
+        let int_data = PrimitiveTestCaseData {
+            value_in: 0,
+            value_not_in: 2,
+            other_list_values: vec![1, 3, 5],
+        };
+
+        run_test_cases(vec![
+            // Tests WITH nulls
+            primitive_test_case("int8", ScalarValue::Int8, int_data.clone()),
+            primitive_test_case("int16", ScalarValue::Int16, int_data.clone()),
+            primitive_test_case("int32", ScalarValue::Int32, int_data.clone()),
+            primitive_test_case("int64", ScalarValue::Int64, int_data.clone()),
+            primitive_test_case("uint8", ScalarValue::UInt8, int_data.clone()),
+            primitive_test_case("uint16", ScalarValue::UInt16, int_data.clone()),
+            primitive_test_case("uint32", ScalarValue::UInt32, int_data.clone()),
+            primitive_test_case("uint64", ScalarValue::UInt64, int_data.clone()),
+            // Tests WITHOUT nulls - exercises (false, true) branch
+            primitive_test_case_no_nulls("int32_no_nulls", ScalarValue::Int32, int_data),
+        ])
+    }
+
+    /// Test IN LIST for all string types (Utf8, LargeUtf8, Utf8View).
+    ///
+    /// Test data: "a" (in list), "d" (not in list), ["b", "c"] (other list values)
+    #[test]
+    fn in_list_string_types() -> Result<()> {
+        let string_data = PrimitiveTestCaseData {
+            value_in: "a",
+            value_not_in: "d",
+            other_list_values: vec!["b", "c"],
+        };
+
+        run_test_cases(vec![
+            primitive_test_case("utf8", ScalarValue::Utf8, string_data.clone()),
+            primitive_test_case(
+                "large_utf8",
+                ScalarValue::LargeUtf8,
+                string_data.clone(),
+            ),
+            primitive_test_case("utf8_view", ScalarValue::Utf8View, string_data),
+        ])
+    }
+
+    /// Test IN LIST for all binary types (Binary, LargeBinary, BinaryView).
+    ///
+    /// Test data: [1,2,3] (in list), [1,2,2] (not in list), [[4,5,6], [7,8,9]] (other list values)
+    #[test]
+    fn in_list_binary_types() -> Result<()> {
+        let binary_data = PrimitiveTestCaseData {
+            value_in: vec![1_u8, 2, 3],
+            value_not_in: vec![1_u8, 2, 2],
+            other_list_values: vec![vec![4_u8, 5, 6], vec![7_u8, 8, 9]],
+        };
+
+        run_test_cases(vec![
+            primitive_test_case("binary", ScalarValue::Binary, binary_data.clone()),
+            primitive_test_case(
+                "large_binary",
+                ScalarValue::LargeBinary,
+                binary_data.clone(),
+            ),
+            primitive_test_case("binary_view", ScalarValue::BinaryView, binary_data),
+        ])
+    }
+
+    /// Test IN LIST for date types (Date32, Date64).
+    ///
+    /// Test data: 0 (in list), 2 (not in list), [1, 3] (other list values)
+    #[test]
+    fn in_list_date_types() -> Result<()> {
+        let date_data = PrimitiveTestCaseData {
+            value_in: 0,
+            value_not_in: 2,
+            other_list_values: vec![1, 3],
+        };
+
+        run_test_cases(vec![
+            primitive_test_case("date32", ScalarValue::Date32, date_data.clone()),
+            primitive_test_case("date64", ScalarValue::Date64, date_data),
+        ])
+    }
+
+    /// Test IN LIST for Decimal128 type.
+    ///
+    /// Test data: 0 (in list), 200 (not in list), [100, 300] (other list values) with precision=10, scale=2
+    #[test]
+    fn in_list_decimal() -> Result<()> {
+        run_test_cases(vec![InListPrimitiveTestCase {
+            name: "decimal128",
+            value_in: ScalarValue::Decimal128(Some(0), 10, 2),
+            value_not_in: ScalarValue::Decimal128(Some(200), 10, 2),
+            other_list_values: vec![
+                ScalarValue::Decimal128(Some(100), 10, 2),
+                ScalarValue::Decimal128(Some(300), 10, 2),
+            ],
+            null_value: Some(ScalarValue::Decimal128(None, 10, 2)),
+        }])
+    }
+
+    /// Test IN LIST for timestamp types.
+    ///
+    /// Test data: 0 (in list), 2000 (not in list), [1000, 3000] (other list values)
+    #[test]
+    fn in_list_timestamp_types() -> Result<()> {
+        run_test_cases(vec![
+            InListPrimitiveTestCase {
+                name: "timestamp_nanosecond",
+                value_in: ScalarValue::TimestampNanosecond(Some(0), None),
+                value_not_in: ScalarValue::TimestampNanosecond(Some(2000), None),
+                other_list_values: vec![
+                    ScalarValue::TimestampNanosecond(Some(1000), None),
+                    ScalarValue::TimestampNanosecond(Some(3000), None),
+                ],
+                null_value: Some(ScalarValue::TimestampNanosecond(None, None)),
+            },
+            InListPrimitiveTestCase {
+                name: "timestamp_millisecond_with_tz",
+                value_in: ScalarValue::TimestampMillisecond(
+                    Some(1500000),
+                    Some("+05:00".into()),
+                ),
+                value_not_in: ScalarValue::TimestampMillisecond(
+                    Some(2500000),
+                    Some("+05:00".into()),
+                ),
+                other_list_values: vec![ScalarValue::TimestampMillisecond(
+                    Some(3500000),
+                    Some("+05:00".into()),
+                )],
+                null_value: Some(ScalarValue::TimestampMillisecond(
+                    None,
+                    Some("+05:00".into()),
+                )),
+            },
+            InListPrimitiveTestCase {
+                name: "timestamp_millisecond_mixed_tz",
+                value_in: ScalarValue::TimestampMillisecond(
+                    Some(1500000),
+                    Some("+05:00".into()),
+                ),
+                value_not_in: ScalarValue::TimestampMillisecond(
+                    Some(2500000),
+                    Some("+05:00".into()),
+                ),
+                other_list_values: vec![
+                    ScalarValue::TimestampMillisecond(
+                        Some(3500000),
+                        Some("+01:00".into()),
+                    ),
+                    ScalarValue::TimestampMillisecond(Some(4500000), Some("UTC".into())),
+                ],
+                null_value: Some(ScalarValue::TimestampMillisecond(
+                    None,
+                    Some("+05:00".into()),
+                )),
+            },
+        ])
+    }
+
+    #[test]
+    fn in_list_float64() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]);
+        let a = Float64Array::from(vec![
+            Some(0.0),
+            Some(0.2),
+            None,
+            Some(f64::NAN),
+            Some(-f64::NAN),
+        ]);
         let col_a = col("a", &schema)?;
         let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
 
-        // expression: "a in ("a", "b")"
-        let list = vec![lit("a"), lit("b")];
+        // expression: "a in (0.0, 0.1)"
+        let list = vec![lit(0.0f64), lit(0.1f64)];
         in_list!(
             batch,
             list,
             &false,
-            vec![Some(true), Some(false), None],
+            vec![Some(true), Some(false), None, Some(false), Some(false)],
             Arc::clone(&col_a),
             &schema
         );
 
-        // expression: "a not in ("a", "b")"
-        let list = vec![lit("a"), lit("b")];
+        // expression: "a not in (0.0, 0.1)"
+        let list = vec![lit(0.0f64), lit(0.1f64)];
         in_list!(
             batch,
             list,
             &true,
-            vec![Some(false), Some(true), None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a in ("a", "b", null)"
-        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a not in ("a", "b", null)"
-        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn in_list_binary() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Binary, true)]);
-        let a = BinaryArray::from(vec![
-            Some([1, 2, 3].as_slice()),
-            Some([1, 2, 2].as_slice()),
-            None,
-        ]);
-        let col_a = col("a", &schema)?;
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
-
-        // expression: "a in ([1, 2, 3], [4, 5, 6])"
-        let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())];
-        in_list!(
-            batch,
-            list.clone(),
-            &false,
-            vec![Some(true), Some(false), None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a not in ([1, 2, 3], [4, 5, 6])"
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), Some(true), None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a in ([1, 2, 3], [4, 5, 6], null)"
-        let list = vec![
-            lit([1, 2, 3].as_slice()),
-            lit([4, 5, 6].as_slice()),
-            lit(ScalarValue::Binary(None)),
-        ];
-        in_list!(
-            batch,
-            list.clone(),
-            &false,
-            vec![Some(true), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a in ([1, 2, 3], [4, 5, 6], null)"
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn in_list_int64() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
-        let a = Int64Array::from(vec![Some(0), Some(2), None]);
-        let col_a = col("a", &schema)?;
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
-
-        // expression: "a in (0, 1)"
-        let list = vec![lit(0i64), lit(1i64)];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), Some(false), None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a not in (0, 1)"
-        let list = vec![lit(0i64), lit(1i64)];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), Some(true), None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a in (0, 1, NULL)"
-        let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a not in (0, 1, NULL)"
-        let list = vec![lit(0i64), lit(1i64), lit(ScalarValue::Null)];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn in_list_float64() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]);
-        let a = Float64Array::from(vec![
-            Some(0.0),
-            Some(0.2),
-            None,
-            Some(f64::NAN),
-            Some(-f64::NAN),
-        ]);
-        let col_a = col("a", &schema)?;
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
-
-        // expression: "a in (0.0, 0.1)"
-        let list = vec![lit(0.0f64), lit(0.1f64)];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), Some(false), None, Some(false), Some(false)],
-            Arc::clone(&col_a),
-            &schema
-        );
-
-        // expression: "a not in (0.0, 0.1)"
-        let list = vec![lit(0.0f64), lit(0.1f64)];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), Some(true), None, Some(true), Some(true)],
+            vec![Some(false), Some(true), None, Some(true), Some(true)],
             Arc::clone(&col_a),
             &schema
         );
@@ -870,236 +1593,368 @@ mod tests {
         Ok(())
     }
 
+    macro_rules! test_nullable {
+        ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{
+            let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?;
+            let expr = in_list(cast_expr, cast_list_exprs, &false, $SCHEMA).unwrap();
+            let result = expr.nullable($SCHEMA)?;
+            assert_eq!($EXPECTED, result);
+        }};
+    }
+
     #[test]
-    fn in_list_date64() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]);
-        let a = Date64Array::from(vec![Some(0), Some(2), None]);
-        let col_a = col("a", &schema)?;
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+    fn in_list_nullable() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("c1_nullable", DataType::Int64, true),
+            Field::new("c2_non_nullable", DataType::Int64, false),
+        ]);
 
-        // expression: "a in (0, 1)"
-        let list = vec![
-            lit(ScalarValue::Date64(Some(0))),
-            lit(ScalarValue::Date64(Some(1))),
-        ];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), Some(false), None],
-            Arc::clone(&col_a),
-            &schema
-        );
+        let c1_nullable = col("c1_nullable", &schema)?;
+        let c2_non_nullable = col("c2_non_nullable", &schema)?;
 
-        // expression: "a not in (0, 1)"
-        let list = vec![
-            lit(ScalarValue::Date64(Some(0))),
-            lit(ScalarValue::Date64(Some(1))),
-        ];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), Some(true), None],
-            Arc::clone(&col_a),
-            &schema
-        );
+        // static_filter has no nulls
+        let list = vec![lit(1_i64), lit(2_i64)];
+        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
+        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, false);
 
-        // expression: "a in (0, 1, NULL)"
-        let list = vec![
-            lit(ScalarValue::Date64(Some(0))),
-            lit(ScalarValue::Date64(Some(1))),
-            lit(ScalarValue::Null),
-        ];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
+        // static_filter has nulls
+        let list = vec![lit(1_i64), lit(2_i64), lit(ScalarValue::Null)];
+        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
+        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, true);
 
-        // expression: "a not in (0, 1, NULL)"
-        let list = vec![
-            lit(ScalarValue::Date64(Some(0))),
-            lit(ScalarValue::Date64(Some(1))),
-            lit(ScalarValue::Null),
-        ];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
+        let list = vec![Arc::clone(&c1_nullable)];
+        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, true);
+
+        let list = vec![Arc::clone(&c2_non_nullable)];
+        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
+
+        let list = vec![Arc::clone(&c2_non_nullable), Arc::clone(&c2_non_nullable)];
+        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, false);
 
         Ok(())
     }
 
     #[test]
-    fn in_list_date32() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Date32, true)]);
-        let a = Date32Array::from(vec![Some(0), Some(2), None]);
-        let col_a = col("a", &schema)?;
+    fn in_list_no_cols() -> Result<()> {
+        // test logic when the in_list expression doesn't have any columns
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let a = Int32Array::from(vec![Some(1), Some(2), None]);
         let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
 
-        // expression: "a in (0, 1)"
-        let list = vec![
-            lit(ScalarValue::Date32(Some(0))),
-            lit(ScalarValue::Date32(Some(1))),
-        ];
+        let list = vec![lit(ScalarValue::from(1i32)), lit(ScalarValue::from(6i32))];
+
+        // 1 IN (1, 6)
+        let expr = lit(ScalarValue::Int32(Some(1)));
         in_list!(
             batch,
-            list,
+            list.clone(),
             &false,
-            vec![Some(true), Some(false), None],
-            Arc::clone(&col_a),
+            // should have three outputs, as the input batch has three rows
+            vec![Some(true), Some(true), Some(true)],
+            expr,
             &schema
         );
 
-        // expression: "a not in (0, 1)"
-        let list = vec![
-            lit(ScalarValue::Date32(Some(0))),
-            lit(ScalarValue::Date32(Some(1))),
-        ];
+        // 2 IN (1, 6)
+        let expr = lit(ScalarValue::Int32(Some(2)));
         in_list!(
             batch,
-            list,
-            &true,
-            vec![Some(false), Some(true), None],
-            Arc::clone(&col_a),
+            list.clone(),
+            &false,
+            // should have three outputs, as the input batch has three rows
+            vec![Some(false), Some(false), Some(false)],
+            expr,
             &schema
         );
 
-        // expression: "a in (0, 1, NULL)"
-        let list = vec![
-            lit(ScalarValue::Date32(Some(0))),
-            lit(ScalarValue::Date32(Some(1))),
-            lit(ScalarValue::Null),
-        ];
+        // NULL IN (1, 6)
+        let expr = lit(ScalarValue::Int32(None));
         in_list!(
             batch,
-            list,
+            list.clone(),
             &false,
-            vec![Some(true), None, None],
-            Arc::clone(&col_a),
+            // should have three outputs, as the input batch has three rows
+            vec![None, None, None],
+            expr,
             &schema
         );
 
-        // expression: "a not in (0, 1, NULL)"
-        let list = vec![
-            lit(ScalarValue::Date32(Some(0))),
-            lit(ScalarValue::Date32(Some(1))),
-            lit(ScalarValue::Null),
+        Ok(())
+    }
+
+    #[test]
+    fn in_list_utf8_with_dict_types() -> Result<()> {
+        fn dict_lit(key_type: DataType, value: &str) -> Arc<dyn PhysicalExpr> {
+            lit(ScalarValue::Dictionary(
+                Box::new(key_type),
+                Box::new(ScalarValue::new_utf8(value.to_string())),
+            ))
+        }
+
+        fn null_dict_lit(key_type: DataType) -> Arc<dyn PhysicalExpr> {
+            lit(ScalarValue::Dictionary(
+                Box::new(key_type),
+                Box::new(ScalarValue::Utf8(None)),
+            ))
+        }
+
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
+            true,
+        )]);
+        let a: UInt16DictionaryArray =
+            vec![Some("a"), Some("d"), None].into_iter().collect();
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+
+        // expression: "a in ("a", "b")"
+        let lists = [
+            vec![lit("a"), lit("b")],
+            vec![
+                dict_lit(DataType::Int8, "a"),
+                dict_lit(DataType::UInt16, "b"),
+            ],
         ];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, None],
-            Arc::clone(&col_a),
-            &schema
-        );
+        for list in lists.iter() {
+            in_list_raw!(
+                batch,
+                list.clone(),
+                &false,
+                vec![Some(true), Some(false), None],
+                Arc::clone(&col_a),
+                &schema
+            );
+        }
+
+        // expression: "a not in ("a", "b")"
+        for list in lists.iter() {
+            in_list_raw!(
+                batch,
+                list.clone(),
+                &true,
+                vec![Some(false), Some(true), None],
+                Arc::clone(&col_a),
+                &schema
+            );
+        }
+
+        // expression: "a in ("a", "b", null)"
+        let lists = [
+            vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))],
+            vec![
+                dict_lit(DataType::Int8, "a"),
+                dict_lit(DataType::UInt16, "b"),
+                null_dict_lit(DataType::UInt16),
+            ],
+        ];
+        for list in lists.iter() {
+            in_list_raw!(
+                batch,
+                list.clone(),
+                &false,
+                vec![Some(true), None, None],
+                Arc::clone(&col_a),
+                &schema
+            );
+        }
+
+        // expression: "a not in ("a", "b", null)"
+        for list in lists.iter() {
+            in_list_raw!(
+                batch,
+                list.clone(),
+                &true,
+                vec![Some(false), None, None],
+                Arc::clone(&col_a),
+                &schema
+            );
+        }
 
         Ok(())
     }
 
     #[test]
-    fn in_list_decimal() -> Result<()> {
-        // Now, we can check the NULL type
-        let schema =
-            Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]);
-        let array = vec![Some(100_0000_i128), None, Some(200_5000_i128)]
-            .into_iter()
-            .collect::<Decimal128Array>();
-        let array = array.with_precision_and_scale(13, 4).unwrap();
+    fn test_fmt_sql_1() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let col_a = col("a", &schema)?;
+
+        // Test: a IN ('a', 'b')
+        let list = vec![lit("a"), lit("b")];
+        let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?;
+        let sql_string = fmt_sql(expr.as_ref()).to_string();
+        let display_string = expr.to_string();
+        assert_snapshot!(sql_string, @"a IN (a, b)");
+        assert_snapshot!(display_string, @"a@0 IN (SET) ([a, b])");
+        Ok(())
+    }
+
+    #[test]
+    fn test_fmt_sql_2() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let col_a = col("a", &schema)?;
+
+        // Test: a NOT IN ('a', 'b')
+        let list = vec![lit("a"), lit("b")];
+        let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?;
+        let sql_string = fmt_sql(expr.as_ref()).to_string();
+        let display_string = expr.to_string();
+
+        assert_snapshot!(sql_string, @"a NOT IN (a, b)");
+        assert_snapshot!(display_string, @"a@0 NOT IN (SET) ([a, b])");
+        Ok(())
+    }
+
+    #[test]
+    fn test_fmt_sql_3() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let col_a = col("a", &schema)?;
+        // Test: a IN ('a', 'b', NULL)
+        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
+        let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?;
+        let sql_string = fmt_sql(expr.as_ref()).to_string();
+        let display_string = expr.to_string();
+
+        assert_snapshot!(sql_string, @"a IN (a, b, NULL)");
+        assert_snapshot!(display_string, @"a@0 IN (SET) ([a, b, NULL])");
+        Ok(())
+    }
+
+    #[test]
+    fn test_fmt_sql_4() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let col_a = col("a", &schema)?;
+        // Test: a NOT IN ('a', 'b', NULL)
+        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
+        let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?;
+        let sql_string = fmt_sql(expr.as_ref()).to_string();
+        let display_string = expr.to_string();
+        assert_snapshot!(sql_string, @"a NOT IN (a, b, NULL)");
+        assert_snapshot!(display_string, @"a@0 NOT IN (SET) ([a, b, NULL])");
+        Ok(())
+    }
+
+    #[test]
+    fn in_list_struct() -> Result<()> {
+        // Create schema with a struct column
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, false),
+        ]);
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Struct(struct_fields.clone()),
+            true,
+        )]);
+
+        // Create test data: array of structs
+        let x_array = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+        let struct_array =
+            StructArray::new(struct_fields.clone(), vec![x_array, y_array], None);
+
         let col_a = col("a", &schema)?;
         let batch =
-            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?;
+            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?;
 
-        // expression: "a in (100,200), the data type of list is INT32
-        let list = vec![lit(100i32), lit(200i32)];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), None, Some(false)],
-            Arc::clone(&col_a),
-            &schema
-        );
-        // expression: "a not in (100,200)
-        let list = vec![lit(100i32), lit(200i32)];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(false), None, Some(true)],
-            Arc::clone(&col_a),
-            &schema
-        );
+        // Create literal structs for the IN list
+        // Struct {x: 1, y: "a"}
+        let struct1 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["a"])),
+            ],
+            None,
+        )));
 
-        // expression: "a in (200,NULL), the data type of list is INT32 AND NULL
-        let list = vec![lit(ScalarValue::Int32(Some(100))), lit(ScalarValue::Null)];
-        in_list!(
+        // Struct {x: 3, y: "c"}
+        let struct3 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![3])),
+                Arc::new(StringArray::from(vec!["c"])),
+            ],
+            None,
+        )));
+
+        // Test: a IN ({1, "a"}, {3, "c"})
+        let list = vec![lit(struct1.clone()), lit(struct3.clone())];
+        in_list_raw!(
             batch,
             list.clone(),
             &false,
-            vec![Some(true), None, None],
+            vec![Some(true), Some(false), Some(true)],
             Arc::clone(&col_a),
             &schema
         );
-        // expression: "a not in (200,NULL), the data type of list is INT32 AND NULL
-        in_list!(
+
+        // Test: a NOT IN ({1, "a"}, {3, "c"})
+        in_list_raw!(
             batch,
             list,
             &true,
-            vec![Some(false), None, None],
+            vec![Some(false), Some(true), Some(false)],
             Arc::clone(&col_a),
             &schema
         );
 
-        // expression: "a in (200.5, 100), the data type of list is FLOAT32 and INT32
-        let list = vec![lit(200.50f32), lit(100i32)];
-        in_list!(
-            batch,
-            list,
-            &false,
-            vec![Some(true), None, Some(true)],
-            Arc::clone(&col_a),
-            &schema
-        );
+        Ok(())
+    }
 
-        // expression: "a not in (200.5, 100), the data type of list is FLOAT32 and INT32
-        let list = vec![lit(200.50f32), lit(101i32)];
-        in_list!(
-            batch,
-            list,
-            &true,
-            vec![Some(true), None, Some(false)],
-            Arc::clone(&col_a),
-            &schema
+    #[test]
+    fn in_list_struct_with_nulls() -> Result<()> {
+        // Create schema with a struct column
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, false),
+        ]);
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Struct(struct_fields.clone()),
+            true,
+        )]);
+
+        // Create test data with a null struct
+        let x_array = Arc::new(Int32Array::from(vec![1, 2]));
+        let y_array = Arc::new(StringArray::from(vec!["a", "b"]));
+        let struct_array = StructArray::new(
+            struct_fields.clone(),
+            vec![x_array, y_array],
+            Some(NullBuffer::from(vec![true, false])),
         );
 
-        // test the optimization: set
-        // expression: "a in (99..300), the data type of list is INT32
-        let list = (99i32..300).map(lit).collect::<Vec<_>>();
+        let col_a = col("a", &schema)?;
+        let batch =
+            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?;
+
+        // Create literal struct for the IN list
+        let struct1 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["a"])),
+            ],
+            None,
+        )));
 
-        in_list!(
+        // Test: a IN ({1, "a"})
+        let list = vec![lit(struct1.clone())];
+        in_list_raw!(
             batch,
             list.clone(),
             &false,
-            vec![Some(true), None, Some(false)],
+            vec![Some(true), None],
             Arc::clone(&col_a),
             &schema
         );
 
-        in_list!(
+        // Test: a NOT IN ({1, "a"})
+        in_list_raw!(
             batch,
             list,
             &true,
-            vec![Some(false), None, Some(true)],
+            vec![Some(false), None],
             Arc::clone(&col_a),
             &schema
         );
@@ -1108,240 +1963,334 @@ mod tests {
     }
 
     #[test]
-    fn test_cast_static_filter_to_set() -> Result<()> {
-        // random schema
-        let schema =
-            Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]);
-
-        // list of phy expr
-        let mut phy_exprs = vec![
-            lit(1i64),
-            expressions::cast(lit(2i32), &schema, DataType::Int64)?,
-            try_cast(lit(3.13f32), &schema, DataType::Int64)?,
-        ];
-        let result = try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap();
-
-        let array = Int64Array::from(vec![1, 2, 3, 4]);
-        let r = result.contains(&array, false).unwrap();
-        assert_eq!(r, BooleanArray::from(vec![true, true, true, false]));
-
-        try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap();
-        // cast(cast(lit())), but the cast to the same data type, one case will be ignored
-        phy_exprs.push(expressions::cast(
-            expressions::cast(lit(2i32), &schema, DataType::Int64)?,
-            &schema,
-            DataType::Int64,
-        )?);
-        try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap();
-
-        phy_exprs.clear();
-
-        // case(cast(lit())), the cast to the diff data type
-        phy_exprs.push(expressions::cast(
-            expressions::cast(lit(2i32), &schema, DataType::Int64)?,
-            &schema,
-            DataType::Int32,
-        )?);
-        try_cast_static_filter_to_set(&phy_exprs, &schema).unwrap();
-
-        // column
-        phy_exprs.push(col("a", &schema)?);
-        assert!(try_cast_static_filter_to_set(&phy_exprs, &schema).is_err());
-
-        Ok(())
-    }
-
-    #[test]
-    fn in_list_timestamp() -> Result<()> {
+    fn in_list_struct_with_null_in_list() -> Result<()> {
+        // Create schema with a struct column
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, false),
+        ]);
         let schema = Schema::new(vec![Field::new(
             "a",
-            DataType::Timestamp(TimeUnit::Microsecond, None),
+            DataType::Struct(struct_fields.clone()),
             true,
         )]);
-        let a = TimestampMicrosecondArray::from(vec![
-            Some(1388588401000000000),
-            Some(1288588501000000000),
-            None,
-        ]);
+
+        // Create test data
+        let x_array = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+        let struct_array =
+            StructArray::new(struct_fields.clone(), vec![x_array, y_array], None);
+
         let col_a = col("a", &schema)?;
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+        let batch =
+            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?;
 
-        let list = vec![
-            lit(ScalarValue::TimestampMicrosecond(
-                Some(1388588401000000000),
-                None,
-            )),
-            lit(ScalarValue::TimestampMicrosecond(
-                Some(1388588401000000001),
-                None,
-            )),
-            lit(ScalarValue::TimestampMicrosecond(
-                Some(1388588401000000002),
-                None,
-            )),
-        ];
+        // Create literal structs including a NULL
+        let struct1 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["a"])),
+            ],
+            None,
+        )));
 
-        in_list!(
+        let null_struct = ScalarValue::Struct(Arc::new(StructArray::new_null(
+            struct_fields.clone(),
+            1,
+        )));
+
+        // Test: a IN ({1, "a"}, NULL)
+        let list = vec![lit(struct1), lit(null_struct.clone())];
+        in_list_raw!(
             batch,
             list.clone(),
             &false,
-            vec![Some(true), Some(false), None],
+            vec![Some(true), None, None],
             Arc::clone(&col_a),
             &schema
         );
 
-        in_list!(
+        // Test: a NOT IN ({1, "a"}, NULL)
+        in_list_raw!(
             batch,
-            list.clone(),
+            list,
             &true,
-            vec![Some(false), Some(true), None],
+            vec![Some(false), None, None],
             Arc::clone(&col_a),
             &schema
         );
+
         Ok(())
     }
 
     #[test]
-    fn in_expr_with_multiple_element_in_list() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("a", DataType::Float64, true),
-            Field::new("b", DataType::Float64, true),
-            Field::new("c", DataType::Float64, true),
-        ]);
-        let a = Float64Array::from(vec![
-            Some(0.0),
-            Some(1.0),
-            Some(2.0),
-            Some(f64::NAN),
-            Some(-f64::NAN),
+    fn in_list_nested_struct() -> Result<()> {
+        // Create nested struct schema
+        let inner_struct_fields = Fields::from(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false),
         ]);
-        let b = Float64Array::from(vec![
-            Some(8.0),
-            Some(1.0),
-            Some(5.0),
-            Some(f64::NAN),
-            Some(3.0),
+        let outer_struct_fields = Fields::from(vec![
+            Field::new(
+                "inner",
+                DataType::Struct(inner_struct_fields.clone()),
+                false,
+            ),
+            Field::new("c", DataType::Int32, false),
         ]);
-        let c = Float64Array::from(vec![
-            Some(6.0),
-            Some(7.0),
+        let schema = Schema::new(vec![Field::new(
+            "x",
+            DataType::Struct(outer_struct_fields.clone()),
+            true,
+        )]);
+
+        // Create test data with nested structs
+        let inner1 = Arc::new(StructArray::new(
+            inner_struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2])),
+                Arc::new(StringArray::from(vec!["x", "y"])),
+            ],
             None,
-            Some(5.0),
-            Some(-f64::NAN),
-        ]);
-        let col_a = col("a", &schema)?;
-        let col_b = col("b", &schema)?;
-        let col_c = col("c", &schema)?;
-        let batch = RecordBatch::try_new(
-            Arc::new(schema.clone()),
-            vec![Arc::new(a), Arc::new(b), Arc::new(c)],
-        )?;
+        ));
+        let c_array = Arc::new(Int32Array::from(vec![10, 20]));
+        let outer_array =
+            StructArray::new(outer_struct_fields.clone(), vec![inner1, c_array], None);
 
-        let list = vec![Arc::clone(&col_b), Arc::clone(&col_c)];
-        in_list!(
+        let col_x = col("x", &schema)?;
+        let batch =
+            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(outer_array)])?;
+
+        // Create a nested struct literal matching the first row
+        let inner_match = Arc::new(StructArray::new(
+            inner_struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["x"])),
+            ],
+            None,
+        ));
+        let outer_match = ScalarValue::Struct(Arc::new(StructArray::new(
+            outer_struct_fields.clone(),
+            vec![inner_match, Arc::new(Int32Array::from(vec![10]))],
+            None,
+        )));
+
+        // Test: x IN ({{1, "x"}, 10})
+        let list = vec![lit(outer_match)];
+        in_list_raw!(
             batch,
             list.clone(),
             &false,
-            vec![Some(false), Some(true), None, Some(true), Some(true)],
-            Arc::clone(&col_a),
+            vec![Some(true), Some(false)],
+            Arc::clone(&col_x),
             &schema
         );
 
-        in_list!(
+        // Test: x NOT IN ({{1, "x"}, 10})
+        in_list_raw!(
             batch,
             list,
             &true,
-            vec![Some(true), Some(false), None, Some(false), Some(false)],
-            Arc::clone(&col_a),
+            vec![Some(false), Some(true)],
+            Arc::clone(&col_x),
             &schema
         );
 
         Ok(())
     }
 
-    macro_rules! test_nullable {
-        ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{
-            let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?;
-            let expr = in_list(cast_expr, cast_list_exprs, &false, $SCHEMA).unwrap();
-            let result = expr.nullable($SCHEMA)?;
-            assert_eq!($EXPECTED, result);
-        }};
-    }
-
     #[test]
-    fn in_list_nullable() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1_nullable", DataType::Int64, true),
-            Field::new("c2_non_nullable", DataType::Int64, false),
+    fn in_list_struct_with_exprs_not_array() -> Result<()> {
+        // Test InList using expressions (not the array constructor) with structs
+        // By using InListExpr::new directly, we bypass the array optimization
+        // and use the Exprs variant, testing the expression evaluation path
+
+        // Create schema with a struct column {x: Int32, y: Utf8}
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, false),
         ]);
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            DataType::Struct(struct_fields.clone()),
+            true,
+        )]);
 
-        let c1_nullable = col("c1_nullable", &schema)?;
-        let c2_non_nullable = col("c2_non_nullable", &schema)?;
+        // Create test data: array of structs [{1, "a"}, {2, "b"}, {3, "c"}]
+        let x_array = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let y_array = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+        let struct_array =
+            StructArray::new(struct_fields.clone(), vec![x_array, y_array], None);
 
-        // static_filter has no nulls
-        let list = vec![lit(1_i64), lit(2_i64)];
-        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
-        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, false);
+        let col_a = col("a", &schema)?;
+        let batch =
+            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(struct_array)])?;
 
-        // static_filter has nulls
-        let list = vec![lit(1_i64), lit(2_i64), lit(ScalarValue::Null)];
-        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
-        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, true);
+        // Create struct literals with the SAME shape (so types are compatible)
+        // Struct {x: 1, y: "a"}
+        let struct1 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1])),
+                Arc::new(StringArray::from(vec!["a"])),
+            ],
+            None,
+        )));
 
-        let list = vec![Arc::clone(&c1_nullable)];
-        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, true);
+        // Struct {x: 3, y: "c"}
+        let struct3 = ScalarValue::Struct(Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![3])),
+                Arc::new(StringArray::from(vec!["c"])),
+            ],
+            None,
+        )));
 
-        let list = vec![Arc::clone(&c2_non_nullable)];
-        test_nullable!(Arc::clone(&c1_nullable), list.clone(), &schema, true);
+        // Create list of struct expressions
+        let list = vec![lit(struct1), lit(struct3)];
 
-        let list = vec![Arc::clone(&c2_non_nullable), Arc::clone(&c2_non_nullable)];
-        test_nullable!(Arc::clone(&c2_non_nullable), list.clone(), &schema, false);
+        // Use InListExpr::new directly (not in_list()) to bypass array optimization
+        // This creates an InList without a static filter
+        let expr = Arc::new(InListExpr::new(Arc::clone(&col_a), list, false, None));
+
+        // Verify that the expression doesn't have a static filter
+        // by checking the display string does NOT contain "(SET)"
+        let display_string = expr.to_string();
+        assert!(
+            !display_string.contains("(SET)"),
+            "Expected display string to NOT contain '(SET)' (should use Exprs variant), but got: {display_string}",
+        );
+
+        // Evaluate the expression
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+
+        // Expected: first row {1, "a"} matches struct1,
+        //           second row {2, "b"} doesn't match,
+        //           third row {3, "c"} matches struct3
+        let expected = BooleanArray::from(vec![Some(true), Some(false), Some(true)]);
+        assert_eq!(result, &expected);
+
+        // Test NOT IN as well
+        let expr_not = Arc::new(InListExpr::new(
+            Arc::clone(&col_a),
+            vec![
+                lit(ScalarValue::Struct(Arc::new(StructArray::new(
+                    struct_fields.clone(),
+                    vec![
+                        Arc::new(Int32Array::from(vec![1])),
+                        Arc::new(StringArray::from(vec!["a"])),
+                    ],
+                    None,
+                )))),
+                lit(ScalarValue::Struct(Arc::new(StructArray::new(
+                    struct_fields.clone(),
+                    vec![
+                        Arc::new(Int32Array::from(vec![3])),
+                        Arc::new(StringArray::from(vec!["c"])),
+                    ],
+                    None,
+                )))),
+            ],
+            true,
+            None,
+        ));
+
+        let result_not = expr_not.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result_not = as_boolean_array(&result_not);
+
+        let expected_not = BooleanArray::from(vec![Some(false), Some(true), Some(false)]);
+        assert_eq!(result_not, &expected_not);
 
         Ok(())
     }
 
     #[test]
-    fn in_list_no_cols() -> Result<()> {
-        // test logic when the in_list expression doesn't have any columns
-        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
-        let a = Int32Array::from(vec![Some(1), Some(2), None]);
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+    fn test_in_list_null_handling_comprehensive() -> Result<()> {
+        // Comprehensive test demonstrating SQL three-valued logic for IN expressions
+        // This test explicitly shows all possible outcomes: true, false, and null
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
 
-        let list = vec![lit(ScalarValue::from(1i32)), lit(ScalarValue::from(6i32))];
+        // Test data: [1, 2, 3, null]
+        // - 1 will match in both lists
+        // - 2 will not match in either list
+        // - 3 will not match in either list
+        // - null is always null
+        let a = Int64Array::from(vec![Some(1), Some(2), Some(3), None]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
 
-        // 1 IN (1, 6)
-        let expr = lit(ScalarValue::Int32(Some(1)));
+        // Case 1: List WITHOUT null - demonstrates true/false/null outcomes
+        // "a IN (1, 4)" - 1 matches, 2 and 3 don't match, null is null
+        let list = vec![lit(1i64), lit(4i64)];
         in_list!(
             batch,
-            list.clone(),
+            list,
             &false,
-            // should have three outputs, as the input batch has three rows
-            vec![Some(true), Some(true), Some(true)],
-            expr,
+            vec![
+                Some(true),  // 1 is in the list → true
+                Some(false), // 2 is not in the list → false
+                Some(false), // 3 is not in the list → false
+                None,        // null IN (...) → null (SQL three-valued logic)
+            ],
+            Arc::clone(&col_a),
             &schema
         );
 
-        // 2 IN (1, 6)
-        let expr = lit(ScalarValue::Int32(Some(2)));
+        // Case 2: List WITH null - demonstrates null propagation for non-matches
+        // "a IN (1, NULL)" - 1 matches (true), 2/3 don't match but list has null (null), null is null
+        let list = vec![lit(1i64), lit(ScalarValue::Int64(None))];
         in_list!(
             batch,
-            list.clone(),
+            list,
             &false,
-            // should have three outputs, as the input batch has three rows
-            vec![Some(false), Some(false), Some(false)],
-            expr,
+            vec![
+                Some(true), // 1 is in the list → true (found match)
+                None, // 2 is not in list, but list has NULL → null (might match NULL)
+                None, // 3 is not in list, but list has NULL → null (might match NULL)
+                None, // null IN (...) → null (SQL three-valued logic)
+            ],
+            Arc::clone(&col_a),
             &schema
         );
 
-        // NULL IN (1, 6)
-        let expr = lit(ScalarValue::Int32(None));
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_only_nulls() -> Result<()> {
+        // Edge case: IN list contains ONLY null values
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let a = Int64Array::from(vec![Some(1), Some(2), None]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+
+        // "a IN (NULL, NULL)" - list has only nulls
+        let list = vec![lit(ScalarValue::Int64(None)), lit(ScalarValue::Int64(None))];
+
+        // All results should be NULL because:
+        // - Non-null values (1, 2) can't match anything concrete, but list might contain matching value
+        // - NULL value is always NULL in IN expressions
         in_list!(
             batch,
             list.clone(),
             &false,
-            // should have three outputs, as the input batch has three rows
             vec![None, None, None],
-            expr,
+            Arc::clone(&col_a),
+            &schema
+        );
+
+        // "a NOT IN (NULL, NULL)" - list has only nulls
+        // All results should still be NULL due to three-valued logic
+        in_list!(
+            batch,
+            list,
+            &true,
+            vec![None, None, None],
+            Arc::clone(&col_a),
             &schema
         );
 
@@ -1349,133 +2298,1790 @@ mod tests {
     }
 
     #[test]
-    fn in_list_utf8_with_dict_types() -> Result<()> {
-        fn dict_lit(key_type: DataType, value: &str) -> Arc<dyn PhysicalExpr> {
-            lit(ScalarValue::Dictionary(
-                Box::new(key_type),
-                Box::new(ScalarValue::new_utf8(value.to_string())),
-            ))
-        }
+    fn test_in_list_multiple_nulls_deduplication() -> Result<()> {
+        // Test that multiple NULLs in the list are handled correctly
+        // This verifies deduplication doesn't break null handling
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let col_a = col("a", &schema)?;
 
-        fn null_dict_lit(key_type: DataType) -> Arc<dyn PhysicalExpr> {
-            lit(ScalarValue::Dictionary(
-                Box::new(key_type),
-                Box::new(ScalarValue::Utf8(None)),
-            ))
-        }
+        // Create array with multiple nulls: [1, 2, NULL, NULL, 3, NULL]
+        let array = Arc::new(Int64Array::from(vec![
+            Some(1),
+            Some(2),
+            None,
+            None,
+            Some(3),
+            None,
+        ])) as ArrayRef;
 
-        let schema = Schema::new(vec![Field::new(
-            "a",
-            DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
-            true,
-        )]);
-        let a: UInt16DictionaryArray =
-            vec![Some("a"), Some("d"), None].into_iter().collect();
+        // Create InListExpr from array
+        let expr = Arc::new(InListExpr::try_new_from_array(
+            Arc::clone(&col_a),
+            array,
+            false,
+        )?) as Arc<dyn PhysicalExpr>;
+
+        // Create test data: [1, 2, 3, 4, null]
+        let a = Int64Array::from(vec![Some(1), Some(2), Some(3), Some(4), None]);
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+
+        // Evaluate the expression
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+
+        // Expected behavior with multiple NULLs in list:
+        // - Values in the list (1,2,3) → true
+        // - Values not in the list (4) → NULL (because list contains NULL)
+        // - NULL input → NULL
+        let expected = BooleanArray::from(vec![
+            Some(true), // 1 is in list
+            Some(true), // 2 is in list
+            Some(true), // 3 is in list
+            None,       // 4 not in list, but list has NULLs
+            None,       // NULL input
+        ]);
+        assert_eq!(result, &expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_in_null_handling_comprehensive() -> Result<()> {
+        // Comprehensive test demonstrating SQL three-valued logic for NOT IN expressions
+        // This test explicitly shows all possible outcomes for NOT IN: true, false, and null
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+
+        // Test data: [1, 2, 3, null]
+        let a = Int64Array::from(vec![Some(1), Some(2), Some(3), None]);
         let col_a = col("a", &schema)?;
         let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
 
-        // expression: "a in ("a", "b")"
-        let lists = [
-            vec![lit("a"), lit("b")],
+        // Case 1: List WITHOUT null - demonstrates true/false/null outcomes for NOT IN
+        // "a NOT IN (1, 4)" - 1 matches (false), 2 and 3 don't match (true), null is null
+        let list = vec![lit(1i64), lit(4i64)];
+        in_list!(
+            batch,
+            list,
+            &true,
             vec![
-                dict_lit(DataType::Int8, "a"),
-                dict_lit(DataType::UInt16, "b"),
+                Some(false), // 1 is in the list → NOT IN returns false
+                Some(true),  // 2 is not in the list → NOT IN returns true
+                Some(true),  // 3 is not in the list → NOT IN returns true
+                None,        // null NOT IN (...) → null (SQL three-valued logic)
             ],
-        ];
-        for list in lists.iter() {
-            in_list_raw!(
-                batch,
-                list.clone(),
-                &false,
-                vec![Some(true), Some(false), None],
-                Arc::clone(&col_a),
-                &schema
-            );
-        }
-
-        // expression: "a not in ("a", "b")"
-        for list in lists.iter() {
-            in_list_raw!(
-                batch,
-                list.clone(),
-                &true,
-                vec![Some(false), Some(true), None],
-                Arc::clone(&col_a),
-                &schema
-            );
-        }
+            Arc::clone(&col_a),
+            &schema
+        );
 
-        // expression: "a in ("a", "b", null)"
-        let lists = [
-            vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))],
+        // Case 2: List WITH null - demonstrates null propagation for NOT IN
+        // "a NOT IN (1, NULL)" - 1 matches (false), 2/3 don't match but list has null (null), null is null
+        let list = vec![lit(1i64), lit(ScalarValue::Int64(None))];
+        in_list!(
+            batch,
+            list,
+            &true,
             vec![
-                dict_lit(DataType::Int8, "a"),
-                dict_lit(DataType::UInt16, "b"),
-                null_dict_lit(DataType::UInt16),
+                Some(false), // 1 is in the list → NOT IN returns false
+                None, // 2 is not in known values, but list has NULL → null (can't prove it's not in list)
+                None, // 3 is not in known values, but list has NULL → null (can't prove it's not in list)
+                None, // null NOT IN (...) → null (SQL three-valued logic)
             ],
-        ];
-        for list in lists.iter() {
-            in_list_raw!(
-                batch,
-                list.clone(),
-                &false,
-                vec![Some(true), None, None],
-                Arc::clone(&col_a),
-                &schema
-            );
-        }
-
-        // expression: "a not in ("a", "b", null)"
-        for list in lists.iter() {
-            in_list_raw!(
-                batch,
-                list.clone(),
-                &true,
-                vec![Some(false), None, None],
-                Arc::clone(&col_a),
-                &schema
-            );
-        }
+            Arc::clone(&col_a),
+            &schema
+        );
 
         Ok(())
     }
 
     #[test]
-    fn test_fmt_sql() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+    fn test_in_list_null_type_column() -> Result<()> {
+        // Test with a column that has DataType::Null (not just nullable values)
+        // All values in a NullArray are null by definition
+        let schema = Schema::new(vec![Field::new("a", DataType::Null, true)]);
+        let a = NullArray::new(3);
         let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
 
-        // Test: a IN ('a', 'b')
-        let list = vec![lit("a"), lit("b")];
-        let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?;
-        let sql_string = fmt_sql(expr.as_ref()).to_string();
-        let display_string = expr.to_string();
-        assert_eq!(sql_string, "a IN (a, b)");
-        assert_eq!(display_string, "Use a@0 IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }])");
+        // "null_column IN (1, 2)" - comparing Null type against Int64 list
+        // Note: This tests type coercion behavior between Null and Int64
+        let list = vec![lit(1i64), lit(2i64)];
 
-        // Test: a NOT IN ('a', 'b')
-        let list = vec![lit("a"), lit("b")];
-        let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?;
-        let sql_string = fmt_sql(expr.as_ref()).to_string();
-        let display_string = expr.to_string();
-        assert_eq!(sql_string, "a NOT IN (a, b)");
-        assert_eq!(display_string, "a@0 NOT IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }])");
+        // All results should be NULL because:
+        // - Every value in the column is null (DataType::Null)
+        // - null IN (anything) always returns null per SQL three-valued logic
+        in_list!(
+            batch,
+            list.clone(),
+            &false,
+            vec![None, None, None],
+            Arc::clone(&col_a),
+            &schema
+        );
 
-        // Test: a IN ('a', 'b', NULL)
-        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
-        let expr = in_list(Arc::clone(&col_a), list, &false, &schema)?;
-        let sql_string = fmt_sql(expr.as_ref()).to_string();
-        let display_string = expr.to_string();
-        assert_eq!(sql_string, "a IN (a, b, NULL)");
-        assert_eq!(display_string, "Use a@0 IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }, Literal { value: Utf8(NULL) }])");
+        // "null_column NOT IN (1, 2)"
+        // Same behavior for NOT IN - null NOT IN (anything) is still null
+        in_list!(
+            batch,
+            list,
+            &true,
+            vec![None, None, None],
+            Arc::clone(&col_a),
+            &schema
+        );
 
-        // Test: a NOT IN ('a', 'b', NULL)
-        let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))];
-        let expr = in_list(Arc::clone(&col_a), list, &true, &schema)?;
-        let sql_string = fmt_sql(expr.as_ref()).to_string();
-        let display_string = expr.to_string();
-        assert_eq!(sql_string, "a NOT IN (a, b, NULL)");
-        assert_eq!(display_string, "a@0 NOT IN (SET) ([Literal { value: Utf8(\"a\") }, Literal { value: Utf8(\"b\") }, Literal { value: Utf8(NULL) }])");
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_null_type_list() -> Result<()> {
+        // Test with a list that has DataType::Null
+        let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
+        let a = Int64Array::from(vec![Some(1), Some(2), None]);
+        let col_a = col("a", &schema)?;
+
+        // Create a NullArray as the list
+        let null_array = Arc::new(NullArray::new(2)) as ArrayRef;
+
+        // Try to create InListExpr with a NullArray list
+        // This tests whether try_new_from_array can handle Null type arrays
+        let expr = Arc::new(InListExpr::try_new_from_array(
+            Arc::clone(&col_a),
+            null_array,
+            false,
+        )?) as Arc<dyn PhysicalExpr>;
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+
+        // If it succeeds, all results should be NULL
+        // because the list contains only null type values
+        let expected = BooleanArray::from(vec![None, None, None]);
+        assert_eq!(result, &expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_null_type_both() -> Result<()> {
+        // Test when both column and list are DataType::Null
+        let schema = Schema::new(vec![Field::new("a", DataType::Null, true)]);
+        let a = NullArray::new(3);
+        let col_a = col("a", &schema)?;
+
+        // Create a NullArray as the list
+        let null_array = Arc::new(NullArray::new(2)) as ArrayRef;
+
+        // Try to create InListExpr with both Null types
+        let expr = Arc::new(InListExpr::try_new_from_array(
+            Arc::clone(&col_a),
+            null_array,
+            false,
+        )?) as Arc<dyn PhysicalExpr>;
+
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+
+        // If successful, all results should be NULL
+        // null IN [null, null] -> null
+        let expected = BooleanArray::from(vec![None, None, None]);
+        assert_eq!(result, &expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_comprehensive_null_handling() -> Result<()> {
+        // Comprehensive test for IN LIST operations with various NULL handling scenarios.
+        // This test covers the key cases validated against DuckDB as the source of truth.
+        //
+        // Note: Some scalar literal tests (like NULL IN (1, 2)) are omitted as they
+        // appear to expose an issue with static filter optimization. These are covered
+        // by existing tests like in_list_no_cols().
+
+        let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)]));
+        let col_b = col("b", &schema)?;
+        let null_i32 = ScalarValue::Int32(None);
+
+        // Helper to create a batch
+        let make_batch = |values: Vec<Option<i32>>| -> Result<RecordBatch> {
+            let array = Arc::new(Int32Array::from(values));
+            Ok(RecordBatch::try_new(Arc::clone(&schema), vec![array])?)
+        };
+
+        // Helper to run a test
+        let run_test = |batch: &RecordBatch,
+                        expr: Arc<dyn PhysicalExpr>,
+                        list: Vec<Arc<dyn PhysicalExpr>>,
+                        expected: Vec<Option<bool>>|
+         -> Result<()> {
+            let in_expr = in_list(expr, list, &false, schema.as_ref())?;
+            let result = in_expr.evaluate(batch)?.into_array(batch.num_rows())?;
+            let result = as_boolean_array(&result);
+            assert_eq!(result, &BooleanArray::from(expected));
+            Ok(())
+        };
+
+        // ========================================================================
+        // COLUMN TESTS - col(b) IN [1, 2]
+        // ========================================================================
+
+        // [1] IN (1, 2) => [TRUE]
+        let batch = make_batch(vec![Some(1)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), lit(2i32)],
+            vec![Some(true)],
+        )?;
+
+        // [1, 2] IN (1, 2) => [TRUE, TRUE]
+        let batch = make_batch(vec![Some(1), Some(2)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), lit(2i32)],
+            vec![Some(true), Some(true)],
+        )?;
+
+        // [3, 4] IN (1, 2) => [FALSE, FALSE]
+        let batch = make_batch(vec![Some(3), Some(4)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), lit(2i32)],
+            vec![Some(false), Some(false)],
+        )?;
+
+        // [1, NULL] IN (1, 2) => [TRUE, NULL]
+        let batch = make_batch(vec![Some(1), None])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), lit(2i32)],
+            vec![Some(true), None],
+        )?;
+
+        // [3, NULL] IN (1, 2) => [FALSE, NULL] (no match, NULL is NULL)
+        let batch = make_batch(vec![Some(3), None])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), lit(2i32)],
+            vec![Some(false), None],
+        )?;
+
+        // ========================================================================
+        // COLUMN WITH NULL IN LIST - col(b) IN [NULL, 1]
+        // ========================================================================
+
+        // [1] IN (NULL, 1) => [TRUE] (found match)
+        let batch = make_batch(vec![Some(1)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(null_i32.clone()), lit(1i32)],
+            vec![Some(true)],
+        )?;
+
+        // [2] IN (NULL, 1) => [NULL] (no match, but list has NULL)
+        let batch = make_batch(vec![Some(2)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(null_i32.clone()), lit(1i32)],
+            vec![None],
+        )?;
+
+        // [NULL] IN (NULL, 1) => [NULL]
+        let batch = make_batch(vec![None])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(null_i32.clone()), lit(1i32)],
+            vec![None],
+        )?;
+
+        // ========================================================================
+        // COLUMN WITH ALL NULLS IN LIST - col(b) IN [NULL, NULL]
+        // ========================================================================
+
+        // [1] IN (NULL, NULL) => [NULL]
+        let batch = make_batch(vec![Some(1)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(null_i32.clone()), lit(null_i32.clone())],
+            vec![None],
+        )?;
+
+        // [NULL] IN (NULL, NULL) => [NULL]
+        let batch = make_batch(vec![None])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(null_i32.clone()), lit(null_i32.clone())],
+            vec![None],
+        )?;
+
+        // ========================================================================
+        // LITERAL IN LIST WITH COLUMN - lit(1) IN [2, col(b)]
+        // ========================================================================
+
+        // 1 IN (2, [1]) => [TRUE] (matches column value)
+        let batch = make_batch(vec![Some(1)])?;
+        run_test(
+            &batch,
+            lit(1i32),
+            vec![lit(2i32), Arc::clone(&col_b)],
+            vec![Some(true)],
+        )?;
+
+        // 1 IN (2, [3]) => [FALSE] (no match)
+        let batch = make_batch(vec![Some(3)])?;
+        run_test(
+            &batch,
+            lit(1i32),
+            vec![lit(2i32), Arc::clone(&col_b)],
+            vec![Some(false)],
+        )?;
+
+        // 1 IN (2, [NULL]) => [NULL] (no match, column is NULL)
+        let batch = make_batch(vec![None])?;
+        run_test(
+            &batch,
+            lit(1i32),
+            vec![lit(2i32), Arc::clone(&col_b)],
+            vec![None],
+        )?;
+
+        // ========================================================================
+        // COLUMN IN LIST CONTAINING ITSELF - col(b) IN [1, col(b)]
+        // ========================================================================
+
+        // [1] IN (1, [1]) => [TRUE] (always matches - either list literal or itself)
+        let batch = make_batch(vec![Some(1)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), Arc::clone(&col_b)],
+            vec![Some(true)],
+        )?;
+
+        // [2] IN (1, [2]) => [TRUE] (matches itself)
+        let batch = make_batch(vec![Some(2)])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), Arc::clone(&col_b)],
+            vec![Some(true)],
+        )?;
+
+        // [NULL] IN (1, [NULL]) => [NULL] (NULL is never equal to anything)
+        let batch = make_batch(vec![None])?;
+        run_test(
+            &batch,
+            Arc::clone(&col_b),
+            vec![lit(1i32), Arc::clone(&col_b)],
+            vec![None],
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_scalar_literal_cases() -> Result<()> {
+        // Test scalar literal cases (both NULL and non-NULL) to ensure SQL three-valued
+        // logic is correctly implemented. This covers the important case where a scalar
+        // value is tested against a list containing NULL.
+
+        let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)]));
+        let null_i32 = ScalarValue::Int32(None);
+
+        // Helper to create a batch
+        let make_batch = |values: Vec<Option<i32>>| -> Result<RecordBatch> {
+            let array = Arc::new(Int32Array::from(values));
+            Ok(RecordBatch::try_new(Arc::clone(&schema), vec![array])?)
+        };
+
+        // Helper to run a test
+        let run_test = |batch: &RecordBatch,
+                        expr: Arc<dyn PhysicalExpr>,
+                        list: Vec<Arc<dyn PhysicalExpr>>,
+                        negated: bool,
+                        expected: Vec<Option<bool>>|
+         -> Result<()> {
+            let in_expr = in_list(expr, list, &negated, schema.as_ref())?;
+            let result = in_expr.evaluate(batch)?.into_array(batch.num_rows())?;
+            let result = as_boolean_array(&result);
+            let expected_array = BooleanArray::from(expected);
+            assert_eq!(
+                result,
+                &expected_array,
+                "Expected {:?}, got {:?}",
+                expected_array,
+                result.iter().collect::<Vec<_>>()
+            );
+            Ok(())
+        };
+
+        let batch = make_batch(vec![Some(1)])?;
+
+        // ========================================================================
+        // NULL LITERAL TESTS
+        // According to SQL semantics, NULL IN (any_list) should always return NULL
+        // ========================================================================
+
+        // NULL IN (1, 1) => NULL
+        run_test(
+            &batch,
+            lit(null_i32.clone()),
+            vec![lit(1i32), lit(1i32)],
+            false,
+            vec![None],
+        )?;
+
+        // NULL IN (NULL, 1) => NULL
+        run_test(
+            &batch,
+            lit(null_i32.clone()),
+            vec![lit(null_i32.clone()), lit(1i32)],
+            false,
+            vec![None],
+        )?;
+
+        // NULL IN (NULL, NULL) => NULL
+        run_test(
+            &batch,
+            lit(null_i32.clone()),
+            vec![lit(null_i32.clone()), lit(null_i32.clone())],
+            false,
+            vec![None],
+        )?;
+
+        // ========================================================================
+        // NON-NULL SCALAR LITERALS WITH NULL IN LIST - Int32
+        // When a scalar value is NOT in a list containing NULL, the result is NULL
+        // When a scalar value IS in the list, the result is TRUE (NULL doesn't matter)
+        // ========================================================================
+
+        // 3 IN (0, 1, 2, NULL) => NULL (not in list, but list has NULL)
+        run_test(
+            &batch,
+            lit(3i32),
+            vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())],
+            false,
+            vec![None],
+        )?;
+
+        // 3 NOT IN (0, 1, 2, NULL) => NULL (not in list, but list has NULL)
+        run_test(
+            &batch,
+            lit(3i32),
+            vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())],
+            true,
+            vec![None],
+        )?;
+
+        // 1 IN (0, 1, 2, NULL) => TRUE (found match, NULL doesn't matter)
+        run_test(
+            &batch,
+            lit(1i32),
+            vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())],
+            false,
+            vec![Some(true)],
+        )?;
+
+        // 1 NOT IN (0, 1, 2, NULL) => FALSE (found match, NULL doesn't matter)
+        run_test(
+            &batch,
+            lit(1i32),
+            vec![lit(0i32), lit(1i32), lit(2i32), lit(null_i32.clone())],
+            true,
+            vec![Some(false)],
+        )?;
+
+        // ========================================================================
+        // NON-NULL SCALAR LITERALS WITH NULL IN LIST - String
+        // Same semantics as Int32 but with string type
+        // ========================================================================
+
+        let schema_str =
+            Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8, true)]));
+        let batch_str = RecordBatch::try_new(
+            Arc::clone(&schema_str),
+            vec![Arc::new(StringArray::from(vec![Some("dummy")]))],
+        )?;
+        let null_str = ScalarValue::Utf8(None);
+
+        let run_test_str = |expr: Arc<dyn PhysicalExpr>,
+                            list: Vec<Arc<dyn PhysicalExpr>>,
+                            negated: bool,
+                            expected: Vec<Option<bool>>|
+         -> Result<()> {
+            let in_expr = in_list(expr, list, &negated, schema_str.as_ref())?;
+            let result = in_expr
+                .evaluate(&batch_str)?
+                .into_array(batch_str.num_rows())?;
+            let result = as_boolean_array(&result);
+            let expected_array = BooleanArray::from(expected);
+            assert_eq!(
+                result,
+                &expected_array,
+                "Expected {:?}, got {:?}",
+                expected_array,
+                result.iter().collect::<Vec<_>>()
+            );
+            Ok(())
+        };
+
+        // 'c' IN ('a', 'b', NULL) => NULL (not in list, but list has NULL)
+        run_test_str(
+            lit("c"),
+            vec![lit("a"), lit("b"), lit(null_str.clone())],
+            false,
+            vec![None],
+        )?;
+
+        // 'c' NOT IN ('a', 'b', NULL) => NULL (not in list, but list has NULL)
+        run_test_str(
+            lit("c"),
+            vec![lit("a"), lit("b"), lit(null_str.clone())],
+            true,
+            vec![None],
+        )?;
+
+        // 'a' IN ('a', 'b', NULL) => TRUE (found match, NULL doesn't matter)
+        run_test_str(
+            lit("a"),
+            vec![lit("a"), lit("b"), lit(null_str.clone())],
+            false,
+            vec![Some(true)],
+        )?;
+
+        // 'a' NOT IN ('a', 'b', NULL) => FALSE (found match, NULL doesn't matter)
+        run_test_str(
+            lit("a"),
+            vec![lit("a"), lit("b"), lit(null_str.clone())],
+            true,
+            vec![Some(false)],
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_tuple_cases() -> Result<()> {
+        // Test tuple/struct cases from the original request: (lit, lit) IN (lit, lit)
+        // These test row-wise comparisons like (1, 2) IN ((1, 2), (3, 4))
+
+        let schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)]));
+
+        // Helper to create struct scalars for tuple comparisons
+        let make_struct = |v1: Option<i32>, v2: Option<i32>| -> ScalarValue {
+            let fields = Fields::from(vec![
+                Field::new("field_0", DataType::Int32, true),
+                Field::new("field_1", DataType::Int32, true),
+            ]);
+            ScalarValue::Struct(Arc::new(StructArray::new(
+                fields,
+                vec![
+                    Arc::new(Int32Array::from(vec![v1])),
+                    Arc::new(Int32Array::from(vec![v2])),
+                ],
+                None,
+            )))
+        };
+
+        // Need a single row batch for scalar tests
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![Some(1)]))],
+        )?;
+
+        // Helper to run tuple tests
+        let run_tuple_test = |lhs: ScalarValue,
+                              list: Vec<ScalarValue>,
+                              expected: Vec<Option<bool>>|
+         -> Result<()> {
+            let expr = in_list(
+                lit(lhs),
+                list.into_iter().map(lit).collect(),
+                &false,
+                schema.as_ref(),
+            )?;
+            let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+            let result = as_boolean_array(&result);
+            assert_eq!(result, &BooleanArray::from(expected));
+            Ok(())
+        };
+
+        // (NULL, NULL) IN ((1, 2)) => FALSE (tuples don't match)
+        run_tuple_test(
+            make_struct(None, None),
+            vec![make_struct(Some(1), Some(2))],
+            vec![Some(false)],
+        )?;
+
+        // (NULL, NULL) IN ((NULL, 1)) => FALSE
+        run_tuple_test(
+            make_struct(None, None),
+            vec![make_struct(None, Some(1))],
+            vec![Some(false)],
+        )?;
+
+        // (NULL, NULL) IN ((NULL, NULL)) => TRUE (exact match including nulls)
+        run_tuple_test(
+            make_struct(None, None),
+            vec![make_struct(None, None)],
+            vec![Some(true)],
+        )?;
+
+        // (NULL, 1) IN ((1, 2)) => FALSE
+        run_tuple_test(
+            make_struct(None, Some(1)),
+            vec![make_struct(Some(1), Some(2))],
+            vec![Some(false)],
+        )?;
+
+        // (NULL, 1) IN ((NULL, 1)) => TRUE (exact match)
+        run_tuple_test(
+            make_struct(None, Some(1)),
+            vec![make_struct(None, Some(1))],
+            vec![Some(true)],
+        )?;
+
+        // (NULL, 1) IN ((NULL, NULL)) => FALSE
+        run_tuple_test(
+            make_struct(None, Some(1)),
+            vec![make_struct(None, None)],
+            vec![Some(false)],
+        )?;
+
+        // (1, 2) IN ((1, 2)) => TRUE
+        run_tuple_test(
+            make_struct(Some(1), Some(2)),
+            vec![make_struct(Some(1), Some(2))],
+            vec![Some(true)],
+        )?;
+
+        // (1, 3) IN ((1, 2)) => FALSE
+        run_tuple_test(
+            make_struct(Some(1), Some(3)),
+            vec![make_struct(Some(1), Some(2))],
+            vec![Some(false)],
+        )?;
+
+        // (4, 4) IN ((1, 2)) => FALSE
+        run_tuple_test(
+            make_struct(Some(4), Some(4)),
+            vec![make_struct(Some(1), Some(2))],
+            vec![Some(false)],
+        )?;
+
+        // (1, 1) IN ((NULL, 1)) => FALSE
+        run_tuple_test(
+            make_struct(Some(1), Some(1)),
+            vec![make_struct(None, Some(1))],
+            vec![Some(false)],
+        )?;
+
+        // (1, 1) IN ((NULL, NULL)) => FALSE
+        run_tuple_test(
+            make_struct(Some(1), Some(1)),
+            vec![make_struct(None, None)],
+            vec![Some(false)],
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_dictionary_int32() -> Result<()> {
+        // Create schema with dictionary-encoded Int32 column
+        let dict_type =
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32));
+        let schema = Schema::new(vec![Field::new("a", dict_type.clone(), false)]);
+        let col_a = col("a", &schema)?;
+
+        // Create IN list with Int32 literals: (100, 200, 300)
+        let list = vec![lit(100i32), lit(200i32), lit(300i32)];
+
+        // Create InListExpr via in_list() - this uses Int32StaticFilter for Int32 lists
+        let expr = in_list(col_a, list, &false, &schema)?;
+
+        // Create dictionary-encoded batch with values [100, 200, 500]
+        // Dictionary: keys [0, 1, 2] -> values [100, 200, 500]
+        // Using values clearly distinct from keys to avoid confusion
+        let keys = Int8Array::from(vec![0, 1, 2]);
+        let values = Int32Array::from(vec![100, 200, 500]);
+        let dict_array: ArrayRef =
+            Arc::new(DictionaryArray::try_new(keys, Arc::new(values))?);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![dict_array])?;
+
+        // Expected: [100 IN (100,200,300), 200 IN (100,200,300), 500 IN (100,200,300)] = [true, true, false]
+        let result = expr.evaluate(&batch)?.into_array(3)?;
+        let result = as_boolean_array(&result);
+        assert_eq!(result, &BooleanArray::from(vec![true, true, false]));
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_dictionary_types() -> Result<()> {
+        // Helper functions for creating dictionary literals
+        fn dict_lit_int64(key_type: DataType, value: i64) -> Arc<dyn PhysicalExpr> {
+            lit(ScalarValue::Dictionary(
+                Box::new(key_type),
+                Box::new(ScalarValue::Int64(Some(value))),
+            ))
+        }
+
+        fn dict_lit_float64(key_type: DataType, value: f64) -> Arc<dyn PhysicalExpr> {
+            lit(ScalarValue::Dictionary(
+                Box::new(key_type),
+                Box::new(ScalarValue::Float64(Some(value))),
+            ))
+        }
+
+        // Test case structures
+        struct DictNeedleTest {
+            list_values: Vec<Arc<dyn PhysicalExpr>>,
+            expected: Vec<Option<bool>>,
+        }
+
+        struct DictionaryInListTestCase {
+            name: &'static str,
+            dict_type: DataType,
+            dict_keys: Vec<Option<i8>>,
+            dict_values: ArrayRef,
+            list_values_no_null: Vec<Arc<dyn PhysicalExpr>>,
+            list_values_with_null: Vec<Arc<dyn PhysicalExpr>>,
+            expected_1: Vec<Option<bool>>,
+            expected_2: Vec<Option<bool>>,
+            expected_3: Vec<Option<bool>>,
+            expected_4: Vec<Option<bool>>,
+            dict_needle_test: Option<DictNeedleTest>,
+        }
+
+        // Test harness function
+        fn run_dictionary_in_list_test(
+            test_case: DictionaryInListTestCase,
+        ) -> Result<()> {
+            // Create schema with dictionary type
+            let schema =
+                Schema::new(vec![Field::new("a", test_case.dict_type.clone(), true)]);
+            let col_a = col("a", &schema)?;
+
+            // Create dictionary array from keys and values
+            let keys = Int8Array::from(test_case.dict_keys.clone());
+            let dict_array: ArrayRef =
+                Arc::new(DictionaryArray::try_new(keys, test_case.dict_values)?);
+            let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![dict_array])?;
+
+            let exp1 = test_case.expected_1.clone();
+            let exp2 = test_case.expected_2.clone();
+            let exp3 = test_case.expected_3.clone();
+            let exp4 = test_case.expected_4;
+
+            // Test 1: a IN (values_no_null)
+            in_list!(
+                batch,
+                test_case.list_values_no_null.clone(),
+                &false,
+                exp1,
+                Arc::clone(&col_a),
+                &schema
+            );
+
+            // Test 2: a NOT IN (values_no_null)
+            in_list!(
+                batch,
+                test_case.list_values_no_null.clone(),
+                &true,
+                exp2,
+                Arc::clone(&col_a),
+                &schema
+            );
+
+            // Test 3: a IN (values_with_null)
+            in_list!(
+                batch,
+                test_case.list_values_with_null.clone(),
+                &false,
+                exp3,
+                Arc::clone(&col_a),
+                &schema
+            );
+
+            // Test 4: a NOT IN (values_with_null)
+            in_list!(
+                batch,
+                test_case.list_values_with_null,
+                &true,
+                exp4,
+                Arc::clone(&col_a),
+                &schema
+            );
+
+            // Optional: Dictionary needle test (if provided)
+            if let Some(needle_test) = test_case.dict_needle_test {
+                in_list_raw!(
+                    batch,
+                    needle_test.list_values,
+                    &false,
+                    needle_test.expected,
+                    Arc::clone(&col_a),
+                    &schema
+                );
+            }
+
+            Ok(())
+        }
+
+        // Test case 1: UTF8
+        // Dictionary: keys [0, 1, null] → values ["a", "d", -]
+        // Rows: ["a", "d", null]
+        let utf8_case = DictionaryInListTestCase {
+            name: "dictionary_utf8",
+            dict_type: DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(DataType::Utf8),
+            ),
+            dict_keys: vec![Some(0), Some(1), None],
+            dict_values: Arc::new(StringArray::from(vec![Some("a"), Some("d")])),
+            list_values_no_null: vec![lit("a"), lit("b")],
+            list_values_with_null: vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))],
+            expected_1: vec![Some(true), Some(false), None],
+            expected_2: vec![Some(false), Some(true), None],
+            expected_3: vec![Some(true), None, None],
+            expected_4: vec![Some(false), None, None],
+            dict_needle_test: None,
+        };
+
+        // Test case 2: Int64 with dictionary needles
+        // Dictionary: keys [0, 1, null] → values [10, 20, -]
+        // Rows: [10, 20, null]
+        let int64_case = DictionaryInListTestCase {
+            name: "dictionary_int64",
+            dict_type: DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(DataType::Int64),
+            ),
+            dict_keys: vec![Some(0), Some(1), None],
+            dict_values: Arc::new(Int64Array::from(vec![Some(10), Some(20)])),
+            list_values_no_null: vec![lit(10i64), lit(15i64)],
+            list_values_with_null: vec![
+                lit(10i64),
+                lit(15i64),
+                lit(ScalarValue::Int64(None)),
+            ],
+            expected_1: vec![Some(true), Some(false), None],
+            expected_2: vec![Some(false), Some(true), None],
+            expected_3: vec![Some(true), None, None],
+            expected_4: vec![Some(false), None, None],
+            dict_needle_test: Some(DictNeedleTest {
+                list_values: vec![
+                    dict_lit_int64(DataType::Int16, 10),
+                    dict_lit_int64(DataType::Int16, 15),
+                ],
+                expected: vec![Some(true), Some(false), None],
+            }),
+        };
+
+        // Test case 3: Float64 with NaN and dictionary needles
+        // Dictionary: keys [0, 1, null, 2] → values [1.5, 3.7, NaN, -]
+        // Rows: [1.5, 3.7, null, NaN]
+        // Note: NaN is a value (not null), so it goes in the values array
+        let float64_case = DictionaryInListTestCase {
+            name: "dictionary_float64",
+            dict_type: DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(DataType::Float64),
+            ),
+            dict_keys: vec![Some(0), Some(1), None, Some(2)],
+            dict_values: Arc::new(Float64Array::from(vec![
+                Some(1.5),      // index 0
+                Some(3.7),      // index 1
+                Some(f64::NAN), // index 2
+            ])),
+            list_values_no_null: vec![lit(1.5f64), lit(2.0f64)],
+            list_values_with_null: vec![
+                lit(1.5f64),
+                lit(2.0f64),
+                lit(ScalarValue::Float64(None)),
+            ],
+            // Test 1: a IN (1.5, 2.0) → [true, false, null, false]
+            // NaN is false because NaN not in list and no NULL in list
+            expected_1: vec![Some(true), Some(false), None, Some(false)],
+            // Test 2: a NOT IN (1.5, 2.0) → [false, true, null, true]
+            // NaN is true because NaN not in list
+            expected_2: vec![Some(false), Some(true), None, Some(true)],
+            // Test 3: a IN (1.5, 2.0, NULL) → [true, null, null, null]
+            // 3.7 and NaN become null due to NULL in list (three-valued logic)
+            expected_3: vec![Some(true), None, None, None],
+            // Test 4: a NOT IN (1.5, 2.0, NULL) → [false, null, null, null]
+            // 3.7 and NaN become null due to NULL in list
+            expected_4: vec![Some(false), None, None, None],
+            dict_needle_test: Some(DictNeedleTest {
+                list_values: vec![
+                    dict_lit_float64(DataType::UInt16, 1.5),
+                    dict_lit_float64(DataType::UInt16, 2.0),
+                ],
+                expected: vec![Some(true), Some(false), None, Some(false)],
+            }),
+        };
+
+        // Execute all test cases
+        let test_name = utf8_case.name;
+        run_dictionary_in_list_test(utf8_case).map_err(|e| {
+            datafusion_common::DataFusionError::Execution(format!(
+                "Dictionary test '{test_name}' failed: {e}"
+            ))
+        })?;
+
+        let test_name = int64_case.name;
+        run_dictionary_in_list_test(int64_case).map_err(|e| {
+            datafusion_common::DataFusionError::Execution(format!(
+                "Dictionary test '{test_name}' failed: {e}"
+            ))
+        })?;
+
+        let test_name = float64_case.name;
+        run_dictionary_in_list_test(float64_case).map_err(|e| {
+            datafusion_common::DataFusionError::Execution(format!(
+                "Dictionary test '{test_name}' failed: {e}"
+            ))
+        })?;
+
+        // Additional test: Dictionary deduplication with repeated keys
+        // This tests that multiple rows with the same key (pointing to the same value)
+        // are evaluated correctly
+        let dedup_case = DictionaryInListTestCase {
+            name: "dictionary_deduplication",
+            dict_type: DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(DataType::Utf8),
+            ),
+            // Keys: [0, 1, 0, 1, null] - keys 0 and 1 are repeated
+            // This creates data: ["a", "d", "a", "d", null]
+            dict_keys: vec![Some(0), Some(1), Some(0), Some(1), None],
+            dict_values: Arc::new(StringArray::from(vec![Some("a"), Some("d")])),
+            list_values_no_null: vec![lit("a"), lit("b")],
+            list_values_with_null: vec![lit("a"), lit("b"), lit(ScalarValue::Utf8(None))],
+            // Test 1: a IN ("a", "b") → [true, false, true, false, null]
+            // Rows 0 and 2 both have key 0 → "a", so both are true
+            expected_1: vec![Some(true), Some(false), Some(true), Some(false), None],
+            // Test 2: a NOT IN ("a", "b") → [false, true, false, true, null]
+            expected_2: vec![Some(false), Some(true), Some(false), Some(true), None],
+            // Test 3: a IN ("a", "b", NULL) → [true, null, true, null, null]
+            // "d" becomes null due to NULL in list
+            expected_3: vec![Some(true), None, Some(true), None, None],
+            // Test 4: a NOT IN ("a", "b", NULL) → [false, null, false, null, null]
+            expected_4: vec![Some(false), None, Some(false), None, None],
+            dict_needle_test: None,
+        };
+
+        let test_name = dedup_case.name;
+        run_dictionary_in_list_test(dedup_case).map_err(|e| {
+            datafusion_common::DataFusionError::Execution(format!(
+                "Dictionary test '{test_name}' failed: {e}"
+            ))
+        })?;
+
+        // Additional test for Float64 NaN in IN list
+        let dict_type =
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Float64));
+        let schema = Schema::new(vec![Field::new("a", dict_type.clone(), true)]);
+        let col_a = col("a", &schema)?;
+
+        let keys = Int8Array::from(vec![Some(0), Some(1), None, Some(2)]);
+        let values = Float64Array::from(vec![Some(1.5), Some(3.7), Some(f64::NAN)]);
+        let dict_array: ArrayRef =
+            Arc::new(DictionaryArray::try_new(keys, Arc::new(values))?);
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![dict_array])?;
+
+        // Test: a IN (1.5, 2.0, NaN)
+        let list_with_nan = vec![lit(1.5f64), lit(2.0f64), lit(f64::NAN)];
+        in_list!(
+            batch,
+            list_with_nan,
+            &false,
+            vec![Some(true), Some(false), None, Some(true)],
+            col_a,
+            &schema
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_esoteric_types() -> Result<()> {
+        // Test esoteric/less common types to validate the transform and mapping flow.
+        // These types are reinterpreted to base primitive types (e.g., Timestamp -> UInt64,
+        // Interval -> Decimal128, Float16 -> UInt16). We just need to verify basic
+        // functionality works - no need for comprehensive null handling tests.
+
+        // Helper: simple IN test that expects [Some(true), Some(false)]
+        let test_type = |data_type: DataType,
+                         in_array: ArrayRef,
+                         list_values: Vec<ScalarValue>|
+         -> Result<()> {
+            let schema = Schema::new(vec![Field::new("a", data_type.clone(), false)]);
+            let col_a = col("a", &schema)?;
+            let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![in_array])?;
+
+            let list = list_values.into_iter().map(lit).collect();
+            in_list!(
+                batch,
+                list,
+                &false,
+                vec![Some(true), Some(false)],
+                col_a,
+                &schema
+            );
+            Ok(())
+        };
+
+        // Timestamp types (all units map to Int64 -> UInt64)
+        test_type(
+            DataType::Timestamp(TimeUnit::Second, None),
+            Arc::new(TimestampSecondArray::from(vec![Some(1000), Some(2000)])),
+            vec![
+                ScalarValue::TimestampSecond(Some(1000), None),
+                ScalarValue::TimestampSecond(Some(1500), None),
+            ],
+        )?;
+
+        test_type(
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            Arc::new(TimestampMillisecondArray::from(vec![
+                Some(1000000),
+                Some(2000000),
+            ])),
+            vec![
+                ScalarValue::TimestampMillisecond(Some(1000000), None),
+                ScalarValue::TimestampMillisecond(Some(1500000), None),
+            ],
+        )?;
+
+        test_type(
+            DataType::Timestamp(TimeUnit::Microsecond, None),
+            Arc::new(TimestampMicrosecondArray::from(vec![
+                Some(1000000000),
+                Some(2000000000),
+            ])),
+            vec![
+                ScalarValue::TimestampMicrosecond(Some(1000000000), None),
+                ScalarValue::TimestampMicrosecond(Some(1500000000), None),
+            ],
+        )?;
+
+        // Time32 and Time64 (map to Int32 -> UInt32 and Int64 -> UInt64 respectively)
+        test_type(
+            DataType::Time32(TimeUnit::Second),
+            Arc::new(Time32SecondArray::from(vec![Some(3600), Some(7200)])),
+            vec![
+                ScalarValue::Time32Second(Some(3600)),
+                ScalarValue::Time32Second(Some(5400)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time32(TimeUnit::Millisecond),
+            Arc::new(Time32MillisecondArray::from(vec![
+                Some(3600000),
+                Some(7200000),
+            ])),
+            vec![
+                ScalarValue::Time32Millisecond(Some(3600000)),
+                ScalarValue::Time32Millisecond(Some(5400000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time64(TimeUnit::Microsecond),
+            Arc::new(Time64MicrosecondArray::from(vec![
+                Some(3600000000),
+                Some(7200000000),
+            ])),
+            vec![
+                ScalarValue::Time64Microsecond(Some(3600000000)),
+                ScalarValue::Time64Microsecond(Some(5400000000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time64(TimeUnit::Nanosecond),
+            Arc::new(Time64NanosecondArray::from(vec![
+                Some(3600000000000),
+                Some(7200000000000),
+            ])),
+            vec![
+                ScalarValue::Time64Nanosecond(Some(3600000000000)),
+                ScalarValue::Time64Nanosecond(Some(5400000000000)),
+            ],
+        )?;
+
+        // Duration types (map to Int64 -> UInt64)
+        test_type(
+            DataType::Duration(TimeUnit::Second),
+            Arc::new(DurationSecondArray::from(vec![Some(86400), Some(172800)])),
+            vec![
+                ScalarValue::DurationSecond(Some(86400)),
+                ScalarValue::DurationSecond(Some(129600)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Millisecond),
+            Arc::new(DurationMillisecondArray::from(vec![
+                Some(86400000),
+                Some(172800000),
+            ])),
+            vec![
+                ScalarValue::DurationMillisecond(Some(86400000)),
+                ScalarValue::DurationMillisecond(Some(129600000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Microsecond),
+            Arc::new(DurationMicrosecondArray::from(vec![
+                Some(86400000000),
+                Some(172800000000),
+            ])),
+            vec![
+                ScalarValue::DurationMicrosecond(Some(86400000000)),
+                ScalarValue::DurationMicrosecond(Some(129600000000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Nanosecond),
+            Arc::new(DurationNanosecondArray::from(vec![
+                Some(86400000000000),
+                Some(172800000000000),
+            ])),
+            vec![
+                ScalarValue::DurationNanosecond(Some(86400000000000)),
+                ScalarValue::DurationNanosecond(Some(129600000000000)),
+            ],
+        )?;
+
+        // Interval types (map to 16-byte Decimal128Type)
+        test_type(
+            DataType::Interval(IntervalUnit::YearMonth),
+            Arc::new(IntervalYearMonthArray::from(vec![Some(12), Some(24)])),
+            vec![
+                ScalarValue::IntervalYearMonth(Some(12)),
+                ScalarValue::IntervalYearMonth(Some(18)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Interval(IntervalUnit::DayTime),
+            Arc::new(IntervalDayTimeArray::from(vec![
+                Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 0,
+                }),
+                Some(IntervalDayTime {
+                    days: 2,
+                    milliseconds: 0,
+                }),
+            ])),
+            vec![
+                ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 0,
+                })),
+                ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 500,
+                })),
+            ],
+        )?;
+
+        test_type(
+            DataType::Interval(IntervalUnit::MonthDayNano),
+            Arc::new(IntervalMonthDayNanoArray::from(vec![
+                Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 0,
+                    nanoseconds: 0,
+                }),
+                Some(IntervalMonthDayNano {
+                    months: 2,
+                    days: 0,
+                    nanoseconds: 0,
+                }),
+            ])),
+            vec![
+                ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 0,
+                    nanoseconds: 0,
+                })),
+                ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 15,
+                    nanoseconds: 0,
+                })),
+            ],
+        )?;
+
+        // Decimal256 (maps to Decimal128Type for 16-byte width)
+        // Need to use with_precision_and_scale() to set the metadata
+        let precision = 38;
+        let scale = 10;
+        test_type(
+            DataType::Decimal256(precision, scale),
+            Arc::new(
+                Decimal256Array::from(vec![
+                    Some(i256::from(12345)),
+                    Some(i256::from(67890)),
+                ])
+                .with_precision_and_scale(precision, scale)?,
+            ),
+            vec![
+                ScalarValue::Decimal256(Some(i256::from(12345)), precision, scale),
+                ScalarValue::Decimal256(Some(i256::from(54321)), precision, scale),
+            ],
+        )?;
+
+        Ok(())
+    }
+
+    /// Helper: creates an InListExpr with `static_filter = None`
+    /// to force the column-reference evaluation path.
+    fn make_in_list_with_columns(
+        expr: Arc<dyn PhysicalExpr>,
+        list: Vec<Arc<dyn PhysicalExpr>>,
+        negated: bool,
+    ) -> Arc<InListExpr> {
+        Arc::new(InListExpr::new(expr, list, negated, None))
+    }
+
+    #[test]
+    fn test_in_list_with_columns_int32_scalars() -> Result<()> {
+        // Column-reference path with scalar literals (bypassing static filter)
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![Arc::new(Int32Array::from(vec![
+                Some(1),
+                Some(2),
+                Some(3),
+                None,
+            ]))],
+        )?;
+
+        let list = vec![
+            lit(ScalarValue::Int32(Some(1))),
+            lit(ScalarValue::Int32(Some(3))),
+        ];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        assert_eq!(
+            result,
+            &BooleanArray::from(vec![Some(true), Some(false), Some(true), None,])
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_columns_int32_column_refs() -> Result<()> {
+        // IN list with column references
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), None])),
+                Arc::new(Int32Array::from(vec![
+                    Some(1),
+                    Some(99),
+                    Some(99),
+                    Some(99),
+                ])),
+                Arc::new(Int32Array::from(vec![Some(99), Some(99), Some(3), None])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?, col("c", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: 1 IN (1, 99) → true
+        // row 1: 2 IN (99, 99) → false
+        // row 2: 3 IN (99, 3) → true
+        // row 3: NULL IN (99, NULL) → NULL
+        assert_eq!(
+            result,
+            &BooleanArray::from(vec![Some(true), Some(false), Some(true), None,])
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_columns_utf8_column_refs() -> Result<()> {
+        // IN list with Utf8 column references
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Utf8, false),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(StringArray::from(vec!["x", "y", "z"])),
+                Arc::new(StringArray::from(vec!["x", "x", "z"])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: "x" IN ("x") → true
+        // row 1: "y" IN ("x") → false
+        // row 2: "z" IN ("z") → true
+        assert_eq!(result, &BooleanArray::from(vec![true, false, true]));
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_columns_negated() -> Result<()> {
+        // NOT IN with column references
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(Int32Array::from(vec![1, 99, 3])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, true);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: 1 NOT IN (1) → false
+        // row 1: 2 NOT IN (99) → true
+        // row 2: 3 NOT IN (3) → false
+        assert_eq!(result, &BooleanArray::from(vec![false, true, false]));
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_columns_null_in_list() -> Result<()> {
+        // IN list with NULL scalar (column-reference path)
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let col_a = col("a", &schema)?;
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2]))],
+        )?;
+
+        let list = vec![
+            lit(ScalarValue::Int32(None)),
+            lit(ScalarValue::Int32(Some(1))),
+        ];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: 1 IN (NULL, 1) → true (true OR null = true)
+        // row 1: 2 IN (NULL, 1) → NULL (false OR null = null)
+        assert_eq!(result, &BooleanArray::from(vec![Some(true), None]));
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_with_columns_float_nan() -> Result<()> {
+        // Verify NaN == NaN is true in the column-reference path
+        // (consistent with Arrow's totalOrder semantics)
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Float64, false),
+            Field::new("b", DataType::Float64, false),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(Float64Array::from(vec![f64::NAN, 1.0, f64::NAN])),
+                Arc::new(Float64Array::from(vec![f64::NAN, 2.0, 0.0])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: NaN IN (NaN) → true
+        // row 1: 1.0 IN (2.0) → false
+        // row 2: NaN IN (0.0) → false
+        assert_eq!(result, &BooleanArray::from(vec![true, false, false]));
+        Ok(())
+    }
+
+    /// Tests that short-circuit evaluation produces correct results.
+    /// When all rows match after the first list item, remaining items
+    /// should be skipped without affecting correctness.
+    #[test]
+    fn test_in_list_with_columns_short_circuit() -> Result<()> {
+        // a IN (b, c) where b already matches every row of a
+        // The short-circuit should skip evaluating c
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(Int32Array::from(vec![1, 2, 3])), // b == a for all rows
+                Arc::new(Int32Array::from(vec![99, 99, 99])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?, col("c", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        assert_eq!(result, &BooleanArray::from(vec![true, true, true]));
+        Ok(())
+    }
+
+    /// Short-circuit must NOT skip when nulls are present (three-valued logic).
+    /// Even if all non-null values are true, null rows keep the result as null.
+    #[test]
+    fn test_in_list_with_columns_short_circuit_with_nulls() -> Result<()> {
+        // a IN (b, c) where a has nulls
+        // Even if b matches all non-null rows, result should preserve nulls
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]);
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])),
+                Arc::new(Int32Array::from(vec![1, 2, 3])), // matches non-null rows
+                Arc::new(Int32Array::from(vec![99, 99, 99])),
+            ],
+        )?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?, col("c", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: 1 IN (1, 99) → true
+        // row 1: NULL IN (2, 99) → NULL
+        // row 2: 3 IN (3, 99) → true
+        assert_eq!(
+            result,
+            &BooleanArray::from(vec![Some(true), None, Some(true)])
+        );
+        Ok(())
+    }
+
+    /// Tests the make_comparator + collect_bool fallback path using
+    /// struct column references (nested types don't support arrow_eq).
+    #[test]
+    fn test_in_list_with_columns_struct() -> Result<()> {
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Utf8, false),
+        ]);
+        let struct_dt = DataType::Struct(struct_fields.clone());
+
+        let schema = Schema::new(vec![
+            Field::new("a", struct_dt.clone(), true),
+            Field::new("b", struct_dt.clone(), false),
+            Field::new("c", struct_dt.clone(), false),
+        ]);
+
+        // a: [{1,"a"}, {2,"b"}, NULL,    {4,"d"}]
+        // b: [{1,"a"}, {9,"z"}, {3,"c"}, {4,"d"}]
+        // c: [{9,"z"}, {2,"b"}, {9,"z"}, {9,"z"}]
+        let a = Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3, 4])),
+                Arc::new(StringArray::from(vec!["a", "b", "c", "d"])),
+            ],
+            Some(vec![true, true, false, true].into()),
+        ));
+        let b = Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 9, 3, 4])),
+                Arc::new(StringArray::from(vec!["a", "z", "c", "d"])),
+            ],
+            None,
+        ));
+        let c = Arc::new(StructArray::new(
+            struct_fields.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![9, 2, 9, 9])),
+                Arc::new(StringArray::from(vec!["z", "b", "z", "z"])),
+            ],
+            None,
+        ));
+
+        let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![a, b, c])?;
+
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?, col("c", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, false);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: {1,"a"} IN ({1,"a"}, {9,"z"}) → true  (matches b)
+        // row 1: {2,"b"} IN ({9,"z"}, {2,"b"}) → true  (matches c)
+        // row 2: NULL    IN ({3,"c"}, {9,"z"}) → NULL
+        // row 3: {4,"d"} IN ({4,"d"}, {9,"z"}) → true  (matches b)
+        assert_eq!(
+            result,
+            &BooleanArray::from(vec![Some(true), Some(true), None, Some(true)])
+        );
+
+        // Also test NOT IN
+        let col_a = col("a", &schema)?;
+        let list = vec![col("b", &schema)?, col("c", &schema)?];
+        let expr = make_in_list_with_columns(col_a, list, true);
+
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        let result = as_boolean_array(&result);
+        // row 0: {1,"a"} NOT IN ({1,"a"}, {9,"z"}) → false
+        // row 1: {2,"b"} NOT IN ({9,"z"}, {2,"b"}) → false
+        // row 2: NULL    NOT IN ({3,"c"}, {9,"z"}) → NULL
+        // row 3: {4,"d"} NOT IN ({4,"d"}, {9,"z"}) → false
+        assert_eq!(
+            result,
+            &BooleanArray::from(vec![Some(false), Some(false), None, Some(false)])
+        );
+        Ok(())
+    }
+
+    // -----------------------------------------------------------------------
+    // Tests for try_new_from_array: evaluates `needle IN in_array`.
+    //
+    // This exercises the code path used by HashJoin dynamic filter pushdown,
+    // where in_array is built directly from the join's build-side arrays.
+    // Unlike try_new (used by SQL IN expressions), which always produces a
+    // non-Dictionary in_array because evaluate_list() flattens Dictionary
+    // scalars, try_new_from_array passes the array directly and can produce
+    // a Dictionary in_array.
+    // -----------------------------------------------------------------------
+
+    fn wrap_in_dict(array: ArrayRef) -> ArrayRef {
+        let keys = Int32Array::from((0..array.len() as i32).collect::<Vec<_>>());
+        Arc::new(DictionaryArray::new(keys, array))
+    }
+
+    /// Evaluates `needle IN in_array` via try_new_from_array, the same
+    /// path used by HashJoin dynamic filter pushdown (not the SQL literal
+    /// IN path which goes through try_new).
+    fn eval_in_list_from_array(
+        needle: ArrayRef,
+        in_array: ArrayRef,
+    ) -> Result<BooleanArray> {
+        let schema =
+            Schema::new(vec![Field::new("a", needle.data_type().clone(), false)]);
+        let col_a = col("a", &schema)?;
+        let expr = Arc::new(InListExpr::try_new_from_array(col_a, in_array, false)?)
+            as Arc<dyn PhysicalExpr>;
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![needle])?;
+        let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
+        Ok(as_boolean_array(&result).clone())
+    }
+
+    #[test]
+    fn test_in_list_from_array_type_combinations() -> Result<()> {
+        use arrow::compute::cast;
+
+        // All cases: needle[0] and needle[2] match, needle[1] does not.
+        let expected = BooleanArray::from(vec![Some(true), Some(false), Some(true)]);
+
+        // Base arrays cast to each target type
+        let base_in = Arc::new(Int64Array::from(vec![1i64, 2, 3])) as ArrayRef;
+        let base_needle = Arc::new(Int64Array::from(vec![1i64, 4, 2])) as ArrayRef;
+
+        // Test all specializations in instantiate_static_filter
+        let primitive_types = vec![
+            DataType::Int8,
+            DataType::Int16,
+            DataType::Int32,
+            DataType::Int64,
+            DataType::UInt8,
+            DataType::UInt16,
+            DataType::UInt32,
+            DataType::UInt64,
+            DataType::Float32,
+            DataType::Float64,
+        ];
+
+        for dt in &primitive_types {
+            let in_array = cast(&base_in, dt)?;
+            let needle = cast(&base_needle, dt)?;
+
+            // T in_array, T needle
+            assert_eq!(
+                expected,
+                eval_in_list_from_array(Arc::clone(&needle), Arc::clone(&in_array))?,
+                "same-type failed for {dt:?}"
+            );
+
+            // T in_array, Dict(Int32, T) needle
+            assert_eq!(
+                expected,
+                eval_in_list_from_array(wrap_in_dict(needle), in_array)?,
+                "dict-needle failed for {dt:?}"
+            );
+        }
+
+        // Utf8 (falls through to ArrayStaticFilter)
+        let utf8_in = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef;
+        let utf8_needle = Arc::new(StringArray::from(vec!["a", "d", "b"])) as ArrayRef;
+
+        // Utf8 in_array, Utf8 needle
+        assert_eq!(
+            expected,
+            eval_in_list_from_array(Arc::clone(&utf8_needle), Arc::clone(&utf8_in),)?
+        );
+
+        // Utf8 in_array, Dict(Utf8) needle
+        assert_eq!(
+            expected,
+            eval_in_list_from_array(
+                wrap_in_dict(Arc::clone(&utf8_needle)),
+                Arc::clone(&utf8_in),
+            )?
+        );
+
+        // Dict(Utf8) in_array, Dict(Utf8) needle: the #20937 bug
+        assert_eq!(
+            expected,
+            eval_in_list_from_array(
+                wrap_in_dict(Arc::clone(&utf8_needle)),
+                wrap_in_dict(Arc::clone(&utf8_in)),
+            )?
+        );
+
+        // Struct in_array, Struct needle: multi-column join
+        let struct_fields = Fields::from(vec![
+            Field::new("c0", DataType::Utf8, true),
+            Field::new("c1", DataType::Int64, true),
+        ]);
+        let make_struct = |c0: ArrayRef, c1: ArrayRef| -> ArrayRef {
+            let pairs: Vec<(FieldRef, ArrayRef)> =
+                struct_fields.iter().cloned().zip([c0, c1]).collect();
+            Arc::new(StructArray::from(pairs))
+        };
+        assert_eq!(
+            expected,
+            eval_in_list_from_array(
+                make_struct(
+                    Arc::clone(&utf8_needle),
+                    Arc::new(Int64Array::from(vec![1, 4, 2])),
+                ),
+                make_struct(
+                    Arc::clone(&utf8_in),
+                    Arc::new(Int64Array::from(vec![1, 2, 3])),
+                ),
+            )?
+        );
+
+        // Struct with Dict fields: multi-column Dict join
+        let dict_struct_fields = Fields::from(vec![
+            Field::new(
+                "c0",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+            Field::new("c1", DataType::Int64, true),
+        ]);
+        let make_dict_struct = |c0: ArrayRef, c1: ArrayRef| -> ArrayRef {
+            let pairs: Vec<(FieldRef, ArrayRef)> =
+                dict_struct_fields.iter().cloned().zip([c0, c1]).collect();
+            Arc::new(StructArray::from(pairs))
+        };
+        assert_eq!(
+            expected,
+            eval_in_list_from_array(
+                make_dict_struct(
+                    wrap_in_dict(Arc::clone(&utf8_needle)),
+                    Arc::new(Int64Array::from(vec![1, 4, 2])),
+                ),
+                make_dict_struct(
+                    wrap_in_dict(Arc::clone(&utf8_in)),
+                    Arc::new(Int64Array::from(vec![1, 2, 3])),
+                ),
+            )?
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_in_list_from_array_type_mismatch_errors() -> Result<()> {
+        // Utf8 needle, Dict(Utf8) in_array
+        let err = eval_in_list_from_array(
+            Arc::new(StringArray::from(vec!["a", "d", "b"])),
+            wrap_in_dict(Arc::new(StringArray::from(vec!["a", "b", "c"]))),
+        )
+        .unwrap_err()
+        .to_string();
+        assert!(
+            err.contains("Can't compare arrays of different types"),
+            "{err}"
+        );
+
+        // Dict(Utf8) needle, Int64 in_array: specialized Int64StaticFilter
+        // rejects the Utf8 dictionary values at construction time
+        let err = eval_in_list_from_array(
+            wrap_in_dict(Arc::new(StringArray::from(vec!["a", "d", "b"]))),
+            Arc::new(Int64Array::from(vec![1, 2, 3])),
+        )
+        .unwrap_err()
+        .to_string();
+        assert!(err.contains("Failed to downcast"), "{err}");
+
+        // Dict(Int64) needle, Dict(Utf8) in_array: both Dict but different
+        // value types, make_comparator rejects the comparison
+        let err = eval_in_list_from_array(
+            wrap_in_dict(Arc::new(Int64Array::from(vec![1, 4, 2]))),
+            wrap_in_dict(Arc::new(StringArray::from(vec!["a", "b", "c"]))),
+        )
+        .unwrap_err()
+        .to_string();
+        assert!(
+            err.contains("Can't compare arrays of different types"),
+            "{err}"
+        );
 
         Ok(())
     }
diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr/src/expressions/is_not_null.rs
index ff05dab40126a..62be8ebbc13e3 100644
--- a/datafusion/physical-expr/src/expressions/is_not_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_not_null.rs
@@ -18,7 +18,6 @@
 //! IS NOT NULL expression
 
 use crate::PhysicalExpr;
-use arrow::datatypes::FieldRef;
 use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
@@ -94,10 +93,6 @@ impl PhysicalExpr for IsNotNullExpr {
         }
     }
 
-    fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
-        self.arg.return_field(input_schema)
-    }
-
     fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
         vec![&self.arg]
     }
diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr/src/expressions/is_null.rs
index 15c7c645bda09..356fe2a866672 100644
--- a/datafusion/physical-expr/src/expressions/is_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_null.rs
@@ -18,7 +18,6 @@
 //! IS NULL expression
 
 use crate::PhysicalExpr;
-use arrow::datatypes::FieldRef;
 use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
@@ -93,10 +92,6 @@ impl PhysicalExpr for IsNullExpr {
         }
     }
 
-    fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
-        self.arg.return_field(input_schema)
-    }
-
     fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
         vec![&self.arg]
     }
diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs
index e86c778d51619..fc49ca35f0498 100644
--- a/datafusion/physical-expr/src/expressions/like.rs
+++ b/datafusion/physical-expr/src/expressions/like.rs
@@ -18,8 +18,8 @@
 use crate::PhysicalExpr;
 use arrow::datatypes::{DataType, Schema};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{internal_err, Result};
-use datafusion_expr::ColumnarValue;
+use datafusion_common::{Result, assert_or_internal_err};
+use datafusion_expr::{ColumnarValue, Operator};
 use datafusion_physical_expr_common::datum::apply_cmp;
 use std::hash::Hash;
 use std::{any::Any, sync::Arc};
@@ -118,14 +118,13 @@ impl PhysicalExpr for LikeExpr {
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
-        use arrow::compute::*;
         let lhs = self.expr.evaluate(batch)?;
         let rhs = self.pattern.evaluate(batch)?;
         match (self.negated, self.case_insensitive) {
-            (false, false) => apply_cmp(&lhs, &rhs, like),
-            (false, true) => apply_cmp(&lhs, &rhs, ilike),
-            (true, false) => apply_cmp(&lhs, &rhs, nlike),
-            (true, true) => apply_cmp(&lhs, &rhs, nilike),
+            (false, false) => apply_cmp(Operator::LikeMatch, &lhs, &rhs),
+            (false, true) => apply_cmp(Operator::ILikeMatch, &lhs, &rhs),
+            (true, false) => apply_cmp(Operator::NotLikeMatch, &lhs, &rhs),
+            (true, true) => apply_cmp(Operator::NotILikeMatch, &lhs, &rhs),
         }
     }
 
@@ -170,11 +169,10 @@ pub fn like(
 ) -> Result<Arc<dyn PhysicalExpr>> {
     let expr_type = &expr.data_type(input_schema)?;
     let pattern_type = &pattern.data_type(input_schema)?;
-    if !expr_type.eq(pattern_type) && !can_like_type(expr_type) {
-        return internal_err!(
-            "The type of {expr_type} AND {pattern_type} of like physical should be same"
-        );
-    }
+    assert_or_internal_err!(
+        expr_type.eq(pattern_type) || can_like_type(expr_type),
+        "The type of {expr_type} AND {pattern_type} of like physical should be same"
+    );
     Ok(Arc::new(LikeExpr::new(
         negated,
         case_insensitive,
diff --git a/datafusion/physical-expr/src/expressions/literal.rs b/datafusion/physical-expr/src/expressions/literal.rs
index 6f7caaea8d45f..9105297c96d61 100644
--- a/datafusion/physical-expr/src/expressions/literal.rs
+++ b/datafusion/physical-expr/src/expressions/literal.rs
@@ -23,26 +23,60 @@ use std::sync::Arc;
 
 use crate::physical_expr::PhysicalExpr;
 
+use arrow::datatypes::{Field, FieldRef};
 use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
 };
+use datafusion_common::metadata::FieldMetadata;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::Expr;
 use datafusion_expr_common::columnar_value::ColumnarValue;
 use datafusion_expr_common::interval_arithmetic::Interval;
+use datafusion_expr_common::placement::ExpressionPlacement;
 use datafusion_expr_common::sort_properties::{ExprProperties, SortProperties};
 
 /// Represents a literal value
-#[derive(Debug, PartialEq, Eq, Hash)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct Literal {
     value: ScalarValue,
+    field: FieldRef,
+}
+
+impl Hash for Literal {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.value.hash(state);
+        let metadata = self.field.metadata();
+        let mut keys = metadata.keys().collect::<Vec<_>>();
+        keys.sort();
+        for key in keys {
+            key.hash(state);
+            metadata.get(key).unwrap().hash(state);
+        }
+    }
 }
 
 impl Literal {
     /// Create a literal value expression
     pub fn new(value: ScalarValue) -> Self {
-        Self { value }
+        Self::new_with_metadata(value, None)
+    }
+
+    /// Create a literal value expression
+    pub fn new_with_metadata(
+        value: ScalarValue,
+        metadata: Option<FieldMetadata>,
+    ) -> Self {
+        let mut field = Field::new("lit".to_string(), value.data_type(), value.is_null());
+
+        if let Some(metadata) = metadata {
+            field = metadata.add_to_field(field);
+        }
+
+        Self {
+            value,
+            field: field.into(),
+        }
     }
 
     /// Get the scalar value
@@ -71,6 +105,10 @@ impl PhysicalExpr for Literal {
         Ok(self.value.is_null())
     }
 
+    fn return_field(&self, _input_schema: &Schema) -> Result<FieldRef> {
+        Ok(Arc::clone(&self.field))
+    }
+
     fn evaluate(&self, _batch: &RecordBatch) -> Result<ColumnarValue> {
         Ok(ColumnarValue::Scalar(self.value.clone()))
     }
@@ -97,12 +135,17 @@ impl PhysicalExpr for Literal {
     fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         std::fmt::Display::fmt(self, f)
     }
+
+    fn placement(&self) -> ExpressionPlacement {
+        ExpressionPlacement::Literal
+    }
 }
 
 /// Create a literal expression
+#[expect(clippy::needless_pass_by_value)]
 pub fn lit<T: datafusion_expr::Literal>(value: T) -> Arc<dyn PhysicalExpr> {
     match value.lit() {
-        Expr::Literal(v) => Arc::new(Literal::new(v)),
+        Expr::Literal(v, _) => Arc::new(Literal::new(v)),
         _ => unreachable!(),
     }
 }
diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs
index d77207fbbcd76..c9e02708d6c28 100644
--- a/datafusion/physical-expr/src/expressions/mod.rs
+++ b/datafusion/physical-expr/src/expressions/mod.rs
@@ -21,6 +21,7 @@
 mod binary;
 mod case;
 mod cast;
+mod cast_column;
 mod column;
 mod dynamic_filters;
 mod in_list;
@@ -34,22 +35,24 @@ mod not;
 mod try_cast;
 mod unknown_column;
 
+pub use crate::PhysicalSortExpr;
 /// Module with some convenient methods used in expression building
 pub use crate::aggregate::stats::StatsType;
-pub use crate::PhysicalSortExpr;
 
-pub use binary::{binary, similar_to, BinaryExpr};
-pub use case::{case, CaseExpr};
-pub use cast::{cast, CastExpr};
-pub use column::{col, with_new_schema, Column};
+pub use binary::{BinaryExpr, binary, similar_to};
+pub use case::{CaseExpr, case};
+pub use cast::{CastExpr, cast};
+pub use cast_column::CastColumnExpr;
+pub use column::{Column, col, with_new_schema};
 pub use datafusion_expr::utils::format_state_name;
-pub use in_list::{in_list, InListExpr};
-pub use is_not_null::{is_not_null, IsNotNullExpr};
-pub use is_null::{is_null, IsNullExpr};
-pub use like::{like, LikeExpr};
-pub use literal::{lit, Literal};
-pub use negative::{negative, NegativeExpr};
+pub use dynamic_filters::DynamicFilterPhysicalExpr;
+pub use in_list::{InListExpr, in_list};
+pub use is_not_null::{IsNotNullExpr, is_not_null};
+pub use is_null::{IsNullExpr, is_null};
+pub use like::{LikeExpr, like};
+pub use literal::{Literal, lit};
+pub use negative::{NegativeExpr, negative};
 pub use no_op::NoOp;
-pub use not::{not, NotExpr};
-pub use try_cast::{try_cast, TryCastExpr};
+pub use not::{NotExpr, not};
+pub use try_cast::{TryCastExpr, try_cast};
 pub use unknown_column::UnKnownColumn;
diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs
index fa7224768a777..c78bbe999eb26 100644
--- a/datafusion/physical-expr/src/expressions/negative.rs
+++ b/datafusion/physical-expr/src/expressions/negative.rs
@@ -29,15 +29,15 @@ use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
 };
-use datafusion_common::{internal_err, plan_err, Result};
+use datafusion_common::{Result, internal_err, plan_err};
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::sort_properties::ExprProperties;
 use datafusion_expr::statistics::Distribution::{
     self, Bernoulli, Exponential, Gaussian, Generic, Uniform,
 };
 use datafusion_expr::{
-    type_coercion::{is_interval, is_null, is_signed_numeric, is_timestamp},
     ColumnarValue,
+    type_coercion::{is_interval, is_signed_numeric, is_timestamp},
 };
 
 /// Negative expression
@@ -190,7 +190,7 @@ pub fn negative(
     input_schema: &Schema,
 ) -> Result<Arc<dyn PhysicalExpr>> {
     let data_type = arg.data_type(input_schema)?;
-    if is_null(&data_type) {
+    if data_type.is_null() {
         Ok(arg)
     } else if !is_signed_numeric(&data_type)
         && !is_interval(&data_type)
@@ -205,19 +205,18 @@ pub fn negative(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::expressions::{col, Column};
+    use crate::expressions::{Column, col};
 
     use arrow::array::*;
-    use arrow::datatypes::DataType::{Float32, Float64, Int16, Int32, Int64, Int8};
+    use arrow::datatypes::DataType::{Float32, Float64, Int8, Int16, Int32, Int64};
     use arrow::datatypes::*;
     use datafusion_common::cast::as_primitive_array;
     use datafusion_common::{DataFusionError, ScalarValue};
 
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
-    use paste::paste;
 
     macro_rules! test_array_negative_op {
-        ($DATA_TY:tt, $($VALUE:expr),*   ) => {
+        ($DATA_TY:tt, $ARRAY_TY:ty, $($VALUE:expr),*   ) => {
             let schema = Schema::new(vec![Field::new("a", DataType::$DATA_TY, true)]);
             let expr = negative(col("a", &schema)?, &schema)?;
             assert_eq!(expr.data_type(&schema)?, DataType::$DATA_TY);
@@ -230,8 +229,8 @@ mod tests {
             )+
             arr.push(None);
             arr_expected.push(None);
-            let input = paste!{[<$DATA_TY Array>]::from(arr)};
-            let expected = &paste!{[<$DATA_TY Array>]::from(arr_expected)};
+            let input = <$ARRAY_TY>::from(arr);
+            let expected = &<$ARRAY_TY>::from(arr_expected);
             let batch =
                 RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(input)])?;
             let result = expr.evaluate(&batch)?.into_array(batch.num_rows()).expect("Failed to convert to array");
@@ -243,12 +242,12 @@ mod tests {
 
     #[test]
     fn array_negative_op() -> Result<()> {
-        test_array_negative_op!(Int8, 2i8, 1i8);
-        test_array_negative_op!(Int16, 234i16, 123i16);
-        test_array_negative_op!(Int32, 2345i32, 1234i32);
-        test_array_negative_op!(Int64, 23456i64, 12345i64);
-        test_array_negative_op!(Float32, 2345.0f32, 1234.0f32);
-        test_array_negative_op!(Float64, 23456.0f64, 12345.0f64);
+        test_array_negative_op!(Int8, Int8Array, 2i8, 1i8);
+        test_array_negative_op!(Int16, Int16Array, 234i16, 123i16);
+        test_array_negative_op!(Int32, Int32Array, 2345i32, 1234i32);
+        test_array_negative_op!(Int64, Int64Array, 23456i64, 12345i64);
+        test_array_negative_op!(Float32, Float32Array, 2345.0f32, 1234.0f32);
+        test_array_negative_op!(Float64, Float64Array, 23456.0f64, 12345.0f64);
         Ok(())
     }
 
@@ -277,11 +276,13 @@ mod tests {
         );
 
         // Bernoulli
-        assert!(negative_expr
-            .evaluate_statistics(&[&Distribution::new_bernoulli(ScalarValue::from(
-                0.75
-            ))?])
-            .is_err());
+        assert!(
+            negative_expr
+                .evaluate_statistics(&[&Distribution::new_bernoulli(ScalarValue::from(
+                    0.75
+                ))?])
+                .is_err()
+        );
 
         // Exponential
         assert_eq!(
diff --git a/datafusion/physical-expr/src/expressions/no_op.rs b/datafusion/physical-expr/src/expressions/no_op.rs
index 94610996c6b00..ff44a60a862d0 100644
--- a/datafusion/physical-expr/src/expressions/no_op.rs
+++ b/datafusion/physical-expr/src/expressions/no_op.rs
@@ -26,7 +26,7 @@ use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
 };
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, internal_err};
 use datafusion_expr::ColumnarValue;
 
 /// A place holder expression, can not be evaluated.
diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr/src/expressions/not.rs
index 8184ef601e543..a29ab2ff40f5c 100644
--- a/datafusion/physical-expr/src/expressions/not.rs
+++ b/datafusion/physical-expr/src/expressions/not.rs
@@ -26,10 +26,10 @@ use crate::PhysicalExpr;
 
 use arrow::datatypes::{DataType, FieldRef, Schema};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{cast::as_boolean_array, internal_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, cast::as_boolean_array, internal_err};
+use datafusion_expr::ColumnarValue;
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::statistics::Distribution::{self, Bernoulli};
-use datafusion_expr::ColumnarValue;
 
 /// Not expression
 #[derive(Debug, Eq)]
@@ -155,16 +155,16 @@ impl PhysicalExpr for NotExpr {
         match (parent, children[0]) {
             (Bernoulli(parent), Bernoulli(child)) => {
                 let parent_range = parent.range();
-                let result = if parent_range == Interval::CERTAINLY_TRUE {
-                    if child.range() == Interval::CERTAINLY_TRUE {
+                let result = if parent_range == Interval::TRUE {
+                    if child.range() == Interval::TRUE {
                         None
                     } else {
                         Some(vec![Distribution::new_bernoulli(ScalarValue::new_zero(
                             &child.data_type(),
                         )?)?])
                     }
-                } else if parent_range == Interval::CERTAINLY_FALSE {
-                    if child.range() == Interval::CERTAINLY_FALSE {
+                } else if parent_range == Interval::FALSE {
+                    if child.range() == Interval::FALSE {
                         None
                     } else {
                         Some(vec![Distribution::new_bernoulli(ScalarValue::new_one(
@@ -196,7 +196,7 @@ mod tests {
     use std::sync::LazyLock;
 
     use super::*;
-    use crate::expressions::{col, Column};
+    use crate::expressions::{Column, col};
 
     use arrow::{array::BooleanArray, datatypes::*};
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
@@ -265,28 +265,31 @@ mod tests {
         let expr = not(a)?;
 
         // Uniform with non-boolean bounds
-        assert!(expr
-            .evaluate_statistics(&[&Distribution::new_uniform(
+        assert!(
+            expr.evaluate_statistics(&[&Distribution::new_uniform(
                 Interval::make_unbounded(&DataType::Float64)?
             )?])
-            .is_err());
+            .is_err()
+        );
 
         // Exponential
-        assert!(expr
-            .evaluate_statistics(&[&Distribution::new_exponential(
+        assert!(
+            expr.evaluate_statistics(&[&Distribution::new_exponential(
                 ScalarValue::from(1.0),
                 ScalarValue::from(1.0),
                 true
             )?])
-            .is_err());
+            .is_err()
+        );
 
         // Gaussian
-        assert!(expr
-            .evaluate_statistics(&[&Distribution::new_gaussian(
+        assert!(
+            expr.evaluate_statistics(&[&Distribution::new_gaussian(
                 ScalarValue::from(1.0),
                 ScalarValue::from(1.0),
             )?])
-            .is_err());
+            .is_err()
+        );
 
         // Bernoulli
         assert_eq!(
@@ -310,24 +313,26 @@ mod tests {
             Distribution::new_bernoulli(ScalarValue::from(0.75))?
         );
 
-        assert!(expr
-            .evaluate_statistics(&[&Distribution::new_generic(
+        assert!(
+            expr.evaluate_statistics(&[&Distribution::new_generic(
                 ScalarValue::Null,
                 ScalarValue::Null,
                 ScalarValue::Null,
                 Interval::make_unbounded(&DataType::UInt8)?
             )?])
-            .is_err());
+            .is_err()
+        );
 
         // Unknown with non-boolean interval as range
-        assert!(expr
-            .evaluate_statistics(&[&Distribution::new_generic(
+        assert!(
+            expr.evaluate_statistics(&[&Distribution::new_generic(
                 ScalarValue::Null,
                 ScalarValue::Null,
                 ScalarValue::Null,
                 Interval::make_unbounded(&DataType::Float64)?
             )?])
-            .is_err());
+            .is_err()
+        );
 
         Ok(())
     }
diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs
index b593dfe83209d..306f14b48fa39 100644
--- a/datafusion/physical-expr/src/expressions/try_cast.rs
+++ b/datafusion/physical-expr/src/expressions/try_cast.rs
@@ -22,12 +22,12 @@ use std::sync::Arc;
 
 use crate::PhysicalExpr;
 use arrow::compute;
-use arrow::compute::{cast_with_options, CastOptions};
+use arrow::compute::CastOptions;
 use arrow::datatypes::{DataType, FieldRef, Schema};
 use arrow::record_batch::RecordBatch;
 use compute::can_cast_types;
 use datafusion_common::format::DEFAULT_FORMAT_OPTIONS;
-use datafusion_common::{not_impl_err, Result, ScalarValue};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_expr::ColumnarValue;
 
 /// TRY_CAST expression casts an expression to a specific data type and returns NULL on invalid cast
@@ -72,7 +72,7 @@ impl TryCastExpr {
 
 impl fmt::Display for TryCastExpr {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "TRY_CAST({} AS {:?})", self.expr, self.cast_type)
+        write!(f, "TRY_CAST({} AS {})", self.expr, self.cast_type)
     }
 }
 
@@ -96,18 +96,7 @@ impl PhysicalExpr for TryCastExpr {
             safe: true,
             format_options: DEFAULT_FORMAT_OPTIONS,
         };
-        match value {
-            ColumnarValue::Array(array) => {
-                let cast = cast_with_options(&array, &self.cast_type, &options)?;
-                Ok(ColumnarValue::Array(cast))
-            }
-            ColumnarValue::Scalar(scalar) => {
-                let array = scalar.to_array()?;
-                let cast_array = cast_with_options(&array, &self.cast_type, &options)?;
-                let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?;
-                Ok(ColumnarValue::Scalar(cast_scalar))
-            }
-        }
+        value.cast_to(&self.cast_type, Some(&options))
     }
 
     fn return_field(&self, input_schema: &Schema) -> Result<FieldRef> {
@@ -153,7 +142,7 @@ pub fn try_cast(
     } else if can_cast_types(&expr_type, &cast_type) {
         Ok(Arc::new(TryCastExpr::new(expr, cast_type)))
     } else {
-        not_impl_err!("Unsupported TRY_CAST from {expr_type:?} to {cast_type:?}")
+        not_impl_err!("Unsupported TRY_CAST from {expr_type} to {cast_type}")
     }
 }
 
@@ -166,8 +155,8 @@ mod tests {
     };
     use arrow::{
         array::{
-            Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-            Int8Array, TimestampNanosecondArray, UInt32Array,
+            Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
+            Int64Array, TimestampNanosecondArray, UInt32Array,
         },
         datatypes::*,
     };
@@ -191,7 +180,7 @@ mod tests {
 
             // verify that its display is correct
             assert_eq!(
-                format!("TRY_CAST(a@0 AS {:?})", $TYPE),
+                format!("TRY_CAST(a@0 AS {})", $TYPE),
                 format!("{}", expression)
             );
 
@@ -217,7 +206,7 @@ mod tests {
             for (i, x) in $VEC.iter().enumerate() {
                 match x {
                     Some(x) => assert_eq!(result.value(i), *x),
-                    None => assert!(!result.is_valid(i)),
+                    None => assert!(result.is_null(i)),
                 }
             }
         }};
@@ -242,7 +231,7 @@ mod tests {
 
             // verify that its display is correct
             assert_eq!(
-                format!("TRY_CAST(a@0 AS {:?})", $TYPE),
+                format!("TRY_CAST(a@0 AS {})", $TYPE),
                 format!("{}", expression)
             );
 
@@ -271,7 +260,7 @@ mod tests {
             for (i, x) in $VEC.iter().enumerate() {
                 match x {
                     Some(x) => assert_eq!(result.value(i), *x),
-                    None => assert!(!result.is_valid(i)),
+                    None => assert!(result.is_null(i)),
                 }
             }
         }};
diff --git a/datafusion/physical-expr/src/expressions/unknown_column.rs b/datafusion/physical-expr/src/expressions/unknown_column.rs
index 2face4eb6bdb6..f06d880985f4a 100644
--- a/datafusion/physical-expr/src/expressions/unknown_column.rs
+++ b/datafusion/physical-expr/src/expressions/unknown_column.rs
@@ -27,7 +27,7 @@ use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
 };
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, internal_err};
 use datafusion_expr::ColumnarValue;
 
 #[derive(Debug, Clone, Eq)]
diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs
index 28f76bbfd1c8c..e5e9304ab1d99 100644
--- a/datafusion/physical-expr/src/intervals/cp_solver.rs
+++ b/datafusion/physical-expr/src/intervals/cp_solver.rs
@@ -42,7 +42,7 @@
 //!
 //! In order to use interval arithmetic to compute bounds for this expression,
 //! one would first determine intervals that represent the possible values of
-//! `x` and `y`` Let's say that the interval for `x` is `[1, 2]` and the interval
+//! `x` and `y` Let's say that the interval for `x` is `[1, 2]` and the interval
 //! for `y` is `[-3, 1]`. In the chart below, you can see how the computation
 //! takes place.
 //!
@@ -148,19 +148,19 @@ use std::sync::Arc;
 use super::utils::{
     convert_duration_type_to_interval, convert_interval_type_to_duration, get_inverse_op,
 };
-use crate::expressions::{BinaryExpr, Literal};
-use crate::utils::{build_dag, ExprTreeNode};
 use crate::PhysicalExpr;
+use crate::expressions::{BinaryExpr, Literal};
+use crate::utils::{ExprTreeNode, build_dag};
 
 use arrow::datatypes::{DataType, Schema};
-use datafusion_common::{internal_err, not_impl_err, Result};
-use datafusion_expr::interval_arithmetic::{apply_operator, satisfy_greater, Interval};
+use datafusion_common::{Result, internal_err, not_impl_err};
 use datafusion_expr::Operator;
+use datafusion_expr::interval_arithmetic::{Interval, apply_operator, satisfy_greater};
 
+use petgraph::Outgoing;
 use petgraph::graph::NodeIndex;
 use petgraph::stable_graph::{DefaultIx, StableGraph};
 use petgraph::visit::{Bfs, Dfs, DfsPostOrder, EdgeRef};
-use petgraph::Outgoing;
 
 /// This object implements a directed acyclic expression graph (DAEG) that
 /// is used to compute ranges for expressions through interval arithmetic.
@@ -345,7 +345,7 @@ pub fn propagate_comparison(
     left_child: &Interval,
     right_child: &Interval,
 ) -> Result<Option<(Interval, Interval)>> {
-    if parent == &Interval::CERTAINLY_TRUE {
+    if parent == &Interval::TRUE {
         match op {
             Operator::Eq => left_child.intersect(right_child).map(|result| {
                 result.map(|intersection| (intersection.clone(), intersection))
@@ -360,7 +360,7 @@ pub fn propagate_comparison(
                 "The operator must be a comparison operator to propagate intervals"
             ),
         }
-    } else if parent == &Interval::CERTAINLY_FALSE {
+    } else if parent == &Interval::FALSE {
         match op {
             Operator::Eq => {
                 // TODO: Propagation is not possible until we support interval sets.
@@ -518,10 +518,10 @@ impl ExprIntervalGraph {
         // (1) given_range ⊇ bounds => Nothing to propagate
         // (2) ∅ ⊂ (given_range ∩ bounds) ⊂ bounds => Can propagate
         // (3) Disjoint sets => Infeasible
-        if given_range.contains(bounds)? == Interval::CERTAINLY_TRUE {
+        if given_range.contains(bounds)? == Interval::TRUE {
             // First case:
             Ok(PropagationResult::CannotPropagate)
-        } else if bounds.contains(&given_range)? != Interval::CERTAINLY_FALSE {
+        } else if bounds.contains(&given_range)? != Interval::FALSE {
             // Second case:
             let result = self.propagate_constraints(given_range);
             self.update_intervals(leaf_bounds);
@@ -579,15 +579,11 @@ impl ExprIntervalGraph {
     ///
     /// let mut graph = ExprIntervalGraph::try_new(expr, &schema).unwrap();
     /// // Do it once, while constructing.
-    /// let node_indices = graph
-    ///     .gather_node_indices(&[Arc::new(Column::new("gnz", 0))]);
+    /// let node_indices = graph.gather_node_indices(&[Arc::new(Column::new("gnz", 0))]);
     /// let left_index = node_indices.get(0).unwrap().1;
     ///
     /// // Provide intervals for leaf variables (here, there is only one).
-    /// let intervals = vec![(
-    ///     left_index,
-    ///     Interval::make(Some(10), Some(20)).unwrap(),
-    /// )];
+    /// let intervals = vec![(left_index, Interval::make(Some(10), Some(20)).unwrap())];
     ///
     /// // Evaluate bounds for the composite expression:
     /// graph.assign_intervals(&intervals);
@@ -647,7 +643,7 @@ impl ExprIntervalGraph {
             let node_interval = self.graph[node].interval();
             // Special case: true OR could in principle be propagated by 3 interval sets,
             // (i.e. left true, or right true, or both true) however we do not support this yet.
-            if node_interval == &Interval::CERTAINLY_TRUE
+            if node_interval == &Interval::TRUE
                 && self.graph[node]
                     .expr
                     .as_any()
@@ -784,7 +780,7 @@ mod tests {
     use rand::{Rng, SeedableRng};
     use rstest::*;
 
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     fn experiment(
         expr: Arc<dyn PhysicalExpr>,
         exprs_with_interval: (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>),
@@ -795,11 +791,11 @@ mod tests {
         result: PropagationResult,
         schema: &Schema,
     ) -> Result<()> {
-        let col_stats = vec![
+        let col_stats = [
             (Arc::clone(&exprs_with_interval.0), left_interval),
             (Arc::clone(&exprs_with_interval.1), right_interval),
         ];
-        let expected = vec![
+        let expected = [
             (Arc::clone(&exprs_with_interval.0), left_expected),
             (Arc::clone(&exprs_with_interval.1), right_expected),
         ];
@@ -819,8 +815,7 @@ mod tests {
             .map(|((_, interval), (_, index))| (*index, interval.clone()))
             .collect_vec();
 
-        let exp_result =
-            graph.update_ranges(&mut col_stat_nodes[..], Interval::CERTAINLY_TRUE)?;
+        let exp_result = graph.update_ranges(&mut col_stat_nodes[..], Interval::TRUE)?;
         assert_eq!(exp_result, result);
         col_stat_nodes.iter().zip(expected_nodes.iter()).for_each(
             |((_, calculated_interval_node), (_, expected))| {
@@ -1579,12 +1574,7 @@ mod tests {
                 Interval::make(None, Some(999_i64))?,
                 Interval::make(Some(1000_i64), Some(1000_i64))?,
             ))),
-            propagate_comparison(
-                &Operator::Lt,
-                &Interval::CERTAINLY_TRUE,
-                &left,
-                &right
-            )?
+            propagate_comparison(&Operator::Lt, &Interval::TRUE, &left, &right)?
         );
 
         let left =
@@ -1608,12 +1598,7 @@ mod tests {
                     ScalarValue::TimestampNanosecond(Some(1000), None),
                 )?
             ))),
-            propagate_comparison(
-                &Operator::Lt,
-                &Interval::CERTAINLY_TRUE,
-                &left,
-                &right
-            )?
+            propagate_comparison(&Operator::Lt, &Interval::TRUE, &left, &right)?
         );
 
         let left = Interval::make_unbounded(&DataType::Timestamp(
@@ -1639,12 +1624,7 @@ mod tests {
                     ScalarValue::TimestampNanosecond(Some(1000), Some("+05:00".into())),
                 )?
             ))),
-            propagate_comparison(
-                &Operator::Lt,
-                &Interval::CERTAINLY_TRUE,
-                &left,
-                &right
-            )?
+            propagate_comparison(&Operator::Lt, &Interval::TRUE, &left, &right)?
         );
 
         Ok(())
@@ -1657,38 +1637,38 @@ mod tests {
             Operator::Or,
             Arc::new(Column::new("b", 1)),
         ));
-        let parent = Interval::CERTAINLY_FALSE;
+        let parent = Interval::FALSE;
         let children_set = vec![
-            vec![&Interval::CERTAINLY_FALSE, &Interval::UNCERTAIN],
-            vec![&Interval::UNCERTAIN, &Interval::CERTAINLY_FALSE],
-            vec![&Interval::CERTAINLY_FALSE, &Interval::CERTAINLY_FALSE],
-            vec![&Interval::UNCERTAIN, &Interval::UNCERTAIN],
+            vec![&Interval::FALSE, &Interval::TRUE_OR_FALSE],
+            vec![&Interval::TRUE_OR_FALSE, &Interval::FALSE],
+            vec![&Interval::FALSE, &Interval::FALSE],
+            vec![&Interval::TRUE_OR_FALSE, &Interval::TRUE_OR_FALSE],
         ];
         for children in children_set {
             assert_eq!(
                 expr.propagate_constraints(&parent, &children)?.unwrap(),
-                vec![Interval::CERTAINLY_FALSE, Interval::CERTAINLY_FALSE],
+                vec![Interval::FALSE, Interval::FALSE],
             );
         }
 
-        let parent = Interval::CERTAINLY_FALSE;
+        let parent = Interval::FALSE;
         let children_set = vec![
-            vec![&Interval::CERTAINLY_TRUE, &Interval::UNCERTAIN],
-            vec![&Interval::UNCERTAIN, &Interval::CERTAINLY_TRUE],
+            vec![&Interval::TRUE, &Interval::TRUE_OR_FALSE],
+            vec![&Interval::TRUE_OR_FALSE, &Interval::TRUE],
         ];
         for children in children_set {
             assert_eq!(expr.propagate_constraints(&parent, &children)?, None,);
         }
 
-        let parent = Interval::CERTAINLY_TRUE;
-        let children = vec![&Interval::CERTAINLY_FALSE, &Interval::UNCERTAIN];
+        let parent = Interval::TRUE;
+        let children = vec![&Interval::FALSE, &Interval::TRUE_OR_FALSE];
         assert_eq!(
             expr.propagate_constraints(&parent, &children)?.unwrap(),
-            vec![Interval::CERTAINLY_FALSE, Interval::CERTAINLY_TRUE]
+            vec![Interval::FALSE, Interval::TRUE]
         );
 
-        let parent = Interval::CERTAINLY_TRUE;
-        let children = vec![&Interval::UNCERTAIN, &Interval::UNCERTAIN];
+        let parent = Interval::TRUE;
+        let children = vec![&Interval::TRUE_OR_FALSE, &Interval::TRUE_OR_FALSE];
         assert_eq!(
             expr.propagate_constraints(&parent, &children)?.unwrap(),
             // Empty means unchanged intervals.
@@ -1705,25 +1685,22 @@ mod tests {
             Operator::And,
             Arc::new(Column::new("b", 1)),
         ));
-        let parent = Interval::CERTAINLY_FALSE;
+        let parent = Interval::FALSE;
         let children_and_results_set = vec![
             (
-                vec![&Interval::CERTAINLY_TRUE, &Interval::UNCERTAIN],
-                vec![Interval::CERTAINLY_TRUE, Interval::CERTAINLY_FALSE],
+                vec![&Interval::TRUE, &Interval::TRUE_OR_FALSE],
+                vec![Interval::TRUE, Interval::FALSE],
             ),
             (
-                vec![&Interval::UNCERTAIN, &Interval::CERTAINLY_TRUE],
-                vec![Interval::CERTAINLY_FALSE, Interval::CERTAINLY_TRUE],
+                vec![&Interval::TRUE_OR_FALSE, &Interval::TRUE],
+                vec![Interval::FALSE, Interval::TRUE],
             ),
             (
-                vec![&Interval::UNCERTAIN, &Interval::UNCERTAIN],
+                vec![&Interval::TRUE_OR_FALSE, &Interval::TRUE_OR_FALSE],
                 // Empty means unchanged intervals.
                 vec![],
             ),
-            (
-                vec![&Interval::CERTAINLY_FALSE, &Interval::UNCERTAIN],
-                vec![],
-            ),
+            (vec![&Interval::FALSE, &Interval::TRUE_OR_FALSE], vec![]),
         ];
         for (children, result) in children_and_results_set {
             assert_eq!(
diff --git a/datafusion/physical-expr/src/intervals/test_utils.rs b/datafusion/physical-expr/src/intervals/test_utils.rs
index c3d38a974ab02..805ffd27613ee 100644
--- a/datafusion/physical-expr/src/intervals/test_utils.rs
+++ b/datafusion/physical-expr/src/intervals/test_utils.rs
@@ -19,13 +19,13 @@
 
 use std::sync::Arc;
 
-use crate::expressions::{binary, BinaryExpr, Literal};
 use crate::PhysicalExpr;
+use crate::expressions::{BinaryExpr, Literal, binary};
 use arrow::datatypes::Schema;
 use datafusion_common::{DataFusionError, ScalarValue};
 use datafusion_expr::Operator;
 
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 /// This test function generates a conjunctive statement with two numeric
 /// terms with the following form:
 /// left_col (op_1) a  >/>= right_col (op_2) b AND left_col (op_3) c </<= right_col (op_4) d
@@ -61,7 +61,7 @@ pub fn gen_conjunctive_numerical_expr(
     Arc::new(BinaryExpr::new(left_expr, Operator::And, right_expr))
 }
 
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 /// This test function generates a conjunctive statement with
 /// two scalar values with the following form:
 /// left_col (op_1) a  > right_col (op_2) b AND left_col (op_3) c < right_col (op_4) d
diff --git a/datafusion/physical-expr/src/intervals/utils.rs b/datafusion/physical-expr/src/intervals/utils.rs
index 910631ef4a43f..3cada63a34ace 100644
--- a/datafusion/physical-expr/src/intervals/utils.rs
+++ b/datafusion/physical-expr/src/intervals/utils.rs
@@ -20,15 +20,15 @@
 use std::sync::Arc;
 
 use crate::{
-    expressions::{BinaryExpr, CastExpr, Column, Literal, NegativeExpr},
     PhysicalExpr,
+    expressions::{BinaryExpr, CastExpr, Column, Literal, NegativeExpr},
 };
 
 use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::{internal_err, Result, ScalarValue};
-use datafusion_expr::interval_arithmetic::Interval;
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::Operator;
+use datafusion_expr::interval_arithmetic::Interval;
 
 /// Indicates whether interval arithmetic is supported for the given expression.
 /// Currently, we do not support all [`PhysicalExpr`]s for interval calculations.
@@ -45,13 +45,13 @@ pub fn check_support(expr: &Arc<dyn PhysicalExpr>, schema: &SchemaRef) -> bool {
         if let Ok(field) = schema.field_with_name(column.name()) {
             is_datatype_supported(field.data_type())
         } else {
-            return false;
+            false
         }
     } else if let Some(literal) = expr_any.downcast_ref::<Literal>() {
         if let Ok(dt) = literal.data_type(schema) {
             is_datatype_supported(&dt)
         } else {
-            return false;
+            false
         }
     } else if let Some(cast) = expr_any.downcast_ref::<CastExpr>() {
         check_support(cast.expr(), schema)
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
index 9f795c81fa48e..bedd348dab92f 100644
--- a/datafusion/physical-expr/src/lib.rs
+++ b/datafusion/physical-expr/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 // Backward compatibility
 pub mod aggregate;
@@ -30,13 +31,16 @@ pub mod analysis;
 pub mod binary_map {
     pub use datafusion_physical_expr_common::binary_map::{ArrowBytesSet, OutputType};
 }
+pub mod async_scalar_function;
 pub mod equivalence;
 pub mod expressions;
 pub mod intervals;
 mod partitioning;
 mod physical_expr;
 pub mod planner;
+pub mod projection;
 mod scalar_function;
+pub mod simplifier;
 pub mod statistics;
 pub mod utils;
 pub mod window;
@@ -48,26 +52,26 @@ pub mod execution_props {
 }
 
 pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState};
-pub use analysis::{analyze, AnalysisContext, ExprBoundaries};
+pub use analysis::{AnalysisContext, ExprBoundaries, analyze};
 pub use equivalence::{
-    calculate_union, AcrossPartitions, ConstExpr, EquivalenceProperties,
+    AcrossPartitions, ConstExpr, EquivalenceProperties, calculate_union,
 };
 pub use partitioning::{Distribution, Partitioning};
 pub use physical_expr::{
+    add_offset_to_expr, add_offset_to_physical_sort_exprs, create_lex_ordering,
     create_ordering, create_physical_sort_expr, create_physical_sort_exprs,
     physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
-    PhysicalExprRef,
 };
 
-pub use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+pub use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, PhysicalExprRef};
 pub use datafusion_physical_expr_common::sort_expr::{
-    LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement,
+    LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortExpr,
+    PhysicalSortRequirement,
 };
 
 pub use planner::{create_physical_expr, create_physical_exprs};
 pub use scalar_function::ScalarFunctionExpr;
-
-pub use datafusion_physical_expr_common::utils::reverse_order_bys;
+pub use simplifier::PhysicalExprSimplifier;
 pub use utils::{conjunction, conjunction_opt, split_conjunction};
 
 // For backwards compatibility
diff --git a/datafusion/physical-expr/src/partitioning.rs b/datafusion/physical-expr/src/partitioning.rs
index eb7e1ea6282bb..d24c60b63e6bd 100644
--- a/datafusion/physical-expr/src/partitioning.rs
+++ b/datafusion/physical-expr/src/partitioning.rs
@@ -18,8 +18,8 @@
 //! [`Partitioning`] and [`Distribution`] for `ExecutionPlans`
 
 use crate::{
-    equivalence::ProjectionMapping, expressions::UnKnownColumn, physical_exprs_equal,
-    EquivalenceProperties, PhysicalExpr,
+    EquivalenceProperties, PhysicalExpr, equivalence::ProjectionMapping,
+    expressions::UnKnownColumn, physical_exprs_equal,
 };
 use datafusion_physical_expr_common::physical_expr::format_physical_expr_list;
 use std::fmt;
@@ -139,6 +139,28 @@ impl Display for Partitioning {
         }
     }
 }
+
+/// Represents how a [`Partitioning`] satisfies a [`Distribution`] requirement.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PartitioningSatisfaction {
+    /// The partitioning does not satisfy the distribution requirement
+    NotSatisfied,
+    /// The partitioning exactly matches the distribution requirement
+    Exact,
+    /// The partitioning satisfies the distribution requirement via subset logic
+    Subset,
+}
+
+impl PartitioningSatisfaction {
+    pub fn is_satisfied(&self) -> bool {
+        matches!(self, Self::Exact | Self::Subset)
+    }
+
+    pub fn is_subset(&self) -> bool {
+        *self == Self::Subset
+    }
+}
+
 impl Partitioning {
     /// Returns the number of partitions in this partitioning scheme
     pub fn partition_count(&self) -> usize {
@@ -148,69 +170,121 @@ impl Partitioning {
         }
     }
 
-    /// Returns true when the guarantees made by this [`Partitioning`] are sufficient to
-    /// satisfy the partitioning scheme mandated by the `required` [`Distribution`].
+    /// Returns true if `subset_exprs` is a subset of `exprs`.
+    /// For example: Hash(a, b) is subset of Hash(a) since a partition with all occurrences of
+    /// a distinct (a) must also contain all occurrences of a distinct (a, b) with the same (a).
+    fn is_subset_partitioning(
+        subset_exprs: &[Arc<dyn PhysicalExpr>],
+        superset_exprs: &[Arc<dyn PhysicalExpr>],
+    ) -> bool {
+        // Require strict subset: fewer expressions, not equal
+        if subset_exprs.is_empty() || subset_exprs.len() >= superset_exprs.len() {
+            return false;
+        }
+
+        subset_exprs.iter().all(|subset_expr| {
+            superset_exprs
+                .iter()
+                .any(|superset_expr| subset_expr.eq(superset_expr))
+        })
+    }
+
+    #[deprecated(since = "52.0.0", note = "Use satisfaction instead")]
     pub fn satisfy(
         &self,
         required: &Distribution,
         eq_properties: &EquivalenceProperties,
     ) -> bool {
+        self.satisfaction(required, eq_properties, false)
+            == PartitioningSatisfaction::Exact
+    }
+
+    /// Returns how this [`Partitioning`] satisfies the partitioning scheme mandated
+    /// by the `required` [`Distribution`].
+    pub fn satisfaction(
+        &self,
+        required: &Distribution,
+        eq_properties: &EquivalenceProperties,
+        allow_subset: bool,
+    ) -> PartitioningSatisfaction {
         match required {
-            Distribution::UnspecifiedDistribution => true,
-            Distribution::SinglePartition if self.partition_count() == 1 => true,
+            Distribution::UnspecifiedDistribution => PartitioningSatisfaction::Exact,
+            Distribution::SinglePartition if self.partition_count() == 1 => {
+                PartitioningSatisfaction::Exact
+            }
             // When partition count is 1, hash requirement is satisfied.
-            Distribution::HashPartitioned(_) if self.partition_count() == 1 => true,
-            Distribution::HashPartitioned(required_exprs) => {
-                match self {
-                    // Here we do not check the partition count for hash partitioning and assumes the partition count
-                    // and hash functions in the system are the same. In future if we plan to support storage partition-wise joins,
-                    // then we need to have the partition count and hash functions validation.
-                    Partitioning::Hash(partition_exprs, _) => {
-                        let fast_match =
-                            physical_exprs_equal(required_exprs, partition_exprs);
-                        // If the required exprs do not match, need to leverage the eq_properties provided by the child
-                        // and normalize both exprs based on the equivalent groups.
-                        if !fast_match {
-                            let eq_groups = eq_properties.eq_group();
-                            if !eq_groups.is_empty() {
-                                let normalized_required_exprs = required_exprs
-                                    .iter()
-                                    .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
-                                    .collect::<Vec<_>>();
-                                let normalized_partition_exprs = partition_exprs
-                                    .iter()
-                                    .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
-                                    .collect::<Vec<_>>();
-                                return physical_exprs_equal(
-                                    &normalized_required_exprs,
-                                    &normalized_partition_exprs,
-                                );
-                            }
+            Distribution::HashPartitioned(_) if self.partition_count() == 1 => {
+                PartitioningSatisfaction::Exact
+            }
+            Distribution::HashPartitioned(required_exprs) => match self {
+                // Here we do not check the partition count for hash partitioning and assumes the partition count
+                // and hash functions in the system are the same. In future if we plan to support storage partition-wise joins,
+                // then we need to have the partition count and hash functions validation.
+                Partitioning::Hash(partition_exprs, _) => {
+                    // Empty hash partitioning is invalid
+                    if partition_exprs.is_empty() || required_exprs.is_empty() {
+                        return PartitioningSatisfaction::NotSatisfied;
+                    }
+
+                    // Fast path: exact match
+                    if physical_exprs_equal(required_exprs, partition_exprs) {
+                        return PartitioningSatisfaction::Exact;
+                    }
+
+                    // Normalization path using equivalence groups
+                    let eq_groups = eq_properties.eq_group();
+                    if !eq_groups.is_empty() {
+                        let normalized_required_exprs = required_exprs
+                            .iter()
+                            .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
+                            .collect::<Vec<_>>();
+                        let normalized_partition_exprs = partition_exprs
+                            .iter()
+                            .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
+                            .collect::<Vec<_>>();
+                        if physical_exprs_equal(
+                            &normalized_required_exprs,
+                            &normalized_partition_exprs,
+                        ) {
+                            return PartitioningSatisfaction::Exact;
+                        }
+
+                        if allow_subset
+                            && Self::is_subset_partitioning(
+                                &normalized_partition_exprs,
+                                &normalized_required_exprs,
+                            )
+                        {
+                            return PartitioningSatisfaction::Subset;
                         }
-                        fast_match
+                    } else if allow_subset
+                        && Self::is_subset_partitioning(partition_exprs, required_exprs)
+                    {
+                        return PartitioningSatisfaction::Subset;
                     }
-                    _ => false,
+
+                    PartitioningSatisfaction::NotSatisfied
                 }
-            }
-            _ => false,
+                _ => PartitioningSatisfaction::NotSatisfied,
+            },
+            _ => PartitioningSatisfaction::NotSatisfied,
         }
     }
 
     /// Calculate the output partitioning after applying the given projection.
     pub fn project(
         &self,
-        projection_mapping: &ProjectionMapping,
+        mapping: &ProjectionMapping,
         input_eq_properties: &EquivalenceProperties,
     ) -> Self {
         if let Partitioning::Hash(exprs, part) = self {
-            let normalized_exprs = exprs
-                .iter()
-                .map(|expr| {
-                    input_eq_properties
-                        .project_expr(expr, projection_mapping)
-                        .unwrap_or_else(|| {
-                            Arc::new(UnKnownColumn::new(&expr.to_string()))
-                        })
+            let normalized_exprs = input_eq_properties
+                .project_expressions(exprs, mapping)
+                .zip(exprs)
+                .map(|(proj_expr, expr)| {
+                    proj_expr.unwrap_or_else(|| {
+                        Arc::new(UnKnownColumn::new(&expr.to_string()))
+                    })
                 })
                 .collect();
             Partitioning::Hash(normalized_exprs, *part)
@@ -318,11 +392,21 @@ mod tests {
 
         for distribution in distribution_types {
             let result = (
-                single_partition.satisfy(&distribution, &eq_properties),
-                unspecified_partition.satisfy(&distribution, &eq_properties),
-                round_robin_partition.satisfy(&distribution, &eq_properties),
-                hash_partition1.satisfy(&distribution, &eq_properties),
-                hash_partition2.satisfy(&distribution, &eq_properties),
+                single_partition
+                    .satisfaction(&distribution, &eq_properties, true)
+                    .is_satisfied(),
+                unspecified_partition
+                    .satisfaction(&distribution, &eq_properties, true)
+                    .is_satisfied(),
+                round_robin_partition
+                    .satisfaction(&distribution, &eq_properties, true)
+                    .is_satisfied(),
+                hash_partition1
+                    .satisfaction(&distribution, &eq_properties, true)
+                    .is_satisfied(),
+                hash_partition2
+                    .satisfaction(&distribution, &eq_properties, true)
+                    .is_satisfied(),
             );
 
             match distribution {
@@ -340,4 +424,425 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_partitioning_satisfy_by_subset() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let col_c: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("c", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([a]) vs Hash([a, b])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                ]),
+                PartitioningSatisfaction::Subset,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a]) vs Hash([a, b, c])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                    Arc::clone(&col_c),
+                ]),
+                PartitioningSatisfaction::Subset,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a, b]) vs Hash([a, b, c])",
+                Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                    Arc::clone(&col_c),
+                ]),
+                PartitioningSatisfaction::Subset,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([b]) vs Hash([a, b, c])",
+                Partitioning::Hash(vec![Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                    Arc::clone(&col_c),
+                ]),
+                PartitioningSatisfaction::Subset,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([b, a]) vs Hash([a, b, c])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                    Arc::clone(&col_c),
+                ]),
+                PartitioningSatisfaction::Subset,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_current_superset() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let col_c: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("c", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([a, b]) vs Hash([a])",
+                Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&col_a)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a, b, c]) vs Hash([a])",
+                Partitioning::Hash(
+                    vec![Arc::clone(&col_a), Arc::clone(&col_b), Arc::clone(&col_c)],
+                    4,
+                ),
+                Distribution::HashPartitioned(vec![Arc::clone(&col_a)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a, b, c]) vs Hash([a, b])",
+                Partitioning::Hash(
+                    vec![Arc::clone(&col_a), Arc::clone(&col_b), Arc::clone(&col_c)],
+                    4,
+                ),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                ]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_partial_overlap() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let col_c: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("c", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![(
+            "Partial overlap: Hash([a, c]) vs Hash([a, b])",
+            Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_c)], 4),
+            Distribution::HashPartitioned(vec![Arc::clone(&col_a), Arc::clone(&col_b)]),
+            PartitioningSatisfaction::NotSatisfied,
+            PartitioningSatisfaction::NotSatisfied,
+        )];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_no_overlap() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let col_c: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("c", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([a]) vs Hash([b, c])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_b),
+                    Arc::clone(&col_c),
+                ]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a, b]) vs Hash([c])",
+                Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&col_c)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_exact_match() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([a, b]) vs Hash([a, b])",
+                Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                ]),
+                PartitioningSatisfaction::Exact,
+                PartitioningSatisfaction::Exact,
+            ),
+            (
+                "Hash([a]) vs Hash([a])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&col_a)]),
+                PartitioningSatisfaction::Exact,
+                PartitioningSatisfaction::Exact,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_unknown() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+        ]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let col_b: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("b", &schema)?);
+        let unknown: Arc<dyn PhysicalExpr> = Arc::new(UnKnownColumn::new("dropped"));
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([unknown]) vs Hash([a, b])",
+                Partitioning::Hash(vec![Arc::clone(&unknown)], 4),
+                Distribution::HashPartitioned(vec![
+                    Arc::clone(&col_a),
+                    Arc::clone(&col_b),
+                ]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a, b]) vs Hash([unknown])",
+                Partitioning::Hash(vec![Arc::clone(&col_a), Arc::clone(&col_b)], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&unknown)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([unknown]) vs Hash([unknown])",
+                Partitioning::Hash(vec![Arc::clone(&unknown)], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&unknown)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_partitioning_empty_hash() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
+
+        let col_a: Arc<dyn PhysicalExpr> =
+            Arc::new(Column::new_with_schema("a", &schema)?);
+        let eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+        let test_cases = vec![
+            (
+                "Hash([]) vs Hash([a])",
+                Partitioning::Hash(vec![], 4),
+                Distribution::HashPartitioned(vec![Arc::clone(&col_a)]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([a]) vs Hash([])",
+                Partitioning::Hash(vec![Arc::clone(&col_a)], 4),
+                Distribution::HashPartitioned(vec![]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+            (
+                "Hash([]) vs Hash([])",
+                Partitioning::Hash(vec![], 4),
+                Distribution::HashPartitioned(vec![]),
+                PartitioningSatisfaction::NotSatisfied,
+                PartitioningSatisfaction::NotSatisfied,
+            ),
+        ];
+
+        for (desc, partition, required, expected_with_subset, expected_without_subset) in
+            test_cases
+        {
+            let result = partition.satisfaction(&required, &eq_properties, true);
+            assert_eq!(
+                result, expected_with_subset,
+                "Failed for {desc} with subset enabled"
+            );
+
+            let result = partition.satisfaction(&required, &eq_properties, false);
+            assert_eq!(
+                result, expected_without_subset,
+                "Failed for {desc} with subset disabled"
+            );
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs
index 63c4ccbb4b385..e750bfd79d77d 100644
--- a/datafusion/physical-expr/src/physical_expr.rs
+++ b/datafusion/physical-expr/src/physical_expr.rs
@@ -17,12 +17,38 @@
 
 use std::sync::Arc;
 
-use crate::create_physical_expr;
+use crate::expressions::{self, Column};
+use crate::{LexOrdering, PhysicalSortExpr, create_physical_expr};
+
+use arrow::compute::SortOptions;
+use arrow::datatypes::{Schema, SchemaRef};
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{DFSchema, HashMap};
+use datafusion_common::{Result, plan_err};
 use datafusion_expr::execution_props::ExecutionProps;
-pub(crate) use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-pub use datafusion_physical_expr_common::physical_expr::PhysicalExprRef;
+use datafusion_expr::{Expr, SortExpr};
+
 use itertools::izip;
+// Exports:
+pub(crate) use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+
+/// Adds the `offset` value to `Column` indices inside `expr`. This function is
+/// generally used during the update of the right table schema in join operations.
+pub fn add_offset_to_expr(
+    expr: Arc<dyn PhysicalExpr>,
+    offset: isize,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    expr.transform_down(|e| match e.as_any().downcast_ref::<Column>() {
+        Some(col) => {
+            let Some(idx) = col.index().checked_add_signed(offset) else {
+                return plan_err!("Column index overflow");
+            };
+            Ok(Transformed::yes(Arc::new(Column::new(col.name(), idx))))
+        }
+        None => Ok(Transformed::no(e)),
+    })
+    .data()
+}
 
 /// This function is similar to the `contains` method of `Vec`. It finds
 /// whether `expr` is among `physical_exprs`.
@@ -60,26 +86,21 @@ pub fn physical_exprs_bag_equal(
     multi_set_lhs == multi_set_rhs
 }
 
-use crate::{expressions, LexOrdering, PhysicalSortExpr};
-use arrow::compute::SortOptions;
-use arrow::datatypes::Schema;
-use datafusion_common::plan_err;
-use datafusion_common::Result;
-use datafusion_expr::{Expr, SortExpr};
-
-/// Converts logical sort expressions to physical sort expressions
+/// Converts logical sort expressions to physical sort expressions.
 ///
-/// This function transforms a collection of logical sort expressions into their physical
-/// representation that can be used during query execution.
+/// This function transforms a collection of logical sort expressions into their
+/// physical representation that can be used during query execution.
 ///
 /// # Arguments
 ///
-/// * `schema` - The schema containing column definitions
-/// * `sort_order` - A collection of logical sort expressions grouped into lexicographic orderings
+/// * `schema` - The schema containing column definitions.
+/// * `sort_order` - A collection of logical sort expressions grouped into
+///   lexicographic orderings.
 ///
 /// # Returns
 ///
-/// A vector of lexicographic orderings for physical execution, or an error if the transformation fails
+/// A vector of lexicographic orderings for physical execution, or an error if
+/// the transformation fails.
 ///
 /// # Examples
 ///
@@ -97,12 +118,16 @@ use datafusion_expr::{Expr, SortExpr};
 /// ]);
 ///
 /// let sort_exprs = vec![
-///     vec![
-///         SortExpr { expr: Expr::Column(Column::new(Some("t"), "id")), asc: true, nulls_first: false }
-///     ],
-///     vec![
-///         SortExpr { expr: Expr::Column(Column::new(Some("t"), "name")), asc: false, nulls_first: true }
-///     ]
+///     vec![SortExpr {
+///         expr: Expr::Column(Column::new(Some("t"), "id")),
+///         asc: true,
+///         nulls_first: false,
+///     }],
+///     vec![SortExpr {
+///         expr: Expr::Column(Column::new(Some("t"), "name")),
+///         asc: false,
+///         nulls_first: true,
+///     }],
 /// ];
 /// let result = create_ordering(&schema, &sort_exprs).unwrap();
 /// ```
@@ -114,18 +139,13 @@ pub fn create_ordering(
 
     for (group_idx, exprs) in sort_order.iter().enumerate() {
         // Construct PhysicalSortExpr objects from Expr objects:
-        let mut sort_exprs = LexOrdering::default();
+        let mut sort_exprs = vec![];
         for (expr_idx, sort) in exprs.iter().enumerate() {
             match &sort.expr {
                 Expr::Column(col) => match expressions::col(&col.name, schema) {
                     Ok(expr) => {
-                        sort_exprs.push(PhysicalSortExpr {
-                            expr,
-                            options: SortOptions {
-                                descending: !sort.asc,
-                                nulls_first: sort.nulls_first,
-                            },
-                        });
+                        let opts = SortOptions::new(!sort.asc, sort.nulls_first);
+                        sort_exprs.push(PhysicalSortExpr::new(expr, opts));
                     }
                     // Cannot find expression in the projected_schema, stop iterating
                     // since rest of the orderings are violated
@@ -141,9 +161,33 @@ pub fn create_ordering(
                 }
             }
         }
-        if !sort_exprs.is_empty() {
-            all_sort_orders.push(sort_exprs);
-        }
+        all_sort_orders.extend(LexOrdering::new(sort_exprs));
+    }
+    Ok(all_sort_orders)
+}
+
+/// Creates a vector of [LexOrdering] from a vector of logical expression
+pub fn create_lex_ordering(
+    schema: &SchemaRef,
+    sort_order: &[Vec<SortExpr>],
+    execution_props: &ExecutionProps,
+) -> Result<Vec<LexOrdering>> {
+    // Try the fast path that only supports column references first
+    // This avoids creating a DFSchema
+    if let Ok(ordering) = create_ordering(schema, sort_order) {
+        return Ok(ordering);
+    }
+
+    let df_schema = DFSchema::try_from(Arc::clone(schema))?;
+
+    let mut all_sort_orders = vec![];
+
+    for exprs in sort_order.iter() {
+        all_sort_orders.extend(LexOrdering::new(create_physical_sort_exprs(
+            exprs,
+            &df_schema,
+            execution_props,
+        )?));
     }
     Ok(all_sort_orders)
 }
@@ -154,17 +198,9 @@ pub fn create_physical_sort_expr(
     input_dfschema: &DFSchema,
     execution_props: &ExecutionProps,
 ) -> Result<PhysicalSortExpr> {
-    let SortExpr {
-        expr,
-        asc,
-        nulls_first,
-    } = e;
-    Ok(PhysicalSortExpr {
-        expr: create_physical_expr(expr, input_dfschema, execution_props)?,
-        options: SortOptions {
-            descending: !asc,
-            nulls_first: *nulls_first,
-        },
+    create_physical_expr(&e.expr, input_dfschema, execution_props).map(|expr| {
+        let options = SortOptions::new(!e.asc, e.nulls_first);
+        PhysicalSortExpr::new(expr, options)
     })
 }
 
@@ -173,23 +209,43 @@ pub fn create_physical_sort_exprs(
     exprs: &[SortExpr],
     input_dfschema: &DFSchema,
     execution_props: &ExecutionProps,
-) -> Result<LexOrdering> {
+) -> Result<Vec<PhysicalSortExpr>> {
     exprs
         .iter()
-        .map(|expr| create_physical_sort_expr(expr, input_dfschema, execution_props))
-        .collect::<Result<LexOrdering>>()
+        .map(|e| create_physical_sort_expr(e, input_dfschema, execution_props))
+        .collect()
+}
+
+pub fn add_offset_to_physical_sort_exprs(
+    sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+    offset: isize,
+) -> Result<Vec<PhysicalSortExpr>> {
+    sort_exprs
+        .into_iter()
+        .map(|mut sort_expr| {
+            sort_expr.expr = add_offset_to_expr(sort_expr.expr, offset)?;
+            Ok(sort_expr)
+        })
+        .collect()
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
-    use crate::expressions::{Column, Literal};
+    use crate::expressions::{BinaryExpr, Column, Literal};
     use crate::physical_expr::{
         physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
     };
+    use datafusion_physical_expr_common::physical_expr::is_volatile;
 
-    use datafusion_common::ScalarValue;
+    use arrow::datatypes::{DataType, Schema};
+    use arrow::record_batch::RecordBatch;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::ColumnarValue;
+    use datafusion_expr::Operator;
+    use std::any::Any;
+    use std::fmt;
 
     #[test]
     fn test_physical_exprs_contains() {
@@ -302,4 +358,120 @@ mod tests {
         assert!(physical_exprs_bag_equal(list3.as_slice(), list3.as_slice()));
         assert!(physical_exprs_bag_equal(list4.as_slice(), list4.as_slice()));
     }
+
+    #[test]
+    fn test_is_volatile_default_behavior() {
+        // Test that default PhysicalExpr implementations are not volatile
+        let literal =
+            Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
+        let column = Arc::new(Column::new("test", 0)) as Arc<dyn PhysicalExpr>;
+
+        // Test is_volatile_node() - should return false by default
+        assert!(!literal.is_volatile_node());
+        assert!(!column.is_volatile_node());
+
+        // Test is_volatile() - should return false for non-volatile expressions
+        assert!(!is_volatile(&literal));
+        assert!(!is_volatile(&column));
+    }
+
+    /// Mock volatile PhysicalExpr for testing purposes
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
+    struct MockVolatileExpr {
+        volatile: bool,
+    }
+
+    impl MockVolatileExpr {
+        fn new(volatile: bool) -> Self {
+            Self { volatile }
+        }
+    }
+
+    impl fmt::Display for MockVolatileExpr {
+        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+            write!(f, "MockVolatile({})", self.volatile)
+        }
+    }
+
+    impl PhysicalExpr for MockVolatileExpr {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
+            Ok(DataType::Boolean)
+        }
+
+        fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+            Ok(false)
+        }
+
+        fn evaluate(&self, _batch: &RecordBatch) -> Result<ColumnarValue> {
+            Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(
+                self.volatile,
+            ))))
+        }
+
+        fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+            vec![]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            _children: Vec<Arc<dyn PhysicalExpr>>,
+        ) -> Result<Arc<dyn PhysicalExpr>> {
+            Ok(self)
+        }
+
+        fn is_volatile_node(&self) -> bool {
+            self.volatile
+        }
+
+        fn fmt_sql(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            write!(f, "mock_volatile({})", self.volatile)
+        }
+    }
+
+    #[test]
+    fn test_nested_expression_volatility() {
+        // Test that is_volatile() recursively detects volatility in expression trees
+
+        // Create a volatile mock expression
+        let volatile_expr =
+            Arc::new(MockVolatileExpr::new(true)) as Arc<dyn PhysicalExpr>;
+        assert!(volatile_expr.is_volatile_node());
+        assert!(is_volatile(&volatile_expr));
+
+        // Create a non-volatile mock expression
+        let stable_expr = Arc::new(MockVolatileExpr::new(false)) as Arc<dyn PhysicalExpr>;
+        assert!(!stable_expr.is_volatile_node());
+        assert!(!is_volatile(&stable_expr));
+
+        // Create a literal (non-volatile)
+        let literal =
+            Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
+        assert!(!literal.is_volatile_node());
+        assert!(!is_volatile(&literal));
+
+        // Test composite expression: volatile_expr AND literal
+        // The BinaryExpr itself is not volatile, but contains a volatile child
+        let composite_expr = Arc::new(BinaryExpr::new(
+            Arc::clone(&volatile_expr),
+            Operator::And,
+            Arc::clone(&literal),
+        )) as Arc<dyn PhysicalExpr>;
+
+        assert!(!composite_expr.is_volatile_node()); // BinaryExpr itself is not volatile
+        assert!(is_volatile(&composite_expr)); // But it contains a volatile child
+
+        // Test composite expression with all non-volatile children
+        let stable_composite = Arc::new(BinaryExpr::new(
+            Arc::clone(&stable_expr),
+            Operator::And,
+            Arc::clone(&literal),
+        )) as Arc<dyn PhysicalExpr>;
+
+        assert!(!stable_composite.is_volatile_node());
+        assert!(!is_volatile(&stable_composite)); // No volatile children
+    }
 }
diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs
index 8660bff796d5a..5c170700d9833 100644
--- a/datafusion/physical-expr/src/planner.rs
+++ b/datafusion/physical-expr/src/planner.rs
@@ -19,20 +19,22 @@ use std::sync::Arc;
 
 use crate::ScalarFunctionExpr;
 use crate::{
-    expressions::{self, binary, like, similar_to, Column, Literal},
     PhysicalExpr,
+    expressions::{self, Column, Literal, binary, like, similar_to},
 };
 
 use arrow::datatypes::Schema;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::metadata::{FieldMetadata, format_type_and_metadata};
 use datafusion_common::{
-    exec_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, ToDFSchema,
+    DFSchema, Result, ScalarValue, ToDFSchema, exec_err, not_impl_err, plan_err,
 };
 use datafusion_expr::execution_props::ExecutionProps;
 use datafusion_expr::expr::{Alias, Cast, InList, Placeholder, ScalarFunction};
-use datafusion_expr::var_provider::is_system_variables;
 use datafusion_expr::var_provider::VarType;
+use datafusion_expr::var_provider::is_system_variables;
 use datafusion_expr::{
-    binary_expr, lit, Between, BinaryExpr, Expr, Like, Operator, TryCast,
+    Between, BinaryExpr, Expr, ExprSchemable, Like, Operator, TryCast, binary_expr, lit,
 };
 
 /// [PhysicalExpr] evaluate DataFusion expressions such as `A + 1`, or `CAST(c1
@@ -103,22 +105,37 @@ use datafusion_expr::{
 /// * `e` - The logical expression
 /// * `input_dfschema` - The DataFusion schema for the input, used to resolve `Column` references
 ///   to qualified or unqualified fields by name.
+#[cfg_attr(feature = "recursive_protection", recursive::recursive)]
 pub fn create_physical_expr(
     e: &Expr,
     input_dfschema: &DFSchema,
     execution_props: &ExecutionProps,
 ) -> Result<Arc<dyn PhysicalExpr>> {
-    let input_schema: &Schema = &input_dfschema.into();
+    let input_schema = input_dfschema.as_arrow();
 
     match e {
-        Expr::Alias(Alias { expr, .. }) => {
-            Ok(create_physical_expr(expr, input_dfschema, execution_props)?)
+        Expr::Alias(Alias { expr, metadata, .. }) => {
+            if let Expr::Literal(v, prior_metadata) = expr.as_ref() {
+                let new_metadata = FieldMetadata::merge_options(
+                    prior_metadata.as_ref(),
+                    metadata.as_ref(),
+                );
+                Ok(Arc::new(Literal::new_with_metadata(
+                    v.clone(),
+                    new_metadata,
+                )))
+            } else {
+                Ok(create_physical_expr(expr, input_dfschema, execution_props)?)
+            }
         }
         Expr::Column(c) => {
             let idx = input_dfschema.index_of_column(c)?;
             Ok(Arc::new(Column::new(&c.name, idx)))
         }
-        Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))),
+        Expr::Literal(value, metadata) => Ok(Arc::new(Literal::new_with_metadata(
+            value.clone(),
+            metadata.clone(),
+        ))),
         Expr::ScalarVariable(_, variable_names) => {
             if is_system_variables(variable_names) {
                 match execution_props.get_var_provider(VarType::System) {
@@ -168,7 +185,7 @@ pub fn create_physical_expr(
             let binary_op = binary_expr(
                 expr.as_ref().clone(),
                 Operator::IsNotDistinctFrom,
-                Expr::Literal(ScalarValue::Boolean(None)),
+                Expr::Literal(ScalarValue::Boolean(None), None),
             );
             create_physical_expr(&binary_op, input_dfschema, execution_props)
         }
@@ -176,7 +193,7 @@ pub fn create_physical_expr(
             let binary_op = binary_expr(
                 expr.as_ref().clone(),
                 Operator::IsDistinctFrom,
-                Expr::Literal(ScalarValue::Boolean(None)),
+                Expr::Literal(ScalarValue::Boolean(None), None),
             );
             create_physical_expr(&binary_op, input_dfschema, execution_props)
         }
@@ -271,16 +288,44 @@ pub fn create_physical_expr(
                 };
             Ok(expressions::case(expr, when_then_expr, else_expr)?)
         }
-        Expr::Cast(Cast { expr, data_type }) => expressions::cast(
-            create_physical_expr(expr, input_dfschema, execution_props)?,
-            input_schema,
-            data_type.clone(),
-        ),
-        Expr::TryCast(TryCast { expr, data_type }) => expressions::try_cast(
-            create_physical_expr(expr, input_dfschema, execution_props)?,
-            input_schema,
-            data_type.clone(),
-        ),
+        Expr::Cast(Cast { expr, field }) => {
+            if !field.metadata().is_empty() {
+                let (_, src_field) = expr.to_field(input_dfschema)?;
+                return plan_err!(
+                    "Cast from {} to {} is not supported",
+                    format_type_and_metadata(
+                        src_field.data_type(),
+                        Some(src_field.metadata()),
+                    ),
+                    format_type_and_metadata(field.data_type(), Some(field.metadata()))
+                );
+            }
+
+            expressions::cast(
+                create_physical_expr(expr, input_dfschema, execution_props)?,
+                input_schema,
+                field.data_type().clone(),
+            )
+        }
+        Expr::TryCast(TryCast { expr, field }) => {
+            if !field.metadata().is_empty() {
+                let (_, src_field) = expr.to_field(input_dfschema)?;
+                return plan_err!(
+                    "TryCast from {} to {} is not supported",
+                    format_type_and_metadata(
+                        src_field.data_type(),
+                        Some(src_field.metadata()),
+                    ),
+                    format_type_and_metadata(field.data_type(), Some(field.metadata()))
+                );
+            }
+
+            expressions::try_cast(
+                create_physical_expr(expr, input_dfschema, execution_props)?,
+                input_schema,
+                field.data_type().clone(),
+            )
+        }
         Expr::Not(expr) => {
             expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?)
         }
@@ -301,11 +346,16 @@ pub fn create_physical_expr(
         Expr::ScalarFunction(ScalarFunction { func, args }) => {
             let physical_args =
                 create_physical_exprs(args, input_dfschema, execution_props)?;
+            let config_options = match execution_props.config_options.as_ref() {
+                Some(config_options) => Arc::clone(config_options),
+                None => Arc::new(ConfigOptions::default()),
+            };
 
             Ok(Arc::new(ScalarFunctionExpr::try_new(
                 Arc::clone(func),
                 physical_args,
                 input_schema,
+                config_options,
             )?))
         }
         Expr::Between(Between {
@@ -347,7 +397,7 @@ pub fn create_physical_expr(
             list,
             negated,
         }) => match expr.as_ref() {
-            Expr::Literal(ScalarValue::Utf8(None)) => {
+            Expr::Literal(ScalarValue::Utf8(None), _) => {
                 Ok(expressions::lit(ScalarValue::Boolean(None)))
             }
             _ => {
@@ -380,11 +430,12 @@ where
     exprs
         .into_iter()
         .map(|expr| create_physical_expr(expr, input_dfschema, execution_props))
-        .collect::<Result<Vec<_>>>()
+        .collect()
 }
 
 /// Convert a logical expression to a physical expression (without any simplification, etc)
 pub fn logical2physical(expr: &Expr, schema: &Schema) -> Arc<dyn PhysicalExpr> {
+    // TODO this makes a deep copy of the Schema. Should take SchemaRef instead and avoid deep copy
     let df_schema = schema.clone().to_dfschema().unwrap();
     let execution_props = ExecutionProps::new();
     create_physical_expr(expr, &df_schema, &execution_props).unwrap()
@@ -394,8 +445,8 @@ pub fn logical2physical(expr: &Expr, schema: &Schema) -> Arc<dyn PhysicalExpr> {
 mod tests {
     use arrow::array::{ArrayRef, BooleanArray, RecordBatch, StringArray};
     use arrow::datatypes::{DataType, Field};
-
-    use datafusion_expr::{col, lit};
+    use datafusion_common::datatype::DataTypeExt;
+    use datafusion_expr::{Operator, col, lit};
 
     use super::*;
 
@@ -423,4 +474,69 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_cast_to_extension_type() -> Result<()> {
+        let extension_field_type = Arc::new(
+            DataType::FixedSizeBinary(16)
+                .into_nullable_field()
+                .with_metadata(
+                    [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())]
+                        .into(),
+                ),
+        );
+        let expr = lit("3230e5d4-888e-408b-b09b-831f44aa0c58");
+        let cast_expr = Expr::Cast(Cast::new_from_field(
+            Box::new(expr.clone()),
+            Arc::clone(&extension_field_type),
+        ));
+        let err =
+            create_physical_expr(&cast_expr, &DFSchema::empty(), &ExecutionProps::new())
+                .unwrap_err();
+        assert!(err.message().contains("arrow.uuid"));
+
+        let try_cast_expr = Expr::TryCast(TryCast::new_from_field(
+            Box::new(expr.clone()),
+            Arc::clone(&extension_field_type),
+        ));
+        let err = create_physical_expr(
+            &try_cast_expr,
+            &DFSchema::empty(),
+            &ExecutionProps::new(),
+        )
+        .unwrap_err();
+        assert!(err.message().contains("arrow.uuid"));
+
+        Ok(())
+    }
+
+    /// Test that deeply nested expressions do not cause a stack overflow.
+    ///
+    /// This test only runs when the `recursive_protection` feature is enabled,
+    /// as it would overflow the stack otherwise.
+    #[test]
+    #[cfg_attr(not(feature = "recursive_protection"), ignore)]
+    fn test_deeply_nested_binary_expr() -> Result<()> {
+        // Create a deeply nested binary expression tree: ((((a + a) + a) + a) + ... )
+        // With 1000 levels of nesting, this would overflow the stack without recursion protection.
+        let depth = 1000;
+
+        let mut expr = col("a");
+        for _ in 0..depth {
+            expr = Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(expr),
+                op: Operator::Plus,
+                right: Box::new(col("a")),
+            });
+        }
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let df_schema = DFSchema::try_from(schema)?;
+
+        // This should not stack overflow
+        let _physical_expr =
+            create_physical_expr(&expr, &df_schema, &ExecutionProps::new())?;
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs
new file mode 100644
index 0000000000000..dbbd289415277
--- /dev/null
+++ b/datafusion/physical-expr/src/projection.rs
@@ -0,0 +1,3011 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ProjectionExpr`] and [`ProjectionExprs`] for representing projections.
+
+use std::ops::Deref;
+use std::sync::Arc;
+
+use crate::PhysicalExpr;
+use crate::expressions::{Column, Literal};
+use crate::utils::collect_columns;
+
+use arrow::array::{RecordBatch, RecordBatchOptions};
+use arrow::datatypes::{Field, Schema, SchemaRef};
+use datafusion_common::stats::{ColumnStatistics, Precision};
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::{
+    Result, ScalarValue, Statistics, assert_or_internal_err, internal_datafusion_err,
+    plan_err,
+};
+
+use datafusion_physical_expr_common::metrics::ExecutionPlanMetricsSet;
+use datafusion_physical_expr_common::metrics::ExpressionEvaluatorMetrics;
+use datafusion_physical_expr_common::physical_expr::fmt_sql;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays_with_metrics;
+use indexmap::IndexMap;
+use itertools::Itertools;
+
+/// An expression used by projection operations.
+///
+/// The expression is evaluated and the result is stored in a column
+/// with the name specified by `alias`.
+///
+/// For example, the SQL expression `a + b AS sum_ab` would be represented
+/// as a `ProjectionExpr` where `expr` is the expression `a + b`
+/// and `alias` is the string `sum_ab`.
+///
+/// See [`ProjectionExprs`] for a collection of projection expressions.
+#[derive(Debug, Clone)]
+pub struct ProjectionExpr {
+    /// The expression that will be evaluated.
+    pub expr: Arc<dyn PhysicalExpr>,
+    /// The name of the output column for use an output schema.
+    pub alias: String,
+}
+
+impl PartialEq for ProjectionExpr {
+    fn eq(&self, other: &Self) -> bool {
+        let ProjectionExpr { expr, alias } = self;
+        expr.eq(&other.expr) && *alias == other.alias
+    }
+}
+
+impl Eq for ProjectionExpr {}
+
+impl std::fmt::Display for ProjectionExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.expr.to_string() == self.alias {
+            write!(f, "{}", self.alias)
+        } else {
+            write!(f, "{} AS {}", self.expr, self.alias)
+        }
+    }
+}
+
+impl ProjectionExpr {
+    /// Create a new projection expression
+    pub fn new(expr: Arc<dyn PhysicalExpr>, alias: impl Into<String>) -> Self {
+        let alias = alias.into();
+        Self { expr, alias }
+    }
+
+    /// Create a new projection expression from an expression and a schema using the expression's output field name as alias.
+    pub fn new_from_expression(
+        expr: Arc<dyn PhysicalExpr>,
+        schema: &Schema,
+    ) -> Result<Self> {
+        let field = expr.return_field(schema)?;
+        Ok(Self {
+            expr,
+            alias: field.name().to_string(),
+        })
+    }
+}
+
+impl From<(Arc<dyn PhysicalExpr>, String)> for ProjectionExpr {
+    fn from(value: (Arc<dyn PhysicalExpr>, String)) -> Self {
+        Self::new(value.0, value.1)
+    }
+}
+
+impl From<&(Arc<dyn PhysicalExpr>, String)> for ProjectionExpr {
+    fn from(value: &(Arc<dyn PhysicalExpr>, String)) -> Self {
+        Self::new(Arc::clone(&value.0), value.1.clone())
+    }
+}
+
+impl From<ProjectionExpr> for (Arc<dyn PhysicalExpr>, String) {
+    fn from(value: ProjectionExpr) -> Self {
+        (value.expr, value.alias)
+    }
+}
+
+/// A collection of  [`ProjectionExpr`] instances, representing a complete
+/// projection operation.
+///
+/// Projection operations are used in query plans to select specific columns or
+/// compute new columns based on existing ones.
+///
+/// See [`ProjectionExprs::from_indices`] to select a subset of columns by
+/// indices.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ProjectionExprs {
+    /// [`Arc`] used for a cheap clone, which improves physical plan optimization performance.
+    exprs: Arc<[ProjectionExpr]>,
+}
+
+impl std::fmt::Display for ProjectionExprs {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let exprs: Vec<String> = self.exprs.iter().map(|e| e.to_string()).collect();
+        write!(f, "Projection[{}]", exprs.join(", "))
+    }
+}
+
+impl From<Vec<ProjectionExpr>> for ProjectionExprs {
+    fn from(value: Vec<ProjectionExpr>) -> Self {
+        Self {
+            exprs: value.into(),
+        }
+    }
+}
+
+impl From<&[ProjectionExpr]> for ProjectionExprs {
+    fn from(value: &[ProjectionExpr]) -> Self {
+        Self {
+            exprs: value.iter().cloned().collect(),
+        }
+    }
+}
+
+impl FromIterator<ProjectionExpr> for ProjectionExprs {
+    fn from_iter<T: IntoIterator<Item = ProjectionExpr>>(exprs: T) -> Self {
+        Self {
+            exprs: exprs.into_iter().collect(),
+        }
+    }
+}
+
+impl AsRef<[ProjectionExpr]> for ProjectionExprs {
+    fn as_ref(&self) -> &[ProjectionExpr] {
+        &self.exprs
+    }
+}
+
+impl ProjectionExprs {
+    /// Make a new [`ProjectionExprs`] from expressions iterator.
+    pub fn new(exprs: impl IntoIterator<Item = ProjectionExpr>) -> Self {
+        Self {
+            exprs: exprs.into_iter().collect(),
+        }
+    }
+
+    /// Make a new [`ProjectionExprs`] from expressions.
+    pub fn from_expressions(exprs: impl Into<Arc<[ProjectionExpr]>>) -> Self {
+        Self {
+            exprs: exprs.into(),
+        }
+    }
+
+    /// Creates a [`ProjectionExpr`] from a list of column indices.
+    ///
+    /// This is a convenience method for creating simple column-only projections, where each projection expression is a reference to a column
+    /// in the input schema.
+    ///
+    /// # Behavior
+    /// - Ordering: the output projection preserves the exact order of indices provided in the input slice
+    ///   For example, `[2, 0, 1]` will produce projections for columns 2, 0, then 1 in that order
+    /// - Duplicates: Duplicate indices are allowed and will create multiple projection expressions referencing the same source column
+    ///   For example, `[0, 0]` creates 2 separate projections both referencing column 0
+    ///
+    /// # Panics
+    /// Panics if any index in `indices` is out of bounds for the provided schema.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_physical_expr::projection::ProjectionExprs;
+    /// use std::sync::Arc;
+    ///
+    /// // Create a schema with three columns
+    /// let schema = Arc::new(Schema::new(vec![
+    ///     Field::new("a", DataType::Int32, false),
+    ///     Field::new("b", DataType::Utf8, false),
+    ///     Field::new("c", DataType::Float64, false),
+    /// ]));
+    ///
+    /// // Project columns at indices 2 and 0 (c and a) - ordering is preserved
+    /// let projection = ProjectionExprs::from_indices(&[2, 0], &schema);
+    ///
+    /// // This creates: SELECT c@2 AS c, a@0 AS a
+    /// assert_eq!(projection.as_ref().len(), 2);
+    /// assert_eq!(projection.as_ref()[0].alias, "c");
+    /// assert_eq!(projection.as_ref()[1].alias, "a");
+    ///
+    /// // Duplicate indices are allowed
+    /// let projection_with_dups = ProjectionExprs::from_indices(&[0, 0, 1], &schema);
+    /// assert_eq!(projection_with_dups.as_ref().len(), 3);
+    /// assert_eq!(projection_with_dups.as_ref()[0].alias, "a");
+    /// assert_eq!(projection_with_dups.as_ref()[1].alias, "a"); // duplicate
+    /// assert_eq!(projection_with_dups.as_ref()[2].alias, "b");
+    /// ```
+    pub fn from_indices(indices: &[usize], schema: &Schema) -> Self {
+        let projection_exprs = indices.iter().map(|&i| {
+            let field = schema.field(i);
+            ProjectionExpr {
+                expr: Arc::new(Column::new(field.name(), i)),
+                alias: field.name().clone(),
+            }
+        });
+
+        Self::from_iter(projection_exprs)
+    }
+
+    /// Returns an iterator over the projection expressions
+    pub fn iter(&self) -> impl Iterator<Item = &ProjectionExpr> {
+        self.exprs.iter()
+    }
+
+    /// Creates a ProjectionMapping from this projection
+    pub fn projection_mapping(
+        &self,
+        input_schema: &SchemaRef,
+    ) -> Result<ProjectionMapping> {
+        ProjectionMapping::try_new(
+            self.exprs
+                .iter()
+                .map(|p| (Arc::clone(&p.expr), p.alias.clone())),
+            input_schema,
+        )
+    }
+
+    /// Iterate over a clone of the projection expressions.
+    pub fn expr_iter(&self) -> impl Iterator<Item = Arc<dyn PhysicalExpr>> + '_ {
+        self.exprs.iter().map(|e| Arc::clone(&e.expr))
+    }
+
+    /// Apply a fallible transformation to the [`PhysicalExpr`] of each projection.
+    ///
+    /// This method transforms the expression in each [`ProjectionExpr`] while preserving
+    /// the alias. This is useful for rewriting expressions, such as when adapting
+    /// expressions to a different schema.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use std::sync::Arc;
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_common::Result;
+    /// use datafusion_physical_expr::expressions::Column;
+    /// use datafusion_physical_expr::projection::ProjectionExprs;
+    /// use datafusion_physical_expr::PhysicalExpr;
+    ///
+    /// // Create a schema and projection
+    /// let schema = Arc::new(Schema::new(vec![
+    ///     Field::new("a", DataType::Int32, false),
+    ///     Field::new("b", DataType::Int32, false),
+    /// ]));
+    /// let projection = ProjectionExprs::from_indices(&[0, 1], &schema);
+    ///
+    /// // Transform each expression (this example just clones them)
+    /// let transformed = projection.try_map_exprs(|expr| Ok(expr))?;
+    /// assert_eq!(transformed.as_ref().len(), 2);
+    /// # Ok::<(), datafusion_common::DataFusionError>(())
+    /// ```
+    pub fn try_map_exprs<F>(self, mut f: F) -> Result<Self>
+    where
+        F: FnMut(Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>>,
+    {
+        let exprs = self
+            .exprs
+            .iter()
+            .cloned()
+            .map(|mut proj| {
+                proj.expr = f(proj.expr)?;
+                Ok(proj)
+            })
+            .collect::<Result<Arc<_>>>()?;
+        Ok(Self::from_expressions(exprs))
+    }
+
+    /// Apply another projection on top of this projection, returning the combined projection.
+    /// For example, if this projection is `SELECT c@2 AS x, b@1 AS y, a@0 as z` and the other projection is `SELECT x@0 + 1 AS c1, y@1 + z@2 as c2`,
+    /// we return a projection equivalent to `SELECT c@2 + 1 AS c1, b@1 + a@0 as c2`.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use datafusion_common::{Result, ScalarValue};
+    /// use datafusion_expr::Operator;
+    /// use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal};
+    /// use datafusion_physical_expr::projection::{ProjectionExpr, ProjectionExprs};
+    /// use std::sync::Arc;
+    ///
+    /// fn main() -> Result<()> {
+    ///     // Example from the docstring:
+    ///     // Base projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z
+    ///     let base = ProjectionExprs::new(vec![
+    ///         ProjectionExpr {
+    ///             expr: Arc::new(Column::new("c", 2)),
+    ///             alias: "x".to_string(),
+    ///         },
+    ///         ProjectionExpr {
+    ///             expr: Arc::new(Column::new("b", 1)),
+    ///             alias: "y".to_string(),
+    ///         },
+    ///         ProjectionExpr {
+    ///             expr: Arc::new(Column::new("a", 0)),
+    ///             alias: "z".to_string(),
+    ///         },
+    ///     ]);
+    ///
+    ///     // Top projection: SELECT x@0 + 1 AS c1, y@1 + z@2 AS c2
+    ///     let top = ProjectionExprs::new(vec![
+    ///         ProjectionExpr {
+    ///             expr: Arc::new(BinaryExpr::new(
+    ///                 Arc::new(Column::new("x", 0)),
+    ///                 Operator::Plus,
+    ///                 Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+    ///             )),
+    ///             alias: "c1".to_string(),
+    ///         },
+    ///         ProjectionExpr {
+    ///             expr: Arc::new(BinaryExpr::new(
+    ///                 Arc::new(Column::new("y", 1)),
+    ///                 Operator::Plus,
+    ///                 Arc::new(Column::new("z", 2)),
+    ///             )),
+    ///             alias: "c2".to_string(),
+    ///         },
+    ///     ]);
+    ///
+    ///     // Expected result: SELECT c@2 + 1 AS c1, b@1 + a@0 AS c2
+    ///     let result = base.try_merge(&top)?;
+    ///
+    ///     assert_eq!(result.as_ref().len(), 2);
+    ///     assert_eq!(result.as_ref()[0].alias, "c1");
+    ///     assert_eq!(result.as_ref()[1].alias, "c2");
+    ///
+    ///     Ok(())
+    /// }
+    /// ```
+    ///
+    /// # Errors
+    /// This function returns an error if any expression in the `other` projection cannot be
+    /// applied on top of this projection.
+    pub fn try_merge(&self, other: &ProjectionExprs) -> Result<ProjectionExprs> {
+        let mut new_exprs = Vec::with_capacity(other.exprs.len());
+        for proj_expr in other.exprs.iter() {
+            new_exprs.push(ProjectionExpr {
+                expr: self.unproject_expr(&proj_expr.expr)?,
+                alias: proj_expr.alias.clone(),
+            });
+        }
+        Ok(ProjectionExprs::new(new_exprs))
+    }
+
+    /// Extract the column indices used in this projection.
+    /// For example, for a projection `SELECT a AS x, b + 1 AS y`, where `a` is at index 0 and `b` is at index 1,
+    /// this function would return `[0, 1]`.
+    /// Repeated indices are returned only once, and the order is ascending.
+    pub fn column_indices(&self) -> Vec<usize> {
+        self.exprs
+            .iter()
+            .flat_map(|e| collect_columns(&e.expr).into_iter().map(|col| col.index()))
+            .sorted_unstable()
+            .dedup()
+            .collect_vec()
+    }
+
+    /// Extract the ordered column indices for a column-only projection.
+    ///
+    /// This function assumes that all expressions in the projection are simple column references.
+    /// It returns the column indices in the order they appear in the projection.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any expression in the projection is not a simple column reference. This includes:
+    /// - Computed expressions (e.g., `a + 1`, `CAST(a AS INT)`)
+    /// - Function calls (e.g., `UPPER(name)`, `SUM(amount)`)
+    /// - Literals (e.g., `42`, `'hello'`)
+    /// - Complex nested expressions (e.g., `CASE WHEN ... THEN ... END`)
+    ///
+    /// # Returns
+    ///
+    /// A vector of column indices in projection order. Unlike [`column_indices()`](Self::column_indices),
+    /// this function:
+    /// - Preserves the projection order (does not sort)
+    /// - Preserves duplicates (does not deduplicate)
+    ///
+    /// # Example
+    ///
+    /// For a projection `SELECT c, a, c` where `a` is at index 0 and `c` is at index 2,
+    /// this function would return `[2, 0, 2]`.
+    ///
+    /// Use [`column_indices()`](Self::column_indices) instead if the projection may contain
+    /// non-column expressions or if you need a deduplicated sorted list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any expression in the projection is not a simple column reference.
+    #[deprecated(
+        since = "52.0.0",
+        note = "Use column_indices() instead. This method will be removed in 58.0.0 or 6 months after 52.0.0 is released, whichever comes first."
+    )]
+    pub fn ordered_column_indices(&self) -> Vec<usize> {
+        self.exprs
+            .iter()
+            .map(|e| {
+                e.expr
+                    .as_any()
+                    .downcast_ref::<Column>()
+                    .expect("Expected column reference in projection")
+                    .index()
+            })
+            .collect()
+    }
+
+    /// Project a schema according to this projection.
+    ///
+    /// For example, given a projection:
+    /// * `SELECT a AS x, b + 1 AS y`
+    /// * where `a` is at index 0
+    /// * `b` is at index 1
+    ///
+    /// If the input schema is `[a: Int32, b: Int32, c: Int32]`, the output
+    /// schema would be `[x: Int32, y: Int32]`.
+    ///
+    /// Note that [`Field`] metadata are preserved from the input schema.
+    pub fn project_schema(&self, input_schema: &Schema) -> Result<Schema> {
+        let fields: Result<Vec<Field>> = self
+            .exprs
+            .iter()
+            .map(|proj_expr| {
+                let metadata = proj_expr
+                    .expr
+                    .return_field(input_schema)?
+                    .metadata()
+                    .clone();
+
+                let field = Field::new(
+                    &proj_expr.alias,
+                    proj_expr.expr.data_type(input_schema)?,
+                    proj_expr.expr.nullable(input_schema)?,
+                )
+                .with_metadata(metadata);
+
+                Ok(field)
+            })
+            .collect();
+
+        Ok(Schema::new_with_metadata(
+            fields?,
+            input_schema.metadata().clone(),
+        ))
+    }
+
+    /// "unproject" an expression by applying this projection in reverse,
+    /// returning a new set of expressions that reference the original input
+    /// columns.
+    ///
+    /// For example, consider
+    /// * an expression `c1_c2 > 5`, and a schema `[c1, c2]`
+    /// * a projection `c1 + c2 as c1_c2`
+    ///
+    /// This method would rewrite the expression to `c1 + c2 > 5`
+    pub fn unproject_expr(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        update_expr(expr, &self.exprs, true)?.ok_or_else(|| {
+            internal_datafusion_err!(
+                "Failed to unproject an expression {} with ProjectionExprs {}",
+                expr,
+                self.exprs.iter().map(|e| format!("{e}")).join(", ")
+            )
+        })
+    }
+
+    /// "project" an expression using these projection's expressions
+    ///
+    /// For example, consider
+    /// * an expression `c1 + c2 > 5`, and a schema `[c1, c2]`
+    /// * a projection `c1 + c2 as c1_c2`
+    ///
+    /// * This method would rewrite the expression to `c1_c2 > 5`
+    pub fn project_expr(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        update_expr(expr, &self.exprs, false)?.ok_or_else(|| {
+            internal_datafusion_err!(
+                "Failed to project an expression {} with ProjectionExprs {}",
+                expr,
+                self.exprs.iter().map(|e| format!("{e}")).join(", ")
+            )
+        })
+    }
+
+    /// Create a new [`Projector`] from this projection and an input schema.
+    ///
+    /// A [`Projector`] can be used to apply this projection to record batches.
+    ///
+    /// # Errors
+    /// This function returns an error if the output schema cannot be constructed from the input schema
+    /// with the given projection expressions.
+    /// For example, if an expression only works with integer columns but the input schema has a string column at that index.
+    pub fn make_projector(&self, input_schema: &Schema) -> Result<Projector> {
+        let output_schema = Arc::new(self.project_schema(input_schema)?);
+        Ok(Projector {
+            projection: self.clone(),
+            output_schema,
+            expression_metrics: None,
+        })
+    }
+
+    pub fn create_expression_metrics(
+        &self,
+        metrics: &ExecutionPlanMetricsSet,
+        partition: usize,
+    ) -> ExpressionEvaluatorMetrics {
+        let labels: Vec<String> = self
+            .exprs
+            .iter()
+            .map(|proj_expr| {
+                let expr_sql = fmt_sql(proj_expr.expr.as_ref()).to_string();
+                if proj_expr.expr.to_string() == proj_expr.alias {
+                    expr_sql
+                } else {
+                    format!("{expr_sql} AS {}", proj_expr.alias)
+                }
+            })
+            .collect();
+        ExpressionEvaluatorMetrics::new(metrics, partition, labels)
+    }
+
+    /// Project statistics according to this projection.
+    /// For example, for a projection `SELECT a AS x, b + 1 AS y`, where `a` is at index 0 and `b` is at index 1,
+    /// if the input statistics has column statistics for columns `a`, `b`, and `c`, the output statistics would have column statistics for columns `x` and `y`.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_common::stats::{ColumnStatistics, Precision, Statistics};
+    /// use datafusion_physical_expr::projection::ProjectionExprs;
+    /// use datafusion_common::Result;
+    /// use datafusion_common::ScalarValue;
+    /// use std::sync::Arc;
+    ///
+    /// fn main() -> Result<()> {
+    ///     // Input schema: a: Int32, b: Int32, c: Int32
+    ///     let input_schema = Arc::new(Schema::new(vec![
+    ///         Field::new("a", DataType::Int32, false),
+    ///         Field::new("b", DataType::Int32, false),
+    ///         Field::new("c", DataType::Int32, false),
+    ///     ]));
+    ///
+    ///     // Input statistics with column stats for a, b, c
+    ///     let input_stats = Statistics {
+    ///         num_rows: Precision::Exact(100),
+    ///         total_byte_size: Precision::Exact(1200),
+    ///         column_statistics: vec![
+    ///             // Column a stats
+    ///             ColumnStatistics::new_unknown()
+    ///                 .with_null_count(Precision::Exact(0))
+    ///                 .with_min_value(Precision::Exact(ScalarValue::Int32(Some(0))))
+    ///                 .with_max_value(Precision::Exact(ScalarValue::Int32(Some(100))))
+    ///                 .with_distinct_count(Precision::Exact(100)),
+    ///             // Column b stats
+    ///             ColumnStatistics::new_unknown()
+    ///                 .with_null_count(Precision::Exact(0))
+    ///                 .with_min_value(Precision::Exact(ScalarValue::Int32(Some(10))))
+    ///                 .with_max_value(Precision::Exact(ScalarValue::Int32(Some(60))))
+    ///                 .with_distinct_count(Precision::Exact(50)),
+    ///             // Column c stats
+    ///             ColumnStatistics::new_unknown()
+    ///                 .with_null_count(Precision::Exact(5))
+    ///                 .with_min_value(Precision::Exact(ScalarValue::Int32(Some(-10))))
+    ///                 .with_max_value(Precision::Exact(ScalarValue::Int32(Some(200))))
+    ///                 .with_distinct_count(Precision::Exact(25)),
+    ///         ],
+    ///     };
+    ///
+    ///     // Create a projection that selects columns c and a (indices 2 and 0)
+    ///     let projection = ProjectionExprs::from_indices(&[2, 0], &input_schema);
+    ///
+    ///     // Compute output schema
+    ///     let output_schema = projection.project_schema(&input_schema)?;
+    ///
+    ///     // Project the statistics
+    ///     let output_stats = projection.project_statistics(input_stats, &output_schema)?;
+    ///
+    ///     // The output should have 2 column statistics (for c and a, in that order)
+    ///     assert_eq!(output_stats.column_statistics.len(), 2);
+    ///
+    ///     // First column in output is c (was at index 2)
+    ///     assert_eq!(
+    ///         output_stats.column_statistics[0].min_value,
+    ///         Precision::Exact(ScalarValue::Int32(Some(-10)))
+    ///     );
+    ///     assert_eq!(
+    ///         output_stats.column_statistics[0].null_count,
+    ///         Precision::Exact(5)
+    ///     );
+    ///
+    ///     // Second column in output is a (was at index 0)
+    ///     assert_eq!(
+    ///         output_stats.column_statistics[1].min_value,
+    ///         Precision::Exact(ScalarValue::Int32(Some(0)))
+    ///     );
+    ///     assert_eq!(
+    ///         output_stats.column_statistics[1].distinct_count,
+    ///         Precision::Exact(100)
+    ///     );
+    ///
+    ///     // Total byte size is recalculated based on projected columns
+    ///     assert_eq!(
+    ///         output_stats.total_byte_size,
+    ///         Precision::Exact(800), // each Int32 column is 4 bytes * 100 rows * 2 columns
+    ///     );
+    ///
+    ///     // Number of rows remains the same
+    ///     assert_eq!(output_stats.num_rows, Precision::Exact(100));
+    ///
+    ///     Ok(())
+    /// }
+    /// ```
+    pub fn project_statistics(
+        &self,
+        mut stats: Statistics,
+        output_schema: &Schema,
+    ) -> Result<Statistics> {
+        let mut column_statistics = vec![];
+
+        for proj_expr in self.exprs.iter() {
+            let expr = &proj_expr.expr;
+            let col_stats = if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+                std::mem::take(&mut stats.column_statistics[col.index()])
+            } else if let Some(literal) = expr.as_any().downcast_ref::<Literal>() {
+                // Handle literal expressions (constants) by calculating proper statistics
+                let data_type = expr.data_type(output_schema)?;
+
+                if literal.value().is_null() {
+                    let null_count = match stats.num_rows {
+                        Precision::Exact(num_rows) => Precision::Exact(num_rows),
+                        _ => Precision::Absent,
+                    };
+
+                    ColumnStatistics {
+                        min_value: Precision::Exact(literal.value().clone()),
+                        max_value: Precision::Exact(literal.value().clone()),
+                        distinct_count: Precision::Exact(1),
+                        null_count,
+                        sum_value: Precision::Exact(literal.value().clone()),
+                        byte_size: Precision::Exact(0),
+                    }
+                } else {
+                    let value = literal.value();
+                    let distinct_count = Precision::Exact(1);
+                    let null_count = Precision::Exact(0);
+
+                    let byte_size = if let Some(byte_width) = data_type.primitive_width()
+                    {
+                        stats.num_rows.multiply(&Precision::Exact(byte_width))
+                    } else {
+                        // Complex types depend on array encoding, so set to Absent
+                        Precision::Absent
+                    };
+
+                    let sum_value = Precision::<ScalarValue>::from(stats.num_rows)
+                        .cast_to(&value.data_type())
+                        .ok()
+                        .map(|row_count| {
+                            Precision::Exact(value.clone()).multiply(&row_count)
+                        })
+                        .unwrap_or(Precision::Absent);
+
+                    ColumnStatistics {
+                        min_value: Precision::Exact(value.clone()),
+                        max_value: Precision::Exact(value.clone()),
+                        distinct_count,
+                        null_count,
+                        sum_value,
+                        byte_size,
+                    }
+                }
+            } else {
+                // TODO stats: estimate more statistics from expressions
+                // (expressions should compute their statistics themselves)
+                ColumnStatistics::new_unknown()
+            };
+            column_statistics.push(col_stats);
+        }
+        stats.calculate_total_byte_size(output_schema);
+        stats.column_statistics = column_statistics;
+        Ok(stats)
+    }
+}
+
+impl<'a> IntoIterator for &'a ProjectionExprs {
+    type Item = &'a ProjectionExpr;
+    type IntoIter = std::slice::Iter<'a, ProjectionExpr>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.exprs.iter()
+    }
+}
+
+/// Applies a projection to record batches.
+///
+/// A [`Projector`] uses a set of projection expressions to transform
+/// and a pre-computed output schema to project record batches accordingly.
+///
+/// The main reason to use a `Projector` is to avoid repeatedly computing
+/// the output schema for each batch, which can be costly if the projection
+/// expressions are complex.
+#[derive(Clone, Debug)]
+pub struct Projector {
+    projection: ProjectionExprs,
+    output_schema: SchemaRef,
+    /// If `Some`, metrics will be tracked for projection evaluation.
+    expression_metrics: Option<ExpressionEvaluatorMetrics>,
+}
+
+impl Projector {
+    /// Construct the projector with metrics. After execution, related metrics will
+    /// be tracked inside `ExecutionPlanMetricsSet`
+    ///
+    /// See [`ExpressionEvaluatorMetrics`] for details.
+    pub fn with_metrics(
+        &self,
+        metrics: &ExecutionPlanMetricsSet,
+        partition: usize,
+    ) -> Self {
+        let expr_metrics = self
+            .projection
+            .create_expression_metrics(metrics, partition);
+        Self {
+            expression_metrics: Some(expr_metrics),
+            projection: self.projection.clone(),
+            output_schema: Arc::clone(&self.output_schema),
+        }
+    }
+
+    /// Project a record batch according to this projector's expressions.
+    ///
+    /// # Errors
+    /// This function returns an error if any expression evaluation fails
+    /// or if the output schema of the resulting record batch does not match
+    /// the pre-computed output schema of the projector.
+    pub fn project_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let arrays = evaluate_expressions_to_arrays_with_metrics(
+            self.projection.exprs.iter().map(|p| &p.expr),
+            batch,
+            self.expression_metrics.as_ref(),
+        )?;
+
+        if arrays.is_empty() {
+            let options =
+                RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
+            RecordBatch::try_new_with_options(
+                Arc::clone(&self.output_schema),
+                arrays,
+                &options,
+            )
+            .map_err(Into::into)
+        } else {
+            RecordBatch::try_new(Arc::clone(&self.output_schema), arrays)
+                .map_err(Into::into)
+        }
+    }
+
+    pub fn output_schema(&self) -> &SchemaRef {
+        &self.output_schema
+    }
+
+    pub fn projection(&self) -> &ProjectionExprs {
+        &self.projection
+    }
+}
+
+/// Describes an immutable reference counted projection.
+///
+/// This structure represents projecting a set of columns by index.
+/// [`Arc`] is used to make it cheap to clone.
+pub type ProjectionRef = Arc<[usize]>;
+
+/// Combine two projections.
+///
+/// If `p1` is [`None`] then there are no changes.
+/// Otherwise, if passed `p2` is not [`None`] then it is remapped
+/// according to the `p1`. Otherwise, there are no changes.
+///
+/// # Example
+///
+/// If stored projection is [0, 2] and we call `apply_projection([0, 2, 3])`,
+/// then the resulting projection will be [0, 3].
+///
+/// # Error
+///
+/// Returns an internal error if `p1` contains index that is greater than `p2` len.
+///
+pub fn combine_projections(
+    p1: Option<&ProjectionRef>,
+    p2: Option<&ProjectionRef>,
+) -> Result<Option<ProjectionRef>> {
+    let Some(p1) = p1 else {
+        return Ok(None);
+    };
+    let Some(p2) = p2 else {
+        return Ok(Some(Arc::clone(p1)));
+    };
+
+    Ok(Some(
+        p1.iter()
+            .map(|i| {
+                let idx = *i;
+                assert_or_internal_err!(
+                    idx < p2.len(),
+                    "unable to apply projection: index {} is greater than new projection len {}",
+                    idx,
+                    p2.len(),
+                );
+                Ok(p2[*i])
+            })
+            .collect::<Result<Arc<[usize]>>>()?,
+    ))
+}
+
+/// The function projects / unprojects an expression with respect to set of
+/// projection expressions.
+///
+/// See also [`ProjectionExprs::unproject_expr`] and [`ProjectionExprs::project_expr`]
+///
+/// 1) When `unproject` is `true`:
+///
+///    Rewrites an expression with respect to the projection expressions,
+///    effectively "unprojecting" it to reference the original input columns.
+///
+///    For example, given
+///    * the expressions `a@1 + b@2` and `c@0`
+///    * and projection expressions `c@2, a@0, b@1`
+///
+///    Then
+///    * `a@1 + b@2` becomes `a@0 + b@1`
+///    * `c@0` becomes `c@2`
+///
+/// 2) When `unproject` is `false`:
+///
+///    Rewrites the expression to reference the projected expressions,
+///    effectively "projecting" it. The resulting expression will reference the
+///    indices as they appear in the projection.
+///
+///    If the expression cannot be rewritten after the projection, it returns
+///    `None`.
+///
+///    For example, given
+///    * the expressions `c@0`, `a@1` and `b@2`
+///    * the projection `a@1 as a, c@0 as c_new`,
+///
+///    Then
+///    * `c@0` becomes `c_new@1`
+///    * `a@1` becomes `a@0`
+///    * `b@2` results in `None` since the projection does not include `b`.
+///
+/// # Errors
+/// This function returns an error if `unproject` is `true` and if any expression references
+/// an index that is out of bounds for `projected_exprs`.
+/// For example:
+///
+/// - `expr` is `a@3`
+/// - `projected_exprs` is \[`a@0`, `b@1`\]
+///
+/// In this case, `a@3` references index 3, which is out of bounds for `projected_exprs` (which has length 2).
+pub fn update_expr(
+    expr: &Arc<dyn PhysicalExpr>,
+    projected_exprs: &[ProjectionExpr],
+    unproject: bool,
+) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+    #[derive(Debug, PartialEq)]
+    enum RewriteState {
+        /// The expression is unchanged.
+        Unchanged,
+        /// Some part of the expression has been rewritten
+        RewrittenValid,
+        /// Some part of the expression has been rewritten, but some column
+        /// references could not be.
+        RewrittenInvalid,
+    }
+
+    let mut state = RewriteState::Unchanged;
+
+    let new_expr = Arc::clone(expr)
+        .transform_up(|expr| {
+            if state == RewriteState::RewrittenInvalid {
+                return Ok(Transformed::no(expr));
+            }
+
+            let Some(column) = expr.as_any().downcast_ref::<Column>() else {
+                return Ok(Transformed::no(expr));
+            };
+            if unproject {
+                state = RewriteState::RewrittenValid;
+                // Update the index of `column`:
+                let projected_expr = projected_exprs.get(column.index()).ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Column index {} out of bounds for projected expressions of length {}",
+                        column.index(),
+                        projected_exprs.len()
+                    )
+                })?;
+                Ok(Transformed::yes(Arc::clone(&projected_expr.expr)))
+            } else {
+                // default to invalid, in case we can't find the relevant column
+                state = RewriteState::RewrittenInvalid;
+                // Determine how to update `column` to accommodate `projected_exprs`
+                projected_exprs
+                    .iter()
+                    .enumerate()
+                    .find_map(|(index, proj_expr)| {
+                        proj_expr.expr.as_any().downcast_ref::<Column>().and_then(
+                            |projected_column| {
+                                (column.name().eq(projected_column.name())
+                                    && column.index() == projected_column.index())
+                                .then(|| {
+                                    state = RewriteState::RewrittenValid;
+                                    Arc::new(Column::new(&proj_expr.alias, index)) as _
+                                })
+                            },
+                        )
+                    })
+                    .map_or_else(
+                        || Ok(Transformed::no(expr)),
+                        |c| Ok(Transformed::yes(c)),
+                    )
+            }
+        })
+        .data()?;
+
+    match state {
+        RewriteState::RewrittenInvalid => Ok(None),
+        // Both Unchanged and RewrittenValid are valid:
+        // - Unchanged means no columns to rewrite (e.g., literals)
+        // - RewrittenValid means columns were successfully rewritten
+        RewriteState::Unchanged | RewriteState::RewrittenValid => Ok(Some(new_expr)),
+    }
+}
+
+/// Stores target expressions, along with their indices, that associate with a
+/// source expression in a projection mapping.
+#[derive(Clone, Debug, Default)]
+pub struct ProjectionTargets {
+    /// A non-empty vector of pairs of target expressions and their indices.
+    /// Consider using a special non-empty collection type in the future (e.g.
+    /// if Rust provides one in the standard library).
+    exprs_indices: Vec<(Arc<dyn PhysicalExpr>, usize)>,
+}
+
+impl ProjectionTargets {
+    /// Returns the first target expression and its index.
+    pub fn first(&self) -> &(Arc<dyn PhysicalExpr>, usize) {
+        // Since the vector is non-empty, we can safely unwrap:
+        self.exprs_indices.first().unwrap()
+    }
+
+    /// Adds a target expression and its index to the list of targets.
+    pub fn push(&mut self, target: (Arc<dyn PhysicalExpr>, usize)) {
+        self.exprs_indices.push(target);
+    }
+}
+
+impl Deref for ProjectionTargets {
+    type Target = [(Arc<dyn PhysicalExpr>, usize)];
+
+    fn deref(&self) -> &Self::Target {
+        &self.exprs_indices
+    }
+}
+
+impl From<Vec<(Arc<dyn PhysicalExpr>, usize)>> for ProjectionTargets {
+    fn from(exprs_indices: Vec<(Arc<dyn PhysicalExpr>, usize)>) -> Self {
+        Self { exprs_indices }
+    }
+}
+
+/// Stores the mapping between source expressions and target expressions for a
+/// projection.
+#[derive(Clone, Debug)]
+pub struct ProjectionMapping {
+    /// Mapping between source expressions and target expressions.
+    /// Vector indices correspond to the indices after projection.
+    map: IndexMap<Arc<dyn PhysicalExpr>, ProjectionTargets>,
+}
+
+impl ProjectionMapping {
+    /// Constructs the mapping between a projection's input and output
+    /// expressions.
+    ///
+    /// For example, given the input projection expressions (`a + b`, `c + d`)
+    /// and an output schema with two columns `"c + d"` and `"a + b"`, the
+    /// projection mapping would be:
+    ///
+    /// ```text
+    ///  [0]: (c + d, [(col("c + d"), 0)])
+    ///  [1]: (a + b, [(col("a + b"), 1)])
+    /// ```
+    ///
+    /// where `col("c + d")` means the column named `"c + d"`.
+    pub fn try_new(
+        expr: impl IntoIterator<Item = (Arc<dyn PhysicalExpr>, String)>,
+        input_schema: &SchemaRef,
+    ) -> Result<Self> {
+        // Construct a map from the input expressions to the output expression of the projection:
+        let mut map = IndexMap::<_, ProjectionTargets>::new();
+        for (expr_idx, (expr, name)) in expr.into_iter().enumerate() {
+            let target_expr = Arc::new(Column::new(&name, expr_idx)) as _;
+            let source_expr = expr.transform_down(|e| match e.as_any().downcast_ref::<Column>() {
+                Some(col) => {
+                    // Sometimes, an expression and its name in the input_schema
+                    // doesn't match. This can cause problems, so we make sure
+                    // that the expression name matches with the name in `input_schema`.
+                    // Conceptually, `source_expr` and `expression` should be the same.
+                    let idx = col.index();
+                    let matching_field = input_schema.field(idx);
+                    let matching_name = matching_field.name();
+                    assert_or_internal_err!(
+                        col.name() == matching_name,
+                        "Input field name {matching_name} does not match with the projection expression {}",
+                        col.name()
+                    );
+                    let matching_column = Column::new(matching_name, idx);
+                    Ok(Transformed::yes(Arc::new(matching_column)))
+                }
+                None => Ok(Transformed::no(e)),
+            })
+            .data()?;
+            map.entry(source_expr)
+                .or_default()
+                .push((target_expr, expr_idx));
+        }
+        Ok(Self { map })
+    }
+
+    /// Constructs a subset mapping using the provided indices.
+    ///
+    /// This is used when the output is a subset of the input without any
+    /// other transformations. The indices are for columns in the schema.
+    pub fn from_indices(indices: &[usize], schema: &SchemaRef) -> Result<Self> {
+        let projection_exprs = indices.iter().map(|index| {
+            let field = schema.field(*index);
+            let column = Arc::new(Column::new(field.name(), *index));
+            (column as _, field.name().clone())
+        });
+        ProjectionMapping::try_new(projection_exprs, schema)
+    }
+}
+
+impl Deref for ProjectionMapping {
+    type Target = IndexMap<Arc<dyn PhysicalExpr>, ProjectionTargets>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.map
+    }
+}
+
+impl FromIterator<(Arc<dyn PhysicalExpr>, ProjectionTargets)> for ProjectionMapping {
+    fn from_iter<T: IntoIterator<Item = (Arc<dyn PhysicalExpr>, ProjectionTargets)>>(
+        iter: T,
+    ) -> Self {
+        Self {
+            map: IndexMap::from_iter(iter),
+        }
+    }
+}
+
+/// Projects a slice of [LexOrdering]s onto the given schema.
+///
+/// This is a convenience wrapper that applies [project_ordering] to each
+/// input ordering and collects the successful projections:
+/// - For each input ordering, the result of [project_ordering] is appended to
+///   the output if it is `Some(...)`.
+/// - Order is preserved and no deduplication is attempted.
+/// - If none of the input orderings can be projected, an empty `Vec` is
+///   returned.
+///
+/// See [project_ordering] for the semantics of projecting a single
+/// [LexOrdering].
+pub fn project_orderings(
+    orderings: &[LexOrdering],
+    schema: &SchemaRef,
+) -> Vec<LexOrdering> {
+    let mut projected_orderings = vec![];
+
+    for ordering in orderings {
+        projected_orderings.extend(project_ordering(ordering, schema));
+    }
+
+    projected_orderings
+}
+
+/// Projects a single [LexOrdering] onto the given schema.
+///
+/// This function attempts to rewrite every [PhysicalSortExpr] in the provided
+/// [LexOrdering] so that any [Column] expressions point at the correct field
+/// indices in `schema`.
+///
+/// Key details:
+/// - Columns are matched by name, not by index. The index of each matched
+///   column is looked up with [Schema::column_with_name](arrow::datatypes::Schema::column_with_name) and a new
+///   [Column] with the correct [index](Column::index) is substituted.
+/// - If an expression references a column name that does not exist in
+///   `schema`, projection of the current ordering stops and only the already
+///   rewritten prefix is kept. This models the fact that a lexicographical
+///   ordering remains valid for any leading prefix whose expressions are
+///   present in the projected schema.
+/// - If no expressions can be projected (i.e. the first one is missing), the
+///   function returns `None`.
+///
+/// Return value:
+/// - `Some(LexOrdering)` if at least one sort expression could be projected.
+///   The returned ordering may be a strict prefix of the input ordering.
+/// - `None` if no part of the ordering can be projected onto `schema`.
+///
+/// Example
+///
+/// Suppose we have an input ordering `[col("a@0"), col("b@1")]` but the projected
+/// schema only contains b and not a. The result will be `Some([col("a@0")])`. In other
+/// words, the column reference is reindexed to match the projected schema.
+/// If neither a nor b is present, the result will be None.
+pub fn project_ordering(
+    ordering: &LexOrdering,
+    schema: &SchemaRef,
+) -> Option<LexOrdering> {
+    let mut projected_exprs = vec![];
+    for PhysicalSortExpr { expr, options } in ordering.iter() {
+        let transformed = Arc::clone(expr).transform_up(|expr| {
+            let Some(col) = expr.as_any().downcast_ref::<Column>() else {
+                return Ok(Transformed::no(expr));
+            };
+
+            let name = col.name();
+            if let Some((idx, _)) = schema.column_with_name(name) {
+                // Compute the new column expression (with correct index) after projection:
+                Ok(Transformed::yes(Arc::new(Column::new(name, idx))))
+            } else {
+                // Cannot find expression in the projected_schema,
+                // signal this using an Err result
+                plan_err!("")
+            }
+        });
+
+        match transformed {
+            Ok(transformed) => {
+                projected_exprs.push(PhysicalSortExpr::new(transformed.data, *options));
+            }
+            Err(_) => {
+                // Err result indicates an expression could not be found in the
+                // projected_schema, stop iterating since rest of the orderings are violated
+                break;
+            }
+        }
+    }
+
+    LexOrdering::new(projected_exprs)
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use std::collections::HashMap;
+
+    use super::*;
+    use crate::equivalence::{EquivalenceProperties, convert_to_orderings};
+    use crate::expressions::{BinaryExpr, Literal, col};
+    use crate::utils::tests::TestScalarUDF;
+    use crate::{PhysicalExprRef, ScalarFunctionExpr};
+
+    use arrow::compute::SortOptions;
+    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::stats::Precision;
+    use datafusion_common::{ScalarValue, Statistics};
+    use datafusion_expr::{Operator, ScalarUDF};
+    use insta::assert_snapshot;
+
+    pub(crate) fn output_schema(
+        mapping: &ProjectionMapping,
+        input_schema: &Arc<Schema>,
+    ) -> Result<SchemaRef> {
+        // Calculate output schema:
+        let mut fields = vec![];
+        for (source, targets) in mapping.iter() {
+            let data_type = source.data_type(input_schema)?;
+            let nullable = source.nullable(input_schema)?;
+            for (target, _) in targets.iter() {
+                let Some(column) = target.as_any().downcast_ref::<Column>() else {
+                    return plan_err!("Expects to have column");
+                };
+                fields.push(Field::new(column.name(), data_type.clone(), nullable));
+            }
+        }
+
+        let output_schema = Arc::new(Schema::new_with_metadata(
+            fields,
+            input_schema.metadata().clone(),
+        ));
+
+        Ok(output_schema)
+    }
+
+    #[test]
+    fn project_orderings() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+            Field::new("d", DataType::Int32, true),
+            Field::new("e", DataType::Int32, true),
+            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
+        ]));
+        let col_a = &col("a", &schema)?;
+        let col_b = &col("b", &schema)?;
+        let col_c = &col("c", &schema)?;
+        let col_d = &col("d", &schema)?;
+        let col_e = &col("e", &schema)?;
+        let col_ts = &col("ts", &schema)?;
+        let a_plus_b = Arc::new(BinaryExpr::new(
+            Arc::clone(col_a),
+            Operator::Plus,
+            Arc::clone(col_b),
+        )) as Arc<dyn PhysicalExpr>;
+        let b_plus_d = Arc::new(BinaryExpr::new(
+            Arc::clone(col_b),
+            Operator::Plus,
+            Arc::clone(col_d),
+        )) as Arc<dyn PhysicalExpr>;
+        let b_plus_e = Arc::new(BinaryExpr::new(
+            Arc::clone(col_b),
+            Operator::Plus,
+            Arc::clone(col_e),
+        )) as Arc<dyn PhysicalExpr>;
+        let c_plus_d = Arc::new(BinaryExpr::new(
+            Arc::clone(col_c),
+            Operator::Plus,
+            Arc::clone(col_d),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let option_asc = SortOptions {
+            descending: false,
+            nulls_first: false,
+        };
+        let option_desc = SortOptions {
+            descending: true,
+            nulls_first: true,
+        };
+
+        let test_cases = vec![
+            // ---------- TEST CASE 1 ------------
+            (
+                // orderings
+                vec![
+                    // [b ASC]
+                    vec![(col_b, option_asc)],
+                ],
+                // projection exprs
+                vec![(col_b, "b_new".to_string()), (col_a, "a_new".to_string())],
+                // expected
+                vec![
+                    // [b_new ASC]
+                    vec![("b_new", option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 2 ------------
+            (
+                // orderings
+                vec![
+                    // empty ordering
+                ],
+                // projection exprs
+                vec![(col_c, "c_new".to_string()), (col_b, "b_new".to_string())],
+                // expected
+                vec![
+                    // no ordering at the output
+                ],
+            ),
+            // ---------- TEST CASE 3 ------------
+            (
+                // orderings
+                vec![
+                    // [ts ASC]
+                    vec![(col_ts, option_asc)],
+                ],
+                // projection exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_ts, "ts_new".to_string()),
+                ],
+                // expected
+                vec![
+                    // [ts_new ASC]
+                    vec![("ts_new", option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 4 ------------
+            (
+                // orderings
+                vec![
+                    // [a ASC, ts ASC]
+                    vec![(col_a, option_asc), (col_ts, option_asc)],
+                    // [b ASC, ts ASC]
+                    vec![(col_b, option_asc), (col_ts, option_asc)],
+                ],
+                // projection exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_ts, "ts_new".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, ts_new ASC]
+                    vec![("a_new", option_asc), ("ts_new", option_asc)],
+                    // [b_new ASC, ts_new ASC]
+                    vec![("b_new", option_asc), ("ts_new", option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 5 ------------
+            (
+                // orderings
+                vec![
+                    // [a + b ASC]
+                    vec![(&a_plus_b, option_asc)],
+                ],
+                // projection exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (&a_plus_b, "a+b".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a + b ASC]
+                    vec![("a+b", option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 6 ------------
+            (
+                // orderings
+                vec![
+                    // [a + b ASC, c ASC]
+                    vec![(&a_plus_b, option_asc), (col_c, option_asc)],
+                ],
+                // projection exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_c, "c_new".to_string()),
+                    (&a_plus_b, "a+b".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a + b ASC, c_new ASC]
+                    vec![("a+b", option_asc), ("c_new", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 7 ----------
+            (
+                vec![
+                    // [a ASC, b ASC, c ASC]
+                    vec![(col_a, option_asc), (col_b, option_asc)],
+                    // [a ASC, d ASC]
+                    vec![(col_a, option_asc), (col_d, option_asc)],
+                ],
+                // b as b_new, a as a_new, d as d_new b+d
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_d, "d_new".to_string()),
+                    (&b_plus_d, "b+d".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, b_new ASC]
+                    vec![("a_new", option_asc), ("b_new", option_asc)],
+                    // [a_new ASC, d_new ASC]
+                    vec![("a_new", option_asc), ("d_new", option_asc)],
+                    // [a_new ASC, b+d ASC]
+                    vec![("a_new", option_asc), ("b+d", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 8 ----------
+            (
+                // orderings
+                vec![
+                    // [b+d ASC]
+                    vec![(&b_plus_d, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_d, "d_new".to_string()),
+                    (&b_plus_d, "b+d".to_string()),
+                ],
+                // expected
+                vec![
+                    // [b+d ASC]
+                    vec![("b+d", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 9 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, d ASC, b ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_d, option_asc),
+                        (col_b, option_asc),
+                    ],
+                    // [c ASC]
+                    vec![(col_c, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_d, "d_new".to_string()),
+                    (col_c, "c_new".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, d_new ASC, b_new ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("d_new", option_asc),
+                        ("b_new", option_asc),
+                    ],
+                    // [c_new ASC],
+                    vec![("c_new", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 10 ----------
+            (
+                vec![
+                    // [a ASC, b ASC, c ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_b, option_asc),
+                        (col_c, option_asc),
+                    ],
+                    // [a ASC, d ASC]
+                    vec![(col_a, option_asc), (col_d, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_c, "c_new".to_string()),
+                    (&c_plus_d, "c+d".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, b_new ASC, c_new ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("b_new", option_asc),
+                        ("c_new", option_asc),
+                    ],
+                    // [a_new ASC, b_new ASC, c+d ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("b_new", option_asc),
+                        ("c+d", option_asc),
+                    ],
+                ],
+            ),
+            // ------- TEST CASE 11 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, b ASC]
+                    vec![(col_a, option_asc), (col_b, option_asc)],
+                    // [a ASC, d ASC]
+                    vec![(col_a, option_asc), (col_d, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (&b_plus_d, "b+d".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, b_new ASC]
+                    vec![("a_new", option_asc), ("b_new", option_asc)],
+                    // [a_new ASC, b + d ASC]
+                    vec![("a_new", option_asc), ("b+d", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 12 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, b ASC, c ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_b, option_asc),
+                        (col_c, option_asc),
+                    ],
+                ],
+                // proj exprs
+                vec![(col_c, "c_new".to_string()), (col_a, "a_new".to_string())],
+                // expected
+                vec![
+                    // [a_new ASC]
+                    vec![("a_new", option_asc)],
+                ],
+            ),
+            // ------- TEST CASE 13 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, b ASC, c ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_b, option_asc),
+                        (col_c, option_asc),
+                    ],
+                    // [a ASC, a + b ASC, c ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (&a_plus_b, option_asc),
+                        (col_c, option_asc),
+                    ],
+                ],
+                // proj exprs
+                vec![
+                    (col_c, "c_new".to_string()),
+                    (col_b, "b_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (&a_plus_b, "a+b".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, b_new ASC, c_new ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("b_new", option_asc),
+                        ("c_new", option_asc),
+                    ],
+                    // [a_new ASC, a+b ASC, c_new ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("a+b", option_asc),
+                        ("c_new", option_asc),
+                    ],
+                ],
+            ),
+            // ------- TEST CASE 14 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, b ASC]
+                    vec![(col_a, option_asc), (col_b, option_asc)],
+                    // [c ASC, b ASC]
+                    vec![(col_c, option_asc), (col_b, option_asc)],
+                    // [d ASC, e ASC]
+                    vec![(col_d, option_asc), (col_e, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_c, "c_new".to_string()),
+                    (col_d, "d_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (&b_plus_e, "b+e".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, d_new ASC, b+e ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("d_new", option_asc),
+                        ("b+e", option_asc),
+                    ],
+                    // [d_new ASC, a_new ASC, b+e ASC]
+                    vec![
+                        ("d_new", option_asc),
+                        ("a_new", option_asc),
+                        ("b+e", option_asc),
+                    ],
+                    // [c_new ASC, d_new ASC, b+e ASC]
+                    vec![
+                        ("c_new", option_asc),
+                        ("d_new", option_asc),
+                        ("b+e", option_asc),
+                    ],
+                    // [d_new ASC, c_new ASC, b+e ASC]
+                    vec![
+                        ("d_new", option_asc),
+                        ("c_new", option_asc),
+                        ("b+e", option_asc),
+                    ],
+                ],
+            ),
+            // ------- TEST CASE 15 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, c ASC, b ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_c, option_asc),
+                        (col_b, option_asc),
+                    ],
+                ],
+                // proj exprs
+                vec![
+                    (col_c, "c_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (&a_plus_b, "a+b".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, d_new ASC, b+e ASC]
+                    vec![
+                        ("a_new", option_asc),
+                        ("c_new", option_asc),
+                        ("a+b", option_asc),
+                    ],
+                ],
+            ),
+            // ------- TEST CASE 16 ----------
+            (
+                // orderings
+                vec![
+                    // [a ASC, b ASC]
+                    vec![(col_a, option_asc), (col_b, option_asc)],
+                    // [c ASC, b DESC]
+                    vec![(col_c, option_asc), (col_b, option_desc)],
+                    // [e ASC]
+                    vec![(col_e, option_asc)],
+                ],
+                // proj exprs
+                vec![
+                    (col_c, "c_new".to_string()),
+                    (col_a, "a_new".to_string()),
+                    (col_b, "b_new".to_string()),
+                    (&b_plus_e, "b+e".to_string()),
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, b_new ASC]
+                    vec![("a_new", option_asc), ("b_new", option_asc)],
+                    // [a_new ASC, b_new ASC]
+                    vec![("a_new", option_asc), ("b+e", option_asc)],
+                    // [c_new ASC, b_new DESC]
+                    vec![("c_new", option_asc), ("b_new", option_desc)],
+                ],
+            ),
+        ];
+
+        for (idx, (orderings, proj_exprs, expected)) in test_cases.into_iter().enumerate()
+        {
+            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+            let orderings = convert_to_orderings(&orderings);
+            eq_properties.add_orderings(orderings);
+
+            let proj_exprs = proj_exprs
+                .into_iter()
+                .map(|(expr, name)| (Arc::clone(expr), name));
+            let projection_mapping = ProjectionMapping::try_new(proj_exprs, &schema)?;
+            let output_schema = output_schema(&projection_mapping, &schema)?;
+
+            let expected = expected
+                .into_iter()
+                .map(|ordering| {
+                    ordering
+                        .into_iter()
+                        .map(|(name, options)| {
+                            (col(name, &output_schema).unwrap(), options)
+                        })
+                        .collect::<Vec<_>>()
+                })
+                .collect::<Vec<_>>();
+            let expected = convert_to_orderings(&expected);
+
+            let projected_eq = eq_properties.project(&projection_mapping, output_schema);
+            let orderings = projected_eq.oeq_class();
+
+            let err_msg = format!(
+                "test_idx: {idx:?}, actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
+            );
+
+            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
+            for expected_ordering in &expected {
+                assert!(orderings.contains(expected_ordering), "{}", err_msg)
+            }
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn project_orderings2() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+            Field::new("d", DataType::Int32, true),
+            Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true),
+        ]));
+        let col_a = &col("a", &schema)?;
+        let col_b = &col("b", &schema)?;
+        let col_c = &col("c", &schema)?;
+        let col_ts = &col("ts", &schema)?;
+        let a_plus_b = Arc::new(BinaryExpr::new(
+            Arc::clone(col_a),
+            Operator::Plus,
+            Arc::clone(col_b),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let test_fun = Arc::new(ScalarUDF::new_from_impl(TestScalarUDF::new()));
+
+        let round_c = Arc::new(ScalarFunctionExpr::try_new(
+            test_fun,
+            vec![Arc::clone(col_c)],
+            &schema,
+            Arc::new(ConfigOptions::default()),
+        )?) as PhysicalExprRef;
+
+        let option_asc = SortOptions {
+            descending: false,
+            nulls_first: false,
+        };
+
+        let proj_exprs = vec![
+            (col_b, "b_new".to_string()),
+            (col_a, "a_new".to_string()),
+            (col_c, "c_new".to_string()),
+            (&round_c, "round_c_res".to_string()),
+        ];
+        let proj_exprs = proj_exprs
+            .into_iter()
+            .map(|(expr, name)| (Arc::clone(expr), name));
+        let projection_mapping = ProjectionMapping::try_new(proj_exprs, &schema)?;
+        let output_schema = output_schema(&projection_mapping, &schema)?;
+
+        let col_a_new = &col("a_new", &output_schema)?;
+        let col_b_new = &col("b_new", &output_schema)?;
+        let col_c_new = &col("c_new", &output_schema)?;
+        let col_round_c_res = &col("round_c_res", &output_schema)?;
+        let a_new_plus_b_new = Arc::new(BinaryExpr::new(
+            Arc::clone(col_a_new),
+            Operator::Plus,
+            Arc::clone(col_b_new),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let test_cases = [
+            // ---------- TEST CASE 1 ------------
+            (
+                // orderings
+                vec![
+                    // [a ASC]
+                    vec![(col_a, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [b_new ASC]
+                    vec![(col_a_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 2 ------------
+            (
+                // orderings
+                vec![
+                    // [a+b ASC]
+                    vec![(&a_plus_b, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [b_new ASC]
+                    vec![(&a_new_plus_b_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 3 ------------
+            (
+                // orderings
+                vec![
+                    // [a ASC, ts ASC]
+                    vec![(col_a, option_asc), (col_ts, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, date_bin_res ASC]
+                    vec![(col_a_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 4 ------------
+            (
+                // orderings
+                vec![
+                    // [a ASC, ts ASC, b ASC]
+                    vec![
+                        (col_a, option_asc),
+                        (col_ts, option_asc),
+                        (col_b, option_asc),
+                    ],
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, date_bin_res ASC]
+                    vec![(col_a_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 5 ------------
+            (
+                // orderings
+                vec![
+                    // [a ASC, c ASC]
+                    vec![(col_a, option_asc), (col_c, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [a_new ASC, round_c_res ASC, c_new ASC]
+                    vec![(col_a_new, option_asc), (col_round_c_res, option_asc)],
+                    // [a_new ASC, c_new ASC]
+                    vec![(col_a_new, option_asc), (col_c_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 6 ------------
+            (
+                // orderings
+                vec![
+                    // [c ASC, b ASC]
+                    vec![(col_c, option_asc), (col_b, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [round_c_res ASC]
+                    vec![(col_round_c_res, option_asc)],
+                    // [c_new ASC, b_new ASC]
+                    vec![(col_c_new, option_asc), (col_b_new, option_asc)],
+                ],
+            ),
+            // ---------- TEST CASE 7 ------------
+            (
+                // orderings
+                vec![
+                    // [a+b ASC, c ASC]
+                    vec![(&a_plus_b, option_asc), (col_c, option_asc)],
+                ],
+                // expected
+                vec![
+                    // [a+b ASC, round(c) ASC, c_new ASC]
+                    vec![
+                        (&a_new_plus_b_new, option_asc),
+                        (col_round_c_res, option_asc),
+                    ],
+                    // [a+b ASC, c_new ASC]
+                    vec![(&a_new_plus_b_new, option_asc), (col_c_new, option_asc)],
+                ],
+            ),
+        ];
+
+        for (idx, (orderings, expected)) in test_cases.iter().enumerate() {
+            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+
+            let orderings = convert_to_orderings(orderings);
+            eq_properties.add_orderings(orderings);
+
+            let expected = convert_to_orderings(expected);
+
+            let projected_eq =
+                eq_properties.project(&projection_mapping, Arc::clone(&output_schema));
+            let orderings = projected_eq.oeq_class();
+
+            let err_msg = format!(
+                "test idx: {idx:?}, actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
+            );
+
+            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
+            for expected_ordering in &expected {
+                assert!(orderings.contains(expected_ordering), "{}", err_msg)
+            }
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn project_orderings3() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+            Field::new("d", DataType::Int32, true),
+            Field::new("e", DataType::Int32, true),
+            Field::new("f", DataType::Int32, true),
+        ]));
+        let col_a = &col("a", &schema)?;
+        let col_b = &col("b", &schema)?;
+        let col_c = &col("c", &schema)?;
+        let col_d = &col("d", &schema)?;
+        let col_e = &col("e", &schema)?;
+        let col_f = &col("f", &schema)?;
+        let a_plus_b = Arc::new(BinaryExpr::new(
+            Arc::clone(col_a),
+            Operator::Plus,
+            Arc::clone(col_b),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let option_asc = SortOptions {
+            descending: false,
+            nulls_first: false,
+        };
+
+        let proj_exprs = vec![
+            (col_c, "c_new".to_string()),
+            (col_d, "d_new".to_string()),
+            (&a_plus_b, "a+b".to_string()),
+        ];
+        let proj_exprs = proj_exprs
+            .into_iter()
+            .map(|(expr, name)| (Arc::clone(expr), name));
+        let projection_mapping = ProjectionMapping::try_new(proj_exprs, &schema)?;
+        let output_schema = output_schema(&projection_mapping, &schema)?;
+
+        let col_a_plus_b_new = &col("a+b", &output_schema)?;
+        let col_c_new = &col("c_new", &output_schema)?;
+        let col_d_new = &col("d_new", &output_schema)?;
+
+        let test_cases = vec![
+            // ---------- TEST CASE 1 ------------
+            (
+                // orderings
+                vec![
+                    // [d ASC, b ASC]
+                    vec![(col_d, option_asc), (col_b, option_asc)],
+                    // [c ASC, a ASC]
+                    vec![(col_c, option_asc), (col_a, option_asc)],
+                ],
+                // equal conditions
+                vec![],
+                // expected
+                vec![
+                    // [d_new ASC, c_new ASC, a+b ASC]
+                    vec![
+                        (col_d_new, option_asc),
+                        (col_c_new, option_asc),
+                        (col_a_plus_b_new, option_asc),
+                    ],
+                    // [c_new ASC, d_new ASC, a+b ASC]
+                    vec![
+                        (col_c_new, option_asc),
+                        (col_d_new, option_asc),
+                        (col_a_plus_b_new, option_asc),
+                    ],
+                ],
+            ),
+            // ---------- TEST CASE 2 ------------
+            (
+                // orderings
+                vec![
+                    // [d ASC, b ASC]
+                    vec![(col_d, option_asc), (col_b, option_asc)],
+                    // [c ASC, e ASC], Please note that a=e
+                    vec![(col_c, option_asc), (col_e, option_asc)],
+                ],
+                // equal conditions
+                vec![(col_e, col_a)],
+                // expected
+                vec![
+                    // [d_new ASC, c_new ASC, a+b ASC]
+                    vec![
+                        (col_d_new, option_asc),
+                        (col_c_new, option_asc),
+                        (col_a_plus_b_new, option_asc),
+                    ],
+                    // [c_new ASC, d_new ASC, a+b ASC]
+                    vec![
+                        (col_c_new, option_asc),
+                        (col_d_new, option_asc),
+                        (col_a_plus_b_new, option_asc),
+                    ],
+                ],
+            ),
+            // ---------- TEST CASE 3 ------------
+            (
+                // orderings
+                vec![
+                    // [d ASC, b ASC]
+                    vec![(col_d, option_asc), (col_b, option_asc)],
+                    // [c ASC, e ASC], Please note that a=f
+                    vec![(col_c, option_asc), (col_e, option_asc)],
+                ],
+                // equal conditions
+                vec![(col_a, col_f)],
+                // expected
+                vec![
+                    // [d_new ASC]
+                    vec![(col_d_new, option_asc)],
+                    // [c_new ASC]
+                    vec![(col_c_new, option_asc)],
+                ],
+            ),
+        ];
+        for (orderings, equal_columns, expected) in test_cases {
+            let mut eq_properties = EquivalenceProperties::new(Arc::clone(&schema));
+            for (lhs, rhs) in equal_columns {
+                eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))?;
+            }
+
+            let orderings = convert_to_orderings(&orderings);
+            eq_properties.add_orderings(orderings);
+
+            let expected = convert_to_orderings(&expected);
+
+            let projected_eq =
+                eq_properties.project(&projection_mapping, Arc::clone(&output_schema));
+            let orderings = projected_eq.oeq_class();
+
+            let err_msg = format!(
+                "actual: {orderings:?}, expected: {expected:?}, projection_mapping: {projection_mapping:?}"
+            );
+
+            assert_eq!(orderings.len(), expected.len(), "{err_msg}");
+            for expected_ordering in &expected {
+                assert!(orderings.contains(expected_ordering), "{}", err_msg)
+            }
+        }
+
+        Ok(())
+    }
+
+    fn get_stats() -> Statistics {
+        Statistics {
+            num_rows: Precision::Exact(5),
+            total_byte_size: Precision::Exact(23),
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(5),
+                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
+                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
+                    null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(1),
+                    max_value: Precision::Exact(ScalarValue::from("x")),
+                    min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
+                    null_count: Precision::Exact(3),
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Absent,
+                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
+                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
+                    sum_value: Precision::Exact(ScalarValue::Float32(Some(5.5))),
+                    null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+            ],
+        }
+    }
+
+    fn get_schema() -> Schema {
+        let field_0 = Field::new("col0", DataType::Int64, false);
+        let field_1 = Field::new("col1", DataType::Utf8, false);
+        let field_2 = Field::new("col2", DataType::Float32, false);
+        Schema::new(vec![field_0, field_1, field_2])
+    }
+
+    #[test]
+    fn test_stats_projection_columns_only() {
+        let source = get_stats();
+        let schema = get_schema();
+
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col1", 1)),
+                alias: "col1".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "col0".to_string(),
+            },
+        ]);
+
+        let result = projection
+            .project_statistics(source, &projection.project_schema(&schema).unwrap())
+            .unwrap();
+
+        let expected = Statistics {
+            num_rows: Precision::Exact(5),
+            // Because there is a variable length Utf8 column we cannot calculate exact byte size after projection
+            // Thus we set it to Inexact (originally it was Exact(23))
+            total_byte_size: Precision::Inexact(23),
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(1),
+                    max_value: Precision::Exact(ScalarValue::from("x")),
+                    min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
+                    null_count: Precision::Exact(3),
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(5),
+                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
+                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
+                    null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_stats_projection_column_with_primitive_width_only() {
+        let source = get_stats();
+        let schema = get_schema();
+
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col2", 2)),
+                alias: "col2".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "col0".to_string(),
+            },
+        ]);
+
+        let result = projection
+            .project_statistics(source, &projection.project_schema(&schema).unwrap())
+            .unwrap();
+
+        let expected = Statistics {
+            num_rows: Precision::Exact(5),
+            total_byte_size: Precision::Exact(60),
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: Precision::Absent,
+                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
+                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
+                    sum_value: Precision::Exact(ScalarValue::Float32(Some(5.5))),
+                    null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(5),
+                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
+                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
+                    null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
+                },
+            ],
+        };
+
+        assert_eq!(result, expected);
+    }
+
+    // Tests for Projection struct
+
+    #[test]
+    fn test_projection_new() -> Result<()> {
+        let exprs = vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "a".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 1)),
+                alias: "b".to_string(),
+            },
+        ];
+        let projection = ProjectionExprs::new(exprs.clone());
+        assert_eq!(projection.as_ref().len(), 2);
+        Ok(())
+    }
+
+    #[test]
+    fn test_projection_from_vec() -> Result<()> {
+        let exprs = vec![ProjectionExpr {
+            expr: Arc::new(Column::new("x", 0)),
+            alias: "x".to_string(),
+        }];
+        let projection: ProjectionExprs = exprs.clone().into();
+        assert_eq!(projection.as_ref().len(), 1);
+        Ok(())
+    }
+
+    #[test]
+    fn test_projection_as_ref() -> Result<()> {
+        let exprs = vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col1", 0)),
+                alias: "col1".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col2", 1)),
+                alias: "col2".to_string(),
+            },
+        ];
+        let projection = ProjectionExprs::new(exprs);
+        let as_ref: &[ProjectionExpr] = projection.as_ref();
+        assert_eq!(as_ref.len(), 2);
+        Ok(())
+    }
+
+    #[test]
+    fn test_column_indices_multiple_columns() -> Result<()> {
+        // Test with reversed column order to ensure proper reordering
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 5)),
+                alias: "c".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 2)),
+                alias: "b".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "a".to_string(),
+            },
+        ]);
+        // Should return sorted indices regardless of projection order
+        assert_eq!(projection.column_indices(), vec![0, 2, 5]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_column_indices_duplicates() -> Result<()> {
+        // Test that duplicate column indices appear only once
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 1)),
+                alias: "a".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 3)),
+                alias: "b".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a2", 1)), // duplicate index
+                alias: "a2".to_string(),
+            },
+        ]);
+        assert_eq!(projection.column_indices(), vec![1, 3]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_column_indices_unsorted() -> Result<()> {
+        // Test that column indices are sorted in the output
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 5)),
+                alias: "c".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 1)),
+                alias: "a".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 3)),
+                alias: "b".to_string(),
+            },
+        ]);
+        assert_eq!(projection.column_indices(), vec![1, 3, 5]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_column_indices_complex_expr() -> Result<()> {
+        // Test with complex expressions containing multiple columns
+        let expr = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 1)),
+            Operator::Plus,
+            Arc::new(Column::new("b", 4)),
+        ));
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr,
+                alias: "sum".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 2)),
+                alias: "c".to_string(),
+            },
+        ]);
+        // Should return [1, 2, 4] - all columns used, sorted and deduplicated
+        assert_eq!(projection.column_indices(), vec![1, 2, 4]);
+        Ok(())
+    }
+
+    #[test]
+    fn test_column_indices_empty() -> Result<()> {
+        let projection = ProjectionExprs::new(vec![]);
+        assert_eq!(projection.column_indices(), Vec::<usize>::new());
+        Ok(())
+    }
+
+    #[test]
+    fn test_merge_simple_columns() -> Result<()> {
+        // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z
+        let base_projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 2)),
+                alias: "x".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 1)),
+                alias: "y".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "z".to_string(),
+            },
+        ]);
+
+        // Second projection: SELECT y@1 AS col2, x@0 AS col1
+        let top_projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("y", 1)),
+                alias: "col2".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("x", 0)),
+                alias: "col1".to_string(),
+            },
+        ]);
+
+        // Merge should produce: SELECT b@1 AS col2, c@2 AS col1
+        let merged = base_projection.try_merge(&top_projection)?;
+        assert_snapshot!(format!("{merged}"), @"Projection[b@1 AS col2, c@2 AS col1]");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merge_with_expressions() -> Result<()> {
+        // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z
+        let base_projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 2)),
+                alias: "x".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 1)),
+                alias: "y".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "z".to_string(),
+            },
+        ]);
+
+        // Second projection: SELECT y@1 + z@2 AS c2, x@0 + 1 AS c1
+        let top_projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("y", 1)),
+                    Operator::Plus,
+                    Arc::new(Column::new("z", 2)),
+                )),
+                alias: "c2".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("x", 0)),
+                    Operator::Plus,
+                    Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+                )),
+                alias: "c1".to_string(),
+            },
+        ]);
+
+        // Merge should produce: SELECT b@1 + a@0 AS c2, c@2 + 1 AS c1
+        let merged = base_projection.try_merge(&top_projection)?;
+        assert_snapshot!(format!("{merged}"), @"Projection[b@1 + a@0 AS c2, c@2 + 1 AS c1]");
+
+        Ok(())
+    }
+
+    #[test]
+    fn try_merge_error() {
+        // Create a base projection
+        let base = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "x".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("b", 1)),
+                alias: "y".to_string(),
+            },
+        ]);
+
+        // Create a top projection that references a non-existent column index
+        let top = ProjectionExprs::new(vec![ProjectionExpr {
+            expr: Arc::new(Column::new("z", 5)), // Invalid index
+            alias: "result".to_string(),
+        }]);
+
+        // Attempt to merge and expect an error
+        let err_msg = base.try_merge(&top).unwrap_err().to_string();
+        assert!(
+            err_msg.contains("Internal error: Column index 5 out of bounds for projected expressions of length 2"),
+            "Unexpected error message: {err_msg}",
+        );
+    }
+
+    #[test]
+    fn test_merge_empty_projection_with_literal() -> Result<()> {
+        // This test reproduces the issue from roundtrip_empty_projection test
+        // Query like: SELECT 1 FROM table
+        // where the file scan needs no columns (empty projection)
+        // but we project a literal on top
+
+        // Empty base projection (no columns needed from file)
+        let base_projection = ProjectionExprs::new(vec![]);
+
+        // Top projection with a literal expression: SELECT 1
+        let top_projection = ProjectionExprs::new(vec![ProjectionExpr {
+            expr: Arc::new(Literal::new(ScalarValue::Int64(Some(1)))),
+            alias: "Int64(1)".to_string(),
+        }]);
+
+        // This should succeed - literals don't reference columns so they should
+        // pass through unchanged when merged with an empty projection
+        let merged = base_projection.try_merge(&top_projection)?;
+        assert_snapshot!(format!("{merged}"), @"Projection[1 AS Int64(1)]");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_update_expr_with_literal() -> Result<()> {
+        // Test that update_expr correctly handles expressions without column references
+        let literal_expr: Arc<dyn PhysicalExpr> =
+            Arc::new(Literal::new(ScalarValue::Int64(Some(42))));
+        let empty_projection: Vec<ProjectionExpr> = vec![];
+
+        // Updating a literal with an empty projection should return the literal unchanged
+        let result = update_expr(&literal_expr, &empty_projection, true)?;
+        assert!(result.is_some(), "Literal expression should be valid");
+
+        let result_expr = result.unwrap();
+        assert_eq!(
+            result_expr
+                .as_any()
+                .downcast_ref::<Literal>()
+                .unwrap()
+                .value(),
+            &ScalarValue::Int64(Some(42))
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_update_expr_with_complex_literal_expr() -> Result<()> {
+        // Test update_expr with an expression containing both literals and a column
+        // This tests the case where we have: literal + column
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(Literal::new(ScalarValue::Int64(Some(10)))),
+            Operator::Plus,
+            Arc::new(Column::new("x", 0)),
+        ));
+
+        // Base projection that maps column 0 to a different expression
+        let base_projection = vec![ProjectionExpr {
+            expr: Arc::new(Column::new("a", 5)),
+            alias: "x".to_string(),
+        }];
+
+        // The expression should be updated: 10 + x@0 becomes 10 + a@5
+        let result = update_expr(&expr, &base_projection, true)?;
+        assert!(result.is_some(), "Expression should be valid");
+
+        let result_expr = result.unwrap();
+        let binary = result_expr
+            .as_any()
+            .downcast_ref::<BinaryExpr>()
+            .expect("Should be a BinaryExpr");
+
+        // Left side should still be the literal
+        assert!(binary.left().as_any().downcast_ref::<Literal>().is_some());
+
+        // Right side should be updated to reference column at index 5
+        let right_col = binary
+            .right()
+            .as_any()
+            .downcast_ref::<Column>()
+            .expect("Right should be a Column");
+        assert_eq!(right_col.index(), 5);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_schema_simple_columns() -> Result<()> {
+        // Input schema: [col0: Int64, col1: Utf8, col2: Float32]
+        let input_schema = get_schema();
+
+        // Projection: SELECT col2 AS c, col0 AS a
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col2", 2)),
+                alias: "c".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "a".to_string(),
+            },
+        ]);
+
+        let output_schema = projection.project_schema(&input_schema)?;
+
+        // Should have 2 fields
+        assert_eq!(output_schema.fields().len(), 2);
+
+        // First field should be "c" with Float32 type
+        assert_eq!(output_schema.field(0).name(), "c");
+        assert_eq!(output_schema.field(0).data_type(), &DataType::Float32);
+
+        // Second field should be "a" with Int64 type
+        assert_eq!(output_schema.field(1).name(), "a");
+        assert_eq!(output_schema.field(1).data_type(), &DataType::Int64);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_schema_with_expressions() -> Result<()> {
+        // Input schema: [col0: Int64, col1: Utf8, col2: Float32]
+        let input_schema = get_schema();
+
+        // Projection: SELECT col0 + 1 AS incremented
+        let projection = ProjectionExprs::new(vec![ProjectionExpr {
+            expr: Arc::new(BinaryExpr::new(
+                Arc::new(Column::new("col0", 0)),
+                Operator::Plus,
+                Arc::new(Literal::new(ScalarValue::Int64(Some(1)))),
+            )),
+            alias: "incremented".to_string(),
+        }]);
+
+        let output_schema = projection.project_schema(&input_schema)?;
+
+        // Should have 1 field
+        assert_eq!(output_schema.fields().len(), 1);
+
+        // Field should be "incremented" with Int64 type
+        assert_eq!(output_schema.field(0).name(), "incremented");
+        assert_eq!(output_schema.field(0).data_type(), &DataType::Int64);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_schema_preserves_metadata() -> Result<()> {
+        // Create schema with metadata
+        let mut metadata = HashMap::new();
+        metadata.insert("key".to_string(), "value".to_string());
+        let field_with_metadata =
+            Field::new("col0", DataType::Int64, false).with_metadata(metadata.clone());
+        let input_schema = Schema::new(vec![
+            field_with_metadata,
+            Field::new("col1", DataType::Utf8, false),
+        ]);
+
+        // Projection: SELECT col0 AS renamed
+        let projection = ProjectionExprs::new(vec![ProjectionExpr {
+            expr: Arc::new(Column::new("col0", 0)),
+            alias: "renamed".to_string(),
+        }]);
+
+        let output_schema = projection.project_schema(&input_schema)?;
+
+        // Should have 1 field
+        assert_eq!(output_schema.fields().len(), 1);
+
+        // Field should be "renamed" with metadata preserved
+        assert_eq!(output_schema.field(0).name(), "renamed");
+        assert_eq!(output_schema.field(0).metadata(), &metadata);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_schema_empty() -> Result<()> {
+        let input_schema = get_schema();
+        let projection = ProjectionExprs::new(vec![]);
+
+        let output_schema = projection.project_schema(&input_schema)?;
+
+        assert_eq!(output_schema.fields().len(), 0);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_statistics_columns_only() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection: SELECT col1 AS text, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col1", 1)),
+                alias: "text".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics (reordered from input)
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (col1 from input)
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::from("x"))
+        );
+
+        // Second column (col0 from input)
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(5)
+        );
+        assert_eq!(
+            output_stats.column_statistics[1].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(21)))
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_statistics_with_expressions() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with expression: SELECT col0 + 1 AS incremented, col1 AS text
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("col0", 0)),
+                    Operator::Plus,
+                    Arc::new(Literal::new(ScalarValue::Int64(Some(1)))),
+                )),
+                alias: "incremented".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col1", 1)),
+                alias: "text".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (expression) should have unknown statistics
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Absent
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Absent
+        );
+
+        // Second column (col1) should preserve statistics
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(1)
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_statistics_primitive_width_only() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with only primitive width columns: SELECT col2 AS f, col0 AS i
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col2", 2)),
+                alias: "f".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "i".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Total byte size should be recalculated for primitive types
+        // Float32 (4 bytes) + Int64 (8 bytes) = 12 bytes per row, 5 rows = 60 bytes
+        assert_eq!(output_stats.total_byte_size, Precision::Exact(60));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_project_statistics_empty() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        let projection = ProjectionExprs::new(vec![]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have no column statistics
+        assert_eq!(output_stats.column_statistics.len(), 0);
+
+        // Total byte size should be 0 for empty projection
+        assert_eq!(output_stats.total_byte_size, Precision::Exact(0));
+
+        Ok(())
+    }
+
+    // Test statistics calculation for non-null literal (numeric constant)
+    #[test]
+    fn test_project_statistics_with_literal() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with literal: SELECT 42 AS constant, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+                alias: "constant".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (literal 42) should have proper constant statistics
+        assert_eq!(
+            output_stats.column_statistics[0].min_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(42)))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].null_count,
+            Precision::Exact(0)
+        );
+        // Int64 is 8 bytes, 5 rows = 40 bytes
+        assert_eq!(
+            output_stats.column_statistics[0].byte_size,
+            Precision::Exact(40)
+        );
+        // For a constant column, sum_value = value * num_rows = 42 * 5 = 210
+        assert_eq!(
+            output_stats.column_statistics[0].sum_value,
+            Precision::Exact(ScalarValue::Int64(Some(210)))
+        );
+
+        // Second column (col0) should preserve statistics
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(5)
+        );
+        assert_eq!(
+            output_stats.column_statistics[1].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(21)))
+        );
+
+        Ok(())
+    }
+
+    // Test statistics calculation for NULL literal (constant NULL column)
+    #[test]
+    fn test_project_statistics_with_null_literal() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with NULL literal: SELECT NULL AS null_col, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Literal::new(ScalarValue::Int64(None))),
+                alias: "null_col".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (NULL literal) should have proper constant NULL statistics
+        assert_eq!(
+            output_stats.column_statistics[0].min_value,
+            Precision::Exact(ScalarValue::Int64(None))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::Int64(None))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1) // All NULLs are considered the same
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].null_count,
+            Precision::Exact(5) // All rows are NULL
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].byte_size,
+            Precision::Exact(0)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].sum_value,
+            Precision::Exact(ScalarValue::Int64(None))
+        );
+
+        // Second column (col0) should preserve statistics
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(5)
+        );
+        assert_eq!(
+            output_stats.column_statistics[1].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(21)))
+        );
+
+        Ok(())
+    }
+
+    // Test statistics calculation for complex type literal (e.g., Utf8 string)
+    #[test]
+    fn test_project_statistics_with_complex_type_literal() -> Result<()> {
+        let input_stats = get_stats();
+        let input_schema = get_schema();
+
+        // Projection with Utf8 literal (complex type): SELECT 'hello' AS text, col0 AS num
+        let projection = ProjectionExprs::new(vec![
+            ProjectionExpr {
+                expr: Arc::new(Literal::new(ScalarValue::Utf8(Some(
+                    "hello".to_string(),
+                )))),
+                alias: "text".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(Column::new("col0", 0)),
+                alias: "num".to_string(),
+            },
+        ]);
+
+        let output_stats = projection.project_statistics(
+            input_stats,
+            &projection.project_schema(&input_schema)?,
+        )?;
+
+        // Row count should be preserved
+        assert_eq!(output_stats.num_rows, Precision::Exact(5));
+
+        // Should have 2 column statistics
+        assert_eq!(output_stats.column_statistics.len(), 2);
+
+        // First column (Utf8 literal 'hello') should have proper constant statistics
+        // but byte_size should be Absent for complex types
+        assert_eq!(
+            output_stats.column_statistics[0].min_value,
+            Precision::Exact(ScalarValue::Utf8(Some("hello".to_string())))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].max_value,
+            Precision::Exact(ScalarValue::Utf8(Some("hello".to_string())))
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].distinct_count,
+            Precision::Exact(1)
+        );
+        assert_eq!(
+            output_stats.column_statistics[0].null_count,
+            Precision::Exact(0)
+        );
+        // Complex types (Utf8, List, etc.) should have byte_size = Absent
+        // because we can't calculate exact size without knowing the actual data
+        assert_eq!(
+            output_stats.column_statistics[0].byte_size,
+            Precision::Absent
+        );
+        // Non-numeric types (Utf8) should have sum_value = Absent
+        // because sum is only meaningful for numeric types
+        assert_eq!(
+            output_stats.column_statistics[0].sum_value,
+            Precision::Absent
+        );
+
+        // Second column (col0) should preserve statistics
+        assert_eq!(
+            output_stats.column_statistics[1].distinct_count,
+            Precision::Exact(5)
+        );
+        assert_eq!(
+            output_stats.column_statistics[1].max_value,
+            Precision::Exact(ScalarValue::Int64(Some(21)))
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs
index d014bbb74caa1..bbc03a5d14b2a 100644
--- a/datafusion/physical-expr/src/scalar_function.rs
+++ b/datafusion/physical-expr/src/scalar_function.rs
@@ -31,29 +31,31 @@
 
 use std::any::Any;
 use std::fmt::{self, Debug, Formatter};
-use std::hash::Hash;
+use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use crate::expressions::Literal;
 use crate::PhysicalExpr;
+use crate::expressions::Literal;
 
 use arrow::array::{Array, RecordBatch};
 use arrow::datatypes::{DataType, FieldRef, Schema};
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::config::{ConfigEntry, ConfigOptions};
+use datafusion_common::{Result, ScalarValue, internal_err};
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::sort_properties::ExprProperties;
-use datafusion_expr::type_coercion::functions::data_types_with_scalar_udf;
+use datafusion_expr::type_coercion::functions::fields_with_udf;
 use datafusion_expr::{
-    expr_vec_fmt, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF,
+    ColumnarValue, ExpressionPlacement, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF,
+    Volatility, expr_vec_fmt,
 };
 
 /// Physical expression of a scalar function
-#[derive(Eq, PartialEq, Hash)]
 pub struct ScalarFunctionExpr {
     fun: Arc<ScalarUDF>,
     name: String,
     args: Vec<Arc<dyn PhysicalExpr>>,
     return_field: FieldRef,
+    config_options: Arc<ConfigOptions>,
 }
 
 impl Debug for ScalarFunctionExpr {
@@ -74,12 +76,14 @@ impl ScalarFunctionExpr {
         fun: Arc<ScalarUDF>,
         args: Vec<Arc<dyn PhysicalExpr>>,
         return_field: FieldRef,
+        config_options: Arc<ConfigOptions>,
     ) -> Self {
         Self {
             fun,
             name: name.to_owned(),
             args,
             return_field,
+            config_options,
         }
     }
 
@@ -88,6 +92,7 @@ impl ScalarFunctionExpr {
         fun: Arc<ScalarUDF>,
         args: Vec<Arc<dyn PhysicalExpr>>,
         schema: &Schema,
+        config_options: Arc<ConfigOptions>,
     ) -> Result<Self> {
         let name = fun.name().to_string();
         let arg_fields = args
@@ -96,11 +101,7 @@ impl ScalarFunctionExpr {
             .collect::<Result<Vec<_>>>()?;
 
         // verify that input data types is consistent with function's `TypeSignature`
-        let arg_types = arg_fields
-            .iter()
-            .map(|f| f.data_type().clone())
-            .collect::<Vec<_>>();
-        data_types_with_scalar_udf(&arg_types, &fun)?;
+        fields_with_udf(&arg_fields, fun.as_ref())?;
 
         let arguments = args
             .iter()
@@ -120,6 +121,7 @@ impl ScalarFunctionExpr {
             name,
             args,
             return_field,
+            config_options,
         })
     }
 
@@ -156,6 +158,33 @@ impl ScalarFunctionExpr {
     pub fn nullable(&self) -> bool {
         self.return_field.is_nullable()
     }
+
+    pub fn config_options(&self) -> &ConfigOptions {
+        &self.config_options
+    }
+
+    /// Given an arbitrary PhysicalExpr attempt to downcast it to a ScalarFunctionExpr
+    /// and verify that its inner function is of type T.
+    /// If the downcast fails, or the function is not of type T, returns `None`.
+    /// Otherwise returns `Some(ScalarFunctionExpr)`.
+    pub fn try_downcast_func<T>(expr: &dyn PhysicalExpr) -> Option<&ScalarFunctionExpr>
+    where
+        T: 'static,
+    {
+        match expr.as_any().downcast_ref::<ScalarFunctionExpr>() {
+            Some(scalar_expr)
+                if scalar_expr
+                    .fun()
+                    .inner()
+                    .as_any()
+                    .downcast_ref::<T>()
+                    .is_some() =>
+            {
+                Some(scalar_expr)
+            }
+            _ => None,
+        }
+    }
 }
 
 impl fmt::Display for ScalarFunctionExpr {
@@ -164,6 +193,51 @@ impl fmt::Display for ScalarFunctionExpr {
     }
 }
 
+impl PartialEq for ScalarFunctionExpr {
+    fn eq(&self, o: &Self) -> bool {
+        if std::ptr::eq(self, o) {
+            // The equality implementation is somewhat expensive, so let's short-circuit when possible.
+            return true;
+        }
+        let Self {
+            fun,
+            name,
+            args,
+            return_field,
+            config_options,
+        } = self;
+        fun.eq(&o.fun)
+            && name.eq(&o.name)
+            && args.eq(&o.args)
+            && return_field.eq(&o.return_field)
+            && (Arc::ptr_eq(config_options, &o.config_options)
+                || sorted_config_entries(config_options)
+                    == sorted_config_entries(&o.config_options))
+    }
+}
+impl Eq for ScalarFunctionExpr {}
+impl Hash for ScalarFunctionExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self {
+            fun,
+            name,
+            args,
+            return_field,
+            config_options: _, // expensive to hash, and often equal
+        } = self;
+        fun.hash(state);
+        name.hash(state);
+        args.hash(state);
+        return_field.hash(state);
+    }
+}
+
+fn sorted_config_entries(config_options: &ConfigOptions) -> Vec<ConfigEntry> {
+    let mut entries = config_options.entries();
+    entries.sort_by(|l, r| l.key.cmp(&r.key));
+    entries
+}
+
 impl PhysicalExpr for ScalarFunctionExpr {
     /// Return a reference to Any that can be used for downcasting
     fn as_any(&self) -> &dyn Any {
@@ -202,21 +276,25 @@ impl PhysicalExpr for ScalarFunctionExpr {
             arg_fields,
             number_rows: batch.num_rows(),
             return_field: Arc::clone(&self.return_field),
+            config_options: Arc::clone(&self.config_options),
         })?;
 
-        if let ColumnarValue::Array(array) = &output {
-            if array.len() != batch.num_rows() {
-                // If the arguments are a non-empty slice of scalar values, we can assume that
-                // returning a one-element array is equivalent to returning a scalar.
-                let preserve_scalar =
-                    array.len() == 1 && !input_empty && input_all_scalar;
-                return if preserve_scalar {
-                    ScalarValue::try_from_array(array, 0).map(ColumnarValue::Scalar)
-                } else {
-                    internal_err!("UDF {} returned a different number of rows than expected. Expected: {}, Got: {}",
-                            self.name, batch.num_rows(), array.len())
-                };
-            }
+        if let ColumnarValue::Array(array) = &output
+            && array.len() != batch.num_rows()
+        {
+            // If the arguments are a non-empty slice of scalar values, we can assume that
+            // returning a one-element array is equivalent to returning a scalar.
+            let preserve_scalar = array.len() == 1 && !input_empty && input_all_scalar;
+            return if preserve_scalar {
+                ScalarValue::try_from_array(array, 0).map(ColumnarValue::Scalar)
+            } else {
+                internal_err!(
+                    "UDF {} returned a different number of rows than expected. Expected: {}, Got: {}",
+                    self.name,
+                    batch.num_rows(),
+                    array.len()
+                )
+            };
         }
         Ok(output)
     }
@@ -238,6 +316,7 @@ impl PhysicalExpr for ScalarFunctionExpr {
             Arc::clone(&self.fun),
             children,
             Arc::clone(&self.return_field),
+            Arc::clone(&self.config_options),
         )))
     }
 
@@ -279,4 +358,95 @@ impl PhysicalExpr for ScalarFunctionExpr {
         }
         write!(f, ")")
     }
+
+    fn is_volatile_node(&self) -> bool {
+        self.fun.signature().volatility == Volatility::Volatile
+    }
+
+    fn placement(&self) -> ExpressionPlacement {
+        let arg_placements: Vec<_> =
+            self.args.iter().map(|arg| arg.placement()).collect();
+        self.fun.placement(&arg_placements)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::expressions::Column;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_expr::{ScalarUDFImpl, Signature};
+    use datafusion_physical_expr_common::physical_expr::is_volatile;
+    use std::any::Any;
+
+    /// Test helper to create a mock UDF with a specific volatility
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct MockScalarUDF {
+        signature: Signature,
+    }
+
+    impl ScalarUDFImpl for MockScalarUDF {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "mock_function"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            Ok(DataType::Int32)
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some(42))))
+        }
+    }
+
+    #[test]
+    fn test_scalar_function_volatile_node() {
+        // Create a volatile UDF
+        let volatile_udf = Arc::new(ScalarUDF::from(MockScalarUDF {
+            signature: Signature::uniform(
+                1,
+                vec![DataType::Float32],
+                Volatility::Volatile,
+            ),
+        }));
+
+        // Create a non-volatile UDF
+        let stable_udf = Arc::new(ScalarUDF::from(MockScalarUDF {
+            signature: Signature::uniform(1, vec![DataType::Float32], Volatility::Stable),
+        }));
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Float32, false)]);
+        let args = vec![Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>];
+        let config_options = Arc::new(ConfigOptions::new());
+
+        // Test volatile function
+        let volatile_expr = ScalarFunctionExpr::try_new(
+            volatile_udf,
+            args.clone(),
+            &schema,
+            Arc::clone(&config_options),
+        )
+        .unwrap();
+
+        assert!(volatile_expr.is_volatile_node());
+        let volatile_arc: Arc<dyn PhysicalExpr> = Arc::new(volatile_expr);
+        assert!(is_volatile(&volatile_arc));
+
+        // Test non-volatile function
+        let stable_expr =
+            ScalarFunctionExpr::try_new(stable_udf, args, &schema, config_options)
+                .unwrap();
+
+        assert!(!stable_expr.is_volatile_node());
+        let stable_arc: Arc<dyn PhysicalExpr> = Arc::new(stable_expr);
+        assert!(!is_volatile(&stable_arc));
+    }
 }
diff --git a/datafusion/physical-expr/src/simplifier/const_evaluator.rs b/datafusion/physical-expr/src/simplifier/const_evaluator.rs
new file mode 100644
index 0000000000000..1f3781c537dd5
--- /dev/null
+++ b/datafusion/physical-expr/src/simplifier/const_evaluator.rs
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Constant expression evaluation for the physical expression simplifier
+
+use std::sync::Arc;
+
+use arrow::array::new_null_array;
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr_common::columnar_value::ColumnarValue;
+
+use crate::PhysicalExpr;
+use crate::expressions::{Column, Literal};
+
+/// Simplify expressions that consist only of literals by evaluating them.
+///
+/// This function checks if all children of the given expression are literals.
+/// If so, it evaluates the expression against a dummy RecordBatch and returns
+/// the result as a new Literal.
+///
+/// # Example transformations
+/// - `1 + 2` -> `3`
+/// - `(1 + 2) * 3` -> `9` (with bottom-up traversal)
+/// - `'hello' || ' world'` -> `'hello world'`
+#[deprecated(
+    since = "53.0.0",
+    note = "This function will be removed in a future release in favor of a private implementation that depends on other implementation details. Please open an issue if you have a use case for keeping it."
+)]
+pub fn simplify_const_expr(
+    expr: Arc<dyn PhysicalExpr>,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+    let batch = create_dummy_batch()?;
+    // If expr is already a const literal or can't be evaluated into one.
+    if expr.as_any().is::<Literal>() || (!can_evaluate_as_constant(&expr)) {
+        return Ok(Transformed::no(expr));
+    }
+
+    // Evaluate the expression
+    match expr.evaluate(&batch) {
+        Ok(ColumnarValue::Scalar(scalar)) => {
+            Ok(Transformed::yes(Arc::new(Literal::new(scalar))))
+        }
+        Ok(ColumnarValue::Array(arr)) if arr.len() == 1 => {
+            // Some operations return an array even for scalar inputs
+            let scalar = ScalarValue::try_from_array(&arr, 0)?;
+            Ok(Transformed::yes(Arc::new(Literal::new(scalar))))
+        }
+        Ok(_) => {
+            // Unexpected result - keep original expression
+            Ok(Transformed::no(expr))
+        }
+        Err(_) => {
+            // On error, keep original expression
+            // The expression might succeed at runtime due to short-circuit evaluation
+            // or other runtime conditions
+            Ok(Transformed::no(expr))
+        }
+    }
+}
+
+/// Simplify expressions whose immediate children are all literals.
+///
+/// This function only checks the direct children of the expression,
+/// not the entire subtree. It is designed to be used with bottom-up tree
+/// traversal, where children are simplified before parents.
+///
+/// # Example transformations
+/// - `1 + 2` -> `3`
+/// - `(1 + 2) * 3` -> `9` (with bottom-up traversal, inner expr simplified first)
+/// - `'hello' || ' world'` -> `'hello world'`
+pub(crate) fn simplify_const_expr_immediate(
+    expr: Arc<dyn PhysicalExpr>,
+    batch: &RecordBatch,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+    // Already a literal - nothing to do
+    if expr.as_any().is::<Literal>() {
+        return Ok(Transformed::no(expr));
+    }
+
+    // Column references cannot be evaluated at plan time
+    if expr.as_any().is::<Column>() {
+        return Ok(Transformed::no(expr));
+    }
+
+    // Volatile nodes cannot be evaluated at plan time
+    if expr.is_volatile_node() {
+        return Ok(Transformed::no(expr));
+    }
+
+    // Since transform visits bottom-up, children have already been simplified.
+    // If all children are now Literals, this node can be const-evaluated.
+    // This is O(k) where k = number of children, instead of O(subtree).
+    let all_children_literal = expr
+        .children()
+        .iter()
+        .all(|child| child.as_any().is::<Literal>());
+
+    if !all_children_literal {
+        return Ok(Transformed::no(expr));
+    }
+
+    // Evaluate the expression
+    match expr.evaluate(batch) {
+        Ok(ColumnarValue::Scalar(scalar)) => {
+            Ok(Transformed::yes(Arc::new(Literal::new(scalar))))
+        }
+        Ok(ColumnarValue::Array(arr)) if arr.len() == 1 => {
+            // Some operations return an array even for scalar inputs
+            let scalar = ScalarValue::try_from_array(&arr, 0)?;
+            Ok(Transformed::yes(Arc::new(Literal::new(scalar))))
+        }
+        Ok(_) => {
+            // Unexpected result - keep original expression
+            Ok(Transformed::no(expr))
+        }
+        Err(_) => {
+            // On error, keep original expression
+            // The expression might succeed at runtime due to short-circuit evaluation
+            // or other runtime conditions
+            Ok(Transformed::no(expr))
+        }
+    }
+}
+
+/// Create a 1-row dummy RecordBatch for evaluating constant expressions.
+///
+/// The batch is never actually accessed for data - it's just needed because
+/// the PhysicalExpr::evaluate API requires a RecordBatch. For expressions
+/// that only contain literals, the batch content is irrelevant.
+///
+/// This is the same approach used in the logical expression `ConstEvaluator`.
+pub(crate) fn create_dummy_batch() -> Result<RecordBatch> {
+    // RecordBatch requires at least one column
+    let dummy_schema = Arc::new(Schema::new(vec![Field::new("_", DataType::Null, true)]));
+    let col = new_null_array(&DataType::Null, 1);
+    Ok(RecordBatch::try_new(dummy_schema, vec![col])?)
+}
+
+fn can_evaluate_as_constant(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    let mut can_evaluate = true;
+
+    expr.apply(|e| {
+        if e.as_any().is::<Column>() || e.is_volatile_node() {
+            can_evaluate = false;
+            Ok(TreeNodeRecursion::Stop)
+        } else {
+            Ok(TreeNodeRecursion::Continue)
+        }
+    })
+    .expect("apply should not fail");
+
+    can_evaluate
+}
+
+/// Check if this expression has any column references.
+#[deprecated(
+    since = "53.0.0",
+    note = "This function isn't used internally and is trivial to implement, therefore it will be removed in a future release."
+)]
+pub fn has_column_references(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    let mut has_columns = false;
+    expr.apply(|expr| {
+        if expr.as_any().downcast_ref::<Column>().is_some() {
+            has_columns = true;
+            Ok(TreeNodeRecursion::Stop)
+        } else {
+            Ok(TreeNodeRecursion::Continue)
+        }
+    })
+    .expect("apply should not fail");
+    has_columns
+}
diff --git a/datafusion/physical-expr/src/simplifier/mod.rs b/datafusion/physical-expr/src/simplifier/mod.rs
new file mode 100644
index 0000000000000..3f3f8573449eb
--- /dev/null
+++ b/datafusion/physical-expr/src/simplifier/mod.rs
@@ -0,0 +1,622 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Simplifier for Physical Expressions
+
+use arrow::datatypes::Schema;
+use datafusion_common::{Result, tree_node::TreeNode};
+use std::sync::Arc;
+
+use crate::{
+    PhysicalExpr,
+    simplifier::{
+        const_evaluator::create_dummy_batch, unwrap_cast::unwrap_cast_in_comparison,
+    },
+};
+
+pub mod const_evaluator;
+pub mod not;
+pub mod unwrap_cast;
+
+const MAX_LOOP_COUNT: usize = 5;
+
+/// Simplifies physical expressions by applying various optimizations
+///
+/// This can be useful after adapting expressions from a table schema
+/// to a file schema. For example, casts added to match the types may
+/// potentially be unwrapped.
+pub struct PhysicalExprSimplifier<'a> {
+    schema: &'a Schema,
+}
+
+impl<'a> PhysicalExprSimplifier<'a> {
+    /// Create a new physical expression simplifier
+    pub fn new(schema: &'a Schema) -> Self {
+        Self { schema }
+    }
+
+    /// Simplify a physical expression
+    pub fn simplify(&self, expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExpr>> {
+        let mut current_expr = expr;
+        let mut count = 0;
+        let schema = self.schema;
+
+        let batch = create_dummy_batch()?;
+
+        while count < MAX_LOOP_COUNT {
+            count += 1;
+            let result = current_expr.transform(|node| {
+                #[cfg(debug_assertions)]
+                let original_type = node.data_type(schema).unwrap();
+
+                // Apply NOT expression simplification first, then unwrap cast optimization,
+                // then constant expression evaluation
+                #[expect(deprecated, reason = "`simplify_not_expr` is marked as deprecated until it's made private.")]
+                let rewritten = not::simplify_not_expr(node, schema)?
+                    .transform_data(|node| unwrap_cast_in_comparison(node, schema))?
+                    .transform_data(|node| {
+                        const_evaluator::simplify_const_expr_immediate(node, &batch)
+                    })?;
+
+                #[cfg(debug_assertions)]
+                assert_eq!(
+                    rewritten.data.data_type(schema).unwrap(),
+                    original_type,
+                    "Simplified expression should have the same data type as the original"
+                );
+
+                Ok(rewritten)
+            })?;
+
+            if !result.transformed {
+                return Ok(result.data);
+            }
+            current_expr = result.data;
+        }
+        Ok(current_expr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::expressions::{
+        BinaryExpr, CastExpr, Literal, NotExpr, TryCastExpr, col, in_list, lit,
+    };
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::Operator;
+
+    fn test_schema() -> Schema {
+        Schema::new(vec![
+            Field::new("c1", DataType::Int32, false),
+            Field::new("c2", DataType::Int64, false),
+            Field::new("c3", DataType::Utf8, false),
+        ])
+    }
+
+    fn not_test_schema() -> Schema {
+        Schema::new(vec![
+            Field::new("a", DataType::Boolean, false),
+            Field::new("b", DataType::Boolean, false),
+            Field::new("c", DataType::Int32, false),
+        ])
+    }
+
+    /// Helper function to extract a Literal from a PhysicalExpr
+    fn as_literal(expr: &Arc<dyn PhysicalExpr>) -> &Literal {
+        expr.as_any()
+            .downcast_ref::<Literal>()
+            .unwrap_or_else(|| panic!("Expected Literal, got: {expr}"))
+    }
+
+    /// Helper function to extract a BinaryExpr from a PhysicalExpr
+    fn as_binary(expr: &Arc<dyn PhysicalExpr>) -> &BinaryExpr {
+        expr.as_any()
+            .downcast_ref::<BinaryExpr>()
+            .unwrap_or_else(|| panic!("Expected BinaryExpr, got: {expr}"))
+    }
+
+    /// Assert that simplifying `input` produces `expected`
+    fn assert_not_simplify(
+        simplifier: &PhysicalExprSimplifier,
+        input: Arc<dyn PhysicalExpr>,
+        expected: Arc<dyn PhysicalExpr>,
+    ) {
+        let result = simplifier.simplify(Arc::clone(&input)).unwrap();
+        assert_eq!(
+            &result, &expected,
+            "Simplification should transform:\n  input: {input}\n  to:    {expected}\n  got:   {result}"
+        );
+    }
+
+    #[test]
+    fn test_simplify() {
+        let schema = test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // Create: cast(c2 as INT32) != INT32(99)
+        let column_expr = col("c2", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int32, None));
+        let literal_expr = lit(ScalarValue::Int32(Some(99)));
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::NotEq, literal_expr));
+
+        // Apply full simplification (uses TreeNodeRewriter)
+        let optimized = simplifier.simplify(binary_expr).unwrap();
+
+        let optimized_binary = as_binary(&optimized);
+
+        // Should be optimized to: c2 != INT64(99) (c2 is INT64, literal cast to match)
+        let left_expr = optimized_binary.left();
+        assert!(
+            left_expr.as_any().downcast_ref::<CastExpr>().is_none()
+                && left_expr.as_any().downcast_ref::<TryCastExpr>().is_none()
+        );
+        let right_literal = as_literal(optimized_binary.right());
+        assert_eq!(right_literal.value(), &ScalarValue::Int64(Some(99)));
+    }
+
+    #[test]
+    fn test_nested_expression_simplification() {
+        let schema = test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // Create nested expression: (cast(c1 as INT64) > INT64(5)) OR (cast(c2 as INT32) <= INT32(10))
+        let c1_expr = col("c1", &schema).unwrap();
+        let c1_cast = Arc::new(CastExpr::new(c1_expr, DataType::Int64, None));
+        let c1_literal = lit(ScalarValue::Int64(Some(5)));
+        let c1_binary = Arc::new(BinaryExpr::new(c1_cast, Operator::Gt, c1_literal));
+
+        let c2_expr = col("c2", &schema).unwrap();
+        let c2_cast = Arc::new(CastExpr::new(c2_expr, DataType::Int32, None));
+        let c2_literal = lit(ScalarValue::Int32(Some(10)));
+        let c2_binary = Arc::new(BinaryExpr::new(c2_cast, Operator::LtEq, c2_literal));
+
+        let or_expr = Arc::new(BinaryExpr::new(c1_binary, Operator::Or, c2_binary));
+
+        // Apply simplification
+        let optimized = simplifier.simplify(or_expr).unwrap();
+
+        let or_binary = as_binary(&optimized);
+
+        // Verify left side: c1 > INT32(5)
+        let left_binary = as_binary(or_binary.left());
+        let left_left_expr = left_binary.left();
+        assert!(
+            left_left_expr.as_any().downcast_ref::<CastExpr>().is_none()
+                && left_left_expr
+                    .as_any()
+                    .downcast_ref::<TryCastExpr>()
+                    .is_none()
+        );
+        let left_literal = as_literal(left_binary.right());
+        assert_eq!(left_literal.value(), &ScalarValue::Int32(Some(5)));
+
+        // Verify right side: c2 <= INT64(10)
+        let right_binary = as_binary(or_binary.right());
+        let right_left_expr = right_binary.left();
+        assert!(
+            right_left_expr
+                .as_any()
+                .downcast_ref::<CastExpr>()
+                .is_none()
+                && right_left_expr
+                    .as_any()
+                    .downcast_ref::<TryCastExpr>()
+                    .is_none()
+        );
+        let right_literal = as_literal(right_binary.right());
+        assert_eq!(right_literal.value(), &ScalarValue::Int64(Some(10)));
+    }
+
+    #[test]
+    fn test_double_negation_elimination() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(NOT(c > 5)) -> c > 5
+        let inner_expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::Gt,
+            lit(ScalarValue::Int32(Some(5))),
+        ));
+        let inner_not = Arc::new(NotExpr::new(Arc::clone(&inner_expr)));
+        let double_not: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(inner_not));
+
+        let expected = inner_expr;
+        assert_not_simplify(&simplifier, double_not, expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_literal() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(TRUE) -> FALSE
+        let not_true = Arc::new(NotExpr::new(lit(ScalarValue::Boolean(Some(true)))));
+        let expected = lit(ScalarValue::Boolean(Some(false)));
+        assert_not_simplify(&simplifier, not_true, expected);
+
+        // NOT(FALSE) -> TRUE
+        let not_false = Arc::new(NotExpr::new(lit(ScalarValue::Boolean(Some(false)))));
+        let expected = lit(ScalarValue::Boolean(Some(true)));
+        assert_not_simplify(&simplifier, not_false, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_negate_comparison() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(c = 5) -> c != 5
+        let not_eq = Arc::new(NotExpr::new(Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::Eq,
+            lit(ScalarValue::Int32(Some(5))),
+        ))));
+        let expected = Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::NotEq,
+            lit(ScalarValue::Int32(Some(5))),
+        ));
+        assert_not_simplify(&simplifier, not_eq, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_demorgans_law_and() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(a AND b) -> NOT a OR NOT b
+        let and_expr = Arc::new(BinaryExpr::new(
+            col("a", &schema)?,
+            Operator::And,
+            col("b", &schema)?,
+        ));
+        let not_and: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(and_expr));
+
+        let expected: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(NotExpr::new(col("a", &schema)?)),
+            Operator::Or,
+            Arc::new(NotExpr::new(col("b", &schema)?)),
+        ));
+        assert_not_simplify(&simplifier, not_and, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_demorgans_law_or() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(a OR b) -> NOT a AND NOT b
+        let or_expr = Arc::new(BinaryExpr::new(
+            col("a", &schema)?,
+            Operator::Or,
+            col("b", &schema)?,
+        ));
+        let not_or: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(or_expr));
+
+        let expected: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(NotExpr::new(col("a", &schema)?)),
+            Operator::And,
+            Arc::new(NotExpr::new(col("b", &schema)?)),
+        ));
+        assert_not_simplify(&simplifier, not_or, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_demorgans_with_comparison_simplification() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(c = 1 AND c = 2) -> c != 1 OR c != 2
+        let eq1 = Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::Eq,
+            lit(ScalarValue::Int32(Some(1))),
+        ));
+        let eq2 = Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::Eq,
+            lit(ScalarValue::Int32(Some(2))),
+        ));
+        let and_expr = Arc::new(BinaryExpr::new(eq1, Operator::And, eq2));
+        let not_and: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(and_expr));
+
+        let expected: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(BinaryExpr::new(
+                col("c", &schema)?,
+                Operator::NotEq,
+                lit(ScalarValue::Int32(Some(1))),
+            )),
+            Operator::Or,
+            Arc::new(BinaryExpr::new(
+                col("c", &schema)?,
+                Operator::NotEq,
+                lit(ScalarValue::Int32(Some(2))),
+            )),
+        ));
+        assert_not_simplify(&simplifier, not_and, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_of_not_and_not() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(NOT(a) AND NOT(b)) -> a OR b
+        let not_a = Arc::new(NotExpr::new(col("a", &schema)?));
+        let not_b = Arc::new(NotExpr::new(col("b", &schema)?));
+        let and_expr = Arc::new(BinaryExpr::new(not_a, Operator::And, not_b));
+        let not_and: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(and_expr));
+
+        let expected: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            col("a", &schema)?,
+            Operator::Or,
+            col("b", &schema)?,
+        ));
+        assert_not_simplify(&simplifier, not_and, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_in_list() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(c IN (1, 2, 3)) -> c NOT IN (1, 2, 3)
+        let list = vec![
+            lit(ScalarValue::Int32(Some(1))),
+            lit(ScalarValue::Int32(Some(2))),
+            lit(ScalarValue::Int32(Some(3))),
+        ];
+        let in_list_expr = in_list(col("c", &schema)?, list.clone(), &false, &schema)?;
+        let not_in: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(in_list_expr));
+
+        let expected = in_list(col("c", &schema)?, list, &true, &schema)?;
+        assert_not_simplify(&simplifier, not_in, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_not_not_in_list() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(c NOT IN (1, 2, 3)) -> c IN (1, 2, 3)
+        let list = vec![
+            lit(ScalarValue::Int32(Some(1))),
+            lit(ScalarValue::Int32(Some(2))),
+            lit(ScalarValue::Int32(Some(3))),
+        ];
+        let not_in_list_expr = in_list(col("c", &schema)?, list.clone(), &true, &schema)?;
+        let not_not_in: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(not_in_list_expr));
+
+        let expected = in_list(col("c", &schema)?, list, &false, &schema)?;
+        assert_not_simplify(&simplifier, not_not_in, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_double_not_in_list() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // NOT(NOT(c IN (1, 2, 3))) -> c IN (1, 2, 3)
+        let list = vec![
+            lit(ScalarValue::Int32(Some(1))),
+            lit(ScalarValue::Int32(Some(2))),
+            lit(ScalarValue::Int32(Some(3))),
+        ];
+        let in_list_expr = in_list(col("c", &schema)?, list.clone(), &false, &schema)?;
+        let not_in = Arc::new(NotExpr::new(in_list_expr));
+        let double_not: Arc<dyn PhysicalExpr> = Arc::new(NotExpr::new(not_in));
+
+        let expected = in_list(col("c", &schema)?, list, &false, &schema)?;
+        assert_not_simplify(&simplifier, double_not, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_deeply_nested_not() -> Result<()> {
+        let schema = not_test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // Create a deeply nested NOT expression: NOT(NOT(NOT(...NOT(c > 5)...)))
+        // This tests that we don't get stack overflow with many nested NOTs.
+        // With recursive_protection enabled (default), this should work by
+        // automatically growing the stack as needed.
+        let inner_expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            col("c", &schema)?,
+            Operator::Gt,
+            lit(ScalarValue::Int32(Some(5))),
+        ));
+
+        let mut expr = Arc::clone(&inner_expr);
+        // Create 200 layers of NOT to test deep recursion handling
+        for _ in 0..200 {
+            expr = Arc::new(NotExpr::new(expr));
+        }
+
+        // With 200 NOTs (even number), should simplify back to the original expression
+        let expected = inner_expr;
+        assert_not_simplify(&simplifier, Arc::clone(&expr), expected);
+
+        // Manually dismantle the deep input expression to avoid Stack Overflow on Drop
+        // If we just let `expr` go out of scope, Rust's recursive Drop will blow the stack
+        // even with recursive_protection, because Drop doesn't use the #[recursive] attribute.
+        // We peel off layers one by one to avoid deep recursion in Drop.
+        while let Some(not_expr) = expr.as_any().downcast_ref::<NotExpr>() {
+            // Clone the child (Arc increment).
+            // Now child has 2 refs: one in parent, one in `child`.
+            let child = Arc::clone(not_expr.arg());
+
+            // Reassign `expr` to `child`.
+            // This drops the old `expr` (Parent).
+            // Parent refcount -> 0, Parent is dropped.
+            // Parent drops its reference to Child.
+            // Child refcount decrements 2 -> 1.
+            // Child is NOT dropped recursively because we still hold it in `expr`
+            expr = child;
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_simplify_literal_binary_expr() {
+        let schema = Schema::empty();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // 1 + 2 -> 3
+        let expr: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(lit(1i32), Operator::Plus, lit(2i32)));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(literal.value(), &ScalarValue::Int32(Some(3)));
+    }
+
+    #[test]
+    fn test_simplify_literal_comparison() {
+        let schema = Schema::empty();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // 5 > 3 -> true
+        let expr: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(lit(5i32), Operator::Gt, lit(3i32)));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(literal.value(), &ScalarValue::Boolean(Some(true)));
+
+        // 2 > 3 -> false
+        let expr: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(lit(2i32), Operator::Gt, lit(3i32)));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(literal.value(), &ScalarValue::Boolean(Some(false)));
+    }
+
+    #[test]
+    fn test_simplify_nested_literal_expr() {
+        let schema = Schema::empty();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // (1 + 2) * 3 -> 9
+        let inner: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(lit(1i32), Operator::Plus, lit(2i32)));
+        let expr: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(inner, Operator::Multiply, lit(3i32)));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(literal.value(), &ScalarValue::Int32(Some(9)));
+    }
+
+    #[test]
+    fn test_simplify_deeply_nested_literals() {
+        let schema = Schema::empty();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // ((1 + 2) * 3) + ((4 - 1) * 2) -> 9 + 6 -> 15
+        let left: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(BinaryExpr::new(lit(1i32), Operator::Plus, lit(2i32))),
+            Operator::Multiply,
+            lit(3i32),
+        ));
+        let right: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(BinaryExpr::new(lit(4i32), Operator::Minus, lit(1i32))),
+            Operator::Multiply,
+            lit(2i32),
+        ));
+        let expr: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(left, Operator::Plus, right));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(literal.value(), &ScalarValue::Int32(Some(15)));
+    }
+
+    #[test]
+    fn test_no_simplify_with_column() {
+        let schema = test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // c1 + 2 should NOT be simplified (has column reference)
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            col("c1", &schema).unwrap(),
+            Operator::Plus,
+            lit(2i32),
+        ));
+        let result = simplifier.simplify(expr).unwrap();
+        // Should remain a BinaryExpr, not become a Literal
+        assert!(result.as_any().downcast_ref::<BinaryExpr>().is_some());
+    }
+
+    #[test]
+    fn test_partial_simplify_with_column() {
+        let schema = test_schema();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // (1 + 2) + c1 should simplify the literal part: 3 + c1
+        let literal_part: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(lit(1i32), Operator::Plus, lit(2i32)));
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            literal_part,
+            Operator::Plus,
+            col("c1", &schema).unwrap(),
+        ));
+        let result = simplifier.simplify(expr).unwrap();
+
+        // Should be a BinaryExpr with a Literal(3) on the left
+        let binary = as_binary(&result);
+        let left_literal = as_literal(binary.left());
+        assert_eq!(left_literal.value(), &ScalarValue::Int32(Some(3)));
+    }
+
+    #[test]
+    fn test_simplify_literal_string_concat() {
+        let schema = Schema::empty();
+        let simplifier = PhysicalExprSimplifier::new(&schema);
+
+        // 'hello' || ' world' -> 'hello world'
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            lit("hello"),
+            Operator::StringConcat,
+            lit(" world"),
+        ));
+        let result = simplifier.simplify(expr).unwrap();
+        let literal = as_literal(&result);
+        assert_eq!(
+            literal.value(),
+            &ScalarValue::Utf8(Some("hello world".to_string()))
+        );
+    }
+}
diff --git a/datafusion/physical-expr/src/simplifier/not.rs b/datafusion/physical-expr/src/simplifier/not.rs
new file mode 100644
index 0000000000000..709260aa48791
--- /dev/null
+++ b/datafusion/physical-expr/src/simplifier/not.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Simplify NOT expressions in physical expressions
+//!
+//! This module provides optimizations for NOT expressions such as:
+//! - Double negation elimination: NOT(NOT(expr)) -> expr
+//! - NOT with binary comparisons: NOT(a = b) -> a != b
+//! - NOT with IN expressions: NOT(a IN (list)) -> a NOT IN (list)
+//! - De Morgan's laws: NOT(A AND B) -> NOT A OR NOT B
+//! - Constant folding: NOT(TRUE) -> FALSE, NOT(FALSE) -> TRUE
+//!
+//! This function is designed to work with TreeNodeRewriter's f_up traversal,
+//! which means children are already simplified when this function is called.
+//! The TreeNodeRewriter will automatically call this function repeatedly until
+//! no more transformations are possible.
+
+use std::sync::Arc;
+
+use arrow::datatypes::Schema;
+use datafusion_common::{Result, ScalarValue, tree_node::Transformed};
+use datafusion_expr::Operator;
+
+use crate::PhysicalExpr;
+use crate::expressions::{BinaryExpr, InListExpr, Literal, NotExpr, in_list, lit};
+
+/// Attempts to simplify NOT expressions by applying one level of transformation
+///
+/// This function applies a single simplification rule and returns. When used with
+/// TreeNodeRewriter, multiple passes will automatically be applied until no more
+/// transformations are possible.
+#[deprecated(
+    since = "53.0.0",
+    note = "This function will be made private in a future release, please file an issue if you have a reason for keeping it public."
+)]
+pub fn simplify_not_expr(
+    expr: Arc<dyn PhysicalExpr>,
+    schema: &Schema,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+    // Check if this is a NOT expression
+    let not_expr = match expr.as_any().downcast_ref::<NotExpr>() {
+        Some(not_expr) => not_expr,
+        None => return Ok(Transformed::no(expr)),
+    };
+
+    let inner_expr = not_expr.arg();
+
+    // Handle NOT(NOT(expr)) -> expr (double negation elimination)
+    if let Some(inner_not) = inner_expr.as_any().downcast_ref::<NotExpr>() {
+        return Ok(Transformed::yes(Arc::clone(inner_not.arg())));
+    }
+
+    // Handle NOT(literal) -> !literal
+    if let Some(literal) = inner_expr.as_any().downcast_ref::<Literal>() {
+        if let ScalarValue::Boolean(Some(val)) = literal.value() {
+            return Ok(Transformed::yes(lit(ScalarValue::Boolean(Some(!val)))));
+        }
+        if let ScalarValue::Boolean(None) = literal.value() {
+            return Ok(Transformed::yes(lit(ScalarValue::Boolean(None))));
+        }
+    }
+
+    // Handle NOT(IN list) -> NOT IN list
+    if let Some(in_list_expr) = inner_expr.as_any().downcast_ref::<InListExpr>() {
+        let negated = !in_list_expr.negated();
+        let new_in_list = in_list(
+            Arc::clone(in_list_expr.expr()),
+            in_list_expr.list().to_vec(),
+            &negated,
+            schema,
+        )?;
+        return Ok(Transformed::yes(new_in_list));
+    }
+
+    // Handle NOT(binary_expr)
+    if let Some(binary_expr) = inner_expr.as_any().downcast_ref::<BinaryExpr>() {
+        if let Some(negated_op) = binary_expr.op().negate() {
+            let new_binary = Arc::new(BinaryExpr::new(
+                Arc::clone(binary_expr.left()),
+                negated_op,
+                Arc::clone(binary_expr.right()),
+            ));
+            return Ok(Transformed::yes(new_binary));
+        }
+
+        // Handle De Morgan's laws for AND/OR
+        match binary_expr.op() {
+            Operator::And => {
+                // NOT(A AND B) -> NOT A OR NOT B
+                let not_left: Arc<dyn PhysicalExpr> =
+                    Arc::new(NotExpr::new(Arc::clone(binary_expr.left())));
+                let not_right: Arc<dyn PhysicalExpr> =
+                    Arc::new(NotExpr::new(Arc::clone(binary_expr.right())));
+                let new_binary =
+                    Arc::new(BinaryExpr::new(not_left, Operator::Or, not_right));
+                return Ok(Transformed::yes(new_binary));
+            }
+            Operator::Or => {
+                // NOT(A OR B) -> NOT A AND NOT B
+                let not_left: Arc<dyn PhysicalExpr> =
+                    Arc::new(NotExpr::new(Arc::clone(binary_expr.left())));
+                let not_right: Arc<dyn PhysicalExpr> =
+                    Arc::new(NotExpr::new(Arc::clone(binary_expr.right())));
+                let new_binary =
+                    Arc::new(BinaryExpr::new(not_left, Operator::And, not_right));
+                return Ok(Transformed::yes(new_binary));
+            }
+            _ => {}
+        }
+    }
+
+    // If no simplification possible, return the original expression
+    Ok(Transformed::no(expr))
+}
diff --git a/datafusion/physical-expr/src/simplifier/unwrap_cast.rs b/datafusion/physical-expr/src/simplifier/unwrap_cast.rs
new file mode 100644
index 0000000000000..0de517cd36c87
--- /dev/null
+++ b/datafusion/physical-expr/src/simplifier/unwrap_cast.rs
@@ -0,0 +1,642 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Unwrap casts in binary comparisons for physical expressions
+//!
+//! This module provides optimization for physical expressions similar to the logical
+//! optimizer's unwrap_cast module. It attempts to remove casts from comparisons to
+//! literals by applying the casts to the literals if possible.
+//!
+//! The optimization improves performance by:
+//! 1. Reducing runtime cast operations on column data
+//! 2. Enabling better predicate pushdown opportunities
+//! 3. Optimizing filter expressions in physical plans
+//!
+//! # Example
+//!
+//! Physical expression: `cast(column as INT64) > INT64(10)`
+//! Optimized to: `column > INT32(10)` (assuming column is INT32)
+
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Schema};
+use datafusion_common::{Result, ScalarValue, tree_node::Transformed};
+use datafusion_expr::Operator;
+use datafusion_expr_common::casts::try_cast_literal_to_type;
+
+use crate::PhysicalExpr;
+use crate::expressions::{BinaryExpr, CastExpr, Literal, TryCastExpr, lit};
+
+/// Attempts to unwrap casts in comparison expressions.
+pub(crate) fn unwrap_cast_in_comparison(
+    expr: Arc<dyn PhysicalExpr>,
+    schema: &Schema,
+) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>()
+        && let Some(unwrapped) = try_unwrap_cast_binary(binary, schema)?
+    {
+        return Ok(Transformed::yes(unwrapped));
+    }
+    Ok(Transformed::no(expr))
+}
+
+/// Try to unwrap casts in binary expressions
+fn try_unwrap_cast_binary(
+    binary: &BinaryExpr,
+    schema: &Schema,
+) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+    // Case 1: cast(left_expr) op literal
+    if let (Some((inner_expr, _cast_type)), Some(literal)) = (
+        extract_cast_info(binary.left()),
+        binary.right().as_any().downcast_ref::<Literal>(),
+    ) && binary.op().supports_propagation()
+        && let Some(unwrapped) = try_unwrap_cast_comparison(
+            Arc::clone(inner_expr),
+            literal.value(),
+            *binary.op(),
+            schema,
+        )?
+    {
+        return Ok(Some(unwrapped));
+    }
+
+    // Case 2: literal op cast(right_expr)
+    if let (Some(literal), Some((inner_expr, _cast_type))) = (
+        binary.left().as_any().downcast_ref::<Literal>(),
+        extract_cast_info(binary.right()),
+    ) {
+        // For literal op cast(expr), we need to swap the operator
+        if let Some(swapped_op) = binary.op().swap()
+            && binary.op().supports_propagation()
+            && let Some(unwrapped) = try_unwrap_cast_comparison(
+                Arc::clone(inner_expr),
+                literal.value(),
+                swapped_op,
+                schema,
+            )?
+        {
+            return Ok(Some(unwrapped));
+        }
+        // If the operator cannot be swapped, we skip this optimization case
+        // but don't prevent other optimizations
+    }
+
+    Ok(None)
+}
+
+/// Extract cast information from a physical expression
+///
+/// If the expression is a CAST(expr, datatype) or TRY_CAST(expr, datatype),
+/// returns Some((inner_expr, target_datatype)). Otherwise returns None.
+fn extract_cast_info(
+    expr: &Arc<dyn PhysicalExpr>,
+) -> Option<(&Arc<dyn PhysicalExpr>, &DataType)> {
+    if let Some(cast) = expr.as_any().downcast_ref::<CastExpr>() {
+        Some((cast.expr(), cast.cast_type()))
+    } else if let Some(try_cast) = expr.as_any().downcast_ref::<TryCastExpr>() {
+        Some((try_cast.expr(), try_cast.cast_type()))
+    } else {
+        None
+    }
+}
+
+/// Try to unwrap a cast in comparison by moving the cast to the literal
+fn try_unwrap_cast_comparison(
+    inner_expr: Arc<dyn PhysicalExpr>,
+    literal_value: &ScalarValue,
+    op: Operator,
+    schema: &Schema,
+) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+    // Get the data type of the inner expression
+    let inner_type = inner_expr.data_type(schema)?;
+
+    // Try to cast the literal to the inner expression's type
+    if let Some(casted_literal) = try_cast_literal_to_type(literal_value, &inner_type) {
+        let literal_expr = lit(casted_literal);
+        let binary_expr = BinaryExpr::new(inner_expr, op, literal_expr);
+        return Ok(Some(Arc::new(binary_expr)));
+    }
+
+    Ok(None)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::expressions::{col, lit};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::{ScalarValue, tree_node::TreeNode};
+    use datafusion_expr::Operator;
+
+    /// Check if an expression is a cast expression
+    fn is_cast_expr(expr: &Arc<dyn PhysicalExpr>) -> bool {
+        expr.as_any().downcast_ref::<CastExpr>().is_some()
+            || expr.as_any().downcast_ref::<TryCastExpr>().is_some()
+    }
+
+    /// Check if a binary expression is suitable for cast unwrapping
+    fn is_binary_expr_with_cast_and_literal(binary: &BinaryExpr) -> bool {
+        // Check if left is cast and right is literal
+        let left_cast_right_literal = is_cast_expr(binary.left())
+            && binary.right().as_any().downcast_ref::<Literal>().is_some();
+
+        // Check if left is literal and right is cast
+        let left_literal_right_cast =
+            binary.left().as_any().downcast_ref::<Literal>().is_some()
+                && is_cast_expr(binary.right());
+
+        left_cast_right_literal || left_literal_right_cast
+    }
+
+    fn test_schema() -> Schema {
+        Schema::new(vec![
+            Field::new("c1", DataType::Int32, false),
+            Field::new("c2", DataType::Int64, false),
+            Field::new("c3", DataType::Utf8, false),
+        ])
+    }
+
+    #[test]
+    fn test_unwrap_cast_in_binary_comparison() {
+        let schema = test_schema();
+
+        // Create: cast(c1 as INT64) > INT64(10)
+        let column_expr = col("c1", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let literal_expr = lit(10i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::Gt, literal_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // The result should be: c1 > INT32(10)
+        let optimized = result.data;
+        let optimized_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        // Check that left side is no longer a cast
+        assert!(!is_cast_expr(optimized_binary.left()));
+
+        // Check that right side is a literal with the correct type and value
+        let right_literal = optimized_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(right_literal.value(), &ScalarValue::Int32(Some(10)));
+    }
+
+    #[test]
+    fn test_unwrap_cast_with_literal_on_left() {
+        let schema = test_schema();
+
+        // Create: INT64(10) < cast(c1 as INT64)
+        let column_expr = col("c1", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let literal_expr = lit(10i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(literal_expr, Operator::Lt, cast_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // The result should be equivalent to: c1 > INT32(10)
+        let optimized = result.data;
+        let optimized_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        // Check the operator was swapped
+        assert_eq!(*optimized_binary.op(), Operator::Gt);
+    }
+
+    #[test]
+    fn test_no_unwrap_when_types_unsupported() {
+        let schema = Schema::new(vec![Field::new("f1", DataType::Float32, false)]);
+
+        // Create: cast(f1 as FLOAT64) > FLOAT64(10.5)
+        let column_expr = col("f1", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Float64, None));
+        let literal_expr = lit(10.5f64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::Gt, literal_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should NOT be transformed (floating point types not supported)
+        assert!(!result.transformed);
+    }
+
+    #[test]
+    fn test_is_binary_expr_with_cast_and_literal() {
+        let schema = test_schema();
+
+        let column_expr = col("c1", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let literal_expr = lit(10i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::Gt, literal_expr));
+        let binary_ref = binary_expr.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        assert!(is_binary_expr_with_cast_and_literal(binary_ref));
+    }
+
+    #[test]
+    fn test_unwrap_cast_literal_on_left_side() {
+        // Test case for: literal <= cast(column)
+        // This was the specific case that caused the bug
+        let schema = Schema::new(vec![Field::new(
+            "decimal_col",
+            DataType::Decimal128(9, 2),
+            true,
+        )]);
+
+        // Create: Decimal128(400) <= cast(decimal_col as Decimal128(22, 2))
+        let column_expr = col("decimal_col", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(
+            column_expr,
+            DataType::Decimal128(22, 2),
+            None,
+        ));
+        let literal_expr = lit(ScalarValue::Decimal128(Some(400), 22, 2));
+        let binary_expr =
+            Arc::new(BinaryExpr::new(literal_expr, Operator::LtEq, cast_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // The result should be: decimal_col >= Decimal128(400, 9, 2)
+        let optimized = result.data;
+        let optimized_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        // Check operator was swapped correctly
+        assert_eq!(*optimized_binary.op(), Operator::GtEq);
+
+        // Check that left side is the column without cast
+        assert!(!is_cast_expr(optimized_binary.left()));
+
+        // Check that right side is a literal with the correct type
+        let right_literal = optimized_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(
+            right_literal.value().data_type(),
+            DataType::Decimal128(9, 2)
+        );
+    }
+
+    #[test]
+    fn test_unwrap_cast_with_different_comparison_operators() {
+        let schema = Schema::new(vec![Field::new("int_col", DataType::Int32, false)]);
+
+        // Test all comparison operators with literal on the left
+        let operators = vec![
+            (Operator::Lt, Operator::Gt),
+            (Operator::LtEq, Operator::GtEq),
+            (Operator::Gt, Operator::Lt),
+            (Operator::GtEq, Operator::LtEq),
+            (Operator::Eq, Operator::Eq),
+            (Operator::NotEq, Operator::NotEq),
+        ];
+
+        for (original_op, expected_op) in operators {
+            // Create: INT64(100) op cast(int_col as INT64)
+            let column_expr = col("int_col", &schema).unwrap();
+            let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+            let literal_expr = lit(100i64);
+            let binary_expr =
+                Arc::new(BinaryExpr::new(literal_expr, original_op, cast_expr));
+
+            // Apply unwrap cast optimization
+            let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+            // Should be transformed
+            assert!(result.transformed);
+
+            let optimized = result.data;
+            let optimized_binary =
+                optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+            // Check the operator was swapped correctly
+            assert_eq!(
+                *optimized_binary.op(),
+                expected_op,
+                "Failed for operator {original_op:?} -> {expected_op:?}"
+            );
+
+            // Check that left side has no cast
+            assert!(!is_cast_expr(optimized_binary.left()));
+
+            // Check that the literal was cast to the column type
+            let right_literal = optimized_binary
+                .right()
+                .as_any()
+                .downcast_ref::<Literal>()
+                .unwrap();
+            assert_eq!(right_literal.value(), &ScalarValue::Int32(Some(100)));
+        }
+    }
+
+    #[test]
+    fn test_unwrap_cast_with_decimal_types() {
+        // Test various decimal precision/scale combinations
+        let test_cases = vec![
+            // (column_precision, column_scale, cast_precision, cast_scale, value)
+            (9, 2, 22, 2, 400),
+            (10, 3, 20, 3, 1000),
+            (5, 1, 10, 1, 99),
+        ];
+
+        for (col_p, col_s, cast_p, cast_s, value) in test_cases {
+            let schema = Schema::new(vec![Field::new(
+                "decimal_col",
+                DataType::Decimal128(col_p, col_s),
+                true,
+            )]);
+
+            // Test both: cast(column) op literal AND literal op cast(column)
+
+            // Case 1: cast(column) > literal
+            let column_expr = col("decimal_col", &schema).unwrap();
+            let cast_expr = Arc::new(CastExpr::new(
+                Arc::clone(&column_expr),
+                DataType::Decimal128(cast_p, cast_s),
+                None,
+            ));
+            let literal_expr = lit(ScalarValue::Decimal128(Some(value), cast_p, cast_s));
+            let binary_expr =
+                Arc::new(BinaryExpr::new(cast_expr, Operator::Gt, literal_expr));
+
+            let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+            assert!(result.transformed);
+
+            // Case 2: literal < cast(column)
+            let cast_expr = Arc::new(CastExpr::new(
+                column_expr,
+                DataType::Decimal128(cast_p, cast_s),
+                None,
+            ));
+            let literal_expr = lit(ScalarValue::Decimal128(Some(value), cast_p, cast_s));
+            let binary_expr =
+                Arc::new(BinaryExpr::new(literal_expr, Operator::Lt, cast_expr));
+
+            let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+            assert!(result.transformed);
+        }
+    }
+
+    #[test]
+    fn test_unwrap_cast_with_null_literals() {
+        // Test with NULL literals to ensure they're handled correctly
+        let schema = Schema::new(vec![Field::new("int_col", DataType::Int32, true)]);
+
+        // Create: cast(int_col as INT64) = NULL
+        let column_expr = col("int_col", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let null_literal = lit(ScalarValue::Int64(None));
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::Eq, null_literal));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // Verify the NULL was cast to the column type
+        let optimized = result.data;
+        let optimized_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+        let right_literal = optimized_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(right_literal.value(), &ScalarValue::Int32(None));
+    }
+
+    #[test]
+    fn test_unwrap_cast_with_try_cast() {
+        // Test that TryCast expressions are also unwrapped correctly
+        let schema = Schema::new(vec![Field::new("str_col", DataType::Utf8, true)]);
+
+        // Create: try_cast(str_col as INT64) > INT64(100)
+        let column_expr = col("str_col", &schema).unwrap();
+        let try_cast_expr = Arc::new(TryCastExpr::new(column_expr, DataType::Int64));
+        let literal_expr = lit(100i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(try_cast_expr, Operator::Gt, literal_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should NOT be transformed (string to int cast not supported)
+        assert!(!result.transformed);
+    }
+
+    #[test]
+    fn test_unwrap_cast_preserves_non_comparison_operators() {
+        // Test that non-comparison operators in AND/OR expressions are preserved
+        let schema = Schema::new(vec![Field::new("int_col", DataType::Int32, false)]);
+
+        // Create: cast(int_col as INT64) > INT64(10) AND cast(int_col as INT64) < INT64(20)
+        let column_expr = col("int_col", &schema).unwrap();
+
+        let cast1 = Arc::new(CastExpr::new(
+            Arc::clone(&column_expr),
+            DataType::Int64,
+            None,
+        ));
+        let lit1 = lit(10i64);
+        let compare1 = Arc::new(BinaryExpr::new(cast1, Operator::Gt, lit1));
+
+        let cast2 = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let lit2 = lit(20i64);
+        let compare2 = Arc::new(BinaryExpr::new(cast2, Operator::Lt, lit2));
+
+        let and_expr = Arc::new(BinaryExpr::new(compare1, Operator::And, compare2));
+
+        // Apply unwrap cast optimization recursively
+        let result = (and_expr as Arc<dyn PhysicalExpr>)
+            .transform_down(|node| unwrap_cast_in_comparison(node, &schema))
+            .unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // Verify the AND operator is preserved
+        let optimized = result.data;
+        let and_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+        assert_eq!(*and_binary.op(), Operator::And);
+
+        // Both sides should have their casts unwrapped
+        let left_binary = and_binary
+            .left()
+            .as_any()
+            .downcast_ref::<BinaryExpr>()
+            .unwrap();
+        let right_binary = and_binary
+            .right()
+            .as_any()
+            .downcast_ref::<BinaryExpr>()
+            .unwrap();
+
+        assert!(!is_cast_expr(left_binary.left()));
+        assert!(!is_cast_expr(right_binary.left()));
+    }
+
+    #[test]
+    fn test_try_cast_unwrapping() {
+        let schema = test_schema();
+
+        // Create: try_cast(c1 as INT64) <= INT64(100)
+        let column_expr = col("c1", &schema).unwrap();
+        let try_cast_expr = Arc::new(TryCastExpr::new(column_expr, DataType::Int64));
+        let literal_expr = lit(100i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(try_cast_expr, Operator::LtEq, literal_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should be transformed to: c1 <= INT32(100)
+        assert!(result.transformed);
+
+        let optimized = result.data;
+        let optimized_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        // Verify the try_cast was removed
+        assert!(!is_cast_expr(optimized_binary.left()));
+
+        // Verify the literal was converted
+        let right_literal = optimized_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(right_literal.value(), &ScalarValue::Int32(Some(100)));
+    }
+
+    #[test]
+    fn test_non_swappable_operator() {
+        // Test case with an operator that cannot be swapped
+        let schema = Schema::new(vec![Field::new("int_col", DataType::Int32, false)]);
+
+        // Create: INT64(10) + cast(int_col as INT64)
+        // The Plus operator cannot be swapped, so this should not be transformed
+        let column_expr = col("int_col", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let literal_expr = lit(10i64);
+        let binary_expr =
+            Arc::new(BinaryExpr::new(literal_expr, Operator::Plus, cast_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should NOT be transformed because Plus cannot be swapped
+        assert!(!result.transformed);
+    }
+
+    #[test]
+    fn test_cast_that_cannot_be_unwrapped_overflow() {
+        // Test case where the literal value would overflow the target type
+        let schema = Schema::new(vec![Field::new("small_int", DataType::Int8, false)]);
+
+        // Create: cast(small_int as INT64) > INT64(1000)
+        // This should NOT be unwrapped because 1000 cannot fit in Int8 (max value is 127)
+        let column_expr = col("small_int", &schema).unwrap();
+        let cast_expr = Arc::new(CastExpr::new(column_expr, DataType::Int64, None));
+        let literal_expr = lit(1000i64); // Value too large for Int8
+        let binary_expr =
+            Arc::new(BinaryExpr::new(cast_expr, Operator::Gt, literal_expr));
+
+        // Apply unwrap cast optimization
+        let result = unwrap_cast_in_comparison(binary_expr, &schema).unwrap();
+
+        // Should NOT be transformed due to overflow
+        assert!(!result.transformed);
+    }
+
+    #[test]
+    fn test_complex_nested_expression() {
+        let schema = test_schema();
+
+        // Create a more complex expression with nested casts
+        // (cast(c1 as INT64) > INT64(10)) AND (cast(c2 as INT32) = INT32(20))
+        let c1_expr = col("c1", &schema).unwrap();
+        let c1_cast = Arc::new(CastExpr::new(c1_expr, DataType::Int64, None));
+        let c1_literal = lit(10i64);
+        let c1_binary = Arc::new(BinaryExpr::new(c1_cast, Operator::Gt, c1_literal));
+
+        let c2_expr = col("c2", &schema).unwrap();
+        let c2_cast = Arc::new(CastExpr::new(c2_expr, DataType::Int32, None));
+        let c2_literal = lit(20i32);
+        let c2_binary = Arc::new(BinaryExpr::new(c2_cast, Operator::Eq, c2_literal));
+
+        // Create AND expression
+        let and_expr = Arc::new(BinaryExpr::new(c1_binary, Operator::And, c2_binary));
+
+        // Apply unwrap cast optimization recursively
+        let result = (and_expr as Arc<dyn PhysicalExpr>)
+            .transform_down(|node| unwrap_cast_in_comparison(node, &schema))
+            .unwrap();
+
+        // Should be transformed
+        assert!(result.transformed);
+
+        // Verify both sides of the AND were optimized
+        let optimized = result.data;
+        let and_binary = optimized.as_any().downcast_ref::<BinaryExpr>().unwrap();
+
+        // Left side should be: c1 > INT32(10)
+        let left_binary = and_binary
+            .left()
+            .as_any()
+            .downcast_ref::<BinaryExpr>()
+            .unwrap();
+        assert!(!is_cast_expr(left_binary.left()));
+        let left_literal = left_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(left_literal.value(), &ScalarValue::Int32(Some(10)));
+
+        // Right side should be: c2 = INT64(20) (c2 is already INT64, literal cast to match)
+        let right_binary = and_binary
+            .right()
+            .as_any()
+            .downcast_ref::<BinaryExpr>()
+            .unwrap();
+        assert!(!is_cast_expr(right_binary.left()));
+        let right_literal = right_binary
+            .right()
+            .as_any()
+            .downcast_ref::<Literal>()
+            .unwrap();
+        assert_eq!(right_literal.value(), &ScalarValue::Int64(Some(20)));
+    }
+}
diff --git a/datafusion/physical-expr/src/statistics/stats_solver.rs b/datafusion/physical-expr/src/statistics/stats_solver.rs
index ec58076caf3b1..5665f7d1bee41 100644
--- a/datafusion/physical-expr/src/statistics/stats_solver.rs
+++ b/datafusion/physical-expr/src/statistics/stats_solver.rs
@@ -20,18 +20,18 @@ use std::sync::Arc;
 use crate::expressions::Literal;
 use crate::intervals::cp_solver::PropagationResult;
 use crate::physical_expr::PhysicalExpr;
-use crate::utils::{build_dag, ExprTreeNode};
+use crate::utils::{ExprTreeNode, build_dag};
 
 use arrow::datatypes::{DataType, Schema};
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::statistics::Distribution;
 use datafusion_expr_common::interval_arithmetic::Interval;
 
+use petgraph::Outgoing;
 use petgraph::adj::DefaultIx;
 use petgraph::prelude::Bfs;
 use petgraph::stable_graph::{NodeIndex, StableGraph};
 use petgraph::visit::DfsPostOrder;
-use petgraph::Outgoing;
 
 /// This object implements a directed acyclic expression graph (DAEG) that
 /// is used to compute statistics/distributions for expressions hierarchically.
@@ -156,7 +156,7 @@ impl ExprStatisticsGraph {
                 // If the given statistics enable us to obtain a more precise
                 // range for the root, update it:
                 let subset = root_range.contains(given_range)?;
-                self.graph[self.root].dist = if subset == Interval::CERTAINLY_TRUE {
+                self.graph[self.root].dist = if subset == Interval::TRUE {
                     // Given statistics is strictly more informative, use it as is:
                     given_stats
                 } else {
@@ -205,7 +205,7 @@ impl ExprStatisticsGraph {
 mod tests {
     use std::sync::Arc;
 
-    use crate::expressions::{binary, try_cast, Column};
+    use crate::expressions::{Column, binary, try_cast};
     use crate::intervals::cp_solver::PropagationResult;
     use crate::statistics::stats_solver::ExprStatisticsGraph;
 
diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs
index 8092dc3c1a614..c4ce74fd3a573 100644
--- a/datafusion/physical-expr/src/utils/guarantee.rs
+++ b/datafusion/physical-expr/src/utils/guarantee.rs
@@ -19,7 +19,7 @@
 //! constant.
 
 use crate::utils::split_disjunction;
-use crate::{split_conjunction, PhysicalExpr};
+use crate::{PhysicalExpr, split_conjunction};
 use datafusion_common::{Column, HashMap, ScalarValue};
 use datafusion_expr::Operator;
 use std::collections::HashSet;
@@ -124,40 +124,20 @@ impl LiteralGuarantee {
             // for an `AND` conjunction to be true, all terms individually must be true
             .fold(GuaranteeBuilder::new(), |builder, expr| {
                 if let Some(cel) = ColOpLit::try_new(expr) {
-                    builder.aggregate_conjunct(cel)
+                    builder.aggregate_conjunct(&cel)
                 } else if let Some(inlist) = expr
                     .as_any()
                     .downcast_ref::<crate::expressions::InListExpr>()
                 {
-                    // Only support single-column inlist currently, multi-column inlist is not supported
-                    let col = inlist
-                        .expr()
-                        .as_any()
-                        .downcast_ref::<crate::expressions::Column>();
-                    let Some(col) = col else {
-                        return builder;
-                    };
-
-                    let literals = inlist
-                        .list()
-                        .iter()
-                        .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
-                        .collect::<Option<Vec<_>>>();
-                    let Some(literals) = literals else {
-                        return builder;
-                    };
-
-                    let guarantee = if inlist.negated() {
-                        Guarantee::NotIn
+                    if let Some(inlist) = ColInList::try_new(inlist) {
+                        builder.aggregate_multi_conjunct(
+                            inlist.col,
+                            inlist.guarantee,
+                            inlist.list.iter().map(|lit| lit.value()),
+                        )
                     } else {
-                        Guarantee::In
-                    };
-
-                    builder.aggregate_multi_conjunct(
-                        col,
-                        guarantee,
-                        literals.iter().map(|e| e.value()),
-                    )
+                        builder
+                    }
                 } else {
                     // split disjunction: <expr> OR <expr> OR ...
                     let disjunctions = split_disjunction(expr);
@@ -184,16 +164,6 @@ impl LiteralGuarantee {
                         .filter_map(|expr| ColOpLit::try_new(expr))
                         .collect::<Vec<_>>();
 
-                    if terms.is_empty() {
-                        return builder;
-                    }
-
-                    // if not all terms are of the form (col <op> literal),
-                    // can't infer any guarantees
-                    if terms.len() != disjunctions.len() {
-                        return builder;
-                    }
-
                     // if all terms are 'col <op> literal' with the same column
                     // and operation we can infer any guarantees
                     //
@@ -203,18 +173,70 @@ impl LiteralGuarantee {
                     // foo is required for the expression to be true.
                     // So we can only create a multi value guarantee for `=`
                     // (or a single value). (e.g. ignore `a != foo OR a != bar`)
-                    let first_term = &terms[0];
-                    if terms.iter().all(|term| {
-                        term.col.name() == first_term.col.name()
-                            && term.guarantee == Guarantee::In
-                    }) {
+                    let first_term = terms.first();
+                    if !terms.is_empty()
+                        && terms.len() == disjunctions.len()
+                        && terms.iter().all(|term| {
+                            term.col.name() == first_term.unwrap().col.name()
+                                && term.guarantee == Guarantee::In
+                        })
+                    {
                         builder.aggregate_multi_conjunct(
-                            first_term.col,
+                            first_term.unwrap().col,
                             Guarantee::In,
                             terms.iter().map(|term| term.lit.value()),
                         )
                     } else {
-                        // can't infer anything
+                        // Handle disjunctions with conjunctions like (a = 1 AND b = 2) OR (a = 2 AND b = 3)
+                        // Extract termsets from each disjunction
+                        // if in each termset, they have same column, and the guarantee is In,
+                        // we can infer a guarantee for the column
+                        // e.g. (a = 1 AND b = 2) OR (a = 2 AND b = 3) is `a IN (1, 2) AND b IN (2, 3)`
+                        // otherwise, we can't infer a guarantee
+                        let termsets: Vec<Vec<ColOpLitOrInList>> = disjunctions
+                            .iter()
+                            .map(|expr| {
+                                split_conjunction(expr)
+                                    .into_iter()
+                                    .filter_map(ColOpLitOrInList::try_new)
+                                    .filter(|term| term.guarantee() == Guarantee::In)
+                                    .collect()
+                            })
+                            .collect();
+
+                        // Early return if any termset is empty (can't infer guarantees)
+                        if termsets.iter().any(|terms| terms.is_empty()) {
+                            return builder;
+                        }
+
+                        // Find columns that appear in all termsets
+                        let common_cols = find_common_columns(&termsets);
+                        if common_cols.is_empty() {
+                            return builder;
+                        }
+
+                        // Build guarantees for common columns
+                        let mut builder = builder;
+                        for col in common_cols {
+                            let literals: Vec<_> = termsets
+                                .iter()
+                                .filter_map(|terms| {
+                                    terms.iter().find(|term| term.col() == col).map(
+                                        |term| {
+                                            term.lits().into_iter().map(|lit| lit.value())
+                                        },
+                                    )
+                                })
+                                .flatten()
+                                .collect();
+
+                            builder = builder.aggregate_multi_conjunct(
+                                col,
+                                Guarantee::In,
+                                literals.into_iter(),
+                            );
+                        }
+
                         builder
                     }
                 }
@@ -270,7 +292,7 @@ impl<'a> GuaranteeBuilder<'a> {
     /// # Examples
     /// * `AND (a = 1)`: `a` is guaranteed to be 1
     /// * `AND (a != 1)`: a is guaranteed to not be 1
-    fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
+    fn aggregate_conjunct(self, col_op_lit: &ColOpLit<'a>) -> Self {
         self.aggregate_multi_conjunct(
             col_op_lit.col,
             col_op_lit.guarantee,
@@ -362,7 +384,7 @@ struct ColOpLit<'a> {
 }
 
 impl<'a> ColOpLit<'a> {
-    /// Returns Some(ColEqLit) if the expression is either:
+    /// Returns Some(ColOpLit) if the expression is either:
     /// 1. `col <op> literal`
     /// 2. `literal <op> col`
     /// 3. operator is `=` or `!=`
@@ -410,6 +432,115 @@ impl<'a> ColOpLit<'a> {
     }
 }
 
+/// Represents a single `col [not]in literal` expression
+struct ColInList<'a> {
+    col: &'a crate::expressions::Column,
+    guarantee: Guarantee,
+    list: Vec<&'a crate::expressions::Literal>,
+}
+
+impl<'a> ColInList<'a> {
+    /// Returns Some(ColInList) if the expression is either:
+    /// 1. `col <op> (literal1, literal2, ...)`
+    /// 3. operator is `in` or `not in`
+    ///
+    /// Returns None otherwise
+    fn try_new(inlist: &'a crate::expressions::InListExpr) -> Option<Self> {
+        // Only support single-column inlist currently, multi-column inlist is not supported
+        let col = inlist
+            .expr()
+            .as_any()
+            .downcast_ref::<crate::expressions::Column>()?;
+
+        let literals = inlist
+            .list()
+            .iter()
+            .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
+            .collect::<Option<Vec<_>>>()?;
+
+        let guarantee = if inlist.negated() {
+            Guarantee::NotIn
+        } else {
+            Guarantee::In
+        };
+
+        Some(Self {
+            col,
+            guarantee,
+            list: literals,
+        })
+    }
+}
+
+/// Represents a single `col [not]in literal` expression or a single `col <op> literal` expression
+enum ColOpLitOrInList<'a> {
+    ColOpLit(ColOpLit<'a>),
+    ColInList(ColInList<'a>),
+}
+
+impl<'a> ColOpLitOrInList<'a> {
+    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
+        match expr
+            .as_any()
+            .downcast_ref::<crate::expressions::InListExpr>()
+        {
+            Some(inlist) => Some(Self::ColInList(ColInList::try_new(inlist)?)),
+            None => ColOpLit::try_new(expr).map(Self::ColOpLit),
+        }
+    }
+
+    fn guarantee(&self) -> Guarantee {
+        match self {
+            Self::ColOpLit(col_op_lit) => col_op_lit.guarantee,
+            Self::ColInList(col_in_list) => col_in_list.guarantee,
+        }
+    }
+
+    fn col(&self) -> &'a crate::expressions::Column {
+        match self {
+            Self::ColOpLit(col_op_lit) => col_op_lit.col,
+            Self::ColInList(col_in_list) => col_in_list.col,
+        }
+    }
+
+    fn lits(&self) -> Vec<&'a crate::expressions::Literal> {
+        match self {
+            Self::ColOpLit(col_op_lit) => vec![col_op_lit.lit],
+            Self::ColInList(col_in_list) => col_in_list.list.clone(),
+        }
+    }
+}
+
+/// Find columns that appear in all termsets
+fn find_common_columns<'a>(
+    termsets: &[Vec<ColOpLitOrInList<'a>>],
+) -> Vec<&'a crate::expressions::Column> {
+    if termsets.is_empty() {
+        return Vec::new();
+    }
+
+    // Start with columns from the first termset
+    let mut common_cols: HashSet<_> = termsets[0].iter().map(|term| term.col()).collect();
+
+    // check if any common_col in one termset occur many times
+    // e.g. (a = 1 AND a = 2) OR (a = 2 AND b = 3), should not infer a guarantee
+    // TODO: for above case, we can infer a IN (2) AND b IN (3)
+    if common_cols.len() != termsets[0].len() {
+        return Vec::new();
+    }
+
+    // Intersect with columns from remaining termsets
+    for termset in termsets.iter().skip(1) {
+        let termset_cols: HashSet<_> = termset.iter().map(|term| term.col()).collect();
+        if termset_cols.len() != termset.len() {
+            return Vec::new();
+        }
+        common_cols = common_cols.intersection(&termset_cols).cloned().collect();
+    }
+
+    common_cols.into_iter().collect()
+}
+
 #[cfg(test)]
 mod test {
     use std::sync::LazyLock;
@@ -419,7 +550,7 @@ mod test {
 
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_expr::expr_fn::*;
-    use datafusion_expr::{lit, Expr};
+    use datafusion_expr::{Expr, lit};
 
     use itertools::Itertools;
 
@@ -808,12 +939,11 @@ mod test {
             vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
         );
         // b IN (1, 2, 3) OR b = 2
-        // TODO this should be in_guarantee("b", [1, 2, 3]) but currently we don't support to analyze this kind of disjunction. Only `ColOpLit OR ColOpLit` is supported.
         test_analyze(
             col("b")
                 .in_list(vec![lit(1), lit(2), lit(3)], false)
                 .or(col("b").eq(lit(2))),
-            vec![],
+            vec![in_guarantee("b", [1, 2, 3])],
         );
         // b IN (1, 2, 3) OR b != 3
         test_analyze(
@@ -824,13 +954,123 @@ mod test {
         );
     }
 
+    #[test]
+    fn test_disjunction_and_conjunction_multi_column() {
+        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
+            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1, 2])],
+        );
+        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (b = 3)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
+                .or(col("b").eq(lit(3))),
+            vec![in_guarantee("b", [1, 2, 3])],
+        );
+        // (a = "foo" AND b = 1) OR (a = "bar" AND b = 2) OR (c = 3)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2))))
+                .or(col("c").eq(lit(3))),
+            vec![],
+        );
+        // (a = "foo" AND b > 1) OR (a = "bar" AND b = 2)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").gt(lit(1))))
+                .or(col("a").eq(lit("bar")).and(col("b").eq(lit(2)))),
+            vec![in_guarantee("a", ["foo", "bar"])],
+        );
+        // (a = "foo" AND b = 1) OR (b = 1 AND c = 2) OR (c = 3 AND a = "bar")
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("b").eq(lit(1)).and(col("c").eq(lit(2))))
+                .or(col("c").eq(lit(3)).and(col("a").eq(lit("bar")))),
+            vec![],
+        );
+        // (a = "foo" AND a = "bar") OR (a = "good" AND b = 1)
+        // TODO: this should be `a IN ("good") AND b IN (1)`
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))))
+                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
+            vec![],
+        );
+        // (a = "foo" AND a = "foo") OR (a = "good" AND b = 1)
+        // TODO: this should be `a IN ("foo", "good")`
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("a").eq(lit("foo"))))
+                .or(col("a").eq(lit("good")).and(col("b").eq(lit(1)))),
+            vec![],
+        );
+        // (a = "foo" AND b = 3) OR (b = 4 AND b = 1) OR (b = 2 AND a = "bar")
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(3))))
+                .or(col("b").eq(lit(4)).and(col("b").eq(lit(1))))
+                .or(col("b").eq(lit(2)).and(col("a").eq(lit("bar")))),
+            vec![],
+        );
+        // (b = 1 AND b > 3) OR (a = "foo" AND b = 4)
+        test_analyze(
+            (col("b").eq(lit(1)).and(col("b").gt(lit(3))))
+                .or(col("a").eq(lit("foo")).and(col("b").eq(lit(4)))),
+            // if b isn't 1 or 4, it can not be true (though the expression actually can never be true)
+            vec![in_guarantee("b", [1, 4])],
+        );
+        // (a = "foo" AND b = 1) OR (a != "bar" AND b = 2)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("a").not_eq(lit("bar")).and(col("b").eq(lit(2)))),
+            vec![in_guarantee("b", [1, 2])],
+        );
+        // (a = "foo" AND b = 1) OR (a LIKE "%bar" AND b = 2)
+        test_analyze(
+            (col("a").eq(lit("foo")).and(col("b").eq(lit(1))))
+                .or(col("a").like(lit("%bar")).and(col("b").eq(lit(2)))),
+            vec![in_guarantee("b", [1, 2])],
+        );
+        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo", "bar") AND b = 6)
+        test_analyze(
+            (col("a")
+                .in_list(vec![lit("foo"), lit("bar")], false)
+                .and(col("b").eq(lit(5))))
+            .or(col("a")
+                .in_list(vec![lit("foo"), lit("bar")], false)
+                .and(col("b").eq(lit(6)))),
+            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
+        );
+        // (a IN ("foo", "bar") AND b = 5) OR (a IN ("foo") AND b = 6)
+        test_analyze(
+            (col("a")
+                .in_list(vec![lit("foo"), lit("bar")], false)
+                .and(col("b").eq(lit(5))))
+            .or(col("a")
+                .in_list(vec![lit("foo")], false)
+                .and(col("b").eq(lit(6)))),
+            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [5, 6])],
+        );
+        // (a NOT IN ("foo", "bar") AND b = 5) OR (a NOT IN ("foo") AND b = 6)
+        test_analyze(
+            (col("a")
+                .in_list(vec![lit("foo"), lit("bar")], true)
+                .and(col("b").eq(lit(5))))
+            .or(col("a")
+                .in_list(vec![lit("foo")], true)
+                .and(col("b").eq(lit(6)))),
+            vec![in_guarantee("b", [5, 6])],
+        );
+    }
+
     /// Tests that analyzing expr results in the expected guarantees
     fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
         println!("Begin analyze of {expr}");
         let schema = schema();
         let physical_expr = logical2physical(&expr, &schema);
 
-        let actual = LiteralGuarantee::analyze(&physical_expr);
+        let actual = LiteralGuarantee::analyze(&physical_expr)
+            .into_iter()
+            .sorted_by_key(|g| g.column.name().to_string())
+            .collect::<Vec<_>>();
         assert_eq!(
             expected, actual,
             "expr: {expr}\
@@ -867,6 +1107,7 @@ mod test {
             Arc::new(Schema::new(vec![
                 Field::new("a", DataType::Utf8, false),
                 Field::new("b", DataType::Int32, false),
+                Field::new("c", DataType::Int32, false),
             ]))
         });
         Arc::clone(&SCHEMA)
diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs
index b4d0758fd2e81..2cdc326f5dd36 100644
--- a/datafusion/physical-expr/src/utils/mod.rs
+++ b/datafusion/physical-expr/src/utils/mod.rs
@@ -21,20 +21,18 @@ pub use guarantee::{Guarantee, LiteralGuarantee};
 use std::borrow::Borrow;
 use std::sync::Arc;
 
-use crate::expressions::{BinaryExpr, Column};
-use crate::tree_node::ExprContext;
 use crate::PhysicalExpr;
 use crate::PhysicalSortExpr;
+use crate::expressions::{BinaryExpr, Column};
+use crate::tree_node::ExprContext;
 
-use arrow::datatypes::SchemaRef;
+use arrow::datatypes::Schema;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
 use datafusion_common::{HashMap, HashSet, Result};
 use datafusion_expr::Operator;
 
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use itertools::Itertools;
 use petgraph::graph::NodeIndex;
 use petgraph::stable_graph::StableGraph;
 
@@ -231,7 +229,7 @@ pub fn collect_columns(expr: &Arc<dyn PhysicalExpr>) -> HashSet<Column> {
     let mut columns = HashSet::<Column>::new();
     expr.apply(|expr| {
         if let Some(column) = expr.as_any().downcast_ref::<Column>() {
-            columns.get_or_insert_owned(column);
+            columns.get_or_insert_with(column, |c| c.clone());
         }
         Ok(TreeNodeRecursion::Continue)
     })
@@ -240,22 +238,23 @@ pub fn collect_columns(expr: &Arc<dyn PhysicalExpr>) -> HashSet<Column> {
     columns
 }
 
-/// Re-assign column indices referenced in predicate according to given schema.
-/// This may be helpful when dealing with projections.
-pub fn reassign_predicate_columns(
-    pred: Arc<dyn PhysicalExpr>,
-    schema: &SchemaRef,
-    ignore_not_found: bool,
+/// Re-assign indices of [`Column`]s within the given [`PhysicalExpr`] according to
+/// the provided [`Schema`].
+///
+/// This can be useful when attempting to map an expression onto a different schema.
+///
+/// # Errors
+///
+/// This function will return an error if any column in the expression cannot be found
+/// in the provided schema.
+pub fn reassign_expr_columns(
+    expr: Arc<dyn PhysicalExpr>,
+    schema: &Schema,
 ) -> Result<Arc<dyn PhysicalExpr>> {
-    pred.transform_down(|expr| {
-        let expr_any = expr.as_any();
-
-        if let Some(column) = expr_any.downcast_ref::<Column>() {
-            let index = match schema.index_of(column.name()) {
-                Ok(idx) => idx,
-                Err(_) if ignore_not_found => usize::MAX,
-                Err(e) => return Err(e.into()),
-            };
+    expr.transform_down(|expr| {
+        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+            let index = schema.index_of(column.name())?;
+
             return Ok(Transformed::yes(Arc::new(Column::new(
                 column.name(),
                 index,
@@ -266,26 +265,17 @@ pub fn reassign_predicate_columns(
     .data()
 }
 
-/// Merge left and right sort expressions, checking for duplicates.
-pub fn merge_vectors(left: &LexOrdering, right: &LexOrdering) -> LexOrdering {
-    left.iter()
-        .cloned()
-        .chain(right.iter().cloned())
-        .unique()
-        .collect()
-}
-
 #[cfg(test)]
 pub(crate) mod tests {
     use std::any::Any;
     use std::fmt::{Display, Formatter};
 
     use super::*;
-    use crate::expressions::{binary, cast, col, in_list, lit, Literal};
+    use crate::expressions::{Literal, binary, cast, col, in_list, lit};
 
     use arrow::array::{ArrayRef, Float32Array, Float64Array};
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::{exec_err, DataFusionError, ScalarValue};
+    use datafusion_common::{ScalarValue, exec_err, internal_datafusion_err};
     use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
     use datafusion_expr::{
         ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
@@ -293,7 +283,7 @@ pub(crate) mod tests {
 
     use petgraph::visit::Bfs;
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     pub struct TestScalarUDF {
         pub(crate) signature: Signature,
     }
@@ -345,11 +335,11 @@ pub(crate) mod tests {
                         .as_any()
                         .downcast_ref::<Float64Array>()
                         .ok_or_else(|| {
-                            DataFusionError::Internal(format!(
+                            internal_datafusion_err!(
                                 "could not cast {} to {}",
                                 self.name(),
                                 std::any::type_name::<Float64Array>()
-                            ))
+                            )
                         })?;
 
                     arg.iter()
@@ -361,11 +351,11 @@ pub(crate) mod tests {
                         .as_any()
                         .downcast_ref::<Float32Array>()
                         .ok_or_else(|| {
-                            DataFusionError::Internal(format!(
+                            internal_datafusion_err!(
                                 "could not cast {} to {}",
                                 self.name(),
                                 std::any::type_name::<Float32Array>()
-                            ))
+                            )
                         })?;
 
                     arg.iter()
@@ -517,7 +507,7 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn test_reassign_predicate_columns_in_list() {
+    fn test_reassign_expr_columns_in_list() {
         let int_field = Field::new("should_not_matter", DataType::Int64, true);
         let dict_field = Field::new(
             "id",
@@ -537,7 +527,7 @@ pub(crate) mod tests {
         )
         .unwrap();
 
-        let actual = reassign_predicate_columns(pred, &schema_small, false).unwrap();
+        let actual = reassign_expr_columns(pred, &schema_small).unwrap();
 
         let expected = in_list(
             Arc::new(Column::new_with_schema("id", &schema_small).unwrap()),
diff --git a/datafusion/physical-expr/src/window/aggregate.rs b/datafusion/physical-expr/src/window/aggregate.rs
index 9b959796136a9..1ff13d107c036 100644
--- a/datafusion/physical-expr/src/window/aggregate.rs
+++ b/datafusion/physical-expr/src/window/aggregate.rs
@@ -23,19 +23,19 @@ use std::sync::Arc;
 
 use crate::aggregate::AggregateFunctionExpr;
 use crate::window::standard::add_new_ordering_expr_with_partition_by;
-use crate::window::window_expr::AggregateWindowExpr;
+use crate::window::window_expr::{AggregateWindowExpr, WindowFn, filter_array};
 use crate::window::{
     PartitionBatches, PartitionWindowAggStates, SlidingAggregateWindowExpr, WindowExpr,
 };
-use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr};
+use crate::{EquivalenceProperties, PhysicalExpr};
 
-use arrow::array::Array;
 use arrow::array::ArrayRef;
+use arrow::array::BooleanArray;
 use arrow::datatypes::FieldRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{DataFusionError, Result, ScalarValue};
-use datafusion_expr::{Accumulator, WindowFrame};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_common::{Result, ScalarValue, exec_datafusion_err};
+use datafusion_expr::{Accumulator, WindowFrame, WindowFrameBound, WindowFrameUnits};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
 /// A window expr that takes the form of an aggregate function.
 ///
@@ -44,8 +44,10 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering;
 pub struct PlainAggregateWindowExpr {
     aggregate: Arc<AggregateFunctionExpr>,
     partition_by: Vec<Arc<dyn PhysicalExpr>>,
-    order_by: LexOrdering,
+    order_by: Vec<PhysicalSortExpr>,
     window_frame: Arc<WindowFrame>,
+    is_constant_in_partition: bool,
+    filter: Option<Arc<dyn PhysicalExpr>>,
 }
 
 impl PlainAggregateWindowExpr {
@@ -53,14 +55,19 @@ impl PlainAggregateWindowExpr {
     pub fn new(
         aggregate: Arc<AggregateFunctionExpr>,
         partition_by: &[Arc<dyn PhysicalExpr>],
-        order_by: &LexOrdering,
+        order_by: &[PhysicalSortExpr],
         window_frame: Arc<WindowFrame>,
+        filter: Option<Arc<dyn PhysicalExpr>>,
     ) -> Self {
+        let is_constant_in_partition =
+            Self::is_window_constant_in_partition(order_by, &window_frame);
         Self {
             aggregate,
             partition_by: partition_by.to_vec(),
-            order_by: order_by.clone(),
+            order_by: order_by.to_vec(),
             window_frame,
+            is_constant_in_partition,
+            filter,
         }
     }
 
@@ -73,7 +80,7 @@ impl PlainAggregateWindowExpr {
         &self,
         eq_properties: &mut EquivalenceProperties,
         window_expr_index: usize,
-    ) {
+    ) -> Result<()> {
         if let Some(expr) = self
             .get_aggregate_expr()
             .get_result_ordering(window_expr_index)
@@ -82,8 +89,33 @@ impl PlainAggregateWindowExpr {
                 eq_properties,
                 expr,
                 &self.partition_by,
-            );
+            )?;
         }
+        Ok(())
+    }
+
+    // Returns true if every row in the partition has the same window frame. This allows
+    // for preventing bound + function calculation for every row due to the values being the
+    // same.
+    //
+    // This occurs when both bounds fall under either condition below:
+    //  1. Bound is unbounded (`Preceding` or `Following`)
+    //  2. Bound is `CurrentRow` while using `Range` units with no order by clause
+    //  This results in an invalid range specification. Following PostgreSQL’s convention,
+    //  we interpret this as the entire partition being used for the current window frame.
+    fn is_window_constant_in_partition(
+        order_by: &[PhysicalSortExpr],
+        window_frame: &WindowFrame,
+    ) -> bool {
+        let is_constant_bound = |bound: &WindowFrameBound| match bound {
+            WindowFrameBound::CurrentRow => {
+                window_frame.units == WindowFrameUnits::Range && order_by.is_empty()
+            }
+            _ => bound.is_unbounded(),
+        };
+
+        is_constant_bound(&window_frame.start_bound)
+            && is_constant_bound(&window_frame.end_bound)
     }
 }
 
@@ -125,10 +157,9 @@ impl WindowExpr for PlainAggregateWindowExpr {
         // This enables us to run queries involving UNBOUNDED PRECEDING frames
         // using bounded memory for suitable aggregations.
         for partition_row in partition_batches.keys() {
-            let window_state =
-                window_agg_state.get_mut(partition_row).ok_or_else(|| {
-                    DataFusionError::Execution("Cannot find state".to_string())
-                })?;
+            let window_state = window_agg_state
+                .get_mut(partition_row)
+                .ok_or_else(|| exec_datafusion_err!("Cannot find state"))?;
             let state = &mut window_state.state;
             if self.window_frame.start_bound.is_unbounded() {
                 state.window_frame_range.start =
@@ -142,8 +173,8 @@ impl WindowExpr for PlainAggregateWindowExpr {
         &self.partition_by
     }
 
-    fn order_by(&self) -> &LexOrdering {
-        self.order_by.as_ref()
+    fn order_by(&self) -> &[PhysicalSortExpr] {
+        &self.order_by
     }
 
     fn get_window_frame(&self) -> &Arc<WindowFrame> {
@@ -157,15 +188,25 @@ impl WindowExpr for PlainAggregateWindowExpr {
                 Arc::new(PlainAggregateWindowExpr::new(
                     Arc::new(reverse_expr),
                     &self.partition_by.clone(),
-                    reverse_order_bys(self.order_by.as_ref()).as_ref(),
+                    &self
+                        .order_by
+                        .iter()
+                        .map(|e| e.reverse())
+                        .collect::<Vec<_>>(),
                     Arc::new(self.window_frame.reverse()),
+                    self.filter.clone(),
                 )) as _
             } else {
                 Arc::new(SlidingAggregateWindowExpr::new(
                     Arc::new(reverse_expr),
                     &self.partition_by.clone(),
-                    reverse_order_bys(self.order_by.as_ref()).as_ref(),
+                    &self
+                        .order_by
+                        .iter()
+                        .map(|e| e.reverse())
+                        .collect::<Vec<_>>(),
                     Arc::new(self.window_frame.reverse()),
+                    self.filter.clone(),
                 )) as _
             }
         })
@@ -174,6 +215,10 @@ impl WindowExpr for PlainAggregateWindowExpr {
     fn uses_bounded_memory(&self) -> bool {
         !self.window_frame.end_bound.is_unbounded()
     }
+
+    fn create_window_fn(&self) -> Result<WindowFn> {
+        Ok(WindowFn::Aggregate(self.get_accumulator()?))
+    }
 }
 
 impl AggregateWindowExpr for PlainAggregateWindowExpr {
@@ -181,6 +226,10 @@ impl AggregateWindowExpr for PlainAggregateWindowExpr {
         self.aggregate.create_accumulator()
     }
 
+    fn filter_expr(&self) -> Option<&Arc<dyn PhysicalExpr>> {
+        self.filter.as_ref()
+    }
+
     /// For a given range, calculate accumulation result inside the range on
     /// `value_slice` and update accumulator state.
     // We assume that `cur_range` contains `last_range` and their start points
@@ -192,6 +241,7 @@ impl AggregateWindowExpr for PlainAggregateWindowExpr {
         cur_range: &Range<usize>,
         value_slice: &[ArrayRef],
         accumulator: &mut Box<dyn Accumulator>,
+        filter_mask: Option<&BooleanArray>,
     ) -> Result<ScalarValue> {
         if cur_range.start == cur_range.end {
             self.aggregate
@@ -204,13 +254,23 @@ impl AggregateWindowExpr for PlainAggregateWindowExpr {
             // same point (i.e. the beginning of the table/frame). Hence, we
             // do not call `retract_batch`.
             if update_bound > 0 {
+                let slice_mask =
+                    filter_mask.map(|m| m.slice(last_range.end, update_bound));
                 let update: Vec<ArrayRef> = value_slice
                     .iter()
                     .map(|v| v.slice(last_range.end, update_bound))
-                    .collect();
+                    .map(|arr| match &slice_mask {
+                        Some(m) => filter_array(&arr, m),
+                        None => Ok(arr),
+                    })
+                    .collect::<Result<Vec<_>>>()?;
                 accumulator.update_batch(&update)?
             }
             accumulator.evaluate()
         }
     }
+
+    fn is_constant_in_partition(&self) -> bool {
+        self.is_constant_in_partition
+    }
 }
diff --git a/datafusion/physical-expr/src/window/mod.rs b/datafusion/physical-expr/src/window/mod.rs
index bc7c716783bdc..b45e35440ac20 100644
--- a/datafusion/physical-expr/src/window/mod.rs
+++ b/datafusion/physical-expr/src/window/mod.rs
@@ -21,12 +21,6 @@ mod standard;
 mod standard_window_function_expr;
 mod window_expr;
 
-#[deprecated(since = "44.0.0", note = "use StandardWindowExpr")]
-pub type BuiltInWindowExpr = StandardWindowExpr;
-
-#[deprecated(since = "44.0.0", note = "use StandardWindowFunctionExpr")]
-pub type BuiltInWindowFunctionExpr = dyn StandardWindowFunctionExpr;
-
 pub use aggregate::PlainAggregateWindowExpr;
 pub use sliding_aggregate::SlidingAggregateWindowExpr;
 pub use standard::StandardWindowExpr;
diff --git a/datafusion/physical-expr/src/window/sliding_aggregate.rs b/datafusion/physical-expr/src/window/sliding_aggregate.rs
index 2b22299f9386b..a71df3ec88472 100644
--- a/datafusion/physical-expr/src/window/sliding_aggregate.rs
+++ b/datafusion/physical-expr/src/window/sliding_aggregate.rs
@@ -22,18 +22,17 @@ use std::ops::Range;
 use std::sync::Arc;
 
 use crate::aggregate::AggregateFunctionExpr;
-use crate::window::window_expr::AggregateWindowExpr;
+use crate::window::window_expr::{AggregateWindowExpr, WindowFn, filter_array};
 use crate::window::{
     PartitionBatches, PartitionWindowAggStates, PlainAggregateWindowExpr, WindowExpr,
 };
-use crate::{expressions::PhysicalSortExpr, reverse_order_bys, PhysicalExpr};
+use crate::{PhysicalExpr, expressions::PhysicalSortExpr};
 
-use arrow::array::{Array, ArrayRef};
+use arrow::array::{ArrayRef, BooleanArray};
 use arrow::datatypes::FieldRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{Accumulator, WindowFrame};
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 /// A window expr that takes the form of an aggregate function that
 /// can be incrementally computed over sliding windows.
@@ -43,8 +42,9 @@ use datafusion_physical_expr_common::sort_expr::LexOrdering;
 pub struct SlidingAggregateWindowExpr {
     aggregate: Arc<AggregateFunctionExpr>,
     partition_by: Vec<Arc<dyn PhysicalExpr>>,
-    order_by: LexOrdering,
+    order_by: Vec<PhysicalSortExpr>,
     window_frame: Arc<WindowFrame>,
+    filter: Option<Arc<dyn PhysicalExpr>>,
 }
 
 impl SlidingAggregateWindowExpr {
@@ -52,14 +52,16 @@ impl SlidingAggregateWindowExpr {
     pub fn new(
         aggregate: Arc<AggregateFunctionExpr>,
         partition_by: &[Arc<dyn PhysicalExpr>],
-        order_by: &LexOrdering,
+        order_by: &[PhysicalSortExpr],
         window_frame: Arc<WindowFrame>,
+        filter: Option<Arc<dyn PhysicalExpr>>,
     ) -> Self {
         Self {
             aggregate,
             partition_by: partition_by.to_vec(),
-            order_by: order_by.clone(),
+            order_by: order_by.to_vec(),
             window_frame,
+            filter,
         }
     }
 
@@ -108,8 +110,8 @@ impl WindowExpr for SlidingAggregateWindowExpr {
         &self.partition_by
     }
 
-    fn order_by(&self) -> &LexOrdering {
-        self.order_by.as_ref()
+    fn order_by(&self) -> &[PhysicalSortExpr] {
+        &self.order_by
     }
 
     fn get_window_frame(&self) -> &Arc<WindowFrame> {
@@ -123,15 +125,25 @@ impl WindowExpr for SlidingAggregateWindowExpr {
                 Arc::new(PlainAggregateWindowExpr::new(
                     Arc::new(reverse_expr),
                     &self.partition_by.clone(),
-                    reverse_order_bys(self.order_by.as_ref()).as_ref(),
+                    &self
+                        .order_by
+                        .iter()
+                        .map(|e| e.reverse())
+                        .collect::<Vec<_>>(),
                     Arc::new(self.window_frame.reverse()),
+                    self.filter.clone(),
                 )) as _
             } else {
                 Arc::new(SlidingAggregateWindowExpr::new(
                     Arc::new(reverse_expr),
                     &self.partition_by.clone(),
-                    reverse_order_bys(self.order_by.as_ref()).as_ref(),
+                    &self
+                        .order_by
+                        .iter()
+                        .map(|e| e.reverse())
+                        .collect::<Vec<_>>(),
                     Arc::new(self.window_frame.reverse()),
+                    self.filter.clone(),
                 )) as _
             }
         })
@@ -157,7 +169,7 @@ impl WindowExpr for SlidingAggregateWindowExpr {
                 expr: new_expr,
                 options: req.options,
             })
-            .collect::<LexOrdering>();
+            .collect();
         Some(Arc::new(SlidingAggregateWindowExpr {
             aggregate: self
                 .aggregate
@@ -166,8 +178,13 @@ impl WindowExpr for SlidingAggregateWindowExpr {
             partition_by: partition_bys,
             order_by: new_order_by,
             window_frame: Arc::clone(&self.window_frame),
+            filter: self.filter.clone(),
         }))
     }
+
+    fn create_window_fn(&self) -> Result<WindowFn> {
+        Ok(WindowFn::Aggregate(self.get_accumulator()?))
+    }
 }
 
 impl AggregateWindowExpr for SlidingAggregateWindowExpr {
@@ -175,6 +192,10 @@ impl AggregateWindowExpr for SlidingAggregateWindowExpr {
         self.aggregate.create_sliding_accumulator()
     }
 
+    fn filter_expr(&self) -> Option<&Arc<dyn PhysicalExpr>> {
+        self.filter.as_ref()
+    }
+
     /// Given current range and the last range, calculates the accumulator
     /// result for the range of interest.
     fn get_aggregate_result_inside_range(
@@ -183,6 +204,7 @@ impl AggregateWindowExpr for SlidingAggregateWindowExpr {
         cur_range: &Range<usize>,
         value_slice: &[ArrayRef],
         accumulator: &mut Box<dyn Accumulator>,
+        filter_mask: Option<&BooleanArray>,
     ) -> Result<ScalarValue> {
         if cur_range.start == cur_range.end {
             self.aggregate
@@ -191,23 +213,39 @@ impl AggregateWindowExpr for SlidingAggregateWindowExpr {
             // Accumulate any new rows that have entered the window:
             let update_bound = cur_range.end - last_range.end;
             if update_bound > 0 {
+                let slice_mask =
+                    filter_mask.map(|m| m.slice(last_range.end, update_bound));
                 let update: Vec<ArrayRef> = value_slice
                     .iter()
                     .map(|v| v.slice(last_range.end, update_bound))
-                    .collect();
+                    .map(|arr| match &slice_mask {
+                        Some(m) => filter_array(&arr, m),
+                        None => Ok(arr),
+                    })
+                    .collect::<Result<Vec<_>>>()?;
                 accumulator.update_batch(&update)?
             }
 
             // Remove rows that have now left the window:
             let retract_bound = cur_range.start - last_range.start;
             if retract_bound > 0 {
+                let slice_mask =
+                    filter_mask.map(|m| m.slice(last_range.start, retract_bound));
                 let retract: Vec<ArrayRef> = value_slice
                     .iter()
                     .map(|v| v.slice(last_range.start, retract_bound))
-                    .collect();
+                    .map(|arr| match &slice_mask {
+                        Some(m) => filter_array(&arr, m),
+                        None => Ok(arr),
+                    })
+                    .collect::<Result<Vec<_>>>()?;
                 accumulator.retract_batch(&retract)?
             }
             accumulator.evaluate()
         }
     }
+
+    fn is_constant_in_partition(&self) -> bool {
+        false
+    }
 }
diff --git a/datafusion/physical-expr/src/window/standard.rs b/datafusion/physical-expr/src/window/standard.rs
index 73f47b0b68632..f8d92d5de4ad5 100644
--- a/datafusion/physical-expr/src/window/standard.rs
+++ b/datafusion/physical-expr/src/window/standard.rs
@@ -22,25 +22,25 @@ use std::ops::Range;
 use std::sync::Arc;
 
 use super::{StandardWindowFunctionExpr, WindowExpr};
-use crate::window::window_expr::{get_orderby_values, WindowFn};
+use crate::window::window_expr::{WindowFn, get_orderby_values};
 use crate::window::{PartitionBatches, PartitionWindowAggStates, WindowState};
-use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr};
-use arrow::array::{new_empty_array, ArrayRef};
-use arrow::compute::SortOptions;
+use crate::{EquivalenceProperties, PhysicalExpr};
+
+use arrow::array::{ArrayRef, new_empty_array};
 use arrow::datatypes::FieldRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::utils::evaluate_partition_ranges;
 use datafusion_common::{Result, ScalarValue};
-use datafusion_expr::window_state::{WindowAggState, WindowFrameContext};
 use datafusion_expr::WindowFrame;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion_expr::window_state::{WindowAggState, WindowFrameContext};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
 /// A window expr that takes the form of a [`StandardWindowFunctionExpr`].
 #[derive(Debug)]
 pub struct StandardWindowExpr {
     expr: Arc<dyn StandardWindowFunctionExpr>,
     partition_by: Vec<Arc<dyn PhysicalExpr>>,
-    order_by: LexOrdering,
+    order_by: Vec<PhysicalSortExpr>,
     window_frame: Arc<WindowFrame>,
 }
 
@@ -49,13 +49,13 @@ impl StandardWindowExpr {
     pub fn new(
         expr: Arc<dyn StandardWindowFunctionExpr>,
         partition_by: &[Arc<dyn PhysicalExpr>],
-        order_by: &LexOrdering,
+        order_by: &[PhysicalSortExpr],
         window_frame: Arc<WindowFrame>,
     ) -> Self {
         Self {
             expr,
             partition_by: partition_by.to_vec(),
-            order_by: order_by.clone(),
+            order_by: order_by.to_vec(),
             window_frame,
         }
     }
@@ -70,15 +70,19 @@ impl StandardWindowExpr {
     /// If `self.expr` doesn't have an ordering, ordering equivalence properties
     /// are not updated. Otherwise, ordering equivalence properties are updated
     /// by the ordering of `self.expr`.
-    pub fn add_equal_orderings(&self, eq_properties: &mut EquivalenceProperties) {
+    pub fn add_equal_orderings(
+        &self,
+        eq_properties: &mut EquivalenceProperties,
+    ) -> Result<()> {
         let schema = eq_properties.schema();
         if let Some(fn_res_ordering) = self.expr.get_result_ordering(schema) {
             add_new_ordering_expr_with_partition_by(
                 eq_properties,
                 fn_res_ordering,
                 &self.partition_by,
-            );
+            )?;
         }
+        Ok(())
     }
 }
 
@@ -104,16 +108,15 @@ impl WindowExpr for StandardWindowExpr {
         &self.partition_by
     }
 
-    fn order_by(&self) -> &LexOrdering {
-        self.order_by.as_ref()
+    fn order_by(&self) -> &[PhysicalSortExpr] {
+        &self.order_by
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef> {
         let mut evaluator = self.expr.create_evaluator()?;
         let num_rows = batch.num_rows();
         if evaluator.uses_window_frame() {
-            let sort_options: Vec<SortOptions> =
-                self.order_by.iter().map(|o| o.options).collect();
+            let sort_options = self.order_by.iter().map(|o| o.options).collect();
             let mut row_wise_results = vec![];
 
             let mut values = self.evaluate_args(batch)?;
@@ -158,6 +161,9 @@ impl WindowExpr for StandardWindowExpr {
         let field = self.expr.field()?;
         let out_type = field.data_type();
         let sort_options = self.order_by.iter().map(|o| o.options).collect::<Vec<_>>();
+        // create a WindowAggState to clone when `window_agg_state` does not contain the respective
+        // group, which is faster than potentially creating a new one at every iteration
+        let new_state = WindowAggState::new(out_type)?;
         for (partition_row, partition_batch_state) in partition_batches.iter() {
             let window_state =
                 if let Some(window_state) = window_agg_state.get_mut(partition_row) {
@@ -167,7 +173,7 @@ impl WindowExpr for StandardWindowExpr {
                     window_agg_state
                         .entry(partition_row.clone())
                         .or_insert(WindowState {
-                            state: WindowAggState::new(out_type)?,
+                            state: new_state.clone(),
                             window_fn: WindowFn::Builtin(evaluator),
                         })
                 };
@@ -232,6 +238,9 @@ impl WindowExpr for StandardWindowExpr {
             }
             let out_col = if row_wise_results.is_empty() {
                 new_empty_array(out_type)
+            } else if row_wise_results.len() == 1 {
+                // fast path when the result only has a single row
+                row_wise_results[0].to_array()?
             } else {
                 ScalarValue::iter_to_array(row_wise_results.into_iter())?
             };
@@ -253,7 +262,11 @@ impl WindowExpr for StandardWindowExpr {
             Arc::new(StandardWindowExpr::new(
                 reverse_expr,
                 &self.partition_by.clone(),
-                reverse_order_bys(self.order_by.as_ref()).as_ref(),
+                &self
+                    .order_by
+                    .iter()
+                    .map(|e| e.reverse())
+                    .collect::<Vec<_>>(),
                 Arc::new(self.window_frame.reverse()),
             )) as _
         })
@@ -268,6 +281,10 @@ impl WindowExpr for StandardWindowExpr {
             false
         }
     }
+
+    fn create_window_fn(&self) -> Result<WindowFn> {
+        Ok(WindowFn::Builtin(self.expr.create_evaluator()?))
+    }
 }
 
 /// Adds a new ordering expression into existing ordering equivalence class(es) based on
@@ -276,10 +293,10 @@ pub(crate) fn add_new_ordering_expr_with_partition_by(
     eqp: &mut EquivalenceProperties,
     expr: PhysicalSortExpr,
     partition_by: &[Arc<dyn PhysicalExpr>],
-) {
+) -> Result<()> {
     if partition_by.is_empty() {
         // In the absence of a PARTITION BY, ordering of `self.expr` is global:
-        eqp.add_new_orderings([LexOrdering::new(vec![expr])]);
+        eqp.add_ordering([expr]);
     } else {
         // If we have a PARTITION BY, standard functions can not introduce
         // a global ordering unless the existing ordering is compatible
@@ -287,10 +304,11 @@ pub(crate) fn add_new_ordering_expr_with_partition_by(
         // expressions and existing ordering expressions are equal (w.r.t.
         // set equality), we can prefix the ordering of `self.expr` with
         // the existing ordering.
-        let (mut ordering, _) = eqp.find_longest_permutation(partition_by);
+        let (mut ordering, _) = eqp.find_longest_permutation(partition_by)?;
         if ordering.len() == partition_by.len() {
             ordering.push(expr);
-            eqp.add_new_orderings([ordering]);
+            eqp.add_ordering(ordering);
         }
     }
+    Ok(())
 }
diff --git a/datafusion/physical-expr/src/window/standard_window_function_expr.rs b/datafusion/physical-expr/src/window/standard_window_function_expr.rs
index 871f735e9a963..a6ea5e44a4997 100644
--- a/datafusion/physical-expr/src/window/standard_window_function_expr.rs
+++ b/datafusion/physical-expr/src/window/standard_window_function_expr.rs
@@ -21,20 +21,20 @@ use arrow::array::ArrayRef;
 use arrow::datatypes::{FieldRef, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_common::Result;
-use datafusion_expr::PartitionEvaluator;
+use datafusion_expr::{LimitEffect, PartitionEvaluator};
 
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use std::any::Any;
 use std::sync::Arc;
 
 /// Evaluates a window function by instantiating a
-/// `[PartitionEvaluator]` for calculating the function's output in
+/// [`PartitionEvaluator`] for calculating the function's output in
 /// that partition.
 ///
 /// Note that unlike aggregation based window functions, some window
 /// functions such as `rank` ignore the values in the window frame,
 /// but others such as `first_value`, `last_value`, and
 /// `nth_value` need the value.
-#[allow(rustdoc::private_intra_doc_links)]
 pub trait StandardWindowFunctionExpr: Send + Sync + std::fmt::Debug {
     /// Returns the aggregate expression as [`Any`] so that it can be
     /// downcast to a specific implementation.
@@ -57,13 +57,7 @@ pub trait StandardWindowFunctionExpr: Send + Sync + std::fmt::Debug {
     ///
     /// Typically, the resulting vector is a single element vector.
     fn evaluate_args(&self, batch: &RecordBatch) -> Result<Vec<ArrayRef>> {
-        self.expressions()
-            .iter()
-            .map(|e| {
-                e.evaluate(batch)
-                    .and_then(|v| v.into_array(batch.num_rows()))
-            })
-            .collect()
+        evaluate_expressions_to_arrays(&self.expressions(), batch)
     }
 
     /// Create a [`PartitionEvaluator`] for evaluating the function on
@@ -90,4 +84,6 @@ pub trait StandardWindowFunctionExpr: Send + Sync + std::fmt::Debug {
     fn get_result_ordering(&self, _schema: &SchemaRef) -> Option<PhysicalSortExpr> {
         None
     }
+
+    fn limit_effect(&self) -> LimitEffect;
 }
diff --git a/datafusion/physical-expr/src/window/window_expr.rs b/datafusion/physical-expr/src/window/window_expr.rs
index 8d72604a6af50..0f0ec647a50ae 100644
--- a/datafusion/physical-expr/src/window/window_expr.rs
+++ b/datafusion/physical-expr/src/window/window_expr.rs
@@ -20,20 +20,27 @@ use std::fmt::Debug;
 use std::ops::Range;
 use std::sync::Arc;
 
-use crate::{LexOrdering, PhysicalExpr};
+use crate::PhysicalExpr;
 
-use arrow::array::{new_empty_array, Array, ArrayRef};
-use arrow::compute::kernels::sort::SortColumn;
+use arrow::array::BooleanArray;
+use arrow::array::{Array, ArrayRef, new_empty_array};
 use arrow::compute::SortOptions;
+use arrow::compute::filter as arrow_filter;
+use arrow::compute::kernels::sort::SortColumn;
 use arrow::datatypes::FieldRef;
 use arrow::record_batch::RecordBatch;
+use datafusion_common::cast::as_boolean_array;
 use datafusion_common::utils::compare_rows;
-use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{
+    Result, ScalarValue, arrow_datafusion_err, exec_datafusion_err, internal_err,
+};
 use datafusion_expr::window_state::{
     PartitionBatchState, WindowAggState, WindowFrameContext, WindowFrameStateGroups,
 };
 use datafusion_expr::{Accumulator, PartitionEvaluator, WindowFrame, WindowFrameBound};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use indexmap::IndexMap;
 
 /// Common trait for [window function] implementations
@@ -83,13 +90,7 @@ pub trait WindowExpr: Send + Sync + Debug {
     /// Evaluate the window function arguments against the batch and return
     /// array ref, normally the resulting `Vec` is a single element one.
     fn evaluate_args(&self, batch: &RecordBatch) -> Result<Vec<ArrayRef>> {
-        self.expressions()
-            .iter()
-            .map(|e| {
-                e.evaluate(batch)
-                    .and_then(|v| v.into_array(batch.num_rows()))
-            })
-            .collect()
+        evaluate_expressions_to_arrays(&self.expressions(), batch)
     }
 
     /// Evaluate the window function values against the batch
@@ -109,14 +110,14 @@ pub trait WindowExpr: Send + Sync + Debug {
     fn partition_by(&self) -> &[Arc<dyn PhysicalExpr>];
 
     /// Expressions that's from the window function's order by clause, empty if absent
-    fn order_by(&self) -> &LexOrdering;
+    fn order_by(&self) -> &[PhysicalSortExpr];
 
     /// Get order by columns, empty if absent
     fn order_by_columns(&self, batch: &RecordBatch) -> Result<Vec<SortColumn>> {
         self.order_by()
             .iter()
             .map(|e| e.evaluate_to_sort_column(batch))
-            .collect::<Result<Vec<SortColumn>>>()
+            .collect()
     }
 
     /// Get the window frame of this [WindowExpr].
@@ -129,6 +130,12 @@ pub trait WindowExpr: Send + Sync + Debug {
     /// Get the reverse expression of this [WindowExpr].
     fn get_reverse_expr(&self) -> Option<Arc<dyn WindowExpr>>;
 
+    /// Creates a new instance of the window function evaluator.
+    ///
+    /// Returns `WindowFn::Builtin` for built-in window functions (e.g., ROW_NUMBER, RANK)
+    /// or `WindowFn::Aggregate` for aggregate window functions (e.g., SUM, AVG).
+    fn create_window_fn(&self) -> Result<WindowFn>;
+
     /// Returns all expressions used in the [`WindowExpr`].
     /// These expressions are (1) function arguments, (2) partition by expressions, (3) order by expressions.
     fn all_expressions(&self) -> WindowPhysicalExpressions {
@@ -138,7 +145,7 @@ pub trait WindowExpr: Send + Sync + Debug {
             .order_by()
             .iter()
             .map(|sort_expr| Arc::clone(&sort_expr.expr))
-            .collect::<Vec<_>>();
+            .collect();
         WindowPhysicalExpressions {
             args,
             partition_by_exprs,
@@ -176,6 +183,9 @@ pub trait AggregateWindowExpr: WindowExpr {
     /// (non-sliding) expressions will return sliding (normal) accumulators.
     fn get_accumulator(&self) -> Result<Box<dyn Accumulator>>;
 
+    /// Optional FILTER (WHERE ...) predicate for this window aggregate.
+    fn filter_expr(&self) -> Option<&Arc<dyn PhysicalExpr>>;
+
     /// Given current range and the last range, calculates the accumulator
     /// result for the range of interest.
     fn get_aggregate_result_inside_range(
@@ -184,14 +194,18 @@ pub trait AggregateWindowExpr: WindowExpr {
         cur_range: &Range<usize>,
         value_slice: &[ArrayRef],
         accumulator: &mut Box<dyn Accumulator>,
+        filter_mask: Option<&BooleanArray>,
     ) -> Result<ScalarValue>;
 
+    /// Indicates whether this window function always produces the same result
+    /// for all rows in the partition.
+    fn is_constant_in_partition(&self) -> bool;
+
     /// Evaluates the window function against the batch.
     fn aggregate_evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef> {
         let mut accumulator = self.get_accumulator()?;
         let mut last_range = Range { start: 0, end: 0 };
-        let sort_options: Vec<SortOptions> =
-            self.order_by().iter().map(|o| o.options).collect();
+        let sort_options = self.order_by().iter().map(|o| o.options).collect();
         let mut window_frame_ctx =
             WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options);
         self.get_result_column(
@@ -225,10 +239,9 @@ pub trait AggregateWindowExpr: WindowExpr {
                     },
                 );
             };
-            let window_state =
-                window_agg_state.get_mut(partition_row).ok_or_else(|| {
-                    DataFusionError::Execution("Cannot find state".to_string())
-                })?;
+            let window_state = window_agg_state
+                .get_mut(partition_row)
+                .ok_or_else(|| exec_datafusion_err!("Cannot find state"))?;
             let accumulator = match &mut window_state.window_fn {
                 WindowFn::Aggregate(accumulator) => accumulator,
                 _ => unreachable!(),
@@ -239,8 +252,7 @@ pub trait AggregateWindowExpr: WindowExpr {
 
             // If there is no window state context, initialize it.
             let window_frame_ctx = state.window_frame_ctx.get_or_insert_with(|| {
-                let sort_options: Vec<SortOptions> =
-                    self.order_by().iter().map(|o| o.options).collect();
+                let sort_options = self.order_by().iter().map(|o| o.options).collect();
                 WindowFrameContext::new(Arc::clone(self.get_window_frame()), sort_options)
             });
             let out_col = self.get_result_column(
@@ -260,7 +272,16 @@ pub trait AggregateWindowExpr: WindowExpr {
 
     /// Calculates the window expression result for the given record batch.
     /// Assumes that `record_batch` belongs to a single partition.
-    #[allow(clippy::too_many_arguments)]
+    ///
+    /// # Arguments
+    /// * `accumulator`: The accumulator to use for the calculation.
+    /// * `record_batch`: batch belonging to the current partition (see [`PartitionBatchState`]).
+    /// * `most_recent_row`: the batch that contains the most recent row, if available (see [`PartitionBatchState`]).
+    /// * `last_range`: The last range of rows that were processed (see [`WindowAggState`]).
+    /// * `window_frame_ctx`: Details about the window frame (see [`WindowFrameContext`]).
+    /// * `idx`: The index of the current row in the record batch.
+    /// * `not_end`: is the current row not the end of the partition (see [`PartitionBatchState`]).
+    #[expect(clippy::too_many_arguments)]
     fn get_result_column(
         &self,
         accumulator: &mut Box<dyn Accumulator>,
@@ -272,8 +293,39 @@ pub trait AggregateWindowExpr: WindowExpr {
         not_end: bool,
     ) -> Result<ArrayRef> {
         let values = self.evaluate_args(record_batch)?;
-        let order_bys = get_orderby_values(self.order_by_columns(record_batch)?);
 
+        // Evaluate filter mask once per record batch if present
+        let filter_mask_arr: Option<ArrayRef> = match self.filter_expr() {
+            Some(expr) => {
+                let value = expr.evaluate(record_batch)?;
+                Some(value.into_array(record_batch.num_rows())?)
+            }
+            None => None,
+        };
+
+        // Borrow boolean view from the owned array
+        let filter_mask: Option<&BooleanArray> = match filter_mask_arr.as_deref() {
+            Some(arr) => Some(as_boolean_array(arr)?),
+            None => None,
+        };
+
+        if self.is_constant_in_partition() {
+            if not_end {
+                let field = self.field()?;
+                let out_type = field.data_type();
+                return Ok(new_empty_array(out_type));
+            }
+            let values = if let Some(mask) = filter_mask {
+                // Apply mask to all argument arrays before a single update
+                filter_arrays(&values, mask)?
+            } else {
+                values
+            };
+            accumulator.update_batch(&values)?;
+            let value = accumulator.evaluate()?;
+            return value.to_array_of_size(record_batch.num_rows());
+        }
+        let order_bys = get_orderby_values(self.order_by_columns(record_batch)?);
         let most_recent_row_order_bys = most_recent_row
             .map(|batch| self.order_by_columns(batch))
             .transpose()?
@@ -306,6 +358,7 @@ pub trait AggregateWindowExpr: WindowExpr {
                 &cur_range,
                 &values,
                 accumulator,
+                filter_mask,
             )?;
             // Update last range
             *last_range = cur_range;
@@ -323,6 +376,21 @@ pub trait AggregateWindowExpr: WindowExpr {
     }
 }
 
+/// Filters a single array with the provided boolean mask.
+pub(crate) fn filter_array(array: &ArrayRef, mask: &BooleanArray) -> Result<ArrayRef> {
+    arrow_filter(array.as_ref(), mask)
+        .map(|a| a as ArrayRef)
+        .map_err(|e| arrow_datafusion_err!(e))
+}
+
+/// Filters a list of arrays with the provided boolean mask.
+pub(crate) fn filter_arrays(
+    arrays: &[ArrayRef],
+    mask: &BooleanArray,
+) -> Result<Vec<ArrayRef>> {
+    arrays.iter().map(|arr| filter_array(arr, mask)).collect()
+}
+
 /// Determines whether the end bound calculation for a window frame context is
 /// safe, meaning that the end bound stays the same, regardless of future data,
 /// based on the current sort expressions and ORDER BY columns. This function
@@ -344,13 +412,13 @@ pub(crate) fn is_end_bound_safe(
     window_frame_ctx: &WindowFrameContext,
     order_bys: &[ArrayRef],
     most_recent_order_bys: Option<&[ArrayRef]>,
-    sort_exprs: &LexOrdering,
+    sort_exprs: &[PhysicalSortExpr],
     idx: usize,
 ) -> Result<bool> {
     if sort_exprs.is_empty() {
         // Early return if no sort expressions are present:
         return Ok(false);
-    }
+    };
 
     match window_frame_ctx {
         WindowFrameContext::Rows(window_frame) => {
diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml
index aaadb09bcc98a..38c8a7c37211f 100644
--- a/datafusion/physical-optimizer/Cargo.toml
+++ b/datafusion/physical-optimizer/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -39,18 +42,20 @@ recursive_protection = ["dep:recursive"]
 
 [dependencies]
 arrow = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
+datafusion-common = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true, default-features = true }
 datafusion-physical-expr = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-plan = { workspace = true }
+datafusion-pruning = { workspace = true }
 itertools = { workspace = true }
-log = { workspace = true }
 recursive = { workspace = true, optional = true }
 
 [dev-dependencies]
 datafusion-expr = { workspace = true }
-datafusion-functions-nested = { workspace = true }
+datafusion-functions = { workspace = true }
+datafusion-functions-window = { workspace = true }
 insta = { workspace = true }
+tokio = { workspace = true }
diff --git a/datafusion/physical-optimizer/README.md b/datafusion/physical-optimizer/README.md
index eb361d3f67792..3efbc19d2e724 100644
--- a/datafusion/physical-optimizer/README.md
+++ b/datafusion/physical-optimizer/README.md
@@ -17,9 +17,16 @@
   under the License.
 -->
 
-# DataFusion Physical Optimizer
+# Apache DataFusion Physical Optimizer
 
-DataFusion is an extensible query execution framework, written in Rust,
-that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate contains the physical optimizer for DataFusion.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs
index 6c44c8fe86c5e..5caee8b047d83 100644
--- a/datafusion/physical-optimizer/src/aggregate_statistics.rs
+++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs
@@ -16,15 +16,15 @@
 // under the License.
 
 //! Utilizing exact statistics from sources to avoid scanning data
+use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::scalar::ScalarValue;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::Result;
-use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::aggregates::{AggregateExec, AggregateInputMode};
 use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
-use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use datafusion_physical_plan::udaf::{AggregateFunctionExpr, StatisticsArgs};
-use datafusion_physical_plan::{expressions, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, expressions};
 use std::sync::Arc;
 
 use crate::PhysicalOptimizerRule;
@@ -34,7 +34,7 @@ use crate::PhysicalOptimizerRule;
 pub struct AggregateStatistics {}
 
 impl AggregateStatistics {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -42,6 +42,7 @@ impl AggregateStatistics {
 
 impl PhysicalOptimizerRule for AggregateStatistics {
     #[cfg_attr(feature = "recursive_protection", recursive::recursive)]
+    #[expect(clippy::allow_attributes)] // See https://github.com/apache/datafusion/issues/18881#issuecomment-3621545670
     #[allow(clippy::only_used_in_recursion)] // See https://github.com/rust-lang/rust-clippy/issues/14566
     fn optimize(
         &self,
@@ -67,8 +68,10 @@ impl PhysicalOptimizerRule for AggregateStatistics {
                 if let Some((optimizable_statistic, name)) =
                     take_optimizable_value_from_statistics(&statistics_args, expr)
                 {
-                    projections
-                        .push((expressions::lit(optimizable_statistic), name.to_owned()));
+                    projections.push(ProjectionExpr {
+                        expr: expressions::lit(optimizable_statistic),
+                        alias: name.to_owned(),
+                    });
                 } else {
                     // TODO: we need all aggr_expr to be resolved (cf TODO fullres)
                     break;
@@ -112,27 +115,23 @@ impl PhysicalOptimizerRule for AggregateStatistics {
 /// We would have preferred to return a casted ref to AggregateExec but the recursion requires
 /// the `ExecutionPlan.children()` method that returns an owned reference.
 fn take_optimizable(node: &dyn ExecutionPlan) -> Option<Arc<dyn ExecutionPlan>> {
-    if let Some(final_agg_exec) = node.as_any().downcast_ref::<AggregateExec>() {
-        if !final_agg_exec.mode().is_first_stage()
-            && final_agg_exec.group_expr().is_empty()
-        {
-            let mut child = Arc::clone(final_agg_exec.input());
-            loop {
-                if let Some(partial_agg_exec) =
-                    child.as_any().downcast_ref::<AggregateExec>()
-                {
-                    if partial_agg_exec.mode().is_first_stage()
-                        && partial_agg_exec.group_expr().is_empty()
-                        && partial_agg_exec.filter_expr().iter().all(|e| e.is_none())
-                    {
-                        return Some(child);
-                    }
-                }
-                if let [childrens_child] = child.children().as_slice() {
-                    child = Arc::clone(childrens_child);
-                } else {
-                    break;
-                }
+    if let Some(final_agg_exec) = node.as_any().downcast_ref::<AggregateExec>()
+        && final_agg_exec.mode().input_mode() == AggregateInputMode::Partial
+        && final_agg_exec.group_expr().is_empty()
+    {
+        let mut child = Arc::clone(final_agg_exec.input());
+        loop {
+            if let Some(partial_agg_exec) = child.as_any().downcast_ref::<AggregateExec>()
+                && partial_agg_exec.mode().input_mode() == AggregateInputMode::Raw
+                && partial_agg_exec.group_expr().is_empty()
+                && partial_agg_exec.filter_expr().iter().all(|e| e.is_none())
+            {
+                return Some(child);
+            }
+            if let [childrens_child] = child.children().as_slice() {
+                child = Arc::clone(childrens_child);
+            } else {
+                break;
             }
         }
     }
diff --git a/datafusion/physical-optimizer/src/coalesce_batches.rs b/datafusion/physical-optimizer/src/coalesce_batches.rs
deleted file mode 100644
index 5cf2c877c61a4..0000000000000
--- a/datafusion/physical-optimizer/src/coalesce_batches.rs
+++ /dev/null
@@ -1,94 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! CoalesceBatches optimizer that groups batches together rows
-//! in bigger batches to avoid overhead with small batches
-
-use crate::PhysicalOptimizerRule;
-
-use std::sync::Arc;
-
-use datafusion_common::config::ConfigOptions;
-use datafusion_common::error::Result;
-use datafusion_physical_expr::Partitioning;
-use datafusion_physical_plan::{
-    coalesce_batches::CoalesceBatchesExec, filter::FilterExec, joins::HashJoinExec,
-    repartition::RepartitionExec, ExecutionPlan,
-};
-
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-
-/// Optimizer rule that introduces CoalesceBatchesExec to avoid overhead with small batches that
-/// are produced by highly selective filters
-#[derive(Default, Debug)]
-pub struct CoalesceBatches {}
-
-impl CoalesceBatches {
-    #[allow(missing_docs)]
-    pub fn new() -> Self {
-        Self::default()
-    }
-}
-impl PhysicalOptimizerRule for CoalesceBatches {
-    fn optimize(
-        &self,
-        plan: Arc<dyn ExecutionPlan>,
-        config: &ConfigOptions,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        if !config.execution.coalesce_batches {
-            return Ok(plan);
-        }
-
-        let target_batch_size = config.execution.batch_size;
-        plan.transform_up(|plan| {
-            let plan_any = plan.as_any();
-            // The goal here is to detect operators that could produce small batches and only
-            // wrap those ones with a CoalesceBatchesExec operator. An alternate approach here
-            // would be to build the coalescing logic directly into the operators
-            // See https://github.com/apache/datafusion/issues/139
-            let wrap_in_coalesce = plan_any.downcast_ref::<FilterExec>().is_some()
-                || plan_any.downcast_ref::<HashJoinExec>().is_some()
-                // Don't need to add CoalesceBatchesExec after a round robin RepartitionExec
-                || plan_any
-                    .downcast_ref::<RepartitionExec>()
-                    .map(|repart_exec| {
-                        !matches!(
-                            repart_exec.partitioning().clone(),
-                            Partitioning::RoundRobinBatch(_)
-                        )
-                    })
-                    .unwrap_or(false);
-            if wrap_in_coalesce {
-                Ok(Transformed::yes(Arc::new(CoalesceBatchesExec::new(
-                    plan,
-                    target_batch_size,
-                ))))
-            } else {
-                Ok(Transformed::no(plan))
-            }
-        })
-        .data()
-    }
-
-    fn name(&self) -> &str {
-        "coalesce_batches"
-    }
-
-    fn schema_check(&self) -> bool {
-        true
-    }
-}
diff --git a/datafusion/physical-optimizer/src/combine_partial_final_agg.rs b/datafusion/physical-optimizer/src/combine_partial_final_agg.rs
index 86f7e73e9e359..860406118c1b7 100644
--- a/datafusion/physical-optimizer/src/combine_partial_final_agg.rs
+++ b/datafusion/physical-optimizer/src/combine_partial_final_agg.rs
@@ -21,27 +21,26 @@
 use std::sync::Arc;
 
 use datafusion_common::error::Result;
+use datafusion_physical_plan::ExecutionPlan;
 use datafusion_physical_plan::aggregates::{
     AggregateExec, AggregateMode, PhysicalGroupBy,
 };
-use datafusion_physical_plan::ExecutionPlan;
 
 use crate::PhysicalOptimizerRule;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
-use datafusion_physical_expr::{physical_exprs_equal, PhysicalExpr};
+use datafusion_physical_expr::{PhysicalExpr, physical_exprs_equal};
 
 /// CombinePartialFinalAggregate optimizer rule combines the adjacent Partial and Final AggregateExecs
 /// into a Single AggregateExec if their grouping exprs and aggregate exprs equal.
 ///
 /// This rule should be applied after the EnforceDistribution and EnforceSorting rules
-///
 #[derive(Default, Debug)]
 pub struct CombinePartialFinalAggregate {}
 
 impl CombinePartialFinalAggregate {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -73,7 +72,7 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate {
                 return Ok(Transformed::no(plan));
             };
 
-            let transformed = if matches!(input_agg_exec.mode(), AggregateMode::Partial)
+            let transformed = if *input_agg_exec.mode() == AggregateMode::Partial
                 && can_combine(
                     (
                         agg_exec.group_expr(),
@@ -99,7 +98,9 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate {
                     Arc::clone(input_agg_exec.input()),
                     input_agg_exec.input_schema(),
                 )
-                .map(|combined_agg| combined_agg.with_limit(agg_exec.limit()))
+                .map(|combined_agg| {
+                    combined_agg.with_limit_options(agg_exec.limit_options())
+                })
                 .ok()
                 .map(Arc::new)
             } else {
diff --git a/datafusion/physical-optimizer/src/enforce_distribution.rs b/datafusion/physical-optimizer/src/enforce_distribution.rs
index 700b00c19dd57..d23a699f715de 100644
--- a/datafusion/physical-optimizer/src/enforce_distribution.rs
+++ b/datafusion/physical-optimizer/src/enforce_distribution.rs
@@ -36,12 +36,13 @@ use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
 use datafusion_common::stats::Precision;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_expr::logical_plan::JoinType;
+use datafusion_expr::logical_plan::{Aggregate, JoinType};
 use datafusion_physical_expr::expressions::{Column, NoOp};
 use datafusion_physical_expr::utils::map_columns_before_projection;
 use datafusion_physical_expr::{
-    physical_exprs_equal, EquivalenceProperties, PhysicalExpr, PhysicalExprRef,
+    EquivalenceProperties, PhysicalExpr, PhysicalExprRef, physical_exprs_equal,
 };
+use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::aggregates::{
     AggregateExec, AggregateMode, PhysicalGroupBy,
 };
@@ -50,14 +51,13 @@ use datafusion_physical_plan::execution_plan::EmissionType;
 use datafusion_physical_plan::joins::{
     CrossJoinExec, HashJoinExec, PartitionMode, SortMergeJoinExec,
 };
-use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use datafusion_physical_plan::repartition::RepartitionExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::tree_node::PlanContext;
-use datafusion_physical_plan::union::{can_interleave, InterleaveExec, UnionExec};
+use datafusion_physical_plan::union::{InterleaveExec, UnionExec, can_interleave};
 use datafusion_physical_plan::windows::WindowAggExec;
-use datafusion_physical_plan::windows::{get_best_fitting_window, BoundedWindowAggExec};
-use datafusion_physical_plan::ExecutionPlanProperties;
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, get_best_fitting_window};
 use datafusion_physical_plan::{Distribution, ExecutionPlan, Partitioning};
 
 use itertools::izip;
@@ -183,7 +183,7 @@ use itertools::izip;
 pub struct EnforceDistribution {}
 
 impl EnforceDistribution {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -281,23 +281,20 @@ pub type PlanWithKeyRequirements = PlanContext<Vec<Arc<dyn PhysicalExpr>>>;
 /// 3) If the current plan is RepartitionExec, CoalescePartitionsExec or WindowAggExec, clear all the requirements, return the unchanged plan
 /// 4) If the current plan is Projection, transform the requirements to the columns before the Projection and push down requirements
 /// 5) For other types of operators, by default, pushdown the parent requirements to children.
-///
 pub fn adjust_input_keys_ordering(
     mut requirements: PlanWithKeyRequirements,
 ) -> Result<Transformed<PlanWithKeyRequirements>> {
     let plan = Arc::clone(&requirements.plan);
 
-    if let Some(HashJoinExec {
-        left,
-        right,
-        on,
-        filter,
-        join_type,
-        projection,
-        mode,
-        null_equals_null,
-        ..
-    }) = plan.as_any().downcast_ref::<HashJoinExec>()
+    if let Some(
+        exec @ HashJoinExec {
+            left,
+            on,
+            join_type,
+            mode,
+            ..
+        },
+    ) = plan.as_any().downcast_ref::<HashJoinExec>()
     {
         match mode {
             PartitionMode::Partitioned => {
@@ -305,18 +302,10 @@ pub fn adjust_input_keys_ordering(
                     Vec<(PhysicalExprRef, PhysicalExprRef)>,
                     Vec<SortOptions>,
                 )| {
-                    HashJoinExec::try_new(
-                        Arc::clone(left),
-                        Arc::clone(right),
-                        new_conditions.0,
-                        filter.clone(),
-                        join_type,
-                        // TODO: although projection is not used in the join here, because projection pushdown is after enforce_distribution. Maybe we need to handle it later. Same as filter.
-                        projection.clone(),
-                        PartitionMode::Partitioned,
-                        *null_equals_null,
-                    )
-                    .map(|e| Arc::new(e) as _)
+                    exec.builder()
+                        .with_partition_mode(PartitionMode::Partitioned)
+                        .with_on(new_conditions.0)
+                        .build_exec()
                 };
                 return reorder_partitioned_join_keys(
                     requirements,
@@ -334,7 +323,7 @@ pub fn adjust_input_keys_ordering(
                         left.schema().fields().len(),
                     )
                     .unwrap_or_default(),
-                    JoinType::RightSemi | JoinType::RightAnti => {
+                    JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
                         requirements.data.clone()
                     }
                     JoinType::Left
@@ -364,7 +353,7 @@ pub fn adjust_input_keys_ordering(
         filter,
         join_type,
         sort_options,
-        null_equals_null,
+        null_equality,
         ..
     }) = plan.as_any().downcast_ref::<SortMergeJoinExec>()
     {
@@ -379,7 +368,7 @@ pub fn adjust_input_keys_ordering(
                 filter.clone(),
                 *join_type,
                 new_conditions.1,
-                *null_equals_null,
+                *null_equality,
             )
             .map(|e| Arc::new(e) as _)
         };
@@ -407,7 +396,11 @@ pub fn adjust_input_keys_ordering(
         // For Projection, we need to transform the requirements to the columns before the Projection
         // And then to push down the requirements
         // Construct a mapping from new name to the original Column
-        let new_required = map_columns_before_projection(&requirements.data, expr);
+        let proj_exprs: Vec<_> = expr
+            .iter()
+            .map(|p| (Arc::clone(&p.expr), p.alias.clone()))
+            .collect();
+        let new_required = map_columns_before_projection(&requirements.data, &proj_exprs);
         if new_required.len() == requirements.data.len() {
             requirements.children[0].data = new_required;
         } else {
@@ -454,14 +447,14 @@ where
         positions,
     ) = try_reorder(join_key_pairs, parent_required, eq_properties);
 
-    if let Some(positions) = positions {
-        if !positions.is_empty() {
-            let new_join_on = new_join_conditions(&left_keys, &right_keys);
-            let new_sort_options = (0..sort_options.len())
-                .map(|idx| sort_options[positions[idx]])
-                .collect();
-            join_plan.plan = join_constructor((new_join_on, new_sort_options))?;
-        }
+    if let Some(positions) = positions
+        && !positions.is_empty()
+    {
+        let new_join_on = new_join_conditions(&left_keys, &right_keys);
+        let new_sort_options = (0..sort_options.len())
+            .map(|idx| sort_options[positions[idx]])
+            .collect();
+        join_plan.plan = join_constructor((new_join_on, new_sort_options))?;
     }
 
     join_plan.children[0].data = left_keys;
@@ -490,77 +483,75 @@ pub fn reorder_aggregate_keys(
     if parent_required.len() == output_exprs.len()
         && agg_exec.group_expr().null_expr().is_empty()
         && !physical_exprs_equal(&output_exprs, parent_required)
+        && let Some(positions) = expected_expr_positions(&output_exprs, parent_required)
+        && let Some(agg_exec) = agg_exec.input().as_any().downcast_ref::<AggregateExec>()
+        && *agg_exec.mode() == AggregateMode::Partial
     {
-        if let Some(positions) = expected_expr_positions(&output_exprs, parent_required) {
-            if let Some(agg_exec) =
-                agg_exec.input().as_any().downcast_ref::<AggregateExec>()
-            {
-                if matches!(agg_exec.mode(), &AggregateMode::Partial) {
-                    let group_exprs = agg_exec.group_expr().expr();
-                    let new_group_exprs = positions
-                        .into_iter()
-                        .map(|idx| group_exprs[idx].clone())
-                        .collect();
-                    let partial_agg = Arc::new(AggregateExec::try_new(
-                        AggregateMode::Partial,
-                        PhysicalGroupBy::new_single(new_group_exprs),
-                        agg_exec.aggr_expr().to_vec(),
-                        agg_exec.filter_expr().to_vec(),
-                        Arc::clone(agg_exec.input()),
-                        Arc::clone(&agg_exec.input_schema),
-                    )?);
-                    // Build new group expressions that correspond to the output
-                    // of the "reordered" aggregator:
-                    let group_exprs = partial_agg.group_expr().expr();
-                    let new_group_by = PhysicalGroupBy::new_single(
-                        partial_agg
-                            .output_group_expr()
-                            .into_iter()
-                            .enumerate()
-                            .map(|(idx, expr)| (expr, group_exprs[idx].1.clone()))
-                            .collect(),
-                    );
-                    let new_final_agg = Arc::new(AggregateExec::try_new(
-                        AggregateMode::FinalPartitioned,
-                        new_group_by,
-                        agg_exec.aggr_expr().to_vec(),
-                        agg_exec.filter_expr().to_vec(),
-                        Arc::clone(&partial_agg) as _,
-                        agg_exec.input_schema(),
-                    )?);
-
-                    agg_node.plan = Arc::clone(&new_final_agg) as _;
-                    agg_node.data.clear();
-                    agg_node.children = vec![PlanWithKeyRequirements::new(
-                        partial_agg as _,
-                        vec![],
-                        agg_node.children.swap_remove(0).children,
-                    )];
-
-                    // Need to create a new projection to change the expr ordering back
-                    let agg_schema = new_final_agg.schema();
-                    let mut proj_exprs = output_columns
-                        .iter()
-                        .map(|col| {
-                            let name = col.name();
-                            let index = agg_schema.index_of(name)?;
-                            Ok((Arc::new(Column::new(name, index)) as _, name.to_owned()))
-                        })
-                        .collect::<Result<Vec<_>>>()?;
-                    let agg_fields = agg_schema.fields();
-                    for (idx, field) in
-                        agg_fields.iter().enumerate().skip(output_columns.len())
-                    {
-                        let name = field.name();
-                        let plan = Arc::new(Column::new(name, idx)) as _;
-                        proj_exprs.push((plan, name.clone()))
-                    }
-                    return ProjectionExec::try_new(proj_exprs, new_final_agg).map(|p| {
-                        PlanWithKeyRequirements::new(Arc::new(p), vec![], vec![agg_node])
-                    });
-                }
-            }
+        let group_exprs = agg_exec.group_expr().expr();
+        let new_group_exprs = positions
+            .into_iter()
+            .map(|idx| group_exprs[idx].clone())
+            .collect();
+        let partial_agg = Arc::new(AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(new_group_exprs),
+            agg_exec.aggr_expr().to_vec(),
+            agg_exec.filter_expr().to_vec(),
+            Arc::clone(agg_exec.input()),
+            Arc::clone(&agg_exec.input_schema),
+        )?);
+        // Build new group expressions that correspond to the output
+        // of the "reordered" aggregator:
+        let group_exprs = partial_agg.group_expr().expr();
+        let new_group_by = PhysicalGroupBy::new_single(
+            partial_agg
+                .output_group_expr()
+                .into_iter()
+                .enumerate()
+                .map(|(idx, expr)| (expr, group_exprs[idx].1.clone()))
+                .collect(),
+        );
+        let new_final_agg = Arc::new(AggregateExec::try_new(
+            AggregateMode::FinalPartitioned,
+            new_group_by,
+            agg_exec.aggr_expr().to_vec(),
+            agg_exec.filter_expr().to_vec(),
+            Arc::clone(&partial_agg) as _,
+            agg_exec.input_schema(),
+        )?);
+
+        agg_node.plan = Arc::clone(&new_final_agg) as _;
+        agg_node.data.clear();
+        agg_node.children = vec![PlanWithKeyRequirements::new(
+            partial_agg as _,
+            vec![],
+            agg_node.children.swap_remove(0).children,
+        )];
+
+        // Need to create a new projection to change the expr ordering back
+        let agg_schema = new_final_agg.schema();
+        let mut proj_exprs = output_columns
+            .iter()
+            .map(|col| {
+                let name = col.name();
+                let index = agg_schema.index_of(name)?;
+                Ok(ProjectionExpr {
+                    expr: Arc::new(Column::new(name, index)) as _,
+                    alias: name.to_owned(),
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let agg_fields = agg_schema.fields();
+        for (idx, field) in agg_fields.iter().enumerate().skip(output_columns.len()) {
+            let name = field.name();
+            let plan = Arc::new(Column::new(name, idx)) as _;
+            proj_exprs.push(ProjectionExpr {
+                expr: plan,
+                alias: name.clone(),
+            })
         }
+        return ProjectionExec::try_new(proj_exprs, new_final_agg)
+            .map(|p| PlanWithKeyRequirements::new(Arc::new(p), vec![], vec![agg_node]));
     }
     Ok(agg_node)
 }
@@ -608,19 +599,17 @@ pub fn reorder_join_keys_to_inputs(
     plan: Arc<dyn ExecutionPlan>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let plan_any = plan.as_any();
-    if let Some(HashJoinExec {
-        left,
-        right,
-        on,
-        filter,
-        join_type,
-        projection,
-        mode,
-        null_equals_null,
-        ..
-    }) = plan_any.downcast_ref::<HashJoinExec>()
+    if let Some(
+        exec @ HashJoinExec {
+            left,
+            right,
+            on,
+            mode,
+            ..
+        },
+    ) = plan_any.downcast_ref::<HashJoinExec>()
     {
-        if matches!(mode, PartitionMode::Partitioned) {
+        if *mode == PartitionMode::Partitioned {
             let (join_keys, positions) = reorder_current_join_keys(
                 extract_join_keys(on),
                 Some(left.output_partitioning()),
@@ -634,16 +623,11 @@ pub fn reorder_join_keys_to_inputs(
                     right_keys,
                 } = join_keys;
                 let new_join_on = new_join_conditions(&left_keys, &right_keys);
-                return Ok(Arc::new(HashJoinExec::try_new(
-                    Arc::clone(left),
-                    Arc::clone(right),
-                    new_join_on,
-                    filter.clone(),
-                    join_type,
-                    projection.clone(),
-                    PartitionMode::Partitioned,
-                    *null_equals_null,
-                )?));
+                return exec
+                    .builder()
+                    .with_partition_mode(PartitionMode::Partitioned)
+                    .with_on(new_join_on)
+                    .build_exec();
             }
         }
     } else if let Some(SortMergeJoinExec {
@@ -653,7 +637,7 @@ pub fn reorder_join_keys_to_inputs(
         filter,
         join_type,
         sort_options,
-        null_equals_null,
+        null_equality,
         ..
     }) = plan_any.downcast_ref::<SortMergeJoinExec>()
     {
@@ -664,27 +648,27 @@ pub fn reorder_join_keys_to_inputs(
             left.equivalence_properties(),
             right.equivalence_properties(),
         );
-        if let Some(positions) = positions {
-            if !positions.is_empty() {
-                let JoinKeyPairs {
-                    left_keys,
-                    right_keys,
-                } = join_keys;
-                let new_join_on = new_join_conditions(&left_keys, &right_keys);
-                let new_sort_options = (0..sort_options.len())
-                    .map(|idx| sort_options[positions[idx]])
-                    .collect();
-                return SortMergeJoinExec::try_new(
-                    Arc::clone(left),
-                    Arc::clone(right),
-                    new_join_on,
-                    filter.clone(),
-                    *join_type,
-                    new_sort_options,
-                    *null_equals_null,
-                )
-                .map(|smj| Arc::new(smj) as _);
-            }
+        if let Some(positions) = positions
+            && !positions.is_empty()
+        {
+            let JoinKeyPairs {
+                left_keys,
+                right_keys,
+            } = join_keys;
+            let new_join_on = new_join_conditions(&left_keys, &right_keys);
+            let new_sort_options = (0..sort_options.len())
+                .map(|idx| sort_options[positions[idx]])
+                .collect();
+            return SortMergeJoinExec::try_new(
+                Arc::clone(left),
+                Arc::clone(right),
+                new_join_on,
+                filter.clone(),
+                *join_type,
+                new_sort_options,
+                *null_equality,
+            )
+            .map(|smj| Arc::new(smj) as _);
         }
     }
     Ok(plan)
@@ -880,6 +864,8 @@ fn add_roundrobin_on_top(
 /// * `hash_exprs`: Stores Physical Exprs that are used during hashing.
 /// * `n_target`: desired target partition number, if partition number of the
 ///   current executor is less than this value. Partition number will be increased.
+/// * `allow_subset_satisfy_partitioning`: Whether to allow subset partitioning logic in satisfaction checks.
+///   Set to `false` for partitioned hash joins to ensure exact hash matching.
 ///
 /// # Returns
 ///
@@ -889,6 +875,7 @@ fn add_hash_on_top(
     input: DistributionContext,
     hash_exprs: Vec<Arc<dyn PhysicalExpr>>,
     n_target: usize,
+    allow_subset_satisfy_partitioning: bool,
 ) -> Result<DistributionContext> {
     // Early return if hash repartition is unnecessary
     // `RepartitionExec: partitioning=Hash([...], 1), input_partitions=1` is unnecessary.
@@ -897,15 +884,23 @@ fn add_hash_on_top(
     }
 
     let dist = Distribution::HashPartitioned(hash_exprs);
-    let satisfied = input
-        .plan
-        .output_partitioning()
-        .satisfy(&dist, input.plan.equivalence_properties());
+    let satisfaction = input.plan.output_partitioning().satisfaction(
+        &dist,
+        input.plan.equivalence_properties(),
+        allow_subset_satisfy_partitioning,
+    );
 
     // Add hash repartitioning when:
-    // - The hash distribution requirement is not satisfied, or
-    // - We can increase parallelism by adding hash partitioning.
-    if !satisfied || n_target > input.plan.output_partitioning().partition_count() {
+    // - When subset satisfaction is enabled (current >= threshold): only repartition if not satisfied
+    // - When below threshold (current < threshold): repartition if expressions don't match OR to increase parallelism
+    let needs_repartition = if allow_subset_satisfy_partitioning {
+        !satisfaction.is_satisfied()
+    } else {
+        !satisfaction.is_satisfied()
+            || n_target > input.plan.output_partitioning().partition_count()
+    };
+
+    if needs_repartition {
         // When there is an existing ordering, we preserve ordering during
         // repartition. This will be rolled back in the future if any of the
         // following conditions is true:
@@ -925,34 +920,34 @@ fn add_hash_on_top(
     Ok(input)
 }
 
-/// Adds a [`SortPreservingMergeExec`] operator on top of input executor
-/// to satisfy single distribution requirement.
+/// Adds a [`SortPreservingMergeExec`] or a [`CoalescePartitionsExec`] operator
+/// on top of the given plan node to satisfy a single partition requirement
+/// while preserving ordering constraints.
 ///
-/// # Arguments
+/// # Parameters
 ///
 /// * `input`: Current node.
 ///
 /// # Returns
 ///
-/// Updated node with an execution plan, where desired single
-/// distribution is satisfied by adding [`SortPreservingMergeExec`].
-fn add_spm_on_top(input: DistributionContext) -> DistributionContext {
-    // Add SortPreservingMerge only when partition count is larger than 1.
+/// Updated node with an execution plan, where the desired single distribution
+/// requirement is satisfied.
+fn add_merge_on_top(input: DistributionContext) -> DistributionContext {
+    // Apply only when the partition count is larger than one.
     if input.plan.output_partitioning().partition_count() > 1 {
         // When there is an existing ordering, we preserve ordering
         // when decreasing partitions. This will be un-done in the future
         // if any of the following conditions is true
         // - Preserving ordering is not helpful in terms of satisfying ordering requirements
         // - Usage of order preserving variants is not desirable
-        // (determined by flag `config.optimizer.bounded_order_preserving_variants`)
-        let should_preserve_ordering = input.plan.output_ordering().is_some();
-
-        let new_plan = if should_preserve_ordering {
+        // (determined by flag `config.optimizer.prefer_existing_sort`)
+        let new_plan = if let Some(req) = input.plan.output_ordering() {
             Arc::new(SortPreservingMergeExec::new(
-                input.plan.output_ordering().cloned().unwrap_or_default(),
+                req.clone(),
                 Arc::clone(&input.plan),
             )) as _
         } else {
+            // If there is no input order, we can simply coalesce partitions:
             Arc::new(CoalescePartitionsExec::new(Arc::clone(&input.plan))) as _
         };
 
@@ -1036,14 +1031,13 @@ pub fn replace_order_preserving_variants(
         return Ok(context);
     } else if let Some(repartition) =
         context.plan.as_any().downcast_ref::<RepartitionExec>()
+        && repartition.preserve_order()
     {
-        if repartition.preserve_order() {
-            context.plan = Arc::new(RepartitionExec::try_new(
-                Arc::clone(&context.children[0].plan),
-                repartition.partitioning().clone(),
-            )?);
-            return Ok(context);
-        }
+        context.plan = Arc::new(RepartitionExec::try_new(
+            Arc::clone(&context.children[0].plan),
+            repartition.partitioning().clone(),
+        )?);
+        return Ok(context);
     }
 
     context.update_plan_from_children()
@@ -1175,6 +1169,7 @@ pub fn ensure_distribution(
     let should_use_estimates = config
         .execution
         .use_row_number_estimates_to_optimize_partitioning;
+    let subset_satisfaction_threshold = config.optimizer.subset_repartition_threshold;
     let unbounded_and_pipeline_friendly = dist_context.plan.boundedness().is_unbounded()
         && matches!(
             dist_context.plan.pipeline_behavior(),
@@ -1202,16 +1197,51 @@ pub fn ensure_distribution(
         )? {
             plan = updated_window;
         }
-    } else if let Some(exec) = plan.as_any().downcast_ref::<BoundedWindowAggExec>() {
-        if let Some(updated_window) = get_best_fitting_window(
+    } else if let Some(exec) = plan.as_any().downcast_ref::<BoundedWindowAggExec>()
+        && let Some(updated_window) = get_best_fitting_window(
             exec.window_expr(),
             exec.input(),
             &exec.partition_keys(),
-        )? {
-            plan = updated_window;
-        }
+        )?
+    {
+        plan = updated_window;
     };
 
+    // For joins in partitioned mode, we need exact hash matching between
+    // both sides, so subset partitioning logic must be disabled.
+    //
+    // Why: Different hash expressions produce different hash values, causing
+    // rows with the same join key to land in different partitions. Since
+    // partitioned joins match partition N left with partition N right, rows
+    // that should match may be in different partitions and miss each other.
+    //
+    // Example JOIN ON left.a = right.a:
+    //
+    // Left: Hash([a])
+    //  Partition 1: a=1
+    //  Partition 2: a=2
+    //
+    // Right: Hash([a, b])
+    //  Partition 1: (a=1, b=1) -> Same a=1
+    //  Partition 2: (a=2, b=2)
+    //  Partition 3: (a=1, b=2) -> Same a=1
+    //
+    // Partitioned join execution:
+    //  P1 left (a=1) joins P1 right (a=1, b=1) -> Match
+    //  P2 left (a=2) joins P2 right (a=2, b=2) -> Match
+    //  P3 left (empty) joins P3 right (a=1, b=2) -> Missing, errors
+    //
+    // The row (a=1, b=2) should match left.a=1 but they're in different
+    // partitions, causing panics.
+    //
+    // CollectLeft/CollectRight modes are safe because one side is collected
+    // to a single partition which eliminates partition-to-partition mapping.
+    let is_partitioned_join = plan
+        .as_any()
+        .downcast_ref::<HashJoinExec>()
+        .is_some_and(|join| join.mode == PartitionMode::Partitioned)
+        || plan.as_any().is::<SortMergeJoinExec>();
+
     let repartition_status_flags =
         get_repartition_requirement_status(&plan, batch_size, should_use_estimates)?;
     // This loop iterates over all the children to:
@@ -1237,12 +1267,38 @@ pub fn ensure_distribution(
                 hash_necessary,
             },
         )| {
+            let increases_partition_count =
+                child.plan.output_partitioning().partition_count() < target_partitions;
+
             let add_roundrobin = enable_round_robin
                 // Operator benefits from partitioning (e.g. filter):
                 && roundrobin_beneficial
                 && roundrobin_beneficial_stats
                 // Unless partitioning increases the partition count, it is not beneficial:
-                && child.plan.output_partitioning().partition_count() < target_partitions;
+                && increases_partition_count;
+
+            // Allow subset satisfaction when:
+            // 1. Current partition count >= threshold
+            // 2. Not a partitioned join since must use exact hash matching for joins
+            // 3. Not a grouping set aggregate (requires exact hash including __grouping_id)
+            let current_partitions = child.plan.output_partitioning().partition_count();
+
+            // Check if the hash partitioning requirement includes __grouping_id column.
+            // Grouping set aggregates (ROLLUP, CUBE, GROUPING SETS) require exact hash
+            // partitioning on all group columns including __grouping_id to ensure partial
+            // aggregates from different partitions are correctly combined.
+            let requires_grouping_id = matches!(&requirement, Distribution::HashPartitioned(exprs)
+                if exprs.iter().any(|expr| {
+                    expr.as_any()
+                        .downcast_ref::<Column>()
+                        .is_some_and(|col| col.name() == Aggregate::INTERNAL_GROUPING_ID)
+                })
+            );
+
+            let allow_subset_satisfy_partitioning = current_partitions
+                >= subset_satisfaction_threshold
+                && !is_partitioned_join
+                && !requires_grouping_id;
 
             // When `repartition_file_scans` is set, attempt to increase
             // parallelism at the source.
@@ -1250,29 +1306,29 @@ pub fn ensure_distribution(
             // If repartitioning is not possible (a.k.a. None is returned from `ExecutionPlan::repartitioned`)
             // then no repartitioning will have occurred. As the default implementation returns None, it is only
             // specific physical plan nodes, such as certain datasources, which are repartitioned.
-            if repartition_file_scans && roundrobin_beneficial_stats {
-                if let Some(new_child) =
+            if repartition_file_scans
+                && roundrobin_beneficial_stats
+                && let Some(new_child) =
                     child.plan.repartitioned(target_partitions, config)?
-                {
-                    child.plan = new_child;
-                }
+            {
+                child.plan = new_child;
             }
 
             // Satisfy the distribution requirement if it is unmet.
             match &requirement {
                 Distribution::SinglePartition => {
-                    child = add_spm_on_top(child);
+                    child = add_merge_on_top(child);
                 }
                 Distribution::HashPartitioned(exprs) => {
-                    if add_roundrobin {
-                        // Add round-robin repartitioning on top of the operator
-                        // to increase parallelism.
-                        child = add_roundrobin_on_top(child, target_partitions)?;
-                    }
+                    // See https://github.com/apache/datafusion/issues/18341#issuecomment-3503238325 for background
                     // When inserting hash is necessary to satisfy hash requirement, insert hash repartition.
                     if hash_necessary {
-                        child =
-                            add_hash_on_top(child, exprs.to_vec(), target_partitions)?;
+                        child = add_hash_on_top(
+                            child,
+                            exprs.to_vec(),
+                            target_partitions,
+                            allow_subset_satisfy_partitioning,
+                        )?;
                     }
                 }
                 Distribution::UnspecifiedDistribution => {
@@ -1289,10 +1345,12 @@ pub fn ensure_distribution(
                 // Either:
                 // - Ordering requirement cannot be satisfied by preserving ordering through repartitions, or
                 // - using order preserving variant is not desirable.
+                let sort_req = required_input_ordering.into_single();
                 let ordering_satisfied = child
                     .plan
                     .equivalence_properties()
-                    .ordering_satisfy_requirement(&required_input_ordering);
+                    .ordering_satisfy_requirement(sort_req.clone())?;
+
                 if (!ordering_satisfied || !order_preserving_variants_desirable)
                     && child.data
                 {
@@ -1303,9 +1361,12 @@ pub fn ensure_distribution(
                         // Make sure to satisfy ordering requirement:
                         child = add_sort_above_with_check(
                             child,
-                            required_input_ordering.clone(),
-                            None,
-                        );
+                            sort_req,
+                            plan.as_any()
+                                .downcast_ref::<OutputRequirementExec>()
+                                .map(|output| output.fetch())
+                                .unwrap_or(None),
+                        )?;
                     }
                 }
                 // Stop tracking distribution changing operators
diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs
index 37fec2eab3f91..247ebb2785dd3 100644
--- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs
+++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs
@@ -40,22 +40,23 @@ pub mod sort_pushdown;
 
 use std::sync::Arc;
 
+use crate::PhysicalOptimizerRule;
 use crate::enforce_sorting::replace_with_order_preserving_variants::{
-    replace_with_order_preserving_variants, OrderPreservationContext,
+    OrderPreservationContext, replace_with_order_preserving_variants,
 };
 use crate::enforce_sorting::sort_pushdown::{
-    assign_initial_requirements, pushdown_sorts, SortPushDown,
+    SortPushDown, assign_initial_requirements, pushdown_sorts,
 };
+use crate::output_requirements::OutputRequirementExec;
 use crate::utils::{
     add_sort_above, add_sort_above_with_check, is_coalesce_partitions, is_limit,
-    is_repartition, is_sort, is_sort_preserving_merge, is_union, is_window,
+    is_repartition, is_sort, is_sort_preserving_merge, is_window,
 };
-use crate::PhysicalOptimizerRule;
 
+use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::plan_err;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::Result;
 use datafusion_physical_expr::{Distribution, Partitioning};
 use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
@@ -66,7 +67,7 @@ use datafusion_physical_plan::sorts::sort::SortExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::tree_node::PlanContext;
 use datafusion_physical_plan::windows::{
-    get_best_fitting_window, BoundedWindowAggExec, WindowAggExec,
+    BoundedWindowAggExec, WindowAggExec, get_best_fitting_window,
 };
 use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties, InputOrderMode};
 
@@ -78,7 +79,7 @@ use itertools::izip;
 pub struct EnforceSorting {}
 
 impl EnforceSorting {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -90,7 +91,7 @@ impl EnforceSorting {
 /// via its children.
 pub type PlanWithCorrespondingSort = PlanContext<bool>;
 
-/// For a given node, update the [`PlanContext.data`] attribute.
+/// For a given node, update the `PlanContext.data` attribute.
 ///
 /// If the node is a `SortExec`, or any of the node's children are a `SortExec`,
 /// then set the attribute to true.
@@ -191,14 +192,20 @@ fn update_coalesce_ctx_children(
 }
 
 /// Performs optimizations based upon a series of subrules.
-///
 /// Refer to each subrule for detailed descriptions of the optimizations performed:
-/// [`ensure_sorting`], [`parallelize_sorts`], [`replace_with_order_preserving_variants()`],
-/// and [`pushdown_sorts`].
-///
 /// Subrule application is ordering dependent.
 ///
-/// The subrule `parallelize_sorts` is only applied if `repartition_sorts` is enabled.
+/// Optimizer consists of 5 main parts which work sequentially
+/// 1. [`ensure_sorting`] Works down-to-top to be able to remove unnecessary [`SortExec`]s, [`SortPreservingMergeExec`]s
+///    add [`SortExec`]s if necessary by a requirement and adjusts window operators.
+/// 2. [`parallelize_sorts`] (Optional, depends on the `repartition_sorts` configuration)
+///    Responsible to identify and remove unnecessary partition unifier operators
+///    such as [`SortPreservingMergeExec`], [`CoalescePartitionsExec`] follows [`SortExec`]s does possible simplifications.
+/// 3. [`replace_with_order_preserving_variants()`] Replaces with alternative operators, for example can merge
+///    a [`SortExec`] and a [`CoalescePartitionsExec`] into one [`SortPreservingMergeExec`]
+///    or a [`SortExec`] + [`RepartitionExec`] combination into an order preserving [`RepartitionExec`]
+/// 4. [`sort_pushdown`] Works top-down. Responsible to push down sort operators as deep as possible in the plan.
+/// 5. `replace_with_partial_sort` Checks if it's possible to replace [`SortExec`]s with [`PartialSortExec`] operators
 impl PhysicalOptimizerRule for EnforceSorting {
     fn optimize(
         &self,
@@ -251,87 +258,93 @@ impl PhysicalOptimizerRule for EnforceSorting {
     }
 }
 
+/// Only interested with [`SortExec`]s and their unbounded children.
+/// If the plan is not a [`SortExec`] or its child is not unbounded, returns the original plan.
+/// Otherwise, by checking the requirement satisfaction searches for a replacement chance.
+/// If there's one replaces the [`SortExec`] plan with a [`PartialSortExec`]
 fn replace_with_partial_sort(
     plan: Arc<dyn ExecutionPlan>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let plan_any = plan.as_any();
-    if let Some(sort_plan) = plan_any.downcast_ref::<SortExec>() {
-        let child = Arc::clone(sort_plan.children()[0]);
-        if !child.boundedness().is_unbounded() {
-            return Ok(plan);
-        }
+    let Some(sort_plan) = plan_any.downcast_ref::<SortExec>() else {
+        return Ok(plan);
+    };
+
+    // It's safe to get first child of the SortExec
+    let child = Arc::clone(sort_plan.children()[0]);
+    if !child.boundedness().is_unbounded() {
+        return Ok(plan);
+    }
 
-        // here we're trying to find the common prefix for sorted columns that is required for the
-        // sort and already satisfied by the given ordering
-        let child_eq_properties = child.equivalence_properties();
-        let sort_req = LexRequirement::from(sort_plan.expr().clone());
+    // Here we're trying to find the common prefix for sorted columns that is required for the
+    // sort and already satisfied by the given ordering
+    let child_eq_properties = child.equivalence_properties();
+    let sort_exprs = sort_plan.expr().clone();
 
-        let mut common_prefix_length = 0;
-        while child_eq_properties.ordering_satisfy_requirement(&LexRequirement {
-            inner: sort_req[0..common_prefix_length + 1].to_vec(),
-        }) {
-            common_prefix_length += 1;
-        }
-        if common_prefix_length > 0 {
-            return Ok(Arc::new(
-                PartialSortExec::new(
-                    LexOrdering::new(sort_plan.expr().to_vec()),
-                    Arc::clone(sort_plan.input()),
-                    common_prefix_length,
-                )
-                .with_preserve_partitioning(sort_plan.preserve_partitioning())
-                .with_fetch(sort_plan.fetch()),
-            ));
-        }
+    let mut common_prefix_length = 0;
+    while child_eq_properties
+        .ordering_satisfy(sort_exprs[0..common_prefix_length + 1].to_vec())?
+    {
+        common_prefix_length += 1;
+    }
+    if common_prefix_length > 0 {
+        return Ok(Arc::new(
+            PartialSortExec::new(
+                sort_exprs,
+                Arc::clone(sort_plan.input()),
+                common_prefix_length,
+            )
+            .with_preserve_partitioning(sort_plan.preserve_partitioning())
+            .with_fetch(sort_plan.fetch()),
+        ));
     }
     Ok(plan)
 }
 
-/// Transform [`CoalescePartitionsExec`] + [`SortExec`] into
-/// [`SortExec`] + [`SortPreservingMergeExec`] as illustrated below:
+/// Transform [`CoalescePartitionsExec`] + [`SortExec`] cascades into [`SortExec`]
+/// + [`SortPreservingMergeExec`] cascades, as illustrated below.
 ///
-/// The [`CoalescePartitionsExec`] + [`SortExec`] cascades
-/// combine the partitions first, and then sort:
+/// A [`CoalescePartitionsExec`] + [`SortExec`] cascade combines partitions
+/// first, and then sorts:
 /// ```text
-///   ┌ ─ ─ ─ ─ ─ ┐                                                                                   
-///    ┌─┬─┬─┐                                                                                        
-///   ││B│A│D│... ├──┐                                                                                
-///    └─┴─┴─┘       │                                                                                
+///   ┌ ─ ─ ─ ─ ─ ┐
+///    ┌─┬─┬─┐
+///   ││B│A│D│... ├──┐
+///    └─┴─┴─┘       │
 ///   └ ─ ─ ─ ─ ─ ┘  │  ┌────────────────────────┐   ┌ ─ ─ ─ ─ ─ ─ ┐   ┌────────┐    ┌ ─ ─ ─ ─ ─ ─ ─ ┐
-///    Partition 1   │  │        Coalesce        │    ┌─┬─┬─┬─┬─┐      │        │     ┌─┬─┬─┬─┬─┐     
+///    Partition 1   │  │        Coalesce        │    ┌─┬─┬─┬─┬─┐      │        │     ┌─┬─┬─┬─┬─┐
 ///                  ├──▶(no ordering guarantees)│──▶││B│E│A│D│C│...───▶  Sort  ├───▶││A│B│C│D│E│... │
-///                  │  │                        │    └─┴─┴─┴─┴─┘      │        │     └─┴─┴─┴─┴─┘     
+///                  │  │                        │    └─┴─┴─┴─┴─┘      │        │     └─┴─┴─┴─┴─┘
 ///   ┌ ─ ─ ─ ─ ─ ┐  │  └────────────────────────┘   └ ─ ─ ─ ─ ─ ─ ┘   └────────┘    └ ─ ─ ─ ─ ─ ─ ─ ┘
-///    ┌─┬─┐         │                                 Partition                       Partition      
-///   ││E│C│ ...  ├──┘                                                                                
-///    └─┴─┘                                                                                          
-///   └ ─ ─ ─ ─ ─ ┘                                                                                   
-///    Partition 2                                                                                    
-/// ```                                                                                                 
+///    ┌─┬─┐         │                                 Partition                       Partition
+///   ││E│C│ ...  ├──┘
+///    └─┴─┘
+///   └ ─ ─ ─ ─ ─ ┘
+///    Partition 2
+/// ```
 ///
 ///
-/// The [`SortExec`] + [`SortPreservingMergeExec`] cascades
-/// sorts each partition first, then merge partitions while retaining the sort:
+/// A [`SortExec`] + [`SortPreservingMergeExec`] cascade sorts each partition
+/// first, then merges partitions while preserving the sort:
 /// ```text
-///   ┌ ─ ─ ─ ─ ─ ┐   ┌────────┐   ┌ ─ ─ ─ ─ ─ ┐                                                 
-///    ┌─┬─┬─┐        │        │    ┌─┬─┬─┐                                                      
-///   ││B│A│D│... │──▶│  Sort  │──▶││A│B│D│... │──┐                                              
-///    └─┴─┴─┘        │        │    └─┴─┴─┘       │                                              
+///   ┌ ─ ─ ─ ─ ─ ┐   ┌────────┐   ┌ ─ ─ ─ ─ ─ ┐
+///    ┌─┬─┬─┐        │        │    ┌─┬─┬─┐
+///   ││B│A│D│... │──▶│  Sort  │──▶││A│B│D│... │──┐
+///    └─┴─┴─┘        │        │    └─┴─┴─┘       │
 ///   └ ─ ─ ─ ─ ─ ┘   └────────┘   └ ─ ─ ─ ─ ─ ┘  │  ┌─────────────────────┐    ┌ ─ ─ ─ ─ ─ ─ ─ ┐
-///    Partition 1                  Partition 1   │  │                     │     ┌─┬─┬─┬─┬─┐     
+///    Partition 1                  Partition 1   │  │                     │     ┌─┬─┬─┬─┬─┐
 ///                                               ├──▶ SortPreservingMerge ├───▶││A│B│C│D│E│... │
-///                                               │  │                     │     └─┴─┴─┴─┴─┘     
+///                                               │  │                     │     └─┴─┴─┴─┴─┘
 ///   ┌ ─ ─ ─ ─ ─ ┐   ┌────────┐   ┌ ─ ─ ─ ─ ─ ┐  │  └─────────────────────┘    └ ─ ─ ─ ─ ─ ─ ─ ┘
-///    ┌─┬─┐          │        │    ┌─┬─┐         │                               Partition      
-///   ││E│C│ ...  │──▶│  Sort  ├──▶││C│E│ ...  │──┘                                              
-///    └─┴─┘          │        │    └─┴─┘                                                        
-///   └ ─ ─ ─ ─ ─ ┘   └────────┘   └ ─ ─ ─ ─ ─ ┘                                                 
-///    Partition 2                  Partition 2                                                  
+///    ┌─┬─┐          │        │    ┌─┬─┐         │                               Partition
+///   ││E│C│ ...  │──▶│  Sort  ├──▶││C│E│ ...  │──┘
+///    └─┴─┘          │        │    └─┴─┘
+///   └ ─ ─ ─ ─ ─ ┘   └────────┘   └ ─ ─ ─ ─ ─ ┘
+///    Partition 2                  Partition 2
 /// ```
 ///
-/// The latter [`SortExec`] + [`SortPreservingMergeExec`] cascade performs the
-/// sort first on a per-partition basis, thereby parallelizing the sort.
-///
+/// The latter [`SortExec`] + [`SortPreservingMergeExec`] cascade performs
+/// sorting first on a per-partition basis, thereby parallelizing the sort.
 ///
 /// The outcome is that plans of the form
 /// ```text
@@ -348,16 +361,32 @@ fn replace_with_partial_sort(
 ///      "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
 /// ```
 /// by following connections from [`CoalescePartitionsExec`]s to [`SortExec`]s.
-/// By performing sorting in parallel, we can increase performance in some scenarios.
+/// By performing sorting in parallel, we can increase performance in some
+/// scenarios.
 ///
-/// This requires that there are no nodes between the [`SortExec`] and [`CoalescePartitionsExec`]
-/// which require single partitioning. Do not parallelize when the following scenario occurs:
+/// This optimization requires that there are no nodes between the [`SortExec`]
+/// and the [`CoalescePartitionsExec`], which requires single partitioning. Do
+/// not parallelize when the following scenario occurs:
 /// ```text
 ///      "SortExec: expr=\[a@0 ASC\]",
 ///      "  ...nodes requiring single partitioning..."
 ///      "    CoalescePartitionsExec",
 ///      "      RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1",
 /// ```
+///
+/// **Steps**
+/// 1. Checks if the plan is either a [`SortExec`], a [`SortPreservingMergeExec`],
+///    or a [`CoalescePartitionsExec`]. Otherwise, does nothing.
+/// 2. If the plan is a [`SortExec`] or a final [`SortPreservingMergeExec`]
+///    (i.e. output partitioning is 1):
+///      - Check for [`CoalescePartitionsExec`] in children. If found, check if
+///        it can be removed (with possible [`RepartitionExec`]s). If so, remove
+///        (see `remove_bottleneck_in_subplan`).
+///      - If the plan is satisfying the ordering requirements, add a `SortExec`.
+///      - Add an SPM above the plan and return.
+/// 3. If the plan is a [`CoalescePartitionsExec`]:
+///      - Check if it can be removed (with possible [`RepartitionExec`]s).
+///        If so, remove (see `remove_bottleneck_in_subplan`).
 pub fn parallelize_sorts(
     mut requirements: PlanWithCorrespondingCoalescePartitions,
 ) -> Result<Transformed<PlanWithCorrespondingCoalescePartitions>> {
@@ -388,7 +417,7 @@ pub fn parallelize_sorts(
         // deals with the children and their children and so on.
         requirements = requirements.children.swap_remove(0);
 
-        requirements = add_sort_above_with_check(requirements, sort_reqs, fetch);
+        requirements = add_sort_above_with_check(requirements, sort_reqs, fetch)?;
 
         let spm =
             SortPreservingMergeExec::new(sort_exprs, Arc::clone(&requirements.plan));
@@ -424,6 +453,25 @@ pub fn parallelize_sorts(
 
 /// This function enforces sorting requirements and makes optimizations without
 /// violating these requirements whenever possible. Requires a bottom-up traversal.
+///
+/// **Steps**
+/// 1. Analyze if there are any immediate removals of [`SortExec`]s. If so,
+///    removes them (see `analyze_immediate_sort_removal`).
+/// 2. For each child of the plan, if the plan requires an input ordering:
+///      - Checks if ordering is satisfied with the child. If not:
+///          - If the child has an output ordering, removes the unnecessary
+///            `SortExec`.
+///          - Adds sort above the child plan.
+///      - (Plan not requires input ordering)
+///          - Checks if the `SortExec` is neutralized in the plan. If so,
+///            removes it.
+/// 3. Check and modify window operator:
+///      - Checks if the plan is a window operator, and connected with a sort.
+///        If so, either tries to update the window definition or removes
+///        unnecessary [`SortExec`]s (see `adjust_window_sort_removal`).
+/// 4. Check and remove possibly unnecessary SPM:
+///       -  Checks if the plan is SPM and child 1 output partitions, if so
+///          decides this SPM is unnecessary and removes it from the plan.
 pub fn ensure_sorting(
     mut requirements: PlanWithCorrespondingSort,
 ) -> Result<Transformed<PlanWithCorrespondingSort>> {
@@ -433,7 +481,7 @@ pub fn ensure_sorting(
     if requirements.children.is_empty() {
         return Ok(Transformed::no(requirements));
     }
-    let maybe_requirements = analyze_immediate_sort_removal(requirements);
+    let maybe_requirements = analyze_immediate_sort_removal(requirements)?;
     requirements = if !maybe_requirements.transformed {
         maybe_requirements.data
     } else {
@@ -452,18 +500,23 @@ pub fn ensure_sorting(
 
         if let Some(required) = required_ordering {
             let eq_properties = child.plan.equivalence_properties();
-            if !eq_properties.ordering_satisfy_requirement(&required) {
+            let req = required.into_single();
+            if !eq_properties.ordering_satisfy_requirement(req.clone())? {
                 // Make sure we preserve the ordering requirements:
                 if physical_ordering.is_some() {
                     child = update_child_to_remove_unnecessary_sort(idx, child, plan)?;
                 }
-                child = add_sort_above(child, required, None);
+                child = add_sort_above(
+                    child,
+                    req,
+                    plan.as_any()
+                        .downcast_ref::<OutputRequirementExec>()
+                        .map(|output| output.fetch())
+                        .unwrap_or(None),
+                );
                 child = update_sort_ctx_children_data(child, true)?;
             }
-        } else if physical_ordering.is_none()
-            || !plan.maintains_input_order()[idx]
-            || is_union(plan)
-        {
+        } else if physical_ordering.is_none() || !plan.maintains_input_order()[idx] {
             // We have a `SortExec` whose effect may be neutralized by another
             // order-imposing operator, remove this sort:
             child = update_child_to_remove_unnecessary_sort(idx, child, plan)?;
@@ -493,60 +546,62 @@ pub fn ensure_sorting(
     update_sort_ctx_children_data(requirements, false).map(Transformed::yes)
 }
 
-/// Analyzes a given [`SortExec`] (`plan`) to determine whether its input
-/// already has a finer ordering than it enforces.
+/// Analyzes if there are any immediate sort removals by checking the `SortExec`s
+/// and their ordering requirement satisfactions with children
+/// If the sort is unnecessary, either replaces it with [`SortPreservingMergeExec`]/`LimitExec`
+/// or removes the [`SortExec`].
+/// Otherwise, returns the original plan
 fn analyze_immediate_sort_removal(
     mut node: PlanWithCorrespondingSort,
-) -> Transformed<PlanWithCorrespondingSort> {
-    if let Some(sort_exec) = node.plan.as_any().downcast_ref::<SortExec>() {
-        let sort_input = sort_exec.input();
-        // If this sort is unnecessary, we should remove it:
-        if sort_input.equivalence_properties().ordering_satisfy(
-            sort_exec
-                .properties()
-                .output_ordering()
-                .unwrap_or_else(|| LexOrdering::empty()),
-        ) {
-            node.plan = if !sort_exec.preserve_partitioning()
-                && sort_input.output_partitioning().partition_count() > 1
-            {
-                // Replace the sort with a sort-preserving merge:
-                let expr = LexOrdering::new(sort_exec.expr().to_vec());
-                Arc::new(
-                    SortPreservingMergeExec::new(expr, Arc::clone(sort_input))
-                        .with_fetch(sort_exec.fetch()),
-                ) as _
+) -> Result<Transformed<PlanWithCorrespondingSort>> {
+    let Some(sort_exec) = node.plan.as_any().downcast_ref::<SortExec>() else {
+        return Ok(Transformed::no(node));
+    };
+    let sort_input = sort_exec.input();
+    // Check if the sort is unnecessary:
+    let properties = sort_exec.properties();
+    if let Some(ordering) = properties.output_ordering().cloned() {
+        let eqp = sort_input.equivalence_properties();
+        if !eqp.ordering_satisfy(ordering)? {
+            return Ok(Transformed::no(node));
+        }
+    }
+    node.plan = if !sort_exec.preserve_partitioning()
+        && sort_input.output_partitioning().partition_count() > 1
+    {
+        // Replace the sort with a sort-preserving merge:
+        Arc::new(
+            SortPreservingMergeExec::new(
+                sort_exec.expr().clone(),
+                Arc::clone(sort_input),
+            )
+            .with_fetch(sort_exec.fetch()),
+        ) as _
+    } else {
+        // Remove the sort:
+        node.children = node.children.swap_remove(0).children;
+        if let Some(fetch) = sort_exec.fetch() {
+            let required_ordering = sort_exec.properties().output_ordering().cloned();
+            // If the sort has a fetch, we need to add a limit:
+            if properties.output_partitioning().partition_count() == 1 {
+                let mut global_limit =
+                    GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch));
+                global_limit.set_required_ordering(required_ordering);
+                Arc::new(global_limit)
             } else {
-                // Remove the sort:
-                node.children = node.children.swap_remove(0).children;
-                if let Some(fetch) = sort_exec.fetch() {
-                    // If the sort has a fetch, we need to add a limit:
-                    if sort_exec
-                        .properties()
-                        .output_partitioning()
-                        .partition_count()
-                        == 1
-                    {
-                        Arc::new(GlobalLimitExec::new(
-                            Arc::clone(sort_input),
-                            0,
-                            Some(fetch),
-                        ))
-                    } else {
-                        Arc::new(LocalLimitExec::new(Arc::clone(sort_input), fetch))
-                    }
-                } else {
-                    Arc::clone(sort_input)
-                }
-            };
-            for child in node.children.iter_mut() {
-                child.data = false;
+                let mut local_limit = LocalLimitExec::new(Arc::clone(sort_input), fetch);
+                local_limit.set_required_ordering(required_ordering);
+                Arc::new(local_limit)
             }
-            node.data = false;
-            return Transformed::yes(node);
+        } else {
+            Arc::clone(sort_input)
         }
+    };
+    for child in node.children.iter_mut() {
+        child.data = false;
     }
-    Transformed::no(node)
+    node.data = false;
+    Ok(Transformed::yes(node))
 }
 
 /// Adjusts a [`WindowAggExec`] or a [`BoundedWindowAggExec`] to determine
@@ -587,15 +642,13 @@ fn adjust_window_sort_removal(
     } else {
         // We were unable to change the window to accommodate the input, so we
         // will insert a sort.
-        let reqs = window_tree
-            .plan
-            .required_input_ordering()
-            .swap_remove(0)
-            .unwrap_or_default();
+        let reqs = window_tree.plan.required_input_ordering().swap_remove(0);
 
         // Satisfy the ordering requirement so that the window can run:
         let mut child_node = window_tree.children.swap_remove(0);
-        child_node = add_sort_above(child_node, reqs, None);
+        if let Some(reqs) = reqs {
+            child_node = add_sort_above(child_node, reqs.into_single(), None);
+        }
         let child_plan = Arc::clone(&child_node.plan);
         window_tree.children.push(child_node);
 
@@ -742,8 +795,7 @@ fn remove_corresponding_sort_from_sub_plan(
         let fetch = plan.fetch();
         let plan = if let Some(ordering) = plan.output_ordering() {
             Arc::new(
-                SortPreservingMergeExec::new(LexOrdering::new(ordering.to_vec()), plan)
-                    .with_fetch(fetch),
+                SortPreservingMergeExec::new(ordering.clone(), plan).with_fetch(fetch),
             ) as _
         } else {
             Arc::new(CoalescePartitionsExec::new(plan)) as _
diff --git a/datafusion/physical-optimizer/src/enforce_sorting/replace_with_order_preserving_variants.rs b/datafusion/physical-optimizer/src/enforce_sorting/replace_with_order_preserving_variants.rs
index 9769e2e0366f7..6ab84dc95eab9 100644
--- a/datafusion/physical-optimizer/src/enforce_sorting/replace_with_order_preserving_variants.rs
+++ b/datafusion/physical-optimizer/src/enforce_sorting/replace_with_order_preserving_variants.rs
@@ -27,16 +27,13 @@ use crate::utils::{
 
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::Transformed;
-use datafusion_physical_expr::LexOrdering;
-use datafusion_physical_plan::internal_err;
-
-use datafusion_common::Result;
+use datafusion_common::{Result, assert_or_internal_err};
+use datafusion_physical_plan::ExecutionPlanProperties;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::execution_plan::EmissionType;
 use datafusion_physical_plan::repartition::RepartitionExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::tree_node::PlanContext;
-use datafusion_physical_plan::ExecutionPlanProperties;
 
 use itertools::izip;
 
@@ -142,16 +139,21 @@ pub fn plan_with_order_preserving_variants(
         if let Some(ordering) = child.output_ordering() {
             let mut fetch = fetch;
             if let Some(coalesce_fetch) = sort_input.plan.fetch() {
-                if let Some(sort_fetch) = fetch {
-                    if coalesce_fetch < sort_fetch {
-                        return internal_err!(
-                                "CoalescePartitionsExec fetch [{:?}] should be greater than or equal to SortExec fetch [{:?}]", coalesce_fetch, sort_fetch
-                            );
+                fetch = match fetch {
+                    Some(sort_fetch) => {
+                        assert_or_internal_err!(
+                            coalesce_fetch >= sort_fetch,
+                            "CoalescePartitionsExec fetch [{:?}] should be greater than or equal to SortExec fetch [{:?}]",
+                            coalesce_fetch,
+                            sort_fetch
+                        );
+                        Some(sort_fetch)
                     }
-                } else {
-                    // If the sort node does not have a fetch, we need to keep the coalesce node's fetch.
-                    fetch = Some(coalesce_fetch);
-                }
+                    None => {
+                        // If the sort node does not have a fetch, we need to keep the coalesce node's fetch.
+                        Some(coalesce_fetch)
+                    }
+                };
             };
             // When the input of a `CoalescePartitionsExec` has an ordering,
             // replace it with a `SortPreservingMergeExec` if appropriate:
@@ -181,18 +183,17 @@ pub fn plan_with_order_breaking_variants(
     .map(|(node, maintains, required_ordering)| {
         // Replace with non-order preserving variants as long as ordering is
         // not required by intermediate operators:
-        if maintains
-            && (is_sort_preserving_merge(plan)
-                || !required_ordering.is_some_and(|required_ordering| {
-                    node.plan
-                        .equivalence_properties()
-                        .ordering_satisfy_requirement(&required_ordering)
-                }))
-        {
-            plan_with_order_breaking_variants(node)
-        } else {
-            Ok(node)
+        if !maintains {
+            return Ok(node);
+        } else if is_sort_preserving_merge(plan) {
+            return plan_with_order_breaking_variants(node);
+        } else if let Some(required_ordering) = required_ordering {
+            let eqp = node.plan.equivalence_properties();
+            if eqp.ordering_satisfy_requirement(required_ordering.into_single())? {
+                return Ok(node);
+            }
         }
+        plan_with_order_breaking_variants(node)
     })
     .collect::<Result<_>>()?;
     sort_input.data = false;
@@ -281,25 +282,18 @@ pub fn replace_with_order_preserving_variants(
     )?;
 
     // If the alternate plan makes this sort unnecessary, accept the alternate:
-    if alternate_plan
-        .plan
-        .equivalence_properties()
-        .ordering_satisfy(
-            requirements
-                .plan
-                .output_ordering()
-                .unwrap_or_else(|| LexOrdering::empty()),
-        )
-    {
-        for child in alternate_plan.children.iter_mut() {
-            child.data = false;
+    if let Some(ordering) = requirements.plan.output_ordering() {
+        let eqp = alternate_plan.plan.equivalence_properties();
+        if !eqp.ordering_satisfy(ordering.clone())? {
+            // The alternate plan does not help, use faster order-breaking variants:
+            alternate_plan = plan_with_order_breaking_variants(alternate_plan)?;
+            alternate_plan.data = false;
+            requirements.children = vec![alternate_plan];
+            return Ok(Transformed::yes(requirements));
         }
-        Ok(Transformed::yes(alternate_plan))
-    } else {
-        // The alternate plan does not help, use faster order-breaking variants:
-        alternate_plan = plan_with_order_breaking_variants(alternate_plan)?;
-        alternate_plan.data = false;
-        requirements.children = vec![alternate_plan];
-        Ok(Transformed::yes(requirements))
     }
+    for child in alternate_plan.children.iter_mut() {
+        child.data = false;
+    }
+    Ok(Transformed::yes(alternate_plan))
 }
diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
index 6d2c014f9e7cc..2d9bfe217f40e 100644
--- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
+++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
@@ -24,15 +24,22 @@ use crate::utils::{
 
 use arrow::datatypes::SchemaRef;
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_common::{plan_err, HashSet, JoinSide, Result};
+use datafusion_common::{HashSet, JoinSide, Result, internal_err};
 use datafusion_expr::JoinType;
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::utils::collect_columns;
-use datafusion_physical_expr::PhysicalSortRequirement;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_expr::{
+    EquivalenceProperties, add_offset_to_physical_sort_exprs,
+};
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortExpr,
+    PhysicalSortRequirement,
+};
+use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::execution_plan::CardinalityEffect;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::{
-    calculate_join_output_ordering, ColumnIndex,
+    ColumnIndex, calculate_join_output_ordering,
 };
 use datafusion_physical_plan::joins::{HashJoinExec, SortMergeJoinExec};
 use datafusion_physical_plan::projection::ProjectionExec;
@@ -50,7 +57,7 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 /// [`EnforceSorting`]: crate::enforce_sorting::EnforceSorting
 #[derive(Default, Clone, Debug)]
 pub struct ParentRequirements {
-    ordering_requirement: Option<LexRequirement>,
+    ordering_requirement: Option<OrderingRequirements>,
     fetch: Option<usize>,
 }
 
@@ -69,6 +76,7 @@ pub fn assign_initial_requirements(sort_push_down: &mut SortPushDown) {
     }
 }
 
+/// Tries to push down the sort requirements as far as possible, if decides a `SortExec` is unnecessary removes it.
 pub fn pushdown_sorts(sort_push_down: SortPushDown) -> Result<SortPushDown> {
     sort_push_down
         .transform_down(pushdown_sorts_helper)
@@ -87,91 +95,108 @@ fn min_fetch(f1: Option<usize>, f2: Option<usize>) -> Option<usize> {
 fn pushdown_sorts_helper(
     mut sort_push_down: SortPushDown,
 ) -> Result<Transformed<SortPushDown>> {
-    let plan = &sort_push_down.plan;
-    let parent_reqs = sort_push_down
-        .data
-        .ordering_requirement
-        .clone()
-        .unwrap_or_default();
-    let satisfy_parent = plan
-        .equivalence_properties()
-        .ordering_satisfy_requirement(&parent_reqs);
-
-    if is_sort(plan) {
-        let current_sort_fetch = plan.fetch();
-        let parent_req_fetch = sort_push_down.data.fetch;
+    let plan = sort_push_down.plan;
+    let parent_fetch = sort_push_down.data.fetch;
 
-        let current_plan_reqs = plan
-            .output_ordering()
-            .cloned()
-            .map(LexRequirement::from)
-            .unwrap_or_default();
-        let parent_is_stricter = plan
-            .equivalence_properties()
-            .requirements_compatible(&parent_reqs, &current_plan_reqs);
-        let current_is_stricter = plan
-            .equivalence_properties()
-            .requirements_compatible(&current_plan_reqs, &parent_reqs);
+    let Some(parent_requirement) = sort_push_down.data.ordering_requirement.clone()
+    else {
+        // If there are no ordering requirements from the parent, nothing to do
+        // unless we have a sort.
+        if is_sort(&plan) {
+            let Some(sort_ordering) = plan.output_ordering().cloned() else {
+                return internal_err!("SortExec should have output ordering");
+            };
+            // The sort is unnecessary, just propagate the stricter fetch and
+            // ordering requirements.
+            let fetch = min_fetch(plan.fetch(), parent_fetch);
+            sort_push_down = sort_push_down
+                .children
+                .swap_remove(0)
+                .update_plan_from_children()?;
+            sort_push_down.data.fetch = fetch;
+            sort_push_down.data.ordering_requirement =
+                Some(OrderingRequirements::from(sort_ordering));
+            // Recursive call to helper, so it doesn't transform_down and miss
+            // the new node (previous child of sort):
+            return pushdown_sorts_helper(sort_push_down);
+        }
+        sort_push_down.plan = plan;
+        return Ok(Transformed::no(sort_push_down));
+    };
 
-        if !satisfy_parent && !parent_is_stricter {
-            // This new sort has different requirements than the ordering being pushed down.
-            // 1. add a `SortExec` here for the pushed down ordering (parent reqs).
-            // 2. continue sort pushdown, but with the new ordering of the new sort.
+    let eqp = plan.equivalence_properties();
+    let satisfy_parent =
+        eqp.ordering_satisfy_requirement(parent_requirement.first().clone())?;
 
-            // remove current sort (which will be the new ordering to pushdown)
-            let new_reqs = current_plan_reqs;
-            sort_push_down = sort_push_down.children.swap_remove(0);
-            sort_push_down = sort_push_down.update_plan_from_children()?; // changed plan
+    if is_sort(&plan) {
+        let Some(sort_ordering) = plan.output_ordering().cloned() else {
+            return internal_err!("SortExec should have output ordering");
+        };
 
-            // add back sort exec matching parent
-            sort_push_down =
-                add_sort_above(sort_push_down, parent_reqs, parent_req_fetch);
+        let sort_fetch = plan.fetch();
+        let parent_is_stricter = eqp.requirements_compatible(
+            parent_requirement.first().clone(),
+            sort_ordering.clone().into(),
+        );
 
-            // make pushdown requirements be the new ones.
+        // Remove the current sort as we are either going to prove that it is
+        // unnecessary, or replace it with a stricter sort.
+        sort_push_down = sort_push_down
+            .children
+            .swap_remove(0)
+            .update_plan_from_children()?;
+        if !satisfy_parent && !parent_is_stricter {
+            // The sort was imposing a different ordering than the one being
+            // pushed down. Replace it with a sort that matches the pushed-down
+            // ordering, and continue the pushdown.
+            // Add back the sort:
+            sort_push_down = add_sort_above(
+                sort_push_down,
+                parent_requirement.into_single(),
+                parent_fetch,
+            );
+            // Update pushdown requirements:
             sort_push_down.children[0].data = ParentRequirements {
-                ordering_requirement: Some(new_reqs),
-                fetch: current_sort_fetch,
+                ordering_requirement: Some(OrderingRequirements::from(sort_ordering)),
+                fetch: sort_fetch,
             };
+            return Ok(Transformed::yes(sort_push_down));
         } else {
-            // Don't add a SortExec
-            // Do update what sort requirements to keep pushing down
-
-            // remove current sort, and get the sort's child
-            sort_push_down = sort_push_down.children.swap_remove(0);
-            sort_push_down = sort_push_down.update_plan_from_children()?; // changed plan
-
-            // set the stricter fetch
-            sort_push_down.data.fetch = min_fetch(current_sort_fetch, parent_req_fetch);
-
-            // set the stricter ordering
-            if current_is_stricter {
-                sort_push_down.data.ordering_requirement = Some(current_plan_reqs);
+            // Sort was unnecessary, just propagate the stricter fetch and
+            // ordering requirements:
+            sort_push_down.data.fetch = min_fetch(sort_fetch, parent_fetch);
+            let current_is_stricter = eqp.requirements_compatible(
+                sort_ordering.clone().into(),
+                parent_requirement.first().clone(),
+            );
+            sort_push_down.data.ordering_requirement = if current_is_stricter {
+                Some(OrderingRequirements::from(sort_ordering))
             } else {
-                sort_push_down.data.ordering_requirement = Some(parent_reqs);
-            }
-
-            // recursive call to helper, so it doesn't transform_down and miss the new node (previous child of sort)
+                Some(parent_requirement)
+            };
+            // Recursive call to helper, so it doesn't transform_down and miss
+            // the new node (previous child of sort):
             return pushdown_sorts_helper(sort_push_down);
         }
-    } else if parent_reqs.is_empty() {
-        // note: this `satisfy_parent`, but we don't want to push down anything.
-        // Nothing to do.
-        return Ok(Transformed::no(sort_push_down));
-    } else if satisfy_parent {
+    }
+
+    sort_push_down.plan = plan;
+    if satisfy_parent {
         // For non-sort operators which satisfy ordering:
-        let reqs = plan.required_input_ordering();
-        let parent_req_fetch = sort_push_down.data.fetch;
+        let reqs = sort_push_down.plan.required_input_ordering();
 
         for (child, order) in sort_push_down.children.iter_mut().zip(reqs) {
             child.data.ordering_requirement = order;
-            child.data.fetch = min_fetch(parent_req_fetch, child.data.fetch);
+            child.data.fetch = min_fetch(parent_fetch, child.data.fetch);
         }
-    } else if let Some(adjusted) = pushdown_requirement_to_children(plan, &parent_reqs)? {
-        // For operators that can take a sort pushdown.
-
-        // Continue pushdown, with updated requirements:
-        let parent_fetch = sort_push_down.data.fetch;
-        let current_fetch = plan.fetch();
+    } else if let Some(adjusted) = pushdown_requirement_to_children(
+        &sort_push_down.plan,
+        parent_requirement.clone(),
+        parent_fetch,
+    )? {
+        // For operators that can take a sort pushdown, continue with updated
+        // requirements:
+        let current_fetch = sort_push_down.plan.fetch();
         for (child, order) in sort_push_down.children.iter_mut().zip(adjusted) {
             child.data.ordering_requirement = order;
             child.data.fetch = min_fetch(current_fetch, parent_fetch);
@@ -179,16 +204,13 @@ fn pushdown_sorts_helper(
         sort_push_down.data.ordering_requirement = None;
     } else {
         // Can not push down requirements, add new `SortExec`:
-        let sort_reqs = sort_push_down
-            .data
-            .ordering_requirement
-            .clone()
-            .unwrap_or_default();
-        let fetch = sort_push_down.data.fetch;
-        sort_push_down = add_sort_above(sort_push_down, sort_reqs, fetch);
+        sort_push_down = add_sort_above(
+            sort_push_down,
+            parent_requirement.into_single(),
+            parent_fetch,
+        );
         assign_initial_requirements(&mut sort_push_down);
     }
-
     Ok(Transformed::yes(sort_push_down))
 }
 
@@ -196,21 +218,52 @@ fn pushdown_sorts_helper(
 /// If sort cannot be pushed down, return None.
 fn pushdown_requirement_to_children(
     plan: &Arc<dyn ExecutionPlan>,
-    parent_required: &LexRequirement,
-) -> Result<Option<Vec<Option<LexRequirement>>>> {
+    parent_required: OrderingRequirements,
+    parent_fetch: Option<usize>,
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
+    // If there is a limit on the parent plan we cannot push it down through operators that change the cardinality.
+    // E.g. consider if LIMIT 2 is applied below a FilteExec that filters out 1/2 of the rows we'll end up with 1 row instead of 2.
+    // If the LIMIT is applied after the FilterExec and the FilterExec returns > 2 rows we'll end up with 2 rows (correct).
+    if parent_fetch.is_some() && !plan.supports_limit_pushdown() {
+        return Ok(None);
+    }
+    // Note: we still need to check the cardinality effect of the plan here, because the
+    // limit pushdown is not always safe, even if the plan supports it. Here's an example:
+    //
+    // UnionExec advertises `supports_limit_pushdown() == true` because it can
+    // forward a LIMIT k to each of its children—i.e. apply “LIMIT k” separately
+    // on each branch before merging them together.
+    //
+    // However, UnionExec’s `cardinality_effect() == GreaterEqual` (it sums up
+    // all child row counts), so pushing a global TopK/LIMIT through it would
+    // break the semantics of “take the first k rows of the combined result.”
+    //
+    // For example, with two branches A and B and k = 3:
+    //   — Global LIMIT: take the first 3 rows from (A ∪ B) after merging.
+    //   — Pushed down: take 3 from A, 3 from B, then merge → up to 6 rows!
+    //
+    // That’s why we still block on cardinality: even though UnionExec can
+    // push a LIMIT to its children, its GreaterEqual effect means it cannot
+    // preserve the global TopK semantics.
+    if parent_fetch.is_some() {
+        match plan.cardinality_effect() {
+            CardinalityEffect::Equal => {
+                // safe: only true sources (e.g. CoalesceBatchesExec, ProjectionExec) pass
+            }
+            _ => return Ok(None),
+        }
+    }
+
     let maintains_input_order = plan.maintains_input_order();
     if is_window(plan) {
-        let required_input_ordering = plan.required_input_ordering();
-        let request_child = required_input_ordering[0].clone().unwrap_or_default();
+        let mut required_input_ordering = plan.required_input_ordering();
+        let maybe_child_requirement = required_input_ordering.swap_remove(0);
         let child_plan = plan.children().swap_remove(0);
-
-        match determine_children_requirement(parent_required, &request_child, child_plan)
-        {
-            RequirementsCompatibility::Satisfy => {
-                let req = (!request_child.is_empty())
-                    .then(|| LexRequirement::new(request_child.to_vec()));
-                Ok(Some(vec![req]))
-            }
+        let Some(child_req) = maybe_child_requirement else {
+            return Ok(None);
+        };
+        match determine_children_requirement(&parent_required, &child_req, child_plan) {
+            RequirementsCompatibility::Satisfy => Ok(Some(vec![Some(child_req)])),
             RequirementsCompatibility::Compatible(adjusted) => {
                 // If parent requirements are more specific than output ordering
                 // of the window plan, then we can deduce that the parent expects
@@ -218,7 +271,7 @@ fn pushdown_requirement_to_children(
                 // that's the case, we block the pushdown of sort operation.
                 if !plan
                     .equivalence_properties()
-                    .ordering_satisfy_requirement(parent_required)
+                    .ordering_satisfy_requirement(parent_required.into_single())?
                 {
                     return Ok(None);
                 }
@@ -228,82 +281,71 @@ fn pushdown_requirement_to_children(
             RequirementsCompatibility::NonCompatible => Ok(None),
         }
     } else if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
-        let sort_req = LexRequirement::from(
-            sort_exec
-                .properties()
-                .output_ordering()
-                .cloned()
-                .unwrap_or_else(LexOrdering::default),
-        );
-        if sort_exec
+        let Some(sort_ordering) = sort_exec.properties().output_ordering().cloned()
+        else {
+            return internal_err!("SortExec should have output ordering");
+        };
+        sort_exec
             .properties()
             .eq_properties
-            .requirements_compatible(parent_required, &sort_req)
-        {
-            debug_assert!(!parent_required.is_empty());
-            Ok(Some(vec![Some(LexRequirement::new(
-                parent_required.to_vec(),
-            ))]))
-        } else {
-            Ok(None)
-        }
+            .requirements_compatible(
+                parent_required.first().clone(),
+                sort_ordering.into(),
+            )
+            .then(|| Ok(vec![Some(parent_required)]))
+            .transpose()
     } else if plan.fetch().is_some()
         && plan.supports_limit_pushdown()
         && plan
             .maintains_input_order()
-            .iter()
-            .all(|maintain| *maintain)
+            .into_iter()
+            .all(|maintain| maintain)
     {
-        let output_req = LexRequirement::from(
-            plan.properties()
-                .output_ordering()
-                .cloned()
-                .unwrap_or_else(LexOrdering::default),
-        );
         // Push down through operator with fetch when:
         // - requirement is aligned with output ordering
         // - it preserves ordering during execution
-        if plan
-            .properties()
-            .eq_properties
-            .requirements_compatible(parent_required, &output_req)
-        {
-            let req = (!parent_required.is_empty())
-                .then(|| LexRequirement::new(parent_required.to_vec()));
-            Ok(Some(vec![req]))
+        let Some(ordering) = plan.properties().output_ordering() else {
+            return Ok(Some(vec![Some(parent_required)]));
+        };
+        if plan.properties().eq_properties.requirements_compatible(
+            parent_required.first().clone(),
+            ordering.clone().into(),
+        ) {
+            Ok(Some(vec![Some(parent_required)]))
         } else {
             Ok(None)
         }
     } else if is_union(plan) {
-        // UnionExec does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and
-        // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec
-        let req = (!parent_required.is_empty()).then(|| parent_required.clone());
-        Ok(Some(vec![req; plan.children().len()]))
+        // `UnionExec` does not have real sort requirements for its input, we
+        // just propagate the sort requirements down:
+        Ok(Some(vec![Some(parent_required); plan.children().len()]))
     } else if let Some(smj) = plan.as_any().downcast_ref::<SortMergeJoinExec>() {
-        // If the current plan is SortMergeJoinExec
         let left_columns_len = smj.left().schema().fields().len();
-        let parent_required_expr = LexOrdering::from(parent_required.clone());
-        match expr_source_side(
-            parent_required_expr.as_ref(),
-            smj.join_type(),
-            left_columns_len,
-        ) {
-            Some(JoinSide::Left) => try_pushdown_requirements_to_join(
+        let parent_ordering: Vec<PhysicalSortExpr> = parent_required
+            .first()
+            .iter()
+            .cloned()
+            .map(Into::into)
+            .collect();
+        let eqp = smj.properties().equivalence_properties();
+        match expr_source_side(eqp, parent_ordering, smj.join_type(), left_columns_len) {
+            Some((JoinSide::Left, ordering)) => try_pushdown_requirements_to_join(
                 smj,
-                parent_required,
-                parent_required_expr.as_ref(),
+                parent_required.into_single(),
+                ordering,
                 JoinSide::Left,
             ),
-            Some(JoinSide::Right) => {
+            Some((JoinSide::Right, ordering)) => {
                 let right_offset =
                     smj.schema().fields.len() - smj.right().schema().fields.len();
-                let new_right_required =
-                    shift_right_required(parent_required, right_offset)?;
-                let new_right_required_expr = LexOrdering::from(new_right_required);
+                let ordering = add_offset_to_physical_sort_exprs(
+                    ordering,
+                    -(right_offset as isize),
+                )?;
                 try_pushdown_requirements_to_join(
                     smj,
-                    parent_required,
-                    new_right_required_expr.as_ref(),
+                    parent_required.into_single(),
+                    ordering,
                     JoinSide::Right,
                 )
             }
@@ -312,62 +354,132 @@ fn pushdown_requirement_to_children(
                 Ok(None)
             }
         }
+    } else if let Some(aggregate_exec) = plan.as_any().downcast_ref::<AggregateExec>() {
+        handle_aggregate_pushdown(aggregate_exec, parent_required)
     } else if maintains_input_order.is_empty()
         || !maintains_input_order.iter().any(|o| *o)
         || plan.as_any().is::<RepartitionExec>()
         || plan.as_any().is::<FilterExec>()
         // TODO: Add support for Projection push down
         || plan.as_any().is::<ProjectionExec>()
-        || pushdown_would_violate_requirements(parent_required, plan.as_ref())
+        || pushdown_would_violate_requirements(&parent_required, plan.as_ref())
     {
         // If the current plan is a leaf node or can not maintain any of the input ordering, can not pushed down requirements.
         // For RepartitionExec, we always choose to not push down the sort requirements even the RepartitionExec(input_partition=1) could maintain input ordering.
         // Pushing down is not beneficial
         Ok(None)
     } else if is_sort_preserving_merge(plan) {
-        let new_ordering = LexOrdering::from(parent_required.clone());
+        let new_ordering = LexOrdering::from(parent_required.first().clone());
         let mut spm_eqs = plan.equivalence_properties().clone();
+        let old_ordering = spm_eqs.output_ordering().unwrap();
         // Sort preserving merge will have new ordering, one requirement above is pushed down to its below.
-        spm_eqs = spm_eqs.with_reorder(new_ordering);
-        // Do not push-down through SortPreservingMergeExec when
-        // ordering requirement invalidates requirement of sort preserving merge exec.
-        if !spm_eqs.ordering_satisfy(&plan.output_ordering().cloned().unwrap_or_default())
-        {
-            Ok(None)
-        } else {
+        let change = spm_eqs.reorder(new_ordering)?;
+        if !change || spm_eqs.ordering_satisfy(old_ordering)? {
             // Can push-down through SortPreservingMergeExec, because parent requirement is finer
             // than SortPreservingMergeExec output ordering.
-            let req = (!parent_required.is_empty())
-                .then(|| LexRequirement::new(parent_required.to_vec()));
-            Ok(Some(vec![req]))
+            Ok(Some(vec![Some(parent_required)]))
+        } else {
+            // Do not push-down through SortPreservingMergeExec when
+            // ordering requirement invalidates requirement of sort preserving merge exec.
+            Ok(None)
         }
     } else if let Some(hash_join) = plan.as_any().downcast_ref::<HashJoinExec>() {
         handle_hash_join(hash_join, parent_required)
     } else {
-        handle_custom_pushdown(plan, parent_required, maintains_input_order)
+        handle_custom_pushdown(plan, parent_required, &maintains_input_order)
     }
     // TODO: Add support for Projection push down
 }
 
+/// Try to push sorting through  [`AggregateExec`]
+///
+/// `AggregateExec` only preserves the input order of its group by columns
+/// (not aggregates in general, which are formed from arbitrary expressions over
+/// input)
+///
+/// Thus function rewrites the parent required ordering in terms of the
+/// aggregate input if possible. This rewritten requirement represents the
+/// ordering of the `AggregateExec`'s **input** that would also satisfy the
+/// **parent** ordering.
+///
+/// If no such mapping is possible (e.g. because the sort references aggregate
+/// columns), returns None.
+fn handle_aggregate_pushdown(
+    aggregate_exec: &AggregateExec,
+    parent_required: OrderingRequirements,
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
+    if !aggregate_exec
+        .maintains_input_order()
+        .into_iter()
+        .any(|o| o)
+    {
+        return Ok(None);
+    }
+
+    let group_expr = aggregate_exec.group_expr();
+    // GROUPING SETS introduce additional output columns and NULL substitutions;
+    // skip pushdown until we can map those cases safely.
+    if group_expr.has_grouping_set() {
+        return Ok(None);
+    }
+
+    let group_input_exprs = group_expr.input_exprs();
+    let parent_requirement = parent_required.into_single();
+    let mut child_requirement = Vec::with_capacity(parent_requirement.len());
+
+    for req in parent_requirement {
+        // Sort above AggregateExec should reference its output columns. Map each
+        // output group-by column to its original input expression.
+        let Some(column) = req.expr.as_any().downcast_ref::<Column>() else {
+            return Ok(None);
+        };
+        if column.index() >= group_input_exprs.len() {
+            // AggregateExec does not produce output that is sorted on aggregate
+            // columns so those can not be pushed through.
+            return Ok(None);
+        }
+        child_requirement.push(PhysicalSortRequirement::new(
+            Arc::clone(&group_input_exprs[column.index()]),
+            req.options,
+        ));
+    }
+
+    let Some(child_requirement) = LexRequirement::new(child_requirement) else {
+        return Ok(None);
+    };
+
+    // Keep sort above aggregate unless input ordering already satisfies the
+    // mapped requirement.
+    if aggregate_exec
+        .input()
+        .equivalence_properties()
+        .ordering_satisfy_requirement(child_requirement.iter().cloned())?
+    {
+        let child_requirements = OrderingRequirements::new(child_requirement);
+        Ok(Some(vec![Some(child_requirements)]))
+    } else {
+        Ok(None)
+    }
+}
+
 /// Return true if pushing the sort requirements through a node would violate
 /// the input sorting requirements for the plan
 fn pushdown_would_violate_requirements(
-    parent_required: &LexRequirement,
+    parent_required: &OrderingRequirements,
     child: &dyn ExecutionPlan,
 ) -> bool {
     child
         .required_input_ordering()
-        .iter()
+        .into_iter()
+        // If there is no requirement, pushing down would not violate anything.
+        .flatten()
         .any(|child_required| {
-            let Some(child_required) = child_required.as_ref() else {
-                // no requirements, so pushing down would not violate anything
-                return false;
-            };
-            // check if the plan's requirements would still e satisfied if we pushed
-            // down the parent requirements
+            // Check if the plan's requirements would still be satisfied if we
+            // pushed down the parent requirements:
             child_required
+                .into_single()
                 .iter()
-                .zip(parent_required.iter())
+                .zip(parent_required.first().iter())
                 .all(|(c, p)| !c.compatible(p))
         })
 }
@@ -378,25 +490,24 @@ fn pushdown_would_violate_requirements(
 /// - If parent requirements are more specific, push down parent requirements.
 /// - If they are not compatible, need to add a sort.
 fn determine_children_requirement(
-    parent_required: &LexRequirement,
-    request_child: &LexRequirement,
+    parent_required: &OrderingRequirements,
+    child_requirement: &OrderingRequirements,
     child_plan: &Arc<dyn ExecutionPlan>,
 ) -> RequirementsCompatibility {
-    if child_plan
-        .equivalence_properties()
-        .requirements_compatible(request_child, parent_required)
-    {
+    let eqp = child_plan.equivalence_properties();
+    if eqp.requirements_compatible(
+        child_requirement.first().clone(),
+        parent_required.first().clone(),
+    ) {
         // Child requirements are more specific, no need to push down.
         RequirementsCompatibility::Satisfy
-    } else if child_plan
-        .equivalence_properties()
-        .requirements_compatible(parent_required, request_child)
-    {
+    } else if eqp.requirements_compatible(
+        parent_required.first().clone(),
+        child_requirement.first().clone(),
+    ) {
         // Parent requirements are more specific, adjust child's requirements
         // and push down the new requirements:
-        let adjusted = (!parent_required.is_empty())
-            .then(|| LexRequirement::new(parent_required.to_vec()));
-        RequirementsCompatibility::Compatible(adjusted)
+        RequirementsCompatibility::Compatible(Some(parent_required.clone()))
     } else {
         RequirementsCompatibility::NonCompatible
     }
@@ -404,42 +515,41 @@ fn determine_children_requirement(
 
 fn try_pushdown_requirements_to_join(
     smj: &SortMergeJoinExec,
-    parent_required: &LexRequirement,
-    sort_expr: &LexOrdering,
+    parent_required: LexRequirement,
+    sort_exprs: Vec<PhysicalSortExpr>,
     push_side: JoinSide,
-) -> Result<Option<Vec<Option<LexRequirement>>>> {
-    let left_eq_properties = smj.left().equivalence_properties();
-    let right_eq_properties = smj.right().equivalence_properties();
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
     let mut smj_required_orderings = smj.required_input_ordering();
-    let right_requirement = smj_required_orderings.swap_remove(1);
-    let left_requirement = smj_required_orderings.swap_remove(0);
-    let left_ordering = &smj.left().output_ordering().cloned().unwrap_or_default();
-    let right_ordering = &smj.right().output_ordering().cloned().unwrap_or_default();
 
+    let ordering = LexOrdering::new(sort_exprs.clone());
     let (new_left_ordering, new_right_ordering) = match push_side {
         JoinSide::Left => {
-            let left_eq_properties =
-                left_eq_properties.clone().with_reorder(sort_expr.clone());
-            if left_eq_properties
-                .ordering_satisfy_requirement(&left_requirement.unwrap_or_default())
+            let mut left_eq_properties = smj.left().equivalence_properties().clone();
+            left_eq_properties.reorder(sort_exprs)?;
+            let Some(left_requirement) = smj_required_orderings.swap_remove(0) else {
+                return Ok(None);
+            };
+            if !left_eq_properties
+                .ordering_satisfy_requirement(left_requirement.into_single())?
             {
-                // After re-ordering requirement is still satisfied
-                (sort_expr, right_ordering)
-            } else {
                 return Ok(None);
             }
+            // After re-ordering, requirement is still satisfied:
+            (ordering.as_ref(), smj.right().output_ordering())
         }
         JoinSide::Right => {
-            let right_eq_properties =
-                right_eq_properties.clone().with_reorder(sort_expr.clone());
-            if right_eq_properties
-                .ordering_satisfy_requirement(&right_requirement.unwrap_or_default())
+            let mut right_eq_properties = smj.right().equivalence_properties().clone();
+            right_eq_properties.reorder(sort_exprs)?;
+            let Some(right_requirement) = smj_required_orderings.swap_remove(1) else {
+                return Ok(None);
+            };
+            if !right_eq_properties
+                .ordering_satisfy_requirement(right_requirement.into_single())?
             {
-                // After re-ordering requirement is still satisfied
-                (left_ordering, sort_expr)
-            } else {
                 return Ok(None);
             }
+            // After re-ordering, requirement is still satisfied:
+            (smj.left().output_ordering(), ordering.as_ref())
         }
         JoinSide::None => return Ok(None),
     };
@@ -449,18 +559,19 @@ fn try_pushdown_requirements_to_join(
         new_left_ordering,
         new_right_ordering,
         join_type,
-        smj.on(),
         smj.left().schema().fields.len(),
         &smj.maintains_input_order(),
         Some(probe_side),
-    );
+    )?;
     let mut smj_eqs = smj.properties().equivalence_properties().clone();
-    // smj will have this ordering when its input changes.
-    smj_eqs = smj_eqs.with_reorder(new_output_ordering.unwrap_or_default());
-    let should_pushdown = smj_eqs.ordering_satisfy_requirement(parent_required);
+    if let Some(new_output_ordering) = new_output_ordering {
+        // smj will have this ordering when its input changes.
+        smj_eqs.reorder(new_output_ordering)?;
+    }
+    let should_pushdown = smj_eqs.ordering_satisfy_requirement(parent_required)?;
     Ok(should_pushdown.then(|| {
         let mut required_input_ordering = smj.required_input_ordering();
-        let new_req = Some(LexRequirement::from(sort_expr.clone()));
+        let new_req = ordering.map(Into::into);
         match push_side {
             JoinSide::Left => {
                 required_input_ordering[0] = new_req;
@@ -475,77 +586,78 @@ fn try_pushdown_requirements_to_join(
 }
 
 fn expr_source_side(
-    required_exprs: &LexOrdering,
+    eqp: &EquivalenceProperties,
+    mut ordering: Vec<PhysicalSortExpr>,
     join_type: JoinType,
     left_columns_len: usize,
-) -> Option<JoinSide> {
+) -> Option<(JoinSide, Vec<PhysicalSortExpr>)> {
+    // TODO: Handle the case where a prefix of the ordering comes from the left
+    //       and a suffix from the right.
     match join_type {
         JoinType::Inner
         | JoinType::Left
         | JoinType::Right
         | JoinType::Full
-        | JoinType::LeftMark => {
-            let all_column_sides = required_exprs
-                .iter()
-                .filter_map(|r| {
-                    r.expr.as_any().downcast_ref::<Column>().map(|col| {
-                        if col.index() < left_columns_len {
-                            JoinSide::Left
-                        } else {
-                            JoinSide::Right
+        | JoinType::LeftMark
+        | JoinType::RightMark => {
+            let eq_group = eqp.eq_group();
+            let mut right_ordering = ordering.clone();
+            let (mut valid_left, mut valid_right) = (true, true);
+            for (left, right) in ordering.iter_mut().zip(right_ordering.iter_mut()) {
+                let col = left.expr.as_any().downcast_ref::<Column>()?;
+                let eq_class = eq_group.get_equivalence_class(&left.expr);
+                if col.index() < left_columns_len {
+                    if valid_right {
+                        valid_right = eq_class.is_some_and(|cls| {
+                            for expr in cls.iter() {
+                                if expr
+                                    .as_any()
+                                    .downcast_ref::<Column>()
+                                    .is_some_and(|c| c.index() >= left_columns_len)
+                                {
+                                    right.expr = Arc::clone(expr);
+                                    return true;
+                                }
+                            }
+                            false
+                        });
+                    }
+                } else if valid_left {
+                    valid_left = eq_class.is_some_and(|cls| {
+                        for expr in cls.iter() {
+                            if expr
+                                .as_any()
+                                .downcast_ref::<Column>()
+                                .is_some_and(|c| c.index() < left_columns_len)
+                            {
+                                left.expr = Arc::clone(expr);
+                                return true;
+                            }
                         }
-                    })
-                })
-                .collect::<Vec<_>>();
-
-            // If the exprs are all coming from one side, the requirements can be pushed down
-            if all_column_sides.len() != required_exprs.len() {
-                None
-            } else if all_column_sides
-                .iter()
-                .all(|side| matches!(side, JoinSide::Left))
-            {
-                Some(JoinSide::Left)
-            } else if all_column_sides
-                .iter()
-                .all(|side| matches!(side, JoinSide::Right))
-            {
-                Some(JoinSide::Right)
+                        false
+                    });
+                };
+                if !(valid_left || valid_right) {
+                    return None;
+                }
+            }
+            if valid_left {
+                Some((JoinSide::Left, ordering))
+            } else if valid_right {
+                Some((JoinSide::Right, right_ordering))
             } else {
+                // TODO: Handle the case where we can push down to both sides.
                 None
             }
         }
-        JoinType::LeftSemi | JoinType::LeftAnti => required_exprs
+        JoinType::LeftSemi | JoinType::LeftAnti => ordering
             .iter()
-            .all(|e| e.expr.as_any().downcast_ref::<Column>().is_some())
-            .then_some(JoinSide::Left),
-        JoinType::RightSemi | JoinType::RightAnti => required_exprs
+            .all(|e| e.expr.as_any().is::<Column>())
+            .then_some((JoinSide::Left, ordering)),
+        JoinType::RightSemi | JoinType::RightAnti => ordering
             .iter()
-            .all(|e| e.expr.as_any().downcast_ref::<Column>().is_some())
-            .then_some(JoinSide::Right),
-    }
-}
-
-fn shift_right_required(
-    parent_required: &LexRequirement,
-    left_columns_len: usize,
-) -> Result<LexRequirement> {
-    let new_right_required = parent_required
-        .iter()
-        .filter_map(|r| {
-            let col = r.expr.as_any().downcast_ref::<Column>()?;
-            col.index().checked_sub(left_columns_len).map(|offset| {
-                r.clone()
-                    .with_expr(Arc::new(Column::new(col.name(), offset)))
-            })
-        })
-        .collect::<Vec<_>>();
-    if new_right_required.len() == parent_required.len() {
-        Ok(LexRequirement::new(new_right_required))
-    } else {
-        plan_err!(
-            "Expect to shift all the parent required column indexes for SortMergeJoin"
-        )
+            .all(|e| e.expr.as_any().is::<Column>())
+            .then_some((JoinSide::Right, ordering)),
     }
 }
 
@@ -565,16 +677,18 @@ fn shift_right_required(
 /// pushed down, `Ok(None)` if not. On error, returns a `Result::Err`.
 fn handle_custom_pushdown(
     plan: &Arc<dyn ExecutionPlan>,
-    parent_required: &LexRequirement,
-    maintains_input_order: Vec<bool>,
-) -> Result<Option<Vec<Option<LexRequirement>>>> {
-    // If there's no requirement from the parent or the plan has no children, return early
-    if parent_required.is_empty() || plan.children().is_empty() {
+    parent_required: OrderingRequirements,
+    maintains_input_order: &[bool],
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
+    // If the plan has no children, return early:
+    if plan.children().is_empty() {
         return Ok(None);
     }
 
-    // Collect all unique column indices used in the parent-required sorting expression
-    let all_indices: HashSet<usize> = parent_required
+    // Collect all unique column indices used in the parent-required sorting
+    // expression:
+    let requirement = parent_required.into_single();
+    let all_indices: HashSet<usize> = requirement
         .iter()
         .flat_map(|order| {
             collect_columns(&order.expr)
@@ -584,14 +698,14 @@ fn handle_custom_pushdown(
         })
         .collect();
 
-    // Get the number of fields in each child's schema
-    let len_of_child_schemas: Vec<usize> = plan
+    // Get the number of fields in each child's schema:
+    let children_schema_lengths: Vec<usize> = plan
         .children()
         .iter()
         .map(|c| c.schema().fields().len())
         .collect();
 
-    // Find the index of the child that maintains input order
+    // Find the index of the order-maintaining child:
     let Some(maintained_child_idx) = maintains_input_order
         .iter()
         .enumerate()
@@ -601,26 +715,28 @@ fn handle_custom_pushdown(
         return Ok(None);
     };
 
-    // Check if all required columns come from the child that maintains input order
-    let start_idx = len_of_child_schemas[..maintained_child_idx]
+    // Check if all required columns come from the order-maintaining child:
+    let start_idx = children_schema_lengths[..maintained_child_idx]
         .iter()
         .sum::<usize>();
-    let end_idx = start_idx + len_of_child_schemas[maintained_child_idx];
+    let end_idx = start_idx + children_schema_lengths[maintained_child_idx];
     let all_from_maintained_child =
         all_indices.iter().all(|i| i >= &start_idx && i < &end_idx);
 
-    // If all columns are from the maintained child, update the parent requirements
+    // If all columns are from the maintained child, update the parent requirements:
     if all_from_maintained_child {
-        let sub_offset = len_of_child_schemas
+        let sub_offset = children_schema_lengths
             .iter()
             .take(maintained_child_idx)
             .sum::<usize>();
-        // Transform the parent-required expression for the child schema by adjusting columns
-        let updated_parent_req = parent_required
-            .iter()
+        // Transform the parent-required expression for the child schema by
+        // adjusting columns:
+        let updated_parent_req = requirement
+            .into_iter()
             .map(|req| {
                 let child_schema = plan.children()[maintained_child_idx].schema();
-                let updated_columns = Arc::clone(&req.expr)
+                let updated_columns = req
+                    .expr
                     .transform_up(|expr| {
                         if let Some(col) = expr.as_any().downcast_ref::<Column>() {
                             let new_index = col.index() - sub_offset;
@@ -642,7 +758,8 @@ fn handle_custom_pushdown(
             .iter()
             .map(|&maintains_order| {
                 if maintains_order {
-                    Some(LexRequirement::new(updated_parent_req.clone()))
+                    LexRequirement::new(updated_parent_req.clone())
+                        .map(OrderingRequirements::new)
                 } else {
                     None
                 }
@@ -659,16 +776,17 @@ fn handle_custom_pushdown(
 // for join type: Inner, Right, RightSemi, RightAnti
 fn handle_hash_join(
     plan: &HashJoinExec,
-    parent_required: &LexRequirement,
-) -> Result<Option<Vec<Option<LexRequirement>>>> {
-    // If there's no requirement from the parent or the plan has no children
-    // or the join type is not Inner, Right, RightSemi, RightAnti, return early
-    if parent_required.is_empty() || !plan.maintains_input_order()[1] {
+    parent_required: OrderingRequirements,
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
+    // If the plan has no children or does not maintain the right side ordering,
+    // return early:
+    if !plan.maintains_input_order()[1] {
         return Ok(None);
     }
 
     // Collect all unique column indices used in the parent-required sorting expression
-    let all_indices: HashSet<usize> = parent_required
+    let requirement = parent_required.into_single();
+    let all_indices: HashSet<_> = requirement
         .iter()
         .flat_map(|order| {
             collect_columns(&order.expr)
@@ -679,7 +797,7 @@ fn handle_hash_join(
         .collect();
 
     let column_indices = build_join_column_index(plan);
-    let projected_indices: Vec<_> = if let Some(projection) = &plan.projection {
+    let projected_indices: Vec<_> = if let Some(projection) = plan.projection.as_ref() {
         projection.iter().map(|&i| &column_indices[i]).collect()
     } else {
         column_indices.iter().collect()
@@ -694,11 +812,12 @@ fn handle_hash_join(
     // If all columns are from the right child, update the parent requirements
     if all_from_right_child {
         // Transform the parent-required expression for the child schema by adjusting columns
-        let updated_parent_req = parent_required
-            .iter()
+        let updated_parent_req = requirement
+            .into_iter()
             .map(|req| {
                 let child_schema = plan.children()[1].schema();
-                let updated_columns = Arc::clone(&req.expr)
+                let updated_columns = req
+                    .expr
                     .transform_up(|expr| {
                         if let Some(col) = expr.as_any().downcast_ref::<Column>() {
                             let index = projected_indices[col.index()].index;
@@ -718,7 +837,7 @@ fn handle_hash_join(
         // Populating with the updated requirements for children that maintain order
         Ok(Some(vec![
             None,
-            Some(LexRequirement::new(updated_parent_req)),
+            LexRequirement::new(updated_parent_req).map(OrderingRequirements::new),
         ]))
     } else {
         Ok(None)
@@ -757,7 +876,7 @@ enum RequirementsCompatibility {
     /// Requirements satisfy
     Satisfy,
     /// Requirements compatible
-    Compatible(Option<LexRequirement>),
+    Compatible(Option<OrderingRequirements>),
     /// Requirements not compatible
     NonCompatible,
 }
diff --git a/datafusion/physical-optimizer/src/ensure_coop.rs b/datafusion/physical-optimizer/src/ensure_coop.rs
new file mode 100644
index 0000000000000..6925c036e466d
--- /dev/null
+++ b/datafusion/physical-optimizer/src/ensure_coop.rs
@@ -0,0 +1,420 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! The [`EnsureCooperative`] optimizer rule inspects the physical plan to find all
+//! portions of the plan that will not yield cooperatively.
+//! It will insert `CooperativeExec` nodes where appropriate to ensure execution plans
+//! always yield cooperatively.
+
+use std::fmt::{Debug, Formatter};
+use std::sync::Arc;
+
+use crate::PhysicalOptimizerRule;
+
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::coop::CooperativeExec;
+use datafusion_physical_plan::execution_plan::{EvaluationType, SchedulingType};
+
+/// `EnsureCooperative` is a [`PhysicalOptimizerRule`] that inspects the physical plan for
+/// sub plans that do not participate in cooperative scheduling. The plan is subdivided into sub
+/// plans on eager evaluation boundaries. Leaf nodes and eager evaluation roots are checked
+/// to see if they participate in cooperative scheduling. Those that do no are wrapped in
+/// a [`CooperativeExec`] parent.
+pub struct EnsureCooperative {}
+
+impl EnsureCooperative {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl Default for EnsureCooperative {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Debug for EnsureCooperative {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct(self.name()).finish()
+    }
+}
+
+impl PhysicalOptimizerRule for EnsureCooperative {
+    fn name(&self) -> &str {
+        "EnsureCooperative"
+    }
+
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        use std::cell::RefCell;
+
+        let ancestry_stack = RefCell::new(Vec::<(SchedulingType, EvaluationType)>::new());
+
+        plan.transform_down_up(
+            // Down phase: Push parent properties <SchedulingType, EvaluationType> into the stack
+            |plan| {
+                let props = plan.properties();
+                ancestry_stack
+                    .borrow_mut()
+                    .push((props.scheduling_type, props.evaluation_type));
+                Ok(Transformed::no(plan))
+            },
+            // Up phase: Wrap nodes with CooperativeExec if needed
+            |plan| {
+                ancestry_stack.borrow_mut().pop();
+
+                let props = plan.properties();
+                let is_cooperative = props.scheduling_type == SchedulingType::Cooperative;
+                let is_leaf = plan.children().is_empty();
+                let is_exchange = props.evaluation_type == EvaluationType::Eager;
+
+                let mut is_under_cooperative_context = false;
+                for (scheduling_type, evaluation_type) in
+                    ancestry_stack.borrow().iter().rev()
+                {
+                    // If nearest ancestor is cooperative, we are under a cooperative context
+                    if *scheduling_type == SchedulingType::Cooperative {
+                        is_under_cooperative_context = true;
+                        break;
+                    // If nearest ancestor is eager, the cooperative context will be reset
+                    } else if *evaluation_type == EvaluationType::Eager {
+                        is_under_cooperative_context = false;
+                        break;
+                    }
+                }
+
+                // Wrap if:
+                // 1. Node is a leaf or exchange point
+                // 2. Node is not already cooperative
+                // 3. Not under any Cooperative context
+                if (is_leaf || is_exchange)
+                    && !is_cooperative
+                    && !is_under_cooperative_context
+                {
+                    return Ok(Transformed::yes(Arc::new(CooperativeExec::new(plan))));
+                }
+
+                Ok(Transformed::no(plan))
+            },
+        )
+        .map(|t| t.data)
+    }
+
+    fn schema_check(&self) -> bool {
+        // Wrapping a leaf in YieldStreamExec preserves the schema, so it is safe.
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_physical_plan::{displayable, test::scan_partitioned};
+    use insta::assert_snapshot;
+
+    #[tokio::test]
+    async fn test_cooperative_exec_for_custom_exec() {
+        let test_custom_exec = scan_partitioned(1);
+        let config = ConfigOptions::new();
+        let optimized = EnsureCooperative::new()
+            .optimize(test_custom_exec, &config)
+            .unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        // Use insta snapshot to ensure full plan structure
+        assert_snapshot!(display, @r"
+        CooperativeExec
+          DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+
+    #[tokio::test]
+    async fn test_optimizer_is_idempotent() {
+        // Comprehensive idempotency test: verify f(f(...f(x))) = f(x)
+        // This test covers:
+        // 1. Multiple runs on unwrapped plan
+        // 2. Multiple runs on already-wrapped plan
+        // 3. No accumulation of CooperativeExec nodes
+
+        let config = ConfigOptions::new();
+        let rule = EnsureCooperative::new();
+
+        // Test 1: Start with unwrapped plan, run multiple times
+        let unwrapped_plan = scan_partitioned(1);
+        let mut current = unwrapped_plan;
+        let mut stable_result = String::new();
+
+        for run in 1..=5 {
+            current = rule.optimize(current, &config).unwrap();
+            let display = displayable(current.as_ref()).indent(true).to_string();
+
+            if run == 1 {
+                stable_result = display.clone();
+                assert_eq!(display.matches("CooperativeExec").count(), 1);
+            } else {
+                assert_eq!(
+                    display, stable_result,
+                    "Run {run} should match run 1 (idempotent)"
+                );
+                assert_eq!(
+                    display.matches("CooperativeExec").count(),
+                    1,
+                    "Should always have exactly 1 CooperativeExec, not accumulate"
+                );
+            }
+        }
+
+        // Test 2: Start with already-wrapped plan, verify no double wrapping
+        let pre_wrapped = Arc::new(CooperativeExec::new(scan_partitioned(1)));
+        let result = rule.optimize(pre_wrapped, &config).unwrap();
+        let display = displayable(result.as_ref()).indent(true).to_string();
+
+        assert_eq!(
+            display.matches("CooperativeExec").count(),
+            1,
+            "Should not double-wrap already cooperative plans"
+        );
+        assert_eq!(
+            display, stable_result,
+            "Pre-wrapped plan should produce same result as unwrapped after optimization"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_selective_wrapping() {
+        // Test that wrapping is selective: only leaf/eager nodes, not intermediate nodes
+        // Also verify depth tracking prevents double wrapping in subtrees
+        use datafusion_physical_expr::expressions::lit;
+        use datafusion_physical_plan::filter::FilterExec;
+
+        let config = ConfigOptions::new();
+        let rule = EnsureCooperative::new();
+
+        // Case 1: Filter -> Scan (middle node should not be wrapped)
+        let scan = scan_partitioned(1);
+        let filter = Arc::new(FilterExec::try_new(lit(true), scan).unwrap());
+        let optimized = rule.optimize(filter, &config).unwrap();
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+
+        assert_eq!(display.matches("CooperativeExec").count(), 1);
+        assert!(display.contains("FilterExec"));
+
+        // Case 2: Filter -> CoopExec -> Scan (depth tracking prevents double wrap)
+        let scan2 = scan_partitioned(1);
+        let wrapped_scan = Arc::new(CooperativeExec::new(scan2));
+        let filter2 = Arc::new(FilterExec::try_new(lit(true), wrapped_scan).unwrap());
+        let optimized2 = rule.optimize(filter2, &config).unwrap();
+        let display2 = displayable(optimized2.as_ref()).indent(true).to_string();
+
+        assert_eq!(display2.matches("CooperativeExec").count(), 1);
+    }
+
+    #[tokio::test]
+    async fn test_multiple_leaf_nodes() {
+        // When there are multiple leaf nodes, each should be wrapped separately
+        use datafusion_physical_plan::union::UnionExec;
+
+        let scan1 = scan_partitioned(1);
+        let scan2 = scan_partitioned(1);
+        let union = UnionExec::try_new(vec![scan1, scan2]).unwrap();
+
+        let config = ConfigOptions::new();
+        let optimized = EnsureCooperative::new()
+            .optimize(union as Arc<dyn ExecutionPlan>, &config)
+            .unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+
+        // Each leaf should have its own CooperativeExec
+        assert_eq!(
+            display.matches("CooperativeExec").count(),
+            2,
+            "Each leaf node should be wrapped separately"
+        );
+        assert_eq!(
+            display.matches("DataSourceExec").count(),
+            2,
+            "Both data sources should be present"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_eager_evaluation_resets_cooperative_context() {
+        // Test that cooperative context is reset when encountering an eager evaluation boundary.
+        use arrow::datatypes::Schema;
+        use datafusion_common::tree_node::TreeNodeRecursion;
+        use datafusion_common::{Result, internal_err};
+        use datafusion_execution::TaskContext;
+        use datafusion_physical_expr::EquivalenceProperties;
+        use datafusion_physical_plan::{
+            DisplayAs, DisplayFormatType, Partitioning, PhysicalExpr, PlanProperties,
+            SendableRecordBatchStream,
+            execution_plan::{Boundedness, EmissionType},
+        };
+        use std::any::Any;
+        use std::fmt::Formatter;
+
+        #[derive(Debug)]
+        struct DummyExec {
+            name: String,
+            input: Arc<dyn ExecutionPlan>,
+            scheduling_type: SchedulingType,
+            evaluation_type: EvaluationType,
+            properties: Arc<PlanProperties>,
+        }
+
+        impl DummyExec {
+            fn new(
+                name: &str,
+                input: Arc<dyn ExecutionPlan>,
+                scheduling_type: SchedulingType,
+                evaluation_type: EvaluationType,
+            ) -> Self {
+                let properties = PlanProperties::new(
+                    EquivalenceProperties::new(Arc::new(Schema::empty())),
+                    Partitioning::UnknownPartitioning(1),
+                    EmissionType::Incremental,
+                    Boundedness::Bounded,
+                )
+                .with_scheduling_type(scheduling_type)
+                .with_evaluation_type(evaluation_type);
+
+                Self {
+                    name: name.to_string(),
+                    input,
+                    scheduling_type,
+                    evaluation_type,
+                    properties: Arc::new(properties),
+                }
+            }
+        }
+
+        impl DisplayAs for DummyExec {
+            fn fmt_as(
+                &self,
+                _: DisplayFormatType,
+                f: &mut Formatter,
+            ) -> std::fmt::Result {
+                write!(f, "{}", self.name)
+            }
+        }
+
+        impl ExecutionPlan for DummyExec {
+            fn name(&self) -> &str {
+                &self.name
+            }
+            fn as_any(&self) -> &dyn Any {
+                self
+            }
+            fn properties(&self) -> &Arc<PlanProperties> {
+                &self.properties
+            }
+            fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+                vec![&self.input]
+            }
+            fn with_new_children(
+                self: Arc<Self>,
+                children: Vec<Arc<dyn ExecutionPlan>>,
+            ) -> Result<Arc<dyn ExecutionPlan>> {
+                Ok(Arc::new(DummyExec::new(
+                    &self.name,
+                    Arc::clone(&children[0]),
+                    self.scheduling_type,
+                    self.evaluation_type,
+                )))
+            }
+            fn execute(
+                &self,
+                _: usize,
+                _: Arc<TaskContext>,
+            ) -> Result<SendableRecordBatchStream> {
+                internal_err!("DummyExec does not support execution")
+            }
+
+            fn apply_expressions(
+                &self,
+                _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+            ) -> Result<TreeNodeRecursion> {
+                Ok(TreeNodeRecursion::Continue)
+            }
+        }
+
+        // Build a plan similar to the original test:
+        // scan -> exch1(NonCoop,Eager) -> CoopExec -> filter -> exch2(Coop,Eager) -> filter
+        let scan = scan_partitioned(1);
+        let exch1 = Arc::new(DummyExec::new(
+            "exch1",
+            scan,
+            SchedulingType::NonCooperative,
+            EvaluationType::Eager,
+        ));
+        let coop = Arc::new(CooperativeExec::new(exch1));
+        let filter1 = Arc::new(DummyExec::new(
+            "filter1",
+            coop,
+            SchedulingType::NonCooperative,
+            EvaluationType::Lazy,
+        ));
+        let exch2 = Arc::new(DummyExec::new(
+            "exch2",
+            filter1,
+            SchedulingType::Cooperative,
+            EvaluationType::Eager,
+        ));
+        let filter2 = Arc::new(DummyExec::new(
+            "filter2",
+            exch2,
+            SchedulingType::NonCooperative,
+            EvaluationType::Lazy,
+        ));
+
+        let config = ConfigOptions::new();
+        let optimized = EnsureCooperative::new().optimize(filter2, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+
+        // Expected wrapping:
+        // - Scan (leaf) gets wrapped
+        // - exch1 (eager+noncoop) keeps its manual CooperativeExec wrapper
+        // - filter1 is protected by exch2's cooperative context, no extra wrap
+        // - exch2 (already Cooperative) does NOT get wrapped
+        // - filter2 (not leaf or eager) does NOT get wrapped
+        assert_eq!(
+            display.matches("CooperativeExec").count(),
+            2,
+            "Should have 2 CooperativeExec: one wrapping scan, one wrapping exch1"
+        );
+
+        assert_snapshot!(display, @r"
+        filter2
+          exch2
+            filter1
+              CooperativeExec
+                exch1
+                  CooperativeExec
+                    DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+}
diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs
index 5b2d47106b8dc..28f8155002a50 100644
--- a/datafusion/physical-optimizer/src/filter_pushdown.rs
+++ b/datafusion/physical-optimizer/src/filter_pushdown.rs
@@ -15,20 +15,39 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Filter Pushdown Optimization Process
+//!
+//! The filter pushdown mechanism involves four key steps:
+//! 1. **Optimizer Asks Parent for a Filter Pushdown Plan**: The optimizer calls [`ExecutionPlan::gather_filters_for_pushdown`]
+//!    on the parent node, passing in parent predicates and phase. The parent node creates a [`FilterDescription`]
+//!    by inspecting its logic and children's schemas, determining which filters can be pushed to each child.
+//! 2. **Optimizer Executes Pushdown**: The optimizer recursively calls `push_down_filters` in this module on each child,
+//!    passing the appropriate filters (`Vec<Arc<dyn PhysicalExpr>>`) for that child.
+//! 3. **Optimizer Gathers Results**: The optimizer collects [`FilterPushdownPropagation`] results from children,
+//!    containing information about which filters were successfully pushed down vs. unsupported.
+//! 4. **Parent Responds**: The optimizer calls [`ExecutionPlan::handle_child_pushdown_result`] on the parent,
+//!    passing a [`ChildPushdownResult`] containing the aggregated pushdown outcomes. The parent decides
+//!    how to handle filters that couldn't be pushed down (e.g., keep them as FilterExec nodes).
+//!
+//! [`FilterDescription`]: datafusion_physical_plan::filter_pushdown::FilterDescription
+
 use std::sync::Arc;
 
 use crate::PhysicalOptimizerRule;
 
-use datafusion_common::{config::ConfigOptions, Result};
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_common::{Result, assert_eq_or_internal_err, config::ConfigOptions};
 use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::physical_expr::is_volatile;
 use datafusion_physical_plan::filter_pushdown::{
-    ChildPushdownResult, FilterPushdownPropagation, PredicateSupport, PredicateSupports,
+    ChildFilterPushdownResult, ChildPushdownResult, FilterPushdownPhase,
+    FilterPushdownPropagation, PushedDown,
 };
-use datafusion_physical_plan::{with_new_children_if_necessary, ExecutionPlan};
+use datafusion_physical_plan::{ExecutionPlan, with_new_children_if_necessary};
 
-use itertools::izip;
+use itertools::{Itertools, izip};
 
-/// Attempts to recursively push given filters from the top of the tree into leafs.
+/// Attempts to recursively push given filters from the top of the tree into leaves.
 ///
 /// # Default Implementation
 ///
@@ -362,11 +381,31 @@ use itertools::izip;
 /// [`ProjectionExec`]: datafusion_physical_plan::projection::ProjectionExec
 /// [`AggregateExec`]: datafusion_physical_plan::aggregates::AggregateExec
 #[derive(Debug)]
-pub struct FilterPushdown {}
+pub struct FilterPushdown {
+    phase: FilterPushdownPhase,
+    name: String,
+}
 
 impl FilterPushdown {
+    fn new_with_phase(phase: FilterPushdownPhase) -> Self {
+        let name = match phase {
+            FilterPushdownPhase::Pre => "FilterPushdown",
+            FilterPushdownPhase::Post => "FilterPushdown(Post)",
+        }
+        .to_string();
+        Self { phase, name }
+    }
+
+    /// Create a new [`FilterPushdown`] optimizer rule that runs in the pre-optimization phase.
+    /// See [`FilterPushdownPhase`] for more details.
     pub fn new() -> Self {
-        Self {}
+        Self::new_with_phase(FilterPushdownPhase::Pre)
+    }
+
+    /// Create a new [`FilterPushdown`] optimizer rule that runs in the post-optimization phase.
+    /// See [`FilterPushdownPhase`] for more details.
+    pub fn new_post_optimization() -> Self {
+        Self::new_with_phase(FilterPushdownPhase::Post)
     }
 }
 
@@ -382,13 +421,15 @@ impl PhysicalOptimizerRule for FilterPushdown {
         plan: Arc<dyn ExecutionPlan>,
         config: &ConfigOptions,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(push_down_filters(Arc::clone(&plan), vec![], config)?
-            .updated_node
-            .unwrap_or(plan))
+        Ok(
+            push_down_filters(&Arc::clone(&plan), vec![], config, self.phase)?
+                .updated_node
+                .unwrap_or(plan),
+        )
     }
 
     fn name(&self) -> &str {
-        "FilterPushdown"
+        &self.name
     }
 
     fn schema_check(&self) -> bool {
@@ -396,71 +437,77 @@ impl PhysicalOptimizerRule for FilterPushdown {
     }
 }
 
-/// Support state of each predicate for the children of the node.
-/// These predicates are coming from the parent node.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum ParentPredicateStates {
-    NoChildren,
-    Unsupported,
-    Supported,
-}
-
 fn push_down_filters(
-    node: Arc<dyn ExecutionPlan>,
+    node: &Arc<dyn ExecutionPlan>,
     parent_predicates: Vec<Arc<dyn PhysicalExpr>>,
     config: &ConfigOptions,
+    phase: FilterPushdownPhase,
 ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
-    // If the node has any child, these will be rewritten as supported or unsupported
-    let mut parent_predicates_pushdown_states =
-        vec![ParentPredicateStates::NoChildren; parent_predicates.len()];
+    let mut parent_filter_pushdown_supports: Vec<Vec<PushedDown>> =
+        vec![vec![]; parent_predicates.len()];
     let mut self_filters_pushdown_supports = vec![];
     let mut new_children = Vec::with_capacity(node.children().len());
 
     let children = node.children();
-    let filter_description =
-        node.gather_filters_for_pushdown(parent_predicates.clone(), config)?;
 
-    for (child, parent_filters, self_filters) in izip!(
+    // Filter out expressions that are not allowed for pushdown
+    let parent_filtered = FilteredVec::new(&parent_predicates, allow_pushdown_for_expr);
+
+    let filter_description = node.gather_filters_for_pushdown(
+        phase,
+        parent_filtered.items().to_vec(),
+        config,
+    )?;
+
+    let filter_description_parent_filters = filter_description.parent_filters();
+    let filter_description_self_filters = filter_description.self_filters();
+    assert_eq_or_internal_err!(
+        filter_description_parent_filters.len(),
+        children.len(),
+        "Filter pushdown expected parent filters count to match number of children for node {}",
+        node.name()
+    );
+    assert_eq_or_internal_err!(
+        filter_description_self_filters.len(),
+        children.len(),
+        "Filter pushdown expected self filters count to match number of children for node {}",
+        node.name()
+    );
+
+    for (child_idx, (child, parent_filters, self_filters)) in izip!(
         children,
         filter_description.parent_filters(),
         filter_description.self_filters()
-    ) {
+    )
+    .enumerate()
+    {
         // Here, `parent_filters` are the predicates which are provided by the parent node of
         // the current node, and tried to be pushed down over the child which the loop points
         // currently. `self_filters` are the predicates which are provided by the current node,
         // and tried to be pushed down over the child similarly.
 
-        let num_self_filters = self_filters.len();
-        let mut parent_supported_predicate_indices = vec![];
-        let mut all_predicates = self_filters;
-
-        // Iterate over each predicate coming from the parent
-        for (idx, filter) in parent_filters.into_iter().enumerate() {
-            // Check if we can push this filter down to our child.
-            // These supports are defined in `gather_filters_for_pushdown()`
-            match filter {
-                PredicateSupport::Supported(predicate) => {
-                    // Queue this filter up for pushdown to this child
-                    all_predicates.push(predicate);
-                    parent_supported_predicate_indices.push(idx);
-                    // Mark this filter as supported by our children if no child has marked it as unsupported
-                    if parent_predicates_pushdown_states[idx]
-                        != ParentPredicateStates::Unsupported
-                    {
-                        parent_predicates_pushdown_states[idx] =
-                            ParentPredicateStates::Supported;
-                    }
-                }
-                PredicateSupport::Unsupported(_) => {
-                    // Mark as unsupported by our children
-                    parent_predicates_pushdown_states[idx] =
-                        ParentPredicateStates::Unsupported;
-                }
-            }
+        // Filter out self_filters that contain volatile expressions and track indices
+        let self_filtered = FilteredVec::new(&self_filters, allow_pushdown_for_expr);
+
+        let num_self_filters = self_filtered.len();
+        let mut all_predicates = self_filtered.items().to_vec();
+
+        // Apply second filter pass: collect indices of parent filters that can be pushed down
+        let parent_filters_for_child = parent_filtered
+            .chain_filter_slice(&parent_filters, |filter| {
+                matches!(filter.discriminant, PushedDown::Yes)
+            });
+
+        // Add the filtered parent predicates to all_predicates
+        for filter in parent_filters_for_child.items() {
+            all_predicates.push(Arc::clone(&filter.predicate));
         }
 
+        let num_parent_filters = all_predicates.len() - num_self_filters;
+
         // Any filters that could not be pushed down to a child are marked as not-supported to our parents
-        let result = push_down_filters(Arc::clone(child), all_predicates, config)?;
+        let result =
+            push_down_filters(&Arc::clone(child), all_predicates, config, phase)?;
 
         if let Some(new_child) = result.updated_node {
             // If we have a filter pushdown result, we need to update our children
@@ -473,69 +520,342 @@ fn push_down_filters(
         // Our child doesn't know the difference between filters that were passed down
         // from our parents and filters that the current node injected. We need to de-entangle
         // this since we do need to distinguish between them.
-        let mut all_filters = result.filters.into_inner();
-        let parent_predicates = all_filters.split_off(num_self_filters);
-        let self_predicates = all_filters;
-        self_filters_pushdown_supports.push(PredicateSupports::new(self_predicates));
-
-        for (idx, result) in parent_supported_predicate_indices
-            .iter()
-            .zip(parent_predicates)
-        {
-            let current_node_state = match result {
-                PredicateSupport::Supported(_) => ParentPredicateStates::Supported,
-                PredicateSupport::Unsupported(_) => ParentPredicateStates::Unsupported,
-            };
-            match (current_node_state, parent_predicates_pushdown_states[*idx]) {
-                (r, ParentPredicateStates::NoChildren) => {
-                    // If we have no result, use the current state from this child
-                    parent_predicates_pushdown_states[*idx] = r;
-                }
-                (ParentPredicateStates::Supported, ParentPredicateStates::Supported) => {
-                    // If the current child and all previous children are supported,
-                    // the filter continues to support it
-                    parent_predicates_pushdown_states[*idx] =
-                        ParentPredicateStates::Supported;
-                }
-                _ => {
-                    // Either the current child or a previous child marked this filter as unsupported
-                    parent_predicates_pushdown_states[*idx] =
-                        ParentPredicateStates::Unsupported;
-                }
-            }
+        let mut all_filters = result.filters.into_iter().collect_vec();
+        assert_eq_or_internal_err!(
+            all_filters.len(),
+            num_self_filters + num_parent_filters,
+            "Filter pushdown did not return the expected number of filters from {}",
+            child.name()
+        );
+        let parent_filters = all_filters
+            .split_off(num_self_filters)
+            .into_iter()
+            .collect_vec();
+        // Map the results from filtered self filters back to their original positions using FilteredVec
+        let mapped_self_results =
+            self_filtered.map_results_to_original(all_filters, PushedDown::No);
+
+        // Wrap each result with its corresponding expression
+        let self_filter_results: Vec<_> = mapped_self_results
+            .into_iter()
+            .zip(self_filters)
+            .map(|(support, filter)| support.wrap_expression(filter))
+            .collect();
+
+        self_filters_pushdown_supports.push(self_filter_results);
+
+        // Start by marking all parent filters as unsupported for this child
+        for parent_filter_pushdown_support in parent_filter_pushdown_supports.iter_mut() {
+            parent_filter_pushdown_support.push(PushedDown::No);
+            assert_eq!(
+                parent_filter_pushdown_support.len(),
+                child_idx + 1,
+                "Parent filter pushdown supports should have the same length as the number of children"
+            );
+        }
+        // Map results from pushed-down filters back to original parent filter indices
+        let mapped_parent_results = parent_filters_for_child
+            .map_results_to_original(parent_filters, PushedDown::No);
+
+        // Update parent_filter_pushdown_supports with the mapped results
+        // mapped_parent_results already has the results at their original indices
+        for (idx, support) in parent_filter_pushdown_supports.iter_mut().enumerate() {
+            support[child_idx] = mapped_parent_results[idx];
         }
     }
+
     // Re-create this node with new children
-    let updated_node = with_new_children_if_necessary(Arc::clone(&node), new_children)?;
-    // Remap the result onto the parent filters as they were given to us.
-    // Any filters that were not pushed down to any children are marked as unsupported.
-    let parent_pushdown_result = PredicateSupports::new(
-        parent_predicates_pushdown_states
-            .into_iter()
-            .zip(parent_predicates)
-            .map(|(state, filter)| match state {
-                ParentPredicateStates::NoChildren => {
-                    PredicateSupport::Unsupported(filter)
-                }
-                ParentPredicateStates::Unsupported => {
-                    PredicateSupport::Unsupported(filter)
-                }
-                ParentPredicateStates::Supported => PredicateSupport::Supported(filter),
-            })
-            .collect(),
-    );
-    // Check what the current node wants to do given the result of pushdown to it's children
+    let updated_node = with_new_children_if_necessary(Arc::clone(node), new_children)?;
+
+    // TODO: by calling `handle_child_pushdown_result` we are assuming that the
+    // `ExecutionPlan` implementation will not change the plan itself.
+    // Should we have a separate method for dynamic pushdown that does not allow modifying the plan?
     let mut res = updated_node.handle_child_pushdown_result(
+        phase,
         ChildPushdownResult {
-            parent_filters: parent_pushdown_result,
+            parent_filters: parent_predicates
+                .into_iter()
+                .enumerate()
+                .map(
+                    |(parent_filter_idx, parent_filter)| ChildFilterPushdownResult {
+                        filter: parent_filter,
+                        child_results: parent_filter_pushdown_supports[parent_filter_idx]
+                            .clone(),
+                    },
+                )
+                .collect(),
             self_filters: self_filters_pushdown_supports,
         },
         config,
     )?;
     // Compare pointers for new_node and node, if they are different we must replace
     // ourselves because of changes in our children.
-    if res.updated_node.is_none() && !Arc::ptr_eq(&updated_node, &node) {
+    if res.updated_node.is_none() && !Arc::ptr_eq(&updated_node, node) {
         res.updated_node = Some(updated_node)
     }
     Ok(res)
 }
+
+/// A helper structure for filtering elements from a vector through multiple passes while
+/// tracking their original indices, allowing results to be mapped back to the original positions.
+struct FilteredVec<T> {
+    items: Vec<T>,
+    // Chain of index mappings: each Vec maps from current level to previous level
+    // index_mappings[0] maps from first filter to original indices
+    // index_mappings[1] maps from second filter to first filter indices, etc.
+    index_mappings: Vec<Vec<usize>>,
+    original_len: usize,
+}
+
+impl<T: Clone> FilteredVec<T> {
+    /// Creates a new FilteredVec by filtering items based on the given predicate
+    fn new<F>(items: &[T], predicate: F) -> Self
+    where
+        F: Fn(&T) -> bool,
+    {
+        let mut filtered_items = Vec::new();
+        let mut original_indices = Vec::new();
+
+        for (idx, item) in items.iter().enumerate() {
+            if predicate(item) {
+                filtered_items.push(item.clone());
+                original_indices.push(idx);
+            }
+        }
+
+        Self {
+            items: filtered_items,
+            index_mappings: vec![original_indices],
+            original_len: items.len(),
+        }
+    }
+
+    /// Returns a reference to the filtered items
+    fn items(&self) -> &[T] {
+        &self.items
+    }
+
+    /// Returns the number of filtered items
+    fn len(&self) -> usize {
+        self.items.len()
+    }
+
+    /// Maps results from the filtered items back to their original positions
+    /// Returns a vector with the same length as the original input, filled with default_value
+    /// and updated with results at their original positions
+    fn map_results_to_original<R: Clone>(
+        &self,
+        results: Vec<R>,
+        default_value: R,
+    ) -> Vec<R> {
+        let mut mapped_results = vec![default_value; self.original_len];
+
+        for (result_idx, result) in results.into_iter().enumerate() {
+            let original_idx = self.trace_to_original_index(result_idx);
+            mapped_results[original_idx] = result;
+        }
+
+        mapped_results
+    }
+
+    /// Traces a filtered index back to its original index through all filter passes
+    fn trace_to_original_index(&self, mut current_idx: usize) -> usize {
+        // Work backwards through the chain of index mappings
+        for mapping in self.index_mappings.iter().rev() {
+            current_idx = mapping[current_idx];
+        }
+        current_idx
+    }
+
+    /// Apply a filter to a new set of items while chaining the index mapping from self (parent)
+    /// This is useful when you have filtered items and then get a transformed slice
+    /// (e.g., from gather_filters_for_pushdown) that you need to filter again
+    fn chain_filter_slice<U: Clone, F>(&self, items: &[U], predicate: F) -> FilteredVec<U>
+    where
+        F: Fn(&U) -> bool,
+    {
+        let mut filtered_items = Vec::new();
+        let mut filtered_indices = Vec::new();
+
+        for (idx, item) in items.iter().enumerate() {
+            if predicate(item) {
+                filtered_items.push(item.clone());
+                filtered_indices.push(idx);
+            }
+        }
+
+        // Chain the index mappings from parent (self)
+        let mut index_mappings = self.index_mappings.clone();
+        index_mappings.push(filtered_indices);
+
+        FilteredVec {
+            items: filtered_items,
+            index_mappings,
+            original_len: self.original_len,
+        }
+    }
+}
+
+fn allow_pushdown_for_expr(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    let mut allow_pushdown = true;
+    expr.apply(|e| {
+        allow_pushdown = allow_pushdown && !is_volatile(e);
+        if allow_pushdown {
+            Ok(TreeNodeRecursion::Continue)
+        } else {
+            Ok(TreeNodeRecursion::Stop)
+        }
+    })
+    .expect("Infallible traversal of PhysicalExpr tree failed");
+    allow_pushdown
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_filtered_vec_single_pass() {
+        let items = vec![1, 2, 3, 4, 5, 6];
+        let filtered = FilteredVec::new(&items, |&x| x % 2 == 0);
+
+        // Check filtered items
+        assert_eq!(filtered.items(), &[2, 4, 6]);
+        assert_eq!(filtered.len(), 3);
+
+        // Check index mapping
+        let results = vec!["a", "b", "c"];
+        let mapped = filtered.map_results_to_original(results, "default");
+        assert_eq!(mapped, vec!["default", "a", "default", "b", "default", "c"]);
+    }
+
+    #[test]
+    fn test_filtered_vec_empty_filter() {
+        let items = vec![1, 3, 5];
+        let filtered = FilteredVec::new(&items, |&x| x % 2 == 0);
+
+        assert_eq!(filtered.items(), &[] as &[i32]);
+        assert_eq!(filtered.len(), 0);
+
+        let results: Vec<&str> = vec![];
+        let mapped = filtered.map_results_to_original(results, "default");
+        assert_eq!(mapped, vec!["default", "default", "default"]);
+    }
+
+    #[test]
+    fn test_filtered_vec_all_pass() {
+        let items = vec![2, 4, 6];
+        let filtered = FilteredVec::new(&items, |&x| x % 2 == 0);
+
+        assert_eq!(filtered.items(), &[2, 4, 6]);
+        assert_eq!(filtered.len(), 3);
+
+        let results = vec!["a", "b", "c"];
+        let mapped = filtered.map_results_to_original(results, "default");
+        assert_eq!(mapped, vec!["a", "b", "c"]);
+    }
+
+    #[test]
+    fn test_chain_filter_slice_different_types() {
+        // First pass: filter numbers
+        let numbers = vec![1, 2, 3, 4, 5, 6];
+        let first_pass = FilteredVec::new(&numbers, |&x| x > 3);
+        assert_eq!(first_pass.items(), &[4, 5, 6]);
+
+        // Transform to strings (simulating gather_filters_for_pushdown transformation)
+        let strings = vec!["four", "five", "six"];
+
+        // Second pass: filter strings that contain 'i'
+        let second_pass = first_pass.chain_filter_slice(&strings, |s| s.contains('i'));
+        assert_eq!(second_pass.items(), &["five", "six"]);
+
+        // Map results back to original indices
+        let results = vec![100, 200];
+        let mapped = second_pass.map_results_to_original(results, 0);
+        // "five" was at index 4 (1-based: 5), "six" was at index 5 (1-based: 6)
+        assert_eq!(mapped, vec![0, 0, 0, 0, 100, 200]);
+    }
+
+    #[test]
+    fn test_chain_filter_slice_complex_scenario() {
+        // Simulating the filter pushdown scenario
+        // Parent predicates: [A, B, C, D, E]
+        let parent_predicates = vec!["A", "B", "C", "D", "E"];
+
+        // First pass: filter out some predicates (simulating allow_pushdown_for_expr)
+        let first_pass = FilteredVec::new(&parent_predicates, |s| *s != "B" && *s != "D");
+        assert_eq!(first_pass.items(), &["A", "C", "E"]);
+
+        // After gather_filters_for_pushdown, we get transformed results for a specific child
+        // Let's say child gets [A_transformed, C_transformed, E_transformed]
+        // but only C and E can be pushed down
+        #[derive(Clone, Debug, PartialEq)]
+        struct TransformedPredicate {
+            name: String,
+            can_push: bool,
+        }
+
+        let child_predicates = vec![
+            TransformedPredicate {
+                name: "A_transformed".to_string(),
+                can_push: false,
+            },
+            TransformedPredicate {
+                name: "C_transformed".to_string(),
+                can_push: true,
+            },
+            TransformedPredicate {
+                name: "E_transformed".to_string(),
+                can_push: true,
+            },
+        ];
+
+        // Second pass: filter based on can_push
+        let second_pass =
+            first_pass.chain_filter_slice(&child_predicates, |p| p.can_push);
+        assert_eq!(second_pass.len(), 2);
+        assert_eq!(second_pass.items()[0].name, "C_transformed");
+        assert_eq!(second_pass.items()[1].name, "E_transformed");
+
+        // Simulate getting results back from child
+        let child_results = vec!["C_result", "E_result"];
+        let mapped = second_pass.map_results_to_original(child_results, "no_result");
+
+        // Results should be at original positions: C was at index 2, E was at index 4
+        assert_eq!(
+            mapped,
+            vec![
+                "no_result",
+                "no_result",
+                "C_result",
+                "no_result",
+                "E_result"
+            ]
+        );
+    }
+
+    #[test]
+    fn test_trace_to_original_index() {
+        let items = vec![10, 20, 30, 40, 50];
+        let filtered = FilteredVec::new(&items, |&x| x != 20 && x != 40);
+
+        // filtered items are [10, 30, 50] at original indices [0, 2, 4]
+        assert_eq!(filtered.trace_to_original_index(0), 0); // 10 was at index 0
+        assert_eq!(filtered.trace_to_original_index(1), 2); // 30 was at index 2
+        assert_eq!(filtered.trace_to_original_index(2), 4); // 50 was at index 4
+    }
+
+    #[test]
+    fn test_chain_filter_preserves_original_len() {
+        let items = vec![1, 2, 3, 4, 5];
+        let first = FilteredVec::new(&items, |&x| x > 2);
+
+        let strings = vec!["three", "four", "five"];
+        let second = first.chain_filter_slice(&strings, |s| s.len() == 4);
+
+        // Original length should still be 5
+        let results = vec!["x", "y"];
+        let mapped = second.map_results_to_original(results, "-");
+        assert_eq!(mapped.len(), 5);
+    }
+}
diff --git a/datafusion/physical-optimizer/src/hash_join_buffering.rs b/datafusion/physical-optimizer/src/hash_join_buffering.rs
new file mode 100644
index 0000000000000..3c29b46c0fa64
--- /dev/null
+++ b/datafusion/physical-optimizer/src/hash_join_buffering.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::JoinSide;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::buffer::BufferExec;
+use datafusion_physical_plan::joins::HashJoinExec;
+use std::sync::Arc;
+
+/// Looks for all the [HashJoinExec]s in the plan and places a [BufferExec] node with the
+/// configured capacity in the probe side:
+///
+/// ```text
+///            ┌───────────────────┐
+///            │   HashJoinExec    │
+///            └─────▲────────▲────┘
+///          ┌───────┘        └─────────┐
+///          │                          │
+/// ┌────────────────┐         ┌─────────────────┐
+/// │   Build side   │       + │   BufferExec    │
+/// └────────────────┘         └────────▲────────┘
+///                                     │
+///                            ┌────────┴────────┐
+///                            │   Probe side    │
+///                            └─────────────────┘
+/// ```
+///
+/// Which allows eagerly pulling it even before the build side has completely finished.
+#[derive(Debug, Default)]
+pub struct HashJoinBuffering {}
+
+impl HashJoinBuffering {
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl PhysicalOptimizerRule for HashJoinBuffering {
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        config: &ConfigOptions,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        let capacity = config.execution.hash_join_buffering_capacity;
+        if capacity == 0 {
+            return Ok(plan);
+        }
+
+        plan.transform_down(|plan| {
+            let Some(node) = plan.as_any().downcast_ref::<HashJoinExec>() else {
+                return Ok(Transformed::no(plan));
+            };
+            let plan = Arc::clone(&plan);
+            Ok(Transformed::yes(
+                if HashJoinExec::probe_side() == JoinSide::Left {
+                    // Do not stack BufferExec nodes together.
+                    if node.left.as_any().downcast_ref::<BufferExec>().is_some() {
+                        return Ok(Transformed::no(plan));
+                    }
+                    plan.with_new_children(vec![
+                        Arc::new(BufferExec::new(Arc::clone(&node.left), capacity)),
+                        Arc::clone(&node.right),
+                    ])?
+                } else {
+                    // Do not stack BufferExec nodes together.
+                    if node.right.as_any().downcast_ref::<BufferExec>().is_some() {
+                        return Ok(Transformed::no(plan));
+                    }
+                    plan.with_new_children(vec![
+                        Arc::clone(&node.left),
+                        Arc::new(BufferExec::new(Arc::clone(&node.right), capacity)),
+                    ])?
+                },
+            ))
+        })
+        .data()
+    }
+
+    fn name(&self) -> &str {
+        "HashJoinBuffering"
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
diff --git a/datafusion/physical-optimizer/src/join_selection.rs b/datafusion/physical-optimizer/src/join_selection.rs
index 05758e5dfdf10..88115b12f7820 100644
--- a/datafusion/physical-optimizer/src/join_selection.rs
+++ b/datafusion/physical-optimizer/src/join_selection.rs
@@ -23,24 +23,22 @@
 //! pipeline-friendly ones. To achieve the second goal, it selects the proper
 //! `PartitionMode` and the build side using the available statistics for hash joins.
 
-use std::sync::Arc;
-
 use crate::PhysicalOptimizerRule;
-
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{internal_err, JoinSide, JoinType};
+use datafusion_common::{JoinSide, JoinType, internal_err};
 use datafusion_expr_common::sort_properties::SortProperties;
-use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::LexOrdering;
+use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_plan::execution_plan::EmissionType;
-use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter};
+use datafusion_physical_plan::joins::utils::ColumnIndex;
 use datafusion_physical_plan::joins::{
     CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode,
     StreamJoinPartitionMode, SymmetricHashJoinExec,
 };
 use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
+use std::sync::Arc;
 
 /// The [`JoinSelection`] rule tries to modify a given plan so that it can
 /// accommodate infinite sources and optimize joins in the plan according to
@@ -49,7 +47,7 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 pub struct JoinSelection {}
 
 impl JoinSelection {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -89,67 +87,23 @@ fn supports_collect_by_thresholds(
     threshold_byte_size: usize,
     threshold_num_rows: usize,
 ) -> bool {
-    // Currently we do not trust the 0 value from stats, due to stats collection might have bug
-    // TODO check the logic in datasource::get_statistics_with_limit()
     let Ok(stats) = plan.partition_statistics(None) else {
         return false;
     };
 
+    // Stats use `Precision<T>` to represent stats, where `Absent` means unknown.
+    // `Exact(0)` and `Inexact(0)` are both valid stats, and we should not treat
+    // them as unknown, `Absent` will return None (this is in regards to why
+    // `!=0` is not checked)
     if let Some(byte_size) = stats.total_byte_size.get_value() {
-        *byte_size != 0 && *byte_size < threshold_byte_size
+        *byte_size < threshold_byte_size
     } else if let Some(num_rows) = stats.num_rows.get_value() {
-        *num_rows != 0 && *num_rows < threshold_num_rows
+        *num_rows < threshold_num_rows
     } else {
         false
     }
 }
 
-/// Predicate that checks whether the given join type supports input swapping.
-#[deprecated(since = "45.0.0", note = "use JoinType::supports_swap instead")]
-#[allow(dead_code)]
-pub(crate) fn supports_swap(join_type: JoinType) -> bool {
-    join_type.supports_swap()
-}
-
-/// This function returns the new join type we get after swapping the given
-/// join's inputs.
-#[deprecated(since = "45.0.0", note = "use datafusion-functions-nested instead")]
-#[allow(dead_code)]
-pub(crate) fn swap_join_type(join_type: JoinType) -> JoinType {
-    join_type.swap()
-}
-
-/// This function swaps the inputs of the given join operator.
-/// This function is public so other downstream projects can use it
-/// to construct `HashJoinExec` with right side as the build side.
-#[deprecated(since = "45.0.0", note = "use HashJoinExec::swap_inputs instead")]
-pub fn swap_hash_join(
-    hash_join: &HashJoinExec,
-    partition_mode: PartitionMode,
-) -> Result<Arc<dyn ExecutionPlan>> {
-    hash_join.swap_inputs(partition_mode)
-}
-
-/// Swaps inputs of `NestedLoopJoinExec` and wraps it into `ProjectionExec` is required
-#[deprecated(since = "45.0.0", note = "use NestedLoopJoinExec::swap_inputs")]
-#[allow(dead_code)]
-pub(crate) fn swap_nl_join(join: &NestedLoopJoinExec) -> Result<Arc<dyn ExecutionPlan>> {
-    join.swap_inputs()
-}
-
-/// Swaps join sides for filter column indices and produces new `JoinFilter` (if exists).
-#[deprecated(since = "45.0.0", note = "use filter.map(JoinFilter::swap) instead")]
-#[allow(dead_code)]
-fn swap_join_filter(filter: Option<&JoinFilter>) -> Option<JoinFilter> {
-    filter.map(JoinFilter::swap)
-}
-
-#[deprecated(since = "45.0.0", note = "use JoinFilter::swap instead")]
-#[allow(dead_code)]
-pub(crate) fn swap_filter(filter: &JoinFilter) -> JoinFilter {
-    filter.swap()
-}
-
 impl PhysicalOptimizerRule for JoinSelection {
     fn optimize(
         &self,
@@ -232,35 +186,30 @@ pub(crate) fn try_collect_left(
 
     match (left_can_collect, right_can_collect) {
         (true, true) => {
+            // Don't swap null-aware anti joins as they have specific side requirements
             if hash_join.join_type().supports_swap()
+                && !hash_join.null_aware
                 && should_swap_join_order(&**left, &**right)?
             {
                 Ok(Some(hash_join.swap_inputs(PartitionMode::CollectLeft)?))
             } else {
-                Ok(Some(Arc::new(HashJoinExec::try_new(
-                    Arc::clone(left),
-                    Arc::clone(right),
-                    hash_join.on().to_vec(),
-                    hash_join.filter().cloned(),
-                    hash_join.join_type(),
-                    hash_join.projection.clone(),
-                    PartitionMode::CollectLeft,
-                    hash_join.null_equals_null(),
-                )?)))
+                Ok(Some(Arc::new(
+                    hash_join
+                        .builder()
+                        .with_partition_mode(PartitionMode::CollectLeft)
+                        .build()?,
+                )))
             }
         }
-        (true, false) => Ok(Some(Arc::new(HashJoinExec::try_new(
-            Arc::clone(left),
-            Arc::clone(right),
-            hash_join.on().to_vec(),
-            hash_join.filter().cloned(),
-            hash_join.join_type(),
-            hash_join.projection.clone(),
-            PartitionMode::CollectLeft,
-            hash_join.null_equals_null(),
-        )?))),
+        (true, false) => Ok(Some(Arc::new(
+            hash_join
+                .builder()
+                .with_partition_mode(PartitionMode::CollectLeft)
+                .build()?,
+        ))),
         (false, true) => {
-            if hash_join.join_type().supports_swap() {
+            // Don't swap null-aware anti joins as they have specific side requirements
+            if hash_join.join_type().supports_swap() && !hash_join.null_aware {
                 hash_join.swap_inputs(PartitionMode::CollectLeft).map(Some)
             } else {
                 Ok(None)
@@ -280,20 +229,29 @@ pub(crate) fn partitioned_hash_join(
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let left = hash_join.left();
     let right = hash_join.right();
-    if hash_join.join_type().supports_swap() && should_swap_join_order(&**left, &**right)?
+    // Don't swap null-aware anti joins as they have specific side requirements
+    if hash_join.join_type().supports_swap()
+        && !hash_join.null_aware
+        && should_swap_join_order(&**left, &**right)?
     {
         hash_join.swap_inputs(PartitionMode::Partitioned)
     } else {
-        Ok(Arc::new(HashJoinExec::try_new(
-            Arc::clone(left),
-            Arc::clone(right),
-            hash_join.on().to_vec(),
-            hash_join.filter().cloned(),
-            hash_join.join_type(),
-            hash_join.projection.clone(),
-            PartitionMode::Partitioned,
-            hash_join.null_equals_null(),
-        )?))
+        // Null-aware anti joins must use CollectLeft mode because they track probe-side state
+        // (probe_side_non_empty, probe_side_has_null) per-partition, but need global knowledge
+        // for correct null handling. With partitioning, a partition might not see probe rows
+        // even if the probe side is globally non-empty, leading to incorrect NULL row handling.
+        let partition_mode = if hash_join.null_aware {
+            PartitionMode::CollectLeft
+        } else {
+            PartitionMode::Partitioned
+        };
+
+        Ok(Arc::new(
+            hash_join
+                .builder()
+                .with_partition_mode(partition_mode)
+                .build()?,
+        ))
     }
 }
 
@@ -325,7 +283,9 @@ fn statistical_join_selection_subrule(
                 PartitionMode::Partitioned => {
                     let left = hash_join.left();
                     let right = hash_join.right();
+                    // Don't swap null-aware anti joins as they have specific side requirements
                     if hash_join.join_type().supports_swap()
+                        && !hash_join.null_aware
                         && should_swap_join_order(&**left, &**right)?
                     {
                         hash_join
@@ -459,7 +419,7 @@ fn hash_join_convert_symmetric_subrule(
                             JoinSide::Right => hash_join.right().output_ordering(),
                             JoinSide::None => unreachable!(),
                         }
-                        .map(|p| LexOrdering::new(p.to_vec()))
+                        .cloned()
                     })
                     .flatten()
             };
@@ -474,7 +434,7 @@ fn hash_join_convert_symmetric_subrule(
                 hash_join.on().to_vec(),
                 hash_join.filter().cloned(),
                 hash_join.join_type(),
-                hash_join.null_equals_null(),
+                hash_join.null_equality(),
                 left_order,
                 right_order,
                 mode,
@@ -524,25 +484,21 @@ fn hash_join_convert_symmetric_subrule(
 ///           | Data Source  |--------------| Repartition  |
 ///           |              |              |              |
 ///           +--------------+              +--------------+
-///
 /// ```
 pub fn hash_join_swap_subrule(
     mut input: Arc<dyn ExecutionPlan>,
     _config_options: &ConfigOptions,
 ) -> Result<Arc<dyn ExecutionPlan>> {
-    if let Some(hash_join) = input.as_any().downcast_ref::<HashJoinExec>() {
-        if hash_join.left.boundedness().is_unbounded()
-            && !hash_join.right.boundedness().is_unbounded()
-            && matches!(
-                *hash_join.join_type(),
-                JoinType::Inner
-                    | JoinType::Left
-                    | JoinType::LeftSemi
-                    | JoinType::LeftAnti
-            )
-        {
-            input = swap_join_according_to_unboundedness(hash_join)?;
-        }
+    if let Some(hash_join) = input.as_any().downcast_ref::<HashJoinExec>()
+        && hash_join.left.boundedness().is_unbounded()
+        && !hash_join.right.boundedness().is_unbounded()
+        && !hash_join.null_aware // Don't swap null-aware anti joins
+        && matches!(
+            *hash_join.join_type(),
+            JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti
+        )
+    {
+        input = swap_join_according_to_unboundedness(hash_join)?;
     }
     Ok(input)
 }
@@ -562,7 +518,11 @@ pub(crate) fn swap_join_according_to_unboundedness(
     match (*partition_mode, *join_type) {
         (
             _,
-            JoinType::Right | JoinType::RightSemi | JoinType::RightAnti | JoinType::Full,
+            JoinType::Right
+            | JoinType::RightSemi
+            | JoinType::RightAnti
+            | JoinType::RightMark
+            | JoinType::Full,
         ) => internal_err!("{join_type} join cannot be swapped for unbounded input."),
         (PartitionMode::Partitioned, _) => {
             hash_join.swap_inputs(PartitionMode::Partitioned)
diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs
index 5a43d7118d638..a328f43d22b2b 100644
--- a/datafusion/physical-optimizer/src/lib.rs
+++ b/datafusion/physical-optimizer/src/lib.rs
@@ -19,26 +19,31 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 pub mod aggregate_statistics;
-pub mod coalesce_batches;
 pub mod combine_partial_final_agg;
 pub mod enforce_distribution;
 pub mod enforce_sorting;
+pub mod ensure_coop;
 pub mod filter_pushdown;
 pub mod join_selection;
 pub mod limit_pushdown;
+pub mod limit_pushdown_past_window;
 pub mod limited_distinct_aggregation;
 pub mod optimizer;
 pub mod output_requirements;
 pub mod projection_pushdown;
-pub mod pruning;
+pub use datafusion_pruning as pruning;
+pub mod hash_join_buffering;
+pub mod pushdown_sort;
 pub mod sanity_checker;
 pub mod topk_aggregation;
+pub mod topk_repartition;
 pub mod update_aggr_exprs;
 pub mod utils;
 
diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs
index 7469c3af9344c..b556037699404 100644
--- a/datafusion/physical-optimizer/src/limit_pushdown.rs
+++ b/datafusion/physical-optimizer/src/limit_pushdown.rs
@@ -17,6 +17,47 @@
 
 //! [`LimitPushdown`] pushes `LIMIT` down through `ExecutionPlan`s to reduce
 //! data transfer as much as possible.
+//!
+//! # Plan Limit Absorption
+//! In addition to pushing down [`LimitExec`] in the plan, some operators can
+//! "absorb" a limit and stop early during execution.
+//!
+//! ## Background: vectorized volcano execution model
+//! DataFusion uses a batched volcano model. For most operators, output is
+//! produced in batches of `datafusion.execution.batch_size` (default 8192), so
+//! the batch sizes typically look like:
+//! ```text
+//! 8192, 8192, ..., 8192, 100 (the final batch may be partial)
+//! ```
+//!
+//! ## Example
+//! For a join with an expensive, selective predicate:
+//! ```text
+//! LimitExec(fetch=10)
+//! -- NestedLoopJoinExec(on=expr_expensive_and_selective)
+//! --- DataSourceExec()
+//! --- DataSourceExec()
+//! ```
+//!
+//! Under this model, `NestedLoopJoinExec` would keep working until it can emit
+//! a full batch (8192 rows), even though the query only needs 10. If the limit
+//! cannot be pushed below the join, we can still embed it inside the join so it
+//! stops once the limit is satisfied. The transformed plan looks like:
+//!
+//! ```text
+//! NestedLoopJoinExec(on=expr_expensive_and_selective, fetch=10)
+//! --- DataSourceExec()
+//! --- DataSourceExec()
+//! ```
+//!
+//! ## Implementation
+//! The current optimizer rule optionally pushes `fetch` requirements into
+//! operators via [`ExecutionPlan::with_fetch`].
+//!
+//! To support early termination in operators, [`LimitedBatchCoalescer`](https://docs.rs/datafusion/latest/datafusion/physical_plan/coalesce/struct.LimitedBatchCoalescer.html)
+//! can help manage the output buffer.
+//!
+//! Reference implementation in Hash Join: <https://github.com/apache/datafusion/pull/20228>
 
 use std::fmt::Debug;
 use std::sync::Arc;
@@ -50,10 +91,11 @@ pub struct GlobalRequirements {
     fetch: Option<usize>,
     skip: usize,
     satisfied: bool,
+    preserve_order: bool,
 }
 
 impl LimitPushdown {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -69,6 +111,7 @@ impl PhysicalOptimizerRule for LimitPushdown {
             fetch: None,
             skip: 0,
             satisfied: false,
+            preserve_order: false,
         };
         pushdown_limits(plan, global_state)
     }
@@ -111,6 +154,13 @@ impl LimitExec {
             Self::Local(_) => 0,
         }
     }
+
+    fn preserve_order(&self) -> bool {
+        match self {
+            Self::Global(global) => global.required_ordering().is_some(),
+            Self::Local(local) => local.required_ordering().is_some(),
+        }
+    }
 }
 
 impl From<LimitExec> for Arc<dyn ExecutionPlan> {
@@ -145,6 +195,8 @@ pub fn pushdown_limit_helper(
         );
         global_state.skip = skip;
         global_state.fetch = fetch;
+        global_state.preserve_order = limit_exec.preserve_order();
+        global_state.satisfied = false;
 
         // Now the global state has the most recent information, we can remove
         // the `LimitExec` plan. We will decide later if we should add it again
@@ -162,7 +214,7 @@ pub fn pushdown_limit_helper(
     // If we have a non-limit operator with fetch capability, update global
     // state as necessary:
     if pushdown_plan.fetch().is_some() {
-        if global_state.fetch.is_none() {
+        if global_state.skip == 0 {
             global_state.satisfied = true;
         }
         (global_state.skip, global_state.fetch) = combine_limit(
@@ -241,17 +293,28 @@ pub fn pushdown_limit_helper(
         let maybe_fetchable = pushdown_plan.with_fetch(skip_and_fetch);
         if global_state.satisfied {
             if let Some(plan_with_fetch) = maybe_fetchable {
-                Ok((Transformed::yes(plan_with_fetch), global_state))
+                let plan_with_preserve_order = plan_with_fetch
+                    .with_preserve_order(global_state.preserve_order)
+                    .unwrap_or(plan_with_fetch);
+                Ok((Transformed::yes(plan_with_preserve_order), global_state))
             } else {
                 Ok((Transformed::no(pushdown_plan), global_state))
             }
         } else {
             global_state.satisfied = true;
             pushdown_plan = if let Some(plan_with_fetch) = maybe_fetchable {
+                let plan_with_preserve_order = plan_with_fetch
+                    .with_preserve_order(global_state.preserve_order)
+                    .unwrap_or(plan_with_fetch);
+
                 if global_skip > 0 {
-                    add_global_limit(plan_with_fetch, global_skip, Some(global_fetch))
+                    add_global_limit(
+                        plan_with_preserve_order,
+                        global_skip,
+                        Some(global_fetch),
+                    )
                 } else {
-                    plan_with_fetch
+                    plan_with_preserve_order
                 }
             } else {
                 add_limit(pushdown_plan, global_skip, global_fetch)
diff --git a/datafusion/physical-optimizer/src/limit_pushdown_past_window.rs b/datafusion/physical-optimizer/src/limit_pushdown_past_window.rs
new file mode 100644
index 0000000000000..729b600da7297
--- /dev/null
+++ b/datafusion/physical-optimizer/src/limit_pushdown_past_window.rs
@@ -0,0 +1,373 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_expr::{LimitEffect, WindowFrameBound, WindowFrameUnits};
+use datafusion_physical_expr::window::{
+    PlainAggregateWindowExpr, SlidingAggregateWindowExpr, StandardWindowExpr,
+    StandardWindowFunctionExpr, WindowExpr,
+};
+use datafusion_physical_plan::execution_plan::CardinalityEffect;
+use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, WindowUDFExpr};
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
+use std::cmp;
+use std::sync::Arc;
+
+/// This rule inspects [`ExecutionPlan`]'s attempting to find fetch limits that were not pushed
+/// down by `LimitPushdown` because [BoundedWindowAggExec]s were "in the way". If the window is
+/// bounded by [WindowFrameUnits::Rows] then we calculate the adjustment needed to grow the limit
+/// and continue pushdown.
+#[derive(Default, Clone, Debug)]
+pub struct LimitPushPastWindows;
+
+impl LimitPushPastWindows {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Eq, PartialEq)]
+enum Phase {
+    FindOrGrow,
+    Apply,
+}
+
+#[derive(Default)]
+struct TraverseState {
+    pub limit: Option<usize>,
+    pub lookahead: usize,
+}
+
+impl TraverseState {
+    pub fn reset_limit(&mut self, limit: Option<usize>) {
+        self.limit = limit;
+        self.lookahead = 0;
+    }
+
+    pub fn max_lookahead(&mut self, new_val: usize) {
+        self.lookahead = self.lookahead.max(new_val);
+    }
+}
+
+impl PhysicalOptimizerRule for LimitPushPastWindows {
+    fn optimize(
+        &self,
+        original: Arc<dyn ExecutionPlan>,
+        config: &ConfigOptions,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        if !config.optimizer.enable_window_limits {
+            return Ok(original);
+        }
+        let mut ctx = TraverseState::default();
+        let mut phase = Phase::FindOrGrow;
+        let result = original.transform_down(|node| {
+            // helper closure to DRY out most the early return cases
+            let reset = |node,
+                         ctx: &mut TraverseState|
+             -> datafusion_common::Result<
+                Transformed<Arc<dyn ExecutionPlan>>,
+            > {
+                ctx.limit = None;
+                ctx.lookahead = 0;
+                Ok(Transformed::no(node))
+            };
+
+            // traversing sides of joins will require more thought
+            if node.children().len() > 1 {
+                return reset(node, &mut ctx);
+            }
+
+            // grab the latest limit we see
+            if phase == Phase::FindOrGrow && get_limit(&node, &mut ctx) {
+                return Ok(Transformed::no(node));
+            }
+
+            // grow the limit if we hit a window function
+            if let Some(window) = node.as_any().downcast_ref::<BoundedWindowAggExec>() {
+                phase = Phase::Apply;
+                if !grow_limit(window, &mut ctx) {
+                    return reset(node, &mut ctx);
+                }
+                return Ok(Transformed::no(node));
+            }
+
+            // Apply the limit if we hit a sortpreservingmerge node
+            if phase == Phase::Apply
+                && let Some(out) = apply_limit(&node, &mut ctx)
+            {
+                return Ok(out);
+            }
+
+            // nodes along the way
+            if !node.supports_limit_pushdown() {
+                return reset(node, &mut ctx);
+            }
+            if let Some(part) = node.as_any().downcast_ref::<RepartitionExec>() {
+                let output = part.partitioning().partition_count();
+                let input = part.input().output_partitioning().partition_count();
+                if output < input {
+                    return reset(node, &mut ctx);
+                }
+            }
+            match node.cardinality_effect() {
+                CardinalityEffect::Unknown => return reset(node, &mut ctx),
+                CardinalityEffect::LowerEqual => return reset(node, &mut ctx),
+                CardinalityEffect::Equal => {}
+                CardinalityEffect::GreaterEqual => {}
+            }
+
+            Ok(Transformed::no(node))
+        })?;
+        Ok(result.data)
+    }
+
+    fn name(&self) -> &str {
+        "LimitPushPastWindows"
+    }
+
+    fn schema_check(&self) -> bool {
+        false // we don't change the schema
+    }
+}
+
+fn grow_limit(window: &BoundedWindowAggExec, ctx: &mut TraverseState) -> bool {
+    let mut max_rel = 0;
+    for expr in window.window_expr().iter() {
+        // grow based on function requirements
+        match get_limit_effect(expr) {
+            LimitEffect::None => {}
+            LimitEffect::Unknown => return false,
+            LimitEffect::Relative(rel) => max_rel = max_rel.max(rel),
+            LimitEffect::Absolute(val) => {
+                let cur = ctx.limit.unwrap_or(0);
+                ctx.limit = Some(cur.max(val))
+            }
+        }
+
+        // grow based on frames
+        let frame = expr.get_window_frame();
+        if frame.units != WindowFrameUnits::Rows {
+            return false; // expression-based limits not statically evaluatable
+        }
+        let Some(end_bound) = bound_to_usize(&frame.end_bound) else {
+            return false; // can't optimize unbounded window expressions
+        };
+        ctx.max_lookahead(end_bound);
+    }
+
+    // finish grow
+    ctx.max_lookahead(ctx.lookahead + max_rel);
+    true
+}
+
+fn apply_limit(
+    node: &Arc<dyn ExecutionPlan>,
+    ctx: &mut TraverseState,
+) -> Option<Transformed<Arc<dyn ExecutionPlan>>> {
+    if !node.as_any().is::<SortExec>() && !node.as_any().is::<SortPreservingMergeExec>() {
+        return None;
+    }
+    let latest = ctx.limit.take();
+    let Some(fetch) = latest else {
+        ctx.limit = None;
+        ctx.lookahead = 0;
+        return Some(Transformed::no(Arc::clone(node)));
+    };
+    let fetch = match node.fetch() {
+        None => fetch + ctx.lookahead,
+        Some(existing) => cmp::min(existing, fetch + ctx.lookahead),
+    };
+    Some(Transformed::complete(node.with_fetch(Some(fetch)).unwrap()))
+}
+
+fn get_limit(node: &Arc<dyn ExecutionPlan>, ctx: &mut TraverseState) -> bool {
+    if let Some(limit) = node.as_any().downcast_ref::<GlobalLimitExec>() {
+        ctx.reset_limit(limit.fetch().map(|fetch| fetch + limit.skip()));
+        return true;
+    }
+    // In distributed execution, GlobalLimitExec becomes LocalLimitExec
+    // per partition. Handle it the same way (LocalLimitExec has no skip).
+    if let Some(limit) = node.as_any().downcast_ref::<LocalLimitExec>() {
+        ctx.reset_limit(Some(limit.fetch()));
+        return true;
+    }
+    if let Some(limit) = node.as_any().downcast_ref::<SortPreservingMergeExec>() {
+        ctx.reset_limit(limit.fetch());
+        return true;
+    }
+    false
+}
+
+/// Examines the `WindowExpr` and decides:
+/// 1. The expression does not change the window size
+/// 2. The expression grows it by X amount
+/// 3. We don't know
+///
+/// # Arguments
+///
+/// * `expr` the expression to examine
+///
+/// # Returns
+///
+/// The effect on the limit
+fn get_limit_effect(expr: &Arc<dyn WindowExpr>) -> LimitEffect {
+    // White list aggregates
+    if expr.as_any().is::<PlainAggregateWindowExpr>()
+        || expr.as_any().is::<SlidingAggregateWindowExpr>()
+    {
+        return LimitEffect::None;
+    }
+
+    // Grab the window function
+    let Some(swe) = expr.as_any().downcast_ref::<StandardWindowExpr>() else {
+        return LimitEffect::Unknown; // should be only remaining type
+    };
+    let swfe = swe.get_standard_func_expr();
+    let Some(udf) = swfe.as_any().downcast_ref::<WindowUDFExpr>() else {
+        return LimitEffect::Unknown; // should be only remaining type
+    };
+    udf.limit_effect()
+}
+
+fn bound_to_usize(bound: &WindowFrameBound) -> Option<usize> {
+    match bound {
+        WindowFrameBound::Preceding(_) => Some(0),
+        WindowFrameBound::CurrentRow => Some(0),
+        WindowFrameBound::Following(ScalarValue::UInt64(Some(scalar))) => {
+            Some(*scalar as usize)
+        }
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{WindowFrame, WindowFrameBound, WindowFrameUnits};
+    use datafusion_functions_window::row_number::row_number_udwf;
+    use datafusion_physical_expr::expressions::col;
+    use datafusion_physical_expr::window::StandardWindowExpr;
+    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_plan::InputOrderMode;
+    use datafusion_physical_plan::displayable;
+    use datafusion_physical_plan::limit::LocalLimitExec;
+    use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
+    use datafusion_physical_plan::windows::{
+        BoundedWindowAggExec, create_udwf_window_expr,
+    };
+    use insta::assert_snapshot;
+    use std::sync::Arc;
+
+    fn plan_str(plan: &dyn ExecutionPlan) -> String {
+        displayable(plan).indent(true).to_string()
+    }
+
+    fn schema() -> Arc<Schema> {
+        Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]))
+    }
+
+    /// Build: LocalLimitExec or GlobalLimitExec → BoundedWindowAggExec(row_number) → SortExec
+    fn build_window_plan(
+        use_local_limit: bool,
+    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+        let s = schema();
+        let input: Arc<dyn ExecutionPlan> =
+            Arc::new(PlaceholderRowExec::new(Arc::clone(&s)));
+
+        let ordering =
+            LexOrdering::new(vec![PhysicalSortExpr::new_default(col("a", &s)?).asc()])
+                .unwrap();
+
+        let sort: Arc<dyn ExecutionPlan> = Arc::new(
+            SortExec::new(ordering.clone(), input).with_preserve_partitioning(true),
+        );
+
+        let window_expr = Arc::new(StandardWindowExpr::new(
+            create_udwf_window_expr(
+                &row_number_udwf(),
+                &[],
+                &s,
+                "row_number".to_string(),
+                false,
+            )?,
+            &[],
+            ordering.as_ref(),
+            Arc::new(WindowFrame::new_bounds(
+                WindowFrameUnits::Rows,
+                WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+                WindowFrameBound::CurrentRow,
+            )),
+        ));
+
+        let window: Arc<dyn ExecutionPlan> = Arc::new(BoundedWindowAggExec::try_new(
+            vec![window_expr],
+            sort,
+            InputOrderMode::Sorted,
+            true,
+        )?);
+
+        let limit: Arc<dyn ExecutionPlan> = if use_local_limit {
+            Arc::new(LocalLimitExec::new(window, 100))
+        } else {
+            Arc::new(GlobalLimitExec::new(window, 0, Some(100)))
+        };
+
+        Ok(limit)
+    }
+
+    fn optimize(plan: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
+        let mut config = ConfigOptions::new();
+        config.optimizer.enable_window_limits = true;
+        LimitPushPastWindows::new().optimize(plan, &config).unwrap()
+    }
+
+    /// GlobalLimitExec above a windowed sort should push fetch into the SortExec.
+    #[test]
+    fn global_limit_pushes_past_window() {
+        let plan = build_window_plan(false).unwrap();
+        let optimized = optimize(plan);
+        assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+        GlobalLimitExec: skip=0, fetch=100
+          BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            SortExec: TopK(fetch=100), expr=[a@0 ASC], preserve_partitioning=[true]
+              PlaceholderRowExec
+        "#);
+    }
+
+    /// LocalLimitExec above a windowed sort should also push fetch into the SortExec.
+    /// This is the case in distributed execution where GlobalLimitExec becomes LocalLimitExec.
+    #[test]
+    fn local_limit_pushes_past_window() {
+        let plan = build_window_plan(true).unwrap();
+        let optimized = optimize(plan);
+        assert_snapshot!(plan_str(optimized.as_ref()), @r#"
+        LocalLimitExec: fetch=100
+          BoundedWindowAggExec: wdw=[row_number: Field { "row_number": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+            SortExec: TopK(fetch=100), expr=[a@0 ASC], preserve_partitioning=[true]
+              PlaceholderRowExec
+        "#);
+    }
+}
diff --git a/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs b/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs
index 3666ff3798b67..fe9636f67619b 100644
--- a/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs
+++ b/datafusion/physical-optimizer/src/limited_distinct_aggregation.rs
@@ -20,13 +20,13 @@
 
 use std::sync::Arc;
 
-use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::aggregates::{AggregateExec, LimitOptions};
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
+use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::Result;
 
 use crate::PhysicalOptimizerRule;
 use itertools::Itertools;
@@ -63,7 +63,7 @@ impl LimitedDistinctAggregation {
             aggr.input_schema(),
         )
         .expect("Unable to copy Aggregate!")
-        .with_limit(Some(limit));
+        .with_limit_options(Some(LimitOptions::new(limit)));
         Some(Arc::new(new_aggr))
     }
 
@@ -113,17 +113,15 @@ impl LimitedDistinctAggregation {
                 return Ok(Transformed::no(plan));
             }
             if let Some(aggr) = plan.as_any().downcast_ref::<AggregateExec>() {
-                if found_match_aggr {
-                    if let Some(parent_aggr) =
+                if found_match_aggr
+                    && let Some(parent_aggr) =
                         match_aggr.as_any().downcast_ref::<AggregateExec>()
-                    {
-                        if !parent_aggr.group_expr().eq(aggr.group_expr()) {
-                            // a partial and final aggregation with different groupings disqualifies
-                            // rewriting the child aggregation
-                            rewrite_applicable = false;
-                            return Ok(Transformed::no(plan));
-                        }
-                    }
+                    && !parent_aggr.group_expr().eq(aggr.group_expr())
+                {
+                    // a partial and final aggregation with different groupings disqualifies
+                    // rewriting the child aggregation
+                    rewrite_applicable = false;
+                    return Ok(Transformed::no(plan));
                 }
                 // either we run into an Aggregate and transform it, or disable the rewrite
                 // for subsequent children
diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs
index 432ac35ebc23f..a51dc47999897 100644
--- a/datafusion/physical-optimizer/src/optimizer.rs
+++ b/datafusion/physical-optimizer/src/optimizer.rs
@@ -21,10 +21,10 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use crate::aggregate_statistics::AggregateStatistics;
-use crate::coalesce_batches::CoalesceBatches;
 use crate::combine_partial_final_agg::CombinePartialFinalAggregate;
 use crate::enforce_distribution::EnforceDistribution;
 use crate::enforce_sorting::EnforceSorting;
+use crate::ensure_coop::EnsureCooperative;
 use crate::filter_pushdown::FilterPushdown;
 use crate::join_selection::JoinSelection;
 use crate::limit_pushdown::LimitPushdown;
@@ -33,10 +33,14 @@ use crate::output_requirements::OutputRequirements;
 use crate::projection_pushdown::ProjectionPushdown;
 use crate::sanity_checker::SanityCheckPlan;
 use crate::topk_aggregation::TopKAggregation;
+use crate::topk_repartition::TopKRepartition;
 use crate::update_aggr_exprs::OptimizeAggregateOrder;
 
-use datafusion_common::config::ConfigOptions;
+use crate::hash_join_buffering::HashJoinBuffering;
+use crate::limit_pushdown_past_window::LimitPushPastWindows;
+use crate::pushdown_sort::PushdownSort;
 use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
 use datafusion_physical_plan::ExecutionPlan;
 
 /// `PhysicalOptimizerRule` transforms one ['ExecutionPlan'] into another which
@@ -57,7 +61,7 @@ pub trait PhysicalOptimizerRule: Debug {
     /// A human readable name for this optimizer rule
     fn name(&self) -> &str;
 
-    /// A flag to indicate whether the physical planner should valid the rule will not
+    /// A flag to indicate whether the physical planner should validate that the rule will not
     /// change the schema of the plan after the rewriting.
     /// Some of the optimization rules might change the nullable properties of the schema
     /// and should disable the schema check.
@@ -80,6 +84,12 @@ impl Default for PhysicalOptimizer {
 impl PhysicalOptimizer {
     /// Create a new optimizer using the recommended list of rules
     pub fn new() -> Self {
+        // NOTEs:
+        // - The order of rules in this list is important, as it determines the
+        //   order in which they are applied.
+        // - Adding a new rule here is expensive as it will be applied to all
+        //   queries, and will likely increase the optimization time. Please extend
+        //   existing rules when possible, rather than adding a new rule.
         let rules: Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>> = vec![
             // If there is a output requirement of the query, make sure that
             // this information is not lost across different rules during optimization.
@@ -96,8 +106,10 @@ impl PhysicalOptimizer {
             // Applying the rule early means only directly-connected AggregateExecs must be examined.
             Arc::new(LimitedDistinctAggregation::new()),
             // The FilterPushdown rule tries to push down filters as far as it can.
-            // For example, it will push down filtering from a `FilterExec` to
-            // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`.
+            // For example, it will push down filtering from a `FilterExec` to `DataSourceExec`.
+            // Note that this does not push down dynamic filters (such as those created by a `SortExec` operator in TopK mode),
+            // those are handled by the later `FilterPushdown` rule.
+            // See `FilterPushdownPhase` for more details.
             Arc::new(FilterPushdown::new()),
             // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution
             // requirements. Please make sure that the whole plan tree is determined before this rule.
@@ -115,9 +127,6 @@ impl PhysicalOptimizer {
             Arc::new(OptimizeAggregateOrder::new()),
             // TODO: `try_embed_to_hash_join` in the ProjectionPushdown rule would be block by the CoalesceBatches, so add it before CoalesceBatches. Maybe optimize it in the future.
             Arc::new(ProjectionPushdown::new()),
-            // The CoalesceBatches rule will not influence the distribution and ordering of the
-            // whole plan tree. Therefore, to avoid influencing other rules, it should run last.
-            Arc::new(CoalesceBatches::new()),
             // Remove the ancillary output requirement operator since we are done with the planning
             // phase.
             Arc::new(OutputRequirements::new_remove_mode()),
@@ -126,10 +135,23 @@ impl PhysicalOptimizer {
             // into an `order by max(x) limit y`. In this case it will copy the limit value down
             // to the aggregation, allowing it to use only y number of accumulators.
             Arc::new(TopKAggregation::new()),
+            // Tries to push limits down through window functions, growing as appropriate
+            // This can possibly be combined with [LimitPushdown]
+            // It needs to come after [EnforceSorting]
+            Arc::new(LimitPushPastWindows::new()),
+            // The HashJoinBuffering rule adds a BufferExec node with the configured capacity
+            // in the prob side of hash joins. That way, the probe side gets eagerly polled before
+            // the build side is completely finished.
+            Arc::new(HashJoinBuffering::new()),
             // The LimitPushdown rule tries to push limits down as far as possible,
             // replacing operators with fetching variants, or adding limits
             // past operators that support limit pushdown.
             Arc::new(LimitPushdown::new()),
+            // TopKRepartition pushes TopK (Sort with fetch) below Hash
+            // repartition when the partition key is a prefix of the sort key.
+            // This reduces data volume before a hash shuffle. It must run
+            // after LimitPushdown so that the TopK already exists on the SortExec.
+            Arc::new(TopKRepartition::new()),
             // The ProjectionPushdown rule tries to push projections towards
             // the sources in the execution plan. As a result of this process,
             // a projection can disappear if it reaches the source providers, and
@@ -137,6 +159,13 @@ impl PhysicalOptimizer {
             // are not present, the load of executors such as join or union will be
             // reduced by narrowing their input tables.
             Arc::new(ProjectionPushdown::new()),
+            // PushdownSort: Detect sorts that can be pushed down to data sources.
+            Arc::new(PushdownSort::new()),
+            Arc::new(EnsureCooperative::new()),
+            // This FilterPushdown handles dynamic filters that may have references to the source ExecutionPlan.
+            // Therefore it should be run at the end of the optimization process since any changes to the plan may break the dynamic filter's references.
+            // See `FilterPushdownPhase` for more details.
+            Arc::new(FilterPushdown::new_post_optimization()),
             // The SanityCheckPlan rule checks whether the order and
             // distribution requirements of each node in the plan
             // is satisfied. It will also reject non-runnable query
diff --git a/datafusion/physical-optimizer/src/output_requirements.rs b/datafusion/physical-optimizer/src/output_requirements.rs
index 0488b3fd49a86..8b71fc9fbf74a 100644
--- a/datafusion/physical-optimizer/src/output_requirements.rs
+++ b/datafusion/physical-optimizer/src/output_requirements.rs
@@ -27,19 +27,23 @@ use std::sync::Arc;
 use crate::PhysicalOptimizerRule;
 
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
 use datafusion_common::{Result, Statistics};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{Distribution, LexRequirement, PhysicalSortRequirement};
+use datafusion_physical_expr::Distribution;
+use datafusion_physical_expr_common::sort_expr::OrderingRequirements;
+use datafusion_physical_plan::execution_plan::Boundedness;
 use datafusion_physical_plan::projection::{
-    make_with_child, update_expr, ProjectionExec,
+    ProjectionExec, make_with_child, update_expr, update_ordering_requirement,
 };
 use datafusion_physical_plan::sorts::sort::SortExec;
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use datafusion_physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    SendableRecordBatchStream,
 };
-use datafusion_physical_plan::{ExecutionPlanProperties, PlanProperties};
 
 /// This rule either adds or removes [`OutputRequirements`]s to/from the physical
 /// plan according to its `mode` attribute, which is set by the constructors
@@ -94,23 +98,26 @@ enum RuleMode {
 #[derive(Debug)]
 pub struct OutputRequirementExec {
     input: Arc<dyn ExecutionPlan>,
-    order_requirement: Option<LexRequirement>,
+    order_requirement: Option<OrderingRequirements>,
     dist_requirement: Distribution,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+    fetch: Option<usize>,
 }
 
 impl OutputRequirementExec {
     pub fn new(
         input: Arc<dyn ExecutionPlan>,
-        requirements: Option<LexRequirement>,
+        requirements: Option<OrderingRequirements>,
         dist_requirement: Distribution,
+        fetch: Option<usize>,
     ) -> Self {
-        let cache = Self::compute_properties(&input);
+        let cache = Self::compute_properties(&input, &fetch);
         Self {
             input,
             order_requirement: requirements,
             dist_requirement,
-            cache,
+            cache: Arc::new(cache),
+            fetch,
         }
     }
 
@@ -119,14 +126,28 @@ impl OutputRequirementExec {
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
-    fn compute_properties(input: &Arc<dyn ExecutionPlan>) -> PlanProperties {
+    fn compute_properties(
+        input: &Arc<dyn ExecutionPlan>,
+        fetch: &Option<usize>,
+    ) -> PlanProperties {
+        let boundedness = if fetch.is_some() {
+            Boundedness::Bounded
+        } else {
+            input.boundedness()
+        };
+
         PlanProperties::new(
             input.equivalence_properties().clone(), // Equivalence Properties
             input.output_partitioning().clone(),    // Output Partitioning
             input.pipeline_behavior(),              // Pipeline Behavior
-            input.boundedness(),                    // Boundedness
+            boundedness,                            // Boundedness
         )
     }
+
+    /// Get fetch
+    pub fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
 }
 
 impl DisplayAs for OutputRequirementExec {
@@ -137,10 +158,35 @@ impl DisplayAs for OutputRequirementExec {
     ) -> std::fmt::Result {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                write!(f, "OutputRequirementExec")
+                let order_cols = self
+                    .order_requirement
+                    .as_ref()
+                    .map(|reqs| reqs.first())
+                    .map(|lex| {
+                        let pairs: Vec<String> = lex
+                            .iter()
+                            .map(|req| {
+                                let direction = req
+                                    .options
+                                    .as_ref()
+                                    .map(
+                                        |opt| if opt.descending { "desc" } else { "asc" },
+                                    )
+                                    .unwrap_or("unspecified");
+                                format!("({}, {direction})", req.expr)
+                            })
+                            .collect();
+                        format!("[{}]", pairs.join(", "))
+                    })
+                    .unwrap_or_else(|| "[]".to_string());
+
+                write!(
+                    f,
+                    "OutputRequirementExec: order_by={}, dist_by={}",
+                    order_cols, self.dist_requirement
+                )
             }
             DisplayFormatType::TreeRender => {
-                // TODO: collect info
                 write!(f, "")
             }
         }
@@ -156,7 +202,7 @@ impl ExecutionPlan for OutputRequirementExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -176,7 +222,7 @@ impl ExecutionPlan for OutputRequirementExec {
         vec![&self.input]
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         vec![self.order_requirement.clone()]
     }
 
@@ -188,6 +234,7 @@ impl ExecutionPlan for OutputRequirementExec {
             children.remove(0), // has a single child
             self.order_requirement.clone(),
             self.dist_requirement.clone(),
+            self.fetch,
         )))
     }
 
@@ -199,11 +246,7 @@ impl ExecutionPlan for OutputRequirementExec {
         unreachable!();
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         self.input.partition_statistics(partition)
     }
 
@@ -212,23 +255,23 @@ impl ExecutionPlan for OutputRequirementExec {
         projection: &ProjectionExec,
     ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
         // If the projection does not narrow the schema, we should not try to push it down:
-        if projection.expr().len() >= projection.input().schema().fields().len() {
+        let proj_exprs = projection.expr();
+        if proj_exprs.len() >= projection.input().schema().fields().len() {
             return Ok(None);
         }
 
-        let mut updated_sort_reqs = LexRequirement::new(vec![]);
-        // None or empty_vec can be treated in the same way.
-        if let Some(reqs) = &self.required_input_ordering()[0] {
-            for req in &reqs.inner {
-                let Some(new_expr) = update_expr(&req.expr, projection.expr(), false)?
+        let mut requirements = self.required_input_ordering().swap_remove(0);
+        if let Some(reqs) = requirements {
+            let mut updated_reqs = vec![];
+            let (lexes, soft) = reqs.into_alternatives();
+            for lex in lexes.into_iter() {
+                let Some(updated_lex) = update_ordering_requirement(lex, proj_exprs)?
                 else {
                     return Ok(None);
                 };
-                updated_sort_reqs.push(PhysicalSortRequirement {
-                    expr: new_expr,
-                    options: req.options,
-                });
+                updated_reqs.push(updated_lex);
             }
+            requirements = OrderingRequirements::new_alternatives(updated_reqs, soft);
         }
 
         let dist_req = match &self.required_input_distribution()[0] {
@@ -246,15 +289,44 @@ impl ExecutionPlan for OutputRequirementExec {
             dist => dist.clone(),
         };
 
-        make_with_child(projection, &self.input())
-            .map(|input| {
-                OutputRequirementExec::new(
-                    input,
-                    (!updated_sort_reqs.is_empty()).then_some(updated_sort_reqs),
-                    dist_req,
-                )
-            })
-            .map(|e| Some(Arc::new(e) as _))
+        make_with_child(projection, &self.input()).map(|input| {
+            let e = OutputRequirementExec::new(input, requirements, dist_req, self.fetch);
+            Some(Arc::new(e) as _)
+        })
+    }
+
+    fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(
+            &dyn datafusion_physical_expr_common::physical_expr::PhysicalExpr,
+        ) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Visit expressions in order_requirement
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(order_reqs) = &self.order_requirement {
+            let lexes = match order_reqs {
+                OrderingRequirements::Hard(alternatives) => alternatives,
+                OrderingRequirements::Soft(alternatives) => alternatives,
+            };
+            for lex in lexes {
+                for sort_expr in lex {
+                    tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+                }
+            }
+        }
+
+        // Visit expressions in dist_requirement if it's HashPartitioned
+        if let Distribution::HashPartitioned(exprs) = &self.dist_requirement {
+            for expr in exprs {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+        }
+
+        Ok(tnr)
     }
 }
 
@@ -302,6 +374,7 @@ fn require_top_ordering(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn Executio
             // there is no ordering requirement
             None,
             Distribution::UnspecifiedDistribution,
+            None,
         )) as _)
     }
 }
@@ -317,27 +390,39 @@ fn require_top_ordering_helper(
     if children.len() != 1 {
         Ok((plan, false))
     } else if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
-        // In case of constant columns, output ordering of SortExec would give an empty set.
-        // Therefore; we check the sort expression field of the SortExec to assign the requirements.
+        // In case of constant columns, output ordering of the `SortExec` would
+        // be an empty set. Therefore; we check the sort expression field to
+        // assign the requirements.
+        let req_dist = sort_exec.required_input_distribution().swap_remove(0);
         let req_ordering = sort_exec.expr();
-        let req_dist = sort_exec.required_input_distribution()[0].clone();
-        let reqs = LexRequirement::from(req_ordering.clone());
+        let reqs = OrderingRequirements::from(req_ordering.clone());
+        let fetch = sort_exec.fetch();
+
         Ok((
-            Arc::new(OutputRequirementExec::new(plan, Some(reqs), req_dist)) as _,
+            Arc::new(OutputRequirementExec::new(
+                plan,
+                Some(reqs),
+                req_dist,
+                fetch,
+            )) as _,
             true,
         ))
     } else if let Some(spm) = plan.as_any().downcast_ref::<SortPreservingMergeExec>() {
-        let reqs = LexRequirement::from(spm.expr().clone());
+        let reqs = OrderingRequirements::from(spm.expr().clone());
+        let fetch = spm.fetch();
         Ok((
             Arc::new(OutputRequirementExec::new(
                 plan,
                 Some(reqs),
                 Distribution::SinglePartition,
+                fetch,
             )) as _,
             true,
         ))
     } else if plan.maintains_input_order()[0]
-        && plan.required_input_ordering()[0].is_none()
+        && (plan.required_input_ordering()[0]
+            .as_ref()
+            .is_none_or(|o| matches!(o, OrderingRequirements::Soft(_))))
     {
         // Keep searching for a `SortExec` as long as ordering is maintained,
         // and on-the-way operators do not themselves require an ordering.
diff --git a/datafusion/physical-optimizer/src/projection_pushdown.rs b/datafusion/physical-optimizer/src/projection_pushdown.rs
index 34affcbd4a19b..44d0926a8b250 100644
--- a/datafusion/physical-optimizer/src/projection_pushdown.rs
+++ b/datafusion/physical-optimizer/src/projection_pushdown.rs
@@ -20,23 +20,37 @@
 //! projections one by one if the operator below is amenable to this. If a
 //! projection reaches a source, it can even disappear from the plan entirely.
 
-use std::sync::Arc;
-
 use crate::PhysicalOptimizerRule;
+use arrow::datatypes::{Fields, Schema, SchemaRef};
+use datafusion_common::alias::AliasGenerator;
+use std::collections::HashSet;
+use std::sync::Arc;
 
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::tree_node::{TransformedResult, TreeNode};
-use datafusion_common::Result;
-use datafusion_physical_plan::projection::remove_unnecessary_projections;
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
+use datafusion_common::{JoinSide, JoinType, Result};
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, is_volatile};
 use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::joins::NestedLoopJoinExec;
+use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter};
+use datafusion_physical_plan::projection::{
+    ProjectionExec, remove_unnecessary_projections,
+};
 
 /// This rule inspects `ProjectionExec`'s in the given physical plan and tries to
 /// remove or swap with its child.
+///
+/// Furthermore, tries to push down projections from nested loop join filters that only depend on
+/// one side of the join. By pushing these projections down, functions that only depend on one side
+/// of the join must be evaluated for the cartesian product of the two sides.
 #[derive(Default, Debug)]
 pub struct ProjectionPushdown {}
 
 impl ProjectionPushdown {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -48,6 +62,20 @@ impl PhysicalOptimizerRule for ProjectionPushdown {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        let alias_generator = AliasGenerator::new();
+        let plan = plan
+            .transform_up(|plan| {
+                match plan.as_any().downcast_ref::<NestedLoopJoinExec>() {
+                    None => Ok(Transformed::no(plan)),
+                    Some(hash_join) => try_push_down_join_filter(
+                        Arc::clone(&plan),
+                        hash_join,
+                        &alias_generator,
+                    ),
+                }
+            })
+            .map(|t| t.data)?;
+
         plan.transform_down(remove_unnecessary_projections).data()
     }
 
@@ -59,3 +87,700 @@ impl PhysicalOptimizerRule for ProjectionPushdown {
         true
     }
 }
+
+/// Tries to push down parts of the filter.
+///
+/// See [JoinFilterRewriter] for details.
+fn try_push_down_join_filter(
+    original_plan: Arc<dyn ExecutionPlan>,
+    join: &NestedLoopJoinExec,
+    alias_generator: &AliasGenerator,
+) -> Result<Transformed<Arc<dyn ExecutionPlan>>> {
+    // Mark joins are currently not supported.
+    if matches!(join.join_type(), JoinType::LeftMark | JoinType::RightMark) {
+        return Ok(Transformed::no(original_plan));
+    }
+
+    let projections = join.projection();
+    let Some(filter) = join.filter() else {
+        return Ok(Transformed::no(original_plan));
+    };
+
+    let original_lhs_length = join.left().schema().fields().len();
+    let original_rhs_length = join.right().schema().fields().len();
+
+    let lhs_rewrite = try_push_down_projection(
+        Arc::clone(&join.right().schema()),
+        Arc::clone(join.left()),
+        JoinSide::Left,
+        filter.clone(),
+        alias_generator,
+    )?;
+    let rhs_rewrite = try_push_down_projection(
+        Arc::clone(&lhs_rewrite.data.0.schema()),
+        Arc::clone(join.right()),
+        JoinSide::Right,
+        lhs_rewrite.data.1,
+        alias_generator,
+    )?;
+    if !lhs_rewrite.transformed && !rhs_rewrite.transformed {
+        return Ok(Transformed::no(original_plan));
+    }
+
+    let join_filter = minimize_join_filter(
+        Arc::clone(rhs_rewrite.data.1.expression()),
+        rhs_rewrite.data.1.column_indices(),
+        lhs_rewrite.data.0.schema().as_ref(),
+        rhs_rewrite.data.0.schema().as_ref(),
+    );
+
+    let new_lhs_length = lhs_rewrite.data.0.schema().fields.len();
+    let projections = match projections.as_ref() {
+        None => match join.join_type() {
+            JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+                // Build projections that ignore the newly projected columns.
+                let mut projections = Vec::new();
+                projections.extend(0..original_lhs_length);
+                projections.extend(new_lhs_length..new_lhs_length + original_rhs_length);
+                projections
+            }
+            JoinType::LeftSemi | JoinType::LeftAnti => {
+                // Only return original left columns
+                let mut projections = Vec::new();
+                projections.extend(0..original_lhs_length);
+                projections
+            }
+            JoinType::RightSemi | JoinType::RightAnti => {
+                // Only return original right columns
+                let mut projections = Vec::new();
+                projections.extend(0..original_rhs_length);
+                projections
+            }
+            _ => unreachable!("Unsupported join type"),
+        },
+        Some(projections) => {
+            let rhs_offset = new_lhs_length - original_lhs_length;
+            projections
+                .iter()
+                .map(|idx| {
+                    if *idx >= original_lhs_length {
+                        idx + rhs_offset
+                    } else {
+                        *idx
+                    }
+                })
+                .collect()
+        }
+    };
+
+    Ok(Transformed::yes(Arc::new(NestedLoopJoinExec::try_new(
+        lhs_rewrite.data.0,
+        rhs_rewrite.data.0,
+        Some(join_filter),
+        join.join_type(),
+        Some(projections),
+    )?)))
+}
+
+/// Tries to push down parts of `expr` into the `join_side`.
+fn try_push_down_projection(
+    other_schema: SchemaRef,
+    plan: Arc<dyn ExecutionPlan>,
+    join_side: JoinSide,
+    join_filter: JoinFilter,
+    alias_generator: &AliasGenerator,
+) -> Result<Transformed<(Arc<dyn ExecutionPlan>, JoinFilter)>> {
+    let expr = Arc::clone(join_filter.expression());
+    let original_plan_schema = plan.schema();
+    let mut rewriter = JoinFilterRewriter::new(
+        join_side,
+        original_plan_schema.as_ref(),
+        join_filter.column_indices().to_vec(),
+        alias_generator,
+    );
+    let new_expr = rewriter.rewrite(expr)?;
+
+    if new_expr.transformed {
+        let new_join_side =
+            ProjectionExec::try_new(rewriter.join_side_projections, plan)?;
+        let new_schema = Arc::clone(&new_join_side.schema());
+
+        let (lhs_schema, rhs_schema) = match join_side {
+            JoinSide::Left => (new_schema, other_schema),
+            JoinSide::Right => (other_schema, new_schema),
+            JoinSide::None => unreachable!("Mark join not supported"),
+        };
+        let intermediate_schema = rewriter
+            .intermediate_column_indices
+            .iter()
+            .map(|ci| match ci.side {
+                JoinSide::Left => Arc::clone(&lhs_schema.fields[ci.index]),
+                JoinSide::Right => Arc::clone(&rhs_schema.fields[ci.index]),
+                JoinSide::None => unreachable!("Mark join not supported"),
+            })
+            .collect::<Fields>();
+
+        let join_filter = JoinFilter::new(
+            new_expr.data,
+            rewriter.intermediate_column_indices,
+            Arc::new(Schema::new(intermediate_schema)),
+        );
+        Ok(Transformed::yes((Arc::new(new_join_side), join_filter)))
+    } else {
+        Ok(Transformed::no((plan, join_filter)))
+    }
+}
+
+/// Creates a new [JoinFilter] and tries to minimize the internal schema.
+///
+/// This could eliminate some columns that were only part of a computation that has been pushed
+/// down. As this computation is now materialized on one side of the join, the original input
+/// columns are not needed anymore.
+fn minimize_join_filter(
+    expr: Arc<dyn PhysicalExpr>,
+    old_column_indices: &[ColumnIndex],
+    lhs_schema: &Schema,
+    rhs_schema: &Schema,
+) -> JoinFilter {
+    let mut used_columns = HashSet::new();
+    expr.apply(|expr| {
+        if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+            used_columns.insert(col.index());
+        }
+        Ok(TreeNodeRecursion::Continue)
+    })
+    .expect("Closure cannot fail");
+
+    let new_column_indices = old_column_indices
+        .iter()
+        .enumerate()
+        .filter(|(idx, _)| used_columns.contains(idx))
+        .map(|(_, ci)| ci.clone())
+        .collect::<Vec<_>>();
+    let fields = new_column_indices
+        .iter()
+        .map(|ci| match ci.side {
+            JoinSide::Left => lhs_schema.field(ci.index).clone(),
+            JoinSide::Right => rhs_schema.field(ci.index).clone(),
+            JoinSide::None => unreachable!("Mark join not supported"),
+        })
+        .collect::<Fields>();
+
+    let final_expr = expr
+        .transform_up(|expr| match expr.as_any().downcast_ref::<Column>() {
+            None => Ok(Transformed::no(expr)),
+            Some(column) => {
+                let new_idx = used_columns
+                    .iter()
+                    .filter(|idx| **idx < column.index())
+                    .count();
+                let new_column = Column::new(column.name(), new_idx);
+                Ok(Transformed::yes(
+                    Arc::new(new_column) as Arc<dyn PhysicalExpr>
+                ))
+            }
+        })
+        .expect("Closure cannot fail");
+
+    JoinFilter::new(
+        final_expr.data,
+        new_column_indices,
+        Arc::new(Schema::new(fields)),
+    )
+}
+
+/// Implements the push-down machinery.
+///
+/// The rewriter starts at the top of the filter expression and traverses the expression tree. For
+/// each (sub-)expression, the rewriter checks whether it only refers to one side of the join. If
+/// this is never the case, no subexpressions of the filter can be pushed down. If there is a
+/// subexpression that can be computed using only one side of the join, the entire subexpression is
+/// pushed down to the join side.
+struct JoinFilterRewriter<'a> {
+    join_side: JoinSide,
+    join_side_schema: &'a Schema,
+    join_side_projections: Vec<(Arc<dyn PhysicalExpr>, String)>,
+    intermediate_column_indices: Vec<ColumnIndex>,
+    alias_generator: &'a AliasGenerator,
+}
+
+impl<'a> JoinFilterRewriter<'a> {
+    /// Creates a new [JoinFilterRewriter].
+    fn new(
+        join_side: JoinSide,
+        join_side_schema: &'a Schema,
+        column_indices: Vec<ColumnIndex>,
+        alias_generator: &'a AliasGenerator,
+    ) -> Self {
+        let projections = join_side_schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(idx, field)| {
+                (
+                    Arc::new(Column::new(field.name(), idx)) as Arc<dyn PhysicalExpr>,
+                    field.name().to_string(),
+                )
+            })
+            .collect();
+
+        Self {
+            join_side,
+            join_side_schema,
+            join_side_projections: projections,
+            intermediate_column_indices: column_indices,
+            alias_generator,
+        }
+    }
+
+    /// Executes the push-down machinery on `expr`.
+    ///
+    /// See the [JoinFilterRewriter] for further information.
+    fn rewrite(
+        &mut self,
+        expr: Arc<dyn PhysicalExpr>,
+    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        let depends_on_this_side = self.depends_on_join_side(&expr, self.join_side)?;
+        // We don't push down things that do not depend on this side (other side or no side).
+        if !depends_on_this_side {
+            return Ok(Transformed::no(expr));
+        }
+
+        // Recurse if there is a dependency to both sides or if the entire expression is volatile.
+        let depends_on_other_side =
+            self.depends_on_join_side(&expr, self.join_side.negate())?;
+        if depends_on_other_side || is_volatile(&expr) {
+            return expr.map_children(|expr| self.rewrite(expr));
+        }
+
+        // There is only a dependency on this side.
+
+        // If this expression has no children, we do not push down, as it should already be a column
+        // reference.
+        if expr.children().is_empty() {
+            return Ok(Transformed::no(expr));
+        }
+
+        // Otherwise, we push down a projection.
+        let alias = self.alias_generator.next("join_proj_push_down");
+        let idx = self.create_new_column(alias.clone(), expr)?;
+
+        Ok(Transformed::yes(
+            Arc::new(Column::new(&alias, idx)) as Arc<dyn PhysicalExpr>
+        ))
+    }
+
+    /// Creates a new column in the current join side.
+    fn create_new_column(
+        &mut self,
+        name: String,
+        expr: Arc<dyn PhysicalExpr>,
+    ) -> Result<usize> {
+        // First, add a new projection. The expression must be rewritten, as it is no longer
+        // executed against the filter schema.
+        let new_idx = self.join_side_projections.len();
+        let rewritten_expr = expr.transform_up(|expr| {
+            Ok(match expr.as_any().downcast_ref::<Column>() {
+                None => Transformed::no(expr),
+                Some(column) => {
+                    let intermediate_column =
+                        &self.intermediate_column_indices[column.index()];
+                    assert_eq!(intermediate_column.side, self.join_side);
+
+                    let join_side_index = intermediate_column.index;
+                    let field = self.join_side_schema.field(join_side_index);
+                    let new_column = Column::new(field.name(), join_side_index);
+                    Transformed::yes(Arc::new(new_column) as Arc<dyn PhysicalExpr>)
+                }
+            })
+        })?;
+        self.join_side_projections.push((rewritten_expr.data, name));
+
+        // Then, update the column indices
+        let new_intermediate_idx = self.intermediate_column_indices.len();
+        let idx = ColumnIndex {
+            index: new_idx,
+            side: self.join_side,
+        };
+        self.intermediate_column_indices.push(idx);
+
+        Ok(new_intermediate_idx)
+    }
+
+    /// Checks whether the entire expression depends on the given `join_side`.
+    fn depends_on_join_side(
+        &mut self,
+        expr: &Arc<dyn PhysicalExpr>,
+        join_side: JoinSide,
+    ) -> Result<bool> {
+        let mut result = false;
+        expr.apply(|expr| match expr.as_any().downcast_ref::<Column>() {
+            None => Ok(TreeNodeRecursion::Continue),
+            Some(c) => {
+                let column_index = &self.intermediate_column_indices[c.index()];
+                if column_index.side == join_side {
+                    result = true;
+                    return Ok(TreeNodeRecursion::Stop);
+                }
+                Ok(TreeNodeRecursion::Continue)
+            }
+        })?;
+
+        Ok(result)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, FieldRef, Schema};
+    use datafusion_expr_common::operator::Operator;
+    use datafusion_functions::math::random;
+    use datafusion_physical_expr::ScalarFunctionExpr;
+    use datafusion_physical_expr::expressions::{binary, lit};
+    use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+    use datafusion_physical_plan::displayable;
+    use datafusion_physical_plan::empty::EmptyExec;
+    use insta::assert_snapshot;
+    use std::sync::Arc;
+
+    #[tokio::test]
+    async fn no_computation_does_not_project() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_x(),
+            None,
+            a_greater_than_x,
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=a@0 > x@1
+          EmptyExec
+          EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn simple_push_down() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_x(),
+            None,
+            a_plus_one_greater_than_x_plus_one,
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, x@2]
+          ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn does_not_push_down_short_circuiting_expressions() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_x(),
+            None,
+            |schema| {
+                binary(
+                    lit(false),
+                    Operator::And,
+                    a_plus_one_greater_than_x_plus_one(schema)?,
+                    schema,
+                )
+            },
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=false AND join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, x@2]
+          ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn does_not_push_down_volatile_functions() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_x(),
+            None,
+            a_plus_rand_greater_than_x,
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=a@0 + rand() > x@1
+          EmptyExec
+          EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn complex_schema_push_down() -> Result<()> {
+        let (left_schema, right_schema) = create_complex_schemas();
+
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_b_x_z(),
+            None,
+            a_plus_b_greater_than_x_plus_z,
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0, b@1, c@2, x@4, y@5, z@6]
+          ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, a@0 + b@1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, y@1 as y, z@2 as z, x@0 + z@2 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn push_down_with_existing_projections() -> Result<()> {
+        let (left_schema, right_schema) = create_complex_schemas();
+
+        let optimized_plan = run_test(
+            left_schema,
+            right_schema,
+            a_b_x_z(),
+            Some(vec![1, 3, 5]), // ("b", "x", "z")
+            a_plus_b_greater_than_x_plus_z,
+            JoinType::Inner,
+        )?;
+
+        assert_snapshot!(optimized_plan, @r"
+        NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[b@1, x@4, z@6]
+          ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, a@0 + b@1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, y@1 as y, z@2 as z, x@0 + z@2 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn left_semi_join_projection() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+
+        let left_semi_join_plan = run_test(
+            left_schema.clone(),
+            right_schema.clone(),
+            a_x(),
+            None,
+            a_plus_one_greater_than_x_plus_one,
+            JoinType::LeftSemi,
+        )?;
+
+        assert_snapshot!(left_semi_join_plan, @r"
+        NestedLoopJoinExec: join_type=LeftSemi, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[a@0]
+          ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn right_semi_join_projection() -> Result<()> {
+        let (left_schema, right_schema) = create_simple_schemas();
+        let right_semi_join_plan = run_test(
+            left_schema,
+            right_schema,
+            a_x(),
+            None,
+            a_plus_one_greater_than_x_plus_one,
+            JoinType::RightSemi,
+        )?;
+        assert_snapshot!(right_semi_join_plan, @r"
+        NestedLoopJoinExec: join_type=RightSemi, filter=join_proj_push_down_1@0 > join_proj_push_down_2@1, projection=[x@0]
+          ProjectionExec: expr=[a@0 as a, a@0 + 1 as join_proj_push_down_1]
+            EmptyExec
+          ProjectionExec: expr=[x@0 as x, x@0 + 1 as join_proj_push_down_2]
+            EmptyExec
+        ");
+        Ok(())
+    }
+
+    fn run_test(
+        left_schema: Schema,
+        right_schema: Schema,
+        column_indices: Vec<ColumnIndex>,
+        existing_projections: Option<Vec<usize>>,
+        filter_expr_builder: impl FnOnce(&Schema) -> Result<Arc<dyn PhysicalExpr>>,
+        join_type: JoinType,
+    ) -> Result<String> {
+        let left = Arc::new(EmptyExec::new(Arc::new(left_schema.clone())));
+        let right = Arc::new(EmptyExec::new(Arc::new(right_schema.clone())));
+
+        let join_fields: Vec<_> = column_indices
+            .iter()
+            .map(|ci| match ci.side {
+                JoinSide::Left => left_schema.field(ci.index).clone(),
+                JoinSide::Right => right_schema.field(ci.index).clone(),
+                JoinSide::None => unreachable!(),
+            })
+            .collect();
+        let join_schema = Arc::new(Schema::new(join_fields));
+
+        let filter_expr = filter_expr_builder(join_schema.as_ref())?;
+
+        let join_filter = JoinFilter::new(filter_expr, column_indices, join_schema);
+
+        let join = NestedLoopJoinExec::try_new(
+            left,
+            right,
+            Some(join_filter),
+            &join_type,
+            existing_projections,
+        )?;
+
+        let optimizer = ProjectionPushdown::new();
+        let optimized_plan = optimizer.optimize(Arc::new(join), &Default::default())?;
+
+        let displayable_plan = displayable(optimized_plan.as_ref()).indent(false);
+        Ok(displayable_plan.to_string())
+    }
+
+    fn create_simple_schemas() -> (Schema, Schema) {
+        let left_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let right_schema = Schema::new(vec![Field::new("x", DataType::Int32, false)]);
+
+        (left_schema, right_schema)
+    }
+
+    fn create_complex_schemas() -> (Schema, Schema) {
+        let left_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]);
+
+        let right_schema = Schema::new(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("y", DataType::Int32, false),
+            Field::new("z", DataType::Int32, false),
+        ]);
+
+        (left_schema, right_schema)
+    }
+
+    fn a_x() -> Vec<ColumnIndex> {
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+        ]
+    }
+
+    fn a_b_x_z() -> Vec<ColumnIndex> {
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 1,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Right,
+            },
+        ]
+    }
+
+    fn a_plus_one_greater_than_x_plus_one(
+        join_schema: &Schema,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        let left_expr = binary(
+            Arc::new(Column::new("a", 0)),
+            Operator::Plus,
+            lit(1),
+            join_schema,
+        )?;
+        let right_expr = binary(
+            Arc::new(Column::new("x", 1)),
+            Operator::Plus,
+            lit(1),
+            join_schema,
+        )?;
+        binary(left_expr, Operator::Gt, right_expr, join_schema)
+    }
+
+    fn a_plus_rand_greater_than_x(join_schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> {
+        let left_expr = binary(
+            Arc::new(Column::new("a", 0)),
+            Operator::Plus,
+            Arc::new(ScalarFunctionExpr::new(
+                "rand",
+                random(),
+                vec![],
+                FieldRef::new(Field::new("out", DataType::Float64, false)),
+                Arc::new(ConfigOptions::default()),
+            )),
+            join_schema,
+        )?;
+        let right_expr = Arc::new(Column::new("x", 1));
+        binary(left_expr, Operator::Gt, right_expr, join_schema)
+    }
+
+    fn a_greater_than_x(join_schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> {
+        binary(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Column::new("x", 1)),
+            join_schema,
+        )
+    }
+
+    fn a_plus_b_greater_than_x_plus_z(
+        join_schema: &Schema,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        let lhs = binary(
+            Arc::new(Column::new("a", 0)),
+            Operator::Plus,
+            Arc::new(Column::new("b", 1)),
+            join_schema,
+        )?;
+        let rhs = binary(
+            Arc::new(Column::new("x", 2)),
+            Operator::Plus,
+            Arc::new(Column::new("z", 3)),
+            join_schema,
+        )?;
+        binary(lhs, Operator::Gt, rhs, join_schema)
+    }
+}
diff --git a/datafusion/physical-optimizer/src/pushdown_sort.rs b/datafusion/physical-optimizer/src/pushdown_sort.rs
new file mode 100644
index 0000000000000..1fa15492d2a92
--- /dev/null
+++ b/datafusion/physical-optimizer/src/pushdown_sort.rs
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort Pushdown Optimization
+//!
+//! This optimizer attempts to push sort requirements down through the execution plan
+//! tree to data sources that can natively handle them (e.g., by scanning files in
+//! reverse order).
+//!
+//! ## How it works
+//!
+//! 1. Detects `SortExec` nodes in the plan
+//! 2. Calls `try_pushdown_sort()` on the input to recursively push the sort requirement
+//! 3. Each node type defines its own pushdown behavior:
+//!    - **Transparent nodes** (CoalesceBatchesExec, RepartitionExec, etc.) delegate to
+//!      their children and wrap the result
+//!    - **Data sources** (DataSourceExec) check if they can optimize for the ordering
+//!    - **Blocking nodes** return `Unsupported` to stop pushdown
+//! 4. Based on the result:
+//!    - `Exact`: Remove the Sort operator (data source guarantees perfect ordering)
+//!    - `Inexact`: Keep Sort but use optimized input (enables early termination for TopK)
+//!    - `Unsupported`: No change
+//!
+//! ## Current capabilities (Phase 1)
+//!
+//! - Reverse scan optimization: when required sort is the reverse of the data source's
+//!   natural ordering, enable reverse scanning (reading row groups in reverse order)
+//! - Supports prefix matching: if data has ordering [A DESC, B ASC] and query needs
+//!   [A ASC], reversing gives [A ASC, B DESC] which satisfies the requirement
+//!
+//! TODO Issue: <https://github.com/apache/datafusion/issues/19329>
+//! ## Future enhancements (Phase 2),
+//!
+//! - File reordering based on statistics
+//! - Return `Exact` when files are known to be perfectly sorted
+//! - Complete Sort elimination when ordering is guaranteed
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::SortOrderPushdownResult;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use std::sync::Arc;
+
+/// A PhysicalOptimizerRule that attempts to push down sort requirements to data sources.
+///
+/// See module-level documentation for details.
+#[derive(Debug, Clone, Default)]
+pub struct PushdownSort;
+
+impl PushdownSort {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl PhysicalOptimizerRule for PushdownSort {
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Check if sort pushdown optimization is enabled
+        if !config.optimizer.enable_sort_pushdown {
+            return Ok(plan);
+        }
+
+        // Use transform_down to find and optimize all SortExec nodes (including nested ones)
+        plan.transform_down(|plan: Arc<dyn ExecutionPlan>| {
+            // Check if this is a SortExec
+            let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() else {
+                return Ok(Transformed::no(plan));
+            };
+
+            let sort_input = Arc::clone(sort_exec.input());
+            let required_ordering = sort_exec.expr();
+
+            // Try to push the sort requirement down through the plan tree
+            // Each node type defines its own pushdown behavior via try_pushdown_sort()
+            match sort_input.try_pushdown_sort(required_ordering)? {
+                SortOrderPushdownResult::Exact { inner } => {
+                    // Data source guarantees perfect ordering - remove the Sort operator
+                    Ok(Transformed::yes(inner))
+                }
+                SortOrderPushdownResult::Inexact { inner } => {
+                    // Data source is optimized for the ordering but not perfectly sorted
+                    // Keep the Sort operator but use the optimized input
+                    // Benefits: TopK queries can terminate early, better cache locality
+                    Ok(Transformed::yes(Arc::new(
+                        SortExec::new(required_ordering.clone(), inner)
+                            .with_fetch(sort_exec.fetch())
+                            .with_preserve_partitioning(
+                                sort_exec.preserve_partitioning(),
+                            ),
+                    )))
+                }
+                SortOrderPushdownResult::Unsupported => {
+                    // Cannot optimize for this ordering - no change
+                    Ok(Transformed::no(plan))
+                }
+            }
+        })
+        .data()
+    }
+
+    fn name(&self) -> &str {
+        "PushdownSort"
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
diff --git a/datafusion/physical-optimizer/src/sanity_checker.rs b/datafusion/physical-optimizer/src/sanity_checker.rs
index 8edbb0f091140..bff33a281556d 100644
--- a/datafusion/physical-optimizer/src/sanity_checker.rs
+++ b/datafusion/physical-optimizer/src/sanity_checker.rs
@@ -32,7 +32,7 @@ use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported};
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::joins::SymmetricHashJoinExec;
-use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
+use datafusion_physical_plan::{ExecutionPlanProperties, get_plan_string};
 
 use crate::PhysicalOptimizerRule;
 use datafusion_physical_expr_common::sort_expr::format_physical_sort_requirement_list;
@@ -47,7 +47,7 @@ use itertools::izip;
 pub struct SanityCheckPlan {}
 
 impl SanityCheckPlan {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self {}
     }
@@ -78,13 +78,14 @@ pub fn check_finiteness_requirements(
     input: Arc<dyn ExecutionPlan>,
     optimizer_options: &OptimizerOptions,
 ) -> Result<Transformed<Arc<dyn ExecutionPlan>>> {
-    if let Some(exec) = input.as_any().downcast_ref::<SymmetricHashJoinExec>() {
-        if !(optimizer_options.allow_symmetric_joins_without_pruning
+    if let Some(exec) = input.as_any().downcast_ref::<SymmetricHashJoinExec>()
+        && !(optimizer_options.allow_symmetric_joins_without_pruning
             || (exec.check_if_order_information_available()? && is_prunable(exec)))
-        {
-            return plan_err!("Join operation cannot operate on a non-prunable stream without enabling \
-                              the 'allow_symmetric_joins_without_pruning' configuration flag");
-        }
+    {
+        return plan_err!(
+            "Join operation cannot operate on a non-prunable stream without enabling \
+                              the 'allow_symmetric_joins_without_pruning' configuration flag"
+        );
     }
 
     if matches!(
@@ -137,7 +138,8 @@ pub fn check_plan_sanity(
     ) {
         let child_eq_props = child.equivalence_properties();
         if let Some(sort_req) = sort_req {
-            if !child_eq_props.ordering_satisfy_requirement(&sort_req) {
+            let sort_req = sort_req.into_single();
+            if !child_eq_props.ordering_satisfy_requirement(sort_req.clone())? {
                 let plan_str = get_plan_string(&plan);
                 return plan_err!(
                     "Plan: {:?} does not satisfy order requirements: {}. Child-{} order: {}",
@@ -151,7 +153,8 @@ pub fn check_plan_sanity(
 
         if !child
             .output_partitioning()
-            .satisfy(&dist_req, child_eq_props)
+            .satisfaction(&dist_req, child_eq_props, true)
+            .is_satisfied()
         {
             let plan_str = get_plan_string(&plan);
             return plan_err!(
diff --git a/datafusion/physical-optimizer/src/topk_aggregation.rs b/datafusion/physical-optimizer/src/topk_aggregation.rs
index faedea55ca150..cec6bd70a2089 100644
--- a/datafusion/physical-optimizer/src/topk_aggregation.rs
+++ b/datafusion/physical-optimizer/src/topk_aggregation.rs
@@ -20,17 +20,16 @@
 use std::sync::Arc;
 
 use crate::PhysicalOptimizerRule;
-use arrow::datatypes::DataType;
+use datafusion_common::Result;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::Result;
 use datafusion_physical_expr::expressions::Column;
-use datafusion_physical_expr::LexOrdering;
-use datafusion_physical_plan::aggregates::AggregateExec;
+use datafusion_physical_plan::ExecutionPlan;
+use datafusion_physical_plan::aggregates::LimitOptions;
+use datafusion_physical_plan::aggregates::{AggregateExec, topk_types_supported};
 use datafusion_physical_plan::execution_plan::CardinalityEffect;
 use datafusion_physical_plan::projection::ProjectionExec;
 use datafusion_physical_plan::sorts::sort::SortExec;
-use datafusion_physical_plan::ExecutionPlan;
 use itertools::Itertools;
 
 /// An optimizer rule that passes a `limit` hint to aggregations if the whole result is not needed
@@ -49,40 +48,47 @@ impl TopKAggregation {
         order_desc: bool,
         limit: usize,
     ) -> Option<Arc<dyn ExecutionPlan>> {
-        // ensure the sort direction matches aggregate function
-        let (field, desc) = aggr.get_minmax_desc()?;
-        if desc != order_desc {
-            return None;
-        }
-        let group_key = aggr.group_expr().expr().iter().exactly_one().ok()?;
-        let kt = group_key.0.data_type(&aggr.input().schema()).ok()?;
-        if !kt.is_primitive()
-            && kt != DataType::Utf8
-            && kt != DataType::Utf8View
-            && kt != DataType::LargeUtf8
-        {
+        // Current only support single group key
+        let (group_key, group_key_alias) =
+            aggr.group_expr().expr().iter().exactly_one().ok()?;
+        let kt = group_key.data_type(&aggr.input().schema()).ok()?;
+        let vt = if let Some((field, _)) = aggr.get_minmax_desc() {
+            field.data_type().clone()
+        } else {
+            kt.clone()
+        };
+        if !topk_types_supported(&kt, &vt) {
             return None;
         }
         if aggr.filter_expr().iter().any(|e| e.is_some()) {
             return None;
         }
 
-        // ensure the sort is on the same field as the aggregate output
-        if order_by != field.name() {
+        // Check if this is ordering by an aggregate function (MIN/MAX)
+        if let Some((field, desc)) = aggr.get_minmax_desc() {
+            // ensure the sort direction matches aggregate function
+            if desc != order_desc {
+                return None;
+            }
+            // ensure the sort is on the same field as the aggregate output
+            if order_by != field.name() {
+                return None;
+            }
+        } else if aggr.aggr_expr().is_empty() {
+            // This is a GROUP BY without aggregates, check if ordering is on the group key itself
+            if order_by != group_key_alias {
+                return None;
+            }
+        } else {
+            // Has aggregates but not MIN/MAX, or doesn't DISTINCT
             return None;
         }
 
         // We found what we want: clone, copy the limit down, and return modified node
-        let new_aggr = AggregateExec::try_new(
-            *aggr.mode(),
-            aggr.group_expr().clone(),
-            aggr.aggr_expr().to_vec(),
-            aggr.filter_expr().to_vec(),
-            Arc::clone(aggr.input()),
-            aggr.input_schema(),
-        )
-        .expect("Unable to copy Aggregate!")
-        .with_limit(Some(limit));
+        let new_aggr = AggregateExec::with_new_limit_options(
+            aggr,
+            Some(LimitOptions::new_with_order(limit, order_desc)),
+        );
         Some(Arc::new(new_aggr))
     }
 
@@ -111,11 +117,12 @@ impl TopKAggregation {
                 }
             } else if let Some(proj) = plan.as_any().downcast_ref::<ProjectionExec>() {
                 // track renames due to successive projections
-                for (src_expr, proj_name) in proj.expr() {
-                    let Some(src_col) = src_expr.as_any().downcast_ref::<Column>() else {
+                for proj_expr in proj.expr() {
+                    let Some(src_col) = proj_expr.expr.as_any().downcast_ref::<Column>()
+                    else {
                         continue;
                     };
-                    if *proj_name == cur_col_name {
+                    if proj_expr.alias == cur_col_name {
                         cur_col_name = src_col.name().to_string();
                     }
                 }
@@ -131,7 +138,7 @@ impl TopKAggregation {
             Ok(Transformed::no(plan))
         };
         let child = Arc::clone(child).transform_down(closure).data().ok()?;
-        let sort = SortExec::new(LexOrdering::new(sort.expr().to_vec()), child)
+        let sort = SortExec::new(sort.expr().clone(), child)
             .with_fetch(sort.fetch())
             .with_preserve_partitioning(sort.preserve_partitioning());
         Some(Arc::new(sort))
diff --git a/datafusion/physical-optimizer/src/topk_repartition.rs b/datafusion/physical-optimizer/src/topk_repartition.rs
new file mode 100644
index 0000000000000..668e0d273288b
--- /dev/null
+++ b/datafusion/physical-optimizer/src/topk_repartition.rs
@@ -0,0 +1,368 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Push TopK (Sort with fetch) past Hash Repartition
+//!
+//! When a `SortExec` with a fetch limit (TopK) sits above a
+//! `RepartitionExec(Hash)`, and the hash partition expressions are a prefix
+//! of the sort expressions, this rule inserts a copy of the TopK below
+//! the repartition to reduce the volume of data flowing through the shuffle.
+//!
+//! This is correct because the hash partition key being a prefix of the sort
+//! key guarantees that all rows with the same partition key end up in the same
+//! output partition. Therefore, rows that survive the final TopK after
+//! repartitioning will always survive the pre-repartition TopK as well.
+//!
+//! ## Example
+//!
+//! Before:
+//! ```text
+//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC]
+//!   RepartitionExec: Hash([a], 4)
+//!     DataSourceExec
+//! ```
+//!
+//! After:
+//! ```text
+//! SortExec: TopK(fetch=3), expr=[a ASC, b ASC]
+//!   RepartitionExec: Hash([a], 4)
+//!     SortExec: TopK(fetch=3), expr=[a ASC, b ASC]
+//!       DataSourceExec
+//! ```
+
+use crate::PhysicalOptimizerRule;
+use datafusion_common::Result;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use std::sync::Arc;
+// CoalesceBatchesExec is deprecated on main (replaced by arrow-rs BatchCoalescer),
+// but older DataFusion versions may still insert it between SortExec and RepartitionExec.
+#[expect(deprecated)]
+use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::{ExecutionPlan, Partitioning};
+
+/// A physical optimizer rule that pushes TopK (Sort with fetch) past
+/// hash repartition when the partition key is a prefix of the sort key.
+///
+/// See module-level documentation for details.
+#[derive(Debug, Clone, Default)]
+pub struct TopKRepartition;
+
+impl TopKRepartition {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl PhysicalOptimizerRule for TopKRepartition {
+    #[expect(deprecated)] // CoalesceBatchesExec: kept for older DataFusion versions
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        if !config.optimizer.enable_topk_repartition {
+            return Ok(plan);
+        }
+        plan.transform_down(|node| {
+            // Match SortExec with fetch (TopK)
+            let Some(sort_exec) = node.as_any().downcast_ref::<SortExec>() else {
+                return Ok(Transformed::no(node));
+            };
+            let Some(fetch) = sort_exec.fetch() else {
+                return Ok(Transformed::no(node));
+            };
+
+            // The child might be a CoalesceBatchesExec; look through it
+            let sort_input = sort_exec.input();
+            let sort_any = sort_input.as_any();
+            let (repart_parent, repart_exec) = if let Some(rp) =
+                sort_any.downcast_ref::<RepartitionExec>()
+            {
+                // found a RepartitionExec, use it
+                (None, rp)
+            } else if let Some(cb_exec) = sort_any.downcast_ref::<CoalesceBatchesExec>() {
+                // There's a CoalesceBatchesExec between TopK & RepartitionExec
+                // in this case we will need to reconstruct both nodes
+                let cb_input = cb_exec.input();
+                let Some(rp) = cb_input.as_any().downcast_ref::<RepartitionExec>() else {
+                    return Ok(Transformed::no(node));
+                };
+                (Some(Arc::clone(sort_input)), rp)
+            } else {
+                return Ok(Transformed::no(node));
+            };
+
+            // Only handle Hash partitioning
+            let Partitioning::Hash(hash_exprs, num_partitions) =
+                repart_exec.partitioning()
+            else {
+                return Ok(Transformed::no(node));
+            };
+
+            let sort_exprs = sort_exec.expr();
+
+            // Check that hash expressions are a prefix of the sort expressions.
+            // Each hash expression must match the corresponding sort expression
+            // (ignoring sort options like ASC/DESC since hash doesn't care about order).
+            if hash_exprs.len() > sort_exprs.len() {
+                return Ok(Transformed::no(node));
+            }
+            for (hash_expr, sort_expr) in hash_exprs.iter().zip(sort_exprs.iter()) {
+                if !hash_expr.eq(&sort_expr.expr) {
+                    return Ok(Transformed::no(node));
+                }
+            }
+
+            // Don't push if the input to the repartition is already bounded
+            // (e.g., another TopK), as it would be redundant.
+            let repart_input = repart_exec.input();
+            if repart_input.as_any().downcast_ref::<SortExec>().is_some() {
+                return Ok(Transformed::no(node));
+            }
+
+            // Insert a copy of the TopK below the repartition
+            let new_sort: Arc<dyn ExecutionPlan> = Arc::new(
+                SortExec::new(sort_exprs.clone(), Arc::clone(repart_input))
+                    .with_fetch(Some(fetch))
+                    .with_preserve_partitioning(sort_exec.preserve_partitioning()),
+            );
+
+            let new_partitioning =
+                Partitioning::Hash(hash_exprs.clone(), *num_partitions);
+            let new_repartition: Arc<dyn ExecutionPlan> =
+                Arc::new(RepartitionExec::try_new(new_sort, new_partitioning)?);
+
+            // Rebuild the tree above the repartition
+            let new_sort_input = if let Some(parent) = repart_parent {
+                parent.with_new_children(vec![new_repartition])?
+            } else {
+                new_repartition
+            };
+
+            let new_top_sort: Arc<dyn ExecutionPlan> = Arc::new(
+                SortExec::new(sort_exprs.clone(), new_sort_input)
+                    .with_fetch(Some(fetch))
+                    .with_preserve_partitioning(sort_exec.preserve_partitioning()),
+            );
+
+            Ok(Transformed::yes(new_top_sort))
+        })
+        .data()
+    }
+
+    fn name(&self) -> &str {
+        "TopKRepartition"
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_physical_expr::expressions::col;
+    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_plan::displayable;
+    use datafusion_physical_plan::test::scan_partitioned;
+    use insta::assert_snapshot;
+    use std::sync::Arc;
+
+    fn schema() -> Arc<Schema> {
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Int64, false),
+        ]))
+    }
+
+    fn sort_exprs(schema: &Schema) -> LexOrdering {
+        LexOrdering::new(vec![
+            PhysicalSortExpr::new_default(col("a", schema).unwrap()).asc(),
+            PhysicalSortExpr::new_default(col("b", schema).unwrap()).asc(),
+        ])
+        .unwrap()
+    }
+
+    /// TopK above Hash(a) repartition should get pushed below it,
+    /// because `a` is a prefix of the sort key `(a, b)`.
+    #[test]
+    fn topk_pushed_below_hash_repartition() {
+        let s = schema();
+        let input = scan_partitioned(1);
+        let ordering = sort_exprs(&s);
+
+        let repartition = Arc::new(
+            RepartitionExec::try_new(
+                input,
+                Partitioning::Hash(vec![col("a", &s).unwrap()], 4),
+            )
+            .unwrap(),
+        );
+
+        let sort = Arc::new(
+            SortExec::new(ordering, repartition)
+                .with_fetch(Some(3))
+                .with_preserve_partitioning(true),
+        );
+
+        let config = ConfigOptions::new();
+        let optimized = TopKRepartition::new().optimize(sort, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        assert_snapshot!(display, @r"
+        SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC]
+          RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true
+            SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+              DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+
+    /// TopK with no fetch (unbounded sort) should NOT be pushed.
+    #[test]
+    fn unbounded_sort_not_pushed() {
+        let s = schema();
+        let input = scan_partitioned(1);
+        let ordering = sort_exprs(&s);
+
+        let repartition = Arc::new(
+            RepartitionExec::try_new(
+                input,
+                Partitioning::Hash(vec![col("a", &s).unwrap()], 4),
+            )
+            .unwrap(),
+        );
+
+        let sort: Arc<dyn ExecutionPlan> = Arc::new(
+            SortExec::new(ordering, repartition).with_preserve_partitioning(true),
+        );
+
+        let config = ConfigOptions::new();
+        let optimized = TopKRepartition::new().optimize(sort, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        assert_snapshot!(display, @r"
+        SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1
+            DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+
+    /// Hash key NOT a prefix of sort key should NOT be pushed.
+    #[test]
+    fn non_prefix_hash_key_not_pushed() {
+        let s = schema();
+        let input = scan_partitioned(1);
+        let ordering = sort_exprs(&s);
+
+        // Hash by `b`, but sort by `(a, b)` - b is not a prefix
+        let repartition = Arc::new(
+            RepartitionExec::try_new(
+                input,
+                Partitioning::Hash(vec![col("b", &s).unwrap()], 4),
+            )
+            .unwrap(),
+        );
+
+        let sort: Arc<dyn ExecutionPlan> = Arc::new(
+            SortExec::new(ordering, repartition)
+                .with_fetch(Some(3))
+                .with_preserve_partitioning(true),
+        );
+
+        let config = ConfigOptions::new();
+        let optimized = TopKRepartition::new().optimize(sort, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        assert_snapshot!(display, @r"
+        SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=1
+            DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+
+    /// TopK above CoalesceBatchesExec above Hash(a) repartition should
+    /// push through both, inserting a new TopK below the repartition.
+    #[expect(deprecated)]
+    #[test]
+    fn topk_pushed_through_coalesce_batches() {
+        let s = schema();
+        let input = scan_partitioned(1);
+        let ordering = sort_exprs(&s);
+
+        let repartition = Arc::new(
+            RepartitionExec::try_new(
+                input,
+                Partitioning::Hash(vec![col("a", &s).unwrap()], 4),
+            )
+            .unwrap(),
+        );
+
+        let coalesce: Arc<dyn ExecutionPlan> =
+            Arc::new(CoalesceBatchesExec::new(repartition, 8192));
+
+        let sort = Arc::new(
+            SortExec::new(ordering, coalesce)
+                .with_fetch(Some(3))
+                .with_preserve_partitioning(true),
+        );
+
+        let config = ConfigOptions::new();
+        let optimized = TopKRepartition::new().optimize(sort, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        assert_snapshot!(display, @r"
+        SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true], sort_prefix=[a@0 ASC]
+          CoalesceBatchesExec: target_batch_size=8192
+            RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1, maintains_sort_order=true
+              SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+                DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+
+    /// RoundRobin repartition should NOT be pushed.
+    #[test]
+    fn round_robin_not_pushed() {
+        let s = schema();
+        let input = scan_partitioned(1);
+        let ordering = sort_exprs(&s);
+
+        let repartition = Arc::new(
+            RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(4)).unwrap(),
+        );
+
+        let sort: Arc<dyn ExecutionPlan> = Arc::new(
+            SortExec::new(ordering, repartition)
+                .with_fetch(Some(3))
+                .with_preserve_partitioning(true),
+        );
+
+        let config = ConfigOptions::new();
+        let optimized = TopKRepartition::new().optimize(sort, &config).unwrap();
+
+        let display = displayable(optimized.as_ref()).indent(true).to_string();
+        assert_snapshot!(display, @r"
+        SortExec: TopK(fetch=3), expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true]
+          RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+            DataSourceExec: partitions=1, partition_sizes=[1]
+        ");
+    }
+}
diff --git a/datafusion/physical-optimizer/src/update_aggr_exprs.rs b/datafusion/physical-optimizer/src/update_aggr_exprs.rs
index ae1a38230d044..67127c2a238f9 100644
--- a/datafusion/physical-optimizer/src/update_aggr_exprs.rs
+++ b/datafusion/physical-optimizer/src/update_aggr_exprs.rs
@@ -22,18 +22,14 @@ use std::sync::Arc;
 
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{plan_datafusion_err, Result};
+use datafusion_common::{Result, plan_datafusion_err};
 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
-use datafusion_physical_expr::LexRequirement;
-use datafusion_physical_expr::{
-    reverse_order_bys, EquivalenceProperties, PhysicalSortRequirement,
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirement};
+use datafusion_physical_plan::aggregates::{
+    AggregateExec, AggregateInputMode, concat_slices,
 };
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use datafusion_physical_plan::aggregates::concat_slices;
 use datafusion_physical_plan::windows::get_ordered_partition_by_indices;
-use datafusion_physical_plan::{
-    aggregates::AggregateExec, ExecutionPlan, ExecutionPlanProperties,
-};
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
 use crate::PhysicalOptimizerRule;
 
@@ -55,7 +51,7 @@ use crate::PhysicalOptimizerRule;
 pub struct OptimizeAggregateOrder {}
 
 impl OptimizeAggregateOrder {
-    #[allow(missing_docs)]
+    #[expect(missing_docs)]
     pub fn new() -> Self {
         Self::default()
     }
@@ -87,36 +83,34 @@ impl PhysicalOptimizerRule for OptimizeAggregateOrder {
                 // ordering fields may be pruned out by first stage aggregates.
                 // Hence, necessary information for proper merge is added during
                 // the first stage to the state field, which the final stage uses.
-                if !aggr_exec.mode().is_first_stage() {
+                if aggr_exec.mode().input_mode() == AggregateInputMode::Partial {
                     return Ok(Transformed::no(plan));
                 }
                 let input = aggr_exec.input();
-                let mut aggr_expr = aggr_exec.aggr_expr().to_vec();
+                let mut aggr_exprs = aggr_exec.aggr_expr().to_vec();
 
                 let groupby_exprs = aggr_exec.group_expr().input_exprs();
                 // If the existing ordering satisfies a prefix of the GROUP BY
                 // expressions, prefix requirements with this section. In this
                 // case, aggregation will work more efficiently.
-                let indices = get_ordered_partition_by_indices(&groupby_exprs, input);
+                let indices = get_ordered_partition_by_indices(&groupby_exprs, input)?;
                 let requirement = indices
                     .iter()
                     .map(|&idx| {
                         PhysicalSortRequirement::new(
-                            Arc::<dyn datafusion_physical_plan::PhysicalExpr>::clone(
-                                &groupby_exprs[idx],
-                            ),
+                            Arc::clone(&groupby_exprs[idx]),
                             None,
                         )
                     })
                     .collect::<Vec<_>>();
 
-                aggr_expr = try_convert_aggregate_if_better(
-                    aggr_expr,
+                aggr_exprs = try_convert_aggregate_if_better(
+                    aggr_exprs,
                     &requirement,
                     input.equivalence_properties(),
                 )?;
 
-                let aggr_exec = aggr_exec.with_new_aggr_exprs(aggr_expr);
+                let aggr_exec = aggr_exec.with_new_aggr_exprs(aggr_exprs);
 
                 Ok(Transformed::yes(Arc::new(aggr_exec) as _))
             } else {
@@ -160,33 +154,30 @@ fn try_convert_aggregate_if_better(
     aggr_exprs
         .into_iter()
         .map(|aggr_expr| {
-            let aggr_sort_exprs = aggr_expr
-                .order_bys()
-                .unwrap_or_else(|| LexOrdering::empty());
-            let reverse_aggr_sort_exprs = reverse_order_bys(aggr_sort_exprs);
-            let aggr_sort_reqs = LexRequirement::from(aggr_sort_exprs.clone());
-            let reverse_aggr_req = LexRequirement::from(reverse_aggr_sort_exprs);
-
+            let order_bys = aggr_expr.order_bys();
             // If the aggregate expression benefits from input ordering, and
             // there is an actual ordering enabling this, try to update the
             // aggregate expression to benefit from the existing ordering.
             // Otherwise, leave it as is.
-            if aggr_expr.order_sensitivity().is_beneficial() && !aggr_sort_reqs.is_empty()
-            {
-                let reqs = LexRequirement {
-                    inner: concat_slices(prefix_requirement, &aggr_sort_reqs),
-                };
-
-                let prefix_requirement = LexRequirement {
-                    inner: prefix_requirement.to_vec(),
-                };
-
-                if eq_properties.ordering_satisfy_requirement(&reqs) {
+            if !aggr_expr.order_sensitivity().is_beneficial() {
+                Ok(aggr_expr)
+            } else if !order_bys.is_empty() {
+                if eq_properties.ordering_satisfy_requirement(concat_slices(
+                    prefix_requirement,
+                    &order_bys
+                        .iter()
+                        .map(|e| e.clone().into())
+                        .collect::<Vec<_>>(),
+                ))? {
                     // Existing ordering satisfies the aggregator requirements:
                     aggr_expr.with_beneficial_ordering(true)?.map(Arc::new)
-                } else if eq_properties.ordering_satisfy_requirement(&LexRequirement {
-                    inner: concat_slices(&prefix_requirement, &reverse_aggr_req),
-                }) {
+                } else if eq_properties.ordering_satisfy_requirement(concat_slices(
+                    prefix_requirement,
+                    &order_bys
+                        .iter()
+                        .map(|e| e.reverse().into())
+                        .collect::<Vec<_>>(),
+                ))? {
                     // Converting to reverse enables more efficient execution
                     // given the existing ordering (if possible):
                     aggr_expr
diff --git a/datafusion/physical-optimizer/src/utils.rs b/datafusion/physical-optimizer/src/utils.rs
index 57a193315a5c3..13a1745216e83 100644
--- a/datafusion/physical-optimizer/src/utils.rs
+++ b/datafusion/physical-optimizer/src/utils.rs
@@ -17,8 +17,8 @@
 
 use std::sync::Arc;
 
-use datafusion_physical_expr::LexRequirement;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_common::Result;
+use datafusion_physical_expr::{LexOrdering, LexRequirement};
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion_physical_plan::repartition::RepartitionExec;
@@ -32,22 +32,26 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 /// This utility function adds a `SortExec` above an operator according to the
 /// given ordering requirements while preserving the original partitioning.
 ///
-/// Note that this updates the plan in both the [`PlanContext.children`] and
-/// the [`PlanContext.plan`]'s children. Therefore its not required to sync
+/// Note that this updates the plan in both the `PlanContext.children` and
+/// the `PlanContext.plan`'s children. Therefore its not required to sync
 /// the child plans with [`PlanContext::update_plan_from_children`].
 pub fn add_sort_above<T: Clone + Default>(
     node: PlanContext<T>,
     sort_requirements: LexRequirement,
     fetch: Option<usize>,
 ) -> PlanContext<T> {
-    let mut sort_expr = LexOrdering::from(sort_requirements);
-    sort_expr.retain(|sort_expr| {
-        !node
-            .plan
+    let mut sort_reqs: Vec<_> = sort_requirements.into();
+    sort_reqs.retain(|sort_expr| {
+        node.plan
             .equivalence_properties()
             .is_expr_constant(&sort_expr.expr)
+            .is_none()
     });
-    let mut new_sort = SortExec::new(sort_expr, Arc::clone(&node.plan)).with_fetch(fetch);
+    let sort_exprs = sort_reqs.into_iter().map(Into::into).collect::<Vec<_>>();
+    let Some(ordering) = LexOrdering::new(sort_exprs) else {
+        return node;
+    };
+    let mut new_sort = SortExec::new(ordering, Arc::clone(&node.plan)).with_fetch(fetch);
     if node.plan.output_partitioning().partition_count() > 1 {
         new_sort = new_sort.with_preserve_partitioning(true);
     }
@@ -61,15 +65,15 @@ pub fn add_sort_above_with_check<T: Clone + Default>(
     node: PlanContext<T>,
     sort_requirements: LexRequirement,
     fetch: Option<usize>,
-) -> PlanContext<T> {
+) -> Result<PlanContext<T>> {
     if !node
         .plan
         .equivalence_properties()
-        .ordering_satisfy_requirement(&sort_requirements)
+        .ordering_satisfy_requirement(sort_requirements.clone())?
     {
-        add_sort_above(node, sort_requirements, fetch)
+        Ok(add_sort_above(node, sort_requirements, fetch))
     } else {
-        node
+        Ok(node)
     }
 }
 
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index 4f58b575f3a0b..7acb21b8f3b93 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -31,27 +31,32 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [features]
 force_hash_collisions = []
-bench = []
+test_utils = ["arrow/test_utils"]
+tokio_coop = []
+tokio_coop_fallback = []
 
 [lib]
 name = "datafusion_physical_plan"
 
 [dependencies]
-ahash = { workspace = true }
 arrow = { workspace = true }
 arrow-ord = { workspace = true }
 arrow-schema = { workspace = true }
 async-trait = { workspace = true }
-chrono = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
+datafusion-common = { workspace = true }
 datafusion-common-runtime = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-functions = { workspace = true }
+datafusion-functions-aggregate-common = { workspace = true }
 datafusion-functions-window-common = { workspace = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
 datafusion-physical-expr-common = { workspace = true }
@@ -61,6 +66,7 @@ hashbrown = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true, features = ["use_std"] }
 log = { workspace = true }
+num-traits = { workspace = true }
 parking_lot = { workspace = true }
 pin-project-lite = "^0.2.7"
 tokio = { workspace = true }
@@ -73,7 +79,6 @@ insta = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 rstest_reuse = "0.7.0"
-tempfile = "3.19.1"
 tokio = { workspace = true, features = [
     "rt-multi-thread",
     "fs",
@@ -91,4 +96,13 @@ name = "spill_io"
 [[bench]]
 harness = false
 name = "sort_preserving_merge"
-required-features = ["bench"]
+
+[[bench]]
+harness = false
+name = "sort_merge_join"
+required-features = ["test_utils"]
+
+[[bench]]
+harness = false
+name = "aggregate_vectorized"
+required-features = ["test_utils"]
diff --git a/datafusion/physical-plan/README.md b/datafusion/physical-plan/README.md
index ec604253fd2e5..3a33100f2f350 100644
--- a/datafusion/physical-plan/README.md
+++ b/datafusion/physical-plan/README.md
@@ -17,11 +17,17 @@
   under the License.
 -->
 
-# DataFusion Physical Plan
+# Apache DataFusion Physical Plan
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate is a submodule of DataFusion that contains the `ExecutionPlan` trait and the various implementations of that
 trait for built in operators such as filters, projections, joins, aggregations, etc.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/physical-plan/benches/aggregate_vectorized.rs b/datafusion/physical-plan/benches/aggregate_vectorized.rs
new file mode 100644
index 0000000000000..a93088a4ebe72
--- /dev/null
+++ b/datafusion/physical-plan/benches/aggregate_vectorized.rs
@@ -0,0 +1,304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::{Int32Type, StringViewType};
+use arrow::util::bench_util::{
+    create_primitive_array, create_string_view_array_with_len,
+    create_string_view_array_with_max_len,
+};
+use arrow::util::test_util::seedable_rng;
+use arrow_schema::DataType;
+use criterion::measurement::WallTime;
+use criterion::{
+    BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main,
+};
+use datafusion_physical_plan::aggregates::group_values::multi_group_by::GroupColumn;
+use datafusion_physical_plan::aggregates::group_values::multi_group_by::bytes_view::ByteViewGroupValueBuilder;
+use datafusion_physical_plan::aggregates::group_values::multi_group_by::primitive::PrimitiveGroupValueBuilder;
+use rand::distr::{Bernoulli, Distribution};
+use std::hint::black_box;
+use std::sync::Arc;
+
+const SIZES: [usize; 3] = [1_000, 10_000, 100_000];
+const NULL_DENSITIES: [f32; 3] = [0.0, 0.1, 0.5];
+
+fn bench_vectorized_append(c: &mut Criterion) {
+    byte_view_vectorized_append(c);
+    primitive_vectorized_append(c);
+}
+
+fn byte_view_vectorized_append(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ByteViewGroupValueBuilder_vectorized_append");
+
+    for &size in &SIZES {
+        let rows: Vec<usize> = (0..size).collect();
+
+        for &null_density in &NULL_DENSITIES {
+            let input = create_string_view_array_with_len(size, null_density, 8, false);
+            let input: ArrayRef = Arc::new(input);
+
+            bytes_bench(&mut group, "inline", size, &rows, null_density, &input);
+        }
+    }
+
+    for &size in &SIZES {
+        let rows: Vec<usize> = (0..size).collect();
+
+        for &null_density in &NULL_DENSITIES {
+            let input = create_string_view_array_with_len(size, null_density, 64, true);
+            let input: ArrayRef = Arc::new(input);
+
+            bytes_bench(&mut group, "scenario", size, &rows, null_density, &input);
+        }
+    }
+
+    for &size in &SIZES {
+        let rows: Vec<usize> = (0..size).collect();
+
+        for &null_density in &NULL_DENSITIES {
+            let input = create_string_view_array_with_max_len(size, null_density, 400);
+            let input: ArrayRef = Arc::new(input);
+
+            bytes_bench(&mut group, "random", size, &rows, null_density, &input);
+        }
+    }
+
+    group.finish();
+}
+
+fn bytes_bench(
+    group: &mut BenchmarkGroup<WallTime>,
+    bench_prefix: &str,
+    size: usize,
+    rows: &Vec<usize>,
+    null_density: f32,
+    input: &ArrayRef,
+) {
+    // vectorized_append
+    let function_name = format!("{bench_prefix}_null_{null_density:.1}_size_{size}");
+    let id = BenchmarkId::new(&function_name, "vectorized_append");
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            let mut builder = ByteViewGroupValueBuilder::<StringViewType>::new();
+            builder.vectorized_append(input, rows).unwrap();
+        });
+    });
+
+    // append_val
+    let id = BenchmarkId::new(&function_name, "append_val");
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            let mut builder = ByteViewGroupValueBuilder::<StringViewType>::new();
+            for &i in rows {
+                builder.append_val(input, i).unwrap();
+            }
+        });
+    });
+
+    // vectorized_equal_to
+    vectorized_equal_to(
+        group,
+        ByteViewGroupValueBuilder::<StringViewType>::new(),
+        &function_name,
+        rows,
+        input,
+        "all_true",
+        vec![true; size],
+    );
+    vectorized_equal_to(
+        group,
+        ByteViewGroupValueBuilder::<StringViewType>::new(),
+        &function_name,
+        rows,
+        input,
+        "0.75 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.75).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    vectorized_equal_to(
+        group,
+        ByteViewGroupValueBuilder::<StringViewType>::new(),
+        &function_name,
+        rows,
+        input,
+        "0.5 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.5).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    vectorized_equal_to(
+        group,
+        ByteViewGroupValueBuilder::<StringViewType>::new(),
+        &function_name,
+        rows,
+        input,
+        "0.25 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.25).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    // Not adding 0 true case here as if we optimize for 0 true cases the caller should avoid calling this method at all
+}
+
+fn primitive_vectorized_append(c: &mut Criterion) {
+    let mut group = c.benchmark_group("PrimitiveGroupValueBuilder_vectorized_append");
+
+    for &size in &SIZES {
+        let rows: Vec<usize> = (0..size).collect();
+
+        for &null_density in &NULL_DENSITIES {
+            if null_density == 0.0 {
+                bench_single_primitive::<false>(&mut group, size, &rows, null_density)
+            }
+            bench_single_primitive::<true>(&mut group, size, &rows, null_density);
+        }
+    }
+
+    group.finish();
+}
+
+fn bench_single_primitive<const NULLABLE: bool>(
+    group: &mut BenchmarkGroup<WallTime>,
+    size: usize,
+    rows: &Vec<usize>,
+    null_density: f32,
+) {
+    if !NULLABLE {
+        assert_eq!(
+            null_density, 0.0,
+            "non-nullable case must have null_density 0"
+        );
+    }
+
+    let input = create_primitive_array::<Int32Type>(size, null_density);
+    let input: ArrayRef = Arc::new(input);
+    let function_name = format!("null_{null_density:.1}_nullable_{NULLABLE}_size_{size}");
+
+    // vectorized_append
+    let id = BenchmarkId::new(&function_name, "vectorized_append");
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            let mut builder =
+                PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32);
+            builder.vectorized_append(&input, rows).unwrap();
+        });
+    });
+
+    // append_val
+    let id = BenchmarkId::new(&function_name, "append_val");
+    group.bench_function(id, |b| {
+        b.iter(|| {
+            let mut builder =
+                PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32);
+            for &i in rows {
+                builder.append_val(&input, i).unwrap();
+            }
+        });
+    });
+
+    // vectorized_equal_to
+    vectorized_equal_to(
+        group,
+        PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32),
+        &function_name,
+        rows,
+        &input,
+        "all_true",
+        vec![true; size],
+    );
+    vectorized_equal_to(
+        group,
+        PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32),
+        &function_name,
+        rows,
+        &input,
+        "0.75 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.75).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    vectorized_equal_to(
+        group,
+        PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32),
+        &function_name,
+        rows,
+        &input,
+        "0.5 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.5).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    vectorized_equal_to(
+        group,
+        PrimitiveGroupValueBuilder::<Int32Type, NULLABLE>::new(DataType::Int32),
+        &function_name,
+        rows,
+        &input,
+        "0.25 true",
+        {
+            let mut rng = seedable_rng();
+            let d = Bernoulli::new(0.25).unwrap();
+            (0..size).map(|_| d.sample(&mut rng)).collect::<Vec<_>>()
+        },
+    );
+    // Not adding 0 true case here as if we optimize for 0 true cases the caller should avoid calling this method at all
+}
+
+/// Test `vectorized_equal_to` with different number of true in the initial results
+#[expect(clippy::needless_pass_by_value)]
+fn vectorized_equal_to<GroupColumnBuilder: GroupColumn>(
+    group: &mut BenchmarkGroup<WallTime>,
+    mut builder: GroupColumnBuilder,
+    function_name: &str,
+    rows: &[usize],
+    input: &ArrayRef,
+    equal_to_result_description: &str,
+    equal_to_results: Vec<bool>,
+) {
+    let id = BenchmarkId::new(
+        function_name,
+        format!("vectorized_equal_to_{equal_to_result_description}"),
+    );
+    group.bench_function(id, |b| {
+        builder.vectorized_append(input, rows).unwrap();
+
+        b.iter(|| {
+            // Cloning is a must as `vectorized_equal_to` will modify the input vec
+            // and without cloning all benchmarks after the first one won't be meaningful
+            let mut equal_to_results = equal_to_results.clone();
+            builder.vectorized_equal_to(rows, input, rows, &mut equal_to_results);
+
+            // Make sure that the compiler does not optimize away the call
+            black_box(equal_to_results);
+        });
+    });
+}
+
+criterion_group!(benches, bench_vectorized_append);
+criterion_main!(benches);
diff --git a/datafusion/physical-plan/benches/partial_ordering.rs b/datafusion/physical-plan/benches/partial_ordering.rs
index 22d18dd248911..bdadd6274b75e 100644
--- a/datafusion/physical-plan/benches/partial_ordering.rs
+++ b/datafusion/physical-plan/benches/partial_ordering.rs
@@ -18,11 +18,10 @@
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int32Array};
-use arrow_schema::{DataType, Field, Schema, SortOptions};
-use criterion::{criterion_group, criterion_main, Criterion};
-use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr};
 use datafusion_physical_plan::aggregates::order::GroupOrderingPartial;
 
+use criterion::{Criterion, criterion_group, criterion_main};
+
 const BATCH_SIZE: usize = 8192;
 
 fn create_test_arrays(num_columns: usize) -> Vec<ArrayRef> {
@@ -39,22 +38,7 @@ fn bench_new_groups(c: &mut Criterion) {
 
     // Test with 1, 2, 4, and 8 order indices
     for num_columns in [1, 2, 4, 8] {
-        let fields: Vec<Field> = (0..num_columns)
-            .map(|i| Field::new(format!("col{i}"), DataType::Int32, false))
-            .collect();
-        let schema = Schema::new(fields);
-
         let order_indices: Vec<usize> = (0..num_columns).collect();
-        let ordering = LexOrdering::new(
-            (0..num_columns)
-                .map(|i| {
-                    PhysicalSortExpr::new(
-                        col(&format!("col{i}"), &schema).unwrap(),
-                        SortOptions::default(),
-                    )
-                })
-                .collect(),
-        );
 
         group.bench_function(format!("order_indices_{num_columns}"), |b| {
             let batch_group_values = create_test_arrays(num_columns);
@@ -62,8 +46,7 @@ fn bench_new_groups(c: &mut Criterion) {
 
             b.iter(|| {
                 let mut ordering =
-                    GroupOrderingPartial::try_new(&schema, &order_indices, &ordering)
-                        .unwrap();
+                    GroupOrderingPartial::try_new(order_indices.clone()).unwrap();
                 ordering
                     .new_groups(&batch_group_values, &group_indices, BATCH_SIZE)
                     .unwrap();
diff --git a/datafusion/physical-plan/benches/sort_merge_join.rs b/datafusion/physical-plan/benches/sort_merge_join.rs
new file mode 100644
index 0000000000000..82610b2a54c2b
--- /dev/null
+++ b/datafusion/physical-plan/benches/sort_merge_join.rs
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Criterion benchmarks for Sort Merge Join
+//!
+//! These benchmarks measure the join kernel in isolation by feeding
+//! pre-sorted RecordBatches directly into SortMergeJoinExec, avoiding
+//! sort / scan overhead.
+
+use std::sync::Arc;
+
+use arrow::array::{Int64Array, RecordBatch, StringArray};
+use arrow::compute::SortOptions;
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use datafusion_common::NullEquality;
+use datafusion_execution::TaskContext;
+use datafusion_physical_expr::expressions::col;
+use datafusion_physical_plan::collect;
+use datafusion_physical_plan::joins::{SortMergeJoinExec, utils::JoinOn};
+use datafusion_physical_plan::test::TestMemoryExec;
+use tokio::runtime::Runtime;
+
+/// Build pre-sorted RecordBatches (split into ~8192-row chunks).
+///
+/// Schema: (key: Int64, data: Int64, payload: Utf8)
+///
+/// `key_mod` controls distinct key count: key = row_index % key_mod.
+fn build_sorted_batches(
+    num_rows: usize,
+    key_mod: usize,
+    schema: &SchemaRef,
+) -> Vec<RecordBatch> {
+    let mut rows: Vec<(i64, i64)> = (0..num_rows)
+        .map(|i| ((i % key_mod) as i64, i as i64))
+        .collect();
+    rows.sort();
+
+    let keys: Vec<i64> = rows.iter().map(|(k, _)| *k).collect();
+    let data: Vec<i64> = rows.iter().map(|(_, d)| *d).collect();
+    let payload: Vec<String> = data.iter().map(|d| format!("val_{d}")).collect();
+
+    let batch = RecordBatch::try_new(
+        Arc::clone(schema),
+        vec![
+            Arc::new(Int64Array::from(keys)),
+            Arc::new(Int64Array::from(data)),
+            Arc::new(StringArray::from(payload)),
+        ],
+    )
+    .unwrap();
+
+    let batch_size = 8192;
+    let mut batches = Vec::new();
+    let mut offset = 0;
+    while offset < batch.num_rows() {
+        let len = (batch.num_rows() - offset).min(batch_size);
+        batches.push(batch.slice(offset, len));
+        offset += len;
+    }
+    batches
+}
+
+fn make_exec(
+    batches: &[RecordBatch],
+    schema: &SchemaRef,
+) -> Arc<dyn datafusion_physical_plan::ExecutionPlan> {
+    TestMemoryExec::try_new_exec(&[batches.to_vec()], Arc::clone(schema), None).unwrap()
+}
+
+fn schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![
+        Field::new("key", DataType::Int64, false),
+        Field::new("data", DataType::Int64, false),
+        Field::new("payload", DataType::Utf8, false),
+    ]))
+}
+
+fn do_join(
+    left: Arc<dyn datafusion_physical_plan::ExecutionPlan>,
+    right: Arc<dyn datafusion_physical_plan::ExecutionPlan>,
+    join_type: datafusion_common::JoinType,
+    rt: &Runtime,
+) -> usize {
+    let on: JoinOn = vec![(
+        col("key", &left.schema()).unwrap(),
+        col("key", &right.schema()).unwrap(),
+    )];
+    let join = SortMergeJoinExec::try_new(
+        left,
+        right,
+        on,
+        None,
+        join_type,
+        vec![SortOptions::default()],
+        NullEquality::NullEqualsNothing,
+    )
+    .unwrap();
+
+    let task_ctx = Arc::new(TaskContext::default());
+    rt.block_on(async {
+        let batches = collect(Arc::new(join), task_ctx).await.unwrap();
+        batches.iter().map(|b| b.num_rows()).sum()
+    })
+}
+
+fn bench_smj(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let s = schema();
+
+    let mut group = c.benchmark_group("sort_merge_join");
+
+    // 1:1 Inner Join — 100K rows each, unique keys
+    // Best case for contiguous-range optimization: every index array is [0,1,2,...].
+    {
+        let n = 100_000;
+        let left_batches = build_sorted_batches(n, n, &s);
+        let right_batches = build_sorted_batches(n, n, &s);
+        group.bench_function(BenchmarkId::new("inner_1to1", n), |b| {
+            b.iter(|| {
+                let left = make_exec(&left_batches, &s);
+                let right = make_exec(&right_batches, &s);
+                do_join(left, right, datafusion_common::JoinType::Inner, &rt)
+            })
+        });
+    }
+
+    // 1:10 Inner Join — 100K left, 100K right, 10K distinct keys
+    {
+        let n = 100_000;
+        let key_mod = 10_000;
+        let left_batches = build_sorted_batches(n, key_mod, &s);
+        let right_batches = build_sorted_batches(n, key_mod, &s);
+        group.bench_function(BenchmarkId::new("inner_1to10", n), |b| {
+            b.iter(|| {
+                let left = make_exec(&left_batches, &s);
+                let right = make_exec(&right_batches, &s);
+                do_join(left, right, datafusion_common::JoinType::Inner, &rt)
+            })
+        });
+    }
+
+    // Left Join — 100K each, ~5% unmatched on left
+    {
+        let n = 100_000;
+        let left_batches = build_sorted_batches(n, n + n / 20, &s);
+        let right_batches = build_sorted_batches(n, n, &s);
+        group.bench_function(BenchmarkId::new("left_1to1_unmatched", n), |b| {
+            b.iter(|| {
+                let left = make_exec(&left_batches, &s);
+                let right = make_exec(&right_batches, &s);
+                do_join(left, right, datafusion_common::JoinType::Left, &rt)
+            })
+        });
+    }
+
+    // Left Semi Join — 100K left, 100K right, 10K keys
+    {
+        let n = 100_000;
+        let key_mod = 10_000;
+        let left_batches = build_sorted_batches(n, key_mod, &s);
+        let right_batches = build_sorted_batches(n, key_mod, &s);
+        group.bench_function(BenchmarkId::new("left_semi_1to10", n), |b| {
+            b.iter(|| {
+                let left = make_exec(&left_batches, &s);
+                let right = make_exec(&right_batches, &s);
+                do_join(left, right, datafusion_common::JoinType::LeftSemi, &rt)
+            })
+        });
+    }
+
+    // Left Anti Join — 100K left, 100K right, partial match
+    {
+        let n = 100_000;
+        let left_batches = build_sorted_batches(n, n + n / 5, &s);
+        let right_batches = build_sorted_batches(n, n, &s);
+        group.bench_function(BenchmarkId::new("left_anti_partial", n), |b| {
+            b.iter(|| {
+                let left = make_exec(&left_batches, &s);
+                let right = make_exec(&right_batches, &s);
+                do_join(left, right, datafusion_common::JoinType::LeftAnti, &rt)
+            })
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_smj);
+criterion_main!(benches);
diff --git a/datafusion/physical-plan/benches/sort_preserving_merge.rs b/datafusion/physical-plan/benches/sort_preserving_merge.rs
index 9586dbf94727b..76ebf230a30e0 100644
--- a/datafusion/physical-plan/benches/sort_preserving_merge.rs
+++ b/datafusion/physical-plan/benches/sort_preserving_merge.rs
@@ -20,9 +20,9 @@ use arrow::{
     record_batch::RecordBatch,
 };
 use arrow_schema::{SchemaRef, SortOptions};
-use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr};
+use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr, expressions::col};
 use datafusion_physical_plan::test::TestMemoryExec;
 use datafusion_physical_plan::{
     collect, sorts::sort_preserving_merge::SortPreservingMergeExec,
@@ -115,18 +115,13 @@ fn get_bench_data() -> Vec<BenchData> {
     let mut push_bench_data = |bench_name: &str, partitions: Vec<Vec<RecordBatch>>| {
         let schema = partitions[0][0].schema();
         // Define sort order (col1 ASC, col2 ASC, col3 ASC)
-        let sort_order = LexOrdering::new(
-            schema
-                .fields()
-                .iter()
-                .map(|field| {
-                    PhysicalSortExpr::new(
-                        col(field.name(), &schema).unwrap(),
-                        SortOptions::default(),
-                    )
-                })
-                .collect(),
-        );
+        let sort_order = LexOrdering::new(schema.fields().iter().map(|field| {
+            PhysicalSortExpr::new(
+                col(field.name(), &schema).unwrap(),
+                SortOptions::default(),
+            )
+        }))
+        .unwrap();
         ret.push(BenchData {
             bench_name: bench_name.to_string(),
             partitions,
@@ -173,7 +168,7 @@ fn bench_merge_sorted_preserving(c: &mut Criterion) {
             sort_order,
         } = data;
         c.bench_function(
-            &format!("bench_merge_sorted_preserving/{}", bench_name),
+            &format!("bench_merge_sorted_preserving/{bench_name}"),
             |b| {
                 b.iter_batched(
                     || {
diff --git a/datafusion/physical-plan/benches/spill_io.rs b/datafusion/physical-plan/benches/spill_io.rs
index 3b877671ad583..fac2547a131b4 100644
--- a/datafusion/physical-plan/benches/spill_io.rs
+++ b/datafusion/physical-plan/benches/spill_io.rs
@@ -16,14 +16,22 @@
 // under the License.
 
 use arrow::array::{
-    Date32Builder, Decimal128Builder, Int32Builder, RecordBatch, StringBuilder,
+    Date32Builder, Decimal128Builder, Int32Builder, Int64Builder, RecordBatch,
+    StringBuilder,
 };
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
+use criterion::measurement::WallTime;
+use criterion::{
+    BatchSize, BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main,
+};
+use datafusion_common::config::SpillCompression;
+use datafusion_common::human_readable_size;
+use datafusion_common::instant::Instant;
 use datafusion_execution::runtime_env::RuntimeEnv;
+use datafusion_physical_plan::SpillManager;
 use datafusion_physical_plan::common::collect;
 use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
-use datafusion_physical_plan::SpillManager;
+use rand::{Rng, SeedableRng};
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
@@ -107,8 +115,9 @@ fn bench_spill_io(c: &mut Criterion) {
                 // - Wait for the consumer to finish processing
                 |spill_file| {
                     rt.block_on(async {
-                        let stream =
-                            spill_manager.read_spill_as_stream(spill_file).unwrap();
+                        let stream = spill_manager
+                            .read_spill_as_stream(spill_file, None)
+                            .unwrap();
                         let _ = collect(stream).await.unwrap();
                     })
                 },
@@ -119,5 +128,454 @@ fn bench_spill_io(c: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(benches, bench_spill_io);
+// Generate `num_batches` RecordBatches mimicking TPC-H Q2's partial aggregate result:
+// GROUP BY ps_partkey -> MIN(ps_supplycost)
+fn create_q2_like_batches(
+    num_batches: usize,
+    num_rows: usize,
+) -> (Arc<Schema>, Vec<RecordBatch>) {
+    // use fixed seed
+    let seed = 2;
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut batches = Vec::with_capacity(num_batches);
+
+    let mut current_key = 400000_i64;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("ps_partkey", DataType::Int64, false),
+        Field::new("min_ps_supplycost", DataType::Decimal128(15, 2), true),
+    ]));
+
+    for _ in 0..num_batches {
+        let mut partkey_builder = Int64Builder::new();
+        let mut cost_builder = Decimal128Builder::new()
+            .with_precision_and_scale(15, 2)
+            .unwrap();
+
+        for _ in 0..num_rows {
+            // Occasionally skip a few partkey values to simulate sparsity
+            let jump = if rng.random_bool(0.05) {
+                rng.random_range(2..10)
+            } else {
+                1
+            };
+            current_key += jump;
+
+            let supply_cost = rng.random_range(10_00..100_000) as i128;
+
+            partkey_builder.append_value(current_key);
+            cost_builder.append_value(supply_cost);
+        }
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(partkey_builder.finish()),
+                Arc::new(cost_builder.finish()),
+            ],
+        )
+        .unwrap();
+
+        batches.push(batch);
+    }
+
+    (schema, batches)
+}
+
+/// Generate `num_batches` RecordBatches mimicking TPC-H Q16's partial aggregate result:
+/// GROUP BY (p_brand, p_type, p_size) -> COUNT(DISTINCT ps_suppkey)
+pub fn create_q16_like_batches(
+    num_batches: usize,
+    num_rows: usize,
+) -> (Arc<Schema>, Vec<RecordBatch>) {
+    let seed = 16;
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut batches = Vec::with_capacity(num_batches);
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("p_brand", DataType::Utf8, false),
+        Field::new("p_type", DataType::Utf8, false),
+        Field::new("p_size", DataType::Int32, false),
+        Field::new("alias1", DataType::Int64, false), // COUNT(DISTINCT ps_suppkey)
+    ]));
+
+    // Representative string pools
+    let brands = ["Brand#32", "Brand#33", "Brand#41", "Brand#42", "Brand#55"];
+    let types = [
+        "PROMO ANODIZED NICKEL",
+        "STANDARD BRUSHED NICKEL",
+        "PROMO POLISHED COPPER",
+        "ECONOMY ANODIZED BRASS",
+        "LARGE BURNISHED COPPER",
+        "STANDARD POLISHED TIN",
+        "SMALL PLATED STEEL",
+        "MEDIUM POLISHED COPPER",
+    ];
+    let sizes = [3, 9, 14, 19, 23, 36, 45, 49];
+
+    for _ in 0..num_batches {
+        let mut brand_builder = StringBuilder::new();
+        let mut type_builder = StringBuilder::new();
+        let mut size_builder = Int32Builder::new();
+        let mut count_builder = Int64Builder::new();
+
+        for _ in 0..num_rows {
+            let brand = brands[rng.random_range(0..brands.len())];
+            let ptype = types[rng.random_range(0..types.len())];
+            let size = sizes[rng.random_range(0..sizes.len())];
+            let count = rng.random_range(1000..100_000);
+
+            brand_builder.append_value(brand);
+            type_builder.append_value(ptype);
+            size_builder.append_value(size);
+            count_builder.append_value(count);
+        }
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(brand_builder.finish()),
+                Arc::new(type_builder.finish()),
+                Arc::new(size_builder.finish()),
+                Arc::new(count_builder.finish()),
+            ],
+        )
+        .unwrap();
+
+        batches.push(batch);
+    }
+
+    (schema, batches)
+}
+
+// Generate `num_batches` RecordBatches mimicking TPC-H Q20's partial aggregate result:
+// GROUP BY (l_partkey, l_suppkey) -> SUM(l_quantity)
+fn create_q20_like_batches(
+    num_batches: usize,
+    num_rows: usize,
+) -> (Arc<Schema>, Vec<RecordBatch>) {
+    let seed = 20;
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut batches = Vec::with_capacity(num_batches);
+
+    let mut current_partkey = 400000_i64;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("l_partkey", DataType::Int64, false),
+        Field::new("l_suppkey", DataType::Int64, false),
+        Field::new("sum_l_quantity", DataType::Decimal128(25, 2), true),
+    ]));
+
+    for _ in 0..num_batches {
+        let mut partkey_builder = Int64Builder::new();
+        let mut suppkey_builder = Int64Builder::new();
+        let mut quantity_builder = Decimal128Builder::new()
+            .with_precision_and_scale(25, 2)
+            .unwrap();
+
+        for _ in 0..num_rows {
+            // Occasionally skip a few partkey values to simulate sparsity
+            let partkey_jump = if rng.random_bool(0.03) {
+                rng.random_range(2..6)
+            } else {
+                1
+            };
+            current_partkey += partkey_jump;
+
+            let suppkey = rng.random_range(10_000..99_999);
+            let quantity = rng.random_range(500..20_000) as i128;
+
+            partkey_builder.append_value(current_partkey);
+            suppkey_builder.append_value(suppkey);
+            quantity_builder.append_value(quantity);
+        }
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(partkey_builder.finish()),
+                Arc::new(suppkey_builder.finish()),
+                Arc::new(quantity_builder.finish()),
+            ],
+        )
+        .unwrap();
+
+        batches.push(batch);
+    }
+
+    (schema, batches)
+}
+
+/// Generate `num_batches` wide RecordBatches resembling sort-tpch Q10 for benchmarking.
+/// This includes multiple numeric, date, and Utf8View columns (15 total).
+pub fn create_wide_batches(
+    num_batches: usize,
+    num_rows: usize,
+) -> (Arc<Schema>, Vec<RecordBatch>) {
+    let seed = 10;
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut batches = Vec::with_capacity(num_batches);
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("l_linenumber", DataType::Int32, false),
+        Field::new("l_suppkey", DataType::Int64, false),
+        Field::new("l_orderkey", DataType::Int64, false),
+        Field::new("l_partkey", DataType::Int64, false),
+        Field::new("l_quantity", DataType::Decimal128(15, 2), false),
+        Field::new("l_extendedprice", DataType::Decimal128(15, 2), false),
+        Field::new("l_discount", DataType::Decimal128(15, 2), false),
+        Field::new("l_tax", DataType::Decimal128(15, 2), false),
+        Field::new("l_returnflag", DataType::Utf8, false),
+        Field::new("l_linestatus", DataType::Utf8, false),
+        Field::new("l_shipdate", DataType::Date32, false),
+        Field::new("l_commitdate", DataType::Date32, false),
+        Field::new("l_receiptdate", DataType::Date32, false),
+        Field::new("l_shipinstruct", DataType::Utf8, false),
+        Field::new("l_shipmode", DataType::Utf8, false),
+    ]));
+
+    for _ in 0..num_batches {
+        let mut linenum = Int32Builder::new();
+        let mut suppkey = Int64Builder::new();
+        let mut orderkey = Int64Builder::new();
+        let mut partkey = Int64Builder::new();
+        let mut quantity = Decimal128Builder::new()
+            .with_precision_and_scale(15, 2)
+            .unwrap();
+        let mut extprice = Decimal128Builder::new()
+            .with_precision_and_scale(15, 2)
+            .unwrap();
+        let mut discount = Decimal128Builder::new()
+            .with_precision_and_scale(15, 2)
+            .unwrap();
+        let mut tax = Decimal128Builder::new()
+            .with_precision_and_scale(15, 2)
+            .unwrap();
+        let mut retflag = StringBuilder::new();
+        let mut linestatus = StringBuilder::new();
+        let mut shipdate = Date32Builder::new();
+        let mut commitdate = Date32Builder::new();
+        let mut receiptdate = Date32Builder::new();
+        let mut shipinstruct = StringBuilder::new();
+        let mut shipmode = StringBuilder::new();
+
+        let return_flags = ["A", "N", "R"];
+        let statuses = ["F", "O"];
+        let instructs = ["DELIVER IN PERSON", "COLLECT COD", "NONE"];
+        let modes = ["TRUCK", "MAIL", "SHIP", "RAIL", "AIR"];
+
+        for i in 0..num_rows {
+            linenum.append_value((i % 7) as i32);
+            suppkey.append_value(rng.random_range(0..100_000));
+            orderkey.append_value(1_000_000 + i as i64);
+            partkey.append_value(rng.random_range(0..200_000));
+
+            quantity.append_value(rng.random_range(100..10000) as i128);
+            extprice.append_value(rng.random_range(1_000..1_000_000) as i128);
+            discount.append_value(rng.random_range(0..10000) as i128);
+            tax.append_value(rng.random_range(0..5000) as i128);
+
+            retflag.append_value(return_flags[rng.random_range(0..return_flags.len())]);
+            linestatus.append_value(statuses[rng.random_range(0..statuses.len())]);
+
+            let base_date = 10_000;
+            shipdate.append_value(base_date + (i % 1000) as i32);
+            commitdate.append_value(base_date + (i % 1000) as i32 + 1);
+            receiptdate.append_value(base_date + (i % 1000) as i32 + 2);
+
+            shipinstruct.append_value(instructs[rng.random_range(0..instructs.len())]);
+            shipmode.append_value(modes[rng.random_range(0..modes.len())]);
+        }
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(linenum.finish()),
+                Arc::new(suppkey.finish()),
+                Arc::new(orderkey.finish()),
+                Arc::new(partkey.finish()),
+                Arc::new(quantity.finish()),
+                Arc::new(extprice.finish()),
+                Arc::new(discount.finish()),
+                Arc::new(tax.finish()),
+                Arc::new(retflag.finish()),
+                Arc::new(linestatus.finish()),
+                Arc::new(shipdate.finish()),
+                Arc::new(commitdate.finish()),
+                Arc::new(receiptdate.finish()),
+                Arc::new(shipinstruct.finish()),
+                Arc::new(shipmode.finish()),
+            ],
+        )
+        .unwrap();
+        batches.push(batch);
+    }
+    (schema, batches)
+}
+
+// Benchmarks spill write + read performance across multiple compression codecs
+// using realistic input data inspired by TPC-H aggregate spill scenarios.
+//
+// This function prepares synthetic RecordBatches that mimic the schema and distribution
+// of intermediate aggregate results from representative TPC-H queries (Q2, Q16, Q20) and sort-tpch Q10.
+// The schemas of these batches are:
+//      Q2 [Int64, Decimal128]
+//      Q16 [Utf8, Utf8, Int32, Int64]
+//      Q20 [Int64, Int64, Decimal128]
+//      sort-tpch Q10 (wide batch) [Int32, Int64 * 3, Decimal128 * 4, Date * 3, Utf8 * 4]
+// For each dataset:
+// - It evaluates spill performance under different compression codecs (e.g., Uncompressed, Zstd, LZ4).
+// - It measures end-to-end spill write + read performance using Criterion.
+// - It prints the observed memory-to-disk compression ratio for each codec.
+//
+// This helps evaluate the tradeoffs between compression ratio and runtime overhead for various codecs.
+fn bench_spill_compression(c: &mut Criterion) {
+    let env = Arc::new(RuntimeEnv::default());
+    let mut group = c.benchmark_group("spill_compression");
+    let rt = Runtime::new().unwrap();
+    let compressions = vec![
+        SpillCompression::Uncompressed,
+        SpillCompression::Zstd,
+        SpillCompression::Lz4Frame,
+    ];
+
+    // Modify these values to change data volume. Note that each batch contains `num_rows` rows.
+    let num_batches = 50;
+    let num_rows = 8192;
+
+    // Q2 [Int64, Decimal128]
+    let (schema, batches) = create_q2_like_batches(num_batches, num_rows);
+    benchmark_spill_batches_for_all_codec(
+        &mut group,
+        "q2",
+        batches,
+        &compressions,
+        &rt,
+        env.clone(),
+        schema,
+    );
+    // Q16 [Utf8, Utf8, Int32, Int64]
+    let (schema, batches) = create_q16_like_batches(num_batches, num_rows);
+    benchmark_spill_batches_for_all_codec(
+        &mut group,
+        "q16",
+        batches,
+        &compressions,
+        &rt,
+        env.clone(),
+        schema,
+    );
+    // Q20 [Int64, Int64, Decimal128]
+    let (schema, batches) = create_q20_like_batches(num_batches, num_rows);
+    benchmark_spill_batches_for_all_codec(
+        &mut group,
+        "q20",
+        batches,
+        &compressions,
+        &rt,
+        env.clone(),
+        schema,
+    );
+    // sort-tpch Q10 (wide batch) [Int32, Int64 * 3, Decimal128 * 4, Date * 3, Utf8 * 4]
+    let (schema, batches) = create_wide_batches(num_batches, num_rows);
+    benchmark_spill_batches_for_all_codec(
+        &mut group,
+        "wide",
+        batches,
+        &compressions,
+        &rt,
+        env,
+        schema,
+    );
+    group.finish();
+}
+
+#[expect(clippy::needless_pass_by_value)]
+fn benchmark_spill_batches_for_all_codec(
+    group: &mut BenchmarkGroup<'_, WallTime>,
+    batch_label: &str,
+    batches: Vec<RecordBatch>,
+    compressions: &[SpillCompression],
+    rt: &Runtime,
+    env: Arc<RuntimeEnv>,
+    schema: Arc<Schema>,
+) {
+    let mem_bytes: usize = batches.iter().map(|b| b.get_array_memory_size()).sum();
+
+    for &compression in compressions {
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let spill_manager =
+            SpillManager::new(Arc::clone(&env), metrics.clone(), Arc::clone(&schema))
+                .with_compression_type(compression);
+
+        let bench_id = BenchmarkId::new(batch_label, compression.to_string());
+        group.bench_with_input(bench_id, &spill_manager, |b, spill_manager| {
+            b.iter_batched(
+                || batches.clone(),
+                |batches| {
+                    rt.block_on(async {
+                        let spill_file = spill_manager
+                            .spill_record_batch_and_finish(
+                                &batches,
+                                &format!("{batch_label}_{compression}"),
+                            )
+                            .unwrap()
+                            .unwrap();
+                        let stream = spill_manager
+                            .read_spill_as_stream(spill_file, None)
+                            .unwrap();
+                        let _ = collect(stream).await.unwrap();
+                    })
+                },
+                BatchSize::LargeInput,
+            )
+        });
+
+        // Run Spilling Read & Write once more to read file size & calculate bandwidth
+        let start = Instant::now();
+
+        let spill_file = spill_manager
+            .spill_record_batch_and_finish(
+                &batches,
+                &format!("{batch_label}_{compression}"),
+            )
+            .unwrap()
+            .unwrap();
+
+        // calculate write_throughput (includes both compression and I/O time) based on in memory batch size
+        let write_time = start.elapsed();
+        let write_throughput = (mem_bytes as u128 / write_time.as_millis().max(1)) * 1000;
+
+        // calculate compression ratio
+        let disk_bytes = std::fs::metadata(spill_file.path())
+            .expect("metadata read fail")
+            .len() as usize;
+        let ratio = mem_bytes as f64 / disk_bytes.max(1) as f64;
+
+        // calculate read_throughput (includes both compression and I/O time) based on in memory batch size
+        let rt = Runtime::new().unwrap();
+        let start = Instant::now();
+        rt.block_on(async {
+            let stream = spill_manager
+                .read_spill_as_stream(spill_file, None)
+                .unwrap();
+            let _ = collect(stream).await.unwrap();
+        });
+        let read_time = start.elapsed();
+        let read_throughput = (mem_bytes as u128 / read_time.as_millis().max(1)) * 1000;
+
+        println!(
+            "[{} | {:?}] mem: {}| disk: {}| compression ratio: {:.3}x| throughput: (w) {}/s (r) {}/s",
+            batch_label,
+            compression,
+            human_readable_size(mem_bytes),
+            human_readable_size(disk_bytes),
+            ratio,
+            human_readable_size(write_throughput as usize),
+            human_readable_size(read_throughput as usize),
+        );
+    }
+}
+
+criterion_group!(benches, bench_spill_io, bench_spill_compression);
 criterion_main!(benches);
diff --git a/datafusion/physical-plan/src/aggregates/group_values/metrics.rs b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs
new file mode 100644
index 0000000000000..b6c32204e85f0
--- /dev/null
+++ b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metrics for the various group-by implementations.
+
+use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
+
+pub(crate) struct GroupByMetrics {
+    /// Time spent calculating the group IDs from the evaluated grouping columns.
+    pub(crate) time_calculating_group_ids: Time,
+    /// Time spent evaluating the inputs to the aggregate functions.
+    pub(crate) aggregate_arguments_time: Time,
+    /// Time spent evaluating the aggregate expressions themselves
+    /// (e.g. summing all elements and counting number of elements for `avg` aggregate).
+    pub(crate) aggregation_time: Time,
+    /// Time spent emitting the final results and constructing the record batch
+    /// which includes finalizing the grouping expressions
+    /// (e.g. emit from the hash table in case of hash aggregation) and the accumulators
+    pub(crate) emitting_time: Time,
+}
+
+impl GroupByMetrics {
+    pub(crate) fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
+        Self {
+            time_calculating_group_ids: MetricBuilder::new(metrics)
+                .subset_time("time_calculating_group_ids", partition),
+            aggregate_arguments_time: MetricBuilder::new(metrics)
+                .subset_time("aggregate_arguments_time", partition),
+            aggregation_time: MetricBuilder::new(metrics)
+                .subset_time("aggregation_time", partition),
+            emitting_time: MetricBuilder::new(metrics)
+                .subset_time("emitting_time", partition),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};
+    use crate::metrics::MetricsSet;
+    use crate::test::TestMemoryExec;
+    use crate::{ExecutionPlan, collect};
+    use arrow::array::{Float64Array, UInt32Array};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use arrow::record_batch::RecordBatch;
+    use datafusion_common::Result;
+    use datafusion_execution::TaskContext;
+    use datafusion_functions_aggregate::count::count_udaf;
+    use datafusion_functions_aggregate::sum::sum_udaf;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_expr::expressions::col;
+    use std::sync::Arc;
+
+    /// Helper function to verify all three GroupBy metrics exist and have non-zero values
+    fn assert_groupby_metrics(metrics: &MetricsSet) {
+        let agg_arguments_time = metrics.sum_by_name("aggregate_arguments_time");
+        assert!(agg_arguments_time.is_some());
+        assert!(agg_arguments_time.unwrap().as_usize() > 0);
+
+        let aggregation_time = metrics.sum_by_name("aggregation_time");
+        assert!(aggregation_time.is_some());
+        assert!(aggregation_time.unwrap().as_usize() > 0);
+
+        let emitting_time = metrics.sum_by_name("emitting_time");
+        assert!(emitting_time.is_some());
+        assert!(emitting_time.unwrap().as_usize() > 0);
+    }
+
+    #[tokio::test]
+    async fn test_groupby_metrics_partial_mode() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::UInt32, false),
+            Field::new("b", DataType::Float64, false),
+        ]));
+
+        // Create multiple batches to ensure metrics accumulate
+        let batches = (0..5)
+            .map(|i| {
+                RecordBatch::try_new(
+                    Arc::clone(&schema),
+                    vec![
+                        Arc::new(UInt32Array::from(vec![1, 2, 3, 4])),
+                        Arc::new(Float64Array::from(vec![
+                            i as f64,
+                            (i + 1) as f64,
+                            (i + 2) as f64,
+                            (i + 3) as f64,
+                        ])),
+                    ],
+                )
+                .unwrap()
+            })
+            .collect::<Vec<_>>();
+
+        let input = TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?;
+
+        let group_by =
+            PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]);
+
+        let aggregates = vec![
+            Arc::new(
+                AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("SUM(b)")
+                    .build()?,
+            ),
+            Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(b)")
+                    .build()?,
+            ),
+        ];
+
+        let aggregate_exec = Arc::new(AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            aggregates,
+            vec![None, None],
+            input,
+            schema,
+        )?);
+
+        let task_ctx = Arc::new(TaskContext::default());
+        let _result =
+            collect(Arc::clone(&aggregate_exec) as _, Arc::clone(&task_ctx)).await?;
+
+        let metrics = aggregate_exec.metrics().unwrap();
+        assert_groupby_metrics(&metrics);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_groupby_metrics_final_mode() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::UInt32, false),
+            Field::new("b", DataType::Float64, false),
+        ]));
+
+        let batches = (0..3)
+            .map(|i| {
+                RecordBatch::try_new(
+                    Arc::clone(&schema),
+                    vec![
+                        Arc::new(UInt32Array::from(vec![1, 2, 3])),
+                        Arc::new(Float64Array::from(vec![
+                            i as f64,
+                            (i + 1) as f64,
+                            (i + 2) as f64,
+                        ])),
+                    ],
+                )
+                .unwrap()
+            })
+            .collect::<Vec<_>>();
+
+        let partial_input =
+            TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?;
+
+        let group_by =
+            PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]);
+
+        let aggregates = vec![Arc::new(
+            AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("SUM(b)")
+                .build()?,
+        )];
+
+        // Create partial aggregate
+        let partial_aggregate = Arc::new(AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by.clone(),
+            aggregates.clone(),
+            vec![None],
+            partial_input,
+            Arc::clone(&schema),
+        )?);
+
+        // Create final aggregate
+        let final_aggregate = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            group_by.as_final(),
+            aggregates,
+            vec![None],
+            partial_aggregate,
+            schema,
+        )?);
+
+        let task_ctx = Arc::new(TaskContext::default());
+        let _result =
+            collect(Arc::clone(&final_aggregate) as _, Arc::clone(&task_ctx)).await?;
+
+        let metrics = final_aggregate.metrics().unwrap();
+        assert_groupby_metrics(&metrics);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs
index ce56ca4f7dfd7..2f3b1a19e7d73 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs
@@ -22,13 +22,13 @@ use arrow::array::types::{
     Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
     TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
 };
-use arrow::array::{downcast_primitive, ArrayRef, RecordBatch};
+use arrow::array::{ArrayRef, downcast_primitive};
 use arrow::datatypes::{DataType, SchemaRef, TimeUnit};
 use datafusion_common::Result;
 
 use datafusion_expr::EmitTo;
 
-pub(crate) mod multi_group_by;
+pub mod multi_group_by;
 
 mod row;
 mod single_group_by;
@@ -40,14 +40,17 @@ pub(crate) use single_group_by::primitive::HashValue;
 
 use crate::aggregates::{
     group_values::single_group_by::{
-        bytes::GroupValuesByes, bytes_view::GroupValuesBytesView,
-        primitive::GroupValuesPrimitive,
+        boolean::GroupValuesBoolean, bytes::GroupValuesBytes,
+        bytes_view::GroupValuesBytesView, primitive::GroupValuesPrimitive,
     },
     order::GroupOrdering,
 };
 
+mod metrics;
 mod null_builder;
 
+pub(crate) use metrics::GroupByMetrics;
+
 /// Stores the group values during hash aggregation.
 ///
 /// # Background
@@ -84,7 +87,7 @@ mod null_builder;
 /// Each distinct group in a hash aggregation is identified by a unique group id
 /// (usize) which is assigned by instances of this trait. Group ids are
 /// continuous without gaps, starting from 0.
-pub(crate) trait GroupValues: Send {
+pub trait GroupValues: Send {
     /// Calculates the group id for each input row of `cols`, assigning new
     /// group ids as necessary.
     ///
@@ -109,7 +112,7 @@ pub(crate) trait GroupValues: Send {
     fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>;
 
     /// Clear the contents and shrink the capacity to the size of the batch (free up memory usage)
-    fn clear_shrink(&mut self, batch: &RecordBatch);
+    fn clear_shrink(&mut self, num_rows: usize);
 }
 
 /// Return a specialized implementation of [`GroupValues`] for the given schema.
@@ -119,15 +122,16 @@ pub(crate) trait GroupValues: Send {
 ///   - If group by single column, and type of this column has
 ///     the specific [`GroupValues`] implementation, such implementation
 ///     will be chosen.
-///   
-///   - If group by multiple columns, and all column types have the specific
-///     [`GroupColumn`] implementations, [`GroupValuesColumn`] will be chosen.
 ///
-///   - Otherwise, the general implementation [`GroupValuesRows`] will be chosen.
+///   - If group by multiple columns, and all column types have the specific
+///     `GroupColumn` implementations, `GroupValuesColumn` will be chosen.
 ///
-/// [`GroupColumn`]:  crate::aggregates::group_values::multi_group_by::GroupColumn
+///   - Otherwise, the general implementation `GroupValuesRows` will be chosen.
 ///
-pub(crate) fn new_group_values(
+/// `GroupColumn`:  crate::aggregates::group_values::multi_group_by::GroupColumn
+/// `GroupValuesColumn`: crate::aggregates::group_values::multi_group_by::GroupValuesColumn
+/// `GroupValuesRows`: crate::aggregates::group_values::row::GroupValuesRows
+pub fn new_group_values(
     schema: SchemaRef,
     group_ordering: &GroupOrdering,
 ) -> Result<Box<dyn GroupValues>> {
@@ -172,23 +176,26 @@ pub(crate) fn new_group_values(
                 downcast_helper!(Decimal128Type, d);
             }
             DataType::Utf8 => {
-                return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Utf8)));
+                return Ok(Box::new(GroupValuesBytes::<i32>::new(OutputType::Utf8)));
             }
             DataType::LargeUtf8 => {
-                return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Utf8)));
+                return Ok(Box::new(GroupValuesBytes::<i64>::new(OutputType::Utf8)));
             }
             DataType::Utf8View => {
                 return Ok(Box::new(GroupValuesBytesView::new(OutputType::Utf8View)));
             }
             DataType::Binary => {
-                return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Binary)));
+                return Ok(Box::new(GroupValuesBytes::<i32>::new(OutputType::Binary)));
             }
             DataType::LargeBinary => {
-                return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Binary)));
+                return Ok(Box::new(GroupValuesBytes::<i64>::new(OutputType::Binary)));
             }
             DataType::BinaryView => {
                 return Ok(Box::new(GroupValuesBytesView::new(OutputType::BinaryView)));
             }
+            DataType::Boolean => {
+                return Ok(Box::new(GroupValuesBoolean::new()));
+            }
             _ => {}
         }
     }
diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/boolean.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/boolean.rs
new file mode 100644
index 0000000000000..91a39f28f33c1
--- /dev/null
+++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/boolean.rs
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use crate::aggregates::group_values::multi_group_by::Nulls;
+use crate::aggregates::group_values::multi_group_by::{GroupColumn, nulls_equal_to};
+use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
+use arrow::array::{Array as _, ArrayRef, AsArray, BooleanArray, BooleanBufferBuilder};
+use datafusion_common::Result;
+use itertools::izip;
+
+/// An implementation of [`GroupColumn`] for booleans
+///
+/// Optimized to skip null buffer construction if the input is known to be non nullable
+///
+/// # Template parameters
+///
+/// `NULLABLE`: if the data can contain any nulls
+#[derive(Debug)]
+pub struct BooleanGroupValueBuilder<const NULLABLE: bool> {
+    buffer: BooleanBufferBuilder,
+    nulls: MaybeNullBufferBuilder,
+}
+
+impl<const NULLABLE: bool> BooleanGroupValueBuilder<NULLABLE> {
+    /// Create a new `BooleanGroupValueBuilder`
+    pub fn new() -> Self {
+        Self {
+            buffer: BooleanBufferBuilder::new(0),
+            nulls: MaybeNullBufferBuilder::new(),
+        }
+    }
+}
+
+impl<const NULLABLE: bool> GroupColumn for BooleanGroupValueBuilder<NULLABLE> {
+    fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool {
+        if NULLABLE {
+            let exist_null = self.nulls.is_null(lhs_row);
+            let input_null = array.is_null(rhs_row);
+            if let Some(result) = nulls_equal_to(exist_null, input_null) {
+                return result;
+            }
+        }
+
+        self.buffer.get_bit(lhs_row) == array.as_boolean().value(rhs_row)
+    }
+
+    fn append_val(&mut self, array: &ArrayRef, row: usize) -> Result<()> {
+        if NULLABLE {
+            if array.is_null(row) {
+                self.nulls.append(true);
+                self.buffer.append(bool::default());
+            } else {
+                self.nulls.append(false);
+                self.buffer.append(array.as_boolean().value(row));
+            }
+        } else {
+            self.buffer.append(array.as_boolean().value(row));
+        }
+
+        Ok(())
+    }
+
+    fn vectorized_equal_to(
+        &self,
+        lhs_rows: &[usize],
+        array: &ArrayRef,
+        rhs_rows: &[usize],
+        equal_to_results: &mut [bool],
+    ) {
+        let array = array.as_boolean();
+
+        let iter = izip!(
+            lhs_rows.iter(),
+            rhs_rows.iter(),
+            equal_to_results.iter_mut(),
+        );
+
+        for (&lhs_row, &rhs_row, equal_to_result) in iter {
+            // Has found not equal to in previous column, don't need to check
+            if !*equal_to_result {
+                continue;
+            }
+
+            if NULLABLE {
+                let exist_null = self.nulls.is_null(lhs_row);
+                let input_null = array.is_null(rhs_row);
+                if let Some(result) = nulls_equal_to(exist_null, input_null) {
+                    *equal_to_result = result;
+                    continue;
+                }
+            }
+
+            *equal_to_result = self.buffer.get_bit(lhs_row) == array.value(rhs_row);
+        }
+    }
+
+    fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) -> Result<()> {
+        let arr = array.as_boolean();
+
+        let null_count = array.null_count();
+        let num_rows = array.len();
+        let all_null_or_non_null = if null_count == 0 {
+            Nulls::None
+        } else if null_count == num_rows {
+            Nulls::All
+        } else {
+            Nulls::Some
+        };
+
+        match (NULLABLE, all_null_or_non_null) {
+            (true, Nulls::Some) => {
+                for &row in rows {
+                    if array.is_null(row) {
+                        self.nulls.append(true);
+                        self.buffer.append(bool::default());
+                    } else {
+                        self.nulls.append(false);
+                        self.buffer.append(arr.value(row));
+                    }
+                }
+            }
+
+            (true, Nulls::None) => {
+                self.nulls.append_n(rows.len(), false);
+                for &row in rows {
+                    self.buffer.append(arr.value(row));
+                }
+            }
+
+            (true, Nulls::All) => {
+                self.nulls.append_n(rows.len(), true);
+                self.buffer.append_n(rows.len(), bool::default());
+            }
+
+            (false, _) => {
+                for &row in rows {
+                    self.buffer.append(arr.value(row));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    fn size(&self) -> usize {
+        self.buffer.capacity() / 8 + self.nulls.allocated_size()
+    }
+
+    fn build(self: Box<Self>) -> ArrayRef {
+        let Self { mut buffer, nulls } = *self;
+
+        let nulls = nulls.build();
+        if !NULLABLE {
+            assert!(nulls.is_none(), "unexpected nulls in non nullable input");
+        }
+
+        let arr = BooleanArray::new(buffer.finish(), nulls);
+
+        Arc::new(arr)
+    }
+
+    fn take_n(&mut self, n: usize) -> ArrayRef {
+        let first_n_nulls = if NULLABLE { self.nulls.take_n(n) } else { None };
+
+        let mut new_builder = BooleanBufferBuilder::new(self.buffer.len());
+        new_builder.append_packed_range(n..self.buffer.len(), self.buffer.as_slice());
+        std::mem::swap(&mut new_builder, &mut self.buffer);
+
+        // take only first n values from the original builder
+        new_builder.truncate(n);
+
+        Arc::new(BooleanArray::new(new_builder.finish(), first_n_nulls))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::NullBufferBuilder;
+
+    use super::*;
+
+    #[test]
+    fn test_nullable_boolean_equal_to() {
+        let append = |builder: &mut BooleanGroupValueBuilder<true>,
+                      builder_array: &ArrayRef,
+                      append_rows: &[usize]| {
+            for &index in append_rows {
+                builder.append_val(builder_array, index).unwrap();
+            }
+        };
+
+        let equal_to = |builder: &BooleanGroupValueBuilder<true>,
+                        lhs_rows: &[usize],
+                        input_array: &ArrayRef,
+                        rhs_rows: &[usize],
+                        equal_to_results: &mut Vec<bool>| {
+            let iter = lhs_rows.iter().zip(rhs_rows.iter());
+            for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
+                equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
+            }
+        };
+
+        test_nullable_boolean_equal_to_internal(append, equal_to);
+    }
+
+    #[test]
+    fn test_nullable_primitive_vectorized_equal_to() {
+        let append = |builder: &mut BooleanGroupValueBuilder<true>,
+                      builder_array: &ArrayRef,
+                      append_rows: &[usize]| {
+            builder
+                .vectorized_append(builder_array, append_rows)
+                .unwrap();
+        };
+
+        let equal_to = |builder: &BooleanGroupValueBuilder<true>,
+                        lhs_rows: &[usize],
+                        input_array: &ArrayRef,
+                        rhs_rows: &[usize],
+                        equal_to_results: &mut Vec<bool>| {
+            builder.vectorized_equal_to(
+                lhs_rows,
+                input_array,
+                rhs_rows,
+                equal_to_results,
+            );
+        };
+
+        test_nullable_boolean_equal_to_internal(append, equal_to);
+    }
+
+    fn test_nullable_boolean_equal_to_internal<A, E>(mut append: A, mut equal_to: E)
+    where
+        A: FnMut(&mut BooleanGroupValueBuilder<true>, &ArrayRef, &[usize]),
+        E: FnMut(
+            &BooleanGroupValueBuilder<true>,
+            &[usize],
+            &ArrayRef,
+            &[usize],
+            &mut Vec<bool>,
+        ),
+    {
+        // Will cover such cases:
+        //   - exist null, input not null
+        //   - exist null, input null; values not equal
+        //   - exist null, input null; values equal
+        //   - exist not null, input null
+        //   - exist not null, input not null; values not equal
+        //   - exist not null, input not null; values equal
+
+        // Define PrimitiveGroupValueBuilder
+        let mut builder = BooleanGroupValueBuilder::<true>::new();
+        let builder_array = Arc::new(BooleanArray::from(vec![
+            None,
+            None,
+            None,
+            Some(true),
+            Some(false),
+            Some(true),
+        ])) as ArrayRef;
+        append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5]);
+
+        // Define input array
+        let (values, _nulls) = BooleanArray::from(vec![
+            Some(true),
+            Some(false),
+            None,
+            None,
+            Some(true),
+            Some(true),
+        ])
+        .into_parts();
+
+        // explicitly build a null buffer where one of the null values also happens to match
+        let mut nulls = NullBufferBuilder::new(6);
+        nulls.append_non_null();
+        nulls.append_null(); // this sets Some(false) to null above
+        nulls.append_null();
+        nulls.append_null();
+        nulls.append_non_null();
+        nulls.append_non_null();
+        let input_array = Arc::new(BooleanArray::new(values, nulls.finish())) as ArrayRef;
+
+        // Check
+        let mut equal_to_results = vec![true; builder.len()];
+        equal_to(
+            &builder,
+            &[0, 1, 2, 3, 4, 5],
+            &input_array,
+            &[0, 1, 2, 3, 4, 5],
+            &mut equal_to_results,
+        );
+
+        assert!(!equal_to_results[0]);
+        assert!(equal_to_results[1]);
+        assert!(equal_to_results[2]);
+        assert!(!equal_to_results[3]);
+        assert!(!equal_to_results[4]);
+        assert!(equal_to_results[5]);
+    }
+
+    #[test]
+    fn test_not_nullable_primitive_equal_to() {
+        let append = |builder: &mut BooleanGroupValueBuilder<false>,
+                      builder_array: &ArrayRef,
+                      append_rows: &[usize]| {
+            for &index in append_rows {
+                builder.append_val(builder_array, index).unwrap();
+            }
+        };
+
+        let equal_to = |builder: &BooleanGroupValueBuilder<false>,
+                        lhs_rows: &[usize],
+                        input_array: &ArrayRef,
+                        rhs_rows: &[usize],
+                        equal_to_results: &mut Vec<bool>| {
+            let iter = lhs_rows.iter().zip(rhs_rows.iter());
+            for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
+                equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
+            }
+        };
+
+        test_not_nullable_boolean_equal_to_internal(append, equal_to);
+    }
+
+    #[test]
+    fn test_not_nullable_primitive_vectorized_equal_to() {
+        let append = |builder: &mut BooleanGroupValueBuilder<false>,
+                      builder_array: &ArrayRef,
+                      append_rows: &[usize]| {
+            builder
+                .vectorized_append(builder_array, append_rows)
+                .unwrap();
+        };
+
+        let equal_to = |builder: &BooleanGroupValueBuilder<false>,
+                        lhs_rows: &[usize],
+                        input_array: &ArrayRef,
+                        rhs_rows: &[usize],
+                        equal_to_results: &mut Vec<bool>| {
+            builder.vectorized_equal_to(
+                lhs_rows,
+                input_array,
+                rhs_rows,
+                equal_to_results,
+            );
+        };
+
+        test_not_nullable_boolean_equal_to_internal(append, equal_to);
+    }
+
+    fn test_not_nullable_boolean_equal_to_internal<A, E>(mut append: A, mut equal_to: E)
+    where
+        A: FnMut(&mut BooleanGroupValueBuilder<false>, &ArrayRef, &[usize]),
+        E: FnMut(
+            &BooleanGroupValueBuilder<false>,
+            &[usize],
+            &ArrayRef,
+            &[usize],
+            &mut Vec<bool>,
+        ),
+    {
+        // Will cover such cases:
+        //   - values equal
+        //   - values not equal
+
+        // Define PrimitiveGroupValueBuilder
+        let mut builder = BooleanGroupValueBuilder::<false>::new();
+        let builder_array = Arc::new(BooleanArray::from(vec![
+            Some(false),
+            Some(true),
+            Some(false),
+            Some(true),
+        ])) as ArrayRef;
+        append(&mut builder, &builder_array, &[0, 1, 2, 3]);
+
+        // Define input array
+        let input_array = Arc::new(BooleanArray::from(vec![
+            Some(false),
+            Some(false),
+            Some(true),
+            Some(true),
+        ])) as ArrayRef;
+
+        // Check
+        let mut equal_to_results = vec![true; builder.len()];
+        equal_to(
+            &builder,
+            &[0, 1, 2, 3],
+            &input_array,
+            &[0, 1, 2, 3],
+            &mut equal_to_results,
+        );
+
+        assert!(equal_to_results[0]);
+        assert!(!equal_to_results[1]);
+        assert!(!equal_to_results[2]);
+        assert!(equal_to_results[3]);
+    }
+
+    #[test]
+    fn test_nullable_boolean_vectorized_operation_special_case() {
+        // Test the special `all nulls` or `not nulls` input array case
+        // for vectorized append and equal to
+
+        let mut builder = BooleanGroupValueBuilder::<true>::new();
+
+        // All nulls input array
+        let all_nulls_input_array =
+            Arc::new(BooleanArray::from(vec![None, None, None, None, None])) as _;
+        builder
+            .vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4])
+            .unwrap();
+
+        let mut equal_to_results = vec![true; all_nulls_input_array.len()];
+        builder.vectorized_equal_to(
+            &[0, 1, 2, 3, 4],
+            &all_nulls_input_array,
+            &[0, 1, 2, 3, 4],
+            &mut equal_to_results,
+        );
+
+        assert!(equal_to_results[0]);
+        assert!(equal_to_results[1]);
+        assert!(equal_to_results[2]);
+        assert!(equal_to_results[3]);
+        assert!(equal_to_results[4]);
+
+        // All not nulls input array
+        let all_not_nulls_input_array = Arc::new(BooleanArray::from(vec![
+            Some(false),
+            Some(true),
+            Some(false),
+            Some(true),
+            Some(true),
+        ])) as _;
+        builder
+            .vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4])
+            .unwrap();
+
+        let mut equal_to_results = vec![true; all_not_nulls_input_array.len()];
+        builder.vectorized_equal_to(
+            &[5, 6, 7, 8, 9],
+            &all_not_nulls_input_array,
+            &[0, 1, 2, 3, 4],
+            &mut equal_to_results,
+        );
+
+        assert!(equal_to_results[0]);
+        assert!(equal_to_results[1]);
+        assert!(equal_to_results[2]);
+        assert!(equal_to_results[3]);
+        assert!(equal_to_results[4]);
+    }
+}
diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs
index be1f68ea453fa..cd173741b6464 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs
@@ -15,17 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn};
+use crate::aggregates::group_values::multi_group_by::{
+    GroupColumn, Nulls, nulls_equal_to,
+};
 use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
 use arrow::array::{
-    types::GenericStringType, Array, ArrayRef, AsArray, BufferBuilder,
-    GenericBinaryArray, GenericByteArray, GenericStringArray, OffsetSizeTrait,
+    Array, ArrayRef, AsArray, BufferBuilder, GenericBinaryArray, GenericByteArray,
+    GenericStringArray, OffsetSizeTrait, types::GenericStringType,
 };
 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
 use arrow::datatypes::{ByteArrayType, DataType, GenericBinaryType};
 use datafusion_common::utils::proxy::VecAllocExt;
-use datafusion_common::{DataFusionError, Result};
-use datafusion_physical_expr_common::binary_map::{OutputType, INITIAL_BUFFER_CAPACITY};
+use datafusion_common::{Result, exec_datafusion_err};
+use datafusion_physical_expr_common::binary_map::{INITIAL_BUFFER_CAPACITY, OutputType};
 use itertools::izip;
 use std::mem::size_of;
 use std::sync::Arc;
@@ -138,28 +140,28 @@ where
         let null_count = array.null_count();
         let num_rows = array.len();
         let all_null_or_non_null = if null_count == 0 {
-            Some(true)
+            Nulls::None
         } else if null_count == num_rows {
-            Some(false)
+            Nulls::All
         } else {
-            None
+            Nulls::Some
         };
 
         match all_null_or_non_null {
-            None => {
+            Nulls::Some => {
                 for &row in rows {
                     self.append_val_inner::<B>(array, row)?
                 }
             }
 
-            Some(true) => {
+            Nulls::None => {
                 self.nulls.append_n(rows.len(), false);
                 for &row in rows {
                     self.do_append_val_inner(arr, row)?;
                 }
             }
 
-            Some(false) => {
+            Nulls::All => {
                 self.nulls.append_n(rows.len(), true);
 
                 let new_len = self.offsets.len() + rows.len();
@@ -201,10 +203,10 @@ where
         self.buffer.append_slice(value);
 
         if self.buffer.len() > self.max_buffer_size {
-            return Err(DataFusionError::Execution(format!(
+            return Err(exec_datafusion_err!(
                 "offset overflow, buffer size > {}",
                 self.max_buffer_size
-            )));
+            ));
         }
 
         self.offsets.push(O::usize_as(self.buffer.len()));
@@ -633,7 +635,7 @@ mod tests {
         //   - exist not null, input not null; values not equal
         //   - exist not null, input not null; values equal
 
-        // Define PrimitiveGroupValueBuilder
+        // Define ByteGroupValueBuilder
         let mut builder = ByteGroupValueBuilder::<i32>::new(OutputType::Utf8);
         let builder_array = Arc::new(StringArray::from(vec![
             None,
diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs
index 63018874a1e40..a91dd3115d879 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn};
+use crate::aggregates::group_values::multi_group_by::{
+    GroupColumn, Nulls, nulls_equal_to,
+};
 use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
-use arrow::array::{make_view, Array, ArrayRef, AsArray, ByteView, GenericByteViewArray};
+use arrow::array::{Array, ArrayRef, AsArray, ByteView, GenericByteViewArray, make_view};
 use arrow::buffer::{Buffer, ScalarBuffer};
 use arrow::datatypes::ByteViewType;
 use datafusion_common::Result;
@@ -71,6 +73,12 @@ pub struct ByteViewGroupValueBuilder<B: ByteViewType> {
     _phantom: PhantomData<B>,
 }
 
+impl<B: ByteViewType> Default for ByteViewGroupValueBuilder<B> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
     pub fn new() -> Self {
         Self {
@@ -91,7 +99,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
 
     fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool {
         let array = array.as_byte_view::<B>();
-        self.do_equal_to_inner(lhs_row, array, rhs_row)
+        // since this is a single row comparison, don't bother specializing for nulls/buffers
+        self.do_equal_to_inner::<true, true>(lhs_row, array, rhs_row)
     }
 
     fn append_val_inner(&mut self, array: &ArrayRef, row: usize) {
@@ -109,15 +118,16 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         self.do_append_val_inner(arr, row);
     }
 
-    fn vectorized_equal_to_inner(
+    // Don't inline to keep the code small and give LLVM the best chance of
+    // vectorizing the inner loop
+    #[inline(never)]
+    fn vectorized_equal_to_inner<const HAS_NULLS: bool, const HAS_BUFFERS: bool>(
         &self,
         lhs_rows: &[usize],
-        array: &ArrayRef,
+        array: &GenericByteViewArray<B>,
         rhs_rows: &[usize],
         equal_to_results: &mut [bool],
     ) {
-        let array = array.as_byte_view::<B>();
-
         let iter = izip!(
             lhs_rows.iter(),
             rhs_rows.iter(),
@@ -130,7 +140,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
                 continue;
             }
 
-            *equal_to_result = self.do_equal_to_inner(lhs_row, array, rhs_row);
+            *equal_to_result =
+                self.do_equal_to_inner::<HAS_NULLS, HAS_BUFFERS>(lhs_row, array, rhs_row);
         }
     }
 
@@ -139,28 +150,28 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         let null_count = array.null_count();
         let num_rows = array.len();
         let all_null_or_non_null = if null_count == 0 {
-            Some(true)
+            Nulls::None
         } else if null_count == num_rows {
-            Some(false)
+            Nulls::All
         } else {
-            None
+            Nulls::Some
         };
 
         match all_null_or_non_null {
-            None => {
+            Nulls::Some => {
                 for &row in rows {
                     self.append_val_inner(array, row);
                 }
             }
 
-            Some(true) => {
+            Nulls::None => {
                 self.nulls.append_n(rows.len(), false);
                 for &row in rows {
                     self.do_append_val_inner(arr, row);
                 }
             }
 
-            Some(false) => {
+            Nulls::All => {
                 self.nulls.append_n(rows.len(), true);
                 let new_len = self.views.len() + rows.len();
                 self.views.resize(new_len, 0);
@@ -208,26 +219,42 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         }
     }
 
-    fn do_equal_to_inner(
+    /// Compare the value at `lhs_row` in this builder with
+    /// the value at `rhs_row` in input `array`
+    ///
+    /// Templated so that the inner compare loop can be
+    /// specialized based on the input array
+    #[inline(always)]
+    fn do_equal_to_inner<const HAS_NULLS: bool, const HAS_BUFFERS: bool>(
         &self,
         lhs_row: usize,
         array: &GenericByteViewArray<B>,
         rhs_row: usize,
     ) -> bool {
         // Check if nulls equal firstly
-        let exist_null = self.nulls.is_null(lhs_row);
-        let input_null = array.is_null(rhs_row);
-        if let Some(result) = nulls_equal_to(exist_null, input_null) {
-            return result;
+        if HAS_NULLS {
+            let exist_null = self.nulls.is_null(lhs_row);
+            let input_null = array.is_null(rhs_row);
+            if let Some(result) = nulls_equal_to(exist_null, input_null) {
+                return result;
+            }
         }
 
         // Otherwise, we need to check their values
-        let exist_view = self.views[lhs_row];
+
+        // SAFETY: the `lhs_row` and rhs_row` are valid
+        let exist_view = unsafe { *self.views.get_unchecked(lhs_row) };
         let exist_view_len = exist_view as u32;
 
-        let input_view = array.views()[rhs_row];
+        let input_view = unsafe { *array.views().get_unchecked(rhs_row) };
         let input_view_len = input_view as u32;
 
+        // fast path, if we know there are no buffers, then the view must be inlined
+        // so we can simply compare the u128 views
+        if !HAS_BUFFERS {
+            return exist_view == input_view;
+        }
+
         // The check logic
         //   - Check len equality
         //   - If inlined, check inlined value
@@ -238,19 +265,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         }
 
         if exist_view_len <= 12 {
-            let exist_inline = unsafe {
-                GenericByteViewArray::<B>::inline_value(
-                    &exist_view,
-                    exist_view_len as usize,
-                )
-            };
-            let input_inline = unsafe {
-                GenericByteViewArray::<B>::inline_value(
-                    &input_view,
-                    input_view_len as usize,
-                )
-            };
-            exist_inline == input_inline
+            // both inlined, so compare inlined value
+            exist_view == input_view
         } else {
             let exist_prefix =
                 unsafe { GenericByteViewArray::<B>::inline_value(&exist_view, 4) };
@@ -261,30 +277,28 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
                 return false;
             }
 
+            // get the full values and compare
             let exist_full = {
                 let byte_view = ByteView::from(exist_view);
-                self.value(
-                    byte_view.buffer_index as usize,
-                    byte_view.offset as usize,
-                    byte_view.length as usize,
-                )
+                let buffer_index = byte_view.buffer_index as usize;
+                let offset = byte_view.offset as usize;
+                let length = byte_view.length as usize;
+                debug_assert!(buffer_index <= self.completed.len());
+
+                unsafe {
+                    if buffer_index < self.completed.len() {
+                        let block = self.completed.get_unchecked(buffer_index);
+                        block.as_slice().get_unchecked(offset..offset + length)
+                    } else {
+                        self.in_progress.get_unchecked(offset..offset + length)
+                    }
+                }
             };
             let input_full: &[u8] = unsafe { array.value_unchecked(rhs_row).as_ref() };
             exist_full == input_full
         }
     }
 
-    fn value(&self, buffer_index: usize, offset: usize, length: usize) -> &[u8] {
-        debug_assert!(buffer_index <= self.completed.len());
-
-        if buffer_index < self.completed.len() {
-            let block = &self.completed[buffer_index];
-            &block[offset..offset + length]
-        } else {
-            &self.in_progress[offset..offset + length]
-        }
-    }
-
     fn build_inner(self) -> ArrayRef {
         let Self {
             views,
@@ -443,21 +457,23 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         last_take_len: usize,
     ) -> Vec<Buffer> {
         let mut take_buffers = Vec::with_capacity(last_remaining_buffer_index + 1);
+        debug_assert!(last_remaining_buffer_index <= self.completed.len());
 
-        // Take `0 ~ last_remaining_buffer_index - 1` buffers
-        if !self.completed.is_empty() || last_remaining_buffer_index == 0 {
-            take_buffers.extend(self.completed.drain(0..last_remaining_buffer_index));
-        }
-
-        // Process the `last_remaining_buffer_index` buffers
+        // Process the `last_remaining_buffer_index` buffer before draining so the index is valid.
         let last_buffer = if last_remaining_buffer_index < self.completed.len() {
             // If it is in `completed`, simply clone
             self.completed[last_remaining_buffer_index].clone()
         } else {
             // If it is `in_progress`, copied `0 ~ offset` part
+            debug_assert!(last_take_len <= self.in_progress.len());
             let taken_last_buffer = self.in_progress[0..last_take_len].to_vec();
             Buffer::from_vec(taken_last_buffer)
         };
+
+        // Take `0 ~ last_remaining_buffer_index - 1` buffers
+        if last_remaining_buffer_index > 0 {
+            take_buffers.extend(self.completed.drain(0..last_remaining_buffer_index));
+        }
         take_buffers.push(last_buffer);
 
         take_buffers
@@ -499,7 +515,36 @@ impl<B: ByteViewType> GroupColumn for ByteViewGroupValueBuilder<B> {
         rows: &[usize],
         equal_to_results: &mut [bool],
     ) {
-        self.vectorized_equal_to_inner(group_indices, array, rows, equal_to_results);
+        let has_nulls = array.null_count() != 0;
+        let array = array.as_byte_view::<B>();
+        let has_buffers = !array.data_buffers().is_empty();
+        // call specialized version based on nulls and buffers presence
+        match (has_nulls, has_buffers) {
+            (true, true) => self.vectorized_equal_to_inner::<true, true>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (true, false) => self.vectorized_equal_to_inner::<true, false>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (false, true) => self.vectorized_equal_to_inner::<false, true>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (false, false) => self.vectorized_equal_to_inner::<false, false>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+        }
     }
 
     fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) -> Result<()> {
@@ -905,4 +950,28 @@ mod tests {
         let taken_array = builder.take_n(final_ones_to_append);
         assert_eq!(&taken_array, &input_array);
     }
+
+    #[test]
+    fn test_byte_view_take_n_partial_completed_nonzero_index() {
+        let mut builder =
+            ByteViewGroupValueBuilder::<StringViewType>::new().with_max_block_size(30);
+        let input_array = StringViewArray::from(vec![
+            Some("aaaaaaaaaaaaaa"),
+            Some("bbbbbbbbbbbbbb"),
+            Some("cccccccccccccc"),
+            Some("dddddddddddddd"),
+            Some("eeeeeeeeeeeeee"),
+        ]);
+        let input_array: ArrayRef = Arc::new(input_array);
+
+        for row in 0..input_array.len() {
+            builder.append_val(&input_array, row).unwrap();
+        }
+
+        assert_eq!(builder.completed.len(), 2);
+        assert_eq!(builder.in_progress.len(), 14);
+
+        let taken_array = builder.take_n(3);
+        assert_eq!(&taken_array, &input_array.slice(0, 3));
+    }
 }
diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs
index 2ac0389454dec..cc4576eabddbd 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs
@@ -17,30 +17,31 @@
 
 //! `GroupValues` implementations for multi group by cases
 
+mod boolean;
 mod bytes;
-mod bytes_view;
-mod primitive;
+pub mod bytes_view;
+pub mod primitive;
 
 use std::mem::{self, size_of};
 
+use crate::aggregates::group_values::GroupValues;
 use crate::aggregates::group_values::multi_group_by::{
-    bytes::ByteGroupValueBuilder, bytes_view::ByteViewGroupValueBuilder,
-    primitive::PrimitiveGroupValueBuilder,
+    boolean::BooleanGroupValueBuilder, bytes::ByteGroupValueBuilder,
+    bytes_view::ByteViewGroupValueBuilder, primitive::PrimitiveGroupValueBuilder,
 };
-use crate::aggregates::group_values::GroupValues;
-use ahash::RandomState;
-use arrow::array::{Array, ArrayRef, RecordBatch};
+use arrow::array::{Array, ArrayRef};
 use arrow::compute::cast;
 use arrow::datatypes::{
     BinaryViewType, DataType, Date32Type, Date64Type, Decimal128Type, Float32Type,
-    Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, Schema, SchemaRef,
+    Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Schema, SchemaRef,
     StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
     Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
-    TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type,
-    UInt8Type,
+    TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type,
+    UInt64Type,
 };
+use datafusion_common::hash_utils::RandomState;
 use datafusion_common::hash_utils::create_hashes;
-use datafusion_common::{not_impl_err, DataFusionError, Result};
+use datafusion_common::{Result, internal_datafusion_err, not_impl_err};
 use datafusion_execution::memory_pool::proxy::{HashTableAllocExt, VecAllocExt};
 use datafusion_expr::EmitTo;
 use datafusion_physical_expr::binary_map::OutputType;
@@ -76,7 +77,6 @@ pub trait GroupColumn: Send + Sync {
     ///
     /// And if found nth result in `equal_to_results` is already
     /// `false`, the check for nth row will be skipped.
-    ///
     fn vectorized_equal_to(
         &self,
         lhs_rows: &[usize],
@@ -91,6 +91,11 @@ pub trait GroupColumn: Send + Sync {
     /// Returns the number of rows stored in this builder
     fn len(&self) -> usize;
 
+    /// true if len == 0
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
     /// Returns the number of bytes used by this [`GroupColumn`]
     fn size(&self) -> usize;
 
@@ -131,7 +136,6 @@ pub fn nulls_equal_to(lhs_null: bool, rhs_null: bool) -> Option<bool> {
 ///   +---------------------+---------------------------------------------+
 ///
 /// `inlined flag`: 1 represents `non-inlined`, and 0 represents `inlined`
-///
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 struct GroupIndexView(u64);
 
@@ -160,7 +164,6 @@ impl GroupIndexView {
 
 /// A [`GroupValues`] that stores multiple columns of group values,
 /// and supports vectorized operators for them
-///
 pub struct GroupValuesColumn<const STREAMING: bool> {
     /// The output schema
     schema: SchemaRef,
@@ -178,7 +181,6 @@ pub struct GroupValuesColumn<const STREAMING: bool> {
     /// instead we store the `group indices` pointing to values in `GroupValues`.
     /// And we use [`GroupIndexView`] to represent such `group indices` in table.
     ///
-    ///
     map: HashTable<(u64, GroupIndexView)>,
 
     /// The size of `map` in bytes
@@ -191,7 +193,6 @@ pub struct GroupValuesColumn<const STREAMING: bool> {
     ///
     /// The chained indices is like:
     ///   `latest group index -> older group index -> even older group index -> ...`
-    ///
     group_index_lists: Vec<Vec<usize>>,
 
     /// When emitting first n, we need to decrease/erase group indices in
@@ -317,7 +318,6 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
     ///
     /// `Group indices` order are against with their input order, and this will lead to error
     /// in `streaming aggregation`.
-    ///
     fn scalarized_intern(
         &mut self,
         cols: &[ArrayRef],
@@ -419,7 +419,6 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
     ///
     /// The vectorized approach can offer higher performance for avoiding row by row
     /// downcast for `cols` and being able to implement even more optimizations(like simd).
-    ///
     fn vectorized_intern(
         &mut self,
         cols: &[ArrayRef],
@@ -487,7 +486,6 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
     ///   - Check if the `group index view` is `inlined` or `non_inlined`:
     ///     If it is inlined, add to `vectorized_equal_to_group_indices` directly.
     ///     Otherwise get all group indices from `group_index_lists`, and add them.
-    ///
     fn collect_vectorized_process_context(
         &mut self,
         batch_hashes: &[u64],
@@ -542,14 +540,13 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
                 // into `vectorized_equal_to_row_indices` and `vectorized_equal_to_group_indices`.
                 let list_offset = group_index_view.value() as usize;
                 let group_index_list = &self.group_index_lists[list_offset];
-                for &group_index in group_index_list {
-                    self.vectorized_operation_buffers
-                        .equal_to_row_indices
-                        .push(row);
-                    self.vectorized_operation_buffers
-                        .equal_to_group_indices
-                        .push(group_index);
-                }
+
+                self.vectorized_operation_buffers
+                    .equal_to_group_indices
+                    .extend_from_slice(group_index_list);
+                self.vectorized_operation_buffers
+                    .equal_to_row_indices
+                    .extend(std::iter::repeat_n(row, group_index_list.len()));
             } else {
                 let group_index = group_index_view.value() as usize;
                 self.vectorized_operation_buffers
@@ -715,7 +712,6 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
     /// The hash collision may be not frequent, so the fallback will indeed hardly happen.
     /// In most situations, `scalarized_indices` will found to be empty after finishing to
     /// preform `vectorized_equal_to`.
-    ///
     fn scalarized_intern_remaining(
         &mut self,
         cols: &[ArrayRef],
@@ -880,7 +876,6 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
 /// `$v`: the vector to push the new builder into
 /// `$nullable`: whether the input can contains nulls
 /// `$t`: the primitive type of the builder
-///
 macro_rules! instantiate_primitive {
     ($v:expr, $nullable:expr, $t:ty, $data_type:ident) => {
         if $nullable {
@@ -1042,8 +1037,17 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
                         let b = ByteViewGroupValueBuilder::<BinaryViewType>::new();
                         v.push(Box::new(b) as _)
                     }
+                    &DataType::Boolean => {
+                        if nullable {
+                            let b = BooleanGroupValueBuilder::<true>::new();
+                            v.push(Box::new(b) as _)
+                        } else {
+                            let b = BooleanGroupValueBuilder::<false>::new();
+                            v.push(Box::new(b) as _)
+                        }
+                    }
                     dt => {
-                        return not_impl_err!("{dt} not supported in GroupValuesColumn")
+                        return not_impl_err!("{dt} not supported in GroupValuesColumn");
                     }
                 }
             }
@@ -1165,9 +1169,9 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
             if let DataType::Dictionary(_, v) = expected {
                 let actual = array.data_type();
                 if v.as_ref() != actual {
-                    return Err(DataFusionError::Internal(format!(
+                    return Err(internal_datafusion_err!(
                         "Converted group rows expected dictionary of {v} got {actual}"
-                    )));
+                    ));
                 }
                 *array = cast(array.as_ref(), expected)?;
             }
@@ -1176,14 +1180,13 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
         Ok(output)
     }
 
-    fn clear_shrink(&mut self, batch: &RecordBatch) {
-        let count = batch.num_rows();
+    fn clear_shrink(&mut self, num_rows: usize) {
         self.group_values.clear();
         self.map.clear();
-        self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
+        self.map.shrink_to(num_rows, |_| 0); // hasher does not matter since the map is cleared
         self.map_size = self.map.capacity() * size_of::<(u64, usize)>();
         self.hashes_buffer.clear();
-        self.hashes_buffer.shrink_to(count);
+        self.hashes_buffer.shrink_to(num_rows);
 
         // Such structures are only used in `non-streaming` case
         if !STREAMING {
@@ -1231,9 +1234,20 @@ fn supported_type(data_type: &DataType) -> bool {
             | DataType::Timestamp(_, _)
             | DataType::Utf8View
             | DataType::BinaryView
+            | DataType::Boolean
     )
 }
 
+///Shows how many `null`s there are in an array
+enum Nulls {
+    /// All array items are `null`s
+    All,
+    /// There are both `null`s and non-`null`s in the array items
+    Some,
+    /// There are no `null`s in the array items
+    None,
+}
+
 #[cfg(test)]
 mod tests {
     use std::{collections::HashMap, sync::Arc};
@@ -1245,7 +1259,7 @@ mod tests {
     use datafusion_expr::EmitTo;
 
     use crate::aggregates::group_values::{
-        multi_group_by::GroupValuesColumn, GroupValues,
+        GroupValues, multi_group_by::GroupValuesColumn,
     };
 
     use super::GroupIndexView;
@@ -1442,7 +1456,6 @@ mod tests {
     ///   - Group not exist + bucket not found in `map`
     ///   - Group not exist + not equal to inlined group view(tested in hash collision)
     ///   - Group not exist + not equal to non-inlined group view(tested in hash collision)
-    ///
     struct VectorizedTestDataSet {
         test_batches: Vec<Vec<ArrayRef>>,
         expected_batch: RecordBatch,
@@ -1736,16 +1749,19 @@ mod tests {
     }
 
     fn check_result(actual_batch: &RecordBatch, expected_batch: &RecordBatch) {
-        let formatted_actual_batch = pretty_format_batches(&[actual_batch.clone()])
-            .unwrap()
-            .to_string();
+        let formatted_actual_batch =
+            pretty_format_batches(std::slice::from_ref(actual_batch))
+                .unwrap()
+                .to_string();
         let mut formatted_actual_batch_sorted: Vec<&str> =
             formatted_actual_batch.trim().lines().collect();
         formatted_actual_batch_sorted.sort_unstable();
 
-        let formatted_expected_batch = pretty_format_batches(&[expected_batch.clone()])
-            .unwrap()
-            .to_string();
+        let formatted_expected_batch =
+            pretty_format_batches(std::slice::from_ref(expected_batch))
+                .unwrap()
+                .to_string();
+
         let mut formatted_expected_batch_sorted: Vec<&str> =
             formatted_expected_batch.trim().lines().collect();
         formatted_expected_batch_sorted.sort_unstable();
diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs
index 22d5987380a83..31126348b3fd4 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs
@@ -15,9 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn};
+use crate::aggregates::group_values::multi_group_by::{
+    GroupColumn, Nulls, nulls_equal_to,
+};
 use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
-use arrow::array::{cast::AsArray, Array, ArrayRef, ArrowPrimitiveType, PrimitiveArray};
+use arrow::array::ArrowNativeTypeOp;
+use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, PrimitiveArray, cast::AsArray};
 use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::DataType;
 use datafusion_common::Result;
@@ -53,6 +56,85 @@ where
             nulls: MaybeNullBufferBuilder::new(),
         }
     }
+
+    fn vectorized_equal_to_non_nullable(
+        &self,
+        lhs_rows: &[usize],
+        array: &ArrayRef,
+        rhs_rows: &[usize],
+        equal_to_results: &mut [bool],
+    ) {
+        assert!(
+            !NULLABLE || (array.null_count() == 0 && !self.nulls.might_have_nulls()),
+            "called with nullable input"
+        );
+        let array_values = array.as_primitive::<T>().values();
+
+        let iter = izip!(
+            lhs_rows.iter(),
+            rhs_rows.iter(),
+            equal_to_results.iter_mut(),
+        );
+
+        for (&lhs_row, &rhs_row, equal_to_result) in iter {
+            let result = {
+                // Getting unchecked not only for bound checks but because the bound checks are
+                // what prevents auto-vectorization
+                let left = if cfg!(debug_assertions) {
+                    self.group_values[lhs_row]
+                } else {
+                    // SAFETY: indices are guaranteed to be in bounds
+                    unsafe { *self.group_values.get_unchecked(lhs_row) }
+                };
+                let right = if cfg!(debug_assertions) {
+                    array_values[rhs_row]
+                } else {
+                    // SAFETY: indices are guaranteed to be in bounds
+                    unsafe { *array_values.get_unchecked(rhs_row) }
+                };
+
+                // Always evaluate, to allow for auto-vectorization
+                left.is_eq(right)
+            };
+
+            *equal_to_result = result && *equal_to_result;
+        }
+    }
+
+    pub fn vectorized_equal_nullable(
+        &self,
+        lhs_rows: &[usize],
+        array: &ArrayRef,
+        rhs_rows: &[usize],
+        equal_to_results: &mut [bool],
+    ) {
+        assert!(NULLABLE, "called with non-nullable input");
+        let array = array.as_primitive::<T>();
+
+        let iter = izip!(
+            lhs_rows.iter(),
+            rhs_rows.iter(),
+            equal_to_results.iter_mut(),
+        );
+
+        for (&lhs_row, &rhs_row, equal_to_result) in iter {
+            // Has found not equal to in previous column, don't need to check
+            if !*equal_to_result {
+                continue;
+            }
+
+            // Perf: skip null check (by short circuit) if input is not nullable
+            let exist_null = self.nulls.is_null(lhs_row);
+            let input_null = array.is_null(rhs_row);
+            if let Some(result) = nulls_equal_to(exist_null, input_null) {
+                *equal_to_result = result;
+                continue;
+            }
+
+            // Otherwise, we need to check their values
+            *equal_to_result = self.group_values[lhs_row].is_eq(array.value(rhs_row));
+        }
+    }
 }
 
 impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn
@@ -69,7 +151,7 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn
             // Otherwise, we need to check their values
         }
 
-        self.group_values[lhs_row] == array.as_primitive::<T>().value(rhs_row)
+        self.group_values[lhs_row].is_eq(array.as_primitive::<T>().value(rhs_row))
     }
 
     fn append_val(&mut self, array: &ArrayRef, row: usize) -> Result<()> {
@@ -96,32 +178,15 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn
         rhs_rows: &[usize],
         equal_to_results: &mut [bool],
     ) {
-        let array = array.as_primitive::<T>();
-
-        let iter = izip!(
-            lhs_rows.iter(),
-            rhs_rows.iter(),
-            equal_to_results.iter_mut(),
-        );
-
-        for (&lhs_row, &rhs_row, equal_to_result) in iter {
-            // Has found not equal to in previous column, don't need to check
-            if !*equal_to_result {
-                continue;
-            }
-
-            // Perf: skip null check (by short circuit) if input is not nullable
-            if NULLABLE {
-                let exist_null = self.nulls.is_null(lhs_row);
-                let input_null = array.is_null(rhs_row);
-                if let Some(result) = nulls_equal_to(exist_null, input_null) {
-                    *equal_to_result = result;
-                    continue;
-                }
-                // Otherwise, we need to check their values
-            }
-
-            *equal_to_result = self.group_values[lhs_row] == array.value(rhs_row);
+        if !NULLABLE || (array.null_count() == 0 && !self.nulls.might_have_nulls()) {
+            self.vectorized_equal_to_non_nullable(
+                lhs_rows,
+                array,
+                rhs_rows,
+                equal_to_results,
+            );
+        } else {
+            self.vectorized_equal_nullable(lhs_rows, array, rhs_rows, equal_to_results);
         }
     }
 
@@ -131,15 +196,15 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn
         let null_count = array.null_count();
         let num_rows = array.len();
         let all_null_or_non_null = if null_count == 0 {
-            Some(true)
+            Nulls::None
         } else if null_count == num_rows {
-            Some(false)
+            Nulls::All
         } else {
-            None
+            Nulls::Some
         };
 
         match (NULLABLE, all_null_or_non_null) {
-            (true, None) => {
+            (true, Nulls::Some) => {
                 for &row in rows {
                     if array.is_null(row) {
                         self.nulls.append(true);
@@ -151,14 +216,14 @@ impl<T: ArrowPrimitiveType, const NULLABLE: bool> GroupColumn
                 }
             }
 
-            (true, Some(true)) => {
+            (true, Nulls::None) => {
                 self.nulls.append_n(rows.len(), false);
                 for &row in rows {
                     self.group_values.push(arr.value(row));
                 }
             }
 
-            (true, Some(false)) => {
+            (true, Nulls::All) => {
                 self.nulls.append_n(rows.len(), true);
                 self.group_values
                     .extend(iter::repeat_n(T::default_value(), rows.len()));
@@ -216,14 +281,14 @@ mod tests {
     use std::sync::Arc;
 
     use crate::aggregates::group_values::multi_group_by::primitive::PrimitiveGroupValueBuilder;
-    use arrow::array::{ArrayRef, Int64Array, NullBufferBuilder};
-    use arrow::datatypes::{DataType, Int64Type};
+    use arrow::array::{ArrayRef, Float32Array, Int64Array, NullBufferBuilder};
+    use arrow::datatypes::{DataType, Float32Type, Int64Type};
 
     use super::GroupColumn;
 
     #[test]
     fn test_nullable_primitive_equal_to() {
-        let append = |builder: &mut PrimitiveGroupValueBuilder<Int64Type, true>,
+        let append = |builder: &mut PrimitiveGroupValueBuilder<Float32Type, true>,
                       builder_array: &ArrayRef,
                       append_rows: &[usize]| {
             for &index in append_rows {
@@ -231,7 +296,7 @@ mod tests {
             }
         };
 
-        let equal_to = |builder: &PrimitiveGroupValueBuilder<Int64Type, true>,
+        let equal_to = |builder: &PrimitiveGroupValueBuilder<Float32Type, true>,
                         lhs_rows: &[usize],
                         input_array: &ArrayRef,
                         rhs_rows: &[usize],
@@ -247,7 +312,7 @@ mod tests {
 
     #[test]
     fn test_nullable_primitive_vectorized_equal_to() {
-        let append = |builder: &mut PrimitiveGroupValueBuilder<Int64Type, true>,
+        let append = |builder: &mut PrimitiveGroupValueBuilder<Float32Type, true>,
                       builder_array: &ArrayRef,
                       append_rows: &[usize]| {
             builder
@@ -255,7 +320,7 @@ mod tests {
                 .unwrap();
         };
 
-        let equal_to = |builder: &PrimitiveGroupValueBuilder<Int64Type, true>,
+        let equal_to = |builder: &PrimitiveGroupValueBuilder<Float32Type, true>,
                         lhs_rows: &[usize],
                         input_array: &ArrayRef,
                         rhs_rows: &[usize],
@@ -273,9 +338,9 @@ mod tests {
 
     fn test_nullable_primitive_equal_to_internal<A, E>(mut append: A, mut equal_to: E)
     where
-        A: FnMut(&mut PrimitiveGroupValueBuilder<Int64Type, true>, &ArrayRef, &[usize]),
+        A: FnMut(&mut PrimitiveGroupValueBuilder<Float32Type, true>, &ArrayRef, &[usize]),
         E: FnMut(
-            &PrimitiveGroupValueBuilder<Int64Type, true>,
+            &PrimitiveGroupValueBuilder<Float32Type, true>,
             &[usize],
             &ArrayRef,
             &[usize],
@@ -292,48 +357,58 @@ mod tests {
 
         // Define PrimitiveGroupValueBuilder
         let mut builder =
-            PrimitiveGroupValueBuilder::<Int64Type, true>::new(DataType::Int64);
-        let builder_array = Arc::new(Int64Array::from(vec![
+            PrimitiveGroupValueBuilder::<Float32Type, true>::new(DataType::Float32);
+        let builder_array = Arc::new(Float32Array::from(vec![
             None,
             None,
             None,
-            Some(1),
-            Some(2),
-            Some(3),
+            Some(1.0),
+            Some(2.0),
+            Some(f32::NAN),
+            Some(3.0),
         ])) as ArrayRef;
-        append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5]);
+        append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5, 6]);
 
         // Define input array
-        let (_nulls, values, _) =
-            Int64Array::from(vec![Some(1), Some(2), None, None, Some(1), Some(3)])
-                .into_parts();
+        let (_, values, _nulls) = Float32Array::from(vec![
+            Some(1.0),
+            Some(2.0),
+            None,
+            Some(1.0),
+            None,
+            Some(f32::NAN),
+            None,
+        ])
+        .into_parts();
 
         // explicitly build a null buffer where one of the null values also happens to match
         let mut nulls = NullBufferBuilder::new(6);
         nulls.append_non_null();
         nulls.append_null(); // this sets Some(2) to null above
         nulls.append_null();
-        nulls.append_null();
         nulls.append_non_null();
+        nulls.append_null();
         nulls.append_non_null();
-        let input_array = Arc::new(Int64Array::new(values, nulls.finish())) as ArrayRef;
+        nulls.append_null();
+        let input_array = Arc::new(Float32Array::new(values, nulls.finish())) as ArrayRef;
 
         // Check
         let mut equal_to_results = vec![true; builder.len()];
         equal_to(
             &builder,
-            &[0, 1, 2, 3, 4, 5],
+            &[0, 1, 2, 3, 4, 5, 6],
             &input_array,
-            &[0, 1, 2, 3, 4, 5],
+            &[0, 1, 2, 3, 4, 5, 6],
             &mut equal_to_results,
         );
 
         assert!(!equal_to_results[0]);
         assert!(equal_to_results[1]);
         assert!(equal_to_results[2]);
-        assert!(!equal_to_results[3]);
+        assert!(equal_to_results[3]);
         assert!(!equal_to_results[4]);
         assert!(equal_to_results[5]);
+        assert!(!equal_to_results[6]);
     }
 
     #[test]
diff --git a/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs b/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs
index 23ffc69f218bf..6a84d685b6c79 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs
@@ -89,4 +89,12 @@ impl MaybeNullBufferBuilder {
         new_builder.truncate(n);
         new_builder.finish()
     }
+
+    /// Returns true if this builder might have any nulls
+    ///
+    /// This is guaranteed to be true if there are nulls
+    /// but may be true even if there are no nulls
+    pub(crate) fn might_have_nulls(&self) -> bool {
+        self.nulls.as_slice().is_some()
+    }
 }
diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs
index 34893fcc4ed98..3dcf4e1240b1d 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/row.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs
@@ -16,13 +16,13 @@
 // under the License.
 
 use crate::aggregates::group_values::GroupValues;
-use ahash::RandomState;
-use arrow::array::{Array, ArrayRef, ListArray, RecordBatch, StructArray};
+use arrow::array::{Array, ArrayRef, ListArray, StructArray};
 use arrow::compute::cast;
 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::row::{RowConverter, Rows, SortField};
-use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::Result;
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::hash_utils::create_hashes;
 use datafusion_execution::memory_pool::proxy::{HashTableAllocExt, VecAllocExt};
 use datafusion_expr::EmitTo;
 use hashbrown::hash_table::HashTable;
@@ -236,30 +236,28 @@ impl GroupValues for GroupValuesRows {
         // https://github.com/apache/datafusion/issues/7647
         for (field, array) in self.schema.fields.iter().zip(&mut output) {
             let expected = field.data_type();
-            *array =
-                dictionary_encode_if_necessary(Arc::<dyn Array>::clone(array), expected)?;
+            *array = dictionary_encode_if_necessary(array, expected)?;
         }
 
         self.group_values = Some(group_values);
         Ok(output)
     }
 
-    fn clear_shrink(&mut self, batch: &RecordBatch) {
-        let count = batch.num_rows();
+    fn clear_shrink(&mut self, num_rows: usize) {
         self.group_values = self.group_values.take().map(|mut rows| {
             rows.clear();
             rows
         });
         self.map.clear();
-        self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
+        self.map.shrink_to(num_rows, |_| 0); // hasher does not matter since the map is cleared
         self.map_size = self.map.capacity() * size_of::<(u64, usize)>();
         self.hashes_buffer.clear();
-        self.hashes_buffer.shrink_to(count);
+        self.hashes_buffer.shrink_to(num_rows);
     }
 }
 
 fn dictionary_encode_if_necessary(
-    array: ArrayRef,
+    array: &ArrayRef,
     expected: &DataType,
 ) -> Result<ArrayRef> {
     match (expected, array.data_type()) {
@@ -269,10 +267,7 @@ fn dictionary_encode_if_necessary(
                 .iter()
                 .zip(struct_array.columns())
                 .map(|(expected_field, column)| {
-                    dictionary_encode_if_necessary(
-                        Arc::<dyn Array>::clone(column),
-                        expected_field.data_type(),
-                    )
+                    dictionary_encode_if_necessary(column, expected_field.data_type())
                 })
                 .collect::<Result<Vec<_>>>()?;
 
@@ -289,13 +284,13 @@ fn dictionary_encode_if_necessary(
                 Arc::<arrow::datatypes::Field>::clone(expected_field),
                 list.offsets().clone(),
                 dictionary_encode_if_necessary(
-                    Arc::<dyn Array>::clone(list.values()),
+                    list.values(),
                     expected_field.data_type(),
                 )?,
                 list.nulls().cloned(),
             )?))
         }
         (DataType::Dictionary(_, _), _) => Ok(cast(array.as_ref(), expected)?),
-        (_, _) => Ok(Arc::<dyn Array>::clone(&array)),
+        (_, _) => Ok(Arc::<dyn Array>::clone(array)),
     }
 }
diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/boolean.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/boolean.rs
new file mode 100644
index 0000000000000..e993c0c53d199
--- /dev/null
+++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/boolean.rs
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::aggregates::group_values::GroupValues;
+
+use arrow::array::{
+    ArrayRef, AsArray as _, BooleanArray, BooleanBufferBuilder, NullBufferBuilder,
+};
+use datafusion_common::Result;
+use datafusion_expr::EmitTo;
+use std::{mem::size_of, sync::Arc};
+
+#[derive(Debug)]
+pub struct GroupValuesBoolean {
+    false_group: Option<usize>,
+    true_group: Option<usize>,
+    null_group: Option<usize>,
+}
+
+impl GroupValuesBoolean {
+    pub fn new() -> Self {
+        Self {
+            false_group: None,
+            true_group: None,
+            null_group: None,
+        }
+    }
+}
+
+impl GroupValues for GroupValuesBoolean {
+    fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
+        let array = cols[0].as_boolean();
+        groups.clear();
+
+        for value in array.iter() {
+            let index = match value {
+                Some(false) => {
+                    if let Some(index) = self.false_group {
+                        index
+                    } else {
+                        let index = self.len();
+                        self.false_group = Some(index);
+                        index
+                    }
+                }
+                Some(true) => {
+                    if let Some(index) = self.true_group {
+                        index
+                    } else {
+                        let index = self.len();
+                        self.true_group = Some(index);
+                        index
+                    }
+                }
+                None => {
+                    if let Some(index) = self.null_group {
+                        index
+                    } else {
+                        let index = self.len();
+                        self.null_group = Some(index);
+                        index
+                    }
+                }
+            };
+
+            groups.push(index);
+        }
+
+        Ok(())
+    }
+
+    fn size(&self) -> usize {
+        size_of::<Self>()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    fn len(&self) -> usize {
+        self.false_group.is_some() as usize
+            + self.true_group.is_some() as usize
+            + self.null_group.is_some() as usize
+    }
+
+    fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        let len = self.len();
+        let mut builder = BooleanBufferBuilder::new(len);
+        let emit_count = match emit_to {
+            EmitTo::All => len,
+            EmitTo::First(n) => n,
+        };
+        builder.append_n(emit_count, false);
+        if let Some(idx) = self.true_group.as_mut() {
+            if *idx < emit_count {
+                builder.set_bit(*idx, true);
+                self.true_group = None;
+            } else {
+                *idx -= emit_count;
+            }
+        }
+
+        if let Some(idx) = self.false_group.as_mut() {
+            if *idx < emit_count {
+                // already false, no need to set
+                self.false_group = None;
+            } else {
+                *idx -= emit_count;
+            }
+        }
+
+        let values = builder.finish();
+
+        let nulls = if let Some(idx) = self.null_group.as_mut() {
+            if *idx < emit_count {
+                let mut buffer = NullBufferBuilder::new(len);
+                buffer.append_n_non_nulls(*idx);
+                buffer.append_null();
+                buffer.append_n_non_nulls(emit_count - *idx - 1);
+
+                self.null_group = None;
+                Some(buffer.finish().unwrap())
+            } else {
+                *idx -= emit_count;
+                None
+            }
+        } else {
+            None
+        };
+
+        Ok(vec![Arc::new(BooleanArray::new(values, nulls)) as _])
+    }
+
+    fn clear_shrink(&mut self, _num_rows: usize) {
+        self.false_group = None;
+        self.true_group = None;
+        self.null_group = None;
+    }
+}
diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs
index 9686b8c3521d2..b881a51b25474 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs
@@ -15,24 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::mem::size_of;
+
 use crate::aggregates::group_values::GroupValues;
-use arrow::array::{Array, ArrayRef, OffsetSizeTrait, RecordBatch};
+
+use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
+use datafusion_common::Result;
 use datafusion_expr::EmitTo;
 use datafusion_physical_expr_common::binary_map::{ArrowBytesMap, OutputType};
-use std::mem::size_of;
 
 /// A [`GroupValues`] storing single column of Utf8/LargeUtf8/Binary/LargeBinary values
 ///
 /// This specialization is significantly faster than using the more general
 /// purpose `Row`s format
-pub struct GroupValuesByes<O: OffsetSizeTrait> {
+pub struct GroupValuesBytes<O: OffsetSizeTrait> {
     /// Map string/binary values to group index
     map: ArrowBytesMap<O, usize>,
     /// The total number of groups so far (used to assign group_index)
     num_groups: usize,
 }
 
-impl<O: OffsetSizeTrait> GroupValuesByes<O> {
+impl<O: OffsetSizeTrait> GroupValuesBytes<O> {
     pub fn new(output_type: OutputType) -> Self {
         Self {
             map: ArrowBytesMap::new(output_type),
@@ -41,12 +44,8 @@ impl<O: OffsetSizeTrait> GroupValuesByes<O> {
     }
 }
 
-impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {
-    fn intern(
-        &mut self,
-        cols: &[ArrayRef],
-        groups: &mut Vec<usize>,
-    ) -> datafusion_common::Result<()> {
+impl<O: OffsetSizeTrait> GroupValues for GroupValuesBytes<O> {
+    fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
         assert_eq!(cols.len(), 1);
 
         // look up / add entries in the table
@@ -85,7 +84,7 @@ impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {
         self.num_groups
     }
 
-    fn emit(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> {
+    fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
         // Reset the map to default, and convert it into a single array
         let map_contents = self.map.take().into_state();
 
@@ -121,7 +120,7 @@ impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {
         Ok(vec![group_values])
     }
 
-    fn clear_shrink(&mut self, _batch: &RecordBatch) {
+    fn clear_shrink(&mut self, _num_rows: usize) {
         // in theory we could potentially avoid this reallocation and clear the
         // contents of the maps, but for now we just reset the map from the beginning
         self.map.take();
diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs
index be9a0334e3ee6..7a56f7c52c11a 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::aggregates::group_values::GroupValues;
-use arrow::array::{Array, ArrayRef, RecordBatch};
+use arrow::array::{Array, ArrayRef};
 use datafusion_expr::EmitTo;
 use datafusion_physical_expr::binary_map::OutputType;
 use datafusion_physical_expr_common::binary_view_map::ArrowBytesViewMap;
@@ -122,7 +122,7 @@ impl GroupValues for GroupValuesBytesView {
         Ok(vec![group_values])
     }
 
-    fn clear_shrink(&mut self, _batch: &RecordBatch) {
+    fn clear_shrink(&mut self, _num_rows: usize) {
         // in theory we could potentially avoid this reallocation and clear the
         // contents of the maps, but for now we just reset the map from the beginning
         self.map.take();
diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs
index 417618ba66af4..89c6b624e8e0a 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs
@@ -17,6 +17,7 @@
 
 //! `GroupValues` implementations for single group by cases
 
+pub(crate) mod boolean;
 pub(crate) mod bytes;
 pub(crate) mod bytes_view;
 pub(crate) mod primitive;
diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs
index 8b1905e540416..4686648fb1e3d 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs
@@ -16,19 +16,19 @@
 // under the License.
 
 use crate::aggregates::group_values::GroupValues;
-use ahash::RandomState;
 use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::array::{
-    cast::AsArray, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, NullBufferBuilder,
-    PrimitiveArray,
+    ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, NullBufferBuilder, PrimitiveArray,
+    cast::AsArray,
 };
-use arrow::datatypes::{i256, DataType};
-use arrow::record_batch::RecordBatch;
+use arrow::datatypes::{DataType, i256};
 use datafusion_common::Result;
+use datafusion_common::hash_utils::RandomState;
 use datafusion_execution::memory_pool::proxy::VecAllocExt;
 use datafusion_expr::EmitTo;
 use half::f16;
 use hashbrown::hash_table::HashTable;
+use std::hash::BuildHasher;
 use std::mem::size_of;
 use std::sync::Arc;
 
@@ -87,7 +87,6 @@ pub struct GroupValuesPrimitive<T: ArrowPrimitiveType> {
     /// is obvious in high cardinality group by situation.
     /// More details can see:
     /// <https://github.com/apache/datafusion/issues/15961>
-    ///
     map: HashTable<(usize, u64)>,
     /// The group index of the null value if any
     null_group: Option<usize>,
@@ -130,7 +129,9 @@ where
                     let hash = key.hash(state);
                     let insert = self.map.entry(
                         hash,
-                        |&(g, _)| unsafe { self.values.get_unchecked(g).is_eq(key) },
+                        |&(g, h)| unsafe {
+                            hash == h && self.values.get_unchecked(g).is_eq(key)
+                        },
                         |&(_, h)| h,
                     );
 
@@ -214,11 +215,10 @@ where
         Ok(vec![Arc::new(array.with_data_type(self.data_type.clone()))])
     }
 
-    fn clear_shrink(&mut self, batch: &RecordBatch) {
-        let count = batch.num_rows();
+    fn clear_shrink(&mut self, num_rows: usize) {
         self.values.clear();
-        self.values.shrink_to(count);
+        self.values.shrink_to(num_rows);
         self.map.clear();
-        self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
+        self.map.shrink_to(num_rows, |_| 0); // hasher does not matter since the map is cleared
     }
 }
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 656c9a2cd5cb6..42df1a8b07cd4 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -26,46 +26,119 @@ use crate::aggregates::{
     topk_stream::GroupedTopKAggregateStream,
 };
 use crate::execution_plan::{CardinalityEffect, EmissionType};
+use crate::filter_pushdown::{
+    ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation, PushedDownPredicate,
+};
 use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
-use crate::windows::get_ordered_partition_by_indices;
 use crate::{
     DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
-    SendableRecordBatchStream, Statistics,
+    SendableRecordBatchStream, Statistics, check_if_same_properties,
 };
+use datafusion_common::config::ConfigOptions;
+use datafusion_physical_expr::utils::collect_columns;
+use parking_lot::Mutex;
+use std::collections::HashSet;
 
-use arrow::array::{ArrayRef, UInt16Array, UInt32Array, UInt64Array, UInt8Array};
-use arrow::datatypes::{Field, Schema, SchemaRef};
+use arrow::array::{ArrayRef, UInt8Array, UInt16Array, UInt32Array, UInt64Array};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use arrow_schema::FieldRef;
 use datafusion_common::stats::Precision;
-use datafusion_common::{internal_err, not_impl_err, Constraint, Constraints, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    Constraint, Constraints, Result, ScalarValue, assert_eq_or_internal_err, not_impl_err,
+};
 use datafusion_execution::TaskContext;
 use datafusion_expr::{Accumulator, Aggregate};
 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
+use datafusion_physical_expr::equivalence::ProjectionMapping;
+use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr, lit};
 use datafusion_physical_expr::{
-    equivalence::ProjectionMapping, expressions::Column, physical_exprs_contains,
-    ConstExpr, EquivalenceProperties, LexOrdering, LexRequirement, PhysicalExpr,
-    PhysicalSortRequirement,
+    ConstExpr, EquivalenceProperties, physical_exprs_contains,
+};
+use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, fmt_sql};
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortRequirement,
 };
 
-use datafusion_physical_expr_common::physical_expr::fmt_sql;
+use datafusion_expr::utils::AggregateOrderSensitivity;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use itertools::Itertools;
+use topk::hash_table::is_supported_hash_key_type;
+use topk::heap::is_supported_heap_type;
 
-pub(crate) mod group_values;
+pub mod group_values;
 mod no_grouping;
 pub mod order;
 mod row_hash;
 mod topk;
 mod topk_stream;
 
+/// Returns true if TopK aggregation data structures support the provided key and value types.
+///
+/// This function checks whether both the key type (used for grouping) and value type
+/// (used in min/max aggregation) can be handled by the TopK aggregation heap and hash table.
+/// Supported types include Arrow primitives (integers, floats, decimals, intervals) and
+/// UTF-8 strings (`Utf8`, `LargeUtf8`, `Utf8View`).
+/// ```text
+pub fn topk_types_supported(key_type: &DataType, value_type: &DataType) -> bool {
+    is_supported_hash_key_type(key_type) && is_supported_heap_type(value_type)
+}
+
 /// Hard-coded seed for aggregations to ensure hash values differ from `RepartitionExec`, avoiding collisions.
-const AGGREGATION_HASH_SEED: ahash::RandomState =
-    ahash::RandomState::with_seeds('A' as u64, 'G' as u64, 'G' as u64, 'R' as u64);
+const AGGREGATION_HASH_SEED: datafusion_common::hash_utils::RandomState =
+    // This seed is chosen to be a large 64-bit number
+    datafusion_common::hash_utils::RandomState::with_seed(15395726432021054657);
+
+/// Whether an aggregate stage consumes raw input data or intermediate
+/// accumulator state from a previous aggregation stage.
+///
+/// See the [table on `AggregateMode`](AggregateMode#variants-and-their-inputoutput-modes)
+/// for how this relates to aggregate modes.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum AggregateInputMode {
+    /// The stage consumes raw, unaggregated input data and calls
+    /// [`Accumulator::update_batch`].
+    Raw,
+    /// The stage consumes intermediate accumulator state from a previous
+    /// aggregation stage and calls [`Accumulator::merge_batch`].
+    Partial,
+}
+
+/// Whether an aggregate stage produces intermediate accumulator state
+/// or final output values.
+///
+/// See the [table on `AggregateMode`](AggregateMode#variants-and-their-inputoutput-modes)
+/// for how this relates to aggregate modes.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum AggregateOutputMode {
+    /// The stage produces intermediate accumulator state, serialized via
+    /// [`Accumulator::state`].
+    Partial,
+    /// The stage produces final output values via
+    /// [`Accumulator::evaluate`].
+    Final,
+}
 
 /// Aggregation modes
 ///
 /// See [`Accumulator::state`] for background information on multi-phase
 /// aggregation and how these modes are used.
+///
+/// # Variants and their input/output modes
+///
+/// Each variant can be characterized by its [`AggregateInputMode`] and
+/// [`AggregateOutputMode`]:
+///
+/// ```text
+///                       | Input: Raw data           | Input: Partial state
+/// Output: Final values  | Single, SinglePartitioned | Final, FinalPartitioned
+/// Output: Partial state | Partial                   | PartialReduce
+/// ```
+///
+/// Use [`AggregateMode::input_mode`] and [`AggregateMode::output_mode`]
+/// to query these properties.
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum AggregateMode {
     /// One of multiple layers of aggregation, any input partitioning
@@ -117,18 +190,56 @@ pub enum AggregateMode {
     /// This mode requires that the input has more than one partition, and is
     /// partitioned by group key (like FinalPartitioned).
     SinglePartitioned,
+    /// Combine multiple partial aggregations to produce a new partial
+    /// aggregation.
+    ///
+    /// Input is intermediate accumulator state (like Final), but output is
+    /// also intermediate accumulator state (like Partial). This enables
+    /// tree-reduce aggregation strategies where partial results from
+    /// multiple workers are combined in multiple stages before a final
+    /// evaluation.
+    ///
+    /// ```text
+    ///               Final
+    ///            /        \
+    ///     PartialReduce   PartialReduce
+    ///     /         \      /         \
+    ///  Partial   Partial  Partial   Partial
+    /// ```
+    PartialReduce,
 }
 
 impl AggregateMode {
-    /// Checks whether this aggregation step describes a "first stage" calculation.
-    /// In other words, its input is not another aggregation result and the
-    /// `merge_batch` method will not be called for these modes.
-    pub fn is_first_stage(&self) -> bool {
+    /// Returns the [`AggregateInputMode`] for this mode: whether this
+    /// stage consumes raw input data or intermediate accumulator state.
+    ///
+    /// See the [table above](AggregateMode#variants-and-their-inputoutput-modes)
+    /// for details.
+    pub fn input_mode(&self) -> AggregateInputMode {
         match self {
             AggregateMode::Partial
             | AggregateMode::Single
-            | AggregateMode::SinglePartitioned => true,
-            AggregateMode::Final | AggregateMode::FinalPartitioned => false,
+            | AggregateMode::SinglePartitioned => AggregateInputMode::Raw,
+            AggregateMode::Final
+            | AggregateMode::FinalPartitioned
+            | AggregateMode::PartialReduce => AggregateInputMode::Partial,
+        }
+    }
+
+    /// Returns the [`AggregateOutputMode`] for this mode: whether this
+    /// stage produces intermediate accumulator state or final output values.
+    ///
+    /// See the [table above](AggregateMode#variants-and-their-inputoutput-modes)
+    /// for details.
+    pub fn output_mode(&self) -> AggregateOutputMode {
+        match self {
+            AggregateMode::Final
+            | AggregateMode::FinalPartitioned
+            | AggregateMode::Single
+            | AggregateMode::SinglePartitioned => AggregateOutputMode::Final,
+            AggregateMode::Partial | AggregateMode::PartialReduce => {
+                AggregateOutputMode::Partial
+            }
         }
     }
 }
@@ -162,6 +273,9 @@ pub struct PhysicalGroupBy {
     /// expression in null_expr. If `groups[i][j]` is true, then the
     /// j-th expression in the i-th group is NULL, otherwise it is `expr[j]`.
     groups: Vec<Vec<bool>>,
+    /// True when GROUPING SETS/CUBE/ROLLUP are used so `__grouping_id` should
+    /// be included in the output schema.
+    has_grouping_set: bool,
 }
 
 impl PhysicalGroupBy {
@@ -170,11 +284,13 @@ impl PhysicalGroupBy {
         expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
         null_expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
         groups: Vec<Vec<bool>>,
+        has_grouping_set: bool,
     ) -> Self {
         Self {
             expr,
             null_expr,
             groups,
+            has_grouping_set,
         }
     }
 
@@ -186,6 +302,7 @@ impl PhysicalGroupBy {
             expr,
             null_expr: vec![],
             groups: vec![vec![false; num_exprs]],
+            has_grouping_set: false,
         }
     }
 
@@ -202,6 +319,11 @@ impl PhysicalGroupBy {
         exprs_nullable
     }
 
+    /// Returns true if this has no grouping at all (including no GROUPING SETS)
+    pub fn is_true_no_grouping(&self) -> bool {
+        self.is_empty() && !self.has_grouping_set
+    }
+
     /// Returns the group expressions
     pub fn expr(&self) -> &[(Arc<dyn PhysicalExpr>, String)] {
         &self.expr
@@ -217,14 +339,20 @@ impl PhysicalGroupBy {
         &self.groups
     }
 
+    /// Returns true if this grouping uses GROUPING SETS, CUBE or ROLLUP.
+    pub fn has_grouping_set(&self) -> bool {
+        self.has_grouping_set
+    }
+
     /// Returns true if this `PhysicalGroupBy` has no group expressions
     pub fn is_empty(&self) -> bool {
         self.expr.is_empty()
     }
 
-    /// Check whether grouping set is single group
+    /// Returns true if this is a "simple" GROUP BY (not using GROUPING SETS/CUBE/ROLLUP).
+    /// This determines whether the `__grouping_id` column is included in the output schema.
     pub fn is_single(&self) -> bool {
-        self.null_expr.is_empty()
+        !self.has_grouping_set
     }
 
     /// Calculate GROUP BY expressions according to input schema.
@@ -238,7 +366,7 @@ impl PhysicalGroupBy {
     /// The number of expressions in the output schema.
     fn num_output_exprs(&self) -> usize {
         let mut num_exprs = self.expr.len();
-        if !self.is_single() {
+        if self.has_grouping_set {
             num_exprs += 1
         }
         num_exprs
@@ -255,7 +383,7 @@ impl PhysicalGroupBy {
                 .take(num_output_exprs)
                 .map(|(index, (_, name))| Arc::new(Column::new(name, index)) as _),
         );
-        if !self.is_single() {
+        if self.has_grouping_set {
             output_exprs.push(Arc::new(Column::new(
                 Aggregate::INTERNAL_GROUPING_ID,
                 self.expr.len(),
@@ -265,12 +393,8 @@ impl PhysicalGroupBy {
     }
 
     /// Returns the number expression as grouping keys.
-    fn num_group_exprs(&self) -> usize {
-        if self.is_single() {
-            self.expr.len()
-        } else {
-            self.expr.len() + 1
-        }
+    pub fn num_group_exprs(&self) -> usize {
+        self.expr.len() + usize::from(self.has_grouping_set)
     }
 
     pub fn group_schema(&self, schema: &Schema) -> Result<SchemaRef> {
@@ -293,7 +417,7 @@ impl PhysicalGroupBy {
                 .into(),
             );
         }
-        if !self.is_single() {
+        if self.has_grouping_set {
             fields.push(
                 Field::new(
                     Aggregate::INTERNAL_GROUPING_ID,
@@ -329,10 +453,17 @@ impl PhysicalGroupBy {
                 )
                 .collect();
         let num_exprs = expr.len();
+        let groups = if self.expr.is_empty() && !self.has_grouping_set {
+            // No GROUP BY expressions - should have no groups
+            vec![]
+        } else {
+            vec![vec![false; num_exprs]]
+        };
         Self {
             expr,
             null_expr: vec![],
-            groups: vec![vec![false; num_exprs]],
+            groups,
+            has_grouping_set: false,
         }
     }
 }
@@ -352,10 +483,11 @@ impl PartialEq for PhysicalGroupBy {
                 .zip(other.null_expr.iter())
                 .all(|((expr1, name1), (expr2, name2))| expr1.eq(expr2) && name1 == name2)
             && self.groups == other.groups
+            && self.has_grouping_set == other.has_grouping_set
     }
 }
 
-#[allow(clippy::large_enum_variant)]
+#[expect(clippy::large_enum_variant)]
 enum StreamType {
     AggregateStream(AggregateStream),
     GroupedHash(GroupedHashAggregateStream),
@@ -372,19 +504,140 @@ impl From<StreamType> for SendableRecordBatchStream {
     }
 }
 
+/// # Aggregate Dynamic Filter Pushdown Overview
+///
+/// For queries like
+///   -- `example_table(type TEXT, val INT)`
+///   SELECT min(val)
+///   FROM example_table
+///   WHERE type='A';
+///
+/// And `example_table`'s physical representation is a partitioned parquet file with
+/// column statistics
+/// - part-0.parquet: val {min=0, max=100}
+/// - part-1.parquet: val {min=100, max=200}
+/// - ...
+/// - part-100.parquet: val {min=10000, max=10100}
+///
+/// After scanning the 1st file, we know we only have to read files if their minimal
+/// value on `val` column is less than 0, the minimal `val` value in the 1st file.
+///
+/// We can skip scanning the remaining file by implementing dynamic filter, the
+/// intuition is we keep a shared data structure for current min in both `AggregateExec`
+/// and `DataSourceExec`, and let it update during execution, so the scanner can
+/// know during execution if it's possible to skip scanning certain files. See
+/// physical optimizer rule `FilterPushdown` for details.
+///
+/// # Implementation
+///
+/// ## Enable Condition
+/// - No grouping (no `GROUP BY` clause in the sql, only a single global group to aggregate)
+/// - The aggregate expression must be `min`/`max`, and evaluate directly on columns.
+///   Note multiple aggregate expressions that satisfy this requirement are allowed,
+///   and a dynamic filter will be constructed combining all applicable expr's
+///   states. See more in the following example with dynamic filter on multiple columns.
+///
+/// ## Filter Construction
+/// The filter is kept in the `DataSourceExec`, and it will gets update during execution,
+/// the reader will interpret it as "the upstream only needs rows that such filter
+/// predicate is evaluated to true", and certain scanner implementation like `parquet`
+/// can evalaute column statistics on those dynamic filters, to decide if they can
+/// prune a whole range.
+///
+/// ### Examples
+/// - Expr: `min(a)`, Dynamic Filter: `a < a_cur_min`
+/// - Expr: `min(a), max(a), min(b)`, Dynamic Filter: `(a < a_cur_min) OR (a > a_cur_max) OR (b < b_cur_min)`
+#[derive(Debug, Clone)]
+struct AggrDynFilter {
+    /// The physical expr for the dynamic filter shared between the `AggregateExec`
+    /// and the parquet scanner.
+    filter: Arc<DynamicFilterPhysicalExpr>,
+    /// The current bounds for the dynamic filter, updates during the execution to
+    /// tighten the bound for more effective pruning.
+    ///
+    /// Each vector element is for the accumulators that support dynamic filter.
+    /// e.g. This `AggregateExec` has accumulator:
+    /// min(a), avg(a), max(b)
+    /// And this field stores [PerAccumulatorDynFilter(min(a)), PerAccumulatorDynFilter(min(b))]
+    supported_accumulators_info: Vec<PerAccumulatorDynFilter>,
+}
+
+// ---- Aggregate Dynamic Filter Utility Structs ----
+
+/// Aggregate expressions that support the dynamic filter pushdown in aggregation.
+/// See comments in [`AggrDynFilter`] for conditions.
+#[derive(Debug, Clone)]
+struct PerAccumulatorDynFilter {
+    aggr_type: DynamicFilterAggregateType,
+    /// During planning and optimization, the parent structure is kept in `AggregateExec`,
+    /// this index is into `aggr_expr` vec inside `AggregateExec`.
+    /// During execution, the parent struct is moved into `AggregateStream` (stream
+    /// for no grouping aggregate execution), and this index is into    `aggregate_expressions`
+    /// vec inside `AggregateStreamInner`
+    aggr_index: usize,
+    // The current bound. Shared among all streams.
+    shared_bound: Arc<Mutex<ScalarValue>>,
+}
+
+/// Aggregate types that are supported for dynamic filter in `AggregateExec`
+#[derive(Debug, Clone)]
+enum DynamicFilterAggregateType {
+    Min,
+    Max,
+}
+
+/// Configuration for limit-based optimizations in aggregation
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct LimitOptions {
+    /// The maximum number of rows to return
+    pub limit: usize,
+    /// Optional ordering direction (true = descending, false = ascending)
+    /// This is used for TopK aggregation to maintain a priority queue with the correct ordering
+    pub descending: Option<bool>,
+}
+
+impl LimitOptions {
+    /// Create a new LimitOptions with a limit and no specific ordering
+    pub fn new(limit: usize) -> Self {
+        Self {
+            limit,
+            descending: None,
+        }
+    }
+
+    /// Create a new LimitOptions with a limit and ordering direction
+    pub fn new_with_order(limit: usize, descending: bool) -> Self {
+        Self {
+            limit,
+            descending: Some(descending),
+        }
+    }
+
+    pub fn limit(&self) -> usize {
+        self.limit
+    }
+
+    pub fn descending(&self) -> Option<bool> {
+        self.descending
+    }
+}
+
 /// Hash aggregate execution plan
 #[derive(Debug, Clone)]
 pub struct AggregateExec {
     /// Aggregation mode (full, partial)
     mode: AggregateMode,
     /// Group by expressions
-    group_by: PhysicalGroupBy,
+    /// [`Arc`] used for a cheap clone, which improves physical plan optimization performance.
+    group_by: Arc<PhysicalGroupBy>,
     /// Aggregate expressions
-    aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
+    /// The same reason to [`Arc`] it as for [`Self::group_by`].
+    aggr_expr: Arc<[Arc<AggregateFunctionExpr>]>,
     /// FILTER (WHERE clause) expression for each aggregate expression
-    filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>,
-    /// Set if the output of this aggregation is truncated by a upstream sort/limit clause
-    limit: Option<usize>,
+    /// The same reason to [`Arc`] it as for [`Self::group_by`].
+    filter_expr: Arc<[Option<Arc<dyn PhysicalExpr>>]>,
+    /// Configuration for limit-based optimizations
+    limit_options: Option<LimitOptions>,
     /// Input plan, could be a partial aggregate or the input to the aggregate
     pub input: Arc<dyn ExecutionPlan>,
     /// Schema after the aggregate is applied
@@ -397,10 +650,17 @@ pub struct AggregateExec {
     pub input_schema: SchemaRef,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
-    required_input_ordering: Option<LexRequirement>,
+    required_input_ordering: Option<OrderingRequirements>,
     /// Describes how the input is ordered relative to the group by columns
     input_order_mode: InputOrderMode,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+    /// During initialization, if the plan supports dynamic filtering (see [`AggrDynFilter`]),
+    /// it is set to `Some(..)` regardless of whether it can be pushed down to a child node.
+    ///
+    /// During filter pushdown optimization, if a child node can accept this filter,
+    /// it remains `Some(..)` to enable dynamic filtering during aggregate execution;
+    /// otherwise, it is cleared to `None`.
+    dynamic_filter: Option<Arc<AggrDynFilter>>,
 }
 
 impl AggregateExec {
@@ -409,22 +669,43 @@ impl AggregateExec {
     /// Rewrites aggregate exec with new aggregate expressions.
     pub fn with_new_aggr_exprs(
         &self,
-        aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
+        aggr_expr: impl Into<Arc<[Arc<AggregateFunctionExpr>]>>,
     ) -> Self {
         Self {
-            aggr_expr,
+            aggr_expr: aggr_expr.into(),
+            // clone the rest of the fields
+            required_input_ordering: self.required_input_ordering.clone(),
+            metrics: ExecutionPlanMetricsSet::new(),
+            input_order_mode: self.input_order_mode.clone(),
+            cache: Arc::clone(&self.cache),
+            mode: self.mode,
+            group_by: Arc::clone(&self.group_by),
+            filter_expr: Arc::clone(&self.filter_expr),
+            limit_options: self.limit_options,
+            input: Arc::clone(&self.input),
+            schema: Arc::clone(&self.schema),
+            input_schema: Arc::clone(&self.input_schema),
+            dynamic_filter: self.dynamic_filter.clone(),
+        }
+    }
+
+    /// Clone this exec, overriding only the limit hint.
+    pub fn with_new_limit_options(&self, limit_options: Option<LimitOptions>) -> Self {
+        Self {
+            limit_options,
             // clone the rest of the fields
             required_input_ordering: self.required_input_ordering.clone(),
             metrics: ExecutionPlanMetricsSet::new(),
             input_order_mode: self.input_order_mode.clone(),
-            cache: self.cache.clone(),
+            cache: Arc::clone(&self.cache),
             mode: self.mode,
-            group_by: self.group_by.clone(),
-            filter_expr: self.filter_expr.clone(),
-            limit: self.limit,
+            group_by: Arc::clone(&self.group_by),
+            aggr_expr: Arc::clone(&self.aggr_expr),
+            filter_expr: Arc::clone(&self.filter_expr),
             input: Arc::clone(&self.input),
             schema: Arc::clone(&self.schema),
             input_schema: Arc::clone(&self.input_schema),
+            dynamic_filter: self.dynamic_filter.clone(),
         }
     }
 
@@ -435,12 +716,13 @@ impl AggregateExec {
     /// Create a new hash aggregate execution plan
     pub fn try_new(
         mode: AggregateMode,
-        group_by: PhysicalGroupBy,
+        group_by: impl Into<Arc<PhysicalGroupBy>>,
         aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
         filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>,
         input: Arc<dyn ExecutionPlan>,
         input_schema: SchemaRef,
     ) -> Result<Self> {
+        let group_by = group_by.into();
         let schema = create_schema(&input.schema(), &group_by, &aggr_expr, mode)?;
 
         let schema = Arc::new(schema);
@@ -463,20 +745,26 @@ impl AggregateExec {
     /// a rule may re-write aggregate expressions (e.g. reverse them) during
     /// initialization, field names may change inadvertently if one re-creates
     /// the schema in such cases.
-    #[allow(clippy::too_many_arguments)]
     fn try_new_with_schema(
         mode: AggregateMode,
-        group_by: PhysicalGroupBy,
+        group_by: impl Into<Arc<PhysicalGroupBy>>,
         mut aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
-        filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>,
+        filter_expr: impl Into<Arc<[Option<Arc<dyn PhysicalExpr>>]>>,
         input: Arc<dyn ExecutionPlan>,
         input_schema: SchemaRef,
         schema: SchemaRef,
     ) -> Result<Self> {
+        let group_by = group_by.into();
+        let filter_expr = filter_expr.into();
+
         // Make sure arguments are consistent in size
-        if aggr_expr.len() != filter_expr.len() {
-            return internal_err!("Inconsistent aggregate expr: {:?} and filter expr: {:?} for AggregateExec, their size should match", aggr_expr, filter_expr);
-        }
+        assert_eq_or_internal_err!(
+            aggr_expr.len(),
+            filter_expr.len(),
+            "Inconsistent aggregate expr: {:?} and filter expr: {:?} for AggregateExec, their size should match",
+            aggr_expr,
+            filter_expr
+        );
 
         let input_eq_properties = input.equivalence_properties();
         // Get GROUP BY expressions:
@@ -484,16 +772,14 @@ impl AggregateExec {
         // If existing ordering satisfies a prefix of the GROUP BY expressions,
         // prefix requirements with this section. In this case, aggregation will
         // work more efficiently.
-        let indices = get_ordered_partition_by_indices(&groupby_exprs, &input);
-        let mut new_requirement = LexRequirement::new(
-            indices
-                .iter()
-                .map(|&idx| PhysicalSortRequirement {
-                    expr: Arc::clone(&groupby_exprs[idx]),
-                    options: None,
-                })
-                .collect::<Vec<_>>(),
-        );
+        // Copy the `PhysicalSortExpr`s to retain the sort options.
+        let (new_sort_exprs, indices) =
+            input_eq_properties.find_longest_permutation(&groupby_exprs)?;
+
+        let mut new_requirements = new_sort_exprs
+            .into_iter()
+            .map(PhysicalSortRequirement::from)
+            .collect::<Vec<_>>();
 
         let req = get_finer_aggregate_exprs_requirement(
             &mut aggr_expr,
@@ -501,8 +787,10 @@ impl AggregateExec {
             input_eq_properties,
             &mode,
         )?;
-        new_requirement.inner.extend(req);
-        new_requirement = new_requirement.collapse();
+        new_requirements.extend(req);
+
+        let required_input_ordering =
+            LexRequirement::new(new_requirements).map(OrderingRequirements::new_soft);
 
         // If our aggregation has grouping sets then our base grouping exprs will
         // be expanded based on the flags in `group_by.groups` where for each
@@ -527,10 +815,7 @@ impl AggregateExec {
 
         // construct a map from the input expression to the output expression of the Aggregation group by
         let group_expr_mapping =
-            ProjectionMapping::try_new(&group_by.expr, &input.schema())?;
-
-        let required_input_ordering =
-            (!new_requirement.is_empty()).then_some(new_requirement);
+            ProjectionMapping::try_new(group_by.expr.clone(), &input.schema())?;
 
         let cache = Self::compute_properties(
             &input,
@@ -538,23 +823,28 @@ impl AggregateExec {
             &group_expr_mapping,
             &mode,
             &input_order_mode,
-            aggr_expr.as_slice(),
-        );
+            aggr_expr.as_ref(),
+        )?;
 
-        Ok(AggregateExec {
+        let mut exec = AggregateExec {
             mode,
             group_by,
-            aggr_expr,
+            aggr_expr: aggr_expr.into(),
             filter_expr,
             input,
             schema,
             input_schema,
             metrics: ExecutionPlanMetricsSet::new(),
             required_input_ordering,
-            limit: None,
+            limit_options: None,
             input_order_mode,
-            cache,
-        })
+            cache: Arc::new(cache),
+            dynamic_filter: None,
+        };
+
+        exec.init_dynamic_filter();
+
+        Ok(exec)
     }
 
     /// Aggregation mode (full, partial)
@@ -562,11 +852,17 @@ impl AggregateExec {
         &self.mode
     }
 
-    /// Set the `limit` of this AggExec
-    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
-        self.limit = limit;
+    /// Set the limit options for this AggExec
+    pub fn with_limit_options(mut self, limit_options: Option<LimitOptions>) -> Self {
+        self.limit_options = limit_options;
         self
     }
+
+    /// Get the limit options (if set)
+    pub fn limit_options(&self) -> Option<LimitOptions> {
+        self.limit_options
+    }
+
     /// Grouping expressions
     pub fn group_expr(&self) -> &PhysicalGroupBy {
         &self.group_by
@@ -597,30 +893,24 @@ impl AggregateExec {
         Arc::clone(&self.input_schema)
     }
 
-    /// number of rows soft limit of the AggregateExec
-    pub fn limit(&self) -> Option<usize> {
-        self.limit
-    }
-
     fn execute_typed(
         &self,
         partition: usize,
-        context: Arc<TaskContext>,
+        context: &Arc<TaskContext>,
     ) -> Result<StreamType> {
-        // no group by at all
-        if self.group_by.expr.is_empty() {
+        if self.group_by.is_true_no_grouping() {
             return Ok(StreamType::AggregateStream(AggregateStream::new(
                 self, context, partition,
             )?));
         }
 
         // grouping by an expression that has a sort/limit upstream
-        if let Some(limit) = self.limit {
-            if !self.is_unordered_unfiltered_group_by_distinct() {
-                return Ok(StreamType::GroupedPriorityQueue(
-                    GroupedTopKAggregateStream::new(self, context, partition, limit)?,
-                ));
-            }
+        if let Some(config) = self.limit_options
+            && !self.is_unordered_unfiltered_group_by_distinct()
+        {
+            return Ok(StreamType::GroupedPriorityQueue(
+                GroupedTopKAggregateStream::new(self, context, partition, config.limit)?,
+            ));
         }
 
         // grouping by something else and we need to just materialize all results
@@ -640,8 +930,15 @@ impl AggregateExec {
     /// This method qualifies the use of the LimitedDistinctAggregation rewrite rule
     /// on an AggregateExec.
     pub fn is_unordered_unfiltered_group_by_distinct(&self) -> bool {
+        if self
+            .limit_options()
+            .and_then(|config| config.descending)
+            .is_some()
+        {
+            return false;
+        }
         // ensure there is a group by
-        if self.group_expr().is_empty() {
+        if self.group_expr().is_empty() && !self.group_expr().has_grouping_set() {
             return false;
         }
         // ensure there are no aggregate expressions
@@ -654,7 +951,7 @@ impl AggregateExec {
             return false;
         }
         // ensure there are no order by expressions
-        if self.aggr_expr().iter().any(|e| e.order_bys().is_some()) {
+        if !self.aggr_expr().iter().all(|e| e.order_bys().is_empty()) {
             return false;
         }
         // ensure there is no output ordering; can this rule be relaxed?
@@ -662,8 +959,8 @@ impl AggregateExec {
             return false;
         }
         // ensure no ordering is required on the input
-        if self.required_input_ordering()[0].is_some() {
-            return false;
+        if let Some(requirement) = self.required_input_ordering().swap_remove(0) {
+            return matches!(requirement, OrderingRequirements::Hard(_));
         }
         true
     }
@@ -676,7 +973,7 @@ impl AggregateExec {
         mode: &AggregateMode,
         input_order_mode: &InputOrderMode,
         aggr_exprs: &[Arc<AggregateFunctionExpr>],
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Construct equivalence properties:
         let mut eq_properties = input
             .equivalence_properties()
@@ -684,13 +981,12 @@ impl AggregateExec {
 
         // If the group by is empty, then we ensure that the operator will produce
         // only one row, and mark the generated result as a constant value.
-        if group_expr_mapping.map.is_empty() {
-            let mut constants = eq_properties.constants().to_vec();
+        if group_expr_mapping.is_empty() {
             let new_constants = aggr_exprs.iter().enumerate().map(|(idx, func)| {
-                ConstExpr::new(Arc::new(Column::new(func.name(), idx)))
+                let column = Arc::new(Column::new(func.name(), idx));
+                ConstExpr::from(column as Arc<dyn PhysicalExpr>)
             });
-            constants.extend(new_constants);
-            eq_properties = eq_properties.with_constants(constants);
+            eq_properties.add_constants(new_constants)?;
         }
 
         // Group by expression will be a distinct value after the aggregation.
@@ -698,13 +994,11 @@ impl AggregateExec {
         let mut constraints = eq_properties.constraints().to_vec();
         let new_constraint = Constraint::Unique(
             group_expr_mapping
-                .map
                 .iter()
-                .filter_map(|(_, target_col)| {
-                    target_col
-                        .as_any()
-                        .downcast_ref::<Column>()
-                        .map(|c| c.index())
+                .flat_map(|(_, target_cols)| {
+                    target_cols.iter().flat_map(|(expr, _)| {
+                        expr.as_any().downcast_ref::<Column>().map(|c| c.index())
+                    })
                 })
                 .collect(),
         );
@@ -714,14 +1008,15 @@ impl AggregateExec {
 
         // Get output partitioning:
         let input_partitioning = input.output_partitioning().clone();
-        let output_partitioning = if mode.is_first_stage() {
-            // First stage aggregation will not change the output partitioning,
-            // but needs to respect aliases (e.g. mapping in the GROUP BY
-            // expression).
-            let input_eq_properties = input.equivalence_properties();
-            input_partitioning.project(group_expr_mapping, input_eq_properties)
-        } else {
-            input_partitioning.clone()
+        let output_partitioning = match mode.input_mode() {
+            AggregateInputMode::Raw => {
+                // First stage aggregation will not change the output partitioning,
+                // but needs to respect aliases (e.g. mapping in the GROUP BY
+                // expression).
+                let input_eq_properties = input.equivalence_properties();
+                input_partitioning.project(group_expr_mapping, input_eq_properties)
+            }
+            AggregateInputMode::Partial => input_partitioning.clone(),
         };
 
         // TODO: Emission type and boundedness information can be enhanced here
@@ -731,19 +1026,19 @@ impl AggregateExec {
             input.pipeline_behavior()
         };
 
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             emission_type,
             input.boundedness(),
-        )
+        ))
     }
 
     pub fn input_order_mode(&self) -> &InputOrderMode {
         &self.input_order_mode
     }
 
-    fn statistics_inner(&self, child_statistics: Statistics) -> Result<Statistics> {
+    fn statistics_inner(&self, child_statistics: &Statistics) -> Result<Statistics> {
         // TODO stats: group expressions:
         // - once expressions will be able to compute their own stats, use it here
         // - case where we group by on a column for which with have the `distinct` stat
@@ -774,10 +1069,13 @@ impl AggregateExec {
             AggregateMode::Final | AggregateMode::FinalPartitioned
                 if self.group_by.expr.is_empty() =>
             {
+                let total_byte_size =
+                    Self::calculate_scaled_byte_size(child_statistics, 1);
+
                 Ok(Statistics {
                     num_rows: Precision::Exact(1),
                     column_statistics,
-                    total_byte_size: Precision::Absent,
+                    total_byte_size,
                 })
             }
             _ => {
@@ -797,14 +1095,118 @@ impl AggregateExec {
                 } else {
                     Precision::Absent
                 };
+
+                let total_byte_size = num_rows
+                    .get_value()
+                    .and_then(|&output_rows| {
+                        Self::calculate_scaled_byte_size(child_statistics, output_rows)
+                            .get_value()
+                            .map(|&bytes| Precision::Inexact(bytes))
+                    })
+                    .unwrap_or(Precision::Absent);
+
                 Ok(Statistics {
                     num_rows,
                     column_statistics,
-                    total_byte_size: Precision::Absent,
+                    total_byte_size,
                 })
             }
         }
     }
+
+    /// Check if dynamic filter is possible for the current plan node.
+    /// - If yes, init one inside `AggregateExec`'s `dynamic_filter` field.
+    /// - If not supported, `self.dynamic_filter` should be kept `None`
+    fn init_dynamic_filter(&mut self) {
+        if (!self.group_by.is_empty()) || (self.mode != AggregateMode::Partial) {
+            debug_assert!(
+                self.dynamic_filter.is_none(),
+                "The current operator node does not support dynamic filter"
+            );
+            return;
+        }
+
+        // Already initialized.
+        if self.dynamic_filter.is_some() {
+            return;
+        }
+
+        // Collect supported accumulators
+        // It is assumed the order of aggregate expressions are not changed from `AggregateExec`
+        // to `AggregateStream`
+        let mut aggr_dyn_filters = Vec::new();
+        // All column references in the dynamic filter, used when initializing the dynamic
+        // filter, and it's used to decide if this dynamic filter is able to get push
+        // through certain node during optimization.
+        let mut all_cols: Vec<Arc<dyn PhysicalExpr>> = Vec::new();
+        for (i, aggr_expr) in self.aggr_expr.iter().enumerate() {
+            // 1. Only `min` or `max` aggregate function
+            let fun_name = aggr_expr.fun().name();
+            // HACK: Should check the function type more precisely
+            // Issue: <https://github.com/apache/datafusion/issues/18643>
+            let aggr_type = if fun_name.eq_ignore_ascii_case("min") {
+                DynamicFilterAggregateType::Min
+            } else if fun_name.eq_ignore_ascii_case("max") {
+                DynamicFilterAggregateType::Max
+            } else {
+                return;
+            };
+
+            // 2. arg should be only 1 column reference
+            if let [arg] = aggr_expr.expressions().as_slice()
+                && arg.as_any().is::<Column>()
+            {
+                all_cols.push(Arc::clone(arg));
+                aggr_dyn_filters.push(PerAccumulatorDynFilter {
+                    aggr_type,
+                    aggr_index: i,
+                    shared_bound: Arc::new(Mutex::new(ScalarValue::Null)),
+                });
+            }
+        }
+
+        if !aggr_dyn_filters.is_empty() {
+            self.dynamic_filter = Some(Arc::new(AggrDynFilter {
+                filter: Arc::new(DynamicFilterPhysicalExpr::new(all_cols, lit(true))),
+                supported_accumulators_info: aggr_dyn_filters,
+            }))
+        }
+    }
+
+    /// Calculate scaled byte size based on row count ratio.
+    /// Returns `Precision::Absent` if input statistics are insufficient.
+    /// Returns `Precision::Inexact` with the scaled value otherwise.
+    ///
+    /// This is a simple heuristic that assumes uniform row sizes.
+    #[inline]
+    fn calculate_scaled_byte_size(
+        input_stats: &Statistics,
+        target_row_count: usize,
+    ) -> Precision<usize> {
+        match (
+            input_stats.num_rows.get_value(),
+            input_stats.total_byte_size.get_value(),
+        ) {
+            (Some(&input_rows), Some(&input_bytes)) if input_rows > 0 => {
+                let bytes_per_row = input_bytes as f64 / input_rows as f64;
+                let scaled_bytes =
+                    (bytes_per_row * target_row_count as f64).ceil() as usize;
+                Precision::Inexact(scaled_bytes)
+            }
+            _ => Precision::Absent,
+        }
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for AggregateExec {
@@ -864,8 +1266,8 @@ impl DisplayAs for AggregateExec {
                     .map(|agg| agg.name().to_string())
                     .collect();
                 write!(f, ", aggr=[{}]", a.join(", "))?;
-                if let Some(limit) = self.limit {
-                    write!(f, ", lim=[{limit}]")?;
+                if let Some(config) = self.limit_options {
+                    write!(f, ", lim=[{}]", config.limit)?;
                 }
 
                 if self.input_order_mode != InputOrderMode::Linear {
@@ -924,6 +1326,9 @@ impl DisplayAs for AggregateExec {
                 if !a.is_empty() {
                     writeln!(f, "aggr={}", a.join(", "))?;
                 }
+                if let Some(config) = self.limit_options {
+                    writeln!(f, "limit={}", config.limit)?;
+                }
             }
         }
         Ok(())
@@ -940,13 +1345,13 @@ impl ExecutionPlan for AggregateExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
     fn required_input_distribution(&self) -> Vec<Distribution> {
         match &self.mode {
-            AggregateMode::Partial => {
+            AggregateMode::Partial | AggregateMode::PartialReduce => {
                 vec![Distribution::UnspecifiedDistribution]
             }
             AggregateMode::FinalPartitioned | AggregateMode::SinglePartitioned => {
@@ -958,7 +1363,7 @@ impl ExecutionPlan for AggregateExec {
         }
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         vec![self.required_input_ordering.clone()]
     }
 
@@ -979,20 +1384,53 @@ impl ExecutionPlan for AggregateExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to group by expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for expr in self.group_by.input_exprs() {
+            tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+        }
+
+        // Apply to aggregate expressions
+        for aggr in self.aggr_expr.iter() {
+            for expr in aggr.expressions() {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+        }
+
+        // Apply to filter expressions (FILTER WHERE clauses)
+        for filter in self.filter_expr.iter().flatten() {
+            tnr = tnr.visit_sibling(|| f(filter.as_ref()))?;
+        }
+
+        // Apply to dynamic filter expression if present
+        if let Some(dyn_filter) = &self.dynamic_filter {
+            tnr = tnr.visit_sibling(|| f(dyn_filter.filter.as_ref()))?;
+        }
+
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
+
         let mut me = AggregateExec::try_new_with_schema(
             self.mode,
-            self.group_by.clone(),
-            self.aggr_expr.clone(),
-            self.filter_expr.clone(),
+            Arc::clone(&self.group_by),
+            self.aggr_expr.to_vec(),
+            Arc::clone(&self.filter_expr),
             Arc::clone(&children[0]),
             Arc::clone(&self.input_schema),
             Arc::clone(&self.schema),
         )?;
-        me.limit = self.limit;
+        me.limit_options = self.limit_options;
+        me.dynamic_filter.clone_from(&self.dynamic_filter);
 
         Ok(Arc::new(me))
     }
@@ -1002,7 +1440,7 @@ impl ExecutionPlan for AggregateExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        self.execute_typed(partition, context)
+        self.execute_typed(partition, &context)
             .map(|stream| stream.into())
     }
 
@@ -1010,17 +1448,156 @@ impl ExecutionPlan for AggregateExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        self.statistics_inner(self.input().partition_statistics(partition)?)
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let child_statistics = self.input().partition_statistics(partition)?;
+        Ok(Arc::new(self.statistics_inner(&child_statistics)?))
     }
 
     fn cardinality_effect(&self) -> CardinalityEffect {
         CardinalityEffect::LowerEqual
     }
+
+    /// Push down parent filters when possible (see implementation comment for details),
+    /// and also pushdown self dynamic filters (see `AggrDynFilter` for details)
+    fn gather_filters_for_pushdown(
+        &self,
+        phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        // It's safe to push down filters through aggregates when filters only reference
+        // grouping columns, because such filters determine which groups to compute, not
+        // *how* to compute them. Each group's aggregate values (SUM, COUNT, etc.) are
+        // calculated from the same input rows regardless of whether we filter before or
+        // after grouping - filtering before just eliminates entire groups early.
+        // This optimization is NOT safe for filters on aggregated columns (like filtering on
+        // the result of SUM or COUNT), as those require computing all groups first.
+
+        let grouping_columns: HashSet<_> = self
+            .group_by
+            .expr()
+            .iter()
+            .flat_map(|(expr, _)| collect_columns(expr))
+            .collect();
+
+        // Analyze each filter separately to determine if it can be pushed down
+        let mut safe_filters = Vec::new();
+        let mut unsafe_filters = Vec::new();
+
+        for filter in parent_filters {
+            let filter_columns: HashSet<_> =
+                collect_columns(&filter).into_iter().collect();
+
+            // Check if this filter references non-grouping columns
+            let references_non_grouping = !grouping_columns.is_empty()
+                && !filter_columns.is_subset(&grouping_columns);
+
+            if references_non_grouping {
+                unsafe_filters.push(filter);
+                continue;
+            }
+
+            // For GROUPING SETS, verify this filter's columns appear in all grouping sets
+            if self.group_by.groups().len() > 1 {
+                let filter_column_indices: Vec<usize> = filter_columns
+                    .iter()
+                    .filter_map(|filter_col| {
+                        self.group_by.expr().iter().position(|(expr, _)| {
+                            collect_columns(expr).contains(filter_col)
+                        })
+                    })
+                    .collect();
+
+                // Check if any of this filter's columns are missing from any grouping set
+                let has_missing_column = self.group_by.groups().iter().any(|null_mask| {
+                    filter_column_indices
+                        .iter()
+                        .any(|&idx| null_mask.get(idx) == Some(&true))
+                });
+
+                if has_missing_column {
+                    unsafe_filters.push(filter);
+                    continue;
+                }
+            }
+
+            // This filter is safe to push down
+            safe_filters.push(filter);
+        }
+
+        // Build child filter description with both safe and unsafe filters
+        let child = self.children()[0];
+        let mut child_desc = ChildFilterDescription::from_child(&safe_filters, child)?;
+
+        // Add unsafe filters as unsupported
+        child_desc.parent_filters.extend(
+            unsafe_filters
+                .into_iter()
+                .map(PushedDownPredicate::unsupported),
+        );
+
+        // Include self dynamic filter when it's possible
+        if phase == FilterPushdownPhase::Post
+            && config.optimizer.enable_aggregate_dynamic_filter_pushdown
+            && let Some(self_dyn_filter) = &self.dynamic_filter
+        {
+            let dyn_filter = Arc::clone(&self_dyn_filter.filter);
+            child_desc = child_desc.with_self_filter(dyn_filter);
+        }
+
+        Ok(FilterDescription::new().with_child(child_desc))
+    }
+
+    /// If child accepts self's dynamic filter, keep `self.dynamic_filter` with Some,
+    /// otherwise clear it to None.
+    fn handle_child_pushdown_result(
+        &self,
+        phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        let mut result = FilterPushdownPropagation::if_any(child_pushdown_result.clone());
+
+        // If this node tried to pushdown some dynamic filter before, now we check
+        // if the child accept the filter
+        if phase == FilterPushdownPhase::Post
+            && let Some(dyn_filter) = &self.dynamic_filter
+        {
+            // let child_accepts_dyn_filter = child_pushdown_result
+            //     .self_filters
+            //     .first()
+            //     .map(|filters| {
+            //         assert_eq_or_internal_err!(
+            //             filters.len(),
+            //             1,
+            //             "Aggregate only pushdown one self dynamic filter"
+            //         );
+            //         let filter = filters.get(0).unwrap(); // Asserted above
+            //         Ok(matches!(filter.discriminant, PushedDown::Yes))
+            //     })
+            //     .unwrap_or_else(|| internal_err!("The length of self filters equals to the number of child of this ExecutionPlan, so it must be 1"))?;
+
+            // HACK: The above snippet should be used, however, now the child reply
+            // `PushDown::No` can indicate they're not able to push down row-level
+            // filter, but still keep the filter for statistics pruning.
+            // So here, we try to use ref count to determine if the dynamic filter
+            // has actually be pushed down.
+            // Issue: <https://github.com/apache/datafusion/issues/18856>
+            let child_accepts_dyn_filter = Arc::strong_count(dyn_filter) > 1;
+
+            if !child_accepts_dyn_filter {
+                // Child can't consume the self dynamic filter, so disable it by setting
+                // to `None`
+                let mut new_node = self.clone();
+                new_node.dynamic_filter = None;
+
+                result = result
+                    .with_updated_node(Arc::new(new_node) as Arc<dyn ExecutionPlan>);
+            }
+        }
+
+        Ok(result)
+    }
 }
 
 fn create_schema(
@@ -1032,20 +1609,17 @@ fn create_schema(
     let mut fields = Vec::with_capacity(group_by.num_output_exprs() + aggr_expr.len());
     fields.extend(group_by.output_fields(input_schema)?);
 
-    match mode {
-        AggregateMode::Partial => {
-            // in partial mode, the fields of the accumulator's state
+    match mode.output_mode() {
+        AggregateOutputMode::Final => {
+            // in final mode, the field with the final result of the accumulator
             for expr in aggr_expr {
-                fields.extend(expr.state_fields()?.iter().cloned());
+                fields.push(expr.field())
             }
         }
-        AggregateMode::Final
-        | AggregateMode::FinalPartitioned
-        | AggregateMode::Single
-        | AggregateMode::SinglePartitioned => {
-            // in final mode, the field with the final result of the accumulator
+        AggregateOutputMode::Partial => {
+            // in partial mode, the fields of the accumulator's state
             for expr in aggr_expr {
-                fields.push(expr.field())
+                fields.extend(expr.state_fields()?.iter().cloned());
             }
         }
     }
@@ -1066,6 +1640,11 @@ fn create_schema(
 ///   physical GROUP BY expression.
 /// - `agg_mode`: A reference to an `AggregateMode` instance representing the
 ///   mode of aggregation.
+/// - `include_soft_requirement`: When `false`, only hard requirements are
+///   considered, as indicated by [`AggregateFunctionExpr::order_sensitivity`]
+///   returning [`AggregateOrderSensitivity::HardRequirement`].
+///   Otherwise, also soft requirements ([`AggregateOrderSensitivity::SoftRequirement`])
+///   are considered.
 ///
 /// # Returns
 ///
@@ -1075,16 +1654,27 @@ fn get_aggregate_expr_req(
     aggr_expr: &AggregateFunctionExpr,
     group_by: &PhysicalGroupBy,
     agg_mode: &AggregateMode,
-) -> LexOrdering {
-    // If the aggregation function is ordering requirement is not absolutely
-    // necessary, or the aggregation is performing a "second stage" calculation,
-    // then ignore the ordering requirement.
-    if !aggr_expr.order_sensitivity().hard_requires() || !agg_mode.is_first_stage() {
-        return LexOrdering::default();
+    include_soft_requirement: bool,
+) -> Option<LexOrdering> {
+    // If the aggregation is performing a "second stage" calculation,
+    // then ignore the ordering requirement. Ordering requirement applies
+    // only to the aggregation input data.
+    if agg_mode.input_mode() == AggregateInputMode::Partial {
+        return None;
     }
 
-    let mut req = aggr_expr.order_bys().cloned().unwrap_or_default();
+    match aggr_expr.order_sensitivity() {
+        AggregateOrderSensitivity::Insensitive => return None,
+        AggregateOrderSensitivity::HardRequirement => {}
+        AggregateOrderSensitivity::SoftRequirement => {
+            if !include_soft_requirement {
+                return None;
+            }
+        }
+        AggregateOrderSensitivity::Beneficial => return None,
+    }
 
+    let mut sort_exprs = aggr_expr.order_bys().to_vec();
     // In non-first stage modes, we accumulate data (using `merge_batch`) from
     // different partitions (i.e. merge partial results). During this merge, we
     // consider the ordering of each partial result. Hence, we do not need to
@@ -1095,38 +1685,11 @@ fn get_aggregate_expr_req(
         // will definitely be satisfied -- Each group by expression will have
         // distinct values per group, hence all requirements are satisfied.
         let physical_exprs = group_by.input_exprs();
-        req.retain(|sort_expr| {
+        sort_exprs.retain(|sort_expr| {
             !physical_exprs_contains(&physical_exprs, &sort_expr.expr)
         });
     }
-    req
-}
-
-/// Computes the finer ordering for between given existing ordering requirement
-/// of aggregate expression.
-///
-/// # Parameters
-///
-/// * `existing_req` - The existing lexical ordering that needs refinement.
-/// * `aggr_expr` - A reference to an aggregate expression trait object.
-/// * `group_by` - Information about the physical grouping (e.g group by expression).
-/// * `eq_properties` - Equivalence properties relevant to the computation.
-/// * `agg_mode` - The mode of aggregation (e.g., Partial, Final, etc.).
-///
-/// # Returns
-///
-/// An `Option<LexOrdering>` representing the computed finer lexical ordering,
-/// or `None` if there is no finer ordering; e.g. the existing requirement and
-/// the aggregator requirement is incompatible.
-fn finer_ordering(
-    existing_req: &LexOrdering,
-    aggr_expr: &AggregateFunctionExpr,
-    group_by: &PhysicalGroupBy,
-    eq_properties: &EquivalenceProperties,
-    agg_mode: &AggregateMode,
-) -> Option<LexOrdering> {
-    let aggr_req = get_aggregate_expr_req(aggr_expr, group_by, agg_mode);
-    eq_properties.get_finer_ordering(existing_req, aggr_req.as_ref())
+    LexOrdering::new(sort_exprs)
 }
 
 /// Concatenates the given slices.
@@ -1134,7 +1697,23 @@ pub fn concat_slices<T: Clone>(lhs: &[T], rhs: &[T]) -> Vec<T> {
     [lhs, rhs].concat()
 }
 
-/// Get the common requirement that satisfies all the aggregate expressions.
+// Determines if the candidate ordering is finer than the current ordering.
+// Returns `None` if they are incomparable, `Some(true)` if there is no current
+// ordering or candidate ordering is finer, and `Some(false)` otherwise.
+fn determine_finer(
+    current: &Option<LexOrdering>,
+    candidate: &LexOrdering,
+) -> Option<bool> {
+    if let Some(ordering) = current {
+        candidate.partial_cmp(ordering).map(|cmp| cmp.is_gt())
+    } else {
+        Some(true)
+    }
+}
+
+/// Gets the common requirement that satisfies all the aggregate expressions.
+/// When possible, chooses the requirement that is already satisfied by the
+/// equivalence properties.
 ///
 /// # Parameters
 ///
@@ -1149,75 +1728,91 @@ pub fn concat_slices<T: Clone>(lhs: &[T], rhs: &[T]) -> Vec<T> {
 ///
 /// # Returns
 ///
-/// A `LexRequirement` instance, which is the requirement that satisfies all the
-/// aggregate requirements. Returns an error in case of conflicting requirements.
+/// A `Result<Vec<PhysicalSortRequirement>>` instance, which is the requirement
+/// that satisfies all the aggregate requirements. Returns an error in case of
+/// conflicting requirements.
 pub fn get_finer_aggregate_exprs_requirement(
     aggr_exprs: &mut [Arc<AggregateFunctionExpr>],
     group_by: &PhysicalGroupBy,
     eq_properties: &EquivalenceProperties,
     agg_mode: &AggregateMode,
-) -> Result<LexRequirement> {
-    let mut requirement = LexOrdering::default();
-    for aggr_expr in aggr_exprs.iter_mut() {
-        if let Some(finer_ordering) =
-            finer_ordering(&requirement, aggr_expr, group_by, eq_properties, agg_mode)
-        {
-            if eq_properties.ordering_satisfy(finer_ordering.as_ref()) {
-                // Requirement is satisfied by existing ordering
-                requirement = finer_ordering;
-                continue;
-            }
-        }
-        if let Some(reverse_aggr_expr) = aggr_expr.reverse_expr() {
-            if let Some(finer_ordering) = finer_ordering(
-                &requirement,
-                &reverse_aggr_expr,
+) -> Result<Vec<PhysicalSortRequirement>> {
+    let mut requirement = None;
+
+    // First try and find a match for all hard and soft requirements.
+    // If a match can't be found, try a second time just matching hard
+    // requirements.
+    for include_soft_requirement in [false, true] {
+        for aggr_expr in aggr_exprs.iter_mut() {
+            let Some(aggr_req) = get_aggregate_expr_req(
+                aggr_expr,
                 group_by,
-                eq_properties,
                 agg_mode,
-            ) {
-                if eq_properties.ordering_satisfy(finer_ordering.as_ref()) {
-                    // Reverse requirement is satisfied by exiting ordering.
-                    // Hence reverse the aggregator
-                    requirement = finer_ordering;
-                    *aggr_expr = Arc::new(reverse_aggr_expr);
-                    continue;
+                include_soft_requirement,
+            )
+            .and_then(|o| eq_properties.normalize_sort_exprs(o)) else {
+                // There is no aggregate ordering requirement, or it is trivially
+                // satisfied -- we can skip this expression.
+                continue;
+            };
+            // If the common requirement is finer than the current expression's,
+            // we can skip this expression. If the latter is finer than the former,
+            // adopt it if it is satisfied by the equivalence properties. Otherwise,
+            // defer the analysis to the reverse expression.
+            let forward_finer = determine_finer(&requirement, &aggr_req);
+            if let Some(finer) = forward_finer {
+                if !finer {
+                    continue;
+                } else if eq_properties.ordering_satisfy(aggr_req.clone())? {
+                    requirement = Some(aggr_req);
+                    continue;
                 }
             }
-        }
-        if let Some(finer_ordering) =
-            finer_ordering(&requirement, aggr_expr, group_by, eq_properties, agg_mode)
-        {
-            // There is a requirement that both satisfies existing requirement and current
-            // aggregate requirement. Use updated requirement
-            requirement = finer_ordering;
-            continue;
-        }
-        if let Some(reverse_aggr_expr) = aggr_expr.reverse_expr() {
-            if let Some(finer_ordering) = finer_ordering(
-                &requirement,
-                &reverse_aggr_expr,
-                group_by,
-                eq_properties,
-                agg_mode,
-            ) {
-                // There is a requirement that both satisfies existing requirement and reverse
-                // aggregate requirement. Use updated requirement
-                requirement = finer_ordering;
-                *aggr_expr = Arc::new(reverse_aggr_expr);
-                continue;
+            if let Some(reverse_aggr_expr) = aggr_expr.reverse_expr() {
+                let Some(rev_aggr_req) = get_aggregate_expr_req(
+                    &reverse_aggr_expr,
+                    group_by,
+                    agg_mode,
+                    include_soft_requirement,
+                )
+                .and_then(|o| eq_properties.normalize_sort_exprs(o)) else {
+                    // The reverse requirement is trivially satisfied -- just reverse
+                    // the expression and continue with the next one:
+                    *aggr_expr = Arc::new(reverse_aggr_expr);
+                    continue;
+                };
+                // If the common requirement is finer than the reverse expression's,
+                // just reverse it and continue the loop with the next aggregate
+                // expression. If the latter is finer than the former, adopt it if
+                // it is satisfied by the equivalence properties. Otherwise, adopt
+                // the forward expression.
+                if let Some(finer) = determine_finer(&requirement, &rev_aggr_req) {
+                    if !finer {
+                        *aggr_expr = Arc::new(reverse_aggr_expr);
+                    } else if eq_properties.ordering_satisfy(rev_aggr_req.clone())? {
+                        *aggr_expr = Arc::new(reverse_aggr_expr);
+                        requirement = Some(rev_aggr_req);
+                    } else {
+                        requirement = Some(aggr_req);
+                    }
+                } else if forward_finer.is_some() {
+                    requirement = Some(aggr_req);
+                } else {
+                    // Neither the existing requirement nor the current aggregate
+                    // requirement satisfy the other (forward or reverse), this
+                    // means they are conflicting. This is a problem only for hard
+                    // requirements. Unsatisfied soft requirements can be ignored.
+                    if !include_soft_requirement {
+                        return not_impl_err!(
+                            "Conflicting ordering requirements in aggregate functions is not supported"
+                        );
+                    }
+                }
             }
         }
-
-        // Neither the existing requirement and current aggregate requirement satisfy the other, this means
-        // requirements are conflicting. Currently, we do not support
-        // conflicting requirements.
-        return not_impl_err!(
-            "Conflicting ordering requirements in aggregate functions is not supported"
-        );
     }
 
-    Ok(LexRequirement::from(requirement))
+    Ok(requirement.map_or_else(Vec::new, |o| o.into_iter().map(Into::into).collect()))
 }
 
 /// Returns physical expressions for arguments to evaluate against a batch.
@@ -1230,24 +1825,20 @@ pub fn aggregate_expressions(
     mode: &AggregateMode,
     col_idx_base: usize,
 ) -> Result<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
-    match mode {
-        AggregateMode::Partial
-        | AggregateMode::Single
-        | AggregateMode::SinglePartitioned => Ok(aggr_expr
+    match mode.input_mode() {
+        AggregateInputMode::Raw => Ok(aggr_expr
             .iter()
             .map(|agg| {
                 let mut result = agg.expressions();
                 // Append ordering requirements to expressions' results. This
                 // way order sensitive aggregators can satisfy requirement
                 // themselves.
-                if let Some(ordering_req) = agg.order_bys() {
-                    result.extend(ordering_req.iter().map(|item| Arc::clone(&item.expr)));
-                }
+                result.extend(agg.order_bys().iter().map(|item| Arc::clone(&item.expr)));
                 result
             })
             .collect()),
-        // In this mode, we build the merge expressions of the aggregation.
-        AggregateMode::Final | AggregateMode::FinalPartitioned => {
+        AggregateInputMode::Partial => {
+            // In merge mode, we build the merge expressions of the aggregation.
             let mut col_idx_base = col_idx_base;
             aggr_expr
                 .iter()
@@ -1295,8 +1886,15 @@ pub fn finalize_aggregation(
     accumulators: &mut [AccumulatorItem],
     mode: &AggregateMode,
 ) -> Result<Vec<ArrayRef>> {
-    match mode {
-        AggregateMode::Partial => {
+    match mode.output_mode() {
+        AggregateOutputMode::Final => {
+            // Merge the state to the final value
+            accumulators
+                .iter_mut()
+                .map(|accumulator| accumulator.evaluate().and_then(|v| v.to_array()))
+                .collect()
+        }
+        AggregateOutputMode::Partial => {
             // Build the vector of states
             accumulators
                 .iter_mut()
@@ -1310,38 +1908,17 @@ pub fn finalize_aggregation(
                 .flatten_ok()
                 .collect()
         }
-        AggregateMode::Final
-        | AggregateMode::FinalPartitioned
-        | AggregateMode::Single
-        | AggregateMode::SinglePartitioned => {
-            // Merge the state to the final value
-            accumulators
-                .iter_mut()
-                .map(|accumulator| accumulator.evaluate().and_then(|v| v.to_array()))
-                .collect()
-        }
     }
 }
 
-/// Evaluates expressions against a record batch.
-fn evaluate(
-    expr: &[Arc<dyn PhysicalExpr>],
-    batch: &RecordBatch,
-) -> Result<Vec<ArrayRef>> {
-    expr.iter()
-        .map(|expr| {
-            expr.evaluate(batch)
-                .and_then(|v| v.into_array(batch.num_rows()))
-        })
-        .collect()
-}
-
-/// Evaluates expressions against a record batch.
-pub(crate) fn evaluate_many(
+/// Evaluates groups of expressions against a record batch.
+pub fn evaluate_many(
     expr: &[Vec<Arc<dyn PhysicalExpr>>],
     batch: &RecordBatch,
 ) -> Result<Vec<Vec<ArrayRef>>> {
-    expr.iter().map(|expr| evaluate(expr, batch)).collect()
+    expr.iter()
+        .map(|expr| evaluate_expressions_to_arrays(expr, batch))
+        .collect()
 }
 
 fn evaluate_optional(
@@ -1391,27 +1968,18 @@ fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result<ArrayRef> {
 /// The outer Vec appears to be for grouping sets
 /// The inner Vec contains the results per expression
 /// The inner-inner Array contains the results per row
-pub(crate) fn evaluate_group_by(
+pub fn evaluate_group_by(
     group_by: &PhysicalGroupBy,
     batch: &RecordBatch,
 ) -> Result<Vec<Vec<ArrayRef>>> {
-    let exprs: Vec<ArrayRef> = group_by
-        .expr
-        .iter()
-        .map(|(expr, _)| {
-            let value = expr.evaluate(batch)?;
-            value.into_array(batch.num_rows())
-        })
-        .collect::<Result<Vec<_>>>()?;
-
-    let null_exprs: Vec<ArrayRef> = group_by
-        .null_expr
-        .iter()
-        .map(|(expr, _)| {
-            let value = expr.evaluate(batch)?;
-            value.into_array(batch.num_rows())
-        })
-        .collect::<Result<Vec<_>>>()?;
+    let exprs = evaluate_expressions_to_arrays(
+        group_by.expr.iter().map(|(expr, _)| expr),
+        batch,
+    )?;
+    let null_exprs = evaluate_expressions_to_arrays(
+        group_by.null_expr.iter().map(|(expr, _)| expr),
+        batch,
+    )?;
 
     group_by
         .groups
@@ -1438,26 +2006,25 @@ mod tests {
     use std::task::{Context, Poll};
 
     use super::*;
-    use crate::coalesce_batches::CoalesceBatchesExec;
+    use crate::RecordBatchStream;
     use crate::coalesce_partitions::CoalescePartitionsExec;
     use crate::common;
     use crate::common::collect;
     use crate::execution_plan::Boundedness;
     use crate::expressions::col;
     use crate::metrics::MetricValue;
-    use crate::test::assert_is_pending;
-    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
     use crate::test::TestMemoryExec;
-    use crate::RecordBatchStream;
+    use crate::test::assert_is_pending;
+    use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero};
 
     use arrow::array::{
-        DictionaryArray, Float32Array, Float64Array, Int32Array, StructArray,
+        DictionaryArray, Float32Array, Float64Array, Int32Array, Int64Array, StructArray,
         UInt32Array, UInt64Array,
     };
-    use arrow::compute::{concat_batches, SortOptions};
+    use arrow::compute::{SortOptions, concat_batches};
     use arrow::datatypes::{DataType, Int32Type};
     use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
-    use datafusion_common::{internal_err, DataFusionError, ScalarValue};
+    use datafusion_common::{DataFusionError, ScalarValue, internal_err};
     use datafusion_execution::config::SessionConfig;
     use datafusion_execution::memory_pool::FairSpillPool;
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
@@ -1467,12 +2034,14 @@ mod tests {
     use datafusion_functions_aggregate::first_last::{first_value_udaf, last_value_udaf};
     use datafusion_functions_aggregate::median::median_udaf;
     use datafusion_functions_aggregate::sum::sum_udaf;
-    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
-    use datafusion_physical_expr::expressions::lit;
-    use datafusion_physical_expr::expressions::Literal;
     use datafusion_physical_expr::Partitioning;
     use datafusion_physical_expr::PhysicalSortExpr;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_expr::expressions::Literal;
+    use datafusion_physical_expr::expressions::lit;
 
+    use crate::projection::ProjectionExec;
+    use datafusion_physical_expr::projection::ProjectionExpr;
     use futures::{FutureExt, Stream};
     use insta::{allow_duplicates, assert_snapshot};
 
@@ -1603,6 +2172,7 @@ mod tests {
                 vec![true, false],  // (NULL, b)
                 vec![false, false], // (a,b)
             ],
+            true,
         );
 
         let aggregates = vec![Arc::new(
@@ -1637,30 +2207,30 @@ mod tests {
             allow_duplicates! {
             assert_snapshot!(batches_to_sort_string(&result),
             @r"
-+---+-----+---------------+-----------------+
-| a | b   | __grouping_id | COUNT(1)[count] |
-+---+-----+---------------+-----------------+
-|   | 1.0 | 2             | 1               |
-|   | 1.0 | 2             | 1               |
-|   | 2.0 | 2             | 1               |
-|   | 2.0 | 2             | 1               |
-|   | 3.0 | 2             | 1               |
-|   | 3.0 | 2             | 1               |
-|   | 4.0 | 2             | 1               |
-|   | 4.0 | 2             | 1               |
-| 2 |     | 1             | 1               |
-| 2 |     | 1             | 1               |
-| 2 | 1.0 | 0             | 1               |
-| 2 | 1.0 | 0             | 1               |
-| 3 |     | 1             | 1               |
-| 3 |     | 1             | 2               |
-| 3 | 2.0 | 0             | 2               |
-| 3 | 3.0 | 0             | 1               |
-| 4 |     | 1             | 1               |
-| 4 |     | 1             | 2               |
-| 4 | 3.0 | 0             | 1               |
-| 4 | 4.0 | 0             | 2               |
-+---+-----+---------------+-----------------+
+            +---+-----+---------------+-----------------+
+            | a | b   | __grouping_id | COUNT(1)[count] |
+            +---+-----+---------------+-----------------+
+            |   | 1.0 | 2             | 1               |
+            |   | 1.0 | 2             | 1               |
+            |   | 2.0 | 2             | 1               |
+            |   | 2.0 | 2             | 1               |
+            |   | 3.0 | 2             | 1               |
+            |   | 3.0 | 2             | 1               |
+            |   | 4.0 | 2             | 1               |
+            |   | 4.0 | 2             | 1               |
+            | 2 |     | 1             | 1               |
+            | 2 |     | 1             | 1               |
+            | 2 | 1.0 | 0             | 1               |
+            | 2 | 1.0 | 0             | 1               |
+            | 3 |     | 1             | 1               |
+            | 3 |     | 1             | 2               |
+            | 3 | 2.0 | 0             | 2               |
+            | 3 | 3.0 | 0             | 1               |
+            | 4 |     | 1             | 1               |
+            | 4 |     | 1             | 2               |
+            | 4 | 3.0 | 0             | 1               |
+            | 4 | 4.0 | 0             | 2               |
+            +---+-----+---------------+-----------------+
             "
             );
             }
@@ -1668,22 +2238,22 @@ mod tests {
             allow_duplicates! {
             assert_snapshot!(batches_to_sort_string(&result),
             @r"
-+---+-----+---------------+-----------------+
-| a | b   | __grouping_id | COUNT(1)[count] |
-+---+-----+---------------+-----------------+
-|   | 1.0 | 2             | 2               |
-|   | 2.0 | 2             | 2               |
-|   | 3.0 | 2             | 2               |
-|   | 4.0 | 2             | 2               |
-| 2 |     | 1             | 2               |
-| 2 | 1.0 | 0             | 2               |
-| 3 |     | 1             | 3               |
-| 3 | 2.0 | 0             | 2               |
-| 3 | 3.0 | 0             | 1               |
-| 4 |     | 1             | 3               |
-| 4 | 3.0 | 0             | 1               |
-| 4 | 4.0 | 0             | 2               |
-+---+-----+---------------+-----------------+
+            +---+-----+---------------+-----------------+
+            | a | b   | __grouping_id | COUNT(1)[count] |
+            +---+-----+---------------+-----------------+
+            |   | 1.0 | 2             | 2               |
+            |   | 2.0 | 2             | 2               |
+            |   | 3.0 | 2             | 2               |
+            |   | 4.0 | 2             | 2               |
+            | 2 |     | 1             | 2               |
+            | 2 | 1.0 | 0             | 2               |
+            | 3 |     | 1             | 3               |
+            | 3 | 2.0 | 0             | 2               |
+            | 3 | 3.0 | 0             | 1               |
+            | 4 |     | 1             | 3               |
+            | 4 | 3.0 | 0             | 1               |
+            | 4 | 4.0 | 0             | 2               |
+            +---+-----+---------------+-----------------+
             "
             );
             }
@@ -1717,23 +2287,23 @@ mod tests {
         assert_snapshot!(
             batches_to_sort_string(&result),
             @r"
-            +---+-----+---------------+----------+
-            | a | b   | __grouping_id | COUNT(1) |
-            +---+-----+---------------+----------+
-            |   | 1.0 | 2             | 2        |
-            |   | 2.0 | 2             | 2        |
-            |   | 3.0 | 2             | 2        |
-            |   | 4.0 | 2             | 2        |
-            | 2 |     | 1             | 2        |
-            | 2 | 1.0 | 0             | 2        |
-            | 3 |     | 1             | 3        |
-            | 3 | 2.0 | 0             | 2        |
-            | 3 | 3.0 | 0             | 1        |
-            | 4 |     | 1             | 3        |
-            | 4 | 3.0 | 0             | 1        |
-            | 4 | 4.0 | 0             | 2        |
-            +---+-----+---------------+----------+
-            "
+        +---+-----+---------------+----------+
+        | a | b   | __grouping_id | COUNT(1) |
+        +---+-----+---------------+----------+
+        |   | 1.0 | 2             | 2        |
+        |   | 2.0 | 2             | 2        |
+        |   | 3.0 | 2             | 2        |
+        |   | 4.0 | 2             | 2        |
+        | 2 |     | 1             | 2        |
+        | 2 | 1.0 | 0             | 2        |
+        | 3 |     | 1             | 3        |
+        | 3 | 2.0 | 0             | 2        |
+        | 3 | 3.0 | 0             | 1        |
+        | 4 |     | 1             | 3        |
+        | 4 | 3.0 | 0             | 1        |
+        | 4 | 4.0 | 0             | 2        |
+        +---+-----+---------------+----------+
+        "
         );
         }
 
@@ -1752,6 +2322,7 @@ mod tests {
             vec![(col("a", &input_schema)?, "a".to_string())],
             vec![],
             vec![vec![false]],
+            false,
         );
 
         let aggregates: Vec<Arc<AggregateFunctionExpr>> = vec![Arc::new(
@@ -1783,27 +2354,27 @@ mod tests {
         if spill {
             allow_duplicates! {
             assert_snapshot!(batches_to_sort_string(&result), @r"
-                +---+---------------+-------------+
-                | a | AVG(b)[count] | AVG(b)[sum] |
-                +---+---------------+-------------+
-                | 2 | 1             | 1.0         |
-                | 2 | 1             | 1.0         |
-                | 3 | 1             | 2.0         |
-                | 3 | 2             | 5.0         |
-                | 4 | 3             | 11.0        |
-                +---+---------------+-------------+
+            +---+---------------+-------------+
+            | a | AVG(b)[count] | AVG(b)[sum] |
+            +---+---------------+-------------+
+            | 2 | 1             | 1.0         |
+            | 2 | 1             | 1.0         |
+            | 3 | 1             | 2.0         |
+            | 3 | 2             | 5.0         |
+            | 4 | 3             | 11.0        |
+            +---+---------------+-------------+
             ");
             }
         } else {
             allow_duplicates! {
             assert_snapshot!(batches_to_sort_string(&result), @r"
-                +---+---------------+-------------+
-                | a | AVG(b)[count] | AVG(b)[sum] |
-                +---+---------------+-------------+
-                | 2 | 2             | 2.0         |
-                | 3 | 3             | 7.0         |
-                | 4 | 3             | 11.0        |
-                +---+---------------+-------------+
+            +---+---------------+-------------+
+            | a | AVG(b)[count] | AVG(b)[sum] |
+            +---+---------------+-------------+
+            | 2 | 2             | 2.0         |
+            | 3 | 3             | 7.0         |
+            | 4 | 3             | 11.0        |
+            +---+---------------+-------------+
             ");
             }
         };
@@ -1821,6 +2392,10 @@ mod tests {
             input_schema,
         )?);
 
+        // Verify statistics are preserved proportionally through aggregation
+        let final_stats = merged_aggregate.partition_statistics(None)?;
+        assert!(final_stats.total_byte_size.get_value().is_some());
+
         let task_ctx = if spill {
             // enlarge memory limit to let the final aggregation finish
             new_spill_ctx(2, 2600)
@@ -1834,14 +2409,14 @@ mod tests {
 
         allow_duplicates! {
         assert_snapshot!(batches_to_sort_string(&result), @r"
-            +---+--------------------+
-            | a | AVG(b)             |
-            +---+--------------------+
-            | 2 | 1.0                |
-            | 3 | 2.3333333333333335 |
-            | 4 | 3.6666666666666665 |
-            +---+--------------------+
-            ");
+        +---+--------------------+
+        | a | AVG(b)             |
+        +---+--------------------+
+        | 2 | 1.0                |
+        | 3 | 2.3333333333333335 |
+        | 4 | 3.6666666666666665 |
+        +---+--------------------+
+        ");
             // For row 2: 3, (2 + 3 + 2) / 3
             // For row 3: 4, (3 + 4 + 4) / 3
         }
@@ -1877,14 +2452,17 @@ mod tests {
     struct TestYieldingExec {
         /// True if this exec should yield back to runtime the first time it is polled
         pub yield_first: bool,
-        cache: PlanProperties,
+        cache: Arc<PlanProperties>,
     }
 
     impl TestYieldingExec {
         fn new(yield_first: bool) -> Self {
             let schema = some_data().0;
             let cache = Self::compute_properties(schema);
-            Self { yield_first, cache }
+            Self {
+                yield_first,
+                cache: Arc::new(cache),
+            }
         }
 
         /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -1925,7 +2503,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             &self.cache
         }
 
@@ -1933,6 +2511,13 @@ mod tests {
             vec![]
         }
 
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
+
         fn with_new_children(
             self: Arc<Self>,
             _: Vec<Arc<dyn ExecutionPlan>>,
@@ -1954,20 +2539,19 @@ mod tests {
             Ok(Box::pin(stream))
         }
 
-        fn statistics(&self) -> Result<Statistics> {
-            self.partition_statistics(None)
-        }
-
-        fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+        fn partition_statistics(
+            &self,
+            partition: Option<usize>,
+        ) -> Result<Arc<Statistics>> {
             if partition.is_some() {
-                return Ok(Statistics::new_unknown(self.schema().as_ref()));
+                return Ok(Arc::new(Statistics::new_unknown(self.schema().as_ref())));
             }
             let (_, batches) = some_data();
-            Ok(common::compute_record_batch_statistics(
+            Ok(Arc::new(common::compute_record_batch_statistics(
                 &[batches],
                 &self.schema(),
                 None,
-            ))
+            )))
         }
     }
 
@@ -2093,6 +2677,7 @@ mod tests {
             vec![(col("a", &input_schema)?, "a".to_string())],
             vec![],
             vec![vec![false]],
+            false,
         );
 
         // something that allocates within the aggregator
@@ -2113,7 +2698,7 @@ mod tests {
         ] {
             let n_aggr = aggregates.len();
             let partial_aggregate = Arc::new(AggregateExec::try_new(
-                AggregateMode::Partial,
+                AggregateMode::Single,
                 groups,
                 aggregates,
                 vec![None; n_aggr],
@@ -2121,7 +2706,7 @@ mod tests {
                 Arc::clone(&input_schema),
             )?);
 
-            let stream = partial_aggregate.execute_typed(0, Arc::clone(&task_ctx))?;
+            let stream = partial_aggregate.execute_typed(0, &task_ctx)?;
 
             // ensure that we really got the version we wanted
             match version {
@@ -2228,17 +2813,9 @@ mod tests {
 
     #[tokio::test]
     async fn run_first_last_multi_partitions() -> Result<()> {
-        for use_coalesce_batches in [false, true] {
-            for is_first_acc in [false, true] {
-                for spill in [false, true] {
-                    first_last_multi_partitions(
-                        use_coalesce_batches,
-                        is_first_acc,
-                        spill,
-                        4200,
-                    )
-                    .await?
-                }
+        for is_first_acc in [false, true] {
+            for spill in [false, true] {
+                first_last_multi_partitions(is_first_acc, spill, 4200).await?
             }
         }
         Ok(())
@@ -2249,14 +2826,14 @@ mod tests {
         schema: &Schema,
         sort_options: SortOptions,
     ) -> Result<Arc<AggregateFunctionExpr>> {
-        let ordering_req = [PhysicalSortExpr {
+        let order_bys = vec![PhysicalSortExpr {
             expr: col("b", schema)?,
             options: sort_options,
         }];
         let args = [col("b", schema)?];
 
         AggregateExprBuilder::new(first_value_udaf(), args.to_vec())
-            .order_by(LexOrdering::new(ordering_req.to_vec()))
+            .order_by(order_bys)
             .schema(Arc::new(schema.clone()))
             .alias(String::from("first_value(b) ORDER BY [b ASC NULLS LAST]"))
             .build()
@@ -2268,28 +2845,20 @@ mod tests {
         schema: &Schema,
         sort_options: SortOptions,
     ) -> Result<Arc<AggregateFunctionExpr>> {
-        let ordering_req = [PhysicalSortExpr {
+        let order_bys = vec![PhysicalSortExpr {
             expr: col("b", schema)?,
             options: sort_options,
         }];
         let args = [col("b", schema)?];
         AggregateExprBuilder::new(last_value_udaf(), args.to_vec())
-            .order_by(LexOrdering::new(ordering_req.to_vec()))
+            .order_by(order_bys)
             .schema(Arc::new(schema.clone()))
             .alias(String::from("last_value(b) ORDER BY [b ASC NULLS LAST]"))
             .build()
             .map(Arc::new)
     }
 
-    // This function either constructs the physical plan below,
-    //
-    // "AggregateExec: mode=Final, gby=[a@0 as a], aggr=[FIRST_VALUE(b)]",
-    // "  CoalesceBatchesExec: target_batch_size=1024",
-    // "    CoalescePartitionsExec",
-    // "      AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[FIRST_VALUE(b)], ordering_mode=None",
-    // "        DataSourceExec: partitions=4, partition_sizes=[1, 1, 1, 1]",
-    //
-    // or
+    // This function constructs the physical plan below,
     //
     // "AggregateExec: mode=Final, gby=[a@0 as a], aggr=[FIRST_VALUE(b)]",
     // "  CoalescePartitionsExec",
@@ -2299,7 +2868,6 @@ mod tests {
     // and checks whether the function `merge_batch` works correctly for
     // FIRST_VALUE and LAST_VALUE functions.
     async fn first_last_multi_partitions(
-        use_coalesce_batches: bool,
         is_first_acc: bool,
         spill: bool,
         max_memory: usize,
@@ -2347,13 +2915,8 @@ mod tests {
             memory_exec,
             Arc::clone(&schema),
         )?);
-        let coalesce = if use_coalesce_batches {
-            let coalesce = Arc::new(CoalescePartitionsExec::new(aggregate_exec));
-            Arc::new(CoalesceBatchesExec::new(coalesce, 1024)) as Arc<dyn ExecutionPlan>
-        } else {
-            Arc::new(CoalescePartitionsExec::new(aggregate_exec))
-                as Arc<dyn ExecutionPlan>
-        };
+        let coalesce = Arc::new(CoalescePartitionsExec::new(aggregate_exec))
+            as Arc<dyn ExecutionPlan>;
         let aggregate_final = Arc::new(AggregateExec::try_new(
             AggregateMode::Final,
             groups,
@@ -2367,26 +2930,26 @@ mod tests {
         if is_first_acc {
             allow_duplicates! {
             assert_snapshot!(batches_to_string(&result), @r"
-                +---+--------------------------------------------+
-                | a | first_value(b) ORDER BY [b ASC NULLS LAST] |
-                +---+--------------------------------------------+
-                | 2 | 0.0                                        |
-                | 3 | 1.0                                        |
-                | 4 | 3.0                                        |
-                +---+--------------------------------------------+
-                ");
+            +---+--------------------------------------------+
+            | a | first_value(b) ORDER BY [b ASC NULLS LAST] |
+            +---+--------------------------------------------+
+            | 2 | 0.0                                        |
+            | 3 | 1.0                                        |
+            | 4 | 3.0                                        |
+            +---+--------------------------------------------+
+            ");
             }
         } else {
             allow_duplicates! {
             assert_snapshot!(batches_to_string(&result), @r"
-                +---+-------------------------------------------+
-                | a | last_value(b) ORDER BY [b ASC NULLS LAST] |
-                +---+-------------------------------------------+
-                | 2 | 3.0                                       |
-                | 3 | 5.0                                       |
-                | 4 | 6.0                                       |
-                +---+-------------------------------------------+
-                ");
+            +---+-------------------------------------------+
+            | a | last_value(b) ORDER BY [b ASC NULLS LAST] |
+            +---+-------------------------------------------+
+            | 2 | 3.0                                       |
+            | 3 | 5.0                                       |
+            | 4 | 6.0                                       |
+            +---+-------------------------------------------+
+            ");
             }
         };
         Ok(())
@@ -2396,9 +2959,7 @@ mod tests {
     async fn test_get_finest_requirements() -> Result<()> {
         let test_schema = create_test_schema()?;
 
-        // Assume column a and b are aliases
-        // Assume also that a ASC and c DESC describe the same global ordering for the table. (Since they are ordering equivalent).
-        let options1 = SortOptions {
+        let options = SortOptions {
             descending: false,
             nulls_first: false,
         };
@@ -2407,58 +2968,51 @@ mod tests {
         let col_c = &col("c", &test_schema)?;
         let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
         // Columns a and b are equal.
-        eq_properties.add_equal_conditions(col_a, col_b)?;
+        eq_properties.add_equal_conditions(Arc::clone(col_a), Arc::clone(col_b))?;
         // Aggregate requirements are
         // [None], [a ASC], [a ASC, b ASC, c ASC], [a ASC, b ASC] respectively
         let order_by_exprs = vec![
-            None,
-            Some(vec![PhysicalSortExpr {
+            vec![],
+            vec![PhysicalSortExpr {
                 expr: Arc::clone(col_a),
-                options: options1,
-            }]),
-            Some(vec![
+                options,
+            }],
+            vec![
                 PhysicalSortExpr {
                     expr: Arc::clone(col_a),
-                    options: options1,
+                    options,
                 },
                 PhysicalSortExpr {
                     expr: Arc::clone(col_b),
-                    options: options1,
+                    options,
                 },
                 PhysicalSortExpr {
                     expr: Arc::clone(col_c),
-                    options: options1,
+                    options,
                 },
-            ]),
-            Some(vec![
+            ],
+            vec![
                 PhysicalSortExpr {
                     expr: Arc::clone(col_a),
-                    options: options1,
+                    options,
                 },
                 PhysicalSortExpr {
                     expr: Arc::clone(col_b),
-                    options: options1,
+                    options,
                 },
-            ]),
+            ],
         ];
 
-        let common_requirement = LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::clone(col_a),
-                options: options1,
-            },
-            PhysicalSortExpr {
-                expr: Arc::clone(col_c),
-                options: options1,
-            },
-        ]);
+        let common_requirement = vec![
+            PhysicalSortRequirement::new(Arc::clone(col_a), Some(options)),
+            PhysicalSortRequirement::new(Arc::clone(col_c), Some(options)),
+        ];
         let mut aggr_exprs = order_by_exprs
             .into_iter()
             .map(|order_by_expr| {
-                let ordering_req = order_by_expr.unwrap_or_default();
                 AggregateExprBuilder::new(array_agg_udaf(), vec![Arc::clone(col_a)])
                     .alias("a")
-                    .order_by(LexOrdering::new(ordering_req.to_vec()))
+                    .order_by(order_by_expr)
                     .schema(Arc::clone(&test_schema))
                     .build()
                     .map(Arc::new)
@@ -2466,14 +3020,13 @@ mod tests {
             })
             .collect::<Vec<_>>();
         let group_by = PhysicalGroupBy::new_single(vec![]);
-        let res = get_finer_aggregate_exprs_requirement(
+        let result = get_finer_aggregate_exprs_requirement(
             &mut aggr_exprs,
             &group_by,
             &eq_properties,
             &AggregateMode::Partial,
         )?;
-        let res = LexOrdering::from(res);
-        assert_eq!(res, common_requirement);
+        assert_eq!(result, common_requirement);
         Ok(())
     }
 
@@ -2547,14 +3100,16 @@ mod tests {
                 vec![true, false, true],
                 vec![true, true, false],
             ],
+            true,
         );
 
-        let aggregates: Vec<Arc<AggregateFunctionExpr>> =
-            vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1)])
+        let aggregates: Vec<Arc<AggregateFunctionExpr>> = vec![
+            AggregateExprBuilder::new(count_udaf(), vec![lit(1)])
                 .schema(Arc::clone(&schema))
                 .alias("1")
                 .build()
-                .map(Arc::new)?];
+                .map(Arc::new)?,
+        ];
 
         let input_batches = (0..4)
             .map(|_| {
@@ -2583,13 +3138,13 @@ mod tests {
 
         allow_duplicates! {
         assert_snapshot!(batches_to_sort_string(&output), @r"
-            +-----+-----+-------+---------------+-------+
-            | a   | b   | const | __grouping_id | 1     |
-            +-----+-----+-------+---------------+-------+
-            |     |     | 1     | 6             | 32768 |
-            |     | 0.0 |       | 5             | 32768 |
-            | 0.0 |     |       | 3             | 32768 |
-            +-----+-----+-------+---------------+-------+
+        +-----+-----+-------+---------------+-------+
+        | a   | b   | const | __grouping_id | 1     |
+        +-----+-----+-------+---------------+-------+
+        |     |     | 1     | 6             | 32768 |
+        |     | 0.0 |       | 5             | 32768 |
+        | 0.0 |     |       | 3             | 32768 |
+        +-----+-----+-------+---------------+-------+
         ");
         }
 
@@ -2670,14 +3225,13 @@ mod tests {
             "labels".to_string(),
         )]);
 
-        let aggr_expr = vec![AggregateExprBuilder::new(
-            sum_udaf(),
-            vec![col("value", &batch.schema())?],
-        )
-        .schema(Arc::clone(&batch.schema()))
-        .alias(String::from("SUM(value)"))
-        .build()
-        .map(Arc::new)?];
+        let aggr_expr = vec![
+            AggregateExprBuilder::new(sum_udaf(), vec![col("value", &batch.schema())?])
+                .schema(Arc::clone(&batch.schema()))
+                .alias(String::from("SUM(value)"))
+                .build()
+                .map(Arc::new)?,
+        ];
 
         let input = TestMemoryExec::try_new_exec(
             &[vec![batch.clone()]],
@@ -2699,13 +3253,13 @@ mod tests {
 
         allow_duplicates! {
         assert_snapshot!(batches_to_string(&output), @r"
-            +--------------+------------+
-            | labels       | SUM(value) |
-            +--------------+------------+
-            | {a: a, b: b} | 2          |
-            | {a: , b: c}  | 1          |
-            +--------------+------------+
-            ");
+        +--------------+------------+
+        | labels       | SUM(value) |
+        +--------------+------------+
+        | {a: a, b: b} | 2          |
+        | {a: , b: c}  | 1          |
+        +--------------+------------+
+        ");
         }
 
         Ok(())
@@ -2721,14 +3275,13 @@ mod tests {
         let group_by =
             PhysicalGroupBy::new_single(vec![(col("key", &schema)?, "key".to_string())]);
 
-        let aggr_expr =
-            vec![
-                AggregateExprBuilder::new(count_udaf(), vec![col("val", &schema)?])
-                    .schema(Arc::clone(&schema))
-                    .alias(String::from("COUNT(val)"))
-                    .build()
-                    .map(Arc::new)?,
-            ];
+        let aggr_expr = vec![
+            AggregateExprBuilder::new(count_udaf(), vec![col("val", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias(String::from("COUNT(val)"))
+                .build()
+                .map(Arc::new)?,
+        ];
 
         let input_data = vec![
             RecordBatch::try_new(
@@ -2801,14 +3354,13 @@ mod tests {
         let group_by =
             PhysicalGroupBy::new_single(vec![(col("key", &schema)?, "key".to_string())]);
 
-        let aggr_expr =
-            vec![
-                AggregateExprBuilder::new(count_udaf(), vec![col("val", &schema)?])
-                    .schema(Arc::clone(&schema))
-                    .alias(String::from("COUNT(val)"))
-                    .build()
-                    .map(Arc::new)?,
-            ];
+        let aggr_expr = vec![
+            AggregateExprBuilder::new(count_udaf(), vec![col("val", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias(String::from("COUNT(val)"))
+                .build()
+                .map(Arc::new)?,
+        ];
 
         let input_data = vec![
             RecordBatch::try_new(
@@ -2887,14 +3439,13 @@ mod tests {
             Field::new("b", DataType::Float32, false),
         ]));
 
-        let aggr_expr =
-            vec![
-                AggregateExprBuilder::new(count_udaf(), vec![col("a", &input_schema)?])
-                    .schema(Arc::clone(&input_schema))
-                    .alias("COUNT(a)")
-                    .build()
-                    .map(Arc::new)?,
-            ];
+        let aggr_expr = vec![
+            AggregateExprBuilder::new(count_udaf(), vec![col("a", &input_schema)?])
+                .schema(Arc::clone(&input_schema))
+                .alias("COUNT(a)")
+                .build()
+                .map(Arc::new)?,
+        ];
 
         let grouping_set = PhysicalGroupBy::new(
             vec![
@@ -2909,6 +3460,7 @@ mod tests {
                 vec![false, true],  // (a, NULL)
                 vec![false, false], // (a,b)
             ],
+            true,
         );
         let aggr_schema = create_schema(
             &input_schema,
@@ -2960,6 +3512,7 @@ mod tests {
             vec![(col("a", &schema)?, "a".to_string())],
             vec![],
             vec![vec![false]],
+            false,
         );
 
         // Test with MIN for simple intermediate state (min) and AVG for multiple intermediate states (partial sum, partial count).
@@ -3008,13 +3561,13 @@ mod tests {
 
         allow_duplicates! {
             assert_snapshot!(batches_to_string(&result), @r"
-                +---+--------+--------+
-                | a | MIN(b) | AVG(b) |
-                +---+--------+--------+
-                | 2 | 1.0    | 1.0    |
-                | 3 | 2.0    | 2.0    |
-                | 4 | 3.0    | 3.5    |
-                +---+--------+--------+
+            +---+--------+--------+
+            | a | MIN(b) | AVG(b) |
+            +---+--------+--------+
+            | 2 | 1.0    | 1.0    |
+            | 3 | 2.0    | 2.0    |
+            | 4 | 3.0    | 3.5    |
+            +---+--------+--------+
             ");
         }
 
@@ -3041,7 +3594,9 @@ mod tests {
                     "Expected spill but SpillCount metric not found or SpillCount was 0."
                 );
             } else if !expect_spill && spill_count > 0 {
-                panic!("Expected no spill but found SpillCount metric with value greater than 0.");
+                panic!(
+                    "Expected no spill but found SpillCount metric with value greater than 0."
+                );
             }
         } else {
             panic!("No metrics returned from the operator; cannot verify spilling.");
@@ -3056,4 +3611,538 @@ mod tests {
         run_test_with_spill_pool_if_necessary(20_000, false).await?;
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_grouped_aggregation_respects_memory_limit() -> Result<()> {
+        // test with spill
+        fn create_record_batch(
+            schema: &Arc<Schema>,
+            data: (Vec<u32>, Vec<f64>),
+        ) -> Result<RecordBatch> {
+            Ok(RecordBatch::try_new(
+                Arc::clone(schema),
+                vec![
+                    Arc::new(UInt32Array::from(data.0)),
+                    Arc::new(Float64Array::from(data.1)),
+                ],
+            )?)
+        }
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::UInt32, false),
+            Field::new("b", DataType::Float64, false),
+        ]));
+
+        let batches = vec![
+            create_record_batch(&schema, (vec![2, 3, 4, 4], vec![1.0, 2.0, 3.0, 4.0]))?,
+            create_record_batch(&schema, (vec![2, 3, 4, 4], vec![1.0, 2.0, 3.0, 4.0]))?,
+        ];
+        let plan: Arc<dyn ExecutionPlan> =
+            TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?;
+        let proj = ProjectionExec::try_new(
+            vec![
+                ProjectionExpr::new(lit("0"), "l".to_string()),
+                ProjectionExpr::new_from_expression(col("a", &schema)?, &schema)?,
+                ProjectionExpr::new_from_expression(col("b", &schema)?, &schema)?,
+            ],
+            plan,
+        )?;
+        let plan: Arc<dyn ExecutionPlan> = Arc::new(proj);
+        let schema = plan.schema();
+
+        let grouping_set = PhysicalGroupBy::new(
+            vec![
+                (col("l", &schema)?, "l".to_string()),
+                (col("a", &schema)?, "a".to_string()),
+            ],
+            vec![],
+            vec![vec![false, false]],
+            false,
+        );
+
+        // Test with MIN for simple intermediate state (min) and AVG for multiple intermediate states (partial sum, partial count).
+        let aggregates: Vec<Arc<AggregateFunctionExpr>> = vec![
+            Arc::new(
+                AggregateExprBuilder::new(
+                    datafusion_functions_aggregate::min_max::min_udaf(),
+                    vec![col("b", &schema)?],
+                )
+                .schema(Arc::clone(&schema))
+                .alias("MIN(b)")
+                .build()?,
+            ),
+            Arc::new(
+                AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(b)")
+                    .build()?,
+            ),
+        ];
+
+        let single_aggregate = Arc::new(AggregateExec::try_new(
+            AggregateMode::Single,
+            grouping_set,
+            aggregates,
+            vec![None, None],
+            plan,
+            Arc::clone(&schema),
+        )?);
+
+        let batch_size = 2;
+        let memory_pool = Arc::new(FairSpillPool::new(2000));
+        let task_ctx = Arc::new(
+            TaskContext::default()
+                .with_session_config(SessionConfig::new().with_batch_size(batch_size))
+                .with_runtime(Arc::new(
+                    RuntimeEnvBuilder::new()
+                        .with_memory_pool(memory_pool)
+                        .build()?,
+                )),
+        );
+
+        let result = collect(single_aggregate.execute(0, Arc::clone(&task_ctx))?).await;
+        match result {
+            Ok(result) => {
+                assert_spill_count_metric(true, single_aggregate);
+
+                allow_duplicates! {
+                    assert_snapshot!(batches_to_string(&result), @r"
+                +---+---+--------+--------+
+                | l | a | MIN(b) | AVG(b) |
+                +---+---+--------+--------+
+                | 0 | 2 | 1.0    | 1.0    |
+                | 0 | 3 | 2.0    | 2.0    |
+                | 0 | 4 | 3.0    | 3.5    |
+                +---+---+--------+--------+
+            ");
+                }
+            }
+            Err(e) => assert!(matches!(e, DataFusionError::ResourcesExhausted(_))),
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_aggregate_statistics_edge_cases() -> Result<()> {
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Float64, false),
+        ]));
+
+        // Test 1: Absent statistics remain absent
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Exact(100),
+                total_byte_size: Precision::Absent,
+                column_statistics: vec![
+                    ColumnStatistics::new_unknown(),
+                    ColumnStatistics::new_unknown(),
+                ],
+            },
+            (*schema).clone(),
+        )) as Arc<dyn ExecutionPlan>;
+
+        let agg = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?);
+
+        let stats = agg.partition_statistics(None)?;
+        assert_eq!(stats.total_byte_size, Precision::Absent);
+
+        // Test 2: Zero rows returns Absent (can't estimate output size from zero input)
+        let input_zero = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Exact(0),
+                total_byte_size: Precision::Exact(0),
+                column_statistics: vec![
+                    ColumnStatistics::new_unknown(),
+                    ColumnStatistics::new_unknown(),
+                ],
+            },
+            (*schema).clone(),
+        )) as Arc<dyn ExecutionPlan>;
+
+        let agg_zero = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::default(),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input_zero,
+            Arc::clone(&schema),
+        )?);
+
+        let stats_zero = agg_zero.partition_statistics(None)?;
+        assert_eq!(stats_zero.total_byte_size, Precision::Absent);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_order_is_retained_when_spilling() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int64, false),
+            Field::new("c", DataType::Int64, false),
+        ]));
+
+        let batches = vec![vec![
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![2])),
+                    Arc::new(Int64Array::from(vec![2])),
+                    Arc::new(Int64Array::from(vec![1])),
+                ],
+            )?,
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![1])),
+                    Arc::new(Int64Array::from(vec![1])),
+                    Arc::new(Int64Array::from(vec![1])),
+                ],
+            )?,
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![0])),
+                    Arc::new(Int64Array::from(vec![0])),
+                    Arc::new(Int64Array::from(vec![1])),
+                ],
+            )?,
+        ]];
+        let scan = TestMemoryExec::try_new(&batches, Arc::clone(&schema), None)?;
+        let scan = scan.try_with_sort_information(vec![
+            LexOrdering::new([PhysicalSortExpr::new(
+                col("b", schema.as_ref())?,
+                SortOptions::default().desc(),
+            )])
+            .unwrap(),
+        ])?;
+
+        let aggr = Arc::new(AggregateExec::try_new(
+            AggregateMode::Single,
+            PhysicalGroupBy::new(
+                vec![
+                    (col("b", schema.as_ref())?, "b".to_string()),
+                    (col("c", schema.as_ref())?, "c".to_string()),
+                ],
+                vec![],
+                vec![vec![false, false]],
+                false,
+            ),
+            vec![Arc::new(
+                AggregateExprBuilder::new(sum_udaf(), vec![col("c", schema.as_ref())?])
+                    .schema(Arc::clone(&schema))
+                    .alias("SUM(c)")
+                    .build()?,
+            )],
+            vec![None],
+            Arc::new(scan) as Arc<dyn ExecutionPlan>,
+            Arc::clone(&schema),
+        )?);
+
+        let task_ctx = new_spill_ctx(1, 600);
+        let result = collect(aggr.execute(0, Arc::clone(&task_ctx))?).await?;
+        assert_spill_count_metric(true, aggr);
+
+        allow_duplicates! {
+            assert_snapshot!(batches_to_string(&result), @r"
+            +---+---+--------+
+            | b | c | SUM(c) |
+            +---+---+--------+
+            | 2 | 1 | 1      |
+            | 1 | 1 | 1      |
+            | 0 | 1 | 1      |
+            +---+---+--------+
+        ");
+        }
+        Ok(())
+    }
+
+    /// Tests that when the memory pool is too small to accommodate the sort
+    /// reservation during spill, the error is properly propagated as
+    /// ResourcesExhausted rather than silently exceeding memory limits.
+    #[tokio::test]
+    async fn test_sort_reservation_fails_during_spill() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("g", DataType::Int64, false),
+            Field::new("a", DataType::Float64, false),
+            Field::new("b", DataType::Float64, false),
+            Field::new("c", DataType::Float64, false),
+            Field::new("d", DataType::Float64, false),
+            Field::new("e", DataType::Float64, false),
+        ]));
+
+        let batches = vec![vec![
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![1])),
+                    Arc::new(Float64Array::from(vec![10.0])),
+                    Arc::new(Float64Array::from(vec![20.0])),
+                    Arc::new(Float64Array::from(vec![30.0])),
+                    Arc::new(Float64Array::from(vec![40.0])),
+                    Arc::new(Float64Array::from(vec![50.0])),
+                ],
+            )?,
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![2])),
+                    Arc::new(Float64Array::from(vec![11.0])),
+                    Arc::new(Float64Array::from(vec![21.0])),
+                    Arc::new(Float64Array::from(vec![31.0])),
+                    Arc::new(Float64Array::from(vec![41.0])),
+                    Arc::new(Float64Array::from(vec![51.0])),
+                ],
+            )?,
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(Int64Array::from(vec![3])),
+                    Arc::new(Float64Array::from(vec![12.0])),
+                    Arc::new(Float64Array::from(vec![22.0])),
+                    Arc::new(Float64Array::from(vec![32.0])),
+                    Arc::new(Float64Array::from(vec![42.0])),
+                    Arc::new(Float64Array::from(vec![52.0])),
+                ],
+            )?,
+        ]];
+
+        let scan = TestMemoryExec::try_new(&batches, Arc::clone(&schema), None)?;
+
+        let aggr = Arc::new(AggregateExec::try_new(
+            AggregateMode::Single,
+            PhysicalGroupBy::new(
+                vec![(col("g", schema.as_ref())?, "g".to_string())],
+                vec![],
+                vec![vec![false]],
+                false,
+            ),
+            vec![
+                Arc::new(
+                    AggregateExprBuilder::new(
+                        avg_udaf(),
+                        vec![col("a", schema.as_ref())?],
+                    )
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(a)")
+                    .build()?,
+                ),
+                Arc::new(
+                    AggregateExprBuilder::new(
+                        avg_udaf(),
+                        vec![col("b", schema.as_ref())?],
+                    )
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(b)")
+                    .build()?,
+                ),
+                Arc::new(
+                    AggregateExprBuilder::new(
+                        avg_udaf(),
+                        vec![col("c", schema.as_ref())?],
+                    )
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(c)")
+                    .build()?,
+                ),
+                Arc::new(
+                    AggregateExprBuilder::new(
+                        avg_udaf(),
+                        vec![col("d", schema.as_ref())?],
+                    )
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(d)")
+                    .build()?,
+                ),
+                Arc::new(
+                    AggregateExprBuilder::new(
+                        avg_udaf(),
+                        vec![col("e", schema.as_ref())?],
+                    )
+                    .schema(Arc::clone(&schema))
+                    .alias("AVG(e)")
+                    .build()?,
+                ),
+            ],
+            vec![None, None, None, None, None],
+            Arc::new(scan) as Arc<dyn ExecutionPlan>,
+            Arc::clone(&schema),
+        )?);
+
+        // Pool must be large enough for accumulation to start but too small for
+        // sort_memory after clearing.
+        let task_ctx = new_spill_ctx(1, 500);
+        let result = collect(aggr.execute(0, Arc::clone(&task_ctx))?).await;
+
+        match &result {
+            Ok(_) => panic!("Expected ResourcesExhausted error but query succeeded"),
+            Err(e) => {
+                let root = e.find_root();
+                assert!(
+                    matches!(root, DataFusionError::ResourcesExhausted(_)),
+                    "Expected ResourcesExhausted, got: {root}",
+                );
+                let msg = root.to_string();
+                assert!(
+                    msg.contains("Failed to reserve memory for sort during spill"),
+                    "Expected sort reservation error, got: {msg}",
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Tests that PartialReduce mode:
+    /// 1. Accepts state as input (like Final)
+    /// 2. Produces state as output (like Partial)
+    /// 3. Can be followed by a Final stage to get the correct result
+    ///
+    /// This simulates a tree-reduce pattern:
+    ///   Partial -> PartialReduce -> Final
+    #[tokio::test]
+    async fn test_partial_reduce_mode() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::UInt32, false),
+            Field::new("b", DataType::Float64, false),
+        ]));
+
+        // Produce two partitions of input data
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(UInt32Array::from(vec![1, 2, 3])),
+                Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])),
+            ],
+        )?;
+        let batch2 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(UInt32Array::from(vec![1, 2, 3])),
+                Arc::new(Float64Array::from(vec![40.0, 50.0, 60.0])),
+            ],
+        )?;
+
+        let groups =
+            PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]);
+        let aggregates: Vec<Arc<AggregateFunctionExpr>> = vec![Arc::new(
+            AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("SUM(b)")
+                .build()?,
+        )];
+
+        // Step 1: Partial aggregation on partition 1
+        let input1 =
+            TestMemoryExec::try_new_exec(&[vec![batch1]], Arc::clone(&schema), None)?;
+        let partial1 = Arc::new(AggregateExec::try_new(
+            AggregateMode::Partial,
+            groups.clone(),
+            aggregates.clone(),
+            vec![None],
+            input1,
+            Arc::clone(&schema),
+        )?);
+
+        // Step 2: Partial aggregation on partition 2
+        let input2 =
+            TestMemoryExec::try_new_exec(&[vec![batch2]], Arc::clone(&schema), None)?;
+        let partial2 = Arc::new(AggregateExec::try_new(
+            AggregateMode::Partial,
+            groups.clone(),
+            aggregates.clone(),
+            vec![None],
+            input2,
+            Arc::clone(&schema),
+        )?);
+
+        // Collect partial results
+        let task_ctx = Arc::new(TaskContext::default());
+        let partial_result1 =
+            crate::collect(Arc::clone(&partial1) as _, Arc::clone(&task_ctx)).await?;
+        let partial_result2 =
+            crate::collect(Arc::clone(&partial2) as _, Arc::clone(&task_ctx)).await?;
+
+        // The partial results have state schema (group cols + accumulator state)
+        let partial_schema = partial1.schema();
+
+        // Step 3: PartialReduce — combine partial results, still producing state
+        let combined_input = TestMemoryExec::try_new_exec(
+            &[partial_result1, partial_result2],
+            Arc::clone(&partial_schema),
+            None,
+        )?;
+        // Coalesce into a single partition for the PartialReduce
+        let coalesced = Arc::new(CoalescePartitionsExec::new(combined_input));
+
+        let partial_reduce = Arc::new(AggregateExec::try_new(
+            AggregateMode::PartialReduce,
+            groups.clone(),
+            aggregates.clone(),
+            vec![None],
+            coalesced,
+            Arc::clone(&partial_schema),
+        )?);
+
+        // Verify PartialReduce output schema matches Partial output schema
+        // (both produce state, not final values)
+        assert_eq!(partial_reduce.schema(), partial_schema);
+
+        // Collect PartialReduce results
+        let reduce_result =
+            crate::collect(Arc::clone(&partial_reduce) as _, Arc::clone(&task_ctx))
+                .await?;
+
+        // Step 4: Final aggregation on the PartialReduce output
+        let final_input = TestMemoryExec::try_new_exec(
+            &[reduce_result],
+            Arc::clone(&partial_schema),
+            None,
+        )?;
+        let final_agg = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            groups.clone(),
+            aggregates.clone(),
+            vec![None],
+            final_input,
+            Arc::clone(&partial_schema),
+        )?);
+
+        let result = crate::collect(final_agg, Arc::clone(&task_ctx)).await?;
+
+        // Expected: group 1 -> 10+40=50, group 2 -> 20+50=70, group 3 -> 30+60=90
+        assert_snapshot!(batches_to_sort_string(&result), @r"
+            +---+--------+
+            | a | SUM(b) |
+            +---+--------+
+            | 1 | 50.0   |
+            | 2 | 70.0   |
+            | 3 | 90.0   |
+            +---+--------+
+        ");
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/aggregates/no_grouping.rs b/datafusion/physical-plan/src/aggregates/no_grouping.rs
index 9474a5f88c92a..a7dd7c9a66cb1 100644
--- a/datafusion/physical-plan/src/aggregates/no_grouping.rs
+++ b/datafusion/physical-plan/src/aggregates/no_grouping.rs
@@ -18,27 +18,31 @@
 //! Aggregate without grouping columns
 
 use crate::aggregates::{
-    aggregate_expressions, create_accumulators, finalize_aggregation, AccumulatorItem,
-    AggregateMode,
+    AccumulatorItem, AggrDynFilter, AggregateInputMode, AggregateMode,
+    DynamicFilterAggregateType, aggregate_expressions, create_accumulators,
+    finalize_aggregation,
 };
 use crate::metrics::{BaselineMetrics, RecordOutput};
 use crate::{RecordBatchStream, SendableRecordBatchStream};
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::Result;
+use datafusion_common::{Result, ScalarValue, internal_datafusion_err, internal_err};
 use datafusion_execution::TaskContext;
+use datafusion_expr::Operator;
 use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::expressions::{BinaryExpr, lit};
 use futures::stream::BoxStream;
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
+use super::AggregateExec;
 use crate::filter::batch_filter;
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use futures::stream::{Stream, StreamExt};
 
-use super::AggregateExec;
-
 /// stream struct for aggregation without grouping columns
 pub(crate) struct AggregateStream {
     stream: BoxStream<'static, Result<RecordBatch>>,
@@ -53,44 +57,268 @@ pub(crate) struct AggregateStream {
 ///
 /// The latter requires a state object, which is [`AggregateStreamInner`].
 struct AggregateStreamInner {
+    // ==== Properties ====
     schema: SchemaRef,
     mode: AggregateMode,
     input: SendableRecordBatchStream,
-    baseline_metrics: BaselineMetrics,
     aggregate_expressions: Vec<Vec<Arc<dyn PhysicalExpr>>>,
-    filter_expressions: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    filter_expressions: Arc<[Option<Arc<dyn PhysicalExpr>>]>,
+
+    // ==== Runtime States/Buffers ====
     accumulators: Vec<AccumulatorItem>,
-    reservation: MemoryReservation,
+    // None if the dynamic filter is not applicable. See details in `AggrDynFilter`.
+    agg_dyn_filter_state: Option<Arc<AggrDynFilter>>,
     finished: bool,
+
+    // ==== Execution Resources ====
+    baseline_metrics: BaselineMetrics,
+    reservation: MemoryReservation,
+}
+
+impl AggregateStreamInner {
+    // TODO: check if we get Null handling correct
+    /// # Examples
+    /// - Example 1
+    ///   Accumulators: min(c1)
+    ///   Current Bounds: min(c1)=10
+    ///   --> dynamic filter PhysicalExpr: c1 < 10
+    ///
+    /// - Example 2
+    ///   Accumulators: min(c1), max(c1), min(c2)
+    ///   Current Bounds: min(c1)=10, max(c1)=100, min(c2)=20
+    ///   --> dynamic filter PhysicalExpr: (c1 < 10) OR (c1>100) OR (c2 < 20)
+    ///
+    /// # Errors
+    /// Returns internal errors if the dynamic filter is not enabled, or other
+    /// invariant check fails.
+    fn build_dynamic_filter_from_accumulator_bounds(
+        &self,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        let Some(filter_state) = self.agg_dyn_filter_state.as_ref() else {
+            return internal_err!(
+                "`build_dynamic_filter_from_accumulator_bounds()` is only called when dynamic filter is enabled"
+            );
+        };
+
+        let mut predicates: Vec<Arc<dyn PhysicalExpr>> =
+            Vec::with_capacity(filter_state.supported_accumulators_info.len());
+
+        for acc_info in &filter_state.supported_accumulators_info {
+            // Skip if we don't yet have a meaningful bound
+            let bound = {
+                let guard = acc_info.shared_bound.lock();
+                if (*guard).is_null() {
+                    continue;
+                }
+                guard.clone()
+            };
+
+            let agg_exprs = self
+                .aggregate_expressions
+                .get(acc_info.aggr_index)
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Invalid aggregate expression index {} for dynamic filter",
+                        acc_info.aggr_index
+                    )
+                })?;
+            // Only aggregates with a single argument are supported.
+            let column_expr = agg_exprs.first().ok_or_else(|| {
+                internal_datafusion_err!(
+                    "Aggregate expression at index {} expected a single argument",
+                    acc_info.aggr_index
+                )
+            })?;
+
+            let literal = lit(bound);
+            let predicate: Arc<dyn PhysicalExpr> = match acc_info.aggr_type {
+                DynamicFilterAggregateType::Min => Arc::new(BinaryExpr::new(
+                    Arc::clone(column_expr),
+                    Operator::Lt,
+                    literal,
+                )),
+                DynamicFilterAggregateType::Max => Arc::new(BinaryExpr::new(
+                    Arc::clone(column_expr),
+                    Operator::Gt,
+                    literal,
+                )),
+            };
+            predicates.push(predicate);
+        }
+
+        let combined = predicates.into_iter().reduce(|acc, pred| {
+            Arc::new(BinaryExpr::new(acc, Operator::Or, pred)) as Arc<dyn PhysicalExpr>
+        });
+
+        Ok(combined.unwrap_or_else(|| lit(true)))
+    }
+
+    // If the dynamic filter is enabled, update it using the current accumulator's
+    // values
+    fn maybe_update_dyn_filter(&mut self) -> Result<()> {
+        // Step 1: Update each partition's current bound
+        let Some(filter_state) = self.agg_dyn_filter_state.as_ref() else {
+            return Ok(());
+        };
+
+        let mut bounds_changed = false;
+
+        for acc_info in &filter_state.supported_accumulators_info {
+            let acc =
+                self.accumulators
+                    .get_mut(acc_info.aggr_index)
+                    .ok_or_else(|| {
+                        internal_datafusion_err!(
+                            "Invalid accumulator index {} for dynamic filter",
+                            acc_info.aggr_index
+                        )
+                    })?;
+            // First get current partition's bound, then update the shared bound among
+            // all partitions.
+            let current_bound = acc.evaluate()?;
+            {
+                let mut bound = acc_info.shared_bound.lock();
+                let new_bound = match acc_info.aggr_type {
+                    DynamicFilterAggregateType::Max => {
+                        scalar_max(&bound, &current_bound)?
+                    }
+                    DynamicFilterAggregateType::Min => {
+                        scalar_min(&bound, &current_bound)?
+                    }
+                };
+                if new_bound != *bound {
+                    *bound = new_bound;
+                    bounds_changed = true;
+                }
+            }
+        }
+
+        // Step 2: Sync the dynamic filter physical expression with reader,
+        // but only if any bound actually changed.
+        if bounds_changed {
+            let predicate = self.build_dynamic_filter_from_accumulator_bounds()?;
+            filter_state.filter.update(predicate)?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Returns the element-wise minimum of two `ScalarValue`s.
+///
+/// # Null semantics
+/// - `min(NULL, NULL)      = NULL`
+/// - `min(NULL, x)         = x`
+/// - `min(x, NULL)         = x`
+///
+/// # Errors
+/// Returns internal error if v1 and v2 has incompatible types.
+fn scalar_min(v1: &ScalarValue, v2: &ScalarValue) -> Result<ScalarValue> {
+    if let Some(result) = scalar_cmp_null_short_circuit(v1, v2) {
+        return Ok(result);
+    }
+
+    match v1.partial_cmp(v2) {
+        Some(Ordering::Less | Ordering::Equal) => Ok(v1.clone()),
+        Some(Ordering::Greater) => Ok(v2.clone()),
+        None => datafusion_common::internal_err!(
+            "cannot compare values of different or incompatible types: {v1:?} vs {v2:?}"
+        ),
+    }
+}
+
+/// Returns the element-wise maximum of two `ScalarValue`s.
+///
+/// # Null semantics
+/// - `max(NULL, NULL)      = NULL`
+/// - `max(NULL, x)         = x`
+/// - `max(x, NULL)         = x`
+///
+/// # Errors
+/// Returns internal error if v1 and v2 has incompatible types.
+fn scalar_max(v1: &ScalarValue, v2: &ScalarValue) -> Result<ScalarValue> {
+    if let Some(result) = scalar_cmp_null_short_circuit(v1, v2) {
+        return Ok(result);
+    }
+
+    match v1.partial_cmp(v2) {
+        Some(Ordering::Greater | Ordering::Equal) => Ok(v1.clone()),
+        Some(Ordering::Less) => Ok(v2.clone()),
+        None => datafusion_common::internal_err!(
+            "cannot compare values of different or incompatible types: {v1:?} vs {v2:?}"
+        ),
+    }
+}
+
+fn scalar_cmp_null_short_circuit(
+    v1: &ScalarValue,
+    v2: &ScalarValue,
+) -> Option<ScalarValue> {
+    match (v1, v2) {
+        (ScalarValue::Null, ScalarValue::Null) => Some(ScalarValue::Null),
+        (ScalarValue::Null, other) | (other, ScalarValue::Null) => Some(other.clone()),
+        _ => None,
+    }
+}
+
+/// Prepend the grouping ID column to the output columns if present.
+///
+/// For GROUPING SETS with no GROUP BY expressions, the schema includes a `__grouping_id`
+/// column that must be present in the output. This function inserts it at the beginning
+/// of the columns array to maintain schema alignment.
+fn prepend_grouping_id_column(
+    mut columns: Vec<Arc<dyn arrow::array::Array>>,
+    grouping_id: Option<&ScalarValue>,
+) -> Result<Vec<Arc<dyn arrow::array::Array>>> {
+    if let Some(id) = grouping_id {
+        let num_rows = columns.first().map(|array| array.len()).unwrap_or(1);
+        let grouping_ids = id.to_array_of_size(num_rows)?;
+        columns.insert(0, grouping_ids);
+    }
+    Ok(columns)
 }
 
 impl AggregateStream {
     /// Create a new AggregateStream
     pub fn new(
         agg: &AggregateExec,
-        context: Arc<TaskContext>,
+        context: &Arc<TaskContext>,
         partition: usize,
     ) -> Result<Self> {
         let agg_schema = Arc::clone(&agg.schema);
-        let agg_filter_expr = agg.filter_expr.clone();
+        let agg_filter_expr = Arc::clone(&agg.filter_expr);
 
         let baseline_metrics = BaselineMetrics::new(&agg.metrics, partition);
-        let input = agg.input.execute(partition, Arc::clone(&context))?;
+        let input = agg.input.execute(partition, Arc::clone(context))?;
 
         let aggregate_expressions = aggregate_expressions(&agg.aggr_expr, &agg.mode, 0)?;
-        let filter_expressions = match agg.mode {
-            AggregateMode::Partial
-            | AggregateMode::Single
-            | AggregateMode::SinglePartitioned => agg_filter_expr,
-            AggregateMode::Final | AggregateMode::FinalPartitioned => {
-                vec![None; agg.aggr_expr.len()]
-            }
+        let filter_expressions = match agg.mode.input_mode() {
+            AggregateInputMode::Raw => agg_filter_expr,
+            AggregateInputMode::Partial => vec![None; agg.aggr_expr.len()].into(),
         };
         let accumulators = create_accumulators(&agg.aggr_expr)?;
 
         let reservation = MemoryConsumer::new(format!("AggregateStream[{partition}]"))
             .register(context.memory_pool());
 
+        // Enable dynamic filter if:
+        // 1. AggregateExec did the check and ensure it supports the dynamic filter
+        //    (its dynamic_filter field will be Some(..))
+        // 2. Aggregate dynamic filter is enabled from the config
+        let mut maybe_dynamic_filter = match agg.dynamic_filter.as_ref() {
+            Some(filter) => Some(Arc::clone(filter)),
+            _ => None,
+        };
+
+        if !context
+            .session_config()
+            .options()
+            .optimizer
+            .enable_aggregate_dynamic_filter_pushdown
+        {
+            maybe_dynamic_filter = None;
+        }
+
         let inner = AggregateStreamInner {
             schema: Arc::clone(&agg.schema),
             mode: agg.mode,
@@ -101,27 +329,33 @@ impl AggregateStream {
             accumulators,
             reservation,
             finished: false,
+            agg_dyn_filter_state: maybe_dynamic_filter,
         };
+
         let stream = futures::stream::unfold(inner, |mut this| async move {
             if this.finished {
                 return None;
             }
 
-            let elapsed_compute = this.baseline_metrics.elapsed_compute();
-
             loop {
                 let result = match this.input.next().await {
                     Some(Ok(batch)) => {
-                        let timer = elapsed_compute.timer();
-                        let result = aggregate_batch(
-                            &this.mode,
-                            batch,
-                            &mut this.accumulators,
-                            &this.aggregate_expressions,
-                            &this.filter_expressions,
-                        );
+                        let result = {
+                            let elapsed_compute = this.baseline_metrics.elapsed_compute();
+                            let _timer = elapsed_compute.timer(); // Stops on drop
+                            aggregate_batch(
+                                &this.mode,
+                                &batch,
+                                &mut this.accumulators,
+                                &this.aggregate_expressions,
+                                &this.filter_expressions,
+                            )
+                        };
 
-                        timer.done();
+                        let result = result.and_then(|allocated| {
+                            this.maybe_update_dyn_filter()?;
+                            Ok(allocated)
+                        });
 
                         // allocate memory
                         // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with
@@ -139,6 +373,9 @@ impl AggregateStream {
                         let timer = this.baseline_metrics.elapsed_compute().timer();
                         let result =
                             finalize_aggregation(&mut this.accumulators, &this.mode)
+                                .and_then(|columns| {
+                                    prepend_grouping_id_column(columns, None)
+                                })
                                 .and_then(|columns| {
                                     RecordBatch::try_new(
                                         Arc::clone(&this.schema),
@@ -195,7 +432,7 @@ impl RecordBatchStream for AggregateStream {
 /// TODO: Make this a member function
 fn aggregate_batch(
     mode: &AggregateMode,
-    batch: RecordBatch,
+    batch: &RecordBatch,
     accumulators: &mut [AccumulatorItem],
     expressions: &[Vec<Arc<dyn PhysicalExpr>>],
     filters: &[Option<Arc<dyn PhysicalExpr>>],
@@ -215,27 +452,18 @@ fn aggregate_batch(
         .try_for_each(|((accum, expr), filter)| {
             // 1.2
             let batch = match filter {
-                Some(filter) => Cow::Owned(batch_filter(&batch, filter)?),
-                None => Cow::Borrowed(&batch),
+                Some(filter) => Cow::Owned(batch_filter(batch, filter)?),
+                None => Cow::Borrowed(batch),
             };
 
-            let n_rows = batch.num_rows();
-
             // 1.3
-            let values = expr
-                .iter()
-                .map(|e| e.evaluate(&batch).and_then(|v| v.into_array(n_rows)))
-                .collect::<Result<Vec<_>>>()?;
+            let values = evaluate_expressions_to_arrays(expr, batch.as_ref())?;
 
             // 1.4
             let size_pre = accum.size();
-            let res = match mode {
-                AggregateMode::Partial
-                | AggregateMode::Single
-                | AggregateMode::SinglePartitioned => accum.update_batch(&values),
-                AggregateMode::Final | AggregateMode::FinalPartitioned => {
-                    accum.merge_batch(&values)
-                }
+            let res = match mode.input_mode() {
+                AggregateInputMode::Raw => accum.update_batch(&values),
+                AggregateInputMode::Partial => accum.merge_batch(&values),
             };
             let size_post = accum.size();
             allocated += size_post.saturating_sub(size_pre);
diff --git a/datafusion/physical-plan/src/aggregates/order/mod.rs b/datafusion/physical-plan/src/aggregates/order/mod.rs
index 0b742b3d20fdc..e33a0287986e2 100644
--- a/datafusion/physical-plan/src/aggregates/order/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/order/mod.rs
@@ -15,12 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::mem::size_of;
+
 use arrow::array::ArrayRef;
-use arrow::datatypes::Schema;
 use datafusion_common::Result;
 use datafusion_expr::EmitTo;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use std::mem::size_of;
 
 mod full;
 mod partial;
@@ -42,22 +41,19 @@ pub enum GroupOrdering {
 
 impl GroupOrdering {
     /// Create a `GroupOrdering` for the specified ordering
-    pub fn try_new(
-        input_schema: &Schema,
-        mode: &InputOrderMode,
-        ordering: &LexOrdering,
-    ) -> Result<Self> {
+    pub fn try_new(mode: &InputOrderMode) -> Result<Self> {
         match mode {
             InputOrderMode::Linear => Ok(GroupOrdering::None),
             InputOrderMode::PartiallySorted(order_indices) => {
-                GroupOrderingPartial::try_new(input_schema, order_indices, ordering)
+                GroupOrderingPartial::try_new(order_indices.clone())
                     .map(GroupOrdering::Partial)
             }
             InputOrderMode::Sorted => Ok(GroupOrdering::Full(GroupOrderingFull::new())),
         }
     }
 
-    // How many groups be emitted, or None if no data can be emitted
+    /// Returns how many groups can be emitted while respecting the current
+    /// ordering guarantees, or `None` if no data can be emitted.
     pub fn emit_to(&self) -> Option<EmitTo> {
         match self {
             GroupOrdering::None => None,
@@ -66,7 +62,29 @@ impl GroupOrdering {
         }
     }
 
-    /// Updates the state the input is done
+    /// Returns the emit strategy to use under memory pressure (OOM).
+    ///
+    /// Returns the strategy that must be used when emitting up to `n` groups
+    /// while respecting the current ordering guarantees.
+    ///
+    /// Returns `None` if no data can be emitted.
+    pub fn oom_emit_to(&self, n: usize) -> Option<EmitTo> {
+        if n == 0 {
+            return None;
+        }
+
+        match self {
+            GroupOrdering::None => Some(EmitTo::First(n)),
+            GroupOrdering::Partial(_) | GroupOrdering::Full(_) => {
+                self.emit_to().map(|emit_to| match emit_to {
+                    EmitTo::First(max) => EmitTo::First(n.min(max)),
+                    EmitTo::All => EmitTo::First(n),
+                })
+            }
+        }
+    }
+
+    /// Updates the state to indicate that the input is complete.
     pub fn input_done(&mut self) {
         match self {
             GroupOrdering::None => {}
@@ -75,8 +93,8 @@ impl GroupOrdering {
         }
     }
 
-    /// remove the first n groups from the internal state, shifting
-    /// all existing indexes down by `n`
+    /// Removes the first `n` groups from the internal state, shifting all
+    /// existing indexes down by `n`.
     pub fn remove_groups(&mut self, n: usize) {
         match self {
             GroupOrdering::None => {}
@@ -85,16 +103,14 @@ impl GroupOrdering {
         }
     }
 
-    /// Called when new groups are added in a batch
-    ///
-    /// * `total_num_groups`: total number of groups (so max
-    ///   group_index is total_num_groups - 1).
+    /// Called when new groups are added in a batch.
     ///
-    /// * `group_values`: group key values for *each row* in the batch
+    /// * `batch_group_values`: group key values for each row in the batch
     ///
     /// * `group_indices`: indices for each row in the batch
     ///
-    /// * `hashes`: hash values for each row in the batch
+    /// * `total_num_groups`: total number of groups (so max
+    ///   group_index is total_num_groups - 1).
     pub fn new_groups(
         &mut self,
         batch_group_values: &[ArrayRef],
@@ -117,7 +133,7 @@ impl GroupOrdering {
         Ok(())
     }
 
-    /// Return the size of memory used by the ordering state, in bytes
+    /// Returns the size of memory used by the ordering state, in bytes.
     pub fn size(&self) -> usize {
         size_of::<Self>()
             + match self {
@@ -127,3 +143,63 @@ impl GroupOrdering {
             }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::sync::Arc;
+
+    use arrow::array::{ArrayRef, Int32Array};
+
+    #[test]
+    fn test_oom_emit_to_none_ordering() {
+        let group_ordering = GroupOrdering::None;
+
+        assert_eq!(group_ordering.oom_emit_to(0), None);
+        assert_eq!(group_ordering.oom_emit_to(5), Some(EmitTo::First(5)));
+    }
+
+    /// Creates a partially ordered grouping state with three groups.
+    ///
+    /// `sort_key_values` controls whether a sort boundary exists in the batch:
+    /// distinct values such as `[1, 2, 3]` create boundaries, while repeated
+    /// values such as `[1, 1, 1]` do not.
+    fn partial_ordering(sort_key_values: Vec<i32>) -> Result<GroupOrdering> {
+        let mut group_ordering =
+            GroupOrdering::Partial(GroupOrderingPartial::try_new(vec![0])?);
+
+        let batch_group_values: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(sort_key_values)),
+            Arc::new(Int32Array::from(vec![10, 20, 30])),
+        ];
+        let group_indices = vec![0, 1, 2];
+
+        group_ordering.new_groups(&batch_group_values, &group_indices, 3)?;
+
+        Ok(group_ordering)
+    }
+
+    #[test]
+    fn test_oom_emit_to_partial_clamps_to_boundary() -> Result<()> {
+        let group_ordering = partial_ordering(vec![1, 2, 3])?;
+
+        // Can emit both `1` and `2` groups because we have seen `3`
+        assert_eq!(group_ordering.emit_to(), Some(EmitTo::First(2)));
+        assert_eq!(group_ordering.oom_emit_to(1), Some(EmitTo::First(1)));
+        assert_eq!(group_ordering.oom_emit_to(3), Some(EmitTo::First(2)));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_oom_emit_to_partial_without_boundary() -> Result<()> {
+        let group_ordering = partial_ordering(vec![1, 1, 1])?;
+
+        // Can't emit the last `1` group as it may have more values
+        assert_eq!(group_ordering.emit_to(), None);
+        assert_eq!(group_ordering.oom_emit_to(3), None);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs
index c7a75e5f26404..476551a7ca210 100644
--- a/datafusion/physical-plan/src/aggregates/order/partial.rs
+++ b/datafusion/physical-plan/src/aggregates/order/partial.rs
@@ -15,18 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::cmp::Ordering;
+use std::mem::size_of;
+use std::sync::Arc;
+
 use arrow::array::ArrayRef;
 use arrow::compute::SortOptions;
-use arrow::datatypes::Schema;
 use arrow_ord::partition::partition;
 use datafusion_common::utils::{compare_rows, get_row_at_idx};
 use datafusion_common::{Result, ScalarValue};
 use datafusion_execution::memory_pool::proxy::VecAllocExt;
 use datafusion_expr::EmitTo;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
-use std::cmp::Ordering;
-use std::mem::size_of;
-use std::sync::Arc;
 
 /// Tracks grouping state when the data is ordered by some subset of
 /// the group keys.
@@ -62,7 +61,7 @@ use std::sync::Arc;
 ///  group indices
 /// (in group value  group_values               current tracks the most
 ///      order)                                    recent group index
-///```
+/// ```
 #[derive(Debug)]
 pub struct GroupOrderingPartial {
     /// State machine
@@ -118,17 +117,11 @@ impl State {
 
 impl GroupOrderingPartial {
     /// TODO: Remove unnecessary `input_schema` parameter.
-    pub fn try_new(
-        _input_schema: &Schema,
-        order_indices: &[usize],
-        ordering: &LexOrdering,
-    ) -> Result<Self> {
-        assert!(!order_indices.is_empty());
-        assert!(order_indices.len() <= ordering.len());
-
+    pub fn try_new(order_indices: Vec<usize>) -> Result<Self> {
+        debug_assert!(!order_indices.is_empty());
         Ok(Self {
             state: State::Start,
-            order_indices: order_indices.to_vec(),
+            order_indices,
         })
     }
 
@@ -276,29 +269,15 @@ impl GroupOrderingPartial {
 
 #[cfg(test)]
 mod tests {
-    use arrow::array::Int32Array;
-    use arrow_schema::{DataType, Field};
-    use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
-
     use super::*;
 
+    use arrow::array::Int32Array;
+
     #[test]
     fn test_group_ordering_partial() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("a", DataType::Int32, false),
-            Field::new("b", DataType::Int32, false),
-        ]);
-
         // Ordered on column a
         let order_indices = vec![0];
-
-        let ordering = LexOrdering::new(vec![PhysicalSortExpr::new(
-            col("a", &schema)?,
-            SortOptions::default(),
-        )]);
-
-        let mut group_ordering =
-            GroupOrderingPartial::try_new(&schema, &order_indices, &ordering)?;
+        let mut group_ordering = GroupOrderingPartial::try_new(order_indices)?;
 
         let batch_group_values: Vec<ArrayRef> = vec![
             Arc::new(Int32Array::from(vec![1, 2, 3])),
diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs
index 62f541443068f..b857fdca3f21d 100644
--- a/datafusion/physical-plan/src/aggregates/row_hash.rs
+++ b/datafusion/physical-plan/src/aggregates/row_hash.rs
@@ -21,36 +21,38 @@ use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::vec;
 
-use crate::aggregates::group_values::{new_group_values, GroupValues};
+use super::AggregateExec;
+use super::order::GroupOrdering;
+use crate::aggregates::group_values::{GroupByMetrics, GroupValues, new_group_values};
 use crate::aggregates::order::GroupOrderingFull;
 use crate::aggregates::{
-    create_schema, evaluate_group_by, evaluate_many, evaluate_optional, AggregateMode,
-    PhysicalGroupBy,
+    AggregateInputMode, AggregateMode, AggregateOutputMode, PhysicalGroupBy,
+    create_schema, evaluate_group_by, evaluate_many, evaluate_optional,
 };
 use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput};
-use crate::sorts::sort::sort_batch;
-use crate::sorts::streaming_merge::StreamingMergeBuilder;
-use crate::spill::spill_manager::SpillManager;
-use crate::stream::RecordBatchStreamAdapter;
-use crate::{aggregates, metrics, ExecutionPlan, PhysicalExpr};
+use crate::sorts::streaming_merge::{SortedSpillFile, StreamingMergeBuilder};
+use crate::spill::spill_manager::{GetSlicedSize, SpillManager};
+use crate::{PhysicalExpr, aggregates, metrics};
 use crate::{RecordBatchStream, SendableRecordBatchStream};
 
 use arrow::array::*;
-use arrow::compute::SortOptions;
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{internal_err, DataFusionError, Result};
-use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_common::{
+    DataFusionError, Result, assert_eq_or_internal_err, assert_or_internal_err,
+    internal_err, resources_datafusion_err,
+};
+use datafusion_execution::TaskContext;
 use datafusion_execution::memory_pool::proxy::VecAllocExt;
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
-use datafusion_execution::TaskContext;
 use datafusion_expr::{EmitTo, GroupsAccumulator};
+use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::{GroupsAccumulatorAdapter, PhysicalSortExpr};
-
-use super::order::GroupOrdering;
-use super::AggregateExec;
-use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
+
+use crate::sorts::IncrementalSortIterator;
+use datafusion_common::instant::Instant;
+use datafusion_common::utils::memory::get_record_batch_memory_size;
 use futures::ready;
 use futures::stream::{Stream, StreamExt};
 use log::debug;
@@ -100,7 +102,7 @@ struct SpillState {
     // ========================================================================
     /// If data has previously been spilled, the locations of the
     /// spill files (in Arrow IPC format)
-    spills: Vec<RefCountedTempFile>,
+    spills: Vec<SortedSpillFile>,
 
     /// true when streaming merge is in progress
     is_stream_merging: bool,
@@ -192,7 +194,9 @@ impl SkipAggregationProbe {
         if self.input_rows >= self.probe_rows_threshold {
             self.should_skip = self.num_groups as f64 / self.input_rows as f64
                 >= self.probe_ratio_threshold;
-            self.is_locked = true;
+            // Set is_locked to true only if we have decided to skip, otherwise we can try to skip
+            // during processing the next record_batch.
+            self.is_locked = self.should_skip;
         }
     }
 
@@ -206,6 +210,17 @@ impl SkipAggregationProbe {
     }
 }
 
+/// Controls the behavior when an out-of-memory condition occurs.
+#[derive(PartialEq, Debug)]
+enum OutOfMemoryMode {
+    /// When out of memory occurs, spill state to disk
+    Spill,
+    /// When out of memory occurs, attempt to emit group values early
+    EmitEarly,
+    /// When out of memory occurs, immediately report the error
+    ReportError,
+}
+
 /// HashTable based Grouping Aggregator
 ///
 /// # Design Goals
@@ -299,7 +314,6 @@ impl SkipAggregationProbe {
 /// later stream-merge sort on reading back the spilled data does re-grouping. Note the rows cannot
 /// be grouped once spilled onto disk, the read back data needs to be re-grouped again. In addition,
 /// re-grouping may cause out of memory again. Thus, re-grouping has to be a sort based aggregation.
-///
 /// ```text
 /// Partial Aggregation [batch_size = 2] (max memory = 3 rows)
 ///
@@ -364,10 +378,10 @@ pub(crate) struct GroupedHashAggregateStream {
     ///
     /// For example, for an aggregate like `SUM(x) FILTER (WHERE x >= 100)`,
     /// the filter expression is  `x > 100`.
-    filter_expressions: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    filter_expressions: Arc<[Option<Arc<dyn PhysicalExpr>>]>,
 
     /// GROUP BY expressions
-    group_by: PhysicalGroupBy,
+    group_by: Arc<PhysicalGroupBy>,
 
     /// max rows in output RecordBatches
     batch_size: usize,
@@ -430,29 +444,39 @@ pub(crate) struct GroupedHashAggregateStream {
     /// The memory reservation for this grouping
     reservation: MemoryReservation,
 
+    /// The behavior to trigger when out of memory occurs
+    oom_mode: OutOfMemoryMode,
+
     /// Execution metrics
     baseline_metrics: BaselineMetrics,
+
+    /// Aggregation-specific metrics
+    group_by_metrics: GroupByMetrics,
+
+    /// Reduction factor metric, calculated as `output_rows/input_rows` (only for partial aggregation)
+    reduction_factor: Option<metrics::RatioMetrics>,
 }
 
 impl GroupedHashAggregateStream {
     /// Create a new GroupedHashAggregateStream
     pub fn new(
         agg: &AggregateExec,
-        context: Arc<TaskContext>,
+        context: &Arc<TaskContext>,
         partition: usize,
     ) -> Result<Self> {
         debug!("Creating GroupedHashAggregateStream");
         let agg_schema = Arc::clone(&agg.schema);
-        let agg_group_by = agg.group_by.clone();
-        let agg_filter_expr = agg.filter_expr.clone();
+        let agg_group_by = Arc::clone(&agg.group_by);
+        let agg_filter_expr = Arc::clone(&agg.filter_expr);
 
         let batch_size = context.session_config().batch_size();
-        let input = agg.input.execute(partition, Arc::clone(&context))?;
+        let input = agg.input.execute(partition, Arc::clone(context))?;
         let baseline_metrics = BaselineMetrics::new(&agg.metrics, partition);
+        let group_by_metrics = GroupByMetrics::new(&agg.metrics, partition);
 
         let timer = baseline_metrics.elapsed_compute().timer();
 
-        let aggregate_exprs = agg.aggr_expr.clone();
+        let aggregate_exprs = Arc::clone(&agg.aggr_expr);
 
         // arguments for each aggregate, one vec of expressions per
         // aggregate
@@ -468,13 +492,9 @@ impl GroupedHashAggregateStream {
             agg_group_by.num_group_exprs(),
         )?;
 
-        let filter_expressions = match agg.mode {
-            AggregateMode::Partial
-            | AggregateMode::Single
-            | AggregateMode::SinglePartitioned => agg_filter_expr,
-            AggregateMode::Final | AggregateMode::FinalPartitioned => {
-                vec![None; agg.aggr_expr.len()]
-            }
+        let filter_expressions = match agg.mode.input_mode() {
+            AggregateInputMode::Raw => agg_filter_expr,
+            AggregateInputMode::Partial => vec![None; agg.aggr_expr.len()].into(),
         };
 
         // Instantiate the accumulators
@@ -500,12 +520,12 @@ impl GroupedHashAggregateStream {
         // Therefore, when we spill these intermediate states or pass them to another
         // aggregation operator, we must use a schema that includes both the group
         // columns **and** the partial-state columns.
-        let partial_agg_schema = create_schema(
+        let spill_schema = Arc::new(create_schema(
             &agg.input().schema(),
             &agg_group_by,
             &aggregate_exprs,
             AggregateMode::Partial,
-        )?;
+        )?);
 
         // Need to update the GROUP BY expressions to point to the correct column after schema change
         let merging_group_by_expr = agg_group_by
@@ -517,17 +537,27 @@ impl GroupedHashAggregateStream {
             })
             .collect();
 
-        let partial_agg_schema = Arc::new(partial_agg_schema);
-
-        let spill_expr = group_schema
-            .fields
-            .into_iter()
-            .enumerate()
-            .map(|(idx, field)| PhysicalSortExpr {
-                expr: Arc::new(Column::new(field.name().as_str(), idx)) as _,
-                options: SortOptions::default(),
-            })
-            .collect();
+        let output_ordering = agg.cache.output_ordering();
+
+        let spill_sort_exprs =
+            group_schema
+                .fields
+                .into_iter()
+                .enumerate()
+                .map(|(idx, field)| {
+                    let output_expr = Column::new(field.name().as_str(), idx);
+
+                    // Try to use the sort options from the output ordering, if available.
+                    // This ensures that spilled state is sorted in the required order as well.
+                    let sort_options = output_ordering
+                        .and_then(|o| o.get_sort_options(&output_expr))
+                        .unwrap_or_default();
+
+                    PhysicalSortExpr::new(Arc::new(output_expr), sort_options)
+                });
+        let Some(spill_ordering) = LexOrdering::new(spill_sort_exprs) else {
+            return internal_err!("Spill expression is empty");
+        };
 
         let agg_fn_names = aggregate_exprs
             .iter()
@@ -535,20 +565,35 @@ impl GroupedHashAggregateStream {
             .collect::<Vec<_>>()
             .join(", ");
         let name = format!("GroupedHashAggregateStream[{partition}] ({agg_fn_names})");
-        let reservation = MemoryConsumer::new(name)
-            .with_can_spill(true)
-            .register(context.memory_pool());
-        let (ordering, _) = agg
-            .properties()
-            .equivalence_properties()
-            .find_longest_permutation(&agg_group_by.output_exprs());
-        let group_ordering = GroupOrdering::try_new(
-            &group_schema,
-            &agg.input_order_mode,
-            ordering.as_ref(),
-        )?;
+        let group_ordering = GroupOrdering::try_new(&agg.input_order_mode)?;
+        let oom_mode = match (agg.mode, &group_ordering) {
+            // In partial aggregation mode, always prefer to emit incomplete results early.
+            (AggregateMode::Partial, _) => OutOfMemoryMode::EmitEarly,
+            // For non-partial aggregation modes, emitting incomplete results is not an option.
+            // Instead, use disk spilling to store sorted, incomplete results, and merge them
+            // afterwards.
+            (_, GroupOrdering::None | GroupOrdering::Partial(_))
+                if context.runtime_env().disk_manager.tmp_files_enabled() =>
+            {
+                OutOfMemoryMode::Spill
+            }
+            // For `GroupOrdering::Full`, the incoming stream is already sorted. This ensures the
+            // number of incomplete groups can be kept small at all times. If we still hit
+            // an out-of-memory condition, spilling to disk would not be beneficial since the same
+            // situation is likely to reoccur when reading back the spilled data.
+            // Therefore, we fall back to simply reporting the error immediately.
+            // This mode will also be used if the `DiskManager` is not configured to allow spilling
+            // to disk.
+            _ => OutOfMemoryMode::ReportError,
+        };
 
         let group_values = new_group_values(group_schema, &group_ordering)?;
+        let reservation = MemoryConsumer::new(name)
+            // We interpret 'can spill' as 'can handle memory back pressure'.
+            // This value needs to be set to true for the default memory pool implementations
+            // to ensure fair application of back pressure amongst the memory consumers.
+            .with_can_spill(oom_mode != OutOfMemoryMode::ReportError)
+            .register(context.memory_pool());
         timer.done();
 
         let exec_state = ExecutionState::ReadingInput;
@@ -556,13 +601,14 @@ impl GroupedHashAggregateStream {
         let spill_manager = SpillManager::new(
             context.runtime_env(),
             metrics::SpillMetrics::new(&agg.metrics, partition),
-            Arc::clone(&partial_agg_schema),
-        );
+            Arc::clone(&spill_schema),
+        )
+        .with_compression_type(context.session_config().spill_compression());
 
         let spill_state = SpillState {
             spills: vec![],
-            spill_expr,
-            spill_schema: partial_agg_schema,
+            spill_expr: spill_ordering,
+            spill_schema,
             is_stream_merging: false,
             merging_aggregate_arguments,
             merging_group_by: PhysicalGroupBy::new_single(merging_group_by_expr),
@@ -601,6 +647,16 @@ impl GroupedHashAggregateStream {
             None
         };
 
+        let reduction_factor = if agg.mode == AggregateMode::Partial {
+            Some(
+                MetricBuilder::new(&agg.metrics)
+                    .with_type(metrics::MetricType::SUMMARY)
+                    .ratio_metrics("reduction_factor", partition),
+            )
+        } else {
+            None
+        };
+
         Ok(GroupedHashAggregateStream {
             schema: agg_schema,
             input,
@@ -610,16 +666,19 @@ impl GroupedHashAggregateStream {
             filter_expressions,
             group_by: agg_group_by,
             reservation,
+            oom_mode,
             group_values,
             current_group_indices: Default::default(),
             exec_state,
             baseline_metrics,
+            group_by_metrics,
             batch_size,
             group_ordering,
             input_done: false,
             spill_state,
-            group_values_soft_limit: agg.limit,
+            group_values_soft_limit: agg.limit_options().map(|config| config.limit()),
             skip_aggregation_probe,
+            reduction_factor,
         })
     }
 }
@@ -657,18 +716,24 @@ impl Stream for GroupedHashAggregateStream {
             match &self.exec_state {
                 ExecutionState::ReadingInput => 'reading_input: {
                     match ready!(self.input.poll_next_unpin(cx)) {
-                        // New batch to aggregate in partial aggregation operator
-                        Some(Ok(batch)) if self.mode == AggregateMode::Partial => {
+                        // New batch to aggregate
+                        Some(Ok(batch)) => {
                             let timer = elapsed_compute.timer();
                             let input_rows = batch.num_rows();
 
-                            // Do the grouping
-                            self.group_aggregate_batch(batch)?;
+                            if self.mode == AggregateMode::Partial
+                                && let Some(reduction_factor) =
+                                    self.reduction_factor.as_ref()
+                            {
+                                reduction_factor.add_total(input_rows);
+                            }
 
-                            self.update_skip_aggregation_probe(input_rows);
+                            // Do the grouping.
+                            // `group_aggregate_batch` will _not_ have updated the memory reservation yet.
+                            // The rest of the code will first try to reduce memory usage by
+                            // already emitting results.
+                            self.group_aggregate_batch(&batch)?;
 
-                            // If we can begin emitting rows, do so,
-                            // otherwise keep consuming input
                             assert!(!self.input_done);
 
                             // If the number of group values equals or exceeds the soft limit,
@@ -680,7 +745,13 @@ impl Stream for GroupedHashAggregateStream {
                                 break 'reading_input;
                             }
 
-                            if let Some(to_emit) = self.group_ordering.emit_to() {
+                            // Try to emit completed groups if possible.
+                            // If we already started spilling, we can no longer emit since
+                            // this might lead to incorrect output ordering
+                            if (self.spill_state.spills.is_empty()
+                                || self.spill_state.is_stream_merging)
+                                && let Some(to_emit) = self.group_ordering.emit_to()
+                            {
                                 timer.done();
                                 if let Some(batch) = self.emit(to_emit, false)? {
                                     self.exec_state =
@@ -690,44 +761,30 @@ impl Stream for GroupedHashAggregateStream {
                                 break 'reading_input;
                             }
 
-                            self.emit_early_if_necessary()?;
-
-                            self.switch_to_skip_aggregation()?;
-
-                            timer.done();
-                        }
-
-                        // New batch to aggregate in terminal aggregation operator
-                        // (Final/FinalPartitioned/Single/SinglePartitioned)
-                        Some(Ok(batch)) => {
-                            let timer = elapsed_compute.timer();
-
-                            // Make sure we have enough capacity for `batch`, otherwise spill
-                            self.spill_previous_if_necessary(&batch)?;
-
-                            // Do the grouping
-                            self.group_aggregate_batch(batch)?;
-
-                            // If we can begin emitting rows, do so,
-                            // otherwise keep consuming input
-                            assert!(!self.input_done);
-
-                            // If the number of group values equals or exceeds the soft limit,
-                            // emit all groups and switch to producing output
-                            if self.hit_soft_group_limit() {
-                                timer.done();
-                                self.set_input_done_and_produce_output()?;
-                                // make sure the exec_state just set is not overwritten below
-                                break 'reading_input;
+                            if self.mode == AggregateMode::Partial {
+                                // Spilling should never be activated in partial aggregation mode.
+                                assert!(!self.spill_state.is_stream_merging);
+
+                                // Check if we should switch to skip aggregation mode
+                                // It's important that we do this before we early emit since we've
+                                // already updated the probe.
+                                self.update_skip_aggregation_probe(input_rows);
+                                if let Some(new_state) =
+                                    self.switch_to_skip_aggregation()?
+                                {
+                                    timer.done();
+                                    self.exec_state = new_state;
+                                    break 'reading_input;
+                                }
                             }
 
-                            if let Some(to_emit) = self.group_ordering.emit_to() {
+                            // If we reach this point, try to update the memory reservation
+                            // handling out-of-memory conditions as determined by the OOM mode.
+                            if let Some(new_state) =
+                                self.try_update_memory_reservation()?
+                            {
                                 timer.done();
-                                if let Some(batch) = self.emit(to_emit, false)? {
-                                    self.exec_state =
-                                        ExecutionState::ProducingOutput(batch);
-                                };
-                                // make sure the exec_state just set is not overwritten below
+                                self.exec_state = new_state;
                                 break 'reading_input;
                             }
 
@@ -755,7 +812,7 @@ impl Stream for GroupedHashAggregateStream {
                             if let Some(probe) = self.skip_aggregation_probe.as_mut() {
                                 probe.record_skipped(&batch);
                             }
-                            let states = self.transform_to_states(batch)?;
+                            let states = self.transform_to_states(&batch)?;
                             return Poll::Ready(Some(Ok(
                                 states.record_output(&self.baseline_metrics)
                             )));
@@ -766,6 +823,15 @@ impl Stream for GroupedHashAggregateStream {
                         }
                         None => {
                             // inner is done, switching to `Done` state
+                            // Sanity check: when switching from SkippingAggregation to Done,
+                            // all groups should have already been emitted
+                            if !self.group_values.is_empty() {
+                                return Poll::Ready(Some(internal_err!(
+                                    "Switching from SkippingAggregation to Done with {} groups still in hash table. \
+                                    This is a bug - all groups should have been emitted before skip aggregation started.",
+                                    self.group_values.len()
+                                )));
+                            }
                             self.exec_state = ExecutionState::Done;
                         }
                     }
@@ -799,6 +865,11 @@ impl Stream for GroupedHashAggregateStream {
                         let output = batch.slice(0, size);
                         (ExecutionState::ProducingOutput(remaining), output)
                     };
+
+                    if let Some(reduction_factor) = self.reduction_factor.as_ref() {
+                        reduction_factor.add_part(output_batch.num_rows());
+                    }
+
                     // Empty record batches should not be emitted.
                     // They need to be treated as  [`Option<RecordBatch>`]es and handled separately
                     debug_assert!(output_batch.num_rows() > 0);
@@ -808,6 +879,14 @@ impl Stream for GroupedHashAggregateStream {
                 }
 
                 ExecutionState::Done => {
+                    // Sanity check: all groups should have been emitted by now
+                    if !self.group_values.is_empty() {
+                        return Poll::Ready(Some(internal_err!(
+                            "AggregateStream was in Done state with {} groups left in hash table. \
+                            This is a bug - all groups should have been emitted before entering Done state.",
+                            self.group_values.len()
+                        )));
+                    }
                     // release the memory reservation since sending back output batch itself needs
                     // some memory reservation, so make some room for it.
                     self.clear_all();
@@ -827,30 +906,45 @@ impl RecordBatchStream for GroupedHashAggregateStream {
 
 impl GroupedHashAggregateStream {
     /// Perform group-by aggregation for the given [`RecordBatch`].
-    fn group_aggregate_batch(&mut self, batch: RecordBatch) -> Result<()> {
+    fn group_aggregate_batch(&mut self, batch: &RecordBatch) -> Result<()> {
         // Evaluate the grouping expressions
         let group_by_values = if self.spill_state.is_stream_merging {
-            evaluate_group_by(&self.spill_state.merging_group_by, &batch)?
+            evaluate_group_by(&self.spill_state.merging_group_by, batch)?
         } else {
-            evaluate_group_by(&self.group_by, &batch)?
+            evaluate_group_by(&self.group_by, batch)?
+        };
+
+        // Only create the timer if there are actual aggregate arguments to evaluate
+        let timer = match (
+            self.spill_state.is_stream_merging,
+            self.spill_state.merging_aggregate_arguments.is_empty(),
+            self.aggregate_arguments.is_empty(),
+        ) {
+            (true, false, _) | (false, _, false) => {
+                Some(self.group_by_metrics.aggregate_arguments_time.timer())
+            }
+            _ => None,
         };
 
         // Evaluate the aggregation expressions.
         let input_values = if self.spill_state.is_stream_merging {
-            evaluate_many(&self.spill_state.merging_aggregate_arguments, &batch)?
+            evaluate_many(&self.spill_state.merging_aggregate_arguments, batch)?
         } else {
-            evaluate_many(&self.aggregate_arguments, &batch)?
+            evaluate_many(&self.aggregate_arguments, batch)?
         };
+        drop(timer);
 
         // Evaluate the filter expressions, if any, against the inputs
         let filter_values = if self.spill_state.is_stream_merging {
             let filter_expressions = vec![None; self.accumulators.len()];
-            evaluate_optional(&filter_expressions, &batch)?
+            evaluate_optional(&filter_expressions, batch)?
         } else {
-            evaluate_optional(&self.filter_expressions, &batch)?
+            evaluate_optional(&self.filter_expressions, batch)?
         };
 
         for group_values in &group_by_values {
+            let groups_start_time = Instant::now();
+
             // calculate the group indices for each input row
             let starting_num_groups = self.group_values.len();
             self.group_values
@@ -867,6 +961,12 @@ impl GroupedHashAggregateStream {
                 )?;
             }
 
+            // Use this instant for both measurements to save a syscall
+            let agg_start_time = Instant::now();
+            self.group_by_metrics
+                .time_calculating_group_ids
+                .add_duration(agg_start_time - groups_start_time);
+
             // Gather the inputs to call the actual accumulator
             let t = self
                 .accumulators
@@ -879,51 +979,101 @@ impl GroupedHashAggregateStream {
 
                 // Call the appropriate method on each aggregator with
                 // the entire input row and the relevant group indexes
-                match self.mode {
-                    AggregateMode::Partial
-                    | AggregateMode::Single
-                    | AggregateMode::SinglePartitioned
-                        if !self.spill_state.is_stream_merging =>
-                    {
-                        acc.update_batch(
-                            values,
-                            group_indices,
-                            opt_filter,
-                            total_num_groups,
-                        )?;
-                    }
-                    _ => {
-                        if opt_filter.is_some() {
-                            return internal_err!("aggregate filter should be applied in partial stage, there should be no filter in final stage");
-                        }
-
-                        // if aggregation is over intermediate states,
-                        // use merge
-                        acc.merge_batch(values, group_indices, None, total_num_groups)?;
-                    }
+                if self.mode.input_mode() == AggregateInputMode::Raw
+                    && !self.spill_state.is_stream_merging
+                {
+                    acc.update_batch(
+                        values,
+                        group_indices,
+                        opt_filter,
+                        total_num_groups,
+                    )?;
+                } else {
+                    assert_or_internal_err!(
+                        opt_filter.is_none(),
+                        "aggregate filter should be applied in partial stage, there should be no filter in final stage"
+                    );
+
+                    // if aggregation is over intermediate states,
+                    // use merge
+                    acc.merge_batch(values, group_indices, None, total_num_groups)?;
                 }
+                self.group_by_metrics
+                    .aggregation_time
+                    .add_elapsed(agg_start_time);
             }
         }
 
-        match self.update_memory_reservation() {
-            // Here we can ignore `insufficient_capacity_err` because we will spill later,
-            // but at least one batch should fit in the memory
-            Err(DataFusionError::ResourcesExhausted(_))
-                if self.group_values.len() >= self.batch_size =>
-            {
-                Ok(())
+        Ok(())
+    }
+
+    /// Attempts to update the memory reservation. If that fails due to a
+    /// [DataFusionError::ResourcesExhausted] error, an attempt will be made to resolve
+    /// the out-of-memory condition based on the [out-of-memory handling mode](OutOfMemoryMode).
+    ///
+    /// If the out-of-memory condition can not be resolved, an `Err` value will be returned
+    ///
+    /// Returns `Ok(Some(ExecutionState))` if the state should be changed, `Ok(None)` otherwise.
+    fn try_update_memory_reservation(&mut self) -> Result<Option<ExecutionState>> {
+        let oom = match self.update_memory_reservation() {
+            Err(e @ DataFusionError::ResourcesExhausted(_)) => e,
+            Err(e) => return Err(e),
+            Ok(_) => return Ok(None),
+        };
+
+        match self.oom_mode {
+            OutOfMemoryMode::Spill if !self.group_values.is_empty() => {
+                self.spill()?;
+                self.clear_shrink(self.batch_size);
+                self.update_memory_reservation()?;
+                Ok(None)
             }
-            other => other,
+            OutOfMemoryMode::EmitEarly if self.group_values.len() > 1 => {
+                let n = if self.group_values.len() >= self.batch_size {
+                    // Try to emit an integer multiple of batch size if possible
+                    self.group_values.len() / self.batch_size * self.batch_size
+                } else {
+                    // Otherwise emit whatever we can
+                    self.group_values.len()
+                };
+
+                if let Some(emit_to) = self.group_ordering.oom_emit_to(n)
+                    && let Some(batch) = self.emit(emit_to, false)?
+                {
+                    return Ok(Some(ExecutionState::ProducingOutput(batch)));
+                }
+                Err(oom)
+            }
+            OutOfMemoryMode::EmitEarly
+            | OutOfMemoryMode::Spill
+            | OutOfMemoryMode::ReportError => Err(oom),
         }
     }
 
     fn update_memory_reservation(&mut self) -> Result<()> {
         let acc = self.accumulators.iter().map(|x| x.size()).sum::<usize>();
-        let reservation_result = self.reservation.try_resize(
-            acc + self.group_values.size()
-                + self.group_ordering.size()
-                + self.current_group_indices.allocated_size(),
-        );
+        let groups_and_acc_size = acc
+            + self.group_values.size()
+            + self.group_ordering.size()
+            + self.current_group_indices.allocated_size();
+
+        // Reserve extra headroom for sorting during potential spill.
+        // When OOM triggers, group_aggregate_batch has already processed the
+        // latest input batch, so the internal state may have grown well beyond
+        // the last successful reservation. The emit batch reflects this larger
+        // actual state, and the sort needs memory proportional to it.
+        // By reserving headroom equal to the data size, we trigger OOM earlier
+        // (before too much data accumulates), ensuring the freed reservation
+        // after clear_shrink is sufficient to cover the sort memory.
+        let sort_headroom =
+            if self.oom_mode == OutOfMemoryMode::Spill && !self.group_values.is_empty() {
+                acc + self.group_values.size()
+            } else {
+                0
+            };
+
+        let new_size = groups_and_acc_size + sort_headroom;
+        let reservation_result = self.reservation.try_resize(new_size);
 
         if reservation_result.is_ok() {
             self.spill_state
@@ -946,6 +1096,7 @@ impl GroupedHashAggregateStream {
             return Ok(None);
         }
 
+        let timer = self.group_by_metrics.emitting_time.timer();
         let mut output = self.group_values.emit(emit_to)?;
         if let EmitTo::First(n) = emit_to {
             self.group_ordering.remove_groups(n);
@@ -953,44 +1104,23 @@ impl GroupedHashAggregateStream {
 
         // Next output each aggregate value
         for acc in self.accumulators.iter_mut() {
-            match self.mode {
-                AggregateMode::Partial => output.extend(acc.state(emit_to)?),
-                _ if spilling => {
-                    // If spilling, output partial state because the spilled data will be
-                    // merged and re-evaluated later.
-                    output.extend(acc.state(emit_to)?)
-                }
-                AggregateMode::Final
-                | AggregateMode::FinalPartitioned
-                | AggregateMode::Single
-                | AggregateMode::SinglePartitioned => output.push(acc.evaluate(emit_to)?),
+            if self.mode.output_mode() == AggregateOutputMode::Final && !spilling {
+                output.push(acc.evaluate(emit_to)?)
+            } else {
+                // Output partial state: either because we're in a non-final mode,
+                // or because we're spilling and will merge/re-evaluate later.
+                output.extend(acc.state(emit_to)?)
             }
         }
+        drop(timer);
 
         // emit reduces the memory usage. Ignore Err from update_memory_reservation. Even if it is
         // over the target memory size after emission, we can emit again rather than returning Err.
         let _ = self.update_memory_reservation();
         let batch = RecordBatch::try_new(schema, output)?;
         debug_assert!(batch.num_rows() > 0);
-        Ok(Some(batch))
-    }
 
-    /// Optimistically, [`Self::group_aggregate_batch`] allows to exceed the memory target slightly
-    /// (~ 1 [`RecordBatch`]) for simplicity. In such cases, spill the data to disk and clear the
-    /// memory. Currently only [`GroupOrdering::None`] is supported for spilling.
-    fn spill_previous_if_necessary(&mut self, batch: &RecordBatch) -> Result<()> {
-        // TODO: support group_ordering for spilling
-        if !self.group_values.is_empty()
-            && batch.num_rows() > 0
-            && matches!(self.group_ordering, GroupOrdering::None)
-            && !self.spill_state.is_stream_merging
-            && self.update_memory_reservation().is_err()
-        {
-            assert_ne!(self.mode, AggregateMode::Partial);
-            self.spill()?;
-            self.clear_shrink(batch);
-        }
-        Ok(())
+        Ok(Some(batch))
     }
 
     /// Emit all intermediate aggregation states, sort them, and store them on disk.
@@ -1001,16 +1131,54 @@ impl GroupedHashAggregateStream {
         let Some(emit) = self.emit(EmitTo::All, true)? else {
             return Ok(());
         };
-        let sorted = sort_batch(&emit, self.spill_state.spill_expr.as_ref(), None)?;
 
-        // Spill sorted state to disk
-        let spillfile = self.spill_state.spill_manager.spill_record_batch_by_size(
-            &sorted,
-            "HashAggSpill",
+        // Free accumulated state now that data has been emitted into `emit`.
+        // This must happen before reserving sort memory so the pool has room.
+        // Use 0 to minimize allocated capacity and maximize memory available for sorting.
+        self.clear_shrink(0);
+        self.update_memory_reservation()?;
+
+        let batch_size_ratio = self.batch_size as f32 / emit.num_rows() as f32;
+        let batch_memory = get_record_batch_memory_size(&emit);
+        // The maximum worst case for a sort is 2X the original underlying buffers(regardless of slicing)
+        // First we get the underlying buffers' size, then we get the sliced("actual") size of the batch,
+        // and multiply it by the ratio of batch_size to actual size to get the estimated memory needed for sorting the batch.
+        // If something goes wrong in get_sliced_size()(double counting or something),
+        // we fall back to the worst case.
+        let sort_memory = (batch_memory
+            + (emit.get_sliced_size()? as f32 * batch_size_ratio) as usize)
+            .min(batch_memory * 2);
+
+        // If we can't grow even that, we have no choice but to return an error since we can't spill to disk without sorting the data first.
+        self.reservation.try_grow(sort_memory).map_err(|err| {
+            resources_datafusion_err!(
+                "Failed to reserve memory for sort during spill: {err}"
+            )
+        })?;
+
+        let sorted_iter = IncrementalSortIterator::new(
+            emit,
+            self.spill_state.spill_expr.clone(),
             self.batch_size,
-        )?;
+        );
+        let spillfile = self
+            .spill_state
+            .spill_manager
+            .spill_record_batch_iter_and_return_max_batch_memory(
+                sorted_iter,
+                "HashAggSpill",
+            )?;
+
+        // Shrink the memory we allocated for sorting as the sorting is fully done at this point.
+        self.reservation.shrink(sort_memory);
+
         match spillfile {
-            Some(spillfile) => self.spill_state.spills.push(spillfile),
+            Some((spillfile, max_record_batch_memory)) => {
+                self.spill_state.spills.push(SortedSpillFile {
+                    file: spillfile,
+                    max_record_batch_memory,
+                })
+            }
             None => {
                 return internal_err!(
                     "Calling spill with no intermediate batch to spill"
@@ -1021,72 +1189,16 @@ impl GroupedHashAggregateStream {
         Ok(())
     }
 
-    /// Clear memory and shirk capacities to the size of the batch.
-    fn clear_shrink(&mut self, batch: &RecordBatch) {
-        self.group_values.clear_shrink(batch);
+    /// Clear memory and shrink capacities to the given number of rows.
+    fn clear_shrink(&mut self, num_rows: usize) {
+        self.group_values.clear_shrink(num_rows);
         self.current_group_indices.clear();
-        self.current_group_indices.shrink_to(batch.num_rows());
+        self.current_group_indices.shrink_to(num_rows);
     }
 
-    /// Clear memory and shirk capacities to zero.
+    /// Clear memory and shrink capacities to zero.
     fn clear_all(&mut self) {
-        let s = self.schema();
-        self.clear_shrink(&RecordBatch::new_empty(s));
-    }
-
-    /// Emit if the used memory exceeds the target for partial aggregation.
-    /// Currently only [`GroupOrdering::None`] is supported for early emitting.
-    /// TODO: support group_ordering for early emitting
-    fn emit_early_if_necessary(&mut self) -> Result<()> {
-        if self.group_values.len() >= self.batch_size
-            && matches!(self.group_ordering, GroupOrdering::None)
-            && self.update_memory_reservation().is_err()
-        {
-            assert_eq!(self.mode, AggregateMode::Partial);
-            let n = self.group_values.len() / self.batch_size * self.batch_size;
-            if let Some(batch) = self.emit(EmitTo::First(n), false)? {
-                self.exec_state = ExecutionState::ProducingOutput(batch);
-            };
-        }
-        Ok(())
-    }
-
-    /// At this point, all the inputs are read and there are some spills.
-    /// Emit the remaining rows and create a batch.
-    /// Conduct a streaming merge sort between the batch and spilled data. Since the stream is fully
-    /// sorted, set `self.group_ordering` to Full, then later we can read with [`EmitTo::First`].
-    fn update_merged_stream(&mut self) -> Result<()> {
-        let Some(batch) = self.emit(EmitTo::All, true)? else {
-            return Ok(());
-        };
-        // clear up memory for streaming_merge
-        self.clear_all();
-        self.update_memory_reservation()?;
-        let mut streams: Vec<SendableRecordBatchStream> = vec![];
-        let expr = self.spill_state.spill_expr.clone();
-        let schema = batch.schema();
-        streams.push(Box::pin(RecordBatchStreamAdapter::new(
-            Arc::clone(&schema),
-            futures::stream::once(futures::future::lazy(move |_| {
-                sort_batch(&batch, expr.as_ref(), None)
-            })),
-        )));
-        for spill in self.spill_state.spills.drain(..) {
-            let stream = self.spill_state.spill_manager.read_spill_as_stream(spill)?;
-            streams.push(stream);
-        }
-        self.spill_state.is_stream_merging = true;
-        self.input = StreamingMergeBuilder::new()
-            .with_streams(streams)
-            .with_schema(schema)
-            .with_expressions(self.spill_state.spill_expr.as_ref())
-            .with_metrics(self.baseline_metrics.clone())
-            .with_batch_size(self.batch_size)
-            .with_reservation(self.reservation.new_empty())
-            .build()?;
-        self.input_done = false;
-        self.group_ordering = GroupOrdering::Full(GroupOrderingFull::new());
-        Ok(())
+        self.clear_shrink(0);
     }
 
     /// returns true if there is a soft groups limit and the number of distinct
@@ -1098,18 +1210,72 @@ impl GroupedHashAggregateStream {
         group_values_soft_limit <= self.group_values.len()
     }
 
-    /// common function for signalling end of processing of the input stream
+    /// Finalizes reading of the input stream and prepares for producing output values.
+    ///
+    /// This method is called both when the original input stream and,
+    /// in case of disk spilling, the SPM stream have been drained.
     fn set_input_done_and_produce_output(&mut self) -> Result<()> {
         self.input_done = true;
         self.group_ordering.input_done();
         let elapsed_compute = self.baseline_metrics.elapsed_compute().clone();
         let timer = elapsed_compute.timer();
         self.exec_state = if self.spill_state.spills.is_empty() {
+            // Input has been entirely processed without spilling to disk.
+
+            // Flush any remaining group values.
             let batch = self.emit(EmitTo::All, false)?;
+
+            // If there are none, we're done; otherwise switch to emitting them
             batch.map_or(ExecutionState::Done, ExecutionState::ProducingOutput)
         } else {
-            // If spill files exist, stream-merge them.
-            self.update_merged_stream()?;
+            // Spill any remaining data to disk. There is some performance overhead in
+            // writing out this last chunk of data and reading it back. The benefit of
+            // doing this is that memory usage for this stream is reduced, and the more
+            // sophisticated memory handling in `MultiLevelMergeBuilder` can take over
+            // instead.
+            // Spilling to disk and reading back also ensures batch size is consistent
+            // rather than potentially having one significantly larger last batch.
+            self.spill()?;
+
+            // Mark that we're switching to stream merging mode.
+            self.spill_state.is_stream_merging = true;
+
+            self.input = StreamingMergeBuilder::new()
+                .with_schema(Arc::clone(&self.spill_state.spill_schema))
+                .with_spill_manager(self.spill_state.spill_manager.clone())
+                .with_sorted_spill_files(std::mem::take(&mut self.spill_state.spills))
+                .with_expressions(&self.spill_state.spill_expr)
+                .with_metrics(self.baseline_metrics.clone())
+                .with_batch_size(self.batch_size)
+                .with_reservation(self.reservation.new_empty())
+                .build()?;
+            self.input_done = false;
+
+            // Reset the group values collectors.
+            self.clear_all();
+
+            // We can now use `GroupOrdering::Full` since the spill files are sorted
+            // on the grouping columns.
+            self.group_ordering = GroupOrdering::Full(GroupOrderingFull::new());
+
+            // Recreate `group_values` for streaming merge so group ids are assigned
+            // in first-seen order, as required by `GroupOrderingFull`.
+            // The pre-spill multi-column collector may use `vectorized_intern`, which
+            // can assign new group ids out of input order under hash collisions.
+            let group_schema = self
+                .spill_state
+                .merging_group_by
+                .group_schema(&self.spill_state.spill_schema)?;
+            if group_schema.fields().len() > 1 {
+                self.group_values = new_group_values(group_schema, &self.group_ordering)?;
+            }
+
+            // Use `OutOfMemoryMode::ReportError` from this point on
+            // to ensure we don't spill the spilled data to disk again.
+            self.oom_mode = OutOfMemoryMode::ReportError;
+
+            self.update_memory_reservation()?;
+
             ExecutionState::ReadingInput
         };
         timer.done();
@@ -1132,16 +1298,17 @@ impl GroupedHashAggregateStream {
     /// skipped, forces stream to produce currently accumulated output.
     ///
     /// Notice: It should only be called in Partial aggregation
-    fn switch_to_skip_aggregation(&mut self) -> Result<()> {
-        if let Some(probe) = self.skip_aggregation_probe.as_mut() {
-            if probe.should_skip() {
-                if let Some(batch) = self.emit(EmitTo::All, false)? {
-                    self.exec_state = ExecutionState::ProducingOutput(batch);
-                };
-            }
-        }
+    ///
+    /// Returns `Some(ExecutionState)` if the state should be changed, None otherwise.
+    fn switch_to_skip_aggregation(&mut self) -> Result<Option<ExecutionState>> {
+        if let Some(probe) = self.skip_aggregation_probe.as_mut()
+            && probe.should_skip()
+            && let Some(batch) = self.emit(EmitTo::All, false)?
+        {
+            return Ok(Some(ExecutionState::ProducingOutput(batch)));
+        };
 
-        Ok(())
+        Ok(None)
     }
 
     /// Returns true if the aggregation probe indicates that aggregation
@@ -1155,14 +1322,16 @@ impl GroupedHashAggregateStream {
     }
 
     /// Transforms input batch to intermediate aggregate state, without grouping it
-    fn transform_to_states(&self, batch: RecordBatch) -> Result<RecordBatch> {
-        let mut group_values = evaluate_group_by(&self.group_by, &batch)?;
-        let input_values = evaluate_many(&self.aggregate_arguments, &batch)?;
-        let filter_values = evaluate_optional(&self.filter_expressions, &batch)?;
-
-        if group_values.len() != 1 {
-            return internal_err!("group_values expected to have single element");
-        }
+    fn transform_to_states(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let mut group_values = evaluate_group_by(&self.group_by, batch)?;
+        let input_values = evaluate_many(&self.aggregate_arguments, batch)?;
+        let filter_values = evaluate_optional(&self.filter_expressions, batch)?;
+
+        assert_eq_or_internal_err!(
+            group_values.len(),
+            1,
+            "group_values expected to have single element"
+        );
         let mut output = group_values.swap_remove(0);
 
         let iter = self
@@ -1181,3 +1350,355 @@ impl GroupedHashAggregateStream {
         Ok(states_batch)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::InputOrderMode;
+    use crate::execution_plan::ExecutionPlan;
+    use crate::test::TestMemoryExec;
+    use arrow::array::{Int32Array, Int64Array};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_execution::TaskContext;
+    use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+    use datafusion_functions_aggregate::count::count_udaf;
+    use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion_physical_expr::expressions::col;
+    use std::sync::Arc;
+
+    #[tokio::test]
+    async fn test_double_emission_race_condition_bug() -> Result<()> {
+        // Fix for https://github.com/apache/datafusion/issues/18701
+        // This test specifically proves that we have fixed double emission race condition
+        // where emit_early_if_necessary() and switch_to_skip_aggregation()
+        // both emit in the same loop iteration, causing data loss
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("group_col", DataType::Int32, false),
+            Field::new("value_col", DataType::Int64, false),
+        ]));
+
+        // Create data that will trigger BOTH conditions in the same iteration:
+        // 1. More groups than batch_size (triggers early emission when memory pressure hits)
+        // 2. High cardinality ratio (triggers skip aggregation)
+        let batch_size = 1024; // We'll set this in session config
+        let num_groups = batch_size + 100; // Slightly more than batch_size (1124 groups)
+
+        // Create exactly 1 row per group = 100% cardinality ratio
+        let group_ids: Vec<i32> = (0..num_groups as i32).collect();
+        let values: Vec<i64> = vec![1; num_groups];
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(group_ids)),
+                Arc::new(Int64Array::from(values)),
+            ],
+        )?;
+
+        let input_partitions = vec![vec![batch]];
+
+        // Create constrained memory to trigger early emission but not completely fail
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(1024, 1.0) // small enough to start but will trigger pressure
+            .build_arc()?;
+
+        let mut task_ctx = TaskContext::default().with_runtime(runtime);
+
+        // Configure to trigger BOTH conditions:
+        // 1. Low probe threshold (triggers skip probe after few rows)
+        // 2. Low ratio threshold (triggers skip aggregation immediately)
+        // 3. Set batch_size to 1024 so our 1124 groups will trigger early emission
+        // This creates the race condition where both emit paths are triggered
+        let mut session_config = task_ctx.session_config().clone();
+        session_config = session_config.set(
+            "datafusion.execution.batch_size",
+            &datafusion_common::ScalarValue::UInt64(Some(1024)),
+        );
+        session_config = session_config.set(
+            "datafusion.execution.skip_partial_aggregation_probe_rows_threshold",
+            &datafusion_common::ScalarValue::UInt64(Some(50)),
+        );
+        session_config = session_config.set(
+            "datafusion.execution.skip_partial_aggregation_probe_ratio_threshold",
+            &datafusion_common::ScalarValue::Float64(Some(0.8)),
+        );
+        task_ctx = task_ctx.with_session_config(session_config);
+        let task_ctx = Arc::new(task_ctx);
+
+        // Create aggregate: COUNT(*) GROUP BY group_col
+        let group_expr = vec![(col("group_col", &schema)?, "group_col".to_string())];
+        let aggr_expr = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value_col", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("count_value")
+                .build()?,
+        )];
+
+        let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = Arc::new(TestMemoryExec::update_cache(&Arc::new(exec)));
+
+        // Use Partial mode where the race condition occurs
+        let aggregate_exec = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(group_expr),
+            aggr_expr,
+            vec![None],
+            exec,
+            Arc::clone(&schema),
+        )?;
+
+        // Execute and collect results
+        let mut stream =
+            GroupedHashAggregateStream::new(&aggregate_exec, &Arc::clone(&task_ctx), 0)?;
+        let mut results = Vec::new();
+
+        while let Some(result) = stream.next().await {
+            let batch = result?;
+            results.push(batch);
+        }
+
+        // Count total groups emitted
+        let mut total_output_groups = 0;
+        for batch in &results {
+            total_output_groups += batch.num_rows();
+        }
+
+        assert_eq!(
+            total_output_groups, num_groups,
+            "Unexpected number of groups",
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_skip_aggregation_probe_not_locked_until_skip() -> Result<()> {
+        // Test that the probe is not locked until we actually decide to skip.
+        // This allows us to continue evaluating the skip condition across multiple batches.
+        //
+        // Scenario:
+        // - Batch 1: Hits rows threshold but NOT ratio threshold (low cardinality) -> don't skip
+        // - Batch 2: Now hits ratio threshold (high cardinality) -> skip
+        //
+        // Without the fix, the probe would be locked after batch 1, preventing the skip
+        // decision from being made on batch 2.
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("group_col", DataType::Int32, false),
+            Field::new("value_col", DataType::Int32, false),
+        ]));
+
+        // Configure thresholds:
+        // - probe_rows_threshold: 100 rows
+        // - probe_ratio_threshold: 0.8 (80%)
+        let probe_rows_threshold = 100;
+        let probe_ratio_threshold = 0.8;
+
+        // Batch 1: 100 rows with only 10 unique groups
+        // Ratio: 10/100 = 0.1 (10%) < 0.8 -> should NOT skip
+        // This will hit the rows threshold but not the ratio threshold
+        let batch1_rows = 100;
+        let batch1_groups = 10;
+        let mut group_ids_batch1 = Vec::new();
+        for i in 0..batch1_rows {
+            group_ids_batch1.push((i % batch1_groups) as i32);
+        }
+        let values_batch1: Vec<i32> = vec![1; batch1_rows];
+
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(group_ids_batch1)),
+                Arc::new(Int32Array::from(values_batch1)),
+            ],
+        )?;
+
+        // Batch 2: 350 rows with 350 unique NEW groups (starting from group 10)
+        // After batch 2, total: 450 rows, 360 groups
+        // Ratio: 360/450 = 0.8 (80%) >= 0.8 -> SHOULD decide to skip
+        let batch2_rows = 350;
+        let batch2_groups = 350;
+        let group_ids_batch2: Vec<i32> = (batch1_groups..(batch1_groups + batch2_groups))
+            .map(|x| x as i32)
+            .collect();
+        let values_batch2: Vec<i32> = vec![1; batch2_rows];
+
+        let batch2 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(group_ids_batch2)),
+                Arc::new(Int32Array::from(values_batch2)),
+            ],
+        )?;
+
+        // Batch 3: This batch should be skipped since we decided to skip after batch 2
+        // 100 rows with 100 unique groups (continuing from where batch 2 left off)
+        let batch3_rows = 100;
+        let batch3_groups = 100;
+        let batch3_start_group = batch1_groups + batch2_groups;
+        let group_ids_batch3: Vec<i32> = (batch3_start_group
+            ..(batch3_start_group + batch3_groups))
+            .map(|x| x as i32)
+            .collect();
+        let values_batch3: Vec<i32> = vec![1; batch3_rows];
+
+        let batch3 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(group_ids_batch3)),
+                Arc::new(Int32Array::from(values_batch3)),
+            ],
+        )?;
+
+        let input_partitions = vec![vec![batch1, batch2, batch3]];
+
+        let runtime = RuntimeEnvBuilder::default().build_arc()?;
+        let mut task_ctx = TaskContext::default().with_runtime(runtime);
+
+        // Configure skip aggregation settings
+        let mut session_config = task_ctx.session_config().clone();
+        session_config = session_config.set(
+            "datafusion.execution.skip_partial_aggregation_probe_rows_threshold",
+            &datafusion_common::ScalarValue::UInt64(Some(probe_rows_threshold)),
+        );
+        session_config = session_config.set(
+            "datafusion.execution.skip_partial_aggregation_probe_ratio_threshold",
+            &datafusion_common::ScalarValue::Float64(Some(probe_ratio_threshold)),
+        );
+        task_ctx = task_ctx.with_session_config(session_config);
+        let task_ctx = Arc::new(task_ctx);
+
+        // Create aggregate: COUNT(*) GROUP BY group_col
+        let group_expr = vec![(col("group_col", &schema)?, "group_col".to_string())];
+        let aggr_expr = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value_col", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("count_value")
+                .build()?,
+        )];
+
+        let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = Arc::new(TestMemoryExec::update_cache(&Arc::new(exec)));
+
+        // Use Partial mode
+        let aggregate_exec = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(group_expr),
+            aggr_expr,
+            vec![None],
+            exec,
+            Arc::clone(&schema),
+        )?;
+
+        // Execute and collect results
+        let mut stream =
+            GroupedHashAggregateStream::new(&aggregate_exec, &Arc::clone(&task_ctx), 0)?;
+        let mut results = Vec::new();
+
+        while let Some(result) = stream.next().await {
+            let batch = result?;
+            results.push(batch);
+        }
+
+        // Check that skip aggregation actually happened
+        // The key metric is skipped_aggregation_rows
+        let metrics = aggregate_exec.metrics().unwrap();
+        let skipped_rows = metrics
+            .sum_by_name("skipped_aggregation_rows")
+            .map(|m| m.as_usize())
+            .unwrap_or(0);
+
+        // We expect batch 3's rows to be skipped (100 rows)
+        assert_eq!(
+            skipped_rows, batch3_rows,
+            "Expected batch 3's rows ({batch3_rows}) to be skipped",
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_emit_early_with_partially_sorted() -> Result<()> {
+        // Reproducer for #20445: EmitEarly with PartiallySorted panics in
+        // remove_groups because it emits more groups than the sort boundary.
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("sort_col", DataType::Int32, false),
+            Field::new("group_col", DataType::Int32, false),
+            Field::new("value_col", DataType::Int64, false),
+        ]));
+
+        // All rows share sort_col=1 (no sort boundary), with unique group_col
+        // values to create many groups and trigger memory pressure.
+        let n = 256;
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1; n])),
+                Arc::new(Int32Array::from((0..n as i32).collect::<Vec<_>>())),
+                Arc::new(Int64Array::from(vec![1; n])),
+            ],
+        )?;
+
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(4096, 1.0)
+            .build_arc()?;
+        let mut task_ctx = TaskContext::default().with_runtime(runtime);
+        let mut cfg = task_ctx.session_config().clone();
+        cfg = cfg.set(
+            "datafusion.execution.batch_size",
+            &datafusion_common::ScalarValue::UInt64(Some(128)),
+        );
+        cfg = cfg.set(
+            "datafusion.execution.skip_partial_aggregation_probe_rows_threshold",
+            &datafusion_common::ScalarValue::UInt64(Some(u64::MAX)),
+        );
+        task_ctx = task_ctx.with_session_config(cfg);
+        let task_ctx = Arc::new(task_ctx);
+
+        let ordering = LexOrdering::new(vec![PhysicalSortExpr::new_default(Arc::new(
+            Column::new("sort_col", 0),
+        )
+            as _)])
+        .unwrap();
+        let exec = TestMemoryExec::try_new(&[vec![batch]], Arc::clone(&schema), None)?
+            .try_with_sort_information(vec![ordering])?;
+        let exec = Arc::new(TestMemoryExec::update_cache(&Arc::new(exec)));
+
+        // GROUP BY sort_col, group_col with input sorted on sort_col
+        // gives PartiallySorted([0])
+        let aggregate_exec = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(vec![
+                (col("sort_col", &schema)?, "sort_col".to_string()),
+                (col("group_col", &schema)?, "group_col".to_string()),
+            ]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("value_col", &schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("count_value")
+                    .build()?,
+            )],
+            vec![None],
+            exec,
+            Arc::clone(&schema),
+        )?;
+        assert!(matches!(
+            aggregate_exec.input_order_mode(),
+            InputOrderMode::PartiallySorted(_)
+        ));
+
+        // Must not panic with "assertion failed: *current_sort >= n"
+        let mut stream = GroupedHashAggregateStream::new(&aggregate_exec, &task_ctx, 0)?;
+        while let Some(result) = stream.next().await {
+            if let Err(e) = result {
+                if e.to_string().contains("Resources exhausted") {
+                    break;
+                }
+                return Err(e);
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs
index 47052fd525115..694780f08547f 100644
--- a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs
+++ b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs
@@ -15,22 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! A wrapper around `hashbrown::RawTable` that allows entries to be tracked by index
+//! A wrapper around `hashbrown::HashTable` that allows entries to be tracked by index
 
 use crate::aggregates::group_values::HashValue;
 use crate::aggregates::topk::heap::Comparable;
-use ahash::RandomState;
 use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
 use arrow::array::{
-    builder::PrimitiveBuilder, cast::AsArray, downcast_primitive, Array, ArrayRef,
-    ArrowPrimitiveType, LargeStringArray, PrimitiveArray, StringArray, StringViewArray,
+    Array, ArrayRef, ArrowPrimitiveType, LargeStringArray, PrimitiveArray, StringArray,
+    StringViewArray, builder::PrimitiveBuilder, cast::AsArray, downcast_primitive,
 };
-use arrow::datatypes::{i256, DataType};
-use datafusion_common::DataFusionError;
+use arrow::datatypes::{DataType, i256};
 use datafusion_common::Result;
+use datafusion_common::exec_datafusion_err;
+use datafusion_common::hash_utils::RandomState;
 use half::f16;
-use hashbrown::raw::RawTable;
+use hashbrown::hash_table::HashTable;
 use std::fmt::Debug;
+use std::hash::BuildHasher;
 use std::sync::Arc;
 
 /// A "type alias" for Keys which are stored in our map
@@ -48,13 +49,17 @@ pub struct HashTableItem<ID: KeyType> {
     pub heap_idx: usize,
 }
 
-/// A custom wrapper around `hashbrown::RawTable` that:
+/// A custom wrapper around `hashbrown::HashTable` that:
 /// 1. limits the number of entries to the top K
 /// 2. Allocates a capacity greater than top K to maintain a low-fill factor and prevent resizing
 /// 3. Tracks indexes to allow corresponding heap to refer to entries by index vs hash
-/// 4. Catches resize events to allow the corresponding heap to update it's indexes
 struct TopKHashTable<ID: KeyType> {
-    map: RawTable<HashTableItem<ID>>,
+    map: HashTable<usize>,
+    // Store the actual items separately to allow for index-based access
+    store: Vec<Option<HashTableItem<ID>>>,
+    // Free index in the store for reuse
+    free_index: Option<usize>,
+    // The maximum number of entries allowed
     limit: usize,
 }
 
@@ -62,25 +67,23 @@ struct TopKHashTable<ID: KeyType> {
 pub trait ArrowHashTable {
     fn set_batch(&mut self, ids: ArrayRef);
     fn len(&self) -> usize;
-    // JUSTIFICATION
-    //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-    //  Soundness: the caller must provide valid indexes
-    unsafe fn update_heap_idx(&mut self, mapper: &[(usize, usize)]);
-    // JUSTIFICATION
-    //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-    //  Soundness: the caller must provide a valid index
-    unsafe fn heap_idx_at(&self, map_idx: usize) -> usize;
-    unsafe fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef;
-
-    // JUSTIFICATION
-    //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-    //  Soundness: the caller must provide valid indexes
-    unsafe fn find_or_insert(
-        &mut self,
-        row_idx: usize,
-        replace_idx: usize,
-        map: &mut Vec<(usize, usize)>,
-    ) -> (usize, bool);
+    fn update_heap_idx(&mut self, mapper: &[(usize, usize)]);
+    fn heap_idx_at(&self, map_idx: usize) -> usize;
+    fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef;
+    fn find_or_insert(&mut self, row_idx: usize, replace_idx: usize) -> (usize, bool);
+}
+
+/// Returns true if the given data type can be used as a top-K aggregation hash key.
+///
+/// Supported types include Arrow primitives (integers, floats, decimals, intervals)
+/// and UTF-8 strings (`Utf8`, `LargeUtf8`, `Utf8View`). This is used internally by
+/// `PriorityMap::supports()` to validate grouping key type compatibility.
+pub fn is_supported_hash_key_type(kt: &DataType) -> bool {
+    kt.is_primitive()
+        || matches!(
+            kt,
+            DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
+        )
 }
 
 // An implementation of ArrowHashTable for String keys
@@ -99,6 +102,7 @@ where
     owned: ArrayRef,
     map: TopKHashTable<Option<VAL::Native>>,
     rnd: RandomState,
+    kt: DataType,
 }
 
 impl StringHashTable {
@@ -118,6 +122,34 @@ impl StringHashTable {
             data_type,
         }
     }
+
+    /// Extracts the string value at the given row index, handling nulls and different string types.
+    ///
+    /// Returns `None` if the value is null, otherwise `Some(value.to_string())`.
+    fn extract_string_value(&self, row_idx: usize) -> Option<String> {
+        let is_null_and_value = match self.data_type {
+            DataType::Utf8 => {
+                let arr = self.owned.as_string::<i32>();
+                (arr.is_null(row_idx), arr.value(row_idx))
+            }
+            DataType::LargeUtf8 => {
+                let arr = self.owned.as_string::<i64>();
+                (arr.is_null(row_idx), arr.value(row_idx))
+            }
+            DataType::Utf8View => {
+                let arr = self.owned.as_string_view();
+                (arr.is_null(row_idx), arr.value(row_idx))
+            }
+            _ => panic!("Unsupported data type"),
+        };
+
+        let (is_null, value) = is_null_and_value;
+        if is_null {
+            None
+        } else {
+            Some(value.to_string())
+        }
+    }
 }
 
 impl ArrowHashTable for StringHashTable {
@@ -129,15 +161,15 @@ impl ArrowHashTable for StringHashTable {
         self.map.len()
     }
 
-    unsafe fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
+    fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
         self.map.update_heap_idx(mapper);
     }
 
-    unsafe fn heap_idx_at(&self, map_idx: usize) -> usize {
+    fn heap_idx_at(&self, map_idx: usize) -> usize {
         self.map.heap_idx_at(map_idx)
     }
 
-    unsafe fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef {
+    fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef {
         let ids = self.map.take_all(indexes);
         match self.data_type {
             DataType::Utf8 => Arc::new(StringArray::from(ids)),
@@ -147,67 +179,16 @@ impl ArrowHashTable for StringHashTable {
         }
     }
 
-    unsafe fn find_or_insert(
-        &mut self,
-        row_idx: usize,
-        replace_idx: usize,
-        mapper: &mut Vec<(usize, usize)>,
-    ) -> (usize, bool) {
-        let id = match self.data_type {
-            DataType::Utf8 => {
-                let ids = self
-                    .owned
-                    .as_any()
-                    .downcast_ref::<StringArray>()
-                    .expect("Expected StringArray for DataType::Utf8");
-                if ids.is_null(row_idx) {
-                    None
-                } else {
-                    Some(ids.value(row_idx))
-                }
-            }
-            DataType::LargeUtf8 => {
-                let ids = self
-                    .owned
-                    .as_any()
-                    .downcast_ref::<LargeStringArray>()
-                    .expect("Expected LargeStringArray for DataType::LargeUtf8");
-                if ids.is_null(row_idx) {
-                    None
-                } else {
-                    Some(ids.value(row_idx))
-                }
-            }
-            DataType::Utf8View => {
-                let ids = self
-                    .owned
-                    .as_any()
-                    .downcast_ref::<StringViewArray>()
-                    .expect("Expected StringViewArray for DataType::Utf8View");
-                if ids.is_null(row_idx) {
-                    None
-                } else {
-                    Some(ids.value(row_idx))
-                }
-            }
-            _ => panic!("Unsupported data type"),
-        };
-
-        let hash = self.rnd.hash_one(id);
-        if let Some(map_idx) = self
-            .map
-            .find(hash, |mi| id == mi.as_ref().map(|id| id.as_str()))
-        {
-            return (map_idx, false);
-        }
+    fn find_or_insert(&mut self, row_idx: usize, replace_idx: usize) -> (usize, bool) {
+        let id = self.extract_string_value(row_idx);
 
-        // we're full and this is a better value, so remove the worst
-        let heap_idx = self.map.remove_if_full(replace_idx);
+        // Compute hash and create equality closure for hash table lookup.
+        let hash = self.rnd.hash_one(id.as_deref());
+        let id_for_eq = id.clone();
+        let eq = move |mi: &Option<String>| id_for_eq.as_deref() == mi.as_deref();
 
-        // add the new group
-        let id = id.map(|id| id.to_string());
-        let map_idx = self.map.insert(hash, id, heap_idx, mapper);
-        (map_idx, true)
+        // Use entry API to avoid double lookup
+        self.map.find_or_insert(hash, id, replace_idx, eq)
     }
 }
 
@@ -216,12 +197,17 @@ where
     Option<<VAL as ArrowPrimitiveType>::Native>: Comparable,
     Option<<VAL as ArrowPrimitiveType>::Native>: HashValue,
 {
-    pub fn new(limit: usize) -> Self {
-        let owned = Arc::new(PrimitiveArray::<VAL>::builder(0).finish());
+    pub fn new(limit: usize, kt: DataType) -> Self {
+        let owned = Arc::new(
+            PrimitiveArray::<VAL>::builder(0)
+                .with_data_type(kt.clone())
+                .finish(),
+        );
         Self {
             owned,
             map: TopKHashTable::new(limit, limit * 10),
             rnd: RandomState::default(),
+            kt,
         }
     }
 }
@@ -239,17 +225,18 @@ where
         self.map.len()
     }
 
-    unsafe fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
+    fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
         self.map.update_heap_idx(mapper);
     }
 
-    unsafe fn heap_idx_at(&self, map_idx: usize) -> usize {
+    fn heap_idx_at(&self, map_idx: usize) -> usize {
         self.map.heap_idx_at(map_idx)
     }
 
-    unsafe fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef {
+    fn take_all(&mut self, indexes: Vec<usize>) -> ArrayRef {
         let ids = self.map.take_all(indexes);
-        let mut builder: PrimitiveBuilder<VAL> = PrimitiveArray::builder(ids.len());
+        let mut builder: PrimitiveBuilder<VAL> =
+            PrimitiveArray::builder(ids.len()).with_data_type(self.kt.clone());
         for id in ids.into_iter() {
             match id {
                 None => builder.append_null(),
@@ -260,112 +247,117 @@ where
         Arc::new(ids)
     }
 
-    unsafe fn find_or_insert(
-        &mut self,
-        row_idx: usize,
-        replace_idx: usize,
-        mapper: &mut Vec<(usize, usize)>,
-    ) -> (usize, bool) {
+    fn find_or_insert(&mut self, row_idx: usize, replace_idx: usize) -> (usize, bool) {
         let ids = self.owned.as_primitive::<VAL>();
         let id: Option<VAL::Native> = if ids.is_null(row_idx) {
             None
         } else {
             Some(ids.value(row_idx))
         };
-
+        // Compute hash and create equality closure for hash table lookup.
         let hash: u64 = id.hash(&self.rnd);
-        if let Some(map_idx) = self.map.find(hash, |mi| id == *mi) {
-            return (map_idx, false);
-        }
+        let eq = |mi: &Option<VAL::Native>| id == *mi;
 
-        // we're full and this is a better value, so remove the worst
-        let heap_idx = self.map.remove_if_full(replace_idx);
-
-        // add the new group
-        let map_idx = self.map.insert(hash, id, heap_idx, mapper);
-        (map_idx, true)
+        // Use entry API to avoid double lookup
+        self.map.find_or_insert(hash, id, replace_idx, eq)
     }
 }
 
-impl<ID: KeyType> TopKHashTable<ID> {
+use hashbrown::hash_table::Entry;
+impl<ID: KeyType + PartialEq> TopKHashTable<ID> {
     pub fn new(limit: usize, capacity: usize) -> Self {
         Self {
-            map: RawTable::with_capacity(capacity),
+            map: HashTable::with_capacity(capacity),
+            store: Vec::with_capacity(capacity),
+            free_index: None,
             limit,
         }
     }
 
-    pub fn find(&self, hash: u64, mut eq: impl FnMut(&ID) -> bool) -> Option<usize> {
-        let bucket = self.map.find(hash, |mi| eq(&mi.id))?;
-        // JUSTIFICATION
-        //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-        //  Soundness: getting the index of a bucket we just found
-        let idx = unsafe { self.map.bucket_index(&bucket) };
-        Some(idx)
-    }
-
-    pub unsafe fn heap_idx_at(&self, map_idx: usize) -> usize {
-        let bucket = unsafe { self.map.bucket(map_idx) };
-        bucket.as_ref().heap_idx
+    pub fn heap_idx_at(&self, map_idx: usize) -> usize {
+        self.store[map_idx].as_ref().unwrap().heap_idx
     }
 
-    pub unsafe fn remove_if_full(&mut self, replace_idx: usize) -> usize {
+    pub fn remove_if_full(&mut self, replace_idx: usize) -> usize {
         if self.map.len() >= self.limit {
-            self.map.erase(self.map.bucket(replace_idx));
+            let item_to_remove = self.store[replace_idx].as_ref().unwrap();
+            let hash = item_to_remove.hash;
+            let id_to_remove = &item_to_remove.id;
+
+            let eq = |&idx: &usize| self.store[idx].as_ref().unwrap().id == *id_to_remove;
+            let hasher = |idx: &usize| self.store[*idx].as_ref().unwrap().hash;
+            match self.map.entry(hash, eq, hasher) {
+                Entry::Occupied(entry) => {
+                    let (removed_idx, _) = entry.remove();
+                    self.store[removed_idx] = None;
+                    self.free_index = Some(removed_idx);
+                }
+                Entry::Vacant(_) => unreachable!(),
+            }
             0 // if full, always replace top node
         } else {
             self.map.len() // if we're not full, always append to end
         }
     }
 
-    unsafe fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
+    fn update_heap_idx(&mut self, mapper: &[(usize, usize)]) {
         for (m, h) in mapper {
-            self.map.bucket(*m).as_mut().heap_idx = *h
+            self.store[*m].as_mut().unwrap().heap_idx = *h;
         }
     }
 
-    pub fn insert(
+    /// Find an existing entry or insert a new one, avoiding double hash table lookup.
+    /// Returns (map_idx, is_new) where is_new indicates if this was a new insertion.
+    /// If inserting a new entry and the table is full, replaces the entry at replace_idx.
+    pub fn find_or_insert(
         &mut self,
         hash: u64,
         id: ID,
-        heap_idx: usize,
-        mapper: &mut Vec<(usize, usize)>,
-    ) -> usize {
-        let mi = HashTableItem::new(hash, id, heap_idx);
-        let bucket = self.map.try_insert_no_grow(hash, mi);
-        let bucket = match bucket {
-            Ok(bucket) => bucket,
-            Err(new_item) => {
-                let bucket = self.map.insert(hash, new_item, |mi| mi.hash);
-                // JUSTIFICATION
-                //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-                //  Soundness: we're getting indexes of buckets, not dereferencing them
-                unsafe {
-                    for bucket in self.map.iter() {
-                        let heap_idx = bucket.as_ref().heap_idx;
-                        let map_idx = self.map.bucket_index(&bucket);
-                        mapper.push((heap_idx, map_idx));
-                    }
-                }
-                bucket
+        replace_idx: usize,
+        mut eq: impl FnMut(&ID) -> bool,
+    ) -> (usize, bool) {
+        // Check if entry exists - this is the only hash table lookup
+        {
+            let eq_fn = |idx: &usize| eq(&self.store[*idx].as_ref().unwrap().id);
+            if let Some(&map_idx) = self.map.find(hash, eq_fn) {
+                return (map_idx, false);
             }
+        }
+
+        // Entry doesn't exist - compute heap_idx and prepare item
+        let heap_idx = self.remove_if_full(replace_idx);
+        let mi = HashTableItem::new(hash, id, heap_idx);
+        let store_idx = if let Some(idx) = self.free_index.take() {
+            self.store[idx] = Some(mi);
+            idx
+        } else {
+            self.store.push(Some(mi));
+            self.store.len() - 1
         };
-        // JUSTIFICATION
-        //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-        //  Soundness: we're getting indexes of buckets, not dereferencing them
-        unsafe { self.map.bucket_index(&bucket) }
+
+        // Reserve space if needed
+        let hasher = |idx: &usize| self.store[*idx].as_ref().unwrap().hash;
+        if self.map.len() == self.map.capacity() {
+            self.map.reserve(self.limit, hasher);
+        }
+
+        // Insert without checking again since we already confirmed it doesn't exist
+        self.map.insert_unique(hash, store_idx, hasher);
+        (store_idx, true)
     }
 
     pub fn len(&self) -> usize {
         self.map.len()
     }
 
-    pub unsafe fn take_all(&mut self, idxs: Vec<usize>) -> Vec<ID> {
+    pub fn take_all(&mut self, idxs: Vec<usize>) -> Vec<ID> {
         let ids = idxs
             .into_iter()
-            .map(|idx| self.map.bucket(idx).as_ref().id.clone())
+            .map(|idx| self.store[idx].take().unwrap().id)
             .collect();
         self.map.clear();
+        self.store.clear();
+        self.free_index = None;
         ids
     }
 }
@@ -413,7 +405,7 @@ pub fn new_hash_table(
 ) -> Result<Box<dyn ArrowHashTable + Send>> {
     macro_rules! downcast_helper {
         ($kt:ty, $d:ident) => {
-            return Ok(Box::new(PrimitiveHashTable::<$kt>::new(limit)))
+            return Ok(Box::new(PrimitiveHashTable::<$kt>::new(limit, kt)))
         };
     }
 
@@ -425,41 +417,58 @@ pub fn new_hash_table(
         _ => {}
     }
 
-    Err(DataFusionError::Execution(format!(
+    Err(exec_datafusion_err!(
         "Can't create HashTable for type: {kt:?}"
-    )))
+    ))
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use arrow::array::TimestampMillisecondArray;
+    use arrow_schema::TimeUnit;
     use std::collections::BTreeMap;
 
+    #[test]
+    fn should_emit_correct_type() -> Result<()> {
+        let ids =
+            TimestampMillisecondArray::from(vec![1000]).with_timezone("UTC".to_string());
+        let dt = DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into()));
+        let mut ht = new_hash_table(1, dt.clone())?;
+        ht.set_batch(Arc::new(ids));
+        ht.find_or_insert(0, 0);
+        let ids = ht.take_all(vec![0]);
+        assert_eq!(ids.data_type(), &dt);
+
+        Ok(())
+    }
+
     #[test]
     fn should_resize_properly() -> Result<()> {
         let mut heap_to_map = BTreeMap::<usize, usize>::new();
+        // Create TopKHashTable with limit=5 and capacity=3 to force resizing
         let mut map = TopKHashTable::<Option<String>>::new(5, 3);
-        for (heap_idx, id) in vec!["1", "2", "3", "4", "5"].into_iter().enumerate() {
-            let mut mapper = vec![];
+
+        // Insert 5 entries, tracking the heap-to-map index mapping
+        for (heap_idx, id) in ["1", "2", "3", "4", "5"].iter().enumerate() {
+            let value = Some(id.to_string());
             let hash = heap_idx as u64;
-            let map_idx = map.insert(hash, Some(id.to_string()), heap_idx, &mut mapper);
-            let _ = heap_to_map.insert(heap_idx, map_idx);
-            if heap_idx == 3 {
-                assert_eq!(
-                    mapper,
-                    vec![(0, 0), (1, 1), (2, 2), (3, 3)],
-                    "Pass {heap_idx} resized incorrectly!"
-                );
-                for (heap_idx, map_idx) in mapper {
-                    let _ = heap_to_map.insert(heap_idx, map_idx);
-                }
-            } else {
-                assert_eq!(mapper, vec![], "Pass {heap_idx} should not have resized!");
-            }
+            let (map_idx, is_new) =
+                map.find_or_insert(hash, value.clone(), heap_idx, |v| *v == value);
+            assert!(is_new, "Entry should be new");
+            heap_to_map.insert(heap_idx, map_idx);
         }
 
+        // Verify all 5 entries are present
+        assert_eq!(map.len(), 5);
+
+        // Verify that the hash table resized properly (capacity should have grown beyond 3)
+        // This is implicit - if it didn't resize, insertions would have failed or been slow
+
+        // Drain all values in heap order
         let (_heap_idxs, map_idxs): (Vec<_>, Vec<_>) = heap_to_map.into_iter().unzip();
-        let ids = unsafe { map.take_all(map_idxs) };
+        let ids = map.take_all(map_idxs);
+
         assert_eq!(
             format!("{ids:?}"),
             r#"[Some("1"), Some("2"), Some("3"), Some("4"), Some("5")]"#
diff --git a/datafusion/physical-plan/src/aggregates/topk/heap.rs b/datafusion/physical-plan/src/aggregates/topk/heap.rs
index ce47504daf039..889fe04bf830a 100644
--- a/datafusion/physical-plan/src/aggregates/topk/heap.rs
+++ b/datafusion/physical-plan/src/aggregates/topk/heap.rs
@@ -15,17 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! A custom binary heap implementation for performant top K aggregation
-
+//! A custom binary heap implementation for performant top K aggregation.
+//!
+//! the `new_heap` //! factory function selects an appropriate heap implementation
+//! based on the Arrow data type.
+//!
+//! Supported value types include Arrow primitives (integers, floats, decimals, intervals)
+//! and UTF-8 strings (`Utf8`, `LargeUtf8`, `Utf8View`) using lexicographic ordering.
+
+use arrow::array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray, downcast_primitive};
+use arrow::array::{LargeStringBuilder, StringBuilder, StringViewBuilder};
 use arrow::array::{
+    StringArray,
     cast::AsArray,
     types::{IntervalDayTime, IntervalMonthDayNano},
 };
-use arrow::array::{downcast_primitive, ArrayRef, ArrowPrimitiveType, PrimitiveArray};
 use arrow::buffer::ScalarBuffer;
-use arrow::datatypes::{i256, DataType};
-use datafusion_common::DataFusionError;
+use arrow::datatypes::{DataType, i256};
 use datafusion_common::Result;
+use datafusion_common::exec_datafusion_err;
 
 use half::f16;
 use std::cmp::Ordering;
@@ -72,7 +80,6 @@ pub trait ArrowHeap {
     fn set_batch(&mut self, vals: ArrayRef);
     fn is_worse(&self, idx: usize) -> bool;
     fn worst_map_idx(&self) -> usize;
-    fn renumber(&mut self, heap_to_map: &[(usize, usize)]);
     fn insert(&mut self, row_idx: usize, map_idx: usize, map: &mut Vec<(usize, usize)>);
     fn replace_if_better(
         &mut self,
@@ -131,10 +138,6 @@ where
         self.heap.worst_map_idx()
     }
 
-    fn renumber(&mut self, heap_to_map: &[(usize, usize)]) {
-        self.heap.renumber(heap_to_map);
-    }
-
     fn insert(&mut self, row_idx: usize, map_idx: usize, map: &mut Vec<(usize, usize)>) {
         let vals = self.batch.as_primitive::<VAL>();
         let new_val = vals.value(row_idx);
@@ -161,6 +164,164 @@ where
     }
 }
 
+/// An implementation of `ArrowHeap` that deals with string values.
+///
+/// Supports all three UTF-8 string types: `Utf8`, `LargeUtf8`, and `Utf8View`.
+/// String values are compared lexicographically using the compare-first pattern:
+/// borrowed strings are compared before allocation, and only allocated when the
+/// heap confirms they improve the top-K set.
+///
+pub struct StringHeap {
+    batch: ArrayRef,
+    heap: TopKHeap<Option<String>>,
+    desc: bool,
+    data_type: DataType,
+}
+
+impl StringHeap {
+    pub fn new(limit: usize, desc: bool, data_type: DataType) -> Self {
+        let batch: ArrayRef = Arc::new(StringArray::from(Vec::<&str>::new()));
+        Self {
+            batch,
+            heap: TopKHeap::new(limit, desc),
+            desc,
+            data_type,
+        }
+    }
+
+    /// Extracts a string value from the current batch at the given row index.
+    ///
+    /// Panics if the row index is out of bounds or if the data type is not one of
+    /// the supported UTF-8 string types.
+    ///
+    /// Note: Null values should not appear in the input; the aggregation layer
+    /// ensures nulls are filtered before reaching this code.
+    fn value(&self, row_idx: usize) -> &str {
+        extract_string_value(&self.batch, &self.data_type, row_idx)
+    }
+}
+
+/// Helper to extract a string value from an ArrayRef at a given index.
+///
+/// Supports `Utf8`, `LargeUtf8`, and `Utf8View` data types.
+///
+/// # Panics
+/// Panics if the index is out of bounds or if the data type is unsupported.
+fn extract_string_value<'a>(
+    batch: &'a ArrayRef,
+    data_type: &DataType,
+    idx: usize,
+) -> &'a str {
+    match data_type {
+        DataType::Utf8 => batch.as_string::<i32>().value(idx),
+        DataType::LargeUtf8 => batch.as_string::<i64>().value(idx),
+        DataType::Utf8View => batch.as_string_view().value(idx),
+        _ => unreachable!("Unsupported string type: {data_type}"),
+    }
+}
+
+impl ArrowHeap for StringHeap {
+    fn set_batch(&mut self, vals: ArrayRef) {
+        self.batch = vals;
+    }
+
+    fn is_worse(&self, row_idx: usize) -> bool {
+        if !self.heap.is_full() {
+            return false;
+        }
+        // Compare borrowed `&str` against the worst heap value first to avoid
+        // allocating a `String` unless this row would actually replace an
+        // existing heap entry.
+        let new_val = self.value(row_idx);
+        let worst_val = self.heap.worst_val().expect("Missing root");
+        match worst_val {
+            None => false,
+            Some(worst_str) => {
+                (!self.desc && new_val > worst_str.as_str())
+                    || (self.desc && new_val < worst_str.as_str())
+            }
+        }
+    }
+
+    fn worst_map_idx(&self) -> usize {
+        self.heap.worst_map_idx()
+    }
+
+    fn insert(&mut self, row_idx: usize, map_idx: usize, map: &mut Vec<(usize, usize)>) {
+        // When appending (heap not full) we must allocate to own the string
+        // because it will be stored in the heap. For replacements we avoid
+        // allocation until `replace_if_better` confirms a replacement is
+        // necessary.
+        let new_str = self.value(row_idx).to_string();
+        let new_val = Some(new_str);
+        self.heap.append_or_replace(new_val, map_idx, map);
+    }
+
+    fn replace_if_better(
+        &mut self,
+        heap_idx: usize,
+        row_idx: usize,
+        map: &mut Vec<(usize, usize)>,
+    ) {
+        let new_str = self.value(row_idx);
+        let existing = self.heap.heap[heap_idx]
+            .as_ref()
+            .expect("Missing heap item");
+
+        // Compare borrowed reference first—no allocation yet.
+        // We compare the borrowed `&str` with the stored `Option<String>` and
+        // only allocate (`to_string()`) when a replacement is required.
+        match &existing.val {
+            None => {
+                // Existing is null; new value always wins
+                let new_val = Some(new_str.to_string());
+                self.heap.replace_if_better(heap_idx, new_val, map);
+            }
+            Some(existing_str) => {
+                // Compare borrowed strings first
+                if (!self.desc && new_str < existing_str.as_str())
+                    || (self.desc && new_str > existing_str.as_str())
+                {
+                    let new_val = Some(new_str.to_string());
+                    self.heap.replace_if_better(heap_idx, new_val, map);
+                }
+                // Else: no improvement, no allocation
+            }
+        }
+    }
+
+    fn drain(&mut self) -> (ArrayRef, Vec<usize>) {
+        let (vals, map_idxs) = self.heap.drain();
+        // Use Arrow builders to safely construct arrays from the owned
+        // `Option<String>` values. Builders avoid needing to maintain
+        // references to temporary storage.
+
+        // Macro to eliminate duplication across string builder types.
+        // All three builders share the same interface for append_value,
+        // append_null, and finish, differing only in their concrete types.
+        macro_rules! build_string_array {
+            ($builder_type:ty) => {{
+                let mut builder = <$builder_type>::new();
+                for val in vals {
+                    match val {
+                        Some(s) => builder.append_value(&s),
+                        None => builder.append_null(),
+                    }
+                }
+                Arc::new(builder.finish())
+            }};
+        }
+
+        let arr: ArrayRef = match self.data_type {
+            DataType::Utf8 => build_string_array!(StringBuilder),
+            DataType::LargeUtf8 => build_string_array!(LargeStringBuilder),
+            DataType::Utf8View => build_string_array!(StringViewBuilder),
+            _ => unreachable!("Unsupported string type: {}", self.data_type),
+        };
+        (arr, map_idxs)
+    }
+}
+
 impl<VAL: ValueType> TopKHeap<VAL> {
     pub fn new(limit: usize, desc: bool) -> Self {
         Self {
@@ -268,14 +429,6 @@ impl<VAL: ValueType> TopKHeap<VAL> {
         self.heapify_down(heap_idx, mapper);
     }
 
-    pub fn renumber(&mut self, heap_to_map: &[(usize, usize)]) {
-        for (heap_idx, map_idx) in heap_to_map.iter() {
-            if let Some(Some(hi)) = self.heap.get_mut(*heap_idx) {
-                hi.map_idx = *map_idx;
-            }
-        }
-    }
-
     fn heapify_up(&mut self, mut idx: usize, mapper: &mut Vec<(usize, usize)>) {
         let desc = self.desc;
         while idx != 0 {
@@ -311,13 +464,12 @@ impl<VAL: ValueType> TopKHeap<VAL> {
         let mut best_idx = node_idx;
         let mut best_val = &entry.val;
         for child_idx in left_child..=left_child + 1 {
-            if let Some(Some(child)) = self.heap.get(child_idx) {
-                if (!desc && child.val.comp(best_val) == Ordering::Greater)
-                    || (desc && child.val.comp(best_val) == Ordering::Less)
-                {
-                    best_val = &child.val;
-                    best_idx = child_idx;
-                }
+            if let Some(Some(child)) = self.heap.get(child_idx)
+                && ((!desc && child.val.comp(best_val) == Ordering::Greater)
+                    || (desc && child.val.comp(best_val) == Ordering::Less))
+            {
+                best_val = &child.val;
+                best_idx = child_idx;
             }
         }
         if best_val.comp(&entry.val) != Ordering::Equal {
@@ -326,20 +478,10 @@ impl<VAL: ValueType> TopKHeap<VAL> {
         }
     }
 
-    fn _tree_print(
-        &self,
-        idx: usize,
-        prefix: String,
-        is_tail: bool,
-        output: &mut String,
-    ) {
+    fn _tree_print(&self, idx: usize, prefix: &str, is_tail: bool, output: &mut String) {
         if let Some(Some(hi)) = self.heap.get(idx) {
             let connector = if idx != 0 {
-                if is_tail {
-                    "└── "
-                } else {
-                    "├── "
-                }
+                if is_tail { "└── " } else { "├── " }
             } else {
                 ""
             };
@@ -357,10 +499,10 @@ impl<VAL: ValueType> TopKHeap<VAL> {
             let right_exists = right_idx < self.len;
 
             if left_exists {
-                self._tree_print(left_idx, child_prefix.clone(), !right_exists, output);
+                self._tree_print(left_idx, &child_prefix, !right_exists, output);
             }
             if right_exists {
-                self._tree_print(right_idx, child_prefix, true, output);
+                self._tree_print(right_idx, &child_prefix, true, output);
             }
         }
     }
@@ -370,7 +512,7 @@ impl<VAL: ValueType> Display for TopKHeap<VAL> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         let mut output = String::new();
         if !self.heap.is_empty() {
-            self._tree_print(0, String::new(), true, &mut output);
+            self._tree_print(0, "", true, &mut output);
         }
         write!(f, "{output}")
     }
@@ -462,11 +604,31 @@ compare_integer!(u8, u16, u32, u64);
 compare_integer!(IntervalDayTime, IntervalMonthDayNano);
 compare_float!(f16, f32, f64);
 
+/// Returns true if the given data type can be stored in a top-K aggregation heap.
+///
+/// Supported types include Arrow primitives (integers, floats, decimals, intervals)
+/// and UTF-8 strings (`Utf8`, `LargeUtf8`, `Utf8View`). This is used internally by
+/// `PriorityMap::supports()` to validate aggregate value type compatibility.
+pub fn is_supported_heap_type(vt: &DataType) -> bool {
+    vt.is_primitive()
+        || matches!(
+            vt,
+            DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
+        )
+}
+
 pub fn new_heap(
     limit: usize,
     desc: bool,
     vt: DataType,
 ) -> Result<Box<dyn ArrowHeap + Send>> {
+    if matches!(
+        vt,
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
+    ) {
+        return Ok(Box::new(StringHeap::new(limit, desc, vt)));
+    }
+
     macro_rules! downcast_helper {
         ($vt:ty, $d:ident) => {
             return Ok(Box::new(PrimitiveHeap::<$vt>::new(limit, desc, vt)))
@@ -478,9 +640,9 @@ pub fn new_heap(
         _ => {}
     }
 
-    Err(DataFusionError::Execution(format!(
-        "Can't group type: {vt:?}"
-    )))
+    Err(exec_datafusion_err!(
+        "Unsupported TopK aggregate value type: {vt:?}"
+    ))
 }
 
 #[cfg(test)]
@@ -496,9 +658,7 @@ mod tests {
         heap.append_or_replace(1, 1, &mut map);
 
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=1 idx=0, bucket=1
-            "#);
+        assert_snapshot!(actual, @"val=1 idx=0, bucket=1");
 
         Ok(())
     }
@@ -515,10 +675,10 @@ val=1 idx=0, bucket=1
         assert_eq!(map, vec![(2, 0), (1, 1)]);
 
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=2
-└── val=1 idx=1, bucket=1
-            "#);
+        assert_snapshot!(actual, @r"
+        val=2 idx=0, bucket=2
+        └── val=1 idx=1, bucket=1
+        ");
 
         Ok(())
     }
@@ -532,20 +692,20 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(2, 2, &mut map);
         heap.append_or_replace(3, 3, &mut map);
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=3 idx=0, bucket=3
-├── val=1 idx=1, bucket=1
-└── val=2 idx=2, bucket=2
-            "#);
+        assert_snapshot!(actual, @r"
+        val=3 idx=0, bucket=3
+        ├── val=1 idx=1, bucket=1
+        └── val=2 idx=2, bucket=2
+        ");
 
         let mut map = vec![];
         heap.append_or_replace(0, 0, &mut map);
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=2
-├── val=1 idx=1, bucket=1
-└── val=0 idx=2, bucket=0
-            "#);
+        assert_snapshot!(actual, @r"
+        val=2 idx=0, bucket=2
+        ├── val=1 idx=1, bucket=1
+        └── val=0 idx=2, bucket=0
+        ");
         assert_eq!(map, vec![(2, 0), (0, 2)]);
 
         Ok(())
@@ -561,22 +721,22 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(3, 3, &mut map);
         heap.append_or_replace(4, 4, &mut map);
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=4 idx=0, bucket=4
-├── val=3 idx=1, bucket=3
-│   └── val=1 idx=3, bucket=1
-└── val=2 idx=2, bucket=2
-            "#);
+        assert_snapshot!(actual, @r"
+        val=4 idx=0, bucket=4
+        ├── val=3 idx=1, bucket=3
+        │   └── val=1 idx=3, bucket=1
+        └── val=2 idx=2, bucket=2
+        ");
 
         let mut map = vec![];
         heap.replace_if_better(1, 0, &mut map);
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=4 idx=0, bucket=4
-├── val=1 idx=1, bucket=1
-│   └── val=0 idx=3, bucket=3
-└── val=2 idx=2, bucket=2
-            "#);
+        assert_snapshot!(actual, @r"
+        val=4 idx=0, bucket=4
+        ├── val=1 idx=1, bucket=1
+        │   └── val=0 idx=3, bucket=3
+        └── val=2 idx=2, bucket=2
+        ");
         assert_eq!(map, vec![(1, 1), (3, 3)]);
 
         Ok(())
@@ -591,10 +751,10 @@ val=4 idx=0, bucket=4
         heap.append_or_replace(2, 2, &mut map);
 
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=2
-└── val=1 idx=1, bucket=1
-            "#);
+        assert_snapshot!(actual, @r"
+        val=2 idx=0, bucket=2
+        └── val=1 idx=1, bucket=1
+        ");
 
         assert_eq!(heap.worst_val(), Some(&2));
         assert_eq!(heap.worst_map_idx(), 2);
@@ -611,10 +771,10 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(2, 2, &mut map);
 
         let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=2
-└── val=1 idx=1, bucket=1
-            "#);
+        assert_snapshot!(actual, @r"
+        val=2 idx=0, bucket=2
+        └── val=1 idx=1, bucket=1
+        ");
 
         let (vals, map_idxs) = heap.drain();
         assert_eq!(vals, vec![1, 2]);
@@ -623,29 +783,4 @@ val=2 idx=0, bucket=2
 
         Ok(())
     }
-
-    #[test]
-    fn should_renumber() -> Result<()> {
-        let mut map = vec![];
-        let mut heap = TopKHeap::new(10, false);
-
-        heap.append_or_replace(1, 1, &mut map);
-        heap.append_or_replace(2, 2, &mut map);
-
-        let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=2
-└── val=1 idx=1, bucket=1
-            "#);
-
-        let numbers = vec![(0, 1), (1, 2)];
-        heap.renumber(numbers.as_slice());
-        let actual = heap.to_string();
-        assert_snapshot!(actual, @r#"
-val=2 idx=0, bucket=1
-└── val=1 idx=1, bucket=2
-            "#);
-
-        Ok(())
-    }
 }
diff --git a/datafusion/physical-plan/src/aggregates/topk/priority_map.rs b/datafusion/physical-plan/src/aggregates/topk/priority_map.rs
index a09d70f7471f3..c74b648d373ce 100644
--- a/datafusion/physical-plan/src/aggregates/topk/priority_map.rs
+++ b/datafusion/physical-plan/src/aggregates/topk/priority_map.rs
@@ -17,8 +17,8 @@
 
 //! A `Map<K, V>` / `PriorityQueue` combo that evicts the worst values after reaching `capacity`
 
-use crate::aggregates::topk::hash_table::{new_hash_table, ArrowHashTable};
-use crate::aggregates::topk::heap::{new_heap, ArrowHeap};
+use crate::aggregates::topk::hash_table::{ArrowHashTable, new_hash_table};
+use crate::aggregates::topk::heap::{ArrowHeap, new_heap};
 use arrow::array::ArrayRef;
 use arrow::datatypes::DataType;
 use datafusion_common::Result;
@@ -63,40 +63,26 @@ impl PriorityMap {
         // handle new groups we haven't seen yet
         map.clear();
         let replace_idx = self.heap.worst_map_idx();
-        // JUSTIFICATION
-        //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-        //  Soundness: replace_idx kept valid during resizes
-        let (map_idx, did_insert) =
-            unsafe { self.map.find_or_insert(row_idx, replace_idx, map) };
+
+        let (map_idx, did_insert) = self.map.find_or_insert(row_idx, replace_idx);
         if did_insert {
-            self.heap.renumber(map);
-            map.clear();
             self.heap.insert(row_idx, map_idx, map);
-            // JUSTIFICATION
-            //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-            //  Soundness: the map was created on the line above, so all the indexes should be valid
-            unsafe { self.map.update_heap_idx(map) };
+            self.map.update_heap_idx(map);
             return Ok(());
         };
 
         // this is a value for an existing group
         map.clear();
-        // JUSTIFICATION
-        //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-        //  Soundness: map_idx was just found, so it is valid
-        let heap_idx = unsafe { self.map.heap_idx_at(map_idx) };
+        let heap_idx = self.map.heap_idx_at(map_idx);
         self.heap.replace_if_better(heap_idx, row_idx, map);
-        // JUSTIFICATION
-        //  Benefit:  ~15% speedup + required to index into RawTable from binary heap
-        //  Soundness: the index map was just built, so it will be valid
-        unsafe { self.map.update_heap_idx(map) };
+        self.map.update_heap_idx(map);
 
         Ok(())
     }
 
     pub fn emit(&mut self) -> Result<Vec<ArrayRef>> {
         let (vals, map_idxs) = self.heap.drain();
-        let ids = unsafe { self.map.take_all(map_idxs) };
+        let ids = self.map.take_all(map_idxs);
         Ok(vec![ids, vals])
     }
 
@@ -182,13 +168,13 @@ mod tests {
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
 
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -207,13 +193,13 @@ mod tests {
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
 
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -231,13 +217,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 2        | 2            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 2        | 2            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -255,13 +241,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -279,13 +265,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 2            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 2            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -303,13 +289,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -327,13 +313,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 2        | 2            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 2        | 2            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -351,13 +337,13 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -375,14 +361,110 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        | 1        | 2            |
+        +----------+--------------+
+        "
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn should_track_lexicographic_min_utf8_value() -> Result<()> {
+        let ids: ArrayRef = Arc::new(Int64Array::from(vec![1, 1]));
+        let vals: ArrayRef = Arc::new(StringArray::from(vec!["zulu", "alpha"]));
+        let mut agg = PriorityMap::new(DataType::Int64, DataType::Utf8, 1, false)?;
+        agg.set_batch(ids, vals);
+        agg.insert(0)?;
+        agg.insert(1)?;
+
+        let cols = agg.emit()?;
+        let batch = RecordBatch::try_new(test_schema_value(DataType::Utf8), cols)?;
+        let actual = format!("{}", pretty_format_batches(&[batch])?);
+
         assert_snapshot!(actual, @r#"
 +----------+--------------+
 | trace_id | timestamp_ms |
 +----------+--------------+
-| 1        | 2            |
+| 1        | alpha        |
 +----------+--------------+
-        "#
-        );
+        "#);
+
+        Ok(())
+    }
+
+    #[test]
+    fn should_track_lexicographic_max_utf8_value_desc() -> Result<()> {
+        let ids: ArrayRef = Arc::new(Int64Array::from(vec![1, 1]));
+        let vals: ArrayRef = Arc::new(StringArray::from(vec!["alpha", "zulu"]));
+        let mut agg = PriorityMap::new(DataType::Int64, DataType::Utf8, 1, true)?;
+        agg.set_batch(ids, vals);
+        agg.insert(0)?;
+        agg.insert(1)?;
+
+        let cols = agg.emit()?;
+        let batch = RecordBatch::try_new(test_schema_value(DataType::Utf8), cols)?;
+        let actual = format!("{}", pretty_format_batches(&[batch])?);
+
+        assert_snapshot!(actual, @r#"
++----------+--------------+
+| trace_id | timestamp_ms |
++----------+--------------+
+| 1        | zulu         |
++----------+--------------+
+        "#);
+
+        Ok(())
+    }
+
+    #[test]
+    fn should_track_large_utf8_values() -> Result<()> {
+        let ids: ArrayRef = Arc::new(Int64Array::from(vec![1, 1]));
+        let vals: ArrayRef = Arc::new(LargeStringArray::from(vec!["zulu", "alpha"]));
+        let mut agg = PriorityMap::new(DataType::Int64, DataType::LargeUtf8, 1, false)?;
+        agg.set_batch(ids, vals);
+        agg.insert(0)?;
+        agg.insert(1)?;
+
+        let cols = agg.emit()?;
+        let batch = RecordBatch::try_new(test_schema_value(DataType::LargeUtf8), cols)?;
+        let actual = format!("{}", pretty_format_batches(&[batch])?);
+
+        assert_snapshot!(actual, @r#"
++----------+--------------+
+| trace_id | timestamp_ms |
++----------+--------------+
+| 1        | alpha        |
++----------+--------------+
+        "#);
+
+        Ok(())
+    }
+
+    #[test]
+    fn should_track_utf8_view_values() -> Result<()> {
+        let ids: ArrayRef = Arc::new(Int64Array::from(vec![1, 1]));
+        let vals: ArrayRef = Arc::new(StringViewArray::from(vec!["alpha", "zulu"]));
+        let mut agg = PriorityMap::new(DataType::Int64, DataType::Utf8View, 1, true)?;
+        agg.set_batch(ids, vals);
+        agg.insert(0)?;
+        agg.insert(1)?;
+
+        let cols = agg.emit()?;
+        let batch = RecordBatch::try_new(test_schema_value(DataType::Utf8View), cols)?;
+        let actual = format!("{}", pretty_format_batches(&[batch])?);
+
+        assert_snapshot!(actual, @r#"
++----------+--------------+
+| trace_id | timestamp_ms |
++----------+--------------+
+| 1        | zulu         |
++----------+--------------+
+        "#);
 
         Ok(())
     }
@@ -400,14 +482,14 @@ mod tests {
         let cols = agg.emit()?;
         let batch = RecordBatch::try_new(test_schema(), cols)?;
         let actual = format!("{}", pretty_format_batches(&[batch])?);
-        assert_snapshot!(actual, @r#"
-+----------+--------------+
-| trace_id | timestamp_ms |
-+----------+--------------+
-|          | 3            |
-| 1        | 1            |
-+----------+--------------+
-        "#
+        assert_snapshot!(actual, @r"
+        +----------+--------------+
+        | trace_id | timestamp_ms |
+        +----------+--------------+
+        |          | 3            |
+        | 1        | 1            |
+        +----------+--------------+
+        "
         );
 
         Ok(())
@@ -433,4 +515,11 @@ mod tests {
             Field::new("timestamp_ms", DataType::Int64, true),
         ]))
     }
+
+    fn test_schema_value(value_type: DataType) -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("trace_id", DataType::Int64, true),
+            Field::new("timestamp_ms", value_type, true),
+        ]))
+    }
 }
diff --git a/datafusion/physical-plan/src/aggregates/topk_stream.rs b/datafusion/physical-plan/src/aggregates/topk_stream.rs
index bf02692486cc6..4aa566ccfcd0a 100644
--- a/datafusion/physical-plan/src/aggregates/topk_stream.rs
+++ b/datafusion/physical-plan/src/aggregates/topk_stream.rs
@@ -17,21 +17,26 @@
 
 //! A memory-conscious aggregation implementation that limits group buckets to a fixed number
 
+use crate::aggregates::group_values::GroupByMetrics;
 use crate::aggregates::topk::priority_map::PriorityMap;
+#[cfg(debug_assertions)]
+use crate::aggregates::topk_types_supported;
 use crate::aggregates::{
-    aggregate_expressions, evaluate_group_by, evaluate_many, AggregateExec,
-    PhysicalGroupBy,
+    AggregateExec, PhysicalGroupBy, aggregate_expressions, evaluate_group_by,
+    evaluate_many,
 };
+use crate::metrics::BaselineMetrics;
 use crate::{RecordBatchStream, SendableRecordBatchStream};
 use arrow::array::{Array, ArrayRef, RecordBatch};
 use arrow::datatypes::SchemaRef;
 use arrow::util::pretty::print_batches;
-use datafusion_common::DataFusionError;
 use datafusion_common::Result;
+use datafusion_common::internal_datafusion_err;
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::metrics::RecordOutput;
 use futures::stream::{Stream, StreamExt};
-use log::{trace, Level};
+use log::{Level, trace};
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -42,31 +47,62 @@ pub struct GroupedTopKAggregateStream {
     started: bool,
     schema: SchemaRef,
     input: SendableRecordBatchStream,
+    baseline_metrics: BaselineMetrics,
+    group_by_metrics: GroupByMetrics,
     aggregate_arguments: Vec<Vec<Arc<dyn PhysicalExpr>>>,
-    group_by: PhysicalGroupBy,
+    group_by: Arc<PhysicalGroupBy>,
     priority_map: PriorityMap,
 }
 
 impl GroupedTopKAggregateStream {
     pub fn new(
         aggr: &AggregateExec,
-        context: Arc<TaskContext>,
+        context: &Arc<TaskContext>,
         partition: usize,
         limit: usize,
     ) -> Result<Self> {
         let agg_schema = Arc::clone(&aggr.schema);
-        let group_by = aggr.group_by.clone();
-        let input = aggr.input.execute(partition, Arc::clone(&context))?;
+        let group_by = Arc::clone(&aggr.group_by);
+        let input = aggr.input.execute(partition, Arc::clone(context))?;
+        let baseline_metrics = BaselineMetrics::new(&aggr.metrics, partition);
+        let group_by_metrics = GroupByMetrics::new(&aggr.metrics, partition);
         let aggregate_arguments =
             aggregate_expressions(&aggr.aggr_expr, &aggr.mode, group_by.expr.len())?;
-        let (val_field, desc) = aggr
-            .get_minmax_desc()
-            .ok_or_else(|| DataFusionError::Internal("Min/max required".to_string()))?;
 
         let (expr, _) = &aggr.group_expr().expr()[0];
         let kt = expr.data_type(&aggr.input().schema())?;
-        let vt = val_field.data_type().clone();
 
+        // Check if this is a MIN/MAX aggregate or a DISTINCT-like operation
+        let (vt, desc) = if let Some((val_field, desc)) = aggr.get_minmax_desc() {
+            // MIN/MAX case: use the aggregate output type
+            (val_field.data_type().clone(), desc)
+        } else {
+            // DISTINCT case: use the group key type and get ordering from limit_order_descending
+            // The ordering direction is set by the optimizer when it pushes down the limit
+            let desc = aggr
+                .limit_options()
+                .and_then(|config| config.descending)
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Ordering direction required for DISTINCT with limit"
+                    )
+                })?;
+            (kt.clone(), desc)
+        };
+
+        // Type validation is performed by the optimizer and can_use_topk() check.
+        // This debug assertion documents the contract without runtime overhead in release builds.
+        #[cfg(debug_assertions)]
+        {
+            debug_assert!(
+                topk_types_supported(&kt, &vt),
+                "TopK type validation should have been performed by optimizer and can_use_topk(). \
+                 Found unsupported types: key={kt:?}, value={vt:?}"
+            );
+        }
+
+        // Note: Null values in aggregate columns are filtered by the aggregation layer
+        // before reaching the heap, so the heap implementations don't need explicit null handling.
         let priority_map = PriorityMap::new(kt, vt, limit, desc)?;
 
         Ok(GroupedTopKAggregateStream {
@@ -75,6 +111,8 @@ impl GroupedTopKAggregateStream {
             row_count: 0,
             schema: agg_schema,
             input,
+            baseline_metrics,
+            group_by_metrics,
             aggregate_arguments,
             group_by,
             priority_map,
@@ -89,9 +127,12 @@ impl RecordBatchStream for GroupedTopKAggregateStream {
 }
 
 impl GroupedTopKAggregateStream {
-    fn intern(&mut self, ids: ArrayRef, vals: ArrayRef) -> Result<()> {
+    fn intern(&mut self, ids: &ArrayRef, vals: &ArrayRef) -> Result<()> {
+        let _timer = self.group_by_metrics.time_calculating_group_ids.timer();
+
         let len = ids.len();
-        self.priority_map.set_batch(ids, Arc::clone(&vals));
+        self.priority_map
+            .set_batch(Arc::clone(ids), Arc::clone(vals));
 
         let has_nulls = vals.null_count() > 0;
         for row_idx in 0..len {
@@ -111,7 +152,10 @@ impl Stream for GroupedTopKAggregateStream {
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
+        let elapsed_compute = self.baseline_metrics.elapsed_compute().clone();
+        let emitting_time = self.group_by_metrics.emitting_time.clone();
         while let Poll::Ready(res) = self.input.poll_next_unpin(cx) {
+            let _timer = elapsed_compute.timer();
             match res {
                 // got a batch, convert to rows and append to our TreeMap
                 Some(Ok(batch)) => {
@@ -140,16 +184,24 @@ impl Stream for GroupedTopKAggregateStream {
                         "Exactly 1 group value required"
                     );
                     let group_by_values = Arc::clone(&group_by_values[0][0]);
-                    let input_values = evaluate_many(
-                        &self.aggregate_arguments,
-                        batches.first().unwrap(),
-                    )?;
-                    assert_eq!(input_values.len(), 1, "Exactly 1 input required");
-                    assert_eq!(input_values[0].len(), 1, "Exactly 1 input required");
-                    let input_values = Arc::clone(&input_values[0][0]);
+                    let input_values = if self.aggregate_arguments.is_empty() {
+                        // DISTINCT case: use group key as both key and value
+                        Arc::clone(&group_by_values)
+                    } else {
+                        // MIN/MAX case: evaluate aggregate expressions
+                        let _timer =
+                            self.group_by_metrics.aggregate_arguments_time.timer();
+                        let input_values = evaluate_many(
+                            &self.aggregate_arguments,
+                            batches.first().unwrap(),
+                        )?;
+                        assert_eq!(input_values.len(), 1, "Exactly 1 input required");
+                        assert_eq!(input_values[0].len(), 1, "Exactly 1 input required");
+                        Arc::clone(&input_values[0][0])
+                    };
 
                     // iterate over each column of group_by values
-                    (*self).intern(group_by_values, input_values)?;
+                    (*self).intern(&group_by_values, &input_values)?;
                 }
                 // inner is done, emit all rows and switch to producing output
                 None => {
@@ -157,8 +209,17 @@ impl Stream for GroupedTopKAggregateStream {
                         trace!("partition {} emit None", self.partition);
                         return Poll::Ready(None);
                     }
-                    let cols = self.priority_map.emit()?;
-                    let batch = RecordBatch::try_new(Arc::clone(&self.schema), cols)?;
+                    let batch = {
+                        let _timer = emitting_time.timer();
+                        let mut cols = self.priority_map.emit()?;
+                        // For DISTINCT case (no aggregate expressions), only use the group key column
+                        // since the schema only has one field and key/value are the same
+                        if self.aggregate_arguments.is_empty() {
+                            cols.truncate(1);
+                        }
+                        RecordBatch::try_new(Arc::clone(&self.schema), cols)?
+                    };
+                    let batch = batch.record_output(&self.baseline_metrics);
                     trace!(
                         "partition {} emit batch with {} rows",
                         self.partition,
diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs
index ea14ce676c1a6..4aa78055daee3 100644
--- a/datafusion/physical-plan/src/analyze.rs
+++ b/datafusion/physical-plan/src/analyze.rs
@@ -26,13 +26,16 @@ use super::{
     SendableRecordBatchStream,
 };
 use crate::display::DisplayableExecutionPlan;
+use crate::metrics::MetricType;
 use crate::{DisplayFormatType, ExecutionPlan, Partitioning};
 
 use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch};
 use datafusion_common::instant::Instant;
-use datafusion_common::{internal_err, DataFusionError, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, Result, assert_eq_or_internal_err};
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::PhysicalExpr;
 
 use futures::StreamExt;
 
@@ -44,11 +47,13 @@ pub struct AnalyzeExec {
     verbose: bool,
     /// If statistics should be displayed
     show_statistics: bool,
+    /// Which metric categories should be displayed
+    metric_types: Vec<MetricType>,
     /// The input plan (the plan being analyzed)
     pub(crate) input: Arc<dyn ExecutionPlan>,
     /// The output schema for RecordBatches of this exec node
     schema: SchemaRef,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl AnalyzeExec {
@@ -56,6 +61,7 @@ impl AnalyzeExec {
     pub fn new(
         verbose: bool,
         show_statistics: bool,
+        metric_types: Vec<MetricType>,
         input: Arc<dyn ExecutionPlan>,
         schema: SchemaRef,
     ) -> Self {
@@ -63,9 +69,10 @@ impl AnalyzeExec {
         AnalyzeExec {
             verbose,
             show_statistics,
+            metric_types,
             input,
             schema,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -126,7 +133,7 @@ impl ExecutionPlan for AnalyzeExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -134,9 +141,15 @@ impl ExecutionPlan for AnalyzeExec {
         vec![&self.input]
     }
 
-    /// AnalyzeExec is handled specially so this value is ignored
     fn required_input_distribution(&self) -> Vec<Distribution> {
-        vec![]
+        vec![Distribution::UnspecifiedDistribution]
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
     }
 
     fn with_new_children(
@@ -146,6 +159,7 @@ impl ExecutionPlan for AnalyzeExec {
         Ok(Arc::new(Self::new(
             self.verbose,
             self.show_statistics,
+            self.metric_types.clone(),
             children.pop().unwrap(),
             Arc::clone(&self.schema),
         )))
@@ -156,11 +170,11 @@ impl ExecutionPlan for AnalyzeExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if 0 != partition {
-            return internal_err!(
-                "AnalyzeExec invalid partition. Expected 0, got {partition}"
-            );
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "AnalyzeExec invalid partition. Expected 0, got {partition}"
+        );
 
         // Gather futures that will run each input partition in
         // parallel (on a separate tokio task) using a JoinSet to
@@ -183,6 +197,7 @@ impl ExecutionPlan for AnalyzeExec {
         let captured_schema = Arc::clone(&self.schema);
         let verbose = self.verbose;
         let show_statistics = self.show_statistics;
+        let metric_types = self.metric_types.clone();
 
         // future that gathers the results from all the tasks in the
         // JoinSet that computes the overall row count and final
@@ -200,8 +215,9 @@ impl ExecutionPlan for AnalyzeExec {
                 show_statistics,
                 total_rows,
                 duration,
-                captured_input,
-                captured_schema,
+                &captured_input,
+                &captured_schema,
+                &metric_types,
             )
         };
 
@@ -218,8 +234,9 @@ fn create_output_batch(
     show_statistics: bool,
     total_rows: usize,
     duration: std::time::Duration,
-    input: Arc<dyn ExecutionPlan>,
-    schema: SchemaRef,
+    input: &Arc<dyn ExecutionPlan>,
+    schema: &SchemaRef,
+    metric_types: &[MetricType],
 ) -> Result<RecordBatch> {
     let mut type_builder = StringBuilder::with_capacity(1, 1024);
     let mut plan_builder = StringBuilder::with_capacity(1, 1024);
@@ -228,6 +245,7 @@ fn create_output_batch(
     type_builder.append_value("Plan with Metrics");
 
     let annotated_plan = DisplayableExecutionPlan::with_metrics(input.as_ref())
+        .set_metric_types(metric_types.to_vec())
         .set_show_statistics(show_statistics)
         .indent(verbose)
         .to_string();
@@ -239,6 +257,7 @@ fn create_output_batch(
         type_builder.append_value("Plan with Full Metrics");
 
         let annotated_plan = DisplayableExecutionPlan::with_full_metrics(input.as_ref())
+            .set_metric_types(metric_types.to_vec())
             .set_show_statistics(show_statistics)
             .indent(verbose)
             .to_string();
@@ -252,7 +271,7 @@ fn create_output_batch(
     }
 
     RecordBatch::try_new(
-        schema,
+        Arc::clone(schema),
         vec![
             Arc::new(type_builder.finish()),
             Arc::new(plan_builder.finish()),
@@ -268,7 +287,7 @@ mod tests {
         collect,
         test::{
             assert_is_pending,
-            exec::{assert_strong_count_converges_to_zero, BlockingExec},
+            exec::{BlockingExec, assert_strong_count_converges_to_zero},
         },
     };
 
@@ -283,7 +302,13 @@ mod tests {
 
         let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1));
         let refs = blocking_exec.refs();
-        let analyze_exec = Arc::new(AnalyzeExec::new(true, false, blocking_exec, schema));
+        let analyze_exec = Arc::new(AnalyzeExec::new(
+            true,
+            false,
+            vec![MetricType::SUMMARY, MetricType::DEV],
+            blocking_exec,
+            schema,
+        ));
 
         let fut = collect(analyze_exec, task_ctx);
         let mut fut = fut.boxed();
diff --git a/datafusion/physical-plan/src/async_func.rs b/datafusion/physical-plan/src/async_func.rs
new file mode 100644
index 0000000000000..abfe870f52665
--- /dev/null
+++ b/datafusion/physical-plan/src/async_func.rs
@@ -0,0 +1,440 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::coalesce::LimitedBatchCoalescer;
+use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::stream::RecordBatchStreamAdapter;
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    check_if_same_properties,
+};
+use arrow::array::RecordBatch;
+use arrow_schema::{Fields, Schema, SchemaRef};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_common::{Result, assert_eq_or_internal_err};
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
+use datafusion_physical_expr::ScalarFunctionExpr;
+use datafusion_physical_expr::async_scalar_function::AsyncFuncExpr;
+use datafusion_physical_expr::equivalence::ProjectionMapping;
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr_common::metrics::{BaselineMetrics, RecordOutput};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use futures::Stream;
+use futures::stream::StreamExt;
+use log::trace;
+use std::any::Any;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll, ready};
+
+/// This structure evaluates a set of async expressions on a record
+/// batch producing a new record batch
+///
+/// The schema of the output of the AsyncFuncExec is:
+/// Input columns followed by one column for each async expression
+#[derive(Debug, Clone)]
+pub struct AsyncFuncExec {
+    /// The async expressions to evaluate
+    async_exprs: Vec<Arc<AsyncFuncExpr>>,
+    input: Arc<dyn ExecutionPlan>,
+    cache: Arc<PlanProperties>,
+    metrics: ExecutionPlanMetricsSet,
+}
+
+impl AsyncFuncExec {
+    pub fn try_new(
+        async_exprs: Vec<Arc<AsyncFuncExpr>>,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Result<Self> {
+        let async_fields = async_exprs
+            .iter()
+            .map(|async_expr| async_expr.field(input.schema().as_ref()))
+            .collect::<Result<Vec<_>>>()?;
+
+        // compute the output schema: input schema then async expressions
+        let fields: Fields = input
+            .schema()
+            .fields()
+            .iter()
+            .cloned()
+            .chain(async_fields.into_iter().map(Arc::new))
+            .collect();
+
+        let schema = Arc::new(Schema::new(fields));
+        let tuples = async_exprs
+            .iter()
+            .map(|expr| (Arc::clone(&expr.func), expr.name().to_string()))
+            .collect::<Vec<_>>();
+        let async_expr_mapping = ProjectionMapping::try_new(tuples, &input.schema())?;
+        let cache =
+            AsyncFuncExec::compute_properties(&input, schema, &async_expr_mapping)?;
+        Ok(Self {
+            input,
+            async_exprs,
+            cache: Arc::new(cache),
+            metrics: ExecutionPlanMetricsSet::new(),
+        })
+    }
+
+    /// This function creates the cache object that stores the plan properties
+    /// such as schema, equivalence properties, ordering, partitioning, etc.
+    fn compute_properties(
+        input: &Arc<dyn ExecutionPlan>,
+        schema: SchemaRef,
+        async_expr_mapping: &ProjectionMapping,
+    ) -> Result<PlanProperties> {
+        Ok(PlanProperties::new(
+            input
+                .equivalence_properties()
+                .project(async_expr_mapping, schema),
+            input.output_partitioning().clone(),
+            input.pipeline_behavior(),
+            input.boundedness(),
+        ))
+    }
+
+    pub fn async_exprs(&self) -> &[Arc<AsyncFuncExpr>] {
+        &self.async_exprs
+    }
+
+    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.input
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
+}
+
+impl DisplayAs for AsyncFuncExec {
+    fn fmt_as(
+        &self,
+        t: DisplayFormatType,
+        f: &mut std::fmt::Formatter,
+    ) -> std::fmt::Result {
+        let expr: Vec<String> = self
+            .async_exprs
+            .iter()
+            .map(|async_expr| async_expr.to_string())
+            .collect();
+        let exprs = expr.join(", ");
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "AsyncFuncExec: async_expr=[{exprs}]")
+            }
+            DisplayFormatType::TreeRender => {
+                writeln!(f, "format=async_expr")?;
+                writeln!(f, "async_expr={exprs}")?;
+                Ok(())
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for AsyncFuncExec {
+    fn name(&self) -> &str {
+        "async_func"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        assert_eq_or_internal_err!(
+            children.len(),
+            1,
+            "AsyncFuncExec wrong number of children"
+        );
+        check_if_same_properties!(self, children);
+        Ok(Arc::new(AsyncFuncExec::try_new(
+            self.async_exprs.clone(),
+            children.swap_remove(0),
+        )?))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        trace!(
+            "Start AsyncFuncExpr::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
+
+        // first execute the input stream
+        let input_stream = self.input.execute(partition, Arc::clone(&context))?;
+
+        // TODO: Track `elapsed_compute` in `BaselineMetrics`
+        // Issue: <https://github.com/apache/datafusion/issues/19658>
+        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
+
+        // now, for each record batch, evaluate the async expressions and add the columns to the result
+        let async_exprs_captured = Arc::new(self.async_exprs.clone());
+        let schema_captured = self.schema();
+        let config_options_ref = Arc::clone(context.session_config().options());
+
+        let coalesced_input_stream = CoalesceInputStream {
+            input_stream,
+            batch_coalescer: LimitedBatchCoalescer::new(
+                Arc::clone(&self.input.schema()),
+                config_options_ref.execution.batch_size,
+                None,
+            ),
+        };
+
+        let stream_with_async_functions = coalesced_input_stream.then(move |batch| {
+            // need to clone *again* to capture the async_exprs and schema in the
+            // stream and satisfy lifetime requirements.
+            let async_exprs_captured = Arc::clone(&async_exprs_captured);
+            let schema_captured = Arc::clone(&schema_captured);
+            let config_options = Arc::clone(&config_options_ref);
+            let baseline_metrics_captured = baseline_metrics.clone();
+
+            async move {
+                let batch = batch?;
+                // append the result of evaluating the async expressions to the output
+                let mut output_arrays = batch.columns().to_vec();
+                for async_expr in async_exprs_captured.iter() {
+                    let output = async_expr
+                        .invoke_with_args(&batch, Arc::clone(&config_options))
+                        .await?;
+                    output_arrays.push(output.to_array(batch.num_rows())?);
+                }
+                let batch = RecordBatch::try_new(schema_captured, output_arrays)?;
+
+                Ok(batch.record_output(&baseline_metrics_captured))
+            }
+        });
+
+        // Adapt the stream with the output schema
+        let adapter =
+            RecordBatchStreamAdapter::new(self.schema(), stream_with_async_functions);
+        Ok(Box::pin(adapter))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+}
+
+struct CoalesceInputStream {
+    input_stream: Pin<Box<dyn RecordBatchStream + Send>>,
+    batch_coalescer: LimitedBatchCoalescer,
+}
+
+impl Stream for CoalesceInputStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let mut completed = false;
+
+        loop {
+            if let Some(batch) = self.batch_coalescer.next_completed_batch() {
+                return Poll::Ready(Some(Ok(batch)));
+            }
+
+            if completed {
+                return Poll::Ready(None);
+            }
+
+            match ready!(self.input_stream.poll_next_unpin(cx)) {
+                Some(Ok(batch)) => {
+                    if let Err(err) = self.batch_coalescer.push_batch(batch) {
+                        return Poll::Ready(Some(Err(err)));
+                    }
+                }
+                Some(err) => {
+                    return Poll::Ready(Some(err));
+                }
+                None => {
+                    completed = true;
+                    if let Err(err) = self.batch_coalescer.finish() {
+                        return Poll::Ready(Some(Err(err)));
+                    }
+                }
+            }
+        }
+    }
+}
+
+const ASYNC_FN_PREFIX: &str = "__async_fn_";
+
+/// Maps async_expressions to new columns
+///
+/// The output of the async functions are appended, in order, to the end of the input schema
+#[derive(Debug)]
+pub struct AsyncMapper {
+    /// the number of columns in the input plan
+    /// used to generate the output column names.
+    /// the first async expr is `__async_fn_0`, the second is `__async_fn_1`, etc
+    num_input_columns: usize,
+    /// the expressions to map
+    pub async_exprs: Vec<Arc<AsyncFuncExpr>>,
+}
+
+impl AsyncMapper {
+    pub fn new(num_input_columns: usize) -> Self {
+        Self {
+            num_input_columns,
+            async_exprs: Vec::new(),
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.async_exprs.is_empty()
+    }
+
+    pub fn next_column_name(&self) -> String {
+        format!("{}{}", ASYNC_FN_PREFIX, self.async_exprs.len())
+    }
+
+    /// Finds any references to async functions in the expression and adds them to the map
+    pub fn find_references(
+        &mut self,
+        physical_expr: &Arc<dyn PhysicalExpr>,
+        schema: &Schema,
+    ) -> Result<()> {
+        // recursively look for references to async functions
+        physical_expr.apply(|expr| {
+            if let Some(scalar_func_expr) =
+                expr.as_any().downcast_ref::<ScalarFunctionExpr>()
+                && scalar_func_expr.fun().as_async().is_some()
+            {
+                let next_name = self.next_column_name();
+                self.async_exprs.push(Arc::new(AsyncFuncExpr::try_new(
+                    next_name,
+                    Arc::clone(expr),
+                    schema,
+                )?));
+            }
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+        Ok(())
+    }
+
+    /// If the expression matches any of the async functions, return the new column
+    pub fn map_expr(
+        &self,
+        expr: Arc<dyn PhysicalExpr>,
+    ) -> Transformed<Arc<dyn PhysicalExpr>> {
+        // find the first matching async function if any
+        let Some(idx) =
+            self.async_exprs
+                .iter()
+                .enumerate()
+                .find_map(|(idx, async_expr)| {
+                    if async_expr.func == Arc::clone(&expr) {
+                        Some(idx)
+                    } else {
+                        None
+                    }
+                })
+        else {
+            return Transformed::no(expr);
+        };
+        // rewrite in terms of the output column
+        Transformed::yes(self.output_column(idx))
+    }
+
+    /// return the output column for the async function at index idx
+    pub fn output_column(&self, idx: usize) -> Arc<dyn PhysicalExpr> {
+        let async_expr = &self.async_exprs[idx];
+        let output_idx = self.num_input_columns + idx;
+        Arc::new(Column::new(async_expr.name(), output_idx))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::{RecordBatch, UInt32Array};
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion_common::Result;
+    use datafusion_execution::{TaskContext, config::SessionConfig};
+    use futures::StreamExt;
+
+    use crate::{ExecutionPlan, async_func::AsyncFuncExec, test::TestMemoryExec};
+
+    #[tokio::test]
+    async fn test_async_fn_with_coalescing() -> Result<()> {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("c0", DataType::UInt32, false)]));
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6]))],
+        )?;
+
+        let batches: Vec<RecordBatch> = std::iter::repeat_n(batch, 50).collect();
+
+        let session_config = SessionConfig::new().with_batch_size(200);
+        let task_ctx = TaskContext::default().with_session_config(session_config);
+        let task_ctx = Arc::new(task_ctx);
+
+        let test_exec =
+            TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?;
+        let exec = AsyncFuncExec::try_new(vec![], test_exec)?;
+
+        let mut stream = exec.execute(0, Arc::clone(&task_ctx))?;
+        let batch = stream
+            .next()
+            .await
+            .expect("expected to get a record batch")?;
+        assert_eq!(200, batch.num_rows());
+        let batch = stream
+            .next()
+            .await
+            .expect("expected to get a record batch")?;
+        assert_eq!(100, batch.num_rows());
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/buffer.rs b/datafusion/physical-plan/src/buffer.rs
new file mode 100644
index 0000000000000..3e85fb32d2f2c
--- /dev/null
+++ b/datafusion/physical-plan/src/buffer.rs
@@ -0,0 +1,648 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`BufferExec`] decouples production and consumption on messages by buffering the input in the
+//! background up to a certain capacity.
+
+use crate::execution_plan::{CardinalityEffect, SchedulingType};
+use crate::filter_pushdown::{
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
+};
+use crate::projection::ProjectionExec;
+use crate::stream::RecordBatchStreamAdapter;
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SortOrderPushdownResult,
+    check_if_same_properties,
+};
+use arrow::array::RecordBatch;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, Statistics, internal_err, plan_err};
+use datafusion_common_runtime::SpawnedTask;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_physical_expr_common::metrics::{
+    ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
+};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use futures::{Stream, StreamExt, TryStreamExt};
+use pin_project_lite::pin_project;
+use std::any::Any;
+use std::fmt;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::task::{Context, Poll};
+use tokio::sync::mpsc::UnboundedReceiver;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore};
+
+/// WARNING: EXPERIMENTAL
+///
+/// Decouples production and consumption of record batches with an internal queue per partition,
+/// eagerly filling up the capacity of the queues even before any message is requested.
+///
+/// ```text
+///             ┌───────────────────────────┐
+///             │        BufferExec         │
+///             │                           │
+///             │┌────── Partition 0 ──────┐│
+///             ││            ┌────┐ ┌────┐││       ┌────┐
+/// ──background poll────────▶│    │ │    ├┼┼───────▶    │
+///             ││            └────┘ └────┘││       └────┘
+///             │└─────────────────────────┘│
+///             │┌────── Partition 1 ──────┐│
+///             ││     ┌────┐ ┌────┐ ┌────┐││       ┌────┐
+/// ──background poll─▶│    │ │    │ │    ├┼┼───────▶    │
+///             ││     └────┘ └────┘ └────┘││       └────┘
+///             │└─────────────────────────┘│
+///             │                           │
+///             │           ...             │
+///             │                           │
+///             │┌────── Partition N ──────┐│
+///             ││                   ┌────┐││       ┌────┐
+/// ──background poll───────────────▶│    ├┼┼───────▶    │
+///             ││                   └────┘││       └────┘
+///             │└─────────────────────────┘│
+///             └───────────────────────────┘
+/// ```
+///
+/// The capacity is provided in bytes, and for each buffered record batch it will take into account
+/// the size reported by [RecordBatch::get_array_memory_size].
+///
+/// If a single record batch exceeds the maximum capacity set in the `capacity` argument, it's still
+/// allowed to pass in order to not deadlock the buffer.
+///
+/// This is useful for operators that conditionally start polling one of their children only after
+/// other child has finished, allowing to perform some early work and accumulating batches in
+/// memory so that they can be served immediately when requested.
+#[derive(Debug, Clone)]
+pub struct BufferExec {
+    input: Arc<dyn ExecutionPlan>,
+    properties: Arc<PlanProperties>,
+    capacity: usize,
+    metrics: ExecutionPlanMetricsSet,
+}
+
+impl BufferExec {
+    /// Builds a new [BufferExec] with the provided capacity in bytes.
+    pub fn new(input: Arc<dyn ExecutionPlan>, capacity: usize) -> Self {
+        let properties = PlanProperties::clone(input.properties())
+            .with_scheduling_type(SchedulingType::Cooperative);
+
+        Self {
+            input,
+            properties: Arc::new(properties),
+            capacity,
+            metrics: ExecutionPlanMetricsSet::new(),
+        }
+    }
+
+    /// Returns the input [ExecutionPlan] of this [BufferExec].
+    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.input
+    }
+
+    /// Returns the per-partition capacity in bytes for this [BufferExec].
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
+}
+
+impl DisplayAs for BufferExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "BufferExec: capacity={}", self.capacity)
+            }
+            DisplayFormatType::TreeRender => {
+                writeln!(f, "target_batch_size={}", self.capacity)
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for BufferExec {
+    fn name(&self) -> &str {
+        "BufferExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![true]
+    }
+
+    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
+        vec![false]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
+        if children.len() != 1 {
+            return plan_err!("BufferExec can only have one child");
+        }
+        Ok(Arc::new(Self::new(children.swap_remove(0), self.capacity)))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let mem_reservation = MemoryConsumer::new(format!("BufferExec[{partition}]"))
+            .register(context.memory_pool());
+        let in_stream = self.input.execute(partition, context)?;
+
+        // Set up the metrics for the stream.
+        let curr_mem_in = Arc::new(AtomicUsize::new(0));
+        let curr_mem_out = Arc::clone(&curr_mem_in);
+        let mut max_mem_in = 0;
+        let max_mem = MetricBuilder::new(&self.metrics).gauge("max_mem_used", partition);
+
+        let curr_queued_in = Arc::new(AtomicUsize::new(0));
+        let curr_queued_out = Arc::clone(&curr_queued_in);
+        let mut max_queued_in = 0;
+        let max_queued = MetricBuilder::new(&self.metrics).gauge("max_queued", partition);
+
+        // Capture metrics when an element is queued on the stream.
+        let in_stream = in_stream.inspect_ok(move |v| {
+            let size = v.get_array_memory_size();
+            let curr_size = curr_mem_in.fetch_add(size, Ordering::Relaxed) + size;
+            if curr_size > max_mem_in {
+                max_mem_in = curr_size;
+                max_mem.set(max_mem_in);
+            }
+
+            let curr_queued = curr_queued_in.fetch_add(1, Ordering::Relaxed) + 1;
+            if curr_queued > max_queued_in {
+                max_queued_in = curr_queued;
+                max_queued.set(max_queued_in);
+            }
+        });
+        // Buffer the input.
+        let out_stream =
+            MemoryBufferedStream::new(in_stream, self.capacity, mem_reservation);
+        // Update in the metrics that when an element gets out, some memory gets freed.
+        let out_stream = out_stream.inspect_ok(move |v| {
+            curr_mem_out.fetch_sub(v.get_array_memory_size(), Ordering::Relaxed);
+            curr_queued_out.fetch_sub(1, Ordering::Relaxed);
+        });
+
+        Ok(Box::pin(RecordBatchStreamAdapter::new(
+            self.schema(),
+            out_stream,
+        )))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        self.input.partition_statistics(partition)
+    }
+
+    fn supports_limit_pushdown(&self) -> bool {
+        self.input.supports_limit_pushdown()
+    }
+
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        CardinalityEffect::Equal
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        match self.input.try_swapping_with_projection(projection)? {
+            Some(new_input) => Ok(Some(
+                Arc::new(self.clone()).with_new_children(vec![new_input])?,
+            )),
+            None => Ok(None),
+        }
+    }
+
+    fn gather_filters_for_pushdown(
+        &self,
+        _phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        _config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        FilterDescription::from_children(parent_filters, &self.children())
+    }
+
+    fn handle_child_pushdown_result(
+        &self,
+        _phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // CoalesceBatchesExec is transparent for sort ordering - it preserves order
+        // Delegate to the child and wrap with a new CoalesceBatchesExec
+        self.input.try_pushdown_sort(order)?.try_map(|new_input| {
+            Ok(Arc::new(Self::new(new_input, self.capacity)) as Arc<dyn ExecutionPlan>)
+        })
+    }
+}
+
+/// Represents anything that occupies a capacity in a [MemoryBufferedStream].
+pub trait SizedMessage {
+    fn size(&self) -> usize;
+}
+
+impl SizedMessage for RecordBatch {
+    fn size(&self) -> usize {
+        self.get_array_memory_size()
+    }
+}
+
+pin_project! {
+/// Decouples production and consumption of messages in a stream with an internal queue, eagerly
+/// filling it up to the specified maximum capacity even before any message is requested.
+///
+/// Allows each message to have a different size, which is taken into account for determining if
+/// the queue is full or not.
+pub struct MemoryBufferedStream<T: SizedMessage> {
+    task: SpawnedTask<()>,
+    batch_rx: UnboundedReceiver<Result<(T, OwnedSemaphorePermit)>>,
+    memory_reservation: Arc<MemoryReservation>,
+}}
+
+impl<T: Send + SizedMessage + 'static> MemoryBufferedStream<T> {
+    /// Builds a new [MemoryBufferedStream] with the provided capacity and event handler.
+    ///
+    /// This immediately spawns a Tokio task that will start consumption of the input stream.
+    pub fn new(
+        mut input: impl Stream<Item = Result<T>> + Unpin + Send + 'static,
+        capacity: usize,
+        memory_reservation: MemoryReservation,
+    ) -> Self {
+        let semaphore = Arc::new(Semaphore::new(capacity));
+        let (batch_tx, batch_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        let memory_reservation = Arc::new(memory_reservation);
+        let memory_reservation_clone = Arc::clone(&memory_reservation);
+        let task = SpawnedTask::spawn(async move {
+            loop {
+                // Select on both the input stream and the channel being closed.
+                // By down this, we abort polling the input as soon as the consumer channel is
+                // closed. Otherwise, we would need to wait for a full new message to be available
+                // in order to consider aborting the stream
+                let item_or_err = tokio::select! {
+                    biased;
+                    _ = batch_tx.closed() => break,
+                    item_or_err = input.next() => {
+                        let Some(item_or_err) = item_or_err else {
+                            break; // stream finished
+                        };
+                        item_or_err
+                    }
+                };
+
+                let item = match item_or_err {
+                    Ok(batch) => batch,
+                    Err(err) => {
+                        let _ = batch_tx.send(Err(err)); // If there's an error it means the channel was closed, which is fine.
+                        break;
+                    }
+                };
+
+                let size = item.size();
+                if let Err(err) = memory_reservation.try_grow(size) {
+                    let _ = batch_tx.send(Err(err)); // If there's an error it means the channel was closed, which is fine.
+                    break;
+                }
+
+                // We need to cap the minimum between amount of permits and the actual size of the
+                // message. If at any point we try to acquire more permits than the capacity of the
+                // semaphore, the stream will deadlock.
+                let capped_size = size.min(capacity) as u32;
+
+                let semaphore = Arc::clone(&semaphore);
+                let Ok(permit) = semaphore.acquire_many_owned(capped_size).await else {
+                    let _ = batch_tx.send(internal_err!("Closed semaphore in MemoryBufferedStream. This is a bug in DataFusion, please report it!"));
+                    break;
+                };
+
+                if batch_tx.send(Ok((item, permit))).is_err() {
+                    break; // stream was closed
+                };
+            }
+        });
+
+        Self {
+            task,
+            batch_rx,
+            memory_reservation: memory_reservation_clone,
+        }
+    }
+
+    /// Returns the number of queued messages.
+    pub fn messages_queued(&self) -> usize {
+        self.batch_rx.len()
+    }
+}
+
+impl<T: SizedMessage> Stream for MemoryBufferedStream<T> {
+    type Item = Result<T>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let self_project = self.project();
+        match self_project.batch_rx.poll_recv(cx) {
+            Poll::Ready(Some(Ok((item, _semaphore_permit)))) => {
+                self_project.memory_reservation.shrink(item.size());
+                Poll::Ready(Some(Ok(item)))
+            }
+            Poll::Ready(Some(Err(err))) => Poll::Ready(Some(Err(err))),
+            Poll::Ready(None) => Poll::Ready(None),
+            Poll::Pending => Poll::Pending,
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        if self.batch_rx.is_closed() {
+            let len = self.batch_rx.len();
+            (len, Some(len))
+        } else {
+            (self.batch_rx.len(), None)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_common::{DataFusionError, assert_contains};
+    use datafusion_execution::memory_pool::{
+        GreedyMemoryPool, MemoryPool, UnboundedMemoryPool,
+    };
+    use std::error::Error;
+    use std::fmt::Debug;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use tokio::time::timeout;
+
+    #[tokio::test]
+    async fn buffers_only_some_messages() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(Ok);
+        let (_, res) = memory_pool_and_reservation();
+
+        let buffered = MemoryBufferedStream::new(input, 4, res);
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 2);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn yields_all_messages() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(Ok);
+        let (_, res) = memory_pool_and_reservation();
+
+        let mut buffered = MemoryBufferedStream::new(input, 10, res);
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 4);
+
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        finished(&mut buffered).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn yields_first_msg_even_if_big() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([25, 1, 2, 3]).map(Ok);
+        let (_, res) = memory_pool_and_reservation();
+
+        let mut buffered = MemoryBufferedStream::new(input, 10, res);
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 1);
+        pull_ok_msg(&mut buffered).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn memory_pool_kills_stream() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(Ok);
+        let (_, res) = bounded_memory_pool_and_reservation(7);
+
+        let mut buffered = MemoryBufferedStream::new(input, 10, res);
+        wait_for_buffering().await;
+
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        let msg = pull_err_msg(&mut buffered).await?;
+
+        assert_contains!(msg.to_string(), "Failed to allocate additional 4.0 B");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn memory_pool_does_not_kill_stream() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(Ok);
+        let (_, res) = bounded_memory_pool_and_reservation(7);
+
+        let mut buffered = MemoryBufferedStream::new(input, 3, res);
+        wait_for_buffering().await;
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        finished(&mut buffered).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn messages_pass_even_if_all_exceed_limit() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([3, 3, 3, 3]).map(Ok);
+        let (_, res) = memory_pool_and_reservation();
+
+        let mut buffered = MemoryBufferedStream::new(input, 2, res);
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 1);
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 1);
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 1);
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 1);
+        pull_ok_msg(&mut buffered).await?;
+
+        wait_for_buffering().await;
+        finished(&mut buffered).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn errors_get_propagated() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(|v| {
+            if v == 3 {
+                return internal_err!("Error on 3");
+            }
+            Ok(v)
+        });
+        let (_, res) = memory_pool_and_reservation();
+
+        let mut buffered = MemoryBufferedStream::new(input, 10, res);
+        wait_for_buffering().await;
+
+        pull_ok_msg(&mut buffered).await?;
+        pull_ok_msg(&mut buffered).await?;
+        pull_err_msg(&mut buffered).await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn memory_gets_released_if_stream_drops() -> Result<(), Box<dyn Error>> {
+        let input = futures::stream::iter([1, 2, 3, 4]).map(Ok);
+        let (pool, res) = memory_pool_and_reservation();
+
+        let mut buffered = MemoryBufferedStream::new(input, 10, res);
+        wait_for_buffering().await;
+        assert_eq!(buffered.messages_queued(), 4);
+        assert_eq!(pool.reserved(), 10);
+
+        pull_ok_msg(&mut buffered).await?;
+        assert_eq!(buffered.messages_queued(), 3);
+        assert_eq!(pool.reserved(), 9);
+
+        pull_ok_msg(&mut buffered).await?;
+        assert_eq!(buffered.messages_queued(), 2);
+        assert_eq!(pool.reserved(), 7);
+
+        drop(buffered);
+        assert_eq!(pool.reserved(), 0);
+        Ok(())
+    }
+
+    fn memory_pool_and_reservation() -> (Arc<dyn MemoryPool>, MemoryReservation) {
+        let pool = Arc::new(UnboundedMemoryPool::default()) as _;
+        let reservation = MemoryConsumer::new("test").register(&pool);
+        (pool, reservation)
+    }
+
+    fn bounded_memory_pool_and_reservation(
+        size: usize,
+    ) -> (Arc<dyn MemoryPool>, MemoryReservation) {
+        let pool = Arc::new(GreedyMemoryPool::new(size)) as _;
+        let reservation = MemoryConsumer::new("test").register(&pool);
+        (pool, reservation)
+    }
+
+    async fn wait_for_buffering() {
+        // We do not have control over the spawned task, so the best we can do is to yield some
+        // cycles to the tokio runtime and let the task make progress on its own.
+        tokio::time::sleep(Duration::from_millis(1)).await;
+    }
+
+    async fn pull_ok_msg<T: SizedMessage>(
+        buffered: &mut MemoryBufferedStream<T>,
+    ) -> Result<T, Box<dyn Error>> {
+        Ok(timeout(Duration::from_millis(1), buffered.next())
+            .await?
+            .unwrap_or_else(|| internal_err!("Stream should not have finished"))?)
+    }
+
+    async fn pull_err_msg<T: SizedMessage + Debug>(
+        buffered: &mut MemoryBufferedStream<T>,
+    ) -> Result<DataFusionError, Box<dyn Error>> {
+        Ok(timeout(Duration::from_millis(1), buffered.next())
+            .await?
+            .map(|v| match v {
+                Ok(v) => internal_err!(
+                    "Stream should not have failed, but succeeded with {v:?}"
+                ),
+                Err(err) => Ok(err),
+            })
+            .unwrap_or_else(|| internal_err!("Stream should not have finished"))?)
+    }
+
+    async fn finished<T: SizedMessage>(
+        buffered: &mut MemoryBufferedStream<T>,
+    ) -> Result<(), Box<dyn Error>> {
+        match timeout(Duration::from_millis(1), buffered.next())
+            .await?
+            .is_none()
+        {
+            true => Ok(()),
+            false => internal_err!("Stream should have finished")?,
+        }
+    }
+
+    impl SizedMessage for usize {
+        fn size(&self) -> usize {
+            *self
+        }
+    }
+}
diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs
index 0eca27f8e40e0..ea1a87d091481 100644
--- a/datafusion/physical-plan/src/coalesce/mod.rs
+++ b/datafusion/physical-plan/src/coalesce/mod.rs
@@ -15,76 +15,38 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{
-    builder::StringViewBuilder, cast::AsArray, Array, ArrayRef, RecordBatch,
-    RecordBatchOptions,
-};
-use arrow::compute::concat_batches;
+use arrow::array::RecordBatch;
+use arrow::compute::BatchCoalescer;
 use arrow::datatypes::SchemaRef;
-use std::sync::Arc;
+use datafusion_common::{Result, assert_or_internal_err};
 
-/// Concatenate multiple [`RecordBatch`]es
-///
-/// `BatchCoalescer` concatenates multiple small [`RecordBatch`]es, produced by
-/// operations such as `FilterExec` and `RepartitionExec`, into larger ones for
-/// more efficient processing by subsequent operations.
-///
-/// # Background
-///
-/// Generally speaking, larger [`RecordBatch`]es are more efficient to process
-/// than smaller record batches (until the CPU cache is exceeded) because there
-/// is fixed processing overhead per batch. DataFusion tries to operate on
-/// batches of `target_batch_size` rows to amortize this overhead
-///
-/// ```text
-/// ┌────────────────────┐
-/// │    RecordBatch     │
-/// │   num_rows = 23    │
-/// └────────────────────┘                 ┌────────────────────┐
-///                                        │                    │
-/// ┌────────────────────┐     Coalesce    │                    │
-/// │                    │      Batches    │                    │
-/// │    RecordBatch     │                 │                    │
-/// │   num_rows = 50    │  ─ ─ ─ ─ ─ ─ ▶  │                    │
-/// │                    │                 │    RecordBatch     │
-/// │                    │                 │   num_rows = 106   │
-/// └────────────────────┘                 │                    │
-///                                        │                    │
-/// ┌────────────────────┐                 │                    │
-/// │                    │                 │                    │
-/// │    RecordBatch     │                 │                    │
-/// │   num_rows = 33    │                 └────────────────────┘
-/// │                    │
-/// └────────────────────┘
-/// ```
-///
-/// # Notes:
-///
-/// 1. Output rows are produced in the same order as the input rows
-///
-/// 2. The output is a sequence of batches, with all but the last being at least
-///    `target_batch_size` rows.
-///
-/// 3. Eventually this may also be able to handle other optimizations such as a
-///    combined filter/coalesce operation.
+/// Concatenate multiple [`RecordBatch`]es and apply a limit
 ///
+/// See [`BatchCoalescer`] for more details on how this works.
 #[derive(Debug)]
-pub struct BatchCoalescer {
-    /// The input schema
-    schema: SchemaRef,
-    /// Minimum number of rows for coalesces batches
-    target_batch_size: usize,
+pub struct LimitedBatchCoalescer {
+    /// The arrow structure that builds the output batches
+    inner: BatchCoalescer,
     /// Total number of rows returned so far
     total_rows: usize,
-    /// Buffered batches
-    buffer: Vec<RecordBatch>,
-    /// Buffered row count
-    buffered_rows: usize,
     /// Limit: maximum number of rows to fetch, `None` means fetch all rows
     fetch: Option<usize>,
+    /// Indicates if the coalescer is finished
+    finished: bool,
+}
+
+/// Status returned by [`LimitedBatchCoalescer::push_batch`]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PushBatchStatus {
+    /// The limit has **not** been reached, and more batches can be pushed
+    Continue,
+    /// The limit **has** been reached after processing this batch
+    /// The caller should call [`LimitedBatchCoalescer::finish`]
+    /// to flush any buffered rows and stop pushing more batches.
+    LimitReached,
 }
 
-impl BatchCoalescer {
+impl LimitedBatchCoalescer {
     /// Create a new `BatchCoalescer`
     ///
     /// # Arguments
@@ -98,187 +60,98 @@ impl BatchCoalescer {
         fetch: Option<usize>,
     ) -> Self {
         Self {
-            schema,
-            target_batch_size,
+            inner: BatchCoalescer::new(schema, target_batch_size)
+                .with_biggest_coalesce_batch_size(Some(target_batch_size / 2)),
             total_rows: 0,
-            buffer: vec![],
-            buffered_rows: 0,
             fetch,
+            finished: false,
         }
     }
 
     /// Return the schema of the output batches
     pub fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
+        self.inner.schema()
     }
 
-    /// Push next batch, and returns [`CoalescerState`] indicating the current
-    /// state of the buffer.
-    pub fn push_batch(&mut self, batch: RecordBatch) -> CoalescerState {
-        let batch = gc_string_view_batch(&batch);
-        if self.limit_reached(&batch) {
-            CoalescerState::LimitReached
-        } else if self.target_reached(batch) {
-            CoalescerState::TargetReached
-        } else {
-            CoalescerState::Continue
-        }
-    }
-
-    /// Return true if the there is no data buffered
-    pub fn is_empty(&self) -> bool {
-        self.buffer.is_empty()
-    }
-
-    /// Checks if the buffer will reach the specified limit after getting
-    /// `batch`.
+    /// Pushes the next [`RecordBatch`] into the coalescer and returns its status.
     ///
-    /// If fetch would be exceeded, slices the received batch, updates the
-    /// buffer with it, and returns `true`.
+    /// # Arguments
+    /// * `batch` - The [`RecordBatch`] to append.
+    ///
+    /// # Returns
+    /// * [`PushBatchStatus::Continue`] - More batches can still be pushed.
+    /// * [`PushBatchStatus::LimitReached`] - The row limit was reached after processing
+    ///   this batch. The caller should call [`Self::finish`] before retrieving the
+    ///   remaining buffered batches.
     ///
-    /// Otherwise: does nothing and returns `false`.
-    fn limit_reached(&mut self, batch: &RecordBatch) -> bool {
-        match self.fetch {
-            Some(fetch) if self.total_rows + batch.num_rows() >= fetch => {
+    /// # Errors
+    /// Returns an error if called after [`Self::finish`] or if the internal push
+    /// operation fails.
+    pub fn push_batch(&mut self, batch: RecordBatch) -> Result<PushBatchStatus> {
+        assert_or_internal_err!(
+            !self.finished,
+            "LimitedBatchCoalescer: cannot push batch after finish"
+        );
+
+        // if we are at the limit, return LimitReached
+        if let Some(fetch) = self.fetch {
+            // limit previously reached
+            if self.total_rows >= fetch {
+                return Ok(PushBatchStatus::LimitReached);
+            }
+
+            // limit now reached
+            if self.total_rows + batch.num_rows() >= fetch {
                 // Limit is reached
                 let remaining_rows = fetch - self.total_rows;
                 debug_assert!(remaining_rows > 0);
 
-                let batch = batch.slice(0, remaining_rows);
-                self.buffered_rows += batch.num_rows();
-                self.total_rows = fetch;
-                self.buffer.push(batch);
-                true
+                let batch_head = batch.slice(0, remaining_rows);
+                self.total_rows += batch_head.num_rows();
+                self.inner.push_batch(batch_head)?;
+                return Ok(PushBatchStatus::LimitReached);
             }
-            _ => false,
         }
-    }
 
-    /// Updates the buffer with the given batch.
-    ///
-    /// If the target batch size is reached, returns `true`. Otherwise, returns
-    /// `false`.
-    fn target_reached(&mut self, batch: RecordBatch) -> bool {
-        if batch.num_rows() == 0 {
-            false
-        } else {
-            self.total_rows += batch.num_rows();
-            self.buffered_rows += batch.num_rows();
-            self.buffer.push(batch);
-            self.buffered_rows >= self.target_batch_size
-        }
+        // Limit not reached, push the entire batch
+        self.total_rows += batch.num_rows();
+        self.inner.push_batch(batch)?;
+
+        Ok(PushBatchStatus::Continue)
     }
 
-    /// Concatenates and returns all buffered batches, and clears the buffer.
-    pub fn finish_batch(&mut self) -> datafusion_common::Result<RecordBatch> {
-        let batch = concat_batches(&self.schema, &self.buffer)?;
-        self.buffer.clear();
-        self.buffered_rows = 0;
-        Ok(batch)
+    /// Return true if there is no data buffered
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
     }
-}
 
-/// Indicates the state of the [`BatchCoalescer`] buffer after the
-/// [`BatchCoalescer::push_batch()`] operation.
-///
-/// The caller should take different actions, depending on the variant returned.
-pub enum CoalescerState {
-    /// Neither the limit nor the target batch size is reached.
+    /// Complete the current buffered batch and finish the coalescer
     ///
-    /// Action: continue pushing batches.
-    Continue,
-    /// The limit has been reached.
-    ///
-    /// Action: call [`BatchCoalescer::finish_batch()`] to get the final
-    /// buffered results as a batch and finish the query.
-    LimitReached,
-    /// The specified minimum number of rows a batch should have is reached.
-    ///
-    /// Action: call [`BatchCoalescer::finish_batch()`] to get the current
-    /// buffered results as a batch and then continue pushing batches.
-    TargetReached,
-}
-
-/// Heuristically compact `StringViewArray`s to reduce memory usage, if needed
-///
-/// Decides when to consolidate the StringView into a new buffer to reduce
-/// memory usage and improve string locality for better performance.
-///
-/// This differs from `StringViewArray::gc` because:
-/// 1. It may not compact the array depending on a heuristic.
-/// 2. It uses a precise block size to reduce the number of buffers to track.
-///
-/// # Heuristic
-///
-/// If the average size of each view is larger than 32 bytes, we compact the array.
-///
-/// `StringViewArray` include pointers to buffer that hold the underlying data.
-/// One of the great benefits of `StringViewArray` is that many operations
-/// (e.g., `filter`) can be done without copying the underlying data.
-///
-/// However, after a while (e.g., after `FilterExec` or `HashJoinExec`) the
-/// `StringViewArray` may only refer to a small portion of the buffer,
-/// significantly increasing memory usage.
-fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch {
-    let new_columns: Vec<ArrayRef> = batch
-        .columns()
-        .iter()
-        .map(|c| {
-            // Try to re-create the `StringViewArray` to prevent holding the underlying buffer too long.
-            let Some(s) = c.as_string_view_opt() else {
-                return Arc::clone(c);
-            };
-            let ideal_buffer_size: usize = s
-                .views()
-                .iter()
-                .map(|v| {
-                    let len = (*v as u32) as usize;
-                    if len > 12 {
-                        len
-                    } else {
-                        0
-                    }
-                })
-                .sum();
-            let actual_buffer_size = s.get_buffer_memory_size();
-
-            // Re-creating the array copies data and can be time consuming.
-            // We only do it if the array is sparse
-            if actual_buffer_size > (ideal_buffer_size * 2) {
-                // We set the block size to `ideal_buffer_size` so that the new StringViewArray only has one buffer, which accelerate later concat_batches.
-                // See https://github.com/apache/arrow-rs/issues/6094 for more details.
-                let mut builder = StringViewBuilder::with_capacity(s.len());
-                if ideal_buffer_size > 0 {
-                    builder = builder.with_fixed_block_size(ideal_buffer_size as u32);
-                }
-
-                for v in s.iter() {
-                    builder.append_option(v);
-                }
-
-                let gc_string = builder.finish();
+    /// Any subsequent calls to `push_batch()` will return an Err
+    pub fn finish(&mut self) -> Result<()> {
+        self.inner.finish_buffered_batch()?;
+        self.finished = true;
+        Ok(())
+    }
 
-                debug_assert!(gc_string.data_buffers().len() <= 1); // buffer count can be 0 if the `ideal_buffer_size` is 0
+    pub(crate) fn is_finished(&self) -> bool {
+        self.finished
+    }
 
-                Arc::new(gc_string)
-            } else {
-                Arc::clone(c)
-            }
-        })
-        .collect();
-    let mut options = RecordBatchOptions::new();
-    options = options.with_row_count(Some(batch.num_rows()));
-    RecordBatch::try_new_with_options(batch.schema(), new_columns, &options)
-        .expect("Failed to re-create the gc'ed record batch")
+    /// Return the next completed batch, if any
+    pub fn next_completed_batch(&mut self) -> Option<RecordBatch> {
+        self.inner.next_completed_batch()
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use std::ops::Range;
-
     use super::*;
+    use std::ops::Range;
+    use std::sync::Arc;
 
-    use arrow::array::{builder::ArrayBuilder, StringViewArray, UInt32Array};
+    use arrow::array::UInt32Array;
+    use arrow::compute::concat_batches;
     use arrow::datatypes::{DataType, Field, Schema};
 
     #[test]
@@ -286,9 +159,9 @@ mod tests {
         let batch = uint32_batch(0..8);
         Test::new()
             .with_batches(std::iter::repeat_n(batch, 10))
-            // expected output is batches of at least 20 rows (except for the final batch)
+            // expected output is batches of exactly 21 rows (except for the final batch)
             .with_target_batch_size(21)
-            .with_expected_output_sizes(vec![24, 24, 24, 8])
+            .with_expected_output_sizes(vec![21, 21, 21, 17])
             .run()
     }
 
@@ -301,7 +174,7 @@ mod tests {
             // expected to behave the same as `test_concat_batches`
             .with_target_batch_size(21)
             .with_fetch(Some(100))
-            .with_expected_output_sizes(vec![24, 24, 24, 8])
+            .with_expected_output_sizes(vec![21, 21, 21, 17])
             .run();
     }
 
@@ -313,7 +186,7 @@ mod tests {
             // input is 10 batches x 8 rows (80 rows) with fetch limit of 50
             .with_target_batch_size(21)
             .with_fetch(Some(50))
-            .with_expected_output_sizes(vec![24, 24, 2])
+            .with_expected_output_sizes(vec![21, 21, 8])
             .run();
     }
 
@@ -323,7 +196,7 @@ mod tests {
         Test::new()
             .with_batches(std::iter::repeat_n(batch, 10))
             // input is 10 batches x 8 rows (80 rows) with fetch limit of 48
-            .with_target_batch_size(21)
+            .with_target_batch_size(24)
             .with_fetch(Some(48))
             .with_expected_output_sizes(vec![24, 24])
             .run();
@@ -352,7 +225,7 @@ mod tests {
             .run()
     }
 
-    /// Test for [`BatchCoalescer`]
+    /// Test for [`LimitedBatchCoalescer`]
     ///
     /// Pushes the input batches to the coalescer and verifies that the resulting
     /// batches have the expected number of rows and contents.
@@ -425,26 +298,32 @@ mod tests {
             let single_input_batch = concat_batches(&schema, &input_batches).unwrap();
 
             let mut coalescer =
-                BatchCoalescer::new(Arc::clone(&schema), target_batch_size, fetch);
+                LimitedBatchCoalescer::new(Arc::clone(&schema), target_batch_size, fetch);
 
             let mut output_batches = vec![];
             for batch in input_batches {
-                match coalescer.push_batch(batch) {
-                    CoalescerState::Continue => {}
-                    CoalescerState::LimitReached => {
-                        output_batches.push(coalescer.finish_batch().unwrap());
-                        break;
+                match coalescer.push_batch(batch).unwrap() {
+                    PushBatchStatus::Continue => {
+                        // continue pushing batches
                     }
-                    CoalescerState::TargetReached => {
-                        coalescer.buffered_rows = 0;
-                        output_batches.push(coalescer.finish_batch().unwrap());
+                    PushBatchStatus::LimitReached => {
+                        break;
                     }
                 }
             }
-            if coalescer.buffered_rows != 0 {
-                output_batches.extend(coalescer.buffer);
+            coalescer.finish().unwrap();
+            while let Some(batch) = coalescer.next_completed_batch() {
+                output_batches.push(batch);
             }
 
+            let actual_output_sizes: Vec<usize> =
+                output_batches.iter().map(|b| b.num_rows()).collect();
+            assert_eq!(
+                expected_output_sizes, actual_output_sizes,
+                "Unexpected number of rows in output batches\n\
+                Expected\n{expected_output_sizes:#?}\nActual:{actual_output_sizes:#?}"
+            );
+
             // make sure we got the expected number of output batches and content
             let mut starting_idx = 0;
             assert_eq!(expected_output_sizes.len(), output_batches.len());
@@ -488,112 +367,8 @@ mod tests {
         .unwrap()
     }
 
-    #[test]
-    fn test_gc_string_view_batch_small_no_compact() {
-        // view with only short strings (no buffers) --> no need to compact
-        let array = StringViewTest {
-            rows: 1000,
-            strings: vec![Some("a"), Some("b"), Some("c")],
-        }
-        .build();
-
-        let gc_array = do_gc(array.clone());
-        compare_string_array_values(&array, &gc_array);
-        assert_eq!(array.data_buffers().len(), 0);
-        assert_eq!(array.data_buffers().len(), gc_array.data_buffers().len()); // no compaction
-    }
-
-    #[test]
-    fn test_gc_string_view_test_batch_empty() {
-        let schema = Schema::empty();
-        let batch = RecordBatch::new_empty(schema.into());
-        let output_batch = gc_string_view_batch(&batch);
-        assert_eq!(batch.num_columns(), output_batch.num_columns());
-        assert_eq!(batch.num_rows(), output_batch.num_rows());
-    }
-
-    #[test]
-    fn test_gc_string_view_batch_large_no_compact() {
-        // view with large strings (has buffers) but full --> no need to compact
-        let array = StringViewTest {
-            rows: 1000,
-            strings: vec![Some("This string is longer than 12 bytes")],
-        }
-        .build();
-
-        let gc_array = do_gc(array.clone());
-        compare_string_array_values(&array, &gc_array);
-        assert_eq!(array.data_buffers().len(), 5);
-        assert_eq!(array.data_buffers().len(), gc_array.data_buffers().len()); // no compaction
-    }
-
-    #[test]
-    fn test_gc_string_view_batch_large_slice_compact() {
-        // view with large strings (has buffers) and only partially used  --> no need to compact
-        let array = StringViewTest {
-            rows: 1000,
-            strings: vec![Some("this string is longer than 12 bytes")],
-        }
-        .build();
-
-        // slice only 11 rows, so most of the buffer is not used
-        let array = array.slice(11, 22);
-
-        let gc_array = do_gc(array.clone());
-        compare_string_array_values(&array, &gc_array);
-        assert_eq!(array.data_buffers().len(), 5);
-        assert_eq!(gc_array.data_buffers().len(), 1); // compacted into a single buffer
-    }
-
-    /// Compares the values of two string view arrays
-    fn compare_string_array_values(arr1: &StringViewArray, arr2: &StringViewArray) {
-        assert_eq!(arr1.len(), arr2.len());
-        for (s1, s2) in arr1.iter().zip(arr2.iter()) {
-            assert_eq!(s1, s2);
-        }
-    }
-
-    /// runs garbage collection on string view array
-    /// and ensures the number of rows are the same
-    fn do_gc(array: StringViewArray) -> StringViewArray {
-        let batch =
-            RecordBatch::try_from_iter(vec![("a", Arc::new(array) as ArrayRef)]).unwrap();
-        let gc_batch = gc_string_view_batch(&batch);
-        assert_eq!(batch.num_rows(), gc_batch.num_rows());
-        assert_eq!(batch.schema(), gc_batch.schema());
-        gc_batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringViewArray>()
-            .unwrap()
-            .clone()
-    }
-
-    /// Describes parameters for creating a `StringViewArray`
-    struct StringViewTest {
-        /// The number of rows in the array
-        rows: usize,
-        /// The strings to use in the array (repeated over and over
-        strings: Vec<Option<&'static str>>,
-    }
-
-    impl StringViewTest {
-        /// Create a `StringViewArray` with the parameters specified in this struct
-        fn build(self) -> StringViewArray {
-            let mut builder =
-                StringViewBuilder::with_capacity(100).with_fixed_block_size(8192);
-            loop {
-                for &v in self.strings.iter() {
-                    builder.append_option(v);
-                    if builder.len() >= self.rows {
-                        return builder.finish();
-                    }
-                }
-            }
-        }
-    }
     fn batch_to_pretty_strings(batch: &RecordBatch) -> String {
-        arrow::util::pretty::pretty_format_batches(&[batch.clone()])
+        arrow::util::pretty::pretty_format_batches(std::slice::from_ref(batch))
             .unwrap()
             .to_string()
     }
diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs
index f35231fb6a995..3e8bfc7f81724 100644
--- a/datafusion/physical-plan/src/coalesce_batches.rs
+++ b/datafusion/physical-plan/src/coalesce_batches.rs
@@ -24,22 +24,28 @@ use std::task::{Context, Poll};
 
 use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use super::{DisplayAs, ExecutionPlanProperties, PlanProperties, Statistics};
+use crate::projection::ProjectionExec;
 use crate::{
     DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream,
+    check_if_same_properties,
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::Result;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::PhysicalExpr;
 
-use crate::coalesce::{BatchCoalescer, CoalescerState};
+use crate::coalesce::{LimitedBatchCoalescer, PushBatchStatus};
 use crate::execution_plan::CardinalityEffect;
 use crate::filter_pushdown::{
-    ChildPushdownResult, FilterDescription, FilterPushdownPropagation,
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
 };
+use crate::sort_pushdown::SortOrderPushdownResult;
 use datafusion_common::config::ConfigOptions;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use futures::ready;
 use futures::stream::{Stream, StreamExt};
 
@@ -52,7 +58,11 @@ use futures::stream::{Stream, StreamExt};
 /// buffering and returns the final batch once the number of collected rows
 /// reaches the `fetch` value.
 ///
-/// See [`BatchCoalescer`] for more information
+/// See [`LimitedBatchCoalescer`] for more information
+#[deprecated(
+    since = "52.0.0",
+    note = "We now use BatchCoalescer from arrow-rs instead of a dedicated operator"
+)]
 #[derive(Debug, Clone)]
 pub struct CoalesceBatchesExec {
     /// The input plan
@@ -63,9 +73,10 @@ pub struct CoalesceBatchesExec {
     fetch: Option<usize>,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
+#[expect(deprecated)]
 impl CoalesceBatchesExec {
     /// Create a new CoalesceBatchesExec
     pub fn new(input: Arc<dyn ExecutionPlan>, target_batch_size: usize) -> Self {
@@ -75,7 +86,7 @@ impl CoalesceBatchesExec {
             target_batch_size,
             fetch: None,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -106,8 +117,20 @@ impl CoalesceBatchesExec {
             input.boundedness(),
         )
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
+#[expect(deprecated)]
 impl DisplayAs for CoalesceBatchesExec {
     fn fmt_as(
         &self,
@@ -138,6 +161,7 @@ impl DisplayAs for CoalesceBatchesExec {
     }
 }
 
+#[expect(deprecated)]
 impl ExecutionPlan for CoalesceBatchesExec {
     fn name(&self) -> &'static str {
         "CoalesceBatchesExec"
@@ -148,7 +172,7 @@ impl ExecutionPlan for CoalesceBatchesExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -164,12 +188,20 @@ impl ExecutionPlan for CoalesceBatchesExec {
         vec![false]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(
-            CoalesceBatchesExec::new(Arc::clone(&children[0]), self.target_batch_size)
+            CoalesceBatchesExec::new(children.swap_remove(0), self.target_batch_size)
                 .with_fetch(self.fetch),
         ))
     }
@@ -181,14 +213,13 @@ impl ExecutionPlan for CoalesceBatchesExec {
     ) -> Result<SendableRecordBatchStream> {
         Ok(Box::pin(CoalesceBatchesStream {
             input: self.input.execute(partition, context)?,
-            coalescer: BatchCoalescer::new(
+            coalescer: LimitedBatchCoalescer::new(
                 self.input.schema(),
                 self.target_batch_size,
                 self.fetch,
             ),
             baseline_metrics: BaselineMetrics::new(&self.metrics, partition),
-            // Start by pulling data
-            inner_state: CoalesceBatchesStreamState::Pull,
+            completed: false,
         }))
     }
 
@@ -196,17 +227,9 @@ impl ExecutionPlan for CoalesceBatchesExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        self.input.partition_statistics(partition)?.with_fetch(
-            self.schema(),
-            self.fetch,
-            0,
-            1,
-        )
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let stats = Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?))
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
@@ -215,7 +238,7 @@ impl ExecutionPlan for CoalesceBatchesExec {
             target_batch_size: self.target_batch_size,
             fetch: limit,
             metrics: self.metrics.clone(),
-            cache: self.cache.clone(),
+            cache: Arc::clone(&self.cache),
         }))
     }
 
@@ -227,23 +250,48 @@ impl ExecutionPlan for CoalesceBatchesExec {
         CardinalityEffect::Equal
     }
 
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        match self.input.try_swapping_with_projection(projection)? {
+            Some(new_input) => Ok(Some(
+                Arc::new(self.clone()).with_new_children(vec![new_input])?,
+            )),
+            None => Ok(None),
+        }
+    }
+
     fn gather_filters_for_pushdown(
         &self,
+        _phase: FilterPushdownPhase,
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterDescription> {
-        Ok(FilterDescription::new_with_child_count(1)
-            .all_parent_filters_supported(parent_filters))
+        FilterDescription::from_children(parent_filters, &self.children())
     }
 
     fn handle_child_pushdown_result(
         &self,
+        _phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
-        Ok(FilterPushdownPropagation::transparent(
-            child_pushdown_result,
-        ))
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // CoalesceBatchesExec is transparent for sort ordering - it preserves order
+        // Delegate to the child and wrap with a new CoalesceBatchesExec
+        self.input.try_pushdown_sort(order)?.try_map(|new_input| {
+            Ok(Arc::new(
+                CoalesceBatchesExec::new(new_input, self.target_batch_size)
+                    .with_fetch(self.fetch),
+            ) as Arc<dyn ExecutionPlan>)
+        })
     }
 }
 
@@ -252,12 +300,11 @@ struct CoalesceBatchesStream {
     /// The input plan
     input: SendableRecordBatchStream,
     /// Buffer for combining batches
-    coalescer: BatchCoalescer,
+    coalescer: LimitedBatchCoalescer,
     /// Execution metrics
     baseline_metrics: BaselineMetrics,
-    /// The current inner state of the stream. This state dictates the current
-    /// action or operation to be performed in the streaming process.
-    inner_state: CoalesceBatchesStreamState,
+    /// is the input stream exhausted or limit reached?
+    completed: bool,
 }
 
 impl Stream for CoalesceBatchesStream {
@@ -277,50 +324,6 @@ impl Stream for CoalesceBatchesStream {
     }
 }
 
-/// Enumeration of possible states for `CoalesceBatchesStream`.
-/// It represents different stages in the lifecycle of a stream of record batches.
-///
-/// An example of state transition:
-/// Notation:
-/// `[3000]`: A batch with size 3000
-/// `{[2000], [3000]}`: `CoalesceBatchStream`'s internal buffer with 2 batches buffered
-/// Input of `CoalesceBatchStream` will generate three batches `[2000], [3000], [4000]`
-/// The coalescing procedure will go through the following steps with 4096 coalescing threshold:
-/// 1. Read the first batch and get it buffered.
-/// - initial state: `Pull`
-/// - initial buffer: `{}`
-/// - updated buffer: `{[2000]}`
-/// - next state: `Pull`
-/// 2. Read the second batch, the coalescing target is reached since 2000 + 3000 > 4096
-/// - initial state: `Pull`
-/// - initial buffer: `{[2000]}`
-/// - updated buffer: `{[2000], [3000]}`
-/// - next state: `ReturnBuffer`
-/// 4. Two batches in the batch get merged and consumed by the upstream operator.
-/// - initial state: `ReturnBuffer`
-/// - initial buffer: `{[2000], [3000]}`
-/// - updated buffer: `{}`
-/// - next state: `Pull`
-/// 5. Read the third input batch.
-/// - initial state: `Pull`
-/// - initial buffer: `{}`
-/// - updated buffer: `{[4000]}`
-/// - next state: `Pull`
-/// 5. The input is ended now. Jump to exhaustion state preparing the finalized data.
-/// - initial state: `Pull`
-/// - initial buffer: `{[4000]}`
-/// - updated buffer: `{[4000]}`
-/// - next state: `Exhausted`
-#[derive(Debug, Clone, Eq, PartialEq)]
-enum CoalesceBatchesStreamState {
-    /// State to pull a new batch from the input stream.
-    Pull,
-    /// State to return a buffered batch.
-    ReturnBuffer,
-    /// State indicating that the stream is exhausted.
-    Exhausted,
-}
-
 impl CoalesceBatchesStream {
     fn poll_next_inner(
         self: &mut Pin<&mut Self>,
@@ -328,51 +331,39 @@ impl CoalesceBatchesStream {
     ) -> Poll<Option<Result<RecordBatch>>> {
         let cloned_time = self.baseline_metrics.elapsed_compute().clone();
         loop {
-            match &self.inner_state {
-                CoalesceBatchesStreamState::Pull => {
-                    // Attempt to pull the next batch from the input stream.
-                    let input_batch = ready!(self.input.poll_next_unpin(cx));
-                    // Start timing the operation. The timer records time upon being dropped.
-                    let _timer = cloned_time.timer();
-
-                    match input_batch {
-                        Some(Ok(batch)) => match self.coalescer.push_batch(batch) {
-                            CoalescerState::Continue => {}
-                            CoalescerState::LimitReached => {
-                                self.inner_state = CoalesceBatchesStreamState::Exhausted;
-                            }
-                            CoalescerState::TargetReached => {
-                                self.inner_state =
-                                    CoalesceBatchesStreamState::ReturnBuffer;
-                            }
-                        },
-                        None => {
-                            // End of input stream, but buffered batches might still be present.
-                            self.inner_state = CoalesceBatchesStreamState::Exhausted;
+            // If there is any completed batch ready, return it
+            if let Some(batch) = self.coalescer.next_completed_batch() {
+                return Poll::Ready(Some(Ok(batch)));
+            }
+            if self.completed {
+                // If input is done and no batches are ready, return None to signal end of stream.
+                return Poll::Ready(None);
+            }
+            // Attempt to pull the next batch from the input stream.
+            let input_batch = ready!(self.input.poll_next_unpin(cx));
+            // Start timing the operation. The timer records time upon being dropped.
+            let _timer = cloned_time.timer();
+
+            match input_batch {
+                None => {
+                    // Input stream is exhausted, finalize any remaining batches
+                    self.completed = true;
+                    self.coalescer.finish()?;
+                }
+                Some(Ok(batch)) => {
+                    match self.coalescer.push_batch(batch)? {
+                        PushBatchStatus::Continue => {
+                            // Keep pushing more batches
+                        }
+                        PushBatchStatus::LimitReached => {
+                            // limit was reached, so stop early
+                            self.completed = true;
+                            self.coalescer.finish()?;
                         }
-                        other => return Poll::Ready(other),
                     }
                 }
-                CoalesceBatchesStreamState::ReturnBuffer => {
-                    let _timer = cloned_time.timer();
-                    // Combine buffered batches into one batch and return it.
-                    let batch = self.coalescer.finish_batch()?;
-                    // Set to pull state for the next iteration.
-                    self.inner_state = CoalesceBatchesStreamState::Pull;
-                    return Poll::Ready(Some(Ok(batch)));
-                }
-                CoalesceBatchesStreamState::Exhausted => {
-                    // Handle the end of the input stream.
-                    return if self.coalescer.is_empty() {
-                        // If buffer is empty, return None indicating the stream is fully consumed.
-                        Poll::Ready(None)
-                    } else {
-                        let _timer = cloned_time.timer();
-                        // If the buffer still contains batches, prepare to return them.
-                        let batch = self.coalescer.finish_batch()?;
-                        Poll::Ready(Some(Ok(batch)))
-                    };
-                }
+                // Error case
+                other => return Poll::Ready(other),
             }
         }
     }
diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs
index 114f830688c97..5ea3589f22b3e 100644
--- a/datafusion/physical-plan/src/coalesce_partitions.rs
+++ b/datafusion/physical-plan/src/coalesce_partitions.rs
@@ -27,12 +27,18 @@ use super::{
     DisplayAs, ExecutionPlanProperties, PlanProperties, SendableRecordBatchStream,
     Statistics,
 };
-use crate::execution_plan::CardinalityEffect;
-use crate::projection::{make_with_child, ProjectionExec};
-use crate::{DisplayFormatType, ExecutionPlan, Partitioning};
-
-use datafusion_common::{internal_err, Result};
+use crate::execution_plan::{CardinalityEffect, EvaluationType, SchedulingType};
+use crate::filter_pushdown::{FilterDescription, FilterPushdownPhase};
+use crate::projection::{ProjectionExec, make_with_child};
+use crate::sort_pushdown::SortOrderPushdownResult;
+use crate::{DisplayFormatType, ExecutionPlan, Partitioning, check_if_same_properties};
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_err};
 use datafusion_execution::TaskContext;
+use datafusion_physical_expr::PhysicalExpr;
 
 /// Merge execution plan executes partitions in parallel and combines them into a single
 /// partition. No guarantees are made about the order of the resulting partition.
@@ -42,7 +48,7 @@ pub struct CoalescePartitionsExec {
     input: Arc<dyn ExecutionPlan>,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     /// Optional number of rows to fetch. Stops producing rows after this fetch
     pub(crate) fetch: Option<usize>,
 }
@@ -54,7 +60,7 @@ impl CoalescePartitionsExec {
         CoalescePartitionsExec {
             input,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
             fetch: None,
         }
     }
@@ -72,6 +78,16 @@ impl CoalescePartitionsExec {
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(input: &Arc<dyn ExecutionPlan>) -> PlanProperties {
+        let input_partitions = input.output_partitioning().partition_count();
+        let (drive, scheduling) = if input_partitions > 1 {
+            (EvaluationType::Eager, SchedulingType::Cooperative)
+        } else {
+            (
+                input.properties().evaluation_type,
+                input.properties().scheduling_type,
+            )
+        };
+
         // Coalescing partitions loses existing orderings:
         let mut eq_properties = input.equivalence_properties().clone();
         eq_properties.clear_orderings();
@@ -82,6 +98,19 @@ impl CoalescePartitionsExec {
             input.pipeline_behavior(),
             input.boundedness(),
         )
+        .with_evaluation_type(drive)
+        .with_scheduling_type(scheduling)
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
     }
 }
 
@@ -118,7 +147,7 @@ impl ExecutionPlan for CoalescePartitionsExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -130,11 +159,19 @@ impl ExecutionPlan for CoalescePartitionsExec {
         vec![false]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let mut plan = CoalescePartitionsExec::new(Arc::clone(&children[0]));
+        check_if_same_properties!(self, children);
+        let mut plan = CoalescePartitionsExec::new(children.swap_remove(0));
         plan.fetch = self.fetch;
         Ok(Arc::new(plan))
     }
@@ -145,9 +182,11 @@ impl ExecutionPlan for CoalescePartitionsExec {
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
         // CoalescePartitionsExec produces a single partition
-        if 0 != partition {
-            return internal_err!("CoalescePartitionsExec invalid partition {partition}");
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "CoalescePartitionsExec invalid partition {partition}"
+        );
 
         let input_partitions = self.input.output_partitioning().partition_count();
         match input_partitions {
@@ -155,8 +194,18 @@ impl ExecutionPlan for CoalescePartitionsExec {
                 "CoalescePartitionsExec requires at least one input partition"
             ),
             1 => {
-                // bypass any threading / metrics if there is a single partition
-                self.input.execute(0, context)
+                // single-partition path: execute child directly, but ensure fetch is respected
+                // (wrap with ObservedStream only if fetch is present so we don't add overhead otherwise)
+                let child_stream = self.input.execute(0, context)?;
+                if self.fetch.is_some() {
+                    let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
+                    return Ok(Box::pin(ObservedStream::new(
+                        child_stream,
+                        baseline_metrics,
+                        self.fetch,
+                    )));
+                }
+                Ok(child_stream)
             }
             _ => {
                 let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
@@ -195,14 +244,9 @@ impl ExecutionPlan for CoalescePartitionsExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Statistics> {
-        self.input
-            .partition_statistics(None)?
-            .with_fetch(self.schema(), self.fetch, 0, 1)
+    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let stats = Arc::unwrap_or_clone(self.input.partition_statistics(None)?);
+        Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?))
     }
 
     fn supports_limit_pushdown(&self) -> bool {
@@ -245,16 +289,74 @@ impl ExecutionPlan for CoalescePartitionsExec {
             input: Arc::clone(&self.input),
             fetch: limit,
             metrics: self.metrics.clone(),
-            cache: self.cache.clone(),
+            cache: Arc::clone(&self.cache),
         }))
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
+    fn gather_filters_for_pushdown(
+        &self,
+        _phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        _config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        FilterDescription::from_children(parent_filters, &self.children())
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // CoalescePartitionsExec merges multiple partitions into one, which loses
+        // global ordering. However, we can still push the sort requirement down
+        // to optimize individual partitions - the Sort operator above will handle
+        // the global ordering.
+        //
+        // Note: The result will always be at most Inexact (never Exact) when there
+        // are multiple partitions, because merging destroys global ordering.
+        let result = self.input.try_pushdown_sort(order)?;
+
+        // If we have multiple partitions, we can't return Exact even if the
+        // underlying source claims Exact - merging destroys global ordering
+        let has_multiple_partitions =
+            self.input.output_partitioning().partition_count() > 1;
+
+        result
+            .try_map(|new_input| {
+                Ok(
+                    Arc::new(
+                        CoalescePartitionsExec::new(new_input).with_fetch(self.fetch),
+                    ) as Arc<dyn ExecutionPlan>,
+                )
+            })
+            .map(|r| {
+                if has_multiple_partitions {
+                    // Downgrade Exact to Inexact when merging multiple partitions
+                    r.into_inexact()
+                } else {
+                    r
+                }
+            })
+    }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::test::exec::{
-        assert_strong_count_converges_to_zero, BlockingExec, PanicExec,
+        BlockingExec, PanicExec, assert_strong_count_converges_to_zero,
     };
     use crate::test::{self, assert_is_pending};
     use crate::{collect, common};
@@ -327,4 +429,110 @@ mod tests {
 
         collect(coalesce_partitions_exec, task_ctx).await.unwrap();
     }
+
+    #[tokio::test]
+    async fn test_single_partition_with_fetch() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Use existing scan_partitioned with 1 partition (returns 100 rows per partition)
+        let input = test::scan_partitioned(1);
+
+        // Test with fetch=3
+        let coalesce = CoalescePartitionsExec::new(input).with_fetch(Some(3));
+
+        let stream = coalesce.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum();
+        assert_eq!(row_count, 3, "Should only return 3 rows due to fetch=3");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multi_partition_with_fetch_one() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Create 4 partitions, each with 100 rows
+        // This simulates the real-world scenario where each partition has data
+        let input = test::scan_partitioned(4);
+
+        // Test with fetch=1 (the original bug: was returning multiple rows instead of 1)
+        let coalesce = CoalescePartitionsExec::new(input).with_fetch(Some(1));
+
+        let stream = coalesce.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum();
+        assert_eq!(
+            row_count, 1,
+            "Should only return 1 row due to fetch=1, not one per partition"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_single_partition_without_fetch() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Use scan_partitioned with 1 partition
+        let input = test::scan_partitioned(1);
+
+        // Test without fetch (should return all rows)
+        let coalesce = CoalescePartitionsExec::new(input);
+
+        let stream = coalesce.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum();
+        assert_eq!(
+            row_count, 100,
+            "Should return all 100 rows when fetch is None"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_single_partition_fetch_larger_than_batch() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Use scan_partitioned with 1 partition (returns 100 rows)
+        let input = test::scan_partitioned(1);
+
+        // Test with fetch larger than available rows
+        let coalesce = CoalescePartitionsExec::new(input).with_fetch(Some(200));
+
+        let stream = coalesce.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum();
+        assert_eq!(
+            row_count, 100,
+            "Should return all available rows (100) when fetch (200) is larger"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multi_partition_fetch_exact_match() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Create 4 partitions, each with 100 rows
+        let num_partitions = 4;
+        let csv = test::scan_partitioned(num_partitions);
+
+        // Test with fetch=400 (exactly all rows)
+        let coalesce = CoalescePartitionsExec::new(csv).with_fetch(Some(400));
+
+        let stream = coalesce.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum();
+        assert_eq!(row_count, 400, "Should return exactly 400 rows");
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/column_rewriter.rs b/datafusion/physical-plan/src/column_rewriter.rs
new file mode 100644
index 0000000000000..7cd8656304554
--- /dev/null
+++ b/datafusion/physical-plan/src/column_rewriter.rs
@@ -0,0 +1,383 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion_common::{
+    DataFusionError, HashMap,
+    tree_node::{Transformed, TreeNodeRecursion, TreeNodeRewriter},
+};
+use datafusion_physical_expr::{PhysicalExpr, expressions::Column};
+
+/// Rewrite column references in a physical expr according to a mapping.
+///
+/// This rewriter traverses the expression tree and replaces [`Column`] nodes
+/// with the corresponding expression found in the `column_map`.
+///
+/// If a column is found in the map, it is replaced by the mapped expression.
+/// If a column is NOT found in the map, a `DataFusionError::Internal` is
+/// returned.
+pub struct PhysicalColumnRewriter<'a> {
+    /// Mapping from original column to new column.
+    pub column_map: &'a HashMap<Column, Arc<dyn PhysicalExpr>>,
+}
+
+impl<'a> PhysicalColumnRewriter<'a> {
+    /// Create a new PhysicalColumnRewriter with the given column mapping.
+    pub fn new(column_map: &'a HashMap<Column, Arc<dyn PhysicalExpr>>) -> Self {
+        Self { column_map }
+    }
+}
+
+impl<'a> TreeNodeRewriter for PhysicalColumnRewriter<'a> {
+    type Node = Arc<dyn PhysicalExpr>;
+
+    fn f_down(
+        &mut self,
+        node: Self::Node,
+    ) -> datafusion_common::Result<Transformed<Self::Node>> {
+        if let Some(column) = node.as_any().downcast_ref::<Column>() {
+            if let Some(new_column) = self.column_map.get(column) {
+                // jump to prevent rewriting the new sub-expression again
+                return Ok(Transformed::new(
+                    Arc::clone(new_column),
+                    true,
+                    TreeNodeRecursion::Jump,
+                ));
+            } else {
+                // Column not found in mapping
+                return Err(DataFusionError::Internal(format!(
+                    "Column {column:?} not found in column mapping {:?}",
+                    self.column_map
+                )));
+            }
+        }
+        Ok(Transformed::no(node))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::{DataFusionError, Result, tree_node::TreeNode};
+    use datafusion_physical_expr::{
+        PhysicalExpr,
+        expressions::{Column, binary, col, lit},
+    };
+    use std::sync::Arc;
+
+    /// Helper function to create a test schema
+    fn create_test_schema() -> Arc<Schema> {
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+            Field::new("d", DataType::Int32, true),
+            Field::new("e", DataType::Int32, true),
+            Field::new("new_col", DataType::Int32, true),
+            Field::new("inner_col", DataType::Int32, true),
+            Field::new("another_col", DataType::Int32, true),
+        ]))
+    }
+
+    /// Helper function to create a complex nested expression with multiple columns
+    /// Create: (col_a + col_b) * (col_c - col_d) + col_e
+    fn create_complex_expression(schema: &Schema) -> Arc<dyn PhysicalExpr> {
+        let col_a = col("a", schema).unwrap();
+        let col_b = col("b", schema).unwrap();
+        let col_c = col("c", schema).unwrap();
+        let col_d = col("d", schema).unwrap();
+        let col_e = col("e", schema).unwrap();
+
+        let add_expr =
+            binary(col_a, datafusion_expr::Operator::Plus, col_b, schema).unwrap();
+        let sub_expr =
+            binary(col_c, datafusion_expr::Operator::Minus, col_d, schema).unwrap();
+        let mul_expr = binary(
+            add_expr,
+            datafusion_expr::Operator::Multiply,
+            sub_expr,
+            schema,
+        )
+        .unwrap();
+        binary(mul_expr, datafusion_expr::Operator::Plus, col_e, schema).unwrap()
+    }
+
+    /// Helper function to create a deeply nested expression
+    /// Create: col_a + (col_b + (col_c + (col_d + col_e)))
+    fn create_deeply_nested_expression(schema: &Schema) -> Arc<dyn PhysicalExpr> {
+        let col_a = col("a", schema).unwrap();
+        let col_b = col("b", schema).unwrap();
+        let col_c = col("c", schema).unwrap();
+        let col_d = col("d", schema).unwrap();
+        let col_e = col("e", schema).unwrap();
+
+        let inner1 =
+            binary(col_d, datafusion_expr::Operator::Plus, col_e, schema).unwrap();
+        let inner2 =
+            binary(col_c, datafusion_expr::Operator::Plus, inner1, schema).unwrap();
+        let inner3 =
+            binary(col_b, datafusion_expr::Operator::Plus, inner2, schema).unwrap();
+        binary(col_a, datafusion_expr::Operator::Plus, inner3, schema).unwrap()
+    }
+
+    #[test]
+    fn test_simple_column_replacement_with_jump() -> Result<()> {
+        let schema = create_test_schema();
+
+        // Test that Jump prevents re-processing of replaced columns
+        let mut column_map = HashMap::new();
+        column_map.insert(Column::new_with_schema("a", &schema).unwrap(), lit(42i32));
+        column_map.insert(
+            Column::new_with_schema("b", &schema).unwrap(),
+            lit("replaced_b"),
+        );
+        column_map.insert(
+            Column::new_with_schema("c", &schema).unwrap(),
+            col("c", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("d", &schema).unwrap(),
+            col("d", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("e", &schema).unwrap(),
+            col("e", &schema).unwrap(),
+        );
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+        let expr = create_complex_expression(&schema);
+
+        let result = expr.rewrite(&mut rewriter)?;
+
+        // Verify the transformation occurred
+        assert!(result.transformed);
+
+        assert_eq!(
+            format!("{}", result.data),
+            "(42 + replaced_b) * (c@2 - d@3) + e@4"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_nested_column_replacement_with_jump() -> Result<()> {
+        let schema = create_test_schema();
+        // Test Jump behavior with deeply nested expressions
+        let mut column_map = HashMap::new();
+        // Replace col_c with a complex expression containing new columns
+        let replacement_expr = binary(
+            lit(100i32),
+            datafusion_expr::Operator::Plus,
+            col("new_col", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+        column_map.insert(
+            Column::new_with_schema("c", &schema).unwrap(),
+            replacement_expr,
+        );
+        column_map.insert(
+            Column::new_with_schema("a", &schema).unwrap(),
+            col("a", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("b", &schema).unwrap(),
+            col("b", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("d", &schema).unwrap(),
+            col("d", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("e", &schema).unwrap(),
+            col("e", &schema).unwrap(),
+        );
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+        let expr = create_deeply_nested_expression(&schema);
+
+        let result = expr.rewrite(&mut rewriter)?;
+
+        // Verify transformation occurred
+        assert!(result.transformed);
+
+        assert_eq!(
+            format!("{}", result.data),
+            "a@0 + b@1 + 100 + new_col@5 + d@3 + e@4"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_circular_reference_prevention() -> Result<()> {
+        let schema = create_test_schema();
+        // Test that Jump prevents infinite recursion with circular references
+        let mut column_map = HashMap::new();
+
+        // Create a circular reference: col_a -> col_b -> col_a (but Jump should prevent the second visit)
+        column_map.insert(
+            Column::new_with_schema("a", &schema).unwrap(),
+            col("b", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("b", &schema).unwrap(),
+            col("a", &schema).unwrap(),
+        );
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+
+        // Start with an expression containing col_a
+        let expr = binary(
+            col("a", &schema).unwrap(),
+            datafusion_expr::Operator::Plus,
+            col("b", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+
+        let result = expr.rewrite(&mut rewriter)?;
+
+        // Verify transformation occurred
+        assert!(result.transformed);
+
+        assert_eq!(format!("{}", result.data), "b@1 + a@0");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_multiple_replacements_in_same_expression() -> Result<()> {
+        let schema = create_test_schema();
+        // Test multiple column replacements in the same complex expression
+        let mut column_map = HashMap::new();
+
+        // Replace multiple columns with literals
+        column_map.insert(Column::new_with_schema("a", &schema).unwrap(), lit(10i32));
+        column_map.insert(Column::new_with_schema("c", &schema).unwrap(), lit(20i32));
+        column_map.insert(Column::new_with_schema("e", &schema).unwrap(), lit(30i32));
+        column_map.insert(
+            Column::new_with_schema("b", &schema).unwrap(),
+            col("b", &schema).unwrap(),
+        );
+        column_map.insert(
+            Column::new_with_schema("d", &schema).unwrap(),
+            col("d", &schema).unwrap(),
+        );
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+        let expr = create_complex_expression(&schema); // (col_a + col_b) * (col_c - col_d) + col_e
+
+        let result = expr.rewrite(&mut rewriter)?;
+
+        // Verify transformation occurred
+        assert!(result.transformed);
+        assert_eq!(format!("{}", result.data), "(10 + b@1) * (20 - d@3) + 30");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_jump_with_complex_replacement_expression() -> Result<()> {
+        let schema = create_test_schema();
+        // Test Jump behavior when replacing with very complex expressions
+        let mut column_map = HashMap::new();
+
+        // Replace col_a with a complex nested expression
+        let inner_expr = binary(
+            lit(5i32),
+            datafusion_expr::Operator::Multiply,
+            col("a", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+        let middle_expr = binary(
+            inner_expr,
+            datafusion_expr::Operator::Plus,
+            lit(3i32),
+            &schema,
+        )
+        .unwrap();
+        let complex_replacement = binary(
+            middle_expr,
+            datafusion_expr::Operator::Minus,
+            col("another_col", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+
+        column_map.insert(
+            Column::new_with_schema("a", &schema).unwrap(),
+            complex_replacement,
+        );
+        column_map.insert(
+            Column::new_with_schema("b", &schema).unwrap(),
+            col("b", &schema).unwrap(),
+        );
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+
+        // Create expression: col_a + col_b
+        let expr = binary(
+            col("a", &schema).unwrap(),
+            datafusion_expr::Operator::Plus,
+            col("b", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+
+        let result = expr.rewrite(&mut rewriter)?;
+
+        assert_eq!(
+            format!("{}", result.data),
+            "5 * a@0 + 3 - another_col@7 + b@1"
+        );
+
+        // Verify transformation occurred
+        assert!(result.transformed);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_unmapped_columns_detection() -> Result<()> {
+        let schema = create_test_schema();
+        let mut column_map = HashMap::new();
+
+        // Only map col_a, leave col_b unmapped
+        column_map.insert(Column::new_with_schema("a", &schema).unwrap(), lit(42i32));
+
+        let mut rewriter = PhysicalColumnRewriter::new(&column_map);
+
+        // Create expression: col_a + col_b
+        let expr = binary(
+            col("a", &schema).unwrap(),
+            datafusion_expr::Operator::Plus,
+            col("b", &schema).unwrap(),
+            &schema,
+        )
+        .unwrap();
+
+        let err = expr.rewrite(&mut rewriter).unwrap_err();
+        assert!(matches!(err, DataFusionError::Internal(_)));
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs
index 35f3e8d16e229..590f6f09e8b9e 100644
--- a/datafusion/physical-plan/src/common.rs
+++ b/datafusion/physical-plan/src/common.rs
@@ -29,7 +29,7 @@ use arrow::array::Array;
 use arrow::datatypes::Schema;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::stats::Precision;
-use datafusion_common::{plan_err, Result};
+use datafusion_common::{Result, plan_err};
 use datafusion_execution::memory_pool::MemoryReservation;
 
 use futures::{StreamExt, TryStreamExt};
@@ -91,7 +91,7 @@ fn build_file_list_recurse(
 
 /// If running in a tokio context spawns the execution of `stream` to a separate task
 /// allowing it to execute in parallel with an intermediate buffer of size `buffer`
-pub(crate) fn spawn_buffered(
+pub fn spawn_buffered(
     mut input: SendableRecordBatchStream,
     buffer: usize,
 ) -> SendableRecordBatchStream {
@@ -181,7 +181,7 @@ pub fn compute_record_batch_statistics(
 /// Checks if the given projection is valid for the given schema.
 pub fn can_project(
     schema: &arrow::datatypes::SchemaRef,
-    projection: Option<&Vec<usize>>,
+    projection: Option<&[usize]>,
 ) -> Result<()> {
     match projection {
         Some(columns) => {
@@ -262,6 +262,7 @@ mod tests {
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -269,6 +270,7 @@ mod tests {
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -302,6 +304,7 @@ mod tests {
                 min_value: Precision::Absent,
                 sum_value: Precision::Absent,
                 null_count: Precision::Exact(3),
+                byte_size: Precision::Absent,
             }],
         };
 
diff --git a/datafusion/physical-plan/src/coop.rs b/datafusion/physical-plan/src/coop.rs
new file mode 100644
index 0000000000000..efe6506edd7bd
--- /dev/null
+++ b/datafusion/physical-plan/src/coop.rs
@@ -0,0 +1,452 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for improved cooperative scheduling.
+//!
+//! # Cooperative scheduling
+//!
+//! A single call to `poll_next` on a top-level [`Stream`] may potentially perform a lot of work
+//! before it returns a `Poll::Pending`. Think for instance of calculating an aggregation over a
+//! large dataset.
+//!
+//! If a `Stream` runs for a long period of time without yielding back to the Tokio executor,
+//! it can starve other tasks waiting on that executor to execute them.
+//! Additionally, this prevents the query execution from being cancelled.
+//!
+//! For more background, please also see the [Using Rust async for Query Execution and Cancelling Long-Running Queries blog]
+//!
+//! [Using Rust async for Query Execution and Cancelling Long-Running Queries blog]: https://datafusion.apache.org/blog/2025/06/30/cancellation
+//!
+//! To ensure that `Stream` implementations yield regularly, operators can insert explicit yield
+//! points using the utilities in this module. For most operators this is **not** necessary. The
+//! `Stream`s of the built-in DataFusion operators that generate (rather than manipulate)
+//! `RecordBatch`es such as `DataSourceExec` and those that eagerly consume `RecordBatch`es
+//! (for instance, `RepartitionExec`) contain yield points that will make most query `Stream`s yield
+//! periodically.
+//!
+//! There are a couple of types of operators that _should_ insert yield points:
+//! - New source operators that do not make use of Tokio resources
+//! - Exchange like operators that do not use Tokio's `Channel` implementation to pass data between
+//!   tasks
+//!
+//! ## Adding yield points
+//!
+//! Yield points can be inserted manually using the facilities provided by the
+//! [Tokio coop module](https://docs.rs/tokio/latest/tokio/task/coop/index.html) such as
+//! [`tokio::task::coop::consume_budget`](https://docs.rs/tokio/latest/tokio/task/coop/fn.consume_budget.html).
+//!
+//! Another option is to use the wrapper `Stream` implementation provided by this module which will
+//! consume a unit of task budget every time a `RecordBatch` is produced.
+//! Wrapper `Stream`s can be created using the [`cooperative`] and [`make_cooperative`] functions.
+//!
+//! [`cooperative`] is a generic function that takes ownership of the wrapped [`RecordBatchStream`].
+//! This function has the benefit of not requiring an additional heap allocation and can avoid
+//! dynamic dispatch.
+//!
+//! [`make_cooperative`] is a non-generic function that wraps a [`SendableRecordBatchStream`]. This
+//! can be used to wrap dynamically typed, heap allocated [`RecordBatchStream`]s.
+//!
+//! ## Automatic cooperation
+//!
+//! The `EnsureCooperative` physical optimizer rule, which is included in the default set of
+//! optimizer rules, inspects query plans for potential cooperative scheduling issues.
+//! It injects the [`CooperativeExec`] wrapper `ExecutionPlan` into the query plan where necessary.
+//! This `ExecutionPlan` uses [`make_cooperative`] to wrap the `Stream` of its input.
+//!
+//! The optimizer rule currently checks the plan for exchange-like operators and leave operators
+//! that report [`SchedulingType::NonCooperative`] in their [plan properties](ExecutionPlan::properties).
+
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_physical_expr::PhysicalExpr;
+#[cfg(datafusion_coop = "tokio_fallback")]
+use futures::Future;
+use std::any::Any;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use crate::execution_plan::CardinalityEffect::{self, Equal};
+use crate::filter_pushdown::{
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
+};
+use crate::projection::ProjectionExec;
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, RecordBatchStream,
+    SendableRecordBatchStream, SortOrderPushdownResult, check_if_same_properties,
+};
+use arrow::record_batch::RecordBatch;
+use arrow_schema::Schema;
+use datafusion_common::{Result, Statistics, assert_eq_or_internal_err};
+use datafusion_execution::TaskContext;
+
+use crate::execution_plan::SchedulingType;
+use crate::stream::RecordBatchStreamAdapter;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use futures::{Stream, StreamExt};
+
+/// A stream that passes record batches through unchanged while cooperating with the Tokio runtime.
+/// It consumes cooperative scheduling budget for each returned [`RecordBatch`],
+/// allowing other tasks to execute when the budget is exhausted.
+///
+/// See the [module level documentation](crate::coop) for an in-depth discussion.
+pub struct CooperativeStream<T>
+where
+    T: RecordBatchStream + Unpin,
+{
+    inner: T,
+    #[cfg(datafusion_coop = "per_stream")]
+    budget: u8,
+}
+
+#[cfg(datafusion_coop = "per_stream")]
+// Magic value that matches Tokio's task budget value
+const YIELD_FREQUENCY: u8 = 128;
+
+impl<T> CooperativeStream<T>
+where
+    T: RecordBatchStream + Unpin,
+{
+    /// Creates a new `CooperativeStream` that wraps the provided stream.
+    /// The resulting stream will cooperate with the Tokio scheduler by consuming a unit of
+    /// scheduling budget when the wrapped `Stream` returns a record batch.
+    pub fn new(inner: T) -> Self {
+        Self {
+            inner,
+            #[cfg(datafusion_coop = "per_stream")]
+            budget: YIELD_FREQUENCY,
+        }
+    }
+}
+
+impl<T> Stream for CooperativeStream<T>
+where
+    T: RecordBatchStream + Unpin,
+{
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        #[cfg(any(
+            datafusion_coop = "tokio",
+            not(any(
+                datafusion_coop = "tokio_fallback",
+                datafusion_coop = "per_stream"
+            ))
+        ))]
+        {
+            let coop = std::task::ready!(tokio::task::coop::poll_proceed(cx));
+            let value = self.inner.poll_next_unpin(cx);
+            if value.is_ready() {
+                coop.made_progress();
+            }
+            value
+        }
+
+        #[cfg(datafusion_coop = "tokio_fallback")]
+        {
+            // This is a temporary placeholder implementation that may have slightly
+            // worse performance compared to `poll_proceed`
+            if !tokio::task::coop::has_budget_remaining() {
+                cx.waker().wake_by_ref();
+                return Poll::Pending;
+            }
+
+            let value = self.inner.poll_next_unpin(cx);
+            if value.is_ready() {
+                // In contrast to `poll_proceed` we are not able to consume
+                // budget before proceeding to do work. Instead, we try to consume budget
+                // after the work has been done and just assume that that succeeded.
+                // The poll result is ignored because we don't want to discard
+                // or buffer the Ready result we got from the inner stream.
+                let consume = tokio::task::coop::consume_budget();
+                let consume_ref = std::pin::pin!(consume);
+                let _ = consume_ref.poll(cx);
+            }
+            value
+        }
+
+        #[cfg(datafusion_coop = "per_stream")]
+        {
+            if self.budget == 0 {
+                self.budget = YIELD_FREQUENCY;
+                cx.waker().wake_by_ref();
+                return Poll::Pending;
+            }
+
+            let value = { self.inner.poll_next_unpin(cx) };
+
+            if value.is_ready() {
+                self.budget -= 1;
+            } else {
+                self.budget = YIELD_FREQUENCY;
+            }
+            value
+        }
+    }
+}
+
+impl<T> RecordBatchStream for CooperativeStream<T>
+where
+    T: RecordBatchStream + Unpin,
+{
+    fn schema(&self) -> Arc<Schema> {
+        self.inner.schema()
+    }
+}
+
+/// An execution plan decorator that enables cooperative multitasking.
+/// It wraps the streams produced by its input execution plan using the [`make_cooperative`] function,
+/// which makes the stream participate in Tokio cooperative scheduling.
+#[derive(Debug, Clone)]
+pub struct CooperativeExec {
+    input: Arc<dyn ExecutionPlan>,
+    properties: Arc<PlanProperties>,
+}
+
+impl CooperativeExec {
+    /// Creates a new `CooperativeExec` operator that wraps the given input execution plan.
+    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+        let properties = PlanProperties::clone(input.properties())
+            .with_scheduling_type(SchedulingType::Cooperative)
+            .into();
+
+        Self { input, properties }
+    }
+
+    /// Returns a reference to the wrapped input execution plan.
+    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.input
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            ..Self::clone(self)
+        }
+    }
+}
+
+impl DisplayAs for CooperativeExec {
+    fn fmt_as(
+        &self,
+        _t: DisplayFormatType,
+        f: &mut std::fmt::Formatter<'_>,
+    ) -> std::fmt::Result {
+        write!(f, "CooperativeExec")
+    }
+}
+
+impl ExecutionPlan for CooperativeExec {
+    fn name(&self) -> &str {
+        "CooperativeExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> Arc<Schema> {
+        self.input.schema()
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.properties
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![true; self.children().len()]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        assert_eq_or_internal_err!(
+            children.len(),
+            1,
+            "CooperativeExec requires exactly one child"
+        );
+        check_if_same_properties!(self, children);
+        Ok(Arc::new(CooperativeExec::new(children.swap_remove(0))))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        task_ctx: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let child_stream = self.input.execute(partition, task_ctx)?;
+        Ok(make_cooperative(child_stream))
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        self.input.partition_statistics(partition)
+    }
+
+    fn supports_limit_pushdown(&self) -> bool {
+        true
+    }
+
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        Equal
+    }
+
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        match self.input.try_swapping_with_projection(projection)? {
+            Some(new_input) => Ok(Some(
+                Arc::new(self.clone()).with_new_children(vec![new_input])?,
+            )),
+            None => Ok(None),
+        }
+    }
+
+    fn gather_filters_for_pushdown(
+        &self,
+        _phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        _config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        FilterDescription::from_children(parent_filters, &self.children())
+    }
+
+    fn handle_child_pushdown_result(
+        &self,
+        _phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        let child = self.input();
+
+        match child.try_pushdown_sort(order)? {
+            SortOrderPushdownResult::Exact { inner } => {
+                let new_exec = Arc::new(self.clone()).with_new_children(vec![inner])?;
+                Ok(SortOrderPushdownResult::Exact { inner: new_exec })
+            }
+            SortOrderPushdownResult::Inexact { inner } => {
+                let new_exec = Arc::new(self.clone()).with_new_children(vec![inner])?;
+                Ok(SortOrderPushdownResult::Inexact { inner: new_exec })
+            }
+            SortOrderPushdownResult::Unsupported => {
+                Ok(SortOrderPushdownResult::Unsupported)
+            }
+        }
+    }
+}
+
+/// Creates a [`CooperativeStream`] wrapper around the given [`RecordBatchStream`].
+/// This wrapper collaborates with the Tokio cooperative scheduler by consuming a unit of
+/// scheduling budget for each returned record batch.
+pub fn cooperative<T>(stream: T) -> CooperativeStream<T>
+where
+    T: RecordBatchStream + Unpin + Send + 'static,
+{
+    CooperativeStream::new(stream)
+}
+
+/// Wraps a `SendableRecordBatchStream` inside a [`CooperativeStream`] to enable cooperative multitasking.
+/// Since `SendableRecordBatchStream` is a `dyn RecordBatchStream` this requires the use of dynamic
+/// method dispatch.
+/// When the stream type is statically known, consider use the generic [`cooperative`] function
+/// to allow static method dispatch.
+pub fn make_cooperative(stream: SendableRecordBatchStream) -> SendableRecordBatchStream {
+    // TODO is there a more elegant way to overload cooperative
+    Box::pin(cooperative(RecordBatchStreamAdapter::new(
+        stream.schema(),
+        stream,
+    )))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::stream::RecordBatchStreamAdapter;
+
+    use arrow_schema::SchemaRef;
+
+    use futures::{StreamExt, stream};
+
+    // This is the hardcoded value Tokio uses
+    const TASK_BUDGET: usize = 128;
+
+    /// Helper: construct a SendableRecordBatchStream containing `n` empty batches
+    fn make_empty_batches(n: usize) -> SendableRecordBatchStream {
+        let schema: SchemaRef = Arc::new(Schema::empty());
+        let schema_for_stream = Arc::clone(&schema);
+
+        let s =
+            stream::iter((0..n).map(move |_| {
+                Ok(RecordBatch::new_empty(Arc::clone(&schema_for_stream)))
+            }));
+
+        Box::pin(RecordBatchStreamAdapter::new(schema, s))
+    }
+
+    #[tokio::test]
+    async fn yield_less_than_threshold() -> Result<()> {
+        let count = TASK_BUDGET - 10;
+        let inner = make_empty_batches(count);
+        let out = make_cooperative(inner).collect::<Vec<_>>().await;
+        assert_eq!(out.len(), count);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn yield_equal_to_threshold() -> Result<()> {
+        let count = TASK_BUDGET;
+        let inner = make_empty_batches(count);
+        let out = make_cooperative(inner).collect::<Vec<_>>().await;
+        assert_eq!(out.len(), count);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn yield_more_than_threshold() -> Result<()> {
+        let count = TASK_BUDGET + 20;
+        let inner = make_empty_batches(count);
+        let out = make_cooperative(inner).collect::<Vec<_>>().await;
+        assert_eq!(out.len(), count);
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs
index f555755dd20a5..aaf83345d99b8 100644
--- a/datafusion/physical-plan/src/display.rs
+++ b/datafusion/physical-plan/src/display.rs
@@ -28,9 +28,10 @@ use datafusion_common::display::{GraphvizBuilder, PlanType, StringifiedPlan};
 use datafusion_expr::display_schema;
 use datafusion_physical_expr::LexOrdering;
 
+use crate::metrics::MetricType;
 use crate::render_tree::RenderTree;
 
-use super::{accept, ExecutionPlan, ExecutionPlanVisitor};
+use super::{ExecutionPlan, ExecutionPlanVisitor, accept};
 
 /// Options for controlling how each [`ExecutionPlan`] should format itself
 #[derive(Debug, Clone, Copy, PartialEq)]
@@ -120,9 +121,17 @@ pub struct DisplayableExecutionPlan<'a> {
     show_statistics: bool,
     /// If schema should be displayed. See [`Self::set_show_schema`]
     show_schema: bool,
+    /// Which metric categories should be included when rendering
+    metric_types: Vec<MetricType>,
+    // (TreeRender) Maximum total width of the rendered tree
+    tree_maximum_render_width: usize,
 }
 
 impl<'a> DisplayableExecutionPlan<'a> {
+    fn default_metric_types() -> Vec<MetricType> {
+        vec![MetricType::SUMMARY, MetricType::DEV]
+    }
+
     /// Create a wrapper around an [`ExecutionPlan`] which can be
     /// pretty printed in a variety of ways
     pub fn new(inner: &'a dyn ExecutionPlan) -> Self {
@@ -131,6 +140,8 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: ShowMetrics::None,
             show_statistics: false,
             show_schema: false,
+            metric_types: Self::default_metric_types(),
+            tree_maximum_render_width: 240,
         }
     }
 
@@ -143,6 +154,8 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: ShowMetrics::Aggregated,
             show_statistics: false,
             show_schema: false,
+            metric_types: Self::default_metric_types(),
+            tree_maximum_render_width: 240,
         }
     }
 
@@ -155,6 +168,8 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: ShowMetrics::Full,
             show_statistics: false,
             show_schema: false,
+            metric_types: Self::default_metric_types(),
+            tree_maximum_render_width: 240,
         }
     }
 
@@ -173,6 +188,18 @@ impl<'a> DisplayableExecutionPlan<'a> {
         self
     }
 
+    /// Specify which metric types should be rendered alongside the plan
+    pub fn set_metric_types(mut self, metric_types: Vec<MetricType>) -> Self {
+        self.metric_types = metric_types;
+        self
+    }
+
+    /// Set the maximum render width for the tree format
+    pub fn set_tree_maximum_render_width(mut self, width: usize) -> Self {
+        self.tree_maximum_render_width = width;
+        self
+    }
+
     /// Return a `format`able structure that produces a single line
     /// per node.
     ///
@@ -195,6 +222,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: ShowMetrics,
             show_statistics: bool,
             show_schema: bool,
+            metric_types: Vec<MetricType>,
         }
         impl fmt::Display for Wrapper<'_> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
@@ -205,6 +233,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
                     show_metrics: self.show_metrics,
                     show_statistics: self.show_statistics,
                     show_schema: self.show_schema,
+                    metric_types: &self.metric_types,
                 };
                 accept(self.plan, &mut visitor)
             }
@@ -215,6 +244,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: self.show_metrics,
             show_statistics: self.show_statistics,
             show_schema: self.show_schema,
+            metric_types: self.metric_types.clone(),
         }
     }
 
@@ -234,6 +264,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             plan: &'a dyn ExecutionPlan,
             show_metrics: ShowMetrics,
             show_statistics: bool,
+            metric_types: Vec<MetricType>,
         }
         impl fmt::Display for Wrapper<'_> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
@@ -244,6 +275,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
                     t,
                     show_metrics: self.show_metrics,
                     show_statistics: self.show_statistics,
+                    metric_types: &self.metric_types,
                     graphviz_builder: GraphvizBuilder::default(),
                     parents: Vec::new(),
                 };
@@ -261,6 +293,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             plan: self.inner,
             show_metrics: self.show_metrics,
             show_statistics: self.show_statistics,
+            metric_types: self.metric_types.clone(),
         }
     }
 
@@ -270,14 +303,21 @@ impl<'a> DisplayableExecutionPlan<'a> {
     pub fn tree_render(&self) -> impl fmt::Display + 'a {
         struct Wrapper<'a> {
             plan: &'a dyn ExecutionPlan,
+            maximum_render_width: usize,
         }
         impl fmt::Display for Wrapper<'_> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                let mut visitor = TreeRenderVisitor { f };
+                let mut visitor = TreeRenderVisitor {
+                    f,
+                    maximum_render_width: self.maximum_render_width,
+                };
                 visitor.visit(self.plan)
             }
         }
-        Wrapper { plan: self.inner }
+        Wrapper {
+            plan: self.inner,
+            maximum_render_width: self.tree_maximum_render_width,
+        }
     }
 
     /// Return a single-line summary of the root of the plan
@@ -288,6 +328,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: ShowMetrics,
             show_statistics: bool,
             show_schema: bool,
+            metric_types: Vec<MetricType>,
         }
 
         impl fmt::Display for Wrapper<'_> {
@@ -299,6 +340,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
                     show_metrics: self.show_metrics,
                     show_statistics: self.show_statistics,
                     show_schema: self.show_schema,
+                    metric_types: &self.metric_types,
                 };
                 visitor.pre_visit(self.plan)?;
                 Ok(())
@@ -310,6 +352,7 @@ impl<'a> DisplayableExecutionPlan<'a> {
             show_metrics: self.show_metrics,
             show_statistics: self.show_statistics,
             show_schema: self.show_schema,
+            metric_types: self.metric_types.clone(),
         }
     }
 
@@ -364,6 +407,8 @@ struct IndentVisitor<'a, 'b> {
     show_statistics: bool,
     /// If schema should be displayed
     show_schema: bool,
+    /// Which metric types should be rendered
+    metric_types: &'a [MetricType],
 }
 
 impl ExecutionPlanVisitor for IndentVisitor<'_, '_> {
@@ -376,6 +421,7 @@ impl ExecutionPlanVisitor for IndentVisitor<'_, '_> {
             ShowMetrics::Aggregated => {
                 if let Some(metrics) = plan.metrics() {
                     let metrics = metrics
+                        .filter_by_metric_types(self.metric_types)
                         .aggregate_by_name()
                         .sorted_for_display()
                         .timestamps_removed();
@@ -387,6 +433,7 @@ impl ExecutionPlanVisitor for IndentVisitor<'_, '_> {
             }
             ShowMetrics::Full => {
                 if let Some(metrics) = plan.metrics() {
+                    let metrics = metrics.filter_by_metric_types(self.metric_types);
                     write!(self.f, ", metrics=[{metrics}]")?;
                 } else {
                     write!(self.f, ", metrics=[]")?;
@@ -423,6 +470,8 @@ struct GraphvizVisitor<'a, 'b> {
     show_metrics: ShowMetrics,
     /// If statistics should be displayed
     show_statistics: bool,
+    /// Which metric types should be rendered
+    metric_types: &'a [MetricType],
 
     graphviz_builder: GraphvizBuilder,
     /// Used to record parent node ids when visiting a plan.
@@ -460,6 +509,7 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> {
             ShowMetrics::Aggregated => {
                 if let Some(metrics) = plan.metrics() {
                     let metrics = metrics
+                        .filter_by_metric_types(self.metric_types)
                         .aggregate_by_name()
                         .sorted_for_display()
                         .timestamps_removed();
@@ -471,6 +521,7 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> {
             }
             ShowMetrics::Full => {
                 if let Some(metrics) = plan.metrics() {
+                    let metrics = metrics.filter_by_metric_types(self.metric_types);
                     format!("metrics=[{metrics}]")
                 } else {
                     "metrics=[]".to_string()
@@ -540,6 +591,8 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> {
 struct TreeRenderVisitor<'a, 'b> {
     /// Write to this formatter
     f: &'a mut Formatter<'b>,
+    /// Maximum total width of the rendered tree
+    maximum_render_width: usize,
 }
 
 impl TreeRenderVisitor<'_, '_> {
@@ -557,7 +610,6 @@ impl TreeRenderVisitor<'_, '_> {
     const HORIZONTAL: &'static str = "─"; // Horizontal line
 
     // TODO: Make these variables configurable.
-    const MAXIMUM_RENDER_WIDTH: usize = 240; // Maximum total width of the rendered tree
     const NODE_RENDER_WIDTH: usize = 29; // Width of each node's box
     const MAX_EXTRA_LINES: usize = 30; // Maximum number of extra info lines per node
 
@@ -592,6 +644,12 @@ impl TreeRenderVisitor<'_, '_> {
         y: usize,
     ) -> Result<(), fmt::Error> {
         for x in 0..root.width {
+            if self.maximum_render_width > 0
+                && x * Self::NODE_RENDER_WIDTH >= self.maximum_render_width
+            {
+                break;
+            }
+
             if root.has_node(x, y) {
                 write!(self.f, "{}", Self::LTCORNER)?;
                 write!(
@@ -662,7 +720,9 @@ impl TreeRenderVisitor<'_, '_> {
         // Render the actual node.
         for render_y in 0..=extra_height {
             for (x, _) in root.nodes.iter().enumerate().take(root.width) {
-                if x * Self::NODE_RENDER_WIDTH >= Self::MAXIMUM_RENDER_WIDTH {
+                if self.maximum_render_width > 0
+                    && x * Self::NODE_RENDER_WIDTH >= self.maximum_render_width
+                {
                     break;
                 }
 
@@ -674,13 +734,14 @@ impl TreeRenderVisitor<'_, '_> {
                 if let Some(node) = root.get_node(x, y) {
                     write!(self.f, "{}", Self::VERTICAL)?;
 
-                    // Rigure out what to render.
-                    let mut render_text = String::new();
-                    if render_y == 0 {
-                        render_text = node.name.clone();
+                    // Figure out what to render.
+                    let mut render_text = if render_y == 0 {
+                        node.name.clone()
                     } else if render_y <= extra_info[x].len() {
-                        render_text = extra_info[x][render_y - 1].clone();
-                    }
+                        extra_info[x][render_y - 1].clone()
+                    } else {
+                        String::new()
+                    };
 
                     render_text = Self::adjust_text_for_rendering(
                         &render_text,
@@ -780,7 +841,9 @@ impl TreeRenderVisitor<'_, '_> {
         y: usize,
     ) -> Result<(), fmt::Error> {
         for x in 0..=root.width {
-            if x * Self::NODE_RENDER_WIDTH >= Self::MAXIMUM_RENDER_WIDTH {
+            if self.maximum_render_width > 0
+                && x * Self::NODE_RENDER_WIDTH >= self.maximum_render_width
+            {
                 break;
             }
             let mut has_adjacent_nodes = false;
@@ -906,7 +969,7 @@ impl TreeRenderVisitor<'_, '_> {
         } else {
             let total_spaces = max_render_width - render_width;
             let half_spaces = total_spaces / 2;
-            let extra_left_space = if total_spaces % 2 == 0 { 0 } else { 1 };
+            let extra_left_space = if total_spaces.is_multiple_of(2) { 0 } else { 1 };
             format!(
                 "{}{}{}",
                 " ".repeat(half_spaces + extra_left_space),
@@ -1034,27 +1097,22 @@ impl fmt::Display for ProjectSchemaDisplay<'_> {
 }
 
 pub fn display_orderings(f: &mut Formatter, orderings: &[LexOrdering]) -> fmt::Result {
-    if let Some(ordering) = orderings.first() {
-        if !ordering.is_empty() {
-            let start = if orderings.len() == 1 {
-                ", output_ordering="
-            } else {
-                ", output_orderings=["
-            };
-            write!(f, "{start}")?;
-            for (idx, ordering) in
-                orderings.iter().enumerate().filter(|(_, o)| !o.is_empty())
-            {
-                match idx {
-                    0 => write!(f, "[{ordering}]")?,
-                    _ => write!(f, ", [{ordering}]")?,
-                }
+    if !orderings.is_empty() {
+        let start = if orderings.len() == 1 {
+            ", output_ordering="
+        } else {
+            ", output_orderings=["
+        };
+        write!(f, "{start}")?;
+        for (idx, ordering) in orderings.iter().enumerate() {
+            match idx {
+                0 => write!(f, "[{ordering}]")?,
+                _ => write!(f, ", [{ordering}]")?,
             }
-            let end = if orderings.len() == 1 { "" } else { "]" };
-            write!(f, "{end}")?;
         }
+        let end = if orderings.len() == 1 { "" } else { "]" };
+        write!(f, "{end}")?;
     }
-
     Ok(())
 }
 
@@ -1063,8 +1121,11 @@ mod tests {
     use std::fmt::Write;
     use std::sync::Arc;
 
-    use datafusion_common::{DataFusionError, Result, Statistics};
+    use datafusion_common::{
+        Result, Statistics, internal_datafusion_err, tree_node::TreeNodeRecursion,
+    };
     use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+    use datafusion_physical_expr::PhysicalExpr;
 
     use crate::{DisplayAs, ExecutionPlan, PlanProperties};
 
@@ -1096,7 +1157,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
 
@@ -1111,6 +1172,13 @@ mod tests {
             unimplemented!()
         }
 
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
+
         fn execute(
             &self,
             _: usize,
@@ -1119,20 +1187,17 @@ mod tests {
             todo!()
         }
 
-        fn statistics(&self) -> Result<Statistics> {
-            self.partition_statistics(None)
-        }
-
-        fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+        fn partition_statistics(
+            &self,
+            partition: Option<usize>,
+        ) -> Result<Arc<Statistics>> {
             if partition.is_some() {
-                return Ok(Statistics::new_unknown(self.schema().as_ref()));
+                return Ok(Arc::new(Statistics::new_unknown(self.schema().as_ref())));
             }
             match self {
                 Self::Panic => panic!("expected panic"),
-                Self::Error => {
-                    Err(DataFusionError::Internal("expected error".to_string()))
-                }
-                Self::Ok => Ok(Statistics::new_unknown(self.schema().as_ref())),
+                Self::Error => Err(internal_datafusion_err!("expected error")),
+                Self::Ok => Ok(Arc::new(Statistics::new_unknown(self.schema().as_ref()))),
             }
         }
     }
diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs
index 36634fbe6d7e9..078bc4b8d064b 100644
--- a/datafusion/physical-plan/src/empty.rs
+++ b/datafusion/physical-plan/src/empty.rs
@@ -21,18 +21,21 @@ use std::any::Any;
 use std::sync::Arc;
 
 use crate::memory::MemoryStream;
-use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics};
+use crate::{DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics};
 use crate::{
-    execution_plan::{Boundedness, EmissionType},
     DisplayFormatType, ExecutionPlan, Partitioning,
+    execution_plan::{Boundedness, EmissionType},
 };
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{ColumnStatistics, Result, ScalarValue, assert_or_internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
 
+use crate::execution_plan::SchedulingType;
 use log::trace;
 
 /// Execution plan for empty relation with produce_one_row=false
@@ -42,7 +45,7 @@ pub struct EmptyExec {
     schema: SchemaRef,
     /// Number of partitions
     partitions: usize,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl EmptyExec {
@@ -52,7 +55,7 @@ impl EmptyExec {
         EmptyExec {
             schema,
             partitions: 1,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -61,7 +64,7 @@ impl EmptyExec {
         self.partitions = partitions;
         // Changing partitions may invalidate output partitioning, so update it:
         let output_partitioning = Self::output_partitioning_helper(self.partitions);
-        self.cache = self.cache.with_partitioning(output_partitioning);
+        Arc::make_mut(&mut self.cache).partitioning = output_partitioning;
         self
     }
 
@@ -81,6 +84,7 @@ impl EmptyExec {
             EmissionType::Incremental,
             Boundedness::Bounded,
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
     }
 }
 
@@ -112,7 +116,7 @@ impl ExecutionPlan for EmptyExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -120,6 +124,13 @@ impl ExecutionPlan for EmptyExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -132,15 +143,19 @@ impl ExecutionPlan for EmptyExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start EmptyExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start EmptyExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
 
-        if partition >= self.partitions {
-            return internal_err!(
-                "EmptyExec invalid partition {} (expected less than {})",
-                partition,
-                self.partitions
-            );
-        }
+        assert_or_internal_err!(
+            partition < self.partitions,
+            "EmptyExec invalid partition {} (expected less than {})",
+            partition,
+            self.partitions
+        );
 
         Ok(Box::pin(MemoryStream::try_new(
             self.data()?,
@@ -149,35 +164,41 @@ impl ExecutionPlan for EmptyExec {
         )?))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if let Some(partition) = partition {
-            if partition >= self.partitions {
-                return internal_err!(
-                    "EmptyExec invalid partition {} (expected less than {})",
-                    partition,
-                    self.partitions
-                );
-            }
+            assert_or_internal_err!(
+                partition < self.partitions,
+                "EmptyExec invalid partition {} (expected less than {})",
+                partition,
+                self.partitions
+            );
         }
 
-        let batch = self
-            .data()
-            .expect("Create empty RecordBatch should not fail");
-        Ok(common::compute_record_batch_statistics(
-            &[batch],
-            &self.schema,
-            None,
-        ))
+        // Build explicit stats: exact zero rows and bytes, with explicit known column stats
+        let mut stats = Statistics::default()
+            .with_num_rows(Precision::Exact(0))
+            .with_total_byte_size(Precision::Exact(0));
+
+        // Add explicit column stats for each field in schema
+        for _ in self.schema.fields() {
+            stats = stats.add_column_statistics(ColumnStatistics {
+                null_count: Precision::Exact(0),
+                distinct_count: Precision::Exact(0),
+                min_value: Precision::<ScalarValue>::Absent,
+                max_value: Precision::<ScalarValue>::Absent,
+                sum_value: Precision::<ScalarValue>::Absent,
+                byte_size: Precision::Exact(0),
+            });
+        }
+
+        Ok(Arc::new(stats))
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::common;
     use crate::test;
     use crate::with_new_children_if_necessary;
 
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index b81b3c8beeac1..8df33452e096d 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -17,42 +17,51 @@
 
 pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay};
 use crate::filter_pushdown::{
-    ChildPushdownResult, FilterDescription, FilterPushdownPropagation,
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
 };
 pub use crate::metrics::Metric;
 pub use crate::ordering::InputOrderMode;
+use crate::sort_pushdown::SortOrderPushdownResult;
 pub use crate::stream::EmptyRecordBatchStream;
 
+use arrow_schema::Schema;
 pub use datafusion_common::hash_utils;
+use datafusion_common::tree_node::{
+    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
+};
 pub use datafusion_common::utils::project_schema;
-pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
+pub use datafusion_common::{ColumnStatistics, Statistics, internal_err};
 pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
 pub use datafusion_expr::{Accumulator, ColumnarValue};
 pub use datafusion_physical_expr::window::WindowExpr;
 pub use datafusion_physical_expr::{
-    expressions, Distribution, Partitioning, PhysicalExpr,
+    Distribution, Partitioning, PhysicalExpr, expressions,
 };
 
 use std::any::Any;
 use std::fmt::Debug;
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};
 
 use crate::coalesce_partitions::CoalescePartitionsExec;
 use crate::display::DisplayableExecutionPlan;
 use crate::metrics::MetricsSet;
 use crate::projection::ProjectionExec;
-use crate::repartition::RepartitionExec;
-use crate::sorts::sort_preserving_merge::SortPreservingMergeExec;
 use crate::stream::RecordBatchStreamAdapter;
 
 use arrow::array::{Array, RecordBatch};
 use arrow::datatypes::SchemaRef;
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{exec_err, Constraints, Result};
+use datafusion_common::{
+    Constraints, DataFusionError, Result, assert_eq_or_internal_err,
+    assert_or_internal_err, exec_err,
+};
 use datafusion_common_runtime::JoinSet;
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{EquivalenceProperties, LexOrdering};
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
+use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, OrderingRequirements, PhysicalSortExpr,
+};
 
 use futures::stream::{StreamExt, TryStreamExt};
 
@@ -75,6 +84,15 @@ use futures::stream::{StreamExt, TryStreamExt};
 /// [`execute`]: ExecutionPlan::execute
 /// [`required_input_distribution`]: ExecutionPlan::required_input_distribution
 /// [`required_input_ordering`]: ExecutionPlan::required_input_ordering
+///
+/// # Examples
+///
+/// See [`datafusion-examples`] for examples, including
+/// [`memory_pool_execution_plan.rs`] which shows how to implement a custom
+/// `ExecutionPlan` with memory tracking and spilling support.
+///
+/// [`datafusion-examples`]: https://github.com/apache/datafusion/tree/main/datafusion-examples
+/// [`memory_pool_execution_plan.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs
 pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     /// Short name for the ExecutionPlan, such as 'DataSourceExec'.
     ///
@@ -113,15 +131,16 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ///
     /// This information is available via methods on [`ExecutionPlanProperties`]
     /// trait, which is implemented for all `ExecutionPlan`s.
-    fn properties(&self) -> &PlanProperties;
+    fn properties(&self) -> &Arc<PlanProperties>;
 
     /// Returns an error if this individual node does not conform to its invariants.
     /// These invariants are typically only checked in debug mode.
     ///
-    /// A default set of invariants is provided in the default implementation.
+    /// A default set of invariants is provided in the [check_default_invariants] function.
+    /// The default implementation of `check_invariants` calls this function.
     /// Extension nodes can provide their own invariants.
-    fn check_invariants(&self, _check: InvariantLevel) -> Result<()> {
-        Ok(())
+    fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
+        check_default_invariants(self, check)
     }
 
     /// Specifies the data distribution requirements for all the
@@ -139,7 +158,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     /// NOTE that checking `!is_empty()` does **not** check for a
     /// required input ordering. Instead, the correct check is that at
     /// least one entry must be `Some`
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         vec![None; self.children().len()]
     }
 
@@ -188,6 +207,80 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     /// joins).
     fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>>;
 
+    /// Apply a closure `f` to each expression (non-recursively) in the current
+    /// physical plan node. This does not include expressions in any children.
+    ///
+    /// The closure `f` is applied to expressions in the order they appear in the plan.
+    /// The closure can return `TreeNodeRecursion::Continue` to continue visiting,
+    /// `TreeNodeRecursion::Stop` to stop visiting immediately, or `TreeNodeRecursion::Jump`
+    /// to skip any remaining expressions (though typically all expressions are visited).
+    ///
+    /// The expressions visited do not necessarily represent or even contribute
+    /// to the output schema of this node. For example, `FilterExec` visits the
+    /// filter predicate even though the output of a Filter has the same columns
+    /// as the input.
+    ///
+    /// # Example Usage
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use datafusion_physical_plan::ExecutionPlan;
+    /// # use datafusion_common::tree_node::TreeNodeRecursion;
+    /// # fn example(plan: Arc<dyn ExecutionPlan>) -> datafusion_common::Result<()> {
+    /// // Count the number of expressions
+    /// let mut count = 0;
+    /// plan.apply_expressions(&mut |_expr| {
+    ///     count += 1;
+    ///     Ok(TreeNodeRecursion::Continue)
+    /// })?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    ///
+    /// # Implementation Examples
+    ///
+    /// ## Node with no expressions (e.g., EmptyExec, MemoryExec)
+    /// ```ignore
+    /// fn apply_expressions(
+    ///     &self,
+    ///     _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    /// ) -> Result<TreeNodeRecursion> {
+    ///     Ok(TreeNodeRecursion::Continue)
+    /// }
+    /// ```
+    ///
+    /// ## Node with a single expression (e.g., FilterExec)
+    /// ```ignore
+    /// fn apply_expressions(
+    ///     &self,
+    ///     f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    /// ) -> Result<TreeNodeRecursion> {
+    ///     f(self.predicate.as_ref())
+    /// }
+    /// ```
+    ///
+    /// ## Node with multiple expressions (e.g., ProjectionExec, JoinExec)
+    ///
+    /// Use [`TreeNodeRecursion::visit_sibling`] when iterating over multiple
+    /// expressions. This correctly propagates [`TreeNodeRecursion::Stop`]: if
+    /// `f` returns `Stop` for an earlier expression, `visit_sibling` short-circuits
+    /// and skips the remaining ones.
+    /// ```ignore
+    /// fn apply_expressions(
+    ///     &self,
+    ///     f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    /// ) -> Result<TreeNodeRecursion> {
+    ///     let mut tnr = TreeNodeRecursion::Continue;
+    ///     for expr in &self.expressions {
+    ///         tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+    ///     }
+    ///     Ok(tnr)
+    /// }
+    /// ```
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion>;
+
     /// Returns a new `ExecutionPlan` where all existing children were replaced
     /// by the `children`, in order
     fn with_new_children(
@@ -195,6 +288,31 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 
+    /// Reset any internal state within this [`ExecutionPlan`].
+    ///
+    /// This method is called when an [`ExecutionPlan`] needs to be re-executed,
+    /// such as in recursive queries. Unlike [`ExecutionPlan::with_new_children`], this method
+    /// ensures that any stateful components (e.g., [`DynamicFilterPhysicalExpr`])
+    /// are reset to their initial state.
+    ///
+    /// The default implementation simply calls [`ExecutionPlan::with_new_children`] with the existing children,
+    /// effectively creating a new instance of the [`ExecutionPlan`] with the same children but without
+    /// necessarily resetting any internal state. Implementations that require resetting of some
+    /// internal state should override this method to provide the necessary logic.
+    ///
+    /// This method should *not* reset state recursively for children, as it is expected that
+    /// it will be called from within a walk of the execution plan tree so that it will be called on each child later
+    /// or was already called on each child.
+    ///
+    /// Note to implementers: unlike [`ExecutionPlan::with_new_children`] this method does not accept new children as an argument,
+    /// thus it is expected that any cached plan properties will remain valid after the reset.
+    ///
+    /// [`DynamicFilterPhysicalExpr`]: datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        let children = self.children().into_iter().cloned().collect();
+        self.with_new_children(children)
+    }
+
     /// If supported, attempt to increase the partitioning of this `ExecutionPlan` to
     /// produce `target_partitions` partitions.
     ///
@@ -270,11 +388,13 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     /// batch is superlinear. See this [general guideline][async-guideline] for more context
     /// on this point, which explains why one should avoid spending a long time without
     /// reaching an `await`/yield point in asynchronous runtimes.
-    /// This can be achieved by manually returning [`Poll::Pending`] and setting up wakers
-    /// appropriately, or the use of [`tokio::task::yield_now()`] when appropriate.
+    /// This can be achieved by using the utilities from the [`coop`](crate::coop) module, by
+    /// manually returning [`Poll::Pending`] and setting up wakers appropriately, or by calling
+    /// [`tokio::task::yield_now()`] when appropriate.
     /// In special cases that warrant manual yielding, determination for "regularly" may be
-    /// made using a timer (being careful with the overhead-heavy system call needed to
-    /// take the time), or by counting rows or batches.
+    /// made using the [Tokio task budget](https://docs.rs/tokio/latest/tokio/task/coop/index.html),
+    /// a timer (being careful with the overhead-heavy system call needed to take the time), or by
+    /// counting rows or batches.
     ///
     /// The [cancellation benchmark] tracks some cases of how quickly queries can
     /// be cancelled.
@@ -318,12 +438,15 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ///     fn execute(
     ///         &self,
     ///         partition: usize,
-    ///         context: Arc<TaskContext>
+    ///         context: Arc<TaskContext>,
     ///     ) -> Result<SendableRecordBatchStream> {
     ///         // use functions from futures crate convert the batch into a stream
     ///         let fut = futures::future::ready(Ok(self.batch.clone()));
     ///         let stream = futures::stream::once(fut);
-    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.batch.schema(), stream)))
+    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(
+    ///             self.batch.schema(),
+    ///             stream,
+    ///         )))
     ///     }
     /// }
     /// ```
@@ -353,11 +476,14 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ///     fn execute(
     ///         &self,
     ///         partition: usize,
-    ///         context: Arc<TaskContext>
+    ///         context: Arc<TaskContext>,
     ///     ) -> Result<SendableRecordBatchStream> {
     ///         let fut = get_batch();
     ///         let stream = futures::stream::once(fut);
-    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), stream)))
+    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(
+    ///             self.schema.clone(),
+    ///             stream,
+    ///         )))
     ///     }
     /// }
     /// ```
@@ -389,13 +515,16 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ///     fn execute(
     ///         &self,
     ///         partition: usize,
-    ///         context: Arc<TaskContext>
+    ///         context: Arc<TaskContext>,
     ///     ) -> Result<SendableRecordBatchStream> {
     ///         // A future that yields a stream
     ///         let fut = get_batch_stream();
     ///         // Use TryStreamExt::try_flatten to flatten the stream of streams
     ///         let stream = futures::stream::once(fut).try_flatten();
-    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), stream)))
+    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(
+    ///             self.schema.clone(),
+    ///             stream,
+    ///         )))
     ///     }
     /// }
     /// ```
@@ -420,34 +549,22 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
         None
     }
 
-    /// Returns statistics for this `ExecutionPlan` node. If statistics are not
-    /// available, should return [`Statistics::new_unknown`] (the default), not
-    /// an error.
-    ///
-    /// For TableScan executors, which supports filter pushdown, special attention
-    /// needs to be paid to whether the stats returned by this method are exact or not
-    #[deprecated(since = "48.0.0", note = "Use `partition_statistics` method instead")]
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&self.schema()))
-    }
-
     /// Returns statistics for a specific partition of this `ExecutionPlan` node.
     /// If statistics are not available, should return [`Statistics::new_unknown`]
     /// (the default), not an error.
     /// If `partition` is `None`, it returns statistics for the entire plan.
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if let Some(idx) = partition {
             // Validate partition index
             let partition_count = self.properties().partitioning.partition_count();
-            if idx >= partition_count {
-                return internal_err!(
-                    "Invalid partition index: {}, the partition count is {}",
-                    idx,
-                    partition_count
-                );
-            }
+            assert_or_internal_err!(
+                idx < partition_count,
+                "Invalid partition index: {}, the partition count is {}",
+                idx,
+                partition_count
+            );
         }
-        Ok(Statistics::new_unknown(&self.schema()))
+        Ok(Arc::new(Statistics::new_unknown(&self.schema())))
     }
 
     /// Returns `true` if a limit can be safely pushed down through this
@@ -462,6 +579,10 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
 
     /// Returns a fetching variant of this `ExecutionPlan` node, if it supports
     /// fetch limits. Returns `None` otherwise.
+    ///
+    /// See physical optimizer rule [`limit_pushdown`] for details.
+    ///
+    /// [`limit_pushdown`]: https://docs.rs/datafusion/latest/datafusion/physical_optimizer/limit_pushdown/index.html
     fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
         None
     }
@@ -509,42 +630,168 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ///
     /// The default implementation bars all parent filters from being pushed down and adds no new filters.
     /// This is the safest option, making filter pushdown opt-in on a per-node pasis.
+    ///
+    /// There are two different phases in filter pushdown, which some operators may handle the same and some differently.
+    /// Depending on the phase the operator may or may not be allowed to modify the plan.
+    /// See [`FilterPushdownPhase`] for more details.
     fn gather_filters_for_pushdown(
         &self,
+        _phase: FilterPushdownPhase,
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterDescription> {
-        Ok(
-            FilterDescription::new_with_child_count(self.children().len())
-                .all_parent_filters_unsupported(parent_filters),
-        )
+        Ok(FilterDescription::all_unsupported(
+            &parent_filters,
+            &self.children(),
+        ))
     }
 
     /// Handle the result of a child pushdown.
-    /// This is called as we recurse back up the plan tree after recursing down and calling [`ExecutionPlan::gather_filters_for_pushdown`].
-    /// Once we know what the result of pushing down filters into children is we ask the current node what it wants to do with that result.
-    /// For a `DataSourceExec` that may be absorbing the filters to apply them during the scan phase
-    /// (also known as late materialization).
-    /// A `FilterExec` may absorb any filters its children could not absorb, or if there are no filters left it
-    /// may remove itself from the plan altogether.
-    /// It combines both [`ChildPushdownResult::parent_filters`] and [`ChildPushdownResult::self_filters`] into a single
-    /// predicate and replaces it's own predicate.
-    /// Then it passes [`PredicateSupport::Supported`] for each parent predicate to the parent.
-    /// A `HashJoinExec` may ignore the pushdown result since it needs to apply the filters as part of the join anyhow.
-    /// It passes [`ChildPushdownResult::parent_filters`] back up to it's parents wrapped in [`FilterPushdownPropagation::transparent`]
-    /// and [`ChildPushdownResult::self_filters`] is discarded.
-    ///
-    /// The default implementation is a no-op that passes the result of pushdown from the children to its parent.
-    ///
-    /// [`PredicateSupport::Supported`]: crate::filter_pushdown::PredicateSupport::Supported
+    ///
+    /// This method is called as we recurse back up the plan tree after pushing
+    /// filters down to child nodes via [`ExecutionPlan::gather_filters_for_pushdown`].
+    /// It allows the current node to process the results of filter pushdown from
+    /// its children, deciding whether to absorb filters, modify the plan, or pass
+    /// filters back up to its parent.
+    ///
+    /// **Purpose and Context:**
+    /// Filter pushdown is a critical optimization in DataFusion that aims to
+    /// reduce the amount of data processed by applying filters as early as
+    /// possible in the query plan. This method is part of the second phase of
+    /// filter pushdown, where results are propagated back up the tree after
+    /// being pushed down. Each node can inspect the pushdown results from its
+    /// children and decide how to handle any unapplied filters, potentially
+    /// optimizing the plan structure or filter application.
+    ///
+    /// **Behavior in Different Nodes:**
+    /// - For a `DataSourceExec`, this often means absorbing the filters to apply
+    ///   them during the scan phase (late materialization), reducing the data
+    ///   read from the source.
+    /// - A `FilterExec` may absorb any filters its children could not handle,
+    ///   combining them with its own predicate. If no filters remain (i.e., the
+    ///   predicate becomes trivially true), it may remove itself from the plan
+    ///   altogether. It typically marks parent filters as supported, indicating
+    ///   they have been handled.
+    /// - A `HashJoinExec` might ignore the pushdown result if filters need to
+    ///   be applied during the join operation. It passes the parent filters back
+    ///   up wrapped in [`FilterPushdownPropagation::if_any`], discarding
+    ///   any self-filters from children.
+    ///
+    /// **Example Walkthrough:**
+    /// Consider a query plan: `FilterExec (f1) -> HashJoinExec -> DataSourceExec`.
+    /// 1. **Downward Phase (`gather_filters_for_pushdown`):** Starting at
+    ///    `FilterExec`, the filter `f1` is gathered and pushed down to
+    ///    `HashJoinExec`. `HashJoinExec` may allow `f1` to pass to one side of
+    ///    the join or add its own filters (e.g., a min-max filter from the build side),
+    ///    then pushes filters to `DataSourceExec`. `DataSourceExec`, being a leaf node,
+    ///    has no children to push to, so it prepares to handle filters in the
+    ///    upward phase.
+    /// 2. **Upward Phase (`handle_child_pushdown_result`):** Starting at
+    ///    `DataSourceExec`, it absorbs applicable filters from `HashJoinExec`
+    ///    for late materialization during scanning, marking them as supported.
+    ///    `HashJoinExec` receives the result, decides whether to apply any
+    ///    remaining filters during the join, and passes unhandled filters back
+    ///    up to `FilterExec`. `FilterExec` absorbs any unhandled filters,
+    ///    updates its predicate if necessary, or removes itself if the predicate
+    ///    becomes trivial (e.g., `lit(true)`), and marks filters as supported
+    ///    for its parent.
+    ///
+    /// The default implementation is a no-op that passes the result of pushdown
+    /// from the children to its parent transparently, ensuring no filters are
+    /// lost if a node does not override this behavior.
+    ///
+    /// **Notes for Implementation:**
+    /// When returning filters via [`FilterPushdownPropagation`], the order of
+    /// filters need not match the order they were passed in via
+    /// `child_pushdown_result`. However, preserving the order is recommended for
+    /// debugging and ease of reasoning about the resulting plans.
+    ///
+    /// **Helper Methods for Customization:**
+    /// There are various helper methods to simplify implementing this method:
+    /// - [`FilterPushdownPropagation::if_any`]: Marks all parent filters as
+    ///   supported as long as at least one child supports them.
+    /// - [`FilterPushdownPropagation::if_all`]: Marks all parent filters as
+    ///   supported as long as all children support them.
+    /// - [`FilterPushdownPropagation::with_parent_pushdown_result`]: Allows adding filters
+    ///   to the propagation result, indicating which filters are supported by
+    ///   the current node.
+    /// - [`FilterPushdownPropagation::with_updated_node`]: Allows updating the
+    ///   current node in the propagation result, used if the node
+    ///   has modified its plan based on the pushdown results.
+    ///
+    /// **Filter Pushdown Phases:**
+    /// There are two different phases in filter pushdown (`Pre` and others),
+    /// which some operators may handle differently. Depending on the phase, the
+    /// operator may or may not be allowed to modify the plan. See
+    /// [`FilterPushdownPhase`] for more details on phase-specific behavior.
+    ///
+    /// [`PushedDownPredicate::supported`]: crate::filter_pushdown::PushedDownPredicate::supported
     fn handle_child_pushdown_result(
         &self,
+        _phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
-        Ok(FilterPushdownPropagation::transparent(
-            child_pushdown_result,
-        ))
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    /// Injects arbitrary run-time state into this execution plan, returning a new plan
+    /// instance that incorporates that state *if* it is relevant to the concrete
+    /// node implementation.
+    ///
+    /// This is a generic entry point: the `state` can be any type wrapped in
+    /// `Arc<dyn Any + Send + Sync>`.  A node that cares about the state should
+    /// down-cast it to the concrete type it expects and, if successful, return a
+    /// modified copy of itself that captures the provided value.  If the state is
+    /// not applicable, the default behaviour is to return `None` so that parent
+    /// nodes can continue propagating the attempt further down the plan tree.
+    ///
+    /// For example, [`WorkTableExec`](crate::work_table::WorkTableExec)
+    /// down-casts the supplied state to an `Arc<WorkTable>`
+    /// in order to wire up the working table used during recursive-CTE execution.
+    /// Similar patterns can be followed by custom nodes that need late-bound
+    /// dependencies or shared state.
+    fn with_new_state(
+        &self,
+        _state: Arc<dyn Any + Send + Sync>,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        None
+    }
+
+    /// Try to push down sort ordering requirements to this node.
+    ///
+    /// This method is called during sort pushdown optimization to determine if this
+    /// node can optimize for a requested sort ordering. Implementations should:
+    ///
+    /// - Return [`SortOrderPushdownResult::Exact`] if the node can guarantee the exact
+    ///   ordering (allowing the Sort operator to be removed)
+    /// - Return [`SortOrderPushdownResult::Inexact`] if the node can optimize for the
+    ///   ordering but cannot guarantee perfect sorting (Sort operator is kept)
+    /// - Return [`SortOrderPushdownResult::Unsupported`] if the node cannot optimize
+    ///   for the ordering
+    ///
+    /// For transparent nodes (that preserve ordering), implement this to delegate to
+    /// children and wrap the result with a new instance of this node.
+    ///
+    /// Default implementation returns `Unsupported`.
+    fn try_pushdown_sort(
+        &self,
+        _order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        Ok(SortOrderPushdownResult::Unsupported)
+    }
+
+    /// Returns a variant of this `ExecutionPlan` that is aware of order-sensitivity.
+    ///
+    /// This is used to signal to data sources that the output ordering must be
+    /// preserved, even if it might be more efficient to ignore it (e.g. by
+    /// skipping some row groups in Parquet).
+    ///
+    fn with_preserve_order(
+        &self,
+        _preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        None
     }
 }
 
@@ -663,7 +910,7 @@ impl ExecutionPlanProperties for &dyn ExecutionPlan {
 /// For unbounded streams, it also tracks whether the operator requires finite memory
 /// to process the stream or if memory usage could grow unbounded.
 ///
-/// Boundedness of the output stream is based on the the boundedness of the input stream and the nature of
+/// Boundedness of the output stream is based on the boundedness of the input stream and the nature of
 /// the operator. For example, limit or topk with fetch operator can convert an unbounded stream to a bounded stream.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Boundedness {
@@ -720,6 +967,49 @@ pub enum EmissionType {
     Both,
 }
 
+/// Represents whether an operator's `Stream` has been implemented to actively cooperate with the
+/// Tokio scheduler or not. Please refer to the [`coop`](crate::coop) module for more details.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SchedulingType {
+    /// The stream generated by [`execute`](ExecutionPlan::execute) does not actively participate in
+    /// cooperative scheduling. This means the implementation of the `Stream` returned by
+    /// [`ExecutionPlan::execute`] does not contain explicit task budget consumption such as
+    /// [`tokio::task::coop::consume_budget`].
+    ///
+    /// `NonCooperative` is the default value and is acceptable for most operators. Please refer to
+    /// the [`coop`](crate::coop) module for details on when it may be useful to use
+    /// `Cooperative` instead.
+    NonCooperative,
+    /// The stream generated by [`execute`](ExecutionPlan::execute) actively participates in
+    /// cooperative scheduling by consuming task budget when it was able to produce a
+    /// [`RecordBatch`].
+    Cooperative,
+}
+
+/// Represents how an operator's `Stream` implementation generates `RecordBatch`es.
+///
+/// Most operators in DataFusion generate `RecordBatch`es when asked to do so by a call to
+/// `Stream::poll_next`. This is known as demand-driven or lazy evaluation.
+///
+/// Some operators like `Repartition` need to drive `RecordBatch` generation themselves though. This
+/// is known as data-driven or eager evaluation.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum EvaluationType {
+    /// The stream generated by [`execute`](ExecutionPlan::execute) only generates `RecordBatch`
+    /// instances when it is demanded by invoking `Stream::poll_next`.
+    /// Filter, projection, and join are examples of such lazy operators.
+    ///
+    /// Lazy operators are also known as demand-driven operators.
+    Lazy,
+    /// The stream generated by [`execute`](ExecutionPlan::execute) eagerly generates `RecordBatch`
+    /// in one or more spawned Tokio tasks. Eager evaluation is only started the first time
+    /// `Stream::poll_next` is called.
+    /// Examples of eager operators are repartition, coalesce partitions, and sort preserving merge.
+    ///
+    /// Eager operators are also known as a data-driven operators.
+    Eager,
+}
+
 /// Utility to determine an operator's boundedness based on its children's boundedness.
 ///
 /// Assumes boundedness can be inferred from child operators:
@@ -742,7 +1032,7 @@ pub(crate) fn boundedness_from_children<'a>(
             } => {
                 return Boundedness::Unbounded {
                     requires_infinite_memory: true,
-                }
+                };
             }
             Boundedness::Unbounded {
                 requires_infinite_memory: false,
@@ -808,6 +1098,8 @@ pub struct PlanProperties {
     pub emission_type: EmissionType,
     /// See [ExecutionPlanProperties::boundedness]
     pub boundedness: Boundedness,
+    pub evaluation_type: EvaluationType,
+    pub scheduling_type: SchedulingType,
     /// See [ExecutionPlanProperties::output_ordering]
     output_ordering: Option<LexOrdering>,
 }
@@ -827,6 +1119,8 @@ impl PlanProperties {
             partitioning,
             emission_type,
             boundedness,
+            evaluation_type: EvaluationType::Lazy,
+            scheduling_type: SchedulingType::NonCooperative,
             output_ordering,
         }
     }
@@ -837,12 +1131,17 @@ impl PlanProperties {
         self
     }
 
-    /// Overwrite equivalence properties with its new value.
-    pub fn with_eq_properties(mut self, eq_properties: EquivalenceProperties) -> Self {
+    /// Set equivalence properties having mut reference.
+    pub fn set_eq_properties(&mut self, eq_properties: EquivalenceProperties) {
         // Changing equivalence properties also changes output ordering, so
         // make sure to overwrite it:
         self.output_ordering = eq_properties.output_ordering();
         self.eq_properties = eq_properties;
+    }
+
+    /// Overwrite equivalence properties with its new value.
+    pub fn with_eq_properties(mut self, eq_properties: EquivalenceProperties) -> Self {
+        self.set_eq_properties(eq_properties);
         self
     }
 
@@ -858,9 +1157,30 @@ impl PlanProperties {
         self
     }
 
+    /// Set the [`SchedulingType`].
+    ///
+    /// Defaults to [`SchedulingType::NonCooperative`]
+    pub fn with_scheduling_type(mut self, scheduling_type: SchedulingType) -> Self {
+        self.scheduling_type = scheduling_type;
+        self
+    }
+
+    /// Set the [`EvaluationType`].
+    ///
+    /// Defaults to [`EvaluationType::Lazy`]
+    pub fn with_evaluation_type(mut self, drive_type: EvaluationType) -> Self {
+        self.evaluation_type = drive_type;
+        self
+    }
+
+    /// Set constraints having mut reference.
+    pub fn set_constraints(&mut self, constraints: Constraints) {
+        self.eq_properties.set_constraints(constraints);
+    }
+
     /// Overwrite constraints with its new value.
     pub fn with_constraints(mut self, constraints: Constraints) -> Self {
-        self.eq_properties = self.eq_properties.with_constraints(constraints);
+        self.set_constraints(constraints);
         self
     }
 
@@ -882,32 +1202,46 @@ impl PlanProperties {
     }
 }
 
+macro_rules! check_len {
+    ($target:expr, $func_name:ident, $expected_len:expr) => {
+        let actual_len = $target.$func_name().len();
+        assert_eq_or_internal_err!(
+            actual_len,
+            $expected_len,
+            "{}::{} returned Vec with incorrect size: {} != {}",
+            $target.name(),
+            stringify!($func_name),
+            actual_len,
+            $expected_len
+        );
+    };
+}
+
+/// Checks a set of invariants that apply to all ExecutionPlan implementations.
+/// Returns an error if the given node does not conform.
+pub fn check_default_invariants<P: ExecutionPlan + ?Sized>(
+    plan: &P,
+    _check: InvariantLevel,
+) -> Result<(), DataFusionError> {
+    let children_len = plan.children().len();
+
+    check_len!(plan, maintains_input_order, children_len);
+    check_len!(plan, required_input_ordering, children_len);
+    check_len!(plan, required_input_distribution, children_len);
+    check_len!(plan, benefits_from_input_partitioning, children_len);
+
+    Ok(())
+}
+
 /// Indicate whether a data exchange is needed for the input of `plan`, which will be very helpful
 /// especially for the distributed engine to judge whether need to deal with shuffling.
-/// Currently there are 3 kinds of execution plan which needs data exchange
+/// Currently, there are 3 kinds of execution plan which needs data exchange
 ///     1. RepartitionExec for changing the partition number between two `ExecutionPlan`s
 ///     2. CoalescePartitionsExec for collapsing all of the partitions into one without ordering guarantee
 ///     3. SortPreservingMergeExec for collapsing all of the sorted partitions into one with ordering guarantee
+#[expect(clippy::needless_pass_by_value)]
 pub fn need_data_exchange(plan: Arc<dyn ExecutionPlan>) -> bool {
-    if let Some(repartition) = plan.as_any().downcast_ref::<RepartitionExec>() {
-        !matches!(
-            repartition.properties().output_partitioning(),
-            Partitioning::RoundRobinBatch(_)
-        )
-    } else if let Some(coalesce) = plan.as_any().downcast_ref::<CoalescePartitionsExec>()
-    {
-        coalesce.input().output_partitioning().partition_count() > 1
-    } else if let Some(sort_preserving_merge) =
-        plan.as_any().downcast_ref::<SortPreservingMergeExec>()
-    {
-        sort_preserving_merge
-            .input()
-            .output_partitioning()
-            .partition_count()
-            > 1
-    } else {
-        false
-    }
+    plan.properties().evaluation_type == EvaluationType::Eager
 }
 
 /// Returns a copy of this plan if we change any child according to the pointer comparison.
@@ -917,9 +1251,12 @@ pub fn with_new_children_if_necessary(
     children: Vec<Arc<dyn ExecutionPlan>>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let old_children = plan.children();
-    if children.len() != old_children.len() {
-        internal_err!("Wrong number of children")
-    } else if children.is_empty()
+    assert_eq_or_internal_err!(
+        children.len(),
+        old_children.len(),
+        "Wrong number of children"
+    );
+    if children.is_empty()
         || children
             .iter()
             .zip(old_children.iter())
@@ -957,6 +1294,10 @@ pub async fn collect(
 ///
 /// Dropping the stream will abort the execution of the query, and free up
 /// any allocated resources
+#[expect(
+    clippy::needless_pass_by_value,
+    reason = "Public API that historically takes owned Arcs"
+)]
 pub fn execute_stream(
     plan: Arc<dyn ExecutionPlan>,
     context: Arc<TaskContext>,
@@ -1021,6 +1362,10 @@ pub async fn collect_partitioned(
 ///
 /// Dropping the stream will abort the execution of the query, and free up
 /// any allocated resources
+#[expect(
+    clippy::needless_pass_by_value,
+    reason = "Public API that historically takes owned Arcs"
+)]
 pub fn execute_stream_partitioned(
     plan: Arc<dyn ExecutionPlan>,
     context: Arc<TaskContext>,
@@ -1052,6 +1397,10 @@ pub fn execute_stream_partitioned(
 /// violate the `not null` constraints specified in the `sink_schema`. If there are
 /// such columns, it wraps the resulting stream to enforce the `not null` constraints
 /// by invoking the [`check_not_null_constraints`] function on each batch of the stream.
+#[expect(
+    clippy::needless_pass_by_value,
+    reason = "Public API that historically takes owned Arcs"
+)]
 pub fn execute_input_stream(
     input: Arc<dyn ExecutionPlan>,
     sink_schema: SchemaRef,
@@ -1130,11 +1479,73 @@ pub fn check_not_null_constraints(
     Ok(batch)
 }
 
+/// Make plan ready to be re-executed returning its clone with state reset for all nodes.
+///
+/// Some plans will change their internal states after execution, making them unable to be executed again.
+/// This function uses [`ExecutionPlan::reset_state`] to reset any internal state within the plan.
+///
+/// An example is `CrossJoinExec`, which loads the left table into memory and stores it in the plan.
+/// However, if the data of the left table is derived from the work table, it will become outdated
+/// as the work table changes. When the next iteration executes this plan again, we must clear the left table.
+///
+/// # Limitations
+///
+/// While this function enables plan reuse, it does not allow the same plan to be executed if it (OR):
+///
+/// * uses dynamic filters,
+/// * represents a recursive query.
+///
+pub fn reset_plan_states(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
+    plan.transform_up(|plan| {
+        let new_plan = Arc::clone(&plan).reset_state()?;
+        Ok(Transformed::yes(new_plan))
+    })
+    .data()
+}
+
+/// Check if the `plan` children has the same properties as passed `children`.
+/// In this case plan can avoid self properties re-computation when its children
+/// replace is requested.
+/// The size of `children` must be equal to the size of `ExecutionPlan::children()`.
+pub fn has_same_children_properties(
+    plan: &impl ExecutionPlan,
+    children: &[Arc<dyn ExecutionPlan>],
+) -> Result<bool> {
+    let old_children = plan.children();
+    assert_eq_or_internal_err!(
+        children.len(),
+        old_children.len(),
+        "Wrong number of children"
+    );
+    for (lhs, rhs) in old_children.iter().zip(children.iter()) {
+        if !Arc::ptr_eq(lhs.properties(), rhs.properties()) {
+            return Ok(false);
+        }
+    }
+    Ok(true)
+}
+
+/// Helper macro to avoid properties re-computation if passed children properties
+/// the same as plan already has. Could be used to implement fast-path for method
+/// [`ExecutionPlan::with_new_children`].
+#[macro_export]
+macro_rules! check_if_same_properties {
+    ($plan: expr, $children: expr) => {
+        if $crate::execution_plan::has_same_children_properties(
+            $plan.as_ref(),
+            &$children,
+        )? {
+            let plan = $plan.with_new_children_and_same_properties($children);
+            return Ok(::std::sync::Arc::new(plan));
+        }
+    };
+}
+
 /// Utility function yielding a string representation of the given [`ExecutionPlan`].
 pub fn get_plan_string(plan: &Arc<dyn ExecutionPlan>) -> Vec<String> {
     let formatted = displayable(plan.as_ref()).indent(true).to_string();
     let actual: Vec<&str> = formatted.trim().lines().collect();
-    actual.iter().map(|elem| elem.to_string()).collect()
+    actual.iter().map(|elem| (*elem).to_string()).collect()
 }
 
 /// Indicates the effect an execution plan operator will have on the cardinality
@@ -1151,19 +1562,33 @@ pub enum CardinalityEffect {
     GreaterEqual,
 }
 
+/// Can be used in contexts where properties have not yet been initialized properly.
+pub(crate) fn stub_properties() -> Arc<PlanProperties> {
+    static STUB_PROPERTIES: LazyLock<Arc<PlanProperties>> = LazyLock::new(|| {
+        Arc::new(PlanProperties::new(
+            EquivalenceProperties::new(Arc::new(Schema::empty())),
+            Partitioning::UnknownPartitioning(1),
+            EmissionType::Final,
+            Boundedness::Bounded,
+        ))
+    });
+
+    Arc::clone(&STUB_PROPERTIES)
+}
+
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use arrow::array::{DictionaryArray, Int32Array, NullArray, RunArray};
-    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use std::any::Any;
     use std::sync::Arc;
 
+    use super::*;
+    use crate::{DisplayAs, DisplayFormatType, ExecutionPlan};
+
+    use arrow::array::{DictionaryArray, Int32Array, NullArray, RunArray};
+    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_common::{Result, Statistics};
     use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 
-    use crate::{DisplayAs, DisplayFormatType, ExecutionPlan};
-
     #[derive(Debug)]
     pub struct EmptyExec;
 
@@ -1192,7 +1617,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
 
@@ -1207,6 +1632,13 @@ mod tests {
             unimplemented!()
         }
 
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
+
         fn execute(
             &self,
             _partition: usize,
@@ -1215,11 +1647,10 @@ mod tests {
             unimplemented!()
         }
 
-        fn statistics(&self) -> Result<Statistics> {
-            unimplemented!()
-        }
-
-        fn partition_statistics(&self, _partition: Option<usize>) -> Result<Statistics> {
+        fn partition_statistics(
+            &self,
+            _partition: Option<usize>,
+        ) -> Result<Arc<Statistics>> {
             unimplemented!()
         }
     }
@@ -1259,7 +1690,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             unimplemented!()
         }
 
@@ -1267,6 +1698,13 @@ mod tests {
             vec![]
         }
 
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
+
         fn with_new_children(
             self: Arc<Self>,
             _: Vec<Arc<dyn ExecutionPlan>>,
@@ -1282,13 +1720,119 @@ mod tests {
             unimplemented!()
         }
 
-        fn statistics(&self) -> Result<Statistics> {
+        fn partition_statistics(
+            &self,
+            _partition: Option<usize>,
+        ) -> Result<Arc<Statistics>> {
+            unimplemented!()
+        }
+    }
+
+    /// A test node that holds a fixed list of expressions, used to test
+    /// `apply_expressions` behavior.
+    #[derive(Debug)]
+    struct MultiExprExec {
+        exprs: Vec<Arc<dyn PhysicalExpr>>,
+    }
+
+    impl DisplayAs for MultiExprExec {
+        fn fmt_as(
+            &self,
+            _t: DisplayFormatType,
+            _f: &mut std::fmt::Formatter,
+        ) -> std::fmt::Result {
+            unimplemented!()
+        }
+    }
+
+    impl ExecutionPlan for MultiExprExec {
+        fn name(&self) -> &'static str {
+            "MultiExprExec"
+        }
+
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn properties(&self) -> &std::sync::Arc<PlanProperties> {
             unimplemented!()
         }
 
-        fn partition_statistics(&self, _partition: Option<usize>) -> Result<Statistics> {
+        fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+            vec![]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            _: Vec<Arc<dyn ExecutionPlan>>,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
             unimplemented!()
         }
+
+        fn apply_expressions(
+            &self,
+            f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            let mut tnr = TreeNodeRecursion::Continue;
+            for expr in &self.exprs {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+            Ok(tnr)
+        }
+
+        fn execute(
+            &self,
+            _partition: usize,
+            _context: Arc<TaskContext>,
+        ) -> Result<SendableRecordBatchStream> {
+            unimplemented!()
+        }
+
+        fn partition_statistics(
+            &self,
+            _partition: Option<usize>,
+        ) -> Result<Arc<Statistics>> {
+            unimplemented!()
+        }
+    }
+
+    /// Returns a simple literal `Arc<dyn PhysicalExpr>` for use in tests.
+    fn lit_expr(val: i64) -> Arc<dyn PhysicalExpr> {
+        use datafusion_physical_expr::expressions::Literal;
+        Arc::new(Literal::new(datafusion_common::ScalarValue::Int64(Some(
+            val,
+        ))))
+    }
+
+    /// `apply_expressions` visits all expressions when `f` always returns `Continue`.
+    #[test]
+    fn test_apply_expressions_continue_visits_all() -> Result<()> {
+        let plan = MultiExprExec {
+            exprs: vec![lit_expr(1), lit_expr(2), lit_expr(3)],
+        };
+        let mut visited = 0usize;
+        plan.apply_expressions(&mut |_expr| {
+            visited += 1;
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+        assert_eq!(visited, 3);
+        Ok(())
+    }
+
+    #[test]
+    fn test_apply_expressions_stop_halts_early() -> Result<()> {
+        let plan = MultiExprExec {
+            exprs: vec![lit_expr(1), lit_expr(2), lit_expr(3)],
+        };
+        let mut visited = 0usize;
+        let tnr = plan.apply_expressions(&mut |_expr| {
+            visited += 1;
+            Ok(TreeNodeRecursion::Stop)
+        })?;
+        // Only the first expression is visited; the rest are skipped.
+        assert_eq!(visited, 1);
+        assert_eq!(tnr, TreeNodeRecursion::Stop);
+        Ok(())
     }
 
     #[test]
@@ -1306,7 +1850,7 @@ mod tests {
     /// A compilation test to ensure that the `ExecutionPlan::name()` method can
     /// be called from a trait object.
     /// Related ticket: https://github.com/apache/datafusion/pull/11047
-    #[allow(dead_code)]
+    #[expect(unused)]
     fn use_execution_plan_as_trait_object(plan: &dyn ExecutionPlan) {
         let _ = plan.name();
     }
diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs
index bf488ccfae56a..fa684f3483a83 100644
--- a/datafusion/physical-plan/src/explain.rs
+++ b/datafusion/physical-plan/src/explain.rs
@@ -27,9 +27,10 @@ use crate::{DisplayFormatType, ExecutionPlan, Partitioning};
 
 use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch};
 use datafusion_common::display::StringifiedPlan;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
 
 use log::trace;
 
@@ -44,7 +45,7 @@ pub struct ExplainExec {
     stringified_plans: Vec<StringifiedPlan>,
     /// control which plans to print
     verbose: bool,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl ExplainExec {
@@ -59,7 +60,7 @@ impl ExplainExec {
             schema,
             stringified_plans,
             verbose,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -112,7 +113,7 @@ impl ExecutionPlan for ExplainExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -121,6 +122,13 @@ impl ExecutionPlan for ExplainExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -133,10 +141,17 @@ impl ExecutionPlan for ExplainExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start ExplainExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
-        if 0 != partition {
-            return internal_err!("ExplainExec invalid partition {partition}");
-        }
+        trace!(
+            "Start ExplainExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "ExplainExec invalid partition {partition}"
+        );
         let mut type_builder =
             StringBuilder::with_capacity(self.stringified_plans.len(), 1024);
         let mut plan_builder =
@@ -172,7 +187,11 @@ impl ExecutionPlan for ExplainExec {
         )?;
 
         trace!(
-            "Before returning RecordBatchStream in ExplainExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+            "Before returning RecordBatchStream in ExplainExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
 
         Ok(Box::pin(RecordBatchStreamAdapter::new(
             Arc::clone(&self.schema),
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 13129e382decd..141d9c38469d8 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -16,27 +16,33 @@
 // under the License.
 
 use std::any::Any;
-use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::Arc;
-use std::task::{ready, Context, Poll};
+use std::task::{Context, Poll, ready};
+
+use datafusion_physical_expr::projection::{ProjectionRef, combine_projections};
+use itertools::Itertools;
 
 use super::{
     ColumnStatistics, DisplayAs, ExecutionPlanProperties, PlanProperties,
     RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
+use crate::check_if_same_properties;
+use crate::coalesce::{LimitedBatchCoalescer, PushBatchStatus};
 use crate::common::can_project;
 use crate::execution_plan::CardinalityEffect;
 use crate::filter_pushdown::{
-    ChildPushdownResult, FilterDescription, FilterPushdownPropagation,
+    ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation, PushedDown,
 };
+use crate::metrics::{MetricBuilder, MetricType};
 use crate::projection::{
-    make_with_child, try_embed_projection, update_expr, EmbeddedProjection,
-    ProjectionExec,
+    EmbeddedProjection, ProjectionExec, ProjectionExpr, make_with_child,
+    try_embed_projection, update_expr,
 };
 use crate::{
-    metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
     DisplayFormatType, ExecutionPlan,
+    metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RatioMetrics},
 };
 
 use arrow::compute::filter_record_batch;
@@ -45,29 +51,27 @@ use arrow::record_batch::RecordBatch;
 use datafusion_common::cast::as_boolean_array;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::stats::Precision;
-use datafusion_common::tree_node::{
-    Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
-};
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{
-    internal_err, plan_err, project_schema, DataFusionError, Result, ScalarValue,
+    DataFusionError, Result, ScalarValue, internal_err, plan_err, project_schema,
 };
 use datafusion_execution::TaskContext;
 use datafusion_expr::Operator;
 use datafusion_physical_expr::equivalence::ProjectionMapping;
-use datafusion_physical_expr::expressions::{lit, BinaryExpr, Column};
+use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal, lit};
 use datafusion_physical_expr::intervals::utils::check_support;
-use datafusion_physical_expr::utils::collect_columns;
+use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns};
 use datafusion_physical_expr::{
-    analyze, conjunction, split_conjunction, AcrossPartitions, AnalysisContext,
-    ConstExpr, ExprBoundaries, PhysicalExpr,
+    AcrossPartitions, AnalysisContext, ConstExpr, EquivalenceProperties, ExprBoundaries,
+    PhysicalExpr, analyze, conjunction, split_conjunction,
 };
 
 use datafusion_physical_expr_common::physical_expr::fmt_sql;
 use futures::stream::{Stream, StreamExt};
-use itertools::Itertools;
 use log::trace;
 
 const FILTER_EXEC_DEFAULT_SELECTIVITY: u8 = 20;
+const FILTER_EXEC_DEFAULT_BATCH_SIZE: usize = 8192;
 
 /// FilterExec evaluates a boolean predicate against all input batches to determine which rows to
 /// include in its output batches.
@@ -82,41 +86,168 @@ pub struct FilterExec {
     /// Selectivity for statistics. 0 = no rows, 100 = all rows
     default_selectivity: u8,
     /// Properties equivalence properties, partitioning, etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     /// The projection indices of the columns in the output schema of join
-    projection: Option<Vec<usize>>,
+    projection: Option<ProjectionRef>,
+    /// Target batch size for output batches
+    batch_size: usize,
+    /// Number of rows to fetch
+    fetch: Option<usize>,
+}
+
+/// Builder for [`FilterExec`] to set optional parameters
+pub struct FilterExecBuilder {
+    predicate: Arc<dyn PhysicalExpr>,
+    input: Arc<dyn ExecutionPlan>,
+    projection: Option<ProjectionRef>,
+    default_selectivity: u8,
+    batch_size: usize,
+    fetch: Option<usize>,
+}
+
+impl FilterExecBuilder {
+    /// Create a new builder with required parameters (predicate and input)
+    pub fn new(predicate: Arc<dyn PhysicalExpr>, input: Arc<dyn ExecutionPlan>) -> Self {
+        Self {
+            predicate,
+            input,
+            projection: None,
+            default_selectivity: FILTER_EXEC_DEFAULT_SELECTIVITY,
+            batch_size: FILTER_EXEC_DEFAULT_BATCH_SIZE,
+            fetch: None,
+        }
+    }
+
+    /// Set the input execution plan
+    pub fn with_input(mut self, input: Arc<dyn ExecutionPlan>) -> Self {
+        self.input = input;
+        self
+    }
+
+    /// Set the predicate expression
+    pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
+        self.predicate = predicate;
+        self
+    }
+
+    /// Set the projection, composing with any existing projection.
+    ///
+    /// If a projection is already set, the new projection indices are mapped
+    /// through the existing projection. For example, if the current projection
+    /// is `[0, 2, 3]` and `apply_projection(Some(vec![0, 2]))` is called, the
+    /// resulting projection will be `[0, 3]` (indices 0 and 2 of `[0, 2, 3]`).
+    ///
+    /// If no projection is currently set, the new projection is used directly.
+    /// If `None` is passed, the projection is cleared.
+    pub fn apply_projection(self, projection: Option<Vec<usize>>) -> Result<Self> {
+        let projection = projection.map(Into::into);
+        self.apply_projection_by_ref(projection.as_ref())
+    }
+
+    /// The same as [`Self::apply_projection`] but takes projection shared reference.
+    pub fn apply_projection_by_ref(
+        mut self,
+        projection: Option<&ProjectionRef>,
+    ) -> Result<Self> {
+        // Check if the projection is valid against current output schema
+        can_project(&self.input.schema(), projection.map(AsRef::as_ref))?;
+        self.projection = combine_projections(projection, self.projection.as_ref())?;
+        Ok(self)
+    }
+
+    /// Set the default selectivity
+    pub fn with_default_selectivity(mut self, default_selectivity: u8) -> Self {
+        self.default_selectivity = default_selectivity;
+        self
+    }
+
+    /// Set the batch size
+    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
+        self.batch_size = batch_size;
+        self
+    }
+
+    /// Set the fetch limit
+    pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
+        self.fetch = fetch;
+        self
+    }
+
+    /// Build the FilterExec, computing properties once with all configured parameters
+    pub fn build(self) -> Result<FilterExec> {
+        // Validate predicate type
+        match self.predicate.data_type(self.input.schema().as_ref())? {
+            DataType::Boolean => {}
+            other => {
+                return plan_err!(
+                    "Filter predicate must return BOOLEAN values, got {other:?}"
+                );
+            }
+        }
+
+        // Validate selectivity
+        if self.default_selectivity > 100 {
+            return plan_err!(
+                "Default filter selectivity value needs to be less than or equal to 100"
+            );
+        }
+
+        // Validate projection if provided
+        can_project(&self.input.schema(), self.projection.as_deref())?;
+
+        // Compute properties once with all parameters
+        let cache = FilterExec::compute_properties(
+            &self.input,
+            &self.predicate,
+            self.default_selectivity,
+            self.projection.as_deref(),
+        )?;
+
+        Ok(FilterExec {
+            predicate: self.predicate,
+            input: self.input,
+            metrics: ExecutionPlanMetricsSet::new(),
+            default_selectivity: self.default_selectivity,
+            cache: Arc::new(cache),
+            projection: self.projection,
+            batch_size: self.batch_size,
+            fetch: self.fetch,
+        })
+    }
+}
+
+impl From<&FilterExec> for FilterExecBuilder {
+    fn from(exec: &FilterExec) -> Self {
+        Self {
+            predicate: Arc::clone(&exec.predicate),
+            input: Arc::clone(&exec.input),
+            projection: exec.projection.clone(),
+            default_selectivity: exec.default_selectivity,
+            batch_size: exec.batch_size,
+            fetch: exec.fetch,
+            // We could cache / copy over PlanProperties
+            // here but that would require invalidating them in FilterExecBuilder::apply_projection, etc.
+            // and currently every call to this method ends up invalidating them anyway.
+            // If useful this can be added in the future as a non-breaking change.
+        }
+    }
 }
 
 impl FilterExec {
-    /// Create a FilterExec on an input
+    /// Create a FilterExec on an input using the builder pattern
     pub fn try_new(
         predicate: Arc<dyn PhysicalExpr>,
         input: Arc<dyn ExecutionPlan>,
     ) -> Result<Self> {
-        match predicate.data_type(input.schema().as_ref())? {
-            DataType::Boolean => {
-                let default_selectivity = FILTER_EXEC_DEFAULT_SELECTIVITY;
-                let cache = Self::compute_properties(
-                    &input,
-                    &predicate,
-                    default_selectivity,
-                    None,
-                )?;
-                Ok(Self {
-                    predicate,
-                    input: Arc::clone(&input),
-                    metrics: ExecutionPlanMetricsSet::new(),
-                    default_selectivity,
-                    cache,
-                    projection: None,
-                })
-            }
-            other => {
-                plan_err!("Filter predicate must return BOOLEAN values, got {other:?}")
-            }
-        }
+        FilterExecBuilder::new(predicate, input).build()
+    }
+
+    /// Get a batch size
+    pub fn batch_size(&self) -> usize {
+        self.batch_size
     }
 
+    /// Set the default selectivity
     pub fn with_default_selectivity(
         mut self,
         default_selectivity: u8,
@@ -131,31 +262,29 @@ impl FilterExec {
     }
 
     /// Return new instance of [FilterExec] with the given projection.
+    ///
+    /// # Deprecated
+    /// Use [`FilterExecBuilder::apply_projection`] instead
+    #[deprecated(
+        since = "52.0.0",
+        note = "Use FilterExecBuilder::apply_projection instead"
+    )]
     pub fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
-        //  Check if the projection is valid
-        can_project(&self.schema(), projection.as_ref())?;
-
-        let projection = match projection {
-            Some(projection) => match &self.projection {
-                Some(p) => Some(projection.iter().map(|i| p[*i]).collect()),
-                None => Some(projection),
-            },
-            None => None,
-        };
+        let builder = FilterExecBuilder::from(self);
+        builder.apply_projection(projection)?.build()
+    }
 
-        let cache = Self::compute_properties(
-            &self.input,
-            &self.predicate,
-            self.default_selectivity,
-            projection.as_ref(),
-        )?;
+    /// Set the batch size
+    pub fn with_batch_size(&self, batch_size: usize) -> Result<Self> {
         Ok(Self {
             predicate: Arc::clone(&self.predicate),
             input: Arc::clone(&self.input),
             metrics: self.metrics.clone(),
             default_selectivity: self.default_selectivity,
-            cache,
-            projection,
+            cache: Arc::clone(&self.cache),
+            projection: self.projection.clone(),
+            batch_size,
+            fetch: self.fetch,
         })
     }
 
@@ -175,18 +304,18 @@ impl FilterExec {
     }
 
     /// Projection
-    pub fn projection(&self) -> Option<&Vec<usize>> {
-        self.projection.as_ref()
+    pub fn projection(&self) -> &Option<ProjectionRef> {
+        &self.projection
     }
 
     /// Calculates `Statistics` for `FilterExec`, by applying selectivity (either default, or estimated) to input statistics.
     fn statistics_helper(
-        schema: SchemaRef,
+        schema: &SchemaRef,
         input_stats: Statistics,
         predicate: &Arc<dyn PhysicalExpr>,
         default_selectivity: u8,
     ) -> Result<Statistics> {
-        if !check_support(predicate, &schema) {
+        if !check_support(predicate, schema) {
             let selectivity = default_selectivity as f64 / 100.0;
             let mut stats = input_stats.to_inexact();
             stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity);
@@ -198,12 +327,10 @@ impl FilterExec {
 
         let num_rows = input_stats.num_rows;
         let total_byte_size = input_stats.total_byte_size;
-        let input_analysis_ctx = AnalysisContext::try_from_statistics(
-            &schema,
-            &input_stats.column_statistics,
-        )?;
+        let input_analysis_ctx =
+            AnalysisContext::try_from_statistics(schema, &input_stats.column_statistics)?;
 
-        let analysis_ctx = analyze(predicate, input_analysis_ctx, &schema)?;
+        let analysis_ctx = analyze(predicate, input_analysis_ctx, schema)?;
 
         // Estimate (inexact) selectivity of predicate
         let selectivity = analysis_ctx.selectivity.unwrap_or(1.0);
@@ -211,6 +338,7 @@ impl FilterExec {
         let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity);
 
         let column_statistics = collect_new_statistics(
+            schema,
             &input_stats.column_statistics,
             analysis_ctx.boundaries,
         );
@@ -221,6 +349,20 @@ impl FilterExec {
         })
     }
 
+    /// Returns the `AcrossPartitions` value for `expr` if it is constant:
+    /// either already known constant in `input_eqs`, or a `Literal`
+    /// (which is inherently constant across all partitions).
+    fn expr_constant_or_literal(
+        expr: &Arc<dyn PhysicalExpr>,
+        input_eqs: &EquivalenceProperties,
+    ) -> Option<AcrossPartitions> {
+        input_eqs.is_expr_constant(expr).or_else(|| {
+            expr.as_any()
+                .downcast_ref::<Literal>()
+                .map(|l| AcrossPartitions::Uniform(Some(l.value().clone())))
+        })
+    }
+
     fn extend_constants(
         input: &Arc<dyn ExecutionPlan>,
         predicate: &Arc<dyn PhysicalExpr>,
@@ -230,28 +372,27 @@ impl FilterExec {
 
         let conjunctions = split_conjunction(predicate);
         for conjunction in conjunctions {
-            if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>() {
-                if binary.op() == &Operator::Eq {
-                    // Filter evaluates to single value for all partitions
-                    if input_eqs.is_expr_constant(binary.left()) {
-                        let (expr, across_parts) = (
-                            binary.right(),
-                            input_eqs.get_expr_constant_value(binary.right()),
-                        );
-                        res_constants.push(
-                            ConstExpr::new(Arc::clone(expr))
-                                .with_across_partitions(across_parts),
-                        );
-                    } else if input_eqs.is_expr_constant(binary.right()) {
-                        let (expr, across_parts) = (
-                            binary.left(),
-                            input_eqs.get_expr_constant_value(binary.left()),
-                        );
-                        res_constants.push(
-                            ConstExpr::new(Arc::clone(expr))
-                                .with_across_partitions(across_parts),
-                        );
-                    }
+            if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>()
+                && binary.op() == &Operator::Eq
+            {
+                // Check if either side is constant — either already known
+                // constant from the input equivalence properties, or a literal
+                // value (which is inherently constant across all partitions).
+                let left_const = Self::expr_constant_or_literal(binary.left(), input_eqs);
+                let right_const =
+                    Self::expr_constant_or_literal(binary.right(), input_eqs);
+
+                if let Some(left_across) = left_const {
+                    // LEFT is constant, so RIGHT must also be constant.
+                    // Use RIGHT's known across value if available, otherwise
+                    // propagate LEFT's (e.g. Uniform from a literal).
+                    let across = right_const.unwrap_or(left_across);
+                    res_constants
+                        .push(ConstExpr::new(Arc::clone(binary.right()), across));
+                } else if let Some(right_across) = right_const {
+                    // RIGHT is constant, so LEFT must also be constant.
+                    res_constants
+                        .push(ConstExpr::new(Arc::clone(binary.left()), right_across));
                 }
             }
         }
@@ -262,20 +403,21 @@ impl FilterExec {
         input: &Arc<dyn ExecutionPlan>,
         predicate: &Arc<dyn PhysicalExpr>,
         default_selectivity: u8,
-        projection: Option<&Vec<usize>>,
+        projection: Option<&[usize]>,
     ) -> Result<PlanProperties> {
         // Combine the equal predicates with the input equivalence properties
         // to construct the equivalence properties:
+        let schema = input.schema();
         let stats = Self::statistics_helper(
-            input.schema(),
-            input.partition_statistics(None)?,
+            &schema,
+            Arc::unwrap_or_clone(input.partition_statistics(None)?),
             predicate,
             default_selectivity,
         )?;
         let mut eq_properties = input.equivalence_properties().clone();
-        let (equal_pairs, _) = collect_columns_from_predicate(predicate);
+        let (equal_pairs, _) = collect_columns_from_predicate_inner(predicate);
         for (lhs, rhs) in equal_pairs {
-            eq_properties.add_equal_conditions(lhs, rhs)?
+            eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))?
         }
         // Add the columns that have only one viable value (singleton) after
         // filtering to constants.
@@ -287,22 +429,20 @@ impl FilterExec {
                     .min_value
                     .get_value();
                 let expr = Arc::new(column) as _;
-                ConstExpr::new(expr)
-                    .with_across_partitions(AcrossPartitions::Uniform(value.cloned()))
+                ConstExpr::new(expr, AcrossPartitions::Uniform(value.cloned()))
             });
         // This is for statistics
-        eq_properties = eq_properties.with_constants(constants);
+        eq_properties.add_constants(constants)?;
         // This is for logical constant (for example: a = '1', then a could be marked as a constant)
         // to do: how to deal with multiple situation to represent = (for example c1 between 0 and 0)
-        eq_properties =
-            eq_properties.with_constants(Self::extend_constants(input, predicate));
+        eq_properties.add_constants(Self::extend_constants(input, predicate))?;
 
         let mut output_partitioning = input.output_partitioning().clone();
         // If contains projection, update the PlanProperties.
         if let Some(projection) = projection {
             let schema = eq_properties.schema();
             let projection_mapping = ProjectionMapping::from_indices(projection, schema)?;
-            let out_schema = project_schema(schema, Some(projection))?;
+            let out_schema = project_schema(schema, Some(&projection))?;
             output_partitioning =
                 output_partitioning.project(&projection_mapping, &eq_properties);
             eq_properties = eq_properties.project(&projection_mapping, out_schema);
@@ -315,6 +455,17 @@ impl FilterExec {
             input.boundedness(),
         ))
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for FilterExec {
@@ -343,7 +494,14 @@ impl DisplayAs for FilterExec {
                 } else {
                     "".to_string()
                 };
-                write!(f, "FilterExec: {}{}", self.predicate, display_projections)
+                let fetch = self
+                    .fetch
+                    .map_or_else(|| "".to_string(), |f| format!(", fetch={f}"));
+                write!(
+                    f,
+                    "FilterExec: {}{}{}",
+                    self.predicate, display_projections, fetch
+                )
             }
             DisplayFormatType::TreeRender => {
                 write!(f, "predicate={}", fmt_sql(self.predicate.as_ref()))
@@ -362,7 +520,7 @@ impl ExecutionPlan for FilterExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -370,6 +528,13 @@ impl ExecutionPlan for FilterExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        f(self.predicate.as_ref())
+    }
+
     fn maintains_input_order(&self) -> Vec<bool> {
         // Tell optimizer this operator doesn't reorder its input
         vec![true]
@@ -379,12 +544,11 @@ impl ExecutionPlan for FilterExec {
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        FilterExec::try_new(Arc::clone(&self.predicate), children.swap_remove(0))
-            .and_then(|e| {
-                let selectivity = e.default_selectivity();
-                e.with_default_selectivity(selectivity)
-            })
-            .and_then(|e| e.with_projection(self.projection().cloned()))
+        check_if_same_properties!(self, children);
+        let new_input = children.swap_remove(0);
+        FilterExecBuilder::from(&*self)
+            .with_input(new_input)
+            .build()
             .map(|e| Arc::new(e) as _)
     }
 
@@ -393,14 +557,24 @@ impl ExecutionPlan for FilterExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start FilterExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
-        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
+        trace!(
+            "Start FilterExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
+        let metrics = FilterExecMetrics::new(&self.metrics, partition);
         Ok(Box::pin(FilterExecStream {
             schema: self.schema(),
             predicate: Arc::clone(&self.predicate),
             input: self.input.execute(partition, context)?,
-            baseline_metrics,
+            metrics,
             projection: self.projection.clone(),
+            batch_coalescer: LimitedBatchCoalescer::new(
+                self.schema(),
+                self.batch_size,
+                self.fetch,
+            ),
         }))
     }
 
@@ -410,19 +584,16 @@ impl ExecutionPlan for FilterExec {
 
     /// The output statistics of a filtering operation can be estimated if the
     /// predicate's selectivity value can be determined for the incoming data.
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        let input_stats = self.input.partition_statistics(partition)?;
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let input_stats =
+            Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
         let stats = Self::statistics_helper(
-            self.schema(),
+            &self.input.schema(),
             input_stats,
             self.predicate(),
             self.default_selectivity,
         )?;
-        Ok(stats.project(self.projection.as_ref()))
+        Ok(Arc::new(stats.project(self.projection.as_ref())))
     }
 
     fn cardinality_effect(&self) -> CardinalityEffect {
@@ -441,15 +612,11 @@ impl ExecutionPlan for FilterExec {
             if let Some(new_predicate) =
                 update_expr(self.predicate(), projection.expr(), false)?
             {
-                return FilterExec::try_new(
-                    new_predicate,
-                    make_with_child(projection, self.input())?,
-                )
-                .and_then(|e| {
-                    let selectivity = self.default_selectivity();
-                    e.with_default_selectivity(selectivity)
-                })
-                .map(|e| Some(Arc::new(e) as _));
+                return FilterExecBuilder::from(self)
+                    .with_input(make_with_child(projection, self.input())?)
+                    .with_predicate(new_predicate)
+                    .build()
+                    .map(|e| Some(Arc::new(e) as _));
             }
         }
         try_embed_projection(projection, self)
@@ -457,92 +624,90 @@ impl ExecutionPlan for FilterExec {
 
     fn gather_filters_for_pushdown(
         &self,
+        phase: FilterPushdownPhase,
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterDescription> {
-        let self_filter = split_conjunction(&self.predicate)
-            .into_iter()
-            .cloned()
-            .collect_vec();
+        if phase != FilterPushdownPhase::Pre {
+            let child =
+                ChildFilterDescription::from_child(&parent_filters, self.input())?;
+            return Ok(FilterDescription::new().with_child(child));
+        }
 
-        let parent_filters = if let Some(projection_indices) = self.projection.as_ref() {
-            // We need to invert the projection on any referenced columns in the filter
-            // Create a mapping from the output columns to the input columns (the inverse of the projection)
-            let inverse_projection = projection_indices
-                .iter()
-                .enumerate()
-                .map(|(i, &p)| (p, i))
-                .collect::<HashMap<_, _>>();
-            parent_filters
-                .into_iter()
-                .map(|f| {
-                    f.transform_up(|expr| {
-                        let mut res =
-                            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
-                                let index = col.index();
-                                let index_in_input_schema =
-                                    inverse_projection.get(&index).ok_or_else(|| {
-                                        DataFusionError::Internal(format!(
-                                            "Column {index} not found in projection"
-                                        ))
-                                    })?;
-                                Transformed::yes(Arc::new(Column::new(
-                                    col.name(),
-                                    *index_in_input_schema,
-                                )) as _)
-                            } else {
-                                Transformed::no(expr)
-                            };
-                        // Columns can only exist in the leaves, no need to try all nodes
-                        res.tnr = TreeNodeRecursion::Jump;
-                        Ok(res)
-                    })
-                    .data()
-                })
-                .collect::<Result<Vec<_>>>()?
-        } else {
-            parent_filters
-        };
+        let child = ChildFilterDescription::from_child(&parent_filters, self.input())?
+            .with_self_filters(
+                split_conjunction(&self.predicate)
+                    .into_iter()
+                    .cloned()
+                    .collect(),
+            );
 
-        Ok(FilterDescription::new_with_child_count(1)
-            .all_parent_filters_supported(parent_filters)
-            .with_self_filters_for_children(vec![self_filter]))
+        Ok(FilterDescription::new().with_child(child))
     }
 
     fn handle_child_pushdown_result(
         &self,
+        phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        if phase != FilterPushdownPhase::Pre {
+            return Ok(FilterPushdownPropagation::if_all(child_pushdown_result));
+        }
         // We absorb any parent filters that were not handled by our children
-        let mut unhandled_filters =
-            child_pushdown_result.parent_filters.collect_unsupported();
-        assert_eq!(
-            child_pushdown_result.self_filters.len(),
-            1,
-            "FilterExec should only have one child"
-        );
-        let unsupported_self_filters =
-            child_pushdown_result.self_filters[0].collect_unsupported();
-        unhandled_filters.extend(unsupported_self_filters);
+        let mut unsupported_parent_filters: Vec<Arc<dyn PhysicalExpr>> =
+            child_pushdown_result
+                .parent_filters
+                .iter()
+                .filter_map(|f| {
+                    matches!(f.all(), PushedDown::No).then_some(Arc::clone(&f.filter))
+                })
+                .collect();
+
+        // If this FilterExec has a projection, the unsupported parent filters
+        // are in the output schema (after projection) coordinates. We need to
+        // remap them to the input schema coordinates before combining with self filters.
+        if self.projection.is_some() {
+            let input_schema = self.input().schema();
+            unsupported_parent_filters = unsupported_parent_filters
+                .into_iter()
+                .map(|expr| reassign_expr_columns(expr, &input_schema))
+                .collect::<Result<Vec<_>>>()?;
+        }
+
+        let unsupported_self_filters = child_pushdown_result
+            .self_filters
+            .first()
+            .expect("we have exactly one child")
+            .iter()
+            .filter_map(|f| match f.discriminant {
+                PushedDown::Yes => None,
+                PushedDown::No => Some(&f.predicate),
+            })
+            .cloned();
+
+        let unhandled_filters = unsupported_parent_filters
+            .into_iter()
+            .chain(unsupported_self_filters)
+            .collect_vec();
 
         // If we have unhandled filters, we need to create a new FilterExec
         let filter_input = Arc::clone(self.input());
         let new_predicate = conjunction(unhandled_filters);
         let updated_node = if new_predicate.eq(&lit(true)) {
             // FilterExec is no longer needed, but we may need to leave a projection in place
-            match self.projection() {
+            match self.projection().as_ref() {
                 Some(projection_indices) => {
                     let filter_child_schema = filter_input.schema();
                     let proj_exprs = projection_indices
                         .iter()
                         .map(|p| {
                             let field = filter_child_schema.field(*p).clone();
-                            (
-                                Arc::new(Column::new(field.name(), *p))
+                            ProjectionExpr {
+                                expr: Arc::new(Column::new(field.name(), *p))
                                     as Arc<dyn PhysicalExpr>,
-                                field.name().to_string(),
-                            )
+                                alias: field.name().to_string(),
+                            }
                         })
                         .collect::<Vec<_>>();
                     Some(Arc::new(ProjectionExec::try_new(proj_exprs, filter_input)?)
@@ -557,32 +722,82 @@ impl ExecutionPlan for FilterExec {
             // The new predicate is the same as our current predicate
             None
         } else {
-            // Create a new FilterExec with the new predicate
+            // Create a new FilterExec with the new predicate, preserving the projection
             let new = FilterExec {
                 predicate: Arc::clone(&new_predicate),
                 input: Arc::clone(&filter_input),
                 metrics: self.metrics.clone(),
                 default_selectivity: self.default_selectivity,
-                cache: Self::compute_properties(
+                cache: Arc::new(Self::compute_properties(
                     &filter_input,
                     &new_predicate,
                     self.default_selectivity,
-                    self.projection.as_ref(),
-                )?,
-                projection: None,
+                    self.projection.as_deref(),
+                )?),
+                projection: self.projection.clone(),
+                batch_size: self.batch_size,
+                fetch: self.fetch,
             };
             Some(Arc::new(new) as _)
         };
+
         Ok(FilterPushdownPropagation {
-            filters: child_pushdown_result.parent_filters.make_supported(),
+            filters: vec![PushedDown::Yes; child_pushdown_result.parent_filters.len()],
             updated_node,
         })
     }
+
+    fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
+
+    fn with_fetch(&self, fetch: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
+        Some(Arc::new(Self {
+            predicate: Arc::clone(&self.predicate),
+            input: Arc::clone(&self.input),
+            metrics: self.metrics.clone(),
+            default_selectivity: self.default_selectivity,
+            cache: Arc::clone(&self.cache),
+            projection: self.projection.clone(),
+            batch_size: self.batch_size,
+            fetch,
+        }))
+    }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
 }
 
 impl EmbeddedProjection for FilterExec {
     fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
-        self.with_projection(projection)
+        FilterExecBuilder::from(self)
+            .apply_projection(projection)?
+            .build()
+    }
+}
+
+/// Converts an interval bound to a [`Precision`] value. NULL bounds (which
+/// represent "unbounded" in the interval type) map to [`Precision::Absent`].
+fn interval_bound_to_precision(
+    bound: ScalarValue,
+    is_exact: bool,
+) -> Precision<ScalarValue> {
+    if bound.is_null() {
+        Precision::Absent
+    } else if is_exact {
+        Precision::Exact(bound)
+    } else {
+        Precision::Inexact(bound)
     }
 }
 
@@ -591,6 +806,7 @@ impl EmbeddedProjection for FilterExec {
 /// is adjusted by using the next/previous value for its data type to convert
 /// it into a closed bound.
 fn collect_new_statistics(
+    schema: &SchemaRef,
     input_column_stats: &[ColumnStatistics],
     analysis_boundaries: Vec<ExprBoundaries>,
 ) -> Vec<ColumnStatistics> {
@@ -607,27 +823,32 @@ fn collect_new_statistics(
                 },
             )| {
                 let Some(interval) = interval else {
-                    // If the interval is `None`, we can say that there are no rows:
+                    // If the interval is `None`, we can say that there are no rows.
+                    // Use a typed null to preserve the column's data type, so that
+                    // downstream interval analysis can still intersect intervals
+                    // of the same type.
+                    let typed_null = ScalarValue::try_from(schema.field(idx).data_type())
+                        .unwrap_or(ScalarValue::Null);
                     return ColumnStatistics {
                         null_count: Precision::Exact(0),
-                        max_value: Precision::Exact(ScalarValue::Null),
-                        min_value: Precision::Exact(ScalarValue::Null),
-                        sum_value: Precision::Exact(ScalarValue::Null),
+                        max_value: Precision::Exact(typed_null.clone()),
+                        min_value: Precision::Exact(typed_null.clone()),
+                        sum_value: Precision::Exact(typed_null),
                         distinct_count: Precision::Exact(0),
+                        byte_size: input_column_stats[idx].byte_size,
                     };
                 };
                 let (lower, upper) = interval.into_bounds();
-                let (min_value, max_value) = if lower.eq(&upper) {
-                    (Precision::Exact(lower), Precision::Exact(upper))
-                } else {
-                    (Precision::Inexact(lower), Precision::Inexact(upper))
-                };
+                let is_exact = !lower.is_null() && !upper.is_null() && lower == upper;
+                let min_value = interval_bound_to_precision(lower, is_exact);
+                let max_value = interval_bound_to_precision(upper, is_exact);
                 ColumnStatistics {
                     null_count: input_column_stats[idx].null_count.to_inexact(),
                     max_value,
                     min_value,
                     sum_value: Precision::Absent,
                     distinct_count: distinct_count.to_inexact(),
+                    byte_size: input_column_stats[idx].byte_size,
                 }
             },
         )
@@ -644,23 +865,45 @@ struct FilterExecStream {
     /// The input partition to filter.
     input: SendableRecordBatchStream,
     /// Runtime metrics recording
-    baseline_metrics: BaselineMetrics,
+    metrics: FilterExecMetrics,
     /// The projection indices of the columns in the input schema
-    projection: Option<Vec<usize>>,
+    projection: Option<ProjectionRef>,
+    /// Batch coalescer to combine small batches
+    batch_coalescer: LimitedBatchCoalescer,
+}
+
+/// The metrics for `FilterExec`
+struct FilterExecMetrics {
+    /// Common metrics for most operators
+    baseline_metrics: BaselineMetrics,
+    /// Selectivity of the filter, calculated as output_rows / input_rows
+    selectivity: RatioMetrics,
+    // Remember to update `docs/source/user-guide/metrics.md` when adding new metrics,
+    // or modifying metrics comments
+}
+
+impl FilterExecMetrics {
+    pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
+        Self {
+            baseline_metrics: BaselineMetrics::new(metrics, partition),
+            selectivity: MetricBuilder::new(metrics)
+                .with_type(MetricType::SUMMARY)
+                .ratio_metrics("selectivity", partition),
+        }
+    }
 }
 
 pub fn batch_filter(
     batch: &RecordBatch,
     predicate: &Arc<dyn PhysicalExpr>,
 ) -> Result<RecordBatch> {
-    filter_and_project(batch, predicate, None, &batch.schema())
+    filter_and_project(batch, predicate, None)
 }
 
 fn filter_and_project(
     batch: &RecordBatch,
     predicate: &Arc<dyn PhysicalExpr>,
     projection: Option<&Vec<usize>>,
-    output_schema: &SchemaRef,
 ) -> Result<RecordBatch> {
     predicate
         .evaluate(batch)
@@ -670,14 +913,7 @@ fn filter_and_project(
                 // Apply filter array to record batch
                 (Ok(filter_array), None) => filter_record_batch(batch, filter_array)?,
                 (Ok(filter_array), Some(projection)) => {
-                    let projected_columns = projection
-                        .iter()
-                        .map(|i| Arc::clone(batch.column(*i)))
-                        .collect();
-                    let projected_batch = RecordBatch::try_new(
-                        Arc::clone(output_schema),
-                        projected_columns,
-                    )?;
+                    let projected_batch = batch.project(projection)?;
                     filter_record_batch(&projected_batch, filter_array)?
                 }
                 (Err(_), _) => {
@@ -696,32 +932,73 @@ impl Stream for FilterExecStream {
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
-        let poll;
+        let elapsed_compute = self.metrics.baseline_metrics.elapsed_compute().clone();
         loop {
+            // If there is a completed batch ready, return it
+            if let Some(batch) = self.batch_coalescer.next_completed_batch() {
+                self.metrics.selectivity.add_part(batch.num_rows());
+                let poll = Poll::Ready(Some(Ok(batch)));
+                return self.metrics.baseline_metrics.record_poll(poll);
+            }
+
+            if self.batch_coalescer.is_finished() {
+                // If input is done and no batches are ready, return None to signal end of stream.
+                return Poll::Ready(None);
+            }
+
+            // Attempt to pull the next batch from the input stream.
             match ready!(self.input.poll_next_unpin(cx)) {
+                None => {
+                    self.batch_coalescer.finish()?;
+                    // continue draining the coalescer
+                }
                 Some(Ok(batch)) => {
-                    let timer = self.baseline_metrics.elapsed_compute().timer();
-                    let filtered_batch = filter_and_project(
-                        &batch,
-                        &self.predicate,
-                        self.projection.as_ref(),
-                        &self.schema,
-                    )?;
+                    let timer = elapsed_compute.timer();
+                    let status = self.predicate.as_ref()
+                        .evaluate(&batch)
+                        .and_then(|v| v.into_array(batch.num_rows()))
+                        .and_then(|array| {
+                            Ok(match self.projection.as_ref()  {
+                                Some(projection) => {
+                                    let projected_batch = batch.project(projection)?;
+                                    (array, projected_batch)
+                                },
+                                None => (array, batch)
+                            })
+                        }).and_then(|(array, batch)| {
+                            match as_boolean_array(&array) {
+                                Ok(filter_array) => {
+                                    self.metrics.selectivity.add_total(batch.num_rows());
+                                    // TODO: support push_batch_with_filter in LimitedBatchCoalescer
+                                    let batch = filter_record_batch(&batch, filter_array)?;
+                                    let state = self.batch_coalescer.push_batch(batch)?;
+                                    Ok(state)
+                                }
+                                Err(_) => {
+                                    internal_err!(
+                                        "Cannot create filter_array from non-boolean predicates"
+                                    )
+                                }
+                            }
+                        })?;
                     timer.done();
-                    // Skip entirely filtered batches
-                    if filtered_batch.num_rows() == 0 {
-                        continue;
+
+                    match status {
+                        PushBatchStatus::Continue => {
+                            // Keep pushing more batches
+                        }
+                        PushBatchStatus::LimitReached => {
+                            // limit was reached, so stop early
+                            self.batch_coalescer.finish()?;
+                            // continue draining the coalescer
+                        }
                     }
-                    poll = Poll::Ready(Some(Ok(filtered_batch)));
-                    break;
-                }
-                value => {
-                    poll = Poll::Ready(value);
-                    break;
                 }
+
+                // Error case
+                other => return Poll::Ready(other),
             }
         }
-        self.baseline_metrics.record_poll(poll)
     }
 
     fn size_hint(&self) -> (usize, Option<usize>) {
@@ -729,7 +1006,6 @@ impl Stream for FilterExecStream {
         self.input.size_hint()
     }
 }
-
 impl RecordBatchStream for FilterExecStream {
     fn schema(&self) -> SchemaRef {
         Arc::clone(&self.schema)
@@ -737,13 +1013,38 @@ impl RecordBatchStream for FilterExecStream {
 }
 
 /// Return the equals Column-Pairs and Non-equals Column-Pairs
-fn collect_columns_from_predicate(predicate: &Arc<dyn PhysicalExpr>) -> EqualAndNonEqual {
+#[deprecated(
+    since = "51.0.0",
+    note = "This function will be internal in the future"
+)]
+pub fn collect_columns_from_predicate(
+    predicate: &'_ Arc<dyn PhysicalExpr>,
+) -> EqualAndNonEqual<'_> {
+    collect_columns_from_predicate_inner(predicate)
+}
+
+fn collect_columns_from_predicate_inner(
+    predicate: &'_ Arc<dyn PhysicalExpr>,
+) -> EqualAndNonEqual<'_> {
     let mut eq_predicate_columns = Vec::<PhysicalExprPairRef>::new();
     let mut ne_predicate_columns = Vec::<PhysicalExprPairRef>::new();
 
     let predicates = split_conjunction(predicate);
     predicates.into_iter().for_each(|p| {
         if let Some(binary) = p.as_any().downcast_ref::<BinaryExpr>() {
+            // Only extract pairs where at least one side is a Column reference.
+            // Pairs like `complex_expr = literal` should not create equivalence
+            // classes — the literal could appear in many unrelated expressions
+            // (e.g. sort keys), and normalize_expr's deep traversal would
+            // replace those occurrences with the complex expression, corrupting
+            // sort orderings. Constant propagation for such pairs is handled
+            // separately by `extend_constants`.
+            let has_direct_column_operand =
+                binary.left().as_any().downcast_ref::<Column>().is_some()
+                    || binary.right().as_any().downcast_ref::<Column>().is_some();
+            if !has_direct_column_operand {
+                return;
+            }
             match binary.op() {
                 Operator::Eq => {
                     eq_predicate_columns.push((binary.left(), binary.right()))
@@ -806,7 +1107,7 @@ mod tests {
             &schema,
         )?;
 
-        let (equal_pairs, ne_pairs) = collect_columns_from_predicate(&predicate);
+        let (equal_pairs, ne_pairs) = collect_columns_from_predicate_inner(&predicate);
         assert_eq!(2, equal_pairs.len());
         assert!(equal_pairs[0].0.eq(&col("c2", &schema)?));
         assert!(equal_pairs[0].1.eq(&lit(4u32)));
@@ -1101,7 +1402,7 @@ mod tests {
         ];
         let _ = exp_col_stats
             .into_iter()
-            .zip(statistics.column_statistics)
+            .zip(statistics.column_statistics.clone())
             .map(|(expected, actual)| {
                 if let Some(val) = actual.min_value.get_value() {
                     if val.data_type().is_floating() {
@@ -1172,7 +1473,7 @@ mod tests {
             )),
         ));
         // Since filter predicate passes all entries, statistics after filter shouldn't change.
-        let expected = input.partition_statistics(None)?.column_statistics;
+        let expected = input.partition_statistics(None)?.column_statistics.clone();
         let filter: Arc<dyn ExecutionPlan> =
             Arc::new(FilterExec::try_new(predicate, input)?);
         let statistics = filter.partition_statistics(None)?;
@@ -1236,18 +1537,20 @@ mod tests {
             statistics.column_statistics,
             vec![
                 ColumnStatistics {
-                    min_value: Precision::Exact(ScalarValue::Null),
-                    max_value: Precision::Exact(ScalarValue::Null),
-                    sum_value: Precision::Exact(ScalarValue::Null),
+                    min_value: Precision::Exact(ScalarValue::Int32(None)),
+                    max_value: Precision::Exact(ScalarValue::Int32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Int32(None)),
                     distinct_count: Precision::Exact(0),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
-                    min_value: Precision::Exact(ScalarValue::Null),
-                    max_value: Precision::Exact(ScalarValue::Null),
-                    sum_value: Precision::Exact(ScalarValue::Null),
+                    min_value: Precision::Exact(ScalarValue::Int32(None)),
+                    max_value: Precision::Exact(ScalarValue::Int32(None)),
+                    sum_value: Precision::Exact(ScalarValue::Int32(None)),
                     distinct_count: Precision::Exact(0),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
             ]
         );
@@ -1255,6 +1558,70 @@ mod tests {
         Ok(())
     }
 
+    /// Regression test: stacking two FilterExecs where the inner filter
+    /// proves zero selectivity should not panic with a type mismatch
+    /// during interval intersection.
+    ///
+    /// Previously, when a filter proved no rows could match, the column
+    /// statistics used untyped `ScalarValue::Null` (data type `Null`).
+    /// If an outer FilterExec then tried to analyze its own predicate
+    /// against those statistics, `Interval::intersect` would fail with:
+    ///   "Only intervals with the same data type are intersectable, lhs:Null, rhs:Int32"
+    #[tokio::test]
+    async fn test_nested_filter_with_zero_selectivity_inner() -> Result<()> {
+        // Inner table: a: [1, 100], b: [1, 3]
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(1000),
+                total_byte_size: Precision::Inexact(4000),
+                column_statistics: vec![
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
+                        ..Default::default()
+                    },
+                ],
+            },
+            schema,
+        ));
+
+        // Inner filter: a > 200 (impossible given a max=100 → zero selectivity)
+        let inner_predicate: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(200)))),
+        ));
+        let inner_filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(inner_predicate, input)?);
+
+        // Outer filter: a = 50
+        // Before the fix, this would panic because the inner filter's
+        // zero-selectivity statistics produced Null-typed intervals for
+        // column `a`, which couldn't intersect with the Int32 literal.
+        let outer_predicate: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Eq,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(50)))),
+        ));
+        let outer_filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(outer_predicate, inner_filter)?);
+
+        // Should succeed without error
+        let statistics = outer_filter.partition_statistics(None)?;
+        assert_eq!(statistics.num_rows, Precision::Inexact(0));
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_filter_statistics_more_inputs() -> Result<()> {
         let schema = Schema::new(vec![
@@ -1349,10 +1716,11 @@ mod tests {
                 max_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
                 sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
+                byte_size: Precision::Absent,
             }],
         };
 
-        assert_eq!(filter_statistics, expected_filter_statistics);
+        assert_eq!(*filter_statistics, expected_filter_statistics);
 
         Ok(())
     }
@@ -1432,13 +1800,14 @@ mod tests {
     #[test]
     fn test_equivalence_properties_union_type() -> Result<()> {
         let union_type = DataType::Union(
-            UnionFields::new(
+            UnionFields::try_new(
                 vec![0, 1],
                 vec![
                     Field::new("f1", DataType::Int32, true),
                     Field::new("f2", DataType::Utf8, true),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Sparse,
         );
 
@@ -1461,4 +1830,448 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_builder_with_projection() -> Result<()> {
+        // Create a schema with multiple columns
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        // Create a filter predicate: a > 10
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        ));
+
+        // Create filter with projection [0, 2] (columns a and c) using builder
+        let projection = Some(vec![0, 2]);
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(projection.clone())
+            .unwrap()
+            .build()?;
+
+        // Verify projection is set correctly
+        assert_eq!(filter.projection(), &Some([0, 2].into()));
+
+        // Verify schema contains only projected columns
+        let output_schema = filter.schema();
+        assert_eq!(output_schema.fields().len(), 2);
+        assert_eq!(output_schema.field(0).name(), "a");
+        assert_eq!(output_schema.field(1).name(), "c");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_without_projection() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        ));
+
+        // Create filter without projection using builder
+        let filter = FilterExecBuilder::new(predicate, input).build()?;
+
+        // Verify no projection is set
+        assert!(filter.projection().is_none());
+
+        // Verify schema contains all columns
+        let output_schema = filter.schema();
+        assert_eq!(output_schema.fields().len(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_invalid_projection() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        ));
+
+        // Try to create filter with invalid projection (index out of bounds) using builder
+        let result =
+            FilterExecBuilder::new(predicate, input).apply_projection(Some(vec![0, 5])); // 5 is out of bounds
+
+        // Should return an error
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_vs_with_projection() -> Result<()> {
+        // This test verifies that the builder with projection produces the same result
+        // as try_new().with_projection(), but more efficiently (one compute_properties call)
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+            Field::new("d", DataType::Int32, false),
+        ]);
+
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(1000),
+                total_byte_size: Precision::Inexact(4000),
+                column_statistics: vec![
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        ..Default::default()
+                    },
+                ],
+            },
+            schema,
+        ));
+        let input: Arc<dyn ExecutionPlan> = input;
+
+        let predicate: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(50)))),
+        ));
+
+        let projection = Some(vec![0, 2]);
+
+        // Method 1: Builder with projection (one call to compute_properties)
+        let filter1 = FilterExecBuilder::new(Arc::clone(&predicate), Arc::clone(&input))
+            .apply_projection(projection.clone())
+            .unwrap()
+            .build()?;
+
+        // Method 2: Also using builder for comparison (deprecated try_new().with_projection() removed)
+        let filter2 = FilterExecBuilder::new(predicate, input)
+            .apply_projection(projection)
+            .unwrap()
+            .build()?;
+
+        // Both methods should produce equivalent results
+        assert_eq!(filter1.schema(), filter2.schema());
+        assert_eq!(filter1.projection(), filter2.projection());
+
+        // Verify statistics are the same
+        let stats1 = filter1.partition_statistics(None)?;
+        let stats2 = filter2.partition_statistics(None)?;
+        assert_eq!(stats1.num_rows, stats2.num_rows);
+        assert_eq!(stats1.total_byte_size, stats2.total_byte_size);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_statistics_with_projection() -> Result<()> {
+        // Test that statistics are correctly computed when using builder with projection
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]);
+
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(1000),
+                total_byte_size: Precision::Inexact(12000),
+                column_statistics: vec![
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(200))),
+                        ..Default::default()
+                    },
+                    ColumnStatistics {
+                        min_value: Precision::Inexact(ScalarValue::Int32(Some(5))),
+                        max_value: Precision::Inexact(ScalarValue::Int32(Some(50))),
+                        ..Default::default()
+                    },
+                ],
+            },
+            schema,
+        ));
+
+        // Filter: a < 50, Project: [0, 2]
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(50)))),
+        ));
+
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(Some(vec![0, 2]))
+            .unwrap()
+            .build()?;
+
+        let statistics = filter.partition_statistics(None)?;
+
+        // Verify statistics reflect both filtering and projection
+        assert!(matches!(statistics.num_rows, Precision::Inexact(_)));
+
+        // Schema should only have 2 columns after projection
+        assert_eq!(filter.schema().fields().len(), 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_builder_predicate_validation() -> Result<()> {
+        // Test that builder validates predicate type correctly
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        // Create a predicate that doesn't return boolean (returns Int32)
+        let invalid_predicate = Arc::new(Column::new("a", 0));
+
+        // Should fail because predicate doesn't return boolean
+        let result = FilterExecBuilder::new(invalid_predicate, input)
+            .apply_projection(Some(vec![0]))
+            .unwrap()
+            .build();
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_projection_composition() -> Result<()> {
+        // Test that calling apply_projection multiple times composes projections
+        // If initial projection is [0, 2, 3] and we call apply_projection([0, 2]),
+        // the result should be [0, 3] (indices 0 and 2 of [0, 2, 3])
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+            Field::new("d", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        // Create a filter predicate: a > 10
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        ));
+
+        // First projection: [0, 2, 3] -> select columns a, c, d
+        // Second projection: [0, 2] -> select indices 0 and 2 of [0, 2, 3] -> [0, 3]
+        // Final result: columns a and d
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(Some(vec![0, 2, 3]))?
+            .apply_projection(Some(vec![0, 2]))?
+            .build()?;
+
+        // Verify composed projection is [0, 3]
+        assert_eq!(filter.projection(), &Some([0, 3].into()));
+
+        // Verify schema contains only columns a and d
+        let output_schema = filter.schema();
+        assert_eq!(output_schema.fields().len(), 2);
+        assert_eq!(output_schema.field(0).name(), "a");
+        assert_eq!(output_schema.field(1).name(), "d");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_builder_projection_composition_none_clears() -> Result<()> {
+        // Test that passing None clears the projection
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+
+        let input = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        ));
+
+        // Set a projection then clear it with None
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(Some(vec![0]))?
+            .apply_projection(None)?
+            .build()?;
+
+        // Projection should be cleared
+        assert_eq!(filter.projection(), &None);
+
+        // Schema should have all columns
+        let output_schema = filter.schema();
+        assert_eq!(output_schema.fields().len(), 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_filter_with_projection_remaps_post_phase_parent_filters() -> Result<()> {
+        // Test that FilterExec with a projection must remap parent dynamic
+        // filter column indices from its output schema to the input schema
+        // before passing them to the child.
+        let input_schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false),
+            Field::new("c", DataType::Float64, false),
+        ]));
+        let input = Arc::new(EmptyExec::new(Arc::clone(&input_schema)));
+
+        // FilterExec: a > 0, projection=[c@2]
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(0)))),
+        ));
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(Some(vec![2]))?
+            .build()?;
+
+        // Output schema should be [c:Float64]
+        let output_schema = filter.schema();
+        assert_eq!(output_schema.fields().len(), 1);
+        assert_eq!(output_schema.field(0).name(), "c");
+
+        // Simulate a parent dynamic filter referencing output column c@0
+        let parent_filter: Arc<dyn PhysicalExpr> = Arc::new(Column::new("c", 0));
+
+        let config = ConfigOptions::new();
+        let desc = filter.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![parent_filter],
+            &config,
+        )?;
+
+        // The filter pushed to the child must reference c@2 (input schema),
+        // not c@0 (output schema).
+        let parent_filters = desc.parent_filters();
+        assert_eq!(parent_filters.len(), 1); // one child
+        assert_eq!(parent_filters[0].len(), 1); // one filter
+        let remapped = &parent_filters[0][0].predicate;
+        let display = format!("{remapped}");
+        assert_eq!(
+            display, "c@2",
+            "Post-phase parent filter column index must be remapped \
+             from output schema (c@0) to input schema (c@2)"
+        );
+
+        Ok(())
+    }
+
+    /// Regression test for https://github.com/apache/datafusion/issues/20194
+    ///
+    /// `collect_columns_from_predicate_inner` should only extract equality
+    /// pairs where at least one side is a Column. Pairs like
+    /// `complex_expr = literal` must not create equivalence classes because
+    /// `normalize_expr`'s deep traversal would replace the literal inside
+    /// unrelated expressions (e.g. sort keys) with the complex expression.
+    #[test]
+    fn test_collect_columns_skips_non_column_pairs() -> Result<()> {
+        let schema = test::aggr_test_schema();
+
+        // Simulate: nvl(c2, 0) = 0  →  (c2 IS DISTINCT FROM 0) = 0
+        // Neither side is a Column, so this should NOT be extracted.
+        let complex_expr: Arc<dyn PhysicalExpr> = binary(
+            col("c2", &schema)?,
+            Operator::IsDistinctFrom,
+            lit(0u32),
+            &schema,
+        )?;
+        let predicate: Arc<dyn PhysicalExpr> =
+            binary(complex_expr, Operator::Eq, lit(0u32), &schema)?;
+
+        let (equal_pairs, _) = collect_columns_from_predicate_inner(&predicate);
+        assert_eq!(
+            0,
+            equal_pairs.len(),
+            "Should not extract equality pairs where neither side is a Column"
+        );
+
+        // But col = literal should still be extracted
+        let predicate: Arc<dyn PhysicalExpr> =
+            binary(col("c2", &schema)?, Operator::Eq, lit(0u32), &schema)?;
+        let (equal_pairs, _) = collect_columns_from_predicate_inner(&predicate);
+        assert_eq!(
+            1,
+            equal_pairs.len(),
+            "Should extract equality pairs where one side is a Column"
+        );
+
+        Ok(())
+    }
+
+    /// Columns with Absent min/max statistics should remain Absent after
+    /// FilterExec.
+    #[tokio::test]
+    async fn test_filter_statistics_absent_columns_stay_absent() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(1000),
+                total_byte_size: Precision::Absent,
+                column_statistics: vec![
+                    ColumnStatistics::default(),
+                    ColumnStatistics::default(),
+                ],
+            },
+            schema.clone(),
+        ));
+
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Eq,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
+        ));
+        let filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(predicate, input)?);
+
+        let statistics = filter.partition_statistics(None)?;
+        let col_b_stats = &statistics.column_statistics[1];
+        assert_eq!(col_b_stats.min_value, Precision::Absent);
+        assert_eq!(col_b_stats.max_value, Precision::Absent);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/filter_pushdown.rs b/datafusion/physical-plan/src/filter_pushdown.rs
index 4e84fe36f98f3..7e82b9e8239e0 100644
--- a/datafusion/physical-plan/src/filter_pushdown.rs
+++ b/datafusion/physical-plan/src/filter_pushdown.rs
@@ -15,135 +15,186 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Filter Pushdown Optimization Process
+//!
+//! The filter pushdown mechanism involves four key steps:
+//! 1. **Optimizer Asks Parent for a Filter Pushdown Plan**: The optimizer calls [`ExecutionPlan::gather_filters_for_pushdown`]
+//!    on the parent node, passing in parent predicates and phase. The parent node creates a [`FilterDescription`]
+//!    by inspecting its logic and children's schemas, determining which filters can be pushed to each child.
+//! 2. **Optimizer Executes Pushdown**: The optimizer recursively pushes down filters for each child,
+//!    passing the appropriate filters (`Vec<Arc<dyn PhysicalExpr>>`) for that child.
+//! 3. **Optimizer Gathers Results**: The optimizer collects [`FilterPushdownPropagation`] results from children,
+//!    containing information about which filters were successfully pushed down vs. unsupported.
+//! 4. **Parent Responds**: The optimizer calls [`ExecutionPlan::handle_child_pushdown_result`] on the parent,
+//!    passing a [`ChildPushdownResult`] containing the aggregated pushdown outcomes. The parent decides
+//!    how to handle filters that couldn't be pushed down (e.g., keep them as FilterExec nodes).
+//!
+//! [`ExecutionPlan::gather_filters_for_pushdown`]: crate::ExecutionPlan::gather_filters_for_pushdown
+//! [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result
+//!
+//! See also datafusion/physical-optimizer/src/filter_pushdown.rs.
+
+use std::collections::HashSet;
 use std::sync::Arc;
-use std::vec::IntoIter;
 
+use arrow_schema::SchemaRef;
+use datafusion_common::{
+    Result,
+    tree_node::{Transformed, TreeNode},
+};
+use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum FilterPushdownPhase {
+    /// Pushdown that happens before most other optimizations.
+    /// This pushdown allows static filters that do not reference any [`ExecutionPlan`]s to be pushed down.
+    /// Filters that reference an [`ExecutionPlan`] cannot be pushed down at this stage since the whole plan tree may be rewritten
+    /// by other optimizations.
+    /// Implementers are however allowed to modify the execution plan themselves during this phase, for example by returning a completely
+    /// different [`ExecutionPlan`] from [`ExecutionPlan::handle_child_pushdown_result`].
+    ///
+    /// Pushdown of [`FilterExec`] into `DataSourceExec` is an example of a pre-pushdown.
+    /// Unlike filter pushdown in the logical phase, which operates on the logical plan to push filters into the logical table scan,
+    /// the `Pre` phase in the physical plan targets the actual physical scan, pushing filters down to specific data source implementations.
+    /// For example, Parquet supports filter pushdown to reduce data read during scanning, while CSV typically does not.
+    ///
+    /// [`ExecutionPlan`]: crate::ExecutionPlan
+    /// [`FilterExec`]: crate::filter::FilterExec
+    /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result
+    Pre,
+    /// Pushdown that happens after most other optimizations.
+    /// This stage of filter pushdown allows filters that reference an [`ExecutionPlan`] to be pushed down.
+    /// Since subsequent optimizations should not change the structure of the plan tree except for calling [`ExecutionPlan::with_new_children`]
+    /// (which generally preserves internal references) it is safe for references between [`ExecutionPlan`]s to be established at this stage.
+    ///
+    /// This phase is used to link a [`SortExec`] (with a TopK operator) or a [`HashJoinExec`] to a `DataSourceExec`.
+    ///
+    /// [`ExecutionPlan`]: crate::ExecutionPlan
+    /// [`ExecutionPlan::with_new_children`]: crate::ExecutionPlan::with_new_children
+    /// [`SortExec`]: crate::sorts::sort::SortExec
+    /// [`HashJoinExec`]: crate::joins::HashJoinExec
+    /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result
+    Post,
+}
+
+impl std::fmt::Display for FilterPushdownPhase {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FilterPushdownPhase::Pre => write!(f, "Pre"),
+            FilterPushdownPhase::Post => write!(f, "Post"),
+        }
+    }
+}
+
 /// The result of a plan for pushing down a filter into a child node.
 /// This contains references to filters so that nodes can mutate a filter
 /// before pushing it down to a child node (e.g. to adjust a projection)
-/// or can directly take ownership of `Unsupported` filters that their children
+/// or can directly take ownership of filters that their children
 /// could not handle.
 #[derive(Debug, Clone)]
-pub enum PredicateSupport {
-    Supported(Arc<dyn PhysicalExpr>),
-    Unsupported(Arc<dyn PhysicalExpr>),
+pub struct PushedDownPredicate {
+    pub discriminant: PushedDown,
+    pub predicate: Arc<dyn PhysicalExpr>,
 }
 
-/// A thin wrapper around [`PredicateSupport`]s that allows for easy collection of
-/// supported and unsupported filters. Inner vector stores each predicate for one node.
-#[derive(Debug, Clone)]
-pub struct PredicateSupports(Vec<PredicateSupport>);
-
-impl PredicateSupports {
-    /// Create a new FilterPushdowns with the given filters and their pushdown status.
-    pub fn new(pushdowns: Vec<PredicateSupport>) -> Self {
-        Self(pushdowns)
+impl PushedDownPredicate {
+    /// Return the wrapped [`PhysicalExpr`], discarding whether it is supported or unsupported.
+    pub fn into_inner(self) -> Arc<dyn PhysicalExpr> {
+        self.predicate
     }
 
-    /// Create a new [`PredicateSupport`] with all filters as supported.
-    pub fn all_supported(filters: Vec<Arc<dyn PhysicalExpr>>) -> Self {
-        let pushdowns = filters
-            .into_iter()
-            .map(PredicateSupport::Supported)
-            .collect();
-        Self::new(pushdowns)
-    }
-
-    /// Create a new [`PredicateSupport`] with all filters as unsupported.
-    pub fn all_unsupported(filters: Vec<Arc<dyn PhysicalExpr>>) -> Self {
-        let pushdowns = filters
-            .into_iter()
-            .map(PredicateSupport::Unsupported)
-            .collect();
-        Self::new(pushdowns)
-    }
-
-    /// Transform all filters to supported, returning a new [`PredicateSupports`]
-    /// with all filters as [`PredicateSupport::Supported`].
-    /// This does not modify the original [`PredicateSupport`].
-    pub fn make_supported(self) -> Self {
-        let pushdowns = self
-            .0
-            .into_iter()
-            .map(|f| match f {
-                PredicateSupport::Supported(expr) => PredicateSupport::Supported(expr),
-                PredicateSupport::Unsupported(expr) => PredicateSupport::Supported(expr),
-            })
-            .collect();
-        Self::new(pushdowns)
-    }
-
-    /// Transform all filters to unsupported, returning a new [`PredicateSupports`]
-    /// with all filters as [`PredicateSupport::Supported`].
-    /// This does not modify the original [`PredicateSupport`].
-    pub fn make_unsupported(self) -> Self {
-        let pushdowns = self
-            .0
-            .into_iter()
-            .map(|f| match f {
-                PredicateSupport::Supported(expr) => PredicateSupport::Unsupported(expr),
-                u @ PredicateSupport::Unsupported(_) => u,
-            })
-            .collect();
-        Self::new(pushdowns)
+    /// Create a new [`PushedDownPredicate`] with supported pushdown.
+    pub fn supported(predicate: Arc<dyn PhysicalExpr>) -> Self {
+        Self {
+            discriminant: PushedDown::Yes,
+            predicate,
+        }
     }
 
-    /// Collect unsupported filters into a Vec, without removing them from the original
-    /// [`PredicateSupport`].
-    pub fn collect_unsupported(&self) -> Vec<Arc<dyn PhysicalExpr>> {
-        self.0
-            .iter()
-            .filter_map(|f| match f {
-                PredicateSupport::Unsupported(expr) => Some(Arc::clone(expr)),
-                PredicateSupport::Supported(_) => None,
-            })
-            .collect()
+    /// Create a new [`PushedDownPredicate`] with unsupported pushdown.
+    pub fn unsupported(predicate: Arc<dyn PhysicalExpr>) -> Self {
+        Self {
+            discriminant: PushedDown::No,
+            predicate,
+        }
     }
+}
 
-    /// Collect all filters into a Vec, without removing them from the original
-    /// FilterPushdowns.
-    pub fn collect_all(self) -> Vec<Arc<dyn PhysicalExpr>> {
-        self.0
-            .into_iter()
-            .map(|f| match f {
-                PredicateSupport::Supported(expr)
-                | PredicateSupport::Unsupported(expr) => expr,
-            })
-            .collect()
-    }
+/// Discriminant for the result of pushing down a filter into a child node.
+#[derive(Debug, Clone, Copy)]
+pub enum PushedDown {
+    /// The predicate was successfully pushed down into the child node.
+    Yes,
+    /// The predicate could not be pushed down into the child node.
+    No,
+}
 
-    pub fn into_inner(self) -> Vec<PredicateSupport> {
-        self.0
+impl PushedDown {
+    /// Logical AND operation: returns `Yes` only if both operands are `Yes`.
+    pub fn and(self, other: PushedDown) -> PushedDown {
+        match (self, other) {
+            (PushedDown::Yes, PushedDown::Yes) => PushedDown::Yes,
+            _ => PushedDown::No,
+        }
     }
 
-    /// Return an iterator over the inner `Vec<FilterPushdown>`.
-    pub fn iter(&self) -> impl Iterator<Item = &PredicateSupport> {
-        self.0.iter()
+    /// Logical OR operation: returns `Yes` if either operand is `Yes`.
+    pub fn or(self, other: PushedDown) -> PushedDown {
+        match (self, other) {
+            (PushedDown::Yes, _) | (_, PushedDown::Yes) => PushedDown::Yes,
+            (PushedDown::No, PushedDown::No) => PushedDown::No,
+        }
     }
 
-    /// Return the number of filters in the inner `Vec<FilterPushdown>`.
-    pub fn len(&self) -> usize {
-        self.0.len()
+    /// Wrap a [`PhysicalExpr`] with this pushdown result.
+    pub fn wrap_expression(self, expr: Arc<dyn PhysicalExpr>) -> PushedDownPredicate {
+        PushedDownPredicate {
+            discriminant: self,
+            predicate: expr,
+        }
     }
+}
 
-    /// Check if the inner `Vec<FilterPushdown>` is empty.
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
+/// The result of pushing down a single parent filter into all children.
+#[derive(Debug, Clone)]
+pub struct ChildFilterPushdownResult {
+    pub filter: Arc<dyn PhysicalExpr>,
+    pub child_results: Vec<PushedDown>,
 }
 
-impl IntoIterator for PredicateSupports {
-    type Item = PredicateSupport;
-    type IntoIter = IntoIter<PredicateSupport>;
+impl ChildFilterPushdownResult {
+    /// Combine all child results using OR logic.
+    /// Returns `Yes` if **any** child supports the filter.
+    /// Returns `No` if **all** children reject the filter or if there are no children.
+    pub fn any(&self) -> PushedDown {
+        if self.child_results.is_empty() {
+            // If there are no children, filters cannot be supported
+            PushedDown::No
+        } else {
+            self.child_results
+                .iter()
+                .fold(PushedDown::No, |acc, result| acc.or(*result))
+        }
+    }
 
-    fn into_iter(self) -> Self::IntoIter {
-        self.0.into_iter()
+    /// Combine all child results using AND logic.
+    /// Returns `Yes` if **all** children support the filter.
+    /// Returns `No` if **any** child rejects the filter or if there are no children.
+    pub fn all(&self) -> PushedDown {
+        if self.child_results.is_empty() {
+            // If there are no children, filters cannot be supported
+            PushedDown::No
+        } else {
+            self.child_results
+                .iter()
+                .fold(PushedDown::Yes, |acc, result| acc.and(*result))
+        }
     }
 }
 
 /// The result of pushing down filters into a child node.
+///
 /// This is the result provided to nodes in [`ExecutionPlan::handle_child_pushdown_result`].
 /// Nodes process this result and convert it into a [`FilterPushdownPropagation`]
 /// that is returned to their parent.
@@ -151,61 +202,81 @@ impl IntoIterator for PredicateSupports {
 /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result
 #[derive(Debug, Clone)]
 pub struct ChildPushdownResult {
-    /// The combined result of pushing down each parent filter into each child.
-    /// For example, given the fitlers `[a, b]` and children `[1, 2, 3]` the matrix of responses:
-    ///
-    // | filter | child 1     | child 2   | child 3   | result      |
-    // |--------|-------------|-----------|-----------|-------------|
-    // | a      | Supported   | Supported | Supported | Supported   |
-    // | b      | Unsupported | Supported | Supported | Unsupported |
-    ///
-    /// That is: if any child marks a filter as unsupported or if the filter was not pushed
-    /// down into any child then the result is unsupported.
-    /// If at least one children and all children that received the filter mark it as supported
-    /// then the result is supported.
-    pub parent_filters: PredicateSupports,
+    /// The parent filters that were pushed down as received by the current node when [`ExecutionPlan::gather_filters_for_pushdown`](crate::ExecutionPlan::handle_child_pushdown_result) was called.
+    /// Note that this may *not* be the same as the filters that were passed to the children as the current node may have modified them
+    /// (e.g. by reassigning column indices) when it returned them from [`ExecutionPlan::gather_filters_for_pushdown`](crate::ExecutionPlan::handle_child_pushdown_result) in a [`FilterDescription`].
+    /// Attached to each filter is a [`PushedDown`] *per child* that indicates whether the filter was supported or unsupported by each child.
+    /// To get combined results see [`ChildFilterPushdownResult::any`] and [`ChildFilterPushdownResult::all`].
+    pub parent_filters: Vec<ChildFilterPushdownResult>,
     /// The result of pushing down each filter this node provided into each of it's children.
-    /// This is not combined with the parent filters so that nodes can treat each child independently.
-    pub self_filters: Vec<PredicateSupports>,
+    /// The outer vector corresponds to each child, and the inner vector corresponds to each filter.
+    /// Since this node may have generated a different filter for each child the inner vector may have different lengths or the expressions may not match at all.
+    /// It is up to each node to interpret this result based on the filters it provided for each child in [`ExecutionPlan::gather_filters_for_pushdown`](crate::ExecutionPlan::handle_child_pushdown_result).
+    pub self_filters: Vec<Vec<PushedDownPredicate>>,
 }
 
-/// The result of pushing down filters into a node that it returns to its parent.
-/// This is what nodes return from [`ExecutionPlan::handle_child_pushdown_result`] to communicate
+/// The result of pushing down filters into a node.
+///
+/// Returned from [`ExecutionPlan::handle_child_pushdown_result`] to communicate
 /// to the optimizer:
 ///
-/// 1. What to do with any parent filters that were not completely handled by the children.
+/// 1. What to do with any parent filters that could not be pushed down into the children.
 /// 2. If the node needs to be replaced in the execution plan with a new node or not.
 ///
 /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result
 #[derive(Debug, Clone)]
 pub struct FilterPushdownPropagation<T> {
-    pub filters: PredicateSupports,
+    /// Which parent filters were pushed down into this node's children.
+    pub filters: Vec<PushedDown>,
+    /// The updated node, if it was updated during pushdown
     pub updated_node: Option<T>,
 }
 
 impl<T> FilterPushdownPropagation<T> {
-    /// Create a new [`FilterPushdownPropagation`] that tells the parent node
-    /// that echoes back up to the parent the result of pushing down the filters
-    /// into the children.
-    pub fn transparent(child_pushdown_result: ChildPushdownResult) -> Self {
+    /// Create a new [`FilterPushdownPropagation`] that tells the parent node that each parent filter
+    /// is supported if it was supported by *all* children.
+    pub fn if_all(child_pushdown_result: ChildPushdownResult) -> Self {
+        let filters = child_pushdown_result
+            .parent_filters
+            .into_iter()
+            .map(|result| result.all())
+            .collect();
+        Self {
+            filters,
+            updated_node: None,
+        }
+    }
+
+    /// Create a new [`FilterPushdownPropagation`] that tells the parent node that each parent filter
+    /// is supported if it was supported by *any* child.
+    pub fn if_any(child_pushdown_result: ChildPushdownResult) -> Self {
+        let filters = child_pushdown_result
+            .parent_filters
+            .into_iter()
+            .map(|result| result.any())
+            .collect();
         Self {
-            filters: child_pushdown_result.parent_filters,
+            filters,
             updated_node: None,
         }
     }
 
-    /// Create a new [`FilterPushdownPropagation`] that tells the parent node
-    /// that none of the parent filters were not pushed down.
-    pub fn unsupported(parent_filters: Vec<Arc<dyn PhysicalExpr>>) -> Self {
-        let unsupported = PredicateSupports::all_unsupported(parent_filters);
+    /// Create a new [`FilterPushdownPropagation`] that tells the parent node that no filters were pushed down regardless of the child results.
+    pub fn all_unsupported(child_pushdown_result: ChildPushdownResult) -> Self {
+        let filters = child_pushdown_result
+            .parent_filters
+            .into_iter()
+            .map(|_| PushedDown::No)
+            .collect();
         Self {
-            filters: unsupported,
+            filters,
             updated_node: None,
         }
     }
 
     /// Create a new [`FilterPushdownPropagation`] with the specified filter support.
-    pub fn with_filters(filters: PredicateSupports) -> Self {
+    /// This transmits up to our parent node what the result of pushing down the filters into our node and possibly our subtree was.
+    pub fn with_parent_pushdown_result(filters: Vec<PushedDown>) -> Self {
         Self {
             filters,
             updated_node: None,
@@ -213,34 +284,198 @@ impl<T> FilterPushdownPropagation<T> {
     }
 
     /// Bind an updated node to the [`FilterPushdownPropagation`].
+    /// Use this when the current node wants to update itself in the tree or replace itself with a new node (e.g. one of it's children).
+    /// You do not need to call this if one of the children of the current node may have updated itself, that is handled by the optimizer.
     pub fn with_updated_node(mut self, updated_node: T) -> Self {
         self.updated_node = Some(updated_node);
         self
     }
 }
 
+/// Describes filter pushdown for a single child node.
+///
+/// This structure contains two types of filters:
+/// - **Parent filters**: Filters received from the parent node, marked as supported or unsupported
+/// - **Self filters**: Filters generated by the current node to be pushed down to this child
 #[derive(Debug, Clone)]
-struct ChildFilterDescription {
+pub struct ChildFilterDescription {
     /// Description of which parent filters can be pushed down into this node.
     /// Since we need to transmit filter pushdown results back to this node's parent
     /// we need to track each parent filter for each child, even those that are unsupported / won't be pushed down.
-    /// We do this using a [`PredicateSupport`] which simplifies manipulating supported/unsupported filters.
-    parent_filters: PredicateSupports,
+    pub(crate) parent_filters: Vec<PushedDownPredicate>,
     /// Description of which filters this node is pushing down to its children.
     /// Since this is not transmitted back to the parents we can have variable sized inner arrays
     /// instead of having to track supported/unsupported.
-    self_filters: Vec<Arc<dyn PhysicalExpr>>,
+    pub(crate) self_filters: Vec<Arc<dyn PhysicalExpr>>,
+}
+
+/// Validates and remaps filter column references to a target schema in one step.
+///
+/// When pushing filters from a parent to a child node, we need to:
+/// 1. Verify that all columns referenced by the filter exist in the target
+/// 2. Remap column indices to match the target schema
+///
+/// `allowed_indices` controls which column indices (in the parent schema) are
+/// considered valid. For single-input nodes this defaults to
+/// `0..child_schema.len()` (all columns are reachable). For join nodes it is
+/// restricted to the subset of output columns that map to the target child,
+/// which is critical when different sides have same-named columns.
+pub(crate) struct FilterRemapper {
+    /// The target schema to remap column indices into.
+    child_schema: SchemaRef,
+    /// Only columns at these indices (in the *parent* schema) are considered
+    /// valid. For non-join nodes this defaults to `0..child_schema.len()`.
+    allowed_indices: HashSet<usize>,
+}
+
+impl FilterRemapper {
+    /// Create a remapper that accepts any column whose index falls within
+    /// `0..child_schema.len()` and whose name exists in the target schema.
+    pub(crate) fn new(child_schema: SchemaRef) -> Self {
+        let allowed_indices = (0..child_schema.fields().len()).collect();
+        Self {
+            child_schema,
+            allowed_indices,
+        }
+    }
+
+    /// Create a remapper that only accepts columns at the given indices.
+    /// This is used by join nodes to restrict pushdown to one side of the
+    /// join when both sides have same-named columns.
+    fn with_allowed_indices(
+        child_schema: SchemaRef,
+        allowed_indices: HashSet<usize>,
+    ) -> Self {
+        Self {
+            child_schema,
+            allowed_indices,
+        }
+    }
+
+    /// Try to remap a filter's column references to the target schema.
+    ///
+    /// Validates and remaps in a single tree traversal: for each column,
+    /// checks that its index is in the allowed set and that
+    /// its name exists in the target schema, then remaps the index.
+    /// Returns `Some(remapped)` if all columns are valid, or `None` if any
+    /// column fails validation.
+    pub(crate) fn try_remap(
+        &self,
+        filter: &Arc<dyn PhysicalExpr>,
+    ) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+        let mut all_valid = true;
+        let transformed = Arc::clone(filter).transform_down(|expr| {
+            if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+                if self.allowed_indices.contains(&col.index())
+                    && let Ok(new_index) = self.child_schema.index_of(col.name())
+                {
+                    Ok(Transformed::yes(Arc::new(Column::new(
+                        col.name(),
+                        new_index,
+                    ))))
+                } else {
+                    all_valid = false;
+                    Ok(Transformed::complete(expr))
+                }
+            } else {
+                Ok(Transformed::no(expr))
+            }
+        })?;
+
+        Ok(all_valid.then_some(transformed.data))
+    }
 }
 
 impl ChildFilterDescription {
-    fn new() -> Self {
+    /// Build a child filter description by analyzing which parent filters can be pushed to a specific child.
+    ///
+    /// This method performs column analysis to determine which filters can be pushed down:
+    /// - If all columns referenced by a filter exist in the child's schema, it can be pushed down
+    /// - Otherwise, it cannot be pushed down to that child
+    ///
+    /// See [`FilterDescription::from_children`] for more details
+    pub fn from_child(
+        parent_filters: &[Arc<dyn PhysicalExpr>],
+        child: &Arc<dyn crate::ExecutionPlan>,
+    ) -> Result<Self> {
+        let remapper = FilterRemapper::new(child.schema());
+        Self::remap_filters(parent_filters, &remapper)
+    }
+
+    /// Like [`Self::from_child`], but restricts which parent-level columns are
+    /// considered reachable through this child.
+    ///
+    /// `allowed_indices` is the set of column indices (in the *parent*
+    /// schema) that map to this child's side of a join. A filter is only
+    /// eligible for pushdown when **every** column index it references
+    /// appears in `allowed_indices`.
+    ///
+    /// This prevents incorrect pushdown when different join sides have
+    /// columns with the same name: matching on index ensures a filter
+    /// referencing the right side's `k@2` is not pushed to the left side
+    /// which also has a column named `k` but at a different index.
+    pub fn from_child_with_allowed_indices(
+        parent_filters: &[Arc<dyn PhysicalExpr>],
+        allowed_indices: HashSet<usize>,
+        child: &Arc<dyn crate::ExecutionPlan>,
+    ) -> Result<Self> {
+        let remapper =
+            FilterRemapper::with_allowed_indices(child.schema(), allowed_indices);
+        Self::remap_filters(parent_filters, &remapper)
+    }
+
+    fn remap_filters(
+        parent_filters: &[Arc<dyn PhysicalExpr>],
+        remapper: &FilterRemapper,
+    ) -> Result<Self> {
+        let mut child_parent_filters = Vec::with_capacity(parent_filters.len());
+        for filter in parent_filters {
+            if let Some(remapped) = remapper.try_remap(filter)? {
+                child_parent_filters.push(PushedDownPredicate::supported(remapped));
+            } else {
+                child_parent_filters
+                    .push(PushedDownPredicate::unsupported(Arc::clone(filter)));
+            }
+        }
+
+        Ok(Self {
+            parent_filters: child_parent_filters,
+            self_filters: vec![],
+        })
+    }
+
+    /// Mark all parent filters as unsupported for this child.
+    pub fn all_unsupported(parent_filters: &[Arc<dyn PhysicalExpr>]) -> Self {
         Self {
-            parent_filters: PredicateSupports::new(vec![]),
+            parent_filters: parent_filters
+                .iter()
+                .map(|f| PushedDownPredicate::unsupported(Arc::clone(f)))
+                .collect(),
             self_filters: vec![],
         }
     }
+
+    /// Add a self filter (from the current node) to be pushed down to this child.
+    pub fn with_self_filter(mut self, filter: Arc<dyn PhysicalExpr>) -> Self {
+        self.self_filters.push(filter);
+        self
+    }
+
+    /// Add multiple self filters.
+    pub fn with_self_filters(mut self, filters: Vec<Arc<dyn PhysicalExpr>>) -> Self {
+        self.self_filters.extend(filters);
+        self
+    }
 }
 
+/// Describes how filters should be pushed down to children.
+///
+/// This structure contains filter descriptions for each child node, specifying:
+/// - Which parent filters can be pushed down to each child
+/// - Which self-generated filters should be pushed down to each child
+///
+/// The filter routing is determined by column analysis - filters can only be pushed
+/// to children whose schemas contain all the referenced columns.
 #[derive(Debug, Clone)]
 pub struct FilterDescription {
     /// A filter description for each child.
@@ -249,92 +484,72 @@ pub struct FilterDescription {
     child_filter_descriptions: Vec<ChildFilterDescription>,
 }
 
+impl Default for FilterDescription {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl FilterDescription {
-    pub fn new_with_child_count(num_children: usize) -> Self {
+    /// Create a new empty FilterDescription
+    pub fn new() -> Self {
         Self {
-            child_filter_descriptions: vec![ChildFilterDescription::new(); num_children],
+            child_filter_descriptions: vec![],
         }
     }
 
-    pub fn parent_filters(&self) -> Vec<PredicateSupports> {
-        self.child_filter_descriptions
-            .iter()
-            .map(|d| &d.parent_filters)
-            .cloned()
-            .collect()
-    }
-
-    pub fn self_filters(&self) -> Vec<Vec<Arc<dyn PhysicalExpr>>> {
-        self.child_filter_descriptions
-            .iter()
-            .map(|d| &d.self_filters)
-            .cloned()
-            .collect()
+    /// Add a child filter description
+    pub fn with_child(mut self, child: ChildFilterDescription) -> Self {
+        self.child_filter_descriptions.push(child);
+        self
     }
 
-    /// Mark all parent filters as supported for all children.
-    /// This is the case if the node allows filters to be pushed down through it
-    /// without any modification.
-    /// This broadcasts the parent filters to all children.
-    /// If handling of parent filters is different for each child then you should set the
-    /// field direclty.
-    /// For example, nodes like [`RepartitionExec`] that let filters pass through it transparently
-    /// use this to mark all parent filters as supported.
-    ///
-    /// [`RepartitionExec`]: crate::repartition::RepartitionExec
-    pub fn all_parent_filters_supported(
-        mut self,
+    /// Build a filter description by analyzing which parent filters can be pushed to each child.
+    /// This method automatically determines filter routing based on column analysis:
+    /// - If all columns referenced by a filter exist in a child's schema, it can be pushed down
+    /// - Otherwise, it cannot be pushed down to that child
+    #[expect(clippy::needless_pass_by_value)]
+    pub fn from_children(
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
-    ) -> Self {
-        let supported = PredicateSupports::all_supported(parent_filters);
-        for child in &mut self.child_filter_descriptions {
-            child.parent_filters = supported.clone();
+        children: &[&Arc<dyn crate::ExecutionPlan>],
+    ) -> Result<Self> {
+        let mut desc = Self::new();
+
+        // For each child, create a ChildFilterDescription
+        for child in children {
+            desc = desc
+                .with_child(ChildFilterDescription::from_child(&parent_filters, child)?);
         }
-        self
+
+        Ok(desc)
     }
 
     /// Mark all parent filters as unsupported for all children.
-    /// This is the case if the node does not allow filters to be pushed down through it.
-    /// This broadcasts the parent filters to all children.
-    /// If handling of parent filters is different for each child then you should set the
-    /// field direclty.
-    /// For example, the default implementation of filter pushdwon in [`ExecutionPlan`]
-    /// assumes that filters cannot be pushed down to children.
-    ///
-    /// [`ExecutionPlan`]: crate::ExecutionPlan
-    pub fn all_parent_filters_unsupported(
-        mut self,
-        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+    pub fn all_unsupported(
+        parent_filters: &[Arc<dyn PhysicalExpr>],
+        children: &[&Arc<dyn crate::ExecutionPlan>],
     ) -> Self {
-        let unsupported = PredicateSupports::all_unsupported(parent_filters);
-        for child in &mut self.child_filter_descriptions {
-            child.parent_filters = unsupported.clone();
+        let mut desc = Self::new();
+        for _ in 0..children.len() {
+            desc =
+                desc.with_child(ChildFilterDescription::all_unsupported(parent_filters));
         }
-        self
+        desc
     }
 
-    /// Add a filter generated / owned by the current node to be pushed down to all children.
-    /// This assumes that there is a single filter that that gets pushed down to all children
-    /// equally.
-    /// If there are multiple filters or pushdown to children is not homogeneous then
-    /// you should set the field directly.
-    /// For example:
-    /// - `TopK` uses this to push down a single filter to all children, it can use this method.
-    /// - `HashJoinExec` pushes down a filter only to the probe side, it cannot use this method.
-    pub fn with_self_filter(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
-        for child in &mut self.child_filter_descriptions {
-            child.self_filters = vec![Arc::clone(&predicate)];
-        }
-        self
+    pub fn parent_filters(&self) -> Vec<Vec<PushedDownPredicate>> {
+        self.child_filter_descriptions
+            .iter()
+            .map(|d| &d.parent_filters)
+            .cloned()
+            .collect()
     }
 
-    pub fn with_self_filters_for_children(
-        mut self,
-        filters: Vec<Vec<Arc<dyn PhysicalExpr>>>,
-    ) -> Self {
-        for (child, filters) in self.child_filter_descriptions.iter_mut().zip(filters) {
-            child.self_filters = filters;
-        }
-        self
+    pub fn self_filters(&self) -> Vec<Vec<Arc<dyn PhysicalExpr>>> {
+        self.child_filter_descriptions
+            .iter()
+            .map(|d| &d.self_filters)
+            .cloned()
+            .collect()
     }
 }
diff --git a/datafusion/physical-plan/src/joins/array_map.rs b/datafusion/physical-plan/src/joins/array_map.rs
new file mode 100644
index 0000000000000..ad40d6776df4f
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/array_map.rs
@@ -0,0 +1,547 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_schema::DataType;
+use num_traits::AsPrimitive;
+use std::mem::size_of;
+
+use crate::joins::MapOffset;
+use crate::joins::chain::traverse_chain;
+use arrow::array::{Array, ArrayRef, AsArray, BooleanArray};
+use arrow::buffer::BooleanBuffer;
+use arrow::datatypes::ArrowNumericType;
+use datafusion_common::{Result, ScalarValue, internal_err};
+
+/// A macro to downcast only supported integer types (up to 64-bit) and invoke a generic function.
+///
+/// Usage: `downcast_supported_integer!(data_type => (Method, arg1, arg2, ...))`
+///
+/// The `Method` must be an associated method of [`ArrayMap`] that is generic over
+/// `<T: ArrowNumericType>` and allow `T::Native: AsPrimitive<u64>`.
+macro_rules! downcast_supported_integer {
+    ($DATA_TYPE:expr => ($METHOD:ident $(, $ARGS:expr)*)) => {
+        match $DATA_TYPE {
+            arrow::datatypes::DataType::Int8 => ArrayMap::$METHOD::<arrow::datatypes::Int8Type>($($ARGS),*),
+            arrow::datatypes::DataType::Int16 => ArrayMap::$METHOD::<arrow::datatypes::Int16Type>($($ARGS),*),
+            arrow::datatypes::DataType::Int32 => ArrayMap::$METHOD::<arrow::datatypes::Int32Type>($($ARGS),*),
+            arrow::datatypes::DataType::Int64 => ArrayMap::$METHOD::<arrow::datatypes::Int64Type>($($ARGS),*),
+            arrow::datatypes::DataType::UInt8 => ArrayMap::$METHOD::<arrow::datatypes::UInt8Type>($($ARGS),*),
+            arrow::datatypes::DataType::UInt16 => ArrayMap::$METHOD::<arrow::datatypes::UInt16Type>($($ARGS),*),
+            arrow::datatypes::DataType::UInt32 => ArrayMap::$METHOD::<arrow::datatypes::UInt32Type>($($ARGS),*),
+            arrow::datatypes::DataType::UInt64 => ArrayMap::$METHOD::<arrow::datatypes::UInt64Type>($($ARGS),*),
+            _ => {
+                return internal_err!(
+                    "Unsupported type for ArrayMap: {:?}",
+                    $DATA_TYPE
+                );
+            }
+        }
+    };
+}
+
+/// A dense map for single-column integer join keys within a limited range.
+///
+/// Maps join keys to build-side indices using direct array indexing:
+/// `data[val - min_val_in_build_side] -> val_idx_in_build_side + 1`.
+///
+/// NULL values are ignored on both the build side and the probe side.
+///
+/// # Handling Negative Numbers with `wrapping_sub`
+///
+/// This implementation supports signed integer ranges (e.g., `[-5, 5]`) efficiently by
+/// treating them as `u64` (Two's Complement) and relying on the bitwise properties of
+/// wrapping arithmetic (`wrapping_sub`).
+///
+/// In Two's Complement representation, `a_signed - b_signed` produces the same bit pattern
+/// as `a_unsigned.wrapping_sub(b_unsigned)` (modulo 2^N). This allows us to perform
+/// range calculations and zero-based index mapping uniformly for both signed and unsigned
+/// types without branching.
+///
+/// ## Examples
+///
+/// Consider an `Int64` range `[-5, 5]`.
+/// * `min_val (-5)` casts to `u64`: `...11111011` (`u64::MAX - 4`)
+/// * `max_val (5)` casts to `u64`: `...00000101` (`5`)
+///
+/// **1. Range Calculation**
+///
+/// ```text
+/// In modular arithmetic, this is equivalent to:
+///   (5 - (2^64 - 5)) mod 2^64
+/// = (5 - 2^64 + 5) mod 2^64
+/// = (10 - 2^64) mod 2^64
+/// = 10
+///
+/// ```
+/// The resulting `range` (10) correctly represents the size of the interval `[-5, 5]`.
+///
+/// **2. Index Lookup (in `get_matched_indices`)**
+///
+/// For a probe value of `0` (which is stored as `0u64`):
+/// ```text
+/// In modular arithmetic, this is equivalent to:
+///   (0 - (2^64 - 5)) mod 2^64
+/// = (-2^64 + 5) mod 2^64
+/// = 5
+/// ```
+/// This correctly maps `-5` to index `0`, `0` to index `5`, etc.
+#[derive(Debug)]
+pub struct ArrayMap {
+    // data[probSideVal-offset] -> valIdxInBuildSide + 1; 0 for absent
+    data: Vec<u32>,
+    // min val in buildSide
+    offset: u64,
+    // next[buildSideIdx] -> next matching valIdxInBuildSide + 1; 0 for end of chain.
+    // If next is empty, it means there are no duplicate keys (no conflicts).
+    // It uses the same chain-based conflict resolution as [`JoinHashMapType`].
+    next: Vec<u32>,
+    num_of_distinct_key: usize,
+}
+
+impl ArrayMap {
+    pub fn is_supported_type(data_type: &DataType) -> bool {
+        matches!(
+            data_type,
+            DataType::Int8
+                | DataType::Int16
+                | DataType::Int32
+                | DataType::Int64
+                | DataType::UInt8
+                | DataType::UInt16
+                | DataType::UInt32
+                | DataType::UInt64
+        )
+    }
+
+    pub(crate) fn key_to_u64(v: &ScalarValue) -> Option<u64> {
+        match v {
+            ScalarValue::Int8(Some(v)) => Some(*v as u64),
+            ScalarValue::Int16(Some(v)) => Some(*v as u64),
+            ScalarValue::Int32(Some(v)) => Some(*v as u64),
+            ScalarValue::Int64(Some(v)) => Some(*v as u64),
+            ScalarValue::UInt8(Some(v)) => Some(*v as u64),
+            ScalarValue::UInt16(Some(v)) => Some(*v as u64),
+            ScalarValue::UInt32(Some(v)) => Some(*v as u64),
+            ScalarValue::UInt64(Some(v)) => Some(*v),
+            _ => None,
+        }
+    }
+
+    /// Estimates the maximum memory usage for an `ArrayMap` with the given parameters.
+    ///
+    pub fn estimate_memory_size(min_val: u64, max_val: u64, num_rows: usize) -> usize {
+        let range = Self::calculate_range(min_val, max_val);
+        if range >= usize::MAX as u64 {
+            return usize::MAX;
+        }
+        let size = (range + 1) as usize;
+        size.saturating_mul(size_of::<u32>())
+            .saturating_add(num_rows.saturating_mul(size_of::<u32>()))
+    }
+
+    pub fn calculate_range(min_val: u64, max_val: u64) -> u64 {
+        max_val.wrapping_sub(min_val)
+    }
+
+    /// Creates a new [`ArrayMap`] from the given array of join keys.
+    ///
+    /// Note: This function processes only the non-null values in the input `array`,
+    /// ignoring any rows where the key is `NULL`.
+    ///
+    pub(crate) fn try_new(array: &ArrayRef, min_val: u64, max_val: u64) -> Result<Self> {
+        let range = max_val.wrapping_sub(min_val);
+        if range >= usize::MAX as u64 {
+            return internal_err!("ArrayMap key range is too large to be allocated.");
+        }
+        let size = (range + 1) as usize;
+
+        let mut data: Vec<u32> = vec![0; size];
+        let mut next: Vec<u32> = vec![];
+        let mut num_of_distinct_key = 0;
+
+        downcast_supported_integer!(
+            array.data_type() => (
+                fill_data,
+                array,
+                min_val,
+                &mut data,
+                &mut next,
+                &mut num_of_distinct_key
+            )
+        )?;
+
+        Ok(Self {
+            data,
+            offset: min_val,
+            next,
+            num_of_distinct_key,
+        })
+    }
+
+    fn fill_data<T: ArrowNumericType>(
+        array: &ArrayRef,
+        offset_val: u64,
+        data: &mut [u32],
+        next: &mut Vec<u32>,
+        num_of_distinct_key: &mut usize,
+    ) -> Result<()>
+    where
+        T::Native: AsPrimitive<u64>,
+    {
+        let arr = array.as_primitive::<T>();
+        // Iterate in reverse to maintain FIFO order when there are duplicate keys.
+        for (i, val) in arr.iter().enumerate().rev() {
+            if let Some(val) = val {
+                let key: u64 = val.as_();
+                let idx = key.wrapping_sub(offset_val) as usize;
+                if idx >= data.len() {
+                    return internal_err!("failed build Array idx >= data.len()");
+                }
+
+                if data[idx] != 0 {
+                    if next.is_empty() {
+                        *next = vec![0; array.len()]
+                    }
+                    next[i] = data[idx]
+                } else {
+                    *num_of_distinct_key += 1;
+                }
+                data[idx] = (i) as u32 + 1;
+            }
+        }
+        Ok(())
+    }
+
+    pub fn num_of_distinct_key(&self) -> usize {
+        self.num_of_distinct_key
+    }
+
+    /// Returns the memory usage of this [`ArrayMap`] in bytes.
+    pub fn size(&self) -> usize {
+        self.data.capacity() * size_of::<u32>() + self.next.capacity() * size_of::<u32>()
+    }
+
+    pub fn get_matched_indices_with_limit_offset(
+        &self,
+        prob_side_keys: &[ArrayRef],
+        limit: usize,
+        current_offset: MapOffset,
+        probe_indices: &mut Vec<u32>,
+        build_indices: &mut Vec<u64>,
+    ) -> Result<Option<MapOffset>> {
+        if prob_side_keys.len() != 1 {
+            return internal_err!(
+                "ArrayMap expects 1 join key, but got {}",
+                prob_side_keys.len()
+            );
+        }
+        let array = &prob_side_keys[0];
+
+        downcast_supported_integer!(
+            array.data_type() => (
+                lookup_and_get_indices,
+                self,
+                array,
+                limit,
+                current_offset,
+                probe_indices,
+                build_indices
+            )
+        )
+    }
+
+    fn lookup_and_get_indices<T: ArrowNumericType>(
+        &self,
+        array: &ArrayRef,
+        limit: usize,
+        current_offset: MapOffset,
+        probe_indices: &mut Vec<u32>,
+        build_indices: &mut Vec<u64>,
+    ) -> Result<Option<MapOffset>>
+    where
+        T::Native: Copy + AsPrimitive<u64>,
+    {
+        probe_indices.clear();
+        build_indices.clear();
+
+        let arr = array.as_primitive::<T>();
+
+        let have_null = arr.null_count() > 0;
+
+        if self.next.is_empty() {
+            for prob_idx in current_offset.0..arr.len() {
+                if build_indices.len() == limit {
+                    return Ok(Some((prob_idx, None)));
+                }
+
+                // short circuit
+                if have_null && arr.is_null(prob_idx) {
+                    continue;
+                }
+                // SAFETY: prob_idx is guaranteed to be within bounds by the loop range.
+                let prob_val: u64 = unsafe { arr.value_unchecked(prob_idx) }.as_();
+                let idx_in_build_side = prob_val.wrapping_sub(self.offset) as usize;
+
+                if idx_in_build_side >= self.data.len()
+                    || self.data[idx_in_build_side] == 0
+                {
+                    continue;
+                }
+                build_indices.push((self.data[idx_in_build_side] - 1) as u64);
+                probe_indices.push(prob_idx as u32);
+            }
+            Ok(None)
+        } else {
+            let mut remaining_output = limit;
+            let to_skip = match current_offset {
+                // None `initial_next_idx` indicates that `initial_idx` processing hasn't been started
+                (idx, None) => idx,
+                // Zero `initial_next_idx` indicates that `initial_idx` has been processed during
+                // previous iteration, and it should be skipped
+                (idx, Some(0)) => idx + 1,
+                // Otherwise, process remaining `initial_idx` matches by traversing `next_chain`,
+                // to start with the next index
+                (idx, Some(next_idx)) => {
+                    let is_last = idx == arr.len() - 1;
+                    if let Some(next_offset) = traverse_chain(
+                        &self.next,
+                        idx,
+                        next_idx as u32,
+                        &mut remaining_output,
+                        probe_indices,
+                        build_indices,
+                        is_last,
+                    ) {
+                        return Ok(Some(next_offset));
+                    }
+                    idx + 1
+                }
+            };
+
+            for prob_side_idx in to_skip..arr.len() {
+                if remaining_output == 0 {
+                    return Ok(Some((prob_side_idx, None)));
+                }
+
+                if arr.is_null(prob_side_idx) {
+                    continue;
+                }
+
+                let is_last = prob_side_idx == arr.len() - 1;
+
+                // SAFETY: prob_idx is guaranteed to be within bounds by the loop range.
+                let prob_val: u64 = unsafe { arr.value_unchecked(prob_side_idx) }.as_();
+                let idx_in_build_side = prob_val.wrapping_sub(self.offset) as usize;
+                if idx_in_build_side >= self.data.len()
+                    || self.data[idx_in_build_side] == 0
+                {
+                    continue;
+                }
+
+                let build_idx = self.data[idx_in_build_side];
+
+                if let Some(offset) = traverse_chain(
+                    &self.next,
+                    prob_side_idx,
+                    build_idx,
+                    &mut remaining_output,
+                    probe_indices,
+                    build_indices,
+                    is_last,
+                ) {
+                    return Ok(Some(offset));
+                }
+            }
+            Ok(None)
+        }
+    }
+
+    pub fn contain_keys(&self, probe_side_keys: &[ArrayRef]) -> Result<BooleanArray> {
+        if probe_side_keys.len() != 1 {
+            return internal_err!(
+                "ArrayMap join expects 1 join key, but got {}",
+                probe_side_keys.len()
+            );
+        }
+        let array = &probe_side_keys[0];
+
+        downcast_supported_integer!(
+            array.data_type() => (
+                contain_hashes_helper,
+                self,
+                array
+            )
+        )
+    }
+
+    fn contain_hashes_helper<T: ArrowNumericType>(
+        &self,
+        array: &ArrayRef,
+    ) -> Result<BooleanArray>
+    where
+        T::Native: AsPrimitive<u64>,
+    {
+        let arr = array.as_primitive::<T>();
+        let buffer = BooleanBuffer::collect_bool(arr.len(), |i| {
+            if arr.is_null(i) {
+                return false;
+            }
+            // SAFETY: i is within bounds [0, arr.len())
+            let key: u64 = unsafe { arr.value_unchecked(i) }.as_();
+            let idx = key.wrapping_sub(self.offset) as usize;
+            idx < self.data.len() && self.data[idx] != 0
+        });
+        Ok(BooleanArray::new(buffer, None))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::Int32Array;
+    use arrow::array::Int64Array;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_array_map_limit_offset_duplicate_elements() -> Result<()> {
+        let build: ArrayRef = Arc::new(Int32Array::from(vec![1, 1, 2]));
+        let map = ArrayMap::try_new(&build, 1, 2)?;
+        let probe = [Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef];
+
+        let mut prob_idx = Vec::new();
+        let mut build_idx = Vec::new();
+        let mut next = Some((0, None));
+        let mut results = vec![];
+
+        while let Some(o) = next {
+            next = map.get_matched_indices_with_limit_offset(
+                &probe,
+                1,
+                o,
+                &mut prob_idx,
+                &mut build_idx,
+            )?;
+            results.push((prob_idx.clone(), build_idx.clone(), next));
+        }
+
+        let expected = vec![
+            (vec![0], vec![0], Some((0, Some(2)))),
+            (vec![0], vec![1], Some((0, Some(0)))),
+            (vec![1], vec![2], None),
+        ];
+        assert_eq!(results, expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_map_with_limit_and_misses() -> Result<()> {
+        let build: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+        let map = ArrayMap::try_new(&build, 1, 2)?;
+        let probe = [Arc::new(Int32Array::from(vec![10, 1, 2])) as ArrayRef];
+
+        let (mut p_idx, mut b_idx) = (vec![], vec![]);
+        // Skip 10, find 1, next is 2
+        let next = map.get_matched_indices_with_limit_offset(
+            &probe,
+            1,
+            (0, None),
+            &mut p_idx,
+            &mut b_idx,
+        )?;
+        assert_eq!(p_idx, vec![1]);
+        assert_eq!(b_idx, vec![0]);
+        assert_eq!(next, Some((2, None)));
+
+        // Find 2, end
+        let next = map.get_matched_indices_with_limit_offset(
+            &probe,
+            1,
+            next.unwrap(),
+            &mut p_idx,
+            &mut b_idx,
+        )?;
+        assert_eq!(p_idx, vec![2]);
+        assert_eq!(b_idx, vec![1]);
+        assert!(next.is_none());
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_map_with_build_duplicates_and_misses() -> Result<()> {
+        let build_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 1]));
+        let array_map = ArrayMap::try_new(&build_array, 1, 1)?;
+        // prob: 10(m), 1(h1, h2), 20(m), 1(h1, h2)
+        let probe_array: ArrayRef = Arc::new(Int32Array::from(vec![10, 1, 20, 1]));
+        let prob_side_keys = [probe_array];
+
+        let mut prob_indices = Vec::new();
+        let mut build_indices = Vec::new();
+
+        // batch_size=3, should get 2 matches from first '1' and 1 match from second '1'
+        let result_offset = array_map.get_matched_indices_with_limit_offset(
+            &prob_side_keys,
+            3,
+            (0, None),
+            &mut prob_indices,
+            &mut build_indices,
+        )?;
+
+        assert_eq!(prob_indices, vec![1, 1, 3]);
+        assert_eq!(build_indices, vec![0, 1, 0]);
+        assert_eq!(result_offset, Some((3, Some(2))));
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_map_i64_with_negative_and_positive_numbers() -> Result<()> {
+        // Build array with a mix of negative and positive i64 values, no duplicates
+        let build_array: ArrayRef = Arc::new(Int64Array::from(vec![-5, 0, 5, -2, 3, 10]));
+        let min_val = -5_i128;
+        let max_val = 10_i128;
+
+        let array_map = ArrayMap::try_new(&build_array, min_val as u64, max_val as u64)?;
+
+        // Probe array
+        let probe_array: ArrayRef = Arc::new(Int64Array::from(vec![0, -5, 10, -1]));
+        let prob_side_keys = [Arc::clone(&probe_array)];
+
+        let mut prob_indices = Vec::new();
+        let mut build_indices = Vec::new();
+
+        // Call once to get all matches
+        let result_offset = array_map.get_matched_indices_with_limit_offset(
+            &prob_side_keys,
+            10, // A batch size larger than number of probes
+            (0, None),
+            &mut prob_indices,
+            &mut build_indices,
+        )?;
+
+        // Expected matches, in probe-side order:
+        // Probe 0 (value 0) -> Build 1 (value 0)
+        // Probe 1 (value -5) -> Build 0 (value -5)
+        // Probe 2 (value 10) -> Build 5 (value 10)
+        let expected_prob_indices = vec![0, 1, 2];
+        let expected_build_indices = vec![1, 0, 5];
+
+        assert_eq!(prob_indices, expected_prob_indices);
+        assert_eq!(build_indices, expected_build_indices);
+        assert!(result_offset.is_none());
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/chain.rs b/datafusion/physical-plan/src/joins/chain.rs
new file mode 100644
index 0000000000000..846b7505d6478
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/chain.rs
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::Debug;
+use std::ops::Sub;
+
+use arrow::datatypes::ArrowNativeType;
+
+use crate::joins::MapOffset;
+
+/// Traverses the chain of matching indices, collecting results up to the remaining limit.
+/// Returns `Some(offset)` if the limit was reached and there are more results to process,
+/// or `None` if the chain was fully traversed.
+#[inline(always)]
+pub(crate) fn traverse_chain<T>(
+    next_chain: &[T],
+    prob_idx: usize,
+    start_chain_idx: T,
+    remaining: &mut usize,
+    input_indices: &mut Vec<u32>,
+    match_indices: &mut Vec<u64>,
+    is_last_input: bool,
+) -> Option<MapOffset>
+where
+    T: Copy + TryFrom<usize> + PartialOrd + Into<u64> + Sub<Output = T>,
+    <T as TryFrom<usize>>::Error: Debug,
+    T: ArrowNativeType,
+{
+    let zero = T::usize_as(0);
+    let one = T::usize_as(1);
+    let mut match_row_idx = start_chain_idx - one;
+
+    loop {
+        match_indices.push(match_row_idx.into());
+        input_indices.push(prob_idx as u32);
+        *remaining -= 1;
+
+        let next = next_chain[match_row_idx.into() as usize];
+
+        if *remaining == 0 {
+            // Limit reached - return offset for next call
+            return if is_last_input && next == zero {
+                // Finished processing the last input row
+                None
+            } else {
+                Some((prob_idx, Some(next.into())))
+            };
+        }
+        if next == zero {
+            // End of chain
+            return None;
+        }
+        match_row_idx = next - one;
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs
index 4d8c48c659ef5..a895f69dc5138 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -21,33 +21,37 @@
 use std::{any::Any, sync::Arc, task::Poll};
 
 use super::utils::{
-    adjust_right_output_partitioning, reorder_output_after_swap, BatchSplitter,
-    BatchTransformer, BuildProbeJoinMetrics, NoopBatchTransformer, OnceAsync, OnceFut,
-    StatefulStreamResult,
+    BatchSplitter, BatchTransformer, BuildProbeJoinMetrics, NoopBatchTransformer,
+    OnceAsync, OnceFut, StatefulStreamResult, adjust_right_output_partitioning,
+    reorder_output_after_swap,
 };
-use crate::execution_plan::{boundedness_from_children, EmissionType};
+use crate::execution_plan::{EmissionType, boundedness_from_children};
 use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
 use crate::projection::{
-    join_allows_pushdown, join_table_borders, new_join_children,
-    physical_to_column_exprs, ProjectionExec,
+    ProjectionExec, join_allows_pushdown, join_table_borders, new_join_children,
+    physical_to_column_exprs,
 };
 use crate::{
-    handle_state, ColumnStatistics, DisplayAs, DisplayFormatType, Distribution,
-    ExecutionPlan, ExecutionPlanProperties, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics,
+    ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
+    ExecutionPlanProperties, PlanProperties, RecordBatchStream,
+    SendableRecordBatchStream, Statistics, check_if_same_properties, handle_state,
 };
 
 use arrow::array::{RecordBatch, RecordBatchOptions};
 use arrow::compute::concat_batches;
 use arrow::datatypes::{Fields, Schema, SchemaRef};
 use datafusion_common::stats::Precision;
-use datafusion_common::{internal_err, JoinType, Result, ScalarValue};
-use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    JoinType, Result, ScalarValue, assert_eq_or_internal_err, internal_err,
+};
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_expr::equivalence::join_equivalence_properties;
 
 use async_trait::async_trait;
-use futures::{ready, Stream, StreamExt, TryStreamExt};
+use futures::{Stream, StreamExt, TryStreamExt, ready};
 
 /// Data of the left side that is buffered into memory
 #[derive(Debug)]
@@ -59,7 +63,7 @@ struct JoinLeftData {
     _reservation: MemoryReservation,
 }
 
-#[allow(rustdoc::private_intra_doc_links)]
+#[expect(rustdoc::private_intra_doc_links)]
 /// Cross Join Execution Plan
 ///
 /// This operator is used when there are no predicates between two tables and
@@ -92,7 +96,7 @@ pub struct CrossJoinExec {
     /// Execution plan metrics
     metrics: ExecutionPlanMetricsSet,
     /// Properties such as schema, equivalence properties, ordering, partitioning, etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl CrossJoinExec {
@@ -115,7 +119,7 @@ impl CrossJoinExec {
         };
 
         let schema = Arc::new(Schema::new(all_columns).with_metadata(metadata));
-        let cache = Self::compute_properties(&left, &right, Arc::clone(&schema));
+        let cache = Self::compute_properties(&left, &right, Arc::clone(&schema)).unwrap();
 
         CrossJoinExec {
             left,
@@ -123,7 +127,7 @@ impl CrossJoinExec {
             schema,
             left_fut: Default::default(),
             metrics: ExecutionPlanMetricsSet::default(),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -142,7 +146,7 @@ impl CrossJoinExec {
         left: &Arc<dyn ExecutionPlan>,
         right: &Arc<dyn ExecutionPlan>,
         schema: SchemaRef,
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Calculate equivalence properties
         // TODO: Check equivalence properties of cross join, it may preserve
         //       ordering in some cases.
@@ -154,7 +158,7 @@ impl CrossJoinExec {
             &[false, false],
             None,
             &[],
-        );
+        )?;
 
         // Get output partitioning:
         // TODO: Optimize the cross join implementation to generate M * N
@@ -162,19 +166,25 @@ impl CrossJoinExec {
         let output_partitioning = adjust_right_output_partitioning(
             right.output_partitioning(),
             left.schema().fields.len(),
-        );
+        )?;
 
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             EmissionType::Final,
             boundedness_from_children([left, right]),
-        )
+        ))
     }
 
     /// Returns a new `ExecutionPlan` that computes the same join as this one,
     /// with the left and right inputs swapped using the  specified
     /// `partition_mode`.
+    ///
+    /// # Notes:
+    ///
+    /// This function should be called BEFORE inserting any repartitioning
+    /// operators on the join's children. Check [`super::HashJoinExec::swap_inputs`]
+    /// for more details.
     pub fn swap_inputs(&self) -> Result<Arc<dyn ExecutionPlan>> {
         let new_join =
             CrossJoinExec::new(Arc::clone(&self.right), Arc::clone(&self.left));
@@ -184,6 +194,23 @@ impl CrossJoinExec {
             &self.right.schema(),
         )
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        let left = children.swap_remove(0);
+        let right = children.swap_remove(0);
+
+        Self {
+            left,
+            right,
+            metrics: ExecutionPlanMetricsSet::new(),
+            left_fut: Default::default(),
+            cache: Arc::clone(&self.cache),
+            schema: Arc::clone(&self.schema),
+        }
+    }
 }
 
 /// Asynchronously collect the result of the left child
@@ -198,7 +225,7 @@ async fn load_left_input(
     let (batches, _metrics, reservation) = stream
         .try_fold(
             (Vec::new(), metrics, reservation),
-            |(mut batches, metrics, mut reservation), batch| async {
+            |(mut batches, metrics, reservation), batch| async {
                 let batch_size = batch.get_array_memory_size();
                 // Reserve memory for incoming batch
                 reservation.try_grow(batch_size)?;
@@ -248,7 +275,7 @@ impl ExecutionPlan for CrossJoinExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -260,16 +287,37 @@ impl ExecutionPlan for CrossJoinExec {
         Some(self.metrics.clone_inner())
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // CrossJoin has no join conditions or expressions
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(CrossJoinExec::new(
             Arc::clone(&children[0]),
             Arc::clone(&children[1]),
         )))
     }
 
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        let new_exec = CrossJoinExec {
+            left: Arc::clone(&self.left),
+            right: Arc::clone(&self.right),
+            schema: Arc::clone(&self.schema),
+            left_fut: Default::default(), // reset the build side!
+            metrics: ExecutionPlanMetricsSet::default(),
+            cache: Arc::clone(&self.cache),
+        };
+        Ok(Arc::new(new_exec))
+    }
+
     fn required_input_distribution(&self) -> Vec<Distribution> {
         vec![
             Distribution::SinglePartition,
@@ -282,12 +330,12 @@ impl ExecutionPlan for CrossJoinExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if self.left.output_partitioning().partition_count() != 1 {
-            return internal_err!(
-                "Invalid CrossJoinExec, the output partition count of the left child must be 1,\
+        assert_eq_or_internal_err!(
+            self.left.output_partitioning().partition_count(),
+            1,
+            "Invalid CrossJoinExec, the output partition count of the left child must be 1,\
                  consider using CoalescePartitionsExec or the EnforceDistribution rule"
-            );
-        }
+        );
 
         let stream = self.right.execute(partition, Arc::clone(&context))?;
 
@@ -336,16 +384,13 @@ impl ExecutionPlan for CrossJoinExec {
         }
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         // Get the all partitions statistics of the left
-        let left_stats = self.left.partition_statistics(None)?;
-        let right_stats = self.right.partition_statistics(partition)?;
+        let left_stats = Arc::unwrap_or_clone(self.left.partition_statistics(None)?);
+        let right_stats =
+            Arc::unwrap_or_clone(self.right.partition_statistics(partition)?);
 
-        Ok(stats_cartesian_product(left_stats, right_stats))
+        Ok(Arc::new(stats_cartesian_product(left_stats, right_stats)))
     }
 
     /// Tries to swap the projection with its input [`CrossJoinExec`]. If it can be done,
@@ -429,6 +474,7 @@ fn stats_cartesian_product(
                 })
                 .map(|row_count| s.sum_value.multiply(&row_count))
                 .unwrap_or(Precision::Absent),
+            byte_size: Precision::Absent,
         })
         .chain(right_col_stats.into_iter().map(|s| {
             ColumnStatistics {
@@ -447,6 +493,7 @@ fn stats_cartesian_product(
                     })
                     .map(|row_count| s.sum_value.multiply(&row_count))
                     .unwrap_or(Precision::Absent),
+                byte_size: Precision::Absent,
             }
         }))
         .collect();
@@ -559,7 +606,8 @@ impl<T: BatchTransformer> CrossJoinStream<T> {
                     handle_state!(ready!(self.fetch_probe_batch(cx)))
                 }
                 CrossJoinStreamState::BuildBatches(_) => {
-                    handle_state!(self.build_batches())
+                    let poll = handle_state!(self.build_batches());
+                    self.join_metrics.baseline.record_poll(poll)
                 }
             };
         }
@@ -608,7 +656,7 @@ impl<T: BatchTransformer> CrossJoinStream<T> {
         Poll::Ready(Ok(StatefulStreamResult::Continue))
     }
 
-    /// Joins the the indexed row of left data with the current probe batch.
+    /// Joins the indexed row of left data with the current probe batch.
     /// If all the results are produced, the state is set to fetch new probe batch.
     fn build_batches(&mut self) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
         let right_batch = self.state.try_as_record_batch()?;
@@ -631,8 +679,6 @@ impl<T: BatchTransformer> CrossJoinStream<T> {
                         self.left_index += 1;
                     }
 
-                    self.join_metrics.output_batches.add(1);
-                    self.join_metrics.output_rows.add(batch.num_rows());
                     return Ok(StatefulStreamResult::Ready(Some(batch)));
                 }
             }
@@ -647,7 +693,7 @@ impl<T: BatchTransformer> CrossJoinStream<T> {
 mod tests {
     use super::*;
     use crate::common;
-    use crate::test::build_table_scan_i32;
+    use crate::test::{assert_join_metrics, build_table_scan_i32};
 
     use datafusion_common::{assert_contains, test_util::batches_to_sort_string};
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
@@ -657,14 +703,15 @@ mod tests {
         left: Arc<dyn ExecutionPlan>,
         right: Arc<dyn ExecutionPlan>,
         context: Arc<TaskContext>,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    ) -> Result<(Vec<String>, Vec<RecordBatch>, MetricsSet)> {
         let join = CrossJoinExec::new(left, right);
         let columns_header = columns(&join.schema());
 
         let stream = join.execute(0, context)?;
         let batches = common::collect(stream).await?;
+        let metrics = join.metrics().unwrap();
 
-        Ok((columns_header, batches))
+        Ok((columns_header, batches, metrics))
     }
 
     #[tokio::test]
@@ -684,6 +731,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
@@ -691,6 +739,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -704,6 +753,7 @@ mod tests {
                 min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
                 sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
                 null_count: Precision::Exact(2),
+                byte_size: Precision::Absent,
             }],
         };
 
@@ -721,6 +771,7 @@ mod tests {
                         42 * right_row_count as i64,
                     ))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
@@ -728,6 +779,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(3 * right_row_count),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(3),
@@ -737,6 +789,7 @@ mod tests {
                         20 * left_row_count as i64,
                     ))),
                     null_count: Precision::Exact(2 * left_row_count),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -758,6 +811,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
@@ -765,6 +819,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -778,6 +833,7 @@ mod tests {
                 min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
                 sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
                 null_count: Precision::Exact(2),
+                byte_size: Precision::Absent,
             }],
         };
 
@@ -793,6 +849,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                     sum_value: Precision::Absent, // we don't know the row count on the right
                     null_count: Precision::Absent, // we don't know the row count on the right
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
@@ -800,6 +857,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Absent, // we don't know the row count on the right
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(3),
@@ -809,6 +867,7 @@ mod tests {
                         20 * left_row_count as i64,
                     ))),
                     null_count: Precision::Exact(2 * left_row_count),
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -831,22 +890,24 @@ mod tests {
             ("c2", &vec![14, 15]),
         );
 
-        let (columns, batches) = join_collect(left, right, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(left, right, task_ctx).await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 4  | 7  | 10 | 12 | 14 |
-            | 1  | 4  | 7  | 11 | 13 | 15 |
-            | 2  | 5  | 8  | 10 | 12 | 14 |
-            | 2  | 5  | 8  | 11 | 13 | 15 |
-            | 3  | 6  | 9  | 10 | 12 | 14 |
-            | 3  | 6  | 9  | 11 | 13 | 15 |
-            +----+----+----+----+----+----+
-            "#);
+        assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b2 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 4  | 7  | 10 | 12 | 14 |
+        | 1  | 4  | 7  | 11 | 13 | 15 |
+        | 2  | 5  | 8  | 10 | 12 | 14 |
+        | 2  | 5  | 8  | 11 | 13 | 15 |
+        | 3  | 6  | 9  | 10 | 12 | 14 |
+        | 3  | 6  | 9  | 11 | 13 | 15 |
+        +----+----+----+----+----+----+
+        ");
+
+        assert_join_metrics!(metrics, 6);
 
         Ok(())
     }
@@ -874,7 +935,7 @@ mod tests {
 
         assert_contains!(
             err.to_string(),
-            "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  CrossJoinExec"
+            "Resources exhausted: Additional allocation failed for CrossJoinExec with top memory consumers (across reservations) as:\n  CrossJoinExec"
         );
 
         Ok(())
diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs
similarity index 51%
rename from datafusion/physical-plan/src/joins/hash_join.rs
rename to datafusion/physical-plan/src/joins/hash_join/exec.rs
index 398c2fed7cdf9..038eb96b7b45e 100644
--- a/datafusion/physical-plan/src/joins/hash_join.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs
@@ -15,85 +15,182 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`HashJoinExec`] Partitioned Hash Join Operator
-
+use std::collections::HashSet;
 use std::fmt;
 use std::mem::size_of;
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;
-use std::task::Poll;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{Arc, OnceLock};
 use std::{any::Any, vec};
 
-use super::utils::{
-    asymmetric_join_output_partitioning, get_final_indices_from_shared_bitmap,
-    reorder_output_after_swap, swap_join_projection,
+use crate::ExecutionPlanProperties;
+use crate::execution_plan::{
+    EmissionType, boundedness_from_children, has_same_children_properties,
+    stub_properties,
+};
+use crate::filter_pushdown::{
+    ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
+};
+use crate::joins::Map;
+use crate::joins::array_map::ArrayMap;
+use crate::joins::hash_join::inlist_builder::build_struct_inlist_values;
+use crate::joins::hash_join::shared_bounds::{
+    ColumnBounds, PartitionBounds, PushdownStrategy, SharedBuildAccumulator,
 };
-use super::{
-    utils::{OnceAsync, OnceFut},
-    PartitionMode, SharedBitmapBuilder,
+use crate::joins::hash_join::stream::{
+    BuildSide, BuildSideInitialState, HashJoinStream, HashJoinStreamState,
 };
-use super::{JoinOn, JoinOnRef};
-use crate::execution_plan::{boundedness_from_children, EmissionType};
+use crate::joins::join_hash_map::{JoinHashMapU32, JoinHashMapU64};
+use crate::joins::utils::{
+    OnceAsync, OnceFut, asymmetric_join_output_partitioning, reorder_output_after_swap,
+    swap_join_projection, update_hash,
+};
+use crate::joins::{JoinOn, JoinOnRef, PartitionMode, SharedBitmapBuilder};
+use crate::metrics::{Count, MetricBuilder};
 use crate::projection::{
-    try_embed_projection, try_pushdown_through_join, EmbeddedProjection, JoinData,
-    ProjectionExec,
+    EmbeddedProjection, JoinData, ProjectionExec, try_embed_projection,
+    try_pushdown_through_join,
 };
+use crate::repartition::REPARTITION_RANDOM_STATE;
 use crate::spill::get_record_batch_memory_size;
-use crate::ExecutionPlanProperties;
 use crate::{
+    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
+    PlanProperties, SendableRecordBatchStream, Statistics,
     common::can_project,
-    handle_state,
-    hash_utils::create_hashes,
-    joins::join_hash_map::JoinHashMapOffset,
     joins::utils::{
-        adjust_indices_by_join_type, apply_join_filter_to_indices,
-        build_batch_from_indices, build_join_schema, check_join_is_valid,
-        estimate_join_statistics, need_produce_result_in_final,
-        symmetric_join_output_partitioning, BuildProbeJoinMetrics, ColumnIndex,
-        JoinFilter, JoinHashMap, JoinHashMapType, StatefulStreamResult,
+        BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMapType,
+        build_join_schema, check_join_is_valid, estimate_join_statistics,
+        need_produce_result_in_final, symmetric_join_output_partitioning,
     },
     metrics::{ExecutionPlanMetricsSet, MetricsSet},
-    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
-    PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
 
-use arrow::array::{
-    cast::downcast_array, Array, ArrayRef, BooleanArray, BooleanBufferBuilder,
-    UInt32Array, UInt64Array,
-};
-use arrow::compute::kernels::cmp::{eq, not_distinct};
-use arrow::compute::{and, concat_batches, take, FilterBuilder};
-use arrow::datatypes::{Schema, SchemaRef};
-use arrow::error::ArrowError;
+use arrow::array::{ArrayRef, BooleanBufferBuilder};
+use arrow::compute::concat_batches;
+use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use arrow::util::bit_util;
+use arrow_schema::{DataType, Schema};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::utils::memory::estimate_memory_size;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, plan_err, project_schema, DataFusionError,
-    JoinSide, JoinType, Result,
+    JoinSide, JoinType, NullEquality, Result, assert_or_internal_err, internal_err,
+    plan_err, project_schema,
 };
-use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_execution::TaskContext;
-use datafusion_expr::Operator;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_expr::Accumulator;
+use datafusion_functions_aggregate_common::min_max::{MaxAccumulator, MinAccumulator};
 use datafusion_physical_expr::equivalence::{
-    join_equivalence_properties, ProjectionMapping,
+    ProjectionMapping, join_equivalence_properties,
 };
-use datafusion_physical_expr::PhysicalExprRef;
-use datafusion_physical_expr_common::datum::compare_op_for_nested;
+use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr, lit};
+use datafusion_physical_expr::projection::{ProjectionRef, combine_projections};
+use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef};
 
-use ahash::RandomState;
+use datafusion_common::hash_utils::RandomState;
 use datafusion_physical_expr_common::physical_expr::fmt_sql;
-use futures::{ready, Stream, StreamExt, TryStreamExt};
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
+use futures::TryStreamExt;
 use parking_lot::Mutex;
 
+use super::partitioned_hash_eval::SeededRandomState;
+
 /// Hard-coded seed to ensure hash values from the hash join differ from `RepartitionExec`, avoiding collisions.
-const HASH_JOIN_SEED: RandomState =
-    RandomState::with_seeds('J' as u64, 'O' as u64, 'I' as u64, 'N' as u64);
+pub(crate) const HASH_JOIN_SEED: SeededRandomState =
+    SeededRandomState::with_seed(12210250226015887276);
+
+const ARRAY_MAP_CREATED_COUNT_METRIC_NAME: &str = "array_map_created_count";
+
+#[expect(clippy::too_many_arguments)]
+fn try_create_array_map(
+    bounds: &Option<PartitionBounds>,
+    schema: &SchemaRef,
+    batches: &[RecordBatch],
+    on_left: &[PhysicalExprRef],
+    reservation: &mut MemoryReservation,
+    perfect_hash_join_small_build_threshold: usize,
+    perfect_hash_join_min_key_density: f64,
+    null_equality: NullEquality,
+) -> Result<Option<(ArrayMap, RecordBatch, Vec<ArrayRef>)>> {
+    if on_left.len() != 1 {
+        return Ok(None);
+    }
+
+    if null_equality == NullEquality::NullEqualsNull {
+        for batch in batches.iter() {
+            let arrays = evaluate_expressions_to_arrays(on_left, batch)?;
+            if arrays[0].null_count() > 0 {
+                return Ok(None);
+            }
+        }
+    }
+
+    let (min_val, max_val) = if let Some(bounds) = bounds {
+        let (min_val, max_val) = if let Some(cb) = bounds.get_column_bounds(0) {
+            (cb.min.clone(), cb.max.clone())
+        } else {
+            return Ok(None);
+        };
+
+        if min_val.is_null() || max_val.is_null() {
+            return Ok(None);
+        }
+
+        if min_val > max_val {
+            return internal_err!("min_val>max_val");
+        }
+
+        if let Some((mi, ma)) =
+            ArrayMap::key_to_u64(&min_val).zip(ArrayMap::key_to_u64(&max_val))
+        {
+            (mi, ma)
+        } else {
+            return Ok(None);
+        }
+    } else {
+        return Ok(None);
+    };
+
+    let range = ArrayMap::calculate_range(min_val, max_val);
+    let num_row: usize = batches.iter().map(|x| x.num_rows()).sum();
+
+    // TODO: support create ArrayMap<u64>
+    if num_row >= u32::MAX as usize {
+        return Ok(None);
+    }
+
+    // When the key range spans the full integer domain (e.g. i64::MIN to i64::MAX),
+    // range is u64::MAX and `range + 1` below would overflow.
+    if range == usize::MAX as u64 {
+        return Ok(None);
+    }
+
+    let dense_ratio = (num_row as f64) / ((range + 1) as f64);
+
+    if range >= perfect_hash_join_small_build_threshold as u64
+        && dense_ratio <= perfect_hash_join_min_key_density
+    {
+        return Ok(None);
+    }
+
+    let mem_size = ArrayMap::estimate_memory_size(min_val, max_val, num_row);
+    reservation.try_grow(mem_size)?;
+
+    let batch = concat_batches(schema, batches)?;
+    let left_values = evaluate_expressions_to_arrays(on_left, &batch)?;
+
+    let array_map = ArrayMap::try_new(&left_values[0], min_val, max_val)?;
+
+    Ok(Some((array_map, batch, left_values)))
+}
 
 /// HashTable and input data for the left (build side) of a join
-struct JoinLeftData {
+pub(super) struct JoinLeftData {
     /// The hash table with indices into `batch`
-    hash_map: JoinHashMap,
+    /// Arc is used to allow sharing with SharedBuildAccumulator for hash map pushdown
+    pub(super) map: Arc<Map>,
     /// The input rows for the build side
     batch: RecordBatch,
     /// The build side on expressions values
@@ -108,56 +205,325 @@ struct JoinLeftData {
     /// This could hide potential out-of-memory issues, especially when upstream operators increase their memory consumption.
     /// The MemoryReservation ensures proper tracking of memory resources throughout the join operation's lifecycle.
     _reservation: MemoryReservation,
+    /// Bounds computed from the build side for dynamic filter pushdown.
+    /// If the partition is empty (no rows) this will be None.
+    /// If the partition has some rows this will be Some with the bounds for each join key column.
+    pub(super) bounds: Option<PartitionBounds>,
+    /// Membership testing strategy for filter pushdown
+    /// Contains either InList values for small build sides or hash table reference for large build sides
+    pub(super) membership: PushdownStrategy,
+    /// Shared atomic flag indicating if any probe partition saw data (for null-aware anti joins)
+    /// This is shared across all probe partitions to provide global knowledge
+    pub(super) probe_side_non_empty: AtomicBool,
+    /// Shared atomic flag indicating if any probe partition saw NULL in join keys (for null-aware anti joins)
+    pub(super) probe_side_has_null: AtomicBool,
 }
 
 impl JoinLeftData {
-    /// Create a new `JoinLeftData` from its parts
-    fn new(
-        hash_map: JoinHashMap,
-        batch: RecordBatch,
-        values: Vec<ArrayRef>,
-        visited_indices_bitmap: SharedBitmapBuilder,
-        probe_threads_counter: AtomicUsize,
-        reservation: MemoryReservation,
-    ) -> Self {
-        Self {
-            hash_map,
-            batch,
-            values,
-            visited_indices_bitmap,
-            probe_threads_counter,
-            _reservation: reservation,
-        }
-    }
-
-    /// return a reference to the hash map
-    fn hash_map(&self) -> &JoinHashMap {
-        &self.hash_map
+    /// return a reference to the map
+    pub(super) fn map(&self) -> &Map {
+        &self.map
     }
 
     /// returns a reference to the build side batch
-    fn batch(&self) -> &RecordBatch {
+    pub(super) fn batch(&self) -> &RecordBatch {
         &self.batch
     }
 
     /// returns a reference to the build side expressions values
-    fn values(&self) -> &[ArrayRef] {
+    pub(super) fn values(&self) -> &[ArrayRef] {
         &self.values
     }
 
     /// returns a reference to the visited indices bitmap
-    fn visited_indices_bitmap(&self) -> &SharedBitmapBuilder {
+    pub(super) fn visited_indices_bitmap(&self) -> &SharedBitmapBuilder {
         &self.visited_indices_bitmap
     }
 
+    /// returns a reference to the InList values for filter pushdown
+    pub(super) fn membership(&self) -> &PushdownStrategy {
+        &self.membership
+    }
+
     /// Decrements the counter of running threads, and returns `true`
     /// if caller is the last running thread
-    fn report_probe_completed(&self) -> bool {
+    pub(super) fn report_probe_completed(&self) -> bool {
         self.probe_threads_counter.fetch_sub(1, Ordering::Relaxed) == 1
     }
 }
 
-#[allow(rustdoc::private_intra_doc_links)]
+/// Helps to build [`HashJoinExec`].
+///
+/// Builder can be created from an existing [`HashJoinExec`] using [`From::from`].
+/// In this case, all its fields are inherited. If a field that affects the node's
+/// properties is modified, they will be automatically recomputed during the build.
+///
+/// # Adding setters
+///
+/// When adding a new setter, it is necessary to ensure that the `preserve_properties`
+/// flag is set to false if modifying the field requires a recomputation of the plan's
+/// properties.
+///
+pub struct HashJoinExecBuilder {
+    exec: HashJoinExec,
+    preserve_properties: bool,
+}
+
+impl HashJoinExecBuilder {
+    /// Make a new [`HashJoinExecBuilder`].
+    pub fn new(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        on: Vec<(PhysicalExprRef, PhysicalExprRef)>,
+        join_type: JoinType,
+    ) -> Self {
+        Self {
+            exec: HashJoinExec {
+                left,
+                right,
+                on,
+                filter: None,
+                join_type,
+                left_fut: Default::default(),
+                random_state: HASH_JOIN_SEED,
+                mode: PartitionMode::Auto,
+                fetch: None,
+                metrics: ExecutionPlanMetricsSet::new(),
+                projection: None,
+                column_indices: vec![],
+                null_equality: NullEquality::NullEqualsNothing,
+                null_aware: false,
+                dynamic_filter: None,
+                // Will be computed at when plan will be built.
+                cache: stub_properties(),
+                join_schema: Arc::new(Schema::empty()),
+            },
+            // As `exec` is initialized with stub properties,
+            // they will be properly computed when plan will be built.
+            preserve_properties: false,
+        }
+    }
+
+    /// Set join type.
+    pub fn with_type(mut self, join_type: JoinType) -> Self {
+        self.exec.join_type = join_type;
+        self.preserve_properties = false;
+        self
+    }
+
+    /// Set projection from the vector.
+    pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self {
+        self.with_projection_ref(projection.map(Into::into))
+    }
+
+    /// Set projection from the shared reference.
+    pub fn with_projection_ref(mut self, projection: Option<ProjectionRef>) -> Self {
+        self.exec.projection = projection;
+        self.preserve_properties = false;
+        self
+    }
+
+    /// Set optional filter.
+    pub fn with_filter(mut self, filter: Option<JoinFilter>) -> Self {
+        self.exec.filter = filter;
+        self
+    }
+
+    /// Set expressions to join on.
+    pub fn with_on(mut self, on: Vec<(PhysicalExprRef, PhysicalExprRef)>) -> Self {
+        self.exec.on = on;
+        self.preserve_properties = false;
+        self
+    }
+
+    /// Set partition mode.
+    pub fn with_partition_mode(mut self, mode: PartitionMode) -> Self {
+        self.exec.mode = mode;
+        self.preserve_properties = false;
+        self
+    }
+
+    /// Set null equality property.
+    pub fn with_null_equality(mut self, null_equality: NullEquality) -> Self {
+        self.exec.null_equality = null_equality;
+        self
+    }
+
+    /// Set null aware property.
+    pub fn with_null_aware(mut self, null_aware: bool) -> Self {
+        self.exec.null_aware = null_aware;
+        self
+    }
+
+    /// Set fetch property.
+    pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
+        self.exec.fetch = fetch;
+        self
+    }
+
+    /// Require to recompute plan properties.
+    pub fn recompute_properties(mut self) -> Self {
+        self.preserve_properties = false;
+        self
+    }
+
+    /// Replace children.
+    pub fn with_new_children(
+        mut self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Self> {
+        assert_or_internal_err!(
+            children.len() == 2,
+            "wrong number of children passed into `HashJoinExecBuilder`"
+        );
+        self.preserve_properties &= has_same_children_properties(&self.exec, &children)?;
+        self.exec.right = children.swap_remove(1);
+        self.exec.left = children.swap_remove(0);
+        Ok(self)
+    }
+
+    /// Reset runtime state.
+    pub fn reset_state(mut self) -> Self {
+        self.exec.left_fut = Default::default();
+        self.exec.dynamic_filter = None;
+        self.exec.metrics = ExecutionPlanMetricsSet::new();
+        self
+    }
+
+    /// Build result as a dyn execution plan.
+    pub fn build_exec(self) -> Result<Arc<dyn ExecutionPlan>> {
+        self.build().map(|p| Arc::new(p) as _)
+    }
+
+    /// Build resulting execution plan.
+    pub fn build(self) -> Result<HashJoinExec> {
+        let Self {
+            exec,
+            preserve_properties,
+        } = self;
+
+        // Validate null_aware flag
+        if exec.null_aware {
+            let join_type = exec.join_type();
+            if !matches!(join_type, JoinType::LeftAnti) {
+                return plan_err!(
+                    "null_aware can only be true for LeftAnti joins, got {join_type}"
+                );
+            }
+            let on = exec.on();
+            if on.len() != 1 {
+                return plan_err!(
+                    "null_aware anti join only supports single column join key, got {} columns",
+                    on.len()
+                );
+            }
+        }
+
+        if preserve_properties {
+            return Ok(exec);
+        }
+
+        let HashJoinExec {
+            left,
+            right,
+            on,
+            filter,
+            join_type,
+            left_fut,
+            random_state,
+            mode,
+            metrics,
+            projection,
+            null_equality,
+            null_aware,
+            dynamic_filter,
+            fetch,
+            // Recomputed.
+            join_schema: _,
+            column_indices: _,
+            cache: _,
+        } = exec;
+
+        let left_schema = left.schema();
+        let right_schema = right.schema();
+        if on.is_empty() {
+            return plan_err!("On constraints in HashJoinExec should be non-empty");
+        }
+
+        check_join_is_valid(&left_schema, &right_schema, &on)?;
+        let (join_schema, column_indices) =
+            build_join_schema(&left_schema, &right_schema, &join_type);
+
+        let join_schema = Arc::new(join_schema);
+
+        // Check if the projection is valid.
+        can_project(&join_schema, projection.as_deref())?;
+
+        let cache = HashJoinExec::compute_properties(
+            &left,
+            &right,
+            &join_schema,
+            join_type,
+            &on,
+            mode,
+            projection.as_deref(),
+        )?;
+
+        Ok(HashJoinExec {
+            left,
+            right,
+            on,
+            filter,
+            join_type,
+            join_schema,
+            left_fut,
+            random_state,
+            mode,
+            metrics,
+            projection,
+            column_indices,
+            null_equality,
+            null_aware,
+            cache: Arc::new(cache),
+            dynamic_filter,
+            fetch,
+        })
+    }
+
+    fn with_dynamic_filter(mut self, filter: Option<HashJoinExecDynamicFilter>) -> Self {
+        self.exec.dynamic_filter = filter;
+        self
+    }
+}
+
+impl From<&HashJoinExec> for HashJoinExecBuilder {
+    fn from(exec: &HashJoinExec) -> Self {
+        Self {
+            exec: HashJoinExec {
+                left: Arc::clone(exec.left()),
+                right: Arc::clone(exec.right()),
+                on: exec.on.clone(),
+                filter: exec.filter.clone(),
+                join_type: exec.join_type,
+                join_schema: Arc::clone(&exec.join_schema),
+                left_fut: Arc::clone(&exec.left_fut),
+                random_state: exec.random_state.clone(),
+                mode: exec.mode,
+                metrics: exec.metrics.clone(),
+                projection: exec.projection.clone(),
+                column_indices: exec.column_indices.clone(),
+                null_equality: exec.null_equality,
+                null_aware: exec.null_aware,
+                cache: Arc::clone(&exec.cache),
+                dynamic_filter: exec.dynamic_filter.clone(),
+                fetch: exec.fetch,
+            },
+            preserve_properties: true,
+        }
+    }
+}
+
+#[expect(rustdoc::private_intra_doc_links)]
 /// Join execution plan: Evaluates equijoin predicates in parallel on multiple
 /// partitions using a hash table and an optional filter list to apply post
 /// join.
@@ -172,6 +538,36 @@ impl JoinLeftData {
 /// `<col1> != <col2>`) are known as "filter expressions" and are evaluated
 /// after the equijoin predicates.
 ///
+/// # ArrayMap Optimization
+///
+/// For joins with a single integer-based join key, `HashJoinExec` may use an [`ArrayMap`]
+/// (also known as a "perfect hash join") instead of a general-purpose hash map.
+/// This optimization is used when:
+/// 1. There is exactly one join key.
+/// 2. The join key is an integer type up to 64 bits wide that can be losslessly converted
+///    to `u64` (128-bit integer types such as `i128` and `u128` are not supported).
+/// 3. The range of keys is small enough (controlled by `perfect_hash_join_small_build_threshold`)
+///    OR the keys are sufficiently dense (controlled by `perfect_hash_join_min_key_density`).
+/// 4. build_side.num_rows() < u32::MAX
+/// 5. NullEqualsNothing || (NullEqualsNull && build side doesn't contain null)
+///
+/// See [`try_create_array_map`] for more details.
+///
+/// Note that when using [`PartitionMode::Partitioned`], the build side is split into multiple
+/// partitions. This can cause a dense build side to become sparse within each partition,
+/// potentially disabling this optimization.
+///
+/// For example, consider:
+/// ```sql
+/// SELECT t1.value, t2.value
+/// FROM range(10000) AS t1
+/// JOIN range(10000) AS t2
+///   ON t1.value = t2.value;
+/// ```
+/// With 24 partitions, each partition will only receive a subset of the 10,000 rows.
+/// The first partition might contain values like `3, 10, 18, 39, 43`, which are sparse
+/// relative to the original range, even though the overall data set is dense.
+///
 /// # "Build Side" vs "Probe Side"
 ///
 /// HashJoin takes two inputs, which are referred to as the "build" and the
@@ -205,9 +601,9 @@ impl JoinLeftData {
 ///    Resulting hash table stores hashed join-key fields for each row as a key, and
 ///    indices of corresponding rows in concatenated batch.
 ///
-/// Hash join uses LIFO data structure as a hash table, and in order to retain
-/// original build-side input order while obtaining data during probe phase, hash
-/// table is updated by iterating batch sequence in reverse order -- it allows to
+/// When using the standard `JoinHashMap`, hash join uses LIFO data structure as a hash table,
+/// and in order to retain original build-side input order while obtaining data during probe phase,
+/// hash table is updated by iterating batch sequence in reverse order -- it allows to
 /// keep rows with smaller indices "on the top" of hash table, and still maintain
 /// correct indexing for concatenated build-side data batch.
 ///
@@ -240,7 +636,6 @@ impl JoinLeftData {
 ///            └───────┘                                                    │          └───────┘        │
 ///                                                                         │                           │
 ///                                                                         └───────────────────────────┘
-///
 /// ```
 ///
 /// 2. the **probe phase** where the tuples of the probe side are streamed
@@ -275,7 +670,6 @@ impl JoinLeftData {
 ///     └────────────┘                                            └────────────┘
 ///
 ///        build side                                                probe side
-///
 /// ```
 ///
 /// # Example "Optimal" Plans
@@ -321,7 +715,6 @@ impl JoinLeftData {
 /// Note this structure includes a [`OnceAsync`] that is used to coordinate the
 /// loading of the left side with the processing in each output stream.
 /// Therefore it can not be [`Clone`]
-#[derive(Debug)]
 pub struct HashJoinExec {
     /// left (build) side which gets hashed
     pub left: Arc<dyn ExecutionPlan>,
@@ -342,32 +735,74 @@ pub struct HashJoinExec {
     ///
     /// Each output stream waits on the `OnceAsync` to signal the completion of
     /// the hash table creation.
-    left_fut: OnceAsync<JoinLeftData>,
-    /// Shared the `RandomState` for the hashing algorithm
-    random_state: RandomState,
+    left_fut: Arc<OnceAsync<JoinLeftData>>,
+    /// Shared the `SeededRandomState` for the hashing algorithm (seeds preserved for serialization)
+    random_state: SeededRandomState,
     /// Partitioning mode to use
     pub mode: PartitionMode,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// The projection indices of the columns in the output schema of join
-    pub projection: Option<Vec<usize>>,
+    pub projection: Option<ProjectionRef>,
     /// Information of index and left / right placement of columns
     column_indices: Vec<ColumnIndex>,
-    /// Null matching behavior: If `null_equals_null` is true, rows that have
-    /// `null`s in both left and right equijoin columns will be matched.
-    /// Otherwise, rows that have `null`s in the join columns will not be
-    /// matched and thus will not appear in the output.
-    pub null_equals_null: bool,
+    /// The equality null-handling behavior of the join algorithm.
+    pub null_equality: NullEquality,
+    /// Flag to indicate if this is a null-aware anti join
+    pub null_aware: bool,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+    /// Dynamic filter for pushing down to the probe side
+    /// Set when dynamic filter pushdown is detected in handle_child_pushdown_result.
+    /// HashJoinExec also needs to keep a shared bounds accumulator for coordinating updates.
+    dynamic_filter: Option<HashJoinExecDynamicFilter>,
+    /// Maximum number of rows to return
+    fetch: Option<usize>,
+}
+
+#[derive(Clone)]
+struct HashJoinExecDynamicFilter {
+    /// Dynamic filter that we'll update with the results of the build side once that is done.
+    filter: Arc<DynamicFilterPhysicalExpr>,
+    /// Build accumulator to collect build-side information (hash maps and/or bounds) from each partition.
+    /// It is lazily initialized during execution to make sure we use the actual execution time partition counts.
+    build_accumulator: OnceLock<Arc<SharedBuildAccumulator>>,
+}
+
+impl fmt::Debug for HashJoinExec {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("HashJoinExec")
+            .field("left", &self.left)
+            .field("right", &self.right)
+            .field("on", &self.on)
+            .field("filter", &self.filter)
+            .field("join_type", &self.join_type)
+            .field("join_schema", &self.join_schema)
+            .field("left_fut", &self.left_fut)
+            .field("random_state", &self.random_state)
+            .field("mode", &self.mode)
+            .field("metrics", &self.metrics)
+            .field("projection", &self.projection)
+            .field("column_indices", &self.column_indices)
+            .field("null_equality", &self.null_equality)
+            .field("cache", &self.cache)
+            // Explicitly exclude dynamic_filter to avoid runtime state differences in tests
+            .finish()
+    }
+}
+
+impl EmbeddedProjection for HashJoinExec {
+    fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
+        self.with_projection(projection)
+    }
 }
 
 impl HashJoinExec {
-    /// Tries to create a new [HashJoinExec].
+    /// Tries to create a new [`HashJoinExec`].
     ///
     /// # Error
     /// This function errors when it is not possible to join the left and right sides on keys `on`.
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     pub fn try_new(
         left: Arc<dyn ExecutionPlan>,
         right: Arc<dyn ExecutionPlan>,
@@ -376,52 +811,54 @@ impl HashJoinExec {
         join_type: &JoinType,
         projection: Option<Vec<usize>>,
         partition_mode: PartitionMode,
-        null_equals_null: bool,
+        null_equality: NullEquality,
+        null_aware: bool,
     ) -> Result<Self> {
-        let left_schema = left.schema();
-        let right_schema = right.schema();
-        if on.is_empty() {
-            return plan_err!("On constraints in HashJoinExec should be non-empty");
-        }
-
-        check_join_is_valid(&left_schema, &right_schema, &on)?;
-
-        let (join_schema, column_indices) =
-            build_join_schema(&left_schema, &right_schema, join_type);
+        HashJoinExecBuilder::new(left, right, on, *join_type)
+            .with_filter(filter)
+            .with_projection(projection)
+            .with_partition_mode(partition_mode)
+            .with_null_equality(null_equality)
+            .with_null_aware(null_aware)
+            .build()
+    }
 
-        let random_state = HASH_JOIN_SEED;
+    /// Create a builder based on the existing [`HashJoinExec`].
+    ///
+    /// Returned builder preserves all existing fields. If a field requiring properties
+    /// recomputation is modified, this will be done automatically during the node build.
+    ///
+    pub fn builder(&self) -> HashJoinExecBuilder {
+        self.into()
+    }
 
-        let join_schema = Arc::new(join_schema);
+    fn create_dynamic_filter(on: &JoinOn) -> Arc<DynamicFilterPhysicalExpr> {
+        // Extract the right-side keys (probe side keys) from the `on` clauses
+        // Dynamic filter will be created from build side values (left side) and applied to probe side (right side)
+        let right_keys: Vec<_> = on.iter().map(|(_, r)| Arc::clone(r)).collect();
+        // Initialize with a placeholder expression (true) that will be updated when the hash table is built
+        Arc::new(DynamicFilterPhysicalExpr::new(right_keys, lit(true)))
+    }
 
-        //  check if the projection is valid
-        can_project(&join_schema, projection.as_ref())?;
+    fn allow_join_dynamic_filter_pushdown(&self, config: &ConfigOptions) -> bool {
+        let (_, probe_preserved) = self.join_type.on_lr_is_preserved();
+        if !probe_preserved || !config.optimizer.enable_join_dynamic_filter_pushdown {
+            return false;
+        }
 
-        let cache = Self::compute_properties(
-            &left,
-            &right,
-            Arc::clone(&join_schema),
-            *join_type,
-            &on,
-            partition_mode,
-            projection.as_ref(),
-        )?;
+        // `preserve_file_partitions` can report Hash partitioning for Hive-style
+        // file groups, but those partitions are not actually hash-distributed.
+        // Partitioned dynamic filters rely on hash routing, so disable them in
+        // this mode to avoid incorrect results. Follow-up work: enable dynamic
+        // filtering for preserve_file_partitioned scans (issue #20195).
+        // https://github.com/apache/datafusion/issues/20195
+        if config.optimizer.preserve_file_partitions > 0
+            && self.mode == PartitionMode::Partitioned
+        {
+            return false;
+        }
 
-        Ok(HashJoinExec {
-            left,
-            right,
-            on,
-            filter,
-            join_type: *join_type,
-            join_schema,
-            left_fut: Default::default(),
-            random_state,
-            mode: partition_mode,
-            metrics: ExecutionPlanMetricsSet::new(),
-            projection,
-            column_indices,
-            null_equals_null,
-            cache,
-        })
+        true
     }
 
     /// left (build) side which gets hashed
@@ -460,9 +897,18 @@ impl HashJoinExec {
         &self.mode
     }
 
-    /// Get null_equals_null
-    pub fn null_equals_null(&self) -> bool {
-        self.null_equals_null
+    /// Get null_equality
+    pub fn null_equality(&self) -> NullEquality {
+        self.null_equality
+    }
+
+    /// Get the dynamic filter expression for testing purposes.
+    /// Returns `None` if no dynamic filter has been set.
+    ///
+    /// This method is intended for testing only and should not be used in production code.
+    #[doc(hidden)]
+    pub fn dynamic_filter_for_test(&self) -> Option<&Arc<DynamicFilterPhysicalExpr>> {
+        self.dynamic_filter.as_ref().map(|df| &df.filter)
     }
 
     /// Calculate order preservation flags for this hash join.
@@ -475,6 +921,7 @@ impl HashJoinExec {
                     | JoinType::Right
                     | JoinType::RightAnti
                     | JoinType::RightSemi
+                    | JoinType::RightMark
             ),
         ]
     }
@@ -492,57 +939,44 @@ impl HashJoinExec {
 
     /// Return new instance of [HashJoinExec] with the given projection.
     pub fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
+        let projection = projection.map(Into::into);
         //  check if the projection is valid
-        can_project(&self.schema(), projection.as_ref())?;
-        let projection = match projection {
-            Some(projection) => match &self.projection {
-                Some(p) => Some(projection.iter().map(|i| p[*i]).collect()),
-                None => Some(projection),
-            },
-            None => None,
-        };
-        Self::try_new(
-            Arc::clone(&self.left),
-            Arc::clone(&self.right),
-            self.on.clone(),
-            self.filter.clone(),
-            &self.join_type,
-            projection,
-            self.mode,
-            self.null_equals_null,
-        )
+        can_project(&self.schema(), projection.as_deref())?;
+        let projection =
+            combine_projections(projection.as_ref(), self.projection.as_ref())?;
+        self.builder().with_projection_ref(projection).build()
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
         left: &Arc<dyn ExecutionPlan>,
         right: &Arc<dyn ExecutionPlan>,
-        schema: SchemaRef,
+        schema: &SchemaRef,
         join_type: JoinType,
         on: JoinOnRef,
         mode: PartitionMode,
-        projection: Option<&Vec<usize>>,
+        projection: Option<&[usize]>,
     ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
         let mut eq_properties = join_equivalence_properties(
             left.equivalence_properties().clone(),
             right.equivalence_properties().clone(),
             &join_type,
-            Arc::clone(&schema),
+            Arc::clone(schema),
             &Self::maintains_input_order(join_type),
             Some(Self::probe_side()),
             on,
-        );
+        )?;
 
         let mut output_partitioning = match mode {
             PartitionMode::CollectLeft => {
-                asymmetric_join_output_partitioning(left, right, &join_type)
+                asymmetric_join_output_partitioning(left, right, &join_type)?
             }
             PartitionMode::Auto => Partitioning::UnknownPartitioning(
                 right.output_partitioning().partition_count(),
             ),
             PartitionMode::Partitioned => {
-                symmetric_join_output_partitioning(left, right, &join_type)
+                symmetric_join_output_partitioning(left, right, &join_type)?
             }
         };
 
@@ -556,7 +990,8 @@ impl HashJoinExec {
                 | JoinType::LeftSemi
                 | JoinType::RightSemi
                 | JoinType::Right
-                | JoinType::RightAnti => EmissionType::Incremental,
+                | JoinType::RightAnti
+                | JoinType::RightMark => EmissionType::Incremental,
                 // If we need to generate unmatched rows from the *build side*,
                 // we need to emit them at the end.
                 JoinType::Left
@@ -571,9 +1006,8 @@ impl HashJoinExec {
         // If contains projection, update the PlanProperties.
         if let Some(projection) = projection {
             // construct a map from the input expressions to the output expression of the Projection
-            let projection_mapping =
-                ProjectionMapping::from_indices(projection, &schema)?;
-            let out_schema = project_schema(&schema, Some(projection))?;
+            let projection_mapping = ProjectionMapping::from_indices(projection, schema)?;
+            let out_schema = project_schema(schema, Some(&projection))?;
             output_partitioning =
                 output_partitioning.project(&projection_mapping, &eq_properties);
             eq_properties = eq_properties.project(&projection_mapping, out_schema);
@@ -595,30 +1029,46 @@ impl HashJoinExec {
     ///
     /// This function is public so other downstream projects can use it to
     /// construct `HashJoinExec` with right side as the build side.
+    ///
+    /// For using this interface directly, please refer to below:
+    ///
+    /// Hash join execution may require specific input partitioning (for example,
+    /// the left child may have a single partition while the right child has multiple).
+    ///
+    /// Calling this function on join nodes whose children have already been repartitioned
+    /// (e.g., after a `RepartitionExec` has been inserted) may break the partitioning
+    /// requirements of the hash join. Therefore, ensure you call this function
+    /// before inserting any repartitioning operators on the join's children.
+    ///
+    /// In DataFusion's default SQL interface, this function is used by the `JoinSelection`
+    /// physical optimizer rule to determine a good join order, which is
+    /// executed before the `EnforceDistribution` rule (the rule that may
+    /// insert `RepartitionExec` operators).
     pub fn swap_inputs(
         &self,
         partition_mode: PartitionMode,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let left = self.left();
         let right = self.right();
-        let new_join = HashJoinExec::try_new(
-            Arc::clone(right),
-            Arc::clone(left),
-            self.on()
-                .iter()
-                .map(|(l, r)| (Arc::clone(r), Arc::clone(l)))
-                .collect(),
-            self.filter().map(JoinFilter::swap),
-            &self.join_type().swap(),
-            swap_join_projection(
+        let new_join = self
+            .builder()
+            .with_type(self.join_type.swap())
+            .with_new_children(vec![Arc::clone(right), Arc::clone(left)])?
+            .with_on(
+                self.on()
+                    .iter()
+                    .map(|(l, r)| (Arc::clone(r), Arc::clone(l)))
+                    .collect(),
+            )
+            .with_filter(self.filter().map(JoinFilter::swap))
+            .with_projection(swap_join_projection(
                 left.schema().fields().len(),
                 right.schema().fields().len(),
-                self.projection.as_ref(),
+                self.projection.as_deref(),
                 self.join_type(),
-            ),
-            partition_mode,
-            self.null_equals_null(),
-        )?;
+            ))
+            .with_partition_mode(partition_mode)
+            .build()?;
         // In case of anti / semi joins or if there is embedded projection in HashJoinExec, output column order is preserved, no need to add projection again
         if matches!(
             self.join_type(),
@@ -626,6 +1076,8 @@ impl HashJoinExec {
                 | JoinType::RightSemi
                 | JoinType::LeftAnti
                 | JoinType::RightAnti
+                | JoinType::LeftMark
+                | JoinType::RightMark
         ) || self.projection.is_some()
         {
             Ok(Arc::new(new_join))
@@ -661,6 +1113,15 @@ impl DisplayAs for HashJoinExec {
                 } else {
                     "".to_string()
                 };
+                let display_null_equality =
+                    if self.null_equality() == NullEquality::NullEqualsNull {
+                        ", NullsEqual: true"
+                    } else {
+                        ""
+                    };
+                let display_fetch = self
+                    .fetch
+                    .map_or_else(String::new, |f| format!(", fetch={f}"));
                 let on = self
                     .on
                     .iter()
@@ -669,8 +1130,14 @@ impl DisplayAs for HashJoinExec {
                     .join(", ");
                 write!(
                     f,
-                    "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}",
-                    self.mode, self.join_type, on, display_filter, display_projections
+                    "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}{}{}",
+                    self.mode,
+                    self.join_type,
+                    on,
+                    display_filter,
+                    display_projections,
+                    display_null_equality,
+                    display_fetch,
                 )
             }
             DisplayFormatType::TreeRender => {
@@ -686,7 +1153,22 @@ impl DisplayAs for HashJoinExec {
                 if *self.join_type() != JoinType::Inner {
                     writeln!(f, "join_type={:?}", self.join_type)?;
                 }
-                writeln!(f, "on={on}")
+
+                writeln!(f, "on={on}")?;
+
+                if self.null_equality() == NullEquality::NullEqualsNull {
+                    writeln!(f, "NullsEqual: true")?;
+                }
+
+                if let Some(filter) = self.filter.as_ref() {
+                    writeln!(f, "filter={filter}")?;
+                }
+
+                if let Some(fetch) = self.fetch {
+                    writeln!(f, "fetch={fetch}")?;
+                }
+
+                Ok(())
             }
         }
     }
@@ -701,7 +1183,7 @@ impl ExecutionPlan for HashJoinExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -753,20 +1235,44 @@ impl ExecutionPlan for HashJoinExec {
         vec![&self.left, &self.right]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to join key expressions from both sides
+        let mut tnr = TreeNodeRecursion::Continue;
+        for (left, right) in &self.on {
+            tnr = tnr.visit_sibling(|| f(left.as_ref()))?;
+            tnr = tnr.visit_sibling(|| f(right.as_ref()))?;
+        }
+
+        // Apply to join filter expression if present
+        if let Some(filter) = &self.filter {
+            tnr = tnr.visit_sibling(|| f(filter.expression().as_ref()))?;
+        }
+
+        // Apply to dynamic filter expression if present
+        if let Some(df) = &self.dynamic_filter {
+            tnr = tnr.visit_sibling(|| f(df.filter.as_ref()))?;
+        }
+
+        Ok(tnr)
+    }
+
+    /// Creates a new HashJoinExec with different children while preserving configuration.
+    ///
+    /// This method is called during query optimization when the optimizer creates new
+    /// plan nodes. Importantly, it creates a fresh bounds_accumulator via `try_new`
+    /// rather than cloning the existing one because partitioning may have changed.
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(HashJoinExec::try_new(
-            Arc::clone(&children[0]),
-            Arc::clone(&children[1]),
-            self.on.clone(),
-            self.filter.clone(),
-            &self.join_type,
-            self.projection.clone(),
-            self.mode,
-            self.null_equals_null,
-        )?))
+        self.builder().with_new_children(children)?.build_exec()
+    }
+
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        self.builder().reset_state().build_exec()
     }
 
     fn execute(
@@ -779,30 +1285,40 @@ impl ExecutionPlan for HashJoinExec {
             .iter()
             .map(|on| Arc::clone(&on.0))
             .collect::<Vec<_>>();
-        let on_right = self
-            .on
-            .iter()
-            .map(|on| Arc::clone(&on.1))
-            .collect::<Vec<_>>();
         let left_partitions = self.left.output_partitioning().partition_count();
         let right_partitions = self.right.output_partitioning().partition_count();
 
-        if self.mode == PartitionMode::Partitioned && left_partitions != right_partitions
-        {
-            return internal_err!(
-                "Invalid HashJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
-                 consider using RepartitionExec"
-            );
-        }
+        assert_or_internal_err!(
+            self.mode != PartitionMode::Partitioned
+                || left_partitions == right_partitions,
+            "Invalid HashJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
+             consider using RepartitionExec"
+        );
 
-        if self.mode == PartitionMode::CollectLeft && left_partitions != 1 {
-            return internal_err!(
-                "Invalid HashJoinExec, the output partition count of the left child must be 1 in CollectLeft mode,\
-                 consider using CoalescePartitionsExec or the EnforceDistribution rule"
-            );
-        }
+        assert_or_internal_err!(
+            self.mode != PartitionMode::CollectLeft || left_partitions == 1,
+            "Invalid HashJoinExec, the output partition count of the left child must be 1 in CollectLeft mode,\
+             consider using CoalescePartitionsExec or the EnforceDistribution rule"
+        );
+
+        // Only enable dynamic filter pushdown if:
+        // - The session config enables dynamic filter pushdown
+        // - A dynamic filter exists
+        // - At least one consumer is holding a reference to it, this avoids expensive filter
+        //   computation when disabled or when no consumer will use it.
+        let enable_dynamic_filter_pushdown = self
+            .allow_join_dynamic_filter_pushdown(context.session_config().options())
+            && self
+                .dynamic_filter
+                .as_ref()
+                .map(|df| df.filter.is_used())
+                .unwrap_or(false);
 
         let join_metrics = BuildProbeJoinMetrics::new(partition, &self.metrics);
+
+        let array_map_created_count = MetricBuilder::new(&self.metrics)
+            .counter(ARRAY_MAP_CREATED_COUNT_METRIC_NAME, partition);
+
         let left_fut = match self.mode {
             PartitionMode::CollectLeft => self.left_fut.try_once(|| {
                 let left_stream = self.left.execute(0, Arc::clone(&context))?;
@@ -811,13 +1327,17 @@ impl ExecutionPlan for HashJoinExec {
                     MemoryConsumer::new("HashJoinInput").register(context.memory_pool());
 
                 Ok(collect_left_input(
-                    self.random_state.clone(),
+                    self.random_state.random_state().clone(),
                     left_stream,
                     on_left.clone(),
                     join_metrics.clone(),
                     reservation,
                     need_produce_result_in_final(self.join_type),
                     self.right().output_partitioning().partition_count(),
+                    enable_dynamic_filter_pushdown,
+                    Arc::clone(context.session_config().options()),
+                    self.null_equality,
+                    array_map_created_count,
                 ))
             })?,
             PartitionMode::Partitioned => {
@@ -828,13 +1348,17 @@ impl ExecutionPlan for HashJoinExec {
                         .register(context.memory_pool());
 
                 OnceFut::new(collect_left_input(
-                    self.random_state.clone(),
+                    self.random_state.random_state().clone(),
                     left_stream,
                     on_left.clone(),
                     join_metrics.clone(),
                     reservation,
                     need_produce_result_in_final(self.join_type),
                     1,
+                    enable_dynamic_filter_pushdown,
+                    Arc::clone(context.session_config().options()),
+                    self.null_equality,
+                    array_map_created_count,
                 ))
             }
             PartitionMode::Auto => {
@@ -847,12 +1371,39 @@ impl ExecutionPlan for HashJoinExec {
 
         let batch_size = context.session_config().batch_size();
 
+        // Initialize build_accumulator lazily with runtime partition counts (only if enabled)
+        // Use RepartitionExec's random state (seeds: 0,0,0,0) for partition routing
+        let repartition_random_state = REPARTITION_RANDOM_STATE;
+        let build_accumulator = enable_dynamic_filter_pushdown
+            .then(|| {
+                self.dynamic_filter.as_ref().map(|df| {
+                    let filter = Arc::clone(&df.filter);
+                    let on_right = self
+                        .on
+                        .iter()
+                        .map(|(_, right_expr)| Arc::clone(right_expr))
+                        .collect::<Vec<_>>();
+                    Some(Arc::clone(df.build_accumulator.get_or_init(|| {
+                        Arc::new(SharedBuildAccumulator::new_from_partition_mode(
+                            self.mode,
+                            self.left.as_ref(),
+                            self.right.as_ref(),
+                            filter,
+                            on_right,
+                            repartition_random_state,
+                        ))
+                    })))
+                })
+            })
+            .flatten()
+            .flatten();
+
         // we have the batches and the hash map with their keys. We can how create a stream
         // over the right that uses this information to issue new batches.
         let right_stream = self.right.execute(partition, context)?;
 
         // update column indices to reflect the projection
-        let column_indices_after_projection = match &self.projection {
+        let column_indices_after_projection = match self.projection.as_ref() {
             Some(projection) => projection
                 .iter()
                 .map(|i| self.column_indices[*i].clone())
@@ -860,48 +1411,93 @@ impl ExecutionPlan for HashJoinExec {
             None => self.column_indices.clone(),
         };
 
-        Ok(Box::pin(HashJoinStream {
-            schema: self.schema(),
+        let on_right = self
+            .on
+            .iter()
+            .map(|(_, right_expr)| Arc::clone(right_expr))
+            .collect::<Vec<_>>();
+
+        Ok(Box::pin(HashJoinStream::new(
+            partition,
+            self.schema(),
             on_right,
-            filter: self.filter.clone(),
-            join_type: self.join_type,
-            right: right_stream,
-            column_indices: column_indices_after_projection,
-            random_state: self.random_state.clone(),
+            self.filter.clone(),
+            self.join_type,
+            right_stream,
+            self.random_state.random_state().clone(),
             join_metrics,
-            null_equals_null: self.null_equals_null,
-            state: HashJoinStreamState::WaitBuildSide,
-            build_side: BuildSide::Initial(BuildSideInitialState { left_fut }),
+            column_indices_after_projection,
+            self.null_equality,
+            HashJoinStreamState::WaitBuildSide,
+            BuildSide::Initial(BuildSideInitialState { left_fut }),
             batch_size,
-            hashes_buffer: vec![],
-            right_side_ordered: self.right.output_ordering().is_some(),
-        }))
+            vec![],
+            self.right.output_ordering().is_some(),
+            build_accumulator,
+            self.mode,
+            self.null_aware,
+            self.fetch,
+        )))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let stats = match (partition, self.mode) {
+            // For CollectLeft mode, the left side is collected into a single partition,
+            // so all left partitions are available to each output partition.
+            // For the right side, we need the specific partition statistics.
+            (Some(partition), PartitionMode::CollectLeft) => {
+                let left_stats = self.left.partition_statistics(None)?;
+                let right_stats = self.right.partition_statistics(Some(partition))?;
+
+                estimate_join_statistics(
+                    (*left_stats).clone(),
+                    (*right_stats).clone(),
+                    &self.on,
+                    &self.join_type,
+                    &self.join_schema,
+                )?
+            }
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
-        }
-        // TODO stats: it is not possible in general to know the output size of joins
-        // There are some special cases though, for example:
-        // - `A LEFT JOIN B ON A.col=B.col` with `COUNT_DISTINCT(B.col)=COUNT(B.col)`
-        let stats = estimate_join_statistics(
-            self.left.partition_statistics(None)?,
-            self.right.partition_statistics(None)?,
-            self.on.clone(),
-            &self.join_type,
-            &self.join_schema,
-        )?;
+            // For Partitioned mode, both sides are partitioned, so each output partition
+            // only has access to the corresponding partition from both sides.
+            (Some(partition), PartitionMode::Partitioned) => {
+                let left_stats = self.left.partition_statistics(Some(partition))?;
+                let right_stats = self.right.partition_statistics(Some(partition))?;
+
+                estimate_join_statistics(
+                    (*left_stats).clone(),
+                    (*right_stats).clone(),
+                    &self.on,
+                    &self.join_type,
+                    &self.join_schema,
+                )?
+            }
+
+            // For Auto mode or when no specific partition is requested, fall back to
+            // the current behavior of getting all partition statistics.
+            (None, _) | (Some(_), PartitionMode::Auto) => {
+                // TODO stats: it is not possible in general to know the output size of joins
+                // There are some special cases though, for example:
+                // - `A LEFT JOIN B ON A.col=B.col` with `COUNT_DISTINCT(B.col)=COUNT(B.col)`
+                let left_stats = self.left.partition_statistics(None)?;
+                let right_stats = self.right.partition_statistics(None)?;
+                estimate_join_statistics(
+                    (*left_stats).clone(),
+                    (*right_stats).clone(),
+                    &self.on,
+                    &self.join_type,
+                    &self.join_schema,
+                )?
+            }
+        };
         // Project statistics if there is a projection
-        Ok(stats.project(self.projection.as_ref()))
+        let stats = stats.project(self.projection.as_ref());
+        // Apply fetch limit to statistics
+        Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?))
     }
 
     /// Tries to push `projection` down through `hash_join`. If possible, performs the
@@ -916,6 +1512,7 @@ impl ExecutionPlan for HashJoinExec {
             return Ok(None);
         }
 
+        let schema = self.schema();
         if let Some(JoinData {
             projected_left_child,
             projected_right_child,
@@ -926,759 +1523,642 @@ impl ExecutionPlan for HashJoinExec {
             self.left(),
             self.right(),
             self.on(),
-            self.schema(),
+            &schema,
             self.filter(),
         )? {
-            Ok(Some(Arc::new(HashJoinExec::try_new(
-                Arc::new(projected_left_child),
-                Arc::new(projected_right_child),
-                join_on,
-                join_filter,
-                self.join_type(),
+            self.builder()
+                .with_new_children(vec![
+                    Arc::new(projected_left_child),
+                    Arc::new(projected_right_child),
+                ])?
+                .with_on(join_on)
+                .with_filter(join_filter)
                 // Returned early if projection is not None
-                None,
-                *self.partition_mode(),
-                self.null_equals_null,
-            )?)))
+                .with_projection(None)
+                .build_exec()
+                .map(Some)
         } else {
             try_embed_projection(projection, self)
         }
     }
-}
-
-/// Reads the left (build) side of the input, buffering it in memory, to build a
-/// hash table (`LeftJoinData`)
-async fn collect_left_input(
-    random_state: RandomState,
-    left_stream: SendableRecordBatchStream,
-    on_left: Vec<PhysicalExprRef>,
-    metrics: BuildProbeJoinMetrics,
-    reservation: MemoryReservation,
-    with_visited_indices_bitmap: bool,
-    probe_threads_count: usize,
-) -> Result<JoinLeftData> {
-    let schema = left_stream.schema();
 
-    // This operation performs 2 steps at once:
-    // 1. creates a [JoinHashMap] of all batches from the stream
-    // 2. stores the batches in a vector.
-    let initial = (Vec::new(), 0, metrics, reservation);
-    let (batches, num_rows, metrics, mut reservation) = left_stream
-        .try_fold(initial, |mut acc, batch| async {
-            let batch_size = get_record_batch_memory_size(&batch);
-            // Reserve memory for incoming batch
-            acc.3.try_grow(batch_size)?;
-            // Update metrics
-            acc.2.build_mem_used.add(batch_size);
-            acc.2.build_input_batches.add(1);
-            acc.2.build_input_rows.add(batch.num_rows());
-            // Update row count
-            acc.1 += batch.num_rows();
-            // Push batch to output
-            acc.0.push(batch);
-            Ok(acc)
-        })
-        .await?;
+    fn gather_filters_for_pushdown(
+        &self,
+        phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        // This is the physical-plan equivalent of `push_down_all_join` in
+        // `datafusion/optimizer/src/push_down_filter.rs`. That function uses `lr_is_preserved`
+        // to decide which parent predicates can be pushed past a logical join to its children,
+        // then checks column references to route each predicate to the correct side.
+        //
+        // We apply the same two-level logic here:
+        // 1. `lr_is_preserved` gates whether a side is eligible at all.
+        // 2. For each filter, we check that all column references belong to the
+        //    target child (using `column_indices` to map output column positions
+        //    to join sides). This is critical for correctness: name-based matching
+        //    alone (as done by `ChildFilterDescription::from_child`) can incorrectly
+        //    push filters when different join sides have columns with the same name
+        //    (e.g. nested mark joins both producing "mark" columns).
+        let (left_preserved, right_preserved) = lr_is_preserved(self.join_type);
+
+        // Build the set of allowed column indices for each side
+        let column_indices: Vec<ColumnIndex> = match self.projection.as_ref() {
+            Some(projection) => projection
+                .iter()
+                .map(|i| self.column_indices[*i].clone())
+                .collect(),
+            None => self.column_indices.clone(),
+        };
 
-    // Estimation of memory size, required for hashtable, prior to allocation.
-    // Final result can be verified using `RawTable.allocation_info()`
-    let fixed_size = size_of::<JoinHashMap>();
-    let estimated_hashtable_size =
-        estimate_memory_size::<(u64, u64)>(num_rows, fixed_size)?;
-
-    reservation.try_grow(estimated_hashtable_size)?;
-    metrics.build_mem_used.add(estimated_hashtable_size);
-
-    let mut hashmap = JoinHashMap::with_capacity(num_rows);
-    let mut hashes_buffer = Vec::new();
-    let mut offset = 0;
-
-    // Updating hashmap starting from the last batch
-    let batches_iter = batches.iter().rev();
-    for batch in batches_iter.clone() {
-        hashes_buffer.clear();
-        hashes_buffer.resize(batch.num_rows(), 0);
-        update_hash(
-            &on_left,
-            batch,
-            &mut hashmap,
-            offset,
-            &random_state,
-            &mut hashes_buffer,
-            0,
-            true,
-        )?;
-        offset += batch.num_rows();
-    }
-    // Merge all batches into a single batch, so we can directly index into the arrays
-    let single_batch = concat_batches(&schema, batches_iter)?;
+        let (mut left_allowed, mut right_allowed) = (HashSet::new(), HashSet::new());
+        column_indices
+            .iter()
+            .enumerate()
+            .for_each(|(output_idx, ci)| {
+                match ci.side {
+                    JoinSide::Left => left_allowed.insert(output_idx),
+                    JoinSide::Right => right_allowed.insert(output_idx),
+                    // Mark columns - don't allow pushdown to either side
+                    JoinSide::None => false,
+                };
+            });
 
-    // Reserve additional memory for visited indices bitmap and create shared builder
-    let visited_indices_bitmap = if with_visited_indices_bitmap {
-        let bitmap_size = bit_util::ceil(single_batch.num_rows(), 8);
-        reservation.try_grow(bitmap_size)?;
-        metrics.build_mem_used.add(bitmap_size);
+        // For semi/anti joins, the non-preserved side's columns are not in the
+        // output, but filters on join key columns can still be pushed there.
+        // We find output columns that are join keys on the preserved side and
+        // add their output indices to the non-preserved side's allowed set.
+        // The name-based remap in FilterRemapper will then match them to the
+        // corresponding column in the non-preserved child's schema.
+        match self.join_type {
+            JoinType::LeftSemi | JoinType::LeftAnti => {
+                let left_key_indices: HashSet<usize> = self
+                    .on
+                    .iter()
+                    .filter_map(|(left_key, _)| {
+                        left_key
+                            .as_any()
+                            .downcast_ref::<Column>()
+                            .map(|c| c.index())
+                    })
+                    .collect();
+                for (output_idx, ci) in column_indices.iter().enumerate() {
+                    if ci.side == JoinSide::Left && left_key_indices.contains(&ci.index) {
+                        right_allowed.insert(output_idx);
+                    }
+                }
+            }
+            JoinType::RightSemi | JoinType::RightAnti => {
+                let right_key_indices: HashSet<usize> = self
+                    .on
+                    .iter()
+                    .filter_map(|(_, right_key)| {
+                        right_key
+                            .as_any()
+                            .downcast_ref::<Column>()
+                            .map(|c| c.index())
+                    })
+                    .collect();
+                for (output_idx, ci) in column_indices.iter().enumerate() {
+                    if ci.side == JoinSide::Right && right_key_indices.contains(&ci.index)
+                    {
+                        left_allowed.insert(output_idx);
+                    }
+                }
+            }
+            _ => {}
+        }
 
-        let mut bitmap_buffer = BooleanBufferBuilder::new(single_batch.num_rows());
-        bitmap_buffer.append_n(num_rows, false);
-        bitmap_buffer
-    } else {
-        BooleanBufferBuilder::new(0)
-    };
+        let left_child = if left_preserved {
+            ChildFilterDescription::from_child_with_allowed_indices(
+                &parent_filters,
+                left_allowed,
+                self.left(),
+            )?
+        } else {
+            ChildFilterDescription::all_unsupported(&parent_filters)
+        };
 
-    let left_values = on_left
-        .iter()
-        .map(|c| {
-            c.evaluate(&single_batch)?
-                .into_array(single_batch.num_rows())
-        })
-        .collect::<Result<Vec<_>>>()?;
-
-    let data = JoinLeftData::new(
-        hashmap,
-        single_batch,
-        left_values,
-        Mutex::new(visited_indices_bitmap),
-        AtomicUsize::new(probe_threads_count),
-        reservation,
-    );
+        let mut right_child = if right_preserved {
+            ChildFilterDescription::from_child_with_allowed_indices(
+                &parent_filters,
+                right_allowed,
+                self.right(),
+            )?
+        } else {
+            ChildFilterDescription::all_unsupported(&parent_filters)
+        };
 
-    Ok(data)
-}
+        // Add dynamic filters in Post phase if enabled
+        if phase == FilterPushdownPhase::Post
+            && self.allow_join_dynamic_filter_pushdown(config)
+        {
+            // Add actual dynamic filter to right side (probe side)
+            let dynamic_filter = Self::create_dynamic_filter(&self.on);
+            right_child = right_child.with_self_filter(dynamic_filter);
+        }
 
-/// Updates `hash_map` with new entries from `batch` evaluated against the expressions `on`
-/// using `offset` as a start value for `batch` row indices.
-///
-/// `fifo_hashmap` sets the order of iteration over `batch` rows while updating hashmap,
-/// which allows to keep either first (if set to true) or last (if set to false) row index
-/// as a chain head for rows with equal hash values.
-#[allow(clippy::too_many_arguments)]
-pub fn update_hash<T>(
-    on: &[PhysicalExprRef],
-    batch: &RecordBatch,
-    hash_map: &mut T,
-    offset: usize,
-    random_state: &RandomState,
-    hashes_buffer: &mut Vec<u64>,
-    deleted_offset: usize,
-    fifo_hashmap: bool,
-) -> Result<()>
-where
-    T: JoinHashMapType,
-{
-    // evaluate the keys
-    let keys_values = on
-        .iter()
-        .map(|c| c.evaluate(batch)?.into_array(batch.num_rows()))
-        .collect::<Result<Vec<_>>>()?;
-
-    // calculate the hash values
-    let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?;
-
-    // For usual JoinHashmap, the implementation is void.
-    hash_map.extend_zero(batch.num_rows());
-
-    // Updating JoinHashMap from hash values iterator
-    let hash_values_iter = hash_values
-        .iter()
-        .enumerate()
-        .map(|(i, val)| (i + offset, val));
-
-    if fifo_hashmap {
-        hash_map.update_from_iter(hash_values_iter.rev(), deleted_offset);
-    } else {
-        hash_map.update_from_iter(hash_values_iter, deleted_offset);
+        Ok(FilterDescription::new()
+            .with_child(left_child)
+            .with_child(right_child))
     }
 
-    Ok(())
-}
-
-/// Represents build-side of hash join.
-enum BuildSide {
-    /// Indicates that build-side not collected yet
-    Initial(BuildSideInitialState),
-    /// Indicates that build-side data has been collected
-    Ready(BuildSideReadyState),
-}
-
-/// Container for BuildSide::Initial related data
-struct BuildSideInitialState {
-    /// Future for building hash table from build-side input
-    left_fut: OnceFut<JoinLeftData>,
-}
-
-/// Container for BuildSide::Ready related data
-struct BuildSideReadyState {
-    /// Collected build-side data
-    left_data: Arc<JoinLeftData>,
-}
-
-impl BuildSide {
-    /// Tries to extract BuildSideInitialState from BuildSide enum.
-    /// Returns an error if state is not Initial.
-    fn try_as_initial_mut(&mut self) -> Result<&mut BuildSideInitialState> {
-        match self {
-            BuildSide::Initial(state) => Ok(state),
-            _ => internal_err!("Expected build side in initial state"),
+    fn handle_child_pushdown_result(
+        &self,
+        _phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        let mut result = FilterPushdownPropagation::if_any(child_pushdown_result.clone());
+        assert_eq!(child_pushdown_result.self_filters.len(), 2); // Should always be 2, we have 2 children
+        let right_child_self_filters = &child_pushdown_result.self_filters[1]; // We only push down filters to the right child
+        // We expect 0 or 1 self filters
+        if let Some(filter) = right_child_self_filters.first() {
+            // Note that we don't check PushdDownPredicate::discrimnant because even if nothing said
+            // "yes, I can fully evaluate this filter" things might still use it for statistics -> it's worth updating
+            let predicate = Arc::clone(&filter.predicate);
+            if let Ok(dynamic_filter) =
+                Arc::downcast::<DynamicFilterPhysicalExpr>(predicate)
+            {
+                // We successfully pushed down our self filter - we need to make a new node with the dynamic filter
+                let new_node = self
+                    .builder()
+                    .with_dynamic_filter(Some(HashJoinExecDynamicFilter {
+                        filter: dynamic_filter,
+                        build_accumulator: OnceLock::new(),
+                    }))
+                    .build_exec()?;
+                result = result.with_updated_node(new_node);
+            }
         }
+        Ok(result)
     }
 
-    /// Tries to extract BuildSideReadyState from BuildSide enum.
-    /// Returns an error if state is not Ready.
-    fn try_as_ready(&self) -> Result<&BuildSideReadyState> {
-        match self {
-            BuildSide::Ready(state) => Ok(state),
-            _ => internal_err!("Expected build side in ready state"),
-        }
+    fn supports_limit_pushdown(&self) -> bool {
+        // Hash join execution plan does not support pushing limit down through to children
+        // because the children don't know about the join condition and can't
+        // determine how many rows to produce
+        false
     }
 
-    /// Tries to extract BuildSideReadyState from BuildSide enum.
-    /// Returns an error if state is not Ready.
-    fn try_as_ready_mut(&mut self) -> Result<&mut BuildSideReadyState> {
-        match self {
-            BuildSide::Ready(state) => Ok(state),
-            _ => internal_err!("Expected build side in ready state"),
-        }
+    fn fetch(&self) -> Option<usize> {
+        self.fetch
+    }
+
+    fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
+        self.builder()
+            .with_fetch(limit)
+            .build()
+            .ok()
+            .map(|exec| Arc::new(exec) as _)
     }
 }
 
-/// Represents state of HashJoinStream
+/// Determines which sides of a join are "preserved" for filter pushdown.
 ///
-/// Expected state transitions performed by HashJoinStream are:
-///
-/// ```text
+/// A preserved side means filters on that side's columns can be safely pushed
+/// below the join. This mirrors the logic in the logical optimizer's
+/// `lr_is_preserved` in `datafusion/optimizer/src/push_down_filter.rs`.
+fn lr_is_preserved(join_type: JoinType) -> (bool, bool) {
+    match join_type {
+        JoinType::Inner => (true, true),
+        JoinType::Left => (true, false),
+        JoinType::Right => (false, true),
+        JoinType::Full => (false, false),
+        // Filters in semi/anti joins are either on the preserved side, or on join keys,
+        // as all output columns come from the preserved side. Join key filters can be
+        // safely pushed down into the other side.
+        JoinType::LeftSemi | JoinType::LeftAnti => (true, true),
+        JoinType::RightSemi | JoinType::RightAnti => (true, true),
+        JoinType::LeftMark => (true, false),
+        JoinType::RightMark => (false, true),
+    }
+}
+
+/// Accumulator for collecting min/max bounds from build-side data during hash join.
 ///
-///       WaitBuildSide
-///             │
-///             ▼
-///  ┌─► FetchProbeBatch ───► ExhaustedProbeSide ───► Completed
-///  │          │
-///  │          ▼
-///  └─ ProcessProbeBatch
+/// This struct encapsulates the logic for progressively computing column bounds
+/// (minimum and maximum values) for a specific join key expression as batches
+/// are processed during the build phase of a hash join.
 ///
-/// ```
-#[derive(Debug, Clone)]
-enum HashJoinStreamState {
-    /// Initial state for HashJoinStream indicating that build-side data not collected yet
-    WaitBuildSide,
-    /// Indicates that build-side has been collected, and stream is ready for fetching probe-side
-    FetchProbeBatch,
-    /// Indicates that non-empty batch has been fetched from probe-side, and is ready to be processed
-    ProcessProbeBatch(ProcessProbeBatchState),
-    /// Indicates that probe-side has been fully processed
-    ExhaustedProbeSide,
-    /// Indicates that HashJoinStream execution is completed
-    Completed,
+/// The bounds are used for dynamic filter pushdown optimization, where filters
+/// based on the actual data ranges can be pushed down to the probe side to
+/// eliminate unnecessary data early.
+struct CollectLeftAccumulator {
+    /// The physical expression to evaluate for each batch
+    expr: Arc<dyn PhysicalExpr>,
+    /// Accumulator for tracking the minimum value across all batches
+    min: MinAccumulator,
+    /// Accumulator for tracking the maximum value across all batches
+    max: MaxAccumulator,
 }
 
-impl HashJoinStreamState {
-    /// Tries to extract ProcessProbeBatchState from HashJoinStreamState enum.
-    /// Returns an error if state is not ProcessProbeBatchState.
-    fn try_as_process_probe_batch_mut(&mut self) -> Result<&mut ProcessProbeBatchState> {
-        match self {
-            HashJoinStreamState::ProcessProbeBatch(state) => Ok(state),
-            _ => internal_err!("Expected hash join stream in ProcessProbeBatch state"),
+impl CollectLeftAccumulator {
+    /// Creates a new accumulator for tracking bounds of a join key expression.
+    ///
+    /// # Arguments
+    /// * `expr` - The physical expression to track bounds for
+    /// * `schema` - The schema of the input data
+    ///
+    /// # Returns
+    /// A new `CollectLeftAccumulator` instance configured for the expression's data type
+    fn try_new(expr: Arc<dyn PhysicalExpr>, schema: &SchemaRef) -> Result<Self> {
+        /// Recursively unwraps dictionary types to get the underlying value type.
+        fn dictionary_value_type(data_type: &DataType) -> DataType {
+            match data_type {
+                DataType::Dictionary(_, value_type) => {
+                    dictionary_value_type(value_type.as_ref())
+                }
+                _ => data_type.clone(),
+            }
         }
+
+        let data_type = expr
+            .data_type(schema)
+            // Min/Max can operate on dictionary data but expect to be initialized with the underlying value type
+            .map(|dt| dictionary_value_type(&dt))?;
+        Ok(Self {
+            expr,
+            min: MinAccumulator::try_new(&data_type)?,
+            max: MaxAccumulator::try_new(&data_type)?,
+        })
     }
-}
 
-/// Container for HashJoinStreamState::ProcessProbeBatch related data
-#[derive(Debug, Clone)]
-struct ProcessProbeBatchState {
-    /// Current probe-side batch
-    batch: RecordBatch,
-    /// Probe-side on expressions values
-    values: Vec<ArrayRef>,
-    /// Starting offset for JoinHashMap lookups
-    offset: JoinHashMapOffset,
-    /// Max joined probe-side index from current batch
-    joined_probe_idx: Option<usize>,
-}
+    /// Updates the accumulators with values from a new batch.
+    ///
+    /// Evaluates the expression on the batch and updates both min and max
+    /// accumulators with the resulting values.
+    ///
+    /// # Arguments
+    /// * `batch` - The record batch to process
+    ///
+    /// # Returns
+    /// Ok(()) if the update succeeds, or an error if expression evaluation fails
+    fn update_batch(&mut self, batch: &RecordBatch) -> Result<()> {
+        let array = self.expr.evaluate(batch)?.into_array(batch.num_rows())?;
+        self.min.update_batch(std::slice::from_ref(&array))?;
+        self.max.update_batch(std::slice::from_ref(&array))?;
+        Ok(())
+    }
 
-impl ProcessProbeBatchState {
-    fn advance(&mut self, offset: JoinHashMapOffset, joined_probe_idx: Option<usize>) {
-        self.offset = offset;
-        if joined_probe_idx.is_some() {
-            self.joined_probe_idx = joined_probe_idx;
-        }
+    /// Finalizes the accumulation and returns the computed bounds.
+    ///
+    /// Consumes self to extract the final min and max values from the accumulators.
+    ///
+    /// # Returns
+    /// The `ColumnBounds` containing the minimum and maximum values observed
+    fn evaluate(mut self) -> Result<ColumnBounds> {
+        Ok(ColumnBounds::new(
+            self.min.evaluate()?,
+            self.max.evaluate()?,
+        ))
     }
 }
 
-/// [`Stream`] for [`HashJoinExec`] that does the actual join.
-///
-/// This stream:
-///
-/// 1. Reads the entire left input (build) and constructs a hash table
-///
-/// 2. Streams [RecordBatch]es as they arrive from the right input (probe) and joins
-///    them with the contents of the hash table
-struct HashJoinStream {
-    /// Input schema
-    schema: Arc<Schema>,
-    /// equijoin columns from the right (probe side)
-    on_right: Vec<PhysicalExprRef>,
-    /// optional join filter
-    filter: Option<JoinFilter>,
-    /// type of the join (left, right, semi, etc)
-    join_type: JoinType,
-    /// right (probe) input
-    right: SendableRecordBatchStream,
-    /// Random state used for hashing initialization
-    random_state: RandomState,
-    /// Metrics
-    join_metrics: BuildProbeJoinMetrics,
-    /// Information of index and left / right placement of columns
-    column_indices: Vec<ColumnIndex>,
-    /// If null_equals_null is true, null == null else null != null
-    null_equals_null: bool,
-    /// State of the stream
-    state: HashJoinStreamState,
-    /// Build side
-    build_side: BuildSide,
-    /// Maximum output batch size
-    batch_size: usize,
-    /// Scratch space for computing hashes
-    hashes_buffer: Vec<u64>,
-    /// Specifies whether the right side has an ordering to potentially preserve
-    right_side_ordered: bool,
+/// State for collecting the build-side data during hash join
+struct BuildSideState {
+    batches: Vec<RecordBatch>,
+    num_rows: usize,
+    metrics: BuildProbeJoinMetrics,
+    reservation: MemoryReservation,
+    bounds_accumulators: Option<Vec<CollectLeftAccumulator>>,
+}
+
+impl BuildSideState {
+    /// Create a new BuildSideState with optional accumulators for bounds computation
+    fn try_new(
+        metrics: BuildProbeJoinMetrics,
+        reservation: MemoryReservation,
+        on_left: Vec<Arc<dyn PhysicalExpr>>,
+        schema: &SchemaRef,
+        should_compute_dynamic_filters: bool,
+    ) -> Result<Self> {
+        Ok(Self {
+            batches: Vec::new(),
+            num_rows: 0,
+            metrics,
+            reservation,
+            bounds_accumulators: should_compute_dynamic_filters
+                .then(|| {
+                    on_left
+                        .into_iter()
+                        .map(|expr| CollectLeftAccumulator::try_new(expr, schema))
+                        .collect::<Result<Vec<_>>>()
+                })
+                .transpose()?,
+        })
+    }
 }
 
-impl RecordBatchStream for HashJoinStream {
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
+fn should_collect_min_max_for_perfect_hash(
+    on_left: &[PhysicalExprRef],
+    schema: &SchemaRef,
+) -> Result<bool> {
+    if on_left.len() != 1 {
+        return Ok(false);
     }
+
+    let expr = &on_left[0];
+    let data_type = expr.data_type(schema)?;
+    Ok(ArrayMap::is_supported_type(&data_type))
 }
 
-/// Executes lookups by hash against JoinHashMap and resolves potential
-/// hash collisions.
-/// Returns build/probe indices satisfying the equality condition, along with
-/// (optional) starting point for next iteration.
+/// Collects all batches from the left (build) side stream and creates a hash map for joining.
 ///
-/// # Example
+/// This function is responsible for:
+/// 1. Consuming the entire left stream and collecting all batches into memory
+/// 2. Building a hash map from the join key columns for efficient probe operations
+/// 3. Computing bounds for dynamic filter pushdown (if enabled)
+/// 4. Preparing visited indices bitmap for certain join types
 ///
-/// For `LEFT.b1 = RIGHT.b2`:
-/// LEFT (build) Table:
-/// ```text
-///  a1  b1  c1
-///  1   1   10
-///  3   3   30
-///  5   5   50
-///  7   7   70
-///  9   8   90
-///  11  8   110
-///  13   10  130
-/// ```
-///
-/// RIGHT (probe) Table:
-/// ```text
-///  a2   b2  c2
-///  2    2   20
-///  4    4   40
-///  6    6   60
-///  8    8   80
-/// 10   10  100
-/// 12   10  120
-/// ```
+/// # Parameters
+/// * `random_state` - Random state for consistent hashing across partitions
+/// * `left_stream` - Stream of record batches from the build side
+/// * `on_left` - Physical expressions for the left side join keys
+/// * `metrics` - Metrics collector for tracking memory usage and row counts
+/// * `reservation` - Memory reservation tracker for the hash table and data
+/// * `with_visited_indices_bitmap` - Whether to track visited indices (for outer joins)
+/// * `probe_threads_count` - Number of threads that will probe this hash table
+/// * `should_compute_dynamic_filters` - Whether to compute min/max bounds for dynamic filtering
 ///
-/// The result is
-/// ```text
-/// "+----+----+-----+----+----+-----+",
-/// "| a1 | b1 | c1  | a2 | b2 | c2  |",
-/// "+----+----+-----+----+----+-----+",
-/// "| 9  | 8  | 90  | 8  | 8  | 80  |",
-/// "| 11 | 8  | 110 | 8  | 8  | 80  |",
-/// "| 13 | 10 | 130 | 10 | 10 | 100 |",
-/// "| 13 | 10 | 130 | 12 | 10 | 120 |",
-/// "+----+----+-----+----+----+-----+"
-/// ```
+/// # Dynamic Filter Coordination
+/// When `should_compute_dynamic_filters` is true, this function computes the min/max bounds
+/// for each join key column but does NOT update the dynamic filter. Instead, the
+/// bounds are stored in the returned `JoinLeftData` and later coordinated by
+/// `SharedBuildAccumulator` to ensure all partitions contribute their bounds
+/// before updating the filter exactly once.
 ///
-/// And the result of build and probe indices are:
-/// ```text
-/// Build indices: 4, 5, 6, 6
-/// Probe indices: 3, 3, 4, 5
-/// ```
-#[allow(clippy::too_many_arguments)]
-fn lookup_join_hashmap(
-    build_hashmap: &JoinHashMap,
-    build_side_values: &[ArrayRef],
-    probe_side_values: &[ArrayRef],
-    null_equals_null: bool,
-    hashes_buffer: &[u64],
-    limit: usize,
-    offset: JoinHashMapOffset,
-) -> Result<(UInt64Array, UInt32Array, Option<JoinHashMapOffset>)> {
-    let (probe_indices, build_indices, next_offset) =
-        build_hashmap.get_matched_indices_with_limit_offset(hashes_buffer, limit, offset);
-
-    let build_indices: UInt64Array = build_indices.into();
-    let probe_indices: UInt32Array = probe_indices.into();
-
-    let (build_indices, probe_indices) = equal_rows_arr(
-        &build_indices,
-        &probe_indices,
-        build_side_values,
-        probe_side_values,
-        null_equals_null,
-    )?;
-
-    Ok((build_indices, probe_indices, next_offset))
-}
-
-// version of eq_dyn supporting equality on null arrays
-fn eq_dyn_null(
-    left: &dyn Array,
-    right: &dyn Array,
-    null_equals_null: bool,
-) -> Result<BooleanArray, ArrowError> {
-    // Nested datatypes cannot use the underlying not_distinct/eq function and must use a special
-    // implementation
-    // <https://github.com/apache/datafusion/issues/10749>
-    if left.data_type().is_nested() {
-        let op = if null_equals_null {
-            Operator::IsNotDistinctFrom
-        } else {
-            Operator::Eq
-        };
-        return Ok(compare_op_for_nested(op, &left, &right)?);
-    }
-    match (left.data_type(), right.data_type()) {
-        _ if null_equals_null => not_distinct(&left, &right),
-        _ => eq(&left, &right),
-    }
-}
+/// # Returns
+/// `JoinLeftData` containing the hash map, consolidated batch, join key values,
+/// visited indices bitmap, and computed bounds (if requested).
+#[expect(clippy::too_many_arguments)]
+async fn collect_left_input(
+    random_state: RandomState,
+    left_stream: SendableRecordBatchStream,
+    on_left: Vec<PhysicalExprRef>,
+    metrics: BuildProbeJoinMetrics,
+    reservation: MemoryReservation,
+    with_visited_indices_bitmap: bool,
+    probe_threads_count: usize,
+    should_compute_dynamic_filters: bool,
+    config: Arc<ConfigOptions>,
+    null_equality: NullEquality,
+    array_map_created_count: Count,
+) -> Result<JoinLeftData> {
+    let schema = left_stream.schema();
 
-pub fn equal_rows_arr(
-    indices_left: &UInt64Array,
-    indices_right: &UInt32Array,
-    left_arrays: &[ArrayRef],
-    right_arrays: &[ArrayRef],
-    null_equals_null: bool,
-) -> Result<(UInt64Array, UInt32Array)> {
-    let mut iter = left_arrays.iter().zip(right_arrays.iter());
-
-    let (first_left, first_right) = iter.next().ok_or_else(|| {
-        DataFusionError::Internal(
-            "At least one array should be provided for both left and right".to_string(),
-        )
-    })?;
+    let should_collect_min_max_for_phj =
+        should_collect_min_max_for_perfect_hash(&on_left, &schema)?;
 
-    let arr_left = take(first_left.as_ref(), indices_left, None)?;
-    let arr_right = take(first_right.as_ref(), indices_right, None)?;
+    let initial = BuildSideState::try_new(
+        metrics,
+        reservation,
+        on_left.clone(),
+        &schema,
+        should_compute_dynamic_filters || should_collect_min_max_for_phj,
+    )?;
 
-    let mut equal: BooleanArray = eq_dyn_null(&arr_left, &arr_right, null_equals_null)?;
+    let state = left_stream
+        .try_fold(initial, |mut state, batch| async move {
+            // Update accumulators if computing bounds
+            if let Some(ref mut accumulators) = state.bounds_accumulators {
+                for accumulator in accumulators {
+                    accumulator.update_batch(&batch)?;
+                }
+            }
 
-    // Use map and try_fold to iterate over the remaining pairs of arrays.
-    // In each iteration, take is used on the pair of arrays and their equality is determined.
-    // The results are then folded (combined) using the and function to get a final equality result.
-    equal = iter
-        .map(|(left, right)| {
-            let arr_left = take(left.as_ref(), indices_left, None)?;
-            let arr_right = take(right.as_ref(), indices_right, None)?;
-            eq_dyn_null(arr_left.as_ref(), arr_right.as_ref(), null_equals_null)
+            // Decide if we spill or not
+            let batch_size = get_record_batch_memory_size(&batch);
+            // Reserve memory for incoming batch
+            state.reservation.try_grow(batch_size)?;
+            // Update metrics
+            state.metrics.build_mem_used.add(batch_size);
+            state.metrics.build_input_batches.add(1);
+            state.metrics.build_input_rows.add(batch.num_rows());
+            // Update row count
+            state.num_rows += batch.num_rows();
+            // Push batch to output
+            state.batches.push(batch);
+            Ok(state)
         })
-        .try_fold(equal, |acc, equal2| and(&acc, &equal2?))?;
-
-    let filter_builder = FilterBuilder::new(&equal).optimize().build();
+        .await?;
 
-    let left_filtered = filter_builder.filter(indices_left)?;
-    let right_filtered = filter_builder.filter(indices_right)?;
+    // Extract fields from state
+    let BuildSideState {
+        batches,
+        num_rows,
+        metrics,
+        mut reservation,
+        bounds_accumulators,
+    } = state;
+
+    // Compute bounds
+    let mut bounds = match bounds_accumulators {
+        Some(accumulators) if num_rows > 0 => {
+            let bounds = accumulators
+                .into_iter()
+                .map(CollectLeftAccumulator::evaluate)
+                .collect::<Result<Vec<_>>>()?;
+            Some(PartitionBounds::new(bounds))
+        }
+        _ => None,
+    };
 
-    Ok((
-        downcast_array(left_filtered.as_ref()),
-        downcast_array(right_filtered.as_ref()),
-    ))
-}
+    let (join_hash_map, batch, left_values) =
+        if let Some((array_map, batch, left_value)) = try_create_array_map(
+            &bounds,
+            &schema,
+            &batches,
+            &on_left,
+            &mut reservation,
+            config.execution.perfect_hash_join_small_build_threshold,
+            config.execution.perfect_hash_join_min_key_density,
+            null_equality,
+        )? {
+            array_map_created_count.add(1);
+            metrics.build_mem_used.add(array_map.size());
 
-impl HashJoinStream {
-    /// Separate implementation function that unpins the [`HashJoinStream`] so
-    /// that partial borrows work correctly
-    fn poll_next_impl(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Option<Result<RecordBatch>>> {
-        loop {
-            return match self.state {
-                HashJoinStreamState::WaitBuildSide => {
-                    handle_state!(ready!(self.collect_build_side(cx)))
-                }
-                HashJoinStreamState::FetchProbeBatch => {
-                    handle_state!(ready!(self.fetch_probe_batch(cx)))
-                }
-                HashJoinStreamState::ProcessProbeBatch(_) => {
-                    handle_state!(self.process_probe_batch())
-                }
-                HashJoinStreamState::ExhaustedProbeSide => {
-                    handle_state!(self.process_unmatched_build_batch())
-                }
-                HashJoinStreamState::Completed => Poll::Ready(None),
+            (Map::ArrayMap(array_map), batch, left_value)
+        } else {
+            // Estimation of memory size, required for hashtable, prior to allocation.
+            // Final result can be verified using `RawTable.allocation_info()`
+            let fixed_size_u32 = size_of::<JoinHashMapU32>();
+            let fixed_size_u64 = size_of::<JoinHashMapU64>();
+
+            // Use `u32` indices for the JoinHashMap when num_rows ≤ u32::MAX, otherwise use the
+            // `u64` indice variant
+            // Arc is used instead of Box to allow sharing with SharedBuildAccumulator for hash map pushdown
+            let mut hashmap: Box<dyn JoinHashMapType> = if num_rows > u32::MAX as usize {
+                let estimated_hashtable_size =
+                    estimate_memory_size::<(u64, u64)>(num_rows, fixed_size_u64)?;
+                reservation.try_grow(estimated_hashtable_size)?;
+                metrics.build_mem_used.add(estimated_hashtable_size);
+                Box::new(JoinHashMapU64::with_capacity(num_rows))
+            } else {
+                let estimated_hashtable_size =
+                    estimate_memory_size::<(u32, u64)>(num_rows, fixed_size_u32)?;
+                reservation.try_grow(estimated_hashtable_size)?;
+                metrics.build_mem_used.add(estimated_hashtable_size);
+                Box::new(JoinHashMapU32::with_capacity(num_rows))
             };
-        }
-    }
 
-    /// Collects build-side data by polling `OnceFut` future from initialized build-side
-    ///
-    /// Updates build-side to `Ready`, and state to `FetchProbeSide`
-    fn collect_build_side(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
-        let build_timer = self.join_metrics.build_time.timer();
-        // build hash table from left (build) side, if not yet done
-        let left_data = ready!(self
-            .build_side
-            .try_as_initial_mut()?
-            .left_fut
-            .get_shared(cx))?;
-        build_timer.done();
-
-        self.state = HashJoinStreamState::FetchProbeBatch;
-        self.build_side = BuildSide::Ready(BuildSideReadyState { left_data });
-
-        Poll::Ready(Ok(StatefulStreamResult::Continue))
-    }
-
-    /// Fetches next batch from probe-side
-    ///
-    /// If non-empty batch has been fetched, updates state to `ProcessProbeBatchState`,
-    /// otherwise updates state to `ExhaustedProbeSide`
-    fn fetch_probe_batch(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
-        match ready!(self.right.poll_next_unpin(cx)) {
-            None => {
-                self.state = HashJoinStreamState::ExhaustedProbeSide;
-            }
-            Some(Ok(batch)) => {
-                // Precalculate hash values for fetched batch
-                let keys_values = self
-                    .on_right
-                    .iter()
-                    .map(|c| c.evaluate(&batch)?.into_array(batch.num_rows()))
-                    .collect::<Result<Vec<_>>>()?;
-
-                self.hashes_buffer.clear();
-                self.hashes_buffer.resize(batch.num_rows(), 0);
-                create_hashes(&keys_values, &self.random_state, &mut self.hashes_buffer)?;
-
-                self.join_metrics.input_batches.add(1);
-                self.join_metrics.input_rows.add(batch.num_rows());
-
-                self.state =
-                    HashJoinStreamState::ProcessProbeBatch(ProcessProbeBatchState {
-                        batch,
-                        values: keys_values,
-                        offset: (0, None),
-                        joined_probe_idx: None,
-                    });
+            let mut hashes_buffer = Vec::new();
+            let mut offset = 0;
+
+            let batches_iter = batches.iter().rev();
+
+            // Updating hashmap starting from the last batch
+            for batch in batches_iter.clone() {
+                hashes_buffer.clear();
+                hashes_buffer.resize(batch.num_rows(), 0);
+                update_hash(
+                    &on_left,
+                    batch,
+                    &mut *hashmap,
+                    offset,
+                    &random_state,
+                    &mut hashes_buffer,
+                    0,
+                    true,
+                )?;
+                offset += batch.num_rows();
             }
-            Some(Err(err)) => return Poll::Ready(Err(err)),
-        };
 
-        Poll::Ready(Ok(StatefulStreamResult::Continue))
-    }
+            // Merge all batches into a single batch, so we can directly index into the arrays
+            let batch = concat_batches(&schema, batches_iter.clone())?;
 
-    /// Joins current probe batch with build-side data and produces batch with matched output
-    ///
-    /// Updates state to `FetchProbeBatch`
-    fn process_probe_batch(
-        &mut self,
-    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
-        let state = self.state.try_as_process_probe_batch_mut()?;
-        let build_side = self.build_side.try_as_ready_mut()?;
-
-        let timer = self.join_metrics.join_time.timer();
-
-        // get the matched by join keys indices
-        let (left_indices, right_indices, next_offset) = lookup_join_hashmap(
-            build_side.left_data.hash_map(),
-            build_side.left_data.values(),
-            &state.values,
-            self.null_equals_null,
-            &self.hashes_buffer,
-            self.batch_size,
-            state.offset,
-        )?;
+            let left_values = evaluate_expressions_to_arrays(&on_left, &batch)?;
 
-        // apply join filter if exists
-        let (left_indices, right_indices) = if let Some(filter) = &self.filter {
-            apply_join_filter_to_indices(
-                build_side.left_data.batch(),
-                &state.batch,
-                left_indices,
-                right_indices,
-                filter,
-                JoinSide::Left,
-            )?
-        } else {
-            (left_indices, right_indices)
+            (Map::HashMap(hashmap), batch, left_values)
         };
 
-        // mark joined left-side indices as visited, if required by join type
-        if need_produce_result_in_final(self.join_type) {
-            let mut bitmap = build_side.left_data.visited_indices_bitmap().lock();
-            left_indices.iter().flatten().for_each(|x| {
-                bitmap.set_bit(x as usize, true);
-            });
-        }
+    // Reserve additional memory for visited indices bitmap and create shared builder
+    let visited_indices_bitmap = if with_visited_indices_bitmap {
+        let bitmap_size = bit_util::ceil(batch.num_rows(), 8);
+        reservation.try_grow(bitmap_size)?;
+        metrics.build_mem_used.add(bitmap_size);
 
-        // The goals of index alignment for different join types are:
-        //
-        // 1) Right & FullJoin -- to append all missing probe-side indices between
-        //    previous (excluding) and current joined indices.
-        // 2) SemiJoin -- deduplicate probe indices in range between previous
-        //    (excluding) and current joined indices.
-        // 3) AntiJoin -- return only missing indices in range between
-        //    previous and current joined indices.
-        //    Inclusion/exclusion of the indices themselves don't matter
-        //
-        // As a summary -- alignment range can be produced based only on
-        // joined (matched with filters applied) probe side indices, excluding starting one
-        // (left from previous iteration).
-
-        // if any rows have been joined -- get last joined probe-side (right) row
-        // it's important that index counts as "joined" after hash collisions checks
-        // and join filters applied.
-        let last_joined_right_idx = match right_indices.len() {
-            0 => None,
-            n => Some(right_indices.value(n - 1) as usize),
-        };
+        let mut bitmap_buffer = BooleanBufferBuilder::new(batch.num_rows());
+        bitmap_buffer.append_n(num_rows, false);
+        bitmap_buffer
+    } else {
+        BooleanBufferBuilder::new(0)
+    };
 
-        // Calculate range and perform alignment.
-        // In case probe batch has been processed -- align all remaining rows.
-        let index_alignment_range_start = state.joined_probe_idx.map_or(0, |v| v + 1);
-        let index_alignment_range_end = if next_offset.is_none() {
-            state.batch.num_rows()
+    let map = Arc::new(join_hash_map);
+
+    let membership = if num_rows == 0 {
+        PushdownStrategy::Empty
+    } else {
+        // If the build side is small enough we can use IN list pushdown.
+        // If it's too big we fall back to pushing down a reference to the hash table.
+        // See `PushdownStrategy` for more details.
+        let estimated_size = left_values
+            .iter()
+            .map(|arr| arr.get_array_memory_size())
+            .sum::<usize>();
+        if left_values.is_empty()
+            || left_values[0].is_empty()
+            || estimated_size > config.optimizer.hash_join_inlist_pushdown_max_size
+            || map.num_of_distinct_key()
+                > config
+                    .optimizer
+                    .hash_join_inlist_pushdown_max_distinct_values
+        {
+            PushdownStrategy::Map(Arc::clone(&map))
+        } else if let Some(in_list_values) = build_struct_inlist_values(&left_values)? {
+            PushdownStrategy::InList(in_list_values)
         } else {
-            last_joined_right_idx.map_or(0, |v| v + 1)
-        };
+            PushdownStrategy::Map(Arc::clone(&map))
+        }
+    };
 
-        let (left_indices, right_indices) = adjust_indices_by_join_type(
-            left_indices,
-            right_indices,
-            index_alignment_range_start..index_alignment_range_end,
-            self.join_type,
-            self.right_side_ordered,
-        )?;
-
-        let result = build_batch_from_indices(
-            &self.schema,
-            build_side.left_data.batch(),
-            &state.batch,
-            &left_indices,
-            &right_indices,
-            &self.column_indices,
-            JoinSide::Left,
-        )?;
-
-        self.join_metrics.output_batches.add(1);
-        self.join_metrics.output_rows.add(result.num_rows());
-        timer.done();
-
-        if next_offset.is_none() {
-            self.state = HashJoinStreamState::FetchProbeBatch;
-        } else {
-            state.advance(
-                next_offset
-                    .ok_or_else(|| internal_datafusion_err!("unexpected None offset"))?,
-                last_joined_right_idx,
-            )
-        };
-
-        Ok(StatefulStreamResult::Ready(Some(result)))
+    if should_collect_min_max_for_phj && !should_compute_dynamic_filters {
+        bounds = None;
     }
 
-    /// Processes unmatched build-side rows for certain join types and produces output batch
-    ///
-    /// Updates state to `Completed`
-    fn process_unmatched_build_batch(
-        &mut self,
-    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
-        let timer = self.join_metrics.join_time.timer();
-
-        if !need_produce_result_in_final(self.join_type) {
-            self.state = HashJoinStreamState::Completed;
-            return Ok(StatefulStreamResult::Continue);
-        }
-
-        let build_side = self.build_side.try_as_ready()?;
-        if !build_side.left_data.report_probe_completed() {
-            self.state = HashJoinStreamState::Completed;
-            return Ok(StatefulStreamResult::Continue);
-        }
-
-        // use the global left bitmap to produce the left indices and right indices
-        let (left_side, right_side) = get_final_indices_from_shared_bitmap(
-            build_side.left_data.visited_indices_bitmap(),
-            self.join_type,
-        );
-        let empty_right_batch = RecordBatch::new_empty(self.right.schema());
-        // use the left and right indices to produce the batch result
-        let result = build_batch_from_indices(
-            &self.schema,
-            build_side.left_data.batch(),
-            &empty_right_batch,
-            &left_side,
-            &right_side,
-            &self.column_indices,
-            JoinSide::Left,
-        );
-
-        if let Ok(ref batch) = result {
-            self.join_metrics.input_batches.add(1);
-            self.join_metrics.input_rows.add(batch.num_rows());
-
-            self.join_metrics.output_batches.add(1);
-            self.join_metrics.output_rows.add(batch.num_rows());
-        }
-        timer.done();
-
-        self.state = HashJoinStreamState::Completed;
+    let data = JoinLeftData {
+        map,
+        batch,
+        values: left_values,
+        visited_indices_bitmap: Mutex::new(visited_indices_bitmap),
+        probe_threads_counter: AtomicUsize::new(probe_threads_count),
+        _reservation: reservation,
+        bounds,
+        membership,
+        probe_side_non_empty: AtomicBool::new(false),
+        probe_side_has_null: AtomicBool::new(false),
+    };
 
-        Ok(StatefulStreamResult::Ready(Some(result?)))
-    }
+    Ok(data)
 }
 
-impl Stream for HashJoinStream {
-    type Item = Result<RecordBatch>;
+#[cfg(test)]
+mod tests {
+    use super::*;
 
-    fn poll_next(
-        mut self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Option<Self::Item>> {
-        self.poll_next_impl(cx)
+    fn assert_phj_used(metrics: &MetricsSet, use_phj: bool) {
+        if use_phj {
+            assert!(
+                metrics
+                    .sum_by_name(ARRAY_MAP_CREATED_COUNT_METRIC_NAME)
+                    .expect("should have array_map_created_count metrics")
+                    .as_usize()
+                    >= 1
+            );
+        } else {
+            assert_eq!(
+                metrics
+                    .sum_by_name(ARRAY_MAP_CREATED_COUNT_METRIC_NAME)
+                    .map(|v| v.as_usize())
+                    .unwrap_or(0),
+                0
+            )
+        }
     }
-}
 
-impl EmbeddedProjection for HashJoinExec {
-    fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
-        self.with_projection(projection)
+    fn build_schema_and_on() -> Result<(SchemaRef, SchemaRef, JoinOn)> {
+        let left_schema = Arc::new(Schema::new(vec![
+            Field::new("a1", DataType::Int32, true),
+            Field::new("b1", DataType::Int32, true),
+        ]));
+        let right_schema = Arc::new(Schema::new(vec![
+            Field::new("a2", DataType::Int32, true),
+            Field::new("b1", DataType::Int32, true),
+        ]));
+        let on = vec![(
+            Arc::new(Column::new_with_schema("b1", &left_schema)?) as _,
+            Arc::new(Column::new_with_schema("b1", &right_schema)?) as _,
+        )];
+        Ok((left_schema, right_schema, on))
     }
-}
 
-#[cfg(test)]
-mod tests {
-    use super::*;
     use crate::coalesce_partitions::CoalescePartitionsExec;
-    use crate::test::TestMemoryExec;
+    use crate::joins::hash_join::stream::lookup_join_hashmap;
+    use crate::test::{TestMemoryExec, assert_join_metrics};
     use crate::{
         common, expressions::Column, repartition::RepartitionExec, test::build_table_i32,
         test::exec::MockExec,
     };
 
-    use arrow::array::{Date32Array, Int32Array, StructArray};
+    use arrow::array::{
+        Date32Array, Int32Array, Int64Array, StructArray, UInt32Array, UInt64Array,
+    };
     use arrow::buffer::NullBuffer;
     use arrow::datatypes::{DataType, Field};
+    use arrow_schema::Schema;
+    use datafusion_common::hash_utils::create_hashes;
     use datafusion_common::test_util::{batches_to_sort_string, batches_to_string};
     use datafusion_common::{
-        assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err,
-        ScalarValue,
+        ScalarValue, assert_batches_eq, assert_batches_sorted_eq, assert_contains,
+        exec_err, internal_err,
     };
     use datafusion_execution::config::SessionConfig;
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
     use datafusion_expr::Operator;
-    use datafusion_physical_expr::expressions::{BinaryExpr, Literal};
     use datafusion_physical_expr::PhysicalExpr;
+    use datafusion_physical_expr::expressions::{BinaryExpr, Literal};
     use hashbrown::HashTable;
     use insta::{allow_duplicates, assert_snapshot};
     use rstest::*;
@@ -1690,10 +2170,37 @@ mod tests {
 
     #[template]
     #[rstest]
-    fn batch_sizes(#[values(8192, 10, 5, 2, 1)] batch_size: usize) {}
+    fn hash_join_exec_configs(
+        #[values(8192, 10, 5, 2, 1)] batch_size: usize,
+        #[values(true, false)] use_perfect_hash_join_as_possible: bool,
+    ) {
+    }
 
-    fn prepare_task_ctx(batch_size: usize) -> Arc<TaskContext> {
-        let session_config = SessionConfig::default().with_batch_size(batch_size);
+    fn prepare_task_ctx(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Arc<TaskContext> {
+        let mut session_config = SessionConfig::default().with_batch_size(batch_size);
+
+        if use_perfect_hash_join_as_possible {
+            session_config
+                .options_mut()
+                .execution
+                .perfect_hash_join_small_build_threshold = 819200;
+            session_config
+                .options_mut()
+                .execution
+                .perfect_hash_join_min_key_density = 0.0;
+        } else {
+            session_config
+                .options_mut()
+                .execution
+                .perfect_hash_join_small_build_threshold = 0;
+            session_config
+                .options_mut()
+                .execution
+                .perfect_hash_join_min_key_density = f64::INFINITY;
+        }
         Arc::new(TaskContext::default().with_session_config(session_config))
     }
 
@@ -1707,12 +2214,32 @@ mod tests {
         TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
     }
 
+    /// Build a table with two columns supporting nullable values
+    fn build_table_two_cols(
+        a: (&str, &Vec<Option<i32>>),
+        b: (&str, &Vec<Option<i32>>),
+    ) -> Arc<dyn ExecutionPlan> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(a.0, DataType::Int32, true),
+            Field::new(b.0, DataType::Int32, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(a.1.clone())),
+                Arc::new(Int32Array::from(b.1.clone())),
+            ],
+        )
+        .unwrap();
+        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+    }
+
     fn join(
         left: Arc<dyn ExecutionPlan>,
         right: Arc<dyn ExecutionPlan>,
         on: JoinOn,
         join_type: &JoinType,
-        null_equals_null: bool,
+        null_equality: NullEquality,
     ) -> Result<HashJoinExec> {
         HashJoinExec::try_new(
             left,
@@ -1722,7 +2249,8 @@ mod tests {
             join_type,
             None,
             PartitionMode::CollectLeft,
-            null_equals_null,
+            null_equality,
+            false,
         )
     }
 
@@ -1732,7 +2260,7 @@ mod tests {
         on: JoinOn,
         filter: JoinFilter,
         join_type: &JoinType,
-        null_equals_null: bool,
+        null_equality: NullEquality,
     ) -> Result<HashJoinExec> {
         HashJoinExec::try_new(
             left,
@@ -1742,7 +2270,8 @@ mod tests {
             join_type,
             None,
             PartitionMode::CollectLeft,
-            null_equals_null,
+            null_equality,
+            false,
         )
     }
 
@@ -1751,16 +2280,17 @@ mod tests {
         right: Arc<dyn ExecutionPlan>,
         on: JoinOn,
         join_type: &JoinType,
-        null_equals_null: bool,
+        null_equality: NullEquality,
         context: Arc<TaskContext>,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
-        let join = join(left, right, on, join_type, null_equals_null)?;
+    ) -> Result<(Vec<String>, Vec<RecordBatch>, MetricsSet)> {
+        let join = join(left, right, on, join_type, null_equality)?;
         let columns_header = columns(&join.schema());
 
         let stream = join.execute(0, context)?;
         let batches = common::collect(stream).await?;
+        let metrics = join.metrics().unwrap();
 
-        Ok((columns_header, batches))
+        Ok((columns_header, batches, metrics))
     }
 
     async fn partitioned_join_collect(
@@ -1768,16 +2298,16 @@ mod tests {
         right: Arc<dyn ExecutionPlan>,
         on: JoinOn,
         join_type: &JoinType,
-        null_equals_null: bool,
+        null_equality: NullEquality,
         context: Arc<TaskContext>,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    ) -> Result<(Vec<String>, Vec<RecordBatch>, MetricsSet)> {
         join_collect_with_partition_mode(
             left,
             right,
             on,
             join_type,
             PartitionMode::Partitioned,
-            null_equals_null,
+            null_equality,
             context,
         )
         .await
@@ -1789,9 +2319,9 @@ mod tests {
         on: JoinOn,
         join_type: &JoinType,
         partition_mode: PartitionMode,
-        null_equals_null: bool,
+        null_equality: NullEquality,
         context: Arc<TaskContext>,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    ) -> Result<(Vec<String>, Vec<RecordBatch>, MetricsSet)> {
         let partition_count = 4;
 
         let (left_expr, right_expr) = on
@@ -1806,7 +2336,7 @@ mod tests {
                 Partitioning::Hash(left_expr, partition_count),
             )?),
             PartitionMode::Auto => {
-                return internal_err!("Unexpected PartitionMode::Auto in join tests")
+                return internal_err!("Unexpected PartitionMode::Auto in join tests");
             }
         };
 
@@ -1827,7 +2357,7 @@ mod tests {
                 Partitioning::Hash(right_expr, partition_count),
             )?),
             PartitionMode::Auto => {
-                return internal_err!("Unexpected PartitionMode::Auto in join tests")
+                return internal_err!("Unexpected PartitionMode::Auto in join tests");
             }
         };
 
@@ -1839,7 +2369,8 @@ mod tests {
             join_type,
             None,
             partition_mode,
-            null_equals_null,
+            null_equality,
+            false,
         )?;
 
         let columns = columns(&join.schema());
@@ -1855,14 +2386,18 @@ mod tests {
                     .collect::<Vec<_>>(),
             );
         }
+        let metrics = join.metrics().unwrap();
 
-        Ok((columns, batches))
+        Ok((columns, batches, metrics))
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_inner_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_inner_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 5]), // this has a repetition
@@ -1879,12 +2414,12 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = join_collect(
+        let (columns, batches, metrics) = join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::Inner,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
@@ -1893,24 +2428,30 @@ mod tests {
 
         allow_duplicates! {
             // Inner join output is expected to preserve both inputs order
-            assert_snapshot!(batches_to_string(&batches), @r#"
-                +----+----+----+----+----+----+
-                | a1 | b1 | c1 | a2 | b1 | c2 |
-                +----+----+----+----+----+----+
-                | 1  | 4  | 7  | 10 | 4  | 70 |
-                | 2  | 5  | 8  | 20 | 5  | 80 |
-                | 3  | 5  | 9  | 20 | 5  | 80 |
-                +----+----+----+----+----+----+
-                "#);
+            assert_snapshot!(batches_to_string(&batches), @r"
+            +----+----+----+----+----+----+
+            | a1 | b1 | c1 | a2 | b1 | c2 |
+            +----+----+----+----+----+----+
+            | 1  | 4  | 7  | 10 | 4  | 70 |
+            | 2  | 5  | 8  | 20 | 5  | 80 |
+            | 3  | 5  | 9  | 20 | 5  | 80 |
+            +----+----+----+----+----+----+
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn partitioned_join_inner_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn partitioned_join_inner_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 5]), // this has a repetition
@@ -1926,12 +2467,12 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = partitioned_join_collect(
+        let (columns, batches, metrics) = partitioned_join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::Inner,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
@@ -1939,17 +2480,20 @@ mod tests {
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
-                +----+----+----+----+----+----+
-                | a1 | b1 | c1 | a2 | b1 | c2 |
-                +----+----+----+----+----+----+
-                | 1  | 4  | 7  | 10 | 4  | 70 |
-                | 2  | 5  | 8  | 20 | 5  | 80 |
-                | 3  | 5  | 9  | 20 | 5  | 80 |
-                +----+----+----+----+----+----+
-                "#);
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
+            +----+----+----+----+----+----+
+            | a1 | b1 | c1 | a2 | b1 | c2 |
+            +----+----+----+----+----+----+
+            | 1  | 4  | 7  | 10 | 4  | 70 |
+            | 2  | 5  | 8  | 20 | 5  | 80 |
+            | 3  | 5  | 9  | 20 | 5  | 80 |
+            +----+----+----+----+----+----+
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
@@ -1971,14 +2515,21 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -1986,9 +2537,11 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 5  | 9  | 20 | 5  | 80 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+
         Ok(())
     }
 
@@ -2010,14 +2563,21 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2026,16 +2586,21 @@ mod tests {
             | 0  | 4  | 6  | 10 | 4  | 70 |
             | 1  | 4  | 7  | 10 | 4  | 70 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 4);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_inner_two(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_inner_two(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 2]),
             ("b2", &vec![1, 2, 2]),
@@ -2057,8 +2622,15 @@ mod tests {
             ),
         ];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]);
 
@@ -2076,11 +2648,16 @@ mod tests {
             div_ceil(9, batch_size)
         };
 
-        assert_eq!(batches.len(), expected_batch_count);
+        // With batch coalescing, we may have fewer batches than expected
+        assert!(
+            batches.len() <= expected_batch_count,
+            "expected at most {expected_batch_count} batches, got {}",
+            batches.len()
+        );
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b2 | c1 | a1 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2088,17 +2665,22 @@ mod tests {
             | 2  | 2  | 8  | 2  | 2  | 80 |
             | 2  | 2  | 9  | 2  | 2  | 80 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+
         Ok(())
     }
 
     /// Test where the left has 2 parts, the right with 1 part => 1 part
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_inner_one_two_parts_left(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_inner_one_two_parts_left(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let batch1 = build_table_i32(
             ("a1", &vec![1, 2]),
             ("b2", &vec![1, 2]),
@@ -2128,8 +2710,15 @@ mod tests {
             ),
         ];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]);
 
@@ -2147,11 +2736,16 @@ mod tests {
             div_ceil(9, batch_size)
         };
 
-        assert_eq!(batches.len(), expected_batch_count);
+        // With batch coalescing, we may have fewer batches than expected
+        assert!(
+            batches.len() <= expected_batch_count,
+            "expected at most {expected_batch_count} batches, got {}",
+            batches.len()
+        );
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b2 | c1 | a1 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2159,9 +2753,11 @@ mod tests {
             | 2  | 2  | 8  | 2  | 2  | 80 |
             | 2  | 2  | 9  | 2  | 2  | 80 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+
         Ok(())
     }
 
@@ -2194,14 +2790,21 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2210,17 +2813,22 @@ mod tests {
             | 0  | 4  | 6  | 10 | 4  | 70 |
             | 1  | 4  | 7  | 10 | 4  | 70 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 4);
+
         Ok(())
     }
 
     /// Test where the left has 1 part, the right has 2 parts => 2 parts
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_inner_one_two_parts_right(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_inner_one_two_parts_right(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 5]), // this has a repetition
@@ -2244,7 +2852,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Inner, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
@@ -2266,17 +2880,22 @@ mod tests {
             // and filtered later.
             div_ceil(6, batch_size)
         };
-        assert_eq!(batches.len(), expected_batch_count);
+        // With batch coalescing, we may have fewer batches than expected
+        assert!(
+            batches.len() <= expected_batch_count,
+            "expected at most {expected_batch_count} batches, got {}",
+            batches.len()
+        );
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
             | 1  | 4  | 7  | 10 | 4  | 70 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
         // second part
@@ -2291,20 +2910,28 @@ mod tests {
             // and filtered later.
             div_ceil(3, batch_size)
         };
-        assert_eq!(batches.len(), expected_batch_count);
+        // With batch coalescing, we may have fewer batches than expected
+        assert!(
+            batches.len() <= expected_batch_count,
+            "expected at most {expected_batch_count} batches, got {}",
+            batches.len()
+        );
 
         // Inner join output is expected to preserve both inputs order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
             | 2  | 5  | 8  | 30 | 5  | 90 |
             | 3  | 5  | 9  | 30 | 5  | 90 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
@@ -2318,10 +2945,13 @@ mod tests {
         TestMemoryExec::try_new_exec(&[vec![batch.clone(), batch]], schema, None).unwrap()
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_multi_batch(batch_size: usize) {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_multi_batch(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -2337,16 +2967,30 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema()).unwrap()) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Left, false).unwrap();
+        let join = join(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            &JoinType::Left,
+            NullEquality::NullEqualsNothing,
+        )
+        .unwrap();
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
-        let stream = join.execute(0, task_ctx).unwrap();
-        let batches = common::collect(stream).await.unwrap();
+        let (_, batches, metrics) = join_collect(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            &JoinType::Left,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -2356,14 +3000,20 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+        return Ok(());
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_full_multi_batch(batch_size: usize) {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_full_multi_batch(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -2380,16 +3030,24 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Full, false).unwrap();
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Full,
+            NullEquality::NullEqualsNothing,
+        )
+        .unwrap();
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
         let stream = join.execute(0, task_ctx).unwrap();
         let batches = common::collect(stream).await.unwrap();
+        let metrics = join.metrics().unwrap();
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2401,14 +3059,19 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_empty_right(batch_size: usize) {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_empty_right(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]),
@@ -2421,16 +3084,24 @@ mod tests {
         )];
         let schema = right.schema();
         let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap();
-        let join = join(left, right, on, &JoinType::Left, false).unwrap();
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Left,
+            NullEquality::NullEqualsNothing,
+        )
+        .unwrap();
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         let stream = join.execute(0, task_ctx).unwrap();
         let batches = common::collect(stream).await.unwrap();
+        let metrics = join.metrics().unwrap();
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -2438,14 +3109,19 @@ mod tests {
             | 2  | 5  | 8  |    |    |    |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_full_empty_right(batch_size: usize) {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_full_empty_right(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]),
@@ -2458,16 +3134,24 @@ mod tests {
         )];
         let schema = right.schema();
         let right = TestMemoryExec::try_new_exec(&[vec![right]], schema, None).unwrap();
-        let join = join(left, right, on, &JoinType::Full, false).unwrap();
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Full,
+            NullEquality::NullEqualsNothing,
+        )
+        .unwrap();
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
 
         let stream = join.execute(0, task_ctx).unwrap();
         let batches = common::collect(stream).await.unwrap();
+        let metrics = join.metrics().unwrap();
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -2475,14 +3159,19 @@ mod tests {
             | 2  | 5  | 8  |    |    |    |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -2498,19 +3187,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = join_collect(
+        let (columns, batches, metrics) = join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::Left,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
+
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -2518,16 +3208,22 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn partitioned_join_left_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn partitioned_join_left_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -2543,19 +3239,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = partitioned_join_collect(
+        let (columns, batches, metrics) = partitioned_join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::Left,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
+
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -2563,9 +3260,12 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
@@ -2589,10 +3289,13 @@ mod tests {
         )
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_semi(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_semi(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
         // left_table left semi join right_table on left_table.b1 = right_table.b2
@@ -2601,7 +3304,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::LeftSemi, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::LeftSemi,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1"]);
@@ -2611,7 +3320,7 @@ mod tests {
 
         // ignore the order
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+-----+
             | a1 | b1 | c1  |
             +----+----+-----+
@@ -2619,16 +3328,22 @@ mod tests {
             | 13 | 10 | 130 |
             | 9  | 8  | 90  |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_semi_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_semi_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
 
@@ -2663,7 +3378,7 @@ mod tests {
             on.clone(),
             filter,
             &JoinType::LeftSemi,
-            false,
+            NullEquality::NullEqualsNothing,
         )?;
 
         let columns_header = columns(&join.schema());
@@ -2684,6 +3399,9 @@ mod tests {
             ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         // left_table left semi join right_table on left_table.b1 = right_table.b2 and right_table.a2 > 10
         let filter_expression = Arc::new(BinaryExpr::new(
             Arc::new(Column::new("x", 0)),
@@ -2696,7 +3414,14 @@ mod tests {
             Arc::new(intermediate_schema),
         );
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::LeftSemi, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::LeftSemi,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns_header = columns(&join.schema());
         assert_eq!(columns_header, vec!["a1", "b1", "c1"]);
@@ -2705,22 +3430,28 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+-----+
             | a1 | b1 | c1  |
             +----+----+-----+
             | 13 | 10 | 130 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_semi(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_semi(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
 
@@ -2730,7 +3461,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::RightSemi, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::RightSemi,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a2", "b2", "c2"]);
@@ -2740,7 +3477,7 @@ mod tests {
 
         // RightSemi join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
@@ -2748,16 +3485,22 @@ mod tests {
             | 12 | 10 | 40  |
             | 10 | 10 | 100 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_semi_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_semi_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
 
@@ -2792,7 +3535,7 @@ mod tests {
             on.clone(),
             filter,
             &JoinType::RightSemi,
-            false,
+            NullEquality::NullEqualsNothing,
         )?;
 
         let columns = columns(&join.schema());
@@ -2803,7 +3546,7 @@ mod tests {
 
         // RightSemi join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
@@ -2811,9 +3554,12 @@ mod tests {
             | 12 | 10 | 40  |
             | 10 | 10 | 100 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         // left_table right semi join right_table on left_table.b1 = right_table.b2 on left_table.a1!=9
         let filter_expression = Arc::new(BinaryExpr::new(
             Arc::new(Column::new("x", 0)),
@@ -2827,30 +3573,42 @@ mod tests {
             Arc::new(intermediate_schema.clone()),
         );
 
-        let join =
-            join_with_filter(left, right, on, filter, &JoinType::RightSemi, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::RightSemi,
+            NullEquality::NullEqualsNothing,
+        )?;
         let stream = join.execute(0, task_ctx)?;
         let batches = common::collect(stream).await?;
 
         // RightSemi join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
             | 12 | 10 | 40  |
             | 10 | 10 | 100 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_anti(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_anti(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
         // left_table left anti join right_table on left_table.b1 = right_table.b2
@@ -2859,7 +3617,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::LeftAnti, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::LeftAnti,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1"]);
@@ -2868,7 +3632,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+
             | a1 | b1 | c1 |
             +----+----+----+
@@ -2877,15 +3641,22 @@ mod tests {
             | 5  | 5  | 50 |
             | 7  | 7  | 70 |
             +----+----+----+
-                "#);
+            ");
         }
+
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_anti_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_anti_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
         // left_table left anti join right_table on left_table.b1 = right_table.b2 and right_table.a2!=8
@@ -2918,7 +3689,7 @@ mod tests {
             on.clone(),
             filter,
             &JoinType::LeftAnti,
-            false,
+            NullEquality::NullEqualsNothing,
         )?;
 
         let columns_header = columns(&join.schema());
@@ -2928,7 +3699,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+-----+
             | a1 | b1 | c1  |
             +----+----+-----+
@@ -2939,9 +3710,12 @@ mod tests {
             | 7  | 7  | 70  |
             | 9  | 8  | 90  |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         // left_table left anti join right_table on left_table.b1 = right_table.b2 and right_table.a2 != 13
         let filter_expression = Arc::new(BinaryExpr::new(
             Arc::new(Column::new("x", 0)),
@@ -2955,7 +3729,14 @@ mod tests {
             Arc::new(intermediate_schema),
         );
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::LeftAnti, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::LeftAnti,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns_header = columns(&join.schema());
         assert_eq!(columns_header, vec!["a1", "b1", "c1"]);
@@ -2964,7 +3745,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+-----+
             | a1 | b1 | c1  |
             +----+----+-----+
@@ -2975,16 +3756,22 @@ mod tests {
             | 7  | 7  | 70  |
             | 9  | 8  | 90  |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_anti(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_anti(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
         let on = vec![(
@@ -2992,7 +3779,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::RightAnti, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::RightAnti,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a2", "b2", "c2"]);
@@ -3002,7 +3795,7 @@ mod tests {
 
         // RightAnti join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
@@ -3010,15 +3803,22 @@ mod tests {
             | 2  | 2  | 80  |
             | 4  | 4  | 120 |
             +----+----+-----+
-                "#);
+            ");
         }
+
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_anti_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_anti_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_semi_anti_left_table();
         let right = build_semi_anti_right_table();
         // left_table right anti join right_table on left_table.b1 = right_table.b2 and left_table.a1!=13
@@ -3052,7 +3852,7 @@ mod tests {
             on.clone(),
             filter,
             &JoinType::RightAnti,
-            false,
+            NullEquality::NullEqualsNothing,
         )?;
 
         let columns_header = columns(&join.schema());
@@ -3063,7 +3863,7 @@ mod tests {
 
         // RightAnti join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
@@ -3073,9 +3873,12 @@ mod tests {
             | 10 | 10 | 100 |
             | 4  | 4  | 120 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         // left_table right anti join right_table on left_table.b1 = right_table.b2 and right_table.b2!=8
         let column_indices = vec![ColumnIndex {
             index: 1,
@@ -3093,8 +3896,14 @@ mod tests {
             Arc::new(intermediate_schema),
         );
 
-        let join =
-            join_with_filter(left, right, on, filter, &JoinType::RightAnti, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::RightAnti,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns_header = columns(&join.schema());
         assert_eq!(columns_header, vec!["a2", "b2", "c2"]);
@@ -3104,7 +3913,7 @@ mod tests {
 
         // RightAnti join output is expected to preserve right input order
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +----+----+-----+
             | a2 | b2 | c2  |
             +----+----+-----+
@@ -3113,16 +3922,22 @@ mod tests {
             | 2  | 2  | 80  |
             | 4  | 4  | 120 |
             +----+----+-----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]),
@@ -3138,13 +3953,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Right, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Right,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -3152,16 +3974,22 @@ mod tests {
             | 1  | 4  | 7  | 10 | 4  | 70 |
             | 2  | 5  | 8  | 20 | 5  | 80 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn partitioned_join_right_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn partitioned_join_right_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]),
@@ -3177,14 +4005,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            partitioned_join_collect(left, right, on, &JoinType::Right, false, task_ctx)
-                .await?;
+        let (columns, batches, metrics) = partitioned_join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Right,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b1 | c2 |
             +----+----+----+----+----+----+
@@ -3192,16 +4026,22 @@ mod tests {
             | 1  | 4  | 7  | 10 | 4  | 70 |
             | 2  | 5  | 8  | 20 | 5  | 80 |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_full_one(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_full_one(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -3217,7 +4057,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Full, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Full,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
@@ -3226,7 +4072,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+----+----+----+
             | a1 | b1 | c1 | a2 | b2 | c2 |
             +----+----+----+----+----+----+
@@ -3235,16 +4081,22 @@ mod tests {
             | 2  | 5  | 8  | 20 | 5  | 80 |
             | 3  | 7  | 9  |    |    |    |
             +----+----+----+----+----+----+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_mark(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_mark(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -3260,19 +4112,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = join_collect(
+        let (columns, batches, metrics) = join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::LeftMark,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
+
         assert_eq!(columns, vec!["a1", "b1", "c1", "mark"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+-------+
             | a1 | b1 | c1 | mark  |
             +----+----+----+-------+
@@ -3280,16 +4133,22 @@ mod tests {
             | 2  | 5  | 8  | true  |
             | 3  | 7  | 9  | false |
             +----+----+----+-------+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn partitioned_join_left_mark(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn partitioned_join_left_mark(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
             ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
@@ -3305,19 +4164,20 @@ mod tests {
             Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
         )];
 
-        let (columns, batches) = partitioned_join_collect(
+        let (columns, batches, metrics) = partitioned_join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::LeftMark,
-            false,
+            NullEquality::NullEqualsNothing,
             task_ctx,
         )
         .await?;
+
         assert_eq!(columns, vec!["a1", "b1", "c1", "mark"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +----+----+----+-------+
             | a1 | b1 | c1 | mark  |
             +----+----+----+-------+
@@ -3325,14 +4185,120 @@ mod tests {
             | 2  | 5  | 8  | true  |
             | 3  | 7  | 9  | false |
             +----+----+----+-------+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
+        Ok(())
+    }
+
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn join_right_mark(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
+            ("c1", &vec![7, 8, 9]),
+        );
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![4, 5, 6]), // 6 does not exist on the left
+            ("c2", &vec![70, 80, 90]),
+        );
+        let on = vec![(
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        )];
+
+        let (columns, batches, metrics) = join_collect(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            &JoinType::RightMark,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a2", "b1", "c2", "mark"]);
+
+        let expected = [
+            "+----+----+----+-------+",
+            "| a2 | b1 | c2 | mark  |",
+            "+----+----+----+-------+",
+            "| 10 | 4  | 70 | true  |",
+            "| 20 | 5  | 80 | true  |",
+            "| 30 | 6  | 90 | false |",
+            "+----+----+----+-------+",
+        ];
+        assert_batches_sorted_eq!(expected, &batches);
+
+        assert_join_metrics!(metrics, 3);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
+        Ok(())
+    }
+
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn partitioned_join_right_mark(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
+            ("c1", &vec![7, 8, 9]),
+        );
+        let right = build_table(
+            ("a2", &vec![10, 20, 30, 40]),
+            ("b1", &vec![4, 4, 5, 6]), // 6 does not exist on the left
+            ("c2", &vec![60, 70, 80, 90]),
+        );
+        let on = vec![(
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        )];
+
+        let (columns, batches, metrics) = partitioned_join_collect(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            &JoinType::RightMark,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a2", "b1", "c2", "mark"]);
+
+        let expected = [
+            "+----+----+----+-------+",
+            "| a2 | b1 | c2 | mark  |",
+            "+----+----+----+-------+",
+            "| 10 | 4  | 60 | true  |",
+            "| 20 | 4  | 70 | true  |",
+            "| 30 | 5  | 80 | true  |",
+            "| 40 | 6  | 90 | false |",
+            "+----+----+----+-------+",
+        ];
+        assert_batches_sorted_eq!(expected, &batches);
+
+        assert_join_metrics!(metrics, 4);
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
     #[test]
-    fn join_with_hash_collision() -> Result<()> {
+    fn join_with_hash_collisions_64() -> Result<()> {
         let mut hashmap_left = HashTable::with_capacity(4);
         let left = build_table_i32(
             ("a", &vec![10, 20]),
@@ -3340,13 +4306,9 @@ mod tests {
             ("y", &vec![200, 300]),
         );
 
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let hashes_buff = &mut vec![0; left.num_rows()];
-        let hashes = create_hashes(
-            &[Arc::clone(&left.columns()[0])],
-            &random_state,
-            hashes_buff,
-        )?;
+        let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?;
 
         // Maps both values to both indices (1 and 2, representing input 0 and 1)
         // 0 -> (0, 1)
@@ -3369,26 +4331,26 @@ mod tests {
         // Join key column for both join sides
         let key_column: PhysicalExprRef = Arc::new(Column::new("a", 0)) as _;
 
-        let join_hash_map = JoinHashMap::new(hashmap_left, next);
+        let join_hash_map = JoinHashMapU64::new(hashmap_left, next);
 
         let left_keys_values = key_column.evaluate(&left)?.into_array(left.num_rows())?;
         let right_keys_values =
             key_column.evaluate(&right)?.into_array(right.num_rows())?;
         let mut hashes_buffer = vec![0; right.num_rows()];
-        create_hashes(
-            &[Arc::clone(&right_keys_values)],
-            &random_state,
-            &mut hashes_buffer,
-        )?;
+        create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?;
 
+        let mut probe_indices_buffer = Vec::new();
+        let mut build_indices_buffer = Vec::new();
         let (l, r, _) = lookup_join_hashmap(
             &join_hash_map,
             &[left_keys_values],
             &[right_keys_values],
-            false,
+            NullEquality::NullEqualsNothing,
             &hashes_buffer,
             8192,
             (0, None),
+            &mut probe_indices_buffer,
+            &mut build_indices_buffer,
         )?;
 
         let left_ids: UInt64Array = vec![0, 1].into();
@@ -3402,6 +4364,66 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn join_with_hash_collisions_u32() -> Result<()> {
+        let mut hashmap_left = HashTable::with_capacity(4);
+        let left = build_table_i32(
+            ("a", &vec![10, 20]),
+            ("x", &vec![100, 200]),
+            ("y", &vec![200, 300]),
+        );
+
+        let random_state = RandomState::with_seed(0);
+        let hashes_buff = &mut vec![0; left.num_rows()];
+        let hashes = create_hashes([&left.columns()[0]], &random_state, hashes_buff)?;
+
+        hashmap_left.insert_unique(hashes[0], (hashes[0], 1u32), |(h, _)| *h);
+        hashmap_left.insert_unique(hashes[0], (hashes[0], 2u32), |(h, _)| *h);
+        hashmap_left.insert_unique(hashes[1], (hashes[1], 1u32), |(h, _)| *h);
+        hashmap_left.insert_unique(hashes[1], (hashes[1], 2u32), |(h, _)| *h);
+
+        let next: Vec<u32> = vec![2, 0];
+
+        let right = build_table_i32(
+            ("a", &vec![10, 20]),
+            ("b", &vec![0, 0]),
+            ("c", &vec![30, 40]),
+        );
+
+        let key_column: PhysicalExprRef = Arc::new(Column::new("a", 0)) as _;
+
+        let join_hash_map = JoinHashMapU32::new(hashmap_left, next);
+
+        let left_keys_values = key_column.evaluate(&left)?.into_array(left.num_rows())?;
+        let right_keys_values =
+            key_column.evaluate(&right)?.into_array(right.num_rows())?;
+        let mut hashes_buffer = vec![0; right.num_rows()];
+        create_hashes([&right_keys_values], &random_state, &mut hashes_buffer)?;
+
+        let mut probe_indices_buffer = Vec::new();
+        let mut build_indices_buffer = Vec::new();
+        let (l, r, _) = lookup_join_hashmap(
+            &join_hash_map,
+            &[left_keys_values],
+            &[right_keys_values],
+            NullEquality::NullEqualsNothing,
+            &hashes_buffer,
+            8192,
+            (0, None),
+            &mut probe_indices_buffer,
+            &mut build_indices_buffer,
+        )?;
+
+        // We still expect to match rows 0 and 1 on both sides
+        let left_ids: UInt64Array = vec![0, 1].into();
+        let right_ids: UInt32Array = vec![0, 1].into();
+
+        assert_eq!(left_ids, l);
+        assert_eq!(right_ids, r);
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn join_with_duplicated_column_names() -> Result<()> {
         let task_ctx = Arc::new(TaskContext::default());
@@ -3421,7 +4443,13 @@ mod tests {
             Arc::new(Column::new_with_schema("b", &right.schema()).unwrap()) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Inner, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a", "b", "c", "a", "b", "c"]);
@@ -3430,14 +4458,14 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +---+---+---+----+---+----+
             | a | b | c | a  | b | c  |
             +---+---+---+----+---+----+
             | 1 | 4 | 7 | 10 | 1 | 70 |
             | 2 | 5 | 8 | 20 | 2 | 80 |
             +---+---+---+----+---+----+
-                "#);
+            ");
         }
 
         Ok(())
@@ -3471,10 +4499,13 @@ mod tests {
         )
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_inner_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_inner_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a", &vec![0, 1, 2, 2]),
             ("b", &vec![4, 5, 7, 8]),
@@ -3491,7 +4522,14 @@ mod tests {
         )];
         let filter = prepare_join_filter();
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::Inner, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a", "b", "c", "a", "b", "c"]);
@@ -3500,23 +4538,29 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +---+---+---+----+---+---+
             | a | b | c | a  | b | c |
             +---+---+---+----+---+---+
             | 2 | 7 | 9 | 10 | 2 | 7 |
             | 2 | 7 | 9 | 20 | 2 | 5 |
             +---+---+---+----+---+---+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_left_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_left_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a", &vec![0, 1, 2, 2]),
             ("b", &vec![4, 5, 7, 8]),
@@ -3533,7 +4577,14 @@ mod tests {
         )];
         let filter = prepare_join_filter();
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::Left, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::Left,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a", "b", "c", "a", "b", "c"]);
@@ -3542,7 +4593,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +---+---+---+----+---+---+
             | a | b | c | a  | b | c |
             +---+---+---+----+---+---+
@@ -3552,16 +4603,22 @@ mod tests {
             | 2 | 7 | 9 | 20 | 2 | 5 |
             | 2 | 8 | 1 |    |   |   |
             +---+---+---+----+---+---+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_right_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_right_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a", &vec![0, 1, 2, 2]),
             ("b", &vec![4, 5, 7, 8]),
@@ -3578,7 +4635,14 @@ mod tests {
         )];
         let filter = prepare_join_filter();
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::Right, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::Right,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a", "b", "c", "a", "b", "c"]);
@@ -3587,7 +4651,7 @@ mod tests {
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +---+---+---+----+---+---+
             | a | b | c | a  | b | c |
             +---+---+---+----+---+---+
@@ -3596,16 +4660,22 @@ mod tests {
             | 2 | 7 | 9 | 10 | 2 | 7 |
             | 2 | 7 | 9 | 20 | 2 | 5 |
             +---+---+---+----+---+---+
-                "#);
+            ");
         }
 
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
         Ok(())
     }
 
-    #[apply(batch_sizes)]
+    #[apply(hash_join_exec_configs)]
     #[tokio::test]
-    async fn join_full_with_filter(batch_size: usize) -> Result<()> {
-        let task_ctx = prepare_task_ctx(batch_size);
+    async fn join_full_with_filter(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
         let left = build_table(
             ("a", &vec![0, 1, 2, 2]),
             ("b", &vec![4, 5, 7, 8]),
@@ -3622,7 +4692,14 @@ mod tests {
         )];
         let filter = prepare_join_filter();
 
-        let join = join_with_filter(left, right, on, filter, &JoinType::Full, false)?;
+        let join = join_with_filter(
+            left,
+            right,
+            on,
+            filter,
+            &JoinType::Full,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let columns = columns(&join.schema());
         assert_eq!(columns, vec!["a", "b", "c", "a", "b", "c"]);
@@ -3645,7 +4722,10 @@ mod tests {
         ];
         assert_batches_sorted_eq!(expected, &batches);
 
-        // THIS MIGRATION HAULTED DUE TO ISSUE #15312
+        let metrics = join.metrics().unwrap();
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
+        // THIS MIGRATION HALTED DUE TO ISSUE #15312
         //allow_duplicates! {
         //    assert_snapshot!(batches_to_sort_string(&batches), @r#"
         //    +---+---+---+----+---+---+
@@ -3759,6 +4839,15 @@ mod tests {
             "| 3  | 7  | 9  | false |",
             "+----+----+----+-------+",
         ];
+        let expected_right_mark = vec![
+            "+----+----+----+-------+",
+            "| a2 | b2 | c2 | mark  |",
+            "+----+----+----+-------+",
+            "| 10 | 4  | 70 | true  |",
+            "| 20 | 5  | 80 | true  |",
+            "| 30 | 6  | 90 | false |",
+            "+----+----+----+-------+",
+        ];
 
         let test_cases = vec![
             (JoinType::Inner, expected_inner),
@@ -3770,20 +4859,22 @@ mod tests {
             (JoinType::RightSemi, expected_right_semi),
             (JoinType::RightAnti, expected_right_anti),
             (JoinType::LeftMark, expected_left_mark),
+            (JoinType::RightMark, expected_right_mark),
         ];
 
         for (join_type, expected) in test_cases {
-            let (_, batches) = join_collect_with_partition_mode(
+            let (_, batches, metrics) = join_collect_with_partition_mode(
                 Arc::clone(&left),
                 Arc::clone(&right),
                 on.clone(),
                 &join_type,
                 PartitionMode::CollectLeft,
-                false,
+                NullEquality::NullEqualsNothing,
                 Arc::clone(&task_ctx),
             )
             .await?;
             assert_batches_sorted_eq!(expected, &batches);
+            assert_join_metrics!(metrics, expected.len() - 4);
         }
 
         Ok(())
@@ -3811,14 +4902,20 @@ mod tests {
             Arc::new(Column::new_with_schema("date", &right.schema()).unwrap()) as _,
         )];
 
-        let join = join(left, right, on, &JoinType::Inner, false)?;
+        let join = join(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+        )?;
 
         let task_ctx = Arc::new(TaskContext::default());
         let stream = join.execute(0, task_ctx)?;
         let batches = common::collect(stream).await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
             +------------+---+------------+---+
             | date       | n | date       | n |
             +------------+---+------------+---+
@@ -3826,7 +4923,7 @@ mod tests {
             | 2022-04-26 | 2 | 2022-04-26 | 5 |
             | 2022-04-27 | 3 | 2022-04-27 | 6 |
             +------------+---+------------+---+
-                "#);
+            ");
         }
 
         Ok(())
@@ -3870,7 +4967,7 @@ mod tests {
                 Arc::clone(&right_input) as Arc<dyn ExecutionPlan>,
                 on.clone(),
                 &join_type,
-                false,
+                NullEquality::NullEqualsNothing,
             )
             .unwrap();
             let task_ctx = Arc::new(TaskContext::default());
@@ -3887,7 +4984,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn join_splitted_batch() {
+    async fn join_split_batch() {
         let left = build_table(
             ("a1", &vec![1, 2, 3, 4]),
             ("b1", &vec![1, 1, 1, 1]),
@@ -3977,14 +5074,14 @@ mod tests {
         // validation of partial join results output for different batch_size setting
         for join_type in join_types {
             for batch_size in (1..21).rev() {
-                let task_ctx = prepare_task_ctx(batch_size);
+                let task_ctx = prepare_task_ctx(batch_size, true);
 
                 let join = join(
                     Arc::clone(&left),
                     Arc::clone(&right),
                     on.clone(),
                     &join_type,
-                    false,
+                    NullEquality::NullEqualsNothing,
                 )
                 .unwrap();
 
@@ -4004,10 +5101,11 @@ mod tests {
                     }
                     _ => div_ceil(expected_resultset_records, batch_size) + 1,
                 };
-                assert_eq!(
-                    batches.len(),
-                    expected_batch_count,
-                    "expected {expected_batch_count} output batches for {join_type} join with batch_size = {batch_size}"
+                // With batch coalescing, we may have fewer batches than expected
+                assert!(
+                    batches.len() <= expected_batch_count,
+                    "expected at most {expected_batch_count} output batches for {join_type} join with batch_size = {batch_size}, got {}",
+                    batches.len()
                 );
 
                 let expected = match join_type {
@@ -4017,7 +5115,17 @@ mod tests {
                     JoinType::LeftAnti => left_empty.to_vec(),
                     _ => common_result.to_vec(),
                 };
-                assert_batches_eq!(expected, &batches);
+                // For anti joins with empty results, we may get zero batches
+                // (with coalescing) instead of one empty batch with schema
+                if batches.is_empty() {
+                    // Verify this is an expected empty result case
+                    assert!(
+                        matches!(join_type, JoinType::RightAnti | JoinType::LeftAnti),
+                        "Unexpected empty result for {join_type} join"
+                    );
+                } else {
+                    assert_batches_eq!(expected, &batches);
+                }
             }
         }
     }
@@ -4049,6 +5157,7 @@ mod tests {
             JoinType::RightSemi,
             JoinType::RightAnti,
             JoinType::LeftMark,
+            JoinType::RightMark,
         ];
 
         for join_type in join_types {
@@ -4063,7 +5172,7 @@ mod tests {
                 Arc::clone(&right),
                 on.clone(),
                 &join_type,
-                false,
+                NullEquality::NullEqualsNothing,
             )?;
 
             let stream = join.execute(0, task_ctx)?;
@@ -4072,7 +5181,7 @@ mod tests {
             // Asserting that operator-level reservation attempting to overallocate
             assert_contains!(
                 err.to_string(),
-                "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  HashJoinInput"
+                "Resources exhausted: Additional allocation failed for HashJoinInput with top memory consumers (across reservations) as:\n  HashJoinInput"
             );
 
             assert_contains!(
@@ -4144,6 +5253,7 @@ mod tests {
                 &join_type,
                 None,
                 PartitionMode::Partitioned,
+                NullEquality::NullEqualsNothing,
                 false,
             )?;
 
@@ -4153,8 +5263,7 @@ mod tests {
             // Asserting that stream-level reservation attempting to overallocate
             assert_contains!(
                 err.to_string(),
-                "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  HashJoinInput[1]"
-
+                "Resources exhausted: Additional allocation failed for HashJoinInput[1] with top memory consumers (across reservations) as:\n  HashJoinInput[1]"
             );
 
             assert_contains!(
@@ -4204,13 +5313,20 @@ mod tests {
             Arc::new(Column::new_with_schema("n2", &right.schema())?) as _,
         )];
 
-        let (columns, batches) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
 
         assert_eq!(columns, vec!["n1", "n2"]);
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&batches), @r#"
+            assert_snapshot!(batches_to_string(&batches), @r"
             +--------+--------+
             | n1     | n2     |
             +--------+--------+
@@ -4218,9 +5334,11 @@ mod tests {
             | {a: 1} | {a: 1} |
             | {a: 2} | {a: 2} |
             +--------+--------+
-                "#);
+            ");
         }
 
+        assert_join_metrics!(metrics, 3);
+
         Ok(())
     }
 
@@ -4236,32 +5354,49 @@ mod tests {
             Arc::new(Column::new_with_schema("n2", &right.schema())?) as _,
         )];
 
-        let (_, batches_null_eq) = join_collect(
+        let (_, batches_null_eq, metrics) = join_collect(
             Arc::clone(&left),
             Arc::clone(&right),
             on.clone(),
             &JoinType::Inner,
-            true,
+            NullEquality::NullEqualsNull,
             Arc::clone(&task_ctx),
         )
         .await?;
 
         allow_duplicates! {
-            assert_snapshot!(batches_to_sort_string(&batches_null_eq), @r#"
+            assert_snapshot!(batches_to_sort_string(&batches_null_eq), @r"
             +----+----+
             | n1 | n2 |
             +----+----+
             |    |    |
             +----+----+
-                "#);
+            ");
         }
 
-        let (_, batches_null_neq) =
-            join_collect(left, right, on, &JoinType::Inner, false, task_ctx).await?;
+        assert_join_metrics!(metrics, 1);
+
+        let (_, batches_null_neq, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+
+        assert_join_metrics!(metrics, 0);
 
-        let expected_null_neq =
-            ["+----+----+", "| n1 | n2 |", "+----+----+", "+----+----+"];
-        assert_batches_eq!(expected_null_neq, &batches_null_neq);
+        // With batch coalescing, empty results may not emit any batches
+        // Check that either we have no batches, or an empty batch with proper schema
+        if batches_null_neq.is_empty() {
+            // This is fine - no output rows
+        } else {
+            let expected_null_neq =
+                ["+----+----+", "| n1 | n2 |", "+----+----+", "+----+----+"];
+            assert_batches_eq!(expected_null_neq, &batches_null_neq);
+        }
 
         Ok(())
     }
@@ -4270,4 +5405,597 @@ mod tests {
     fn columns(schema: &Schema) -> Vec<String> {
         schema.fields().iter().map(|f| f.name().clone()).collect()
     }
+
+    /// This test verifies that the dynamic filter is marked as complete after HashJoinExec finishes building the hash table.
+    #[tokio::test]
+    async fn test_hash_join_marks_filter_complete() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![4, 5, 6]),
+            ("c1", &vec![7, 8, 9]),
+        );
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![4, 5, 6]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        )];
+
+        // Create a dynamic filter manually
+        let dynamic_filter = HashJoinExec::create_dynamic_filter(&on);
+        let dynamic_filter_clone = Arc::clone(&dynamic_filter);
+
+        // Create HashJoinExec with the dynamic filter
+        let mut join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?;
+        join.dynamic_filter = Some(HashJoinExecDynamicFilter {
+            filter: dynamic_filter,
+            build_accumulator: OnceLock::new(),
+        });
+
+        // Execute the join
+        let stream = join.execute(0, task_ctx)?;
+        let _batches = common::collect(stream).await?;
+
+        // After the join completes, the dynamic filter should be marked as complete
+        // wait_complete() should return immediately
+        dynamic_filter_clone.wait_complete().await;
+
+        Ok(())
+    }
+
+    /// This test verifies that the dynamic filter is marked as complete even when the build side is empty.
+    #[tokio::test]
+    async fn test_hash_join_marks_filter_complete_empty_build_side() -> Result<()> {
+        let task_ctx = Arc::new(TaskContext::default());
+        // Empty left side (build side)
+        let left = build_table(("a1", &vec![]), ("b1", &vec![]), ("c1", &vec![]));
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![4, 5, 6]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        )];
+
+        // Create a dynamic filter manually
+        let dynamic_filter = HashJoinExec::create_dynamic_filter(&on);
+        let dynamic_filter_clone = Arc::clone(&dynamic_filter);
+
+        // Create HashJoinExec with the dynamic filter
+        let mut join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            false,
+        )?;
+        join.dynamic_filter = Some(HashJoinExecDynamicFilter {
+            filter: dynamic_filter,
+            build_accumulator: OnceLock::new(),
+        });
+
+        // Execute the join
+        let stream = join.execute(0, task_ctx)?;
+        let _batches = common::collect(stream).await?;
+
+        // Even with empty build side, the dynamic filter should be marked as complete
+        // wait_complete() should return immediately
+        dynamic_filter_clone.wait_complete().await;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_perfect_hash_join_with_negative_numbers() -> Result<()> {
+        let task_ctx = prepare_task_ctx(8192, true);
+        let (left_schema, right_schema, on) = build_schema_and_on()?;
+
+        let left_batch = RecordBatch::try_new(
+            Arc::clone(&left_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![-1, 0, 1])) as ArrayRef,
+            ],
+        )?;
+        let left = TestMemoryExec::try_new_exec(&[vec![left_batch]], left_schema, None)?;
+
+        let right_batch = RecordBatch::try_new(
+            Arc::clone(&right_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![10, 20, 30, 40])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![1, -1, 0, 2])) as ArrayRef,
+            ],
+        )?;
+        let right =
+            TestMemoryExec::try_new_exec(&[vec![right_batch]], right_schema, None)?;
+
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a1", "b1", "a2", "b1"]);
+
+        assert_batches_sorted_eq!(
+            [
+                "+----+----+----+----+",
+                "| a1 | b1 | a2 | b1 |",
+                "+----+----+----+----+",
+                "| 1  | -1 | 20 | -1 |",
+                "| 2  | 0  | 30 | 0  |",
+                "| 3  | 1  | 10 | 1  |",
+                "+----+----+----+----+",
+            ],
+            &batches
+        );
+
+        assert_phj_used(&metrics, true);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_perfect_hash_join_overflow_full_int64_range() -> Result<()> {
+        let task_ctx = prepare_task_ctx(8192, true);
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int64Array::from(vec![i64::MIN, i64::MAX]))],
+        )?;
+        let left = TestMemoryExec::try_new_exec(
+            &[vec![batch.clone()]],
+            Arc::clone(&schema),
+            None,
+        )?;
+        let right = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?;
+        let on: JoinOn = vec![(
+            Arc::new(Column::new_with_schema("a", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a", &right.schema())?) as _,
+        )];
+        let (_columns, batches, _metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 2);
+        Ok(())
+    }
+
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn test_phj_null_equals_null_build_no_nulls_probe_has_nulls(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
+        let (left_schema, right_schema, on) = build_schema_and_on()?;
+
+        let left_batch = RecordBatch::try_new(
+            Arc::clone(&left_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef,
+            ],
+        )?;
+        let left = TestMemoryExec::try_new_exec(&[vec![left_batch]], left_schema, None)?;
+
+        let right_batch = RecordBatch::try_new(
+            Arc::clone(&right_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![3, 4])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![Some(10), None])) as ArrayRef,
+            ],
+        )?;
+        let right =
+            TestMemoryExec::try_new_exec(&[vec![right_batch]], right_schema, None)?;
+
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNull,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a1", "b1", "a2", "b1"]);
+        assert_batches_sorted_eq!(
+            [
+                "+----+----+----+----+",
+                "| a1 | b1 | a2 | b1 |",
+                "+----+----+----+----+",
+                "| 1  | 10 | 3  | 10 |",
+                "+----+----+----+----+",
+            ],
+            &batches
+        );
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
+        Ok(())
+    }
+
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn test_phj_null_equals_nothing_build_probe_all_have_nulls(
+        batch_size: usize,
+        use_perfect_hash_join_as_possible: bool,
+    ) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, use_perfect_hash_join_as_possible);
+        let (left_schema, right_schema, on) = build_schema_and_on()?;
+
+        let left_batch = RecordBatch::try_new(
+            Arc::clone(&left_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![Some(10), None])) as ArrayRef,
+            ],
+        )?;
+        let left = TestMemoryExec::try_new_exec(&[vec![left_batch]], left_schema, None)?;
+
+        let right_batch = RecordBatch::try_new(
+            Arc::clone(&right_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(3), Some(4)])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![Some(10), None])) as ArrayRef,
+            ],
+        )?;
+        let right =
+            TestMemoryExec::try_new_exec(&[vec![right_batch]], right_schema, None)?;
+
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a1", "b1", "a2", "b1"]);
+        assert_batches_sorted_eq!(
+            [
+                "+----+----+----+----+",
+                "| a1 | b1 | a2 | b1 |",
+                "+----+----+----+----+",
+                "| 1  | 10 | 3  | 10 |",
+                "+----+----+----+----+",
+            ],
+            &batches
+        );
+
+        assert_phj_used(&metrics, use_perfect_hash_join_as_possible);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_phj_null_equals_null_build_have_nulls() -> Result<()> {
+        let task_ctx = prepare_task_ctx(8192, true);
+        let (left_schema, right_schema, on) = build_schema_and_on()?;
+
+        let left_batch = RecordBatch::try_new(
+            Arc::clone(&left_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![Some(10), Some(20), None])) as ArrayRef,
+            ],
+        )?;
+        let left = TestMemoryExec::try_new_exec(&[vec![left_batch]], left_schema, None)?;
+
+        let right_batch = RecordBatch::try_new(
+            Arc::clone(&right_schema),
+            vec![
+                Arc::new(Int32Array::from(vec![Some(3), Some(4)])) as ArrayRef,
+                Arc::new(Int32Array::from(vec![Some(10), Some(30)])) as ArrayRef,
+            ],
+        )?;
+        let right =
+            TestMemoryExec::try_new_exec(&[vec![right_batch]], right_schema, None)?;
+
+        let (columns, batches, metrics) = join_collect(
+            left,
+            right,
+            on,
+            &JoinType::Inner,
+            NullEquality::NullEqualsNull,
+            task_ctx,
+        )
+        .await?;
+
+        assert_eq!(columns, vec!["a1", "b1", "a2", "b1"]);
+        assert_batches_sorted_eq!(
+            [
+                "+----+----+----+----+",
+                "| a1 | b1 | a2 | b1 |",
+                "+----+----+----+----+",
+                "| 1  | 10 | 3  | 10 |",
+                "+----+----+----+----+",
+            ],
+            &batches
+        );
+
+        assert_phj_used(&metrics, false);
+
+        Ok(())
+    }
+
+    /// Test null-aware anti join when probe side (right) contains NULL
+    /// Expected: no rows should be output (NULL in subquery means all results are unknown)
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn test_null_aware_anti_join_probe_null(batch_size: usize) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, false);
+
+        // Build left table (rows to potentially output)
+        let left = build_table_two_cols(
+            ("c1", &vec![Some(1), Some(2), Some(3), Some(4)]),
+            ("dummy", &vec![Some(10), Some(20), Some(30), Some(40)]),
+        );
+
+        // Build right table (subquery with NULL)
+        let right = build_table_two_cols(
+            ("c2", &vec![Some(1), Some(2), Some(3), None]),
+            ("dummy", &vec![Some(100), Some(200), Some(300), Some(400)]),
+        );
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("c1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("c2", &right.schema())?) as _,
+        )];
+
+        // Create null-aware anti join
+        let join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::LeftAnti,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            true, // null_aware = true
+        )?;
+
+        let stream = join.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        // Expected: empty result (probe side has NULL, so no rows should be output)
+        allow_duplicates! {
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
+            ++
+            ++
+            ");
+        }
+        Ok(())
+    }
+
+    /// Test null-aware anti join when build side (left) contains NULL keys
+    /// Expected: rows with NULL keys should not be output
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn test_null_aware_anti_join_build_null(batch_size: usize) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, false);
+
+        // Build left table with NULL key (this row should not be output)
+        let left = build_table_two_cols(
+            ("c1", &vec![Some(1), Some(4), None]),
+            ("dummy", &vec![Some(10), Some(40), Some(0)]),
+        );
+
+        // Build right table (no NULL, so probe-side check passes)
+        let right = build_table_two_cols(
+            ("c2", &vec![Some(1), Some(2), Some(3)]),
+            ("dummy", &vec![Some(100), Some(200), Some(300)]),
+        );
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("c1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("c2", &right.schema())?) as _,
+        )];
+
+        // Create null-aware anti join
+        let join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::LeftAnti,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            true, // null_aware = true
+        )?;
+
+        let stream = join.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        // Expected: only c1=4 (not c1=1 which matches, not c1=NULL)
+        allow_duplicates! {
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
+            +----+-------+
+            | c1 | dummy |
+            +----+-------+
+            | 4  | 40    |
+            +----+-------+
+            ");
+        }
+        Ok(())
+    }
+
+    /// Test null-aware anti join with no NULLs (should work like regular anti join)
+    #[apply(hash_join_exec_configs)]
+    #[tokio::test]
+    async fn test_null_aware_anti_join_no_nulls(batch_size: usize) -> Result<()> {
+        let task_ctx = prepare_task_ctx(batch_size, false);
+
+        // Build left table (no NULLs)
+        let left = build_table_two_cols(
+            ("c1", &vec![Some(1), Some(2), Some(4), Some(5)]),
+            ("dummy", &vec![Some(10), Some(20), Some(40), Some(50)]),
+        );
+
+        // Build right table (no NULLs)
+        let right = build_table_two_cols(
+            ("c2", &vec![Some(1), Some(2), Some(3)]),
+            ("dummy", &vec![Some(100), Some(200), Some(300)]),
+        );
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("c1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("c2", &right.schema())?) as _,
+        )];
+
+        // Create null-aware anti join
+        let join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::LeftAnti,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            true, // null_aware = true
+        )?;
+
+        let stream = join.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+
+        // Expected: c1=4 and c1=5 (they don't match anything in right)
+        allow_duplicates! {
+            assert_snapshot!(batches_to_sort_string(&batches), @r"
+            +----+-------+
+            | c1 | dummy |
+            +----+-------+
+            | 4  | 40    |
+            | 5  | 50    |
+            +----+-------+
+            ");
+        }
+        Ok(())
+    }
+
+    /// Test that null_aware validation rejects non-LeftAnti join types
+    #[tokio::test]
+    async fn test_null_aware_validation_wrong_join_type() {
+        let left =
+            build_table_two_cols(("c1", &vec![Some(1)]), ("dummy", &vec![Some(10)]));
+        let right =
+            build_table_two_cols(("c2", &vec![Some(1)]), ("dummy", &vec![Some(100)]));
+
+        let on = vec![(
+            Arc::new(Column::new_with_schema("c1", &left.schema()).unwrap()) as _,
+            Arc::new(Column::new_with_schema("c2", &right.schema()).unwrap()) as _,
+        )];
+
+        // Try to create null-aware Inner join (should fail)
+        let result = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::Inner,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            true, // null_aware = true (invalid for Inner join)
+        );
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("null_aware can only be true for LeftAnti joins")
+        );
+    }
+
+    /// Test that null_aware validation rejects multi-column joins
+    #[tokio::test]
+    async fn test_null_aware_validation_multi_column() {
+        let left = build_table(("a", &vec![1]), ("b", &vec![2]), ("c", &vec![3]));
+        let right = build_table(("x", &vec![1]), ("y", &vec![2]), ("z", &vec![3]));
+
+        // Try multi-column join
+        let on = vec![
+            (
+                Arc::new(Column::new_with_schema("a", &left.schema()).unwrap()) as _,
+                Arc::new(Column::new_with_schema("x", &right.schema()).unwrap()) as _,
+            ),
+            (
+                Arc::new(Column::new_with_schema("b", &left.schema()).unwrap()) as _,
+                Arc::new(Column::new_with_schema("y", &right.schema()).unwrap()) as _,
+            ),
+        ];
+
+        // Try to create null-aware anti join with 2 columns (should fail)
+        let result = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            None,
+            &JoinType::LeftAnti,
+            None,
+            PartitionMode::CollectLeft,
+            NullEquality::NullEqualsNothing,
+            true, // null_aware = true (invalid for multi-column)
+        );
+
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("null_aware anti join only supports single column join key")
+        );
+    }
+
+    #[test]
+    fn test_lr_is_preserved() {
+        assert_eq!(lr_is_preserved(JoinType::Inner), (true, true));
+        assert_eq!(lr_is_preserved(JoinType::Left), (true, false));
+        assert_eq!(lr_is_preserved(JoinType::Right), (false, true));
+        assert_eq!(lr_is_preserved(JoinType::Full), (false, false));
+        assert_eq!(lr_is_preserved(JoinType::LeftSemi), (true, true));
+        assert_eq!(lr_is_preserved(JoinType::LeftAnti), (true, true));
+        assert_eq!(lr_is_preserved(JoinType::LeftMark), (true, false));
+        assert_eq!(lr_is_preserved(JoinType::RightSemi), (true, true));
+        assert_eq!(lr_is_preserved(JoinType::RightAnti), (true, true));
+        assert_eq!(lr_is_preserved(JoinType::RightMark), (false, true));
+    }
 }
diff --git a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
new file mode 100644
index 0000000000000..0ca338265ecc6
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for building InList expressions from hash join build side data
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, StructArray};
+use arrow::datatypes::{Field, FieldRef, Fields};
+use arrow_schema::DataType;
+use datafusion_common::Result;
+
+pub(super) fn build_struct_fields(data_types: &[DataType]) -> Result<Fields> {
+    data_types
+        .iter()
+        .enumerate()
+        .map(|(i, dt)| Ok(Field::new(format!("c{i}"), dt.clone(), true)))
+        .collect()
+}
+
+/// Builds InList values from join key column arrays.
+///
+/// If `join_key_arrays` is:
+/// 1. A single array, let's say Int32, this will produce a flat
+///    InList expression where the lookup is expected to be scalar Int32 values,
+///    that is: this will produce `IN LIST (1, 2, 3)` expected to be used as `2 IN LIST (1, 2, 3)`.
+/// 2. An Int32 array and a Utf8 array, this will produce a Struct InList expression
+///    where the lookup is expected to be Struct values with two fields (Int32, Utf8),
+///    that is: this will produce `IN LIST ((1, "a"), (2, "b"))` expected to be used as `(2, "b") IN LIST ((1, "a"), (2, "b"))`.
+///    The field names of the struct are auto-generated as "c0", "c1", ... and should match the struct expression used in the join keys.
+///
+/// Note that this function does not deduplicate values - deduplication will happen later
+/// when building an InList expression from this array via `InListExpr::try_new_from_array`.
+///
+/// Returns `None` if the estimated size exceeds `max_size_bytes` or if the number of rows
+/// exceeds `max_distinct_values`.
+pub(super) fn build_struct_inlist_values(
+    join_key_arrays: &[ArrayRef],
+) -> Result<Option<ArrayRef>> {
+    // Build the source array/struct
+    let source_array: ArrayRef = if join_key_arrays.len() == 1 {
+        // Single column: use directly
+        Arc::clone(&join_key_arrays[0])
+    } else {
+        // Multi-column: build StructArray once from all columns
+        let fields = build_struct_fields(
+            &join_key_arrays
+                .iter()
+                .map(|arr| arr.data_type().clone())
+                .collect::<Vec<_>>(),
+        )?;
+
+        // Build field references with proper Arc wrapping
+        let arrays_with_fields: Vec<(FieldRef, ArrayRef)> = fields
+            .iter()
+            .cloned()
+            .zip(join_key_arrays.iter().cloned())
+            .collect();
+
+        Arc::new(StructArray::from(arrays_with_fields))
+    };
+
+    Ok(Some(source_array))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{
+        DictionaryArray, Int8Array, Int32Array, StringArray, StringDictionaryBuilder,
+    };
+    use arrow_schema::DataType;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_build_single_column_inlist_array() {
+        let array = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef;
+        let result = build_struct_inlist_values(std::slice::from_ref(&array))
+            .unwrap()
+            .unwrap();
+
+        assert!(array.eq(&result));
+    }
+
+    #[test]
+    fn test_build_multi_column_inlist() {
+        let array1 = Arc::new(Int32Array::from(vec![1, 2, 3, 2, 1])) as ArrayRef;
+        let array2 =
+            Arc::new(StringArray::from(vec!["a", "b", "c", "b", "a"])) as ArrayRef;
+
+        let result = build_struct_inlist_values(&[array1, array2])
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(
+            *result.data_type(),
+            DataType::Struct(
+                build_struct_fields(&[DataType::Int32, DataType::Utf8]).unwrap()
+            )
+        );
+    }
+
+    #[test]
+    fn test_build_multi_column_inlist_with_dictionary() {
+        let mut builder = StringDictionaryBuilder::<arrow::datatypes::Int8Type>::new();
+        builder.append_value("foo");
+        builder.append_value("foo");
+        builder.append_value("foo");
+        let dict_array = Arc::new(builder.finish()) as ArrayRef;
+
+        let int_array = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
+
+        let result = build_struct_inlist_values(&[dict_array, int_array])
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(
+            *result.data_type(),
+            DataType::Struct(
+                build_struct_fields(&[
+                    DataType::Dictionary(
+                        Box::new(DataType::Int8),
+                        Box::new(DataType::Utf8)
+                    ),
+                    DataType::Int32
+                ])
+                .unwrap()
+            )
+        );
+    }
+
+    #[test]
+    fn test_build_single_column_dictionary_inlist() {
+        let keys = Int8Array::from(vec![0i8, 0, 0]);
+        let values = Arc::new(StringArray::from(vec!["foo"]));
+        let dict_array = Arc::new(DictionaryArray::new(keys, values)) as ArrayRef;
+
+        let result = build_struct_inlist_values(std::slice::from_ref(&dict_array))
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(result.data_type(), dict_array.data_type());
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/hash_join/mod.rs b/datafusion/physical-plan/src/joins/hash_join/mod.rs
new file mode 100644
index 0000000000000..b915802ea4015
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/hash_join/mod.rs
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`HashJoinExec`] Partitioned Hash Join Operator
+
+pub use exec::{HashJoinExec, HashJoinExecBuilder};
+pub use partitioned_hash_eval::{HashExpr, HashTableLookupExpr, SeededRandomState};
+
+mod exec;
+mod inlist_builder;
+mod partitioned_hash_eval;
+mod shared_bounds;
+mod stream;
diff --git a/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs
new file mode 100644
index 0000000000000..f6c305fba621a
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/hash_join/partitioned_hash_eval.rs
@@ -0,0 +1,602 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Hash computation and hash table lookup expressions for dynamic filtering
+
+use std::{any::Any, fmt::Display, hash::Hash, sync::Arc};
+
+use arrow::{
+    array::{ArrayRef, UInt64Array},
+    datatypes::{DataType, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion_common::Result;
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::hash_utils::{create_hashes, with_hashes};
+use datafusion_expr::ColumnarValue;
+use datafusion_physical_expr_common::physical_expr::{
+    DynHash, PhysicalExpr, PhysicalExprRef,
+};
+
+use crate::joins::Map;
+
+/// RandomState wrapper that preserves the seed used to create it.
+///
+/// This is needed because `RandomState` doesn't expose its seed after creation,
+/// but we need them for serialization (e.g., protobuf serde).
+#[derive(Clone, Debug)]
+pub struct SeededRandomState {
+    random_state: RandomState,
+    seed: u64,
+}
+
+impl SeededRandomState {
+    /// Create a new SeededRandomState with the given seed.
+    pub const fn with_seed(k: u64) -> Self {
+        Self {
+            random_state: RandomState::with_seed(k),
+            seed: k,
+        }
+    }
+
+    /// Get the inner RandomState.
+    pub fn random_state(&self) -> &RandomState {
+        &self.random_state
+    }
+
+    /// Get the seed used to create this RandomState.
+    pub fn seed(&self) -> u64 {
+        self.seed
+    }
+}
+
+/// Physical expression that computes hash values for a set of columns
+///
+/// This expression computes the hash of join key columns using a specific RandomState.
+/// It returns a UInt64Array containing the hash values.
+///
+/// This is used for:
+/// - Computing routing hashes (with RepartitionExec's 0,0,0,0 seeds)
+/// - Computing lookup hashes (with HashJoin's 'J','O','I','N' seeds)
+pub struct HashExpr {
+    /// Columns to hash
+    on_columns: Vec<PhysicalExprRef>,
+    /// Random state for hashing (with seeds preserved for serialization)
+    random_state: SeededRandomState,
+    /// Description for display
+    description: String,
+}
+
+impl HashExpr {
+    /// Create a new HashExpr
+    ///
+    /// # Arguments
+    /// * `on_columns` - Columns to hash
+    /// * `random_state` - SeededRandomState for hashing
+    /// * `description` - Description for debugging (e.g., "hash_repartition", "hash_join")
+    pub fn new(
+        on_columns: Vec<PhysicalExprRef>,
+        random_state: SeededRandomState,
+        description: String,
+    ) -> Self {
+        Self {
+            on_columns,
+            random_state,
+            description,
+        }
+    }
+
+    /// Get the columns being hashed.
+    pub fn on_columns(&self) -> &[PhysicalExprRef] {
+        &self.on_columns
+    }
+
+    /// Get the seed used for hashing.
+    pub fn seed(&self) -> u64 {
+        self.random_state.seed()
+    }
+
+    /// Get the description.
+    pub fn description(&self) -> &str {
+        &self.description
+    }
+}
+
+impl std::fmt::Debug for HashExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let cols = self
+            .on_columns
+            .iter()
+            .map(|e| e.to_string())
+            .collect::<Vec<_>>()
+            .join(", ");
+        let seed = self.seed();
+        write!(f, "{}({cols}, [{seed}])", self.description)
+    }
+}
+
+impl Hash for HashExpr {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.on_columns.dyn_hash(state);
+        self.description.hash(state);
+        self.seed().hash(state);
+    }
+}
+
+impl PartialEq for HashExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.on_columns == other.on_columns
+            && self.description == other.description
+            && self.seed() == other.seed()
+    }
+}
+
+impl Eq for HashExpr {}
+
+impl Display for HashExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.description)
+    }
+}
+
+impl PhysicalExpr for HashExpr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        self.on_columns.iter().collect()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(Arc::new(HashExpr::new(
+            children,
+            self.random_state.clone(),
+            self.description.clone(),
+        )))
+    }
+
+    fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
+        Ok(DataType::UInt64)
+    }
+
+    fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+        Ok(false)
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        let num_rows = batch.num_rows();
+
+        // Evaluate columns
+        let keys_values = evaluate_columns(&self.on_columns, batch)?;
+
+        // Compute hashes
+        let mut hashes_buffer = vec![0; num_rows];
+        create_hashes(
+            &keys_values,
+            self.random_state.random_state(),
+            &mut hashes_buffer,
+        )?;
+
+        Ok(ColumnarValue::Array(Arc::new(UInt64Array::from(
+            hashes_buffer,
+        ))))
+    }
+
+    fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.description)
+    }
+}
+
+/// Physical expression that checks join keys in a [`Map`] (hash table or array map).
+///
+/// Returns a [`BooleanArray`](arrow::array::BooleanArray) indicating if join keys (from `on_columns`) exist in the map.
+// TODO: rename to MapLookupExpr
+pub struct HashTableLookupExpr {
+    /// Columns in the ON clause used to compute the join key for lookups
+    on_columns: Vec<PhysicalExprRef>,
+    /// Random state for hashing (with seeds preserved for serialization)
+    random_state: SeededRandomState,
+    /// Map to check against (hash table or array map)
+    map: Arc<Map>,
+    /// Description for display
+    description: String,
+}
+
+impl HashTableLookupExpr {
+    /// Create a new HashTableLookupExpr
+    ///
+    /// # Arguments
+    /// * `on_columns` - Columns in the ON clause used to compute the join key
+    /// * `random_state` - SeededRandomState for hashing
+    /// * `map` - Map to check membership (hash table or array map)
+    /// * `description` - Description for debugging
+    /// # Note
+    /// This is public for internal testing purposes only and is not
+    /// guaranteed to be stable across versions.
+    pub fn new(
+        on_columns: Vec<PhysicalExprRef>,
+        random_state: SeededRandomState,
+        map: Arc<Map>,
+        description: String,
+    ) -> Self {
+        Self {
+            on_columns,
+            random_state,
+            map,
+            description,
+        }
+    }
+}
+
+impl std::fmt::Debug for HashTableLookupExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let cols = self
+            .on_columns
+            .iter()
+            .map(|e| e.to_string())
+            .collect::<Vec<_>>()
+            .join(", ");
+        let seed = self.random_state.seed();
+        write!(f, "{}({cols}, [{seed}])", self.description)
+    }
+}
+
+impl Hash for HashTableLookupExpr {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.on_columns.dyn_hash(state);
+        self.description.hash(state);
+        self.random_state.seed().hash(state);
+        // Note that we compare hash_map by pointer equality.
+        // Actually comparing the contents of the hash maps would be expensive.
+        // The way these hash maps are used in actuality is that HashJoinExec creates
+        // one per partition per query execution, thus it is never possible for two different
+        // hash maps to have the same content in practice.
+        // Theoretically this is a public API and users could create identical hash maps,
+        // but that seems unlikely and not worth paying the cost of deep comparison all the time.
+        Arc::as_ptr(&self.map).hash(state);
+    }
+}
+
+impl PartialEq for HashTableLookupExpr {
+    fn eq(&self, other: &Self) -> bool {
+        // Note that we compare hash_map by pointer equality.
+        // Actually comparing the contents of the hash maps would be expensive.
+        // The way these hash maps are used in actuality is that HashJoinExec creates
+        // one per partition per query execution, thus it is never possible for two different
+        // hash maps to have the same content in practice.
+        // Theoretically this is a public API and users could create identical hash maps,
+        // but that seems unlikely and not worth paying the cost of deep comparison all the time.
+        self.on_columns == other.on_columns
+            && self.description == other.description
+            && self.random_state.seed() == other.random_state.seed()
+            && Arc::ptr_eq(&self.map, &other.map)
+    }
+}
+
+impl Eq for HashTableLookupExpr {}
+
+impl Display for HashTableLookupExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.description)
+    }
+}
+
+impl PhysicalExpr for HashTableLookupExpr {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        self.on_columns.iter().collect()
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(Arc::new(HashTableLookupExpr::new(
+            children,
+            self.random_state.clone(),
+            Arc::clone(&self.map),
+            self.description.clone(),
+        )))
+    }
+
+    fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
+        Ok(DataType::Boolean)
+    }
+
+    fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+        Ok(false)
+    }
+
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        // Evaluate columns
+        let join_keys = evaluate_columns(&self.on_columns, batch)?;
+
+        match self.map.as_ref() {
+            Map::HashMap(map) => {
+                with_hashes(&join_keys, self.random_state.random_state(), |hashes| {
+                    let array = map.contain_hashes(hashes);
+                    Ok(ColumnarValue::Array(Arc::new(array)))
+                })
+            }
+            Map::ArrayMap(map) => {
+                let array = map.contain_keys(&join_keys)?;
+                Ok(ColumnarValue::Array(Arc::new(array)))
+            }
+        }
+    }
+
+    fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.description)
+    }
+}
+
+fn evaluate_columns(
+    columns: &[PhysicalExprRef],
+    batch: &RecordBatch,
+) -> Result<Vec<ArrayRef>> {
+    let num_rows = batch.num_rows();
+    columns
+        .iter()
+        .map(|c| c.evaluate(batch)?.into_array(num_rows))
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::joins::join_hash_map::JoinHashMapU32;
+    use datafusion_physical_expr::expressions::Column;
+    use std::collections::hash_map::DefaultHasher;
+    use std::hash::Hasher;
+
+    fn compute_hash<T: Hash>(value: &T) -> u64 {
+        let mut hasher = DefaultHasher::new();
+        value.hash(&mut hasher);
+        hasher.finish()
+    }
+
+    #[test]
+    fn test_hash_expr_eq_same() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let col_b: PhysicalExprRef = Arc::new(Column::new("b", 1));
+
+        let expr1 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        let expr2 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        assert_eq!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_expr_eq_different_columns() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let col_b: PhysicalExprRef = Arc::new(Column::new("b", 1));
+        let col_c: PhysicalExprRef = Arc::new(Column::new("c", 2));
+
+        let expr1 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        let expr2 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_c)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_expr_eq_different_description() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+
+        let expr1 = HashExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            "hash_one".to_string(),
+        );
+
+        let expr2 = HashExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            "hash_two".to_string(),
+        );
+
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_expr_eq_different_seeds() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+
+        let expr1 = HashExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        let expr2 = HashExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(5),
+            "test_hash".to_string(),
+        );
+
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_expr_hash_consistency() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let col_b: PhysicalExprRef = Arc::new(Column::new("b", 1));
+
+        let expr1 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        let expr2 = HashExpr::new(
+            vec![Arc::clone(&col_a), Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            "test_hash".to_string(),
+        );
+
+        // Equal expressions should have equal hashes
+        assert_eq!(expr1, expr2);
+        assert_eq!(compute_hash(&expr1), compute_hash(&expr2));
+    }
+
+    #[test]
+    fn test_hash_table_lookup_expr_eq_same() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let hash_map =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+
+        let expr1 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        let expr2 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        assert_eq!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_table_lookup_expr_eq_different_columns() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let col_b: PhysicalExprRef = Arc::new(Column::new("b", 1));
+
+        let hash_map =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+
+        let expr1 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        let expr2 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_b)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_table_lookup_expr_eq_different_description() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let hash_map =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+
+        let expr1 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup_one".to_string(),
+        );
+
+        let expr2 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup_two".to_string(),
+        );
+
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_table_lookup_expr_eq_different_hash_map() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+
+        // Two different Arc pointers (even with same content) should not be equal
+        let hash_map1 =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+        let hash_map2 =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+        let expr1 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            hash_map1,
+            "lookup".to_string(),
+        );
+
+        let expr2 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            hash_map2,
+            "lookup".to_string(),
+        );
+
+        // Different Arc pointers means not equal (uses Arc::ptr_eq)
+        assert_ne!(expr1, expr2);
+    }
+
+    #[test]
+    fn test_hash_table_lookup_expr_hash_consistency() {
+        let col_a: PhysicalExprRef = Arc::new(Column::new("a", 0));
+        let hash_map =
+            Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(10))));
+
+        let expr1 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        let expr2 = HashTableLookupExpr::new(
+            vec![Arc::clone(&col_a)],
+            SeededRandomState::with_seed(1),
+            Arc::clone(&hash_map),
+            "lookup".to_string(),
+        );
+
+        // Equal expressions should have equal hashes
+        assert_eq!(expr1, expr2);
+        assert_eq!(compute_hash(&expr1), compute_hash(&expr2));
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs
new file mode 100644
index 0000000000000..f32dc7fa80268
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs
@@ -0,0 +1,594 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for shared build-side information. Used in dynamic filter pushdown in Hash Joins.
+// TODO: include the link to the Dynamic Filter blog post.
+
+use std::fmt;
+use std::sync::Arc;
+
+use crate::ExecutionPlan;
+use crate::ExecutionPlanProperties;
+use crate::joins::Map;
+use crate::joins::PartitionMode;
+use crate::joins::hash_join::exec::HASH_JOIN_SEED;
+use crate::joins::hash_join::inlist_builder::build_struct_fields;
+use crate::joins::hash_join::partitioned_hash_eval::{
+    HashExpr, HashTableLookupExpr, SeededRandomState,
+};
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::Operator;
+use datafusion_functions::core::r#struct as struct_func;
+use datafusion_physical_expr::expressions::{
+    BinaryExpr, CaseExpr, DynamicFilterPhysicalExpr, InListExpr, lit,
+};
+use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef, ScalarFunctionExpr};
+
+use parking_lot::Mutex;
+use tokio::sync::Barrier;
+
+/// Represents the minimum and maximum values for a specific column.
+/// Used in dynamic filter pushdown to establish value boundaries.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct ColumnBounds {
+    /// The minimum value observed for this column
+    pub(crate) min: ScalarValue,
+    /// The maximum value observed for this column  
+    pub(crate) max: ScalarValue,
+}
+
+impl ColumnBounds {
+    pub(crate) fn new(min: ScalarValue, max: ScalarValue) -> Self {
+        Self { min, max }
+    }
+}
+
+/// Represents the bounds for all join key columns from a single partition.
+/// This contains the min/max values computed from one partition's build-side data.
+#[derive(Debug, Clone)]
+pub(crate) struct PartitionBounds {
+    /// Min/max bounds for each join key column in this partition.
+    /// Index corresponds to the join key expression index.
+    column_bounds: Vec<ColumnBounds>,
+}
+
+impl PartitionBounds {
+    pub(crate) fn new(column_bounds: Vec<ColumnBounds>) -> Self {
+        Self { column_bounds }
+    }
+
+    pub(crate) fn get_column_bounds(&self, index: usize) -> Option<&ColumnBounds> {
+        self.column_bounds.get(index)
+    }
+}
+
+/// Creates a membership predicate for filter pushdown.
+///
+/// If `inlist_values` is provided (for small build sides), creates an InList expression.
+/// Otherwise, creates a HashTableLookup expression (for large build sides).
+///
+/// Supports both single-column and multi-column joins using struct expressions.
+fn create_membership_predicate(
+    on_right: &[PhysicalExprRef],
+    pushdown: PushdownStrategy,
+    random_state: &SeededRandomState,
+    schema: &Schema,
+) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+    match pushdown {
+        // Use InList expression for small build sides
+        PushdownStrategy::InList(in_list_array) => {
+            // Build the expression to compare against
+            let expr = if on_right.len() == 1 {
+                // Single column: col IN (val1, val2, ...)
+                Arc::clone(&on_right[0])
+            } else {
+                let fields = build_struct_fields(
+                    on_right
+                        .iter()
+                        .map(|r| r.data_type(schema))
+                        .collect::<Result<Vec<_>>>()?
+                        .as_ref(),
+                )?;
+
+                // The return field name and the function field name don't really matter here.
+                let return_field =
+                    Arc::new(Field::new("struct", DataType::Struct(fields), true));
+
+                Arc::new(ScalarFunctionExpr::new(
+                    "struct",
+                    struct_func(),
+                    on_right.to_vec(),
+                    return_field,
+                    Arc::new(ConfigOptions::default()),
+                )) as Arc<dyn PhysicalExpr>
+            };
+
+            // Use in_list_from_array() helper to create InList with static_filter optimization (hash-based lookup)
+            Ok(Some(Arc::new(InListExpr::try_new_from_array(
+                expr,
+                in_list_array,
+                false,
+            )?)))
+        }
+        // Use hash table lookup for large build sides
+        PushdownStrategy::Map(hash_map) => Ok(Some(Arc::new(HashTableLookupExpr::new(
+            on_right.to_vec(),
+            random_state.clone(),
+            hash_map,
+            "hash_lookup".to_string(),
+        )) as Arc<dyn PhysicalExpr>)),
+        // Empty partition - should not create a filter for this
+        PushdownStrategy::Empty => Ok(None),
+    }
+}
+
+/// Creates a bounds predicate from partition bounds.
+///
+/// Returns `None` if no column bounds are available.
+/// Returns a combined predicate (col >= min AND col <= max) for all columns with bounds.
+fn create_bounds_predicate(
+    on_right: &[PhysicalExprRef],
+    bounds: &PartitionBounds,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    let mut column_predicates = Vec::new();
+
+    for (col_idx, right_expr) in on_right.iter().enumerate() {
+        if let Some(column_bounds) = bounds.get_column_bounds(col_idx) {
+            // Create predicate: col >= min AND col <= max
+            let min_expr = Arc::new(BinaryExpr::new(
+                Arc::clone(right_expr),
+                Operator::GtEq,
+                lit(column_bounds.min.clone()),
+            )) as Arc<dyn PhysicalExpr>;
+            let max_expr = Arc::new(BinaryExpr::new(
+                Arc::clone(right_expr),
+                Operator::LtEq,
+                lit(column_bounds.max.clone()),
+            )) as Arc<dyn PhysicalExpr>;
+            let range_expr = Arc::new(BinaryExpr::new(min_expr, Operator::And, max_expr))
+                as Arc<dyn PhysicalExpr>;
+            column_predicates.push(range_expr);
+        }
+    }
+
+    if column_predicates.is_empty() {
+        None
+    } else {
+        Some(
+            column_predicates
+                .into_iter()
+                .reduce(|acc, pred| {
+                    Arc::new(BinaryExpr::new(acc, Operator::And, pred))
+                        as Arc<dyn PhysicalExpr>
+                })
+                .unwrap(),
+        )
+    }
+}
+
+/// Coordinates build-side information collection across multiple partitions
+///
+/// This structure collects information from the build side (hash tables and/or bounds) and
+/// ensures that dynamic filters are built with complete information from all relevant
+/// partitions before being applied to probe-side scans. Incomplete filters would
+/// incorrectly eliminate valid join results.
+///
+/// ## Synchronization Strategy
+///
+/// 1. Each partition computes information from its build-side data (hash maps and/or bounds)
+/// 2. Information is stored in the shared state
+/// 3. A barrier tracks how many partitions have reported
+/// 4. When the last partition reports, information is merged and the filter is updated exactly once
+///
+/// ## Hash Map vs Bounds
+///
+/// - **Hash Maps (Partitioned mode)**: Collects Arc references to hash tables from each partition.
+///   Creates a `PartitionedHashLookupPhysicalExpr` that routes rows to the correct partition's hash table.
+/// - **Bounds (CollectLeft mode)**: Collects min/max bounds and creates range predicates.
+///
+/// ## Partition Counting
+///
+/// The `total_partitions` count represents how many times `collect_build_side` will be called:
+/// - **CollectLeft**: Number of output partitions (each accesses shared build data)
+/// - **Partitioned**: Number of input partitions (each builds independently)
+///
+/// ## Thread Safety
+///
+/// All fields use a single mutex to ensure correct coordination between concurrent
+/// partition executions.
+pub(crate) struct SharedBuildAccumulator {
+    /// Build-side data protected by a single mutex to avoid ordering concerns
+    inner: Mutex<AccumulatedBuildData>,
+    barrier: Barrier,
+    /// Dynamic filter for pushdown to probe side
+    dynamic_filter: Arc<DynamicFilterPhysicalExpr>,
+    /// Right side join expressions needed for creating filter expressions
+    on_right: Vec<PhysicalExprRef>,
+    /// Random state for partitioning (RepartitionExec's hash function with 0,0,0,0 seeds)
+    /// Used for PartitionedHashLookupPhysicalExpr
+    repartition_random_state: SeededRandomState,
+    /// Schema of the probe (right) side for evaluating filter expressions
+    probe_schema: Arc<Schema>,
+}
+
+/// Strategy for filter pushdown (decided at collection time)
+#[derive(Clone)]
+pub(crate) enum PushdownStrategy {
+    /// Use InList for small build sides (< 128MB)
+    InList(ArrayRef),
+    /// Use map lookup for large build sides
+    Map(Arc<Map>),
+    /// There was no data in this partition, do not build a dynamic filter for it
+    Empty,
+}
+
+/// Build-side data reported by a single partition
+pub(crate) enum PartitionBuildData {
+    Partitioned {
+        partition_id: usize,
+        pushdown: PushdownStrategy,
+        bounds: PartitionBounds,
+    },
+    CollectLeft {
+        pushdown: PushdownStrategy,
+        bounds: PartitionBounds,
+    },
+}
+
+/// Per-partition accumulated data (Partitioned mode)
+#[derive(Clone)]
+struct PartitionData {
+    bounds: PartitionBounds,
+    pushdown: PushdownStrategy,
+}
+
+/// Build-side data organized by partition mode
+enum AccumulatedBuildData {
+    Partitioned {
+        partitions: Vec<Option<PartitionData>>,
+    },
+    CollectLeft {
+        data: Option<PartitionData>,
+    },
+}
+
+impl SharedBuildAccumulator {
+    /// Creates a new SharedBuildAccumulator configured for the given partition mode
+    ///
+    /// This method calculates how many times `collect_build_side` will be called based on the
+    /// partition mode's execution pattern. This count is critical for determining when we have
+    /// complete information from all partitions to build the dynamic filter.
+    ///
+    /// ## Partition Mode Execution Patterns
+    ///
+    /// - **CollectLeft**: Build side is collected ONCE from partition 0 and shared via `OnceFut`
+    ///   across all output partitions. Each output partition calls `collect_build_side` to access the shared build data.
+    ///   Although this results in multiple invocations, the  `report_partition_bounds` function contains deduplication logic to handle them safely.
+    ///   Expected calls = number of output partitions.
+    ///
+    ///
+    /// - **Partitioned**: Each partition independently builds its own hash table by calling
+    ///   `collect_build_side` once. Expected calls = number of build partitions.
+    ///
+    /// - **Auto**: Placeholder mode resolved during optimization. Uses 1 as safe default since
+    ///   the actual mode will be determined and a new accumulator created before execution.
+    ///
+    /// ## Why This Matters
+    ///
+    /// We cannot build a partial filter from some partitions - it would incorrectly eliminate
+    /// valid join results. We must wait until we have complete information from ALL
+    /// relevant partitions before updating the dynamic filter.
+    pub(crate) fn new_from_partition_mode(
+        partition_mode: PartitionMode,
+        left_child: &dyn ExecutionPlan,
+        right_child: &dyn ExecutionPlan,
+        dynamic_filter: Arc<DynamicFilterPhysicalExpr>,
+        on_right: Vec<PhysicalExprRef>,
+        repartition_random_state: SeededRandomState,
+    ) -> Self {
+        // Troubleshooting: If partition counts are incorrect, verify this logic matches
+        // the actual execution pattern in collect_build_side()
+        let expected_calls = match partition_mode {
+            // Each output partition accesses shared build data
+            PartitionMode::CollectLeft => {
+                right_child.output_partitioning().partition_count()
+            }
+            // Each partition builds its own data
+            PartitionMode::Partitioned => {
+                left_child.output_partitioning().partition_count()
+            }
+            // Default value, will be resolved during optimization (does not exist once `execute()` is called; will be replaced by one of the other two)
+            PartitionMode::Auto => unreachable!(
+                "PartitionMode::Auto should not be present at execution time. This is a bug in DataFusion, please report it!"
+            ),
+        };
+
+        let mode_data = match partition_mode {
+            PartitionMode::Partitioned => AccumulatedBuildData::Partitioned {
+                partitions: vec![
+                    None;
+                    left_child.output_partitioning().partition_count()
+                ],
+            },
+            PartitionMode::CollectLeft => {
+                AccumulatedBuildData::CollectLeft { data: None }
+            }
+            PartitionMode::Auto => unreachable!(
+                "PartitionMode::Auto should not be present at execution time. This is a bug in DataFusion, please report it!"
+            ),
+        };
+
+        Self {
+            inner: Mutex::new(mode_data),
+            barrier: Barrier::new(expected_calls),
+            dynamic_filter,
+            on_right,
+            repartition_random_state,
+            probe_schema: right_child.schema(),
+        }
+    }
+
+    /// Report build-side data from a partition
+    ///
+    /// This unified method handles both CollectLeft and Partitioned modes. When all partitions
+    /// have reported (barrier wait), the leader builds the appropriate filter expression:
+    /// - CollectLeft: Simple conjunction of bounds and membership check
+    /// - Partitioned: CASE expression routing to per-partition filters
+    ///
+    /// # Arguments
+    /// * `data` - Build data including hash map, pushdown strategy, and bounds
+    ///
+    /// # Returns
+    /// * `Result<()>` - Ok if successful, Err if filter update failed or mode mismatch
+    pub(crate) async fn report_build_data(&self, data: PartitionBuildData) -> Result<()> {
+        // Store data in the accumulator
+        {
+            let mut guard = self.inner.lock();
+
+            match (data, &mut *guard) {
+                // Partitioned mode
+                (
+                    PartitionBuildData::Partitioned {
+                        partition_id,
+                        pushdown,
+                        bounds,
+                    },
+                    AccumulatedBuildData::Partitioned { partitions },
+                ) => {
+                    partitions[partition_id] = Some(PartitionData { pushdown, bounds });
+                }
+                // CollectLeft mode (store once, deduplicate across partitions)
+                (
+                    PartitionBuildData::CollectLeft { pushdown, bounds },
+                    AccumulatedBuildData::CollectLeft { data },
+                ) => {
+                    // Deduplicate - all partitions report the same data in CollectLeft
+                    if data.is_none() {
+                        *data = Some(PartitionData { pushdown, bounds });
+                    }
+                }
+                // Mismatched modes - should never happen
+                _ => {
+                    return datafusion_common::internal_err!(
+                        "Build data mode mismatch in report_build_data"
+                    );
+                }
+            }
+        }
+
+        // Wait for all partitions to report
+        if self.barrier.wait().await.is_leader() {
+            // All partitions have reported, so we can create and update the filter
+            let inner = self.inner.lock();
+
+            match &*inner {
+                // CollectLeft: Simple conjunction of bounds and membership check
+                AccumulatedBuildData::CollectLeft { data } => {
+                    if let Some(partition_data) = data {
+                        // Create membership predicate (InList for small build sides, hash lookup otherwise)
+                        let membership_expr = create_membership_predicate(
+                            &self.on_right,
+                            partition_data.pushdown.clone(),
+                            &HASH_JOIN_SEED,
+                            self.probe_schema.as_ref(),
+                        )?;
+
+                        // Create bounds check expression (if bounds available)
+                        let bounds_expr = create_bounds_predicate(
+                            &self.on_right,
+                            &partition_data.bounds,
+                        );
+
+                        // Combine membership and bounds expressions for multi-layer optimization:
+                        // - Bounds (min/max): Enable statistics-based pruning (Parquet row group/file skipping)
+                        // - Membership (InList/hash lookup): Enables:
+                        //   * Precise filtering (exact value matching)
+                        //   * Bloom filter utilization (if present in Parquet files)
+                        //   * Better pruning for data types where min/max isn't effective (e.g., UUIDs)
+                        // Together, they provide complementary benefits and maximize data skipping.
+                        // Only update the filter if we have something to push down
+                        if let Some(filter_expr) = match (membership_expr, bounds_expr) {
+                            (Some(membership), Some(bounds)) => {
+                                // Both available: combine with AND
+                                Some(Arc::new(BinaryExpr::new(
+                                    bounds,
+                                    Operator::And,
+                                    membership,
+                                ))
+                                    as Arc<dyn PhysicalExpr>)
+                            }
+                            (Some(membership), None) => {
+                                // Membership available but no bounds
+                                // This is reachable when we have data but bounds aren't available
+                                // (e.g., unsupported data types or no columns with bounds)
+                                Some(membership)
+                            }
+                            (None, Some(bounds)) => {
+                                // Bounds available but no membership.
+                                // This should be unreachable in practice: we can always push down a reference
+                                // to the hash table.
+                                // But it seems safer to handle it defensively.
+                                Some(bounds)
+                            }
+                            (None, None) => {
+                                // No filter available (e.g., empty build side)
+                                // Don't update the filter, but continue to mark complete
+                                None
+                            }
+                        } {
+                            self.dynamic_filter.update(filter_expr)?;
+                        }
+                    }
+                }
+                // Partitioned: CASE expression routing to per-partition filters
+                AccumulatedBuildData::Partitioned { partitions } => {
+                    // Collect all partition data (should all be Some at this point)
+                    let partition_data: Vec<_> =
+                        partitions.iter().filter_map(|p| p.as_ref()).collect();
+
+                    if !partition_data.is_empty() {
+                        // Build a CASE expression that combines range checks AND membership checks
+                        // CASE (hash_repartition(join_keys) % num_partitions)
+                        //   WHEN 0 THEN (col >= min_0 AND col <= max_0 AND ...) AND membership_check_0
+                        //   WHEN 1 THEN (col >= min_1 AND col <= max_1 AND ...) AND membership_check_1
+                        //   ...
+                        //   ELSE false
+                        // END
+
+                        let num_partitions = partition_data.len();
+
+                        // Create base expression: hash_repartition(join_keys) % num_partitions
+                        let routing_hash_expr = Arc::new(HashExpr::new(
+                            self.on_right.clone(),
+                            self.repartition_random_state.clone(),
+                            "hash_repartition".to_string(),
+                        ))
+                            as Arc<dyn PhysicalExpr>;
+
+                        let modulo_expr = Arc::new(BinaryExpr::new(
+                            routing_hash_expr,
+                            Operator::Modulo,
+                            lit(ScalarValue::UInt64(Some(num_partitions as u64))),
+                        ))
+                            as Arc<dyn PhysicalExpr>;
+
+                        // Create WHEN branches for each partition
+                        let when_then_branches: Vec<(
+                            Arc<dyn PhysicalExpr>,
+                            Arc<dyn PhysicalExpr>,
+                        )> = partitions
+                            .iter()
+                            .enumerate()
+                            .filter_map(|(partition_id, partition_opt)| {
+                                partition_opt.as_ref().and_then(|partition| {
+                                    // Skip empty partitions - they would always return false anyway
+                                    match &partition.pushdown {
+                                        PushdownStrategy::Empty => None,
+                                        _ => Some((partition_id, partition)),
+                                    }
+                                })
+                            })
+                            .map(|(partition_id, partition)| -> Result<_> {
+                                // WHEN partition_id
+                                let when_expr =
+                                    lit(ScalarValue::UInt64(Some(partition_id as u64)));
+
+                                // THEN: Combine bounds check AND membership predicate
+
+                                // 1. Create membership predicate (InList for small build sides, hash lookup otherwise)
+                                let membership_expr = create_membership_predicate(
+                                    &self.on_right,
+                                    partition.pushdown.clone(),
+                                    &HASH_JOIN_SEED,
+                                    self.probe_schema.as_ref(),
+                                )?;
+
+                                // 2. Create bounds check expression for this partition (if bounds available)
+                                let bounds_expr = create_bounds_predicate(
+                                    &self.on_right,
+                                    &partition.bounds,
+                                );
+
+                                // 3. Combine membership and bounds expressions
+                                let then_expr = match (membership_expr, bounds_expr) {
+                                    (Some(membership), Some(bounds)) => {
+                                        // Both available: combine with AND
+                                        Arc::new(BinaryExpr::new(
+                                            bounds,
+                                            Operator::And,
+                                            membership,
+                                        ))
+                                            as Arc<dyn PhysicalExpr>
+                                    }
+                                    (Some(membership), None) => {
+                                        // Membership available but no bounds (e.g., unsupported data types)
+                                        membership
+                                    }
+                                    (None, Some(bounds)) => {
+                                        // Bounds available but no membership.
+                                        // This should be unreachable in practice: we can always push down a reference
+                                        // to the hash table.
+                                        // But it seems safer to handle it defensively.
+                                        bounds
+                                    }
+                                    (None, None) => {
+                                        // No filter for this partition - should not happen due to filter_map above
+                                        // but handle defensively by returning a "true" literal
+                                        lit(true)
+                                    }
+                                };
+
+                                Ok((when_expr, then_expr))
+                            })
+                            .collect::<Result<Vec<_>>>()?;
+
+                        // Optimize for single partition: skip CASE expression entirely
+                        let filter_expr = if when_then_branches.is_empty() {
+                            // All partitions are empty: no rows can match
+                            lit(false)
+                        } else if when_then_branches.len() == 1 {
+                            // Single partition: just use the condition directly
+                            // since hash % 1 == 0 always, the WHEN 0 branch will always match
+                            Arc::clone(&when_then_branches[0].1)
+                        } else {
+                            // Multiple partitions: create CASE expression
+                            Arc::new(CaseExpr::try_new(
+                                Some(modulo_expr),
+                                when_then_branches,
+                                Some(lit(false)), // ELSE false
+                            )?) as Arc<dyn PhysicalExpr>
+                        };
+
+                        self.dynamic_filter.update(filter_expr)?;
+                    }
+                }
+            }
+            self.dynamic_filter.mark_complete();
+        }
+
+        Ok(())
+    }
+}
+
+impl fmt::Debug for SharedBuildAccumulator {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "SharedBuildAccumulator")
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs
new file mode 100644
index 0000000000000..ab630920184d3
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs
@@ -0,0 +1,927 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Stream implementation for Hash Join
+//!
+//! This module implements [`HashJoinStream`], the streaming engine for
+//! [`super::HashJoinExec`]. See comments in [`HashJoinStream`] for more details.
+
+use std::sync::Arc;
+use std::sync::atomic::Ordering;
+use std::task::Poll;
+
+use crate::coalesce::{LimitedBatchCoalescer, PushBatchStatus};
+use crate::joins::Map;
+use crate::joins::MapOffset;
+use crate::joins::PartitionMode;
+use crate::joins::hash_join::exec::JoinLeftData;
+use crate::joins::hash_join::shared_bounds::{
+    PartitionBounds, PartitionBuildData, SharedBuildAccumulator,
+};
+use crate::joins::utils::{
+    OnceFut, equal_rows_arr, get_final_indices_from_shared_bitmap,
+};
+use crate::{
+    RecordBatchStream, SendableRecordBatchStream, handle_state,
+    hash_utils::create_hashes,
+    joins::utils::{
+        BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMapType,
+        StatefulStreamResult, adjust_indices_by_join_type, apply_join_filter_to_indices,
+        build_batch_empty_build_side, build_batch_from_indices,
+        need_produce_result_in_final,
+    },
+};
+
+use arrow::array::{Array, ArrayRef, UInt32Array, UInt64Array};
+use arrow::datatypes::{Schema, SchemaRef};
+use arrow::record_batch::RecordBatch;
+use datafusion_common::{
+    JoinSide, JoinType, NullEquality, Result, internal_datafusion_err, internal_err,
+};
+use datafusion_physical_expr::PhysicalExprRef;
+
+use datafusion_common::hash_utils::RandomState;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
+use futures::{Stream, StreamExt, ready};
+
+/// Represents build-side of hash join.
+pub(super) enum BuildSide {
+    /// Indicates that build-side not collected yet
+    Initial(BuildSideInitialState),
+    /// Indicates that build-side data has been collected
+    Ready(BuildSideReadyState),
+}
+
+/// Container for BuildSide::Initial related data
+pub(super) struct BuildSideInitialState {
+    /// Future for building hash table from build-side input
+    pub(super) left_fut: OnceFut<JoinLeftData>,
+}
+
+/// Container for BuildSide::Ready related data
+pub(super) struct BuildSideReadyState {
+    /// Collected build-side data
+    left_data: Arc<JoinLeftData>,
+}
+
+impl BuildSide {
+    /// Tries to extract BuildSideInitialState from BuildSide enum.
+    /// Returns an error if state is not Initial.
+    fn try_as_initial_mut(&mut self) -> Result<&mut BuildSideInitialState> {
+        match self {
+            BuildSide::Initial(state) => Ok(state),
+            _ => internal_err!("Expected build side in initial state"),
+        }
+    }
+
+    /// Tries to extract BuildSideReadyState from BuildSide enum.
+    /// Returns an error if state is not Ready.
+    fn try_as_ready(&self) -> Result<&BuildSideReadyState> {
+        match self {
+            BuildSide::Ready(state) => Ok(state),
+            _ => internal_err!("Expected build side in ready state"),
+        }
+    }
+
+    /// Tries to extract BuildSideReadyState from BuildSide enum.
+    /// Returns an error if state is not Ready.
+    fn try_as_ready_mut(&mut self) -> Result<&mut BuildSideReadyState> {
+        match self {
+            BuildSide::Ready(state) => Ok(state),
+            _ => internal_err!("Expected build side in ready state"),
+        }
+    }
+}
+
+/// Represents state of HashJoinStream
+///
+/// Expected state transitions performed by HashJoinStream are:
+///
+/// ```text
+///
+///       WaitBuildSide
+///             │
+///             ▼
+///  ┌─► FetchProbeBatch ───► ExhaustedProbeSide ───► Completed
+///  │          │
+///  │          ▼
+///  └─ ProcessProbeBatch
+/// ```
+#[derive(Debug, Clone)]
+pub(super) enum HashJoinStreamState {
+    /// Initial state for HashJoinStream indicating that build-side data not collected yet
+    WaitBuildSide,
+    /// Waiting for bounds to be reported by all partitions
+    WaitPartitionBoundsReport,
+    /// Indicates that build-side has been collected, and stream is ready for fetching probe-side
+    FetchProbeBatch,
+    /// Indicates that non-empty batch has been fetched from probe-side, and is ready to be processed
+    ProcessProbeBatch(ProcessProbeBatchState),
+    /// Indicates that probe-side has been fully processed
+    ExhaustedProbeSide,
+    /// Indicates that HashJoinStream execution is completed
+    Completed,
+}
+
+impl HashJoinStreamState {
+    /// Tries to extract ProcessProbeBatchState from HashJoinStreamState enum.
+    /// Returns an error if state is not ProcessProbeBatchState.
+    fn try_as_process_probe_batch_mut(&mut self) -> Result<&mut ProcessProbeBatchState> {
+        match self {
+            HashJoinStreamState::ProcessProbeBatch(state) => Ok(state),
+            _ => internal_err!("Expected hash join stream in ProcessProbeBatch state"),
+        }
+    }
+}
+
+/// Container for HashJoinStreamState::ProcessProbeBatch related data
+#[derive(Debug, Clone)]
+pub(super) struct ProcessProbeBatchState {
+    /// Current probe-side batch
+    batch: RecordBatch,
+    /// Probe-side on expressions values
+    values: Vec<ArrayRef>,
+    /// Starting offset for JoinHashMap lookups
+    offset: MapOffset,
+    /// Max joined probe-side index from current batch
+    joined_probe_idx: Option<usize>,
+}
+
+impl ProcessProbeBatchState {
+    fn advance(&mut self, offset: MapOffset, joined_probe_idx: Option<usize>) {
+        self.offset = offset;
+        if joined_probe_idx.is_some() {
+            self.joined_probe_idx = joined_probe_idx;
+        }
+    }
+}
+
+/// [`Stream`] for [`super::HashJoinExec`] that does the actual join.
+///
+/// This stream:
+///
+/// - Collecting the build side (left input) into a hash map
+/// - Iterating over the probe side (right input) in streaming fashion
+/// - Looking up matches against the hash table and applying join filters
+/// - Producing joined [`RecordBatch`]es incrementally
+/// - Emitting unmatched rows for outer/semi/anti joins in the final stage
+pub(super) struct HashJoinStream {
+    /// Partition identifier for debugging and determinism
+    partition: usize,
+    /// Input schema
+    schema: Arc<Schema>,
+    /// equijoin columns from the right (probe side)
+    on_right: Vec<PhysicalExprRef>,
+    /// optional join filter
+    filter: Option<JoinFilter>,
+    /// type of the join (left, right, semi, etc)
+    join_type: JoinType,
+    /// right (probe) input
+    right: SendableRecordBatchStream,
+    /// Random state used for hashing initialization
+    random_state: RandomState,
+    /// Metrics
+    join_metrics: BuildProbeJoinMetrics,
+    /// Information of index and left / right placement of columns
+    column_indices: Vec<ColumnIndex>,
+    /// Defines the null equality for the join.
+    null_equality: NullEquality,
+    /// State of the stream
+    state: HashJoinStreamState,
+    /// Build side
+    build_side: BuildSide,
+    /// Maximum output batch size
+    batch_size: usize,
+    /// Scratch space for computing hashes
+    hashes_buffer: Vec<u64>,
+    /// Scratch space for probe indices during hash lookup
+    probe_indices_buffer: Vec<u32>,
+    /// Scratch space for build indices during hash lookup
+    build_indices_buffer: Vec<u64>,
+    /// Specifies whether the right side has an ordering to potentially preserve
+    right_side_ordered: bool,
+    /// Shared build accumulator for coordinating dynamic filter updates (collects hash maps and/or bounds, optional)
+    build_accumulator: Option<Arc<SharedBuildAccumulator>>,
+    /// Optional future to signal when build information has been reported by all partitions
+    /// and the dynamic filter has been updated
+    build_waiter: Option<OnceFut<()>>,
+    /// Partitioning mode to use
+    mode: PartitionMode,
+    /// Output buffer for coalescing small batches into larger ones with optional fetch limit.
+    /// Uses `LimitedBatchCoalescer` to efficiently combine batches and absorb limit with 'fetch'
+    output_buffer: LimitedBatchCoalescer,
+    /// Whether this is a null-aware anti join
+    null_aware: bool,
+}
+
+impl RecordBatchStream for HashJoinStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+/// Executes lookups by hash against JoinHashMap and resolves potential
+/// hash collisions.
+/// Returns build/probe indices satisfying the equality condition, along with
+/// (optional) starting point for next iteration.
+///
+/// # Example
+///
+/// For `LEFT.b1 = RIGHT.b2`:
+/// LEFT (build) Table:
+/// ```text
+///  a1  b1  c1
+///  1   1   10
+///  3   3   30
+///  5   5   50
+///  7   7   70
+///  9   8   90
+///  11  8   110
+///  13   10  130
+/// ```
+///
+/// RIGHT (probe) Table:
+/// ```text
+///  a2   b2  c2
+///  2    2   20
+///  4    4   40
+///  6    6   60
+///  8    8   80
+/// 10   10  100
+/// 12   10  120
+/// ```
+///
+/// The result is
+/// ```text
+/// "+----+----+-----+----+----+-----+",
+/// "| a1 | b1 | c1  | a2 | b2 | c2  |",
+/// "+----+----+-----+----+----+-----+",
+/// "| 9  | 8  | 90  | 8  | 8  | 80  |",
+/// "| 11 | 8  | 110 | 8  | 8  | 80  |",
+/// "| 13 | 10 | 130 | 10 | 10 | 100 |",
+/// "| 13 | 10 | 130 | 12 | 10 | 120 |",
+/// "+----+----+-----+----+----+-----+"
+/// ```
+///
+/// And the result of build and probe indices are:
+/// ```text
+/// Build indices: 4, 5, 6, 6
+/// Probe indices: 3, 3, 4, 5
+/// ```
+#[expect(clippy::too_many_arguments)]
+pub(super) fn lookup_join_hashmap(
+    build_hashmap: &dyn JoinHashMapType,
+    build_side_values: &[ArrayRef],
+    probe_side_values: &[ArrayRef],
+    null_equality: NullEquality,
+    hashes_buffer: &[u64],
+    limit: usize,
+    offset: MapOffset,
+    probe_indices_buffer: &mut Vec<u32>,
+    build_indices_buffer: &mut Vec<u64>,
+) -> Result<(UInt64Array, UInt32Array, Option<MapOffset>)> {
+    let next_offset = build_hashmap.get_matched_indices_with_limit_offset(
+        hashes_buffer,
+        limit,
+        offset,
+        probe_indices_buffer,
+        build_indices_buffer,
+    );
+
+    let build_indices_unfiltered: UInt64Array =
+        std::mem::take(build_indices_buffer).into();
+    let probe_indices_unfiltered: UInt32Array =
+        std::mem::take(probe_indices_buffer).into();
+
+    // TODO: optimize equal_rows_arr to avoid allocation of intermediate arrays
+    // https://github.com/apache/datafusion/issues/12131
+    let (build_indices, probe_indices) = equal_rows_arr(
+        &build_indices_unfiltered,
+        &probe_indices_unfiltered,
+        build_side_values,
+        probe_side_values,
+        null_equality,
+    )?;
+
+    // Reclaim buffers
+    *build_indices_buffer = build_indices_unfiltered.into_parts().1.into();
+    *probe_indices_buffer = probe_indices_unfiltered.into_parts().1.into();
+
+    Ok((build_indices, probe_indices, next_offset))
+}
+
+/// Counts the number of distinct elements in the input array.
+///
+/// The input array must be sorted (e.g., `[0, 1, 1, 2, 2, ...]`) and contain no null values.
+#[inline]
+fn count_distinct_sorted_indices(indices: &UInt32Array) -> usize {
+    if indices.is_empty() {
+        return 0;
+    }
+
+    debug_assert!(indices.null_count() == 0);
+
+    let values_buf = indices.values();
+    let values = values_buf.as_ref();
+    let mut iter = values.iter();
+    let Some(&first) = iter.next() else {
+        return 0;
+    };
+
+    let mut count = 1usize;
+    let mut last = first;
+    for &value in iter {
+        if value != last {
+            last = value;
+            count += 1;
+        }
+    }
+    count
+}
+
+impl HashJoinStream {
+    #[expect(clippy::too_many_arguments)]
+    pub(super) fn new(
+        partition: usize,
+        schema: Arc<Schema>,
+        on_right: Vec<PhysicalExprRef>,
+        filter: Option<JoinFilter>,
+        join_type: JoinType,
+        right: SendableRecordBatchStream,
+        random_state: RandomState,
+        join_metrics: BuildProbeJoinMetrics,
+        column_indices: Vec<ColumnIndex>,
+        null_equality: NullEquality,
+        state: HashJoinStreamState,
+        build_side: BuildSide,
+        batch_size: usize,
+        hashes_buffer: Vec<u64>,
+        right_side_ordered: bool,
+        build_accumulator: Option<Arc<SharedBuildAccumulator>>,
+        mode: PartitionMode,
+        null_aware: bool,
+        fetch: Option<usize>,
+    ) -> Self {
+        // Create output buffer with coalescing and optional fetch limit.
+        let output_buffer =
+            LimitedBatchCoalescer::new(Arc::clone(&schema), batch_size, fetch);
+
+        Self {
+            partition,
+            schema,
+            on_right,
+            filter,
+            join_type,
+            right,
+            random_state,
+            join_metrics,
+            column_indices,
+            null_equality,
+            state,
+            build_side,
+            batch_size,
+            hashes_buffer,
+            probe_indices_buffer: Vec::with_capacity(batch_size),
+            build_indices_buffer: Vec::with_capacity(batch_size),
+            right_side_ordered,
+            build_accumulator,
+            build_waiter: None,
+            mode,
+            output_buffer,
+            null_aware,
+        }
+    }
+
+    /// Separate implementation function that unpins the [`HashJoinStream`] so
+    /// that partial borrows work correctly
+    fn poll_next_impl(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Result<RecordBatch>>> {
+        loop {
+            // First, check if we have any completed batches ready to emit
+            if let Some(batch) = self.output_buffer.next_completed_batch() {
+                return self
+                    .join_metrics
+                    .baseline
+                    .record_poll(Poll::Ready(Some(Ok(batch))));
+            }
+
+            // Check if the coalescer has finished (limit reached and flushed)
+            if self.output_buffer.is_finished() {
+                return Poll::Ready(None);
+            }
+
+            return match self.state {
+                HashJoinStreamState::WaitBuildSide => {
+                    handle_state!(ready!(self.collect_build_side(cx)))
+                }
+                HashJoinStreamState::WaitPartitionBoundsReport => {
+                    handle_state!(ready!(self.wait_for_partition_bounds_report(cx)))
+                }
+                HashJoinStreamState::FetchProbeBatch => {
+                    handle_state!(ready!(self.fetch_probe_batch(cx)))
+                }
+                HashJoinStreamState::ProcessProbeBatch(_) => {
+                    handle_state!(self.process_probe_batch())
+                }
+                HashJoinStreamState::ExhaustedProbeSide => {
+                    handle_state!(self.process_unmatched_build_batch())
+                }
+                HashJoinStreamState::Completed if !self.output_buffer.is_empty() => {
+                    // Flush any remaining buffered data
+                    self.output_buffer.finish()?;
+                    // Continue loop to emit the flushed batch
+                    continue;
+                }
+                HashJoinStreamState::Completed => Poll::Ready(None),
+            };
+        }
+    }
+
+    /// Optional step to wait until build-side information (hash maps or bounds) has been reported by all partitions.
+    /// This state is only entered if a build accumulator is present.
+    ///
+    /// ## Why wait?
+    ///
+    /// The dynamic filter is only built once all partitions have reported their information (hash maps or bounds).
+    /// If we do not wait here, the probe-side scan may start before the filter is ready.
+    /// This can lead to the probe-side scan missing the opportunity to apply the filter
+    /// and skip reading unnecessary data.
+    fn wait_for_partition_bounds_report(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
+        if let Some(ref mut fut) = self.build_waiter {
+            ready!(fut.get_shared(cx))?;
+        }
+        self.state = HashJoinStreamState::FetchProbeBatch;
+        Poll::Ready(Ok(StatefulStreamResult::Continue))
+    }
+
+    /// Collects build-side data by polling `OnceFut` future from initialized build-side
+    ///
+    /// Updates build-side to `Ready`, and state to `FetchProbeSide`
+    fn collect_build_side(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
+        let build_timer = self.join_metrics.build_time.timer();
+        // build hash table from left (build) side, if not yet done
+        let left_data = ready!(
+            self.build_side
+                .try_as_initial_mut()?
+                .left_fut
+                .get_shared(cx)
+        )?;
+        build_timer.done();
+
+        // Note: For null-aware anti join, we need to check the probe side (right) for NULLs,
+        // not the build side (left). The probe-side NULL check happens during process_probe_batch.
+        // The probe_side_has_null flag will be set there if any probe batch contains NULL.
+
+        // Handle dynamic filter build-side information accumulation
+        //
+        // Dynamic filter coordination between partitions:
+        // Report hash maps (Partitioned mode) or bounds (CollectLeft mode) to the accumulator
+        // which will handle synchronization and filter updates
+        if let Some(ref build_accumulator) = self.build_accumulator {
+            let build_accumulator = Arc::clone(build_accumulator);
+
+            let left_side_partition_id = match self.mode {
+                PartitionMode::Partitioned => self.partition,
+                PartitionMode::CollectLeft => 0,
+                PartitionMode::Auto => unreachable!(
+                    "PartitionMode::Auto should not be present at execution time. This is a bug in DataFusion, please report it!"
+                ),
+            };
+
+            // Determine pushdown strategy based on availability of InList values
+            let pushdown = left_data.membership().clone();
+
+            // Construct the appropriate build data enum variant based on partition mode
+            let build_data = match self.mode {
+                PartitionMode::Partitioned => PartitionBuildData::Partitioned {
+                    partition_id: left_side_partition_id,
+                    pushdown,
+                    bounds: left_data
+                        .bounds
+                        .clone()
+                        .unwrap_or_else(|| PartitionBounds::new(vec![])),
+                },
+                PartitionMode::CollectLeft => PartitionBuildData::CollectLeft {
+                    pushdown,
+                    bounds: left_data
+                        .bounds
+                        .clone()
+                        .unwrap_or_else(|| PartitionBounds::new(vec![])),
+                },
+                PartitionMode::Auto => unreachable!(
+                    "PartitionMode::Auto should not be present at execution time"
+                ),
+            };
+
+            self.build_waiter = Some(OnceFut::new(async move {
+                build_accumulator.report_build_data(build_data).await
+            }));
+            self.state = HashJoinStreamState::WaitPartitionBoundsReport;
+        } else {
+            self.state = HashJoinStreamState::FetchProbeBatch;
+        }
+
+        self.build_side = BuildSide::Ready(BuildSideReadyState { left_data });
+        Poll::Ready(Ok(StatefulStreamResult::Continue))
+    }
+
+    /// Fetches next batch from probe-side
+    ///
+    /// If non-empty batch has been fetched, updates state to `ProcessProbeBatchState`,
+    /// otherwise updates state to `ExhaustedProbeSide`
+    fn fetch_probe_batch(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
+        match ready!(self.right.poll_next_unpin(cx)) {
+            None => {
+                self.state = HashJoinStreamState::ExhaustedProbeSide;
+            }
+            Some(Ok(batch)) => {
+                // Precalculate hash values for fetched batch
+                let keys_values = evaluate_expressions_to_arrays(&self.on_right, &batch)?;
+
+                if let Map::HashMap(_) = self.build_side.try_as_ready()?.left_data.map() {
+                    self.hashes_buffer.clear();
+                    self.hashes_buffer.resize(batch.num_rows(), 0);
+                    create_hashes(
+                        &keys_values,
+                        &self.random_state,
+                        &mut self.hashes_buffer,
+                    )?;
+                }
+
+                self.join_metrics.input_batches.add(1);
+                self.join_metrics.input_rows.add(batch.num_rows());
+
+                self.state =
+                    HashJoinStreamState::ProcessProbeBatch(ProcessProbeBatchState {
+                        batch,
+                        values: keys_values,
+                        offset: (0, None),
+                        joined_probe_idx: None,
+                    });
+            }
+            Some(Err(err)) => return Poll::Ready(Err(err)),
+        };
+
+        Poll::Ready(Ok(StatefulStreamResult::Continue))
+    }
+
+    /// Joins current probe batch with build-side data and produces batch with matched output
+    ///
+    /// Updates state to `FetchProbeBatch`
+    fn process_probe_batch(
+        &mut self,
+    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
+        let state = self.state.try_as_process_probe_batch_mut()?;
+        let build_side = self.build_side.try_as_ready_mut()?;
+
+        self.join_metrics
+            .probe_hit_rate
+            .add_total(state.batch.num_rows());
+
+        let timer = self.join_metrics.join_time.timer();
+
+        // Null-aware anti join semantics:
+        // For LeftAnti: output LEFT (build) rows where LEFT.key NOT IN RIGHT.key
+        // 1. If RIGHT (probe) contains NULL in any batch, no LEFT rows should be output
+        // 2. LEFT rows with NULL keys should not be output (handled in final stage)
+        if self.null_aware {
+            // Mark that we've seen a probe batch with actual rows (probe side is non-empty)
+            // Only set this if batch has rows - empty batches don't count
+            // Use shared atomic state so all partitions can see this global information
+            if state.batch.num_rows() > 0 {
+                build_side
+                    .left_data
+                    .probe_side_non_empty
+                    .store(true, Ordering::Relaxed);
+            }
+
+            // Check if probe side (RIGHT) contains NULL
+            // Since null_aware validation ensures single column join, we only check the first column
+            let probe_key_column = &state.values[0];
+            if probe_key_column.null_count() > 0 {
+                // Found NULL in probe side - set shared flag to prevent any output
+                build_side
+                    .left_data
+                    .probe_side_has_null
+                    .store(true, Ordering::Relaxed);
+            }
+
+            // If probe side has NULL (detected in this or any other partition), return empty result
+            if build_side
+                .left_data
+                .probe_side_has_null
+                .load(Ordering::Relaxed)
+            {
+                timer.done();
+                self.state = HashJoinStreamState::FetchProbeBatch;
+                return Ok(StatefulStreamResult::Continue);
+            }
+        }
+
+        // if the left side is empty, we can skip the (potentially expensive) join operation
+        let is_empty = build_side.left_data.map().is_empty();
+
+        if is_empty && self.filter.is_none() {
+            let result = build_batch_empty_build_side(
+                &self.schema,
+                build_side.left_data.batch(),
+                &state.batch,
+                &self.column_indices,
+                self.join_type,
+            )?;
+            timer.done();
+            self.output_buffer.push_batch(result)?;
+            self.state = HashJoinStreamState::FetchProbeBatch;
+
+            return Ok(StatefulStreamResult::Continue);
+        }
+
+        // get the matched by join keys indices
+        let (left_indices, right_indices, next_offset) = match build_side.left_data.map()
+        {
+            Map::HashMap(map) => lookup_join_hashmap(
+                map.as_ref(),
+                build_side.left_data.values(),
+                &state.values,
+                self.null_equality,
+                &self.hashes_buffer,
+                self.batch_size,
+                state.offset,
+                &mut self.probe_indices_buffer,
+                &mut self.build_indices_buffer,
+            )?,
+            Map::ArrayMap(array_map) => {
+                let next_offset = array_map.get_matched_indices_with_limit_offset(
+                    &state.values,
+                    self.batch_size,
+                    state.offset,
+                    &mut self.probe_indices_buffer,
+                    &mut self.build_indices_buffer,
+                )?;
+                (
+                    UInt64Array::from(self.build_indices_buffer.clone()),
+                    UInt32Array::from(self.probe_indices_buffer.clone()),
+                    next_offset,
+                )
+            }
+        };
+
+        let distinct_right_indices_count = count_distinct_sorted_indices(&right_indices);
+
+        self.join_metrics
+            .probe_hit_rate
+            .add_part(distinct_right_indices_count);
+
+        self.join_metrics.avg_fanout.add_part(left_indices.len());
+
+        self.join_metrics
+            .avg_fanout
+            .add_total(distinct_right_indices_count);
+
+        // apply join filter if exists
+        let (left_indices, right_indices) = if let Some(filter) = &self.filter {
+            apply_join_filter_to_indices(
+                build_side.left_data.batch(),
+                &state.batch,
+                left_indices,
+                right_indices,
+                filter,
+                JoinSide::Left,
+                None,
+                self.join_type,
+            )?
+        } else {
+            (left_indices, right_indices)
+        };
+
+        // mark joined left-side indices as visited, if required by join type
+        if need_produce_result_in_final(self.join_type) {
+            let mut bitmap = build_side.left_data.visited_indices_bitmap().lock();
+            left_indices.iter().flatten().for_each(|x| {
+                bitmap.set_bit(x as usize, true);
+            });
+        }
+
+        // The goals of index alignment for different join types are:
+        //
+        // 1) Right & FullJoin -- to append all missing probe-side indices between
+        //    previous (excluding) and current joined indices.
+        // 2) SemiJoin -- deduplicate probe indices in range between previous
+        //    (excluding) and current joined indices.
+        // 3) AntiJoin -- return only missing indices in range between
+        //    previous and current joined indices.
+        //    Inclusion/exclusion of the indices themselves don't matter
+        //
+        // As a summary -- alignment range can be produced based only on
+        // joined (matched with filters applied) probe side indices, excluding starting one
+        // (left from previous iteration).
+
+        // if any rows have been joined -- get last joined probe-side (right) row
+        // it's important that index counts as "joined" after hash collisions checks
+        // and join filters applied.
+        let last_joined_right_idx = match right_indices.len() {
+            0 => None,
+            n => Some(right_indices.value(n - 1) as usize),
+        };
+
+        // Calculate range and perform alignment.
+        // In case probe batch has been processed -- align all remaining rows.
+        let index_alignment_range_start = state.joined_probe_idx.map_or(0, |v| v + 1);
+        let index_alignment_range_end = if next_offset.is_none() {
+            state.batch.num_rows()
+        } else {
+            last_joined_right_idx.map_or(0, |v| v + 1)
+        };
+
+        let (left_indices, right_indices) = adjust_indices_by_join_type(
+            left_indices,
+            right_indices,
+            index_alignment_range_start..index_alignment_range_end,
+            self.join_type,
+            self.right_side_ordered,
+        )?;
+
+        // Build output batch and push to coalescer
+        let (build_batch, probe_batch, join_side) =
+            if self.join_type == JoinType::RightMark {
+                (&state.batch, build_side.left_data.batch(), JoinSide::Right)
+            } else {
+                (build_side.left_data.batch(), &state.batch, JoinSide::Left)
+            };
+
+        let batch = build_batch_from_indices(
+            &self.schema,
+            build_batch,
+            probe_batch,
+            &left_indices,
+            &right_indices,
+            &self.column_indices,
+            join_side,
+            self.join_type,
+        )?;
+
+        let push_status = self.output_buffer.push_batch(batch)?;
+
+        timer.done();
+
+        // If limit reached, finish and move to Completed state
+        if push_status == PushBatchStatus::LimitReached {
+            self.output_buffer.finish()?;
+            self.state = HashJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Continue);
+        }
+
+        if next_offset.is_none() {
+            self.state = HashJoinStreamState::FetchProbeBatch;
+        } else {
+            state.advance(
+                next_offset
+                    .ok_or_else(|| internal_datafusion_err!("unexpected None offset"))?,
+                last_joined_right_idx,
+            )
+        };
+
+        Ok(StatefulStreamResult::Continue)
+    }
+
+    /// Processes unmatched build-side rows for certain join types and produces output batch
+    ///
+    /// Updates state to `Completed`
+    fn process_unmatched_build_batch(
+        &mut self,
+    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
+        let timer = self.join_metrics.join_time.timer();
+
+        if !need_produce_result_in_final(self.join_type) {
+            self.state = HashJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Continue);
+        }
+
+        let build_side = self.build_side.try_as_ready()?;
+
+        // For null-aware anti join, if probe side had NULL, no rows should be output
+        // Check shared atomic state to get global knowledge across all partitions
+        if self.null_aware
+            && build_side
+                .left_data
+                .probe_side_has_null
+                .load(Ordering::Relaxed)
+        {
+            timer.done();
+            self.state = HashJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Continue);
+        }
+        if !build_side.left_data.report_probe_completed() {
+            self.state = HashJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Continue);
+        }
+
+        // use the global left bitmap to produce the left indices and right indices
+        let (mut left_side, mut right_side) = get_final_indices_from_shared_bitmap(
+            build_side.left_data.visited_indices_bitmap(),
+            self.join_type,
+            true,
+        );
+
+        // For null-aware anti join, filter out LEFT rows with NULL in join keys
+        // BUT only if the probe side (RIGHT) was non-empty. If probe side is empty,
+        // NULL NOT IN (empty) = TRUE, so NULL rows should be returned.
+        // Use shared atomic state to get global knowledge across all partitions
+        if self.null_aware
+            && self.join_type == JoinType::LeftAnti
+            && build_side
+                .left_data
+                .probe_side_non_empty
+                .load(Ordering::Relaxed)
+        {
+            // Since null_aware validation ensures single column join, we only check the first column
+            let build_key_column = &build_side.left_data.values()[0];
+
+            // Filter out indices where the key is NULL
+            let filtered_indices: Vec<u64> = left_side
+                .iter()
+                .filter_map(|idx| {
+                    let idx_usize = idx.unwrap() as usize;
+                    if build_key_column.is_null(idx_usize) {
+                        None // Skip rows with NULL keys
+                    } else {
+                        Some(idx.unwrap())
+                    }
+                })
+                .collect();
+
+            left_side = UInt64Array::from(filtered_indices);
+
+            // Update right_side to match the new length
+            let mut builder = arrow::array::UInt32Builder::with_capacity(left_side.len());
+            builder.append_nulls(left_side.len());
+            right_side = builder.finish();
+        }
+
+        self.join_metrics.input_batches.add(1);
+        self.join_metrics.input_rows.add(left_side.len());
+
+        timer.done();
+
+        self.state = HashJoinStreamState::Completed;
+
+        // Push final unmatched indices to output buffer
+        if !left_side.is_empty() {
+            let empty_right_batch = RecordBatch::new_empty(self.right.schema());
+            let batch = build_batch_from_indices(
+                &self.schema,
+                build_side.left_data.batch(),
+                &empty_right_batch,
+                &left_side,
+                &right_side,
+                &self.column_indices,
+                JoinSide::Left,
+                self.join_type,
+            )?;
+            let push_status = self.output_buffer.push_batch(batch)?;
+
+            // If limit reached, finish the coalescer
+            if push_status == PushBatchStatus::LimitReached {
+                self.output_buffer.finish()?;
+            }
+        }
+
+        Ok(StatefulStreamResult::Continue)
+    }
+}
+
+impl Stream for HashJoinStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        self.poll_next_impl(cx)
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/join_filter.rs b/datafusion/physical-plan/src/joins/join_filter.rs
index 0e46a971d90bb..de5df2be55650 100644
--- a/datafusion/physical-plan/src/joins/join_filter.rs
+++ b/datafusion/physical-plan/src/joins/join_filter.rs
@@ -19,7 +19,7 @@ use crate::joins::utils::ColumnIndex;
 use arrow::datatypes::SchemaRef;
 use datafusion_common::JoinSide;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use std::sync::Arc;
+use std::{fmt::Display, sync::Arc};
 
 /// Filter applied before join output. Fields are crate-public to allow
 /// downstream implementations to experiment with custom joins.
@@ -33,6 +33,14 @@ pub struct JoinFilter {
     pub(crate) schema: SchemaRef,
 }
 
+/// For display in `EXPLAIN` plans, only expression with column names is needed,
+/// it output expression like `(col1 + col2) = 0`
+impl Display for JoinFilter {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.expression.fmt_sql(f)
+    }
+}
+
 impl JoinFilter {
     /// Creates new JoinFilter
     pub fn new(
diff --git a/datafusion/physical-plan/src/joins/join_hash_map.rs b/datafusion/physical-plan/src/joins/join_hash_map.rs
index 521e19d7bf444..8f0fb66b64fbf 100644
--- a/datafusion/physical-plan/src/joins/join_hash_map.rs
+++ b/datafusion/physical-plan/src/joins/join_hash_map.rs
@@ -20,10 +20,13 @@
 //! ["on" values] to a list of indices with this key's value.
 
 use std::fmt::{self, Debug};
-use std::ops::IndexMut;
+use std::ops::Sub;
 
-use hashbrown::hash_table::Entry::{Occupied, Vacant};
+use arrow::array::BooleanArray;
+use arrow::buffer::BooleanBuffer;
+use arrow::datatypes::ArrowNativeType;
 use hashbrown::HashTable;
+use hashbrown::hash_table::Entry::{Occupied, Vacant};
 
 /// Maps a `u64` hash value based on the build side ["on" values] to a list of indices with this key's value.
 ///
@@ -35,7 +38,7 @@ use hashbrown::HashTable;
 /// During this stage it might be the case that a row is contained the same hashmap value,
 /// but the values don't match. Those are checked in the `equal_rows_arr` method.
 ///
-/// The indices (values) are stored in a separate chained list stored in the `Vec<u64>`.
+/// The indices (values) are stored in a separate chained list stored as `Vec<u32>` or `Vec<u64>`.
 ///
 /// The first value (+1) is stored in the hashmap, whereas the next value is stored in array at the position value.
 ///
@@ -87,278 +90,408 @@ use hashbrown::HashTable;
 /// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1)
 /// ---------------------
 /// ```
-pub struct JoinHashMap {
+///
+/// Here we have an option between creating a `JoinHashMapType` using `u32` or `u64` indices
+/// based on how many rows were being used for indices.
+///
+/// At runtime we choose between using `JoinHashMapU32` and `JoinHashMapU64` which oth implement
+/// `JoinHashMapType`.
+///
+/// ## Note on use of this trait as a public API
+/// This is currently a public trait but is mainly intended for internal use within DataFusion.
+/// For example, we may compare references to `JoinHashMapType` implementations by pointer equality
+/// rather than deep equality of contents, as deep equality would be expensive and in our usage
+/// patterns it is impossible for two different hash maps to have identical contents in a practical sense.
+pub trait JoinHashMapType: Send + Sync {
+    fn extend_zero(&mut self, len: usize);
+
+    fn update_from_iter<'a>(
+        &mut self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>,
+        deleted_offset: usize,
+    );
+
+    fn get_matched_indices<'a>(
+        &self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>,
+        deleted_offset: Option<usize>,
+    ) -> (Vec<u32>, Vec<u64>);
+
+    fn get_matched_indices_with_limit_offset(
+        &self,
+        hash_values: &[u64],
+        limit: usize,
+        offset: MapOffset,
+        input_indices: &mut Vec<u32>,
+        match_indices: &mut Vec<u64>,
+    ) -> Option<MapOffset>;
+
+    /// Returns a BooleanArray indicating which of the provided hashes exist in the map.
+    fn contain_hashes(&self, hash_values: &[u64]) -> BooleanArray;
+
+    /// Returns `true` if the join hash map contains no entries.
+    fn is_empty(&self) -> bool;
+
+    /// Returns the number of entries in the join hash map.
+    fn len(&self) -> usize;
+}
+
+pub struct JoinHashMapU32 {
+    // Stores hash value to last row index
+    map: HashTable<(u64, u32)>,
+    // Stores indices in chained list data structure
+    next: Vec<u32>,
+}
+
+impl JoinHashMapU32 {
+    #[cfg(test)]
+    pub(crate) fn new(map: HashTable<(u64, u32)>, next: Vec<u32>) -> Self {
+        Self { map, next }
+    }
+
+    pub fn with_capacity(cap: usize) -> Self {
+        Self {
+            map: HashTable::with_capacity(cap),
+            next: vec![0; cap],
+        }
+    }
+}
+
+impl Debug for JoinHashMapU32 {
+    fn fmt(&self, _f: &mut fmt::Formatter) -> fmt::Result {
+        Ok(())
+    }
+}
+
+impl JoinHashMapType for JoinHashMapU32 {
+    fn extend_zero(&mut self, _: usize) {}
+
+    fn update_from_iter<'a>(
+        &mut self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>,
+        deleted_offset: usize,
+    ) {
+        update_from_iter::<u32>(&mut self.map, &mut self.next, iter, deleted_offset);
+    }
+
+    fn get_matched_indices<'a>(
+        &self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>,
+        deleted_offset: Option<usize>,
+    ) -> (Vec<u32>, Vec<u64>) {
+        get_matched_indices::<u32>(&self.map, &self.next, iter, deleted_offset)
+    }
+
+    fn get_matched_indices_with_limit_offset(
+        &self,
+        hash_values: &[u64],
+        limit: usize,
+        offset: MapOffset,
+        input_indices: &mut Vec<u32>,
+        match_indices: &mut Vec<u64>,
+    ) -> Option<MapOffset> {
+        get_matched_indices_with_limit_offset::<u32>(
+            &self.map,
+            &self.next,
+            hash_values,
+            limit,
+            offset,
+            input_indices,
+            match_indices,
+        )
+    }
+
+    fn contain_hashes(&self, hash_values: &[u64]) -> BooleanArray {
+        contain_hashes(&self.map, hash_values)
+    }
+
+    fn is_empty(&self) -> bool {
+        self.map.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.map.len()
+    }
+}
+
+pub struct JoinHashMapU64 {
     // Stores hash value to last row index
     map: HashTable<(u64, u64)>,
     // Stores indices in chained list data structure
     next: Vec<u64>,
 }
 
-impl JoinHashMap {
+impl JoinHashMapU64 {
     #[cfg(test)]
     pub(crate) fn new(map: HashTable<(u64, u64)>, next: Vec<u64>) -> Self {
         Self { map, next }
     }
 
-    pub(crate) fn with_capacity(capacity: usize) -> Self {
-        JoinHashMap {
-            map: HashTable::with_capacity(capacity),
-            next: vec![0; capacity],
+    pub fn with_capacity(cap: usize) -> Self {
+        Self {
+            map: HashTable::with_capacity(cap),
+            next: vec![0; cap],
         }
     }
 }
 
-// Type of offsets for obtaining indices from JoinHashMap.
-pub(crate) type JoinHashMapOffset = (usize, Option<u64>);
-
-// Macro for traversing chained values with limit.
-// Early returns in case of reaching output tuples limit.
-macro_rules! chain_traverse {
-    (
-        $input_indices:ident, $match_indices:ident, $hash_values:ident, $next_chain:ident,
-        $input_idx:ident, $chain_idx:ident, $remaining_output:ident
-    ) => {
-        let mut match_row_idx = $chain_idx - 1;
-        loop {
-            $match_indices.push(match_row_idx);
-            $input_indices.push($input_idx as u32);
-            $remaining_output -= 1;
-            // Follow the chain to get the next index value
-            let next = $next_chain[match_row_idx as usize];
-
-            if $remaining_output == 0 {
-                // In case current input index is the last, and no more chain values left
-                // returning None as whole input has been scanned
-                let next_offset = if $input_idx == $hash_values.len() - 1 && next == 0 {
-                    None
-                } else {
-                    Some(($input_idx, Some(next)))
-                };
-                return ($input_indices, $match_indices, next_offset);
-            }
-            if next == 0 {
-                // end of list
-                break;
-            }
-            match_row_idx = next - 1;
-        }
-    };
+impl Debug for JoinHashMapU64 {
+    fn fmt(&self, _f: &mut fmt::Formatter) -> fmt::Result {
+        Ok(())
+    }
 }
 
-// Trait defining methods that must be implemented by a hash map type to be used for joins.
-pub trait JoinHashMapType {
-    /// The type of list used to store the next list
-    type NextType: IndexMut<usize, Output = u64>;
-    /// Extend with zero
-    fn extend_zero(&mut self, len: usize);
-    /// Returns mutable references to the hash map and the next.
-    fn get_mut(&mut self) -> (&mut HashTable<(u64, u64)>, &mut Self::NextType);
-    /// Returns a reference to the hash map.
-    fn get_map(&self) -> &HashTable<(u64, u64)>;
-    /// Returns a reference to the next.
-    fn get_list(&self) -> &Self::NextType;
-
-    // Whether values in the hashmap are distinct (no duplicate keys)
-    fn is_distinct(&self) -> bool {
-        false
-    }
+impl JoinHashMapType for JoinHashMapU64 {
+    fn extend_zero(&mut self, _: usize) {}
 
-    /// Updates hashmap from iterator of row indices & row hashes pairs.
     fn update_from_iter<'a>(
         &mut self,
-        iter: impl Iterator<Item = (usize, &'a u64)>,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>,
         deleted_offset: usize,
     ) {
-        let (mut_map, mut_list) = self.get_mut();
-        for (row, &hash_value) in iter {
-            let entry = mut_map.entry(
-                hash_value,
-                |&(hash, _)| hash_value == hash,
-                |&(hash, _)| hash,
-            );
-
-            match entry {
-                Occupied(mut occupied_entry) => {
-                    // Already exists: add index to next array
-                    let (_, index) = occupied_entry.get_mut();
-                    let prev_index = *index;
-                    // Store new value inside hashmap
-                    *index = (row + 1) as u64;
-                    // Update chained Vec at `row` with previous value
-                    mut_list[row - deleted_offset] = prev_index;
-                }
-                Vacant(vacant_entry) => {
-                    vacant_entry.insert((hash_value, (row + 1) as u64));
-                    // chained list at `row` is already initialized with 0
-                    // meaning end of list
-                }
-            }
-        }
+        update_from_iter::<u64>(&mut self.map, &mut self.next, iter, deleted_offset);
     }
 
-    /// Returns all pairs of row indices matched by hash.
-    ///
-    /// This method only compares hashes, so additional further check for actual values
-    /// equality may be required.
     fn get_matched_indices<'a>(
         &self,
-        iter: impl Iterator<Item = (usize, &'a u64)>,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>,
         deleted_offset: Option<usize>,
     ) -> (Vec<u32>, Vec<u64>) {
-        let mut input_indices = vec![];
-        let mut match_indices = vec![];
-
-        let hash_map = self.get_map();
-        let next_chain = self.get_list();
-        for (row_idx, hash_value) in iter {
-            // Get the hash and find it in the index
-            if let Some((_, index)) =
-                hash_map.find(*hash_value, |(hash, _)| *hash_value == *hash)
-            {
-                let mut i = *index - 1;
-                loop {
-                    let match_row_idx = if let Some(offset) = deleted_offset {
-                        // This arguments means that we prune the next index way before here.
-                        if i < offset as u64 {
-                            // End of the list due to pruning
-                            break;
-                        }
-                        i - offset as u64
-                    } else {
-                        i
-                    };
-                    match_indices.push(match_row_idx);
-                    input_indices.push(row_idx as u32);
-                    // Follow the chain to get the next index value
-                    let next = next_chain[match_row_idx as usize];
-                    if next == 0 {
-                        // end of list
-                        break;
-                    }
-                    i = next - 1;
-                }
-            }
-        }
-
-        (input_indices, match_indices)
+        get_matched_indices::<u64>(&self.map, &self.next, iter, deleted_offset)
     }
 
-    /// Matches hashes with taking limit and offset into account.
-    /// Returns pairs of matched indices along with the starting point for next
-    /// matching iteration (`None` if limit has not been reached).
-    ///
-    /// This method only compares hashes, so additional further check for actual values
-    /// equality may be required.
     fn get_matched_indices_with_limit_offset(
         &self,
         hash_values: &[u64],
         limit: usize,
-        offset: JoinHashMapOffset,
-    ) -> (Vec<u32>, Vec<u64>, Option<JoinHashMapOffset>) {
-        let mut input_indices = Vec::with_capacity(limit);
-        let mut match_indices = Vec::with_capacity(limit);
-
-        let hash_map: &HashTable<(u64, u64)> = self.get_map();
-        let next_chain = self.get_list();
-        // Check if hashmap consists of unique values
-        // If so, we can skip the chain traversal
-        if self.is_distinct() {
-            let start = offset.0;
-            let end = (start + limit).min(hash_values.len());
-            for (row_idx, &hash_value) in hash_values[start..end].iter().enumerate() {
-                if let Some((_, index)) =
-                    hash_map.find(hash_value, |(hash, _)| hash_value == *hash)
-                {
-                    input_indices.push(start as u32 + row_idx as u32);
-                    match_indices.push(*index - 1);
-                }
+        offset: MapOffset,
+        input_indices: &mut Vec<u32>,
+        match_indices: &mut Vec<u64>,
+    ) -> Option<MapOffset> {
+        get_matched_indices_with_limit_offset::<u64>(
+            &self.map,
+            &self.next,
+            hash_values,
+            limit,
+            offset,
+            input_indices,
+            match_indices,
+        )
+    }
+
+    fn contain_hashes(&self, hash_values: &[u64]) -> BooleanArray {
+        contain_hashes(&self.map, hash_values)
+    }
+
+    fn is_empty(&self) -> bool {
+        self.map.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.map.len()
+    }
+}
+
+use crate::joins::MapOffset;
+use crate::joins::chain::traverse_chain;
+
+pub fn update_from_iter<'a, T>(
+    map: &mut HashTable<(u64, T)>,
+    next: &mut [T],
+    iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>,
+    deleted_offset: usize,
+) where
+    T: Copy + TryFrom<usize> + PartialOrd,
+    <T as TryFrom<usize>>::Error: Debug,
+{
+    for (row, &hash_value) in iter {
+        let entry = map.entry(
+            hash_value,
+            |&(hash, _)| hash_value == hash,
+            |&(hash, _)| hash,
+        );
+
+        match entry {
+            Occupied(mut occupied_entry) => {
+                // Already exists: add index to next array
+                let (_, index) = occupied_entry.get_mut();
+                let prev_index = *index;
+                // Store new value inside hashmap
+                *index = T::try_from(row + 1).unwrap();
+                // Update chained Vec at `row` with previous value
+                next[row - deleted_offset] = prev_index;
             }
-            if end == hash_values.len() {
-                // No more values to process
-                return (input_indices, match_indices, None);
+            Vacant(vacant_entry) => {
+                vacant_entry.insert((hash_value, T::try_from(row + 1).unwrap()));
             }
-            return (input_indices, match_indices, Some((end, None)));
         }
+    }
+}
 
-        let mut remaining_output = limit;
-
-        // Calculate initial `hash_values` index before iterating
-        let to_skip = match offset {
-            // None `initial_next_idx` indicates that `initial_idx` processing has'n been started
-            (initial_idx, None) => initial_idx,
-            // Zero `initial_next_idx` indicates that `initial_idx` has been processed during
-            // previous iteration, and it should be skipped
-            (initial_idx, Some(0)) => initial_idx + 1,
-            // Otherwise, process remaining `initial_idx` matches by traversing `next_chain`,
-            // to start with the next index
-            (initial_idx, Some(initial_next_idx)) => {
-                chain_traverse!(
-                    input_indices,
-                    match_indices,
-                    hash_values,
-                    next_chain,
-                    initial_idx,
-                    initial_next_idx,
-                    remaining_output
-                );
-
-                initial_idx + 1
-            }
-        };
+pub fn get_matched_indices<'a, T>(
+    map: &HashTable<(u64, T)>,
+    next: &[T],
+    iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>,
+    deleted_offset: Option<usize>,
+) -> (Vec<u32>, Vec<u64>)
+where
+    T: Copy + TryFrom<usize> + PartialOrd + Into<u64> + Sub<Output = T>,
+    <T as TryFrom<usize>>::Error: Debug,
+{
+    let mut input_indices = vec![];
+    let mut match_indices = vec![];
+    let zero = T::try_from(0).unwrap();
+    let one = T::try_from(1).unwrap();
 
-        let mut row_idx = to_skip;
-
-        for hash_value in &hash_values[to_skip..] {
-            if let Some((_, index)) =
-                hash_map.find(*hash_value, |(hash, _)| *hash_value == *hash)
-            {
-                chain_traverse!(
-                    input_indices,
-                    match_indices,
-                    hash_values,
-                    next_chain,
-                    row_idx,
-                    index,
-                    remaining_output
-                );
+    for (row_idx, hash_value) in iter {
+        // Get the hash and find it in the index
+        if let Some((_, index)) = map.find(*hash_value, |(hash, _)| *hash_value == *hash)
+        {
+            let mut i = *index - one;
+            loop {
+                let match_row_idx = if let Some(offset) = deleted_offset {
+                    let offset = T::try_from(offset).unwrap();
+                    // This arguments means that we prune the next index way before here.
+                    if i < offset {
+                        // End of the list due to pruning
+                        break;
+                    }
+                    i - offset
+                } else {
+                    i
+                };
+                match_indices.push(match_row_idx.into());
+                input_indices.push(row_idx as u32);
+                // Follow the chain to get the next index value
+                let next_chain = next[match_row_idx.into() as usize];
+                if next_chain == zero {
+                    // end of list
+                    break;
+                }
+                i = next_chain - one;
             }
-            row_idx += 1;
         }
-
-        (input_indices, match_indices, None)
     }
-}
 
-/// Implementation of `JoinHashMapType` for `JoinHashMap`.
-impl JoinHashMapType for JoinHashMap {
-    type NextType = Vec<u64>;
+    (input_indices, match_indices)
+}
 
-    // Void implementation
-    fn extend_zero(&mut self, _: usize) {}
+pub fn get_matched_indices_with_limit_offset<T>(
+    map: &HashTable<(u64, T)>,
+    next_chain: &[T],
+    hash_values: &[u64],
+    limit: usize,
+    offset: MapOffset,
+    input_indices: &mut Vec<u32>,
+    match_indices: &mut Vec<u64>,
+) -> Option<MapOffset>
+where
+    T: Copy + TryFrom<usize> + PartialOrd + Into<u64> + Sub<Output = T>,
+    <T as TryFrom<usize>>::Error: Debug,
+    T: ArrowNativeType,
+{
+    // Clear the buffer before producing new results
+    input_indices.clear();
+    match_indices.clear();
+    let one = T::try_from(1).unwrap();
 
-    /// Get mutable references to the hash map and the next.
-    fn get_mut(&mut self) -> (&mut HashTable<(u64, u64)>, &mut Self::NextType) {
-        (&mut self.map, &mut self.next)
+    // Check if hashmap consists of unique values
+    // If so, we can skip the chain traversal
+    if map.len() == next_chain.len() {
+        let start = offset.0;
+        let end = (start + limit).min(hash_values.len());
+        for (i, &hash) in hash_values[start..end].iter().enumerate() {
+            if let Some((_, idx)) = map.find(hash, |(h, _)| hash == *h) {
+                input_indices.push(start as u32 + i as u32);
+                match_indices.push((*idx - one).into());
+            }
+        }
+        return if end == hash_values.len() {
+            None
+        } else {
+            Some((end, None))
+        };
     }
 
-    /// Get a reference to the hash map.
-    fn get_map(&self) -> &HashTable<(u64, u64)> {
-        &self.map
-    }
+    let mut remaining_output = limit;
 
-    /// Get a reference to the next.
-    fn get_list(&self) -> &Self::NextType {
-        &self.next
-    }
+    // Calculate initial `hash_values` index before iterating
+    let to_skip = match offset {
+        // None `initial_next_idx` indicates that `initial_idx` processing hasn't been started
+        (idx, None) => idx,
+        // Zero `initial_next_idx` indicates that `initial_idx` has been processed during
+        // previous iteration, and it should be skipped
+        (idx, Some(0)) => idx + 1,
+        // Otherwise, process remaining `initial_idx` matches by traversing `next_chain`,
+        // to start with the next index
+        (idx, Some(next_idx)) => {
+            let next_idx: T = T::usize_as(next_idx as usize);
+            let is_last = idx == hash_values.len() - 1;
+            if let Some(next_offset) = traverse_chain(
+                next_chain,
+                idx,
+                next_idx,
+                &mut remaining_output,
+                input_indices,
+                match_indices,
+                is_last,
+            ) {
+                return Some(next_offset);
+            }
+            idx + 1
+        }
+    };
 
-    /// Check if the values in the hashmap are distinct.
-    fn is_distinct(&self) -> bool {
-        self.map.len() == self.next.len()
+    let hash_values_len = hash_values.len();
+    for (i, &hash) in hash_values[to_skip..].iter().enumerate() {
+        let row_idx = to_skip + i;
+        if let Some((_, idx)) = map.find(hash, |(h, _)| hash == *h) {
+            let idx: T = *idx;
+            let is_last = row_idx == hash_values_len - 1;
+            if let Some(next_offset) = traverse_chain(
+                next_chain,
+                row_idx,
+                idx,
+                &mut remaining_output,
+                input_indices,
+                match_indices,
+                is_last,
+            ) {
+                return Some(next_offset);
+            }
+        }
     }
+    None
 }
 
-impl Debug for JoinHashMap {
-    fn fmt(&self, _f: &mut fmt::Formatter) -> fmt::Result {
-        Ok(())
+pub fn contain_hashes<T>(map: &HashTable<(u64, T)>, hash_values: &[u64]) -> BooleanArray {
+    let buffer = BooleanBuffer::collect_bool(hash_values.len(), |i| {
+        let hash = hash_values[i];
+        map.find(hash, |(h, _)| hash == *h).is_some()
+    });
+    BooleanArray::new(buffer, None)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_contain_hashes() {
+        let mut hash_map = JoinHashMapU32::with_capacity(10);
+        hash_map.update_from_iter(Box::new([10u64, 20u64, 30u64].iter().enumerate()), 0);
+
+        let probe_hashes = vec![10, 11, 20, 21, 30, 31];
+        let array = hash_map.contain_hashes(&probe_hashes);
+
+        assert_eq!(array.len(), probe_hashes.len());
+
+        for (i, &hash) in probe_hashes.iter().enumerate() {
+            if matches!(hash, 10 | 20 | 30) {
+                assert!(array.value(i), "Hash {hash} should exist in the map");
+            } else {
+                assert!(!array.value(i), "Hash {hash} should NOT exist in the map");
+            }
+        }
     }
 }
diff --git a/datafusion/physical-plan/src/joins/mod.rs b/datafusion/physical-plan/src/joins/mod.rs
index 1d36db996434e..2cdfa1e6ac020 100644
--- a/datafusion/physical-plan/src/joins/mod.rs
+++ b/datafusion/physical-plan/src/joins/mod.rs
@@ -20,22 +20,57 @@
 use arrow::array::BooleanBufferBuilder;
 pub use cross_join::CrossJoinExec;
 use datafusion_physical_expr::PhysicalExprRef;
-pub use hash_join::HashJoinExec;
-pub use nested_loop_join::NestedLoopJoinExec;
+pub use hash_join::{
+    HashExpr, HashJoinExec, HashJoinExecBuilder, HashTableLookupExpr, SeededRandomState,
+};
+pub use nested_loop_join::{NestedLoopJoinExec, NestedLoopJoinExecBuilder};
 use parking_lot::Mutex;
 // Note: SortMergeJoin is not used in plans yet
+pub use piecewise_merge_join::PiecewiseMergeJoinExec;
 pub use sort_merge_join::SortMergeJoinExec;
 pub use symmetric_hash_join::SymmetricHashJoinExec;
+pub mod chain;
 mod cross_join;
 mod hash_join;
 mod nested_loop_join;
+mod piecewise_merge_join;
 mod sort_merge_join;
 mod stream_join_utils;
 mod symmetric_hash_join;
 pub mod utils;
 
+mod array_map;
 mod join_filter;
-mod join_hash_map;
+/// Hash map implementations for join operations.
+///
+/// Note: This module is public for internal testing purposes only
+/// and is not guaranteed to be stable across versions.
+pub mod join_hash_map;
+
+use array_map::ArrayMap;
+use utils::JoinHashMapType;
+
+pub enum Map {
+    HashMap(Box<dyn JoinHashMapType>),
+    ArrayMap(ArrayMap),
+}
+
+impl Map {
+    /// Returns the number of elements in the map.
+    pub fn num_of_distinct_key(&self) -> usize {
+        match self {
+            Map::HashMap(map) => map.len(),
+            Map::ArrayMap(array_map) => array_map.num_of_distinct_key(),
+        }
+    }
+
+    /// Returns `true` if the map contains no elements.
+    pub fn is_empty(&self) -> bool {
+        self.num_of_distinct_key() == 0
+    }
+}
+
+pub(crate) type MapOffset = (usize, Option<u64>);
 
 #[cfg(test)]
 pub mod test_utils;
diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs
index f87cf3d8864cf..f84cb54dac948 100644
--- a/datafusion/physical-plan/src/joins/nested_loop_join.rs
+++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs
@@ -19,134 +19,156 @@
 
 use std::any::Any;
 use std::fmt::Formatter;
-use std::sync::atomic::{AtomicUsize, Ordering};
+use std::ops::{BitOr, ControlFlow};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::task::Poll;
 
 use super::utils::{
-    asymmetric_join_output_partitioning, get_final_indices_from_shared_bitmap,
-    need_produce_result_in_final, reorder_output_after_swap, swap_join_projection,
-    BatchSplitter, BatchTransformer, NoopBatchTransformer, StatefulStreamResult,
+    asymmetric_join_output_partitioning, need_produce_result_in_final,
+    reorder_output_after_swap, swap_join_projection,
 };
 use crate::common::can_project;
-use crate::execution_plan::{boundedness_from_children, EmissionType};
+use crate::execution_plan::{EmissionType, boundedness_from_children};
+use crate::joins::SharedBitmapBuilder;
 use crate::joins::utils::{
-    adjust_indices_by_join_type, apply_join_filter_to_indices, build_batch_from_indices,
-    build_join_schema, check_join_is_valid, estimate_join_statistics,
     BuildProbeJoinMetrics, ColumnIndex, JoinFilter, OnceAsync, OnceFut,
+    build_join_schema, check_join_is_valid, estimate_join_statistics,
+    need_produce_right_in_final,
+};
+use crate::metrics::{
+    Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, MetricsSet, RatioMetrics,
 };
-use crate::joins::SharedBitmapBuilder;
-use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
 use crate::projection::{
-    try_embed_projection, try_pushdown_through_join, EmbeddedProjection, JoinData,
-    ProjectionExec,
+    EmbeddedProjection, JoinData, ProjectionExec, try_embed_projection,
+    try_pushdown_through_join,
 };
 use crate::{
-    handle_state, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
-    ExecutionPlanProperties, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
+    PlanProperties, RecordBatchStream, SendableRecordBatchStream,
+    check_if_same_properties,
 };
 
-use arrow::array::{BooleanBufferBuilder, UInt32Array, UInt64Array};
-use arrow::compute::concat_batches;
+use arrow::array::{
+    Array, BooleanArray, BooleanBufferBuilder, RecordBatchOptions, UInt32Array,
+    UInt64Array, new_null_array,
+};
+use arrow::buffer::BooleanBuffer;
+use arrow::compute::{
+    BatchCoalescer, concat_batches, filter, filter_record_batch, not, take,
+};
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
+use arrow_schema::DataType;
+use datafusion_common::cast::as_boolean_array;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{
-    exec_datafusion_err, internal_err, project_schema, JoinSide, Result, Statistics,
+    JoinSide, Result, ScalarValue, Statistics, arrow_err, assert_eq_or_internal_err,
+    internal_datafusion_err, internal_err, project_schema, unwrap_or_internal_err,
 };
-use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_expr::JoinType;
 use datafusion_physical_expr::equivalence::{
-    join_equivalence_properties, ProjectionMapping,
+    ProjectionMapping, join_equivalence_properties,
 };
 
-use futures::{ready, Stream, StreamExt, TryStreamExt};
+use datafusion_physical_expr::projection::{ProjectionRef, combine_projections};
+use futures::{Stream, StreamExt, TryStreamExt};
+use log::debug;
 use parking_lot::Mutex;
 
-/// Left (build-side) data
-struct JoinLeftData {
-    /// Build-side data collected to single batch
-    batch: RecordBatch,
-    /// Shared bitmap builder for visited left indices
-    bitmap: SharedBitmapBuilder,
-    /// Counter of running probe-threads, potentially able to update `bitmap`
-    probe_threads_counter: AtomicUsize,
-    /// Memory reservation for tracking batch and bitmap
-    /// Cleared on `JoinLeftData` drop
-    /// reservation is cleared on Drop
-    #[expect(dead_code)]
-    reservation: MemoryReservation,
-}
-
-impl JoinLeftData {
-    fn new(
-        batch: RecordBatch,
-        bitmap: SharedBitmapBuilder,
-        probe_threads_counter: AtomicUsize,
-        reservation: MemoryReservation,
-    ) -> Self {
-        Self {
-            batch,
-            bitmap,
-            probe_threads_counter,
-            reservation,
-        }
-    }
-
-    fn batch(&self) -> &RecordBatch {
-        &self.batch
-    }
-
-    fn bitmap(&self) -> &SharedBitmapBuilder {
-        &self.bitmap
-    }
-
-    /// Decrements counter of running threads, and returns `true`
-    /// if caller is the last running thread
-    fn report_probe_completed(&self) -> bool {
-        self.probe_threads_counter.fetch_sub(1, Ordering::Relaxed) == 1
-    }
-}
-
-#[allow(rustdoc::private_intra_doc_links)]
-/// NestedLoopJoinExec is build-probe join operator, whose main task is to
-/// perform joins without any equijoin conditions in `ON` clause.
+#[expect(rustdoc::private_intra_doc_links)]
+/// NestedLoopJoinExec is a build-probe join operator designed for joins that
+/// do not have equijoin keys in their `ON` clause.
 ///
-/// Execution consists of following phases:
+/// # Execution Flow
 ///
-/// #### 1. Build phase
-/// Collecting build-side data in memory, by polling all available data from build-side input.
-/// Due to the absence of equijoin conditions, it's not possible to partition build-side data
-/// across multiple threads of the operator, so build-side is always collected in a single
-/// batch shared across all threads.
-/// The operator always considers LEFT input as build-side input, so it's crucial to adjust
-/// smaller input to be the LEFT one. Normally this selection is handled by physical optimizer.
+/// ```text
+///                                                Incoming right batch
+///                Left Side Buffered Batches
+///                       ┌───────────┐              ┌───────────────┐
+///                       │ ┌───────┐ │              │               │
+///                       │ │       │ │              │               │
+///  Current Left Row ───▶│ ├───────├─┤──────────┐   │               │
+///                       │ │       │ │          │   └───────────────┘
+///                       │ │       │ │          │           │
+///                       │ │       │ │          │           │
+///                       │ └───────┘ │          │           │
+///                       │ ┌───────┐ │          │           │
+///                       │ │       │ │          │     ┌─────┘
+///                       │ │       │ │          │     │
+///                       │ │       │ │          │     │
+///                       │ │       │ │          │     │
+///                       │ │       │ │          │     │
+///                       │ └───────┘ │          ▼     ▼
+///                       │   ......  │  ┌──────────────────────┐
+///                       │           │  │X (Cartesian Product) │
+///                       │           │  └──────────┬───────────┘
+///                       └───────────┘             │
+///                                                 │
+///                                                 ▼
+///                                      ┌───────┬───────────────┐
+///                                      │       │               │
+///                                      │       │               │
+///                                      │       │               │
+///                                      └───────┴───────────────┘
+///                                        Intermediate Batch
+///                                  (For join predicate evaluation)
+/// ```
 ///
-/// #### 2. Probe phase
-/// Sequentially polling batches from the probe-side input and processing them according to the
-/// following logic:
-/// - apply join filter (`ON` clause) to Cartesian product of probe batch and build side data
-///   -- filter evaluation is executed once per build-side data row
-/// - update shared bitmap of joined ("visited") build-side row indices, if required -- allows
-///   to produce unmatched build-side data in case of e.g. LEFT/FULL JOIN after probing phase
-///   completed
-/// - perform join index alignment is required -- depending on `JoinType`
-/// - produce output join batch
+/// The execution follows a two-phase design:
 ///
-/// Probing phase is executed in parallel, according to probe-side input partitioning -- one
-/// thread per partition. After probe input is exhausted, each thread **ATTEMPTS** to produce
-/// unmatched build-side data.
+/// ## 1. Buffering Left Input
+/// - The operator eagerly buffers all left-side input batches into memory,
+///   util a memory limit is reached.
+///   Currently, an out-of-memory error will be thrown if all the left-side input batches
+///   cannot fit into memory at once.
+///   In the future, it's possible to make this case finish execution. (see
+///   'Memory-limited Execution' section)
+/// - The rationale for buffering the left side is that scanning the right side
+///   can be expensive (e.g., decoding Parquet files), so buffering more left
+///   rows reduces the number of right-side scan passes required.
 ///
-/// #### 3. Producing unmatched build-side data
-/// Producing unmatched build-side data as an output batch, after probe input is exhausted.
-/// This step is also executed in parallel (once per probe input partition), and to avoid
-/// duplicate output of unmatched data (due to shared nature build-side data), each thread
-/// "reports" about probe phase completion (which means that "visited" bitmap won't be
-/// updated anymore), and only the last thread, reporting about completion, will return output.
+/// ## 2. Probing Right Input
+/// - Right-side input is streamed batch by batch.
+/// - For each right-side batch:
+///   - It evaluates the join filter against the full buffered left input.
+///     This results in a Cartesian product between the right batch and each
+///     left row -- with the join predicate/filter applied -- for each inner
+///     loop iteration.
+///   - Matched results are accumulated into an output buffer. (see more in
+///     `Output Buffering Strategy` section)
+/// - This process continues until all right-side input is consumed.
 ///
-/// # Clone / Shared State
+/// # Producing unmatched build-side data
+/// - For special join types like left/full joins, it's required to also output
+///   unmatched pairs. During execution, bitmaps are kept for both left and right
+///   sides of the input; they'll be handled by dedicated states in `NLJStream`.
+/// - The final output of the left side unmatched rows is handled by a single
+///   partition for simplicity, since it only counts a small portion of the
+///   execution time. (e.g. if probe side has 10k rows, the final output of
+///   unmatched build side only roughly counts for 1/10k of the total time)
+///
+/// # Output Buffering Strategy
+/// The operator uses an intermediate output buffer to accumulate results. Once
+/// the output threshold is reached (currently set to the same value as
+/// `batch_size` in the configuration), the results will be eagerly output.
+///
+/// # Extra Notes
+/// - The operator always considers the **left** side as the build (buffered) side.
+///   Therefore, the physical optimizer should assign the smaller input to the left.
+/// - The design try to minimize the intermediate data size to approximately
+///   1 batch, for better cache locality and memory efficiency.
+///
+/// # TODO: Memory-limited Execution
+/// If the memory budget is exceeded during left-side buffering, fallback
+/// strategies such as streaming left batches and re-scanning the right side
+/// may be implemented in the future.
+///
+/// Tracking issue: <https://github.com/apache/datafusion/issues/15760>
 ///
+/// # Clone / Shared State
 /// Note this structure includes a [`OnceAsync`] that is used to coordinate the
 /// loading of the left side with the processing in each output stream.
 /// Therefore it can not be [`Clone`]
@@ -160,62 +182,133 @@ pub struct NestedLoopJoinExec {
     pub(crate) filter: Option<JoinFilter>,
     /// How the join is performed
     pub(crate) join_type: JoinType,
-    /// The schema once the join is applied
+    /// The full concatenated schema of left and right children should be distinct from
+    /// the output schema of the operator
     join_schema: SchemaRef,
     /// Future that consumes left input and buffers it in memory
     ///
     /// This structure is *shared* across all output streams.
     ///
     /// Each output stream waits on the `OnceAsync` to signal the completion of
-    /// the hash table creation.
-    inner_table: OnceAsync<JoinLeftData>,
+    /// the build(left) side data, and buffer them all for later joining.
+    build_side_data: OnceAsync<JoinLeftData>,
     /// Information of index and left / right placement of columns
     column_indices: Vec<ColumnIndex>,
     /// Projection to apply to the output of the join
-    projection: Option<Vec<usize>>,
+    projection: Option<ProjectionRef>,
 
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
-impl NestedLoopJoinExec {
-    /// Try to create a new [`NestedLoopJoinExec`]
-    pub fn try_new(
+/// Helps to build [`NestedLoopJoinExec`].
+pub struct NestedLoopJoinExecBuilder {
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    join_type: JoinType,
+    filter: Option<JoinFilter>,
+    projection: Option<ProjectionRef>,
+}
+
+impl NestedLoopJoinExecBuilder {
+    /// Make a new [`NestedLoopJoinExecBuilder`].
+    pub fn new(
         left: Arc<dyn ExecutionPlan>,
         right: Arc<dyn ExecutionPlan>,
-        filter: Option<JoinFilter>,
-        join_type: &JoinType,
-        projection: Option<Vec<usize>>,
-    ) -> Result<Self> {
+        join_type: JoinType,
+    ) -> Self {
+        Self {
+            left,
+            right,
+            join_type,
+            filter: None,
+            projection: None,
+        }
+    }
+
+    /// Set projection from the vector.
+    pub fn with_projection(self, projection: Option<Vec<usize>>) -> Self {
+        self.with_projection_ref(projection.map(Into::into))
+    }
+
+    /// Set projection from the shared reference.
+    pub fn with_projection_ref(mut self, projection: Option<ProjectionRef>) -> Self {
+        self.projection = projection;
+        self
+    }
+
+    /// Set optional filter.
+    pub fn with_filter(mut self, filter: Option<JoinFilter>) -> Self {
+        self.filter = filter;
+        self
+    }
+
+    /// Build resulting execution plan.
+    pub fn build(self) -> Result<NestedLoopJoinExec> {
+        let Self {
+            left,
+            right,
+            join_type,
+            filter,
+            projection,
+        } = self;
+
         let left_schema = left.schema();
         let right_schema = right.schema();
         check_join_is_valid(&left_schema, &right_schema, &[])?;
         let (join_schema, column_indices) =
-            build_join_schema(&left_schema, &right_schema, join_type);
+            build_join_schema(&left_schema, &right_schema, &join_type);
         let join_schema = Arc::new(join_schema);
-        let cache = Self::compute_properties(
+        let cache = NestedLoopJoinExec::compute_properties(
             &left,
             &right,
-            Arc::clone(&join_schema),
-            *join_type,
-            projection.as_ref(),
+            &join_schema,
+            join_type,
+            projection.as_deref(),
         )?;
-
         Ok(NestedLoopJoinExec {
             left,
             right,
             filter,
-            join_type: *join_type,
+            join_type,
             join_schema,
-            inner_table: Default::default(),
+            build_side_data: Default::default(),
             column_indices,
             projection,
             metrics: Default::default(),
-            cache,
+            cache: Arc::new(cache),
         })
     }
+}
+
+impl From<&NestedLoopJoinExec> for NestedLoopJoinExecBuilder {
+    fn from(exec: &NestedLoopJoinExec) -> Self {
+        Self {
+            left: Arc::clone(exec.left()),
+            right: Arc::clone(exec.right()),
+            join_type: exec.join_type,
+            filter: exec.filter.clone(),
+            projection: exec.projection.clone(),
+        }
+    }
+}
+
+impl NestedLoopJoinExec {
+    /// Try to create a new [`NestedLoopJoinExec`]
+    pub fn try_new(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        filter: Option<JoinFilter>,
+        join_type: &JoinType,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        NestedLoopJoinExecBuilder::new(left, right, *join_type)
+            .with_projection(projection)
+            .with_filter(filter)
+            .build()
+    }
 
     /// left side
     pub fn left(&self) -> &Arc<dyn ExecutionPlan> {
@@ -237,32 +330,32 @@ impl NestedLoopJoinExec {
         &self.join_type
     }
 
-    pub fn projection(&self) -> Option<&Vec<usize>> {
-        self.projection.as_ref()
+    pub fn projection(&self) -> &Option<ProjectionRef> {
+        &self.projection
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
         left: &Arc<dyn ExecutionPlan>,
         right: &Arc<dyn ExecutionPlan>,
-        schema: SchemaRef,
+        schema: &SchemaRef,
         join_type: JoinType,
-        projection: Option<&Vec<usize>>,
+        projection: Option<&[usize]>,
     ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
         let mut eq_properties = join_equivalence_properties(
             left.equivalence_properties().clone(),
             right.equivalence_properties().clone(),
             &join_type,
-            Arc::clone(&schema),
+            Arc::clone(schema),
             &Self::maintains_input_order(join_type),
             None,
             // No on columns in nested loop join
             &[],
-        );
+        )?;
 
         let mut output_partitioning =
-            asymmetric_join_output_partitioning(left, right, &join_type);
+            asymmetric_join_output_partitioning(left, right, &join_type)?;
 
         let emission_type = if left.boundedness().is_unbounded() {
             EmissionType::Final
@@ -274,7 +367,8 @@ impl NestedLoopJoinExec {
                 | JoinType::LeftSemi
                 | JoinType::RightSemi
                 | JoinType::Right
-                | JoinType::RightAnti => EmissionType::Incremental,
+                | JoinType::RightAnti
+                | JoinType::RightMark => EmissionType::Incremental,
                 // If we need to generate unmatched rows from the *build side*,
                 // we need to emit them at the end.
                 JoinType::Left
@@ -288,9 +382,8 @@ impl NestedLoopJoinExec {
 
         if let Some(projection) = projection {
             // construct a map from the input expressions to the output expression of the Projection
-            let projection_mapping =
-                ProjectionMapping::from_indices(projection, &schema)?;
-            let out_schema = project_schema(&schema, Some(projection))?;
+            let projection_mapping = ProjectionMapping::from_indices(projection, schema)?;
+            let out_schema = project_schema(schema, Some(&projection))?;
             output_partitioning =
                 output_partitioning.project(&projection_mapping, &eq_properties);
             eq_properties = eq_properties.project(&projection_mapping, out_schema);
@@ -304,29 +397,9 @@ impl NestedLoopJoinExec {
         ))
     }
 
-    /// Returns a vector indicating whether the left and right inputs maintain their order.
-    /// The first element corresponds to the left input, and the second to the right.
-    ///
-    /// The left (build-side) input's order may change, but the right (probe-side) input's
-    /// order is maintained for INNER, RIGHT, RIGHT ANTI, and RIGHT SEMI joins.
-    ///
-    /// Maintaining the right input's order helps optimize the nodes down the pipeline
-    /// (See [`ExecutionPlan::maintains_input_order`]).
-    ///
-    /// This is a separate method because it is also called when computing properties, before
-    /// a [`NestedLoopJoinExec`] is created. It also takes [`JoinType`] as an argument, as
-    /// opposed to `Self`, for the same reason.
-    fn maintains_input_order(join_type: JoinType) -> Vec<bool> {
-        vec![
-            false,
-            matches!(
-                join_type,
-                JoinType::Inner
-                    | JoinType::Right
-                    | JoinType::RightAnti
-                    | JoinType::RightSemi
-            ),
-        ]
+    /// This join implementation does not preserve the input order of either side.
+    fn maintains_input_order(_join_type: JoinType) -> Vec<bool> {
+        vec![false, false]
     }
 
     pub fn contains_projection(&self) -> bool {
@@ -334,26 +407,24 @@ impl NestedLoopJoinExec {
     }
 
     pub fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
+        let projection = projection.map(Into::into);
         // check if the projection is valid
-        can_project(&self.schema(), projection.as_ref())?;
-        let projection = match projection {
-            Some(projection) => match &self.projection {
-                Some(p) => Some(projection.iter().map(|i| p[*i]).collect()),
-                None => Some(projection),
-            },
-            None => None,
-        };
-        Self::try_new(
-            Arc::clone(&self.left),
-            Arc::clone(&self.right),
-            self.filter.clone(),
-            &self.join_type,
-            projection,
-        )
+        can_project(&self.schema(), projection.as_deref())?;
+        let projection =
+            combine_projections(projection.as_ref(), self.projection.as_ref())?;
+        NestedLoopJoinExecBuilder::from(self)
+            .with_projection_ref(projection)
+            .build()
     }
 
     /// Returns a new `ExecutionPlan` that runs NestedLoopsJoins with the left
     /// and right inputs swapped.
+    ///
+    /// # Notes:
+    ///
+    /// This function should be called BEFORE inserting any repartitioning
+    /// operators on the join's children. Check [`super::HashJoinExec::swap_inputs`]
+    /// for more details.
     pub fn swap_inputs(&self) -> Result<Arc<dyn ExecutionPlan>> {
         let left = self.left();
         let right = self.right();
@@ -365,7 +436,7 @@ impl NestedLoopJoinExec {
             swap_join_projection(
                 left.schema().fields().len(),
                 right.schema().fields().len(),
-                self.projection.as_ref(),
+                self.projection.as_deref(),
                 self.join_type(),
             ),
         )?;
@@ -378,6 +449,8 @@ impl NestedLoopJoinExec {
                 | JoinType::RightSemi
                 | JoinType::LeftAnti
                 | JoinType::RightAnti
+                | JoinType::LeftMark
+                | JoinType::RightMark
         ) || self.projection.is_some()
         {
             Arc::new(new_join)
@@ -391,6 +464,27 @@ impl NestedLoopJoinExec {
 
         Ok(plan)
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        let left = children.swap_remove(0);
+        let right = children.swap_remove(0);
+
+        Self {
+            left,
+            right,
+            metrics: ExecutionPlanMetricsSet::new(),
+            build_side_data: Default::default(),
+            cache: Arc::clone(&self.cache),
+            filter: self.filter.clone(),
+            join_type: self.join_type,
+            join_schema: Arc::clone(&self.join_schema),
+            column_indices: self.column_indices.clone(),
+            projection: self.projection.clone(),
+        }
+    }
 }
 
 impl DisplayAs for NestedLoopJoinExec {
@@ -445,7 +539,7 @@ impl ExecutionPlan for NestedLoopJoinExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -464,17 +558,32 @@ impl ExecutionPlan for NestedLoopJoinExec {
         vec![&self.left, &self.right]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn crate::PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to join filter expressions if present
+        if let Some(filter) = &self.filter {
+            f(filter.expression().as_ref())?;
+        }
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(NestedLoopJoinExec::try_new(
-            Arc::clone(&children[0]),
-            Arc::clone(&children[1]),
-            self.filter.clone(),
-            &self.join_type,
-            self.projection.clone(),
-        )?))
+        check_if_same_properties!(self, children);
+        Ok(Arc::new(
+            NestedLoopJoinExecBuilder::new(
+                Arc::clone(&children[0]),
+                Arc::clone(&children[1]),
+                self.join_type,
+            )
+            .with_filter(self.filter.clone())
+            .with_projection_ref(self.projection.clone())
+            .build()?,
+        ))
     }
 
     fn execute(
@@ -482,26 +591,26 @@ impl ExecutionPlan for NestedLoopJoinExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if self.left.output_partitioning().partition_count() != 1 {
-            return internal_err!(
-                "Invalid NestedLoopJoinExec, the output partition count of the left child must be 1,\
+        assert_eq_or_internal_err!(
+            self.left.output_partitioning().partition_count(),
+            1,
+            "Invalid NestedLoopJoinExec, the output partition count of the left child must be 1,\
                  consider using CoalescePartitionsExec or the EnforceDistribution rule"
-            );
-        }
+        );
 
-        let join_metrics = BuildProbeJoinMetrics::new(partition, &self.metrics);
+        let metrics = NestedLoopJoinMetrics::new(&self.metrics, partition);
 
         // Initialization reservation for load of inner table
         let load_reservation =
             MemoryConsumer::new(format!("NestedLoopJoinLoad[{partition}]"))
                 .register(context.memory_pool());
 
-        let inner_table = self.inner_table.try_once(|| {
+        let build_side_data = self.build_side_data.try_once(|| {
             let stream = self.left.execute(0, Arc::clone(&context))?;
 
             Ok(collect_left_input(
                 stream,
-                join_metrics.clone(),
+                metrics.join_metrics.clone(),
                 load_reservation,
                 need_produce_result_in_final(self.join_type),
                 self.right().output_partitioning().partition_count(),
@@ -509,19 +618,11 @@ impl ExecutionPlan for NestedLoopJoinExec {
         })?;
 
         let batch_size = context.session_config().batch_size();
-        let enforce_batch_size_in_joins =
-            context.session_config().enforce_batch_size_in_joins();
-
-        let outer_table = self.right.execute(partition, context)?;
 
-        let indices_cache = (UInt64Array::new_null(0), UInt32Array::new_null(0));
-
-        // Right side has an order and it is maintained during operation.
-        let right_side_ordered =
-            self.maintains_input_order()[1] && self.right.output_ordering().is_some();
+        let probe_side_data = self.right.execute(partition, context)?;
 
         // update column indices to reflect the projection
-        let column_indices_after_projection = match &self.projection {
+        let column_indices_after_projection = match self.projection.as_ref() {
             Some(projection) => projection
                 .iter()
                 .map(|i| self.column_indices[*i].clone())
@@ -529,58 +630,51 @@ impl ExecutionPlan for NestedLoopJoinExec {
             None => self.column_indices.clone(),
         };
 
-        if enforce_batch_size_in_joins {
-            Ok(Box::pin(NestedLoopJoinStream {
-                schema: self.schema(),
-                filter: self.filter.clone(),
-                join_type: self.join_type,
-                outer_table,
-                inner_table,
-                column_indices: column_indices_after_projection,
-                join_metrics,
-                indices_cache,
-                right_side_ordered,
-                state: NestedLoopJoinStreamState::WaitBuildSide,
-                batch_transformer: BatchSplitter::new(batch_size),
-                left_data: None,
-            }))
-        } else {
-            Ok(Box::pin(NestedLoopJoinStream {
-                schema: self.schema(),
-                filter: self.filter.clone(),
-                join_type: self.join_type,
-                outer_table,
-                inner_table,
-                column_indices: column_indices_after_projection,
-                join_metrics,
-                indices_cache,
-                right_side_ordered,
-                state: NestedLoopJoinStreamState::WaitBuildSide,
-                batch_transformer: NoopBatchTransformer::new(),
-                left_data: None,
-            }))
-        }
+        Ok(Box::pin(NestedLoopJoinStream::new(
+            self.schema(),
+            self.filter.clone(),
+            self.join_type,
+            probe_side_data,
+            build_side_data,
+            column_indices_after_projection,
+            metrics,
+            batch_size,
+        )))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        // NestedLoopJoinExec is designed for joins without equijoin keys in the
+        // ON clause (e.g., `t1 JOIN t2 ON (t1.v1 + t2.v1) % 2 = 0`). Any join
+        // predicates are stored in `self.filter`, but `estimate_join_statistics`
+        // currently doesn't support selectivity estimation for such arbitrary
+        // filter expressions. We pass an empty join column list, which means
+        // the cardinality estimation cannot use column statistics and returns
+        // unknown row counts.
+        let join_columns = Vec::new();
+
+        // Left side is always a single partition (Distribution::SinglePartition),
+        // so we always request overall stats with `None`. Right side can have
+        // multiple partitions, so we forward the partition parameter to get
+        // partition-specific statistics when requested.
+        let left_stats = Arc::unwrap_or_clone(self.left.partition_statistics(None)?);
+        let right_stats = Arc::unwrap_or_clone(match partition {
+            Some(partition) => self.right.partition_statistics(Some(partition))?,
+            None => self.right.partition_statistics(None)?,
+        });
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
-        }
-        estimate_join_statistics(
-            self.left.partition_statistics(None)?,
-            self.right.partition_statistics(None)?,
-            vec![],
+        let stats = estimate_join_statistics(
+            left_stats,
+            right_stats,
+            &join_columns,
             &self.join_type,
             &self.join_schema,
-        )
+        )?;
+
+        Ok(Arc::new(stats.project(self.projection.as_ref())))
     }
 
     /// Tries to push `projection` down through `nested_loop_join`. If possible, performs the
@@ -595,6 +689,7 @@ impl ExecutionPlan for NestedLoopJoinExec {
             return Ok(None);
         }
 
+        let schema = self.schema();
         if let Some(JoinData {
             projected_left_child,
             projected_right_child,
@@ -605,7 +700,7 @@ impl ExecutionPlan for NestedLoopJoinExec {
             self.left(),
             self.right(),
             &[],
-            self.schema(),
+            &schema,
             self.filter(),
         )? {
             Ok(Some(Arc::new(NestedLoopJoinExec::try_new(
@@ -622,6 +717,57 @@ impl ExecutionPlan for NestedLoopJoinExec {
     }
 }
 
+impl EmbeddedProjection for NestedLoopJoinExec {
+    fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
+        self.with_projection(projection)
+    }
+}
+
+/// Left (build-side) data
+pub(crate) struct JoinLeftData {
+    /// Build-side data collected to single batch
+    batch: RecordBatch,
+    /// Shared bitmap builder for visited left indices
+    bitmap: SharedBitmapBuilder,
+    /// Counter of running probe-threads, potentially able to update `bitmap`
+    probe_threads_counter: AtomicUsize,
+    /// Memory reservation for tracking batch and bitmap
+    /// Cleared on `JoinLeftData` drop
+    /// reservation is cleared on Drop
+    #[expect(dead_code)]
+    reservation: MemoryReservation,
+}
+
+impl JoinLeftData {
+    pub(crate) fn new(
+        batch: RecordBatch,
+        bitmap: SharedBitmapBuilder,
+        probe_threads_counter: AtomicUsize,
+        reservation: MemoryReservation,
+    ) -> Self {
+        Self {
+            batch,
+            bitmap,
+            probe_threads_counter,
+            reservation,
+        }
+    }
+
+    pub(crate) fn batch(&self) -> &RecordBatch {
+        &self.batch
+    }
+
+    pub(crate) fn bitmap(&self) -> &SharedBitmapBuilder {
+        &self.bitmap
+    }
+
+    /// Decrements counter of running threads, and returns `true`
+    /// if caller is the last running thread
+    pub(crate) fn report_probe_completed(&self) -> bool {
+        self.probe_threads_counter.fetch_sub(1, Ordering::Relaxed) == 1
+    }
+}
+
 /// Asynchronously collect input into a single batch, and creates `JoinLeftData` from it
 async fn collect_left_input(
     stream: SendableRecordBatchStream,
@@ -633,10 +779,10 @@ async fn collect_left_input(
     let schema = stream.schema();
 
     // Load all batches and count the rows
-    let (batches, metrics, mut reservation) = stream
+    let (batches, metrics, reservation) = stream
         .try_fold(
             (Vec::new(), join_metrics, reservation),
-            |(mut batches, metrics, mut reservation), batch| async {
+            |(mut batches, metrics, reservation), batch| async {
                 let batch_size = batch.get_array_memory_size();
                 // Reserve memory for incoming batch
                 reservation.try_grow(batch_size)?;
@@ -675,393 +821,1529 @@ async fn collect_left_input(
     ))
 }
 
-/// This enumeration represents various states of the nested loop join algorithm.
-#[derive(Debug, Clone)]
-enum NestedLoopJoinStreamState {
-    /// The initial state, indicating that build-side data not collected yet
-    WaitBuildSide,
-    /// Indicates that build-side has been collected, and stream is ready for
-    /// fetching probe-side
-    FetchProbeBatch,
-    /// Indicates that a non-empty batch has been fetched from probe-side, and
-    /// is ready to be processed
-    ProcessProbeBatch(RecordBatch),
-    /// Indicates that probe-side has been fully processed
-    ExhaustedProbeSide,
-    /// Indicates that NestedLoopJoinStream execution is completed
-    Completed,
-}
-
-impl NestedLoopJoinStreamState {
-    /// Tries to extract a `ProcessProbeBatchState` from the
-    /// `NestedLoopJoinStreamState` enum. Returns an error if state is not
-    /// `ProcessProbeBatchState`.
-    fn try_as_process_probe_batch(&mut self) -> Result<&RecordBatch> {
-        match self {
-            NestedLoopJoinStreamState::ProcessProbeBatch(state) => Ok(state),
-            _ => internal_err!("Expected join stream in ProcessProbeBatch state"),
-        }
-    }
+/// States for join processing. See `poll_next()` comment for more details about
+/// state transitions.
+#[derive(Debug, Clone, Copy)]
+enum NLJState {
+    BufferingLeft,
+    FetchingRight,
+    ProbeRight,
+    EmitRightUnmatched,
+    EmitLeftUnmatched,
+    Done,
 }
-
-/// A stream that issues [RecordBatch]es as they arrive from the right  of the join.
-struct NestedLoopJoinStream<T> {
-    /// Input schema
-    schema: Arc<Schema>,
+pub(crate) struct NestedLoopJoinStream {
+    // ========================================================================
+    // PROPERTIES:
+    // Operator's properties that remain constant
+    //
+    // Note: The implementation uses the terms left/build-side table and
+    // right/probe-side table interchangeably. Treating the left side as the
+    // build side is a convention in DataFusion: the planner always tries to
+    // swap the smaller table to the left side.
+    // ========================================================================
+    /// Output schema
+    pub(crate) output_schema: Arc<Schema>,
     /// join filter
-    filter: Option<JoinFilter>,
+    pub(crate) join_filter: Option<JoinFilter>,
     /// type of the join
-    join_type: JoinType,
-    /// the outer table data of the nested loop join
-    outer_table: SendableRecordBatchStream,
-    /// the inner table data of the nested loop join
-    inner_table: OnceFut<JoinLeftData>,
-    /// Information of index and left / right placement of columns
-    column_indices: Vec<ColumnIndex>,
-    // TODO: support null aware equal
-    // null_equals_null: bool
+    pub(crate) join_type: JoinType,
+    /// the probe-side(right) table data of the nested loop join
+    pub(crate) right_data: SendableRecordBatchStream,
+    /// the build-side table data of the nested loop join
+    pub(crate) left_data: OnceFut<JoinLeftData>,
+    /// Projection to construct the output schema from the left and right tables.
+    /// Example:
+    /// - output_schema: ['a', 'c']
+    /// - left_schema: ['a', 'b']
+    /// - right_schema: ['c']
+    ///
+    /// The column indices would be [(left, 0), (right, 0)] -- taking the left
+    /// 0th column and right 0th column can construct the output schema.
+    ///
+    /// Note there are other columns ('b' in the example) still kept after
+    /// projection pushdown; this is because they might be used to evaluate
+    /// the join filter (e.g., `JOIN ON (b+c)>0`).
+    pub(crate) column_indices: Vec<ColumnIndex>,
     /// Join execution metrics
-    join_metrics: BuildProbeJoinMetrics,
-    /// Cache for join indices calculations
-    indices_cache: (UInt64Array, UInt32Array),
-    /// Whether the right side is ordered
-    right_side_ordered: bool,
-    /// Current state of the stream
-    state: NestedLoopJoinStreamState,
-    /// Transforms the output batch before returning.
-    batch_transformer: T,
-    /// Result of the left data future
-    left_data: Option<Arc<JoinLeftData>>,
+    pub(crate) metrics: NestedLoopJoinMetrics,
+
+    /// `batch_size` from configuration
+    batch_size: usize,
+
+    /// See comments in [`need_produce_right_in_final`] for more detail
+    should_track_unmatched_right: bool,
+
+    // ========================================================================
+    // STATE FLAGS/BUFFERS:
+    // Fields that hold intermediate data/flags during execution
+    // ========================================================================
+    /// State Tracking
+    state: NLJState,
+    /// Output buffer holds the join result to output. It will emit eagerly when
+    /// the threshold is reached.
+    output_buffer: Box<BatchCoalescer>,
+    /// See comments in [`NLJState::Done`] for its purpose
+    handled_empty_output: bool,
+
+    // Buffer(left) side
+    // -----------------
+    /// The current buffered left data to join
+    buffered_left_data: Option<Arc<JoinLeftData>>,
+    /// Index into the left buffered batch. Used in `ProbeRight` state
+    left_probe_idx: usize,
+    /// Index into the left buffered batch. Used in `EmitLeftUnmatched` state
+    left_emit_idx: usize,
+    /// Should we go back to `BufferingLeft` state again after `EmitLeftUnmatched`
+    /// state is over.
+    left_exhausted: bool,
+    /// If we can buffer all left data in one pass
+    /// TODO(now): this is for the (unimplemented) memory-limited execution
+    #[expect(dead_code)]
+    left_buffered_in_one_pass: bool,
+
+    // Probe(right) side
+    // -----------------
+    /// The current probe batch to process
+    current_right_batch: Option<RecordBatch>,
+    // For right join, keep track of matched rows in `current_right_batch`
+    // Constructed when fetching each new incoming right batch in `FetchingRight` state.
+    current_right_batch_matched: Option<BooleanArray>,
 }
 
-/// Creates a Cartesian product of two input batches, preserving the order of the right batch,
-/// and applying a join filter if provided.
-///
-/// # Example
-/// Input:
-/// left = [0, 1], right = [0, 1, 2]
-///
-/// Output:
-/// left_indices = [0, 1, 0, 1, 0, 1], right_indices = [0, 0, 1, 1, 2, 2]
-///
-/// Input:
-/// left = [0, 1, 2], right = [0, 1, 2, 3], filter = left.a != right.a
-///
-/// Output:
-/// left_indices = [1, 2, 0, 2, 0, 1, 0, 1, 2], right_indices = [0, 0, 1, 1, 2, 2, 3, 3, 3]
-fn build_join_indices(
-    left_batch: &RecordBatch,
-    right_batch: &RecordBatch,
-    filter: Option<&JoinFilter>,
-    indices_cache: &mut (UInt64Array, UInt32Array),
-) -> Result<(UInt64Array, UInt32Array)> {
-    let left_row_count = left_batch.num_rows();
-    let right_row_count = right_batch.num_rows();
-    let output_row_count = left_row_count * right_row_count;
-
-    // We always use the same indices before applying the filter, so we can cache them
-    let (left_indices_cache, right_indices_cache) = indices_cache;
-    let cached_output_row_count = left_indices_cache.len();
-
-    let (left_indices, right_indices) =
-        match output_row_count.cmp(&cached_output_row_count) {
-            std::cmp::Ordering::Equal => {
-                // Reuse the cached indices
-                (left_indices_cache.clone(), right_indices_cache.clone())
-            }
-            std::cmp::Ordering::Less => {
-                // Left_row_count never changes because it's the build side. The changes to the
-                // right_row_count can be handled trivially by taking the first output_row_count
-                // elements of the cache because of how the indices are generated.
-                // (See the Ordering::Greater match arm)
-                (
-                    left_indices_cache.slice(0, output_row_count),
-                    right_indices_cache.slice(0, output_row_count),
-                )
-            }
-            std::cmp::Ordering::Greater => {
-                // Rebuild the indices cache
-
-                // Produces 0, 1, 2, 0, 1, 2, 0, 1, 2, ...
-                *left_indices_cache = UInt64Array::from_iter_values(
-                    (0..output_row_count as u64).map(|i| i % left_row_count as u64),
-                );
-
-                // Produces 0, 0, 0, 1, 1, 1, 2, 2, 2, ...
-                *right_indices_cache = UInt32Array::from_iter_values(
-                    (0..output_row_count as u32).map(|i| i / left_row_count as u32),
-                );
-
-                (left_indices_cache.clone(), right_indices_cache.clone())
-            }
-        };
-
-    if let Some(filter) = filter {
-        apply_join_filter_to_indices(
-            left_batch,
-            right_batch,
-            left_indices,
-            right_indices,
-            filter,
-            JoinSide::Left,
-        )
-    } else {
-        Ok((left_indices, right_indices))
-    }
+pub(crate) struct NestedLoopJoinMetrics {
+    /// Join execution metrics
+    pub(crate) join_metrics: BuildProbeJoinMetrics,
+    /// Selectivity of the join: output_rows / (left_rows * right_rows)
+    pub(crate) selectivity: RatioMetrics,
 }
 
-impl<T: BatchTransformer> NestedLoopJoinStream<T> {
-    fn poll_next_impl(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Option<Result<RecordBatch>>> {
-        loop {
-            return match self.state {
-                NestedLoopJoinStreamState::WaitBuildSide => {
-                    handle_state!(ready!(self.collect_build_side(cx)))
-                }
-                NestedLoopJoinStreamState::FetchProbeBatch => {
-                    handle_state!(ready!(self.fetch_probe_batch(cx)))
-                }
-                NestedLoopJoinStreamState::ProcessProbeBatch(_) => {
-                    handle_state!(self.process_probe_batch())
-                }
-                NestedLoopJoinStreamState::ExhaustedProbeSide => {
-                    handle_state!(self.process_unmatched_build_batch())
-                }
-                NestedLoopJoinStreamState::Completed => Poll::Ready(None),
-            };
+impl NestedLoopJoinMetrics {
+    pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
+        Self {
+            join_metrics: BuildProbeJoinMetrics::new(partition, metrics),
+            selectivity: MetricBuilder::new(metrics)
+                .with_type(MetricType::SUMMARY)
+                .ratio_metrics("selectivity", partition),
         }
     }
+}
 
-    fn collect_build_side(
-        &mut self,
+impl Stream for NestedLoopJoinStream {
+    type Item = Result<RecordBatch>;
+
+    /// See the comments [`NestedLoopJoinExec`] for high-level design ideas.
+    ///
+    /// # Implementation
+    ///
+    /// This function is the entry point of NLJ operator's state machine
+    /// transitions. The rough state transition graph is as follow, for more
+    /// details see the comment in each state's matching arm.
+    ///
+    /// ============================
+    /// State transition graph:
+    /// ============================
+    ///
+    /// (start) --> BufferingLeft
+    /// ----------------------------
+    /// BufferingLeft → FetchingRight
+    ///
+    /// FetchingRight → ProbeRight (if right batch available)
+    /// FetchingRight → EmitLeftUnmatched (if right exhausted)
+    ///
+    /// ProbeRight → ProbeRight (next left row or after yielding output)
+    /// ProbeRight → EmitRightUnmatched (for special join types like right join)
+    /// ProbeRight → FetchingRight (done with the current right batch)
+    ///
+    /// EmitRightUnmatched → FetchingRight
+    ///
+    /// EmitLeftUnmatched → EmitLeftUnmatched (only process 1 chunk for each
+    /// iteration)
+    /// EmitLeftUnmatched → Done (if finished)
+    /// ----------------------------
+    /// Done → (end)
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
-        let build_timer = self.join_metrics.build_time.timer();
-        // build hash table from left (build) side, if not yet done
-        self.left_data = Some(ready!(self.inner_table.get_shared(cx))?);
-        build_timer.done();
+    ) -> Poll<Option<Self::Item>> {
+        loop {
+            match self.state {
+                // # NLJState transitions
+                // --> FetchingRight
+                // This state will prepare the left side batches, next state
+                // `FetchingRight` is responsible for preparing a single probe
+                // side batch, before start joining.
+                NLJState::BufferingLeft => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+                    // inside `collect_left_input` (the routine to buffer build
+                    // -side batches), related metrics except build time will be
+                    // updated.
+                    // stop on drop
+                    let build_metric = self.metrics.join_metrics.build_time.clone();
+                    let _build_timer = build_metric.timer();
+
+                    match self.handle_buffering_left(cx) {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(poll) => return poll,
+                    }
+                }
+
+                // # NLJState transitions:
+                // 1. --> ProbeRight
+                //    Start processing the join for the newly fetched right
+                //    batch.
+                // 2. --> EmitLeftUnmatched: When the right side input is exhausted, (maybe) emit
+                //    unmatched left side rows.
+                //
+                // After fetching a new batch from the right side, it will
+                // process all rows from the buffered left data:
+                // ```text
+                // for batch in right_side:
+                //     for row in left_buffer:
+                //         join(batch, row)
+                // ```
+                // Note: the implementation does this step incrementally,
+                // instead of materializing all intermediate Cartesian products
+                // at once in memory.
+                //
+                // So after the right side input is exhausted, the join phase
+                // for the current buffered left data is finished. We can go to
+                // the next `EmitLeftUnmatched` phase to check if there is any
+                // special handling (e.g., in cases like left join).
+                NLJState::FetchingRight => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+                    // stop on drop
+                    let join_metric = self.metrics.join_metrics.join_time.clone();
+                    let _join_timer = join_metric.timer();
+
+                    match self.handle_fetching_right(cx) {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(poll) => return poll,
+                    }
+                }
+
+                // NLJState transitions:
+                // 1. --> ProbeRight(1)
+                //    If we have already buffered enough output to yield, it
+                //    will first give back control to the parent state machine,
+                //    then resume at the same place.
+                // 2. --> ProbeRight(2)
+                //    After probing one right batch, and evaluating the
+                //    join filter on (left-row x right-batch), it will advance
+                //    to the next left row, then re-enter the current state and
+                //    continue joining.
+                // 3. --> FetchRight
+                //    After it has done with the current right batch (to join
+                //    with all rows in the left buffer), it will go to
+                //    FetchRight state to check what to do next.
+                NLJState::ProbeRight => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+
+                    // stop on drop
+                    let join_metric = self.metrics.join_metrics.join_time.clone();
+                    let _join_timer = join_metric.timer();
+
+                    match self.handle_probe_right() {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(poll) => {
+                            return self.metrics.join_metrics.baseline.record_poll(poll);
+                        }
+                    }
+                }
+
+                // In the `current_right_batch_matched` bitmap, all trues mean
+                // it has been output by the join. In this state we have to
+                // output unmatched rows for current right batch (with null
+                // padding for left relation)
+                // Precondition: we have checked the join type so that it's
+                // possible to output right unmatched (e.g. it's right join)
+                NLJState::EmitRightUnmatched => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+
+                    // stop on drop
+                    let join_metric = self.metrics.join_metrics.join_time.clone();
+                    let _join_timer = join_metric.timer();
+
+                    match self.handle_emit_right_unmatched() {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(poll) => {
+                            return self.metrics.join_metrics.baseline.record_poll(poll);
+                        }
+                    }
+                }
+
+                // NLJState transitions:
+                // 1. --> EmitLeftUnmatched(1)
+                //    If we have already buffered enough output to yield, it
+                //    will first give back control to the parent state machine,
+                //    then resume at the same place.
+                // 2. --> EmitLeftUnmatched(2)
+                //    After processing some unmatched rows, it will re-enter
+                //    the same state, to check if there are any more final
+                //    results to output.
+                // 3. --> Done
+                //    It has processed all data, go to the final state and ready
+                //    to exit.
+                //
+                // TODO: For memory-limited case, go back to `BufferingLeft`
+                // state again.
+                NLJState::EmitLeftUnmatched => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+
+                    // stop on drop
+                    let join_metric = self.metrics.join_metrics.join_time.clone();
+                    let _join_timer = join_metric.timer();
+
+                    match self.handle_emit_left_unmatched() {
+                        ControlFlow::Continue(()) => continue,
+                        ControlFlow::Break(poll) => {
+                            return self.metrics.join_metrics.baseline.record_poll(poll);
+                        }
+                    }
+                }
+
+                // The final state and the exit point
+                NLJState::Done => {
+                    debug!("[NLJState] Entering: {:?}", self.state);
+
+                    // stop on drop
+                    let join_metric = self.metrics.join_metrics.join_time.clone();
+                    let _join_timer = join_metric.timer();
+                    // counting it in join timer due to there might be some
+                    // final resout batches to output in this state
+
+                    let poll = self.handle_done();
+                    return self.metrics.join_metrics.baseline.record_poll(poll);
+                }
+            }
+        }
+    }
+}
 
-        self.state = NestedLoopJoinStreamState::FetchProbeBatch;
+impl RecordBatchStream for NestedLoopJoinStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.output_schema)
+    }
+}
 
-        Poll::Ready(Ok(StatefulStreamResult::Continue))
+impl NestedLoopJoinStream {
+    #[expect(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        schema: Arc<Schema>,
+        filter: Option<JoinFilter>,
+        join_type: JoinType,
+        right_data: SendableRecordBatchStream,
+        left_data: OnceFut<JoinLeftData>,
+        column_indices: Vec<ColumnIndex>,
+        metrics: NestedLoopJoinMetrics,
+        batch_size: usize,
+    ) -> Self {
+        Self {
+            output_schema: Arc::clone(&schema),
+            join_filter: filter,
+            join_type,
+            right_data,
+            column_indices,
+            left_data,
+            metrics,
+            buffered_left_data: None,
+            output_buffer: Box::new(BatchCoalescer::new(schema, batch_size)),
+            batch_size,
+            current_right_batch: None,
+            current_right_batch_matched: None,
+            state: NLJState::BufferingLeft,
+            left_probe_idx: 0,
+            left_emit_idx: 0,
+            left_exhausted: false,
+            left_buffered_in_one_pass: true,
+            handled_empty_output: false,
+            should_track_unmatched_right: need_produce_right_in_final(join_type),
+        }
     }
 
-    /// Fetches next batch from probe-side
-    ///
-    /// If a non-empty batch has been fetched, updates state to
-    /// `ProcessProbeBatchState`, otherwise updates state to `ExhaustedProbeSide`.
-    fn fetch_probe_batch(
+    // ==== State handler functions ====
+
+    /// Handle BufferingLeft state - prepare left side batches
+    fn handle_buffering_left(
         &mut self,
         cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
-        match ready!(self.outer_table.poll_next_unpin(cx)) {
-            None => {
-                self.state = NestedLoopJoinStreamState::ExhaustedProbeSide;
+    ) -> ControlFlow<Poll<Option<Result<RecordBatch>>>> {
+        match self.left_data.get_shared(cx) {
+            Poll::Ready(Ok(left_data)) => {
+                self.buffered_left_data = Some(left_data);
+                // TODO: implement memory-limited case
+                self.left_exhausted = true;
+                self.state = NLJState::FetchingRight;
+                // Continue to next state immediately
+                ControlFlow::Continue(())
             }
-            Some(Ok(right_batch)) => {
-                self.state = NestedLoopJoinStreamState::ProcessProbeBatch(right_batch);
+            Poll::Ready(Err(e)) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+            Poll::Pending => ControlFlow::Break(Poll::Pending),
+        }
+    }
+
+    /// Handle FetchingRight state - fetch next right batch and prepare for processing
+    fn handle_fetching_right(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> ControlFlow<Poll<Option<Result<RecordBatch>>>> {
+        match self.right_data.poll_next_unpin(cx) {
+            Poll::Ready(result) => match result {
+                Some(Ok(right_batch)) => {
+                    // Update metrics
+                    let right_batch_size = right_batch.num_rows();
+                    self.metrics.join_metrics.input_rows.add(right_batch_size);
+                    self.metrics.join_metrics.input_batches.add(1);
+
+                    // Skip the empty batch
+                    if right_batch_size == 0 {
+                        return ControlFlow::Continue(());
+                    }
+
+                    self.current_right_batch = Some(right_batch);
+
+                    // Prepare right bitmap
+                    if self.should_track_unmatched_right {
+                        let zeroed_buf = BooleanBuffer::new_unset(right_batch_size);
+                        self.current_right_batch_matched =
+                            Some(BooleanArray::new(zeroed_buf, None));
+                    }
+
+                    self.left_probe_idx = 0;
+                    self.state = NLJState::ProbeRight;
+                    ControlFlow::Continue(())
+                }
+                Some(Err(e)) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+                None => {
+                    // Right stream exhausted
+                    self.state = NLJState::EmitLeftUnmatched;
+                    ControlFlow::Continue(())
+                }
+            },
+            Poll::Pending => ControlFlow::Break(Poll::Pending),
+        }
+    }
+
+    /// Handle ProbeRight state - process current probe batch
+    fn handle_probe_right(&mut self) -> ControlFlow<Poll<Option<Result<RecordBatch>>>> {
+        // Return any completed batches first
+        if let Some(poll) = self.maybe_flush_ready_batch() {
+            return ControlFlow::Break(poll);
+        }
+
+        // Process current probe state
+        match self.process_probe_batch() {
+            // State unchanged (ProbeRight)
+            // Continue probing until we have done joining the
+            // current right batch with all buffered left rows.
+            Ok(true) => ControlFlow::Continue(()),
+            // To next FetchRightState
+            // We have finished joining
+            // (cur_right_batch x buffered_left_batches)
+            Ok(false) => {
+                // Left exhausted, transition to FetchingRight
+                self.left_probe_idx = 0;
+
+                // Selectivity Metric: Update total possibilities for the batch (left_rows * right_rows)
+                // If memory-limited execution is implemented, this logic must be updated accordingly.
+                if let (Ok(left_data), Some(right_batch)) =
+                    (self.get_left_data(), self.current_right_batch.as_ref())
+                {
+                    let left_rows = left_data.batch().num_rows();
+                    let right_rows = right_batch.num_rows();
+                    self.metrics.selectivity.add_total(left_rows * right_rows);
+                }
+
+                if self.should_track_unmatched_right {
+                    debug_assert!(
+                        self.current_right_batch_matched.is_some(),
+                        "If it's required to track matched rows in the right input, the right bitmap must be present"
+                    );
+                    self.state = NLJState::EmitRightUnmatched;
+                } else {
+                    self.current_right_batch = None;
+                    self.state = NLJState::FetchingRight;
+                }
+                ControlFlow::Continue(())
             }
-            Some(Err(err)) => return Poll::Ready(Err(err)),
-        };
+            Err(e) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+        }
+    }
+
+    /// Handle EmitRightUnmatched state - emit unmatched right rows
+    fn handle_emit_right_unmatched(
+        &mut self,
+    ) -> ControlFlow<Poll<Option<Result<RecordBatch>>>> {
+        // Return any completed batches first
+        if let Some(poll) = self.maybe_flush_ready_batch() {
+            return ControlFlow::Break(poll);
+        }
 
-        Poll::Ready(Ok(StatefulStreamResult::Continue))
+        debug_assert!(
+            self.current_right_batch_matched.is_some()
+                && self.current_right_batch.is_some(),
+            "This state is yielding output for unmatched rows in the current right batch, so both the right batch and the bitmap must be present"
+        );
+        // Construct the result batch for unmatched right rows using a utility function
+        match self.process_right_unmatched() {
+            Ok(Some(batch)) => {
+                match self.output_buffer.push_batch(batch) {
+                    Ok(()) => {
+                        // Processed all in one pass
+                        // cleared inside `process_right_unmatched`
+                        debug_assert!(self.current_right_batch.is_none());
+                        self.state = NLJState::FetchingRight;
+                        ControlFlow::Continue(())
+                    }
+                    Err(e) => ControlFlow::Break(Poll::Ready(Some(arrow_err!(e)))),
+                }
+            }
+            Ok(None) => {
+                // Processed all in one pass
+                // cleared inside `process_right_unmatched`
+                debug_assert!(self.current_right_batch.is_none());
+                self.state = NLJState::FetchingRight;
+                ControlFlow::Continue(())
+            }
+            Err(e) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+        }
     }
 
-    /// Joins current probe batch with build-side data and produces batch with
-    /// matched output, updates state to `FetchProbeBatch`.
-    fn process_probe_batch(
+    /// Handle EmitLeftUnmatched state - emit unmatched left rows
+    fn handle_emit_left_unmatched(
         &mut self,
-    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
-        let Some(left_data) = self.left_data.clone() else {
-            return internal_err!(
-                "Expected left_data to be Some in ProcessProbeBatch state"
+    ) -> ControlFlow<Poll<Option<Result<RecordBatch>>>> {
+        // Return any completed batches first
+        if let Some(poll) = self.maybe_flush_ready_batch() {
+            return ControlFlow::Break(poll);
+        }
+
+        // Process current unmatched state
+        match self.process_left_unmatched() {
+            // State unchanged (EmitLeftUnmatched)
+            // Continue processing until we have processed all unmatched rows
+            Ok(true) => ControlFlow::Continue(()),
+            // To Done state
+            // We have finished processing all unmatched rows
+            Ok(false) => match self.output_buffer.finish_buffered_batch() {
+                Ok(()) => {
+                    self.state = NLJState::Done;
+                    ControlFlow::Continue(())
+                }
+                Err(e) => ControlFlow::Break(Poll::Ready(Some(arrow_err!(e)))),
+            },
+            Err(e) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+        }
+    }
+
+    /// Handle Done state - final state processing
+    fn handle_done(&mut self) -> Poll<Option<Result<RecordBatch>>> {
+        // Return any remaining completed batches before final termination
+        if let Some(poll) = self.maybe_flush_ready_batch() {
+            return poll;
+        }
+
+        // HACK for the doc test in https://github.com/apache/datafusion/blob/main/datafusion/core/src/dataframe/mod.rs#L1265
+        // If this operator directly return `Poll::Ready(None)`
+        // for empty result, the final result will become an empty
+        // batch with empty schema, however the expected result
+        // should be with the expected schema for this operator
+        if !self.handled_empty_output {
+            let zero_count = Count::new();
+            if *self.metrics.join_metrics.baseline.output_rows() == zero_count {
+                let empty_batch = RecordBatch::new_empty(Arc::clone(&self.output_schema));
+                self.handled_empty_output = true;
+                return Poll::Ready(Some(Ok(empty_batch)));
+            }
+        }
+
+        Poll::Ready(None)
+    }
+
+    // ==== Core logic handling for each state ====
+
+    /// Returns bool to indicate should it continue probing
+    /// true -> continue in the same ProbeRight state
+    /// false -> It has done with the (buffered_left x cur_right_batch), go to
+    /// next state (ProbeRight)
+    fn process_probe_batch(&mut self) -> Result<bool> {
+        let left_data = Arc::clone(self.get_left_data()?);
+        let right_batch = self
+            .current_right_batch
+            .as_ref()
+            .ok_or_else(|| internal_datafusion_err!("Right batch should be available"))?
+            .clone();
+
+        // stop probing, the caller will go to the next state
+        if self.left_probe_idx >= left_data.batch().num_rows() {
+            return Ok(false);
+        }
+
+        // ========
+        // Join (l_row x right_batch)
+        // and push the result into output_buffer
+        // ========
+
+        // Special case:
+        // When the right batch is very small, join with multiple left rows at once,
+        //
+        // The regular implementation is not efficient if the plan's right child is
+        // very small (e.g. 1 row total), because inside the inner loop of NLJ, it's
+        // handling one input right batch at once, if it's not large enough, the
+        // overheads like filter evaluation can't be amortized through vectorization.
+        debug_assert_ne!(
+            right_batch.num_rows(),
+            0,
+            "When fetching the right batch, empty batches will be skipped"
+        );
+
+        let l_row_cnt_ratio = self.batch_size / right_batch.num_rows();
+        if l_row_cnt_ratio > 10 {
+            // Calculate max left rows to handle at once. This operator tries to handle
+            // up to `datafusion.execution.batch_size` rows at once in the intermediate
+            // batch.
+            let l_row_count = std::cmp::min(
+                l_row_cnt_ratio,
+                left_data.batch().num_rows() - self.left_probe_idx,
             );
-        };
-        let visited_left_side = left_data.bitmap();
-        let batch = self.state.try_as_process_probe_batch()?;
-
-        match self.batch_transformer.next() {
-            None => {
-                // Setting up timer & updating input metrics
-                self.join_metrics.input_batches.add(1);
-                self.join_metrics.input_rows.add(batch.num_rows());
-                let timer = self.join_metrics.join_time.timer();
-
-                let result = join_left_and_right_batch(
-                    left_data.batch(),
-                    batch,
-                    self.join_type,
-                    self.filter.as_ref(),
-                    &self.column_indices,
-                    &self.schema,
-                    visited_left_side,
-                    &mut self.indices_cache,
-                    self.right_side_ordered,
-                );
-                timer.done();
 
-                self.batch_transformer.set_batch(result?);
-                Ok(StatefulStreamResult::Continue)
+            debug_assert!(
+                l_row_count != 0,
+                "This function should only be entered when there are remaining left rows to process"
+            );
+            let joined_batch = self.process_left_range_join(
+                &left_data,
+                &right_batch,
+                self.left_probe_idx,
+                l_row_count,
+            )?;
+
+            if let Some(batch) = joined_batch {
+                self.output_buffer.push_batch(batch)?;
             }
-            Some((batch, last)) => {
-                if last {
-                    self.state = NestedLoopJoinStreamState::FetchProbeBatch;
+
+            self.left_probe_idx += l_row_count;
+
+            return Ok(true);
+        }
+
+        let l_idx = self.left_probe_idx;
+        let joined_batch =
+            self.process_single_left_row_join(&left_data, &right_batch, l_idx)?;
+
+        if let Some(batch) = joined_batch {
+            self.output_buffer.push_batch(batch)?;
+        }
+
+        // ==== Prepare for the next iteration ====
+
+        // Advance left cursor
+        self.left_probe_idx += 1;
+
+        // Return true to continue probing
+        Ok(true)
+    }
+
+    /// Process [l_start_index, l_start_index + l_count) JOIN right_batch
+    /// Returns a RecordBatch containing the join results (None if empty)
+    ///
+    /// Side Effect: If the join type requires, left or right side matched bitmap
+    /// will be set for matched indices.
+    fn process_left_range_join(
+        &mut self,
+        left_data: &JoinLeftData,
+        right_batch: &RecordBatch,
+        l_start_index: usize,
+        l_row_count: usize,
+    ) -> Result<Option<RecordBatch>> {
+        // Construct the Cartesian product between the specified range of left rows
+        // and the entire right_batch. First, it calculates the index vectors, then
+        // materializes the intermediate batch, and finally applies the join filter
+        // to it.
+        // -----------------------------------------------------------
+        let right_rows = right_batch.num_rows();
+        let total_rows = l_row_count * right_rows;
+
+        // Build index arrays for cartesian product: left_range X right_batch
+        let left_indices: UInt32Array =
+            UInt32Array::from_iter_values((0..l_row_count).flat_map(|i| {
+                std::iter::repeat_n((l_start_index + i) as u32, right_rows)
+            }));
+        let right_indices: UInt32Array = UInt32Array::from_iter_values(
+            (0..l_row_count).flat_map(|_| 0..right_rows as u32),
+        );
+
+        debug_assert!(
+            left_indices.len() == right_indices.len()
+                && right_indices.len() == total_rows,
+            "The length or cartesian product should be (left_size * right_size)",
+        );
+
+        // Evaluate the join filter (if any) over an intermediate batch built
+        // using the filter's own schema/column indices.
+        let bitmap_combined = if let Some(filter) = &self.join_filter {
+            // Build the intermediate batch for filter evaluation
+            let intermediate_batch = if filter.schema.fields().is_empty() {
+                // Constant predicate (e.g., TRUE/FALSE). Use an empty schema with row_count
+                create_record_batch_with_empty_schema(
+                    Arc::new((*filter.schema).clone()),
+                    total_rows,
+                )?
+            } else {
+                let mut filter_columns: Vec<Arc<dyn Array>> =
+                    Vec::with_capacity(filter.column_indices().len());
+                for column_index in filter.column_indices() {
+                    let array = if column_index.side == JoinSide::Left {
+                        let col = left_data.batch().column(column_index.index);
+                        take(col.as_ref(), &left_indices, None)?
+                    } else {
+                        let col = right_batch.column(column_index.index);
+                        take(col.as_ref(), &right_indices, None)?
+                    };
+                    filter_columns.push(array);
                 }
 
-                self.join_metrics.output_batches.add(1);
-                self.join_metrics.output_rows.add(batch.num_rows());
-                Ok(StatefulStreamResult::Ready(Some(batch)))
+                RecordBatch::try_new(Arc::new((*filter.schema).clone()), filter_columns)?
+            };
+
+            let filter_result = filter
+                .expression()
+                .evaluate(&intermediate_batch)?
+                .into_array(intermediate_batch.num_rows())?;
+            let filter_arr = as_boolean_array(&filter_result)?;
+
+            // Combine with null bitmap to get a unified mask
+            boolean_mask_from_filter(filter_arr)
+        } else {
+            // No filter: all pairs match
+            BooleanArray::from(vec![true; total_rows])
+        };
+
+        // Update the global left or right bitmap for matched indices
+        // -----------------------------------------------------------
+
+        // None means we don't have to update left bitmap for this join type
+        let mut left_bitmap = if need_produce_result_in_final(self.join_type) {
+            Some(left_data.bitmap().lock())
+        } else {
+            None
+        };
+
+        // 'local' meaning: we want to collect 'is_matched' flag for the current
+        // right batch, after it has joining all of the left buffer, here it's only
+        // the partial result for joining given left range
+        let mut local_right_bitmap = if self.should_track_unmatched_right {
+            let mut current_right_batch_bitmap = BooleanBufferBuilder::new(right_rows);
+            // Ensure builder has logical length so set_bit is in-bounds
+            current_right_batch_bitmap.append_n(right_rows, false);
+            Some(current_right_batch_bitmap)
+        } else {
+            None
+        };
+
+        // Set the matched bit for left and right side bitmap
+        for (i, is_matched) in bitmap_combined.iter().enumerate() {
+            let is_matched = is_matched.ok_or_else(|| {
+                internal_datafusion_err!("Must be Some after the previous combining step")
+            })?;
+
+            let l_index = l_start_index + i / right_rows;
+            let r_index = i % right_rows;
+
+            if let Some(bitmap) = left_bitmap.as_mut()
+                && is_matched
+            {
+                // Map local index back to absolute left index within the batch
+                bitmap.set_bit(l_index, true);
             }
+
+            if let Some(bitmap) = local_right_bitmap.as_mut()
+                && is_matched
+            {
+                bitmap.set_bit(r_index, true);
+            }
+        }
+
+        // Apply the local right bitmap to the global bitmap
+        if self.should_track_unmatched_right {
+            // Remember to put it back after update
+            let global_right_bitmap =
+                std::mem::take(&mut self.current_right_batch_matched).ok_or_else(
+                    || internal_datafusion_err!("right batch's bitmap should be present"),
+                )?;
+            let (buf, nulls) = global_right_bitmap.into_parts();
+            debug_assert!(nulls.is_none());
+
+            let current_right_bitmap = local_right_bitmap
+                .ok_or_else(|| {
+                    internal_datafusion_err!(
+                        "Should be Some if the current join type requires right bitmap"
+                    )
+                })?
+                .finish();
+            let updated_global_right_bitmap = buf.bitor(&current_right_bitmap);
+
+            self.current_right_batch_matched =
+                Some(BooleanArray::new(updated_global_right_bitmap, None));
+        }
+
+        // For the following join types: only bitmaps are updated; do not emit rows now
+        if matches!(
+            self.join_type,
+            JoinType::LeftAnti
+                | JoinType::LeftSemi
+                | JoinType::LeftMark
+                | JoinType::RightAnti
+                | JoinType::RightMark
+                | JoinType::RightSemi
+        ) {
+            return Ok(None);
+        }
+
+        // Build the projected output batch (using output schema/column_indices),
+        // then apply the bitmap filter to it.
+        if self.output_schema.fields().is_empty() {
+            // Empty projection: only row count matters
+            let row_count = bitmap_combined.true_count();
+            return Ok(Some(create_record_batch_with_empty_schema(
+                Arc::clone(&self.output_schema),
+                row_count,
+            )?));
+        }
+
+        let mut out_columns: Vec<Arc<dyn Array>> =
+            Vec::with_capacity(self.output_schema.fields().len());
+        for column_index in &self.column_indices {
+            let array = if column_index.side == JoinSide::Left {
+                let col = left_data.batch().column(column_index.index);
+                take(col.as_ref(), &left_indices, None)?
+            } else {
+                let col = right_batch.column(column_index.index);
+                take(col.as_ref(), &right_indices, None)?
+            };
+            out_columns.push(array);
         }
+        let pre_filtered =
+            RecordBatch::try_new(Arc::clone(&self.output_schema), out_columns)?;
+        let filtered = filter_record_batch(&pre_filtered, &bitmap_combined)?;
+        Ok(Some(filtered))
     }
 
-    /// Processes unmatched build-side rows for certain join types and produces
-    /// output batch, updates state to `Completed`.
-    fn process_unmatched_build_batch(
+    /// Process a single left row join with the current right batch.
+    /// Returns a RecordBatch containing the join results (None if empty)
+    ///
+    /// Side Effect: If the join type requires, left or right side matched bitmap
+    /// will be set for matched indices.
+    fn process_single_left_row_join(
         &mut self,
-    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
-        let Some(left_data) = self.left_data.clone() else {
-            return internal_err!(
-                "Expected left_data to be Some in ExhaustedProbeSide state"
-            );
+        left_data: &JoinLeftData,
+        right_batch: &RecordBatch,
+        l_index: usize,
+    ) -> Result<Option<RecordBatch>> {
+        let right_row_count = right_batch.num_rows();
+        if right_row_count == 0 {
+            return Ok(None);
+        }
+
+        let cur_right_bitmap = if let Some(filter) = &self.join_filter {
+            apply_filter_to_row_join_batch(
+                left_data.batch(),
+                l_index,
+                right_batch,
+                filter,
+            )?
+        } else {
+            BooleanArray::from(vec![true; right_row_count])
         };
-        let visited_left_side = left_data.bitmap();
-        if need_produce_result_in_final(self.join_type) {
-            // At this stage `visited_left_side` won't be updated, so it's
-            // safe to report about probe completion.
-            //
-            // Setting `is_exhausted` / returning None will prevent from
-            // multiple calls of `report_probe_completed()`
-            if !left_data.report_probe_completed() {
-                self.state = NestedLoopJoinStreamState::Completed;
-                return Ok(StatefulStreamResult::Ready(None));
-            };
 
-            // Only setting up timer, input is exhausted
-            let timer = self.join_metrics.join_time.timer();
-            // use the global left bitmap to produce the left indices and right indices
-            let (left_side, right_side) =
-                get_final_indices_from_shared_bitmap(visited_left_side, self.join_type);
-            let empty_right_batch = RecordBatch::new_empty(self.outer_table.schema());
-            // use the left and right indices to produce the batch result
-            let result = build_batch_from_indices(
-                &self.schema,
+        self.update_matched_bitmap(l_index, &cur_right_bitmap)?;
+
+        // For the following join types: here we only have to set the left/right
+        // bitmap, and no need to output result
+        if matches!(
+            self.join_type,
+            JoinType::LeftAnti
+                | JoinType::LeftSemi
+                | JoinType::LeftMark
+                | JoinType::RightAnti
+                | JoinType::RightMark
+                | JoinType::RightSemi
+        ) {
+            return Ok(None);
+        }
+
+        if cur_right_bitmap.true_count() == 0 {
+            // If none of the pairs has passed the join predicate/filter
+            Ok(None)
+        } else {
+            // Use the optimized approach similar to build_intermediate_batch_for_single_left_row
+            let join_batch = build_row_join_batch(
+                &self.output_schema,
                 left_data.batch(),
-                &empty_right_batch,
-                &left_side,
-                &right_side,
+                l_index,
+                right_batch,
+                Some(cur_right_bitmap),
                 &self.column_indices,
                 JoinSide::Left,
+            )?;
+            Ok(join_batch)
+        }
+    }
+
+    /// Returns bool to indicate should it continue processing unmatched rows
+    /// true -> continue in the same EmitLeftUnmatched state
+    /// false -> next state (Done)
+    fn process_left_unmatched(&mut self) -> Result<bool> {
+        let left_data = self.get_left_data()?;
+        let left_batch = left_data.batch();
+
+        // ========
+        // Check early return conditions
+        // ========
+
+        // Early return if join type can't have unmatched rows
+        let join_type_no_produce_left = !need_produce_result_in_final(self.join_type);
+        // Early return if another thread is already processing unmatched rows
+        let handled_by_other_partition =
+            self.left_emit_idx == 0 && !left_data.report_probe_completed();
+        // Stop processing unmatched rows, the caller will go to the next state
+        let finished = self.left_emit_idx >= left_batch.num_rows();
+
+        if join_type_no_produce_left || handled_by_other_partition || finished {
+            return Ok(false);
+        }
+
+        // ========
+        // Process unmatched rows and push the result into output_buffer
+        // Each time, the number to process is up to batch size
+        // ========
+        let start_idx = self.left_emit_idx;
+        let end_idx = std::cmp::min(start_idx + self.batch_size, left_batch.num_rows());
+
+        if let Some(batch) =
+            self.process_left_unmatched_range(left_data, start_idx, end_idx)?
+        {
+            self.output_buffer.push_batch(batch)?;
+        }
+
+        // ==== Prepare for the next iteration ====
+        self.left_emit_idx = end_idx;
+
+        // Return true to continue processing unmatched rows
+        Ok(true)
+    }
+
+    /// Process unmatched rows from the left data within the specified range.
+    /// Returns a RecordBatch containing the unmatched rows (None if empty).
+    ///
+    /// # Arguments
+    /// * `left_data` - The left side data containing the batch and bitmap
+    /// * `start_idx` - Start index (inclusive) of the range to process
+    /// * `end_idx` - End index (exclusive) of the range to process
+    ///
+    /// # Safety
+    /// The caller is responsible for ensuring that `start_idx` and `end_idx` are
+    /// within valid bounds of the left batch. This function does not perform
+    /// bounds checking.
+    fn process_left_unmatched_range(
+        &self,
+        left_data: &JoinLeftData,
+        start_idx: usize,
+        end_idx: usize,
+    ) -> Result<Option<RecordBatch>> {
+        if start_idx == end_idx {
+            return Ok(None);
+        }
+
+        // Slice both left batch, and bitmap to range [start_idx, end_idx)
+        // The range is bit index (not byte)
+        let left_batch = left_data.batch();
+        let left_batch_sliced = left_batch.slice(start_idx, end_idx - start_idx);
+
+        // Can this be more efficient?
+        let mut bitmap_sliced = BooleanBufferBuilder::new(end_idx - start_idx);
+        bitmap_sliced.append_n(end_idx - start_idx, false);
+        let bitmap = left_data.bitmap().lock();
+        for i in start_idx..end_idx {
+            assert!(
+                i - start_idx < bitmap_sliced.capacity(),
+                "DBG: {start_idx}, {end_idx}"
             );
-            self.state = NestedLoopJoinStreamState::Completed;
+            bitmap_sliced.set_bit(i - start_idx, bitmap.get_bit(i));
+        }
+        let bitmap_sliced = BooleanArray::new(bitmap_sliced.finish(), None);
+
+        let right_schema = self.right_data.schema();
+        build_unmatched_batch(
+            &self.output_schema,
+            &left_batch_sliced,
+            bitmap_sliced,
+            &right_schema,
+            &self.column_indices,
+            self.join_type,
+            JoinSide::Left,
+        )
+    }
 
-            // Recording time
-            if result.is_ok() {
-                timer.done();
-            }
+    /// Process unmatched rows from the current right batch and reset the bitmap.
+    /// Returns a RecordBatch containing the unmatched right rows (None if empty).
+    fn process_right_unmatched(&mut self) -> Result<Option<RecordBatch>> {
+        // ==== Take current right batch and its bitmap ====
+        let right_batch_bitmap: BooleanArray =
+            std::mem::take(&mut self.current_right_batch_matched).ok_or_else(|| {
+                internal_datafusion_err!("right bitmap should be available")
+            })?;
+
+        let right_batch = self.current_right_batch.take();
+        let cur_right_batch = unwrap_or_internal_err!(right_batch);
+
+        let left_data = self.get_left_data()?;
+        let left_schema = left_data.batch().schema();
+
+        let res = build_unmatched_batch(
+            &self.output_schema,
+            &cur_right_batch,
+            right_batch_bitmap,
+            &left_schema,
+            &self.column_indices,
+            self.join_type,
+            JoinSide::Right,
+        );
 
-            Ok(StatefulStreamResult::Ready(Some(result?)))
-        } else {
-            // end of the join loop
-            self.state = NestedLoopJoinStreamState::Completed;
-            Ok(StatefulStreamResult::Ready(None))
+        // ==== Clean-up ====
+        self.current_right_batch_matched = None;
+
+        res
+    }
+
+    // ==== Utilities ====
+
+    /// Get the build-side data of the left input, errors if it's None
+    fn get_left_data(&self) -> Result<&Arc<JoinLeftData>> {
+        self.buffered_left_data
+            .as_ref()
+            .ok_or_else(|| internal_datafusion_err!("LeftData should be available"))
+    }
+
+    /// Flush the `output_buffer` if there are batches ready to output
+    /// None if no result batch ready.
+    fn maybe_flush_ready_batch(&mut self) -> Option<Poll<Option<Result<RecordBatch>>>> {
+        if self.output_buffer.has_completed_batch()
+            && let Some(batch) = self.output_buffer.next_completed_batch()
+        {
+            // Update output rows for selectivity metric
+            let output_rows = batch.num_rows();
+            self.metrics.selectivity.add_part(output_rows);
+
+            return Some(Poll::Ready(Some(Ok(batch))));
+        }
+
+        None
+    }
+
+    /// After joining (l_index@left_buffer x current_right_batch), it will result
+    /// in a bitmap (the same length as current_right_batch) as the join match
+    /// result. Use this bitmap to update the global bitmap, for special join
+    /// types like full joins.
+    ///
+    /// Example:
+    /// After joining l_index=1 (1-indexed row in the left buffer), and the
+    /// current right batch with 3 elements, this function will be called with
+    /// arguments: l_index = 1, r_matched = [false, false, true]
+    /// - If the join type is FullJoin, the 1-index in the left bitmap will be
+    ///   set to true, and also the right bitmap will be bitwise-ORed with the
+    ///   input r_matched bitmap.
+    /// - For join types that don't require output unmatched rows, this
+    ///   function can be a no-op. For inner joins, this function is a no-op; for left
+    ///   joins, only the left bitmap may be updated.
+    fn update_matched_bitmap(
+        &mut self,
+        l_index: usize,
+        r_matched_bitmap: &BooleanArray,
+    ) -> Result<()> {
+        let left_data = self.get_left_data()?;
+
+        // number of successfully joined pairs from (l_index x cur_right_batch)
+        let joined_len = r_matched_bitmap.true_count();
+
+        // 1. Maybe update the left bitmap
+        if need_produce_result_in_final(self.join_type) && (joined_len > 0) {
+            let mut bitmap = left_data.bitmap().lock();
+            bitmap.set_bit(l_index, true);
+        }
+
+        // 2. Maybe updateh the right bitmap
+        if self.should_track_unmatched_right {
+            debug_assert!(self.current_right_batch_matched.is_some());
+            // after bit-wise or, it will be put back
+            let right_bitmap = std::mem::take(&mut self.current_right_batch_matched)
+                .ok_or_else(|| {
+                    internal_datafusion_err!("right batch's bitmap should be present")
+                })?;
+            let (buf, nulls) = right_bitmap.into_parts();
+            debug_assert!(nulls.is_none());
+            let updated_right_bitmap = buf.bitor(r_matched_bitmap.values());
+
+            self.current_right_batch_matched =
+                Some(BooleanArray::new(updated_right_bitmap, None));
         }
+
+        Ok(())
     }
 }
 
-#[allow(clippy::too_many_arguments)]
-fn join_left_and_right_batch(
+// ==== Utilities ====
+
+/// Apply the join filter between:
+/// (l_index th row in left buffer) x (right batch)
+/// Returns a bitmap, with successfully joined indices set to true
+fn apply_filter_to_row_join_batch(
     left_batch: &RecordBatch,
+    l_index: usize,
     right_batch: &RecordBatch,
-    join_type: JoinType,
-    filter: Option<&JoinFilter>,
-    column_indices: &[ColumnIndex],
-    schema: &Schema,
-    visited_left_side: &SharedBitmapBuilder,
-    indices_cache: &mut (UInt64Array, UInt32Array),
-    right_side_ordered: bool,
-) -> Result<RecordBatch> {
-    let (left_side, right_side) =
-        build_join_indices(left_batch, right_batch, filter, indices_cache).map_err(
-            |e| {
-                exec_datafusion_err!(
-                    "Fail to build join indices in NestedLoopJoinExec, error: {e}"
-                )
-            },
-        )?;
+    filter: &JoinFilter,
+) -> Result<BooleanArray> {
+    debug_assert!(left_batch.num_rows() != 0 && right_batch.num_rows() != 0);
+
+    let intermediate_batch = if filter.schema.fields().is_empty() {
+        // If filter is constant (e.g. literal `true`), empty batch can be used
+        // in the later filter step.
+        create_record_batch_with_empty_schema(
+            Arc::new((*filter.schema).clone()),
+            right_batch.num_rows(),
+        )?
+    } else {
+        build_row_join_batch(
+            &filter.schema,
+            left_batch,
+            l_index,
+            right_batch,
+            None,
+            &filter.column_indices,
+            JoinSide::Left,
+        )?
+        .ok_or_else(|| internal_datafusion_err!("This function assume input batch is not empty, so the intermediate batch can't be empty too"))?
+    };
 
-    // set the left bitmap
-    // and only full join need the left bitmap
-    if need_produce_result_in_final(join_type) {
-        let mut bitmap = visited_left_side.lock();
-        left_side.values().iter().for_each(|x| {
-            bitmap.set_bit(*x as usize, true);
-        });
+    let filter_result = filter
+        .expression()
+        .evaluate(&intermediate_batch)?
+        .into_array(intermediate_batch.num_rows())?;
+    let filter_arr = as_boolean_array(&filter_result)?;
+
+    // Convert boolean array with potential nulls into a unified mask bitmap
+    let bitmap_combined = boolean_mask_from_filter(filter_arr);
+
+    Ok(bitmap_combined)
+}
+
+/// Convert a boolean filter array into a unified mask bitmap.
+///
+/// Caution: The filter result is NOT a bitmap; it contains true/false/null values.
+/// For example, `1 < NULL` evaluates to NULL. Therefore, we must combine (AND)
+/// the boolean array with its null bitmap to construct a unified bitmap.
+#[inline]
+fn boolean_mask_from_filter(filter_arr: &BooleanArray) -> BooleanArray {
+    let (values, nulls) = filter_arr.clone().into_parts();
+    match nulls {
+        Some(nulls) => BooleanArray::new(nulls.inner() & &values, None),
+        None => BooleanArray::new(values, None),
     }
-    // adjust the two side indices base on the join type
-    let (left_side, right_side) = adjust_indices_by_join_type(
-        left_side,
-        right_side,
-        0..right_batch.num_rows(),
-        join_type,
-        right_side_ordered,
-    )?;
-
-    build_batch_from_indices(
-        schema,
-        left_batch,
-        right_batch,
-        &left_side,
-        &right_side,
-        column_indices,
-        JoinSide::Left,
-    )
 }
 
-impl<T: BatchTransformer + Unpin + Send> Stream for NestedLoopJoinStream<T> {
-    type Item = Result<RecordBatch>;
+/// This function performs the following steps:
+/// 1. Apply filter to probe-side batch
+/// 2. Broadcast the left row (build_side_batch\[build_side_index\]) to the
+///    filtered probe-side batch
+/// 3. Concat them together according to `col_indices`, and return the result
+///    (None if the result is empty)
+///
+/// Example:
+/// build_side_batch:
+/// a
+/// ----
+/// 1
+/// 2
+/// 3
+///
+/// # 0 index element in the build_side_batch (that is `1`) will be used
+/// build_side_index: 0
+///
+/// probe_side_batch:
+/// b
+/// ----
+/// 10
+/// 20
+/// 30
+/// 40
+///
+/// # After applying it, only index 1 and 3 elements in probe_side_batch will be
+/// # kept
+/// probe_side_filter:
+/// false
+/// true
+/// false
+/// true
+///
+///
+/// # Projections to the build/probe side batch, to construct the output batch
+/// col_indices:
+/// [(left, 0), (right, 0)]
+///
+/// build_side: left
+///
+/// ====
+/// Result batch:
+/// a b
+/// ----
+/// 1 20
+/// 1 40
+fn build_row_join_batch(
+    output_schema: &Schema,
+    build_side_batch: &RecordBatch,
+    build_side_index: usize,
+    probe_side_batch: &RecordBatch,
+    probe_side_filter: Option<BooleanArray>,
+    // See [`NLJStream`] struct's `column_indices` field for more detail
+    col_indices: &[ColumnIndex],
+    // If the build side is left or right, used to interpret the side information
+    // in `col_indices`
+    build_side: JoinSide,
+) -> Result<Option<RecordBatch>> {
+    debug_assert!(build_side != JoinSide::None);
+
+    // TODO(perf): since the output might be projection of right batch, this
+    // filtering step is more efficient to be done inside the column_index loop
+    let filtered_probe_batch = if let Some(filter) = probe_side_filter {
+        &filter_record_batch(probe_side_batch, &filter)?
+    } else {
+        probe_side_batch
+    };
 
-    fn poll_next(
-        mut self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Option<Self::Item>> {
-        self.poll_next_impl(cx)
+    if filtered_probe_batch.num_rows() == 0 {
+        return Ok(None);
+    }
+
+    // Edge case: downstream operator does not require any columns from this NLJ,
+    // so allow an empty projection.
+    // Example:
+    //  SELECT DISTINCT 32 AS col2
+    //  FROM tab0 AS cor0
+    //  LEFT OUTER JOIN tab2 AS cor1
+    //  ON ( NULL ) IS NULL;
+    if output_schema.fields.is_empty() {
+        return Ok(Some(create_record_batch_with_empty_schema(
+            Arc::new(output_schema.clone()),
+            filtered_probe_batch.num_rows(),
+        )?));
+    }
+
+    let mut columns: Vec<Arc<dyn Array>> =
+        Vec::with_capacity(output_schema.fields().len());
+
+    for column_index in col_indices {
+        let array = if column_index.side == build_side {
+            // Broadcast the single build-side row to match the filtered
+            // probe-side batch length
+            let original_left_array = build_side_batch.column(column_index.index);
+
+            // Use `arrow::compute::take` directly for `List(Utf8View)` rather
+            // than going through `ScalarValue::to_array_of_size()`, which
+            // avoids some intermediate allocations.
+            //
+            // In other cases, `to_array_of_size()` is faster.
+            match original_left_array.data_type() {
+                DataType::List(field) | DataType::LargeList(field)
+                    if field.data_type() == &DataType::Utf8View =>
+                {
+                    let indices_iter = std::iter::repeat_n(
+                        build_side_index as u64,
+                        filtered_probe_batch.num_rows(),
+                    );
+                    let indices_array = UInt64Array::from_iter_values(indices_iter);
+                    take(original_left_array.as_ref(), &indices_array, None)?
+                }
+                _ => {
+                    let scalar_value = ScalarValue::try_from_array(
+                        original_left_array.as_ref(),
+                        build_side_index,
+                    )?;
+                    scalar_value.to_array_of_size(filtered_probe_batch.num_rows())?
+                }
+            }
+        } else {
+            // Take the filtered probe-side column using compute::take
+            Arc::clone(filtered_probe_batch.column(column_index.index))
+        };
+
+        columns.push(array);
     }
+
+    Ok(Some(RecordBatch::try_new(
+        Arc::new(output_schema.clone()),
+        columns,
+    )?))
 }
 
-impl<T: BatchTransformer + Unpin + Send> RecordBatchStream for NestedLoopJoinStream<T> {
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
+/// Special case for `PlaceHolderRowExec`
+/// Minimal example:  SELECT 1 WHERE EXISTS (SELECT 1);
+//
+/// # Return
+/// If Some, that's the result batch
+/// If None, it's not for this special case. Continue execution.
+fn build_unmatched_batch_empty_schema(
+    output_schema: &SchemaRef,
+    batch_bitmap: &BooleanArray,
+    // For left/right/full joins, it needs to fill nulls for another side
+    join_type: JoinType,
+) -> Result<Option<RecordBatch>> {
+    let result_size = match join_type {
+        JoinType::Left
+        | JoinType::Right
+        | JoinType::Full
+        | JoinType::LeftAnti
+        | JoinType::RightAnti => batch_bitmap.false_count(),
+        JoinType::LeftSemi | JoinType::RightSemi => batch_bitmap.true_count(),
+        JoinType::LeftMark | JoinType::RightMark => batch_bitmap.len(),
+        _ => unreachable!(),
+    };
+
+    if output_schema.fields().is_empty() {
+        Ok(Some(create_record_batch_with_empty_schema(
+            Arc::clone(output_schema),
+            result_size,
+        )?))
+    } else {
+        Ok(None)
     }
 }
 
-impl EmbeddedProjection for NestedLoopJoinExec {
-    fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
-        self.with_projection(projection)
+/// Creates an empty RecordBatch with a specific row count.
+/// This is useful for cases where we need a batch with the correct schema and row count
+/// but no actual data columns (e.g., for constant filters).
+fn create_record_batch_with_empty_schema(
+    schema: SchemaRef,
+    row_count: usize,
+) -> Result<RecordBatch> {
+    let options = RecordBatchOptions::new()
+        .with_match_field_names(true)
+        .with_row_count(Some(row_count));
+
+    RecordBatch::try_new_with_options(schema, vec![], &options).map_err(|e| {
+        internal_datafusion_err!("Failed to create empty record batch: {}", e)
+    })
+}
+
+/// # Example:
+/// batch:
+/// a
+/// ----
+/// 1
+/// 2
+/// 3
+///
+/// batch_bitmap:
+/// ----
+/// false
+/// true
+/// false
+///
+/// another_side_schema:
+/// [(b, bool), (c, int32)]
+///
+/// join_type: JoinType::Left
+///
+/// col_indices: ...(please refer to the comment in `NLJStream::column_indices``)
+///
+/// batch_side: right
+///
+/// # Walkthrough:
+///
+/// This executor is performing a right join, and the currently processed right
+/// batch is as above. After joining it with all buffered left rows, the joined
+/// entries are marked by the `batch_bitmap`.
+/// This method will keep the unmatched indices on the batch side (right), and pad
+/// the left side with nulls. The result would be:
+///
+/// b          c           a
+/// ------------------------
+/// Null(bool) Null(Int32) 1
+/// Null(bool) Null(Int32) 3
+fn build_unmatched_batch(
+    output_schema: &SchemaRef,
+    batch: &RecordBatch,
+    batch_bitmap: BooleanArray,
+    // For left/right/full joins, it needs to fill nulls for another side
+    another_side_schema: &SchemaRef,
+    col_indices: &[ColumnIndex],
+    join_type: JoinType,
+    batch_side: JoinSide,
+) -> Result<Option<RecordBatch>> {
+    // Should not call it for inner joins
+    debug_assert_ne!(join_type, JoinType::Inner);
+    debug_assert_ne!(batch_side, JoinSide::None);
+
+    // Handle special case (see function comment)
+    if let Some(batch) =
+        build_unmatched_batch_empty_schema(output_schema, &batch_bitmap, join_type)?
+    {
+        return Ok(Some(batch));
+    }
+
+    match join_type {
+        JoinType::Full | JoinType::Right | JoinType::Left => {
+            if join_type == JoinType::Right {
+                debug_assert_eq!(batch_side, JoinSide::Right);
+            }
+            if join_type == JoinType::Left {
+                debug_assert_eq!(batch_side, JoinSide::Left);
+            }
+
+            // 1. Filter the batch with *flipped* bitmap
+            // 2. Fill left side with nulls
+            let flipped_bitmap = not(&batch_bitmap)?;
+
+            // create a recordbatch, with left_schema, of only one row of all nulls
+            let left_null_columns: Vec<Arc<dyn Array>> = another_side_schema
+                .fields()
+                .iter()
+                .map(|field| new_null_array(field.data_type(), 1))
+                .collect();
+
+            // Hack: If the left schema is not nullable, the full join result
+            // might contain null, this is only a temporary batch to construct
+            // such full join result.
+            let nullable_left_schema = Arc::new(Schema::new(
+                another_side_schema
+                    .fields()
+                    .iter()
+                    .map(|field| (**field).clone().with_nullable(true))
+                    .collect::<Vec<_>>(),
+            ));
+            let left_null_batch = if nullable_left_schema.fields.is_empty() {
+                // Left input can be an empty relation, in this case left relation
+                // won't be used to construct the result batch (i.e. not in `col_indices`)
+                create_record_batch_with_empty_schema(nullable_left_schema, 0)?
+            } else {
+                RecordBatch::try_new(nullable_left_schema, left_null_columns)?
+            };
+
+            debug_assert_ne!(batch_side, JoinSide::None);
+            let opposite_side = batch_side.negate();
+
+            build_row_join_batch(
+                output_schema,
+                &left_null_batch,
+                0,
+                batch,
+                Some(flipped_bitmap),
+                col_indices,
+                opposite_side,
+            )
+        }
+        JoinType::RightSemi
+        | JoinType::RightAnti
+        | JoinType::LeftSemi
+        | JoinType::LeftAnti => {
+            if matches!(join_type, JoinType::RightSemi | JoinType::RightAnti) {
+                debug_assert_eq!(batch_side, JoinSide::Right);
+            }
+            if matches!(join_type, JoinType::LeftSemi | JoinType::LeftAnti) {
+                debug_assert_eq!(batch_side, JoinSide::Left);
+            }
+
+            let bitmap = if matches!(join_type, JoinType::LeftSemi | JoinType::RightSemi)
+            {
+                batch_bitmap.clone()
+            } else {
+                not(&batch_bitmap)?
+            };
+
+            if bitmap.true_count() == 0 {
+                return Ok(None);
+            }
+
+            let mut columns: Vec<Arc<dyn Array>> =
+                Vec::with_capacity(output_schema.fields().len());
+
+            for column_index in col_indices {
+                debug_assert!(column_index.side == batch_side);
+
+                let col = batch.column(column_index.index);
+                let filtered_col = filter(col, &bitmap)?;
+
+                columns.push(filtered_col);
+            }
+
+            Ok(Some(RecordBatch::try_new(
+                Arc::clone(output_schema),
+                columns,
+            )?))
+        }
+        JoinType::RightMark | JoinType::LeftMark => {
+            if join_type == JoinType::RightMark {
+                debug_assert_eq!(batch_side, JoinSide::Right);
+            }
+            if join_type == JoinType::LeftMark {
+                debug_assert_eq!(batch_side, JoinSide::Left);
+            }
+
+            let mut columns: Vec<Arc<dyn Array>> =
+                Vec::with_capacity(output_schema.fields().len());
+
+            // Hack to deal with the borrow checker
+            let mut right_batch_bitmap_opt = Some(batch_bitmap);
+
+            for column_index in col_indices {
+                if column_index.side == batch_side {
+                    let col = batch.column(column_index.index);
+
+                    columns.push(Arc::clone(col));
+                } else if column_index.side == JoinSide::None {
+                    let right_batch_bitmap = std::mem::take(&mut right_batch_bitmap_opt);
+                    match right_batch_bitmap {
+                        Some(right_batch_bitmap) => {
+                            columns.push(Arc::new(right_batch_bitmap))
+                        }
+                        None => unreachable!("Should only be one mark column"),
+                    }
+                } else {
+                    return internal_err!(
+                        "Not possible to have this join side for RightMark join"
+                    );
+                }
+            }
+
+            Ok(Some(RecordBatch::try_new(
+                Arc::clone(output_schema),
+                columns,
+            )?))
+        }
+        _ => internal_err!(
+            "If batch is at right side, this function must be handling Full/Right/RightSemi/RightAnti/RightMark joins"
+        ),
     }
 }
 
 #[cfg(test)]
 pub(crate) mod tests {
     use super::*;
-    use crate::test::TestMemoryExec;
+    use crate::test::{TestMemoryExec, assert_join_metrics};
     use crate::{
         common, expressions::Column, repartition::RepartitionExec, test::build_table_i32,
     };
 
-    use arrow::array::Int32Array;
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::test_util::batches_to_sort_string;
-    use datafusion_common::{assert_contains, ScalarValue};
+    use datafusion_common::{ScalarValue, assert_contains};
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
     use datafusion_expr::Operator;
     use datafusion_physical_expr::expressions::{BinaryExpr, Literal};
     use datafusion_physical_expr::{Partitioning, PhysicalExpr};
     use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
 
+    use insta::allow_duplicates;
     use insta::assert_snapshot;
     use rstest::rstest;
 
@@ -1088,25 +2370,22 @@ pub(crate) mod tests {
             vec![batch]
         };
 
-        let mut source =
-            TestMemoryExec::try_new(&[batches], Arc::clone(&schema), None).unwrap();
-        if !sorted_column_names.is_empty() {
-            let mut sort_info = LexOrdering::default();
-            for name in sorted_column_names {
-                let index = schema.index_of(name).unwrap();
-                let sort_expr = PhysicalSortExpr {
-                    expr: Arc::new(Column::new(name, index)),
-                    options: SortOptions {
-                        descending: false,
-                        nulls_first: false,
-                    },
-                };
-                sort_info.push(sort_expr);
-            }
-            source = source.try_with_sort_information(vec![sort_info]).unwrap();
+        let mut sort_info = vec![];
+        for name in sorted_column_names {
+            let index = schema.index_of(name).unwrap();
+            let sort_expr = PhysicalSortExpr::new(
+                Arc::new(Column::new(name, index)),
+                SortOptions::new(false, false),
+            );
+            sort_info.push(sort_expr);
+        }
+        let mut source = TestMemoryExec::try_new(&[batches], schema, None).unwrap();
+        if let Some(ordering) = LexOrdering::new(sort_info) {
+            source = source.try_with_sort_information(vec![ordering]).unwrap();
         }
 
-        Arc::new(TestMemoryExec::update_cache(Arc::new(source)))
+        let source = Arc::new(source);
+        Arc::new(TestMemoryExec::update_cache(&source))
     }
 
     fn build_left_table() -> Arc<dyn ExecutionPlan> {
@@ -1183,7 +2462,7 @@ pub(crate) mod tests {
         join_type: &JoinType,
         join_filter: Option<JoinFilter>,
         context: Arc<TaskContext>,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    ) -> Result<(Vec<String>, Vec<RecordBatch>, MetricsSet)> {
         let partition_count = 4;
 
         // Redistributing right input
@@ -1203,20 +2482,35 @@ pub(crate) mod tests {
             batches.extend(
                 more_batches
                     .into_iter()
+                    .inspect(|b| {
+                        assert!(b.num_rows() <= context.session_config().batch_size())
+                    })
                     .filter(|b| b.num_rows() > 0)
                     .collect::<Vec<_>>(),
             );
         }
-        Ok((columns, batches))
+
+        let metrics = nested_loop_join.metrics().unwrap();
+
+        Ok((columns, batches, metrics))
+    }
+
+    fn new_task_ctx(batch_size: usize) -> Arc<TaskContext> {
+        let base = TaskContext::default();
+        // limit max size of intermediate batch used in nlj to 1
+        let cfg = base.session_config().clone().with_batch_size(batch_size);
+        Arc::new(base.with_session_config(cfg))
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_inner_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_inner_with_filter(#[values(1, 2, 16)] batch_size: usize) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
+        dbg!(&batch_size);
         let left = build_left_table();
         let right = build_right_table();
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::Inner,
@@ -1224,26 +2518,30 @@ pub(crate) mod tests {
             task_ctx,
         )
         .await?;
+
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 5  | 5  | 50 | 2  | 2  | 80 |
-            +----+----+----+----+----+----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b2 | c2 |
+        +----+----+----+----+----+----+
+        | 5  | 5  | 50 | 2  | 2  | 80 |
+        +----+----+----+----+----+----+
+        "));
+
+        assert_join_metrics!(metrics, 1);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_left_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_left_with_filter(#[values(1, 2, 16)] batch_size: usize) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::Left,
@@ -1252,27 +2550,30 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+-----+----+----+----+
-            | a1 | b1 | c1  | a2 | b2 | c2 |
-            +----+----+-----+----+----+----+
-            | 11 | 8  | 110 |    |    |    |
-            | 5  | 5  | 50  | 2  | 2  | 80 |
-            | 9  | 8  | 90  |    |    |    |
-            +----+----+-----+----+----+----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+----+----+----+
+        | a1 | b1 | c1  | a2 | b2 | c2 |
+        +----+----+-----+----+----+----+
+        | 11 | 8  | 110 |    |    |    |
+        | 5  | 5  | 50  | 2  | 2  | 80 |
+        | 9  | 8  | 90  |    |    |    |
+        +----+----+-----+----+----+----+
+        "));
+
+        assert_join_metrics!(metrics, 3);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_right_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_right_with_filter(#[values(1, 2, 16)] batch_size: usize) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::Right,
@@ -1281,27 +2582,30 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+----+----+-----+
-            | a1 | b1 | c1 | a2 | b2 | c2  |
-            +----+----+----+----+----+-----+
-            |    |    |    | 10 | 10 | 100 |
-            |    |    |    | 12 | 10 | 40  |
-            | 5  | 5  | 50 | 2  | 2  | 80  |
-            +----+----+----+----+----+-----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+----+----+----+-----+
+        | a1 | b1 | c1 | a2 | b2 | c2  |
+        +----+----+----+----+----+-----+
+        |    |    |    | 10 | 10 | 100 |
+        |    |    |    | 12 | 10 | 40  |
+        | 5  | 5  | 50 | 2  | 2  | 80  |
+        +----+----+----+----+----+-----+
+        "));
+
+        assert_join_metrics!(metrics, 3);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_full_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_full_with_filter(#[values(1, 2, 16)] batch_size: usize) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::Full,
@@ -1310,29 +2614,34 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+-----+----+----+-----+
-            | a1 | b1 | c1  | a2 | b2 | c2  |
-            +----+----+-----+----+----+-----+
-            |    |    |     | 10 | 10 | 100 |
-            |    |    |     | 12 | 10 | 40  |
-            | 11 | 8  | 110 |    |    |     |
-            | 5  | 5  | 50  | 2  | 2  | 80  |
-            | 9  | 8  | 90  |    |    |     |
-            +----+----+-----+----+----+-----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+----+----+-----+
+        | a1 | b1 | c1  | a2 | b2 | c2  |
+        +----+----+-----+----+----+-----+
+        |    |    |     | 10 | 10 | 100 |
+        |    |    |     | 12 | 10 | 40  |
+        | 11 | 8  | 110 |    |    |     |
+        | 5  | 5  | 50  | 2  | 2  | 80  |
+        | 9  | 8  | 90  |    |    |     |
+        +----+----+-----+----+----+-----+
+        "));
+
+        assert_join_metrics!(metrics, 5);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_left_semi_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_left_semi_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::LeftSemi,
@@ -1341,25 +2650,30 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c1 |
-            +----+----+----+
-            | 5  | 5  | 50 |
-            +----+----+----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+----+
+        | a1 | b1 | c1 |
+        +----+----+----+
+        | 5  | 5  | 50 |
+        +----+----+----+
+        "));
+
+        assert_join_metrics!(metrics, 1);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_left_anti_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_left_anti_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::LeftAnti,
@@ -1368,26 +2682,51 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+-----+
-            | a1 | b1 | c1  |
-            +----+----+-----+
-            | 11 | 8  | 110 |
-            | 9  | 8  | 90  |
-            +----+----+-----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+
+        | a1 | b1 | c1  |
+        +----+----+-----+
+        | 11 | 8  | 110 |
+        | 9  | 8  | 90  |
+        +----+----+-----+
+        "));
+
+        assert_join_metrics!(metrics, 2);
 
         Ok(())
     }
 
     #[tokio::test]
-    async fn join_right_semi_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_has_correct_stats() -> Result<()> {
+        let left = build_left_table();
+        let right = build_right_table();
+        let nested_loop_join = NestedLoopJoinExec::try_new(
+            left,
+            right,
+            None,
+            &JoinType::Left,
+            Some(vec![1, 2]),
+        )?;
+        let stats = nested_loop_join.partition_statistics(None)?;
+        assert_eq!(
+            nested_loop_join.schema().fields().len(),
+            stats.column_statistics.len(),
+        );
+        assert_eq!(2, stats.column_statistics.len());
+        Ok(())
+    }
+
+    #[rstest]
+    #[tokio::test]
+    async fn join_right_semi_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::RightSemi,
@@ -1396,25 +2735,30 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+
-            | a2 | b2 | c2 |
-            +----+----+----+
-            | 2  | 2  | 80 |
-            +----+----+----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+----+
+        | a2 | b2 | c2 |
+        +----+----+----+
+        | 2  | 2  | 80 |
+        +----+----+----+
+        "));
+
+        assert_join_metrics!(metrics, 1);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_right_anti_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_right_anti_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::RightAnti,
@@ -1423,26 +2767,31 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a2", "b2", "c2"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+-----+
-            | a2 | b2 | c2  |
-            +----+----+-----+
-            | 10 | 10 | 100 |
-            | 12 | 10 | 40  |
-            +----+----+-----+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+
+        | a2 | b2 | c2  |
+        +----+----+-----+
+        | 10 | 10 | 100 |
+        | 12 | 10 | 40  |
+        +----+----+-----+
+        "));
+
+        assert_join_metrics!(metrics, 2);
 
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn join_left_mark_with_filter() -> Result<()> {
-        let task_ctx = Arc::new(TaskContext::default());
+    async fn join_left_mark_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
         let left = build_left_table();
         let right = build_right_table();
 
         let filter = prepare_join_filter();
-        let (columns, batches) = multi_partitioned_join_collect(
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
             left,
             right,
             &JoinType::LeftMark,
@@ -1451,15 +2800,52 @@ pub(crate) mod tests {
         )
         .await?;
         assert_eq!(columns, vec!["a1", "b1", "c1", "mark"]);
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+-----+-------+
-            | a1 | b1 | c1  | mark  |
-            +----+----+-----+-------+
-            | 11 | 8  | 110 | false |
-            | 5  | 5  | 50  | true  |
-            | 9  | 8  | 90  | false |
-            +----+----+-----+-------+
-            "#);
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+-------+
+        | a1 | b1 | c1  | mark  |
+        +----+----+-----+-------+
+        | 11 | 8  | 110 | false |
+        | 5  | 5  | 50  | true  |
+        | 9  | 8  | 90  | false |
+        +----+----+-----+-------+
+        "));
+
+        assert_join_metrics!(metrics, 3);
+
+        Ok(())
+    }
+
+    #[rstest]
+    #[tokio::test]
+    async fn join_right_mark_with_filter(
+        #[values(1, 2, 16)] batch_size: usize,
+    ) -> Result<()> {
+        let task_ctx = new_task_ctx(batch_size);
+        let left = build_left_table();
+        let right = build_right_table();
+
+        let filter = prepare_join_filter();
+        let (columns, batches, metrics) = multi_partitioned_join_collect(
+            left,
+            right,
+            &JoinType::RightMark,
+            Some(filter),
+            task_ctx,
+        )
+        .await?;
+        assert_eq!(columns, vec!["a2", "b2", "c2", "mark"]);
+
+        allow_duplicates!(assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +----+----+-----+-------+
+        | a2 | b2 | c2  | mark  |
+        +----+----+-----+-------+
+        | 10 | 10 | 100 | false |
+        | 12 | 10 | 40  | false |
+        | 2  | 2  | 80  | true  |
+        +----+----+-----+-------+
+        "));
+
+        assert_join_metrics!(metrics, 3);
 
         Ok(())
     }
@@ -1492,6 +2878,7 @@ pub(crate) mod tests {
             JoinType::LeftMark,
             JoinType::RightSemi,
             JoinType::RightAnti,
+            JoinType::RightMark,
         ];
 
         for join_type in join_types {
@@ -1513,167 +2900,8 @@ pub(crate) mod tests {
 
             assert_contains!(
                 err.to_string(),
-                "Resources exhausted: Additional allocation failed with top memory consumers (across reservations) as:\n  NestedLoopJoinLoad[0]"
-            );
-        }
-
-        Ok(())
-    }
-
-    fn prepare_mod_join_filter() -> JoinFilter {
-        let column_indices = vec![
-            ColumnIndex {
-                index: 1,
-                side: JoinSide::Left,
-            },
-            ColumnIndex {
-                index: 1,
-                side: JoinSide::Right,
-            },
-        ];
-        let intermediate_schema = Schema::new(vec![
-            Field::new("x", DataType::Int32, true),
-            Field::new("x", DataType::Int32, true),
-        ]);
-
-        // left.b1 % 3
-        let left_mod = Arc::new(BinaryExpr::new(
-            Arc::new(Column::new("x", 0)),
-            Operator::Modulo,
-            Arc::new(Literal::new(ScalarValue::Int32(Some(3)))),
-        )) as Arc<dyn PhysicalExpr>;
-        // left.b1 % 3 != 0
-        let left_filter = Arc::new(BinaryExpr::new(
-            left_mod,
-            Operator::NotEq,
-            Arc::new(Literal::new(ScalarValue::Int32(Some(0)))),
-        )) as Arc<dyn PhysicalExpr>;
-
-        // right.b2 % 5
-        let right_mod = Arc::new(BinaryExpr::new(
-            Arc::new(Column::new("x", 1)),
-            Operator::Modulo,
-            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
-        )) as Arc<dyn PhysicalExpr>;
-        // right.b2 % 5 != 0
-        let right_filter = Arc::new(BinaryExpr::new(
-            right_mod,
-            Operator::NotEq,
-            Arc::new(Literal::new(ScalarValue::Int32(Some(0)))),
-        )) as Arc<dyn PhysicalExpr>;
-        // filter = left.b1 % 3 != 0 and right.b2 % 5 != 0
-        let filter_expression =
-            Arc::new(BinaryExpr::new(left_filter, Operator::And, right_filter))
-                as Arc<dyn PhysicalExpr>;
-
-        JoinFilter::new(
-            filter_expression,
-            column_indices,
-            Arc::new(intermediate_schema),
-        )
-    }
-
-    fn generate_columns(num_columns: usize, num_rows: usize) -> Vec<Vec<i32>> {
-        let column = (1..=num_rows).map(|x| x as i32).collect();
-        vec![column; num_columns]
-    }
-
-    #[rstest]
-    #[tokio::test]
-    async fn join_maintains_right_order(
-        #[values(
-            JoinType::Inner,
-            JoinType::Right,
-            JoinType::RightAnti,
-            JoinType::RightSemi
-        )]
-        join_type: JoinType,
-        #[values(1, 100, 1000)] left_batch_size: usize,
-        #[values(1, 100, 1000)] right_batch_size: usize,
-    ) -> Result<()> {
-        let left_columns = generate_columns(3, 1000);
-        let left = build_table(
-            ("a1", &left_columns[0]),
-            ("b1", &left_columns[1]),
-            ("c1", &left_columns[2]),
-            Some(left_batch_size),
-            Vec::new(),
-        );
-
-        let right_columns = generate_columns(3, 1000);
-        let right = build_table(
-            ("a2", &right_columns[0]),
-            ("b2", &right_columns[1]),
-            ("c2", &right_columns[2]),
-            Some(right_batch_size),
-            vec!["a2", "b2", "c2"],
-        );
-
-        let filter = prepare_mod_join_filter();
-
-        let nested_loop_join = Arc::new(NestedLoopJoinExec::try_new(
-            left,
-            Arc::clone(&right),
-            Some(filter),
-            &join_type,
-            None,
-        )?) as Arc<dyn ExecutionPlan>;
-        assert_eq!(nested_loop_join.maintains_input_order(), vec![false, true]);
-
-        let right_column_indices = match join_type {
-            JoinType::Inner | JoinType::Right => vec![3, 4, 5],
-            JoinType::RightAnti | JoinType::RightSemi => vec![0, 1, 2],
-            _ => unreachable!(),
-        };
-
-        let right_ordering = right.output_ordering().unwrap();
-        let join_ordering = nested_loop_join.output_ordering().unwrap();
-        for (right, join) in right_ordering.iter().zip(join_ordering.iter()) {
-            let right_column = right.expr.as_any().downcast_ref::<Column>().unwrap();
-            let join_column = join.expr.as_any().downcast_ref::<Column>().unwrap();
-            assert_eq!(join_column.name(), join_column.name());
-            assert_eq!(
-                right_column_indices[right_column.index()],
-                join_column.index()
+                "Resources exhausted: Additional allocation failed for NestedLoopJoinLoad[0] with top memory consumers (across reservations) as:\n  NestedLoopJoinLoad[0]"
             );
-            assert_eq!(right.options, join.options);
-        }
-
-        let batches = nested_loop_join
-            .execute(0, Arc::new(TaskContext::default()))?
-            .try_collect::<Vec<_>>()
-            .await?;
-
-        // Make sure that the order of the right side is maintained
-        let mut prev_values = [i32::MIN, i32::MIN, i32::MIN];
-
-        for (batch_index, batch) in batches.iter().enumerate() {
-            let columns: Vec<_> = right_column_indices
-                .iter()
-                .map(|&i| {
-                    batch
-                        .column(i)
-                        .as_any()
-                        .downcast_ref::<Int32Array>()
-                        .unwrap()
-                })
-                .collect();
-
-            for row in 0..batch.num_rows() {
-                let current_values = [
-                    columns[0].value(row),
-                    columns[1].value(row),
-                    columns[2].value(row),
-                ];
-                assert!(
-                    current_values
-                        .into_iter()
-                        .zip(prev_values)
-                        .all(|(current, prev)| current >= prev),
-                    "batch_index: {batch_index} row: {row} current: {current_values:?}, prev: {prev_values:?}"
-                );
-                prev_values = current_values;
-            }
         }
 
         Ok(())
diff --git a/datafusion/physical-plan/src/joins/piecewise_merge_join/classic_join.rs b/datafusion/physical-plan/src/joins/piecewise_merge_join/classic_join.rs
new file mode 100644
index 0000000000000..bb32a222de962
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/piecewise_merge_join/classic_join.rs
@@ -0,0 +1,1545 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Stream Implementation for PiecewiseMergeJoin's Classic Join (Left, Right, Full, Inner)
+
+use arrow::array::{Array, PrimitiveBuilder, new_null_array};
+use arrow::compute::{BatchCoalescer, take};
+use arrow::datatypes::UInt32Type;
+use arrow::{
+    array::{ArrayRef, RecordBatch, UInt32Array},
+    compute::{sort_to_indices, take_record_batch},
+};
+use arrow_schema::{Schema, SchemaRef, SortOptions};
+use datafusion_common::NullEquality;
+use datafusion_common::{Result, internal_err};
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
+use datafusion_expr::{JoinType, Operator};
+use datafusion_physical_expr::PhysicalExprRef;
+use futures::{Stream, StreamExt};
+use std::{cmp::Ordering, task::ready};
+use std::{sync::Arc, task::Poll};
+
+use crate::handle_state;
+use crate::joins::piecewise_merge_join::exec::{BufferedSide, BufferedSideReadyState};
+use crate::joins::piecewise_merge_join::utils::need_produce_result_in_final;
+use crate::joins::utils::{BuildProbeJoinMetrics, StatefulStreamResult};
+use crate::joins::utils::{compare_join_arrays, get_final_indices_from_shared_bitmap};
+
+pub(super) enum PiecewiseMergeJoinStreamState {
+    WaitBufferedSide,
+    FetchStreamBatch,
+    ProcessStreamBatch(SortedStreamBatch),
+    ProcessUnmatched,
+    Completed,
+}
+
+impl PiecewiseMergeJoinStreamState {
+    // Grab mutable reference to the current stream batch
+    fn try_as_process_stream_batch_mut(&mut self) -> Result<&mut SortedStreamBatch> {
+        match self {
+            PiecewiseMergeJoinStreamState::ProcessStreamBatch(state) => Ok(state),
+            _ => internal_err!("Expected streamed batch in StreamBatch"),
+        }
+    }
+}
+
+/// The stream side incoming batch with required sort order.
+///
+/// Note the compare key in the join predicate might include expressions on the original
+/// columns, so we store the evaluated compare key separately.
+/// e.g. For join predicate `buffer.v1 < (stream.v1 + 1)`, the `compare_key_values` field stores
+/// the evaluated `stream.v1 + 1` array.
+pub(super) struct SortedStreamBatch {
+    pub batch: RecordBatch,
+    compare_key_values: Vec<ArrayRef>,
+}
+
+impl SortedStreamBatch {
+    fn new(batch: RecordBatch, compare_key_values: Vec<ArrayRef>) -> Self {
+        Self {
+            batch,
+            compare_key_values,
+        }
+    }
+
+    fn compare_key_values(&self) -> &Vec<ArrayRef> {
+        &self.compare_key_values
+    }
+}
+
+pub(super) struct ClassicPWMJStream {
+    // Output schema of the `PiecewiseMergeJoin`
+    pub schema: Arc<Schema>,
+
+    // Physical expression that is evaluated on the streamed side
+    // We do not need on_buffered as this is already evaluated when
+    // creating the buffered side which happens before initializing
+    // `PiecewiseMergeJoinStream`
+    pub on_streamed: PhysicalExprRef,
+    // Type of join
+    pub join_type: JoinType,
+    // Comparison operator
+    pub operator: Operator,
+    // Streamed batch
+    pub streamed: SendableRecordBatchStream,
+    // Streamed schema
+    streamed_schema: SchemaRef,
+    // Buffered side data
+    buffered_side: BufferedSide,
+    // Tracks the state of the `PiecewiseMergeJoin`
+    state: PiecewiseMergeJoinStreamState,
+    // Sort option for streamed side (specifies whether
+    // the sort is ascending or descending)
+    sort_option: SortOptions,
+    // Metrics for build + probe joins
+    join_metrics: BuildProbeJoinMetrics,
+    // Tracking incremental state for emitting record batches
+    batch_process_state: BatchProcessState,
+}
+
+impl RecordBatchStream for ClassicPWMJStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+// `PiecewiseMergeJoinStreamState` is separated into `WaitBufferedSide`, `FetchStreamBatch`,
+// `ProcessStreamBatch`, `ProcessUnmatched` and `Completed`.
+//
+// Classic Joins
+//  1. `WaitBufferedSide` - Load in the buffered side data into memory.
+//  2. `FetchStreamBatch` -  Fetch + sort incoming stream batches. We switch the state to
+//     `Completed` if there are are still remaining partitions to process. It is only switched to
+//     `ExhaustedStreamBatch` if all partitions have been processed.
+//  3. `ProcessStreamBatch` - Compare stream batch row values against the buffered side data.
+//  4. `ExhaustedStreamBatch` - If the join type is Left or Inner we will return state as
+//      `Completed` however for Full and Right we will need to process the unmatched buffered rows.
+impl ClassicPWMJStream {
+    // Creates a new `PiecewiseMergeJoinStream` instance
+    #[expect(clippy::too_many_arguments)]
+    pub fn try_new(
+        schema: Arc<Schema>,
+        on_streamed: PhysicalExprRef,
+        join_type: JoinType,
+        operator: Operator,
+        streamed: SendableRecordBatchStream,
+        buffered_side: BufferedSide,
+        state: PiecewiseMergeJoinStreamState,
+        sort_option: SortOptions,
+        join_metrics: BuildProbeJoinMetrics,
+        batch_size: usize,
+    ) -> Self {
+        Self {
+            schema: Arc::clone(&schema),
+            on_streamed,
+            join_type,
+            operator,
+            streamed_schema: streamed.schema(),
+            streamed,
+            buffered_side,
+            state,
+            sort_option,
+            join_metrics,
+            batch_process_state: BatchProcessState::new(schema, batch_size),
+        }
+    }
+
+    fn poll_next_impl(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Result<RecordBatch>>> {
+        loop {
+            return match self.state {
+                PiecewiseMergeJoinStreamState::WaitBufferedSide => {
+                    handle_state!(ready!(self.collect_buffered_side(cx)))
+                }
+                PiecewiseMergeJoinStreamState::FetchStreamBatch => {
+                    handle_state!(ready!(self.fetch_stream_batch(cx)))
+                }
+                PiecewiseMergeJoinStreamState::ProcessStreamBatch(_) => {
+                    handle_state!(self.process_stream_batch())
+                }
+                PiecewiseMergeJoinStreamState::ProcessUnmatched => {
+                    handle_state!(self.process_unmatched_buffered_batch())
+                }
+                PiecewiseMergeJoinStreamState::Completed => Poll::Ready(None),
+            };
+        }
+    }
+
+    // Collects buffered side data
+    fn collect_buffered_side(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
+        let build_timer = self.join_metrics.build_time.timer();
+        let buffered_data = ready!(
+            self.buffered_side
+                .try_as_initial_mut()?
+                .buffered_fut
+                .get_shared(cx)
+        )?;
+        build_timer.done();
+
+        // We will start fetching stream batches for classic joins
+        self.state = PiecewiseMergeJoinStreamState::FetchStreamBatch;
+
+        self.buffered_side =
+            BufferedSide::Ready(BufferedSideReadyState { buffered_data });
+
+        Poll::Ready(Ok(StatefulStreamResult::Continue))
+    }
+
+    // Fetches incoming stream batches
+    fn fetch_stream_batch(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<StatefulStreamResult<Option<RecordBatch>>>> {
+        match ready!(self.streamed.poll_next_unpin(cx)) {
+            None => {
+                if self
+                    .buffered_side
+                    .try_as_ready_mut()?
+                    .buffered_data
+                    .remaining_partitions
+                    .fetch_sub(1, std::sync::atomic::Ordering::SeqCst)
+                    == 1
+                {
+                    self.batch_process_state.reset();
+                    self.state = PiecewiseMergeJoinStreamState::ProcessUnmatched;
+                } else {
+                    self.state = PiecewiseMergeJoinStreamState::Completed;
+                }
+            }
+            Some(Ok(batch)) => {
+                // Evaluate the streamed physical expression on the stream batch
+                let stream_values: ArrayRef = self
+                    .on_streamed
+                    .evaluate(&batch)?
+                    .into_array(batch.num_rows())?;
+
+                self.join_metrics.input_batches.add(1);
+                self.join_metrics.input_rows.add(batch.num_rows());
+
+                // Sort stream values and change the streamed record batch accordingly
+                let indices = sort_to_indices(
+                    stream_values.as_ref(),
+                    Some(self.sort_option),
+                    None,
+                )?;
+                let stream_batch = take_record_batch(&batch, &indices)?;
+                let stream_values = take(stream_values.as_ref(), &indices, None)?;
+
+                // Reset BatchProcessState before processing a new stream batch
+                self.batch_process_state.reset();
+                self.state = PiecewiseMergeJoinStreamState::ProcessStreamBatch(
+                    SortedStreamBatch::new(stream_batch, vec![stream_values]),
+                );
+            }
+            Some(Err(err)) => return Poll::Ready(Err(err)),
+        };
+
+        Poll::Ready(Ok(StatefulStreamResult::Continue))
+    }
+
+    // Only classic join will call. This function will process stream batches and evaluate against
+    // the buffered side data.
+    fn process_stream_batch(
+        &mut self,
+    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
+        let buffered_side = self.buffered_side.try_as_ready_mut()?;
+        let stream_batch = self.state.try_as_process_stream_batch_mut()?;
+
+        if let Some(batch) = self
+            .batch_process_state
+            .output_batches
+            .next_completed_batch()
+        {
+            return Ok(StatefulStreamResult::Ready(Some(batch)));
+        }
+
+        // Produce more work
+        let batch = resolve_classic_join(
+            buffered_side,
+            stream_batch,
+            &self.schema,
+            self.operator,
+            self.sort_option,
+            self.join_type,
+            &mut self.batch_process_state,
+        )?;
+
+        if !self.batch_process_state.continue_process {
+            // We finished scanning this stream batch.
+            self.batch_process_state
+                .output_batches
+                .finish_buffered_batch()?;
+            if let Some(b) = self
+                .batch_process_state
+                .output_batches
+                .next_completed_batch()
+            {
+                self.state = PiecewiseMergeJoinStreamState::FetchStreamBatch;
+                return Ok(StatefulStreamResult::Ready(Some(b)));
+            }
+
+            // Nothing pending; hand back whatever `resolve` returned (often empty) and move on.
+            if self.batch_process_state.output_batches.is_empty() {
+                self.state = PiecewiseMergeJoinStreamState::FetchStreamBatch;
+
+                return Ok(StatefulStreamResult::Ready(Some(batch)));
+            }
+        }
+
+        Ok(StatefulStreamResult::Ready(Some(batch)))
+    }
+
+    // Process remaining unmatched rows
+    fn process_unmatched_buffered_batch(
+        &mut self,
+    ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
+        // Return early for `JoinType::Right` and `JoinType::Inner`
+        if matches!(self.join_type, JoinType::Right | JoinType::Inner) {
+            self.state = PiecewiseMergeJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Ready(None));
+        }
+
+        if !self.batch_process_state.continue_process {
+            if let Some(batch) = self
+                .batch_process_state
+                .output_batches
+                .next_completed_batch()
+            {
+                return Ok(StatefulStreamResult::Ready(Some(batch)));
+            }
+
+            self.batch_process_state
+                .output_batches
+                .finish_buffered_batch()?;
+            if let Some(batch) = self
+                .batch_process_state
+                .output_batches
+                .next_completed_batch()
+            {
+                self.state = PiecewiseMergeJoinStreamState::Completed;
+                return Ok(StatefulStreamResult::Ready(Some(batch)));
+            }
+        }
+
+        let buffered_data =
+            Arc::clone(&self.buffered_side.try_as_ready().unwrap().buffered_data);
+
+        let (buffered_indices, _streamed_indices) = get_final_indices_from_shared_bitmap(
+            &buffered_data.visited_indices_bitmap,
+            self.join_type,
+            true,
+        );
+
+        let new_buffered_batch =
+            take_record_batch(buffered_data.batch(), &buffered_indices)?;
+        let mut buffered_columns = new_buffered_batch.columns().to_vec();
+
+        let streamed_columns: Vec<ArrayRef> = self
+            .streamed_schema
+            .fields()
+            .iter()
+            .map(|f| new_null_array(f.data_type(), new_buffered_batch.num_rows()))
+            .collect();
+
+        buffered_columns.extend(streamed_columns);
+
+        let batch = RecordBatch::try_new(Arc::clone(&self.schema), buffered_columns)?;
+
+        self.batch_process_state.output_batches.push_batch(batch)?;
+
+        self.batch_process_state.continue_process = false;
+        if let Some(batch) = self
+            .batch_process_state
+            .output_batches
+            .next_completed_batch()
+        {
+            return Ok(StatefulStreamResult::Ready(Some(batch)));
+        }
+
+        self.batch_process_state
+            .output_batches
+            .finish_buffered_batch()?;
+        if let Some(batch) = self
+            .batch_process_state
+            .output_batches
+            .next_completed_batch()
+        {
+            self.state = PiecewiseMergeJoinStreamState::Completed;
+            return Ok(StatefulStreamResult::Ready(Some(batch)));
+        }
+
+        self.state = PiecewiseMergeJoinStreamState::Completed;
+        self.batch_process_state.reset();
+        Ok(StatefulStreamResult::Ready(None))
+    }
+}
+
+struct BatchProcessState {
+    // Used to pick up from the last index on the stream side
+    output_batches: Box<BatchCoalescer>,
+    // Used to store the unmatched stream indices for `JoinType::Right` and `JoinType::Full`
+    unmatched_indices: PrimitiveBuilder<UInt32Type>,
+    // Used to store the start index on the buffered side; used to resume processing on the correct
+    // row
+    start_buffer_idx: usize,
+    // Used to store the start index on the stream side; used to resume processing on the correct
+    // row
+    start_stream_idx: usize,
+    // Signals if we found a match for the current stream row
+    found: bool,
+    // Signals to continue processing the current stream batch
+    continue_process: bool,
+    // Skip nulls
+    processed_null_count: bool,
+}
+
+impl BatchProcessState {
+    pub(crate) fn new(schema: Arc<Schema>, batch_size: usize) -> Self {
+        Self {
+            output_batches: Box::new(BatchCoalescer::new(schema, batch_size)),
+            unmatched_indices: PrimitiveBuilder::new(),
+            start_buffer_idx: 0,
+            start_stream_idx: 0,
+            found: false,
+            continue_process: true,
+            processed_null_count: false,
+        }
+    }
+
+    pub(crate) fn reset(&mut self) {
+        self.unmatched_indices = PrimitiveBuilder::new();
+        self.start_buffer_idx = 0;
+        self.start_stream_idx = 0;
+        self.found = false;
+        self.continue_process = true;
+        self.processed_null_count = false;
+    }
+}
+
+impl Stream for ClassicPWMJStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        self.poll_next_impl(cx)
+    }
+}
+
+// For Left, Right, Full, and Inner joins, incoming stream batches will already be sorted.
+fn resolve_classic_join(
+    buffered_side: &mut BufferedSideReadyState,
+    stream_batch: &SortedStreamBatch,
+    join_schema: &SchemaRef,
+    operator: Operator,
+    sort_options: SortOptions,
+    join_type: JoinType,
+    batch_process_state: &mut BatchProcessState,
+) -> Result<RecordBatch> {
+    let buffered_len = buffered_side.buffered_data.values().len();
+    let stream_values = stream_batch.compare_key_values();
+
+    let mut buffer_idx = batch_process_state.start_buffer_idx;
+    let mut stream_idx = batch_process_state.start_stream_idx;
+
+    if !batch_process_state.processed_null_count {
+        let buffered_null_idx = buffered_side.buffered_data.values().null_count();
+        let stream_null_idx = stream_values[0].null_count();
+        buffer_idx = buffered_null_idx;
+        stream_idx = stream_null_idx;
+        batch_process_state.processed_null_count = true;
+    }
+
+    // Our buffer_idx variable allows us to start probing on the buffered side where we last matched
+    // in the previous stream row.
+    for row_idx in stream_idx..stream_batch.batch.num_rows() {
+        while buffer_idx < buffered_len {
+            let compare = {
+                let buffered_values = buffered_side.buffered_data.values();
+                compare_join_arrays(
+                    &[Arc::clone(&stream_values[0])],
+                    row_idx,
+                    &[Arc::clone(buffered_values)],
+                    buffer_idx,
+                    &[sort_options],
+                    NullEquality::NullEqualsNothing,
+                )?
+            };
+
+            // If we find a match we append all indices and move to the next stream row index
+            match operator {
+                Operator::Gt | Operator::Lt => {
+                    if compare == Ordering::Less {
+                        batch_process_state.found = true;
+                        let count = buffered_len - buffer_idx;
+
+                        let batch = build_matched_indices_and_set_buffered_bitmap(
+                            (buffer_idx, count),
+                            (row_idx, count),
+                            buffered_side,
+                            stream_batch,
+                            join_type,
+                            join_schema,
+                        )?;
+
+                        batch_process_state.output_batches.push_batch(batch)?;
+
+                        // Flush batch and update pointers if we have a completed batch
+                        if let Some(batch) =
+                            batch_process_state.output_batches.next_completed_batch()
+                        {
+                            batch_process_state.found = false;
+                            batch_process_state.start_buffer_idx = buffer_idx;
+                            batch_process_state.start_stream_idx = row_idx + 1;
+                            return Ok(batch);
+                        }
+
+                        break;
+                    }
+                }
+                Operator::GtEq | Operator::LtEq => {
+                    if matches!(compare, Ordering::Equal | Ordering::Less) {
+                        batch_process_state.found = true;
+                        let count = buffered_len - buffer_idx;
+                        let batch = build_matched_indices_and_set_buffered_bitmap(
+                            (buffer_idx, count),
+                            (row_idx, count),
+                            buffered_side,
+                            stream_batch,
+                            join_type,
+                            join_schema,
+                        )?;
+
+                        // Flush batch and update pointers if we have a completed batch
+                        batch_process_state.output_batches.push_batch(batch)?;
+                        if let Some(batch) =
+                            batch_process_state.output_batches.next_completed_batch()
+                        {
+                            batch_process_state.found = false;
+                            batch_process_state.start_buffer_idx = buffer_idx;
+                            batch_process_state.start_stream_idx = row_idx + 1;
+                            return Ok(batch);
+                        }
+
+                        break;
+                    }
+                }
+                _ => {
+                    return internal_err!(
+                        "PiecewiseMergeJoin should not contain operator, {}",
+                        operator
+                    );
+                }
+            };
+
+            // Increment buffer_idx after every row
+            buffer_idx += 1;
+        }
+
+        // If a match was not found for the current stream row index the stream indice is appended
+        // to the unmatched indices to be flushed later.
+        if matches!(join_type, JoinType::Right | JoinType::Full)
+            && !batch_process_state.found
+        {
+            batch_process_state
+                .unmatched_indices
+                .append_value(row_idx as u32);
+        }
+
+        batch_process_state.found = false;
+    }
+
+    // Flushed all unmatched indices on the streamed side
+    if matches!(join_type, JoinType::Right | JoinType::Full) {
+        let batch = create_unmatched_batch(
+            &mut batch_process_state.unmatched_indices,
+            stream_batch,
+            join_schema,
+        )?;
+
+        batch_process_state.output_batches.push_batch(batch)?;
+    }
+
+    batch_process_state.continue_process = false;
+    Ok(RecordBatch::new_empty(Arc::clone(join_schema)))
+}
+
+// Builds a record batch from indices ranges on the buffered and streamed side.
+//
+// The two ranges are: buffered_range: (start index, count) and streamed_range: (start index, count) due
+// to batch.slice(start, count).
+fn build_matched_indices_and_set_buffered_bitmap(
+    buffered_range: (usize, usize),
+    streamed_range: (usize, usize),
+    buffered_side: &mut BufferedSideReadyState,
+    stream_batch: &SortedStreamBatch,
+    join_type: JoinType,
+    join_schema: &SchemaRef,
+) -> Result<RecordBatch> {
+    // Mark the buffered indices as visited
+    if need_produce_result_in_final(join_type) {
+        let mut bitmap = buffered_side.buffered_data.visited_indices_bitmap.lock();
+        for i in buffered_range.0..buffered_range.0 + buffered_range.1 {
+            bitmap.set_bit(i, true);
+        }
+    }
+
+    let new_buffered_batch = buffered_side
+        .buffered_data
+        .batch()
+        .slice(buffered_range.0, buffered_range.1);
+    let mut buffered_columns = new_buffered_batch.columns().to_vec();
+
+    let indices = UInt32Array::from_value(streamed_range.0 as u32, streamed_range.1);
+    let new_stream_batch = take_record_batch(&stream_batch.batch, &indices)?;
+    let streamed_columns = new_stream_batch.columns().to_vec();
+
+    buffered_columns.extend(streamed_columns);
+
+    Ok(RecordBatch::try_new(
+        Arc::clone(join_schema),
+        buffered_columns,
+    )?)
+}
+
+// Creates a record batch from the unmatched indices on the streamed side
+fn create_unmatched_batch(
+    streamed_indices: &mut PrimitiveBuilder<UInt32Type>,
+    stream_batch: &SortedStreamBatch,
+    join_schema: &SchemaRef,
+) -> Result<RecordBatch> {
+    let streamed_indices = streamed_indices.finish();
+    let new_stream_batch = take_record_batch(&stream_batch.batch, &streamed_indices)?;
+    let streamed_columns = new_stream_batch.columns().to_vec();
+    let buffered_cols_len = join_schema.fields().len() - streamed_columns.len();
+
+    let num_rows = new_stream_batch.num_rows();
+    let mut buffered_columns: Vec<ArrayRef> = join_schema
+        .fields()
+        .iter()
+        .take(buffered_cols_len)
+        .map(|field| new_null_array(field.data_type(), num_rows))
+        .collect();
+
+    buffered_columns.extend(streamed_columns);
+
+    Ok(RecordBatch::try_new(
+        Arc::clone(join_schema),
+        buffered_columns,
+    )?)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        ExecutionPlan, common,
+        joins::PiecewiseMergeJoinExec,
+        test::{TestMemoryExec, build_table_i32},
+    };
+    use arrow::array::{Date32Array, Date64Array};
+    use arrow_schema::{DataType, Field};
+    use datafusion_common::test_util::batches_to_string;
+    use datafusion_execution::TaskContext;
+    use datafusion_expr::JoinType;
+    use datafusion_physical_expr::{PhysicalExpr, expressions::Column};
+    use insta::assert_snapshot;
+    use std::sync::Arc;
+
+    fn columns(schema: &Schema) -> Vec<String> {
+        schema.fields().iter().map(|f| f.name().clone()).collect()
+    }
+
+    fn build_table(
+        a: (&str, &Vec<i32>),
+        b: (&str, &Vec<i32>),
+        c: (&str, &Vec<i32>),
+    ) -> Arc<dyn ExecutionPlan> {
+        let batch = build_table_i32(a, b, c);
+        let schema = batch.schema();
+        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+    }
+
+    fn build_date_table(
+        a: (&str, &Vec<i32>),
+        b: (&str, &Vec<i32>),
+        c: (&str, &Vec<i32>),
+    ) -> Arc<dyn ExecutionPlan> {
+        let schema = Schema::new(vec![
+            Field::new(a.0, DataType::Date32, false),
+            Field::new(b.0, DataType::Date32, false),
+            Field::new(c.0, DataType::Date32, false),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![
+                Arc::new(Date32Array::from(a.1.clone())),
+                Arc::new(Date32Array::from(b.1.clone())),
+                Arc::new(Date32Array::from(c.1.clone())),
+            ],
+        )
+        .unwrap();
+
+        let schema = batch.schema();
+        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+    }
+
+    fn build_date64_table(
+        a: (&str, &Vec<i64>),
+        b: (&str, &Vec<i64>),
+        c: (&str, &Vec<i64>),
+    ) -> Arc<dyn ExecutionPlan> {
+        let schema = Schema::new(vec![
+            Field::new(a.0, DataType::Date64, false),
+            Field::new(b.0, DataType::Date64, false),
+            Field::new(c.0, DataType::Date64, false),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![
+                Arc::new(Date64Array::from(a.1.clone())),
+                Arc::new(Date64Array::from(b.1.clone())),
+                Arc::new(Date64Array::from(c.1.clone())),
+            ],
+        )
+        .unwrap();
+
+        let schema = batch.schema();
+        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+    }
+
+    fn join(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        on: (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>),
+        operator: Operator,
+        join_type: JoinType,
+    ) -> Result<PiecewiseMergeJoinExec> {
+        PiecewiseMergeJoinExec::try_new(left, right, on, operator, join_type, 1)
+    }
+
+    async fn join_collect(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        on: (PhysicalExprRef, PhysicalExprRef),
+        operator: Operator,
+        join_type: JoinType,
+    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+        join_collect_with_options(left, right, on, operator, join_type).await
+    }
+
+    async fn join_collect_with_options(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        on: (PhysicalExprRef, PhysicalExprRef),
+        operator: Operator,
+        join_type: JoinType,
+    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+        let task_ctx = Arc::new(TaskContext::default());
+        let join = join(left, right, on, operator, join_type)?;
+        let columns = columns(&join.schema());
+
+        let stream = join.execute(0, task_ctx)?;
+        let batches = common::collect(stream).await?;
+        Ok((columns, batches))
+    }
+
+    #[tokio::test]
+    async fn join_inner_less_than() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 3  | 7  |
+        // | 2  | 2  | 8  |
+        // | 3  | 1  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![3, 2, 1]), // this has a repetition
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 2  | 70 |
+        // | 20 | 3  | 80 |
+        // | 30 | 4  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![2, 3, 4]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 3  | 7  | 30 | 4  | 90 |
+        | 2  | 2  | 8  | 30 | 4  | 90 |
+        | 3  | 1  | 9  | 30 | 4  | 90 |
+        | 2  | 2  | 8  | 20 | 3  | 80 |
+        | 3  | 1  | 9  | 20 | 3  | 80 |
+        | 3  | 1  | 9  | 10 | 2  | 70 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_less_than_unsorted() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 3  | 7  |
+        // | 2  | 2  | 8  |
+        // | 3  | 1  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![3, 2, 1]), // this has a repetition
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // | 20 | 2  | 80 |
+        // | 30 | 4  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![3, 2, 4]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 3  | 7  | 30 | 4  | 90 |
+        | 2  | 2  | 8  | 30 | 4  | 90 |
+        | 3  | 1  | 9  | 30 | 4  | 90 |
+        | 2  | 2  | 8  | 10 | 3  | 70 |
+        | 3  | 1  | 9  | 10 | 3  | 70 |
+        | 3  | 1  | 9  | 20 | 2  | 80 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_greater_than_equal_to() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 2  | 7  |
+        // | 2  | 3  | 8  |
+        // | 3  | 4  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![2, 3, 4]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // | 20 | 2  | 80 |
+        // | 30 | 1  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![3, 2, 1]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::GtEq, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 2  | 7  | 30 | 1  | 90 |
+        | 2  | 3  | 8  | 30 | 1  | 90 |
+        | 3  | 4  | 9  | 30 | 1  | 90 |
+        | 1  | 2  | 7  | 20 | 2  | 80 |
+        | 2  | 3  | 8  | 20 | 2  | 80 |
+        | 3  | 4  | 9  | 20 | 2  | 80 |
+        | 2  | 3  | 8  | 10 | 3  | 70 |
+        | 3  | 4  | 9  | 10 | 3  | 70 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_empty_left() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // (empty)
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &Vec::<i32>::new()),
+            ("b1", &Vec::<i32>::new()),
+            ("c1", &Vec::<i32>::new()),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 1  | 1  | 1  |
+        // | 2  | 2  | 2  |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![1, 2]),
+            ("b1", &vec![1, 2]),
+            ("c2", &vec![1, 2]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+        let (_, batches) =
+            join_collect(left, right, on, Operator::LtEq, JoinType::Inner).await?;
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_full_greater_than_equal_to() -> Result<()> {
+        // +----+----+-----+
+        // | a1 | b1 | c1  |
+        // +----+----+-----+
+        // | 1  | 1  | 100 |
+        // | 2  | 2  | 200 |
+        // +----+----+-----+
+        let left = build_table(
+            ("a1", &vec![1, 2]),
+            ("b1", &vec![1, 2]),
+            ("c1", &vec![100, 200]),
+        );
+
+        // +----+----+-----+
+        // | a2 | b1 | c2  |
+        // +----+----+-----+
+        // | 10 | 3  | 300 |
+        // | 20 | 2  | 400 |
+        // +----+----+-----+
+        let right = build_table(
+            ("a2", &vec![10, 20]),
+            ("b1", &vec![3, 2]),
+            ("c2", &vec![300, 400]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::GtEq, JoinType::Full).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+-----+----+----+-----+
+        | a1 | b1 | c1  | a2 | b1 | c2  |
+        +----+----+-----+----+----+-----+
+        | 2  | 2  | 200 | 20 | 2  | 400 |
+        |    |    |     | 10 | 3  | 300 |
+        | 1  | 1  | 100 |    |    |     |
+        +----+----+-----+----+----+-----+
+        ");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_left_greater_than() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 1  | 7  |
+        // | 2  | 3  | 8  |
+        // | 3  | 4  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![1, 3, 4]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // | 20 | 2  | 80 |
+        // | 30 | 1  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![3, 2, 1]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Gt, JoinType::Left).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 2  | 3  | 8  | 30 | 1  | 90 |
+        | 3  | 4  | 9  | 30 | 1  | 90 |
+        | 2  | 3  | 8  | 20 | 2  | 80 |
+        | 3  | 4  | 9  | 20 | 2  | 80 |
+        | 3  | 4  | 9  | 10 | 3  | 70 |
+        | 1  | 1  | 7  |    |    |    |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_right_greater_than() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 1  | 7  |
+        // | 2  | 3  | 8  |
+        // | 3  | 4  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![1, 3, 4]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 5  | 70 |
+        // | 20 | 3  | 80 |
+        // | 30 | 2  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![5, 3, 2]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Gt, JoinType::Right).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 2  | 3  | 8  | 30 | 2  | 90 |
+        | 3  | 4  | 9  | 30 | 2  | 90 |
+        | 3  | 4  | 9  | 20 | 3  | 80 |
+        |    |    |    | 10 | 5  | 70 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_right_less_than() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 4  | 7  |
+        // | 2  | 3  | 8  |
+        // | 3  | 1  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![4, 3, 1]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 2  | 70 |
+        // | 20 | 3  | 80 |
+        // | 30 | 5  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![2, 3, 5]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Right).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 4  | 7  | 30 | 5  | 90 |
+        | 2  | 3  | 8  | 30 | 5  | 90 |
+        | 3  | 1  | 9  | 30 | 5  | 90 |
+        | 3  | 1  | 9  | 20 | 3  | 80 |
+        | 3  | 1  | 9  | 10 | 2  | 70 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_less_than_equal_with_dups() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 4  | 7  |
+        // | 2  | 4  | 8  |
+        // | 3  | 2  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![4, 4, 2]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 4  | 70 |
+        // | 20 | 3  | 80 |
+        // | 30 | 2  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![4, 3, 2]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::LtEq, JoinType::Inner).await?;
+
+        // Expected grouping follows right.b1 descending (4, 3, 2)
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 1  | 4  | 7  | 10 | 4  | 70 |
+        | 2  | 4  | 8  | 10 | 4  | 70 |
+        | 3  | 2  | 9  | 10 | 4  | 70 |
+        | 3  | 2  | 9  | 20 | 3  | 80 |
+        | 3  | 2  | 9  | 30 | 2  | 90 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_greater_than_unsorted_right() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 1  | 7  |
+        // | 2  | 2  | 8  |
+        // | 3  | 4  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![1, 2, 4]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // | 20 | 1  | 80 |
+        // | 30 | 2  | 90 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![3, 1, 2]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Gt, JoinType::Inner).await?;
+
+        // Grouped by right in ascending evaluation for > (1,2,3)
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 2  | 2  | 8  | 20 | 1  | 80 |
+        | 3  | 4  | 9  | 20 | 1  | 80 |
+        | 3  | 4  | 9  | 30 | 2  | 90 |
+        | 3  | 4  | 9  | 10 | 3  | 70 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_left_less_than_equal_with_left_nulls_on_no_match() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 5  | 7  |
+        // | 2  | 4  | 8  |
+        // | 3  | 1  | 9  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![5, 4, 1]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // +----+----+----+
+        let right = build_table(("a2", &vec![10]), ("b1", &vec![3]), ("c2", &vec![70]));
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::LtEq, JoinType::Left).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        | 3  | 1  | 9  | 10 | 3  | 70 |
+        | 1  | 5  | 7  |    |    |    |
+        | 2  | 4  | 8  |    |    |    |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_right_greater_than_equal_with_right_nulls_on_no_match() -> Result<()> {
+        // +----+----+----+
+        // | a1 | b1 | c1 |
+        // +----+----+----+
+        // | 1  | 1  | 7  |
+        // | 2  | 2  | 8  |
+        // +----+----+----+
+        let left = build_table(
+            ("a1", &vec![1, 2]),
+            ("b1", &vec![1, 2]),
+            ("c1", &vec![7, 8]),
+        );
+
+        // +----+----+----+
+        // | a2 | b1 | c2 |
+        // +----+----+----+
+        // | 10 | 3  | 70 |
+        // | 20 | 5  | 80 |
+        // +----+----+----+
+        let right = build_table(
+            ("a2", &vec![10, 20]),
+            ("b1", &vec![3, 5]),
+            ("c2", &vec![70, 80]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::GtEq, JoinType::Right).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        |    |    |    | 10 | 3  | 70 |
+        |    |    |    | 20 | 5  | 80 |
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_single_row_left_less_than() -> Result<()> {
+        let left = build_table(("a1", &vec![42]), ("b1", &vec![5]), ("c1", &vec![999]));
+
+        let right = build_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![1, 5, 7]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+-----+----+----+----+
+        | a1 | b1 | c1  | a2 | b1 | c2 |
+        +----+----+-----+----+----+----+
+        | 42 | 5  | 999 | 30 | 7  | 90 |
+        +----+----+-----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_inner_empty_right() -> Result<()> {
+        let left = build_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![1, 2, 3]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        let right = build_table(
+            ("a2", &Vec::<i32>::new()),
+            ("b1", &Vec::<i32>::new()),
+            ("c2", &Vec::<i32>::new()),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Gt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+----+----+----+----+----+
+        | a1 | b1 | c1 | a2 | b1 | c2 |
+        +----+----+----+----+----+----+
+        +----+----+----+----+----+----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_date32_inner_less_than() -> Result<()> {
+        // +----+-------+----+
+        // | a1 |  b1   | c1 |
+        // +----+-------+----+
+        // | 1  | 19107 | 7  |
+        // | 2  | 19107 | 8  |
+        // | 3  | 19105 | 9  |
+        // +----+-------+----+
+        let left = build_date_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![19107, 19107, 19105]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+-------+----+
+        // | a2 |  b1   | c2 |
+        // +----+-------+----+
+        // | 10 | 19105 | 70 |
+        // | 20 | 19103 | 80 |
+        // | 30 | 19107 | 90 |
+        // +----+-------+----+
+        let right = build_date_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![19105, 19103, 19107]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +------------+------------+------------+------------+------------+------------+
+        | a1         | b1         | c1         | a2         | b1         | c2         |
+        +------------+------------+------------+------------+------------+------------+
+        | 1970-01-04 | 2022-04-23 | 1970-01-10 | 1970-01-31 | 2022-04-25 | 1970-04-01 |
+        +------------+------------+------------+------------+------------+------------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_date64_inner_less_than() -> Result<()> {
+        // +----+---------------+----+
+        // | a1 |     b1        | c1 |
+        // +----+---------------+----+
+        // | 1  | 1650903441000 |  7 |
+        // | 2  | 1650903441000 |  8 |
+        // | 3  | 1650703441000 |  9 |
+        // +----+---------------+----+
+        let left = build_date64_table(
+            ("a1", &vec![1, 2, 3]),
+            ("b1", &vec![1650903441000, 1650903441000, 1650703441000]),
+            ("c1", &vec![7, 8, 9]),
+        );
+
+        // +----+---------------+----+
+        // | a2 |     b1        | c2 |
+        // +----+---------------+----+
+        // | 10 | 1650703441000 | 70 |
+        // | 20 | 1650503441000 | 80 |
+        // | 30 | 1650903441000 | 90 |
+        // +----+---------------+----+
+        let right = build_date64_table(
+            ("a2", &vec![10, 20, 30]),
+            ("b1", &vec![1650703441000, 1650503441000, 1650903441000]),
+            ("c2", &vec![70, 80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Inner).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        | a1                      | b1                  | c1                      | a2                      | b1                  | c2                      |
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        | 1970-01-01T00:00:00.003 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.009 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_date64_right_less_than() -> Result<()> {
+        // +----+---------------+----+
+        // | a1 |     b1        | c1 |
+        // +----+---------------+----+
+        // | 1  | 1650903441000 |  7 |
+        // | 2  | 1650703441000 |  8 |
+        // +----+---------------+----+
+        let left = build_date64_table(
+            ("a1", &vec![1, 2]),
+            ("b1", &vec![1650903441000, 1650703441000]),
+            ("c1", &vec![7, 8]),
+        );
+
+        // +----+---------------+----+
+        // | a2 |     b1        | c2 |
+        // +----+---------------+----+
+        // | 10 | 1650703441000 | 80 |
+        // | 20 | 1650903441000 | 90 |
+        // +----+---------------+----+
+        let right = build_date64_table(
+            ("a2", &vec![10, 20]),
+            ("b1", &vec![1650703441000, 1650903441000]),
+            ("c2", &vec![80, 90]),
+        );
+
+        let on = (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        );
+
+        let (_, batches) =
+            join_collect(left, right, on, Operator::Lt, JoinType::Right).await?;
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        | a1                      | b1                  | c1                      | a2                      | b1                  | c2                      |
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        | 1970-01-01T00:00:00.002 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.008 | 1970-01-01T00:00:00.020 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
+        |                         |                     |                         | 1970-01-01T00:00:00.010 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.080 |
+        +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+        ");
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs b/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs
new file mode 100644
index 0000000000000..fb1c4b160528d
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs
@@ -0,0 +1,795 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::Array;
+use arrow::{
+    array::{ArrayRef, BooleanBufferBuilder, RecordBatch},
+    compute::concat_batches,
+    util::bit_util,
+};
+use arrow_schema::{SchemaRef, SortOptions};
+use datafusion_common::not_impl_err;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{JoinSide, Result, internal_err};
+use datafusion_execution::{
+    SendableRecordBatchStream,
+    memory_pool::{MemoryConsumer, MemoryReservation},
+};
+use datafusion_expr::{JoinType, Operator};
+use datafusion_physical_expr::equivalence::join_equivalence_properties;
+use datafusion_physical_expr::{
+    Distribution, LexOrdering, OrderingRequirements, PhysicalExpr, PhysicalExprRef,
+    PhysicalSortExpr,
+};
+use datafusion_physical_expr_common::physical_expr::fmt_sql;
+use futures::TryStreamExt;
+use parking_lot::Mutex;
+use std::fmt::Formatter;
+use std::sync::Arc;
+use std::sync::atomic::AtomicUsize;
+
+use crate::execution_plan::{EmissionType, boundedness_from_children};
+
+use crate::joins::piecewise_merge_join::classic_join::{
+    ClassicPWMJStream, PiecewiseMergeJoinStreamState,
+};
+use crate::joins::piecewise_merge_join::utils::{
+    build_visited_indices_map, is_existence_join, is_right_existence_join,
+};
+use crate::joins::utils::asymmetric_join_output_partitioning;
+use crate::metrics::MetricsSet;
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlanProperties, check_if_same_properties,
+};
+use crate::{
+    ExecutionPlan, PlanProperties,
+    joins::{
+        SharedBitmapBuilder,
+        utils::{BuildProbeJoinMetrics, OnceAsync, OnceFut, build_join_schema},
+    },
+    metrics::ExecutionPlanMetricsSet,
+    spill::get_record_batch_memory_size,
+};
+
+/// `PiecewiseMergeJoinExec` is a join execution plan that only evaluates single range filter and show much
+/// better performance for these workloads than `NestedLoopJoin`
+///
+/// The physical planner will choose to evaluate this join when there is only one comparison filter. This
+/// is a binary expression which contains [`Operator::Lt`], [`Operator::LtEq`], [`Operator::Gt`], and
+/// [`Operator::GtEq`].:
+/// Examples:
+///  - `col0` < `colb`, `col0` <= `colb`, `col0` > `colb`, `col0` >= `colb`
+///
+/// # Execution Plan Inputs
+/// For `PiecewiseMergeJoin` we label all right inputs as the `streamed' side and the left outputs as the
+/// 'buffered' side.
+///
+/// `PiecewiseMergeJoin` takes a sorted input for the side to be buffered and is able to sort streamed record
+/// batches during processing. Sorted input must specifically be ascending/descending based on the operator.
+///
+/// # Algorithms
+/// Classic joins are processed differently compared to existence joins.
+///
+/// ## Classic Joins (Inner, Full, Left, Right)
+/// For classic joins we buffer the build side and stream the probe side (the "probe" side).
+/// Both sides are sorted so that we can iterate from index 0 to the end on each side.  This ordering ensures
+/// that when we find the first matching pair of rows, we can emit the current stream row joined with all remaining
+/// probe rows from the match position onward, without rescanning earlier probe rows.
+///
+/// For `<` and `<=` operators, both inputs are sorted in **descending** order, while for `>` and `>=` operators
+/// they are sorted in **ascending** order. This choice ensures that the pointer on the buffered side can advance
+/// monotonically as we stream new batches from the stream side.
+///
+/// The streamed side may arrive unsorted, so this operator sorts each incoming batch in memory before
+/// processing. The buffered side is required to be globally sorted; the plan declares this requirement
+/// in `requires_input_order`, which allows the optimizer to automatically insert a `SortExec` on that side if needed.
+/// By the time this operator runs, the buffered side is guaranteed to be in the proper order.
+///
+/// The pseudocode for the algorithm looks like this:
+///
+/// ```text
+/// for stream_row in stream_batch:
+///     for buffer_row in buffer_batch:
+///         if compare(stream_row, probe_row):
+///             output stream_row X buffer_batch[buffer_row:]
+///         else:
+///             continue
+/// ```
+///
+/// The algorithm uses the streamed side (larger) to drive the loop. This is due to every row on the stream side iterating
+/// the buffered side to find every first match. By doing this, each match can output more result so that output
+/// handling can be better vectorized for performance.
+///
+/// Here is an example:
+///
+/// We perform a `JoinType::Left` with these two batches and the operator being `Operator::Lt`(<). For each
+/// row on the streamed side we move a pointer on the buffered until it matches the condition. Once we reach
+/// the row which matches (in this case with row 1 on streamed will have its first match on row 2 on
+/// buffered; 100 < 200 is true), we can emit all rows after that match. We can emit the rows like this because
+/// if the batch is sorted in ascending order, every subsequent row will also satisfy the condition as they will
+/// all be larger values.
+///
+/// ```text
+/// SQL statement:
+/// SELECT *
+/// FROM (VALUES (100), (200), (500)) AS streamed(a)
+/// LEFT JOIN (VALUES (100), (200), (200), (300), (400)) AS buffered(b)
+///   ON streamed.a < buffered.b;
+///
+/// Processing Row 1:
+///
+///       Sorted Buffered Side                                         Sorted Streamed Side
+///       ┌──────────────────┐                                         ┌──────────────────┐
+///     1 │       100        │                                       1 │       100        │
+///       ├──────────────────┤                                         ├──────────────────┤
+///     2 │       200        │ ─┐                                    2 │       200        │
+///       ├──────────────────┤  │  For row 1 on streamed side with     ├──────────────────┤
+///     3 │       200        │  │  value 100, we emit rows 2 - 5.    3 │       500        │
+///       ├──────────────────┤  │  as matches when the operator is     └──────────────────┘
+///     4 │       300        │  │  `Operator::Lt` (<) Emitting all
+///       ├──────────────────┤  │  rows after the first match (row
+///     5 │       400        │ ─┘  2 buffered side; 100 < 200)
+///       └──────────────────┘
+///
+/// Processing Row 2:
+///   By sorting the streamed side we know
+///
+///       Sorted Buffered Side                                         Sorted Streamed Side
+///       ┌──────────────────┐                                         ┌──────────────────┐
+///     1 │       100        │                                       1 │       100        │
+///       ├──────────────────┤                                         ├──────────────────┤
+///     2 │       200        │ <- Start here when probing for the    2 │       200        │
+///       ├──────────────────┤    streamed side row 2.                 ├──────────────────┤
+///     3 │       200        │                                       3 │       500        │
+///       ├──────────────────┤                                         └──────────────────┘
+///     4 │       300        │
+///       ├──────────────────┤
+///     5 │       400        │
+///       └──────────────────┘
+/// ```
+///
+/// ## Existence Joins (Semi, Anti, Mark)
+/// Existence joins are made magnitudes of times faster with a `PiecewiseMergeJoin` as we only need to find
+/// the min/max value of the streamed side to be able to emit all matches on the buffered side. By putting
+/// the side we need to mark onto the sorted buffer side, we can emit all these matches at once.
+///
+/// For less than operations (`<`) both inputs are to be sorted in descending order and vice versa for greater
+/// than (`>`) operations. `SortExec` is used to enforce sorting on the buffered side and streamed side does not
+/// need to be sorted due to only needing to find the min/max.
+///
+/// For Left Semi, Anti, and Mark joins we swap the inputs so that the marked side is on the buffered side.
+///
+/// The pseudocode for the algorithm looks like this:
+///
+/// ```text
+/// // Using the example of a less than `<` operation
+/// let max = max_batch(streamed_batch)
+///
+/// for buffer_row in buffer_batch:
+///     if buffer_row < max:
+///         output buffer_batch[buffer_row:]
+/// ```
+///
+/// Only need to find the min/max value and iterate through the buffered side once.
+///
+/// Here is an example:
+/// We perform a `JoinType::LeftSemi` with these two batches and the operator being `Operator::Lt`(<). Because
+/// the operator is `Operator::Lt` we can find the minimum value in the streamed side; in this case it is 200.
+/// We can then advance a pointer from the start of the buffer side until we find the first value that satisfies
+/// the predicate. All rows after that first matched value satisfy the condition 200 < x so we can mark all of
+/// those rows as matched.
+///
+/// ```text
+/// SQL statement:
+/// SELECT *
+/// FROM (VALUES (500), (200), (300)) AS streamed(a)
+/// LEFT SEMI JOIN (VALUES (100), (200), (200), (300), (400)) AS buffered(b)
+///   ON streamed.a < buffered.b;
+///
+///          Sorted Buffered Side             Unsorted Streamed Side
+///            ┌──────────────────┐          ┌──────────────────┐
+///          1 │       100        │        1 │       500        │
+///            ├──────────────────┤          ├──────────────────┤
+///          2 │       200        │        2 │       200        │
+///            ├──────────────────┤          ├──────────────────┤
+///          3 │       200        │        3 │       300        │
+///            ├──────────────────┤          └──────────────────┘
+///          4 │       300        │ ─┐
+///            ├──────────────────┤  | We emit matches for row 4 - 5
+///          5 │       400        │ ─┘ on the buffered side.
+///            └──────────────────┘
+///             min value: 200
+/// ```
+///
+/// For both types of joins, the buffered side must be sorted ascending for `Operator::Lt` (<) or
+/// `Operator::LtEq` (<=) and descending for `Operator::Gt` (>) or `Operator::GtEq` (>=).
+///
+/// # Partitioning Logic
+/// Piecewise Merge Join requires one buffered side partition + round robin partitioned stream side. A counter
+/// is used in the buffered side to coordinate when all streamed partitions are finished execution. This allows
+/// for processing the rest of the unmatched rows for Left and Full joins. The last partition that finishes
+/// execution will be responsible for outputting the unmatched rows.
+///
+/// # Performance Explanation (cost)
+/// Piecewise Merge Join is used over Nested Loop Join due to its superior performance. Here is the breakdown:
+///
+/// R: Buffered Side
+/// S: Streamed Side
+///
+/// ## Piecewise Merge Join (PWMJ)
+///
+/// # Classic Join:
+/// Requires sorting the probe side and, for each probe row, scanning the buffered side until the first match
+/// is found.
+///     Complexity: `O(sort(S) + num_of_batches(|S|) * scan(R))`.
+///
+/// # Mark Join:
+/// Sorts the probe side, then computes the min/max range of the probe keys and scans the buffered side only
+/// within that range.
+///   Complexity: `O(|S| + scan(R[range]))`.
+///
+/// ## Nested Loop Join
+/// Compares every row from `S` with every row from `R`.
+///   Complexity: `O(|S| * |R|)`.
+///
+/// ## Nested Loop Join
+///   Always going to be probe (O(S) * O(R)).
+///
+/// # Further Reference Material
+/// DuckDB blog on Range Joins: [Range Joins in DuckDB](https://duckdb.org/2022/05/27/iejoin.html)
+#[derive(Debug)]
+pub struct PiecewiseMergeJoinExec {
+    /// Left buffered execution plan
+    pub buffered: Arc<dyn ExecutionPlan>,
+    /// Right streamed execution plan
+    pub streamed: Arc<dyn ExecutionPlan>,
+    /// The two expressions being compared
+    pub on: (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>),
+    /// Comparison operator in the range predicate
+    pub operator: Operator,
+    /// How the join is performed
+    pub join_type: JoinType,
+    /// The schema once the join is applied
+    schema: SchemaRef,
+    /// Buffered data
+    buffered_fut: OnceAsync<BufferedSideData>,
+    /// Execution metrics
+    metrics: ExecutionPlanMetricsSet,
+
+    /// Sort expressions - See above for more details [`PiecewiseMergeJoinExec`]
+    ///
+    /// The left sort order, descending for `<`, `<=` operations + ascending for `>`, `>=` operations
+    left_child_plan_required_order: LexOrdering,
+    /// The right sort order, descending for `<`, `<=` operations + ascending for `>`, `>=` operations
+    /// Unsorted for mark joins
+    right_batch_required_orders: LexOrdering,
+
+    /// This determines the sort order of all join columns used in sorting the stream and buffered execution plans.
+    sort_options: SortOptions,
+    /// Cache holding plan properties like equivalences, output partitioning etc.
+    cache: Arc<PlanProperties>,
+    /// Number of partitions to process
+    num_partitions: usize,
+}
+
+impl PiecewiseMergeJoinExec {
+    pub fn try_new(
+        buffered: Arc<dyn ExecutionPlan>,
+        streamed: Arc<dyn ExecutionPlan>,
+        on: (Arc<dyn PhysicalExpr>, Arc<dyn PhysicalExpr>),
+        operator: Operator,
+        join_type: JoinType,
+        num_partitions: usize,
+    ) -> Result<Self> {
+        // TODO: Implement existence joins for PiecewiseMergeJoin
+        if is_existence_join(join_type) {
+            return not_impl_err!(
+                "Existence Joins are currently not supported for PiecewiseMergeJoin"
+            );
+        }
+
+        // Take the operator and enforce a sort order on the streamed + buffered side based on
+        // the operator type.
+        let sort_options = match operator {
+            Operator::Lt | Operator::LtEq => {
+                // For left existence joins the inputs will be swapped so the sort
+                // options are switched
+                if is_right_existence_join(join_type) {
+                    SortOptions::new(false, true)
+                } else {
+                    SortOptions::new(true, true)
+                }
+            }
+            Operator::Gt | Operator::GtEq => {
+                if is_right_existence_join(join_type) {
+                    SortOptions::new(true, true)
+                } else {
+                    SortOptions::new(false, true)
+                }
+            }
+            _ => {
+                return internal_err!(
+                    "Cannot contain non-range operator in PiecewiseMergeJoinExec"
+                );
+            }
+        };
+
+        // Give the same `sort_option for comparison later`
+        let left_child_plan_required_order =
+            vec![PhysicalSortExpr::new(Arc::clone(&on.0), sort_options)];
+        let right_batch_required_orders =
+            vec![PhysicalSortExpr::new(Arc::clone(&on.1), sort_options)];
+
+        let Some(left_child_plan_required_order) =
+            LexOrdering::new(left_child_plan_required_order)
+        else {
+            return internal_err!(
+                "PiecewiseMergeJoinExec requires valid sort expressions for its left side"
+            );
+        };
+        let Some(right_batch_required_orders) =
+            LexOrdering::new(right_batch_required_orders)
+        else {
+            return internal_err!(
+                "PiecewiseMergeJoinExec requires valid sort expressions for its right side"
+            );
+        };
+
+        let buffered_schema = buffered.schema();
+        let streamed_schema = streamed.schema();
+
+        // Create output schema for the join
+        let schema =
+            Arc::new(build_join_schema(&buffered_schema, &streamed_schema, &join_type).0);
+        let cache = Self::compute_properties(
+            &buffered,
+            &streamed,
+            Arc::clone(&schema),
+            join_type,
+            &on,
+        )?;
+
+        Ok(Self {
+            streamed,
+            buffered,
+            on,
+            operator,
+            join_type,
+            schema,
+            buffered_fut: Default::default(),
+            metrics: ExecutionPlanMetricsSet::new(),
+            left_child_plan_required_order,
+            right_batch_required_orders,
+            sort_options,
+            cache: Arc::new(cache),
+            num_partitions,
+        })
+    }
+
+    /// Reference to buffered side execution plan
+    pub fn buffered(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.buffered
+    }
+
+    /// Reference to streamed side execution plan
+    pub fn streamed(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.streamed
+    }
+
+    /// Join type
+    pub fn join_type(&self) -> JoinType {
+        self.join_type
+    }
+
+    /// Reference to sort options
+    pub fn sort_options(&self) -> &SortOptions {
+        &self.sort_options
+    }
+
+    /// Get probe side (streamed side) for the PiecewiseMergeJoin
+    /// In current implementation, probe side is determined according to join type.
+    pub fn probe_side(join_type: &JoinType) -> JoinSide {
+        match join_type {
+            JoinType::Right
+            | JoinType::Inner
+            | JoinType::Full
+            | JoinType::RightSemi
+            | JoinType::RightAnti
+            | JoinType::RightMark => JoinSide::Right,
+            JoinType::Left
+            | JoinType::LeftAnti
+            | JoinType::LeftSemi
+            | JoinType::LeftMark => JoinSide::Left,
+        }
+    }
+
+    pub fn compute_properties(
+        buffered: &Arc<dyn ExecutionPlan>,
+        streamed: &Arc<dyn ExecutionPlan>,
+        schema: SchemaRef,
+        join_type: JoinType,
+        join_on: &(PhysicalExprRef, PhysicalExprRef),
+    ) -> Result<PlanProperties> {
+        let eq_properties = join_equivalence_properties(
+            buffered.equivalence_properties().clone(),
+            streamed.equivalence_properties().clone(),
+            &join_type,
+            schema,
+            &Self::maintains_input_order(join_type),
+            Some(Self::probe_side(&join_type)),
+            std::slice::from_ref(join_on),
+        )?;
+
+        let output_partitioning =
+            asymmetric_join_output_partitioning(buffered, streamed, &join_type)?;
+
+        Ok(PlanProperties::new(
+            eq_properties,
+            output_partitioning,
+            EmissionType::Incremental,
+            boundedness_from_children([buffered, streamed]),
+        ))
+    }
+
+    // TODO: Add input order. Now they're all `false` indicating it will not maintain the input order.
+    // However, for certain join types the order is maintained. This can be updated in the future after
+    // more testing.
+    fn maintains_input_order(join_type: JoinType) -> Vec<bool> {
+        match join_type {
+            // The existence side is expected to come in sorted
+            JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
+                vec![false, false]
+            }
+            JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+                vec![false, false]
+            }
+            // Left, Right, Full, Inner Join is not guaranteed to maintain
+            // input order as the streamed side will be sorted during
+            // execution for `PiecewiseMergeJoin`
+            _ => vec![false, false],
+        }
+    }
+
+    // TODO
+    pub fn swap_inputs(&self) -> Result<Arc<dyn ExecutionPlan>> {
+        todo!()
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        let buffered = children.swap_remove(0);
+        let streamed = children.swap_remove(0);
+        Self {
+            buffered,
+            streamed,
+            on: self.on.clone(),
+            operator: self.operator,
+            join_type: self.join_type,
+            schema: Arc::clone(&self.schema),
+            left_child_plan_required_order: self.left_child_plan_required_order.clone(),
+            right_batch_required_orders: self.right_batch_required_orders.clone(),
+            sort_options: self.sort_options,
+            cache: Arc::clone(&self.cache),
+            num_partitions: self.num_partitions,
+
+            // Re-set state.
+            metrics: ExecutionPlanMetricsSet::new(),
+            buffered_fut: Default::default(),
+        }
+    }
+}
+
+impl ExecutionPlan for PiecewiseMergeJoinExec {
+    fn name(&self) -> &str {
+        "PiecewiseMergeJoinExec"
+    }
+
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.buffered, &self.streamed]
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to the two expressions being compared in the range predicate
+        f(self.on.0.as_ref())?.visit_sibling(|| f(self.on.1.as_ref()))
+    }
+
+    fn required_input_distribution(&self) -> Vec<Distribution> {
+        vec![
+            Distribution::SinglePartition,
+            Distribution::UnspecifiedDistribution,
+        ]
+    }
+
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
+        // Existence joins don't need to be sorted on one side.
+        if is_right_existence_join(self.join_type) {
+            unimplemented!()
+        } else {
+            // Sort the right side in memory, so we do not need to enforce any sorting
+            vec![
+                Some(OrderingRequirements::from(
+                    self.left_child_plan_required_order.clone(),
+                )),
+                None,
+            ]
+        }
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
+        match &children[..] {
+            [left, right] => Ok(Arc::new(PiecewiseMergeJoinExec::try_new(
+                Arc::clone(left),
+                Arc::clone(right),
+                self.on.clone(),
+                self.operator,
+                self.join_type,
+                self.num_partitions,
+            )?)),
+            _ => internal_err!(
+                "PiecewiseMergeJoin should have 2 children, found {}",
+                children.len()
+            ),
+        }
+    }
+
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(self.with_new_children_and_same_properties(vec![
+            Arc::clone(&self.buffered),
+            Arc::clone(&self.streamed),
+        ])))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<datafusion_execution::TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let on_buffered = Arc::clone(&self.on.0);
+        let on_streamed = Arc::clone(&self.on.1);
+
+        let metrics = BuildProbeJoinMetrics::new(partition, &self.metrics);
+        let buffered_fut = self.buffered_fut.try_once(|| {
+            let reservation = MemoryConsumer::new("PiecewiseMergeJoinInput")
+                .register(context.memory_pool());
+
+            let buffered_stream = self.buffered.execute(0, Arc::clone(&context))?;
+            Ok(build_buffered_data(
+                buffered_stream,
+                Arc::clone(&on_buffered),
+                metrics.clone(),
+                reservation,
+                build_visited_indices_map(self.join_type),
+                self.num_partitions,
+            ))
+        })?;
+
+        let streamed = self.streamed.execute(partition, Arc::clone(&context))?;
+
+        let batch_size = context.session_config().batch_size();
+
+        // TODO: Add existence joins + this is guarded at physical planner
+        if is_existence_join(self.join_type()) {
+            unreachable!()
+        } else {
+            Ok(Box::pin(ClassicPWMJStream::try_new(
+                Arc::clone(&self.schema),
+                on_streamed,
+                self.join_type,
+                self.operator,
+                streamed,
+                BufferedSide::Initial(BufferedSideInitialState { buffered_fut }),
+                PiecewiseMergeJoinStreamState::WaitBufferedSide,
+                self.sort_options,
+                metrics,
+                batch_size,
+            )))
+        }
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+}
+
+impl DisplayAs for PiecewiseMergeJoinExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        let on_str = format!(
+            "({} {} {})",
+            fmt_sql(self.on.0.as_ref()),
+            self.operator,
+            fmt_sql(self.on.1.as_ref())
+        );
+
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(
+                    f,
+                    "PiecewiseMergeJoin: operator={:?}, join_type={:?}, on={}",
+                    self.operator, self.join_type, on_str
+                )
+            }
+
+            DisplayFormatType::TreeRender => {
+                writeln!(f, "operator={:?}", self.operator)?;
+                if self.join_type != JoinType::Inner {
+                    writeln!(f, "join_type={:?}", self.join_type)?;
+                }
+                writeln!(f, "on={on_str}")
+            }
+        }
+    }
+}
+
+async fn build_buffered_data(
+    buffered: SendableRecordBatchStream,
+    on_buffered: PhysicalExprRef,
+    metrics: BuildProbeJoinMetrics,
+    reservation: MemoryReservation,
+    build_map: bool,
+    remaining_partitions: usize,
+) -> Result<BufferedSideData> {
+    let schema = buffered.schema();
+
+    // Combine batches and record number of rows
+    let initial = (Vec::new(), 0, metrics, reservation);
+    let (batches, num_rows, metrics, reservation) = buffered
+        .try_fold(initial, |mut acc, batch| async {
+            let batch_size = get_record_batch_memory_size(&batch);
+            acc.3.try_grow(batch_size)?;
+            acc.2.build_mem_used.add(batch_size);
+            acc.2.build_input_batches.add(1);
+            acc.2.build_input_rows.add(batch.num_rows());
+            // Update row count
+            acc.1 += batch.num_rows();
+            // Push batch to output
+            acc.0.push(batch);
+            Ok(acc)
+        })
+        .await?;
+
+    let single_batch = concat_batches(&schema, batches.iter())?;
+
+    // Evaluate physical expression on the buffered side.
+    let buffered_values = on_buffered
+        .evaluate(&single_batch)?
+        .into_array(single_batch.num_rows())?;
+
+    // We add the single batch size + the memory of the join keys
+    // size of the size estimation
+    let size_estimation = get_record_batch_memory_size(&single_batch)
+        + buffered_values.get_array_memory_size();
+    reservation.try_grow(size_estimation)?;
+    metrics.build_mem_used.add(size_estimation);
+
+    // Created visited indices bitmap only if the join type requires it
+    let visited_indices_bitmap = if build_map {
+        let bitmap_size = bit_util::ceil(single_batch.num_rows(), 8);
+        reservation.try_grow(bitmap_size)?;
+        metrics.build_mem_used.add(bitmap_size);
+
+        let mut bitmap_buffer = BooleanBufferBuilder::new(single_batch.num_rows());
+        bitmap_buffer.append_n(num_rows, false);
+        bitmap_buffer
+    } else {
+        BooleanBufferBuilder::new(0)
+    };
+
+    let buffered_data = BufferedSideData::new(
+        single_batch,
+        buffered_values,
+        Mutex::new(visited_indices_bitmap),
+        remaining_partitions,
+        reservation,
+    );
+
+    Ok(buffered_data)
+}
+
+pub(super) struct BufferedSideData {
+    pub(super) batch: RecordBatch,
+    values: ArrayRef,
+    pub(super) visited_indices_bitmap: SharedBitmapBuilder,
+    pub(super) remaining_partitions: AtomicUsize,
+    _reservation: MemoryReservation,
+}
+
+impl BufferedSideData {
+    pub(super) fn new(
+        batch: RecordBatch,
+        values: ArrayRef,
+        visited_indices_bitmap: SharedBitmapBuilder,
+        remaining_partitions: usize,
+        reservation: MemoryReservation,
+    ) -> Self {
+        Self {
+            batch,
+            values,
+            visited_indices_bitmap,
+            remaining_partitions: AtomicUsize::new(remaining_partitions),
+            _reservation: reservation,
+        }
+    }
+
+    pub(super) fn batch(&self) -> &RecordBatch {
+        &self.batch
+    }
+
+    pub(super) fn values(&self) -> &ArrayRef {
+        &self.values
+    }
+}
+
+pub(super) enum BufferedSide {
+    /// Indicates that build-side not collected yet
+    Initial(BufferedSideInitialState),
+    /// Indicates that build-side data has been collected
+    Ready(BufferedSideReadyState),
+}
+
+impl BufferedSide {
+    // Takes a mutable state of the buffered row batches
+    pub(super) fn try_as_initial_mut(&mut self) -> Result<&mut BufferedSideInitialState> {
+        match self {
+            BufferedSide::Initial(state) => Ok(state),
+            _ => internal_err!("Expected build side in initial state"),
+        }
+    }
+
+    pub(super) fn try_as_ready(&self) -> Result<&BufferedSideReadyState> {
+        match self {
+            BufferedSide::Ready(state) => Ok(state),
+            _ => {
+                internal_err!("Expected build side in ready state")
+            }
+        }
+    }
+
+    /// Tries to extract BuildSideReadyState from BuildSide enum.
+    /// Returns an error if state is not Ready.
+    pub(super) fn try_as_ready_mut(&mut self) -> Result<&mut BufferedSideReadyState> {
+        match self {
+            BufferedSide::Ready(state) => Ok(state),
+            _ => internal_err!("Expected build side in ready state"),
+        }
+    }
+}
+
+pub(super) struct BufferedSideInitialState {
+    pub(crate) buffered_fut: OnceFut<BufferedSideData>,
+}
+
+pub(super) struct BufferedSideReadyState {
+    /// Collected build-side data
+    pub(super) buffered_data: Arc<BufferedSideData>,
+}
diff --git a/datafusion/physical-plan/src/joins/piecewise_merge_join/mod.rs b/datafusion/physical-plan/src/joins/piecewise_merge_join/mod.rs
new file mode 100644
index 0000000000000..c85a7cc16f657
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/piecewise_merge_join/mod.rs
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! PiecewiseMergeJoin is currently experimental
+
+pub use exec::PiecewiseMergeJoinExec;
+
+mod classic_join;
+mod exec;
+mod utils;
diff --git a/datafusion/physical-plan/src/joins/piecewise_merge_join/utils.rs b/datafusion/physical-plan/src/joins/piecewise_merge_join/utils.rs
new file mode 100644
index 0000000000000..5bbb496322b5f
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/piecewise_merge_join/utils.rs
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_expr::JoinType;
+
+// Returns boolean for whether the join is a right existence join
+pub(super) fn is_right_existence_join(join_type: JoinType) -> bool {
+    matches!(
+        join_type,
+        JoinType::RightAnti | JoinType::RightSemi | JoinType::RightMark
+    )
+}
+
+// Returns boolean for whether the join is an existence join
+pub(super) fn is_existence_join(join_type: JoinType) -> bool {
+    matches!(
+        join_type,
+        JoinType::LeftAnti
+            | JoinType::RightAnti
+            | JoinType::LeftSemi
+            | JoinType::RightSemi
+            | JoinType::LeftMark
+            | JoinType::RightMark
+    )
+}
+
+// Returns boolean to check if the join type needs to record
+// buffered side matches for classic joins
+pub(super) fn need_produce_result_in_final(join_type: JoinType) -> bool {
+    matches!(join_type, JoinType::Full | JoinType::Left)
+}
+
+// Returns boolean for whether or not we need to build the buffered side
+// bitmap for marking matched rows on the buffered side.
+pub(super) fn build_visited_indices_map(join_type: JoinType) -> bool {
+    matches!(
+        join_type,
+        JoinType::Full
+            | JoinType::Left
+            | JoinType::LeftAnti
+            | JoinType::RightAnti
+            | JoinType::LeftSemi
+            | JoinType::RightSemi
+            | JoinType::LeftMark
+            | JoinType::RightMark
+    )
+}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs
deleted file mode 100644
index cadd2b53ab117..0000000000000
--- a/datafusion/physical-plan/src/joins/sort_merge_join.rs
+++ /dev/null
@@ -1,5106 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Defines the Sort-Merge join execution plan.
-//! A Sort-Merge join plan consumes two sorted children plans and produces
-//! joined output by given join type and other options.
-
-use std::any::Any;
-use std::cmp::Ordering;
-use std::collections::{HashMap, VecDeque};
-use std::fmt::Formatter;
-use std::fs::File;
-use std::io::BufReader;
-use std::mem::size_of;
-use std::ops::Range;
-use std::pin::Pin;
-use std::sync::atomic::AtomicUsize;
-use std::sync::atomic::Ordering::Relaxed;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-use crate::execution_plan::{boundedness_from_children, EmissionType};
-use crate::expressions::PhysicalSortExpr;
-use crate::joins::utils::{
-    build_join_schema, check_join_is_valid, estimate_join_statistics,
-    reorder_output_after_swap, symmetric_join_output_partitioning, JoinFilter, JoinOn,
-    JoinOnRef,
-};
-use crate::metrics::{
-    Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, SpillMetrics,
-};
-use crate::projection::{
-    join_allows_pushdown, join_table_borders, new_join_children,
-    physical_to_column_exprs, update_join_on, ProjectionExec,
-};
-use crate::spill::spill_manager::SpillManager;
-use crate::{
-    metrics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
-    ExecutionPlanProperties, PhysicalExpr, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics,
-};
-
-use arrow::array::{types::UInt64Type, *};
-use arrow::compute::{
-    self, concat_batches, filter_record_batch, is_not_null, take, SortOptions,
-};
-use arrow::datatypes::{DataType, SchemaRef, TimeUnit};
-use arrow::error::ArrowError;
-use arrow::ipc::reader::StreamReader;
-use datafusion_common::{
-    exec_err, internal_err, not_impl_err, plan_err, DataFusionError, HashSet, JoinSide,
-    JoinType, Result,
-};
-use datafusion_execution::disk_manager::RefCountedTempFile;
-use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
-use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_execution::TaskContext;
-use datafusion_physical_expr::equivalence::join_equivalence_properties;
-use datafusion_physical_expr::PhysicalExprRef;
-use datafusion_physical_expr_common::physical_expr::fmt_sql;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
-
-use futures::{Stream, StreamExt};
-
-/// Join execution plan that executes equi-join predicates on multiple partitions using Sort-Merge
-/// join algorithm and applies an optional filter post join. Can be used to join arbitrarily large
-/// inputs where one or both of the inputs don't fit in the available memory.
-///
-/// # Join Expressions
-///
-/// Equi-join predicate (e.g. `<col1> = <col2>`) expressions are represented by [`Self::on`].
-///
-/// Non-equality predicates, which can not be pushed down to join inputs (e.g.
-/// `<col1> != <col2>`) are known as "filter expressions" and are evaluated
-/// after the equijoin predicates. They are represented by [`Self::filter`]. These are optional
-/// expressions.
-///
-/// # Sorting
-///
-/// Assumes that both the left and right input to the join are pre-sorted. It is not the
-/// responsibility of this execution plan to sort the inputs.
-///
-/// # "Streamed" vs "Buffered"
-///
-/// The number of record batches of streamed input currently present in the memory will depend
-/// on the output batch size of the execution plan. There is no spilling support for streamed input.
-/// The comparisons are performed from values of join keys in streamed input with the values of
-/// join keys in buffered input. One row in streamed record batch could be matched with multiple rows in
-/// buffered input batches. The streamed input is managed through the states in `StreamedState`
-/// and streamed input batches are represented by `StreamedBatch`.
-///
-/// Buffered input is buffered for all record batches having the same value of join key.
-/// If the memory limit increases beyond the specified value and spilling is enabled,
-/// buffered batches could be spilled to disk. If spilling is disabled, the execution
-/// will fail under the same conditions. Multiple record batches of buffered could currently reside
-/// in memory/disk during the execution. The number of buffered batches residing in
-/// memory/disk depends on the number of rows of buffered input having the same value
-/// of join key as that of streamed input rows currently present in memory. Due to pre-sorted inputs,
-/// the algorithm understands when it is not needed anymore, and releases the buffered batches
-/// from memory/disk. The buffered input is managed through the states in `BufferedState`
-/// and buffered input batches are represented by `BufferedBatch`.
-///
-/// Depending on the type of join, left or right input may be selected as streamed or buffered
-/// respectively. For example, in a left-outer join, the left execution plan will be selected as
-/// streamed input while in a right-outer join, the right execution plan will be selected as the
-/// streamed input.
-///
-/// Reference for the algorithm:
-/// <https://en.wikipedia.org/wiki/Sort-merge_join>.
-///
-/// Helpful short video demonstration:
-/// <https://www.youtube.com/watch?v=jiWCPJtDE2c>.
-#[derive(Debug, Clone)]
-pub struct SortMergeJoinExec {
-    /// Left sorted joining execution plan
-    pub left: Arc<dyn ExecutionPlan>,
-    /// Right sorting joining execution plan
-    pub right: Arc<dyn ExecutionPlan>,
-    /// Set of common columns used to join on
-    pub on: JoinOn,
-    /// Filters which are applied while finding matching rows
-    pub filter: Option<JoinFilter>,
-    /// How the join is performed
-    pub join_type: JoinType,
-    /// The schema once the join is applied
-    schema: SchemaRef,
-    /// Execution metrics
-    metrics: ExecutionPlanMetricsSet,
-    /// The left SortExpr
-    left_sort_exprs: LexOrdering,
-    /// The right SortExpr
-    right_sort_exprs: LexOrdering,
-    /// Sort options of join columns used in sorting left and right execution plans
-    pub sort_options: Vec<SortOptions>,
-    /// If null_equals_null is true, null == null else null != null
-    pub null_equals_null: bool,
-    /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
-}
-
-impl SortMergeJoinExec {
-    /// Tries to create a new [SortMergeJoinExec].
-    /// The inputs are sorted using `sort_options` are applied to the columns in the `on`
-    /// # Error
-    /// This function errors when it is not possible to join the left and right sides on keys `on`.
-    pub fn try_new(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        filter: Option<JoinFilter>,
-        join_type: JoinType,
-        sort_options: Vec<SortOptions>,
-        null_equals_null: bool,
-    ) -> Result<Self> {
-        let left_schema = left.schema();
-        let right_schema = right.schema();
-
-        check_join_is_valid(&left_schema, &right_schema, &on)?;
-        if sort_options.len() != on.len() {
-            return plan_err!(
-                "Expected number of sort options: {}, actual: {}",
-                on.len(),
-                sort_options.len()
-            );
-        }
-
-        let (left_sort_exprs, right_sort_exprs): (Vec<_>, Vec<_>) = on
-            .iter()
-            .zip(sort_options.iter())
-            .map(|((l, r), sort_op)| {
-                let left = PhysicalSortExpr {
-                    expr: Arc::clone(l),
-                    options: *sort_op,
-                };
-                let right = PhysicalSortExpr {
-                    expr: Arc::clone(r),
-                    options: *sort_op,
-                };
-                (left, right)
-            })
-            .unzip();
-
-        let schema =
-            Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0);
-        let cache =
-            Self::compute_properties(&left, &right, Arc::clone(&schema), join_type, &on);
-        Ok(Self {
-            left,
-            right,
-            on,
-            filter,
-            join_type,
-            schema,
-            metrics: ExecutionPlanMetricsSet::new(),
-            left_sort_exprs: LexOrdering::new(left_sort_exprs),
-            right_sort_exprs: LexOrdering::new(right_sort_exprs),
-            sort_options,
-            null_equals_null,
-            cache,
-        })
-    }
-
-    /// Get probe side (e.g streaming side) information for this sort merge join.
-    /// In current implementation, probe side is determined according to join type.
-    pub fn probe_side(join_type: &JoinType) -> JoinSide {
-        // When output schema contains only the right side, probe side is right.
-        // Otherwise probe side is the left side.
-        match join_type {
-            JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => {
-                JoinSide::Right
-            }
-            JoinType::Inner
-            | JoinType::Left
-            | JoinType::Full
-            | JoinType::LeftAnti
-            | JoinType::LeftSemi
-            | JoinType::LeftMark => JoinSide::Left,
-        }
-    }
-
-    /// Calculate order preservation flags for this sort merge join.
-    fn maintains_input_order(join_type: JoinType) -> Vec<bool> {
-        match join_type {
-            JoinType::Inner => vec![true, false],
-            JoinType::Left
-            | JoinType::LeftSemi
-            | JoinType::LeftAnti
-            | JoinType::LeftMark => vec![true, false],
-            JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => {
-                vec![false, true]
-            }
-            _ => vec![false, false],
-        }
-    }
-
-    /// Set of common columns used to join on
-    pub fn on(&self) -> &[(PhysicalExprRef, PhysicalExprRef)] {
-        &self.on
-    }
-
-    /// Ref to right execution plan
-    pub fn right(&self) -> &Arc<dyn ExecutionPlan> {
-        &self.right
-    }
-
-    /// Join type
-    pub fn join_type(&self) -> JoinType {
-        self.join_type
-    }
-
-    /// Ref to left execution plan
-    pub fn left(&self) -> &Arc<dyn ExecutionPlan> {
-        &self.left
-    }
-
-    /// Ref to join filter
-    pub fn filter(&self) -> &Option<JoinFilter> {
-        &self.filter
-    }
-
-    /// Ref to sort options
-    pub fn sort_options(&self) -> &[SortOptions] {
-        &self.sort_options
-    }
-
-    /// Null equals null
-    pub fn null_equals_null(&self) -> bool {
-        self.null_equals_null
-    }
-
-    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
-    fn compute_properties(
-        left: &Arc<dyn ExecutionPlan>,
-        right: &Arc<dyn ExecutionPlan>,
-        schema: SchemaRef,
-        join_type: JoinType,
-        join_on: JoinOnRef,
-    ) -> PlanProperties {
-        // Calculate equivalence properties:
-        let eq_properties = join_equivalence_properties(
-            left.equivalence_properties().clone(),
-            right.equivalence_properties().clone(),
-            &join_type,
-            schema,
-            &Self::maintains_input_order(join_type),
-            Some(Self::probe_side(&join_type)),
-            join_on,
-        );
-
-        let output_partitioning =
-            symmetric_join_output_partitioning(left, right, &join_type);
-
-        PlanProperties::new(
-            eq_properties,
-            output_partitioning,
-            EmissionType::Incremental,
-            boundedness_from_children([left, right]),
-        )
-    }
-
-    pub fn swap_inputs(&self) -> Result<Arc<dyn ExecutionPlan>> {
-        let left = self.left();
-        let right = self.right();
-        let new_join = SortMergeJoinExec::try_new(
-            Arc::clone(right),
-            Arc::clone(left),
-            self.on()
-                .iter()
-                .map(|(l, r)| (Arc::clone(r), Arc::clone(l)))
-                .collect::<Vec<_>>(),
-            self.filter().as_ref().map(JoinFilter::swap),
-            self.join_type().swap(),
-            self.sort_options.clone(),
-            self.null_equals_null,
-        )?;
-
-        // TODO: OR this condition with having a built-in projection (like
-        //       ordinary hash join) when we support it.
-        if matches!(
-            self.join_type(),
-            JoinType::LeftSemi
-                | JoinType::RightSemi
-                | JoinType::LeftAnti
-                | JoinType::RightAnti
-        ) {
-            Ok(Arc::new(new_join))
-        } else {
-            reorder_output_after_swap(Arc::new(new_join), &left.schema(), &right.schema())
-        }
-    }
-}
-
-impl DisplayAs for SortMergeJoinExec {
-    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
-        match t {
-            DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                let on = self
-                    .on
-                    .iter()
-                    .map(|(c1, c2)| format!("({c1}, {c2})"))
-                    .collect::<Vec<String>>()
-                    .join(", ");
-                write!(
-                    f,
-                    "SortMergeJoin: join_type={:?}, on=[{}]{}",
-                    self.join_type,
-                    on,
-                    self.filter.as_ref().map_or("".to_string(), |f| format!(
-                        ", filter={}",
-                        f.expression()
-                    ))
-                )
-            }
-            DisplayFormatType::TreeRender => {
-                let on = self
-                    .on
-                    .iter()
-                    .map(|(c1, c2)| {
-                        format!("({} = {})", fmt_sql(c1.as_ref()), fmt_sql(c2.as_ref()))
-                    })
-                    .collect::<Vec<String>>()
-                    .join(", ");
-
-                if self.join_type() != JoinType::Inner {
-                    writeln!(f, "join_type={:?}", self.join_type)?;
-                }
-                writeln!(f, "on={on}")
-            }
-        }
-    }
-}
-
-impl ExecutionPlan for SortMergeJoinExec {
-    fn name(&self) -> &'static str {
-        "SortMergeJoinExec"
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
-        &self.cache
-    }
-
-    fn required_input_distribution(&self) -> Vec<Distribution> {
-        let (left_expr, right_expr) = self
-            .on
-            .iter()
-            .map(|(l, r)| (Arc::clone(l), Arc::clone(r)))
-            .unzip();
-        vec![
-            Distribution::HashPartitioned(left_expr),
-            Distribution::HashPartitioned(right_expr),
-        ]
-    }
-
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
-        vec![
-            Some(LexRequirement::from(self.left_sort_exprs.clone())),
-            Some(LexRequirement::from(self.right_sort_exprs.clone())),
-        ]
-    }
-
-    fn maintains_input_order(&self) -> Vec<bool> {
-        Self::maintains_input_order(self.join_type)
-    }
-
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![&self.left, &self.right]
-    }
-
-    fn with_new_children(
-        self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        match &children[..] {
-            [left, right] => Ok(Arc::new(SortMergeJoinExec::try_new(
-                Arc::clone(left),
-                Arc::clone(right),
-                self.on.clone(),
-                self.filter.clone(),
-                self.join_type,
-                self.sort_options.clone(),
-                self.null_equals_null,
-            )?)),
-            _ => internal_err!("SortMergeJoin wrong number of children"),
-        }
-    }
-
-    fn execute(
-        &self,
-        partition: usize,
-        context: Arc<TaskContext>,
-    ) -> Result<SendableRecordBatchStream> {
-        let left_partitions = self.left.output_partitioning().partition_count();
-        let right_partitions = self.right.output_partitioning().partition_count();
-        if left_partitions != right_partitions {
-            return internal_err!(
-                "Invalid SortMergeJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
-                 consider using RepartitionExec"
-            );
-        }
-        let (on_left, on_right) = self.on.iter().cloned().unzip();
-        let (streamed, buffered, on_streamed, on_buffered) =
-            if SortMergeJoinExec::probe_side(&self.join_type) == JoinSide::Left {
-                (
-                    Arc::clone(&self.left),
-                    Arc::clone(&self.right),
-                    on_left,
-                    on_right,
-                )
-            } else {
-                (
-                    Arc::clone(&self.right),
-                    Arc::clone(&self.left),
-                    on_right,
-                    on_left,
-                )
-            };
-
-        // execute children plans
-        let streamed = streamed.execute(partition, Arc::clone(&context))?;
-        let buffered = buffered.execute(partition, Arc::clone(&context))?;
-
-        // create output buffer
-        let batch_size = context.session_config().batch_size();
-
-        // create memory reservation
-        let reservation = MemoryConsumer::new(format!("SMJStream[{partition}]"))
-            .register(context.memory_pool());
-
-        // create join stream
-        Ok(Box::pin(SortMergeJoinStream::try_new(
-            Arc::clone(&self.schema),
-            self.sort_options.clone(),
-            self.null_equals_null,
-            streamed,
-            buffered,
-            on_streamed,
-            on_buffered,
-            self.filter.clone(),
-            self.join_type,
-            batch_size,
-            SortMergeJoinMetrics::new(partition, &self.metrics),
-            reservation,
-            context.runtime_env(),
-        )?))
-    }
-
-    fn metrics(&self) -> Option<MetricsSet> {
-        Some(self.metrics.clone_inner())
-    }
-
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
-        }
-        // TODO stats: it is not possible in general to know the output size of joins
-        // There are some special cases though, for example:
-        // - `A LEFT JOIN B ON A.col=B.col` with `COUNT_DISTINCT(B.col)=COUNT(B.col)`
-        estimate_join_statistics(
-            self.left.partition_statistics(None)?,
-            self.right.partition_statistics(None)?,
-            self.on.clone(),
-            &self.join_type,
-            &self.schema,
-        )
-    }
-
-    /// Tries to swap the projection with its input [`SortMergeJoinExec`]. If it can be done,
-    /// it returns the new swapped version having the [`SortMergeJoinExec`] as the top plan.
-    /// Otherwise, it returns None.
-    fn try_swapping_with_projection(
-        &self,
-        projection: &ProjectionExec,
-    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
-        // Convert projected PhysicalExpr's to columns. If not possible, we cannot proceed.
-        let Some(projection_as_columns) = physical_to_column_exprs(projection.expr())
-        else {
-            return Ok(None);
-        };
-
-        let (far_right_left_col_ind, far_left_right_col_ind) = join_table_borders(
-            self.left().schema().fields().len(),
-            &projection_as_columns,
-        );
-
-        if !join_allows_pushdown(
-            &projection_as_columns,
-            &self.schema(),
-            far_right_left_col_ind,
-            far_left_right_col_ind,
-        ) {
-            return Ok(None);
-        }
-
-        let Some(new_on) = update_join_on(
-            &projection_as_columns[0..=far_right_left_col_ind as _],
-            &projection_as_columns[far_left_right_col_ind as _..],
-            self.on(),
-            self.left().schema().fields().len(),
-        ) else {
-            return Ok(None);
-        };
-
-        let (new_left, new_right) = new_join_children(
-            &projection_as_columns,
-            far_right_left_col_ind,
-            far_left_right_col_ind,
-            self.children()[0],
-            self.children()[1],
-        )?;
-
-        Ok(Some(Arc::new(SortMergeJoinExec::try_new(
-            Arc::new(new_left),
-            Arc::new(new_right),
-            new_on,
-            self.filter.clone(),
-            self.join_type,
-            self.sort_options.clone(),
-            self.null_equals_null,
-        )?)))
-    }
-}
-
-/// Metrics for SortMergeJoinExec
-#[allow(dead_code)]
-struct SortMergeJoinMetrics {
-    /// Total time for joining probe-side batches to the build-side batches
-    join_time: metrics::Time,
-    /// Number of batches consumed by this operator
-    input_batches: Count,
-    /// Number of rows consumed by this operator
-    input_rows: Count,
-    /// Number of batches produced by this operator
-    output_batches: Count,
-    /// Number of rows produced by this operator
-    output_rows: Count,
-    /// Peak memory used for buffered data.
-    /// Calculated as sum of peak memory values across partitions
-    peak_mem_used: metrics::Gauge,
-    /// Metrics related to spilling
-    spill_metrics: SpillMetrics,
-}
-
-impl SortMergeJoinMetrics {
-    #[allow(dead_code)]
-    pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
-        let join_time = MetricBuilder::new(metrics).subset_time("join_time", partition);
-        let input_batches =
-            MetricBuilder::new(metrics).counter("input_batches", partition);
-        let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
-        let output_batches =
-            MetricBuilder::new(metrics).counter("output_batches", partition);
-        let output_rows = MetricBuilder::new(metrics).output_rows(partition);
-        let peak_mem_used = MetricBuilder::new(metrics).gauge("peak_mem_used", partition);
-        let spill_metrics = SpillMetrics::new(metrics, partition);
-
-        Self {
-            join_time,
-            input_batches,
-            input_rows,
-            output_batches,
-            output_rows,
-            peak_mem_used,
-            spill_metrics,
-        }
-    }
-}
-
-/// State of SMJ stream
-#[derive(Debug, PartialEq, Eq)]
-enum SortMergeJoinState {
-    /// Init joining with a new streamed row or a new buffered batches
-    Init,
-    /// Polling one streamed row or one buffered batch, or both
-    Polling,
-    /// Joining polled data and making output
-    JoinOutput,
-    /// No more output
-    Exhausted,
-}
-
-/// State of streamed data stream
-#[derive(Debug, PartialEq, Eq)]
-enum StreamedState {
-    /// Init polling
-    Init,
-    /// Polling one streamed row
-    Polling,
-    /// Ready to produce one streamed row
-    Ready,
-    /// No more streamed row
-    Exhausted,
-}
-
-/// State of buffered data stream
-#[derive(Debug, PartialEq, Eq)]
-enum BufferedState {
-    /// Init polling
-    Init,
-    /// Polling first row in the next batch
-    PollingFirst,
-    /// Polling rest rows in the next batch
-    PollingRest,
-    /// Ready to produce one batch
-    Ready,
-    /// No more buffered batches
-    Exhausted,
-}
-
-/// Represents a chunk of joined data from streamed and buffered side
-struct StreamedJoinedChunk {
-    /// Index of batch in buffered_data
-    buffered_batch_idx: Option<usize>,
-    /// Array builder for streamed indices
-    streamed_indices: UInt64Builder,
-    /// Array builder for buffered indices
-    /// This could contain nulls if the join is null-joined
-    buffered_indices: UInt64Builder,
-}
-
-/// Represents a record batch from streamed input.
-///
-/// Also stores information of matching rows from buffered batches.
-struct StreamedBatch {
-    /// The streamed record batch
-    pub batch: RecordBatch,
-    /// The index of row in the streamed batch to compare with buffered batches
-    pub idx: usize,
-    /// The join key arrays of streamed batch which are used to compare with buffered batches
-    /// and to produce output. They are produced by evaluating `on` expressions.
-    pub join_arrays: Vec<ArrayRef>,
-    /// Chunks of indices from buffered side (may be nulls) joined to streamed
-    pub output_indices: Vec<StreamedJoinedChunk>,
-    /// Index of currently scanned batch from buffered data
-    pub buffered_batch_idx: Option<usize>,
-    /// Indices that found a match for the given join filter
-    /// Used for semi joins to keep track the streaming index which got a join filter match
-    /// and already emitted to the output.
-    pub join_filter_matched_idxs: HashSet<u64>,
-}
-
-impl StreamedBatch {
-    fn new(batch: RecordBatch, on_column: &[Arc<dyn PhysicalExpr>]) -> Self {
-        let join_arrays = join_arrays(&batch, on_column);
-        StreamedBatch {
-            batch,
-            idx: 0,
-            join_arrays,
-            output_indices: vec![],
-            buffered_batch_idx: None,
-            join_filter_matched_idxs: HashSet::new(),
-        }
-    }
-
-    fn new_empty(schema: SchemaRef) -> Self {
-        StreamedBatch {
-            batch: RecordBatch::new_empty(schema),
-            idx: 0,
-            join_arrays: vec![],
-            output_indices: vec![],
-            buffered_batch_idx: None,
-            join_filter_matched_idxs: HashSet::new(),
-        }
-    }
-
-    /// Appends new pair consisting of current streamed index and `buffered_idx`
-    /// index of buffered batch with `buffered_batch_idx` index.
-    fn append_output_pair(
-        &mut self,
-        buffered_batch_idx: Option<usize>,
-        buffered_idx: Option<usize>,
-    ) {
-        // If no current chunk exists or current chunk is not for current buffered batch,
-        // create a new chunk
-        if self.output_indices.is_empty() || self.buffered_batch_idx != buffered_batch_idx
-        {
-            self.output_indices.push(StreamedJoinedChunk {
-                buffered_batch_idx,
-                streamed_indices: UInt64Builder::with_capacity(1),
-                buffered_indices: UInt64Builder::with_capacity(1),
-            });
-            self.buffered_batch_idx = buffered_batch_idx;
-        };
-        let current_chunk = self.output_indices.last_mut().unwrap();
-
-        // Append index of streamed batch and index of buffered batch into current chunk
-        current_chunk.streamed_indices.append_value(self.idx as u64);
-        if let Some(idx) = buffered_idx {
-            current_chunk.buffered_indices.append_value(idx as u64);
-        } else {
-            current_chunk.buffered_indices.append_null();
-        }
-    }
-}
-
-/// A buffered batch that contains contiguous rows with same join key
-#[derive(Debug)]
-struct BufferedBatch {
-    /// The buffered record batch
-    /// None if the batch spilled to disk th
-    pub batch: Option<RecordBatch>,
-    /// The range in which the rows share the same join key
-    pub range: Range<usize>,
-    /// Array refs of the join key
-    pub join_arrays: Vec<ArrayRef>,
-    /// Buffered joined index (null joining buffered)
-    pub null_joined: Vec<usize>,
-    /// Size estimation used for reserving / releasing memory
-    pub size_estimation: usize,
-    /// The indices of buffered batch that the join filter doesn't satisfy.
-    /// This is a map between right row index and a boolean value indicating whether all joined row
-    /// of the right row does not satisfy the filter .
-    /// When dequeuing the buffered batch, we need to produce null joined rows for these indices.
-    pub join_filter_not_matched_map: HashMap<u64, bool>,
-    /// Current buffered batch number of rows. Equal to batch.num_rows()
-    /// but if batch is spilled to disk this property is preferable
-    /// and less expensive
-    pub num_rows: usize,
-    /// An optional temp spill file name on the disk if the batch spilled
-    /// None by default
-    /// Some(fileName) if the batch spilled to the disk
-    pub spill_file: Option<RefCountedTempFile>,
-}
-
-impl BufferedBatch {
-    fn new(
-        batch: RecordBatch,
-        range: Range<usize>,
-        on_column: &[PhysicalExprRef],
-    ) -> Self {
-        let join_arrays = join_arrays(&batch, on_column);
-
-        // Estimation is calculated as
-        //   inner batch size
-        // + join keys size
-        // + worst case null_joined (as vector capacity * element size)
-        // + Range size
-        // + size of this estimation
-        let size_estimation = batch.get_array_memory_size()
-            + join_arrays
-                .iter()
-                .map(|arr| arr.get_array_memory_size())
-                .sum::<usize>()
-            + batch.num_rows().next_power_of_two() * size_of::<usize>()
-            + size_of::<Range<usize>>()
-            + size_of::<usize>();
-
-        let num_rows = batch.num_rows();
-        BufferedBatch {
-            batch: Some(batch),
-            range,
-            join_arrays,
-            null_joined: vec![],
-            size_estimation,
-            join_filter_not_matched_map: HashMap::new(),
-            num_rows,
-            spill_file: None,
-        }
-    }
-}
-
-/// Sort-Merge join stream that consumes streamed and buffered data streams
-/// and produces joined output stream.
-struct SortMergeJoinStream {
-    // ========================================================================
-    // PROPERTIES:
-    // These fields are initialized at the start and remain constant throughout
-    // the execution.
-    // ========================================================================
-    /// Output schema
-    pub schema: SchemaRef,
-    /// null == null?
-    pub null_equals_null: bool,
-    /// Sort options of join columns used to sort streamed and buffered data stream
-    pub sort_options: Vec<SortOptions>,
-    /// optional join filter
-    pub filter: Option<JoinFilter>,
-    /// How the join is performed
-    pub join_type: JoinType,
-    /// Target output batch size
-    pub batch_size: usize,
-
-    // ========================================================================
-    // STREAMED FIELDS:
-    // These fields manage the properties and state of the streamed input.
-    // ========================================================================
-    /// Input schema of streamed
-    pub streamed_schema: SchemaRef,
-    /// Streamed data stream
-    pub streamed: SendableRecordBatchStream,
-    /// Current processing record batch of streamed
-    pub streamed_batch: StreamedBatch,
-    /// (used in outer join) Is current streamed row joined at least once?
-    pub streamed_joined: bool,
-    /// State of streamed
-    pub streamed_state: StreamedState,
-    /// Join key columns of streamed
-    pub on_streamed: Vec<PhysicalExprRef>,
-
-    // ========================================================================
-    // BUFFERED FIELDS:
-    // These fields manage the properties and state of the buffered input.
-    // ========================================================================
-    /// Input schema of buffered
-    pub buffered_schema: SchemaRef,
-    /// Buffered data stream
-    pub buffered: SendableRecordBatchStream,
-    /// Current buffered data
-    pub buffered_data: BufferedData,
-    /// (used in outer join) Is current buffered batches joined at least once?
-    pub buffered_joined: bool,
-    /// State of buffered
-    pub buffered_state: BufferedState,
-    /// Join key columns of buffered
-    pub on_buffered: Vec<PhysicalExprRef>,
-
-    // ========================================================================
-    // MERGE JOIN STATES:
-    // These fields track the execution state of merge join and are updated
-    // during the execution.
-    // ========================================================================
-    /// Current state of the stream
-    pub state: SortMergeJoinState,
-    /// Staging output array builders
-    pub staging_output_record_batches: JoinedRecordBatches,
-    /// Output buffer. Currently used by filtering as it requires double buffering
-    /// to avoid small/empty batches. Non-filtered join outputs directly from `staging_output_record_batches.batches`
-    pub output: RecordBatch,
-    /// Staging output size, including output batches and staging joined results.
-    /// Increased when we put rows into buffer and decreased after we actually output batches.
-    /// Used to trigger output when sufficient rows are ready
-    pub output_size: usize,
-    /// The comparison result of current streamed row and buffered batches
-    pub current_ordering: Ordering,
-    /// Manages the process of spilling and reading back intermediate data
-    pub spill_manager: SpillManager,
-
-    // ========================================================================
-    // EXECUTION RESOURCES:
-    // Fields related to managing execution resources and monitoring performance.
-    // ========================================================================
-    /// Metrics
-    pub join_metrics: SortMergeJoinMetrics,
-    /// Memory reservation
-    pub reservation: MemoryReservation,
-    /// Runtime env
-    pub runtime_env: Arc<RuntimeEnv>,
-    /// A unique number for each batch
-    pub streamed_batch_counter: AtomicUsize,
-}
-
-/// Joined batches with attached join filter information
-struct JoinedRecordBatches {
-    /// Joined batches. Each batch is already joined columns from left and right sources
-    pub batches: Vec<RecordBatch>,
-    /// Filter match mask for each row(matched/non-matched)
-    pub filter_mask: BooleanBuilder,
-    /// Left row indices to glue together rows in `batches` and `filter_mask`
-    pub row_indices: UInt64Builder,
-    /// Which unique batch id the row belongs to
-    /// It is necessary to differentiate rows that are distributed the way when they point to the same
-    /// row index but in not the same batches
-    pub batch_ids: Vec<usize>,
-}
-
-impl JoinedRecordBatches {
-    fn clear(&mut self) {
-        self.batches.clear();
-        self.batch_ids.clear();
-        self.filter_mask = BooleanBuilder::new();
-        self.row_indices = UInt64Builder::new();
-    }
-}
-impl RecordBatchStream for SortMergeJoinStream {
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
-    }
-}
-
-/// True if next index refers to either:
-/// - another batch id
-/// - another row index within same batch id
-/// - end of row indices
-#[inline(always)]
-fn last_index_for_row(
-    row_index: usize,
-    indices: &UInt64Array,
-    batch_ids: &[usize],
-    indices_len: usize,
-) -> bool {
-    row_index == indices_len - 1
-        || batch_ids[row_index] != batch_ids[row_index + 1]
-        || indices.value(row_index) != indices.value(row_index + 1)
-}
-
-// Returns a corrected boolean bitmask for the given join type
-// Values in the corrected bitmask can be: true, false, null
-// `true` - the row found its match and sent to the output
-// `null` - the row ignored, no output
-// `false` - the row sent as NULL joined row
-fn get_corrected_filter_mask(
-    join_type: JoinType,
-    row_indices: &UInt64Array,
-    batch_ids: &[usize],
-    filter_mask: &BooleanArray,
-    expected_size: usize,
-) -> Option<BooleanArray> {
-    let row_indices_length = row_indices.len();
-    let mut corrected_mask: BooleanBuilder =
-        BooleanBuilder::with_capacity(row_indices_length);
-    let mut seen_true = false;
-
-    match join_type {
-        JoinType::Left | JoinType::Right => {
-            for i in 0..row_indices_length {
-                let last_index =
-                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
-                if filter_mask.value(i) {
-                    seen_true = true;
-                    corrected_mask.append_value(true);
-                } else if seen_true || !filter_mask.value(i) && !last_index {
-                    corrected_mask.append_null(); // to be ignored and not set to output
-                } else {
-                    corrected_mask.append_value(false); // to be converted to null joined row
-                }
-
-                if last_index {
-                    seen_true = false;
-                }
-            }
-
-            // Generate null joined rows for records which have no matching join key
-            corrected_mask.append_n(expected_size - corrected_mask.len(), false);
-            Some(corrected_mask.finish())
-        }
-        JoinType::LeftMark => {
-            for i in 0..row_indices_length {
-                let last_index =
-                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
-                if filter_mask.value(i) && !seen_true {
-                    seen_true = true;
-                    corrected_mask.append_value(true);
-                } else if seen_true || !filter_mask.value(i) && !last_index {
-                    corrected_mask.append_null(); // to be ignored and not set to output
-                } else {
-                    corrected_mask.append_value(false); // to be converted to null joined row
-                }
-
-                if last_index {
-                    seen_true = false;
-                }
-            }
-
-            // Generate null joined rows for records which have no matching join key
-            corrected_mask.append_n(expected_size - corrected_mask.len(), false);
-            Some(corrected_mask.finish())
-        }
-        JoinType::LeftSemi | JoinType::RightSemi => {
-            for i in 0..row_indices_length {
-                let last_index =
-                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
-                if filter_mask.value(i) && !seen_true {
-                    seen_true = true;
-                    corrected_mask.append_value(true);
-                } else {
-                    corrected_mask.append_null(); // to be ignored and not set to output
-                }
-
-                if last_index {
-                    seen_true = false;
-                }
-            }
-
-            Some(corrected_mask.finish())
-        }
-        JoinType::LeftAnti | JoinType::RightAnti => {
-            for i in 0..row_indices_length {
-                let last_index =
-                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
-
-                if filter_mask.value(i) {
-                    seen_true = true;
-                }
-
-                if last_index {
-                    if !seen_true {
-                        corrected_mask.append_value(true);
-                    } else {
-                        corrected_mask.append_null();
-                    }
-
-                    seen_true = false;
-                } else {
-                    corrected_mask.append_null();
-                }
-            }
-            // Generate null joined rows for records which have no matching join key,
-            // for LeftAnti non-matched considered as true
-            corrected_mask.append_n(expected_size - corrected_mask.len(), true);
-            Some(corrected_mask.finish())
-        }
-        JoinType::Full => {
-            let mut mask: Vec<Option<bool>> = vec![Some(true); row_indices_length];
-            let mut last_true_idx = 0;
-            let mut first_row_idx = 0;
-            let mut seen_false = false;
-
-            for i in 0..row_indices_length {
-                let last_index =
-                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
-                let val = filter_mask.value(i);
-                let is_null = filter_mask.is_null(i);
-
-                if val {
-                    // memoize the first seen matched row
-                    if !seen_true {
-                        last_true_idx = i;
-                    }
-                    seen_true = true;
-                }
-
-                if is_null || val {
-                    mask[i] = Some(true);
-                } else if !is_null && !val && (seen_true || seen_false) {
-                    mask[i] = None;
-                } else {
-                    mask[i] = Some(false);
-                }
-
-                if !is_null && !val {
-                    seen_false = true;
-                }
-
-                if last_index {
-                    // If the left row seen as true its needed to output it once
-                    // To do that we mark all other matches for same row as null to avoid the output
-                    if seen_true {
-                        #[allow(clippy::needless_range_loop)]
-                        for j in first_row_idx..last_true_idx {
-                            mask[j] = None;
-                        }
-                    }
-
-                    seen_true = false;
-                    seen_false = false;
-                    last_true_idx = 0;
-                    first_row_idx = i + 1;
-                }
-            }
-
-            Some(BooleanArray::from(mask))
-        }
-        // Only outer joins needs to keep track of processed rows and apply corrected filter mask
-        _ => None,
-    }
-}
-
-impl Stream for SortMergeJoinStream {
-    type Item = Result<RecordBatch>;
-
-    fn poll_next(
-        mut self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Self::Item>> {
-        let join_time = self.join_metrics.join_time.clone();
-        let _timer = join_time.timer();
-        loop {
-            match &self.state {
-                SortMergeJoinState::Init => {
-                    let streamed_exhausted =
-                        self.streamed_state == StreamedState::Exhausted;
-                    let buffered_exhausted =
-                        self.buffered_state == BufferedState::Exhausted;
-                    self.state = if streamed_exhausted && buffered_exhausted {
-                        SortMergeJoinState::Exhausted
-                    } else {
-                        match self.current_ordering {
-                            Ordering::Less | Ordering::Equal => {
-                                if !streamed_exhausted {
-                                    if self.filter.is_some()
-                                        && matches!(
-                                            self.join_type,
-                                            JoinType::Left
-                                                | JoinType::LeftSemi
-                                                | JoinType::LeftMark
-                                                | JoinType::Right
-                                                | JoinType::RightSemi
-                                                | JoinType::LeftAnti
-                                                | JoinType::RightAnti
-                                                | JoinType::Full
-                                        )
-                                    {
-                                        self.freeze_all()?;
-
-                                        // If join is filtered and there is joined tuples waiting
-                                        // to be filtered
-                                        if !self
-                                            .staging_output_record_batches
-                                            .batches
-                                            .is_empty()
-                                        {
-                                            // Apply filter on joined tuples and get filtered batch
-                                            let out_filtered_batch =
-                                                self.filter_joined_batch()?;
-
-                                            // Append filtered batch to the output buffer
-                                            self.output = concat_batches(
-                                                &self.schema(),
-                                                vec![&self.output, &out_filtered_batch],
-                                            )?;
-
-                                            // Send to output if the output buffer surpassed the `batch_size`
-                                            if self.output.num_rows() >= self.batch_size {
-                                                let record_batch = std::mem::replace(
-                                                    &mut self.output,
-                                                    RecordBatch::new_empty(
-                                                        out_filtered_batch.schema(),
-                                                    ),
-                                                );
-                                                return Poll::Ready(Some(Ok(
-                                                    record_batch,
-                                                )));
-                                            }
-                                        }
-                                    }
-
-                                    self.streamed_joined = false;
-                                    self.streamed_state = StreamedState::Init;
-                                }
-                            }
-                            Ordering::Greater => {
-                                if !buffered_exhausted {
-                                    self.buffered_joined = false;
-                                    self.buffered_state = BufferedState::Init;
-                                }
-                            }
-                        }
-                        SortMergeJoinState::Polling
-                    };
-                }
-                SortMergeJoinState::Polling => {
-                    if ![StreamedState::Exhausted, StreamedState::Ready]
-                        .contains(&self.streamed_state)
-                    {
-                        match self.poll_streamed_row(cx)? {
-                            Poll::Ready(_) => {}
-                            Poll::Pending => return Poll::Pending,
-                        }
-                    }
-
-                    if ![BufferedState::Exhausted, BufferedState::Ready]
-                        .contains(&self.buffered_state)
-                    {
-                        match self.poll_buffered_batches(cx)? {
-                            Poll::Ready(_) => {}
-                            Poll::Pending => return Poll::Pending,
-                        }
-                    }
-                    let streamed_exhausted =
-                        self.streamed_state == StreamedState::Exhausted;
-                    let buffered_exhausted =
-                        self.buffered_state == BufferedState::Exhausted;
-                    if streamed_exhausted && buffered_exhausted {
-                        self.state = SortMergeJoinState::Exhausted;
-                        continue;
-                    }
-                    self.current_ordering = self.compare_streamed_buffered()?;
-                    self.state = SortMergeJoinState::JoinOutput;
-                }
-                SortMergeJoinState::JoinOutput => {
-                    self.join_partial()?;
-
-                    if self.output_size < self.batch_size {
-                        if self.buffered_data.scanning_finished() {
-                            self.buffered_data.scanning_reset();
-                            self.state = SortMergeJoinState::Init;
-                        }
-                    } else {
-                        self.freeze_all()?;
-                        if !self.staging_output_record_batches.batches.is_empty() {
-                            let record_batch = self.output_record_batch_and_reset()?;
-                            // For non-filtered join output whenever the target output batch size
-                            // is hit. For filtered join its needed to output on later phase
-                            // because target output batch size can be hit in the middle of
-                            // filtering causing the filtering to be incomplete and causing
-                            // correctness issues
-                            if self.filter.is_some()
-                                && matches!(
-                                    self.join_type,
-                                    JoinType::Left
-                                        | JoinType::LeftSemi
-                                        | JoinType::Right
-                                        | JoinType::RightSemi
-                                        | JoinType::LeftAnti
-                                        | JoinType::RightAnti
-                                        | JoinType::LeftMark
-                                        | JoinType::Full
-                                )
-                            {
-                                continue;
-                            }
-
-                            return Poll::Ready(Some(Ok(record_batch)));
-                        }
-                        return Poll::Pending;
-                    }
-                }
-                SortMergeJoinState::Exhausted => {
-                    self.freeze_all()?;
-
-                    // if there is still something not processed
-                    if !self.staging_output_record_batches.batches.is_empty() {
-                        if self.filter.is_some()
-                            && matches!(
-                                self.join_type,
-                                JoinType::Left
-                                    | JoinType::LeftSemi
-                                    | JoinType::Right
-                                    | JoinType::RightSemi
-                                    | JoinType::LeftAnti
-                                    | JoinType::RightAnti
-                                    | JoinType::Full
-                                    | JoinType::LeftMark
-                            )
-                        {
-                            let record_batch = self.filter_joined_batch()?;
-                            return Poll::Ready(Some(Ok(record_batch)));
-                        } else {
-                            let record_batch = self.output_record_batch_and_reset()?;
-                            return Poll::Ready(Some(Ok(record_batch)));
-                        }
-                    } else if self.output.num_rows() > 0 {
-                        // if processed but still not outputted because it didn't hit batch size before
-                        let schema = self.output.schema();
-                        let record_batch = std::mem::replace(
-                            &mut self.output,
-                            RecordBatch::new_empty(schema),
-                        );
-                        return Poll::Ready(Some(Ok(record_batch)));
-                    } else {
-                        return Poll::Ready(None);
-                    }
-                }
-            }
-        }
-    }
-}
-
-impl SortMergeJoinStream {
-    #[allow(clippy::too_many_arguments)]
-    pub fn try_new(
-        schema: SchemaRef,
-        sort_options: Vec<SortOptions>,
-        null_equals_null: bool,
-        streamed: SendableRecordBatchStream,
-        buffered: SendableRecordBatchStream,
-        on_streamed: Vec<Arc<dyn PhysicalExpr>>,
-        on_buffered: Vec<Arc<dyn PhysicalExpr>>,
-        filter: Option<JoinFilter>,
-        join_type: JoinType,
-        batch_size: usize,
-        join_metrics: SortMergeJoinMetrics,
-        reservation: MemoryReservation,
-        runtime_env: Arc<RuntimeEnv>,
-    ) -> Result<Self> {
-        let streamed_schema = streamed.schema();
-        let buffered_schema = buffered.schema();
-        let spill_manager = SpillManager::new(
-            Arc::clone(&runtime_env),
-            join_metrics.spill_metrics.clone(),
-            Arc::clone(&buffered_schema),
-        );
-        Ok(Self {
-            state: SortMergeJoinState::Init,
-            sort_options,
-            null_equals_null,
-            schema: Arc::clone(&schema),
-            streamed_schema: Arc::clone(&streamed_schema),
-            buffered_schema,
-            streamed,
-            buffered,
-            streamed_batch: StreamedBatch::new_empty(streamed_schema),
-            buffered_data: BufferedData::default(),
-            streamed_joined: false,
-            buffered_joined: false,
-            streamed_state: StreamedState::Init,
-            buffered_state: BufferedState::Init,
-            current_ordering: Ordering::Equal,
-            on_streamed,
-            on_buffered,
-            filter,
-            staging_output_record_batches: JoinedRecordBatches {
-                batches: vec![],
-                filter_mask: BooleanBuilder::new(),
-                row_indices: UInt64Builder::new(),
-                batch_ids: vec![],
-            },
-            output: RecordBatch::new_empty(schema),
-            output_size: 0,
-            batch_size,
-            join_type,
-            join_metrics,
-            reservation,
-            runtime_env,
-            spill_manager,
-            streamed_batch_counter: AtomicUsize::new(0),
-        })
-    }
-
-    /// Poll next streamed row
-    fn poll_streamed_row(&mut self, cx: &mut Context) -> Poll<Option<Result<()>>> {
-        loop {
-            match &self.streamed_state {
-                StreamedState::Init => {
-                    if self.streamed_batch.idx + 1 < self.streamed_batch.batch.num_rows()
-                    {
-                        self.streamed_batch.idx += 1;
-                        self.streamed_state = StreamedState::Ready;
-                        return Poll::Ready(Some(Ok(())));
-                    } else {
-                        self.streamed_state = StreamedState::Polling;
-                    }
-                }
-                StreamedState::Polling => match self.streamed.poll_next_unpin(cx)? {
-                    Poll::Pending => {
-                        return Poll::Pending;
-                    }
-                    Poll::Ready(None) => {
-                        self.streamed_state = StreamedState::Exhausted;
-                    }
-                    Poll::Ready(Some(batch)) => {
-                        if batch.num_rows() > 0 {
-                            self.freeze_streamed()?;
-                            self.join_metrics.input_batches.add(1);
-                            self.join_metrics.input_rows.add(batch.num_rows());
-                            self.streamed_batch =
-                                StreamedBatch::new(batch, &self.on_streamed);
-                            // Every incoming streaming batch should have its unique id
-                            // Check `JoinedRecordBatches.self.streamed_batch_counter` documentation
-                            self.streamed_batch_counter
-                                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-                            self.streamed_state = StreamedState::Ready;
-                        }
-                    }
-                },
-                StreamedState::Ready => {
-                    return Poll::Ready(Some(Ok(())));
-                }
-                StreamedState::Exhausted => {
-                    return Poll::Ready(None);
-                }
-            }
-        }
-    }
-
-    fn free_reservation(&mut self, buffered_batch: BufferedBatch) -> Result<()> {
-        // Shrink memory usage for in-memory batches only
-        if buffered_batch.spill_file.is_none() && buffered_batch.batch.is_some() {
-            self.reservation
-                .try_shrink(buffered_batch.size_estimation)?;
-        }
-
-        Ok(())
-    }
-
-    fn allocate_reservation(&mut self, mut buffered_batch: BufferedBatch) -> Result<()> {
-        match self.reservation.try_grow(buffered_batch.size_estimation) {
-            Ok(_) => {
-                self.join_metrics
-                    .peak_mem_used
-                    .set_max(self.reservation.size());
-                Ok(())
-            }
-            Err(_) if self.runtime_env.disk_manager.tmp_files_enabled() => {
-                // Spill buffered batch to disk
-                if let Some(batch) = buffered_batch.batch {
-                    let spill_file = self
-                        .spill_manager
-                        .spill_record_batch_and_finish(
-                            &[batch],
-                            "sort_merge_join_buffered_spill",
-                        )?
-                        .unwrap(); // Operation only return None if no batches are spilled, here we ensure that at least one batch is spilled
-
-                    buffered_batch.spill_file = Some(spill_file);
-                    buffered_batch.batch = None;
-
-                    Ok(())
-                } else {
-                    internal_err!("Buffered batch has empty body")
-                }
-            }
-            Err(e) => exec_err!("{}. Disk spilling disabled.", e.message()),
-        }?;
-
-        self.buffered_data.batches.push_back(buffered_batch);
-        Ok(())
-    }
-
-    /// Poll next buffered batches
-    fn poll_buffered_batches(&mut self, cx: &mut Context) -> Poll<Option<Result<()>>> {
-        loop {
-            match &self.buffered_state {
-                BufferedState::Init => {
-                    // pop previous buffered batches
-                    while !self.buffered_data.batches.is_empty() {
-                        let head_batch = self.buffered_data.head_batch();
-                        // If the head batch is fully processed, dequeue it and produce output of it.
-                        if head_batch.range.end == head_batch.num_rows {
-                            self.freeze_dequeuing_buffered()?;
-                            if let Some(mut buffered_batch) =
-                                self.buffered_data.batches.pop_front()
-                            {
-                                self.produce_buffered_not_matched(&mut buffered_batch)?;
-                                self.free_reservation(buffered_batch)?;
-                            }
-                        } else {
-                            // If the head batch is not fully processed, break the loop.
-                            // Streamed batch will be joined with the head batch in the next step.
-                            break;
-                        }
-                    }
-                    if self.buffered_data.batches.is_empty() {
-                        self.buffered_state = BufferedState::PollingFirst;
-                    } else {
-                        let tail_batch = self.buffered_data.tail_batch_mut();
-                        tail_batch.range.start = tail_batch.range.end;
-                        tail_batch.range.end += 1;
-                        self.buffered_state = BufferedState::PollingRest;
-                    }
-                }
-                BufferedState::PollingFirst => match self.buffered.poll_next_unpin(cx)? {
-                    Poll::Pending => {
-                        return Poll::Pending;
-                    }
-                    Poll::Ready(None) => {
-                        self.buffered_state = BufferedState::Exhausted;
-                        return Poll::Ready(None);
-                    }
-                    Poll::Ready(Some(batch)) => {
-                        self.join_metrics.input_batches.add(1);
-                        self.join_metrics.input_rows.add(batch.num_rows());
-
-                        if batch.num_rows() > 0 {
-                            let buffered_batch =
-                                BufferedBatch::new(batch, 0..1, &self.on_buffered);
-
-                            self.allocate_reservation(buffered_batch)?;
-                            self.buffered_state = BufferedState::PollingRest;
-                        }
-                    }
-                },
-                BufferedState::PollingRest => {
-                    if self.buffered_data.tail_batch().range.end
-                        < self.buffered_data.tail_batch().num_rows
-                    {
-                        while self.buffered_data.tail_batch().range.end
-                            < self.buffered_data.tail_batch().num_rows
-                        {
-                            if is_join_arrays_equal(
-                                &self.buffered_data.head_batch().join_arrays,
-                                self.buffered_data.head_batch().range.start,
-                                &self.buffered_data.tail_batch().join_arrays,
-                                self.buffered_data.tail_batch().range.end,
-                            )? {
-                                self.buffered_data.tail_batch_mut().range.end += 1;
-                            } else {
-                                self.buffered_state = BufferedState::Ready;
-                                return Poll::Ready(Some(Ok(())));
-                            }
-                        }
-                    } else {
-                        match self.buffered.poll_next_unpin(cx)? {
-                            Poll::Pending => {
-                                return Poll::Pending;
-                            }
-                            Poll::Ready(None) => {
-                                self.buffered_state = BufferedState::Ready;
-                            }
-                            Poll::Ready(Some(batch)) => {
-                                // Polling batches coming concurrently as multiple partitions
-                                self.join_metrics.input_batches.add(1);
-                                self.join_metrics.input_rows.add(batch.num_rows());
-                                if batch.num_rows() > 0 {
-                                    let buffered_batch = BufferedBatch::new(
-                                        batch,
-                                        0..0,
-                                        &self.on_buffered,
-                                    );
-                                    self.allocate_reservation(buffered_batch)?;
-                                }
-                            }
-                        }
-                    }
-                }
-                BufferedState::Ready => {
-                    return Poll::Ready(Some(Ok(())));
-                }
-                BufferedState::Exhausted => {
-                    return Poll::Ready(None);
-                }
-            }
-        }
-    }
-
-    /// Get comparison result of streamed row and buffered batches
-    fn compare_streamed_buffered(&self) -> Result<Ordering> {
-        if self.streamed_state == StreamedState::Exhausted {
-            return Ok(Ordering::Greater);
-        }
-        if !self.buffered_data.has_buffered_rows() {
-            return Ok(Ordering::Less);
-        }
-
-        compare_join_arrays(
-            &self.streamed_batch.join_arrays,
-            self.streamed_batch.idx,
-            &self.buffered_data.head_batch().join_arrays,
-            self.buffered_data.head_batch().range.start,
-            &self.sort_options,
-            self.null_equals_null,
-        )
-    }
-
-    /// Produce join and fill output buffer until reaching target batch size
-    /// or the join is finished
-    fn join_partial(&mut self) -> Result<()> {
-        // Whether to join streamed rows
-        let mut join_streamed = false;
-        // Whether to join buffered rows
-        let mut join_buffered = false;
-        // For Mark join we store a dummy id to indicate the the row has a match
-        let mut mark_row_as_match = false;
-
-        // determine whether we need to join streamed/buffered rows
-        match self.current_ordering {
-            Ordering::Less => {
-                if matches!(
-                    self.join_type,
-                    JoinType::Left
-                        | JoinType::Right
-                        | JoinType::Full
-                        | JoinType::LeftAnti
-                        | JoinType::RightAnti
-                        | JoinType::LeftMark
-                ) {
-                    join_streamed = !self.streamed_joined;
-                }
-            }
-            Ordering::Equal => {
-                if matches!(
-                    self.join_type,
-                    JoinType::LeftSemi | JoinType::LeftMark | JoinType::RightSemi
-                ) {
-                    mark_row_as_match = matches!(self.join_type, JoinType::LeftMark);
-                    // if the join filter is specified then its needed to output the streamed index
-                    // only if it has not been emitted before
-                    // the `join_filter_matched_idxs` keeps track on if streamed index has a successful
-                    // filter match and prevents the same index to go into output more than once
-                    if self.filter.is_some() {
-                        join_streamed = !self
-                            .streamed_batch
-                            .join_filter_matched_idxs
-                            .contains(&(self.streamed_batch.idx as u64))
-                            && !self.streamed_joined;
-                        // if the join filter specified there can be references to buffered columns
-                        // so buffered columns are needed to access them
-                        join_buffered = join_streamed;
-                    } else {
-                        join_streamed = !self.streamed_joined;
-                    }
-                }
-                if matches!(
-                    self.join_type,
-                    JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full
-                ) {
-                    join_streamed = true;
-                    join_buffered = true;
-                };
-
-                if matches!(self.join_type, JoinType::LeftAnti | JoinType::RightAnti)
-                    && self.filter.is_some()
-                {
-                    join_streamed = !self.streamed_joined;
-                    join_buffered = join_streamed;
-                }
-            }
-            Ordering::Greater => {
-                if matches!(self.join_type, JoinType::Full) {
-                    join_buffered = !self.buffered_joined;
-                };
-            }
-        }
-        if !join_streamed && !join_buffered {
-            // no joined data
-            self.buffered_data.scanning_finish();
-            return Ok(());
-        }
-
-        if join_buffered {
-            // joining streamed/nulls and buffered
-            while !self.buffered_data.scanning_finished()
-                && self.output_size < self.batch_size
-            {
-                let scanning_idx = self.buffered_data.scanning_idx();
-                if join_streamed {
-                    // Join streamed row and buffered row
-                    self.streamed_batch.append_output_pair(
-                        Some(self.buffered_data.scanning_batch_idx),
-                        Some(scanning_idx),
-                    );
-                } else {
-                    // Join nulls and buffered row for FULL join
-                    self.buffered_data
-                        .scanning_batch_mut()
-                        .null_joined
-                        .push(scanning_idx);
-                }
-                self.output_size += 1;
-                self.buffered_data.scanning_advance();
-
-                if self.buffered_data.scanning_finished() {
-                    self.streamed_joined = join_streamed;
-                    self.buffered_joined = true;
-                }
-            }
-        } else {
-            // joining streamed and nulls
-            let scanning_batch_idx = if self.buffered_data.scanning_finished() {
-                None
-            } else {
-                Some(self.buffered_data.scanning_batch_idx)
-            };
-            // For Mark join we store a dummy id to indicate the the row has a match
-            let scanning_idx = mark_row_as_match.then_some(0);
-
-            self.streamed_batch
-                .append_output_pair(scanning_batch_idx, scanning_idx);
-            self.output_size += 1;
-            self.buffered_data.scanning_finish();
-            self.streamed_joined = true;
-        }
-        Ok(())
-    }
-
-    fn freeze_all(&mut self) -> Result<()> {
-        self.freeze_buffered(self.buffered_data.batches.len())?;
-        self.freeze_streamed()?;
-        Ok(())
-    }
-
-    // Produces and stages record batches to ensure dequeued buffered batch
-    // no longer needed:
-    //   1. freezes all indices joined to streamed side
-    //   2. freezes NULLs joined to dequeued buffered batch to "release" it
-    fn freeze_dequeuing_buffered(&mut self) -> Result<()> {
-        self.freeze_streamed()?;
-        // Only freeze and produce the first batch in buffered_data as the batch is fully processed
-        self.freeze_buffered(1)?;
-        Ok(())
-    }
-
-    // Produces and stages record batch from buffered indices with corresponding
-    // NULLs on streamed side.
-    //
-    // Applicable only in case of Full join.
-    //
-    fn freeze_buffered(&mut self, batch_count: usize) -> Result<()> {
-        if !matches!(self.join_type, JoinType::Full) {
-            return Ok(());
-        }
-        for buffered_batch in self.buffered_data.batches.range_mut(..batch_count) {
-            let buffered_indices = UInt64Array::from_iter_values(
-                buffered_batch.null_joined.iter().map(|&index| index as u64),
-            );
-            if let Some(record_batch) = produce_buffered_null_batch(
-                &self.schema,
-                &self.streamed_schema,
-                &buffered_indices,
-                buffered_batch,
-            )? {
-                let num_rows = record_batch.num_rows();
-                self.staging_output_record_batches
-                    .filter_mask
-                    .append_nulls(num_rows);
-                self.staging_output_record_batches
-                    .row_indices
-                    .append_nulls(num_rows);
-                self.staging_output_record_batches.batch_ids.resize(
-                    self.staging_output_record_batches.batch_ids.len() + num_rows,
-                    0,
-                );
-
-                self.staging_output_record_batches
-                    .batches
-                    .push(record_batch);
-            }
-            buffered_batch.null_joined.clear();
-        }
-        Ok(())
-    }
-
-    fn produce_buffered_not_matched(
-        &mut self,
-        buffered_batch: &mut BufferedBatch,
-    ) -> Result<()> {
-        if !matches!(self.join_type, JoinType::Full) {
-            return Ok(());
-        }
-
-        // For buffered row which is joined with streamed side rows but all joined rows
-        // don't satisfy the join filter
-        let not_matched_buffered_indices = buffered_batch
-            .join_filter_not_matched_map
-            .iter()
-            .filter_map(|(idx, failed)| if *failed { Some(*idx) } else { None })
-            .collect::<Vec<_>>();
-
-        let buffered_indices =
-            UInt64Array::from_iter_values(not_matched_buffered_indices.iter().copied());
-
-        if let Some(record_batch) = produce_buffered_null_batch(
-            &self.schema,
-            &self.streamed_schema,
-            &buffered_indices,
-            buffered_batch,
-        )? {
-            let num_rows = record_batch.num_rows();
-
-            self.staging_output_record_batches
-                .filter_mask
-                .append_nulls(num_rows);
-            self.staging_output_record_batches
-                .row_indices
-                .append_nulls(num_rows);
-            self.staging_output_record_batches.batch_ids.resize(
-                self.staging_output_record_batches.batch_ids.len() + num_rows,
-                0,
-            );
-            self.staging_output_record_batches
-                .batches
-                .push(record_batch);
-        }
-        buffered_batch.join_filter_not_matched_map.clear();
-
-        Ok(())
-    }
-
-    // Produces and stages record batch for all output indices found
-    // for current streamed batch and clears staged output indices.
-    fn freeze_streamed(&mut self) -> Result<()> {
-        for chunk in self.streamed_batch.output_indices.iter_mut() {
-            // The row indices of joined streamed batch
-            let left_indices = chunk.streamed_indices.finish();
-
-            if left_indices.is_empty() {
-                continue;
-            }
-
-            let mut left_columns = self
-                .streamed_batch
-                .batch
-                .columns()
-                .iter()
-                .map(|column| take(column, &left_indices, None))
-                .collect::<Result<Vec<_>, ArrowError>>()?;
-
-            // The row indices of joined buffered batch
-            let right_indices: UInt64Array = chunk.buffered_indices.finish();
-            let mut right_columns = if matches!(self.join_type, JoinType::LeftMark) {
-                vec![Arc::new(is_not_null(&right_indices)?) as ArrayRef]
-            } else if matches!(
-                self.join_type,
-                JoinType::LeftSemi
-                    | JoinType::LeftAnti
-                    | JoinType::RightAnti
-                    | JoinType::RightSemi
-            ) {
-                vec![]
-            } else if let Some(buffered_idx) = chunk.buffered_batch_idx {
-                fetch_right_columns_by_idxs(
-                    &self.buffered_data,
-                    buffered_idx,
-                    &right_indices,
-                )?
-            } else {
-                // If buffered batch none, meaning it is null joined batch.
-                // We need to create null arrays for buffered columns to join with streamed rows.
-                create_unmatched_columns(
-                    self.join_type,
-                    &self.buffered_schema,
-                    right_indices.len(),
-                )
-            };
-
-            // Prepare the columns we apply join filter on later.
-            // Only for joined rows between streamed and buffered.
-            let filter_columns = if chunk.buffered_batch_idx.is_some() {
-                if !matches!(self.join_type, JoinType::Right) {
-                    if matches!(
-                        self.join_type,
-                        JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark
-                    ) {
-                        let right_cols = fetch_right_columns_by_idxs(
-                            &self.buffered_data,
-                            chunk.buffered_batch_idx.unwrap(),
-                            &right_indices,
-                        )?;
-
-                        get_filter_column(&self.filter, &left_columns, &right_cols)
-                    } else if matches!(
-                        self.join_type,
-                        JoinType::RightAnti | JoinType::RightSemi
-                    ) {
-                        let right_cols = fetch_right_columns_by_idxs(
-                            &self.buffered_data,
-                            chunk.buffered_batch_idx.unwrap(),
-                            &right_indices,
-                        )?;
-
-                        get_filter_column(&self.filter, &right_cols, &left_columns)
-                    } else {
-                        get_filter_column(&self.filter, &left_columns, &right_columns)
-                    }
-                } else {
-                    get_filter_column(&self.filter, &right_columns, &left_columns)
-                }
-            } else {
-                // This chunk is totally for null joined rows (outer join), we don't need to apply join filter.
-                // Any join filter applied only on either streamed or buffered side will be pushed already.
-                vec![]
-            };
-
-            let columns = if !matches!(self.join_type, JoinType::Right) {
-                left_columns.extend(right_columns);
-                left_columns
-            } else {
-                right_columns.extend(left_columns);
-                right_columns
-            };
-
-            let output_batch = RecordBatch::try_new(Arc::clone(&self.schema), columns)?;
-            // Apply join filter if any
-            if !filter_columns.is_empty() {
-                if let Some(f) = &self.filter {
-                    // Construct batch with only filter columns
-                    let filter_batch =
-                        RecordBatch::try_new(Arc::clone(f.schema()), filter_columns)?;
-
-                    let filter_result = f
-                        .expression()
-                        .evaluate(&filter_batch)?
-                        .into_array(filter_batch.num_rows())?;
-
-                    // The boolean selection mask of the join filter result
-                    let pre_mask =
-                        datafusion_common::cast::as_boolean_array(&filter_result)?;
-
-                    // If there are nulls in join filter result, exclude them from selecting
-                    // the rows to output.
-                    let mask = if pre_mask.null_count() > 0 {
-                        compute::prep_null_mask_filter(
-                            datafusion_common::cast::as_boolean_array(&filter_result)?,
-                        )
-                    } else {
-                        pre_mask.clone()
-                    };
-
-                    // Push the filtered batch which contains rows passing join filter to the output
-                    if matches!(
-                        self.join_type,
-                        JoinType::Left
-                            | JoinType::LeftSemi
-                            | JoinType::Right
-                            | JoinType::RightSemi
-                            | JoinType::LeftAnti
-                            | JoinType::RightAnti
-                            | JoinType::LeftMark
-                            | JoinType::Full
-                    ) {
-                        self.staging_output_record_batches
-                            .batches
-                            .push(output_batch);
-                    } else {
-                        let filtered_batch = filter_record_batch(&output_batch, &mask)?;
-                        self.staging_output_record_batches
-                            .batches
-                            .push(filtered_batch);
-                    }
-
-                    if !matches!(self.join_type, JoinType::Full) {
-                        self.staging_output_record_batches.filter_mask.extend(&mask);
-                    } else {
-                        self.staging_output_record_batches
-                            .filter_mask
-                            .extend(pre_mask);
-                    }
-                    self.staging_output_record_batches
-                        .row_indices
-                        .extend(&left_indices);
-                    self.staging_output_record_batches.batch_ids.resize(
-                        self.staging_output_record_batches.batch_ids.len()
-                            + left_indices.len(),
-                        self.streamed_batch_counter.load(Relaxed),
-                    );
-
-                    // For outer joins, we need to push the null joined rows to the output if
-                    // all joined rows are failed on the join filter.
-                    // I.e., if all rows joined from a streamed row are failed with the join filter,
-                    // we need to join it with nulls as buffered side.
-                    if matches!(self.join_type, JoinType::Full) {
-                        let buffered_batch = &mut self.buffered_data.batches
-                            [chunk.buffered_batch_idx.unwrap()];
-
-                        for i in 0..pre_mask.len() {
-                            // If the buffered row is not joined with streamed side,
-                            // skip it.
-                            if right_indices.is_null(i) {
-                                continue;
-                            }
-
-                            let buffered_index = right_indices.value(i);
-
-                            buffered_batch.join_filter_not_matched_map.insert(
-                                buffered_index,
-                                *buffered_batch
-                                    .join_filter_not_matched_map
-                                    .get(&buffered_index)
-                                    .unwrap_or(&true)
-                                    && !pre_mask.value(i),
-                            );
-                        }
-                    }
-                } else {
-                    self.staging_output_record_batches
-                        .batches
-                        .push(output_batch);
-                }
-            } else {
-                self.staging_output_record_batches
-                    .batches
-                    .push(output_batch);
-            }
-        }
-
-        self.streamed_batch.output_indices.clear();
-
-        Ok(())
-    }
-
-    fn output_record_batch_and_reset(&mut self) -> Result<RecordBatch> {
-        let record_batch =
-            concat_batches(&self.schema, &self.staging_output_record_batches.batches)?;
-        self.join_metrics.output_batches.add(1);
-        self.join_metrics.output_rows.add(record_batch.num_rows());
-        // If join filter exists, `self.output_size` is not accurate as we don't know the exact
-        // number of rows in the output record batch. If streamed row joined with buffered rows,
-        // once join filter is applied, the number of output rows may be more than 1.
-        // If `record_batch` is empty, we should reset `self.output_size` to 0. It could be happened
-        // when the join filter is applied and all rows are filtered out.
-        if record_batch.num_rows() == 0 || record_batch.num_rows() > self.output_size {
-            self.output_size = 0;
-        } else {
-            self.output_size -= record_batch.num_rows();
-        }
-
-        if !(self.filter.is_some()
-            && matches!(
-                self.join_type,
-                JoinType::Left
-                    | JoinType::LeftSemi
-                    | JoinType::Right
-                    | JoinType::RightSemi
-                    | JoinType::LeftAnti
-                    | JoinType::RightAnti
-                    | JoinType::LeftMark
-                    | JoinType::Full
-            ))
-        {
-            self.staging_output_record_batches.batches.clear();
-        }
-        Ok(record_batch)
-    }
-
-    fn filter_joined_batch(&mut self) -> Result<RecordBatch> {
-        let record_batch =
-            concat_batches(&self.schema, &self.staging_output_record_batches.batches)?;
-        let mut out_indices = self.staging_output_record_batches.row_indices.finish();
-        let mut out_mask = self.staging_output_record_batches.filter_mask.finish();
-        let mut batch_ids = &self.staging_output_record_batches.batch_ids;
-        let default_batch_ids = vec![0; record_batch.num_rows()];
-
-        // If only nulls come in and indices sizes doesn't match with expected record batch count
-        // generate missing indices
-        // Happens for null joined batches for Full Join
-        if out_indices.null_count() == out_indices.len()
-            && out_indices.len() != record_batch.num_rows()
-        {
-            out_mask = BooleanArray::from(vec![None; record_batch.num_rows()]);
-            out_indices = UInt64Array::from(vec![None; record_batch.num_rows()]);
-            batch_ids = &default_batch_ids;
-        }
-
-        if out_mask.is_empty() {
-            self.staging_output_record_batches.batches.clear();
-            return Ok(record_batch);
-        }
-
-        let maybe_corrected_mask = get_corrected_filter_mask(
-            self.join_type,
-            &out_indices,
-            batch_ids,
-            &out_mask,
-            record_batch.num_rows(),
-        );
-
-        let corrected_mask = if let Some(ref filtered_join_mask) = maybe_corrected_mask {
-            filtered_join_mask
-        } else {
-            &out_mask
-        };
-
-        self.filter_record_batch_by_join_type(record_batch, corrected_mask)
-    }
-
-    fn filter_record_batch_by_join_type(
-        &mut self,
-        record_batch: RecordBatch,
-        corrected_mask: &BooleanArray,
-    ) -> Result<RecordBatch> {
-        let mut filtered_record_batch =
-            filter_record_batch(&record_batch, corrected_mask)?;
-        let left_columns_length = self.streamed_schema.fields.len();
-        let right_columns_length = self.buffered_schema.fields.len();
-
-        if matches!(
-            self.join_type,
-            JoinType::Left | JoinType::LeftMark | JoinType::Right
-        ) {
-            let null_mask = compute::not(corrected_mask)?;
-            let null_joined_batch = filter_record_batch(&record_batch, &null_mask)?;
-
-            let mut right_columns = create_unmatched_columns(
-                self.join_type,
-                &self.buffered_schema,
-                null_joined_batch.num_rows(),
-            );
-
-            let columns = if !matches!(self.join_type, JoinType::Right) {
-                let mut left_columns = null_joined_batch
-                    .columns()
-                    .iter()
-                    .take(right_columns_length)
-                    .cloned()
-                    .collect::<Vec<_>>();
-
-                left_columns.extend(right_columns);
-                left_columns
-            } else {
-                let left_columns = null_joined_batch
-                    .columns()
-                    .iter()
-                    .skip(left_columns_length)
-                    .cloned()
-                    .collect::<Vec<_>>();
-
-                right_columns.extend(left_columns);
-                right_columns
-            };
-
-            // Push the streamed/buffered batch joined nulls to the output
-            let null_joined_streamed_batch =
-                RecordBatch::try_new(Arc::clone(&self.schema), columns)?;
-
-            filtered_record_batch = concat_batches(
-                &self.schema,
-                &[filtered_record_batch, null_joined_streamed_batch],
-            )?;
-        } else if matches!(self.join_type, JoinType::LeftSemi | JoinType::LeftAnti) {
-            let output_column_indices = (0..left_columns_length).collect::<Vec<_>>();
-            filtered_record_batch =
-                filtered_record_batch.project(&output_column_indices)?;
-        } else if matches!(self.join_type, JoinType::RightAnti | JoinType::RightSemi) {
-            let output_column_indices = (0..right_columns_length).collect::<Vec<_>>();
-            filtered_record_batch =
-                filtered_record_batch.project(&output_column_indices)?;
-        } else if matches!(self.join_type, JoinType::Full)
-            && corrected_mask.false_count() > 0
-        {
-            // Find rows which joined by key but Filter predicate evaluated as false
-            let joined_filter_not_matched_mask = compute::not(corrected_mask)?;
-            let joined_filter_not_matched_batch =
-                filter_record_batch(&record_batch, &joined_filter_not_matched_mask)?;
-
-            // Add left unmatched rows adding the right side as nulls
-            let right_null_columns = self
-                .buffered_schema
-                .fields()
-                .iter()
-                .map(|f| {
-                    new_null_array(
-                        f.data_type(),
-                        joined_filter_not_matched_batch.num_rows(),
-                    )
-                })
-                .collect::<Vec<_>>();
-
-            let mut result_joined = joined_filter_not_matched_batch
-                .columns()
-                .iter()
-                .take(left_columns_length)
-                .cloned()
-                .collect::<Vec<_>>();
-
-            result_joined.extend(right_null_columns);
-
-            let left_null_joined_batch =
-                RecordBatch::try_new(Arc::clone(&self.schema), result_joined)?;
-
-            // Add right unmatched rows adding the left side as nulls
-            let mut result_joined = self
-                .streamed_schema
-                .fields()
-                .iter()
-                .map(|f| {
-                    new_null_array(
-                        f.data_type(),
-                        joined_filter_not_matched_batch.num_rows(),
-                    )
-                })
-                .collect::<Vec<_>>();
-
-            let right_data = joined_filter_not_matched_batch
-                .columns()
-                .iter()
-                .skip(left_columns_length)
-                .cloned()
-                .collect::<Vec<_>>();
-
-            result_joined.extend(right_data);
-
-            filtered_record_batch = concat_batches(
-                &self.schema,
-                &[filtered_record_batch, left_null_joined_batch],
-            )?;
-        }
-
-        self.staging_output_record_batches.clear();
-
-        Ok(filtered_record_batch)
-    }
-}
-
-fn create_unmatched_columns(
-    join_type: JoinType,
-    schema: &SchemaRef,
-    size: usize,
-) -> Vec<ArrayRef> {
-    if matches!(join_type, JoinType::LeftMark) {
-        vec![Arc::new(BooleanArray::from(vec![false; size])) as ArrayRef]
-    } else {
-        schema
-            .fields()
-            .iter()
-            .map(|f| new_null_array(f.data_type(), size))
-            .collect::<Vec<_>>()
-    }
-}
-
-/// Gets the arrays which join filters are applied on.
-fn get_filter_column(
-    join_filter: &Option<JoinFilter>,
-    streamed_columns: &[ArrayRef],
-    buffered_columns: &[ArrayRef],
-) -> Vec<ArrayRef> {
-    let mut filter_columns = vec![];
-
-    if let Some(f) = join_filter {
-        let left_columns = f
-            .column_indices()
-            .iter()
-            .filter(|col_index| col_index.side == JoinSide::Left)
-            .map(|i| Arc::clone(&streamed_columns[i.index]))
-            .collect::<Vec<_>>();
-
-        let right_columns = f
-            .column_indices()
-            .iter()
-            .filter(|col_index| col_index.side == JoinSide::Right)
-            .map(|i| Arc::clone(&buffered_columns[i.index]))
-            .collect::<Vec<_>>();
-
-        filter_columns.extend(left_columns);
-        filter_columns.extend(right_columns);
-    }
-
-    filter_columns
-}
-
-fn produce_buffered_null_batch(
-    schema: &SchemaRef,
-    streamed_schema: &SchemaRef,
-    buffered_indices: &PrimitiveArray<UInt64Type>,
-    buffered_batch: &BufferedBatch,
-) -> Result<Option<RecordBatch>> {
-    if buffered_indices.is_empty() {
-        return Ok(None);
-    }
-
-    // Take buffered (right) columns
-    let right_columns =
-        fetch_right_columns_from_batch_by_idxs(buffered_batch, buffered_indices)?;
-
-    // Create null streamed (left) columns
-    let mut left_columns = streamed_schema
-        .fields()
-        .iter()
-        .map(|f| new_null_array(f.data_type(), buffered_indices.len()))
-        .collect::<Vec<_>>();
-
-    left_columns.extend(right_columns);
-
-    Ok(Some(RecordBatch::try_new(
-        Arc::clone(schema),
-        left_columns,
-    )?))
-}
-
-/// Get `buffered_indices` rows for `buffered_data[buffered_batch_idx]` by specific column indices
-#[inline(always)]
-fn fetch_right_columns_by_idxs(
-    buffered_data: &BufferedData,
-    buffered_batch_idx: usize,
-    buffered_indices: &UInt64Array,
-) -> Result<Vec<ArrayRef>> {
-    fetch_right_columns_from_batch_by_idxs(
-        &buffered_data.batches[buffered_batch_idx],
-        buffered_indices,
-    )
-}
-
-#[inline(always)]
-fn fetch_right_columns_from_batch_by_idxs(
-    buffered_batch: &BufferedBatch,
-    buffered_indices: &UInt64Array,
-) -> Result<Vec<ArrayRef>> {
-    match (&buffered_batch.spill_file, &buffered_batch.batch) {
-        // In memory batch
-        (None, Some(batch)) => Ok(batch
-            .columns()
-            .iter()
-            .map(|column| take(column, &buffered_indices, None))
-            .collect::<Result<Vec<_>, ArrowError>>()
-            .map_err(Into::<DataFusionError>::into)?),
-        // If the batch was spilled to disk, less likely
-        (Some(spill_file), None) => {
-            let mut buffered_cols: Vec<ArrayRef> =
-                Vec::with_capacity(buffered_indices.len());
-
-            let file = BufReader::new(File::open(spill_file.path())?);
-            let reader = StreamReader::try_new(file, None)?;
-
-            for batch in reader {
-                batch?.columns().iter().for_each(|column| {
-                    buffered_cols.extend(take(column, &buffered_indices, None))
-                });
-            }
-
-                Ok(buffered_cols)
-            }
-        // Invalid combination
-        (spill, batch) => internal_err!("Unexpected buffered batch spill status. Spill exists: {}. In-memory exists: {}", spill.is_some(), batch.is_some()),
-    }
-}
-
-/// Buffered data contains all buffered batches with one unique join key
-#[derive(Debug, Default)]
-struct BufferedData {
-    /// Buffered batches with the same key
-    pub batches: VecDeque<BufferedBatch>,
-    /// current scanning batch index used in join_partial()
-    pub scanning_batch_idx: usize,
-    /// current scanning offset used in join_partial()
-    pub scanning_offset: usize,
-}
-
-impl BufferedData {
-    pub fn head_batch(&self) -> &BufferedBatch {
-        self.batches.front().unwrap()
-    }
-
-    pub fn tail_batch(&self) -> &BufferedBatch {
-        self.batches.back().unwrap()
-    }
-
-    pub fn tail_batch_mut(&mut self) -> &mut BufferedBatch {
-        self.batches.back_mut().unwrap()
-    }
-
-    pub fn has_buffered_rows(&self) -> bool {
-        self.batches.iter().any(|batch| !batch.range.is_empty())
-    }
-
-    pub fn scanning_reset(&mut self) {
-        self.scanning_batch_idx = 0;
-        self.scanning_offset = 0;
-    }
-
-    pub fn scanning_advance(&mut self) {
-        self.scanning_offset += 1;
-        while !self.scanning_finished() && self.scanning_batch_finished() {
-            self.scanning_batch_idx += 1;
-            self.scanning_offset = 0;
-        }
-    }
-
-    pub fn scanning_batch(&self) -> &BufferedBatch {
-        &self.batches[self.scanning_batch_idx]
-    }
-
-    pub fn scanning_batch_mut(&mut self) -> &mut BufferedBatch {
-        &mut self.batches[self.scanning_batch_idx]
-    }
-
-    pub fn scanning_idx(&self) -> usize {
-        self.scanning_batch().range.start + self.scanning_offset
-    }
-
-    pub fn scanning_batch_finished(&self) -> bool {
-        self.scanning_offset == self.scanning_batch().range.len()
-    }
-
-    pub fn scanning_finished(&self) -> bool {
-        self.scanning_batch_idx == self.batches.len()
-    }
-
-    pub fn scanning_finish(&mut self) {
-        self.scanning_batch_idx = self.batches.len();
-        self.scanning_offset = 0;
-    }
-}
-
-/// Get join array refs of given batch and join columns
-fn join_arrays(batch: &RecordBatch, on_column: &[PhysicalExprRef]) -> Vec<ArrayRef> {
-    on_column
-        .iter()
-        .map(|c| {
-            let num_rows = batch.num_rows();
-            let c = c.evaluate(batch).unwrap();
-            c.into_array(num_rows).unwrap()
-        })
-        .collect()
-}
-
-/// Get comparison result of two rows of join arrays
-fn compare_join_arrays(
-    left_arrays: &[ArrayRef],
-    left: usize,
-    right_arrays: &[ArrayRef],
-    right: usize,
-    sort_options: &[SortOptions],
-    null_equals_null: bool,
-) -> Result<Ordering> {
-    let mut res = Ordering::Equal;
-    for ((left_array, right_array), sort_options) in
-        left_arrays.iter().zip(right_arrays).zip(sort_options)
-    {
-        macro_rules! compare_value {
-            ($T:ty) => {{
-                let left_array = left_array.as_any().downcast_ref::<$T>().unwrap();
-                let right_array = right_array.as_any().downcast_ref::<$T>().unwrap();
-                match (left_array.is_null(left), right_array.is_null(right)) {
-                    (false, false) => {
-                        let left_value = &left_array.value(left);
-                        let right_value = &right_array.value(right);
-                        res = left_value.partial_cmp(right_value).unwrap();
-                        if sort_options.descending {
-                            res = res.reverse();
-                        }
-                    }
-                    (true, false) => {
-                        res = if sort_options.nulls_first {
-                            Ordering::Less
-                        } else {
-                            Ordering::Greater
-                        };
-                    }
-                    (false, true) => {
-                        res = if sort_options.nulls_first {
-                            Ordering::Greater
-                        } else {
-                            Ordering::Less
-                        };
-                    }
-                    _ => {
-                        res = if null_equals_null {
-                            Ordering::Equal
-                        } else {
-                            Ordering::Less
-                        };
-                    }
-                }
-            }};
-        }
-
-        match left_array.data_type() {
-            DataType::Null => {}
-            DataType::Boolean => compare_value!(BooleanArray),
-            DataType::Int8 => compare_value!(Int8Array),
-            DataType::Int16 => compare_value!(Int16Array),
-            DataType::Int32 => compare_value!(Int32Array),
-            DataType::Int64 => compare_value!(Int64Array),
-            DataType::UInt8 => compare_value!(UInt8Array),
-            DataType::UInt16 => compare_value!(UInt16Array),
-            DataType::UInt32 => compare_value!(UInt32Array),
-            DataType::UInt64 => compare_value!(UInt64Array),
-            DataType::Float32 => compare_value!(Float32Array),
-            DataType::Float64 => compare_value!(Float64Array),
-            DataType::Utf8 => compare_value!(StringArray),
-            DataType::LargeUtf8 => compare_value!(LargeStringArray),
-            DataType::Decimal128(..) => compare_value!(Decimal128Array),
-            DataType::Timestamp(time_unit, None) => match time_unit {
-                TimeUnit::Second => compare_value!(TimestampSecondArray),
-                TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray),
-                TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray),
-                TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray),
-            },
-            DataType::Date32 => compare_value!(Date32Array),
-            DataType::Date64 => compare_value!(Date64Array),
-            dt => {
-                return not_impl_err!(
-                    "Unsupported data type in sort merge join comparator: {}",
-                    dt
-                );
-            }
-        }
-        if !res.is_eq() {
-            break;
-        }
-    }
-    Ok(res)
-}
-
-/// A faster version of compare_join_arrays() that only output whether
-/// the given two rows are equal
-fn is_join_arrays_equal(
-    left_arrays: &[ArrayRef],
-    left: usize,
-    right_arrays: &[ArrayRef],
-    right: usize,
-) -> Result<bool> {
-    let mut is_equal = true;
-    for (left_array, right_array) in left_arrays.iter().zip(right_arrays) {
-        macro_rules! compare_value {
-            ($T:ty) => {{
-                match (left_array.is_null(left), right_array.is_null(right)) {
-                    (false, false) => {
-                        let left_array =
-                            left_array.as_any().downcast_ref::<$T>().unwrap();
-                        let right_array =
-                            right_array.as_any().downcast_ref::<$T>().unwrap();
-                        if left_array.value(left) != right_array.value(right) {
-                            is_equal = false;
-                        }
-                    }
-                    (true, false) => is_equal = false,
-                    (false, true) => is_equal = false,
-                    _ => {}
-                }
-            }};
-        }
-
-        match left_array.data_type() {
-            DataType::Null => {}
-            DataType::Boolean => compare_value!(BooleanArray),
-            DataType::Int8 => compare_value!(Int8Array),
-            DataType::Int16 => compare_value!(Int16Array),
-            DataType::Int32 => compare_value!(Int32Array),
-            DataType::Int64 => compare_value!(Int64Array),
-            DataType::UInt8 => compare_value!(UInt8Array),
-            DataType::UInt16 => compare_value!(UInt16Array),
-            DataType::UInt32 => compare_value!(UInt32Array),
-            DataType::UInt64 => compare_value!(UInt64Array),
-            DataType::Float32 => compare_value!(Float32Array),
-            DataType::Float64 => compare_value!(Float64Array),
-            DataType::Utf8 => compare_value!(StringArray),
-            DataType::LargeUtf8 => compare_value!(LargeStringArray),
-            DataType::Decimal128(..) => compare_value!(Decimal128Array),
-            DataType::Timestamp(time_unit, None) => match time_unit {
-                TimeUnit::Second => compare_value!(TimestampSecondArray),
-                TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray),
-                TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray),
-                TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray),
-            },
-            DataType::Date32 => compare_value!(Date32Array),
-            DataType::Date64 => compare_value!(Date64Array),
-            dt => {
-                return not_impl_err!(
-                    "Unsupported data type in sort merge join comparator: {}",
-                    dt
-                );
-            }
-        }
-        if !is_equal {
-            return Ok(false);
-        }
-    }
-    Ok(true)
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::{
-        builder::{BooleanBuilder, UInt64Builder},
-        BooleanArray, Date32Array, Date64Array, Int32Array, RecordBatch, UInt64Array,
-    };
-    use arrow::compute::{concat_batches, filter_record_batch, SortOptions};
-    use arrow::datatypes::{DataType, Field, Schema};
-
-    use datafusion_common::JoinType::*;
-    use datafusion_common::{assert_batches_eq, assert_contains, JoinType, Result};
-    use datafusion_common::{
-        test_util::{batches_to_sort_string, batches_to_string},
-        JoinSide,
-    };
-    use datafusion_execution::config::SessionConfig;
-    use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode};
-    use datafusion_execution::runtime_env::RuntimeEnvBuilder;
-    use datafusion_execution::TaskContext;
-    use datafusion_expr::Operator;
-    use datafusion_physical_expr::expressions::BinaryExpr;
-    use insta::{allow_duplicates, assert_snapshot};
-
-    use crate::expressions::Column;
-    use crate::joins::sort_merge_join::{get_corrected_filter_mask, JoinedRecordBatches};
-    use crate::joins::utils::{ColumnIndex, JoinFilter, JoinOn};
-    use crate::joins::SortMergeJoinExec;
-    use crate::test::TestMemoryExec;
-    use crate::test::{build_table_i32, build_table_i32_two_cols};
-    use crate::{common, ExecutionPlan};
-
-    fn build_table(
-        a: (&str, &Vec<i32>),
-        b: (&str, &Vec<i32>),
-        c: (&str, &Vec<i32>),
-    ) -> Arc<dyn ExecutionPlan> {
-        let batch = build_table_i32(a, b, c);
-        let schema = batch.schema();
-        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
-    }
-
-    fn build_table_from_batches(batches: Vec<RecordBatch>) -> Arc<dyn ExecutionPlan> {
-        let schema = batches.first().unwrap().schema();
-        TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap()
-    }
-
-    fn build_date_table(
-        a: (&str, &Vec<i32>),
-        b: (&str, &Vec<i32>),
-        c: (&str, &Vec<i32>),
-    ) -> Arc<dyn ExecutionPlan> {
-        let schema = Schema::new(vec![
-            Field::new(a.0, DataType::Date32, false),
-            Field::new(b.0, DataType::Date32, false),
-            Field::new(c.0, DataType::Date32, false),
-        ]);
-
-        let batch = RecordBatch::try_new(
-            Arc::new(schema),
-            vec![
-                Arc::new(Date32Array::from(a.1.clone())),
-                Arc::new(Date32Array::from(b.1.clone())),
-                Arc::new(Date32Array::from(c.1.clone())),
-            ],
-        )
-        .unwrap();
-
-        let schema = batch.schema();
-        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
-    }
-
-    fn build_date64_table(
-        a: (&str, &Vec<i64>),
-        b: (&str, &Vec<i64>),
-        c: (&str, &Vec<i64>),
-    ) -> Arc<dyn ExecutionPlan> {
-        let schema = Schema::new(vec![
-            Field::new(a.0, DataType::Date64, false),
-            Field::new(b.0, DataType::Date64, false),
-            Field::new(c.0, DataType::Date64, false),
-        ]);
-
-        let batch = RecordBatch::try_new(
-            Arc::new(schema),
-            vec![
-                Arc::new(Date64Array::from(a.1.clone())),
-                Arc::new(Date64Array::from(b.1.clone())),
-                Arc::new(Date64Array::from(c.1.clone())),
-            ],
-        )
-        .unwrap();
-
-        let schema = batch.schema();
-        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
-    }
-
-    /// returns a table with 3 columns of i32 in memory
-    pub fn build_table_i32_nullable(
-        a: (&str, &Vec<Option<i32>>),
-        b: (&str, &Vec<Option<i32>>),
-        c: (&str, &Vec<Option<i32>>),
-    ) -> Arc<dyn ExecutionPlan> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new(a.0, DataType::Int32, true),
-            Field::new(b.0, DataType::Int32, true),
-            Field::new(c.0, DataType::Int32, true),
-        ]));
-        let batch = RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(a.1.clone())),
-                Arc::new(Int32Array::from(b.1.clone())),
-                Arc::new(Int32Array::from(c.1.clone())),
-            ],
-        )
-        .unwrap();
-        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
-    }
-
-    pub fn build_table_two_cols(
-        a: (&str, &Vec<i32>),
-        b: (&str, &Vec<i32>),
-    ) -> Arc<dyn ExecutionPlan> {
-        let batch = build_table_i32_two_cols(a, b);
-        let schema = batch.schema();
-        TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
-    }
-
-    fn join(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        join_type: JoinType,
-    ) -> Result<SortMergeJoinExec> {
-        let sort_options = vec![SortOptions::default(); on.len()];
-        SortMergeJoinExec::try_new(left, right, on, None, join_type, sort_options, false)
-    }
-
-    fn join_with_options(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        join_type: JoinType,
-        sort_options: Vec<SortOptions>,
-        null_equals_null: bool,
-    ) -> Result<SortMergeJoinExec> {
-        SortMergeJoinExec::try_new(
-            left,
-            right,
-            on,
-            None,
-            join_type,
-            sort_options,
-            null_equals_null,
-        )
-    }
-
-    fn join_with_filter(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        filter: JoinFilter,
-        join_type: JoinType,
-        sort_options: Vec<SortOptions>,
-        null_equals_null: bool,
-    ) -> Result<SortMergeJoinExec> {
-        SortMergeJoinExec::try_new(
-            left,
-            right,
-            on,
-            Some(filter),
-            join_type,
-            sort_options,
-            null_equals_null,
-        )
-    }
-
-    async fn join_collect(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        join_type: JoinType,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
-        let sort_options = vec![SortOptions::default(); on.len()];
-        join_collect_with_options(left, right, on, join_type, sort_options, false).await
-    }
-
-    async fn join_collect_with_filter(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        filter: JoinFilter,
-        join_type: JoinType,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
-        let sort_options = vec![SortOptions::default(); on.len()];
-
-        let task_ctx = Arc::new(TaskContext::default());
-        let join =
-            join_with_filter(left, right, on, filter, join_type, sort_options, false)?;
-        let columns = columns(&join.schema());
-
-        let stream = join.execute(0, task_ctx)?;
-        let batches = common::collect(stream).await?;
-        Ok((columns, batches))
-    }
-
-    async fn join_collect_with_options(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        join_type: JoinType,
-        sort_options: Vec<SortOptions>,
-        null_equals_null: bool,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
-        let task_ctx = Arc::new(TaskContext::default());
-        let join = join_with_options(
-            left,
-            right,
-            on,
-            join_type,
-            sort_options,
-            null_equals_null,
-        )?;
-        let columns = columns(&join.schema());
-
-        let stream = join.execute(0, task_ctx)?;
-        let batches = common::collect(stream).await?;
-        Ok((columns, batches))
-    }
-
-    async fn join_collect_batch_size_equals_two(
-        left: Arc<dyn ExecutionPlan>,
-        right: Arc<dyn ExecutionPlan>,
-        on: JoinOn,
-        join_type: JoinType,
-    ) -> Result<(Vec<String>, Vec<RecordBatch>)> {
-        let task_ctx = TaskContext::default()
-            .with_session_config(SessionConfig::new().with_batch_size(2));
-        let task_ctx = Arc::new(task_ctx);
-        let join = join(left, right, on, join_type)?;
-        let columns = columns(&join.schema());
-
-        let stream = join.execute(0, task_ctx)?;
-        let batches = common::collect(stream).await?;
-        Ok((columns, batches))
-    }
-
-    #[tokio::test]
-    async fn join_inner_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![4, 5, 5]), // this has a repetition
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Inner).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b1 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 4  | 7  | 10 | 4  | 70 |
-            | 2  | 5  | 8  | 20 | 5  | 80 |
-            | 3  | 5  | 9  | 20 | 5  | 80 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_inner_two() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b2", &vec![1, 2, 2]),
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b2", &vec![1, 2, 2]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_columns, batches) = join_collect(left, right, on, Inner).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b2 | c1 | a1 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 1  | 7  | 1  | 1  | 70 |
-            | 2  | 2  | 8  | 2  | 2  | 80 |
-            | 2  | 2  | 9  | 2  | 2  | 80 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_inner_two_two() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 1, 2]),
-            ("b2", &vec![1, 1, 2]),
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a1", &vec![1, 1, 3]),
-            ("b2", &vec![1, 1, 2]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_columns, batches) = join_collect(left, right, on, Inner).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b2 | c1 | a1 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 1  | 7  | 1  | 1  | 70 |
-            | 1  | 1  | 7  | 1  | 1  | 80 |
-            | 1  | 1  | 8  | 1  | 1  | 70 |
-            | 1  | 1  | 8  | 1  | 1  | 80 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_inner_with_nulls() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(1), Some(1), Some(2), Some(2)]),
-            ("b2", &vec![None, Some(1), Some(2), Some(2)]), // null in key field
-            ("c1", &vec![Some(1), None, Some(8), Some(9)]), // null in non-key field
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(1), Some(1), Some(2), Some(3)]),
-            ("b2", &vec![None, Some(1), Some(2), Some(2)]),
-            ("c2", &vec![Some(10), Some(70), Some(80), Some(90)]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, Inner).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b2 | c1 | a1 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 1  |    | 1  | 1  | 70 |
-            | 2  | 2  | 8  | 2  | 2  | 80 |
-            | 2  | 2  | 9  | 2  | 2  | 80 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_inner_with_nulls_with_options() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(2), Some(2), Some(1), Some(1)]),
-            ("b2", &vec![Some(2), Some(2), Some(1), None]), // null in key field
-            ("c1", &vec![Some(9), Some(8), None, Some(1)]), // null in non-key field
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(3), Some(2), Some(1), Some(1)]),
-            ("b2", &vec![Some(2), Some(2), Some(1), None]),
-            ("c2", &vec![Some(90), Some(80), Some(70), Some(10)]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-            ),
-        ];
-        let (_, batches) = join_collect_with_options(
-            left,
-            right,
-            on,
-            Inner,
-            vec![
-                SortOptions {
-                    descending: true,
-                    nulls_first: false,
-                };
-                2
-            ],
-            true,
-        )
-        .await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b2 | c1 | a1 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 2  | 2  | 9  | 2  | 2  | 80 |
-            | 2  | 2  | 8  | 2  | 2  | 80 |
-            | 1  | 1  |    | 1  | 1  | 70 |
-            | 1  |    | 1  | 1  |    | 10 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_inner_output_two_batches() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b2", &vec![1, 2, 2]),
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b2", &vec![1, 2, 2]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) =
-            join_collect_batch_size_equals_two(left, right, on, Inner).await?;
-        assert_eq!(batches.len(), 2);
-        assert_eq!(batches[0].num_rows(), 2);
-        assert_eq!(batches[1].num_rows(), 1);
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b2 | c1 | a1 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 1  | 7  | 1  | 1  | 70 |
-            | 2  | 2  | 8  | 2  | 2  | 80 |
-            | 2  | 2  | 9  | 2  | 2  | 80 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Left).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b1 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 4  | 7  | 10 | 4  | 70 |
-            | 2  | 5  | 8  | 20 | 5  | 80 |
-            | 3  | 7  | 9  |    |    |    |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![4, 5, 7]),
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]), // 6 does not exist on the left
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Right).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b1 | c2 |
-            +----+----+----+----+----+----+
-            | 1  | 4  | 7  | 10 | 4  | 70 |
-            | 2  | 5  | 8  | 20 | 5  | 80 |
-            |    |    |    | 30 | 6  | 90 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_full_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b2", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema()).unwrap()) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Full).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            |    |    |    | 30 | 6  | 90 |
-            | 1  | 4  | 7  | 10 | 4  | 70 |
-            | 2  | 5  | 8  | 20 | 5  | 80 |
-            | 3  | 7  | 9  |    |    |    |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_anti() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2, 3, 5]),
-            ("b1", &vec![4, 5, 5, 7, 7]), // 7 does not exist on the right
-            ("c1", &vec![7, 8, 8, 9, 11]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, LeftAnti).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c1 |
-            +----+----+----+
-            | 3  | 7  | 9  |
-            | 5  | 7  | 11 |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_one_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b1", &vec![4, 5, 5]),
-            ("c1", &vec![7, 8, 8]),
-        );
-        let right =
-            build_table_two_cols(("a2", &vec![10, 20, 30]), ("b1", &vec![4, 5, 6]));
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, RightAnti).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+
-            | a2 | b1 |
-            +----+----+
-            | 30 | 6  |
-            +----+----+
-            "#);
-
-        let left2 = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b1", &vec![4, 5, 5]),
-            ("c1", &vec![7, 8, 8]),
-        );
-        let right2 = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left2.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right2.schema())?) as _,
-        )];
-
-        let (_, batches2) = join_collect(left2, right2, on, RightAnti).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches2), @r#"
-            +----+----+----+
-            | a2 | b1 | c2 |
-            +----+----+----+
-            | 30 | 6  | 90 |
-            +----+----+----+
-            "#);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_two_two() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b1", &vec![4, 5, 5]),
-            ("c1", &vec![7, 8, 8]),
-        );
-        let right =
-            build_table_two_cols(("a2", &vec![10, 20, 30]), ("b1", &vec![4, 5, 6]));
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, RightAnti).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+
-            | a2 | b1 |
-            +----+----+
-            | 10 | 4  |
-            | 20 | 5  |
-            | 30 | 6  |
-            +----+----+
-            "#);
-
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b1", &vec![4, 5, 5]),
-            ("c1", &vec![7, 8, 8]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, RightAnti).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a2 | b1 | c2 |",
-            "+----+----+----+",
-            "| 10 | 4  | 70 |",
-            "| 20 | 5  | 80 |",
-            "| 30 | 6  | 90 |",
-            "+----+----+----+",
-        ];
-        // The output order is important as SMJ preserves sortedness
-        assert_batches_eq!(expected, &batches);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_two_with_filter() -> Result<()> {
-        let left = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c1", &vec![30]));
-        let right = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c2", &vec![20]));
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-        let filter = JoinFilter::new(
-            Arc::new(BinaryExpr::new(
-                Arc::new(Column::new("c2", 1)),
-                Operator::Gt,
-                Arc::new(Column::new("c1", 0)),
-            )),
-            vec![
-                ColumnIndex {
-                    index: 2,
-                    side: JoinSide::Left,
-                },
-                ColumnIndex {
-                    index: 2,
-                    side: JoinSide::Right,
-                },
-            ],
-            Arc::new(Schema::new(vec![
-                Field::new("c1", DataType::Int32, true),
-                Field::new("c2", DataType::Int32, true),
-            ])),
-        );
-        let (_, batches) =
-            join_collect_with_filter(left, right, on, filter, RightAnti).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c2 |
-            +----+----+----+
-            | 1  | 10 | 20 |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_with_nulls() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(0), Some(1), Some(2), Some(2), Some(3)]),
-            ("b1", &vec![Some(3), Some(4), Some(5), None, Some(6)]),
-            ("c2", &vec![Some(60), None, Some(80), Some(85), Some(90)]),
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(1), Some(2), Some(2), Some(3)]),
-            ("b1", &vec![Some(4), Some(5), None, Some(6)]), // null in key field
-            ("c2", &vec![Some(7), Some(8), Some(8), None]), // null in non-key field
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, RightAnti).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c2 |
-            +----+----+----+
-            | 2  |    | 8  |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_with_nulls_with_options() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(1), Some(2), Some(1), Some(0), Some(2)]),
-            ("b1", &vec![Some(4), Some(5), Some(5), None, Some(5)]),
-            ("c1", &vec![Some(7), Some(8), Some(8), Some(60), None]),
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(3), Some(2), Some(2), Some(1)]),
-            ("b1", &vec![None, Some(5), Some(5), Some(4)]), // null in key field
-            ("c2", &vec![Some(9), None, Some(8), Some(7)]), // null in non-key field
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect_with_options(
-            left,
-            right,
-            on,
-            RightAnti,
-            vec![
-                SortOptions {
-                    descending: true,
-                    nulls_first: false,
-                };
-                2
-            ],
-            true,
-        )
-        .await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c2 |
-            +----+----+----+
-            | 3  |    | 9  |
-            | 2  | 5  |    |
-            | 2  | 5  | 8  |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_anti_output_two_batches() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2]),
-            ("b1", &vec![4, 5, 5]),
-            ("c1", &vec![7, 8, 8]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]),
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) =
-            join_collect_batch_size_equals_two(left, right, on, LeftAnti).await?;
-        assert_eq!(batches.len(), 2);
-        assert_eq!(batches[0].num_rows(), 2);
-        assert_eq!(batches[1].num_rows(), 1);
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c1 |
-            +----+----+----+
-            | 1  | 4  | 7  |
-            | 2  | 5  | 8  |
-            | 2  | 5  | 8  |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_semi() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right
-            ("c1", &vec![7, 8, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![4, 5, 6]), // 5 is double on the right
-            ("c2", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, LeftSemi).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+
-            | a1 | b1 | c1 |
-            +----+----+----+
-            | 1  | 4  | 7  |
-            | 2  | 5  | 8  |
-            | 2  | 5  | 8  |
-            +----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_one() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![10, 20, 30, 40]),
-            ("b1", &vec![4, 5, 5, 6]),
-            ("c1", &vec![70, 80, 90, 100]),
-        );
-        let right = build_table(
-            ("a2", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 7]),
-            ("c2", &vec![7, 8, 8, 9]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, RightSemi).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a2 | b1 | c2 |",
-            "+----+----+----+",
-            "| 1  | 4  | 7  |",
-            "| 2  | 5  | 8  |",
-            "| 2  | 5  | 8  |",
-            "+----+----+----+",
-        ];
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_two() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 6]),
-            ("c1", &vec![70, 80, 90, 100]),
-        );
-        let right = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 7]),
-            ("c2", &vec![7, 8, 8, 9]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, RightSemi).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a1 | b1 | c2 |",
-            "+----+----+----+",
-            "| 1  | 4  | 7  |",
-            "| 2  | 5  | 8  |",
-            "| 2  | 5  | 8  |",
-            "+----+----+----+",
-        ];
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_two_with_filter() -> Result<()> {
-        let left = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c1", &vec![30]));
-        let right = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c2", &vec![20]));
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-        let filter = JoinFilter::new(
-            Arc::new(BinaryExpr::new(
-                Arc::new(Column::new("c2", 1)),
-                Operator::Lt,
-                Arc::new(Column::new("c1", 0)),
-            )),
-            vec![
-                ColumnIndex {
-                    index: 2,
-                    side: JoinSide::Left,
-                },
-                ColumnIndex {
-                    index: 2,
-                    side: JoinSide::Right,
-                },
-            ],
-            Arc::new(Schema::new(vec![
-                Field::new("c1", DataType::Int32, true),
-                Field::new("c2", DataType::Int32, true),
-            ])),
-        );
-        let (_, batches) =
-            join_collect_with_filter(left, right, on, filter, RightSemi).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a1 | b1 | c2 |",
-            "+----+----+----+",
-            "| 1  | 10 | 20 |",
-            "+----+----+----+",
-        ];
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_with_nulls() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(0), Some(1), Some(2), Some(2), Some(3)]),
-            ("b1", &vec![Some(3), Some(4), Some(5), None, Some(6)]),
-            ("c2", &vec![Some(60), None, Some(80), Some(85), Some(90)]),
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(1), Some(2), Some(2), Some(3)]),
-            ("b1", &vec![Some(4), Some(5), None, Some(6)]), // null in key field
-            ("c2", &vec![Some(7), Some(8), Some(8), None]), // null in non-key field
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect(left, right, on, RightSemi).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a1 | b1 | c2 |",
-            "+----+----+----+",
-            "| 1  | 4  | 7  |",
-            "| 2  | 5  | 8  |",
-            "| 3  | 6  |    |",
-            "+----+----+----+",
-        ];
-        // The output order is important as SMJ preserves sortedness
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_with_nulls_with_options() -> Result<()> {
-        let left = build_table_i32_nullable(
-            ("a1", &vec![Some(3), Some(2), Some(1), Some(0), Some(2)]),
-            ("b1", &vec![None, Some(5), Some(4), None, Some(5)]),
-            ("c2", &vec![Some(90), Some(80), Some(70), Some(60), None]),
-        );
-        let right = build_table_i32_nullable(
-            ("a1", &vec![Some(3), Some(2), Some(2), Some(1)]),
-            ("b1", &vec![None, Some(5), Some(5), Some(4)]), // null in key field
-            ("c2", &vec![Some(9), None, Some(8), Some(7)]), // null in non-key field
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) = join_collect_with_options(
-            left,
-            right,
-            on,
-            RightSemi,
-            vec![
-                SortOptions {
-                    descending: true,
-                    nulls_first: false,
-                };
-                2
-            ],
-            true,
-        )
-        .await?;
-
-        let expected = [
-            "+----+----+----+",
-            "| a1 | b1 | c2 |",
-            "+----+----+----+",
-            "| 3  |    | 9  |",
-            "| 2  | 5  |    |",
-            "| 2  | 5  | 8  |",
-            "| 1  | 4  | 7  |",
-            "+----+----+----+",
-        ];
-        // The output order is important as SMJ preserves sortedness
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_semi_output_two_batches() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 6]),
-            ("c1", &vec![70, 80, 90, 100]),
-        );
-        let right = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 7]),
-            ("c2", &vec![7, 8, 8, 9]),
-        );
-        let on = vec![
-            (
-                Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
-            ),
-            (
-                Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-                Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-            ),
-        ];
-
-        let (_, batches) =
-            join_collect_batch_size_equals_two(left, right, on, RightSemi).await?;
-        let expected = [
-            "+----+----+----+",
-            "| a1 | b1 | c2 |",
-            "+----+----+----+",
-            "| 1  | 4  | 7  |",
-            "| 2  | 5  | 8  |",
-            "| 2  | 5  | 8  |",
-            "+----+----+----+",
-        ];
-        assert_eq!(batches.len(), 2);
-        assert_eq!(batches[0].num_rows(), 2);
-        assert_eq!(batches[1].num_rows(), 1);
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_mark() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![1, 2, 2, 3]),
-            ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right
-            ("c1", &vec![7, 8, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![10, 20, 30, 40]),
-            ("b1", &vec![4, 4, 5, 6]), // 5 is double on the right
-            ("c2", &vec![60, 70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, LeftMark).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+-------+
-            | a1 | b1 | c1 | mark  |
-            +----+----+----+-------+
-            | 1  | 4  | 7  | true  |
-            | 2  | 5  | 8  | true  |
-            | 2  | 5  | 8  | true  |
-            | 3  | 7  | 9  | false |
-            +----+----+----+-------+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_with_duplicated_column_names() -> Result<()> {
-        let left = build_table(
-            ("a", &vec![1, 2, 3]),
-            ("b", &vec![4, 5, 7]),
-            ("c", &vec![7, 8, 9]),
-        );
-        let right = build_table(
-            ("a", &vec![10, 20, 30]),
-            ("b", &vec![1, 2, 7]),
-            ("c", &vec![70, 80, 90]),
-        );
-        let on = vec![(
-            // join on a=b so there are duplicate column names on unjoined columns
-            Arc::new(Column::new_with_schema("a", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Inner).await?;
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +---+---+---+----+---+----+
-            | a | b | c | a  | b | c  |
-            +---+---+---+----+---+----+
-            | 1 | 4 | 7 | 10 | 1 | 70 |
-            | 2 | 5 | 8 | 20 | 2 | 80 |
-            +---+---+---+----+---+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_date32() -> Result<()> {
-        let left = build_date_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![19107, 19108, 19108]), // this has a repetition
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_date_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![19107, 19108, 19109]),
-            ("c2", &vec![70, 80, 90]),
-        );
-
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Inner).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +------------+------------+------------+------------+------------+------------+
-            | a1         | b1         | c1         | a2         | b1         | c2         |
-            +------------+------------+------------+------------+------------+------------+
-            | 1970-01-02 | 2022-04-25 | 1970-01-08 | 1970-01-11 | 2022-04-25 | 1970-03-12 |
-            | 1970-01-03 | 2022-04-26 | 1970-01-09 | 1970-01-21 | 2022-04-26 | 1970-03-22 |
-            | 1970-01-04 | 2022-04-26 | 1970-01-10 | 1970-01-21 | 2022-04-26 | 1970-03-22 |
-            +------------+------------+------------+------------+------------+------------+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_date64() -> Result<()> {
-        let left = build_date64_table(
-            ("a1", &vec![1, 2, 3]),
-            ("b1", &vec![1650703441000, 1650903441000, 1650903441000]), // this has a repetition
-            ("c1", &vec![7, 8, 9]),
-        );
-        let right = build_date64_table(
-            ("a2", &vec![10, 20, 30]),
-            ("b1", &vec![1650703441000, 1650503441000, 1650903441000]),
-            ("c2", &vec![70, 80, 90]),
-        );
-
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Inner).await?;
-
-        // The output order is important as SMJ preserves sortedness
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
-            | a1                      | b1                  | c1                      | a2                      | b1                  | c2                      |
-            +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
-            | 1970-01-01T00:00:00.001 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.007 | 1970-01-01T00:00:00.010 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.070 |
-            | 1970-01-01T00:00:00.002 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.008 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
-            | 1970-01-01T00:00:00.003 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.009 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
-            +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_sort_order() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![0, 1, 2, 3, 4, 5]),
-            ("b1", &vec![3, 4, 5, 6, 6, 7]),
-            ("c1", &vec![4, 5, 6, 7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![0, 10, 20, 30, 40]),
-            ("b2", &vec![2, 4, 6, 6, 8]),
-            ("c2", &vec![50, 60, 70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Left).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 0  | 3  | 4  |    |    |    |
-            | 1  | 4  | 5  | 10 | 4  | 60 |
-            | 2  | 5  | 6  |    |    |    |
-            | 3  | 6  | 7  | 20 | 6  | 70 |
-            | 3  | 6  | 7  | 30 | 6  | 80 |
-            | 4  | 6  | 8  | 20 | 6  | 70 |
-            | 4  | 6  | 8  | 30 | 6  | 80 |
-            | 5  | 7  | 9  |    |    |    |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_sort_order() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![0, 1, 2, 3]),
-            ("b1", &vec![3, 4, 5, 7]),
-            ("c1", &vec![6, 7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![0, 10, 20, 30]),
-            ("b2", &vec![2, 4, 5, 6]),
-            ("c2", &vec![60, 70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Right).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            |    |    |    | 0  | 2  | 60 |
-            | 1  | 4  | 7  | 10 | 4  | 70 |
-            | 2  | 5  | 8  | 20 | 5  | 80 |
-            |    |    |    | 30 | 6  | 90 |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_left_multiple_batches() -> Result<()> {
-        let left_batch_1 = build_table_i32(
-            ("a1", &vec![0, 1, 2]),
-            ("b1", &vec![3, 4, 5]),
-            ("c1", &vec![4, 5, 6]),
-        );
-        let left_batch_2 = build_table_i32(
-            ("a1", &vec![3, 4, 5, 6]),
-            ("b1", &vec![6, 6, 7, 9]),
-            ("c1", &vec![7, 8, 9, 9]),
-        );
-        let right_batch_1 = build_table_i32(
-            ("a2", &vec![0, 10, 20]),
-            ("b2", &vec![2, 4, 6]),
-            ("c2", &vec![50, 60, 70]),
-        );
-        let right_batch_2 = build_table_i32(
-            ("a2", &vec![30, 40]),
-            ("b2", &vec![6, 8]),
-            ("c2", &vec![80, 90]),
-        );
-        let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
-        let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Left).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            | 0  | 3  | 4  |    |    |    |
-            | 1  | 4  | 5  | 10 | 4  | 60 |
-            | 2  | 5  | 6  |    |    |    |
-            | 3  | 6  | 7  | 20 | 6  | 70 |
-            | 3  | 6  | 7  | 30 | 6  | 80 |
-            | 4  | 6  | 8  | 20 | 6  | 70 |
-            | 4  | 6  | 8  | 30 | 6  | 80 |
-            | 5  | 7  | 9  |    |    |    |
-            | 6  | 9  | 9  |    |    |    |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_right_multiple_batches() -> Result<()> {
-        let right_batch_1 = build_table_i32(
-            ("a2", &vec![0, 1, 2]),
-            ("b2", &vec![3, 4, 5]),
-            ("c2", &vec![4, 5, 6]),
-        );
-        let right_batch_2 = build_table_i32(
-            ("a2", &vec![3, 4, 5, 6]),
-            ("b2", &vec![6, 6, 7, 9]),
-            ("c2", &vec![7, 8, 9, 9]),
-        );
-        let left_batch_1 = build_table_i32(
-            ("a1", &vec![0, 10, 20]),
-            ("b1", &vec![2, 4, 6]),
-            ("c1", &vec![50, 60, 70]),
-        );
-        let left_batch_2 = build_table_i32(
-            ("a1", &vec![30, 40]),
-            ("b1", &vec![6, 8]),
-            ("c1", &vec![80, 90]),
-        );
-        let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
-        let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Right).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            |    |    |    | 0  | 3  | 4  |
-            | 10 | 4  | 60 | 1  | 4  | 5  |
-            |    |    |    | 2  | 5  | 6  |
-            | 20 | 6  | 70 | 3  | 6  | 7  |
-            | 30 | 6  | 80 | 3  | 6  | 7  |
-            | 20 | 6  | 70 | 4  | 6  | 8  |
-            | 30 | 6  | 80 | 4  | 6  | 8  |
-            |    |    |    | 5  | 7  | 9  |
-            |    |    |    | 6  | 9  | 9  |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn join_full_multiple_batches() -> Result<()> {
-        let left_batch_1 = build_table_i32(
-            ("a1", &vec![0, 1, 2]),
-            ("b1", &vec![3, 4, 5]),
-            ("c1", &vec![4, 5, 6]),
-        );
-        let left_batch_2 = build_table_i32(
-            ("a1", &vec![3, 4, 5, 6]),
-            ("b1", &vec![6, 6, 7, 9]),
-            ("c1", &vec![7, 8, 9, 9]),
-        );
-        let right_batch_1 = build_table_i32(
-            ("a2", &vec![0, 10, 20]),
-            ("b2", &vec![2, 4, 6]),
-            ("c2", &vec![50, 60, 70]),
-        );
-        let right_batch_2 = build_table_i32(
-            ("a2", &vec![30, 40]),
-            ("b2", &vec![6, 8]),
-            ("c2", &vec![80, 90]),
-        );
-        let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
-        let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-
-        let (_, batches) = join_collect(left, right, on, Full).await?;
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +----+----+----+----+----+----+
-            | a1 | b1 | c1 | a2 | b2 | c2 |
-            +----+----+----+----+----+----+
-            |    |    |    | 0  | 2  | 50 |
-            |    |    |    | 40 | 8  | 90 |
-            | 0  | 3  | 4  |    |    |    |
-            | 1  | 4  | 5  | 10 | 4  | 60 |
-            | 2  | 5  | 6  |    |    |    |
-            | 3  | 6  | 7  | 20 | 6  | 70 |
-            | 3  | 6  | 7  | 30 | 6  | 80 |
-            | 4  | 6  | 8  | 20 | 6  | 70 |
-            | 4  | 6  | 8  | 30 | 6  | 80 |
-            | 5  | 7  | 9  |    |    |    |
-            | 6  | 9  | 9  |    |    |    |
-            +----+----+----+----+----+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn overallocation_single_batch_no_spill() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![0, 1, 2, 3, 4, 5]),
-            ("b1", &vec![1, 2, 3, 4, 5, 6]),
-            ("c1", &vec![4, 5, 6, 7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![0, 10, 20, 30, 40]),
-            ("b2", &vec![1, 3, 4, 6, 8]),
-            ("c2", &vec![50, 60, 70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-        let sort_options = vec![SortOptions::default(); on.len()];
-
-        let join_types = vec![
-            Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark,
-        ];
-
-        // Disable DiskManager to prevent spilling
-        let runtime = RuntimeEnvBuilder::new()
-            .with_memory_limit(100, 1.0)
-            .with_disk_manager_builder(
-                DiskManagerBuilder::default().with_mode(DiskManagerMode::Disabled),
-            )
-            .build_arc()?;
-        let session_config = SessionConfig::default().with_batch_size(50);
-
-        for join_type in join_types {
-            let task_ctx = TaskContext::default()
-                .with_session_config(session_config.clone())
-                .with_runtime(Arc::clone(&runtime));
-            let task_ctx = Arc::new(task_ctx);
-
-            let join = join_with_options(
-                Arc::clone(&left),
-                Arc::clone(&right),
-                on.clone(),
-                join_type,
-                sort_options.clone(),
-                false,
-            )?;
-
-            let stream = join.execute(0, task_ctx)?;
-            let err = common::collect(stream).await.unwrap_err();
-
-            assert_contains!(err.to_string(), "Failed to allocate additional");
-            assert_contains!(err.to_string(), "SMJStream[0]");
-            assert_contains!(err.to_string(), "Disk spilling disabled");
-            assert!(join.metrics().is_some());
-            assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
-            assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
-            assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
-        }
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn overallocation_multi_batch_no_spill() -> Result<()> {
-        let left_batch_1 = build_table_i32(
-            ("a1", &vec![0, 1]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![4, 5]),
-        );
-        let left_batch_2 = build_table_i32(
-            ("a1", &vec![2, 3]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![6, 7]),
-        );
-        let left_batch_3 = build_table_i32(
-            ("a1", &vec![4, 5]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![8, 9]),
-        );
-        let right_batch_1 = build_table_i32(
-            ("a2", &vec![0, 10]),
-            ("b2", &vec![1, 1]),
-            ("c2", &vec![50, 60]),
-        );
-        let right_batch_2 = build_table_i32(
-            ("a2", &vec![20, 30]),
-            ("b2", &vec![1, 1]),
-            ("c2", &vec![70, 80]),
-        );
-        let right_batch_3 =
-            build_table_i32(("a2", &vec![40]), ("b2", &vec![1]), ("c2", &vec![90]));
-        let left =
-            build_table_from_batches(vec![left_batch_1, left_batch_2, left_batch_3]);
-        let right =
-            build_table_from_batches(vec![right_batch_1, right_batch_2, right_batch_3]);
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-        let sort_options = vec![SortOptions::default(); on.len()];
-
-        let join_types = vec![
-            Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark,
-        ];
-
-        // Disable DiskManager to prevent spilling
-        let runtime = RuntimeEnvBuilder::new()
-            .with_memory_limit(100, 1.0)
-            .with_disk_manager_builder(
-                DiskManagerBuilder::default().with_mode(DiskManagerMode::Disabled),
-            )
-            .build_arc()?;
-        let session_config = SessionConfig::default().with_batch_size(50);
-
-        for join_type in join_types {
-            let task_ctx = TaskContext::default()
-                .with_session_config(session_config.clone())
-                .with_runtime(Arc::clone(&runtime));
-            let task_ctx = Arc::new(task_ctx);
-            let join = join_with_options(
-                Arc::clone(&left),
-                Arc::clone(&right),
-                on.clone(),
-                join_type,
-                sort_options.clone(),
-                false,
-            )?;
-
-            let stream = join.execute(0, task_ctx)?;
-            let err = common::collect(stream).await.unwrap_err();
-
-            assert_contains!(err.to_string(), "Failed to allocate additional");
-            assert_contains!(err.to_string(), "SMJStream[0]");
-            assert_contains!(err.to_string(), "Disk spilling disabled");
-            assert!(join.metrics().is_some());
-            assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
-            assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
-            assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
-        }
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn overallocation_single_batch_spill() -> Result<()> {
-        let left = build_table(
-            ("a1", &vec![0, 1, 2, 3, 4, 5]),
-            ("b1", &vec![1, 2, 3, 4, 5, 6]),
-            ("c1", &vec![4, 5, 6, 7, 8, 9]),
-        );
-        let right = build_table(
-            ("a2", &vec![0, 10, 20, 30, 40]),
-            ("b2", &vec![1, 3, 4, 6, 8]),
-            ("c2", &vec![50, 60, 70, 80, 90]),
-        );
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-        let sort_options = vec![SortOptions::default(); on.len()];
-
-        let join_types = [
-            Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark,
-        ];
-
-        // Enable DiskManager to allow spilling
-        let runtime = RuntimeEnvBuilder::new()
-            .with_memory_limit(100, 1.0)
-            .with_disk_manager_builder(
-                DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
-            )
-            .build_arc()?;
-
-        for batch_size in [1, 50] {
-            let session_config = SessionConfig::default().with_batch_size(batch_size);
-
-            for join_type in &join_types {
-                let task_ctx = TaskContext::default()
-                    .with_session_config(session_config.clone())
-                    .with_runtime(Arc::clone(&runtime));
-                let task_ctx = Arc::new(task_ctx);
-
-                let join = join_with_options(
-                    Arc::clone(&left),
-                    Arc::clone(&right),
-                    on.clone(),
-                    *join_type,
-                    sort_options.clone(),
-                    false,
-                )?;
-
-                let stream = join.execute(0, task_ctx)?;
-                let spilled_join_result = common::collect(stream).await.unwrap();
-
-                assert!(join.metrics().is_some());
-                assert!(join.metrics().unwrap().spill_count().unwrap() > 0);
-                assert!(join.metrics().unwrap().spilled_bytes().unwrap() > 0);
-                assert!(join.metrics().unwrap().spilled_rows().unwrap() > 0);
-
-                // Run the test with no spill configuration as
-                let task_ctx_no_spill =
-                    TaskContext::default().with_session_config(session_config.clone());
-                let task_ctx_no_spill = Arc::new(task_ctx_no_spill);
-
-                let join = join_with_options(
-                    Arc::clone(&left),
-                    Arc::clone(&right),
-                    on.clone(),
-                    *join_type,
-                    sort_options.clone(),
-                    false,
-                )?;
-                let stream = join.execute(0, task_ctx_no_spill)?;
-                let no_spilled_join_result = common::collect(stream).await.unwrap();
-
-                assert!(join.metrics().is_some());
-                assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
-                assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
-                assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
-                // Compare spilled and non spilled data to check spill logic doesn't corrupt the data
-                assert_eq!(spilled_join_result, no_spilled_join_result);
-            }
-        }
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn overallocation_multi_batch_spill() -> Result<()> {
-        let left_batch_1 = build_table_i32(
-            ("a1", &vec![0, 1]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![4, 5]),
-        );
-        let left_batch_2 = build_table_i32(
-            ("a1", &vec![2, 3]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![6, 7]),
-        );
-        let left_batch_3 = build_table_i32(
-            ("a1", &vec![4, 5]),
-            ("b1", &vec![1, 1]),
-            ("c1", &vec![8, 9]),
-        );
-        let right_batch_1 = build_table_i32(
-            ("a2", &vec![0, 10]),
-            ("b2", &vec![1, 1]),
-            ("c2", &vec![50, 60]),
-        );
-        let right_batch_2 = build_table_i32(
-            ("a2", &vec![20, 30]),
-            ("b2", &vec![1, 1]),
-            ("c2", &vec![70, 80]),
-        );
-        let right_batch_3 =
-            build_table_i32(("a2", &vec![40]), ("b2", &vec![1]), ("c2", &vec![90]));
-        let left =
-            build_table_from_batches(vec![left_batch_1, left_batch_2, left_batch_3]);
-        let right =
-            build_table_from_batches(vec![right_batch_1, right_batch_2, right_batch_3]);
-        let on = vec![(
-            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
-            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
-        )];
-        let sort_options = vec![SortOptions::default(); on.len()];
-
-        let join_types = [
-            Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark,
-        ];
-
-        // Enable DiskManager to allow spilling
-        let runtime = RuntimeEnvBuilder::new()
-            .with_memory_limit(500, 1.0)
-            .with_disk_manager_builder(
-                DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
-            )
-            .build_arc()?;
-
-        for batch_size in [1, 50] {
-            let session_config = SessionConfig::default().with_batch_size(batch_size);
-
-            for join_type in &join_types {
-                let task_ctx = TaskContext::default()
-                    .with_session_config(session_config.clone())
-                    .with_runtime(Arc::clone(&runtime));
-                let task_ctx = Arc::new(task_ctx);
-                let join = join_with_options(
-                    Arc::clone(&left),
-                    Arc::clone(&right),
-                    on.clone(),
-                    *join_type,
-                    sort_options.clone(),
-                    false,
-                )?;
-
-                let stream = join.execute(0, task_ctx)?;
-                let spilled_join_result = common::collect(stream).await.unwrap();
-                assert!(join.metrics().is_some());
-                assert!(join.metrics().unwrap().spill_count().unwrap() > 0);
-                assert!(join.metrics().unwrap().spilled_bytes().unwrap() > 0);
-                assert!(join.metrics().unwrap().spilled_rows().unwrap() > 0);
-
-                // Run the test with no spill configuration as
-                let task_ctx_no_spill =
-                    TaskContext::default().with_session_config(session_config.clone());
-                let task_ctx_no_spill = Arc::new(task_ctx_no_spill);
-
-                let join = join_with_options(
-                    Arc::clone(&left),
-                    Arc::clone(&right),
-                    on.clone(),
-                    *join_type,
-                    sort_options.clone(),
-                    false,
-                )?;
-                let stream = join.execute(0, task_ctx_no_spill)?;
-                let no_spilled_join_result = common::collect(stream).await.unwrap();
-
-                assert!(join.metrics().is_some());
-                assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
-                assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
-                assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
-                // Compare spilled and non spilled data to check spill logic doesn't corrupt the data
-                assert_eq!(spilled_join_result, no_spilled_join_result);
-            }
-        }
-
-        Ok(())
-    }
-
-    fn build_joined_record_batches() -> Result<JoinedRecordBatches> {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Int32, true),
-            Field::new("x", DataType::Int32, true),
-            Field::new("y", DataType::Int32, true),
-        ]));
-
-        let mut batches = JoinedRecordBatches {
-            batches: vec![],
-            filter_mask: BooleanBuilder::new(),
-            row_indices: UInt64Builder::new(),
-            batch_ids: vec![],
-        };
-
-        // Insert already prejoined non-filtered rows
-        batches.batches.push(RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![10, 10])),
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![11, 9])),
-            ],
-        )?);
-
-        batches.batches.push(RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(vec![1])),
-                Arc::new(Int32Array::from(vec![11])),
-                Arc::new(Int32Array::from(vec![1])),
-                Arc::new(Int32Array::from(vec![12])),
-            ],
-        )?);
-
-        batches.batches.push(RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![12, 12])),
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![11, 13])),
-            ],
-        )?);
-
-        batches.batches.push(RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(vec![1])),
-                Arc::new(Int32Array::from(vec![13])),
-                Arc::new(Int32Array::from(vec![1])),
-                Arc::new(Int32Array::from(vec![12])),
-            ],
-        )?);
-
-        batches.batches.push(RecordBatch::try_new(
-            Arc::clone(&schema),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![14, 14])),
-                Arc::new(Int32Array::from(vec![1, 1])),
-                Arc::new(Int32Array::from(vec![12, 11])),
-            ],
-        )?);
-
-        let streamed_indices = vec![0, 0];
-        batches.batch_ids.extend(vec![0; streamed_indices.len()]);
-        batches
-            .row_indices
-            .extend(&UInt64Array::from(streamed_indices));
-
-        let streamed_indices = vec![1];
-        batches.batch_ids.extend(vec![0; streamed_indices.len()]);
-        batches
-            .row_indices
-            .extend(&UInt64Array::from(streamed_indices));
-
-        let streamed_indices = vec![0, 0];
-        batches.batch_ids.extend(vec![1; streamed_indices.len()]);
-        batches
-            .row_indices
-            .extend(&UInt64Array::from(streamed_indices));
-
-        let streamed_indices = vec![0];
-        batches.batch_ids.extend(vec![2; streamed_indices.len()]);
-        batches
-            .row_indices
-            .extend(&UInt64Array::from(streamed_indices));
-
-        let streamed_indices = vec![0, 0];
-        batches.batch_ids.extend(vec![3; streamed_indices.len()]);
-        batches
-            .row_indices
-            .extend(&UInt64Array::from(streamed_indices));
-
-        batches
-            .filter_mask
-            .extend(&BooleanArray::from(vec![true, false]));
-        batches.filter_mask.extend(&BooleanArray::from(vec![true]));
-        batches
-            .filter_mask
-            .extend(&BooleanArray::from(vec![false, true]));
-        batches.filter_mask.extend(&BooleanArray::from(vec![false]));
-        batches
-            .filter_mask
-            .extend(&BooleanArray::from(vec![false, false]));
-
-        Ok(batches)
-    }
-
-    #[tokio::test]
-    async fn test_left_outer_join_filtered_mask() -> Result<()> {
-        let mut joined_batches = build_joined_record_batches()?;
-        let schema = joined_batches.batches.first().unwrap().schema();
-
-        let output = concat_batches(&schema, &joined_batches.batches)?;
-        let out_mask = joined_batches.filter_mask.finish();
-        let out_indices = joined_batches.row_indices.finish();
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0]),
-                &[0usize],
-                &BooleanArray::from(vec![true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                true, false, false, false, false, false, false, false
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0]),
-                &[0usize],
-                &BooleanArray::from(vec![false]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                false, false, false, false, false, false, false, false
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0]),
-                &[0usize; 2],
-                &BooleanArray::from(vec![true, true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                true, true, false, false, false, false, false, false
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0, 0]),
-                &[0usize; 3],
-                &BooleanArray::from(vec![true, true, true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![true, true, true, false, false, false, false, false])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0, 0]),
-                &[0usize; 3],
-                &BooleanArray::from(vec![true, false, true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                Some(true),
-                None,
-                Some(true),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false)
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0, 0]),
-                &[0usize; 3],
-                &BooleanArray::from(vec![false, false, true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                None,
-                None,
-                Some(true),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false)
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0, 0]),
-                &[0usize; 3],
-                &BooleanArray::from(vec![false, true, true]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                None,
-                Some(true),
-                Some(true),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false)
-            ])
-        );
-
-        assert_eq!(
-            get_corrected_filter_mask(
-                Left,
-                &UInt64Array::from(vec![0, 0, 0]),
-                &[0usize; 3],
-                &BooleanArray::from(vec![false, false, false]),
-                output.num_rows()
-            )
-            .unwrap(),
-            BooleanArray::from(vec![
-                None,
-                None,
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false),
-                Some(false)
-            ])
-        );
-
-        let corrected_mask = get_corrected_filter_mask(
-            Left,
-            &out_indices,
-            &joined_batches.batch_ids,
-            &out_mask,
-            output.num_rows(),
-        )
-        .unwrap();
-
-        assert_eq!(
-            corrected_mask,
-            BooleanArray::from(vec![
-                Some(true),
-                None,
-                Some(true),
-                None,
-                Some(true),
-                Some(false),
-                None,
-                Some(false)
-            ])
-        );
-
-        let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
-
-        assert_snapshot!(batches_to_string(&[filtered_rb]), @r#"
-                +---+----+---+----+
-                | a | b  | x | y  |
-                +---+----+---+----+
-                | 1 | 10 | 1 | 11 |
-                | 1 | 11 | 1 | 12 |
-                | 1 | 12 | 1 | 13 |
-                +---+----+---+----+
-            "#);
-
-        // output null rows
-
-        let null_mask = arrow::compute::not(&corrected_mask)?;
-        assert_eq!(
-            null_mask,
-            BooleanArray::from(vec![
-                Some(false),
-                None,
-                Some(false),
-                None,
-                Some(false),
-                Some(true),
-                None,
-                Some(true)
-            ])
-        );
-
-        let null_joined_batch = filter_record_batch(&output, &null_mask)?;
-
-        assert_snapshot!(batches_to_string(&[null_joined_batch]), @r#"
-                +---+----+---+----+
-                | a | b  | x | y  |
-                +---+----+---+----+
-                | 1 | 13 | 1 | 12 |
-                | 1 | 14 | 1 | 11 |
-                +---+----+---+----+
-            "#);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_semi_join_filtered_mask() -> Result<()> {
-        for join_type in [LeftSemi, RightSemi] {
-            let mut joined_batches = build_joined_record_batches()?;
-            let schema = joined_batches.batches.first().unwrap().schema();
-
-            let output = concat_batches(&schema, &joined_batches.batches)?;
-            let out_mask = joined_batches.filter_mask.finish();
-            let out_indices = joined_batches.row_indices.finish();
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0]),
-                    &[0usize],
-                    &BooleanArray::from(vec![true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![true])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0]),
-                    &[0usize],
-                    &BooleanArray::from(vec![false]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0]),
-                    &[0usize; 2],
-                    &BooleanArray::from(vec![true, true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![Some(true), None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![true, true, true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![Some(true), None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![true, false, true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![Some(true), None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, false, true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, Some(true),])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, true, true]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, Some(true), None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, false, false]),
-                    output.num_rows()
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, None])
-            );
-
-            let corrected_mask = get_corrected_filter_mask(
-                join_type,
-                &out_indices,
-                &joined_batches.batch_ids,
-                &out_mask,
-                output.num_rows(),
-            )
-            .unwrap();
-
-            assert_eq!(
-                corrected_mask,
-                BooleanArray::from(vec![
-                    Some(true),
-                    None,
-                    Some(true),
-                    None,
-                    Some(true),
-                    None,
-                    None,
-                    None
-                ])
-            );
-
-            let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
-
-            assert_batches_eq!(
-                &[
-                    "+---+----+---+----+",
-                    "| a | b  | x | y  |",
-                    "+---+----+---+----+",
-                    "| 1 | 10 | 1 | 11 |",
-                    "| 1 | 11 | 1 | 12 |",
-                    "| 1 | 12 | 1 | 13 |",
-                    "+---+----+---+----+",
-                ],
-                &[filtered_rb]
-            );
-
-            // output null rows
-            let null_mask = arrow::compute::not(&corrected_mask)?;
-            assert_eq!(
-                null_mask,
-                BooleanArray::from(vec![
-                    Some(false),
-                    None,
-                    Some(false),
-                    None,
-                    Some(false),
-                    None,
-                    None,
-                    None
-                ])
-            );
-
-            let null_joined_batch = filter_record_batch(&output, &null_mask)?;
-
-            assert_batches_eq!(
-                &[
-                    "+---+---+---+---+",
-                    "| a | b | x | y |",
-                    "+---+---+---+---+",
-                    "+---+---+---+---+",
-                ],
-                &[null_joined_batch]
-            );
-        }
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_anti_join_filtered_mask() -> Result<()> {
-        for join_type in [LeftAnti, RightAnti] {
-            let mut joined_batches = build_joined_record_batches()?;
-            let schema = joined_batches.batches.first().unwrap().schema();
-
-            let output = concat_batches(&schema, &joined_batches.batches)?;
-            let out_mask = joined_batches.filter_mask.finish();
-            let out_indices = joined_batches.row_indices.finish();
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0]),
-                    &[0usize],
-                    &BooleanArray::from(vec![true]),
-                    1
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0]),
-                    &[0usize],
-                    &BooleanArray::from(vec![false]),
-                    1
-                )
-                .unwrap(),
-                BooleanArray::from(vec![Some(true)])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0]),
-                    &[0usize; 2],
-                    &BooleanArray::from(vec![true, true]),
-                    2
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![true, true, true]),
-                    3
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![true, false, true]),
-                    3
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, false, true]),
-                    3
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, true, true]),
-                    3
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, None])
-            );
-
-            assert_eq!(
-                get_corrected_filter_mask(
-                    join_type,
-                    &UInt64Array::from(vec![0, 0, 0]),
-                    &[0usize; 3],
-                    &BooleanArray::from(vec![false, false, false]),
-                    3
-                )
-                .unwrap(),
-                BooleanArray::from(vec![None, None, Some(true)])
-            );
-
-            let corrected_mask = get_corrected_filter_mask(
-                join_type,
-                &out_indices,
-                &joined_batches.batch_ids,
-                &out_mask,
-                output.num_rows(),
-            )
-            .unwrap();
-
-            assert_eq!(
-                corrected_mask,
-                BooleanArray::from(vec![
-                    None,
-                    None,
-                    None,
-                    None,
-                    None,
-                    Some(true),
-                    None,
-                    Some(true)
-                ])
-            );
-
-            let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
-
-            allow_duplicates! {
-                assert_snapshot!(batches_to_string(&[filtered_rb]), @r#"
-                    +---+----+---+----+
-                    | a | b  | x | y  |
-                    +---+----+---+----+
-                    | 1 | 13 | 1 | 12 |
-                    | 1 | 14 | 1 | 11 |
-                    +---+----+---+----+
-            "#);
-            }
-
-            // output null rows
-            let null_mask = arrow::compute::not(&corrected_mask)?;
-            assert_eq!(
-                null_mask,
-                BooleanArray::from(vec![
-                    None,
-                    None,
-                    None,
-                    None,
-                    None,
-                    Some(false),
-                    None,
-                    Some(false),
-                ])
-            );
-
-            let null_joined_batch = filter_record_batch(&output, &null_mask)?;
-
-            allow_duplicates! {
-                assert_snapshot!(batches_to_string(&[null_joined_batch]), @r#"
-                        +---+---+---+---+
-                        | a | b | x | y |
-                        +---+---+---+---+
-                        +---+---+---+---+
-                "#);
-            }
-        }
-        Ok(())
-    }
-
-    /// Returns the column names on the schema
-    fn columns(schema: &Schema) -> Vec<String> {
-        schema.fields().iter().map(|f| f.name().clone()).collect()
-    }
-}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
new file mode 100644
index 0000000000000..ac077792f592c
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
@@ -0,0 +1,633 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines the Sort-Merge join execution plan.
+//! A Sort-Merge join plan consumes two sorted children plans and produces
+//! joined output by given join type and other options.
+
+use std::any::Any;
+use std::fmt::Formatter;
+use std::sync::Arc;
+
+use crate::execution_plan::{EmissionType, boundedness_from_children};
+use crate::expressions::PhysicalSortExpr;
+use crate::joins::sort_merge_join::metrics::SortMergeJoinMetrics;
+use crate::joins::sort_merge_join::stream::SortMergeJoinStream;
+use crate::joins::utils::{
+    JoinFilter, JoinOn, JoinOnRef, build_join_schema, check_join_is_valid,
+    estimate_join_statistics, reorder_output_after_swap,
+    symmetric_join_output_partitioning,
+};
+use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::projection::{
+    ProjectionExec, join_allows_pushdown, join_table_borders, new_join_children,
+    physical_to_column_exprs, update_join_on,
+};
+use crate::{
+    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
+    PlanProperties, SendableRecordBatchStream, Statistics, check_if_same_properties,
+};
+
+use arrow::compute::SortOptions;
+use arrow::datatypes::SchemaRef;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    JoinSide, JoinType, NullEquality, Result, assert_eq_or_internal_err, internal_err,
+    plan_err,
+};
+use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::MemoryConsumer;
+use datafusion_physical_expr::equivalence::join_equivalence_properties;
+use datafusion_physical_expr_common::physical_expr::{PhysicalExprRef, fmt_sql};
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, OrderingRequirements};
+
+/// Join execution plan that executes equi-join predicates on multiple partitions using Sort-Merge
+/// join algorithm and applies an optional filter post join. Can be used to join arbitrarily large
+/// inputs where one or both of the inputs don't fit in the available memory.
+///
+/// # Join Expressions
+///
+/// Equi-join predicate (e.g. `<col1> = <col2>`) expressions are represented by [`Self::on`].
+///
+/// Non-equality predicates, which can not be pushed down to join inputs (e.g.
+/// `<col1> != <col2>`) are known as "filter expressions" and are evaluated
+/// after the equijoin predicates. They are represented by [`Self::filter`]. These are optional
+/// expressions.
+///
+/// # Sorting
+///
+/// Assumes that both the left and right input to the join are pre-sorted. It is not the
+/// responsibility of this execution plan to sort the inputs.
+///
+/// # "Streamed" vs "Buffered"
+///
+/// The number of record batches of streamed input currently present in the memory will depend
+/// on the output batch size of the execution plan. There is no spilling support for streamed input.
+/// The comparisons are performed from values of join keys in streamed input with the values of
+/// join keys in buffered input. One row in streamed record batch could be matched with multiple rows in
+/// buffered input batches. The streamed input is managed through the states in `StreamedState`
+/// and streamed input batches are represented by `StreamedBatch`.
+///
+/// Buffered input is buffered for all record batches having the same value of join key.
+/// If the memory limit increases beyond the specified value and spilling is enabled,
+/// buffered batches could be spilled to disk. If spilling is disabled, the execution
+/// will fail under the same conditions. Multiple record batches of buffered could currently reside
+/// in memory/disk during the execution. The number of buffered batches residing in
+/// memory/disk depends on the number of rows of buffered input having the same value
+/// of join key as that of streamed input rows currently present in memory. Due to pre-sorted inputs,
+/// the algorithm understands when it is not needed anymore, and releases the buffered batches
+/// from memory/disk. The buffered input is managed through the states in `BufferedState`
+/// and buffered input batches are represented by `BufferedBatch`.
+///
+/// Depending on the type of join, left or right input may be selected as streamed or buffered
+/// respectively. For example, in a left-outer join, the left execution plan will be selected as
+/// streamed input while in a right-outer join, the right execution plan will be selected as the
+/// streamed input.
+///
+/// Reference for the algorithm:
+/// <https://en.wikipedia.org/wiki/Sort-merge_join>.
+///
+/// Helpful short video demonstration:
+/// <https://www.youtube.com/watch?v=jiWCPJtDE2c>.
+#[derive(Debug, Clone)]
+pub struct SortMergeJoinExec {
+    /// Left sorted joining execution plan
+    pub left: Arc<dyn ExecutionPlan>,
+    /// Right sorting joining execution plan
+    pub right: Arc<dyn ExecutionPlan>,
+    /// Set of common columns used to join on
+    pub on: JoinOn,
+    /// Filters which are applied while finding matching rows
+    pub filter: Option<JoinFilter>,
+    /// How the join is performed
+    pub join_type: JoinType,
+    /// The schema once the join is applied
+    schema: SchemaRef,
+    /// Execution metrics
+    metrics: ExecutionPlanMetricsSet,
+    /// The left SortExpr
+    left_sort_exprs: LexOrdering,
+    /// The right SortExpr
+    right_sort_exprs: LexOrdering,
+    /// Sort options of join columns used in sorting left and right execution plans
+    pub sort_options: Vec<SortOptions>,
+    /// Defines the null equality for the join.
+    pub null_equality: NullEquality,
+    /// Cache holding plan properties like equivalences, output partitioning etc.
+    cache: Arc<PlanProperties>,
+}
+
+impl SortMergeJoinExec {
+    /// Tries to create a new [SortMergeJoinExec].
+    /// The inputs are sorted using `sort_options` are applied to the columns in the `on`
+    /// # Error
+    /// This function errors when it is not possible to join the left and right sides on keys `on`.
+    pub fn try_new(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        on: JoinOn,
+        filter: Option<JoinFilter>,
+        join_type: JoinType,
+        sort_options: Vec<SortOptions>,
+        null_equality: NullEquality,
+    ) -> Result<Self> {
+        let left_schema = left.schema();
+        let right_schema = right.schema();
+
+        check_join_is_valid(&left_schema, &right_schema, &on)?;
+        if sort_options.len() != on.len() {
+            return plan_err!(
+                "Expected number of sort options: {}, actual: {}",
+                on.len(),
+                sort_options.len()
+            );
+        }
+
+        let (left_sort_exprs, right_sort_exprs): (Vec<_>, Vec<_>) = on
+            .iter()
+            .zip(sort_options.iter())
+            .map(|((l, r), sort_op)| {
+                let left = PhysicalSortExpr {
+                    expr: Arc::clone(l),
+                    options: *sort_op,
+                };
+                let right = PhysicalSortExpr {
+                    expr: Arc::clone(r),
+                    options: *sort_op,
+                };
+                (left, right)
+            })
+            .unzip();
+        let Some(left_sort_exprs) = LexOrdering::new(left_sort_exprs) else {
+            return plan_err!(
+                "SortMergeJoinExec requires valid sort expressions for its left side"
+            );
+        };
+        let Some(right_sort_exprs) = LexOrdering::new(right_sort_exprs) else {
+            return plan_err!(
+                "SortMergeJoinExec requires valid sort expressions for its right side"
+            );
+        };
+
+        let schema =
+            Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0);
+        let cache =
+            Self::compute_properties(&left, &right, Arc::clone(&schema), join_type, &on)?;
+        Ok(Self {
+            left,
+            right,
+            on,
+            filter,
+            join_type,
+            schema,
+            metrics: ExecutionPlanMetricsSet::new(),
+            left_sort_exprs,
+            right_sort_exprs,
+            sort_options,
+            null_equality,
+            cache: Arc::new(cache),
+        })
+    }
+
+    /// Get probe side (e.g streaming side) information for this sort merge join.
+    /// In current implementation, probe side is determined according to join type.
+    pub fn probe_side(join_type: &JoinType) -> JoinSide {
+        // When output schema contains only the right side, probe side is right.
+        // Otherwise probe side is the left side.
+        match join_type {
+            // TODO: sort merge support for right mark (tracked here: https://github.com/apache/datafusion/issues/16226)
+            JoinType::Right
+            | JoinType::RightSemi
+            | JoinType::RightAnti
+            | JoinType::RightMark => JoinSide::Right,
+            JoinType::Inner
+            | JoinType::Left
+            | JoinType::Full
+            | JoinType::LeftAnti
+            | JoinType::LeftSemi
+            | JoinType::LeftMark => JoinSide::Left,
+        }
+    }
+
+    /// Calculate order preservation flags for this sort merge join.
+    fn maintains_input_order(join_type: JoinType) -> Vec<bool> {
+        match join_type {
+            JoinType::Inner => vec![true, false],
+            JoinType::Left
+            | JoinType::LeftSemi
+            | JoinType::LeftAnti
+            | JoinType::LeftMark => vec![true, false],
+            JoinType::Right
+            | JoinType::RightSemi
+            | JoinType::RightAnti
+            | JoinType::RightMark => {
+                vec![false, true]
+            }
+            _ => vec![false, false],
+        }
+    }
+
+    /// Set of common columns used to join on
+    pub fn on(&self) -> &[(PhysicalExprRef, PhysicalExprRef)] {
+        &self.on
+    }
+
+    /// Ref to right execution plan
+    pub fn right(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.right
+    }
+
+    /// Join type
+    pub fn join_type(&self) -> JoinType {
+        self.join_type
+    }
+
+    /// Ref to left execution plan
+    pub fn left(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.left
+    }
+
+    /// Ref to join filter
+    pub fn filter(&self) -> &Option<JoinFilter> {
+        &self.filter
+    }
+
+    /// Ref to sort options
+    pub fn sort_options(&self) -> &[SortOptions] {
+        &self.sort_options
+    }
+
+    /// Null equality
+    pub fn null_equality(&self) -> NullEquality {
+        self.null_equality
+    }
+
+    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
+    fn compute_properties(
+        left: &Arc<dyn ExecutionPlan>,
+        right: &Arc<dyn ExecutionPlan>,
+        schema: SchemaRef,
+        join_type: JoinType,
+        join_on: JoinOnRef,
+    ) -> Result<PlanProperties> {
+        // Calculate equivalence properties:
+        let eq_properties = join_equivalence_properties(
+            left.equivalence_properties().clone(),
+            right.equivalence_properties().clone(),
+            &join_type,
+            schema,
+            &Self::maintains_input_order(join_type),
+            Some(Self::probe_side(&join_type)),
+            join_on,
+        )?;
+
+        let output_partitioning =
+            symmetric_join_output_partitioning(left, right, &join_type)?;
+
+        Ok(PlanProperties::new(
+            eq_properties,
+            output_partitioning,
+            EmissionType::Incremental,
+            boundedness_from_children([left, right]),
+        ))
+    }
+
+    /// # Notes:
+    ///
+    /// This function should be called BEFORE inserting any repartitioning
+    /// operators on the join's children. Check [`super::super::HashJoinExec::swap_inputs`]
+    /// for more details.
+    pub fn swap_inputs(&self) -> Result<Arc<dyn ExecutionPlan>> {
+        let left = self.left();
+        let right = self.right();
+        let new_join = SortMergeJoinExec::try_new(
+            Arc::clone(right),
+            Arc::clone(left),
+            self.on()
+                .iter()
+                .map(|(l, r)| (Arc::clone(r), Arc::clone(l)))
+                .collect::<Vec<_>>(),
+            self.filter().as_ref().map(JoinFilter::swap),
+            self.join_type().swap(),
+            self.sort_options.clone(),
+            self.null_equality,
+        )?;
+
+        // TODO: OR this condition with having a built-in projection (like
+        //       ordinary hash join) when we support it.
+        if matches!(
+            self.join_type(),
+            JoinType::LeftSemi
+                | JoinType::RightSemi
+                | JoinType::LeftAnti
+                | JoinType::RightAnti
+        ) {
+            Ok(Arc::new(new_join))
+        } else {
+            reorder_output_after_swap(Arc::new(new_join), &left.schema(), &right.schema())
+        }
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        let left = children.swap_remove(0);
+        let right = children.swap_remove(0);
+        Self {
+            left,
+            right,
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
+}
+
+impl DisplayAs for SortMergeJoinExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                let on = self
+                    .on
+                    .iter()
+                    .map(|(c1, c2)| format!("({c1}, {c2})"))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                let display_null_equality =
+                    if self.null_equality() == NullEquality::NullEqualsNull {
+                        ", NullsEqual: true"
+                    } else {
+                        ""
+                    };
+                write!(
+                    f,
+                    "{}: join_type={:?}, on=[{}]{}{}",
+                    Self::static_name(),
+                    self.join_type,
+                    on,
+                    self.filter.as_ref().map_or_else(
+                        || "".to_string(),
+                        |f| format!(", filter={}", f.expression())
+                    ),
+                    display_null_equality,
+                )
+            }
+            DisplayFormatType::TreeRender => {
+                let on = self
+                    .on
+                    .iter()
+                    .map(|(c1, c2)| {
+                        format!("({} = {})", fmt_sql(c1.as_ref()), fmt_sql(c2.as_ref()))
+                    })
+                    .collect::<Vec<String>>()
+                    .join(", ");
+
+                if self.join_type() != JoinType::Inner {
+                    writeln!(f, "join_type={:?}", self.join_type)?;
+                }
+                writeln!(f, "on={on}")?;
+
+                if self.null_equality() == NullEquality::NullEqualsNull {
+                    writeln!(f, "NullsEqual: true")?;
+                }
+
+                Ok(())
+            }
+        }
+    }
+}
+
+impl ExecutionPlan for SortMergeJoinExec {
+    fn name(&self) -> &'static str {
+        "SortMergeJoinExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn required_input_distribution(&self) -> Vec<Distribution> {
+        let (left_expr, right_expr) = self
+            .on
+            .iter()
+            .map(|(l, r)| (Arc::clone(l), Arc::clone(r)))
+            .unzip();
+        vec![
+            Distribution::HashPartitioned(left_expr),
+            Distribution::HashPartitioned(right_expr),
+        ]
+    }
+
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
+        vec![
+            Some(OrderingRequirements::from(self.left_sort_exprs.clone())),
+            Some(OrderingRequirements::from(self.right_sort_exprs.clone())),
+        ]
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        Self::maintains_input_order(self.join_type)
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.left, &self.right]
+    }
+
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn crate::PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to join keys from both sides
+        let mut tnr = TreeNodeRecursion::Continue;
+        for (left, right) in &self.on {
+            tnr = tnr.visit_sibling(|| f(left.as_ref()))?;
+            tnr = tnr.visit_sibling(|| f(right.as_ref()))?;
+        }
+        // Apply to join filter expressions if present
+        if let Some(filter) = &self.filter {
+            tnr = tnr.visit_sibling(|| f(filter.expression().as_ref()))?;
+        }
+        Ok(tnr)
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
+        match &children[..] {
+            [left, right] => Ok(Arc::new(SortMergeJoinExec::try_new(
+                Arc::clone(left),
+                Arc::clone(right),
+                self.on.clone(),
+                self.filter.clone(),
+                self.join_type,
+                self.sort_options.clone(),
+                self.null_equality,
+            )?)),
+            _ => internal_err!("SortMergeJoin wrong number of children"),
+        }
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        let left_partitions = self.left.output_partitioning().partition_count();
+        let right_partitions = self.right.output_partitioning().partition_count();
+        assert_eq_or_internal_err!(
+            left_partitions,
+            right_partitions,
+            "Invalid SortMergeJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
+                 consider using RepartitionExec"
+        );
+        let (on_left, on_right) = self.on.iter().cloned().unzip();
+        let (streamed, buffered, on_streamed, on_buffered) =
+            if SortMergeJoinExec::probe_side(&self.join_type) == JoinSide::Left {
+                (
+                    Arc::clone(&self.left),
+                    Arc::clone(&self.right),
+                    on_left,
+                    on_right,
+                )
+            } else {
+                (
+                    Arc::clone(&self.right),
+                    Arc::clone(&self.left),
+                    on_right,
+                    on_left,
+                )
+            };
+
+        // execute children plans
+        let streamed = streamed.execute(partition, Arc::clone(&context))?;
+        let buffered = buffered.execute(partition, Arc::clone(&context))?;
+
+        // create output buffer
+        let batch_size = context.session_config().batch_size();
+
+        // create memory reservation
+        let reservation = MemoryConsumer::new(format!("SMJStream[{partition}]"))
+            .register(context.memory_pool());
+
+        // create join stream
+        Ok(Box::pin(SortMergeJoinStream::try_new(
+            context.session_config().spill_compression(),
+            Arc::clone(&self.schema),
+            self.sort_options.clone(),
+            self.null_equality,
+            streamed,
+            buffered,
+            on_streamed,
+            on_buffered,
+            self.filter.clone(),
+            self.join_type,
+            batch_size,
+            SortMergeJoinMetrics::new(partition, &self.metrics),
+            reservation,
+            context.runtime_env(),
+        )?))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        // SortMergeJoinExec uses symmetric hash partitioning where both left and right
+        // inputs are hash-partitioned on the join keys. This means partition `i` of the
+        // left input is joined with partition `i` of the right input.
+        //
+        // Therefore, partition-specific statistics can be computed by getting the
+        // partition-specific statistics from both children and combining them via
+        // `estimate_join_statistics`.
+        //
+        // TODO stats: it is not possible in general to know the output size of joins
+        // There are some special cases though, for example:
+        // - `A LEFT JOIN B ON A.col=B.col` with `COUNT_DISTINCT(B.col)=COUNT(B.col)`
+        let left_stats = Arc::unwrap_or_clone(self.left.partition_statistics(partition)?);
+        let right_stats =
+            Arc::unwrap_or_clone(self.right.partition_statistics(partition)?);
+        Ok(Arc::new(estimate_join_statistics(
+            left_stats,
+            right_stats,
+            &self.on,
+            &self.join_type,
+            &self.schema,
+        )?))
+    }
+
+    /// Tries to swap the projection with its input [`SortMergeJoinExec`]. If it can be done,
+    /// it returns the new swapped version having the [`SortMergeJoinExec`] as the top plan.
+    /// Otherwise, it returns None.
+    fn try_swapping_with_projection(
+        &self,
+        projection: &ProjectionExec,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        // Convert projected PhysicalExpr's to columns. If not possible, we cannot proceed.
+        let Some(projection_as_columns) = physical_to_column_exprs(projection.expr())
+        else {
+            return Ok(None);
+        };
+
+        let (far_right_left_col_ind, far_left_right_col_ind) = join_table_borders(
+            self.left().schema().fields().len(),
+            &projection_as_columns,
+        );
+
+        if !join_allows_pushdown(
+            &projection_as_columns,
+            &self.schema(),
+            far_right_left_col_ind,
+            far_left_right_col_ind,
+        ) {
+            return Ok(None);
+        }
+
+        let Some(new_on) = update_join_on(
+            &projection_as_columns[0..=far_right_left_col_ind as _],
+            &projection_as_columns[far_left_right_col_ind as _..],
+            self.on(),
+            self.left().schema().fields().len(),
+        ) else {
+            return Ok(None);
+        };
+
+        let (new_left, new_right) = new_join_children(
+            &projection_as_columns,
+            far_right_left_col_ind,
+            far_left_right_col_ind,
+            self.children()[0],
+            self.children()[1],
+        )?;
+
+        Ok(Some(Arc::new(SortMergeJoinExec::try_new(
+            Arc::new(new_left),
+            Arc::new(new_right),
+            new_on,
+            self.filter.clone(),
+            self.join_type,
+            self.sort_options.clone(),
+            self.null_equality,
+        )?)))
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/filter.rs b/datafusion/physical-plan/src/joins/sort_merge_join/filter.rs
new file mode 100644
index 0000000000000..d598442b653eb
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/filter.rs
@@ -0,0 +1,595 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Filter handling for Sort-Merge Join
+//!
+//! This module encapsulates the complexity of join filter evaluation, including:
+//! - Immediate filtering for INNER joins
+//! - Deferred filtering for outer/semi/anti/mark joins
+//! - Metadata tracking for grouping output rows by input row
+//! - Correcting filter masks to handle multiple matches per input row
+
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayBuilder, ArrayRef, BooleanArray, BooleanBuilder, RecordBatch,
+    UInt64Array, UInt64Builder,
+};
+use arrow::compute::{self, concat_batches, filter_record_batch};
+use arrow::datatypes::SchemaRef;
+use datafusion_common::{JoinSide, JoinType, Result};
+
+use crate::joins::utils::JoinFilter;
+
+/// Metadata for tracking filter results during deferred filtering
+///
+/// When a join filter is present and we need to ensure each input row produces
+/// at least one output (outer joins) or exactly one output (semi joins), we can't
+/// filter immediately. Instead, we accumulate all joined rows with metadata,
+/// then post-process to determine which rows to output.
+#[derive(Debug)]
+pub struct FilterMetadata {
+    /// Did each output row pass the join filter?
+    /// Used to detect if an input row found ANY match
+    pub filter_mask: BooleanBuilder,
+
+    /// Which input row (within batch) produced each output row?
+    /// Used for grouping output rows by input row
+    pub row_indices: UInt64Builder,
+
+    /// Which input batch did each output row come from?
+    /// Used to disambiguate row_indices across multiple batches
+    pub batch_ids: Vec<usize>,
+}
+
+impl FilterMetadata {
+    /// Create new empty filter metadata
+    pub fn new() -> Self {
+        Self {
+            filter_mask: BooleanBuilder::new(),
+            row_indices: UInt64Builder::new(),
+            batch_ids: vec![],
+        }
+    }
+
+    /// Returns (row_indices, filter_mask, batch_ids_ref) and clears builders
+    pub fn finish_metadata(&mut self) -> (UInt64Array, BooleanArray, &[usize]) {
+        let row_indices = self.row_indices.finish();
+        let filter_mask = self.filter_mask.finish();
+        (row_indices, filter_mask, &self.batch_ids)
+    }
+
+    /// Add metadata for null-joined rows (no filter applied)
+    pub fn append_nulls(&mut self, num_rows: usize) {
+        self.filter_mask.append_nulls(num_rows);
+        self.row_indices.append_nulls(num_rows);
+        self.batch_ids.resize(
+            self.batch_ids.len() + num_rows,
+            0, // batch_id = 0 for null-joined rows
+        );
+    }
+
+    /// Add metadata for filtered rows
+    pub fn append_filter_metadata(
+        &mut self,
+        row_indices: &UInt64Array,
+        filter_mask: &BooleanArray,
+        batch_id: usize,
+    ) {
+        debug_assert_eq!(
+            row_indices.len(),
+            filter_mask.len(),
+            "row_indices and filter_mask must have same length"
+        );
+
+        for i in 0..row_indices.len() {
+            if filter_mask.is_null(i) {
+                self.filter_mask.append_null();
+            } else if filter_mask.value(i) {
+                self.filter_mask.append_value(true);
+            } else {
+                self.filter_mask.append_value(false);
+            }
+
+            if row_indices.is_null(i) {
+                self.row_indices.append_null();
+            } else {
+                self.row_indices.append_value(row_indices.value(i));
+            }
+
+            self.batch_ids.push(batch_id);
+        }
+    }
+
+    /// Verify that metadata arrays are aligned (same length)
+    pub fn debug_assert_metadata_aligned(&self) {
+        if self.filter_mask.len() > 0 {
+            debug_assert_eq!(
+                self.filter_mask.len(),
+                self.row_indices.len(),
+                "filter_mask and row_indices must have same length when metadata is used"
+            );
+            debug_assert_eq!(
+                self.filter_mask.len(),
+                self.batch_ids.len(),
+                "filter_mask and batch_ids must have same length when metadata is used"
+            );
+        } else {
+            debug_assert_eq!(
+                self.filter_mask.len(),
+                0,
+                "filter_mask should be empty when batches is empty"
+            );
+        }
+    }
+}
+
+impl Default for FilterMetadata {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Determines if a join type needs deferred filtering
+///
+/// Deferred filtering is required when:
+/// - A filter exists AND
+/// - The join type requires ensuring each input row produces at least one output
+///   (or exactly one for semi joins)
+pub fn needs_deferred_filtering(
+    filter: &Option<JoinFilter>,
+    join_type: JoinType,
+) -> bool {
+    filter.is_some()
+        && matches!(
+            join_type,
+            JoinType::Left
+                | JoinType::LeftSemi
+                | JoinType::LeftMark
+                | JoinType::Right
+                | JoinType::RightSemi
+                | JoinType::RightMark
+                | JoinType::LeftAnti
+                | JoinType::RightAnti
+                | JoinType::Full
+        )
+}
+
+/// Gets the arrays which join filters are applied on
+///
+/// Extracts the columns needed for filter evaluation from left and right batch columns
+pub fn get_filter_columns(
+    join_filter: &Option<JoinFilter>,
+    left_columns: &[ArrayRef],
+    right_columns: &[ArrayRef],
+) -> Vec<ArrayRef> {
+    let mut filter_columns = vec![];
+
+    if let Some(f) = join_filter {
+        let left_columns: Vec<ArrayRef> = f
+            .column_indices()
+            .iter()
+            .filter(|col_index| col_index.side == JoinSide::Left)
+            .map(|i| Arc::clone(&left_columns[i.index]))
+            .collect();
+        let right_columns: Vec<ArrayRef> = f
+            .column_indices()
+            .iter()
+            .filter(|col_index| col_index.side == JoinSide::Right)
+            .map(|i| Arc::clone(&right_columns[i.index]))
+            .collect();
+
+        filter_columns.extend(left_columns);
+        filter_columns.extend(right_columns);
+    }
+
+    filter_columns
+}
+
+/// Determines if current index is the last occurrence of a row
+///
+/// Used during filter mask correction to detect row boundaries when grouping
+/// output rows by input row.
+fn last_index_for_row(
+    row_index: usize,
+    indices: &UInt64Array,
+    batch_ids: &[usize],
+    indices_len: usize,
+) -> bool {
+    debug_assert_eq!(
+        indices.len(),
+        indices_len,
+        "indices.len() should match indices_len parameter"
+    );
+    debug_assert_eq!(
+        batch_ids.len(),
+        indices_len,
+        "batch_ids.len() should match indices_len"
+    );
+    debug_assert!(
+        row_index < indices_len,
+        "row_index {row_index} should be < indices_len {indices_len}",
+    );
+
+    // If this is the last index overall, it's definitely the last for this row
+    if row_index == indices_len - 1 {
+        return true;
+    }
+
+    // Check if next row has different (batch_id, index) pair
+    let current_batch_id = batch_ids[row_index];
+    let next_batch_id = batch_ids[row_index + 1];
+
+    if current_batch_id != next_batch_id {
+        return true;
+    }
+
+    // Same batch_id, check if row index is different
+    // Both current and next should be non-null (already joined rows)
+    if indices.is_null(row_index) || indices.is_null(row_index + 1) {
+        return true;
+    }
+
+    indices.value(row_index) != indices.value(row_index + 1)
+}
+
+/// Corrects the filter mask for joins with deferred filtering
+///
+/// When an input row joins with multiple buffered rows, we get multiple output rows.
+/// This function groups them by input row and applies join-type-specific logic:
+///
+/// - **Outer joins**: Keep first matching row, convert rest to nulls, add null-joined for unmatched
+/// - **Semi joins**: Keep first matching row, discard rest
+/// - **Anti joins**: Keep row only if NO matches passed filter
+/// - **Mark joins**: Like semi but first match only
+///
+/// # Arguments
+/// * `join_type` - The type of join being performed
+/// * `row_indices` - Which input row produced each output row
+/// * `batch_ids` - Which batch each output row came from
+/// * `filter_mask` - Whether each output row passed the filter
+/// * `expected_size` - Total number of input rows (for adding unmatched)
+///
+/// # Returns
+/// Corrected mask indicating which rows to include in final output:
+/// - `true`: Include this row
+/// - `false`: Convert to null-joined row (outer joins) or include as unmatched (anti joins)
+/// - `null`: Discard this row
+pub fn get_corrected_filter_mask(
+    join_type: JoinType,
+    row_indices: &UInt64Array,
+    batch_ids: &[usize],
+    filter_mask: &BooleanArray,
+    expected_size: usize,
+) -> Option<BooleanArray> {
+    let row_indices_length = row_indices.len();
+    let mut corrected_mask: BooleanBuilder =
+        BooleanBuilder::with_capacity(row_indices_length);
+    let mut seen_true = false;
+
+    match join_type {
+        JoinType::Left | JoinType::Right => {
+            // For outer joins: Keep first matching row per input row,
+            // convert rest to nulls, add null-joined rows for unmatched
+            for i in 0..row_indices_length {
+                let last_index =
+                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
+                if filter_mask.value(i) {
+                    seen_true = true;
+                    corrected_mask.append_value(true);
+                } else if seen_true || !filter_mask.value(i) && !last_index {
+                    corrected_mask.append_null(); // to be ignored and not set to output
+                } else {
+                    corrected_mask.append_value(false); // to be converted to null joined row
+                }
+
+                if last_index {
+                    seen_true = false;
+                }
+            }
+
+            // Generate null joined rows for records which have no matching join key
+            corrected_mask.append_n(expected_size - corrected_mask.len(), false);
+            Some(corrected_mask.finish())
+        }
+        JoinType::LeftMark | JoinType::RightMark => {
+            // For mark joins: Like outer but only keep first match, mark with boolean
+            for i in 0..row_indices_length {
+                let last_index =
+                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
+                if filter_mask.value(i) && !seen_true {
+                    seen_true = true;
+                    corrected_mask.append_value(true);
+                } else if seen_true || !filter_mask.value(i) && !last_index {
+                    corrected_mask.append_null(); // to be ignored and not set to output
+                } else {
+                    corrected_mask.append_value(false); // to be converted to null joined row
+                }
+
+                if last_index {
+                    seen_true = false;
+                }
+            }
+
+            // Generate null joined rows for records which have no matching join key
+            corrected_mask.append_n(expected_size - corrected_mask.len(), false);
+            Some(corrected_mask.finish())
+        }
+        JoinType::LeftSemi | JoinType::RightSemi => {
+            // For semi joins: Keep only first matching row per input row, discard rest
+            for i in 0..row_indices_length {
+                let last_index =
+                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
+                if filter_mask.value(i) && !seen_true {
+                    seen_true = true;
+                    corrected_mask.append_value(true);
+                } else {
+                    corrected_mask.append_null(); // to be ignored and not set to output
+                }
+
+                if last_index {
+                    seen_true = false;
+                }
+            }
+
+            Some(corrected_mask.finish())
+        }
+        JoinType::LeftAnti | JoinType::RightAnti => {
+            // For anti joins: Keep row only if NO matches passed the filter
+            for i in 0..row_indices_length {
+                let last_index =
+                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
+
+                if filter_mask.value(i) {
+                    seen_true = true;
+                }
+
+                if last_index {
+                    if !seen_true {
+                        corrected_mask.append_value(true);
+                    } else {
+                        corrected_mask.append_null();
+                    }
+
+                    seen_true = false;
+                } else {
+                    corrected_mask.append_null();
+                }
+            }
+            // Generate null joined rows for records which have no matching join key,
+            // for LeftAnti non-matched considered as true
+            corrected_mask.append_n(expected_size - corrected_mask.len(), true);
+            Some(corrected_mask.finish())
+        }
+        JoinType::Full => {
+            // For full joins: Similar to outer but handle both sides
+            for i in 0..row_indices_length {
+                let last_index =
+                    last_index_for_row(i, row_indices, batch_ids, row_indices_length);
+
+                if filter_mask.is_null(i) {
+                    // null joined
+                    corrected_mask.append_value(true);
+                } else if filter_mask.value(i) {
+                    seen_true = true;
+                    corrected_mask.append_value(true);
+                } else if seen_true || !filter_mask.value(i) && !last_index {
+                    corrected_mask.append_null(); // to be ignored and not set to output
+                } else {
+                    corrected_mask.append_value(false); // to be converted to null joined row
+                }
+
+                if last_index {
+                    seen_true = false;
+                }
+            }
+            // Generate null joined rows for records which have no matching join key
+            corrected_mask.append_n(expected_size - corrected_mask.len(), false);
+            Some(corrected_mask.finish())
+        }
+        JoinType::Inner => {
+            // Inner joins don't need deferred filtering
+            None
+        }
+    }
+}
+
+/// Applies corrected filter mask to record batch based on join type
+///
+/// Different join types require different handling of filtered results:
+/// - Outer joins: Add null-joined rows for false mask values
+/// - Semi/Anti joins: May need projection to remove right columns
+/// - Full joins: Add null-joined rows for both sides
+pub fn filter_record_batch_by_join_type(
+    record_batch: &RecordBatch,
+    corrected_mask: &BooleanArray,
+    join_type: JoinType,
+    schema: &SchemaRef,
+    streamed_schema: &SchemaRef,
+    buffered_schema: &SchemaRef,
+) -> Result<RecordBatch> {
+    let filtered_record_batch = filter_record_batch(record_batch, corrected_mask)?;
+
+    match join_type {
+        JoinType::Left | JoinType::LeftMark => {
+            // For left joins, add null-joined rows where mask is false
+            let null_mask = compute::not(corrected_mask)?;
+            let null_joined_batch = filter_record_batch(record_batch, &null_mask)?;
+
+            if null_joined_batch.num_rows() == 0 {
+                return Ok(filtered_record_batch);
+            }
+
+            // Create null columns for right side
+            let null_joined_streamed_batch = create_null_joined_batch(
+                &null_joined_batch,
+                buffered_schema,
+                JoinSide::Left,
+                join_type,
+                schema,
+            )?;
+
+            Ok(concat_batches(
+                schema,
+                &[filtered_record_batch, null_joined_streamed_batch],
+            )?)
+        }
+        JoinType::LeftSemi
+        | JoinType::LeftAnti
+        | JoinType::RightSemi
+        | JoinType::RightAnti => {
+            // For semi/anti joins, project to only include the outer side columns
+            // Both Left and Right semi/anti use streamed_schema.len() because:
+            // - For Left: columns are [left, right], so we take first streamed_schema.len()
+            // - For Right: columns are [right, left], and streamed side is right, so we take first streamed_schema.len()
+            let output_column_indices: Vec<usize> =
+                (0..streamed_schema.fields().len()).collect();
+            Ok(filtered_record_batch.project(&output_column_indices)?)
+        }
+        JoinType::Right | JoinType::RightMark => {
+            // For right joins, add null-joined rows where mask is false
+            let null_mask = compute::not(corrected_mask)?;
+            let null_joined_batch = filter_record_batch(record_batch, &null_mask)?;
+
+            if null_joined_batch.num_rows() == 0 {
+                return Ok(filtered_record_batch);
+            }
+
+            // Create null columns for left side (buffered side for RIGHT join)
+            let null_joined_buffered_batch = create_null_joined_batch(
+                &null_joined_batch,
+                buffered_schema, // Pass buffered (left) schema to create nulls for it
+                JoinSide::Right,
+                join_type,
+                schema,
+            )?;
+
+            Ok(concat_batches(
+                schema,
+                &[filtered_record_batch, null_joined_buffered_batch],
+            )?)
+        }
+        JoinType::Full => {
+            // For full joins, add null-joined rows for both sides
+            let joined_filter_not_matched_mask = compute::not(corrected_mask)?;
+            let joined_filter_not_matched_batch =
+                filter_record_batch(record_batch, &joined_filter_not_matched_mask)?;
+
+            if joined_filter_not_matched_batch.num_rows() == 0 {
+                return Ok(filtered_record_batch);
+            }
+
+            // Create null-joined batches for both sides
+            let left_null_joined_batch = create_null_joined_batch(
+                &joined_filter_not_matched_batch,
+                buffered_schema,
+                JoinSide::Left,
+                join_type,
+                schema,
+            )?;
+
+            Ok(concat_batches(
+                schema,
+                &[filtered_record_batch, left_null_joined_batch],
+            )?)
+        }
+        JoinType::Inner => Ok(filtered_record_batch),
+    }
+}
+
+/// Creates a batch with null columns for the non-joined side
+///
+/// Note: The input `batch` is assumed to be a fully-joined batch that already contains
+/// columns from both sides. We need to extract the data side columns and replace the
+/// null side columns with actual nulls.
+fn create_null_joined_batch(
+    batch: &RecordBatch,
+    null_schema: &SchemaRef,
+    join_side: JoinSide,
+    join_type: JoinType,
+    output_schema: &SchemaRef,
+) -> Result<RecordBatch> {
+    let num_rows = batch.num_rows();
+
+    // The input batch is a fully-joined batch [left_cols..., right_cols...]
+    // We need to extract the appropriate side and replace the other with nulls (or mark column)
+    let columns = match (join_side, join_type) {
+        (JoinSide::Left, JoinType::LeftMark) => {
+            // For LEFT mark: output is [left_cols..., mark_col]
+            // Batch is [left_cols..., right_cols...], extract left from beginning
+            // Number of left columns = output columns - 1 (mark column)
+            let left_col_count = output_schema.fields().len() - 1;
+            let mut result: Vec<ArrayRef> = batch.columns()[..left_col_count].to_vec();
+            result.push(Arc::new(BooleanArray::from(vec![false; num_rows])) as ArrayRef);
+            result
+        }
+        (JoinSide::Right, JoinType::RightMark) => {
+            // For RIGHT mark: output is [right_cols..., mark_col]
+            // For RIGHT joins, batch is [right_cols..., left_cols...] (right comes first!)
+            // Extract right columns from the beginning
+            let right_col_count = output_schema.fields().len() - 1; // -1 for mark column
+            let mut result: Vec<ArrayRef> = batch.columns()[..right_col_count].to_vec();
+            result.push(Arc::new(BooleanArray::from(vec![false; num_rows])) as ArrayRef);
+            result
+        }
+        (JoinSide::Left, _) => {
+            // For LEFT join: output is [left_cols..., right_cols...]
+            // Extract left columns, then add null right columns
+            let null_columns: Vec<ArrayRef> = null_schema
+                .fields()
+                .iter()
+                .map(|field| arrow::array::new_null_array(field.data_type(), num_rows))
+                .collect();
+            let left_col_count = output_schema.fields().len() - null_columns.len();
+            let mut result: Vec<ArrayRef> = batch.columns()[..left_col_count].to_vec();
+            result.extend(null_columns);
+            result
+        }
+        (JoinSide::Right, _) => {
+            // For RIGHT join: batch is [left_cols..., right_cols...] (same as schema)
+            // We want: [null_left..., actual_right...]
+            // Extract left columns from beginning, replace with nulls, keep right columns
+            let null_columns: Vec<ArrayRef> = null_schema
+                .fields()
+                .iter()
+                .map(|field| arrow::array::new_null_array(field.data_type(), num_rows))
+                .collect();
+            let left_col_count = null_columns.len();
+            let mut result = null_columns;
+            // Extract right columns starting after left columns
+            result.extend_from_slice(&batch.columns()[left_col_count..]);
+            result
+        }
+        (JoinSide::None, _) => {
+            // This should not happen in normal join operations
+            unreachable!(
+                "JoinSide::None should not be used in null-joined batch creation"
+            )
+        }
+    };
+
+    // Create the batch - don't validate nullability since outer joins can have
+    // null values in columns that were originally non-nullable
+    use arrow::array::RecordBatchOptions;
+    let mut options = RecordBatchOptions::new();
+    options = options.with_row_count(Some(num_rows));
+    Ok(RecordBatch::try_new_with_options(
+        Arc::clone(output_schema),
+        columns,
+        &options,
+    )?)
+}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs b/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs
new file mode 100644
index 0000000000000..8457408919e63
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/metrics.rs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Module for tracking Sort Merge Join metrics
+
+use crate::metrics::{
+    BaselineMetrics, Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, SpillMetrics,
+    Time,
+};
+
+/// Metrics for SortMergeJoinExec
+pub(super) struct SortMergeJoinMetrics {
+    /// Total time for joining probe-side batches to the build-side batches
+    join_time: Time,
+    /// Number of batches consumed by this operator
+    input_batches: Count,
+    /// Number of rows consumed by this operator
+    input_rows: Count,
+    /// Execution metrics
+    baseline_metrics: BaselineMetrics,
+    /// Peak memory used for buffered data.
+    /// Calculated as sum of peak memory values across partitions
+    peak_mem_used: Gauge,
+    /// Metrics related to spilling
+    spill_metrics: SpillMetrics,
+}
+
+impl SortMergeJoinMetrics {
+    pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
+        let join_time = MetricBuilder::new(metrics).subset_time("join_time", partition);
+        let input_batches =
+            MetricBuilder::new(metrics).counter("input_batches", partition);
+        let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
+        let peak_mem_used = MetricBuilder::new(metrics).gauge("peak_mem_used", partition);
+        let spill_metrics = SpillMetrics::new(metrics, partition);
+
+        let baseline_metrics = BaselineMetrics::new(metrics, partition);
+
+        Self {
+            join_time,
+            input_batches,
+            input_rows,
+            baseline_metrics,
+            peak_mem_used,
+            spill_metrics,
+        }
+    }
+
+    pub fn join_time(&self) -> Time {
+        self.join_time.clone()
+    }
+
+    pub fn baseline_metrics(&self) -> BaselineMetrics {
+        self.baseline_metrics.clone()
+    }
+
+    pub fn input_batches(&self) -> Count {
+        self.input_batches.clone()
+    }
+
+    pub fn input_rows(&self) -> Count {
+        self.input_rows.clone()
+    }
+
+    pub fn peak_mem_used(&self) -> Gauge {
+        self.peak_mem_used.clone()
+    }
+
+    pub fn spill_metrics(&self) -> SpillMetrics {
+        self.spill_metrics.clone()
+    }
+}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/mod.rs b/datafusion/physical-plan/src/joins/sort_merge_join/mod.rs
new file mode 100644
index 0000000000000..06290ec4d0908
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/mod.rs
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort Merge Join Execution Plan Operator
+
+pub use exec::SortMergeJoinExec;
+
+mod exec;
+mod filter;
+mod metrics;
+mod stream;
+
+#[cfg(test)]
+mod tests;
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs b/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs
new file mode 100644
index 0000000000000..4dcbe1f647990
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/stream.rs
@@ -0,0 +1,1840 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort-Merge Join execution
+//!
+//! This module implements the runtime state machine for the Sort-Merge Join
+//! operator. It drives two sorted input streams (the *streamed* side and the
+//! *buffered* side), compares join keys, and produces joined `RecordBatch`es.
+
+use std::cmp::Ordering;
+use std::collections::{HashMap, VecDeque};
+use std::fs::File;
+use std::io::BufReader;
+use std::mem::size_of;
+use std::ops::Range;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering::Relaxed;
+use std::task::{Context, Poll};
+
+use crate::joins::sort_merge_join::filter::{
+    FilterMetadata, filter_record_batch_by_join_type, get_corrected_filter_mask,
+    get_filter_columns, needs_deferred_filtering,
+};
+use crate::joins::sort_merge_join::metrics::SortMergeJoinMetrics;
+use crate::joins::utils::{JoinFilter, compare_join_arrays};
+use crate::metrics::RecordOutput;
+use crate::spill::spill_manager::SpillManager;
+use crate::{PhysicalExpr, RecordBatchStream, SendableRecordBatchStream};
+
+use arrow::array::{types::UInt64Type, *};
+use arrow::compute::{
+    self, BatchCoalescer, SortOptions, concat_batches, filter_record_batch, is_not_null,
+    take, take_arrays,
+};
+use arrow::datatypes::{DataType, SchemaRef, TimeUnit};
+use arrow::ipc::reader::StreamReader;
+use datafusion_common::config::SpillCompression;
+use datafusion_common::{
+    HashSet, JoinType, NullEquality, Result, exec_err, internal_err, not_impl_err,
+};
+use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_execution::runtime_env::RuntimeEnv;
+use datafusion_physical_expr_common::physical_expr::PhysicalExprRef;
+
+use futures::{Stream, StreamExt};
+
+/// State of SMJ stream
+#[derive(Debug, PartialEq, Eq)]
+pub(super) enum SortMergeJoinState {
+    /// Init joining with a new streamed row or a new buffered batches
+    Init,
+    /// Polling one streamed row or one buffered batch, or both
+    Polling,
+    /// Joining polled data and making output
+    JoinOutput,
+    /// Emit ready data if have any and then go back to [`Self::Init`] state
+    EmitReadyThenInit,
+    /// No more output
+    Exhausted,
+}
+
+/// State of streamed data stream
+#[derive(Debug, PartialEq, Eq)]
+pub(super) enum StreamedState {
+    /// Init polling
+    Init,
+    /// Polling one streamed row
+    Polling,
+    /// Ready to produce one streamed row
+    Ready,
+    /// No more streamed row
+    Exhausted,
+}
+
+/// State of buffered data stream
+#[derive(Debug, PartialEq, Eq)]
+pub(super) enum BufferedState {
+    /// Init polling
+    Init,
+    /// Polling first row in the next batch
+    PollingFirst,
+    /// Polling rest rows in the next batch
+    PollingRest,
+    /// Ready to produce one batch
+    Ready,
+    /// No more buffered batches
+    Exhausted,
+}
+
+/// Represents a chunk of joined data from streamed and buffered side
+pub(super) struct StreamedJoinedChunk {
+    /// Index of batch in buffered_data
+    buffered_batch_idx: Option<usize>,
+    /// Array builder for streamed indices
+    streamed_indices: UInt64Builder,
+    /// Array builder for buffered indices
+    /// This could contain nulls if the join is null-joined
+    buffered_indices: UInt64Builder,
+}
+
+/// Represents a record batch from streamed input.
+///
+/// Also stores information of matching rows from buffered batches.
+pub(super) struct StreamedBatch {
+    /// The streamed record batch
+    pub batch: RecordBatch,
+    /// The index of row in the streamed batch to compare with buffered batches
+    pub idx: usize,
+    /// The join key arrays of streamed batch which are used to compare with buffered batches
+    /// and to produce output. They are produced by evaluating `on` expressions.
+    pub join_arrays: Vec<ArrayRef>,
+    /// Chunks of indices from buffered side (may be nulls) joined to streamed
+    pub output_indices: Vec<StreamedJoinedChunk>,
+    /// Total number of output rows across all chunks in `output_indices`
+    pub num_output_rows: usize,
+    /// Index of currently scanned batch from buffered data
+    pub buffered_batch_idx: Option<usize>,
+    /// Indices that found a match for the given join filter
+    /// Used for semi joins to keep track the streaming index which got a join filter match
+    /// and already emitted to the output.
+    pub join_filter_matched_idxs: HashSet<u64>,
+}
+
+impl StreamedBatch {
+    fn new(batch: RecordBatch, on_column: &[Arc<dyn PhysicalExpr>]) -> Self {
+        let join_arrays = join_arrays(&batch, on_column);
+        StreamedBatch {
+            batch,
+            idx: 0,
+            join_arrays,
+            output_indices: vec![],
+            num_output_rows: 0,
+            buffered_batch_idx: None,
+            join_filter_matched_idxs: HashSet::new(),
+        }
+    }
+
+    fn new_empty(schema: SchemaRef) -> Self {
+        StreamedBatch {
+            batch: RecordBatch::new_empty(schema),
+            idx: 0,
+            join_arrays: vec![],
+            output_indices: vec![],
+            num_output_rows: 0,
+            buffered_batch_idx: None,
+            join_filter_matched_idxs: HashSet::new(),
+        }
+    }
+
+    /// Number of unfrozen output pairs in this streamed batch
+    fn num_output_rows(&self) -> usize {
+        self.num_output_rows
+    }
+
+    /// Appends new pair consisting of current streamed index and `buffered_idx`
+    /// index of buffered batch with `buffered_batch_idx` index.
+    fn append_output_pair(
+        &mut self,
+        buffered_batch_idx: Option<usize>,
+        buffered_idx: Option<usize>,
+        batch_size: usize,
+    ) {
+        // If no current chunk exists or current chunk is not for current buffered batch,
+        // create a new chunk
+        if self.output_indices.is_empty() || self.buffered_batch_idx != buffered_batch_idx
+        {
+            // Compute capacity only when creating a new chunk (infrequent operation).
+            // The capacity is the remaining space to reach batch_size.
+            // This should always be >= 1 since we only call this when num_output_rows < batch_size.
+            debug_assert!(
+                batch_size > self.num_output_rows,
+                "batch_size ({batch_size}) must be > num_output_rows ({})",
+                self.num_output_rows
+            );
+            let capacity = batch_size - self.num_output_rows;
+            self.output_indices.push(StreamedJoinedChunk {
+                buffered_batch_idx,
+                streamed_indices: UInt64Builder::with_capacity(capacity),
+                buffered_indices: UInt64Builder::with_capacity(capacity),
+            });
+            self.buffered_batch_idx = buffered_batch_idx;
+        };
+        let current_chunk = self.output_indices.last_mut().unwrap();
+
+        // Append index of streamed batch and index of buffered batch into current chunk
+        current_chunk.streamed_indices.append_value(self.idx as u64);
+        if let Some(idx) = buffered_idx {
+            current_chunk.buffered_indices.append_value(idx as u64);
+        } else {
+            current_chunk.buffered_indices.append_null();
+        }
+        self.num_output_rows += 1;
+    }
+}
+
+/// A buffered batch that contains contiguous rows with same join key
+///
+/// `BufferedBatch` can exist as either an in-memory `RecordBatch` or a `RefCountedTempFile` on disk.
+#[derive(Debug)]
+pub(super) struct BufferedBatch {
+    /// Represents in memory or spilled record batch
+    pub batch: BufferedBatchState,
+    /// The range in which the rows share the same join key
+    pub range: Range<usize>,
+    /// Array refs of the join key
+    pub join_arrays: Vec<ArrayRef>,
+    /// Buffered joined index (null joining buffered)
+    pub null_joined: Vec<usize>,
+    /// Size estimation used for reserving / releasing memory
+    pub size_estimation: usize,
+    /// The indices of buffered batch that the join filter doesn't satisfy.
+    /// This is a map between right row index and a boolean value indicating whether all joined row
+    /// of the right row does not satisfy the filter .
+    /// When dequeuing the buffered batch, we need to produce null joined rows for these indices.
+    pub join_filter_not_matched_map: HashMap<u64, bool>,
+    /// Current buffered batch number of rows. Equal to batch.num_rows()
+    /// but if batch is spilled to disk this property is preferable
+    /// and less expensive
+    pub num_rows: usize,
+}
+
+impl BufferedBatch {
+    fn new(
+        batch: RecordBatch,
+        range: Range<usize>,
+        on_column: &[PhysicalExprRef],
+    ) -> Self {
+        let join_arrays = join_arrays(&batch, on_column);
+
+        // Estimation is calculated as
+        //   inner batch size
+        // + join keys size
+        // + worst case null_joined (as vector capacity * element size)
+        // + Range size
+        // + size of this estimation
+        let size_estimation = batch.get_array_memory_size()
+            + join_arrays
+                .iter()
+                .map(|arr| arr.get_array_memory_size())
+                .sum::<usize>()
+            + batch.num_rows().next_power_of_two() * size_of::<usize>()
+            + size_of::<Range<usize>>()
+            + size_of::<usize>();
+
+        let num_rows = batch.num_rows();
+        BufferedBatch {
+            batch: BufferedBatchState::InMemory(batch),
+            range,
+            join_arrays,
+            null_joined: vec![],
+            size_estimation,
+            join_filter_not_matched_map: HashMap::new(),
+            num_rows,
+        }
+    }
+}
+
+// TODO: Spill join arrays (https://github.com/apache/datafusion/pull/17429)
+// Used to represent whether the buffered data is currently in memory or written to disk
+#[derive(Debug)]
+pub(super) enum BufferedBatchState {
+    // In memory record batch
+    InMemory(RecordBatch),
+    // Spilled temp file
+    Spilled(RefCountedTempFile),
+}
+
+/// Sort-Merge join stream that consumes streamed and buffered data streams
+/// and produces joined output stream.
+pub(super) struct SortMergeJoinStream {
+    // ========================================================================
+    // PROPERTIES:
+    // These fields are initialized at the start and remain constant throughout
+    // the execution.
+    // ========================================================================
+    /// Output schema
+    pub schema: SchemaRef,
+    /// Defines the null equality for the join.
+    pub null_equality: NullEquality,
+    /// Sort options of join columns used to sort streamed and buffered data stream
+    pub sort_options: Vec<SortOptions>,
+    /// optional join filter
+    pub filter: Option<JoinFilter>,
+    /// How the join is performed
+    pub join_type: JoinType,
+    /// Target output batch size
+    pub batch_size: usize,
+
+    // ========================================================================
+    // STREAMED FIELDS:
+    // These fields manage the properties and state of the streamed input.
+    // ========================================================================
+    /// Input schema of streamed
+    pub streamed_schema: SchemaRef,
+    /// Streamed data stream
+    pub streamed: SendableRecordBatchStream,
+    /// Current processing record batch of streamed
+    pub streamed_batch: StreamedBatch,
+    /// (used in outer join) Is current streamed row joined at least once?
+    pub streamed_joined: bool,
+    /// State of streamed
+    pub streamed_state: StreamedState,
+    /// Join key columns of streamed
+    pub on_streamed: Vec<PhysicalExprRef>,
+
+    // ========================================================================
+    // BUFFERED FIELDS:
+    // These fields manage the properties and state of the buffered input.
+    // ========================================================================
+    /// Input schema of buffered
+    pub buffered_schema: SchemaRef,
+    /// Buffered data stream
+    pub buffered: SendableRecordBatchStream,
+    /// Current buffered data
+    pub buffered_data: BufferedData,
+    /// (used in outer join) Is current buffered batches joined at least once?
+    pub buffered_joined: bool,
+    /// State of buffered
+    pub buffered_state: BufferedState,
+    /// Join key columns of buffered
+    pub on_buffered: Vec<PhysicalExprRef>,
+
+    // ========================================================================
+    // MERGE JOIN STATES:
+    // These fields track the execution state of merge join and are updated
+    // during the execution.
+    // ========================================================================
+    /// Current state of the stream
+    pub state: SortMergeJoinState,
+    /// Staging output array builders
+    pub joined_record_batches: JoinedRecordBatches,
+    /// Output buffer. Currently used by filtering as it requires double buffering
+    /// to avoid small/empty batches. Non-filtered join outputs directly from `staging_output_record_batches.batches`
+    pub output: BatchCoalescer,
+    /// The comparison result of current streamed row and buffered batches
+    pub current_ordering: Ordering,
+    /// Manages the process of spilling and reading back intermediate data
+    pub spill_manager: SpillManager,
+
+    // ========================================================================
+    // EXECUTION RESOURCES:
+    // Fields related to managing execution resources and monitoring performance.
+    // ========================================================================
+    /// Metrics
+    pub join_metrics: SortMergeJoinMetrics,
+    /// Memory reservation
+    pub reservation: MemoryReservation,
+    /// Runtime env
+    pub runtime_env: Arc<RuntimeEnv>,
+    /// A unique number for each batch
+    pub streamed_batch_counter: AtomicUsize,
+}
+
+/// Staging area for joined data before output
+///
+/// Accumulates joined rows until either:
+/// - Target batch size reached (for efficiency)
+/// - Stream exhausted (flush remaining data)
+pub(super) struct JoinedRecordBatches {
+    /// Joined batches. Each batch is already joined columns from left and right sources
+    pub(super) joined_batches: BatchCoalescer,
+    /// Filter metadata for deferred filtering
+    pub(super) filter_metadata: FilterMetadata,
+}
+
+impl JoinedRecordBatches {
+    /// Concatenates all accumulated batches into a single RecordBatch
+    ///
+    /// Must drain ALL batches from BatchCoalescer for filtered joins to ensure
+    /// metadata alignment when applying get_corrected_filter_mask().
+    pub(super) fn concat_batches(&mut self, schema: &SchemaRef) -> Result<RecordBatch> {
+        self.joined_batches.finish_buffered_batch()?;
+
+        let mut all_batches = vec![];
+        while let Some(batch) = self.joined_batches.next_completed_batch() {
+            all_batches.push(batch);
+        }
+
+        match all_batches.as_slice() {
+            [] => unreachable!("concat_batches called with empty BatchCoalescer"),
+            [single_batch] => Ok(single_batch.clone()),
+            multiple_batches => Ok(concat_batches(schema, multiple_batches)?),
+        }
+    }
+
+    /// Clears batches without touching metadata (for early return when no filtering needed)
+    fn clear_batches(&mut self, schema: &SchemaRef, batch_size: usize) {
+        self.joined_batches = BatchCoalescer::new(Arc::clone(schema), batch_size)
+            .with_biggest_coalesce_batch_size(Option::from(batch_size / 2));
+    }
+
+    /// Asserts that if batches is empty, metadata is also empty
+    #[inline]
+    fn debug_assert_empty_consistency(&self) {
+        if self.joined_batches.is_empty() {
+            debug_assert_eq!(
+                self.filter_metadata.filter_mask.len(),
+                0,
+                "filter_mask should be empty when batches is empty"
+            );
+            debug_assert_eq!(
+                self.filter_metadata.row_indices.len(),
+                0,
+                "row_indices should be empty when batches is empty"
+            );
+            debug_assert_eq!(
+                self.filter_metadata.batch_ids.len(),
+                0,
+                "batch_ids should be empty when batches is empty"
+            );
+        }
+    }
+
+    /// Pushes a batch with null metadata (Full join null-joined rows only)
+    ///
+    /// These buffered rows had NO matching streamed rows. Since we can't group
+    /// by input row (no input row exists), we use null metadata as a sentinel.
+    ///
+    /// Maintains invariant: N rows → N metadata entries (nulls)
+    fn push_batch_with_null_metadata(&mut self, batch: RecordBatch, join_type: JoinType) {
+        debug_assert!(
+            join_type == JoinType::Full,
+            "push_batch_with_null_metadata should only be called for Full joins"
+        );
+
+        let num_rows = batch.num_rows();
+
+        self.filter_metadata.append_nulls(num_rows);
+
+        self.filter_metadata.debug_assert_metadata_aligned();
+        self.joined_batches
+            .push_batch(batch)
+            .expect("Failed to push batch to BatchCoalescer");
+    }
+
+    /// Pushes a batch with filter metadata (filtered outer/semi/anti/mark joins)
+    ///
+    /// Deferred filtering: An input row may join with multiple buffered rows, but we
+    /// don't know yet if all matches failed the filter. We track metadata so
+    /// `get_corrected_filter_mask()` can later group by input row and decide:
+    /// - If any match passed: emit passing rows
+    /// - If all matches failed: emit null-joined row
+    ///
+    /// Maintains invariant: N rows → N metadata entries
+    fn push_batch_with_filter_metadata(
+        &mut self,
+        batch: RecordBatch,
+        row_indices: &UInt64Array,
+        filter_mask: &BooleanArray,
+        streamed_batch_id: usize,
+        join_type: JoinType,
+    ) {
+        debug_assert!(
+            matches!(
+                join_type,
+                JoinType::Left
+                    | JoinType::LeftSemi
+                    | JoinType::LeftMark
+                    | JoinType::Right
+                    | JoinType::RightSemi
+                    | JoinType::RightMark
+                    | JoinType::LeftAnti
+                    | JoinType::RightAnti
+                    | JoinType::Full
+            ),
+            "push_batch_with_filter_metadata should only be called for outer/semi/anti/mark joins that need deferred filtering"
+        );
+
+        debug_assert_eq!(
+            row_indices.len(),
+            filter_mask.len(),
+            "row_indices and filter_mask must have same length"
+        );
+
+        self.filter_metadata.append_filter_metadata(
+            row_indices,
+            filter_mask,
+            streamed_batch_id,
+        );
+
+        self.filter_metadata.debug_assert_metadata_aligned();
+        self.joined_batches
+            .push_batch(batch)
+            .expect("Failed to push batch to BatchCoalescer");
+    }
+
+    /// Pushes a batch without metadata (non-filtered joins)
+    ///
+    /// No deferred filtering needed. Either every join match is output (Inner),
+    /// or null-joined rows are handled separately. No need to track which input
+    /// row produced which output row.
+    fn push_batch_without_metadata(&mut self, batch: RecordBatch, _join_type: JoinType) {
+        self.joined_batches
+            .push_batch(batch)
+            .expect("Failed to push batch to BatchCoalescer");
+    }
+
+    fn clear(&mut self, schema: &SchemaRef, batch_size: usize) {
+        self.joined_batches = BatchCoalescer::new(Arc::clone(schema), batch_size)
+            .with_biggest_coalesce_batch_size(Option::from(batch_size / 2));
+        self.filter_metadata = FilterMetadata::new();
+        self.debug_assert_empty_consistency();
+    }
+}
+impl RecordBatchStream for SortMergeJoinStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+impl Stream for SortMergeJoinStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let join_time = self.join_metrics.join_time().clone();
+        let _timer = join_time.timer();
+        loop {
+            match &self.state {
+                SortMergeJoinState::Init => {
+                    let streamed_exhausted =
+                        self.streamed_state == StreamedState::Exhausted;
+                    let buffered_exhausted =
+                        self.buffered_state == BufferedState::Exhausted;
+                    self.state = if streamed_exhausted && buffered_exhausted {
+                        SortMergeJoinState::Exhausted
+                    } else {
+                        match self.current_ordering {
+                            Ordering::Less | Ordering::Equal => {
+                                if !streamed_exhausted {
+                                    if needs_deferred_filtering(
+                                        &self.filter,
+                                        self.join_type,
+                                    ) {
+                                        match self.process_filtered_batches()? {
+                                            Poll::Ready(Some(batch)) => {
+                                                return Poll::Ready(Some(Ok(batch)));
+                                            }
+                                            Poll::Ready(None) | Poll::Pending => {}
+                                        }
+                                    }
+
+                                    self.streamed_joined = false;
+                                    self.streamed_state = StreamedState::Init;
+                                }
+                            }
+                            Ordering::Greater => {
+                                if !buffered_exhausted {
+                                    self.buffered_joined = false;
+                                    self.buffered_state = BufferedState::Init;
+                                }
+                            }
+                        }
+                        SortMergeJoinState::Polling
+                    };
+                }
+                SortMergeJoinState::Polling => {
+                    if ![StreamedState::Exhausted, StreamedState::Ready]
+                        .contains(&self.streamed_state)
+                    {
+                        match self.poll_streamed_row(cx)? {
+                            Poll::Ready(_) => {}
+                            Poll::Pending => return Poll::Pending,
+                        }
+                    }
+
+                    if ![BufferedState::Exhausted, BufferedState::Ready]
+                        .contains(&self.buffered_state)
+                    {
+                        match self.poll_buffered_batches(cx)? {
+                            Poll::Ready(_) => {}
+                            Poll::Pending => return Poll::Pending,
+                        }
+                    }
+                    let streamed_exhausted =
+                        self.streamed_state == StreamedState::Exhausted;
+                    let buffered_exhausted =
+                        self.buffered_state == BufferedState::Exhausted;
+                    if streamed_exhausted && buffered_exhausted {
+                        self.state = SortMergeJoinState::Exhausted;
+                        continue;
+                    }
+                    self.current_ordering = self.compare_streamed_buffered()?;
+                    self.state = SortMergeJoinState::JoinOutput;
+                }
+                SortMergeJoinState::EmitReadyThenInit => {
+                    // If have data to emit, emit it and if no more, change to next
+
+                    // Verify metadata alignment before checking if we have batches to output
+                    self.joined_record_batches
+                        .filter_metadata
+                        .debug_assert_metadata_aligned();
+
+                    // For filtered joins, skip output and let Init state handle it
+                    if needs_deferred_filtering(&self.filter, self.join_type) {
+                        self.state = SortMergeJoinState::Init;
+                        continue;
+                    }
+
+                    // For non-filtered joins, only output if we have a completed batch
+                    // (opportunistic output when target batch size is reached)
+                    if self
+                        .joined_record_batches
+                        .joined_batches
+                        .has_completed_batch()
+                    {
+                        let record_batch = self
+                            .joined_record_batches
+                            .joined_batches
+                            .next_completed_batch()
+                            .expect("has_completed_batch was true");
+                        (&record_batch)
+                            .record_output(&self.join_metrics.baseline_metrics());
+                        return Poll::Ready(Some(Ok(record_batch)));
+                    }
+                    self.state = SortMergeJoinState::Init;
+                }
+                SortMergeJoinState::JoinOutput => {
+                    self.join_partial()?;
+
+                    if self.num_unfrozen_pairs() < self.batch_size {
+                        if self.buffered_data.scanning_finished() {
+                            self.buffered_data.scanning_reset();
+                            self.state = SortMergeJoinState::EmitReadyThenInit;
+                        }
+                    } else {
+                        self.freeze_all()?;
+
+                        // Verify metadata alignment before checking if we have batches to output
+                        self.joined_record_batches
+                            .filter_metadata
+                            .debug_assert_metadata_aligned();
+
+                        // For filtered joins, skip output and let Init state handle it
+                        if needs_deferred_filtering(&self.filter, self.join_type) {
+                            continue;
+                        }
+
+                        // For non-filtered joins, only output if we have a completed batch
+                        // (opportunistic output when target batch size is reached)
+                        if self
+                            .joined_record_batches
+                            .joined_batches
+                            .has_completed_batch()
+                        {
+                            let record_batch = self
+                                .joined_record_batches
+                                .joined_batches
+                                .next_completed_batch()
+                                .expect("has_completed_batch was true");
+                            (&record_batch)
+                                .record_output(&self.join_metrics.baseline_metrics());
+                            return Poll::Ready(Some(Ok(record_batch)));
+                        }
+                        // Otherwise keep buffering (don't output yet)
+                    }
+                }
+                SortMergeJoinState::Exhausted => {
+                    self.freeze_all()?;
+
+                    // Verify metadata alignment before final output
+                    self.joined_record_batches
+                        .filter_metadata
+                        .debug_assert_metadata_aligned();
+
+                    // For filtered joins, must concat and filter ALL data at once
+                    if needs_deferred_filtering(&self.filter, self.join_type)
+                        && !self.joined_record_batches.joined_batches.is_empty()
+                    {
+                        let record_batch = self.filter_joined_batch()?;
+                        (&record_batch)
+                            .record_output(&self.join_metrics.baseline_metrics());
+                        return Poll::Ready(Some(Ok(record_batch)));
+                    }
+
+                    // For non-filtered joins, finish buffered data first
+                    if !self.joined_record_batches.joined_batches.is_empty() {
+                        self.joined_record_batches
+                            .joined_batches
+                            .finish_buffered_batch()?;
+                    }
+
+                    // Output one completed batch at a time (stay in Exhausted until empty)
+                    if self
+                        .joined_record_batches
+                        .joined_batches
+                        .has_completed_batch()
+                    {
+                        let record_batch = self
+                            .joined_record_batches
+                            .joined_batches
+                            .next_completed_batch()
+                            .expect("has_completed_batch was true");
+                        (&record_batch)
+                            .record_output(&self.join_metrics.baseline_metrics());
+                        return Poll::Ready(Some(Ok(record_batch)));
+                    }
+
+                    // Finally check self.output BatchCoalescer (used by filtered joins)
+                    return if !self.output.is_empty() {
+                        self.output.finish_buffered_batch()?;
+                        let record_batch = self
+                            .output
+                            .next_completed_batch()
+                            .expect("Failed to get last batch");
+                        (&record_batch)
+                            .record_output(&self.join_metrics.baseline_metrics());
+                        Poll::Ready(Some(Ok(record_batch)))
+                    } else {
+                        Poll::Ready(None)
+                    };
+                }
+            }
+        }
+    }
+}
+
+impl SortMergeJoinStream {
+    #[expect(clippy::too_many_arguments)]
+    pub fn try_new(
+        // Configured via `datafusion.execution.spill_compression`.
+        spill_compression: SpillCompression,
+        schema: SchemaRef,
+        sort_options: Vec<SortOptions>,
+        null_equality: NullEquality,
+        streamed: SendableRecordBatchStream,
+        buffered: SendableRecordBatchStream,
+        on_streamed: Vec<Arc<dyn PhysicalExpr>>,
+        on_buffered: Vec<Arc<dyn PhysicalExpr>>,
+        filter: Option<JoinFilter>,
+        join_type: JoinType,
+        batch_size: usize,
+        join_metrics: SortMergeJoinMetrics,
+        reservation: MemoryReservation,
+        runtime_env: Arc<RuntimeEnv>,
+    ) -> Result<Self> {
+        let streamed_schema = streamed.schema();
+        let buffered_schema = buffered.schema();
+        let spill_manager = SpillManager::new(
+            Arc::clone(&runtime_env),
+            join_metrics.spill_metrics().clone(),
+            Arc::clone(&buffered_schema),
+        )
+        .with_compression_type(spill_compression);
+        Ok(Self {
+            state: SortMergeJoinState::Init,
+            sort_options,
+            null_equality,
+            schema: Arc::clone(&schema),
+            streamed_schema: Arc::clone(&streamed_schema),
+            buffered_schema,
+            streamed,
+            buffered,
+            streamed_batch: StreamedBatch::new_empty(streamed_schema),
+            buffered_data: BufferedData::default(),
+            streamed_joined: false,
+            buffered_joined: false,
+            streamed_state: StreamedState::Init,
+            buffered_state: BufferedState::Init,
+            current_ordering: Ordering::Equal,
+            on_streamed,
+            on_buffered,
+            filter,
+            joined_record_batches: JoinedRecordBatches {
+                joined_batches: BatchCoalescer::new(Arc::clone(&schema), batch_size)
+                    .with_biggest_coalesce_batch_size(Option::from(batch_size / 2)),
+                filter_metadata: FilterMetadata::new(),
+            },
+            output: BatchCoalescer::new(schema, batch_size)
+                .with_biggest_coalesce_batch_size(Option::from(batch_size / 2)),
+            batch_size,
+            join_type,
+            join_metrics,
+            reservation,
+            runtime_env,
+            spill_manager,
+            streamed_batch_counter: AtomicUsize::new(0),
+        })
+    }
+
+    /// Number of unfrozen output pairs (used to decide when to freeze + output)
+    fn num_unfrozen_pairs(&self) -> usize {
+        self.streamed_batch.num_output_rows()
+    }
+
+    /// Process accumulated batches for filtered joins
+    ///
+    /// Freezes unfrozen pairs, applies deferred filtering, and outputs if ready.
+    /// Returns Poll::Ready with a batch if one is available, otherwise Poll::Pending.
+    fn process_filtered_batches(&mut self) -> Poll<Option<Result<RecordBatch>>> {
+        self.freeze_all()?;
+
+        self.joined_record_batches
+            .filter_metadata
+            .debug_assert_metadata_aligned();
+
+        if !self.joined_record_batches.joined_batches.is_empty() {
+            let out_filtered_batch = self.filter_joined_batch()?;
+            self.output
+                .push_batch(out_filtered_batch)
+                .expect("Failed to push output batch");
+
+            if self.output.has_completed_batch() {
+                let record_batch = self
+                    .output
+                    .next_completed_batch()
+                    .expect("Failed to get output batch");
+                (&record_batch).record_output(&self.join_metrics.baseline_metrics());
+                return Poll::Ready(Some(Ok(record_batch)));
+            }
+        }
+
+        Poll::Pending
+    }
+
+    /// Poll next streamed row
+    fn poll_streamed_row(&mut self, cx: &mut Context) -> Poll<Option<Result<()>>> {
+        loop {
+            match &self.streamed_state {
+                StreamedState::Init => {
+                    if self.streamed_batch.idx + 1 < self.streamed_batch.batch.num_rows()
+                    {
+                        self.streamed_batch.idx += 1;
+                        self.streamed_state = StreamedState::Ready;
+                        return Poll::Ready(Some(Ok(())));
+                    } else {
+                        self.streamed_state = StreamedState::Polling;
+                    }
+                }
+                StreamedState::Polling => match self.streamed.poll_next_unpin(cx)? {
+                    Poll::Pending => {
+                        return Poll::Pending;
+                    }
+                    Poll::Ready(None) => {
+                        self.streamed_state = StreamedState::Exhausted;
+                    }
+                    Poll::Ready(Some(batch)) => {
+                        if batch.num_rows() > 0 {
+                            self.freeze_streamed()?;
+                            self.join_metrics.input_batches().add(1);
+                            self.join_metrics.input_rows().add(batch.num_rows());
+                            self.streamed_batch =
+                                StreamedBatch::new(batch, &self.on_streamed);
+                            // Every incoming streaming batch should have its unique id
+                            // Check `JoinedRecordBatches.self.streamed_batch_counter` documentation
+                            self.streamed_batch_counter
+                                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+                            self.streamed_state = StreamedState::Ready;
+                        }
+                    }
+                },
+                StreamedState::Ready => {
+                    return Poll::Ready(Some(Ok(())));
+                }
+                StreamedState::Exhausted => {
+                    return Poll::Ready(None);
+                }
+            }
+        }
+    }
+
+    fn free_reservation(&mut self, buffered_batch: &BufferedBatch) -> Result<()> {
+        // Shrink memory usage for in-memory batches only
+        if let BufferedBatchState::InMemory(_) = buffered_batch.batch {
+            self.reservation
+                .try_shrink(buffered_batch.size_estimation)?;
+        }
+        Ok(())
+    }
+
+    fn allocate_reservation(&mut self, mut buffered_batch: BufferedBatch) -> Result<()> {
+        match self.reservation.try_grow(buffered_batch.size_estimation) {
+            Ok(_) => {
+                self.join_metrics
+                    .peak_mem_used()
+                    .set_max(self.reservation.size());
+                Ok(())
+            }
+            Err(_) if self.runtime_env.disk_manager.tmp_files_enabled() => {
+                // Spill buffered batch to disk
+
+                match buffered_batch.batch {
+                    BufferedBatchState::InMemory(batch) => {
+                        let spill_file = self
+                            .spill_manager
+                            .spill_record_batch_and_finish(
+                                &[batch],
+                                "sort_merge_join_buffered_spill",
+                            )?
+                            .unwrap(); // Operation only return None if no batches are spilled, here we ensure that at least one batch is spilled
+
+                        buffered_batch.batch = BufferedBatchState::Spilled(spill_file);
+                        Ok(())
+                    }
+                    _ => internal_err!("Buffered batch has empty body"),
+                }
+            }
+            Err(e) => exec_err!("{}. Disk spilling disabled.", e.message()),
+        }?;
+
+        self.buffered_data.batches.push_back(buffered_batch);
+        Ok(())
+    }
+
+    /// Poll next buffered batches
+    fn poll_buffered_batches(&mut self, cx: &mut Context) -> Poll<Option<Result<()>>> {
+        loop {
+            match &self.buffered_state {
+                BufferedState::Init => {
+                    // pop previous buffered batches
+                    while !self.buffered_data.batches.is_empty() {
+                        let head_batch = self.buffered_data.head_batch();
+                        // If the head batch is fully processed, dequeue it and produce output of it.
+                        if head_batch.range.end == head_batch.num_rows {
+                            self.freeze_dequeuing_buffered()?;
+                            if let Some(mut buffered_batch) =
+                                self.buffered_data.batches.pop_front()
+                            {
+                                self.produce_buffered_not_matched(&mut buffered_batch)?;
+                                self.free_reservation(&buffered_batch)?;
+                            }
+                        } else {
+                            // If the head batch is not fully processed, break the loop.
+                            // Streamed batch will be joined with the head batch in the next step.
+                            break;
+                        }
+                    }
+                    if self.buffered_data.batches.is_empty() {
+                        self.buffered_state = BufferedState::PollingFirst;
+                    } else {
+                        let tail_batch = self.buffered_data.tail_batch_mut();
+                        tail_batch.range.start = tail_batch.range.end;
+                        tail_batch.range.end += 1;
+                        self.buffered_state = BufferedState::PollingRest;
+                    }
+                }
+                BufferedState::PollingFirst => match self.buffered.poll_next_unpin(cx)? {
+                    Poll::Pending => {
+                        return Poll::Pending;
+                    }
+                    Poll::Ready(None) => {
+                        self.buffered_state = BufferedState::Exhausted;
+                        return Poll::Ready(None);
+                    }
+                    Poll::Ready(Some(batch)) => {
+                        self.join_metrics.input_batches().add(1);
+                        self.join_metrics.input_rows().add(batch.num_rows());
+
+                        if batch.num_rows() > 0 {
+                            let buffered_batch =
+                                BufferedBatch::new(batch, 0..1, &self.on_buffered);
+
+                            self.allocate_reservation(buffered_batch)?;
+                            self.buffered_state = BufferedState::PollingRest;
+                        }
+                    }
+                },
+                BufferedState::PollingRest => {
+                    if self.buffered_data.tail_batch().range.end
+                        < self.buffered_data.tail_batch().num_rows
+                    {
+                        while self.buffered_data.tail_batch().range.end
+                            < self.buffered_data.tail_batch().num_rows
+                        {
+                            if is_join_arrays_equal(
+                                &self.buffered_data.head_batch().join_arrays,
+                                self.buffered_data.head_batch().range.start,
+                                &self.buffered_data.tail_batch().join_arrays,
+                                self.buffered_data.tail_batch().range.end,
+                            )? {
+                                self.buffered_data.tail_batch_mut().range.end += 1;
+                            } else {
+                                self.buffered_state = BufferedState::Ready;
+                                return Poll::Ready(Some(Ok(())));
+                            }
+                        }
+                    } else {
+                        match self.buffered.poll_next_unpin(cx)? {
+                            Poll::Pending => {
+                                return Poll::Pending;
+                            }
+                            Poll::Ready(None) => {
+                                self.buffered_state = BufferedState::Ready;
+                            }
+                            Poll::Ready(Some(batch)) => {
+                                // Polling batches coming concurrently as multiple partitions
+                                self.join_metrics.input_batches().add(1);
+                                self.join_metrics.input_rows().add(batch.num_rows());
+                                if batch.num_rows() > 0 {
+                                    let buffered_batch = BufferedBatch::new(
+                                        batch,
+                                        0..0,
+                                        &self.on_buffered,
+                                    );
+                                    self.allocate_reservation(buffered_batch)?;
+                                }
+                            }
+                        }
+                    }
+                }
+                BufferedState::Ready => {
+                    return Poll::Ready(Some(Ok(())));
+                }
+                BufferedState::Exhausted => {
+                    return Poll::Ready(None);
+                }
+            }
+        }
+    }
+
+    /// Get comparison result of streamed row and buffered batches
+    fn compare_streamed_buffered(&self) -> Result<Ordering> {
+        if self.streamed_state == StreamedState::Exhausted {
+            return Ok(Ordering::Greater);
+        }
+        if !self.buffered_data.has_buffered_rows() {
+            return Ok(Ordering::Less);
+        }
+
+        compare_join_arrays(
+            &self.streamed_batch.join_arrays,
+            self.streamed_batch.idx,
+            &self.buffered_data.head_batch().join_arrays,
+            self.buffered_data.head_batch().range.start,
+            &self.sort_options,
+            self.null_equality,
+        )
+    }
+
+    /// Produce join and fill output buffer until reaching target batch size
+    /// or the join is finished
+    fn join_partial(&mut self) -> Result<()> {
+        // Whether to join streamed rows
+        let mut join_streamed = false;
+        // Whether to join buffered rows
+        let mut join_buffered = false;
+        // For Mark join we store a dummy id to indicate the row has a match
+        let mut mark_row_as_match = false;
+
+        // determine whether we need to join streamed/buffered rows
+        match self.current_ordering {
+            Ordering::Less => {
+                if matches!(
+                    self.join_type,
+                    JoinType::Left
+                        | JoinType::Right
+                        | JoinType::Full
+                        | JoinType::LeftAnti
+                        | JoinType::RightAnti
+                        | JoinType::LeftMark
+                        | JoinType::RightMark
+                ) {
+                    join_streamed = !self.streamed_joined;
+                }
+            }
+            Ordering::Equal => {
+                if matches!(
+                    self.join_type,
+                    JoinType::LeftSemi
+                        | JoinType::LeftMark
+                        | JoinType::RightSemi
+                        | JoinType::RightMark
+                ) {
+                    mark_row_as_match = matches!(
+                        self.join_type,
+                        JoinType::LeftMark | JoinType::RightMark
+                    );
+                    // if the join filter is specified then its needed to output the streamed index
+                    // only if it has not been emitted before
+                    // the `join_filter_matched_idxs` keeps track on if streamed index has a successful
+                    // filter match and prevents the same index to go into output more than once
+                    if self.filter.is_some() {
+                        join_streamed = !self
+                            .streamed_batch
+                            .join_filter_matched_idxs
+                            .contains(&(self.streamed_batch.idx as u64))
+                            && !self.streamed_joined;
+                        // if the join filter specified there can be references to buffered columns
+                        // so buffered columns are needed to access them
+                        join_buffered = join_streamed;
+                    } else {
+                        join_streamed = !self.streamed_joined;
+                    }
+                }
+                if matches!(
+                    self.join_type,
+                    JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full
+                ) {
+                    join_streamed = true;
+                    join_buffered = true;
+                };
+
+                if matches!(self.join_type, JoinType::LeftAnti | JoinType::RightAnti)
+                    && self.filter.is_some()
+                {
+                    join_streamed = !self.streamed_joined;
+                    join_buffered = join_streamed;
+                }
+            }
+            Ordering::Greater => {
+                if self.join_type == JoinType::Full {
+                    join_buffered = !self.buffered_joined;
+                };
+            }
+        }
+        if !join_streamed && !join_buffered {
+            // no joined data
+            self.buffered_data.scanning_finish();
+            return Ok(());
+        }
+
+        if join_buffered {
+            // joining streamed/nulls and buffered
+            while !self.buffered_data.scanning_finished()
+                && self.num_unfrozen_pairs() < self.batch_size
+            {
+                let scanning_idx = self.buffered_data.scanning_idx();
+                if join_streamed {
+                    // Join streamed row and buffered row
+                    self.streamed_batch.append_output_pair(
+                        Some(self.buffered_data.scanning_batch_idx),
+                        Some(scanning_idx),
+                        self.batch_size,
+                    );
+                } else {
+                    // Join nulls and buffered row for FULL join
+                    self.buffered_data
+                        .scanning_batch_mut()
+                        .null_joined
+                        .push(scanning_idx);
+                }
+                self.buffered_data.scanning_advance();
+
+                if self.buffered_data.scanning_finished() {
+                    self.streamed_joined = join_streamed;
+                    self.buffered_joined = true;
+                }
+            }
+        } else {
+            // joining streamed and nulls
+            let scanning_batch_idx = if self.buffered_data.scanning_finished() {
+                None
+            } else {
+                Some(self.buffered_data.scanning_batch_idx)
+            };
+            // For Mark join we store a dummy id to indicate the row has a match
+            let scanning_idx = mark_row_as_match.then_some(0);
+
+            self.streamed_batch.append_output_pair(
+                scanning_batch_idx,
+                scanning_idx,
+                self.batch_size,
+            );
+            self.buffered_data.scanning_finish();
+            self.streamed_joined = true;
+        }
+        Ok(())
+    }
+
+    fn freeze_all(&mut self) -> Result<()> {
+        self.freeze_buffered(self.buffered_data.batches.len())?;
+        self.freeze_streamed()?;
+
+        // After freezing, metadata should be aligned
+        self.joined_record_batches
+            .filter_metadata
+            .debug_assert_metadata_aligned();
+
+        Ok(())
+    }
+
+    // Produces and stages record batches to ensure dequeued buffered batch
+    // no longer needed:
+    //   1. freezes all indices joined to streamed side
+    //   2. freezes NULLs joined to dequeued buffered batch to "release" it
+    fn freeze_dequeuing_buffered(&mut self) -> Result<()> {
+        self.freeze_streamed()?;
+        // Only freeze and produce the first batch in buffered_data as the batch is fully processed
+        self.freeze_buffered(1)?;
+
+        // After freezing, metadata should be aligned
+        self.joined_record_batches
+            .filter_metadata
+            .debug_assert_metadata_aligned();
+
+        Ok(())
+    }
+
+    // Produces and stages record batch from buffered indices with corresponding
+    // NULLs on streamed side.
+    //
+    // Applicable only in case of Full join.
+    //
+    fn freeze_buffered(&mut self, batch_count: usize) -> Result<()> {
+        if self.join_type != JoinType::Full {
+            return Ok(());
+        }
+        for buffered_batch in self.buffered_data.batches.range_mut(..batch_count) {
+            let buffered_indices = UInt64Array::from_iter_values(
+                buffered_batch.null_joined.iter().map(|&index| index as u64),
+            );
+            if let Some(record_batch) = produce_buffered_null_batch(
+                &self.schema,
+                &self.streamed_schema,
+                &buffered_indices,
+                buffered_batch,
+            )? {
+                self.joined_record_batches
+                    .push_batch_with_null_metadata(record_batch, self.join_type);
+            }
+            buffered_batch.null_joined.clear();
+        }
+        Ok(())
+    }
+
+    fn produce_buffered_not_matched(
+        &mut self,
+        buffered_batch: &mut BufferedBatch,
+    ) -> Result<()> {
+        if self.join_type != JoinType::Full {
+            return Ok(());
+        }
+
+        // For buffered row which is joined with streamed side rows but all joined rows
+        // don't satisfy the join filter
+        let not_matched_buffered_indices = buffered_batch
+            .join_filter_not_matched_map
+            .iter()
+            .filter_map(|(idx, failed)| if *failed { Some(*idx) } else { None })
+            .collect::<Vec<_>>();
+
+        let buffered_indices =
+            UInt64Array::from_iter_values(not_matched_buffered_indices.iter().copied());
+
+        if let Some(record_batch) = produce_buffered_null_batch(
+            &self.schema,
+            &self.streamed_schema,
+            &buffered_indices,
+            buffered_batch,
+        )? {
+            self.joined_record_batches
+                .push_batch_with_null_metadata(record_batch, self.join_type);
+        }
+        buffered_batch.join_filter_not_matched_map.clear();
+
+        Ok(())
+    }
+
+    // Produces and stages record batch for all output indices found
+    // for current streamed batch and clears staged output indices.
+    fn freeze_streamed(&mut self) -> Result<()> {
+        for chunk in self.streamed_batch.output_indices.iter_mut() {
+            // The row indices of joined streamed batch
+            let left_indices = chunk.streamed_indices.finish();
+
+            if left_indices.is_empty() {
+                continue;
+            }
+
+            let mut left_columns = if let Some(range) = is_contiguous_range(&left_indices)
+            {
+                // When indices form a contiguous range (common for the streamed
+                // side which advances sequentially), use zero-copy slice instead
+                // of the O(n) take kernel.
+                self.streamed_batch
+                    .batch
+                    .slice(range.start, range.len())
+                    .columns()
+                    .to_vec()
+            } else {
+                take_arrays(self.streamed_batch.batch.columns(), &left_indices, None)?
+            };
+
+            // The row indices of joined buffered batch
+            let right_indices: UInt64Array = chunk.buffered_indices.finish();
+            let mut right_columns =
+                if matches!(self.join_type, JoinType::LeftMark | JoinType::RightMark) {
+                    vec![Arc::new(is_not_null(&right_indices)?) as ArrayRef]
+                } else if matches!(
+                    self.join_type,
+                    JoinType::LeftSemi
+                        | JoinType::LeftAnti
+                        | JoinType::RightAnti
+                        | JoinType::RightSemi
+                ) {
+                    vec![]
+                } else if let Some(buffered_idx) = chunk.buffered_batch_idx {
+                    fetch_right_columns_by_idxs(
+                        &self.buffered_data,
+                        buffered_idx,
+                        &right_indices,
+                    )?
+                } else {
+                    // If buffered batch none, meaning it is null joined batch.
+                    // We need to create null arrays for buffered columns to join with streamed rows.
+                    create_unmatched_columns(
+                        self.join_type,
+                        &self.buffered_schema,
+                        right_indices.len(),
+                    )
+                };
+
+            // Prepare the columns we apply join filter on later.
+            // Only for joined rows between streamed and buffered.
+            let filter_columns = if let Some(buffered_batch_idx) =
+                chunk.buffered_batch_idx
+            {
+                if self.join_type != JoinType::Right {
+                    if matches!(
+                        self.join_type,
+                        JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark
+                    ) {
+                        let right_cols = fetch_right_columns_by_idxs(
+                            &self.buffered_data,
+                            buffered_batch_idx,
+                            &right_indices,
+                        )?;
+
+                        get_filter_columns(&self.filter, &left_columns, &right_cols)
+                    } else if matches!(
+                        self.join_type,
+                        JoinType::RightAnti | JoinType::RightSemi | JoinType::RightMark
+                    ) {
+                        let right_cols = fetch_right_columns_by_idxs(
+                            &self.buffered_data,
+                            buffered_batch_idx,
+                            &right_indices,
+                        )?;
+
+                        get_filter_columns(&self.filter, &right_cols, &left_columns)
+                    } else {
+                        get_filter_columns(&self.filter, &left_columns, &right_columns)
+                    }
+                } else {
+                    get_filter_columns(&self.filter, &right_columns, &left_columns)
+                }
+            } else {
+                // This chunk is totally for null joined rows (outer join), we don't need to apply join filter.
+                // Any join filter applied only on either streamed or buffered side will be pushed already.
+                vec![]
+            };
+
+            let columns = if self.join_type != JoinType::Right {
+                left_columns.extend(right_columns);
+                left_columns
+            } else {
+                right_columns.extend(left_columns);
+                right_columns
+            };
+
+            let output_batch = RecordBatch::try_new(Arc::clone(&self.schema), columns)?;
+            // Apply join filter if any
+            if !filter_columns.is_empty() {
+                if let Some(f) = &self.filter {
+                    // Construct batch with only filter columns
+                    let filter_batch =
+                        RecordBatch::try_new(Arc::clone(f.schema()), filter_columns)?;
+
+                    let filter_result = f
+                        .expression()
+                        .evaluate(&filter_batch)?
+                        .into_array(filter_batch.num_rows())?;
+
+                    // The boolean selection mask of the join filter result
+                    let pre_mask =
+                        datafusion_common::cast::as_boolean_array(&filter_result)?;
+
+                    // If there are nulls in join filter result, exclude them from selecting
+                    // the rows to output.
+                    let mask = if pre_mask.null_count() > 0 {
+                        compute::prep_null_mask_filter(
+                            datafusion_common::cast::as_boolean_array(&filter_result)?,
+                        )
+                    } else {
+                        pre_mask.clone()
+                    };
+
+                    // Push the filtered batch which contains rows passing join filter to the output
+                    // For outer/semi/anti/mark joins with deferred filtering, push the unfiltered batch with metadata
+                    // For INNER joins, filter immediately and push without metadata
+                    let needs_deferred_filtering = matches!(
+                        self.join_type,
+                        JoinType::Left
+                            | JoinType::LeftSemi
+                            | JoinType::Right
+                            | JoinType::RightSemi
+                            | JoinType::LeftAnti
+                            | JoinType::RightAnti
+                            | JoinType::LeftMark
+                            | JoinType::RightMark
+                            | JoinType::Full
+                    );
+
+                    if needs_deferred_filtering {
+                        // Outer/semi/anti/mark joins: push unfiltered batch with metadata for deferred filtering
+                        let mask_to_use = if self.join_type != JoinType::Full {
+                            &mask
+                        } else {
+                            pre_mask
+                        };
+
+                        self.joined_record_batches.push_batch_with_filter_metadata(
+                            output_batch,
+                            &left_indices,
+                            mask_to_use,
+                            self.streamed_batch_counter.load(Relaxed),
+                            self.join_type,
+                        );
+                    } else {
+                        // INNER joins: filter immediately and push without metadata
+                        let filtered_batch = filter_record_batch(&output_batch, &mask)?;
+                        self.joined_record_batches
+                            .push_batch_without_metadata(filtered_batch, self.join_type);
+                    }
+
+                    // For outer joins, we need to push the null joined rows to the output if
+                    // all joined rows are failed on the join filter.
+                    // I.e., if all rows joined from a streamed row are failed with the join filter,
+                    // we need to join it with nulls as buffered side.
+                    if self.join_type == JoinType::Full {
+                        let buffered_batch = &mut self.buffered_data.batches
+                            [chunk.buffered_batch_idx.unwrap()];
+
+                        for i in 0..pre_mask.len() {
+                            // If the buffered row is not joined with streamed side,
+                            // skip it.
+                            if right_indices.is_null(i) {
+                                continue;
+                            }
+
+                            let buffered_index = right_indices.value(i);
+
+                            buffered_batch.join_filter_not_matched_map.insert(
+                                buffered_index,
+                                *buffered_batch
+                                    .join_filter_not_matched_map
+                                    .get(&buffered_index)
+                                    .unwrap_or(&true)
+                                    && !pre_mask.value(i),
+                            );
+                        }
+                    }
+                }
+            } else {
+                self.joined_record_batches
+                    .push_batch_without_metadata(output_batch, self.join_type);
+            }
+        }
+
+        self.streamed_batch.output_indices.clear();
+        self.streamed_batch.num_output_rows = 0;
+
+        Ok(())
+    }
+
+    fn filter_joined_batch(&mut self) -> Result<RecordBatch> {
+        // Metadata should be aligned before processing
+        self.joined_record_batches
+            .filter_metadata
+            .debug_assert_metadata_aligned();
+
+        let record_batch = self.joined_record_batches.concat_batches(&self.schema)?;
+        let (mut out_indices, mut out_mask, mut batch_ids) =
+            self.joined_record_batches.filter_metadata.finish_metadata();
+        let default_batch_ids = vec![0; record_batch.num_rows()];
+
+        // If only nulls come in and indices sizes doesn't match with expected record batch count
+        // generate missing indices
+        // Happens for null joined batches for Full Join
+        if out_indices.null_count() == out_indices.len()
+            && out_indices.len() != record_batch.num_rows()
+        {
+            out_mask = BooleanArray::from(vec![None; record_batch.num_rows()]);
+            out_indices = UInt64Array::from(vec![None; record_batch.num_rows()]);
+            batch_ids = &default_batch_ids;
+        }
+
+        // After potential reconstruction, metadata should align with batch row count
+        debug_assert_eq!(
+            out_indices.len(),
+            record_batch.num_rows(),
+            "out_indices length should match record_batch row count"
+        );
+        debug_assert_eq!(
+            out_mask.len(),
+            record_batch.num_rows(),
+            "out_mask length should match record_batch row count (unless empty)"
+        );
+        debug_assert_eq!(
+            batch_ids.len(),
+            record_batch.num_rows(),
+            "batch_ids length should match record_batch row count"
+        );
+
+        if out_mask.is_empty() {
+            self.joined_record_batches
+                .clear_batches(&self.schema, self.batch_size);
+            return Ok(record_batch);
+        }
+
+        // Validate inputs to get_corrected_filter_mask
+        debug_assert_eq!(
+            out_indices.len(),
+            out_mask.len(),
+            "out_indices and out_mask must have same length for get_corrected_filter_mask"
+        );
+        debug_assert_eq!(
+            batch_ids.len(),
+            out_mask.len(),
+            "batch_ids and out_mask must have same length for get_corrected_filter_mask"
+        );
+
+        let maybe_corrected_mask = get_corrected_filter_mask(
+            self.join_type,
+            &out_indices,
+            batch_ids,
+            &out_mask,
+            record_batch.num_rows(),
+        );
+
+        let corrected_mask = if let Some(ref filtered_join_mask) = maybe_corrected_mask {
+            filtered_join_mask
+        } else {
+            &out_mask
+        };
+
+        self.filter_record_batch_by_join_type(&record_batch, corrected_mask)
+    }
+
+    fn filter_record_batch_by_join_type(
+        &mut self,
+        record_batch: &RecordBatch,
+        corrected_mask: &BooleanArray,
+    ) -> Result<RecordBatch> {
+        let filtered_record_batch = filter_record_batch_by_join_type(
+            record_batch,
+            corrected_mask,
+            self.join_type,
+            &self.schema,
+            &self.streamed_schema,
+            &self.buffered_schema,
+        )?;
+
+        self.joined_record_batches
+            .clear(&self.schema, self.batch_size);
+
+        Ok(filtered_record_batch)
+    }
+}
+
+fn create_unmatched_columns(
+    join_type: JoinType,
+    schema: &SchemaRef,
+    size: usize,
+) -> Vec<ArrayRef> {
+    if matches!(join_type, JoinType::LeftMark | JoinType::RightMark) {
+        vec![Arc::new(BooleanArray::from(vec![false; size])) as ArrayRef]
+    } else {
+        schema
+            .fields()
+            .iter()
+            .map(|f| new_null_array(f.data_type(), size))
+            .collect::<Vec<_>>()
+    }
+}
+
+fn produce_buffered_null_batch(
+    schema: &SchemaRef,
+    streamed_schema: &SchemaRef,
+    buffered_indices: &PrimitiveArray<UInt64Type>,
+    buffered_batch: &BufferedBatch,
+) -> Result<Option<RecordBatch>> {
+    if buffered_indices.is_empty() {
+        return Ok(None);
+    }
+
+    // Take buffered (right) columns
+    let right_columns =
+        fetch_right_columns_from_batch_by_idxs(buffered_batch, buffered_indices)?;
+
+    // Create null streamed (left) columns
+    let mut left_columns = streamed_schema
+        .fields()
+        .iter()
+        .map(|f| new_null_array(f.data_type(), buffered_indices.len()))
+        .collect::<Vec<_>>();
+
+    left_columns.extend(right_columns);
+
+    Ok(Some(RecordBatch::try_new(
+        Arc::clone(schema),
+        left_columns,
+    )?))
+}
+
+/// Checks if a `UInt64Array` contains a contiguous ascending range (e.g. \[3,4,5,6\]).
+/// Returns `Some(start..start+len)` if so, `None` otherwise.
+/// This allows replacing an O(n) `take` with an O(1) `slice`.
+#[inline]
+fn is_contiguous_range(indices: &UInt64Array) -> Option<Range<usize>> {
+    if indices.is_empty() || indices.null_count() > 0 {
+        return None;
+    }
+    let values = indices.values();
+    let start = values[0];
+    let len = values.len() as u64;
+    // Quick rejection: if last element doesn't match expected, not contiguous
+    if values[values.len() - 1] != start + len - 1 {
+        return None;
+    }
+    // Verify every element is sequential (handles duplicates and gaps)
+    for i in 1..values.len() {
+        if values[i] != start + i as u64 {
+            return None;
+        }
+    }
+    Some(start as usize..(start + len) as usize)
+}
+
+/// Get `buffered_indices` rows for `buffered_data[buffered_batch_idx]` by specific column indices
+#[inline(always)]
+fn fetch_right_columns_by_idxs(
+    buffered_data: &BufferedData,
+    buffered_batch_idx: usize,
+    buffered_indices: &UInt64Array,
+) -> Result<Vec<ArrayRef>> {
+    fetch_right_columns_from_batch_by_idxs(
+        &buffered_data.batches[buffered_batch_idx],
+        buffered_indices,
+    )
+}
+
+#[inline(always)]
+fn fetch_right_columns_from_batch_by_idxs(
+    buffered_batch: &BufferedBatch,
+    buffered_indices: &UInt64Array,
+) -> Result<Vec<ArrayRef>> {
+    match &buffered_batch.batch {
+        // In memory batch
+        // In memory batch
+        BufferedBatchState::InMemory(batch) => {
+            // When indices form a contiguous range (common in SMJ since the
+            // buffered side is scanned sequentially), use zero-copy slice.
+            if let Some(range) = is_contiguous_range(buffered_indices) {
+                Ok(batch.slice(range.start, range.len()).columns().to_vec())
+            } else {
+                Ok(take_arrays(batch.columns(), buffered_indices, None)?)
+            }
+        }
+        // If the batch was spilled to disk, less likely
+        BufferedBatchState::Spilled(spill_file) => {
+            let mut buffered_cols: Vec<ArrayRef> =
+                Vec::with_capacity(buffered_indices.len());
+
+            let file = BufReader::new(File::open(spill_file.path())?);
+            let reader = StreamReader::try_new(file, None)?;
+
+            for batch in reader {
+                batch?.columns().iter().for_each(|column| {
+                    buffered_cols.extend(take(column, &buffered_indices, None))
+                });
+            }
+
+            Ok(buffered_cols)
+        }
+    }
+}
+
+/// Buffered data contains all buffered batches with one unique join key
+#[derive(Debug, Default)]
+pub(super) struct BufferedData {
+    /// Buffered batches with the same key
+    pub batches: VecDeque<BufferedBatch>,
+    /// current scanning batch index used in join_partial()
+    pub scanning_batch_idx: usize,
+    /// current scanning offset used in join_partial()
+    pub scanning_offset: usize,
+}
+
+impl BufferedData {
+    pub fn head_batch(&self) -> &BufferedBatch {
+        self.batches.front().unwrap()
+    }
+
+    pub fn tail_batch(&self) -> &BufferedBatch {
+        self.batches.back().unwrap()
+    }
+
+    pub fn tail_batch_mut(&mut self) -> &mut BufferedBatch {
+        self.batches.back_mut().unwrap()
+    }
+
+    pub fn has_buffered_rows(&self) -> bool {
+        self.batches.iter().any(|batch| !batch.range.is_empty())
+    }
+
+    pub fn scanning_reset(&mut self) {
+        self.scanning_batch_idx = 0;
+        self.scanning_offset = 0;
+    }
+
+    pub fn scanning_advance(&mut self) {
+        self.scanning_offset += 1;
+        while !self.scanning_finished() && self.scanning_batch_finished() {
+            self.scanning_batch_idx += 1;
+            self.scanning_offset = 0;
+        }
+    }
+
+    pub fn scanning_batch(&self) -> &BufferedBatch {
+        &self.batches[self.scanning_batch_idx]
+    }
+
+    pub fn scanning_batch_mut(&mut self) -> &mut BufferedBatch {
+        &mut self.batches[self.scanning_batch_idx]
+    }
+
+    pub fn scanning_idx(&self) -> usize {
+        self.scanning_batch().range.start + self.scanning_offset
+    }
+
+    pub fn scanning_batch_finished(&self) -> bool {
+        self.scanning_offset == self.scanning_batch().range.len()
+    }
+
+    pub fn scanning_finished(&self) -> bool {
+        self.scanning_batch_idx == self.batches.len()
+    }
+
+    pub fn scanning_finish(&mut self) {
+        self.scanning_batch_idx = self.batches.len();
+        self.scanning_offset = 0;
+    }
+}
+
+/// Get join array refs of given batch and join columns
+fn join_arrays(batch: &RecordBatch, on_column: &[PhysicalExprRef]) -> Vec<ArrayRef> {
+    on_column
+        .iter()
+        .map(|c| {
+            let num_rows = batch.num_rows();
+            let c = c.evaluate(batch).unwrap();
+            c.into_array(num_rows).unwrap()
+        })
+        .collect()
+}
+
+/// A faster version of compare_join_arrays() that only output whether
+/// the given two rows are equal
+fn is_join_arrays_equal(
+    left_arrays: &[ArrayRef],
+    left: usize,
+    right_arrays: &[ArrayRef],
+    right: usize,
+) -> Result<bool> {
+    let mut is_equal = true;
+    for (left_array, right_array) in left_arrays.iter().zip(right_arrays) {
+        macro_rules! compare_value {
+            ($T:ty) => {{
+                match (left_array.is_null(left), right_array.is_null(right)) {
+                    (false, false) => {
+                        let left_array =
+                            left_array.as_any().downcast_ref::<$T>().unwrap();
+                        let right_array =
+                            right_array.as_any().downcast_ref::<$T>().unwrap();
+                        if left_array.value(left) != right_array.value(right) {
+                            is_equal = false;
+                        }
+                    }
+                    (true, false) => is_equal = false,
+                    (false, true) => is_equal = false,
+                    _ => {}
+                }
+            }};
+        }
+
+        match left_array.data_type() {
+            DataType::Null => {}
+            DataType::Boolean => compare_value!(BooleanArray),
+            DataType::Int8 => compare_value!(Int8Array),
+            DataType::Int16 => compare_value!(Int16Array),
+            DataType::Int32 => compare_value!(Int32Array),
+            DataType::Int64 => compare_value!(Int64Array),
+            DataType::UInt8 => compare_value!(UInt8Array),
+            DataType::UInt16 => compare_value!(UInt16Array),
+            DataType::UInt32 => compare_value!(UInt32Array),
+            DataType::UInt64 => compare_value!(UInt64Array),
+            DataType::Float32 => compare_value!(Float32Array),
+            DataType::Float64 => compare_value!(Float64Array),
+            DataType::Utf8 => compare_value!(StringArray),
+            DataType::Utf8View => compare_value!(StringViewArray),
+            DataType::LargeUtf8 => compare_value!(LargeStringArray),
+            DataType::Binary => compare_value!(BinaryArray),
+            DataType::BinaryView => compare_value!(BinaryViewArray),
+            DataType::FixedSizeBinary(_) => compare_value!(FixedSizeBinaryArray),
+            DataType::LargeBinary => compare_value!(LargeBinaryArray),
+            DataType::Decimal32(..) => compare_value!(Decimal32Array),
+            DataType::Decimal64(..) => compare_value!(Decimal64Array),
+            DataType::Decimal128(..) => compare_value!(Decimal128Array),
+            DataType::Decimal256(..) => compare_value!(Decimal256Array),
+            DataType::Timestamp(time_unit, None) => match time_unit {
+                TimeUnit::Second => compare_value!(TimestampSecondArray),
+                TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray),
+                TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray),
+                TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray),
+            },
+            DataType::Date32 => compare_value!(Date32Array),
+            DataType::Date64 => compare_value!(Date64Array),
+            dt => {
+                return not_impl_err!(
+                    "Unsupported data type in sort merge join comparator: {}",
+                    dt
+                );
+            }
+        }
+        if !is_equal {
+            return Ok(false);
+        }
+    }
+    Ok(true)
+}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs b/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs
new file mode 100644
index 0000000000000..b16ad59abc5b1
--- /dev/null
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/tests.rs
@@ -0,0 +1,3553 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! SortMergeJoin Testing Module
+//!
+//! This module currently contains the following test types in this order:
+//!  - Join behaviour (left, right, full, inner, semi, anti, mark)
+//!  - Batch spilling
+//!  - Filter mask
+//!
+//! Add relevant tests under the specified sections.
+
+use crate::joins::utils::{ColumnIndex, JoinFilter, JoinOn};
+use crate::joins::{HashJoinExec, PartitionMode, SortMergeJoinExec};
+use crate::test::TestMemoryExec;
+use crate::test::exec::BarrierExec;
+use crate::test::{build_table_i32, build_table_i32_two_cols};
+use crate::{ExecutionPlan, common};
+use crate::{
+    expressions::Column, joins::sort_merge_join::filter::get_corrected_filter_mask,
+    joins::sort_merge_join::stream::JoinedRecordBatches,
+};
+use arrow::array::{
+    BinaryArray, BooleanArray, Date32Array, Date64Array, FixedSizeBinaryArray,
+    Int32Array, RecordBatch, UInt64Array,
+};
+use arrow::compute::{BatchCoalescer, SortOptions, filter_record_batch};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow_ord::sort::SortColumn;
+use arrow_schema::SchemaRef;
+use datafusion_common::JoinType::*;
+use datafusion_common::{
+    JoinSide, internal_err,
+    test_util::{batches_to_sort_string, batches_to_string},
+};
+use datafusion_common::{
+    JoinType, NullEquality, Result, assert_batches_eq, assert_contains,
+};
+use datafusion_common_runtime::JoinSet;
+use datafusion_execution::config::SessionConfig;
+use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode};
+use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_expr::Operator;
+use datafusion_physical_expr::expressions::BinaryExpr;
+use futures::StreamExt;
+use insta::{allow_duplicates, assert_snapshot};
+use itertools::Itertools;
+use std::sync::Arc;
+use std::task::Poll;
+
+fn build_table(
+    a: (&str, &Vec<i32>),
+    b: (&str, &Vec<i32>),
+    c: (&str, &Vec<i32>),
+) -> Arc<dyn ExecutionPlan> {
+    let batch = build_table_i32(a, b, c);
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+fn build_table_from_batches(batches: Vec<RecordBatch>) -> Arc<dyn ExecutionPlan> {
+    let schema = batches.first().unwrap().schema();
+    TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap()
+}
+
+fn build_date_table(
+    a: (&str, &Vec<i32>),
+    b: (&str, &Vec<i32>),
+    c: (&str, &Vec<i32>),
+) -> Arc<dyn ExecutionPlan> {
+    let schema = Schema::new(vec![
+        Field::new(a.0, DataType::Date32, false),
+        Field::new(b.0, DataType::Date32, false),
+        Field::new(c.0, DataType::Date32, false),
+    ]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Date32Array::from(a.1.clone())),
+            Arc::new(Date32Array::from(b.1.clone())),
+            Arc::new(Date32Array::from(c.1.clone())),
+        ],
+    )
+    .unwrap();
+
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+fn build_date64_table(
+    a: (&str, &Vec<i64>),
+    b: (&str, &Vec<i64>),
+    c: (&str, &Vec<i64>),
+) -> Arc<dyn ExecutionPlan> {
+    let schema = Schema::new(vec![
+        Field::new(a.0, DataType::Date64, false),
+        Field::new(b.0, DataType::Date64, false),
+        Field::new(c.0, DataType::Date64, false),
+    ]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Date64Array::from(a.1.clone())),
+            Arc::new(Date64Array::from(b.1.clone())),
+            Arc::new(Date64Array::from(c.1.clone())),
+        ],
+    )
+    .unwrap();
+
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+fn build_binary_table(
+    a: (&str, &Vec<&[u8]>),
+    b: (&str, &Vec<i32>),
+    c: (&str, &Vec<i32>),
+) -> Arc<dyn ExecutionPlan> {
+    let schema = Schema::new(vec![
+        Field::new(a.0, DataType::Binary, false),
+        Field::new(b.0, DataType::Int32, false),
+        Field::new(c.0, DataType::Int32, false),
+    ]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(BinaryArray::from(a.1.clone())),
+            Arc::new(Int32Array::from(b.1.clone())),
+            Arc::new(Int32Array::from(c.1.clone())),
+        ],
+    )
+    .unwrap();
+
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+fn build_fixed_size_binary_table(
+    a: (&str, &Vec<&[u8]>),
+    b: (&str, &Vec<i32>),
+    c: (&str, &Vec<i32>),
+) -> Arc<dyn ExecutionPlan> {
+    let schema = Schema::new(vec![
+        Field::new(a.0, DataType::FixedSizeBinary(3), false),
+        Field::new(b.0, DataType::Int32, false),
+        Field::new(c.0, DataType::Int32, false),
+    ]);
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(FixedSizeBinaryArray::from(a.1.clone())),
+            Arc::new(Int32Array::from(b.1.clone())),
+            Arc::new(Int32Array::from(c.1.clone())),
+        ],
+    )
+    .unwrap();
+
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+/// returns a table with 3 columns of i32 in memory
+pub fn build_table_i32_nullable(
+    a: (&str, &Vec<Option<i32>>),
+    b: (&str, &Vec<Option<i32>>),
+    c: (&str, &Vec<Option<i32>>),
+) -> Arc<dyn ExecutionPlan> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(a.0, DataType::Int32, true),
+        Field::new(b.0, DataType::Int32, true),
+        Field::new(c.0, DataType::Int32, true),
+    ]));
+    let batch = RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(a.1.clone())),
+            Arc::new(Int32Array::from(b.1.clone())),
+            Arc::new(Int32Array::from(c.1.clone())),
+        ],
+    )
+    .unwrap();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+pub fn build_table_two_cols(
+    a: (&str, &Vec<i32>),
+    b: (&str, &Vec<i32>),
+) -> Arc<dyn ExecutionPlan> {
+    let batch = build_table_i32_two_cols(a, b);
+    let schema = batch.schema();
+    TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap()
+}
+
+fn join(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+) -> Result<SortMergeJoinExec> {
+    let sort_options = vec![SortOptions::default(); on.len()];
+    SortMergeJoinExec::try_new(
+        left,
+        right,
+        on,
+        None,
+        join_type,
+        sort_options,
+        NullEquality::NullEqualsNothing,
+    )
+}
+
+fn join_with_options(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+    sort_options: Vec<SortOptions>,
+    null_equality: NullEquality,
+) -> Result<SortMergeJoinExec> {
+    SortMergeJoinExec::try_new(
+        left,
+        right,
+        on,
+        None,
+        join_type,
+        sort_options,
+        null_equality,
+    )
+}
+
+fn join_with_filter(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    filter: JoinFilter,
+    join_type: JoinType,
+    sort_options: Vec<SortOptions>,
+    null_equality: NullEquality,
+) -> Result<SortMergeJoinExec> {
+    SortMergeJoinExec::try_new(
+        left,
+        right,
+        on,
+        Some(filter),
+        join_type,
+        sort_options,
+        null_equality,
+    )
+}
+
+async fn join_collect(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    let sort_options = vec![SortOptions::default(); on.len()];
+    join_collect_with_options(
+        left,
+        right,
+        on,
+        join_type,
+        sort_options,
+        NullEquality::NullEqualsNothing,
+    )
+    .await
+}
+
+async fn join_collect_with_filter(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    filter: JoinFilter,
+    join_type: JoinType,
+) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    let sort_options = vec![SortOptions::default(); on.len()];
+
+    let task_ctx = Arc::new(TaskContext::default());
+    let join = join_with_filter(
+        left,
+        right,
+        on,
+        filter,
+        join_type,
+        sort_options,
+        NullEquality::NullEqualsNothing,
+    )?;
+    let columns = columns(&join.schema());
+
+    let stream = join.execute(0, task_ctx)?;
+    let batches = common::collect(stream).await?;
+    Ok((columns, batches))
+}
+
+async fn join_collect_with_options(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+    sort_options: Vec<SortOptions>,
+    null_equality: NullEquality,
+) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    let task_ctx = Arc::new(TaskContext::default());
+    let join =
+        join_with_options(left, right, on, join_type, sort_options, null_equality)?;
+    let columns = columns(&join.schema());
+
+    let stream = join.execute(0, task_ctx)?;
+    let batches = common::collect(stream).await?;
+    Ok((columns, batches))
+}
+
+async fn join_collect_batch_size_equals_two(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+) -> Result<(Vec<String>, Vec<RecordBatch>)> {
+    let task_ctx = TaskContext::default()
+        .with_session_config(SessionConfig::new().with_batch_size(2));
+    let task_ctx = Arc::new(task_ctx);
+    let join = join(left, right, on, join_type)?;
+    let columns = columns(&join.schema());
+
+    let stream = join.execute(0, task_ctx)?;
+    let batches = common::collect(stream).await?;
+    Ok((columns, batches))
+}
+
+#[tokio::test]
+async fn join_inner_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![4, 5, 5]), // this has a repetition
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b1 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 4  | 7  | 10 | 4  | 70 |
+    | 2  | 5  | 8  | 20 | 5  | 80 |
+    | 3  | 5  | 9  | 20 | 5  | 80 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_inner_two() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b2", &vec![1, 2, 2]),
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b2", &vec![1, 2, 2]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_columns, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b2 | c1 | a1 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 1  | 7  | 1  | 1  | 70 |
+    | 2  | 2  | 8  | 2  | 2  | 80 |
+    | 2  | 2  | 9  | 2  | 2  | 80 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_inner_two_two() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 1, 2]),
+        ("b2", &vec![1, 1, 2]),
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a1", &vec![1, 1, 3]),
+        ("b2", &vec![1, 1, 2]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_columns, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b2 | c1 | a1 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 1  | 7  | 1  | 1  | 70 |
+    | 1  | 1  | 7  | 1  | 1  | 80 |
+    | 1  | 1  | 8  | 1  | 1  | 70 |
+    | 1  | 1  | 8  | 1  | 1  | 80 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_inner_with_nulls() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(1), Some(1), Some(2), Some(2)]),
+        ("b2", &vec![None, Some(1), Some(2), Some(2)]), // null in key field
+        ("c1", &vec![Some(1), None, Some(8), Some(9)]), // null in non-key field
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(1), Some(1), Some(2), Some(3)]),
+        ("b2", &vec![None, Some(1), Some(2), Some(2)]),
+        ("c2", &vec![Some(10), Some(70), Some(80), Some(90)]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b2 | c1 | a1 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 1  |    | 1  | 1  | 70 |
+    | 2  | 2  | 8  | 2  | 2  | 80 |
+    | 2  | 2  | 9  | 2  | 2  | 80 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_inner_with_nulls_with_options() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(2), Some(2), Some(1), Some(1)]),
+        ("b2", &vec![Some(2), Some(2), Some(1), None]), // null in key field
+        ("c1", &vec![Some(9), Some(8), None, Some(1)]), // null in non-key field
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(3), Some(2), Some(1), Some(1)]),
+        ("b2", &vec![Some(2), Some(2), Some(1), None]),
+        ("c2", &vec![Some(90), Some(80), Some(70), Some(10)]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+    let (_, batches) = join_collect_with_options(
+        left,
+        right,
+        on,
+        Inner,
+        vec![
+            SortOptions {
+                descending: true,
+                nulls_first: false,
+            };
+            2
+        ],
+        NullEquality::NullEqualsNull,
+    )
+    .await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b2 | c1 | a1 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 2  | 2  | 9  | 2  | 2  | 80 |
+    | 2  | 2  | 8  | 2  | 2  | 80 |
+    | 1  | 1  |    | 1  | 1  | 70 |
+    | 1  |    | 1  | 1  |    | 10 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_inner_output_two_batches() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b2", &vec![1, 2, 2]),
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b2", &vec![1, 2, 2]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b2", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect_batch_size_equals_two(left, right, on, Inner).await?;
+    assert_eq!(batches.len(), 2);
+    assert_eq!(batches[0].num_rows(), 2);
+    assert_eq!(batches[1].num_rows(), 1);
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b2 | c1 | a1 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 1  | 7  | 1  | 1  | 70 |
+    | 2  | 2  | 8  | 2  | 2  | 80 |
+    | 2  | 2  | 9  | 2  | 2  | 80 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Left).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b1 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 4  | 7  | 10 | 4  | 70 |
+    | 2  | 5  | 8  | 20 | 5  | 80 |
+    | 3  | 7  | 9  |    |    |    |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![4, 5, 7]),
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 6 does not exist on the left
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Right).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b1 | c2 |
+    +----+----+----+----+----+----+
+    | 1  | 4  | 7  | 10 | 4  | 70 |
+    | 2  | 5  | 8  | 20 | 5  | 80 |
+    |    |    |    | 30 | 6  | 90 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_different_columns_count_with_filter() -> Result<()> {
+    // select *
+    // from t1
+    // right join t2 on t1.b1 = t2.b1 and t1.a1 > t2.a2
+
+    let left = build_table(
+        ("a1", &vec![1, 21, 3]), // 21(t1.a1) > 20(t2.a2)
+        ("b1", &vec![4, 5, 7]),
+        ("c1", &vec![7, 8, 9]),
+    );
+
+    let right = build_table_two_cols(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 6 does not exist on the left
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a1", 0)),
+            Operator::Gt,
+            Arc::new(Column::new("a2", 1)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("a1", DataType::Int32, true),
+            Field::new("a2", DataType::Int32, true),
+        ])),
+    );
+
+    let (_, batches) = join_collect_with_filter(left, right, on, filter, Right).await?;
+
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b1 |
+    +----+----+----+----+----+
+    |    |    |    | 10 | 4  |
+    | 21 | 5  | 8  | 20 | 5  |
+    |    |    |    | 30 | 6  |
+    +----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_different_columns_count_with_filter() -> Result<()> {
+    // select *
+    // from t2
+    // left join t1 on t2.b1 = t1.b1 and t2.a2 > t1.a1
+
+    let left = build_table_two_cols(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 6 does not exist on the right
+    );
+
+    let right = build_table(
+        ("a1", &vec![1, 21, 3]), // 20(t2.a2) > 1(t1.a1)
+        ("b1", &vec![4, 5, 7]),
+        ("c1", &vec![7, 8, 9]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a2", 0)),
+            Operator::Gt,
+            Arc::new(Column::new("a1", 1)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("a2", DataType::Int32, true),
+            Field::new("a1", DataType::Int32, true),
+        ])),
+    );
+
+    let (_, batches) = join_collect_with_filter(left, right, on, filter, Left).await?;
+
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+
+    | a2 | b1 | a1 | b1 | c1 |
+    +----+----+----+----+----+
+    | 10 | 4  | 1  | 4  | 7  |
+    | 20 | 5  |    |    |    |
+    | 30 | 6  |    |    |    |
+    +----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_mark_different_columns_count_with_filter() -> Result<()> {
+    // select *
+    // from t2
+    // left mark join t1 on t2.b1 = t1.b1 and t2.a2 > t1.a1
+
+    let left = build_table_two_cols(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 6 does not exist on the right
+    );
+
+    let right = build_table(
+        ("a1", &vec![1, 21, 3]), // 20(t2.a2) > 1(t1.a1)
+        ("b1", &vec![4, 5, 7]),
+        ("c1", &vec![7, 8, 9]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a2", 0)),
+            Operator::Gt,
+            Arc::new(Column::new("a1", 1)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("a2", DataType::Int32, true),
+            Field::new("a1", DataType::Int32, true),
+        ])),
+    );
+
+    let (_, batches) =
+        join_collect_with_filter(left, right, on, filter, LeftMark).await?;
+
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+-------+
+    | a2 | b1 | mark  |
+    +----+----+-------+
+    | 10 | 4  | true  |
+    | 20 | 5  | false |
+    | 30 | 6  | false |
+    +----+----+-------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_mark_different_columns_count_with_filter() -> Result<()> {
+    // select *
+    // from t1
+    // right mark join t2 on t1.b1 = t2.b1 and t1.a1 > t2.a2
+
+    let left = build_table(
+        ("a1", &vec![1, 21, 3]), // 21(t1.a1) > 20(t2.a2)
+        ("b1", &vec![4, 5, 7]),
+        ("c1", &vec![7, 8, 9]),
+    );
+
+    let right = build_table_two_cols(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 6 does not exist on the left
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a1", 0)),
+            Operator::Gt,
+            Arc::new(Column::new("a2", 1)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 0,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("a1", DataType::Int32, true),
+            Field::new("a2", DataType::Int32, true),
+        ])),
+    );
+
+    let (_, batches) =
+        join_collect_with_filter(left, right, on, filter, RightMark).await?;
+
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+-------+
+    | a2 | b1 | mark  |
+    +----+----+-------+
+    | 10 | 4  | false |
+    | 20 | 5  | true  |
+    | 30 | 6  | false |
+    +----+----+-------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_full_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![4, 5, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b2", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema()).unwrap()) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema()).unwrap()) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Full).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_sort_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    |    |    |    | 30 | 6  | 90 |
+    | 1  | 4  | 7  | 10 | 4  | 70 |
+    | 2  | 5  | 8  | 20 | 5  | 80 |
+    | 3  | 7  | 9  |    |    |    |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_anti() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3, 5]),
+        ("b1", &vec![4, 5, 5, 7, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 8, 9, 11]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, LeftAnti).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c1 |
+    +----+----+----+
+    | 3  | 7  | 9  |
+    | 5  | 7  | 11 |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_one_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 8]),
+    );
+    let right = build_table_two_cols(("a2", &vec![10, 20, 30]), ("b1", &vec![4, 5, 6]));
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, RightAnti).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+
+    | a2 | b1 |
+    +----+----+
+    | 30 | 6  |
+    +----+----+
+    ");
+
+    let left2 = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 8]),
+    );
+    let right2 = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left2.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right2.schema())?) as _,
+    )];
+
+    let (_, batches2) = join_collect(left2, right2, on, RightAnti).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches2), @r"
+    +----+----+----+
+    | a2 | b1 | c2 |
+    +----+----+----+
+    | 30 | 6  | 90 |
+    +----+----+----+
+    ");
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_two_two() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 8]),
+    );
+    let right = build_table_two_cols(("a2", &vec![10, 20, 30]), ("b1", &vec![4, 5, 6]));
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, RightAnti).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+
+    | a2 | b1 |
+    +----+----+
+    | 10 | 4  |
+    | 20 | 5  |
+    | 30 | 6  |
+    +----+----+
+    ");
+
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 8]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, RightAnti).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a2 | b1 | c2 |",
+        "+----+----+----+",
+        "| 10 | 4  | 70 |",
+        "| 20 | 5  | 80 |",
+        "| 30 | 6  | 90 |",
+        "+----+----+----+",
+    ];
+    // The output order is important as SMJ preserves sortedness
+    assert_batches_eq!(expected, &batches);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_two_with_filter() -> Result<()> {
+    let left = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c1", &vec![30]));
+    let right = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c2", &vec![20]));
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("c2", 1)),
+            Operator::Gt,
+            Arc::new(Column::new("c1", 0)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, true),
+            Field::new("c2", DataType::Int32, true),
+        ])),
+    );
+    let (_, batches) =
+        join_collect_with_filter(left, right, on, filter, RightAnti).await?;
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c2 |
+    +----+----+----+
+    | 1  | 10 | 20 |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_filtered_with_mismatched_columns() -> Result<()> {
+    let left = build_table_two_cols(("a1", &vec![31, 31]), ("b1", &vec![32, 33]));
+    let right = build_table(
+        ("a2", &vec![31, 31]),
+        ("b2", &vec![32, 35]),
+        ("c2", &vec![108, 109]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+        ),
+    ];
+
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("b1", 0)),
+            Operator::LtEq,
+            Arc::new(Column::new("c2", 1)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 1,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("b1", DataType::Int32, false),
+            Field::new("c2", DataType::Int32, false),
+        ])),
+    );
+
+    let (_, batches) =
+        join_collect_with_filter(left, right, on, filter, RightAnti).await?;
+
+    let expected = [
+        "+----+----+-----+",
+        "| a2 | b2 | c2  |",
+        "+----+----+-----+",
+        "| 31 | 35 | 109 |",
+        "+----+----+-----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_with_nulls() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(0), Some(1), Some(2), Some(2), Some(3)]),
+        ("b1", &vec![Some(3), Some(4), Some(5), None, Some(6)]),
+        ("c2", &vec![Some(60), None, Some(80), Some(85), Some(90)]),
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(1), Some(2), Some(2), Some(3)]),
+        ("b1", &vec![Some(4), Some(5), None, Some(6)]), // null in key field
+        ("c2", &vec![Some(7), Some(8), Some(8), None]), // null in non-key field
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, RightAnti).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c2 |
+    +----+----+----+
+    | 2  |    | 8  |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_with_nulls_with_options() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(1), Some(2), Some(1), Some(0), Some(2)]),
+        ("b1", &vec![Some(4), Some(5), Some(5), None, Some(5)]),
+        ("c1", &vec![Some(7), Some(8), Some(8), Some(60), None]),
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(3), Some(2), Some(2), Some(1)]),
+        ("b1", &vec![None, Some(5), Some(5), Some(4)]), // null in key field
+        ("c2", &vec![Some(9), None, Some(8), Some(7)]), // null in non-key field
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect_with_options(
+        left,
+        right,
+        on,
+        RightAnti,
+        vec![
+            SortOptions {
+                descending: true,
+                nulls_first: false,
+            };
+            2
+        ],
+        NullEquality::NullEqualsNull,
+    )
+    .await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c2 |
+    +----+----+----+
+    | 3  |    | 9  |
+    | 2  | 5  |    |
+    | 2  | 5  | 8  |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_anti_output_two_batches() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 8]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a2", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) =
+        join_collect_batch_size_equals_two(left, right, on, LeftAnti).await?;
+    assert_eq!(batches.len(), 2);
+    assert_eq!(batches[0].num_rows(), 2);
+    assert_eq!(batches[1].num_rows(), 1);
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c1 |
+    +----+----+----+
+    | 1  | 4  | 7  |
+    | 2  | 5  | 8  |
+    | 2  | 5  | 8  |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_semi() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]), // 5 is double on the right
+        ("c2", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, LeftSemi).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+
+    | a1 | b1 | c1 |
+    +----+----+----+
+    | 1  | 4  | 7  |
+    | 2  | 5  | 8  |
+    | 2  | 5  | 8  |
+    +----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_one() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![10, 20, 30, 40]),
+        ("b1", &vec![4, 5, 5, 6]),
+        ("c1", &vec![70, 80, 90, 100]),
+    );
+    let right = build_table(
+        ("a2", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]),
+        ("c2", &vec![7, 8, 8, 9]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, RightSemi).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a2 | b1 | c2 |",
+        "+----+----+----+",
+        "| 1  | 4  | 7  |",
+        "| 2  | 5  | 8  |",
+        "| 2  | 5  | 8  |",
+        "+----+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_two() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 6]),
+        ("c1", &vec![70, 80, 90, 100]),
+    );
+    let right = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]),
+        ("c2", &vec![7, 8, 8, 9]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, RightSemi).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a1 | b1 | c2 |",
+        "+----+----+----+",
+        "| 1  | 4  | 7  |",
+        "| 2  | 5  | 8  |",
+        "| 2  | 5  | 8  |",
+        "+----+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_two_with_filter() -> Result<()> {
+    let left = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c1", &vec![30]));
+    let right = build_table(("a1", &vec![1]), ("b1", &vec![10]), ("c2", &vec![20]));
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+    let filter = JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("c2", 1)),
+            Operator::Lt,
+            Arc::new(Column::new("c1", 0)),
+        )),
+        vec![
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Left,
+            },
+            ColumnIndex {
+                index: 2,
+                side: JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, true),
+            Field::new("c2", DataType::Int32, true),
+        ])),
+    );
+    let (_, batches) =
+        join_collect_with_filter(left, right, on, filter, RightSemi).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a1 | b1 | c2 |",
+        "+----+----+----+",
+        "| 1  | 10 | 20 |",
+        "+----+----+----+",
+    ];
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_with_nulls() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(0), Some(1), Some(2), Some(2), Some(3)]),
+        ("b1", &vec![Some(3), Some(4), Some(5), None, Some(6)]),
+        ("c2", &vec![Some(60), None, Some(80), Some(85), Some(90)]),
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(1), Some(2), Some(2), Some(3)]),
+        ("b1", &vec![Some(4), Some(5), None, Some(6)]), // null in key field
+        ("c2", &vec![Some(7), Some(8), Some(8), None]), // null in non-key field
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect(left, right, on, RightSemi).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a1 | b1 | c2 |",
+        "+----+----+----+",
+        "| 1  | 4  | 7  |",
+        "| 2  | 5  | 8  |",
+        "| 3  | 6  |    |",
+        "+----+----+----+",
+    ];
+    // The output order is important as SMJ preserves sortedness
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_with_nulls_with_options() -> Result<()> {
+    let left = build_table_i32_nullable(
+        ("a1", &vec![Some(3), Some(2), Some(1), Some(0), Some(2)]),
+        ("b1", &vec![None, Some(5), Some(4), None, Some(5)]),
+        ("c2", &vec![Some(90), Some(80), Some(70), Some(60), None]),
+    );
+    let right = build_table_i32_nullable(
+        ("a1", &vec![Some(3), Some(2), Some(2), Some(1)]),
+        ("b1", &vec![None, Some(5), Some(5), Some(4)]), // null in key field
+        ("c2", &vec![Some(9), None, Some(8), Some(7)]), // null in non-key field
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) = join_collect_with_options(
+        left,
+        right,
+        on,
+        RightSemi,
+        vec![
+            SortOptions {
+                descending: true,
+                nulls_first: false,
+            };
+            2
+        ],
+        NullEquality::NullEqualsNull,
+    )
+    .await?;
+
+    let expected = [
+        "+----+----+----+",
+        "| a1 | b1 | c2 |",
+        "+----+----+----+",
+        "| 3  |    | 9  |",
+        "| 2  | 5  |    |",
+        "| 2  | 5  | 8  |",
+        "| 1  | 4  | 7  |",
+        "+----+----+----+",
+    ];
+    // The output order is important as SMJ preserves sortedness
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_semi_output_two_batches() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 6]),
+        ("c1", &vec![70, 80, 90, 100]),
+    );
+    let right = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]),
+        ("c2", &vec![7, 8, 8, 9]),
+    );
+    let on = vec![
+        (
+            Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+        ),
+        (
+            Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+            Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+        ),
+    ];
+
+    let (_, batches) =
+        join_collect_batch_size_equals_two(left, right, on, RightSemi).await?;
+    let expected = [
+        "+----+----+----+",
+        "| a1 | b1 | c2 |",
+        "+----+----+----+",
+        "| 1  | 4  | 7  |",
+        "| 2  | 5  | 8  |",
+        "| 2  | 5  | 8  |",
+        "+----+----+----+",
+    ];
+    assert_eq!(batches.len(), 2);
+    assert_eq!(batches[0].num_rows(), 2);
+    assert_eq!(batches[1].num_rows(), 1);
+    assert_batches_eq!(expected, &batches);
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_mark() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30, 40]),
+        ("b1", &vec![4, 4, 5, 6]), // 5 is double on the right
+        ("c2", &vec![60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, LeftMark).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+-------+
+    | a1 | b1 | c1 | mark  |
+    +----+----+----+-------+
+    | 1  | 4  | 7  | true  |
+    | 2  | 5  | 8  | true  |
+    | 2  | 5  | 8  | true  |
+    | 3  | 7  | 9  | false |
+    +----+----+----+-------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_mark() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![1, 2, 2, 3]),
+        ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right
+        ("c1", &vec![7, 8, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30, 40]),
+        ("b1", &vec![4, 4, 5, 6]), // 5 is double on the left
+        ("c2", &vec![60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, RightMark).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+-------+
+    | a2 | b1 | c2 | mark  |
+    +----+----+----+-------+
+    | 10 | 4  | 60 | true  |
+    | 20 | 4  | 70 | true  |
+    | 30 | 5  | 80 | true  |
+    | 40 | 6  | 90 | false |
+    +----+----+----+-------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_with_duplicated_column_names() -> Result<()> {
+    let left = build_table(
+        ("a", &vec![1, 2, 3]),
+        ("b", &vec![4, 5, 7]),
+        ("c", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a", &vec![10, 20, 30]),
+        ("b", &vec![1, 2, 7]),
+        ("c", &vec![70, 80, 90]),
+    );
+    let on = vec![(
+        // join on a=b so there are duplicate column names on unjoined columns
+        Arc::new(Column::new_with_schema("a", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +---+---+---+----+---+----+
+    | a | b | c | a  | b | c  |
+    +---+---+---+----+---+----+
+    | 1 | 4 | 7 | 10 | 1 | 70 |
+    | 2 | 5 | 8 | 20 | 2 | 80 |
+    +---+---+---+----+---+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_date32() -> Result<()> {
+    let left = build_date_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![19107, 19108, 19108]), // this has a repetition
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_date_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![19107, 19108, 19109]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +------------+------------+------------+------------+------------+------------+
+    | a1         | b1         | c1         | a2         | b1         | c2         |
+    +------------+------------+------------+------------+------------+------------+
+    | 1970-01-02 | 2022-04-25 | 1970-01-08 | 1970-01-11 | 2022-04-25 | 1970-03-12 |
+    | 1970-01-03 | 2022-04-26 | 1970-01-09 | 1970-01-21 | 2022-04-26 | 1970-03-22 |
+    | 1970-01-04 | 2022-04-26 | 1970-01-10 | 1970-01-21 | 2022-04-26 | 1970-03-22 |
+    +------------+------------+------------+------------+------------+------------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_date64() -> Result<()> {
+    let left = build_date64_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![1650703441000, 1650903441000, 1650903441000]), // this has a repetition
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_date64_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![1650703441000, 1650503441000, 1650903441000]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+    | a1                      | b1                  | c1                      | a2                      | b1                  | c2                      |
+    +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+    | 1970-01-01T00:00:00.001 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.007 | 1970-01-01T00:00:00.010 | 2022-04-23T08:44:01 | 1970-01-01T00:00:00.070 |
+    | 1970-01-01T00:00:00.002 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.008 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
+    | 1970-01-01T00:00:00.003 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.009 | 1970-01-01T00:00:00.030 | 2022-04-25T16:17:21 | 1970-01-01T00:00:00.090 |
+    +-------------------------+---------------------+-------------------------+-------------------------+---------------------+-------------------------+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_binary() -> Result<()> {
+    let left = build_binary_table(
+        (
+            "a1",
+            &vec![
+                &[0xc0, 0xff, 0xee],
+                &[0xde, 0xca, 0xde],
+                &[0xfa, 0xca, 0xde],
+            ],
+        ),
+        ("b1", &vec![5, 10, 15]), // this has a repetition
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_binary_table(
+        (
+            "a1",
+            &vec![
+                &[0xc0, 0xff, 0xee],
+                &[0xde, 0xca, 0xde],
+                &[0xfa, 0xca, 0xde],
+            ],
+        ),
+        ("b2", &vec![105, 110, 115]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +--------+----+----+--------+-----+----+
+    | a1     | b1 | c1 | a1     | b2  | c2 |
+    +--------+----+----+--------+-----+----+
+    | c0ffee | 5  | 7  | c0ffee | 105 | 70 |
+    | decade | 10 | 8  | decade | 110 | 80 |
+    | facade | 15 | 9  | facade | 115 | 90 |
+    +--------+----+----+--------+-----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_fixed_size_binary() -> Result<()> {
+    let left = build_fixed_size_binary_table(
+        (
+            "a1",
+            &vec![
+                &[0xc0, 0xff, 0xee],
+                &[0xde, 0xca, 0xde],
+                &[0xfa, 0xca, 0xde],
+            ],
+        ),
+        ("b1", &vec![5, 10, 15]), // this has a repetition
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_fixed_size_binary_table(
+        (
+            "a1",
+            &vec![
+                &[0xc0, 0xff, 0xee],
+                &[0xde, 0xca, 0xde],
+                &[0xfa, 0xca, 0xde],
+            ],
+        ),
+        ("b2", &vec![105, 110, 115]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("a1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("a1", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Inner).await?;
+
+    // The output order is important as SMJ preserves sortedness
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +--------+----+----+--------+-----+----+
+    | a1     | b1 | c1 | a1     | b2  | c2 |
+    +--------+----+----+--------+-----+----+
+    | c0ffee | 5  | 7  | c0ffee | 105 | 70 |
+    | decade | 10 | 8  | decade | 110 | 80 |
+    | facade | 15 | 9  | facade | 115 | 90 |
+    +--------+----+----+--------+-----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_sort_order() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![0, 1, 2, 3, 4, 5]),
+        ("b1", &vec![3, 4, 5, 6, 6, 7]),
+        ("c1", &vec![4, 5, 6, 7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![0, 10, 20, 30, 40]),
+        ("b2", &vec![2, 4, 6, 6, 8]),
+        ("c2", &vec![50, 60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Left).await?;
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 0  | 3  | 4  |    |    |    |
+    | 1  | 4  | 5  | 10 | 4  | 60 |
+    | 2  | 5  | 6  |    |    |    |
+    | 3  | 6  | 7  | 20 | 6  | 70 |
+    | 3  | 6  | 7  | 30 | 6  | 80 |
+    | 4  | 6  | 8  | 20 | 6  | 70 |
+    | 4  | 6  | 8  | 30 | 6  | 80 |
+    | 5  | 7  | 9  |    |    |    |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_sort_order() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![0, 1, 2, 3]),
+        ("b1", &vec![3, 4, 5, 7]),
+        ("c1", &vec![6, 7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![0, 10, 20, 30]),
+        ("b2", &vec![2, 4, 5, 6]),
+        ("c2", &vec![60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Right).await?;
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    |    |    |    | 0  | 2  | 60 |
+    | 1  | 4  | 7  | 10 | 4  | 70 |
+    | 2  | 5  | 8  | 20 | 5  | 80 |
+    |    |    |    | 30 | 6  | 90 |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_left_multiple_batches() -> Result<()> {
+    let left_batch_1 = build_table_i32(
+        ("a1", &vec![0, 1, 2]),
+        ("b1", &vec![3, 4, 5]),
+        ("c1", &vec![4, 5, 6]),
+    );
+    let left_batch_2 = build_table_i32(
+        ("a1", &vec![3, 4, 5, 6]),
+        ("b1", &vec![6, 6, 7, 9]),
+        ("c1", &vec![7, 8, 9, 9]),
+    );
+    let right_batch_1 = build_table_i32(
+        ("a2", &vec![0, 10, 20]),
+        ("b2", &vec![2, 4, 6]),
+        ("c2", &vec![50, 60, 70]),
+    );
+    let right_batch_2 = build_table_i32(
+        ("a2", &vec![30, 40]),
+        ("b2", &vec![6, 8]),
+        ("c2", &vec![80, 90]),
+    );
+    let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
+    let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Left).await?;
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    | 0  | 3  | 4  |    |    |    |
+    | 1  | 4  | 5  | 10 | 4  | 60 |
+    | 2  | 5  | 6  |    |    |    |
+    | 3  | 6  | 7  | 20 | 6  | 70 |
+    | 3  | 6  | 7  | 30 | 6  | 80 |
+    | 4  | 6  | 8  | 20 | 6  | 70 |
+    | 4  | 6  | 8  | 30 | 6  | 80 |
+    | 5  | 7  | 9  |    |    |    |
+    | 6  | 9  | 9  |    |    |    |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_right_multiple_batches() -> Result<()> {
+    let right_batch_1 = build_table_i32(
+        ("a2", &vec![0, 1, 2]),
+        ("b2", &vec![3, 4, 5]),
+        ("c2", &vec![4, 5, 6]),
+    );
+    let right_batch_2 = build_table_i32(
+        ("a2", &vec![3, 4, 5, 6]),
+        ("b2", &vec![6, 6, 7, 9]),
+        ("c2", &vec![7, 8, 9, 9]),
+    );
+    let left_batch_1 = build_table_i32(
+        ("a1", &vec![0, 10, 20]),
+        ("b1", &vec![2, 4, 6]),
+        ("c1", &vec![50, 60, 70]),
+    );
+    let left_batch_2 = build_table_i32(
+        ("a1", &vec![30, 40]),
+        ("b1", &vec![6, 8]),
+        ("c1", &vec![80, 90]),
+    );
+    let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
+    let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Right).await?;
+    assert_snapshot!(batches_to_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    |    |    |    | 0  | 3  | 4  |
+    | 10 | 4  | 60 | 1  | 4  | 5  |
+    |    |    |    | 2  | 5  | 6  |
+    | 20 | 6  | 70 | 3  | 6  | 7  |
+    | 30 | 6  | 80 | 3  | 6  | 7  |
+    | 20 | 6  | 70 | 4  | 6  | 8  |
+    | 30 | 6  | 80 | 4  | 6  | 8  |
+    |    |    |    | 5  | 7  | 9  |
+    |    |    |    | 6  | 9  | 9  |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn join_full_multiple_batches() -> Result<()> {
+    let left_batch_1 = build_table_i32(
+        ("a1", &vec![0, 1, 2]),
+        ("b1", &vec![3, 4, 5]),
+        ("c1", &vec![4, 5, 6]),
+    );
+    let left_batch_2 = build_table_i32(
+        ("a1", &vec![3, 4, 5, 6]),
+        ("b1", &vec![6, 6, 7, 9]),
+        ("c1", &vec![7, 8, 9, 9]),
+    );
+    let right_batch_1 = build_table_i32(
+        ("a2", &vec![0, 10, 20]),
+        ("b2", &vec![2, 4, 6]),
+        ("c2", &vec![50, 60, 70]),
+    );
+    let right_batch_2 = build_table_i32(
+        ("a2", &vec![30, 40]),
+        ("b2", &vec![6, 8]),
+        ("c2", &vec![80, 90]),
+    );
+    let left = build_table_from_batches(vec![left_batch_1, left_batch_2]);
+    let right = build_table_from_batches(vec![right_batch_1, right_batch_2]);
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+
+    let (_, batches) = join_collect(left, right, on, Full).await?;
+    assert_snapshot!(batches_to_sort_string(&batches), @r"
+    +----+----+----+----+----+----+
+    | a1 | b1 | c1 | a2 | b2 | c2 |
+    +----+----+----+----+----+----+
+    |    |    |    | 0  | 2  | 50 |
+    |    |    |    | 40 | 8  | 90 |
+    | 0  | 3  | 4  |    |    |    |
+    | 1  | 4  | 5  | 10 | 4  | 60 |
+    | 2  | 5  | 6  |    |    |    |
+    | 3  | 6  | 7  | 20 | 6  | 70 |
+    | 3  | 6  | 7  | 30 | 6  | 80 |
+    | 4  | 6  | 8  | 20 | 6  | 70 |
+    | 4  | 6  | 8  | 30 | 6  | 80 |
+    | 5  | 7  | 9  |    |    |    |
+    | 6  | 9  | 9  |    |    |    |
+    +----+----+----+----+----+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn overallocation_single_batch_no_spill() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![0, 1, 2, 3, 4, 5]),
+        ("b1", &vec![1, 2, 3, 4, 5, 6]),
+        ("c1", &vec![4, 5, 6, 7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![0, 10, 20, 30, 40]),
+        ("b2", &vec![1, 3, 4, 6, 8]),
+        ("c2", &vec![50, 60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+    let sort_options = vec![SortOptions::default(); on.len()];
+
+    let join_types = vec![
+        Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark, RightMark,
+    ];
+
+    // Disable DiskManager to prevent spilling
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(100, 1.0)
+        .with_disk_manager_builder(
+            DiskManagerBuilder::default().with_mode(DiskManagerMode::Disabled),
+        )
+        .build_arc()?;
+    let session_config = SessionConfig::default().with_batch_size(50);
+
+    for join_type in join_types {
+        let task_ctx = TaskContext::default()
+            .with_session_config(session_config.clone())
+            .with_runtime(Arc::clone(&runtime));
+        let task_ctx = Arc::new(task_ctx);
+
+        let join = join_with_options(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            join_type,
+            sort_options.clone(),
+            NullEquality::NullEqualsNothing,
+        )?;
+
+        let stream = join.execute(0, task_ctx)?;
+        let err = common::collect(stream).await.unwrap_err();
+
+        assert_contains!(err.to_string(), "Failed to allocate additional");
+        assert_contains!(err.to_string(), "SMJStream[0]");
+        assert_contains!(err.to_string(), "Disk spilling disabled");
+        assert!(join.metrics().is_some());
+        assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
+        assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
+        assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
+    }
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn overallocation_multi_batch_no_spill() -> Result<()> {
+    let left_batch_1 = build_table_i32(
+        ("a1", &vec![0, 1]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![4, 5]),
+    );
+    let left_batch_2 = build_table_i32(
+        ("a1", &vec![2, 3]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![6, 7]),
+    );
+    let left_batch_3 = build_table_i32(
+        ("a1", &vec![4, 5]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![8, 9]),
+    );
+    let right_batch_1 = build_table_i32(
+        ("a2", &vec![0, 10]),
+        ("b2", &vec![1, 1]),
+        ("c2", &vec![50, 60]),
+    );
+    let right_batch_2 = build_table_i32(
+        ("a2", &vec![20, 30]),
+        ("b2", &vec![1, 1]),
+        ("c2", &vec![70, 80]),
+    );
+    let right_batch_3 =
+        build_table_i32(("a2", &vec![40]), ("b2", &vec![1]), ("c2", &vec![90]));
+    let left = build_table_from_batches(vec![left_batch_1, left_batch_2, left_batch_3]);
+    let right =
+        build_table_from_batches(vec![right_batch_1, right_batch_2, right_batch_3]);
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+    let sort_options = vec![SortOptions::default(); on.len()];
+
+    let join_types = vec![
+        Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark, RightMark,
+    ];
+
+    // Disable DiskManager to prevent spilling
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(100, 1.0)
+        .with_disk_manager_builder(
+            DiskManagerBuilder::default().with_mode(DiskManagerMode::Disabled),
+        )
+        .build_arc()?;
+    let session_config = SessionConfig::default().with_batch_size(50);
+
+    for join_type in join_types {
+        let task_ctx = TaskContext::default()
+            .with_session_config(session_config.clone())
+            .with_runtime(Arc::clone(&runtime));
+        let task_ctx = Arc::new(task_ctx);
+        let join = join_with_options(
+            Arc::clone(&left),
+            Arc::clone(&right),
+            on.clone(),
+            join_type,
+            sort_options.clone(),
+            NullEquality::NullEqualsNothing,
+        )?;
+
+        let stream = join.execute(0, task_ctx)?;
+        let err = common::collect(stream).await.unwrap_err();
+
+        assert_contains!(err.to_string(), "Failed to allocate additional");
+        assert_contains!(err.to_string(), "SMJStream[0]");
+        assert_contains!(err.to_string(), "Disk spilling disabled");
+        assert!(join.metrics().is_some());
+        assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
+        assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
+        assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
+    }
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn overallocation_single_batch_spill() -> Result<()> {
+    let left = build_table(
+        ("a1", &vec![0, 1, 2, 3, 4, 5]),
+        ("b1", &vec![1, 2, 3, 4, 5, 6]),
+        ("c1", &vec![4, 5, 6, 7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![0, 10, 20, 30, 40]),
+        ("b2", &vec![1, 3, 4, 6, 8]),
+        ("c2", &vec![50, 60, 70, 80, 90]),
+    );
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+    let sort_options = vec![SortOptions::default(); on.len()];
+
+    let join_types = [
+        Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark, RightMark,
+    ];
+
+    // Enable DiskManager to allow spilling
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(100, 1.0)
+        .with_disk_manager_builder(
+            DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
+        )
+        .build_arc()?;
+
+    for batch_size in [1, 50] {
+        let session_config = SessionConfig::default().with_batch_size(batch_size);
+
+        for join_type in &join_types {
+            let task_ctx = TaskContext::default()
+                .with_session_config(session_config.clone())
+                .with_runtime(Arc::clone(&runtime));
+            let task_ctx = Arc::new(task_ctx);
+
+            let join = join_with_options(
+                Arc::clone(&left),
+                Arc::clone(&right),
+                on.clone(),
+                *join_type,
+                sort_options.clone(),
+                NullEquality::NullEqualsNothing,
+            )?;
+
+            let stream = join.execute(0, task_ctx)?;
+            let spilled_join_result = common::collect(stream).await.unwrap();
+
+            assert!(join.metrics().is_some());
+            assert!(join.metrics().unwrap().spill_count().unwrap() > 0);
+            assert!(join.metrics().unwrap().spilled_bytes().unwrap() > 0);
+            assert!(join.metrics().unwrap().spilled_rows().unwrap() > 0);
+
+            // Run the test with no spill configuration as
+            let task_ctx_no_spill =
+                TaskContext::default().with_session_config(session_config.clone());
+            let task_ctx_no_spill = Arc::new(task_ctx_no_spill);
+
+            let join = join_with_options(
+                Arc::clone(&left),
+                Arc::clone(&right),
+                on.clone(),
+                *join_type,
+                sort_options.clone(),
+                NullEquality::NullEqualsNothing,
+            )?;
+            let stream = join.execute(0, task_ctx_no_spill)?;
+            let no_spilled_join_result = common::collect(stream).await.unwrap();
+
+            assert!(join.metrics().is_some());
+            assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
+            assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
+            assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
+            // Compare spilled and non spilled data to check spill logic doesn't corrupt the data
+            assert_eq!(spilled_join_result, no_spilled_join_result);
+        }
+    }
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn overallocation_multi_batch_spill() -> Result<()> {
+    let left_batch_1 = build_table_i32(
+        ("a1", &vec![0, 1]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![4, 5]),
+    );
+    let left_batch_2 = build_table_i32(
+        ("a1", &vec![2, 3]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![6, 7]),
+    );
+    let left_batch_3 = build_table_i32(
+        ("a1", &vec![4, 5]),
+        ("b1", &vec![1, 1]),
+        ("c1", &vec![8, 9]),
+    );
+    let right_batch_1 = build_table_i32(
+        ("a2", &vec![0, 10]),
+        ("b2", &vec![1, 1]),
+        ("c2", &vec![50, 60]),
+    );
+    let right_batch_2 = build_table_i32(
+        ("a2", &vec![20, 30]),
+        ("b2", &vec![1, 1]),
+        ("c2", &vec![70, 80]),
+    );
+    let right_batch_3 =
+        build_table_i32(("a2", &vec![40]), ("b2", &vec![1]), ("c2", &vec![90]));
+    let left = build_table_from_batches(vec![left_batch_1, left_batch_2, left_batch_3]);
+    let right =
+        build_table_from_batches(vec![right_batch_1, right_batch_2, right_batch_3]);
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b2", &right.schema())?) as _,
+    )];
+    let sort_options = vec![SortOptions::default(); on.len()];
+
+    let join_types = [
+        Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark, RightMark,
+    ];
+
+    // Enable DiskManager to allow spilling
+    let runtime = RuntimeEnvBuilder::new()
+        .with_memory_limit(500, 1.0)
+        .with_disk_manager_builder(
+            DiskManagerBuilder::default().with_mode(DiskManagerMode::OsTmpDirectory),
+        )
+        .build_arc()?;
+
+    for batch_size in [1, 50] {
+        let session_config = SessionConfig::default().with_batch_size(batch_size);
+
+        for join_type in &join_types {
+            let task_ctx = TaskContext::default()
+                .with_session_config(session_config.clone())
+                .with_runtime(Arc::clone(&runtime));
+            let task_ctx = Arc::new(task_ctx);
+            let join = join_with_options(
+                Arc::clone(&left),
+                Arc::clone(&right),
+                on.clone(),
+                *join_type,
+                sort_options.clone(),
+                NullEquality::NullEqualsNothing,
+            )?;
+
+            let stream = join.execute(0, task_ctx)?;
+            let spilled_join_result = common::collect(stream).await.unwrap();
+            assert!(join.metrics().is_some());
+            assert!(join.metrics().unwrap().spill_count().unwrap() > 0);
+            assert!(join.metrics().unwrap().spilled_bytes().unwrap() > 0);
+            assert!(join.metrics().unwrap().spilled_rows().unwrap() > 0);
+
+            // Run the test with no spill configuration as
+            let task_ctx_no_spill =
+                TaskContext::default().with_session_config(session_config.clone());
+            let task_ctx_no_spill = Arc::new(task_ctx_no_spill);
+
+            let join = join_with_options(
+                Arc::clone(&left),
+                Arc::clone(&right),
+                on.clone(),
+                *join_type,
+                sort_options.clone(),
+                NullEquality::NullEqualsNothing,
+            )?;
+            let stream = join.execute(0, task_ctx_no_spill)?;
+            let no_spilled_join_result = common::collect(stream).await.unwrap();
+
+            assert!(join.metrics().is_some());
+            assert_eq!(join.metrics().unwrap().spill_count(), Some(0));
+            assert_eq!(join.metrics().unwrap().spilled_bytes(), Some(0));
+            assert_eq!(join.metrics().unwrap().spilled_rows(), Some(0));
+            // Compare spilled and non spilled data to check spill logic doesn't corrupt the data
+            assert_eq!(spilled_join_result, no_spilled_join_result);
+        }
+    }
+
+    Ok(())
+}
+
+fn build_joined_record_batches() -> Result<JoinedRecordBatches> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, true),
+        Field::new("b", DataType::Int32, true),
+        Field::new("x", DataType::Int32, true),
+        Field::new("y", DataType::Int32, true),
+    ]));
+
+    let mut batches = JoinedRecordBatches {
+        joined_batches: BatchCoalescer::new(Arc::clone(&schema), 8192),
+        filter_metadata: crate::joins::sort_merge_join::filter::FilterMetadata::new(),
+    };
+
+    // Insert already prejoined non-filtered rows
+    batches.joined_batches.push_batch(RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![10, 10])),
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![11, 9])),
+        ],
+    )?)?;
+
+    batches.joined_batches.push_batch(RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![11])),
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![12])),
+        ],
+    )?)?;
+
+    batches.joined_batches.push_batch(RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![12, 12])),
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![11, 13])),
+        ],
+    )?)?;
+
+    batches.joined_batches.push_batch(RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![13])),
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![12])),
+        ],
+    )?)?;
+
+    batches.joined_batches.push_batch(RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![14, 14])),
+            Arc::new(Int32Array::from(vec![1, 1])),
+            Arc::new(Int32Array::from(vec![12, 11])),
+        ],
+    )?)?;
+
+    let streamed_indices = vec![0, 0];
+    batches
+        .filter_metadata
+        .batch_ids
+        .extend(vec![0; streamed_indices.len()]);
+    batches
+        .filter_metadata
+        .row_indices
+        .extend(&UInt64Array::from(streamed_indices));
+
+    let streamed_indices = vec![1];
+    batches
+        .filter_metadata
+        .batch_ids
+        .extend(vec![0; streamed_indices.len()]);
+    batches
+        .filter_metadata
+        .row_indices
+        .extend(&UInt64Array::from(streamed_indices));
+
+    let streamed_indices = vec![0, 0];
+    batches
+        .filter_metadata
+        .batch_ids
+        .extend(vec![1; streamed_indices.len()]);
+    batches
+        .filter_metadata
+        .row_indices
+        .extend(&UInt64Array::from(streamed_indices));
+
+    let streamed_indices = vec![0];
+    batches
+        .filter_metadata
+        .batch_ids
+        .extend(vec![2; streamed_indices.len()]);
+    batches
+        .filter_metadata
+        .row_indices
+        .extend(&UInt64Array::from(streamed_indices));
+
+    let streamed_indices = vec![0, 0];
+    batches
+        .filter_metadata
+        .batch_ids
+        .extend(vec![3; streamed_indices.len()]);
+    batches
+        .filter_metadata
+        .row_indices
+        .extend(&UInt64Array::from(streamed_indices));
+
+    batches
+        .filter_metadata
+        .filter_mask
+        .extend(&BooleanArray::from(vec![true, false]));
+    batches
+        .filter_metadata
+        .filter_mask
+        .extend(&BooleanArray::from(vec![true]));
+    batches
+        .filter_metadata
+        .filter_mask
+        .extend(&BooleanArray::from(vec![false, true]));
+    batches
+        .filter_metadata
+        .filter_mask
+        .extend(&BooleanArray::from(vec![false]));
+    batches
+        .filter_metadata
+        .filter_mask
+        .extend(&BooleanArray::from(vec![false, false]));
+
+    Ok(batches)
+}
+
+#[tokio::test]
+async fn test_left_outer_join_filtered_mask() -> Result<()> {
+    let mut joined_batches = build_joined_record_batches()?;
+    let schema = joined_batches.joined_batches.schema();
+
+    let output = joined_batches.concat_batches(&schema)?;
+    let out_mask = joined_batches.filter_metadata.filter_mask.finish();
+    let out_indices = joined_batches.filter_metadata.row_indices.finish();
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0]),
+            &[0usize],
+            &BooleanArray::from(vec![true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![true, false, false, false, false, false, false, false])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0]),
+            &[0usize],
+            &BooleanArray::from(vec![false]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![false, false, false, false, false, false, false, false])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0]),
+            &[0usize; 2],
+            &BooleanArray::from(vec![true, true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![true, true, false, false, false, false, false, false])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0, 0]),
+            &[0usize; 3],
+            &BooleanArray::from(vec![true, true, true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![true, true, true, false, false, false, false, false])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0, 0]),
+            &[0usize; 3],
+            &BooleanArray::from(vec![true, false, true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![
+            Some(true),
+            None,
+            Some(true),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false)
+        ])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0, 0]),
+            &[0usize; 3],
+            &BooleanArray::from(vec![false, false, true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![
+            None,
+            None,
+            Some(true),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false)
+        ])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0, 0]),
+            &[0usize; 3],
+            &BooleanArray::from(vec![false, true, true]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![
+            None,
+            Some(true),
+            Some(true),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false)
+        ])
+    );
+
+    assert_eq!(
+        get_corrected_filter_mask(
+            Left,
+            &UInt64Array::from(vec![0, 0, 0]),
+            &[0usize; 3],
+            &BooleanArray::from(vec![false, false, false]),
+            output.num_rows()
+        )
+        .unwrap(),
+        BooleanArray::from(vec![
+            None,
+            None,
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false),
+            Some(false)
+        ])
+    );
+
+    let corrected_mask = get_corrected_filter_mask(
+        Left,
+        &out_indices,
+        &joined_batches.filter_metadata.batch_ids,
+        &out_mask,
+        output.num_rows(),
+    )
+    .unwrap();
+
+    assert_eq!(
+        corrected_mask,
+        BooleanArray::from(vec![
+            Some(true),
+            None,
+            Some(true),
+            None,
+            Some(true),
+            Some(false),
+            None,
+            Some(false)
+        ])
+    );
+
+    let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
+
+    assert_snapshot!(batches_to_string(&[filtered_rb]), @r"
+    +---+----+---+----+
+    | a | b  | x | y  |
+    +---+----+---+----+
+    | 1 | 10 | 1 | 11 |
+    | 1 | 11 | 1 | 12 |
+    | 1 | 12 | 1 | 13 |
+    +---+----+---+----+
+    ");
+
+    // output null rows
+
+    let null_mask = arrow::compute::not(&corrected_mask)?;
+    assert_eq!(
+        null_mask,
+        BooleanArray::from(vec![
+            Some(false),
+            None,
+            Some(false),
+            None,
+            Some(false),
+            Some(true),
+            None,
+            Some(true)
+        ])
+    );
+
+    let null_joined_batch = filter_record_batch(&output, &null_mask)?;
+
+    assert_snapshot!(batches_to_string(&[null_joined_batch]), @r"
+    +---+----+---+----+
+    | a | b  | x | y  |
+    +---+----+---+----+
+    | 1 | 13 | 1 | 12 |
+    | 1 | 14 | 1 | 11 |
+    +---+----+---+----+
+    ");
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_semi_join_filtered_mask() -> Result<()> {
+    for join_type in [LeftSemi, RightSemi] {
+        let mut joined_batches = build_joined_record_batches()?;
+        let schema = joined_batches.joined_batches.schema();
+
+        let output = joined_batches.concat_batches(&schema)?;
+        let out_mask = joined_batches.filter_metadata.filter_mask.finish();
+        let out_indices = joined_batches.filter_metadata.row_indices.finish();
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0]),
+                &[0usize],
+                &BooleanArray::from(vec![true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![true])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0]),
+                &[0usize],
+                &BooleanArray::from(vec![false]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0]),
+                &[0usize; 2],
+                &BooleanArray::from(vec![true, true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![Some(true), None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![true, true, true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![Some(true), None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![true, false, true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![Some(true), None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, false, true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, Some(true),])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, true, true]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, Some(true), None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, false, false]),
+                output.num_rows()
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, None])
+        );
+
+        let corrected_mask = get_corrected_filter_mask(
+            join_type,
+            &out_indices,
+            &joined_batches.filter_metadata.batch_ids,
+            &out_mask,
+            output.num_rows(),
+        )
+        .unwrap();
+
+        assert_eq!(
+            corrected_mask,
+            BooleanArray::from(vec![
+                Some(true),
+                None,
+                Some(true),
+                None,
+                Some(true),
+                None,
+                None,
+                None
+            ])
+        );
+
+        let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
+
+        assert_batches_eq!(
+            &[
+                "+---+----+---+----+",
+                "| a | b  | x | y  |",
+                "+---+----+---+----+",
+                "| 1 | 10 | 1 | 11 |",
+                "| 1 | 11 | 1 | 12 |",
+                "| 1 | 12 | 1 | 13 |",
+                "+---+----+---+----+",
+            ],
+            &[filtered_rb]
+        );
+
+        // output null rows
+        let null_mask = arrow::compute::not(&corrected_mask)?;
+        assert_eq!(
+            null_mask,
+            BooleanArray::from(vec![
+                Some(false),
+                None,
+                Some(false),
+                None,
+                Some(false),
+                None,
+                None,
+                None
+            ])
+        );
+
+        let null_joined_batch = filter_record_batch(&output, &null_mask)?;
+
+        assert_batches_eq!(
+            &[
+                "+---+---+---+---+",
+                "| a | b | x | y |",
+                "+---+---+---+---+",
+                "+---+---+---+---+",
+            ],
+            &[null_joined_batch]
+        );
+    }
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_anti_join_filtered_mask() -> Result<()> {
+    for join_type in [LeftAnti, RightAnti] {
+        let mut joined_batches = build_joined_record_batches()?;
+        let schema = joined_batches.joined_batches.schema();
+
+        let output = joined_batches.concat_batches(&schema)?;
+        let out_mask = joined_batches.filter_metadata.filter_mask.finish();
+        let out_indices = joined_batches.filter_metadata.row_indices.finish();
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0]),
+                &[0usize],
+                &BooleanArray::from(vec![true]),
+                1
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0]),
+                &[0usize],
+                &BooleanArray::from(vec![false]),
+                1
+            )
+            .unwrap(),
+            BooleanArray::from(vec![Some(true)])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0]),
+                &[0usize; 2],
+                &BooleanArray::from(vec![true, true]),
+                2
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![true, true, true]),
+                3
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![true, false, true]),
+                3
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, false, true]),
+                3
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, true, true]),
+                3
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, None])
+        );
+
+        assert_eq!(
+            get_corrected_filter_mask(
+                join_type,
+                &UInt64Array::from(vec![0, 0, 0]),
+                &[0usize; 3],
+                &BooleanArray::from(vec![false, false, false]),
+                3
+            )
+            .unwrap(),
+            BooleanArray::from(vec![None, None, Some(true)])
+        );
+
+        let corrected_mask = get_corrected_filter_mask(
+            join_type,
+            &out_indices,
+            &joined_batches.filter_metadata.batch_ids,
+            &out_mask,
+            output.num_rows(),
+        )
+        .unwrap();
+
+        assert_eq!(
+            corrected_mask,
+            BooleanArray::from(vec![
+                None,
+                None,
+                None,
+                None,
+                None,
+                Some(true),
+                None,
+                Some(true)
+            ])
+        );
+
+        let filtered_rb = filter_record_batch(&output, &corrected_mask)?;
+
+        allow_duplicates! {
+            assert_snapshot!(batches_to_string(&[filtered_rb]), @r"
+            +---+----+---+----+
+            | a | b  | x | y  |
+            +---+----+---+----+
+            | 1 | 13 | 1 | 12 |
+            | 1 | 14 | 1 | 11 |
+            +---+----+---+----+
+            ");
+        }
+
+        // output null rows
+        let null_mask = arrow::compute::not(&corrected_mask)?;
+        assert_eq!(
+            null_mask,
+            BooleanArray::from(vec![
+                None,
+                None,
+                None,
+                None,
+                None,
+                Some(false),
+                None,
+                Some(false),
+            ])
+        );
+
+        let null_joined_batch = filter_record_batch(&output, &null_mask)?;
+
+        allow_duplicates! {
+            assert_snapshot!(batches_to_string(&[null_joined_batch]), @r"
+            +---+---+---+---+
+            | a | b | x | y |
+            +---+---+---+---+
+            +---+---+---+---+
+            ");
+        }
+    }
+
+    Ok(())
+}
+
+#[test]
+fn test_partition_statistics() -> Result<()> {
+    use crate::ExecutionPlan;
+    use datafusion_common::stats::Precision;
+
+    let left = build_table(
+        ("a1", &vec![1, 2, 3]),
+        ("b1", &vec![4, 5, 5]),
+        ("c1", &vec![7, 8, 9]),
+    );
+    let right = build_table(
+        ("a2", &vec![10, 20, 30]),
+        ("b1", &vec![4, 5, 6]),
+        ("c2", &vec![70, 80, 90]),
+    );
+
+    let on = vec![(
+        Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+        Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+    )];
+
+    // Test different join types to ensure partition_statistics works correctly for all
+    let join_types = vec![
+        (Inner, 6),     // left cols + right cols
+        (Left, 6),      // left cols + right cols
+        (Right, 6),     // left cols + right cols
+        (Full, 6),      // left cols + right cols
+        (LeftSemi, 3),  // only left cols
+        (LeftAnti, 3),  // only left cols
+        (RightSemi, 3), // only right cols
+        (RightAnti, 3), // only right cols
+    ];
+
+    for (join_type, expected_cols) in join_types {
+        let join_exec =
+            join(Arc::clone(&left), Arc::clone(&right), on.clone(), join_type)?;
+
+        // Test aggregate statistics (partition = None)
+        // Should return meaningful statistics computed from both inputs
+        let stats = join_exec.partition_statistics(None)?;
+        assert_eq!(
+            stats.column_statistics.len(),
+            expected_cols,
+            "Aggregate stats column count failed for {join_type:?}"
+        );
+        // Verify that aggregate statistics have a meaningful num_rows (not Absent)
+        assert!(
+            stats.num_rows != Precision::Absent,
+            "Aggregate stats should have meaningful num_rows for {join_type:?}, got {:?}",
+            stats.num_rows
+        );
+
+        // Test partition-specific statistics (partition = Some(0))
+        // The implementation correctly passes `partition` to children.
+        // Since the child TestMemoryExec returns unknown stats for specific partitions,
+        // the join output will also have Absent num_rows. This is expected behavior
+        // as the statistics depend on what the children can provide.
+        let partition_stats = join_exec.partition_statistics(Some(0))?;
+        assert_eq!(
+            partition_stats.column_statistics.len(),
+            expected_cols,
+            "Partition stats column count failed for {join_type:?}"
+        );
+        // When children return unknown stats, the join's partition stats will be Absent
+        assert!(
+            partition_stats.num_rows == Precision::Absent,
+            "Partition stats should have Absent num_rows when children return unknown for {join_type:?}, got {:?}",
+            partition_stats.num_rows
+        );
+    }
+
+    Ok(())
+}
+
+fn build_batches(
+    a: (&str, &[Vec<bool>]),
+    b: (&str, &[Vec<i32>]),
+    c: (&str, &[Vec<i32>]),
+) -> (Vec<RecordBatch>, SchemaRef) {
+    assert_eq!(a.1.len(), b.1.len());
+    let mut batches = vec![];
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new(a.0, DataType::Boolean, false),
+        Field::new(b.0, DataType::Int32, false),
+        Field::new(c.0, DataType::Int32, false),
+    ]));
+
+    for i in 0..a.1.len() {
+        batches.push(
+            RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![
+                    Arc::new(BooleanArray::from(a.1[i].clone())),
+                    Arc::new(Int32Array::from(b.1[i].clone())),
+                    Arc::new(Int32Array::from(c.1[i].clone())),
+                ],
+            )
+            .unwrap(),
+        );
+    }
+    let schema = batches[0].schema();
+    (batches, schema)
+}
+
+fn build_batched_finish_barrier_table(
+    a: (&str, &[Vec<bool>]),
+    b: (&str, &[Vec<i32>]),
+    c: (&str, &[Vec<i32>]),
+) -> (Arc<BarrierExec>, Arc<TestMemoryExec>) {
+    let (batches, schema) = build_batches(a, b, c);
+
+    let memory_exec = TestMemoryExec::try_new_exec(
+        std::slice::from_ref(&batches),
+        Arc::clone(&schema),
+        None,
+    )
+    .unwrap();
+
+    let barrier_exec = Arc::new(
+        BarrierExec::new(vec![batches], schema)
+            .with_log(false)
+            .without_start_barrier()
+            .with_finish_barrier(),
+    );
+
+    (barrier_exec, memory_exec)
+}
+
+/// Concat and sort batches by all the columns to make sure we can compare them with different join
+fn prepare_record_batches_for_cmp(output: Vec<RecordBatch>) -> RecordBatch {
+    let output_batch = arrow::compute::concat_batches(output[0].schema_ref(), &output)
+        .expect("failed to concat batches");
+
+    // Sort on all columns to make sure we have a deterministic order for the assertion
+    let sort_columns = output_batch
+        .columns()
+        .iter()
+        .map(|c| SortColumn {
+            values: Arc::clone(c),
+            options: None,
+        })
+        .collect::<Vec<_>>();
+
+    let sorted_columns =
+        arrow::compute::lexsort(&sort_columns, None).expect("failed to sort");
+
+    RecordBatch::try_new(output_batch.schema(), sorted_columns)
+        .expect("failed to create batch")
+}
+
+#[expect(clippy::too_many_arguments)]
+async fn join_get_stream_and_get_expected(
+    left: Arc<dyn ExecutionPlan>,
+    right: Arc<dyn ExecutionPlan>,
+    oracle_left: Arc<dyn ExecutionPlan>,
+    oracle_right: Arc<dyn ExecutionPlan>,
+    on: JoinOn,
+    join_type: JoinType,
+    filter: Option<JoinFilter>,
+    batch_size: usize,
+) -> Result<(SendableRecordBatchStream, RecordBatch)> {
+    let sort_options = vec![SortOptions::default(); on.len()];
+    let null_equality = NullEquality::NullEqualsNothing;
+    let task_ctx = Arc::new(
+        TaskContext::default()
+            .with_session_config(SessionConfig::default().with_batch_size(batch_size)),
+    );
+
+    let expected_output = {
+        let oracle = HashJoinExec::try_new(
+            oracle_left,
+            oracle_right,
+            on.clone(),
+            filter.clone(),
+            &join_type,
+            None,
+            PartitionMode::Partitioned,
+            null_equality,
+            false,
+        )?;
+
+        let stream = oracle.execute(0, Arc::clone(&task_ctx))?;
+
+        let batches = common::collect(stream).await?;
+
+        prepare_record_batches_for_cmp(batches)
+    };
+
+    let join = SortMergeJoinExec::try_new(
+        left,
+        right,
+        on,
+        filter,
+        join_type,
+        sort_options,
+        null_equality,
+    )?;
+
+    let stream = join.execute(0, task_ctx)?;
+
+    Ok((stream, expected_output))
+}
+
+fn generate_data_for_emit_early_test(
+    batch_size: usize,
+    number_of_batches: usize,
+    join_type: JoinType,
+) -> (
+    Arc<BarrierExec>,
+    Arc<BarrierExec>,
+    Arc<TestMemoryExec>,
+    Arc<TestMemoryExec>,
+) {
+    let number_of_rows_per_batch = number_of_batches * batch_size;
+    // Prepare data
+    let left_a1 = (0..number_of_rows_per_batch as i32)
+        .chunks(batch_size)
+        .into_iter()
+        .map(|chunk| chunk.collect::<Vec<_>>())
+        .collect::<Vec<_>>();
+    let left_b1 = (0..1000000)
+        .filter(|item| {
+            match join_type {
+                LeftAnti | RightAnti => {
+                    let remainder = item % (batch_size as i32);
+
+                    // Make sure to have one that match and one that don't
+                    remainder == 0 || remainder == 1
+                }
+                // Have at least 1 that is not matching
+                _ => item % batch_size as i32 != 0,
+            }
+        })
+        .take(number_of_rows_per_batch)
+        .chunks(batch_size)
+        .into_iter()
+        .map(|chunk| chunk.collect::<Vec<_>>())
+        .collect::<Vec<_>>();
+
+    let left_bool_col1 = left_a1
+        .clone()
+        .into_iter()
+        .map(|b| {
+            b.into_iter()
+                // Mostly true but have some false that not overlap with the right column
+                .map(|a| a % (batch_size as i32) != (batch_size as i32) - 2)
+                .collect::<Vec<_>>()
+        })
+        .collect::<Vec<_>>();
+
+    let (left, left_memory) = build_batched_finish_barrier_table(
+        ("bool_col1", left_bool_col1.as_slice()),
+        ("b1", left_b1.as_slice()),
+        ("a1", left_a1.as_slice()),
+    );
+
+    let right_a2 = (0..number_of_rows_per_batch as i32)
+        .map(|item| item * 11)
+        .chunks(batch_size)
+        .into_iter()
+        .map(|chunk| chunk.collect::<Vec<_>>())
+        .collect::<Vec<_>>();
+    let right_b1 = (0..1000000)
+        .filter(|item| {
+            match join_type {
+                LeftAnti | RightAnti => {
+                    let remainder = item % (batch_size as i32);
+
+                    // Make sure to have one that match and one that don't
+                    remainder == 1 || remainder == 2
+                }
+                // Have at least 1 that is not matching
+                _ => item % batch_size as i32 != 1,
+            }
+        })
+        .take(number_of_rows_per_batch)
+        .chunks(batch_size)
+        .into_iter()
+        .map(|chunk| chunk.collect::<Vec<_>>())
+        .collect::<Vec<_>>();
+    let right_bool_col2 = right_a2
+        .clone()
+        .into_iter()
+        .map(|b| {
+            b.into_iter()
+                // Mostly true but have some false that not overlap with the left column
+                .map(|a| a % (batch_size as i32) != (batch_size as i32) - 1)
+                .collect::<Vec<_>>()
+        })
+        .collect::<Vec<_>>();
+
+    let (right, right_memory) = build_batched_finish_barrier_table(
+        ("bool_col2", right_bool_col2.as_slice()),
+        ("b1", right_b1.as_slice()),
+        ("a2", right_a2.as_slice()),
+    );
+
+    (left, right, left_memory, right_memory)
+}
+
+#[tokio::test]
+async fn test_should_emit_early_when_have_enough_data_to_emit() -> Result<()> {
+    for with_filtering in [false, true] {
+        let join_types = vec![
+            Inner, Left, Right, RightSemi, Full, LeftSemi, LeftAnti, LeftMark, RightMark,
+        ];
+        const BATCH_SIZE: usize = 10;
+        for join_type in join_types {
+            for output_batch_size in [
+                BATCH_SIZE / 3,
+                BATCH_SIZE / 2,
+                BATCH_SIZE,
+                BATCH_SIZE * 2,
+                BATCH_SIZE * 3,
+            ] {
+                // Make sure the number of batches is enough for all join type to emit some output
+                let number_of_batches = if output_batch_size <= BATCH_SIZE {
+                    100
+                } else {
+                    // Have enough batches
+                    (output_batch_size * 100) / BATCH_SIZE
+                };
+
+                let (left, right, left_memory, right_memory) =
+                    generate_data_for_emit_early_test(
+                        BATCH_SIZE,
+                        number_of_batches,
+                        join_type,
+                    );
+
+                let on = vec![(
+                    Arc::new(Column::new_with_schema("b1", &left.schema())?) as _,
+                    Arc::new(Column::new_with_schema("b1", &right.schema())?) as _,
+                )];
+
+                let join_filter = if with_filtering {
+                    let filter = JoinFilter::new(
+                        Arc::new(BinaryExpr::new(
+                            Arc::new(Column::new("bool_col1", 0)),
+                            Operator::And,
+                            Arc::new(Column::new("bool_col2", 1)),
+                        )),
+                        vec![
+                            ColumnIndex {
+                                index: 0,
+                                side: JoinSide::Left,
+                            },
+                            ColumnIndex {
+                                index: 0,
+                                side: JoinSide::Right,
+                            },
+                        ],
+                        Arc::new(Schema::new(vec![
+                            Field::new("bool_col1", DataType::Boolean, true),
+                            Field::new("bool_col2", DataType::Boolean, true),
+                        ])),
+                    );
+                    Some(filter)
+                } else {
+                    None
+                };
+
+                // select *
+                // from t1
+                // right join t2 on t1.b1 = t2.b1 and t1.bool_col1 AND t2.bool_col2
+                let (mut output_stream, expected) = join_get_stream_and_get_expected(
+                    Arc::clone(&left) as Arc<dyn ExecutionPlan>,
+                    Arc::clone(&right) as Arc<dyn ExecutionPlan>,
+                    left_memory as Arc<dyn ExecutionPlan>,
+                    right_memory as Arc<dyn ExecutionPlan>,
+                    on,
+                    join_type,
+                    join_filter,
+                    output_batch_size,
+                )
+                .await?;
+
+                let (output_batched, output_batches_after_finish) =
+                  consume_stream_until_finish_barrier_reached(left, right, &mut output_stream).await.unwrap_or_else(|e| panic!("Failed to consume stream for join type: '{join_type}' and with filtering '{with_filtering}': {e:?}"));
+
+                // It should emit more than that, but we are being generous
+                // and to make sure the test pass for all
+                const MINIMUM_OUTPUT_BATCHES: usize = 5;
+                assert!(
+                    MINIMUM_OUTPUT_BATCHES <= number_of_batches / 5,
+                    "Make sure that the minimum output batches is realistic"
+                );
+                // Test to make sure that we are not waiting for input to be fully consumed to emit some output
+                assert!(
+                    output_batched.len() >= MINIMUM_OUTPUT_BATCHES,
+                    "[Sort Merge Join {join_type}] Stream must have at least emit {} batches, but only got {} batches",
+                    MINIMUM_OUTPUT_BATCHES,
+                    output_batched.len()
+                );
+
+                // Just sanity test to make sure we are still producing valid output
+                {
+                    let output = [output_batched, output_batches_after_finish].concat();
+                    let actual_prepared = prepare_record_batches_for_cmp(output);
+
+                    assert_eq!(actual_prepared.columns(), expected.columns());
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Polls the stream until both barriers are reached,
+/// collecting the emitted batches along the way.
+///
+/// If the stream is pending for too long (5s) without emitting any batches,
+/// it panics to avoid hanging the test indefinitely.
+///
+/// Note: The left and right BarrierExec might be the input of the output stream
+async fn consume_stream_until_finish_barrier_reached(
+    left: Arc<BarrierExec>,
+    right: Arc<BarrierExec>,
+    output_stream: &mut SendableRecordBatchStream,
+) -> Result<(Vec<RecordBatch>, Vec<RecordBatch>)> {
+    let mut switch_to_finish_barrier = false;
+    let mut output_batched = vec![];
+    let mut after_finish_barrier_reached = vec![];
+    let mut background_task = JoinSet::new();
+
+    let mut start_time_since_last_ready = datafusion_common::instant::Instant::now();
+    loop {
+        let next_item = output_stream.next();
+
+        // Manual polling
+        let poll_output = futures::poll!(next_item);
+
+        // Wake up the stream to make sure it makes progress
+        tokio::task::yield_now().await;
+
+        match poll_output {
+            Poll::Ready(Some(Ok(batch))) => {
+                if batch.num_rows() == 0 {
+                    return internal_err!("join stream should not emit empty batch");
+                }
+                if switch_to_finish_barrier {
+                    after_finish_barrier_reached.push(batch);
+                } else {
+                    output_batched.push(batch);
+                }
+                start_time_since_last_ready = datafusion_common::instant::Instant::now();
+            }
+            Poll::Ready(Some(Err(e))) => return Err(e),
+            Poll::Ready(None) if !switch_to_finish_barrier => {
+                unreachable!("Stream should not end before manually finishing it")
+            }
+            Poll::Ready(None) => {
+                break;
+            }
+            Poll::Pending => {
+                if right.is_finish_barrier_reached()
+                    && left.is_finish_barrier_reached()
+                    && !switch_to_finish_barrier
+                {
+                    switch_to_finish_barrier = true;
+
+                    let right = Arc::clone(&right);
+                    background_task.spawn(async move {
+                        right.wait_finish().await;
+                    });
+                    let left = Arc::clone(&left);
+                    background_task.spawn(async move {
+                        left.wait_finish().await;
+                    });
+                }
+
+                // Make sure the test doesn't run forever
+                if start_time_since_last_ready.elapsed()
+                    > std::time::Duration::from_secs(5)
+                {
+                    return internal_err!(
+                        "Stream should have emitted data by now, but it's still pending. Output batches so far: {}",
+                        output_batched.len()
+                    );
+                }
+            }
+        }
+    }
+
+    Ok((output_batched, after_finish_barrier_reached))
+}
+
+/// Returns the column names on the schema
+fn columns(schema: &Schema) -> Vec<String> {
+    schema.fields().iter().map(|f| f.name().clone()).collect()
+}
diff --git a/datafusion/physical-plan/src/joins/stream_join_utils.rs b/datafusion/physical-plan/src/joins/stream_join_utils.rs
index 677601a12845f..beed07f562db3 100644
--- a/datafusion/physical-plan/src/joins/stream_join_utils.rs
+++ b/datafusion/physical-plan/src/joins/stream_join_utils.rs
@@ -22,20 +22,24 @@ use std::collections::{HashMap, VecDeque};
 use std::mem::size_of;
 use std::sync::Arc;
 
+use crate::joins::MapOffset;
+use crate::joins::join_hash_map::{
+    contain_hashes, get_matched_indices, get_matched_indices_with_limit_offset,
+    update_from_iter,
+};
 use crate::joins::utils::{JoinFilter, JoinHashMapType};
-use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder};
-use crate::{metrics, ExecutionPlan};
+use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder};
+use crate::{ExecutionPlan, metrics};
 
 use arrow::array::{
-    ArrowPrimitiveType, BooleanBufferBuilder, NativeAdapter, PrimitiveArray, RecordBatch,
+    ArrowPrimitiveType, BooleanArray, BooleanBufferBuilder, NativeAdapter,
+    PrimitiveArray, RecordBatch,
 };
 use arrow::compute::concat_batches;
 use arrow::datatypes::{ArrowNativeType, Schema, SchemaRef};
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::utils::memory::estimate_memory_size;
-use datafusion_common::{
-    arrow_datafusion_err, DataFusionError, HashSet, JoinSide, Result, ScalarValue,
-};
+use datafusion_common::{HashSet, JoinSide, Result, ScalarValue, arrow_datafusion_err};
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::intervals::cp_solver::ExprIntervalGraph;
@@ -47,26 +51,61 @@ use hashbrown::HashTable;
 
 /// Implementation of `JoinHashMapType` for `PruningJoinHashMap`.
 impl JoinHashMapType for PruningJoinHashMap {
-    type NextType = VecDeque<u64>;
-
     // Extend with zero
     fn extend_zero(&mut self, len: usize) {
         self.next.resize(self.next.len() + len, 0)
     }
 
-    /// Get mutable references to the hash map and the next.
-    fn get_mut(&mut self) -> (&mut HashTable<(u64, u64)>, &mut Self::NextType) {
-        (&mut self.map, &mut self.next)
+    fn update_from_iter<'a>(
+        &mut self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>,
+        deleted_offset: usize,
+    ) {
+        let slice: &mut [u64] = self.next.make_contiguous();
+        update_from_iter::<u64>(&mut self.map, slice, iter, deleted_offset);
+    }
+
+    fn get_matched_indices<'a>(
+        &self,
+        iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>,
+        deleted_offset: Option<usize>,
+    ) -> (Vec<u32>, Vec<u64>) {
+        // Flatten the deque
+        let next: Vec<u64> = self.next.iter().copied().collect();
+        get_matched_indices::<u64>(&self.map, &next, iter, deleted_offset)
     }
 
-    /// Get a reference to the hash map.
-    fn get_map(&self) -> &HashTable<(u64, u64)> {
-        &self.map
+    fn get_matched_indices_with_limit_offset(
+        &self,
+        hash_values: &[u64],
+        limit: usize,
+        offset: MapOffset,
+        input_indices: &mut Vec<u32>,
+        match_indices: &mut Vec<u64>,
+    ) -> Option<MapOffset> {
+        // Flatten the deque
+        let next: Vec<u64> = self.next.iter().copied().collect();
+        get_matched_indices_with_limit_offset::<u64>(
+            &self.map,
+            &next,
+            hash_values,
+            limit,
+            offset,
+            input_indices,
+            match_indices,
+        )
     }
 
-    /// Get a reference to the next.
-    fn get_list(&self) -> &Self::NextType {
-        &self.next
+    fn contain_hashes(&self, hash_values: &[u64]) -> BooleanArray {
+        contain_hashes(&self.map, hash_values)
+    }
+
+    fn is_empty(&self) -> bool {
+        self.map.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.map.len()
     }
 }
 
@@ -259,7 +298,7 @@ pub fn map_origin_col_to_filter_col(
 ///    the [`convert_filter_columns`] function.
 /// 5. Searches for the converted filter expression in the filter expression using the
 ///    [`check_filter_expr_contains_sort_information`] function.
-/// 6. If an exact match is found, returns the converted filter expression as [`Some(Arc<dyn PhysicalExpr>)`].
+/// 6. If an exact match is found, returns the converted filter expression as `Some(Arc<dyn PhysicalExpr>)`.
 /// 7. If all columns are not included or an exact match is not found, returns [`None`].
 ///
 /// Examples:
@@ -628,7 +667,6 @@ pub fn combine_two_batches(
 /// * `visited` - A hash set to store the visited indices.
 /// * `offset` - An offset to the indices in the `PrimitiveArray`.
 /// * `indices` - The input `PrimitiveArray` of type `T` which stores the indices to be recorded.
-///
 pub fn record_visited_indices<T: ArrowPrimitiveType>(
     visited: &mut HashSet<usize>,
     offset: usize,
@@ -656,25 +694,25 @@ pub struct StreamJoinMetrics {
     pub(crate) right: StreamJoinSideMetrics,
     /// Memory used by sides in bytes
     pub(crate) stream_memory_usage: metrics::Gauge,
-    /// Number of batches produced by this operator
-    pub(crate) output_batches: metrics::Count,
     /// Number of rows produced by this operator
-    pub(crate) output_rows: metrics::Count,
+    pub(crate) baseline_metrics: BaselineMetrics,
 }
 
 impl StreamJoinMetrics {
     pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
         let input_batches =
-            MetricBuilder::new(metrics).counter("input_batches", partition);
-        let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
+            MetricBuilder::new(metrics).counter("left_input_batches", partition);
+        let input_rows =
+            MetricBuilder::new(metrics).counter("left_input_rows", partition);
         let left = StreamJoinSideMetrics {
             input_batches,
             input_rows,
         };
 
         let input_batches =
-            MetricBuilder::new(metrics).counter("input_batches", partition);
-        let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
+            MetricBuilder::new(metrics).counter("right_input_batches", partition);
+        let input_rows =
+            MetricBuilder::new(metrics).counter("right_input_rows", partition);
         let right = StreamJoinSideMetrics {
             input_batches,
             input_rows,
@@ -683,17 +721,11 @@ impl StreamJoinMetrics {
         let stream_memory_usage =
             MetricBuilder::new(metrics).gauge("stream_memory_usage", partition);
 
-        let output_batches =
-            MetricBuilder::new(metrics).counter("output_batches", partition);
-
-        let output_rows = MetricBuilder::new(metrics).output_rows(partition);
-
         Self {
             left,
             right,
-            output_batches,
             stream_memory_usage,
-            output_rows,
+            baseline_metrics: BaselineMetrics::new(metrics, partition),
         }
     }
 }
@@ -996,46 +1028,54 @@ pub mod tests {
         let left_schema = Arc::new(left_schema);
         let right_schema = Arc::new(right_schema);
 
-        assert!(build_filter_input_order(
-            JoinSide::Left,
-            &filter,
-            &left_schema,
-            &PhysicalSortExpr {
-                expr: col("la1", left_schema.as_ref())?,
-                options: SortOptions::default(),
-            }
-        )?
-        .is_some());
-        assert!(build_filter_input_order(
-            JoinSide::Left,
-            &filter,
-            &left_schema,
-            &PhysicalSortExpr {
-                expr: col("lt1", left_schema.as_ref())?,
-                options: SortOptions::default(),
-            }
-        )?
-        .is_none());
-        assert!(build_filter_input_order(
-            JoinSide::Right,
-            &filter,
-            &right_schema,
-            &PhysicalSortExpr {
-                expr: col("ra1", right_schema.as_ref())?,
-                options: SortOptions::default(),
-            }
-        )?
-        .is_some());
-        assert!(build_filter_input_order(
-            JoinSide::Right,
-            &filter,
-            &right_schema,
-            &PhysicalSortExpr {
-                expr: col("rb1", right_schema.as_ref())?,
-                options: SortOptions::default(),
-            }
-        )?
-        .is_none());
+        assert!(
+            build_filter_input_order(
+                JoinSide::Left,
+                &filter,
+                &left_schema,
+                &PhysicalSortExpr {
+                    expr: col("la1", left_schema.as_ref())?,
+                    options: SortOptions::default(),
+                }
+            )?
+            .is_some()
+        );
+        assert!(
+            build_filter_input_order(
+                JoinSide::Left,
+                &filter,
+                &left_schema,
+                &PhysicalSortExpr {
+                    expr: col("lt1", left_schema.as_ref())?,
+                    options: SortOptions::default(),
+                }
+            )?
+            .is_none()
+        );
+        assert!(
+            build_filter_input_order(
+                JoinSide::Right,
+                &filter,
+                &right_schema,
+                &PhysicalSortExpr {
+                    expr: col("ra1", right_schema.as_ref())?,
+                    options: SortOptions::default(),
+                }
+            )?
+            .is_some()
+        );
+        assert!(
+            build_filter_input_order(
+                JoinSide::Right,
+                &filter,
+                &right_schema,
+                &PhysicalSortExpr {
+                    expr: col("rb1", right_schema.as_ref())?,
+                    options: SortOptions::default(),
+                }
+            )?
+            .is_none()
+        );
 
         Ok(())
     }
diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
index 819a3302b0626..f31cd8d446de2 100644
--- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
+++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
@@ -32,30 +32,30 @@ use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::vec;
 
+use crate::check_if_same_properties;
 use crate::common::SharedMemoryReservation;
 use crate::execution_plan::{boundedness_from_children, emission_type_from_children};
-use crate::joins::hash_join::{equal_rows_arr, update_hash};
 use crate::joins::stream_join_utils::{
+    PruningJoinHashMap, SortedFilterExpr, StreamJoinMetrics,
     calculate_filter_expr_intervals, combine_two_batches,
     convert_sort_expr_with_filter_schema, get_pruning_anti_indices,
     get_pruning_semi_indices, prepare_sorted_exprs, record_visited_indices,
-    PruningJoinHashMap, SortedFilterExpr, StreamJoinMetrics,
 };
 use crate::joins::utils::{
-    apply_join_filter_to_indices, build_batch_from_indices, build_join_schema,
-    check_join_is_valid, symmetric_join_output_partitioning, BatchSplitter,
-    BatchTransformer, ColumnIndex, JoinFilter, JoinHashMapType, JoinOn, JoinOnRef,
-    NoopBatchTransformer, StatefulStreamResult,
+    BatchSplitter, BatchTransformer, ColumnIndex, JoinFilter, JoinHashMapType, JoinOn,
+    JoinOnRef, NoopBatchTransformer, StatefulStreamResult, apply_join_filter_to_indices,
+    build_batch_from_indices, build_join_schema, check_join_is_valid, equal_rows_arr,
+    symmetric_join_output_partitioning, update_hash,
 };
 use crate::projection::{
-    join_allows_pushdown, join_table_borders, new_join_children,
-    physical_to_column_exprs, update_join_filter, update_join_on, ProjectionExec,
+    ProjectionExec, join_allows_pushdown, join_table_borders, new_join_children,
+    physical_to_column_exprs, update_join_filter, update_join_on,
 };
 use crate::{
+    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
+    PlanProperties, RecordBatchStream, SendableRecordBatchStream,
     joins::StreamJoinPartitionMode,
     metrics::{ExecutionPlanMetricsSet, MetricsSet},
-    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
-    PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
 
 use arrow::array::{
@@ -66,19 +66,23 @@ use arrow::compute::concat_batches;
 use arrow::datatypes::{ArrowNativeType, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use datafusion_common::hash_utils::create_hashes;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::utils::bisect;
-use datafusion_common::{internal_err, plan_err, HashSet, JoinSide, JoinType, Result};
-use datafusion_execution::memory_pool::MemoryConsumer;
+use datafusion_common::{
+    HashSet, JoinSide, JoinType, NullEquality, Result, assert_eq_or_internal_err,
+    plan_err,
+};
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::MemoryConsumer;
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_physical_expr::equivalence::join_equivalence_properties;
 use datafusion_physical_expr::intervals::cp_solver::ExprIntervalGraph;
-use datafusion_physical_expr::PhysicalExprRef;
-use datafusion_physical_expr_common::physical_expr::fmt_sql;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_expr_common::physical_expr::{PhysicalExprRef, fmt_sql};
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, OrderingRequirements};
 
-use ahash::RandomState;
-use futures::{ready, Stream, StreamExt};
+use datafusion_common::hash_utils::RandomState;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
+use futures::{Stream, StreamExt, ready};
 use parking_lot::Mutex;
 
 const HASHMAP_SHRINK_SCALE_FACTOR: usize = 4;
@@ -186,8 +190,8 @@ pub struct SymmetricHashJoinExec {
     metrics: ExecutionPlanMetricsSet,
     /// Information of index and left / right placement of columns
     column_indices: Vec<ColumnIndex>,
-    /// If null_equals_null is true, null == null else null != null
-    pub(crate) null_equals_null: bool,
+    /// Defines the null equality for the join.
+    pub(crate) null_equality: NullEquality,
     /// Left side sort expression(s)
     pub(crate) left_sort_exprs: Option<LexOrdering>,
     /// Right side sort expression(s)
@@ -195,7 +199,7 @@ pub struct SymmetricHashJoinExec {
     /// Partition Mode
     mode: StreamJoinPartitionMode,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl SymmetricHashJoinExec {
@@ -205,14 +209,14 @@ impl SymmetricHashJoinExec {
     /// - It is not possible to join the left and right sides on keys `on`, or
     /// - It fails to construct `SortedFilterExpr`s, or
     /// - It fails to create the [ExprIntervalGraph].
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     pub fn try_new(
         left: Arc<dyn ExecutionPlan>,
         right: Arc<dyn ExecutionPlan>,
         on: JoinOn,
         filter: Option<JoinFilter>,
         join_type: &JoinType,
-        null_equals_null: bool,
+        null_equality: NullEquality,
         left_sort_exprs: Option<LexOrdering>,
         right_sort_exprs: Option<LexOrdering>,
         mode: StreamJoinPartitionMode,
@@ -235,10 +239,9 @@ impl SymmetricHashJoinExec {
             build_join_schema(&left_schema, &right_schema, join_type);
 
         // Initialize the random state for the join operation:
-        let random_state = RandomState::with_seeds(0, 0, 0, 0);
+        let random_state = RandomState::with_seed(0);
         let schema = Arc::new(schema);
-        let cache =
-            Self::compute_properties(&left, &right, Arc::clone(&schema), *join_type, &on);
+        let cache = Self::compute_properties(&left, &right, schema, *join_type, &on)?;
         Ok(SymmetricHashJoinExec {
             left,
             right,
@@ -248,11 +251,11 @@ impl SymmetricHashJoinExec {
             random_state,
             metrics: ExecutionPlanMetricsSet::new(),
             column_indices,
-            null_equals_null,
+            null_equality,
             left_sort_exprs,
             right_sort_exprs,
             mode,
-            cache,
+            cache: Arc::new(cache),
         })
     }
 
@@ -263,7 +266,7 @@ impl SymmetricHashJoinExec {
         schema: SchemaRef,
         join_type: JoinType,
         join_on: JoinOnRef,
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
         let eq_properties = join_equivalence_properties(
             left.equivalence_properties().clone(),
@@ -274,17 +277,17 @@ impl SymmetricHashJoinExec {
             // Has alternating probe side
             None,
             join_on,
-        );
+        )?;
 
         let output_partitioning =
-            symmetric_join_output_partitioning(left, right, &join_type);
+            symmetric_join_output_partitioning(left, right, &join_type)?;
 
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             emission_type_from_children([left, right]),
             boundedness_from_children([left, right]),
-        )
+        ))
     }
 
     /// left stream
@@ -312,9 +315,9 @@ impl SymmetricHashJoinExec {
         &self.join_type
     }
 
-    /// Get null_equals_null
-    pub fn null_equals_null(&self) -> bool {
-        self.null_equals_null
+    /// Get null_equality
+    pub fn null_equality(&self) -> NullEquality {
+        self.null_equality
     }
 
     /// Get partition mode
@@ -359,6 +362,20 @@ impl SymmetricHashJoinExec {
         }
         Ok(false)
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        let left = children.swap_remove(0);
+        let right = children.swap_remove(0);
+        Self {
+            left,
+            right,
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for SymmetricHashJoinExec {
@@ -410,7 +427,7 @@ impl ExecutionPlan for SymmetricHashJoinExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -433,16 +450,14 @@ impl ExecutionPlan for SymmetricHashJoinExec {
         }
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         vec![
             self.left_sort_exprs
                 .as_ref()
-                .cloned()
-                .map(LexRequirement::from),
+                .map(|e| OrderingRequirements::from(e.clone())),
             self.right_sort_exprs
                 .as_ref()
-                .cloned()
-                .map(LexRequirement::from),
+                .map(|e| OrderingRequirements::from(e.clone())),
         ]
     }
 
@@ -450,17 +465,35 @@ impl ExecutionPlan for SymmetricHashJoinExec {
         vec![&self.left, &self.right]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn crate::PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to join keys from both sides
+        let mut tnr = TreeNodeRecursion::Continue;
+        for (left, right) in &self.on {
+            tnr = tnr.visit_sibling(|| f(left.as_ref()))?;
+            tnr = tnr.visit_sibling(|| f(right.as_ref()))?;
+        }
+        // Apply to join filter expressions if present
+        if let Some(filter) = &self.filter {
+            tnr = tnr.visit_sibling(|| f(filter.expression().as_ref()))?;
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(SymmetricHashJoinExec::try_new(
             Arc::clone(&children[0]),
             Arc::clone(&children[1]),
             self.on.clone(),
             self.filter.clone(),
             &self.join_type,
-            self.null_equals_null,
+            self.null_equality,
             self.left_sort_exprs.clone(),
             self.right_sort_exprs.clone(),
             self.mode,
@@ -471,11 +504,6 @@ impl ExecutionPlan for SymmetricHashJoinExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        // TODO stats: it is not possible in general to know the output size of joins
-        Ok(Statistics::new_unknown(&self.schema()))
-    }
-
     fn execute(
         &self,
         partition: usize,
@@ -483,12 +511,12 @@ impl ExecutionPlan for SymmetricHashJoinExec {
     ) -> Result<SendableRecordBatchStream> {
         let left_partitions = self.left.output_partitioning().partition_count();
         let right_partitions = self.right.output_partitioning().partition_count();
-        if left_partitions != right_partitions {
-            return internal_err!(
-                "Invalid SymmetricHashJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
+        assert_eq_or_internal_err!(
+            left_partitions,
+            right_partitions,
+            "Invalid SymmetricHashJoinExec, partition count mismatch {left_partitions}!={right_partitions},\
                  consider using RepartitionExec"
-            );
-        }
+        );
         // If `filter_state` and `filter` are both present, then calculate sorted
         // filter expressions for both sides, and build an expression graph.
         let (left_sorted_filter_expr, right_sorted_filter_expr, graph) = match (
@@ -549,7 +577,7 @@ impl ExecutionPlan for SymmetricHashJoinExec {
                 graph,
                 left_sorted_filter_expr,
                 right_sorted_filter_expr,
-                null_equals_null: self.null_equals_null,
+                null_equality: self.null_equality,
                 state: SHJStreamState::PullRight,
                 reservation,
                 batch_transformer: BatchSplitter::new(batch_size),
@@ -569,7 +597,7 @@ impl ExecutionPlan for SymmetricHashJoinExec {
                 graph,
                 left_sorted_filter_expr,
                 right_sorted_filter_expr,
-                null_equals_null: self.null_equals_null,
+                null_equality: self.null_equality,
                 state: SHJStreamState::PullRight,
                 reservation,
                 batch_transformer: NoopBatchTransformer::new(),
@@ -635,21 +663,18 @@ impl ExecutionPlan for SymmetricHashJoinExec {
             self.right(),
         )?;
 
-        Ok(Some(Arc::new(SymmetricHashJoinExec::try_new(
+        SymmetricHashJoinExec::try_new(
             Arc::new(new_left),
             Arc::new(new_right),
             new_on,
             new_filter,
             self.join_type(),
-            self.null_equals_null(),
-            self.right()
-                .output_ordering()
-                .map(|p| LexOrdering::new(p.to_vec())),
-            self.left()
-                .output_ordering()
-                .map(|p| LexOrdering::new(p.to_vec())),
+            self.null_equality(),
+            self.right().output_ordering().cloned(),
+            self.left().output_ordering().cloned(),
             self.partition_mode(),
-        )?)))
+        )
+        .map(|e| Some(Arc::new(e) as _))
     }
 }
 
@@ -678,8 +703,8 @@ struct SymmetricHashJoinStream<T> {
     right_sorted_filter_expr: Option<SortedFilterExpr>,
     /// Random state used for hashing initialization
     random_state: RandomState,
-    /// If null_equals_null is true, null == null else null != null
-    null_equals_null: bool,
+    /// Defines the null equality for the join.
+    null_equality: NullEquality,
     /// Metrics
     metrics: StreamJoinMetrics,
     /// Memory reservation
@@ -777,7 +802,11 @@ fn need_to_produce_result_in_final(build_side: JoinSide, join_type: JoinType) ->
     } else {
         matches!(
             join_type,
-            JoinType::Right | JoinType::RightAnti | JoinType::Full | JoinType::RightSemi
+            JoinType::Right
+                | JoinType::RightAnti
+                | JoinType::Full
+                | JoinType::RightSemi
+                | JoinType::RightMark
         )
     }
 }
@@ -798,7 +827,6 @@ fn need_to_produce_result_in_final(build_side: JoinSide, join_type: JoinType) ->
 /// # Returns
 ///
 /// A tuple of two arrays of primitive types representing the build and probe indices.
-///
 fn calculate_indices_by_join_type<L: ArrowPrimitiveType, R: ArrowPrimitiveType>(
     build_side: JoinSide,
     prune_length: usize,
@@ -811,6 +839,21 @@ where
 {
     // Store the result in a tuple
     let result = match (build_side, join_type) {
+        // For a mark join we “mark” each build‐side row with a dummy 0 in the probe‐side index
+        // if it ever matched. For example, if
+        //
+        // prune_length = 5
+        // deleted_offset = 0
+        // visited_rows = {1, 3}
+        //
+        // then we produce:
+        //
+        // build_indices = [0, 1, 2, 3, 4]
+        // probe_indices = [None, Some(0), None, Some(0), None]
+        //
+        // Example: for each build row i in [0..5):
+        //   – We always output its own index i in `build_indices`
+        //   – We output `Some(0)` in `probe_indices[i]` if row i was ever visited, else `None`
         (JoinSide::Left, JoinType::LeftMark) => {
             let build_indices = (0..prune_length)
                 .map(L::Native::from_usize)
@@ -825,6 +868,20 @@ where
                 .collect();
             (build_indices, probe_indices)
         }
+        (JoinSide::Right, JoinType::RightMark) => {
+            let build_indices = (0..prune_length)
+                .map(L::Native::from_usize)
+                .collect::<PrimitiveArray<L>>();
+            let probe_indices = (0..prune_length)
+                .map(|idx| {
+                    // For mark join we output a dummy index 0 to indicate the row had a match
+                    visited_rows
+                        .contains(&(idx + deleted_offset))
+                        .then_some(R::Native::from_usize(0).unwrap())
+                })
+                .collect();
+            (build_indices, probe_indices)
+        }
         // In the case of `Left` or `Right` join, or `Full` join, get the anti indices
         (JoinSide::Left, JoinType::Left | JoinType::LeftAnti)
         | (JoinSide::Right, JoinType::Right | JoinType::RightAnti)
@@ -902,6 +959,7 @@ pub(crate) fn build_side_determined_results(
             &probe_indices,
             column_indices,
             build_hash_joiner.build_side,
+            join_type,
         )
         .map(|batch| (batch.num_rows() > 0).then_some(batch))
     } else {
@@ -923,13 +981,13 @@ pub(crate) fn build_side_determined_results(
 /// * `probe_batch` - The second record batch to be joined.
 /// * `column_indices` - An array of columns to be selected for the result of the join.
 /// * `random_state` - The random state for the join.
-/// * `null_equals_null` - A boolean indicating whether NULL values should be treated as equal when joining.
+/// * `null_equality` - Indicates whether NULL values should be treated as equal when joining.
 ///
 /// # Returns
 ///
 /// A [Result] containing an optional record batch if the join type is not one of `LeftAnti`, `RightAnti`, `LeftSemi` or `RightSemi`.
 /// If the join type is one of the above four, the function will return [None].
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 pub(crate) fn join_with_probe_batch(
     build_hash_joiner: &mut OneSideHashJoiner,
     probe_hash_joiner: &mut OneSideHashJoiner,
@@ -939,7 +997,7 @@ pub(crate) fn join_with_probe_batch(
     probe_batch: &RecordBatch,
     column_indices: &[ColumnIndex],
     random_state: &RandomState,
-    null_equals_null: bool,
+    null_equality: NullEquality,
 ) -> Result<Option<RecordBatch>> {
     if build_hash_joiner.input_buffer.num_rows() == 0 || probe_batch.num_rows() == 0 {
         return Ok(None);
@@ -951,7 +1009,7 @@ pub(crate) fn join_with_probe_batch(
         &build_hash_joiner.on,
         &probe_hash_joiner.on,
         random_state,
-        null_equals_null,
+        null_equality,
         &mut build_hash_joiner.hashes_buffer,
         Some(build_hash_joiner.deleted_offset),
     )?;
@@ -964,6 +1022,8 @@ pub(crate) fn join_with_probe_batch(
             probe_indices,
             filter,
             build_hash_joiner.build_side,
+            None,
+            join_type,
         )?
     } else {
         (build_indices, probe_indices)
@@ -990,6 +1050,7 @@ pub(crate) fn join_with_probe_batch(
             | JoinType::LeftSemi
             | JoinType::LeftMark
             | JoinType::RightSemi
+            | JoinType::RightMark
     ) {
         Ok(None)
     } else {
@@ -1001,6 +1062,7 @@ pub(crate) fn join_with_probe_batch(
             &probe_indices,
             column_indices,
             build_hash_joiner.build_side,
+            join_type,
         )
         .map(|batch| (batch.num_rows() > 0).then_some(batch))
     }
@@ -1017,7 +1079,7 @@ pub(crate) fn join_with_probe_batch(
 /// * `build_on` - An array of columns on which the join will be performed. The columns are from the build side of the join.
 /// * `probe_on` - An array of columns on which the join will be performed. The columns are from the probe side of the join.
 /// * `random_state` - The random state for the join.
-/// * `null_equals_null` - A boolean indicating whether NULL values should be treated as equal when joining.
+/// * `null_equality` - Indicates whether NULL values should be treated as equal when joining.
 /// * `hashes_buffer` - Buffer used for probe side keys hash calculation.
 /// * `deleted_offset` - deleted offset for build side data.
 ///
@@ -1025,7 +1087,7 @@ pub(crate) fn join_with_probe_batch(
 ///
 /// A [Result] containing a tuple with two equal length arrays, representing indices of rows from build and probe side,
 /// matched by join key columns.
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 fn lookup_join_hashmap(
     build_hashmap: &PruningJoinHashMap,
     build_batch: &RecordBatch,
@@ -1033,18 +1095,12 @@ fn lookup_join_hashmap(
     build_on: &[PhysicalExprRef],
     probe_on: &[PhysicalExprRef],
     random_state: &RandomState,
-    null_equals_null: bool,
+    null_equality: NullEquality,
     hashes_buffer: &mut Vec<u64>,
     deleted_offset: Option<usize>,
 ) -> Result<(UInt64Array, UInt32Array)> {
-    let keys_values = probe_on
-        .iter()
-        .map(|c| c.evaluate(probe_batch)?.into_array(probe_batch.num_rows()))
-        .collect::<Result<Vec<_>>>()?;
-    let build_join_values = build_on
-        .iter()
-        .map(|c| c.evaluate(build_batch)?.into_array(build_batch.num_rows()))
-        .collect::<Result<Vec<_>>>()?;
+    let keys_values = evaluate_expressions_to_arrays(probe_on, probe_batch)?;
+    let build_join_values = evaluate_expressions_to_arrays(build_on, build_batch)?;
 
     hashes_buffer.clear();
     hashes_buffer.resize(probe_batch.num_rows(), 0);
@@ -1080,8 +1136,10 @@ fn lookup_join_hashmap(
     //     (5,1)
     //
     // With this approach, the lexicographic order on both the probe side and the build side is preserved.
-    let (mut matched_probe, mut matched_build) = build_hashmap
-        .get_matched_indices(hash_values.iter().enumerate().rev(), deleted_offset);
+    let (mut matched_probe, mut matched_build) = build_hashmap.get_matched_indices(
+        Box::new(hash_values.iter().enumerate().rev()),
+        deleted_offset,
+    );
 
     matched_probe.reverse();
     matched_build.reverse();
@@ -1094,7 +1152,7 @@ fn lookup_join_hashmap(
         &probe_indices,
         &build_join_values,
         &keys_values,
-        null_equals_null,
+        null_equality,
     )?;
 
     Ok((build_indices, probe_indices))
@@ -1215,7 +1273,7 @@ impl OneSideHashJoiner {
             filter_intervals.push((expr.node_index(), expr.interval().clone()))
         }
         // Update the physical expression graph using the join filter intervals:
-        graph.update_ranges(&mut filter_intervals, Interval::CERTAINLY_TRUE)?;
+        graph.update_ranges(&mut filter_intervals, Interval::TRUE)?;
         // Extract the new join filter interval for the build side:
         let calculated_build_side_interval = filter_intervals.remove(0).1;
         // If the intervals have not changed, return early without pruning:
@@ -1346,9 +1404,10 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
                     }
                 }
                 Some((batch, _)) => {
-                    self.metrics.output_batches.add(1);
-                    self.metrics.output_rows.add(batch.num_rows());
-                    return Poll::Ready(Some(Ok(batch)));
+                    return self
+                        .metrics
+                        .baseline_metrics
+                        .record_poll(Poll::Ready(Some(Ok(batch))));
                 }
             }
         }
@@ -1372,7 +1431,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
                     return Poll::Ready(Ok(StatefulStreamResult::Continue));
                 }
                 self.set_state(SHJStreamState::PullLeft);
-                Poll::Ready(self.process_batch_from_right(batch))
+                Poll::Ready(self.process_batch_from_right(&batch))
             }
             Some(Err(e)) => Poll::Ready(Err(e)),
             None => {
@@ -1401,7 +1460,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
                     return Poll::Ready(Ok(StatefulStreamResult::Continue));
                 }
                 self.set_state(SHJStreamState::PullRight);
-                Poll::Ready(self.process_batch_from_left(batch))
+                Poll::Ready(self.process_batch_from_left(&batch))
             }
             Some(Err(e)) => Poll::Ready(Err(e)),
             None => {
@@ -1430,7 +1489,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
                 if batch.num_rows() == 0 {
                     return Poll::Ready(Ok(StatefulStreamResult::Continue));
                 }
-                Poll::Ready(self.process_batch_after_right_end(batch))
+                Poll::Ready(self.process_batch_after_right_end(&batch))
             }
             Some(Err(e)) => Poll::Ready(Err(e)),
             None => {
@@ -1461,7 +1520,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
                 if batch.num_rows() == 0 {
                     return Poll::Ready(Ok(StatefulStreamResult::Continue));
                 }
-                Poll::Ready(self.process_batch_after_left_end(batch))
+                Poll::Ready(self.process_batch_after_left_end(&batch))
             }
             Some(Err(e)) => Poll::Ready(Err(e)),
             None => {
@@ -1491,7 +1550,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
 
     fn process_batch_from_right(
         &mut self,
-        batch: RecordBatch,
+        batch: &RecordBatch,
     ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
         self.perform_join_for_given_side(batch, JoinSide::Right)
             .map(|maybe_batch| {
@@ -1505,7 +1564,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
 
     fn process_batch_from_left(
         &mut self,
-        batch: RecordBatch,
+        batch: &RecordBatch,
     ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
         self.perform_join_for_given_side(batch, JoinSide::Left)
             .map(|maybe_batch| {
@@ -1519,14 +1578,14 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
 
     fn process_batch_after_left_end(
         &mut self,
-        right_batch: RecordBatch,
+        right_batch: &RecordBatch,
     ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
         self.process_batch_from_right(right_batch)
     }
 
     fn process_batch_after_right_end(
         &mut self,
-        left_batch: RecordBatch,
+        left_batch: &RecordBatch,
     ) -> Result<StatefulStreamResult<Option<RecordBatch>>> {
         self.process_batch_from_left(left_batch)
     }
@@ -1591,7 +1650,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
         size += size_of_val(&self.left_sorted_filter_expr);
         size += size_of_val(&self.right_sorted_filter_expr);
         size += size_of_val(&self.random_state);
-        size += size_of_val(&self.null_equals_null);
+        size += size_of_val(&self.null_equality);
         size += size_of_val(&self.metrics);
         size
     }
@@ -1605,7 +1664,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
     /// 5. Combines the results and returns a combined batch or `None` if no batch was produced.
     fn perform_join_for_given_side(
         &mut self,
-        probe_batch: RecordBatch,
+        probe_batch: &RecordBatch,
         probe_side: JoinSide,
     ) -> Result<Option<RecordBatch>> {
         let (
@@ -1635,7 +1694,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
         probe_side_metrics.input_batches.add(1);
         probe_side_metrics.input_rows.add(probe_batch.num_rows());
         // Update the internal state of the hash joiner for the build side:
-        probe_hash_joiner.update_internal_state(&probe_batch, &self.random_state)?;
+        probe_hash_joiner.update_internal_state(probe_batch, &self.random_state)?;
         // Join the two sides:
         let equal_result = join_with_probe_batch(
             build_hash_joiner,
@@ -1643,10 +1702,10 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
             &self.schema,
             self.join_type,
             self.filter.as_ref(),
-            &probe_batch,
+            probe_batch,
             &self.column_indices,
             &self.random_state,
-            self.null_equals_null,
+            self.null_equality,
         )?;
         // Increment the offset for the probe hash joiner:
         probe_hash_joiner.offset += probe_batch.num_rows();
@@ -1664,7 +1723,7 @@ impl<T: BatchTransformer> SymmetricHashJoinStream<T> {
             calculate_filter_expr_intervals(
                 &build_hash_joiner.input_buffer,
                 build_side_sorted_filter_expr,
-                &probe_batch,
+                probe_batch,
                 probe_side_sorted_filter_expr,
             )?;
             let prune_length = build_hash_joiner
@@ -1742,8 +1801,8 @@ mod tests {
     use datafusion_common::ScalarValue;
     use datafusion_execution::config::SessionConfig;
     use datafusion_expr::Operator;
-    use datafusion_physical_expr::expressions::{binary, col, lit, Column};
-    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+    use datafusion_physical_expr::expressions::{Column, binary, col, lit};
+    use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
     use rstest::*;
 
@@ -1802,12 +1861,18 @@ mod tests {
             on.clone(),
             filter.clone(),
             &join_type,
-            false,
+            NullEquality::NullEqualsNothing,
             Arc::clone(&task_ctx),
         )
         .await?;
         let second_batches = partitioned_hash_join_with_filter(
-            left, right, on, filter, &join_type, false, task_ctx,
+            left,
+            right,
+            on,
+            filter,
+            &join_type,
+            NullEquality::NullEqualsNothing,
+            task_ctx,
         )
         .await?;
         compare_batches(&first_batches, &second_batches);
@@ -1826,6 +1891,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -1843,7 +1909,7 @@ mod tests {
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
 
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: binary(
                 col("la1", left_schema)?,
                 Operator::Plus,
@@ -1851,11 +1917,13 @@ mod tests {
                 left_schema,
             )?,
             options: SortOptions::default(),
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ra1", right_schema)?,
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -1912,6 +1980,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -1923,14 +1992,16 @@ mod tests {
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
 
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("la1", left_schema)?,
             options: SortOptions::default(),
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ra1", right_schema)?,
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -1978,6 +2049,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2031,6 +2103,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2059,6 +2132,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2068,20 +2142,22 @@ mod tests {
         let (left_partition, right_partition) = get_or_create_table((11, 21), 8)?;
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("la1_des", left_schema)?,
             options: SortOptions {
                 descending: true,
                 nulls_first: true,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ra1_des", right_schema)?,
             options: SortOptions {
                 descending: true,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2127,20 +2203,22 @@ mod tests {
         let (left_partition, right_partition) = get_or_create_table((10, 11), 8)?;
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("l_asc_null_first", left_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("r_asc_null_first", right_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2186,20 +2264,22 @@ mod tests {
 
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("l_asc_null_last", left_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: false,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("r_asc_null_last", right_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: false,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2247,20 +2327,22 @@ mod tests {
 
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("l_desc_null_first", left_schema)?,
             options: SortOptions {
                 descending: true,
                 nulls_first: true,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("r_desc_null_first", right_schema)?,
             options: SortOptions {
                 descending: true,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2309,15 +2391,16 @@ mod tests {
 
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("la1", left_schema)?,
             options: SortOptions::default(),
-        }]);
-
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ra1", right_schema)?,
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2368,20 +2451,23 @@ mod tests {
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
         let left_sorted = vec![
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("la1", left_schema)?,
                 options: SortOptions::default(),
-            }]),
-            LexOrdering::new(vec![PhysicalSortExpr {
+            }]
+            .into(),
+            [PhysicalSortExpr {
                 expr: col("la2", left_schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
         ];
 
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ra1", right_schema)?,
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
 
         let (left, right) = create_memory_table(
             left_partition,
@@ -2431,6 +2517,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2449,20 +2536,22 @@ mod tests {
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
         let on = vec![(col("lc1", left_schema)?, col("rc1", right_schema)?)];
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("lt1", left_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("rt1", right_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2515,6 +2604,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2532,20 +2622,22 @@ mod tests {
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
         let on = vec![(col("lc1", left_schema)?, col("rc1", right_schema)?)];
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("li1", left_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("ri1", right_schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
@@ -2591,6 +2683,7 @@ mod tests {
             JoinType::LeftAnti,
             JoinType::LeftMark,
             JoinType::RightAnti,
+            JoinType::RightMark,
             JoinType::Full
         )]
         join_type: JoinType,
@@ -2608,14 +2701,16 @@ mod tests {
 
         let left_schema = &left_partition[0].schema();
         let right_schema = &right_partition[0].schema();
-        let left_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        let left_sorted = [PhysicalSortExpr {
             expr: col("l_float", left_schema)?,
             options: SortOptions::default(),
-        }]);
-        let right_sorted = LexOrdering::new(vec![PhysicalSortExpr {
+        }]
+        .into();
+        let right_sorted = [PhysicalSortExpr {
             expr: col("r_float", right_schema)?,
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
         let (left, right) = create_memory_table(
             left_partition,
             right_partition,
diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs
index 81f56c865f04a..0455fb2a1eb6e 100644
--- a/datafusion/physical-plan/src/joins/test_utils.rs
+++ b/datafusion/physical-plan/src/joins/test_utils.rs
@@ -25,15 +25,15 @@ use crate::joins::{
 };
 use crate::repartition::RepartitionExec;
 use crate::test::TestMemoryExec;
-use crate::{common, ExecutionPlan, ExecutionPlanProperties, Partitioning};
+use crate::{ExecutionPlan, ExecutionPlanProperties, Partitioning, common};
 
 use arrow::array::{
-    types::IntervalDayTime, ArrayRef, Float64Array, Int32Array, IntervalDayTimeArray,
-    RecordBatch, TimestampMillisecondArray,
+    ArrayRef, Float64Array, Int32Array, IntervalDayTimeArray, RecordBatch,
+    TimestampMillisecondArray, types::IntervalDayTime,
 };
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::pretty::pretty_format_batches;
-use datafusion_common::{Result, ScalarValue};
+use datafusion_common::{NullEquality, Result, ScalarValue};
 use datafusion_execution::TaskContext;
 use datafusion_expr::{JoinType, Operator};
 use datafusion_physical_expr::expressions::{binary, cast, col, lit};
@@ -74,7 +74,7 @@ pub async fn partitioned_sym_join_with_filter(
     on: JoinOn,
     filter: Option<JoinFilter>,
     join_type: &JoinType,
-    null_equals_null: bool,
+    null_equality: NullEquality,
     context: Arc<TaskContext>,
 ) -> Result<Vec<RecordBatch>> {
     let partition_count = 4;
@@ -101,11 +101,9 @@ pub async fn partitioned_sym_join_with_filter(
         on,
         filter,
         join_type,
-        null_equals_null,
-        left.output_ordering().map(|p| LexOrdering::new(p.to_vec())),
-        right
-            .output_ordering()
-            .map(|p| LexOrdering::new(p.to_vec())),
+        null_equality,
+        left.output_ordering().cloned(),
+        right.output_ordering().cloned(),
         StreamJoinPartitionMode::Partitioned,
     )?;
 
@@ -130,7 +128,7 @@ pub async fn partitioned_hash_join_with_filter(
     on: JoinOn,
     filter: Option<JoinFilter>,
     join_type: &JoinType,
-    null_equals_null: bool,
+    null_equality: NullEquality,
     context: Arc<TaskContext>,
 ) -> Result<Vec<RecordBatch>> {
     let partition_count = 4;
@@ -153,7 +151,8 @@ pub async fn partitioned_hash_join_with_filter(
         join_type,
         None,
         PartitionMode::Partitioned,
-        null_equals_null,
+        null_equality,
+        false, // null_aware
     )?);
 
     let mut batches = vec![];
@@ -419,12 +418,14 @@ pub fn build_sides_record_batches(
     key_cardinality: (i32, i32),
 ) -> Result<(RecordBatch, RecordBatch)> {
     let null_ratio: f64 = 0.4;
+    let duplicate_ratio = 0.4;
     let initial_range = 0..table_size;
     let index = (table_size as f64 * null_ratio).round() as i32;
     let rest_of = index..table_size;
     let ordered: ArrayRef = Arc::new(Int32Array::from_iter(
         initial_range.clone().collect::<Vec<i32>>(),
     ));
+    let random_ordered = generate_ordered_array(table_size, duplicate_ratio);
     let ordered_des = Arc::new(Int32Array::from_iter(
         initial_range.clone().rev().collect::<Vec<i32>>(),
     ));
@@ -503,6 +504,7 @@ pub fn build_sides_record_batches(
         ),
         ("li1", Arc::clone(&interval_time)),
         ("l_float", Arc::clone(&float_asc) as ArrayRef),
+        ("l_random_ordered", Arc::clone(&random_ordered) as ArrayRef),
     ])?;
     let right = RecordBatch::try_from_iter(vec![
         ("ra1", Arc::clone(&ordered)),
@@ -516,6 +518,7 @@ pub fn build_sides_record_batches(
         ("r_desc_null_first", ordered_desc_null_first),
         ("ri1", interval_time),
         ("r_float", float_asc),
+        ("r_random_ordered", random_ordered),
     ])?;
     Ok((left, right))
 }
@@ -532,9 +535,11 @@ pub fn create_memory_table(
     let right_schema = right_partition[0].schema();
     let right = TestMemoryExec::try_new(&[right_partition], right_schema, None)?
         .try_with_sort_information(right_sorted)?;
+    let left = Arc::new(left);
+    let right = Arc::new(right);
     Ok((
-        Arc::new(TestMemoryExec::update_cache(Arc::new(left))),
-        Arc::new(TestMemoryExec::update_cache(Arc::new(right))),
+        Arc::new(TestMemoryExec::update_cache(&left)),
+        Arc::new(TestMemoryExec::update_cache(&right)),
     ))
 }
 
@@ -585,3 +590,24 @@ pub(crate) fn complicated_filter(
     )?;
     binary(left_expr, Operator::And, right_expr, filter_schema)
 }
+
+fn generate_ordered_array(size: i32, duplicate_ratio: f32) -> Arc<Int32Array> {
+    let mut rng = StdRng::seed_from_u64(42);
+    let unique_count = (size as f32 * (1.0 - duplicate_ratio)) as i32;
+
+    // Generate unique random values
+    let mut values: Vec<i32> = (0..unique_count)
+        .map(|_| rng.random_range(1..500)) // Modify as per your range
+        .collect();
+
+    // Duplicate the values according to the duplicate ratio
+    for _ in 0..(size - unique_count) {
+        let index = rng.random_range(0..unique_count);
+        values.push(values[index as usize]);
+    }
+
+    // Sort the values to ensure they are ordered
+    values.sort();
+
+    Arc::new(Int32Array::from_iter(values))
+}
diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
index 4be14374249a6..3130134e253d9 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -17,6 +17,7 @@
 
 //! Join related functionality used both on logical and physical plans
 
+use std::cmp::{Ordering, min};
 use std::collections::HashSet;
 use std::fmt::{self, Debug};
 use std::future::Future;
@@ -25,42 +26,60 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use crate::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder};
+use crate::joins::SharedBitmapBuilder;
+use crate::metrics::{
+    self, BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder, MetricType,
+};
+use crate::projection::{ProjectionExec, ProjectionExpr};
 use crate::{
     ColumnStatistics, ExecutionPlan, ExecutionPlanProperties, Partitioning, Statistics,
 };
 // compatibility
 pub use super::join_filter::JoinFilter;
-pub use super::join_hash_map::{JoinHashMap, JoinHashMapType};
+pub use super::join_hash_map::JoinHashMapType;
 pub use crate::joins::{JoinOn, JoinOnRef};
 
 use arrow::array::{
-    builder::UInt64Builder, downcast_array, new_null_array, Array, ArrowPrimitiveType,
-    BooleanBufferBuilder, NativeAdapter, PrimitiveArray, RecordBatch, RecordBatchOptions,
-    UInt32Array, UInt32Builder, UInt64Array,
+    Array, ArrowPrimitiveType, BooleanBufferBuilder, NativeAdapter, PrimitiveArray,
+    RecordBatch, RecordBatchOptions, UInt32Array, UInt32Builder, UInt64Array,
+    builder::UInt64Builder, downcast_array, new_null_array,
 };
-use arrow::compute;
+use arrow::array::{
+    ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array,
+    Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int8Array,
+    Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, StringArray,
+    StringViewArray, TimestampMicrosecondArray, TimestampMillisecondArray,
+    TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt16Array,
+};
+use arrow::buffer::{BooleanBuffer, NullBuffer};
+use arrow::compute::kernels::cmp::eq;
+use arrow::compute::{self, FilterBuilder, and, take};
 use arrow::datatypes::{
     ArrowNativeType, Field, Schema, SchemaBuilder, UInt32Type, UInt64Type,
 };
+use arrow_ord::cmp::not_distinct;
+use arrow_schema::{ArrowError, DataType, SortOptions, TimeUnit};
 use datafusion_common::cast::as_boolean_array;
+use datafusion_common::hash_utils::RandomState;
+use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::stats::Precision;
-use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::{
-    plan_err, DataFusionError, JoinSide, JoinType, Result, SharedResult,
+    DataFusionError, JoinSide, JoinType, NullEquality, Result, SharedResult,
+    not_impl_err, plan_err,
 };
+use datafusion_expr::Operator;
 use datafusion_expr::interval_arithmetic::Interval;
-use datafusion_physical_expr::equivalence::add_offset_to_expr;
 use datafusion_physical_expr::expressions::Column;
-use datafusion_physical_expr::utils::{collect_columns, merge_vectors};
+use datafusion_physical_expr::utils::collect_columns;
 use datafusion_physical_expr::{
-    LexOrdering, PhysicalExpr, PhysicalExprRef, PhysicalSortExpr,
+    LexOrdering, PhysicalExpr, PhysicalExprRef, add_offset_to_expr,
+    add_offset_to_physical_sort_exprs,
 };
 
-use crate::joins::SharedBitmapBuilder;
-use crate::projection::ProjectionExec;
+use datafusion_physical_expr_common::datum::compare_op_for_nested;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use futures::future::{BoxFuture, Shared};
-use futures::{ready, FutureExt};
+use futures::{FutureExt, ready};
 use parking_lot::Mutex;
 
 /// Checks whether the schemas "left" and "right" and columns "on" represent a valid join.
@@ -114,113 +133,85 @@ fn check_join_set_is_valid(
 pub fn adjust_right_output_partitioning(
     right_partitioning: &Partitioning,
     left_columns_len: usize,
-) -> Partitioning {
-    match right_partitioning {
+) -> Result<Partitioning> {
+    let result = match right_partitioning {
         Partitioning::Hash(exprs, size) => {
             let new_exprs = exprs
                 .iter()
-                .map(|expr| add_offset_to_expr(Arc::clone(expr), left_columns_len))
-                .collect();
+                .map(|expr| add_offset_to_expr(Arc::clone(expr), left_columns_len as _))
+                .collect::<Result<_>>()?;
             Partitioning::Hash(new_exprs, *size)
         }
         result => result.clone(),
-    }
-}
-
-/// Replaces the right column (first index in the `on_column` tuple) with
-/// the left column (zeroth index in the tuple) inside `right_ordering`.
-fn replace_on_columns_of_right_ordering(
-    on_columns: &[(PhysicalExprRef, PhysicalExprRef)],
-    right_ordering: &mut LexOrdering,
-) -> Result<()> {
-    for (left_col, right_col) in on_columns {
-        right_ordering.transform(|item| {
-            let new_expr = Arc::clone(&item.expr)
-                .transform(|e| {
-                    if e.eq(right_col) {
-                        Ok(Transformed::yes(Arc::clone(left_col)))
-                    } else {
-                        Ok(Transformed::no(e))
-                    }
-                })
-                .data()
-                .expect("closure is infallible");
-            item.expr = new_expr;
-        });
-    }
-    Ok(())
-}
-
-fn offset_ordering(
-    ordering: &LexOrdering,
-    join_type: &JoinType,
-    offset: usize,
-) -> LexOrdering {
-    match join_type {
-        // In the case below, right ordering should be offsetted with the left
-        // side length, since we append the right table to the left table.
-        JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => ordering
-            .iter()
-            .map(|sort_expr| PhysicalSortExpr {
-                expr: add_offset_to_expr(Arc::clone(&sort_expr.expr), offset),
-                options: sort_expr.options,
-            })
-            .collect(),
-        _ => ordering.clone(),
-    }
+    };
+    Ok(result)
 }
 
 /// Calculate the output ordering of a given join operation.
 pub fn calculate_join_output_ordering(
-    left_ordering: &LexOrdering,
-    right_ordering: &LexOrdering,
+    left_ordering: Option<&LexOrdering>,
+    right_ordering: Option<&LexOrdering>,
     join_type: JoinType,
-    on_columns: &[(PhysicalExprRef, PhysicalExprRef)],
     left_columns_len: usize,
     maintains_input_order: &[bool],
     probe_side: Option<JoinSide>,
-) -> Option<LexOrdering> {
-    let output_ordering = match maintains_input_order {
+) -> Result<Option<LexOrdering>> {
+    match maintains_input_order {
         [true, false] => {
             // Special case, we can prefix ordering of right side with the ordering of left side.
-            if join_type == JoinType::Inner && probe_side == Some(JoinSide::Left) {
-                replace_on_columns_of_right_ordering(
-                    on_columns,
-                    &mut right_ordering.clone(),
-                )
-                .ok()?;
-                merge_vectors(
-                    left_ordering,
-                    offset_ordering(right_ordering, &join_type, left_columns_len)
-                        .as_ref(),
-                )
-            } else {
-                left_ordering.clone()
+            if join_type == JoinType::Inner
+                && probe_side == Some(JoinSide::Left)
+                && let Some(right_ordering) = right_ordering.cloned()
+            {
+                let right_offset = add_offset_to_physical_sort_exprs(
+                    right_ordering,
+                    left_columns_len as _,
+                )?;
+                return if let Some(left_ordering) = left_ordering {
+                    let mut result = left_ordering.clone();
+                    result.extend(right_offset);
+                    Ok(Some(result))
+                } else {
+                    Ok(LexOrdering::new(right_offset))
+                };
             }
+            Ok(left_ordering.cloned())
         }
         [false, true] => {
             // Special case, we can prefix ordering of left side with the ordering of right side.
             if join_type == JoinType::Inner && probe_side == Some(JoinSide::Right) {
-                replace_on_columns_of_right_ordering(
-                    on_columns,
-                    &mut right_ordering.clone(),
-                )
-                .ok()?;
-                merge_vectors(
-                    offset_ordering(right_ordering, &join_type, left_columns_len)
-                        .as_ref(),
-                    left_ordering,
-                )
-            } else {
-                offset_ordering(right_ordering, &join_type, left_columns_len)
+                return if let Some(right_ordering) = right_ordering.cloned() {
+                    let mut right_offset = add_offset_to_physical_sort_exprs(
+                        right_ordering,
+                        left_columns_len as _,
+                    )?;
+                    if let Some(left_ordering) = left_ordering {
+                        right_offset.extend(left_ordering.clone());
+                    }
+                    Ok(LexOrdering::new(right_offset))
+                } else {
+                    Ok(left_ordering.cloned())
+                };
+            }
+            let Some(right_ordering) = right_ordering else {
+                return Ok(None);
+            };
+            match join_type {
+                JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => {
+                    add_offset_to_physical_sort_exprs(
+                        right_ordering.clone(),
+                        left_columns_len as _,
+                    )
+                    .map(LexOrdering::new)
+                }
+                _ => Ok(Some(right_ordering.clone())),
             }
         }
         // Doesn't maintain ordering, output ordering is None.
-        [false, false] => return None,
+        [false, false] => Ok(None),
         [true, true] => unreachable!("Cannot maintain ordering of both sides"),
         _ => unreachable!("Join operators can not have more than two children"),
-    };
-    (!output_ordering.is_empty()).then_some(output_ordering)
+    }
 }
 
 /// Information about the index and placement (left or right) of the columns
@@ -234,7 +225,6 @@ pub struct ColumnIndex {
 
 /// Returns the output field given the input field. Outer joins may
 /// insert nulls even if the input was not null
-///
 fn output_join_field(old_field: &Field, join_type: &JoinType, is_left: bool) -> Field {
     let force_nullable = match join_type {
         JoinType::Inner => false,
@@ -246,6 +236,7 @@ fn output_join_field(old_field: &Field, join_type: &JoinType, is_left: bool) ->
         JoinType::LeftAnti => false, // doesn't introduce nulls (or can it??)
         JoinType::RightAnti => false, // doesn't introduce nulls (or can it??)
         JoinType::LeftMark => false,
+        JoinType::RightMark => false,
     };
 
     if force_nullable {
@@ -303,7 +294,7 @@ pub fn build_join_schema(
         JoinType::LeftSemi | JoinType::LeftAnti => left_fields().unzip(),
         JoinType::LeftMark => {
             let right_field = once((
-                Field::new("mark", arrow::datatypes::DataType::Boolean, false),
+                Field::new("mark", DataType::Boolean, false),
                 ColumnIndex {
                     index: 0,
                     side: JoinSide::None,
@@ -312,14 +303,33 @@ pub fn build_join_schema(
             left_fields().chain(right_field).unzip()
         }
         JoinType::RightSemi | JoinType::RightAnti => right_fields().unzip(),
+        JoinType::RightMark => {
+            let left_field = once((
+                Field::new("mark", DataType::Boolean, false),
+                ColumnIndex {
+                    index: 0,
+                    side: JoinSide::None,
+                },
+            ));
+            right_fields().chain(left_field).unzip()
+        }
+    };
+
+    let (schema1, schema2) = match join_type {
+        JoinType::Right
+        | JoinType::RightSemi
+        | JoinType::RightAnti
+        | JoinType::RightMark => (left, right),
+        _ => (right, left),
     };
 
-    let metadata = left
+    let metadata = schema1
         .metadata()
         .clone()
         .into_iter()
-        .chain(right.metadata().clone())
+        .chain(schema2.metadata().clone())
         .collect();
+
     (fields.finish().with_metadata(metadata), column_indices)
 }
 
@@ -401,15 +411,43 @@ struct PartialJoinStatistics {
     pub column_statistics: Vec<ColumnStatistics>,
 }
 
-/// Estimate the statistics for the given join's output.
+/// Estimates the output statistics for a join operation based on input statistics.
+///
+/// # Statistics Propagation
+///
+/// This function estimates join output statistics using the following approach:
+/// - **Row count estimation**: Uses the `on` parameter (equijoin keys) to estimate
+///   output cardinality via [`estimate_join_cardinality`]. The estimation is based on
+///   column-level statistics (distinct counts, min/max values) of the join keys.
+/// - **Column statistics**: Combines column statistics from both inputs. For join types
+///   that preserve all columns (Inner, Left, Right, Full), statistics from both sides
+///   are concatenated. For semi/anti joins, only the relevant side's statistics are kept.
+/// - **Byte size**: Always returns `Precision::Absent` as join output size is difficult
+///   to estimate without knowing the actual data.
+///
+/// # The `on` Parameter
+///
+/// The `on` parameter represents equijoin keys (e.g., `t1.id = t2.id`). When `on` is
+/// empty (as in NestedLoopJoinExec which handles non-equijoin predicates), the
+/// cardinality estimation cannot compute selectivity from join keys, and this function
+/// returns unknown statistics (`num_rows: Precision::Absent`).
+///
+/// # Limitations
+///
+/// - Does not account for selectivity of arbitrary join filter expressions
+///   (e.g., `(t1.v1 + t2.v1) % 2 = 0`). Such filters, common in NestedLoopJoinExec,
+///   are not factored into the cardinality estimation.
+/// - Column statistics for the output are simply combined from inputs without
+///   adjusting for join selectivity (acknowledged in the code as needing
+///   "filter selectivity analysis").
 pub(crate) fn estimate_join_statistics(
     left_stats: Statistics,
     right_stats: Statistics,
-    on: JoinOn,
+    on: &JoinOn,
     join_type: &JoinType,
     schema: &Schema,
 ) -> Result<Statistics> {
-    let join_stats = estimate_join_cardinality(join_type, left_stats, right_stats, &on);
+    let join_stats = estimate_join_cardinality(join_type, left_stats, right_stats, on);
     let (num_rows, column_statistics) = match join_stats {
         Some(stats) => (Precision::Inexact(stats.num_rows), stats.column_statistics),
         None => (Precision::Absent, Statistics::unknown_column(schema)),
@@ -533,6 +571,15 @@ fn estimate_join_cardinality(
                 column_statistics,
             })
         }
+        JoinType::RightMark => {
+            let num_rows = *right_stats.num_rows.get_value()?;
+            let mut column_statistics = right_stats.column_statistics;
+            column_statistics.push(ColumnStatistics::new_unknown());
+            Some(PartialJoinStatistics {
+                num_rows,
+                column_statistics,
+            })
+        }
     }
 }
 
@@ -549,25 +596,26 @@ fn estimate_inner_join_cardinality(
         return Some(estimation);
     };
 
+    let Statistics {
+        num_rows: left_num_rows,
+        column_statistics: left_column_statistics,
+        ..
+    } = left_stats;
+    let Statistics {
+        num_rows: right_num_rows,
+        column_statistics: right_column_statistics,
+        ..
+    } = right_stats;
+
     // The algorithm here is partly based on the non-histogram selectivity estimation
     // from Spark's Catalyst optimizer.
     let mut join_selectivity = Precision::Absent;
-    for (left_stat, right_stat) in left_stats
-        .column_statistics
+    for (left_stat, right_stat) in left_column_statistics
         .iter()
-        .zip(right_stats.column_statistics.iter())
+        .zip(right_column_statistics.iter())
     {
-        // Break if any of statistics bounds are undefined
-        if left_stat.min_value.get_value().is_none()
-            || left_stat.max_value.get_value().is_none()
-            || right_stat.min_value.get_value().is_none()
-            || right_stat.max_value.get_value().is_none()
-        {
-            return None;
-        }
-
-        let left_max_distinct = max_distinct_count(&left_stats.num_rows, left_stat);
-        let right_max_distinct = max_distinct_count(&right_stats.num_rows, right_stat);
+        let left_max_distinct = max_distinct_count(&left_num_rows, left_stat);
+        let right_max_distinct = max_distinct_count(&right_num_rows, right_stat);
         let max_distinct = left_max_distinct.max(&right_max_distinct);
         if max_distinct.get_value().is_some() {
             // Seems like there are a few implementations of this algorithm that implement
@@ -652,7 +700,8 @@ fn estimate_disjoint_inputs(
 /// Estimate the number of maximum distinct values that can be present in the
 /// given column from its statistics. If distinct_count is available, uses it
 /// directly. Otherwise, if the column is numeric and has min/max values, it
-/// estimates the maximum distinct count from those.
+/// estimates the maximum distinct count from those. Otherwise, the num_rows
+/// is used.
 fn max_distinct_count(
     num_rows: &Precision<usize>,
     stats: &ColumnStatistics,
@@ -684,27 +733,25 @@ fn max_distinct_count(
             // Cap the estimate using the number of possible values:
             if let (Some(min), Some(max)) =
                 (stats.min_value.get_value(), stats.max_value.get_value())
-            {
-                if let Some(range_dc) = Interval::try_new(min.clone(), max.clone())
+                && let Some(range_dc) = Interval::try_new(min.clone(), max.clone())
                     .ok()
                     .and_then(|e| e.cardinality())
+            {
+                let range_dc = range_dc as usize;
+                // Note that the `unwrap` calls in the below statement are safe.
+                return if result == Precision::Absent
+                    || &range_dc < result.get_value().unwrap()
                 {
-                    let range_dc = range_dc as usize;
-                    // Note that the `unwrap` calls in the below statement are safe.
-                    return if matches!(result, Precision::Absent)
-                        || &range_dc < result.get_value().unwrap()
+                    if stats.min_value.is_exact().unwrap()
+                        && stats.max_value.is_exact().unwrap()
                     {
-                        if stats.min_value.is_exact().unwrap()
-                            && stats.max_value.is_exact().unwrap()
-                        {
-                            Precision::Exact(range_dc)
-                        } else {
-                            Precision::Inexact(range_dc)
-                        }
+                        Precision::Exact(range_dc)
                     } else {
-                        result
-                    };
-                }
+                        Precision::Inexact(range_dc)
+                    }
+                } else {
+                    result
+                };
             }
 
             result
@@ -775,6 +822,23 @@ impl<T: 'static> OnceFut<T> {
     }
 }
 
+/// Should we use a bitmap to track each incoming right batch's each row's
+/// 'joined' status.
+///
+/// For example in right joins, we have to use a bit map to track matched
+/// right side rows, and later enter a `EmitRightUnmatched` stage to emit
+/// unmatched right rows.
+pub(crate) fn need_produce_right_in_final(join_type: JoinType) -> bool {
+    matches!(
+        join_type,
+        JoinType::Full
+            | JoinType::Right
+            | JoinType::RightAnti
+            | JoinType::RightMark
+            | JoinType::RightSemi
+    )
+}
+
 /// Some type `join_type` of join need to maintain the matched indices bit map for the left side, and
 /// use the bit map to generate the part of result of the join.
 ///
@@ -794,9 +858,10 @@ pub(crate) fn need_produce_result_in_final(join_type: JoinType) -> bool {
 pub(crate) fn get_final_indices_from_shared_bitmap(
     shared_bitmap: &SharedBitmapBuilder,
     join_type: JoinType,
+    piecewise: bool,
 ) -> (UInt64Array, UInt32Array) {
     let bitmap = shared_bitmap.lock();
-    get_final_indices_from_bit_map(&bitmap, join_type)
+    get_final_indices_from_bit_map(&bitmap, join_type, piecewise)
 }
 
 /// In the end of join execution, need to use bit map of the matched
@@ -811,16 +876,22 @@ pub(crate) fn get_final_indices_from_shared_bitmap(
 pub(crate) fn get_final_indices_from_bit_map(
     left_bit_map: &BooleanBufferBuilder,
     join_type: JoinType,
+    // We add a flag for whether this is being passed from the `PiecewiseMergeJoin`
+    // because the bitmap can be for left + right `JoinType`s
+    piecewise: bool,
 ) -> (UInt64Array, UInt32Array) {
     let left_size = left_bit_map.len();
-    if join_type == JoinType::LeftMark {
+    if join_type == JoinType::LeftMark || (join_type == JoinType::RightMark && piecewise)
+    {
         let left_indices = (0..left_size as u64).collect::<UInt64Array>();
         let right_indices = (0..left_size)
             .map(|idx| left_bit_map.get_bit(idx).then_some(0))
             .collect::<UInt32Array>();
         return (left_indices, right_indices);
     }
-    let left_indices = if join_type == JoinType::LeftSemi {
+    let left_indices = if join_type == JoinType::LeftSemi
+        || (join_type == JoinType::RightSemi && piecewise)
+    {
         (0..left_size)
             .filter_map(|idx| (left_bit_map.get_bit(idx)).then_some(idx as u64))
             .collect::<UInt64Array>()
@@ -839,6 +910,7 @@ pub(crate) fn get_final_indices_from_bit_map(
     (left_indices, right_indices)
 }
 
+#[expect(clippy::too_many_arguments)]
 pub(crate) fn apply_join_filter_to_indices(
     build_input_buffer: &RecordBatch,
     probe_batch: &RecordBatch,
@@ -846,24 +918,59 @@ pub(crate) fn apply_join_filter_to_indices(
     probe_indices: UInt32Array,
     filter: &JoinFilter,
     build_side: JoinSide,
+    max_intermediate_size: Option<usize>,
+    join_type: JoinType,
 ) -> Result<(UInt64Array, UInt32Array)> {
     if build_indices.is_empty() && probe_indices.is_empty() {
         return Ok((build_indices, probe_indices));
     };
 
-    let intermediate_batch = build_batch_from_indices(
-        filter.schema(),
-        build_input_buffer,
-        probe_batch,
-        &build_indices,
-        &probe_indices,
-        filter.column_indices(),
-        build_side,
-    )?;
-    let filter_result = filter
-        .expression()
-        .evaluate(&intermediate_batch)?
-        .into_array(intermediate_batch.num_rows())?;
+    let filter_result = if let Some(max_size) = max_intermediate_size {
+        let mut filter_results =
+            Vec::with_capacity(build_indices.len().div_ceil(max_size));
+
+        for i in (0..build_indices.len()).step_by(max_size) {
+            let end = min(build_indices.len(), i + max_size);
+            let len = end - i;
+            let intermediate_batch = build_batch_from_indices(
+                filter.schema(),
+                build_input_buffer,
+                probe_batch,
+                &build_indices.slice(i, len),
+                &probe_indices.slice(i, len),
+                filter.column_indices(),
+                build_side,
+                join_type,
+            )?;
+            let filter_result = filter
+                .expression()
+                .evaluate(&intermediate_batch)?
+                .into_array(intermediate_batch.num_rows())?;
+            filter_results.push(filter_result);
+        }
+
+        let filter_refs: Vec<&dyn Array> =
+            filter_results.iter().map(|a| a.as_ref()).collect();
+
+        compute::concat(&filter_refs)?
+    } else {
+        let intermediate_batch = build_batch_from_indices(
+            filter.schema(),
+            build_input_buffer,
+            probe_batch,
+            &build_indices,
+            &probe_indices,
+            filter.column_indices(),
+            build_side,
+            join_type,
+        )?;
+
+        filter
+            .expression()
+            .evaluate(&intermediate_batch)?
+            .into_array(intermediate_batch.num_rows())?
+    };
+
     let mask = as_boolean_array(&filter_result)?;
 
     let left_filtered = compute::filter(&build_indices, mask)?;
@@ -874,8 +981,20 @@ pub(crate) fn apply_join_filter_to_indices(
     ))
 }
 
+/// Creates a [RecordBatch] with zero columns but the given row count.
+/// Used when a join has an empty projection (e.g. `SELECT count(1) ...`).
+fn new_empty_schema_batch(schema: &Schema, row_count: usize) -> Result<RecordBatch> {
+    let options = RecordBatchOptions::new().with_row_count(Some(row_count));
+    Ok(RecordBatch::try_new_with_options(
+        Arc::new(schema.clone()),
+        vec![],
+        &options,
+    )?)
+}
+
 /// Returns a new [RecordBatch] by combining the `left` and `right` according to `indices`.
 /// The resulting batch has [Schema] `schema`.
+#[expect(clippy::too_many_arguments)]
 pub(crate) fn build_batch_from_indices(
     schema: &Schema,
     build_input_buffer: &RecordBatch,
@@ -884,17 +1003,17 @@ pub(crate) fn build_batch_from_indices(
     probe_indices: &UInt32Array,
     column_indices: &[ColumnIndex],
     build_side: JoinSide,
+    join_type: JoinType,
 ) -> Result<RecordBatch> {
     if schema.fields().is_empty() {
-        let options = RecordBatchOptions::new()
-            .with_match_field_names(true)
-            .with_row_count(Some(build_indices.len()));
-
-        return Ok(RecordBatch::try_new_with_options(
-            Arc::new(schema.clone()),
-            vec![],
-            &options,
-        )?);
+        // For RightAnti and RightSemi joins, after `adjust_indices_by_join_type`
+        // the build_indices were untouched so only probe_indices hold the actual
+        // row count.
+        let row_count = match join_type {
+            JoinType::RightAnti | JoinType::RightSemi => probe_indices.len(),
+            _ => build_indices.len(),
+        };
+        return new_empty_schema_batch(schema, row_count);
     }
 
     // build the columns of the new [RecordBatch]:
@@ -904,7 +1023,7 @@ pub(crate) fn build_batch_from_indices(
 
     for column_index in column_indices {
         let array = if column_index.side == JoinSide::None {
-            // LeftMark join, the mark column is a true if the indices is not null, otherwise it will be false
+            // For mark joins, the mark column is a true if the indices is not null, otherwise it will be false
             Arc::new(compute::is_not_null(probe_indices)?)
         } else if column_index.side == build_side {
             let array = build_input_buffer.column(column_index.index);
@@ -915,7 +1034,7 @@ pub(crate) fn build_batch_from_indices(
                 assert_eq!(build_indices.null_count(), build_indices.len());
                 new_null_array(array.data_type(), build_indices.len())
             } else {
-                compute::take(array.as_ref(), build_indices, None)?
+                take(array.as_ref(), build_indices, None)?
             }
         } else {
             let array = probe_batch.column(column_index.index);
@@ -923,14 +1042,67 @@ pub(crate) fn build_batch_from_indices(
                 assert_eq!(probe_indices.null_count(), probe_indices.len());
                 new_null_array(array.data_type(), probe_indices.len())
             } else {
-                compute::take(array.as_ref(), probe_indices, None)?
+                take(array.as_ref(), probe_indices, None)?
             }
         };
+
         columns.push(array);
     }
     Ok(RecordBatch::try_new(Arc::new(schema.clone()), columns)?)
 }
 
+/// Returns a new [RecordBatch] resulting of a join where the build/left side is empty.
+/// The resulting batch has [Schema] `schema`.
+pub(crate) fn build_batch_empty_build_side(
+    schema: &Schema,
+    build_batch: &RecordBatch,
+    probe_batch: &RecordBatch,
+    column_indices: &[ColumnIndex],
+    join_type: JoinType,
+) -> Result<RecordBatch> {
+    match join_type {
+        // these join types only return data if the left side is not empty, so we return an
+        // empty RecordBatch
+        JoinType::Inner
+        | JoinType::Left
+        | JoinType::LeftSemi
+        | JoinType::RightSemi
+        | JoinType::LeftAnti
+        | JoinType::LeftMark => Ok(RecordBatch::new_empty(Arc::new(schema.clone()))),
+
+        // the remaining joins will return data for the right columns and null for the left ones
+        JoinType::Right | JoinType::Full | JoinType::RightAnti | JoinType::RightMark => {
+            let num_rows = probe_batch.num_rows();
+            if schema.fields().is_empty() {
+                return new_empty_schema_batch(schema, num_rows);
+            }
+            let mut columns: Vec<Arc<dyn Array>> =
+                Vec::with_capacity(schema.fields().len());
+
+            for column_index in column_indices {
+                let array = match column_index.side {
+                    // left -> null array
+                    JoinSide::Left => new_null_array(
+                        build_batch.column(column_index.index).data_type(),
+                        num_rows,
+                    ),
+                    // right -> respective right array
+                    JoinSide::Right => Arc::clone(probe_batch.column(column_index.index)),
+                    // right mark -> unset boolean array as there are no matches on the left side
+                    JoinSide::None => Arc::new(BooleanArray::new(
+                        BooleanBuffer::new_unset(num_rows),
+                        None,
+                    )),
+                };
+
+                columns.push(array);
+            }
+
+            Ok(RecordBatch::try_new(Arc::new(schema.clone()), columns)?)
+        }
+    }
+}
+
 /// The input is the matched indices for left and right and
 /// adjust the indices according to the join type
 pub(crate) fn adjust_indices_by_join_type(
@@ -975,6 +1147,12 @@ pub(crate) fn adjust_indices_by_join_type(
             // the left_indices will not be used later for the `right anti` join
             Ok((left_indices, right_indices))
         }
+        JoinType::RightMark => {
+            let right_indices = get_mark_indices(&adjust_range, &right_indices);
+            let left_indices_vec: Vec<u64> = adjust_range.map(|i| i as u64).collect();
+            let left_indices = UInt64Array::from(left_indices_vec);
+            Ok((left_indices, right_indices))
+        }
         JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
             // matched or unmatched left row will be produced in the end of loop
             // When visit the right batch, we can output the matched left row and don't need to wait the end of loop
@@ -1009,8 +1187,8 @@ pub(crate) fn append_right_indices(
 ) -> Result<(UInt64Array, UInt32Array)> {
     if preserve_order_for_right {
         Ok(append_probe_indices_in_order(
-            left_indices,
-            right_indices,
+            &left_indices,
+            &right_indices,
             adjust_range,
         ))
     } else {
@@ -1076,17 +1254,7 @@ pub(crate) fn get_anti_indices<T: ArrowPrimitiveType>(
 where
     NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
 {
-    let mut bitmap = BooleanBufferBuilder::new(range.len());
-    bitmap.append_n(range.len(), false);
-    input_indices
-        .iter()
-        .flatten()
-        .map(|v| v.as_usize())
-        .filter(|v| range.contains(v))
-        .for_each(|v| {
-            bitmap.set_bit(v - range.start, true);
-        });
-
+    let bitmap = build_range_bitmap(&range, input_indices);
     let offset = range.start;
 
     // get the anti index
@@ -1105,19 +1273,8 @@ pub(crate) fn get_semi_indices<T: ArrowPrimitiveType>(
 where
     NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
 {
-    let mut bitmap = BooleanBufferBuilder::new(range.len());
-    bitmap.append_n(range.len(), false);
-    input_indices
-        .iter()
-        .flatten()
-        .map(|v| v.as_usize())
-        .filter(|v| range.contains(v))
-        .for_each(|v| {
-            bitmap.set_bit(v - range.start, true);
-        });
-
+    let bitmap = build_range_bitmap(&range, input_indices);
     let offset = range.start;
-
     // get the semi index
     (range)
         .filter_map(|idx| {
@@ -1126,6 +1283,37 @@ where
         .collect()
 }
 
+pub(crate) fn get_mark_indices<T: ArrowPrimitiveType>(
+    range: &Range<usize>,
+    input_indices: &PrimitiveArray<T>,
+) -> PrimitiveArray<UInt32Type>
+where
+    NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
+{
+    let mut bitmap = build_range_bitmap(range, input_indices);
+    PrimitiveArray::new(
+        vec![0; range.len()].into(),
+        Some(NullBuffer::new(bitmap.finish())),
+    )
+}
+
+fn build_range_bitmap<T: ArrowPrimitiveType>(
+    range: &Range<usize>,
+    input: &PrimitiveArray<T>,
+) -> BooleanBufferBuilder {
+    let mut builder = BooleanBufferBuilder::new(range.len());
+    builder.append_n(range.len(), false);
+
+    input.iter().flatten().for_each(|v| {
+        let idx = v.as_usize();
+        if range.contains(&idx) {
+            builder.set_bit(idx - range.start, true);
+        }
+    });
+
+    builder
+}
+
 /// Appends probe indices in order by considering the given build indices.
 ///
 /// This function constructs new build and probe indices by iterating through
@@ -1144,8 +1332,8 @@ where
 /// - A `PrimitiveArray` of `UInt64Type` with the newly constructed build indices.
 /// - A `PrimitiveArray` of `UInt32Type` with the newly constructed probe indices.
 fn append_probe_indices_in_order(
-    build_indices: PrimitiveArray<UInt64Type>,
-    probe_indices: PrimitiveArray<UInt32Type>,
+    build_indices: &PrimitiveArray<UInt64Type>,
+    probe_indices: &PrimitiveArray<UInt32Type>,
     range: Range<usize>,
 ) -> (PrimitiveArray<UInt64Type>, PrimitiveArray<UInt32Type>) {
     // Builders for new indices:
@@ -1183,6 +1371,7 @@ fn append_probe_indices_in_order(
 /// Metrics for build & probe joins
 #[derive(Clone, Debug)]
 pub(crate) struct BuildProbeJoinMetrics {
+    pub(crate) baseline: BaselineMetrics,
     /// Total time for collecting build-side of join
     pub(crate) build_time: metrics::Time,
     /// Number of batches consumed by build-side
@@ -1197,14 +1386,35 @@ pub(crate) struct BuildProbeJoinMetrics {
     pub(crate) input_batches: metrics::Count,
     /// Number of rows consumed by probe-side this operator
     pub(crate) input_rows: metrics::Count,
-    /// Number of batches produced by this operator
-    pub(crate) output_batches: metrics::Count,
-    /// Number of rows produced by this operator
-    pub(crate) output_rows: metrics::Count,
+    /// Fraction of probe rows that found more than one match
+    pub(crate) probe_hit_rate: metrics::RatioMetrics,
+    /// Average number of build matches per matched probe row
+    pub(crate) avg_fanout: metrics::RatioMetrics,
+}
+
+// This Drop implementation updates the elapsed compute part of the metrics.
+//
+// Why is this in a Drop?
+// - We keep track of build_time and join_time separately, but baseline metrics have
+// a total elapsed_compute time. Instead of remembering to update both the metrics
+// at the same time, we chose to update elapsed_compute once at the end - summing up
+// both the parts.
+//
+// How does this work?
+// - The elapsed_compute `Time` is represented by an `Arc<AtomicUsize>`. So even when
+// this `BuildProbeJoinMetrics` is dropped, the elapsed_compute is usable through the
+// Arc reference.
+impl Drop for BuildProbeJoinMetrics {
+    fn drop(&mut self) {
+        self.baseline.elapsed_compute().add(&self.build_time);
+        self.baseline.elapsed_compute().add(&self.join_time);
+    }
 }
 
 impl BuildProbeJoinMetrics {
     pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
+        let baseline = BaselineMetrics::new(metrics, partition);
+
         let join_time = MetricBuilder::new(metrics).subset_time("join_time", partition);
 
         let build_time = MetricBuilder::new(metrics).subset_time("build_time", partition);
@@ -1223,10 +1433,13 @@ impl BuildProbeJoinMetrics {
 
         let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
 
-        let output_batches =
-            MetricBuilder::new(metrics).counter("output_batches", partition);
+        let probe_hit_rate = MetricBuilder::new(metrics)
+            .with_type(MetricType::SUMMARY)
+            .ratio_metrics("probe_hit_rate", partition);
 
-        let output_rows = MetricBuilder::new(metrics).output_rows(partition);
+        let avg_fanout = MetricBuilder::new(metrics)
+            .with_type(MetricType::SUMMARY)
+            .ratio_metrics("avg_fanout", partition);
 
         Self {
             build_time,
@@ -1236,8 +1449,9 @@ impl BuildProbeJoinMetrics {
             join_time,
             input_batches,
             input_rows,
-            output_batches,
-            output_rows,
+            baseline,
+            probe_hit_rate,
+            avg_fanout,
         }
     }
 }
@@ -1293,36 +1507,41 @@ pub(crate) fn symmetric_join_output_partitioning(
     left: &Arc<dyn ExecutionPlan>,
     right: &Arc<dyn ExecutionPlan>,
     join_type: &JoinType,
-) -> Partitioning {
+) -> Result<Partitioning> {
     let left_columns_len = left.schema().fields.len();
     let left_partitioning = left.output_partitioning();
     let right_partitioning = right.output_partitioning();
-    match join_type {
+    let result = match join_type {
         JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {
             left_partitioning.clone()
         }
-        JoinType::RightSemi | JoinType::RightAnti => right_partitioning.clone(),
+        JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+            right_partitioning.clone()
+        }
         JoinType::Inner | JoinType::Right => {
-            adjust_right_output_partitioning(right_partitioning, left_columns_len)
+            adjust_right_output_partitioning(right_partitioning, left_columns_len)?
         }
         JoinType::Full => {
             // We could also use left partition count as they are necessarily equal.
             Partitioning::UnknownPartitioning(right_partitioning.partition_count())
         }
-    }
+    };
+    Ok(result)
 }
 
 pub(crate) fn asymmetric_join_output_partitioning(
     left: &Arc<dyn ExecutionPlan>,
     right: &Arc<dyn ExecutionPlan>,
     join_type: &JoinType,
-) -> Partitioning {
-    match join_type {
+) -> Result<Partitioning> {
+    let result = match join_type {
         JoinType::Inner | JoinType::Right => adjust_right_output_partitioning(
             right.output_partitioning(),
             left.schema().fields().len(),
-        ),
-        JoinType::RightSemi | JoinType::RightAnti => right.output_partitioning().clone(),
+        )?,
+        JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {
+            right.output_partitioning().clone()
+        }
         JoinType::Left
         | JoinType::LeftSemi
         | JoinType::LeftAnti
@@ -1330,7 +1549,8 @@ pub(crate) fn asymmetric_join_output_partitioning(
         | JoinType::LeftMark => Partitioning::UnknownPartitioning(
             right.output_partitioning().partition_count(),
         ),
-    }
+    };
+    Ok(result)
 }
 
 /// Trait for incrementally generating Join output.
@@ -1424,7 +1644,7 @@ impl BatchTransformer for BatchSplitter {
 /// Joins output columns from their left input followed by their right input.
 /// Thus if the inputs are reordered, the output columns must be reordered to
 /// match the original order.
-pub(crate) fn reorder_output_after_swap(
+pub fn reorder_output_after_swap(
     plan: Arc<dyn ExecutionPlan>,
     left_schema: &Schema,
     right_schema: &Schema,
@@ -1444,29 +1664,36 @@ pub(crate) fn reorder_output_after_swap(
 fn swap_reverting_projection(
     left_schema: &Schema,
     right_schema: &Schema,
-) -> Vec<(Arc<dyn PhysicalExpr>, String)> {
-    let right_cols = right_schema.fields().iter().enumerate().map(|(i, f)| {
-        (
-            Arc::new(Column::new(f.name(), i)) as Arc<dyn PhysicalExpr>,
-            f.name().to_owned(),
-        )
-    });
+) -> Vec<ProjectionExpr> {
+    let right_cols =
+        right_schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(i, f)| ProjectionExpr {
+                expr: Arc::new(Column::new(f.name(), i)) as Arc<dyn PhysicalExpr>,
+                alias: f.name().to_owned(),
+            });
     let right_len = right_cols.len();
-    let left_cols = left_schema.fields().iter().enumerate().map(|(i, f)| {
-        (
-            Arc::new(Column::new(f.name(), right_len + i)) as Arc<dyn PhysicalExpr>,
-            f.name().to_owned(),
-        )
-    });
+    let left_cols =
+        left_schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(i, f)| ProjectionExpr {
+                expr: Arc::new(Column::new(f.name(), right_len + i))
+                    as Arc<dyn PhysicalExpr>,
+                alias: f.name().to_owned(),
+            });
 
     left_cols.chain(right_cols).collect()
 }
 
 /// This function swaps the given join's projection.
-pub(super) fn swap_join_projection(
+pub fn swap_join_projection(
     left_schema_len: usize,
     right_schema_len: usize,
-    projection: Option<&Vec<usize>>,
+    projection: Option<&[usize]>,
     join_type: &JoinType,
 ) -> Option<Vec<usize>> {
     match join_type {
@@ -1475,8 +1702,9 @@ pub(super) fn swap_join_projection(
         JoinType::LeftAnti
         | JoinType::LeftSemi
         | JoinType::RightAnti
-        | JoinType::RightSemi => projection.cloned(),
-
+        | JoinType::RightSemi
+        | JoinType::LeftMark
+        | JoinType::RightMark => projection.map(|p| p.to_vec()),
         _ => projection.map(|p| {
             p.iter()
                 .map(|i| {
@@ -1495,17 +1723,215 @@ pub(super) fn swap_join_projection(
     }
 }
 
+/// Updates `hash_map` with new entries from `batch` evaluated against the expressions `on`
+/// using `offset` as a start value for `batch` row indices.
+///
+/// `fifo_hashmap` sets the order of iteration over `batch` rows while updating hashmap,
+/// which allows to keep either first (if set to true) or last (if set to false) row index
+/// as a chain head for rows with equal hash values.
+#[expect(clippy::too_many_arguments)]
+pub fn update_hash(
+    on: &[PhysicalExprRef],
+    batch: &RecordBatch,
+    hash_map: &mut dyn JoinHashMapType,
+    offset: usize,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    deleted_offset: usize,
+    fifo_hashmap: bool,
+) -> Result<()> {
+    // evaluate the keys
+    let keys_values = evaluate_expressions_to_arrays(on, batch)?;
+
+    // calculate the hash values
+    let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?;
+
+    // For usual JoinHashmap, the implementation is void.
+    hash_map.extend_zero(batch.num_rows());
+
+    // Updating JoinHashMap from hash values iterator
+    let hash_values_iter = hash_values
+        .iter()
+        .enumerate()
+        .map(|(i, val)| (i + offset, val));
+
+    if fifo_hashmap {
+        hash_map.update_from_iter(Box::new(hash_values_iter.rev()), deleted_offset);
+    } else {
+        hash_map.update_from_iter(Box::new(hash_values_iter), deleted_offset);
+    }
+
+    Ok(())
+}
+
+pub(super) fn equal_rows_arr(
+    indices_left: &UInt64Array,
+    indices_right: &UInt32Array,
+    left_arrays: &[ArrayRef],
+    right_arrays: &[ArrayRef],
+    null_equality: NullEquality,
+) -> Result<(UInt64Array, UInt32Array)> {
+    let mut iter = left_arrays.iter().zip(right_arrays.iter());
+
+    let Some((first_left, first_right)) = iter.next() else {
+        return Ok((Vec::<u64>::new().into(), Vec::<u32>::new().into()));
+    };
+
+    let arr_left = take(first_left.as_ref(), indices_left, None)?;
+    let arr_right = take(first_right.as_ref(), indices_right, None)?;
+
+    let mut equal: BooleanArray = eq_dyn_null(&arr_left, &arr_right, null_equality)?;
+
+    // Use map and try_fold to iterate over the remaining pairs of arrays.
+    // In each iteration, take is used on the pair of arrays and their equality is determined.
+    // The results are then folded (combined) using the and function to get a final equality result.
+    equal = iter
+        .map(|(left, right)| {
+            let arr_left = take(left.as_ref(), indices_left, None)?;
+            let arr_right = take(right.as_ref(), indices_right, None)?;
+            eq_dyn_null(arr_left.as_ref(), arr_right.as_ref(), null_equality)
+        })
+        .try_fold(equal, |acc, equal2| and(&acc, &equal2?))?;
+
+    let filter_builder = FilterBuilder::new(&equal).optimize().build();
+
+    let left_filtered = filter_builder.filter(indices_left)?;
+    let right_filtered = filter_builder.filter(indices_right)?;
+
+    Ok((
+        downcast_array(left_filtered.as_ref()),
+        downcast_array(right_filtered.as_ref()),
+    ))
+}
+
+// version of eq_dyn supporting equality on null arrays
+fn eq_dyn_null(
+    left: &dyn Array,
+    right: &dyn Array,
+    null_equality: NullEquality,
+) -> Result<BooleanArray, ArrowError> {
+    // Nested datatypes cannot use the underlying not_distinct/eq function and must use a special
+    // implementation
+    // <https://github.com/apache/datafusion/issues/10749>
+    if left.data_type().is_nested() {
+        let op = match null_equality {
+            NullEquality::NullEqualsNothing => Operator::Eq,
+            NullEquality::NullEqualsNull => Operator::IsNotDistinctFrom,
+        };
+        return Ok(compare_op_for_nested(op, &left, &right)?);
+    }
+    match null_equality {
+        NullEquality::NullEqualsNothing => eq(&left, &right),
+        NullEquality::NullEqualsNull => not_distinct(&left, &right),
+    }
+}
+
+/// Get comparison result of two rows of join arrays
+pub fn compare_join_arrays(
+    left_arrays: &[ArrayRef],
+    left: usize,
+    right_arrays: &[ArrayRef],
+    right: usize,
+    sort_options: &[SortOptions],
+    null_equality: NullEquality,
+) -> Result<Ordering> {
+    let mut res = Ordering::Equal;
+    for ((left_array, right_array), sort_options) in
+        left_arrays.iter().zip(right_arrays).zip(sort_options)
+    {
+        macro_rules! compare_value {
+            ($T:ty) => {{
+                let left_array = left_array.as_any().downcast_ref::<$T>().unwrap();
+                let right_array = right_array.as_any().downcast_ref::<$T>().unwrap();
+                match (left_array.is_null(left), right_array.is_null(right)) {
+                    (false, false) => {
+                        let left_value = &left_array.value(left);
+                        let right_value = &right_array.value(right);
+                        res = left_value.partial_cmp(right_value).unwrap();
+                        if sort_options.descending {
+                            res = res.reverse();
+                        }
+                    }
+                    (true, false) => {
+                        res = if sort_options.nulls_first {
+                            Ordering::Less
+                        } else {
+                            Ordering::Greater
+                        };
+                    }
+                    (false, true) => {
+                        res = if sort_options.nulls_first {
+                            Ordering::Greater
+                        } else {
+                            Ordering::Less
+                        };
+                    }
+                    _ => {
+                        res = match null_equality {
+                            NullEquality::NullEqualsNothing => Ordering::Less,
+                            NullEquality::NullEqualsNull => Ordering::Equal,
+                        };
+                    }
+                }
+            }};
+        }
+
+        match left_array.data_type() {
+            DataType::Null => {}
+            DataType::Boolean => compare_value!(BooleanArray),
+            DataType::Int8 => compare_value!(Int8Array),
+            DataType::Int16 => compare_value!(Int16Array),
+            DataType::Int32 => compare_value!(Int32Array),
+            DataType::Int64 => compare_value!(Int64Array),
+            DataType::UInt8 => compare_value!(UInt8Array),
+            DataType::UInt16 => compare_value!(UInt16Array),
+            DataType::UInt32 => compare_value!(UInt32Array),
+            DataType::UInt64 => compare_value!(UInt64Array),
+            DataType::Float32 => compare_value!(Float32Array),
+            DataType::Float64 => compare_value!(Float64Array),
+            DataType::Binary => compare_value!(BinaryArray),
+            DataType::BinaryView => compare_value!(BinaryViewArray),
+            DataType::FixedSizeBinary(_) => compare_value!(FixedSizeBinaryArray),
+            DataType::LargeBinary => compare_value!(LargeBinaryArray),
+            DataType::Utf8 => compare_value!(StringArray),
+            DataType::Utf8View => compare_value!(StringViewArray),
+            DataType::LargeUtf8 => compare_value!(LargeStringArray),
+            DataType::Decimal128(..) => compare_value!(Decimal128Array),
+            DataType::Timestamp(time_unit, None) => match time_unit {
+                TimeUnit::Second => compare_value!(TimestampSecondArray),
+                TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray),
+                TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray),
+                TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray),
+            },
+            DataType::Date32 => compare_value!(Date32Array),
+            DataType::Date64 => compare_value!(Date64Array),
+            dt => {
+                return not_impl_err!(
+                    "Unsupported data type in sort merge join comparator: {}",
+                    dt
+                );
+            }
+        }
+        if !res.is_eq() {
+            break;
+        }
+    }
+    Ok(res)
+}
+
 #[cfg(test)]
 mod tests {
-    use super::*;
+    use std::collections::HashMap;
     use std::pin::Pin;
 
+    use super::*;
+
     use arrow::array::Int32Array;
-    use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Fields};
     use arrow::error::{ArrowError, Result as ArrowResult};
     use datafusion_common::stats::Precision::{Absent, Exact, Inexact};
-    use datafusion_common::{arrow_datafusion_err, arrow_err, ScalarValue};
+    use datafusion_common::{ScalarValue, arrow_datafusion_err, arrow_err};
+    use datafusion_physical_expr::PhysicalSortExpr;
 
     use rstest::rstest;
 
@@ -1704,6 +2130,7 @@ mod tests {
             max_value: max.map(ScalarValue::from),
             sum_value: Absent,
             null_count,
+            byte_size: Absent,
         }
     }
 
@@ -1754,12 +2181,18 @@ mod tests {
                 (20, Inexact(1), Inexact(40), Absent, Absent),
                 Some(Inexact(10)),
             ),
-            // When we have distinct count.
+            // Distinct count matches the range
             (
                 (10, Inexact(1), Inexact(10), Inexact(10), Absent),
                 (10, Inexact(1), Inexact(10), Inexact(10), Absent),
                 Some(Inexact(10)),
             ),
+            // Distinct count takes precedence over the range
+            (
+                (10, Inexact(1), Inexact(3), Inexact(10), Absent),
+                (10, Inexact(1), Inexact(3), Inexact(10), Absent),
+                Some(Inexact(10)),
+            ),
             // distinct(left) > distinct(right)
             (
                 (10, Inexact(1), Inexact(10), Inexact(5), Absent),
@@ -1803,32 +2236,33 @@ mod tests {
             // Edge cases
             // ==========
             //
-            // No column level stats.
+            // No column level stats, fall back to row count.
             (
                 (10, Absent, Absent, Absent, Absent),
                 (10, Absent, Absent, Absent, Absent),
-                None,
+                Some(Inexact(10)),
             ),
-            // No min or max (or both).
+            // No min or max (or both), but distinct available.
             (
                 (10, Absent, Absent, Inexact(3), Absent),
                 (10, Absent, Absent, Inexact(3), Absent),
-                None,
+                Some(Inexact(33)),
             ),
             (
                 (10, Inexact(2), Absent, Inexact(3), Absent),
                 (10, Absent, Inexact(5), Inexact(3), Absent),
-                None,
+                Some(Inexact(33)),
             ),
             (
                 (10, Absent, Inexact(3), Inexact(3), Absent),
                 (10, Inexact(1), Absent, Inexact(3), Absent),
-                None,
+                Some(Inexact(33)),
             ),
+            // No min or max, fall back to row count
             (
                 (10, Absent, Inexact(3), Absent, Absent),
                 (10, Inexact(1), Absent, Absent, Absent),
-                None,
+                Some(Inexact(10)),
             ),
             // Non overlapping min/max (when exact=False).
             (
@@ -2281,18 +2715,21 @@ mod tests {
             &JoinType::LeftSemi,
             Statistics {
                 num_rows: Inexact(500),
-                total_byte_size: Absent,
+                    total_byte_size: Absent,
                 column_statistics: dummy_column_stats.clone(),
             },
             Statistics {
                 num_rows: Absent,
-                total_byte_size: Absent,
+                    total_byte_size: Absent,
                 column_statistics: dummy_column_stats.clone(),
             },
             &join_on,
         ).expect("Expected non-empty PartialJoinStatistics for SemiJoin with absent inner num_rows");
 
-        assert_eq!(absent_inner_estimation.num_rows, 500, "Expected outer.num_rows estimated SemiJoin cardinality for absent inner num_rows");
+        assert_eq!(
+            absent_inner_estimation.num_rows, 500,
+            "Expected outer.num_rows estimated SemiJoin cardinality for absent inner num_rows"
+        );
 
         let absent_inner_estimation = estimate_join_cardinality(
             &JoinType::LeftSemi,
@@ -2308,92 +2745,45 @@ mod tests {
             },
             &join_on,
         );
-        assert!(absent_inner_estimation.is_none(), "Expected \"None\" estimated SemiJoin cardinality for absent outer and inner num_rows");
+        assert!(
+            absent_inner_estimation.is_none(),
+            "Expected \"None\" estimated SemiJoin cardinality for absent outer and inner num_rows"
+        );
 
         Ok(())
     }
 
     #[test]
     fn test_calculate_join_output_ordering() -> Result<()> {
-        let options = SortOptions::default();
         let left_ordering = LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("a", 0)),
-                options,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("c", 2)),
-                options,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("d", 3)),
-                options,
-            },
+            PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0))),
+            PhysicalSortExpr::new_default(Arc::new(Column::new("c", 2))),
+            PhysicalSortExpr::new_default(Arc::new(Column::new("d", 3))),
         ]);
         let right_ordering = LexOrdering::new(vec![
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("z", 2)),
-                options,
-            },
-            PhysicalSortExpr {
-                expr: Arc::new(Column::new("y", 1)),
-                options,
-            },
+            PhysicalSortExpr::new_default(Arc::new(Column::new("z", 2))),
+            PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))),
         ]);
         let join_type = JoinType::Inner;
-        let on_columns = [(
-            Arc::new(Column::new("b", 1)) as _,
-            Arc::new(Column::new("x", 0)) as _,
-        )];
         let left_columns_len = 5;
         let maintains_input_orders = [[true, false], [false, true]];
         let probe_sides = [Some(JoinSide::Left), Some(JoinSide::Right)];
 
         let expected = [
-            Some(LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("a", 0)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("c", 2)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("d", 3)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("z", 7)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("y", 6)),
-                    options,
-                },
-            ])),
-            Some(LexOrdering::new(vec![
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("z", 7)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("y", 6)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("a", 0)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("c", 2)),
-                    options,
-                },
-                PhysicalSortExpr {
-                    expr: Arc::new(Column::new("d", 3)),
-                    options,
-                },
-            ])),
+            LexOrdering::new(vec![
+                PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("c", 2))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("d", 3))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("z", 7))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("y", 6))),
+            ]),
+            LexOrdering::new(vec![
+                PhysicalSortExpr::new_default(Arc::new(Column::new("z", 7))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("y", 6))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("c", 2))),
+                PhysicalSortExpr::new_default(Arc::new(Column::new("d", 3))),
+            ]),
         ];
 
         for (i, (maintains_input_order, probe_side)) in
@@ -2404,11 +2794,10 @@ mod tests {
                     left_ordering.as_ref(),
                     right_ordering.as_ref(),
                     join_type,
-                    &on_columns,
                     left_columns_len,
                     maintains_input_order,
                     probe_side,
-                ),
+                )?,
                 expected[i]
             );
         }
@@ -2474,17 +2863,17 @@ mod tests {
 
         assert_eq!(proj.len(), 3);
 
-        let (col, name) = &proj[0];
-        assert_eq!(name, "a");
-        assert_col_expr(col, "a", 1);
+        let proj_expr = &proj[0];
+        assert_eq!(proj_expr.alias, "a");
+        assert_col_expr(&proj_expr.expr, "a", 1);
 
-        let (col, name) = &proj[1];
-        assert_eq!(name, "b");
-        assert_col_expr(col, "b", 2);
+        let proj_expr = &proj[1];
+        assert_eq!(proj_expr.alias, "b");
+        assert_col_expr(&proj_expr.expr, "b", 2);
 
-        let (col, name) = &proj[2];
-        assert_eq!(name, "c");
-        assert_col_expr(col, "c", 0);
+        let proj_expr = &proj[2];
+        assert_eq!(proj_expr.alias, "c");
+        assert_col_expr(&proj_expr.expr, "c", 0);
     }
 
     fn assert_col_expr(expr: &Arc<dyn PhysicalExpr>, name: &str, index: usize) {
@@ -2495,4 +2884,59 @@ mod tests {
         assert_eq!(col.name(), name);
         assert_eq!(col.index(), index);
     }
+
+    #[test]
+    fn test_join_metadata() -> Result<()> {
+        let left_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)])
+            .with_metadata(HashMap::from([("key".to_string(), "left".to_string())]));
+
+        let right_schema = Schema::new(vec![Field::new("b", DataType::Int32, false)])
+            .with_metadata(HashMap::from([("key".to_string(), "right".to_string())]));
+
+        let (join_schema, _) =
+            build_join_schema(&left_schema, &right_schema, &JoinType::Left);
+        assert_eq!(
+            join_schema.metadata(),
+            &HashMap::from([("key".to_string(), "left".to_string())])
+        );
+        let (join_schema, _) =
+            build_join_schema(&left_schema, &right_schema, &JoinType::Right);
+        assert_eq!(
+            join_schema.metadata(),
+            &HashMap::from([("key".to_string(), "right".to_string())])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_build_batch_empty_build_side_empty_schema() -> Result<()> {
+        // When the output schema has no fields (empty projection pushed into
+        // the join), build_batch_empty_build_side should return a RecordBatch
+        // with the correct row count but no columns.
+        let empty_schema = Schema::empty();
+
+        let build_batch = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])),
+            vec![Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3]))],
+        )?;
+
+        let probe_batch = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)])),
+            vec![Arc::new(arrow::array::Int32Array::from(vec![4, 5, 6, 7]))],
+        )?;
+
+        let result = build_batch_empty_build_side(
+            &empty_schema,
+            &build_batch,
+            &probe_batch,
+            &[], // no column indices with empty projection
+            JoinType::Right,
+        )?;
+
+        assert_eq!(result.num_rows(), 4);
+        assert_eq!(result.num_columns(), 0);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index ba423f958c78e..6467d7a2e389d 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Traits for physical query plan, supporting parallel execution for partitioned relations.
 //!
@@ -30,26 +31,28 @@
 
 pub use datafusion_common::hash_utils;
 pub use datafusion_common::utils::project_schema;
-pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
+pub use datafusion_common::{ColumnStatistics, Statistics, internal_err};
 pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
 pub use datafusion_expr::{Accumulator, ColumnarValue};
-pub use datafusion_physical_expr::window::WindowExpr;
 use datafusion_physical_expr::PhysicalSortExpr;
+pub use datafusion_physical_expr::window::WindowExpr;
 pub use datafusion_physical_expr::{
-    expressions, Distribution, Partitioning, PhysicalExpr,
+    Distribution, Partitioning, PhysicalExpr, expressions,
 };
 
 pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay};
 pub use crate::execution_plan::{
-    collect, collect_partitioned, displayable, execute_input_stream, execute_stream,
-    execute_stream_partitioned, get_plan_string, with_new_children_if_necessary,
-    ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+    ExecutionPlan, ExecutionPlanProperties, PlanProperties, collect, collect_partitioned,
+    displayable, execute_input_stream, execute_stream, execute_stream_partitioned,
+    get_plan_string, with_new_children_if_necessary,
 };
 pub use crate::metrics::Metric;
 pub use crate::ordering::InputOrderMode;
+pub use crate::sort_pushdown::SortOrderPushdownResult;
 pub use crate::stream::EmptyRecordBatchStream;
 pub use crate::topk::TopK;
-pub use crate::visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
+pub use crate::visitor::{ExecutionPlanVisitor, accept, visit_execution_plan};
+pub use crate::work_table::WorkTable;
 pub use spill::spill_manager::SpillManager;
 
 mod ordering;
@@ -59,9 +62,14 @@ mod visitor;
 
 pub mod aggregates;
 pub mod analyze;
+pub mod async_func;
+pub mod buffer;
+pub mod coalesce;
 pub mod coalesce_batches;
 pub mod coalesce_partitions;
+pub mod column_rewriter;
 pub mod common;
+pub mod coop;
 pub mod display;
 pub mod empty;
 pub mod execution_plan;
@@ -76,6 +84,7 @@ pub mod placeholder_row;
 pub mod projection;
 pub mod recursive_query;
 pub mod repartition;
+pub mod sort_pushdown;
 pub mod sorts;
 pub mod spill;
 pub mod stream;
@@ -83,7 +92,6 @@ pub mod streaming;
 pub mod tree_node;
 pub mod union;
 pub mod unnest;
-pub mod values;
 pub mod windows;
 pub mod work_table;
 pub mod udaf {
@@ -91,6 +99,4 @@ pub mod udaf {
     pub use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
 }
 
-pub mod coalesce;
-#[cfg(any(test, feature = "bench"))]
 pub mod test;
diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs
index 2224f85cc1226..d135434898d8f 100644
--- a/datafusion/physical-plan/src/limit.rs
+++ b/datafusion/physical-plan/src/limit.rs
@@ -28,13 +28,18 @@ use super::{
     SendableRecordBatchStream, Statistics,
 };
 use crate::execution_plan::{Boundedness, CardinalityEffect};
-use crate::{DisplayFormatType, Distribution, ExecutionPlan, Partitioning};
+use crate::{
+    DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
+    check_if_same_properties,
+};
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{internal_err, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_err};
 use datafusion_execution::TaskContext;
 
+use datafusion_physical_expr::{LexOrdering, PhysicalExpr};
 use futures::stream::{Stream, StreamExt};
 use log::trace;
 
@@ -50,7 +55,10 @@ pub struct GlobalLimitExec {
     fetch: Option<usize>,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
-    cache: PlanProperties,
+    /// Does the limit have to preserve the order of its input, and if so what is it?
+    /// Some optimizations may reorder the input if no particular sort is required
+    required_ordering: Option<LexOrdering>,
+    cache: Arc<PlanProperties>,
 }
 
 impl GlobalLimitExec {
@@ -62,7 +70,8 @@ impl GlobalLimitExec {
             skip,
             fetch,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            required_ordering: None,
+            cache: Arc::new(cache),
         }
     }
 
@@ -91,6 +100,27 @@ impl GlobalLimitExec {
             Boundedness::Bounded,
         )
     }
+
+    /// Get the required ordering from limit
+    pub fn required_ordering(&self) -> &Option<LexOrdering> {
+        &self.required_ordering
+    }
+
+    /// Set the required ordering for limit
+    pub fn set_required_ordering(&mut self, required_ordering: Option<LexOrdering>) {
+        self.required_ordering = required_ordering;
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for GlobalLimitExec {
@@ -105,7 +135,8 @@ impl DisplayAs for GlobalLimitExec {
                     f,
                     "GlobalLimitExec: skip={}, fetch={}",
                     self.skip,
-                    self.fetch.map_or("None".to_string(), |x| x.to_string())
+                    self.fetch
+                        .map_or_else(|| "None".to_string(), |x| x.to_string())
                 )
             }
             DisplayFormatType::TreeRender => {
@@ -128,7 +159,7 @@ impl ExecutionPlan for GlobalLimitExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -148,12 +179,27 @@ impl ExecutionPlan for GlobalLimitExec {
         vec![false]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to required ordering expressions if present
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = &self.required_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(GlobalLimitExec::new(
-            Arc::clone(&children[0]),
+            children.swap_remove(0),
             self.skip,
             self.fetch,
         )))
@@ -166,14 +212,18 @@ impl ExecutionPlan for GlobalLimitExec {
     ) -> Result<SendableRecordBatchStream> {
         trace!("Start GlobalLimitExec::execute for partition: {partition}");
         // GlobalLimitExec has a single output partition
-        if 0 != partition {
-            return internal_err!("GlobalLimitExec invalid partition {partition}");
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "GlobalLimitExec invalid partition {partition}"
+        );
 
         // GlobalLimitExec requires a single input partition
-        if 1 != self.input.output_partitioning().partition_count() {
-            return internal_err!("GlobalLimitExec requires a single input partition");
-        }
+        assert_eq_or_internal_err!(
+            self.input.output_partitioning().partition_count(),
+            1,
+            "GlobalLimitExec requires a single input partition"
+        );
 
         let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
         let stream = self.input.execute(0, context)?;
@@ -189,17 +239,9 @@ impl ExecutionPlan for GlobalLimitExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        self.input.partition_statistics(partition)?.with_fetch(
-            self.schema(),
-            self.fetch,
-            self.skip,
-            1,
-        )
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let stats = Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        Ok(Arc::new(stats.with_fetch(self.fetch, self.skip, 1)?))
     }
 
     fn fetch(&self) -> Option<usize> {
@@ -212,7 +254,7 @@ impl ExecutionPlan for GlobalLimitExec {
 }
 
 /// LocalLimitExec applies a limit to a single partition
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct LocalLimitExec {
     /// Input execution plan
     input: Arc<dyn ExecutionPlan>,
@@ -220,7 +262,10 @@ pub struct LocalLimitExec {
     fetch: usize,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
-    cache: PlanProperties,
+    /// If the child plan is a sort node, after the sort node is removed during
+    /// physical optimization, we should add the required ordering to the limit node
+    required_ordering: Option<LexOrdering>,
+    cache: Arc<PlanProperties>,
 }
 
 impl LocalLimitExec {
@@ -231,7 +276,8 @@ impl LocalLimitExec {
             input,
             fetch,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            required_ordering: None,
+            cache: Arc::new(cache),
         }
     }
 
@@ -255,6 +301,27 @@ impl LocalLimitExec {
             Boundedness::Bounded,
         )
     }
+
+    /// Get the required ordering from limit
+    pub fn required_ordering(&self) -> &Option<LexOrdering> {
+        &self.required_ordering
+    }
+
+    /// Set the required ordering for limit
+    pub fn set_required_ordering(&mut self, required_ordering: Option<LexOrdering>) {
+        self.required_ordering = required_ordering;
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for LocalLimitExec {
@@ -284,7 +351,7 @@ impl ExecutionPlan for LocalLimitExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -300,10 +367,25 @@ impl ExecutionPlan for LocalLimitExec {
         vec![true]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to required ordering expressions if present
+        let mut tnr = TreeNodeRecursion::Continue;
+        if let Some(ordering) = &self.required_ordering {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         match children.len() {
             1 => Ok(Arc::new(LocalLimitExec::new(
                 Arc::clone(&children[0]),
@@ -318,7 +400,12 @@ impl ExecutionPlan for LocalLimitExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start LocalLimitExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start LocalLimitExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
         let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
         let stream = self.input.execute(partition, context)?;
         Ok(Box::pin(LimitStream::new(
@@ -333,17 +420,9 @@ impl ExecutionPlan for LocalLimitExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        self.input.partition_statistics(partition)?.with_fetch(
-            self.schema(),
-            Some(self.fetch),
-            0,
-            1,
-        )
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let stats = Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        Ok(Arc::new(stats.with_fetch(Some(self.fetch), 0, 1)?))
     }
 
     fn fetch(&self) -> Option<usize> {
@@ -495,8 +574,8 @@ mod tests {
     use arrow::array::RecordBatchOptions;
     use arrow::datatypes::Schema;
     use datafusion_common::stats::Precision;
-    use datafusion_physical_expr::expressions::col;
     use datafusion_physical_expr::PhysicalExpr;
+    use datafusion_physical_expr::expressions::col;
 
     #[tokio::test]
     async fn limit() -> Result<()> {
diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs
index 1bc872a56e763..d607f2b440f66 100644
--- a/datafusion/physical-plan/src/memory.rs
+++ b/datafusion/physical-plan/src/memory.rs
@@ -22,19 +22,23 @@ use std::fmt;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use crate::execution_plan::{Boundedness, EmissionType};
+use crate::coop::cooperative;
+use crate::execution_plan::{Boundedness, EmissionType, SchedulingType};
+use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use crate::{
     DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
-    RecordBatchStream, SendableRecordBatchStream, Statistics,
+    RecordBatchStream, SendableRecordBatchStream,
 };
 
 use arrow::array::RecordBatch;
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{internal_err, Result};
-use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err, assert_or_internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
 
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 use futures::Stream;
 use parking_lot::RwLock;
 
@@ -131,8 +135,19 @@ impl RecordBatchStream for MemoryStream {
 }
 
 pub trait LazyBatchGenerator: Send + Sync + fmt::Debug + fmt::Display {
+    /// Returns the generator as [`Any`] so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    fn boundedness(&self) -> Boundedness {
+        Boundedness::Bounded
+    }
+
     /// Generate the next batch, return `None` when no more batches are available
     fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>>;
+
+    /// Returns a new instance with the state reset.
+    fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>>;
 }
 
 /// Execution plan for lazy in-memory batches of data
@@ -142,10 +157,14 @@ pub trait LazyBatchGenerator: Send + Sync + fmt::Debug + fmt::Display {
 pub struct LazyMemoryExec {
     /// Schema representing the data
     schema: SchemaRef,
+    /// Optional projection for which columns to load
+    projection: Option<Vec<usize>>,
     /// Functions to generate batches for each partition
     batch_generators: Vec<Arc<RwLock<dyn LazyBatchGenerator>>>,
     /// Plan properties cache storing equivalence properties, partitioning, and execution mode
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+    /// Execution metrics
+    metrics: ExecutionPlanMetricsSet,
 }
 
 impl LazyMemoryExec {
@@ -154,18 +173,85 @@ impl LazyMemoryExec {
         schema: SchemaRef,
         generators: Vec<Arc<RwLock<dyn LazyBatchGenerator>>>,
     ) -> Result<Self> {
+        let boundedness = generators
+            .iter()
+            .map(|g| g.read().boundedness())
+            .reduce(|acc, b| match acc {
+                Boundedness::Bounded => b,
+                Boundedness::Unbounded {
+                    requires_infinite_memory,
+                } => {
+                    let acc_infinite_memory = requires_infinite_memory;
+                    match b {
+                        Boundedness::Bounded => acc,
+                        Boundedness::Unbounded {
+                            requires_infinite_memory,
+                        } => Boundedness::Unbounded {
+                            requires_infinite_memory: requires_infinite_memory
+                                || acc_infinite_memory,
+                        },
+                    }
+                }
+            })
+            .unwrap_or(Boundedness::Bounded);
+
         let cache = PlanProperties::new(
             EquivalenceProperties::new(Arc::clone(&schema)),
             Partitioning::RoundRobinBatch(generators.len()),
             EmissionType::Incremental,
-            Boundedness::Bounded,
-        );
+            boundedness,
+        )
+        .with_scheduling_type(SchedulingType::Cooperative)
+        .into();
+
         Ok(Self {
             schema,
+            projection: None,
             batch_generators: generators,
             cache,
+            metrics: ExecutionPlanMetricsSet::new(),
         })
     }
+
+    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
+        match projection.as_ref() {
+            Some(columns) => {
+                let projected = Arc::new(self.schema.project(columns).unwrap());
+                Arc::make_mut(&mut self.cache).set_eq_properties(
+                    EquivalenceProperties::new(Arc::clone(&projected)),
+                );
+                self.schema = projected;
+                self.projection = projection;
+                self
+            }
+            _ => self,
+        }
+    }
+
+    pub fn try_set_partitioning(&mut self, partitioning: Partitioning) -> Result<()> {
+        let partition_count = partitioning.partition_count();
+        let generator_count = self.batch_generators.len();
+        assert_eq_or_internal_err!(
+            partition_count,
+            generator_count,
+            "Partition count must match generator count: {} != {}",
+            partition_count,
+            generator_count
+        );
+        Arc::make_mut(&mut self.cache).partitioning = partitioning;
+        Ok(())
+    }
+
+    pub fn add_ordering(&mut self, ordering: impl IntoIterator<Item = PhysicalSortExpr>) {
+        Arc::make_mut(&mut self.cache)
+            .eq_properties
+            .add_orderings(std::iter::once(ordering));
+    }
+
+    /// Get the batch generators
+    pub fn generators(&self) -> &Vec<Arc<RwLock<dyn LazyBatchGenerator>>> {
+        &self.batch_generators
+    }
 }
 
 impl fmt::Debug for LazyMemoryExec {
@@ -222,7 +308,7 @@ impl ExecutionPlan for LazyMemoryExec {
         Arc::clone(&self.schema)
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -230,15 +316,22 @@ impl ExecutionPlan for LazyMemoryExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        if children.is_empty() {
-            Ok(self)
-        } else {
-            internal_err!("Children cannot be replaced in LazyMemoryExec")
-        }
+        assert_or_internal_err!(
+            children.is_empty(),
+            "Children cannot be replaced in LazyMemoryExec"
+        );
+        Ok(self)
     }
 
     fn execute(
@@ -246,28 +339,49 @@ impl ExecutionPlan for LazyMemoryExec {
         partition: usize,
         _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        if partition >= self.batch_generators.len() {
-            return internal_err!(
-                "Invalid partition {} for LazyMemoryExec with {} partitions",
-                partition,
-                self.batch_generators.len()
-            );
-        }
+        assert_or_internal_err!(
+            partition < self.batch_generators.len(),
+            "Invalid partition {} for LazyMemoryExec with {} partitions",
+            partition,
+            self.batch_generators.len()
+        );
+
+        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
 
-        Ok(Box::pin(LazyMemoryStream {
+        let stream = LazyMemoryStream {
             schema: Arc::clone(&self.schema),
+            projection: self.projection.clone(),
             generator: Arc::clone(&self.batch_generators[partition]),
-        }))
+            baseline_metrics,
+        };
+        Ok(Box::pin(cooperative(stream)))
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&self.schema))
+    fn metrics(&self) -> Option<MetricsSet> {
+        Some(self.metrics.clone_inner())
+    }
+
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        let generators = self
+            .generators()
+            .iter()
+            .map(|g| g.read().reset_state())
+            .collect::<Vec<_>>();
+        Ok(Arc::new(LazyMemoryExec {
+            schema: Arc::clone(&self.schema),
+            batch_generators: generators,
+            cache: Arc::clone(&self.cache),
+            metrics: ExecutionPlanMetricsSet::new(),
+            projection: self.projection.clone(),
+        }))
     }
 }
 
 /// Stream that generates record batches on demand
 pub struct LazyMemoryStream {
     schema: SchemaRef,
+    /// Optional projection for which columns to load
+    projection: Option<Vec<usize>>,
     /// Generator to produce batches
     ///
     /// Note: Idiomatically, DataFusion uses plan-time parallelism - each stream
@@ -276,6 +390,8 @@ pub struct LazyMemoryStream {
     /// parallel execution.
     /// Sharing generators between streams should be used with caution.
     generator: Arc<RwLock<dyn LazyBatchGenerator>>,
+    /// Execution metrics
+    baseline_metrics: BaselineMetrics,
 }
 
 impl Stream for LazyMemoryStream {
@@ -285,13 +401,23 @@ impl Stream for LazyMemoryStream {
         self: std::pin::Pin<&mut Self>,
         _: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
+        let _timer_guard = self.baseline_metrics.elapsed_compute().timer();
         let batch = self.generator.write().generate_next_batch();
 
-        match batch {
-            Ok(Some(batch)) => Poll::Ready(Some(Ok(batch))),
+        let poll = match batch {
+            Ok(Some(batch)) => {
+                // return just the columns requested
+                let batch = match self.projection.as_ref() {
+                    Some(columns) => batch.project(columns)?,
+                    None => batch,
+                };
+                Poll::Ready(Some(Ok(batch)))
+            }
             Ok(None) => Poll::Ready(None),
             Err(e) => Poll::Ready(Some(Err(e))),
-        }
+        };
+
+        self.baseline_metrics.record_poll(poll)
     }
 }
 
@@ -304,6 +430,7 @@ impl RecordBatchStream for LazyMemoryStream {
 #[cfg(test)]
 mod lazy_memory_tests {
     use super::*;
+    use crate::common::collect;
     use arrow::array::Int64Array;
     use arrow::datatypes::{DataType, Field, Schema};
     use futures::StreamExt;
@@ -327,6 +454,10 @@ mod lazy_memory_tests {
     }
 
     impl LazyBatchGenerator for TestGenerator {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
         fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>> {
             if self.counter >= self.max_batches {
                 return Ok(None);
@@ -342,6 +473,15 @@ mod lazy_memory_tests {
                 vec![Arc::new(array)],
             )?))
         }
+
+        fn reset_state(&self) -> Arc<RwLock<dyn LazyBatchGenerator>> {
+            Arc::new(RwLock::new(TestGenerator {
+                counter: 0,
+                max_batches: self.max_batches,
+                batch_size: self.batch_size,
+                schema: Arc::clone(&self.schema),
+            }))
+        }
     }
 
     #[tokio::test]
@@ -419,4 +559,72 @@ mod lazy_memory_tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_generate_series_metrics_integration() -> Result<()> {
+        // Test LazyMemoryExec metrics with different configurations
+        let test_cases = vec![
+            (10, 2, 10),    // 10 rows, batch size 2, expected 10 rows
+            (100, 10, 100), // 100 rows, batch size 10, expected 100 rows
+            (5, 1, 5),      // 5 rows, batch size 1, expected 5 rows
+        ];
+
+        for (total_rows, batch_size, expected_rows) in test_cases {
+            let schema =
+                Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
+            let generator = TestGenerator {
+                counter: 0,
+                max_batches: (total_rows + batch_size - 1) / batch_size, // ceiling division
+                batch_size: batch_size as usize,
+                schema: Arc::clone(&schema),
+            };
+
+            let exec =
+                LazyMemoryExec::try_new(schema, vec![Arc::new(RwLock::new(generator))])?;
+            let task_ctx = Arc::new(TaskContext::default());
+
+            let stream = exec.execute(0, task_ctx)?;
+            let batches = collect(stream).await?;
+
+            // Verify metrics exist with actual expected numbers
+            let metrics = exec.metrics().unwrap();
+
+            // Count actual rows returned
+            let actual_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+            assert_eq!(actual_rows, expected_rows);
+
+            // Verify metrics match actual output
+            assert_eq!(metrics.output_rows().unwrap(), expected_rows);
+            assert!(metrics.elapsed_compute().unwrap() > 0);
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_lazy_memory_exec_reset_state() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
+        let generator = TestGenerator {
+            counter: 0,
+            max_batches: 3,
+            batch_size: 2,
+            schema: Arc::clone(&schema),
+        };
+
+        let exec = Arc::new(LazyMemoryExec::try_new(
+            schema,
+            vec![Arc::new(RwLock::new(generator))],
+        )?);
+        let stream = exec.execute(0, Arc::new(TaskContext::default()))?;
+        let batches = collect(stream).await?;
+
+        let exec_reset = exec.reset_state()?;
+        let stream = exec_reset.execute(0, Arc::new(TaskContext::default()))?;
+        let batches_reset = collect(stream).await?;
+
+        // if the reset_state is not correct, the batches_reset will be empty
+        assert_eq!(batches, batches_reset);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/metrics.rs b/datafusion/physical-plan/src/metrics.rs
new file mode 100644
index 0000000000000..fe17cbdd4a2c2
--- /dev/null
+++ b/datafusion/physical-plan/src/metrics.rs
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metrics live in `datafusion-physical-expr-common`; this module re-exports
+//! them to keep the public APIs stable.
+
+pub use datafusion_physical_expr_common::metrics::*;
diff --git a/datafusion/physical-plan/src/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs
deleted file mode 100644
index 249cd5edb1333..0000000000000
--- a/datafusion/physical-plan/src/metrics/value.rs
+++ /dev/null
@@ -1,671 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Value representation of metrics
-
-use std::{
-    borrow::{Borrow, Cow},
-    fmt::Display,
-    sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc,
-    },
-    time::Duration,
-};
-
-use chrono::{DateTime, Utc};
-use datafusion_common::instant::Instant;
-use datafusion_execution::memory_pool::human_readable_size;
-use parking_lot::Mutex;
-
-/// A counter to record things such as number of input or output rows
-///
-/// Note `clone`ing counters update the same underlying metrics
-#[derive(Debug, Clone)]
-pub struct Count {
-    /// value of the metric counter
-    value: Arc<AtomicUsize>,
-}
-
-impl PartialEq for Count {
-    fn eq(&self, other: &Self) -> bool {
-        self.value().eq(&other.value())
-    }
-}
-
-impl Display for Count {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{}", self.value())
-    }
-}
-
-impl Default for Count {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Count {
-    /// create a new counter
-    pub fn new() -> Self {
-        Self {
-            value: Arc::new(AtomicUsize::new(0)),
-        }
-    }
-
-    /// Add `n` to the metric's value
-    pub fn add(&self, n: usize) {
-        // relaxed ordering for operations on `value` poses no issues
-        // we're purely using atomic ops with no associated memory ops
-        self.value.fetch_add(n, Ordering::Relaxed);
-    }
-
-    /// Get the current value
-    pub fn value(&self) -> usize {
-        self.value.load(Ordering::Relaxed)
-    }
-}
-
-/// A gauge is the simplest metrics type. It just returns a value.
-/// For example, you can easily expose current memory consumption with a gauge.
-///
-/// Note `clone`ing gauge update the same underlying metrics
-#[derive(Debug, Clone)]
-pub struct Gauge {
-    /// value of the metric gauge
-    value: Arc<AtomicUsize>,
-}
-
-impl PartialEq for Gauge {
-    fn eq(&self, other: &Self) -> bool {
-        self.value().eq(&other.value())
-    }
-}
-
-impl Display for Gauge {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{}", self.value())
-    }
-}
-
-impl Default for Gauge {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Gauge {
-    /// create a new gauge
-    pub fn new() -> Self {
-        Self {
-            value: Arc::new(AtomicUsize::new(0)),
-        }
-    }
-
-    /// Add `n` to the metric's value
-    pub fn add(&self, n: usize) {
-        // relaxed ordering for operations on `value` poses no issues
-        // we're purely using atomic ops with no associated memory ops
-        self.value.fetch_add(n, Ordering::Relaxed);
-    }
-
-    /// Sub `n` from the metric's value
-    pub fn sub(&self, n: usize) {
-        // relaxed ordering for operations on `value` poses no issues
-        // we're purely using atomic ops with no associated memory ops
-        self.value.fetch_sub(n, Ordering::Relaxed);
-    }
-
-    /// Set metric's value to maximum of `n` and current value
-    pub fn set_max(&self, n: usize) {
-        self.value.fetch_max(n, Ordering::Relaxed);
-    }
-
-    /// Set the metric's value to `n` and return the previous value
-    pub fn set(&self, n: usize) -> usize {
-        // relaxed ordering for operations on `value` poses no issues
-        // we're purely using atomic ops with no associated memory ops
-        self.value.swap(n, Ordering::Relaxed)
-    }
-
-    /// Get the current value
-    pub fn value(&self) -> usize {
-        self.value.load(Ordering::Relaxed)
-    }
-}
-
-/// Measure a potentially non contiguous duration of time
-#[derive(Debug, Clone)]
-pub struct Time {
-    /// elapsed time, in nanoseconds
-    nanos: Arc<AtomicUsize>,
-}
-
-impl Default for Time {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl PartialEq for Time {
-    fn eq(&self, other: &Self) -> bool {
-        self.value().eq(&other.value())
-    }
-}
-
-impl Display for Time {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let duration = Duration::from_nanos(self.value() as u64);
-        write!(f, "{duration:?}")
-    }
-}
-
-impl Time {
-    /// Create a new [`Time`] wrapper suitable for recording elapsed
-    /// times for operations.
-    pub fn new() -> Self {
-        Self {
-            nanos: Arc::new(AtomicUsize::new(0)),
-        }
-    }
-
-    /// Add elapsed nanoseconds since `start`to self
-    pub fn add_elapsed(&self, start: Instant) {
-        self.add_duration(start.elapsed());
-    }
-
-    /// Add duration of time to self
-    ///
-    /// Note: this will always increment the recorded time by at least 1 nanosecond
-    /// to distinguish between the scenario of no values recorded, in which
-    /// case the value will be 0, and no measurable amount of time having passed,
-    /// in which case the value will be small but not 0.
-    ///
-    /// This is based on the assumption that the timing logic in most cases is likely
-    /// to take at least a nanosecond, and so this is reasonable mechanism to avoid
-    /// ambiguity, especially on systems with low-resolution monotonic clocks
-    pub fn add_duration(&self, duration: Duration) {
-        let more_nanos = duration.as_nanos() as usize;
-        self.nanos.fetch_add(more_nanos.max(1), Ordering::Relaxed);
-    }
-
-    /// Add the number of nanoseconds of other `Time` to self
-    pub fn add(&self, other: &Time) {
-        self.add_duration(Duration::from_nanos(other.value() as u64))
-    }
-
-    /// return a scoped guard that adds the amount of time elapsed
-    /// between its creation and its drop or call to `stop` to the
-    /// underlying metric.
-    pub fn timer(&self) -> ScopedTimerGuard<'_> {
-        ScopedTimerGuard {
-            inner: self,
-            start: Some(Instant::now()),
-        }
-    }
-
-    /// Get the number of nanoseconds record by this Time metric
-    pub fn value(&self) -> usize {
-        self.nanos.load(Ordering::Relaxed)
-    }
-}
-
-/// Stores a single timestamp, stored as the number of nanoseconds
-/// elapsed from Jan 1, 1970 UTC
-#[derive(Debug, Clone)]
-pub struct Timestamp {
-    /// Time thing started
-    timestamp: Arc<Mutex<Option<DateTime<Utc>>>>,
-}
-
-impl Default for Timestamp {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Timestamp {
-    /// Create a new timestamp and sets its value to 0
-    pub fn new() -> Self {
-        Self {
-            timestamp: Arc::new(Mutex::new(None)),
-        }
-    }
-
-    /// Sets the timestamps value to the current time
-    pub fn record(&self) {
-        self.set(Utc::now())
-    }
-
-    /// Sets the timestamps value to a specified time
-    pub fn set(&self, now: DateTime<Utc>) {
-        *self.timestamp.lock() = Some(now);
-    }
-
-    /// return the timestamps value at the last time `record()` was
-    /// called.
-    ///
-    /// Returns `None` if `record()` has not been called
-    pub fn value(&self) -> Option<DateTime<Utc>> {
-        *self.timestamp.lock()
-    }
-
-    /// sets the value of this timestamp to the minimum of this and other
-    pub fn update_to_min(&self, other: &Timestamp) {
-        let min = match (self.value(), other.value()) {
-            (None, None) => None,
-            (Some(v), None) => Some(v),
-            (None, Some(v)) => Some(v),
-            (Some(v1), Some(v2)) => Some(if v1 < v2 { v1 } else { v2 }),
-        };
-
-        *self.timestamp.lock() = min;
-    }
-
-    /// sets the value of this timestamp to the maximum of this and other
-    pub fn update_to_max(&self, other: &Timestamp) {
-        let max = match (self.value(), other.value()) {
-            (None, None) => None,
-            (Some(v), None) => Some(v),
-            (None, Some(v)) => Some(v),
-            (Some(v1), Some(v2)) => Some(if v1 < v2 { v2 } else { v1 }),
-        };
-
-        *self.timestamp.lock() = max;
-    }
-}
-
-impl PartialEq for Timestamp {
-    fn eq(&self, other: &Self) -> bool {
-        self.value().eq(&other.value())
-    }
-}
-
-impl Display for Timestamp {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self.value() {
-            None => write!(f, "NONE"),
-            Some(v) => {
-                write!(f, "{v}")
-            }
-        }
-    }
-}
-
-/// RAAI structure that adds all time between its construction and
-/// destruction to the CPU time or the first call to `stop` whichever
-/// comes first
-pub struct ScopedTimerGuard<'a> {
-    inner: &'a Time,
-    start: Option<Instant>,
-}
-
-impl ScopedTimerGuard<'_> {
-    /// Stop the timer timing and record the time taken
-    pub fn stop(&mut self) {
-        if let Some(start) = self.start.take() {
-            self.inner.add_elapsed(start)
-        }
-    }
-
-    /// Restarts the timer recording from the current time
-    pub fn restart(&mut self) {
-        self.start = Some(Instant::now())
-    }
-
-    /// Stop the timer, record the time taken and consume self
-    pub fn done(mut self) {
-        self.stop()
-    }
-}
-
-impl Drop for ScopedTimerGuard<'_> {
-    fn drop(&mut self) {
-        self.stop()
-    }
-}
-
-/// Possible values for a [super::Metric].
-///
-/// Among other differences, the metric types have different ways to
-/// logically interpret their underlying values and some metrics are
-/// so common they are given special treatment.
-#[derive(Debug, Clone, PartialEq)]
-pub enum MetricValue {
-    /// Number of output rows produced: "output_rows" metric
-    OutputRows(Count),
-    /// Elapsed Compute Time: the wall clock time spent in "cpu
-    /// intensive" work.
-    ///
-    /// This measurement represents, roughly:
-    /// ```
-    /// use std::time::Instant;
-    /// let start = Instant::now();
-    /// // ...CPU intensive work here...
-    /// let elapsed_compute = (Instant::now() - start).as_nanos();
-    /// ```
-    ///
-    /// Note 1: Does *not* include time other operators spend
-    /// computing input.
-    ///
-    /// Note 2: *Does* includes time when the thread could have made
-    /// progress but the OS did not schedule it (e.g. due to CPU
-    /// contention), thus making this value different than the
-    /// classical definition of "cpu_time", which is the time reported
-    /// from `clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..)`.
-    ElapsedCompute(Time),
-    /// Number of spills produced: "spill_count" metric
-    SpillCount(Count),
-    /// Total size of spilled bytes produced: "spilled_bytes" metric
-    SpilledBytes(Count),
-    /// Total size of spilled rows produced: "spilled_rows" metric
-    SpilledRows(Count),
-    /// Current memory used
-    CurrentMemoryUsage(Gauge),
-    /// Operator defined count.
-    Count {
-        /// The provided name of this metric
-        name: Cow<'static, str>,
-        /// The value of the metric
-        count: Count,
-    },
-    /// Operator defined gauge.
-    Gauge {
-        /// The provided name of this metric
-        name: Cow<'static, str>,
-        /// The value of the metric
-        gauge: Gauge,
-    },
-    /// Operator defined time
-    Time {
-        /// The provided name of this metric
-        name: Cow<'static, str>,
-        /// The value of the metric
-        time: Time,
-    },
-    /// The time at which execution started
-    StartTimestamp(Timestamp),
-    /// The time at which execution ended
-    EndTimestamp(Timestamp),
-}
-
-impl MetricValue {
-    /// Return the name of this SQL metric
-    pub fn name(&self) -> &str {
-        match self {
-            Self::OutputRows(_) => "output_rows",
-            Self::SpillCount(_) => "spill_count",
-            Self::SpilledBytes(_) => "spilled_bytes",
-            Self::SpilledRows(_) => "spilled_rows",
-            Self::CurrentMemoryUsage(_) => "mem_used",
-            Self::ElapsedCompute(_) => "elapsed_compute",
-            Self::Count { name, .. } => name.borrow(),
-            Self::Gauge { name, .. } => name.borrow(),
-            Self::Time { name, .. } => name.borrow(),
-            Self::StartTimestamp(_) => "start_timestamp",
-            Self::EndTimestamp(_) => "end_timestamp",
-        }
-    }
-
-    /// Return the value of the metric as a usize value
-    pub fn as_usize(&self) -> usize {
-        match self {
-            Self::OutputRows(count) => count.value(),
-            Self::SpillCount(count) => count.value(),
-            Self::SpilledBytes(bytes) => bytes.value(),
-            Self::SpilledRows(count) => count.value(),
-            Self::CurrentMemoryUsage(used) => used.value(),
-            Self::ElapsedCompute(time) => time.value(),
-            Self::Count { count, .. } => count.value(),
-            Self::Gauge { gauge, .. } => gauge.value(),
-            Self::Time { time, .. } => time.value(),
-            Self::StartTimestamp(timestamp) => timestamp
-                .value()
-                .and_then(|ts| ts.timestamp_nanos_opt())
-                .map(|nanos| nanos as usize)
-                .unwrap_or(0),
-            Self::EndTimestamp(timestamp) => timestamp
-                .value()
-                .and_then(|ts| ts.timestamp_nanos_opt())
-                .map(|nanos| nanos as usize)
-                .unwrap_or(0),
-        }
-    }
-
-    /// create a new MetricValue with the same type as `self` suitable
-    /// for accumulating
-    pub fn new_empty(&self) -> Self {
-        match self {
-            Self::OutputRows(_) => Self::OutputRows(Count::new()),
-            Self::SpillCount(_) => Self::SpillCount(Count::new()),
-            Self::SpilledBytes(_) => Self::SpilledBytes(Count::new()),
-            Self::SpilledRows(_) => Self::SpilledRows(Count::new()),
-            Self::CurrentMemoryUsage(_) => Self::CurrentMemoryUsage(Gauge::new()),
-            Self::ElapsedCompute(_) => Self::ElapsedCompute(Time::new()),
-            Self::Count { name, .. } => Self::Count {
-                name: name.clone(),
-                count: Count::new(),
-            },
-            Self::Gauge { name, .. } => Self::Gauge {
-                name: name.clone(),
-                gauge: Gauge::new(),
-            },
-            Self::Time { name, .. } => Self::Time {
-                name: name.clone(),
-                time: Time::new(),
-            },
-            Self::StartTimestamp(_) => Self::StartTimestamp(Timestamp::new()),
-            Self::EndTimestamp(_) => Self::EndTimestamp(Timestamp::new()),
-        }
-    }
-
-    /// Aggregates the value of other to `self`. panic's if the types
-    /// are mismatched or aggregating does not make sense for this
-    /// value
-    ///
-    /// Note this is purposely marked `mut` (even though atomics are
-    /// used) so Rust's type system can be used to ensure the
-    /// appropriate API access. `MetricValues` should be modified
-    /// using the original [`Count`] or [`Time`] they were created
-    /// from.
-    pub fn aggregate(&mut self, other: &Self) {
-        match (self, other) {
-            (Self::OutputRows(count), Self::OutputRows(other_count))
-            | (Self::SpillCount(count), Self::SpillCount(other_count))
-            | (Self::SpilledBytes(count), Self::SpilledBytes(other_count))
-            | (Self::SpilledRows(count), Self::SpilledRows(other_count))
-            | (
-                Self::Count { count, .. },
-                Self::Count {
-                    count: other_count, ..
-                },
-            ) => count.add(other_count.value()),
-            (Self::CurrentMemoryUsage(gauge), Self::CurrentMemoryUsage(other_gauge))
-            | (
-                Self::Gauge { gauge, .. },
-                Self::Gauge {
-                    gauge: other_gauge, ..
-                },
-            ) => gauge.add(other_gauge.value()),
-            (Self::ElapsedCompute(time), Self::ElapsedCompute(other_time))
-            | (
-                Self::Time { time, .. },
-                Self::Time {
-                    time: other_time, ..
-                },
-            ) => time.add(other_time),
-            // timestamps are aggregated by min/max
-            (Self::StartTimestamp(timestamp), Self::StartTimestamp(other_timestamp)) => {
-                timestamp.update_to_min(other_timestamp);
-            }
-            // timestamps are aggregated by min/max
-            (Self::EndTimestamp(timestamp), Self::EndTimestamp(other_timestamp)) => {
-                timestamp.update_to_max(other_timestamp);
-            }
-            m @ (_, _) => {
-                panic!(
-                    "Mismatched metric types. Can not aggregate {:?} with value {:?}",
-                    m.0, m.1
-                )
-            }
-        }
-    }
-
-    /// Returns a number by which to sort metrics by display. Lower
-    /// numbers are "more useful" (and displayed first)
-    pub fn display_sort_key(&self) -> u8 {
-        match self {
-            Self::OutputRows(_) => 0,     // show first
-            Self::ElapsedCompute(_) => 1, // show second
-            Self::SpillCount(_) => 2,
-            Self::SpilledBytes(_) => 3,
-            Self::SpilledRows(_) => 4,
-            Self::CurrentMemoryUsage(_) => 5,
-            Self::Count { .. } => 6,
-            Self::Gauge { .. } => 7,
-            Self::Time { .. } => 8,
-            Self::StartTimestamp(_) => 9, // show timestamps last
-            Self::EndTimestamp(_) => 10,
-        }
-    }
-
-    /// returns true if this metric has a timestamp value
-    pub fn is_timestamp(&self) -> bool {
-        matches!(self, Self::StartTimestamp(_) | Self::EndTimestamp(_))
-    }
-}
-
-impl Display for MetricValue {
-    /// Prints the value of this metric
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            Self::OutputRows(count)
-            | Self::SpillCount(count)
-            | Self::SpilledRows(count)
-            | Self::Count { count, .. } => {
-                write!(f, "{count}")
-            }
-            Self::SpilledBytes(count) => {
-                let readable_count = human_readable_size(count.value());
-                write!(f, "{readable_count}")
-            }
-            Self::CurrentMemoryUsage(gauge) | Self::Gauge { gauge, .. } => {
-                write!(f, "{gauge}")
-            }
-            Self::ElapsedCompute(time) | Self::Time { time, .. } => {
-                // distinguish between no time recorded and very small
-                // amount of time recorded
-                if time.value() > 0 {
-                    write!(f, "{time}")
-                } else {
-                    write!(f, "NOT RECORDED")
-                }
-            }
-            Self::StartTimestamp(timestamp) | Self::EndTimestamp(timestamp) => {
-                write!(f, "{timestamp}")
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use chrono::TimeZone;
-    use datafusion_execution::memory_pool::units::MB;
-
-    use super::*;
-
-    #[test]
-    fn test_display_output_rows() {
-        let count = Count::new();
-        let values = vec![
-            MetricValue::OutputRows(count.clone()),
-            MetricValue::Count {
-                name: "my_counter".into(),
-                count: count.clone(),
-            },
-        ];
-
-        for value in &values {
-            assert_eq!("0", value.to_string(), "value {value:?}");
-        }
-
-        count.add(42);
-        for value in &values {
-            assert_eq!("42", value.to_string(), "value {value:?}");
-        }
-    }
-
-    #[test]
-    fn test_display_spilled_bytes() {
-        let count = Count::new();
-        let spilled_byte = MetricValue::SpilledBytes(count.clone());
-
-        assert_eq!("0.0 B", spilled_byte.to_string());
-
-        count.add((100 * MB) as usize);
-        assert_eq!("100.0 MB", spilled_byte.to_string());
-
-        count.add((0.5 * MB as f64) as usize);
-        assert_eq!("100.5 MB", spilled_byte.to_string());
-    }
-
-    #[test]
-    fn test_display_time() {
-        let time = Time::new();
-        let values = vec![
-            MetricValue::ElapsedCompute(time.clone()),
-            MetricValue::Time {
-                name: "my_time".into(),
-                time: time.clone(),
-            },
-        ];
-
-        // if time is not set, it should not be reported as zero
-        for value in &values {
-            assert_eq!("NOT RECORDED", value.to_string(), "value {value:?}");
-        }
-
-        time.add_duration(Duration::from_nanos(1042));
-        for value in &values {
-            assert_eq!("1.042µs", value.to_string(), "value {value:?}");
-        }
-    }
-
-    #[test]
-    fn test_display_timestamp() {
-        let timestamp = Timestamp::new();
-        let values = vec![
-            MetricValue::StartTimestamp(timestamp.clone()),
-            MetricValue::EndTimestamp(timestamp.clone()),
-        ];
-
-        // if time is not set, it should not be reported as zero
-        for value in &values {
-            assert_eq!("NONE", value.to_string(), "value {value:?}");
-        }
-
-        timestamp.set(Utc.timestamp_nanos(1431648000000000));
-        for value in &values {
-            assert_eq!(
-                "1970-01-17 13:40:48 UTC",
-                value.to_string(),
-                "value {value:?}"
-            );
-        }
-    }
-}
diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs
index 46847b2413c0e..eaa895c821837 100644
--- a/datafusion/physical-plan/src/placeholder_row.rs
+++ b/datafusion/physical-plan/src/placeholder_row.rs
@@ -20,16 +20,21 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use crate::execution_plan::{Boundedness, EmissionType};
+use crate::coop::cooperative;
+use crate::execution_plan::{Boundedness, EmissionType, SchedulingType};
 use crate::memory::MemoryStream;
-use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics};
-use crate::{DisplayFormatType, ExecutionPlan, Partitioning};
-use arrow::array::{ArrayRef, NullArray};
-use arrow::array::{RecordBatch, RecordBatchOptions};
+use crate::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    SendableRecordBatchStream, Statistics, common,
+};
+
+use arrow::array::{ArrayRef, NullArray, RecordBatch, RecordBatchOptions};
 use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
-use datafusion_common::{internal_err, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_or_internal_err};
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::PhysicalExpr;
 
 use log::trace;
 
@@ -40,7 +45,7 @@ pub struct PlaceholderRowExec {
     schema: SchemaRef,
     /// Number of partitions
     partitions: usize,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl PlaceholderRowExec {
@@ -51,7 +56,7 @@ impl PlaceholderRowExec {
         PlaceholderRowExec {
             schema,
             partitions,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -60,7 +65,7 @@ impl PlaceholderRowExec {
         self.partitions = partitions;
         // Update output partitioning when updating partitions:
         let output_partitioning = Self::output_partitioning_helper(self.partitions);
-        self.cache = self.cache.with_partitioning(output_partitioning);
+        Arc::make_mut(&mut self.cache).partitioning = output_partitioning;
         self
     }
 
@@ -99,6 +104,7 @@ impl PlaceholderRowExec {
             EmissionType::Incremental,
             Boundedness::Bounded,
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
     }
 }
 
@@ -128,7 +134,7 @@ impl ExecutionPlan for PlaceholderRowExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -136,6 +142,13 @@ impl ExecutionPlan for PlaceholderRowExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -148,39 +161,39 @@ impl ExecutionPlan for PlaceholderRowExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start PlaceholderRowExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
-
-        if partition >= self.partitions {
-            return internal_err!(
-                "PlaceholderRowExec invalid partition {} (expected less than {})",
-                partition,
-                self.partitions
-            );
-        }
+        trace!(
+            "Start PlaceholderRowExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
 
-        Ok(Box::pin(MemoryStream::try_new(
-            self.data()?,
-            Arc::clone(&self.schema),
-            None,
-        )?))
-    }
+        assert_or_internal_err!(
+            partition < self.partitions,
+            "PlaceholderRowExec invalid partition {partition} (expected less than {})",
+            self.partitions
+        );
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
+        let ms = MemoryStream::try_new(self.data()?, Arc::clone(&self.schema), None)?;
+        Ok(Box::pin(cooperative(ms)))
     }
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
-        }
-        let batch = self
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let batches = self
             .data()
             .expect("Create single row placeholder RecordBatch should not fail");
-        Ok(common::compute_record_batch_statistics(
-            &[batch],
+
+        let batches = match partition {
+            Some(_) => vec![batches],
+            // entire plan
+            None => vec![batches; self.partitions],
+        };
+
+        Ok(Arc::new(common::compute_record_batch_statistics(
+            &batches,
             &self.schema,
             None,
-        ))
+        )))
     }
 }
 
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index f1621acd0debb..a4cce0436b10e 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -20,99 +20,158 @@
 //! of a projection on table `t1` where the expressions `a`, `b`, and `a+b` are the
 //! projection expressions. `SELECT` without `FROM` will only evaluate expressions.
 
-use std::any::Any;
-use std::collections::HashMap;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{Context, Poll};
-
-use super::expressions::{Column, Literal};
+use super::expressions::Column;
 use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use super::{
     DisplayAs, ExecutionPlanProperties, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics,
+    SendableRecordBatchStream, SortOrderPushdownResult, Statistics,
 };
+use crate::column_rewriter::PhysicalColumnRewriter;
 use crate::execution_plan::CardinalityEffect;
+use crate::filter_pushdown::{
+    ChildFilterDescription, ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation, FilterRemapper, PushedDownPredicate,
+};
 use crate::joins::utils::{ColumnIndex, JoinFilter, JoinOn, JoinOnRef};
-use crate::{ColumnStatistics, DisplayFormatType, ExecutionPlan, PhysicalExpr};
+use crate::{DisplayFormatType, ExecutionPlan, PhysicalExpr, check_if_same_properties};
+use std::any::Any;
+use std::collections::HashMap;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
 
-use arrow::datatypes::{Field, Schema, SchemaRef};
-use arrow::record_batch::{RecordBatch, RecordBatchOptions};
-use datafusion_common::stats::Precision;
+use arrow::datatypes::SchemaRef;
+use arrow::record_batch::RecordBatch;
+use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
-use datafusion_common::{internal_err, JoinSide, Result};
+use datafusion_common::{DataFusionError, JoinSide, Result, internal_err};
 use datafusion_execution::TaskContext;
+use datafusion_expr::ExpressionPlacement;
 use datafusion_physical_expr::equivalence::ProjectionMapping;
+use datafusion_physical_expr::projection::Projector;
 use datafusion_physical_expr::utils::collect_columns;
-use datafusion_physical_expr::PhysicalExprRef;
+use datafusion_physical_expr_common::physical_expr::{PhysicalExprRef, fmt_sql};
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, LexRequirement, PhysicalSortExpr,
+};
+// Re-exported from datafusion-physical-expr for backwards compatibility
+// We recommend updating your imports to use datafusion-physical-expr directly
+pub use datafusion_physical_expr::projection::{
+    ProjectionExpr, ProjectionExprs, update_expr,
+};
 
-use datafusion_physical_expr_common::physical_expr::fmt_sql;
 use futures::stream::{Stream, StreamExt};
-use itertools::Itertools;
 use log::trace;
 
-/// Execution plan for a projection
+/// [`ExecutionPlan`] for a projection
+///
+/// Computes a set of scalar value expressions for each input row, producing one
+/// output row for each input row.
 #[derive(Debug, Clone)]
 pub struct ProjectionExec {
-    /// The projection expressions stored as tuples of (expression, output column name)
-    pub(crate) expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
-    /// The schema once the projection has been applied to the input
-    schema: SchemaRef,
+    /// A projector specialized to apply the projection to the input schema from the child node
+    /// and produce [`RecordBatch`]es with the output schema of this node.
+    projector: Projector,
     /// The input plan
     input: Arc<dyn ExecutionPlan>,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl ProjectionExec {
     /// Create a projection on an input
-    pub fn try_new(
-        expr: Vec<(Arc<dyn PhysicalExpr>, String)>,
-        input: Arc<dyn ExecutionPlan>,
-    ) -> Result<Self> {
+    ///
+    /// # Example:
+    /// Create a `ProjectionExec` to crate `SELECT a, a+b AS sum_ab FROM t1`:
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use arrow_schema::{Schema, Field, DataType};
+    /// # use datafusion_expr::Operator;
+    /// # use datafusion_physical_plan::ExecutionPlan;
+    /// # use datafusion_physical_expr::expressions::{col, binary};
+    /// # use datafusion_physical_plan::empty::EmptyExec;
+    /// # use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
+    /// # fn schema() -> Arc<Schema> {
+    /// #  Arc::new(Schema::new(vec![
+    /// #   Field::new("a", DataType::Int32, false),
+    /// #   Field::new("b", DataType::Int32, false),
+    /// # ]))
+    /// # }
+    /// #
+    /// # fn input() -> Arc<dyn ExecutionPlan> {
+    /// #  Arc::new(EmptyExec::new(schema()))
+    /// # }
+    /// #
+    /// # fn main() {
+    /// let schema = schema();
+    /// // Create PhysicalExprs
+    /// let a = col("a", &schema).unwrap();
+    /// let b = col("b", &schema).unwrap();
+    /// let a_plus_b = binary(Arc::clone(&a), Operator::Plus, b, &schema).unwrap();
+    /// // create ProjectionExec
+    /// let proj = ProjectionExec::try_new(
+    ///     [
+    ///         ProjectionExpr {
+    ///             // expr a produces the column named "a"
+    ///             expr: a,
+    ///             alias: "a".to_string(),
+    ///         },
+    ///         ProjectionExpr {
+    ///             // expr: a + b produces the column named "sum_ab"
+    ///             expr: a_plus_b,
+    ///             alias: "sum_ab".to_string(),
+    ///         },
+    ///     ],
+    ///     input(),
+    /// )
+    /// .unwrap();
+    /// # }
+    /// ```
+    pub fn try_new<I, E>(expr: I, input: Arc<dyn ExecutionPlan>) -> Result<Self>
+    where
+        I: IntoIterator<Item = E>,
+        E: Into<ProjectionExpr>,
+    {
         let input_schema = input.schema();
+        let expr_arc = expr.into_iter().map(Into::into).collect::<Arc<_>>();
+        let projection = ProjectionExprs::from_expressions(expr_arc);
+        let projector = projection.make_projector(&input_schema)?;
+        Self::try_from_projector(projector, input)
+    }
 
-        let fields: Result<Vec<Field>> = expr
-            .iter()
-            .map(|(e, name)| {
-                let metadata = e.return_field(&input_schema)?.metadata().clone();
-
-                let field = Field::new(
-                    name,
-                    e.data_type(&input_schema)?,
-                    e.nullable(&input_schema)?,
-                )
-                .with_metadata(metadata);
-
-                Ok(field)
-            })
-            .collect();
-
-        let schema = Arc::new(Schema::new_with_metadata(
-            fields?,
-            input_schema.metadata().clone(),
-        ));
-
+    fn try_from_projector(
+        projector: Projector,
+        input: Arc<dyn ExecutionPlan>,
+    ) -> Result<Self> {
         // Construct a map from the input expressions to the output expression of the Projection
-        let projection_mapping = ProjectionMapping::try_new(&expr, &input_schema)?;
-        let cache =
-            Self::compute_properties(&input, &projection_mapping, Arc::clone(&schema))?;
+        let projection_mapping =
+            projector.projection().projection_mapping(&input.schema())?;
+        let cache = Self::compute_properties(
+            &input,
+            &projection_mapping,
+            Arc::clone(projector.output_schema()),
+        )?;
         Ok(Self {
-            expr,
-            schema,
+            projector,
             input,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
         })
     }
 
     /// The projection expressions stored as tuples of (expression, output column name)
-    pub fn expr(&self) -> &[(Arc<dyn PhysicalExpr>, String)] {
-        &self.expr
+    pub fn expr(&self) -> &[ProjectionExpr] {
+        self.projector.projection().as_ref()
+    }
+
+    /// The projection expressions as a [`ProjectionExprs`].
+    pub fn projection_expr(&self) -> &ProjectionExprs {
+        self.projector.projection()
     }
 
     /// The input plan
@@ -127,14 +186,12 @@ impl ProjectionExec {
         schema: SchemaRef,
     ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
-        let mut input_eq_properties = input.equivalence_properties().clone();
-        input_eq_properties.substitute_oeq_class(projection_mapping)?;
+        let input_eq_properties = input.equivalence_properties();
         let eq_properties = input_eq_properties.project(projection_mapping, schema);
-
         // Calculate output partitioning, which needs to respect aliases:
-        let input_partition = input.output_partitioning();
-        let output_partitioning =
-            input_partition.project(projection_mapping, &input_eq_properties);
+        let output_partitioning = input
+            .output_partitioning()
+            .project(projection_mapping, input_eq_properties);
 
         Ok(PlanProperties::new(
             eq_properties,
@@ -143,6 +200,40 @@ impl ProjectionExec {
             input.boundedness(),
         ))
     }
+
+    /// Collect reverse alias mapping from projection expressions.
+    /// The result hash map is a map from aliased Column in parent to original expr.
+    fn collect_reverse_alias(
+        &self,
+    ) -> Result<datafusion_common::HashMap<Column, Arc<dyn PhysicalExpr>>> {
+        let mut alias_map = datafusion_common::HashMap::new();
+        for projection in self.projection_expr().iter() {
+            let (aliased_index, _output_field) = self
+                .projector
+                .output_schema()
+                .column_with_name(&projection.alias)
+                .ok_or_else(|| {
+                    DataFusionError::Internal(format!(
+                        "Expr {} with alias {} not found in output schema",
+                        projection.expr, projection.alias
+                    ))
+                })?;
+            let aliased_col = Column::new(&projection.alias, aliased_index);
+            alias_map.insert(aliased_col, Arc::clone(&projection.expr));
+        }
+        Ok(alias_map)
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for ProjectionExec {
@@ -154,12 +245,14 @@ impl DisplayAs for ProjectionExec {
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
                 let expr: Vec<String> = self
-                    .expr
+                    .projector
+                    .projection()
+                    .as_ref()
                     .iter()
-                    .map(|(e, alias)| {
-                        let e = e.to_string();
-                        if &e != alias {
-                            format!("{e} as {alias}")
+                    .map(|proj_expr| {
+                        let e = proj_expr.expr.to_string();
+                        if e != proj_expr.alias {
+                            format!("{e} as {}", proj_expr.alias)
                         } else {
                             e
                         }
@@ -169,12 +262,12 @@ impl DisplayAs for ProjectionExec {
                 write!(f, "ProjectionExec: expr=[{}]", expr.join(", "))
             }
             DisplayFormatType::TreeRender => {
-                for (i, (e, alias)) in self.expr().iter().enumerate() {
-                    let expr_sql = fmt_sql(e.as_ref());
-                    if &e.to_string() == alias {
+                for (i, proj_expr) in self.expr().iter().enumerate() {
+                    let expr_sql = fmt_sql(proj_expr.expr.as_ref());
+                    if proj_expr.expr.to_string() == proj_expr.alias {
                         writeln!(f, "expr{i}={expr_sql}")?;
                     } else {
-                        writeln!(f, "{alias}={expr_sql}")?;
+                        writeln!(f, "{}={expr_sql}", proj_expr.alias)?;
                     }
                 }
 
@@ -194,7 +287,7 @@ impl ExecutionPlan for ProjectionExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -204,11 +297,19 @@ impl ExecutionPlan for ProjectionExec {
     }
 
     fn benefits_from_input_partitioning(&self) -> Vec<bool> {
-        let all_simple_exprs = self
-            .expr
-            .iter()
-            .all(|(e, _)| e.as_any().is::<Column>() || e.as_any().is::<Literal>());
-        // If expressions are all either column_expr or Literal, then all computations in this projection are reorder or rename,
+        let all_simple_exprs =
+            self.projector
+                .projection()
+                .as_ref()
+                .iter()
+                .all(|proj_expr| {
+                    !matches!(
+                        proj_expr.expr.placement(),
+                        ExpressionPlacement::KeepInPlace
+                    )
+                });
+        // If expressions are all either column_expr or Literal (or other cheap expressions),
+        // then all computations in this projection are reorder or rename,
         // and projection would not benefit from the repartition, benefits_from_input_partitioning will return false.
         vec![!all_simple_exprs]
     }
@@ -217,12 +318,27 @@ impl ExecutionPlan for ProjectionExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        let mut tnr = TreeNodeRecursion::Continue;
+        for proj_expr in self.projector.projection().as_ref().iter() {
+            tnr = tnr.visit_sibling(|| f(proj_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        ProjectionExec::try_new(self.expr.clone(), children.swap_remove(0))
-            .map(|p| Arc::new(p) as _)
+        check_if_same_properties!(self, children);
+        ProjectionExec::try_from_projector(
+            self.projector.clone(),
+            children.swap_remove(0),
+        )
+        .map(|p| Arc::new(p) as _)
     }
 
     fn execute(
@@ -230,29 +346,33 @@ impl ExecutionPlan for ProjectionExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start ProjectionExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
-        Ok(Box::pin(ProjectionStream {
-            schema: Arc::clone(&self.schema),
-            expr: self.expr.iter().map(|x| Arc::clone(&x.0)).collect(),
-            input: self.input.execute(partition, context)?,
-            baseline_metrics: BaselineMetrics::new(&self.metrics, partition),
-        }))
+        trace!(
+            "Start ProjectionExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
+
+        let projector = self.projector.with_metrics(&self.metrics, partition);
+        Ok(Box::pin(ProjectionStream::new(
+            projector,
+            self.input.execute(partition, context)?,
+            BaselineMetrics::new(&self.metrics, partition),
+        )?))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        let input_stats = self.input.partition_statistics(partition)?;
-        Ok(stats_projection(
-            input_stats,
-            self.expr.iter().map(|(e, _)| Arc::clone(e)),
-            Arc::clone(&self.schema),
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let input_stats =
+            Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        let output_schema = self.schema();
+        Ok(Arc::new(
+            self.projector
+                .projection()
+                .project_statistics(input_stats, &output_schema)?,
         ))
     }
 
@@ -276,70 +396,150 @@ impl ExecutionPlan for ProjectionExec {
             Ok(Some(Arc::new(projection.clone())))
         }
     }
-}
 
-fn stats_projection(
-    mut stats: Statistics,
-    exprs: impl Iterator<Item = Arc<dyn PhysicalExpr>>,
-    schema: SchemaRef,
-) -> Statistics {
-    let mut primitive_row_size = 0;
-    let mut primitive_row_size_possible = true;
-    let mut column_statistics = vec![];
-    for expr in exprs {
-        let col_stats = if let Some(col) = expr.as_any().downcast_ref::<Column>() {
-            stats.column_statistics[col.index()].clone()
-        } else {
-            // TODO stats: estimate more statistics from expressions
-            // (expressions should compute their statistics themselves)
-            ColumnStatistics::new_unknown()
-        };
-        column_statistics.push(col_stats);
-        if let Ok(data_type) = expr.data_type(&schema) {
-            if let Some(value) = data_type.primitive_width() {
-                primitive_row_size += value;
-                continue;
+    fn gather_filters_for_pushdown(
+        &self,
+        _phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        _config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        // expand alias column to original expr in parent filters
+        let invert_alias_map = self.collect_reverse_alias()?;
+        let output_schema = self.schema();
+        let remapper = FilterRemapper::new(output_schema);
+        let mut child_parent_filters = Vec::with_capacity(parent_filters.len());
+
+        for filter in parent_filters {
+            // Check that column exists in child, then reassign column indices to match child schema
+            if let Some(reassigned) = remapper.try_remap(&filter)? {
+                // rewrite filter expression using invert alias map
+                let mut rewriter = PhysicalColumnRewriter::new(&invert_alias_map);
+                let rewritten = reassigned.rewrite(&mut rewriter)?.data;
+                child_parent_filters.push(PushedDownPredicate::supported(rewritten));
+            } else {
+                child_parent_filters.push(PushedDownPredicate::unsupported(filter));
+            }
+        }
+
+        Ok(FilterDescription::new().with_child(ChildFilterDescription {
+            parent_filters: child_parent_filters,
+            self_filters: vec![],
+        }))
+    }
+
+    fn handle_child_pushdown_result(
+        &self,
+        _phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        let child = self.input();
+        let mut child_order = Vec::new();
+
+        // Check and transform sort expressions
+        for sort_expr in order {
+            // Recursively transform the expression
+            let mut can_pushdown = true;
+            let transformed = Arc::clone(&sort_expr.expr).transform(|expr| {
+                if let Some(col) = expr.as_any().downcast_ref::<Column>() {
+                    // Check if column index is valid.
+                    // This should always be true but fail gracefully if it's not.
+                    if col.index() >= self.expr().len() {
+                        can_pushdown = false;
+                        return Ok(Transformed::no(expr));
+                    }
+
+                    let proj_expr = &self.expr()[col.index()];
+
+                    // Check if projection expression is a simple column
+                    // We cannot push down order by clauses that depend on
+                    // projected computations as they would have nothing to reference.
+                    if let Some(child_col) =
+                        proj_expr.expr.as_any().downcast_ref::<Column>()
+                    {
+                        // Replace with the child column
+                        Ok(Transformed::yes(Arc::new(child_col.clone()) as _))
+                    } else {
+                        // Projection involves computation, cannot push down
+                        can_pushdown = false;
+                        Ok(Transformed::no(expr))
+                    }
+                } else {
+                    Ok(Transformed::no(expr))
+                }
+            })?;
+
+            if !can_pushdown {
+                return Ok(SortOrderPushdownResult::Unsupported);
+            }
+
+            child_order.push(PhysicalSortExpr {
+                expr: transformed.data,
+                options: sort_expr.options,
+            });
+        }
+
+        // Recursively push down to child node
+        match child.try_pushdown_sort(&child_order)? {
+            SortOrderPushdownResult::Exact { inner } => {
+                let new_exec = Arc::new(self.clone()).with_new_children(vec![inner])?;
+                Ok(SortOrderPushdownResult::Exact { inner: new_exec })
+            }
+            SortOrderPushdownResult::Inexact { inner } => {
+                let new_exec = Arc::new(self.clone()).with_new_children(vec![inner])?;
+                Ok(SortOrderPushdownResult::Inexact { inner: new_exec })
+            }
+            SortOrderPushdownResult::Unsupported => {
+                Ok(SortOrderPushdownResult::Unsupported)
             }
         }
-        primitive_row_size_possible = false;
     }
 
-    if primitive_row_size_possible {
-        stats.total_byte_size =
-            Precision::Exact(primitive_row_size).multiply(&stats.num_rows);
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
     }
-    stats.column_statistics = column_statistics;
-    stats
 }
 
 impl ProjectionStream {
+    /// Create a new projection stream
+    fn new(
+        projector: Projector,
+        input: SendableRecordBatchStream,
+        baseline_metrics: BaselineMetrics,
+    ) -> Result<Self> {
+        Ok(Self {
+            projector,
+            input,
+            baseline_metrics,
+        })
+    }
+
     fn batch_project(&self, batch: &RecordBatch) -> Result<RecordBatch> {
         // Records time on drop
         let _timer = self.baseline_metrics.elapsed_compute().timer();
-        let arrays = self
-            .expr
-            .iter()
-            .map(|expr| {
-                expr.evaluate(batch)
-                    .and_then(|v| v.into_array(batch.num_rows()))
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        if arrays.is_empty() {
-            let options =
-                RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
-            RecordBatch::try_new_with_options(Arc::clone(&self.schema), arrays, &options)
-                .map_err(Into::into)
-        } else {
-            RecordBatch::try_new(Arc::clone(&self.schema), arrays).map_err(Into::into)
-        }
+        self.projector.project_batch(batch)
     }
 }
 
 /// Projection iterator
 struct ProjectionStream {
-    schema: SchemaRef,
-    expr: Vec<Arc<dyn PhysicalExpr>>,
+    projector: Projector,
     input: SendableRecordBatchStream,
     baseline_metrics: BaselineMetrics,
 }
@@ -368,10 +568,19 @@ impl Stream for ProjectionStream {
 impl RecordBatchStream for ProjectionStream {
     /// Get the schema
     fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
+        Arc::clone(self.projector.output_schema())
     }
 }
 
+/// Trait for execution plans that can embed a projection, avoiding a separate
+/// [`ProjectionExec`] wrapper.
+///
+/// # Empty projections
+///
+/// `Some(vec![])` is a valid projection that produces zero output columns while
+/// preserving the correct row count. Implementors must ensure that runtime batch
+/// construction still returns batches with the right number of rows even when no
+/// columns are selected (e.g. for `SELECT count(1) … JOIN …`).
 pub trait EmbeddedProjection: ExecutionPlan + Sized {
     fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self>;
 }
@@ -382,6 +591,15 @@ pub fn try_embed_projection<Exec: EmbeddedProjection + 'static>(
     projection: &ProjectionExec,
     execution_plan: &Exec,
 ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+    // If the projection has no expressions at all (e.g., ProjectionExec: expr=[]),
+    // embed an empty projection into the execution plan so it outputs zero columns.
+    // This avoids allocating throwaway null arrays for build-side columns
+    // when no output columns are actually needed (e.g., count(1) over a right join).
+    if projection.expr().is_empty() {
+        let new_execution_plan = Arc::new(execution_plan.with_projection(Some(vec![]))?);
+        return Ok(Some(new_execution_plan));
+    }
+
     // Collect all column indices from the given projection expressions.
     let projection_index = collect_column_indices(projection.expr());
 
@@ -404,22 +622,25 @@ pub fn try_embed_projection<Exec: EmbeddedProjection + 'static>(
     let embed_project_exprs = projection_index
         .iter()
         .zip(new_execution_plan.schema().fields())
-        .map(|(index, field)| {
-            (
-                Arc::new(Column::new(field.name(), *index)) as Arc<dyn PhysicalExpr>,
-                field.name().to_owned(),
-            )
+        .map(|(index, field)| ProjectionExpr {
+            expr: Arc::new(Column::new(field.name(), *index)) as Arc<dyn PhysicalExpr>,
+            alias: field.name().to_owned(),
         })
         .collect::<Vec<_>>();
 
     let mut new_projection_exprs = Vec::with_capacity(projection.expr().len());
 
-    for (expr, alias) in projection.expr() {
+    for proj_expr in projection.expr() {
         // update column index for projection expression since the input schema has been changed.
-        let Some(expr) = update_expr(expr, embed_project_exprs.as_slice(), false)? else {
+        let Some(expr) =
+            update_expr(&proj_expr.expr, embed_project_exprs.as_slice(), false)?
+        else {
             return Ok(None);
         };
-        new_projection_exprs.push((expr, alias.clone()));
+        new_projection_exprs.push(ProjectionExpr {
+            expr,
+            alias: proj_expr.alias.clone(),
+        });
     }
     // Old projection may contain some alias or expression such as `a + 1` and `CAST('true' AS BOOLEAN)`, but our projection_exprs in hash join just contain column, so we need to create the new projection to keep the original projection.
     let new_projection = Arc::new(ProjectionExec::try_new(
@@ -445,7 +666,7 @@ pub fn try_pushdown_through_join(
     join_left: &Arc<dyn ExecutionPlan>,
     join_right: &Arc<dyn ExecutionPlan>,
     join_on: JoinOnRef,
-    schema: SchemaRef,
+    schema: &SchemaRef,
     filter: Option<&JoinFilter>,
 ) -> Result<Option<JoinData>> {
     // Convert projected expressions to columns. We can not proceed if this is not possible.
@@ -458,7 +679,7 @@ pub fn try_pushdown_through_join(
 
     if !join_allows_pushdown(
         &projection_as_columns,
-        &schema,
+        schema,
         far_right_left_col_ind,
         far_left_right_col_ind,
     ) {
@@ -534,21 +755,23 @@ pub fn remove_unnecessary_projections(
 /// but `SELECT b, a` and `SELECT a+1, b` and `SELECT a AS c, b` are not.
 fn is_projection_removable(projection: &ProjectionExec) -> bool {
     let exprs = projection.expr();
-    exprs.iter().enumerate().all(|(idx, (expr, alias))| {
-        let Some(col) = expr.as_any().downcast_ref::<Column>() else {
+    exprs.iter().enumerate().all(|(idx, proj_expr)| {
+        let Some(col) = proj_expr.expr.as_any().downcast_ref::<Column>() else {
             return false;
         };
-        col.name() == alias && col.index() == idx
+        col.name() == proj_expr.alias && col.index() == idx
     }) && exprs.len() == projection.input().schema().fields().len()
 }
 
 /// Given the expression set of a projection, checks if the projection causes
 /// any renaming or constructs a non-`Column` physical expression.
-pub fn all_alias_free_columns(exprs: &[(Arc<dyn PhysicalExpr>, String)]) -> bool {
-    exprs.iter().all(|(expr, alias)| {
-        expr.as_any()
+pub fn all_alias_free_columns(exprs: &[ProjectionExpr]) -> bool {
+    exprs.iter().all(|proj_expr| {
+        proj_expr
+            .expr
+            .as_any()
             .downcast_ref::<Column>()
-            .map(|column| column.name() == alias)
+            .map(|column| column.name() == proj_expr.alias)
             .unwrap_or(false)
     })
 }
@@ -557,14 +780,15 @@ pub fn all_alias_free_columns(exprs: &[(Arc<dyn PhysicalExpr>, String)]) -> bool
 /// projection operator's expressions. To use this function safely, one must
 /// ensure that all expressions are `Column` expressions without aliases.
 pub fn new_projections_for_columns(
-    projection: &ProjectionExec,
+    projection: &[ProjectionExpr],
     source: &[usize],
 ) -> Vec<usize> {
     projection
-        .expr()
         .iter()
-        .filter_map(|(expr, _)| {
-            expr.as_any()
+        .filter_map(|proj_expr| {
+            proj_expr
+                .expr
+                .as_any()
                 .downcast_ref::<Column>()
                 .map(|expr| source[expr.index()])
         })
@@ -582,101 +806,61 @@ pub fn make_with_child(
 }
 
 /// Returns `true` if all the expressions in the argument are `Column`s.
-pub fn all_columns(exprs: &[(Arc<dyn PhysicalExpr>, String)]) -> bool {
-    exprs.iter().all(|(expr, _)| expr.as_any().is::<Column>())
+pub fn all_columns(exprs: &[ProjectionExpr]) -> bool {
+    exprs
+        .iter()
+        .all(|proj_expr| proj_expr.expr.as_any().is::<Column>())
 }
 
-/// The function operates in two modes:
-///
-/// 1) When `sync_with_child` is `true`:
-///
-///    The function updates the indices of `expr` if the expression resides
-///    in the input plan. For instance, given the expressions `a@1 + b@2`
-///    and `c@0` with the input schema `c@2, a@0, b@1`, the expressions are
-///    updated to `a@0 + b@1` and `c@2`.
-///
-/// 2) When `sync_with_child` is `false`:
-///
-///    The function determines how the expression would be updated if a projection
-///    was placed before the plan associated with the expression. If the expression
-///    cannot be rewritten after the projection, it returns `None`. For example,
-///    given the expressions `c@0`, `a@1` and `b@2`, and the [`ProjectionExec`] with
-///    an output schema of `a, c_new`, then `c@0` becomes `c_new@1`, `a@1` becomes
-///    `a@0`, but `b@2` results in `None` since the projection does not include `b`.
-pub fn update_expr(
-    expr: &Arc<dyn PhysicalExpr>,
-    projected_exprs: &[(Arc<dyn PhysicalExpr>, String)],
-    sync_with_child: bool,
-) -> Result<Option<Arc<dyn PhysicalExpr>>> {
-    #[derive(Debug, PartialEq)]
-    enum RewriteState {
-        /// The expression is unchanged.
-        Unchanged,
-        /// Some part of the expression has been rewritten
-        RewrittenValid,
-        /// Some part of the expression has been rewritten, but some column
-        /// references could not be.
-        RewrittenInvalid,
+/// Updates the given lexicographic ordering according to given projected
+/// expressions using the [`update_expr`] function.
+pub fn update_ordering(
+    ordering: LexOrdering,
+    projected_exprs: &[ProjectionExpr],
+) -> Result<Option<LexOrdering>> {
+    let mut updated_exprs = vec![];
+    for mut sort_expr in ordering.into_iter() {
+        let Some(updated_expr) = update_expr(&sort_expr.expr, projected_exprs, false)?
+        else {
+            return Ok(None);
+        };
+        sort_expr.expr = updated_expr;
+        updated_exprs.push(sort_expr);
     }
+    Ok(LexOrdering::new(updated_exprs))
+}
 
-    let mut state = RewriteState::Unchanged;
-
-    let new_expr = Arc::clone(expr)
-        .transform_up(|expr: Arc<dyn PhysicalExpr>| {
-            if state == RewriteState::RewrittenInvalid {
-                return Ok(Transformed::no(expr));
-            }
-
-            let Some(column) = expr.as_any().downcast_ref::<Column>() else {
-                return Ok(Transformed::no(expr));
-            };
-            if sync_with_child {
-                state = RewriteState::RewrittenValid;
-                // Update the index of `column`:
-                Ok(Transformed::yes(Arc::clone(
-                    &projected_exprs[column.index()].0,
-                )))
-            } else {
-                // default to invalid, in case we can't find the relevant column
-                state = RewriteState::RewrittenInvalid;
-                // Determine how to update `column` to accommodate `projected_exprs`
-                projected_exprs
-                    .iter()
-                    .enumerate()
-                    .find_map(|(index, (projected_expr, alias))| {
-                        projected_expr.as_any().downcast_ref::<Column>().and_then(
-                            |projected_column| {
-                                (column.name().eq(projected_column.name())
-                                    && column.index() == projected_column.index())
-                                .then(|| {
-                                    state = RewriteState::RewrittenValid;
-                                    Arc::new(Column::new(alias, index)) as _
-                                })
-                            },
-                        )
-                    })
-                    .map_or_else(
-                        || Ok(Transformed::no(expr)),
-                        |c| Ok(Transformed::yes(c)),
-                    )
-            }
-        })
-        .data();
-
-    new_expr.map(|e| (state == RewriteState::RewrittenValid).then_some(e))
+/// Updates the given lexicographic requirement according to given projected
+/// expressions using the [`update_expr`] function.
+pub fn update_ordering_requirement(
+    reqs: LexRequirement,
+    projected_exprs: &[ProjectionExpr],
+) -> Result<Option<LexRequirement>> {
+    let mut updated_exprs = vec![];
+    for mut sort_expr in reqs.into_iter() {
+        let Some(updated_expr) = update_expr(&sort_expr.expr, projected_exprs, false)?
+        else {
+            return Ok(None);
+        };
+        sort_expr.expr = updated_expr;
+        updated_exprs.push(sort_expr);
+    }
+    Ok(LexRequirement::new(updated_exprs))
 }
 
 /// Downcasts all the expressions in `exprs` to `Column`s. If any of the given
 /// expressions is not a `Column`, returns `None`.
 pub fn physical_to_column_exprs(
-    exprs: &[(Arc<dyn PhysicalExpr>, String)],
+    exprs: &[ProjectionExpr],
 ) -> Option<Vec<(Column, String)>> {
     exprs
         .iter()
-        .map(|(expr, alias)| {
-            expr.as_any()
+        .map(|proj_expr| {
+            proj_expr
+                .expr
+                .as_any()
                 .downcast_ref::<Column>()
-                .map(|col| (col.clone(), alias.clone()))
+                .map(|col| (col.clone(), proj_expr.alias.clone()))
         })
         .collect()
 }
@@ -694,13 +878,10 @@ pub fn new_join_children(
     let new_left = ProjectionExec::try_new(
         projection_as_columns[0..=far_right_left_col_ind as _]
             .iter()
-            .map(|(col, alias)| {
-                (
-                    Arc::new(Column::new(col.name(), col.index())) as _,
-                    alias.clone(),
-                )
-            })
-            .collect_vec(),
+            .map(|(col, alias)| ProjectionExpr {
+                expr: Arc::new(Column::new(col.name(), col.index())) as _,
+                alias: alias.clone(),
+            }),
         Arc::clone(left_child),
     )?;
     let left_size = left_child.schema().fields().len() as i32;
@@ -708,17 +889,16 @@ pub fn new_join_children(
         projection_as_columns[far_left_right_col_ind as _..]
             .iter()
             .map(|(col, alias)| {
-                (
-                    Arc::new(Column::new(
+                ProjectionExpr {
+                    expr: Arc::new(Column::new(
                         col.name(),
                         // Align projected expressions coming from the right
                         // table with the new right child projection:
                         (col.index() as i32 - left_size) as _,
                     )) as _,
-                    alias.clone(),
-                )
-            })
-            .collect_vec(),
+                    alias: alias.clone(),
+                }
+            }),
         Arc::clone(right_child),
     )?;
 
@@ -786,10 +966,6 @@ pub fn update_join_on(
     hash_join_on: &[(PhysicalExprRef, PhysicalExprRef)],
     left_field_size: usize,
 ) -> Option<Vec<(PhysicalExprRef, PhysicalExprRef)>> {
-    // TODO: Clippy wants the "map" call removed, but doing so generates
-    //       a compilation error. Remove the clippy directive once this
-    //       issue is fixed.
-    #[allow(clippy::map_identity)]
     let (left_idx, right_idx): (Vec<_>, Vec<_>) = hash_join_on
         .iter()
         .map(|(left, right)| (left, right))
@@ -860,45 +1036,54 @@ fn try_unifying_projections(
     let mut column_ref_map: HashMap<Column, usize> = HashMap::new();
 
     // Collect the column references usage in the outer projection.
-    projection.expr().iter().for_each(|(expr, _)| {
-        expr.apply(|expr| {
-            Ok({
-                if let Some(column) = expr.as_any().downcast_ref::<Column>() {
-                    *column_ref_map.entry(column.clone()).or_default() += 1;
-                }
-                TreeNodeRecursion::Continue
+    projection.expr().iter().for_each(|proj_expr| {
+        proj_expr
+            .expr
+            .apply(|expr| {
+                Ok({
+                    if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                        *column_ref_map.entry(column.clone()).or_default() += 1;
+                    }
+                    TreeNodeRecursion::Continue
+                })
             })
-        })
-        .unwrap();
+            .unwrap();
     });
     // Merging these projections is not beneficial, e.g
-    // If an expression is not trivial and it is referred more than 1, unifies projections will be
+    // If an expression is not trivial (KeepInPlace) and it is referred more than 1, unifies projections will be
     // beneficial as caching mechanism for non-trivial computations.
     // See discussion in: https://github.com/apache/datafusion/issues/8296
     if column_ref_map.iter().any(|(column, count)| {
-        *count > 1 && !is_expr_trivial(&Arc::clone(&child.expr()[column.index()].0))
+        *count > 1
+            && !child.expr()[column.index()]
+                .expr
+                .placement()
+                .should_push_to_leaves()
     }) {
         return Ok(None);
     }
-    for (expr, alias) in projection.expr() {
+    for proj_expr in projection.expr() {
         // If there is no match in the input projection, we cannot unify these
         // projections. This case will arise if the projection expression contains
         // a `PhysicalExpr` variant `update_expr` doesn't support.
-        let Some(expr) = update_expr(expr, child.expr(), true)? else {
+        let Some(expr) = update_expr(&proj_expr.expr, child.expr(), true)? else {
             return Ok(None);
         };
-        projected_exprs.push((expr, alias.clone()));
+        projected_exprs.push(ProjectionExpr {
+            expr,
+            alias: proj_expr.alias.clone(),
+        });
     }
     ProjectionExec::try_new(projected_exprs, Arc::clone(child.input()))
         .map(|e| Some(Arc::new(e) as _))
 }
 
 /// Collect all column indices from the given projection expressions.
-fn collect_column_indices(exprs: &[(Arc<dyn PhysicalExpr>, String)]) -> Vec<usize> {
+fn collect_column_indices(exprs: &[ProjectionExpr]) -> Vec<usize> {
     // Collect indices and remove duplicates.
     let mut indices = exprs
         .iter()
-        .flat_map(|(expr, _)| collect_columns(expr))
+        .flat_map(|proj_expr| collect_columns(&proj_expr.expr))
         .map(|x| x.index())
         .collect::<std::collections::HashSet<_>>()
         .into_iter()
@@ -983,26 +1168,25 @@ fn new_columns_for_join_on(
     (new_columns.len() == hash_join_on.len()).then_some(new_columns)
 }
 
-/// Checks if the given expression is trivial.
-/// An expression is considered trivial if it is either a `Column` or a `Literal`.
-fn is_expr_trivial(expr: &Arc<dyn PhysicalExpr>) -> bool {
-    expr.as_any().downcast_ref::<Column>().is_some()
-        || expr.as_any().downcast_ref::<Literal>().is_some()
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
     use std::sync::Arc;
 
     use crate::common::collect;
+
+    use crate::filter_pushdown::PushedDown;
     use crate::test;
+    use crate::test::exec::StatisticsExec;
 
-    use arrow::datatypes::DataType;
+    use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::ScalarValue;
+    use datafusion_common::stats::{ColumnStatistics, Precision, Statistics};
 
     use datafusion_expr::Operator;
-    use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal};
+    use datafusion_physical_expr::expressions::{
+        BinaryExpr, Column, DynamicFilterPhysicalExpr, Literal, binary, col, lit,
+    };
 
     #[test]
     fn test_collect_column_indices() -> Result<()> {
@@ -1015,7 +1199,10 @@ mod tests {
                 Arc::new(Column::new("a", 1)),
             )),
         ));
-        let column_indices = collect_column_indices(&[(expr, "b-(1+a)".to_string())]);
+        let column_indices = collect_column_indices(&[ProjectionExpr {
+            expr,
+            alias: "b-(1+a)".to_string(),
+        }]);
         assert_eq!(column_indices, vec![1, 7]);
         Ok(())
     }
@@ -1082,7 +1269,7 @@ mod tests {
         let exec = test::scan_partitioned(1);
         let expected = collect(exec.execute(0, Arc::clone(&task_ctx))?).await?;
 
-        let projection = ProjectionExec::try_new(vec![], exec)?;
+        let projection = ProjectionExec::try_new(vec![] as Vec<ProjectionExpr>, exec)?;
         let stream = projection.execute(0, Arc::clone(&task_ctx))?;
         let output = collect(stream).await?;
         assert_eq!(output.len(), expected.len());
@@ -1090,111 +1277,529 @@ mod tests {
         Ok(())
     }
 
-    fn get_stats() -> Statistics {
-        Statistics {
-            num_rows: Precision::Exact(5),
-            total_byte_size: Precision::Exact(23),
+    #[tokio::test]
+    async fn project_old_syntax() {
+        let exec = test::scan_partitioned(1);
+        let schema = exec.schema();
+        let expr = col("i", &schema).unwrap();
+        ProjectionExec::try_new(
+            vec![
+                // use From impl of ProjectionExpr to create ProjectionExpr
+                // to test old syntax
+                (expr, "c".to_string()),
+            ],
+            exec,
+        )
+        // expect this to succeed
+        .unwrap();
+    }
+
+    #[test]
+    fn test_projection_statistics_uses_input_schema() {
+        let input_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+            Field::new("d", DataType::Int32, false),
+            Field::new("e", DataType::Int32, false),
+            Field::new("f", DataType::Int32, false),
+        ]);
+
+        let input_statistics = Statistics {
+            num_rows: Precision::Exact(10),
             column_statistics: vec![
                 ColumnStatistics {
-                    distinct_count: Precision::Exact(5),
-                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
-                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
-                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
-                    null_count: Precision::Exact(0),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                    ..Default::default()
+                },
+                ColumnStatistics {
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(50))),
+                    ..Default::default()
+                },
+                ColumnStatistics {
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(40))),
+                    ..Default::default()
+                },
+                ColumnStatistics {
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(20))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(30))),
+                    ..Default::default()
                 },
                 ColumnStatistics {
-                    distinct_count: Precision::Exact(1),
-                    max_value: Precision::Exact(ScalarValue::from("x")),
-                    min_value: Precision::Exact(ScalarValue::from("a")),
-                    sum_value: Precision::Absent,
-                    null_count: Precision::Exact(3),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(21))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(29))),
+                    ..Default::default()
                 },
                 ColumnStatistics {
-                    distinct_count: Precision::Absent,
-                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
-                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
-                    sum_value: Precision::Exact(ScalarValue::Float32(Some(5.5))),
-                    null_count: Precision::Absent,
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(24))),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(26))),
+                    ..Default::default()
                 },
             ],
-        }
-    }
+            ..Default::default()
+        };
 
-    fn get_schema() -> Schema {
-        let field_0 = Field::new("col0", DataType::Int64, false);
-        let field_1 = Field::new("col1", DataType::Utf8, false);
-        let field_2 = Field::new("col2", DataType::Float32, false);
-        Schema::new(vec![field_0, field_1, field_2])
+        let input = Arc::new(StatisticsExec::new(input_statistics, input_schema));
+
+        // Create projection expressions that reference columns from the input schema and the length
+        // of output schema columns < input schema columns and hence if we use the last few columns
+        // from the input schema in the expressions here, bounds_check would fail on them if output
+        // schema is supplied to the partitions_statistics method.
+        let exprs: Vec<ProjectionExpr> = vec![
+            ProjectionExpr {
+                expr: Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>,
+                alias: "c_renamed".to_string(),
+            },
+            ProjectionExpr {
+                expr: Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("e", 4)),
+                    Operator::Plus,
+                    Arc::new(Column::new("f", 5)),
+                )) as Arc<dyn PhysicalExpr>,
+                alias: "e_plus_f".to_string(),
+            },
+        ];
+
+        let projection = ProjectionExec::try_new(exprs, input).unwrap();
+
+        let stats = projection.partition_statistics(None).unwrap();
+
+        assert_eq!(stats.num_rows, Precision::Exact(10));
+        assert_eq!(
+            stats.column_statistics.len(),
+            2,
+            "Expected 2 columns in projection statistics"
+        );
+        assert!(stats.total_byte_size.is_exact().unwrap_or(false));
     }
-    #[tokio::test]
-    async fn test_stats_projection_columns_only() {
-        let source = get_stats();
-        let schema = get_schema();
 
-        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
-            Arc::new(Column::new("col1", 1)),
-            Arc::new(Column::new("col0", 0)),
-        ];
+    #[test]
+    fn test_filter_pushdown_with_alias() -> Result<()> {
+        let input_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics::new_unknown(&input_schema),
+            input_schema.clone(),
+        ));
 
-        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));
+        // project "a" as "b"
+        let projection = ProjectionExec::try_new(
+            vec![ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "b".to_string(),
+            }],
+            input,
+        )?;
+
+        // filter "b > 5"
+        let filter = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("b", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter],
+            &ConfigOptions::default(),
+        )?;
+
+        // Should be converted to "a > 5"
+        // "a" is index 0 in input
+        let expected_filter = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        assert_eq!(description.self_filters(), vec![vec![]]);
+        let pushed_filters = &description.parent_filters()[0];
+        assert_eq!(
+            format!("{}", pushed_filters[0].predicate),
+            format!("{}", expected_filter)
+        );
+        // Verify the predicate was actually pushed down
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::Yes));
 
-        let expected = Statistics {
-            num_rows: Precision::Exact(5),
-            total_byte_size: Precision::Exact(23),
-            column_statistics: vec![
-                ColumnStatistics {
-                    distinct_count: Precision::Exact(1),
-                    max_value: Precision::Exact(ScalarValue::from("x")),
-                    min_value: Precision::Exact(ScalarValue::from("a")),
-                    sum_value: Precision::Absent,
-                    null_count: Precision::Exact(3),
+        Ok(())
+    }
+
+    #[test]
+    fn test_filter_pushdown_with_multiple_aliases() -> Result<()> {
+        let input_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.clone(),
+        ));
+
+        // project "a" as "x", "b" as "y"
+        let projection = ProjectionExec::try_new(
+            vec![
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("a", 0)),
+                    alias: "x".to_string(),
                 },
-                ColumnStatistics {
-                    distinct_count: Precision::Exact(5),
-                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
-                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
-                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
-                    null_count: Precision::Exact(0),
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("b", 1)),
+                    alias: "y".to_string(),
                 },
             ],
-        };
+            input,
+        )?;
+
+        // filter "x > 5"
+        let filter1 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("x", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        // filter "y < 10"
+        let filter2 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("y", 1)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter1, filter2],
+            &ConfigOptions::default(),
+        )?;
+
+        // Should be converted to "a > 5" and "b < 10"
+        let expected_filter1 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let expected_filter2 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("b", 1)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let pushed_filters = &description.parent_filters()[0];
+        assert_eq!(pushed_filters.len(), 2);
+        // Note: The order of filters is preserved
+        assert_eq!(
+            format!("{}", pushed_filters[0].predicate),
+            format!("{}", expected_filter1)
+        );
+        assert_eq!(
+            format!("{}", pushed_filters[1].predicate),
+            format!("{}", expected_filter2)
+        );
+        // Verify the predicates were actually pushed down
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::Yes));
+        assert!(matches!(pushed_filters[1].discriminant, PushedDown::Yes));
 
-        assert_eq!(result, expected);
+        Ok(())
     }
 
-    #[tokio::test]
-    async fn test_stats_projection_column_with_primitive_width_only() {
-        let source = get_stats();
-        let schema = get_schema();
+    #[test]
+    fn test_filter_pushdown_with_swapped_aliases() -> Result<()> {
+        let input_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.clone(),
+        ));
 
-        let exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
-            Arc::new(Column::new("col2", 2)),
-            Arc::new(Column::new("col0", 0)),
-        ];
+        // project "a" as "b", "b" as "a"
+        let projection = ProjectionExec::try_new(
+            vec![
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("a", 0)),
+                    alias: "b".to_string(),
+                },
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("b", 1)),
+                    alias: "a".to_string(),
+                },
+            ],
+            input,
+        )?;
+
+        // filter "b > 5" (output column 0, which is "a" in input)
+        let filter1 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("b", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        // filter "a < 10" (output column 1, which is "b" in input)
+        let filter2 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 1)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter1, filter2],
+            &ConfigOptions::default(),
+        )?;
+
+        let pushed_filters = &description.parent_filters()[0];
+        assert_eq!(pushed_filters.len(), 2);
+
+        // "b" (output index 0) -> "a" (input index 0)
+        let expected_filter1 = "a@0 > 5";
+        // "a" (output index 1) -> "b" (input index 1)
+        let expected_filter2 = "b@1 < 10";
+
+        assert_eq!(format!("{}", pushed_filters[0].predicate), expected_filter1);
+        assert_eq!(format!("{}", pushed_filters[1].predicate), expected_filter2);
+        // Verify the predicates were actually pushed down
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::Yes));
+        assert!(matches!(pushed_filters[1].discriminant, PushedDown::Yes));
 
-        let result = stats_projection(source, exprs.into_iter(), Arc::new(schema));
+        Ok(())
+    }
 
-        let expected = Statistics {
-            num_rows: Precision::Exact(5),
-            total_byte_size: Precision::Exact(60),
-            column_statistics: vec![
-                ColumnStatistics {
-                    distinct_count: Precision::Absent,
-                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
-                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
-                    sum_value: Precision::Exact(ScalarValue::Float32(Some(5.5))),
-                    null_count: Precision::Absent,
+    #[test]
+    fn test_filter_pushdown_with_mixed_columns() -> Result<()> {
+        let input_schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.clone(),
+        ));
+
+        // project "a" as "x", "b" as "b" (pass through)
+        let projection = ProjectionExec::try_new(
+            vec![
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("a", 0)),
+                    alias: "x".to_string(),
                 },
-                ColumnStatistics {
-                    distinct_count: Precision::Exact(5),
-                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
-                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
-                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
-                    null_count: Precision::Exact(0),
+                ProjectionExpr {
+                    expr: Arc::new(Column::new("b", 1)),
+                    alias: "b".to_string(),
                 },
             ],
-        };
+            input,
+        )?;
+
+        // filter "x > 5"
+        let filter1 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("x", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        // filter "b < 10" (using output index 1 which corresponds to 'b')
+        let filter2 = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("b", 1)),
+            Operator::Lt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter1, filter2],
+            &ConfigOptions::default(),
+        )?;
+
+        let pushed_filters = &description.parent_filters()[0];
+        assert_eq!(pushed_filters.len(), 2);
+        // "x" -> "a" (index 0)
+        let expected_filter1 = "a@0 > 5";
+        // "b" -> "b" (index 1)
+        let expected_filter2 = "b@1 < 10";
+
+        assert_eq!(format!("{}", pushed_filters[0].predicate), expected_filter1);
+        assert_eq!(format!("{}", pushed_filters[1].predicate), expected_filter2);
+        // Verify the predicates were actually pushed down
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::Yes));
+        assert!(matches!(pushed_filters[1].discriminant, PushedDown::Yes));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_filter_pushdown_with_complex_expression() -> Result<()> {
+        let input_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.clone(),
+        ));
+
+        // project "a + 1" as "z"
+        let projection = ProjectionExec::try_new(
+            vec![ProjectionExpr {
+                expr: Arc::new(BinaryExpr::new(
+                    Arc::new(Column::new("a", 0)),
+                    Operator::Plus,
+                    Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
+                )),
+                alias: "z".to_string(),
+            }],
+            input,
+        )?;
+
+        // filter "z > 10"
+        let filter = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("z", 0)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter],
+            &ConfigOptions::default(),
+        )?;
+
+        // expand to `a + 1 > 10`
+        let pushed_filters = &description.parent_filters()[0];
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::Yes));
+        assert_eq!(format!("{}", pushed_filters[0].predicate), "a@0 + 1 > 10");
 
-        assert_eq!(result, expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_filter_pushdown_with_unknown_column() -> Result<()> {
+        let input_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.clone(),
+        ));
+
+        // project "a" as "a"
+        let projection = ProjectionExec::try_new(
+            vec![ProjectionExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                alias: "a".to_string(),
+            }],
+            input,
+        )?;
+
+        // filter "unknown_col > 5" - using a column name that doesn't exist in projection output
+        // Column constructor: name, index. Index 1 doesn't exist.
+        let filter = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("unknown_col", 1)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
+        )) as Arc<dyn PhysicalExpr>;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![filter],
+            &ConfigOptions::default(),
+        )?;
+
+        let pushed_filters = &description.parent_filters()[0];
+        assert!(matches!(pushed_filters[0].discriminant, PushedDown::No));
+        // The column shouldn't be found in the alias map, so it remains unchanged with its index
+        assert_eq!(
+            format!("{}", pushed_filters[0].predicate),
+            "unknown_col@1 > 5"
+        );
+
+        Ok(())
+    }
+
+    /// Basic test for `DynamicFilterPhysicalExpr` can correctly update its child expression
+    /// i.e. starting with lit(true) and after update it becomes `a > 5`
+    /// with projection [b - 1 as a], the pushed down filter should be `b - 1 > 5`
+    #[test]
+    fn test_basic_dyn_filter_projection_pushdown_update_child() -> Result<()> {
+        let input_schema =
+            Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, false)]));
+
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                column_statistics: vec![Default::default(); input_schema.fields().len()],
+                ..Default::default()
+            },
+            input_schema.as_ref().clone(),
+        ));
+
+        // project "b" - 1 as "a"
+        let projection = ProjectionExec::try_new(
+            vec![ProjectionExpr {
+                expr: binary(
+                    Arc::new(Column::new("b", 0)),
+                    Operator::Minus,
+                    lit(1),
+                    &input_schema,
+                )
+                .unwrap(),
+                alias: "a".to_string(),
+            }],
+            input,
+        )?;
+
+        // simulate projection's parent create a dynamic filter on "a"
+        let projected_schema = projection.schema();
+        let col_a = col("a", &projected_schema)?;
+        let dynamic_filter = Arc::new(DynamicFilterPhysicalExpr::new(
+            vec![Arc::clone(&col_a)],
+            lit(true),
+        ));
+        // Initial state should be lit(true)
+        let current = dynamic_filter.current()?;
+        assert_eq!(format!("{current}"), "true");
+
+        let dyn_phy_expr: Arc<dyn PhysicalExpr> = Arc::clone(&dynamic_filter) as _;
+
+        let description = projection.gather_filters_for_pushdown(
+            FilterPushdownPhase::Post,
+            vec![dyn_phy_expr],
+            &ConfigOptions::default(),
+        )?;
+
+        let pushed_filters = &description.parent_filters()[0][0];
+
+        // Check currently pushed_filters is lit(true)
+        assert_eq!(
+            format!("{}", pushed_filters.predicate),
+            "DynamicFilter [ empty ]"
+        );
+
+        // Update to a > 5 (after projection, b is now called a)
+        let new_expr =
+            Arc::new(BinaryExpr::new(Arc::clone(&col_a), Operator::Gt, lit(5i32)));
+        dynamic_filter.update(new_expr)?;
+
+        // Now it should be a > 5
+        let current = dynamic_filter.current()?;
+        assert_eq!(format!("{current}"), "a@0 > 5");
+
+        // Check currently pushed_filters is b - 1 > 5 (because b - 1 is projected as a)
+        assert_eq!(
+            format!("{}", pushed_filters.predicate),
+            "DynamicFilter [ b@0 - 1 > 5 ]"
+        );
+
+        Ok(())
     }
 }
diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs
index 210db90c3c7fe..049aa9563d52e 100644
--- a/datafusion/physical-plan/src/recursive_query.rs
+++ b/datafusion/physical-plan/src/recursive_query.rs
@@ -21,23 +21,30 @@ use std::any::Any;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use super::work_table::{ReservedBatches, WorkTable, WorkTableExec};
-use crate::execution_plan::{Boundedness, EmissionType};
+use super::work_table::{ReservedBatches, WorkTable};
+use crate::aggregates::group_values::{GroupValues, new_group_values};
+use crate::aggregates::order::GroupOrdering;
+use crate::execution_plan::{Boundedness, EmissionType, reset_plan_states};
+use crate::metrics::{
+    BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput,
+};
 use crate::{
-    metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
-    PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, RecordBatchStream,
+    SendableRecordBatchStream,
 };
-use crate::{DisplayAs, DisplayFormatType, ExecutionPlan};
-
+use arrow::array::{BooleanArray, BooleanBuilder};
+use arrow::compute::filter_record_batch;
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::{not_impl_err, DataFusionError, Result};
-use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_common::{Result, internal_datafusion_err, not_impl_err};
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
+use datafusion_physical_expr::PhysicalExpr;
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
 
-use futures::{ready, Stream, StreamExt};
+use futures::{Stream, StreamExt, ready};
 
 /// Recursive query execution plan.
 ///
@@ -69,7 +76,7 @@ pub struct RecursiveQueryExec {
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl RecursiveQueryExec {
@@ -81,9 +88,9 @@ impl RecursiveQueryExec {
         is_distinct: bool,
     ) -> Result<Self> {
         // Each recursive query needs its own work table
-        let work_table = Arc::new(WorkTable::new());
+        let work_table = Arc::new(WorkTable::new(name.clone()));
         // Use the same work table for both the WorkTableExec and the recursive term
-        let recursive_term = assign_work_table(recursive_term, Arc::clone(&work_table))?;
+        let recursive_term = assign_work_table(recursive_term, &work_table)?;
         let cache = Self::compute_properties(static_term.schema());
         Ok(RecursiveQueryExec {
             name,
@@ -92,7 +99,7 @@ impl RecursiveQueryExec {
             is_distinct,
             work_table,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
         })
     }
 
@@ -138,7 +145,7 @@ impl ExecutionPlan for RecursiveQueryExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -146,6 +153,13 @@ impl ExecutionPlan for RecursiveQueryExec {
         vec![&self.static_term, &self.recursive_term]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     // TODO: control these hints and see whether we can
     // infer some from the child plans (static/recursive terms).
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -183,9 +197,9 @@ impl ExecutionPlan for RecursiveQueryExec {
     ) -> Result<SendableRecordBatchStream> {
         // TODO: we might be able to handle multiple partitions in the future.
         if partition != 0 {
-            return Err(DataFusionError::Internal(format!(
+            return Err(internal_datafusion_err!(
                 "RecursiveQueryExec got an invalid partition {partition} (expected 0)"
-            )));
+            ));
         }
 
         let static_stream = self.static_term.execute(partition, Arc::clone(&context))?;
@@ -195,17 +209,14 @@ impl ExecutionPlan for RecursiveQueryExec {
             Arc::clone(&self.work_table),
             Arc::clone(&self.recursive_term),
             static_stream,
+            self.is_distinct,
             baseline_metrics,
-        )))
+        )?))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.metrics.clone_inner())
     }
-
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&self.schema()))
-    }
 }
 
 impl DisplayAs for RecursiveQueryExec {
@@ -247,7 +258,6 @@ impl DisplayAs for RecursiveQueryExec {
 ///    while batch := recursive_stream.next():
 ///        buffer.append(batch)
 ///        yield buffer
-///
 struct RecursiveQueryStream {
     /// The context to be used for managing handlers & executing new tasks
     task_context: Arc<TaskContext>,
@@ -268,8 +278,10 @@ struct RecursiveQueryStream {
     buffer: Vec<RecordBatch>,
     /// Tracks the memory used by the buffer
     reservation: MemoryReservation,
-    // /// Metrics.
-    _baseline_metrics: BaselineMetrics,
+    /// If the distinct flag is set, then we use this hash table to remove duplicates from result and work tables
+    distinct_deduplicator: Option<DistinctDeduplicator>,
+    /// Metrics.
+    baseline_metrics: BaselineMetrics,
 }
 
 impl RecursiveQueryStream {
@@ -279,12 +291,16 @@ impl RecursiveQueryStream {
         work_table: Arc<WorkTable>,
         recursive_term: Arc<dyn ExecutionPlan>,
         static_stream: SendableRecordBatchStream,
+        is_distinct: bool,
         baseline_metrics: BaselineMetrics,
-    ) -> Self {
+    ) -> Result<Self> {
         let schema = static_stream.schema();
         let reservation =
             MemoryConsumer::new("RecursiveQuery").register(task_context.memory_pool());
-        Self {
+        let distinct_deduplicator = is_distinct
+            .then(|| DistinctDeduplicator::new(Arc::clone(&schema), &task_context))
+            .transpose()?;
+        Ok(Self {
             task_context,
             work_table,
             recursive_term,
@@ -293,21 +309,28 @@ impl RecursiveQueryStream {
             schema,
             buffer: vec![],
             reservation,
-            _baseline_metrics: baseline_metrics,
-        }
+            distinct_deduplicator,
+            baseline_metrics,
+        })
     }
 
     /// Push a clone of the given batch to the in memory buffer, and then return
     /// a poll with it.
     fn push_batch(
         mut self: std::pin::Pin<&mut Self>,
-        batch: RecordBatch,
+        mut batch: RecordBatch,
     ) -> Poll<Option<Result<RecordBatch>>> {
+        let baseline_metrics = self.baseline_metrics.clone();
+        if let Some(deduplicator) = &mut self.distinct_deduplicator {
+            let _timer_guard = baseline_metrics.elapsed_compute().timer();
+            batch = deduplicator.deduplicate(&batch)?;
+        }
+
         if let Err(e) = self.reservation.try_grow(batch.get_array_memory_size()) {
             return Poll::Ready(Some(Err(e)));
         }
-
         self.buffer.push(batch.clone());
+        (&batch).record_output(&baseline_metrics);
         Poll::Ready(Some(Ok(batch)))
     }
 
@@ -347,23 +370,21 @@ impl RecursiveQueryStream {
 
 fn assign_work_table(
     plan: Arc<dyn ExecutionPlan>,
-    work_table: Arc<WorkTable>,
+    work_table: &Arc<WorkTable>,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let mut work_table_refs = 0;
     plan.transform_down(|plan| {
-        if let Some(exec) = plan.as_any().downcast_ref::<WorkTableExec>() {
+        if let Some(new_plan) =
+            plan.with_new_state(Arc::clone(work_table) as Arc<dyn Any + Send + Sync>)
+        {
             if work_table_refs > 0 {
                 not_impl_err!(
                     "Multiple recursive references to the same CTE are not supported"
                 )
             } else {
                 work_table_refs += 1;
-                Ok(Transformed::yes(Arc::new(
-                    exec.with_work_table(Arc::clone(&work_table)),
-                )))
+                Ok(Transformed::yes(new_plan))
             }
-        } else if plan.as_any().is::<RecursiveQueryExec>() {
-            not_impl_err!("Recursive queries cannot be nested")
         } else {
             Ok(Transformed::no(plan))
         }
@@ -371,26 +392,6 @@ fn assign_work_table(
     .data()
 }
 
-/// Some plans will change their internal states after execution, making them unable to be executed again.
-/// This function uses `ExecutionPlan::with_new_children` to fork a new plan with initial states.
-///
-/// An example is `CrossJoinExec`, which loads the left table into memory and stores it in the plan.
-/// However, if the data of the left table is derived from the work table, it will become outdated
-/// as the work table changes. When the next iteration executes this plan again, we must clear the left table.
-fn reset_plan_states(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
-    plan.transform_up(|plan| {
-        // WorkTableExec's states have already been updated correctly.
-        if plan.as_any().is::<WorkTableExec>() {
-            Ok(Transformed::no(plan))
-        } else {
-            let new_plan = Arc::clone(&plan)
-                .with_new_children(plan.children().into_iter().cloned().collect())?;
-            Ok(Transformed::yes(new_plan))
-        }
-    })
-    .data()
-}
-
 impl Stream for RecursiveQueryStream {
     type Item = Result<RecordBatch>;
 
@@ -398,7 +399,6 @@ impl Stream for RecursiveQueryStream {
         mut self: std::pin::Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
-        // TODO: we should use this poll to record some metrics!
         if let Some(static_stream) = &mut self.static_stream {
             // While the static term's stream is available, we'll be forwarding the batches from it (also
             // saving them for the initial iteration of the recursive term).
@@ -435,5 +435,61 @@ impl RecordBatchStream for RecursiveQueryStream {
     }
 }
 
+/// Deduplicator based on a hash table.
+struct DistinctDeduplicator {
+    /// Grouped rows used for distinct
+    group_values: Box<dyn GroupValues>,
+    reservation: MemoryReservation,
+    intern_output_buffer: Vec<usize>,
+}
+
+impl DistinctDeduplicator {
+    fn new(schema: SchemaRef, task_context: &TaskContext) -> Result<Self> {
+        let group_values = new_group_values(schema, &GroupOrdering::None)?;
+        let reservation = MemoryConsumer::new("RecursiveQueryHashTable")
+            .register(task_context.memory_pool());
+        Ok(Self {
+            group_values,
+            reservation,
+            intern_output_buffer: Vec::new(),
+        })
+    }
+
+    /// Remove duplicated rows from the given batch, keeping a state between batches.
+    ///
+    /// We use a hash table to allocate new group ids for the new rows.
+    /// [`GroupValues`] allocate increasing group ids.
+    /// Hence, if groups (i.e., rows) are new, then they have ids >= length before interning, we keep them.
+    /// We also detect duplicates by enforcing that group ids are increasing.
+    fn deduplicate(&mut self, batch: &RecordBatch) -> Result<RecordBatch> {
+        let size_before = self.group_values.len();
+        self.intern_output_buffer.reserve(batch.num_rows());
+        self.group_values
+            .intern(batch.columns(), &mut self.intern_output_buffer)?;
+        let mask = new_groups_mask(&self.intern_output_buffer, size_before);
+        self.intern_output_buffer.clear();
+        // We update the reservation to reflect the new size of the hash table.
+        self.reservation.try_resize(self.group_values.size())?;
+        Ok(filter_record_batch(batch, &mask)?)
+    }
+}
+
+/// Return a mask, each element being true if, and only if, the element is greater than all previous elements and greater or equal than the provided max_already_seen_group_id
+fn new_groups_mask(
+    values: &[usize],
+    mut max_already_seen_group_id: usize,
+) -> BooleanArray {
+    let mut output = BooleanBuilder::with_capacity(values.len());
+    for value in values {
+        if *value >= max_already_seen_group_id {
+            output.append_value(true);
+            max_already_seen_group_id = *value + 1; // We want to be increasing
+        } else {
+            output.append_value(false);
+        }
+    }
+    output.finish()
+}
+
 #[cfg(test)]
 mod tests {}
diff --git a/datafusion/physical-plan/src/render_tree.rs b/datafusion/physical-plan/src/render_tree.rs
index f86e4c55e7b0e..40e2763698093 100644
--- a/datafusion/physical-plan/src/render_tree.rs
+++ b/datafusion/physical-plan/src/render_tree.rs
@@ -31,11 +31,12 @@ use crate::{DisplayFormatType, ExecutionPlan};
 // TODO: It's never used.
 /// Represents a 2D coordinate in the rendered tree.
 /// Used to track positions of nodes and their connections.
-#[allow(dead_code)]
 pub struct Coordinate {
     /// Horizontal position in the tree
+    #[expect(dead_code)]
     pub x: usize,
     /// Vertical position in the tree
+    #[expect(dead_code)]
     pub y: usize,
 }
 
diff --git a/datafusion/physical-plan/src/repartition/distributor_channels.rs b/datafusion/physical-plan/src/repartition/distributor_channels.rs
index 6e06c87a48213..22872d1e32d49 100644
--- a/datafusion/physical-plan/src/repartition/distributor_channels.rs
+++ b/datafusion/physical-plan/src/repartition/distributor_channels.rs
@@ -43,8 +43,8 @@ use std::{
     ops::DerefMut,
     pin::Pin,
     sync::{
-        atomic::{AtomicUsize, Ordering},
         Arc,
+        atomic::{AtomicUsize, Ordering},
     },
     task::{Context, Poll, Waker},
 };
@@ -151,7 +151,7 @@ impl<T> Clone for DistributionSender<T> {
 impl<T> Drop for DistributionSender<T> {
     fn drop(&mut self) {
         let n_senders_pre = self.channel.n_senders.fetch_sub(1, Ordering::SeqCst);
-        // is the the last copy of the sender side?
+        // is the last copy of the sender side?
         if n_senders_pre > 1 {
             return;
         }
@@ -476,7 +476,7 @@ type SharedGate = Arc<Gate>;
 mod tests {
     use std::sync::atomic::AtomicBool;
 
-    use futures::{task::ArcWake, FutureExt};
+    use futures::{FutureExt, task::ArcWake};
 
     use super::*;
 
diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs
index d0ad50666416c..342b2f50357c5 100644
--- a/datafusion/physical-plan/src/repartition/mod.rs
+++ b/datafusion/physical-plan/src/repartition/mod.rs
@@ -30,65 +30,186 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
 use super::{
     DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream,
 };
-use crate::execution_plan::CardinalityEffect;
+use crate::coalesce::LimitedBatchCoalescer;
+use crate::execution_plan::{CardinalityEffect, EvaluationType, SchedulingType};
 use crate::hash_utils::create_hashes;
-use crate::metrics::BaselineMetrics;
-use crate::projection::{all_columns, make_with_child, update_expr, ProjectionExec};
-use crate::repartition::distributor_channels::{
-    channels, partition_aware_channels, DistributionReceiver, DistributionSender,
-};
+use crate::metrics::{BaselineMetrics, SpillMetrics};
+use crate::projection::{ProjectionExec, all_columns, make_with_child, update_expr};
 use crate::sorts::streaming_merge::StreamingMergeBuilder;
+use crate::spill::spill_manager::SpillManager;
+use crate::spill::spill_pool::{self, SpillPoolWriter};
 use crate::stream::RecordBatchStreamAdapter;
-use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics};
+use crate::{
+    DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics,
+    check_if_same_properties,
+};
 
 use arrow::array::{PrimitiveArray, RecordBatch, RecordBatchOptions};
 use arrow::compute::take_arrays;
 use arrow::datatypes::{SchemaRef, UInt32Type};
 use datafusion_common::config::ConfigOptions;
+use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::utils::transpose;
-use datafusion_common::{internal_err, HashMap};
-use datafusion_common::{not_impl_err, DataFusionError, Result};
+use datafusion_common::{
+    ColumnStatistics, DataFusionError, HashMap, assert_or_internal_err,
+    internal_datafusion_err, internal_err,
+};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_common_runtime::SpawnedTask;
-use datafusion_execution::memory_pool::MemoryConsumer;
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::MemoryConsumer;
 use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 use crate::filter_pushdown::{
-    ChildPushdownResult, FilterDescription, FilterPushdownPropagation,
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation,
 };
+use crate::joins::SeededRandomState;
+use crate::sort_pushdown::SortOrderPushdownResult;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use futures::stream::Stream;
-use futures::{FutureExt, StreamExt, TryStreamExt};
+use futures::{FutureExt, StreamExt, TryStreamExt, ready};
 use log::trace;
 use parking_lot::Mutex;
 
 mod distributor_channels;
+use distributor_channels::{
+    DistributionReceiver, DistributionSender, channels, partition_aware_channels,
+};
+
+/// A batch in the repartition queue - either in memory or spilled to disk.
+///
+/// This enum represents the two states a batch can be in during repartitioning.
+/// The decision to spill is made based on memory availability when sending a batch
+/// to an output partition.
+///
+/// # Batch Flow with Spilling
+///
+/// ```text
+/// Input Stream ──▶ Partition Logic ──▶ try_grow()
+///                                            │
+///                            ┌───────────────┴────────────────┐
+///                            │                                │
+///                            ▼                                ▼
+///                   try_grow() succeeds            try_grow() fails
+///                   (Memory Available)              (Memory Pressure)
+///                            │                                │
+///                            ▼                                ▼
+///                  RepartitionBatch::Memory         spill_writer.push_batch()
+///                  (batch held in memory)           (batch written to disk)
+///                            │                                │
+///                            │                                ▼
+///                            │                      RepartitionBatch::Spilled
+///                            │                      (marker - no batch data)
+///                            │                                │
+///                            └────────┬───────────────────────┘
+///                                     │
+///                                     ▼
+///                              Send to channel
+///                                     │
+///                                     ▼
+///                            Output Stream (poll)
+///                                     │
+///                      ┌──────────────┴─────────────┐
+///                      │                            │
+///                      ▼                            ▼
+///         RepartitionBatch::Memory      RepartitionBatch::Spilled
+///         Return batch immediately       Poll spill_stream (blocks)
+///                      │                            │
+///                      └────────┬───────────────────┘
+///                               │
+///                               ▼
+///                          Return batch
+///                    (FIFO order preserved)
+/// ```
+///
+/// See [`RepartitionExec`] for overall architecture and [`StreamState`] for
+/// the state machine that handles reading these batches.
+#[derive(Debug)]
+enum RepartitionBatch {
+    /// Batch held in memory (counts against memory reservation)
+    Memory(RecordBatch),
+    /// Marker indicating a batch was spilled to the partition's SpillPool.
+    /// The actual batch can be retrieved by reading from the SpillPoolStream.
+    /// This variant contains no data itself - it's just a signal to the reader
+    /// to fetch the next batch from the spill stream.
+    Spilled,
+}
 
-type MaybeBatch = Option<Result<RecordBatch>>;
+type MaybeBatch = Option<Result<RepartitionBatch>>;
 type InputPartitionsToCurrentPartitionSender = Vec<DistributionSender<MaybeBatch>>;
 type InputPartitionsToCurrentPartitionReceiver = Vec<DistributionReceiver<MaybeBatch>>;
 
-#[derive(Debug)]
+/// Output channel with its associated memory reservation and spill writer
+struct OutputChannel {
+    sender: DistributionSender<MaybeBatch>,
+    reservation: SharedMemoryReservation,
+    spill_writer: SpillPoolWriter,
+}
+
+/// Channels and resources for a single output partition.
+///
+/// Each output partition has channels to receive data from all input partitions.
+/// To handle memory pressure, each (input, output) pair gets its own
+/// [`SpillPool`](crate::spill::spill_pool) channel via [`spill_pool::channel`].
+///
+/// # Structure
+///
+/// For an output partition receiving from N input partitions:
+/// - `tx`: N senders (one per input) for sending batches to this output
+/// - `rx`: N receivers (one per input) for receiving batches at this output
+/// - `spill_writers`: N spill writers (one per input) for writing spilled data
+/// - `spill_readers`: N spill readers (one per input) for reading spilled data
+///
+/// This 1:1 mapping between input partitions and spill channels ensures that
+/// batches from each input are processed in FIFO order, even when some batches
+/// are spilled to disk and others remain in memory.
+///
+/// See [`RepartitionExec`] for the overall N×M architecture.
+///
+/// [`spill_pool::channel`]: crate::spill::spill_pool::channel
+struct PartitionChannels {
+    /// Senders for each input partition to send data to this output partition
+    tx: InputPartitionsToCurrentPartitionSender,
+    /// Receivers for each input partition sending data to this output partition
+    rx: InputPartitionsToCurrentPartitionReceiver,
+    /// Memory reservation for this output partition
+    reservation: SharedMemoryReservation,
+    /// Spill writers for writing spilled data.
+    /// SpillPoolWriter is Clone, so multiple writers can share state in non-preserve-order mode.
+    spill_writers: Vec<SpillPoolWriter>,
+    /// Spill readers for reading spilled data - one per input partition (FIFO semantics).
+    /// Each (input, output) pair gets its own reader to maintain proper ordering.
+    spill_readers: Vec<SendableRecordBatchStream>,
+}
+
 struct ConsumingInputStreamsState {
     /// Channels for sending batches from input partitions to output partitions.
     /// Key is the partition number.
-    channels: HashMap<
-        usize,
-        (
-            InputPartitionsToCurrentPartitionSender,
-            InputPartitionsToCurrentPartitionReceiver,
-            SharedMemoryReservation,
-        ),
-    >,
-
-    /// Helper that ensures that that background job is killed once it is no longer needed.
+    channels: HashMap<usize, PartitionChannels>,
+
+    /// Helper that ensures that background jobs are killed once they are no longer needed.
     abort_helper: Arc<Vec<SpawnedTask<()>>>,
 }
 
+impl Debug for ConsumingInputStreamsState {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ConsumingInputStreamsState")
+            .field("num_channels", &self.channels.len())
+            .field("abort_helper", &self.abort_helper)
+            .finish()
+    }
+}
+
 /// Inner state of [`RepartitionExec`].
+#[derive(Default)]
 enum RepartitionExecState {
     /// Not initialized yet. This is the default state stored in the RepartitionExec node
     /// upon instantiation.
+    #[default]
     NotInitialized,
     /// Input streams are initialized, but they are still not being consumed. The node
     /// transitions to this state when the arrow's RecordBatch stream is created in
@@ -99,12 +220,6 @@ enum RepartitionExecState {
     ConsumingInputStreams(ConsumingInputStreamsState),
 }
 
-impl Default for RepartitionExecState {
-    fn default() -> Self {
-        Self::NotInitialized
-    }
-}
-
 impl Debug for RepartitionExecState {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         match self {
@@ -122,10 +237,10 @@ impl Debug for RepartitionExecState {
 impl RepartitionExecState {
     fn ensure_input_streams_initialized(
         &mut self,
-        input: Arc<dyn ExecutionPlan>,
-        metrics: ExecutionPlanMetricsSet,
+        input: &Arc<dyn ExecutionPlan>,
+        metrics: &ExecutionPlanMetricsSet,
         output_partitions: usize,
-        ctx: Arc<TaskContext>,
+        ctx: &Arc<TaskContext>,
     ) -> Result<()> {
         if !matches!(self, RepartitionExecState::NotInitialized) {
             return Ok(());
@@ -135,10 +250,10 @@ impl RepartitionExecState {
         let mut streams_and_metrics = Vec::with_capacity(num_input_partitions);
 
         for i in 0..num_input_partitions {
-            let metrics = RepartitionMetrics::new(i, output_partitions, &metrics);
+            let metrics = RepartitionMetrics::new(i, output_partitions, metrics);
 
             let timer = metrics.fetch_time.timer();
-            let stream = input.execute(i, Arc::clone(&ctx))?;
+            let stream = input.execute(i, Arc::clone(ctx))?;
             timer.done();
 
             streams_and_metrics.push((stream, metrics));
@@ -147,14 +262,16 @@ impl RepartitionExecState {
         Ok(())
     }
 
+    #[expect(clippy::too_many_arguments)]
     fn consume_input_streams(
         &mut self,
-        input: Arc<dyn ExecutionPlan>,
-        metrics: ExecutionPlanMetricsSet,
-        partitioning: Partitioning,
+        input: &Arc<dyn ExecutionPlan>,
+        metrics: &ExecutionPlanMetricsSet,
+        partitioning: &Partitioning,
         preserve_order: bool,
-        name: String,
-        context: Arc<TaskContext>,
+        name: &str,
+        context: &Arc<TaskContext>,
+        spill_manager: SpillManager,
     ) -> Result<&mut ConsumingInputStreamsState> {
         let streams_and_metrics = match self {
             RepartitionExecState::NotInitialized => {
@@ -162,12 +279,14 @@ impl RepartitionExecState {
                     input,
                     metrics,
                     partitioning.partition_count(),
-                    Arc::clone(&context),
+                    context,
                 )?;
                 let RepartitionExecState::InputStreamsInitialized(value) = self else {
                     // This cannot happen, as ensure_input_streams_initialized() was just called,
                     // but the compiler does not know.
-                    return internal_err!("Programming error: RepartitionExecState must be in the InputStreamsInitialized state after calling RepartitionExecState::ensure_input_streams_initialized");
+                    return internal_err!(
+                        "Programming error: RepartitionExecState must be in the InputStreamsInitialized state after calling RepartitionExecState::ensure_input_streams_initialized"
+                    );
                 };
                 value
             }
@@ -178,17 +297,19 @@ impl RepartitionExecState {
         let num_input_partitions = streams_and_metrics.len();
         let num_output_partitions = partitioning.partition_count();
 
+        let spill_manager = Arc::new(spill_manager);
+
         let (txs, rxs) = if preserve_order {
-            let (txs, rxs) =
+            // Create partition-aware channels with one channel per (input, output) pair
+            // This provides backpressure while maintaining proper ordering
+            let (txs_all, rxs_all) =
                 partition_aware_channels(num_input_partitions, num_output_partitions);
             // Take transpose of senders and receivers. `state.channels` keeps track of entries per output partition
-            let txs = transpose(txs);
-            let rxs = transpose(rxs);
+            let txs = transpose(txs_all);
+            let rxs = transpose(rxs_all);
             (txs, rxs)
         } else {
-            // create one channel per *output* partition
-            // note we use a custom channel that ensures there is always data for each receiver
-            // but limits the amount of buffering if required.
+            // Create one channel per *output* partition with backpressure
             let (txs, rxs) = channels(num_output_partitions);
             // Clone sender for each input partitions
             let txs = txs
@@ -203,9 +324,39 @@ impl RepartitionExecState {
         for (partition, (tx, rx)) in txs.into_iter().zip(rxs).enumerate() {
             let reservation = Arc::new(Mutex::new(
                 MemoryConsumer::new(format!("{name}[{partition}]"))
+                    .with_can_spill(true)
                     .register(context.memory_pool()),
             ));
-            channels.insert(partition, (tx, rx, reservation));
+
+            // Create spill channels based on mode:
+            // - preserve_order: one spill channel per (input, output) pair for proper FIFO ordering
+            // - non-preserve-order: one shared spill channel per output partition since all inputs
+            //   share the same receiver
+            let max_file_size = context
+                .session_config()
+                .options()
+                .execution
+                .max_spill_file_size_bytes;
+            let num_spill_channels = if preserve_order {
+                num_input_partitions
+            } else {
+                1
+            };
+            let (spill_writers, spill_readers): (Vec<_>, Vec<_>) = (0
+                ..num_spill_channels)
+                .map(|_| spill_pool::channel(max_file_size, Arc::clone(&spill_manager)))
+                .unzip();
+
+            channels.insert(
+                partition,
+                PartitionChannels {
+                    tx,
+                    rx,
+                    reservation,
+                    spill_readers,
+                    spill_writers,
+                },
+            );
         }
 
         // launch one async task per *input* partition
@@ -215,26 +366,42 @@ impl RepartitionExecState {
         {
             let txs: HashMap<_, _> = channels
                 .iter()
-                .map(|(partition, (tx, _rx, reservation))| {
-                    (*partition, (tx[i].clone(), Arc::clone(reservation)))
+                .map(|(partition, channels)| {
+                    // In preserve_order mode: each input gets its own spill writer (index i)
+                    // In non-preserve-order mode: all inputs share spill writer 0 via clone
+                    let spill_writer_idx = if preserve_order { i } else { 0 };
+                    (
+                        *partition,
+                        OutputChannel {
+                            sender: channels.tx[i].clone(),
+                            reservation: Arc::clone(&channels.reservation),
+                            spill_writer: channels.spill_writers[spill_writer_idx]
+                                .clone(),
+                        },
+                    )
                 })
                 .collect();
 
+            // Extract senders for wait_for_task before moving txs
+            let senders: HashMap<_, _> = txs
+                .iter()
+                .map(|(partition, channel)| (*partition, channel.sender.clone()))
+                .collect();
+
             let input_task = SpawnedTask::spawn(RepartitionExec::pull_from_input(
                 stream,
-                txs.clone(),
+                txs,
                 partitioning.clone(),
                 metrics,
+                // preserve_order depends on partition index to start from 0
+                if preserve_order { 0 } else { i },
+                num_input_partitions,
             ));
 
             // In a separate task, wait for each input to be done
             // (and pass along any errors, including panic!s)
-            let wait_for_task = SpawnedTask::spawn(RepartitionExec::wait_for_task(
-                input_task,
-                txs.into_iter()
-                    .map(|(partition, (tx, _reservation))| (partition, tx))
-                    .collect(),
-            ));
+            let wait_for_task =
+                SpawnedTask::spawn(RepartitionExec::wait_for_task(input_task, senders));
             spawned_tasks.push(wait_for_task);
         }
         *self = Self::ConsumingInputStreams(ConsumingInputStreamsState {
@@ -256,10 +423,10 @@ pub struct BatchPartitioner {
 
 enum BatchPartitionerState {
     Hash {
-        random_state: ahash::RandomState,
         exprs: Vec<Arc<dyn PhysicalExpr>>,
         num_partitions: usize,
         hash_buffer: Vec<u64>,
+        indices: Vec<Vec<u32>>,
     },
     RoundRobin {
         num_partitions: usize,
@@ -267,29 +434,96 @@ enum BatchPartitionerState {
     },
 }
 
+/// Fixed RandomState used for hash repartitioning to ensure consistent behavior across
+/// executions and runs.
+pub const REPARTITION_RANDOM_STATE: SeededRandomState = SeededRandomState::with_seed(0);
+
 impl BatchPartitioner {
-    /// Create a new [`BatchPartitioner`] with the provided [`Partitioning`]
+    /// Create a new [`BatchPartitioner`] for hash-based repartitioning.
     ///
-    /// The time spent repartitioning will be recorded to `timer`
-    pub fn try_new(partitioning: Partitioning, timer: metrics::Time) -> Result<Self> {
-        let state = match partitioning {
-            Partitioning::RoundRobinBatch(num_partitions) => {
-                BatchPartitionerState::RoundRobin {
-                    num_partitions,
-                    next_idx: 0,
-                }
-            }
-            Partitioning::Hash(exprs, num_partitions) => BatchPartitionerState::Hash {
+    /// # Parameters
+    /// - `exprs`: Expressions used to compute the hash for each input row.
+    /// - `num_partitions`: Total number of output partitions.
+    /// - `timer`: Metric used to record time spent during repartitioning.
+    ///
+    /// # Notes
+    /// This constructor cannot fail and performs no validation.
+    pub fn new_hash_partitioner(
+        exprs: Vec<Arc<dyn PhysicalExpr>>,
+        num_partitions: usize,
+        timer: metrics::Time,
+    ) -> Self {
+        Self {
+            state: BatchPartitionerState::Hash {
                 exprs,
                 num_partitions,
-                // Use fixed random hash
-                random_state: ahash::RandomState::with_seeds(0, 0, 0, 0),
                 hash_buffer: vec![],
+                indices: vec![vec![]; num_partitions],
             },
-            other => return not_impl_err!("Unsupported repartitioning scheme {other:?}"),
-        };
+            timer,
+        }
+    }
 
-        Ok(Self { state, timer })
+    /// Create a new [`BatchPartitioner`] for round-robin repartitioning.
+    ///
+    /// # Parameters
+    /// - `num_partitions`: Total number of output partitions.
+    /// - `timer`: Metric used to record time spent during repartitioning.
+    /// - `input_partition`: Index of the current input partition.
+    /// - `num_input_partitions`: Total number of input partitions.
+    ///
+    /// # Notes
+    /// The starting output partition is derived from the input partition
+    /// to avoid skew when multiple input partitions are used.
+    pub fn new_round_robin_partitioner(
+        num_partitions: usize,
+        timer: metrics::Time,
+        input_partition: usize,
+        num_input_partitions: usize,
+    ) -> Self {
+        Self {
+            state: BatchPartitionerState::RoundRobin {
+                num_partitions,
+                next_idx: (input_partition * num_partitions) / num_input_partitions,
+            },
+            timer,
+        }
+    }
+    /// Create a new [`BatchPartitioner`] based on the provided [`Partitioning`] scheme.
+    ///
+    /// This is a convenience constructor that delegates to the specialized
+    /// hash or round-robin constructors depending on the partitioning variant.
+    ///
+    /// # Parameters
+    /// - `partitioning`: Partitioning scheme to apply (hash or round-robin).
+    /// - `timer`: Metric used to record time spent during repartitioning.
+    /// - `input_partition`: Index of the current input partition.
+    /// - `num_input_partitions`: Total number of input partitions.
+    ///
+    /// # Errors
+    /// Returns an error if the provided partitioning scheme is not supported.
+    pub fn try_new(
+        partitioning: Partitioning,
+        timer: metrics::Time,
+        input_partition: usize,
+        num_input_partitions: usize,
+    ) -> Result<Self> {
+        match partitioning {
+            Partitioning::Hash(exprs, num_partitions) => {
+                Ok(Self::new_hash_partitioner(exprs, num_partitions, timer))
+            }
+            Partitioning::RoundRobinBatch(num_partitions) => {
+                Ok(Self::new_round_robin_partitioner(
+                    num_partitions,
+                    timer,
+                    input_partition,
+                    num_input_partitions,
+                ))
+            }
+            other => {
+                not_impl_err!("Unsupported repartitioning scheme {other:?}")
+            }
+        }
     }
 
     /// Partition the provided [`RecordBatch`] into one or more partitioned [`RecordBatch`]
@@ -331,27 +565,27 @@ impl BatchPartitioner {
                     Box::new(std::iter::once(Ok((idx, batch))))
                 }
                 BatchPartitionerState::Hash {
-                    random_state,
                     exprs,
                     num_partitions: partitions,
                     hash_buffer,
+                    indices,
                 } => {
                     // Tracking time required for distributing indexes across output partitions
                     let timer = self.timer.timer();
 
-                    let arrays = exprs
-                        .iter()
-                        .map(|expr| expr.evaluate(&batch)?.into_array(batch.num_rows()))
-                        .collect::<Result<Vec<_>>>()?;
+                    let arrays =
+                        evaluate_expressions_to_arrays(exprs.as_slice(), &batch)?;
 
                     hash_buffer.clear();
                     hash_buffer.resize(batch.num_rows(), 0);
 
-                    create_hashes(&arrays, random_state, hash_buffer)?;
+                    create_hashes(
+                        &arrays,
+                        REPARTITION_RANDOM_STATE.random_state(),
+                        hash_buffer,
+                    )?;
 
-                    let mut indices: Vec<_> = (0..*partitions)
-                        .map(|_| Vec::with_capacity(batch.num_rows()))
-                        .collect();
+                    indices.iter_mut().for_each(|v| v.clear());
 
                     for (index, hash) in hash_buffer.iter().enumerate() {
                         indices[(*hash % *partitions as u64) as usize].push(index as u32);
@@ -362,22 +596,23 @@ impl BatchPartitioner {
 
                     // Borrowing partitioner timer to prevent moving `self` to closure
                     let partitioner_timer = &self.timer;
-                    let it = indices
-                        .into_iter()
-                        .enumerate()
-                        .filter_map(|(partition, indices)| {
-                            let indices: PrimitiveArray<UInt32Type> = indices.into();
-                            (!indices.is_empty()).then_some((partition, indices))
-                        })
-                        .map(move |(partition, indices)| {
+
+                    let mut partitioned_batches = vec![];
+                    for (partition, p_indices) in indices.iter_mut().enumerate() {
+                        if !p_indices.is_empty() {
+                            let taken_indices = std::mem::take(p_indices);
+                            let indices_array: PrimitiveArray<UInt32Type> =
+                                taken_indices.into();
+
                             // Tracking time required for repartitioned batches construction
                             let _timer = partitioner_timer.timer();
 
                             // Produce batches based on indices
-                            let columns = take_arrays(batch.columns(), &indices, None)?;
+                            let columns =
+                                take_arrays(batch.columns(), &indices_array, None)?;
 
                             let mut options = RecordBatchOptions::new();
-                            options = options.with_row_count(Some(indices.len()));
+                            options = options.with_row_count(Some(indices_array.len()));
                             let batch = RecordBatch::try_new_with_options(
                                 batch.schema(),
                                 columns,
@@ -385,10 +620,22 @@ impl BatchPartitioner {
                             )
                             .unwrap();
 
-                            Ok((partition, batch))
-                        });
+                            partitioned_batches.push(Ok((partition, batch)));
+
+                            // Return the taken vec
+                            let (_, buffer, _) = indices_array.into_parts();
+                            let mut vec =
+                                buffer.into_inner().into_vec::<u32>().map_err(|e| {
+                                    internal_datafusion_err!(
+                                        "Could not convert buffer to vec: {e:?}"
+                                    )
+                                })?;
+                            vec.clear();
+                            *p_indices = vec;
+                        }
+                    }
 
-                    Box::new(it)
+                    Box::new(partitioned_batches.into_iter())
                 }
             };
 
@@ -428,10 +675,10 @@ impl BatchPartitioner {
 ///        │                  │                  │
 ///        │                  │                  │
 ///        │                  │                  │
-///┌───────────────┐  ┌───────────────┐  ┌───────────────┐
-///│    GroupBy    │  │    GroupBy    │  │    GroupBy    │
-///│   (Partial)   │  │   (Partial)   │  │   (Partial)   │
-///└───────────────┘  └───────────────┘  └───────────────┘
+/// ┌───────────────┐  ┌───────────────┐  ┌───────────────┐
+/// │    GroupBy    │  │    GroupBy    │  │    GroupBy    │
+/// │   (Partial)   │  │   (Partial)   │  │   (Partial)   │
+/// └───────────────┘  └───────────────┘  └───────────────┘
 ///        ▲                  ▲                  ▲
 ///        └──────────────────┼──────────────────┘
 ///                           │
@@ -450,7 +697,7 @@ impl BatchPartitioner {
 ///     ╲               ╱           ╲               ╱
 ///      '─.         ,─'             '─.         ,─'
 ///         `───────'                   `───────'
-///```
+/// ```
 ///
 /// # Error Handling
 ///
@@ -463,6 +710,38 @@ impl BatchPartitioner {
 /// arbitrary interleaving (and thus unordered) unless
 /// [`Self::with_preserve_order`] specifies otherwise.
 ///
+/// # Spilling Architecture
+///
+/// RepartitionExec uses [`SpillPool`](crate::spill::spill_pool) channels to handle
+/// memory pressure during repartitioning. Each (input partition, output partition)
+/// pair gets its own SpillPool channel for FIFO ordering.
+///
+/// ```text
+/// Input Partitions (N)          Output Partitions (M)
+/// ────────────────────          ─────────────────────
+///
+///    Input 0 ──┐                      ┌──▶ Output 0
+///              │  ┌──────────────┐    │
+///              ├─▶│ SpillPool    │────┤
+///              │  │ [In0→Out0]   │    │
+///    Input 1 ──┤  └──────────────┘    ├──▶ Output 1
+///              │                       │
+///              │  ┌──────────────┐    │
+///              ├─▶│ SpillPool    │────┤
+///              │  │ [In1→Out0]   │    │
+///    Input 2 ──┤  └──────────────┘    ├──▶ Output 2
+///              │                      │
+///              │       ... (N×M SpillPools total)
+///              │                      │
+///              │  ┌──────────────┐    │
+///              └─▶│ SpillPool    │────┘
+///                 │ [InN→OutM]   │
+///                 └──────────────┘
+///
+/// Each SpillPool maintains FIFO order for its (input, output) pair.
+/// See `RepartitionBatch` for details on the memory/spill decision logic.
+/// ```
+///
 /// # Footnote
 ///
 /// The "Exchange Operator" was first described in the 1989 paper
@@ -470,6 +749,10 @@ impl BatchPartitioner {
 /// system Paper](https://dl.acm.org/doi/pdf/10.1145/93605.98720)
 /// which uses the term "Exchange" for the concept of repartitioning
 /// data across threads.
+///
+/// For more background, please also see the [Optimizing Repartitions in DataFusion] blog.
+///
+/// [Optimizing Repartitions in DataFusion]: https://datafusion.apache.org/blog/2025/12/15/avoid-consecutive-repartitions
 #[derive(Debug, Clone)]
 pub struct RepartitionExec {
     /// Input execution plan
@@ -483,7 +766,7 @@ pub struct RepartitionExec {
     /// `SortPreservingRepartitionExec`, false means `RepartitionExec`.
     preserve_order: bool,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 #[derive(Debug, Clone)]
@@ -542,7 +825,7 @@ impl RepartitionExec {
         &self.cache.partitioning
     }
 
-    /// Get preserve_order flag of the RepartitionExecutor
+    /// Get preserve_order flag of the RepartitionExec
     /// `true` means `SortPreservingRepartitionExec`, `false` means `RepartitionExec`
     pub fn preserve_order(&self) -> bool {
         self.preserve_order
@@ -552,10 +835,23 @@ impl RepartitionExec {
     pub fn name(&self) -> &str {
         "RepartitionExec"
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            state: Default::default(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for RepartitionExec {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
+        let input_partition_count = self.input.output_partitioning().partition_count();
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
                 write!(
@@ -563,11 +859,17 @@ impl DisplayAs for RepartitionExec {
                     "{}: partitioning={}, input_partitions={}",
                     self.name(),
                     self.partitioning(),
-                    self.input.output_partitioning().partition_count()
+                    input_partition_count,
                 )?;
 
                 if self.preserve_order {
                     write!(f, ", preserve_order=true")?;
+                } else if input_partition_count <= 1
+                    && self.input.output_ordering().is_some()
+                {
+                    // Make it explicit that repartition maintains sortedness for a single input partition even
+                    // when `preserve_sort order` is false
+                    write!(f, ", maintains_sort_order=true")?;
                 }
 
                 if let Some(sort_exprs) = self.sort_exprs() {
@@ -577,9 +879,6 @@ impl DisplayAs for RepartitionExec {
             }
             DisplayFormatType::TreeRender => {
                 writeln!(f, "partitioning_scheme={}", self.partitioning(),)?;
-
-                let input_partition_count =
-                    self.input.output_partitioning().partition_count();
                 let output_partition_count = self.partitioning().partition_count();
                 let input_to_output_partition_str =
                     format!("{input_partition_count} -> {output_partition_count}");
@@ -607,7 +906,7 @@ impl ExecutionPlan for RepartitionExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -615,10 +914,26 @@ impl ExecutionPlan for RepartitionExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to hash partition expressions if this is a hash repartition
+        if let Partitioning::Hash(exprs, _) = self.partitioning() {
+            let mut tnr = TreeNodeRecursion::Continue;
+            for expr in exprs {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+            return Ok(tnr);
+        }
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         let mut repartition = RepartitionExec::try_new(
             children.swap_remove(0),
             self.partitioning().clone(),
@@ -648,51 +963,70 @@ impl ExecutionPlan for RepartitionExec {
             partition
         );
 
+        let spill_metrics = SpillMetrics::new(&self.metrics, partition);
+
         let input = Arc::clone(&self.input);
         let partitioning = self.partitioning().clone();
         let metrics = self.metrics.clone();
-        let preserve_order = self.preserve_order;
+        let preserve_order = self.sort_exprs().is_some();
         let name = self.name().to_owned();
         let schema = self.schema();
         let schema_captured = Arc::clone(&schema);
 
+        let spill_manager = SpillManager::new(
+            Arc::clone(&context.runtime_env()),
+            spill_metrics,
+            input.schema(),
+        );
+
         // Get existing ordering to use for merging
-        let sort_exprs = self.sort_exprs().cloned().unwrap_or_default();
+        let sort_exprs = self.sort_exprs().cloned();
 
         let state = Arc::clone(&self.state);
         if let Some(mut state) = state.try_lock() {
             state.ensure_input_streams_initialized(
-                Arc::clone(&input),
-                metrics.clone(),
+                &input,
+                &metrics,
                 partitioning.partition_count(),
-                Arc::clone(&context),
+                &context,
             )?;
         }
 
-        let stream = futures::stream::once(async move {
-            let num_input_partitions = input.output_partitioning().partition_count();
+        let num_input_partitions = input.output_partitioning().partition_count();
 
+        let stream = futures::stream::once(async move {
             // lock scope
-            let (mut rx, reservation, abort_helper) = {
+            let (rx, reservation, spill_readers, abort_helper) = {
                 // lock mutexes
                 let mut state = state.lock();
                 let state = state.consume_input_streams(
-                    Arc::clone(&input),
-                    metrics.clone(),
-                    partitioning,
+                    &input,
+                    &metrics,
+                    &partitioning,
                     preserve_order,
-                    name.clone(),
-                    Arc::clone(&context),
+                    &name,
+                    &context,
+                    spill_manager.clone(),
                 )?;
 
                 // now return stream for the specified *output* partition which will
                 // read from the channel
-                let (_tx, rx, reservation) = state
+                let PartitionChannels {
+                    rx,
+                    reservation,
+                    spill_readers,
+                    ..
+                } = state
                     .channels
                     .remove(&partition)
                     .expect("partition not used yet");
 
-                (rx, reservation, Arc::clone(&state.abort_helper))
+                (
+                    rx,
+                    reservation,
+                    spill_readers,
+                    Arc::clone(&state.abort_helper),
+                )
             };
 
             trace!(
@@ -701,15 +1035,22 @@ impl ExecutionPlan for RepartitionExec {
 
             if preserve_order {
                 // Store streams from all the input partitions:
+                // Each input partition gets its own spill reader to maintain proper FIFO ordering
                 let input_streams = rx
                     .into_iter()
-                    .map(|receiver| {
-                        Box::pin(PerPartitionStream {
-                            schema: Arc::clone(&schema_captured),
+                    .zip(spill_readers)
+                    .map(|(receiver, spill_stream)| {
+                        // In preserve_order mode, each receiver corresponds to exactly one input partition
+                        Box::pin(PerPartitionStream::new(
+                            Arc::clone(&schema_captured),
                             receiver,
-                            _drop_helper: Arc::clone(&abort_helper),
-                            reservation: Arc::clone(&reservation),
-                        }) as SendableRecordBatchStream
+                            Arc::clone(&abort_helper),
+                            Arc::clone(&reservation),
+                            spill_stream,
+                            1, // Each receiver handles one input partition
+                            BaselineMetrics::new(&metrics, partition),
+                            None, // subsequent merge sort already does batching https://github.com/apache/datafusion/blob/e4dcf0c85611ad0bd291f03a8e03fe56d773eb16/datafusion/physical-plan/src/sorts/merge.rs#L286
+                        )) as SendableRecordBatchStream
                     })
                     .collect::<Vec<_>>();
                 // Note that receiver size (`rx.len()`) and `num_input_partitions` are same.
@@ -723,21 +1064,32 @@ impl ExecutionPlan for RepartitionExec {
                 StreamingMergeBuilder::new()
                     .with_streams(input_streams)
                     .with_schema(schema_captured)
-                    .with_expressions(&sort_exprs)
+                    .with_expressions(&sort_exprs.unwrap())
                     .with_metrics(BaselineMetrics::new(&metrics, partition))
                     .with_batch_size(context.session_config().batch_size())
                     .with_fetch(fetch)
                     .with_reservation(merge_reservation)
+                    .with_spill_manager(spill_manager)
                     .build()
             } else {
-                Ok(Box::pin(RepartitionStream {
-                    num_input_partitions,
-                    num_input_partitions_processed: 0,
-                    schema: input.schema(),
-                    input: rx.swap_remove(0),
-                    _drop_helper: abort_helper,
+                // Non-preserve-order case: single input stream, so use the first spill reader
+                let spill_stream = spill_readers
+                    .into_iter()
+                    .next()
+                    .expect("at least one spill reader should exist");
+
+                Ok(Box::pin(PerPartitionStream::new(
+                    schema_captured,
+                    rx.into_iter()
+                        .next()
+                        .expect("at least one receiver should exist"),
+                    abort_helper,
                     reservation,
-                }) as SendableRecordBatchStream)
+                    spill_stream,
+                    num_input_partitions,
+                    BaselineMetrics::new(&metrics, partition),
+                    Some(context.session_config().batch_size()),
+                )) as SendableRecordBatchStream)
             }
         })
         .try_flatten();
@@ -749,15 +1101,44 @@ impl ExecutionPlan for RepartitionExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        if let Some(partition) = partition {
+            let partition_count = self.partitioning().partition_count();
+            if partition_count == 0 {
+                return Ok(Arc::new(Statistics::new_unknown(&self.schema())));
+            }
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_none() {
-            self.input.partition_statistics(None)
+            assert_or_internal_err!(
+                partition < partition_count,
+                "RepartitionExec invalid partition {} (expected less than {})",
+                partition,
+                partition_count
+            );
+
+            let mut stats = Arc::unwrap_or_clone(self.input.partition_statistics(None)?);
+
+            // Distribute statistics across partitions
+            stats.num_rows = stats
+                .num_rows
+                .get_value()
+                .map(|rows| Precision::Inexact(rows / partition_count))
+                .unwrap_or(Precision::Absent);
+            stats.total_byte_size = stats
+                .total_byte_size
+                .get_value()
+                .map(|bytes| Precision::Inexact(bytes / partition_count))
+                .unwrap_or(Precision::Absent);
+
+            // Make all column stats unknown
+            stats.column_statistics = stats
+                .column_statistics
+                .iter()
+                .map(|_| ColumnStatistics::new_unknown())
+                .collect();
+
+            Ok(Arc::new(stats))
         } else {
-            Ok(Statistics::new_unknown(&self.schema()))
+            self.input.partition_statistics(None)
         }
     }
 
@@ -807,21 +1188,62 @@ impl ExecutionPlan for RepartitionExec {
 
     fn gather_filters_for_pushdown(
         &self,
+        _phase: FilterPushdownPhase,
         parent_filters: Vec<Arc<dyn PhysicalExpr>>,
         _config: &ConfigOptions,
     ) -> Result<FilterDescription> {
-        Ok(FilterDescription::new_with_child_count(1)
-            .all_parent_filters_supported(parent_filters))
+        FilterDescription::from_children(parent_filters, &self.children())
     }
 
     fn handle_child_pushdown_result(
         &self,
+        _phase: FilterPushdownPhase,
         child_pushdown_result: ChildPushdownResult,
         _config: &ConfigOptions,
     ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
-        Ok(FilterPushdownPropagation::transparent(
-            child_pushdown_result,
-        ))
+        Ok(FilterPushdownPropagation::if_all(child_pushdown_result))
+    }
+
+    fn try_pushdown_sort(
+        &self,
+        order: &[PhysicalSortExpr],
+    ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
+        // RepartitionExec only maintains input order if preserve_order is set
+        // or if there's only one partition
+        if !self.maintains_input_order()[0] {
+            return Ok(SortOrderPushdownResult::Unsupported);
+        }
+
+        // Delegate to the child and wrap with a new RepartitionExec
+        self.input.try_pushdown_sort(order)?.try_map(|new_input| {
+            let mut new_repartition =
+                RepartitionExec::try_new(new_input, self.partitioning().clone())?;
+            if self.preserve_order {
+                new_repartition = new_repartition.with_preserve_order();
+            }
+            Ok(Arc::new(new_repartition) as Arc<dyn ExecutionPlan>)
+        })
+    }
+
+    fn repartitioned(
+        &self,
+        target_partitions: usize,
+        _config: &ConfigOptions,
+    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
+        use Partitioning::*;
+        let mut new_properties = PlanProperties::clone(&self.cache);
+        new_properties.partitioning = match new_properties.partitioning {
+            RoundRobinBatch(_) => RoundRobinBatch(target_partitions),
+            Hash(hash, _) => Hash(hash, target_partitions),
+            UnknownPartitioning(_) => UnknownPartitioning(target_partitions),
+        };
+        Ok(Some(Arc::new(Self {
+            input: Arc::clone(&self.input),
+            state: Arc::clone(&self.state),
+            metrics: self.metrics.clone(),
+            preserve_order: self.preserve_order,
+            cache: new_properties.into(),
+        })))
     }
 }
 
@@ -834,14 +1256,13 @@ impl RepartitionExec {
         partitioning: Partitioning,
     ) -> Result<Self> {
         let preserve_order = false;
-        let cache =
-            Self::compute_properties(&input, partitioning.clone(), preserve_order);
+        let cache = Self::compute_properties(&input, partitioning, preserve_order);
         Ok(RepartitionExec {
             input,
             state: Default::default(),
             metrics: ExecutionPlanMetricsSet::new(),
             preserve_order,
-            cache,
+            cache: Arc::new(cache),
         })
     }
 
@@ -883,6 +1304,8 @@ impl RepartitionExec {
             input.pipeline_behavior(),
             input.boundedness(),
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
+        .with_evaluation_type(EvaluationType::Eager)
     }
 
     /// Specify if this repartitioning operation should preserve the order of
@@ -900,7 +1323,7 @@ impl RepartitionExec {
                 // to maintain order
                 self.input.output_partitioning().partition_count() > 1;
         let eq_properties = Self::eq_properties_helper(&self.input, self.preserve_order);
-        self.cache = self.cache.with_eq_properties(eq_properties);
+        Arc::make_mut(&mut self.cache).set_eq_properties(eq_properties);
         self
     }
 
@@ -916,18 +1339,35 @@ impl RepartitionExec {
     /// Pulls data from the specified input plan, feeding it to the
     /// output partitions based on the desired partitioning
     ///
-    /// txs hold the output sending channels for each output partition
+    /// `output_channels` holds the output sending channels for each output partition
     async fn pull_from_input(
         mut stream: SendableRecordBatchStream,
-        mut output_channels: HashMap<
-            usize,
-            (DistributionSender<MaybeBatch>, SharedMemoryReservation),
-        >,
+        mut output_channels: HashMap<usize, OutputChannel>,
         partitioning: Partitioning,
         metrics: RepartitionMetrics,
+        input_partition: usize,
+        num_input_partitions: usize,
     ) -> Result<()> {
-        let mut partitioner =
-            BatchPartitioner::try_new(partitioning, metrics.repartition_time.clone())?;
+        let mut partitioner = match &partitioning {
+            Partitioning::Hash(exprs, num_partitions) => {
+                BatchPartitioner::new_hash_partitioner(
+                    exprs.clone(),
+                    *num_partitions,
+                    metrics.repartition_time.clone(),
+                )
+            }
+            Partitioning::RoundRobinBatch(num_partitions) => {
+                BatchPartitioner::new_round_robin_partitioner(
+                    *num_partitions,
+                    metrics.repartition_time.clone(),
+                    input_partition,
+                    num_input_partitions,
+                )
+            }
+            other => {
+                return not_impl_err!("Unsupported repartitioning scheme {other:?}");
+            }
+        };
 
         // While there are still outputs to send to, keep pulling inputs
         let mut batches_until_yield = partitioner.num_partitions();
@@ -943,18 +1383,39 @@ impl RepartitionExec {
                 None => break,
             };
 
+            // Handle empty batch
+            if batch.num_rows() == 0 {
+                continue;
+            }
+
             for res in partitioner.partition_iter(batch)? {
                 let (partition, batch) = res?;
                 let size = batch.get_array_memory_size();
 
                 let timer = metrics.send_time[partition].timer();
                 // if there is still a receiver, send to it
-                if let Some((tx, reservation)) = output_channels.get_mut(&partition) {
-                    reservation.lock().try_grow(size)?;
-
-                    if tx.send(Some(Ok(batch))).await.is_err() {
+                if let Some(channel) = output_channels.get_mut(&partition) {
+                    let (batch_to_send, is_memory_batch) =
+                        match channel.reservation.lock().try_grow(size) {
+                            Ok(_) => {
+                                // Memory available - send in-memory batch
+                                (RepartitionBatch::Memory(batch), true)
+                            }
+                            Err(_) => {
+                                // We're memory limited - spill to SpillPool
+                                // SpillPool handles file handle reuse and rotation
+                                channel.spill_writer.push_batch(&batch)?;
+                                // Send marker indicating batch was spilled
+                                (RepartitionBatch::Spilled, false)
+                            }
+                        };
+
+                    if channel.sender.send(Some(Ok(batch_to_send))).await.is_err() {
                         // If the other end has hung up, it was an early shutdown (e.g. LIMIT)
-                        reservation.lock().shrink(size);
+                        // Only shrink memory if it was a memory batch
+                        if is_memory_batch {
+                            channel.reservation.lock().shrink(size);
+                        }
                         output_channels.remove(&partition);
                     }
                 }
@@ -985,6 +1446,8 @@ impl RepartitionExec {
             }
         }
 
+        // Spill writers will auto-finalize when dropped
+        // No need for explicit flush
         Ok(())
     }
 
@@ -1027,7 +1490,7 @@ impl RepartitionExec {
             // Input task completed successfully
             Ok(Ok(())) => {
                 // notify each output partition that this input partition has no more data
-                for (_, tx) in txs {
+                for (_partition, tx) in txs {
                     tx.send(None).await.ok();
                 }
             }
@@ -1035,87 +1498,234 @@ impl RepartitionExec {
     }
 }
 
-struct RepartitionStream {
-    /// Number of input partitions that will be sending batches to this output channel
-    num_input_partitions: usize,
-
-    /// Number of input partitions that have finished sending batches to this output channel
-    num_input_partitions_processed: usize,
+/// State for tracking whether we're reading from memory channel or spill stream.
+///
+/// This state machine ensures proper ordering when batches are mixed between memory
+/// and spilled storage. When a [`RepartitionBatch::Spilled`] marker is received,
+/// the stream must block on the spill stream until the corresponding batch arrives.
+///
+/// # State Machine
+///
+/// ```text
+///                        ┌─────────────────┐
+///                   ┌───▶│  ReadingMemory  │◀───┐
+///                   │    └────────┬────────┘    │
+///                   │             │             │
+///                   │     Poll channel          │
+///                   │             │             │
+///                   │  ┌──────────┼─────────────┐
+///                   │  │          │             │
+///                   │  ▼          ▼             │
+///                   │ Memory   Spilled          │
+///       Got batch   │ batch    marker           │
+///       from spill  │  │          │             │
+///                   │  │          ▼             │
+///                   │  │  ┌──────────────────┐  │
+///                   │  │  │ ReadingSpilled   │  │
+///                   │  │  └────────┬─────────┘  │
+///                   │  │           │            │
+///                   │  │   Poll spill_stream    │
+///                   │  │           │            │
+///                   │  │           ▼            │
+///                   │  │      Get batch         │
+///                   │  │           │            │
+///                   └──┴───────────┴────────────┘
+///                                  │
+///                                  ▼
+///                           Return batch
+///                     (Order preserved within
+///                      (input, output) pair)
+/// ```
+///
+/// The transition to `ReadingSpilled` blocks further channel polling to maintain
+/// FIFO ordering - we cannot read the next item from the channel until the spill
+/// stream provides the current batch.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum StreamState {
+    /// Reading from the memory channel (normal operation)
+    ReadingMemory,
+    /// Waiting for a spilled batch from the spill stream.
+    /// Must not poll channel until spilled batch is received to preserve ordering.
+    ReadingSpilled,
+}
 
+/// This struct converts a receiver to a stream.
+/// Receiver receives data on an SPSC channel.
+struct PerPartitionStream {
     /// Schema wrapped by Arc
     schema: SchemaRef,
 
     /// channel containing the repartitioned batches
-    input: DistributionReceiver<MaybeBatch>,
+    receiver: DistributionReceiver<MaybeBatch>,
 
     /// Handle to ensure background tasks are killed when no longer needed.
     _drop_helper: Arc<Vec<SpawnedTask<()>>>,
 
     /// Memory reservation.
     reservation: SharedMemoryReservation,
+
+    /// Infinite stream for reading from the spill pool
+    spill_stream: SendableRecordBatchStream,
+
+    /// Internal state indicating if we are reading from memory or spill stream
+    state: StreamState,
+
+    /// Number of input partitions that have not yet finished.
+    /// In non-preserve-order mode, multiple input partitions send to the same channel,
+    /// each sending None when complete. We must wait for all of them.
+    remaining_partitions: usize,
+
+    /// Execution metrics
+    baseline_metrics: BaselineMetrics,
+
+    /// None for sort preserving variant (merge sort already does coalescing)
+    batch_coalescer: Option<LimitedBatchCoalescer>,
 }
 
-impl Stream for RepartitionStream {
-    type Item = Result<RecordBatch>;
+impl PerPartitionStream {
+    #[expect(clippy::too_many_arguments)]
+    fn new(
+        schema: SchemaRef,
+        receiver: DistributionReceiver<MaybeBatch>,
+        drop_helper: Arc<Vec<SpawnedTask<()>>>,
+        reservation: SharedMemoryReservation,
+        spill_stream: SendableRecordBatchStream,
+        num_input_partitions: usize,
+        baseline_metrics: BaselineMetrics,
+        batch_size: Option<usize>,
+    ) -> Self {
+        let batch_coalescer =
+            batch_size.map(|s| LimitedBatchCoalescer::new(Arc::clone(&schema), s, None));
+        Self {
+            schema,
+            receiver,
+            _drop_helper: drop_helper,
+            reservation,
+            spill_stream,
+            state: StreamState::ReadingMemory,
+            remaining_partitions: num_input_partitions,
+            baseline_metrics,
+            batch_coalescer,
+        }
+    }
 
-    fn poll_next(
-        mut self: Pin<&mut Self>,
+    fn poll_next_inner(
+        self: &mut Pin<&mut Self>,
         cx: &mut Context<'_>,
-    ) -> Poll<Option<Self::Item>> {
+    ) -> Poll<Option<Result<RecordBatch>>> {
+        use futures::StreamExt;
+        let cloned_time = self.baseline_metrics.elapsed_compute().clone();
+        let _timer = cloned_time.timer();
+
         loop {
-            match self.input.recv().poll_unpin(cx) {
-                Poll::Ready(Some(Some(v))) => {
-                    if let Ok(batch) = &v {
-                        self.reservation
-                            .lock()
-                            .shrink(batch.get_array_memory_size());
-                    }
+            match self.state {
+                StreamState::ReadingMemory => {
+                    // Poll the memory channel for next message
+                    let value = match self.receiver.recv().poll_unpin(cx) {
+                        Poll::Ready(v) => v,
+                        Poll::Pending => {
+                            // Nothing from channel, wait
+                            return Poll::Pending;
+                        }
+                    };
 
-                    return Poll::Ready(Some(v));
-                }
-                Poll::Ready(Some(None)) => {
-                    self.num_input_partitions_processed += 1;
-
-                    if self.num_input_partitions == self.num_input_partitions_processed {
-                        // all input partitions have finished sending batches
-                        return Poll::Ready(None);
-                    } else {
-                        // other partitions still have data to send
-                        continue;
+                    match value {
+                        Some(Some(v)) => match v {
+                            Ok(RepartitionBatch::Memory(batch)) => {
+                                // Release memory and return batch
+                                self.reservation
+                                    .lock()
+                                    .shrink(batch.get_array_memory_size());
+                                return Poll::Ready(Some(Ok(batch)));
+                            }
+                            Ok(RepartitionBatch::Spilled) => {
+                                // Batch was spilled, transition to reading from spill stream
+                                // We must block on spill stream until we get the batch
+                                // to preserve ordering
+                                self.state = StreamState::ReadingSpilled;
+                                continue;
+                            }
+                            Err(e) => {
+                                return Poll::Ready(Some(Err(e)));
+                            }
+                        },
+                        Some(None) => {
+                            // One input partition finished
+                            self.remaining_partitions -= 1;
+                            if self.remaining_partitions == 0 {
+                                // All input partitions finished
+                                return Poll::Ready(None);
+                            }
+                            // Continue to poll for more data from other partitions
+                            continue;
+                        }
+                        None => {
+                            // Channel closed unexpectedly
+                            return Poll::Ready(None);
+                        }
                     }
                 }
-                Poll::Ready(None) => {
-                    return Poll::Ready(None);
-                }
-                Poll::Pending => {
-                    return Poll::Pending;
+                StreamState::ReadingSpilled => {
+                    // Poll spill stream for the spilled batch
+                    match self.spill_stream.poll_next_unpin(cx) {
+                        Poll::Ready(Some(Ok(batch))) => {
+                            self.state = StreamState::ReadingMemory;
+                            return Poll::Ready(Some(Ok(batch)));
+                        }
+                        Poll::Ready(Some(Err(e))) => {
+                            return Poll::Ready(Some(Err(e)));
+                        }
+                        Poll::Ready(None) => {
+                            // Spill stream ended, keep draining the memory channel
+                            self.state = StreamState::ReadingMemory;
+                        }
+                        Poll::Pending => {
+                            // Spilled batch not ready yet, must wait
+                            // This preserves ordering by blocking until spill data arrives
+                            return Poll::Pending;
+                        }
+                    }
                 }
             }
         }
     }
-}
-
-impl RecordBatchStream for RepartitionStream {
-    /// Get the schema
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
-    }
-}
 
-/// This struct converts a receiver to a stream.
-/// Receiver receives data on an SPSC channel.
-struct PerPartitionStream {
-    /// Schema wrapped by Arc
-    schema: SchemaRef,
-
-    /// channel containing the repartitioned batches
-    receiver: DistributionReceiver<MaybeBatch>,
+    fn poll_next_and_coalesce(
+        self: &mut Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        coalescer: &mut LimitedBatchCoalescer,
+    ) -> Poll<Option<Result<RecordBatch>>> {
+        let cloned_time = self.baseline_metrics.elapsed_compute().clone();
+        let mut completed = false;
 
-    /// Handle to ensure background tasks are killed when no longer needed.
-    _drop_helper: Arc<Vec<SpawnedTask<()>>>,
+        loop {
+            if let Some(batch) = coalescer.next_completed_batch() {
+                return Poll::Ready(Some(Ok(batch)));
+            }
+            if completed {
+                return Poll::Ready(None);
+            }
 
-    /// Memory reservation.
-    reservation: SharedMemoryReservation,
+            match ready!(self.poll_next_inner(cx)) {
+                Some(Ok(batch)) => {
+                    let _timer = cloned_time.timer();
+                    if let Err(err) = coalescer.push_batch(batch) {
+                        return Poll::Ready(Some(Err(err)));
+                    }
+                }
+                Some(err) => {
+                    return Poll::Ready(Some(err));
+                }
+                None => {
+                    completed = true;
+                    let _timer = cloned_time.timer();
+                    if let Err(err) = coalescer.finish() {
+                        return Poll::Ready(Some(Err(err)));
+                    }
+                }
+            }
+        }
+    }
 }
 
 impl Stream for PerPartitionStream {
@@ -1125,22 +1735,14 @@ impl Stream for PerPartitionStream {
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
-        match self.receiver.recv().poll_unpin(cx) {
-            Poll::Ready(Some(Some(v))) => {
-                if let Ok(batch) = &v {
-                    self.reservation
-                        .lock()
-                        .shrink(batch.get_array_memory_size());
-                }
-                Poll::Ready(Some(v))
-            }
-            Poll::Ready(Some(None)) => {
-                // Input partition has finished sending batches
-                Poll::Ready(None)
-            }
-            Poll::Ready(None) => Poll::Ready(None),
-            Poll::Pending => Poll::Pending,
+        let poll;
+        if let Some(mut coalescer) = self.batch_coalescer.take() {
+            poll = self.poll_next_and_coalesce(cx, &mut coalescer);
+            self.batch_coalescer = Some(coalescer);
+        } else {
+            poll = self.poll_next_inner(cx);
         }
+        self.baseline_metrics.record_poll(poll)
     }
 }
 
@@ -1161,8 +1763,8 @@ mod tests {
         test::{
             assert_is_pending,
             exec::{
-                assert_strong_count_converges_to_zero, BarrierExec, BlockingExec,
-                ErrorExec, MockExec,
+                BarrierExec, BlockingExec, ErrorExec, MockExec,
+                assert_strong_count_converges_to_zero,
             },
         },
         {collect, expressions::col},
@@ -1171,12 +1773,12 @@ mod tests {
     use arrow::array::{ArrayRef, StringArray, UInt32Array};
     use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::cast::as_string_array;
+    use datafusion_common::exec_err;
     use datafusion_common::test_util::batches_to_sort_string;
-    use datafusion_common::{arrow_datafusion_err, exec_err};
     use datafusion_common_runtime::JoinSet;
+    use datafusion_execution::config::SessionConfig;
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
     use insta::assert_snapshot;
-    use itertools::Itertools;
 
     #[tokio::test]
     async fn one_to_many_round_robin() -> Result<()> {
@@ -1190,10 +1792,13 @@ mod tests {
             repartition(&schema, partitions, Partitioning::RoundRobinBatch(4)).await?;
 
         assert_eq!(4, output_partitions.len());
-        assert_eq!(13, output_partitions[0].len());
-        assert_eq!(13, output_partitions[1].len());
-        assert_eq!(12, output_partitions[2].len());
-        assert_eq!(12, output_partitions[3].len());
+        for partition in &output_partitions {
+            assert_eq!(1, partition.len());
+        }
+        assert_eq!(13 * 8, output_partitions[0][0].num_rows());
+        assert_eq!(13 * 8, output_partitions[1][0].num_rows());
+        assert_eq!(12 * 8, output_partitions[2][0].num_rows());
+        assert_eq!(12 * 8, output_partitions[3][0].num_rows());
 
         Ok(())
     }
@@ -1210,7 +1815,7 @@ mod tests {
             repartition(&schema, partitions, Partitioning::RoundRobinBatch(1)).await?;
 
         assert_eq!(1, output_partitions.len());
-        assert_eq!(150, output_partitions[0].len());
+        assert_eq!(150 * 8, output_partitions[0][0].num_rows());
 
         Ok(())
     }
@@ -1226,12 +1831,12 @@ mod tests {
         let output_partitions =
             repartition(&schema, partitions, Partitioning::RoundRobinBatch(5)).await?;
 
+        let total_rows_per_partition = 8 * 50 * 3 / 5;
         assert_eq!(5, output_partitions.len());
-        assert_eq!(30, output_partitions[0].len());
-        assert_eq!(30, output_partitions[1].len());
-        assert_eq!(30, output_partitions[2].len());
-        assert_eq!(30, output_partitions[3].len());
-        assert_eq!(30, output_partitions[4].len());
+        for partition in output_partitions {
+            assert_eq!(1, partition.len());
+            assert_eq!(total_rows_per_partition, partition[0].num_rows());
+        }
 
         Ok(())
     }
@@ -1261,6 +1866,32 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_repartition_with_coalescing() -> Result<()> {
+        let schema = test_schema();
+        // create 50 batches, each having 8 rows
+        let partition = create_vec_batches(50);
+        let partitions = vec![partition.clone(), partition.clone()];
+        let partitioning = Partitioning::RoundRobinBatch(1);
+
+        let session_config = SessionConfig::new().with_batch_size(200);
+        let task_ctx = TaskContext::default().with_session_config(session_config);
+        let task_ctx = Arc::new(task_ctx);
+
+        // create physical plan
+        let exec = TestMemoryExec::try_new_exec(&partitions, Arc::clone(&schema), None)?;
+        let exec = RepartitionExec::try_new(exec, partitioning)?;
+
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                assert_eq!(200, batch.num_rows());
+            }
+        }
+        Ok(())
+    }
+
     fn test_schema() -> Arc<Schema> {
         Arc::new(Schema::new(vec![Field::new("c0", DataType::UInt32, false)]))
     }
@@ -1306,12 +1937,12 @@ mod tests {
 
         let output_partitions = handle.join().await.unwrap().unwrap();
 
+        let total_rows_per_partition = 8 * 50 * 3 / 5;
         assert_eq!(5, output_partitions.len());
-        assert_eq!(30, output_partitions[0].len());
-        assert_eq!(30, output_partitions[1].len());
-        assert_eq!(30, output_partitions[2].len());
-        assert_eq!(30, output_partitions[3].len());
-        assert_eq!(30, output_partitions[4].len());
+        for partition in output_partitions {
+            assert_eq!(1, partition.len());
+            assert_eq!(total_rows_per_partition, partition[0].num_rows());
+        }
 
         Ok(())
     }
@@ -1480,16 +2111,16 @@ mod tests {
         // output stream 1 should *not* error and have one of the input batches
         let batches = crate::common::collect(output_stream1).await.unwrap();
 
-        assert_snapshot!(batches_to_sort_string(&batches), @r#"
-            +------------------+
-            | my_awesome_field |
-            +------------------+
-            | baz              |
-            | frob             |
-            | gaz              |
-            | grob             |
-            +------------------+
-            "#);
+        assert_snapshot!(batches_to_sort_string(&batches), @r"
+        +------------------+
+        | my_awesome_field |
+        +------------------+
+        | baz              |
+        | frob             |
+        | gar              |
+        | goo              |
+        +------------------+
+        ");
     }
 
     #[tokio::test]
@@ -1549,14 +2180,13 @@ mod tests {
         });
         let batches_with_drop = crate::common::collect(output_stream1).await.unwrap();
 
-        fn sort(batch: Vec<RecordBatch>) -> Vec<RecordBatch> {
-            batch
-                .into_iter()
-                .sorted_by_key(|b| format!("{b:?}"))
-                .collect()
-        }
-
-        assert_eq!(sort(batches_without_drop), sort(batches_with_drop));
+        let items_vec_with_drop = str_batches_to_vec(&batches_with_drop);
+        let items_set_with_drop: HashSet<&str> =
+            items_vec_with_drop.iter().copied().collect();
+        assert_eq!(
+            items_set_with_drop.symmetric_difference(&items_set).count(),
+            0
+        );
     }
 
     fn str_batches_to_vec(batches: &[RecordBatch]) -> Vec<&str> {
@@ -1653,17 +2283,210 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn repartition_with_spilling() -> Result<()> {
+        // Test that repartition successfully spills to disk when memory is constrained
+        let schema = test_schema();
+        let partition = create_vec_batches(50);
+        let input_partitions = vec![partition];
+        let partitioning = Partitioning::RoundRobinBatch(4);
+
+        // Set up context with very tight memory limit to force spilling
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(1, 1.0)
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // create physical plan
+        let exec =
+            TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = RepartitionExec::try_new(exec, partitioning)?;
+
+        // Collect all partitions - should succeed by spilling to disk
+        let mut total_rows = 0;
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                total_rows += batch.num_rows();
+            }
+        }
+
+        // Verify we got all the data (50 batches * 8 rows each)
+        assert_eq!(total_rows, 50 * 8);
+
+        // Verify spilling metrics to confirm spilling actually happened
+        let metrics = exec.metrics().unwrap();
+        assert!(
+            metrics.spill_count().unwrap() > 0,
+            "Expected spill_count > 0, but got {:?}",
+            metrics.spill_count()
+        );
+        println!("Spilled {} times", metrics.spill_count().unwrap());
+        assert!(
+            metrics.spilled_bytes().unwrap() > 0,
+            "Expected spilled_bytes > 0, but got {:?}",
+            metrics.spilled_bytes()
+        );
+        println!(
+            "Spilled {} bytes in {} spills",
+            metrics.spilled_bytes().unwrap(),
+            metrics.spill_count().unwrap()
+        );
+        assert!(
+            metrics.spilled_rows().unwrap() > 0,
+            "Expected spilled_rows > 0, but got {:?}",
+            metrics.spilled_rows()
+        );
+        println!("Spilled {} rows", metrics.spilled_rows().unwrap());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn repartition_with_partial_spilling() -> Result<()> {
+        // Test that repartition can handle partial spilling (some batches in memory, some spilled)
+        let schema = test_schema();
+        let partition = create_vec_batches(50);
+        let input_partitions = vec![partition];
+        let partitioning = Partitioning::RoundRobinBatch(4);
+
+        // Set up context with moderate memory limit to force partial spilling
+        // 2KB should allow some batches in memory but force others to spill
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(2 * 1024, 1.0)
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // create physical plan
+        let exec =
+            TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = RepartitionExec::try_new(exec, partitioning)?;
+
+        // Collect all partitions - should succeed with partial spilling
+        let mut total_rows = 0;
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                total_rows += batch.num_rows();
+            }
+        }
+
+        // Verify we got all the data (50 batches * 8 rows each)
+        assert_eq!(total_rows, 50 * 8);
+
+        // Verify partial spilling metrics
+        let metrics = exec.metrics().unwrap();
+        let spill_count = metrics.spill_count().unwrap();
+        let spilled_rows = metrics.spilled_rows().unwrap();
+        let spilled_bytes = metrics.spilled_bytes().unwrap();
+
+        assert!(
+            spill_count > 0,
+            "Expected some spilling to occur, but got spill_count={spill_count}"
+        );
+        assert!(
+            spilled_rows > 0 && spilled_rows < total_rows,
+            "Expected partial spilling (0 < spilled_rows < {total_rows}), but got spilled_rows={spilled_rows}"
+        );
+        assert!(
+            spilled_bytes > 0,
+            "Expected some bytes to be spilled, but got spilled_bytes={spilled_bytes}"
+        );
+
+        println!(
+            "Partial spilling: spilled {} out of {} rows ({:.1}%) in {} spills, {} bytes",
+            spilled_rows,
+            total_rows,
+            (spilled_rows as f64 / total_rows as f64) * 100.0,
+            spill_count,
+            spilled_bytes
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn repartition_without_spilling() -> Result<()> {
+        // Test that repartition does not spill when there's ample memory
+        let schema = test_schema();
+        let partition = create_vec_batches(50);
+        let input_partitions = vec![partition];
+        let partitioning = Partitioning::RoundRobinBatch(4);
+
+        // Set up context with generous memory limit - no spilling should occur
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(10 * 1024 * 1024, 1.0) // 10MB
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // create physical plan
+        let exec =
+            TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = RepartitionExec::try_new(exec, partitioning)?;
+
+        // Collect all partitions - should succeed without spilling
+        let mut total_rows = 0;
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                total_rows += batch.num_rows();
+            }
+        }
+
+        // Verify we got all the data (50 batches * 8 rows each)
+        assert_eq!(total_rows, 50 * 8);
+
+        // Verify no spilling occurred
+        let metrics = exec.metrics().unwrap();
+        assert_eq!(
+            metrics.spill_count(),
+            Some(0),
+            "Expected no spilling, but got spill_count={:?}",
+            metrics.spill_count()
+        );
+        assert_eq!(
+            metrics.spilled_bytes(),
+            Some(0),
+            "Expected no bytes spilled, but got spilled_bytes={:?}",
+            metrics.spilled_bytes()
+        );
+        assert_eq!(
+            metrics.spilled_rows(),
+            Some(0),
+            "Expected no rows spilled, but got spilled_rows={:?}",
+            metrics.spilled_rows()
+        );
+
+        println!("No spilling occurred - all data processed in memory");
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn oom() -> Result<()> {
-        // define input partitions
+        use datafusion_execution::disk_manager::{DiskManagerBuilder, DiskManagerMode};
+
+        // Test that repartition fails with OOM when disk manager is disabled
         let schema = test_schema();
         let partition = create_vec_batches(50);
         let input_partitions = vec![partition];
         let partitioning = Partitioning::RoundRobinBatch(4);
 
-        // setup up context
+        // Setup context with memory limit but NO disk manager (explicitly disabled)
         let runtime = RuntimeEnvBuilder::default()
             .with_memory_limit(1, 1.0)
+            .with_disk_manager_builder(
+                DiskManagerBuilder::default().with_mode(DiskManagerMode::Disabled),
+            )
             .build_arc()?;
 
         let task_ctx = TaskContext::default().with_runtime(runtime);
@@ -1674,11 +2497,10 @@ mod tests {
             TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?;
         let exec = RepartitionExec::try_new(exec, partitioning)?;
 
-        // pull partitions
+        // Attempt to execute - should fail with ResourcesExhausted error
         for i in 0..exec.partitioning().partition_count() {
             let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
-            let err =
-                arrow_datafusion_err!(stream.next().await.unwrap().unwrap_err().into());
+            let err = stream.next().await.unwrap().unwrap_err();
             let err = err.find_root();
             assert!(
                 matches!(err, DataFusionError::ResourcesExhausted(_)),
@@ -1692,7 +2514,7 @@ mod tests {
     /// Create vector batches
     fn create_vec_batches(n: usize) -> Vec<RecordBatch> {
         let batch = create_batch();
-        (0..n).map(|_| batch.clone()).collect()
+        std::iter::repeat_n(batch, n).collect()
     }
 
     /// Create batch
@@ -1704,12 +2526,105 @@ mod tests {
         )
         .unwrap()
     }
+
+    /// Create batches with sequential values for ordering tests
+    fn create_ordered_batches(num_batches: usize) -> Vec<RecordBatch> {
+        let schema = test_schema();
+        (0..num_batches)
+            .map(|i| {
+                let start = (i * 8) as u32;
+                RecordBatch::try_new(
+                    Arc::clone(&schema),
+                    vec![Arc::new(UInt32Array::from(
+                        (start..start + 8).collect::<Vec<_>>(),
+                    ))],
+                )
+                .unwrap()
+            })
+            .collect()
+    }
+
+    #[tokio::test]
+    async fn test_repartition_ordering_with_spilling() -> Result<()> {
+        // Test that repartition preserves ordering when spilling occurs
+        // This tests the state machine fix where we must block on spill_stream
+        // when a Spilled marker is received, rather than continuing to poll the channel
+
+        let schema = test_schema();
+        // Create batches with sequential values: batch 0 has [0,1,2,3,4,5,6,7],
+        // batch 1 has [8,9,10,11,12,13,14,15], etc.
+        let partition = create_ordered_batches(20);
+        let input_partitions = vec![partition];
+
+        // Use RoundRobinBatch to ensure predictable ordering
+        let partitioning = Partitioning::RoundRobinBatch(2);
+
+        // Set up context with very tight memory limit to force spilling
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(1, 1.0)
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // create physical plan
+        let exec =
+            TestMemoryExec::try_new_exec(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = RepartitionExec::try_new(exec, partitioning)?;
+
+        // Collect all output partitions
+        let mut all_batches = Vec::new();
+        for i in 0..exec.partitioning().partition_count() {
+            let mut partition_batches = Vec::new();
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                partition_batches.push(batch);
+            }
+            all_batches.push(partition_batches);
+        }
+
+        // Verify spilling occurred
+        let metrics = exec.metrics().unwrap();
+        assert!(
+            metrics.spill_count().unwrap() > 0,
+            "Expected spilling to occur, but spill_count = 0"
+        );
+
+        // Verify ordering is preserved within each partition
+        // With RoundRobinBatch, even batches go to partition 0, odd batches to partition 1
+        for (partition_idx, batches) in all_batches.iter().enumerate() {
+            let mut last_value = None;
+            for batch in batches {
+                let array = batch
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<UInt32Array>()
+                    .unwrap();
+
+                for i in 0..array.len() {
+                    let value = array.value(i);
+                    if let Some(last) = last_value {
+                        assert!(
+                            value > last,
+                            "Ordering violated in partition {partition_idx}: {value} is not greater than {last}"
+                        );
+                    }
+                    last_value = Some(value);
+                }
+            }
+        }
+
+        Ok(())
+    }
 }
 
 #[cfg(test)]
 mod test {
+    use arrow::array::record_batch;
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::assert_batches_eq;
 
     use super::*;
     use crate::test::TestMemoryExec;
@@ -1722,19 +2637,13 @@ mod test {
     ///
     /// `$EXPECTED_PLAN_LINES`: input plan
     /// `$PLAN`: the plan to optimized
-    ///
     macro_rules! assert_plan {
-        ($EXPECTED_PLAN_LINES: expr,  $PLAN: expr) => {
-            let physical_plan = $PLAN;
-            let formatted = crate::displayable(&physical_plan).indent(true).to_string();
-            let actual: Vec<&str> = formatted.trim().lines().collect();
-
-            let expected_plan_lines: Vec<&str> = $EXPECTED_PLAN_LINES
-                .iter().map(|s| *s).collect();
+        ($PLAN: expr,  @ $EXPECTED: expr) => {
+            let formatted = crate::displayable($PLAN).indent(true).to_string();
 
-            assert_eq!(
-                expected_plan_lines, actual,
-                "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n"
+            insta::assert_snapshot!(
+                formatted,
+                @$EXPECTED
             );
         };
     }
@@ -1746,20 +2655,17 @@ mod test {
         let source1 = sorted_memory_exec(&schema, sort_exprs.clone());
         let source2 = sorted_memory_exec(&schema, sort_exprs);
         // output has multiple partitions, and is sorted
-        let union = UnionExec::new(vec![source1, source2]);
-        let exec =
-            RepartitionExec::try_new(Arc::new(union), Partitioning::RoundRobinBatch(10))
-                .unwrap()
-                .with_preserve_order();
+        let union = UnionExec::try_new(vec![source1, source2])?;
+        let exec = RepartitionExec::try_new(union, Partitioning::RoundRobinBatch(10))?
+            .with_preserve_order();
 
         // Repartition should preserve order
-        let expected_plan = [
-            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c0@0 ASC",
-            "  UnionExec",
-            "    DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC",
-            "    DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC",
-        ];
-        assert_plan!(expected_plan, exec);
+        assert_plan!(&exec, @r"
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c0@0 ASC
+          UnionExec
+            DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC
+            DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC
+        ");
         Ok(())
     }
 
@@ -1769,16 +2675,15 @@ mod test {
         let sort_exprs = sort_exprs(&schema);
         let source = sorted_memory_exec(&schema, sort_exprs);
         // output is sorted, but has only a single partition, so no need to sort
-        let exec = RepartitionExec::try_new(source, Partitioning::RoundRobinBatch(10))
-            .unwrap()
+        let exec = RepartitionExec::try_new(source, Partitioning::RoundRobinBatch(10))?
             .with_preserve_order();
 
         // Repartition should not preserve order
-        let expected_plan = [
-            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "  DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC",
-        ];
-        assert_plan!(expected_plan, exec);
+        assert_plan!(&exec, @r"
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC
+        ");
+
         Ok(())
     }
 
@@ -1788,20 +2693,235 @@ mod test {
         let source1 = memory_exec(&schema);
         let source2 = memory_exec(&schema);
         // output has multiple partitions, but is not sorted
-        let union = UnionExec::new(vec![source1, source2]);
-        let exec =
-            RepartitionExec::try_new(Arc::new(union), Partitioning::RoundRobinBatch(10))
-                .unwrap()
-                .with_preserve_order();
+        let union = UnionExec::try_new(vec![source1, source2])?;
+        let exec = RepartitionExec::try_new(union, Partitioning::RoundRobinBatch(10))?
+            .with_preserve_order();
 
         // Repartition should not preserve order, as there is no order to preserve
-        let expected_plan = [
-            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
-            "  UnionExec",
-            "    DataSourceExec: partitions=1, partition_sizes=[0]",
-            "    DataSourceExec: partitions=1, partition_sizes=[0]",
+        assert_plan!(&exec, @r"
+        RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2
+          UnionExec
+            DataSourceExec: partitions=1, partition_sizes=[0]
+            DataSourceExec: partitions=1, partition_sizes=[0]
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_preserve_order_with_spilling() -> Result<()> {
+        use datafusion_execution::TaskContext;
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        // Create sorted input data across multiple partitions
+        // Partition1: [1,3], [5,7], [9,11]
+        // Partition2: [2,4], [6,8], [10,12]
+        let batch1 = record_batch!(("c0", UInt32, [1, 3])).unwrap();
+        let batch2 = record_batch!(("c0", UInt32, [2, 4])).unwrap();
+        let batch3 = record_batch!(("c0", UInt32, [5, 7])).unwrap();
+        let batch4 = record_batch!(("c0", UInt32, [6, 8])).unwrap();
+        let batch5 = record_batch!(("c0", UInt32, [9, 11])).unwrap();
+        let batch6 = record_batch!(("c0", UInt32, [10, 12])).unwrap();
+        let schema = batch1.schema();
+        let sort_exprs = LexOrdering::new([PhysicalSortExpr {
+            expr: col("c0", &schema).unwrap(),
+            options: SortOptions::default().asc(),
+        }])
+        .unwrap();
+        let partition1 = vec![batch1.clone(), batch3.clone(), batch5.clone()];
+        let partition2 = vec![batch2.clone(), batch4.clone(), batch6.clone()];
+        let input_partitions = vec![partition1, partition2];
+
+        // Set up context with tight memory limit to force spilling
+        // Sorting needs some non-spillable memory, so 64 bytes should force spilling while still allowing the query to complete
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(64, 1.0)
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // Create physical plan with order preservation
+        let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?
+            .try_with_sort_information(vec![sort_exprs.clone(), sort_exprs])?;
+        let exec = Arc::new(exec);
+        let exec = Arc::new(TestMemoryExec::update_cache(&exec));
+        // Repartition into 3 partitions with order preservation
+        // We expect 1 batch per output partition after repartitioning
+        let exec = RepartitionExec::try_new(exec, Partitioning::RoundRobinBatch(3))?
+            .with_preserve_order();
+
+        let mut batches = vec![];
+
+        // Collect all partitions - should succeed by spilling to disk
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                batches.push(batch);
+            }
+        }
+
+        #[rustfmt::skip]
+        let expected = [
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 1  |",
+                "| 2  |",
+                "| 3  |",
+                "| 4  |",
+                "+----+",
+            ],
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 5  |",
+                "| 6  |",
+                "| 7  |",
+                "| 8  |",
+                "+----+",
+            ],
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 9  |",
+                "| 10 |",
+                "| 11 |",
+                "| 12 |",
+                "+----+",
+            ],
         ];
-        assert_plan!(expected_plan, exec);
+
+        for (batch, expected) in batches.iter().zip(expected.iter()) {
+            assert_batches_eq!(expected, std::slice::from_ref(batch));
+        }
+
+        // We should have spilled ~ all of the data.
+        // - We spill data during the repartitioning phase
+        // - We may also spill during the final merge sort
+        let all_batches = [batch1, batch2, batch3, batch4, batch5, batch6];
+        let metrics = exec.metrics().unwrap();
+        assert!(
+            metrics.spill_count().unwrap() > input_partitions.len(),
+            "Expected spill_count > {} for order-preserving repartition, but got {:?}",
+            input_partitions.len(),
+            metrics.spill_count()
+        );
+        assert!(
+            metrics.spilled_bytes().unwrap()
+                > all_batches
+                    .iter()
+                    .map(|b| b.get_array_memory_size())
+                    .sum::<usize>(),
+            "Expected spilled_bytes > {} for order-preserving repartition, got {}",
+            all_batches
+                .iter()
+                .map(|b| b.get_array_memory_size())
+                .sum::<usize>(),
+            metrics.spilled_bytes().unwrap()
+        );
+        assert!(
+            metrics.spilled_rows().unwrap()
+                >= all_batches.iter().map(|b| b.num_rows()).sum::<usize>(),
+            "Expected spilled_rows > {} for order-preserving repartition, got {}",
+            all_batches.iter().map(|b| b.num_rows()).sum::<usize>(),
+            metrics.spilled_rows().unwrap()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_hash_partitioning_with_spilling() -> Result<()> {
+        use datafusion_execution::TaskContext;
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        // Create input data similar to the round-robin test
+        let batch1 = record_batch!(("c0", UInt32, [1, 3])).unwrap();
+        let batch2 = record_batch!(("c0", UInt32, [2, 4])).unwrap();
+        let batch3 = record_batch!(("c0", UInt32, [5, 7])).unwrap();
+        let batch4 = record_batch!(("c0", UInt32, [6, 8])).unwrap();
+        let schema = batch1.schema();
+
+        let partition1 = vec![batch1.clone(), batch3.clone()];
+        let partition2 = vec![batch2.clone(), batch4.clone()];
+        let input_partitions = vec![partition1, partition2];
+
+        // Set up context with memory limit to test hash partitioning with spilling infrastructure
+        let runtime = RuntimeEnvBuilder::default()
+            .with_memory_limit(1, 1.0)
+            .build_arc()?;
+
+        let task_ctx = TaskContext::default().with_runtime(runtime);
+        let task_ctx = Arc::new(task_ctx);
+
+        // Create physical plan with hash partitioning
+        let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?;
+        let exec = Arc::new(exec);
+        let exec = Arc::new(TestMemoryExec::update_cache(&exec));
+        // Hash partition into 2 partitions by column c0
+        let hash_expr = col("c0", &schema)?;
+        let exec =
+            RepartitionExec::try_new(exec, Partitioning::Hash(vec![hash_expr], 2))?;
+
+        // Collect all partitions concurrently using JoinSet - this prevents deadlock
+        // where the distribution channel gate closes when all output channels are full
+        let mut join_set = tokio::task::JoinSet::new();
+        for i in 0..exec.partitioning().partition_count() {
+            let stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            join_set.spawn(async move {
+                let mut count = 0;
+                futures::pin_mut!(stream);
+                while let Some(result) = stream.next().await {
+                    let batch = result?;
+                    count += batch.num_rows();
+                }
+                Ok::<usize, DataFusionError>(count)
+            });
+        }
+
+        // Wait for all partitions and sum the rows
+        let mut total_rows = 0;
+        while let Some(result) = join_set.join_next().await {
+            total_rows += result.unwrap()?;
+        }
+
+        // Verify we got all rows back
+        let all_batches = [batch1, batch2, batch3, batch4];
+        let expected_rows: usize = all_batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, expected_rows);
+
+        // Verify metrics are available
+        let metrics = exec.metrics().unwrap();
+        // Just verify the metrics can be retrieved (spilling may or may not occur)
+        let spill_count = metrics.spill_count().unwrap_or(0);
+        assert!(spill_count > 0);
+        let spilled_bytes = metrics.spilled_bytes().unwrap_or(0);
+        assert!(spilled_bytes > 0);
+        let spilled_rows = metrics.spilled_rows().unwrap_or(0);
+        assert!(spilled_rows > 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_repartition() -> Result<()> {
+        let schema = test_schema();
+        let sort_exprs = sort_exprs(&schema);
+        let source = sorted_memory_exec(&schema, sort_exprs);
+        // output is sorted, but has only a single partition, so no need to sort
+        let exec = RepartitionExec::try_new(source, Partitioning::RoundRobinBatch(10))?
+            .repartitioned(20, &Default::default())?
+            .unwrap();
+
+        // Repartition should not preserve order
+        assert_plan!(exec.as_ref(), @r"
+        RepartitionExec: partitioning=RoundRobinBatch(20), input_partitions=1, maintains_sort_order=true
+          DataSourceExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC
+        ");
         Ok(())
     }
 
@@ -1810,11 +2930,11 @@ mod test {
     }
 
     fn sort_exprs(schema: &Schema) -> LexOrdering {
-        let options = SortOptions::default();
-        LexOrdering::new(vec![PhysicalSortExpr {
+        [PhysicalSortExpr {
             expr: col("c0", schema).unwrap(),
-            options,
-        }])
+            options: SortOptions::default(),
+        }]
+        .into()
     }
 
     fn memory_exec(schema: &SchemaRef) -> Arc<dyn ExecutionPlan> {
@@ -1825,11 +2945,11 @@ mod test {
         schema: &SchemaRef,
         sort_exprs: LexOrdering,
     ) -> Arc<dyn ExecutionPlan> {
-        Arc::new(TestMemoryExec::update_cache(Arc::new(
-            TestMemoryExec::try_new(&[vec![]], Arc::clone(schema), None)
-                .unwrap()
-                .try_with_sort_information(vec![sort_exprs])
-                .unwrap(),
-        )))
+        let exec = TestMemoryExec::try_new(&[vec![]], Arc::clone(schema), None)
+            .unwrap()
+            .try_with_sort_information(vec![sort_exprs])
+            .unwrap();
+        let exec = Arc::new(exec);
+        Arc::new(TestMemoryExec::update_cache(&exec))
     }
 }
diff --git a/datafusion/physical-plan/src/sort_pushdown.rs b/datafusion/physical-plan/src/sort_pushdown.rs
new file mode 100644
index 0000000000000..8432fd5dabee7
--- /dev/null
+++ b/datafusion/physical-plan/src/sort_pushdown.rs
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort pushdown types for physical execution plans.
+//!
+//! This module provides types used for pushing sort ordering requirements
+//! down through the execution plan tree to data sources.
+
+/// Result of attempting to push down sort ordering to a node.
+///
+/// Used by [`ExecutionPlan::try_pushdown_sort`] to communicate
+/// whether and how sort ordering was successfully pushed down.
+///
+/// [`ExecutionPlan::try_pushdown_sort`]: crate::ExecutionPlan::try_pushdown_sort
+#[derive(Debug, Clone)]
+pub enum SortOrderPushdownResult<T> {
+    /// The source can guarantee exact ordering (data is perfectly sorted).
+    ///
+    /// When this is returned, the optimizer can safely remove the Sort operator
+    /// entirely since the data source guarantees the requested ordering.
+    Exact {
+        /// The optimized node that provides exact ordering
+        inner: T,
+    },
+    /// The source has optimized for the ordering but cannot guarantee perfect sorting.
+    ///
+    /// This indicates the data source has been optimized (e.g., reordered files/row groups
+    /// based on statistics, enabled reverse scanning) but the data may not be perfectly
+    /// sorted. The optimizer should keep the Sort operator but benefits from the
+    /// optimization (e.g., faster TopK queries due to early termination).
+    Inexact {
+        /// The optimized node that provides approximate ordering
+        inner: T,
+    },
+    /// The source cannot optimize for this ordering.
+    ///
+    /// The data source does not support the requested sort ordering and no
+    /// optimization was applied.
+    Unsupported,
+}
+
+impl<T> SortOrderPushdownResult<T> {
+    /// Extract the inner value if present
+    pub fn into_inner(self) -> Option<T> {
+        match self {
+            Self::Exact { inner } | Self::Inexact { inner } => Some(inner),
+            Self::Unsupported => None,
+        }
+    }
+
+    /// Map the inner value to a different type while preserving the variant.
+    pub fn map<U, F: FnOnce(T) -> U>(self, f: F) -> SortOrderPushdownResult<U> {
+        match self {
+            Self::Exact { inner } => SortOrderPushdownResult::Exact { inner: f(inner) },
+            Self::Inexact { inner } => {
+                SortOrderPushdownResult::Inexact { inner: f(inner) }
+            }
+            Self::Unsupported => SortOrderPushdownResult::Unsupported,
+        }
+    }
+
+    /// Try to map the inner value, returning an error if the function fails.
+    pub fn try_map<U, E, F: FnOnce(T) -> Result<U, E>>(
+        self,
+        f: F,
+    ) -> Result<SortOrderPushdownResult<U>, E> {
+        match self {
+            Self::Exact { inner } => {
+                Ok(SortOrderPushdownResult::Exact { inner: f(inner)? })
+            }
+            Self::Inexact { inner } => {
+                Ok(SortOrderPushdownResult::Inexact { inner: f(inner)? })
+            }
+            Self::Unsupported => Ok(SortOrderPushdownResult::Unsupported),
+        }
+    }
+
+    /// Convert this result to `Inexact`, downgrading `Exact` if present.
+    ///
+    /// This is useful when an operation (like merging multiple partitions)
+    /// cannot guarantee exact ordering even if the input provides it.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use datafusion_physical_plan::SortOrderPushdownResult;
+    /// let exact = SortOrderPushdownResult::Exact { inner: 42 };
+    /// let inexact = exact.into_inexact();
+    /// assert!(matches!(inexact, SortOrderPushdownResult::Inexact { inner: 42 }));
+    ///
+    /// let already_inexact = SortOrderPushdownResult::Inexact { inner: 42 };
+    /// let still_inexact = already_inexact.into_inexact();
+    /// assert!(matches!(still_inexact, SortOrderPushdownResult::Inexact { inner: 42 }));
+    ///
+    /// let unsupported = SortOrderPushdownResult::<i32>::Unsupported;
+    /// let still_unsupported = unsupported.into_inexact();
+    /// assert!(matches!(still_unsupported, SortOrderPushdownResult::Unsupported));
+    /// ```
+    pub fn into_inexact(self) -> Self {
+        match self {
+            Self::Exact { inner } => Self::Inexact { inner },
+            Self::Inexact { inner } => Self::Inexact { inner },
+            Self::Unsupported => Self::Unsupported,
+        }
+    }
+}
diff --git a/datafusion/physical-plan/src/sorts/builder.rs b/datafusion/physical-plan/src/sorts/builder.rs
index 9b2fa968222c4..a462b832056bd 100644
--- a/datafusion/physical-plan/src/sorts/builder.rs
+++ b/datafusion/physical-plan/src/sorts/builder.rs
@@ -40,9 +40,24 @@ pub struct BatchBuilder {
     /// Maintain a list of [`RecordBatch`] and their corresponding stream
     batches: Vec<(usize, RecordBatch)>,
 
-    /// Accounts for memory used by buffered batches
+    /// Accounts for memory used by buffered batches.
+    ///
+    /// May include pre-reserved bytes (from `sort_spill_reservation_bytes`)
+    /// that were transferred via [`MemoryReservation::take()`] to prevent
+    /// starvation when concurrent sort partitions compete for pool memory.
     reservation: MemoryReservation,
 
+    /// Tracks the actual memory used by buffered batches (not including
+    /// pre-reserved bytes). This allows [`Self::push_batch`] to skip pool
+    /// allocation requests when the pre-reserved bytes cover the batch.
+    batches_mem_used: usize,
+
+    /// The initial reservation size at construction time. When the reservation
+    /// is pre-loaded with `sort_spill_reservation_bytes` (via `take()`), this
+    /// records that amount so we never shrink below it, maintaining the
+    /// anti-starvation guarantee throughout the merge.
+    initial_reservation: usize,
+
     /// The current [`BatchCursor`] for each stream
     cursors: Vec<BatchCursor>,
 
@@ -59,19 +74,26 @@ impl BatchBuilder {
         batch_size: usize,
         reservation: MemoryReservation,
     ) -> Self {
+        let initial_reservation = reservation.size();
         Self {
             schema,
             batches: Vec::with_capacity(stream_count * 2),
             cursors: vec![BatchCursor::default(); stream_count],
             indices: Vec::with_capacity(batch_size),
             reservation,
+            batches_mem_used: 0,
+            initial_reservation,
         }
     }
 
     /// Append a new batch in `stream_idx`
     pub fn push_batch(&mut self, stream_idx: usize, batch: RecordBatch) -> Result<()> {
-        self.reservation
-            .try_grow(get_record_batch_memory_size(&batch))?;
+        let size = get_record_batch_memory_size(&batch);
+        self.batches_mem_used += size;
+        // Only request additional memory from the pool when actual batch
+        // usage exceeds the current reservation (which may include
+        // pre-reserved bytes from sort_spill_reservation_bytes).
+        try_grow_reservation_to_at_least(&mut self.reservation, self.batches_mem_used)?;
         let batch_idx = self.batches.len();
         self.batches.push((stream_idx, batch));
         self.cursors[stream_idx] = BatchCursor {
@@ -143,14 +165,38 @@ impl BatchBuilder {
                 stream_cursor.batch_idx = retained;
                 retained += 1;
             } else {
-                self.reservation.shrink(get_record_batch_memory_size(batch));
+                self.batches_mem_used -= get_record_batch_memory_size(batch);
             }
             retain
         });
 
+        // Release excess memory back to the pool, but never shrink below
+        // initial_reservation to maintain the anti-starvation guarantee
+        // for the merge phase.
+        let target = self.batches_mem_used.max(self.initial_reservation);
+        if self.reservation.size() > target {
+            self.reservation.shrink(self.reservation.size() - target);
+        }
+
         Ok(Some(RecordBatch::try_new(
             Arc::clone(&self.schema),
             columns,
         )?))
     }
 }
+
+/// Try to grow `reservation` so it covers at least `needed` bytes.
+///
+/// When a reservation has been pre-loaded with bytes (e.g. via
+/// [`MemoryReservation::take()`]), this avoids redundant pool
+/// allocations: if the reservation already covers `needed`, this is
+/// a no-op; otherwise only the deficit is requested from the pool.
+pub(crate) fn try_grow_reservation_to_at_least(
+    reservation: &mut MemoryReservation,
+    needed: usize,
+) -> Result<()> {
+    if needed > reservation.size() {
+        reservation.try_grow(needed - reservation.size())?;
+    }
+    Ok(())
+}
diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs
index efb9c0a47bf58..de3ec2e7a91ed 100644
--- a/datafusion/physical-plan/src/sorts/cursor.rs
+++ b/datafusion/physical-plan/src/sorts/cursor.rs
@@ -16,10 +16,11 @@
 // under the License.
 
 use std::cmp::Ordering;
+use std::sync::Arc;
 
 use arrow::array::{
-    types::ByteArrayType, Array, ArrowPrimitiveType, GenericByteArray,
-    GenericByteViewArray, OffsetSizeTrait, PrimitiveArray, StringViewArray,
+    Array, ArrowPrimitiveType, GenericByteArray, GenericByteViewArray, OffsetSizeTrait,
+    PrimitiveArray, StringViewArray, types::ByteArrayType,
 };
 use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer};
 use arrow::compute::SortOptions;
@@ -151,7 +152,7 @@ impl<T: CursorValues> Ord for Cursor<T> {
 /// Used for sorting when there are multiple columns in the sort key
 #[derive(Debug)]
 pub struct RowValues {
-    rows: Rows,
+    rows: Arc<Rows>,
 
     /// Tracks for the memory used by in the `Rows` of this
     /// cursor. Freed on drop
@@ -164,7 +165,7 @@ impl RowValues {
     ///
     /// Panics if the reservation is not for exactly `rows.size()`
     /// bytes or if `rows` is empty.
-    pub fn new(rows: Rows, reservation: MemoryReservation) -> Self {
+    pub fn new(rows: Arc<Rows>, reservation: MemoryReservation) -> Self {
         assert_eq!(
             rows.size(),
             reservation.size(),
@@ -293,14 +294,19 @@ impl CursorValues for StringViewArray {
         self.views().len()
     }
 
+    #[inline(always)]
     fn eq(l: &Self, l_idx: usize, r: &Self, r_idx: usize) -> bool {
         // SAFETY: Both l_idx and r_idx are guaranteed to be within bounds,
         // and any null-checks are handled in the outer layers.
         // Fast path: Compare the lengths before full byte comparison.
-
         let l_view = unsafe { l.views().get_unchecked(l_idx) };
-        let l_len = *l_view as u32;
         let r_view = unsafe { r.views().get_unchecked(r_idx) };
+
+        if l.data_buffers().is_empty() && r.data_buffers().is_empty() {
+            return l_view == r_view;
+        }
+
+        let l_len = *l_view as u32;
         let r_len = *r_view as u32;
         if l_len != r_len {
             return false;
@@ -309,14 +315,20 @@ impl CursorValues for StringViewArray {
         unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx).is_eq() }
     }
 
+    #[inline(always)]
     fn eq_to_previous(cursor: &Self, idx: usize) -> bool {
         // SAFETY: The caller guarantees that idx > 0 and the indices are valid.
         // Already checked it in is_eq_to_prev_one function
         // Fast path: Compare the lengths of the current and previous views.
         let l_view = unsafe { cursor.views().get_unchecked(idx) };
-        let l_len = *l_view as u32;
         let r_view = unsafe { cursor.views().get_unchecked(idx - 1) };
+        if cursor.data_buffers().is_empty() {
+            return l_view == r_view;
+        }
+
+        let l_len = *l_view as u32;
         let r_len = *r_view as u32;
+
         if l_len != r_len {
             return false;
         }
@@ -326,10 +338,18 @@ impl CursorValues for StringViewArray {
         }
     }
 
+    #[inline(always)]
     fn compare(l: &Self, l_idx: usize, r: &Self, r_idx: usize) -> Ordering {
         // SAFETY: Prior assertions guarantee that l_idx and r_idx are valid indices.
         // Null-checks are assumed to have been handled in the wrapper (e.g., ArrayValues).
         // And the bound is checked in is_finished, it is safe to call get_unchecked
+        if l.data_buffers().is_empty() && r.data_buffers().is_empty() {
+            let l_view = unsafe { l.views().get_unchecked(l_idx) };
+            let r_view = unsafe { r.views().get_unchecked(r_idx) };
+            return StringViewArray::inline_key_fast(*l_view)
+                .cmp(&StringViewArray::inline_key_fast(*r_view));
+        }
+
         unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) }
     }
 }
@@ -422,11 +442,10 @@ impl<T: CursorValues> CursorValues for ArrayValues<T> {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use datafusion_execution::memory_pool::{
         GreedyMemoryPool, MemoryConsumer, MemoryPool,
     };
+    use std::sync::Arc;
 
     use super::*;
 
diff --git a/datafusion/physical-plan/src/sorts/merge.rs b/datafusion/physical-plan/src/sorts/merge.rs
index 2b42457635f7b..272816251daf9 100644
--- a/datafusion/physical-plan/src/sorts/merge.rs
+++ b/datafusion/physical-plan/src/sorts/merge.rs
@@ -18,16 +18,15 @@
 //! Merge that deals with an arbitrary size of streaming inputs.
 //! This is an order-preserving merge.
 
-use std::collections::VecDeque;
 use std::pin::Pin;
 use std::sync::Arc;
-use std::task::{ready, Context, Poll};
+use std::task::{Context, Poll, ready};
 
+use crate::RecordBatchStream;
 use crate::metrics::BaselineMetrics;
 use crate::sorts::builder::BatchBuilder;
 use crate::sorts::cursor::{Cursor, CursorValues};
 use crate::sorts::stream::PartitionedStream;
-use crate::RecordBatchStream;
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
@@ -50,8 +49,9 @@ pub(crate) struct SortPreservingMergeStream<C: CursorValues> {
     /// used to record execution metrics
     metrics: BaselineMetrics,
 
-    /// If the stream has encountered an error
-    aborted: bool,
+    /// If the stream has encountered an error or reaches the
+    /// `fetch` limit.
+    done: bool,
 
     /// A loser tree that always produces the minimum cursor
     ///
@@ -143,11 +143,8 @@ pub(crate) struct SortPreservingMergeStream<C: CursorValues> {
     /// number of rows produced
     produced: usize,
 
-    /// This queue contains partition indices in order. When a partition is polled and returns `Poll::Ready`,
-    /// it is removed from the vector. If a partition returns `Poll::Pending`, it is moved to the end of the
-    /// vector to ensure the next iteration starts with a different partition, preventing the same partition
-    /// from being continuously polled.
-    uninitiated_partitions: VecDeque<usize>,
+    /// This vector contains the indices of the partitions that have not started emitting yet.
+    uninitiated_partitions: Vec<usize>,
 }
 
 impl<C: CursorValues> SortPreservingMergeStream<C> {
@@ -166,7 +163,7 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
             in_progress: BatchBuilder::new(schema, stream_count, batch_size, reservation),
             streams,
             metrics,
-            aborted: false,
+            done: false,
             cursors: (0..stream_count).map(|_| None).collect(),
             prev_cursors: (0..stream_count).map(|_| None).collect(),
             round_robin_tie_breaker_mode: false,
@@ -210,42 +207,56 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
         &mut self,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Result<RecordBatch>>> {
-        if self.aborted {
+        if self.done {
             return Poll::Ready(None);
         }
         // Once all partitions have set their corresponding cursors for the loser tree,
         // we skip the following block. Until then, this function may be called multiple
         // times and can return Poll::Pending if any partition returns Poll::Pending.
+
         if self.loser_tree.is_empty() {
-            while let Some(&partition_idx) = self.uninitiated_partitions.front() {
+            // Manual indexing since we're iterating over the vector and shrinking it in the loop
+            let mut idx = 0;
+            while idx < self.uninitiated_partitions.len() {
+                let partition_idx = self.uninitiated_partitions[idx];
                 match self.maybe_poll_stream(cx, partition_idx) {
                     Poll::Ready(Err(e)) => {
-                        self.aborted = true;
+                        self.done = true;
                         return Poll::Ready(Some(Err(e)));
                     }
                     Poll::Pending => {
-                        // If a partition returns Poll::Pending, to avoid continuously polling it
-                        // and potentially increasing upstream buffer sizes, we move it to the
-                        // back of the polling queue.
-                        self.uninitiated_partitions.rotate_left(1);
-
-                        // This function could remain in a pending state, so we manually wake it here.
-                        // However, this approach can be investigated further to find a more natural way
-                        // to avoid disrupting the runtime scheduler.
-                        cx.waker().wake_by_ref();
-                        return Poll::Pending;
+                        // The polled stream is pending which means we're already set up to
+                        // be woken when necessary
+                        // Try the next stream
+                        idx += 1;
                     }
                     _ => {
-                        // If the polling result is Poll::Ready(Some(batch)) or Poll::Ready(None),
-                        // we remove this partition from the queue so it is not polled again.
-                        self.uninitiated_partitions.pop_front();
+                        // The polled stream is ready
+                        // Remove it from uninitiated_partitions
+                        // Don't bump idx here, since a new element will have taken its
+                        // place which we'll try in the next loop iteration
+                        // swap_remove will change the partition poll order, but that shouldn't
+                        // make a difference since we're waiting for all streams to be ready.
+                        self.uninitiated_partitions.swap_remove(idx);
                     }
                 }
             }
 
-            // Claim the memory for the uninitiated partitions
-            self.uninitiated_partitions.shrink_to_fit();
-            self.init_loser_tree();
+            if self.uninitiated_partitions.is_empty() {
+                // If there are no more uninitiated partitions, set up the loser tree and continue
+                // to the next phase.
+
+                // Claim the memory for the uninitiated partitions
+                self.uninitiated_partitions.shrink_to_fit();
+                self.init_loser_tree();
+            } else {
+                // There are still uninitiated partitions so return pending.
+                // We only get here if we've polled all uninitiated streams and at least one of them
+                // returned pending itself. That means we will be woken as soon as one of the
+                // streams would like to be polled again.
+                // There is no need to reschedule ourselves eagerly.
+                return Poll::Pending;
+            }
         }
 
         // NB timer records time taken on drop, so there are no
@@ -258,7 +269,7 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
             if !self.loser_tree_adjusted {
                 let winner = self.loser_tree[0];
                 if let Err(e) = ready!(self.maybe_poll_stream(cx, winner)) {
-                    self.aborted = true;
+                    self.done = true;
                     return Poll::Ready(Some(Err(e)));
                 }
                 self.update_loser_tree();
@@ -271,7 +282,7 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
 
                 // stop sorting if fetch has been reached
                 if self.fetch_reached() {
-                    self.aborted = true;
+                    self.done = true;
                 } else if self.in_progress.len() < self.batch_size {
                     continue;
                 }
@@ -379,7 +390,6 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
     ///
     /// Zooming in at node 2 in the loser tree as an example, we can see that
     /// it takes as input the next item at (S0) and the loser of (S3, S4).
-    ///
     #[inline]
     fn lt_leaf_node_index(&self, cursor_index: usize) -> usize {
         (self.cursors.len() + cursor_index) / 2
@@ -483,13 +493,12 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
             if self.enable_round_robin_tie_breaker && cmp_node == 1 {
                 match (&self.cursors[winner], &self.cursors[challenger]) {
                     (Some(ac), Some(bc)) => {
-                        let ord = ac.cmp(bc);
-                        if ord.is_eq() {
+                        if ac == bc {
                             self.handle_tie(cmp_node, &mut winner, challenger);
                         } else {
                             // Ends of tie breaker
                             self.round_robin_tie_breaker_mode = false;
-                            if ord.is_gt() {
+                            if ac > bc {
                                 self.update_winner(cmp_node, &mut winner, challenger);
                             }
                         }
diff --git a/datafusion/physical-plan/src/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs
index c7ffae4061c0e..a73872a175b9b 100644
--- a/datafusion/physical-plan/src/sorts/mod.rs
+++ b/datafusion/physical-plan/src/sorts/mod.rs
@@ -20,8 +20,11 @@
 mod builder;
 mod cursor;
 mod merge;
+mod multi_level_merge;
 pub mod partial_sort;
 pub mod sort;
 pub mod sort_preserving_merge;
 mod stream;
 pub mod streaming_merge;
+
+pub(crate) use stream::IncrementalSortIterator;
diff --git a/datafusion/physical-plan/src/sorts/multi_level_merge.rs b/datafusion/physical-plan/src/sorts/multi_level_merge.rs
new file mode 100644
index 0000000000000..8985e1d8c70ee
--- /dev/null
+++ b/datafusion/physical-plan/src/sorts/multi_level_merge.rs
@@ -0,0 +1,483 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Create a stream that do a multi level merge stream
+
+use crate::metrics::BaselineMetrics;
+use crate::{EmptyRecordBatchStream, SpillManager};
+use arrow::array::RecordBatch;
+use std::fmt::{Debug, Formatter};
+use std::mem;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use arrow::datatypes::SchemaRef;
+use datafusion_common::Result;
+use datafusion_execution::memory_pool::MemoryReservation;
+
+use crate::sorts::builder::try_grow_reservation_to_at_least;
+use crate::sorts::sort::get_reserved_bytes_for_record_batch_size;
+use crate::sorts::streaming_merge::{SortedSpillFile, StreamingMergeBuilder};
+use crate::stream::RecordBatchStreamAdapter;
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
+use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use futures::TryStreamExt;
+use futures::{Stream, StreamExt};
+
+/// Merges a stream of sorted cursors and record batches into a single sorted stream
+///
+/// This is a wrapper around [`SortPreservingMergeStream`](crate::sorts::merge::SortPreservingMergeStream)
+/// that provide it the sorted streams/files to merge while making sure we can merge them in memory.
+/// In case we can't merge all of them in a single pass we will spill the intermediate results to disk
+/// and repeat the process.
+///
+/// ## High level Algorithm
+/// 1. Get the maximum amount of sorted in-memory streams and spill files we can merge with the available memory
+/// 2. Sort them to a sorted stream
+/// 3. Do we have more spill files to merge?
+///  - Yes: write that sorted stream to a spill file,
+///    add that spill file back to the spill files to merge and
+///    repeat the process
+///
+///  - No: return that sorted stream as the final output stream
+///
+/// ```text
+/// Initial State: Multiple sorted streams + spill files
+///      ┌───────────┐
+///      │  Phase 1  │
+///      └───────────┘
+/// ┌──Can hold in memory─┐
+/// │   ┌──────────────┐  │
+/// │   │  In-memory   │
+/// │   │sorted stream │──┼────────┐
+/// │   │      1       │  │        │
+///     └──────────────┘  │        │
+/// │   ┌──────────────┐  │        │
+/// │   │  In-memory   │           │
+/// │   │sorted stream │──┼────────┤
+/// │   │      2       │  │        │
+///     └──────────────┘  │        │
+/// │   ┌──────────────┐  │        │
+/// │   │  In-memory   │           │
+/// │   │sorted stream │──┼────────┤
+/// │   │      3       │  │        │
+///     └──────────────┘  │        │
+/// │   ┌──────────────┐  │        │            ┌───────────┐
+/// │   │ Sorted Spill │           │            │  Phase 2  │
+/// │   │    file 1    │──┼────────┤            └───────────┘
+/// │   └──────────────┘  │        │
+///  ──── ──── ──── ──── ─┘        │       ┌──Can hold in memory─┐
+///                                │       │                     │
+///     ┌──────────────┐           │       │   ┌──────────────┐
+///     │ Sorted Spill │           │       │   │ Sorted Spill │  │
+///     │    file 2    │──────────────────────▶│    file 2    │──┼─────┐
+///     └──────────────┘           │           └──────────────┘  │     │
+///     ┌──────────────┐           │       │   ┌──────────────┐  │     │
+///     │ Sorted Spill │           │       │   │ Sorted Spill │        │
+///     │    file 3    │──────────────────────▶│    file 3    │──┼─────┤
+///     └──────────────┘           │       │   └──────────────┘  │     │
+///     ┌──────────────┐           │           ┌──────────────┐  │     │
+///     │ Sorted Spill │           │       │   │ Sorted Spill │  │     │
+///     │    file 4    │──────────────────────▶│    file 4    │────────┤          ┌───────────┐
+///     └──────────────┘           │       │   └──────────────┘  │     │          │  Phase 3  │
+///                                │       │                     │     │          └───────────┘
+///                                │        ──── ──── ──── ──── ─┘     │     ┌──Can hold in memory─┐
+///                                │                                   │     │                     │
+///     ┌──────────────┐           │           ┌──────────────┐        │     │  ┌──────────────┐
+///     │ Sorted Spill │           │           │ Sorted Spill │        │     │  │ Sorted Spill │   │
+///     │    file 5    │──────────────────────▶│    file 5    │────────────────▶│    file 5    │───┼───┐
+///     └──────────────┘           │           └──────────────┘        │     │  └──────────────┘   │   │
+///                                │                                   │     │                     │   │
+///                                │           ┌──────────────┐        │     │  ┌──────────────┐       │
+///                                │           │ Sorted Spill │        │     │  │ Sorted Spill │   │   │       ┌── ─── ─── ─── ─── ─── ─── ──┐
+///                                └──────────▶│    file 6    │────────────────▶│    file 6    │───┼───┼──────▶         Output Stream
+///                                            └──────────────┘        │     │  └──────────────┘   │   │       └── ─── ─── ─── ─── ─── ─── ──┘
+///                                                                    │     │                     │   │
+///                                                                    │     │  ┌──────────────┐       │
+///                                                                    │     │  │ Sorted Spill │   │   │
+///                                                                    └───────▶│    file 7    │───┼───┘
+///                                                                          │  └──────────────┘   │
+///                                                                          │                     │
+///                                                                          └─ ──── ──── ──── ────
+/// ```
+///
+/// ## Memory Management Strategy
+///
+/// This multi-level merge make sure that we can handle any amount of data to sort as long as
+/// we have enough memory to merge at least 2 streams at a time.
+///
+/// 1. **Worst-Case Memory Reservation**: Reserves memory based on the largest
+///    batch size encountered in each spill file to merge, ensuring sufficient memory is always
+///    available during merge operations.
+/// 2. **Adaptive Buffer Sizing**: Reduces buffer sizes when memory is constrained
+/// 3. **Spill-to-Disk**: Spill to disk when we cannot merge all files in memory
+pub(crate) struct MultiLevelMergeBuilder {
+    spill_manager: SpillManager,
+    schema: SchemaRef,
+    sorted_spill_files: Vec<SortedSpillFile>,
+    sorted_streams: Vec<SendableRecordBatchStream>,
+    expr: LexOrdering,
+    metrics: BaselineMetrics,
+    batch_size: usize,
+    reservation: MemoryReservation,
+    fetch: Option<usize>,
+    enable_round_robin_tie_breaker: bool,
+}
+
+impl Debug for MultiLevelMergeBuilder {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "MultiLevelMergeBuilder")
+    }
+}
+
+impl MultiLevelMergeBuilder {
+    #[expect(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        spill_manager: SpillManager,
+        schema: SchemaRef,
+        sorted_spill_files: Vec<SortedSpillFile>,
+        sorted_streams: Vec<SendableRecordBatchStream>,
+        expr: LexOrdering,
+        metrics: BaselineMetrics,
+        batch_size: usize,
+        reservation: MemoryReservation,
+        fetch: Option<usize>,
+        enable_round_robin_tie_breaker: bool,
+    ) -> Self {
+        Self {
+            spill_manager,
+            schema,
+            sorted_spill_files,
+            sorted_streams,
+            expr,
+            metrics,
+            batch_size,
+            reservation,
+            enable_round_robin_tie_breaker,
+            fetch,
+        }
+    }
+
+    pub(crate) fn create_spillable_merge_stream(self) -> SendableRecordBatchStream {
+        Box::pin(RecordBatchStreamAdapter::new(
+            Arc::clone(&self.schema),
+            futures::stream::once(self.create_stream()).try_flatten(),
+        ))
+    }
+
+    async fn create_stream(mut self) -> Result<SendableRecordBatchStream> {
+        loop {
+            let mut stream = self.merge_sorted_runs_within_mem_limit()?;
+
+            // TODO - add a threshold for number of files to disk even if empty and reading from disk so
+            //        we can avoid the memory reservation
+
+            // If no spill files are left, we can return the stream as this is the last sorted run
+            // TODO - We can write to disk before reading it back to avoid having multiple streams in memory
+            if self.sorted_spill_files.is_empty() {
+                assert!(
+                    self.sorted_streams.is_empty(),
+                    "We should not have any sorted streams left"
+                );
+
+                return Ok(stream);
+            }
+
+            // Need to sort to a spill file
+            let Some((spill_file, max_record_batch_memory)) = self
+                .spill_manager
+                .spill_record_batch_stream_and_return_max_batch_memory(
+                    &mut stream,
+                    "MultiLevelMergeBuilder intermediate spill",
+                )
+                .await?
+            else {
+                continue;
+            };
+
+            // Add the spill file
+            self.sorted_spill_files.push(SortedSpillFile {
+                file: spill_file,
+                max_record_batch_memory,
+            });
+        }
+    }
+
+    /// This tries to create a stream that merges the most sorted streams and sorted spill files
+    /// as possible within the memory limit.
+    fn merge_sorted_runs_within_mem_limit(
+        &mut self,
+    ) -> Result<SendableRecordBatchStream> {
+        match (self.sorted_spill_files.len(), self.sorted_streams.len()) {
+            // No data so empty batch
+            (0, 0) => Ok(Box::pin(EmptyRecordBatchStream::new(Arc::clone(
+                &self.schema,
+            )))),
+
+            // Only in-memory stream, return that
+            (0, 1) => Ok(self.sorted_streams.remove(0)),
+
+            // Only single sorted spill file so return it
+            (1, 0) => {
+                let spill_file = self.sorted_spill_files.remove(0);
+
+                // Not reserving any memory for this disk as we are not holding it in memory
+                self.spill_manager
+                    .read_spill_as_stream(spill_file.file, None)
+            }
+
+            // Only in memory streams, so merge them all in a single pass
+            (0, _) => {
+                let sorted_stream = mem::take(&mut self.sorted_streams);
+                self.create_new_merge_sort(
+                    sorted_stream,
+                    // If we have no sorted spill files left, this is the last run
+                    true,
+                    true,
+                )
+            }
+
+            // Need to merge multiple streams
+            (_, _) => {
+                // Transfer any pre-reserved bytes (from sort_spill_reservation_bytes)
+                // to the merge memory reservation. This prevents starvation when
+                // concurrent sort partitions compete for pool memory: the pre-reserved
+                // bytes cover spill file buffer reservations without additional pool
+                // allocation.
+                let mut memory_reservation = self.reservation.take();
+
+                // Don't account for existing streams memory
+                // as we are not holding the memory for them
+                let mut sorted_streams = mem::take(&mut self.sorted_streams);
+
+                let (sorted_spill_files, buffer_size) = self
+                    .get_sorted_spill_files_to_merge(
+                        2,
+                        // we must have at least 2 streams to merge
+                        2_usize.saturating_sub(sorted_streams.len()),
+                        &mut memory_reservation,
+                    )?;
+
+                let is_only_merging_memory_streams = sorted_spill_files.is_empty();
+
+                // If no spill files were selected (e.g. all too large for
+                // available memory but enough in-memory streams exist),
+                // return the pre-reserved bytes to self.reservation so
+                // create_new_merge_sort can transfer them to the merge
+                // stream's BatchBuilder.
+                if is_only_merging_memory_streams {
+                    mem::swap(&mut self.reservation, &mut memory_reservation);
+                }
+
+                for spill in sorted_spill_files {
+                    let stream = self
+                        .spill_manager
+                        .clone()
+                        .with_batch_read_buffer_capacity(buffer_size)
+                        .read_spill_as_stream(
+                            spill.file,
+                            Some(spill.max_record_batch_memory),
+                        )?;
+                    sorted_streams.push(stream);
+                }
+                let merge_sort_stream = self.create_new_merge_sort(
+                    sorted_streams,
+                    // If we have no sorted spill files left, this is the last run
+                    self.sorted_spill_files.is_empty(),
+                    is_only_merging_memory_streams,
+                )?;
+
+                // If we're only merging memory streams, we don't need to attach the memory reservation
+                // as it's empty
+                if is_only_merging_memory_streams {
+                    assert_eq!(
+                        memory_reservation.size(),
+                        0,
+                        "when only merging memory streams, we should not have any memory reservation and let the merge sort handle the memory"
+                    );
+
+                    Ok(merge_sort_stream)
+                } else {
+                    // Attach the memory reservation to the stream to make sure we have enough memory
+                    // throughout the merge process as we bypassed the memory pool for the merge sort stream
+                    Ok(Box::pin(StreamAttachedReservation::new(
+                        merge_sort_stream,
+                        memory_reservation,
+                    )))
+                }
+            }
+        }
+    }
+
+    fn create_new_merge_sort(
+        &mut self,
+        streams: Vec<SendableRecordBatchStream>,
+        is_output: bool,
+        all_in_memory: bool,
+    ) -> Result<SendableRecordBatchStream> {
+        let mut builder = StreamingMergeBuilder::new()
+            .with_schema(Arc::clone(&self.schema))
+            .with_expressions(&self.expr)
+            .with_batch_size(self.batch_size)
+            .with_fetch(self.fetch)
+            .with_metrics(if is_output {
+                // Only add the metrics to the last run
+                self.metrics.clone()
+            } else {
+                self.metrics.intermediate()
+            })
+            .with_round_robin_tie_breaker(self.enable_round_robin_tie_breaker)
+            .with_streams(streams);
+
+        if !all_in_memory {
+            // Don't track memory used by this stream as we reserve that memory by worst case sceneries
+            // (reserving memory for the biggest batch in each stream)
+            // TODO - avoid this hack as this can be broken easily when `SortPreservingMergeStream`
+            //        changes the implementation to use more/less memory
+            builder = builder.with_bypass_mempool();
+        } else {
+            // If we are only merging in-memory streams, we need to use the memory reservation
+            // because we don't know the maximum size of the batches in the streams.
+            // Use take() to transfer any pre-reserved bytes so the merge can use them
+            // as its initial budget without additional pool allocation.
+            builder = builder.with_reservation(self.reservation.take());
+        }
+
+        builder.build()
+    }
+
+    /// Return the sorted spill files to use for the next phase, and the buffer size
+    /// This will try to get as many spill files as possible to merge, and if we don't have enough streams
+    /// it will try to reduce the buffer size until we have enough streams to merge
+    /// otherwise it will return an error
+    fn get_sorted_spill_files_to_merge(
+        &mut self,
+        buffer_len: usize,
+        minimum_number_of_required_streams: usize,
+        reservation: &mut MemoryReservation,
+    ) -> Result<(Vec<SortedSpillFile>, usize)> {
+        assert_ne!(buffer_len, 0, "Buffer length must be greater than 0");
+        let mut number_of_spills_to_read_for_current_phase = 0;
+        // Track total memory needed for spill file buffers. When the
+        // reservation has pre-reserved bytes (from sort_spill_reservation_bytes),
+        // those bytes cover the first N spill files without additional pool
+        // allocation, preventing starvation under memory pressure.
+        let mut total_needed: usize = 0;
+
+        for spill in &self.sorted_spill_files {
+            let per_spill = get_reserved_bytes_for_record_batch_size(
+                spill.max_record_batch_memory,
+                // Size will be the same as the sliced size, bc it is a spilled batch.
+                spill.max_record_batch_memory,
+            ) * buffer_len;
+            total_needed += per_spill;
+
+            // For memory pools that are not shared this is good, for other
+            // this is not and there should be some upper limit to memory
+            // reservation so we won't starve the system.
+            match try_grow_reservation_to_at_least(reservation, total_needed) {
+                Ok(_) => {
+                    number_of_spills_to_read_for_current_phase += 1;
+                }
+                // If we can't grow the reservation, we need to stop
+                Err(err) => {
+                    // We must have at least 2 streams to merge, so if we don't have enough memory
+                    // fail
+                    if minimum_number_of_required_streams
+                        > number_of_spills_to_read_for_current_phase
+                    {
+                        // Free the memory we reserved for this merge as we either try again or fail
+                        reservation.free();
+                        if buffer_len > 1 {
+                            // Try again with smaller buffer size, it will be slower but at least we can merge
+                            return self.get_sorted_spill_files_to_merge(
+                                buffer_len - 1,
+                                minimum_number_of_required_streams,
+                                reservation,
+                            );
+                        }
+
+                        return Err(err);
+                    }
+
+                    // We reached the maximum amount of memory we can use
+                    // for this merge
+                    break;
+                }
+            }
+        }
+
+        let spills = self
+            .sorted_spill_files
+            .drain(..number_of_spills_to_read_for_current_phase)
+            .collect::<Vec<_>>();
+
+        Ok((spills, buffer_len))
+    }
+}
+
+struct StreamAttachedReservation {
+    stream: SendableRecordBatchStream,
+    reservation: MemoryReservation,
+}
+
+impl StreamAttachedReservation {
+    fn new(stream: SendableRecordBatchStream, reservation: MemoryReservation) -> Self {
+        Self {
+            stream,
+            reservation,
+        }
+    }
+}
+
+impl Stream for StreamAttachedReservation {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let res = self.stream.poll_next_unpin(cx);
+
+        match res {
+            Poll::Ready(res) => {
+                match res {
+                    Some(Ok(batch)) => Poll::Ready(Some(Ok(batch))),
+                    Some(Err(err)) => {
+                        // Had an error so drop the data
+                        self.reservation.free();
+                        Poll::Ready(Some(Err(err)))
+                    }
+                    None => {
+                        // Stream is done so free the memory
+                        self.reservation.free();
+
+                        Poll::Ready(None)
+                    }
+                }
+            }
+            Poll::Pending => Poll::Pending,
+        }
+    }
+}
+
+impl RecordBatchStream for StreamAttachedReservation {
+    fn schema(&self) -> SchemaRef {
+        self.stream.schema()
+    }
+}
diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs
index 78f898a2d77a2..127998601fba8 100644
--- a/datafusion/physical-plan/src/sorts/partial_sort.rs
+++ b/datafusion/physical-plan/src/sorts/partial_sort.rs
@@ -32,7 +32,7 @@
 //! | 0 | 1 | 1 |
 //! | 0 | 2 | 0 |
 //! +---+---+---+
-//!```
+//! ```
 //!
 //! and required ordering for the plan is `a ASC, b ASC, d ASC`.
 //! The first 3 rows(segment) can be sorted as the segment already
@@ -46,7 +46,7 @@
 //! +---+---+---+
 //! | 0 | 2 | 4 |
 //! +---+---+---+
-//!```
+//! ```
 //!
 //! The plan concats incoming data with such last rows of previous input
 //! and continues partial sorting of the segments.
@@ -62,17 +62,19 @@ use crate::sorts::sort::sort_batch;
 use crate::{
     DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
     Partitioning, PlanProperties, SendableRecordBatchStream, Statistics,
+    check_if_same_properties,
 };
 
 use arrow::compute::concat_batches;
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::utils::evaluate_partition_ranges;
 use datafusion_common::Result;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::utils::evaluate_partition_ranges;
 use datafusion_execution::{RecordBatchStream, TaskContext};
-use datafusion_physical_expr::LexOrdering;
+use datafusion_physical_expr::{LexOrdering, PhysicalExpr};
 
-use futures::{ready, Stream, StreamExt};
+use futures::{Stream, StreamExt, ready};
 use log::trace;
 
 /// Partial Sort execution plan.
@@ -93,7 +95,7 @@ pub struct PartialSortExec {
     /// Fetch highest/lowest n results
     fetch: Option<usize>,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl PartialSortExec {
@@ -105,7 +107,8 @@ impl PartialSortExec {
     ) -> Self {
         debug_assert!(common_prefix_length > 0);
         let preserve_partitioning = false;
-        let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning);
+        let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning)
+            .unwrap();
         Self {
             input,
             expr,
@@ -113,7 +116,7 @@ impl PartialSortExec {
             metrics_set: ExecutionPlanMetricsSet::new(),
             preserve_partitioning,
             fetch: None,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -131,12 +134,8 @@ impl PartialSortExec {
     /// input partitions producing a single, sorted partition.
     pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self {
         self.preserve_partitioning = preserve_partitioning;
-        self.cache = self
-            .cache
-            .with_partitioning(Self::output_partitioning_helper(
-                &self.input,
-                self.preserve_partitioning,
-            ));
+        Arc::make_mut(&mut self.cache).partitioning =
+            Self::output_partitioning_helper(&self.input, self.preserve_partitioning);
         self
     }
 
@@ -159,7 +158,7 @@ impl PartialSortExec {
 
     /// Sort expressions
     pub fn expr(&self) -> &LexOrdering {
-        self.expr.as_ref()
+        &self.expr
     }
 
     /// If `Some(fetch)`, limits output to only the first "fetch" items
@@ -189,24 +188,33 @@ impl PartialSortExec {
         input: &Arc<dyn ExecutionPlan>,
         sort_exprs: LexOrdering,
         preserve_partitioning: bool,
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Calculate equivalence properties; i.e. reset the ordering equivalence
         // class with the new ordering:
-        let eq_properties = input
-            .equivalence_properties()
-            .clone()
-            .with_reorder(sort_exprs);
+        let mut eq_properties = input.equivalence_properties().clone();
+        eq_properties.reorder(sort_exprs)?;
 
         // Get output partitioning:
         let output_partitioning =
             Self::output_partitioning_helper(input, preserve_partitioning);
 
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             input.pipeline_behavior(),
             input.boundedness(),
-        )
+        ))
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics_set: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
     }
 }
 
@@ -221,9 +229,17 @@ impl DisplayAs for PartialSortExec {
                 let common_prefix_length = self.common_prefix_length;
                 match self.fetch {
                     Some(fetch) => {
-                        write!(f, "PartialSortExec: TopK(fetch={fetch}), expr=[{}], common_prefix_length=[{common_prefix_length}]", self.expr)
+                        write!(
+                            f,
+                            "PartialSortExec: TopK(fetch={fetch}), expr=[{}], common_prefix_length=[{common_prefix_length}]",
+                            self.expr
+                        )
                     }
-                    None => write!(f, "PartialSortExec: expr=[{}], common_prefix_length=[{common_prefix_length}]", self.expr),
+                    None => write!(
+                        f,
+                        "PartialSortExec: expr=[{}], common_prefix_length=[{common_prefix_length}]",
+                        self.expr
+                    ),
                 }
             }
             DisplayFormatType::TreeRender => match self.fetch {
@@ -248,7 +264,7 @@ impl ExecutionPlan for PartialSortExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -272,10 +288,22 @@ impl ExecutionPlan for PartialSortExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        let mut tnr = TreeNodeRecursion::Continue;
+        for sort_expr in &self.expr {
+            tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         let new_partial_sort = PartialSortExec::new(
             self.expr.clone(),
             Arc::clone(&children[0]),
@@ -292,7 +320,12 @@ impl ExecutionPlan for PartialSortExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start PartialSortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start PartialSortExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
 
         let input = self.input.execute(partition, Arc::clone(&context))?;
 
@@ -306,7 +339,7 @@ impl ExecutionPlan for PartialSortExec {
             input,
             expr: self.expr.clone(),
             common_prefix_length: self.common_prefix_length,
-            in_mem_batches: vec![],
+            in_mem_batch: RecordBatch::new_empty(Arc::clone(&self.schema())),
             fetch: self.fetch,
             is_closed: false,
             baseline_metrics: BaselineMetrics::new(&self.metrics_set, partition),
@@ -317,11 +350,7 @@ impl ExecutionPlan for PartialSortExec {
         Some(self.metrics_set.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         self.input.partition_statistics(partition)
     }
 }
@@ -335,7 +364,7 @@ struct PartialSortStream {
     /// should be more than 0 otherwise PartialSort is not applicable
     common_prefix_length: usize,
     /// Used as a buffer for part of the input not ready for sort
-    in_mem_batches: Vec<RecordBatch>,
+    in_mem_batch: RecordBatch,
     /// Fetch top N results
     fetch: Option<usize>,
     /// Whether the stream has finished returning all of its data or not
@@ -376,52 +405,62 @@ impl PartialSortStream {
             return Poll::Ready(None);
         }
         loop {
-            return Poll::Ready(match ready!(self.input.poll_next_unpin(cx)) {
+            // Check if we've already reached the fetch limit
+            if self.fetch == Some(0) {
+                self.is_closed = true;
+                return Poll::Ready(None);
+            }
+
+            match ready!(self.input.poll_next_unpin(cx)) {
                 Some(Ok(batch)) => {
-                    if let Some(slice_point) =
-                        self.get_slice_point(self.common_prefix_length, &batch)?
+                    // Merge new batch into in_mem_batch
+                    self.in_mem_batch = concat_batches(
+                        &self.schema(),
+                        &[self.in_mem_batch.clone(), batch],
+                    )?;
+
+                    // Check if we have a slice point, otherwise keep accumulating in `self.in_mem_batch`.
+                    if let Some(slice_point) = self
+                        .get_slice_point(self.common_prefix_length, &self.in_mem_batch)?
                     {
-                        self.in_mem_batches.push(batch.slice(0, slice_point));
-                        let remaining_batch =
-                            batch.slice(slice_point, batch.num_rows() - slice_point);
-                        // Extract the sorted batch
-                        let sorted_batch = self.sort_in_mem_batches();
-                        // Refill with the remaining batch
-                        self.in_mem_batches.push(remaining_batch);
-
-                        debug_assert!(sorted_batch
-                            .as_ref()
-                            .map(|batch| batch.num_rows() > 0)
-                            .unwrap_or(true));
-                        Some(sorted_batch)
-                    } else {
-                        self.in_mem_batches.push(batch);
-                        continue;
+                        let sorted = self.in_mem_batch.slice(0, slice_point);
+                        self.in_mem_batch = self.in_mem_batch.slice(
+                            slice_point,
+                            self.in_mem_batch.num_rows() - slice_point,
+                        );
+                        let sorted_batch = sort_batch(&sorted, &self.expr, self.fetch)?;
+                        if let Some(fetch) = self.fetch.as_mut() {
+                            *fetch -= sorted_batch.num_rows();
+                        }
+
+                        if sorted_batch.num_rows() > 0 {
+                            return Poll::Ready(Some(Ok(sorted_batch)));
+                        }
                     }
                 }
-                Some(Err(e)) => Some(Err(e)),
+                Some(Err(e)) => return Poll::Ready(Some(Err(e))),
                 None => {
                     self.is_closed = true;
-                    // once input is consumed, sort the rest of the inserted batches
-                    let remaining_batch = self.sort_in_mem_batches()?;
-                    if remaining_batch.num_rows() > 0 {
-                        Some(Ok(remaining_batch))
+                    // Once input is consumed, sort the rest of the inserted batches
+                    let remaining_batch = self.sort_in_mem_batch()?;
+                    return if remaining_batch.num_rows() > 0 {
+                        Poll::Ready(Some(Ok(remaining_batch)))
                     } else {
-                        None
-                    }
+                        Poll::Ready(None)
+                    };
                 }
-            });
+            };
         }
     }
 
     /// Returns a sorted RecordBatch from in_mem_batches and clears in_mem_batches
     ///
-    /// If fetch is specified for PartialSortStream `sort_in_mem_batches` will limit
+    /// If fetch is specified for PartialSortStream `sort_in_mem_batch` will limit
     /// the last RecordBatch returned and will mark the stream as closed
-    fn sort_in_mem_batches(self: &mut Pin<&mut Self>) -> Result<RecordBatch> {
-        let input_batch = concat_batches(&self.schema(), &self.in_mem_batches)?;
-        self.in_mem_batches.clear();
-        let result = sort_batch(&input_batch, self.expr.as_ref(), self.fetch)?;
+    fn sort_in_mem_batch(self: &mut Pin<&mut Self>) -> Result<RecordBatch> {
+        let input_batch = self.in_mem_batch.clone();
+        self.in_mem_batch = RecordBatch::new_empty(self.schema());
+        let result = sort_batch(&input_batch, &self.expr, self.fetch)?;
         if let Some(remaining_fetch) = self.fetch {
             // remaining_fetch - result.num_rows() is always be >= 0
             // because result length of sort_batch with limit cannot be
@@ -475,13 +514,13 @@ mod tests {
     use itertools::Itertools;
 
     use crate::collect;
-    use crate::expressions::col;
     use crate::expressions::PhysicalSortExpr;
+    use crate::expressions::col;
     use crate::sorts::sort::SortExec;
     use crate::test;
-    use crate::test::assert_is_pending;
-    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
     use crate::test::TestMemoryExec;
+    use crate::test::assert_is_pending;
+    use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero};
 
     use super::*;
 
@@ -504,7 +543,7 @@ mod tests {
         };
 
         let partial_sort_exec = Arc::new(PartialSortExec::new(
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: option_asc,
@@ -517,27 +556,28 @@ mod tests {
                     expr: col("c", &schema)?,
                     options: option_asc,
                 },
-            ]),
+            ]
+            .into(),
             Arc::clone(&source),
             2,
-        )) as Arc<dyn ExecutionPlan>;
+        ));
 
         let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
 
         assert_eq!(2, result.len());
         allow_duplicates! {
-            assert_snapshot!(batches_to_string(&result), @r#"
-                +---+---+---+
-                | a | b | c |
-                +---+---+---+
-                | 0 | 1 | 0 |
-                | 0 | 1 | 1 |
-                | 0 | 2 | 5 |
-                | 1 | 2 | 4 |
-                | 1 | 3 | 2 |
-                | 1 | 3 | 3 |
-                +---+---+---+
-                "#);
+            assert_snapshot!(batches_to_string(&result), @r"
+            +---+---+---+
+            | a | b | c |
+            +---+---+---+
+            | 0 | 1 | 0 |
+            | 0 | 1 | 1 |
+            | 0 | 2 | 5 |
+            | 1 | 2 | 4 |
+            | 1 | 3 | 2 |
+            | 1 | 3 | 3 |
+            +---+---+---+
+            ");
         }
         assert_eq!(
             task_ctx.runtime_env().memory_pool.reserved(),
@@ -569,7 +609,7 @@ mod tests {
         for common_prefix_length in [1, 2] {
             let partial_sort_exec = Arc::new(
                 PartialSortExec::new(
-                    LexOrdering::new(vec![
+                    [
                         PhysicalSortExpr {
                             expr: col("a", &schema)?,
                             options: option_asc,
@@ -582,27 +622,28 @@ mod tests {
                             expr: col("c", &schema)?,
                             options: option_asc,
                         },
-                    ]),
+                    ]
+                    .into(),
                     Arc::clone(&source),
                     common_prefix_length,
                 )
                 .with_fetch(Some(4)),
-            ) as Arc<dyn ExecutionPlan>;
+            );
 
             let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
 
             assert_eq!(2, result.len());
             allow_duplicates! {
-                assert_snapshot!(batches_to_string(&result), @r#"
-                    +---+---+---+
-                    | a | b | c |
-                    +---+---+---+
-                    | 0 | 1 | 4 |
-                    | 0 | 2 | 3 |
-                    | 1 | 2 | 2 |
-                    | 1 | 3 | 0 |
-                    +---+---+---+
-                    "#);
+                assert_snapshot!(batches_to_string(&result), @r"
+                +---+---+---+
+                | a | b | c |
+                +---+---+---+
+                | 0 | 1 | 4 |
+                | 0 | 2 | 3 |
+                | 1 | 2 | 2 |
+                | 1 | 3 | 0 |
+                +---+---+---+
+                ");
             }
             assert_eq!(
                 task_ctx.runtime_env().memory_pool.reserved(),
@@ -642,7 +683,7 @@ mod tests {
             [(1, &source_tables[0]), (2, &source_tables[1])]
         {
             let partial_sort_exec = Arc::new(PartialSortExec::new(
-                LexOrdering::new(vec![
+                [
                     PhysicalSortExpr {
                         expr: col("a", &schema)?,
                         options: option_asc,
@@ -655,7 +696,8 @@ mod tests {
                         expr: col("c", &schema)?,
                         options: option_asc,
                     },
-                ]),
+                ]
+                .into(),
                 Arc::clone(source),
                 common_prefix_length,
             ));
@@ -668,20 +710,20 @@ mod tests {
                 "The sort should have returned all memory used back to the memory manager"
             );
             allow_duplicates! {
-                assert_snapshot!(batches_to_string(&result), @r#"
-                    +---+---+---+
-                    | a | b | c |
-                    +---+---+---+
-                    | 0 | 1 | 6 |
-                    | 0 | 1 | 7 |
-                    | 0 | 3 | 4 |
-                    | 0 | 3 | 5 |
-                    | 1 | 2 | 0 |
-                    | 1 | 2 | 1 |
-                    | 1 | 4 | 2 |
-                    | 1 | 4 | 3 |
-                    +---+---+---+
-                    "#);
+                assert_snapshot!(batches_to_string(&result), @r"
+                +---+---+---+
+                | a | b | c |
+                +---+---+---+
+                | 0 | 1 | 6 |
+                | 0 | 1 | 7 |
+                | 0 | 3 | 4 |
+                | 0 | 3 | 5 |
+                | 1 | 2 | 0 |
+                | 1 | 2 | 1 |
+                | 1 | 4 | 2 |
+                | 1 | 4 | 3 |
+                +---+---+---+
+                ");
             }
         }
         Ok(())
@@ -731,8 +773,8 @@ mod tests {
             nulls_first: false,
         };
         let schema = mem_exec.schema();
-        let partial_sort_executor = PartialSortExec::new(
-            LexOrdering::new(vec![
+        let partial_sort_exec = PartialSortExec::new(
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: option_asc,
@@ -745,17 +787,16 @@ mod tests {
                     expr: col("c", &schema)?,
                     options: option_asc,
                 },
-            ]),
+            ]
+            .into(),
             Arc::clone(&mem_exec),
             1,
         );
-        let partial_sort_exec =
-            Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>;
         let sort_exec = Arc::new(SortExec::new(
-            partial_sort_executor.expr,
-            partial_sort_executor.input,
-        )) as Arc<dyn ExecutionPlan>;
-        let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
+            partial_sort_exec.expr.clone(),
+            Arc::clone(&partial_sort_exec.input),
+        ));
+        let result = collect(Arc::new(partial_sort_exec), Arc::clone(&task_ctx)).await?;
         assert_eq!(
             result.iter().map(|r| r.num_rows()).collect_vec(),
             [125, 125, 150]
@@ -792,8 +833,8 @@ mod tests {
             (Some(150), vec![125, 25]),
             (Some(250), vec![125, 125]),
         ] {
-            let partial_sort_executor = PartialSortExec::new(
-                LexOrdering::new(vec![
+            let partial_sort_exec = PartialSortExec::new(
+                [
                     PhysicalSortExpr {
                         expr: col("a", &schema)?,
                         options: option_asc,
@@ -806,19 +847,22 @@ mod tests {
                         expr: col("c", &schema)?,
                         options: option_asc,
                     },
-                ]),
+                ]
+                .into(),
                 Arc::clone(&mem_exec),
                 1,
             )
             .with_fetch(fetch_size);
 
-            let partial_sort_exec =
-                Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>;
             let sort_exec = Arc::new(
-                SortExec::new(partial_sort_executor.expr, partial_sort_executor.input)
-                    .with_fetch(fetch_size),
-            ) as Arc<dyn ExecutionPlan>;
-            let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
+                SortExec::new(
+                    partial_sort_exec.expr.clone(),
+                    Arc::clone(&partial_sort_exec.input),
+                )
+                .with_fetch(fetch_size),
+            );
+            let result =
+                collect(Arc::new(partial_sort_exec), Arc::clone(&task_ctx)).await?;
             assert_eq!(
                 result.iter().map(|r| r.num_rows()).collect_vec(),
                 expected_batch_num_rows
@@ -847,8 +891,8 @@ mod tests {
             nulls_first: false,
         };
         let fetch_size = Some(250);
-        let partial_sort_executor = PartialSortExec::new(
-            LexOrdering::new(vec![
+        let partial_sort_exec = PartialSortExec::new(
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: option_asc,
@@ -857,15 +901,14 @@ mod tests {
                     expr: col("c", &schema)?,
                     options: option_asc,
                 },
-            ]),
+            ]
+            .into(),
             Arc::clone(&mem_exec),
             1,
         )
         .with_fetch(fetch_size);
 
-        let partial_sort_exec =
-            Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>;
-        let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
+        let result = collect(Arc::new(partial_sort_exec), Arc::clone(&task_ctx)).await?;
         for rb in result {
             assert!(rb.num_rows() > 0);
         }
@@ -898,10 +941,11 @@ mod tests {
             TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?;
 
         let partial_sort_exec = Arc::new(PartialSortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("field_name", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             input,
             1,
         ));
@@ -987,7 +1031,7 @@ mod tests {
         )?;
 
         let partial_sort_exec = Arc::new(PartialSortExec::new(
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: option_asc,
@@ -1000,7 +1044,8 @@ mod tests {
                     expr: col("c", &schema)?,
                     options: option_desc,
                 },
-            ]),
+            ]
+            .into(),
             TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?,
             2,
         ));
@@ -1023,20 +1068,20 @@ mod tests {
             task_ctx,
         )
         .await?;
-        assert_snapshot!(batches_to_string(&result), @r#"
-            +-----+------+-------+
-            | a   | b    | c     |
-            +-----+------+-------+
-            | 1.0 | 20.0 | 20.0  |
-            | 1.0 | 20.0 | 10.0  |
-            | 1.0 | 40.0 | 10.0  |
-            | 2.0 | 40.0 | 100.0 |
-            | 2.0 | NaN  | NaN   |
-            | 3.0 |      |       |
-            | 3.0 |      | 100.0 |
-            | 3.0 | NaN  | NaN   |
-            +-----+------+-------+
-            "#);
+        assert_snapshot!(batches_to_string(&result), @r"
+        +-----+------+-------+
+        | a   | b    | c     |
+        +-----+------+-------+
+        | 1.0 | 20.0 | 20.0  |
+        | 1.0 | 20.0 | 10.0  |
+        | 1.0 | 40.0 | 10.0  |
+        | 2.0 | 40.0 | 100.0 |
+        | 2.0 | NaN  | NaN   |
+        | 3.0 |      |       |
+        | 3.0 |      | 100.0 |
+        | 3.0 | NaN  | NaN   |
+        +-----+------+-------+
+        ");
         assert_eq!(result.len(), 2);
         let metrics = partial_sort_exec.metrics().unwrap();
         assert!(metrics.elapsed_compute().unwrap() > 0);
@@ -1062,10 +1107,11 @@ mod tests {
         let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1));
         let refs = blocking_exec.refs();
         let sort_exec = Arc::new(PartialSortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("a", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             blocking_exec,
             1,
         ));
@@ -1085,4 +1131,87 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_partial_sort_with_homogeneous_batches() -> Result<()> {
+        // Test case for the bug where batches with homogeneous sort keys
+        // (e.g., [1,1,1], [2,2,2]) would not be properly detected as having
+        // slice points between batches.
+        let task_ctx = Arc::new(TaskContext::default());
+
+        // Create batches where each batch has homogeneous values for sort keys
+        let batch1 = test::build_table_i32(
+            ("a", &vec![1; 3]),
+            ("b", &vec![1; 3]),
+            ("c", &vec![3, 2, 1]),
+        );
+        let batch2 = test::build_table_i32(
+            ("a", &vec![2; 3]),
+            ("b", &vec![2; 3]),
+            ("c", &vec![4, 6, 4]),
+        );
+        let batch3 = test::build_table_i32(
+            ("a", &vec![3; 3]),
+            ("b", &vec![3; 3]),
+            ("c", &vec![9, 7, 8]),
+        );
+
+        let schema = batch1.schema();
+        let mem_exec = TestMemoryExec::try_new_exec(
+            &[vec![batch1, batch2, batch3]],
+            Arc::clone(&schema),
+            None,
+        )?;
+
+        let option_asc = SortOptions {
+            descending: false,
+            nulls_first: false,
+        };
+
+        // Partial sort with common prefix of 2 (sorting by a, b, c)
+        let partial_sort_exec = Arc::new(PartialSortExec::new(
+            [
+                PhysicalSortExpr {
+                    expr: col("a", &schema)?,
+                    options: option_asc,
+                },
+                PhysicalSortExpr {
+                    expr: col("b", &schema)?,
+                    options: option_asc,
+                },
+                PhysicalSortExpr {
+                    expr: col("c", &schema)?,
+                    options: option_asc,
+                },
+            ]
+            .into(),
+            mem_exec,
+            2,
+        ));
+
+        let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?;
+
+        assert_eq!(result.len(), 3,);
+
+        allow_duplicates! {
+            assert_snapshot!(batches_to_string(&result), @r"
+            +---+---+---+
+            | a | b | c |
+            +---+---+---+
+            | 1 | 1 | 1 |
+            | 1 | 1 | 2 |
+            | 1 | 1 | 3 |
+            | 2 | 2 | 4 |
+            | 2 | 2 | 4 |
+            | 2 | 2 | 6 |
+            | 3 | 3 | 7 |
+            | 3 | 3 | 8 |
+            | 3 | 3 | 9 |
+            +---+---+---+
+            ");
+        }
+
+        assert_eq!(task_ctx.runtime_env().memory_pool.reserved(), 0,);
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
index 683983d9e6979..da2171847cc7b 100644
--- a/datafusion/physical-plan/src/sorts/sort.rs
+++ b/datafusion/physical-plan/src/sorts/sort.rs
@@ -24,20 +24,30 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
+use parking_lot::RwLock;
+
 use crate::common::spawn_buffered;
-use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType};
+use crate::execution_plan::{
+    Boundedness, CardinalityEffect, EmissionType, has_same_children_properties,
+};
 use crate::expressions::PhysicalSortExpr;
+use crate::filter_pushdown::{
+    ChildFilterDescription, FilterDescription, FilterPushdownPhase,
+};
 use crate::limit::LimitStream;
 use crate::metrics::{
     BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics,
 };
-use crate::projection::{make_with_child, update_expr, ProjectionExec};
-use crate::sorts::streaming_merge::StreamingMergeBuilder;
+use crate::projection::{ProjectionExec, make_with_child, update_ordering};
+use crate::sorts::IncrementalSortIterator;
+use crate::sorts::streaming_merge::{SortedSpillFile, StreamingMergeBuilder};
 use crate::spill::get_record_batch_memory_size;
 use crate::spill::in_progress_spill_file::InProgressSpillFile;
-use crate::spill::spill_manager::SpillManager;
+use crate::spill::spill_manager::{GetSlicedSize, SpillManager};
 use crate::stream::RecordBatchStreamAdapter;
+use crate::stream::ReservationStream;
 use crate::topk::TopK;
+use crate::topk::TopKDynamicFilters;
 use crate::{
     DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan,
     ExecutionPlanProperties, Partitioning, PlanProperties, SendableRecordBatchStream,
@@ -47,13 +57,18 @@ use crate::{
 use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray};
 use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays};
 use arrow::datatypes::SchemaRef;
-use datafusion_common::{internal_datafusion_err, internal_err, DataFusionError, Result};
-use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_common::config::SpillCompression;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    DataFusionError, Result, assert_or_internal_err, internal_datafusion_err,
+    unwrap_or_internal_err,
+};
+use datafusion_execution::TaskContext;
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_execution::TaskContext;
 use datafusion_physical_expr::LexOrdering;
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit};
 
 use futures::{StreamExt, TryStreamExt};
 use log::{debug, trace};
@@ -119,7 +134,6 @@ impl ExternalSorterMetrics {
 ///    └─────┘
 ///
 /// in_mem_batches
-///
 /// ```
 ///
 /// # When data does not fit in available memory
@@ -200,7 +214,7 @@ struct ExternalSorter {
     /// Schema of the output (and the input)
     schema: SchemaRef,
     /// Sort expressions
-    expr: Arc<[PhysicalSortExpr]>,
+    expr: LexOrdering,
     /// The target number of rows for output batches
     batch_size: usize,
     /// If the in size of buffered memory batches is below this size,
@@ -217,12 +231,16 @@ struct ExternalSorter {
 
     /// During external sorting, in-memory intermediate data will be appended to
     /// this file incrementally. Once finished, this file will be moved to [`Self::finished_spill_files`].
-    in_progress_spill_file: Option<InProgressSpillFile>,
+    ///
+    /// this is a tuple of:
+    /// 1. `InProgressSpillFile` - the file that is being written to
+    /// 2. `max_record_batch_memory` - the maximum memory usage of a single batch in this spill file.
+    in_progress_spill_file: Option<(InProgressSpillFile, usize)>,
     /// If data has previously been spilled, the locations of the spill files (in
     /// Arrow IPC format)
     /// Within the same spill file, the data might be chunked into multiple batches,
     /// and ordered by sort keys.
-    finished_spill_files: Vec<RefCountedTempFile>,
+    finished_spill_files: Vec<SortedSpillFile>,
 
     // ========================================================================
     // EXECUTION RESOURCES:
@@ -248,7 +266,7 @@ struct ExternalSorter {
 impl ExternalSorter {
     // TODO: make a builder or some other nicer API to avoid the
     // clippy warning
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     pub fn new(
         partition_id: usize,
         schema: SchemaRef,
@@ -256,6 +274,8 @@ impl ExternalSorter {
         batch_size: usize,
         sort_spill_reservation_bytes: usize,
         sort_in_place_threshold_bytes: usize,
+        // Configured via `datafusion.execution.spill_compression`.
+        spill_compression: SpillCompression,
         metrics: &ExecutionPlanMetricsSet,
         runtime: Arc<RuntimeEnv>,
     ) -> Result<Self> {
@@ -272,14 +292,15 @@ impl ExternalSorter {
             Arc::clone(&runtime),
             metrics.spill_metrics.clone(),
             Arc::clone(&schema),
-        );
+        )
+        .with_compression_type(spill_compression);
 
         Ok(Self {
             schema,
             in_mem_batches: vec![],
             in_progress_spill_file: None,
             finished_spill_files: vec![],
-            expr: expr.into(),
+            expr,
             metrics,
             reservation,
             spill_manager,
@@ -321,14 +342,7 @@ impl ExternalSorter {
     /// 2. A combined streaming merge incorporating both in-memory
     ///    batches and data from spill files on disk.
     async fn sort(&mut self) -> Result<SendableRecordBatchStream> {
-        // Release the memory reserved for merge back to the pool so
-        // there is some left when `in_mem_sort_stream` requests an
-        // allocation.
-        self.merge_reservation.free();
-
         if self.spilled_before() {
-            let mut streams = vec![];
-
             // Sort `in_mem_batches` and spill it first. If there are many
             // `in_mem_batches` and the memory limit is almost reached, merging
             // them with the spilled files at the same time might cause OOM.
@@ -336,26 +350,29 @@ impl ExternalSorter {
                 self.sort_and_spill_in_mem_batches().await?;
             }
 
-            for spill in self.finished_spill_files.drain(..) {
-                if !spill.path().exists() {
-                    return internal_err!("Spill file {:?} does not exist", spill.path());
-                }
-                let stream = self.spill_manager.read_spill_as_stream(spill)?;
-                streams.push(stream);
-            }
-
-            let expressions: LexOrdering = self.expr.iter().cloned().collect();
-
+            // Transfer the pre-reserved merge memory to the streaming merge
+            // using `take()` instead of `new_empty()`. This ensures the merge
+            // stream starts with `sort_spill_reservation_bytes` already
+            // allocated, preventing starvation when concurrent sort partitions
+            // compete for pool memory. `take()` moves the bytes atomically
+            // without releasing them back to the pool, so other partitions
+            // cannot race to consume the freed memory.
             StreamingMergeBuilder::new()
-                .with_streams(streams)
+                .with_sorted_spill_files(std::mem::take(&mut self.finished_spill_files))
+                .with_spill_manager(self.spill_manager.clone())
                 .with_schema(Arc::clone(&self.schema))
-                .with_expressions(expressions.as_ref())
+                .with_expressions(&self.expr.clone())
                 .with_metrics(self.metrics.baseline.clone())
                 .with_batch_size(self.batch_size)
                 .with_fetch(None)
-                .with_reservation(self.merge_reservation.new_empty())
+                .with_reservation(self.merge_reservation.take())
                 .build()
         } else {
+            // Release the memory reserved for merge back to the pool so
+            // there is some left when `in_mem_sort_stream` requests an
+            // allocation. Only needed for the non-spill path; the spill
+            // path transfers the reservation to the merge stream instead.
+            self.merge_reservation.free();
             self.in_mem_sort_stream(self.metrics.baseline.clone())
         }
     }
@@ -365,6 +382,12 @@ impl ExternalSorter {
         self.reservation.size()
     }
 
+    /// How much memory is reserved for the merge phase?
+    #[cfg(test)]
+    fn merge_reservation_size(&self) -> usize {
+        self.merge_reservation.size()
+    }
+
     /// How many bytes have been spilled to disk?
     fn spilled_bytes(&self) -> usize {
         self.metrics.spill_metrics.spilled_bytes.value()
@@ -393,7 +416,7 @@ impl ExternalSorter {
         // Lazily initialize the in-progress spill file
         if self.in_progress_spill_file.is_none() {
             self.in_progress_spill_file =
-                Some(self.spill_manager.create_in_progress_file("Sorting")?);
+                Some((self.spill_manager.create_in_progress_file("Sorting")?, 0));
         }
 
         Self::organize_stringview_arrays(globally_sorted_batches)?;
@@ -403,31 +426,39 @@ impl ExternalSorter {
         let batches_to_spill = std::mem::take(globally_sorted_batches);
         self.reservation.free();
 
-        let in_progress_file = self.in_progress_spill_file.as_mut().ok_or_else(|| {
-            internal_datafusion_err!("In-progress spill file should be initialized")
-        })?;
+        let (in_progress_file, max_record_batch_size) =
+            self.in_progress_spill_file.as_mut().ok_or_else(|| {
+                internal_datafusion_err!("In-progress spill file should be initialized")
+            })?;
 
         for batch in batches_to_spill {
             in_progress_file.append_batch(&batch)?;
-        }
 
-        if !globally_sorted_batches.is_empty() {
-            return internal_err!("This function consumes globally_sorted_batches, so it should be empty after taking.");
+            *max_record_batch_size =
+                (*max_record_batch_size).max(batch.get_sliced_size()?);
         }
 
+        assert_or_internal_err!(
+            globally_sorted_batches.is_empty(),
+            "This function consumes globally_sorted_batches, so it should be empty after taking."
+        );
+
         Ok(())
     }
 
     /// Finishes the in-progress spill file and moves it to the finished spill files.
     async fn spill_finish(&mut self) -> Result<()> {
-        let mut in_progress_file =
+        let (mut in_progress_file, max_record_batch_memory) =
             self.in_progress_spill_file.take().ok_or_else(|| {
                 internal_datafusion_err!("Should be called after `spill_append`")
             })?;
         let spill_file = in_progress_file.finish()?;
 
         if let Some(spill_file) = spill_file {
-            self.finished_spill_files.push(spill_file);
+            self.finished_spill_files.push(SortedSpillFile {
+                file: spill_file,
+                max_record_batch_memory,
+            });
         }
 
         Ok(())
@@ -501,11 +532,10 @@ impl ExternalSorter {
     /// Sorts the in-memory batches and merges them into a single sorted run, then writes
     /// the result to spill files.
     async fn sort_and_spill_in_mem_batches(&mut self) -> Result<()> {
-        if self.in_mem_batches.is_empty() {
-            return internal_err!(
-                "in_mem_batches must not be empty when attempting to sort and spill"
-            );
-        }
+        assert_or_internal_err!(
+            !self.in_mem_batches.is_empty(),
+            "in_mem_batches must not be empty when attempting to sort and spill"
+        );
 
         // Release the memory reserved for merge back to the pool so
         // there is some left when `in_mem_sort_stream` requests an
@@ -517,11 +547,10 @@ impl ExternalSorter {
             self.in_mem_sort_stream(self.metrics.baseline.intermediate())?;
         // After `in_mem_sort_stream()` is constructed, all `in_mem_batches` is taken
         // to construct a globally sorted stream.
-        if !self.in_mem_batches.is_empty() {
-            return internal_err!(
-                "in_mem_batches should be empty after constructing sorted stream"
-            );
-        }
+        assert_or_internal_err!(
+            self.in_mem_batches.is_empty(),
+            "in_mem_batches should be empty after constructing sorted stream"
+        );
         // 'global' here refers to all buffered batches when the memory limit is
         // reached. This variable will buffer the sorted batches after
         // sort-preserving merge and incrementally append to spill files.
@@ -529,7 +558,7 @@ impl ExternalSorter {
 
         while let Some(batch) = sorted_stream.next().await {
             let batch = batch?;
-            let sorted_size = get_reserved_byte_for_record_batch(&batch);
+            let sorted_size = get_reserved_bytes_for_record_batch(&batch)?;
             if self.reservation.try_grow(sorted_size).is_err() {
                 // Although the reservation is not enough, the batch is
                 // already in memory, so it's okay to combine it with previously
@@ -553,11 +582,10 @@ impl ExternalSorter {
         // Sanity check after spilling
         let buffers_cleared_property =
             self.in_mem_batches.is_empty() && globally_sorted_batches.is_empty();
-        if !buffers_cleared_property {
-            return internal_err!(
-                "in_mem_batches and globally_sorted_batches should be cleared before"
-            );
-        }
+        assert_or_internal_err!(
+            buffers_cleared_property,
+            "in_mem_batches and globally_sorted_batches should be cleared before"
+        );
 
         // Reserve headroom for next sort/merge
         self.reserve_memory_for_merge()?;
@@ -647,7 +675,7 @@ impl ExternalSorter {
         if self.in_mem_batches.len() == 1 {
             let batch = self.in_mem_batches.swap_remove(0);
             let reservation = self.reservation.take();
-            return self.sort_batch_stream(batch, metrics, reservation);
+            return self.sort_batch_stream(batch, &metrics, reservation);
         }
 
         // If less than sort_in_place_threshold_bytes, concatenate and sort in place
@@ -656,10 +684,10 @@ impl ExternalSorter {
             let batch = concat_batches(&self.schema, &self.in_mem_batches)?;
             self.in_mem_batches.clear();
             self.reservation
-                .try_resize(get_reserved_byte_for_record_batch(&batch))
+                .try_resize(get_reserved_bytes_for_record_batch(&batch)?)
                 .map_err(Self::err_with_oom_context)?;
             let reservation = self.reservation.take();
-            return self.sort_batch_stream(batch, metrics, reservation);
+            return self.sort_batch_stream(batch, &metrics, reservation);
         }
 
         let streams = std::mem::take(&mut self.in_mem_batches)
@@ -668,18 +696,16 @@ impl ExternalSorter {
                 let metrics = self.metrics.baseline.intermediate();
                 let reservation = self
                     .reservation
-                    .split(get_reserved_byte_for_record_batch(&batch));
-                let input = self.sort_batch_stream(batch, metrics, reservation)?;
+                    .split(get_reserved_bytes_for_record_batch(&batch)?);
+                let input = self.sort_batch_stream(batch, &metrics, reservation)?;
                 Ok(spawn_buffered(input, 1))
             })
             .collect::<Result<_>>()?;
 
-        let expressions: LexOrdering = self.expr.iter().cloned().collect();
-
         StreamingMergeBuilder::new()
             .with_streams(streams)
             .with_schema(Arc::clone(&self.schema))
-            .with_expressions(expressions.as_ref())
+            .with_expressions(&self.expr.clone())
             .with_metrics(metrics)
             .with_batch_size(self.batch_size)
             .with_fetch(None)
@@ -689,30 +715,64 @@ impl ExternalSorter {
 
     /// Sorts a single `RecordBatch` into a single stream.
     ///
-    /// `reservation` accounts for the memory used by this batch and
-    /// is released when the sort is complete
+    /// This may output multiple batches depending on the size of the
+    /// sorted data and the target batch size.
+    /// For single-batch output cases, `reservation` will be freed immediately after sorting,
+    /// as the batch will be output and is expected to be reserved by the consumer of the stream.
+    /// For multi-batch output cases, `reservation` will be grown to match the actual
+    /// size of sorted output, and as each batch is output, its memory will be freed from the reservation.
+    /// (This leads to the same behaviour, as futures are only evaluated when polled by the consumer.)
     fn sort_batch_stream(
         &self,
         batch: RecordBatch,
-        metrics: BaselineMetrics,
+        metrics: &BaselineMetrics,
         reservation: MemoryReservation,
     ) -> Result<SendableRecordBatchStream> {
         assert_eq!(
-            get_reserved_byte_for_record_batch(&batch),
+            get_reserved_bytes_for_record_batch(&batch)?,
             reservation.size()
         );
+
         let schema = batch.schema();
+        let expressions = self.expr.clone();
+        let batch_size = self.batch_size;
+        let output_row_metrics = metrics.output_rows().clone();
 
-        let expressions: LexOrdering = self.expr.iter().cloned().collect();
         let stream = futures::stream::once(async move {
-            let _timer = metrics.elapsed_compute().timer();
-
-            let sorted = sort_batch(&batch, &expressions, None)?;
+            let schema = batch.schema();
+
+            // Sort the batch immediately and get all output batches
+            let sorted_batches = sort_batch_chunked(&batch, &expressions, batch_size)?;
+
+            // Resize the reservation to match the actual sorted output size.
+            // Using try_resize avoids a release-then-reacquire cycle, which
+            // matters for MemoryPool implementations where grow/shrink have
+            // non-trivial cost (e.g. JNI calls in Comet).
+            let total_sorted_size: usize = sorted_batches
+                .iter()
+                .map(get_record_batch_memory_size)
+                .sum();
+            reservation
+                .try_resize(total_sorted_size)
+                .map_err(Self::err_with_oom_context)?;
 
-            metrics.record_output(sorted.num_rows());
-            drop(batch);
-            drop(reservation);
-            Ok(sorted)
+            // Wrap in ReservationStream to hold the reservation
+            Result::<_, DataFusionError>::Ok(Box::pin(ReservationStream::new(
+                Arc::clone(&schema),
+                Box::pin(RecordBatchStreamAdapter::new(
+                    Arc::clone(&schema),
+                    futures::stream::iter(sorted_batches.into_iter().map(Ok)),
+                )),
+                reservation,
+            )) as SendableRecordBatchStream)
+        })
+        .try_flatten()
+        .map(move |batch| match batch {
+            Ok(batch) => {
+                output_row_metrics.add(batch.num_rows());
+                Ok(batch)
+            }
+            Err(e) => Err(e),
         });
 
         Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
@@ -741,7 +801,7 @@ impl ExternalSorter {
         &mut self,
         input: &RecordBatch,
     ) -> Result<()> {
-        let size = get_reserved_byte_for_record_batch(input);
+        let size = get_reserved_bytes_for_record_batch(input)?;
 
         match self.reservation.try_grow(size) {
             Ok(_) => Ok(()),
@@ -765,7 +825,8 @@ impl ExternalSorter {
         match e {
             DataFusionError::ResourcesExhausted(_) => e.context(
                 "Not enough memory to continue external sort. \
-                    Consider increasing the memory limit, or decreasing sort_spill_reservation_bytes"
+                    Consider increasing the memory limit config: 'datafusion.runtime.memory_limit', \
+                    or decreasing the config: 'datafusion.execution.sort_spill_reservation_bytes'."
             ),
             // This is not an OOM error, so just return it as is.
             _ => e,
@@ -780,11 +841,29 @@ impl ExternalSorter {
 /// in sorting and merging. The sorted copies are in either row format or array format.
 /// Please refer to cursor.rs and stream.rs for more details. No matter what format the
 /// sorted copies are, they will use more memory than the original record batch.
-fn get_reserved_byte_for_record_batch(batch: &RecordBatch) -> usize {
-    // 2x may not be enough for some cases, but it's a good start.
+///
+/// This can basically be calculated as the sum of the actual space it takes in
+/// memory (which would be larger for a sliced batch), and the size of the actual data.
+pub(crate) fn get_reserved_bytes_for_record_batch_size(
+    record_batch_size: usize,
+    sliced_size: usize,
+) -> usize {
+    // Even 2x may not be enough for some cases, but it's a good enough estimation as a baseline.
     // If 2x is not enough, user can set a larger value for `sort_spill_reservation_bytes`
     // to compensate for the extra memory needed.
-    get_record_batch_memory_size(batch) * 2
+    record_batch_size + sliced_size
+}
+
+/// Estimate how much memory is needed to sort a `RecordBatch`.
+/// This will just call `get_reserved_bytes_for_record_batch_size` with the
+/// memory size of the record batch and its sliced size.
+pub(crate) fn get_reserved_bytes_for_record_batch(batch: &RecordBatch) -> Result<usize> {
+    batch.get_sliced_size().map(|sliced_size| {
+        get_reserved_bytes_for_record_batch_size(
+            get_record_batch_memory_size(batch),
+            sliced_size,
+        )
+    })
 }
 
 impl Debug for ExternalSorter {
@@ -809,15 +888,7 @@ pub fn sort_batch(
         .collect::<Result<Vec<_>>>()?;
 
     let indices = lexsort_to_indices(&sort_columns, fetch)?;
-    let mut columns = take_arrays(batch.columns(), &indices, None)?;
-
-    // The columns may be larger than the unsorted columns in `batch` especially for variable length
-    // data types due to exponential growth when building the sort columns. We shrink the columns
-    // to prevent memory reservation failures, as well as excessive memory allocation when running
-    // merges in `SortPreservingMergeStream`.
-    columns.iter_mut().for_each(|c| {
-        c.shrink_to_fit();
-    });
+    let columns = take_arrays(batch.columns(), &indices, None)?;
 
     let options = RecordBatchOptions::new().with_row_count(Some(indices.len()));
     Ok(RecordBatch::try_new_with_options(
@@ -827,6 +898,17 @@ pub fn sort_batch(
     )?)
 }
 
+/// Sort a batch and return the result as multiple batches of size `batch_size`.
+/// This is useful when you want to avoid creating one large sorted batch in memory,
+/// and instead want to process the sorted data in smaller chunks.
+pub fn sort_batch_chunked(
+    batch: &RecordBatch,
+    expressions: &LexOrdering,
+    batch_size: usize,
+) -> Result<Vec<RecordBatch>> {
+    IncrementalSortIterator::new(batch.clone(), expressions.clone(), batch_size).collect()
+}
+
 /// Sort execution plan.
 ///
 /// Support sorting datasets that are larger than the memory allotted
@@ -845,9 +927,13 @@ pub struct SortExec {
     /// Fetch highest/lowest n results
     fetch: Option<usize>,
     /// Normalized common sort prefix between the input and the sort expressions (only used with fetch)
-    common_sort_prefix: LexOrdering,
+    common_sort_prefix: Vec<PhysicalSortExpr>,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
+    /// Filter matching the state of the sort for dynamic filter pushdown.
+    /// If `fetch` is `Some`, this will also be set and a TopK operator may be used.
+    /// If `fetch` is `None`, this will be `None`.
+    filter: Option<Arc<RwLock<TopKDynamicFilters>>>,
 }
 
 impl SortExec {
@@ -856,7 +942,8 @@ impl SortExec {
     pub fn new(expr: LexOrdering, input: Arc<dyn ExecutionPlan>) -> Self {
         let preserve_partitioning = false;
         let (cache, sort_prefix) =
-            Self::compute_properties(&input, expr.clone(), preserve_partitioning);
+            Self::compute_properties(&input, expr.clone(), preserve_partitioning)
+                .unwrap();
         Self {
             expr,
             input,
@@ -864,7 +951,8 @@ impl SortExec {
             preserve_partitioning,
             fetch: None,
             common_sort_prefix: sort_prefix,
-            cache,
+            cache: Arc::new(cache),
+            filter: None,
         }
     }
 
@@ -882,15 +970,36 @@ impl SortExec {
     /// input partitions producing a single, sorted partition.
     pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self {
         self.preserve_partitioning = preserve_partitioning;
-        self.cache = self
-            .cache
-            .with_partitioning(Self::output_partitioning_helper(
-                &self.input,
-                self.preserve_partitioning,
-            ));
+        Arc::make_mut(&mut self.cache).partitioning =
+            Self::output_partitioning_helper(&self.input, self.preserve_partitioning);
         self
     }
 
+    /// Add or reset `self.filter` to a new `TopKDynamicFilters`.
+    fn create_filter(&self) -> Arc<RwLock<TopKDynamicFilters>> {
+        let children = self
+            .expr
+            .iter()
+            .map(|sort_expr| Arc::clone(&sort_expr.expr))
+            .collect::<Vec<_>>();
+        Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
+            DynamicFilterPhysicalExpr::new(children, lit(true)),
+        ))))
+    }
+
+    fn cloned(&self) -> Self {
+        SortExec {
+            input: Arc::clone(&self.input),
+            expr: self.expr.clone(),
+            metrics_set: self.metrics_set.clone(),
+            preserve_partitioning: self.preserve_partitioning,
+            common_sort_prefix: self.common_sort_prefix.clone(),
+            fetch: self.fetch,
+            cache: Arc::clone(&self.cache),
+            filter: self.filter.clone(),
+        }
+    }
+
     /// Modify how many rows to include in the result
     ///
     /// If None, then all rows will be returned, in sorted order.
@@ -899,26 +1008,26 @@ impl SortExec {
     /// operation since rows that are not going to be included
     /// can be dropped.
     pub fn with_fetch(&self, fetch: Option<usize>) -> Self {
-        let mut cache = self.cache.clone();
+        let mut cache = PlanProperties::clone(&self.cache);
         // If the SortExec can emit incrementally (that means the sort requirements
         // and properties of the input match), the SortExec can generate its result
         // without scanning the entire input when a fetch value exists.
         let is_pipeline_friendly = matches!(
-            self.cache.emission_type,
+            cache.emission_type,
             EmissionType::Incremental | EmissionType::Both
         );
         if fetch.is_some() && is_pipeline_friendly {
             cache = cache.with_boundedness(Boundedness::Bounded);
         }
-        SortExec {
-            input: Arc::clone(&self.input),
-            expr: self.expr.clone(),
-            metrics_set: self.metrics_set.clone(),
-            preserve_partitioning: self.preserve_partitioning,
-            common_sort_prefix: self.common_sort_prefix.clone(),
-            fetch,
-            cache,
-        }
+        let filter = fetch.is_some().then(|| {
+            // If we already have a filter, keep it. Otherwise, create a new one.
+            self.filter.clone().unwrap_or_else(|| self.create_filter())
+        });
+        let mut new_sort = self.cloned();
+        new_sort.fetch = fetch;
+        new_sort.cache = cache.into();
+        new_sort.filter = filter;
+        new_sort
     }
 
     /// Input schema
@@ -954,13 +1063,10 @@ impl SortExec {
         input: &Arc<dyn ExecutionPlan>,
         sort_exprs: LexOrdering,
         preserve_partitioning: bool,
-    ) -> (PlanProperties, LexOrdering) {
-        // Determine execution mode:
-        let requirement = LexRequirement::from(sort_exprs);
-
+    ) -> Result<(PlanProperties, Vec<PhysicalSortExpr>)> {
         let (sort_prefix, sort_satisfied) = input
             .equivalence_properties()
-            .extract_common_sort_prefix(&requirement);
+            .extract_common_sort_prefix(sort_exprs.clone())?;
 
         // The emission type depends on whether the input is already sorted:
         // - If already fully sorted, we can emit results in the same way as the input
@@ -989,25 +1095,22 @@ impl SortExec {
 
         // Calculate equivalence properties; i.e. reset the ordering equivalence
         // class with the new ordering:
-        let sort_exprs = LexOrdering::from(requirement);
-        let eq_properties = input
-            .equivalence_properties()
-            .clone()
-            .with_reorder(sort_exprs);
+        let mut eq_properties = input.equivalence_properties().clone();
+        eq_properties.reorder(sort_exprs)?;
 
         // Get output partitioning:
         let output_partitioning =
             Self::output_partitioning_helper(input, preserve_partitioning);
 
-        (
+        Ok((
             PlanProperties::new(
                 eq_properties,
                 output_partitioning,
                 emission_type,
                 boundedness,
             ),
-            LexOrdering::from(sort_prefix),
-        )
+            sort_prefix,
+        ))
     }
 }
 
@@ -1018,14 +1121,38 @@ impl DisplayAs for SortExec {
                 let preserve_partitioning = self.preserve_partitioning;
                 match self.fetch {
                     Some(fetch) => {
-                        write!(f, "SortExec: TopK(fetch={fetch}), expr=[{}], preserve_partitioning=[{preserve_partitioning}]", self.expr)?;
+                        write!(
+                            f,
+                            "SortExec: TopK(fetch={fetch}), expr=[{}], preserve_partitioning=[{preserve_partitioning}]",
+                            self.expr
+                        )?;
+                        if let Some(filter) = &self.filter
+                            && let Ok(current) = filter.read().expr().current()
+                            && !current.eq(&lit(true))
+                        {
+                            write!(f, ", filter=[{current}]")?;
+                        }
                         if !self.common_sort_prefix.is_empty() {
-                            write!(f, ", sort_prefix=[{}]", self.common_sort_prefix)
+                            write!(f, ", sort_prefix=[")?;
+                            let mut first = true;
+                            for sort_expr in &self.common_sort_prefix {
+                                if first {
+                                    first = false;
+                                } else {
+                                    write!(f, ", ")?;
+                                }
+                                write!(f, "{sort_expr}")?;
+                            }
+                            write!(f, "]")
                         } else {
                             Ok(())
                         }
                     }
-                    None => write!(f, "SortExec: expr=[{}], preserve_partitioning=[{preserve_partitioning}]", self.expr),
+                    None => write!(
+                        f,
+                        "SortExec: expr=[{}], preserve_partitioning=[{preserve_partitioning}]",
+                        self.expr
+                    ),
                 }
             }
             DisplayFormatType::TreeRender => match self.fetch {
@@ -1053,7 +1180,7 @@ impl ExecutionPlan for SortExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -1071,6 +1198,25 @@ impl ExecutionPlan for SortExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to sort expressions
+        let mut tnr = TreeNodeRecursion::Continue;
+        for sort_expr in &self.expr {
+            tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+        }
+
+        // Apply to dynamic filter expression if present (when fetch is Some, TopK mode)
+        if let Some(filter) = &self.filter {
+            let filter_guard = filter.read();
+            tnr = tnr.visit_sibling(|| f(filter_guard.expr().as_ref()))?;
+        }
+
+        Ok(tnr)
+    }
+
     fn benefits_from_input_partitioning(&self) -> Vec<bool> {
         vec![false]
     }
@@ -1079,9 +1225,35 @@ impl ExecutionPlan for SortExec {
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let new_sort = SortExec::new(self.expr.clone(), Arc::clone(&children[0]))
-            .with_fetch(self.fetch)
-            .with_preserve_partitioning(self.preserve_partitioning);
+        let mut new_sort = self.cloned();
+        assert_eq!(children.len(), 1, "SortExec should have exactly one child");
+        new_sort.input = Arc::clone(&children[0]);
+
+        if !has_same_children_properties(self.as_ref(), &children)? {
+            // Recompute the properties based on the new input since they may have changed
+            let (cache, sort_prefix) = Self::compute_properties(
+                &new_sort.input,
+                new_sort.expr.clone(),
+                new_sort.preserve_partitioning,
+            )?;
+            new_sort.cache = Arc::new(cache);
+            new_sort.common_sort_prefix = sort_prefix;
+        }
+
+        Ok(Arc::new(new_sort))
+    }
+
+    fn reset_state(self: Arc<Self>) -> Result<Arc<dyn ExecutionPlan>> {
+        let children = self.children().into_iter().cloned().collect();
+        let new_sort = self.with_new_children(children)?;
+        let mut new_sort = new_sort
+            .as_any()
+            .downcast_ref::<SortExec>()
+            .expect("cloned 1 lines above this line, we know the type")
+            .clone();
+        // Our dynamic filter and execution metrics are the state we need to reset.
+        new_sort.filter = Some(new_sort.create_filter());
+        new_sort.metrics_set = ExecutionPlanMetricsSet::new();
 
         Ok(Arc::new(new_sort))
     }
@@ -1091,7 +1263,12 @@ impl ExecutionPlan for SortExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start SortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start SortExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
 
         let mut input = self.input.execute(partition, Arc::clone(&context))?;
 
@@ -1099,12 +1276,10 @@ impl ExecutionPlan for SortExec {
 
         trace!("End SortExec's input.execute for partition: {partition}");
 
-        let requirement = &LexRequirement::from(self.expr.clone());
-
         let sort_satisfied = self
             .input
             .equivalence_properties()
-            .ordering_satisfy_requirement(requirement);
+            .ordering_satisfy(self.expr.clone())?;
 
         match (sort_satisfied, self.fetch.as_ref()) {
             (true, Some(fetch)) => Ok(Box::pin(LimitStream::new(
@@ -1115,6 +1290,7 @@ impl ExecutionPlan for SortExec {
             ))),
             (true, None) => Ok(input),
             (false, Some(fetch)) => {
+                let filter = self.filter.clone();
                 let mut topk = TopK::try_new(
                     partition,
                     input.schema(),
@@ -1124,6 +1300,7 @@ impl ExecutionPlan for SortExec {
                     context.session_config().batch_size(),
                     context.runtime_env(),
                     &self.metrics_set,
+                    Arc::clone(&unwrap_or_internal_err!(filter)),
                 )?;
                 Ok(Box::pin(RecordBatchStreamAdapter::new(
                     self.schema(),
@@ -1148,6 +1325,7 @@ impl ExecutionPlan for SortExec {
                     context.session_config().batch_size(),
                     execution_options.sort_spill_reservation_bytes,
                     execution_options.sort_in_place_threshold_bytes,
+                    context.session_config().spill_compression(),
                     &self.metrics_set,
                     context.runtime_env(),
                 )?;
@@ -1170,25 +1348,14 @@ impl ExecutionPlan for SortExec {
         Some(self.metrics_set.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if !self.preserve_partitioning() {
-            return self.input.partition_statistics(None)?.with_fetch(
-                self.schema(),
-                self.fetch,
-                0,
-                1,
-            );
-        }
-        self.input.partition_statistics(partition)?.with_fetch(
-            self.schema(),
-            self.fetch,
-            0,
-            1,
-        )
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let p = if !self.preserve_partitioning() {
+            None
+        } else {
+            partition
+        };
+        let stats = Arc::unwrap_or_clone(self.input.partition_statistics(p)?);
+        Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?))
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
@@ -1219,17 +1386,10 @@ impl ExecutionPlan for SortExec {
             return Ok(None);
         }
 
-        let mut updated_exprs = LexOrdering::default();
-        for sort in self.expr() {
-            let Some(new_expr) = update_expr(&sort.expr, projection.expr(), false)?
-            else {
-                return Ok(None);
-            };
-            updated_exprs.push(PhysicalSortExpr {
-                expr: new_expr,
-                options: sort.options,
-            });
-        }
+        let Some(updated_exprs) = update_ordering(self.expr.clone(), projection.expr())?
+        else {
+            return Ok(None);
+        };
 
         Ok(Some(Arc::new(
             SortExec::new(updated_exprs, make_with_child(projection, self.input())?)
@@ -1237,6 +1397,28 @@ impl ExecutionPlan for SortExec {
                 .with_preserve_partitioning(self.preserve_partitioning()),
         )))
     }
+
+    fn gather_filters_for_pushdown(
+        &self,
+        phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        config: &datafusion_common::config::ConfigOptions,
+    ) -> Result<FilterDescription> {
+        if phase != FilterPushdownPhase::Post {
+            return FilterDescription::from_children(parent_filters, &self.children());
+        }
+
+        let mut child =
+            ChildFilterDescription::from_child(&parent_filters, self.input())?;
+
+        if let Some(filter) = &self.filter
+            && config.optimizer.enable_topk_dynamic_filter_pushdown
+        {
+            child = child.with_self_filter(filter.read().expr());
+        }
+
+        Ok(FilterDescription::new().with_child(child))
+    }
 }
 
 #[cfg(test)]
@@ -1251,9 +1433,9 @@ mod tests {
     use crate::execution_plan::Boundedness;
     use crate::expressions::col;
     use crate::test;
-    use crate::test::assert_is_pending;
-    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
     use crate::test::TestMemoryExec;
+    use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero};
+    use crate::test::{assert_is_pending, make_partition};
 
     use arrow::array::*;
     use arrow::compute::SortOptions;
@@ -1261,11 +1443,11 @@ mod tests {
     use datafusion_common::cast::as_primitive_array;
     use datafusion_common::test_util::batches_to_string;
     use datafusion_common::{DataFusionError, Result, ScalarValue};
+    use datafusion_execution::RecordBatchStream;
     use datafusion_execution::config::SessionConfig;
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
-    use datafusion_execution::RecordBatchStream;
-    use datafusion_physical_expr::expressions::{Column, Literal};
     use datafusion_physical_expr::EquivalenceProperties;
+    use datafusion_physical_expr::expressions::{Column, Literal};
 
     use futures::{FutureExt, Stream};
     use insta::assert_snapshot;
@@ -1274,7 +1456,7 @@ mod tests {
     pub struct SortedUnboundedExec {
         schema: Schema,
         batch_size: u64,
-        cache: PlanProperties,
+        cache: Arc<PlanProperties>,
     }
 
     impl DisplayAs for SortedUnboundedExec {
@@ -1291,9 +1473,9 @@ mod tests {
     impl SortedUnboundedExec {
         fn compute_properties(schema: SchemaRef) -> PlanProperties {
             let mut eq_properties = EquivalenceProperties::new(schema);
-            eq_properties.add_new_orderings(vec![LexOrdering::new(vec![
-                PhysicalSortExpr::new_default(Arc::new(Column::new("c1", 0))),
-            ])]);
+            eq_properties.add_ordering([PhysicalSortExpr::new_default(Arc::new(
+                Column::new("c1", 0),
+            ))]);
             PlanProperties::new(
                 eq_properties,
                 Partitioning::UnknownPartitioning(1),
@@ -1314,7 +1496,7 @@ mod tests {
             self
         }
 
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             &self.cache
         }
 
@@ -1329,6 +1511,13 @@ mod tests {
             Ok(self)
         }
 
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
+
         fn execute(
             &self,
             _partition: usize,
@@ -1393,10 +1582,11 @@ mod tests {
         let schema = csv.schema();
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("i", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             Arc::new(CoalescePartitionsExec::new(csv)),
         ));
 
@@ -1404,7 +1594,6 @@ mod tests {
 
         assert_eq!(result.len(), 1);
         assert_eq!(result[0].num_rows(), 400);
-
         assert_eq!(
             task_ctx.runtime_env().memory_pool.reserved(),
             0,
@@ -1439,10 +1628,11 @@ mod tests {
         let schema = input.schema();
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("i", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             Arc::new(CoalescePartitionsExec::new(input)),
         ));
 
@@ -1468,14 +1658,13 @@ mod tests {
         // bytes. We leave a little wiggle room for the actual numbers.
         assert!((3..=10).contains(&spill_count));
         assert!((9000..=10000).contains(&spilled_rows));
-        assert!((38000..=42000).contains(&spilled_bytes));
+        assert!((38000..=44000).contains(&spilled_bytes));
 
         let columns = result[0].columns();
 
         let i = as_primitive_array::<Int32Type>(&columns[0])?;
         assert_eq!(i.value(0), 0);
         assert_eq!(i.value(i.len() - 1), 81);
-
         assert_eq!(
             task_ctx.runtime_env().memory_pool.reserved(),
             0,
@@ -1488,13 +1677,24 @@ mod tests {
     #[tokio::test]
     async fn test_batch_reservation_error() -> Result<()> {
         // Pick a memory limit and sort_spill_reservation that make the first batch reservation fail.
-        // These values assume that the ExternalSorter will reserve 800 bytes for the first batch.
-        let expected_batch_reservation = 800;
         let merge_reservation: usize = 0; // Set to 0 for simplicity
-        let memory_limit: usize = expected_batch_reservation + merge_reservation - 1; // Just short of what we need
 
         let session_config =
             SessionConfig::new().with_sort_spill_reservation_bytes(merge_reservation);
+
+        let plan = test::scan_partitioned(1);
+
+        // Read the first record batch to determine the actual memory requirement
+        let expected_batch_reservation = {
+            let temp_ctx = Arc::new(TaskContext::default());
+            let mut stream = plan.execute(0, Arc::clone(&temp_ctx))?;
+            let first_batch = stream.next().await.unwrap()?;
+            get_reserved_bytes_for_record_batch(&first_batch)?
+        };
+
+        // Set memory limit just short of what we need
+        let memory_limit: usize = expected_batch_reservation + merge_reservation - 1;
+
         let runtime = RuntimeEnvBuilder::new()
             .with_memory_limit(memory_limit, 1.0)
             .build_arc()?;
@@ -1504,32 +1704,22 @@ mod tests {
                 .with_runtime(runtime),
         );
 
-        let plan = test::scan_partitioned(1);
-
-        // Read the first record batch to assert that our memory limit and sort_spill_reservation
-        // settings trigger the test scenario.
+        // Verify that our memory limit is insufficient
         {
             let mut stream = plan.execute(0, Arc::clone(&task_ctx))?;
             let first_batch = stream.next().await.unwrap()?;
-            let batch_reservation = get_reserved_byte_for_record_batch(&first_batch);
+            let batch_reservation = get_reserved_bytes_for_record_batch(&first_batch)?;
 
             assert_eq!(batch_reservation, expected_batch_reservation);
             assert!(memory_limit < (merge_reservation + batch_reservation));
         }
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
-                expr: col("i", &plan.schema())?,
-                options: SortOptions::default(),
-            }]),
+            [PhysicalSortExpr::new_default(col("i", &plan.schema())?)].into(),
             plan,
         ));
 
-        let result = collect(
-            Arc::clone(&sort_exec) as Arc<dyn ExecutionPlan>,
-            Arc::clone(&task_ctx),
-        )
-        .await;
+        let result = collect(Arc::clone(&sort_exec) as _, Arc::clone(&task_ctx)).await;
 
         let err = result.unwrap_err();
         assert!(
@@ -1543,6 +1733,21 @@ mod tests {
             "Assertion failed: expected a ResourcesExhausted error, but got: {err:?}"
         );
 
+        // Verify external sorter error message when resource is exhausted
+        let config_vector = vec![
+            "datafusion.runtime.memory_limit",
+            "datafusion.execution.sort_spill_reservation_bytes",
+        ];
+        let error_message = err.message().to_string();
+        for config in config_vector.into_iter() {
+            assert!(
+                error_message.as_str().contains(config),
+                "Config: '{}' should be contained in error message: {}.",
+                config,
+                error_message.as_str()
+            );
+        }
+
         Ok(())
     }
 
@@ -1563,23 +1768,20 @@ mod tests {
 
         // The input has 200 partitions, each partition has a batch containing 100 rows.
         // Each row has a single Utf8 column, the Utf8 string values are roughly 42 bytes.
-        // The total size of the input is roughly 8.4 KB.
+        // The total size of the input is roughly 820 KB.
         let input = test::scan_partitioned_utf8(200);
         let schema = input.schema();
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("i", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             Arc::new(CoalescePartitionsExec::new(input)),
         ));
 
-        let result = collect(
-            Arc::clone(&sort_exec) as Arc<dyn ExecutionPlan>,
-            Arc::clone(&task_ctx),
-        )
-        .await?;
+        let result = collect(Arc::clone(&sort_exec) as _, Arc::clone(&task_ctx)).await?;
 
         let num_rows = result.iter().map(|batch| batch.num_rows()).sum::<usize>();
         assert_eq!(num_rows, 20000);
@@ -1668,20 +1870,18 @@ mod tests {
 
             let sort_exec = Arc::new(
                 SortExec::new(
-                    LexOrdering::new(vec![PhysicalSortExpr {
+                    [PhysicalSortExpr {
                         expr: col("i", &schema)?,
                         options: SortOptions::default(),
-                    }]),
+                    }]
+                    .into(),
                     Arc::new(CoalescePartitionsExec::new(csv)),
                 )
                 .with_fetch(fetch),
             );
 
-            let result = collect(
-                Arc::clone(&sort_exec) as Arc<dyn ExecutionPlan>,
-                Arc::clone(&task_ctx),
-            )
-            .await?;
+            let result =
+                collect(Arc::clone(&sort_exec) as _, Arc::clone(&task_ctx)).await?;
             assert_eq!(result.len(), 1);
 
             let metrics = sort_exec.metrics().unwrap();
@@ -1691,6 +1891,93 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_sort_memory_reduction_per_batch() -> Result<()> {
+        // This test verifies that memory reservation is reduced for every batch emitted
+        // during the sort process. This is important to ensure we don't hold onto
+        // memory longer than necessary.
+
+        // Create a large enough batch that will be split into multiple output batches
+        let batch_size = 50; // Small batch size to force multiple output batches
+        let num_rows = 1000; // Create enough data for multiple batches
+
+        let task_ctx = Arc::new(
+            TaskContext::default().with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(batch_size)
+                    .with_sort_in_place_threshold_bytes(usize::MAX), // Ensure we don't concat batches
+            ),
+        );
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create unsorted data
+        let mut values: Vec<i32> = (0..num_rows).collect();
+        values.reverse();
+
+        let input_batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )?;
+
+        let batches = vec![input_batch];
+
+        let sort_exec = Arc::new(SortExec::new(
+            [PhysicalSortExpr {
+                expr: Arc::new(Column::new("a", 0)),
+                options: SortOptions::default(),
+            }]
+            .into(),
+            TestMemoryExec::try_new_exec(
+                std::slice::from_ref(&batches),
+                Arc::clone(&schema),
+                None,
+            )?,
+        ));
+
+        let mut stream = sort_exec.execute(0, Arc::clone(&task_ctx))?;
+
+        let mut previous_reserved = task_ctx.runtime_env().memory_pool.reserved();
+        let mut batch_count = 0;
+
+        // Collect batches and verify memory is reduced with each batch
+        while let Some(result) = stream.next().await {
+            let batch = result?;
+            batch_count += 1;
+
+            // Verify we got a non-empty batch
+            assert!(batch.num_rows() > 0, "Batch should not be empty");
+
+            let current_reserved = task_ctx.runtime_env().memory_pool.reserved();
+
+            // After the first batch, memory should be reducing or staying the same
+            // (it should not increase as we emit batches)
+            if batch_count > 1 {
+                assert!(
+                    current_reserved <= previous_reserved,
+                    "Memory reservation should decrease or stay same as batches are emitted. \
+                     Batch {batch_count}: previous={previous_reserved}, current={current_reserved}"
+                );
+            }
+
+            previous_reserved = current_reserved;
+        }
+
+        assert!(
+            batch_count > 1,
+            "Expected multiple batches to be emitted, got {batch_count}"
+        );
+
+        // Verify all memory is returned at the end
+        assert_eq!(
+            task_ctx.runtime_env().memory_pool.reserved(),
+            0,
+            "All memory should be returned after consuming all batches"
+        );
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_sort_metadata() -> Result<()> {
         let task_ctx = Arc::new(TaskContext::default());
@@ -1711,16 +1998,16 @@ mod tests {
         let data: ArrayRef =
             Arc::new(vec![3, 2, 1].into_iter().map(Some).collect::<UInt64Array>());
 
-        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data]).unwrap();
+        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data])?;
         let input =
-            TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)
-                .unwrap();
+            TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?;
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("field_name", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             input,
         ));
 
@@ -1729,7 +2016,7 @@ mod tests {
         let expected_data: ArrayRef =
             Arc::new(vec![1, 2, 3].into_iter().map(Some).collect::<UInt64Array>());
         let expected_batch =
-            RecordBatch::try_new(Arc::clone(&schema), vec![expected_data]).unwrap();
+            RecordBatch::try_new(Arc::clone(&schema), vec![expected_data])?;
 
         // Data is correct
         assert_eq!(&vec![expected_batch], &result);
@@ -1768,7 +2055,7 @@ mod tests {
         )?;
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: SortOptions {
@@ -1783,7 +2070,8 @@ mod tests {
                         nulls_first: false,
                     },
                 },
-            ]),
+            ]
+            .into(),
             TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?,
         ));
 
@@ -1854,7 +2142,7 @@ mod tests {
         )?;
 
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![
+            [
                 PhysicalSortExpr {
                     expr: col("a", &schema)?,
                     options: SortOptions {
@@ -1869,7 +2157,8 @@ mod tests {
                         nulls_first: false,
                     },
                 },
-            ]),
+            ]
+            .into(),
             TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?,
         ));
 
@@ -1933,10 +2222,11 @@ mod tests {
         let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1));
         let refs = blocking_exec.refs();
         let sort_exec = Arc::new(SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("a", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             blocking_exec,
         ));
 
@@ -1964,12 +2254,13 @@ mod tests {
             RecordBatch::try_new_with_options(Arc::clone(&schema), vec![], &options)
                 .unwrap();
 
-        let expressions = LexOrdering::new(vec![PhysicalSortExpr {
+        let expressions = [PhysicalSortExpr {
             expr: Arc::new(Literal::new(ScalarValue::Int64(Some(1)))),
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
 
-        let result = sort_batch(&batch, expressions.as_ref(), None).unwrap();
+        let result = sort_batch(&batch, &expressions, None).unwrap();
         assert_eq!(result.num_rows(), 1);
     }
 
@@ -1980,32 +2271,596 @@ mod tests {
         let source = SortedUnboundedExec {
             schema: schema.clone(),
             batch_size: 2,
-            cache: SortedUnboundedExec::compute_properties(Arc::new(schema.clone())),
+            cache: Arc::new(SortedUnboundedExec::compute_properties(Arc::new(
+                schema.clone(),
+            ))),
         };
         let mut plan = SortExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr::new_default(Arc::new(Column::new(
+            [PhysicalSortExpr::new_default(Arc::new(Column::new(
                 "c1", 0,
-            )))]),
+            )))]
+            .into(),
             Arc::new(source),
         );
         plan = plan.with_fetch(Some(9));
 
         let batches = collect(Arc::new(plan), task_ctx).await?;
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+
-            | c1 |
-            +----+
-            | 0  |
-            | 1  |
-            | 2  |
-            | 3  |
-            | 4  |
-            | 5  |
-            | 6  |
-            | 7  |
-            | 8  |
-            +----+
-            "#);
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+
+        | c1 |
+        +----+
+        | 0  |
+        | 1  |
+        | 2  |
+        | 3  |
+        | 4  |
+        | 5  |
+        | 6  |
+        | 7  |
+        | 8  |
+        +----+
+        ");
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn should_return_stream_with_batches_in_the_requested_size() -> Result<()> {
+        let batch_size = 100;
+
+        let create_task_ctx = |_: &[RecordBatch]| {
+            TaskContext::default().with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(batch_size)
+                    .with_sort_in_place_threshold_bytes(usize::MAX),
+            )
+        };
+
+        // Smaller than batch size and require more than a single batch to get the requested batch size
+        test_sort_output_batch_size(10, batch_size / 4, create_task_ctx).await?;
+
+        // Not evenly divisible by batch size
+        test_sort_output_batch_size(10, batch_size + 7, create_task_ctx).await?;
+
+        // Evenly divisible by batch size and is larger than 2 output batches
+        test_sort_output_batch_size(10, batch_size * 3, create_task_ctx).await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn should_return_stream_with_batches_in_the_requested_size_when_sorting_in_place()
+    -> Result<()> {
+        let batch_size = 100;
+
+        let create_task_ctx = |_: &[RecordBatch]| {
+            TaskContext::default().with_session_config(
+                SessionConfig::new()
+                    .with_batch_size(batch_size)
+                    .with_sort_in_place_threshold_bytes(usize::MAX - 1),
+            )
+        };
+
+        // Smaller than batch size and require more than a single batch to get the requested batch size
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size / 4, create_task_ctx).await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        // Not evenly divisible by batch size
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size + 7, create_task_ctx).await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        // Evenly divisible by batch size and is larger than 2 output batches
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size * 3, create_task_ctx).await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn should_return_stream_with_batches_in_the_requested_size_when_having_a_single_batch()
+    -> Result<()> {
+        let batch_size = 100;
+
+        let create_task_ctx = |_: &[RecordBatch]| {
+            TaskContext::default()
+                .with_session_config(SessionConfig::new().with_batch_size(batch_size))
+        };
+
+        // Smaller than batch size and require more than a single batch to get the requested batch size
+        {
+            let metrics = test_sort_output_batch_size(
+                // Single batch
+                1,
+                batch_size / 4,
+                create_task_ctx,
+            )
+            .await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        // Not evenly divisible by batch size
+        {
+            let metrics = test_sort_output_batch_size(
+                // Single batch
+                1,
+                batch_size + 7,
+                create_task_ctx,
+            )
+            .await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        // Evenly divisible by batch size and is larger than 2 output batches
+        {
+            let metrics = test_sort_output_batch_size(
+                // Single batch
+                1,
+                batch_size * 3,
+                create_task_ctx,
+            )
+            .await?;
+
+            assert_eq!(
+                metrics.spill_count(),
+                Some(0),
+                "Expected no spills when sorting in place"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn should_return_stream_with_batches_in_the_requested_size_when_having_to_spill()
+    -> Result<()> {
+        let batch_size = 100;
+
+        let create_task_ctx = |generated_batches: &[RecordBatch]| {
+            let batches_memory = generated_batches
+                .iter()
+                .map(|b| b.get_array_memory_size())
+                .sum::<usize>();
+
+            TaskContext::default()
+                .with_session_config(
+                    SessionConfig::new()
+                        .with_batch_size(batch_size)
+                        // To make sure there is no in place sorting
+                        .with_sort_in_place_threshold_bytes(1)
+                        .with_sort_spill_reservation_bytes(1),
+                )
+                .with_runtime(
+                    RuntimeEnvBuilder::default()
+                        .with_memory_limit(batches_memory, 1.0)
+                        .build_arc()
+                        .unwrap(),
+                )
+        };
+
+        // Smaller than batch size and require more than a single batch to get the requested batch size
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size / 4, create_task_ctx).await?;
+
+            assert_ne!(metrics.spill_count().unwrap(), 0, "expected to spill");
+        }
+
+        // Not evenly divisible by batch size
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size + 7, create_task_ctx).await?;
+
+            assert_ne!(metrics.spill_count().unwrap(), 0, "expected to spill");
+        }
+
+        // Evenly divisible by batch size and is larger than 2 batches
+        {
+            let metrics =
+                test_sort_output_batch_size(10, batch_size * 3, create_task_ctx).await?;
+
+            assert_ne!(metrics.spill_count().unwrap(), 0, "expected to spill");
+        }
+
+        Ok(())
+    }
+
+    async fn test_sort_output_batch_size(
+        number_of_batches: usize,
+        batch_size_to_generate: usize,
+        create_task_ctx: impl Fn(&[RecordBatch]) -> TaskContext,
+    ) -> Result<MetricsSet> {
+        let batches = (0..number_of_batches)
+            .map(|_| make_partition(batch_size_to_generate as i32))
+            .collect::<Vec<_>>();
+        let task_ctx = create_task_ctx(batches.as_slice());
+
+        let expected_batch_size = task_ctx.session_config().batch_size();
+
+        let (mut output_batches, metrics) =
+            run_sort_on_input(task_ctx, "i", batches).await?;
+
+        let last_batch = output_batches.pop().unwrap();
+
+        for batch in output_batches {
+            assert_eq!(batch.num_rows(), expected_batch_size);
+        }
+
+        let mut last_expected_batch_size =
+            (batch_size_to_generate * number_of_batches) % expected_batch_size;
+        if last_expected_batch_size == 0 {
+            last_expected_batch_size = expected_batch_size;
+        }
+        assert_eq!(last_batch.num_rows(), last_expected_batch_size);
+
+        Ok(metrics)
+    }
+
+    async fn run_sort_on_input(
+        task_ctx: TaskContext,
+        order_by_col: &str,
+        batches: Vec<RecordBatch>,
+    ) -> Result<(Vec<RecordBatch>, MetricsSet)> {
+        let task_ctx = Arc::new(task_ctx);
+
+        // let task_ctx = env.
+        let schema = batches[0].schema();
+        let ordering: LexOrdering = [PhysicalSortExpr {
+            expr: col(order_by_col, &schema)?,
+            options: SortOptions {
+                descending: false,
+                nulls_first: true,
+            },
+        }]
+        .into();
+        let sort_exec: Arc<dyn ExecutionPlan> = Arc::new(SortExec::new(
+            ordering.clone(),
+            TestMemoryExec::try_new_exec(std::slice::from_ref(&batches), schema, None)?,
+        ));
+
+        let sorted_batches =
+            collect(Arc::clone(&sort_exec), Arc::clone(&task_ctx)).await?;
+
+        let metrics = sort_exec.metrics().expect("sort have metrics");
+
+        // assert output
+        {
+            let input_batches_concat = concat_batches(batches[0].schema_ref(), &batches)?;
+            let sorted_input_batch = sort_batch(&input_batches_concat, &ordering, None)?;
+
+            let sorted_batches_concat =
+                concat_batches(sorted_batches[0].schema_ref(), &sorted_batches)?;
+
+            assert_eq!(sorted_input_batch, sorted_batches_concat);
+        }
+
+        Ok((sorted_batches, metrics))
+    }
+
+    #[tokio::test]
+    async fn test_sort_batch_chunked_basic() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create a batch with 1000 rows
+        let mut values: Vec<i32> = (0..1000).collect();
+        // Shuffle to make it unsorted
+        values.reverse();
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )?;
+
+        let expressions: LexOrdering =
+            [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into();
+
+        // Sort with batch_size = 250
+        let result_batches = sort_batch_chunked(&batch, &expressions, 250)?;
+
+        // Verify 4 batches are returned
+        assert_eq!(result_batches.len(), 4);
+
+        // Verify each batch has <= 250 rows
+        let mut total_rows = 0;
+        for (i, batch) in result_batches.iter().enumerate() {
+            assert!(
+                batch.num_rows() <= 250,
+                "Batch {} has {} rows, expected <= 250",
+                i,
+                batch.num_rows()
+            );
+            total_rows += batch.num_rows();
+        }
+
+        // Verify total row count matches input
+        assert_eq!(total_rows, 1000);
+
+        // Verify data is correctly sorted across all chunks
+        let concatenated = concat_batches(&schema, &result_batches)?;
+        let array = as_primitive_array::<Int32Type>(concatenated.column(0))?;
+        for i in 0..array.len() - 1 {
+            assert!(
+                array.value(i) <= array.value(i + 1),
+                "Array not sorted at position {}: {} > {}",
+                i,
+                array.value(i),
+                array.value(i + 1)
+            );
+        }
+        assert_eq!(array.value(0), 0);
+        assert_eq!(array.value(array.len() - 1), 999);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_sort_batch_chunked_smaller_than_batch_size() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create a batch with 50 rows
+        let values: Vec<i32> = (0..50).rev().collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )?;
+
+        let expressions: LexOrdering =
+            [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into();
+
+        // Sort with batch_size = 100
+        let result_batches = sort_batch_chunked(&batch, &expressions, 100)?;
+
+        // Should return exactly 1 batch
+        assert_eq!(result_batches.len(), 1);
+        assert_eq!(result_batches[0].num_rows(), 50);
+
+        // Verify it's correctly sorted
+        let array = as_primitive_array::<Int32Type>(result_batches[0].column(0))?;
+        for i in 0..array.len() - 1 {
+            assert!(array.value(i) <= array.value(i + 1));
+        }
+        assert_eq!(array.value(0), 0);
+        assert_eq!(array.value(49), 49);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_sort_batch_chunked_exact_multiple() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create a batch with 1000 rows
+        let values: Vec<i32> = (0..1000).rev().collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(values))],
+        )?;
+
+        let expressions: LexOrdering =
+            [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into();
+
+        // Sort with batch_size = 100
+        let result_batches = sort_batch_chunked(&batch, &expressions, 100)?;
+
+        // Should return exactly 10 batches of 100 rows each
+        assert_eq!(result_batches.len(), 10);
+        for batch in &result_batches {
+            assert_eq!(batch.num_rows(), 100);
+        }
+
+        // Verify sorted correctly across all batches
+        let concatenated = concat_batches(&schema, &result_batches)?;
+        let array = as_primitive_array::<Int32Type>(concatenated.column(0))?;
+        for i in 0..array.len() - 1 {
+            assert!(array.value(i) <= array.value(i + 1));
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_sort_batch_chunked_empty_batch() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        let batch = RecordBatch::new_empty(Arc::clone(&schema));
+
+        let expressions: LexOrdering =
+            [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into();
+
+        let result_batches = sort_batch_chunked(&batch, &expressions, 100)?;
+
+        // Empty input produces no output batches (0 chunks)
+        assert_eq!(result_batches.len(), 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_get_reserved_bytes_for_record_batch_with_sliced_batches() -> Result<()>
+    {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create a larger batch then slice it
+        let large_array = Int32Array::from((0..1000).collect::<Vec<i32>>());
+        let sliced_array = large_array.slice(100, 50); // Take 50 elements starting at 100
+
+        let sliced_batch =
+            RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(sliced_array)])?;
+        let batch =
+            RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(large_array)])?;
+
+        let sliced_reserved = get_reserved_bytes_for_record_batch(&sliced_batch)?;
+        let reserved = get_reserved_bytes_for_record_batch(&batch)?;
+
+        // The reserved memory for the sliced batch should be less than that of the full batch
+        assert!(reserved > sliced_reserved);
+
+        Ok(())
+    }
+
+    /// Verifies that `ExternalSorter::sort()` transfers the pre-reserved
+    /// merge bytes to the merge stream via `take()`, rather than leaving
+    /// them in the sorter (via `new_empty()`).
+    ///
+    /// 1. Create a sorter with a tight memory pool and insert enough data
+    ///    to force spilling
+    /// 2. Verify `merge_reservation` holds the pre-reserved bytes before sort
+    /// 3. Call `sort()` to get the merge stream
+    /// 4. Verify `merge_reservation` is now 0 (bytes transferred to merge stream)
+    /// 5. Simulate contention: a competing consumer grabs all available pool memory
+    /// 6. Verify the merge stream still works (it uses its pre-reserved bytes
+    ///    as initial budget, not requesting from pool starting at 0)
+    ///
+    /// With `new_empty()` (before fix), step 4 fails: `merge_reservation`
+    /// still holds the bytes, the merge stream starts with 0 budget, and
+    /// those bytes become unaccounted-for reserved memory that nobody uses.
+    #[tokio::test]
+    async fn test_sort_merge_reservation_transferred_not_freed() -> Result<()> {
+        use datafusion_execution::memory_pool::{
+            GreedyMemoryPool, MemoryConsumer, MemoryPool,
+        };
+        use futures::TryStreamExt;
+
+        let sort_spill_reservation_bytes: usize = 10 * 1024; // 10 KB
+
+        // Pool: merge reservation (10KB) + enough room for sort to work.
+        // The room must accommodate batch data accumulation before spilling.
+        let sort_working_memory: usize = 40 * 1024; // 40 KB for sort operations
+        let pool_size = sort_spill_reservation_bytes + sort_working_memory;
+        let pool: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(pool_size));
+
+        let runtime = RuntimeEnvBuilder::new()
+            .with_memory_pool(Arc::clone(&pool))
+            .build_arc()?;
+
+        let metrics_set = ExecutionPlanMetricsSet::new();
+        let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
+
+        let mut sorter = ExternalSorter::new(
+            0,
+            Arc::clone(&schema),
+            [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(),
+            128, // batch_size
+            sort_spill_reservation_bytes,
+            usize::MAX, // sort_in_place_threshold_bytes (high to avoid concat path)
+            SpillCompression::Uncompressed,
+            &metrics_set,
+            Arc::clone(&runtime),
+        )?;
+
+        // Insert enough data to force spilling.
+        let num_batches = 200;
+        for i in 0..num_batches {
+            let values: Vec<i32> = ((i * 100)..((i + 1) * 100)).rev().collect();
+            let batch = RecordBatch::try_new(
+                Arc::clone(&schema),
+                vec![Arc::new(Int32Array::from(values))],
+            )?;
+            sorter.insert_batch(batch).await?;
+        }
+
+        assert!(
+            sorter.spilled_before(),
+            "Test requires spilling to exercise the merge path"
+        );
+
+        // Before sort(), merge_reservation holds sort_spill_reservation_bytes.
+        assert!(
+            sorter.merge_reservation_size() >= sort_spill_reservation_bytes,
+            "merge_reservation should hold the pre-reserved bytes before sort()"
+        );
+
+        // Call sort() to get the merge stream. With the fix (take()),
+        // the pre-reserved merge bytes are transferred to the merge
+        // stream. Without the fix (free() + new_empty()), the bytes
+        // are released back to the pool and the merge stream starts
+        // with 0 bytes.
+        let merge_stream = sorter.sort().await?;
+
+        // THE KEY ASSERTION: after sort(), merge_reservation must be 0.
+        // This proves take() transferred the bytes to the merge stream,
+        // rather than them being freed back to the pool where other
+        // partitions could steal them.
+        assert_eq!(
+            sorter.merge_reservation_size(),
+            0,
+            "After sort(), merge_reservation should be 0 (bytes transferred \
+             to merge stream via take()). If non-zero, the bytes are still \
+             held by the sorter and will be freed on drop, allowing other \
+             partitions to steal them."
+        );
+
+        // Drop the sorter to free its reservations back to the pool.
+        drop(sorter);
+
+        // Simulate contention: another partition grabs ALL available
+        // pool memory. If the merge stream didn't receive the
+        // pre-reserved bytes via take(), it will fail when it tries
+        // to allocate memory for reading spill files.
+        let contender = MemoryConsumer::new("CompetingPartition").register(&pool);
+        let available = pool_size.saturating_sub(pool.reserved());
+        if available > 0 {
+            contender.try_grow(available).unwrap();
+        }
+
+        // The merge stream must still produce correct results despite
+        // the pool being fully consumed by the contender. This only
+        // works if sort() transferred the pre-reserved bytes to the
+        // merge stream (via take()) rather than freeing them.
+        let batches: Vec<RecordBatch> = merge_stream.try_collect().await?;
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(
+            total_rows,
+            (num_batches * 100) as usize,
+            "Merge stream should produce all rows even under memory contention"
+        );
+
+        // Verify data is sorted
+        let merged = concat_batches(&schema, &batches)?;
+        let col = merged.column(0).as_primitive::<Int32Type>();
+        for i in 1..col.len() {
+            assert!(
+                col.value(i - 1) <= col.value(i),
+                "Output should be sorted, but found {} > {} at index {}",
+                col.value(i - 1),
+                col.value(i),
+                i
+            );
+        }
+
+        drop(contender);
         Ok(())
     }
 }
diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
index 6930473360f07..b1ee5b4d5e8da 100644
--- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
+++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
@@ -23,19 +23,22 @@ use std::sync::Arc;
 use crate::common::spawn_buffered;
 use crate::limit::LimitStream;
 use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
-use crate::projection::{make_with_child, update_expr, ProjectionExec};
+use crate::projection::{ProjectionExec, make_with_child, update_ordering};
 use crate::sorts::streaming_merge::StreamingMergeBuilder;
 use crate::{
     DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
     Partitioning, PlanProperties, SendableRecordBatchStream, Statistics,
+    check_if_same_properties,
 };
 
-use datafusion_common::{internal_err, Result};
-use datafusion_execution::memory_pool::MemoryConsumer;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_execution::memory_pool::MemoryConsumer;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, OrderingRequirements};
 
+use crate::execution_plan::{EvaluationType, SchedulingType};
 use log::{debug, trace};
 
 /// Sort preserving merge execution plan
@@ -93,7 +96,7 @@ pub struct SortPreservingMergeExec {
     /// Optional number of rows to fetch. Stops producing rows after this fetch
     fetch: Option<usize>,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     /// Use round-robin selection of tied winners of loser tree
     ///
     /// See [`Self::with_round_robin_repartition`] for more information.
@@ -109,7 +112,7 @@ impl SortPreservingMergeExec {
             expr,
             metrics: ExecutionPlanMetricsSet::new(),
             fetch: None,
-            cache,
+            cache: Arc::new(cache),
             enable_round_robin_repartition: true,
         }
     }
@@ -144,7 +147,7 @@ impl SortPreservingMergeExec {
 
     /// Sort expressions
     pub fn expr(&self) -> &LexOrdering {
-        self.expr.as_ref()
+        &self.expr
     }
 
     /// Fetch
@@ -158,15 +161,38 @@ impl SortPreservingMergeExec {
         input: &Arc<dyn ExecutionPlan>,
         ordering: LexOrdering,
     ) -> PlanProperties {
+        let input_partitions = input.output_partitioning().partition_count();
+        let (drive, scheduling) = if input_partitions > 1 {
+            (EvaluationType::Eager, SchedulingType::Cooperative)
+        } else {
+            (
+                input.properties().evaluation_type,
+                input.properties().scheduling_type,
+            )
+        };
+
         let mut eq_properties = input.equivalence_properties().clone();
         eq_properties.clear_per_partition_constants();
-        eq_properties.add_new_orderings(vec![ordering]);
+        eq_properties.add_ordering(ordering);
         PlanProperties::new(
             eq_properties,                        // Equivalence Properties
             Partitioning::UnknownPartitioning(1), // Output Partitioning
             input.pipeline_behavior(),            // Pipeline Behavior
             input.boundedness(),                  // Boundedness
         )
+        .with_evaluation_type(drive)
+        .with_scheduling_type(scheduling)
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
     }
 }
 
@@ -186,15 +212,16 @@ impl DisplayAs for SortPreservingMergeExec {
                 Ok(())
             }
             DisplayFormatType::TreeRender => {
+                if let Some(fetch) = self.fetch {
+                    writeln!(f, "limit={fetch}")?;
+                };
+
                 for (i, e) in self.expr().iter().enumerate() {
                     e.fmt_sql(f)?;
                     if i != self.expr().len() - 1 {
                         write!(f, ", ")?;
                     }
                 }
-                if let Some(fetch) = self.fetch {
-                    writeln!(f, "limit={fetch}")?;
-                };
 
                 Ok(())
             }
@@ -212,7 +239,7 @@ impl ExecutionPlan for SortPreservingMergeExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -227,11 +254,24 @@ impl ExecutionPlan for SortPreservingMergeExec {
             expr: self.expr.clone(),
             metrics: self.metrics.clone(),
             fetch: limit,
-            cache: self.cache.clone(),
+            cache: Arc::clone(&self.cache),
             enable_round_robin_repartition: true,
         }))
     }
 
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
     fn required_input_distribution(&self) -> Vec<Distribution> {
         vec![Distribution::UnspecifiedDistribution]
     }
@@ -240,8 +280,8 @@ impl ExecutionPlan for SortPreservingMergeExec {
         vec![false]
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
-        vec![Some(LexRequirement::from(self.expr.clone()))]
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
+        vec![Some(OrderingRequirements::from(self.expr.clone()))]
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -252,12 +292,24 @@ impl ExecutionPlan for SortPreservingMergeExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        let mut tnr = TreeNodeRecursion::Continue;
+        for sort_expr in &self.expr {
+            tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(
-            SortPreservingMergeExec::new(self.expr.clone(), Arc::clone(&children[0]))
+            SortPreservingMergeExec::new(self.expr.clone(), children.swap_remove(0))
                 .with_fetch(self.fetch),
         ))
     }
@@ -268,11 +320,11 @@ impl ExecutionPlan for SortPreservingMergeExec {
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
         trace!("Start SortPreservingMergeExec::execute for partition: {partition}");
-        if 0 != partition {
-            return internal_err!(
-                "SortPreservingMergeExec invalid partition {partition}"
-            );
-        }
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "SortPreservingMergeExec invalid partition {partition}"
+        );
 
         let input_partitions = self.input.output_partitioning().partition_count();
         trace!(
@@ -291,7 +343,9 @@ impl ExecutionPlan for SortPreservingMergeExec {
             1 => match self.fetch {
                 Some(fetch) => {
                     let stream = self.input.execute(0, context)?;
-                    debug!("Done getting stream for SortPreservingMergeExec::execute with 1 input with {fetch}");
+                    debug!(
+                        "Done getting stream for SortPreservingMergeExec::execute with 1 input with {fetch}"
+                    );
                     Ok(Box::pin(LimitStream::new(
                         stream,
                         0,
@@ -301,7 +355,9 @@ impl ExecutionPlan for SortPreservingMergeExec {
                 }
                 None => {
                     let stream = self.input.execute(0, context);
-                    debug!("Done getting stream for SortPreservingMergeExec::execute with 1 input without fetch");
+                    debug!(
+                        "Done getting stream for SortPreservingMergeExec::execute with 1 input without fetch"
+                    );
                     stream
                 }
             },
@@ -314,12 +370,14 @@ impl ExecutionPlan for SortPreservingMergeExec {
                     })
                     .collect::<Result<_>>()?;
 
-                debug!("Done setting up sender-receiver for SortPreservingMergeExec::execute");
+                debug!(
+                    "Done setting up sender-receiver for SortPreservingMergeExec::execute"
+                );
 
                 let result = StreamingMergeBuilder::new()
                     .with_streams(receivers)
                     .with_schema(schema)
-                    .with_expressions(self.expr.as_ref())
+                    .with_expressions(&self.expr)
                     .with_metrics(BaselineMetrics::new(&self.metrics, partition))
                     .with_batch_size(context.session_config().batch_size())
                     .with_fetch(self.fetch)
@@ -327,7 +385,9 @@ impl ExecutionPlan for SortPreservingMergeExec {
                     .with_round_robin_tie_breaker(self.enable_round_robin_repartition)
                     .build()?;
 
-                debug!("Got stream result from SortPreservingMergeStream::new_from_receivers");
+                debug!(
+                    "Got stream result from SortPreservingMergeStream::new_from_receivers"
+                );
 
                 Ok(result)
             }
@@ -338,11 +398,7 @@ impl ExecutionPlan for SortPreservingMergeExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Arc<Statistics>> {
         self.input.partition_statistics(None)
     }
 
@@ -362,17 +418,10 @@ impl ExecutionPlan for SortPreservingMergeExec {
             return Ok(None);
         }
 
-        let mut updated_exprs = LexOrdering::default();
-        for sort in self.expr() {
-            let Some(updated_expr) = update_expr(&sort.expr, projection.expr(), false)?
-            else {
-                return Ok(None);
-            };
-            updated_exprs.push(PhysicalSortExpr {
-                expr: updated_expr,
-                options: sort.options,
-            });
-        }
+        let Some(updated_exprs) = update_ordering(self.expr.clone(), projection.expr())?
+        else {
+            return Ok(None);
+        };
 
         Ok(Some(Arc::new(
             SortPreservingMergeExec::new(
@@ -386,14 +435,14 @@ impl ExecutionPlan for SortPreservingMergeExec {
 
 #[cfg(test)]
 mod tests {
+    use std::collections::HashSet;
     use std::fmt::Formatter;
     use std::pin::Pin;
     use std::sync::Mutex;
-    use std::task::{Context, Poll};
+    use std::task::{Context, Poll, Waker, ready};
     use std::time::Duration;
 
     use super::*;
-    use crate::coalesce_batches::CoalesceBatchesExec;
     use crate::coalesce_partitions::CoalescePartitionsExec;
     use crate::execution_plan::{Boundedness, EmissionType};
     use crate::expressions::col;
@@ -401,8 +450,8 @@ mod tests {
     use crate::repartition::RepartitionExec;
     use crate::sorts::sort::SortExec;
     use crate::stream::RecordBatchReceiverStream;
-    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
     use crate::test::TestMemoryExec;
+    use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero};
     use crate::test::{self, assert_is_pending, make_partition};
     use crate::{collect, common};
 
@@ -413,27 +462,30 @@ mod tests {
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{assert_batches_eq, assert_contains, DataFusionError};
+    use datafusion_common::{assert_batches_eq, exec_err};
     use datafusion_common_runtime::SpawnedTask;
+    use datafusion_execution::RecordBatchStream;
     use datafusion_execution::config::SessionConfig;
     use datafusion_execution::runtime_env::RuntimeEnvBuilder;
-    use datafusion_execution::RecordBatchStream;
-    use datafusion_physical_expr::expressions::Column;
     use datafusion_physical_expr::EquivalenceProperties;
+    use datafusion_physical_expr::expressions::Column;
     use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-
     use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
     use futures::{FutureExt, Stream, StreamExt};
     use insta::assert_snapshot;
     use tokio::time::timeout;
 
     // The number in the function is highly related to the memory limit we are testing
     // any change of the constant should be aware of
-    fn generate_task_ctx_for_round_robin_tie_breaker() -> Result<Arc<TaskContext>> {
+    fn generate_task_ctx_for_round_robin_tie_breaker(
+        target_batch_size: usize,
+    ) -> Result<Arc<TaskContext>> {
         let runtime = RuntimeEnvBuilder::new()
             .with_memory_limit(20_000_000, 1.0)
             .build_arc()?;
-        let config = SessionConfig::new();
+        let mut config = SessionConfig::new();
+        config.options_mut().execution.batch_size = target_batch_size;
         let task_ctx = TaskContext::default()
             .with_runtime(runtime)
             .with_session_config(config);
@@ -444,34 +496,31 @@ mod tests {
     fn generate_spm_for_round_robin_tie_breaker(
         enable_round_robin_repartition: bool,
     ) -> Result<Arc<SortPreservingMergeExec>> {
-        let target_batch_size = 12500;
         let row_size = 12500;
         let a: ArrayRef = Arc::new(Int32Array::from(vec![1; row_size]));
         let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("a"); row_size]));
         let c: ArrayRef = Arc::new(Int64Array::from_iter(vec![0; row_size]));
-        let rb = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap();
-
-        let rbs = (0..1024).map(|_| rb.clone()).collect::<Vec<_>>();
-
+        let rb = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)])?;
         let schema = rb.schema();
-        let sort = LexOrdering::new(vec![
+
+        let rbs = std::iter::repeat_n(rb, 1024).collect::<Vec<_>>();
+        let sort = [
             PhysicalSortExpr {
-                expr: col("b", &schema).unwrap(),
+                expr: col("b", &schema)?,
                 options: Default::default(),
             },
             PhysicalSortExpr {
-                expr: col("c", &schema).unwrap(),
+                expr: col("c", &schema)?,
                 options: Default::default(),
             },
-        ]);
+        ]
+        .into();
 
         let repartition_exec = RepartitionExec::try_new(
-            TestMemoryExec::try_new_exec(&[rbs], schema, None).unwrap(),
+            TestMemoryExec::try_new_exec(&[rbs], schema, None)?,
             Partitioning::RoundRobinBatch(2),
         )?;
-        let coalesce_batches_exec =
-            CoalesceBatchesExec::new(Arc::new(repartition_exec), target_batch_size);
-        let spm = SortPreservingMergeExec::new(sort, Arc::new(coalesce_batches_exec))
+        let spm = SortPreservingMergeExec::new(sort, Arc::new(repartition_exec))
             .with_round_robin_repartition(enable_round_robin_repartition);
         Ok(Arc::new(spm))
     }
@@ -483,9 +532,10 @@ mod tests {
     /// based on whether the tie breaker is enabled or disabled.
     #[tokio::test(flavor = "multi_thread")]
     async fn test_round_robin_tie_breaker_success() -> Result<()> {
-        let task_ctx = generate_task_ctx_for_round_robin_tie_breaker()?;
+        let target_batch_size = 12500;
+        let task_ctx = generate_task_ctx_for_round_robin_tie_breaker(target_batch_size)?;
         let spm = generate_spm_for_round_robin_tie_breaker(true)?;
-        let _collected = collect(spm, task_ctx).await.unwrap();
+        let _collected = collect(spm, task_ctx).await?;
         Ok(())
     }
 
@@ -496,7 +546,7 @@ mod tests {
     /// based on whether the tie breaker is enabled or disabled.
     #[tokio::test(flavor = "multi_thread")]
     async fn test_round_robin_tie_breaker_fail() -> Result<()> {
-        let task_ctx = generate_task_ctx_for_round_robin_tie_breaker()?;
+        let task_ctx = generate_task_ctx_for_round_robin_tie_breaker(8192)?;
         let spm = generate_spm_for_round_robin_tie_breaker(false)?;
         let _err = collect(spm, task_ctx).await.unwrap_err();
         Ok(())
@@ -550,30 +600,6 @@ mod tests {
         .await;
     }
 
-    #[tokio::test]
-    async fn test_merge_no_exprs() {
-        let task_ctx = Arc::new(TaskContext::default());
-        let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3]));
-        let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap();
-
-        let schema = batch.schema();
-        let sort = LexOrdering::default(); // no sort expressions
-        let exec = TestMemoryExec::try_new_exec(
-            &[vec![batch.clone()], vec![batch]],
-            schema,
-            None,
-        )
-        .unwrap();
-
-        let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
-
-        let res = collect(merge, task_ctx).await.unwrap_err();
-        assert_contains!(
-            res.to_string(),
-            "Internal error: Sort expressions cannot be empty for streaming merge"
-        );
-    }
-
     #[tokio::test]
     async fn test_merge_some_overlap() {
         let task_ctx = Arc::new(TaskContext::default());
@@ -741,7 +767,7 @@ mod tests {
         context: Arc<TaskContext>,
     ) {
         let schema = partitions[0][0].schema();
-        let sort = LexOrdering::new(vec![
+        let sort = [
             PhysicalSortExpr {
                 expr: col("b", &schema).unwrap(),
                 options: Default::default(),
@@ -750,7 +776,8 @@ mod tests {
                 expr: col("c", &schema).unwrap(),
                 options: Default::default(),
             },
-        ]);
+        ]
+        .into();
         let exec = TestMemoryExec::try_new_exec(partitions, schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
 
@@ -798,13 +825,14 @@ mod tests {
         let csv = test::scan_partitioned(partitions);
         let schema = csv.schema();
 
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("i", &schema).unwrap(),
+        let sort: LexOrdering = [PhysicalSortExpr {
+            expr: col("i", &schema)?,
             options: SortOptions {
                 descending: true,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
 
         let basic =
             basic_sort(Arc::clone(&csv), sort.clone(), Arc::clone(&task_ctx)).await;
@@ -859,17 +887,18 @@ mod tests {
         let sorted = basic_sort(csv, sort, context).await;
         let split: Vec<_> = sizes.iter().map(|x| split_batch(&sorted, *x)).collect();
 
-        Ok(TestMemoryExec::try_new_exec(&split, sorted.schema(), None).unwrap())
+        TestMemoryExec::try_new_exec(&split, sorted.schema(), None).map(|e| e as _)
     }
 
     #[tokio::test]
     async fn test_partition_sort_streaming_input() -> Result<()> {
         let task_ctx = Arc::new(TaskContext::default());
         let schema = make_partition(11).schema();
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("i", &schema).unwrap(),
+        let sort: LexOrdering = [PhysicalSortExpr {
+            expr: col("i", &schema)?,
             options: Default::default(),
-        }]);
+        }]
+        .into();
 
         let input =
             sorted_partitioned_input(sort.clone(), &[10, 3, 11], Arc::clone(&task_ctx))
@@ -881,12 +910,9 @@ mod tests {
         assert_eq!(basic.num_rows(), 1200);
         assert_eq!(partition.num_rows(), 1200);
 
-        let basic = arrow::util::pretty::pretty_format_batches(&[basic])
-            .unwrap()
-            .to_string();
-        let partition = arrow::util::pretty::pretty_format_batches(&[partition])
-            .unwrap()
-            .to_string();
+        let basic = arrow::util::pretty::pretty_format_batches(&[basic])?.to_string();
+        let partition =
+            arrow::util::pretty::pretty_format_batches(&[partition])?.to_string();
 
         assert_eq!(basic, partition);
 
@@ -896,10 +922,11 @@ mod tests {
     #[tokio::test]
     async fn test_partition_sort_streaming_input_output() -> Result<()> {
         let schema = make_partition(11).schema();
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("i", &schema).unwrap(),
+        let sort: LexOrdering = [PhysicalSortExpr {
+            expr: col("i", &schema)?,
             options: Default::default(),
-        }]);
+        }]
+        .into();
 
         // Test streaming with default batch size
         let task_ctx = Arc::new(TaskContext::default());
@@ -914,19 +941,14 @@ mod tests {
         let task_ctx = Arc::new(task_ctx);
 
         let merge = Arc::new(SortPreservingMergeExec::new(sort, input));
-        let merged = collect(merge, task_ctx).await.unwrap();
+        let merged = collect(merge, task_ctx).await?;
 
         assert_eq!(merged.len(), 53);
-
         assert_eq!(basic.num_rows(), 1200);
         assert_eq!(merged.iter().map(|x| x.num_rows()).sum::<usize>(), 1200);
 
-        let basic = arrow::util::pretty::pretty_format_batches(&[basic])
-            .unwrap()
-            .to_string();
-        let partition = arrow::util::pretty::pretty_format_batches(merged.as_slice())
-            .unwrap()
-            .to_string();
+        let basic = arrow::util::pretty::pretty_format_batches(&[basic])?.to_string();
+        let partition = arrow::util::pretty::pretty_format_batches(&merged)?.to_string();
 
         assert_eq!(basic, partition);
 
@@ -971,7 +993,7 @@ mod tests {
         let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap();
         let schema = b1.schema();
 
-        let sort = LexOrdering::new(vec![
+        let sort = [
             PhysicalSortExpr {
                 expr: col("b", &schema).unwrap(),
                 options: SortOptions {
@@ -986,7 +1008,8 @@ mod tests {
                     nulls_first: false,
                 },
             },
-        ]);
+        ]
+        .into();
         let exec =
             TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
@@ -994,22 +1017,22 @@ mod tests {
         let collected = collect(merge, task_ctx).await.unwrap();
         assert_eq!(collected.len(), 1);
 
-        assert_snapshot!(batches_to_string(collected.as_slice()), @r#"
-            +---+---+-------------------------------+
-            | a | b | c                             |
-            +---+---+-------------------------------+
-            | 1 |   | 1970-01-01T00:00:00.000000008 |
-            | 1 |   | 1970-01-01T00:00:00.000000008 |
-            | 2 | a |                               |
-            | 7 | b | 1970-01-01T00:00:00.000000006 |
-            | 2 | b |                               |
-            | 9 | d |                               |
-            | 3 | e | 1970-01-01T00:00:00.000000004 |
-            | 3 | g | 1970-01-01T00:00:00.000000005 |
-            | 4 | h |                               |
-            | 5 | i | 1970-01-01T00:00:00.000000004 |
-            +---+---+-------------------------------+
-            "#);
+        assert_snapshot!(batches_to_string(collected.as_slice()), @r"
+        +---+---+-------------------------------+
+        | a | b | c                             |
+        +---+---+-------------------------------+
+        | 1 |   | 1970-01-01T00:00:00.000000008 |
+        | 1 |   | 1970-01-01T00:00:00.000000008 |
+        | 2 | a |                               |
+        | 7 | b | 1970-01-01T00:00:00.000000006 |
+        | 2 | b |                               |
+        | 9 | d |                               |
+        | 3 | e | 1970-01-01T00:00:00.000000004 |
+        | 3 | g | 1970-01-01T00:00:00.000000005 |
+        | 4 | h |                               |
+        | 5 | i | 1970-01-01T00:00:00.000000004 |
+        +---+---+-------------------------------+
+        ");
     }
 
     #[tokio::test]
@@ -1020,13 +1043,14 @@ mod tests {
         let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap();
         let schema = batch.schema();
 
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort = [PhysicalSortExpr {
             expr: col("b", &schema).unwrap(),
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap();
         let merge =
             Arc::new(SortPreservingMergeExec::new(sort, exec).with_fetch(Some(2)));
@@ -1034,14 +1058,14 @@ mod tests {
         let collected = collect(merge, task_ctx).await.unwrap();
         assert_eq!(collected.len(), 1);
 
-        assert_snapshot!(batches_to_string(collected.as_slice()), @r#"
-            +---+---+
-            | a | b |
-            +---+---+
-            | 1 | a |
-            | 2 | b |
-            +---+---+
-            "#);
+        assert_snapshot!(batches_to_string(collected.as_slice()), @r"
+        +---+---+
+        | a | b |
+        +---+---+
+        | 1 | a |
+        | 2 | b |
+        +---+---+
+        ");
     }
 
     #[tokio::test]
@@ -1052,40 +1076,42 @@ mod tests {
         let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap();
         let schema = batch.schema();
 
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort = [PhysicalSortExpr {
             expr: col("b", &schema).unwrap(),
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
         let exec = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
 
         let collected = collect(merge, task_ctx).await.unwrap();
         assert_eq!(collected.len(), 1);
 
-        assert_snapshot!(batches_to_string(collected.as_slice()), @r#"
-            +---+---+
-            | a | b |
-            +---+---+
-            | 1 | a |
-            | 2 | b |
-            | 7 | c |
-            | 9 | d |
-            | 3 | e |
-            +---+---+
-            "#);
+        assert_snapshot!(batches_to_string(collected.as_slice()), @r"
+        +---+---+
+        | a | b |
+        +---+---+
+        | 1 | a |
+        | 2 | b |
+        | 7 | c |
+        | 9 | d |
+        | 3 | e |
+        +---+---+
+        ");
     }
 
     #[tokio::test]
     async fn test_async() -> Result<()> {
         let task_ctx = Arc::new(TaskContext::default());
         let schema = make_partition(11).schema();
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort: LexOrdering = [PhysicalSortExpr {
             expr: col("i", &schema).unwrap(),
             options: SortOptions::default(),
-        }]);
+        }]
+        .into();
 
         let batches =
             sorted_partitioned_input(sort.clone(), &[5, 7, 3], Arc::clone(&task_ctx))
@@ -1121,7 +1147,7 @@ mod tests {
         let merge_stream = StreamingMergeBuilder::new()
             .with_streams(streams)
             .with_schema(batches.schema())
-            .with_expressions(sort.as_ref())
+            .with_expressions(&sort)
             .with_metrics(BaselineMetrics::new(&metrics, 0))
             .with_batch_size(task_ctx.session_config().batch_size())
             .with_fetch(fetch)
@@ -1161,10 +1187,11 @@ mod tests {
         let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap();
 
         let schema = b1.schema();
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort = [PhysicalSortExpr {
             expr: col("b", &schema).unwrap(),
             options: Default::default(),
-        }]);
+        }]
+        .into();
         let exec =
             TestMemoryExec::try_new_exec(&[vec![b1], vec![b2]], schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
@@ -1172,16 +1199,16 @@ mod tests {
         let collected = collect(Arc::clone(&merge) as Arc<dyn ExecutionPlan>, task_ctx)
             .await
             .unwrap();
-        assert_snapshot!(batches_to_string(collected.as_slice()), @r#"
-            +----+---+
-            | a  | b |
-            +----+---+
-            | 1  | a |
-            | 10 | b |
-            | 2  | c |
-            | 20 | d |
-            +----+---+
-            "#);
+        assert_snapshot!(batches_to_string(collected.as_slice()), @r"
+        +----+---+
+        | a  | b |
+        +----+---+
+        | 1  | a |
+        | 10 | b |
+        | 2  | c |
+        | 20 | d |
+        +----+---+
+        ");
 
         // Now, validate metrics
         let metrics = merge.metrics().unwrap();
@@ -1220,10 +1247,11 @@ mod tests {
         let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 2));
         let refs = blocking_exec.refs();
         let sort_preserving_merge_exec = Arc::new(SortPreservingMergeExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr {
+            [PhysicalSortExpr {
                 expr: col("a", &schema)?,
                 options: SortOptions::default(),
-            }]),
+            }]
+            .into(),
             blocking_exec,
         ));
 
@@ -1268,13 +1296,14 @@ mod tests {
 
         let schema = partitions[0][0].schema();
 
-        let sort = LexOrdering::new(vec![PhysicalSortExpr {
+        let sort = [PhysicalSortExpr {
             expr: col("value", &schema).unwrap(),
             options: SortOptions {
                 descending: false,
                 nulls_first: true,
             },
-        }]);
+        }]
+        .into();
 
         let exec = TestMemoryExec::try_new_exec(&partitions, schema, None).unwrap();
         let merge = Arc::new(SortPreservingMergeExec::new(sort, exec));
@@ -1285,32 +1314,69 @@ mod tests {
         // Expect the data to be sorted first by "batch_number" (because
         // that was the order it was fed in, even though only "value"
         // is in the sort key)
-        assert_snapshot!(batches_to_string(collected.as_slice()), @r#"
-                +--------------+-------+
-                | batch_number | value |
-                +--------------+-------+
-                | 0            | A     |
-                | 1            | A     |
-                | 2            | A     |
-                | 3            | A     |
-                | 4            | A     |
-                | 5            | A     |
-                | 6            | A     |
-                | 7            | A     |
-                | 8            | A     |
-                | 9            | A     |
-                | 0            | B     |
-                | 1            | B     |
-                | 2            | B     |
-                | 3            | B     |
-                | 4            | B     |
-                | 5            | B     |
-                | 6            | B     |
-                | 7            | B     |
-                | 8            | B     |
-                | 9            | B     |
-                +--------------+-------+
-            "#);
+        assert_snapshot!(batches_to_string(collected.as_slice()), @r"
+        +--------------+-------+
+        | batch_number | value |
+        +--------------+-------+
+        | 0            | A     |
+        | 1            | A     |
+        | 2            | A     |
+        | 3            | A     |
+        | 4            | A     |
+        | 5            | A     |
+        | 6            | A     |
+        | 7            | A     |
+        | 8            | A     |
+        | 9            | A     |
+        | 0            | B     |
+        | 1            | B     |
+        | 2            | B     |
+        | 3            | B     |
+        | 4            | B     |
+        | 5            | B     |
+        | 6            | B     |
+        | 7            | B     |
+        | 8            | B     |
+        | 9            | B     |
+        +--------------+-------+
+        ");
+    }
+
+    #[derive(Debug)]
+    struct CongestionState {
+        wakers: Vec<Waker>,
+        unpolled_partitions: HashSet<usize>,
+    }
+
+    #[derive(Debug)]
+    struct Congestion {
+        congestion_state: Mutex<CongestionState>,
+    }
+
+    impl Congestion {
+        fn new(partition_count: usize) -> Self {
+            Congestion {
+                congestion_state: Mutex::new(CongestionState {
+                    wakers: vec![],
+                    unpolled_partitions: (0usize..partition_count).collect(),
+                }),
+            }
+        }
+
+        fn check_congested(&self, partition: usize, cx: &mut Context<'_>) -> Poll<()> {
+            let mut state = self.congestion_state.lock().unwrap();
+
+            state.unpolled_partitions.remove(&partition);
+
+            if state.unpolled_partitions.is_empty() {
+                state.wakers.iter().for_each(|w| w.wake_by_ref());
+                state.wakers.clear();
+                Poll::Ready(())
+            } else {
+                state.wakers.push(cx.waker().clone());
+                Poll::Pending
+            }
+        }
     }
 
     /// It returns pending for the 2nd partition until the 3rd partition is polled. The 1st
@@ -1318,8 +1384,8 @@ mod tests {
     #[derive(Debug, Clone)]
     struct CongestedExec {
         schema: Schema,
-        cache: PlanProperties,
-        congestion_cleared: Arc<Mutex<bool>>,
+        cache: Arc<PlanProperties>,
+        congestion: Arc<Congestion>,
     }
 
     impl CongestedExec {
@@ -1331,10 +1397,11 @@ mod tests {
                 .map(|(i, f)| Arc::new(Column::new(f.name(), i)) as Arc<dyn PhysicalExpr>)
                 .collect::<Vec<_>>();
             let mut eq_properties = EquivalenceProperties::new(schema);
-            eq_properties.add_new_orderings(vec![columns
-                .iter()
-                .map(|expr| PhysicalSortExpr::new_default(Arc::clone(expr)))
-                .collect::<LexOrdering>()]);
+            eq_properties.add_ordering(
+                columns
+                    .iter()
+                    .map(|expr| PhysicalSortExpr::new_default(Arc::clone(expr))),
+            );
             PlanProperties::new(
                 eq_properties,
                 Partitioning::Hash(columns, 3),
@@ -1353,12 +1420,18 @@ mod tests {
         fn as_any(&self) -> &dyn Any {
             self
         }
-        fn properties(&self) -> &PlanProperties {
+        fn properties(&self) -> &Arc<PlanProperties> {
             &self.cache
         }
         fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
             vec![]
         }
+        fn apply_expressions(
+            &self,
+            _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+        ) -> Result<TreeNodeRecursion> {
+            Ok(TreeNodeRecursion::Continue)
+        }
         fn with_new_children(
             self: Arc<Self>,
             _: Vec<Arc<dyn ExecutionPlan>>,
@@ -1373,7 +1446,7 @@ mod tests {
             Ok(Box::pin(CongestedStream {
                 schema: Arc::new(self.schema.clone()),
                 none_polled_once: false,
-                congestion_cleared: Arc::clone(&self.congestion_cleared),
+                congestion: Arc::clone(&self.congestion),
                 partition,
             }))
         }
@@ -1400,7 +1473,7 @@ mod tests {
     pub struct CongestedStream {
         schema: SchemaRef,
         none_polled_once: bool,
-        congestion_cleared: Arc<Mutex<bool>>,
+        congestion: Arc<Congestion>,
         partition: usize,
     }
 
@@ -1408,31 +1481,22 @@ mod tests {
         type Item = Result<RecordBatch>;
         fn poll_next(
             mut self: Pin<&mut Self>,
-            _cx: &mut Context<'_>,
+            cx: &mut Context<'_>,
         ) -> Poll<Option<Self::Item>> {
             match self.partition {
                 0 => {
+                    let _ = self.congestion.check_congested(self.partition, cx);
                     if self.none_polled_once {
-                        panic!("Exhausted stream is polled more than one")
+                        panic!("Exhausted stream is polled more than once")
                     } else {
                         self.none_polled_once = true;
                         Poll::Ready(None)
                     }
                 }
-                1 => {
-                    let cleared = self.congestion_cleared.lock().unwrap();
-                    if *cleared {
-                        Poll::Ready(None)
-                    } else {
-                        Poll::Pending
-                    }
-                }
-                2 => {
-                    let mut cleared = self.congestion_cleared.lock().unwrap();
-                    *cleared = true;
+                _ => {
+                    ready!(self.congestion.check_congested(self.partition, cx));
                     Poll::Ready(None)
                 }
-                _ => unreachable!(),
             }
         }
     }
@@ -1447,15 +1511,22 @@ mod tests {
     async fn test_spm_congestion() -> Result<()> {
         let task_ctx = Arc::new(TaskContext::default());
         let schema = Schema::new(vec![Field::new("c1", DataType::UInt64, false)]);
+        let properties = CongestedExec::compute_properties(Arc::new(schema.clone()));
+        let &partition_count = match properties.output_partitioning() {
+            Partitioning::RoundRobinBatch(partitions) => partitions,
+            Partitioning::Hash(_, partitions) => partitions,
+            Partitioning::UnknownPartitioning(partitions) => partitions,
+        };
         let source = CongestedExec {
             schema: schema.clone(),
-            cache: CongestedExec::compute_properties(Arc::new(schema.clone())),
-            congestion_cleared: Arc::new(Mutex::new(false)),
+            cache: Arc::new(properties),
+            congestion: Arc::new(Congestion::new(partition_count)),
         };
         let spm = SortPreservingMergeExec::new(
-            LexOrdering::new(vec![PhysicalSortExpr::new_default(Arc::new(Column::new(
+            [PhysicalSortExpr::new_default(Arc::new(Column::new(
                 "c1", 0,
-            )))]),
+            )))]
+            .into(),
             Arc::new(source),
         );
         let spm_task = SpawnedTask::spawn(collect(Arc::new(spm), task_ctx));
@@ -1464,12 +1535,8 @@ mod tests {
         match result {
             Ok(Ok(Ok(_batches))) => Ok(()),
             Ok(Ok(Err(e))) => Err(e),
-            Ok(Err(_)) => Err(DataFusionError::Execution(
-                "SortPreservingMerge task panicked or was cancelled".to_string(),
-            )),
-            Err(_) => Err(DataFusionError::Execution(
-                "SortPreservingMerge caused a deadlock".to_string(),
-            )),
+            Ok(Err(_)) => exec_err!("SortPreservingMerge task panicked or was cancelled"),
+            Err(_) => exec_err!("SortPreservingMerge caused a deadlock"),
         }
     }
 }
diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs
index e029c60b285b6..ff7f259dd1347 100644
--- a/datafusion/physical-plan/src/sorts/stream.rs
+++ b/datafusion/physical-plan/src/sorts/stream.rs
@@ -15,20 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::sorts::cursor::{ArrayValues, CursorArray, RowValues};
 use crate::SendableRecordBatchStream;
+use crate::sorts::cursor::{ArrayValues, CursorArray, RowValues};
 use crate::{PhysicalExpr, PhysicalSortExpr};
-use arrow::array::Array;
+use arrow::array::{Array, UInt32Array};
+use arrow::compute::take_record_batch;
 use arrow::datatypes::Schema;
 use arrow::record_batch::RecordBatch;
-use arrow::row::{RowConverter, SortField};
-use datafusion_common::Result;
+use arrow::row::{RowConverter, Rows, SortField};
+use arrow_ord::sort::lexsort_to_indices;
+use datafusion_common::{Result, internal_datafusion_err};
 use datafusion_execution::memory_pool::MemoryReservation;
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays;
 use futures::stream::{Fuse, StreamExt};
+use std::iter::FusedIterator;
 use std::marker::PhantomData;
+use std::mem;
 use std::sync::Arc;
-use std::task::{ready, Context, Poll};
+use std::task::{Context, Poll, ready};
 
 /// A [`Stream`](futures::Stream) that has multiple partitions that can
 /// be polled separately but not concurrently
@@ -76,8 +81,40 @@ impl FusedStreams {
     }
 }
 
+/// A pair of `Arc<Rows>` that can be reused
+#[derive(Debug)]
+struct ReusableRows {
+    // inner[stream_idx] holds a two Arcs:
+    // at start of a new poll
+    // .0 is the rows from the previous poll (at start),
+    // .1 is the one that is being written to
+    // at end of a poll, .0 will be swapped with .1,
+    inner: Vec<[Option<Arc<Rows>>; 2]>,
+}
+
+impl ReusableRows {
+    // return a Rows for writing,
+    // does not clone if the existing rows can be reused
+    fn take_next(&mut self, stream_idx: usize) -> Result<Rows> {
+        Arc::try_unwrap(self.inner[stream_idx][1].take().unwrap()).map_err(|_| {
+            internal_datafusion_err!(
+                "Rows from RowCursorStream is still in use by consumer"
+            )
+        })
+    }
+    // save the Rows
+    fn save(&mut self, stream_idx: usize, rows: &Arc<Rows>) {
+        self.inner[stream_idx][1] = Some(Arc::clone(rows));
+        // swap the current with the previous one, so that the next poll can reuse the Rows from the previous poll
+        let [a, b] = &mut self.inner[stream_idx];
+        mem::swap(a, b);
+    }
+}
+
 /// A [`PartitionedStream`] that wraps a set of [`SendableRecordBatchStream`]
 /// and computes [`RowValues`] based on the provided [`PhysicalSortExpr`]
+/// Note: the stream returns an error if the consumer buffers more than one RowValues (i.e. holds on to two RowValues
+/// from the same partition at the same time).
 #[derive(Debug)]
 pub struct RowCursorStream {
     /// Converter to convert output of physical expressions
@@ -88,6 +125,9 @@ pub struct RowCursorStream {
     streams: FusedStreams,
     /// Tracks the memory used by `converter`
     reservation: MemoryReservation,
+    /// Allocated rows for each partition, we keep two to allow for buffering one
+    /// in the consumer of the stream
+    rows: ReusableRows,
 }
 
 impl RowCursorStream {
@@ -105,28 +145,46 @@ impl RowCursorStream {
             })
             .collect::<Result<Vec<_>>>()?;
 
-        let streams = streams.into_iter().map(|s| s.fuse()).collect();
+        let streams: Vec<_> = streams.into_iter().map(|s| s.fuse()).collect();
         let converter = RowConverter::new(sort_fields)?;
+        let mut rows = Vec::with_capacity(streams.len());
+        for _ in &streams {
+            // Initialize each stream with an empty Rows
+            rows.push([
+                Some(Arc::new(converter.empty_rows(0, 0))),
+                Some(Arc::new(converter.empty_rows(0, 0))),
+            ]);
+        }
         Ok(Self {
             converter,
             reservation,
             column_expressions: expressions.iter().map(|x| Arc::clone(&x.expr)).collect(),
             streams: FusedStreams(streams),
+            rows: ReusableRows { inner: rows },
         })
     }
 
-    fn convert_batch(&mut self, batch: &RecordBatch) -> Result<RowValues> {
-        let cols = self
-            .column_expressions
-            .iter()
-            .map(|expr| expr.evaluate(batch)?.into_array(batch.num_rows()))
-            .collect::<Result<Vec<_>>>()?;
+    fn convert_batch(
+        &mut self,
+        batch: &RecordBatch,
+        stream_idx: usize,
+    ) -> Result<RowValues> {
+        let cols = evaluate_expressions_to_arrays(&self.column_expressions, batch)?;
+
+        // At this point, ownership should of this Rows should be unique
+        let mut rows = self.rows.take_next(stream_idx)?;
 
-        let rows = self.converter.convert_columns(&cols)?;
+        rows.clear();
+
+        self.converter.append(&mut rows, &cols)?;
         self.reservation.try_resize(self.converter.size())?;
 
+        let rows = Arc::new(rows);
+
+        self.rows.save(stream_idx, &rows);
+
         // track the memory in the newly created Rows.
-        let mut rows_reservation = self.reservation.new_empty();
+        let rows_reservation = self.reservation.new_empty();
         rows_reservation.try_grow(rows.size())?;
         Ok(RowValues::new(rows, rows_reservation))
     }
@@ -146,7 +204,7 @@ impl PartitionedStream for RowCursorStream {
     ) -> Poll<Option<Self::Output>> {
         Poll::Ready(ready!(self.streams.poll_next(cx, stream_idx)).map(|r| {
             r.and_then(|batch| {
-                let cursor = self.convert_batch(&batch)?;
+                let cursor = self.convert_batch(&batch, stream_idx)?;
                 Ok((cursor, batch))
             })
         }))
@@ -192,7 +250,7 @@ impl<T: CursorArray> FieldCursorStream<T> {
         let array = value.into_array(batch.num_rows())?;
         let size_in_mem = array.get_buffer_memory_size();
         let array = array.as_any().downcast_ref::<T>().expect("field values");
-        let mut array_reservation = self.reservation.new_empty();
+        let array_reservation = self.reservation.new_empty();
         array_reservation.try_grow(size_in_mem)?;
         Ok(ArrayValues::new(
             self.sort.options,
@@ -222,3 +280,159 @@ impl<T: CursorArray> PartitionedStream for FieldCursorStream<T> {
         }))
     }
 }
+
+/// A lazy, memory-efficient sort iterator used as a fallback during aggregate
+/// spill when there is not enough memory for an eager sort (which requires ~2x
+/// peak memory to hold both the unsorted and sorted copies simultaneously).
+///
+/// On the first call to `next()`, a sorted index array (`UInt32Array`) is
+/// computed via `lexsort_to_indices`. Subsequent calls yield chunks of
+/// `batch_size` rows by `take`-ing from the original batch using slices of
+/// this index array. Each `take` copies data for the chunk (not zero-copy),
+/// but only one chunk is live at a time since the caller consumes it before
+/// requesting the next. Once all rows have been yielded, the original batch
+/// and index array are dropped to free memory.
+///
+/// The caller must reserve `sizeof(batch) + sizeof(one chunk)` for this iterator,
+/// and free the reservation once the iterator is depleted.
+pub(crate) struct IncrementalSortIterator {
+    batch: RecordBatch,
+    expressions: LexOrdering,
+    batch_size: usize,
+    indices: Option<UInt32Array>,
+    cursor: usize,
+}
+
+impl IncrementalSortIterator {
+    pub(crate) fn new(
+        batch: RecordBatch,
+        expressions: LexOrdering,
+        batch_size: usize,
+    ) -> Self {
+        Self {
+            batch,
+            expressions,
+            batch_size,
+            cursor: 0,
+            indices: None,
+        }
+    }
+}
+
+impl Iterator for IncrementalSortIterator {
+    type Item = Result<RecordBatch>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor >= self.batch.num_rows() {
+            return None;
+        }
+
+        match self.indices.as_ref() {
+            None => {
+                let sort_columns = match self
+                    .expressions
+                    .iter()
+                    .map(|expr| expr.evaluate_to_sort_column(&self.batch))
+                    .collect::<Result<Vec<_>>>()
+                {
+                    Ok(cols) => cols,
+                    Err(e) => return Some(Err(e)),
+                };
+
+                let indices = match lexsort_to_indices(&sort_columns, None) {
+                    Ok(indices) => indices,
+                    Err(e) => return Some(Err(e.into())),
+                };
+                self.indices = Some(indices);
+
+                // Call again, this time it will hit the Some(indices) branch and return the first batch
+                self.next()
+            }
+            Some(indices) => {
+                let batch_size = self.batch_size.min(self.batch.num_rows() - self.cursor);
+
+                // Perform the take to produce the next batch
+                let new_batch_indices = indices.slice(self.cursor, batch_size);
+                let new_batch = match take_record_batch(&self.batch, &new_batch_indices) {
+                    Ok(batch) => batch,
+                    Err(e) => return Some(Err(e.into())),
+                };
+
+                self.cursor += batch_size;
+
+                // If this is the last batch, we can release the memory
+                if self.cursor >= self.batch.num_rows() {
+                    let schema = self.batch.schema();
+                    let _ = mem::replace(&mut self.batch, RecordBatch::new_empty(schema));
+                    self.indices = None;
+                }
+
+                // Return the new batch
+                Some(Ok(new_batch))
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let num_rows = self.batch.num_rows();
+        let batch_size = self.batch_size;
+        let num_batches = num_rows.div_ceil(batch_size);
+        (num_batches, Some(num_batches))
+    }
+}
+
+impl FusedIterator for IncrementalSortIterator {}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{AsArray, Int32Array};
+    use arrow::datatypes::{DataType, Field, Int32Type};
+    use datafusion_common::DataFusionError;
+    use datafusion_physical_expr::expressions::col;
+
+    /// Verifies that `take_record_batch` in `IncrementalSortIterator` actually
+    /// copies the data into a new allocation rather than returning a zero-copy
+    /// slice of the original batch. If the output arrays were slices, their
+    /// underlying buffer length would match the original array's length; a true
+    /// copy will have a buffer sized to fit only the chunk.
+    #[test]
+    fn incremental_sort_iterator_copies_data() -> Result<()> {
+        let original_len = 10;
+        let batch_size = 3;
+
+        // Build a batch with a single Int32 column of descending values
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+        let col_a: Int32Array = Int32Array::from(vec![0; original_len]);
+        let batch = RecordBatch::try_new(schema, vec![Arc::new(col_a)])?;
+
+        // Sort ascending on column "a"
+        let expressions = LexOrdering::new(vec![PhysicalSortExpr::new_default(col(
+            "a",
+            &batch.schema(),
+        )?)])
+        .unwrap();
+
+        let mut total_rows = 0;
+        IncrementalSortIterator::new(batch.clone(), expressions, batch_size).try_for_each(
+            |result| {
+                let chunk = result?;
+                total_rows += chunk.num_rows();
+
+                // Every output column must be a fresh allocation whose length
+                // equals the chunk size, NOT the original array length.
+                chunk.columns().iter().zip(batch.columns()).for_each(|(arr, original_arr)| {
+                    let (_, scalar_buf, _) = arr.as_primitive::<Int32Type>().clone().into_parts();
+                    let (_, original_scalar_buf, _) = original_arr.as_primitive::<Int32Type>().clone().into_parts();
+
+                    assert_ne!(scalar_buf.inner().data_ptr(), original_scalar_buf.inner().data_ptr(), "Expected a copy of the data for each chunk, but got a slice that shares the same buffer as the original array");
+                });
+
+                Result::<_, DataFusionError>::Ok(())
+            },
+        )?;
+
+        assert_eq!(total_rows, original_len);
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs
index 3f022ec6095ae..8129c3d8f695d 100644
--- a/datafusion/physical-plan/src/sorts/streaming_merge.rs
+++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs
@@ -19,16 +19,22 @@
 //! This is an order-preserving merge.
 
 use crate::metrics::BaselineMetrics;
+use crate::sorts::multi_level_merge::MultiLevelMergeBuilder;
 use crate::sorts::{
     merge::SortPreservingMergeStream,
     stream::{FieldCursorStream, RowCursorStream},
 };
-use crate::SendableRecordBatchStream;
+use crate::{SendableRecordBatchStream, SpillManager};
 use arrow::array::*;
 use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::{internal_err, Result};
-use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_common::human_readable_size;
+use datafusion_common::{Result, assert_or_internal_err, internal_err};
+use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_execution::memory_pool::{
+    MemoryConsumer, MemoryPool, MemoryReservation, UnboundedMemoryPool,
+};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use std::sync::Arc;
 
 macro_rules! primitive_merge_helper {
     ($t:ty, $($v:ident),+) => {
@@ -52,10 +58,31 @@ macro_rules! merge_helper {
     }};
 }
 
+pub struct SortedSpillFile {
+    pub file: RefCountedTempFile,
+
+    /// how much memory the largest memory batch is taking
+    pub max_record_batch_memory: usize,
+}
+
+impl std::fmt::Debug for SortedSpillFile {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "SortedSpillFile({:?}) takes {}",
+            self.file.path(),
+            human_readable_size(self.max_record_batch_memory)
+        )
+    }
+}
+
+#[derive(Default)]
 pub struct StreamingMergeBuilder<'a> {
     streams: Vec<SendableRecordBatchStream>,
+    sorted_spill_files: Vec<SortedSpillFile>,
+    spill_manager: Option<SpillManager>,
     schema: Option<SchemaRef>,
-    expressions: &'a LexOrdering,
+    expressions: Option<&'a LexOrdering>,
     metrics: Option<BaselineMetrics>,
     batch_size: Option<usize>,
     fetch: Option<usize>,
@@ -63,21 +90,6 @@ pub struct StreamingMergeBuilder<'a> {
     enable_round_robin_tie_breaker: bool,
 }
 
-impl Default for StreamingMergeBuilder<'_> {
-    fn default() -> Self {
-        Self {
-            streams: vec![],
-            schema: None,
-            expressions: LexOrdering::empty(),
-            metrics: None,
-            batch_size: None,
-            fetch: None,
-            reservation: None,
-            enable_round_robin_tie_breaker: false,
-        }
-    }
-}
-
 impl<'a> StreamingMergeBuilder<'a> {
     pub fn new() -> Self {
         Self {
@@ -91,13 +103,26 @@ impl<'a> StreamingMergeBuilder<'a> {
         self
     }
 
+    pub fn with_sorted_spill_files(
+        mut self,
+        sorted_spill_files: Vec<SortedSpillFile>,
+    ) -> Self {
+        self.sorted_spill_files = sorted_spill_files;
+        self
+    }
+
+    pub fn with_spill_manager(mut self, spill_manager: SpillManager) -> Self {
+        self.spill_manager = Some(spill_manager);
+        self
+    }
+
     pub fn with_schema(mut self, schema: SchemaRef) -> Self {
         self.schema = Some(schema);
         self
     }
 
     pub fn with_expressions(mut self, expressions: &'a LexOrdering) -> Self {
-        self.expressions = expressions;
+        self.expressions = Some(expressions);
         self
     }
 
@@ -133,9 +158,22 @@ impl<'a> StreamingMergeBuilder<'a> {
         self
     }
 
+    /// Bypass the mempool and avoid using the memory reservation.
+    ///
+    /// This is not marked as `pub` because it is not recommended to use this method
+    pub(super) fn with_bypass_mempool(self) -> Self {
+        let mem_pool: Arc<dyn MemoryPool> = Arc::new(UnboundedMemoryPool::default());
+
+        self.with_reservation(
+            MemoryConsumer::new("merge stream mock memory").register(&mem_pool),
+        )
+    }
+
     pub fn build(self) -> Result<SendableRecordBatchStream> {
         let Self {
             streams,
+            sorted_spill_files,
+            spill_manager,
             schema,
             metrics,
             batch_size,
@@ -145,23 +183,41 @@ impl<'a> StreamingMergeBuilder<'a> {
             enable_round_robin_tie_breaker,
         } = self;
 
-        // Early return if streams or expressions are empty
-        let checks = [
-            (
-                streams.is_empty(),
-                "Streams cannot be empty for streaming merge",
-            ),
-            (
-                expressions.is_empty(),
-                "Sort expressions cannot be empty for streaming merge",
-            ),
-        ];
-
-        if let Some((_, error_message)) = checks.iter().find(|(condition, _)| *condition)
-        {
-            return internal_err!("{}", error_message);
+        // Early return if expressions are empty:
+        let Some(expressions) = expressions else {
+            return internal_err!("Sort expressions cannot be empty for streaming merge");
+        };
+
+        if !sorted_spill_files.is_empty() {
+            // Unwrapping mandatory fields
+            let schema = schema.expect("Schema cannot be empty for streaming merge");
+            let metrics = metrics.expect("Metrics cannot be empty for streaming merge");
+            let batch_size =
+                batch_size.expect("Batch size cannot be empty for streaming merge");
+            let reservation =
+                reservation.expect("Reservation cannot be empty for streaming merge");
+
+            return Ok(MultiLevelMergeBuilder::new(
+                spill_manager.expect("spill_manager should exist"),
+                schema,
+                sorted_spill_files,
+                streams,
+                expressions.clone(),
+                metrics,
+                batch_size,
+                reservation,
+                fetch,
+                enable_round_robin_tie_breaker,
+            )
+            .create_spillable_merge_stream());
         }
 
+        // Early return if streams are empty:
+        assert_or_internal_err!(
+            !streams.is_empty(),
+            "Streams/sorted spill files cannot be empty for streaming merge"
+        );
+
         // Unwrapping mandatory fields
         let schema = schema.expect("Schema cannot be empty for streaming merge");
         let metrics = metrics.expect("Metrics cannot be empty for streaming merge");
diff --git a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
index 7617e0a22a504..2666ab8822ed9 100644
--- a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
+++ b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
@@ -24,7 +24,7 @@ use arrow::array::RecordBatch;
 use datafusion_common::exec_datafusion_err;
 use datafusion_execution::disk_manager::RefCountedTempFile;
 
-use super::{spill_manager::SpillManager, IPCStreamWriter};
+use super::{IPCStreamWriter, spill_manager::SpillManager};
 
 /// Represents an in-progress spill file used for writing `RecordBatch`es to disk, created by `SpillManager`.
 /// Caller is able to use this struct to incrementally append in-memory batches to
@@ -63,31 +63,57 @@ impl InProgressSpillFile {
         }
         if self.writer.is_none() {
             let schema = batch.schema();
-            if let Some(ref in_progress_file) = self.in_progress_file {
+            if let Some(in_progress_file) = &mut self.in_progress_file {
                 self.writer = Some(IPCStreamWriter::new(
                     in_progress_file.path(),
                     schema.as_ref(),
+                    self.spill_writer.compression,
                 )?);
 
                 // Update metrics
                 self.spill_writer.metrics.spill_file_count.add(1);
+
+                // Update initial size (schema/header)
+                in_progress_file.update_disk_usage()?;
+                let initial_size = in_progress_file.current_disk_usage();
+                self.spill_writer
+                    .metrics
+                    .spilled_bytes
+                    .add(initial_size as usize);
             }
         }
         if let Some(writer) = &mut self.writer {
-            let (spilled_rows, spilled_bytes) = writer.write(batch)?;
+            let (spilled_rows, _) = writer.write(batch)?;
             if let Some(in_progress_file) = &mut self.in_progress_file {
+                let pre_size = in_progress_file.current_disk_usage();
                 in_progress_file.update_disk_usage()?;
+                let post_size = in_progress_file.current_disk_usage();
+
+                self.spill_writer.metrics.spilled_rows.add(spilled_rows);
+                self.spill_writer
+                    .metrics
+                    .spilled_bytes
+                    .add((post_size - pre_size) as usize);
             } else {
                 unreachable!() // Already checked inside current function
             }
+        }
+        Ok(())
+    }
 
-            // Update metrics
-            self.spill_writer.metrics.spilled_bytes.add(spilled_bytes);
-            self.spill_writer.metrics.spilled_rows.add(spilled_rows);
+    pub fn flush(&mut self) -> Result<()> {
+        if let Some(writer) = &mut self.writer {
+            writer.flush()?;
         }
         Ok(())
     }
 
+    /// Returns a reference to the in-progress file, if it exists.
+    /// This can be used to get the file path for creating readers before the file is finished.
+    pub fn file(&self) -> Option<&RefCountedTempFile> {
+        self.in_progress_file.as_ref()
+    }
+
     /// Finalizes the file, returning the completed file reference.
     /// If there are no batches spilled before, it returns `None`.
     pub fn finish(&mut self) -> Result<Option<RefCountedTempFile>> {
@@ -97,6 +123,18 @@ impl InProgressSpillFile {
             return Ok(None);
         }
 
+        // Since spill files are append-only, add the file size to spilled_bytes
+        if let Some(in_progress_file) = &mut self.in_progress_file {
+            // Since writer.finish() writes continuation marker and message length at the end
+            let pre_size = in_progress_file.current_disk_usage();
+            in_progress_file.update_disk_usage()?;
+            let post_size = in_progress_file.current_disk_usage();
+            self.spill_writer
+                .metrics
+                .spilled_bytes
+                .add((post_size - pre_size) as usize);
+        }
+
         Ok(self.in_progress_file.take())
     }
 }
diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs
index 1101616a41060..f6ce546a42238 100644
--- a/datafusion/physical-plan/src/spill/mod.rs
+++ b/datafusion/physical-plan/src/spill/mod.rs
@@ -19,25 +19,37 @@
 
 pub(crate) mod in_progress_spill_file;
 pub(crate) mod spill_manager;
+pub mod spill_pool;
+
+// Moved for refactor, re-export to keep the public API stable
+pub use datafusion_common::utils::memory::get_record_batch_memory_size;
+// Re-export SpillManager for doctests only (hidden from public docs)
+#[doc(hidden)]
+pub use spill_manager::SpillManager;
 
 use std::fs::File;
 use std::io::BufReader;
 use std::path::{Path, PathBuf};
 use std::pin::Pin;
-use std::ptr::NonNull;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use arrow::array::ArrayData;
+use arrow::array::{BufferSpec, layout};
 use arrow::datatypes::{Schema, SchemaRef};
-use arrow::ipc::{reader::StreamReader, writer::StreamWriter};
+use arrow::ipc::{
+    MetadataVersion,
+    reader::StreamReader,
+    writer::{IpcWriteOptions, StreamWriter},
+};
 use arrow::record_batch::RecordBatch;
 
-use datafusion_common::{exec_datafusion_err, DataFusionError, HashSet, Result};
+use datafusion_common::config::SpillCompression;
+use datafusion_common::{DataFusionError, Result, exec_datafusion_err};
 use datafusion_common_runtime::SpawnedTask;
-use datafusion_execution::disk_manager::RefCountedTempFile;
 use datafusion_execution::RecordBatchStream;
+use datafusion_execution::disk_manager::RefCountedTempFile;
 use futures::{FutureExt as _, Stream};
+use log::debug;
 
 /// Stream that reads spill files from disk where each batch is read in a spawned blocking task
 /// It will read one batch at a time and will not do any buffering, to buffer data use [`crate::common::spawn_buffered`]
@@ -49,8 +61,16 @@ use futures::{FutureExt as _, Stream};
 struct SpillReaderStream {
     schema: SchemaRef,
     state: SpillReaderStreamState,
+    /// Maximum memory size observed among spilling sorted record batches.
+    /// This is used for validation purposes during reading each RecordBatch from spill.
+    /// For context on why this value is recorded and validated,
+    /// see `physical_plan/sort/multi_level_merge.rs`.
+    max_record_batch_memory: Option<usize>,
 }
 
+// Small margin allowed to accommodate slight memory accounting variation
+const SPILL_BATCH_MEMORY_MARGIN: usize = 4096;
+
 /// When we poll for the next batch, we will get back both the batch and the reader,
 /// so we can call `next` again.
 type NextRecordBatchResult = Result<(StreamReader<BufReader<File>>, Option<RecordBatch>)>;
@@ -71,10 +91,15 @@ enum SpillReaderStreamState {
 }
 
 impl SpillReaderStream {
-    fn new(schema: SchemaRef, spill_file: RefCountedTempFile) -> Self {
+    fn new(
+        schema: SchemaRef,
+        spill_file: RefCountedTempFile,
+        max_record_batch_memory: Option<usize>,
+    ) -> Self {
         Self {
             schema,
             state: SpillReaderStreamState::Uninitialized(spill_file),
+            max_record_batch_memory,
         }
     }
 
@@ -120,6 +145,23 @@ impl SpillReaderStream {
                     Ok((reader, batch)) => {
                         match batch {
                             Some(batch) => {
+                                if let Some(max_record_batch_memory) =
+                                    self.max_record_batch_memory
+                                {
+                                    let actual_size =
+                                        get_record_batch_memory_size(&batch);
+                                    if actual_size
+                                        > max_record_batch_memory
+                                            + SPILL_BATCH_MEMORY_MARGIN
+                                    {
+                                        debug!(
+                                            "Record batch memory usage ({actual_size} bytes) exceeds the expected limit ({max_record_batch_memory} bytes) \n\
+                                                by more than the allowed tolerance ({SPILL_BATCH_MEMORY_MARGIN} bytes).\n\
+                                                This likely indicates a bug in memory accounting during spilling.\n\
+                                                Please report this issue in https://github.com/apache/datafusion/issues/17340."
+                                        );
+                                    }
+                                }
                                 self.state = SpillReaderStreamState::Waiting(reader);
 
                                 Poll::Ready(Some(Ok(batch)))
@@ -186,6 +228,7 @@ impl RecordBatchStream for SpillReaderStream {
     since = "46.0.0",
     note = "This method is deprecated. Use `SpillManager::spill_record_batch_by_size` instead."
 )]
+#[expect(clippy::needless_pass_by_value)]
 pub fn spill_record_batch_by_size(
     batch: &RecordBatch,
     path: PathBuf,
@@ -194,7 +237,8 @@ pub fn spill_record_batch_by_size(
 ) -> Result<()> {
     let mut offset = 0;
     let total_rows = batch.num_rows();
-    let mut writer = IPCStreamWriter::new(&path, schema.as_ref())?;
+    let mut writer =
+        IPCStreamWriter::new(&path, schema.as_ref(), SpillCompression::Uncompressed)?;
 
     while offset < total_rows {
         let length = std::cmp::min(total_rows - offset, batch_size_rows);
@@ -207,74 +251,6 @@ pub fn spill_record_batch_by_size(
     Ok(())
 }
 
-/// Calculate total used memory of this batch.
-///
-/// This function is used to estimate the physical memory usage of the `RecordBatch`.
-/// It only counts the memory of large data `Buffer`s, and ignores metadata like
-/// types and pointers.
-/// The implementation will add up all unique `Buffer`'s memory
-/// size, due to:
-/// - The data pointer inside `Buffer` are memory regions returned by global memory
-///   allocator, those regions can't have overlap.
-/// - The actual used range of `ArrayRef`s inside `RecordBatch` can have overlap
-///   or reuse the same `Buffer`. For example: taking a slice from `Array`.
-///
-/// Example:
-/// For a `RecordBatch` with two columns: `col1` and `col2`, two columns are pointing
-/// to a sub-region of the same buffer.
-///
-/// {xxxxxxxxxxxxxxxxxxx} <--- buffer
-///       ^    ^  ^    ^
-///       |    |  |    |
-/// col1->{    }  |    |
-/// col2--------->{    }
-///
-/// In the above case, `get_record_batch_memory_size` will return the size of
-/// the buffer, instead of the sum of `col1` and `col2`'s actual memory size.
-///
-/// Note: Current `RecordBatch`.get_array_memory_size()` will double count the
-/// buffer memory size if multiple arrays within the batch are sharing the same
-/// `Buffer`. This method provides temporary fix until the issue is resolved:
-/// <https://github.com/apache/arrow-rs/issues/6439>
-pub fn get_record_batch_memory_size(batch: &RecordBatch) -> usize {
-    // Store pointers to `Buffer`'s start memory address (instead of actual
-    // used data region's pointer represented by current `Array`)
-    let mut counted_buffers: HashSet<NonNull<u8>> = HashSet::new();
-    let mut total_size = 0;
-
-    for array in batch.columns() {
-        let array_data = array.to_data();
-        count_array_data_memory_size(&array_data, &mut counted_buffers, &mut total_size);
-    }
-
-    total_size
-}
-
-/// Count the memory usage of `array_data` and its children recursively.
-fn count_array_data_memory_size(
-    array_data: &ArrayData,
-    counted_buffers: &mut HashSet<NonNull<u8>>,
-    total_size: &mut usize,
-) {
-    // Count memory usage for `array_data`
-    for buffer in array_data.buffers() {
-        if counted_buffers.insert(buffer.data_ptr()) {
-            *total_size += buffer.capacity();
-        } // Otherwise the buffer's memory is already counted
-    }
-
-    if let Some(null_buffer) = array_data.nulls() {
-        if counted_buffers.insert(null_buffer.inner().inner().data_ptr()) {
-            *total_size += null_buffer.inner().inner().capacity();
-        }
-    }
-
-    // Count all children `ArrayData` recursively
-    for child in array_data.child_data() {
-        count_array_data_memory_size(child, counted_buffers, total_size);
-    }
-}
-
 /// Write in Arrow IPC Stream format to a file.
 ///
 /// Stream format is used for spill because it supports dictionary replacement, and the random
@@ -292,15 +268,32 @@ struct IPCStreamWriter {
 
 impl IPCStreamWriter {
     /// Create new writer
-    pub fn new(path: &Path, schema: &Schema) -> Result<Self> {
+    pub fn new(
+        path: &Path,
+        schema: &Schema,
+        compression_type: SpillCompression,
+    ) -> Result<Self> {
         let file = File::create(path).map_err(|e| {
-            exec_datafusion_err!("Failed to create partition file at {path:?}: {e:?}")
+            exec_datafusion_err!("(Hint: you may increase the file descriptor limit with shell command 'ulimit -n 4096') Failed to create partition file at {path:?}: {e:?}")
         })?;
+
+        let metadata_version = MetadataVersion::V5;
+        // Depending on the schema, some array types such as StringViewArray require larger (16 byte in this case) alignment.
+        // If the actual buffer layout after IPC read does not satisfy the alignment requirement,
+        // Arrow ArrayBuilder will copy the buffer into a newly allocated, properly aligned buffer.
+        // This copying may lead to memory blowup during IPC read due to duplicated buffers.
+        // To avoid this, we compute the maximum required alignment based on the schema and configure the IPCStreamWriter accordingly.
+        let alignment = get_max_alignment_for_schema(schema);
+        let mut write_options =
+            IpcWriteOptions::try_new(alignment, false, metadata_version)?;
+        write_options = write_options.try_with_compression(compression_type.into())?;
+
+        let writer = StreamWriter::try_new_with_options(file, schema, write_options)?;
         Ok(Self {
             num_batches: 0,
             num_rows: 0,
             num_bytes: 0,
-            writer: StreamWriter::try_new(file, schema)?,
+            writer,
         })
     }
 
@@ -317,12 +310,40 @@ impl IPCStreamWriter {
         Ok((delta_num_rows, delta_num_bytes))
     }
 
+    pub fn flush(&mut self) -> Result<()> {
+        self.writer.flush()?;
+        Ok(())
+    }
+
     /// Finish the writer
     pub fn finish(&mut self) -> Result<()> {
         self.writer.finish().map_err(Into::into)
     }
 }
 
+// Returns the maximum byte alignment required by any field in the schema (>= 8), derived from Arrow buffer layouts.
+fn get_max_alignment_for_schema(schema: &Schema) -> usize {
+    let minimum_alignment = 8;
+    let mut max_alignment = minimum_alignment;
+    for field in schema.fields() {
+        let layout = layout(field.data_type());
+        let required_alignment = layout
+            .buffers
+            .iter()
+            .map(|buffer_spec| {
+                if let BufferSpec::FixedWidth { alignment, .. } = buffer_spec {
+                    *alignment
+                } else {
+                    minimum_alignment
+                }
+            })
+            .max()
+            .unwrap_or(minimum_alignment);
+        max_alignment = std::cmp::max(max_alignment, required_alignment);
+    }
+    max_alignment
+}
+
 #[cfg(test)]
 mod tests {
     use super::in_progress_spill_file::InProgressSpillFile;
@@ -332,9 +353,9 @@ mod tests {
     use crate::metrics::SpillMetrics;
     use crate::spill::spill_manager::SpillManager;
     use crate::test::build_table_i32;
-    use arrow::array::{Float64Array, Int32Array, ListArray, StringArray};
+    use arrow::array::{ArrayRef, Int32Array, StringArray};
     use arrow::compute::cast;
-    use arrow::datatypes::{DataType, Field, Int32Type, Schema};
+    use arrow::datatypes::{DataType, Field, Schema};
     use arrow::record_batch::RecordBatch;
     use datafusion_common::Result;
     use datafusion_execution::runtime_env::RuntimeEnv;
@@ -371,7 +392,7 @@ mod tests {
         let spilled_rows = spill_manager.metrics.spilled_rows.value();
         assert_eq!(spilled_rows, num_rows);
 
-        let stream = spill_manager.read_spill_as_stream(spill_file)?;
+        let stream = spill_manager.read_spill_as_stream(spill_file, None)?;
         assert_eq!(stream.schema(), schema);
 
         let batches = collect(stream).await?;
@@ -435,7 +456,7 @@ mod tests {
         let spilled_rows = spill_manager.metrics.spilled_rows.value();
         assert_eq!(spilled_rows, num_rows);
 
-        let stream = spill_manager.read_spill_as_stream(spill_file)?;
+        let stream = spill_manager.read_spill_as_stream(spill_file, None)?;
         assert_eq!(stream.schema(), dict_schema);
         let batches = collect(stream).await?;
         assert_eq!(batches.len(), 2);
@@ -456,12 +477,18 @@ mod tests {
         let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
         let spill_manager = SpillManager::new(env, metrics, Arc::clone(&schema));
 
-        let spill_file = spill_manager
-            .spill_record_batch_by_size(&batch1, "Test Spill", 1)?
+        let row_batches: Vec<RecordBatch> =
+            (0..batch1.num_rows()).map(|i| batch1.slice(i, 1)).collect();
+        let (spill_file, max_batch_mem) = spill_manager
+            .spill_record_batch_iter_and_return_max_batch_memory(
+                row_batches.iter().map(Ok),
+                "Test Spill",
+            )?
             .unwrap();
         assert!(spill_file.path().exists());
+        assert!(max_batch_mem > 0);
 
-        let stream = spill_manager.read_spill_as_stream(spill_file)?;
+        let stream = spill_manager.read_spill_as_stream(spill_file, None)?;
         assert_eq!(stream.schema(), schema);
 
         let batches = collect(stream).await?;
@@ -470,131 +497,111 @@ mod tests {
         Ok(())
     }
 
-    #[test]
-    fn test_get_record_batch_memory_size() {
-        // Create a simple record batch with two columns
+    fn build_compressible_batch() -> RecordBatch {
         let schema = Arc::new(Schema::new(vec![
-            Field::new("ints", DataType::Int32, true),
-            Field::new("float64", DataType::Float64, false),
+            Field::new("a", DataType::Utf8, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, true),
         ]));
 
-        let int_array =
-            Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
-        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
-
-        let batch = RecordBatch::try_new(
-            schema,
-            vec![Arc::new(int_array), Arc::new(float64_array)],
-        )
-        .unwrap();
-
-        let size = get_record_batch_memory_size(&batch);
-        assert_eq!(size, 60);
-    }
-
-    #[test]
-    fn test_get_record_batch_memory_size_with_null() {
-        // Create a simple record batch with two columns
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("ints", DataType::Int32, true),
-            Field::new("float64", DataType::Float64, false),
-        ]));
-
-        let int_array = Int32Array::from(vec![None, Some(2), Some(3)]);
-        let float64_array = Float64Array::from(vec![1.0, 2.0, 3.0]);
-
-        let batch = RecordBatch::try_new(
-            schema,
-            vec![Arc::new(int_array), Arc::new(float64_array)],
-        )
-        .unwrap();
-
-        let size = get_record_batch_memory_size(&batch);
-        assert_eq!(size, 100);
-    }
+        let a: ArrayRef = Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
+            "repeated", 100,
+        )));
+        let b: ArrayRef = Arc::new(Int32Array::from(vec![1; 100]));
+        let c: ArrayRef = Arc::new(Int32Array::from(vec![2; 100]));
 
-    #[test]
-    fn test_get_record_batch_memory_size_empty() {
-        // Test with empty record batch
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "ints",
-            DataType::Int32,
-            false,
-        )]));
-
-        let int_array: Int32Array = Int32Array::from(vec![] as Vec<i32>);
-        let batch = RecordBatch::try_new(schema, vec![Arc::new(int_array)]).unwrap();
-
-        let size = get_record_batch_memory_size(&batch);
-        assert_eq!(size, 0, "Empty batch should have 0 memory size");
+        RecordBatch::try_new(schema, vec![a, b, c]).unwrap()
     }
 
-    #[test]
-    fn test_get_record_batch_memory_size_shared_buffer() {
-        // Test with slices that share the same underlying buffer
-        let original = Int32Array::from(vec![1, 2, 3, 4, 5]);
-        let slice1 = original.slice(0, 3);
-        let slice2 = original.slice(2, 3);
-
-        // `RecordBatch` with `original` array
-        // ----
-        let schema_origin = Arc::new(Schema::new(vec![Field::new(
-            "origin_col",
-            DataType::Int32,
-            false,
-        )]));
-        let batch_origin =
-            RecordBatch::try_new(schema_origin, vec![Arc::new(original)]).unwrap();
-
-        // `RecordBatch` with all columns are reference to `original` array
-        // ----
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("slice1", DataType::Int32, false),
-            Field::new("slice2", DataType::Int32, false),
-        ]));
+    async fn validate(
+        spill_manager: &SpillManager,
+        spill_file: RefCountedTempFile,
+        num_rows: usize,
+        schema: SchemaRef,
+        batch_count: usize,
+    ) -> Result<()> {
+        let spilled_rows = spill_manager.metrics.spilled_rows.value();
+        assert_eq!(spilled_rows, num_rows);
 
-        let batch_sliced =
-            RecordBatch::try_new(schema, vec![Arc::new(slice1), Arc::new(slice2)])
-                .unwrap();
+        let stream = spill_manager.read_spill_as_stream(spill_file, None)?;
+        assert_eq!(stream.schema(), schema);
 
-        // Two sizes should all be only counting the buffer in `original` array
-        let size_origin = get_record_batch_memory_size(&batch_origin);
-        let size_sliced = get_record_batch_memory_size(&batch_sliced);
+        let batches = collect(stream).await?;
+        assert_eq!(batches.len(), batch_count);
 
-        assert_eq!(size_origin, size_sliced);
+        Ok(())
     }
 
-    #[test]
-    fn test_get_record_batch_memory_size_nested_array() {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new(
-                "nested_int",
-                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
-                false,
-            ),
-            Field::new(
-                "nested_int2",
-                DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))),
-                false,
-            ),
-        ]));
-
-        let int_list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
-            Some(vec![Some(1), Some(2), Some(3)]),
-        ]);
-
-        let int_list_array2 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
-            Some(vec![Some(4), Some(5), Some(6)]),
-        ]);
+    #[tokio::test]
+    async fn test_spill_compression() -> Result<()> {
+        let batch = build_compressible_batch();
+        let num_rows = batch.num_rows();
+        let schema = batch.schema();
+        let batch_count = 1;
+        let batches = [batch];
 
-        let batch = RecordBatch::try_new(
+        // Construct SpillManager
+        let env = Arc::new(RuntimeEnv::default());
+        let uncompressed_metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let lz4_metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let zstd_metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let uncompressed_spill_manager = SpillManager::new(
+            Arc::clone(&env),
+            uncompressed_metrics,
+            Arc::clone(&schema),
+        );
+        let lz4_spill_manager =
+            SpillManager::new(Arc::clone(&env), lz4_metrics, Arc::clone(&schema))
+                .with_compression_type(SpillCompression::Lz4Frame);
+        let zstd_spill_manager =
+            SpillManager::new(env, zstd_metrics, Arc::clone(&schema))
+                .with_compression_type(SpillCompression::Zstd);
+        let uncompressed_spill_file = uncompressed_spill_manager
+            .spill_record_batch_and_finish(&batches, "Test")?
+            .unwrap();
+        let lz4_spill_file = lz4_spill_manager
+            .spill_record_batch_and_finish(&batches, "Lz4_Test")?
+            .unwrap();
+        let zstd_spill_file = zstd_spill_manager
+            .spill_record_batch_and_finish(&batches, "ZSTD_Test")?
+            .unwrap();
+        assert!(uncompressed_spill_file.path().exists());
+        assert!(lz4_spill_file.path().exists());
+        assert!(zstd_spill_file.path().exists());
+
+        let lz4_spill_size = std::fs::metadata(lz4_spill_file.path())?.len();
+        let zstd_spill_size = std::fs::metadata(zstd_spill_file.path())?.len();
+        let uncompressed_spill_size =
+            std::fs::metadata(uncompressed_spill_file.path())?.len();
+
+        assert!(uncompressed_spill_size > lz4_spill_size);
+        assert!(uncompressed_spill_size > zstd_spill_size);
+
+        validate(
+            &lz4_spill_manager,
+            lz4_spill_file,
+            num_rows,
+            Arc::clone(&schema),
+            batch_count,
+        )
+        .await?;
+        validate(
+            &zstd_spill_manager,
+            zstd_spill_file,
+            num_rows,
+            Arc::clone(&schema),
+            batch_count,
+        )
+        .await?;
+        validate(
+            &uncompressed_spill_manager,
+            uncompressed_spill_file,
+            num_rows,
             schema,
-            vec![Arc::new(int_list_array), Arc::new(int_list_array2)],
+            batch_count,
         )
-        .unwrap();
-
-        let size = get_record_batch_memory_size(&batch);
-        assert_eq!(size, 8320);
+        .await?;
+        Ok(())
     }
 
     // ==== Spill manager tests ====
@@ -684,12 +691,13 @@ mod tests {
                 Arc::new(StringArray::from(vec!["d", "e", "f"])),
             ],
         )?;
-
+        // After appending each batch, spilled_rows and spilled_bytes should increase incrementally,
+        // while spill_file_count remains 1 (since we're writing to the same file)
         in_progress_file.append_batch(&batch1)?;
-        verify_metrics(&in_progress_file, 1, 356, 3)?;
+        verify_metrics(&in_progress_file, 1, 440, 3)?;
 
         in_progress_file.append_batch(&batch2)?;
-        verify_metrics(&in_progress_file, 1, 712, 6)?;
+        verify_metrics(&in_progress_file, 1, 704, 6)?;
 
         let completed_file = in_progress_file.finish()?;
         assert!(completed_file.is_some());
@@ -724,7 +732,7 @@ mod tests {
         let completed_file = spill_manager.spill_record_batch_and_finish(&[], "Test")?;
         assert!(completed_file.is_none());
 
-        // Test write empty batch with interface `spill_record_batch_by_size()`
+        // Test write empty batch with interface `spill_record_batch_iter_and_return_max_batch_memory()`
         let empty_batch = RecordBatch::try_new(
             Arc::clone(&schema),
             vec![
@@ -732,8 +740,11 @@ mod tests {
                 Arc::new(StringArray::from(Vec::<Option<&str>>::new())),
             ],
         )?;
-        let completed_file =
-            spill_manager.spill_record_batch_by_size(&empty_batch, "Test", 1)?;
+        let completed_file = spill_manager
+            .spill_record_batch_iter_and_return_max_batch_memory(
+                std::iter::once(Ok(&empty_batch)),
+                "Test",
+            )?;
         assert!(completed_file.is_none());
 
         Ok(())
@@ -768,12 +779,95 @@ mod tests {
                     .spill_record_batch_and_finish(&batches, "Test2")?
                     .unwrap();
 
-                let mut stream_1 = spill_manager.read_spill_as_stream(spill_file_1)?;
-                let mut stream_2 = spill_manager.read_spill_as_stream(spill_file_2)?;
+                let mut stream_1 =
+                    spill_manager.read_spill_as_stream(spill_file_1, None)?;
+                let mut stream_2 =
+                    spill_manager.read_spill_as_stream(spill_file_2, None)?;
                 stream_1.next().await;
                 stream_2.next().await;
 
                 Ok(())
             })
     }
+
+    #[test]
+    fn test_alignment_for_schema() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("strings", DataType::Utf8View, false)]);
+        let alignment = get_max_alignment_for_schema(&schema);
+        assert_eq!(alignment, 16);
+
+        let schema = Schema::new(vec![
+            Field::new("int32", DataType::Int32, false),
+            Field::new("int64", DataType::Int64, false),
+        ]);
+        let alignment = get_max_alignment_for_schema(&schema);
+        assert_eq!(alignment, 8);
+        Ok(())
+    }
+    #[tokio::test]
+    async fn test_real_time_spill_metrics() -> Result<()> {
+        let env = Arc::new(RuntimeEnv::default());
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false),
+        ]));
+
+        let spill_manager = Arc::new(SpillManager::new(
+            Arc::clone(&env),
+            metrics.clone(),
+            Arc::clone(&schema),
+        ));
+        let mut in_progress_file = spill_manager.create_in_progress_file("Test")?;
+
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2, 3])),
+                Arc::new(StringArray::from(vec!["a", "b", "c"])),
+            ],
+        )?;
+
+        // Before any batch, metrics should be 0
+        assert_eq!(metrics.spilled_bytes.value(), 0);
+        assert_eq!(metrics.spill_file_count.value(), 0);
+
+        // Append first batch
+        in_progress_file.append_batch(&batch1)?;
+
+        // Metrics should be updated immediately (at least schema and first batch)
+        let bytes_after_batch1 = metrics.spilled_bytes.value();
+        assert_eq!(bytes_after_batch1, 440);
+        assert_eq!(metrics.spill_file_count.value(), 1);
+
+        // Check global progress
+        let progress = env.spilling_progress();
+        assert_eq!(progress.current_bytes, bytes_after_batch1 as u64);
+        assert_eq!(progress.active_files_count, 1);
+
+        // Append another batch
+        in_progress_file.append_batch(&batch1)?;
+        let bytes_after_batch2 = metrics.spilled_bytes.value();
+        assert!(bytes_after_batch2 > bytes_after_batch1);
+
+        // Check global progress again
+        let progress = env.spilling_progress();
+        assert_eq!(progress.current_bytes, bytes_after_batch2 as u64);
+
+        // Finish the file
+        let spilled_file = in_progress_file.finish()?;
+        let final_bytes = metrics.spilled_bytes.value();
+        assert!(final_bytes > bytes_after_batch2);
+
+        // Even after finish, file is still "active" until dropped
+        let progress = env.spilling_progress();
+        assert!(progress.current_bytes > 0);
+        assert_eq!(progress.active_files_count, 1);
+
+        drop(spilled_file);
+        assert_eq!(env.spilling_progress().active_files_count, 0);
+        assert_eq!(env.spilling_progress().current_bytes, 0);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/spill/spill_manager.rs b/datafusion/physical-plan/src/spill/spill_manager.rs
index 78cd47a8bad07..07ba6d3989bc5 100644
--- a/datafusion/physical-plan/src/spill/spill_manager.rs
+++ b/datafusion/physical-plan/src/spill/spill_manager.rs
@@ -17,19 +17,19 @@
 
 //! Define the `SpillManager` struct, which is responsible for reading and writing `RecordBatch`es to raw files based on the provided configurations.
 
-use std::sync::Arc;
-
+use super::{SpillReaderStream, in_progress_spill_file::InProgressSpillFile};
+use crate::coop::cooperative;
+use crate::{common::spawn_buffered, metrics::SpillMetrics};
+use arrow::array::StringViewArray;
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_execution::runtime_env::RuntimeEnv;
-
-use datafusion_common::Result;
-use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_common::utils::memory::get_record_batch_memory_size;
+use datafusion_common::{DataFusionError, Result, config::SpillCompression};
 use datafusion_execution::SendableRecordBatchStream;
-
-use crate::{common::spawn_buffered, metrics::SpillMetrics};
-
-use super::{in_progress_spill_file::InProgressSpillFile, SpillReaderStream};
+use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_execution::runtime_env::RuntimeEnv;
+use std::borrow::Borrow;
+use std::sync::Arc;
 
 /// The `SpillManager` is responsible for the following tasks:
 /// - Reading and writing `RecordBatch`es to raw files based on the provided configurations.
@@ -44,7 +44,8 @@ pub struct SpillManager {
     schema: SchemaRef,
     /// Number of batches to buffer in memory during disk reads
     batch_read_buffer_capacity: usize,
-    // TODO: Add general-purpose compression options
+    /// general-purpose compression options
+    pub(crate) compression: SpillCompression,
 }
 
 impl SpillManager {
@@ -54,9 +55,28 @@ impl SpillManager {
             metrics,
             schema,
             batch_read_buffer_capacity: 2,
+            compression: SpillCompression::default(),
         }
     }
 
+    pub fn with_batch_read_buffer_capacity(
+        mut self,
+        batch_read_buffer_capacity: usize,
+    ) -> Self {
+        self.batch_read_buffer_capacity = batch_read_buffer_capacity;
+        self
+    }
+
+    pub fn with_compression_type(mut self, spill_compression: SpillCompression) -> Self {
+        self.compression = spill_compression;
+        self
+    }
+
+    /// Returns the schema for batches managed by this SpillManager
+    pub fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
     /// Creates a temporary file for in-progress operations, returning an error
     /// message if file creation fails. The file can be used to append batches
     /// incrementally and then finish the file when done.
@@ -90,32 +110,57 @@ impl SpillManager {
         in_progress_file.finish()
     }
 
-    /// Refer to the documentation for [`Self::spill_record_batch_and_finish`]. This method
-    /// additionally spills the `RecordBatch` into smaller batches, divided by `row_limit`.
-    ///
-    /// # Errors
-    /// - Returns an error if spilling would exceed the disk usage limit configured
-    ///   by `max_temp_directory_size` in `DiskManager`
-    pub fn spill_record_batch_by_size(
+    /// Spill an iterator of `RecordBatch`es to disk and return the spill file and the size of the largest batch in memory
+    /// Note that this expects the caller to provide *non-sliced* batches, so the memory calculation of each batch is accurate.
+    pub(crate) fn spill_record_batch_iter_and_return_max_batch_memory(
         &self,
-        batch: &RecordBatch,
+        mut iter: impl Iterator<Item = Result<impl Borrow<RecordBatch>>>,
         request_description: &str,
-        row_limit: usize,
-    ) -> Result<Option<RefCountedTempFile>> {
-        let total_rows = batch.num_rows();
-        let mut batches = Vec::new();
-        let mut offset = 0;
-
-        // It's ok to calculate all slices first, because slicing is zero-copy.
-        while offset < total_rows {
-            let length = std::cmp::min(total_rows - offset, row_limit);
-            let sliced_batch = batch.slice(offset, length);
-            batches.push(sliced_batch);
-            offset += length;
+    ) -> Result<Option<(RefCountedTempFile, usize)>> {
+        let mut in_progress_file = self.create_in_progress_file(request_description)?;
+
+        let mut max_record_batch_size = 0;
+
+        iter.try_for_each(|batch| {
+            let batch = batch?;
+            let borrowed = batch.borrow();
+            if borrowed.num_rows() == 0 {
+                return Ok(());
+            }
+            in_progress_file.append_batch(borrowed)?;
+
+            max_record_batch_size =
+                max_record_batch_size.max(get_record_batch_memory_size(borrowed));
+            Result::<_, DataFusionError>::Ok(())
+        })?;
+
+        let file = in_progress_file.finish()?;
+
+        Ok(file.map(|f| (f, max_record_batch_size)))
+    }
+
+    /// Spill a stream of `RecordBatch`es to disk and return the spill file and the size of the largest batch in memory
+    pub(crate) async fn spill_record_batch_stream_and_return_max_batch_memory(
+        &self,
+        stream: &mut SendableRecordBatchStream,
+        request_description: &str,
+    ) -> Result<Option<(RefCountedTempFile, usize)>> {
+        use futures::StreamExt;
+
+        let mut in_progress_file = self.create_in_progress_file(request_description)?;
+
+        let mut max_record_batch_size = 0;
+
+        while let Some(batch) = stream.next().await {
+            let batch = batch?;
+            in_progress_file.append_batch(&batch)?;
+
+            max_record_batch_size = max_record_batch_size.max(batch.get_sliced_size()?);
         }
 
-        // Spill the sliced batches to disk
-        self.spill_record_batch_and_finish(&batches, request_description)
+        let file = in_progress_file.finish()?;
+
+        Ok(file.map(|f| (f, max_record_batch_size)))
     }
 
     /// Reads a spill file as a stream. The file must be created by the current `SpillManager`.
@@ -124,12 +169,121 @@ impl SpillManager {
     pub fn read_spill_as_stream(
         &self,
         spill_file_path: RefCountedTempFile,
+        max_record_batch_memory: Option<usize>,
     ) -> Result<SendableRecordBatchStream> {
-        let stream = Box::pin(SpillReaderStream::new(
+        let stream = Box::pin(cooperative(SpillReaderStream::new(
             Arc::clone(&self.schema),
             spill_file_path,
-        ));
+            max_record_batch_memory,
+        )));
 
         Ok(spawn_buffered(stream, self.batch_read_buffer_capacity))
     }
+
+    /// Same as `read_spill_as_stream`, but without buffering.
+    pub fn read_spill_as_stream_unbuffered(
+        &self,
+        spill_file_path: RefCountedTempFile,
+        max_record_batch_memory: Option<usize>,
+    ) -> Result<SendableRecordBatchStream> {
+        Ok(Box::pin(cooperative(SpillReaderStream::new(
+            Arc::clone(&self.schema),
+            spill_file_path,
+            max_record_batch_memory,
+        ))))
+    }
+}
+
+pub(crate) trait GetSlicedSize {
+    /// Returns the size of the `RecordBatch` when sliced.
+    /// Note: if multiple arrays or even a single array share the same data buffers, we may double count each buffer.
+    /// Therefore, make sure we call gc() or organize_stringview_arrays() before using this method.
+    fn get_sliced_size(&self) -> Result<usize>;
+}
+
+impl GetSlicedSize for RecordBatch {
+    fn get_sliced_size(&self) -> Result<usize> {
+        let mut total = 0;
+        for array in self.columns() {
+            let data = array.to_data();
+            total += data.get_slice_memory_size()?;
+
+            // While StringViewArray holds large data buffer for non inlined string, the Arrow layout (BufferSpec)
+            // does not include any data buffers. Currently, ArrayData::get_slice_memory_size()
+            // under-counts memory size by accounting only views buffer although data buffer is cloned during slice()
+            //
+            // Therefore, we manually add the sum of the lengths used by all non inlined views
+            // on top of the sliced size for views buffer. This matches the intended semantics of
+            // "bytes needed if we materialized exactly this slice into fresh buffers".
+            // This is a workaround until https://github.com/apache/arrow-rs/issues/8230
+            if let Some(sv) = array.as_any().downcast_ref::<StringViewArray>() {
+                for buffer in sv.data_buffers() {
+                    total += buffer.capacity();
+                }
+            }
+        }
+        Ok(total)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::spill::{get_record_batch_memory_size, spill_manager::GetSlicedSize};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use arrow::{
+        array::{ArrayRef, StringViewArray},
+        record_batch::RecordBatch,
+    };
+    use datafusion_common::Result;
+    use std::sync::Arc;
+
+    #[test]
+    fn check_sliced_size_for_string_view_array() -> Result<()> {
+        let array_length = 50;
+        let short_len = 8;
+        let long_len = 25;
+
+        // Build StringViewArray that includes both inline strings and non inlined strings
+        let strings: Vec<String> = (0..array_length)
+            .map(|i| {
+                if i % 2 == 0 {
+                    "a".repeat(short_len)
+                } else {
+                    "b".repeat(long_len)
+                }
+            })
+            .collect();
+
+        let string_array = StringViewArray::from(strings);
+        let array_ref: ArrayRef = Arc::new(string_array);
+        let batch = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new(
+                "strings",
+                DataType::Utf8View,
+                false,
+            )])),
+            vec![array_ref],
+        )
+        .unwrap();
+
+        // We did not slice the batch, so these two memory size should be equal
+        assert_eq!(
+            batch.get_sliced_size().unwrap(),
+            get_record_batch_memory_size(&batch)
+        );
+
+        // Slice the batch into half
+        let half_batch = batch.slice(0, array_length / 2);
+        // Now sliced_size is smaller because the views buffer is sliced
+        assert!(
+            half_batch.get_sliced_size().unwrap()
+                < get_record_batch_memory_size(&half_batch)
+        );
+        let data = arrow::array::Array::to_data(&half_batch.column(0));
+        let views_sliced_size = data.get_slice_memory_size()?;
+        // The sliced size should be larger than sliced views buffer size
+        assert!(views_sliced_size < half_batch.get_sliced_size().unwrap());
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/spill/spill_pool.rs b/datafusion/physical-plan/src/spill/spill_pool.rs
new file mode 100644
index 0000000000000..2777b753bb37a
--- /dev/null
+++ b/datafusion/physical-plan/src/spill/spill_pool.rs
@@ -0,0 +1,1544 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use futures::{Stream, StreamExt};
+use std::collections::VecDeque;
+use std::sync::Arc;
+use std::task::Waker;
+
+use parking_lot::Mutex;
+
+use arrow::datatypes::SchemaRef;
+use arrow::record_batch::RecordBatch;
+use datafusion_common::Result;
+use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
+
+use super::in_progress_spill_file::InProgressSpillFile;
+use super::spill_manager::SpillManager;
+
+/// Shared state between the writer and readers of a spill pool.
+/// This contains the queue of files and coordination state.
+///
+/// # Locking Design
+///
+/// This struct uses **fine-grained locking** with nested `Arc<Mutex<>>`:
+/// - `SpillPoolShared` is wrapped in `Arc<Mutex<>>` (outer lock)
+/// - Each `ActiveSpillFileShared` is wrapped in `Arc<Mutex<>>` (inner lock)
+///
+/// This enables:
+/// 1. **Short critical sections**: The outer lock is held only for queue operations
+/// 2. **I/O outside locks**: Disk I/O happens while holding only the file-specific lock
+/// 3. **Concurrent operations**: Reader can access the queue while writer does I/O
+///
+/// **Lock ordering discipline**: Never hold both locks simultaneously to prevent deadlock.
+/// Always: acquire outer lock → release outer lock → acquire inner lock (if needed).
+struct SpillPoolShared {
+    /// Queue of ALL files (including the current write file if it exists).
+    /// Readers always read from the front of this queue (FIFO).
+    /// Each file has its own lock to enable concurrent reader/writer access.
+    files: VecDeque<Arc<Mutex<ActiveSpillFileShared>>>,
+    /// SpillManager for creating files and tracking metrics
+    spill_manager: Arc<SpillManager>,
+    /// Pool-level waker to notify when new files are available (single reader)
+    waker: Option<Waker>,
+    /// Whether the writer has been dropped (no more files will be added)
+    writer_dropped: bool,
+    /// Writer's reference to the current file (shared by all cloned writers).
+    /// Has its own lock to allow I/O without blocking queue access.
+    current_write_file: Option<Arc<Mutex<ActiveSpillFileShared>>>,
+    /// Number of active writer clones. Only when this reaches zero should
+    /// `writer_dropped` be set to true. This prevents premature EOF signaling
+    /// when one writer clone is dropped while others are still active.
+    active_writer_count: usize,
+}
+
+impl SpillPoolShared {
+    /// Creates a new shared pool state
+    fn new(spill_manager: Arc<SpillManager>) -> Self {
+        Self {
+            files: VecDeque::new(),
+            spill_manager,
+            waker: None,
+            writer_dropped: false,
+            current_write_file: None,
+            active_writer_count: 1,
+        }
+    }
+
+    /// Registers a waker to be notified when new data is available (pool-level)
+    fn register_waker(&mut self, waker: Waker) {
+        self.waker = Some(waker);
+    }
+
+    /// Wakes the pool-level reader
+    fn wake(&mut self) {
+        if let Some(waker) = self.waker.take() {
+            waker.wake();
+        }
+    }
+}
+
+/// Writer for a spill pool. Provides coordinated write access with FIFO semantics.
+///
+/// Created by [`channel`]. See that function for architecture diagrams and usage examples.
+///
+/// The writer is `Clone`, allowing multiple writers to coordinate on the same pool.
+/// All clones share the same current write file and coordinate file rotation.
+/// The writer automatically manages file rotation based on the `max_file_size_bytes`
+/// configured in [`channel`]. When the last writer clone is dropped, it finalizes the
+/// current file so readers can access all written data.
+pub struct SpillPoolWriter {
+    /// Maximum size in bytes before rotating to a new file.
+    /// Typically set from configuration `datafusion.execution.max_spill_file_size_bytes`.
+    max_file_size_bytes: usize,
+    /// Shared state with readers (includes current_write_file for coordination)
+    shared: Arc<Mutex<SpillPoolShared>>,
+}
+
+impl Clone for SpillPoolWriter {
+    fn clone(&self) -> Self {
+        // Increment the active writer count so that `writer_dropped` is only
+        // set to true when the *last* clone is dropped.
+        self.shared.lock().active_writer_count += 1;
+        Self {
+            max_file_size_bytes: self.max_file_size_bytes,
+            shared: Arc::clone(&self.shared),
+        }
+    }
+}
+
+impl SpillPoolWriter {
+    /// Spills a batch to the pool, rotating files when necessary.
+    ///
+    /// If the current file would exceed `max_file_size_bytes` after adding
+    /// this batch, the file is finalized and a new one is started.
+    ///
+    /// See [`channel`] for overall architecture and examples.
+    ///
+    /// # File Rotation Logic
+    ///
+    /// ```text
+    /// push_batch()
+    ///      │
+    ///      ▼
+    /// Current file exists?
+    ///      │
+    ///      ├─ No ──▶ Create new file ──▶ Add to shared queue
+    ///      │                               Wake readers
+    ///      ▼
+    /// Write batch to current file
+    ///      │
+    ///      ▼
+    /// estimated_size > max_file_size_bytes?
+    ///      │
+    ///      ├─ No ──▶ Keep current file for next batch
+    ///      │
+    ///      ▼
+    /// Yes: finish() current file
+    ///      Mark writer_finished = true
+    ///      Wake readers
+    ///      │
+    ///      ▼
+    /// Next push_batch() creates new file
+    /// ```
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if disk I/O fails or disk quota is exceeded.
+    pub fn push_batch(&self, batch: &RecordBatch) -> Result<()> {
+        if batch.num_rows() == 0 {
+            // Skip empty batches
+            return Ok(());
+        }
+
+        let batch_size = batch.get_array_memory_size();
+
+        // Fine-grained locking: Lock shared state briefly for queue access
+        let mut shared = self.shared.lock();
+
+        // Create new file if we don't have one yet
+        if shared.current_write_file.is_none() {
+            let spill_manager = Arc::clone(&shared.spill_manager);
+            // Release shared lock before disk I/O (fine-grained locking)
+            drop(shared);
+
+            let writer = spill_manager.create_in_progress_file("SpillPool")?;
+            // Clone the file so readers can access it immediately
+            let file = writer.file().expect("InProgressSpillFile should always have a file when it is first created").clone();
+
+            let file_shared = Arc::new(Mutex::new(ActiveSpillFileShared {
+                writer: Some(writer),
+                file: Some(file), // Set immediately so readers can access it
+                batches_written: 0,
+                estimated_size: 0,
+                writer_finished: false,
+                waker: None,
+            }));
+
+            // Re-acquire lock and push to shared queue
+            shared = self.shared.lock();
+            shared.files.push_back(Arc::clone(&file_shared));
+            shared.current_write_file = Some(file_shared);
+            shared.wake(); // Wake readers waiting for new files
+        }
+
+        let current_write_file = shared.current_write_file.take();
+        // Release shared lock before file I/O (fine-grained locking)
+        // This allows readers to access the queue while we do disk I/O
+        drop(shared);
+
+        // Write batch to current file - lock only the specific file
+        if let Some(current_file) = current_write_file {
+            // Now lock just this file for I/O (separate from shared lock)
+            let mut file_shared = current_file.lock();
+
+            // Append the batch
+            if let Some(ref mut writer) = file_shared.writer {
+                writer.append_batch(batch)?;
+                // make sure we flush the writer for readers
+                writer.flush()?;
+                file_shared.batches_written += 1;
+                file_shared.estimated_size += batch_size;
+            }
+
+            // Wake reader waiting on this specific file
+            file_shared.wake();
+
+            // Check if we need to rotate
+            let needs_rotation = file_shared.estimated_size > self.max_file_size_bytes;
+
+            if needs_rotation {
+                // Finish the IPC writer
+                if let Some(mut writer) = file_shared.writer.take() {
+                    writer.finish()?;
+                }
+                // Mark as finished so readers know not to wait for more data
+                file_shared.writer_finished = true;
+                // Wake reader waiting on this file (it's now finished)
+                file_shared.wake();
+                // Don't put back current_write_file - let it rotate
+            } else {
+                // Release file lock
+                drop(file_shared);
+                // Put back the current file for further writing
+                let mut shared = self.shared.lock();
+                shared.current_write_file = Some(current_file);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+impl Drop for SpillPoolWriter {
+    fn drop(&mut self) {
+        let mut shared = self.shared.lock();
+
+        shared.active_writer_count -= 1;
+        let is_last_writer = shared.active_writer_count == 0;
+
+        if !is_last_writer {
+            // Other writer clones are still active; do not finalize or
+            // signal EOF to readers.
+            return;
+        }
+
+        // Finalize the current file when the last writer is dropped
+        if let Some(current_file) = shared.current_write_file.take() {
+            // Release shared lock before locking file
+            drop(shared);
+
+            let mut file_shared = current_file.lock();
+
+            // Finish the current writer if it exists
+            if let Some(mut writer) = file_shared.writer.take() {
+                // Ignore errors on drop - we're in destructor
+                let _ = writer.finish();
+            }
+
+            // Mark as finished so readers know not to wait for more data
+            file_shared.writer_finished = true;
+
+            // Wake reader waiting on this file (it's now finished)
+            file_shared.wake();
+
+            drop(file_shared);
+            shared = self.shared.lock();
+        }
+
+        // Mark writer as dropped and wake pool-level readers
+        shared.writer_dropped = true;
+        shared.wake();
+    }
+}
+
+/// Creates a paired writer and reader for a spill pool with MPSC (multi-producer, single-consumer)
+/// semantics.
+///
+/// This is the recommended way to create a spill pool. The writer is `Clone`, allowing
+/// multiple producers to coordinate writes to the same pool. The reader can consume batches
+/// in FIFO order. The reader can start reading immediately after a writer appends a batch
+/// to the spill file, without waiting for the file to be sealed, while writers continue to
+/// write more data.
+///
+/// Internally this coordinates rotating spill files based on size limits, and
+/// handles asynchronous notification between the writer and reader using wakers.
+/// This ensures that we manage disk usage efficiently while allowing concurrent
+/// I/O between the writer and reader.
+///
+/// # Data Flow Overview
+///
+/// 1. Writer write batch `B0` to F1
+/// 2. Writer write batch `B1` to F1, notices the size limit exceeded, finishes F1.
+/// 3. Reader read `B0` from F1
+/// 4. Reader read `B1`, no more batch to read -> wait on the waker
+/// 5. Writer write batch `B2` to a new file `F2`, wake up the waiting reader.
+/// 6. Reader read `B2` from F2.
+/// 7. Repeat until writer is dropped.
+///
+/// # Architecture
+///
+/// ```text
+/// ┌─────────────────────────────────────────────────────────────────────────┐
+/// │                            SpillPool                                    │
+/// │                                                                         │
+/// │  Writer Side              Shared State              Reader Side         │
+/// │  ───────────              ────────────              ───────────         │
+/// │                                                                         │
+/// │  SpillPoolWriter    ┌────────────────────┐    SpillPoolReader           │
+/// │       │             │  VecDeque<File>    │          │                   │
+/// │       │             │  ┌────┐┌────┐      │          │                   │
+/// │  push_batch()       │  │ F1 ││ F2 │ ...  │      next().await            │
+/// │       │             │  └────┘└────┘      │          │                   │
+/// │       ▼             │   (FIFO order)     │          ▼                   │
+/// │  ┌─────────┐        │                    │    ┌──────────┐              │
+/// │  │Current  │───────▶│ Coordination:      │◀───│ Current  │              │
+/// │  │Write    │        │ - Wakers           │    │ Read     │              │
+/// │  │File     │        │ - Batch counts     │    │ File     │              │
+/// │  └─────────┘        │ - Writer status    │    └──────────┘              │
+/// │       │             └────────────────────┘          │                   │
+/// │       │                                              │                  │
+/// │  Size > limit?                                Read all batches?         │
+/// │       │                                              │                  │
+/// │       ▼                                              ▼                  │
+/// │  Rotate to new file                            Pop from queue           │
+/// └─────────────────────────────────────────────────────────────────────────┘
+///
+/// Writer produces → Shared FIFO queue → Reader consumes
+/// ```
+///
+/// # File State Machine
+///
+/// Each file in the pool coordinates between writer and reader:
+///
+/// ```text
+///                Writer View              Reader View
+///                ───────────              ───────────
+///
+/// Created        writer: Some(..)         batches_read: 0
+///                batches_written: 0       (waiting for data)
+///                       │
+///                       ▼
+/// Writing        append_batch()           Can read if:
+///                batches_written++        batches_read < batches_written
+///                wake readers
+///                       │                        │
+///                       │                        ▼
+///                ┌──────┴──────┐          poll_next() → batch
+///                │             │          batches_read++
+///                ▼             ▼
+///          Size > limit?  More data?
+///                │             │
+///                │             └─▶ Yes ──▶ Continue writing
+///                ▼
+///          finish()                   Reader catches up:
+///          writer_finished = true     batches_read == batches_written
+///          wake readers                       │
+///                │                            ▼
+///                └─────────────────────▶ Returns Poll::Ready(None)
+///                                       File complete, pop from queue
+/// ```
+///
+/// # Arguments
+///
+/// * `max_file_size_bytes` - Maximum size per file before rotation. When a file
+///   exceeds this size, the writer automatically rotates to a new file.
+/// * `spill_manager` - Manager for file creation and metrics tracking
+///
+/// # Returns
+///
+/// A tuple of `(SpillPoolWriter, SendableRecordBatchStream)` that share the same
+/// underlying pool. The reader is returned as a stream for immediate use with
+/// async stream combinators.
+///
+/// # Example
+///
+/// ```
+/// use std::sync::Arc;
+/// use arrow::array::{ArrayRef, Int32Array};
+/// use arrow::datatypes::{DataType, Field, Schema};
+/// use arrow::record_batch::RecordBatch;
+/// use datafusion_execution::runtime_env::RuntimeEnv;
+/// use futures::StreamExt;
+///
+/// # use datafusion_physical_plan::spill::spill_pool;
+/// # use datafusion_physical_plan::spill::SpillManager; // Re-exported for doctests
+/// # use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
+/// #
+/// # #[tokio::main]
+/// # async fn main() -> datafusion_common::Result<()> {
+/// # // Setup for the example (typically comes from TaskContext in production)
+/// # let env = Arc::new(RuntimeEnv::default());
+/// # let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+/// # let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+/// # let spill_manager = Arc::new(SpillManager::new(env, metrics, schema.clone()));
+/// #
+/// // Create channel with 1MB file size limit
+/// let (writer, mut reader) = spill_pool::channel(1024 * 1024, spill_manager);
+///
+/// // Spawn writer and reader concurrently; writer wakes reader via wakers
+/// let writer_task = tokio::spawn(async move {
+///     for i in 0..5 {
+///         let array: ArrayRef = Arc::new(Int32Array::from(vec![i; 100]));
+///         let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+///         writer.push_batch(&batch)?;
+///     }
+///     // Explicitly drop writer to finalize the spill file and wake the reader
+///     drop(writer);
+///     datafusion_common::Result::<()>::Ok(())
+/// });
+///
+/// let reader_task = tokio::spawn(async move {
+///     let mut batches_read = 0;
+///     while let Some(result) = reader.next().await {
+///         let _batch = result?;
+///         batches_read += 1;
+///     }
+///     datafusion_common::Result::<usize>::Ok(batches_read)
+/// });
+///
+/// let (writer_res, reader_res) = tokio::join!(writer_task, reader_task);
+/// writer_res
+///     .map_err(|e| datafusion_common::DataFusionError::Execution(e.to_string()))??;
+/// let batches_read = reader_res
+///     .map_err(|e| datafusion_common::DataFusionError::Execution(e.to_string()))??;
+///
+/// assert_eq!(batches_read, 5);
+/// # Ok(())
+/// # }
+/// ```
+///
+/// # Why rotate files?
+///
+/// File rotation ensures we don't end up with unreferenced disk usage.
+/// If we used a single file for all spilled data, we would end up with
+/// unreferenced data at the beginning of the file that has already been read
+/// by readers but we can't delete because you can't truncate from the start of a file.
+///
+/// Consider the case of a query like `SELECT * FROM large_table WHERE false`.
+/// Obviously this query produces no output rows, but if we had a spilling operator
+/// in the middle of this query between the scan and the filter it would see the entire
+/// `large_table` flow through it and thus would spill all of that data to disk.
+/// So we'd end up using up to `size(large_table)` bytes of disk space.
+/// If instead we use file rotation, and as long as the readers can keep up with the writer,
+/// then we can ensure that once a file is fully read by all readers it can be deleted,
+/// thus bounding the maximum disk usage to roughly `max_file_size_bytes`.
+pub fn channel(
+    max_file_size_bytes: usize,
+    spill_manager: Arc<SpillManager>,
+) -> (SpillPoolWriter, SendableRecordBatchStream) {
+    let schema = Arc::clone(spill_manager.schema());
+    let shared = Arc::new(Mutex::new(SpillPoolShared::new(spill_manager)));
+
+    let writer = SpillPoolWriter {
+        max_file_size_bytes,
+        shared: Arc::clone(&shared),
+    };
+
+    let reader = SpillPoolReader::new(shared, schema);
+
+    (writer, Box::pin(reader))
+}
+
+/// Shared state between writer and readers for an active spill file.
+/// Protected by a Mutex to coordinate between concurrent readers and the writer.
+struct ActiveSpillFileShared {
+    /// Writer handle - taken (set to None) when finish() is called
+    writer: Option<InProgressSpillFile>,
+    /// The spill file, set when the writer finishes.
+    /// Taken by the reader when creating a stream (the file stays open via file handles).
+    file: Option<RefCountedTempFile>,
+    /// Total number of batches written to this file
+    batches_written: usize,
+    /// Estimated size in bytes of data written to this file
+    estimated_size: usize,
+    /// Whether the writer has finished writing to this file
+    writer_finished: bool,
+    /// Waker for reader waiting on this specific file (SPSC: only one reader)
+    waker: Option<Waker>,
+}
+
+impl ActiveSpillFileShared {
+    /// Registers a waker to be notified when new data is written to this file
+    fn register_waker(&mut self, waker: Waker) {
+        self.waker = Some(waker);
+    }
+
+    /// Wakes the reader waiting on this file
+    fn wake(&mut self) {
+        if let Some(waker) = self.waker.take() {
+            waker.wake();
+        }
+    }
+}
+
+/// Reader state for a SpillFile (owned by individual SpillFile instances).
+/// This is kept separate from the shared state to avoid holding locks during I/O.
+struct SpillFileReader {
+    /// The actual stream reading from disk
+    stream: SendableRecordBatchStream,
+    /// Number of batches this reader has consumed
+    batches_read: usize,
+}
+
+struct SpillFile {
+    /// Shared coordination state (contains writer and batch counts)
+    shared: Arc<Mutex<ActiveSpillFileShared>>,
+    /// Reader state (lazy-initialized, owned by this SpillFile)
+    reader: Option<SpillFileReader>,
+    /// Spill manager for creating readers
+    spill_manager: Arc<SpillManager>,
+}
+
+impl Stream for SpillFile {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use std::task::Poll;
+
+        // Step 1: Lock shared state and check coordination
+        let (should_read, file) = {
+            let mut shared = self.shared.lock();
+
+            // Determine if we can read
+            let batches_read = self.reader.as_ref().map_or(0, |r| r.batches_read);
+
+            if batches_read < shared.batches_written {
+                // More data available to read - take the file if we don't have a reader yet
+                let file = if self.reader.is_none() {
+                    shared.file.take()
+                } else {
+                    None
+                };
+                (true, file)
+            } else if shared.writer_finished {
+                // No more data and writer is done - EOF
+                return Poll::Ready(None);
+            } else {
+                // Caught up to writer, but writer still active - register waker and wait
+                shared.register_waker(cx.waker().clone());
+                return Poll::Pending;
+            }
+        }; // Lock released here
+
+        // Step 2: Lazy-create reader stream if needed
+        if self.reader.is_none() && should_read {
+            if let Some(file) = file {
+                // we want this unbuffered because files are actively being written to
+                match self
+                    .spill_manager
+                    .read_spill_as_stream_unbuffered(file, None)
+                {
+                    Ok(stream) => {
+                        self.reader = Some(SpillFileReader {
+                            stream,
+                            batches_read: 0,
+                        });
+                    }
+                    Err(e) => return Poll::Ready(Some(Err(e))),
+                }
+            } else {
+                // File not available yet (writer hasn't finished or already taken)
+                // Register waker and wait for file to be ready
+                let mut shared = self.shared.lock();
+                shared.register_waker(cx.waker().clone());
+                return Poll::Pending;
+            }
+        }
+
+        // Step 3: Poll the reader stream (no lock held)
+        if let Some(reader) = &mut self.reader {
+            match reader.stream.poll_next_unpin(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    // Successfully read a batch - increment counter
+                    reader.batches_read += 1;
+                    Poll::Ready(Some(Ok(batch)))
+                }
+                Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))),
+                Poll::Ready(None) => {
+                    // Stream exhausted unexpectedly
+                    // This shouldn't happen if coordination is correct, but handle gracefully
+                    Poll::Ready(None)
+                }
+                Poll::Pending => Poll::Pending,
+            }
+        } else {
+            // Should not reach here, but handle gracefully
+            Poll::Ready(None)
+        }
+    }
+}
+
+/// A stream that reads from a SpillPool in FIFO order.
+///
+/// Created by [`channel`]. See that function for architecture diagrams and usage examples.
+///
+/// The stream automatically handles file rotation and reads from completed files.
+/// When no data is available, it returns `Poll::Pending` and registers a waker to
+/// be notified when the writer produces more data.
+///
+/// # Infinite Stream Semantics
+///
+/// This stream never returns `None` (`Poll::Ready(None)`) on its own - it will keep
+/// waiting for the writer to produce more data. The stream ends only when:
+/// - The reader is dropped
+/// - The writer is dropped AND all queued data has been consumed
+///
+/// This makes it suitable for continuous streaming scenarios where the writer may
+/// produce data intermittently.
+pub struct SpillPoolReader {
+    /// Shared reference to the spill pool
+    shared: Arc<Mutex<SpillPoolShared>>,
+    /// Current SpillFile we're reading from
+    current_file: Option<SpillFile>,
+    /// Schema of the spilled data
+    schema: SchemaRef,
+}
+
+impl SpillPoolReader {
+    /// Creates a new reader from shared pool state.
+    ///
+    /// This is private - use the `channel()` function to create a reader/writer pair.
+    ///
+    /// # Arguments
+    ///
+    /// * `shared` - Shared reference to the pool state
+    fn new(shared: Arc<Mutex<SpillPoolShared>>, schema: SchemaRef) -> Self {
+        Self {
+            shared,
+            current_file: None,
+            schema,
+        }
+    }
+}
+
+impl Stream for SpillPoolReader {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use std::task::Poll;
+
+        loop {
+            // If we have a current file, try to read from it
+            if let Some(ref mut file) = self.current_file {
+                match file.poll_next_unpin(cx) {
+                    Poll::Ready(Some(Ok(batch))) => {
+                        // Got a batch, return it
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    Poll::Ready(Some(Err(e))) => {
+                        // Error reading batch
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                    Poll::Ready(None) => {
+                        // Current file stream exhausted
+                        // Check if this file is marked as writer_finished
+                        let writer_finished = { file.shared.lock().writer_finished };
+
+                        if writer_finished {
+                            // File is complete, pop it from the queue and move to next
+                            let mut shared = self.shared.lock();
+                            shared.files.pop_front();
+                            drop(shared); // Release lock
+
+                            // Clear current file and continue loop to get next file
+                            self.current_file = None;
+                            continue;
+                        } else {
+                            // Stream exhausted but writer not finished - unexpected
+                            // This shouldn't happen with proper coordination
+                            return Poll::Ready(None);
+                        }
+                    }
+                    Poll::Pending => {
+                        // File not ready yet (waiting for writer)
+                        // Register waker so we get notified when writer adds more batches
+                        let mut shared = self.shared.lock();
+                        shared.register_waker(cx.waker().clone());
+                        return Poll::Pending;
+                    }
+                }
+            }
+
+            // No current file, need to get the next one
+            let mut shared = self.shared.lock();
+
+            // Peek at the front of the queue (don't pop yet)
+            if let Some(file_shared) = shared.files.front() {
+                // Create a SpillFile from the shared state
+                let spill_manager = Arc::clone(&shared.spill_manager);
+                let file_shared = Arc::clone(file_shared);
+                drop(shared); // Release lock before creating SpillFile
+
+                self.current_file = Some(SpillFile {
+                    shared: file_shared,
+                    reader: None,
+                    spill_manager,
+                });
+
+                // Continue loop to poll the new file
+                continue;
+            }
+
+            // No files in queue - check if writer is done
+            if shared.writer_dropped {
+                // Writer is done and no more files will be added - EOF
+                return Poll::Ready(None);
+            }
+
+            // Writer still active, register waker that will get notified when new files are added
+            shared.register_waker(cx.waker().clone());
+            return Poll::Pending;
+        }
+    }
+}
+
+impl RecordBatchStream for SpillPoolReader {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
+    use arrow::array::{ArrayRef, Int32Array};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common_runtime::SpawnedTask;
+    use datafusion_execution::runtime_env::RuntimeEnv;
+    use futures::StreamExt;
+
+    fn create_test_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]))
+    }
+
+    fn create_test_batch(start: i32, count: usize) -> RecordBatch {
+        let schema = create_test_schema();
+        let a: ArrayRef = Arc::new(Int32Array::from(
+            (start..start + count as i32).collect::<Vec<_>>(),
+        ));
+        RecordBatch::try_new(schema, vec![a]).unwrap()
+    }
+
+    fn create_spill_channel(
+        max_file_size: usize,
+    ) -> (SpillPoolWriter, SendableRecordBatchStream) {
+        let env = Arc::new(RuntimeEnv::default());
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = create_test_schema();
+        let spill_manager = Arc::new(SpillManager::new(env, metrics, schema));
+
+        channel(max_file_size, spill_manager)
+    }
+
+    fn create_spill_channel_with_metrics(
+        max_file_size: usize,
+    ) -> (SpillPoolWriter, SendableRecordBatchStream, SpillMetrics) {
+        let env = Arc::new(RuntimeEnv::default());
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = create_test_schema();
+        let spill_manager = Arc::new(SpillManager::new(env, metrics.clone(), schema));
+
+        let (writer, reader) = channel(max_file_size, spill_manager);
+        (writer, reader, metrics)
+    }
+
+    #[tokio::test]
+    async fn test_basic_write_and_read() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        // Write one batch
+        let batch1 = create_test_batch(0, 10);
+        writer.push_batch(&batch1)?;
+
+        // Read the batch
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 10);
+
+        // Write another batch
+        let batch2 = create_test_batch(10, 5);
+        writer.push_batch(&batch2)?;
+        // Read the second batch
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_single_batch_write_read() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        // Write one batch
+        let batch = create_test_batch(0, 5);
+        writer.push_batch(&batch)?;
+
+        // Read it back
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 5);
+
+        // Verify the actual data
+        let col = result
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(col.value(0), 0);
+        assert_eq!(col.value(4), 4);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multiple_batches_sequential() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        // Write multiple batches
+        for i in 0..5 {
+            let batch = create_test_batch(i * 10, 10);
+            writer.push_batch(&batch)?;
+        }
+
+        // Read all batches and verify FIFO order
+        for i in 0..5 {
+            let result = reader.next().await.unwrap()?;
+            assert_eq!(result.num_rows(), 10);
+
+            let col = result
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col.value(0), i * 10, "Batch {i} not in FIFO order");
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_empty_writer() -> Result<()> {
+        let (_writer, reader) = create_spill_channel(1024 * 1024);
+
+        // Reader should pend since no batches were written
+        let mut reader = reader;
+        let result =
+            tokio::time::timeout(std::time::Duration::from_millis(100), reader.next())
+                .await;
+
+        assert!(result.is_err(), "Reader should timeout on empty writer");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_empty_batch_skipping() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        // Write empty batch
+        let empty_batch = create_test_batch(0, 0);
+        writer.push_batch(&empty_batch)?;
+
+        // Write non-empty batch
+        let batch = create_test_batch(0, 5);
+        writer.push_batch(&batch)?;
+
+        // Should only read the non-empty batch
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_rotation_triggered_by_size() -> Result<()> {
+        // Set a small max_file_size to trigger rotation after one batch
+        let batch1 = create_test_batch(0, 10);
+        let batch_size = batch1.get_array_memory_size() + 1;
+
+        let (writer, mut reader, metrics) = create_spill_channel_with_metrics(batch_size);
+
+        // Write first batch (should fit in first file)
+        writer.push_batch(&batch1)?;
+
+        // Check metrics after first batch - file created but not finalized yet
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            1,
+            "Should have created 1 file after first batch"
+        );
+        assert_eq!(
+            metrics.spilled_bytes.value(),
+            320,
+            "Spilled bytes should reflect data written (header + 1 batch)"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            10,
+            "Should have spilled 10 rows from first batch"
+        );
+
+        // Write second batch (should trigger rotation - finalize first file)
+        let batch2 = create_test_batch(10, 10);
+        assert!(
+            batch2.get_array_memory_size() <= batch_size,
+            "batch2 size {} exceeds limit {batch_size}",
+            batch2.get_array_memory_size(),
+        );
+        assert!(
+            batch1.get_array_memory_size() + batch2.get_array_memory_size() > batch_size,
+            "Combined size {} does not exceed limit to trigger rotation",
+            batch1.get_array_memory_size() + batch2.get_array_memory_size()
+        );
+        writer.push_batch(&batch2)?;
+
+        // Check metrics after rotation - first file finalized, but second file not created yet
+        // (new file created lazily on next push_batch call)
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            1,
+            "Should still have 1 file (second file not created until next write)"
+        );
+        assert!(
+            metrics.spilled_bytes.value() > 0,
+            "Spilled bytes should be > 0 after first file finalized (got {})",
+            metrics.spilled_bytes.value()
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            20,
+            "Should have spilled 20 total rows (10 + 10)"
+        );
+
+        // Write a third batch to confirm rotation occurred (creates second file)
+        let batch3 = create_test_batch(20, 5);
+        writer.push_batch(&batch3)?;
+
+        // Now check that second file was created
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            2,
+            "Should have created 2 files after writing to new file"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            25,
+            "Should have spilled 25 total rows (10 + 10 + 5)"
+        );
+
+        // Read all three batches
+        let result1 = reader.next().await.unwrap()?;
+        assert_eq!(result1.num_rows(), 10);
+
+        let result2 = reader.next().await.unwrap()?;
+        assert_eq!(result2.num_rows(), 10);
+
+        let result3 = reader.next().await.unwrap()?;
+        assert_eq!(result3.num_rows(), 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multiple_rotations() -> Result<()> {
+        let batches = (0..10)
+            .map(|i| create_test_batch(i * 10, 10))
+            .collect::<Vec<_>>();
+
+        let batch_size = batches[0].get_array_memory_size() * 2 + 1;
+
+        // Very small max_file_size to force frequent rotations
+        let (writer, mut reader, metrics) = create_spill_channel_with_metrics(batch_size);
+
+        // Write many batches to cause multiple rotations
+        for i in 0..10 {
+            let batch = create_test_batch(i * 10, 10);
+            writer.push_batch(&batch)?;
+        }
+
+        // Check metrics after all writes - should have multiple files due to rotations
+        // With batch_size = 2 * one_batch + 1, each file fits ~2 batches before rotating
+        // 10 batches should create multiple files (exact count depends on rotation timing)
+        let file_count = metrics.spill_file_count.value();
+        assert!(
+            file_count >= 4,
+            "Should have created at least 4 files with multiple rotations (got {file_count})"
+        );
+        assert!(
+            metrics.spilled_bytes.value() > 0,
+            "Spilled bytes should be > 0 after rotations (got {})",
+            metrics.spilled_bytes.value()
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            100,
+            "Should have spilled 100 total rows (10 batches * 10 rows)"
+        );
+
+        // Read all batches and verify order
+        for i in 0..10 {
+            let result = reader.next().await.unwrap()?;
+            assert_eq!(result.num_rows(), 10);
+
+            let col = result
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(
+                col.value(0),
+                i * 10,
+                "Batch {i} not in correct order after rotations"
+            );
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_single_batch_larger_than_limit() -> Result<()> {
+        // Very small limit
+        let (writer, mut reader, metrics) = create_spill_channel_with_metrics(100);
+
+        // Write a batch that exceeds the limit
+        let large_batch = create_test_batch(0, 100);
+        writer.push_batch(&large_batch)?;
+
+        // Check metrics after large batch - should trigger rotation immediately
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            1,
+            "Should have created 1 file for large batch"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            100,
+            "Should have spilled 100 rows from large batch"
+        );
+
+        // Should still write and read successfully
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 100);
+
+        // Next batch should go to a new file
+        let batch2 = create_test_batch(100, 10);
+        writer.push_batch(&batch2)?;
+
+        // Check metrics after second batch - should have rotated to a new file
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            2,
+            "Should have created 2 files after rotation"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            110,
+            "Should have spilled 110 total rows (100 + 10)"
+        );
+
+        let result2 = reader.next().await.unwrap()?;
+        assert_eq!(result2.num_rows(), 10);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_very_small_max_file_size() -> Result<()> {
+        // Test with just 1 byte max (extreme case)
+        let (writer, mut reader) = create_spill_channel(1);
+
+        // Any batch will exceed this limit
+        let batch = create_test_batch(0, 5);
+        writer.push_batch(&batch)?;
+
+        // Should still work
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_exact_size_boundary() -> Result<()> {
+        // Create a batch and measure its approximate size
+        let batch = create_test_batch(0, 10);
+        let batch_size = batch.get_array_memory_size();
+
+        // Set max_file_size to exactly the batch size
+        let (writer, mut reader, metrics) = create_spill_channel_with_metrics(batch_size);
+
+        // Write first batch (exactly at the size limit)
+        writer.push_batch(&batch)?;
+
+        // Check metrics after first batch - should NOT rotate yet (size == limit, not >)
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            1,
+            "Should have created 1 file after first batch at exact boundary"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            10,
+            "Should have spilled 10 rows from first batch"
+        );
+
+        // Write second batch (exceeds the limit, should trigger rotation)
+        let batch2 = create_test_batch(10, 10);
+        writer.push_batch(&batch2)?;
+
+        // Check metrics after second batch - rotation triggered, first file finalized
+        // Note: second file not created yet (lazy creation on next write)
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            1,
+            "Should still have 1 file after rotation (second file created lazily)"
+        );
+        assert_eq!(
+            metrics.spilled_rows.value(),
+            20,
+            "Should have spilled 20 total rows (10 + 10)"
+        );
+        // Verify first file was finalized by checking spilled_bytes
+        assert!(
+            metrics.spilled_bytes.value() > 0,
+            "Spilled bytes should be > 0 after file finalization (got {})",
+            metrics.spilled_bytes.value()
+        );
+
+        // Both should be readable
+        let result1 = reader.next().await.unwrap()?;
+        assert_eq!(result1.num_rows(), 10);
+
+        let result2 = reader.next().await.unwrap()?;
+        assert_eq!(result2.num_rows(), 10);
+
+        // Spill another batch, now we should see the second file created
+        let batch3 = create_test_batch(20, 5);
+        writer.push_batch(&batch3)?;
+        assert_eq!(
+            metrics.spill_file_count.value(),
+            2,
+            "Should have created 2 files after writing to new file"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_concurrent_reader_writer() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        // Spawn writer task
+        let writer_handle = SpawnedTask::spawn(async move {
+            for i in 0..10 {
+                let batch = create_test_batch(i * 10, 10);
+                writer.push_batch(&batch).unwrap();
+                // Small delay to simulate real concurrent work
+                tokio::time::sleep(std::time::Duration::from_millis(5)).await;
+            }
+        });
+
+        // Reader task (runs concurrently)
+        let reader_handle = SpawnedTask::spawn(async move {
+            let mut count = 0;
+            for i in 0..10 {
+                let result = reader.next().await.unwrap().unwrap();
+                assert_eq!(result.num_rows(), 10);
+
+                let col = result
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int32Array>()
+                    .unwrap();
+                assert_eq!(col.value(0), i * 10);
+                count += 1;
+            }
+            count
+        });
+
+        // Wait for both to complete
+        writer_handle.await.unwrap();
+        let batches_read = reader_handle.await.unwrap();
+        assert_eq!(batches_read, 10);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_reader_catches_up_to_writer() -> Result<()> {
+        let (writer, mut reader) = create_spill_channel(1024 * 1024);
+
+        let (reader_waiting_tx, reader_waiting_rx) = tokio::sync::oneshot::channel();
+        let (first_read_done_tx, first_read_done_rx) = tokio::sync::oneshot::channel();
+
+        #[derive(Clone, Copy, Debug, PartialEq, Eq)]
+        enum ReadWriteEvent {
+            ReadStart,
+            Read(usize),
+            Write(usize),
+        }
+
+        let events = Arc::new(Mutex::new(vec![]));
+        // Start reader first (will pend)
+        let reader_events = Arc::clone(&events);
+        let reader_handle = SpawnedTask::spawn(async move {
+            reader_events.lock().push(ReadWriteEvent::ReadStart);
+            reader_waiting_tx
+                .send(())
+                .expect("reader_waiting channel closed unexpectedly");
+            let result = reader.next().await.unwrap().unwrap();
+            reader_events
+                .lock()
+                .push(ReadWriteEvent::Read(result.num_rows()));
+            first_read_done_tx
+                .send(())
+                .expect("first_read_done channel closed unexpectedly");
+            let result = reader.next().await.unwrap().unwrap();
+            reader_events
+                .lock()
+                .push(ReadWriteEvent::Read(result.num_rows()));
+        });
+
+        // Wait until the reader is pending on the first batch
+        reader_waiting_rx
+            .await
+            .expect("reader should signal when waiting");
+
+        // Now write a batch (should wake the reader)
+        let batch = create_test_batch(0, 5);
+        events.lock().push(ReadWriteEvent::Write(batch.num_rows()));
+        writer.push_batch(&batch)?;
+
+        // Wait for the reader to finish the first read before allowing the
+        // second write. This ensures deterministic ordering of events:
+        // 1. The reader starts and pends on the first `next()`
+        // 2. The first write wakes the reader
+        // 3. The reader processes the first batch and signals completion
+        // 4. The second write is issued, ensuring consistent event ordering
+        first_read_done_rx
+            .await
+            .expect("reader should signal when first read completes");
+
+        // Write another batch
+        let batch = create_test_batch(5, 10);
+        events.lock().push(ReadWriteEvent::Write(batch.num_rows()));
+        writer.push_batch(&batch)?;
+
+        // Reader should complete
+        reader_handle.await.unwrap();
+        let events = events.lock().clone();
+        assert_eq!(
+            events,
+            vec![
+                ReadWriteEvent::ReadStart,
+                ReadWriteEvent::Write(5),
+                ReadWriteEvent::Read(5),
+                ReadWriteEvent::Write(10),
+                ReadWriteEvent::Read(10)
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_reader_starts_after_writer_finishes() -> Result<()> {
+        let (writer, reader) = create_spill_channel(128);
+
+        // Writer writes all data
+        for i in 0..5 {
+            let batch = create_test_batch(i * 10, 10);
+            writer.push_batch(&batch)?;
+        }
+
+        drop(writer);
+
+        // Now start reader
+        let mut reader = reader;
+        let mut count = 0;
+        for i in 0..5 {
+            let result = reader.next().await.unwrap()?;
+            assert_eq!(result.num_rows(), 10);
+
+            let col = result
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col.value(0), i * 10);
+            count += 1;
+        }
+
+        assert_eq!(count, 5, "Should read all batches after writer finishes");
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_writer_drop_finalizes_file() -> Result<()> {
+        let env = Arc::new(RuntimeEnv::default());
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = create_test_schema();
+        let spill_manager =
+            Arc::new(SpillManager::new(Arc::clone(&env), metrics.clone(), schema));
+
+        let (writer, mut reader) = channel(1024 * 1024, spill_manager);
+
+        // Write some batches
+        for i in 0..5 {
+            let batch = create_test_batch(i * 10, 10);
+            writer.push_batch(&batch)?;
+        }
+
+        // Check metrics before drop - spilled_bytes already reflects written data
+        let spilled_bytes_before = metrics.spilled_bytes.value();
+        assert_eq!(
+            spilled_bytes_before, 1088,
+            "Spilled bytes should reflect data written (header + 5 batches)"
+        );
+
+        // Explicitly drop the writer - this should finalize the current file
+        drop(writer);
+
+        // Check metrics after drop - spilled_bytes should be > 0 now
+        let spilled_bytes_after = metrics.spilled_bytes.value();
+        assert!(
+            spilled_bytes_after > 0,
+            "Spilled bytes should be > 0 after writer is dropped (got {spilled_bytes_after})"
+        );
+
+        // Verify reader can still read all batches
+        let mut count = 0;
+        for i in 0..5 {
+            let result = reader.next().await.unwrap()?;
+            assert_eq!(result.num_rows(), 10);
+
+            let col = result
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col.value(0), i * 10);
+            count += 1;
+        }
+
+        assert_eq!(count, 5, "Should read all batches after writer is dropped");
+
+        Ok(())
+    }
+
+    /// Verifies that the reader stays alive as long as any writer clone exists.
+    ///
+    /// `SpillPoolWriter` is `Clone`, and in non-preserve-order repartitioning
+    /// mode multiple input partition tasks share clones of the same writer.
+    /// The reader must not see EOF until **all** clones have been dropped,
+    /// even if the queue is temporarily empty between writes from different
+    /// clones.
+    ///
+    /// The test sequence is:
+    ///
+    /// 1. writer1 writes a batch, then is dropped.
+    /// 2. The reader consumes that batch (queue is now empty).
+    /// 3. writer2 (still alive) writes a batch.
+    /// 4. The reader must see that batch.
+    /// 5. EOF is only signalled after writer2 is also dropped.
+    #[tokio::test]
+    async fn test_clone_drop_does_not_signal_eof_prematurely() -> Result<()> {
+        let (writer1, mut reader) = create_spill_channel(1024 * 1024);
+        let writer2 = writer1.clone();
+
+        // Synchronization: tell writer2 when it may proceed.
+        let (proceed_tx, proceed_rx) = tokio::sync::oneshot::channel::<()>();
+
+        // Spawn writer2 — it waits for the signal before writing.
+        let writer2_handle = SpawnedTask::spawn(async move {
+            proceed_rx.await.unwrap();
+            writer2.push_batch(&create_test_batch(10, 10)).unwrap();
+            // writer2 is dropped here (last clone → true EOF)
+        });
+
+        // Writer1 writes one batch, then drops.
+        writer1.push_batch(&create_test_batch(0, 10))?;
+        drop(writer1);
+
+        // Read writer1's batch.
+        let batch1 = reader.next().await.unwrap()?;
+        assert_eq!(batch1.num_rows(), 10);
+        let col = batch1
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(col.value(0), 0);
+
+        // Signal writer2 to write its batch. It will execute when the
+        // current task yields (i.e. when reader.next() returns Pending).
+        proceed_tx.send(()).unwrap();
+
+        // The reader should wait (Pending) for writer2's data, not EOF.
+        let batch2 =
+            tokio::time::timeout(std::time::Duration::from_secs(5), reader.next())
+                .await
+                .expect("Reader timed out — should not hang");
+
+        assert!(
+            batch2.is_some(),
+            "Reader must not return EOF while a writer clone is still alive"
+        );
+        let batch2 = batch2.unwrap()?;
+        assert_eq!(batch2.num_rows(), 10);
+        let col = batch2
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(col.value(0), 10);
+
+        writer2_handle.await.unwrap();
+
+        // All writers dropped — reader should see real EOF now.
+        assert!(reader.next().await.is_none());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_disk_usage_decreases_as_files_consumed() -> Result<()> {
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        // Test configuration
+        const NUM_BATCHES: usize = 3;
+        const ROWS_PER_BATCH: usize = 100;
+
+        // Step 1: Create a test batch and measure its size
+        let batch = create_test_batch(0, ROWS_PER_BATCH);
+        let batch_size = batch.get_array_memory_size();
+
+        // Step 2: Configure file rotation to approximately 1 batch per file
+        // Create a custom RuntimeEnv so we can access the DiskManager
+        let runtime = Arc::new(RuntimeEnvBuilder::default().build()?);
+        let disk_manager = Arc::clone(&runtime.disk_manager);
+
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = create_test_schema();
+        let spill_manager = Arc::new(SpillManager::new(runtime, metrics.clone(), schema));
+
+        let (writer, mut reader) = channel(batch_size, spill_manager);
+
+        // Step 3: Write NUM_BATCHES batches to create approximately NUM_BATCHES files
+        for i in 0..NUM_BATCHES {
+            let start = (i * ROWS_PER_BATCH) as i32;
+            writer.push_batch(&create_test_batch(start, ROWS_PER_BATCH))?;
+        }
+
+        // Check how many files were created (should be at least a few due to file rotation)
+        let file_count = metrics.spill_file_count.value();
+        assert_eq!(
+            file_count,
+            NUM_BATCHES - 1,
+            "Expected at {} files with rotation, got {file_count}",
+            NUM_BATCHES - 1
+        );
+
+        // Step 4: Verify initial disk usage reflects all files
+        let initial_disk_usage = disk_manager.used_disk_space();
+        assert!(
+            initial_disk_usage > 0,
+            "Expected disk usage > 0 after writing batches, got {initial_disk_usage}"
+        );
+
+        // Step 5: Read NUM_BATCHES - 1 batches (all but 1)
+        // As each file is fully consumed, it should be dropped and disk usage should decrease
+        for i in 0..(NUM_BATCHES - 1) {
+            let result = reader.next().await.unwrap()?;
+            assert_eq!(result.num_rows(), ROWS_PER_BATCH);
+
+            let col = result
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col.value(0), (i * ROWS_PER_BATCH) as i32);
+        }
+
+        // Step 6: Verify disk usage decreased but is not zero (at least 1 batch remains)
+        let partial_disk_usage = disk_manager.used_disk_space();
+        assert!(
+            partial_disk_usage > 0
+                && partial_disk_usage < (batch_size * NUM_BATCHES * 2) as u64,
+            "Disk usage should be > 0 with remaining batches"
+        );
+        assert!(
+            partial_disk_usage < initial_disk_usage,
+            "Disk usage should have decreased after reading most batches: initial={initial_disk_usage}, partial={partial_disk_usage}"
+        );
+
+        // Step 7: Read the final batch
+        let result = reader.next().await.unwrap()?;
+        assert_eq!(result.num_rows(), ROWS_PER_BATCH);
+
+        // Step 8: Drop writer first to signal no more data will be written
+        // The reader has infinite stream semantics and will wait for the writer
+        // to be dropped before returning None
+        drop(writer);
+
+        // Verify we've read all batches - now the reader should return None
+        assert!(
+            reader.next().await.is_none(),
+            "Should have no more batches to read"
+        );
+
+        // Step 9: Drop reader to release all references
+        drop(reader);
+
+        // Step 10: Verify complete cleanup - disk usage should be 0
+        let final_disk_usage = disk_manager.used_disk_space();
+        assert_eq!(
+            final_disk_usage, 0,
+            "Disk usage should be 0 after all files dropped, got {final_disk_usage}"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs
index 338ac7d048a33..4b7e707fccedd 100644
--- a/datafusion/physical-plan/src/stream.rs
+++ b/datafusion/physical-plan/src/stream.rs
@@ -22,19 +22,25 @@ use std::sync::Arc;
 use std::task::Context;
 use std::task::Poll;
 
-use super::metrics::BaselineMetrics;
+#[cfg(test)]
+use super::metrics::ExecutionPlanMetricsSet;
+use super::metrics::{BaselineMetrics, SplitMetrics};
 use super::{ExecutionPlan, RecordBatchStream, SendableRecordBatchStream};
 use crate::displayable;
+use crate::spill::get_record_batch_memory_size;
 
 use arrow::{datatypes::SchemaRef, record_batch::RecordBatch};
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_common_runtime::JoinSet;
 use datafusion_execution::TaskContext;
+use datafusion_execution::memory_pool::MemoryReservation;
 
+use futures::ready;
 use futures::stream::BoxStream;
 use futures::{Future, Stream, StreamExt};
 use log::debug;
 use pin_project_lite::pin_project;
+use tokio::runtime::Handle;
 use tokio::sync::mpsc::{Receiver, Sender};
 
 /// Creates a stream from a collection of producing tasks, routing panics to the stream.
@@ -81,6 +87,15 @@ impl<O: Send + 'static> ReceiverStreamBuilder<O> {
         self.join_set.spawn(task);
     }
 
+    /// Same as [`Self::spawn`] but it spawns the task on the provided runtime
+    pub fn spawn_on<F>(&mut self, task: F, handle: &Handle)
+    where
+        F: Future<Output = Result<()>>,
+        F: Send + 'static,
+    {
+        self.join_set.spawn_on(task, handle);
+    }
+
     /// Spawn a blocking task that will be aborted if this builder (or the stream
     /// built from it) are dropped.
     ///
@@ -94,6 +109,15 @@ impl<O: Send + 'static> ReceiverStreamBuilder<O> {
         self.join_set.spawn_blocking(f);
     }
 
+    /// Same as [`Self::spawn_blocking`] but it spawns the blocking task on the provided runtime
+    pub fn spawn_blocking_on<F>(&mut self, f: F, handle: &Handle)
+    where
+        F: FnOnce() -> Result<()>,
+        F: Send + 'static,
+    {
+        self.join_set.spawn_blocking_on(f, handle);
+    }
+
     /// Create a stream of all data written to `tx`
     pub fn build(self) -> BoxStream<'static, Result<O>> {
         let Self {
@@ -185,7 +209,9 @@ impl<O: Send + 'static> ReceiverStreamBuilder<O> {
 /// let schema_1 = Arc::clone(&schema);
 /// builder.spawn(async move {
 ///     // Your task needs to send batches to the tx
-///     tx_1.send(Ok(RecordBatch::new_empty(schema_1))).await.unwrap();
+///     tx_1.send(Ok(RecordBatch::new_empty(schema_1)))
+///         .await
+///         .unwrap();
 ///
 ///     Ok(())
 /// });
@@ -195,7 +221,9 @@ impl<O: Send + 'static> ReceiverStreamBuilder<O> {
 /// let schema_2 = Arc::clone(&schema);
 /// builder.spawn(async move {
 ///     // Your task needs to send batches to the tx
-///     tx_2.send(Ok(RecordBatch::new_empty(schema_2))).await.unwrap();
+///     tx_2.send(Ok(RecordBatch::new_empty(schema_2)))
+///         .await
+///         .unwrap();
 ///
 ///     Ok(())
 /// });
@@ -245,6 +273,15 @@ impl RecordBatchReceiverStreamBuilder {
         self.inner.spawn(task)
     }
 
+    /// Same as [`Self::spawn`] but it spawns the task on the provided runtime.
+    pub fn spawn_on<F>(&mut self, task: F, handle: &Handle)
+    where
+        F: Future<Output = Result<()>>,
+        F: Send + 'static,
+    {
+        self.inner.spawn_on(task, handle)
+    }
+
     /// Spawn a blocking task tied to the builder and stream.
     ///
     /// # Drop / Cancel Behavior
@@ -272,6 +309,15 @@ impl RecordBatchReceiverStreamBuilder {
         self.inner.spawn_blocking(f)
     }
 
+    /// Same as [`Self::spawn_blocking`] but it spawns the blocking task on the provided runtime.
+    pub fn spawn_blocking_on<F>(&mut self, f: F, handle: &Handle)
+    where
+        F: FnOnce() -> Result<()>,
+        F: Send + 'static,
+    {
+        self.inner.spawn_blocking_on(f, handle)
+    }
+
     /// Runs the `partition` of the `input` ExecutionPlan on the
     /// tokio thread pool and writes its outputs to this stream
     ///
@@ -377,9 +423,10 @@ impl<S> RecordBatchStreamAdapter<S> {
     /// # use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
     /// // Create stream of Result<RecordBatch>
     /// let batch = record_batch!(
-    ///   ("a", Int32, [1, 2, 3]),
-    ///   ("b", Float64, [Some(4.0), None, Some(5.0)])
-    /// ).expect("created batch");
+    ///     ("a", Int32, [1, 2, 3]),
+    ///     ("b", Float64, [Some(4.0), None, Some(5.0)])
+    /// )
+    /// .expect("created batch");
     /// let schema = batch.schema();
     /// let stream = futures::stream::iter(vec![Ok(batch)]);
     /// // Convert the stream to a SendableRecordBatchStream
@@ -522,11 +569,207 @@ impl Stream for ObservedStream {
     }
 }
 
+pin_project! {
+    /// Stream wrapper that splits large [`RecordBatch`]es into smaller batches.
+    ///
+    /// This ensures upstream operators receive batches no larger than
+    /// `batch_size`, which can improve parallelism when data sources
+    /// generate very large batches.
+    ///
+    /// # Fields
+    ///
+    /// - `current_batch`: The batch currently being split, if any
+    /// - `offset`: Index of the next row to split from `current_batch`.
+    ///   This tracks our position within the current batch being split.
+    ///
+    /// # Invariants
+    ///
+    /// - `offset` is always ≤ `current_batch.num_rows()` when `current_batch` is `Some`
+    /// - When `current_batch` is `None`, `offset` is always 0
+    /// - `batch_size` is always > 0
+pub struct BatchSplitStream {
+        #[pin]
+        input: SendableRecordBatchStream,
+        schema: SchemaRef,
+        batch_size: usize,
+        metrics: SplitMetrics,
+        current_batch: Option<RecordBatch>,
+        offset: usize,
+    }
+}
+
+impl BatchSplitStream {
+    /// Create a new [`BatchSplitStream`]
+    pub fn new(
+        input: SendableRecordBatchStream,
+        batch_size: usize,
+        metrics: SplitMetrics,
+    ) -> Self {
+        let schema = input.schema();
+        Self {
+            input,
+            schema,
+            batch_size,
+            metrics,
+            current_batch: None,
+            offset: 0,
+        }
+    }
+
+    /// Attempt to produce the next sliced batch from the current batch.
+    ///
+    /// Returns `Some(batch)` if a slice was produced, `None` if the current batch
+    /// is exhausted and we need to poll upstream for more data.
+    fn next_sliced_batch(&mut self) -> Option<Result<RecordBatch>> {
+        let batch = self.current_batch.take()?;
+
+        // Assert slice boundary safety - offset should never exceed batch size
+        debug_assert!(
+            self.offset <= batch.num_rows(),
+            "Offset {} exceeds batch size {}",
+            self.offset,
+            batch.num_rows()
+        );
+
+        let remaining = batch.num_rows() - self.offset;
+        let to_take = remaining.min(self.batch_size);
+        let out = batch.slice(self.offset, to_take);
+
+        self.metrics.batches_split.add(1);
+        self.offset += to_take;
+        if self.offset < batch.num_rows() {
+            // More data remains in this batch, store it back
+            self.current_batch = Some(batch);
+        } else {
+            // Batch is exhausted, reset offset
+            // Note: current_batch is already None since we took it at the start
+            self.offset = 0;
+        }
+        Some(Ok(out))
+    }
+
+    /// Poll the upstream input for the next batch.
+    ///
+    /// Returns the appropriate `Poll` result based on upstream state.
+    /// Small batches are passed through directly, large batches are stored
+    /// for slicing and return the first slice immediately.
+    fn poll_upstream(
+        &mut self,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<RecordBatch>>> {
+        match ready!(self.input.as_mut().poll_next(cx)) {
+            Some(Ok(batch)) => {
+                if batch.num_rows() <= self.batch_size {
+                    // Small batch, pass through directly
+                    Poll::Ready(Some(Ok(batch)))
+                } else {
+                    // Large batch, store for slicing and return first slice
+                    self.current_batch = Some(batch);
+                    // Immediately produce the first slice
+                    match self.next_sliced_batch() {
+                        Some(result) => Poll::Ready(Some(result)),
+                        None => Poll::Ready(None), // Should not happen
+                    }
+                }
+            }
+            Some(Err(e)) => Poll::Ready(Some(Err(e))),
+            None => Poll::Ready(None),
+        }
+    }
+}
+
+impl Stream for BatchSplitStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        // First, try to produce a slice from the current batch
+        if let Some(result) = self.next_sliced_batch() {
+            return Poll::Ready(Some(result));
+        }
+
+        // No current batch or current batch exhausted, poll upstream
+        self.poll_upstream(cx)
+    }
+}
+
+impl RecordBatchStream for BatchSplitStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+/// A stream that holds a memory reservation for its lifetime,
+/// shrinking the reservation as batches are consumed.
+/// The original reservation must have its batch sizes calculated using [`get_record_batch_memory_size`]
+/// On error, the reservation is *NOT* freed, until the stream is dropped.
+pub(crate) struct ReservationStream {
+    schema: SchemaRef,
+    inner: SendableRecordBatchStream,
+    reservation: MemoryReservation,
+}
+
+impl ReservationStream {
+    pub(crate) fn new(
+        schema: SchemaRef,
+        inner: SendableRecordBatchStream,
+        reservation: MemoryReservation,
+    ) -> Self {
+        Self {
+            schema,
+            inner,
+            reservation,
+        }
+    }
+}
+
+impl Stream for ReservationStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        let res = self.inner.poll_next_unpin(cx);
+
+        match res {
+            Poll::Ready(res) => {
+                match res {
+                    Some(Ok(batch)) => {
+                        self.reservation
+                            .shrink(get_record_batch_memory_size(&batch));
+                        Poll::Ready(Some(Ok(batch)))
+                    }
+                    Some(Err(err)) => Poll::Ready(Some(Err(err))),
+                    None => {
+                        // Stream is done so free the reservation completely
+                        self.reservation.free();
+                        Poll::Ready(None)
+                    }
+                }
+            }
+            Poll::Pending => Poll::Pending,
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+impl RecordBatchStream for ReservationStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
     use crate::test::exec::{
-        assert_strong_count_converges_to_zero, BlockingExec, MockExec, PanicExec,
+        BlockingExec, MockExec, PanicExec, assert_strong_count_converges_to_zero,
     };
 
     use arrow::datatypes::{DataType, Field, Schema};
@@ -616,6 +859,44 @@ mod test {
         assert!(stream.next().await.is_none());
     }
 
+    #[tokio::test]
+    async fn batch_split_stream_basic_functionality() {
+        use arrow::array::{Int32Array, RecordBatch};
+        use futures::stream::{self, StreamExt};
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create a large batch that should be split
+        let large_batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from((0..2000).collect::<Vec<_>>()))],
+        )
+        .unwrap();
+
+        // Create a stream with the large batch
+        let input_stream = stream::iter(vec![Ok(large_batch)]);
+        let adapter = RecordBatchStreamAdapter::new(Arc::clone(&schema), input_stream);
+        let batch_stream = Box::pin(adapter) as SendableRecordBatchStream;
+
+        // Create a BatchSplitStream with batch_size = 500
+        let metrics = ExecutionPlanMetricsSet::new();
+        let split_metrics = SplitMetrics::new(&metrics, 0);
+        let mut split_stream = BatchSplitStream::new(batch_stream, 500, split_metrics);
+
+        let mut total_rows = 0;
+        let mut batch_count = 0;
+
+        while let Some(result) = split_stream.next().await {
+            let batch = result.unwrap();
+            assert!(batch.num_rows() <= 500, "Batch size should not exceed 500");
+            total_rows += batch.num_rows();
+            batch_count += 1;
+        }
+
+        assert_eq!(total_rows, 2000, "All rows should be preserved");
+        assert_eq!(batch_count, 4, "Should have 4 batches of 500 rows each");
+    }
+
     /// Consumes all the input's partitions into a
     /// RecordBatchReceiverStream and runs it to completion
     ///
@@ -649,4 +930,186 @@ mod test {
             );
         }
     }
+
+    #[test]
+    fn record_batch_receiver_stream_builder_spawn_on_runtime() {
+        let tokio_runtime = tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+
+        let mut builder =
+            RecordBatchReceiverStreamBuilder::new(Arc::new(Schema::empty()), 10);
+
+        let tx1 = builder.tx();
+        builder.spawn_on(
+            async move {
+                tx1.send(Ok(RecordBatch::new_empty(Arc::new(Schema::empty()))))
+                    .await
+                    .unwrap();
+
+                Ok(())
+            },
+            tokio_runtime.handle(),
+        );
+
+        let tx2 = builder.tx();
+        builder.spawn_blocking_on(
+            move || {
+                tx2.blocking_send(Ok(RecordBatch::new_empty(Arc::new(Schema::empty()))))
+                    .unwrap();
+
+                Ok(())
+            },
+            tokio_runtime.handle(),
+        );
+
+        let mut stream = builder.build();
+
+        let mut number_of_batches = 0;
+
+        loop {
+            let poll = stream.poll_next_unpin(&mut Context::from_waker(
+                futures::task::noop_waker_ref(),
+            ));
+
+            match poll {
+                Poll::Ready(None) => {
+                    break;
+                }
+                Poll::Ready(Some(Ok(batch))) => {
+                    number_of_batches += 1;
+                    assert_eq!(batch.num_rows(), 0);
+                }
+                Poll::Ready(Some(Err(e))) => panic!("Unexpected error: {e}"),
+                Poll::Pending => {
+                    continue;
+                }
+            }
+        }
+
+        assert_eq!(
+            number_of_batches, 2,
+            "Should have received exactly two empty batches"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_reservation_stream_shrinks_on_poll() {
+        use arrow::array::Int32Array;
+        use datafusion_execution::memory_pool::MemoryConsumer;
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        let runtime = RuntimeEnvBuilder::new()
+            .with_memory_limit(10 * 1024 * 1024, 1.0)
+            .build_arc()
+            .unwrap();
+
+        let reservation = MemoryConsumer::new("test").register(&runtime.memory_pool);
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        // Create batches
+        let batch1 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
+        )
+        .unwrap();
+        let batch2 = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(vec![6, 7, 8, 9, 10]))],
+        )
+        .unwrap();
+
+        let batch1_size = get_record_batch_memory_size(&batch1);
+        let batch2_size = get_record_batch_memory_size(&batch2);
+
+        // Reserve memory upfront
+        reservation.try_grow(batch1_size + batch2_size).unwrap();
+        let initial_reserved = runtime.memory_pool.reserved();
+        assert_eq!(initial_reserved, batch1_size + batch2_size);
+
+        // Create stream with batches
+        let stream = futures::stream::iter(vec![Ok(batch1), Ok(batch2)]);
+        let inner = Box::pin(RecordBatchStreamAdapter::new(Arc::clone(&schema), stream))
+            as SendableRecordBatchStream;
+
+        let mut res_stream =
+            ReservationStream::new(Arc::clone(&schema), inner, reservation);
+
+        // Poll first batch
+        let result1 = res_stream.next().await;
+        assert!(result1.is_some());
+
+        // Memory should be reduced by batch1_size
+        let after_first = runtime.memory_pool.reserved();
+        assert_eq!(after_first, batch2_size);
+
+        // Poll second batch
+        let result2 = res_stream.next().await;
+        assert!(result2.is_some());
+
+        // Memory should be reduced by batch2_size
+        let after_second = runtime.memory_pool.reserved();
+        assert_eq!(after_second, 0);
+
+        // Poll None (end of stream)
+        let result3 = res_stream.next().await;
+        assert!(result3.is_none());
+
+        // Memory should still be 0
+        assert_eq!(runtime.memory_pool.reserved(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_reservation_stream_error_handling() {
+        use datafusion_execution::memory_pool::MemoryConsumer;
+        use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+
+        let runtime = RuntimeEnvBuilder::new()
+            .with_memory_limit(10 * 1024 * 1024, 1.0)
+            .build_arc()
+            .unwrap();
+
+        let reservation = MemoryConsumer::new("test").register(&runtime.memory_pool);
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        reservation.try_grow(1000).unwrap();
+        let initial = runtime.memory_pool.reserved();
+        assert_eq!(initial, 1000);
+
+        // Create a stream that errors
+        let stream = futures::stream::iter(vec![exec_err!("Test error")]);
+        let inner = Box::pin(RecordBatchStreamAdapter::new(Arc::clone(&schema), stream))
+            as SendableRecordBatchStream;
+
+        let mut res_stream =
+            ReservationStream::new(Arc::clone(&schema), inner, reservation);
+
+        // Get the error
+        let result = res_stream.next().await;
+        assert!(result.is_some());
+        assert!(result.unwrap().is_err());
+
+        // Verify reservation is NOT automatically freed on error
+        // The reservation is only freed when poll_next returns Poll::Ready(None)
+        // After an error, the stream may continue to hold the reservation
+        // until it's explicitly dropped or polled to None
+        let after_error = runtime.memory_pool.reserved();
+        assert_eq!(
+            after_error, 1000,
+            "Reservation should still be held after error"
+        );
+
+        // Drop the stream to free the reservation
+        drop(res_stream);
+
+        // Now memory should be freed
+        assert_eq!(
+            runtime.memory_pool.reserved(),
+            0,
+            "Memory should be freed when stream is dropped"
+        );
+    }
 }
diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs
index 6274995d04dab..5a1206629ac7b 100644
--- a/datafusion/physical-plan/src/streaming.rs
+++ b/datafusion/physical-plan/src/streaming.rs
@@ -22,20 +22,23 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use super::{DisplayAs, DisplayFormatType, PlanProperties};
-use crate::display::{display_orderings, ProjectSchemaDisplay};
-use crate::execution_plan::{Boundedness, EmissionType};
+use crate::coop::make_cooperative;
+use crate::display::{ProjectSchemaDisplay, display_orderings};
+use crate::execution_plan::{Boundedness, EmissionType, SchedulingType};
 use crate::limit::LimitStream;
 use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use crate::projection::{
-    all_alias_free_columns, new_projections_for_columns, update_expr, ProjectionExec,
+    ProjectionExec, all_alias_free_columns, new_projections_for_columns, update_ordering,
 };
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream};
 
 use arrow::datatypes::{Schema, SchemaRef};
-use datafusion_common::{internal_err, plan_err, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, internal_err, plan_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalSortExpr};
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::{EquivalenceProperties, LexOrdering};
 
 use async_trait::async_trait;
 use futures::stream::StreamExt;
@@ -66,7 +69,7 @@ pub struct StreamingTableExec {
     projected_output_ordering: Vec<LexOrdering>,
     infinite: bool,
     limit: Option<usize>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     metrics: ExecutionPlanMetricsSet,
 }
 
@@ -99,7 +102,7 @@ impl StreamingTableExec {
             projected_output_ordering.into_iter().collect::<Vec<_>>();
         let cache = Self::compute_properties(
             Arc::clone(&projected_schema),
-            &projected_output_ordering,
+            projected_output_ordering.clone(),
             &partitions,
             infinite,
         );
@@ -110,7 +113,7 @@ impl StreamingTableExec {
             projected_output_ordering,
             infinite,
             limit,
-            cache,
+            cache: Arc::new(cache),
             metrics: ExecutionPlanMetricsSet::new(),
         })
     }
@@ -146,7 +149,7 @@ impl StreamingTableExec {
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
         schema: SchemaRef,
-        orderings: &[LexOrdering],
+        orderings: Vec<LexOrdering>,
         partitions: &[Arc<dyn PartitionStream>],
         infinite: bool,
     ) -> PlanProperties {
@@ -168,6 +171,7 @@ impl StreamingTableExec {
             EmissionType::Incremental,
             boundedness,
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
     }
 }
 
@@ -234,7 +238,7 @@ impl ExecutionPlan for StreamingTableExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -246,6 +250,13 @@ impl ExecutionPlan for StreamingTableExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
@@ -262,7 +273,7 @@ impl ExecutionPlan for StreamingTableExec {
         partition: usize,
         ctx: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        let stream = self.partitions[partition].execute(ctx);
+        let stream = self.partitions[partition].execute(Arc::clone(&ctx));
         let projected_stream = match self.projection.clone() {
             Some(projection) => Box::pin(RecordBatchStreamAdapter::new(
                 Arc::clone(&self.projected_schema),
@@ -272,16 +283,13 @@ impl ExecutionPlan for StreamingTableExec {
             )),
             None => stream,
         };
+        let stream = make_cooperative(projected_stream);
+
         Ok(match self.limit {
-            None => projected_stream,
+            None => stream,
             Some(fetch) => {
                 let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
-                Box::pin(LimitStream::new(
-                    projected_stream,
-                    0,
-                    Some(fetch),
-                    baseline_metrics,
-                ))
+                Box::pin(LimitStream::new(stream, 0, Some(fetch), baseline_metrics))
             }
         })
     }
@@ -300,26 +308,17 @@ impl ExecutionPlan for StreamingTableExec {
         let streaming_table_projections =
             self.projection().as_ref().map(|i| i.as_ref().to_vec());
         let new_projections = new_projections_for_columns(
-            projection,
+            projection.expr(),
             &streaming_table_projections
                 .unwrap_or_else(|| (0..self.schema().fields().len()).collect()),
         );
 
         let mut lex_orderings = vec![];
-        for lex_ordering in self.projected_output_ordering().into_iter() {
-            let mut orderings = LexOrdering::default();
-            for order in lex_ordering {
-                let Some(new_ordering) =
-                    update_expr(&order.expr, projection.expr(), false)?
-                else {
-                    return Ok(None);
-                };
-                orderings.push(PhysicalSortExpr {
-                    expr: new_ordering,
-                    options: order.options,
-                });
-            }
-            lex_orderings.push(orderings);
+        for ordering in self.projected_output_ordering().into_iter() {
+            let Some(ordering) = update_ordering(ordering, projection.expr())? else {
+                return Ok(None);
+            };
+            lex_orderings.push(ordering);
         }
 
         StreamingTableExec::try_new(
@@ -345,7 +344,7 @@ impl ExecutionPlan for StreamingTableExec {
             projected_output_ordering: self.projected_output_ordering.clone(),
             infinite: self.infinite,
             limit,
-            cache: self.cache.clone(),
+            cache: Arc::clone(&self.cache),
             metrics: self.metrics.clone(),
         }))
     }
@@ -356,7 +355,7 @@ mod test {
     use super::*;
     use crate::collect_partitioned;
     use crate::streaming::PartitionStream;
-    use crate::test::{make_partition, TestPartitionStream};
+    use crate::test::{TestPartitionStream, make_partition};
     use arrow::record_batch::RecordBatch;
 
     #[tokio::test]
diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs
index 4d5244e0e1d45..0630b8f174563 100644
--- a/datafusion/physical-plan/src/test.rs
+++ b/datafusion/physical-plan/src/test.rs
@@ -25,24 +25,29 @@ use std::pin::Pin;
 use std::sync::Arc;
 use std::task::Context;
 
+use crate::ExecutionPlan;
 use crate::common;
 use crate::execution_plan::{Boundedness, EmissionType};
 use crate::memory::MemoryStream;
 use crate::metrics::MetricsSet;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::streaming::PartitionStream;
-use crate::ExecutionPlan;
 use crate::{DisplayAs, DisplayFormatType, PlanProperties};
 
 use arrow::array::{Array, ArrayRef, Int32Array, RecordBatch};
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{
-    config::ConfigOptions, internal_err, project_schema, Result, Statistics,
+    Result, Statistics, assert_or_internal_err, config::ConfigOptions, project_schema,
 };
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
+use datafusion_physical_expr::equivalence::{
+    OrderingEquivalenceClass, ProjectionMapping,
+};
+use datafusion_physical_expr::expressions::Column;
+use datafusion_physical_expr::utils::collect_columns;
 use datafusion_physical_expr::{
-    equivalence::ProjectionMapping, expressions::Column, utils::collect_columns,
-    EquivalenceProperties, LexOrdering, Partitioning,
+    EquivalenceProperties, LexOrdering, Partitioning, PhysicalExpr,
 };
 
 use futures::{Future, FutureExt};
@@ -73,7 +78,7 @@ pub struct TestMemoryExec {
     /// The maximum number of records to read from this plan. If `None`,
     /// all records after filtering are returned.
     fetch: Option<usize>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl DisplayAs for TestMemoryExec {
@@ -103,10 +108,10 @@ impl DisplayAs for TestMemoryExec {
                     .map_or(String::new(), |limit| format!(", fetch={limit}"));
                 if self.show_sizes {
                     write!(
-                                f,
-                                "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
-                                partition_sizes.len(),
-                            )
+                        f,
+                        "partitions={}, partition_sizes={partition_sizes:?}{limit}{output_ordering}{constraints}",
+                        partition_sizes.len(),
+                    )
                 } else {
                     write!(
                         f,
@@ -129,10 +134,10 @@ impl ExecutionPlan for TestMemoryExec {
     }
 
     fn as_any(&self) -> &dyn Any {
-        unimplemented!()
+        self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -140,11 +145,25 @@ impl ExecutionPlan for TestMemoryExec {
         Vec::new()
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        // Apply to all sort information orderings
+        let mut tnr = TreeNodeRecursion::Continue;
+        for ordering in &self.sort_information {
+            for sort_expr in ordering {
+                tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        unimplemented!()
+        Ok(self)
     }
 
     fn repartitioned(
@@ -167,15 +186,11 @@ impl ExecutionPlan for TestMemoryExec {
         unimplemented!()
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.statistics_inner()
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            Ok(Statistics::new_unknown(&self.schema))
+            Ok(Arc::new(Statistics::new_unknown(&self.schema)))
         } else {
-            self.statistics_inner()
+            Ok(Arc::new(self.statistics_inner()?))
         }
     }
 
@@ -216,7 +231,7 @@ impl TestMemoryExec {
     fn eq_properties(&self) -> EquivalenceProperties {
         EquivalenceProperties::new_with_orderings(
             Arc::clone(&self.projected_schema),
-            self.sort_information.as_slice(),
+            self.sort_information.clone(),
         )
     }
 
@@ -237,15 +252,15 @@ impl TestMemoryExec {
         Ok(Self {
             partitions: partitions.to_vec(),
             schema,
-            cache: PlanProperties::new(
+            cache: Arc::new(PlanProperties::new(
                 EquivalenceProperties::new_with_orderings(
                     Arc::clone(&projected_schema),
-                    vec![].as_slice(),
+                    Vec::<LexOrdering>::new(),
                 ),
                 Partitioning::UnknownPartitioning(partitions.len()),
                 EmissionType::Incremental,
                 Boundedness::Bounded,
-            ),
+            )),
             projected_schema,
             projection,
             sort_information: vec![],
@@ -263,16 +278,15 @@ impl TestMemoryExec {
     ) -> Result<Arc<TestMemoryExec>> {
         let mut source = Self::try_new(partitions, schema, projection)?;
         let cache = source.compute_properties();
-        source.cache = cache;
+        source.cache = Arc::new(cache);
         Ok(Arc::new(source))
     }
 
     // Equivalent of `DataSourceExec::new`
-    pub fn update_cache(source: Arc<TestMemoryExec>) -> TestMemoryExec {
+    pub fn update_cache(source: &Arc<TestMemoryExec>) -> TestMemoryExec {
         let cache = source.compute_properties();
-        let source = &*source;
-        let mut source = source.clone();
-        source.cache = cache;
+        let mut source = (**source).clone();
+        source.cache = Arc::new(cache);
         source
     }
 
@@ -298,7 +312,7 @@ impl TestMemoryExec {
     }
 
     /// refer to `try_with_sort_information` at MemorySourceConfig for more information.
-    /// https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs
+    /// <https://github.com/apache/datafusion/tree/main/datafusion/datasource/src/memory.rs>
     pub fn try_with_sort_information(
         mut self,
         mut sort_information: Vec<LexOrdering>,
@@ -315,36 +329,33 @@ impl TestMemoryExec {
                     .map(|field| field.name() != col.name())
                     .unwrap_or(true)
             });
-        if let Some(col) = ambiguous_column {
-            return internal_err!(
-                "Column {:?} is not found in the original schema of the TestMemoryExec",
-                col
-            );
-        }
+        assert_or_internal_err!(
+            ambiguous_column.is_none(),
+            "Column {:?} is not found in the original schema of the TestMemoryExec",
+            ambiguous_column.as_ref().unwrap()
+        );
 
         // If there is a projection on the source, we also need to project orderings
         if let Some(projection) = &self.projection {
+            let base_schema = self.original_schema();
+            let proj_exprs = projection.iter().map(|idx| {
+                let name = base_schema.field(*idx).name();
+                (Arc::new(Column::new(name, *idx)) as _, name.to_string())
+            });
+            let projection_mapping =
+                ProjectionMapping::try_new(proj_exprs, &base_schema)?;
             let base_eqp = EquivalenceProperties::new_with_orderings(
-                self.original_schema(),
-                &sort_information,
+                Arc::clone(&base_schema),
+                sort_information,
             );
-            let proj_exprs = projection
-                .iter()
-                .map(|idx| {
-                    let base_schema = self.original_schema();
-                    let name = base_schema.field(*idx).name();
-                    (Arc::new(Column::new(name, *idx)) as _, name.to_string())
-                })
-                .collect::<Vec<_>>();
-            let projection_mapping =
-                ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?;
-            sort_information = base_eqp
-                .project(&projection_mapping, Arc::clone(&self.projected_schema))
-                .into_oeq_class()
-                .into_inner();
+            let proj_eqp =
+                base_eqp.project(&projection_mapping, Arc::clone(&self.projected_schema));
+            let oeq_class: OrderingEquivalenceClass = proj_eqp.into();
+            sort_information = oeq_class.into();
         }
 
         self.sort_information = sort_information;
+        self.cache = Arc::new(self.compute_properties());
         Ok(self)
     }
 
@@ -523,3 +534,33 @@ impl PartitionStream for TestPartitionStream {
         ))
     }
 }
+
+#[cfg(test)]
+macro_rules! assert_join_metrics {
+    ($metrics:expr, $expected_rows:expr) => {
+        assert_eq!($metrics.output_rows().unwrap(), $expected_rows);
+
+        let elapsed_compute = $metrics
+            .elapsed_compute()
+            .expect("did not find elapsed_compute metric");
+        let join_time = $metrics
+            .sum_by_name("join_time")
+            .expect("did not find join_time metric")
+            .as_usize();
+        let build_time = $metrics
+            .sum_by_name("build_time")
+            .expect("did not find build_time metric")
+            .as_usize();
+        // ensure join_time and build_time are considered in elapsed_compute
+        assert!(
+            join_time + build_time <= elapsed_compute,
+            "join_time ({}) + build_time ({}) = {} was <= elapsed_compute = {}",
+            join_time,
+            build_time,
+            join_time + build_time,
+            elapsed_compute
+        );
+    };
+}
+#[cfg(test)]
+pub(crate) use assert_join_metrics;
diff --git a/datafusion/physical-plan/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs
index 12ffca871f073..5458fa7ab8264 100644
--- a/datafusion/physical-plan/src/test/exec.rs
+++ b/datafusion/physical-plan/src/test/exec.rs
@@ -17,28 +17,29 @@
 
 //! Simple iterator over batches for use in testing
 
-use std::{
-    any::Any,
-    pin::Pin,
-    sync::{Arc, Weak},
-    task::{Context, Poll},
-};
-
 use crate::{
-    common, execution_plan::Boundedness, DisplayAs, DisplayFormatType, ExecutionPlan,
-    Partitioning, PlanProperties, RecordBatchStream, SendableRecordBatchStream,
-    Statistics,
+    DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
+    RecordBatchStream, SendableRecordBatchStream, Statistics, common,
+    execution_plan::Boundedness,
 };
 use crate::{
     execution_plan::EmissionType,
     stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter},
 };
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::{
+    any::Any,
+    pin::Pin,
+    sync::{Arc, Weak},
+    task::{Context, Poll},
+};
 
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{internal_err, DataFusionError, Result};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{DataFusionError, Result, internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
+use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
 
 use futures::Stream;
 use tokio::sync::Barrier;
@@ -125,7 +126,7 @@ pub struct MockExec {
     /// if true (the default), sends data using a separate task to ensure the
     /// batches are not available without this stream yielding first
     use_task: bool,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl MockExec {
@@ -142,7 +143,7 @@ impl MockExec {
             data,
             schema,
             use_task: true,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -192,7 +193,7 @@ impl ExecutionPlan for MockExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -200,6 +201,13 @@ impl ExecutionPlan for MockExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -254,13 +262,9 @@ impl ExecutionPlan for MockExec {
     }
 
     // Panics if one of the batches is an error
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema));
+            return Ok(Arc::new(Statistics::new_unknown(&self.schema)));
         }
         let data: Result<Vec<_>> = self
             .data
@@ -273,11 +277,11 @@ impl ExecutionPlan for MockExec {
 
         let data = data?;
 
-        Ok(common::compute_record_batch_statistics(
+        Ok(Arc::new(common::compute_record_batch_statistics(
             &[data],
             &self.schema,
             None,
-        ))
+        )))
     }
 }
 
@@ -291,7 +295,6 @@ fn clone_error(e: &DataFusionError) -> DataFusionError {
 
 /// A Mock ExecutionPlan that does not start producing input until a
 /// barrier is called
-///
 #[derive(Debug)]
 pub struct BarrierExec {
     /// partitions to send back
@@ -299,29 +302,91 @@ pub struct BarrierExec {
     schema: SchemaRef,
 
     /// all streams wait on this barrier to produce
-    barrier: Arc<Barrier>,
-    cache: PlanProperties,
+    start_data_barrier: Option<Arc<Barrier>>,
+
+    /// the stream wait for this to return Poll::Ready(None)
+    finish_barrier: Option<Arc<(Barrier, AtomicUsize)>>,
+
+    cache: Arc<PlanProperties>,
+
+    log: bool,
 }
 
 impl BarrierExec {
     /// Create a new exec with some number of partitions.
     pub fn new(data: Vec<Vec<RecordBatch>>, schema: SchemaRef) -> Self {
         // wait for all streams and the input
-        let barrier = Arc::new(Barrier::new(data.len() + 1));
+        let barrier = Some(Arc::new(Barrier::new(data.len() + 1)));
         let cache = Self::compute_properties(Arc::clone(&schema), &data);
         Self {
             data,
             schema,
-            barrier,
-            cache,
+            start_data_barrier: barrier,
+            cache: Arc::new(cache),
+            finish_barrier: None,
+            log: true,
         }
     }
 
+    pub fn with_log(mut self, log: bool) -> Self {
+        self.log = log;
+        self
+    }
+
+    pub fn without_start_barrier(mut self) -> Self {
+        self.start_data_barrier = None;
+        self
+    }
+
+    pub fn with_finish_barrier(mut self) -> Self {
+        let barrier = Arc::new((
+            // wait for all streams and the input
+            Barrier::new(self.data.len() + 1),
+            AtomicUsize::new(0),
+        ));
+
+        self.finish_barrier = Some(barrier);
+        self
+    }
+
     /// wait until all the input streams and this function is ready
     pub async fn wait(&self) {
-        println!("BarrierExec::wait waiting on barrier");
-        self.barrier.wait().await;
-        println!("BarrierExec::wait done waiting");
+        let barrier = &self
+            .start_data_barrier
+            .as_ref()
+            .expect("Must only be called when having a start barrier");
+        if self.log {
+            println!("BarrierExec::wait waiting on barrier");
+        }
+        barrier.wait().await;
+        if self.log {
+            println!("BarrierExec::wait done waiting");
+        }
+    }
+
+    pub async fn wait_finish(&self) {
+        let (barrier, _) = &self
+            .finish_barrier
+            .as_deref()
+            .expect("Must only be called when having a finish barrier");
+
+        if self.log {
+            println!("BarrierExec::wait_finish waiting on barrier");
+        }
+        barrier.wait().await;
+        if self.log {
+            println!("BarrierExec::wait_finish done waiting");
+        }
+    }
+
+    /// Return true if the finish barrier has been reached in all partitions
+    pub fn is_finish_barrier_reached(&self) -> bool {
+        let (_, reached_finish) = self
+            .finish_barrier
+            .as_deref()
+            .expect("Must only be called when having finish barrier");
+
+        reached_finish.load(Ordering::Relaxed) == self.data.len()
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -365,7 +430,7 @@ impl ExecutionPlan for BarrierExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -380,6 +445,13 @@ impl ExecutionPlan for BarrierExec {
         unimplemented!()
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     /// Returns a stream which yields data
     fn execute(
         &self,
@@ -392,17 +464,32 @@ impl ExecutionPlan for BarrierExec {
 
         // task simply sends data in order after barrier is reached
         let data = self.data[partition].clone();
-        let b = Arc::clone(&self.barrier);
+        let start_barrier = self.start_data_barrier.as_ref().map(Arc::clone);
+        let finish_barrier = self.finish_barrier.as_ref().map(Arc::clone);
+        let log = self.log;
         let tx = builder.tx();
         builder.spawn(async move {
-            println!("Partition {partition} waiting on barrier");
-            b.wait().await;
+            if let Some(barrier) = start_barrier {
+                if log {
+                    println!("Partition {partition} waiting on barrier");
+                }
+                barrier.wait().await;
+            }
             for batch in data {
-                println!("Partition {partition} sending batch");
+                if log {
+                    println!("Partition {partition} sending batch");
+                }
                 if let Err(e) = tx.send(Ok(batch)).await {
                     println!("ERROR batch via barrier stream stream: {e}");
                 }
             }
+            if let Some((barrier, reached_finish)) = finish_barrier.as_deref() {
+                if log {
+                    println!("Partition {partition} waiting on finish barrier");
+                }
+                reached_finish.fetch_add(1, Ordering::Relaxed);
+                barrier.wait().await;
+            }
 
             Ok(())
         });
@@ -411,26 +498,22 @@ impl ExecutionPlan for BarrierExec {
         Ok(builder.build())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema));
+            return Ok(Arc::new(Statistics::new_unknown(&self.schema)));
         }
-        Ok(common::compute_record_batch_statistics(
+        Ok(Arc::new(common::compute_record_batch_statistics(
             &self.data,
             &self.schema,
             None,
-        ))
+        )))
     }
 }
 
 /// A mock execution plan that errors on a call to execute
 #[derive(Debug)]
 pub struct ErrorExec {
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl Default for ErrorExec {
@@ -447,7 +530,9 @@ impl ErrorExec {
             true,
         )]));
         let cache = Self::compute_properties(schema);
-        Self { cache }
+        Self {
+            cache: Arc::new(cache),
+        }
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
@@ -488,7 +573,7 @@ impl ExecutionPlan for ErrorExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -503,6 +588,13 @@ impl ExecutionPlan for ErrorExec {
         unimplemented!()
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     /// Returns a stream which yields data
     fn execute(
         &self,
@@ -518,20 +610,20 @@ impl ExecutionPlan for ErrorExec {
 pub struct StatisticsExec {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 impl StatisticsExec {
     pub fn new(stats: Statistics, schema: Schema) -> Self {
         assert_eq!(
-            stats
-                .column_statistics.len(), schema.fields().len(),
+            stats.column_statistics.len(),
+            schema.fields().len(),
             "if defined, the column statistics vector length should be the number of fields"
         );
         let cache = Self::compute_properties(Arc::new(schema.clone()));
         Self {
             stats,
             schema: Arc::new(schema),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -578,7 +670,7 @@ impl ExecutionPlan for StatisticsExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -586,6 +678,13 @@ impl ExecutionPlan for StatisticsExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
@@ -601,16 +700,12 @@ impl ExecutionPlan for StatisticsExec {
         unimplemented!("This plan only serves for testing statistics")
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(self.stats.clone())
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        Ok(if partition.is_some() {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        Ok(Arc::new(if partition.is_some() {
             Statistics::new_unknown(&self.schema)
         } else {
             self.stats.clone()
-        })
+        }))
     }
 }
 
@@ -624,7 +719,7 @@ pub struct BlockingExec {
 
     /// Ref-counting helper to check if the plan and the produced stream are still in memory.
     refs: Arc<()>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl BlockingExec {
@@ -634,7 +729,7 @@ impl BlockingExec {
         Self {
             schema,
             refs: Default::default(),
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -685,7 +780,7 @@ impl ExecutionPlan for BlockingExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -701,6 +796,13 @@ impl ExecutionPlan for BlockingExec {
         internal_err!("Children cannot be replaced in {self:?}")
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn execute(
         &self,
         _partition: usize,
@@ -767,7 +869,7 @@ pub struct PanicExec {
     /// Number of output partitions. Each partition will produce this
     /// many empty output record batches prior to panicking
     batches_until_panics: Vec<usize>,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl PanicExec {
@@ -779,7 +881,7 @@ impl PanicExec {
         Self {
             schema,
             batches_until_panics,
-            cache,
+            cache: Arc::new(cache),
         }
     }
 
@@ -831,7 +933,7 @@ impl ExecutionPlan for PanicExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -840,6 +942,13 @@ impl ExecutionPlan for PanicExec {
         vec![]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         _: Vec<Arc<dyn ExecutionPlan>>,
diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs
index 0b5780b9143f9..e0b91f25161c0 100644
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -18,25 +18,35 @@
 //! TopK: Combination of Sort / LIMIT
 
 use arrow::{
-    compute::interleave_record_batch,
+    array::{Array, AsArray},
+    compute::{FilterBuilder, interleave_record_batch, prep_null_mask_filter},
     row::{RowConverter, Rows, SortField},
 };
+use datafusion_expr::{ColumnarValue, Operator};
 use std::mem::size_of;
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder};
+use super::metrics::{
+    BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, RecordOutput,
+};
 use crate::spill::get_record_batch_memory_size;
-use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
+use crate::{SendableRecordBatchStream, stream::RecordBatchStreamAdapter};
+
 use arrow::array::{ArrayRef, RecordBatch};
 use arrow::datatypes::SchemaRef;
-use datafusion_common::Result;
-use datafusion_common::{internal_datafusion_err, HashMap};
+use datafusion_common::{
+    HashMap, Result, ScalarValue, internal_datafusion_err, internal_err,
+};
 use datafusion_execution::{
     memory_pool::{MemoryConsumer, MemoryReservation},
     runtime_env::RuntimeEnv,
 };
-use datafusion_physical_expr::PhysicalSortExpr;
-use datafusion_physical_expr_common::sort_expr::LexOrdering;
+use datafusion_physical_expr::{
+    PhysicalExpr,
+    expressions::{BinaryExpr, DynamicFilterPhysicalExpr, is_not_null, is_null, lit},
+};
+use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
+use parking_lot::RwLock;
 
 /// Global TopK
 ///
@@ -102,7 +112,7 @@ pub struct TopK {
     /// The target number of rows for output batches
     batch_size: usize,
     /// sort expressions
-    expr: Arc<[PhysicalSortExpr]>,
+    expr: LexOrdering,
     /// row converter, for sort keys
     row_converter: RowConverter,
     /// scratch space for converting rows
@@ -113,17 +123,47 @@ pub struct TopK {
     common_sort_prefix_converter: Option<RowConverter>,
     /// Common sort prefix between the input and the sort expressions to allow early exit optimization
     common_sort_prefix: Arc<[PhysicalSortExpr]>,
+    /// Filter matching the state of the `TopK` heap used for dynamic filter pushdown
+    filter: Arc<RwLock<TopKDynamicFilters>>,
     /// If true, indicates that all rows of subsequent batches are guaranteed
     /// to be greater (by byte order, after row conversion) than the top K,
     /// which means the top K won't change and the computation can be finished early.
     pub(crate) finished: bool,
 }
 
+/// For more background, please also see the [Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries blog]
+///
+/// [Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries blog]: https://datafusion.apache.org/blog/2025/09/10/dynamic-filters
+#[derive(Debug, Clone)]
+pub struct TopKDynamicFilters {
+    /// The current *global* threshold for the dynamic filter.
+    /// This is shared across all partitions and is updated by any of them.
+    /// Stored as row bytes for efficient comparison.
+    threshold_row: Option<Vec<u8>>,
+    /// The expression used to evaluate the dynamic filter
+    /// Only updated when lock held for the duration of the update
+    expr: Arc<DynamicFilterPhysicalExpr>,
+}
+
+impl TopKDynamicFilters {
+    /// Create a new `TopKDynamicFilters` with the given expression
+    pub fn new(expr: Arc<DynamicFilterPhysicalExpr>) -> Self {
+        Self {
+            threshold_row: None,
+            expr,
+        }
+    }
+
+    pub fn expr(&self) -> Arc<DynamicFilterPhysicalExpr> {
+        Arc::clone(&self.expr)
+    }
+}
+
 // Guesstimate for memory allocation: estimated number of bytes used per row in the RowConverter
 const ESTIMATED_BYTES_PER_ROW: usize = 20;
 
 fn build_sort_fields(
-    ordering: &LexOrdering,
+    ordering: &[PhysicalSortExpr],
     schema: &SchemaRef,
 ) -> Result<Vec<SortField>> {
     ordering
@@ -141,21 +181,23 @@ impl TopK {
     /// Create a new [`TopK`] that stores the top `k` values, as
     /// defined by the sort expressions in `expr`.
     // TODO: make a builder or some other nicer API
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
+    #[expect(clippy::needless_pass_by_value)]
     pub fn try_new(
         partition_id: usize,
         schema: SchemaRef,
-        common_sort_prefix: LexOrdering,
+        common_sort_prefix: Vec<PhysicalSortExpr>,
         expr: LexOrdering,
         k: usize,
         batch_size: usize,
         runtime: Arc<RuntimeEnv>,
         metrics: &ExecutionPlanMetricsSet,
+        filter: Arc<RwLock<TopKDynamicFilters>>,
     ) -> Result<Self> {
         let reservation = MemoryConsumer::new(format!("TopK[{partition_id}]"))
             .register(&runtime.memory_pool);
 
-        let sort_fields: Vec<_> = build_sort_fields(&expr, &schema)?;
+        let sort_fields = build_sort_fields(&expr, &schema)?;
 
         // TODO there is potential to add special cases for single column sort fields
         // to improve performance
@@ -166,8 +208,7 @@ impl TopK {
         let prefix_row_converter = if common_sort_prefix.is_empty() {
             None
         } else {
-            let input_sort_fields: Vec<_> =
-                build_sort_fields(&common_sort_prefix, &schema)?;
+            let input_sort_fields = build_sort_fields(&common_sort_prefix, &schema)?;
             Some(RowConverter::new(input_sort_fields)?)
         };
 
@@ -176,24 +217,26 @@ impl TopK {
             metrics: TopKMetrics::new(metrics, partition_id),
             reservation,
             batch_size,
-            expr: Arc::from(expr),
+            expr,
             row_converter,
             scratch_rows,
             heap: TopKHeap::new(k, batch_size),
             common_sort_prefix_converter: prefix_row_converter,
             common_sort_prefix: Arc::from(common_sort_prefix),
             finished: false,
+            filter,
         })
     }
 
     /// Insert `batch`, remembering if any of its values are among
     /// the top k seen so far.
+    #[expect(clippy::needless_pass_by_value)]
     pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> {
         // Updates on drop
         let baseline = self.metrics.baseline.clone();
         let _timer = baseline.elapsed_compute().timer();
 
-        let sort_keys: Vec<ArrayRef> = self
+        let mut sort_keys: Vec<ArrayRef> = self
             .expr
             .iter()
             .map(|expr| {
@@ -202,43 +245,268 @@ impl TopK {
             })
             .collect::<Result<Vec<_>>>()?;
 
+        let mut selected_rows = None;
+
+        // If a filter is provided, update it with the new rows
+        let filter = self.filter.read().expr.current()?;
+        let filtered = filter.evaluate(&batch)?;
+        let num_rows = batch.num_rows();
+        let array = filtered.into_array(num_rows)?;
+        let mut filter = array.as_boolean().clone();
+        let true_count = filter.true_count();
+        if true_count == 0 {
+            // nothing to filter, so no need to update
+            return Ok(());
+        }
+        // only update the keys / rows if the filter does not match all rows
+        if true_count < num_rows {
+            // Indices in `set_indices` should be correct if filter contains nulls
+            // So we prepare the filter here. Note this is also done in the `FilterBuilder`
+            // so there is no overhead to do this here.
+            if filter.nulls().is_some() {
+                filter = prep_null_mask_filter(&filter);
+            }
+
+            let filter_predicate = FilterBuilder::new(&filter);
+            let filter_predicate = if sort_keys.len() > 1 {
+                // Optimize filter when it has multiple sort keys
+                filter_predicate.optimize().build()
+            } else {
+                filter_predicate.build()
+            };
+            selected_rows = Some(filter);
+            sort_keys = sort_keys
+                .iter()
+                .map(|key| filter_predicate.filter(key).map_err(|x| x.into()))
+                .collect::<Result<Vec<_>>>()?;
+        }
         // reuse existing `Rows` to avoid reallocations
         let rows = &mut self.scratch_rows;
         rows.clear();
         self.row_converter.append(rows, &sort_keys)?;
 
-        // TODO make this algorithmically better?:
-        // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`)
-        //       this avoids some work and also might be better vectorizable.
         let mut batch_entry = self.heap.register_batch(batch.clone());
-        for (index, row) in rows.iter().enumerate() {
+
+        let replacements = match selected_rows {
+            Some(filter) => {
+                self.find_new_topk_items(filter.values().set_indices(), &mut batch_entry)
+            }
+            None => self.find_new_topk_items(0..sort_keys[0].len(), &mut batch_entry),
+        };
+
+        if replacements > 0 {
+            self.metrics.row_replacements.add(replacements);
+
+            self.heap.insert_batch_entry(batch_entry);
+
+            // conserve memory
+            self.heap.maybe_compact()?;
+
+            // update memory reservation
+            self.reservation.try_resize(self.size())?;
+
+            // flag the topK as finished if we know that all
+            // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K,
+            // which means the top K won't change and the computation can be finished early.
+            self.attempt_early_completion(&batch)?;
+
+            // update the filter representation of our TopK heap
+            self.update_filter()?;
+        }
+
+        Ok(())
+    }
+
+    fn find_new_topk_items(
+        &mut self,
+        items: impl Iterator<Item = usize>,
+        batch_entry: &mut RecordBatchEntry,
+    ) -> usize {
+        let mut replacements = 0;
+        let rows = &mut self.scratch_rows;
+        for (index, row) in items.zip(rows.iter()) {
             match self.heap.max() {
                 // heap has k items, and the new row is greater than the
                 // current max in the heap ==> it is not a new topk
                 Some(max_row) if row.as_ref() >= max_row.row() => {}
                 // don't yet have k items or new item is lower than the currently k low values
                 None | Some(_) => {
-                    self.heap.add(&mut batch_entry, row, index);
-                    self.metrics.row_replacements.add(1);
+                    self.heap.add(batch_entry, row, index);
+                    replacements += 1;
                 }
             }
         }
-        self.heap.insert_batch_entry(batch_entry);
+        replacements
+    }
+
+    /// Update the filter representation of our TopK heap.
+    /// For example, given the sort expression `ORDER BY a DESC, b ASC LIMIT 3`,
+    /// and the current heap values `[(1, 5), (1, 4), (2, 3)]`,
+    /// the filter will be updated to:
+    ///
+    /// ```sql
+    /// (a > 1 OR (a = 1 AND b < 5)) AND
+    /// (a > 1 OR (a = 1 AND b < 4)) AND
+    /// (a > 2 OR (a = 2 AND b < 3))
+    /// ```
+    fn update_filter(&mut self) -> Result<()> {
+        // If the heap doesn't have k elements yet, we can't create thresholds
+        let Some(max_row) = self.heap.max() else {
+            return Ok(());
+        };
 
-        // conserve memory
-        self.heap.maybe_compact()?;
+        let new_threshold_row = &max_row.row;
 
-        // update memory reservation
-        self.reservation.try_resize(self.size())?;
+        // Fast path: check if the current value in topk is better than what is
+        // currently set in the filter with a read only lock
+        let needs_update = self
+            .filter
+            .read()
+            .threshold_row
+            .as_ref()
+            .map(|current_row| {
+                // new < current means new threshold is more selective
+                new_threshold_row < current_row
+            })
+            .unwrap_or(true); // No current threshold, so we need to set one
 
-        // flag the topK as finished if we know that all
-        // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K,
-        // which means the top K won't change and the computation can be finished early.
-        self.attempt_early_completion(&batch)?;
+        // exit early if the current values are better
+        if !needs_update {
+            return Ok(());
+        }
+
+        // Extract scalar values BEFORE acquiring lock to reduce critical section
+        let thresholds = match self.heap.get_threshold_values(&self.expr)? {
+            Some(t) => t,
+            None => return Ok(()),
+        };
+
+        // Build the filter expression OUTSIDE any synchronization
+        let predicate = Self::build_filter_expression(&self.expr, &thresholds)?;
+        let new_threshold = new_threshold_row.to_vec();
+
+        // update the threshold. Since there was a lock gap, we must check if it is still the best
+        // may have changed while we were building the expression without the lock
+        let mut filter = self.filter.write();
+        let old_threshold = filter.threshold_row.take();
+
+        // Update filter if we successfully updated the threshold
+        // (or if there was no previous threshold and we're the first)
+        match old_threshold {
+            Some(old_threshold) => {
+                // new threshold is still better than the old one
+                if new_threshold.as_slice() < old_threshold.as_slice() {
+                    filter.threshold_row = Some(new_threshold);
+                } else {
+                    // some other thread updated the threshold to a better
+                    // one while we were building so there is no need to
+                    // update the filter
+                    filter.threshold_row = Some(old_threshold);
+                    return Ok(());
+                }
+            }
+            None => {
+                // No previous threshold, so we can set the new one
+                filter.threshold_row = Some(new_threshold);
+            }
+        };
+
+        // Update the filter expression
+        if let Some(pred) = predicate
+            && !pred.eq(&lit(true))
+        {
+            filter.expr.update(pred)?;
+        }
 
         Ok(())
     }
 
+    /// Build the filter expression with the given thresholds.
+    /// This is now called outside of any locks to reduce critical section time.
+    fn build_filter_expression(
+        sort_exprs: &[PhysicalSortExpr],
+        thresholds: &[ScalarValue],
+    ) -> Result<Option<Arc<dyn PhysicalExpr>>> {
+        // Create filter expressions for each threshold
+        let mut filters: Vec<Arc<dyn PhysicalExpr>> =
+            Vec::with_capacity(thresholds.len());
+
+        let mut prev_sort_expr: Option<Arc<dyn PhysicalExpr>> = None;
+        for (sort_expr, value) in sort_exprs.iter().zip(thresholds.iter()) {
+            // Create the appropriate operator based on sort order
+            let op = if sort_expr.options.descending {
+                // For descending sort, we want col > threshold (exclude smaller values)
+                Operator::Gt
+            } else {
+                // For ascending sort, we want col < threshold (exclude larger values)
+                Operator::Lt
+            };
+
+            let value_null = value.is_null();
+
+            let comparison = Arc::new(BinaryExpr::new(
+                Arc::clone(&sort_expr.expr),
+                op,
+                lit(value.clone()),
+            ));
+
+            let comparison_with_null = match (sort_expr.options.nulls_first, value_null) {
+                // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison)
+                (true, true) => lit(false),
+                (true, false) => Arc::new(BinaryExpr::new(
+                    is_null(Arc::clone(&sort_expr.expr))?,
+                    Operator::Or,
+                    comparison,
+                )),
+                // For nulls last, transform to (threshold.value is null and threshold.expr is not null)
+                // or (threshold.value is not null and comparison)
+                (false, true) => is_not_null(Arc::clone(&sort_expr.expr))?,
+                (false, false) => comparison,
+            };
+
+            let mut eq_expr = Arc::new(BinaryExpr::new(
+                Arc::clone(&sort_expr.expr),
+                Operator::Eq,
+                lit(value.clone()),
+            ));
+
+            if value_null {
+                eq_expr = Arc::new(BinaryExpr::new(
+                    is_null(Arc::clone(&sort_expr.expr))?,
+                    Operator::Or,
+                    eq_expr,
+                ));
+            }
+
+            // For a query like order by a, b, the filter for column `b` is only applied if
+            // the condition a = threshold.value (considering null equality) is met.
+            // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field,
+            // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields.
+            match prev_sort_expr.take() {
+                None => {
+                    prev_sort_expr = Some(eq_expr);
+                    filters.push(comparison_with_null);
+                }
+                Some(p) => {
+                    filters.push(Arc::new(BinaryExpr::new(
+                        Arc::clone(&p),
+                        Operator::And,
+                        comparison_with_null,
+                    )));
+
+                    prev_sort_expr =
+                        Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr)));
+                }
+            }
+        }
+
+        let dynamic_predicate = filters
+            .into_iter()
+            .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b)));
+
+        Ok(dynamic_predicate)
+    }
+
     /// If input ordering shares a common sort prefix with the TopK, and if the TopK's heap is full,
     /// check if the computation can be finished early.
     /// This is the case if the last row of the current batch is strictly greater than the max row in the heap,
@@ -328,13 +596,17 @@ impl TopK {
             common_sort_prefix_converter: _,
             common_sort_prefix: _,
             finished: _,
+            filter,
         } = self;
         let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop
 
+        // Mark the dynamic filter as complete now that TopK processing is finished.
+        filter.read().expr().mark_complete();
+
         // break into record batches as needed
         let mut batches = vec![];
         if let Some(mut batch) = heap.emit()? {
-            metrics.baseline.output_rows().add(batch.num_rows());
+            (&batch).record_output(&metrics.baseline);
 
             loop {
                 if batch.num_rows() <= batch_size {
@@ -452,8 +724,8 @@ impl TopKHeap {
         let row = row.as_ref();
 
         // Reuse storage for evicted item if possible
-        let new_top_k = if self.inner.len() == self.k {
-            let prev_min = self.inner.pop().unwrap();
+        if self.inner.len() == self.k {
+            let mut prev_min = self.inner.peek_mut().unwrap();
 
             // Update batch use
             if prev_min.batch_id == batch_entry.id {
@@ -464,15 +736,16 @@ impl TopKHeap {
 
             // update memory accounting
             self.owned_bytes -= prev_min.owned_size();
-            prev_min.with_new_row(row, batch_id, index)
-        } else {
-            TopKRow::new(row, batch_id, index)
-        };
 
-        self.owned_bytes += new_top_k.owned_size();
+            prev_min.replace_with(row, batch_id, index);
 
-        // put the new row into the heap
-        self.inner.push(new_top_k)
+            self.owned_bytes += prev_min.owned_size();
+        } else {
+            let new_row = TopKRow::new(row, batch_id, index);
+            self.owned_bytes += new_row.owned_size();
+            // put the new row into the heap
+            self.inner.push(new_row);
+        };
     }
 
     /// Returns the values stored in this heap, from values low to
@@ -492,19 +765,18 @@ impl TopKHeap {
             return Ok((None, topk_rows));
         }
 
-        // Indices for each row within its respective RecordBatch
-        let indices: Vec<_> = topk_rows
-            .iter()
-            .enumerate()
-            .map(|(i, k)| (i, k.index))
-            .collect();
+        // Collect the batches into a vec and store the "batch_id -> array_pos" mapping, to then
+        // build the `indices` vec below. This is needed since the batch ids are not continuous.
+        let mut record_batches = Vec::new();
+        let mut batch_id_array_pos = HashMap::new();
+        for (array_pos, (batch_id, batch)) in self.store.batches.iter().enumerate() {
+            record_batches.push(&batch.batch);
+            batch_id_array_pos.insert(*batch_id, array_pos);
+        }
 
-        let record_batches: Vec<_> = topk_rows
+        let indices: Vec<_> = topk_rows
             .iter()
-            .map(|k| {
-                let entry = self.store.get(k.batch_id).expect("invalid stored batch id");
-                &entry.batch
-            })
+            .map(|k| (batch_id_array_pos[&k.batch_id], k.index))
             .collect();
 
         // At this point `indices` contains indexes within the
@@ -570,6 +842,47 @@ impl TopKHeap {
             + self.store.size()
             + self.owned_bytes
     }
+
+    fn get_threshold_values(
+        &self,
+        sort_exprs: &[PhysicalSortExpr],
+    ) -> Result<Option<Vec<ScalarValue>>> {
+        // If the heap doesn't have k elements yet, we can't create thresholds
+        let max_row = match self.max() {
+            Some(row) => row,
+            None => return Ok(None),
+        };
+
+        // Get the batch that contains the max row
+        let batch_entry = match self.store.get(max_row.batch_id) {
+            Some(entry) => entry,
+            None => return internal_err!("Invalid batch ID in TopKRow"),
+        };
+
+        // Extract threshold values for each sort expression
+        let mut scalar_values = Vec::with_capacity(sort_exprs.len());
+        for sort_expr in sort_exprs {
+            // Extract the value for this column from the max row
+            let expr = Arc::clone(&sort_expr.expr);
+            let value = expr.evaluate(&batch_entry.batch.slice(max_row.index, 1))?;
+
+            // Convert to scalar value - should be a single value since we're evaluating on a single row batch
+            let scalar = match value {
+                ColumnarValue::Scalar(scalar) => scalar,
+                ColumnarValue::Array(array) if array.len() == 1 => {
+                    // Extract the first (and only) value from the array
+                    ScalarValue::try_from_array(&array, 0)?
+                }
+                array => {
+                    return internal_err!("Expected a scalar value, got {:?}", array);
+                }
+            };
+
+            scalar_values.push(scalar);
+        }
+
+        Ok(Some(scalar_values))
+    }
 }
 
 /// Represents one of the top K rows held in this heap. Orders
@@ -599,26 +912,13 @@ impl TopKRow {
         }
     }
 
-    /// Create a new  TopKRow reusing the existing allocation
-    fn with_new_row(
-        self,
-        new_row: impl AsRef<[u8]>,
-        batch_id: u32,
-        index: usize,
-    ) -> Self {
-        let Self {
-            mut row,
-            batch_id: _,
-            index: _,
-        } = self;
-        row.clear();
-        row.extend_from_slice(new_row.as_ref());
+    // Replace the existing row capacity with new values
+    fn replace_with(&mut self, new_row: impl AsRef<[u8]>, batch_id: u32, index: usize) {
+        self.row.clear();
+        self.row.extend_from_slice(new_row.as_ref());
 
-        Self {
-            row,
-            batch_id,
-            index,
-        }
+        self.batch_id = batch_id;
+        self.index = index;
     }
 
     /// Returns the number of bytes owned by this row in the heap (not
@@ -637,6 +937,7 @@ impl Eq for TopKRow {}
 
 impl PartialOrd for TopKRow {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        // TODO PartialOrd is not consistent with PartialEq; PartialOrd contract is violated
         Some(self.cmp(other))
     }
 }
@@ -821,8 +1122,8 @@ mod tests {
         };
 
         // Input ordering uses only column "a" (a prefix of the full sort).
-        let input_ordering = LexOrdering::from(vec![sort_expr_a.clone()]);
-        let full_expr = LexOrdering::from(vec![sort_expr_a, sort_expr_b]);
+        let prefix = vec![sort_expr_a.clone()];
+        let full_expr = LexOrdering::from([sort_expr_a, sort_expr_b]);
 
         // Create a dummy runtime environment and metrics.
         let runtime = Arc::new(RuntimeEnv::default());
@@ -832,12 +1133,15 @@ mod tests {
         let mut topk = TopK::try_new(
             0,
             Arc::clone(&schema),
-            input_ordering,
+            prefix,
             full_expr,
             3,
             2,
             runtime,
             &metrics,
+            Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
+                DynamicFilterPhysicalExpr::new(vec![], lit(true)),
+            )))),
         )?;
 
         // Create the first batch with two columns:
@@ -890,4 +1194,52 @@ mod tests {
 
         Ok(())
     }
+
+    /// This test verifies that the dynamic filter is marked as complete after TopK processing finishes.
+    #[tokio::test]
+    async fn test_topk_marks_filter_complete() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+
+        let sort_expr = PhysicalSortExpr {
+            expr: col("a", schema.as_ref())?,
+            options: SortOptions::default(),
+        };
+
+        let full_expr = LexOrdering::from([sort_expr.clone()]);
+        let prefix = vec![sort_expr];
+
+        // Create a dummy runtime environment and metrics
+        let runtime = Arc::new(RuntimeEnv::default());
+        let metrics = ExecutionPlanMetricsSet::new();
+
+        // Create a dynamic filter that we'll check for completion
+        let dynamic_filter = Arc::new(DynamicFilterPhysicalExpr::new(vec![], lit(true)));
+        let dynamic_filter_clone = Arc::clone(&dynamic_filter);
+
+        // Create a TopK instance
+        let mut topk = TopK::try_new(
+            0,
+            Arc::clone(&schema),
+            prefix,
+            full_expr,
+            2,
+            10,
+            runtime,
+            &metrics,
+            Arc::new(RwLock::new(TopKDynamicFilters::new(dynamic_filter))),
+        )?;
+
+        let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(3), Some(1), Some(2)]));
+        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![array])?;
+        topk.insert_batch(batch)?;
+
+        // Call emit to finish TopK processing
+        let _results: Vec<_> = topk.emit()?.try_collect().await?;
+
+        // After emit is called, the dynamic filter should be marked as complete
+        // wait_complete() should return immediately
+        dynamic_filter_clone.wait_complete().await;
+
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/tree_node.rs b/datafusion/physical-plan/src/tree_node.rs
index 78ba984ed1a58..aa4f144f91898 100644
--- a/datafusion/physical-plan/src/tree_node.rs
+++ b/datafusion/physical-plan/src/tree_node.rs
@@ -20,10 +20,10 @@
 use std::fmt::{self, Display, Formatter};
 use std::sync::Arc;
 
-use crate::{displayable, with_new_children_if_necessary, ExecutionPlan};
+use crate::{ExecutionPlan, displayable, with_new_children_if_necessary};
 
-use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode};
 use datafusion_common::Result;
+use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode};
 
 impl DynTreeNode for dyn ExecutionPlan {
     fn arc_children(&self) -> Vec<&Arc<Self>> {
@@ -42,8 +42,8 @@ impl DynTreeNode for dyn ExecutionPlan {
 /// A node context object beneficial for writing optimizer rules.
 /// This context encapsulating an [`ExecutionPlan`] node with a payload.
 ///
-/// Since each wrapped node has it's children within both the [`PlanContext.plan.children()`],
-/// as well as separately within the [`PlanContext.children`] (which are child nodes wrapped in the context),
+/// Since each wrapped node has it's children within both the `PlanContext.plan.children()`,
+/// as well as separately within the `PlanContext.children` (which are child nodes wrapped in the context),
 /// it's important to keep these child plans in sync when performing mutations.
 ///
 /// Since there are two ways to access child plans directly -— it's recommended
@@ -69,7 +69,7 @@ impl<T> PlanContext<T> {
         }
     }
 
-    /// Update the [`PlanContext.plan.children()`] from the [`PlanContext.children`],
+    /// Update the `PlanContext.plan.children()` from the `PlanContext.children`,
     /// if the `PlanContext.children` have been changed.
     pub fn update_plan_from_children(mut self) -> Result<Self> {
         let children_plans = self.children.iter().map(|c| Arc::clone(&c.plan)).collect();
diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs
index 930fe793d1d4c..db550e7f147d1 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -27,24 +27,37 @@ use std::task::{Context, Poll};
 use std::{any::Any, sync::Arc};
 
 use super::{
-    metrics::{ExecutionPlanMetricsSet, MetricsSet},
     ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan,
     ExecutionPlanProperties, Partitioning, PlanProperties, RecordBatchStream,
     SendableRecordBatchStream, Statistics,
+    metrics::{ExecutionPlanMetricsSet, MetricsSet},
 };
+use crate::check_if_same_properties;
 use crate::execution_plan::{
-    boundedness_from_children, emission_type_from_children, InvariantLevel,
+    CardinalityEffect, InvariantLevel, boundedness_from_children,
+    check_default_invariants, emission_type_from_children,
+};
+use crate::filter::FilterExec;
+use crate::filter_pushdown::{
+    ChildPushdownResult, FilterDescription, FilterPushdownPhase,
+    FilterPushdownPropagation, PushedDown,
 };
 use crate::metrics::BaselineMetrics;
-use crate::projection::{make_with_child, ProjectionExec};
+use crate::projection::{ProjectionExec, make_with_child};
 use crate::stream::ObservedStream;
 
 use arrow::datatypes::{Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::stats::Precision;
-use datafusion_common::{exec_err, internal_err, DataFusionError, Result};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::stats::{Precision, estimate_ndv_with_overlap};
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{
+    Result, assert_or_internal_err, exec_err, internal_datafusion_err,
+};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{calculate_union, EquivalenceProperties};
+use datafusion_physical_expr::{
+    EquivalenceProperties, PhysicalExpr, calculate_union, conjunction,
+};
 
 use futures::Stream;
 use itertools::Itertools;
@@ -64,14 +77,14 @@ use tokio::macros::support::thread_rng_n;
 /// partitions, and then next `M` output partitions are from Input 2.
 ///
 /// ```text
-///                       ▲       ▲           ▲         ▲
-///                       │       │           │         │
-///     Output            │  ...  │           │         │
-///   Partitions          │0      │N-1        │ N       │N+M-1
-///(passes through   ┌────┴───────┴───────────┴─────────┴───┐
-/// the N+M input    │              UnionExec               │
-///  partitions)     │                                      │
-///                  └──────────────────────────────────────┘
+///                        ▲       ▲           ▲         ▲
+///                        │       │           │         │
+///      Output            │  ...  │           │         │
+///    Partitions          │0      │N-1        │ N       │N+M-1
+/// (passes through   ┌────┴───────┴───────────┴─────────┴───┐
+///  the N+M input    │              UnionExec               │
+///   partitions)     │                                      │
+///                   └──────────────────────────────────────┘
 ///                                      ▲
 ///                                      │
 ///                                      │
@@ -95,13 +108,15 @@ pub struct UnionExec {
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl UnionExec {
     /// Create a new UnionExec
+    #[deprecated(since = "44.0.0", note = "Use UnionExec::try_new instead")]
     pub fn new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Self {
-        let schema = union_schema(&inputs);
+        let schema =
+            union_schema(&inputs).expect("UnionExec::new called with empty inputs");
         // The schema of the inputs and the union schema is consistent when:
         // - They have the same number of fields, and
         // - Their fields have same types at the same indices.
@@ -111,7 +126,38 @@ impl UnionExec {
         UnionExec {
             inputs,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
+        }
+    }
+
+    /// Try to create a new UnionExec.
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - `inputs` is empty
+    ///
+    /// # Optimization
+    /// If there is only one input, returns that input directly rather than wrapping it in a UnionExec
+    pub fn try_new(
+        inputs: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        match inputs.len() {
+            0 => exec_err!("UnionExec requires at least one input"),
+            1 => Ok(inputs.into_iter().next().unwrap()),
+            _ => {
+                let schema = union_schema(&inputs)?;
+                // The schema of the inputs and the union schema is consistent when:
+                // - They have the same number of fields, and
+                // - Their fields have same types at the same indices.
+                // Here, we know that schemas are consistent and the call below can
+                // not return an error.
+                let cache = Self::compute_properties(&inputs, schema).unwrap();
+                Ok(Arc::new(UnionExec {
+                    inputs,
+                    metrics: ExecutionPlanMetricsSet::new(),
+                    cache: Arc::new(cache),
+                }))
+            }
         }
     }
 
@@ -145,6 +191,17 @@ impl UnionExec {
             boundedness_from_children(inputs),
         ))
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            inputs: children,
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for UnionExec {
@@ -172,20 +229,16 @@ impl ExecutionPlan for UnionExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
-    fn check_invariants(&self, _check: InvariantLevel) -> Result<()> {
-        (self.inputs().len() >= 2)
-            .then_some(())
-            .ok_or(DataFusionError::Internal(
-                "UnionExec should have at least 2 children".into(),
-            ))
-    }
+    fn check_invariants(&self, check: InvariantLevel) -> Result<()> {
+        check_default_invariants(self, check)?;
 
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        self.inputs.iter().collect()
+        (self.inputs().len() >= 2).then_some(()).ok_or_else(|| {
+            internal_datafusion_err!("UnionExec should have at least 2 children")
+        })
     }
 
     fn maintains_input_order(&self) -> Vec<bool> {
@@ -213,11 +266,27 @@ impl ExecutionPlan for UnionExec {
         }
     }
 
+    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
+        vec![false; self.children().len()]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        self.inputs.iter().collect()
+    }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        Ok(Arc::new(UnionExec::new(children)))
+        check_if_same_properties!(self, children);
+        UnionExec::try_new(children)
     }
 
     fn execute(
@@ -225,7 +294,12 @@ impl ExecutionPlan for UnionExec {
         mut partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start UnionExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start UnionExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
         let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
         // record the tiny amount of work done in this function so
         // elapsed_compute is reported as non zero
@@ -257,11 +331,7 @@ impl ExecutionPlan for UnionExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         if let Some(partition_idx) = partition {
             // For a specific partition, find which input it belongs to
             let mut remaining_idx = partition_idx;
@@ -274,24 +344,32 @@ impl ExecutionPlan for UnionExec {
                 remaining_idx -= input_partition_count;
             }
             // If we get here, the partition index is out of bounds
-            Ok(Statistics::new_unknown(&self.schema()))
+            Ok(Arc::new(Statistics::new_unknown(&self.schema())))
         } else {
             // Collect statistics from all inputs
             let stats = self
                 .inputs
                 .iter()
-                .map(|input_exec| input_exec.partition_statistics(None))
+                .map(|input_exec| {
+                    input_exec
+                        .partition_statistics(None)
+                        .map(Arc::unwrap_or_clone)
+                })
                 .collect::<Result<Vec<_>>>()?;
 
-            Ok(stats
-                .into_iter()
-                .reduce(stats_union)
-                .unwrap_or_else(|| Statistics::new_unknown(&self.schema())))
+            Ok(Arc::new(
+                stats
+                    .into_iter()
+                    .reduce(stats_union)
+                    .unwrap_or_else(|| Statistics::new_unknown(&self.schema())),
+            ))
         }
     }
 
-    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
-        vec![false; self.children().len()]
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        // Union combines rows from multiple inputs, so output rows are not tied
+        // to any single input and can only be constrained as greater-or-equal.
+        CardinalityEffect::GreaterEqual
     }
 
     fn supports_limit_pushdown(&self) -> bool {
@@ -316,7 +394,93 @@ impl ExecutionPlan for UnionExec {
             .map(|child| make_with_child(projection, child))
             .collect::<Result<Vec<_>>>()?;
 
-        Ok(Some(Arc::new(UnionExec::new(new_children))))
+        Ok(Some(UnionExec::try_new(new_children.clone())?))
+    }
+
+    fn gather_filters_for_pushdown(
+        &self,
+        _phase: FilterPushdownPhase,
+        parent_filters: Vec<Arc<dyn PhysicalExpr>>,
+        _config: &ConfigOptions,
+    ) -> Result<FilterDescription> {
+        FilterDescription::from_children(parent_filters, &self.children())
+    }
+
+    fn handle_child_pushdown_result(
+        &self,
+        phase: FilterPushdownPhase,
+        child_pushdown_result: ChildPushdownResult,
+        _config: &ConfigOptions,
+    ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> {
+        // Pre phase: handle heterogeneous pushdown by wrapping individual
+        // children with FilterExec and reporting all filters as handled.
+        // Post phase: use default behavior to let the filter creator decide how to handle
+        // filters that weren't fully pushed down.
+        if phase != FilterPushdownPhase::Pre {
+            return Ok(FilterPushdownPropagation::if_all(child_pushdown_result));
+        }
+
+        // UnionExec needs specialized filter pushdown handling when children have
+        // heterogeneous pushdown support. Without this, when some children support
+        // pushdown and others don't, the default behavior would leave FilterExec
+        // above UnionExec, re-applying filters to outputs of all children—including
+        // those that already applied the filters via pushdown. This specialized
+        // implementation adds FilterExec only to children that don't support
+        // pushdown, avoiding redundant filtering and improving performance.
+        //
+        // Example: Given Child1 (no pushdown support) and Child2 (has pushdown support)
+        //   Default behavior:          This implementation:
+        //   FilterExec                 UnionExec
+        //     UnionExec                  FilterExec
+        //       Child1                     Child1
+        //       Child2(filter)           Child2(filter)
+
+        // Collect unsupported filters for each child
+        let mut unsupported_filters_per_child = vec![Vec::new(); self.inputs.len()];
+        for parent_filter_result in child_pushdown_result.parent_filters.iter() {
+            for (child_idx, &child_result) in
+                parent_filter_result.child_results.iter().enumerate()
+            {
+                if matches!(child_result, PushedDown::No) {
+                    unsupported_filters_per_child[child_idx]
+                        .push(Arc::clone(&parent_filter_result.filter));
+                }
+            }
+        }
+
+        // Wrap children that have unsupported filters with FilterExec
+        let mut new_children = self.inputs.clone();
+        for (child_idx, unsupported_filters) in
+            unsupported_filters_per_child.iter().enumerate()
+        {
+            if !unsupported_filters.is_empty() {
+                let combined_filter = conjunction(unsupported_filters.clone());
+                new_children[child_idx] = Arc::new(FilterExec::try_new(
+                    combined_filter,
+                    Arc::clone(&self.inputs[child_idx]),
+                )?);
+            }
+        }
+
+        // Check if any children were modified
+        let children_modified = new_children
+            .iter()
+            .zip(self.inputs.iter())
+            .any(|(new, old)| !Arc::ptr_eq(new, old));
+
+        let all_filters_pushed =
+            vec![PushedDown::Yes; child_pushdown_result.parent_filters.len()];
+        let propagation = if children_modified {
+            let updated_node = UnionExec::try_new(new_children)?;
+            FilterPushdownPropagation::with_parent_pushdown_result(all_filters_pushed)
+                .with_updated_node(updated_node)
+        } else {
+            FilterPushdownPropagation::with_parent_pushdown_result(all_filters_pushed)
+        };
+
+        // Report all parent filters as supported since we've ensured they're applied
+        // on all children (either pushed down or via FilterExec)
+        Ok(propagation)
     }
 }
 
@@ -359,22 +523,21 @@ pub struct InterleaveExec {
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl InterleaveExec {
     /// Create a new InterleaveExec
     pub fn try_new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Result<Self> {
-        if !can_interleave(inputs.iter()) {
-            return internal_err!(
-                "Not all InterleaveExec children have a consistent hash partitioning"
-            );
-        }
-        let cache = Self::compute_properties(&inputs);
+        assert_or_internal_err!(
+            can_interleave(inputs.iter()),
+            "Not all InterleaveExec children have a consistent hash partitioning"
+        );
+        let cache = Self::compute_properties(&inputs)?;
         Ok(InterleaveExec {
             inputs,
             metrics: ExecutionPlanMetricsSet::new(),
-            cache,
+            cache: Arc::new(cache),
         })
     }
 
@@ -384,17 +547,28 @@ impl InterleaveExec {
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
-    fn compute_properties(inputs: &[Arc<dyn ExecutionPlan>]) -> PlanProperties {
-        let schema = union_schema(inputs);
+    fn compute_properties(inputs: &[Arc<dyn ExecutionPlan>]) -> Result<PlanProperties> {
+        let schema = union_schema(inputs)?;
         let eq_properties = EquivalenceProperties::new(schema);
         // Get output partitioning:
         let output_partitioning = inputs[0].output_partitioning().clone();
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             emission_type_from_children(inputs),
             boundedness_from_children(inputs),
-        )
+        ))
+    }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            inputs: children,
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
     }
 }
 
@@ -423,7 +597,7 @@ impl ExecutionPlan for InterleaveExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -435,16 +609,23 @@ impl ExecutionPlan for InterleaveExec {
         vec![false; self.inputs().len()]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         // New children are no longer interleavable, which might be a bug of optimization rewrite.
-        if !can_interleave(children.iter()) {
-            return internal_err!(
-                "Can not create InterleaveExec: new children can not be interleaved"
-            );
-        }
+        assert_or_internal_err!(
+            can_interleave(children.iter()),
+            "Can not create InterleaveExec: new children can not be interleaved"
+        );
+        check_if_same_properties!(self, children);
         Ok(Arc::new(InterleaveExec::try_new(children)?))
     }
 
@@ -453,7 +634,12 @@ impl ExecutionPlan for InterleaveExec {
         partition: usize,
         context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
-        trace!("Start InterleaveExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
+        trace!(
+            "Start InterleaveExec::execute for partition {} of context session_id {} and task_id {:?}",
+            partition,
+            context.session_id(),
+            context.task_id()
+        );
         let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
         // record the tiny amount of work done in this function so
         // elapsed_compute is reported as non zero
@@ -490,24 +676,22 @@ impl ExecutionPlan for InterleaveExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_some() {
-            return Ok(Statistics::new_unknown(&self.schema()));
-        }
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
         let stats = self
             .inputs
             .iter()
-            .map(|stat| stat.partition_statistics(None))
+            .map(|stat| {
+                stat.partition_statistics(partition)
+                    .map(Arc::unwrap_or_clone)
+            })
             .collect::<Result<Vec<_>>>()?;
 
-        Ok(stats
-            .into_iter()
-            .reduce(stats_union)
-            .unwrap_or_else(|| Statistics::new_unknown(&self.schema())))
+        Ok(Arc::new(
+            stats
+                .into_iter()
+                .reduce(stats_union)
+                .unwrap_or_else(|| Statistics::new_unknown(&self.schema())),
+        ))
     }
 
     fn benefits_from_input_partitioning(&self) -> Vec<bool> {
@@ -535,17 +719,34 @@ pub fn can_interleave<T: Borrow<Arc<dyn ExecutionPlan>>>(
             .all(|partition| partition == *reference)
 }
 
-fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
+fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> Result<SchemaRef> {
+    if inputs.is_empty() {
+        return exec_err!("Cannot create union schema from empty inputs");
+    }
+
     let first_schema = inputs[0].schema();
+    let first_field_count = first_schema.fields().len();
+
+    // validate that all inputs have the same number of fields
+    for (idx, input) in inputs.iter().enumerate().skip(1) {
+        let field_count = input.schema().fields().len();
+        if field_count != first_field_count {
+            return exec_err!(
+                "UnionExec/InterleaveExec requires all inputs to have the same number of fields. \
+                 Input 0 has {first_field_count} fields, but input {idx} has {field_count} fields"
+            );
+        }
+    }
 
-    let fields = (0..first_schema.fields().len())
+    let fields = (0..first_field_count)
         .map(|i| {
             // We take the name from the left side of the union to match how names are coerced during logical planning,
             // which also uses the left side names.
             let base_field = first_schema.field(i).clone();
 
             // Coerce metadata and nullability across all inputs
-            let merged_field = inputs
+
+            inputs
                 .iter()
                 .enumerate()
                 .map(|(input_idx, input)| {
@@ -567,9 +768,7 @@ fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
                 // We can unwrap this because if inputs was empty, this would've already panic'ed when we
                 // indexed into inputs[0].
                 .unwrap()
-                .with_name(base_field.name());
-
-            merged_field
+                .with_name(base_field.name())
         })
         .collect::<Vec<_>>();
 
@@ -578,7 +777,10 @@ fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
         .flat_map(|i| i.schema().metadata().clone().into_iter())
         .collect();
 
-    Arc::new(Schema::new_with_metadata(fields, all_metadata_merged))
+    Ok(Arc::new(Schema::new_with_metadata(
+        fields,
+        all_metadata_merged,
+    )))
 }
 
 /// CombinedRecordBatchStream can be used to combine a Vec of SendableRecordBatchStreams into one
@@ -650,9 +852,9 @@ impl Stream for CombinedRecordBatchStream {
 
 fn col_stats_union(
     mut left: ColumnStatistics,
-    right: ColumnStatistics,
+    right: &ColumnStatistics,
 ) -> ColumnStatistics {
-    left.distinct_count = Precision::Absent;
+    left.distinct_count = union_distinct_count(&left, right);
     left.min_value = left.min_value.min(&right.min_value);
     left.max_value = left.max_value.max(&right.max_value);
     left.sum_value = left.sum_value.add(&right.sum_value);
@@ -661,13 +863,42 @@ fn col_stats_union(
     left
 }
 
+fn union_distinct_count(
+    left: &ColumnStatistics,
+    right: &ColumnStatistics,
+) -> Precision<usize> {
+    let (ndv_left, ndv_right) = match (
+        left.distinct_count.get_value(),
+        right.distinct_count.get_value(),
+    ) {
+        (Some(&l), Some(&r)) => (l, r),
+        _ => return Precision::Absent,
+    };
+
+    // Even with exact inputs, the union NDV depends on how
+    // many distinct values are shared between the left and right.
+    // We can only estimate this via range overlap. Thus both paths
+    // below return `Inexact`.
+    if let Some(ndv) = estimate_ndv_with_overlap(left, right, ndv_left, ndv_right) {
+        return Precision::Inexact(ndv);
+    }
+
+    Precision::Inexact(ndv_left + ndv_right)
+}
+
 fn stats_union(mut left: Statistics, right: Statistics) -> Statistics {
-    left.num_rows = left.num_rows.add(&right.num_rows);
-    left.total_byte_size = left.total_byte_size.add(&right.total_byte_size);
+    let Statistics {
+        num_rows: right_num_rows,
+        total_byte_size: right_total_bytes,
+        column_statistics: right_column_statistics,
+        ..
+    } = right;
+    left.num_rows = left.num_rows.add(&right_num_rows);
+    left.total_byte_size = left.total_byte_size.add(&right_total_bytes);
     left.column_statistics = left
         .column_statistics
         .into_iter()
-        .zip(right.column_statistics)
+        .zip(right_column_statistics.iter())
         .map(|(a, b)| col_stats_union(a, b))
         .collect::<Vec<_>>();
     left
@@ -677,15 +908,14 @@ fn stats_union(mut left: Statistics, right: Statistics) -> Statistics {
 mod tests {
     use super::*;
     use crate::collect;
-    use crate::test;
-    use crate::test::TestMemoryExec;
+    use crate::test::{self, TestMemoryExec};
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::DataType;
     use datafusion_common::ScalarValue;
+    use datafusion_common::stats::Precision;
+    use datafusion_physical_expr::equivalence::convert_to_orderings;
     use datafusion_physical_expr::expressions::col;
-    use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
-    use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
     // Generate a schema which consists of 7 columns (a, b, c, d, e, f, g)
     fn create_test_schema() -> Result<SchemaRef> {
@@ -701,17 +931,16 @@ mod tests {
         Ok(schema)
     }
 
-    // Convert each tuple to PhysicalSortExpr
-    fn convert_to_sort_exprs(
-        in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
-    ) -> LexOrdering {
-        in_data
-            .iter()
-            .map(|(expr, options)| PhysicalSortExpr {
-                expr: Arc::clone(*expr),
-                options: *options,
-            })
-            .collect::<LexOrdering>()
+    fn create_test_schema2() -> Result<SchemaRef> {
+        let a = Field::new("a", DataType::Int32, true);
+        let b = Field::new("b", DataType::Int32, true);
+        let c = Field::new("c", DataType::Int32, true);
+        let d = Field::new("d", DataType::Int32, true);
+        let e = Field::new("e", DataType::Int32, true);
+        let f = Field::new("f", DataType::Int32, true);
+        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f]));
+
+        Ok(schema)
     }
 
     #[tokio::test]
@@ -722,7 +951,7 @@ mod tests {
         let csv = test::scan_partitioned(4);
         let csv2 = test::scan_partitioned(5);
 
-        let union_exec = Arc::new(UnionExec::new(vec![csv, csv2]));
+        let union_exec: Arc<dyn ExecutionPlan> = UnionExec::try_new(vec![csv, csv2])?;
 
         // Should have 9 partitions and 9 output batches
         assert_eq!(
@@ -751,6 +980,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
@@ -758,6 +988,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -765,6 +996,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
                     sum_value: Precision::Exact(ScalarValue::Float32(Some(42.0))),
                     null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -779,6 +1011,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(1),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -786,6 +1019,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("b")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -793,6 +1027,7 @@ mod tests {
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
                     null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -803,11 +1038,12 @@ mod tests {
             total_byte_size: Precision::Exact(52),
             column_statistics: vec![
                 ColumnStatistics {
-                    distinct_count: Precision::Absent,
+                    distinct_count: Precision::Inexact(6),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                     sum_value: Precision::Exact(ScalarValue::Int64(Some(84))),
                     null_count: Precision::Exact(1),
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -815,6 +1051,7 @@ mod tests {
                     min_value: Precision::Exact(ScalarValue::from("a")),
                     sum_value: Precision::Absent,
                     null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
@@ -822,6 +1059,7 @@ mod tests {
                     min_value: Precision::Absent,
                     sum_value: Precision::Absent,
                     null_count: Precision::Absent,
+                    byte_size: Precision::Absent,
                 },
             ],
         };
@@ -829,6 +1067,197 @@ mod tests {
         assert_eq!(result, expected);
     }
 
+    #[test]
+    fn test_union_distinct_count() {
+        // (left_ndv, left_min, left_max, right_ndv, right_min, right_max, expected)
+        type NdvTestCase = (
+            Precision<usize>,
+            Option<i64>,
+            Option<i64>,
+            Precision<usize>,
+            Option<i64>,
+            Option<i64>,
+            Precision<usize>,
+        );
+        let cases: Vec<NdvTestCase> = vec![
+            // disjoint ranges: NDV = 5 + 3
+            (
+                Precision::Exact(5),
+                Some(0),
+                Some(10),
+                Precision::Exact(3),
+                Some(20),
+                Some(30),
+                Precision::Inexact(8),
+            ),
+            // identical ranges: intersection = max(10, 8) = 10
+            (
+                Precision::Exact(10),
+                Some(0),
+                Some(100),
+                Precision::Exact(8),
+                Some(0),
+                Some(100),
+                Precision::Inexact(10),
+            ),
+            // partial overlap: 50 + 50 + 25 = 125
+            (
+                Precision::Exact(100),
+                Some(0),
+                Some(100),
+                Precision::Exact(50),
+                Some(50),
+                Some(150),
+                Precision::Inexact(125),
+            ),
+            // right contained in left: 50 + 50 + 0 = 100
+            (
+                Precision::Exact(100),
+                Some(0),
+                Some(100),
+                Precision::Exact(50),
+                Some(25),
+                Some(75),
+                Precision::Inexact(100),
+            ),
+            // both constant, same value
+            (
+                Precision::Exact(1),
+                Some(5),
+                Some(5),
+                Precision::Exact(1),
+                Some(5),
+                Some(5),
+                Precision::Inexact(1),
+            ),
+            // both constant, different values
+            (
+                Precision::Exact(1),
+                Some(5),
+                Some(5),
+                Precision::Exact(1),
+                Some(10),
+                Some(10),
+                Precision::Inexact(2),
+            ),
+            // left constant within right range
+            (
+                Precision::Exact(1),
+                Some(5),
+                Some(5),
+                Precision::Exact(10),
+                Some(0),
+                Some(10),
+                Precision::Inexact(10),
+            ),
+            // left constant outside right range
+            (
+                Precision::Exact(1),
+                Some(20),
+                Some(20),
+                Precision::Exact(10),
+                Some(0),
+                Some(10),
+                Precision::Inexact(11),
+            ),
+            // right constant within left range
+            (
+                Precision::Exact(10),
+                Some(0),
+                Some(10),
+                Precision::Exact(1),
+                Some(5),
+                Some(5),
+                Precision::Inexact(10),
+            ),
+            // right constant outside left range
+            (
+                Precision::Exact(10),
+                Some(0),
+                Some(10),
+                Precision::Exact(1),
+                Some(20),
+                Some(20),
+                Precision::Inexact(11),
+            ),
+            // missing min/max falls back to sum (exact + exact)
+            (
+                Precision::Exact(10),
+                None,
+                None,
+                Precision::Exact(5),
+                None,
+                None,
+                Precision::Inexact(15),
+            ),
+            // missing min/max falls back to sum (exact + inexact)
+            (
+                Precision::Exact(10),
+                None,
+                None,
+                Precision::Inexact(5),
+                None,
+                None,
+                Precision::Inexact(15),
+            ),
+            // missing min/max falls back to sum (inexact + inexact)
+            (
+                Precision::Inexact(7),
+                None,
+                None,
+                Precision::Inexact(3),
+                None,
+                None,
+                Precision::Inexact(10),
+            ),
+            // one side absent
+            (
+                Precision::Exact(10),
+                None,
+                None,
+                Precision::Absent,
+                None,
+                None,
+                Precision::Absent,
+            ),
+            // one side absent (inexact + absent)
+            (
+                Precision::Inexact(4),
+                None,
+                None,
+                Precision::Absent,
+                None,
+                None,
+                Precision::Absent,
+            ),
+        ];
+
+        for (
+            i,
+            (left_ndv, left_min, left_max, right_ndv, right_min, right_max, expected),
+        ) in cases.into_iter().enumerate()
+        {
+            let to_sv = |v| Precision::Exact(ScalarValue::Int64(Some(v)));
+            let left = ColumnStatistics {
+                distinct_count: left_ndv,
+                min_value: left_min.map(to_sv).unwrap_or(Precision::Absent),
+                max_value: left_max.map(to_sv).unwrap_or(Precision::Absent),
+                ..Default::default()
+            };
+            let right = ColumnStatistics {
+                distinct_count: right_ndv,
+                min_value: right_min.map(to_sv).unwrap_or(Precision::Absent),
+                max_value: right_max.map(to_sv).unwrap_or(Precision::Absent),
+                ..Default::default()
+            };
+            assert_eq!(
+                union_distinct_count(&left, &right),
+                expected,
+                "case {i} failed"
+            );
+        }
+    }
+
     #[tokio::test]
     async fn test_union_equivalence_properties() -> Result<()> {
         let schema = create_test_schema()?;
@@ -889,31 +1318,22 @@ mod tests {
             (first_child_orderings, second_child_orderings, union_orderings),
         ) in test_cases.iter().enumerate()
         {
-            let first_orderings = first_child_orderings
-                .iter()
-                .map(|ordering| convert_to_sort_exprs(ordering))
-                .collect::<Vec<_>>();
-            let second_orderings = second_child_orderings
-                .iter()
-                .map(|ordering| convert_to_sort_exprs(ordering))
-                .collect::<Vec<_>>();
-            let union_expected_orderings = union_orderings
-                .iter()
-                .map(|ordering| convert_to_sort_exprs(ordering))
-                .collect::<Vec<_>>();
-            let child1 = Arc::new(TestMemoryExec::update_cache(Arc::new(
-                TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?
-                    .try_with_sort_information(first_orderings)?,
-            )));
-            let child2 = Arc::new(TestMemoryExec::update_cache(Arc::new(
-                TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?
-                    .try_with_sort_information(second_orderings)?,
-            )));
+            let first_orderings = convert_to_orderings(first_child_orderings);
+            let second_orderings = convert_to_orderings(second_child_orderings);
+            let union_expected_orderings = convert_to_orderings(union_orderings);
+            let child1_exec = TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?
+                .try_with_sort_information(first_orderings)?;
+            let child1 = Arc::new(child1_exec);
+            let child1 = Arc::new(TestMemoryExec::update_cache(&child1));
+            let child2_exec = TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?
+                .try_with_sort_information(second_orderings)?;
+            let child2 = Arc::new(child2_exec);
+            let child2 = Arc::new(TestMemoryExec::update_cache(&child2));
 
             let mut union_expected_eq = EquivalenceProperties::new(Arc::clone(&schema));
-            union_expected_eq.add_new_orderings(union_expected_orderings);
+            union_expected_eq.add_orderings(union_expected_orderings);
 
-            let union = UnionExec::new(vec![child1, child2]);
+            let union: Arc<dyn ExecutionPlan> = UnionExec::try_new(vec![child1, child2])?;
             let union_eq_properties = union.properties().equivalence_properties();
             let err_msg = format!(
                 "Error in test id: {:?}, test case: {:?}",
@@ -937,4 +1357,110 @@ mod tests {
             assert!(lhs_orderings.contains(rhs_ordering), "{}", err_msg);
         }
     }
+
+    #[test]
+    fn test_union_empty_inputs() {
+        // Test that UnionExec::try_new fails with empty inputs
+        let result = UnionExec::try_new(vec![]);
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("UnionExec requires at least one input")
+        );
+    }
+
+    #[test]
+    fn test_union_schema_empty_inputs() {
+        // Test that union_schema fails with empty inputs
+        let result = union_schema(&[]);
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Cannot create union schema from empty inputs")
+        );
+    }
+
+    #[test]
+    fn test_union_single_input() -> Result<()> {
+        // Test that UnionExec::try_new returns the single input directly
+        let schema = create_test_schema()?;
+        let memory_exec: Arc<dyn ExecutionPlan> =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+        let memory_exec_clone = Arc::clone(&memory_exec);
+        let result = UnionExec::try_new(vec![memory_exec])?;
+
+        // Check that the result is the same as the input (no UnionExec wrapper)
+        assert_eq!(result.schema(), schema);
+        // Verify it's the same execution plan
+        assert!(Arc::ptr_eq(&result, &memory_exec_clone));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_union_schema_multiple_inputs() -> Result<()> {
+        // Test that existing functionality with multiple inputs still works
+        let schema = create_test_schema()?;
+        let memory_exec1 =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+        let memory_exec2 =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+
+        let union_plan = UnionExec::try_new(vec![memory_exec1, memory_exec2])?;
+
+        // Downcast to verify it's a UnionExec
+        let union = union_plan
+            .as_any()
+            .downcast_ref::<UnionExec>()
+            .expect("Expected UnionExec");
+
+        // Check that schema is correct
+        assert_eq!(union.schema(), schema);
+        // Check that we have 2 inputs
+        assert_eq!(union.inputs().len(), 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_union_schema_mismatch() {
+        // Test that UnionExec properly rejects inputs with different field counts
+        let schema = create_test_schema().unwrap();
+        let schema2 = create_test_schema2().unwrap();
+        let memory_exec1 =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None).unwrap());
+        let memory_exec2 =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema2), None).unwrap());
+
+        let result = UnionExec::try_new(vec![memory_exec1, memory_exec2]);
+        assert!(result.is_err());
+        assert!(
+            result.unwrap_err().to_string().contains(
+                "UnionExec/InterleaveExec requires all inputs to have the same number of fields"
+            )
+        );
+    }
+
+    #[test]
+    fn test_union_cardinality_effect() -> Result<()> {
+        let schema = create_test_schema()?;
+        let input1: Arc<dyn ExecutionPlan> =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+        let input2: Arc<dyn ExecutionPlan> =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+
+        let union = UnionExec::try_new(vec![input1, input2])?;
+        let union = union
+            .as_any()
+            .downcast_ref::<UnionExec>()
+            .expect("expected UnionExec for multiple inputs");
+
+        assert!(matches!(
+            union.cardinality_effect(),
+            CardinalityEffect::GreaterEqual
+        ));
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs
index c06b09f2fecd5..85799250181b6 100644
--- a/datafusion/physical-plan/src/unnest.rs
+++ b/datafusion/physical-plan/src/unnest.rs
@@ -18,19 +18,23 @@
 //! Define a plan for unnesting values in columns that contain a list type.
 
 use std::cmp::{self, Ordering};
-use std::task::{ready, Poll};
+use std::task::{Poll, ready};
 use std::{any::Any, sync::Arc};
 
-use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
+use super::metrics::{
+    self, BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
+    RecordOutput,
+};
 use super::{DisplayAs, ExecutionPlanProperties, PlanProperties};
 use crate::{
     DisplayFormatType, Distribution, ExecutionPlan, RecordBatchStream,
-    SendableRecordBatchStream,
+    SendableRecordBatchStream, check_if_same_properties,
 };
 
 use arrow::array::{
-    new_null_array, Array, ArrayRef, AsArray, FixedSizeListArray, Int64Array,
-    LargeListArray, ListArray, PrimitiveArray, Scalar, StructArray,
+    Array, ArrayRef, AsArray, BooleanBufferBuilder, FixedSizeListArray, Int64Array,
+    LargeListArray, LargeListViewArray, ListArray, ListViewArray, PrimitiveArray, Scalar,
+    StructArray, new_null_array,
 };
 use arrow::compute::kernels::length::length;
 use arrow::compute::kernels::zip::zip;
@@ -38,19 +42,22 @@ use arrow::compute::{cast, is_not_null, kernels, sum};
 use arrow::datatypes::{DataType, Int64Type, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use arrow_ord::cmp::lt;
+use async_trait::async_trait;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::{
-    exec_datafusion_err, exec_err, internal_err, HashMap, HashSet, Result, UnnestOptions,
+    Constraints, HashMap, HashSet, Result, UnnestOptions, exec_datafusion_err, exec_err,
+    internal_err,
 };
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
-
-use async_trait::async_trait;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_expr::equivalence::ProjectionMapping;
+use datafusion_physical_expr::expressions::Column;
 use futures::{Stream, StreamExt};
 use log::trace;
 
 /// Unnest the given columns (either with type struct or list)
-/// For list unnesting, each rows is vertically transformed into multiple rows
-/// For struct unnesting, each columns is horizontally transformed into multiple columns,
+/// For list unnesting, each row is vertically transformed into multiple rows
+/// For struct unnesting, each column is horizontally transformed into multiple columns,
 /// Thus the original RecordBatch with dimension (n x m) may have new dimension (n' x m')
 ///
 /// See [`UnnestOptions`] for more details and an example.
@@ -69,7 +76,7 @@ pub struct UnnestExec {
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl UnnestExec {
@@ -80,31 +87,94 @@ impl UnnestExec {
         struct_column_indices: Vec<usize>,
         schema: SchemaRef,
         options: UnnestOptions,
-    ) -> Self {
-        let cache = Self::compute_properties(&input, Arc::clone(&schema));
+    ) -> Result<Self> {
+        let cache = Self::compute_properties(
+            &input,
+            &list_column_indices,
+            &struct_column_indices,
+            &schema,
+        )?;
 
-        UnnestExec {
+        Ok(UnnestExec {
             input,
             schema,
             list_column_indices,
             struct_column_indices,
             options,
             metrics: Default::default(),
-            cache,
-        }
+            cache: Arc::new(cache),
+        })
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
         input: &Arc<dyn ExecutionPlan>,
-        schema: SchemaRef,
-    ) -> PlanProperties {
-        PlanProperties::new(
-            EquivalenceProperties::new(schema),
-            input.output_partitioning().to_owned(),
+        list_column_indices: &[ListUnnest],
+        struct_column_indices: &[usize],
+        schema: &SchemaRef,
+    ) -> Result<PlanProperties> {
+        // Find out which indices are not unnested, such that they can be copied over from the input plan
+        let input_schema = input.schema();
+        let mut unnested_indices = BooleanBufferBuilder::new(input_schema.fields().len());
+        unnested_indices.append_n(input_schema.fields().len(), false);
+        for list_unnest in list_column_indices {
+            unnested_indices.set_bit(list_unnest.index_in_input_schema, true);
+        }
+        for struct_unnest in struct_column_indices {
+            unnested_indices.set_bit(*struct_unnest, true)
+        }
+        let unnested_indices = unnested_indices.finish();
+        let non_unnested_indices: Vec<usize> = (0..input_schema.fields().len())
+            .filter(|idx| !unnested_indices.value(*idx))
+            .collect();
+
+        // Manually build projection mapping from non-unnested input columns to their positions in the output
+        let input_schema = input.schema();
+        let projection_mapping: ProjectionMapping = non_unnested_indices
+            .iter()
+            .map(|&input_idx| {
+                // Find what index the input column has in the output schema
+                let input_field = input_schema.field(input_idx);
+                let output_idx = schema
+                    .fields()
+                    .iter()
+                    .position(|output_field| output_field.name() == input_field.name())
+                    .ok_or_else(|| {
+                        exec_datafusion_err!(
+                            "Non-unnested column '{}' must exist in output schema",
+                            input_field.name()
+                        )
+                    })?;
+
+                let input_col = Arc::new(Column::new(input_field.name(), input_idx))
+                    as Arc<dyn PhysicalExpr>;
+                let target_col = Arc::new(Column::new(input_field.name(), output_idx))
+                    as Arc<dyn PhysicalExpr>;
+                // Use From<Vec<(Arc<dyn PhysicalExpr>, usize)>> for ProjectionTargets
+                let targets = vec![(target_col, output_idx)].into();
+                Ok((input_col, targets))
+            })
+            .collect::<Result<ProjectionMapping>>()?;
+
+        // Create the unnest's equivalence properties by copying the input plan's equivalence properties
+        // for the unaffected columns. Except for the constraints, which are removed entirely because
+        // the unnest operation invalidates any global uniqueness or primary-key constraints.
+        let input_eq_properties = input.equivalence_properties();
+        let eq_properties = input_eq_properties
+            .project(&projection_mapping, Arc::clone(schema))
+            .with_constraints(Constraints::default());
+
+        // Output partitioning must use the projection mapping
+        let output_partitioning = input
+            .output_partitioning()
+            .project(&projection_mapping, &eq_properties);
+
+        Ok(PlanProperties::new(
+            eq_properties,
+            output_partitioning,
             input.pipeline_behavior(),
             input.boundedness(),
-        )
+        ))
     }
 
     /// Input execution plan
@@ -125,6 +195,17 @@ impl UnnestExec {
     pub fn options(&self) -> &UnnestOptions {
         &self.options
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for UnnestExec {
@@ -153,7 +234,7 @@ impl ExecutionPlan for UnnestExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -161,17 +242,25 @@ impl ExecutionPlan for UnnestExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
+
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(UnnestExec::new(
-            Arc::clone(&children[0]),
+            children.swap_remove(0),
             self.list_column_indices.clone(),
             self.struct_column_indices.clone(),
             Arc::clone(&self.schema),
             self.options.clone(),
-        )))
+        )?))
     }
 
     fn required_input_distribution(&self) -> Vec<Distribution> {
@@ -203,38 +292,25 @@ impl ExecutionPlan for UnnestExec {
 
 #[derive(Clone, Debug)]
 struct UnnestMetrics {
-    /// Total time for column unnesting
-    elapsed_compute: metrics::Time,
+    /// Execution metrics
+    baseline_metrics: BaselineMetrics,
     /// Number of batches consumed
     input_batches: metrics::Count,
     /// Number of rows consumed
     input_rows: metrics::Count,
-    /// Number of batches produced
-    output_batches: metrics::Count,
-    /// Number of rows produced by this operator
-    output_rows: metrics::Count,
 }
 
 impl UnnestMetrics {
     fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
-        let elapsed_compute = MetricBuilder::new(metrics).elapsed_compute(partition);
-
         let input_batches =
             MetricBuilder::new(metrics).counter("input_batches", partition);
 
         let input_rows = MetricBuilder::new(metrics).counter("input_rows", partition);
 
-        let output_batches =
-            MetricBuilder::new(metrics).counter("output_batches", partition);
-
-        let output_rows = MetricBuilder::new(metrics).output_rows(partition);
-
         Self {
+            baseline_metrics: BaselineMetrics::new(metrics, partition),
             input_batches,
             input_rows,
-            output_batches,
-            output_rows,
-            elapsed_compute,
         }
     }
 }
@@ -284,7 +360,9 @@ impl UnnestStream {
         loop {
             return Poll::Ready(match ready!(self.input.poll_next_unpin(cx)) {
                 Some(Ok(batch)) => {
-                    let timer = self.metrics.elapsed_compute.timer();
+                    let elapsed_compute =
+                        self.metrics.baseline_metrics.elapsed_compute().clone();
+                    let timer = elapsed_compute.timer();
                     self.metrics.input_batches.add(1);
                     self.metrics.input_rows.add(batch.num_rows());
                     let result = build_batch(
@@ -298,8 +376,7 @@ impl UnnestStream {
                     let Some(result_batch) = result else {
                         continue;
                     };
-                    self.metrics.output_batches.add(1);
-                    self.metrics.output_rows.add(result_batch.num_rows());
+                    (&result_batch).record_output(&self.metrics.baseline_metrics);
 
                     // Empty record batches should not be emitted.
                     // They need to be treated as  [`Option<RecordBatch>`]es and handled separately
@@ -312,9 +389,9 @@ impl UnnestStream {
                         produced {} output batches containing {} rows in {}",
                         self.metrics.input_batches,
                         self.metrics.input_rows,
-                        self.metrics.output_batches,
-                        self.metrics.output_rows,
-                        self.metrics.elapsed_compute,
+                        self.metrics.baseline_metrics.output_batches(),
+                        self.metrics.baseline_metrics.output_rows(),
+                        self.metrics.baseline_metrics.elapsed_compute(),
                     );
                     other
                 }
@@ -350,9 +427,7 @@ fn flatten_struct_cols(
                     Ok(struct_arr.columns().to_vec())
                 }
                 data_type => internal_err!(
-                    "expecting column {} from input plan to be a struct, got {:?}",
-                    idx,
-                    data_type
+                    "expecting column {idx} from input plan to be a struct, got {data_type}"
                 ),
             },
             None => Ok(vec![Arc::clone(column_data)]),
@@ -697,7 +772,6 @@ fn build_batch(
 /// ```ignore
 /// longest_length: [3, 1, 1, 2]
 /// ```
-///
 fn find_longest_length(
     list_arrays: &[ArrayRef],
     options: &UnnestOptions,
@@ -772,6 +846,30 @@ impl ListArrayType for FixedSizeListArray {
     }
 }
 
+impl ListArrayType for ListViewArray {
+    fn values(&self) -> &ArrayRef {
+        self.values()
+    }
+
+    fn value_offsets(&self, row: usize) -> (i64, i64) {
+        let offset = self.value_offsets()[row] as i64;
+        let size = self.value_sizes()[row] as i64;
+        (offset, offset + size)
+    }
+}
+
+impl ListArrayType for LargeListViewArray {
+    fn values(&self) -> &ArrayRef {
+        self.values()
+    }
+
+    fn value_offsets(&self, row: usize) -> (i64, i64) {
+        let offset = self.value_offsets()[row];
+        let size = self.value_sizes()[row];
+        (offset, offset + size)
+    }
+}
+
 /// Unnest multiple list arrays according to the length array.
 fn unnest_list_arrays(
     list_arrays: &[ArrayRef],
@@ -788,6 +886,12 @@ fn unnest_list_arrays(
             DataType::FixedSizeList(_, _) => {
                 Ok(list_array.as_fixed_size_list() as &dyn ListArrayType)
             }
+            DataType::ListView(_) => {
+                Ok(list_array.as_list_view::<i32>() as &dyn ListArrayType)
+            }
+            DataType::LargeListView(_) => {
+                Ok(list_array.as_list_view::<i64>() as &dyn ListArrayType)
+            }
             other => exec_err!("Invalid unnest datatype {other }"),
         })
         .collect::<Result<Vec<_>>>()?;
@@ -818,7 +922,6 @@ fn unnest_list_arrays(
 /// ```ignore
 /// [1, null, 2, 3, 4, null, null, 5, null, null]
 /// ```
-///
 fn unnest_list_array(
     list_array: &dyn ListArrayType,
     length_array: &PrimitiveArray<Int64Type>,
@@ -866,7 +969,6 @@ fn unnest_list_array(
 /// ```ignore
 /// [0, 0, 1, 1, 1, 2]
 /// ```
-///
 fn create_take_indices(
     length_array: &PrimitiveArray<Int64Type>,
     capacity: usize,
@@ -931,7 +1033,6 @@ fn create_take_indices(
 /// ```ignore
 /// c1: 1, null, 2, 3, 4, null, 5, 6  // Repeated using `indices`
 /// c2: null, null, null, null, null, null, null, null  // Replaced with nulls
-///
 fn repeat_arrs_from_indices(
     batch: &[ArrayRef],
     indices: &PrimitiveArray<Int64Type>,
@@ -1147,32 +1248,32 @@ mod tests {
         .unwrap();
 
         assert_snapshot!(batches_to_string(&[ret]),
-        @r###"
-+---------------------------------+---------------------------------+---------------------------------+
-| col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 |
-+---------------------------------+---------------------------------+---------------------------------+
-| [1, 2, 3]                       | 1                               | a                               |
-|                                 | 2                               | b                               |
-| [4, 5]                          | 3                               |                                 |
-| [1, 2, 3]                       |                                 | a                               |
-|                                 |                                 | b                               |
-| [4, 5]                          |                                 |                                 |
-| [1, 2, 3]                       | 4                               | a                               |
-|                                 | 5                               | b                               |
-| [4, 5]                          |                                 |                                 |
-| [7, 8, 9, 10]                   | 7                               | c                               |
-|                                 | 8                               | d                               |
-| [11, 12, 13]                    | 9                               |                                 |
-|                                 | 10                              |                                 |
-| [7, 8, 9, 10]                   |                                 | c                               |
-|                                 |                                 | d                               |
-| [11, 12, 13]                    |                                 |                                 |
-| [7, 8, 9, 10]                   | 11                              | c                               |
-|                                 | 12                              | d                               |
-| [11, 12, 13]                    | 13                              |                                 |
-|                                 |                                 | e                               |
-+---------------------------------+---------------------------------+---------------------------------+
-        "###);
+        @r"
+        +---------------------------------+---------------------------------+---------------------------------+
+        | col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 |
+        +---------------------------------+---------------------------------+---------------------------------+
+        | [1, 2, 3]                       | 1                               | a                               |
+        |                                 | 2                               | b                               |
+        | [4, 5]                          | 3                               |                                 |
+        | [1, 2, 3]                       |                                 | a                               |
+        |                                 |                                 | b                               |
+        | [4, 5]                          |                                 |                                 |
+        | [1, 2, 3]                       | 4                               | a                               |
+        |                                 | 5                               | b                               |
+        | [4, 5]                          |                                 |                                 |
+        | [7, 8, 9, 10]                   | 7                               | c                               |
+        |                                 | 8                               | d                               |
+        | [11, 12, 13]                    | 9                               |                                 |
+        |                                 | 10                              |                                 |
+        | [7, 8, 9, 10]                   |                                 | c                               |
+        |                                 |                                 | d                               |
+        | [11, 12, 13]                    |                                 |                                 |
+        | [7, 8, 9, 10]                   | 11                              | c                               |
+        |                                 | 12                              | d                               |
+        | [11, 12, 13]                    | 13                              |                                 |
+        |                                 |                                 | e                               |
+        +---------------------------------+---------------------------------+---------------------------------+
+        ");
         Ok(())
     }
 
diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs
deleted file mode 100644
index fb27ccf30179a..0000000000000
--- a/datafusion/physical-plan/src/values.rs
+++ /dev/null
@@ -1,330 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Values execution plan
-
-use std::any::Any;
-use std::sync::Arc;
-
-use crate::execution_plan::{Boundedness, EmissionType};
-use crate::memory::MemoryStream;
-use crate::{common, DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics};
-use crate::{
-    ColumnarValue, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr,
-};
-use arrow::datatypes::{Schema, SchemaRef};
-use arrow::record_batch::{RecordBatch, RecordBatchOptions};
-use datafusion_common::{internal_err, plan_err, Result, ScalarValue};
-use datafusion_execution::TaskContext;
-use datafusion_physical_expr::EquivalenceProperties;
-
-/// Execution plan for values list based relation (produces constant rows)
-#[deprecated(
-    since = "45.0.0",
-    note = "Use `MemorySourceConfig::try_new_as_values` instead"
-)]
-#[derive(Debug, Clone)]
-pub struct ValuesExec {
-    /// The schema
-    schema: SchemaRef,
-    /// The data
-    data: Vec<RecordBatch>,
-    /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
-}
-
-#[allow(deprecated)]
-impl ValuesExec {
-    /// Create a new values exec from data as expr
-    #[deprecated(since = "45.0.0", note = "Use `MemoryExec::try_new` instead")]
-    pub fn try_new(
-        schema: SchemaRef,
-        data: Vec<Vec<Arc<dyn PhysicalExpr>>>,
-    ) -> Result<Self> {
-        if data.is_empty() {
-            return plan_err!("Values list cannot be empty");
-        }
-        let n_row = data.len();
-        let n_col = schema.fields().len();
-        // We have this single row batch as a placeholder to satisfy evaluation argument
-        // and generate a single output row
-        let batch = RecordBatch::try_new_with_options(
-            Arc::new(Schema::empty()),
-            vec![],
-            &RecordBatchOptions::new().with_row_count(Some(1)),
-        )?;
-
-        let arr = (0..n_col)
-            .map(|j| {
-                (0..n_row)
-                    .map(|i| {
-                        let r = data[i][j].evaluate(&batch);
-
-                        match r {
-                            Ok(ColumnarValue::Scalar(scalar)) => Ok(scalar),
-                            Ok(ColumnarValue::Array(a)) if a.len() == 1 => {
-                                ScalarValue::try_from_array(&a, 0)
-                            }
-                            Ok(ColumnarValue::Array(a)) => {
-                                plan_err!(
-                                    "Cannot have array values {a:?} in a values list"
-                                )
-                            }
-                            Err(err) => Err(err),
-                        }
-                    })
-                    .collect::<Result<Vec<_>>>()
-                    .and_then(ScalarValue::iter_to_array)
-            })
-            .collect::<Result<Vec<_>>>()?;
-        let batch = RecordBatch::try_new_with_options(
-            Arc::clone(&schema),
-            arr,
-            &RecordBatchOptions::new().with_row_count(Some(n_row)),
-        )?;
-        let data: Vec<RecordBatch> = vec![batch];
-        Self::try_new_from_batches(schema, data)
-    }
-
-    /// Create a new plan using the provided schema and batches.
-    ///
-    /// Errors if any of the batches don't match the provided schema, or if no
-    /// batches are provided.
-    #[deprecated(
-        since = "45.0.0",
-        note = "Use `MemoryExec::try_new_from_batches` instead"
-    )]
-    pub fn try_new_from_batches(
-        schema: SchemaRef,
-        batches: Vec<RecordBatch>,
-    ) -> Result<Self> {
-        if batches.is_empty() {
-            return plan_err!("Values list cannot be empty");
-        }
-
-        for batch in &batches {
-            let batch_schema = batch.schema();
-            if batch_schema != schema {
-                return plan_err!(
-                    "Batch has invalid schema. Expected: {schema}, got: {batch_schema}"
-                );
-            }
-        }
-
-        let cache = Self::compute_properties(Arc::clone(&schema));
-        #[allow(deprecated)]
-        Ok(ValuesExec {
-            schema,
-            data: batches,
-            cache,
-        })
-    }
-
-    /// Provides the data
-    pub fn data(&self) -> Vec<RecordBatch> {
-        #[allow(deprecated)]
-        self.data.clone()
-    }
-
-    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
-    fn compute_properties(schema: SchemaRef) -> PlanProperties {
-        PlanProperties::new(
-            EquivalenceProperties::new(schema),
-            Partitioning::UnknownPartitioning(1),
-            EmissionType::Incremental,
-            Boundedness::Bounded,
-        )
-    }
-}
-
-#[allow(deprecated)]
-impl DisplayAs for ValuesExec {
-    fn fmt_as(
-        &self,
-        t: DisplayFormatType,
-        f: &mut std::fmt::Formatter,
-    ) -> std::fmt::Result {
-        match t {
-            DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                write!(f, "ValuesExec")
-            }
-            DisplayFormatType::TreeRender => {
-                // TODO: collect info
-                write!(f, "")
-            }
-        }
-    }
-}
-
-#[allow(deprecated)]
-impl ExecutionPlan for ValuesExec {
-    fn name(&self) -> &'static str {
-        "ValuesExec"
-    }
-
-    /// Return a reference to Any that can be used for downcasting
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn properties(&self) -> &PlanProperties {
-        #[allow(deprecated)]
-        &self.cache
-    }
-
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![]
-    }
-
-    fn with_new_children(
-        self: Arc<Self>,
-        _: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> Result<Arc<dyn ExecutionPlan>> {
-        #[allow(deprecated)]
-        ValuesExec::try_new_from_batches(Arc::clone(&self.schema), self.data.clone())
-            .map(|e| Arc::new(e) as _)
-    }
-
-    fn execute(
-        &self,
-        partition: usize,
-        _context: Arc<TaskContext>,
-    ) -> Result<SendableRecordBatchStream> {
-        // ValuesExec has a single output partition
-        if 0 != partition {
-            return internal_err!(
-                "ValuesExec invalid partition {partition} (expected 0)"
-            );
-        }
-
-        Ok(Box::pin(MemoryStream::try_new(
-            self.data(),
-            #[allow(deprecated)]
-            Arc::clone(&self.schema),
-            None,
-        )?))
-    }
-
-    fn statistics(&self) -> Result<Statistics> {
-        let batch = self.data();
-        Ok(common::compute_record_batch_statistics(
-            &[batch],
-            #[allow(deprecated)]
-            &self.schema,
-            None,
-        ))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::expressions::lit;
-    use crate::test::{self, make_partition};
-
-    use arrow::datatypes::{DataType, Field};
-    use datafusion_common::stats::{ColumnStatistics, Precision};
-
-    #[tokio::test]
-    async fn values_empty_case() -> Result<()> {
-        let schema = test::aggr_test_schema();
-        #[allow(deprecated)]
-        let empty = ValuesExec::try_new(schema, vec![]);
-        assert!(empty.is_err());
-        Ok(())
-    }
-
-    #[test]
-    fn new_exec_with_batches() {
-        let batch = make_partition(7);
-        let schema = batch.schema();
-        let batches = vec![batch.clone(), batch];
-        #[allow(deprecated)]
-        let _exec = ValuesExec::try_new_from_batches(schema, batches).unwrap();
-    }
-
-    #[test]
-    fn new_exec_with_batches_empty() {
-        let batch = make_partition(7);
-        let schema = batch.schema();
-        #[allow(deprecated)]
-        let _ = ValuesExec::try_new_from_batches(schema, Vec::new()).unwrap_err();
-    }
-
-    #[test]
-    fn new_exec_with_batches_invalid_schema() {
-        let batch = make_partition(7);
-        let batches = vec![batch.clone(), batch];
-
-        let invalid_schema = Arc::new(Schema::new(vec![
-            Field::new("col0", DataType::UInt32, false),
-            Field::new("col1", DataType::Utf8, false),
-        ]));
-        #[allow(deprecated)]
-        let _ = ValuesExec::try_new_from_batches(invalid_schema, batches).unwrap_err();
-    }
-
-    // Test issue: https://github.com/apache/datafusion/issues/8763
-    #[test]
-    fn new_exec_with_non_nullable_schema() {
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "col0",
-            DataType::UInt32,
-            false,
-        )]));
-        #[allow(deprecated)]
-        let _ = ValuesExec::try_new(Arc::clone(&schema), vec![vec![lit(1u32)]]).unwrap();
-        // Test that a null value is rejected
-        #[allow(deprecated)]
-        let _ = ValuesExec::try_new(schema, vec![vec![lit(ScalarValue::UInt32(None))]])
-            .unwrap_err();
-    }
-
-    #[test]
-    fn values_stats_with_nulls_only() -> Result<()> {
-        let data = vec![
-            vec![lit(ScalarValue::Null)],
-            vec![lit(ScalarValue::Null)],
-            vec![lit(ScalarValue::Null)],
-        ];
-        let rows = data.len();
-        #[allow(deprecated)]
-        let values = ValuesExec::try_new(
-            Arc::new(Schema::new(vec![Field::new("col0", DataType::Null, true)])),
-            data,
-        )?;
-
-        #[allow(deprecated)]
-        let stats = values.statistics()?;
-        assert_eq!(
-            stats,
-            Statistics {
-                num_rows: Precision::Exact(rows),
-                total_byte_size: Precision::Exact(8), // not important
-                column_statistics: vec![ColumnStatistics {
-                    null_count: Precision::Exact(rows), // there are only nulls
-                    distinct_count: Precision::Absent,
-                    max_value: Precision::Absent,
-                    min_value: Precision::Absent,
-                    sum_value: Precision::Absent,
-                },],
-            }
-        );
-
-        Ok(())
-    }
-}
diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
index 6751f9b202400..d0c44c659c20d 100644
--- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
+++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
@@ -21,7 +21,7 @@
 //! infinite inputs.
 
 use std::any::Any;
-use std::cmp::{min, Ordering};
+use std::cmp::{Ordering, min};
 use std::collections::VecDeque;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -36,9 +36,9 @@ use crate::windows::{
 use crate::{
     ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
     ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics, WindowExpr,
+    SendableRecordBatchStream, Statistics, WindowExpr, check_if_same_properties,
 };
-use ahash::RandomState;
+
 use arrow::compute::take_record_batch;
 use arrow::{
     array::{Array, ArrayRef, RecordBatchOptions, UInt32Builder},
@@ -48,23 +48,28 @@ use arrow::{
 };
 use datafusion_common::hash_utils::create_hashes;
 use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::utils::{
     evaluate_partition_ranges, get_at_indices, get_row_at_idx,
 };
 use datafusion_common::{
-    arrow_datafusion_err, exec_err, DataFusionError, HashMap, Result,
+    HashMap, Result, arrow_datafusion_err, exec_datafusion_err, exec_err,
 };
 use datafusion_execution::TaskContext;
-use datafusion_expr::window_state::{PartitionBatchState, WindowAggState};
 use datafusion_expr::ColumnarValue;
+use datafusion_expr::window_state::{PartitionBatchState, WindowAggState};
 use datafusion_physical_expr::window::{
     PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowState,
 };
-use datafusion_physical_expr::PhysicalExpr;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use datafusion_physical_expr_common::sort_expr::{
+    OrderingRequirements, PhysicalSortExpr,
+};
 
+use crate::execution_plan::CardinalityEffect;
+use datafusion_common::hash_utils::RandomState;
 use futures::stream::Stream;
-use futures::{ready, StreamExt};
+use futures::{StreamExt, ready};
 use hashbrown::hash_table::HashTable;
 use indexmap::IndexMap;
 use log::debug;
@@ -90,7 +95,7 @@ pub struct BoundedWindowAggExec {
     // See `get_ordered_partition_by_indices` for more details.
     ordered_partition_by_indices: Vec<usize>,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     /// If `can_rerepartition` is false, partition_keys is always empty.
     can_repartition: bool,
 }
@@ -111,7 +116,7 @@ impl BoundedWindowAggExec {
                 let indices = get_ordered_partition_by_indices(
                     window_expr[0].partition_by(),
                     &input,
-                );
+                )?;
                 if indices.len() == partition_by_exprs.len() {
                     indices
                 } else {
@@ -123,7 +128,7 @@ impl BoundedWindowAggExec {
                 vec![]
             }
         };
-        let cache = Self::compute_properties(&input, &schema, &window_expr);
+        let cache = Self::compute_properties(&input, &schema, &window_expr)?;
         Ok(Self {
             input,
             window_expr,
@@ -131,7 +136,7 @@ impl BoundedWindowAggExec {
             metrics: ExecutionPlanMetricsSet::new(),
             input_order_mode,
             ordered_partition_by_indices,
-            cache,
+            cache: Arc::new(cache),
             can_repartition,
         })
     }
@@ -151,7 +156,7 @@ impl BoundedWindowAggExec {
     // We are sure that partition by columns are always at the beginning of sort_keys
     // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely
     // to calculate partition separation points
-    pub fn partition_by_sort_keys(&self) -> Result<LexOrdering> {
+    pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> {
         let partition_by = self.window_expr()[0].partition_by();
         get_partition_by_sort_exprs(
             &self.input,
@@ -172,7 +177,9 @@ impl BoundedWindowAggExec {
                 if self.window_expr()[0].partition_by().len()
                     != ordered_partition_by_indices.len()
                 {
-                    return exec_err!("All partition by columns should have an ordering in Sorted mode.");
+                    return exec_err!(
+                        "All partition by columns should have an ordering in Sorted mode."
+                    );
                 }
                 Box::new(SortedSearch {
                     partition_by_sort_keys,
@@ -191,9 +198,9 @@ impl BoundedWindowAggExec {
         input: &Arc<dyn ExecutionPlan>,
         schema: &SchemaRef,
         window_exprs: &[Arc<dyn WindowExpr>],
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
-        let eq_properties = window_equivalence_properties(schema, input, window_exprs);
+        let eq_properties = window_equivalence_properties(schema, input, window_exprs)?;
 
         // As we can have repartitioning using the partition keys, this can
         // be either one or more than one, depending on the presence of
@@ -201,13 +208,13 @@ impl BoundedWindowAggExec {
         let output_partitioning = input.output_partitioning().clone();
 
         // Construct properties cache
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             // TODO: Emission type and boundedness information can be enhanced here
             input.pipeline_behavior(),
             input.boundedness(),
-        )
+        ))
     }
 
     pub fn partition_keys(&self) -> Vec<Arc<dyn PhysicalExpr>> {
@@ -243,6 +250,17 @@ impl BoundedWindowAggExec {
             total_byte_size: Precision::Absent,
         })
     }
+
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
+        }
+    }
 }
 
 impl DisplayAs for BoundedWindowAggExec {
@@ -258,10 +276,14 @@ impl DisplayAs for BoundedWindowAggExec {
                     .window_expr
                     .iter()
                     .map(|e| {
+                        let field = match e.field() {
+                            Ok(f) => f.to_string(),
+                            Err(e) => format!("{e:?}"),
+                        };
                         format!(
-                            "{}: {:?}, frame: {:?}",
+                            "{}: {}, frame: {}",
                             e.name().to_owned(),
-                            e.field(),
+                            field,
                             e.get_window_frame()
                         )
                     })
@@ -295,7 +317,7 @@ impl ExecutionPlan for BoundedWindowAggExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -303,14 +325,27 @@ impl ExecutionPlan for BoundedWindowAggExec {
         vec![&self.input]
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        let mut tnr = TreeNodeRecursion::Continue;
+        for window_expr in &self.window_expr {
+            for expr in window_expr.expressions() {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         let partition_bys = self.window_expr()[0].partition_by();
         let order_keys = self.window_expr()[0].order_by();
         let partition_bys = self
             .ordered_partition_by_indices
             .iter()
             .map(|idx| &partition_bys[*idx]);
-        vec![calc_requirements(partition_bys, order_keys.iter())]
+        vec![calc_requirements(partition_bys, order_keys)]
     }
 
     fn required_input_distribution(&self) -> Vec<Distribution> {
@@ -330,6 +365,7 @@ impl ExecutionPlan for BoundedWindowAggExec {
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(BoundedWindowAggExec::try_new(
             self.window_expr.clone(),
             Arc::clone(&children[0]),
@@ -359,13 +395,14 @@ impl ExecutionPlan for BoundedWindowAggExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.partition_statistics(None)
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let input_stat =
+            Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        Ok(Arc::new(self.statistics_helper(input_stat)?))
     }
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        let input_stat = self.input.partition_statistics(partition)?;
-        self.statistics_helper(input_stat)
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        CardinalityEffect::Equal
     }
 }
 
@@ -422,16 +459,25 @@ trait PartitionSearcher: Send {
         let partition_batches =
             self.evaluate_partition_batches(&record_batch, window_expr)?;
         for (partition_row, partition_batch) in partition_batches {
-            let partition_batch_state = partition_buffers
-                .entry(partition_row)
+            if let Some(partition_batch_state) = partition_buffers.get_mut(&partition_row)
+            {
+                partition_batch_state.extend(&partition_batch)?
+            } else {
+                let options = RecordBatchOptions::new()
+                    .with_row_count(Some(partition_batch.num_rows()));
                 // Use input_schema for the buffer schema, not `record_batch.schema()`
                 // as it may not have the "correct" schema in terms of output
                 // nullability constraints. For details, see the following issue:
                 // https://github.com/apache/datafusion/issues/9320
-                .or_insert_with(|| {
-                    PartitionBatchState::new(Arc::clone(self.input_schema()))
-                });
-            partition_batch_state.extend(&partition_batch)?;
+                let partition_batch = RecordBatch::try_new_with_options(
+                    Arc::clone(self.input_schema()),
+                    partition_batch.columns().to_vec(),
+                    &options,
+                )?;
+                let partition_batch_state =
+                    PartitionBatchState::new_with_batch(partition_batch);
+                partition_buffers.insert(partition_row, partition_batch_state);
+            }
         }
 
         if self.is_mode_linear() {
@@ -611,23 +657,23 @@ impl PartitionSearcher for LinearSearch {
     fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches) {
         // We should be in the `PartiallySorted` case, otherwise we can not
         // tell when we are at the end of a given partition.
-        if !self.ordered_partition_by_indices.is_empty() {
-            if let Some((last_row, _)) = partition_buffers.last() {
-                let last_sorted_cols = self
+        if !self.ordered_partition_by_indices.is_empty()
+            && let Some((last_row, _)) = partition_buffers.last()
+        {
+            let last_sorted_cols = self
+                .ordered_partition_by_indices
+                .iter()
+                .map(|idx| last_row[*idx].clone())
+                .collect::<Vec<_>>();
+            for (row, partition_batch_state) in partition_buffers.iter_mut() {
+                let sorted_cols = self
                     .ordered_partition_by_indices
                     .iter()
-                    .map(|idx| last_row[*idx].clone())
-                    .collect::<Vec<_>>();
-                for (row, partition_batch_state) in partition_buffers.iter_mut() {
-                    let sorted_cols = self
-                        .ordered_partition_by_indices
-                        .iter()
-                        .map(|idx| &row[*idx]);
-                    // All the partitions other than `last_sorted_cols` are done.
-                    // We are sure that we will no longer receive values for these
-                    // partitions (arrival of a new value would violate ordering).
-                    partition_batch_state.is_end = !sorted_cols.eq(&last_sorted_cols);
-                }
+                    .map(|idx| &row[*idx]);
+                // All the partitions other than `last_sorted_cols` are done.
+                // We are sure that we will no longer receive values for these
+                // partitions (arrival of a new value would violate ordering).
+                partition_batch_state.is_end = !sorted_cols.eq(&last_sorted_cols);
             }
         }
     }
@@ -750,7 +796,7 @@ impl LinearSearch {
 /// when computing partitions.
 pub struct SortedSearch {
     /// Stores partition by columns and their ordering information
-    partition_by_sort_keys: LexOrdering,
+    partition_by_sort_keys: Vec<PhysicalSortExpr>,
     /// Input ordering and partition by key ordering need not be the same, so
     /// this vector stores the mapping between them. For instance, if the input
     /// is ordered by a, b and the window expression contains a PARTITION BY b, a
@@ -863,9 +909,11 @@ impl SortedSearch {
             cur_window_expr_out_result_len
         });
         argmin(out_col_counts).map_or(0, |(min_idx, minima)| {
-            for (row, count) in counts.swap_remove(min_idx).into_iter() {
-                let partition_batch = &mut partition_buffers[row];
-                partition_batch.n_out_row = count;
+            let mut slowest_partition = counts.swap_remove(min_idx);
+            for (partition_key, partition_batch) in partition_buffers.iter_mut() {
+                if let Some(count) = slowest_partition.remove(partition_key) {
+                    partition_batch.n_out_row = count;
+                }
             }
             minima
         })
@@ -1169,6 +1217,7 @@ fn get_aggregate_result_out_column(
 ) -> Result<ArrayRef> {
     let mut result = None;
     let mut running_length = 0;
+    let mut batches_to_concat = vec![];
     // We assume that iteration order is according to insertion order
     for (
         _,
@@ -1180,23 +1229,31 @@ fn get_aggregate_result_out_column(
     {
         if running_length < len_to_show {
             let n_to_use = min(len_to_show - running_length, out_col.len());
-            let slice_to_use = out_col.slice(0, n_to_use);
-            result = Some(match result {
-                Some(arr) => concat(&[&arr, &slice_to_use])?,
-                None => slice_to_use,
-            });
+            let slice_to_use = if n_to_use == out_col.len() {
+                // avoid slice when the entire column is used
+                Arc::clone(out_col)
+            } else {
+                out_col.slice(0, n_to_use)
+            };
+            batches_to_concat.push(slice_to_use);
             running_length += n_to_use;
         } else {
             break;
         }
     }
+
+    if !batches_to_concat.is_empty() {
+        let array_refs: Vec<&dyn Array> =
+            batches_to_concat.iter().map(|a| a.as_ref()).collect();
+        result = Some(concat(&array_refs)?);
+    }
+
     if running_length != len_to_show {
         return exec_err!(
             "Generated row number should be {len_to_show}, it is {running_length}"
         );
     }
-    result
-        .ok_or_else(|| DataFusionError::Execution("Should contain something".to_string()))
+    result.ok_or_else(|| exec_datafusion_err!("Should contain something"))
 }
 
 /// Constructs a batch from the last row of batch in the argument.
@@ -1215,23 +1272,24 @@ mod tests {
     use std::time::Duration;
 
     use crate::common::collect;
+    use crate::execution_plan::CardinalityEffect;
     use crate::expressions::PhysicalSortExpr;
-    use crate::projection::ProjectionExec;
+    use crate::projection::{ProjectionExec, ProjectionExpr};
     use crate::streaming::{PartitionStream, StreamingTableExec};
     use crate::test::TestMemoryExec;
     use crate::windows::{
-        create_udwf_window_expr, create_window_expr, BoundedWindowAggExec, InputOrderMode,
+        BoundedWindowAggExec, InputOrderMode, create_udwf_window_expr, create_window_expr,
     };
-    use crate::{execute_stream, get_plan_string, ExecutionPlan};
+    use crate::{ExecutionPlan, displayable, execute_stream};
 
     use arrow::array::{
-        builder::{Int64Builder, UInt64Builder},
         RecordBatch,
+        builder::{Int64Builder, UInt64Builder},
     };
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_common::test_util::batches_to_string;
-    use datafusion_common::{exec_datafusion_err, Result, ScalarValue};
+    use datafusion_common::{Result, ScalarValue, exec_datafusion_err};
     use datafusion_execution::config::SessionConfig;
     use datafusion_execution::{
         RecordBatchStream, SendableRecordBatchStream, TaskContext,
@@ -1242,12 +1300,12 @@ mod tests {
     use datafusion_functions_aggregate::count::count_udaf;
     use datafusion_functions_window::nth_value::last_value_udwf;
     use datafusion_functions_window::nth_value::nth_value_udwf;
-    use datafusion_physical_expr::expressions::{col, Column, Literal};
+    use datafusion_physical_expr::expressions::{Column, Literal, col};
     use datafusion_physical_expr::window::StandardWindowExpr;
     use datafusion_physical_expr::{LexOrdering, PhysicalExpr};
 
     use futures::future::Shared;
-    use futures::{pin_mut, ready, FutureExt, Stream, StreamExt};
+    use futures::{FutureExt, Stream, StreamExt, pin_mut, ready};
     use insta::assert_snapshot;
     use itertools::Itertools;
     use tokio::time::timeout;
@@ -1347,10 +1405,10 @@ mod tests {
             Arc::new(Column::new(schema.fields[0].name(), 0)) as Arc<dyn PhysicalExpr>;
         let args = vec![col_expr];
         let partitionby_exprs = vec![col(hash, &schema)?];
-        let orderby_exprs = LexOrdering::new(vec![PhysicalSortExpr {
+        let orderby_exprs = vec![PhysicalSortExpr {
             expr: col(order_by, &schema)?,
             options: SortOptions::default(),
-        }]);
+        }];
         let window_frame = WindowFrame::new_bounds(
             WindowFrameUnits::Range,
             WindowFrameBound::CurrentRow,
@@ -1366,10 +1424,12 @@ mod tests {
                 fn_name,
                 &args,
                 &partitionby_exprs,
-                orderby_exprs.as_ref(),
+                &orderby_exprs,
                 Arc::new(window_frame),
-                &input.schema(),
+                input.schema(),
+                false,
                 false,
+                None,
             )?],
             input,
             input_order_mode,
@@ -1394,7 +1454,11 @@ mod tests {
                 (expr, name)
             })
             .collect::<Vec<_>>();
-        Ok(Arc::new(ProjectionExec::try_new(exprs, input)?))
+        let proj_exprs: Vec<ProjectionExpr> = exprs
+            .into_iter()
+            .map(|(expr, alias)| ProjectionExpr { expr, alias })
+            .collect();
+        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
     }
 
     fn task_context_helper() -> TaskContext {
@@ -1441,20 +1505,6 @@ mod tests {
         Ok(results)
     }
 
-    /// Execute the [ExecutionPlan] and collect the results in memory
-    #[allow(dead_code)]
-    pub async fn collect_bonafide(
-        plan: Arc<dyn ExecutionPlan>,
-        context: Arc<TaskContext>,
-    ) -> Result<Vec<RecordBatch>> {
-        let stream = execute_stream(plan, context)?;
-        let mut results = vec![];
-
-        collect_stream(stream, &mut results).await?;
-
-        Ok(results)
-    }
-
     fn test_schema() -> SchemaRef {
         Arc::new(Schema::new(vec![
             Field::new("sn", DataType::UInt64, true),
@@ -1463,13 +1513,16 @@ mod tests {
     }
 
     fn schema_orders(schema: &SchemaRef) -> Result<Vec<LexOrdering>> {
-        let orderings = vec![LexOrdering::new(vec![PhysicalSortExpr {
-            expr: col("sn", schema)?,
-            options: SortOptions {
-                descending: false,
-                nulls_first: false,
-            },
-        }])];
+        let orderings = vec![
+            [PhysicalSortExpr {
+                expr: col("sn", schema)?,
+                options: SortOptions {
+                    descending: false,
+                    nulls_first: false,
+                },
+            }]
+            .into(),
+        ];
         Ok(orderings)
     }
 
@@ -1620,7 +1673,7 @@ mod tests {
             Arc::new(StandardWindowExpr::new(
                 last_value_func,
                 &[],
-                &LexOrdering::default(),
+                &[],
                 Arc::new(WindowFrame::new_bounds(
                     WindowFrameUnits::Rows,
                     WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
@@ -1631,7 +1684,7 @@ mod tests {
             Arc::new(StandardWindowExpr::new(
                 nth_value_func1,
                 &[],
-                &LexOrdering::default(),
+                &[],
                 Arc::new(WindowFrame::new_bounds(
                     WindowFrameUnits::Rows,
                     WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
@@ -1642,7 +1695,7 @@ mod tests {
             Arc::new(StandardWindowExpr::new(
                 nth_value_func2,
                 &[],
-                &LexOrdering::default(),
+                &[],
                 Arc::new(WindowFrame::new_bounds(
                     WindowFrameUnits::Rows,
                     WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
@@ -1660,32 +1713,27 @@ mod tests {
 
         let batches = collect(physical_plan.execute(0, task_ctx)?).await?;
 
-        let expected = vec![
-            "BoundedWindowAggExec: wdw=[last: Ok(Field { name: \"last\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-1): Ok(Field { name: \"nth_value(-1)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-2): Ok(Field { name: \"nth_value(-2)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
-            "  DataSourceExec: partitions=1, partition_sizes=[3]",
-        ];
         // Get string representation of the plan
-        let actual = get_plan_string(&physical_plan);
-        assert_eq!(
-            expected, actual,
-            "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
-
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +---+------+---------------+---------------+
-            | a | last | nth_value(-1) | nth_value(-2) |
-            +---+------+---------------+---------------+
-            | 1 | 1    | 1             |               |
-            | 2 | 2    | 2             | 1             |
-            | 3 | 3    | 3             | 2             |
-            | 1 | 1    | 1             | 3             |
-            | 2 | 2    | 2             | 1             |
-            | 3 | 3    | 3             | 2             |
-            | 1 | 1    | 1             | 3             |
-            | 2 | 2    | 2             | 1             |
-            | 3 | 3    | 3             | 2             |
-            +---+------+---------------+---------------+
-            "#);
+        assert_snapshot!(displayable(physical_plan.as_ref()).indent(true), @r#"
+        BoundedWindowAggExec: wdw=[last: Field { "last": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-1): Field { "nth_value(-1)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-2): Field { "nth_value(-2)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+          DataSourceExec: partitions=1, partition_sizes=[3]
+        "#);
+
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +---+------+---------------+---------------+
+        | a | last | nth_value(-1) | nth_value(-2) |
+        +---+------+---------------+---------------+
+        | 1 | 1    | 1             |               |
+        | 2 | 2    | 2             | 1             |
+        | 3 | 3    | 3             | 2             |
+        | 1 | 1    | 1             | 3             |
+        | 2 | 2    | 2             | 1             |
+        | 3 | 3    | 3             | 2             |
+        | 1 | 1    | 1             | 3             |
+        | 2 | 2    | 2             | 1             |
+        | 3 | 3    | 3             | 2             |
+        +---+------+---------------+---------------+
+        ");
         Ok(())
     }
 
@@ -1782,37 +1830,49 @@ mod tests {
 
         let plan = projection_exec(window)?;
 
-        let expected_plan = vec![
-            "ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [LexOrdering { inner: [PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }] }]@2 as col_2]",
-            "  BoundedWindowAggExec: wdw=[count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [LexOrdering { inner: [PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }] }]: Ok(Field { name: \"count([Column { name: \\\"sn\\\", index: 0 }]) PARTITION BY: [[Column { name: \\\"hash\\\", index: 1 }]], ORDER BY: [LexOrdering { inner: [PhysicalSortExpr { expr: Column { name: \\\"sn\\\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }] }]\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(1)), is_causal: false }], mode=[Linear]",
-            "    StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST]",
-        ];
-
         // Get string representation of the plan
-        let actual = get_plan_string(&plan);
-        assert_eq!(
-            expected_plan, actual,
-            "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_plan:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
+        assert_snapshot!(displayable(plan.as_ref()).indent(true), @r#"
+        ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]@2 as col_2]
+          BoundedWindowAggExec: wdw=[count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Field { "count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]": Int64 }, frame: RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Linear]
+            StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST]
+        "#);
 
         let task_ctx = task_context();
         let batches = collect_with_timeout(plan, task_ctx, timeout_duration).await?;
 
-        assert_snapshot!(batches_to_string(&batches), @r#"
-            +----+------+-------+
-            | sn | hash | col_2 |
-            +----+------+-------+
-            | 0  | 2    | 2     |
-            | 1  | 2    | 2     |
-            | 2  | 2    | 2     |
-            | 3  | 2    | 1     |
-            | 4  | 1    | 2     |
-            | 5  | 1    | 2     |
-            | 6  | 1    | 2     |
-            | 7  | 1    | 1     |
-            +----+------+-------+
-            "#);
+        assert_snapshot!(batches_to_string(&batches), @r"
+        +----+------+-------+
+        | sn | hash | col_2 |
+        +----+------+-------+
+        | 0  | 2    | 2     |
+        | 1  | 2    | 2     |
+        | 2  | 2    | 2     |
+        | 3  | 2    | 1     |
+        | 4  | 1    | 2     |
+        | 5  | 1    | 2     |
+        | 6  | 1    | 2     |
+        | 7  | 1    | 1     |
+        +----+------+-------+
+        ");
 
         Ok(())
     }
+
+    #[test]
+    fn test_bounded_window_agg_cardinality_effect() -> Result<()> {
+        let schema = test_schema();
+        let input: Arc<dyn ExecutionPlan> =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+        let plan = bounded_window_exec_pb_latent_range(input, 1, "hash", "sn")?;
+        let plan = plan
+            .as_any()
+            .downcast_ref::<BoundedWindowAggExec>()
+            .expect("expected BoundedWindowAggExec");
+
+        assert!(matches!(
+            plan.cardinality_effect(),
+            CardinalityEffect::Equal
+        ));
+        Ok(())
+    }
 }
diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs
index d2b7e0a49e951..b72a65cf996be 100644
--- a/datafusion/physical-plan/src/windows/mod.rs
+++ b/datafusion/physical-plan/src/windows/mod.rs
@@ -22,19 +22,18 @@ mod utils;
 mod window_agg_exec;
 
 use std::borrow::Borrow;
-use std::iter;
 use std::sync::Arc;
 
 use crate::{
-    expressions::PhysicalSortExpr, ExecutionPlan, ExecutionPlanProperties,
-    InputOrderMode, PhysicalExpr,
+    ExecutionPlan, ExecutionPlanProperties, InputOrderMode, PhysicalExpr,
+    expressions::PhysicalSortExpr,
 };
 
 use arrow::datatypes::{Schema, SchemaRef};
 use arrow_schema::{FieldRef, SortOptions};
-use datafusion_common::{exec_err, Result};
+use datafusion_common::{Result, exec_err};
 use datafusion_expr::{
-    PartitionEvaluator, ReversedUDWF, SetMonotonicity, WindowFrame,
+    LimitEffect, PartitionEvaluator, ReversedUDWF, SetMonotonicity, WindowFrame,
     WindowFunctionDefinition, WindowUDF,
 };
 use datafusion_functions_window_common::expr::ExpressionArgs;
@@ -42,12 +41,13 @@ use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
 use datafusion_physical_expr::expressions::Column;
-use datafusion_physical_expr::{
-    reverse_order_bys,
-    window::{SlidingAggregateWindowExpr, StandardWindowFunctionExpr},
-    ConstExpr, EquivalenceProperties, LexOrdering, PhysicalSortRequirement,
+use datafusion_physical_expr::window::{
+    SlidingAggregateWindowExpr, StandardWindowFunctionExpr,
+};
+use datafusion_physical_expr::{ConstExpr, EquivalenceProperties};
+use datafusion_physical_expr_common::sort_expr::{
+    LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortRequirement,
 };
-use datafusion_physical_expr_common::sort_expr::LexRequirement;
 
 use itertools::Itertools;
 
@@ -69,12 +69,7 @@ pub fn schema_add_window_field(
         .iter()
         .map(|e| Arc::clone(e).as_ref().return_field(schema))
         .collect::<Result<Vec<_>>>()?;
-    let nullability = args
-        .iter()
-        .map(|e| Arc::clone(e).as_ref().nullable(schema))
-        .collect::<Result<Vec<_>>>()?;
-    let window_expr_return_field =
-        window_fn.return_field(&fields, &nullability, fn_name)?;
+    let window_expr_return_field = window_fn.return_field(&fields, fn_name)?;
     let mut window_fields = schema
         .fields()
         .iter()
@@ -93,34 +88,47 @@ pub fn schema_add_window_field(
 }
 
 /// Create a physical expression for window function
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 pub fn create_window_expr(
     fun: &WindowFunctionDefinition,
     name: String,
     args: &[Arc<dyn PhysicalExpr>],
     partition_by: &[Arc<dyn PhysicalExpr>],
-    order_by: &LexOrdering,
+    order_by: &[PhysicalSortExpr],
     window_frame: Arc<WindowFrame>,
-    input_schema: &Schema,
+    input_schema: SchemaRef,
     ignore_nulls: bool,
+    distinct: bool,
+    filter: Option<Arc<dyn PhysicalExpr>>,
 ) -> Result<Arc<dyn WindowExpr>> {
     Ok(match fun {
         WindowFunctionDefinition::AggregateUDF(fun) => {
-            let aggregate = AggregateExprBuilder::new(Arc::clone(fun), args.to_vec())
-                .schema(Arc::new(input_schema.clone()))
-                .alias(name)
-                .with_ignore_nulls(ignore_nulls)
-                .build()
-                .map(Arc::new)?;
+            let aggregate = if distinct {
+                AggregateExprBuilder::new(Arc::clone(fun), args.to_vec())
+                    .schema(input_schema)
+                    .alias(name)
+                    .with_ignore_nulls(ignore_nulls)
+                    .distinct()
+                    .build()
+                    .map(Arc::new)?
+            } else {
+                AggregateExprBuilder::new(Arc::clone(fun), args.to_vec())
+                    .schema(input_schema)
+                    .alias(name)
+                    .with_ignore_nulls(ignore_nulls)
+                    .build()
+                    .map(Arc::new)?
+            };
             window_expr_from_aggregate_expr(
                 partition_by,
                 order_by,
                 window_frame,
                 aggregate,
+                filter,
             )
         }
         WindowFunctionDefinition::WindowUDF(fun) => Arc::new(StandardWindowExpr::new(
-            create_udwf_window_expr(fun, args, input_schema, name, ignore_nulls)?,
+            create_udwf_window_expr(fun, args, &input_schema, name, ignore_nulls)?,
             partition_by,
             order_by,
             window_frame,
@@ -131,9 +139,10 @@ pub fn create_window_expr(
 /// Creates an appropriate [`WindowExpr`] based on the window frame and
 fn window_expr_from_aggregate_expr(
     partition_by: &[Arc<dyn PhysicalExpr>],
-    order_by: &LexOrdering,
+    order_by: &[PhysicalSortExpr],
     window_frame: Arc<WindowFrame>,
     aggregate: Arc<AggregateFunctionExpr>,
+    filter: Option<Arc<dyn PhysicalExpr>>,
 ) -> Arc<dyn WindowExpr> {
     // Is there a potentially unlimited sized window frame?
     let unbounded_window = window_frame.is_ever_expanding();
@@ -144,6 +153,7 @@ fn window_expr_from_aggregate_expr(
             partition_by,
             order_by,
             window_frame,
+            filter,
         ))
     } else {
         Arc::new(PlainAggregateWindowExpr::new(
@@ -151,6 +161,7 @@ fn window_expr_from_aggregate_expr(
             partition_by,
             order_by,
             window_frame,
+            filter,
         ))
     }
 }
@@ -215,6 +226,18 @@ impl WindowUDFExpr {
     pub fn fun(&self) -> &Arc<WindowUDF> {
         &self.fun
     }
+
+    /// Returns all arguments passed to this window function.
+    ///
+    /// Unlike [`StandardWindowFunctionExpr::expressions`], which returns
+    /// only the expressions that need batch evaluation (and may filter out
+    /// literal offset/default args like those for `lead`/`lag`), this
+    /// method returns the complete, unfiltered argument list. This is
+    /// needed for serialization so that all arguments survive a
+    /// protobuf round-trip.
+    pub fn args(&self) -> &[Arc<dyn PhysicalExpr>] {
+        &self.args
+    }
 }
 
 impl StandardWindowFunctionExpr for WindowUDFExpr {
@@ -270,6 +293,10 @@ impl StandardWindowFunctionExpr for WindowUDFExpr {
                 PhysicalSortExpr { expr, options }
             })
     }
+
+    fn limit_effect(&self) -> LimitEffect {
+        self.fun.inner().limit_effect(self.args.as_slice())
+    }
 }
 
 pub(crate) fn calc_requirements<
@@ -278,26 +305,33 @@ pub(crate) fn calc_requirements<
 >(
     partition_by_exprs: impl IntoIterator<Item = T>,
     orderby_sort_exprs: impl IntoIterator<Item = S>,
-) -> Option<LexRequirement> {
-    let mut sort_reqs = LexRequirement::new(
-        partition_by_exprs
-            .into_iter()
-            .map(|partition_by| {
-                PhysicalSortRequirement::new(Arc::clone(partition_by.borrow()), None)
-            })
-            .collect::<Vec<_>>(),
-    );
+) -> Option<OrderingRequirements> {
+    let mut sort_reqs_with_partition = partition_by_exprs
+        .into_iter()
+        .map(|partition_by| {
+            PhysicalSortRequirement::new(Arc::clone(partition_by.borrow()), None)
+        })
+        .collect::<Vec<_>>();
+    let mut sort_reqs = vec![];
     for element in orderby_sort_exprs.into_iter() {
         let PhysicalSortExpr { expr, options } = element.borrow();
-        if !sort_reqs.iter().any(|e| e.expr.eq(expr)) {
-            sort_reqs.push(PhysicalSortRequirement::new(
-                Arc::clone(expr),
-                Some(*options),
-            ));
+        let sort_req = PhysicalSortRequirement::new(Arc::clone(expr), Some(*options));
+        if !sort_reqs_with_partition.iter().any(|e| e.expr.eq(expr)) {
+            sort_reqs_with_partition.push(sort_req.clone());
+        }
+        if !sort_reqs
+            .iter()
+            .any(|e: &PhysicalSortRequirement| e.expr.eq(expr))
+        {
+            sort_reqs.push(sort_req);
         }
     }
-    // Convert empty result to None. Otherwise wrap result inside Some()
-    (!sort_reqs.is_empty()).then_some(sort_reqs)
+
+    let mut alternatives = vec![];
+    alternatives.extend(LexRequirement::new(sort_reqs_with_partition));
+    alternatives.extend(LexRequirement::new(sort_reqs));
+
+    OrderingRequirements::new_alternatives(alternatives, false)
 }
 
 /// This function calculates the indices such that when partition by expressions reordered with the indices
@@ -308,18 +342,18 @@ pub(crate) fn calc_requirements<
 pub fn get_ordered_partition_by_indices(
     partition_by_exprs: &[Arc<dyn PhysicalExpr>],
     input: &Arc<dyn ExecutionPlan>,
-) -> Vec<usize> {
+) -> Result<Vec<usize>> {
     let (_, indices) = input
         .equivalence_properties()
-        .find_longest_permutation(partition_by_exprs);
-    indices
+        .find_longest_permutation(partition_by_exprs)?;
+    Ok(indices)
 }
 
 pub(crate) fn get_partition_by_sort_exprs(
     input: &Arc<dyn ExecutionPlan>,
     partition_by_exprs: &[Arc<dyn PhysicalExpr>],
     ordered_partition_by_indices: &[usize],
-) -> Result<LexOrdering> {
+) -> Result<Vec<PhysicalSortExpr>> {
     let ordered_partition_exprs = ordered_partition_by_indices
         .iter()
         .map(|idx| Arc::clone(&partition_by_exprs[*idx]))
@@ -328,7 +362,7 @@ pub(crate) fn get_partition_by_sort_exprs(
     assert!(ordered_partition_by_indices.len() <= partition_by_exprs.len());
     let (ordering, _) = input
         .equivalence_properties()
-        .find_longest_permutation(&ordered_partition_exprs);
+        .find_longest_permutation(&ordered_partition_exprs)?;
     if ordering.len() == ordered_partition_exprs.len() {
         Ok(ordering)
     } else {
@@ -340,11 +374,11 @@ pub(crate) fn window_equivalence_properties(
     schema: &SchemaRef,
     input: &Arc<dyn ExecutionPlan>,
     window_exprs: &[Arc<dyn WindowExpr>],
-) -> EquivalenceProperties {
+) -> Result<EquivalenceProperties> {
     // We need to update the schema, so we can't directly use input's equivalence
     // properties.
     let mut window_eq_properties = EquivalenceProperties::new(Arc::clone(schema))
-        .extend(input.equivalence_properties().clone());
+        .extend(input.equivalence_properties().clone())?;
 
     let window_schema_len = schema.fields.len();
     let input_schema_len = window_schema_len - window_exprs.len();
@@ -353,25 +387,51 @@ pub(crate) fn window_equivalence_properties(
     for (i, expr) in window_exprs.iter().enumerate() {
         let partitioning_exprs = expr.partition_by();
         let no_partitioning = partitioning_exprs.is_empty();
-        // Collect columns defining partitioning, and construct all `SortOptions`
-        // variations for them. Then, we will check each one whether it satisfies
-        // the existing ordering provided by the input plan.
-        let partition_by_orders = partitioning_exprs
-            .iter()
-            .map(|pb_order| sort_options_resolving_constant(Arc::clone(pb_order)));
-        let all_satisfied_lexs = partition_by_orders
-            .multi_cartesian_product()
-            .map(LexOrdering::new)
-            .filter(|lex| window_eq_properties.ordering_satisfy(lex))
-            .collect::<Vec<_>>();
+
+        // Find "one" valid ordering for partition columns to avoid exponential complexity.
+        // see https://github.com/apache/datafusion/issues/17401
+        let mut all_satisfied_lexs = vec![];
+        let mut candidate_ordering = vec![];
+
+        for partition_expr in partitioning_exprs.iter() {
+            let sort_options =
+                sort_options_resolving_constant(Arc::clone(partition_expr), true);
+
+            // Try each sort option and pick the first one that works
+            let mut found = false;
+            for sort_expr in sort_options.into_iter() {
+                candidate_ordering.push(sort_expr);
+                if let Some(lex) = LexOrdering::new(candidate_ordering.clone())
+                    && window_eq_properties.ordering_satisfy(lex)?
+                {
+                    found = true;
+                    break;
+                }
+                // This option didn't work, remove it and try the next one
+                candidate_ordering.pop();
+            }
+            // If no sort option works for this column, we can't build a valid ordering
+            if !found {
+                candidate_ordering.clear();
+                break;
+            }
+        }
+
+        // If we successfully built an ordering for all columns, use it
+        // When there are no partition expressions, candidate_ordering will be empty and won't be added
+        if candidate_ordering.len() == partitioning_exprs.len()
+            && let Some(lex) = LexOrdering::new(candidate_ordering)
+        {
+            all_satisfied_lexs.push(lex);
+        }
         // If there is a partitioning, and no possible ordering cannot satisfy
         // the input plan's orderings, then we cannot further introduce any
         // new orderings for the window plan.
         if !no_partitioning && all_satisfied_lexs.is_empty() {
-            return window_eq_properties;
+            return Ok(window_eq_properties);
         } else if let Some(std_expr) = expr.as_any().downcast_ref::<StandardWindowExpr>()
         {
-            std_expr.add_equal_orderings(&mut window_eq_properties);
+            std_expr.add_equal_orderings(&mut window_eq_properties)?;
         } else if let Some(plain_expr) =
             expr.as_any().downcast_ref::<PlainAggregateWindowExpr>()
         {
@@ -379,26 +439,28 @@ pub(crate) fn window_equivalence_properties(
             // unbounded starting point.
             // First, check if the frame covers the whole table:
             if plain_expr.get_window_frame().end_bound.is_unbounded() {
-                let window_col = Column::new(expr.name(), i + input_schema_len);
+                let window_col =
+                    Arc::new(Column::new(expr.name(), i + input_schema_len)) as _;
                 if no_partitioning {
                     // Window function has a constant result across the table:
-                    window_eq_properties = window_eq_properties
-                        .with_constants(iter::once(ConstExpr::new(Arc::new(window_col))))
+                    window_eq_properties
+                        .add_constants(std::iter::once(ConstExpr::from(window_col)))?
                 } else {
                     // Window function results in a partial constant value in
                     // some ordering. Adjust the ordering equivalences accordingly:
                     let new_lexs = all_satisfied_lexs.into_iter().flat_map(|lex| {
-                        let orderings = lex.take_exprs();
-                        let new_partial_consts =
-                            sort_options_resolving_constant(Arc::new(window_col.clone()));
+                        let new_partial_consts = sort_options_resolving_constant(
+                            Arc::clone(&window_col),
+                            false,
+                        );
 
                         new_partial_consts.into_iter().map(move |partial| {
-                            let mut existing = orderings.clone();
+                            let mut existing = lex.clone();
                             existing.push(partial);
-                            LexOrdering::new(existing)
+                            existing
                         })
                     });
-                    window_eq_properties.add_new_orderings(new_lexs);
+                    window_eq_properties.add_orderings(new_lexs);
                 }
             } else {
                 // The window frame is ever expanding, so set monotonicity comes
@@ -406,7 +468,7 @@ pub(crate) fn window_equivalence_properties(
                 plain_expr.add_equal_orderings(
                     &mut window_eq_properties,
                     window_expr_indices[i],
-                );
+                )?;
             }
         } else if let Some(sliding_expr) =
             expr.as_any().downcast_ref::<SlidingAggregateWindowExpr>()
@@ -424,22 +486,18 @@ pub(crate) fn window_equivalence_properties(
                     let window_col = Column::new(expr.name(), i + input_schema_len);
                     if no_partitioning {
                         // Reverse set-monotonic cases with no partitioning:
-                        let new_ordering =
-                            vec![LexOrdering::new(vec![PhysicalSortExpr::new(
-                                Arc::new(window_col),
-                                SortOptions::new(increasing, true),
-                            )])];
-                        window_eq_properties.add_new_orderings(new_ordering);
+                        window_eq_properties.add_ordering([PhysicalSortExpr::new(
+                            Arc::new(window_col),
+                            SortOptions::new(increasing, true),
+                        )]);
                     } else {
                         // Reverse set-monotonic cases for all orderings:
-                        for lex in all_satisfied_lexs.into_iter() {
-                            let mut existing = lex.take_exprs();
-                            existing.push(PhysicalSortExpr::new(
+                        for mut lex in all_satisfied_lexs.into_iter() {
+                            lex.push(PhysicalSortExpr::new(
                                 Arc::new(window_col.clone()),
                                 SortOptions::new(increasing, true),
                             ));
-                            window_eq_properties
-                                .add_new_ordering(LexOrdering::new(existing));
+                            window_eq_properties.add_ordering(lex);
                         }
                     }
                 }
@@ -450,44 +508,73 @@ pub(crate) fn window_equivalence_properties(
                 // utilize set-monotonicity since the set shrinks as the frame
                 // boundary starts "touching" the end of the table.
                 else if frame.is_causal() {
-                    let mut args_all_lexs = sliding_expr
-                        .get_aggregate_expr()
-                        .expressions()
-                        .into_iter()
-                        .map(sort_options_resolving_constant)
-                        .multi_cartesian_product();
-
+                    // Find one valid ordering for aggregate arguments instead of
+                    // checking all combinations
+                    let aggregate_exprs = sliding_expr.get_aggregate_expr().expressions();
+                    let mut candidate_order = vec![];
                     let mut asc = false;
-                    if args_all_lexs.any(|order| {
-                        if let Some(f) = order.first() {
-                            asc = !f.options.descending;
+
+                    for (idx, expr) in aggregate_exprs.iter().enumerate() {
+                        let mut found = false;
+                        let sort_options =
+                            sort_options_resolving_constant(Arc::clone(expr), false);
+
+                        // Try each option and pick the first that works
+                        for sort_expr in sort_options.into_iter() {
+                            let is_asc = !sort_expr.options.descending;
+                            candidate_order.push(sort_expr);
+
+                            if let Some(lex) = LexOrdering::new(candidate_order.clone())
+                                && window_eq_properties.ordering_satisfy(lex)?
+                            {
+                                if idx == 0 {
+                                    // The first column's ordering direction determines the overall
+                                    // monotonicity behavior of the window result.
+                                    // - If the aggregate has increasing set monotonicity (e.g., MAX, COUNT)
+                                    //   and the first arg is ascending, the window result is increasing
+                                    // - If the aggregate has decreasing set monotonicity (e.g., MIN)
+                                    //   and the first arg is ascending, the window result is also increasing
+                                    // This flag is used to determine the final window column ordering.
+                                    asc = is_asc;
+                                }
+                                found = true;
+                                break;
+                            }
+                            // This option didn't work, remove it and try the next one
+                            candidate_order.pop();
+                        }
+
+                        // If we couldn't extend the ordering, stop trying
+                        if !found {
+                            break;
                         }
-                        window_eq_properties.ordering_satisfy(&LexOrdering::new(order))
-                    }) {
+                    }
+
+                    // Check if we successfully built a complete ordering
+                    let satisfied = candidate_order.len() == aggregate_exprs.len()
+                        && !aggregate_exprs.is_empty();
+
+                    if satisfied {
                         let increasing =
                             set_monotonicity.eq(&SetMonotonicity::Increasing);
                         let window_col = Column::new(expr.name(), i + input_schema_len);
                         if increasing && (asc || no_partitioning) {
-                            let new_ordering =
-                                LexOrdering::new(vec![PhysicalSortExpr::new(
-                                    Arc::new(window_col),
-                                    SortOptions::new(false, false),
-                                )]);
-                            window_eq_properties.add_new_ordering(new_ordering);
+                            window_eq_properties.add_ordering([PhysicalSortExpr::new(
+                                Arc::new(window_col),
+                                SortOptions::new(false, false),
+                            )]);
                         } else if !increasing && (!asc || no_partitioning) {
-                            let new_ordering =
-                                LexOrdering::new(vec![PhysicalSortExpr::new(
-                                    Arc::new(window_col),
-                                    SortOptions::new(true, false),
-                                )]);
-                            window_eq_properties.add_new_ordering(new_ordering);
+                            window_eq_properties.add_ordering([PhysicalSortExpr::new(
+                                Arc::new(window_col),
+                                SortOptions::new(true, false),
+                            )]);
                         };
                     }
                 }
             }
         }
     }
-    window_eq_properties
+    Ok(window_eq_properties)
 }
 
 /// Constructs the best-fitting windowing operator (a `WindowAggExec` or a
@@ -514,7 +601,7 @@ pub fn get_best_fitting_window(
     let orderby_keys = window_exprs[0].order_by();
     let (should_reverse, input_order_mode) =
         if let Some((should_reverse, input_order_mode)) =
-            get_window_mode(partitionby_exprs, orderby_keys, input)
+            get_window_mode(partitionby_exprs, orderby_keys, input)?
         {
             (should_reverse, input_order_mode)
         } else {
@@ -580,35 +667,29 @@ pub fn get_best_fitting_window(
 /// the mode this window operator should work in to accommodate the existing ordering.
 pub fn get_window_mode(
     partitionby_exprs: &[Arc<dyn PhysicalExpr>],
-    orderby_keys: &LexOrdering,
+    orderby_keys: &[PhysicalSortExpr],
     input: &Arc<dyn ExecutionPlan>,
-) -> Option<(bool, InputOrderMode)> {
-    let input_eqs = input.equivalence_properties().clone();
-    let mut partition_by_reqs: LexRequirement = LexRequirement::new(vec![]);
-    let (_, indices) = input_eqs.find_longest_permutation(partitionby_exprs);
-    vec![].extend(indices.iter().map(|&idx| PhysicalSortRequirement {
-        expr: Arc::clone(&partitionby_exprs[idx]),
-        options: None,
-    }));
-    partition_by_reqs
-        .inner
-        .extend(indices.iter().map(|&idx| PhysicalSortRequirement {
+) -> Result<Option<(bool, InputOrderMode)>> {
+    let mut input_eqs = input.equivalence_properties().clone();
+    let (_, indices) = input_eqs.find_longest_permutation(partitionby_exprs)?;
+    let partition_by_reqs = indices
+        .iter()
+        .map(|&idx| PhysicalSortRequirement {
             expr: Arc::clone(&partitionby_exprs[idx]),
             options: None,
-        }));
+        })
+        .collect::<Vec<_>>();
     // Treat partition by exprs as constant. During analysis of requirements are satisfied.
-    let const_exprs = partitionby_exprs.iter().map(ConstExpr::from);
-    let partition_by_eqs = input_eqs.with_constants(const_exprs);
-    let order_by_reqs = LexRequirement::from(orderby_keys.clone());
-    let reverse_order_by_reqs = LexRequirement::from(reverse_order_bys(orderby_keys));
-    for (should_swap, order_by_reqs) in
-        [(false, order_by_reqs), (true, reverse_order_by_reqs)]
+    let const_exprs = partitionby_exprs.iter().cloned().map(ConstExpr::from);
+    input_eqs.add_constants(const_exprs)?;
+    let reverse_orderby_keys =
+        orderby_keys.iter().map(|e| e.reverse()).collect::<Vec<_>>();
+    for (should_swap, orderbys) in
+        [(false, orderby_keys), (true, reverse_orderby_keys.as_ref())]
     {
-        let req = LexRequirement::new(
-            [partition_by_reqs.inner.clone(), order_by_reqs.inner].concat(),
-        )
-        .collapse();
-        if partition_by_eqs.ordering_satisfy_requirement(&req) {
+        let mut req = partition_by_reqs.clone();
+        req.extend(orderbys.iter().cloned().map(Into::into));
+        if req.is_empty() || input_eqs.ordering_satisfy_requirement(req)? {
             // Window can be run with existing ordering
             let mode = if indices.len() == partitionby_exprs.len() {
                 InputOrderMode::Sorted
@@ -617,17 +698,51 @@ pub fn get_window_mode(
             } else {
                 InputOrderMode::PartiallySorted(indices)
             };
-            return Some((should_swap, mode));
+            return Ok(Some((should_swap, mode)));
         }
     }
-    None
+    Ok(None)
 }
 
-fn sort_options_resolving_constant(expr: Arc<dyn PhysicalExpr>) -> Vec<PhysicalSortExpr> {
-    vec![
-        PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)),
-        PhysicalSortExpr::new(expr, SortOptions::new(true, true)),
-    ]
+/// Generates sort option variations for a given expression.
+///
+/// This function is used to handle constant columns in window operations. Since constant
+/// columns can be considered as having any ordering, we generate multiple sort options
+/// to explore different ordering possibilities.
+///
+/// # Parameters
+/// - `expr`: The physical expression to generate sort options for
+/// - `only_monotonic`: If false, generates all 4 possible sort options (ASC/DESC × NULLS FIRST/LAST).
+///   If true, generates only 2 options that preserve set monotonicity.
+///
+/// # When to use `only_monotonic = false`:
+/// Use for PARTITION BY columns where we want to explore all possible orderings to find
+/// one that matches the existing data ordering.
+///
+/// # When to use `only_monotonic = true`:
+/// Use for aggregate/window function arguments where set monotonicity needs to be preserved.
+/// Only generates ASC NULLS LAST and DESC NULLS FIRST because:
+/// - Set monotonicity is broken if data has increasing order but nulls come first
+/// - Set monotonicity is broken if data has decreasing order but nulls come last
+fn sort_options_resolving_constant(
+    expr: Arc<dyn PhysicalExpr>,
+    only_monotonic: bool,
+) -> Vec<PhysicalSortExpr> {
+    if only_monotonic {
+        // Generate only the 2 options that preserve set monotonicity
+        vec![
+            PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), // ASC NULLS LAST
+            PhysicalSortExpr::new(expr, SortOptions::new(true, true)), // DESC NULLS FIRST
+        ]
+    } else {
+        // Generate all 4 possible sort options for partition columns
+        vec![
+            PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, false)), // ASC NULLS LAST
+            PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(false, true)), // ASC NULLS FIRST
+            PhysicalSortExpr::new(Arc::clone(&expr), SortOptions::new(true, false)), // DESC NULLS LAST
+            PhysicalSortExpr::new(expr, SortOptions::new(true, true)), // DESC NULLS FIRST
+        ]
+    }
 }
 
 #[cfg(test)]
@@ -637,15 +752,15 @@ mod tests {
     use crate::expressions::col;
     use crate::streaming::StreamingTableExec;
     use crate::test::assert_is_pending;
-    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
+    use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero};
 
+    use InputOrderMode::{Linear, PartiallySorted, Sorted};
     use arrow::compute::SortOptions;
     use arrow_schema::{DataType, Field};
     use datafusion_execution::TaskContext;
-
     use datafusion_functions_aggregate::count::count_udaf;
+
     use futures::FutureExt;
-    use InputOrderMode::{Linear, PartiallySorted, Sorted};
 
     fn create_test_schema() -> Result<SchemaRef> {
         let nullable_column = Field::new("nullable_col", DataType::Int32, true);
@@ -696,16 +811,14 @@ mod tests {
     /// Created a sorted Streaming Table exec
     pub fn streaming_table_exec(
         schema: &SchemaRef,
-        sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
+        ordering: LexOrdering,
         infinite_source: bool,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let sort_exprs = sort_exprs.into_iter().collect();
-
         Ok(Arc::new(StreamingTableExec::try_new(
             Arc::clone(schema),
             vec![],
             None,
-            Some(sort_exprs),
+            Some(ordering),
             infinite_source,
             None,
         )?))
@@ -719,25 +832,38 @@ mod tests {
             (
                 vec!["a"],
                 vec![("b", true, true)],
-                vec![("a", None), ("b", Some((true, true)))],
+                vec![
+                    vec![("a", None), ("b", Some((true, true)))],
+                    vec![("b", Some((true, true)))],
+                ],
             ),
             // PARTITION BY a, ORDER BY a ASC NULLS FIRST
-            (vec!["a"], vec![("a", true, true)], vec![("a", None)]),
+            (
+                vec!["a"],
+                vec![("a", true, true)],
+                vec![vec![("a", None)], vec![("a", Some((true, true)))]],
+            ),
             // PARTITION BY a, ORDER BY b ASC NULLS FIRST, c DESC NULLS LAST
             (
                 vec!["a"],
                 vec![("b", true, true), ("c", false, false)],
                 vec![
-                    ("a", None),
-                    ("b", Some((true, true))),
-                    ("c", Some((false, false))),
+                    vec![
+                        ("a", None),
+                        ("b", Some((true, true))),
+                        ("c", Some((false, false))),
+                    ],
+                    vec![("b", Some((true, true))), ("c", Some((false, false)))],
                 ],
             ),
             // PARTITION BY a, c, ORDER BY b ASC NULLS FIRST, c DESC NULLS LAST
             (
                 vec!["a", "c"],
                 vec![("b", true, true), ("c", false, false)],
-                vec![("a", None), ("c", None), ("b", Some((true, true)))],
+                vec![
+                    vec![("a", None), ("c", None), ("b", Some((true, true)))],
+                    vec![("b", Some((true, true))), ("c", Some((false, false)))],
+                ],
             ),
         ];
         for (pb_params, ob_params, expected_params) in test_data {
@@ -749,25 +875,26 @@ mod tests {
             let mut orderbys = vec![];
             for (col_name, descending, nulls_first) in ob_params {
                 let expr = col(col_name, &schema)?;
-                let options = SortOptions {
-                    descending,
-                    nulls_first,
-                };
-                orderbys.push(PhysicalSortExpr { expr, options });
+                let options = SortOptions::new(descending, nulls_first);
+                orderbys.push(PhysicalSortExpr::new(expr, options));
             }
 
-            let mut expected: Option<LexRequirement> = None;
-            for (col_name, reqs) in expected_params {
-                let options = reqs.map(|(descending, nulls_first)| SortOptions {
-                    descending,
-                    nulls_first,
-                });
-                let expr = col(col_name, &schema)?;
-                let res = PhysicalSortRequirement::new(expr, options);
-                if let Some(expected) = &mut expected {
-                    expected.push(res);
-                } else {
-                    expected = Some(LexRequirement::new(vec![res]));
+            let mut expected: Option<OrderingRequirements> = None;
+            for expected_param in expected_params.clone() {
+                let mut requirements = vec![];
+                for (col_name, reqs) in expected_param {
+                    let options = reqs.map(|(descending, nulls_first)| {
+                        SortOptions::new(descending, nulls_first)
+                    });
+                    let expr = col(col_name, &schema)?;
+                    requirements.push(PhysicalSortRequirement::new(expr, options));
+                }
+                if let Some(requirements) = LexRequirement::new(requirements) {
+                    if let Some(alts) = expected.as_mut() {
+                        alts.add_alternative(requirements);
+                    } else {
+                        expected = Some(OrderingRequirements::new(requirements));
+                    }
                 }
             }
             assert_eq!(calc_requirements(partitionbys, orderbys), expected);
@@ -789,10 +916,12 @@ mod tests {
                 "count".to_owned(),
                 &[col("a", &schema)?],
                 &[],
-                &LexOrdering::default(),
+                &[],
                 Arc::new(WindowFrame::new(None)),
-                schema.as_ref(),
+                schema,
+                false,
                 false,
+                None,
             )?],
             blocking_exec,
             false,
@@ -893,13 +1022,14 @@ mod tests {
         // Columns a,c are nullable whereas b,d are not nullable.
         // Source is sorted by a ASC NULLS FIRST, b ASC NULLS FIRST, c ASC NULLS FIRST, d ASC NULLS FIRST
         // Column e is not ordered.
-        let sort_exprs = vec![
+        let ordering = [
             sort_expr("a", &test_schema),
             sort_expr("b", &test_schema),
             sort_expr("c", &test_schema),
             sort_expr("d", &test_schema),
-        ];
-        let exec_unbounded = streaming_table_exec(&test_schema, sort_exprs, true)?;
+        ]
+        .into();
+        let exec_unbounded = streaming_table_exec(&test_schema, ordering, true)?;
 
         // test cases consists of vector of tuples. Where each tuple represents a single test case.
         // First field in the tuple is Vec<str> where each element in the vector represents PARTITION BY columns
@@ -986,7 +1116,7 @@ mod tests {
                 partition_by_exprs.push(col(col_name, &test_schema)?);
             }
 
-            let mut order_by_exprs = LexOrdering::default();
+            let mut order_by_exprs = vec![];
             for col_name in order_by_params {
                 let expr = col(col_name, &test_schema)?;
                 // Give default ordering, this is same with input ordering direction
@@ -994,11 +1124,8 @@ mod tests {
                 let options = SortOptions::default();
                 order_by_exprs.push(PhysicalSortExpr { expr, options });
             }
-            let res = get_window_mode(
-                &partition_by_exprs,
-                order_by_exprs.as_ref(),
-                &exec_unbounded,
-            );
+            let res =
+                get_window_mode(&partition_by_exprs, &order_by_exprs, &exec_unbounded)?;
             // Since reversibility is not important in this test. Convert Option<(bool, InputOrderMode)> to Option<InputOrderMode>
             let res = res.map(|(_, mode)| mode);
             assert_eq!(
@@ -1016,13 +1143,14 @@ mod tests {
         // Columns a,c are nullable whereas b,d are not nullable.
         // Source is sorted by a ASC NULLS FIRST, b ASC NULLS FIRST, c ASC NULLS FIRST, d ASC NULLS FIRST
         // Column e is not ordered.
-        let sort_exprs = vec![
+        let ordering = [
             sort_expr("a", &test_schema),
             sort_expr("b", &test_schema),
             sort_expr("c", &test_schema),
             sort_expr("d", &test_schema),
-        ];
-        let exec_unbounded = streaming_table_exec(&test_schema, sort_exprs, true)?;
+        ]
+        .into();
+        let exec_unbounded = streaming_table_exec(&test_schema, ordering, true)?;
 
         // test cases consists of vector of tuples. Where each tuple represents a single test case.
         // First field in the tuple is Vec<str> where each element in the vector represents PARTITION BY columns
@@ -1151,7 +1279,7 @@ mod tests {
                 partition_by_exprs.push(col(col_name, &test_schema)?);
             }
 
-            let mut order_by_exprs = LexOrdering::default();
+            let mut order_by_exprs = vec![];
             for (col_name, descending, nulls_first) in order_by_params {
                 let expr = col(col_name, &test_schema)?;
                 let options = SortOptions {
@@ -1162,7 +1290,7 @@ mod tests {
             }
 
             assert_eq!(
-                get_window_mode(&partition_by_exprs, order_by_exprs.as_ref(), &exec_unbounded),
+                get_window_mode(&partition_by_exprs, &order_by_exprs, &exec_unbounded)?,
                 *expected,
                 "Unexpected result for in unbounded test case#: {case_idx:?}, case: {test_case:?}"
             );
diff --git a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs
index 4c76e22308759..c9958c875c6b6 100644
--- a/datafusion/physical-plan/src/windows/window_agg_exec.rs
+++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs
@@ -23,7 +23,7 @@ use std::sync::Arc;
 use std::task::{Context, Poll};
 
 use super::utils::create_schema;
-use crate::execution_plan::EmissionType;
+use crate::execution_plan::{CardinalityEffect, EmissionType};
 use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use crate::windows::{
     calc_requirements, get_ordered_partition_by_indices, get_partition_by_sort_exprs,
@@ -32,7 +32,7 @@ use crate::windows::{
 use crate::{
     ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
     ExecutionPlanProperties, PhysicalExpr, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics, WindowExpr,
+    SendableRecordBatchStream, Statistics, WindowExpr, check_if_same_properties,
 };
 
 use arrow::array::ArrayRef;
@@ -41,12 +41,15 @@ use arrow::datatypes::SchemaRef;
 use arrow::error::ArrowError;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::stats::Precision;
+use datafusion_common::tree_node::TreeNodeRecursion;
 use datafusion_common::utils::{evaluate_partition_ranges, transpose};
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{Result, assert_eq_or_internal_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement};
+use datafusion_physical_expr_common::sort_expr::{
+    OrderingRequirements, PhysicalSortExpr,
+};
 
-use futures::{ready, Stream, StreamExt};
+use futures::{Stream, StreamExt, ready};
 
 /// Window execution plan
 #[derive(Debug, Clone)]
@@ -63,7 +66,7 @@ pub struct WindowAggExec {
     // see `get_ordered_partition_by_indices` for more details.
     ordered_partition_by_indices: Vec<usize>,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
     /// If `can_partition` is false, partition_keys is always empty.
     can_repartition: bool,
 }
@@ -79,15 +82,15 @@ impl WindowAggExec {
         let schema = Arc::new(schema);
 
         let ordered_partition_by_indices =
-            get_ordered_partition_by_indices(window_expr[0].partition_by(), &input);
-        let cache = Self::compute_properties(Arc::clone(&schema), &input, &window_expr);
+            get_ordered_partition_by_indices(window_expr[0].partition_by(), &input)?;
+        let cache = Self::compute_properties(&schema, &input, &window_expr)?;
         Ok(Self {
             input,
             window_expr,
             schema,
             metrics: ExecutionPlanMetricsSet::new(),
             ordered_partition_by_indices,
-            cache,
+            cache: Arc::new(cache),
             can_repartition,
         })
     }
@@ -107,7 +110,7 @@ impl WindowAggExec {
     // We are sure that partition by columns are always at the beginning of sort_keys
     // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely
     // to calculate partition separation points
-    pub fn partition_by_sort_keys(&self) -> Result<LexOrdering> {
+    pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> {
         let partition_by = self.window_expr()[0].partition_by();
         get_partition_by_sort_exprs(
             &self.input,
@@ -118,12 +121,12 @@ impl WindowAggExec {
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
-        schema: SchemaRef,
+        schema: &SchemaRef,
         input: &Arc<dyn ExecutionPlan>,
         window_exprs: &[Arc<dyn WindowExpr>],
-    ) -> PlanProperties {
+    ) -> Result<PlanProperties> {
         // Calculate equivalence properties:
-        let eq_properties = window_equivalence_properties(&schema, input, window_exprs);
+        let eq_properties = window_equivalence_properties(schema, input, window_exprs)?;
 
         // Get output partitioning:
         // Because we can have repartitioning using the partition keys this
@@ -131,13 +134,13 @@ impl WindowAggExec {
         let output_partitioning = input.output_partitioning().clone();
 
         // Construct properties cache:
-        PlanProperties::new(
+        Ok(PlanProperties::new(
             eq_properties,
             output_partitioning,
             // TODO: Emission type and boundedness information can be enhanced here
             EmissionType::Final,
             input.boundedness(),
-        )
+        ))
     }
 
     pub fn partition_keys(&self) -> Vec<Arc<dyn PhysicalExpr>> {
@@ -157,22 +160,15 @@ impl WindowAggExec {
         }
     }
 
-    fn statistics_inner(&self) -> Result<Statistics> {
-        let input_stat = self.input.partition_statistics(None)?;
-        let win_cols = self.window_expr.len();
-        let input_cols = self.input.schema().fields().len();
-        // TODO stats: some windowing function will maintain invariants such as min, max...
-        let mut column_statistics = Vec::with_capacity(win_cols + input_cols);
-        // copy stats of the input to the beginning of the schema.
-        column_statistics.extend(input_stat.column_statistics);
-        for _ in 0..win_cols {
-            column_statistics.push(ColumnStatistics::new_unknown())
+    fn with_new_children_and_same_properties(
+        &self,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Self {
+        Self {
+            input: children.swap_remove(0),
+            metrics: ExecutionPlanMetricsSet::new(),
+            ..Self::clone(self)
         }
-        Ok(Statistics {
-            num_rows: input_stat.num_rows,
-            column_statistics,
-            total_byte_size: Precision::Absent,
-        })
     }
 }
 
@@ -222,7 +218,7 @@ impl ExecutionPlan for WindowAggExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -230,21 +226,34 @@ impl ExecutionPlan for WindowAggExec {
         vec![&self.input]
     }
 
+    fn apply_expressions(
+        &self,
+        f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        let mut tnr = TreeNodeRecursion::Continue;
+        for window_expr in &self.window_expr {
+            for expr in window_expr.expressions() {
+                tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+            }
+        }
+        Ok(tnr)
+    }
+
     fn maintains_input_order(&self) -> Vec<bool> {
         vec![true]
     }
 
-    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+    fn required_input_ordering(&self) -> Vec<Option<OrderingRequirements>> {
         let partition_bys = self.window_expr()[0].partition_by();
         let order_keys = self.window_expr()[0].order_by();
         if self.ordered_partition_by_indices.len() < partition_bys.len() {
-            vec![calc_requirements(partition_bys, order_keys.iter())]
+            vec![calc_requirements(partition_bys, order_keys)]
         } else {
             let partition_bys = self
                 .ordered_partition_by_indices
                 .iter()
                 .map(|idx| &partition_bys[*idx]);
-            vec![calc_requirements(partition_bys, order_keys.iter())]
+            vec![calc_requirements(partition_bys, order_keys)]
         }
     }
 
@@ -258,11 +267,12 @@ impl ExecutionPlan for WindowAggExec {
 
     fn with_new_children(
         self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
+        mut children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        check_if_same_properties!(self, children);
         Ok(Arc::new(WindowAggExec::try_new(
             self.window_expr.clone(),
-            Arc::clone(&children[0]),
+            children.swap_remove(0),
             true,
         )?))
     }
@@ -288,16 +298,27 @@ impl ExecutionPlan for WindowAggExec {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        self.statistics_inner()
+    fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+        let input_stat =
+            Arc::unwrap_or_clone(self.input.partition_statistics(partition)?);
+        let win_cols = self.window_expr.len();
+        let input_cols = self.input.schema().fields().len();
+        // TODO stats: some windowing function will maintain invariants such as min, max...
+        let mut column_statistics = Vec::with_capacity(win_cols + input_cols);
+        // copy stats of the input to the beginning of the schema.
+        column_statistics.extend(input_stat.column_statistics);
+        for _ in 0..win_cols {
+            column_statistics.push(ColumnStatistics::new_unknown())
+        }
+        Ok(Arc::new(Statistics {
+            num_rows: input_stat.num_rows,
+            column_statistics,
+            total_byte_size: Precision::Absent,
+        }))
     }
 
-    fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
-        if partition.is_none() {
-            self.statistics_inner()
-        } else {
-            Ok(Statistics::new_unknown(&self.schema()))
-        }
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        CardinalityEffect::Equal
     }
 }
 
@@ -319,7 +340,7 @@ pub struct WindowAggStream {
     batches: Vec<RecordBatch>,
     finished: bool,
     window_expr: Vec<Arc<dyn WindowExpr>>,
-    partition_by_sort_keys: LexOrdering,
+    partition_by_sort_keys: Vec<PhysicalSortExpr>,
     baseline_metrics: BaselineMetrics,
     ordered_partition_by_indices: Vec<usize>,
 }
@@ -331,13 +352,15 @@ impl WindowAggStream {
         window_expr: Vec<Arc<dyn WindowExpr>>,
         input: SendableRecordBatchStream,
         baseline_metrics: BaselineMetrics,
-        partition_by_sort_keys: LexOrdering,
+        partition_by_sort_keys: Vec<PhysicalSortExpr>,
         ordered_partition_by_indices: Vec<usize>,
     ) -> Result<Self> {
         // In WindowAggExec all partition by columns should be ordered.
-        if window_expr[0].partition_by().len() != ordered_partition_by_indices.len() {
-            return internal_err!("All partition by columns should have an ordering");
-        }
+        assert_eq_or_internal_err!(
+            window_expr[0].partition_by().len(),
+            ordered_partition_by_indices.len(),
+            "All partition by columns should have an ordering"
+        );
         Ok(Self {
             schema,
             input,
@@ -446,3 +469,47 @@ impl RecordBatchStream for WindowAggStream {
         Arc::clone(&self.schema)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test::TestMemoryExec;
+    use crate::windows::create_window_expr;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{
+        WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
+    };
+    use datafusion_functions_aggregate::count::count_udaf;
+
+    #[test]
+    fn test_window_agg_cardinality_effect() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
+        let input: Arc<dyn ExecutionPlan> =
+            Arc::new(TestMemoryExec::try_new(&[], Arc::clone(&schema), None)?);
+        let args = vec![crate::expressions::col("a", &schema)?];
+        let window_expr = create_window_expr(
+            &WindowFunctionDefinition::AggregateUDF(count_udaf()),
+            "count(a)".to_string(),
+            &args,
+            &[],
+            &[],
+            Arc::new(WindowFrame::new_bounds(
+                WindowFrameUnits::Rows,
+                WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
+                WindowFrameBound::CurrentRow,
+            )),
+            Arc::clone(&schema),
+            false,
+            false,
+            None,
+        )?;
+
+        let window = WindowAggExec::try_new(vec![window_expr], input, true)?;
+        assert!(matches!(
+            window.cardinality_effect(),
+            CardinalityEffect::Equal
+        ));
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs
index eea1b99586330..c2ef6bf071c43 100644
--- a/datafusion/physical-plan/src/work_table.rs
+++ b/datafusion/physical-plan/src/work_table.rs
@@ -20,26 +20,27 @@
 use std::any::Any;
 use std::sync::{Arc, Mutex};
 
-use crate::execution_plan::{Boundedness, EmissionType};
+use crate::coop::cooperative;
+use crate::execution_plan::{Boundedness, EmissionType, SchedulingType};
 use crate::memory::MemoryStream;
+use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
 use crate::{
-    metrics::{ExecutionPlanMetricsSet, MetricsSet},
+    DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
     SendableRecordBatchStream, Statistics,
 };
-use crate::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties};
 
 use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{internal_datafusion_err, internal_err, Result};
-use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_common::tree_node::TreeNodeRecursion;
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_datafusion_err};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
+use datafusion_execution::memory_pool::MemoryReservation;
+use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr};
 
 /// A vector of record batches with a memory reservation.
 #[derive(Debug)]
 pub(super) struct ReservedBatches {
     batches: Vec<RecordBatch>,
-    #[allow(dead_code)]
     reservation: MemoryReservation,
 }
 
@@ -56,15 +57,17 @@ impl ReservedBatches {
 /// See <https://wiki.postgresql.org/wiki/CTEReadme#How_Recursion_Works>
 /// This table serves as a mirror or buffer between each iteration of a recursive query.
 #[derive(Debug)]
-pub(super) struct WorkTable {
+pub struct WorkTable {
     batches: Mutex<Option<ReservedBatches>>,
+    name: String,
 }
 
 impl WorkTable {
     /// Create a new work table.
-    pub(super) fn new() -> Self {
+    pub(super) fn new(name: String) -> Self {
         Self {
             batches: Mutex::new(None),
+            name,
         }
     }
 
@@ -100,25 +103,35 @@ pub struct WorkTableExec {
     name: String,
     /// The schema of the stream
     schema: SchemaRef,
+    /// Projection to apply to build the output stream from the recursion state
+    projection: Option<Vec<usize>>,
     /// The work table
     work_table: Arc<WorkTable>,
     /// Execution metrics
     metrics: ExecutionPlanMetricsSet,
     /// Cache holding plan properties like equivalences, output partitioning etc.
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
 }
 
 impl WorkTableExec {
     /// Create a new execution plan for a worktable exec.
-    pub fn new(name: String, schema: SchemaRef) -> Self {
+    pub fn new(
+        name: String,
+        mut schema: SchemaRef,
+        projection: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        if let Some(projection) = &projection {
+            schema = Arc::new(schema.project(projection)?);
+        }
         let cache = Self::compute_properties(Arc::clone(&schema));
-        Self {
-            name,
+        Ok(Self {
+            name: name.clone(),
             schema,
+            projection,
+            work_table: Arc::new(WorkTable::new(name)),
             metrics: ExecutionPlanMetricsSet::new(),
-            work_table: Arc::new(WorkTable::new()),
-            cache,
-        }
+            cache: Arc::new(cache),
+        })
     }
 
     /// Ref to name
@@ -131,16 +144,6 @@ impl WorkTableExec {
         Arc::clone(&self.schema)
     }
 
-    pub(super) fn with_work_table(&self, work_table: Arc<WorkTable>) -> Self {
-        Self {
-            name: self.name.clone(),
-            schema: Arc::clone(&self.schema),
-            metrics: ExecutionPlanMetricsSet::new(),
-            work_table,
-            cache: self.cache.clone(),
-        }
-    }
-
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(schema: SchemaRef) -> PlanProperties {
         PlanProperties::new(
@@ -149,6 +152,7 @@ impl WorkTableExec {
             EmissionType::Incremental,
             Boundedness::Bounded,
         )
+        .with_scheduling_type(SchedulingType::Cooperative)
     }
 }
 
@@ -178,7 +182,7 @@ impl ExecutionPlan for WorkTableExec {
         self
     }
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         &self.cache
     }
 
@@ -186,12 +190,11 @@ impl ExecutionPlan for WorkTableExec {
         vec![]
     }
 
-    fn maintains_input_order(&self) -> Vec<bool> {
-        vec![false]
-    }
-
-    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
-        vec![false]
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
     }
 
     fn with_new_children(
@@ -208,45 +211,83 @@ impl ExecutionPlan for WorkTableExec {
         _context: Arc<TaskContext>,
     ) -> Result<SendableRecordBatchStream> {
         // WorkTable streams must be the plan base.
-        if partition != 0 {
-            return internal_err!(
-                "WorkTableExec got an invalid partition {partition} (expected 0)"
-            );
+        assert_eq_or_internal_err!(
+            partition,
+            0,
+            "WorkTableExec got an invalid partition {partition} (expected 0)"
+        );
+        let ReservedBatches {
+            mut batches,
+            reservation,
+        } = self.work_table.take()?;
+        if let Some(projection) = &self.projection {
+            // We apply the projection
+            // TODO: it would be better to apply it as soon as possible and not only here
+            // TODO: an aggressive projection makes the memory reservation smaller, even if we do not edit it
+            batches = batches
+                .into_iter()
+                .map(|b| b.project(projection))
+                .collect::<Result<Vec<_>, _>>()?;
         }
-        let batch = self.work_table.take()?;
-        Ok(Box::pin(
-            MemoryStream::try_new(batch.batches, Arc::clone(&self.schema), None)?
-                .with_reservation(batch.reservation),
-        ))
+
+        let stream = MemoryStream::try_new(batches, Arc::clone(&self.schema), None)?
+            .with_reservation(reservation);
+        Ok(Box::pin(cooperative(stream)))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
         Some(self.metrics.clone_inner())
     }
 
-    fn statistics(&self) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&self.schema()))
+    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Arc<Statistics>> {
+        Ok(Arc::new(Statistics::new_unknown(&self.schema())))
     }
 
-    fn partition_statistics(&self, _partition: Option<usize>) -> Result<Statistics> {
-        Ok(Statistics::new_unknown(&self.schema()))
+    /// Injects run-time state into this `WorkTableExec`.
+    ///
+    /// The only state this node currently understands is an [`Arc<WorkTable>`].
+    /// If `state` can be down-cast to that type, a new `WorkTableExec` backed
+    /// by the provided work table is returned.  Otherwise `None` is returned
+    /// so that callers can attempt to propagate the state further down the
+    /// execution plan tree.
+    fn with_new_state(
+        &self,
+        state: Arc<dyn Any + Send + Sync>,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        // Down-cast to the expected state type; propagate `None` on failure
+        let work_table = state.downcast::<WorkTable>().ok()?;
+
+        if work_table.name != self.name {
+            return None; // Different table
+        }
+
+        Some(Arc::new(Self {
+            name: self.name.clone(),
+            schema: Arc::clone(&self.schema),
+            projection: self.projection.clone(),
+            metrics: ExecutionPlanMetricsSet::new(),
+            work_table,
+            cache: Arc::clone(&self.cache),
+        }))
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::{ArrayRef, Int32Array};
+    use arrow::array::{ArrayRef, Int16Array, Int32Array, Int64Array};
+    use arrow_schema::{DataType, Field, Schema};
     use datafusion_execution::memory_pool::{MemoryConsumer, UnboundedMemoryPool};
+    use futures::StreamExt;
 
     #[test]
     fn test_work_table() {
-        let work_table = WorkTable::new();
+        let work_table = WorkTable::new("test".into());
         // Can't take from empty work_table
         assert!(work_table.take().is_err());
 
         let pool = Arc::new(UnboundedMemoryPool::default()) as _;
-        let mut reservation = MemoryConsumer::new("test_work_table").register(&pool);
+        let reservation = MemoryConsumer::new("test_work_table").register(&pool);
 
         // Update batch to work_table
         let array: ArrayRef = Arc::new((0..5).collect::<Int32Array>());
@@ -270,4 +311,53 @@ mod tests {
         drop(memory_stream);
         assert_eq!(pool.reserved(), 0);
     }
+
+    #[tokio::test]
+    async fn test_work_table_exec() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int64, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int16, false),
+        ]));
+        let work_table_exec =
+            WorkTableExec::new("wt".into(), Arc::clone(&schema), Some(vec![2, 1]))
+                .unwrap();
+
+        // We inject the work table
+        let work_table = Arc::new(WorkTable::new("wt".into()));
+        let work_table_exec = work_table_exec
+            .with_new_state(Arc::clone(&work_table) as _)
+            .unwrap();
+
+        // We update the work table
+        let pool = Arc::new(UnboundedMemoryPool::default()) as _;
+        let reservation = MemoryConsumer::new("test_work_table").register(&pool);
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5])),
+                Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
+                Arc::new(Int16Array::from(vec![1, 2, 3, 4, 5])),
+            ],
+        )
+        .unwrap();
+        work_table.update(ReservedBatches::new(vec![batch], reservation));
+
+        // We get back the batch from the work table
+        let returned_batch = work_table_exec
+            .execute(0, Arc::new(TaskContext::default()))
+            .unwrap()
+            .next()
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(
+            returned_batch,
+            RecordBatch::try_from_iter(vec![
+                ("c", Arc::new(Int16Array::from(vec![1, 2, 3, 4, 5])) as _),
+                ("b", Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _),
+            ])
+            .unwrap()
+        );
+    }
 }
diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml
index 957cbc253616b..46dae36ba40ed 100644
--- a/datafusion/proto-common/Cargo.toml
+++ b/datafusion/proto-common/Cargo.toml
@@ -19,18 +19,15 @@
 name = "datafusion-proto-common"
 description = "Protobuf serialization of DataFusion common types"
 keywords = ["arrow", "query", "sql"]
+readme = "README.md"
 version = { workspace = true }
 edition = { workspace = true }
-readme = { workspace = true }
 homepage = { workspace = true }
 repository = { workspace = true }
 license = { workspace = true }
 authors = { workspace = true }
 rust-version = { workspace = true }
 
-# Exclude proto files so crates.io consumers don't need protoc
-exclude = ["*.proto"]
-
 [package.metadata.docs.rs]
 all-features = true
 
@@ -39,7 +36,7 @@ name = "datafusion_proto_common"
 
 [features]
 default = []
-json = ["serde", "serde_json", "pbjson"]
+json = ["serde", "pbjson"]
 
 [dependencies]
 arrow = { workspace = true }
@@ -47,7 +44,6 @@ datafusion-common = { workspace = true }
 pbjson = { workspace = true, optional = true }
 prost = { workspace = true }
 serde = { version = "1.0", optional = true }
-serde_json = { workspace = true, optional = true }
 
 [dev-dependencies]
 doc-comment = { workspace = true }
diff --git a/datafusion/proto-common/README.md b/datafusion/proto-common/README.md
index c8b46424f701e..9c4aa707b0ea6 100644
--- a/datafusion/proto-common/README.md
+++ b/datafusion/proto-common/README.md
@@ -17,12 +17,21 @@
   under the License.
 -->
 
-# `datafusion-proto-common`: Apache DataFusion Protobuf Serialization / Deserialization
+# Apache DataFusion Protobuf Common Serialization / Deserialization
 
-This crate contains code to convert Apache [DataFusion] primitive types to and from
-bytes, which can be useful for sending data over the network.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate contains code to convert DataFusion primitive types to and from
+bytes using [Protocol Buffers], which can be useful for sending data over the network.
 
 See [API Docs] for details and examples.
 
-[datafusion]: https://datafusion.apache.org
+Most projects should use the [`datafusion-proto`] crate directly, which re-exports
+this module. If you are already using the [`datafusion-proto`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[protocol buffers]: https://protobuf.dev/
+[`datafusion-proto`]: https://crates.io/crates/datafusion-proto
 [api docs]: http://docs.rs/datafusion-proto/latest
diff --git a/datafusion/proto-common/gen/Cargo.toml b/datafusion/proto-common/gen/Cargo.toml
index cfd3368b0c5ee..f0e60819d42a8 100644
--- a/datafusion/proto-common/gen/Cargo.toml
+++ b/datafusion/proto-common/gen/Cargo.toml
@@ -29,10 +29,13 @@ publish = false
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [dependencies]
 # Pin these dependencies so that the generated output is deterministic
-pbjson-build = "=0.7.0"
-prost-build = "=0.13.5"
+pbjson-build = "=0.9.0"
+prost-build = "=0.14.3"
diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto
index 82f1e91d9c9b4..62c6bbe85612a 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -55,6 +55,8 @@ message NdJsonFormat {
   JsonOptions options = 1;
 }
 
+message ArrowFormat {}
+
 
 message PrimaryKeyConstraint{
   repeated uint64 indices = 1;
@@ -85,6 +87,7 @@ enum JoinType {
   RIGHTSEMI = 6;
   RIGHTANTI = 7;
   LEFTMARK = 8;
+  RIGHTMARK = 9;
 }
 
 enum JoinConstraint {
@@ -92,6 +95,11 @@ enum JoinConstraint {
   USING = 1;
 }
 
+enum NullEquality {
+  NULL_EQUALS_NOTHING = 0;
+  NULL_EQUALS_NULL = 1;
+}
+
 message AvroOptions {}
 message ArrowOptions {}
 
@@ -108,7 +116,6 @@ message Field {
   // for complex data types like structs, unions
   repeated Field children = 4;
   map<string, string> metadata = 5;
-  bool dict_ordered = 6;
 }
 
 message Timestamp{
@@ -129,7 +136,19 @@ enum IntervalUnit{
   MonthDayNano = 2;
 }
 
-message Decimal{
+message Decimal32Type {
+  reserved 1, 2;
+  uint32 precision = 3;
+  int32 scale = 4;
+}
+
+message Decimal64Type {
+  reserved 1, 2;
+  uint32 precision = 3;
+  int32 scale = 4;
+}
+
+message Decimal128Type {
   reserved 1, 2;
   uint32 precision = 3;
   int32 scale = 4;
@@ -164,6 +183,11 @@ message Map {
   bool keys_sorted = 2;
 }
 
+message RunEndEncoded {
+  Field run_ends_field = 1;
+  Field values_field = 2;
+}
+
 enum UnionMode{
   sparse = 0;
   dense = 1;
@@ -217,6 +241,12 @@ message ScalarDictionaryValue {
   ScalarValue value = 2;
 }
 
+message ScalarRunEndEncodedValue {
+  Field run_ends_field = 1;
+  Field values_field = 2;
+  ScalarValue value = 3;
+}
+
 message IntervalDayTimeValue {
   int32 days = 1;
   int32 milliseconds = 2;
@@ -279,6 +309,8 @@ message ScalarValue{
     ScalarNestedValue struct_value = 32;
     ScalarNestedValue map_value = 41;
 
+    Decimal32 decimal32_value = 43;
+    Decimal64 decimal64_value = 44;
     Decimal128 decimal128_value = 20;
     Decimal256 decimal256_value = 39;
 
@@ -300,9 +332,23 @@ message ScalarValue{
     IntervalMonthDayNanoValue interval_month_day_nano = 31;
     ScalarFixedSizeBinary fixed_size_binary_value = 34;
     UnionValue union_value = 42;
+
+    ScalarRunEndEncodedValue run_end_encoded_value = 45;
   }
 }
 
+message Decimal32{
+  bytes value = 1;
+  int64 p = 2;
+  int64 s = 3;
+}
+
+message Decimal64{
+  bytes value = 1;
+  int64 p = 2;
+  int64 s = 3;
+}
+
 message Decimal128{
   bytes value = 1;
   int64 p = 2;
@@ -345,7 +391,9 @@ message ArrowType{
     TimeUnit TIME32 = 21 ;
     TimeUnit TIME64 = 22 ;
     IntervalUnit INTERVAL = 23 ;
-    Decimal DECIMAL = 24 ;
+    Decimal32Type DECIMAL32 = 40;
+    Decimal64Type DECIMAL64 = 41;
+    Decimal128Type DECIMAL128 = 24;
     Decimal256Type DECIMAL256 = 36;
     List LIST = 25;
     List LARGE_LIST = 26;
@@ -354,6 +402,7 @@ message ArrowType{
     Union UNION = 29;
     Dictionary DICTIONARY = 30;
     Map MAP = 33;
+    RunEndEncoded RUN_END_ENCODED = 42;
   }
 }
 
@@ -425,12 +474,16 @@ message CsvOptions {
   bytes double_quote = 15; // Indicates if quotes are doubled
   bytes newlines_in_values = 16; // Indicates if newlines are supported in values
   bytes terminator = 17; // Optional terminator character as a byte
+  bytes truncated_rows = 18; // Indicates if truncated rows are allowed
+  optional uint32 compression_level = 19; // Optional compression level
 }
 
 // Options controlling CSV format
 message JsonOptions {
   CompressionTypeVariant compression = 1; // Compression type
   optional uint64 schema_infer_max_rec = 2; // Optional max records for schema inference
+  optional uint32 compression_level = 3; // Optional compression level
+  optional bool newline_delimited = 4; // Whether to read as newline-delimited JSON (default true). When false, expects JSON array format [{},...]
 }
 
 message TableParquetOptions {
@@ -473,9 +526,7 @@ message ParquetColumnOptions {
     uint64 bloom_filter_ndv = 7;
   }
 
-  oneof max_statistics_size_opt {
-    uint32 max_statistics_size = 8;
-  }
+  reserved 8; // used to be uint32 max_statistics_size = 8;
 }
 
 message ParquetOptions {
@@ -485,6 +536,7 @@ message ParquetOptions {
   bool skip_metadata = 3; // default = true
   bool pushdown_filters = 5; // default = false
   bool reorder_filters = 6; // default = false
+  bool force_filter_selections = 34; // default = false
   uint64 data_pagesize_limit = 7; // default = 1024 * 1024
   uint64 write_batch_size = 8; // default = 1024
   string writer_version = 9; // default = "1.0"
@@ -514,9 +566,7 @@ message ParquetOptions {
     string statistics_enabled = 13;
   }
 
-  oneof max_statistics_size_opt {
-    uint64 max_statistics_size = 14;
-  }
+  reserved 14; // used to be uint32 max_statistics_size = 20;
 
   oneof column_index_truncate_length_opt {
     uint64 column_index_truncate_length = 17;
@@ -549,6 +599,10 @@ message ParquetOptions {
   oneof coerce_int96_opt {
     string coerce_int96 = 32;
   }
+
+  oneof max_predicate_cache_size_opt {
+    uint64 max_predicate_cache_size = 33;
+  }
 }
 
 enum JoinSide {
@@ -572,6 +626,8 @@ message Statistics {
   Precision num_rows = 1;
   Precision total_byte_size = 2;
   repeated ColumnStats column_stats = 3;
+  // total_rows was removed - field 4 is reserved
+  reserved 4;
 }
 
 message ColumnStats {
@@ -580,4 +636,5 @@ message ColumnStats {
   Precision sum_value = 5;
   Precision null_count = 3;
   Precision distinct_count = 4;
+  Precision byte_size = 6;
 }
diff --git a/datafusion/proto-common/src/common.rs b/datafusion/proto-common/src/common.rs
index 9af63e3b07365..d5046aee2e2c7 100644
--- a/datafusion/proto-common/src/common.rs
+++ b/datafusion/proto-common/src/common.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion_common::{internal_datafusion_err, DataFusionError};
+use datafusion_common::{DataFusionError, internal_datafusion_err};
 
 /// Return a `DataFusionError::Internal` with the given message
 pub fn proto_error<S: Into<String>>(message: S) -> DataFusionError {
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
index bd969db316872..ca8a269958d73 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -25,12 +25,19 @@ use arrow::array::{ArrayRef, AsArray};
 use arrow::buffer::Buffer;
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::{
-    i256, DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit,
-    Schema, TimeUnit, UnionFields, UnionMode,
+    DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema,
+    TimeUnit, UnionFields, UnionMode, i256,
+};
+use arrow::ipc::{
+    convert::fb_to_schema,
+    reader::{read_dictionary, read_record_batch},
+    root_as_message,
+    writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions},
 };
-use arrow::ipc::{reader::read_record_batch, root_as_message};
 
 use datafusion_common::{
+    Column, ColumnStatistics, Constraint, Constraints, DFSchema, DFSchemaRef,
+    DataFusionError, JoinSide, ScalarValue, Statistics, TableReference,
     arrow_datafusion_err,
     config::{
         CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
@@ -40,8 +47,6 @@ use datafusion_common::{
     parsers::CompressionTypeVariant,
     plan_datafusion_err,
     stats::Precision,
-    Column, ColumnStatistics, Constraint, Constraints, DFSchema, DFSchemaRef,
-    DataFusionError, JoinSide, ScalarValue, Statistics, TableReference,
 };
 
 #[derive(Debug)]
@@ -138,11 +143,17 @@ where
     }
 }
 
+impl From<protobuf::ColumnRelation> for TableReference {
+    fn from(rel: protobuf::ColumnRelation) -> Self {
+        Self::parse_str_normalized(rel.relation.as_str(), true)
+    }
+}
+
 impl From<protobuf::Column> for Column {
     fn from(c: protobuf::Column) -> Self {
         let protobuf::Column { relation, name } = c;
 
-        Self::new(relation.map(|r| r.relation), name)
+        Self::new(relation, name)
     }
 }
 
@@ -164,10 +175,7 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema {
             .map(|df_field| {
                 let field: Field = df_field.field.as_ref().required("field")?;
                 Ok((
-                    df_field
-                        .qualifier
-                        .as_ref()
-                        .map(|q| q.relation.clone().into()),
+                    df_field.qualifier.as_ref().map(|q| q.clone().into()),
                     Arc::new(field),
                 ))
             })
@@ -257,7 +265,15 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
             arrow_type::ArrowTypeEnum::Interval(interval_unit) => {
                 DataType::Interval(parse_i32_to_interval_unit(interval_unit)?)
             }
-            arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal {
+            arrow_type::ArrowTypeEnum::Decimal32(protobuf::Decimal32Type {
+                precision,
+                scale,
+            }) => DataType::Decimal32(*precision as u8, *scale as i8),
+            arrow_type::ArrowTypeEnum::Decimal64(protobuf::Decimal64Type {
+                precision,
+                scale,
+            }) => DataType::Decimal64(*precision as u8, *scale as i8),
+            arrow_type::ArrowTypeEnum::Decimal128(protobuf::Decimal128Type {
                 precision,
                 scale,
             }) => DataType::Decimal128(*precision as u8, *scale as i8),
@@ -293,13 +309,16 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
                 };
                 let union_fields = parse_proto_fields_to_fields(&union.union_types)?;
 
-                // Default to index based type ids if not provided
-                let type_ids: Vec<_> = match union.type_ids.is_empty() {
-                    true => (0..union_fields.len() as i8).collect(),
-                    false => union.type_ids.iter().map(|i| *i as i8).collect(),
+                // Default to index based type ids if not explicitly provided
+                let union_fields = if union.type_ids.is_empty() {
+                    UnionFields::from_fields(union_fields)
+                } else {
+                    let type_ids = union.type_ids.iter().map(|i| *i as i8);
+                    UnionFields::try_new(type_ids, union_fields).map_err(|e| {
+                        DataFusionError::from(e).context("Deserializing Union DataType")
+                    })?
                 };
-
-                DataType::Union(UnionFields::new(type_ids, union_fields), union_mode)
+                DataType::Union(union_fields, union_mode)
             }
             arrow_type::ArrowTypeEnum::Dictionary(dict) => {
                 let key_datatype = dict.as_ref().key.as_deref().required("key")?;
@@ -312,6 +331,19 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
                 let keys_sorted = map.keys_sorted;
                 DataType::Map(Arc::new(field), keys_sorted)
             }
+            arrow_type::ArrowTypeEnum::RunEndEncoded(run_end_encoded) => {
+                let run_ends_field: Field = run_end_encoded
+                    .as_ref()
+                    .run_ends_field
+                    .as_deref()
+                    .required("run_ends_field")?;
+                let value_field: Field = run_end_encoded
+                    .as_ref()
+                    .values_field
+                    .as_deref()
+                    .required("values_field")?;
+                DataType::RunEndEncoded(run_ends_field.into(), value_field.into())
+            }
         })
     }
 }
@@ -370,7 +402,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
             Value::Float32Value(v) => Self::Float32(Some(*v)),
             Value::Float64Value(v) => Self::Float64(Some(*v)),
             Value::Date32Value(v) => Self::Date32(Some(*v)),
-            // ScalarValue::List is serialized using arrow IPC format
+            // Nested ScalarValue types are serialized using arrow IPC format
             Value::ListValue(v)
             | Value::FixedSizeListValue(v)
             | Value::LargeListValue(v)
@@ -387,55 +419,83 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                     schema_ref.try_into()?
                 } else {
                     return Err(Error::General(
-                        "Invalid schema while deserializing ScalarValue::List"
+                        "Invalid schema while deserializing nested ScalarValue"
                             .to_string(),
                     ));
                 };
 
+                // IPC dictionary batch IDs are assigned when encoding the schema, but our protobuf
+                // `Schema` doesn't preserve those IDs. Reconstruct them deterministically by
+                // round-tripping the schema through IPC.
+                let schema: Schema = {
+                    let ipc_gen = IpcDataGenerator {};
+                    let write_options = IpcWriteOptions::default();
+                    let mut dict_tracker = DictionaryTracker::new(false);
+                    let encoded_schema = ipc_gen.schema_to_bytes_with_dictionary_tracker(
+                        &schema,
+                        &mut dict_tracker,
+                        &write_options,
+                    );
+                    let message =
+                        root_as_message(encoded_schema.ipc_message.as_slice()).map_err(
+                            |e| {
+                                Error::General(format!(
+                                    "Error IPC schema message while deserializing nested ScalarValue: {e}"
+                                ))
+                            },
+                        )?;
+                    let ipc_schema = message.header_as_schema().ok_or_else(|| {
+                        Error::General(
+                            "Unexpected message type deserializing nested ScalarValue schema"
+                                .to_string(),
+                        )
+                    })?;
+                    fb_to_schema(ipc_schema)
+                };
+
                 let message = root_as_message(ipc_message.as_slice()).map_err(|e| {
                     Error::General(format!(
-                        "Error IPC message while deserializing ScalarValue::List: {e}"
+                        "Error IPC message while deserializing nested ScalarValue: {e}"
                     ))
                 })?;
                 let buffer = Buffer::from(arrow_data.as_slice());
 
                 let ipc_batch = message.header_as_record_batch().ok_or_else(|| {
                     Error::General(
-                        "Unexpected message type deserializing ScalarValue::List"
+                        "Unexpected message type deserializing nested ScalarValue"
                             .to_string(),
                     )
                 })?;
 
-                let dict_by_id: HashMap<i64,ArrayRef> = dictionaries.iter().map(|protobuf::scalar_nested_value::Dictionary { ipc_message, arrow_data }| {
+                let mut dict_by_id: HashMap<i64, ArrayRef> = HashMap::new();
+                for protobuf::scalar_nested_value::Dictionary {
+                    ipc_message,
+                    arrow_data,
+                } in dictionaries
+                {
                     let message = root_as_message(ipc_message.as_slice()).map_err(|e| {
                         Error::General(format!(
-                            "Error IPC message while deserializing ScalarValue::List dictionary message: {e}"
+                            "Error IPC message while deserializing nested ScalarValue dictionary message: {e}"
                         ))
                     })?;
                     let buffer = Buffer::from(arrow_data.as_slice());
 
                     let dict_batch = message.header_as_dictionary_batch().ok_or_else(|| {
                         Error::General(
-                            "Unexpected message type deserializing ScalarValue::List dictionary message"
+                            "Unexpected message type deserializing nested ScalarValue dictionary message"
                                 .to_string(),
                         )
                     })?;
-
-                    let id = dict_batch.id();
-
-                    let record_batch = read_record_batch(
+                    read_dictionary(
                         &buffer,
-                        dict_batch.data().unwrap(),
-                        Arc::new(schema.clone()),
-                        &Default::default(),
-                        None,
+                        dict_batch,
+                        &schema,
+                        &mut dict_by_id,
                         &message.version(),
-                    )?;
-
-                    let values: ArrayRef = Arc::clone(record_batch.column(0));
-
-                    Ok((id, values))
-                }).collect::<datafusion_common::Result<HashMap<_, _>>>()?;
+                    )
+                    .map_err(|e| arrow_datafusion_err!(e))
+                    .map_err(|e| e.context("Decoding nested ScalarValue dictionary"))?;
+                }
 
                 let record_batch = read_record_batch(
                     &buffer,
@@ -446,7 +506,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                     &message.version(),
                 )
                 .map_err(|e| arrow_datafusion_err!(e))
-                .map_err(|e| e.context("Decoding ScalarValue::List Value"))?;
+                .map_err(|e| e.context("Decoding nested ScalarValue value"))?;
                 let arr = record_batch.column(0);
                 match value {
                     Value::ListValue(_) => {
@@ -469,6 +529,14 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                 let null_type: DataType = v.try_into()?;
                 null_type.try_into().map_err(Error::DataFusionError)?
             }
+            Value::Decimal32Value(val) => {
+                let array = vec_to_array(val.value.clone());
+                Self::Decimal32(Some(i32::from_be_bytes(array)), val.p as u8, val.s as i8)
+            }
+            Value::Decimal64Value(val) => {
+                let array = vec_to_array(val.value.clone());
+                Self::Decimal64(Some(i64::from_be_bytes(array)), val.p as u8, val.s as i8)
+            }
             Value::Decimal128Value(val) => {
                 let array = vec_to_array(val.value.clone());
                 Self::Decimal128(
@@ -556,6 +624,32 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
 
                 Self::Dictionary(Box::new(index_type), Box::new(value))
             }
+            Value::RunEndEncodedValue(v) => {
+                let run_ends_field: Field = v
+                    .run_ends_field
+                    .as_ref()
+                    .ok_or_else(|| Error::required("run_ends_field"))?
+                    .try_into()?;
+
+                let values_field: Field = v
+                    .values_field
+                    .as_ref()
+                    .ok_or_else(|| Error::required("values_field"))?
+                    .try_into()?;
+
+                let value: Self = v
+                    .value
+                    .as_ref()
+                    .ok_or_else(|| Error::required("value"))?
+                    .as_ref()
+                    .try_into()?;
+
+                Self::RunEndEncoded(
+                    run_ends_field.into(),
+                    values_field.into(),
+                    Box::new(value),
+                )
+            }
             Value::BinaryValue(v) => Self::Binary(Some(v.clone())),
             Value::BinaryViewValue(v) => Self::BinaryView(Some(v.clone())),
             Value::LargeBinaryValue(v) => Self::LargeBinary(Some(v.clone())),
@@ -583,7 +677,9 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                     .collect::<Option<Vec<_>>>();
                 let fields = fields.ok_or_else(|| Error::required("UnionField"))?;
                 let fields = parse_proto_fields_to_fields(&fields)?;
-                let fields = UnionFields::new(ids, fields);
+                let union_fields = UnionFields::try_new(ids, fields).map_err(|e| {
+                    DataFusionError::from(e).context("Deserializing Union ScalarValue")
+                })?;
                 let v_id = val.value_id as i8;
                 let val = match &val.value {
                     None => None,
@@ -595,7 +691,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                         Some((v_id, Box::new(val)))
                     }
                 };
-                Self::Union(val, fields, mode)
+                Self::Union(val, union_fields, mode)
             }
             Value::FixedSizeBinaryValue(v) => {
                 Self::FixedSizeBinary(v.length, Some(v.clone().values))
@@ -680,6 +776,11 @@ impl From<&protobuf::ColumnStats> for ColumnStatistics {
             } else {
                 Precision::Absent
             },
+            byte_size: if let Some(sbs) = &cs.byte_size {
+                sbs.clone().into()
+            } else {
+                Precision::Absent
+            },
         }
     }
 }
@@ -881,9 +982,10 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
             quote: proto_opts.quote[0],
             terminator: proto_opts.terminator.first().copied(),
             escape: proto_opts.escape.first().copied(),
-            double_quote: proto_opts.has_header.first().map(|h| *h != 0),
+            double_quote: proto_opts.double_quote.first().map(|h| *h != 0),
             newlines_in_values: proto_opts.newlines_in_values.first().map(|h| *h != 0),
             compression: proto_opts.compression().into(),
+            compression_level: proto_opts.compression_level,
             schema_infer_max_rec: proto_opts.schema_infer_max_rec.map(|h| h as usize),
             date_format: (!proto_opts.date_format.is_empty())
                 .then(|| proto_opts.date_format.clone()),
@@ -900,6 +1002,7 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
             null_regex: (!proto_opts.null_regex.is_empty())
                 .then(|| proto_opts.null_regex.clone()),
             comment: proto_opts.comment.first().copied(),
+            truncated_rows: proto_opts.truncated_rows.first().map(|h| *h != 0),
         })
     }
 }
@@ -910,7 +1013,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
     fn try_from(
         value: &protobuf::ParquetOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
-        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetOptions {
             enable_page_index: value.enable_page_index,
             pruning: value.pruning,
@@ -923,9 +1025,12 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
                 .unwrap_or(None),
             pushdown_filters: value.pushdown_filters,
             reorder_filters: value.reorder_filters,
+            force_filter_selections: value.force_filter_selections,
             data_pagesize_limit: value.data_pagesize_limit as usize,
             write_batch_size: value.write_batch_size as usize,
-            writer_version: value.writer_version.clone(),
+            writer_version: value.writer_version.parse().map_err(|e| {
+                DataFusionError::Internal(format!("Failed to parse writer_version: {e}"))
+            })?,
             compression: value.compression_opt.clone().map(|opt| match opt {
                 protobuf::parquet_options::CompressionOpt::Compression(v) => Some(v),
             }).unwrap_or(None),
@@ -938,12 +1043,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
                     protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
                 })
                 .unwrap_or(None),
-            max_statistics_size: value
-                .max_statistics_size_opt.as_ref()
-                .map(|opt| match opt {
-                    protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize),
-                })
-                .unwrap_or(None),
             max_row_group_size: value.max_row_group_size as usize,
             created_by: value.created_by.clone(),
             column_index_truncate_length: value
@@ -988,6 +1087,9 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
                 protobuf::parquet_options::CoerceInt96Opt::CoerceInt96(v) => Some(v),
             }).unwrap_or(None),
             skip_arrow_metadata: value.skip_arrow_metadata,
+            max_predicate_cache_size: value.max_predicate_cache_size_opt.map(|opt| match opt {
+                protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => Some(v as usize),
+            }).unwrap_or(None),
         })
     }
 }
@@ -997,7 +1099,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
     fn try_from(
         value: &protobuf::ParquetColumnOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
-        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetColumnOptions {
             compression: value.compression_opt.clone().map(|opt| match opt {
                 protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v),
@@ -1009,12 +1110,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
                     protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
                 })
                 .unwrap_or(None),
-            max_statistics_size: value
-                .max_statistics_size_opt
-                .map(|opt| match opt {
-                    protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize),
-                })
-                .unwrap_or(None),
             encoding: value
                 .encoding_opt.clone()
                 .map(|opt| match opt {
@@ -1066,6 +1161,7 @@ impl TryFrom<&protobuf::TableParquetOptions> for TableParquetOptions {
                 .unwrap(),
             column_specific_options,
             key_value_metadata: Default::default(),
+            crypto: Default::default(),
         })
     }
 }
@@ -1079,7 +1175,9 @@ impl TryFrom<&protobuf::JsonOptions> for JsonOptions {
         let compression: protobuf::CompressionTypeVariant = proto_opts.compression();
         Ok(JsonOptions {
             compression: compression.into(),
+            compression_level: proto_opts.compression_level,
             schema_infer_max_rec: proto_opts.schema_infer_max_rec.map(|h| h as usize),
+            newline_delimited: proto_opts.newline_delimited.unwrap_or(true),
         })
     }
 }
diff --git a/datafusion/proto-common/src/generated/mod.rs b/datafusion/proto-common/src/generated/mod.rs
index 24a062e4cad59..9c2ca9385aa5e 100644
--- a/datafusion/proto-common/src/generated/mod.rs
+++ b/datafusion/proto-common/src/generated/mod.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// This code is generated so we don't want to fix any lint violations manually
+#[allow(clippy::allow_attributes)]
 #[allow(clippy::all)]
 #[rustfmt::skip]
 pub mod datafusion_proto_common {
diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs
index b44b05e9ca296..b00e7546bba20 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -1,3 +1,74 @@
+impl serde::Serialize for ArrowFormat {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let len = 0;
+        let struct_ser = serializer.serialize_struct("datafusion_common.ArrowFormat", len)?;
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for ArrowFormat {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                            Err(serde::de::Error::unknown_field(value, FIELDS))
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = ArrowFormat;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.ArrowFormat")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<ArrowFormat, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                while map_.next_key::<GeneratedField>()?.is_some() {
+                    let _ = map_.next_value::<serde::de::IgnoredAny>()?;
+                }
+                Ok(ArrowFormat {
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.ArrowFormat", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for ArrowOptions {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -29,7 +100,7 @@ impl<'de> serde::Deserialize<'de> for ArrowOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -172,8 +243,14 @@ impl serde::Serialize for ArrowType {
                         .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?;
                     struct_ser.serialize_field("INTERVAL", &v)?;
                 }
-                arrow_type::ArrowTypeEnum::Decimal(v) => {
-                    struct_ser.serialize_field("DECIMAL", v)?;
+                arrow_type::ArrowTypeEnum::Decimal32(v) => {
+                    struct_ser.serialize_field("DECIMAL32", v)?;
+                }
+                arrow_type::ArrowTypeEnum::Decimal64(v) => {
+                    struct_ser.serialize_field("DECIMAL64", v)?;
+                }
+                arrow_type::ArrowTypeEnum::Decimal128(v) => {
+                    struct_ser.serialize_field("DECIMAL128", v)?;
                 }
                 arrow_type::ArrowTypeEnum::Decimal256(v) => {
                     struct_ser.serialize_field("DECIMAL256", v)?;
@@ -199,6 +276,9 @@ impl serde::Serialize for ArrowType {
                 arrow_type::ArrowTypeEnum::Map(v) => {
                     struct_ser.serialize_field("MAP", v)?;
                 }
+                arrow_type::ArrowTypeEnum::RunEndEncoded(v) => {
+                    struct_ser.serialize_field("RUNENDENCODED", v)?;
+                }
             }
         }
         struct_ser.end()
@@ -243,7 +323,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             "TIME32",
             "TIME64",
             "INTERVAL",
-            "DECIMAL",
+            "DECIMAL32",
+            "DECIMAL64",
+            "DECIMAL128",
             "DECIMAL256",
             "LIST",
             "LARGE_LIST",
@@ -254,6 +336,8 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             "UNION",
             "DICTIONARY",
             "MAP",
+            "RUN_END_ENCODED",
+            "RUNENDENCODED",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -285,7 +369,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             Time32,
             Time64,
             Interval,
-            Decimal,
+            Decimal32,
+            Decimal64,
+            Decimal128,
             Decimal256,
             List,
             LargeList,
@@ -294,6 +380,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             Union,
             Dictionary,
             Map,
+            RunEndEncoded,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -302,7 +389,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -342,7 +429,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                             "TIME32" => Ok(GeneratedField::Time32),
                             "TIME64" => Ok(GeneratedField::Time64),
                             "INTERVAL" => Ok(GeneratedField::Interval),
-                            "DECIMAL" => Ok(GeneratedField::Decimal),
+                            "DECIMAL32" => Ok(GeneratedField::Decimal32),
+                            "DECIMAL64" => Ok(GeneratedField::Decimal64),
+                            "DECIMAL128" => Ok(GeneratedField::Decimal128),
                             "DECIMAL256" => Ok(GeneratedField::Decimal256),
                             "LIST" => Ok(GeneratedField::List),
                             "LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList),
@@ -351,6 +440,7 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                             "UNION" => Ok(GeneratedField::Union),
                             "DICTIONARY" => Ok(GeneratedField::Dictionary),
                             "MAP" => Ok(GeneratedField::Map),
+                            "RUNENDENCODED" | "RUN_END_ENCODED" => Ok(GeneratedField::RunEndEncoded),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -557,11 +647,25 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                             }
                             arrow_type_enum__ = map_.next_value::<::std::option::Option<IntervalUnit>>()?.map(|x| arrow_type::ArrowTypeEnum::Interval(x as i32));
                         }
-                        GeneratedField::Decimal => {
+                        GeneratedField::Decimal32 => {
+                            if arrow_type_enum__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("DECIMAL32"));
+                            }
+                            arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal32)
+;
+                        }
+                        GeneratedField::Decimal64 => {
+                            if arrow_type_enum__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("DECIMAL64"));
+                            }
+                            arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal64)
+;
+                        }
+                        GeneratedField::Decimal128 => {
                             if arrow_type_enum__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("DECIMAL"));
+                                return Err(serde::de::Error::duplicate_field("DECIMAL128"));
                             }
-                            arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal)
+                            arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal128)
 ;
                         }
                         GeneratedField::Decimal256 => {
@@ -618,6 +722,13 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
                                 return Err(serde::de::Error::duplicate_field("MAP"));
                             }
                             arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Map)
+;
+                        }
+                        GeneratedField::RunEndEncoded => {
+                            if arrow_type_enum__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("RUNENDENCODED"));
+                            }
+                            arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::RunEndEncoded)
 ;
                         }
                     }
@@ -661,7 +772,7 @@ impl<'de> serde::Deserialize<'de> for AvroFormat {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -732,7 +843,7 @@ impl<'de> serde::Deserialize<'de> for AvroOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -819,7 +930,7 @@ impl<'de> serde::Deserialize<'de> for Column {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -919,7 +1030,7 @@ impl<'de> serde::Deserialize<'de> for ColumnRelation {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -994,6 +1105,9 @@ impl serde::Serialize for ColumnStats {
         if self.distinct_count.is_some() {
             len += 1;
         }
+        if self.byte_size.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.ColumnStats", len)?;
         if let Some(v) = self.min_value.as_ref() {
             struct_ser.serialize_field("minValue", v)?;
@@ -1010,6 +1124,9 @@ impl serde::Serialize for ColumnStats {
         if let Some(v) = self.distinct_count.as_ref() {
             struct_ser.serialize_field("distinctCount", v)?;
         }
+        if let Some(v) = self.byte_size.as_ref() {
+            struct_ser.serialize_field("byteSize", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -1030,6 +1147,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             "nullCount",
             "distinct_count",
             "distinctCount",
+            "byte_size",
+            "byteSize",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -1039,6 +1158,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             SumValue,
             NullCount,
             DistinctCount,
+            ByteSize,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -1047,7 +1167,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1065,6 +1185,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                             "sumValue" | "sum_value" => Ok(GeneratedField::SumValue),
                             "nullCount" | "null_count" => Ok(GeneratedField::NullCount),
                             "distinctCount" | "distinct_count" => Ok(GeneratedField::DistinctCount),
+                            "byteSize" | "byte_size" => Ok(GeneratedField::ByteSize),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -1089,6 +1210,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                 let mut sum_value__ = None;
                 let mut null_count__ = None;
                 let mut distinct_count__ = None;
+                let mut byte_size__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::MinValue => {
@@ -1121,6 +1243,12 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                             }
                             distinct_count__ = map_.next_value()?;
                         }
+                        GeneratedField::ByteSize => {
+                            if byte_size__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("byteSize"));
+                            }
+                            byte_size__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(ColumnStats {
@@ -1129,6 +1257,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                     sum_value: sum_value__,
                     null_count: null_count__,
                     distinct_count: distinct_count__,
+                    byte_size: byte_size__,
                 })
             }
         }
@@ -1167,7 +1296,7 @@ impl<'de> serde::Deserialize<'de> for CompressionTypeVariant {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = CompressionTypeVariant;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1264,7 +1393,7 @@ impl<'de> serde::Deserialize<'de> for Constraint {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1364,7 +1493,7 @@ impl<'de> serde::Deserialize<'de> for Constraints {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1455,7 +1584,7 @@ impl<'de> serde::Deserialize<'de> for CsvFormat {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1566,6 +1695,12 @@ impl serde::Serialize for CsvOptions {
         if !self.terminator.is_empty() {
             len += 1;
         }
+        if !self.truncated_rows.is_empty() {
+            len += 1;
+        }
+        if self.compression_level.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.CsvOptions", len)?;
         if !self.has_header.is_empty() {
             #[allow(clippy::needless_borrow)]
@@ -1638,6 +1773,14 @@ impl serde::Serialize for CsvOptions {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("terminator", pbjson::private::base64::encode(&self.terminator).as_str())?;
         }
+        if !self.truncated_rows.is_empty() {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("truncatedRows", pbjson::private::base64::encode(&self.truncated_rows).as_str())?;
+        }
+        if let Some(v) = self.compression_level.as_ref() {
+            struct_ser.serialize_field("compressionLevel", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -1676,6 +1819,10 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
             "newlines_in_values",
             "newlinesInValues",
             "terminator",
+            "truncated_rows",
+            "truncatedRows",
+            "compression_level",
+            "compressionLevel",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -1697,6 +1844,8 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
             DoubleQuote,
             NewlinesInValues,
             Terminator,
+            TruncatedRows,
+            CompressionLevel,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -1705,7 +1854,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1735,6 +1884,8 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
                             "doubleQuote" | "double_quote" => Ok(GeneratedField::DoubleQuote),
                             "newlinesInValues" | "newlines_in_values" => Ok(GeneratedField::NewlinesInValues),
                             "terminator" => Ok(GeneratedField::Terminator),
+                            "truncatedRows" | "truncated_rows" => Ok(GeneratedField::TruncatedRows),
+                            "compressionLevel" | "compression_level" => Ok(GeneratedField::CompressionLevel),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -1771,6 +1922,8 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
                 let mut double_quote__ = None;
                 let mut newlines_in_values__ = None;
                 let mut terminator__ = None;
+                let mut truncated_rows__ = None;
+                let mut compression_level__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::HasHeader => {
@@ -1893,6 +2046,22 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
                                 Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
                             ;
                         }
+                        GeneratedField::TruncatedRows => {
+                            if truncated_rows__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("truncatedRows"));
+                            }
+                            truncated_rows__ = 
+                                Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::CompressionLevel => {
+                            if compression_level__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("compressionLevel"));
+                            }
+                            compression_level__ = 
+                                map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
+                            ;
+                        }
                     }
                 }
                 Ok(CsvOptions {
@@ -1913,6 +2082,8 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
                     double_quote: double_quote__.unwrap_or_default(),
                     newlines_in_values: newlines_in_values__.unwrap_or_default(),
                     terminator: terminator__.unwrap_or_default(),
+                    truncated_rows: truncated_rows__.unwrap_or_default(),
+                    compression_level: compression_level__,
                 })
             }
         }
@@ -2047,7 +2218,7 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2189,7 +2360,144 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions {
         deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor)
     }
 }
-impl serde::Serialize for Decimal {
+impl serde::Serialize for Decimal128 {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.value.is_empty() {
+            len += 1;
+        }
+        if self.p != 0 {
+            len += 1;
+        }
+        if self.s != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?;
+        if !self.value.is_empty() {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?;
+        }
+        if self.p != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?;
+        }
+        if self.s != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for Decimal128 {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "value",
+            "p",
+            "s",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Value,
+            P,
+            S,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "value" => Ok(GeneratedField::Value),
+                            "p" => Ok(GeneratedField::P),
+                            "s" => Ok(GeneratedField::S),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = Decimal128;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.Decimal128")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut value__ = None;
+                let mut p__ = None;
+                let mut s__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Value => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("value"));
+                            }
+                            value__ = 
+                                Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::P => {
+                            if p__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("p"));
+                            }
+                            p__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::S => {
+                            if s__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("s"));
+                            }
+                            s__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                    }
+                }
+                Ok(Decimal128 {
+                    value: value__.unwrap_or_default(),
+                    p: p__.unwrap_or_default(),
+                    s: s__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for Decimal128Type {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -2203,7 +2511,7 @@ impl serde::Serialize for Decimal {
         if self.scale != 0 {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal", len)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128Type", len)?;
         if self.precision != 0 {
             struct_ser.serialize_field("precision", &self.precision)?;
         }
@@ -2213,7 +2521,7 @@ impl serde::Serialize for Decimal {
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for Decimal {
+impl<'de> serde::Deserialize<'de> for Decimal128Type {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
@@ -2236,7 +2544,7 @@ impl<'de> serde::Deserialize<'de> for Decimal {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2260,13 +2568,13 @@ impl<'de> serde::Deserialize<'de> for Decimal {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = Decimal;
+            type Value = Decimal128Type;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion_common.Decimal")
+                formatter.write_str("struct datafusion_common.Decimal128Type")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128Type, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
@@ -2292,16 +2600,16 @@ impl<'de> serde::Deserialize<'de> for Decimal {
                         }
                     }
                 }
-                Ok(Decimal {
+                Ok(Decimal128Type {
                     precision: precision__.unwrap_or_default(),
                     scale: scale__.unwrap_or_default(),
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion_common.Decimal", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion_common.Decimal128Type", FIELDS, GeneratedVisitor)
     }
 }
-impl serde::Serialize for Decimal128 {
+impl serde::Serialize for Decimal256 {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -2318,7 +2626,7 @@ impl serde::Serialize for Decimal128 {
         if self.s != 0 {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?;
         if !self.value.is_empty() {
             #[allow(clippy::needless_borrow)]
             #[allow(clippy::needless_borrows_for_generic_args)]
@@ -2337,7 +2645,7 @@ impl serde::Serialize for Decimal128 {
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for Decimal128 {
+impl<'de> serde::Deserialize<'de> for Decimal256 {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
@@ -2362,7 +2670,7 @@ impl<'de> serde::Deserialize<'de> for Decimal128 {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2387,13 +2695,13 @@ impl<'de> serde::Deserialize<'de> for Decimal128 {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = Decimal128;
+            type Value = Decimal256;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion_common.Decimal128")
+                formatter.write_str("struct datafusion_common.Decimal256")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
@@ -2428,17 +2736,17 @@ impl<'de> serde::Deserialize<'de> for Decimal128 {
                         }
                     }
                 }
-                Ok(Decimal128 {
+                Ok(Decimal256 {
                     value: value__.unwrap_or_default(),
                     p: p__.unwrap_or_default(),
                     s: s__.unwrap_or_default(),
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor)
     }
 }
-impl serde::Serialize for Decimal256 {
+impl serde::Serialize for Decimal256Type {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -2446,16 +2754,128 @@ impl serde::Serialize for Decimal256 {
     {
         use serde::ser::SerializeStruct;
         let mut len = 0;
-        if !self.value.is_empty() {
+        if self.precision != 0 {
             len += 1;
         }
-        if self.p != 0 {
+        if self.scale != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?;
+        if self.precision != 0 {
+            struct_ser.serialize_field("precision", &self.precision)?;
+        }
+        if self.scale != 0 {
+            struct_ser.serialize_field("scale", &self.scale)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for Decimal256Type {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "precision",
+            "scale",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Precision,
+            Scale,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "precision" => Ok(GeneratedField::Precision),
+                            "scale" => Ok(GeneratedField::Scale),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = Decimal256Type;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.Decimal256Type")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256Type, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut precision__ = None;
+                let mut scale__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Precision => {
+                            if precision__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("precision"));
+                            }
+                            precision__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Scale => {
+                            if scale__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("scale"));
+                            }
+                            scale__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                    }
+                }
+                Ok(Decimal256Type {
+                    precision: precision__.unwrap_or_default(),
+                    scale: scale__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for Decimal32 {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.value.is_empty() {
+            len += 1;
+        }
+        if self.p != 0 {
             len += 1;
         }
         if self.s != 0 {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32", len)?;
         if !self.value.is_empty() {
             #[allow(clippy::needless_borrow)]
             #[allow(clippy::needless_borrows_for_generic_args)]
@@ -2474,7 +2894,7 @@ impl serde::Serialize for Decimal256 {
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for Decimal256 {
+impl<'de> serde::Deserialize<'de> for Decimal32 {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
@@ -2499,7 +2919,7 @@ impl<'de> serde::Deserialize<'de> for Decimal256 {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2524,13 +2944,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256 {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = Decimal256;
+            type Value = Decimal32;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion_common.Decimal256")
+                formatter.write_str("struct datafusion_common.Decimal32")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal32, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
@@ -2565,17 +2985,17 @@ impl<'de> serde::Deserialize<'de> for Decimal256 {
                         }
                     }
                 }
-                Ok(Decimal256 {
+                Ok(Decimal32 {
                     value: value__.unwrap_or_default(),
                     p: p__.unwrap_or_default(),
                     s: s__.unwrap_or_default(),
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion_common.Decimal32", FIELDS, GeneratedVisitor)
     }
 }
-impl serde::Serialize for Decimal256Type {
+impl serde::Serialize for Decimal32Type {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -2589,7 +3009,7 @@ impl serde::Serialize for Decimal256Type {
         if self.scale != 0 {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32Type", len)?;
         if self.precision != 0 {
             struct_ser.serialize_field("precision", &self.precision)?;
         }
@@ -2599,7 +3019,7 @@ impl serde::Serialize for Decimal256Type {
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for Decimal256Type {
+impl<'de> serde::Deserialize<'de> for Decimal32Type {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
@@ -2622,7 +3042,7 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2646,13 +3066,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = Decimal256Type;
+            type Value = Decimal32Type;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion_common.Decimal256Type")
+                formatter.write_str("struct datafusion_common.Decimal32Type")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256Type, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal32Type, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
@@ -2678,13 +3098,262 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type {
                         }
                     }
                 }
-                Ok(Decimal256Type {
+                Ok(Decimal32Type {
                     precision: precision__.unwrap_or_default(),
                     scale: scale__.unwrap_or_default(),
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion_common.Decimal32Type", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for Decimal64 {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.value.is_empty() {
+            len += 1;
+        }
+        if self.p != 0 {
+            len += 1;
+        }
+        if self.s != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64", len)?;
+        if !self.value.is_empty() {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?;
+        }
+        if self.p != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?;
+        }
+        if self.s != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for Decimal64 {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "value",
+            "p",
+            "s",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Value,
+            P,
+            S,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "value" => Ok(GeneratedField::Value),
+                            "p" => Ok(GeneratedField::P),
+                            "s" => Ok(GeneratedField::S),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = Decimal64;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.Decimal64")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal64, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut value__ = None;
+                let mut p__ = None;
+                let mut s__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Value => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("value"));
+                            }
+                            value__ = 
+                                Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::P => {
+                            if p__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("p"));
+                            }
+                            p__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::S => {
+                            if s__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("s"));
+                            }
+                            s__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                    }
+                }
+                Ok(Decimal64 {
+                    value: value__.unwrap_or_default(),
+                    p: p__.unwrap_or_default(),
+                    s: s__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.Decimal64", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for Decimal64Type {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.precision != 0 {
+            len += 1;
+        }
+        if self.scale != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64Type", len)?;
+        if self.precision != 0 {
+            struct_ser.serialize_field("precision", &self.precision)?;
+        }
+        if self.scale != 0 {
+            struct_ser.serialize_field("scale", &self.scale)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for Decimal64Type {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "precision",
+            "scale",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Precision,
+            Scale,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "precision" => Ok(GeneratedField::Precision),
+                            "scale" => Ok(GeneratedField::Scale),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = Decimal64Type;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.Decimal64Type")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal64Type, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut precision__ = None;
+                let mut scale__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Precision => {
+                            if precision__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("precision"));
+                            }
+                            precision__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Scale => {
+                            if scale__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("scale"));
+                            }
+                            scale__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                    }
+                }
+                Ok(Decimal64Type {
+                    precision: precision__.unwrap_or_default(),
+                    scale: scale__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.Decimal64Type", FIELDS, GeneratedVisitor)
     }
 }
 impl serde::Serialize for DfField {
@@ -2734,7 +3403,7 @@ impl<'de> serde::Deserialize<'de> for DfField {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2842,7 +3511,7 @@ impl<'de> serde::Deserialize<'de> for DfSchema {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2952,7 +3621,7 @@ impl<'de> serde::Deserialize<'de> for Dictionary {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3044,7 +3713,7 @@ impl<'de> serde::Deserialize<'de> for EmptyMessage {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3107,9 +3776,6 @@ impl serde::Serialize for Field {
         if !self.metadata.is_empty() {
             len += 1;
         }
-        if self.dict_ordered {
-            len += 1;
-        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.Field", len)?;
         if !self.name.is_empty() {
             struct_ser.serialize_field("name", &self.name)?;
@@ -3126,9 +3792,6 @@ impl serde::Serialize for Field {
         if !self.metadata.is_empty() {
             struct_ser.serialize_field("metadata", &self.metadata)?;
         }
-        if self.dict_ordered {
-            struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
-        }
         struct_ser.end()
     }
 }
@@ -3145,8 +3808,6 @@ impl<'de> serde::Deserialize<'de> for Field {
             "nullable",
             "children",
             "metadata",
-            "dict_ordered",
-            "dictOrdered",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -3156,7 +3817,6 @@ impl<'de> serde::Deserialize<'de> for Field {
             Nullable,
             Children,
             Metadata,
-            DictOrdered,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -3165,7 +3825,7 @@ impl<'de> serde::Deserialize<'de> for Field {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3183,7 +3843,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                             "nullable" => Ok(GeneratedField::Nullable),
                             "children" => Ok(GeneratedField::Children),
                             "metadata" => Ok(GeneratedField::Metadata),
-                            "dictOrdered" | "dict_ordered" => Ok(GeneratedField::DictOrdered),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -3208,7 +3867,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                 let mut nullable__ = None;
                 let mut children__ = None;
                 let mut metadata__ = None;
-                let mut dict_ordered__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Name => {
@@ -3243,12 +3901,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                                 map_.next_value::<std::collections::HashMap<_, _>>()?
                             );
                         }
-                        GeneratedField::DictOrdered => {
-                            if dict_ordered__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("dictOrdered"));
-                            }
-                            dict_ordered__ = Some(map_.next_value()?);
-                        }
                     }
                 }
                 Ok(Field {
@@ -3257,7 +3909,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                     nullable: nullable__.unwrap_or_default(),
                     children: children__.unwrap_or_default(),
                     metadata: metadata__.unwrap_or_default(),
-                    dict_ordered: dict_ordered__.unwrap_or_default(),
                 })
             }
         }
@@ -3313,7 +3964,7 @@ impl<'de> serde::Deserialize<'de> for FixedSizeList {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3423,7 +4074,7 @@ impl<'de> serde::Deserialize<'de> for IntervalDayTimeValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3545,7 +4196,7 @@ impl<'de> serde::Deserialize<'de> for IntervalMonthDayNanoValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3649,7 +4300,7 @@ impl<'de> serde::Deserialize<'de> for IntervalUnit {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = IntervalUnit;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3721,7 +4372,7 @@ impl<'de> serde::Deserialize<'de> for JoinConstraint {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = JoinConstraint;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3794,7 +4445,7 @@ impl<'de> serde::Deserialize<'de> for JoinSide {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = JoinSide;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3856,6 +4507,7 @@ impl serde::Serialize for JoinType {
             Self::Rightsemi => "RIGHTSEMI",
             Self::Rightanti => "RIGHTANTI",
             Self::Leftmark => "LEFTMARK",
+            Self::Rightmark => "RIGHTMARK",
         };
         serializer.serialize_str(variant)
     }
@@ -3876,11 +4528,12 @@ impl<'de> serde::Deserialize<'de> for JoinType {
             "RIGHTSEMI",
             "RIGHTANTI",
             "LEFTMARK",
+            "RIGHTMARK",
         ];
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = JoinType;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3925,6 +4578,7 @@ impl<'de> serde::Deserialize<'de> for JoinType {
                     "RIGHTSEMI" => Ok(JoinType::Rightsemi),
                     "RIGHTANTI" => Ok(JoinType::Rightanti),
                     "LEFTMARK" => Ok(JoinType::Leftmark),
+                    "RIGHTMARK" => Ok(JoinType::Rightmark),
                     _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
                 }
             }
@@ -3946,6 +4600,12 @@ impl serde::Serialize for JsonOptions {
         if self.schema_infer_max_rec.is_some() {
             len += 1;
         }
+        if self.compression_level.is_some() {
+            len += 1;
+        }
+        if self.newline_delimited.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.JsonOptions", len)?;
         if self.compression != 0 {
             let v = CompressionTypeVariant::try_from(self.compression)
@@ -3957,6 +4617,12 @@ impl serde::Serialize for JsonOptions {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("schemaInferMaxRec", ToString::to_string(&v).as_str())?;
         }
+        if let Some(v) = self.compression_level.as_ref() {
+            struct_ser.serialize_field("compressionLevel", v)?;
+        }
+        if let Some(v) = self.newline_delimited.as_ref() {
+            struct_ser.serialize_field("newlineDelimited", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -3970,12 +4636,18 @@ impl<'de> serde::Deserialize<'de> for JsonOptions {
             "compression",
             "schema_infer_max_rec",
             "schemaInferMaxRec",
+            "compression_level",
+            "compressionLevel",
+            "newline_delimited",
+            "newlineDelimited",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Compression,
             SchemaInferMaxRec,
+            CompressionLevel,
+            NewlineDelimited,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -3984,7 +4656,7 @@ impl<'de> serde::Deserialize<'de> for JsonOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3999,6 +4671,8 @@ impl<'de> serde::Deserialize<'de> for JsonOptions {
                         match value {
                             "compression" => Ok(GeneratedField::Compression),
                             "schemaInferMaxRec" | "schema_infer_max_rec" => Ok(GeneratedField::SchemaInferMaxRec),
+                            "compressionLevel" | "compression_level" => Ok(GeneratedField::CompressionLevel),
+                            "newlineDelimited" | "newline_delimited" => Ok(GeneratedField::NewlineDelimited),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -4020,6 +4694,8 @@ impl<'de> serde::Deserialize<'de> for JsonOptions {
             {
                 let mut compression__ = None;
                 let mut schema_infer_max_rec__ = None;
+                let mut compression_level__ = None;
+                let mut newline_delimited__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Compression => {
@@ -4036,11 +4712,27 @@ impl<'de> serde::Deserialize<'de> for JsonOptions {
                                 map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::CompressionLevel => {
+                            if compression_level__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("compressionLevel"));
+                            }
+                            compression_level__ = 
+                                map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
+                            ;
+                        }
+                        GeneratedField::NewlineDelimited => {
+                            if newline_delimited__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("newlineDelimited"));
+                            }
+                            newline_delimited__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(JsonOptions {
                     compression: compression__.unwrap_or_default(),
                     schema_infer_max_rec: schema_infer_max_rec__,
+                    compression_level: compression_level__,
+                    newline_delimited: newline_delimited__,
                 })
             }
         }
@@ -4088,7 +4780,7 @@ impl<'de> serde::Deserialize<'de> for JsonWriterOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4180,7 +4872,7 @@ impl<'de> serde::Deserialize<'de> for List {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4281,7 +4973,7 @@ impl<'de> serde::Deserialize<'de> for Map {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4381,7 +5073,7 @@ impl<'de> serde::Deserialize<'de> for NdJsonFormat {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4406,31 +5098,102 @@ impl<'de> serde::Deserialize<'de> for NdJsonFormat {
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
             type Value = NdJsonFormat;
 
-            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion_common.NdJsonFormat")
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.NdJsonFormat")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<NdJsonFormat, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut options__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Options => {
+                            if options__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("options"));
+                            }
+                            options__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(NdJsonFormat {
+                    options: options__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.NdJsonFormat", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for NullEquality {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let variant = match self {
+            Self::NullEqualsNothing => "NULL_EQUALS_NOTHING",
+            Self::NullEqualsNull => "NULL_EQUALS_NULL",
+        };
+        serializer.serialize_str(variant)
+    }
+}
+impl<'de> serde::Deserialize<'de> for NullEquality {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "NULL_EQUALS_NOTHING",
+            "NULL_EQUALS_NULL",
+        ];
+
+        struct GeneratedVisitor;
+
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
+            type Value = NullEquality;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(formatter, "expected one of: {:?}", &FIELDS)
+            }
+
+            fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
+                    })
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
+                    })
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<NdJsonFormat, V::Error>
-                where
-                    V: serde::de::MapAccess<'de>,
+            fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
             {
-                let mut options__ = None;
-                while let Some(k) = map_.next_key()? {
-                    match k {
-                        GeneratedField::Options => {
-                            if options__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("options"));
-                            }
-                            options__ = map_.next_value()?;
-                        }
-                    }
+                match value {
+                    "NULL_EQUALS_NOTHING" => Ok(NullEquality::NullEqualsNothing),
+                    "NULL_EQUALS_NULL" => Ok(NullEquality::NullEqualsNull),
+                    _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
                 }
-                Ok(NdJsonFormat {
-                    options: options__,
-                })
             }
         }
-        deserializer.deserialize_struct("datafusion_common.NdJsonFormat", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_any(GeneratedVisitor)
     }
 }
 impl serde::Serialize for ParquetColumnOptions {
@@ -4462,9 +5225,6 @@ impl serde::Serialize for ParquetColumnOptions {
         if self.bloom_filter_ndv_opt.is_some() {
             len += 1;
         }
-        if self.max_statistics_size_opt.is_some() {
-            len += 1;
-        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?;
         if let Some(v) = self.bloom_filter_enabled_opt.as_ref() {
             match v {
@@ -4517,13 +5277,6 @@ impl serde::Serialize for ParquetColumnOptions {
                 }
             }
         }
-        if let Some(v) = self.max_statistics_size_opt.as_ref() {
-            match v {
-                parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
-                    struct_ser.serialize_field("maxStatisticsSize", v)?;
-                }
-            }
-        }
         struct_ser.end()
     }
 }
@@ -4546,8 +5299,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
             "bloomFilterFpp",
             "bloom_filter_ndv",
             "bloomFilterNdv",
-            "max_statistics_size",
-            "maxStatisticsSize",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -4559,7 +5310,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
             StatisticsEnabled,
             BloomFilterFpp,
             BloomFilterNdv,
-            MaxStatisticsSize,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -4568,7 +5318,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4588,7 +5338,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
                             "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
                             "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
                             "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv),
-                            "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -4615,7 +5364,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
                 let mut statistics_enabled_opt__ = None;
                 let mut bloom_filter_fpp_opt__ = None;
                 let mut bloom_filter_ndv_opt__ = None;
-                let mut max_statistics_size_opt__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::BloomFilterEnabled => {
@@ -4660,12 +5408,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
                             }
                             bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0));
                         }
-                        GeneratedField::MaxStatisticsSize => {
-                            if max_statistics_size_opt__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
-                            }
-                            max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
-                        }
                     }
                 }
                 Ok(ParquetColumnOptions {
@@ -4676,7 +5418,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
                     statistics_enabled_opt: statistics_enabled_opt__,
                     bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
                     bloom_filter_ndv_opt: bloom_filter_ndv_opt__,
-                    max_statistics_size_opt: max_statistics_size_opt__,
                 })
             }
         }
@@ -4731,7 +5472,7 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnSpecificOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4831,7 +5572,7 @@ impl<'de> serde::Deserialize<'de> for ParquetFormat {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4906,6 +5647,9 @@ impl serde::Serialize for ParquetOptions {
         if self.reorder_filters {
             len += 1;
         }
+        if self.force_filter_selections {
+            len += 1;
+        }
         if self.data_pagesize_limit != 0 {
             len += 1;
         }
@@ -4963,9 +5707,6 @@ impl serde::Serialize for ParquetOptions {
         if self.statistics_enabled_opt.is_some() {
             len += 1;
         }
-        if self.max_statistics_size_opt.is_some() {
-            len += 1;
-        }
         if self.column_index_truncate_length_opt.is_some() {
             len += 1;
         }
@@ -4984,6 +5725,9 @@ impl serde::Serialize for ParquetOptions {
         if self.coerce_int96_opt.is_some() {
             len += 1;
         }
+        if self.max_predicate_cache_size_opt.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetOptions", len)?;
         if self.enable_page_index {
             struct_ser.serialize_field("enablePageIndex", &self.enable_page_index)?;
@@ -5000,6 +5744,9 @@ impl serde::Serialize for ParquetOptions {
         if self.reorder_filters {
             struct_ser.serialize_field("reorderFilters", &self.reorder_filters)?;
         }
+        if self.force_filter_selections {
+            struct_ser.serialize_field("forceFilterSelections", &self.force_filter_selections)?;
+        }
         if self.data_pagesize_limit != 0 {
             #[allow(clippy::needless_borrow)]
             #[allow(clippy::needless_borrows_for_generic_args)]
@@ -5089,15 +5836,6 @@ impl serde::Serialize for ParquetOptions {
                 }
             }
         }
-        if let Some(v) = self.max_statistics_size_opt.as_ref() {
-            match v {
-                parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
-                    #[allow(clippy::needless_borrow)]
-                    #[allow(clippy::needless_borrows_for_generic_args)]
-                    struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?;
-                }
-            }
-        }
         if let Some(v) = self.column_index_truncate_length_opt.as_ref() {
             match v {
                 parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => {
@@ -5146,6 +5884,15 @@ impl serde::Serialize for ParquetOptions {
                 }
             }
         }
+        if let Some(v) = self.max_predicate_cache_size_opt.as_ref() {
+            match v {
+                parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v) => {
+                    #[allow(clippy::needless_borrow)]
+                    #[allow(clippy::needless_borrows_for_generic_args)]
+                    struct_ser.serialize_field("maxPredicateCacheSize", ToString::to_string(&v).as_str())?;
+                }
+            }
+        }
         struct_ser.end()
     }
 }
@@ -5165,6 +5912,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             "pushdownFilters",
             "reorder_filters",
             "reorderFilters",
+            "force_filter_selections",
+            "forceFilterSelections",
             "data_pagesize_limit",
             "dataPagesizeLimit",
             "write_batch_size",
@@ -5202,8 +5951,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             "dictionaryEnabled",
             "statistics_enabled",
             "statisticsEnabled",
-            "max_statistics_size",
-            "maxStatisticsSize",
             "column_index_truncate_length",
             "columnIndexTruncateLength",
             "statistics_truncate_length",
@@ -5215,6 +5962,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             "bloomFilterNdv",
             "coerce_int96",
             "coerceInt96",
+            "max_predicate_cache_size",
+            "maxPredicateCacheSize",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -5224,6 +5973,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             SkipMetadata,
             PushdownFilters,
             ReorderFilters,
+            ForceFilterSelections,
             DataPagesizeLimit,
             WriteBatchSize,
             WriterVersion,
@@ -5243,13 +5993,13 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             Compression,
             DictionaryEnabled,
             StatisticsEnabled,
-            MaxStatisticsSize,
             ColumnIndexTruncateLength,
             StatisticsTruncateLength,
             Encoding,
             BloomFilterFpp,
             BloomFilterNdv,
             CoerceInt96,
+            MaxPredicateCacheSize,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -5258,7 +6008,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5276,6 +6026,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             "skipMetadata" | "skip_metadata" => Ok(GeneratedField::SkipMetadata),
                             "pushdownFilters" | "pushdown_filters" => Ok(GeneratedField::PushdownFilters),
                             "reorderFilters" | "reorder_filters" => Ok(GeneratedField::ReorderFilters),
+                            "forceFilterSelections" | "force_filter_selections" => Ok(GeneratedField::ForceFilterSelections),
                             "dataPagesizeLimit" | "data_pagesize_limit" => Ok(GeneratedField::DataPagesizeLimit),
                             "writeBatchSize" | "write_batch_size" => Ok(GeneratedField::WriteBatchSize),
                             "writerVersion" | "writer_version" => Ok(GeneratedField::WriterVersion),
@@ -5295,13 +6046,13 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             "compression" => Ok(GeneratedField::Compression),
                             "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled),
                             "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
-                            "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
                             "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength),
                             "statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength),
                             "encoding" => Ok(GeneratedField::Encoding),
                             "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
                             "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv),
                             "coerceInt96" | "coerce_int96" => Ok(GeneratedField::CoerceInt96),
+                            "maxPredicateCacheSize" | "max_predicate_cache_size" => Ok(GeneratedField::MaxPredicateCacheSize),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -5326,6 +6077,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                 let mut skip_metadata__ = None;
                 let mut pushdown_filters__ = None;
                 let mut reorder_filters__ = None;
+                let mut force_filter_selections__ = None;
                 let mut data_pagesize_limit__ = None;
                 let mut write_batch_size__ = None;
                 let mut writer_version__ = None;
@@ -5345,13 +6097,13 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                 let mut compression_opt__ = None;
                 let mut dictionary_enabled_opt__ = None;
                 let mut statistics_enabled_opt__ = None;
-                let mut max_statistics_size_opt__ = None;
                 let mut column_index_truncate_length_opt__ = None;
                 let mut statistics_truncate_length_opt__ = None;
                 let mut encoding_opt__ = None;
                 let mut bloom_filter_fpp_opt__ = None;
                 let mut bloom_filter_ndv_opt__ = None;
                 let mut coerce_int96_opt__ = None;
+                let mut max_predicate_cache_size_opt__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::EnablePageIndex => {
@@ -5384,6 +6136,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             }
                             reorder_filters__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::ForceFilterSelections => {
+                            if force_filter_selections__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("forceFilterSelections"));
+                            }
+                            force_filter_selections__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::DataPagesizeLimit => {
                             if data_pagesize_limit__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("dataPagesizeLimit"));
@@ -5512,12 +6270,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             }
                             statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled);
                         }
-                        GeneratedField::MaxStatisticsSize => {
-                            if max_statistics_size_opt__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
-                            }
-                            max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
-                        }
                         GeneratedField::ColumnIndexTruncateLength => {
                             if column_index_truncate_length_opt__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength"));
@@ -5554,6 +6306,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             }
                             coerce_int96_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::CoerceInt96Opt::CoerceInt96);
                         }
+                        GeneratedField::MaxPredicateCacheSize => {
+                            if max_predicate_cache_size_opt__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("maxPredicateCacheSize"));
+                            }
+                            max_predicate_cache_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(x.0));
+                        }
                     }
                 }
                 Ok(ParquetOptions {
@@ -5562,6 +6320,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                     skip_metadata: skip_metadata__.unwrap_or_default(),
                     pushdown_filters: pushdown_filters__.unwrap_or_default(),
                     reorder_filters: reorder_filters__.unwrap_or_default(),
+                    force_filter_selections: force_filter_selections__.unwrap_or_default(),
                     data_pagesize_limit: data_pagesize_limit__.unwrap_or_default(),
                     write_batch_size: write_batch_size__.unwrap_or_default(),
                     writer_version: writer_version__.unwrap_or_default(),
@@ -5581,13 +6340,13 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                     compression_opt: compression_opt__,
                     dictionary_enabled_opt: dictionary_enabled_opt__,
                     statistics_enabled_opt: statistics_enabled_opt__,
-                    max_statistics_size_opt: max_statistics_size_opt__,
                     column_index_truncate_length_opt: column_index_truncate_length_opt__,
                     statistics_truncate_length_opt: statistics_truncate_length_opt__,
                     encoding_opt: encoding_opt__,
                     bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
                     bloom_filter_ndv_opt: bloom_filter_ndv_opt__,
                     coerce_int96_opt: coerce_int96_opt__,
+                    max_predicate_cache_size_opt: max_predicate_cache_size_opt__,
                 })
             }
         }
@@ -5644,7 +6403,7 @@ impl<'de> serde::Deserialize<'de> for Precision {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5733,7 +6492,7 @@ impl<'de> serde::Deserialize<'de> for PrecisionInfo {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = PrecisionInfo;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5818,7 +6577,7 @@ impl<'de> serde::Deserialize<'de> for PrimaryKeyConstraint {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5873,6 +6632,116 @@ impl<'de> serde::Deserialize<'de> for PrimaryKeyConstraint {
         deserializer.deserialize_struct("datafusion_common.PrimaryKeyConstraint", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for RunEndEncoded {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.run_ends_field.is_some() {
+            len += 1;
+        }
+        if self.values_field.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.RunEndEncoded", len)?;
+        if let Some(v) = self.run_ends_field.as_ref() {
+            struct_ser.serialize_field("runEndsField", v)?;
+        }
+        if let Some(v) = self.values_field.as_ref() {
+            struct_ser.serialize_field("valuesField", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for RunEndEncoded {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "run_ends_field",
+            "runEndsField",
+            "values_field",
+            "valuesField",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            RunEndsField,
+            ValuesField,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "runEndsField" | "run_ends_field" => Ok(GeneratedField::RunEndsField),
+                            "valuesField" | "values_field" => Ok(GeneratedField::ValuesField),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = RunEndEncoded;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.RunEndEncoded")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<RunEndEncoded, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut run_ends_field__ = None;
+                let mut values_field__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::RunEndsField => {
+                            if run_ends_field__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("runEndsField"));
+                            }
+                            run_ends_field__ = map_.next_value()?;
+                        }
+                        GeneratedField::ValuesField => {
+                            if values_field__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("valuesField"));
+                            }
+                            values_field__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(RunEndEncoded {
+                    run_ends_field: run_ends_field__,
+                    values_field: values_field__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.RunEndEncoded", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for ScalarDictionaryValue {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -5921,7 +6790,7 @@ impl<'de> serde::Deserialize<'de> for ScalarDictionaryValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6031,7 +6900,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFixedSizeBinary {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6165,7 +7034,7 @@ impl<'de> serde::Deserialize<'de> for ScalarNestedValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6301,7 +7170,7 @@ impl<'de> serde::Deserialize<'de> for scalar_nested_value::Dictionary {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6366,6 +7235,133 @@ impl<'de> serde::Deserialize<'de> for scalar_nested_value::Dictionary {
         deserializer.deserialize_struct("datafusion_common.ScalarNestedValue.Dictionary", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for ScalarRunEndEncodedValue {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.run_ends_field.is_some() {
+            len += 1;
+        }
+        if self.values_field.is_some() {
+            len += 1;
+        }
+        if self.value.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion_common.ScalarRunEndEncodedValue", len)?;
+        if let Some(v) = self.run_ends_field.as_ref() {
+            struct_ser.serialize_field("runEndsField", v)?;
+        }
+        if let Some(v) = self.values_field.as_ref() {
+            struct_ser.serialize_field("valuesField", v)?;
+        }
+        if let Some(v) = self.value.as_ref() {
+            struct_ser.serialize_field("value", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for ScalarRunEndEncodedValue {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "run_ends_field",
+            "runEndsField",
+            "values_field",
+            "valuesField",
+            "value",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            RunEndsField,
+            ValuesField,
+            Value,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "runEndsField" | "run_ends_field" => Ok(GeneratedField::RunEndsField),
+                            "valuesField" | "values_field" => Ok(GeneratedField::ValuesField),
+                            "value" => Ok(GeneratedField::Value),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = ScalarRunEndEncodedValue;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion_common.ScalarRunEndEncodedValue")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<ScalarRunEndEncodedValue, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut run_ends_field__ = None;
+                let mut values_field__ = None;
+                let mut value__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::RunEndsField => {
+                            if run_ends_field__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("runEndsField"));
+                            }
+                            run_ends_field__ = map_.next_value()?;
+                        }
+                        GeneratedField::ValuesField => {
+                            if values_field__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("valuesField"));
+                            }
+                            values_field__ = map_.next_value()?;
+                        }
+                        GeneratedField::Value => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("value"));
+                            }
+                            value__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(ScalarRunEndEncodedValue {
+                    run_ends_field: run_ends_field__,
+                    values_field: values_field__,
+                    value: value__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion_common.ScalarRunEndEncodedValue", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for ScalarTime32Value {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -6416,7 +7412,7 @@ impl<'de> serde::Deserialize<'de> for ScalarTime32Value {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6529,7 +7525,7 @@ impl<'de> serde::Deserialize<'de> for ScalarTime64Value {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6666,7 +7662,7 @@ impl<'de> serde::Deserialize<'de> for ScalarTimestampValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6832,6 +7828,12 @@ impl serde::Serialize for ScalarValue {
                 scalar_value::Value::MapValue(v) => {
                     struct_ser.serialize_field("mapValue", v)?;
                 }
+                scalar_value::Value::Decimal32Value(v) => {
+                    struct_ser.serialize_field("decimal32Value", v)?;
+                }
+                scalar_value::Value::Decimal64Value(v) => {
+                    struct_ser.serialize_field("decimal64Value", v)?;
+                }
                 scalar_value::Value::Decimal128Value(v) => {
                     struct_ser.serialize_field("decimal128Value", v)?;
                 }
@@ -6902,6 +7904,9 @@ impl serde::Serialize for ScalarValue {
                 scalar_value::Value::UnionValue(v) => {
                     struct_ser.serialize_field("unionValue", v)?;
                 }
+                scalar_value::Value::RunEndEncodedValue(v) => {
+                    struct_ser.serialize_field("runEndEncodedValue", v)?;
+                }
             }
         }
         struct_ser.end()
@@ -6958,6 +7963,10 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
             "structValue",
             "map_value",
             "mapValue",
+            "decimal32_value",
+            "decimal32Value",
+            "decimal64_value",
+            "decimal64Value",
             "decimal128_value",
             "decimal128Value",
             "decimal256_value",
@@ -6994,6 +8003,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
             "fixedSizeBinaryValue",
             "union_value",
             "unionValue",
+            "run_end_encoded_value",
+            "runEndEncodedValue",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -7020,6 +8031,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
             FixedSizeListValue,
             StructValue,
             MapValue,
+            Decimal32Value,
+            Decimal64Value,
             Decimal128Value,
             Decimal256Value,
             Date64Value,
@@ -7038,6 +8051,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
             IntervalMonthDayNano,
             FixedSizeBinaryValue,
             UnionValue,
+            RunEndEncodedValue,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -7046,7 +8060,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7081,6 +8095,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
                             "fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue),
                             "structValue" | "struct_value" => Ok(GeneratedField::StructValue),
                             "mapValue" | "map_value" => Ok(GeneratedField::MapValue),
+                            "decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value),
+                            "decimal64Value" | "decimal64_value" => Ok(GeneratedField::Decimal64Value),
                             "decimal128Value" | "decimal128_value" => Ok(GeneratedField::Decimal128Value),
                             "decimal256Value" | "decimal256_value" => Ok(GeneratedField::Decimal256Value),
                             "date64Value" | "date_64_value" => Ok(GeneratedField::Date64Value),
@@ -7099,6 +8115,7 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
                             "intervalMonthDayNano" | "interval_month_day_nano" => Ok(GeneratedField::IntervalMonthDayNano),
                             "fixedSizeBinaryValue" | "fixed_size_binary_value" => Ok(GeneratedField::FixedSizeBinaryValue),
                             "unionValue" | "union_value" => Ok(GeneratedField::UnionValue),
+                            "runEndEncodedValue" | "run_end_encoded_value" => Ok(GeneratedField::RunEndEncodedValue),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -7258,6 +8275,20 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
                                 return Err(serde::de::Error::duplicate_field("mapValue"));
                             }
                             value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::MapValue)
+;
+                        }
+                        GeneratedField::Decimal32Value => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("decimal32Value"));
+                            }
+                            value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal32Value)
+;
+                        }
+                        GeneratedField::Decimal64Value => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("decimal64Value"));
+                            }
+                            value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal64Value)
 ;
                         }
                         GeneratedField::Decimal128Value => {
@@ -7375,6 +8406,13 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
                                 return Err(serde::de::Error::duplicate_field("unionValue"));
                             }
                             value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::UnionValue)
+;
+                        }
+                        GeneratedField::RunEndEncodedValue => {
+                            if value__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("runEndEncodedValue"));
+                            }
+                            value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::RunEndEncodedValue)
 ;
                         }
                     }
@@ -7434,7 +8472,7 @@ impl<'de> serde::Deserialize<'de> for Schema {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7555,7 +8593,7 @@ impl<'de> serde::Deserialize<'de> for Statistics {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7665,7 +8703,7 @@ impl<'de> serde::Deserialize<'de> for Struct {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7774,7 +8812,7 @@ impl<'de> serde::Deserialize<'de> for TableParquetOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7876,7 +8914,7 @@ impl<'de> serde::Deserialize<'de> for TimeUnit {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = TimeUnit;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7973,7 +9011,7 @@ impl<'de> serde::Deserialize<'de> for Timestamp {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8094,7 +9132,7 @@ impl<'de> serde::Deserialize<'de> for Union {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8215,7 +9253,7 @@ impl<'de> serde::Deserialize<'de> for UnionField {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8304,7 +9342,7 @@ impl<'de> serde::Deserialize<'de> for UnionMode {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = UnionMode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8415,7 +9453,7 @@ impl<'de> serde::Deserialize<'de> for UnionValue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8535,7 +9573,7 @@ impl<'de> serde::Deserialize<'de> for UniqueConstraint {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs
index e029327d481d1..a09826a29be52 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -1,10 +1,10 @@
 // This file is @generated by prost-build.
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ColumnRelation {
     #[prost(string, tag = "1")]
     pub relation: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Column {
     #[prost(string, tag = "1")]
     pub name: ::prost::alloc::string::String,
@@ -28,7 +28,7 @@ pub struct DfSchema {
         ::prost::alloc::string::String,
     >,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvFormat {
     #[prost(message, optional, tag = "5")]
     pub options: ::core::option::Option<CsvOptions>,
@@ -38,31 +38,33 @@ pub struct ParquetFormat {
     #[prost(message, optional, tag = "2")]
     pub options: ::core::option::Option<TableParquetOptions>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AvroFormat {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct NdJsonFormat {
     #[prost(message, optional, tag = "1")]
     pub options: ::core::option::Option<JsonOptions>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct ArrowFormat {}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PrimaryKeyConstraint {
     #[prost(uint64, repeated, tag = "1")]
     pub indices: ::prost::alloc::vec::Vec<u64>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct UniqueConstraint {
     #[prost(uint64, repeated, tag = "1")]
     pub indices: ::prost::alloc::vec::Vec<u64>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Constraint {
     #[prost(oneof = "constraint::ConstraintMode", tags = "1, 2")]
     pub constraint_mode: ::core::option::Option<constraint::ConstraintMode>,
 }
 /// Nested message and enum types in `Constraint`.
 pub mod constraint {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum ConstraintMode {
         #[prost(message, tag = "1")]
         PrimaryKey(super::PrimaryKeyConstraint),
@@ -75,9 +77,9 @@ pub struct Constraints {
     #[prost(message, repeated, tag = "1")]
     pub constraints: ::prost::alloc::vec::Vec<Constraint>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AvroOptions {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ArrowOptions {}
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Schema {
@@ -106,24 +108,36 @@ pub struct Field {
         ::prost::alloc::string::String,
         ::prost::alloc::string::String,
     >,
-    #[prost(bool, tag = "6")]
-    pub dict_ordered: bool,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Timestamp {
     #[prost(enumeration = "TimeUnit", tag = "1")]
     pub time_unit: i32,
     #[prost(string, tag = "2")]
     pub timezone: ::prost::alloc::string::String,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
-pub struct Decimal {
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal32Type {
+    #[prost(uint32, tag = "3")]
+    pub precision: u32,
+    #[prost(int32, tag = "4")]
+    pub scale: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal64Type {
+    #[prost(uint32, tag = "3")]
+    pub precision: u32,
+    #[prost(int32, tag = "4")]
+    pub scale: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal128Type {
     #[prost(uint32, tag = "3")]
     pub precision: u32,
     #[prost(int32, tag = "4")]
     pub scale: i32,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal256Type {
     #[prost(uint32, tag = "3")]
     pub precision: u32,
@@ -162,6 +176,13 @@ pub struct Map {
     pub keys_sorted: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct RunEndEncoded {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub run_ends_field: ::core::option::Option<::prost::alloc::boxed::Box<Field>>,
+    #[prost(message, optional, boxed, tag = "2")]
+    pub values_field: ::core::option::Option<::prost::alloc::boxed::Box<Field>>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Union {
     #[prost(message, repeated, tag = "1")]
     pub union_types: ::prost::alloc::vec::Vec<Field>,
@@ -184,7 +205,7 @@ pub struct ScalarNestedValue {
 }
 /// Nested message and enum types in `ScalarNestedValue`.
 pub mod scalar_nested_value {
-    #[derive(Clone, PartialEq, ::prost::Message)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
     pub struct Dictionary {
         #[prost(bytes = "vec", tag = "1")]
         pub ipc_message: ::prost::alloc::vec::Vec<u8>,
@@ -192,14 +213,14 @@ pub mod scalar_nested_value {
         pub arrow_data: ::prost::alloc::vec::Vec<u8>,
     }
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTime32Value {
     #[prost(oneof = "scalar_time32_value::Value", tags = "1, 2")]
     pub value: ::core::option::Option<scalar_time32_value::Value>,
 }
 /// Nested message and enum types in `ScalarTime32Value`.
 pub mod scalar_time32_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int32, tag = "1")]
         Time32SecondValue(i32),
@@ -207,14 +228,14 @@ pub mod scalar_time32_value {
         Time32MillisecondValue(i32),
     }
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTime64Value {
     #[prost(oneof = "scalar_time64_value::Value", tags = "1, 2")]
     pub value: ::core::option::Option<scalar_time64_value::Value>,
 }
 /// Nested message and enum types in `ScalarTime64Value`.
 pub mod scalar_time64_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int64, tag = "1")]
         Time64MicrosecondValue(i64),
@@ -222,7 +243,7 @@ pub mod scalar_time64_value {
         Time64NanosecondValue(i64),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTimestampValue {
     #[prost(string, tag = "5")]
     pub timezone: ::prost::alloc::string::String,
@@ -231,7 +252,7 @@ pub struct ScalarTimestampValue {
 }
 /// Nested message and enum types in `ScalarTimestampValue`.
 pub mod scalar_timestamp_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int64, tag = "1")]
         TimeMicrosecondValue(i64),
@@ -250,14 +271,23 @@ pub struct ScalarDictionaryValue {
     #[prost(message, optional, boxed, tag = "2")]
     pub value: ::core::option::Option<::prost::alloc::boxed::Box<ScalarValue>>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ScalarRunEndEncodedValue {
+    #[prost(message, optional, tag = "1")]
+    pub run_ends_field: ::core::option::Option<Field>,
+    #[prost(message, optional, tag = "2")]
+    pub values_field: ::core::option::Option<Field>,
+    #[prost(message, optional, boxed, tag = "3")]
+    pub value: ::core::option::Option<::prost::alloc::boxed::Box<ScalarValue>>,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct IntervalDayTimeValue {
     #[prost(int32, tag = "1")]
     pub days: i32,
     #[prost(int32, tag = "2")]
     pub milliseconds: i32,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct IntervalMonthDayNanoValue {
     #[prost(int32, tag = "1")]
     pub months: i32,
@@ -286,7 +316,7 @@ pub struct UnionValue {
     #[prost(enumeration = "UnionMode", tag = "4")]
     pub mode: i32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarFixedSizeBinary {
     #[prost(bytes = "vec", tag = "1")]
     pub values: ::prost::alloc::vec::Vec<u8>,
@@ -297,7 +327,7 @@ pub struct ScalarFixedSizeBinary {
 pub struct ScalarValue {
     #[prost(
         oneof = "scalar_value::Value",
-        tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42"
+        tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42, 45"
     )]
     pub value: ::core::option::Option<scalar_value::Value>,
 }
@@ -352,6 +382,10 @@ pub mod scalar_value {
         StructValue(super::ScalarNestedValue),
         #[prost(message, tag = "41")]
         MapValue(super::ScalarNestedValue),
+        #[prost(message, tag = "43")]
+        Decimal32Value(super::Decimal32),
+        #[prost(message, tag = "44")]
+        Decimal64Value(super::Decimal64),
         #[prost(message, tag = "20")]
         Decimal128Value(super::Decimal128),
         #[prost(message, tag = "39")]
@@ -388,9 +422,29 @@ pub mod scalar_value {
         FixedSizeBinaryValue(super::ScalarFixedSizeBinary),
         #[prost(message, tag = "42")]
         UnionValue(::prost::alloc::boxed::Box<super::UnionValue>),
+        #[prost(message, tag = "45")]
+        RunEndEncodedValue(::prost::alloc::boxed::Box<super::ScalarRunEndEncodedValue>),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal32 {
+    #[prost(bytes = "vec", tag = "1")]
+    pub value: ::prost::alloc::vec::Vec<u8>,
+    #[prost(int64, tag = "2")]
+    pub p: i64,
+    #[prost(int64, tag = "3")]
+    pub s: i64,
+}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal64 {
+    #[prost(bytes = "vec", tag = "1")]
+    pub value: ::prost::alloc::vec::Vec<u8>,
+    #[prost(int64, tag = "2")]
+    pub p: i64,
+    #[prost(int64, tag = "3")]
+    pub s: i64,
+}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal128 {
     #[prost(bytes = "vec", tag = "1")]
     pub value: ::prost::alloc::vec::Vec<u8>,
@@ -399,7 +453,7 @@ pub struct Decimal128 {
     #[prost(int64, tag = "3")]
     pub s: i64,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal256 {
     #[prost(bytes = "vec", tag = "1")]
     pub value: ::prost::alloc::vec::Vec<u8>,
@@ -413,7 +467,7 @@ pub struct Decimal256 {
 pub struct ArrowType {
     #[prost(
         oneof = "arrow_type::ArrowTypeEnum",
-        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33"
+        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33, 42"
     )]
     pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>,
 }
@@ -480,8 +534,12 @@ pub mod arrow_type {
         Time64(i32),
         #[prost(enumeration = "super::IntervalUnit", tag = "23")]
         Interval(i32),
+        #[prost(message, tag = "40")]
+        Decimal32(super::Decimal32Type),
+        #[prost(message, tag = "41")]
+        Decimal64(super::Decimal64Type),
         #[prost(message, tag = "24")]
-        Decimal(super::Decimal),
+        Decimal128(super::Decimal128Type),
         #[prost(message, tag = "36")]
         Decimal256(super::Decimal256Type),
         #[prost(message, tag = "25")]
@@ -498,6 +556,8 @@ pub mod arrow_type {
         Dictionary(::prost::alloc::boxed::Box<super::Dictionary>),
         #[prost(message, tag = "33")]
         Map(::prost::alloc::boxed::Box<super::Map>),
+        #[prost(message, tag = "42")]
+        RunEndEncoded(::prost::alloc::boxed::Box<super::RunEndEncoded>),
     }
 }
 /// Useful for representing an empty enum variant in rust
@@ -509,14 +569,14 @@ pub mod arrow_type {
 ///         i32 Two = 2;
 ///    }
 /// }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct EmptyMessage {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct JsonWriterOptions {
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
     pub compression: i32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvWriterOptions {
     /// Compression type
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
@@ -553,7 +613,7 @@ pub struct CsvWriterOptions {
     pub double_quote: bool,
 }
 /// Options controlling CSV format
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvOptions {
     /// Indicates if the CSV has a header row
     #[prost(bytes = "vec", tag = "1")]
@@ -606,9 +666,15 @@ pub struct CsvOptions {
     /// Optional terminator character as a byte
     #[prost(bytes = "vec", tag = "17")]
     pub terminator: ::prost::alloc::vec::Vec<u8>,
+    /// Indicates if truncated rows are allowed
+    #[prost(bytes = "vec", tag = "18")]
+    pub truncated_rows: ::prost::alloc::vec::Vec<u8>,
+    /// Optional compression level
+    #[prost(uint32, optional, tag = "19")]
+    pub compression_level: ::core::option::Option<u32>,
 }
 /// Options controlling CSV format
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct JsonOptions {
     /// Compression type
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
@@ -616,6 +682,12 @@ pub struct JsonOptions {
     /// Optional max records for schema inference
     #[prost(uint64, optional, tag = "2")]
     pub schema_infer_max_rec: ::core::option::Option<u64>,
+    /// Optional compression level
+    #[prost(uint32, optional, tag = "3")]
+    pub compression_level: ::core::option::Option<u32>,
+    /// Whether to read as newline-delimited JSON (default true). When false, expects JSON array format \[{},...\]
+    #[prost(bool, optional, tag = "4")]
+    pub newline_delimited: ::core::option::Option<bool>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct TableParquetOptions {
@@ -662,34 +734,30 @@ pub struct ParquetColumnOptions {
     pub bloom_filter_ndv_opt: ::core::option::Option<
         parquet_column_options::BloomFilterNdvOpt,
     >,
-    #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")]
-    pub max_statistics_size_opt: ::core::option::Option<
-        parquet_column_options::MaxStatisticsSizeOpt,
-    >,
 }
 /// Nested message and enum types in `ParquetColumnOptions`.
 pub mod parquet_column_options {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterEnabledOpt {
         #[prost(bool, tag = "1")]
         BloomFilterEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum EncodingOpt {
         #[prost(string, tag = "2")]
         Encoding(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum DictionaryEnabledOpt {
         #[prost(bool, tag = "3")]
         DictionaryEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CompressionOpt {
         #[prost(string, tag = "4")]
         Compression(::prost::alloc::string::String),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsEnabledOpt {
         #[prost(string, tag = "5")]
         StatisticsEnabled(::prost::alloc::string::String),
@@ -699,16 +767,11 @@ pub mod parquet_column_options {
         #[prost(double, tag = "6")]
         BloomFilterFpp(f64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterNdvOpt {
         #[prost(uint64, tag = "7")]
         BloomFilterNdv(u64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
-    pub enum MaxStatisticsSizeOpt {
-        #[prost(uint32, tag = "8")]
-        MaxStatisticsSize(u32),
-    }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ParquetOptions {
@@ -729,6 +792,9 @@ pub struct ParquetOptions {
     /// default = false
     #[prost(bool, tag = "6")]
     pub reorder_filters: bool,
+    /// default = false
+    #[prost(bool, tag = "34")]
+    pub force_filter_selections: bool,
     /// default = 1024 * 1024
     #[prost(uint64, tag = "7")]
     pub data_pagesize_limit: u64,
@@ -786,10 +852,6 @@ pub struct ParquetOptions {
     pub statistics_enabled_opt: ::core::option::Option<
         parquet_options::StatisticsEnabledOpt,
     >,
-    #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")]
-    pub max_statistics_size_opt: ::core::option::Option<
-        parquet_options::MaxStatisticsSizeOpt,
-    >,
     #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")]
     pub column_index_truncate_length_opt: ::core::option::Option<
         parquet_options::ColumnIndexTruncateLengthOpt,
@@ -806,45 +868,44 @@ pub struct ParquetOptions {
     pub bloom_filter_ndv_opt: ::core::option::Option<parquet_options::BloomFilterNdvOpt>,
     #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")]
     pub coerce_int96_opt: ::core::option::Option<parquet_options::CoerceInt96Opt>,
+    #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")]
+    pub max_predicate_cache_size_opt: ::core::option::Option<
+        parquet_options::MaxPredicateCacheSizeOpt,
+    >,
 }
 /// Nested message and enum types in `ParquetOptions`.
 pub mod parquet_options {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum MetadataSizeHintOpt {
         #[prost(uint64, tag = "4")]
         MetadataSizeHint(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CompressionOpt {
         #[prost(string, tag = "10")]
         Compression(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum DictionaryEnabledOpt {
         #[prost(bool, tag = "11")]
         DictionaryEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsEnabledOpt {
         #[prost(string, tag = "13")]
         StatisticsEnabled(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
-    pub enum MaxStatisticsSizeOpt {
-        #[prost(uint64, tag = "14")]
-        MaxStatisticsSize(u64),
-    }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum ColumnIndexTruncateLengthOpt {
         #[prost(uint64, tag = "17")]
         ColumnIndexTruncateLength(u64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsTruncateLengthOpt {
         #[prost(uint64, tag = "31")]
         StatisticsTruncateLength(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum EncodingOpt {
         #[prost(string, tag = "19")]
         Encoding(::prost::alloc::string::String),
@@ -854,16 +915,21 @@ pub mod parquet_options {
         #[prost(double, tag = "21")]
         BloomFilterFpp(f64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterNdvOpt {
         #[prost(uint64, tag = "22")]
         BloomFilterNdv(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CoerceInt96Opt {
         #[prost(string, tag = "32")]
         CoerceInt96(::prost::alloc::string::String),
     }
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
+    pub enum MaxPredicateCacheSizeOpt {
+        #[prost(uint64, tag = "33")]
+        MaxPredicateCacheSize(u64),
+    }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Precision {
@@ -893,6 +959,8 @@ pub struct ColumnStats {
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]
     pub distinct_count: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "6")]
+    pub byte_size: ::core::option::Option<Precision>,
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
@@ -906,6 +974,7 @@ pub enum JoinType {
     Rightsemi = 6,
     Rightanti = 7,
     Leftmark = 8,
+    Rightmark = 9,
 }
 impl JoinType {
     /// String value of the enum field names used in the ProtoBuf definition.
@@ -923,6 +992,7 @@ impl JoinType {
             Self::Rightsemi => "RIGHTSEMI",
             Self::Rightanti => "RIGHTANTI",
             Self::Leftmark => "LEFTMARK",
+            Self::Rightmark => "RIGHTMARK",
         }
     }
     /// Creates an enum from field names used in the ProtoBuf definition.
@@ -937,6 +1007,7 @@ impl JoinType {
             "RIGHTSEMI" => Some(Self::Rightsemi),
             "RIGHTANTI" => Some(Self::Rightanti),
             "LEFTMARK" => Some(Self::Leftmark),
+            "RIGHTMARK" => Some(Self::Rightmark),
             _ => None,
         }
     }
@@ -969,6 +1040,32 @@ impl JoinConstraint {
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
+pub enum NullEquality {
+    NullEqualsNothing = 0,
+    NullEqualsNull = 1,
+}
+impl NullEquality {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::NullEqualsNothing => "NULL_EQUALS_NOTHING",
+            Self::NullEqualsNull => "NULL_EQUALS_NULL",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "NULL_EQUALS_NOTHING" => Some(Self::NullEqualsNothing),
+            "NULL_EQUALS_NULL" => Some(Self::NullEqualsNull),
+            _ => None,
+        }
+    }
+}
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
 pub enum TimeUnit {
     Second = 0,
     Millisecond = 1,
diff --git a/datafusion/proto-common/src/lib.rs b/datafusion/proto-common/src/lib.rs
index 6400e4bdc66de..6f7fb7b89c0c4 100644
--- a/datafusion/proto-common/src/lib.rs
+++ b/datafusion/proto-common/src/lib.rs
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
@@ -62,28 +63,33 @@
 //! # use datafusion_proto_common::protobuf_common;
 //! # use prost::Message;
 //! # fn main() -> Result<()>{
-//!     // Create a new ScalarValue
-//!     let val = ScalarValue::UInt64(Some(3));
-//!     let mut buffer = BytesMut::new();
-//!     let protobuf: protobuf_common::ScalarValue = match val {
-//!         ScalarValue::UInt64(Some(val)) => {
-//!             protobuf_common::ScalarValue{value: Some(protobuf_common::scalar_value::Value::Uint64Value(val))}
-//!         }
-//!         _ => unreachable!(),
-//!     };
+//! // Create a new ScalarValue
+//! let val = ScalarValue::UInt64(Some(3));
+//! let mut buffer = BytesMut::new();
+//! let protobuf: protobuf_common::ScalarValue = match val {
+//!     ScalarValue::UInt64(Some(val)) => protobuf_common::ScalarValue {
+//!         value: Some(protobuf_common::scalar_value::Value::Uint64Value(val)),
+//!     },
+//!     _ => unreachable!(),
+//! };
 //!
-//!     protobuf.encode(&mut buffer)
+//! protobuf
+//!     .encode(&mut buffer)
 //!     .map_err(|e| plan_datafusion_err!("Error encoding protobuf as bytes: {e}"))?;
-//!     // Convert it to bytes (for sending over the network, etc.)
-//!     let bytes: Bytes = buffer.into();
+//! // Convert it to bytes (for sending over the network, etc.)
+//! let bytes: Bytes = buffer.into();
 //!
-//!     let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}"))?;
-//!     // Decode bytes from somewhere (over network, etc.) back to ScalarValue
-//!     let decoded_val: ScalarValue = match protobuf.value {
-//!         Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => ScalarValue::UInt64(Some(val)),
-//!         _ => unreachable!(),
-//!     };
-//!     assert_eq!(val, decoded_val);
+//! let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| {
+//!     plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}")
+//! })?;
+//! // Decode bytes from somewhere (over network, etc.) back to ScalarValue
+//! let decoded_val: ScalarValue = match protobuf.value {
+//!     Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => {
+//!         ScalarValue::UInt64(Some(val))
+//!     }
+//!     _ => unreachable!(),
+//! };
+//! assert_eq!(val, decoded_val);
 //! # Ok(())
 //! # }
 //! ```
diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs
index 28927cad03b4c..79e3306a4df1b 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -20,7 +20,7 @@ use std::sync::Arc;
 
 use crate::protobuf_common as protobuf;
 use crate::protobuf_common::{
-    arrow_type::ArrowTypeEnum, scalar_value::Value, EmptyMessage,
+    EmptyMessage, arrow_type::ArrowTypeEnum, scalar_value::Value,
 };
 use arrow::array::{ArrayRef, RecordBatch};
 use arrow::csv::WriterBuilder;
@@ -28,8 +28,12 @@ use arrow::datatypes::{
     DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema,
     SchemaRef, TimeUnit, UnionMode,
 };
-use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator};
+use arrow::ipc::writer::{
+    CompressionContext, DictionaryTracker, IpcDataGenerator, IpcWriteOptions,
+};
 use datafusion_common::{
+    Column, ColumnStatistics, Constraint, Constraints, DFSchema, DFSchemaRef,
+    DataFusionError, JoinSide, ScalarValue, Statistics,
     config::{
         CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
         TableParquetOptions,
@@ -38,8 +42,6 @@ use datafusion_common::{
     parsers::CompressionTypeVariant,
     plan_datafusion_err,
     stats::Precision,
-    Column, ColumnStatistics, Constraint, Constraints, DFSchema, DFSchemaRef,
-    DataFusionError, JoinSide, ScalarValue, Statistics,
 };
 
 #[derive(Debug)]
@@ -65,7 +67,7 @@ impl std::fmt::Display for Error {
                 write!(f, "{value:?} is invalid as a DataFusion scalar value")
             }
             Self::InvalidScalarType(data_type) => {
-                write!(f, "{data_type:?} is invalid as a DataFusion scalar type")
+                write!(f, "{data_type} is invalid as a DataFusion scalar type")
             }
             Self::InvalidTimeUnit(time_unit) => {
                 write!(
@@ -97,7 +99,6 @@ impl TryFrom<&Field> for protobuf::Field {
             nullable: field.is_nullable(),
             children: Vec::new(),
             metadata: field.metadata().clone(),
-            dict_ordered: field.dict_is_ordered().unwrap_or(false),
         })
     }
 }
@@ -179,7 +180,9 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum {
                     UnionMode::Dense => protobuf::UnionMode::Dense,
                 };
                 Self::Union(protobuf::Union {
-                    union_types: convert_arc_fields_to_proto_fields(fields.iter().map(|(_, item)|item))?,
+                    union_types: convert_arc_fields_to_proto_fields(
+                        fields.iter().map(|(_, item)| item),
+                    )?,
                     union_mode: union_mode.into(),
                     type_ids: fields.iter().map(|(x, _)| x as i32).collect(),
                 })
@@ -190,29 +193,44 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum {
                     value: Some(Box::new(value_type.as_ref().try_into()?)),
                 }))
             }
-            DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal {
-                precision: *precision as u32,
-                scale: *scale as i32,
-            }),
-            DataType::Decimal256(precision, scale) => Self::Decimal256(protobuf::Decimal256Type {
-                precision: *precision as u32,
-                scale: *scale as i32,
-            }),
-            DataType::Map(field, sorted) => {
-                Self::Map(Box::new(
-                    protobuf::Map {
-                        field_type: Some(Box::new(field.as_ref().try_into()?)),
-                        keys_sorted: *sorted,
-                    }
-                ))
-            }
-            DataType::RunEndEncoded(_, _) => {
-                return Err(Error::General(
-                    "Proto serialization error: The RunEndEncoded data type is not yet supported".to_owned()
-                ))
+            DataType::Decimal32(precision, scale) => {
+                Self::Decimal32(protobuf::Decimal32Type {
+                    precision: *precision as u32,
+                    scale: *scale as i32,
+                })
+            }
+            DataType::Decimal64(precision, scale) => {
+                Self::Decimal64(protobuf::Decimal64Type {
+                    precision: *precision as u32,
+                    scale: *scale as i32,
+                })
+            }
+            DataType::Decimal128(precision, scale) => {
+                Self::Decimal128(protobuf::Decimal128Type {
+                    precision: *precision as u32,
+                    scale: *scale as i32,
+                })
+            }
+            DataType::Decimal256(precision, scale) => {
+                Self::Decimal256(protobuf::Decimal256Type {
+                    precision: *precision as u32,
+                    scale: *scale as i32,
+                })
+            }
+            DataType::Map(field, sorted) => Self::Map(Box::new(protobuf::Map {
+                field_type: Some(Box::new(field.as_ref().try_into()?)),
+                keys_sorted: *sorted,
+            })),
+            DataType::RunEndEncoded(run_ends_field, values_field) => {
+                Self::RunEndEncoded(Box::new(protobuf::RunEndEncoded {
+                    run_ends_field: Some(Box::new(run_ends_field.as_ref().try_into()?)),
+                    values_field: Some(Box::new(values_field.as_ref().try_into()?)),
+                }))
             }
             DataType::ListView(_) | DataType::LargeListView(_) => {
-                return Err(Error::General(format!("Proto serialization error: {val} not yet supported")))
+                return Err(Error::General(format!(
+                    "Proto serialization error: {val} not yet supported"
+                )));
             }
         };
 
@@ -398,6 +416,42 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
                     })
                 })
             }
+            ScalarValue::Decimal32(val, p, s) => match *val {
+                Some(v) => {
+                    let array = v.to_be_bytes();
+                    let vec_val: Vec<u8> = array.to_vec();
+                    Ok(protobuf::ScalarValue {
+                        value: Some(Value::Decimal32Value(protobuf::Decimal32 {
+                            value: vec_val,
+                            p: *p as i64,
+                            s: *s as i64,
+                        })),
+                    })
+                }
+                None => Ok(protobuf::ScalarValue {
+                    value: Some(protobuf::scalar_value::Value::NullValue(
+                        (&data_type).try_into()?,
+                    )),
+                }),
+            },
+            ScalarValue::Decimal64(val, p, s) => match *val {
+                Some(v) => {
+                    let array = v.to_be_bytes();
+                    let vec_val: Vec<u8> = array.to_vec();
+                    Ok(protobuf::ScalarValue {
+                        value: Some(Value::Decimal64Value(protobuf::Decimal64 {
+                            value: vec_val,
+                            p: *p as i64,
+                            s: *s as i64,
+                        })),
+                    })
+                }
+                None => Ok(protobuf::ScalarValue {
+                    value: Some(protobuf::scalar_value::Value::NullValue(
+                        (&data_type).try_into()?,
+                    )),
+                }),
+            },
             ScalarValue::Decimal128(val, p, s) => match *val {
                 Some(v) => {
                     let array = v.to_be_bytes();
@@ -635,6 +689,18 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
                     ))),
                 })
             }
+
+            ScalarValue::RunEndEncoded(run_ends_field, values_field, val) => {
+                Ok(protobuf::ScalarValue {
+                    value: Some(Value::RunEndEncodedValue(Box::new(
+                        protobuf::ScalarRunEndEncodedValue {
+                            run_ends_field: Some(run_ends_field.as_ref().try_into()?),
+                            values_field: Some(values_field.as_ref().try_into()?),
+                            value: Some(Box::new(val.as_ref().try_into()?)),
+                        },
+                    ))),
+                })
+            }
         }
     }
 }
@@ -750,6 +816,7 @@ impl From<&ColumnStatistics> for protobuf::ColumnStats {
             sum_value: Some(protobuf::Precision::from(&s.sum_value)),
             null_count: Some(protobuf::Precision::from(&s.null_count)),
             distinct_count: Some(protobuf::Precision::from(&s.distinct_count)),
+            byte_size: Some(protobuf::Precision::from(&s.byte_size)),
         }
     }
 }
@@ -811,15 +878,14 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
             metadata_size_hint_opt: value.metadata_size_hint.map(|v| protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v as u64)),
             pushdown_filters: value.pushdown_filters,
             reorder_filters: value.reorder_filters,
+            force_filter_selections: value.force_filter_selections,
             data_pagesize_limit: value.data_pagesize_limit as u64,
             write_batch_size: value.write_batch_size as u64,
-            writer_version: value.writer_version.clone(),
+            writer_version: value.writer_version.to_string(),
             compression_opt: value.compression.clone().map(protobuf::parquet_options::CompressionOpt::Compression),
             dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
             dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
             statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
-            #[allow(deprecated)]
-            max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
             max_row_group_size: value.max_row_group_size as u64,
             created_by: value.created_by.clone(),
             column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)),
@@ -837,6 +903,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
             binary_as_string: value.binary_as_string,
             skip_arrow_metadata: value.skip_arrow_metadata,
             coerce_int96_opt: value.coerce_int96.clone().map(protobuf::parquet_options::CoerceInt96Opt::CoerceInt96),
+            max_predicate_cache_size_opt: value.max_predicate_cache_size.map(|v| protobuf::parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(v as u64)),
         })
     }
 }
@@ -859,12 +926,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
                 .statistics_enabled
                 .clone()
                 .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
-            #[allow(deprecated)]
-            max_statistics_size_opt: value.max_statistics_size.map(|v| {
-                protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
-                    v as u32,
-                )
-            }),
             encoding_opt: value
                 .encoding
                 .clone()
@@ -935,6 +996,8 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions {
             null_value: opts.null_value.clone().unwrap_or_default(),
             null_regex: opts.null_regex.clone().unwrap_or_default(),
             comment: opts.comment.map_or_else(Vec::new, |h| vec![h]),
+            truncated_rows: opts.truncated_rows.map_or_else(Vec::new, |h| vec![h as u8]),
+            compression_level: opts.compression_level,
         })
     }
 }
@@ -947,6 +1010,8 @@ impl TryFrom<&JsonOptions> for protobuf::JsonOptions {
         Ok(protobuf::JsonOptions {
             compression: compression.into(),
             schema_infer_max_rec: opts.schema_infer_max_rec.map(|h| h as u64),
+            compression_level: opts.compression_level,
+            newline_delimited: Some(opts.newline_delimited),
         })
     }
 }
@@ -967,7 +1032,7 @@ fn create_proto_scalar<I, T: FnOnce(&I) -> protobuf::scalar_value::Value>(
     Ok(protobuf::ScalarValue { value: Some(value) })
 }
 
-// ScalarValue::List / FixedSizeList / LargeList / Struct / Map are serialized using
+// Nested ScalarValue types (List / FixedSizeList / LargeList / Struct / Map) are serialized using
 // Arrow IPC messages as a single column RecordBatch
 fn encode_scalar_nested_value(
     arr: ArrayRef,
@@ -975,16 +1040,30 @@ fn encode_scalar_nested_value(
 ) -> Result<protobuf::ScalarValue, Error> {
     let batch = RecordBatch::try_from_iter(vec![("field_name", arr)]).map_err(|e| {
         Error::General(format!(
-            "Error creating temporary batch while encoding ScalarValue::List: {e}"
+            "Error creating temporary batch while encoding nested ScalarValue: {e}"
         ))
     })?;
 
-    let gen = IpcDataGenerator {};
+    let ipc_gen = IpcDataGenerator {};
     let mut dict_tracker = DictionaryTracker::new(false);
-    let (encoded_dictionaries, encoded_message) = gen
-        .encoded_batch(&batch, &mut dict_tracker, &Default::default())
+    let write_options = IpcWriteOptions::default();
+    // The IPC writer requires pre-allocated dictionary IDs (normally assigned when
+    // serializing the schema). Populate `dict_tracker` by encoding the schema first.
+    ipc_gen.schema_to_bytes_with_dictionary_tracker(
+        batch.schema().as_ref(),
+        &mut dict_tracker,
+        &write_options,
+    );
+    let mut compression_context = CompressionContext::default();
+    let (encoded_dictionaries, encoded_message) = ipc_gen
+        .encode(
+            &batch,
+            &mut dict_tracker,
+            &write_options,
+            &mut compression_context,
+        )
         .map_err(|e| {
-            Error::General(format!("Error encoding ScalarValue::List as IPC: {e}"))
+            Error::General(format!("Error encoding nested ScalarValue as IPC: {e}"))
         })?;
 
     let schema: protobuf::Schema = batch.schema().try_into()?;
diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index a1eeabdf87f4a..3d17ed30d5726 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -28,9 +28,6 @@ license = { workspace = true }
 authors = { workspace = true }
 rust-version = { workspace = true }
 
-# Exclude proto files so crates.io consumers don't need protoc
-exclude = ["*.proto"]
-
 [package.metadata.docs.rs]
 all-features = true
 
@@ -40,24 +37,50 @@ name = "datafusion_proto"
 [features]
 default = ["parquet"]
 json = ["pbjson", "serde", "serde_json", "datafusion-proto-common/json"]
-parquet = ["datafusion/parquet", "datafusion-common/parquet"]
-avro = ["datafusion/avro", "datafusion-common/avro"]
+parquet = ["datafusion-datasource-parquet", "datafusion-common/parquet", "datafusion/parquet"]
+avro = ["datafusion-datasource-avro", "datafusion-common/avro"]
+
+# Note to developers: do *not* add `datafusion` as a dependency in
+# this crate. See https://github.com/apache/datafusion/issues/17713
+# for additional information.
 
 [dependencies]
 arrow = { workspace = true }
 chrono = { workspace = true }
-datafusion = { workspace = true, default-features = true }
-datafusion-common = { workspace = true, default-features = true }
+datafusion-catalog = { workspace = true }
+datafusion-catalog-listing = { workspace = true }
+datafusion-common = { workspace = true }
+datafusion-datasource = { workspace = true }
+datafusion-datasource-arrow = { workspace = true }
+datafusion-datasource-avro = { workspace = true, optional = true }
+datafusion-datasource-csv = { workspace = true }
+datafusion-datasource-json = { workspace = true }
+datafusion-datasource-parquet = { workspace = true, optional = true }
+datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-functions-table = { workspace = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
 datafusion-proto-common = { workspace = true }
 object_store = { workspace = true }
 pbjson = { workspace = true, optional = true }
 prost = { workspace = true }
+rand = { workspace = true }
 serde = { version = "1.0", optional = true }
 serde_json = { workspace = true, optional = true }
+
 [dev-dependencies]
+async-trait = { workspace = true }
+datafusion = { workspace = true, default-features = false, features = [
+    "sql",
+    "datetime_expressions",
+    "nested_expressions",
+    "unicode_expressions",
+] }
 datafusion-functions = { workspace = true, default-features = true }
 datafusion-functions-aggregate = { workspace = true }
 datafusion-functions-window-common = { workspace = true }
 doc-comment = { workspace = true }
+pretty_assertions = "1.4"
 tokio = { workspace = true, features = ["rt-multi-thread"] }
diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md
index f8930779db890..c1382c5b8f8f8 100644
--- a/datafusion/proto/README.md
+++ b/datafusion/proto/README.md
@@ -17,13 +17,17 @@
   under the License.
 -->
 
-# `datafusion-proto`: Apache DataFusion Protobuf Serialization / Deserialization
+# Apache DataFusion Protobuf Serialization / Deserialization
 
-This crate contains code to convert [Apache DataFusion] plans to and from
-bytes, which can be useful for sending plans over the network, for example
-when building a distributed query engine.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate contains code to convert DataFusion plans to and from bytes using [Protocol Buffers],
+which can be useful for sending plans over the network, for example when building a distributed
+query engine.
 
 See [API Docs] for details and examples.
 
-[apache datafusion]: https://datafusion.apache.org
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[protocol buffers]: https://protobuf.dev/
 [api docs]: http://docs.rs/datafusion-proto/latest
diff --git a/datafusion/proto/gen/Cargo.toml b/datafusion/proto/gen/Cargo.toml
index 467a7f487dae9..8b48dfe70e6c7 100644
--- a/datafusion/proto/gen/Cargo.toml
+++ b/datafusion/proto/gen/Cargo.toml
@@ -29,10 +29,13 @@ publish = false
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [dependencies]
 # Pin these dependencies so that the generated output is deterministic
-pbjson-build = "=0.7.0"
-prost-build = "=0.13.5"
+pbjson-build = "=0.9.0"
+prost-build = "=0.14.3"
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 4c8b6c588d949..e422ce7bed4f3 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -98,6 +98,7 @@ message ListingTableScanNode {
     datafusion_common.ParquetFormat parquet = 11;
     datafusion_common.AvroFormat avro = 12;
     datafusion_common.NdJsonFormat json = 15;
+    datafusion_common.ArrowFormat arrow = 16;
   }
   repeated SortExprNodeCollection file_sort_order = 13;
 }
@@ -166,6 +167,7 @@ message CreateExternalTableNode {
   datafusion_common.DfSchema schema = 4;
   repeated string table_partition_cols = 5;
   bool if_not_exists = 6;
+  bool or_replace = 15;
   bool temporary = 14;
   string definition = 7;
   repeated SortExprNodeCollection order_exprs = 10;
@@ -177,8 +179,11 @@ message CreateExternalTableNode {
 
 message PrepareNode {
   string name = 1;
+  // We serialize both the data types and the fields for compatibility with
+  // older versions (newer versions populate both).
   repeated datafusion_common.ArrowType data_types = 2;
   LogicalPlanNode input = 3;
+  repeated datafusion_common.Field fields = 4;
 }
 
 message CreateCatalogSchemaNode {
@@ -243,7 +248,7 @@ message JoinNode {
   datafusion_common.JoinConstraint join_constraint = 4;
   repeated LogicalExprNode left_join_key = 5;
   repeated LogicalExprNode right_join_key = 6;
-  bool null_equals_null = 7;
+  datafusion_common.NullEquality null_equality = 7;
   LogicalExprNode filter = 8;
 }
 
@@ -265,6 +270,25 @@ message CopyToNode {
   repeated string partition_by = 7;
 }
 
+// Identifies a built-in file format supported by DataFusion.
+// Used by DefaultLogicalExtensionCodec to serialize/deserialize
+// FileFormatFactory instances (e.g. in CopyTo plans).
+enum FileFormatKind {
+  FILE_FORMAT_KIND_UNSPECIFIED = 0;
+  FILE_FORMAT_KIND_CSV = 1;
+  FILE_FORMAT_KIND_JSON = 2;
+  FILE_FORMAT_KIND_PARQUET = 3;
+  FILE_FORMAT_KIND_ARROW = 4;
+  FILE_FORMAT_KIND_AVRO = 5;
+}
+
+// Wraps a serialized FileFormatFactory with its format kind tag,
+// so the decoder can dispatch to the correct format-specific codec.
+message FileFormatProto {
+  FileFormatKind kind = 1;
+  bytes encoded_file_format = 2;
+}
+
 message DmlNode{
    enum Type {
     UPDATE = 0;
@@ -273,6 +297,7 @@ message DmlNode{
     INSERT_APPEND = 3;
     INSERT_OVERWRITE = 4;
     INSERT_REPLACE = 5;
+    TRUNCATE = 6;
   }
   Type dml_type = 1;
   LogicalPlanNode input = 2;
@@ -410,7 +435,11 @@ message Wildcard {
 
 message PlaceholderNode {
   string id = 1;
+  // We serialize the data type, metadata, and nullability separately to maintain
+  // compatibility with older versions
   datafusion_common.ArrowType data_type = 2;
+  optional bool nullable = 3;
+  map<string, string> metadata = 4;
 }
 
 message LogicalExprList {
@@ -516,6 +545,7 @@ message AggregateUDFExprNode {
   LogicalExprNode filter = 3;
   repeated SortExprNode order_by = 4;
   optional bytes fun_definition = 6;
+  optional NullTreatment null_treatment = 7;
 }
 
 message ScalarUDFExprNode {
@@ -536,6 +566,9 @@ message WindowExprNode {
   // repeated LogicalExprNode filter = 7;
   WindowFrame window_frame = 8;
   optional bytes fun_definition = 10;
+  optional NullTreatment null_treatment = 11;
+  bool distinct = 12;
+  LogicalExprNode filter = 13;
 }
 
 message BetweenNode {
@@ -580,11 +613,15 @@ message WhenThen {
 message CastNode {
   LogicalExprNode expr = 1;
   datafusion_common.ArrowType arrow_type = 2;
+  map<string, string> metadata = 3;
+  optional bool nullable = 4;
 }
 
 message TryCastNode {
   LogicalExprNode expr = 1;
   datafusion_common.ArrowType arrow_type = 2;
+  map<string, string> metadata = 3;
+  optional bool nullable = 4;
 }
 
 message SortExprNode {
@@ -620,6 +657,11 @@ message WindowFrameBound {
   datafusion_common.ScalarValue bound_value = 2;
 }
 
+enum NullTreatment {
+  RESPECT_NULLS = 0;
+  IGNORE_NULLS = 1;
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Arrow Data Types
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -726,6 +768,13 @@ message PhysicalPlanNode {
     ParquetSinkExecNode parquet_sink = 29;
     UnnestExecNode unnest = 30;
     JsonScanExecNode json_scan = 31;
+    CooperativeExecNode cooperative = 32;
+    GenerateSeriesNode generate_series = 33;
+    SortMergeJoinExecNode sort_merge_join = 34;
+    MemoryScanExecNode memory_scan = 35;
+    AsyncFuncExecNode async_func = 36;
+    BufferExecNode buffer = 37;
+    ArrowScanExecNode arrow_scan = 38;
   }
 }
 
@@ -735,6 +784,16 @@ message PartitionColumn {
 }
 
 
+// Determines how file sink output paths are interpreted.
+enum FileOutputMode {
+  // Infer output mode from the URL (extension/trailing `/` heuristic).
+  FILE_OUTPUT_MODE_AUTOMATIC = 0;
+  // Write to a single file at the exact output path.
+  FILE_OUTPUT_MODE_SINGLE_FILE = 1;
+  // Write to a directory with generated filenames.
+  FILE_OUTPUT_MODE_DIRECTORY = 2;
+}
+
 message FileSinkConfig {
   reserved 6; // writer_mode
   reserved 8; // was `overwrite` which has been superseded by `insert_op`
@@ -747,6 +806,8 @@ message FileSinkConfig {
   bool keep_partition_by_columns = 9;
   InsertOp insert_op = 10;
   string file_extension = 11;
+  // Determines how the output path is interpreted.
+  FileOutputMode file_output_mode = 12;
 }
 
 enum InsertOp {
@@ -814,6 +875,14 @@ message PhysicalExprNode {
   // Was date_time_interval_expr
   reserved 17;
 
+  // Unique identifier for this expression to do deduplication during deserialization.
+  // When serializing, this is set to a unique identifier for each combination of
+  // expression, process and serialization run.
+  // When deserializing, if this ID has been seen before, the cached Arc is returned
+  // instead of creating a new one, enabling reconstruction of referential integrity
+  // across serde roundtrips.
+  optional uint64 expr_id = 30;
+
   oneof ExprType {
     // column references
     PhysicalColumn column = 1;
@@ -849,6 +918,8 @@ message PhysicalExprNode {
     PhysicalExtensionExprNode extension = 19;
 
     UnknownColumn unknown_column = 20;
+
+    PhysicalHashExprNode hash_expr = 21;
   }
 }
 
@@ -858,6 +929,7 @@ message PhysicalScalarUdfNode {
   optional bytes fun_definition = 3;
   datafusion_common.ArrowType return_type = 4;
   bool nullable = 5;
+  string return_field_name = 6;
 }
 
 message PhysicalAggregateExprNode {
@@ -869,6 +941,7 @@ message PhysicalAggregateExprNode {
   bool distinct = 3;
   bool ignore_nulls = 6;
   optional bytes fun_definition = 7;
+  string human_display = 8;
 }
 
 message PhysicalWindowExprNode {
@@ -883,6 +956,8 @@ message PhysicalWindowExprNode {
   WindowFrame window_frame = 7;
   string name = 8;
   optional bytes fun_definition = 9;
+  bool ignore_nulls = 11;
+  bool distinct = 12;
 }
 
 message PhysicalIsNull {
@@ -963,11 +1038,19 @@ message PhysicalExtensionExprNode {
   repeated PhysicalExprNode inputs = 2;
 }
 
+message PhysicalHashExprNode {
+  repeated PhysicalExprNode on_columns = 1;
+  uint64 seed0 = 2;
+  string description = 6;
+}
+
 message FilterExecNode {
   PhysicalPlanNode input = 1;
   PhysicalExprNode expr = 2;
   uint32 default_filter_selectivity = 3;
   repeated uint32 projection = 9;
+  uint32 batch_size = 10;
+  optional uint32 fetch = 11;
 }
 
 message FileGroup {
@@ -983,6 +1066,15 @@ message PhysicalSortExprNodeCollection {
   repeated PhysicalSortExprNode physical_sort_expr_nodes = 1;
 }
 
+message ProjectionExpr {
+  string alias = 1;
+  PhysicalExprNode expr = 2;
+}
+
+message ProjectionExprs {
+  repeated ProjectionExpr projections = 1;
+}
+
 message FileScanExecConf {
   repeated FileGroup file_groups = 1;
   datafusion_common.Schema schema = 2;
@@ -998,6 +1090,8 @@ message FileScanExecConf {
 
   datafusion_common.Constraints constraints = 11;
   optional uint64 batch_size = 12;
+
+  optional ProjectionExprs projection_exprs = 13;
 }
 
 message ParquetScanExecNode {
@@ -1023,6 +1117,7 @@ message CsvScanExecNode {
     string comment = 6;
   }
   bool newlines_in_values = 7;
+  bool truncate_rows = 8;
 }
 
 message JsonScanExecNode {
@@ -1033,6 +1128,23 @@ message AvroScanExecNode {
   FileScanExecConf base_conf = 1;
 }
 
+message ArrowScanExecNode {
+  FileScanExecConf base_conf = 1;
+}
+
+message MemoryScanExecNode {
+  repeated bytes partitions = 1;
+  datafusion_common.Schema schema = 2;
+  repeated uint32 projection = 3;
+  repeated PhysicalSortExprNodeCollection sort_information = 4;
+  bool show_sizes = 5;
+  optional uint32 fetch = 6;
+}
+
+message CooperativeExecNode {
+  PhysicalPlanNode input = 1;
+}
+
 enum PartitionMode {
   COLLECT_LEFT = 0;
   PARTITIONED = 1;
@@ -1045,9 +1157,10 @@ message HashJoinExecNode {
   repeated JoinOn on = 3;
   datafusion_common.JoinType join_type = 4;
   PartitionMode partition_mode = 6;
-  bool null_equals_null = 7;
+  datafusion_common.NullEquality null_equality = 7;
   JoinFilter filter = 8;
   repeated uint32 projection = 9;
+  bool null_aware = 10;
 }
 
 enum StreamPartitionMode {
@@ -1061,7 +1174,7 @@ message SymmetricHashJoinExecNode {
   repeated JoinOn on = 3;
   datafusion_common.JoinType join_type = 4;
   StreamPartitionMode partition_mode = 6;
-  bool null_equals_null = 7;
+  datafusion_common.NullEquality null_equality = 7;
   JoinFilter filter = 8;
   repeated PhysicalSortExprNode left_sort_exprs = 9;
   repeated PhysicalSortExprNode right_sort_exprs = 10;
@@ -1127,6 +1240,7 @@ enum AggregateMode {
   FINAL_PARTITIONED = 2;
   SINGLE = 3;
   SINGLE_PARTITIONED = 4;
+  PARTIAL_REDUCE = 5;
 }
 
 message PartiallySortedInputOrderMode {
@@ -1156,6 +1270,8 @@ message MaybePhysicalSortExprs {
 message AggLimit {
   // wrap into a message to make it optional
   uint64 limit = 1;
+  // Optional ordering direction for TopK aggregation (true = descending, false = ascending)
+  optional bool descending = 2;
 }
 
 message AggregateExecNode {
@@ -1171,6 +1287,7 @@ message AggregateExecNode {
   repeated bool groups = 9;
   repeated MaybeFilter filter_expr = 10;
   AggLimit limit = 11;
+  bool has_grouping_set = 12;
 }
 
 message GlobalLimitExecNode {
@@ -1233,6 +1350,7 @@ message RepartitionExecNode{
   //   uint64 unknown = 4;
   // }
   Partitioning partitioning = 5;
+  bool preserve_order = 6;
 }
 
 message Partitioning {
@@ -1286,3 +1404,70 @@ message CteWorkTableScanNode {
     string name = 1;
     datafusion_common.Schema schema = 2;
 }
+
+enum GenerateSeriesName {
+  GS_GENERATE_SERIES = 0;
+  GS_RANGE = 1;
+}
+
+message GenerateSeriesArgsContainsNull {
+    GenerateSeriesName name = 1;
+}
+
+message GenerateSeriesArgsInt64 {
+    int64 start = 1;
+    int64 end = 2;
+    int64 step = 3;
+    bool include_end = 4;
+    GenerateSeriesName name = 5;
+}
+
+message GenerateSeriesArgsTimestamp {
+    int64 start = 1;
+    int64 end = 2;
+    datafusion_common.IntervalMonthDayNanoValue step = 3;
+    optional string tz = 4;
+    bool include_end = 5;
+    GenerateSeriesName name = 6;
+}
+
+message GenerateSeriesArgsDate {
+    int64 start = 1;
+    int64 end = 2;
+    datafusion_common.IntervalMonthDayNanoValue step = 3;
+    bool include_end = 4;
+    GenerateSeriesName name = 5;
+}
+
+message GenerateSeriesNode {
+    datafusion_common.Schema schema = 1;
+    uint32 target_batch_size = 2;
+
+    oneof args {
+        GenerateSeriesArgsContainsNull contains_null = 3;
+        GenerateSeriesArgsInt64 int64_args = 4;
+        GenerateSeriesArgsTimestamp timestamp_args = 5;
+        GenerateSeriesArgsDate date_args = 6;
+    }
+}
+
+message SortMergeJoinExecNode {
+  PhysicalPlanNode left = 1;
+  PhysicalPlanNode right = 2;
+  repeated JoinOn on = 3;
+  datafusion_common.JoinType join_type = 4;
+  JoinFilter filter = 5;
+  repeated SortExprNode sort_options = 6;
+  datafusion_common.NullEquality null_equality = 7;
+}
+
+message AsyncFuncExecNode {
+  PhysicalPlanNode input = 1;
+  repeated PhysicalExprNode async_exprs = 2;
+  repeated string async_expr_names = 3;
+}
+
+message BufferExecNode {
+  PhysicalPlanNode input = 1;
+  uint64 capacity = 2;
+}
\ No newline at end of file
diff --git a/datafusion/proto/src/bytes/mod.rs b/datafusion/proto/src/bytes/mod.rs
index da01d89c0c3d1..84b15ea9a8920 100644
--- a/datafusion/proto/src/bytes/mod.rs
+++ b/datafusion/proto/src/bytes/mod.rs
@@ -21,25 +21,26 @@ use crate::logical_plan::{
     self, AsLogicalPlan, DefaultLogicalExtensionCodec, LogicalExtensionCodec,
 };
 use crate::physical_plan::{
-    AsExecutionPlan, DefaultPhysicalExtensionCodec, PhysicalExtensionCodec,
+    DefaultPhysicalExtensionCodec, DefaultPhysicalProtoConverter, PhysicalExtensionCodec,
+    PhysicalProtoConverterExtension,
 };
 use crate::protobuf;
-use datafusion_common::{plan_datafusion_err, Result};
+use datafusion_common::{Result, plan_datafusion_err};
+use datafusion_execution::TaskContext;
 use datafusion_expr::{
-    create_udaf, create_udf, create_udwf, AggregateUDF, Expr, LogicalPlan, Volatility,
-    WindowUDF,
+    AggregateUDF, Expr, LogicalPlan, Volatility, WindowUDF, create_udaf, create_udf,
+    create_udwf,
 };
 use prost::{
-    bytes::{Bytes, BytesMut},
     Message,
+    bytes::{Bytes, BytesMut},
 };
 use std::sync::Arc;
 
 // Reexport Bytes which appears in the API
-use datafusion::execution::registry::FunctionRegistry;
-use datafusion::physical_plan::ExecutionPlan;
-use datafusion::prelude::SessionContext;
+use datafusion_execution::registry::FunctionRegistry;
 use datafusion_expr::planner::ExprPlanner;
+use datafusion_physical_plan::ExecutionPlan;
 
 mod registry;
 
@@ -170,6 +171,14 @@ impl Serializeable for Expr {
             fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> {
                 vec![]
             }
+
+            fn udafs(&self) -> std::collections::HashSet<String> {
+                std::collections::HashSet::default()
+            }
+
+            fn udwfs(&self) -> std::collections::HashSet<String> {
+                std::collections::HashSet::default()
+            }
         }
         Expr::from_bytes_with_registry(&bytes, &PlaceHolderRegistry)?;
 
@@ -231,16 +240,13 @@ pub fn logical_plan_to_json_with_extension_codec(
 
 /// Deserialize a LogicalPlan from JSON
 #[cfg(feature = "json")]
-pub fn logical_plan_from_json(json: &str, ctx: &SessionContext) -> Result<LogicalPlan> {
+pub fn logical_plan_from_json(json: &str, ctx: &TaskContext) -> Result<LogicalPlan> {
     let extension_codec = DefaultLogicalExtensionCodec {};
     logical_plan_from_json_with_extension_codec(json, ctx, &extension_codec)
 }
 
 /// Deserialize a LogicalPlan from bytes
-pub fn logical_plan_from_bytes(
-    bytes: &[u8],
-    ctx: &SessionContext,
-) -> Result<LogicalPlan> {
+pub fn logical_plan_from_bytes(bytes: &[u8], ctx: &TaskContext) -> Result<LogicalPlan> {
     let extension_codec = DefaultLogicalExtensionCodec {};
     logical_plan_from_bytes_with_extension_codec(bytes, ctx, &extension_codec)
 }
@@ -248,7 +254,7 @@ pub fn logical_plan_from_bytes(
 /// Deserialize a LogicalPlan from bytes
 pub fn logical_plan_from_bytes_with_extension_codec(
     bytes: &[u8],
-    ctx: &SessionContext,
+    ctx: &TaskContext,
     extension_codec: &dyn LogicalExtensionCodec,
 ) -> Result<LogicalPlan> {
     let protobuf = protobuf::LogicalPlanNode::decode(bytes)
@@ -260,7 +266,7 @@ pub fn logical_plan_from_bytes_with_extension_codec(
 #[cfg(feature = "json")]
 pub fn logical_plan_from_json_with_extension_codec(
     json: &str,
-    ctx: &SessionContext,
+    ctx: &TaskContext,
     extension_codec: &dyn LogicalExtensionCodec,
 ) -> Result<LogicalPlan> {
     let back: protobuf::LogicalPlanNode = serde_json::from_str(json)
@@ -271,16 +277,18 @@ pub fn logical_plan_from_json_with_extension_codec(
 /// Serialize a PhysicalPlan as bytes
 pub fn physical_plan_to_bytes(plan: Arc<dyn ExecutionPlan>) -> Result<Bytes> {
     let extension_codec = DefaultPhysicalExtensionCodec {};
-    physical_plan_to_bytes_with_extension_codec(plan, &extension_codec)
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    physical_plan_to_bytes_with_proto_converter(plan, &extension_codec, &proto_converter)
 }
 
 /// Serialize a PhysicalPlan as JSON
 #[cfg(feature = "json")]
 pub fn physical_plan_to_json(plan: Arc<dyn ExecutionPlan>) -> Result<String> {
     let extension_codec = DefaultPhysicalExtensionCodec {};
-    let protobuf =
-        protobuf::PhysicalPlanNode::try_from_physical_plan(plan, &extension_codec)
-            .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    let protobuf = proto_converter
+        .execution_plan_to_proto(&plan, &extension_codec)
+        .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?;
     serde_json::to_string(&protobuf)
         .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))
 }
@@ -290,8 +298,18 @@ pub fn physical_plan_to_bytes_with_extension_codec(
     plan: Arc<dyn ExecutionPlan>,
     extension_codec: &dyn PhysicalExtensionCodec,
 ) -> Result<Bytes> {
-    let protobuf =
-        protobuf::PhysicalPlanNode::try_from_physical_plan(plan, extension_codec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    physical_plan_to_bytes_with_proto_converter(plan, extension_codec, &proto_converter)
+}
+
+/// Serialize a PhysicalPlan as bytes, using the provided extension codec
+/// and protobuf converter.
+pub fn physical_plan_to_bytes_with_proto_converter(
+    plan: Arc<dyn ExecutionPlan>,
+    extension_codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
+) -> Result<Bytes> {
+    let protobuf = proto_converter.execution_plan_to_proto(&plan, extension_codec)?;
     let mut buffer = BytesMut::new();
     protobuf
         .encode(&mut buffer)
@@ -303,30 +321,53 @@ pub fn physical_plan_to_bytes_with_extension_codec(
 #[cfg(feature = "json")]
 pub fn physical_plan_from_json(
     json: &str,
-    ctx: &SessionContext,
+    ctx: &TaskContext,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let back: protobuf::PhysicalPlanNode = serde_json::from_str(json)
         .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?;
     let extension_codec = DefaultPhysicalExtensionCodec {};
-    back.try_into_physical_plan(ctx, &ctx.runtime_env(), &extension_codec)
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    proto_converter.proto_to_execution_plan(ctx, &extension_codec, &back)
 }
 
 /// Deserialize a PhysicalPlan from bytes
 pub fn physical_plan_from_bytes(
     bytes: &[u8],
-    ctx: &SessionContext,
+    ctx: &TaskContext,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let extension_codec = DefaultPhysicalExtensionCodec {};
-    physical_plan_from_bytes_with_extension_codec(bytes, ctx, &extension_codec)
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    physical_plan_from_bytes_with_proto_converter(
+        bytes,
+        ctx,
+        &extension_codec,
+        &proto_converter,
+    )
 }
 
 /// Deserialize a PhysicalPlan from bytes
 pub fn physical_plan_from_bytes_with_extension_codec(
     bytes: &[u8],
-    ctx: &SessionContext,
+    ctx: &TaskContext,
+    extension_codec: &dyn PhysicalExtensionCodec,
+) -> Result<Arc<dyn ExecutionPlan>> {
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    physical_plan_from_bytes_with_proto_converter(
+        bytes,
+        ctx,
+        extension_codec,
+        &proto_converter,
+    )
+}
+
+/// Deserialize a PhysicalPlan from bytes
+pub fn physical_plan_from_bytes_with_proto_converter(
+    bytes: &[u8],
+    ctx: &TaskContext,
     extension_codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let protobuf = protobuf::PhysicalPlanNode::decode(bytes)
         .map_err(|e| plan_datafusion_err!("Error decoding expr as protobuf: {e}"))?;
-    protobuf.try_into_physical_plan(ctx, &ctx.runtime_env(), extension_codec)
+    proto_converter.proto_to_execution_plan(ctx, extension_codec, &protobuf)
 }
diff --git a/datafusion/proto/src/bytes/registry.rs b/datafusion/proto/src/bytes/registry.rs
index eae2425f8ac19..a3f74787e2b50 100644
--- a/datafusion/proto/src/bytes/registry.rs
+++ b/datafusion/proto/src/bytes/registry.rs
@@ -17,9 +17,9 @@
 
 use std::{collections::HashSet, sync::Arc};
 
-use datafusion::execution::registry::FunctionRegistry;
-use datafusion_common::plan_err;
 use datafusion_common::Result;
+use datafusion_common::plan_err;
+use datafusion_execution::registry::FunctionRegistry;
 use datafusion_expr::planner::ExprPlanner;
 use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF};
 
@@ -33,30 +33,53 @@ impl FunctionRegistry for NoRegistry {
     }
 
     fn udf(&self, name: &str) -> Result<Arc<ScalarUDF>> {
-        plan_err!("No function registry provided to deserialize, so can not deserialize User Defined Function '{name}'")
+        plan_err!(
+            "No function registry provided to deserialize, so can not deserialize User Defined Function '{name}'"
+        )
     }
 
     fn udaf(&self, name: &str) -> Result<Arc<AggregateUDF>> {
-        plan_err!("No function registry provided to deserialize, so can not deserialize User Defined Aggregate Function '{name}'")
+        plan_err!(
+            "No function registry provided to deserialize, so can not deserialize User Defined Aggregate Function '{name}'"
+        )
     }
 
     fn udwf(&self, name: &str) -> Result<Arc<WindowUDF>> {
-        plan_err!("No function registry provided to deserialize, so can not deserialize User Defined Window Function '{name}'")
+        plan_err!(
+            "No function registry provided to deserialize, so can not deserialize User Defined Window Function '{name}'"
+        )
     }
     fn register_udaf(
         &mut self,
         udaf: Arc<AggregateUDF>,
     ) -> Result<Option<Arc<AggregateUDF>>> {
-        plan_err!("No function registry provided to deserialize, so can not register User Defined Aggregate Function '{}'", udaf.inner().name())
+        plan_err!(
+            "No function registry provided to deserialize, so can not register User Defined Aggregate Function '{}'",
+            udaf.inner().name()
+        )
     }
     fn register_udf(&mut self, udf: Arc<ScalarUDF>) -> Result<Option<Arc<ScalarUDF>>> {
-        plan_err!("No function registry provided to deserialize, so can not deserialize User Defined Function '{}'", udf.inner().name())
+        plan_err!(
+            "No function registry provided to deserialize, so can not deserialize User Defined Function '{}'",
+            udf.inner().name()
+        )
     }
     fn register_udwf(&mut self, udwf: Arc<WindowUDF>) -> Result<Option<Arc<WindowUDF>>> {
-        plan_err!("No function registry provided to deserialize, so can not deserialize User Defined Window Function '{}'", udwf.inner().name())
+        plan_err!(
+            "No function registry provided to deserialize, so can not deserialize User Defined Window Function '{}'",
+            udwf.inner().name()
+        )
     }
 
     fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> {
         vec![]
     }
+
+    fn udafs(&self) -> HashSet<String> {
+        HashSet::new()
+    }
+
+    fn udwfs(&self) -> HashSet<String> {
+        HashSet::new()
+    }
 }
diff --git a/datafusion/proto/src/common.rs b/datafusion/proto/src/common.rs
index 2b052a31b8b76..22ded708d8c71 100644
--- a/datafusion/proto/src/common.rs
+++ b/datafusion/proto/src/common.rs
@@ -15,23 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion_common::{internal_err, DataFusionError, Result};
+use datafusion_common::{Result, assert_eq_or_internal_err, internal_datafusion_err};
 
 pub(crate) fn str_to_byte(s: &String, description: &str) -> Result<u8> {
-    if s.len() != 1 {
-        return internal_err!(
-            "Invalid CSV {description}: expected single character, got {s}"
-        );
-    }
+    assert_eq_or_internal_err!(
+        s.len(),
+        1,
+        "Invalid CSV {description}: expected single character, got {s}"
+    );
     Ok(s.as_bytes()[0])
 }
 
 pub(crate) fn byte_to_string(b: u8, description: &str) -> Result<String> {
     let b = &[b];
     let b = std::str::from_utf8(b).map_err(|_| {
-        DataFusionError::Internal(format!(
+        internal_datafusion_err!(
             "Invalid CSV {description}: can not represent {b:0x?} as utf8"
-        ))
+        )
     })?;
     Ok(b.to_owned())
 }
diff --git a/datafusion/proto/src/generated/datafusion.rs b/datafusion/proto/src/generated/datafusion.rs
new file mode 100644
index 0000000000000..8b137891791fe
--- /dev/null
+++ b/datafusion/proto/src/generated/datafusion.rs
@@ -0,0 +1 @@
+
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs
index e029327d481d1..a09826a29be52 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -1,10 +1,10 @@
 // This file is @generated by prost-build.
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ColumnRelation {
     #[prost(string, tag = "1")]
     pub relation: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Column {
     #[prost(string, tag = "1")]
     pub name: ::prost::alloc::string::String,
@@ -28,7 +28,7 @@ pub struct DfSchema {
         ::prost::alloc::string::String,
     >,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvFormat {
     #[prost(message, optional, tag = "5")]
     pub options: ::core::option::Option<CsvOptions>,
@@ -38,31 +38,33 @@ pub struct ParquetFormat {
     #[prost(message, optional, tag = "2")]
     pub options: ::core::option::Option<TableParquetOptions>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AvroFormat {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct NdJsonFormat {
     #[prost(message, optional, tag = "1")]
     pub options: ::core::option::Option<JsonOptions>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct ArrowFormat {}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PrimaryKeyConstraint {
     #[prost(uint64, repeated, tag = "1")]
     pub indices: ::prost::alloc::vec::Vec<u64>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct UniqueConstraint {
     #[prost(uint64, repeated, tag = "1")]
     pub indices: ::prost::alloc::vec::Vec<u64>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Constraint {
     #[prost(oneof = "constraint::ConstraintMode", tags = "1, 2")]
     pub constraint_mode: ::core::option::Option<constraint::ConstraintMode>,
 }
 /// Nested message and enum types in `Constraint`.
 pub mod constraint {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum ConstraintMode {
         #[prost(message, tag = "1")]
         PrimaryKey(super::PrimaryKeyConstraint),
@@ -75,9 +77,9 @@ pub struct Constraints {
     #[prost(message, repeated, tag = "1")]
     pub constraints: ::prost::alloc::vec::Vec<Constraint>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AvroOptions {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ArrowOptions {}
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Schema {
@@ -106,24 +108,36 @@ pub struct Field {
         ::prost::alloc::string::String,
         ::prost::alloc::string::String,
     >,
-    #[prost(bool, tag = "6")]
-    pub dict_ordered: bool,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Timestamp {
     #[prost(enumeration = "TimeUnit", tag = "1")]
     pub time_unit: i32,
     #[prost(string, tag = "2")]
     pub timezone: ::prost::alloc::string::String,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
-pub struct Decimal {
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal32Type {
+    #[prost(uint32, tag = "3")]
+    pub precision: u32,
+    #[prost(int32, tag = "4")]
+    pub scale: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal64Type {
+    #[prost(uint32, tag = "3")]
+    pub precision: u32,
+    #[prost(int32, tag = "4")]
+    pub scale: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal128Type {
     #[prost(uint32, tag = "3")]
     pub precision: u32,
     #[prost(int32, tag = "4")]
     pub scale: i32,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal256Type {
     #[prost(uint32, tag = "3")]
     pub precision: u32,
@@ -162,6 +176,13 @@ pub struct Map {
     pub keys_sorted: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct RunEndEncoded {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub run_ends_field: ::core::option::Option<::prost::alloc::boxed::Box<Field>>,
+    #[prost(message, optional, boxed, tag = "2")]
+    pub values_field: ::core::option::Option<::prost::alloc::boxed::Box<Field>>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Union {
     #[prost(message, repeated, tag = "1")]
     pub union_types: ::prost::alloc::vec::Vec<Field>,
@@ -184,7 +205,7 @@ pub struct ScalarNestedValue {
 }
 /// Nested message and enum types in `ScalarNestedValue`.
 pub mod scalar_nested_value {
-    #[derive(Clone, PartialEq, ::prost::Message)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
     pub struct Dictionary {
         #[prost(bytes = "vec", tag = "1")]
         pub ipc_message: ::prost::alloc::vec::Vec<u8>,
@@ -192,14 +213,14 @@ pub mod scalar_nested_value {
         pub arrow_data: ::prost::alloc::vec::Vec<u8>,
     }
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTime32Value {
     #[prost(oneof = "scalar_time32_value::Value", tags = "1, 2")]
     pub value: ::core::option::Option<scalar_time32_value::Value>,
 }
 /// Nested message and enum types in `ScalarTime32Value`.
 pub mod scalar_time32_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int32, tag = "1")]
         Time32SecondValue(i32),
@@ -207,14 +228,14 @@ pub mod scalar_time32_value {
         Time32MillisecondValue(i32),
     }
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTime64Value {
     #[prost(oneof = "scalar_time64_value::Value", tags = "1, 2")]
     pub value: ::core::option::Option<scalar_time64_value::Value>,
 }
 /// Nested message and enum types in `ScalarTime64Value`.
 pub mod scalar_time64_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int64, tag = "1")]
         Time64MicrosecondValue(i64),
@@ -222,7 +243,7 @@ pub mod scalar_time64_value {
         Time64NanosecondValue(i64),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarTimestampValue {
     #[prost(string, tag = "5")]
     pub timezone: ::prost::alloc::string::String,
@@ -231,7 +252,7 @@ pub struct ScalarTimestampValue {
 }
 /// Nested message and enum types in `ScalarTimestampValue`.
 pub mod scalar_timestamp_value {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum Value {
         #[prost(int64, tag = "1")]
         TimeMicrosecondValue(i64),
@@ -250,14 +271,23 @@ pub struct ScalarDictionaryValue {
     #[prost(message, optional, boxed, tag = "2")]
     pub value: ::core::option::Option<::prost::alloc::boxed::Box<ScalarValue>>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ScalarRunEndEncodedValue {
+    #[prost(message, optional, tag = "1")]
+    pub run_ends_field: ::core::option::Option<Field>,
+    #[prost(message, optional, tag = "2")]
+    pub values_field: ::core::option::Option<Field>,
+    #[prost(message, optional, boxed, tag = "3")]
+    pub value: ::core::option::Option<::prost::alloc::boxed::Box<ScalarValue>>,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct IntervalDayTimeValue {
     #[prost(int32, tag = "1")]
     pub days: i32,
     #[prost(int32, tag = "2")]
     pub milliseconds: i32,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct IntervalMonthDayNanoValue {
     #[prost(int32, tag = "1")]
     pub months: i32,
@@ -286,7 +316,7 @@ pub struct UnionValue {
     #[prost(enumeration = "UnionMode", tag = "4")]
     pub mode: i32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScalarFixedSizeBinary {
     #[prost(bytes = "vec", tag = "1")]
     pub values: ::prost::alloc::vec::Vec<u8>,
@@ -297,7 +327,7 @@ pub struct ScalarFixedSizeBinary {
 pub struct ScalarValue {
     #[prost(
         oneof = "scalar_value::Value",
-        tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42"
+        tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42, 45"
     )]
     pub value: ::core::option::Option<scalar_value::Value>,
 }
@@ -352,6 +382,10 @@ pub mod scalar_value {
         StructValue(super::ScalarNestedValue),
         #[prost(message, tag = "41")]
         MapValue(super::ScalarNestedValue),
+        #[prost(message, tag = "43")]
+        Decimal32Value(super::Decimal32),
+        #[prost(message, tag = "44")]
+        Decimal64Value(super::Decimal64),
         #[prost(message, tag = "20")]
         Decimal128Value(super::Decimal128),
         #[prost(message, tag = "39")]
@@ -388,9 +422,29 @@ pub mod scalar_value {
         FixedSizeBinaryValue(super::ScalarFixedSizeBinary),
         #[prost(message, tag = "42")]
         UnionValue(::prost::alloc::boxed::Box<super::UnionValue>),
+        #[prost(message, tag = "45")]
+        RunEndEncodedValue(::prost::alloc::boxed::Box<super::ScalarRunEndEncodedValue>),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal32 {
+    #[prost(bytes = "vec", tag = "1")]
+    pub value: ::prost::alloc::vec::Vec<u8>,
+    #[prost(int64, tag = "2")]
+    pub p: i64,
+    #[prost(int64, tag = "3")]
+    pub s: i64,
+}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct Decimal64 {
+    #[prost(bytes = "vec", tag = "1")]
+    pub value: ::prost::alloc::vec::Vec<u8>,
+    #[prost(int64, tag = "2")]
+    pub p: i64,
+    #[prost(int64, tag = "3")]
+    pub s: i64,
+}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal128 {
     #[prost(bytes = "vec", tag = "1")]
     pub value: ::prost::alloc::vec::Vec<u8>,
@@ -399,7 +453,7 @@ pub struct Decimal128 {
     #[prost(int64, tag = "3")]
     pub s: i64,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Decimal256 {
     #[prost(bytes = "vec", tag = "1")]
     pub value: ::prost::alloc::vec::Vec<u8>,
@@ -413,7 +467,7 @@ pub struct Decimal256 {
 pub struct ArrowType {
     #[prost(
         oneof = "arrow_type::ArrowTypeEnum",
-        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33"
+        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33, 42"
     )]
     pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>,
 }
@@ -480,8 +534,12 @@ pub mod arrow_type {
         Time64(i32),
         #[prost(enumeration = "super::IntervalUnit", tag = "23")]
         Interval(i32),
+        #[prost(message, tag = "40")]
+        Decimal32(super::Decimal32Type),
+        #[prost(message, tag = "41")]
+        Decimal64(super::Decimal64Type),
         #[prost(message, tag = "24")]
-        Decimal(super::Decimal),
+        Decimal128(super::Decimal128Type),
         #[prost(message, tag = "36")]
         Decimal256(super::Decimal256Type),
         #[prost(message, tag = "25")]
@@ -498,6 +556,8 @@ pub mod arrow_type {
         Dictionary(::prost::alloc::boxed::Box<super::Dictionary>),
         #[prost(message, tag = "33")]
         Map(::prost::alloc::boxed::Box<super::Map>),
+        #[prost(message, tag = "42")]
+        RunEndEncoded(::prost::alloc::boxed::Box<super::RunEndEncoded>),
     }
 }
 /// Useful for representing an empty enum variant in rust
@@ -509,14 +569,14 @@ pub mod arrow_type {
 ///         i32 Two = 2;
 ///    }
 /// }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct EmptyMessage {}
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct JsonWriterOptions {
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
     pub compression: i32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvWriterOptions {
     /// Compression type
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
@@ -553,7 +613,7 @@ pub struct CsvWriterOptions {
     pub double_quote: bool,
 }
 /// Options controlling CSV format
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct CsvOptions {
     /// Indicates if the CSV has a header row
     #[prost(bytes = "vec", tag = "1")]
@@ -606,9 +666,15 @@ pub struct CsvOptions {
     /// Optional terminator character as a byte
     #[prost(bytes = "vec", tag = "17")]
     pub terminator: ::prost::alloc::vec::Vec<u8>,
+    /// Indicates if truncated rows are allowed
+    #[prost(bytes = "vec", tag = "18")]
+    pub truncated_rows: ::prost::alloc::vec::Vec<u8>,
+    /// Optional compression level
+    #[prost(uint32, optional, tag = "19")]
+    pub compression_level: ::core::option::Option<u32>,
 }
 /// Options controlling CSV format
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct JsonOptions {
     /// Compression type
     #[prost(enumeration = "CompressionTypeVariant", tag = "1")]
@@ -616,6 +682,12 @@ pub struct JsonOptions {
     /// Optional max records for schema inference
     #[prost(uint64, optional, tag = "2")]
     pub schema_infer_max_rec: ::core::option::Option<u64>,
+    /// Optional compression level
+    #[prost(uint32, optional, tag = "3")]
+    pub compression_level: ::core::option::Option<u32>,
+    /// Whether to read as newline-delimited JSON (default true). When false, expects JSON array format \[{},...\]
+    #[prost(bool, optional, tag = "4")]
+    pub newline_delimited: ::core::option::Option<bool>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct TableParquetOptions {
@@ -662,34 +734,30 @@ pub struct ParquetColumnOptions {
     pub bloom_filter_ndv_opt: ::core::option::Option<
         parquet_column_options::BloomFilterNdvOpt,
     >,
-    #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")]
-    pub max_statistics_size_opt: ::core::option::Option<
-        parquet_column_options::MaxStatisticsSizeOpt,
-    >,
 }
 /// Nested message and enum types in `ParquetColumnOptions`.
 pub mod parquet_column_options {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterEnabledOpt {
         #[prost(bool, tag = "1")]
         BloomFilterEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum EncodingOpt {
         #[prost(string, tag = "2")]
         Encoding(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum DictionaryEnabledOpt {
         #[prost(bool, tag = "3")]
         DictionaryEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CompressionOpt {
         #[prost(string, tag = "4")]
         Compression(::prost::alloc::string::String),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsEnabledOpt {
         #[prost(string, tag = "5")]
         StatisticsEnabled(::prost::alloc::string::String),
@@ -699,16 +767,11 @@ pub mod parquet_column_options {
         #[prost(double, tag = "6")]
         BloomFilterFpp(f64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterNdvOpt {
         #[prost(uint64, tag = "7")]
         BloomFilterNdv(u64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
-    pub enum MaxStatisticsSizeOpt {
-        #[prost(uint32, tag = "8")]
-        MaxStatisticsSize(u32),
-    }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ParquetOptions {
@@ -729,6 +792,9 @@ pub struct ParquetOptions {
     /// default = false
     #[prost(bool, tag = "6")]
     pub reorder_filters: bool,
+    /// default = false
+    #[prost(bool, tag = "34")]
+    pub force_filter_selections: bool,
     /// default = 1024 * 1024
     #[prost(uint64, tag = "7")]
     pub data_pagesize_limit: u64,
@@ -786,10 +852,6 @@ pub struct ParquetOptions {
     pub statistics_enabled_opt: ::core::option::Option<
         parquet_options::StatisticsEnabledOpt,
     >,
-    #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")]
-    pub max_statistics_size_opt: ::core::option::Option<
-        parquet_options::MaxStatisticsSizeOpt,
-    >,
     #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")]
     pub column_index_truncate_length_opt: ::core::option::Option<
         parquet_options::ColumnIndexTruncateLengthOpt,
@@ -806,45 +868,44 @@ pub struct ParquetOptions {
     pub bloom_filter_ndv_opt: ::core::option::Option<parquet_options::BloomFilterNdvOpt>,
     #[prost(oneof = "parquet_options::CoerceInt96Opt", tags = "32")]
     pub coerce_int96_opt: ::core::option::Option<parquet_options::CoerceInt96Opt>,
+    #[prost(oneof = "parquet_options::MaxPredicateCacheSizeOpt", tags = "33")]
+    pub max_predicate_cache_size_opt: ::core::option::Option<
+        parquet_options::MaxPredicateCacheSizeOpt,
+    >,
 }
 /// Nested message and enum types in `ParquetOptions`.
 pub mod parquet_options {
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum MetadataSizeHintOpt {
         #[prost(uint64, tag = "4")]
         MetadataSizeHint(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CompressionOpt {
         #[prost(string, tag = "10")]
         Compression(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum DictionaryEnabledOpt {
         #[prost(bool, tag = "11")]
         DictionaryEnabled(bool),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsEnabledOpt {
         #[prost(string, tag = "13")]
         StatisticsEnabled(::prost::alloc::string::String),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
-    pub enum MaxStatisticsSizeOpt {
-        #[prost(uint64, tag = "14")]
-        MaxStatisticsSize(u64),
-    }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum ColumnIndexTruncateLengthOpt {
         #[prost(uint64, tag = "17")]
         ColumnIndexTruncateLength(u64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum StatisticsTruncateLengthOpt {
         #[prost(uint64, tag = "31")]
         StatisticsTruncateLength(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum EncodingOpt {
         #[prost(string, tag = "19")]
         Encoding(::prost::alloc::string::String),
@@ -854,16 +915,21 @@ pub mod parquet_options {
         #[prost(double, tag = "21")]
         BloomFilterFpp(f64),
     }
-    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum BloomFilterNdvOpt {
         #[prost(uint64, tag = "22")]
         BloomFilterNdv(u64),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum CoerceInt96Opt {
         #[prost(string, tag = "32")]
         CoerceInt96(::prost::alloc::string::String),
     }
+    #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Oneof)]
+    pub enum MaxPredicateCacheSizeOpt {
+        #[prost(uint64, tag = "33")]
+        MaxPredicateCacheSize(u64),
+    }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Precision {
@@ -893,6 +959,8 @@ pub struct ColumnStats {
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]
     pub distinct_count: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "6")]
+    pub byte_size: ::core::option::Option<Precision>,
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
@@ -906,6 +974,7 @@ pub enum JoinType {
     Rightsemi = 6,
     Rightanti = 7,
     Leftmark = 8,
+    Rightmark = 9,
 }
 impl JoinType {
     /// String value of the enum field names used in the ProtoBuf definition.
@@ -923,6 +992,7 @@ impl JoinType {
             Self::Rightsemi => "RIGHTSEMI",
             Self::Rightanti => "RIGHTANTI",
             Self::Leftmark => "LEFTMARK",
+            Self::Rightmark => "RIGHTMARK",
         }
     }
     /// Creates an enum from field names used in the ProtoBuf definition.
@@ -937,6 +1007,7 @@ impl JoinType {
             "RIGHTSEMI" => Some(Self::Rightsemi),
             "RIGHTANTI" => Some(Self::Rightanti),
             "LEFTMARK" => Some(Self::Leftmark),
+            "RIGHTMARK" => Some(Self::Rightmark),
             _ => None,
         }
     }
@@ -969,6 +1040,32 @@ impl JoinConstraint {
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
+pub enum NullEquality {
+    NullEqualsNothing = 0,
+    NullEqualsNull = 1,
+}
+impl NullEquality {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::NullEqualsNothing => "NULL_EQUALS_NOTHING",
+            Self::NullEqualsNull => "NULL_EQUALS_NULL",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "NULL_EQUALS_NOTHING" => Some(Self::NullEqualsNothing),
+            "NULL_EQUALS_NULL" => Some(Self::NullEqualsNull),
+            _ => None,
+        }
+    }
+}
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
 pub enum TimeUnit {
     Second = 0,
     Millisecond = 1,
diff --git a/datafusion/proto/src/generated/mod.rs b/datafusion/proto/src/generated/mod.rs
index da3302a743753..ca32b1500d57b 100644
--- a/datafusion/proto/src/generated/mod.rs
+++ b/datafusion/proto/src/generated/mod.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// This code is generated so we don't want to fix any lint violations manually
+#[allow(clippy::allow_attributes)]
 #[allow(clippy::all)]
 #[rustfmt::skip]
 pub mod datafusion {
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 932422944508d..eb86afe3d6e00 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -9,12 +9,18 @@ impl serde::Serialize for AggLimit {
         if self.limit != 0 {
             len += 1;
         }
+        if self.descending.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.AggLimit", len)?;
         if self.limit != 0 {
             #[allow(clippy::needless_borrow)]
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("limit", ToString::to_string(&self.limit).as_str())?;
         }
+        if let Some(v) = self.descending.as_ref() {
+            struct_ser.serialize_field("descending", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -26,11 +32,13 @@ impl<'de> serde::Deserialize<'de> for AggLimit {
     {
         const FIELDS: &[&str] = &[
             "limit",
+            "descending",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Limit,
+            Descending,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -39,7 +47,7 @@ impl<'de> serde::Deserialize<'de> for AggLimit {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -53,6 +61,7 @@ impl<'de> serde::Deserialize<'de> for AggLimit {
                     {
                         match value {
                             "limit" => Ok(GeneratedField::Limit),
+                            "descending" => Ok(GeneratedField::Descending),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -73,6 +82,7 @@ impl<'de> serde::Deserialize<'de> for AggLimit {
                     V: serde::de::MapAccess<'de>,
             {
                 let mut limit__ = None;
+                let mut descending__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Limit => {
@@ -83,10 +93,17 @@ impl<'de> serde::Deserialize<'de> for AggLimit {
                                 Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
                             ;
                         }
+                        GeneratedField::Descending => {
+                            if descending__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("descending"));
+                            }
+                            descending__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(AggLimit {
                     limit: limit__.unwrap_or_default(),
+                    descending: descending__,
                 })
             }
         }
@@ -134,6 +151,9 @@ impl serde::Serialize for AggregateExecNode {
         if self.limit.is_some() {
             len += 1;
         }
+        if self.has_grouping_set {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.AggregateExecNode", len)?;
         if !self.group_expr.is_empty() {
             struct_ser.serialize_field("groupExpr", &self.group_expr)?;
@@ -170,6 +190,9 @@ impl serde::Serialize for AggregateExecNode {
         if let Some(v) = self.limit.as_ref() {
             struct_ser.serialize_field("limit", v)?;
         }
+        if self.has_grouping_set {
+            struct_ser.serialize_field("hasGroupingSet", &self.has_grouping_set)?;
+        }
         struct_ser.end()
     }
 }
@@ -198,6 +221,8 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
             "filter_expr",
             "filterExpr",
             "limit",
+            "has_grouping_set",
+            "hasGroupingSet",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -213,6 +238,7 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
             Groups,
             FilterExpr,
             Limit,
+            HasGroupingSet,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -221,7 +247,7 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -245,6 +271,7 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
                             "groups" => Ok(GeneratedField::Groups),
                             "filterExpr" | "filter_expr" => Ok(GeneratedField::FilterExpr),
                             "limit" => Ok(GeneratedField::Limit),
+                            "hasGroupingSet" | "has_grouping_set" => Ok(GeneratedField::HasGroupingSet),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -275,6 +302,7 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
                 let mut groups__ = None;
                 let mut filter_expr__ = None;
                 let mut limit__ = None;
+                let mut has_grouping_set__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::GroupExpr => {
@@ -343,6 +371,12 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
                             }
                             limit__ = map_.next_value()?;
                         }
+                        GeneratedField::HasGroupingSet => {
+                            if has_grouping_set__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("hasGroupingSet"));
+                            }
+                            has_grouping_set__ = Some(map_.next_value()?);
+                        }
                     }
                 }
                 Ok(AggregateExecNode {
@@ -357,6 +391,7 @@ impl<'de> serde::Deserialize<'de> for AggregateExecNode {
                     groups: groups__.unwrap_or_default(),
                     filter_expr: filter_expr__.unwrap_or_default(),
                     limit: limit__,
+                    has_grouping_set: has_grouping_set__.unwrap_or_default(),
                 })
             }
         }
@@ -375,6 +410,7 @@ impl serde::Serialize for AggregateMode {
             Self::FinalPartitioned => "FINAL_PARTITIONED",
             Self::Single => "SINGLE",
             Self::SinglePartitioned => "SINGLE_PARTITIONED",
+            Self::PartialReduce => "PARTIAL_REDUCE",
         };
         serializer.serialize_str(variant)
     }
@@ -391,11 +427,12 @@ impl<'de> serde::Deserialize<'de> for AggregateMode {
             "FINAL_PARTITIONED",
             "SINGLE",
             "SINGLE_PARTITIONED",
+            "PARTIAL_REDUCE",
         ];
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = AggregateMode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -436,6 +473,7 @@ impl<'de> serde::Deserialize<'de> for AggregateMode {
                     "FINAL_PARTITIONED" => Ok(AggregateMode::FinalPartitioned),
                     "SINGLE" => Ok(AggregateMode::Single),
                     "SINGLE_PARTITIONED" => Ok(AggregateMode::SinglePartitioned),
+                    "PARTIAL_REDUCE" => Ok(AggregateMode::PartialReduce),
                     _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
                 }
             }
@@ -500,7 +538,7 @@ impl<'de> serde::Deserialize<'de> for AggregateNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -596,6 +634,9 @@ impl serde::Serialize for AggregateUdfExprNode {
         if self.fun_definition.is_some() {
             len += 1;
         }
+        if self.null_treatment.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.AggregateUDFExprNode", len)?;
         if !self.fun_name.is_empty() {
             struct_ser.serialize_field("funName", &self.fun_name)?;
@@ -617,6 +658,11 @@ impl serde::Serialize for AggregateUdfExprNode {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?;
         }
+        if let Some(v) = self.null_treatment.as_ref() {
+            let v = NullTreatment::try_from(*v)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?;
+            struct_ser.serialize_field("nullTreatment", &v)?;
+        }
         struct_ser.end()
     }
 }
@@ -636,6 +682,8 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
             "orderBy",
             "fun_definition",
             "funDefinition",
+            "null_treatment",
+            "nullTreatment",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -646,6 +694,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
             Filter,
             OrderBy,
             FunDefinition,
+            NullTreatment,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -654,7 +703,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -673,6 +722,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
                             "filter" => Ok(GeneratedField::Filter),
                             "orderBy" | "order_by" => Ok(GeneratedField::OrderBy),
                             "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition),
+                            "nullTreatment" | "null_treatment" => Ok(GeneratedField::NullTreatment),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -698,6 +748,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
                 let mut filter__ = None;
                 let mut order_by__ = None;
                 let mut fun_definition__ = None;
+                let mut null_treatment__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::FunName => {
@@ -738,6 +789,12 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
                                 map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::NullTreatment => {
+                            if null_treatment__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullTreatment"));
+                            }
+                            null_treatment__ = map_.next_value::<::std::option::Option<NullTreatment>>()?.map(|x| x as i32);
+                        }
                     }
                 }
                 Ok(AggregateUdfExprNode {
@@ -747,6 +804,7 @@ impl<'de> serde::Deserialize<'de> for AggregateUdfExprNode {
                     filter: filter__,
                     order_by: order_by__.unwrap_or_default(),
                     fun_definition: fun_definition__,
+                    null_treatment: null_treatment__,
                 })
             }
         }
@@ -816,7 +874,7 @@ impl<'de> serde::Deserialize<'de> for AliasNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -961,7 +1019,7 @@ impl<'de> serde::Deserialize<'de> for AnalyzeExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1087,7 +1145,7 @@ impl<'de> serde::Deserialize<'de> for AnalyzeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1188,7 +1246,7 @@ impl<'de> serde::Deserialize<'de> for AnalyzedLogicalPlanType {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1240,6 +1298,225 @@ impl<'de> serde::Deserialize<'de> for AnalyzedLogicalPlanType {
         deserializer.deserialize_struct("datafusion.AnalyzedLogicalPlanType", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for ArrowScanExecNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.base_conf.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.ArrowScanExecNode", len)?;
+        if let Some(v) = self.base_conf.as_ref() {
+            struct_ser.serialize_field("baseConf", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for ArrowScanExecNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "base_conf",
+            "baseConf",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            BaseConf,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "baseConf" | "base_conf" => Ok(GeneratedField::BaseConf),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = ArrowScanExecNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.ArrowScanExecNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<ArrowScanExecNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut base_conf__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::BaseConf => {
+                            if base_conf__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("baseConf"));
+                            }
+                            base_conf__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(ArrowScanExecNode {
+                    base_conf: base_conf__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.ArrowScanExecNode", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for AsyncFuncExecNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.input.is_some() {
+            len += 1;
+        }
+        if !self.async_exprs.is_empty() {
+            len += 1;
+        }
+        if !self.async_expr_names.is_empty() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.AsyncFuncExecNode", len)?;
+        if let Some(v) = self.input.as_ref() {
+            struct_ser.serialize_field("input", v)?;
+        }
+        if !self.async_exprs.is_empty() {
+            struct_ser.serialize_field("asyncExprs", &self.async_exprs)?;
+        }
+        if !self.async_expr_names.is_empty() {
+            struct_ser.serialize_field("asyncExprNames", &self.async_expr_names)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for AsyncFuncExecNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "input",
+            "async_exprs",
+            "asyncExprs",
+            "async_expr_names",
+            "asyncExprNames",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Input,
+            AsyncExprs,
+            AsyncExprNames,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "input" => Ok(GeneratedField::Input),
+                            "asyncExprs" | "async_exprs" => Ok(GeneratedField::AsyncExprs),
+                            "asyncExprNames" | "async_expr_names" => Ok(GeneratedField::AsyncExprNames),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = AsyncFuncExecNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.AsyncFuncExecNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<AsyncFuncExecNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut input__ = None;
+                let mut async_exprs__ = None;
+                let mut async_expr_names__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Input => {
+                            if input__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("input"));
+                            }
+                            input__ = map_.next_value()?;
+                        }
+                        GeneratedField::AsyncExprs => {
+                            if async_exprs__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("asyncExprs"));
+                            }
+                            async_exprs__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::AsyncExprNames => {
+                            if async_expr_names__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("asyncExprNames"));
+                            }
+                            async_expr_names__ = Some(map_.next_value()?);
+                        }
+                    }
+                }
+                Ok(AsyncFuncExecNode {
+                    input: input__,
+                    async_exprs: async_exprs__.unwrap_or_default(),
+                    async_expr_names: async_expr_names__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.AsyncFuncExecNode", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for AvroScanExecNode {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -1280,7 +1557,7 @@ impl<'de> serde::Deserialize<'de> for AvroScanExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1371,7 +1648,7 @@ impl<'de> serde::Deserialize<'de> for BareTableReference {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1486,7 +1763,7 @@ impl<'de> serde::Deserialize<'de> for BetweenNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1612,7 +1889,7 @@ impl<'de> serde::Deserialize<'de> for BinaryExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1673,7 +1950,7 @@ impl<'de> serde::Deserialize<'de> for BinaryExprNode {
         deserializer.deserialize_struct("datafusion.BinaryExprNode", FIELDS, GeneratedVisitor)
     }
 }
-impl serde::Serialize for CaseNode {
+impl serde::Serialize for BufferExecNode {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -1681,47 +1958,39 @@ impl serde::Serialize for CaseNode {
     {
         use serde::ser::SerializeStruct;
         let mut len = 0;
-        if self.expr.is_some() {
-            len += 1;
-        }
-        if !self.when_then_expr.is_empty() {
+        if self.input.is_some() {
             len += 1;
         }
-        if self.else_expr.is_some() {
+        if self.capacity != 0 {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion.CaseNode", len)?;
-        if let Some(v) = self.expr.as_ref() {
-            struct_ser.serialize_field("expr", v)?;
-        }
-        if !self.when_then_expr.is_empty() {
-            struct_ser.serialize_field("whenThenExpr", &self.when_then_expr)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion.BufferExecNode", len)?;
+        if let Some(v) = self.input.as_ref() {
+            struct_ser.serialize_field("input", v)?;
         }
-        if let Some(v) = self.else_expr.as_ref() {
-            struct_ser.serialize_field("elseExpr", v)?;
+        if self.capacity != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("capacity", ToString::to_string(&self.capacity).as_str())?;
         }
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for CaseNode {
+impl<'de> serde::Deserialize<'de> for BufferExecNode {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
         const FIELDS: &[&str] = &[
-            "expr",
-            "when_then_expr",
-            "whenThenExpr",
-            "else_expr",
-            "elseExpr",
+            "input",
+            "capacity",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
-            Expr,
-            WhenThenExpr,
-            ElseExpr,
+            Input,
+            Capacity,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -1730,7 +1999,7 @@ impl<'de> serde::Deserialize<'de> for CaseNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1743,9 +2012,8 @@ impl<'de> serde::Deserialize<'de> for CaseNode {
                         E: serde::de::Error,
                     {
                         match value {
-                            "expr" => Ok(GeneratedField::Expr),
-                            "whenThenExpr" | "when_then_expr" => Ok(GeneratedField::WhenThenExpr),
-                            "elseExpr" | "else_expr" => Ok(GeneratedField::ElseExpr),
+                            "input" => Ok(GeneratedField::Input),
+                            "capacity" => Ok(GeneratedField::Capacity),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -1755,45 +2023,166 @@ impl<'de> serde::Deserialize<'de> for CaseNode {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = CaseNode;
+            type Value = BufferExecNode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion.CaseNode")
+                formatter.write_str("struct datafusion.BufferExecNode")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<CaseNode, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<BufferExecNode, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
-                let mut expr__ = None;
-                let mut when_then_expr__ = None;
-                let mut else_expr__ = None;
+                let mut input__ = None;
+                let mut capacity__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
-                        GeneratedField::Expr => {
-                            if expr__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("expr"));
-                            }
-                            expr__ = map_.next_value()?;
-                        }
-                        GeneratedField::WhenThenExpr => {
-                            if when_then_expr__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("whenThenExpr"));
+                        GeneratedField::Input => {
+                            if input__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("input"));
                             }
-                            when_then_expr__ = Some(map_.next_value()?);
+                            input__ = map_.next_value()?;
                         }
-                        GeneratedField::ElseExpr => {
-                            if else_expr__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("elseExpr"));
+                        GeneratedField::Capacity => {
+                            if capacity__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("capacity"));
                             }
-                            else_expr__ = map_.next_value()?;
+                            capacity__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
                         }
                     }
                 }
-                Ok(CaseNode {
-                    expr: expr__,
-                    when_then_expr: when_then_expr__.unwrap_or_default(),
-                    else_expr: else_expr__,
+                Ok(BufferExecNode {
+                    input: input__,
+                    capacity: capacity__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.BufferExecNode", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for CaseNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.expr.is_some() {
+            len += 1;
+        }
+        if !self.when_then_expr.is_empty() {
+            len += 1;
+        }
+        if self.else_expr.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.CaseNode", len)?;
+        if let Some(v) = self.expr.as_ref() {
+            struct_ser.serialize_field("expr", v)?;
+        }
+        if !self.when_then_expr.is_empty() {
+            struct_ser.serialize_field("whenThenExpr", &self.when_then_expr)?;
+        }
+        if let Some(v) = self.else_expr.as_ref() {
+            struct_ser.serialize_field("elseExpr", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for CaseNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "expr",
+            "when_then_expr",
+            "whenThenExpr",
+            "else_expr",
+            "elseExpr",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Expr,
+            WhenThenExpr,
+            ElseExpr,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "expr" => Ok(GeneratedField::Expr),
+                            "whenThenExpr" | "when_then_expr" => Ok(GeneratedField::WhenThenExpr),
+                            "elseExpr" | "else_expr" => Ok(GeneratedField::ElseExpr),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = CaseNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.CaseNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<CaseNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut expr__ = None;
+                let mut when_then_expr__ = None;
+                let mut else_expr__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Expr => {
+                            if expr__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("expr"));
+                            }
+                            expr__ = map_.next_value()?;
+                        }
+                        GeneratedField::WhenThenExpr => {
+                            if when_then_expr__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("whenThenExpr"));
+                            }
+                            when_then_expr__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::ElseExpr => {
+                            if else_expr__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("elseExpr"));
+                            }
+                            else_expr__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(CaseNode {
+                    expr: expr__,
+                    when_then_expr: when_then_expr__.unwrap_or_default(),
+                    else_expr: else_expr__,
                 })
             }
         }
@@ -1814,6 +2203,12 @@ impl serde::Serialize for CastNode {
         if self.arrow_type.is_some() {
             len += 1;
         }
+        if !self.metadata.is_empty() {
+            len += 1;
+        }
+        if self.nullable.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.CastNode", len)?;
         if let Some(v) = self.expr.as_ref() {
             struct_ser.serialize_field("expr", v)?;
@@ -1821,6 +2216,12 @@ impl serde::Serialize for CastNode {
         if let Some(v) = self.arrow_type.as_ref() {
             struct_ser.serialize_field("arrowType", v)?;
         }
+        if !self.metadata.is_empty() {
+            struct_ser.serialize_field("metadata", &self.metadata)?;
+        }
+        if let Some(v) = self.nullable.as_ref() {
+            struct_ser.serialize_field("nullable", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -1834,12 +2235,16 @@ impl<'de> serde::Deserialize<'de> for CastNode {
             "expr",
             "arrow_type",
             "arrowType",
+            "metadata",
+            "nullable",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Expr,
             ArrowType,
+            Metadata,
+            Nullable,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -1848,7 +2253,7 @@ impl<'de> serde::Deserialize<'de> for CastNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1863,6 +2268,8 @@ impl<'de> serde::Deserialize<'de> for CastNode {
                         match value {
                             "expr" => Ok(GeneratedField::Expr),
                             "arrowType" | "arrow_type" => Ok(GeneratedField::ArrowType),
+                            "metadata" => Ok(GeneratedField::Metadata),
+                            "nullable" => Ok(GeneratedField::Nullable),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -1884,6 +2291,8 @@ impl<'de> serde::Deserialize<'de> for CastNode {
             {
                 let mut expr__ = None;
                 let mut arrow_type__ = None;
+                let mut metadata__ = None;
+                let mut nullable__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Expr => {
@@ -1898,11 +2307,27 @@ impl<'de> serde::Deserialize<'de> for CastNode {
                             }
                             arrow_type__ = map_.next_value()?;
                         }
+                        GeneratedField::Metadata => {
+                            if metadata__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("metadata"));
+                            }
+                            metadata__ = Some(
+                                map_.next_value::<std::collections::HashMap<_, _>>()?
+                            );
+                        }
+                        GeneratedField::Nullable => {
+                            if nullable__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullable"));
+                            }
+                            nullable__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(CastNode {
                     expr: expr__,
                     arrow_type: arrow_type__,
+                    metadata: metadata__.unwrap_or_default(),
+                    nullable: nullable__,
                 })
             }
         }
@@ -1965,7 +2390,7 @@ impl<'de> serde::Deserialize<'de> for CoalesceBatchesExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2086,7 +2511,7 @@ impl<'de> serde::Deserialize<'de> for CoalescePartitionsExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2198,7 +2623,7 @@ impl<'de> serde::Deserialize<'de> for ColumnIndex {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2309,7 +2734,7 @@ impl<'de> serde::Deserialize<'de> for ColumnUnnestListItem {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2420,7 +2845,7 @@ impl<'de> serde::Deserialize<'de> for ColumnUnnestListRecursion {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2522,7 +2947,7 @@ impl<'de> serde::Deserialize<'de> for ColumnUnnestListRecursions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2574,6 +2999,97 @@ impl<'de> serde::Deserialize<'de> for ColumnUnnestListRecursions {
         deserializer.deserialize_struct("datafusion.ColumnUnnestListRecursions", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for CooperativeExecNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.input.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.CooperativeExecNode", len)?;
+        if let Some(v) = self.input.as_ref() {
+            struct_ser.serialize_field("input", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for CooperativeExecNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "input",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Input,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "input" => Ok(GeneratedField::Input),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = CooperativeExecNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.CooperativeExecNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<CooperativeExecNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut input__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Input => {
+                            if input__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("input"));
+                            }
+                            input__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(CooperativeExecNode {
+                    input: input__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.CooperativeExecNode", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for CopyToNode {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -2642,7 +3158,7 @@ impl<'de> serde::Deserialize<'de> for CopyToNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2780,7 +3296,7 @@ impl<'de> serde::Deserialize<'de> for CreateCatalogNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -2907,7 +3423,7 @@ impl<'de> serde::Deserialize<'de> for CreateCatalogSchemaNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3003,6 +3519,9 @@ impl serde::Serialize for CreateExternalTableNode {
         if self.if_not_exists {
             len += 1;
         }
+        if self.or_replace {
+            len += 1;
+        }
         if self.temporary {
             len += 1;
         }
@@ -3043,6 +3562,9 @@ impl serde::Serialize for CreateExternalTableNode {
         if self.if_not_exists {
             struct_ser.serialize_field("ifNotExists", &self.if_not_exists)?;
         }
+        if self.or_replace {
+            struct_ser.serialize_field("orReplace", &self.or_replace)?;
+        }
         if self.temporary {
             struct_ser.serialize_field("temporary", &self.temporary)?;
         }
@@ -3083,6 +3605,8 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
             "tablePartitionCols",
             "if_not_exists",
             "ifNotExists",
+            "or_replace",
+            "orReplace",
             "temporary",
             "definition",
             "order_exprs",
@@ -3102,6 +3626,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
             Schema,
             TablePartitionCols,
             IfNotExists,
+            OrReplace,
             Temporary,
             Definition,
             OrderExprs,
@@ -3117,7 +3642,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3136,6 +3661,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                             "schema" => Ok(GeneratedField::Schema),
                             "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols),
                             "ifNotExists" | "if_not_exists" => Ok(GeneratedField::IfNotExists),
+                            "orReplace" | "or_replace" => Ok(GeneratedField::OrReplace),
                             "temporary" => Ok(GeneratedField::Temporary),
                             "definition" => Ok(GeneratedField::Definition),
                             "orderExprs" | "order_exprs" => Ok(GeneratedField::OrderExprs),
@@ -3168,6 +3694,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                 let mut schema__ = None;
                 let mut table_partition_cols__ = None;
                 let mut if_not_exists__ = None;
+                let mut or_replace__ = None;
                 let mut temporary__ = None;
                 let mut definition__ = None;
                 let mut order_exprs__ = None;
@@ -3213,6 +3740,12 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                             }
                             if_not_exists__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::OrReplace => {
+                            if or_replace__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("orReplace"));
+                            }
+                            or_replace__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::Temporary => {
                             if temporary__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("temporary"));
@@ -3268,6 +3801,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                     schema: schema__,
                     table_partition_cols: table_partition_cols__.unwrap_or_default(),
                     if_not_exists: if_not_exists__.unwrap_or_default(),
+                    or_replace: or_replace__.unwrap_or_default(),
                     temporary: temporary__.unwrap_or_default(),
                     definition: definition__.unwrap_or_default(),
                     order_exprs: order_exprs__.unwrap_or_default(),
@@ -3353,7 +3887,7 @@ impl<'de> serde::Deserialize<'de> for CreateViewNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3488,7 +4022,7 @@ impl<'de> serde::Deserialize<'de> for CrossJoinExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3596,7 +4130,7 @@ impl<'de> serde::Deserialize<'de> for CrossJoinNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3680,6 +4214,9 @@ impl serde::Serialize for CsvScanExecNode {
         if self.newlines_in_values {
             len += 1;
         }
+        if self.truncate_rows {
+            len += 1;
+        }
         if self.optional_escape.is_some() {
             len += 1;
         }
@@ -3702,6 +4239,9 @@ impl serde::Serialize for CsvScanExecNode {
         if self.newlines_in_values {
             struct_ser.serialize_field("newlinesInValues", &self.newlines_in_values)?;
         }
+        if self.truncate_rows {
+            struct_ser.serialize_field("truncateRows", &self.truncate_rows)?;
+        }
         if let Some(v) = self.optional_escape.as_ref() {
             match v {
                 csv_scan_exec_node::OptionalEscape::Escape(v) => {
@@ -3734,6 +4274,8 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
             "quote",
             "newlines_in_values",
             "newlinesInValues",
+            "truncate_rows",
+            "truncateRows",
             "escape",
             "comment",
         ];
@@ -3745,6 +4287,7 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
             Delimiter,
             Quote,
             NewlinesInValues,
+            TruncateRows,
             Escape,
             Comment,
         }
@@ -3755,7 +4298,7 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3773,6 +4316,7 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
                             "delimiter" => Ok(GeneratedField::Delimiter),
                             "quote" => Ok(GeneratedField::Quote),
                             "newlinesInValues" | "newlines_in_values" => Ok(GeneratedField::NewlinesInValues),
+                            "truncateRows" | "truncate_rows" => Ok(GeneratedField::TruncateRows),
                             "escape" => Ok(GeneratedField::Escape),
                             "comment" => Ok(GeneratedField::Comment),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
@@ -3799,6 +4343,7 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
                 let mut delimiter__ = None;
                 let mut quote__ = None;
                 let mut newlines_in_values__ = None;
+                let mut truncate_rows__ = None;
                 let mut optional_escape__ = None;
                 let mut optional_comment__ = None;
                 while let Some(k) = map_.next_key()? {
@@ -3833,6 +4378,12 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
                             }
                             newlines_in_values__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::TruncateRows => {
+                            if truncate_rows__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("truncateRows"));
+                            }
+                            truncate_rows__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::Escape => {
                             if optional_escape__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("escape"));
@@ -3853,6 +4404,7 @@ impl<'de> serde::Deserialize<'de> for CsvScanExecNode {
                     delimiter: delimiter__.unwrap_or_default(),
                     quote: quote__.unwrap_or_default(),
                     newlines_in_values: newlines_in_values__.unwrap_or_default(),
+                    truncate_rows: truncate_rows__.unwrap_or_default(),
                     optional_escape: optional_escape__,
                     optional_comment: optional_comment__,
                 })
@@ -3909,7 +4461,7 @@ impl<'de> serde::Deserialize<'de> for CsvSink {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4035,7 +4587,7 @@ impl<'de> serde::Deserialize<'de> for CsvSinkExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4161,7 +4713,7 @@ impl<'de> serde::Deserialize<'de> for CteWorkTableScanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4261,7 +4813,7 @@ impl<'de> serde::Deserialize<'de> for CubeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4388,7 +4940,7 @@ impl<'de> serde::Deserialize<'de> for CustomTableScanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4504,7 +5056,7 @@ impl<'de> serde::Deserialize<'de> for DateUnit {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = DateUnit;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4588,7 +5140,7 @@ impl<'de> serde::Deserialize<'de> for DistinctNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4706,7 +5258,7 @@ impl<'de> serde::Deserialize<'de> for DistinctOnNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4852,7 +5404,7 @@ impl<'de> serde::Deserialize<'de> for DmlNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -4944,6 +5496,7 @@ impl serde::Serialize for dml_node::Type {
             Self::InsertAppend => "INSERT_APPEND",
             Self::InsertOverwrite => "INSERT_OVERWRITE",
             Self::InsertReplace => "INSERT_REPLACE",
+            Self::Truncate => "TRUNCATE",
         };
         serializer.serialize_str(variant)
     }
@@ -4961,11 +5514,12 @@ impl<'de> serde::Deserialize<'de> for dml_node::Type {
             "INSERT_APPEND",
             "INSERT_OVERWRITE",
             "INSERT_REPLACE",
+            "TRUNCATE",
         ];
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = dml_node::Type;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5007,6 +5561,7 @@ impl<'de> serde::Deserialize<'de> for dml_node::Type {
                     "INSERT_APPEND" => Ok(dml_node::Type::InsertAppend),
                     "INSERT_OVERWRITE" => Ok(dml_node::Type::InsertOverwrite),
                     "INSERT_REPLACE" => Ok(dml_node::Type::InsertReplace),
+                    "TRUNCATE" => Ok(dml_node::Type::Truncate),
                     _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
                 }
             }
@@ -5070,7 +5625,7 @@ impl<'de> serde::Deserialize<'de> for DropViewNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5179,7 +5734,7 @@ impl<'de> serde::Deserialize<'de> for EmptyExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5271,7 +5826,7 @@ impl<'de> serde::Deserialize<'de> for EmptyRelationNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5379,7 +5934,7 @@ impl<'de> serde::Deserialize<'de> for ExplainExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5496,7 +6051,7 @@ impl<'de> serde::Deserialize<'de> for ExplainNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5557,6 +6112,204 @@ impl<'de> serde::Deserialize<'de> for ExplainNode {
         deserializer.deserialize_struct("datafusion.ExplainNode", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for FileFormatKind {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let variant = match self {
+            Self::Unspecified => "FILE_FORMAT_KIND_UNSPECIFIED",
+            Self::Csv => "FILE_FORMAT_KIND_CSV",
+            Self::Json => "FILE_FORMAT_KIND_JSON",
+            Self::Parquet => "FILE_FORMAT_KIND_PARQUET",
+            Self::Arrow => "FILE_FORMAT_KIND_ARROW",
+            Self::Avro => "FILE_FORMAT_KIND_AVRO",
+        };
+        serializer.serialize_str(variant)
+    }
+}
+impl<'de> serde::Deserialize<'de> for FileFormatKind {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "FILE_FORMAT_KIND_UNSPECIFIED",
+            "FILE_FORMAT_KIND_CSV",
+            "FILE_FORMAT_KIND_JSON",
+            "FILE_FORMAT_KIND_PARQUET",
+            "FILE_FORMAT_KIND_ARROW",
+            "FILE_FORMAT_KIND_AVRO",
+        ];
+
+        struct GeneratedVisitor;
+
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
+            type Value = FileFormatKind;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(formatter, "expected one of: {:?}", &FIELDS)
+            }
+
+            fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
+                    })
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
+                    })
+            }
+
+            fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                match value {
+                    "FILE_FORMAT_KIND_UNSPECIFIED" => Ok(FileFormatKind::Unspecified),
+                    "FILE_FORMAT_KIND_CSV" => Ok(FileFormatKind::Csv),
+                    "FILE_FORMAT_KIND_JSON" => Ok(FileFormatKind::Json),
+                    "FILE_FORMAT_KIND_PARQUET" => Ok(FileFormatKind::Parquet),
+                    "FILE_FORMAT_KIND_ARROW" => Ok(FileFormatKind::Arrow),
+                    "FILE_FORMAT_KIND_AVRO" => Ok(FileFormatKind::Avro),
+                    _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
+                }
+            }
+        }
+        deserializer.deserialize_any(GeneratedVisitor)
+    }
+}
+impl serde::Serialize for FileFormatProto {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.kind != 0 {
+            len += 1;
+        }
+        if !self.encoded_file_format.is_empty() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.FileFormatProto", len)?;
+        if self.kind != 0 {
+            let v = FileFormatKind::try_from(self.kind)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.kind)))?;
+            struct_ser.serialize_field("kind", &v)?;
+        }
+        if !self.encoded_file_format.is_empty() {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("encodedFileFormat", pbjson::private::base64::encode(&self.encoded_file_format).as_str())?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for FileFormatProto {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "kind",
+            "encoded_file_format",
+            "encodedFileFormat",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Kind,
+            EncodedFileFormat,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "kind" => Ok(GeneratedField::Kind),
+                            "encodedFileFormat" | "encoded_file_format" => Ok(GeneratedField::EncodedFileFormat),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = FileFormatProto;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.FileFormatProto")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<FileFormatProto, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut kind__ = None;
+                let mut encoded_file_format__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Kind => {
+                            if kind__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("kind"));
+                            }
+                            kind__ = Some(map_.next_value::<FileFormatKind>()? as i32);
+                        }
+                        GeneratedField::EncodedFileFormat => {
+                            if encoded_file_format__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("encodedFileFormat"));
+                            }
+                            encoded_file_format__ = 
+                                Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
+                            ;
+                        }
+                    }
+                }
+                Ok(FileFormatProto {
+                    kind: kind__.unwrap_or_default(),
+                    encoded_file_format: encoded_file_format__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.FileFormatProto", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for FileGroup {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -5596,7 +6349,7 @@ impl<'de> serde::Deserialize<'de> for FileGroup {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5648,6 +6401,80 @@ impl<'de> serde::Deserialize<'de> for FileGroup {
         deserializer.deserialize_struct("datafusion.FileGroup", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for FileOutputMode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let variant = match self {
+            Self::Automatic => "FILE_OUTPUT_MODE_AUTOMATIC",
+            Self::SingleFile => "FILE_OUTPUT_MODE_SINGLE_FILE",
+            Self::Directory => "FILE_OUTPUT_MODE_DIRECTORY",
+        };
+        serializer.serialize_str(variant)
+    }
+}
+impl<'de> serde::Deserialize<'de> for FileOutputMode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "FILE_OUTPUT_MODE_AUTOMATIC",
+            "FILE_OUTPUT_MODE_SINGLE_FILE",
+            "FILE_OUTPUT_MODE_DIRECTORY",
+        ];
+
+        struct GeneratedVisitor;
+
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
+            type Value = FileOutputMode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(formatter, "expected one of: {:?}", &FIELDS)
+            }
+
+            fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
+                    })
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
+                    })
+            }
+
+            fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                match value {
+                    "FILE_OUTPUT_MODE_AUTOMATIC" => Ok(FileOutputMode::Automatic),
+                    "FILE_OUTPUT_MODE_SINGLE_FILE" => Ok(FileOutputMode::SingleFile),
+                    "FILE_OUTPUT_MODE_DIRECTORY" => Ok(FileOutputMode::Directory),
+                    _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
+                }
+            }
+        }
+        deserializer.deserialize_any(GeneratedVisitor)
+    }
+}
 impl serde::Serialize for FileRange {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -5699,7 +6526,7 @@ impl<'de> serde::Deserialize<'de> for FileRange {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5802,6 +6629,9 @@ impl serde::Serialize for FileScanExecConf {
         if self.batch_size.is_some() {
             len += 1;
         }
+        if self.projection_exprs.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.FileScanExecConf", len)?;
         if !self.file_groups.is_empty() {
             struct_ser.serialize_field("fileGroups", &self.file_groups)?;
@@ -5835,6 +6665,9 @@ impl serde::Serialize for FileScanExecConf {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("batchSize", ToString::to_string(&v).as_str())?;
         }
+        if let Some(v) = self.projection_exprs.as_ref() {
+            struct_ser.serialize_field("projectionExprs", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -5860,6 +6693,8 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
             "constraints",
             "batch_size",
             "batchSize",
+            "projection_exprs",
+            "projectionExprs",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -5874,6 +6709,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
             OutputOrdering,
             Constraints,
             BatchSize,
+            ProjectionExprs,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -5882,7 +6718,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -5905,6 +6741,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
                             "outputOrdering" | "output_ordering" => Ok(GeneratedField::OutputOrdering),
                             "constraints" => Ok(GeneratedField::Constraints),
                             "batchSize" | "batch_size" => Ok(GeneratedField::BatchSize),
+                            "projectionExprs" | "projection_exprs" => Ok(GeneratedField::ProjectionExprs),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -5934,6 +6771,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
                 let mut output_ordering__ = None;
                 let mut constraints__ = None;
                 let mut batch_size__ = None;
+                let mut projection_exprs__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::FileGroups => {
@@ -6001,6 +6839,12 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
                                 map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::ProjectionExprs => {
+                            if projection_exprs__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("projectionExprs"));
+                            }
+                            projection_exprs__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(FileScanExecConf {
@@ -6014,6 +6858,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf {
                     output_ordering: output_ordering__.unwrap_or_default(),
                     constraints: constraints__,
                     batch_size: batch_size__,
+                    projection_exprs: projection_exprs__,
                 })
             }
         }
@@ -6052,6 +6897,9 @@ impl serde::Serialize for FileSinkConfig {
         if !self.file_extension.is_empty() {
             len += 1;
         }
+        if self.file_output_mode != 0 {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.FileSinkConfig", len)?;
         if !self.object_store_url.is_empty() {
             struct_ser.serialize_field("objectStoreUrl", &self.object_store_url)?;
@@ -6079,6 +6927,11 @@ impl serde::Serialize for FileSinkConfig {
         if !self.file_extension.is_empty() {
             struct_ser.serialize_field("fileExtension", &self.file_extension)?;
         }
+        if self.file_output_mode != 0 {
+            let v = FileOutputMode::try_from(self.file_output_mode)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.file_output_mode)))?;
+            struct_ser.serialize_field("fileOutputMode", &v)?;
+        }
         struct_ser.end()
     }
 }
@@ -6105,6 +6958,8 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
             "insertOp",
             "file_extension",
             "fileExtension",
+            "file_output_mode",
+            "fileOutputMode",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -6117,6 +6972,7 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
             KeepPartitionByColumns,
             InsertOp,
             FileExtension,
+            FileOutputMode,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -6125,7 +6981,7 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6146,6 +7002,7 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
                             "keepPartitionByColumns" | "keep_partition_by_columns" => Ok(GeneratedField::KeepPartitionByColumns),
                             "insertOp" | "insert_op" => Ok(GeneratedField::InsertOp),
                             "fileExtension" | "file_extension" => Ok(GeneratedField::FileExtension),
+                            "fileOutputMode" | "file_output_mode" => Ok(GeneratedField::FileOutputMode),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -6173,6 +7030,7 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
                 let mut keep_partition_by_columns__ = None;
                 let mut insert_op__ = None;
                 let mut file_extension__ = None;
+                let mut file_output_mode__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::ObjectStoreUrl => {
@@ -6223,6 +7081,12 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
                             }
                             file_extension__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::FileOutputMode => {
+                            if file_output_mode__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("fileOutputMode"));
+                            }
+                            file_output_mode__ = Some(map_.next_value::<FileOutputMode>()? as i32);
+                        }
                     }
                 }
                 Ok(FileSinkConfig {
@@ -6234,6 +7098,7 @@ impl<'de> serde::Deserialize<'de> for FileSinkConfig {
                     keep_partition_by_columns: keep_partition_by_columns__.unwrap_or_default(),
                     insert_op: insert_op__.unwrap_or_default(),
                     file_extension: file_extension__.unwrap_or_default(),
+                    file_output_mode: file_output_mode__.unwrap_or_default(),
                 })
             }
         }
@@ -6260,6 +7125,12 @@ impl serde::Serialize for FilterExecNode {
         if !self.projection.is_empty() {
             len += 1;
         }
+        if self.batch_size != 0 {
+            len += 1;
+        }
+        if self.fetch.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.FilterExecNode", len)?;
         if let Some(v) = self.input.as_ref() {
             struct_ser.serialize_field("input", v)?;
@@ -6273,6 +7144,12 @@ impl serde::Serialize for FilterExecNode {
         if !self.projection.is_empty() {
             struct_ser.serialize_field("projection", &self.projection)?;
         }
+        if self.batch_size != 0 {
+            struct_ser.serialize_field("batchSize", &self.batch_size)?;
+        }
+        if let Some(v) = self.fetch.as_ref() {
+            struct_ser.serialize_field("fetch", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -6288,6 +7165,9 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
             "default_filter_selectivity",
             "defaultFilterSelectivity",
             "projection",
+            "batch_size",
+            "batchSize",
+            "fetch",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -6296,6 +7176,8 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
             Expr,
             DefaultFilterSelectivity,
             Projection,
+            BatchSize,
+            Fetch,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -6304,7 +7186,7 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6321,6 +7203,8 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
                             "expr" => Ok(GeneratedField::Expr),
                             "defaultFilterSelectivity" | "default_filter_selectivity" => Ok(GeneratedField::DefaultFilterSelectivity),
                             "projection" => Ok(GeneratedField::Projection),
+                            "batchSize" | "batch_size" => Ok(GeneratedField::BatchSize),
+                            "fetch" => Ok(GeneratedField::Fetch),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -6344,6 +7228,8 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
                 let mut expr__ = None;
                 let mut default_filter_selectivity__ = None;
                 let mut projection__ = None;
+                let mut batch_size__ = None;
+                let mut fetch__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Input => {
@@ -6375,6 +7261,22 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
                                     .into_iter().map(|x| x.0).collect())
                             ;
                         }
+                        GeneratedField::BatchSize => {
+                            if batch_size__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("batchSize"));
+                            }
+                            batch_size__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Fetch => {
+                            if fetch__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("fetch"));
+                            }
+                            fetch__ = 
+                                map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
+                            ;
+                        }
                     }
                 }
                 Ok(FilterExecNode {
@@ -6382,6 +7284,8 @@ impl<'de> serde::Deserialize<'de> for FilterExecNode {
                     expr: expr__,
                     default_filter_selectivity: default_filter_selectivity__.unwrap_or_default(),
                     projection: projection__.unwrap_or_default(),
+                    batch_size: batch_size__.unwrap_or_default(),
+                    fetch: fetch__,
                 })
             }
         }
@@ -6427,7 +7331,7 @@ impl<'de> serde::Deserialize<'de> for FixedSizeBinary {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6473,15 +7377,835 @@ impl<'de> serde::Deserialize<'de> for FixedSizeBinary {
                         }
                     }
                 }
-                Ok(FixedSizeBinary {
-                    length: length__.unwrap_or_default(),
-                })
+                Ok(FixedSizeBinary {
+                    length: length__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.FixedSizeBinary", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for FullTableReference {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.catalog.is_empty() {
+            len += 1;
+        }
+        if !self.schema.is_empty() {
+            len += 1;
+        }
+        if !self.table.is_empty() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.FullTableReference", len)?;
+        if !self.catalog.is_empty() {
+            struct_ser.serialize_field("catalog", &self.catalog)?;
+        }
+        if !self.schema.is_empty() {
+            struct_ser.serialize_field("schema", &self.schema)?;
+        }
+        if !self.table.is_empty() {
+            struct_ser.serialize_field("table", &self.table)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for FullTableReference {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "catalog",
+            "schema",
+            "table",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Catalog,
+            Schema,
+            Table,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "catalog" => Ok(GeneratedField::Catalog),
+                            "schema" => Ok(GeneratedField::Schema),
+                            "table" => Ok(GeneratedField::Table),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = FullTableReference;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.FullTableReference")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<FullTableReference, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut catalog__ = None;
+                let mut schema__ = None;
+                let mut table__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Catalog => {
+                            if catalog__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("catalog"));
+                            }
+                            catalog__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Schema => {
+                            if schema__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("schema"));
+                            }
+                            schema__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Table => {
+                            if table__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("table"));
+                            }
+                            table__ = Some(map_.next_value()?);
+                        }
+                    }
+                }
+                Ok(FullTableReference {
+                    catalog: catalog__.unwrap_or_default(),
+                    schema: schema__.unwrap_or_default(),
+                    table: table__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.FullTableReference", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for GenerateSeriesArgsContainsNull {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.name != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.GenerateSeriesArgsContainsNull", len)?;
+        if self.name != 0 {
+            let v = GenerateSeriesName::try_from(self.name)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.name)))?;
+            struct_ser.serialize_field("name", &v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for GenerateSeriesArgsContainsNull {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "name",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Name,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "name" => Ok(GeneratedField::Name),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = GenerateSeriesArgsContainsNull;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.GenerateSeriesArgsContainsNull")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<GenerateSeriesArgsContainsNull, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut name__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Name => {
+                            if name__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("name"));
+                            }
+                            name__ = Some(map_.next_value::<GenerateSeriesName>()? as i32);
+                        }
+                    }
+                }
+                Ok(GenerateSeriesArgsContainsNull {
+                    name: name__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.GenerateSeriesArgsContainsNull", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for GenerateSeriesArgsDate {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.start != 0 {
+            len += 1;
+        }
+        if self.end != 0 {
+            len += 1;
+        }
+        if self.step.is_some() {
+            len += 1;
+        }
+        if self.include_end {
+            len += 1;
+        }
+        if self.name != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.GenerateSeriesArgsDate", len)?;
+        if self.start != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("start", ToString::to_string(&self.start).as_str())?;
+        }
+        if self.end != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("end", ToString::to_string(&self.end).as_str())?;
+        }
+        if let Some(v) = self.step.as_ref() {
+            struct_ser.serialize_field("step", v)?;
+        }
+        if self.include_end {
+            struct_ser.serialize_field("includeEnd", &self.include_end)?;
+        }
+        if self.name != 0 {
+            let v = GenerateSeriesName::try_from(self.name)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.name)))?;
+            struct_ser.serialize_field("name", &v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for GenerateSeriesArgsDate {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "start",
+            "end",
+            "step",
+            "include_end",
+            "includeEnd",
+            "name",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Start,
+            End,
+            Step,
+            IncludeEnd,
+            Name,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "start" => Ok(GeneratedField::Start),
+                            "end" => Ok(GeneratedField::End),
+                            "step" => Ok(GeneratedField::Step),
+                            "includeEnd" | "include_end" => Ok(GeneratedField::IncludeEnd),
+                            "name" => Ok(GeneratedField::Name),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = GenerateSeriesArgsDate;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.GenerateSeriesArgsDate")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<GenerateSeriesArgsDate, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut start__ = None;
+                let mut end__ = None;
+                let mut step__ = None;
+                let mut include_end__ = None;
+                let mut name__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Start => {
+                            if start__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("start"));
+                            }
+                            start__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::End => {
+                            if end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("end"));
+                            }
+                            end__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Step => {
+                            if step__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("step"));
+                            }
+                            step__ = map_.next_value()?;
+                        }
+                        GeneratedField::IncludeEnd => {
+                            if include_end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("includeEnd"));
+                            }
+                            include_end__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Name => {
+                            if name__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("name"));
+                            }
+                            name__ = Some(map_.next_value::<GenerateSeriesName>()? as i32);
+                        }
+                    }
+                }
+                Ok(GenerateSeriesArgsDate {
+                    start: start__.unwrap_or_default(),
+                    end: end__.unwrap_or_default(),
+                    step: step__,
+                    include_end: include_end__.unwrap_or_default(),
+                    name: name__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.GenerateSeriesArgsDate", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for GenerateSeriesArgsInt64 {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.start != 0 {
+            len += 1;
+        }
+        if self.end != 0 {
+            len += 1;
+        }
+        if self.step != 0 {
+            len += 1;
+        }
+        if self.include_end {
+            len += 1;
+        }
+        if self.name != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.GenerateSeriesArgsInt64", len)?;
+        if self.start != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("start", ToString::to_string(&self.start).as_str())?;
+        }
+        if self.end != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("end", ToString::to_string(&self.end).as_str())?;
+        }
+        if self.step != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("step", ToString::to_string(&self.step).as_str())?;
+        }
+        if self.include_end {
+            struct_ser.serialize_field("includeEnd", &self.include_end)?;
+        }
+        if self.name != 0 {
+            let v = GenerateSeriesName::try_from(self.name)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.name)))?;
+            struct_ser.serialize_field("name", &v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for GenerateSeriesArgsInt64 {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "start",
+            "end",
+            "step",
+            "include_end",
+            "includeEnd",
+            "name",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Start,
+            End,
+            Step,
+            IncludeEnd,
+            Name,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "start" => Ok(GeneratedField::Start),
+                            "end" => Ok(GeneratedField::End),
+                            "step" => Ok(GeneratedField::Step),
+                            "includeEnd" | "include_end" => Ok(GeneratedField::IncludeEnd),
+                            "name" => Ok(GeneratedField::Name),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = GenerateSeriesArgsInt64;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.GenerateSeriesArgsInt64")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<GenerateSeriesArgsInt64, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut start__ = None;
+                let mut end__ = None;
+                let mut step__ = None;
+                let mut include_end__ = None;
+                let mut name__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Start => {
+                            if start__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("start"));
+                            }
+                            start__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::End => {
+                            if end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("end"));
+                            }
+                            end__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Step => {
+                            if step__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("step"));
+                            }
+                            step__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::IncludeEnd => {
+                            if include_end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("includeEnd"));
+                            }
+                            include_end__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Name => {
+                            if name__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("name"));
+                            }
+                            name__ = Some(map_.next_value::<GenerateSeriesName>()? as i32);
+                        }
+                    }
+                }
+                Ok(GenerateSeriesArgsInt64 {
+                    start: start__.unwrap_or_default(),
+                    end: end__.unwrap_or_default(),
+                    step: step__.unwrap_or_default(),
+                    include_end: include_end__.unwrap_or_default(),
+                    name: name__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.GenerateSeriesArgsInt64", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for GenerateSeriesArgsTimestamp {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.start != 0 {
+            len += 1;
+        }
+        if self.end != 0 {
+            len += 1;
+        }
+        if self.step.is_some() {
+            len += 1;
+        }
+        if self.tz.is_some() {
+            len += 1;
+        }
+        if self.include_end {
+            len += 1;
+        }
+        if self.name != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.GenerateSeriesArgsTimestamp", len)?;
+        if self.start != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("start", ToString::to_string(&self.start).as_str())?;
+        }
+        if self.end != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("end", ToString::to_string(&self.end).as_str())?;
+        }
+        if let Some(v) = self.step.as_ref() {
+            struct_ser.serialize_field("step", v)?;
+        }
+        if let Some(v) = self.tz.as_ref() {
+            struct_ser.serialize_field("tz", v)?;
+        }
+        if self.include_end {
+            struct_ser.serialize_field("includeEnd", &self.include_end)?;
+        }
+        if self.name != 0 {
+            let v = GenerateSeriesName::try_from(self.name)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.name)))?;
+            struct_ser.serialize_field("name", &v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for GenerateSeriesArgsTimestamp {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "start",
+            "end",
+            "step",
+            "tz",
+            "include_end",
+            "includeEnd",
+            "name",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Start,
+            End,
+            Step,
+            Tz,
+            IncludeEnd,
+            Name,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "start" => Ok(GeneratedField::Start),
+                            "end" => Ok(GeneratedField::End),
+                            "step" => Ok(GeneratedField::Step),
+                            "tz" => Ok(GeneratedField::Tz),
+                            "includeEnd" | "include_end" => Ok(GeneratedField::IncludeEnd),
+                            "name" => Ok(GeneratedField::Name),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = GenerateSeriesArgsTimestamp;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.GenerateSeriesArgsTimestamp")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<GenerateSeriesArgsTimestamp, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut start__ = None;
+                let mut end__ = None;
+                let mut step__ = None;
+                let mut tz__ = None;
+                let mut include_end__ = None;
+                let mut name__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Start => {
+                            if start__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("start"));
+                            }
+                            start__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::End => {
+                            if end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("end"));
+                            }
+                            end__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Step => {
+                            if step__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("step"));
+                            }
+                            step__ = map_.next_value()?;
+                        }
+                        GeneratedField::Tz => {
+                            if tz__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("tz"));
+                            }
+                            tz__ = map_.next_value()?;
+                        }
+                        GeneratedField::IncludeEnd => {
+                            if include_end__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("includeEnd"));
+                            }
+                            include_end__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Name => {
+                            if name__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("name"));
+                            }
+                            name__ = Some(map_.next_value::<GenerateSeriesName>()? as i32);
+                        }
+                    }
+                }
+                Ok(GenerateSeriesArgsTimestamp {
+                    start: start__.unwrap_or_default(),
+                    end: end__.unwrap_or_default(),
+                    step: step__,
+                    tz: tz__,
+                    include_end: include_end__.unwrap_or_default(),
+                    name: name__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.GenerateSeriesArgsTimestamp", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for GenerateSeriesName {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let variant = match self {
+            Self::GsGenerateSeries => "GS_GENERATE_SERIES",
+            Self::GsRange => "GS_RANGE",
+        };
+        serializer.serialize_str(variant)
+    }
+}
+impl<'de> serde::Deserialize<'de> for GenerateSeriesName {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "GS_GENERATE_SERIES",
+            "GS_RANGE",
+        ];
+
+        struct GeneratedVisitor;
+
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
+            type Value = GenerateSeriesName;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(formatter, "expected one of: {:?}", &FIELDS)
+            }
+
+            fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
+                    })
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
+                    })
+            }
+
+            fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                match value {
+                    "GS_GENERATE_SERIES" => Ok(GenerateSeriesName::GsGenerateSeries),
+                    "GS_RANGE" => Ok(GenerateSeriesName::GsRange),
+                    _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
+                }
             }
         }
-        deserializer.deserialize_struct("datafusion.FixedSizeBinary", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_any(GeneratedVisitor)
     }
 }
-impl serde::Serialize for FullTableReference {
+impl serde::Serialize for GenerateSeriesNode {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
@@ -6489,45 +8213,69 @@ impl serde::Serialize for FullTableReference {
     {
         use serde::ser::SerializeStruct;
         let mut len = 0;
-        if !self.catalog.is_empty() {
+        if self.schema.is_some() {
             len += 1;
         }
-        if !self.schema.is_empty() {
+        if self.target_batch_size != 0 {
             len += 1;
         }
-        if !self.table.is_empty() {
+        if self.args.is_some() {
             len += 1;
         }
-        let mut struct_ser = serializer.serialize_struct("datafusion.FullTableReference", len)?;
-        if !self.catalog.is_empty() {
-            struct_ser.serialize_field("catalog", &self.catalog)?;
+        let mut struct_ser = serializer.serialize_struct("datafusion.GenerateSeriesNode", len)?;
+        if let Some(v) = self.schema.as_ref() {
+            struct_ser.serialize_field("schema", v)?;
         }
-        if !self.schema.is_empty() {
-            struct_ser.serialize_field("schema", &self.schema)?;
+        if self.target_batch_size != 0 {
+            struct_ser.serialize_field("targetBatchSize", &self.target_batch_size)?;
         }
-        if !self.table.is_empty() {
-            struct_ser.serialize_field("table", &self.table)?;
+        if let Some(v) = self.args.as_ref() {
+            match v {
+                generate_series_node::Args::ContainsNull(v) => {
+                    struct_ser.serialize_field("containsNull", v)?;
+                }
+                generate_series_node::Args::Int64Args(v) => {
+                    struct_ser.serialize_field("int64Args", v)?;
+                }
+                generate_series_node::Args::TimestampArgs(v) => {
+                    struct_ser.serialize_field("timestampArgs", v)?;
+                }
+                generate_series_node::Args::DateArgs(v) => {
+                    struct_ser.serialize_field("dateArgs", v)?;
+                }
+            }
         }
         struct_ser.end()
     }
 }
-impl<'de> serde::Deserialize<'de> for FullTableReference {
+impl<'de> serde::Deserialize<'de> for GenerateSeriesNode {
     #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
         const FIELDS: &[&str] = &[
-            "catalog",
             "schema",
-            "table",
+            "target_batch_size",
+            "targetBatchSize",
+            "contains_null",
+            "containsNull",
+            "int64_args",
+            "int64Args",
+            "timestamp_args",
+            "timestampArgs",
+            "date_args",
+            "dateArgs",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
-            Catalog,
             Schema,
-            Table,
+            TargetBatchSize,
+            ContainsNull,
+            Int64Args,
+            TimestampArgs,
+            DateArgs,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -6536,7 +8284,7 @@ impl<'de> serde::Deserialize<'de> for FullTableReference {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6549,9 +8297,12 @@ impl<'de> serde::Deserialize<'de> for FullTableReference {
                         E: serde::de::Error,
                     {
                         match value {
-                            "catalog" => Ok(GeneratedField::Catalog),
                             "schema" => Ok(GeneratedField::Schema),
-                            "table" => Ok(GeneratedField::Table),
+                            "targetBatchSize" | "target_batch_size" => Ok(GeneratedField::TargetBatchSize),
+                            "containsNull" | "contains_null" => Ok(GeneratedField::ContainsNull),
+                            "int64Args" | "int64_args" => Ok(GeneratedField::Int64Args),
+                            "timestampArgs" | "timestamp_args" => Ok(GeneratedField::TimestampArgs),
+                            "dateArgs" | "date_args" => Ok(GeneratedField::DateArgs),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -6561,49 +8312,73 @@ impl<'de> serde::Deserialize<'de> for FullTableReference {
         }
         struct GeneratedVisitor;
         impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = FullTableReference;
+            type Value = GenerateSeriesNode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion.FullTableReference")
+                formatter.write_str("struct datafusion.GenerateSeriesNode")
             }
 
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<FullTableReference, V::Error>
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<GenerateSeriesNode, V::Error>
                 where
                     V: serde::de::MapAccess<'de>,
             {
-                let mut catalog__ = None;
                 let mut schema__ = None;
-                let mut table__ = None;
+                let mut target_batch_size__ = None;
+                let mut args__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
-                        GeneratedField::Catalog => {
-                            if catalog__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("catalog"));
-                            }
-                            catalog__ = Some(map_.next_value()?);
-                        }
                         GeneratedField::Schema => {
                             if schema__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("schema"));
                             }
-                            schema__ = Some(map_.next_value()?);
+                            schema__ = map_.next_value()?;
                         }
-                        GeneratedField::Table => {
-                            if table__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("table"));
+                        GeneratedField::TargetBatchSize => {
+                            if target_batch_size__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("targetBatchSize"));
                             }
-                            table__ = Some(map_.next_value()?);
+                            target_batch_size__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::ContainsNull => {
+                            if args__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("containsNull"));
+                            }
+                            args__ = map_.next_value::<::std::option::Option<_>>()?.map(generate_series_node::Args::ContainsNull)
+;
+                        }
+                        GeneratedField::Int64Args => {
+                            if args__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("int64Args"));
+                            }
+                            args__ = map_.next_value::<::std::option::Option<_>>()?.map(generate_series_node::Args::Int64Args)
+;
+                        }
+                        GeneratedField::TimestampArgs => {
+                            if args__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("timestampArgs"));
+                            }
+                            args__ = map_.next_value::<::std::option::Option<_>>()?.map(generate_series_node::Args::TimestampArgs)
+;
+                        }
+                        GeneratedField::DateArgs => {
+                            if args__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("dateArgs"));
+                            }
+                            args__ = map_.next_value::<::std::option::Option<_>>()?.map(generate_series_node::Args::DateArgs)
+;
                         }
                     }
                 }
-                Ok(FullTableReference {
-                    catalog: catalog__.unwrap_or_default(),
-                    schema: schema__.unwrap_or_default(),
-                    table: table__.unwrap_or_default(),
+                Ok(GenerateSeriesNode {
+                    schema: schema__,
+                    target_batch_size: target_batch_size__.unwrap_or_default(),
+                    args: args__,
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion.FullTableReference", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion.GenerateSeriesNode", FIELDS, GeneratedVisitor)
     }
 }
 impl serde::Serialize for GlobalLimitExecNode {
@@ -6663,7 +8438,7 @@ impl<'de> serde::Deserialize<'de> for GlobalLimitExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6776,7 +8551,7 @@ impl<'de> serde::Deserialize<'de> for GroupingSetNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6851,7 +8626,7 @@ impl serde::Serialize for HashJoinExecNode {
         if self.partition_mode != 0 {
             len += 1;
         }
-        if self.null_equals_null {
+        if self.null_equality != 0 {
             len += 1;
         }
         if self.filter.is_some() {
@@ -6860,6 +8635,9 @@ impl serde::Serialize for HashJoinExecNode {
         if !self.projection.is_empty() {
             len += 1;
         }
+        if self.null_aware {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.HashJoinExecNode", len)?;
         if let Some(v) = self.left.as_ref() {
             struct_ser.serialize_field("left", v)?;
@@ -6880,8 +8658,10 @@ impl serde::Serialize for HashJoinExecNode {
                 .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.partition_mode)))?;
             struct_ser.serialize_field("partitionMode", &v)?;
         }
-        if self.null_equals_null {
-            struct_ser.serialize_field("nullEqualsNull", &self.null_equals_null)?;
+        if self.null_equality != 0 {
+            let v = super::datafusion_common::NullEquality::try_from(self.null_equality)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.null_equality)))?;
+            struct_ser.serialize_field("nullEquality", &v)?;
         }
         if let Some(v) = self.filter.as_ref() {
             struct_ser.serialize_field("filter", v)?;
@@ -6889,6 +8669,9 @@ impl serde::Serialize for HashJoinExecNode {
         if !self.projection.is_empty() {
             struct_ser.serialize_field("projection", &self.projection)?;
         }
+        if self.null_aware {
+            struct_ser.serialize_field("nullAware", &self.null_aware)?;
+        }
         struct_ser.end()
     }
 }
@@ -6906,10 +8689,12 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
             "joinType",
             "partition_mode",
             "partitionMode",
-            "null_equals_null",
-            "nullEqualsNull",
+            "null_equality",
+            "nullEquality",
             "filter",
             "projection",
+            "null_aware",
+            "nullAware",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -6919,9 +8704,10 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
             On,
             JoinType,
             PartitionMode,
-            NullEqualsNull,
+            NullEquality,
             Filter,
             Projection,
+            NullAware,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -6930,7 +8716,7 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -6948,9 +8734,10 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
                             "on" => Ok(GeneratedField::On),
                             "joinType" | "join_type" => Ok(GeneratedField::JoinType),
                             "partitionMode" | "partition_mode" => Ok(GeneratedField::PartitionMode),
-                            "nullEqualsNull" | "null_equals_null" => Ok(GeneratedField::NullEqualsNull),
+                            "nullEquality" | "null_equality" => Ok(GeneratedField::NullEquality),
                             "filter" => Ok(GeneratedField::Filter),
                             "projection" => Ok(GeneratedField::Projection),
+                            "nullAware" | "null_aware" => Ok(GeneratedField::NullAware),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -6975,9 +8762,10 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
                 let mut on__ = None;
                 let mut join_type__ = None;
                 let mut partition_mode__ = None;
-                let mut null_equals_null__ = None;
+                let mut null_equality__ = None;
                 let mut filter__ = None;
                 let mut projection__ = None;
+                let mut null_aware__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Left => {
@@ -7010,11 +8798,11 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
                             }
                             partition_mode__ = Some(map_.next_value::<PartitionMode>()? as i32);
                         }
-                        GeneratedField::NullEqualsNull => {
-                            if null_equals_null__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("nullEqualsNull"));
+                        GeneratedField::NullEquality => {
+                            if null_equality__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullEquality"));
                             }
-                            null_equals_null__ = Some(map_.next_value()?);
+                            null_equality__ = Some(map_.next_value::<super::datafusion_common::NullEquality>()? as i32);
                         }
                         GeneratedField::Filter => {
                             if filter__.is_some() {
@@ -7031,6 +8819,12 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
                                     .into_iter().map(|x| x.0).collect())
                             ;
                         }
+                        GeneratedField::NullAware => {
+                            if null_aware__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullAware"));
+                            }
+                            null_aware__ = Some(map_.next_value()?);
+                        }
                     }
                 }
                 Ok(HashJoinExecNode {
@@ -7039,9 +8833,10 @@ impl<'de> serde::Deserialize<'de> for HashJoinExecNode {
                     on: on__.unwrap_or_default(),
                     join_type: join_type__.unwrap_or_default(),
                     partition_mode: partition_mode__.unwrap_or_default(),
-                    null_equals_null: null_equals_null__.unwrap_or_default(),
+                    null_equality: null_equality__.unwrap_or_default(),
                     filter: filter__,
                     projection: projection__.unwrap_or_default(),
+                    null_aware: null_aware__.unwrap_or_default(),
                 })
             }
         }
@@ -7099,7 +8894,7 @@ impl<'de> serde::Deserialize<'de> for HashRepartition {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7226,7 +9021,7 @@ impl<'de> serde::Deserialize<'de> for ILikeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7360,7 +9155,7 @@ impl<'de> serde::Deserialize<'de> for InListNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7458,7 +9253,7 @@ impl<'de> serde::Deserialize<'de> for InsertOp {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = InsertOp;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7543,7 +9338,7 @@ impl<'de> serde::Deserialize<'de> for InterleaveExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7634,7 +9429,7 @@ impl<'de> serde::Deserialize<'de> for IsFalse {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7725,7 +9520,7 @@ impl<'de> serde::Deserialize<'de> for IsNotFalse {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7816,7 +9611,7 @@ impl<'de> serde::Deserialize<'de> for IsNotNull {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7907,7 +9702,7 @@ impl<'de> serde::Deserialize<'de> for IsNotTrue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -7998,7 +9793,7 @@ impl<'de> serde::Deserialize<'de> for IsNotUnknown {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8089,7 +9884,7 @@ impl<'de> serde::Deserialize<'de> for IsNull {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8180,7 +9975,7 @@ impl<'de> serde::Deserialize<'de> for IsTrue {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8271,7 +10066,7 @@ impl<'de> serde::Deserialize<'de> for IsUnknown {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8379,7 +10174,7 @@ impl<'de> serde::Deserialize<'de> for JoinFilter {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8475,7 +10270,7 @@ impl serde::Serialize for JoinNode {
         if !self.right_join_key.is_empty() {
             len += 1;
         }
-        if self.null_equals_null {
+        if self.null_equality != 0 {
             len += 1;
         }
         if self.filter.is_some() {
@@ -8504,8 +10299,10 @@ impl serde::Serialize for JoinNode {
         if !self.right_join_key.is_empty() {
             struct_ser.serialize_field("rightJoinKey", &self.right_join_key)?;
         }
-        if self.null_equals_null {
-            struct_ser.serialize_field("nullEqualsNull", &self.null_equals_null)?;
+        if self.null_equality != 0 {
+            let v = super::datafusion_common::NullEquality::try_from(self.null_equality)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.null_equality)))?;
+            struct_ser.serialize_field("nullEquality", &v)?;
         }
         if let Some(v) = self.filter.as_ref() {
             struct_ser.serialize_field("filter", v)?;
@@ -8530,8 +10327,8 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
             "leftJoinKey",
             "right_join_key",
             "rightJoinKey",
-            "null_equals_null",
-            "nullEqualsNull",
+            "null_equality",
+            "nullEquality",
             "filter",
         ];
 
@@ -8543,7 +10340,7 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
             JoinConstraint,
             LeftJoinKey,
             RightJoinKey,
-            NullEqualsNull,
+            NullEquality,
             Filter,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
@@ -8553,7 +10350,7 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8572,7 +10369,7 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
                             "joinConstraint" | "join_constraint" => Ok(GeneratedField::JoinConstraint),
                             "leftJoinKey" | "left_join_key" => Ok(GeneratedField::LeftJoinKey),
                             "rightJoinKey" | "right_join_key" => Ok(GeneratedField::RightJoinKey),
-                            "nullEqualsNull" | "null_equals_null" => Ok(GeneratedField::NullEqualsNull),
+                            "nullEquality" | "null_equality" => Ok(GeneratedField::NullEquality),
                             "filter" => Ok(GeneratedField::Filter),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
@@ -8599,7 +10396,7 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
                 let mut join_constraint__ = None;
                 let mut left_join_key__ = None;
                 let mut right_join_key__ = None;
-                let mut null_equals_null__ = None;
+                let mut null_equality__ = None;
                 let mut filter__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
@@ -8639,11 +10436,11 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
                             }
                             right_join_key__ = Some(map_.next_value()?);
                         }
-                        GeneratedField::NullEqualsNull => {
-                            if null_equals_null__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("nullEqualsNull"));
+                        GeneratedField::NullEquality => {
+                            if null_equality__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullEquality"));
                             }
-                            null_equals_null__ = Some(map_.next_value()?);
+                            null_equality__ = Some(map_.next_value::<super::datafusion_common::NullEquality>()? as i32);
                         }
                         GeneratedField::Filter => {
                             if filter__.is_some() {
@@ -8660,7 +10457,7 @@ impl<'de> serde::Deserialize<'de> for JoinNode {
                     join_constraint: join_constraint__.unwrap_or_default(),
                     left_join_key: left_join_key__.unwrap_or_default(),
                     right_join_key: right_join_key__.unwrap_or_default(),
-                    null_equals_null: null_equals_null__.unwrap_or_default(),
+                    null_equality: null_equality__.unwrap_or_default(),
                     filter: filter__,
                 })
             }
@@ -8715,7 +10512,7 @@ impl<'de> serde::Deserialize<'de> for JoinOn {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8816,7 +10613,7 @@ impl<'de> serde::Deserialize<'de> for JsonScanExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -8916,7 +10713,7 @@ impl<'de> serde::Deserialize<'de> for JsonSink {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9042,7 +10839,7 @@ impl<'de> serde::Deserialize<'de> for JsonSinkExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9185,7 +10982,7 @@ impl<'de> serde::Deserialize<'de> for LikeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9323,7 +11120,7 @@ impl<'de> serde::Deserialize<'de> for LimitNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9436,7 +11233,7 @@ impl<'de> serde::Deserialize<'de> for ListIndex {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9543,7 +11340,7 @@ impl<'de> serde::Deserialize<'de> for ListRange {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9661,7 +11458,7 @@ impl<'de> serde::Deserialize<'de> for ListUnnest {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9812,6 +11609,9 @@ impl serde::Serialize for ListingTableScanNode {
                 listing_table_scan_node::FileFormatType::Json(v) => {
                     struct_ser.serialize_field("json", v)?;
                 }
+                listing_table_scan_node::FileFormatType::Arrow(v) => {
+                    struct_ser.serialize_field("arrow", v)?;
+                }
             }
         }
         struct_ser.end()
@@ -9844,6 +11644,7 @@ impl<'de> serde::Deserialize<'de> for ListingTableScanNode {
             "parquet",
             "avro",
             "json",
+            "arrow",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -9862,6 +11663,7 @@ impl<'de> serde::Deserialize<'de> for ListingTableScanNode {
             Parquet,
             Avro,
             Json,
+            Arrow,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -9870,7 +11672,7 @@ impl<'de> serde::Deserialize<'de> for ListingTableScanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -9897,6 +11699,7 @@ impl<'de> serde::Deserialize<'de> for ListingTableScanNode {
                             "parquet" => Ok(GeneratedField::Parquet),
                             "avro" => Ok(GeneratedField::Avro),
                             "json" => Ok(GeneratedField::Json),
+                            "arrow" => Ok(GeneratedField::Arrow),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -10017,6 +11820,13 @@ impl<'de> serde::Deserialize<'de> for ListingTableScanNode {
                                 return Err(serde::de::Error::duplicate_field("json"));
                             }
                             file_format_type__ = map_.next_value::<::std::option::Option<_>>()?.map(listing_table_scan_node::FileFormatType::Json)
+;
+                        }
+                        GeneratedField::Arrow => {
+                            if file_format_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("arrow"));
+                            }
+                            file_format_type__ = map_.next_value::<::std::option::Option<_>>()?.map(listing_table_scan_node::FileFormatType::Arrow)
 ;
                         }
                     }
@@ -10086,7 +11896,7 @@ impl<'de> serde::Deserialize<'de> for LocalLimitExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -10188,7 +11998,7 @@ impl<'de> serde::Deserialize<'de> for LogicalExprList {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -10451,7 +12261,7 @@ impl<'de> serde::Deserialize<'de> for LogicalExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -10784,7 +12594,7 @@ impl<'de> serde::Deserialize<'de> for LogicalExprNodeCollection {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -10885,7 +12695,7 @@ impl<'de> serde::Deserialize<'de> for LogicalExtensionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11161,7 +12971,7 @@ impl<'de> serde::Deserialize<'de> for LogicalPlanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11501,7 +13311,7 @@ impl<'de> serde::Deserialize<'de> for MaybeFilter {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11593,7 +13403,7 @@ impl<'de> serde::Deserialize<'de> for MaybePhysicalSortExprs {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11609,40 +13419,226 @@ impl<'de> serde::Deserialize<'de> for MaybePhysicalSortExprs {
                             "sortExpr" | "sort_expr" => Ok(GeneratedField::SortExpr),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
-                    }
-                }
-                deserializer.deserialize_identifier(GeneratedVisitor)
-            }
-        }
-        struct GeneratedVisitor;
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
-            type Value = MaybePhysicalSortExprs;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                formatter.write_str("struct datafusion.MaybePhysicalSortExprs")
-            }
-
-            fn visit_map<V>(self, mut map_: V) -> std::result::Result<MaybePhysicalSortExprs, V::Error>
-                where
-                    V: serde::de::MapAccess<'de>,
-            {
-                let mut sort_expr__ = None;
-                while let Some(k) = map_.next_key()? {
-                    match k {
-                        GeneratedField::SortExpr => {
-                            if sort_expr__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("sortExpr"));
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = MaybePhysicalSortExprs;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.MaybePhysicalSortExprs")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<MaybePhysicalSortExprs, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut sort_expr__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::SortExpr => {
+                            if sort_expr__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("sortExpr"));
+                            }
+                            sort_expr__ = Some(map_.next_value()?);
+                        }
+                    }
+                }
+                Ok(MaybePhysicalSortExprs {
+                    sort_expr: sort_expr__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.MaybePhysicalSortExprs", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for MemoryScanExecNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.partitions.is_empty() {
+            len += 1;
+        }
+        if self.schema.is_some() {
+            len += 1;
+        }
+        if !self.projection.is_empty() {
+            len += 1;
+        }
+        if !self.sort_information.is_empty() {
+            len += 1;
+        }
+        if self.show_sizes {
+            len += 1;
+        }
+        if self.fetch.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.MemoryScanExecNode", len)?;
+        if !self.partitions.is_empty() {
+            struct_ser.serialize_field("partitions", &self.partitions.iter().map(pbjson::private::base64::encode).collect::<Vec<_>>())?;
+        }
+        if let Some(v) = self.schema.as_ref() {
+            struct_ser.serialize_field("schema", v)?;
+        }
+        if !self.projection.is_empty() {
+            struct_ser.serialize_field("projection", &self.projection)?;
+        }
+        if !self.sort_information.is_empty() {
+            struct_ser.serialize_field("sortInformation", &self.sort_information)?;
+        }
+        if self.show_sizes {
+            struct_ser.serialize_field("showSizes", &self.show_sizes)?;
+        }
+        if let Some(v) = self.fetch.as_ref() {
+            struct_ser.serialize_field("fetch", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for MemoryScanExecNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "partitions",
+            "schema",
+            "projection",
+            "sort_information",
+            "sortInformation",
+            "show_sizes",
+            "showSizes",
+            "fetch",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Partitions,
+            Schema,
+            Projection,
+            SortInformation,
+            ShowSizes,
+            Fetch,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "partitions" => Ok(GeneratedField::Partitions),
+                            "schema" => Ok(GeneratedField::Schema),
+                            "projection" => Ok(GeneratedField::Projection),
+                            "sortInformation" | "sort_information" => Ok(GeneratedField::SortInformation),
+                            "showSizes" | "show_sizes" => Ok(GeneratedField::ShowSizes),
+                            "fetch" => Ok(GeneratedField::Fetch),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = MemoryScanExecNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.MemoryScanExecNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<MemoryScanExecNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut partitions__ = None;
+                let mut schema__ = None;
+                let mut projection__ = None;
+                let mut sort_information__ = None;
+                let mut show_sizes__ = None;
+                let mut fetch__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Partitions => {
+                            if partitions__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("partitions"));
+                            }
+                            partitions__ = 
+                                Some(map_.next_value::<Vec<::pbjson::private::BytesDeserialize<_>>>()?
+                                    .into_iter().map(|x| x.0).collect())
+                            ;
+                        }
+                        GeneratedField::Schema => {
+                            if schema__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("schema"));
+                            }
+                            schema__ = map_.next_value()?;
+                        }
+                        GeneratedField::Projection => {
+                            if projection__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("projection"));
+                            }
+                            projection__ = 
+                                Some(map_.next_value::<Vec<::pbjson::private::NumberDeserialize<_>>>()?
+                                    .into_iter().map(|x| x.0).collect())
+                            ;
+                        }
+                        GeneratedField::SortInformation => {
+                            if sort_information__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("sortInformation"));
+                            }
+                            sort_information__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::ShowSizes => {
+                            if show_sizes__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("showSizes"));
+                            }
+                            show_sizes__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Fetch => {
+                            if fetch__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("fetch"));
                             }
-                            sort_expr__ = Some(map_.next_value()?);
+                            fetch__ = 
+                                map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
+                            ;
                         }
                     }
                 }
-                Ok(MaybePhysicalSortExprs {
-                    sort_expr: sort_expr__.unwrap_or_default(),
+                Ok(MemoryScanExecNode {
+                    partitions: partitions__.unwrap_or_default(),
+                    schema: schema__,
+                    projection: projection__.unwrap_or_default(),
+                    sort_information: sort_information__.unwrap_or_default(),
+                    show_sizes: show_sizes__.unwrap_or_default(),
+                    fetch: fetch__,
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion.MaybePhysicalSortExprs", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion.MemoryScanExecNode", FIELDS, GeneratedVisitor)
     }
 }
 impl serde::Serialize for NamedStructField {
@@ -11684,7 +13680,7 @@ impl<'de> serde::Deserialize<'de> for NamedStructField {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11775,7 +13771,7 @@ impl<'de> serde::Deserialize<'de> for NegativeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -11901,7 +13897,7 @@ impl<'de> serde::Deserialize<'de> for NestedLoopJoinExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12031,7 +14027,7 @@ impl<'de> serde::Deserialize<'de> for Not {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12083,6 +14079,77 @@ impl<'de> serde::Deserialize<'de> for Not {
         deserializer.deserialize_struct("datafusion.Not", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for NullTreatment {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let variant = match self {
+            Self::RespectNulls => "RESPECT_NULLS",
+            Self::IgnoreNulls => "IGNORE_NULLS",
+        };
+        serializer.serialize_str(variant)
+    }
+}
+impl<'de> serde::Deserialize<'de> for NullTreatment {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "RESPECT_NULLS",
+            "IGNORE_NULLS",
+        ];
+
+        struct GeneratedVisitor;
+
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
+            type Value = NullTreatment;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(formatter, "expected one of: {:?}", &FIELDS)
+            }
+
+            fn visit_i64<E>(self, v: i64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self)
+                    })
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                i32::try_from(v)
+                    .ok()
+                    .and_then(|x| x.try_into().ok())
+                    .ok_or_else(|| {
+                        serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self)
+                    })
+            }
+
+            fn visit_str<E>(self, value: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                match value {
+                    "RESPECT_NULLS" => Ok(NullTreatment::RespectNulls),
+                    "IGNORE_NULLS" => Ok(NullTreatment::IgnoreNulls),
+                    _ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
+                }
+            }
+        }
+        deserializer.deserialize_any(GeneratedVisitor)
+    }
+}
 impl serde::Serialize for OptimizedLogicalPlanType {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -12123,7 +14190,7 @@ impl<'de> serde::Deserialize<'de> for OptimizedLogicalPlanType {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12215,7 +14282,7 @@ impl<'de> serde::Deserialize<'de> for OptimizedPhysicalPlanType {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12324,7 +14391,7 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12442,7 +14509,7 @@ impl<'de> serde::Deserialize<'de> for ParquetSink {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12568,7 +14635,7 @@ impl<'de> serde::Deserialize<'de> for ParquetSinkExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12694,7 +14761,7 @@ impl<'de> serde::Deserialize<'de> for PartialTableReference {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12794,7 +14861,7 @@ impl<'de> serde::Deserialize<'de> for PartiallySortedInputOrderMode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12897,7 +14964,7 @@ impl<'de> serde::Deserialize<'de> for PartitionColumn {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -12986,7 +15053,7 @@ impl<'de> serde::Deserialize<'de> for PartitionMode {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = PartitionMode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13105,7 +15172,7 @@ impl<'de> serde::Deserialize<'de> for PartitionStats {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13275,7 +15342,7 @@ impl<'de> serde::Deserialize<'de> for PartitionedFile {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13434,7 +15501,7 @@ impl<'de> serde::Deserialize<'de> for Partitioning {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13524,6 +15591,9 @@ impl serde::Serialize for PhysicalAggregateExprNode {
         if self.fun_definition.is_some() {
             len += 1;
         }
+        if !self.human_display.is_empty() {
+            len += 1;
+        }
         if self.aggregate_function.is_some() {
             len += 1;
         }
@@ -13545,6 +15615,9 @@ impl serde::Serialize for PhysicalAggregateExprNode {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?;
         }
+        if !self.human_display.is_empty() {
+            struct_ser.serialize_field("humanDisplay", &self.human_display)?;
+        }
         if let Some(v) = self.aggregate_function.as_ref() {
             match v {
                 physical_aggregate_expr_node::AggregateFunction::UserDefinedAggrFunction(v) => {
@@ -13570,6 +15643,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
             "ignoreNulls",
             "fun_definition",
             "funDefinition",
+            "human_display",
+            "humanDisplay",
             "user_defined_aggr_function",
             "userDefinedAggrFunction",
         ];
@@ -13581,6 +15656,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
             Distinct,
             IgnoreNulls,
             FunDefinition,
+            HumanDisplay,
             UserDefinedAggrFunction,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
@@ -13590,7 +15666,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13608,6 +15684,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
                             "distinct" => Ok(GeneratedField::Distinct),
                             "ignoreNulls" | "ignore_nulls" => Ok(GeneratedField::IgnoreNulls),
                             "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition),
+                            "humanDisplay" | "human_display" => Ok(GeneratedField::HumanDisplay),
                             "userDefinedAggrFunction" | "user_defined_aggr_function" => Ok(GeneratedField::UserDefinedAggrFunction),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
@@ -13633,6 +15710,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
                 let mut distinct__ = None;
                 let mut ignore_nulls__ = None;
                 let mut fun_definition__ = None;
+                let mut human_display__ = None;
                 let mut aggregate_function__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
@@ -13668,6 +15746,12 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
                                 map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::HumanDisplay => {
+                            if human_display__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("humanDisplay"));
+                            }
+                            human_display__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::UserDefinedAggrFunction => {
                             if aggregate_function__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("userDefinedAggrFunction"));
@@ -13682,6 +15766,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAggregateExprNode {
                     distinct: distinct__.unwrap_or_default(),
                     ignore_nulls: ignore_nulls__.unwrap_or_default(),
                     fun_definition: fun_definition__,
+                    human_display: human_display__.unwrap_or_default(),
                     aggregate_function: aggregate_function__,
                 })
             }
@@ -13736,7 +15821,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalAliasNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13852,7 +15937,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalBinaryExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -13979,7 +16064,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalCaseNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14097,7 +16182,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalCastNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14205,7 +16290,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalColumn {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14323,7 +16408,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalDateTimeIntervalExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14401,10 +16486,18 @@ impl serde::Serialize for PhysicalExprNode {
     {
         use serde::ser::SerializeStruct;
         let mut len = 0;
+        if self.expr_id.is_some() {
+            len += 1;
+        }
         if self.expr_type.is_some() {
             len += 1;
         }
         let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalExprNode", len)?;
+        if let Some(v) = self.expr_id.as_ref() {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("exprId", ToString::to_string(&v).as_str())?;
+        }
         if let Some(v) = self.expr_type.as_ref() {
             match v {
                 physical_expr_node::ExprType::Column(v) => {
@@ -14461,6 +16554,9 @@ impl serde::Serialize for PhysicalExprNode {
                 physical_expr_node::ExprType::UnknownColumn(v) => {
                     struct_ser.serialize_field("unknownColumn", v)?;
                 }
+                physical_expr_node::ExprType::HashExpr(v) => {
+                    struct_ser.serialize_field("hashExpr", v)?;
+                }
             }
         }
         struct_ser.end()
@@ -14473,6 +16569,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
         D: serde::Deserializer<'de>,
     {
         const FIELDS: &[&str] = &[
+            "expr_id",
+            "exprId",
             "column",
             "literal",
             "binary_expr",
@@ -14503,10 +16601,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
             "extension",
             "unknown_column",
             "unknownColumn",
+            "hash_expr",
+            "hashExpr",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
+            ExprId,
             Column,
             Literal,
             BinaryExpr,
@@ -14525,6 +16626,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
             LikeExpr,
             Extension,
             UnknownColumn,
+            HashExpr,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -14533,7 +16635,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14546,6 +16648,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
                         E: serde::de::Error,
                     {
                         match value {
+                            "exprId" | "expr_id" => Ok(GeneratedField::ExprId),
                             "column" => Ok(GeneratedField::Column),
                             "literal" => Ok(GeneratedField::Literal),
                             "binaryExpr" | "binary_expr" => Ok(GeneratedField::BinaryExpr),
@@ -14564,6 +16667,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
                             "likeExpr" | "like_expr" => Ok(GeneratedField::LikeExpr),
                             "extension" => Ok(GeneratedField::Extension),
                             "unknownColumn" | "unknown_column" => Ok(GeneratedField::UnknownColumn),
+                            "hashExpr" | "hash_expr" => Ok(GeneratedField::HashExpr),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -14583,9 +16687,18 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
                 where
                     V: serde::de::MapAccess<'de>,
             {
+                let mut expr_id__ = None;
                 let mut expr_type__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
+                        GeneratedField::ExprId => {
+                            if expr_id__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("exprId"));
+                            }
+                            expr_id__ = 
+                                map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0)
+                            ;
+                        }
                         GeneratedField::Column => {
                             if expr_type__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("column"));
@@ -14710,11 +16823,19 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode {
                                 return Err(serde::de::Error::duplicate_field("unknownColumn"));
                             }
                             expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::UnknownColumn)
+;
+                        }
+                        GeneratedField::HashExpr => {
+                            if expr_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("hashExpr"));
+                            }
+                            expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::HashExpr)
 ;
                         }
                     }
                 }
                 Ok(PhysicalExprNode {
+                    expr_id: expr_id__,
                     expr_type: expr_type__,
                 })
             }
@@ -14771,7 +16892,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExtensionExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14883,7 +17004,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExtensionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -14946,6 +17067,136 @@ impl<'de> serde::Deserialize<'de> for PhysicalExtensionNode {
         deserializer.deserialize_struct("datafusion.PhysicalExtensionNode", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for PhysicalHashExprNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.on_columns.is_empty() {
+            len += 1;
+        }
+        if self.seed0 != 0 {
+            len += 1;
+        }
+        if !self.description.is_empty() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalHashExprNode", len)?;
+        if !self.on_columns.is_empty() {
+            struct_ser.serialize_field("onColumns", &self.on_columns)?;
+        }
+        if self.seed0 != 0 {
+            #[allow(clippy::needless_borrow)]
+            #[allow(clippy::needless_borrows_for_generic_args)]
+            struct_ser.serialize_field("seed0", ToString::to_string(&self.seed0).as_str())?;
+        }
+        if !self.description.is_empty() {
+            struct_ser.serialize_field("description", &self.description)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for PhysicalHashExprNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "on_columns",
+            "onColumns",
+            "seed0",
+            "description",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            OnColumns,
+            Seed0,
+            Description,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "onColumns" | "on_columns" => Ok(GeneratedField::OnColumns),
+                            "seed0" => Ok(GeneratedField::Seed0),
+                            "description" => Ok(GeneratedField::Description),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = PhysicalHashExprNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.PhysicalHashExprNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<PhysicalHashExprNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut on_columns__ = None;
+                let mut seed0__ = None;
+                let mut description__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::OnColumns => {
+                            if on_columns__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("onColumns"));
+                            }
+                            on_columns__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Seed0 => {
+                            if seed0__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("seed0"));
+                            }
+                            seed0__ = 
+                                Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
+                            ;
+                        }
+                        GeneratedField::Description => {
+                            if description__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("description"));
+                            }
+                            description__ = Some(map_.next_value()?);
+                        }
+                    }
+                }
+                Ok(PhysicalHashExprNode {
+                    on_columns: on_columns__.unwrap_or_default(),
+                    seed0: seed0__.unwrap_or_default(),
+                    description: description__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.PhysicalHashExprNode", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for PhysicalHashRepartition {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -14997,7 +17248,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalHashRepartition {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15115,7 +17366,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalInListNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15224,7 +17475,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalIsNotNull {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15315,7 +17566,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalIsNull {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15431,7 +17682,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalLikeExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15549,7 +17800,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalNegativeNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15640,7 +17891,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalNot {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15796,6 +18047,27 @@ impl serde::Serialize for PhysicalPlanNode {
                 physical_plan_node::PhysicalPlanType::JsonScan(v) => {
                     struct_ser.serialize_field("jsonScan", v)?;
                 }
+                physical_plan_node::PhysicalPlanType::Cooperative(v) => {
+                    struct_ser.serialize_field("cooperative", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::GenerateSeries(v) => {
+                    struct_ser.serialize_field("generateSeries", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::SortMergeJoin(v) => {
+                    struct_ser.serialize_field("sortMergeJoin", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::MemoryScan(v) => {
+                    struct_ser.serialize_field("memoryScan", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::AsyncFunc(v) => {
+                    struct_ser.serialize_field("asyncFunc", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::Buffer(v) => {
+                    struct_ser.serialize_field("buffer", v)?;
+                }
+                physical_plan_node::PhysicalPlanType::ArrowScan(v) => {
+                    struct_ser.serialize_field("arrowScan", v)?;
+                }
             }
         }
         struct_ser.end()
@@ -15854,6 +18126,18 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode {
             "unnest",
             "json_scan",
             "jsonScan",
+            "cooperative",
+            "generate_series",
+            "generateSeries",
+            "sort_merge_join",
+            "sortMergeJoin",
+            "memory_scan",
+            "memoryScan",
+            "async_func",
+            "asyncFunc",
+            "buffer",
+            "arrow_scan",
+            "arrowScan",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -15888,6 +18172,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode {
             ParquetSink,
             Unnest,
             JsonScan,
+            Cooperative,
+            GenerateSeries,
+            SortMergeJoin,
+            MemoryScan,
+            AsyncFunc,
+            Buffer,
+            ArrowScan,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -15896,7 +18187,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -15939,6 +18230,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode {
                             "parquetSink" | "parquet_sink" => Ok(GeneratedField::ParquetSink),
                             "unnest" => Ok(GeneratedField::Unnest),
                             "jsonScan" | "json_scan" => Ok(GeneratedField::JsonScan),
+                            "cooperative" => Ok(GeneratedField::Cooperative),
+                            "generateSeries" | "generate_series" => Ok(GeneratedField::GenerateSeries),
+                            "sortMergeJoin" | "sort_merge_join" => Ok(GeneratedField::SortMergeJoin),
+                            "memoryScan" | "memory_scan" => Ok(GeneratedField::MemoryScan),
+                            "asyncFunc" | "async_func" => Ok(GeneratedField::AsyncFunc),
+                            "buffer" => Ok(GeneratedField::Buffer),
+                            "arrowScan" | "arrow_scan" => Ok(GeneratedField::ArrowScan),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -16138,37 +18436,86 @@ impl<'de> serde::Deserialize<'de> for PhysicalPlanNode {
                         }
                         GeneratedField::PlaceholderRow => {
                             if physical_plan_type__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("placeholderRow"));
+                                return Err(serde::de::Error::duplicate_field("placeholderRow"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::PlaceholderRow)
+;
+                        }
+                        GeneratedField::CsvSink => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("csvSink"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::CsvSink)
+;
+                        }
+                        GeneratedField::ParquetSink => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("parquetSink"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::ParquetSink)
+;
+                        }
+                        GeneratedField::Unnest => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("unnest"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::Unnest)
+;
+                        }
+                        GeneratedField::JsonScan => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("jsonScan"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::JsonScan)
+;
+                        }
+                        GeneratedField::Cooperative => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("cooperative"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::Cooperative)
+;
+                        }
+                        GeneratedField::GenerateSeries => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("generateSeries"));
+                            }
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::GenerateSeries)
+;
+                        }
+                        GeneratedField::SortMergeJoin => {
+                            if physical_plan_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("sortMergeJoin"));
                             }
-                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::PlaceholderRow)
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::SortMergeJoin)
 ;
                         }
-                        GeneratedField::CsvSink => {
+                        GeneratedField::MemoryScan => {
                             if physical_plan_type__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("csvSink"));
+                                return Err(serde::de::Error::duplicate_field("memoryScan"));
                             }
-                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::CsvSink)
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::MemoryScan)
 ;
                         }
-                        GeneratedField::ParquetSink => {
+                        GeneratedField::AsyncFunc => {
                             if physical_plan_type__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("parquetSink"));
+                                return Err(serde::de::Error::duplicate_field("asyncFunc"));
                             }
-                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::ParquetSink)
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::AsyncFunc)
 ;
                         }
-                        GeneratedField::Unnest => {
+                        GeneratedField::Buffer => {
                             if physical_plan_type__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("unnest"));
+                                return Err(serde::de::Error::duplicate_field("buffer"));
                             }
-                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::Unnest)
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::Buffer)
 ;
                         }
-                        GeneratedField::JsonScan => {
+                        GeneratedField::ArrowScan => {
                             if physical_plan_type__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("jsonScan"));
+                                return Err(serde::de::Error::duplicate_field("arrowScan"));
                             }
-                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::JsonScan)
+                            physical_plan_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_plan_node::PhysicalPlanType::ArrowScan)
 ;
                         }
                     }
@@ -16204,6 +18551,9 @@ impl serde::Serialize for PhysicalScalarUdfNode {
         if self.nullable {
             len += 1;
         }
+        if !self.return_field_name.is_empty() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalScalarUdfNode", len)?;
         if !self.name.is_empty() {
             struct_ser.serialize_field("name", &self.name)?;
@@ -16222,6 +18572,9 @@ impl serde::Serialize for PhysicalScalarUdfNode {
         if self.nullable {
             struct_ser.serialize_field("nullable", &self.nullable)?;
         }
+        if !self.return_field_name.is_empty() {
+            struct_ser.serialize_field("returnFieldName", &self.return_field_name)?;
+        }
         struct_ser.end()
     }
 }
@@ -16239,6 +18592,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
             "return_type",
             "returnType",
             "nullable",
+            "return_field_name",
+            "returnFieldName",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -16248,6 +18603,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
             FunDefinition,
             ReturnType,
             Nullable,
+            ReturnFieldName,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -16256,7 +18612,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16274,6 +18630,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
                             "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition),
                             "returnType" | "return_type" => Ok(GeneratedField::ReturnType),
                             "nullable" => Ok(GeneratedField::Nullable),
+                            "returnFieldName" | "return_field_name" => Ok(GeneratedField::ReturnFieldName),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -16298,6 +18655,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
                 let mut fun_definition__ = None;
                 let mut return_type__ = None;
                 let mut nullable__ = None;
+                let mut return_field_name__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Name => {
@@ -16332,6 +18690,12 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
                             }
                             nullable__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::ReturnFieldName => {
+                            if return_field_name__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("returnFieldName"));
+                            }
+                            return_field_name__ = Some(map_.next_value()?);
+                        }
                     }
                 }
                 Ok(PhysicalScalarUdfNode {
@@ -16340,6 +18704,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalScalarUdfNode {
                     fun_definition: fun_definition__,
                     return_type: return_type__,
                     nullable: nullable__.unwrap_or_default(),
+                    return_field_name: return_field_name__.unwrap_or_default(),
                 })
             }
         }
@@ -16402,7 +18767,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalSortExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16512,7 +18877,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalSortExprNodeCollection {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16612,7 +18977,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalTryCastNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16722,7 +19087,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalWhenThen {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16809,6 +19174,12 @@ impl serde::Serialize for PhysicalWindowExprNode {
         if self.fun_definition.is_some() {
             len += 1;
         }
+        if self.ignore_nulls {
+            len += 1;
+        }
+        if self.distinct {
+            len += 1;
+        }
         if self.window_function.is_some() {
             len += 1;
         }
@@ -16833,6 +19204,12 @@ impl serde::Serialize for PhysicalWindowExprNode {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?;
         }
+        if self.ignore_nulls {
+            struct_ser.serialize_field("ignoreNulls", &self.ignore_nulls)?;
+        }
+        if self.distinct {
+            struct_ser.serialize_field("distinct", &self.distinct)?;
+        }
         if let Some(v) = self.window_function.as_ref() {
             match v {
                 physical_window_expr_node::WindowFunction::UserDefinedAggrFunction(v) => {
@@ -16863,6 +19240,9 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
             "name",
             "fun_definition",
             "funDefinition",
+            "ignore_nulls",
+            "ignoreNulls",
+            "distinct",
             "user_defined_aggr_function",
             "userDefinedAggrFunction",
             "user_defined_window_function",
@@ -16877,6 +19257,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
             WindowFrame,
             Name,
             FunDefinition,
+            IgnoreNulls,
+            Distinct,
             UserDefinedAggrFunction,
             UserDefinedWindowFunction,
         }
@@ -16887,7 +19269,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -16906,6 +19288,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
                             "windowFrame" | "window_frame" => Ok(GeneratedField::WindowFrame),
                             "name" => Ok(GeneratedField::Name),
                             "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition),
+                            "ignoreNulls" | "ignore_nulls" => Ok(GeneratedField::IgnoreNulls),
+                            "distinct" => Ok(GeneratedField::Distinct),
                             "userDefinedAggrFunction" | "user_defined_aggr_function" => Ok(GeneratedField::UserDefinedAggrFunction),
                             "userDefinedWindowFunction" | "user_defined_window_function" => Ok(GeneratedField::UserDefinedWindowFunction),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
@@ -16933,6 +19317,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
                 let mut window_frame__ = None;
                 let mut name__ = None;
                 let mut fun_definition__ = None;
+                let mut ignore_nulls__ = None;
+                let mut distinct__ = None;
                 let mut window_function__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
@@ -16974,6 +19360,18 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
                                 map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::IgnoreNulls => {
+                            if ignore_nulls__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("ignoreNulls"));
+                            }
+                            ignore_nulls__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Distinct => {
+                            if distinct__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("distinct"));
+                            }
+                            distinct__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::UserDefinedAggrFunction => {
                             if window_function__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("userDefinedAggrFunction"));
@@ -16995,6 +19393,8 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode {
                     window_frame: window_frame__,
                     name: name__.unwrap_or_default(),
                     fun_definition: fun_definition__,
+                    ignore_nulls: ignore_nulls__.unwrap_or_default(),
+                    distinct: distinct__.unwrap_or_default(),
                     window_function: window_function__,
                 })
             }
@@ -17016,6 +19416,12 @@ impl serde::Serialize for PlaceholderNode {
         if self.data_type.is_some() {
             len += 1;
         }
+        if self.nullable.is_some() {
+            len += 1;
+        }
+        if !self.metadata.is_empty() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.PlaceholderNode", len)?;
         if !self.id.is_empty() {
             struct_ser.serialize_field("id", &self.id)?;
@@ -17023,6 +19429,12 @@ impl serde::Serialize for PlaceholderNode {
         if let Some(v) = self.data_type.as_ref() {
             struct_ser.serialize_field("dataType", v)?;
         }
+        if let Some(v) = self.nullable.as_ref() {
+            struct_ser.serialize_field("nullable", v)?;
+        }
+        if !self.metadata.is_empty() {
+            struct_ser.serialize_field("metadata", &self.metadata)?;
+        }
         struct_ser.end()
     }
 }
@@ -17036,12 +19448,16 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode {
             "id",
             "data_type",
             "dataType",
+            "nullable",
+            "metadata",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Id,
             DataType,
+            Nullable,
+            Metadata,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -17050,7 +19466,7 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17065,6 +19481,8 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode {
                         match value {
                             "id" => Ok(GeneratedField::Id),
                             "dataType" | "data_type" => Ok(GeneratedField::DataType),
+                            "nullable" => Ok(GeneratedField::Nullable),
+                            "metadata" => Ok(GeneratedField::Metadata),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -17086,6 +19504,8 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode {
             {
                 let mut id__ = None;
                 let mut data_type__ = None;
+                let mut nullable__ = None;
+                let mut metadata__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Id => {
@@ -17100,11 +19520,27 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode {
                             }
                             data_type__ = map_.next_value()?;
                         }
+                        GeneratedField::Nullable => {
+                            if nullable__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullable"));
+                            }
+                            nullable__ = map_.next_value()?;
+                        }
+                        GeneratedField::Metadata => {
+                            if metadata__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("metadata"));
+                            }
+                            metadata__ = Some(
+                                map_.next_value::<std::collections::HashMap<_, _>>()?
+                            );
+                        }
                     }
                 }
                 Ok(PlaceholderNode {
                     id: id__.unwrap_or_default(),
                     data_type: data_type__,
+                    nullable: nullable__,
+                    metadata: metadata__.unwrap_or_default(),
                 })
             }
         }
@@ -17150,7 +19586,7 @@ impl<'de> serde::Deserialize<'de> for PlaceholderRowExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17305,7 +19741,7 @@ impl<'de> serde::Deserialize<'de> for PlanType {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17471,6 +19907,9 @@ impl serde::Serialize for PrepareNode {
         if self.input.is_some() {
             len += 1;
         }
+        if !self.fields.is_empty() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.PrepareNode", len)?;
         if !self.name.is_empty() {
             struct_ser.serialize_field("name", &self.name)?;
@@ -17481,6 +19920,9 @@ impl serde::Serialize for PrepareNode {
         if let Some(v) = self.input.as_ref() {
             struct_ser.serialize_field("input", v)?;
         }
+        if !self.fields.is_empty() {
+            struct_ser.serialize_field("fields", &self.fields)?;
+        }
         struct_ser.end()
     }
 }
@@ -17495,6 +19937,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
             "data_types",
             "dataTypes",
             "input",
+            "fields",
         ];
 
         #[allow(clippy::enum_variant_names)]
@@ -17502,6 +19945,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
             Name,
             DataTypes,
             Input,
+            Fields,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -17510,7 +19954,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17526,6 +19970,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
                             "name" => Ok(GeneratedField::Name),
                             "dataTypes" | "data_types" => Ok(GeneratedField::DataTypes),
                             "input" => Ok(GeneratedField::Input),
+                            "fields" => Ok(GeneratedField::Fields),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -17548,6 +19993,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
                 let mut name__ = None;
                 let mut data_types__ = None;
                 let mut input__ = None;
+                let mut fields__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Name => {
@@ -17568,12 +20014,19 @@ impl<'de> serde::Deserialize<'de> for PrepareNode {
                             }
                             input__ = map_.next_value()?;
                         }
+                        GeneratedField::Fields => {
+                            if fields__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("fields"));
+                            }
+                            fields__ = Some(map_.next_value()?);
+                        }
                     }
                 }
                 Ok(PrepareNode {
                     name: name__.unwrap_or_default(),
                     data_types: data_types__.unwrap_or_default(),
                     input: input__,
+                    fields: fields__.unwrap_or_default(),
                 })
             }
         }
@@ -17619,7 +20072,7 @@ impl<'de> serde::Deserialize<'de> for ProjectionColumns {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17727,7 +20180,7 @@ impl<'de> serde::Deserialize<'de> for ProjectionExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17783,18 +20236,217 @@ impl<'de> serde::Deserialize<'de> for ProjectionExecNode {
                             if expr_name__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("exprName"));
                             }
-                            expr_name__ = Some(map_.next_value()?);
+                            expr_name__ = Some(map_.next_value()?);
+                        }
+                    }
+                }
+                Ok(ProjectionExecNode {
+                    input: input__,
+                    expr: expr__.unwrap_or_default(),
+                    expr_name: expr_name__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.ProjectionExecNode", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for ProjectionExpr {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.alias.is_empty() {
+            len += 1;
+        }
+        if self.expr.is_some() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.ProjectionExpr", len)?;
+        if !self.alias.is_empty() {
+            struct_ser.serialize_field("alias", &self.alias)?;
+        }
+        if let Some(v) = self.expr.as_ref() {
+            struct_ser.serialize_field("expr", v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for ProjectionExpr {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "alias",
+            "expr",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Alias,
+            Expr,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "alias" => Ok(GeneratedField::Alias),
+                            "expr" => Ok(GeneratedField::Expr),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = ProjectionExpr;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.ProjectionExpr")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<ProjectionExpr, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut alias__ = None;
+                let mut expr__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Alias => {
+                            if alias__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("alias"));
+                            }
+                            alias__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Expr => {
+                            if expr__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("expr"));
+                            }
+                            expr__ = map_.next_value()?;
+                        }
+                    }
+                }
+                Ok(ProjectionExpr {
+                    alias: alias__.unwrap_or_default(),
+                    expr: expr__,
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.ProjectionExpr", FIELDS, GeneratedVisitor)
+    }
+}
+impl serde::Serialize for ProjectionExprs {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if !self.projections.is_empty() {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.ProjectionExprs", len)?;
+        if !self.projections.is_empty() {
+            struct_ser.serialize_field("projections", &self.projections)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for ProjectionExprs {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "projections",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Projections,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "projections" => Ok(GeneratedField::Projections),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = ProjectionExprs;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.ProjectionExprs")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<ProjectionExprs, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut projections__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Projections => {
+                            if projections__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("projections"));
+                            }
+                            projections__ = Some(map_.next_value()?);
                         }
                     }
                 }
-                Ok(ProjectionExecNode {
-                    input: input__,
-                    expr: expr__.unwrap_or_default(),
-                    expr_name: expr_name__.unwrap_or_default(),
+                Ok(ProjectionExprs {
+                    projections: projections__.unwrap_or_default(),
                 })
             }
         }
-        deserializer.deserialize_struct("datafusion.ProjectionExecNode", FIELDS, GeneratedVisitor)
+        deserializer.deserialize_struct("datafusion.ProjectionExprs", FIELDS, GeneratedVisitor)
     }
 }
 impl serde::Serialize for ProjectionNode {
@@ -17856,7 +20508,7 @@ impl<'de> serde::Deserialize<'de> for ProjectionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -17983,7 +20635,7 @@ impl<'de> serde::Deserialize<'de> for RecursionUnnestOption {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18121,7 +20773,7 @@ impl<'de> serde::Deserialize<'de> for RecursiveQueryNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18214,6 +20866,9 @@ impl serde::Serialize for RepartitionExecNode {
         if self.partitioning.is_some() {
             len += 1;
         }
+        if self.preserve_order {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.RepartitionExecNode", len)?;
         if let Some(v) = self.input.as_ref() {
             struct_ser.serialize_field("input", v)?;
@@ -18221,6 +20876,9 @@ impl serde::Serialize for RepartitionExecNode {
         if let Some(v) = self.partitioning.as_ref() {
             struct_ser.serialize_field("partitioning", v)?;
         }
+        if self.preserve_order {
+            struct_ser.serialize_field("preserveOrder", &self.preserve_order)?;
+        }
         struct_ser.end()
     }
 }
@@ -18233,12 +20891,15 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode {
         const FIELDS: &[&str] = &[
             "input",
             "partitioning",
+            "preserve_order",
+            "preserveOrder",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Input,
             Partitioning,
+            PreserveOrder,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -18247,7 +20908,7 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18262,6 +20923,7 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode {
                         match value {
                             "input" => Ok(GeneratedField::Input),
                             "partitioning" => Ok(GeneratedField::Partitioning),
+                            "preserveOrder" | "preserve_order" => Ok(GeneratedField::PreserveOrder),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -18283,6 +20945,7 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode {
             {
                 let mut input__ = None;
                 let mut partitioning__ = None;
+                let mut preserve_order__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Input => {
@@ -18297,11 +20960,18 @@ impl<'de> serde::Deserialize<'de> for RepartitionExecNode {
                             }
                             partitioning__ = map_.next_value()?;
                         }
+                        GeneratedField::PreserveOrder => {
+                            if preserve_order__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("preserveOrder"));
+                            }
+                            preserve_order__ = Some(map_.next_value()?);
+                        }
                     }
                 }
                 Ok(RepartitionExecNode {
                     input: input__,
                     partitioning: partitioning__,
+                    preserve_order: preserve_order__.unwrap_or_default(),
                 })
             }
         }
@@ -18367,7 +21037,7 @@ impl<'de> serde::Deserialize<'de> for RepartitionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18475,7 +21145,7 @@ impl<'de> serde::Deserialize<'de> for RollupNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18586,7 +21256,7 @@ impl<'de> serde::Deserialize<'de> for ScalarUdfExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18697,7 +21367,7 @@ impl<'de> serde::Deserialize<'de> for ScanLimit {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18790,7 +21460,7 @@ impl<'de> serde::Deserialize<'de> for SelectionExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -18889,7 +21559,7 @@ impl<'de> serde::Deserialize<'de> for SelectionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19014,7 +21684,7 @@ impl<'de> serde::Deserialize<'de> for SimilarToNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19159,7 +21829,7 @@ impl<'de> serde::Deserialize<'de> for SortExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19296,7 +21966,7 @@ impl<'de> serde::Deserialize<'de> for SortExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19406,7 +22076,7 @@ impl<'de> serde::Deserialize<'de> for SortExprNodeCollection {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19458,6 +22128,206 @@ impl<'de> serde::Deserialize<'de> for SortExprNodeCollection {
         deserializer.deserialize_struct("datafusion.SortExprNodeCollection", FIELDS, GeneratedVisitor)
     }
 }
+impl serde::Serialize for SortMergeJoinExecNode {
+    #[allow(deprecated)]
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeStruct;
+        let mut len = 0;
+        if self.left.is_some() {
+            len += 1;
+        }
+        if self.right.is_some() {
+            len += 1;
+        }
+        if !self.on.is_empty() {
+            len += 1;
+        }
+        if self.join_type != 0 {
+            len += 1;
+        }
+        if self.filter.is_some() {
+            len += 1;
+        }
+        if !self.sort_options.is_empty() {
+            len += 1;
+        }
+        if self.null_equality != 0 {
+            len += 1;
+        }
+        let mut struct_ser = serializer.serialize_struct("datafusion.SortMergeJoinExecNode", len)?;
+        if let Some(v) = self.left.as_ref() {
+            struct_ser.serialize_field("left", v)?;
+        }
+        if let Some(v) = self.right.as_ref() {
+            struct_ser.serialize_field("right", v)?;
+        }
+        if !self.on.is_empty() {
+            struct_ser.serialize_field("on", &self.on)?;
+        }
+        if self.join_type != 0 {
+            let v = super::datafusion_common::JoinType::try_from(self.join_type)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.join_type)))?;
+            struct_ser.serialize_field("joinType", &v)?;
+        }
+        if let Some(v) = self.filter.as_ref() {
+            struct_ser.serialize_field("filter", v)?;
+        }
+        if !self.sort_options.is_empty() {
+            struct_ser.serialize_field("sortOptions", &self.sort_options)?;
+        }
+        if self.null_equality != 0 {
+            let v = super::datafusion_common::NullEquality::try_from(self.null_equality)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.null_equality)))?;
+            struct_ser.serialize_field("nullEquality", &v)?;
+        }
+        struct_ser.end()
+    }
+}
+impl<'de> serde::Deserialize<'de> for SortMergeJoinExecNode {
+    #[allow(deprecated)]
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        const FIELDS: &[&str] = &[
+            "left",
+            "right",
+            "on",
+            "join_type",
+            "joinType",
+            "filter",
+            "sort_options",
+            "sortOptions",
+            "null_equality",
+            "nullEquality",
+        ];
+
+        #[allow(clippy::enum_variant_names)]
+        enum GeneratedField {
+            Left,
+            Right,
+            On,
+            JoinType,
+            Filter,
+            SortOptions,
+            NullEquality,
+        }
+        impl<'de> serde::Deserialize<'de> for GeneratedField {
+            fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                struct GeneratedVisitor;
+
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
+                    type Value = GeneratedField;
+
+                    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                        write!(formatter, "expected one of: {:?}", &FIELDS)
+                    }
+
+                    #[allow(unused_variables)]
+                    fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E>
+                    where
+                        E: serde::de::Error,
+                    {
+                        match value {
+                            "left" => Ok(GeneratedField::Left),
+                            "right" => Ok(GeneratedField::Right),
+                            "on" => Ok(GeneratedField::On),
+                            "joinType" | "join_type" => Ok(GeneratedField::JoinType),
+                            "filter" => Ok(GeneratedField::Filter),
+                            "sortOptions" | "sort_options" => Ok(GeneratedField::SortOptions),
+                            "nullEquality" | "null_equality" => Ok(GeneratedField::NullEquality),
+                            _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
+                        }
+                    }
+                }
+                deserializer.deserialize_identifier(GeneratedVisitor)
+            }
+        }
+        struct GeneratedVisitor;
+        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+            type Value = SortMergeJoinExecNode;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                formatter.write_str("struct datafusion.SortMergeJoinExecNode")
+            }
+
+            fn visit_map<V>(self, mut map_: V) -> std::result::Result<SortMergeJoinExecNode, V::Error>
+                where
+                    V: serde::de::MapAccess<'de>,
+            {
+                let mut left__ = None;
+                let mut right__ = None;
+                let mut on__ = None;
+                let mut join_type__ = None;
+                let mut filter__ = None;
+                let mut sort_options__ = None;
+                let mut null_equality__ = None;
+                while let Some(k) = map_.next_key()? {
+                    match k {
+                        GeneratedField::Left => {
+                            if left__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("left"));
+                            }
+                            left__ = map_.next_value()?;
+                        }
+                        GeneratedField::Right => {
+                            if right__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("right"));
+                            }
+                            right__ = map_.next_value()?;
+                        }
+                        GeneratedField::On => {
+                            if on__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("on"));
+                            }
+                            on__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::JoinType => {
+                            if join_type__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("joinType"));
+                            }
+                            join_type__ = Some(map_.next_value::<super::datafusion_common::JoinType>()? as i32);
+                        }
+                        GeneratedField::Filter => {
+                            if filter__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("filter"));
+                            }
+                            filter__ = map_.next_value()?;
+                        }
+                        GeneratedField::SortOptions => {
+                            if sort_options__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("sortOptions"));
+                            }
+                            sort_options__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::NullEquality => {
+                            if null_equality__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullEquality"));
+                            }
+                            null_equality__ = Some(map_.next_value::<super::datafusion_common::NullEquality>()? as i32);
+                        }
+                    }
+                }
+                Ok(SortMergeJoinExecNode {
+                    left: left__,
+                    right: right__,
+                    on: on__.unwrap_or_default(),
+                    join_type: join_type__.unwrap_or_default(),
+                    filter: filter__,
+                    sort_options: sort_options__.unwrap_or_default(),
+                    null_equality: null_equality__.unwrap_or_default(),
+                })
+            }
+        }
+        deserializer.deserialize_struct("datafusion.SortMergeJoinExecNode", FIELDS, GeneratedVisitor)
+    }
+}
 impl serde::Serialize for SortNode {
     #[allow(deprecated)]
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -19515,7 +22385,7 @@ impl<'de> serde::Deserialize<'de> for SortNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19644,7 +22514,7 @@ impl<'de> serde::Deserialize<'de> for SortPreservingMergeExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19742,7 +22612,7 @@ impl<'de> serde::Deserialize<'de> for StreamPartitionMode {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = StreamPartitionMode;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19835,7 +22705,7 @@ impl<'de> serde::Deserialize<'de> for StringifiedPlan {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -19943,7 +22813,7 @@ impl<'de> serde::Deserialize<'de> for SubqueryAliasNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20027,7 +22897,7 @@ impl serde::Serialize for SymmetricHashJoinExecNode {
         if self.partition_mode != 0 {
             len += 1;
         }
-        if self.null_equals_null {
+        if self.null_equality != 0 {
             len += 1;
         }
         if self.filter.is_some() {
@@ -20059,8 +22929,10 @@ impl serde::Serialize for SymmetricHashJoinExecNode {
                 .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.partition_mode)))?;
             struct_ser.serialize_field("partitionMode", &v)?;
         }
-        if self.null_equals_null {
-            struct_ser.serialize_field("nullEqualsNull", &self.null_equals_null)?;
+        if self.null_equality != 0 {
+            let v = super::datafusion_common::NullEquality::try_from(self.null_equality)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.null_equality)))?;
+            struct_ser.serialize_field("nullEquality", &v)?;
         }
         if let Some(v) = self.filter.as_ref() {
             struct_ser.serialize_field("filter", v)?;
@@ -20088,8 +22960,8 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
             "joinType",
             "partition_mode",
             "partitionMode",
-            "null_equals_null",
-            "nullEqualsNull",
+            "null_equality",
+            "nullEquality",
             "filter",
             "left_sort_exprs",
             "leftSortExprs",
@@ -20104,7 +22976,7 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
             On,
             JoinType,
             PartitionMode,
-            NullEqualsNull,
+            NullEquality,
             Filter,
             LeftSortExprs,
             RightSortExprs,
@@ -20116,7 +22988,7 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20134,7 +23006,7 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
                             "on" => Ok(GeneratedField::On),
                             "joinType" | "join_type" => Ok(GeneratedField::JoinType),
                             "partitionMode" | "partition_mode" => Ok(GeneratedField::PartitionMode),
-                            "nullEqualsNull" | "null_equals_null" => Ok(GeneratedField::NullEqualsNull),
+                            "nullEquality" | "null_equality" => Ok(GeneratedField::NullEquality),
                             "filter" => Ok(GeneratedField::Filter),
                             "leftSortExprs" | "left_sort_exprs" => Ok(GeneratedField::LeftSortExprs),
                             "rightSortExprs" | "right_sort_exprs" => Ok(GeneratedField::RightSortExprs),
@@ -20162,7 +23034,7 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
                 let mut on__ = None;
                 let mut join_type__ = None;
                 let mut partition_mode__ = None;
-                let mut null_equals_null__ = None;
+                let mut null_equality__ = None;
                 let mut filter__ = None;
                 let mut left_sort_exprs__ = None;
                 let mut right_sort_exprs__ = None;
@@ -20198,11 +23070,11 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
                             }
                             partition_mode__ = Some(map_.next_value::<StreamPartitionMode>()? as i32);
                         }
-                        GeneratedField::NullEqualsNull => {
-                            if null_equals_null__.is_some() {
-                                return Err(serde::de::Error::duplicate_field("nullEqualsNull"));
+                        GeneratedField::NullEquality => {
+                            if null_equality__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullEquality"));
                             }
-                            null_equals_null__ = Some(map_.next_value()?);
+                            null_equality__ = Some(map_.next_value::<super::datafusion_common::NullEquality>()? as i32);
                         }
                         GeneratedField::Filter => {
                             if filter__.is_some() {
@@ -20230,7 +23102,7 @@ impl<'de> serde::Deserialize<'de> for SymmetricHashJoinExecNode {
                     on: on__.unwrap_or_default(),
                     join_type: join_type__.unwrap_or_default(),
                     partition_mode: partition_mode__.unwrap_or_default(),
-                    null_equals_null: null_equals_null__.unwrap_or_default(),
+                    null_equality: null_equality__.unwrap_or_default(),
                     filter: filter__,
                     left_sort_exprs: left_sort_exprs__.unwrap_or_default(),
                     right_sort_exprs: right_sort_exprs__.unwrap_or_default(),
@@ -20293,7 +23165,7 @@ impl<'de> serde::Deserialize<'de> for TableReference {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20376,6 +23248,12 @@ impl serde::Serialize for TryCastNode {
         if self.arrow_type.is_some() {
             len += 1;
         }
+        if !self.metadata.is_empty() {
+            len += 1;
+        }
+        if self.nullable.is_some() {
+            len += 1;
+        }
         let mut struct_ser = serializer.serialize_struct("datafusion.TryCastNode", len)?;
         if let Some(v) = self.expr.as_ref() {
             struct_ser.serialize_field("expr", v)?;
@@ -20383,6 +23261,12 @@ impl serde::Serialize for TryCastNode {
         if let Some(v) = self.arrow_type.as_ref() {
             struct_ser.serialize_field("arrowType", v)?;
         }
+        if !self.metadata.is_empty() {
+            struct_ser.serialize_field("metadata", &self.metadata)?;
+        }
+        if let Some(v) = self.nullable.as_ref() {
+            struct_ser.serialize_field("nullable", v)?;
+        }
         struct_ser.end()
     }
 }
@@ -20396,12 +23280,16 @@ impl<'de> serde::Deserialize<'de> for TryCastNode {
             "expr",
             "arrow_type",
             "arrowType",
+            "metadata",
+            "nullable",
         ];
 
         #[allow(clippy::enum_variant_names)]
         enum GeneratedField {
             Expr,
             ArrowType,
+            Metadata,
+            Nullable,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
             fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -20410,7 +23298,7 @@ impl<'de> serde::Deserialize<'de> for TryCastNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20425,6 +23313,8 @@ impl<'de> serde::Deserialize<'de> for TryCastNode {
                         match value {
                             "expr" => Ok(GeneratedField::Expr),
                             "arrowType" | "arrow_type" => Ok(GeneratedField::ArrowType),
+                            "metadata" => Ok(GeneratedField::Metadata),
+                            "nullable" => Ok(GeneratedField::Nullable),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
                         }
                     }
@@ -20446,6 +23336,8 @@ impl<'de> serde::Deserialize<'de> for TryCastNode {
             {
                 let mut expr__ = None;
                 let mut arrow_type__ = None;
+                let mut metadata__ = None;
+                let mut nullable__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
                         GeneratedField::Expr => {
@@ -20460,11 +23352,27 @@ impl<'de> serde::Deserialize<'de> for TryCastNode {
                             }
                             arrow_type__ = map_.next_value()?;
                         }
+                        GeneratedField::Metadata => {
+                            if metadata__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("metadata"));
+                            }
+                            metadata__ = Some(
+                                map_.next_value::<std::collections::HashMap<_, _>>()?
+                            );
+                        }
+                        GeneratedField::Nullable => {
+                            if nullable__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullable"));
+                            }
+                            nullable__ = map_.next_value()?;
+                        }
                     }
                 }
                 Ok(TryCastNode {
                     expr: expr__,
                     arrow_type: arrow_type__,
+                    metadata: metadata__.unwrap_or_default(),
+                    nullable: nullable__,
                 })
             }
         }
@@ -20510,7 +23418,7 @@ impl<'de> serde::Deserialize<'de> for UnionExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20601,7 +23509,7 @@ impl<'de> serde::Deserialize<'de> for UnionNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20692,7 +23600,7 @@ impl<'de> serde::Deserialize<'de> for UnknownColumn {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20783,7 +23691,7 @@ impl<'de> serde::Deserialize<'de> for Unnest {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -20908,7 +23816,7 @@ impl<'de> serde::Deserialize<'de> for UnnestExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21090,7 +23998,7 @@ impl<'de> serde::Deserialize<'de> for UnnestNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21250,7 +24158,7 @@ impl<'de> serde::Deserialize<'de> for UnnestOptions {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21362,7 +24270,7 @@ impl<'de> serde::Deserialize<'de> for ValuesNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21497,7 +24405,7 @@ impl<'de> serde::Deserialize<'de> for ViewTableScanNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21634,7 +24542,7 @@ impl<'de> serde::Deserialize<'de> for WhenThen {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21734,7 +24642,7 @@ impl<'de> serde::Deserialize<'de> for Wildcard {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21866,7 +24774,7 @@ impl<'de> serde::Deserialize<'de> for WindowAggExecNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -21985,6 +24893,15 @@ impl serde::Serialize for WindowExprNode {
         if self.fun_definition.is_some() {
             len += 1;
         }
+        if self.null_treatment.is_some() {
+            len += 1;
+        }
+        if self.distinct {
+            len += 1;
+        }
+        if self.filter.is_some() {
+            len += 1;
+        }
         if self.window_function.is_some() {
             len += 1;
         }
@@ -22006,6 +24923,17 @@ impl serde::Serialize for WindowExprNode {
             #[allow(clippy::needless_borrows_for_generic_args)]
             struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?;
         }
+        if let Some(v) = self.null_treatment.as_ref() {
+            let v = NullTreatment::try_from(*v)
+                .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?;
+            struct_ser.serialize_field("nullTreatment", &v)?;
+        }
+        if self.distinct {
+            struct_ser.serialize_field("distinct", &self.distinct)?;
+        }
+        if let Some(v) = self.filter.as_ref() {
+            struct_ser.serialize_field("filter", v)?;
+        }
         if let Some(v) = self.window_function.as_ref() {
             match v {
                 window_expr_node::WindowFunction::Udaf(v) => {
@@ -22035,6 +24963,10 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
             "windowFrame",
             "fun_definition",
             "funDefinition",
+            "null_treatment",
+            "nullTreatment",
+            "distinct",
+            "filter",
             "udaf",
             "udwf",
         ];
@@ -22046,6 +24978,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
             OrderBy,
             WindowFrame,
             FunDefinition,
+            NullTreatment,
+            Distinct,
+            Filter,
             Udaf,
             Udwf,
         }
@@ -22056,7 +24991,7 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -22074,6 +25009,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
                             "orderBy" | "order_by" => Ok(GeneratedField::OrderBy),
                             "windowFrame" | "window_frame" => Ok(GeneratedField::WindowFrame),
                             "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition),
+                            "nullTreatment" | "null_treatment" => Ok(GeneratedField::NullTreatment),
+                            "distinct" => Ok(GeneratedField::Distinct),
+                            "filter" => Ok(GeneratedField::Filter),
                             "udaf" => Ok(GeneratedField::Udaf),
                             "udwf" => Ok(GeneratedField::Udwf),
                             _ => Err(serde::de::Error::unknown_field(value, FIELDS)),
@@ -22100,6 +25038,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
                 let mut order_by__ = None;
                 let mut window_frame__ = None;
                 let mut fun_definition__ = None;
+                let mut null_treatment__ = None;
+                let mut distinct__ = None;
+                let mut filter__ = None;
                 let mut window_function__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
@@ -22135,6 +25076,24 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
                                 map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0)
                             ;
                         }
+                        GeneratedField::NullTreatment => {
+                            if null_treatment__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("nullTreatment"));
+                            }
+                            null_treatment__ = map_.next_value::<::std::option::Option<NullTreatment>>()?.map(|x| x as i32);
+                        }
+                        GeneratedField::Distinct => {
+                            if distinct__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("distinct"));
+                            }
+                            distinct__ = Some(map_.next_value()?);
+                        }
+                        GeneratedField::Filter => {
+                            if filter__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("filter"));
+                            }
+                            filter__ = map_.next_value()?;
+                        }
                         GeneratedField::Udaf => {
                             if window_function__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("udaf"));
@@ -22155,6 +25114,9 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode {
                     order_by: order_by__.unwrap_or_default(),
                     window_frame: window_frame__,
                     fun_definition: fun_definition__,
+                    null_treatment: null_treatment__,
+                    distinct: distinct__.unwrap_or_default(),
+                    filter: filter__,
                     window_function: window_function__,
                 })
             }
@@ -22225,7 +25187,7 @@ impl<'de> serde::Deserialize<'de> for WindowFrame {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -22347,7 +25309,7 @@ impl<'de> serde::Deserialize<'de> for WindowFrameBound {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -22436,7 +25398,7 @@ impl<'de> serde::Deserialize<'de> for WindowFrameBoundType {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = WindowFrameBoundType;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -22510,7 +25472,7 @@ impl<'de> serde::Deserialize<'de> for WindowFrameUnits {
 
         struct GeneratedVisitor;
 
-        impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+        impl serde::de::Visitor<'_> for GeneratedVisitor {
             type Value = WindowFrameUnits;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -22604,7 +25566,7 @@ impl<'de> serde::Deserialize<'de> for WindowNode {
             {
                 struct GeneratedVisitor;
 
-                impl<'de> serde::de::Visitor<'de> for GeneratedVisitor {
+                impl serde::de::Visitor<'_> for GeneratedVisitor {
                     type Value = GeneratedField;
 
                     fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index c2f4e93cef6ae..e0a0c636fbb32 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -86,7 +86,7 @@ pub struct LogicalExtensionNode {
     #[prost(message, repeated, tag = "2")]
     pub inputs: ::prost::alloc::vec::Vec<LogicalPlanNode>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ProjectionColumns {
     #[prost(string, repeated, tag = "1")]
     pub columns: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
@@ -123,7 +123,10 @@ pub struct ListingTableScanNode {
     pub target_partitions: u32,
     #[prost(message, repeated, tag = "13")]
     pub file_sort_order: ::prost::alloc::vec::Vec<SortExprNodeCollection>,
-    #[prost(oneof = "listing_table_scan_node::FileFormatType", tags = "10, 11, 12, 15")]
+    #[prost(
+        oneof = "listing_table_scan_node::FileFormatType",
+        tags = "10, 11, 12, 15, 16"
+    )]
     pub file_format_type: ::core::option::Option<
         listing_table_scan_node::FileFormatType,
     >,
@@ -140,6 +143,8 @@ pub mod listing_table_scan_node {
         Avro(super::super::datafusion_common::AvroFormat),
         #[prost(message, tag = "15")]
         Json(super::super::datafusion_common::NdJsonFormat),
+        #[prost(message, tag = "16")]
+        Arrow(super::super::datafusion_common::ArrowFormat),
     }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
@@ -180,7 +185,7 @@ pub struct ProjectionNode {
 }
 /// Nested message and enum types in `ProjectionNode`.
 pub mod projection_node {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum OptionalAlias {
         #[prost(string, tag = "3")]
         Alias(::prost::alloc::string::String),
@@ -227,7 +232,7 @@ pub struct HashRepartition {
     #[prost(uint64, tag = "2")]
     pub partition_count: u64,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct EmptyRelationNode {
     #[prost(bool, tag = "1")]
     pub produce_one_row: bool,
@@ -246,6 +251,8 @@ pub struct CreateExternalTableNode {
     pub table_partition_cols: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
     #[prost(bool, tag = "6")]
     pub if_not_exists: bool,
+    #[prost(bool, tag = "15")]
+    pub or_replace: bool,
     #[prost(bool, tag = "14")]
     pub temporary: bool,
     #[prost(string, tag = "7")]
@@ -271,10 +278,14 @@ pub struct CreateExternalTableNode {
 pub struct PrepareNode {
     #[prost(string, tag = "1")]
     pub name: ::prost::alloc::string::String,
+    /// We serialize both the data types and the fields for compatibility with
+    /// older versions (newer versions populate both).
     #[prost(message, repeated, tag = "2")]
     pub data_types: ::prost::alloc::vec::Vec<super::datafusion_common::ArrowType>,
     #[prost(message, optional, boxed, tag = "3")]
     pub input: ::core::option::Option<::prost::alloc::boxed::Box<LogicalPlanNode>>,
+    #[prost(message, repeated, tag = "4")]
+    pub fields: ::prost::alloc::vec::Vec<super::datafusion_common::Field>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct CreateCatalogSchemaNode {
@@ -369,8 +380,8 @@ pub struct JoinNode {
     pub left_join_key: ::prost::alloc::vec::Vec<LogicalExprNode>,
     #[prost(message, repeated, tag = "6")]
     pub right_join_key: ::prost::alloc::vec::Vec<LogicalExprNode>,
-    #[prost(bool, tag = "7")]
-    pub null_equals_null: bool,
+    #[prost(enumeration = "super::datafusion_common::NullEquality", tag = "7")]
+    pub null_equality: i32,
     #[prost(message, optional, tag = "8")]
     pub filter: ::core::option::Option<LogicalExprNode>,
 }
@@ -401,6 +412,15 @@ pub struct CopyToNode {
     #[prost(string, repeated, tag = "7")]
     pub partition_by: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
+/// Wraps a serialized FileFormatFactory with its format kind tag,
+/// so the decoder can dispatch to the correct format-specific codec.
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct FileFormatProto {
+    #[prost(enumeration = "FileFormatKind", tag = "1")]
+    pub kind: i32,
+    #[prost(bytes = "vec", tag = "2")]
+    pub encoded_file_format: ::prost::alloc::vec::Vec<u8>,
+}
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DmlNode {
     #[prost(enumeration = "dml_node::Type", tag = "1")]
@@ -433,6 +453,7 @@ pub mod dml_node {
         InsertAppend = 3,
         InsertOverwrite = 4,
         InsertReplace = 5,
+        Truncate = 6,
     }
     impl Type {
         /// String value of the enum field names used in the ProtoBuf definition.
@@ -447,6 +468,7 @@ pub mod dml_node {
                 Self::InsertAppend => "INSERT_APPEND",
                 Self::InsertOverwrite => "INSERT_OVERWRITE",
                 Self::InsertReplace => "INSERT_REPLACE",
+                Self::Truncate => "TRUNCATE",
             }
         }
         /// Creates an enum from field names used in the ProtoBuf definition.
@@ -458,6 +480,7 @@ pub mod dml_node {
                 "INSERT_APPEND" => Some(Self::InsertAppend),
                 "INSERT_OVERWRITE" => Some(Self::InsertOverwrite),
                 "INSERT_REPLACE" => Some(Self::InsertReplace),
+                "TRUNCATE" => Some(Self::Truncate),
                 _ => None,
             }
         }
@@ -480,7 +503,7 @@ pub struct UnnestNode {
     #[prost(message, optional, tag = "7")]
     pub options: ::core::option::Option<UnnestOptions>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ColumnUnnestListItem {
     #[prost(uint32, tag = "1")]
     pub input_index: u32,
@@ -492,7 +515,7 @@ pub struct ColumnUnnestListRecursions {
     #[prost(message, repeated, tag = "2")]
     pub recursions: ::prost::alloc::vec::Vec<ColumnUnnestListRecursion>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ColumnUnnestListRecursion {
     #[prost(message, optional, tag = "1")]
     pub output_column: ::core::option::Option<super::datafusion_common::Column>,
@@ -506,7 +529,7 @@ pub struct UnnestOptions {
     #[prost(message, repeated, tag = "2")]
     pub recursions: ::prost::alloc::vec::Vec<RecursionUnnestOption>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct RecursionUnnestOption {
     #[prost(message, optional, tag = "1")]
     pub output_column: ::core::option::Option<super::datafusion_common::Column>,
@@ -598,7 +621,7 @@ pub mod logical_expr_node {
         TryCast(::prost::alloc::boxed::Box<super::TryCastNode>),
         /// window expressions
         #[prost(message, tag = "18")]
-        WindowExpr(super::WindowExprNode),
+        WindowExpr(::prost::alloc::boxed::Box<super::WindowExprNode>),
         /// AggregateUDF expressions
         #[prost(message, tag = "19")]
         AggregateUdfExpr(::prost::alloc::boxed::Box<super::AggregateUdfExprNode>),
@@ -635,7 +658,7 @@ pub mod logical_expr_node {
         Unnest(super::Unnest),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct Wildcard {
     #[prost(message, optional, tag = "1")]
     pub qualifier: ::core::option::Option<TableReference>,
@@ -644,8 +667,17 @@ pub struct Wildcard {
 pub struct PlaceholderNode {
     #[prost(string, tag = "1")]
     pub id: ::prost::alloc::string::String,
+    /// We serialize the data type, metadata, and nullability separately to maintain
+    /// compatibility with older versions
     #[prost(message, optional, tag = "2")]
     pub data_type: ::core::option::Option<super::datafusion_common::ArrowType>,
+    #[prost(bool, optional, tag = "3")]
+    pub nullable: ::core::option::Option<bool>,
+    #[prost(map = "string, string", tag = "4")]
+    pub metadata: ::std::collections::HashMap<
+        ::prost::alloc::string::String,
+        ::prost::alloc::string::String,
+    >,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct LogicalExprList {
@@ -788,6 +820,8 @@ pub struct AggregateUdfExprNode {
     pub order_by: ::prost::alloc::vec::Vec<SortExprNode>,
     #[prost(bytes = "vec", optional, tag = "6")]
     pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec<u8>>,
+    #[prost(enumeration = "NullTreatment", optional, tag = "7")]
+    pub null_treatment: ::core::option::Option<i32>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ScalarUdfExprNode {
@@ -811,12 +845,18 @@ pub struct WindowExprNode {
     pub window_frame: ::core::option::Option<WindowFrame>,
     #[prost(bytes = "vec", optional, tag = "10")]
     pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec<u8>>,
+    #[prost(enumeration = "NullTreatment", optional, tag = "11")]
+    pub null_treatment: ::core::option::Option<i32>,
+    #[prost(bool, tag = "12")]
+    pub distinct: bool,
+    #[prost(message, optional, boxed, tag = "13")]
+    pub filter: ::core::option::Option<::prost::alloc::boxed::Box<LogicalExprNode>>,
     #[prost(oneof = "window_expr_node::WindowFunction", tags = "3, 9")]
     pub window_function: ::core::option::Option<window_expr_node::WindowFunction>,
 }
 /// Nested message and enum types in `WindowExprNode`.
 pub mod window_expr_node {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum WindowFunction {
         /// BuiltInWindowFunction built_in_function = 2;
         #[prost(string, tag = "3")]
@@ -891,6 +931,13 @@ pub struct CastNode {
     pub expr: ::core::option::Option<::prost::alloc::boxed::Box<LogicalExprNode>>,
     #[prost(message, optional, tag = "2")]
     pub arrow_type: ::core::option::Option<super::datafusion_common::ArrowType>,
+    #[prost(map = "string, string", tag = "3")]
+    pub metadata: ::std::collections::HashMap<
+        ::prost::alloc::string::String,
+        ::prost::alloc::string::String,
+    >,
+    #[prost(bool, optional, tag = "4")]
+    pub nullable: ::core::option::Option<bool>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct TryCastNode {
@@ -898,6 +945,13 @@ pub struct TryCastNode {
     pub expr: ::core::option::Option<::prost::alloc::boxed::Box<LogicalExprNode>>,
     #[prost(message, optional, tag = "2")]
     pub arrow_type: ::core::option::Option<super::datafusion_common::ArrowType>,
+    #[prost(map = "string, string", tag = "3")]
+    pub metadata: ::std::collections::HashMap<
+        ::prost::alloc::string::String,
+        ::prost::alloc::string::String,
+    >,
+    #[prost(bool, optional, tag = "4")]
+    pub nullable: ::core::option::Option<bool>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct SortExprNode {
@@ -936,27 +990,27 @@ pub struct WindowFrameBound {
     #[prost(message, optional, tag = "2")]
     pub bound_value: ::core::option::Option<super::datafusion_common::ScalarValue>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct FixedSizeBinary {
     #[prost(int32, tag = "1")]
     pub length: i32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AnalyzedLogicalPlanType {
     #[prost(string, tag = "1")]
     pub analyzer_name: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct OptimizedLogicalPlanType {
     #[prost(string, tag = "1")]
     pub optimizer_name: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct OptimizedPhysicalPlanType {
     #[prost(string, tag = "1")]
     pub optimizer_name: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PlanType {
     #[prost(
         oneof = "plan_type::PlanTypeEnum",
@@ -966,7 +1020,7 @@ pub struct PlanType {
 }
 /// Nested message and enum types in `PlanType`.
 pub mod plan_type {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum PlanTypeEnum {
         #[prost(message, tag = "1")]
         InitialLogicalPlan(super::super::datafusion_common::EmptyMessage),
@@ -996,26 +1050,26 @@ pub mod plan_type {
         PhysicalPlanError(super::super::datafusion_common::EmptyMessage),
     }
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct StringifiedPlan {
     #[prost(message, optional, tag = "1")]
     pub plan_type: ::core::option::Option<PlanType>,
     #[prost(string, tag = "2")]
     pub plan: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct BareTableReference {
     #[prost(string, tag = "1")]
     pub table: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PartialTableReference {
     #[prost(string, tag = "1")]
     pub schema: ::prost::alloc::string::String,
     #[prost(string, tag = "2")]
     pub table: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct FullTableReference {
     #[prost(string, tag = "1")]
     pub catalog: ::prost::alloc::string::String,
@@ -1024,7 +1078,7 @@ pub struct FullTableReference {
     #[prost(string, tag = "3")]
     pub table: ::prost::alloc::string::String,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct TableReference {
     #[prost(oneof = "table_reference::TableReferenceEnum", tags = "1, 2, 3")]
     pub table_reference_enum: ::core::option::Option<
@@ -1033,7 +1087,7 @@ pub struct TableReference {
 }
 /// Nested message and enum types in `TableReference`.
 pub mod table_reference {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum TableReferenceEnum {
         #[prost(message, tag = "1")]
         Bare(super::BareTableReference),
@@ -1048,7 +1102,7 @@ pub mod table_reference {
 pub struct PhysicalPlanNode {
     #[prost(
         oneof = "physical_plan_node::PhysicalPlanType",
-        tags = "1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31"
+        tags = "1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38"
     )]
     pub physical_plan_type: ::core::option::Option<physical_plan_node::PhysicalPlanType>,
 }
@@ -1118,6 +1172,20 @@ pub mod physical_plan_node {
         Unnest(::prost::alloc::boxed::Box<super::UnnestExecNode>),
         #[prost(message, tag = "31")]
         JsonScan(super::JsonScanExecNode),
+        #[prost(message, tag = "32")]
+        Cooperative(::prost::alloc::boxed::Box<super::CooperativeExecNode>),
+        #[prost(message, tag = "33")]
+        GenerateSeries(super::GenerateSeriesNode),
+        #[prost(message, tag = "34")]
+        SortMergeJoin(::prost::alloc::boxed::Box<super::SortMergeJoinExecNode>),
+        #[prost(message, tag = "35")]
+        MemoryScan(super::MemoryScanExecNode),
+        #[prost(message, tag = "36")]
+        AsyncFunc(::prost::alloc::boxed::Box<super::AsyncFuncExecNode>),
+        #[prost(message, tag = "37")]
+        Buffer(::prost::alloc::boxed::Box<super::BufferExecNode>),
+        #[prost(message, tag = "38")]
+        ArrowScan(super::ArrowScanExecNode),
     }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
@@ -1145,6 +1213,9 @@ pub struct FileSinkConfig {
     pub insert_op: i32,
     #[prost(string, tag = "11")]
     pub file_extension: ::prost::alloc::string::String,
+    /// Determines how the output path is interpreted.
+    #[prost(enumeration = "FileOutputMode", tag = "12")]
+    pub file_output_mode: i32,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct JsonSink {
@@ -1219,7 +1290,7 @@ pub struct UnnestExecNode {
     #[prost(message, optional, tag = "5")]
     pub options: ::core::option::Option<UnnestOptions>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ListUnnest {
     #[prost(uint32, tag = "1")]
     pub index_in_input_schema: u32,
@@ -1236,9 +1307,17 @@ pub struct PhysicalExtensionNode {
 /// physical expressions
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PhysicalExprNode {
+    /// Unique identifier for this expression to do deduplication during deserialization.
+    /// When serializing, this is set to a unique identifier for each combination of
+    /// expression, process and serialization run.
+    /// When deserializing, if this ID has been seen before, the cached Arc is returned
+    /// instead of creating a new one, enabling reconstruction of referential integrity
+    /// across serde roundtrips.
+    #[prost(uint64, optional, tag = "30")]
+    pub expr_id: ::core::option::Option<u64>,
     #[prost(
         oneof = "physical_expr_node::ExprType",
-        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20"
+        tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21"
     )]
     pub expr_type: ::core::option::Option<physical_expr_node::ExprType>,
 }
@@ -1289,6 +1368,8 @@ pub mod physical_expr_node {
         Extension(super::PhysicalExtensionExprNode),
         #[prost(message, tag = "20")]
         UnknownColumn(super::UnknownColumn),
+        #[prost(message, tag = "21")]
+        HashExpr(super::PhysicalHashExprNode),
     }
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
@@ -1303,6 +1384,8 @@ pub struct PhysicalScalarUdfNode {
     pub return_type: ::core::option::Option<super::datafusion_common::ArrowType>,
     #[prost(bool, tag = "5")]
     pub nullable: bool,
+    #[prost(string, tag = "6")]
+    pub return_field_name: ::prost::alloc::string::String,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PhysicalAggregateExprNode {
@@ -1316,6 +1399,8 @@ pub struct PhysicalAggregateExprNode {
     pub ignore_nulls: bool,
     #[prost(bytes = "vec", optional, tag = "7")]
     pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec<u8>>,
+    #[prost(string, tag = "8")]
+    pub human_display: ::prost::alloc::string::String,
     #[prost(oneof = "physical_aggregate_expr_node::AggregateFunction", tags = "4")]
     pub aggregate_function: ::core::option::Option<
         physical_aggregate_expr_node::AggregateFunction,
@@ -1323,7 +1408,7 @@ pub struct PhysicalAggregateExprNode {
 }
 /// Nested message and enum types in `PhysicalAggregateExprNode`.
 pub mod physical_aggregate_expr_node {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum AggregateFunction {
         #[prost(string, tag = "4")]
         UserDefinedAggrFunction(::prost::alloc::string::String),
@@ -1343,6 +1428,10 @@ pub struct PhysicalWindowExprNode {
     pub name: ::prost::alloc::string::String,
     #[prost(bytes = "vec", optional, tag = "9")]
     pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec<u8>>,
+    #[prost(bool, tag = "11")]
+    pub ignore_nulls: bool,
+    #[prost(bool, tag = "12")]
+    pub distinct: bool,
     #[prost(oneof = "physical_window_expr_node::WindowFunction", tags = "3, 10")]
     pub window_function: ::core::option::Option<
         physical_window_expr_node::WindowFunction,
@@ -1350,7 +1439,7 @@ pub struct PhysicalWindowExprNode {
 }
 /// Nested message and enum types in `PhysicalWindowExprNode`.
 pub mod physical_window_expr_node {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum WindowFunction {
         /// BuiltInWindowFunction built_in_function = 2;
         #[prost(string, tag = "3")]
@@ -1471,6 +1560,15 @@ pub struct PhysicalExtensionExprNode {
     pub inputs: ::prost::alloc::vec::Vec<PhysicalExprNode>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct PhysicalHashExprNode {
+    #[prost(message, repeated, tag = "1")]
+    pub on_columns: ::prost::alloc::vec::Vec<PhysicalExprNode>,
+    #[prost(uint64, tag = "2")]
+    pub seed0: u64,
+    #[prost(string, tag = "6")]
+    pub description: ::prost::alloc::string::String,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FilterExecNode {
     #[prost(message, optional, boxed, tag = "1")]
     pub input: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
@@ -1480,13 +1578,17 @@ pub struct FilterExecNode {
     pub default_filter_selectivity: u32,
     #[prost(uint32, repeated, tag = "9")]
     pub projection: ::prost::alloc::vec::Vec<u32>,
+    #[prost(uint32, tag = "10")]
+    pub batch_size: u32,
+    #[prost(uint32, optional, tag = "11")]
+    pub fetch: ::core::option::Option<u32>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FileGroup {
     #[prost(message, repeated, tag = "1")]
     pub files: ::prost::alloc::vec::Vec<PartitionedFile>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ScanLimit {
     /// wrap into a message to make it optional
     #[prost(uint32, tag = "1")]
@@ -1498,6 +1600,18 @@ pub struct PhysicalSortExprNodeCollection {
     pub physical_sort_expr_nodes: ::prost::alloc::vec::Vec<PhysicalSortExprNode>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ProjectionExpr {
+    #[prost(string, tag = "1")]
+    pub alias: ::prost::alloc::string::String,
+    #[prost(message, optional, tag = "2")]
+    pub expr: ::core::option::Option<PhysicalExprNode>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ProjectionExprs {
+    #[prost(message, repeated, tag = "1")]
+    pub projections: ::prost::alloc::vec::Vec<ProjectionExpr>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FileScanExecConf {
     #[prost(message, repeated, tag = "1")]
     pub file_groups: ::prost::alloc::vec::Vec<FileGroup>,
@@ -1519,6 +1633,8 @@ pub struct FileScanExecConf {
     pub constraints: ::core::option::Option<super::datafusion_common::Constraints>,
     #[prost(uint64, optional, tag = "12")]
     pub batch_size: ::core::option::Option<u64>,
+    #[prost(message, optional, tag = "13")]
+    pub projection_exprs: ::core::option::Option<ProjectionExprs>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ParquetScanExecNode {
@@ -1543,6 +1659,8 @@ pub struct CsvScanExecNode {
     pub quote: ::prost::alloc::string::String,
     #[prost(bool, tag = "7")]
     pub newlines_in_values: bool,
+    #[prost(bool, tag = "8")]
+    pub truncate_rows: bool,
     #[prost(oneof = "csv_scan_exec_node::OptionalEscape", tags = "5")]
     pub optional_escape: ::core::option::Option<csv_scan_exec_node::OptionalEscape>,
     #[prost(oneof = "csv_scan_exec_node::OptionalComment", tags = "6")]
@@ -1550,12 +1668,12 @@ pub struct CsvScanExecNode {
 }
 /// Nested message and enum types in `CsvScanExecNode`.
 pub mod csv_scan_exec_node {
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum OptionalEscape {
         #[prost(string, tag = "5")]
         Escape(::prost::alloc::string::String),
     }
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum OptionalComment {
         #[prost(string, tag = "6")]
         Comment(::prost::alloc::string::String),
@@ -1572,6 +1690,31 @@ pub struct AvroScanExecNode {
     pub base_conf: ::core::option::Option<FileScanExecConf>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ArrowScanExecNode {
+    #[prost(message, optional, tag = "1")]
+    pub base_conf: ::core::option::Option<FileScanExecConf>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct MemoryScanExecNode {
+    #[prost(bytes = "vec", repeated, tag = "1")]
+    pub partitions: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec<u8>>,
+    #[prost(message, optional, tag = "2")]
+    pub schema: ::core::option::Option<super::datafusion_common::Schema>,
+    #[prost(uint32, repeated, tag = "3")]
+    pub projection: ::prost::alloc::vec::Vec<u32>,
+    #[prost(message, repeated, tag = "4")]
+    pub sort_information: ::prost::alloc::vec::Vec<PhysicalSortExprNodeCollection>,
+    #[prost(bool, tag = "5")]
+    pub show_sizes: bool,
+    #[prost(uint32, optional, tag = "6")]
+    pub fetch: ::core::option::Option<u32>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct CooperativeExecNode {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub input: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HashJoinExecNode {
     #[prost(message, optional, boxed, tag = "1")]
     pub left: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
@@ -1583,12 +1726,14 @@ pub struct HashJoinExecNode {
     pub join_type: i32,
     #[prost(enumeration = "PartitionMode", tag = "6")]
     pub partition_mode: i32,
-    #[prost(bool, tag = "7")]
-    pub null_equals_null: bool,
+    #[prost(enumeration = "super::datafusion_common::NullEquality", tag = "7")]
+    pub null_equality: i32,
     #[prost(message, optional, tag = "8")]
     pub filter: ::core::option::Option<JoinFilter>,
     #[prost(uint32, repeated, tag = "9")]
     pub projection: ::prost::alloc::vec::Vec<u32>,
+    #[prost(bool, tag = "10")]
+    pub null_aware: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct SymmetricHashJoinExecNode {
@@ -1602,8 +1747,8 @@ pub struct SymmetricHashJoinExecNode {
     pub join_type: i32,
     #[prost(enumeration = "StreamPartitionMode", tag = "6")]
     pub partition_mode: i32,
-    #[prost(bool, tag = "7")]
-    pub null_equals_null: bool,
+    #[prost(enumeration = "super::datafusion_common::NullEquality", tag = "7")]
+    pub null_equality: i32,
     #[prost(message, optional, tag = "8")]
     pub filter: ::core::option::Option<JoinFilter>,
     #[prost(message, repeated, tag = "9")]
@@ -1648,14 +1793,14 @@ pub struct CrossJoinExecNode {
     #[prost(message, optional, boxed, tag = "2")]
     pub right: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PhysicalColumn {
     #[prost(string, tag = "1")]
     pub name: ::prost::alloc::string::String,
     #[prost(uint32, tag = "2")]
     pub index: u32,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct UnknownColumn {
     #[prost(string, tag = "1")]
     pub name: ::prost::alloc::string::String,
@@ -1686,7 +1831,7 @@ pub struct ProjectionExecNode {
     #[prost(string, repeated, tag = "3")]
     pub expr_name: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
-#[derive(Clone, PartialEq, ::prost::Message)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct PartiallySortedInputOrderMode {
     #[prost(uint64, repeated, tag = "6")]
     pub columns: ::prost::alloc::vec::Vec<u64>,
@@ -1706,7 +1851,7 @@ pub struct WindowAggExecNode {
 /// Nested message and enum types in `WindowAggExecNode`.
 pub mod window_agg_exec_node {
     /// Set optional to `None` for `BoundedWindowAggExec`.
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
     pub enum InputOrderMode {
         #[prost(message, tag = "7")]
         Linear(super::super::datafusion_common::EmptyMessage),
@@ -1726,11 +1871,14 @@ pub struct MaybePhysicalSortExprs {
     #[prost(message, repeated, tag = "1")]
     pub sort_expr: ::prost::alloc::vec::Vec<PhysicalSortExprNode>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct AggLimit {
     /// wrap into a message to make it optional
     #[prost(uint64, tag = "1")]
     pub limit: u64,
+    /// Optional ordering direction for TopK aggregation (true = descending, false = ascending)
+    #[prost(bool, optional, tag = "2")]
+    pub descending: ::core::option::Option<bool>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct AggregateExecNode {
@@ -1757,6 +1905,8 @@ pub struct AggregateExecNode {
     pub filter_expr: ::prost::alloc::vec::Vec<MaybeFilter>,
     #[prost(message, optional, tag = "11")]
     pub limit: ::core::option::Option<AggLimit>,
+    #[prost(bool, tag = "12")]
+    pub has_grouping_set: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct GlobalLimitExecNode {
@@ -1845,6 +1995,8 @@ pub struct RepartitionExecNode {
     /// }
     #[prost(message, optional, tag = "5")]
     pub partitioning: ::core::option::Option<Partitioning>,
+    #[prost(bool, tag = "6")]
+    pub preserve_order: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Partitioning {
@@ -1872,7 +2024,7 @@ pub struct JoinFilter {
     #[prost(message, optional, tag = "3")]
     pub schema: ::core::option::Option<super::datafusion_common::Schema>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct ColumnIndex {
     #[prost(uint32, tag = "1")]
     pub index: u32,
@@ -1896,7 +2048,7 @@ pub struct PartitionedFile {
     #[prost(message, optional, tag = "6")]
     pub statistics: ::core::option::Option<super::datafusion_common::Statistics>,
 }
-#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
 pub struct FileRange {
     #[prost(int64, tag = "1")]
     pub start: i64,
@@ -1934,6 +2086,153 @@ pub struct CteWorkTableScanNode {
     #[prost(message, optional, tag = "2")]
     pub schema: ::core::option::Option<super::datafusion_common::Schema>,
 }
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct GenerateSeriesArgsContainsNull {
+    #[prost(enumeration = "GenerateSeriesName", tag = "1")]
+    pub name: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct GenerateSeriesArgsInt64 {
+    #[prost(int64, tag = "1")]
+    pub start: i64,
+    #[prost(int64, tag = "2")]
+    pub end: i64,
+    #[prost(int64, tag = "3")]
+    pub step: i64,
+    #[prost(bool, tag = "4")]
+    pub include_end: bool,
+    #[prost(enumeration = "GenerateSeriesName", tag = "5")]
+    pub name: i32,
+}
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct GenerateSeriesArgsTimestamp {
+    #[prost(int64, tag = "1")]
+    pub start: i64,
+    #[prost(int64, tag = "2")]
+    pub end: i64,
+    #[prost(message, optional, tag = "3")]
+    pub step: ::core::option::Option<
+        super::datafusion_common::IntervalMonthDayNanoValue,
+    >,
+    #[prost(string, optional, tag = "4")]
+    pub tz: ::core::option::Option<::prost::alloc::string::String>,
+    #[prost(bool, tag = "5")]
+    pub include_end: bool,
+    #[prost(enumeration = "GenerateSeriesName", tag = "6")]
+    pub name: i32,
+}
+#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct GenerateSeriesArgsDate {
+    #[prost(int64, tag = "1")]
+    pub start: i64,
+    #[prost(int64, tag = "2")]
+    pub end: i64,
+    #[prost(message, optional, tag = "3")]
+    pub step: ::core::option::Option<
+        super::datafusion_common::IntervalMonthDayNanoValue,
+    >,
+    #[prost(bool, tag = "4")]
+    pub include_end: bool,
+    #[prost(enumeration = "GenerateSeriesName", tag = "5")]
+    pub name: i32,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct GenerateSeriesNode {
+    #[prost(message, optional, tag = "1")]
+    pub schema: ::core::option::Option<super::datafusion_common::Schema>,
+    #[prost(uint32, tag = "2")]
+    pub target_batch_size: u32,
+    #[prost(oneof = "generate_series_node::Args", tags = "3, 4, 5, 6")]
+    pub args: ::core::option::Option<generate_series_node::Args>,
+}
+/// Nested message and enum types in `GenerateSeriesNode`.
+pub mod generate_series_node {
+    #[derive(Clone, PartialEq, Eq, Hash, ::prost::Oneof)]
+    pub enum Args {
+        #[prost(message, tag = "3")]
+        ContainsNull(super::GenerateSeriesArgsContainsNull),
+        #[prost(message, tag = "4")]
+        Int64Args(super::GenerateSeriesArgsInt64),
+        #[prost(message, tag = "5")]
+        TimestampArgs(super::GenerateSeriesArgsTimestamp),
+        #[prost(message, tag = "6")]
+        DateArgs(super::GenerateSeriesArgsDate),
+    }
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct SortMergeJoinExecNode {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub left: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
+    #[prost(message, optional, boxed, tag = "2")]
+    pub right: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
+    #[prost(message, repeated, tag = "3")]
+    pub on: ::prost::alloc::vec::Vec<JoinOn>,
+    #[prost(enumeration = "super::datafusion_common::JoinType", tag = "4")]
+    pub join_type: i32,
+    #[prost(message, optional, tag = "5")]
+    pub filter: ::core::option::Option<JoinFilter>,
+    #[prost(message, repeated, tag = "6")]
+    pub sort_options: ::prost::alloc::vec::Vec<SortExprNode>,
+    #[prost(enumeration = "super::datafusion_common::NullEquality", tag = "7")]
+    pub null_equality: i32,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct AsyncFuncExecNode {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub input: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
+    #[prost(message, repeated, tag = "2")]
+    pub async_exprs: ::prost::alloc::vec::Vec<PhysicalExprNode>,
+    #[prost(string, repeated, tag = "3")]
+    pub async_expr_names: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct BufferExecNode {
+    #[prost(message, optional, boxed, tag = "1")]
+    pub input: ::core::option::Option<::prost::alloc::boxed::Box<PhysicalPlanNode>>,
+    #[prost(uint64, tag = "2")]
+    pub capacity: u64,
+}
+/// Identifies a built-in file format supported by DataFusion.
+/// Used by DefaultLogicalExtensionCodec to serialize/deserialize
+/// FileFormatFactory instances (e.g. in CopyTo plans).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
+pub enum FileFormatKind {
+    Unspecified = 0,
+    Csv = 1,
+    Json = 2,
+    Parquet = 3,
+    Arrow = 4,
+    Avro = 5,
+}
+impl FileFormatKind {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::Unspecified => "FILE_FORMAT_KIND_UNSPECIFIED",
+            Self::Csv => "FILE_FORMAT_KIND_CSV",
+            Self::Json => "FILE_FORMAT_KIND_JSON",
+            Self::Parquet => "FILE_FORMAT_KIND_PARQUET",
+            Self::Arrow => "FILE_FORMAT_KIND_ARROW",
+            Self::Avro => "FILE_FORMAT_KIND_AVRO",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "FILE_FORMAT_KIND_UNSPECIFIED" => Some(Self::Unspecified),
+            "FILE_FORMAT_KIND_CSV" => Some(Self::Csv),
+            "FILE_FORMAT_KIND_JSON" => Some(Self::Json),
+            "FILE_FORMAT_KIND_PARQUET" => Some(Self::Parquet),
+            "FILE_FORMAT_KIND_ARROW" => Some(Self::Arrow),
+            "FILE_FORMAT_KIND_AVRO" => Some(Self::Avro),
+            _ => None,
+        }
+    }
+}
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum WindowFrameUnits {
@@ -1994,6 +2293,32 @@ impl WindowFrameBoundType {
 }
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
+pub enum NullTreatment {
+    RespectNulls = 0,
+    IgnoreNulls = 1,
+}
+impl NullTreatment {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::RespectNulls => "RESPECT_NULLS",
+            Self::IgnoreNulls => "IGNORE_NULLS",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "RESPECT_NULLS" => Some(Self::RespectNulls),
+            "IGNORE_NULLS" => Some(Self::IgnoreNulls),
+            _ => None,
+        }
+    }
+}
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
 pub enum DateUnit {
     Day = 0,
     DateMillisecond = 1,
@@ -2018,6 +2343,39 @@ impl DateUnit {
         }
     }
 }
+/// Determines how file sink output paths are interpreted.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
+pub enum FileOutputMode {
+    /// Infer output mode from the URL (extension/trailing `/` heuristic).
+    Automatic = 0,
+    /// Write to a single file at the exact output path.
+    SingleFile = 1,
+    /// Write to a directory with generated filenames.
+    Directory = 2,
+}
+impl FileOutputMode {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::Automatic => "FILE_OUTPUT_MODE_AUTOMATIC",
+            Self::SingleFile => "FILE_OUTPUT_MODE_SINGLE_FILE",
+            Self::Directory => "FILE_OUTPUT_MODE_DIRECTORY",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "FILE_OUTPUT_MODE_AUTOMATIC" => Some(Self::Automatic),
+            "FILE_OUTPUT_MODE_SINGLE_FILE" => Some(Self::SingleFile),
+            "FILE_OUTPUT_MODE_DIRECTORY" => Some(Self::Directory),
+            _ => None,
+        }
+    }
+}
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum InsertOp {
@@ -2110,6 +2468,7 @@ pub enum AggregateMode {
     FinalPartitioned = 2,
     Single = 3,
     SinglePartitioned = 4,
+    PartialReduce = 5,
 }
 impl AggregateMode {
     /// String value of the enum field names used in the ProtoBuf definition.
@@ -2123,6 +2482,7 @@ impl AggregateMode {
             Self::FinalPartitioned => "FINAL_PARTITIONED",
             Self::Single => "SINGLE",
             Self::SinglePartitioned => "SINGLE_PARTITIONED",
+            Self::PartialReduce => "PARTIAL_REDUCE",
         }
     }
     /// Creates an enum from field names used in the ProtoBuf definition.
@@ -2133,6 +2493,33 @@ impl AggregateMode {
             "FINAL_PARTITIONED" => Some(Self::FinalPartitioned),
             "SINGLE" => Some(Self::Single),
             "SINGLE_PARTITIONED" => Some(Self::SinglePartitioned),
+            "PARTIAL_REDUCE" => Some(Self::PartialReduce),
+            _ => None,
+        }
+    }
+}
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
+pub enum GenerateSeriesName {
+    GsGenerateSeries = 0,
+    GsRange = 1,
+}
+impl GenerateSeriesName {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::GsGenerateSeries => "GS_GENERATE_SERIES",
+            Self::GsRange => "GS_RANGE",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "GS_GENERATE_SERIES" => Some(Self::GsGenerateSeries),
+            "GS_RANGE" => Some(Self::GsRange),
             _ => None,
         }
     }
diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs
index 2df162f21e3a3..7ddc930fa257e 100644
--- a/datafusion/proto/src/lib.rs
+++ b/datafusion/proto/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Serialize / Deserialize DataFusion Plans to bytes
 //!
@@ -34,8 +35,8 @@
 //!
 //! [`LogicalPlan`]: datafusion_expr::LogicalPlan
 //! [`Expr`]: datafusion_expr::Expr
-//! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan
-//! [`PhysicalExpr`]: datafusion::physical_expr::PhysicalExpr
+//! [`ExecutionPlan`]: datafusion_physical_plan::ExecutionPlan
+//! [`PhysicalExpr`]: datafusion_physical_expr::PhysicalExpr
 //!
 //! Internally, this crate is implemented by converting the plans to [protocol
 //! buffers] using [prost].
@@ -64,15 +65,15 @@
 //! # use datafusion_expr::{col, lit, Expr};
 //! # use datafusion_proto::bytes::Serializeable;
 //! # fn main() -> Result<()>{
-//!  // Create a new `Expr` a < 32
-//!  let expr = col("a").lt(lit(5i32));
+//! // Create a new `Expr` a < 32
+//! let expr = col("a").lt(lit(5i32));
 //!
-//!  // Convert it to bytes (for sending over the network, etc.)
-//!  let bytes = expr.to_bytes()?;
+//! // Convert it to bytes (for sending over the network, etc.)
+//! let bytes = expr.to_bytes()?;
 //!
-//!  // Decode bytes from somewhere (over network, etc.) back to Expr
-//!  let decoded_expr = Expr::from_bytes(&bytes)?;
-//!  assert_eq!(expr, decoded_expr);
+//! // Decode bytes from somewhere (over network, etc.) back to Expr
+//! let decoded_expr = Expr::from_bytes(&bytes)?;
+//! assert_eq!(expr, decoded_expr);
 //! # Ok(())
 //! # }
 //! ```
@@ -93,7 +94,7 @@
 //!  let bytes = logical_plan_to_bytes(&plan)?;
 //!
 //!  // Decode bytes from somewhere (over network, etc.) back to LogicalPlan
-//!  let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+//!  let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
 //!  assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip));
 //! # Ok(())
 //! # }
@@ -115,7 +116,7 @@
 //!  let bytes = physical_plan_to_bytes(physical_plan.clone())?;
 //!
 //!  // Decode bytes from somewhere (over network, etc.) back to ExecutionPlan
-//!  let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?;
+//!  let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
 //!  assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip));
 //! # Ok(())
 //! # }
@@ -130,8 +131,9 @@ pub mod protobuf {
     pub use crate::generated::datafusion::*;
     pub use datafusion_proto_common::common::proto_error;
     pub use datafusion_proto_common::protobuf_common::{
-        ArrowOptions, ArrowType, AvroFormat, AvroOptions, CsvFormat, DfSchema,
-        EmptyMessage, Field, JoinSide, NdJsonFormat, ParquetFormat, ScalarValue, Schema,
+        ArrowFormat, ArrowOptions, ArrowType, AvroFormat, AvroOptions, CsvFormat,
+        DfSchema, EmptyMessage, Field, JoinSide, NdJsonFormat, ParquetFormat,
+        ScalarValue, Schema,
     };
     pub use datafusion_proto_common::{FromProtoError, ToProtoError};
 }
diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs
index d3f6511ec98fa..08f42b0af7290 100644
--- a/datafusion/proto/src/logical_plan/file_formats.rs
+++ b/datafusion/proto/src/logical_plan/file_formats.rs
@@ -17,30 +17,19 @@
 
 use std::sync::Arc;
 
-use datafusion::{
-    config::{
-        CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
-        TableParquetOptions,
-    },
-    datasource::file_format::{
-        arrow::ArrowFormatFactory, csv::CsvFormatFactory, json::JsonFormatFactory,
-        parquet::ParquetFormatFactory, FileFormatFactory,
-    },
-    prelude::SessionContext,
-};
+use crate::protobuf::{CsvOptions as CsvOptionsProto, JsonOptions as JsonOptionsProto};
+use datafusion_common::config::{CsvOptions, JsonOptions};
 use datafusion_common::{
-    exec_err, not_impl_err, parsers::CompressionTypeVariant, DataFusionError,
-    TableReference,
+    TableReference, exec_datafusion_err, exec_err, not_impl_err,
+    parsers::CompressionTypeVariant,
 };
+use datafusion_datasource::file_format::FileFormatFactory;
+use datafusion_datasource_arrow::file_format::ArrowFormatFactory;
+use datafusion_datasource_csv::file_format::CsvFormatFactory;
+use datafusion_datasource_json::file_format::JsonFormatFactory;
+use datafusion_execution::TaskContext;
 use prost::Message;
 
-use crate::protobuf::{
-    parquet_column_options, parquet_options, CsvOptions as CsvOptionsProto,
-    JsonOptions as JsonOptionsProto, ParquetColumnOptions as ParquetColumnOptionsProto,
-    ParquetColumnSpecificOptions, ParquetOptions as ParquetOptionsProto,
-    TableParquetOptions as TableParquetOptionsProto,
-};
-
 use super::LogicalExtensionCodec;
 
 #[derive(Debug)]
@@ -72,6 +61,8 @@ impl CsvOptionsProto {
                 newlines_in_values: options
                     .newlines_in_values
                     .map_or(vec![], |v| vec![v as u8]),
+                truncated_rows: options.truncated_rows.map_or(vec![], |v| vec![v as u8]),
+                compression_level: options.compression_level,
             }
         } else {
             CsvOptionsProto::default()
@@ -157,6 +148,12 @@ impl From<&CsvOptionsProto> for CsvOptions {
             } else {
                 Some(proto.newlines_in_values[0] != 0)
             },
+            truncated_rows: if proto.truncated_rows.is_empty() {
+                None
+            } else {
+                Some(proto.truncated_rows[0] != 0)
+            },
+            compression_level: proto.compression_level,
         }
     }
 }
@@ -167,7 +164,7 @@ impl LogicalExtensionCodec for CsvLogicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[datafusion_expr::LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<datafusion_expr::Extension> {
         not_impl_err!("Method not implemented")
     }
@@ -185,15 +182,15 @@ impl LogicalExtensionCodec for CsvLogicalExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: arrow::datatypes::SchemaRef,
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn datafusion::datasource::TableProvider>> {
+        _ctx: &TaskContext,
+    ) -> datafusion_common::Result<Arc<dyn datafusion_catalog::TableProvider>> {
         not_impl_err!("Method not implemented")
     }
 
     fn try_encode_table_provider(
         &self,
         _table_ref: &TableReference,
-        _node: Arc<dyn datafusion::datasource::TableProvider>,
+        _node: Arc<dyn datafusion_catalog::TableProvider>,
         _buf: &mut Vec<u8>,
     ) -> datafusion_common::Result<()> {
         not_impl_err!("Method not implemented")
@@ -202,10 +199,10 @@ impl LogicalExtensionCodec for CsvLogicalExtensionCodec {
     fn try_decode_file_format(
         &self,
         buf: &[u8],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
         let proto = CsvOptionsProto::decode(buf).map_err(|e| {
-            DataFusionError::Execution(format!("Failed to decode CsvOptionsProto: {e:?}"))
+            exec_datafusion_err!("Failed to decode CsvOptionsProto: {e:?}")
         })?;
         let options: CsvOptions = (&proto).into();
         Ok(Arc::new(CsvFormatFactory {
@@ -229,9 +226,9 @@ impl LogicalExtensionCodec for CsvLogicalExtensionCodec {
             options: Some(options),
         });
 
-        proto.encode(buf).map_err(|e| {
-            DataFusionError::Execution(format!("Failed to encode CsvOptions: {e:?}"))
-        })?;
+        proto
+            .encode(buf)
+            .map_err(|e| exec_datafusion_err!("Failed to encode CsvOptions: {e:?}"))?;
 
         Ok(())
     }
@@ -243,6 +240,8 @@ impl JsonOptionsProto {
             JsonOptionsProto {
                 compression: options.compression as i32,
                 schema_infer_max_rec: options.schema_infer_max_rec.map(|v| v as u64),
+                compression_level: options.compression_level,
+                newline_delimited: Some(options.newline_delimited),
             }
         } else {
             JsonOptionsProto::default()
@@ -261,6 +260,8 @@ impl From<&JsonOptionsProto> for JsonOptions {
                 _ => CompressionTypeVariant::UNCOMPRESSED,
             },
             schema_infer_max_rec: proto.schema_infer_max_rec.map(|v| v as usize),
+            compression_level: proto.compression_level,
+            newline_delimited: proto.newline_delimited.unwrap_or(true),
         }
     }
 }
@@ -274,7 +275,7 @@ impl LogicalExtensionCodec for JsonLogicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[datafusion_expr::LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<datafusion_expr::Extension> {
         not_impl_err!("Method not implemented")
     }
@@ -292,15 +293,15 @@ impl LogicalExtensionCodec for JsonLogicalExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: arrow::datatypes::SchemaRef,
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn datafusion::datasource::TableProvider>> {
+        _ctx: &TaskContext,
+    ) -> datafusion_common::Result<Arc<dyn datafusion_catalog::TableProvider>> {
         not_impl_err!("Method not implemented")
     }
 
     fn try_encode_table_provider(
         &self,
         _table_ref: &TableReference,
-        _node: Arc<dyn datafusion::datasource::TableProvider>,
+        _node: Arc<dyn datafusion_catalog::TableProvider>,
         _buf: &mut Vec<u8>,
     ) -> datafusion_common::Result<()> {
         not_impl_err!("Method not implemented")
@@ -309,12 +310,10 @@ impl LogicalExtensionCodec for JsonLogicalExtensionCodec {
     fn try_decode_file_format(
         &self,
         buf: &[u8],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
         let proto = JsonOptionsProto::decode(buf).map_err(|e| {
-            DataFusionError::Execution(format!(
-                "Failed to decode JsonOptionsProto: {e:?}"
-            ))
+            exec_datafusion_err!("Failed to decode JsonOptionsProto: {e:?}")
         })?;
         let options: JsonOptions = (&proto).into();
         Ok(Arc::new(JsonFormatFactory {
@@ -332,34 +331,46 @@ impl LogicalExtensionCodec for JsonLogicalExtensionCodec {
         {
             json_factory.options.clone().unwrap_or_default()
         } else {
-            return Err(DataFusionError::Execution(
-                "Unsupported FileFormatFactory type".to_string(),
-            ));
+            return exec_err!("Unsupported FileFormatFactory type");
         };
 
         let proto = JsonOptionsProto::from_factory(&JsonFormatFactory {
             options: Some(options),
         });
 
-        proto.encode(buf).map_err(|e| {
-            DataFusionError::Execution(format!("Failed to encode JsonOptions: {e:?}"))
-        })?;
+        proto
+            .encode(buf)
+            .map_err(|e| exec_datafusion_err!("Failed to encode JsonOptions: {e:?}"))?;
 
         Ok(())
     }
 }
 
-impl TableParquetOptionsProto {
-    fn from_factory(factory: &ParquetFormatFactory) -> Self {
-        let global_options = if let Some(ref options) = factory.options {
-            options.clone()
-        } else {
-            return TableParquetOptionsProto::default();
-        };
+#[cfg(feature = "parquet")]
+mod parquet {
+    use super::*;
+
+    use crate::protobuf::{
+        ParquetColumnOptions as ParquetColumnOptionsProto, ParquetColumnSpecificOptions,
+        ParquetOptions as ParquetOptionsProto,
+        TableParquetOptions as TableParquetOptionsProto, parquet_column_options,
+        parquet_options,
+    };
+    use datafusion_common::config::{
+        ParquetColumnOptions, ParquetOptions, TableParquetOptions,
+    };
+    use datafusion_datasource_parquet::file_format::ParquetFormatFactory;
+
+    impl TableParquetOptionsProto {
+        fn from_factory(factory: &ParquetFormatFactory) -> Self {
+            let global_options = if let Some(ref options) = factory.options {
+                options.clone()
+            } else {
+                return TableParquetOptionsProto::default();
+            };
 
-        let column_specific_options = global_options.column_specific_options;
-        #[allow(deprecated)] // max_statistics_size
-        TableParquetOptionsProto {
+            let column_specific_options = global_options.column_specific_options;
+            TableParquetOptionsProto {
             global: Some(ParquetOptionsProto {
                 enable_page_index: global_options.global.enable_page_index,
                 pruning: global_options.global.pruning,
@@ -369,9 +380,10 @@ impl TableParquetOptionsProto {
                 }),
                 pushdown_filters: global_options.global.pushdown_filters,
                 reorder_filters: global_options.global.reorder_filters,
+                force_filter_selections: global_options.global.force_filter_selections,
                 data_pagesize_limit: global_options.global.data_pagesize_limit as u64,
                 write_batch_size: global_options.global.write_batch_size as u64,
-                writer_version: global_options.global.writer_version.clone(),
+                writer_version: global_options.global.writer_version.to_string(),
                 compression_opt: global_options.global.compression.map(|compression| {
                     parquet_options::CompressionOpt::Compression(compression)
                 }),
@@ -382,9 +394,6 @@ impl TableParquetOptionsProto {
                 statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| {
                     parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled)
                 }),
-                max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| {
-                    parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64)
-                }),
                 max_row_group_size: global_options.global.max_row_group_size as u64,
                 created_by: global_options.global.created_by.clone(),
                 column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| {
@@ -414,6 +423,9 @@ impl TableParquetOptionsProto {
                 coerce_int96_opt: global_options.global.coerce_int96.map(|compression| {
                     parquet_options::CoerceInt96Opt::CoerceInt96(compression)
                 }),
+                max_predicate_cache_size_opt: global_options.global.max_predicate_cache_size.map(|size| {
+                    parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size as u64)
+                }),
             }),
             column_specific_options: column_specific_options.into_iter().map(|(column_name, options)| {
                 ParquetColumnSpecificOptions {
@@ -440,9 +452,6 @@ impl TableParquetOptionsProto {
                         bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| {
                             parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv)
                         }),
-                        max_statistics_size_opt: options.max_statistics_size.map(|size| {
-                            parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32)
-                        }),
                     })
                 }
             }).collect(),
@@ -453,13 +462,12 @@ impl TableParquetOptionsProto {
                 })
                 .collect(),
         }
+        }
     }
-}
 
-impl From<&ParquetOptionsProto> for ParquetOptions {
-    fn from(proto: &ParquetOptionsProto) -> Self {
-        #[allow(deprecated)] // max_statistics_size
-        ParquetOptions {
+    impl From<&ParquetOptionsProto> for ParquetOptions {
+        fn from(proto: &ParquetOptionsProto) -> Self {
+            ParquetOptions {
             enable_page_index: proto.enable_page_index,
             pruning: proto.pruning,
             skip_metadata: proto.skip_metadata,
@@ -468,9 +476,13 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
             }),
             pushdown_filters: proto.pushdown_filters,
             reorder_filters: proto.reorder_filters,
+            force_filter_selections: proto.force_filter_selections,
             data_pagesize_limit: proto.data_pagesize_limit as usize,
             write_batch_size: proto.write_batch_size as usize,
-            writer_version: proto.writer_version.clone(),
+                   // TODO: Consider changing to TryFrom to avoid panic on invalid proto data
+            writer_version: proto.writer_version.parse().expect("
+                Invalid parquet writer version in proto, expected '1.0' or '2.0'
+            "),
             compression: proto.compression_opt.as_ref().map(|opt| match opt {
                 parquet_options::CompressionOpt::Compression(compression) => compression.clone(),
             }),
@@ -481,9 +493,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
             statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt {
                 parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(),
             }),
-            max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt {
-                parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize,
-            }),
             max_row_group_size: proto.max_row_group_size as usize,
             created_by: proto.created_by.clone(),
             column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt {
@@ -513,14 +522,16 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
             coerce_int96: proto.coerce_int96_opt.as_ref().map(|opt| match opt {
                 parquet_options::CoerceInt96Opt::CoerceInt96(coerce_int96) => coerce_int96.clone(),
             }),
+            max_predicate_cache_size: proto.max_predicate_cache_size_opt.as_ref().map(|opt| match opt {
+                parquet_options::MaxPredicateCacheSizeOpt::MaxPredicateCacheSize(size) => *size as usize,
+            }),
+        }
         }
     }
-}
 
-impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
-    fn from(proto: ParquetColumnOptionsProto) -> Self {
-        #[allow(deprecated)] // max_statistics_size
-        ParquetColumnOptions {
+    impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
+        fn from(proto: ParquetColumnOptionsProto) -> Self {
+            ParquetColumnOptions {
             bloom_filter_enabled: proto.bloom_filter_enabled_opt.map(
                 |parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v,
             ),
@@ -542,129 +553,130 @@ impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
             bloom_filter_ndv: proto
                 .bloom_filter_ndv_opt
                 .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v),
-            max_statistics_size: proto.max_statistics_size_opt.map(
-                |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| {
-                    v as usize
-                },
-            ),
+        }
         }
     }
-}
 
-impl From<&TableParquetOptionsProto> for TableParquetOptions {
-    fn from(proto: &TableParquetOptionsProto) -> Self {
-        TableParquetOptions {
-            global: proto
-                .global
-                .as_ref()
-                .map(ParquetOptions::from)
-                .unwrap_or_default(),
-            column_specific_options: proto
-                .column_specific_options
-                .iter()
-                .map(|parquet_column_options| {
-                    (
-                        parquet_column_options.column_name.clone(),
-                        ParquetColumnOptions::from(
-                            parquet_column_options.options.clone().unwrap_or_default(),
-                        ),
-                    )
-                })
-                .collect(),
-            key_value_metadata: proto
-                .key_value_metadata
-                .iter()
-                .map(|(k, v)| (k.clone(), Some(v.clone())))
-                .collect(),
+    impl From<&TableParquetOptionsProto> for TableParquetOptions {
+        fn from(proto: &TableParquetOptionsProto) -> Self {
+            TableParquetOptions {
+                global: proto
+                    .global
+                    .as_ref()
+                    .map(ParquetOptions::from)
+                    .unwrap_or_default(),
+                column_specific_options: proto
+                    .column_specific_options
+                    .iter()
+                    .map(|parquet_column_options| {
+                        (
+                            parquet_column_options.column_name.clone(),
+                            ParquetColumnOptions::from(
+                                parquet_column_options
+                                    .options
+                                    .clone()
+                                    .unwrap_or_default(),
+                            ),
+                        )
+                    })
+                    .collect(),
+                key_value_metadata: proto
+                    .key_value_metadata
+                    .iter()
+                    .map(|(k, v)| (k.clone(), Some(v.clone())))
+                    .collect(),
+                crypto: Default::default(),
+            }
         }
     }
-}
 
-#[derive(Debug)]
-pub struct ParquetLogicalExtensionCodec;
+    #[derive(Debug)]
+    pub struct ParquetLogicalExtensionCodec;
 
-// TODO! This is a placeholder for now and needs to be implemented for real.
-impl LogicalExtensionCodec for ParquetLogicalExtensionCodec {
-    fn try_decode(
-        &self,
-        _buf: &[u8],
-        _inputs: &[datafusion_expr::LogicalPlan],
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<datafusion_expr::Extension> {
-        not_impl_err!("Method not implemented")
-    }
+    // TODO! This is a placeholder for now and needs to be implemented for real.
+    impl LogicalExtensionCodec for ParquetLogicalExtensionCodec {
+        fn try_decode(
+            &self,
+            _buf: &[u8],
+            _inputs: &[datafusion_expr::LogicalPlan],
+            _ctx: &TaskContext,
+        ) -> datafusion_common::Result<datafusion_expr::Extension> {
+            not_impl_err!("Method not implemented")
+        }
 
-    fn try_encode(
-        &self,
-        _node: &datafusion_expr::Extension,
-        _buf: &mut Vec<u8>,
-    ) -> datafusion_common::Result<()> {
-        not_impl_err!("Method not implemented")
-    }
+        fn try_encode(
+            &self,
+            _node: &datafusion_expr::Extension,
+            _buf: &mut Vec<u8>,
+        ) -> datafusion_common::Result<()> {
+            not_impl_err!("Method not implemented")
+        }
 
-    fn try_decode_table_provider(
-        &self,
-        _buf: &[u8],
-        _table_ref: &TableReference,
-        _schema: arrow::datatypes::SchemaRef,
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn datafusion::datasource::TableProvider>> {
-        not_impl_err!("Method not implemented")
-    }
+        fn try_decode_table_provider(
+            &self,
+            _buf: &[u8],
+            _table_ref: &TableReference,
+            _schema: arrow::datatypes::SchemaRef,
+            _ctx: &TaskContext,
+        ) -> datafusion_common::Result<Arc<dyn datafusion_catalog::TableProvider>>
+        {
+            not_impl_err!("Method not implemented")
+        }
 
-    fn try_encode_table_provider(
-        &self,
-        _table_ref: &TableReference,
-        _node: Arc<dyn datafusion::datasource::TableProvider>,
-        _buf: &mut Vec<u8>,
-    ) -> datafusion_common::Result<()> {
-        not_impl_err!("Method not implemented")
-    }
+        fn try_encode_table_provider(
+            &self,
+            _table_ref: &TableReference,
+            _node: Arc<dyn datafusion_catalog::TableProvider>,
+            _buf: &mut Vec<u8>,
+        ) -> datafusion_common::Result<()> {
+            not_impl_err!("Method not implemented")
+        }
 
-    fn try_decode_file_format(
-        &self,
-        buf: &[u8],
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
-        let proto = TableParquetOptionsProto::decode(buf).map_err(|e| {
-            DataFusionError::Execution(format!(
-                "Failed to decode TableParquetOptionsProto: {e:?}"
+        fn try_decode_file_format(
+            &self,
+            buf: &[u8],
+            _ctx: &TaskContext,
+        ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
+            let proto = TableParquetOptionsProto::decode(buf).map_err(|e| {
+                exec_datafusion_err!("Failed to decode TableParquetOptionsProto: {e:?}")
+            })?;
+            let options: TableParquetOptions = (&proto).into();
+            Ok(Arc::new(
+                datafusion_datasource_parquet::file_format::ParquetFormatFactory {
+                    options: Some(options),
+                },
             ))
-        })?;
-        let options: TableParquetOptions = (&proto).into();
-        Ok(Arc::new(ParquetFormatFactory {
-            options: Some(options),
-        }))
-    }
+        }
 
-    fn try_encode_file_format(
-        &self,
-        buf: &mut Vec<u8>,
-        node: Arc<dyn FileFormatFactory>,
-    ) -> datafusion_common::Result<()> {
-        let options = if let Some(parquet_factory) =
-            node.as_any().downcast_ref::<ParquetFormatFactory>()
-        {
-            parquet_factory.options.clone().unwrap_or_default()
-        } else {
-            return Err(DataFusionError::Execution(
-                "Unsupported FileFormatFactory type".to_string(),
-            ));
-        };
+        fn try_encode_file_format(
+            &self,
+            buf: &mut Vec<u8>,
+            node: Arc<dyn FileFormatFactory>,
+        ) -> datafusion_common::Result<()> {
+            use datafusion_datasource_parquet::file_format::ParquetFormatFactory;
+
+            let options = if let Some(parquet_factory) =
+                node.as_any().downcast_ref::<ParquetFormatFactory>()
+            {
+                parquet_factory.options.clone().unwrap_or_default()
+            } else {
+                return exec_err!("Unsupported FileFormatFactory type");
+            };
 
-        let proto = TableParquetOptionsProto::from_factory(&ParquetFormatFactory {
-            options: Some(options),
-        });
+            let proto = TableParquetOptionsProto::from_factory(&ParquetFormatFactory {
+                options: Some(options),
+            });
 
-        proto.encode(buf).map_err(|e| {
-            DataFusionError::Execution(format!(
-                "Failed to encode TableParquetOptionsProto: {e:?}"
-            ))
-        })?;
+            proto.encode(buf).map_err(|e| {
+                exec_datafusion_err!("Failed to encode TableParquetOptionsProto: {e:?}")
+            })?;
 
-        Ok(())
+            Ok(())
+        }
     }
 }
+#[cfg(feature = "parquet")]
+pub use parquet::ParquetLogicalExtensionCodec;
 
 #[derive(Debug)]
 pub struct ArrowLogicalExtensionCodec;
@@ -675,7 +687,7 @@ impl LogicalExtensionCodec for ArrowLogicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[datafusion_expr::LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<datafusion_expr::Extension> {
         not_impl_err!("Method not implemented")
     }
@@ -693,15 +705,15 @@ impl LogicalExtensionCodec for ArrowLogicalExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: arrow::datatypes::SchemaRef,
-        _ctx: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn datafusion::datasource::TableProvider>> {
+        _ctx: &TaskContext,
+    ) -> datafusion_common::Result<Arc<dyn datafusion_catalog::TableProvider>> {
         not_impl_err!("Method not implemented")
     }
 
     fn try_encode_table_provider(
         &self,
         _table_ref: &TableReference,
-        _node: Arc<dyn datafusion::datasource::TableProvider>,
+        _node: Arc<dyn datafusion_catalog::TableProvider>,
         _buf: &mut Vec<u8>,
     ) -> datafusion_common::Result<()> {
         not_impl_err!("Method not implemented")
@@ -710,7 +722,7 @@ impl LogicalExtensionCodec for ArrowLogicalExtensionCodec {
     fn try_decode_file_format(
         &self,
         __buf: &[u8],
-        __ctx: &SessionContext,
+        __ctx: &TaskContext,
     ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
         Ok(Arc::new(ArrowFormatFactory::new()))
     }
@@ -733,7 +745,7 @@ impl LogicalExtensionCodec for AvroLogicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[datafusion_expr::LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> datafusion_common::Result<datafusion_expr::Extension> {
         not_impl_err!("Method not implemented")
     }
@@ -751,15 +763,15 @@ impl LogicalExtensionCodec for AvroLogicalExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: arrow::datatypes::SchemaRef,
-        _cts: &SessionContext,
-    ) -> datafusion_common::Result<Arc<dyn datafusion::datasource::TableProvider>> {
+        _cts: &TaskContext,
+    ) -> datafusion_common::Result<Arc<dyn datafusion_catalog::TableProvider>> {
         not_impl_err!("Method not implemented")
     }
 
     fn try_encode_table_provider(
         &self,
         _table_ref: &TableReference,
-        _node: Arc<dyn datafusion::datasource::TableProvider>,
+        _node: Arc<dyn datafusion_catalog::TableProvider>,
         _buf: &mut Vec<u8>,
     ) -> datafusion_common::Result<()> {
         not_impl_err!("Method not implemented")
@@ -768,7 +780,7 @@ impl LogicalExtensionCodec for AvroLogicalExtensionCodec {
     fn try_decode_file_format(
         &self,
         __buf: &[u8],
-        __ctx: &SessionContext,
+        __ctx: &TaskContext,
     ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
         Ok(Arc::new(ArrowFormatFactory::new()))
     }
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 9f0489d6b0ea4..ed33d9fab1820 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -17,38 +17,39 @@
 
 use std::sync::Arc;
 
-use datafusion::execution::registry::FunctionRegistry;
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::datatype::DataTypeExt;
 use datafusion_common::{
-    exec_datafusion_err, internal_err, plan_datafusion_err, RecursionUnnestOption,
-    Result, ScalarValue, TableReference, UnnestOptions,
+    NullEquality, RecursionUnnestOption, Result, ScalarValue, TableReference,
+    UnnestOptions, exec_datafusion_err, internal_err, plan_datafusion_err,
 };
+use datafusion_execution::registry::FunctionRegistry;
 use datafusion_expr::dml::InsertOp;
-use datafusion_expr::expr::{Alias, Placeholder, Sort};
+use datafusion_expr::expr::{Alias, NullTreatment, Placeholder, Sort};
 use datafusion_expr::expr::{Unnest, WildcardOptions};
 use datafusion_expr::{
-    expr::{self, InList, WindowFunction},
-    logical_plan::{PlanType, StringifiedPlan},
     Between, BinaryExpr, Case, Cast, Expr, GroupingSet,
     GroupingSet::GroupingSets,
     JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound,
     WindowFrameUnits,
+    expr::{self, InList, WindowFunction},
+    logical_plan::{PlanType, StringifiedPlan},
 };
 use datafusion_expr::{ExprFunctionExt, WriteOp};
-use datafusion_proto_common::{from_proto::FromOptionalField, FromProtoError as Error};
+use datafusion_proto_common::{FromProtoError as Error, from_proto::FromOptionalField};
 
 use crate::protobuf::plan_type::PlanTypeEnum::{
     FinalPhysicalPlanWithSchema, InitialPhysicalPlanWithSchema,
 };
 use crate::protobuf::{
-    self,
+    self, AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType,
+    OptimizedPhysicalPlanType, PlaceholderNode, RollupNode,
     plan_type::PlanTypeEnum::{
         AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan,
         FinalPhysicalPlan, FinalPhysicalPlanWithStats, InitialLogicalPlan,
         InitialPhysicalPlan, InitialPhysicalPlanWithStats, OptimizedLogicalPlan,
         OptimizedPhysicalPlan, PhysicalPlanError,
     },
-    AnalyzedLogicalPlanType, CubeNode, GroupingSetNode, OptimizedLogicalPlanType,
-    OptimizedPhysicalPlanType, PlaceholderNode, RollupNode,
 };
 
 use super::LogicalExtensionCodec;
@@ -205,6 +206,7 @@ impl From<protobuf::JoinType> for JoinType {
             protobuf::JoinType::Leftanti => JoinType::LeftAnti,
             protobuf::JoinType::Rightanti => JoinType::RightAnti,
             protobuf::JoinType::Leftmark => JoinType::LeftMark,
+            protobuf::JoinType::Rightmark => JoinType::RightMark,
         }
     }
 }
@@ -218,6 +220,15 @@ impl From<protobuf::JoinConstraint> for JoinConstraint {
     }
 }
 
+impl From<protobuf::NullEquality> for NullEquality {
+    fn from(t: protobuf::NullEquality) -> Self {
+        match t {
+            protobuf::NullEquality::NullEqualsNothing => NullEquality::NullEqualsNothing,
+            protobuf::NullEquality::NullEqualsNull => NullEquality::NullEqualsNull,
+        }
+    }
+}
+
 impl From<protobuf::dml_node::Type> for WriteOp {
     fn from(t: protobuf::dml_node::Type) -> Self {
         match t {
@@ -229,6 +240,16 @@ impl From<protobuf::dml_node::Type> for WriteOp {
             }
             protobuf::dml_node::Type::InsertReplace => WriteOp::Insert(InsertOp::Replace),
             protobuf::dml_node::Type::Ctas => WriteOp::Ctas,
+            protobuf::dml_node::Type::Truncate => WriteOp::Truncate,
+        }
+    }
+}
+
+impl From<protobuf::NullTreatment> for NullTreatment {
+    fn from(t: protobuf::NullTreatment) -> Self {
+        match t {
+            protobuf::NullTreatment::RespectNulls => NullTreatment::RespectNulls,
+            protobuf::NullTreatment::IgnoreNulls => NullTreatment::IgnoreNulls,
         }
     }
 }
@@ -268,7 +289,7 @@ pub fn parse_expr(
         ExprType::Column(column) => Ok(Expr::Column(column.into())),
         ExprType::Literal(literal) => {
             let scalar_value: ScalarValue = literal.try_into()?;
-            Ok(Expr::Literal(scalar_value))
+            Ok(Expr::Literal(scalar_value, None))
         }
         ExprType::WindowExpr(expr) => {
             let window_function = expr
@@ -291,8 +312,20 @@ pub fn parse_expr(
                     exec_datafusion_err!("missing window frame during deserialization")
                 })?;
 
-            // TODO: support proto for null treatment
-            match window_function {
+            let null_treatment = match expr.null_treatment {
+                Some(null_treatment) => {
+                    let null_treatment  =  protobuf::NullTreatment::try_from(null_treatment)
+                    .map_err(|_| {
+                        proto_error(format!(
+                            "Received a WindowExprNode message with unknown NullTreatment {null_treatment}",
+                        ))
+                    })?;
+                    Some(NullTreatment::from(null_treatment))
+                }
+                None => None,
+            };
+
+            let agg_fn = match window_function {
                 window_expr_node::WindowFunction::Udaf(udaf_name) => {
                     let udaf_function = match &expr.fun_definition {
                         Some(buf) => codec.try_decode_udaf(udaf_name, buf)?,
@@ -300,17 +333,7 @@ pub fn parse_expr(
                             .udaf(udaf_name)
                             .or_else(|_| codec.try_decode_udaf(udaf_name, &[]))?,
                     };
-
-                    let args = parse_exprs(&expr.exprs, registry, codec)?;
-                    Expr::WindowFunction(WindowFunction::new(
-                        expr::WindowFunctionDefinition::AggregateUDF(udaf_function),
-                        args,
-                    ))
-                    .partition_by(partition_by)
-                    .order_by(order_by)
-                    .window_frame(window_frame)
-                    .build()
-                    .map_err(Error::DataFusionError)
+                    expr::WindowFunctionDefinition::AggregateUDF(udaf_function)
                 }
                 window_expr_node::WindowFunction::Udwf(udwf_name) => {
                     let udwf_function = match &expr.fun_definition {
@@ -319,19 +342,28 @@ pub fn parse_expr(
                             .udwf(udwf_name)
                             .or_else(|_| codec.try_decode_udwf(udwf_name, &[]))?,
                     };
-
-                    let args = parse_exprs(&expr.exprs, registry, codec)?;
-                    Expr::WindowFunction(WindowFunction::new(
-                        expr::WindowFunctionDefinition::WindowUDF(udwf_function),
-                        args,
-                    ))
-                    .partition_by(partition_by)
-                    .order_by(order_by)
-                    .window_frame(window_frame)
-                    .build()
-                    .map_err(Error::DataFusionError)
+                    expr::WindowFunctionDefinition::WindowUDF(udwf_function)
                 }
+            };
+
+            let args = parse_exprs(&expr.exprs, registry, codec)?;
+            let mut builder = Expr::from(WindowFunction::new(agg_fn, args))
+                .partition_by(partition_by)
+                .order_by(order_by)
+                .window_frame(window_frame)
+                .null_treatment(null_treatment);
+
+            if expr.distinct {
+                builder = builder.distinct();
+            };
+
+            if let Some(filter) =
+                parse_optional_expr(expr.filter.as_deref(), registry, codec)?
+            {
+                builder = builder.filter(filter);
             }
+
+            builder.build().map_err(Error::DataFusionError)
         }
         ExprType::Alias(alias) => Ok(Expr::Alias(Alias::new(
             parse_required_expr(alias.expr.as_deref(), registry, "expr", codec)?,
@@ -496,8 +528,11 @@ pub fn parse_expr(
                 "expr",
                 codec,
             )?);
-            let data_type = cast.arrow_type.as_ref().required("arrow_type")?;
-            Ok(Expr::Cast(Cast::new(expr, data_type)))
+            let data_type: DataType = cast.arrow_type.as_ref().required("arrow_type")?;
+            let field = data_type
+                .into_nullable_field()
+                .with_nullable(cast.nullable.unwrap_or(true));
+            Ok(Expr::Cast(Cast::new_from_field(expr, Arc::new(field))))
         }
         ExprType::TryCast(cast) => {
             let expr = Box::new(parse_required_expr(
@@ -506,8 +541,14 @@ pub fn parse_expr(
                 "expr",
                 codec,
             )?);
-            let data_type = cast.arrow_type.as_ref().required("arrow_type")?;
-            Ok(Expr::TryCast(TryCast::new(expr, data_type)))
+            let data_type: DataType = cast.arrow_type.as_ref().required("arrow_type")?;
+            let field = data_type
+                .into_nullable_field()
+                .with_nullable(cast.nullable.unwrap_or(true));
+            Ok(Expr::TryCast(TryCast::new_from_field(
+                expr,
+                Arc::new(field),
+            )))
         }
         ExprType::Negative(negative) => Ok(Expr::Negative(Box::new(
             parse_required_expr(negative.expr.as_deref(), registry, "expr", codec)?,
@@ -560,17 +601,26 @@ pub fn parse_expr(
                     .udaf(&pb.fun_name)
                     .or_else(|_| codec.try_decode_udaf(&pb.fun_name, &[]))?,
             };
+            let null_treatment = match pb.null_treatment {
+                Some(null_treatment) => {
+                    let null_treatment  =  protobuf::NullTreatment::try_from(null_treatment)
+                    .map_err(|_| {
+                        proto_error(format!(
+                            "Received an AggregateUdfExprNode message with unknown NullTreatment {null_treatment}",
+                        ))
+                    })?;
+                    Some(NullTreatment::from(null_treatment))
+                }
+                None => None,
+            };
 
             Ok(Expr::AggregateFunction(expr::AggregateFunction::new_udf(
                 agg_fn,
                 parse_exprs(&pb.args, registry, codec)?,
                 pb.distinct,
                 parse_optional_expr(pb.filter.as_deref(), registry, codec)?.map(Box::new),
-                match pb.order_by.len() {
-                    0 => None,
-                    _ => Some(parse_sorts(&pb.order_by, registry, codec)?),
-                },
-                None,
+                parse_sorts(&pb.order_by, registry, codec)?,
+                null_treatment,
             )))
         }
 
@@ -587,12 +637,25 @@ pub fn parse_expr(
         ExprType::Rollup(RollupNode { expr }) => Ok(Expr::GroupingSet(
             GroupingSet::Rollup(parse_exprs(expr, registry, codec)?),
         )),
-        ExprType::Placeholder(PlaceholderNode { id, data_type }) => match data_type {
-            None => Ok(Expr::Placeholder(Placeholder::new(id.clone(), None))),
-            Some(data_type) => Ok(Expr::Placeholder(Placeholder::new(
+        ExprType::Placeholder(PlaceholderNode {
+            id,
+            data_type,
+            nullable,
+            metadata,
+        }) => match data_type {
+            None => Ok(Expr::Placeholder(Placeholder::new_with_field(
                 id.clone(),
-                Some(data_type.try_into()?),
+                None,
             ))),
+            Some(data_type) => {
+                let field =
+                    Field::new("", data_type.try_into()?, nullable.unwrap_or(true))
+                        .with_metadata(metadata.clone());
+                Ok(Expr::Placeholder(Placeholder::new_with_field(
+                    id.clone(),
+                    Some(field.into()),
+                )))
+            }
         },
     }
 }
@@ -676,6 +739,10 @@ pub fn from_proto_binary_op(op: &str) -> Result<Operator, Error> {
         "RegexMatch" => Ok(Operator::RegexMatch),
         "RegexNotIMatch" => Ok(Operator::RegexNotIMatch),
         "RegexNotMatch" => Ok(Operator::RegexNotMatch),
+        "LikeMatch" => Ok(Operator::LikeMatch),
+        "ILikeMatch" => Ok(Operator::ILikeMatch),
+        "NotLikeMatch" => Ok(Operator::NotLikeMatch),
+        "NotILikeMatch" => Ok(Operator::NotILikeMatch),
         "StringConcat" => Ok(Operator::StringConcat),
         "AtArrow" => Ok(Operator::AtArrow),
         "ArrowAt" => Ok(Operator::ArrowAt),
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index d934b24dc3411..1e0264690d4fb 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -21,64 +21,62 @@ use std::sync::Arc;
 
 use crate::protobuf::logical_plan_node::LogicalPlanType::CustomScan;
 use crate::protobuf::{
-    dml_node, ColumnUnnestListItem, ColumnUnnestListRecursion, CteWorkTableScanNode,
-    CustomTableScanNode, DmlNode, SortExprNodeCollection,
+    ColumnUnnestListItem, ColumnUnnestListRecursion, CteWorkTableScanNode,
+    CustomTableScanNode, DmlNode, SortExprNodeCollection, dml_node,
 };
 use crate::{
     convert_required, into_required,
     protobuf::{
-        self, listing_table_scan_node::FileFormatType,
-        logical_plan_node::LogicalPlanType, LogicalExtensionNode, LogicalPlanNode,
+        self, LogicalExtensionNode, LogicalPlanNode,
+        listing_table_scan_node::FileFormatType, logical_plan_node::LogicalPlanType,
     },
 };
 
-use crate::protobuf::{proto_error, ToProtoError};
-use arrow::datatypes::{DataType, Schema, SchemaBuilder, SchemaRef};
-use datafusion::datasource::cte_worktable::CteWorkTable;
-#[cfg(feature = "avro")]
-use datafusion::datasource::file_format::avro::AvroFormat;
-#[cfg(feature = "parquet")]
-use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::file_format::{
-    file_type_to_format, format_as_file_type, FileFormatFactory,
-};
-use datafusion::{
-    datasource::{
-        file_format::{
-            csv::CsvFormat, json::JsonFormat as OtherNdJsonFormat, FileFormat,
-        },
-        listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
-        view::ViewTable,
-        TableProvider,
-    },
-    datasource::{provider_as_source, source_as_provider},
-    prelude::SessionContext,
-};
+use crate::protobuf::{ToProtoError, proto_error};
+use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef};
+use datafusion_catalog::cte_worktable::CteWorkTable;
 use datafusion_common::file_options::file_type::FileType;
 use datafusion_common::{
-    context, internal_datafusion_err, internal_err, not_impl_err, plan_err,
-    DataFusionError, Result, TableReference, ToDFSchema,
+    Result, TableReference, ToDFSchema, assert_or_internal_err, context,
+    internal_datafusion_err, internal_err, not_impl_err, plan_err,
+};
+use datafusion_datasource::file_format::FileFormat;
+use datafusion_datasource::file_format::{
+    FileFormatFactory, file_type_to_format, format_as_file_type,
+};
+use datafusion_datasource_arrow::file_format::{ArrowFormat, ArrowFormatFactory};
+#[cfg(feature = "avro")]
+use datafusion_datasource_avro::file_format::AvroFormat;
+use datafusion_datasource_csv::file_format::{CsvFormat, CsvFormatFactory};
+use datafusion_datasource_json::file_format::{
+    JsonFormat as OtherNdJsonFormat, JsonFormatFactory,
 };
+#[cfg(feature = "parquet")]
+use datafusion_datasource_parquet::file_format::{ParquetFormat, ParquetFormatFactory};
 use datafusion_expr::{
-    dml,
-    logical_plan::{
-        builder::project, Aggregate, CreateCatalog, CreateCatalogSchema,
-        CreateExternalTable, CreateView, DdlStatement, Distinct, EmptyRelation,
-        Extension, Join, JoinConstraint, Prepare, Projection, Repartition, Sort,
-        SubqueryAlias, TableScan, Values, Window,
-    },
-    DistinctOn, DropView, Expr, LogicalPlan, LogicalPlanBuilder, ScalarUDF, SortExpr,
-    Statement, WindowUDF,
+    AggregateUDF, DmlStatement, FetchType, RecursiveQuery, SkipType, TableSource, Unnest,
 };
 use datafusion_expr::{
-    AggregateUDF, ColumnUnnestList, DmlStatement, FetchType, RecursiveQuery, SkipType,
-    TableSource, Unnest,
+    DistinctOn, DropView, Expr, LogicalPlan, LogicalPlanBuilder, ScalarUDF, SortExpr,
+    Statement, WindowUDF, dml,
+    logical_plan::{
+        Aggregate, CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateView,
+        DdlStatement, Distinct, EmptyRelation, Extension, Join, JoinConstraint, Prepare,
+        Projection, Repartition, Sort, SubqueryAlias, TableScan, Values, Window,
+        builder::project,
+    },
 };
 
 use self::to_proto::{serialize_expr, serialize_exprs};
 use crate::logical_plan::to_proto::serialize_sorts;
-use prost::bytes::BufMut;
+use datafusion_catalog::TableProvider;
+use datafusion_catalog::default_table_source::{provider_as_source, source_as_provider};
+use datafusion_catalog::view::ViewTable;
+use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_execution::TaskContext;
 use prost::Message;
+use prost::bytes::BufMut;
 
 pub mod file_formats;
 pub mod from_proto;
@@ -96,7 +94,7 @@ pub trait AsLogicalPlan: Debug + Send + Sync + Clone {
 
     fn try_into_logical_plan(
         &self,
-        ctx: &SessionContext,
+        ctx: &TaskContext,
         extension_codec: &dyn LogicalExtensionCodec,
     ) -> Result<LogicalPlan>;
 
@@ -108,12 +106,12 @@ pub trait AsLogicalPlan: Debug + Send + Sync + Clone {
         Self: Sized;
 }
 
-pub trait LogicalExtensionCodec: Debug + Send + Sync {
+pub trait LogicalExtensionCodec: Debug + Send + Sync + std::any::Any {
     fn try_decode(
         &self,
         buf: &[u8],
         inputs: &[LogicalPlan],
-        ctx: &SessionContext,
+        ctx: &TaskContext,
     ) -> Result<Extension>;
 
     fn try_encode(&self, node: &Extension, buf: &mut Vec<u8>) -> Result<()>;
@@ -123,7 +121,7 @@ pub trait LogicalExtensionCodec: Debug + Send + Sync {
         buf: &[u8],
         table_ref: &TableReference,
         schema: SchemaRef,
-        ctx: &SessionContext,
+        ctx: &TaskContext,
     ) -> Result<Arc<dyn TableProvider>>;
 
     fn try_encode_table_provider(
@@ -136,7 +134,7 @@ pub trait LogicalExtensionCodec: Debug + Send + Sync {
     fn try_decode_file_format(
         &self,
         _buf: &[u8],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn FileFormatFactory>> {
         not_impl_err!("LogicalExtensionCodec is not provided for file format")
     }
@@ -184,7 +182,7 @@ impl LogicalExtensionCodec for DefaultLogicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Extension> {
         not_impl_err!("LogicalExtensionCodec is not provided")
     }
@@ -198,7 +196,7 @@ impl LogicalExtensionCodec for DefaultLogicalExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: SchemaRef,
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn TableProvider>> {
         not_impl_err!("LogicalExtensionCodec is not provided")
     }
@@ -211,6 +209,99 @@ impl LogicalExtensionCodec for DefaultLogicalExtensionCodec {
     ) -> Result<()> {
         not_impl_err!("LogicalExtensionCodec is not provided")
     }
+
+    fn try_decode_file_format(
+        &self,
+        buf: &[u8],
+        ctx: &TaskContext,
+    ) -> Result<Arc<dyn FileFormatFactory>> {
+        let proto = protobuf::FileFormatProto::decode(buf).map_err(|e| {
+            internal_datafusion_err!("Failed to decode FileFormatProto: {e}")
+        })?;
+
+        let kind = protobuf::FileFormatKind::try_from(proto.kind).map_err(|_| {
+            internal_datafusion_err!("Unknown FileFormatKind: {}", proto.kind)
+        })?;
+
+        match kind {
+            protobuf::FileFormatKind::Csv => file_formats::CsvLogicalExtensionCodec
+                .try_decode_file_format(&proto.encoded_file_format, ctx),
+            protobuf::FileFormatKind::Json => file_formats::JsonLogicalExtensionCodec
+                .try_decode_file_format(&proto.encoded_file_format, ctx),
+            #[cfg(feature = "parquet")]
+            protobuf::FileFormatKind::Parquet => {
+                file_formats::ParquetLogicalExtensionCodec
+                    .try_decode_file_format(&proto.encoded_file_format, ctx)
+            }
+            protobuf::FileFormatKind::Arrow => file_formats::ArrowLogicalExtensionCodec
+                .try_decode_file_format(&proto.encoded_file_format, ctx),
+            protobuf::FileFormatKind::Avro => file_formats::AvroLogicalExtensionCodec
+                .try_decode_file_format(&proto.encoded_file_format, ctx),
+            #[cfg(not(feature = "parquet"))]
+            protobuf::FileFormatKind::Parquet => {
+                not_impl_err!("Parquet support requires the 'parquet' feature")
+            }
+            protobuf::FileFormatKind::Unspecified => {
+                not_impl_err!("Unspecified file format kind")
+            }
+        }
+    }
+
+    fn try_encode_file_format(
+        &self,
+        buf: &mut Vec<u8>,
+        node: Arc<dyn FileFormatFactory>,
+    ) -> Result<()> {
+        let mut encoded_file_format = Vec::new();
+
+        let kind = if node.as_any().downcast_ref::<CsvFormatFactory>().is_some() {
+            file_formats::CsvLogicalExtensionCodec
+                .try_encode_file_format(&mut encoded_file_format, Arc::clone(&node))?;
+            protobuf::FileFormatKind::Csv
+        } else if node.as_any().downcast_ref::<JsonFormatFactory>().is_some() {
+            file_formats::JsonLogicalExtensionCodec
+                .try_encode_file_format(&mut encoded_file_format, Arc::clone(&node))?;
+            protobuf::FileFormatKind::Json
+        } else if node.as_any().downcast_ref::<ArrowFormatFactory>().is_some() {
+            file_formats::ArrowLogicalExtensionCodec
+                .try_encode_file_format(&mut encoded_file_format, Arc::clone(&node))?;
+            protobuf::FileFormatKind::Arrow
+        } else {
+            #[cfg(feature = "parquet")]
+            {
+                if node
+                    .as_any()
+                    .downcast_ref::<ParquetFormatFactory>()
+                    .is_some()
+                {
+                    file_formats::ParquetLogicalExtensionCodec.try_encode_file_format(
+                        &mut encoded_file_format,
+                        Arc::clone(&node),
+                    )?;
+                    protobuf::FileFormatKind::Parquet
+                } else {
+                    return not_impl_err!(
+                        "Unsupported FileFormatFactory type for DefaultLogicalExtensionCodec"
+                    );
+                }
+            }
+            #[cfg(not(feature = "parquet"))]
+            {
+                return not_impl_err!(
+                    "Unsupported FileFormatFactory type for DefaultLogicalExtensionCodec"
+                );
+            }
+        };
+
+        let proto = protobuf::FileFormatProto {
+            kind: kind as i32,
+            encoded_file_format,
+        };
+        proto.encode(buf).map_err(|e| {
+            internal_datafusion_err!("Failed to encode FileFormatProto: {e}")
+        })?;
+        Ok(())
+    }
 }
 
 #[macro_export]
@@ -229,9 +320,9 @@ fn from_table_reference(
     error_context: &str,
 ) -> Result<TableReference> {
     let table_ref = table_ref.ok_or_else(|| {
-        DataFusionError::Internal(format!(
+        internal_datafusion_err!(
             "Protobuf deserialization error, {error_context} was missing required field name."
-        ))
+        )
     })?;
 
     Ok(table_ref.clone().try_into()?)
@@ -242,7 +333,7 @@ fn from_table_reference(
 /// serialized by [from_table_source]
 fn to_table_source(
     node: &Option<Box<LogicalPlanNode>>,
-    ctx: &SessionContext,
+    ctx: &TaskContext,
     extension_codec: &dyn LogicalExtensionCodec,
 ) -> Result<Arc<dyn TableSource>> {
     if let Some(node) = node {
@@ -281,9 +372,8 @@ impl AsLogicalPlan for LogicalPlanNode {
     where
         Self: Sized,
     {
-        LogicalPlanNode::decode(buf).map_err(|e| {
-            DataFusionError::Internal(format!("failed to decode logical plan: {e:?}"))
-        })
+        LogicalPlanNode::decode(buf)
+            .map_err(|e| internal_datafusion_err!("failed to decode logical plan: {e:?}"))
     }
 
     fn try_encode<B>(&self, buf: &mut B) -> Result<()>
@@ -291,14 +381,13 @@ impl AsLogicalPlan for LogicalPlanNode {
         B: BufMut,
         Self: Sized,
     {
-        self.encode(buf).map_err(|e| {
-            DataFusionError::Internal(format!("failed to encode logical plan: {e:?}"))
-        })
+        self.encode(buf)
+            .map_err(|e| internal_datafusion_err!("failed to encode logical plan: {e:?}"))
     }
 
     fn try_into_logical_plan(
         &self,
-        ctx: &SessionContext,
+        ctx: &TaskContext,
         extension_codec: &dyn LogicalExtensionCodec,
     ) -> Result<LogicalPlan> {
         let plan = self.logical_plan_type.as_ref().ok_or_else(|| {
@@ -379,16 +468,6 @@ impl AsLogicalPlan for LogicalPlanNode {
             LogicalPlanType::ListingScan(scan) => {
                 let schema: Schema = convert_required!(scan.schema)?;
 
-                let mut projection = None;
-                if let Some(columns) = &scan.projection {
-                    let column_indices = columns
-                        .columns
-                        .iter()
-                        .map(|name| schema.index_of(name))
-                        .collect::<Result<Vec<usize>, _>>()?;
-                    projection = Some(column_indices);
-                }
-
                 let filters =
                     from_proto::parse_exprs(&scan.filters, ctx, extension_codec)?;
 
@@ -438,14 +517,20 @@ impl AsLogicalPlan for LogicalPlanNode {
                             }
                             Arc::new(json)
                         }
-                        #[cfg_attr(not(feature = "avro"), allow(unused_variables))]
                         FileFormatType::Avro(..) => {
-                            #[cfg(feature = "avro")] 
+                            #[cfg(feature = "avro")]
                             {
                                 Arc::new(AvroFormat)
                             }
                             #[cfg(not(feature = "avro"))]
-                            panic!("Unable to process avro file since `avro` feature is not enabled");
+                            {
+                                panic!(
+                                    "Unable to process avro file since `avro` feature is not enabled"
+                                );
+                            }
+                        }
+                        FileFormatType::Arrow(..) => {
+                            Arc::new(ArrowFormat)
                         }
                     };
 
@@ -484,15 +569,22 @@ impl AsLogicalPlan for LogicalPlanNode {
                         .with_schema(Arc::new(schema));
 
                 let provider = ListingTable::try_new(config)?.with_cache(
-                    ctx.state()
-                        .runtime_env()
-                        .cache_manager
-                        .get_file_statistic_cache(),
+                    ctx.runtime_env().cache_manager.get_file_statistic_cache(),
                 );
 
                 let table_name =
                     from_table_reference(scan.table_name.as_ref(), "ListingTableScan")?;
 
+                let mut projection = None;
+                if let Some(columns) = &scan.projection {
+                    let column_indices = columns
+                        .columns
+                        .iter()
+                        .map(|name| provider.schema().index_of(name))
+                        .collect::<Result<Vec<usize>, _>>()?;
+                    projection = Some(column_indices);
+                }
+
                 LogicalPlanBuilder::scan_with_filters(
                     table_name,
                     provider_as_source(Arc::new(provider)),
@@ -546,7 +638,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                     .build()
             }
             LogicalPlanType::Repartition(repartition) => {
-                use datafusion::logical_expr::Partitioning;
+                use datafusion_expr::Partitioning;
                 let input: LogicalPlan =
                     into_logical_plan!(repartition.input, ctx, extension_codec)?;
                 use protobuf::repartition_node::PartitionMethod;
@@ -578,15 +670,15 @@ impl AsLogicalPlan for LogicalPlanNode {
             }
             LogicalPlanType::CreateExternalTable(create_extern_table) => {
                 let pb_schema = (create_extern_table.schema.clone()).ok_or_else(|| {
-                    DataFusionError::Internal(String::from(
+                    internal_datafusion_err!(
                         "Protobuf deserialization error, CreateExternalTableNode was missing required field schema."
-                    ))
+                    )
                 })?;
 
                 let constraints = (create_extern_table.constraints.clone()).ok_or_else(|| {
-                    DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, CreateExternalTableNode was missing required table constraints.",
-                    ))
+                    internal_datafusion_err!(
+                        "Protobuf deserialization error, CreateExternalTableNode was missing required table constraints."
+                    )
                 })?;
                 let definition = if !create_extern_table.definition.is_empty() {
                     Some(create_extern_table.definition.clone())
@@ -594,11 +686,6 @@ impl AsLogicalPlan for LogicalPlanNode {
                     None
                 };
 
-                let file_type = create_extern_table.file_type.as_str();
-                if ctx.table_factory(file_type).is_none() {
-                    internal_err!("No TableProviderFactory for file type: {file_type}")?
-                }
-
                 let mut order_exprs = vec![];
                 for expr in &create_extern_table.order_exprs {
                     order_exprs.push(from_proto::parse_sorts(
@@ -616,33 +703,33 @@ impl AsLogicalPlan for LogicalPlanNode {
                 }
 
                 Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable(
-                    CreateExternalTable {
-                        schema: pb_schema.try_into()?,
-                        name: from_table_reference(
+                    CreateExternalTable::builder(
+                        from_table_reference(
                             create_extern_table.name.as_ref(),
                             "CreateExternalTable",
                         )?,
-                        location: create_extern_table.location.clone(),
-                        file_type: create_extern_table.file_type.clone(),
-                        table_partition_cols: create_extern_table
-                            .table_partition_cols
-                            .clone(),
-                        order_exprs,
-                        if_not_exists: create_extern_table.if_not_exists,
-                        temporary: create_extern_table.temporary,
-                        definition,
-                        unbounded: create_extern_table.unbounded,
-                        options: create_extern_table.options.clone(),
-                        constraints: constraints.into(),
-                        column_defaults,
-                    },
+                        create_extern_table.location.clone(),
+                        create_extern_table.file_type.clone(),
+                        pb_schema.try_into()?,
+                    )
+                    .with_partition_cols(create_extern_table.table_partition_cols.clone())
+                    .with_order_exprs(order_exprs)
+                    .with_if_not_exists(create_extern_table.if_not_exists)
+                    .with_or_replace(create_extern_table.or_replace)
+                    .with_temporary(create_extern_table.temporary)
+                    .with_definition(definition)
+                    .with_unbounded(create_extern_table.unbounded)
+                    .with_options(create_extern_table.options.clone())
+                    .with_constraints(constraints.into())
+                    .with_column_defaults(column_defaults)
+                    .build(),
                 )))
             }
             LogicalPlanType::CreateView(create_view) => {
                 let plan = create_view
-                    .input.clone().ok_or_else(|| DataFusionError::Internal(String::from(
-                    "Protobuf deserialization error, CreateViewNode has invalid LogicalPlan input.",
-                )))?
+                    .input.clone().ok_or_else(|| internal_datafusion_err!(
+                    "Protobuf deserialization error, CreateViewNode has invalid LogicalPlan input."
+                ))?
                     .try_into_logical_plan(ctx, extension_codec)?;
                 let definition = if !create_view.definition.is_empty() {
                     Some(create_view.definition.clone())
@@ -660,9 +747,9 @@ impl AsLogicalPlan for LogicalPlanNode {
             }
             LogicalPlanType::CreateCatalogSchema(create_catalog_schema) => {
                 let pb_schema = (create_catalog_schema.schema.clone()).ok_or_else(|| {
-                    DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, CreateCatalogSchemaNode was missing required field schema.",
-                    ))
+                    internal_datafusion_err!(
+                        "Protobuf deserialization error, CreateCatalogSchemaNode was missing required field schema."
+                    )
                 })?;
 
                 Ok(LogicalPlan::Ddl(DdlStatement::CreateCatalogSchema(
@@ -675,9 +762,9 @@ impl AsLogicalPlan for LogicalPlanNode {
             }
             LogicalPlanType::CreateCatalog(create_catalog) => {
                 let pb_schema = (create_catalog.schema.clone()).ok_or_else(|| {
-                    DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, CreateCatalogNode was missing required field schema.",
-                    ))
+                    internal_datafusion_err!(
+                        "Protobuf deserialization error, CreateCatalogNode was missing required field schema."
+                    )
                 })?;
 
                 Ok(LogicalPlan::Ddl(DdlStatement::CreateCatalog(
@@ -785,11 +872,10 @@ impl AsLogicalPlan for LogicalPlanNode {
                 builder.build()
             }
             LogicalPlanType::Union(union) => {
-                if union.inputs.len() < 2 {
-                    return  Err( DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, Union was require at least two input.",
-                    )));
-                }
+                assert_or_internal_err!(
+                    union.inputs.len() >= 2,
+                    "Protobuf deserialization error, Union requires at least two inputs."
+                );
                 let (first, rest) = union.inputs.split_first().unwrap();
                 let mut builder = LogicalPlanBuilder::from(
                     first.try_into_logical_plan(ctx, extension_codec)?,
@@ -886,9 +972,33 @@ impl AsLogicalPlan for LogicalPlanNode {
                     .iter()
                     .map(DataType::try_from)
                     .collect::<Result<_, _>>()?;
-                LogicalPlanBuilder::from(input)
-                    .prepare(prepare.name.clone(), data_types)?
-                    .build()
+                let fields: Vec<Field> = prepare
+                    .fields
+                    .iter()
+                    .map(Field::try_from)
+                    .collect::<Result<_, _>>()?;
+
+                // If the fields are empty this may have been generated by an
+                // earlier version of DataFusion, in which case the DataTypes
+                // can be used to construct the plan.
+                if fields.is_empty() {
+                    LogicalPlanBuilder::from(input)
+                        .prepare(
+                            prepare.name.clone(),
+                            data_types
+                                .into_iter()
+                                .map(|dt| Field::new("", dt, true).into())
+                                .collect(),
+                        )?
+                        .build()
+                } else {
+                    LogicalPlanBuilder::from(input)
+                        .prepare(
+                            prepare.name.clone(),
+                            fields.into_iter().map(|f| f.into()).collect(),
+                        )?
+                        .build()
+                }
             }
             LogicalPlanType::DropView(dropview) => {
                 Ok(LogicalPlan::Ddl(DdlStatement::DropView(DropView {
@@ -905,67 +1015,40 @@ impl AsLogicalPlan for LogicalPlanNode {
                     extension_codec.try_decode_file_format(&copy.file_type, ctx)?,
                 );
 
-                Ok(LogicalPlan::Copy(dml::CopyTo {
-                    input: Arc::new(input),
-                    output_url: copy.output_url.clone(),
-                    partition_by: copy.partition_by.clone(),
+                Ok(LogicalPlan::Copy(dml::CopyTo::new(
+                    Arc::new(input),
+                    copy.output_url.clone(),
+                    copy.partition_by.clone(),
                     file_type,
-                    options: Default::default(),
-                }))
+                    Default::default(),
+                )))
             }
             LogicalPlanType::Unnest(unnest) => {
                 let input: LogicalPlan =
                     into_logical_plan!(unnest.input, ctx, extension_codec)?;
-                Ok(LogicalPlan::Unnest(Unnest {
-                    input: Arc::new(input),
-                    exec_columns: unnest.exec_columns.iter().map(|c| c.into()).collect(),
-                    list_type_columns: unnest
-                        .list_type_columns
-                        .iter()
-                        .map(|c| {
-                            let recursion_item = c.recursion.as_ref().unwrap();
-                            (
-                                c.input_index as _,
-                                ColumnUnnestList {
-                                    output_column: recursion_item
-                                        .output_column
-                                        .as_ref()
-                                        .unwrap()
-                                        .into(),
-                                    depth: recursion_item.depth as _,
-                                },
-                            )
-                        })
-                        .collect(),
-                    struct_type_columns: unnest
-                        .struct_type_columns
-                        .iter()
-                        .map(|c| *c as usize)
-                        .collect(),
-                    dependency_indices: unnest
-                        .dependency_indices
-                        .iter()
-                        .map(|c| *c as usize)
-                        .collect(),
-                    schema: Arc::new(convert_required!(unnest.schema)?),
-                    options: into_required!(unnest.options)?,
-                }))
+
+                LogicalPlanBuilder::from(input)
+                    .unnest_columns_with_options(
+                        unnest.exec_columns.iter().map(|c| c.into()).collect(),
+                        into_required!(unnest.options)?,
+                    )?
+                    .build()
             }
             LogicalPlanType::RecursiveQuery(recursive_query_node) => {
                 let static_term = recursive_query_node
                     .static_term
                     .as_ref()
-                    .ok_or_else(|| DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, RecursiveQueryNode was missing required field static_term.",
-                    )))?
+                    .ok_or_else(|| internal_datafusion_err!(
+                        "Protobuf deserialization error, RecursiveQueryNode was missing required field static_term."
+                    ))?
                     .try_into_logical_plan(ctx, extension_codec)?;
 
                 let recursive_term = recursive_query_node
                     .recursive_term
                     .as_ref()
-                    .ok_or_else(|| DataFusionError::Internal(String::from(
-                        "Protobuf deserialization error, RecursiveQueryNode was missing required field recursive_term.",
-                    )))?
+                    .ok_or_else(|| internal_datafusion_err!(
+                        "Protobuf deserialization error, RecursiveQueryNode was missing required field recursive_term."
+                    ))?
                     .try_into_logical_plan(ctx, extension_codec)?;
 
                 Ok(LogicalPlan::RecursiveQuery(RecursiveQuery {
@@ -986,14 +1069,14 @@ impl AsLogicalPlan for LogicalPlanNode {
                 )?
                 .build()
             }
-            LogicalPlanType::Dml(dml_node) => Ok(LogicalPlan::Dml(
-                datafusion::logical_expr::DmlStatement::new(
+            LogicalPlanType::Dml(dml_node) => {
+                Ok(LogicalPlan::Dml(datafusion_expr::DmlStatement::new(
                     from_table_reference(dml_node.table_name.as_ref(), "DML ")?,
                     to_table_source(&dml_node.target, ctx, extension_codec)?,
                     dml_node.dml_type().into(),
                     Arc::new(into_logical_plan!(dml_node.input, ctx, extension_codec)?),
-                ),
-            )),
+                )))
+            }
         }
     }
 
@@ -1085,13 +1168,18 @@ impl AsLogicalPlan for LogicalPlanNode {
                                 Some(FileFormatType::Avro(protobuf::AvroFormat {}))
                         }
 
+                        if any.is::<ArrowFormat>() {
+                            maybe_some_type =
+                                Some(FileFormatType::Arrow(protobuf::ArrowFormat {}))
+                        }
+
                         if let Some(file_format_type) = maybe_some_type {
                             file_format_type
                         } else {
                             return Err(proto_error(format!(
-                            "Error converting file format, {:?} is invalid as a datafusion format.",
-                            listing_table.options().format
-                        )));
+                                "Error deserializing unknown file format: {:?}",
+                                listing_table.options().format
+                            )));
                         }
                     };
 
@@ -1327,7 +1415,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                 filter,
                 join_type,
                 join_constraint,
-                null_equals_null,
+                null_equality,
                 ..
             }) => {
                 let left: LogicalPlanNode = LogicalPlanNode::try_from_logical_plan(
@@ -1352,6 +1440,8 @@ impl AsLogicalPlan for LogicalPlanNode {
                 let join_type: protobuf::JoinType = join_type.to_owned().into();
                 let join_constraint: protobuf::JoinConstraint =
                     join_constraint.to_owned().into();
+                let null_equality: protobuf::NullEquality =
+                    null_equality.to_owned().into();
                 let filter = filter
                     .as_ref()
                     .map(|e| serialize_expr(e, extension_codec))
@@ -1365,7 +1455,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                             join_constraint: join_constraint.into(),
                             left_join_key,
                             right_join_key,
-                            null_equals_null: *null_equals_null,
+                            null_equality: null_equality.into(),
                             filter,
                         },
                     ))),
@@ -1435,14 +1525,14 @@ impl AsLogicalPlan for LogicalPlanNode {
                 input,
                 partitioning_scheme,
             }) => {
-                use datafusion::logical_expr::Partitioning;
+                use datafusion_expr::Partitioning;
                 let input: LogicalPlanNode = LogicalPlanNode::try_from_logical_plan(
                     input.as_ref(),
                     extension_codec,
                 )?;
 
                 // Assumed common usize field was batch size
-                // Used u64 to avoid any nastyness involving large values, most data clusters are probably uniformly 64 bits any ways
+                // Used u64 to avoid any nastiness involving large values, most data clusters are probably uniformly 64 bits any ways
                 use protobuf::repartition_node::PartitionMethod;
 
                 let pb_partition_method = match partitioning_scheme {
@@ -1456,7 +1546,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                         PartitionMethod::RoundRobin(*partition_count as u64)
                     }
                     Partitioning::DistributeBy(_) => {
-                        return not_impl_err!("DistributeBy")
+                        return not_impl_err!("DistributeBy");
                     }
                 };
 
@@ -1486,6 +1576,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                     schema: df_schema,
                     table_partition_cols,
                     if_not_exists,
+                    or_replace,
                     definition,
                     order_exprs,
                     unbounded,
@@ -1519,6 +1610,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                             schema: Some(df_schema.try_into()?),
                             table_partition_cols: table_partition_cols.clone(),
                             if_not_exists: *if_not_exists,
+                            or_replace: *or_replace,
                             temporary: *temporary,
                             order_exprs: converted_order_exprs,
                             definition: definition.clone().unwrap_or_default(),
@@ -1637,7 +1729,7 @@ impl AsLogicalPlan for LogicalPlanNode {
             }
             LogicalPlan::Statement(Statement::Prepare(Prepare {
                 name,
-                data_types,
+                fields,
                 input,
             })) => {
                 let input =
@@ -1646,11 +1738,17 @@ impl AsLogicalPlan for LogicalPlanNode {
                     logical_plan_type: Some(LogicalPlanType::Prepare(Box::new(
                         protobuf::PrepareNode {
                             name: name.clone(),
-                            data_types: data_types
+                            input: Some(Box::new(input)),
+                            // Store the DataTypes for reading by older DataFusion
+                            data_types: fields
                                 .iter()
-                                .map(|t| t.try_into())
+                                .map(|f| f.data_type().try_into())
+                                .collect::<Result<Vec<_>, _>>()?,
+                            // Store the Fields for current and future DataFusion
+                            fields: fields
+                                .iter()
+                                .map(|f| f.as_ref().try_into())
                                 .collect::<Result<Vec<_>, _>>()?,
-                            input: Some(Box::new(input)),
                         },
                     ))),
                 })
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index 841c31fa035f4..6fcb7389922ad 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -21,22 +21,24 @@
 
 use std::collections::HashMap;
 
-use datafusion_common::{TableReference, UnnestOptions};
+use datafusion_common::{NullEquality, TableReference, UnnestOptions};
+use datafusion_expr::WriteOp;
 use datafusion_expr::dml::InsertOp;
 use datafusion_expr::expr::{
     self, AggregateFunctionParams, Alias, Between, BinaryExpr, Cast, GroupingSet, InList,
-    Like, Placeholder, ScalarFunction, Unnest,
+    Like, NullTreatment, Placeholder, ScalarFunction, Unnest,
 };
-use datafusion_expr::WriteOp;
 use datafusion_expr::{
-    logical_plan::PlanType, logical_plan::StringifiedPlan, Expr, JoinConstraint,
-    JoinType, SortExpr, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits,
-    WindowFunctionDefinition,
+    Expr, JoinConstraint, JoinType, SortExpr, TryCast, WindowFrame, WindowFrameBound,
+    WindowFrameUnits, WindowFunctionDefinition, logical_plan::PlanType,
+    logical_plan::StringifiedPlan,
 };
 
 use crate::protobuf::RecursionUnnestOption;
 use crate::protobuf::{
-    self,
+    self, AnalyzedLogicalPlanType, CubeNode, EmptyMessage, GroupingSetNode,
+    LogicalExprList, OptimizedLogicalPlanType, OptimizedPhysicalPlanType,
+    PlaceholderNode, RollupNode, ToProtoError as Error,
     plan_type::PlanTypeEnum::{
         AnalyzedLogicalPlan, FinalAnalyzedLogicalPlan, FinalLogicalPlan,
         FinalPhysicalPlan, FinalPhysicalPlanWithSchema, FinalPhysicalPlanWithStats,
@@ -44,9 +46,6 @@ use crate::protobuf::{
         InitialPhysicalPlanWithStats, OptimizedLogicalPlan, OptimizedPhysicalPlan,
         PhysicalPlanError,
     },
-    AnalyzedLogicalPlanType, CubeNode, EmptyMessage, GroupingSetNode, LogicalExprList,
-    OptimizedLogicalPlanType, OptimizedPhysicalPlanType, PlaceholderNode, RollupNode,
-    ToProtoError as Error,
 };
 
 use super::LogicalExtensionCodec;
@@ -211,13 +210,16 @@ pub fn serialize_expr(
                     .map(|r| vec![r.into()])
                     .unwrap_or(vec![]),
                 alias: name.to_owned(),
-                metadata: metadata.to_owned().unwrap_or(HashMap::new()),
+                metadata: metadata
+                    .as_ref()
+                    .map(|m| m.to_hashmap())
+                    .unwrap_or(HashMap::new()),
             });
             protobuf::LogicalExprNode {
                 expr_type: Some(ExprType::Alias(alias)),
             }
         }
-        Expr::Literal(value) => {
+        Expr::Literal(value, _) => {
             let pb_value: protobuf::ScalarValue = value.try_into()?;
             protobuf::LogicalExprNode {
                 expr_type: Some(ExprType::Literal(pb_value)),
@@ -302,66 +304,70 @@ pub fn serialize_expr(
                 expr_type: Some(ExprType::SimilarTo(pb)),
             }
         }
-        Expr::WindowFunction(expr::WindowFunction {
-            ref fun,
-            params:
-                expr::WindowFunctionParams {
-                    ref args,
-                    ref partition_by,
-                    ref order_by,
-                    ref window_frame,
-                    // TODO: support null treatment in proto
-                    null_treatment: _,
-                },
-        }) => {
-            let (window_function, fun_definition) = match fun {
+        Expr::WindowFunction(window_fun) => {
+            let expr::WindowFunction {
+                fun,
+                params:
+                    expr::WindowFunctionParams {
+                        args,
+                        partition_by,
+                        order_by,
+                        window_frame,
+                        null_treatment,
+                        distinct,
+                        filter,
+                    },
+            } = window_fun.as_ref();
+            let mut buf = Vec::new();
+            let window_function = match fun {
                 WindowFunctionDefinition::AggregateUDF(aggr_udf) => {
-                    let mut buf = Vec::new();
                     let _ = codec.try_encode_udaf(aggr_udf, &mut buf);
-                    (
-                        protobuf::window_expr_node::WindowFunction::Udaf(
-                            aggr_udf.name().to_string(),
-                        ),
-                        (!buf.is_empty()).then_some(buf),
+                    protobuf::window_expr_node::WindowFunction::Udaf(
+                        aggr_udf.name().to_string(),
                     )
                 }
                 WindowFunctionDefinition::WindowUDF(window_udf) => {
-                    let mut buf = Vec::new();
                     let _ = codec.try_encode_udwf(window_udf, &mut buf);
-                    (
-                        protobuf::window_expr_node::WindowFunction::Udwf(
-                            window_udf.name().to_string(),
-                        ),
-                        (!buf.is_empty()).then_some(buf),
+                    protobuf::window_expr_node::WindowFunction::Udwf(
+                        window_udf.name().to_string(),
                     )
                 }
             };
+            let fun_definition = (!buf.is_empty()).then_some(buf);
             let partition_by = serialize_exprs(partition_by, codec)?;
             let order_by = serialize_sorts(order_by, codec)?;
 
             let window_frame: Option<protobuf::WindowFrame> =
                 Some(window_frame.try_into()?);
+
             let window_expr = protobuf::WindowExprNode {
                 exprs: serialize_exprs(args, codec)?,
                 window_function: Some(window_function),
                 partition_by,
                 order_by,
                 window_frame,
+                distinct: *distinct,
+                filter: match filter {
+                    Some(e) => Some(Box::new(serialize_expr(e.as_ref(), codec)?)),
+                    None => None,
+                },
+                null_treatment: null_treatment
+                    .map(|nt| protobuf::NullTreatment::from(nt).into()),
                 fun_definition,
             };
             protobuf::LogicalExprNode {
-                expr_type: Some(ExprType::WindowExpr(window_expr)),
+                expr_type: Some(ExprType::WindowExpr(Box::new(window_expr))),
             }
         }
         Expr::AggregateFunction(expr::AggregateFunction {
-            ref func,
+            func,
             params:
                 AggregateFunctionParams {
-                    ref args,
-                    ref distinct,
-                    ref filter,
-                    ref order_by,
-                    null_treatment: _,
+                    args,
+                    distinct,
+                    filter,
+                    order_by,
+                    null_treatment,
                 },
         }) => {
             let mut buf = Vec::new();
@@ -376,11 +382,10 @@ pub fn serialize_expr(
                             Some(e) => Some(Box::new(serialize_expr(e.as_ref(), codec)?)),
                             None => None,
                         },
-                        order_by: match order_by {
-                            Some(e) => serialize_sorts(e, codec)?,
-                            None => vec![],
-                        },
+                        order_by: serialize_sorts(order_by, codec)?,
                         fun_definition: (!buf.is_empty()).then_some(buf),
+                        null_treatment: null_treatment
+                            .map(|nt| protobuf::NullTreatment::from(nt).into()),
                     },
                 ))),
             }
@@ -389,7 +394,7 @@ pub fn serialize_expr(
         Expr::ScalarVariable(_, _) => {
             return Err(Error::General(
                 "Proto serialization error: Scalar Variable not supported".to_string(),
-            ))
+            ));
         }
         Expr::ScalarFunction(ScalarFunction { func, args }) => {
             let mut buf = Vec::new();
@@ -516,19 +521,23 @@ pub fn serialize_expr(
                 expr_type: Some(ExprType::Case(expr)),
             }
         }
-        Expr::Cast(Cast { expr, data_type }) => {
+        Expr::Cast(Cast { expr, field }) => {
             let expr = Box::new(protobuf::CastNode {
                 expr: Some(Box::new(serialize_expr(expr.as_ref(), codec)?)),
-                arrow_type: Some(data_type.try_into()?),
+                arrow_type: Some(field.data_type().try_into()?),
+                metadata: field.metadata().clone(),
+                nullable: Some(field.is_nullable()),
             });
             protobuf::LogicalExprNode {
                 expr_type: Some(ExprType::Cast(expr)),
             }
         }
-        Expr::TryCast(TryCast { expr, data_type }) => {
+        Expr::TryCast(TryCast { expr, field }) => {
             let expr = Box::new(protobuf::TryCastNode {
                 expr: Some(Box::new(serialize_expr(expr.as_ref(), codec)?)),
-                arrow_type: Some(data_type.try_into()?),
+                arrow_type: Some(field.data_type().try_into()?),
+                metadata: field.metadata().clone(),
+                nullable: Some(field.is_nullable()),
             });
             protobuf::LogicalExprNode {
                 expr_type: Some(ExprType::TryCast(expr)),
@@ -573,6 +582,7 @@ pub fn serialize_expr(
         Expr::ScalarSubquery(_)
         | Expr::InSubquery(_)
         | Expr::Exists { .. }
+        | Expr::SetComparison(_)
         | Expr::OuterReferenceColumn { .. } => {
             // we would need to add logical plan operators to datafusion.proto to support this
             // see discussion in https://github.com/apache/datafusion/issues/2565
@@ -602,18 +612,20 @@ pub fn serialize_expr(
                 })),
             }
         }
-        Expr::Placeholder(Placeholder { id, data_type }) => {
-            let data_type = match data_type {
-                Some(data_type) => Some(data_type.try_into()?),
-                None => None,
-            };
-            protobuf::LogicalExprNode {
-                expr_type: Some(ExprType::Placeholder(PlaceholderNode {
-                    id: id.clone(),
-                    data_type,
-                })),
-            }
-        }
+        Expr::Placeholder(Placeholder { id, field }) => protobuf::LogicalExprNode {
+            expr_type: Some(ExprType::Placeholder(PlaceholderNode {
+                id: id.clone(),
+                data_type: match field {
+                    Some(field) => Some(field.data_type().try_into()?),
+                    None => None,
+                },
+                nullable: field.as_ref().map(|f| f.is_nullable()),
+                metadata: field
+                    .as_ref()
+                    .map(|f| f.metadata().clone())
+                    .unwrap_or(HashMap::new()),
+            })),
+        },
     };
 
     Ok(expr_node)
@@ -687,6 +699,7 @@ impl From<JoinType> for protobuf::JoinType {
             JoinType::LeftAnti => protobuf::JoinType::Leftanti,
             JoinType::RightAnti => protobuf::JoinType::Rightanti,
             JoinType::LeftMark => protobuf::JoinType::Leftmark,
+            JoinType::RightMark => protobuf::JoinType::Rightmark,
         }
     }
 }
@@ -700,6 +713,15 @@ impl From<JoinConstraint> for protobuf::JoinConstraint {
     }
 }
 
+impl From<NullEquality> for protobuf::NullEquality {
+    fn from(t: NullEquality) -> Self {
+        match t {
+            NullEquality::NullEqualsNothing => protobuf::NullEquality::NullEqualsNothing,
+            NullEquality::NullEqualsNull => protobuf::NullEquality::NullEqualsNull,
+        }
+    }
+}
+
 impl From<&WriteOp> for protobuf::dml_node::Type {
     fn from(t: &WriteOp) -> Self {
         match t {
@@ -711,6 +733,16 @@ impl From<&WriteOp> for protobuf::dml_node::Type {
             WriteOp::Delete => protobuf::dml_node::Type::Delete,
             WriteOp::Update => protobuf::dml_node::Type::Update,
             WriteOp::Ctas => protobuf::dml_node::Type::Ctas,
+            WriteOp::Truncate => protobuf::dml_node::Type::Truncate,
+        }
+    }
+}
+
+impl From<NullTreatment> for protobuf::NullTreatment {
+    fn from(t: NullTreatment) -> Self {
+        match t {
+            NullTreatment::RespectNulls => protobuf::NullTreatment::RespectNulls,
+            NullTreatment::IgnoreNulls => protobuf::NullTreatment::IgnoreNulls,
         }
     }
 }
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs
index 5024bb558a65a..c7a4bd822663b 100644
--- a/datafusion/proto/src/physical_plan/from_proto.rs
+++ b/datafusion/proto/src/physical_plan/from_proto.rs
@@ -19,41 +19,45 @@
 
 use std::sync::Arc;
 
+use arrow::array::RecordBatch;
 use arrow::compute::SortOptions;
-use arrow::datatypes::Field;
+use arrow::datatypes::{Field, Schema};
+use arrow::ipc::reader::StreamReader;
 use chrono::{TimeZone, Utc};
-use datafusion_expr::dml::InsertOp;
-use object_store::path::Path;
-use object_store::ObjectMeta;
-
-use datafusion::arrow::datatypes::Schema;
-use datafusion::datasource::file_format::csv::CsvSink;
-use datafusion::datasource::file_format::json::JsonSink;
+use datafusion_common::{DataFusionError, Result, internal_datafusion_err, not_impl_err};
+use datafusion_datasource::file::FileSource;
+use datafusion_datasource::file_groups::FileGroup;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::file_sink_config::FileSinkConfig;
+use datafusion_datasource::{FileRange, ListingTableUrl, PartitionedFile, TableSchema};
+use datafusion_datasource_csv::file_format::CsvSink;
+use datafusion_datasource_json::file_format::JsonSink;
 #[cfg(feature = "parquet")]
-use datafusion::datasource::file_format::parquet::ParquetSink;
-use datafusion::datasource::listing::{FileRange, ListingTableUrl, PartitionedFile};
-use datafusion::datasource::object_store::ObjectStoreUrl;
-use datafusion::datasource::physical_plan::{
-    FileGroup, FileScanConfig, FileScanConfigBuilder, FileSinkConfig, FileSource,
-};
-use datafusion::execution::FunctionRegistry;
-use datafusion::logical_expr::WindowFunctionDefinition;
-use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr};
-use datafusion::physical_plan::expressions::{
-    in_list, BinaryExpr, CaseExpr, CastExpr, Column, IsNotNullExpr, IsNullExpr, LikeExpr,
-    Literal, NegativeExpr, NotExpr, TryCastExpr, UnKnownColumn,
+use datafusion_datasource_parquet::file_format::ParquetSink;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_execution::{FunctionRegistry, TaskContext};
+use datafusion_expr::WindowFunctionDefinition;
+use datafusion_expr::dml::InsertOp;
+use datafusion_physical_expr::projection::{ProjectionExpr, ProjectionExprs};
+use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr};
+use datafusion_physical_plan::expressions::{
+    BinaryExpr, CaseExpr, CastExpr, Column, IsNotNullExpr, IsNullExpr, LikeExpr, Literal,
+    NegativeExpr, NotExpr, TryCastExpr, UnKnownColumn, in_list,
 };
-use datafusion::physical_plan::windows::{create_window_expr, schema_add_window_field};
-use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
-use datafusion_common::{not_impl_err, DataFusionError, Result};
+use datafusion_physical_plan::joins::{HashExpr, SeededRandomState};
+use datafusion_physical_plan::windows::{create_window_expr, schema_add_window_field};
+use datafusion_physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
 use datafusion_proto_common::common::proto_error;
+use object_store::ObjectMeta;
+use object_store::path::Path;
 
-use crate::convert_required;
+use super::{
+    DefaultPhysicalProtoConverter, PhysicalExtensionCodec,
+    PhysicalProtoConverterExtension,
+};
 use crate::logical_plan::{self};
-use crate::protobuf;
 use crate::protobuf::physical_expr_node::ExprType;
-
-use super::PhysicalExtensionCodec;
+use crate::{convert_required, protobuf};
 
 impl From<&protobuf::PhysicalColumn> for Column {
     fn from(c: &protobuf::PhysicalColumn) -> Column {
@@ -72,12 +76,18 @@ impl From<&protobuf::PhysicalColumn> for Column {
 /// * `codec` - An extension codec used to decode custom UDFs.
 pub fn parse_physical_sort_expr(
     proto: &protobuf::PhysicalSortExprNode,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<PhysicalSortExpr> {
     if let Some(expr) = &proto.expr {
-        let expr = parse_physical_expr(expr.as_ref(), registry, input_schema, codec)?;
+        let expr = proto_converter.proto_to_physical_expr(
+            expr.as_ref(),
+            ctx,
+            input_schema,
+            codec,
+        )?;
         let options = SortOptions {
             descending: !proto.asc,
             nulls_first: proto.nulls_first,
@@ -99,16 +109,17 @@ pub fn parse_physical_sort_expr(
 /// * `codec` - An extension codec used to decode custom UDFs.
 pub fn parse_physical_sort_exprs(
     proto: &[protobuf::PhysicalSortExprNode],
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
-) -> Result<LexOrdering> {
+    proto_converter: &dyn PhysicalProtoConverterExtension,
+) -> Result<Vec<PhysicalSortExpr>> {
     proto
         .iter()
         .map(|sort_expr| {
-            parse_physical_sort_expr(sort_expr, registry, input_schema, codec)
+            parse_physical_sort_expr(sort_expr, ctx, input_schema, codec, proto_converter)
         })
-        .collect::<Result<LexOrdering>>()
+        .collect()
 }
 
 /// Parses a physical window expr from a protobuf.
@@ -123,28 +134,37 @@ pub fn parse_physical_sort_exprs(
 /// * `codec` - An extension codec used to decode custom UDFs.
 pub fn parse_physical_window_expr(
     proto: &protobuf::PhysicalWindowExprNode,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Arc<dyn WindowExpr>> {
     let window_node_expr =
-        parse_physical_exprs(&proto.args, registry, input_schema, codec)?;
-    let partition_by =
-        parse_physical_exprs(&proto.partition_by, registry, input_schema, codec)?;
-
-    let order_by =
-        parse_physical_sort_exprs(&proto.order_by, registry, input_schema, codec)?;
+        parse_physical_exprs(&proto.args, ctx, input_schema, codec, proto_converter)?;
+    let partition_by = parse_physical_exprs(
+        &proto.partition_by,
+        ctx,
+        input_schema,
+        codec,
+        proto_converter,
+    )?;
+
+    let order_by = parse_physical_sort_exprs(
+        &proto.order_by,
+        ctx,
+        input_schema,
+        codec,
+        proto_converter,
+    )?;
 
     let window_frame = proto
         .window_frame
         .as_ref()
         .map(|wf| wf.clone().try_into())
         .transpose()
-        .map_err(|e| DataFusionError::Internal(format!("{e}")))?
+        .map_err(|e| internal_datafusion_err!("{e}"))?
         .ok_or_else(|| {
-            DataFusionError::Internal(
-                "Missing required field 'window_frame' in protobuf".to_string(),
-            )
+            internal_datafusion_err!("Missing required field 'window_frame' in protobuf")
         })?;
 
     let fun = if let Some(window_func) = proto.window_function.as_ref() {
@@ -152,13 +172,13 @@ pub fn parse_physical_window_expr(
             protobuf::physical_window_expr_node::WindowFunction::UserDefinedAggrFunction(udaf_name) => {
                 WindowFunctionDefinition::AggregateUDF(match &proto.fun_definition {
                     Some(buf) => codec.try_decode_udaf(udaf_name, buf)?,
-                    None => registry.udaf(udaf_name).or_else(|_| codec.try_decode_udaf(udaf_name, &[]))?,
+                    None => ctx.udaf(udaf_name).or_else(|_| codec.try_decode_udaf(udaf_name, &[]))?,
                 })
             }
             protobuf::physical_window_expr_node::WindowFunction::UserDefinedWindowFunction(udwf_name) => {
                 WindowFunctionDefinition::WindowUDF(match &proto.fun_definition {
                     Some(buf) => codec.try_decode_udwf(udwf_name, buf)?,
-                    None => registry.udwf(udwf_name).or_else(|_| codec.try_decode_udwf(udwf_name, &[]))?
+                    None => ctx.udwf(udwf_name).or_else(|_| codec.try_decode_udwf(udwf_name, &[]))?
                 })
             }
         }
@@ -175,25 +195,28 @@ pub fn parse_physical_window_expr(
         name,
         &window_node_expr,
         &partition_by,
-        order_by.as_ref(),
+        &order_by,
         Arc::new(window_frame),
-        &extended_schema,
-        false,
+        extended_schema,
+        proto.ignore_nulls,
+        proto.distinct,
+        None,
     )
 }
 
 pub fn parse_physical_exprs<'a, I>(
     protos: I,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Vec<Arc<dyn PhysicalExpr>>>
 where
     I: IntoIterator<Item = &'a protobuf::PhysicalExprNode>,
 {
     protos
         .into_iter()
-        .map(|p| parse_physical_expr(p, registry, input_schema, codec))
+        .map(|p| proto_converter.proto_to_physical_expr(p, ctx, input_schema, codec))
         .collect::<Result<Vec<_>>>()
 }
 
@@ -208,9 +231,35 @@ where
 /// * `codec` - An extension codec used to decode custom UDFs.
 pub fn parse_physical_expr(
     proto: &protobuf::PhysicalExprNode,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    parse_physical_expr_with_converter(
+        proto,
+        ctx,
+        input_schema,
+        codec,
+        &DefaultPhysicalProtoConverter {},
+    )
+}
+
+/// Parses a physical expression from a protobuf.
+///
+/// # Arguments
+///
+/// * `proto` - Input proto with physical expression node
+/// * `registry` - A registry knows how to build logical expressions out of user-defined function names
+/// * `input_schema` - The Arrow schema for the input, used for determining expression data types
+///   when performing type coercion.
+/// * `codec` - An extension codec used to decode custom UDFs.
+/// * `proto_converter` - Conversion functions for physical plans and expressions
+pub fn parse_physical_expr_with_converter(
+    proto: &protobuf::PhysicalExprNode,
+    ctx: &TaskContext,
+    input_schema: &Schema,
+    codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Arc<dyn PhysicalExpr>> {
     let expr_type = proto
         .expr_type
@@ -227,18 +276,20 @@ pub fn parse_physical_expr(
         ExprType::BinaryExpr(binary_expr) => Arc::new(BinaryExpr::new(
             parse_required_physical_expr(
                 binary_expr.l.as_deref(),
-                registry,
+                ctx,
                 "left",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
             logical_plan::from_proto::from_proto_binary_op(&binary_expr.op)?,
             parse_required_physical_expr(
                 binary_expr.r.as_deref(),
-                registry,
+                ctx,
                 "right",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
         )),
         ExprType::AggregateExpr(_) => {
@@ -257,53 +308,65 @@ pub fn parse_physical_expr(
         ExprType::IsNullExpr(e) => {
             Arc::new(IsNullExpr::new(parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?))
         }
         ExprType::IsNotNullExpr(e) => {
             Arc::new(IsNotNullExpr::new(parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?))
         }
         ExprType::NotExpr(e) => Arc::new(NotExpr::new(parse_required_physical_expr(
             e.expr.as_deref(),
-            registry,
+            ctx,
             "expr",
             input_schema,
             codec,
+            proto_converter,
         )?)),
         ExprType::Negative(e) => {
             Arc::new(NegativeExpr::new(parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?))
         }
         ExprType::InList(e) => in_list(
             parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
-            parse_physical_exprs(&e.list, registry, input_schema, codec)?,
+            parse_physical_exprs(&e.list, ctx, input_schema, codec, proto_converter)?,
             &e.negated,
             input_schema,
         )?,
         ExprType::Case(e) => Arc::new(CaseExpr::try_new(
             e.expr
                 .as_ref()
-                .map(|e| parse_physical_expr(e.as_ref(), registry, input_schema, codec))
+                .map(|e| {
+                    proto_converter.proto_to_physical_expr(
+                        e.as_ref(),
+                        ctx,
+                        input_schema,
+                        codec,
+                    )
+                })
                 .transpose()?,
             e.when_then_expr
                 .iter()
@@ -311,33 +374,43 @@ pub fn parse_physical_expr(
                     Ok((
                         parse_required_physical_expr(
                             e.when_expr.as_ref(),
-                            registry,
+                            ctx,
                             "when_expr",
                             input_schema,
                             codec,
+                            proto_converter,
                         )?,
                         parse_required_physical_expr(
                             e.then_expr.as_ref(),
-                            registry,
+                            ctx,
                             "then_expr",
                             input_schema,
                             codec,
+                            proto_converter,
                         )?,
                     ))
                 })
                 .collect::<Result<Vec<_>>>()?,
             e.else_expr
                 .as_ref()
-                .map(|e| parse_physical_expr(e.as_ref(), registry, input_schema, codec))
+                .map(|e| {
+                    proto_converter.proto_to_physical_expr(
+                        e.as_ref(),
+                        ctx,
+                        input_schema,
+                        codec,
+                    )
+                })
                 .transpose()?,
         )?),
         ExprType::Cast(e) => Arc::new(CastExpr::new(
             parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
             convert_required!(e.arrow_type)?,
             None,
@@ -345,30 +418,40 @@ pub fn parse_physical_expr(
         ExprType::TryCast(e) => Arc::new(TryCastExpr::new(
             parse_required_physical_expr(
                 e.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
             convert_required!(e.arrow_type)?,
         )),
         ExprType::ScalarUdf(e) => {
             let udf = match &e.fun_definition {
                 Some(buf) => codec.try_decode_udf(&e.name, buf)?,
-                None => registry
+                None => ctx
                     .udf(e.name.as_str())
                     .or_else(|_| codec.try_decode_udf(&e.name, &[]))?,
             };
             let scalar_fun_def = Arc::clone(&udf);
 
-            let args = parse_physical_exprs(&e.args, registry, input_schema, codec)?;
+            let args =
+                parse_physical_exprs(&e.args, ctx, input_schema, codec, proto_converter)?;
+
+            let config_options = Arc::clone(ctx.session_config().options());
 
             Arc::new(
                 ScalarFunctionExpr::new(
                     e.name.as_str(),
                     scalar_fun_def,
                     args,
-                    Field::new("f", convert_required!(e.return_type)?, true).into(),
+                    Field::new(
+                        &e.return_field_name,
+                        convert_required!(e.return_type)?,
+                        true,
+                    )
+                    .into(),
+                    config_options,
                 )
                 .with_nullable(e.nullable),
             )
@@ -378,26 +461,44 @@ pub fn parse_physical_expr(
             like_expr.case_insensitive,
             parse_required_physical_expr(
                 like_expr.expr.as_deref(),
-                registry,
+                ctx,
                 "expr",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
             parse_required_physical_expr(
                 like_expr.pattern.as_deref(),
-                registry,
+                ctx,
                 "pattern",
                 input_schema,
                 codec,
+                proto_converter,
             )?,
         )),
+        ExprType::HashExpr(hash_expr) => {
+            let on_columns = parse_physical_exprs(
+                &hash_expr.on_columns,
+                ctx,
+                input_schema,
+                codec,
+                proto_converter,
+            )?;
+            Arc::new(HashExpr::new(
+                on_columns,
+                SeededRandomState::with_seed(hash_expr.seed0),
+                hash_expr.description.clone(),
+            ))
+        }
         ExprType::Extension(extension) => {
             let inputs: Vec<Arc<dyn PhysicalExpr>> = extension
                 .inputs
                 .iter()
-                .map(|e| parse_physical_expr(e, registry, input_schema, codec))
+                .map(|e| {
+                    proto_converter.proto_to_physical_expr(e, ctx, input_schema, codec)
+                })
                 .collect::<Result<_>>()?;
-            (codec.try_decode_expr(extension.expr.as_slice(), &inputs)?) as _
+            codec.try_decode_expr(extension.expr.as_slice(), &inputs)? as _
         }
     };
 
@@ -406,31 +507,32 @@ pub fn parse_physical_expr(
 
 fn parse_required_physical_expr(
     expr: Option<&protobuf::PhysicalExprNode>,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     field: &str,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Arc<dyn PhysicalExpr>> {
-    expr.map(|e| parse_physical_expr(e, registry, input_schema, codec))
+    expr.map(|e| proto_converter.proto_to_physical_expr(e, ctx, input_schema, codec))
         .transpose()?
-        .ok_or_else(|| {
-            DataFusionError::Internal(format!("Missing required field {field:?}"))
-        })
+        .ok_or_else(|| internal_datafusion_err!("Missing required field {field:?}"))
 }
 
 pub fn parse_protobuf_hash_partitioning(
     partitioning: Option<&protobuf::PhysicalHashRepartition>,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Option<Partitioning>> {
     match partitioning {
         Some(hash_part) => {
             let expr = parse_physical_exprs(
                 &hash_part.hash_expr,
-                registry,
+                ctx,
                 input_schema,
                 codec,
+                proto_converter,
             )?;
 
             Ok(Some(Partitioning::Hash(
@@ -444,9 +546,10 @@ pub fn parse_protobuf_hash_partitioning(
 
 pub fn parse_protobuf_partitioning(
     partitioning: Option<&protobuf::Partitioning>,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     input_schema: &Schema,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Option<Partitioning>> {
     match partitioning {
         Some(protobuf::Partitioning { partition_method }) => match partition_method {
@@ -458,9 +561,10 @@ pub fn parse_protobuf_partitioning(
             Some(protobuf::partitioning::PartitionMethod::Hash(hash_repartition)) => {
                 parse_protobuf_hash_partitioning(
                     Some(hash_repartition),
-                    registry,
+                    ctx,
                     input_schema,
                     codec,
+                    proto_converter,
                 )
             }
             Some(protobuf::partitioning::PartitionMethod::Unknown(partition_count)) => {
@@ -480,18 +584,45 @@ pub fn parse_protobuf_file_scan_schema(
     Ok(Arc::new(convert_required!(proto.schema)?))
 }
 
+/// Parses a TableSchema from protobuf, extracting the file schema and partition columns
+pub fn parse_table_schema_from_proto(
+    proto: &protobuf::FileScanExecConf,
+) -> Result<TableSchema> {
+    let schema: Arc<Schema> = parse_protobuf_file_scan_schema(proto)?;
+
+    // Reacquire the partition column types from the schema before removing them below.
+    let table_partition_cols = proto
+        .table_partition_cols
+        .iter()
+        .map(|col| Ok(Arc::new(schema.field_with_name(col)?.clone())))
+        .collect::<Result<Vec<_>>>()?;
+
+    // Remove partition columns from the schema after recreating table_partition_cols
+    // because the partition columns are not in the file. They are present to allow
+    // the partition column types to be reconstructed after serde.
+    let file_schema = Arc::new(
+        Schema::new(
+            schema
+                .fields()
+                .iter()
+                .filter(|field| !table_partition_cols.contains(field))
+                .cloned()
+                .collect::<Vec<_>>(),
+        )
+        .with_metadata(schema.metadata.clone()),
+    );
+
+    Ok(TableSchema::new(file_schema, table_partition_cols))
+}
+
 pub fn parse_protobuf_file_scan_config(
     proto: &protobuf::FileScanExecConf,
-    registry: &dyn FunctionRegistry,
+    ctx: &TaskContext,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
     file_source: Arc<dyn FileSource>,
 ) -> Result<FileScanConfig> {
     let schema: Arc<Schema> = parse_protobuf_file_scan_schema(proto)?;
-    let projection = proto
-        .projection
-        .iter()
-        .map(|i| *i as usize)
-        .collect::<Vec<_>>();
 
     let constraints = convert_required!(proto.constraints)?;
     let statistics = convert_required!(proto.statistics)?;
@@ -507,75 +638,95 @@ pub fn parse_protobuf_file_scan_config(
         true => ObjectStoreUrl::local_filesystem(),
     };
 
-    // Reacquire the partition column types from the schema before removing them below.
-    let table_partition_cols = proto
-        .table_partition_cols
-        .iter()
-        .map(|col| Ok(schema.field_with_name(col)?.clone()))
-        .collect::<Result<Vec<_>>>()?;
-
-    // Remove partition columns from the schema after recreating table_partition_cols
-    // because the partition columns are not in the file. They are present to allow
-    // the partition column types to be reconstructed after serde.
-    let file_schema = Arc::new(Schema::new(
-        schema
-            .fields()
-            .iter()
-            .filter(|field| !table_partition_cols.contains(field))
-            .cloned()
-            .collect::<Vec<_>>(),
-    ));
-
     let mut output_ordering = vec![];
     for node_collection in &proto.output_ordering {
-        let sort_expr = parse_physical_sort_exprs(
+        let sort_exprs = parse_physical_sort_exprs(
             &node_collection.physical_sort_expr_nodes,
-            registry,
+            ctx,
             &schema,
             codec,
+            proto_converter,
         )?;
-        output_ordering.push(sort_expr);
+        output_ordering.extend(LexOrdering::new(sort_exprs));
     }
 
-    let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source)
+    // Parse projection expressions if present and apply to file source
+    let file_source = if let Some(proto_projection_exprs) = &proto.projection_exprs {
+        let projection_exprs: Vec<ProjectionExpr> = proto_projection_exprs
+            .projections
+            .iter()
+            .map(|proto_expr| {
+                let expr = proto_converter.proto_to_physical_expr(
+                    proto_expr.expr.as_ref().ok_or_else(|| {
+                        internal_datafusion_err!("ProjectionExpr missing expr field")
+                    })?,
+                    ctx,
+                    &schema,
+                    codec,
+                )?;
+                Ok(ProjectionExpr::new(expr, proto_expr.alias.clone()))
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let projection_exprs = ProjectionExprs::new(projection_exprs);
+
+        // Apply projection to file source
+        file_source
+            .try_pushdown_projection(&projection_exprs)?
+            .unwrap_or(file_source)
+    } else {
+        file_source
+    };
+
+    let config = FileScanConfigBuilder::new(object_store_url, file_source)
         .with_file_groups(file_groups)
         .with_constraints(constraints)
         .with_statistics(statistics)
-        .with_projection(Some(projection))
         .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize))
-        .with_table_partition_cols(table_partition_cols)
         .with_output_ordering(output_ordering)
         .with_batch_size(proto.batch_size.map(|s| s as usize))
         .build();
     Ok(config)
 }
 
+pub fn parse_record_batches(buf: &[u8]) -> Result<Vec<RecordBatch>> {
+    if buf.is_empty() {
+        return Ok(vec![]);
+    }
+    let reader = StreamReader::try_new(buf, None)?;
+    let mut batches = Vec::new();
+    for batch in reader {
+        batches.push(batch?);
+    }
+    Ok(batches)
+}
+
 impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile {
     type Error = DataFusionError;
 
     fn try_from(val: &protobuf::PartitionedFile) -> Result<Self, Self::Error> {
-        Ok(PartitionedFile {
-            object_meta: ObjectMeta {
-                location: Path::from(val.path.as_str()),
-                last_modified: Utc.timestamp_nanos(val.last_modified_ns as i64),
-                size: val.size,
-                e_tag: None,
-                version: None,
-            },
-            partition_values: val
-                .partition_values
+        let mut pf = PartitionedFile::new_from_meta(ObjectMeta {
+            location: Path::parse(val.path.as_str())
+                .map_err(|e| proto_error(format!("Invalid object_store path: {e}")))?,
+            last_modified: Utc.timestamp_nanos(val.last_modified_ns as i64),
+            size: val.size,
+            e_tag: None,
+            version: None,
+        })
+        .with_partition_values(
+            val.partition_values
                 .iter()
                 .map(|v| v.try_into())
                 .collect::<Result<Vec<_>, _>>()?,
-            range: val.range.as_ref().map(|v| v.try_into()).transpose()?,
-            statistics: val
-                .statistics
-                .as_ref()
-                .map(|v| v.try_into().map(Arc::new))
-                .transpose()?,
-            extensions: None,
-            metadata_size_hint: None,
-        })
+        );
+        if let Some(range) = val.range.as_ref() {
+            let file_range: FileRange = range.try_into()?;
+            pf = pf.with_range(file_range.start, file_range.end);
+        }
+        if let Some(proto_stats) = val.statistics.as_ref() {
+            pf = pf.with_statistics(Arc::new(proto_stats.try_into()?));
+        }
+        Ok(pf)
     }
 }
 
@@ -665,6 +816,17 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig {
             protobuf::InsertOp::Overwrite => InsertOp::Overwrite,
             protobuf::InsertOp::Replace => InsertOp::Replace,
         };
+        let file_output_mode = match conf.file_output_mode() {
+            protobuf::FileOutputMode::Automatic => {
+                datafusion_datasource::file_sink_config::FileOutputMode::Automatic
+            }
+            protobuf::FileOutputMode::SingleFile => {
+                datafusion_datasource::file_sink_config::FileOutputMode::SingleFile
+            }
+            protobuf::FileOutputMode::Directory => {
+                datafusion_datasource::file_sink_config::FileOutputMode::Directory
+            }
+        };
         Ok(Self {
             original_url: String::default(),
             object_store_url: ObjectStoreUrl::parse(&conf.object_store_url)?,
@@ -675,6 +837,53 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig {
             insert_op,
             keep_partition_by_columns: conf.keep_partition_by_columns,
             file_extension: conf.file_extension.clone(),
+            file_output_mode,
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use chrono::{TimeZone, Utc};
+    use datafusion_datasource::PartitionedFile;
+    use object_store::ObjectMeta;
+    use object_store::path::Path;
+
+    use super::*;
+
+    #[test]
+    fn partitioned_file_path_roundtrip_percent_encoded() {
+        let path_str = "foo/foo%2Fbar/baz%252Fqux";
+        let pf = PartitionedFile::new_from_meta(ObjectMeta {
+            location: Path::parse(path_str).unwrap(),
+            last_modified: Utc.timestamp_nanos(1_000),
+            size: 42,
+            e_tag: None,
+            version: None,
+        });
+
+        let proto = protobuf::PartitionedFile::try_from(&pf).unwrap();
+        assert_eq!(proto.path, path_str);
+
+        let pf2 = PartitionedFile::try_from(&proto).unwrap();
+        assert_eq!(pf2.object_meta.location.as_ref(), path_str);
+        assert_eq!(pf2.object_meta.location, pf.object_meta.location);
+        assert_eq!(pf2.object_meta.size, pf.object_meta.size);
+        assert_eq!(pf2.object_meta.last_modified, pf.object_meta.last_modified);
+    }
+
+    #[test]
+    fn partitioned_file_from_proto_invalid_path() {
+        let proto = protobuf::PartitionedFile {
+            path: "foo//bar".to_string(),
+            size: 1,
+            last_modified_ns: 0,
+            partition_values: vec![],
+            range: None,
+            statistics: None,
+        };
+
+        let err = PartitionedFile::try_from(&proto).unwrap_err();
+        assert!(err.to_string().contains("Invalid object_store path"));
+    }
+}
diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs
index 7a85a2a8efbd0..cf06b60d1cd05 100644
--- a/datafusion/proto/src/physical_plan/mod.rs
+++ b/datafusion/proto/src/physical_plan/mod.rs
@@ -15,83 +15,105 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::any::Any;
+use std::cell::RefCell;
+use std::collections::HashMap;
 use std::fmt::Debug;
+use std::hash::{DefaultHasher, Hash, Hasher};
 use std::sync::Arc;
 
+use arrow::compute::SortOptions;
+use arrow::datatypes::{IntervalMonthDayNanoType, Schema, SchemaRef};
+use datafusion_catalog::memory::MemorySourceConfig;
+use datafusion_common::config::CsvOptions;
+use datafusion_common::{
+    DataFusionError, Result, internal_datafusion_err, internal_err, not_impl_err,
+};
+#[cfg(feature = "parquet")]
+use datafusion_datasource::file::FileSource;
+use datafusion_datasource::file_compression_type::FileCompressionType;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use datafusion_datasource::sink::DataSinkExec;
+use datafusion_datasource::source::{DataSource, DataSourceExec};
+use datafusion_datasource_arrow::source::ArrowSource;
+#[cfg(feature = "avro")]
+use datafusion_datasource_avro::source::AvroSource;
+use datafusion_datasource_csv::file_format::CsvSink;
+use datafusion_datasource_csv::source::CsvSource;
+use datafusion_datasource_json::file_format::JsonSink;
+use datafusion_datasource_json::source::JsonSource;
+#[cfg(feature = "parquet")]
+use datafusion_datasource_parquet::CachedParquetFileReaderFactory;
+#[cfg(feature = "parquet")]
+use datafusion_datasource_parquet::file_format::ParquetSink;
+#[cfg(feature = "parquet")]
+use datafusion_datasource_parquet::source::ParquetSource;
+#[cfg(feature = "parquet")]
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_execution::{FunctionRegistry, TaskContext};
+use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF};
+use datafusion_functions_table::generate_series::{
+    Empty, GenSeriesArgs, GenerateSeriesTable, GenericSeriesState, TimestampValue,
+};
+use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
+use datafusion_physical_expr::async_scalar_function::AsyncFuncExpr;
+use datafusion_physical_expr::{LexOrdering, LexRequirement, PhysicalExprRef};
+use datafusion_physical_plan::aggregates::{
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
+};
+use datafusion_physical_plan::analyze::AnalyzeExec;
+use datafusion_physical_plan::async_func::AsyncFuncExec;
+use datafusion_physical_plan::buffer::BufferExec;
+#[expect(deprecated)]
+use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
+use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_plan::coop::CooperativeExec;
+use datafusion_physical_plan::empty::EmptyExec;
+use datafusion_physical_plan::explain::ExplainExec;
+use datafusion_physical_plan::expressions::PhysicalSortExpr;
+use datafusion_physical_plan::filter::{FilterExec, FilterExecBuilder};
+use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter};
+use datafusion_physical_plan::joins::{
+    CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
+    StreamJoinPartitionMode, SymmetricHashJoinExec,
+};
+use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion_physical_plan::memory::LazyMemoryExec;
+use datafusion_physical_plan::metrics::MetricType;
+use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
+use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use datafusion_physical_plan::union::{InterleaveExec, UnionExec};
+use datafusion_physical_plan::unnest::{ListUnnest, UnnestExec};
+use datafusion_physical_plan::windows::{BoundedWindowAggExec, WindowAggExec};
+use datafusion_physical_plan::{ExecutionPlan, InputOrderMode, PhysicalExpr, WindowExpr};
+use prost::Message;
+use prost::bytes::BufMut;
+
 use self::from_proto::parse_protobuf_partitioning;
-use self::to_proto::{serialize_partitioning, serialize_physical_expr};
+use self::to_proto::serialize_partitioning;
 use crate::common::{byte_to_string, str_to_byte};
 use crate::physical_plan::from_proto::{
-    parse_physical_expr, parse_physical_sort_expr, parse_physical_sort_exprs,
-    parse_physical_window_expr, parse_protobuf_file_scan_config,
-    parse_protobuf_file_scan_schema,
+    parse_physical_expr_with_converter, parse_physical_sort_expr,
+    parse_physical_sort_exprs, parse_physical_window_expr,
+    parse_protobuf_file_scan_config, parse_record_batches, parse_table_schema_from_proto,
 };
 use crate::physical_plan::to_proto::{
     serialize_file_scan_config, serialize_maybe_filter, serialize_physical_aggr_expr,
-    serialize_physical_window_expr,
+    serialize_physical_expr_with_converter, serialize_physical_sort_exprs,
+    serialize_physical_window_expr, serialize_record_batches,
 };
 use crate::protobuf::physical_aggregate_expr_node::AggregateFunction;
 use crate::protobuf::physical_expr_node::ExprType;
 use crate::protobuf::physical_plan_node::PhysicalPlanType;
 use crate::protobuf::{
-    self, proto_error, window_agg_exec_node, ListUnnest as ProtoListUnnest,
+    self, ListUnnest as ProtoListUnnest, SortExprNode, SortMergeJoinExecNode,
+    proto_error, window_agg_exec_node,
 };
 use crate::{convert_required, into_required};
 
-use datafusion::arrow::compute::SortOptions;
-use datafusion::arrow::datatypes::SchemaRef;
-use datafusion::datasource::file_format::csv::CsvSink;
-use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
-use datafusion::datasource::file_format::json::JsonSink;
-#[cfg(feature = "parquet")]
-use datafusion::datasource::file_format::parquet::ParquetSink;
-#[cfg(feature = "avro")]
-use datafusion::datasource::physical_plan::AvroSource;
-#[cfg(feature = "parquet")]
-use datafusion::datasource::physical_plan::ParquetSource;
-use datafusion::datasource::physical_plan::{
-    CsvSource, FileScanConfig, FileScanConfigBuilder, JsonSource,
-};
-use datafusion::datasource::sink::DataSinkExec;
-use datafusion::datasource::source::DataSourceExec;
-use datafusion::execution::runtime_env::RuntimeEnv;
-use datafusion::execution::FunctionRegistry;
-use datafusion::physical_expr::aggregate::AggregateExprBuilder;
-use datafusion::physical_expr::aggregate::AggregateFunctionExpr;
-use datafusion::physical_expr::{LexOrdering, LexRequirement, PhysicalExprRef};
-use datafusion::physical_plan::aggregates::AggregateMode;
-use datafusion::physical_plan::aggregates::{AggregateExec, PhysicalGroupBy};
-use datafusion::physical_plan::analyze::AnalyzeExec;
-use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
-use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
-use datafusion::physical_plan::empty::EmptyExec;
-use datafusion::physical_plan::explain::ExplainExec;
-use datafusion::physical_plan::expressions::PhysicalSortExpr;
-use datafusion::physical_plan::filter::FilterExec;
-use datafusion::physical_plan::joins::utils::{ColumnIndex, JoinFilter};
-use datafusion::physical_plan::joins::{
-    CrossJoinExec, NestedLoopJoinExec, StreamJoinPartitionMode, SymmetricHashJoinExec,
-};
-use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
-use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
-use datafusion::physical_plan::placeholder_row::PlaceholderRowExec;
-use datafusion::physical_plan::projection::ProjectionExec;
-use datafusion::physical_plan::repartition::RepartitionExec;
-use datafusion::physical_plan::sorts::sort::SortExec;
-use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
-use datafusion::physical_plan::union::{InterleaveExec, UnionExec};
-use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec};
-use datafusion::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec};
-use datafusion::physical_plan::{
-    ExecutionPlan, InputOrderMode, PhysicalExpr, WindowExpr,
-};
-use datafusion_common::config::TableParquetOptions;
-use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result};
-use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF};
-
-use prost::bytes::BufMut;
-use prost::Message;
-
 pub mod from_proto;
 pub mod to_proto;
 
@@ -101,7 +123,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
         Self: Sized,
     {
         protobuf::PhysicalPlanNode::decode(buf).map_err(|e| {
-            DataFusionError::Internal(format!("failed to decode physical plan: {e:?}"))
+            internal_datafusion_err!("failed to decode physical plan: {e:?}")
         })
     }
 
@@ -111,15 +133,44 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
         Self: Sized,
     {
         self.encode(buf).map_err(|e| {
-            DataFusionError::Internal(format!("failed to encode physical plan: {e:?}"))
+            internal_datafusion_err!("failed to encode physical plan: {e:?}")
         })
     }
 
     fn try_into_physical_plan(
         &self,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        self.try_into_physical_plan_with_converter(
+            ctx,
+            codec,
+            &DefaultPhysicalProtoConverter {},
+        )
+    }
+
+    fn try_from_physical_plan(
+        plan: Arc<dyn ExecutionPlan>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Self>
+    where
+        Self: Sized,
+    {
+        Self::try_from_physical_plan_with_converter(
+            plan,
+            codec,
+            &DefaultPhysicalProtoConverter {},
+        )
+    }
+}
+
+impl protobuf::PhysicalPlanNode {
+    pub fn try_into_physical_plan_with_converter(
+        &self,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let plan = self.physical_plan_type.as_ref().ok_or_else(|| {
             proto_error(format!(
@@ -127,209 +178,156 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
             ))
         })?;
         match plan {
-            PhysicalPlanType::Explain(explain) => self.try_into_explain_physical_plan(
-                explain,
-                registry,
-                runtime,
-                extension_codec,
-            ),
+            PhysicalPlanType::Explain(explain) => {
+                self.try_into_explain_physical_plan(explain, ctx, codec, proto_converter)
+            }
             PhysicalPlanType::Projection(projection) => self
                 .try_into_projection_physical_plan(
                     projection,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
-            PhysicalPlanType::Filter(filter) => self.try_into_filter_physical_plan(
-                filter,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            PhysicalPlanType::CsvScan(scan) => self.try_into_csv_scan_physical_plan(
-                scan,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            PhysicalPlanType::JsonScan(scan) => self.try_into_json_scan_physical_plan(
-                scan,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            #[cfg_attr(not(feature = "parquet"), allow(unused_variables))]
+            PhysicalPlanType::Filter(filter) => {
+                self.try_into_filter_physical_plan(filter, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::CsvScan(scan) => {
+                self.try_into_csv_scan_physical_plan(scan, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::JsonScan(scan) => {
+                self.try_into_json_scan_physical_plan(scan, ctx, codec, proto_converter)
+            }
             PhysicalPlanType::ParquetScan(scan) => self
-                .try_into_parquet_scan_physical_plan(
-                    scan,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
-            #[cfg_attr(not(feature = "avro"), allow(unused_variables))]
-            PhysicalPlanType::AvroScan(scan) => self.try_into_avro_scan_physical_plan(
-                scan,
-                registry,
-                runtime,
-                extension_codec,
-            ),
+                .try_into_parquet_scan_physical_plan(scan, ctx, codec, proto_converter),
+            PhysicalPlanType::AvroScan(scan) => {
+                self.try_into_avro_scan_physical_plan(scan, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::MemoryScan(scan) => {
+                self.try_into_memory_scan_physical_plan(scan, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::ArrowScan(scan) => {
+                self.try_into_arrow_scan_physical_plan(scan, ctx, codec, proto_converter)
+            }
             PhysicalPlanType::CoalesceBatches(coalesce_batches) => self
                 .try_into_coalesce_batches_physical_plan(
                     coalesce_batches,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
-            PhysicalPlanType::Merge(merge) => self.try_into_merge_physical_plan(
-                merge,
-                registry,
-                runtime,
-                extension_codec,
-            ),
+            PhysicalPlanType::Merge(merge) => {
+                self.try_into_merge_physical_plan(merge, ctx, codec, proto_converter)
+            }
             PhysicalPlanType::Repartition(repart) => self
-                .try_into_repartition_physical_plan(
-                    repart,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_repartition_physical_plan(repart, ctx, codec, proto_converter),
             PhysicalPlanType::GlobalLimit(limit) => self
-                .try_into_global_limit_physical_plan(
-                    limit,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_global_limit_physical_plan(limit, ctx, codec, proto_converter),
             PhysicalPlanType::LocalLimit(limit) => self
-                .try_into_local_limit_physical_plan(
-                    limit,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_local_limit_physical_plan(limit, ctx, codec, proto_converter),
             PhysicalPlanType::Window(window_agg) => self.try_into_window_physical_plan(
                 window_agg,
-                registry,
-                runtime,
-                extension_codec,
+                ctx,
+                codec,
+                proto_converter,
             ),
             PhysicalPlanType::Aggregate(hash_agg) => self
-                .try_into_aggregate_physical_plan(
-                    hash_agg,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_aggregate_physical_plan(hash_agg, ctx, codec, proto_converter),
             PhysicalPlanType::HashJoin(hashjoin) => self
-                .try_into_hash_join_physical_plan(
-                    hashjoin,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_hash_join_physical_plan(hashjoin, ctx, codec, proto_converter),
             PhysicalPlanType::SymmetricHashJoin(sym_join) => self
                 .try_into_symmetric_hash_join_physical_plan(
                     sym_join,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
-            PhysicalPlanType::Union(union) => self.try_into_union_physical_plan(
-                union,
-                registry,
-                runtime,
-                extension_codec,
-            ),
+            PhysicalPlanType::Union(union) => {
+                self.try_into_union_physical_plan(union, ctx, codec, proto_converter)
+            }
             PhysicalPlanType::Interleave(interleave) => self
                 .try_into_interleave_physical_plan(
                     interleave,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
             PhysicalPlanType::CrossJoin(crossjoin) => self
                 .try_into_cross_join_physical_plan(
                     crossjoin,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
-            PhysicalPlanType::Empty(empty) => self.try_into_empty_physical_plan(
-                empty,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            PhysicalPlanType::PlaceholderRow(placeholder) => self
-                .try_into_placeholder_row_physical_plan(
-                    placeholder,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
+            PhysicalPlanType::Empty(empty) => {
+                self.try_into_empty_physical_plan(empty, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::PlaceholderRow(placeholder) => {
+                self.try_into_placeholder_row_physical_plan(placeholder, ctx, codec)
+            }
             PhysicalPlanType::Sort(sort) => {
-                self.try_into_sort_physical_plan(sort, registry, runtime, extension_codec)
+                self.try_into_sort_physical_plan(sort, ctx, codec, proto_converter)
             }
             PhysicalPlanType::SortPreservingMerge(sort) => self
                 .try_into_sort_preserving_merge_physical_plan(
                     sort,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
             PhysicalPlanType::Extension(extension) => self
-                .try_into_extension_physical_plan(
-                    extension,
-                    registry,
-                    runtime,
-                    extension_codec,
-                ),
+                .try_into_extension_physical_plan(extension, ctx, codec, proto_converter),
             PhysicalPlanType::NestedLoopJoin(join) => self
                 .try_into_nested_loop_join_physical_plan(
                     join,
-                    registry,
-                    runtime,
-                    extension_codec,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
-            PhysicalPlanType::Analyze(analyze) => self.try_into_analyze_physical_plan(
-                analyze,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            PhysicalPlanType::JsonSink(sink) => self.try_into_json_sink_physical_plan(
-                sink,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-            PhysicalPlanType::CsvSink(sink) => self.try_into_csv_sink_physical_plan(
-                sink,
-                registry,
-                runtime,
-                extension_codec,
-            ),
-
+            PhysicalPlanType::Analyze(analyze) => {
+                self.try_into_analyze_physical_plan(analyze, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::JsonSink(sink) => {
+                self.try_into_json_sink_physical_plan(sink, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::CsvSink(sink) => {
+                self.try_into_csv_sink_physical_plan(sink, ctx, codec, proto_converter)
+            }
             #[cfg_attr(not(feature = "parquet"), allow(unused_variables))]
             PhysicalPlanType::ParquetSink(sink) => self
-                .try_into_parquet_sink_physical_plan(
-                    sink,
-                    registry,
-                    runtime,
-                    extension_codec,
+                .try_into_parquet_sink_physical_plan(sink, ctx, codec, proto_converter),
+            PhysicalPlanType::Unnest(unnest) => {
+                self.try_into_unnest_physical_plan(unnest, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::Cooperative(cooperative) => self
+                .try_into_cooperative_physical_plan(
+                    cooperative,
+                    ctx,
+                    codec,
+                    proto_converter,
                 ),
-            PhysicalPlanType::Unnest(unnest) => self.try_into_unnest_physical_plan(
-                unnest,
-                registry,
-                runtime,
-                extension_codec,
-            ),
+            PhysicalPlanType::GenerateSeries(generate_series) => {
+                self.try_into_generate_series_physical_plan(generate_series)
+            }
+            PhysicalPlanType::SortMergeJoin(sort_join) => {
+                self.try_into_sort_join(sort_join, ctx, codec, proto_converter)
+            }
+            PhysicalPlanType::AsyncFunc(async_func) => self
+                .try_into_async_func_physical_plan(
+                    async_func,
+                    ctx,
+                    codec,
+                    proto_converter,
+                ),
+            PhysicalPlanType::Buffer(buffer) => {
+                self.try_into_buffer_physical_plan(buffer, ctx, codec, proto_converter)
+            }
         }
     }
 
-    fn try_from_physical_plan(
+    pub fn try_from_physical_plan_with_converter(
         plan: Arc<dyn ExecutionPlan>,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self>
     where
         Self: Sized,
@@ -338,192 +336,251 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
         let plan = plan.as_any();
 
         if let Some(exec) = plan.downcast_ref::<ExplainExec>() {
-            return protobuf::PhysicalPlanNode::try_from_explain_exec(
-                exec,
-                extension_codec,
-            );
+            return protobuf::PhysicalPlanNode::try_from_explain_exec(exec, codec);
         }
 
         if let Some(exec) = plan.downcast_ref::<ProjectionExec>() {
             return protobuf::PhysicalPlanNode::try_from_projection_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<AnalyzeExec>() {
             return protobuf::PhysicalPlanNode::try_from_analyze_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<FilterExec>() {
             return protobuf::PhysicalPlanNode::try_from_filter_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(limit) = plan.downcast_ref::<GlobalLimitExec>() {
             return protobuf::PhysicalPlanNode::try_from_global_limit_exec(
                 limit,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(limit) = plan.downcast_ref::<LocalLimitExec>() {
             return protobuf::PhysicalPlanNode::try_from_local_limit_exec(
                 limit,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<HashJoinExec>() {
             return protobuf::PhysicalPlanNode::try_from_hash_join_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<SymmetricHashJoinExec>() {
             return protobuf::PhysicalPlanNode::try_from_symmetric_hash_join_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
+            );
+        }
+
+        if let Some(exec) = plan.downcast_ref::<SortMergeJoinExec>() {
+            return protobuf::PhysicalPlanNode::try_from_sort_merge_join_exec(
+                exec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<CrossJoinExec>() {
             return protobuf::PhysicalPlanNode::try_from_cross_join_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<AggregateExec>() {
             return protobuf::PhysicalPlanNode::try_from_aggregate_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(empty) = plan.downcast_ref::<EmptyExec>() {
-            return protobuf::PhysicalPlanNode::try_from_empty_exec(
-                empty,
-                extension_codec,
-            );
+            return protobuf::PhysicalPlanNode::try_from_empty_exec(empty, codec);
         }
 
         if let Some(empty) = plan.downcast_ref::<PlaceholderRowExec>() {
             return protobuf::PhysicalPlanNode::try_from_placeholder_row_exec(
-                empty,
-                extension_codec,
+                empty, codec,
             );
         }
 
+        #[expect(deprecated)]
         if let Some(coalesce_batches) = plan.downcast_ref::<CoalesceBatchesExec>() {
             return protobuf::PhysicalPlanNode::try_from_coalesce_batches_exec(
                 coalesce_batches,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
-        if let Some(data_source_exec) = plan.downcast_ref::<DataSourceExec>() {
-            if let Some(node) = protobuf::PhysicalPlanNode::try_from_data_source_exec(
+        if let Some(data_source_exec) = plan.downcast_ref::<DataSourceExec>()
+            && let Some(node) = protobuf::PhysicalPlanNode::try_from_data_source_exec(
                 data_source_exec,
-                extension_codec,
-            )? {
-                return Ok(node);
-            }
+                codec,
+                proto_converter,
+            )?
+        {
+            return Ok(node);
         }
 
         if let Some(exec) = plan.downcast_ref::<CoalescePartitionsExec>() {
             return protobuf::PhysicalPlanNode::try_from_coalesce_partitions_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<RepartitionExec>() {
             return protobuf::PhysicalPlanNode::try_from_repartition_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<SortExec>() {
-            return protobuf::PhysicalPlanNode::try_from_sort_exec(exec, extension_codec);
+            return protobuf::PhysicalPlanNode::try_from_sort_exec(
+                exec,
+                codec,
+                proto_converter,
+            );
         }
 
         if let Some(union) = plan.downcast_ref::<UnionExec>() {
             return protobuf::PhysicalPlanNode::try_from_union_exec(
                 union,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(interleave) = plan.downcast_ref::<InterleaveExec>() {
             return protobuf::PhysicalPlanNode::try_from_interleave_exec(
                 interleave,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<SortPreservingMergeExec>() {
             return protobuf::PhysicalPlanNode::try_from_sort_preserving_merge_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<NestedLoopJoinExec>() {
             return protobuf::PhysicalPlanNode::try_from_nested_loop_join_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<WindowAggExec>() {
             return protobuf::PhysicalPlanNode::try_from_window_agg_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
         if let Some(exec) = plan.downcast_ref::<BoundedWindowAggExec>() {
             return protobuf::PhysicalPlanNode::try_from_bounded_window_agg_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
             );
         }
 
-        if let Some(exec) = plan.downcast_ref::<DataSinkExec>() {
-            if let Some(node) = protobuf::PhysicalPlanNode::try_from_data_sink_exec(
+        if let Some(exec) = plan.downcast_ref::<DataSinkExec>()
+            && let Some(node) = protobuf::PhysicalPlanNode::try_from_data_sink_exec(
                 exec,
-                extension_codec,
-            )? {
-                return Ok(node);
-            }
+                codec,
+                proto_converter,
+            )?
+        {
+            return Ok(node);
         }
 
         if let Some(exec) = plan.downcast_ref::<UnnestExec>() {
             return protobuf::PhysicalPlanNode::try_from_unnest_exec(
                 exec,
-                extension_codec,
+                codec,
+                proto_converter,
+            );
+        }
+
+        if let Some(exec) = plan.downcast_ref::<CooperativeExec>() {
+            return protobuf::PhysicalPlanNode::try_from_cooperative_exec(
+                exec,
+                codec,
+                proto_converter,
+            );
+        }
+
+        if let Some(exec) = plan.downcast_ref::<LazyMemoryExec>()
+            && let Some(node) =
+                protobuf::PhysicalPlanNode::try_from_lazy_memory_exec(exec)?
+        {
+            return Ok(node);
+        }
+
+        if let Some(exec) = plan.downcast_ref::<AsyncFuncExec>() {
+            return protobuf::PhysicalPlanNode::try_from_async_func_exec(
+                exec,
+                codec,
+                proto_converter,
+            );
+        }
+
+        if let Some(exec) = plan.downcast_ref::<BufferExec>() {
+            return protobuf::PhysicalPlanNode::try_from_buffer_exec(
+                exec,
+                codec,
+                proto_converter,
             );
         }
 
         let mut buf: Vec<u8> = vec![];
-        match extension_codec.try_encode(Arc::clone(&plan_clone), &mut buf) {
+        match codec.try_encode(Arc::clone(&plan_clone), &mut buf) {
             Ok(_) => {
                 let inputs: Vec<protobuf::PhysicalPlanNode> = plan_clone
                     .children()
                     .into_iter()
                     .cloned()
                     .map(|i| {
-                        protobuf::PhysicalPlanNode::try_from_physical_plan(
+                        protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
                             i,
-                            extension_codec,
+                            codec,
+                            proto_converter,
                         )
                     })
                     .collect::<Result<_>>()?;
@@ -545,9 +602,10 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_explain_physical_plan(
         &self,
         explain: &protobuf::ExplainExecNode,
-        _registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _ctx: &TaskContext,
+
+        _codec: &dyn PhysicalExtensionCodec,
+        _proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         Ok(Arc::new(ExplainExec::new(
             Arc::new(explain.schema.as_ref().unwrap().try_into()?),
@@ -563,57 +621,65 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_projection_physical_plan(
         &self,
         projection: &protobuf::ProjectionExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&projection.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&projection.input, ctx, codec, proto_converter)?;
         let exprs = projection
             .expr
             .iter()
             .zip(projection.expr_name.iter())
             .map(|(expr, name)| {
                 Ok((
-                    parse_physical_expr(
+                    proto_converter.proto_to_physical_expr(
                         expr,
-                        registry,
+                        ctx,
                         input.schema().as_ref(),
-                        extension_codec,
+                        codec,
                     )?,
                     name.to_string(),
                 ))
             })
             .collect::<Result<Vec<(Arc<dyn PhysicalExpr>, String)>>>()?;
-        Ok(Arc::new(ProjectionExec::try_new(exprs, input)?))
+        let proj_exprs: Vec<ProjectionExpr> = exprs
+            .into_iter()
+            .map(|(expr, alias)| ProjectionExpr { expr, alias })
+            .collect();
+        Ok(Arc::new(ProjectionExec::try_new(proj_exprs, input)?))
     }
 
     fn try_into_filter_physical_plan(
         &self,
         filter: &protobuf::FilterExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&filter.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&filter.input, ctx, codec, proto_converter)?;
+
         let predicate = filter
             .expr
             .as_ref()
             .map(|expr| {
-                parse_physical_expr(
+                proto_converter.proto_to_physical_expr(
                     expr,
-                    registry,
+                    ctx,
                     input.schema().as_ref(),
-                    extension_codec,
+                    codec,
                 )
             })
             .transpose()?
             .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "filter (FilterExecNode) in PhysicalPlanNode is missing.".to_owned(),
+                internal_datafusion_err!(
+                    "filter (FilterExecNode) in PhysicalPlanNode is missing."
                 )
             })?;
+
         let filter_selectivity = filter.default_filter_selectivity.try_into();
         let projection = if !filter.projection.is_empty() {
             Some(
@@ -626,14 +692,18 @@ impl protobuf::PhysicalPlanNode {
         } else {
             None
         };
-        let filter =
-            FilterExec::try_new(predicate, input)?.with_projection(projection)?;
+
+        let filter = FilterExecBuilder::new(predicate, input)
+            .apply_projection(projection)?
+            .with_batch_size(filter.batch_size as usize)
+            .with_fetch(filter.fetch.map(|f| f as usize))
+            .build()?;
         match filter_selectivity {
             Ok(filter_selectivity) => Ok(Arc::new(
                 filter.with_default_selectivity(filter_selectivity)?,
             )),
-            Err(_) => Err(DataFusionError::Internal(
-                "filter_selectivity in PhysicalPlanNode is invalid ".to_owned(),
+            Err(_) => Err(internal_datafusion_err!(
+                "filter_selectivity in PhysicalPlanNode is invalid "
             )),
         }
     }
@@ -641,9 +711,10 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_csv_scan_physical_plan(
         &self,
         scan: &protobuf::CsvScanExecNode,
-        registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let escape =
             if let Some(protobuf::csv_scan_exec_node::OptionalEscape::Escape(escape)) =
@@ -663,23 +734,31 @@ impl protobuf::PhysicalPlanNode {
             None
         };
 
+        // Parse table schema with partition columns
+        let table_schema =
+            parse_table_schema_from_proto(scan.base_conf.as_ref().unwrap())?;
+
+        let csv_options = CsvOptions {
+            has_header: Some(scan.has_header),
+            delimiter: str_to_byte(&scan.delimiter, "delimiter")?,
+            quote: str_to_byte(&scan.quote, "quote")?,
+            newlines_in_values: Some(scan.newlines_in_values),
+            ..Default::default()
+        };
         let source = Arc::new(
-            CsvSource::new(
-                scan.has_header,
-                str_to_byte(&scan.delimiter, "delimiter")?,
-                0,
-            )
-            .with_escape(escape)
-            .with_comment(comment),
+            CsvSource::new(table_schema)
+                .with_csv_options(csv_options)
+                .with_escape(escape)
+                .with_comment(comment),
         );
 
         let conf = FileScanConfigBuilder::from(parse_protobuf_file_scan_config(
             scan.base_conf.as_ref().unwrap(),
-            registry,
-            extension_codec,
+            ctx,
+            codec,
+            proto_converter,
             source,
         )?)
-        .with_newlines_in_values(scan.newlines_in_values)
         .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
         .build();
         Ok(DataSourceExec::from_data_source(conf))
@@ -688,96 +767,213 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_json_scan_physical_plan(
         &self,
         scan: &protobuf::JsonScanExecNode,
-        registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
+        let base_conf = scan.base_conf.as_ref().unwrap();
+        let table_schema = parse_table_schema_from_proto(base_conf)?;
         let scan_conf = parse_protobuf_file_scan_config(
-            scan.base_conf.as_ref().unwrap(),
-            registry,
-            extension_codec,
-            Arc::new(JsonSource::new()),
+            base_conf,
+            ctx,
+            codec,
+            proto_converter,
+            Arc::new(JsonSource::new(table_schema)),
+        )?;
+        Ok(DataSourceExec::from_data_source(scan_conf))
+    }
+
+    fn try_into_arrow_scan_physical_plan(
+        &self,
+        scan: &protobuf::ArrowScanExecNode,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let base_conf = scan.base_conf.as_ref().ok_or_else(|| {
+            internal_datafusion_err!("base_conf in ArrowScanExecNode is missing.")
+        })?;
+        let table_schema = parse_table_schema_from_proto(base_conf)?;
+        let scan_conf = parse_protobuf_file_scan_config(
+            base_conf,
+            ctx,
+            codec,
+            proto_converter,
+            Arc::new(ArrowSource::new_file_source(table_schema)),
         )?;
         Ok(DataSourceExec::from_data_source(scan_conf))
     }
 
-    #[cfg_attr(not(feature = "parquet"), allow(unused_variables))]
+    #[cfg_attr(not(feature = "parquet"), expect(unused_variables))]
     fn try_into_parquet_scan_physical_plan(
         &self,
         scan: &protobuf::ParquetScanExecNode,
-        registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         #[cfg(feature = "parquet")]
         {
-            let schema =
-                parse_protobuf_file_scan_schema(scan.base_conf.as_ref().unwrap())?;
+            let schema = from_proto::parse_protobuf_file_scan_schema(
+                scan.base_conf.as_ref().unwrap(),
+            )?;
+
+            // Check if there's a projection and use projected schema for predicate parsing
+            let base_conf = scan.base_conf.as_ref().unwrap();
+            let predicate_schema = if !base_conf.projection.is_empty() {
+                // Create projected schema for parsing the predicate
+                let projected_fields: Vec<_> = base_conf
+                    .projection
+                    .iter()
+                    .map(|&i| schema.field(i as usize).clone())
+                    .collect();
+                Arc::new(Schema::new(projected_fields))
+            } else {
+                schema
+            };
+
             let predicate = scan
                 .predicate
                 .as_ref()
                 .map(|expr| {
-                    parse_physical_expr(expr, registry, schema.as_ref(), extension_codec)
+                    proto_converter.proto_to_physical_expr(
+                        expr,
+                        ctx,
+                        predicate_schema.as_ref(),
+                        codec,
+                    )
                 })
                 .transpose()?;
-            let mut options = TableParquetOptions::default();
+            let mut options = datafusion_common::config::TableParquetOptions::default();
 
             if let Some(table_options) = scan.parquet_options.as_ref() {
                 options = table_options.try_into()?;
             }
-            let mut source = ParquetSource::new(options);
+
+            // Parse table schema with partition columns
+            let table_schema = parse_table_schema_from_proto(base_conf)?;
+            let object_store_url = match base_conf.object_store_url.is_empty() {
+                false => ObjectStoreUrl::parse(&base_conf.object_store_url)?,
+                true => ObjectStoreUrl::local_filesystem(),
+            };
+            let store = ctx.runtime_env().object_store(object_store_url)?;
+            let metadata_cache =
+                ctx.runtime_env().cache_manager.get_file_metadata_cache();
+            let reader_factory =
+                Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
+
+            let mut source = ParquetSource::new(table_schema)
+                .with_parquet_file_reader_factory(reader_factory)
+                .with_table_parquet_options(options);
 
             if let Some(predicate) = predicate {
                 source = source.with_predicate(predicate);
             }
             let base_config = parse_protobuf_file_scan_config(
-                scan.base_conf.as_ref().unwrap(),
-                registry,
-                extension_codec,
+                base_conf,
+                ctx,
+                codec,
+                proto_converter,
                 Arc::new(source),
             )?;
             Ok(DataSourceExec::from_data_source(base_config))
         }
         #[cfg(not(feature = "parquet"))]
-        panic!("Unable to process a Parquet PhysicalPlan when `parquet` feature is not enabled")
+        panic!(
+            "Unable to process a Parquet PhysicalPlan when `parquet` feature is not enabled"
+        )
     }
 
-    #[cfg_attr(not(feature = "avro"), allow(unused_variables))]
+    #[cfg_attr(not(feature = "avro"), expect(unused_variables))]
     fn try_into_avro_scan_physical_plan(
         &self,
         scan: &protobuf::AvroScanExecNode,
-        registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         #[cfg(feature = "avro")]
         {
+            let table_schema =
+                parse_table_schema_from_proto(scan.base_conf.as_ref().unwrap())?;
             let conf = parse_protobuf_file_scan_config(
                 scan.base_conf.as_ref().unwrap(),
-                registry,
-                extension_codec,
-                Arc::new(AvroSource::new()),
+                ctx,
+                codec,
+                proto_converter,
+                Arc::new(AvroSource::new(table_schema)),
             )?;
             Ok(DataSourceExec::from_data_source(conf))
         }
+
         #[cfg(not(feature = "avro"))]
         panic!("Unable to process a Avro PhysicalPlan when `avro` feature is not enabled")
     }
 
+    fn try_into_memory_scan_physical_plan(
+        &self,
+        scan: &protobuf::MemoryScanExecNode,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let partitions = scan
+            .partitions
+            .iter()
+            .map(|p| parse_record_batches(p))
+            .collect::<Result<Vec<_>>>()?;
+
+        let proto_schema = scan.schema.as_ref().ok_or_else(|| {
+            internal_datafusion_err!("schema in MemoryScanExecNode is missing.")
+        })?;
+        let schema: SchemaRef = SchemaRef::new(proto_schema.try_into()?);
+
+        let projection = if !scan.projection.is_empty() {
+            Some(
+                scan.projection
+                    .iter()
+                    .map(|i| *i as usize)
+                    .collect::<Vec<_>>(),
+            )
+        } else {
+            None
+        };
+
+        let mut sort_information = vec![];
+        for ordering in &scan.sort_information {
+            let sort_exprs = parse_physical_sort_exprs(
+                &ordering.physical_sort_expr_nodes,
+                ctx,
+                &schema,
+                codec,
+                proto_converter,
+            )?;
+            sort_information.extend(LexOrdering::new(sort_exprs));
+        }
+
+        let source = MemorySourceConfig::try_new(&partitions, schema, projection)?
+            .with_limit(scan.fetch.map(|f| f as usize))
+            .with_show_sizes(scan.show_sizes);
+
+        let source = source.try_with_sort_information(sort_information)?;
+
+        Ok(DataSourceExec::from_data_source(source))
+    }
+
     fn try_into_coalesce_batches_physical_plan(
         &self,
         coalesce_batches: &protobuf::CoalesceBatchesExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input: Arc<dyn ExecutionPlan> = into_physical_plan(
-            &coalesce_batches.input,
-            registry,
-            runtime,
-            extension_codec,
-        )?;
+        let input: Arc<dyn ExecutionPlan> =
+            into_physical_plan(&coalesce_batches.input, ctx, codec, proto_converter)?;
         Ok(Arc::new(
+            #[expect(deprecated)]
             CoalesceBatchesExec::new(input, coalesce_batches.target_batch_size as usize)
                 .with_fetch(coalesce_batches.fetch.map(|f| f as usize)),
         ))
@@ -786,12 +982,13 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_merge_physical_plan(
         &self,
         merge: &protobuf::CoalescePartitionsExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&merge.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&merge.input, ctx, codec, proto_converter)?;
         Ok(Arc::new(
             CoalescePartitionsExec::new(input)
                 .with_fetch(merge.fetch.map(|f| f as usize)),
@@ -801,33 +998,37 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_repartition_physical_plan(
         &self,
         repart: &protobuf::RepartitionExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&repart.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&repart.input, ctx, codec, proto_converter)?;
         let partitioning = parse_protobuf_partitioning(
             repart.partitioning.as_ref(),
-            registry,
+            ctx,
             input.schema().as_ref(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        Ok(Arc::new(RepartitionExec::try_new(
-            input,
-            partitioning.unwrap(),
-        )?))
+        let mut repart_exec = RepartitionExec::try_new(input, partitioning.unwrap())?;
+        if repart.preserve_order {
+            repart_exec = repart_exec.with_preserve_order();
+        }
+        Ok(Arc::new(repart_exec))
     }
 
     fn try_into_global_limit_physical_plan(
         &self,
         limit: &protobuf::GlobalLimitExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&limit.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&limit.input, ctx, codec, proto_converter)?;
         let fetch = if limit.fetch >= 0 {
             Some(limit.fetch as usize)
         } else {
@@ -843,24 +1044,26 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_local_limit_physical_plan(
         &self,
         limit: &protobuf::LocalLimitExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&limit.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&limit.input, ctx, codec, proto_converter)?;
         Ok(Arc::new(LocalLimitExec::new(input, limit.fetch as usize)))
     }
 
     fn try_into_window_physical_plan(
         &self,
         window_agg: &protobuf::WindowAggExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&window_agg.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&window_agg.input, ctx, codec, proto_converter)?;
         let input_schema = input.schema();
 
         let physical_window_expr: Vec<Arc<dyn WindowExpr>> = window_agg
@@ -869,9 +1072,10 @@ impl protobuf::PhysicalPlanNode {
             .map(|window_expr| {
                 parse_physical_window_expr(
                     window_expr,
-                    registry,
+                    ctx,
                     input_schema.as_ref(),
-                    extension_codec,
+                    codec,
+                    proto_converter,
                 )
             })
             .collect::<Result<Vec<_>, _>>()?;
@@ -880,11 +1084,11 @@ impl protobuf::PhysicalPlanNode {
             .partition_keys
             .iter()
             .map(|expr| {
-                parse_physical_expr(
+                proto_converter.proto_to_physical_expr(
                     expr,
-                    registry,
+                    ctx,
                     input.schema().as_ref(),
-                    extension_codec,
+                    codec,
                 )
             })
             .collect::<Result<Vec<Arc<dyn PhysicalExpr>>>>()?;
@@ -918,12 +1122,13 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_aggregate_physical_plan(
         &self,
         hash_agg: &protobuf::AggregateExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&hash_agg.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&hash_agg.input, ctx, codec, proto_converter)?;
         let mode = protobuf::AggregateMode::try_from(hash_agg.mode).map_err(|_| {
             proto_error(format!(
                 "Received a AggregateNode message with unknown AggregateMode {}",
@@ -938,6 +1143,7 @@ impl protobuf::PhysicalPlanNode {
             protobuf::AggregateMode::SinglePartitioned => {
                 AggregateMode::SinglePartitioned
             }
+            protobuf::AggregateMode::PartialReduce => AggregateMode::PartialReduce,
         };
 
         let num_expr = hash_agg.group_expr.len();
@@ -947,13 +1153,9 @@ impl protobuf::PhysicalPlanNode {
             .iter()
             .zip(hash_agg.group_expr_name.iter())
             .map(|(expr, name)| {
-                parse_physical_expr(
-                    expr,
-                    registry,
-                    input.schema().as_ref(),
-                    extension_codec,
-                )
-                .map(|expr| (expr, name.to_string()))
+                proto_converter
+                    .proto_to_physical_expr(expr, ctx, input.schema().as_ref(), codec)
+                    .map(|expr| (expr, name.to_string()))
             })
             .collect::<Result<Vec<_>, _>>()?;
 
@@ -962,13 +1164,9 @@ impl protobuf::PhysicalPlanNode {
             .iter()
             .zip(hash_agg.group_expr_name.iter())
             .map(|(expr, name)| {
-                parse_physical_expr(
-                    expr,
-                    registry,
-                    input.schema().as_ref(),
-                    extension_codec,
-                )
-                .map(|expr| (expr, name.to_string()))
+                proto_converter
+                    .proto_to_physical_expr(expr, ctx, input.schema().as_ref(), codec)
+                    .map(|expr| (expr, name.to_string()))
             })
             .collect::<Result<Vec<_>, _>>()?;
 
@@ -982,10 +1180,10 @@ impl protobuf::PhysicalPlanNode {
             vec![]
         };
 
+        let has_grouping_set = hash_agg.has_grouping_set;
+
         let input_schema = hash_agg.input_schema.as_ref().ok_or_else(|| {
-            DataFusionError::Internal(
-                "input_schema in AggregateNode is missing.".to_owned(),
-            )
+            internal_datafusion_err!("input_schema in AggregateNode is missing.")
         })?;
         let physical_schema: SchemaRef = SchemaRef::new(input_schema.try_into()?);
 
@@ -996,11 +1194,11 @@ impl protobuf::PhysicalPlanNode {
                 expr.expr
                     .as_ref()
                     .map(|e| {
-                        parse_physical_expr(
+                        proto_converter.proto_to_physical_expr(
                             e,
-                            registry,
+                            ctx,
                             &physical_schema,
-                            extension_codec,
+                            codec,
                         )
                     })
                     .transpose()
@@ -1022,48 +1220,48 @@ impl protobuf::PhysicalPlanNode {
                             .expr
                             .iter()
                             .map(|e| {
-                                parse_physical_expr(
+                                proto_converter.proto_to_physical_expr(
                                     e,
-                                    registry,
+                                    ctx,
                                     &physical_schema,
-                                    extension_codec,
+                                    codec,
                                 )
                             })
                             .collect::<Result<Vec<_>>>()?;
-                        let ordering_req: LexOrdering = agg_node
+                        let order_bys = agg_node
                             .ordering_req
                             .iter()
                             .map(|e| {
                                 parse_physical_sort_expr(
                                     e,
-                                    registry,
+                                    ctx,
                                     &physical_schema,
-                                    extension_codec,
+                                    codec,
+                                    proto_converter,
                                 )
                             })
-                            .collect::<Result<LexOrdering>>()?;
+                            .collect::<Result<_>>()?;
                         agg_node
                             .aggregate_function
                             .as_ref()
                             .map(|func| match func {
                                 AggregateFunction::UserDefinedAggrFunction(udaf_name) => {
                                     let agg_udf = match &agg_node.fun_definition {
-                                        Some(buf) => extension_codec
-                                            .try_decode_udaf(udaf_name, buf)?,
-                                        None => {
-                                            registry.udaf(udaf_name).or_else(|_| {
-                                                extension_codec
-                                                    .try_decode_udaf(udaf_name, &[])
-                                            })?
+                                        Some(buf) => {
+                                            codec.try_decode_udaf(udaf_name, buf)?
                                         }
+                                        None => ctx.udaf(udaf_name).or_else(|_| {
+                                            codec.try_decode_udaf(udaf_name, &[])
+                                        })?,
                                     };
 
                                     AggregateExprBuilder::new(agg_udf, input_phy_expr)
                                         .schema(Arc::clone(&physical_schema))
                                         .alias(name)
+                                        .human_display(agg_node.human_display.clone())
                                         .with_ignore_nulls(agg_node.ignore_nulls)
                                         .with_distinct(agg_node.distinct)
-                                        .order_by(ordering_req)
+                                        .order_by(order_bys)
                                         .build()
                                         .map(Arc::new)
                                 }
@@ -1080,21 +1278,25 @@ impl protobuf::PhysicalPlanNode {
             })
             .collect::<Result<Vec<_>, _>>()?;
 
-        let limit = hash_agg
-            .limit
-            .as_ref()
-            .map(|lit_value| lit_value.limit as usize);
-
         let agg = AggregateExec::try_new(
             agg_mode,
-            PhysicalGroupBy::new(group_expr, null_expr, groups),
+            PhysicalGroupBy::new(group_expr, null_expr, groups, has_grouping_set),
             physical_aggr_expr,
             physical_filter_expr,
             input,
             physical_schema,
         )?;
 
-        let agg = agg.with_limit(limit);
+        let agg = if let Some(limit_proto) = &hash_agg.limit {
+            let limit = limit_proto.limit as usize;
+            let limit_options = match limit_proto.descending {
+                Some(descending) => LimitOptions::new_with_order(limit, descending),
+                None => LimitOptions::new(limit),
+            };
+            agg.with_limit_options(Some(limit_options))
+        } else {
+            agg
+        };
 
         Ok(Arc::new(agg))
     }
@@ -1102,31 +1304,32 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_hash_join_physical_plan(
         &self,
         hashjoin: &protobuf::HashJoinExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let left: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&hashjoin.left, registry, runtime, extension_codec)?;
+            into_physical_plan(&hashjoin.left, ctx, codec, proto_converter)?;
         let right: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&hashjoin.right, registry, runtime, extension_codec)?;
+            into_physical_plan(&hashjoin.right, ctx, codec, proto_converter)?;
         let left_schema = left.schema();
         let right_schema = right.schema();
         let on: Vec<(PhysicalExprRef, PhysicalExprRef)> = hashjoin
             .on
             .iter()
             .map(|col| {
-                let left = parse_physical_expr(
+                let left = proto_converter.proto_to_physical_expr(
                     &col.left.clone().unwrap(),
-                    registry,
+                    ctx,
                     left_schema.as_ref(),
-                    extension_codec,
+                    codec,
                 )?;
-                let right = parse_physical_expr(
+                let right = proto_converter.proto_to_physical_expr(
                     &col.right.clone().unwrap(),
-                    registry,
+                    ctx,
                     right_schema.as_ref(),
-                    extension_codec,
+                    codec,
                 )?;
                 Ok((left, right))
             })
@@ -1138,6 +1341,13 @@ impl protobuf::PhysicalPlanNode {
                     hashjoin.join_type
                 ))
             })?;
+        let null_equality = protobuf::NullEquality::try_from(hashjoin.null_equality)
+            .map_err(|_| {
+                proto_error(format!(
+                    "Received a HashJoinNode message with unknown NullEquality {}",
+                    hashjoin.null_equality
+                ))
+            })?;
         let filter = hashjoin
             .filter
             .as_ref()
@@ -1148,12 +1358,12 @@ impl protobuf::PhysicalPlanNode {
                     .ok_or_else(|| proto_error("Missing JoinFilter schema"))?
                     .try_into()?;
 
-                let expression = parse_physical_expr(
+                let expression = proto_converter.proto_to_physical_expr(
                     f.expression.as_ref().ok_or_else(|| {
                         proto_error("Unexpected empty filter expression")
                     })?,
-                    registry, &schema,
-                    extension_codec,
+                    ctx, &schema,
+                    codec,
                 )?;
                 let column_indices = f.column_indices
                     .iter()
@@ -1206,38 +1416,38 @@ impl protobuf::PhysicalPlanNode {
             &join_type.into(),
             projection,
             partition_mode,
-            hashjoin.null_equals_null,
+            null_equality.into(),
+            hashjoin.null_aware,
         )?))
     }
 
     fn try_into_symmetric_hash_join_physical_plan(
         &self,
         sym_join: &protobuf::SymmetricHashJoinExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let left =
-            into_physical_plan(&sym_join.left, registry, runtime, extension_codec)?;
-        let right =
-            into_physical_plan(&sym_join.right, registry, runtime, extension_codec)?;
+        let left = into_physical_plan(&sym_join.left, ctx, codec, proto_converter)?;
+        let right = into_physical_plan(&sym_join.right, ctx, codec, proto_converter)?;
         let left_schema = left.schema();
         let right_schema = right.schema();
         let on = sym_join
             .on
             .iter()
             .map(|col| {
-                let left = parse_physical_expr(
+                let left = proto_converter.proto_to_physical_expr(
                     &col.left.clone().unwrap(),
-                    registry,
+                    ctx,
                     left_schema.as_ref(),
-                    extension_codec,
+                    codec,
                 )?;
-                let right = parse_physical_expr(
+                let right = proto_converter.proto_to_physical_expr(
                     &col.right.clone().unwrap(),
-                    registry,
+                    ctx,
                     right_schema.as_ref(),
-                    extension_codec,
+                    codec,
                 )?;
                 Ok((left, right))
             })
@@ -1249,6 +1459,13 @@ impl protobuf::PhysicalPlanNode {
                     sym_join.join_type
                 ))
             })?;
+        let null_equality = protobuf::NullEquality::try_from(sym_join.null_equality)
+            .map_err(|_| {
+                proto_error(format!(
+                    "Received a SymmetricHashJoin message with unknown NullEquality {}",
+                    sym_join.null_equality
+                ))
+            })?;
         let filter = sym_join
             .filter
             .as_ref()
@@ -1259,12 +1476,12 @@ impl protobuf::PhysicalPlanNode {
                     .ok_or_else(|| proto_error("Missing JoinFilter schema"))?
                     .try_into()?;
 
-                let expression = parse_physical_expr(
+                let expression = proto_converter.proto_to_physical_expr(
                     f.expression.as_ref().ok_or_else(|| {
                         proto_error("Unexpected empty filter expression")
                     })?,
-                    registry, &schema,
-                    extension_codec,
+                    ctx, &schema,
+                    codec,
                 )?;
                 let column_indices = f.column_indices
                     .iter()
@@ -1288,27 +1505,21 @@ impl protobuf::PhysicalPlanNode {
 
         let left_sort_exprs = parse_physical_sort_exprs(
             &sym_join.left_sort_exprs,
-            registry,
+            ctx,
             &left_schema,
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let left_sort_exprs = if left_sort_exprs.is_empty() {
-            None
-        } else {
-            Some(left_sort_exprs)
-        };
+        let left_sort_exprs = LexOrdering::new(left_sort_exprs);
 
         let right_sort_exprs = parse_physical_sort_exprs(
             &sym_join.right_sort_exprs,
-            registry,
+            ctx,
             &right_schema,
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let right_sort_exprs = if right_sort_exprs.is_empty() {
-            None
-        } else {
-            Some(right_sort_exprs)
-        };
+        let right_sort_exprs = LexOrdering::new(right_sort_exprs);
 
         let partition_mode = protobuf::StreamPartitionMode::try_from(
             sym_join.partition_mode,
@@ -1333,7 +1544,7 @@ impl protobuf::PhysicalPlanNode {
             on,
             filter,
             &join_type.into(),
-            sym_join.null_equals_null,
+            null_equality.into(),
             left_sort_exprs,
             right_sort_exprs,
             partition_mode,
@@ -1344,35 +1555,29 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_union_physical_plan(
         &self,
         union: &protobuf::UnionExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let mut inputs: Vec<Arc<dyn ExecutionPlan>> = vec![];
         for input in &union.inputs {
-            inputs.push(input.try_into_physical_plan(
-                registry,
-                runtime,
-                extension_codec,
-            )?);
+            inputs.push(proto_converter.proto_to_execution_plan(ctx, codec, input)?);
         }
-        Ok(Arc::new(UnionExec::new(inputs)))
+        UnionExec::try_new(inputs)
     }
 
     fn try_into_interleave_physical_plan(
         &self,
         interleave: &protobuf::InterleaveExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let mut inputs: Vec<Arc<dyn ExecutionPlan>> = vec![];
         for input in &interleave.inputs {
-            inputs.push(input.try_into_physical_plan(
-                registry,
-                runtime,
-                extension_codec,
-            )?);
+            inputs.push(proto_converter.proto_to_execution_plan(ctx, codec, input)?);
         }
         Ok(Arc::new(InterleaveExec::try_new(inputs)?))
     }
@@ -1380,23 +1585,25 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_cross_join_physical_plan(
         &self,
         crossjoin: &protobuf::CrossJoinExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let left: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&crossjoin.left, registry, runtime, extension_codec)?;
+            into_physical_plan(&crossjoin.left, ctx, codec, proto_converter)?;
         let right: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&crossjoin.right, registry, runtime, extension_codec)?;
+            into_physical_plan(&crossjoin.right, ctx, codec, proto_converter)?;
         Ok(Arc::new(CrossJoinExec::new(left, right)))
     }
 
     fn try_into_empty_physical_plan(
         &self,
         empty: &protobuf::EmptyExecNode,
-        _registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _ctx: &TaskContext,
+
+        _codec: &dyn PhysicalExtensionCodec,
+        _proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let schema = Arc::new(convert_required!(empty.schema)?);
         Ok(Arc::new(EmptyExec::new(schema)))
@@ -1405,9 +1612,9 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_placeholder_row_physical_plan(
         &self,
         placeholder: &protobuf::PlaceholderRowExecNode,
-        _registry: &dyn FunctionRegistry,
-        _runtime: &RuntimeEnv,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _ctx: &TaskContext,
+
+        _codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let schema = Arc::new(convert_required!(placeholder.schema)?);
         Ok(Arc::new(PlaceholderRowExec::new(schema)))
@@ -1416,51 +1623,50 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_sort_physical_plan(
         &self,
         sort: &protobuf::SortExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&sort.input, registry, runtime, extension_codec)?;
+        let input = into_physical_plan(&sort.input, ctx, codec, proto_converter)?;
         let exprs = sort
-                    .expr
-                    .iter()
-                    .map(|expr| {
-                        let expr = expr.expr_type.as_ref().ok_or_else(|| {
-                            proto_error(format!(
-                                "physical_plan::from_proto() Unexpected expr {self:?}"
-                            ))
-                        })?;
-                        if let ExprType::Sort(sort_expr) = expr {
-                            let expr = sort_expr
-                                .expr
-                                .as_ref()
-                                .ok_or_else(|| {
-                                    proto_error(format!(
-                                        "physical_plan::from_proto() Unexpected sort expr {self:?}"
-                                    ))
-                                })?
-                                .as_ref();
-                            Ok(PhysicalSortExpr {
-                                expr: parse_physical_expr(expr, registry, input.schema().as_ref(), extension_codec)?,
-                                options: SortOptions {
-                                    descending: !sort_expr.asc,
-                                    nulls_first: sort_expr.nulls_first,
-                                },
-                            })
-                        } else {
-                            internal_err!(
-                                "physical_plan::from_proto() {self:?}"
-                            )
-                        }
+            .expr
+            .iter()
+            .map(|expr| {
+                let expr = expr.expr_type.as_ref().ok_or_else(|| {
+                    proto_error(format!(
+                        "physical_plan::from_proto() Unexpected expr {self:?}"
+                    ))
+                })?;
+                if let ExprType::Sort(sort_expr) = expr {
+                    let expr = sort_expr
+                        .expr
+                        .as_ref()
+                        .ok_or_else(|| {
+                            proto_error(format!(
+                                "physical_plan::from_proto() Unexpected sort expr {self:?}"
+                            ))
+                        })?
+                        .as_ref();
+                    Ok(PhysicalSortExpr {
+                        expr: proto_converter.proto_to_physical_expr(expr, ctx, input.schema().as_ref(), codec)?,
+                        options: SortOptions {
+                            descending: !sort_expr.asc,
+                            nulls_first: sort_expr.nulls_first,
+                        },
                     })
-                    .collect::<Result<LexOrdering, _>>()?;
-        let fetch = if sort.fetch < 0 {
-            None
-        } else {
-            Some(sort.fetch as usize)
+                } else {
+                    internal_err!(
+                        "physical_plan::from_proto() {self:?}"
+                    )
+                }
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let Some(ordering) = LexOrdering::new(exprs) else {
+            return internal_err!("SortExec requires an ordering");
         };
-        let new_sort = SortExec::new(exprs, input)
+        let fetch = (sort.fetch >= 0).then_some(sort.fetch as _);
+        let new_sort = SortExec::new(ordering, input)
             .with_fetch(fetch)
             .with_preserve_partitioning(sort.preserve_partitioning);
 
@@ -1470,12 +1676,12 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_sort_preserving_merge_physical_plan(
         &self,
         sort: &protobuf::SortPreservingMergeExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&sort.input, registry, runtime, extension_codec)?;
+        let input = into_physical_plan(&sort.input, ctx, codec, proto_converter)?;
         let exprs = sort
             .expr
             .iter()
@@ -1496,11 +1702,11 @@ impl protobuf::PhysicalPlanNode {
                         })?
                         .as_ref();
                     Ok(PhysicalSortExpr {
-                        expr: parse_physical_expr(
+                        expr: proto_converter.proto_to_physical_expr(
                             expr,
-                            registry,
+                            ctx,
                             input.schema().as_ref(),
-                            extension_codec,
+                            codec,
                         )?,
                         options: SortOptions {
                             descending: !sort_expr.asc,
@@ -1511,32 +1717,31 @@ impl protobuf::PhysicalPlanNode {
                     internal_err!("physical_plan::from_proto() {self:?}")
                 }
             })
-            .collect::<Result<LexOrdering, _>>()?;
-        let fetch = if sort.fetch < 0 {
-            None
-        } else {
-            Some(sort.fetch as usize)
+            .collect::<Result<Vec<_>>>()?;
+        let Some(ordering) = LexOrdering::new(exprs) else {
+            return internal_err!("SortExec requires an ordering");
         };
+        let fetch = (sort.fetch >= 0).then_some(sort.fetch as _);
         Ok(Arc::new(
-            SortPreservingMergeExec::new(exprs, input).with_fetch(fetch),
+            SortPreservingMergeExec::new(ordering, input).with_fetch(fetch),
         ))
     }
 
     fn try_into_extension_physical_plan(
         &self,
         extension: &protobuf::PhysicalExtensionNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let inputs: Vec<Arc<dyn ExecutionPlan>> = extension
             .inputs
             .iter()
-            .map(|i| i.try_into_physical_plan(registry, runtime, extension_codec))
+            .map(|i| proto_converter.proto_to_execution_plan(ctx, codec, i))
             .collect::<Result<_>>()?;
 
-        let extension_node =
-            extension_codec.try_decode(extension.node.as_slice(), &inputs, registry)?;
+        let extension_node = codec.try_decode(extension.node.as_slice(), &inputs, ctx)?;
 
         Ok(extension_node)
     }
@@ -1544,14 +1749,15 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_nested_loop_join_physical_plan(
         &self,
         join: &protobuf::NestedLoopJoinExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let left: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&join.left, registry, runtime, extension_codec)?;
+            into_physical_plan(&join.left, ctx, codec, proto_converter)?;
         let right: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&join.right, registry, runtime, extension_codec)?;
+            into_physical_plan(&join.right, ctx, codec, proto_converter)?;
         let join_type = protobuf::JoinType::try_from(join.join_type).map_err(|_| {
             proto_error(format!(
                 "Received a NestedLoopJoinExecNode message with unknown JoinType {}",
@@ -1568,12 +1774,12 @@ impl protobuf::PhysicalPlanNode {
                             .ok_or_else(|| proto_error("Missing JoinFilter schema"))?
                             .try_into()?;
 
-                        let expression = parse_physical_expr(
+                        let expression = proto_converter.proto_to_physical_expr(
                             f.expression.as_ref().ok_or_else(|| {
                                 proto_error("Unexpected empty filter expression")
                             })?,
-                            registry, &schema,
-                            extension_codec,
+                            ctx, &schema,
+                            codec,
                         )?;
                         let column_indices = f.column_indices
                             .iter()
@@ -1618,15 +1824,17 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_analyze_physical_plan(
         &self,
         analyze: &protobuf::AnalyzeExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let input: Arc<dyn ExecutionPlan> =
-            into_physical_plan(&analyze.input, registry, runtime, extension_codec)?;
+            into_physical_plan(&analyze.input, ctx, codec, proto_converter)?;
         Ok(Arc::new(AnalyzeExec::new(
             analyze.verbose,
             analyze.show_statistics,
+            vec![MetricType::SUMMARY, MetricType::DEV],
             input,
             Arc::new(convert_required!(analyze.schema)?),
         )))
@@ -1635,11 +1843,12 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_json_sink_physical_plan(
         &self,
         sink: &protobuf::JsonSinkExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input = into_physical_plan(&sink.input, registry, runtime, extension_codec)?;
+        let input = into_physical_plan(&sink.input, ctx, codec, proto_converter)?;
 
         let data_sink: JsonSink = sink
             .sink
@@ -1653,13 +1862,17 @@ impl protobuf::PhysicalPlanNode {
             .map(|collection| {
                 parse_physical_sort_exprs(
                     &collection.physical_sort_expr_nodes,
-                    registry,
+                    ctx,
                     &sink_schema,
-                    extension_codec,
+                    codec,
+                    proto_converter,
                 )
-                .map(LexRequirement::from)
+                .map(|sort_exprs| {
+                    LexRequirement::new(sort_exprs.into_iter().map(Into::into))
+                })
             })
-            .transpose()?;
+            .transpose()?
+            .flatten();
         Ok(Arc::new(DataSinkExec::new(
             input,
             Arc::new(data_sink),
@@ -1670,11 +1883,12 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_csv_sink_physical_plan(
         &self,
         sink: &protobuf::CsvSinkExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input = into_physical_plan(&sink.input, registry, runtime, extension_codec)?;
+        let input = into_physical_plan(&sink.input, ctx, codec, proto_converter)?;
 
         let data_sink: CsvSink = sink
             .sink
@@ -1688,13 +1902,17 @@ impl protobuf::PhysicalPlanNode {
             .map(|collection| {
                 parse_physical_sort_exprs(
                     &collection.physical_sort_expr_nodes,
-                    registry,
+                    ctx,
                     &sink_schema,
-                    extension_codec,
+                    codec,
+                    proto_converter,
                 )
-                .map(LexRequirement::from)
+                .map(|sort_exprs| {
+                    LexRequirement::new(sort_exprs.into_iter().map(Into::into))
+                })
             })
-            .transpose()?;
+            .transpose()?
+            .flatten();
         Ok(Arc::new(DataSinkExec::new(
             input,
             Arc::new(data_sink),
@@ -1702,17 +1920,18 @@ impl protobuf::PhysicalPlanNode {
         )))
     }
 
+    #[cfg_attr(not(feature = "parquet"), expect(unused_variables))]
     fn try_into_parquet_sink_physical_plan(
         &self,
         sink: &protobuf::ParquetSinkExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         #[cfg(feature = "parquet")]
         {
-            let input =
-                into_physical_plan(&sink.input, registry, runtime, extension_codec)?;
+            let input = into_physical_plan(&sink.input, ctx, codec, proto_converter)?;
 
             let data_sink: ParquetSink = sink
                 .sink
@@ -1726,13 +1945,17 @@ impl protobuf::PhysicalPlanNode {
                 .map(|collection| {
                     parse_physical_sort_exprs(
                         &collection.physical_sort_expr_nodes,
-                        registry,
+                        ctx,
                         &sink_schema,
-                        extension_codec,
+                        codec,
+                        proto_converter,
                     )
-                    .map(LexRequirement::from)
+                    .map(|sort_exprs| {
+                        LexRequirement::new(sort_exprs.into_iter().map(Into::into))
+                    })
                 })
-                .transpose()?;
+                .transpose()?
+                .flatten();
             Ok(Arc::new(DataSinkExec::new(
                 input,
                 Arc::new(data_sink),
@@ -1746,12 +1969,12 @@ impl protobuf::PhysicalPlanNode {
     fn try_into_unnest_physical_plan(
         &self,
         unnest: &protobuf::UnnestExecNode,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        let input =
-            into_physical_plan(&unnest.input, registry, runtime, extension_codec)?;
+        let input = into_physical_plan(&unnest.input, ctx, codec, proto_converter)?;
 
         Ok(Arc::new(UnnestExec::new(
             input,
@@ -1766,12 +1989,260 @@ impl protobuf::PhysicalPlanNode {
             unnest.struct_type_columns.iter().map(|c| *c as _).collect(),
             Arc::new(convert_required!(unnest.schema)?),
             into_required!(unnest.options)?,
-        )))
+        )?))
+    }
+
+    fn generate_series_name_to_str(name: protobuf::GenerateSeriesName) -> &'static str {
+        match name {
+            protobuf::GenerateSeriesName::GsGenerateSeries => "generate_series",
+            protobuf::GenerateSeriesName::GsRange => "range",
+        }
+    }
+    fn try_into_sort_join(
+        &self,
+        sort_join: &SortMergeJoinExecNode,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let left = into_physical_plan(&sort_join.left, ctx, codec, proto_converter)?;
+        let left_schema = left.schema();
+        let right = into_physical_plan(&sort_join.right, ctx, codec, proto_converter)?;
+        let right_schema = right.schema();
+
+        let filter = sort_join
+            .filter
+            .as_ref()
+            .map(|f| {
+                let schema = f
+                    .schema
+                    .as_ref()
+                    .ok_or_else(|| proto_error("Missing JoinFilter schema"))?
+                    .try_into()?;
+
+                let expression = proto_converter.proto_to_physical_expr(
+                    f.expression.as_ref().ok_or_else(|| {
+                        proto_error("Unexpected empty filter expression")
+                    })?,
+                    ctx,
+                    &schema,
+                    codec,
+                )?;
+                let column_indices = f
+                    .column_indices
+                    .iter()
+                    .map(|i| {
+                        let side =
+                            protobuf::JoinSide::try_from(i.side).map_err(|_| {
+                                proto_error(format!(
+                                    "Received a SortMergeJoinExecNode message with JoinSide in Filter {}",
+                                    i.side
+                                ))
+                            })?;
+
+                        Ok(ColumnIndex {
+                            index: i.index as usize,
+                            side: side.into(),
+                        })
+                    })
+                    .collect::<Result<Vec<_>>>()?;
+
+                Ok(JoinFilter::new(
+                    expression,
+                    column_indices,
+                    Arc::new(schema),
+                ))
+            })
+            .map_or(Ok(None), |v: Result<JoinFilter>| v.map(Some))?;
+
+        let join_type =
+            protobuf::JoinType::try_from(sort_join.join_type).map_err(|_| {
+                proto_error(format!(
+                    "Received a SortMergeJoinExecNode message with unknown JoinType {}",
+                    sort_join.join_type
+                ))
+            })?;
+
+        let null_equality = protobuf::NullEquality::try_from(sort_join.null_equality)
+            .map_err(|_| {
+                proto_error(format!(
+                    "Received a SortMergeJoinExecNode message with unknown NullEquality {}",
+                    sort_join.null_equality
+                ))
+            })?;
+
+        let sort_options = sort_join
+            .sort_options
+            .iter()
+            .map(|e| SortOptions {
+                descending: !e.asc,
+                nulls_first: e.nulls_first,
+            })
+            .collect();
+        let on = sort_join
+            .on
+            .iter()
+            .map(|col| {
+                let left = proto_converter.proto_to_physical_expr(
+                    &col.left.clone().unwrap(),
+                    ctx,
+                    left_schema.as_ref(),
+                    codec,
+                )?;
+                let right = proto_converter.proto_to_physical_expr(
+                    &col.right.clone().unwrap(),
+                    ctx,
+                    right_schema.as_ref(),
+                    codec,
+                )?;
+                Ok((left, right))
+            })
+            .collect::<Result<_>>()?;
+
+        Ok(Arc::new(SortMergeJoinExec::try_new(
+            left,
+            right,
+            on,
+            filter,
+            join_type.into(),
+            sort_options,
+            null_equality.into(),
+        )?))
+    }
+
+    fn try_into_generate_series_physical_plan(
+        &self,
+        generate_series: &protobuf::GenerateSeriesNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let schema: SchemaRef = Arc::new(convert_required!(generate_series.schema)?);
+
+        let args = match &generate_series.args {
+            Some(protobuf::generate_series_node::Args::ContainsNull(args)) => {
+                GenSeriesArgs::ContainsNull {
+                    name: Self::generate_series_name_to_str(args.name()),
+                }
+            }
+            Some(protobuf::generate_series_node::Args::Int64Args(args)) => {
+                GenSeriesArgs::Int64Args {
+                    start: args.start,
+                    end: args.end,
+                    step: args.step,
+                    include_end: args.include_end,
+                    name: Self::generate_series_name_to_str(args.name()),
+                }
+            }
+            Some(protobuf::generate_series_node::Args::TimestampArgs(args)) => {
+                let step_proto = args.step.as_ref().ok_or_else(|| {
+                    internal_datafusion_err!("Missing step in TimestampArgs")
+                })?;
+                let step = IntervalMonthDayNanoType::make_value(
+                    step_proto.months,
+                    step_proto.days,
+                    step_proto.nanos,
+                );
+                GenSeriesArgs::TimestampArgs {
+                    start: args.start,
+                    end: args.end,
+                    step,
+                    tz: args.tz.as_ref().map(|s| Arc::from(s.as_str())),
+                    include_end: args.include_end,
+                    name: Self::generate_series_name_to_str(args.name()),
+                }
+            }
+            Some(protobuf::generate_series_node::Args::DateArgs(args)) => {
+                let step_proto = args.step.as_ref().ok_or_else(|| {
+                    internal_datafusion_err!("Missing step in DateArgs")
+                })?;
+                let step = IntervalMonthDayNanoType::make_value(
+                    step_proto.months,
+                    step_proto.days,
+                    step_proto.nanos,
+                );
+                GenSeriesArgs::DateArgs {
+                    start: args.start,
+                    end: args.end,
+                    step,
+                    include_end: args.include_end,
+                    name: Self::generate_series_name_to_str(args.name()),
+                }
+            }
+            None => return internal_err!("Missing args in GenerateSeriesNode"),
+        };
+
+        let table = GenerateSeriesTable::new(Arc::clone(&schema), args);
+        let generator = table.as_generator(generate_series.target_batch_size as usize)?;
+
+        Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?))
+    }
+
+    fn try_into_cooperative_physical_plan(
+        &self,
+        field_stream: &protobuf::CooperativeExecNode,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let input = into_physical_plan(&field_stream.input, ctx, codec, proto_converter)?;
+        Ok(Arc::new(CooperativeExec::new(input)))
+    }
+
+    fn try_into_async_func_physical_plan(
+        &self,
+        async_func: &protobuf::AsyncFuncExecNode,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let input: Arc<dyn ExecutionPlan> =
+            into_physical_plan(&async_func.input, ctx, codec, proto_converter)?;
+
+        if async_func.async_exprs.len() != async_func.async_expr_names.len() {
+            return internal_err!(
+                "AsyncFuncExecNode async_exprs length does not match async_expr_names"
+            );
+        }
+
+        let async_exprs = async_func
+            .async_exprs
+            .iter()
+            .zip(async_func.async_expr_names.iter())
+            .map(|(expr, name)| {
+                let physical_expr = proto_converter.proto_to_physical_expr(
+                    expr,
+                    ctx,
+                    input.schema().as_ref(),
+                    codec,
+                )?;
+
+                Ok(Arc::new(AsyncFuncExpr::try_new(
+                    name.clone(),
+                    physical_expr,
+                    input.schema().as_ref(),
+                )?))
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        Ok(Arc::new(AsyncFuncExec::try_new(async_exprs, input)?))
+    }
+
+    fn try_into_buffer_physical_plan(
+        &self,
+        buffer: &protobuf::BufferExecNode,
+        ctx: &TaskContext,
+        extension_codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let input: Arc<dyn ExecutionPlan> =
+            into_physical_plan(&buffer.input, ctx, extension_codec, proto_converter)?;
+
+        Ok(Arc::new(BufferExec::new(input, buffer.capacity as usize)))
     }
 
     fn try_from_explain_exec(
         exec: &ExplainExec,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Self> {
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Explain(
@@ -1790,18 +2261,26 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_projection_exec(
         exec: &ProjectionExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         let expr = exec
             .expr()
             .iter()
-            .map(|expr| serialize_physical_expr(&expr.0, extension_codec))
+            .map(|proj_expr| {
+                proto_converter.physical_expr_to_proto(&proj_expr.expr, codec)
+            })
             .collect::<Result<Vec<_>>>()?;
-        let expr_name = exec.expr().iter().map(|expr| expr.1.clone()).collect();
+        let expr_name = exec
+            .expr()
+            .iter()
+            .map(|proj_expr| proj_expr.alias.clone())
+            .collect();
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Projection(Box::new(
                 protobuf::ProjectionExecNode {
@@ -1815,11 +2294,13 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_analyze_exec(
         exec: &AnalyzeExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Analyze(Box::new(
@@ -1835,24 +2316,28 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_filter_exec(
         exec: &FilterExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Filter(Box::new(
                 protobuf::FilterExecNode {
                     input: Some(Box::new(input)),
-                    expr: Some(serialize_physical_expr(
-                        exec.predicate(),
-                        extension_codec,
-                    )?),
+                    expr: Some(
+                        proto_converter
+                            .physical_expr_to_proto(exec.predicate(), codec)?,
+                    ),
                     default_filter_selectivity: exec.default_selectivity() as u32,
                     projection: exec.projection().as_ref().map_or_else(Vec::new, |v| {
                         v.iter().map(|x| *x as u32).collect::<Vec<u32>>()
                     }),
+                    batch_size: exec.batch_size() as u32,
+                    fetch: exec.fetch().map(|f| f as u32),
                 },
             ))),
         })
@@ -1860,11 +2345,13 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_global_limit_exec(
         limit: &GlobalLimitExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             limit.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         Ok(protobuf::PhysicalPlanNode {
@@ -1883,11 +2370,13 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_local_limit_exec(
         limit: &LocalLimitExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             limit.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::LocalLimit(Box::new(
@@ -1901,22 +2390,25 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_hash_join_exec(
         exec: &HashJoinExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let left = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let left = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.left().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let right = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let right = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.right().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         let on: Vec<protobuf::JoinOn> = exec
             .on()
             .iter()
             .map(|tuple| {
-                let l = serialize_physical_expr(&tuple.0, extension_codec)?;
-                let r = serialize_physical_expr(&tuple.1, extension_codec)?;
+                let l = proto_converter.physical_expr_to_proto(&tuple.0, codec)?;
+                let r = proto_converter.physical_expr_to_proto(&tuple.1, codec)?;
                 Ok::<_, DataFusionError>(protobuf::JoinOn {
                     left: Some(l),
                     right: Some(r),
@@ -1924,12 +2416,13 @@ impl protobuf::PhysicalPlanNode {
             })
             .collect::<Result<_>>()?;
         let join_type: protobuf::JoinType = exec.join_type().to_owned().into();
+        let null_equality: protobuf::NullEquality = exec.null_equality().into();
         let filter = exec
             .filter()
             .as_ref()
             .map(|f| {
                 let expression =
-                    serialize_physical_expr(f.expression(), extension_codec)?;
+                    proto_converter.physical_expr_to_proto(f.expression(), codec)?;
                 let column_indices = f
                     .column_indices()
                     .iter()
@@ -1964,11 +2457,12 @@ impl protobuf::PhysicalPlanNode {
                     on,
                     join_type: join_type.into(),
                     partition_mode: partition_mode.into(),
-                    null_equals_null: exec.null_equals_null(),
+                    null_equality: null_equality.into(),
                     filter,
                     projection: exec.projection.as_ref().map_or_else(Vec::new, |v| {
                         v.iter().map(|x| *x as u32).collect::<Vec<u32>>()
                     }),
+                    null_aware: exec.null_aware,
                 },
             ))),
         })
@@ -1976,22 +2470,25 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_symmetric_hash_join_exec(
         exec: &SymmetricHashJoinExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let left = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let left = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.left().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let right = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let right = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.right().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         let on = exec
             .on()
             .iter()
             .map(|tuple| {
-                let l = serialize_physical_expr(&tuple.0, extension_codec)?;
-                let r = serialize_physical_expr(&tuple.1, extension_codec)?;
+                let l = proto_converter.physical_expr_to_proto(&tuple.0, codec)?;
+                let r = proto_converter.physical_expr_to_proto(&tuple.1, codec)?;
                 Ok::<_, DataFusionError>(protobuf::JoinOn {
                     left: Some(l),
                     right: Some(r),
@@ -1999,12 +2496,13 @@ impl protobuf::PhysicalPlanNode {
             })
             .collect::<Result<_>>()?;
         let join_type: protobuf::JoinType = exec.join_type().to_owned().into();
+        let null_equality: protobuf::NullEquality = exec.null_equality().into();
         let filter = exec
             .filter()
             .as_ref()
             .map(|f| {
                 let expression =
-                    serialize_physical_expr(f.expression(), extension_codec)?;
+                    proto_converter.physical_expr_to_proto(f.expression(), codec)?;
                 let column_indices = f
                     .column_indices()
                     .iter()
@@ -2041,10 +2539,10 @@ impl protobuf::PhysicalPlanNode {
                     .iter()
                     .map(|expr| {
                         Ok(protobuf::PhysicalSortExprNode {
-                            expr: Some(Box::new(serialize_physical_expr(
-                                &expr.expr,
-                                extension_codec,
-                            )?)),
+                            expr: Some(Box::new(
+                                proto_converter
+                                    .physical_expr_to_proto(&expr.expr, codec)?,
+                            )),
                             asc: !expr.options.descending,
                             nulls_first: expr.options.nulls_first,
                         })
@@ -2061,10 +2559,10 @@ impl protobuf::PhysicalPlanNode {
                     .iter()
                     .map(|expr| {
                         Ok(protobuf::PhysicalSortExprNode {
-                            expr: Some(Box::new(serialize_physical_expr(
-                                &expr.expr,
-                                extension_codec,
-                            )?)),
+                            expr: Some(Box::new(
+                                proto_converter
+                                    .physical_expr_to_proto(&expr.expr, codec)?,
+                            )),
                             asc: !expr.options.descending,
                             nulls_first: expr.options.nulls_first,
                         })
@@ -2082,7 +2580,7 @@ impl protobuf::PhysicalPlanNode {
                     on,
                     join_type: join_type.into(),
                     partition_mode: partition_mode.into(),
-                    null_equals_null: exec.null_equals_null(),
+                    null_equality: null_equality.into(),
                     left_sort_exprs,
                     right_sort_exprs,
                     filter,
@@ -2091,32 +2589,123 @@ impl protobuf::PhysicalPlanNode {
         })
     }
 
-    fn try_from_cross_join_exec(
-        exec: &CrossJoinExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+    fn try_from_sort_merge_join_exec(
+        exec: &SortMergeJoinExec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let left = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let left = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.left().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let right = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let right = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.right().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
+        let on = exec
+            .on()
+            .iter()
+            .map(|tuple| {
+                let l = proto_converter.physical_expr_to_proto(&tuple.0, codec)?;
+                let r = proto_converter.physical_expr_to_proto(&tuple.1, codec)?;
+                Ok::<_, DataFusionError>(protobuf::JoinOn {
+                    left: Some(l),
+                    right: Some(r),
+                })
+            })
+            .collect::<Result<_>>()?;
+        let join_type: protobuf::JoinType = exec.join_type().to_owned().into();
+        let null_equality: protobuf::NullEquality = exec.null_equality().into();
+        let filter = exec
+            .filter()
+            .as_ref()
+            .map(|f| {
+                let expression =
+                    proto_converter.physical_expr_to_proto(f.expression(), codec)?;
+                let column_indices = f
+                    .column_indices()
+                    .iter()
+                    .map(|i| {
+                        let side: protobuf::JoinSide = i.side.to_owned().into();
+                        protobuf::ColumnIndex {
+                            index: i.index as u32,
+                            side: side.into(),
+                        }
+                    })
+                    .collect();
+                let schema = f.schema().as_ref().try_into()?;
+                Ok(protobuf::JoinFilter {
+                    expression: Some(expression),
+                    column_indices,
+                    schema: Some(schema),
+                })
+            })
+            .map_or(Ok(None), |v: Result<protobuf::JoinFilter>| v.map(Some))?;
+
+        let sort_options = exec
+            .sort_options()
+            .iter()
+            .map(
+                |SortOptions {
+                     descending,
+                     nulls_first,
+                 }| {
+                    SortExprNode {
+                        expr: None,
+                        asc: !*descending,
+                        nulls_first: *nulls_first,
+                    }
+                },
+            )
+            .collect();
+
         Ok(protobuf::PhysicalPlanNode {
-            physical_plan_type: Some(PhysicalPlanType::CrossJoin(Box::new(
-                protobuf::CrossJoinExecNode {
+            physical_plan_type: Some(PhysicalPlanType::SortMergeJoin(Box::new(
+                SortMergeJoinExecNode {
                     left: Some(Box::new(left)),
                     right: Some(Box::new(right)),
+                    on,
+                    join_type: join_type.into(),
+                    null_equality: null_equality.into(),
+                    filter,
+                    sort_options,
                 },
             ))),
         })
     }
 
-    fn try_from_aggregate_exec(
-        exec: &AggregateExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
-    ) -> Result<Self> {
+    fn try_from_cross_join_exec(
+        exec: &CrossJoinExec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Self> {
+        let left = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            exec.left().to_owned(),
+            codec,
+            proto_converter,
+        )?;
+        let right = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            exec.right().to_owned(),
+            codec,
+            proto_converter,
+        )?;
+        Ok(protobuf::PhysicalPlanNode {
+            physical_plan_type: Some(PhysicalPlanType::CrossJoin(Box::new(
+                protobuf::CrossJoinExecNode {
+                    left: Some(Box::new(left)),
+                    right: Some(Box::new(right)),
+                },
+            ))),
+        })
+    }
+
+    fn try_from_aggregate_exec(
+        exec: &AggregateExec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Self> {
         let groups: Vec<bool> = exec
             .group_expr()
             .groups()
@@ -2135,13 +2724,15 @@ impl protobuf::PhysicalPlanNode {
         let filter = exec
             .filter_expr()
             .iter()
-            .map(|expr| serialize_maybe_filter(expr.to_owned(), extension_codec))
+            .map(|expr| serialize_maybe_filter(expr.to_owned(), codec, proto_converter))
             .collect::<Result<Vec<_>>>()?;
 
         let agg = exec
             .aggr_expr()
             .iter()
-            .map(|expr| serialize_physical_aggr_expr(expr.to_owned(), extension_codec))
+            .map(|expr| {
+                serialize_physical_aggr_expr(expr.to_owned(), codec, proto_converter)
+            })
             .collect::<Result<Vec<_>>>()?;
 
         let agg_names = exec
@@ -2158,29 +2749,32 @@ impl protobuf::PhysicalPlanNode {
             AggregateMode::SinglePartitioned => {
                 protobuf::AggregateMode::SinglePartitioned
             }
+            AggregateMode::PartialReduce => protobuf::AggregateMode::PartialReduce,
         };
         let input_schema = exec.input_schema();
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         let null_expr = exec
             .group_expr()
             .null_expr()
             .iter()
-            .map(|expr| serialize_physical_expr(&expr.0, extension_codec))
+            .map(|expr| proto_converter.physical_expr_to_proto(&expr.0, codec))
             .collect::<Result<Vec<_>>>()?;
 
         let group_expr = exec
             .group_expr()
             .expr()
             .iter()
-            .map(|expr| serialize_physical_expr(&expr.0, extension_codec))
+            .map(|expr| proto_converter.physical_expr_to_proto(&expr.0, codec))
             .collect::<Result<Vec<_>>>()?;
 
-        let limit = exec.limit().map(|value| protobuf::AggLimit {
-            limit: value as u64,
+        let limit = exec.limit_options().map(|config| protobuf::AggLimit {
+            limit: config.limit() as u64,
+            descending: config.descending(),
         });
 
         Ok(protobuf::PhysicalPlanNode {
@@ -2197,6 +2791,7 @@ impl protobuf::PhysicalPlanNode {
                     null_expr,
                     groups,
                     limit,
+                    has_grouping_set: exec.group_expr().has_grouping_set(),
                 },
             ))),
         })
@@ -2204,7 +2799,7 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_empty_exec(
         empty: &EmptyExec,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Self> {
         let schema = empty.schema().as_ref().try_into()?;
         Ok(protobuf::PhysicalPlanNode {
@@ -2216,7 +2811,7 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_placeholder_row_exec(
         empty: &PlaceholderRowExec,
-        _extension_codec: &dyn PhysicalExtensionCodec,
+        _codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Self> {
         let schema = empty.schema().as_ref().try_into()?;
         Ok(protobuf::PhysicalPlanNode {
@@ -2228,13 +2823,16 @@ impl protobuf::PhysicalPlanNode {
         })
     }
 
+    #[expect(deprecated)]
     fn try_from_coalesce_batches_exec(
         coalesce_batches: &CoalesceBatchesExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             coalesce_batches.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::CoalesceBatches(Box::new(
@@ -2249,7 +2847,8 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_data_source_exec(
         data_source_exec: &DataSourceExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Option<Self>> {
         let data_source = data_source_exec.data_source();
         if let Some(maybe_csv) = data_source.as_any().downcast_ref::<FileScanConfig>() {
@@ -2260,7 +2859,8 @@ impl protobuf::PhysicalPlanNode {
                         protobuf::CsvScanExecNode {
                             base_conf: Some(serialize_file_scan_config(
                                 maybe_csv,
-                                extension_codec,
+                                codec,
+                                proto_converter,
                             )?),
                             has_header: csv_config.has_header(),
                             delimiter: byte_to_string(
@@ -2285,7 +2885,8 @@ impl protobuf::PhysicalPlanNode {
                             } else {
                                 None
                             },
-                            newlines_in_values: maybe_csv.newlines_in_values(),
+                            newlines_in_values: csv_config.newlines_in_values(),
+                            truncate_rows: csv_config.truncate_rows(),
                         },
                     )),
                 }));
@@ -2300,7 +2901,25 @@ impl protobuf::PhysicalPlanNode {
                         protobuf::JsonScanExecNode {
                             base_conf: Some(serialize_file_scan_config(
                                 scan_conf,
-                                extension_codec,
+                                codec,
+                                proto_converter,
+                            )?),
+                        },
+                    )),
+                }));
+            }
+        }
+
+        if let Some(scan_conf) = data_source.as_any().downcast_ref::<FileScanConfig>() {
+            let source = scan_conf.file_source();
+            if let Some(_arrow_source) = source.as_any().downcast_ref::<ArrowSource>() {
+                return Ok(Some(protobuf::PhysicalPlanNode {
+                    physical_plan_type: Some(PhysicalPlanType::ArrowScan(
+                        protobuf::ArrowScanExecNode {
+                            base_conf: Some(serialize_file_scan_config(
+                                scan_conf,
+                                codec,
+                                proto_converter,
                             )?),
                         },
                     )),
@@ -2313,15 +2932,16 @@ impl protobuf::PhysicalPlanNode {
             data_source_exec.downcast_to_file_source::<ParquetSource>()
         {
             let predicate = conf
-                .predicate()
-                .map(|pred| serialize_physical_expr(pred, extension_codec))
+                .filter()
+                .map(|pred| proto_converter.physical_expr_to_proto(&pred, codec))
                 .transpose()?;
             return Ok(Some(protobuf::PhysicalPlanNode {
                 physical_plan_type: Some(PhysicalPlanType::ParquetScan(
                     protobuf::ParquetScanExecNode {
                         base_conf: Some(serialize_file_scan_config(
                             maybe_parquet,
-                            extension_codec,
+                            codec,
+                            proto_converter,
                         )?),
                         predicate,
                         parquet_options: Some(conf.table_parquet_options().try_into()?),
@@ -2339,7 +2959,8 @@ impl protobuf::PhysicalPlanNode {
                         protobuf::AvroScanExecNode {
                             base_conf: Some(serialize_file_scan_config(
                                 maybe_avro,
-                                extension_codec,
+                                codec,
+                                proto_converter,
                             )?),
                         },
                     )),
@@ -2347,16 +2968,66 @@ impl protobuf::PhysicalPlanNode {
             }
         }
 
+        if let Some(source_conf) =
+            data_source.as_any().downcast_ref::<MemorySourceConfig>()
+        {
+            let proto_partitions = source_conf
+                .partitions()
+                .iter()
+                .map(|p| serialize_record_batches(p))
+                .collect::<Result<Vec<_>>>()?;
+
+            let proto_schema: protobuf::Schema =
+                source_conf.original_schema().as_ref().try_into()?;
+
+            let proto_projection = source_conf
+                .projection()
+                .as_ref()
+                .map_or_else(Vec::new, |v| {
+                    v.iter().map(|x| *x as u32).collect::<Vec<u32>>()
+                });
+
+            let proto_sort_information = source_conf
+                .sort_information()
+                .iter()
+                .map(|ordering| {
+                    let sort_exprs = serialize_physical_sort_exprs(
+                        ordering.to_owned(),
+                        codec,
+                        proto_converter,
+                    )?;
+                    Ok::<_, DataFusionError>(protobuf::PhysicalSortExprNodeCollection {
+                        physical_sort_expr_nodes: sort_exprs,
+                    })
+                })
+                .collect::<Result<Vec<_>, _>>()?;
+
+            return Ok(Some(protobuf::PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::MemoryScan(
+                    protobuf::MemoryScanExecNode {
+                        partitions: proto_partitions,
+                        schema: Some(proto_schema),
+                        projection: proto_projection,
+                        sort_information: proto_sort_information,
+                        show_sizes: source_conf.show_sizes(),
+                        fetch: source_conf.fetch().map(|f| f as u32),
+                    },
+                )),
+            }));
+        }
+
         Ok(None)
     }
 
     fn try_from_coalesce_partitions_exec(
         exec: &CoalescePartitionsExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Merge(Box::new(
@@ -2370,21 +3041,24 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_repartition_exec(
         exec: &RepartitionExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         let pb_partitioning =
-            serialize_partitioning(exec.partitioning(), extension_codec)?;
+            serialize_partitioning(exec.partitioning(), codec, proto_converter)?;
 
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Repartition(Box::new(
                 protobuf::RepartitionExecNode {
                     input: Some(Box::new(input)),
                     partitioning: Some(pb_partitioning),
+                    preserve_order: exec.preserve_order(),
                 },
             ))),
         })
@@ -2392,25 +3066,23 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_sort_exec(
         exec: &SortExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
-            exec.input().to_owned(),
-            extension_codec,
-        )?;
+        let input = proto_converter.execution_plan_to_proto(exec.input(), codec)?;
         let expr = exec
             .expr()
             .iter()
             .map(|expr| {
                 let sort_expr = Box::new(protobuf::PhysicalSortExprNode {
-                    expr: Some(Box::new(serialize_physical_expr(
-                        &expr.expr,
-                        extension_codec,
-                    )?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(&expr.expr, codec)?,
+                    )),
                     asc: !expr.options.descending,
                     nulls_first: expr.options.nulls_first,
                 });
                 Ok(protobuf::PhysicalExprNode {
+                    expr_id: None,
                     expr_type: Some(ExprType::Sort(sort_expr)),
                 })
             })
@@ -2432,14 +3104,18 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_union_exec(
         union: &UnionExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
         let mut inputs: Vec<protobuf::PhysicalPlanNode> = vec![];
         for input in union.inputs() {
-            inputs.push(protobuf::PhysicalPlanNode::try_from_physical_plan(
-                input.to_owned(),
-                extension_codec,
-            )?);
+            inputs.push(
+                protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+                    input.to_owned(),
+                    codec,
+                    proto_converter,
+                )?,
+            );
         }
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Union(protobuf::UnionExecNode {
@@ -2450,14 +3126,18 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_interleave_exec(
         interleave: &InterleaveExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
         let mut inputs: Vec<protobuf::PhysicalPlanNode> = vec![];
         for input in interleave.inputs() {
-            inputs.push(protobuf::PhysicalPlanNode::try_from_physical_plan(
-                input.to_owned(),
-                extension_codec,
-            )?);
+            inputs.push(
+                protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+                    input.to_owned(),
+                    codec,
+                    proto_converter,
+                )?,
+            );
         }
         Ok(protobuf::PhysicalPlanNode {
             physical_plan_type: Some(PhysicalPlanType::Interleave(
@@ -2468,25 +3148,27 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_sort_preserving_merge_exec(
         exec: &SortPreservingMergeExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
         let expr = exec
             .expr()
             .iter()
             .map(|expr| {
                 let sort_expr = Box::new(protobuf::PhysicalSortExprNode {
-                    expr: Some(Box::new(serialize_physical_expr(
-                        &expr.expr,
-                        extension_codec,
-                    )?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(&expr.expr, codec)?,
+                    )),
                     asc: !expr.options.descending,
                     nulls_first: expr.options.nulls_first,
                 });
                 Ok(protobuf::PhysicalExprNode {
+                    expr_id: None,
                     expr_type: Some(ExprType::Sort(sort_expr)),
                 })
             })
@@ -2504,15 +3186,18 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_nested_loop_join_exec(
         exec: &NestedLoopJoinExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let left = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let left = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.left().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
-        let right = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let right = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.right().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         let join_type: protobuf::JoinType = exec.join_type().to_owned().into();
@@ -2521,7 +3206,7 @@ impl protobuf::PhysicalPlanNode {
             .as_ref()
             .map(|f| {
                 let expression =
-                    serialize_physical_expr(f.expression(), extension_codec)?;
+                    proto_converter.physical_expr_to_proto(f.expression(), codec)?;
                 let column_indices = f
                     .column_indices()
                     .iter()
@@ -2549,7 +3234,7 @@ impl protobuf::PhysicalPlanNode {
                     right: Some(Box::new(right)),
                     join_type: join_type.into(),
                     filter,
-                    projection: exec.projection().map_or_else(Vec::new, |v| {
+                    projection: exec.projection().as_ref().map_or_else(Vec::new, |v| {
                         v.iter().map(|x| *x as u32).collect::<Vec<u32>>()
                     }),
                 },
@@ -2559,23 +3244,25 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_window_agg_exec(
         exec: &WindowAggExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         let window_expr = exec
             .window_expr()
             .iter()
-            .map(|e| serialize_physical_window_expr(e, extension_codec))
+            .map(|e| serialize_physical_window_expr(e, codec, proto_converter))
             .collect::<Result<Vec<protobuf::PhysicalWindowExprNode>>>()?;
 
         let partition_keys = exec
             .partition_keys()
             .iter()
-            .map(|e| serialize_physical_expr(e, extension_codec))
+            .map(|e| proto_converter.physical_expr_to_proto(e, codec))
             .collect::<Result<Vec<protobuf::PhysicalExprNode>>>()?;
 
         Ok(protobuf::PhysicalPlanNode {
@@ -2592,23 +3279,25 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_bounded_window_agg_exec(
         exec: &BoundedWindowAggExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         let window_expr = exec
             .window_expr()
             .iter()
-            .map(|e| serialize_physical_window_expr(e, extension_codec))
+            .map(|e| serialize_physical_window_expr(e, codec, proto_converter))
             .collect::<Result<Vec<protobuf::PhysicalWindowExprNode>>>()?;
 
         let partition_keys = exec
             .partition_keys()
             .iter()
-            .map(|e| serialize_physical_expr(e, extension_codec))
+            .map(|e| proto_converter.physical_expr_to_proto(e, codec))
             .collect::<Result<Vec<protobuf::PhysicalExprNode>>>()?;
 
         let input_order_mode = match &exec.input_order_mode {
@@ -2641,12 +3330,14 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_data_sink_exec(
         exec: &DataSinkExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Option<Self>> {
         let input: protobuf::PhysicalPlanNode =
-            protobuf::PhysicalPlanNode::try_from_physical_plan(
+            protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
                 exec.input().to_owned(),
-                extension_codec,
+                codec,
+                proto_converter,
             )?;
         let sort_order = match exec.sort_order() {
             Some(requirements) => {
@@ -2655,10 +3346,10 @@ impl protobuf::PhysicalPlanNode {
                     .map(|requirement| {
                         let expr: PhysicalSortExpr = requirement.to_owned().into();
                         let sort_expr = protobuf::PhysicalSortExprNode {
-                            expr: Some(Box::new(serialize_physical_expr(
-                                &expr.expr,
-                                extension_codec,
-                            )?)),
+                            expr: Some(Box::new(
+                                proto_converter
+                                    .physical_expr_to_proto(&expr.expr, codec)?,
+                            )),
                             asc: !expr.options.descending,
                             nulls_first: expr.options.nulls_first,
                         };
@@ -2718,11 +3409,13 @@ impl protobuf::PhysicalPlanNode {
 
     fn try_from_unnest_exec(
         exec: &UnnestExec,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
     ) -> Result<Self> {
-        let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
             exec.input().to_owned(),
-            extension_codec,
+            codec,
+            proto_converter,
         )?;
 
         Ok(protobuf::PhysicalPlanNode {
@@ -2748,6 +3441,193 @@ impl protobuf::PhysicalPlanNode {
             ))),
         })
     }
+
+    fn try_from_cooperative_exec(
+        exec: &CooperativeExec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Self> {
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            exec.input().to_owned(),
+            codec,
+            proto_converter,
+        )?;
+
+        Ok(protobuf::PhysicalPlanNode {
+            physical_plan_type: Some(PhysicalPlanType::Cooperative(Box::new(
+                protobuf::CooperativeExecNode {
+                    input: Some(Box::new(input)),
+                },
+            ))),
+        })
+    }
+
+    fn str_to_generate_series_name(name: &str) -> Result<protobuf::GenerateSeriesName> {
+        match name {
+            "generate_series" => Ok(protobuf::GenerateSeriesName::GsGenerateSeries),
+            "range" => Ok(protobuf::GenerateSeriesName::GsRange),
+            _ => internal_err!("unknown name: {name}"),
+        }
+    }
+
+    fn try_from_lazy_memory_exec(exec: &LazyMemoryExec) -> Result<Option<Self>> {
+        let generators = exec.generators();
+
+        // ensure we only have one generator
+        let [generator] = generators.as_slice() else {
+            return Ok(None);
+        };
+
+        let generator_guard = generator.read();
+
+        // Try to downcast to different generate_series types
+        if let Some(empty_gen) = generator_guard.as_any().downcast_ref::<Empty>() {
+            let schema = exec.schema();
+            let node = protobuf::GenerateSeriesNode {
+                schema: Some(schema.as_ref().try_into()?),
+                target_batch_size: 8192, // Default batch size
+                args: Some(protobuf::generate_series_node::Args::ContainsNull(
+                    protobuf::GenerateSeriesArgsContainsNull {
+                        name: Self::str_to_generate_series_name(empty_gen.name())? as i32,
+                    },
+                )),
+            };
+
+            return Ok(Some(protobuf::PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::GenerateSeries(node)),
+            }));
+        }
+
+        if let Some(int_64) = generator_guard
+            .as_any()
+            .downcast_ref::<GenericSeriesState<i64>>()
+        {
+            let schema = exec.schema();
+            let node = protobuf::GenerateSeriesNode {
+                schema: Some(schema.as_ref().try_into()?),
+                target_batch_size: int_64.batch_size() as u32,
+                args: Some(protobuf::generate_series_node::Args::Int64Args(
+                    protobuf::GenerateSeriesArgsInt64 {
+                        start: *int_64.start(),
+                        end: *int_64.end(),
+                        step: *int_64.step(),
+                        include_end: int_64.include_end(),
+                        name: Self::str_to_generate_series_name(int_64.name())? as i32,
+                    },
+                )),
+            };
+
+            return Ok(Some(protobuf::PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::GenerateSeries(node)),
+            }));
+        }
+
+        if let Some(timestamp_args) = generator_guard
+            .as_any()
+            .downcast_ref::<GenericSeriesState<TimestampValue>>()
+        {
+            let schema = exec.schema();
+
+            let start = timestamp_args.start().value();
+            let end = timestamp_args.end().value();
+
+            let step_value = timestamp_args.step();
+
+            let step = Some(datafusion_proto_common::IntervalMonthDayNanoValue {
+                months: step_value.months,
+                days: step_value.days,
+                nanos: step_value.nanoseconds,
+            });
+            let include_end = timestamp_args.include_end();
+            let name = Self::str_to_generate_series_name(timestamp_args.name())? as i32;
+
+            let args = match timestamp_args.current().tz_str() {
+                Some(tz) => protobuf::generate_series_node::Args::TimestampArgs(
+                    protobuf::GenerateSeriesArgsTimestamp {
+                        start,
+                        end,
+                        step,
+                        include_end,
+                        name,
+                        tz: Some(tz.to_string()),
+                    },
+                ),
+                None => protobuf::generate_series_node::Args::DateArgs(
+                    protobuf::GenerateSeriesArgsDate {
+                        start,
+                        end,
+                        step,
+                        include_end,
+                        name,
+                    },
+                ),
+            };
+
+            let node = protobuf::GenerateSeriesNode {
+                schema: Some(schema.as_ref().try_into()?),
+                target_batch_size: timestamp_args.batch_size() as u32,
+                args: Some(args),
+            };
+
+            return Ok(Some(protobuf::PhysicalPlanNode {
+                physical_plan_type: Some(PhysicalPlanType::GenerateSeries(node)),
+            }));
+        }
+
+        Ok(None)
+    }
+
+    fn try_from_async_func_exec(
+        exec: &AsyncFuncExec,
+        codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Self> {
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(exec.input()),
+            codec,
+            proto_converter,
+        )?;
+
+        let mut async_exprs = vec![];
+        let mut async_expr_names = vec![];
+
+        for async_expr in exec.async_exprs() {
+            async_exprs
+                .push(proto_converter.physical_expr_to_proto(&async_expr.func, codec)?);
+            async_expr_names.push(async_expr.name.clone())
+        }
+
+        Ok(protobuf::PhysicalPlanNode {
+            physical_plan_type: Some(PhysicalPlanType::AsyncFunc(Box::new(
+                protobuf::AsyncFuncExecNode {
+                    input: Some(Box::new(input)),
+                    async_exprs,
+                    async_expr_names,
+                },
+            ))),
+        })
+    }
+
+    fn try_from_buffer_exec(
+        exec: &BufferExec,
+        extension_codec: &dyn PhysicalExtensionCodec,
+        proto_converter: &dyn PhysicalProtoConverterExtension,
+    ) -> Result<Self> {
+        let input = protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(exec.input()),
+            extension_codec,
+            proto_converter,
+        )?;
+
+        Ok(protobuf::PhysicalPlanNode {
+            physical_plan_type: Some(PhysicalPlanType::Buffer(Box::new(
+                protobuf::BufferExecNode {
+                    input: Some(Box::new(input)),
+                    capacity: exec.capacity() as u64,
+                },
+            ))),
+        })
+    }
 }
 
 pub trait AsExecutionPlan: Debug + Send + Sync + Clone {
@@ -2762,25 +3642,25 @@ pub trait AsExecutionPlan: Debug + Send + Sync + Clone {
 
     fn try_into_physical_plan(
         &self,
-        registry: &dyn FunctionRegistry,
-        runtime: &RuntimeEnv,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        ctx: &TaskContext,
+
+        codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 
     fn try_from_physical_plan(
         plan: Arc<dyn ExecutionPlan>,
-        extension_codec: &dyn PhysicalExtensionCodec,
+        codec: &dyn PhysicalExtensionCodec,
     ) -> Result<Self>
     where
         Self: Sized;
 }
 
-pub trait PhysicalExtensionCodec: Debug + Send + Sync {
+pub trait PhysicalExtensionCodec: Debug + Send + Sync + Any {
     fn try_decode(
         &self,
         buf: &[u8],
         inputs: &[Arc<dyn ExecutionPlan>],
-        registry: &dyn FunctionRegistry,
+        ctx: &TaskContext,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 
     fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()>;
@@ -2836,7 +3716,7 @@ impl PhysicalExtensionCodec for DefaultPhysicalExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[Arc<dyn ExecutionPlan>],
-        _registry: &dyn FunctionRegistry,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         not_impl_err!("PhysicalExtensionCodec is not provided")
     }
@@ -2850,14 +3730,418 @@ impl PhysicalExtensionCodec for DefaultPhysicalExtensionCodec {
     }
 }
 
+/// Controls the conversion of physical plans and expressions to and from their
+/// Protobuf variants. Using this trait, users can perform optimizations on the
+/// conversion process or collect performance metrics.
+pub trait PhysicalProtoConverterExtension {
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto: &protobuf::PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>>;
+
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalPlanNode>;
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &protobuf::PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>>;
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalExprNode>;
+}
+
+/// DataEncoderTuple captures the position of the encoder
+/// in the codec list that was used to encode the data and actual encoded data
+#[derive(Clone, PartialEq, prost::Message)]
+struct DataEncoderTuple {
+    /// The position of encoder used to encode data
+    /// (to be used for decoding)
+    #[prost(uint32, tag = 1)]
+    pub encoder_position: u32,
+
+    #[prost(bytes, tag = 2)]
+    pub blob: Vec<u8>,
+}
+
+pub struct DefaultPhysicalProtoConverter;
+impl PhysicalProtoConverterExtension for DefaultPhysicalProtoConverter {
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto: &protobuf::PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        proto.try_into_physical_plan_with_converter(ctx, codec, self)
+    }
+
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalPlanNode>
+    where
+        Self: Sized,
+    {
+        protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            codec,
+            self,
+        )
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &protobuf::PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>>
+    where
+        Self: Sized,
+    {
+        // Default implementation calls the free function
+        parse_physical_expr_with_converter(proto, ctx, input_schema, codec, self)
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalExprNode> {
+        serialize_physical_expr_with_converter(expr, codec, self)
+    }
+}
+
+/// Internal serializer that adds expr_id to expressions.
+/// Created fresh for each serialization operation.
+struct DeduplicatingSerializer {
+    /// Random salt combined with pointer addresses and process ID to create globally unique expr_ids.
+    session_id: u64,
+}
+
+impl DeduplicatingSerializer {
+    fn new() -> Self {
+        Self {
+            session_id: rand::random(),
+        }
+    }
+}
+
+impl PhysicalProtoConverterExtension for DeduplicatingSerializer {
+    fn proto_to_execution_plan(
+        &self,
+        _ctx: &TaskContext,
+        _codec: &dyn PhysicalExtensionCodec,
+        _proto: &protobuf::PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        internal_err!("DeduplicatingSerializer cannot deserialize execution plans")
+    }
+
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalPlanNode>
+    where
+        Self: Sized,
+    {
+        protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            codec,
+            self,
+        )
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        _proto: &protobuf::PhysicalExprNode,
+        _ctx: &TaskContext,
+        _input_schema: &Schema,
+        _codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>>
+    where
+        Self: Sized,
+    {
+        internal_err!("DeduplicatingSerializer cannot deserialize physical expressions")
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalExprNode> {
+        let mut proto = serialize_physical_expr_with_converter(expr, codec, self)?;
+
+        // Hash session_id, pointer address, and process ID together to create expr_id.
+        // - session_id: random per serializer, prevents collisions when merging serializations
+        // - ptr: unique address per Arc within a process
+        // - pid: prevents collisions if serializer is shared across processes
+        let mut hasher = DefaultHasher::new();
+        self.session_id.hash(&mut hasher);
+        (Arc::as_ptr(expr) as *const () as u64).hash(&mut hasher);
+        std::process::id().hash(&mut hasher);
+        proto.expr_id = Some(hasher.finish());
+
+        Ok(proto)
+    }
+}
+
+/// Internal deserializer that caches expressions by expr_id.
+/// Created fresh for each deserialization operation.
+#[derive(Default)]
+struct DeduplicatingDeserializer {
+    /// Cache mapping expr_id to deserialized expressions.
+    cache: RefCell<HashMap<u64, Arc<dyn PhysicalExpr>>>,
+}
+
+impl PhysicalProtoConverterExtension for DeduplicatingDeserializer {
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto: &protobuf::PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        proto.try_into_physical_plan_with_converter(ctx, codec, self)
+    }
+
+    fn execution_plan_to_proto(
+        &self,
+        _plan: &Arc<dyn ExecutionPlan>,
+        _codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalPlanNode>
+    where
+        Self: Sized,
+    {
+        internal_err!("DeduplicatingDeserializer cannot serialize execution plans")
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &protobuf::PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>>
+    where
+        Self: Sized,
+    {
+        if let Some(expr_id) = proto.expr_id {
+            // Check cache first
+            if let Some(cached) = self.cache.borrow().get(&expr_id) {
+                return Ok(Arc::clone(cached));
+            }
+            // Deserialize and cache
+            let expr = parse_physical_expr_with_converter(
+                proto,
+                ctx,
+                input_schema,
+                codec,
+                self,
+            )?;
+            self.cache.borrow_mut().insert(expr_id, Arc::clone(&expr));
+            Ok(expr)
+        } else {
+            parse_physical_expr_with_converter(proto, ctx, input_schema, codec, self)
+        }
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        _expr: &Arc<dyn PhysicalExpr>,
+        _codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalExprNode> {
+        internal_err!("DeduplicatingDeserializer cannot serialize physical expressions")
+    }
+}
+
+/// A proto converter that adds expression deduplication during serialization
+/// and deserialization.
+///
+/// During serialization, each expression's Arc pointer address is XORed with a
+/// random session_id to create a salted `expr_id`. This prevents cross-process
+/// collisions when serialized plans are merged.
+///
+/// During deserialization, expressions with the same `expr_id` share the same
+/// Arc, reducing memory usage for plans with duplicate expressions (e.g., large
+/// IN lists) and supporting correctly linking [`DynamicFilterPhysicalExpr`] instances.
+///
+/// This converter is stateless - it creates internal serializers/deserializers
+/// on demand for each operation.
+///
+/// [`DynamicFilterPhysicalExpr`]: https://docs.rs/datafusion-physical-expr/latest/datafusion_physical_expr/expressions/struct.DynamicFilterPhysicalExpr.html
+#[derive(Debug, Default, Clone, Copy)]
+pub struct DeduplicatingProtoConverter {}
+
+impl PhysicalProtoConverterExtension for DeduplicatingProtoConverter {
+    fn proto_to_execution_plan(
+        &self,
+        ctx: &TaskContext,
+        codec: &dyn PhysicalExtensionCodec,
+        proto: &protobuf::PhysicalPlanNode,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let deserializer = DeduplicatingDeserializer::default();
+        proto.try_into_physical_plan_with_converter(ctx, codec, &deserializer)
+    }
+
+    fn execution_plan_to_proto(
+        &self,
+        plan: &Arc<dyn ExecutionPlan>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalPlanNode>
+    where
+        Self: Sized,
+    {
+        let serializer = DeduplicatingSerializer::new();
+        protobuf::PhysicalPlanNode::try_from_physical_plan_with_converter(
+            Arc::clone(plan),
+            codec,
+            &serializer,
+        )
+    }
+
+    fn proto_to_physical_expr(
+        &self,
+        proto: &protobuf::PhysicalExprNode,
+        ctx: &TaskContext,
+        input_schema: &Schema,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<Arc<dyn PhysicalExpr>>
+    where
+        Self: Sized,
+    {
+        let deserializer = DeduplicatingDeserializer::default();
+        deserializer.proto_to_physical_expr(proto, ctx, input_schema, codec)
+    }
+
+    fn physical_expr_to_proto(
+        &self,
+        expr: &Arc<dyn PhysicalExpr>,
+        codec: &dyn PhysicalExtensionCodec,
+    ) -> Result<protobuf::PhysicalExprNode> {
+        let serializer = DeduplicatingSerializer::new();
+        serializer.physical_expr_to_proto(expr, codec)
+    }
+}
+
+/// A PhysicalExtensionCodec that tries one of multiple inner codecs
+/// until one works
+#[derive(Debug)]
+pub struct ComposedPhysicalExtensionCodec {
+    codecs: Vec<Arc<dyn PhysicalExtensionCodec>>,
+}
+
+impl ComposedPhysicalExtensionCodec {
+    // Position in this codecs list is important as it will be used for decoding.
+    // If new codec is added it should go to last position.
+    pub fn new(codecs: Vec<Arc<dyn PhysicalExtensionCodec>>) -> Self {
+        Self { codecs }
+    }
+
+    fn decode_protobuf<R>(
+        &self,
+        buf: &[u8],
+        decode: impl FnOnce(&dyn PhysicalExtensionCodec, &[u8]) -> Result<R>,
+    ) -> Result<R> {
+        let proto =
+            DataEncoderTuple::decode(buf).map_err(|e| internal_datafusion_err!("{e}"))?;
+
+        let codec = self.codecs.get(proto.encoder_position as usize).ok_or(
+            internal_datafusion_err!("Can't find required codec in codec list"),
+        )?;
+
+        decode(codec.as_ref(), &proto.blob)
+    }
+
+    fn encode_protobuf(
+        &self,
+        buf: &mut Vec<u8>,
+        mut encode: impl FnMut(&dyn PhysicalExtensionCodec, &mut Vec<u8>) -> Result<()>,
+    ) -> Result<()> {
+        let mut data = vec![];
+        let mut last_err = None;
+        let mut encoder_position = None;
+
+        // find the encoder
+        for (position, codec) in self.codecs.iter().enumerate() {
+            match encode(codec.as_ref(), &mut data) {
+                Ok(_) => {
+                    encoder_position = Some(position as u32);
+                    break;
+                }
+                Err(err) => last_err = Some(err),
+            }
+        }
+
+        let encoder_position = encoder_position.ok_or_else(|| {
+            last_err.unwrap_or_else(|| {
+                DataFusionError::NotImplemented(
+                    "Empty list of composed codecs".to_owned(),
+                )
+            })
+        })?;
+
+        // encode with encoder position
+        let proto = DataEncoderTuple {
+            encoder_position,
+            blob: data,
+        };
+        proto
+            .encode(buf)
+            .map_err(|e| internal_datafusion_err!("{e}"))
+    }
+}
+
+impl PhysicalExtensionCodec for ComposedPhysicalExtensionCodec {
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+        ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        self.decode_protobuf(buf, |codec, data| codec.try_decode(data, inputs, ctx))
+    }
+
+    fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> {
+        self.encode_protobuf(buf, |codec, data| codec.try_encode(Arc::clone(&node), data))
+    }
+
+    fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
+        self.decode_protobuf(buf, |codec, data| codec.try_decode_udf(name, data))
+    }
+
+    fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) -> Result<()> {
+        self.encode_protobuf(buf, |codec, data| codec.try_encode_udf(node, data))
+    }
+
+    fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
+        self.decode_protobuf(buf, |codec, data| codec.try_decode_udaf(name, data))
+    }
+
+    fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) -> Result<()> {
+        self.encode_protobuf(buf, |codec, data| codec.try_encode_udaf(node, data))
+    }
+}
+
 fn into_physical_plan(
     node: &Option<Box<protobuf::PhysicalPlanNode>>,
-    registry: &dyn FunctionRegistry,
-    runtime: &RuntimeEnv,
-    extension_codec: &dyn PhysicalExtensionCodec,
-) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    ctx: &TaskContext,
+    codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
+) -> Result<Arc<dyn ExecutionPlan>> {
     if let Some(field) = node {
-        field.try_into_physical_plan(registry, runtime, extension_codec)
+        proto_converter.proto_to_execution_plan(ctx, codec, field)
     } else {
         Err(proto_error("Missing required field in protobuf"))
     }
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index d1b1f51ae1075..990a54cf94c7a 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -17,60 +17,70 @@
 
 use std::sync::Arc;
 
+use arrow::array::RecordBatch;
+use arrow::datatypes::Schema;
+use arrow::ipc::writer::StreamWriter;
+use datafusion_common::{
+    DataFusionError, Result, internal_datafusion_err, internal_err, not_impl_err,
+};
+use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
+use datafusion_datasource::{FileRange, PartitionedFile};
+use datafusion_datasource_csv::file_format::CsvSink;
+use datafusion_datasource_json::file_format::JsonSink;
 #[cfg(feature = "parquet")]
-use datafusion::datasource::file_format::parquet::ParquetSink;
-use datafusion::datasource::physical_plan::FileSink;
-use datafusion::physical_expr::window::{SlidingAggregateWindowExpr, StandardWindowExpr};
-use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr};
-use datafusion::physical_expr_common::physical_expr::snapshot_physical_expr;
-use datafusion::physical_plan::expressions::{
+use datafusion_datasource_parquet::file_format::ParquetSink;
+use datafusion_expr::WindowFrame;
+use datafusion_physical_expr::ScalarFunctionExpr;
+use datafusion_physical_expr::window::{SlidingAggregateWindowExpr, StandardWindowExpr};
+use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use datafusion_physical_plan::expressions::{
     BinaryExpr, CaseExpr, CastExpr, Column, InListExpr, IsNotNullExpr, IsNullExpr,
-    Literal, NegativeExpr, NotExpr, TryCastExpr, UnKnownColumn,
+    LikeExpr, Literal, NegativeExpr, NotExpr, TryCastExpr, UnKnownColumn,
 };
-use datafusion::physical_plan::udaf::AggregateFunctionExpr;
-use datafusion::physical_plan::windows::{PlainAggregateWindowExpr, WindowUDFExpr};
-use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
-use datafusion::{
-    datasource::{
-        file_format::{csv::CsvSink, json::JsonSink},
-        listing::{FileRange, PartitionedFile},
-        physical_plan::{FileScanConfig, FileSinkConfig},
-    },
-    physical_plan::expressions::LikeExpr,
+use datafusion_physical_plan::joins::{HashExpr, HashTableLookupExpr};
+use datafusion_physical_plan::udaf::AggregateFunctionExpr;
+use datafusion_physical_plan::windows::{PlainAggregateWindowExpr, WindowUDFExpr};
+use datafusion_physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
+
+use super::{
+    DefaultPhysicalProtoConverter, PhysicalExtensionCodec,
+    PhysicalProtoConverterExtension,
 };
-use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result};
-use datafusion_expr::WindowFrame;
-
 use crate::protobuf::{
-    self, physical_aggregate_expr_node, physical_window_expr_node, PhysicalSortExprNode,
-    PhysicalSortExprNodeCollection,
+    self, PhysicalSortExprNode, PhysicalSortExprNodeCollection,
+    physical_aggregate_expr_node, physical_window_expr_node,
 };
 
-use super::PhysicalExtensionCodec;
-
+#[expect(clippy::needless_pass_by_value)]
 pub fn serialize_physical_aggr_expr(
     aggr_expr: Arc<AggregateFunctionExpr>,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::PhysicalExprNode> {
-    let expressions = serialize_physical_exprs(&aggr_expr.expressions(), codec)?;
-    let ordering_req = match aggr_expr.order_bys() {
-        Some(order) => order.clone(),
-        None => LexOrdering::default(),
-    };
-    let ordering_req = serialize_physical_sort_exprs(ordering_req, codec)?;
+    let expressions =
+        serialize_physical_exprs(&aggr_expr.expressions(), codec, proto_converter)?;
+    let order_bys = serialize_physical_sort_exprs(
+        aggr_expr.order_bys().iter().cloned(),
+        codec,
+        proto_converter,
+    )?;
 
     let name = aggr_expr.fun().name().to_string();
     let mut buf = Vec::new();
     codec.try_encode_udaf(aggr_expr.fun(), &mut buf)?;
     Ok(protobuf::PhysicalExprNode {
+        expr_id: None,
         expr_type: Some(protobuf::physical_expr_node::ExprType::AggregateExpr(
             protobuf::PhysicalAggregateExprNode {
                 aggregate_function: Some(physical_aggregate_expr_node::AggregateFunction::UserDefinedAggrFunction(name)),
                 expr: expressions,
-                ordering_req,
+                ordering_req: order_bys,
                 distinct: aggr_expr.is_distinct(),
                 ignore_nulls: aggr_expr.ignore_nulls(),
                 fun_definition: (!buf.is_empty()).then_some(buf),
+                human_display: aggr_expr.human_display().to_string(),
             },
         )),
     })
@@ -81,12 +91,7 @@ fn serialize_physical_window_aggr_expr(
     _window_frame: &WindowFrame,
     codec: &dyn PhysicalExtensionCodec,
 ) -> Result<(physical_window_expr_node::WindowFunction, Option<Vec<u8>>)> {
-    if aggr_expr.is_distinct() || aggr_expr.ignore_nulls() {
-        // TODO
-        return not_impl_err!(
-            "Distinct aggregate functions not supported in window expressions"
-        );
-    }
+    // Distinct and ignore_nulls are now supported in window expressions
 
     let mut buf = Vec::new();
     codec.try_encode_udaf(aggr_expr.fun(), &mut buf)?;
@@ -101,57 +106,75 @@ fn serialize_physical_window_aggr_expr(
 pub fn serialize_physical_window_expr(
     window_expr: &Arc<dyn WindowExpr>,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::PhysicalWindowExprNode> {
     let expr = window_expr.as_any();
-    let args = window_expr.expressions().to_vec();
+    let mut args = window_expr.expressions().to_vec();
     let window_frame = window_expr.get_window_frame();
 
-    let (window_function, fun_definition) = if let Some(plain_aggr_window_expr) =
-        expr.downcast_ref::<PlainAggregateWindowExpr>()
-    {
-        serialize_physical_window_aggr_expr(
-            plain_aggr_window_expr.get_aggregate_expr(),
-            window_frame,
-            codec,
-        )?
-    } else if let Some(sliding_aggr_window_expr) =
-        expr.downcast_ref::<SlidingAggregateWindowExpr>()
-    {
-        serialize_physical_window_aggr_expr(
-            sliding_aggr_window_expr.get_aggregate_expr(),
-            window_frame,
-            codec,
-        )?
-    } else if let Some(udf_window_expr) = expr.downcast_ref::<StandardWindowExpr>() {
-        if let Some(expr) = udf_window_expr
-            .get_standard_func_expr()
-            .as_any()
-            .downcast_ref::<WindowUDFExpr>()
+    let (window_function, fun_definition, ignore_nulls, distinct) =
+        if let Some(plain_aggr_window_expr) =
+            expr.downcast_ref::<PlainAggregateWindowExpr>()
         {
-            let mut buf = Vec::new();
-            codec.try_encode_udwf(expr.fun(), &mut buf)?;
+            let aggr_expr = plain_aggr_window_expr.get_aggregate_expr();
+            let (window_function, fun_definition) =
+                serialize_physical_window_aggr_expr(aggr_expr, window_frame, codec)?;
             (
-                physical_window_expr_node::WindowFunction::UserDefinedWindowFunction(
-                    expr.fun().name().to_string(),
-                ),
-                (!buf.is_empty()).then_some(buf),
+                window_function,
+                fun_definition,
+                aggr_expr.ignore_nulls(),
+                aggr_expr.is_distinct(),
             )
+        } else if let Some(sliding_aggr_window_expr) =
+            expr.downcast_ref::<SlidingAggregateWindowExpr>()
+        {
+            let aggr_expr = sliding_aggr_window_expr.get_aggregate_expr();
+            let (window_function, fun_definition) =
+                serialize_physical_window_aggr_expr(aggr_expr, window_frame, codec)?;
+            (
+                window_function,
+                fun_definition,
+                aggr_expr.ignore_nulls(),
+                aggr_expr.is_distinct(),
+            )
+        } else if let Some(udf_window_expr) = expr.downcast_ref::<StandardWindowExpr>() {
+            if let Some(expr) = udf_window_expr
+                .get_standard_func_expr()
+                .as_any()
+                .downcast_ref::<WindowUDFExpr>()
+            {
+                let mut buf = Vec::new();
+                codec.try_encode_udwf(expr.fun(), &mut buf)?;
+                args = expr.args().to_vec();
+                (
+                    physical_window_expr_node::WindowFunction::UserDefinedWindowFunction(
+                        expr.fun().name().to_string(),
+                    ),
+                    (!buf.is_empty()).then_some(buf),
+                    false, // WindowUDFExpr doesn't have ignore_nulls/distinct
+                    false,
+                )
+            } else {
+                return not_impl_err!(
+                    "User-defined window function not supported: {window_expr:?}"
+                );
+            }
         } else {
-            return not_impl_err!(
-                "User-defined window function not supported: {window_expr:?}"
-            );
-        }
-    } else {
-        return not_impl_err!("WindowExpr not supported: {window_expr:?}");
-    };
-
-    let args = serialize_physical_exprs(&args, codec)?;
-    let partition_by = serialize_physical_exprs(window_expr.partition_by(), codec)?;
-    let order_by = serialize_physical_sort_exprs(window_expr.order_by().to_vec(), codec)?;
+            return not_impl_err!("WindowExpr not supported: {window_expr:?}");
+        };
+
+    let args = serialize_physical_exprs(&args, codec, proto_converter)?;
+    let partition_by =
+        serialize_physical_exprs(window_expr.partition_by(), codec, proto_converter)?;
+    let order_by = serialize_physical_sort_exprs(
+        window_expr.order_by().to_vec(),
+        codec,
+        proto_converter,
+    )?;
     let window_frame: protobuf::WindowFrame = window_frame
         .as_ref()
         .try_into()
-        .map_err(|e| DataFusionError::Internal(format!("{e}")))?;
+        .map_err(|e| internal_datafusion_err!("{e}"))?;
 
     Ok(protobuf::PhysicalWindowExprNode {
         args,
@@ -161,28 +184,32 @@ pub fn serialize_physical_window_expr(
         window_function: Some(window_function),
         name: window_expr.name().to_string(),
         fun_definition,
+        ignore_nulls,
+        distinct,
     })
 }
 
 pub fn serialize_physical_sort_exprs<I>(
     sort_exprs: I,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Vec<PhysicalSortExprNode>>
 where
     I: IntoIterator<Item = PhysicalSortExpr>,
 {
     sort_exprs
         .into_iter()
-        .map(|sort_expr| serialize_physical_sort_expr(sort_expr, codec))
+        .map(|sort_expr| serialize_physical_sort_expr(sort_expr, codec, proto_converter))
         .collect()
 }
 
 pub fn serialize_physical_sort_expr(
     sort_expr: PhysicalSortExpr,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<PhysicalSortExprNode> {
     let PhysicalSortExpr { expr, options } = sort_expr;
-    let expr = serialize_physical_expr(&expr, codec)?;
+    let expr = proto_converter.physical_expr_to_proto(&expr, codec)?;
     Ok(PhysicalSortExprNode {
         expr: Some(Box::new(expr)),
         asc: !options.descending,
@@ -193,13 +220,14 @@ pub fn serialize_physical_sort_expr(
 pub fn serialize_physical_exprs<'a, I>(
     values: I,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Vec<protobuf::PhysicalExprNode>>
 where
     I: IntoIterator<Item = &'a Arc<dyn PhysicalExpr>>,
 {
     values
         .into_iter()
-        .map(|value| serialize_physical_expr(value, codec))
+        .map(|value| proto_converter.physical_expr_to_proto(value, codec))
         .collect()
 }
 
@@ -210,14 +238,58 @@ where
 pub fn serialize_physical_expr(
     value: &Arc<dyn PhysicalExpr>,
     codec: &dyn PhysicalExtensionCodec,
+) -> Result<protobuf::PhysicalExprNode> {
+    serialize_physical_expr_with_converter(
+        value,
+        codec,
+        &DefaultPhysicalProtoConverter {},
+    )
+}
+
+/// Serialize a `PhysicalExpr` to default protobuf representation.
+///
+/// If required, a [`PhysicalExtensionCodec`] can be provided which can handle
+/// serialization of udfs requiring specialized serialization (see [`PhysicalExtensionCodec::try_encode_udf`]).
+/// A [`PhysicalProtoConverterExtension`] can be provided to handle the
+/// conversion process (see [`PhysicalProtoConverterExtension::physical_expr_to_proto`]).
+pub fn serialize_physical_expr_with_converter(
+    value: &Arc<dyn PhysicalExpr>,
+    codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::PhysicalExprNode> {
     // Snapshot the expr in case it has dynamic predicate state so
     // it can be serialized
     let value = snapshot_physical_expr(Arc::clone(value))?;
     let expr = value.as_any();
 
+    // HashTableLookupExpr is used for dynamic filter pushdown in hash joins.
+    // It contains an Arc<dyn JoinHashMapType> (the build-side hash table) which
+    // cannot be serialized - the hash table is a runtime structure built during
+    // execution on the build side.
+    //
+    // We replace it with lit(true) which is safe because:
+    // 1. The filter is a performance optimization, not a correctness requirement
+    // 2. lit(true) passes all rows, so no valid rows are incorrectly filtered out
+    // 3. The join itself will still produce correct results, just without the
+    //    benefit of early filtering on the probe side
+    //
+    // In distributed execution, the remote worker won't have access to the hash
+    // table anyway, so the best we can do is skip this optimization.
+    if expr.downcast_ref::<HashTableLookupExpr>().is_some() {
+        let value = datafusion_proto_common::ScalarValue {
+            value: Some(datafusion_proto_common::scalar_value::Value::BoolValue(
+                true,
+            )),
+        };
+        return Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
+            expr_type: Some(protobuf::physical_expr_node::ExprType::Literal(value)),
+        });
+    }
+
     if let Some(expr) = expr.downcast_ref::<Column>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::Column(
                 protobuf::PhysicalColumn {
                     name: expr.name().to_string(),
@@ -227,6 +299,7 @@ pub fn serialize_physical_expr(
         })
     } else if let Some(expr) = expr.downcast_ref::<UnKnownColumn>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::UnknownColumn(
                 protobuf::UnknownColumn {
                     name: expr.name().to_string(),
@@ -235,18 +308,24 @@ pub fn serialize_physical_expr(
         })
     } else if let Some(expr) = expr.downcast_ref::<BinaryExpr>() {
         let binary_expr = Box::new(protobuf::PhysicalBinaryExprNode {
-            l: Some(Box::new(serialize_physical_expr(expr.left(), codec)?)),
-            r: Some(Box::new(serialize_physical_expr(expr.right(), codec)?)),
+            l: Some(Box::new(
+                proto_converter.physical_expr_to_proto(expr.left(), codec)?,
+            )),
+            r: Some(Box::new(
+                proto_converter.physical_expr_to_proto(expr.right(), codec)?,
+            )),
             op: format!("{:?}", expr.op()),
         });
 
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::BinaryExpr(
                 binary_expr,
             )),
         })
     } else if let Some(expr) = expr.downcast_ref::<CaseExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(
                 protobuf::physical_expr_node::ExprType::Case(
                     Box::new(
@@ -254,14 +333,21 @@ pub fn serialize_physical_expr(
                             expr: expr
                                 .expr()
                                 .map(|exp| {
-                                    serialize_physical_expr(exp, codec).map(Box::new)
+                                    proto_converter
+                                        .physical_expr_to_proto(exp, codec)
+                                        .map(Box::new)
                                 })
                                 .transpose()?,
                             when_then_expr: expr
                                 .when_then_expr()
                                 .iter()
                                 .map(|(when_expr, then_expr)| {
-                                    serialize_when_then_expr(when_expr, then_expr, codec)
+                                    serialize_when_then_expr(
+                                        when_expr,
+                                        then_expr,
+                                        codec,
+                                        proto_converter,
+                                    )
                                 })
                                 .collect::<Result<
                                     Vec<protobuf::PhysicalWhenThen>,
@@ -269,7 +355,11 @@ pub fn serialize_physical_expr(
                                 >>()?,
                             else_expr: expr
                                 .else_expr()
-                                .map(|a| serialize_physical_expr(a, codec).map(Box::new))
+                                .map(|a| {
+                                    proto_converter
+                                        .physical_expr_to_proto(a, codec)
+                                        .map(Box::new)
+                                })
                                 .transpose()?,
                         },
                     ),
@@ -278,66 +368,88 @@ pub fn serialize_physical_expr(
         })
     } else if let Some(expr) = expr.downcast_ref::<NotExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::NotExpr(Box::new(
                 protobuf::PhysicalNot {
-                    expr: Some(Box::new(serialize_physical_expr(expr.arg(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.arg(), codec)?,
+                    )),
                 },
             ))),
         })
     } else if let Some(expr) = expr.downcast_ref::<IsNullExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::IsNullExpr(
                 Box::new(protobuf::PhysicalIsNull {
-                    expr: Some(Box::new(serialize_physical_expr(expr.arg(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.arg(), codec)?,
+                    )),
                 }),
             )),
         })
     } else if let Some(expr) = expr.downcast_ref::<IsNotNullExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::IsNotNullExpr(
                 Box::new(protobuf::PhysicalIsNotNull {
-                    expr: Some(Box::new(serialize_physical_expr(expr.arg(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.arg(), codec)?,
+                    )),
                 }),
             )),
         })
     } else if let Some(expr) = expr.downcast_ref::<InListExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::InList(Box::new(
                 protobuf::PhysicalInListNode {
-                    expr: Some(Box::new(serialize_physical_expr(expr.expr(), codec)?)),
-                    list: serialize_physical_exprs(expr.list(), codec)?,
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.expr(), codec)?,
+                    )),
+                    list: serialize_physical_exprs(expr.list(), codec, proto_converter)?,
                     negated: expr.negated(),
                 },
             ))),
         })
     } else if let Some(expr) = expr.downcast_ref::<NegativeExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::Negative(Box::new(
                 protobuf::PhysicalNegativeNode {
-                    expr: Some(Box::new(serialize_physical_expr(expr.arg(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.arg(), codec)?,
+                    )),
                 },
             ))),
         })
     } else if let Some(lit) = expr.downcast_ref::<Literal>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::Literal(
                 lit.value().try_into()?,
             )),
         })
     } else if let Some(cast) = expr.downcast_ref::<CastExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::Cast(Box::new(
                 protobuf::PhysicalCastNode {
-                    expr: Some(Box::new(serialize_physical_expr(cast.expr(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(cast.expr(), codec)?,
+                    )),
                     arrow_type: Some(cast.cast_type().try_into()?),
                 },
             ))),
         })
     } else if let Some(cast) = expr.downcast_ref::<TryCastExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::TryCast(Box::new(
                 protobuf::PhysicalTryCastNode {
-                    expr: Some(Box::new(serialize_physical_expr(cast.expr(), codec)?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(cast.expr(), codec)?,
+                    )),
                     arrow_type: Some(cast.cast_type().try_into()?),
                 },
             ))),
@@ -346,30 +458,52 @@ pub fn serialize_physical_expr(
         let mut buf = Vec::new();
         codec.try_encode_udf(expr.fun(), &mut buf)?;
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::ScalarUdf(
                 protobuf::PhysicalScalarUdfNode {
                     name: expr.name().to_string(),
-                    args: serialize_physical_exprs(expr.args(), codec)?,
+                    args: serialize_physical_exprs(expr.args(), codec, proto_converter)?,
                     fun_definition: (!buf.is_empty()).then_some(buf),
                     return_type: Some(expr.return_type().try_into()?),
                     nullable: expr.nullable(),
+                    return_field_name: expr
+                        .return_field(&Schema::empty())?
+                        .name()
+                        .to_string(),
                 },
             )),
         })
     } else if let Some(expr) = expr.downcast_ref::<LikeExpr>() {
         Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
             expr_type: Some(protobuf::physical_expr_node::ExprType::LikeExpr(Box::new(
                 protobuf::PhysicalLikeExprNode {
                     negated: expr.negated(),
                     case_insensitive: expr.case_insensitive(),
-                    expr: Some(Box::new(serialize_physical_expr(expr.expr(), codec)?)),
-                    pattern: Some(Box::new(serialize_physical_expr(
-                        expr.pattern(),
-                        codec,
-                    )?)),
+                    expr: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.expr(), codec)?,
+                    )),
+                    pattern: Some(Box::new(
+                        proto_converter.physical_expr_to_proto(expr.pattern(), codec)?,
+                    )),
                 },
             ))),
         })
+    } else if let Some(expr) = expr.downcast_ref::<HashExpr>() {
+        Ok(protobuf::PhysicalExprNode {
+            expr_id: None,
+            expr_type: Some(protobuf::physical_expr_node::ExprType::HashExpr(
+                protobuf::PhysicalHashExprNode {
+                    on_columns: serialize_physical_exprs(
+                        expr.on_columns(),
+                        codec,
+                        proto_converter,
+                    )?,
+                    seed0: expr.seed(),
+                    description: expr.description().to_string(),
+                },
+            )),
+        })
     } else {
         let mut buf: Vec<u8> = vec![];
         match codec.try_encode_expr(&value, &mut buf) {
@@ -377,9 +511,10 @@ pub fn serialize_physical_expr(
                 let inputs: Vec<protobuf::PhysicalExprNode> = value
                     .children()
                     .into_iter()
-                    .map(|e| serialize_physical_expr(e, codec))
+                    .map(|e| proto_converter.physical_expr_to_proto(e, codec))
                     .collect::<Result<_>>()?;
                 Ok(protobuf::PhysicalExprNode {
+                    expr_id: None,
                     expr_type: Some(protobuf::physical_expr_node::ExprType::Extension(
                         protobuf::PhysicalExtensionExprNode { expr: buf, inputs },
                     )),
@@ -395,6 +530,7 @@ pub fn serialize_physical_expr(
 pub fn serialize_partitioning(
     partitioning: &Partitioning,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::Partitioning> {
     let serialized_partitioning = match partitioning {
         Partitioning::RoundRobinBatch(partition_count) => protobuf::Partitioning {
@@ -403,7 +539,8 @@ pub fn serialize_partitioning(
             )),
         },
         Partitioning::Hash(exprs, partition_count) => {
-            let serialized_exprs = serialize_physical_exprs(exprs, codec)?;
+            let serialized_exprs =
+                serialize_physical_exprs(exprs, codec, proto_converter)?;
             protobuf::Partitioning {
                 partition_method: Some(protobuf::partitioning::PartitionMethod::Hash(
                     protobuf::PhysicalHashRepartition {
@@ -426,10 +563,11 @@ fn serialize_when_then_expr(
     when_expr: &Arc<dyn PhysicalExpr>,
     then_expr: &Arc<dyn PhysicalExpr>,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::PhysicalWhenThen> {
     Ok(protobuf::PhysicalWhenThen {
-        when_expr: Some(serialize_physical_expr(when_expr, codec)?),
-        then_expr: Some(serialize_physical_expr(then_expr, codec)?),
+        when_expr: Some(proto_converter.physical_expr_to_proto(when_expr, codec)?),
+        then_expr: Some(proto_converter.physical_expr_to_proto(then_expr, codec)?),
     })
 }
 
@@ -485,6 +623,7 @@ impl TryFrom<&[PartitionedFile]> for protobuf::FileGroup {
 pub fn serialize_file_scan_config(
     conf: &FileScanConfig,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::FileScanExecConf> {
     let file_groups = conf
         .file_groups
@@ -494,35 +633,56 @@ pub fn serialize_file_scan_config(
 
     let mut output_orderings = vec![];
     for order in &conf.output_ordering {
-        let ordering = serialize_physical_sort_exprs(order.to_vec(), codec)?;
+        let ordering =
+            serialize_physical_sort_exprs(order.to_vec(), codec, proto_converter)?;
         output_orderings.push(ordering)
     }
 
     // Fields must be added to the schema so that they can persist in the protobuf,
     // and then they are to be removed from the schema in `parse_protobuf_file_scan_config`
     let mut fields = conf
-        .file_schema
+        .file_schema()
         .fields()
         .iter()
         .cloned()
         .collect::<Vec<_>>();
-    fields.extend(conf.table_partition_cols.iter().cloned().map(Arc::new));
-    let schema = Arc::new(arrow::datatypes::Schema::new(fields.clone()));
+    fields.extend(conf.table_partition_cols().iter().cloned());
+
+    let schema = Arc::new(
+        Schema::new(fields.clone()).with_metadata(conf.file_schema().metadata.clone()),
+    );
+
+    let projection_exprs = conf
+        .file_source
+        .projection()
+        .as_ref()
+        .map(|projection_exprs| {
+            let projections = projection_exprs.iter().cloned().collect::<Vec<_>>();
+            Ok::<_, DataFusionError>(protobuf::ProjectionExprs {
+                projections: projections
+                    .into_iter()
+                    .map(|expr| {
+                        Ok(protobuf::ProjectionExpr {
+                            alias: expr.alias.to_string(),
+                            expr: Some(
+                                proto_converter
+                                    .physical_expr_to_proto(&expr.expr, codec)?,
+                            ),
+                        })
+                    })
+                    .collect::<Result<Vec<_>>>()?,
+            })
+        })
+        .transpose()?;
 
     Ok(protobuf::FileScanExecConf {
         file_groups,
-        statistics: Some((&conf.file_source.statistics().unwrap()).into()),
+        statistics: Some((&conf.statistics()).into()),
         limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }),
-        projection: conf
-            .projection
-            .as_ref()
-            .unwrap_or(&(0..schema.fields().len()).collect::<Vec<_>>())
-            .iter()
-            .map(|n| *n as u32)
-            .collect(),
+        projection: vec![],
         schema: Some(schema.as_ref().try_into()?),
         table_partition_cols: conf
-            .table_partition_cols
+            .table_partition_cols()
             .iter()
             .map(|x| x.name().clone())
             .collect::<Vec<_>>(),
@@ -535,21 +695,37 @@ pub fn serialize_file_scan_config(
             .collect::<Vec<_>>(),
         constraints: Some(conf.constraints.clone().into()),
         batch_size: conf.batch_size.map(|s| s as u64),
+        projection_exprs,
     })
 }
 
 pub fn serialize_maybe_filter(
     expr: Option<Arc<dyn PhysicalExpr>>,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<protobuf::MaybeFilter> {
     match expr {
         None => Ok(protobuf::MaybeFilter { expr: None }),
         Some(expr) => Ok(protobuf::MaybeFilter {
-            expr: Some(serialize_physical_expr(&expr, codec)?),
+            expr: Some(proto_converter.physical_expr_to_proto(&expr, codec)?),
         }),
     }
 }
 
+pub fn serialize_record_batches(batches: &[RecordBatch]) -> Result<Vec<u8>> {
+    if batches.is_empty() {
+        return Ok(vec![]);
+    }
+    let schema = batches[0].schema();
+    let mut buf = Vec::new();
+    let mut writer = StreamWriter::try_new(&mut buf, &schema)?;
+    for batch in batches {
+        writer.write(batch)?;
+    }
+    writer.finish()?;
+    Ok(buf)
+}
+
 impl TryFrom<&JsonSink> for protobuf::JsonSink {
     type Error = DataFusionError;
 
@@ -608,6 +784,17 @@ impl TryFrom<&FileSinkConfig> for protobuf::FileSinkConfig {
                 })
             })
             .collect::<Result<Vec<_>>>()?;
+        let file_output_mode = match conf.file_output_mode {
+            datafusion_datasource::file_sink_config::FileOutputMode::Automatic => {
+                protobuf::FileOutputMode::Automatic
+            }
+            datafusion_datasource::file_sink_config::FileOutputMode::SingleFile => {
+                protobuf::FileOutputMode::SingleFile
+            }
+            datafusion_datasource::file_sink_config::FileOutputMode::Directory => {
+                protobuf::FileOutputMode::Directory
+            }
+        };
         Ok(Self {
             object_store_url: conf.object_store_url.to_string(),
             file_groups,
@@ -617,6 +804,7 @@ impl TryFrom<&FileSinkConfig> for protobuf::FileSinkConfig {
             keep_partition_by_columns: conf.keep_partition_by_columns,
             insert_op: conf.insert_op as i32,
             file_extension: conf.file_extension.to_string(),
+            file_output_mode: file_output_mode.into(),
         })
     }
 }
diff --git a/datafusion/proto/tests/cases/mod.rs b/datafusion/proto/tests/cases/mod.rs
index 4c7da2768e744..aec6c1de30309 100644
--- a/datafusion/proto/tests/cases/mod.rs
+++ b/datafusion/proto/tests/cases/mod.rs
@@ -17,16 +17,19 @@
 
 use arrow::datatypes::{DataType, Field, FieldRef};
 use datafusion::logical_expr::ColumnarValue;
+use datafusion::physical_expr::PhysicalExpr;
 use datafusion_common::plan_err;
 use datafusion_expr::function::AccumulatorArgs;
 use datafusion_expr::{
-    Accumulator, AggregateUDFImpl, PartitionEvaluator, ScalarFunctionArgs, ScalarUDFImpl,
-    Signature, Volatility, WindowUDFImpl,
+    Accumulator, AggregateUDFImpl, LimitEffect, PartitionEvaluator, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, Volatility, WindowUDFImpl,
 };
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
 use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 use std::any::Any;
 use std::fmt::Debug;
+use std::hash::Hash;
+use std::sync::Arc;
 
 mod roundtrip_logical_plan;
 mod roundtrip_physical_plan;
@@ -131,7 +134,7 @@ pub struct MyAggregateUdfNode {
     pub result: String,
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub(in crate::cases) struct CustomUDWF {
     signature: Signature,
     payload: String,
@@ -172,6 +175,10 @@ impl WindowUDFImpl for CustomUDWF {
     ) -> datafusion_common::Result<FieldRef> {
         Ok(Field::new(field_args.name(), DataType::UInt64, false).into())
     }
+
+    fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+        LimitEffect::Unknown
+    }
 }
 
 #[derive(Debug)]
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index b515ef6e38de2..63ad00c92e6a9 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -19,18 +19,21 @@ use arrow::array::{
     ArrayRef, FixedSizeListArray, Int32Builder, MapArray, MapBuilder, StringBuilder,
 };
 use arrow::datatypes::{
-    DataType, Field, FieldRef, Fields, Int32Type, IntervalDayTimeType,
-    IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields,
-    UnionMode, DECIMAL256_MAX_PRECISION,
+    DECIMAL256_MAX_PRECISION, DataType, Field, FieldRef, Fields, Int32Type,
+    IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef,
+    TimeUnit, UnionFields, UnionMode,
 };
 use arrow::util::pretty::pretty_format_batches;
 use datafusion::datasource::file_format::json::{JsonFormat, JsonFormatFactory};
 use datafusion::datasource::listing::{
     ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
 };
-use datafusion::optimizer::eliminate_nested_union::EliminateNestedUnion;
+use datafusion::execution::options::{ArrowReadOptions, JsonReadOptions};
 use datafusion::optimizer::Optimizer;
+use datafusion::optimizer::optimize_unions::OptimizeUnions;
+use datafusion_common::parquet_config::DFParquetWriterVersion;
 use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_functions_aggregate::sum::sum_distinct;
 use prost::Message;
 use std::any::Any;
 use std::collections::HashMap;
@@ -40,12 +43,13 @@ use std::sync::Arc;
 use std::vec;
 
 use datafusion::catalog::{TableProvider, TableProviderFactory};
+use datafusion::datasource::DefaultTableSource;
 use datafusion::datasource::file_format::arrow::ArrowFormatFactory;
 use datafusion::datasource::file_format::csv::CsvFormatFactory;
 use datafusion::datasource::file_format::parquet::ParquetFormatFactory;
-use datafusion::datasource::file_format::{format_as_file_type, DefaultFileType};
-use datafusion::execution::session_state::SessionStateBuilder;
+use datafusion::datasource::file_format::{DefaultFileType, format_as_file_type};
 use datafusion::execution::FunctionRegistry;
+use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion::functions_aggregate::count::count_udaf;
 use datafusion::functions_aggregate::expr_fn::{
     approx_median, approx_percentile_cont, approx_percentile_cont_with_weight, count,
@@ -59,30 +63,32 @@ use datafusion::functions_window::expr_fn::{
     cume_dist, dense_rank, lag, lead, ntile, percent_rank, rank, row_number,
 };
 use datafusion::functions_window::rank::rank_udwf;
+use datafusion::physical_expr::PhysicalExpr;
 use datafusion::prelude::*;
 use datafusion::test_util::{TestTableFactory, TestTableProvider};
 use datafusion_common::config::TableOptions;
 use datafusion_common::scalar::ScalarStructBuilder;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef,
-    DataFusionError, Result, ScalarValue, TableReference,
+    DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference,
+    internal_datafusion_err, internal_err, not_impl_err, plan_err,
 };
+use datafusion_execution::TaskContext;
 use datafusion_expr::dml::CopyTo;
 use datafusion_expr::expr::{
-    self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, ScalarFunction,
-    Unnest, WildcardOptions,
+    self, Between, BinaryExpr, Case, Cast, GroupingSet, InList, Like, NullTreatment,
+    ScalarFunction, Unnest, WildcardOptions,
 };
 use datafusion_expr::logical_plan::{Extension, UserDefinedLogicalNodeCore};
 use datafusion_expr::{
-    Accumulator, AggregateUDF, ColumnarValue, ExprFunctionExt, ExprSchemable, Literal,
-    LogicalPlan, Operator, PartitionEvaluator, ScalarUDF, Signature, TryCast, Volatility,
-    WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, WindowUDF,
-    WindowUDFImpl,
+    Accumulator, AggregateUDF, ColumnarValue, ExprFunctionExt, ExprSchemable,
+    LimitEffect, Literal, LogicalPlan, LogicalPlanBuilder, Operator, PartitionEvaluator,
+    ScalarUDF, Signature, TryCast, Volatility, WindowFrame, WindowFrameBound,
+    WindowFrameUnits, WindowFunctionDefinition, WindowUDF, WindowUDFImpl,
 };
 use datafusion_functions_aggregate::average::avg_udaf;
 use datafusion_functions_aggregate::expr_fn::{
-    approx_distinct, array_agg, avg, bit_and, bit_or, bit_xor, bool_and, bool_or, corr,
-    nth_value,
+    approx_distinct, array_agg, avg, avg_distinct, bit_and, bit_or, bit_xor, bool_and,
+    bool_or, corr, nth_value,
 };
 use datafusion_functions_aggregate::string_agg::string_agg;
 use datafusion_functions_window_common::field::WindowUDFFieldArgs;
@@ -97,7 +103,7 @@ use datafusion_proto::logical_plan::file_formats::{
 };
 use datafusion_proto::logical_plan::to_proto::serialize_expr;
 use datafusion_proto::logical_plan::{
-    from_proto, DefaultLogicalExtensionCodec, LogicalExtensionCodec,
+    DefaultLogicalExtensionCodec, LogicalExtensionCodec, from_proto,
 };
 use datafusion_proto::protobuf;
 
@@ -149,8 +155,11 @@ async fn roundtrip_logical_plan() -> Result<()> {
     });
     let extension_codec = TopKExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&topk_plan, &extension_codec)?;
-    let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &extension_codec)?;
+    let logical_round_trip = logical_plan_from_bytes_with_extension_codec(
+        &bytes,
+        &ctx.task_ctx(),
+        &extension_codec,
+    )?;
     assert_eq!(format!("{topk_plan:?}"), format!("{logical_round_trip:?}"));
     Ok(())
 }
@@ -173,7 +182,7 @@ impl LogicalExtensionCodec for TestTableProviderCodec {
         &self,
         _buf: &[u8],
         _inputs: &[LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Extension> {
         not_impl_err!("No extension codec provided")
     }
@@ -187,11 +196,10 @@ impl LogicalExtensionCodec for TestTableProviderCodec {
         buf: &[u8],
         table_ref: &TableReference,
         schema: SchemaRef,
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn TableProvider>> {
-        let msg = TestTableProto::decode(buf).map_err(|_| {
-            DataFusionError::Internal("Error decoding test table".to_string())
-        })?;
+        let msg = TestTableProto::decode(buf)
+            .map_err(|_| internal_datafusion_err!("Error decoding test table"))?;
         assert_eq!(msg.table_name, table_ref.to_string());
         let provider = TestTableProvider {
             url: msg.url,
@@ -215,9 +223,8 @@ impl LogicalExtensionCodec for TestTableProviderCodec {
             url: table.url.clone(),
             table_name: table_ref.to_string(),
         };
-        msg.encode(buf).map_err(|_| {
-            DataFusionError::Internal("Error encoding test table".to_string())
-        })
+        msg.encode(buf)
+            .map_err(|_| internal_datafusion_err!("Error encoding test table"))
     }
 }
 
@@ -238,7 +245,7 @@ async fn roundtrip_custom_tables() -> Result<()> {
     let scan = ctx.table("t").await?.into_optimized_plan()?;
     let bytes = logical_plan_to_bytes_with_extension_codec(&scan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{scan:?}"), format!("{logical_round_trip:?}"));
     Ok(())
 }
@@ -264,7 +271,7 @@ async fn roundtrip_custom_memory_tables() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));
 
     Ok(())
@@ -291,7 +298,7 @@ async fn roundtrip_custom_listing_tables() -> Result<()> {
     let plan = ctx.state().create_logical_plan(query).await?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     // Use exact matching to verify everything. Make sure during round-trip,
     // information like constraints, column defaults, and other aspects of the plan are preserved.
     assert_eq!(plan, logical_round_trip);
@@ -326,7 +333,7 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -352,7 +359,7 @@ async fn roundtrip_logical_plan_aggregation() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -378,7 +385,7 @@ async fn roundtrip_logical_plan_sort() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -406,11 +413,12 @@ async fn roundtrip_logical_plan_dml() -> Result<()> {
         "DELETE FROM T1",
         "UPDATE T1 SET a = 1",
         "CREATE TABLE T2 AS SELECT * FROM T1",
+        "TRUNCATE TABLE T1",
     ];
     for query in queries {
         let plan = ctx.sql(query).await?.into_optimized_plan()?;
         let bytes = logical_plan_to_bytes(&plan)?;
-        let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+        let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
         assert_eq!(
             format!("{plan}"),
             format!("{logical_round_trip}"),
@@ -429,18 +437,18 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> {
     let input = create_csv_scan(&ctx).await?;
     let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new()));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.csv".to_string(),
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.csv".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     let codec = CsvLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -458,7 +466,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> {
 
     parquet_format.global.bloom_filter_on_read = true;
     parquet_format.global.created_by = "DataFusion Test".to_string();
-    parquet_format.global.writer_version = "PARQUET_2_0".to_string();
+    parquet_format.global.writer_version = DFParquetWriterVersion::V2_0;
     parquet_format.global.write_batch_size = 111;
     parquet_format.global.data_pagesize_limit = 222;
     parquet_format.global.data_page_row_count_limit = 333;
@@ -469,18 +477,18 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> {
         ParquetFormatFactory::new_with_options(parquet_format),
     ));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.parquet".to_string(),
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.parquet".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     let codec = ParquetLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));
     match logical_round_trip {
         LogicalPlan::Copy(copy_to) => {
@@ -501,18 +509,18 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> {
 
     let file_type = format_as_file_type(Arc::new(ArrowFormatFactory::new()));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.arrow".to_string(),
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.arrow".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     let codec = ArrowLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     match logical_round_trip {
@@ -543,23 +551,25 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> {
     csv_format.timestamp_format = Some("HH:mm:ss.SSSSSS".to_string());
     csv_format.time_format = Some("HH:mm:ss".to_string());
     csv_format.null_value = Some("NIL".to_string());
+    csv_format.compression = CompressionTypeVariant::GZIP;
+    csv_format.compression_level = Some(6);
 
     let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new_with_options(
         csv_format.clone(),
     )));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.csv".to_string(),
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.csv".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     let codec = CsvLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));
 
     match logical_round_trip {
@@ -587,7 +597,9 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> {
             assert_eq!(csv_format.datetime_format, csv_config.datetime_format);
             assert_eq!(csv_format.timestamp_format, csv_config.timestamp_format);
             assert_eq!(csv_format.time_format, csv_config.time_format);
-            assert_eq!(csv_format.null_value, csv_config.null_value)
+            assert_eq!(csv_format.null_value, csv_config.null_value);
+            assert_eq!(csv_format.compression, csv_config.compression);
+            assert_eq!(csv_format.compression_level, csv_config.compression_level);
         }
         _ => panic!(),
     }
@@ -614,19 +626,19 @@ async fn roundtrip_logical_plan_copy_to_json() -> Result<()> {
         json_format.clone(),
     )));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.json".to_string(),
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.json".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     // Assume JsonLogicalExtensionCodec is implemented similarly to CsvLogicalExtensionCodec
     let codec = JsonLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     match logical_round_trip {
@@ -686,19 +698,19 @@ async fn roundtrip_logical_plan_copy_to_parquet() -> Result<()> {
         ParquetFormatFactory::new_with_options(parquet_format.clone()),
     ));
 
-    let plan = LogicalPlan::Copy(CopyTo {
-        input: Arc::new(input),
-        output_url: "test.parquet".to_string(),
-        partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()],
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.parquet".to_string(),
+        vec!["a".to_string(), "b".to_string(), "c".to_string()],
         file_type,
-        options: Default::default(),
-    });
+        Default::default(),
+    ));
 
     // Assume ParquetLogicalExtensionCodec is implemented similarly to JsonLogicalExtensionCodec
     let codec = ParquetLogicalExtensionCodec {};
     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
     let logical_round_trip =
-        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;
+        logical_plan_from_bytes_with_extension_codec(&bytes, &ctx.task_ctx(), &codec)?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     match logical_round_trip {
@@ -731,6 +743,192 @@ async fn roundtrip_logical_plan_copy_to_parquet() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn roundtrip_default_codec_csv() -> Result<()> {
+    let ctx = SessionContext::new();
+    let input = create_csv_scan(&ctx).await?;
+
+    let table_options =
+        TableOptions::default_from_session_config(ctx.state().config_options());
+    let mut csv_format = table_options.csv;
+    csv_format.delimiter = b'|';
+    csv_format.has_header = Some(true);
+    csv_format.compression = CompressionTypeVariant::GZIP;
+
+    let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new_with_options(
+        csv_format.clone(),
+    )));
+
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.csv".to_string(),
+        vec![],
+        file_type,
+        Default::default(),
+    ));
+
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let roundtrip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+
+    match roundtrip {
+        LogicalPlan::Copy(copy_to) => {
+            assert_eq!("test.csv", copy_to.output_url);
+            assert_eq!("csv", copy_to.file_type.get_ext());
+            let dt = copy_to
+                .file_type
+                .as_ref()
+                .as_any()
+                .downcast_ref::<DefaultFileType>()
+                .unwrap();
+            let csv = dt
+                .as_format_factory()
+                .as_ref()
+                .as_any()
+                .downcast_ref::<CsvFormatFactory>()
+                .unwrap();
+            let decoded = csv.options.as_ref().unwrap();
+            assert_eq!(csv_format.delimiter, decoded.delimiter);
+            assert_eq!(csv_format.has_header, decoded.has_header);
+            assert_eq!(csv_format.compression, decoded.compression);
+        }
+        _ => panic!("Expected CopyTo plan"),
+    }
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_default_codec_json() -> Result<()> {
+    let ctx = SessionContext::new();
+    let input = create_json_scan(&ctx).await?;
+
+    let table_options =
+        TableOptions::default_from_session_config(ctx.state().config_options());
+    let mut json_format = table_options.json;
+    json_format.compression = CompressionTypeVariant::GZIP;
+    json_format.schema_infer_max_rec = Some(500);
+
+    let file_type = format_as_file_type(Arc::new(JsonFormatFactory::new_with_options(
+        json_format.clone(),
+    )));
+
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.json".to_string(),
+        vec![],
+        file_type,
+        Default::default(),
+    ));
+
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let roundtrip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+
+    match roundtrip {
+        LogicalPlan::Copy(copy_to) => {
+            assert_eq!("test.json", copy_to.output_url);
+            assert_eq!("json", copy_to.file_type.get_ext());
+            let dt = copy_to
+                .file_type
+                .as_ref()
+                .as_any()
+                .downcast_ref::<DefaultFileType>()
+                .unwrap();
+            let json = dt
+                .as_format_factory()
+                .as_ref()
+                .as_any()
+                .downcast_ref::<JsonFormatFactory>()
+                .unwrap();
+            let decoded = json.options.as_ref().unwrap();
+            assert_eq!(json_format.compression, decoded.compression);
+            assert_eq!(
+                json_format.schema_infer_max_rec,
+                decoded.schema_infer_max_rec
+            );
+        }
+        _ => panic!("Expected CopyTo plan"),
+    }
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_default_codec_parquet() -> Result<()> {
+    let ctx = SessionContext::new();
+    let input = create_parquet_scan(&ctx).await?;
+
+    let table_options =
+        TableOptions::default_from_session_config(ctx.state().config_options());
+    let mut parquet_format = table_options.parquet;
+    parquet_format.global.bloom_filter_on_read = true;
+    parquet_format.global.created_by = "DefaultCodecTest".to_string();
+
+    let file_type = format_as_file_type(Arc::new(
+        ParquetFormatFactory::new_with_options(parquet_format.clone()),
+    ));
+
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.parquet".to_string(),
+        vec![],
+        file_type,
+        Default::default(),
+    ));
+
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let roundtrip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+
+    match roundtrip {
+        LogicalPlan::Copy(copy_to) => {
+            assert_eq!("test.parquet", copy_to.output_url);
+            assert_eq!("parquet", copy_to.file_type.get_ext());
+            let dt = copy_to
+                .file_type
+                .as_ref()
+                .as_any()
+                .downcast_ref::<DefaultFileType>()
+                .unwrap();
+            let pq = dt
+                .as_format_factory()
+                .as_ref()
+                .as_any()
+                .downcast_ref::<ParquetFormatFactory>()
+                .unwrap();
+            let decoded = pq.options.as_ref().unwrap();
+            assert!(decoded.global.bloom_filter_on_read);
+            assert_eq!("DefaultCodecTest", decoded.global.created_by);
+        }
+        _ => panic!("Expected CopyTo plan"),
+    }
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_default_codec_arrow() -> Result<()> {
+    let ctx = SessionContext::new();
+    let input = create_csv_scan(&ctx).await?;
+
+    let file_type = format_as_file_type(Arc::new(ArrowFormatFactory::new()));
+
+    let plan = LogicalPlan::Copy(CopyTo::new(
+        Arc::new(input),
+        "test.arrow".to_string(),
+        vec![],
+        file_type,
+        Default::default(),
+    ));
+
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let roundtrip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+
+    match roundtrip {
+        LogicalPlan::Copy(copy_to) => {
+            assert_eq!("test.arrow", copy_to.output_url);
+            assert_eq!("arrow", copy_to.file_type.get_ext());
+        }
+        _ => panic!("Expected CopyTo plan"),
+    }
+    Ok(())
+}
+
 async fn create_csv_scan(ctx: &SessionContext) -> Result<LogicalPlan, DataFusionError> {
     ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default())
         .await?;
@@ -743,7 +941,7 @@ async fn create_json_scan(ctx: &SessionContext) -> Result<LogicalPlan, DataFusio
     ctx.register_json(
         "t1",
         "../core/tests/data/1.json",
-        NdJsonReadOptions::default(),
+        JsonReadOptions::default(),
     )
     .await?;
 
@@ -785,7 +983,7 @@ async fn roundtrip_logical_plan_distinct_on() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -811,7 +1009,7 @@ async fn roundtrip_single_count_distinct() -> Result<()> {
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
@@ -824,7 +1022,7 @@ async fn roundtrip_logical_plan_with_extension() -> Result<()> {
         .await?;
     let plan = ctx.table("t1").await?.into_optimized_plan()?;
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
     Ok(())
 }
@@ -849,7 +1047,7 @@ async fn roundtrip_logical_plan_unnest() -> Result<()> {
     let query = "SELECT unnest(b) FROM t1";
     let plan = ctx.sql(query).await?.into_optimized_plan()?;
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
     Ok(())
 }
@@ -960,16 +1158,18 @@ async fn roundtrip_expr_api() -> Result<()> {
         array_replace_all(make_array(vec![lit(1), lit(2), lit(3)]), lit(2), lit(4)),
         count(lit(1)),
         count_distinct(lit(1)),
-        first_value(lit(1), None),
-        first_value(lit(1), Some(vec![lit(2).sort(true, true)])),
+        first_value(lit(1), vec![]),
+        first_value(lit(1), vec![lit(2).sort(true, true)]),
         functions_window::nth_value::first_value(lit(1)),
         functions_window::nth_value::last_value(lit(1)),
         functions_window::nth_value::nth_value(lit(1), 1),
         avg(lit(1.5)),
+        avg_distinct(lit(1.5)),
         covar_samp(lit(1.5), lit(2.2)),
         covar_pop(lit(1.5), lit(2.2)),
         corr(lit(1.5), lit(2.2)),
         sum(lit(1)),
+        sum_distinct(lit(1)),
         max(lit(1)),
         median(lit(2)),
         min(lit(2)),
@@ -981,7 +1181,18 @@ async fn roundtrip_expr_api() -> Result<()> {
         approx_median(lit(2)),
         approx_percentile_cont(lit(2).sort(true, false), lit(0.5), None),
         approx_percentile_cont(lit(2).sort(true, false), lit(0.5), Some(lit(50))),
-        approx_percentile_cont_with_weight(lit(2), lit(1), lit(0.5)),
+        approx_percentile_cont_with_weight(
+            lit(2).sort(true, false),
+            lit(1),
+            lit(0.5),
+            None,
+        ),
+        approx_percentile_cont_with_weight(
+            lit(2).sort(true, false),
+            lit(1),
+            lit(0.5),
+            Some(lit(50)),
+        ),
         grouping(lit(1)),
         bit_and(lit(2)),
         bit_or(lit(2)),
@@ -1024,7 +1235,7 @@ async fn roundtrip_expr_api() -> Result<()> {
     // ensure expressions created with the expr api can be round tripped
     let plan = table.select(expr_list)?.into_optimized_plan()?;
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
     Ok(())
 }
@@ -1044,18 +1255,49 @@ async fn roundtrip_logical_plan_with_view_scan() -> Result<()> {
         .into_optimized_plan()?;
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     // DROP
     let plan = ctx.sql("DROP VIEW view_t1").await?.into_optimized_plan()?;
     let bytes = logical_plan_to_bytes(&plan)?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     assert_eq!(format!("{plan}"), format!("{logical_round_trip}"));
 
     Ok(())
 }
 
+#[tokio::test]
+async fn roundtrip_logical_plan_prepared_statement_with_metadata() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    let plan = ctx
+        .sql("SELECT $1")
+        .await
+        .unwrap()
+        .into_optimized_plan()
+        .unwrap();
+    let prepared = LogicalPlanBuilder::new(plan)
+        .prepare(
+            "".to_string(),
+            vec![
+                Field::new("", DataType::Int32, true)
+                    .with_metadata(
+                        [("some_key".to_string(), "some_value".to_string())].into(),
+                    )
+                    .into(),
+            ],
+        )
+        .unwrap()
+        .plan()
+        .clone();
+
+    let bytes = logical_plan_to_bytes(&prepared)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+    assert_eq!(format!("{prepared}"), format!("{logical_round_trip}"));
+    Ok(())
+}
+
 pub mod proto {
     #[derive(Clone, PartialEq, ::prost::Message)]
     pub struct TopKPlanProto {
@@ -1066,6 +1308,7 @@ pub mod proto {
         pub expr: Option<datafusion_proto::protobuf::LogicalExprNode>,
     }
 
+    #[allow(dead_code)]
     #[derive(Clone, PartialEq, Eq, ::prost::Message)]
     pub struct TopKExecProto {
         #[prost(uint64, tag = "1")]
@@ -1144,11 +1387,11 @@ impl LogicalExtensionCodec for TopKExtensionCodec {
         &self,
         buf: &[u8],
         inputs: &[LogicalPlan],
-        ctx: &SessionContext,
+        ctx: &TaskContext,
     ) -> Result<Extension> {
         if let Some((input, _)) = inputs.split_first() {
             let proto = proto::TopKPlanProto::decode(buf).map_err(|e| {
-                DataFusionError::Internal(format!("failed to decode logical plan: {e:?}"))
+                internal_datafusion_err!("failed to decode logical plan: {e:?}")
             })?;
 
             if let Some(expr) = proto.expr.as_ref() {
@@ -1177,7 +1420,7 @@ impl LogicalExtensionCodec for TopKExtensionCodec {
             };
 
             proto.encode(buf).map_err(|e| {
-                DataFusionError::Internal(format!("failed to encode logical plan: {e:?}"))
+                internal_datafusion_err!("failed to encode logical plan: {e:?}")
             })?;
 
             Ok(())
@@ -1191,7 +1434,7 @@ impl LogicalExtensionCodec for TopKExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: SchemaRef,
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn TableProvider>> {
         internal_err!("unsupported plan type")
     }
@@ -1214,7 +1457,7 @@ impl LogicalExtensionCodec for UDFExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[LogicalPlan],
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Extension> {
         not_impl_err!("No extension codec provided")
     }
@@ -1228,7 +1471,7 @@ impl LogicalExtensionCodec for UDFExtensionCodec {
         _buf: &[u8],
         _table_ref: &TableReference,
         _schema: SchemaRef,
-        _ctx: &SessionContext,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn TableProvider>> {
         internal_err!("unsupported plan type")
     }
@@ -1245,7 +1488,7 @@ impl LogicalExtensionCodec for UDFExtensionCodec {
     fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
         if name == "regex_udf" {
             let proto = MyRegexUdfNode::decode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to decode regex_udf: {err}"))
+                internal_datafusion_err!("failed to decode regex_udf: {err}")
             })?;
 
             Ok(Arc::new(ScalarUDF::from(MyRegexUdf::new(proto.pattern))))
@@ -1260,18 +1503,16 @@ impl LogicalExtensionCodec for UDFExtensionCodec {
         let proto = MyRegexUdfNode {
             pattern: udf.pattern.clone(),
         };
-        proto.encode(buf).map_err(|err| {
-            DataFusionError::Internal(format!("failed to encode udf: {err}"))
-        })?;
+        proto
+            .encode(buf)
+            .map_err(|err| internal_datafusion_err!("failed to encode udf: {err}"))?;
         Ok(())
     }
 
     fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
         if name == "aggregate_udf" {
             let proto = MyAggregateUdfNode::decode(buf).map_err(|err| {
-                DataFusionError::Internal(format!(
-                    "failed to decode aggregate_udf: {err}"
-                ))
+                internal_datafusion_err!("failed to decode aggregate_udf: {err}")
             })?;
 
             Ok(Arc::new(AggregateUDF::from(MyAggregateUDF::new(
@@ -1288,9 +1529,9 @@ impl LogicalExtensionCodec for UDFExtensionCodec {
         let proto = MyAggregateUdfNode {
             result: udf.result.clone(),
         };
-        proto.encode(buf).map_err(|err| {
-            DataFusionError::Internal(format!("failed to encode udf: {err}"))
-        })?;
+        proto
+            .encode(buf)
+            .map_err(|err| internal_datafusion_err!("failed to encode udf: {err}"))?;
         Ok(())
     }
 }
@@ -1479,6 +1720,16 @@ fn round_trip_scalar_values_and_data_types() {
             Box::new(DataType::Int32),
             Box::new(ScalarValue::Utf8(None)),
         ),
+        ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int32, false).into(),
+            Field::new("values", DataType::Utf8, true).into(),
+            Box::new(ScalarValue::from("foo")),
+        ),
+        ScalarValue::RunEndEncoded(
+            Field::new("run_ends", DataType::Int32, false).into(),
+            Field::new("values", DataType::Utf8, true).into(),
+            Box::new(ScalarValue::Utf8(None)),
+        ),
         ScalarValue::Binary(Some(b"bar".to_vec())),
         ScalarValue::Binary(None),
         ScalarValue::LargeBinary(Some(b"bar".to_vec())),
@@ -1574,7 +1825,7 @@ fn round_trip_scalar_values_and_data_types() {
         assert_eq!(
             dt, roundtrip,
             "DataType was not the same after round trip!\n\n\
-                        Input: {dt:?}\n\nRoundtrip: {roundtrip:?}"
+                        Input: {dt}\n\nRoundtrip: {roundtrip:?}"
         );
     }
 }
@@ -1726,19 +1977,20 @@ fn round_trip_datatype() {
             ),
         ])),
         DataType::Union(
-            UnionFields::new(
+            UnionFields::try_new(
                 vec![7, 5, 3],
                 vec![
                     Field::new("nullable", DataType::Boolean, false),
                     Field::new("name", DataType::Utf8, false),
                     Field::new("datatype", DataType::Binary, false),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Sparse,
         ),
         DataType::Union(
-            UnionFields::new(
-                vec![5, 8, 1],
+            UnionFields::try_new(
+                vec![5, 8, 1, 100],
                 vec![
                     Field::new("nullable", DataType::Boolean, false),
                     Field::new("name", DataType::Utf8, false),
@@ -1753,7 +2005,8 @@ fn round_trip_datatype() {
                         true,
                     ),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Dense,
         ),
         DataType::Dictionary(
@@ -1934,6 +2187,10 @@ fn roundtrip_binary_op() {
     test(Operator::RegexNotMatch);
     test(Operator::RegexIMatch);
     test(Operator::RegexMatch);
+    test(Operator::LikeMatch);
+    test(Operator::ILikeMatch);
+    test(Operator::NotLikeMatch);
+    test(Operator::NotILikeMatch);
     test(Operator::BitwiseShiftRight);
     test(Operator::BitwiseShiftLeft);
     test(Operator::BitwiseAnd);
@@ -1968,7 +2225,7 @@ fn roundtrip_case_with_null() {
     let test_expr = Expr::Case(Case::new(
         Some(Box::new(lit(1.0_f32))),
         vec![(Box::new(lit(2.0_f32)), Box::new(lit(3.0_f32)))],
-        Some(Box::new(Expr::Literal(ScalarValue::Null))),
+        Some(Box::new(Expr::Literal(ScalarValue::Null, None))),
     ));
 
     let ctx = SessionContext::new();
@@ -1977,7 +2234,7 @@ fn roundtrip_case_with_null() {
 
 #[test]
 fn roundtrip_null_literal() {
-    let test_expr = Expr::Literal(ScalarValue::Null);
+    let test_expr = Expr::Literal(ScalarValue::Null, None);
 
     let ctx = SessionContext::new();
     roundtrip_expr_test(test_expr, ctx);
@@ -2176,19 +2433,42 @@ fn roundtrip_aggregate_udf() {
         Arc::new(vec![DataType::Float64, DataType::UInt32]),
     );
 
-    let test_expr = Expr::AggregateFunction(expr::AggregateFunction::new_udf(
+    let ctx = SessionContext::new();
+    ctx.register_udaf(dummy_agg.clone());
+
+    // null_treatment absent
+    let test_expr1 = Expr::AggregateFunction(expr::AggregateFunction::new_udf(
         Arc::new(dummy_agg.clone()),
         vec![lit(1.0_f64)],
         false,
         Some(Box::new(lit(true))),
-        None,
+        vec![],
         None,
     ));
 
-    let ctx = SessionContext::new();
-    ctx.register_udaf(dummy_agg);
+    // null_treatment respect nulls
+    let test_expr2 = Expr::AggregateFunction(expr::AggregateFunction::new_udf(
+        Arc::new(dummy_agg.clone()),
+        vec![lit(1.0_f64)],
+        true,
+        Some(Box::new(lit(true))),
+        vec![],
+        Some(NullTreatment::RespectNulls),
+    ));
 
-    roundtrip_expr_test(test_expr, ctx);
+    // null_treatment ignore nulls
+    let test_expr3 = Expr::AggregateFunction(expr::AggregateFunction::new_udf(
+        Arc::new(dummy_agg),
+        vec![lit(1.0_f64)],
+        true,
+        Some(Box::new(lit(true))),
+        vec![],
+        Some(NullTreatment::IgnoreNulls),
+    ));
+
+    roundtrip_expr_test(test_expr1, ctx.clone());
+    roundtrip_expr_test(test_expr2, ctx.clone());
+    roundtrip_expr_test(test_expr3, ctx);
 }
 
 fn dummy_udf() -> ScalarUDF {
@@ -2232,7 +2512,7 @@ fn roundtrip_scalar_udf() {
             &self,
             _buf: &[u8],
             _inputs: &[LogicalPlan],
-            _ctx: &SessionContext,
+            _ctx: &TaskContext,
         ) -> Result<Extension> {
             not_impl_err!("LogicalExtensionCodec is not provided")
         }
@@ -2246,7 +2526,7 @@ fn roundtrip_scalar_udf() {
             _buf: &[u8],
             _table_ref: &TableReference,
             _schema: SchemaRef,
-            _ctx: &SessionContext,
+            _ctx: &TaskContext,
         ) -> Result<Arc<dyn TableProvider>> {
             not_impl_err!("LogicalExtensionCodec is not provided")
         }
@@ -2264,7 +2544,7 @@ fn roundtrip_scalar_udf() {
             if name == "dummy" {
                 Ok(Arc::new(dummy_udf()))
             } else {
-                Err(DataFusionError::Internal(format!("UDF {name} not found")))
+                Err(internal_datafusion_err!("UDF {name} not found"))
             }
         }
     }
@@ -2359,7 +2639,7 @@ fn roundtrip_window() {
     let ctx = SessionContext::new();
 
     // 1. without window_frame
-    let test_expr1 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr1 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::WindowUDF(rank_udwf()),
         vec![],
     ))
@@ -2370,7 +2650,7 @@ fn roundtrip_window() {
     .unwrap();
 
     // 2. with default window_frame
-    let test_expr2 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr2 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::WindowUDF(rank_udwf()),
         vec![],
     ))
@@ -2387,7 +2667,7 @@ fn roundtrip_window() {
         WindowFrameBound::Following(ScalarValue::UInt64(Some(2))),
     );
 
-    let test_expr3 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr3 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::WindowUDF(rank_udwf()),
         vec![],
     ))
@@ -2404,7 +2684,7 @@ fn roundtrip_window() {
         WindowFrameBound::Following(ScalarValue::UInt64(Some(2))),
     );
 
-    let test_expr4 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr4 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::AggregateUDF(max_udaf()),
         vec![col("col1")],
     ))
@@ -2454,7 +2734,7 @@ fn roundtrip_window() {
         Arc::new(vec![DataType::Float64, DataType::UInt32]),
     );
 
-    let test_expr5 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr5 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::AggregateUDF(Arc::new(dummy_agg.clone())),
         vec![col("col1")],
     ))
@@ -2483,7 +2763,7 @@ fn roundtrip_window() {
         }
     }
 
-    #[derive(Debug, Clone)]
+    #[derive(Debug, Clone, PartialEq, Eq, Hash)]
     struct SimpleWindowUDF {
         signature: Signature,
     }
@@ -2531,6 +2811,10 @@ fn roundtrip_window() {
                 )
             }
         }
+
+        fn limit_effect(&self, _args: &[Arc<dyn PhysicalExpr>]) -> LimitEffect {
+            LimitEffect::Unknown
+        }
     }
 
     fn make_partition_evaluator() -> Result<Box<dyn PartitionEvaluator>> {
@@ -2539,7 +2823,7 @@ fn roundtrip_window() {
 
     let dummy_window_udf = WindowUDF::from(SimpleWindowUDF::new());
 
-    let test_expr6 = Expr::WindowFunction(expr::WindowFunction::new(
+    let test_expr6 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::WindowUDF(Arc::new(dummy_window_udf.clone())),
         vec![col("col1")],
     ))
@@ -2548,8 +2832,10 @@ fn roundtrip_window() {
     .window_frame(row_number_frame.clone())
     .build()
     .unwrap();
+    ctx.register_udwf(dummy_window_udf);
 
-    let text_expr7 = Expr::WindowFunction(expr::WindowFunction::new(
+    // 7. test with average udaf
+    let test_expr7 = Expr::from(expr::WindowFunction::new(
         WindowFunctionDefinition::AggregateUDF(avg_udaf()),
         vec![col("col1")],
     ))
@@ -2557,7 +2843,53 @@ fn roundtrip_window() {
     .build()
     .unwrap();
 
-    ctx.register_udwf(dummy_window_udf);
+    // 8. test with respect nulls
+    let test_expr8 = Expr::from(expr::WindowFunction::new(
+        WindowFunctionDefinition::WindowUDF(rank_udwf()),
+        vec![],
+    ))
+    .partition_by(vec![col("col1")])
+    .order_by(vec![col("col2").sort(true, false)])
+    .window_frame(WindowFrame::new(Some(false)))
+    .null_treatment(NullTreatment::RespectNulls)
+    .build()
+    .unwrap();
+
+    // 9. test with ignore nulls
+    let test_expr9 = Expr::from(expr::WindowFunction::new(
+        WindowFunctionDefinition::WindowUDF(rank_udwf()),
+        vec![],
+    ))
+    .partition_by(vec![col("col1")])
+    .order_by(vec![col("col2").sort(true, false)])
+    .window_frame(WindowFrame::new(Some(false)))
+    .null_treatment(NullTreatment::IgnoreNulls)
+    .build()
+    .unwrap();
+
+    // 10. test with distinct is `true`
+    let test_expr10 = Expr::from(expr::WindowFunction::new(
+        WindowFunctionDefinition::WindowUDF(rank_udwf()),
+        vec![],
+    ))
+    .partition_by(vec![col("col1")])
+    .order_by(vec![col("col2").sort(true, false)])
+    .window_frame(WindowFrame::new(Some(false)))
+    .distinct()
+    .build()
+    .unwrap();
+
+    // 11. test with filter
+    let test_expr11 = Expr::from(expr::WindowFunction::new(
+        WindowFunctionDefinition::WindowUDF(rank_udwf()),
+        vec![],
+    ))
+    .partition_by(vec![col("col1")])
+    .order_by(vec![col("col2").sort(true, false)])
+    .window_frame(WindowFrame::new(Some(false)))
+    .filter(col("col1").eq(lit(1)))
+    .build()
+    .unwrap();
 
     roundtrip_expr_test(test_expr1, ctx.clone());
     roundtrip_expr_test(test_expr2, ctx.clone());
@@ -2565,7 +2897,11 @@ fn roundtrip_window() {
     roundtrip_expr_test(test_expr4, ctx.clone());
     roundtrip_expr_test(test_expr5, ctx.clone());
     roundtrip_expr_test(test_expr6, ctx.clone());
-    roundtrip_expr_test(text_expr7, ctx);
+    roundtrip_expr_test(test_expr7, ctx.clone());
+    roundtrip_expr_test(test_expr8, ctx.clone());
+    roundtrip_expr_test(test_expr9, ctx.clone());
+    roundtrip_expr_test(test_expr10, ctx.clone());
+    roundtrip_expr_test(test_expr11, ctx);
 }
 
 #[tokio::test]
@@ -2584,7 +2920,7 @@ async fn roundtrip_recursive_query() {
     let bytes = logical_plan_to_bytes(&plan).unwrap();
 
     let ctx = SessionContext::new();
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx).unwrap();
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx()).unwrap();
     assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));
     let dataframe = ctx.execute_logical_plan(logical_round_trip).await.unwrap();
     let output_round_trip = dataframe.collect().await.unwrap();
@@ -2615,10 +2951,10 @@ async fn roundtrip_union_query() -> Result<()> {
         .await?;
     ctx.register_csv("t2", "tests/testdata/test.csv", CsvReadOptions::default())
         .await?;
-    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
     // proto deserialization only supports 2-way union, hence this plan has nested unions
     // apply the flatten unions optimizer rule to be able to compare
-    let optimizer = Optimizer::with_rules(vec![Arc::new(EliminateNestedUnion::new())]);
+    let optimizer = Optimizer::with_rules(vec![Arc::new(OptimizeUnions::new())]);
     let unnested = optimizer.optimize(logical_round_trip, &(ctx.state()), |_x, _y| {})?;
     assert_eq!(
         format!("{}", plan.display_indent_schema()),
@@ -2652,7 +2988,104 @@ async fn roundtrip_custom_listing_tables_schema() -> Result<()> {
         .clone();
 
     let bytes = logical_plan_to_bytes(&plan)?;
-    let new_plan = logical_plan_from_bytes(&bytes, &ctx)?;
+    let new_plan = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+    assert_eq!(plan, new_plan);
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_custom_listing_tables_schema_table_scan_projection() -> Result<()> {
+    let ctx = SessionContext::new();
+    // Make sure during round-trip, constraint information is preserved
+    let file_format = JsonFormat::default();
+    let table_partition_cols = vec![("part".to_owned(), DataType::Int64)];
+    let data = "../core/tests/data/partitioned_table_json";
+    let listing_table_url = ListingTableUrl::parse(data)?;
+    let listing_options = ListingOptions::new(Arc::new(file_format))
+        .with_table_partition_cols(table_partition_cols);
+
+    let config = ListingTableConfig::new(listing_table_url)
+        .with_listing_options(listing_options)
+        .infer_schema(&ctx.state())
+        .await?;
+
+    let listing_table: Arc<dyn TableProvider> = Arc::new(ListingTable::try_new(config)?);
+
+    let projection = ["part", "value"]
+        .iter()
+        .map(|field_name| listing_table.schema().index_of(field_name))
+        .collect::<Result<Vec<_>, _>>()?;
+
+    let plan = LogicalPlanBuilder::scan(
+        "hive_style",
+        Arc::new(DefaultTableSource::new(listing_table)),
+        Some(projection),
+    )?
+    .limit(0, Some(1))?
+    .build()?;
+
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let new_plan = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+
     assert_eq!(plan, new_plan);
     Ok(())
 }
+
+#[tokio::test]
+async fn roundtrip_arrow_scan() -> Result<()> {
+    let ctx = SessionContext::new();
+    let plan = ctx
+        .read_arrow("tests/testdata/test.arrow", ArrowReadOptions::default())
+        .await?
+        .into_optimized_plan()?;
+    let bytes = logical_plan_to_bytes(&plan)?;
+    let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+    assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_mixed_case_table_reference() -> Result<()> {
+    // Prepare "client" database
+    let client_ctx = SessionContext::new_with_config(
+        SessionConfig::new()
+            .set_bool("datafusion.sql_parser.enable_ident_normalization", false),
+    );
+    client_ctx
+        .register_csv(
+            "\"TestData\"",
+            "tests/testdata/test.csv",
+            CsvReadOptions::default(),
+        )
+        .await?;
+
+    // Prepare "server" database
+    let server_ctx = SessionContext::new_with_config(
+        SessionConfig::new()
+            .set_bool("datafusion.sql_parser.enable_ident_normalization", false),
+    );
+    server_ctx
+        .register_csv(
+            "\"TestData\"",
+            "tests/testdata/test.csv",
+            CsvReadOptions::default(),
+        )
+        .await?;
+
+    // Create a logical plan, serialize it (client), then deserialize it (server)
+    let dataframe = client_ctx
+        .sql("SELECT a FROM TestData WHERE TestData.a = 1")
+        .await?;
+
+    let client_logical_plan = dataframe.into_optimized_plan()?;
+    let plan_bytes = logical_plan_to_bytes(&client_logical_plan)?;
+    let server_logical_plan =
+        logical_plan_from_bytes(&plan_bytes, &server_ctx.task_ctx())?;
+
+    assert_eq!(
+        format!("{}", client_logical_plan.display_indent_schema()),
+        format!("{}", server_logical_plan.display_indent_schema())
+    );
+
+    Ok(())
+}
diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
index 7d56bb6c5db1b..0a5ed766e6cc1 100644
--- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -16,101 +16,119 @@
 // under the License.
 
 use std::any::Any;
+use std::collections::HashMap;
 use std::fmt::{Display, Formatter};
-use std::ops::Deref;
-use std::sync::Arc;
+use std::sync::{Arc, RwLock};
 use std::vec;
 
-use crate::cases::{
-    CustomUDWF, CustomUDWFNode, MyAggregateUDF, MyAggregateUdfNode, MyRegexUdf,
-    MyRegexUdfNode,
-};
-
 use arrow::array::RecordBatch;
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::{Fields, TimeUnit};
-use datafusion::physical_expr::aggregate::AggregateExprBuilder;
-use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
-use datafusion_expr::dml::InsertOp;
-use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf;
-use datafusion_functions_aggregate::array_agg::array_agg_udaf;
-use datafusion_functions_aggregate::min_max::max_udaf;
-use prost::Message;
-
 use datafusion::arrow::array::ArrayRef;
 use datafusion::arrow::compute::kernels::sort::SortOptions;
 use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema};
 use datafusion::datasource::empty::EmptyTable;
 use datafusion::datasource::file_format::csv::CsvSink;
-use datafusion::datasource::file_format::json::JsonSink;
+use datafusion::datasource::file_format::json::{JsonFormat, JsonSink};
 use datafusion::datasource::file_format::parquet::ParquetSink;
-use datafusion::datasource::listing::{ListingTableUrl, PartitionedFile};
+use datafusion::datasource::listing::{
+    ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, PartitionedFile,
+};
 use datafusion::datasource::object_store::ObjectStoreUrl;
 use datafusion::datasource::physical_plan::{
-    wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileGroup,
-    FileScanConfigBuilder, FileSinkConfig, FileSource, ParquetSource,
+    ArrowSource, FileGroup, FileOutputMode, FileScanConfig, FileScanConfigBuilder,
+    FileSinkConfig, ParquetSource, wrap_partition_type_in_dict,
+    wrap_partition_value_in_dict,
 };
 use datafusion::datasource::sink::DataSinkExec;
 use datafusion::datasource::source::DataSourceExec;
-use datafusion::execution::FunctionRegistry;
+use datafusion::execution::TaskContext;
+use datafusion::functions_aggregate::count::count_udaf;
 use datafusion::functions_aggregate::sum::sum_udaf;
 use datafusion::functions_window::nth_value::nth_value_udwf;
 use datafusion::functions_window::row_number::row_number_udwf;
-use datafusion::logical_expr::{create_udf, JoinType, Operator, Volatility};
+use datafusion::logical_expr::{JoinType, Operator, Volatility, create_udf};
+use datafusion::physical_expr::aggregate::AggregateExprBuilder;
 use datafusion::physical_expr::expressions::Literal;
 use datafusion::physical_expr::window::{SlidingAggregateWindowExpr, StandardWindowExpr};
 use datafusion::physical_expr::{
-    LexOrdering, LexRequirement, PhysicalSortRequirement, ScalarFunctionExpr,
+    LexOrdering, PhysicalSortRequirement, ScalarFunctionExpr,
 };
 use datafusion::physical_plan::aggregates::{
-    AggregateExec, AggregateMode, PhysicalGroupBy,
+    AggregateExec, AggregateMode, LimitOptions, PhysicalGroupBy,
 };
 use datafusion::physical_plan::analyze::AnalyzeExec;
+#[expect(deprecated)]
+use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
 use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion::physical_plan::empty::EmptyExec;
 use datafusion::physical_plan::expressions::{
-    binary, cast, col, in_list, like, lit, BinaryExpr, Column, NotExpr, PhysicalSortExpr,
+    BinaryExpr, Column, NotExpr, PhysicalSortExpr, binary, cast, col, in_list, like, lit,
 };
-use datafusion::physical_plan::filter::FilterExec;
+use datafusion::physical_plan::filter::{FilterExec, FilterExecBuilder};
 use datafusion::physical_plan::joins::{
-    HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode,
+    HashJoinExec, NestedLoopJoinExec, PartitionMode, SortMergeJoinExec,
+    StreamJoinPartitionMode, SymmetricHashJoinExec,
 };
 use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion::physical_plan::metrics::MetricType;
 use datafusion::physical_plan::placeholder_row::PlaceholderRowExec;
-use datafusion::physical_plan::projection::ProjectionExec;
+use datafusion::physical_plan::projection::{ProjectionExec, ProjectionExpr};
 use datafusion::physical_plan::repartition::RepartitionExec;
 use datafusion::physical_plan::sorts::sort::SortExec;
 use datafusion::physical_plan::union::{InterleaveExec, UnionExec};
 use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec};
 use datafusion::physical_plan::windows::{
-    create_udwf_window_expr, BoundedWindowAggExec, PlainAggregateWindowExpr,
-    WindowAggExec,
+    BoundedWindowAggExec, PlainAggregateWindowExpr, WindowAggExec,
+    create_udwf_window_expr,
 };
 use datafusion::physical_plan::{
-    displayable, ExecutionPlan, InputOrderMode, Partitioning, PhysicalExpr, Statistics,
+    ExecutionPlan, InputOrderMode, Partitioning, PhysicalExpr, Statistics, displayable,
 };
 use datafusion::prelude::{ParquetReadOptions, SessionContext};
 use datafusion::scalar::ScalarValue;
-use datafusion_common::config::TableParquetOptions;
+use datafusion_common::config::{ConfigOptions, TableParquetOptions};
 use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::file_options::json_writer::JsonWriterOptions;
 use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::stats::Precision;
 use datafusion_common::{
-    internal_err, not_impl_err, DataFusionError, Result, UnnestOptions,
+    DataFusionError, NullEquality, Result, UnnestOptions, exec_datafusion_err,
+    internal_datafusion_err, internal_err, not_impl_err,
 };
+use datafusion_datasource::TableSchema;
+use datafusion_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
+use datafusion_expr::dml::InsertOp;
 use datafusion_expr::{
-    Accumulator, AccumulatorFactoryFunction, AggregateUDF, ColumnarValue, ScalarUDF,
-    Signature, SimpleAggregateUDF, WindowFrame, WindowFrameBound, WindowUDF,
+    Accumulator, AccumulatorFactoryFunction, AggregateUDF, ColumnarValue,
+    ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, SimpleAggregateUDF,
+    WindowFrame, WindowFrameBound, WindowUDF,
 };
+use datafusion_functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf;
+use datafusion_functions_aggregate::array_agg::array_agg_udaf;
 use datafusion_functions_aggregate::average::avg_udaf;
+use datafusion_functions_aggregate::min_max::max_udaf;
 use datafusion_functions_aggregate::nth_value::nth_value_udaf;
 use datafusion_functions_aggregate::string_agg::string_agg_udaf;
+use datafusion_proto::bytes::{
+    physical_plan_from_bytes_with_proto_converter,
+    physical_plan_to_bytes_with_proto_converter,
+};
+use datafusion_proto::physical_plan::from_proto::parse_physical_expr_with_converter;
+use datafusion_proto::physical_plan::to_proto::serialize_physical_expr_with_converter;
 use datafusion_proto::physical_plan::{
-    AsExecutionPlan, DefaultPhysicalExtensionCodec, PhysicalExtensionCodec,
+    AsExecutionPlan, DeduplicatingProtoConverter, DefaultPhysicalExtensionCodec,
+    DefaultPhysicalProtoConverter, PhysicalExtensionCodec,
+    PhysicalProtoConverterExtension,
 };
 use datafusion_proto::protobuf;
-use datafusion_proto::protobuf::PhysicalPlanNode;
+use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode};
+use prost::Message;
+
+use crate::cases::{
+    CustomUDWF, CustomUDWFNode, MyAggregateUDF, MyAggregateUdfNode, MyRegexUdf,
+    MyRegexUdfNode,
+};
 
 /// Perform a serde roundtrip and assert that the string representation of the before and after plans
 /// are identical. Note that this often isn't sufficient to guarantee that no information is
@@ -118,7 +136,8 @@ use datafusion_proto::protobuf::PhysicalPlanNode;
 fn roundtrip_test(exec_plan: Arc<dyn ExecutionPlan>) -> Result<()> {
     let ctx = SessionContext::new();
     let codec = DefaultPhysicalExtensionCodec {};
-    roundtrip_test_and_return(exec_plan, &ctx, &codec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    roundtrip_test_and_return(exec_plan, &ctx, &codec, &proto_converter)?;
     Ok(())
 }
 
@@ -132,15 +151,24 @@ fn roundtrip_test_and_return(
     exec_plan: Arc<dyn ExecutionPlan>,
     ctx: &SessionContext,
     codec: &dyn PhysicalExtensionCodec,
+    proto_converter: &dyn PhysicalProtoConverterExtension,
 ) -> Result<Arc<dyn ExecutionPlan>> {
-    let proto: protobuf::PhysicalPlanNode =
-        protobuf::PhysicalPlanNode::try_from_physical_plan(exec_plan.clone(), codec)
-            .expect("to proto");
-    let runtime = ctx.runtime_env();
-    let result_exec_plan: Arc<dyn ExecutionPlan> = proto
-        .try_into_physical_plan(ctx, runtime.deref(), codec)
-        .expect("from proto");
-    assert_eq!(format!("{exec_plan:?}"), format!("{result_exec_plan:?}"));
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan),
+        codec,
+        proto_converter,
+    )?;
+    let result_exec_plan = physical_plan_from_bytes_with_proto_converter(
+        bytes.as_ref(),
+        ctx.task_ctx().as_ref(),
+        codec,
+        proto_converter,
+    )?;
+
+    pretty_assertions::assert_eq!(
+        format!("{exec_plan:?}"),
+        format!("{result_exec_plan:?}")
+    );
     Ok(result_exec_plan)
 }
 
@@ -155,7 +183,8 @@ fn roundtrip_test_with_context(
     ctx: &SessionContext,
 ) -> Result<()> {
     let codec = DefaultPhysicalExtensionCodec {};
-    roundtrip_test_and_return(exec_plan, ctx, &codec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    roundtrip_test_and_return(exec_plan, ctx, &codec, &proto_converter)?;
     Ok(())
 }
 
@@ -163,9 +192,10 @@ fn roundtrip_test_with_context(
 /// query results are identical.
 async fn roundtrip_test_sql_with_context(sql: &str, ctx: &SessionContext) -> Result<()> {
     let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DefaultPhysicalProtoConverter {};
     let initial_plan = ctx.sql(sql).await?.create_physical_plan().await?;
 
-    roundtrip_test_and_return(initial_plan, ctx, &codec)?;
+    roundtrip_test_and_return(initial_plan, ctx, &codec, &proto_converter)?;
     Ok(())
 }
 
@@ -205,7 +235,10 @@ fn roundtrip_date_time_interval() -> Result<()> {
     let date_time_interval_expr =
         binary(date_expr, Operator::Plus, literal_expr, &schema)?;
     let plan = Arc::new(ProjectionExec::try_new(
-        vec![(date_time_interval_expr, "result".to_string())],
+        vec![ProjectionExpr {
+            expr: date_time_interval_expr,
+            alias: "result".to_string(),
+        }],
         input,
     )?);
     roundtrip_test(plan)
@@ -268,6 +301,7 @@ fn roundtrip_hash_join() -> Result<()> {
                 join_type,
                 None,
                 *partition_mode,
+                NullEquality::NullEqualsNothing,
                 false,
             )?))?;
         }
@@ -321,9 +355,9 @@ fn roundtrip_udwf() -> Result<()> {
         &[
             col("a", &schema)?
         ],
-        &LexOrdering::new(vec![
-            PhysicalSortExpr::new(col("b", &schema)?, SortOptions::new(true, true)),
-        ]),
+        &[
+            PhysicalSortExpr::new(col("b", &schema)?, SortOptions::new(true, true))
+        ],
         Arc::new(WindowFrame::new(None)),
     ));
 
@@ -360,13 +394,13 @@ fn roundtrip_window() -> Result<()> {
     let udwf_expr = Arc::new(StandardWindowExpr::new(
         nth_value_window,
         &[col("b", &schema)?],
-        &LexOrdering::new(vec![PhysicalSortExpr {
+        &[PhysicalSortExpr {
             expr: col("a", &schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: false,
             },
-        }]),
+        }],
         Arc::new(window_frame),
     ));
 
@@ -380,8 +414,9 @@ fn roundtrip_window() -> Result<()> {
         .build()
         .map(Arc::new)?,
         &[],
-        &LexOrdering::default(),
+        &[],
         Arc::new(WindowFrame::new(None)),
+        None,
     ));
 
     let window_frame = WindowFrame::new_bounds(
@@ -393,15 +428,16 @@ fn roundtrip_window() -> Result<()> {
     let args = vec![cast(col("a", &schema)?, &schema, DataType::Float64)?];
     let sum_expr = AggregateExprBuilder::new(sum_udaf(), args)
         .schema(Arc::clone(&schema))
-        .alias("SUM(a) RANGE BETWEEN CURRENT ROW AND UNBOUNDED PRECEEDING")
+        .alias("SUM(a) RANGE BETWEEN CURRENT ROW AND UNBOUNDED PRECEDING")
         .build()
         .map(Arc::new)?;
 
     let sliding_aggr_window_expr = Arc::new(SlidingAggregateWindowExpr::new(
         sum_expr,
         &[],
-        &LexOrdering::default(),
+        &[],
         Arc::new(window_frame),
+        None,
     ));
 
     let input = Arc::new(EmptyExec::new(schema.clone()));
@@ -414,7 +450,118 @@ fn roundtrip_window() -> Result<()> {
 }
 
 #[test]
-fn rountrip_aggregate() -> Result<()> {
+fn roundtrip_window_distinct() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let field_b = Field::new("b", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a, field_b]));
+
+    // Create a distinct count window expression with unbounded frame (becomes PlainAggregateWindowExpr)
+    let distinct_count_expr = Arc::new(PlainAggregateWindowExpr::new(
+        AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema)?])
+            .schema(Arc::clone(&schema))
+            .alias("count(DISTINCT a)")
+            .distinct() // Enable distinct
+            .build()
+            .map(Arc::new)?,
+        &[col("b", &schema)?],            // partition by b
+        &[],                              // no order by
+        Arc::new(WindowFrame::new(None)), // unbounded frame
+        None,
+    ));
+
+    // Create a distinct sum window expression with bounded frame (becomes SlidingAggregateWindowExpr)
+    let bounded_frame = WindowFrame::new_bounds(
+        datafusion_expr::WindowFrameUnits::Rows,
+        WindowFrameBound::Preceding(ScalarValue::UInt64(Some(1))),
+        WindowFrameBound::CurrentRow,
+    );
+
+    let distinct_sum_expr = Arc::new(SlidingAggregateWindowExpr::new(
+        AggregateExprBuilder::new(
+            sum_udaf(),
+            vec![cast(col("a", &schema)?, &schema, DataType::Float64)?],
+        )
+        .schema(Arc::clone(&schema))
+        .alias("sum(DISTINCT a)")
+        .distinct() // Enable distinct
+        .with_ignore_nulls(true) // Enable ignore nulls
+        .build()
+        .map(Arc::new)?,
+        &[],                     // no partition by
+        &[],                     // no order by
+        Arc::new(bounded_frame), // bounded frame
+        None,
+    ));
+
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+
+    roundtrip_test(Arc::new(WindowAggExec::try_new(
+        vec![distinct_count_expr, distinct_sum_expr],
+        input,
+        false,
+    )?))
+}
+
+#[test]
+fn test_distinct_window_serialization_end_to_end() -> Result<()> {
+    // Create a more comprehensive test that verifies distinct window functions
+    // work properly through the entire serialization/deserialization pipeline
+    let field_a = Field::new("a", DataType::Int64, false);
+    let field_b = Field::new("b", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a, field_b]));
+
+    // Test 1: DISTINCT COUNT with IGNORE NULLS
+    let distinct_count_ignore_nulls = Arc::new(PlainAggregateWindowExpr::new(
+        AggregateExprBuilder::new(count_udaf(), vec![col("a", &schema)?])
+            .schema(Arc::clone(&schema))
+            .alias("count_distinct_ignore_nulls")
+            .distinct()
+            .with_ignore_nulls(true)
+            .build()
+            .map(Arc::new)?,
+        &[col("b", &schema)?],
+        &[],
+        Arc::new(WindowFrame::new(None)),
+        None,
+    ));
+
+    // Test 2: DISTINCT SUM (without ignore nulls)
+    let bounded_frame = WindowFrame::new_bounds(
+        datafusion_expr::WindowFrameUnits::Rows,
+        WindowFrameBound::Preceding(ScalarValue::UInt64(Some(2))),
+        WindowFrameBound::CurrentRow,
+    );
+
+    let distinct_sum = Arc::new(SlidingAggregateWindowExpr::new(
+        AggregateExprBuilder::new(
+            sum_udaf(),
+            vec![cast(col("a", &schema)?, &schema, DataType::Float64)?],
+        )
+        .schema(Arc::clone(&schema))
+        .alias("sum_distinct")
+        .distinct()
+        .build()
+        .map(Arc::new)?,
+        &[],
+        &[],
+        Arc::new(bounded_frame),
+        None,
+    ));
+
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+
+    let window_exec = Arc::new(WindowAggExec::try_new(
+        vec![distinct_count_ignore_nulls, distinct_sum],
+        input,
+        false,
+    )?);
+
+    // Perform the roundtrip test
+    roundtrip_test(window_exec)
+}
+
+#[test]
+fn roundtrip_aggregate() -> Result<()> {
     let field_a = Field::new("a", DataType::Int64, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
@@ -462,7 +609,7 @@ fn rountrip_aggregate() -> Result<()> {
 }
 
 #[test]
-fn rountrip_aggregate_with_limit() -> Result<()> {
+fn roundtrip_aggregate_with_limit() -> Result<()> {
     let field_a = Field::new("a", DataType::Int64, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
@@ -470,14 +617,13 @@ fn rountrip_aggregate_with_limit() -> Result<()> {
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("a", &schema)?, "unused".to_string())];
 
-    let aggregates =
-        vec![
-            AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?])
-                .schema(Arc::clone(&schema))
-                .alias("AVG(b)")
-                .build()
-                .map(Arc::new)?,
-        ];
+    let aggregates = vec![
+        AggregateExprBuilder::new(avg_udaf(), vec![col("b", &schema)?])
+            .schema(Arc::clone(&schema))
+            .alias("AVG(b)")
+            .build()
+            .map(Arc::new)?,
+    ];
 
     let agg = AggregateExec::try_new(
         AggregateMode::Final,
@@ -487,12 +633,12 @@ fn rountrip_aggregate_with_limit() -> Result<()> {
         Arc::new(EmptyExec::new(schema.clone())),
         schema,
     )?;
-    let agg = agg.with_limit(Some(12));
+    let agg = agg.with_limit_options(Some(LimitOptions::new_with_order(12, false)));
     roundtrip_test(Arc::new(agg))
 }
 
 #[test]
-fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
+fn roundtrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
     let field_a = Field::new("a", DataType::Int64, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
@@ -500,14 +646,16 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("a", &schema)?, "unused".to_string())];
 
-    let aggregates = vec![AggregateExprBuilder::new(
-        approx_percentile_cont_udaf(),
-        vec![col("b", &schema)?, lit(0.5)],
-    )
-    .schema(Arc::clone(&schema))
-    .alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)")
-    .build()
-    .map(Arc::new)?];
+    let aggregates = vec![
+        AggregateExprBuilder::new(
+            approx_percentile_cont_udaf(),
+            vec![col("b", &schema)?, lit(0.5)],
+        )
+        .schema(Arc::clone(&schema))
+        .alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)")
+        .build()
+        .map(Arc::new)?,
+    ];
 
     let agg = AggregateExec::try_new(
         AggregateMode::Final,
@@ -521,30 +669,29 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
 }
 
 #[test]
-fn rountrip_aggregate_with_sort() -> Result<()> {
+fn roundtrip_aggregate_with_sort() -> Result<()> {
     let field_a = Field::new("a", DataType::Int64, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
 
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("a", &schema)?, "unused".to_string())];
-    let sort_exprs = LexOrdering::new(vec![PhysicalSortExpr {
+    let sort_exprs = vec![PhysicalSortExpr {
         expr: col("b", &schema)?,
         options: SortOptions {
             descending: false,
             nulls_first: true,
         },
-    }]);
+    }];
 
-    let aggregates =
-        vec![
-            AggregateExprBuilder::new(array_agg_udaf(), vec![col("b", &schema)?])
-                .schema(Arc::clone(&schema))
-                .alias("ARRAY_AGG(b)")
-                .order_by(sort_exprs)
-                .build()
-                .map(Arc::new)?,
-        ];
+    let aggregates = vec![
+        AggregateExprBuilder::new(array_agg_udaf(), vec![col("b", &schema)?])
+            .schema(Arc::clone(&schema))
+            .alias("ARRAY_AGG(b)")
+            .order_by(sort_exprs)
+            .build()
+            .map(Arc::new)?,
+    ];
 
     let agg = AggregateExec::try_new(
         AggregateMode::Final,
@@ -604,14 +751,13 @@ fn roundtrip_aggregate_udaf() -> Result<()> {
     let groups: Vec<(Arc<dyn PhysicalExpr>, String)> =
         vec![(col("a", &schema)?, "unused".to_string())];
 
-    let aggregates =
-        vec![
-            AggregateExprBuilder::new(Arc::new(udaf), vec![col("b", &schema)?])
-                .schema(Arc::clone(&schema))
-                .alias("example_agg")
-                .build()
-                .map(Arc::new)?,
-        ];
+    let aggregates = vec![
+        AggregateExprBuilder::new(Arc::new(udaf), vec![col("b", &schema)?])
+            .schema(Arc::clone(&schema))
+            .alias("example_agg")
+            .build()
+            .map(Arc::new)?,
+    ];
 
     roundtrip_test_with_context(
         Arc::new(AggregateExec::try_new(
@@ -649,12 +795,25 @@ fn roundtrip_filter_with_not_and_in_list() -> Result<()> {
     )?))
 }
 
+#[test]
+fn roundtrip_filter_with_fetch() -> Result<()> {
+    let field_a = Field::new("a", DataType::Boolean, false);
+    let field_b = Field::new("b", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a, field_b]));
+    let predicate = col("a", &schema)?;
+    let filter = FilterExecBuilder::new(predicate, Arc::new(EmptyExec::new(schema)))
+        .with_fetch(Some(10))
+        .build()?;
+    assert_eq!(filter.fetch(), Some(10));
+    roundtrip_test(Arc::new(filter))
+}
+
 #[test]
 fn roundtrip_sort() -> Result<()> {
     let field_a = Field::new("a", DataType::Boolean, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
-    let sort_exprs = LexOrdering::new(vec![
+    let sort_exprs = [
         PhysicalSortExpr {
             expr: col("a", &schema)?,
             options: SortOptions {
@@ -669,7 +828,8 @@ fn roundtrip_sort() -> Result<()> {
                 nulls_first: true,
             },
         },
-    ]);
+    ]
+    .into();
     roundtrip_test(Arc::new(SortExec::new(
         sort_exprs,
         Arc::new(EmptyExec::new(schema)),
@@ -681,7 +841,7 @@ fn roundtrip_sort_preserve_partitioning() -> Result<()> {
     let field_a = Field::new("a", DataType::Boolean, false);
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
-    let sort_exprs = LexOrdering::new(vec![
+    let sort_exprs: LexOrdering = [
         PhysicalSortExpr {
             expr: col("a", &schema)?,
             options: SortOptions {
@@ -696,7 +856,8 @@ fn roundtrip_sort_preserve_partitioning() -> Result<()> {
                 nulls_first: true,
             },
         },
-    ]);
+    ]
+    .into();
 
     roundtrip_test(Arc::new(SortExec::new(
         sort_exprs.clone(),
@@ -715,11 +876,13 @@ fn roundtrip_coalesce_batches_with_fetch() -> Result<()> {
     let field_b = Field::new("b", DataType::Int64, false);
     let schema = Arc::new(Schema::new(vec![field_a, field_b]));
 
+    #[expect(deprecated)]
     roundtrip_test(Arc::new(CoalesceBatchesExec::new(
         Arc::new(EmptyExec::new(schema.clone())),
         8096,
     )))?;
 
+    #[expect(deprecated)]
     roundtrip_test(Arc::new(
         CoalesceBatchesExec::new(Arc::new(EmptyExec::new(schema)), 8096)
             .with_fetch(Some(10)),
@@ -756,25 +919,103 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
     let mut options = TableParquetOptions::new();
     options.global.pushdown_filters = true;
 
-    let file_source = Arc::new(ParquetSource::new(options).with_predicate(predicate));
+    let file_source = Arc::new(
+        ParquetSource::new(Arc::clone(&file_schema))
+            .with_table_parquet_options(options)
+            .with_predicate(predicate),
+    );
+
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
+                "/path/to/file.parquet".to_string(),
+                1024,
+            )])])
+            .with_statistics(Statistics {
+                num_rows: Precision::Inexact(100),
+                total_byte_size: Precision::Inexact(1024),
+                column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(
+                    vec![Field::new("col", DataType::Utf8, false)],
+                ))),
+            })
+            .build();
 
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        file_source,
-    )
-    .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
-        "/path/to/file.parquet".to_string(),
-        1024,
-    )])])
-    .with_statistics(Statistics {
-        num_rows: Precision::Inexact(100),
-        total_byte_size: Precision::Inexact(1024),
-        column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
-            Field::new("col", DataType::Utf8, false),
-        ]))),
-    })
-    .build();
+    roundtrip_test(DataSourceExec::from_data_source(scan_config))
+}
+
+#[test]
+fn roundtrip_parquet_exec_attaches_cached_reader_factory_after_roundtrip() -> Result<()> {
+    let file_schema =
+        Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));
+    let file_source = Arc::new(ParquetSource::new(Arc::clone(&file_schema)));
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
+                "/path/to/file.parquet".to_string(),
+                1024,
+            )])])
+            .with_statistics(Statistics {
+                num_rows: Precision::Inexact(100),
+                total_byte_size: Precision::Inexact(1024),
+                column_statistics: Statistics::unknown_column(&file_schema),
+            })
+            .build();
+    let exec_plan = DataSourceExec::from_data_source(scan_config);
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    let roundtripped =
+        roundtrip_test_and_return(exec_plan, &ctx, &codec, &proto_converter)?;
+
+    let data_source = roundtripped
+        .as_any()
+        .downcast_ref::<DataSourceExec>()
+        .ok_or_else(|| {
+            internal_datafusion_err!("Expected DataSourceExec after roundtrip")
+        })?;
+    let file_scan = data_source
+        .data_source()
+        .as_any()
+        .downcast_ref::<FileScanConfig>()
+        .ok_or_else(|| {
+            internal_datafusion_err!("Expected FileScanConfig after roundtrip")
+        })?;
+    let parquet_source = file_scan
+        .file_source()
+        .as_any()
+        .downcast_ref::<ParquetSource>()
+        .ok_or_else(|| {
+            internal_datafusion_err!("Expected ParquetSource after roundtrip")
+        })?;
+
+    assert!(
+        parquet_source.parquet_file_reader_factory().is_some(),
+        "Parquet reader factory should be attached after decoding from protobuf"
+    );
+    Ok(())
+}
+
+#[test]
+fn roundtrip_arrow_scan() -> Result<()> {
+    let file_schema =
+        Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));
+
+    let table_schema = TableSchema::new(file_schema.clone(), vec![]);
+    let file_source = Arc::new(ArrowSource::new_file_source(table_schema));
+
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
+                "/path/to/file.arrow".to_string(),
+                1024,
+            )])])
+            .with_statistics(Statistics {
+                num_rows: Precision::Inexact(100),
+                total_byte_size: Precision::Inexact(1024),
+                column_statistics: Statistics::unknown_column(&file_schema),
+            })
+            .build();
 
     roundtrip_test(DataSourceExec::from_data_source(scan_config))
 }
@@ -787,21 +1028,21 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> {
         vec![wrap_partition_value_in_dict(ScalarValue::Int64(Some(0)))];
     let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));
 
-    let file_source = Arc::new(ParquetSource::default());
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        schema,
-        file_source,
-    )
-    .with_projection(Some(vec![0, 1]))
-    .with_file_group(FileGroup::new(vec![file_group]))
-    .with_table_partition_cols(vec![Field::new(
-        "part".to_string(),
-        wrap_partition_type_in_dict(DataType::Int16),
-        false,
-    )])
-    .with_newlines_in_values(false)
-    .build();
+    let table_schema = TableSchema::new(
+        schema.clone(),
+        vec![Arc::new(Field::new(
+            "part".to_string(),
+            wrap_partition_type_in_dict(DataType::Int16),
+            false,
+        ))],
+    );
+
+    let file_source = Arc::new(ParquetSource::new(table_schema.clone()));
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_projection_indices(Some(vec![0, 1]))?
+            .with_file_group(FileGroup::new(vec![file_group]))
+            .build();
 
     roundtrip_test(DataSourceExec::from_data_source(scan_config))
 }
@@ -815,26 +1056,25 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
         inner: Arc::new(Column::new("col", 1)),
     });
 
-    let file_source =
-        Arc::new(ParquetSource::default().with_predicate(custom_predicate_expr));
+    let file_source = Arc::new(
+        ParquetSource::new(Arc::clone(&file_schema))
+            .with_predicate(custom_predicate_expr),
+    );
 
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        file_schema,
-        file_source,
-    )
-    .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
-        "/path/to/file.parquet".to_string(),
-        1024,
-    )])])
-    .with_statistics(Statistics {
-        num_rows: Precision::Inexact(100),
-        total_byte_size: Precision::Inexact(1024),
-        column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
-            Field::new("col", DataType::Utf8, false),
-        ]))),
-    })
-    .build();
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
+                "/path/to/file.parquet".to_string(),
+                1024,
+            )])])
+            .with_statistics(Statistics {
+                num_rows: Precision::Inexact(100),
+                total_byte_size: Precision::Inexact(1024),
+                column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(
+                    vec![Field::new("col", DataType::Utf8, false)],
+                ))),
+            })
+            .build();
 
     #[derive(Debug, Clone, Eq)]
     struct CustomPredicateExpr {
@@ -855,7 +1095,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
     }
 
     impl Display for CustomPredicateExpr {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
             write!(f, "CustomPredicateExpr")
         }
     }
@@ -885,7 +1125,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
             self: Arc<Self>,
             _children: Vec<Arc<dyn PhysicalExpr>>,
         ) -> Result<Arc<dyn PhysicalExpr>> {
-            todo!()
+            Ok(self)
         }
 
         fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
@@ -900,7 +1140,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
             &self,
             _buf: &[u8],
             _inputs: &[Arc<dyn ExecutionPlan>],
-            _registry: &dyn FunctionRegistry,
+            _ctx: &TaskContext,
         ) -> Result<Arc<dyn ExecutionPlan>> {
             unreachable!()
         }
@@ -948,7 +1188,12 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
     let exec_plan = DataSourceExec::from_data_source(scan_config);
 
     let ctx = SessionContext::new();
-    roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?;
+    roundtrip_test_and_return(
+        exec_plan,
+        &ctx,
+        &CustomPhysicalExtensionCodec {},
+        &DefaultPhysicalProtoConverter {},
+    )?;
     Ok(())
 }
 
@@ -982,10 +1227,16 @@ fn roundtrip_scalar_udf() -> Result<()> {
         fun_def,
         vec![col("a", &schema)?],
         Field::new("f", DataType::Int64, true).into(),
+        Arc::new(ConfigOptions::default()),
     );
 
-    let project =
-        ProjectionExec::try_new(vec![(Arc::new(expr), "a".to_string())], input)?;
+    let project = ProjectionExec::try_new(
+        vec![ProjectionExpr {
+            expr: Arc::new(expr),
+            alias: "a".to_string(),
+        }],
+        input,
+    )?;
 
     let ctx = SessionContext::new();
 
@@ -1002,7 +1253,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
         &self,
         _buf: &[u8],
         _inputs: &[Arc<dyn ExecutionPlan>],
-        _registry: &dyn FunctionRegistry,
+        _ctx: &TaskContext,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         not_impl_err!("No extension codec provided")
     }
@@ -1018,7 +1269,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
     fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>> {
         if name == "regex_udf" {
             let proto = MyRegexUdfNode::decode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to decode regex_udf: {err}"))
+                internal_datafusion_err!("failed to decode regex_udf: {err}")
             })?;
 
             Ok(Arc::new(ScalarUDF::from(MyRegexUdf::new(proto.pattern))))
@@ -1033,9 +1284,9 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
             let proto = MyRegexUdfNode {
                 pattern: udf.pattern.clone(),
             };
-            proto.encode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to encode udf: {err}"))
-            })?;
+            proto
+                .encode(buf)
+                .map_err(|err| internal_datafusion_err!("failed to encode udf: {err}"))?;
         }
         Ok(())
     }
@@ -1043,9 +1294,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
     fn try_decode_udaf(&self, name: &str, buf: &[u8]) -> Result<Arc<AggregateUDF>> {
         if name == "aggregate_udf" {
             let proto = MyAggregateUdfNode::decode(buf).map_err(|err| {
-                DataFusionError::Internal(format!(
-                    "failed to decode aggregate_udf: {err}"
-                ))
+                internal_datafusion_err!("failed to decode aggregate_udf: {err}")
             })?;
 
             Ok(Arc::new(AggregateUDF::from(MyAggregateUDF::new(
@@ -1063,7 +1312,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
                 result: udf.result.clone(),
             };
             proto.encode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to encode udf: {err:?}"))
+                internal_datafusion_err!("failed to encode udf: {err:?}")
             })?;
         }
         Ok(())
@@ -1072,7 +1321,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
     fn try_decode_udwf(&self, name: &str, buf: &[u8]) -> Result<Arc<WindowUDF>> {
         if name == "custom_udwf" {
             let proto = CustomUDWFNode::decode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to decode custom_udwf: {err}"))
+                internal_datafusion_err!("failed to decode custom_udwf: {err}")
             })?;
 
             Ok(Arc::new(WindowUDF::from(CustomUDWF::new(proto.payload))))
@@ -1090,7 +1339,7 @@ impl PhysicalExtensionCodec for UDFExtensionCodec {
                 payload: udwf.payload.clone(),
             };
             proto.encode(buf).map_err(|err| {
-                DataFusionError::Internal(format!("failed to encode udwf: {err:?}"))
+                internal_datafusion_err!("failed to encode udwf: {err:?}")
             })?;
         }
         Ok(())
@@ -1110,6 +1359,7 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> {
         Arc::new(ScalarUDF::from(MyRegexUdf::new(".*".to_string()))),
         vec![col("text", &schema)?],
         Field::new("f", DataType::Int64, true).into(),
+        Arc::new(ConfigOptions::default()),
     ));
 
     let filter = Arc::new(FilterExec::try_new(
@@ -1131,8 +1381,9 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> {
         vec![Arc::new(PlainAggregateWindowExpr::new(
             aggr_expr.clone(),
             &[col("author", &schema)?],
-            &LexOrdering::default(),
+            &[],
             Arc::new(WindowFrame::new(None)),
+            None,
         ))],
         filter,
         true,
@@ -1140,7 +1391,7 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> {
 
     let aggregate = Arc::new(AggregateExec::try_new(
         AggregateMode::Final,
-        PhysicalGroupBy::new(vec![], vec![], vec![]),
+        PhysicalGroupBy::new(vec![], vec![], vec![], false),
         vec![aggr_expr],
         vec![None],
         window,
@@ -1148,7 +1399,8 @@ fn roundtrip_scalar_udf_extension_codec() -> Result<()> {
     )?);
 
     let ctx = SessionContext::new();
-    roundtrip_test_and_return(aggregate, &ctx, &UDFExtensionCodec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    roundtrip_test_and_return(aggregate, &ctx, &UDFExtensionCodec, &proto_converter)?;
     Ok(())
 }
 
@@ -1176,13 +1428,13 @@ fn roundtrip_udwf_extension_codec() -> Result<()> {
     let udwf_expr = Arc::new(StandardWindowExpr::new(
         udwf,
         &[col("b", &schema)?],
-        &LexOrdering::new(vec![PhysicalSortExpr {
+        &[PhysicalSortExpr {
             expr: col("a", &schema)?,
             options: SortOptions {
                 descending: false,
                 nulls_first: false,
             },
-        }]),
+        }],
         Arc::new(window_frame),
     ));
 
@@ -1195,7 +1447,8 @@ fn roundtrip_udwf_extension_codec() -> Result<()> {
     )?);
 
     let ctx = SessionContext::new();
-    roundtrip_test_and_return(window, &ctx, &UDFExtensionCodec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    roundtrip_test_and_return(window, &ctx, &UDFExtensionCodec, &proto_converter)?;
     Ok(())
 }
 
@@ -1212,6 +1465,7 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> {
         Arc::new(ScalarUDF::from(MyRegexUdf::new(".*".to_string()))),
         vec![col("text", &schema)?],
         Field::new("f", DataType::Int64, true).into(),
+        Arc::new(ConfigOptions::default()),
     ));
 
     let udaf = Arc::new(AggregateUDF::from(MyAggregateUDF::new(
@@ -1239,8 +1493,9 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> {
         vec![Arc::new(PlainAggregateWindowExpr::new(
             aggr_expr,
             &[col("author", &schema)?],
-            &LexOrdering::default(),
+            &[],
             Arc::new(WindowFrame::new(None)),
+            None,
         ))],
         filter,
         true,
@@ -1256,7 +1511,7 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> {
 
     let aggregate = Arc::new(AggregateExec::try_new(
         AggregateMode::Final,
-        PhysicalGroupBy::new(vec![], vec![], vec![]),
+        PhysicalGroupBy::new(vec![], vec![], vec![], false),
         vec![aggr_expr],
         vec![None],
         window,
@@ -1264,7 +1519,8 @@ fn roundtrip_aggregate_udf_extension_codec() -> Result<()> {
     )?);
 
     let ctx = SessionContext::new();
-    roundtrip_test_and_return(aggregate, &ctx, &UDFExtensionCodec)?;
+    let proto_converter = DefaultPhysicalProtoConverter {};
+    roundtrip_test_and_return(aggregate, &ctx, &UDFExtensionCodec, &proto_converter)?;
     Ok(())
 }
 
@@ -1283,7 +1539,10 @@ fn roundtrip_like() -> Result<()> {
         &schema,
     )?;
     let plan = Arc::new(ProjectionExec::try_new(
-        vec![(like_expr, "result".to_string())],
+        vec![ProjectionExpr {
+            expr: like_expr,
+            alias: "result".to_string(),
+        }],
         input,
     )?);
     roundtrip_test(plan)
@@ -1299,6 +1558,7 @@ fn roundtrip_analyze() -> Result<()> {
     roundtrip_test(Arc::new(AnalyzeExec::new(
         false,
         false,
+        vec![MetricType::SUMMARY, MetricType::DEV],
         input,
         Arc::new(schema),
     )))
@@ -1330,18 +1590,20 @@ fn roundtrip_json_sink() -> Result<()> {
         insert_op: InsertOp::Overwrite,
         keep_partition_by_columns: true,
         file_extension: "json".into(),
+        file_output_mode: FileOutputMode::SingleFile,
     };
     let data_sink = Arc::new(JsonSink::new(
         file_sink_config,
         JsonWriterOptions::new(CompressionTypeVariant::UNCOMPRESSED),
     ));
-    let sort_order = LexRequirement::new(vec![PhysicalSortRequirement::new(
+    let sort_order = [PhysicalSortRequirement::new(
         Arc::new(Column::new("plan_type", 0)),
         Some(SortOptions {
             descending: true,
             nulls_first: false,
         }),
-    )]);
+    )]
+    .into();
 
     roundtrip_test(Arc::new(DataSinkExec::new(
         input,
@@ -1367,27 +1629,31 @@ fn roundtrip_csv_sink() -> Result<()> {
         insert_op: InsertOp::Overwrite,
         keep_partition_by_columns: true,
         file_extension: "csv".into(),
+        file_output_mode: FileOutputMode::Directory,
     };
     let data_sink = Arc::new(CsvSink::new(
         file_sink_config,
         CsvWriterOptions::new(WriterBuilder::default(), CompressionTypeVariant::ZSTD),
     ));
-    let sort_order = LexRequirement::new(vec![PhysicalSortRequirement::new(
+    let sort_order = [PhysicalSortRequirement::new(
         Arc::new(Column::new("plan_type", 0)),
         Some(SortOptions {
             descending: true,
             nulls_first: false,
         }),
-    )]);
+    )]
+    .into();
 
     let ctx = SessionContext::new();
     let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DefaultPhysicalProtoConverter {};
+
     let roundtrip_plan = roundtrip_test_and_return(
         Arc::new(DataSinkExec::new(input, data_sink, Some(sort_order))),
         &ctx,
         &codec,
-    )
-    .unwrap();
+        &proto_converter,
+    )?;
 
     let roundtrip_plan = roundtrip_plan
         .as_any()
@@ -1423,18 +1689,20 @@ fn roundtrip_parquet_sink() -> Result<()> {
         insert_op: InsertOp::Overwrite,
         keep_partition_by_columns: true,
         file_extension: "parquet".into(),
+        file_output_mode: FileOutputMode::Automatic,
     };
     let data_sink = Arc::new(ParquetSink::new(
         file_sink_config,
         TableParquetOptions::default(),
     ));
-    let sort_order = LexRequirement::new(vec![PhysicalSortRequirement::new(
+    let sort_order = [PhysicalSortRequirement::new(
         Arc::new(Column::new("plan_type", 0)),
         Some(SortOptions {
             descending: true,
             nulls_first: false,
         }),
-    )]);
+    )]
+    .into();
 
     roundtrip_test(Arc::new(DataSinkExec::new(
         input,
@@ -1471,31 +1739,29 @@ fn roundtrip_sym_hash_join() -> Result<()> {
         ] {
             for left_order in &[
                 None,
-                Some(LexOrdering::new(vec![PhysicalSortExpr {
+                LexOrdering::new(vec![PhysicalSortExpr {
                     expr: Arc::new(Column::new("col", schema_left.index_of("col")?)),
                     options: Default::default(),
-                }])),
+                }]),
             ] {
-                for right_order in &[
+                for right_order in [
                     None,
-                    Some(LexOrdering::new(vec![PhysicalSortExpr {
+                    LexOrdering::new(vec![PhysicalSortExpr {
                         expr: Arc::new(Column::new("col", schema_right.index_of("col")?)),
                         options: Default::default(),
-                    }])),
+                    }]),
                 ] {
-                    roundtrip_test(Arc::new(
-                        datafusion::physical_plan::joins::SymmetricHashJoinExec::try_new(
-                            Arc::new(EmptyExec::new(schema_left.clone())),
-                            Arc::new(EmptyExec::new(schema_right.clone())),
-                            on.clone(),
-                            None,
-                            join_type,
-                            false,
-                            left_order.clone(),
-                            right_order.clone(),
-                            *partition_mode,
-                        )?,
-                    ))?;
+                    roundtrip_test(Arc::new(SymmetricHashJoinExec::try_new(
+                        Arc::new(EmptyExec::new(schema_left.clone())),
+                        Arc::new(EmptyExec::new(schema_right.clone())),
+                        on.clone(),
+                        None,
+                        join_type,
+                        NullEquality::NullEqualsNothing,
+                        left_order.clone(),
+                        right_order,
+                        *partition_mode,
+                    )?))?;
                 }
             }
         }
@@ -1511,8 +1777,37 @@ fn roundtrip_union() -> Result<()> {
     let left = EmptyExec::new(Arc::new(schema_left));
     let right = EmptyExec::new(Arc::new(schema_right));
     let inputs: Vec<Arc<dyn ExecutionPlan>> = vec![Arc::new(left), Arc::new(right)];
-    let union = UnionExec::new(inputs);
-    roundtrip_test(Arc::new(union))
+    let union = UnionExec::try_new(inputs)?;
+    roundtrip_test(union)
+}
+
+#[test]
+fn roundtrip_repartition_preserve_order() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+    let sort_exprs: LexOrdering = [PhysicalSortExpr {
+        expr: col("a", &schema)?,
+        options: SortOptions::default(),
+    }]
+    .into();
+
+    // Create two sorted single-partition inputs, then union them to get
+    // a sorted input with 2 partitions.
+    let source1 = SortExec::new(
+        sort_exprs.clone(),
+        Arc::new(EmptyExec::new(Arc::clone(&schema))),
+    );
+    let source2 = SortExec::new(sort_exprs, Arc::new(EmptyExec::new(schema)));
+    let union = UnionExec::try_new(vec![
+        Arc::new(source1) as Arc<dyn ExecutionPlan>,
+        Arc::new(source2) as Arc<dyn ExecutionPlan>,
+    ])?;
+
+    let repartition = RepartitionExec::try_new(union, Partitioning::RoundRobinBatch(10))?
+        .with_preserve_order();
+    assert!(repartition.preserve_order());
+
+    roundtrip_test(Arc::new(repartition))
 }
 
 #[test]
@@ -1576,7 +1871,7 @@ fn roundtrip_unnest() -> Result<()> {
         vec![2, 4],
         output_schema,
         options,
-    );
+    )?;
     roundtrip_test(Arc::new(unnest))
 }
 
@@ -1598,11 +1893,44 @@ async fn roundtrip_coalesce() -> Result<()> {
     )?;
     let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice())
         .map_err(|e| DataFusionError::External(Box::new(e)))?;
-    let restored = node.try_into_physical_plan(
-        &ctx,
-        ctx.runtime_env().as_ref(),
+    let restored =
+        node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?;
+
+    assert_eq!(
+        plan.schema(),
+        restored.schema(),
+        "Schema mismatch for plans:\n>> initial:\n{}>> final: \n{}",
+        displayable(plan.as_ref())
+            .set_show_schema(true)
+            .indent(true),
+        displayable(restored.as_ref())
+            .set_show_schema(true)
+            .indent(true),
+    );
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_generate_series() -> Result<()> {
+    let ctx = SessionContext::new();
+    ctx.register_table(
+        "t",
+        Arc::new(EmptyTable::new(Arc::new(Schema::new(Fields::from([
+            Arc::new(Field::new("f", DataType::Int64, false)),
+        ]))))),
+    )?;
+    let df = ctx.sql("select * from generate_series(1, 10000)").await?;
+    let plan = df.create_physical_plan().await?;
+
+    let node = PhysicalPlanNode::try_from_physical_plan(
+        plan.clone(),
         &DefaultPhysicalExtensionCodec {},
     )?;
+    let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice())
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+    let restored =
+        node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?;
 
     assert_eq!(
         plan.schema(),
@@ -1630,26 +1958,24 @@ async fn roundtrip_projection_source() -> Result<()> {
 
     let statistics = Statistics::new_unknown(&schema);
 
-    let file_source = ParquetSource::default().with_statistics(statistics.clone());
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        schema.clone(),
-        file_source,
-    )
-    .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
-        "/path/to/file.parquet".to_string(),
-        1024,
-    )])])
-    .with_statistics(statistics)
-    .with_projection(Some(vec![0, 1, 2]))
-    .build();
+    let file_source = Arc::new(ParquetSource::new(Arc::clone(&schema)));
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source)
+            .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new(
+                "/path/to/file.parquet".to_string(),
+                1024,
+            )])])
+            .with_statistics(statistics)
+            .with_projection_indices(Some(vec![0, 1, 2]))?
+            .build();
 
     let filter = Arc::new(
-        FilterExec::try_new(
+        FilterExecBuilder::new(
             Arc::new(BinaryExpr::new(col("c", &schema)?, Operator::Eq, lit(1))),
             DataSourceExec::from_data_source(scan_config),
-        )?
-        .with_projection(Some(vec![0, 1]))?,
+        )
+        .apply_projection(Some(vec![0, 1]))?
+        .build()?,
     );
 
     roundtrip_test(filter)
@@ -1724,12 +2050,1149 @@ async fn roundtrip_physical_plan_node() {
             .unwrap();
 
     let plan = node
-        .try_into_physical_plan(
-            &ctx,
-            &ctx.runtime_env(),
-            &DefaultPhysicalExtensionCodec {},
-        )
+        .try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})
         .unwrap();
 
     let _ = plan.execute(0, ctx.task_ctx()).unwrap();
 }
+
+/// Helper function to create a SessionContext with all TPC-H tables registered as external tables
+async fn tpch_context() -> Result<SessionContext> {
+    use datafusion_common::test_util::datafusion_test_data;
+
+    let ctx = SessionContext::new();
+    let test_data = datafusion_test_data();
+
+    // TPC-H table names
+    let tables = [
+        "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation",
+        "region",
+    ];
+
+    // Create external tables for all TPC-H tables
+    for table in &tables {
+        let table_sql = format!(
+            "CREATE EXTERNAL TABLE {table} STORED AS PARQUET LOCATION '{test_data}/tpch_{table}_small.parquet'"
+        );
+        ctx.sql(&table_sql).await.map_err(|e| {
+            DataFusionError::External(
+                format!("Failed to create {table} table: {e}").into(),
+            )
+        })?;
+    }
+
+    Ok(ctx)
+}
+
+/// Helper function to get TPC-H query SQL
+fn get_tpch_query_sql(query: usize) -> Result<Vec<String>> {
+    use std::fs;
+
+    if !(1..=22).contains(&query) {
+        return Err(DataFusionError::External(
+            format!("Invalid TPC-H query number: {query}").into(),
+        ));
+    }
+
+    let filename = format!("../../benchmarks/queries/q{query}.sql");
+    let contents = fs::read_to_string(&filename).map_err(|e| {
+        DataFusionError::External(
+            format!("Failed to read query file {filename}: {e}").into(),
+        )
+    })?;
+
+    Ok(contents
+        .split(';')
+        .map(|s| s.trim())
+        .filter(|s| !s.is_empty())
+        .map(|s| s.to_string())
+        .collect())
+}
+
+#[tokio::test]
+async fn test_serialize_deserialize_tpch_queries() -> Result<()> {
+    // Create context with TPC-H tables
+    let ctx = tpch_context().await?;
+
+    // repeat to run all 22 queries
+    for query in 1..=22 {
+        // run all statements in the query
+        let sql = get_tpch_query_sql(query)?;
+        for stmt in sql {
+            let logical_plan = ctx.sql(&stmt).await?.into_unoptimized_plan();
+            let optimized_plan = ctx.state().optimize(&logical_plan)?;
+            let physical_plan = ctx.state().create_physical_plan(&optimized_plan).await?;
+
+            // serialize the physical plan
+            let codec = DefaultPhysicalExtensionCodec {};
+
+            let proto =
+                PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?;
+
+            // deserialize the physical plan
+            let _deserialized_plan =
+                proto.try_into_physical_plan(&ctx.task_ctx(), &codec)?;
+        }
+    }
+
+    Ok(())
+}
+
+// Bugs: https://github.com/apache/datafusion/issues/16772
+#[tokio::test]
+async fn test_round_trip_tpch_queries() -> Result<()> {
+    // Create context with TPC-H tables
+    let ctx = tpch_context().await?;
+
+    // repeat to run all 22 queries
+    for query in 1..=22 {
+        // run all statements in the query
+        let sql = get_tpch_query_sql(query)?;
+        for stmt in sql {
+            roundtrip_test_sql_with_context(&stmt, &ctx).await?;
+        }
+    }
+
+    Ok(())
+}
+
+// Bug 1 of https://github.com/apache/datafusion/issues/16772
+/// Test that AggregateFunctionExpr human_display field is correctly preserved
+/// during serialization/deserialization roundtrip.
+///
+/// Test for issue where the human_display field (used for EXPLAIN output)
+/// was not being serialized to protobuf, causing it to be lost during roundtrip
+/// and resulting in empty or incorrect display strings in query plans.
+#[tokio::test]
+async fn test_round_trip_human_display() -> Result<()> {
+    // Create context with TPC-H tables
+    let ctx = tpch_context().await?;
+
+    let sql = "select r_name, count(1) from region group by r_name";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    let sql = "select r_name, count(*) from region group by r_name";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    let sql = "select r_name, count(r_name) from region group by r_name";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    Ok(())
+}
+
+// Bug 2 of https://github.com/apache/datafusion/issues/16772
+/// Test that PhysicalGroupBy groups field is correctly serialized/deserialized
+/// for simple aggregates (no GROUP BY clause).
+///
+/// Test for issue where simple aggregates like "SELECT SUM(col1 * col2) FROM table"
+/// would incorrectly serialize groups as [[]] instead of [] during roundtrip serialization.
+/// The groups field should be empty ([]) when there are no GROUP BY expressions.
+#[tokio::test]
+async fn test_round_trip_groups_display() -> Result<()> {
+    // Create context with TPC-H tables
+    let ctx = tpch_context().await?;
+
+    let sql = "select sum(l_extendedprice * l_discount) as revenue from lineitem;";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    let sql = "select sum(l_extendedprice) as revenue from lineitem;";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    Ok(())
+}
+
+// Bug 3 of https://github.com/apache/datafusion/issues/16772
+/// Test that ScalarFunctionExpr return_field name is correctly preserved
+/// during serialization/deserialization roundtrip.
+///
+/// Test for issue where the return_field.name for scalar functions
+/// was not being serialized to protobuf, causing it to be lost during roundtrip
+/// and defaulting to a generic name like "f" instead of the proper function name.
+#[tokio::test]
+async fn test_round_trip_date_part_display() -> Result<()> {
+    // Create context with TPC-H tables
+    let ctx = tpch_context().await?;
+
+    let sql = "select extract(year from l_shipdate) as l_year from lineitem ";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    let sql = "select extract(month from l_shipdate) as l_year from lineitem ";
+    roundtrip_test_sql_with_context(sql, &ctx).await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_tpch_part_in_list_query_with_real_parquet_data() -> Result<()> {
+    use datafusion_common::test_util::datafusion_test_data;
+
+    let ctx = SessionContext::new();
+
+    // Register the TPC-H part table using the local test data
+    let test_data = datafusion_test_data();
+    let table_sql = format!(
+        "CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '{test_data}/tpch_part_small.parquet'"
+    );
+    ctx.sql(&table_sql).await.map_err(|e| {
+        DataFusionError::External(format!("Failed to create part table: {e}").into())
+    })?;
+
+    // Test the exact problematic query
+    let sql =
+        "SELECT p_size FROM part WHERE p_size IN (14, 6, 5, 31) and p_partkey > 1000";
+
+    let logical_plan = ctx.sql(sql).await?.into_unoptimized_plan();
+    let optimized_plan = ctx.state().optimize(&logical_plan)?;
+    let physical_plan = ctx.state().create_physical_plan(&optimized_plan).await?;
+
+    // Serialize the physical plan - bug may happen here already but not necessarily manifests
+    let codec = DefaultPhysicalExtensionCodec {};
+
+    let proto = PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?;
+
+    // This will fail with the bug, but should succeed when fixed
+    let _deserialized_plan = proto.try_into_physical_plan(&ctx.task_ctx(), &codec)?;
+    Ok(())
+}
+
+#[tokio::test]
+/// Tests that we can serialize an unoptimized "analyze" plan and it will work on the other end
+async fn analyze_roundtrip_unoptimized() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // No optimizations
+    let session_state =
+        datafusion::execution::SessionStateBuilder::new_from_existing(ctx.state())
+            .with_physical_optimizer_rules(vec![])
+            .build();
+
+    let logical_plan = session_state
+        .create_logical_plan("explain analyze select 1")
+        .await?;
+    let plan = session_state.create_physical_plan(&logical_plan).await?;
+
+    let node = PhysicalPlanNode::try_from_physical_plan(
+        plan.clone(),
+        &DefaultPhysicalExtensionCodec {},
+    )?;
+
+    let node = PhysicalPlanNode::decode(node.encode_to_vec().as_slice())
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+    let unoptimized =
+        node.try_into_physical_plan(&ctx.task_ctx(), &DefaultPhysicalExtensionCodec {})?;
+
+    let physical_planner =
+        datafusion::physical_planner::DefaultPhysicalPlanner::default();
+    physical_planner.optimize_physical_plan(unoptimized, &session_state, |_, _| {})?;
+    Ok(())
+}
+
+#[test]
+fn roundtrip_sort_merge_join() -> Result<()> {
+    let field_a = Field::new("col_a", DataType::Int64, false);
+    let field_b = Field::new("col_b", DataType::Int64, false);
+    let schema_left = Schema::new(vec![field_a.clone()]);
+    let schema_right = Schema::new(vec![field_b.clone()]);
+    let on = vec![(
+        Arc::new(Column::new("col_a", schema_left.index_of("col_a")?)) as _,
+        Arc::new(Column::new("col_b", schema_right.index_of("col_b")?)) as _,
+    )];
+
+    let filter = datafusion::physical_plan::joins::utils::JoinFilter::new(
+        Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("col_a", 1)),
+            Operator::Gt,
+            Arc::new(Column::new("col_b", 0)),
+        )),
+        vec![
+            datafusion::physical_plan::joins::utils::ColumnIndex {
+                index: 0,
+                side: datafusion_common::JoinSide::Left,
+            },
+            datafusion::physical_plan::joins::utils::ColumnIndex {
+                index: 0,
+                side: datafusion_common::JoinSide::Right,
+            },
+        ],
+        Arc::new(Schema::new(vec![field_a, field_b])),
+    );
+
+    let schema_left = Arc::new(schema_left);
+    let schema_right = Arc::new(schema_right);
+    for filter in [None, Some(filter)] {
+        for join_type in [
+            JoinType::Inner,
+            JoinType::Left,
+            JoinType::Right,
+            JoinType::Full,
+            JoinType::LeftAnti,
+            JoinType::RightAnti,
+            JoinType::LeftSemi,
+            JoinType::RightSemi,
+        ] {
+            roundtrip_test(Arc::new(SortMergeJoinExec::try_new(
+                Arc::new(EmptyExec::new(schema_left.clone())),
+                Arc::new(EmptyExec::new(schema_right.clone())),
+                on.clone(),
+                filter.clone(),
+                join_type,
+                vec![Default::default()],
+                NullEquality::NullEqualsNothing,
+            )?))?;
+        }
+    }
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_logical_plan_sort_merge_join() -> Result<()> {
+    let ctx = SessionContext::new();
+    ctx.register_csv(
+        "t0",
+        "tests/testdata/test.csv",
+        datafusion::prelude::CsvReadOptions::default().has_header(true),
+    )
+    .await?;
+    ctx.register_csv(
+        "t1",
+        "tests/testdata/test.csv",
+        datafusion::prelude::CsvReadOptions::default().has_header(true),
+    )
+    .await?;
+
+    ctx.sql("SET datafusion.optimizer.prefer_hash_join = false")
+        .await?
+        .show()
+        .await?;
+
+    let query = "SELECT t1.* FROM t0 join t1 on t0.a = t1.a";
+    let plan = ctx.sql(query).await?.create_physical_plan().await?;
+    roundtrip_test(plan)
+}
+
+#[tokio::test]
+async fn roundtrip_memory_source() -> Result<()> {
+    let ctx = SessionContext::new();
+    let plan = ctx
+        .sql("select * from values ('Tom', 18)")
+        .await?
+        .create_physical_plan()
+        .await?;
+    roundtrip_test(plan)
+}
+
+#[tokio::test]
+async fn roundtrip_listing_table_with_schema_metadata() -> Result<()> {
+    let ctx = SessionContext::new();
+    let file_format = JsonFormat::default();
+    let table_partition_cols = vec![("part".to_owned(), DataType::Int64)];
+    let data = "../core/tests/data/partitioned_table_json";
+    let listing_table_url = ListingTableUrl::parse(data)?;
+    let listing_options = ListingOptions::new(Arc::new(file_format))
+        .with_table_partition_cols(table_partition_cols);
+
+    let config = ListingTableConfig::new(listing_table_url)
+        .with_listing_options(listing_options)
+        .infer_schema(&ctx.state())
+        .await?;
+
+    // Decorate metadata onto the inferred ListingTable schema
+    let schema_with_meta = config
+        .file_schema
+        .clone()
+        .map(|s| {
+            let mut meta: HashMap<String, String> = HashMap::new();
+            meta.insert("foo.bar".to_string(), "baz".to_string());
+            s.as_ref().clone().with_metadata(meta)
+        })
+        .expect("Must decorate metadata");
+
+    let config = config.with_schema(Arc::new(schema_with_meta));
+    ctx.register_table("hive_style", Arc::new(ListingTable::try_new(config)?))?;
+
+    let plan = ctx
+        .sql("select * from hive_style limit 1")
+        .await?
+        .create_physical_plan()
+        .await?;
+
+    roundtrip_test(plan)
+}
+
+#[tokio::test]
+async fn roundtrip_async_func_exec() -> Result<()> {
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct TestAsyncUDF {
+        signature: Signature,
+    }
+
+    impl TestAsyncUDF {
+        fn new() -> Self {
+            Self {
+                signature: Signature::exact(vec![DataType::Int64], Volatility::Volatile),
+            }
+        }
+    }
+
+    impl ScalarUDFImpl for TestAsyncUDF {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "test_async_udf"
+        }
+
+        fn signature(&self) -> &Signature {
+            &self.signature
+        }
+
+        fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+            Ok(DataType::Int64)
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            not_impl_err!("Must call from `invoke_async_with_args`")
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl AsyncScalarUDFImpl for TestAsyncUDF {
+        async fn invoke_async_with_args(
+            &self,
+            args: ScalarFunctionArgs,
+        ) -> Result<ColumnarValue> {
+            Ok(args.args[0].clone())
+        }
+    }
+
+    let ctx = SessionContext::new();
+    let async_udf = AsyncScalarUDF::new(Arc::new(TestAsyncUDF::new()));
+    ctx.register_udf(async_udf.into_scalar_udf());
+
+    let physical_plan = ctx
+        .sql("select test_async_udf(1)")
+        .await?
+        .create_physical_plan()
+        .await?;
+
+    roundtrip_test_with_context(physical_plan, &ctx)?;
+
+    Ok(())
+}
+
+/// Test that HashTableLookupExpr serializes to lit(true)
+///
+/// HashTableLookupExpr contains a runtime hash table that cannot be serialized.
+/// The serialization code replaces it with lit(true) which is safe because
+/// it's a performance optimization filter, not a correctness requirement.
+#[test]
+fn roundtrip_hash_table_lookup_expr_to_lit() -> Result<()> {
+    use datafusion::physical_plan::joins::join_hash_map::JoinHashMapU32;
+    use datafusion::physical_plan::joins::{HashTableLookupExpr, Map};
+
+    // Create a simple schema and input plan
+    let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Int64, false)]));
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+
+    // Create a HashTableLookupExpr - it will be replaced with lit(true) during serialization
+    let hash_map = Arc::new(Map::HashMap(Box::new(JoinHashMapU32::with_capacity(0))));
+    let on_columns = vec![datafusion::physical_plan::expressions::col("col", &schema)?];
+    let lookup_expr: Arc<dyn PhysicalExpr> = Arc::new(HashTableLookupExpr::new(
+        on_columns,
+        datafusion::physical_plan::joins::SeededRandomState::with_seed(0),
+        hash_map,
+        "test_lookup".to_string(),
+    ));
+
+    // Create a filter with the lookup expression
+    let filter = Arc::new(FilterExec::try_new(lookup_expr, input)?);
+
+    // Serialize
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+
+    let proto: PhysicalPlanNode =
+        PhysicalPlanNode::try_from_physical_plan(filter.clone(), &codec)
+            .expect("serialization should succeed");
+
+    // Deserialize
+    let result: Arc<dyn ExecutionPlan> = proto
+        .try_into_physical_plan(&ctx.task_ctx(), &codec)
+        .expect("deserialization should succeed");
+
+    // The deserialized plan should have lit(true) instead of HashTableLookupExpr
+    // Verify the filter predicate is a Literal(true)
+    let result_filter = result.as_any().downcast_ref::<FilterExec>().unwrap();
+    let predicate = result_filter.predicate();
+    let literal = predicate.as_any().downcast_ref::<Literal>().unwrap();
+    assert_eq!(*literal.value(), ScalarValue::Boolean(Some(true)));
+
+    Ok(())
+}
+
+#[test]
+fn roundtrip_hash_expr() -> Result<()> {
+    use datafusion::physical_plan::joins::{HashExpr, SeededRandomState};
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int64, false),
+        Field::new("b", DataType::Utf8, false),
+    ]));
+
+    // Create a HashExpr with test columns and seeds
+    let on_columns = vec![col("a", &schema)?, col("b", &schema)?];
+    let hash_expr: Arc<dyn PhysicalExpr> = Arc::new(HashExpr::new(
+        on_columns,
+        SeededRandomState::with_seed(0), // arbitrary random seed for testing
+        "test_hash".to_string(),
+    ));
+
+    // Wrap in a filter by comparing hash value to a literal
+    // hash_expr > 0 is always boolean
+    let filter_expr = binary(hash_expr, Operator::Gt, lit(0u64), &schema)?;
+    let filter = Arc::new(FilterExec::try_new(
+        filter_expr,
+        Arc::new(EmptyExec::new(schema)),
+    )?);
+
+    // Confirm that the debug string contains the random state seeds
+    assert!(
+        format!("{filter:?}").contains("test_hash(a@0, b@1, [0])"),
+        "Debug string missing seeds: {filter:?}"
+    );
+    roundtrip_test(filter)
+}
+
+#[test]
+fn custom_proto_converter_intercepts() -> Result<()> {
+    #[derive(Default)]
+    struct CustomConverterInterceptor {
+        num_proto_plans: RwLock<usize>,
+        num_physical_plans: RwLock<usize>,
+        num_proto_exprs: RwLock<usize>,
+        num_physical_exprs: RwLock<usize>,
+    }
+
+    impl PhysicalProtoConverterExtension for CustomConverterInterceptor {
+        fn proto_to_execution_plan(
+            &self,
+            ctx: &TaskContext,
+            codec: &dyn PhysicalExtensionCodec,
+            proto: &protobuf::PhysicalPlanNode,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            {
+                let mut counter = self
+                    .num_proto_plans
+                    .write()
+                    .map_err(|err| exec_datafusion_err!("{err}"))?;
+                *counter += 1;
+            }
+            proto.try_into_physical_plan_with_converter(ctx, codec, self)
+        }
+
+        fn execution_plan_to_proto(
+            &self,
+            plan: &Arc<dyn ExecutionPlan>,
+            codec: &dyn PhysicalExtensionCodec,
+        ) -> Result<protobuf::PhysicalPlanNode>
+        where
+            Self: Sized,
+        {
+            {
+                let mut counter = self
+                    .num_physical_plans
+                    .write()
+                    .map_err(|err| exec_datafusion_err!("{err}"))?;
+                *counter += 1;
+            }
+            PhysicalPlanNode::try_from_physical_plan_with_converter(
+                Arc::clone(plan),
+                codec,
+                self,
+            )
+        }
+
+        fn proto_to_physical_expr(
+            &self,
+            proto: &PhysicalExprNode,
+            ctx: &TaskContext,
+            input_schema: &Schema,
+            codec: &dyn PhysicalExtensionCodec,
+        ) -> Result<Arc<dyn PhysicalExpr>>
+        where
+            Self: Sized,
+        {
+            {
+                let mut counter = self
+                    .num_proto_exprs
+                    .write()
+                    .map_err(|err| exec_datafusion_err!("{err}"))?;
+                *counter += 1;
+            }
+            parse_physical_expr_with_converter(proto, ctx, input_schema, codec, self)
+        }
+
+        fn physical_expr_to_proto(
+            &self,
+            expr: &Arc<dyn PhysicalExpr>,
+            codec: &dyn PhysicalExtensionCodec,
+        ) -> Result<PhysicalExprNode> {
+            {
+                let mut counter = self
+                    .num_physical_exprs
+                    .write()
+                    .map_err(|err| exec_datafusion_err!("{err}"))?;
+                *counter += 1;
+            }
+            serialize_physical_expr_with_converter(expr, codec, self)
+        }
+    }
+
+    let field_a = Field::new("a", DataType::Boolean, false);
+    let field_b = Field::new("b", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a, field_b]));
+    let sort_exprs = [
+        PhysicalSortExpr {
+            expr: col("a", &schema)?,
+            options: SortOptions {
+                descending: true,
+                nulls_first: false,
+            },
+        },
+        PhysicalSortExpr {
+            expr: col("b", &schema)?,
+            options: SortOptions {
+                descending: false,
+                nulls_first: true,
+            },
+        },
+    ]
+    .into();
+
+    let exec_plan = Arc::new(SortExec::new(sort_exprs, Arc::new(EmptyExec::new(schema))));
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = CustomConverterInterceptor::default();
+    roundtrip_test_and_return(exec_plan, &ctx, &codec, &proto_converter)?;
+
+    assert_eq!(*proto_converter.num_proto_exprs.read().unwrap(), 2);
+    assert_eq!(*proto_converter.num_physical_exprs.read().unwrap(), 2);
+    assert_eq!(*proto_converter.num_proto_plans.read().unwrap(), 2);
+    assert_eq!(*proto_converter.num_physical_plans.read().unwrap(), 2);
+
+    Ok(())
+}
+
+#[test]
+fn roundtrip_call_null_scalar_struct_dict() -> Result<()> {
+    let data_type = DataType::Struct(Fields::from(vec![Field::new(
+        "item",
+        DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
+        true,
+    )]));
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", data_type.clone(), true)]));
+    let scan = Arc::new(EmptyExec::new(Arc::clone(&schema)));
+    let scalar = lit(ScalarValue::try_from(data_type)?);
+    let filter = Arc::new(FilterExec::try_new(
+        Arc::new(BinaryExpr::new(scalar, Operator::Eq, col("a", &schema)?)),
+        scan,
+    )?);
+
+    roundtrip_test(filter)
+}
+
+/// Test that expression deduplication works during deserialization.
+/// When the same expression Arc is serialized multiple times, it should be
+/// deduplicated on deserialization (sharing the same Arc).
+#[test]
+fn test_expression_deduplication() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a shared expression that will be used multiple times
+    let shared_col: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+
+    // Create an InList expression that uses the same column Arc multiple times
+    // This simulates a real-world scenario where expressions are shared
+    let in_list_expr = in_list(
+        Arc::clone(&shared_col),
+        vec![lit(1i64), lit(2i64), lit(3i64)],
+        &false,
+        &schema,
+    )?;
+
+    // Create a binary expression that uses the shared column and the in_list result
+    let binary_expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+        Arc::clone(&shared_col),
+        Operator::Eq,
+        lit(42i64),
+    ));
+
+    // Create a plan that has both expressions (they share the `shared_col` Arc)
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+    let filter = FilterExecBuilder::new(in_list_expr, input).build()?;
+    let projection_exprs = vec![ProjectionExpr {
+        expr: binary_expr,
+        alias: "result".to_string(),
+    }];
+    let exec_plan =
+        Arc::new(ProjectionExec::try_new(projection_exprs, Arc::new(filter))?);
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // Perform roundtrip
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan) as Arc<dyn ExecutionPlan>,
+        &codec,
+        &proto_converter,
+    )?;
+
+    // Create a new converter for deserialization (fresh cache)
+    let deser_converter = DeduplicatingProtoConverter {};
+    let result_plan = physical_plan_from_bytes_with_proto_converter(
+        bytes.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &deser_converter,
+    )?;
+
+    // Verify the plan structure is correct
+    pretty_assertions::assert_eq!(format!("{exec_plan:?}"), format!("{result_plan:?}"));
+
+    Ok(())
+}
+
+/// Test that expression deduplication correctly shares Arcs for identical expressions.
+/// This test verifies the core deduplication behavior.
+#[test]
+fn test_expression_deduplication_arc_sharing() -> Result<()> {
+    use datafusion_proto::bytes::{
+        physical_plan_from_bytes_with_proto_converter,
+        physical_plan_to_bytes_with_proto_converter,
+    };
+
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a column expression
+    let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+
+    // Create a projection that uses the SAME Arc twice
+    // After roundtrip, both should point to the same Arc
+    let projection_exprs = vec![
+        ProjectionExpr {
+            expr: Arc::clone(&col_expr),
+            alias: "a1".to_string(),
+        },
+        ProjectionExpr {
+            expr: Arc::clone(&col_expr), // Same Arc!
+            alias: "a2".to_string(),
+        },
+    ];
+
+    let input = Arc::new(EmptyExec::new(schema));
+    let exec_plan = Arc::new(ProjectionExec::try_new(projection_exprs, input)?);
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // Serialize
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan) as Arc<dyn ExecutionPlan>,
+        &codec,
+        &proto_converter,
+    )?;
+
+    // Deserialize with a fresh converter
+    let deser_converter = DeduplicatingProtoConverter {};
+    let result_plan = physical_plan_from_bytes_with_proto_converter(
+        bytes.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &deser_converter,
+    )?;
+
+    // Get the projection from the result
+    let projection = result_plan
+        .as_any()
+        .downcast_ref::<ProjectionExec>()
+        .expect("Expected ProjectionExec");
+
+    let exprs: Vec<_> = projection.expr().iter().collect();
+    assert_eq!(exprs.len(), 2);
+
+    // The key test: both expressions should point to the same Arc after deduplication
+    // This is because they were the same Arc before serialization
+    assert!(
+        Arc::ptr_eq(&exprs[0].expr, &exprs[1].expr),
+        "Expected both expressions to share the same Arc after deduplication"
+    );
+
+    Ok(())
+}
+
+/// Test backward compatibility: protos without expr_id should still deserialize correctly.
+#[test]
+fn test_backward_compatibility_no_expr_id() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Manually create a proto without expr_id set
+    let proto = PhysicalExprNode {
+        expr_id: None, // Simulating old proto without this field
+        expr_type: Some(
+            datafusion_proto::protobuf::physical_expr_node::ExprType::Column(
+                datafusion_proto::protobuf::PhysicalColumn {
+                    name: "a".to_string(),
+                    index: 0,
+                },
+            ),
+        ),
+    };
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DefaultPhysicalProtoConverter {};
+
+    // Should deserialize without error
+    let result = proto_converter.proto_to_physical_expr(
+        &proto,
+        ctx.task_ctx().as_ref(),
+        &schema,
+        &codec,
+    )?;
+
+    // Verify the result is correct
+    let col = result
+        .as_any()
+        .downcast_ref::<Column>()
+        .expect("Expected Column");
+    assert_eq!(col.name(), "a");
+    assert_eq!(col.index(), 0);
+
+    Ok(())
+}
+
+/// Test that deduplication works within a single plan deserialization and that
+/// separate deserializations produce independent expressions (no cross-operation sharing).
+#[test]
+fn test_deduplication_within_plan_deserialization() -> Result<()> {
+    use datafusion_proto::bytes::{
+        physical_plan_from_bytes_with_proto_converter,
+        physical_plan_to_bytes_with_proto_converter,
+    };
+
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a plan with expressions that will be deduplicated
+    let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+    let projection_exprs = vec![
+        ProjectionExpr {
+            expr: Arc::clone(&col_expr),
+            alias: "a1".to_string(),
+        },
+        ProjectionExpr {
+            expr: Arc::clone(&col_expr), // Same Arc - will be deduplicated
+            alias: "a2".to_string(),
+        },
+    ];
+    let exec_plan = Arc::new(ProjectionExec::try_new(
+        projection_exprs,
+        Arc::new(EmptyExec::new(schema)),
+    )?);
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // Serialize
+    let bytes = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan) as Arc<dyn ExecutionPlan>,
+        &codec,
+        &proto_converter,
+    )?;
+
+    // First deserialization
+    let plan1 = physical_plan_from_bytes_with_proto_converter(
+        bytes.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &proto_converter,
+    )?;
+
+    // Check that the plan was deserialized correctly with deduplication
+    let projection1 = plan1
+        .as_any()
+        .downcast_ref::<ProjectionExec>()
+        .expect("Expected ProjectionExec");
+    let exprs1: Vec<_> = projection1.expr().iter().collect();
+    assert_eq!(exprs1.len(), 2);
+    assert!(
+        Arc::ptr_eq(&exprs1[0].expr, &exprs1[1].expr),
+        "Expected both expressions to share the same Arc after deduplication"
+    );
+
+    // Second deserialization
+    let plan2 = physical_plan_from_bytes_with_proto_converter(
+        bytes.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &proto_converter,
+    )?;
+
+    // Check that the second plan was also deserialized correctly
+    let projection2 = plan2
+        .as_any()
+        .downcast_ref::<ProjectionExec>()
+        .expect("Expected ProjectionExec");
+    let exprs2: Vec<_> = projection2.expr().iter().collect();
+    assert_eq!(exprs2.len(), 2);
+    assert!(
+        Arc::ptr_eq(&exprs2[0].expr, &exprs2[1].expr),
+        "Expected both expressions to share the same Arc after deduplication"
+    );
+
+    // Check that there was no deduplication across deserializations
+    assert!(
+        !Arc::ptr_eq(&exprs1[0].expr, &exprs2[0].expr),
+        "Expected expressions from different deserializations to be different Arcs"
+    );
+    assert!(
+        !Arc::ptr_eq(&exprs1[1].expr, &exprs2[1].expr),
+        "Expected expressions from different deserializations to be different Arcs"
+    );
+
+    Ok(())
+}
+
+/// Test that deduplication works within direct expression deserialization and that
+/// separate deserializations produce independent expressions (no cross-operation sharing).
+#[test]
+fn test_deduplication_within_expr_deserialization() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a binary expression where both sides are the same Arc
+    // This allows us to test deduplication within a single deserialization
+    let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+    let binary_expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+        Arc::clone(&col_expr),
+        Operator::Plus,
+        Arc::clone(&col_expr), // Same Arc - will be deduplicated
+    ));
+
+    let ctx = SessionContext::new();
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // Serialize the expression
+    let proto = proto_converter.physical_expr_to_proto(&binary_expr, &codec)?;
+
+    // First expression deserialization
+    let expr1 = proto_converter.proto_to_physical_expr(
+        &proto,
+        ctx.task_ctx().as_ref(),
+        &schema,
+        &codec,
+    )?;
+
+    // Check that deduplication worked within the deserialization
+    let binary1 = expr1
+        .as_any()
+        .downcast_ref::<BinaryExpr>()
+        .expect("Expected BinaryExpr");
+    assert!(
+        Arc::ptr_eq(binary1.left(), binary1.right()),
+        "Expected both sides to share the same Arc after deduplication"
+    );
+
+    // Second expression deserialization
+    let expr2 = proto_converter.proto_to_physical_expr(
+        &proto,
+        ctx.task_ctx().as_ref(),
+        &schema,
+        &codec,
+    )?;
+
+    // Check that the second expression was also deserialized correctly
+    let binary2 = expr2
+        .as_any()
+        .downcast_ref::<BinaryExpr>()
+        .expect("Expected BinaryExpr");
+    assert!(
+        Arc::ptr_eq(binary2.left(), binary2.right()),
+        "Expected both sides to share the same Arc after deduplication"
+    );
+
+    // Check that there was no deduplication across deserializations
+    assert!(
+        !Arc::ptr_eq(binary1.left(), binary2.left()),
+        "Expected expressions from different deserializations to be different Arcs"
+    );
+    assert!(
+        !Arc::ptr_eq(binary1.right(), binary2.right()),
+        "Expected expressions from different deserializations to be different Arcs"
+    );
+
+    Ok(())
+}
+
+/// Test that session_id rotates between top-level serialization operations.
+/// This verifies that each top-level serialization gets a fresh session_id,
+/// which prevents cross-process collisions when serialized plans are merged.
+#[test]
+fn test_session_id_rotation_between_serializations() -> Result<()> {
+    let field_a = Field::new("a", DataType::Int64, false);
+    let _schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a simple expression
+    let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // First serialization
+    let proto1 = proto_converter.physical_expr_to_proto(&col_expr, &codec)?;
+    let expr_id1 = proto1.expr_id.expect("Expected expr_id to be set");
+
+    // Second serialization with the same converter
+    // The session_id should have rotated, so the expr_id should be different
+    // even though we're serializing the same expression (same pointer address)
+    let proto2 = proto_converter.physical_expr_to_proto(&col_expr, &codec)?;
+    let expr_id2 = proto2.expr_id.expect("Expected expr_id to be set");
+
+    // The expr_ids should be different because session_id rotated
+    assert_ne!(
+        expr_id1, expr_id2,
+        "Expected different expr_ids due to session_id rotation between serializations"
+    );
+
+    // Also test that serializing the same expression multiple times within
+    // the same top-level operation would give the same expr_id (not testable
+    // here directly since each physical_expr_to_proto is a top-level operation,
+    // but the deduplication tests verify this indirectly)
+
+    Ok(())
+}
+
+/// Test that session_id rotation works correctly with execution plans.
+/// This verifies the end-to-end behavior with plan serialization.
+#[test]
+fn test_session_id_rotation_with_execution_plans() -> Result<()> {
+    use datafusion_proto::bytes::physical_plan_to_bytes_with_proto_converter;
+
+    let field_a = Field::new("a", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a]));
+
+    // Create a simple plan
+    let col_expr: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+    let projection_exprs = vec![ProjectionExpr {
+        expr: Arc::clone(&col_expr),
+        alias: "a1".to_string(),
+    }];
+    let exec_plan = Arc::new(ProjectionExec::try_new(
+        projection_exprs.clone(),
+        Arc::new(EmptyExec::new(Arc::clone(&schema))),
+    )?);
+
+    let codec = DefaultPhysicalExtensionCodec {};
+    let proto_converter = DeduplicatingProtoConverter {};
+
+    // First serialization
+    let bytes1 = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan) as Arc<dyn ExecutionPlan>,
+        &codec,
+        &proto_converter,
+    )?;
+
+    // Second serialization with the same converter
+    let bytes2 = physical_plan_to_bytes_with_proto_converter(
+        Arc::clone(&exec_plan) as Arc<dyn ExecutionPlan>,
+        &codec,
+        &proto_converter,
+    )?;
+
+    // The serialized bytes should be different due to different session_ids
+    // (specifically, the expr_id values embedded in the protobuf will differ)
+    assert_ne!(
+        bytes1.as_ref(),
+        bytes2.as_ref(),
+        "Expected different serialized bytes due to session_id rotation"
+    );
+
+    // But both should deserialize correctly
+    let ctx = SessionContext::new();
+    let deser_converter = DeduplicatingProtoConverter {};
+
+    let plan1 = datafusion_proto::bytes::physical_plan_from_bytes_with_proto_converter(
+        bytes1.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &deser_converter,
+    )?;
+
+    let plan2 = datafusion_proto::bytes::physical_plan_from_bytes_with_proto_converter(
+        bytes2.as_ref(),
+        ctx.task_ctx().as_ref(),
+        &codec,
+        &deser_converter,
+    )?;
+
+    // Verify both plans have the expected structure
+    assert_eq!(plan1.schema(), plan2.schema());
+
+    Ok(())
+}
+
+/// Tests that `lead` window function with offset and default value args
+/// survives a protobuf round-trip. This is a regression test for a bug
+/// where `expressions()` (used during serialization) returns only the
+/// column expression for lead/lag, silently dropping the offset and
+/// default value literal args.
+#[test]
+fn roundtrip_lead_with_default_value() -> Result<()> {
+    use datafusion::functions_window::lead_lag::lead_udwf;
+
+    let field_a = Field::new("a", DataType::Int64, false);
+    let field_b = Field::new("b", DataType::Int64, false);
+    let schema = Arc::new(Schema::new(vec![field_a, field_b]));
+
+    // lead(a, 2, 42) — column a, offset 2, default value 42
+    let lead_window = create_udwf_window_expr(
+        &lead_udwf(),
+        &[col("a", &schema)?, lit(2i64), lit(42i64)],
+        schema.as_ref(),
+        "test lead with default".to_string(),
+        false,
+    )?;
+
+    let udwf_expr = Arc::new(StandardWindowExpr::new(
+        lead_window,
+        &[col("b", &schema)?],
+        &[PhysicalSortExpr {
+            expr: col("a", &schema)?,
+            options: SortOptions {
+                descending: false,
+                nulls_first: false,
+            },
+        }],
+        Arc::new(WindowFrame::new(None)),
+    ));
+
+    let input = Arc::new(EmptyExec::new(schema.clone()));
+
+    roundtrip_test(Arc::new(BoundedWindowAggExec::try_new(
+        vec![udwf_expr],
+        input,
+        InputOrderMode::Sorted,
+        true,
+    )?))
+}
diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs
index ed99150831e7a..bb955a426ca78 100644
--- a/datafusion/proto/tests/cases/serialize.rs
+++ b/datafusion/proto/tests/cases/serialize.rs
@@ -18,16 +18,17 @@
 use std::sync::Arc;
 
 use arrow::array::ArrayRef;
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field};
 
 use datafusion::execution::FunctionRegistry;
 use datafusion::prelude::SessionContext;
-use datafusion_expr::{col, create_udf, lit, ColumnarValue};
+use datafusion_expr::expr::Placeholder;
+use datafusion_expr::{ColumnarValue, col, create_udf, lit};
 use datafusion_expr::{Expr, Volatility};
 use datafusion_functions::string;
 use datafusion_proto::bytes::Serializeable;
-use datafusion_proto::logical_plan::to_proto::serialize_expr;
 use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec;
+use datafusion_proto::logical_plan::to_proto::serialize_expr;
 
 #[test]
 #[should_panic(
@@ -41,7 +42,7 @@ fn bad_decode() {
 #[cfg(feature = "json")]
 fn plan_to_json() {
     use datafusion_common::DFSchema;
-    use datafusion_expr::{logical_plan::EmptyRelation, LogicalPlan};
+    use datafusion_expr::{LogicalPlan, logical_plan::EmptyRelation};
     use datafusion_proto::bytes::logical_plan_to_json;
 
     let plan = LogicalPlan::EmptyRelation(EmptyRelation {
@@ -61,7 +62,7 @@ fn json_to_plan() {
 
     let input = r#"{"emptyRelation":{}}"#.to_string();
     let ctx = SessionContext::new();
-    let actual = logical_plan_from_json(&input, &ctx).unwrap();
+    let actual = logical_plan_from_json(&input, &ctx.task_ctx()).unwrap();
     let result = matches!(actual, LogicalPlan::EmptyRelation(_));
     assert!(result, "Should parse empty relation");
 }
@@ -136,6 +137,21 @@ fn roundtrip_qualified_alias() {
     assert_eq!(qual_alias, roundtrip_expr(&qual_alias));
 }
 
+#[test]
+fn roundtrip_placeholder_with_metadata() {
+    let expr = Expr::Placeholder(Placeholder::new_with_field(
+        "placeholder_id".to_string(),
+        Some(
+            Field::new("", DataType::Utf8, false)
+                .with_metadata(
+                    [("some_key".to_string(), "some_value".to_string())].into(),
+                )
+                .into(),
+        ),
+    ));
+    assert_eq!(expr, roundtrip_expr(&expr));
+}
+
 #[test]
 fn roundtrip_deeply_nested_binary_expr() {
     // We need more stack space so this doesn't overflow in dev builds
@@ -256,7 +272,7 @@ fn test_expression_serialization_roundtrip() {
     use datafusion_proto::logical_plan::from_proto::parse_expr;
 
     let ctx = SessionContext::new();
-    let lit = Expr::Literal(ScalarValue::Utf8(None));
+    let lit = Expr::Literal(ScalarValue::Utf8(None), None);
     for function in string::functions() {
         // default to 4 args (though some exprs like substr have error checking)
         let num_args = 4;
diff --git a/datafusion/proto/tests/testdata/test.arrow b/datafusion/proto/tests/testdata/test.arrow
new file mode 100644
index 0000000000000..5314d9eea1345
Binary files /dev/null and b/datafusion/proto/tests/testdata/test.arrow differ
diff --git a/datafusion/pruning/Cargo.toml b/datafusion/pruning/Cargo.toml
new file mode 100644
index 0000000000000..e6f4bb6f273c9
--- /dev/null
+++ b/datafusion/pruning/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "datafusion-pruning"
+description = "DataFusion Pruning Logic"
+readme = "README.md"
+version = { workspace = true }
+edition = { workspace = true }
+homepage = { workspace = true }
+repository = { workspace = true }
+license = { workspace = true }
+authors = { workspace = true }
+
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
+[lints]
+workspace = true
+
+[dependencies]
+arrow = { workspace = true }
+datafusion-common = { workspace = true, default-features = true }
+datafusion-datasource = { workspace = true }
+datafusion-expr-common = { workspace = true, default-features = true }
+datafusion-physical-expr = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
+datafusion-physical-plan = { workspace = true }
+log = { workspace = true }
+
+[dev-dependencies]
+datafusion-expr = { workspace = true }
+datafusion-functions-nested = { workspace = true }
+insta = { workspace = true }
+itertools = { workspace = true }
diff --git a/datafusion/pruning/LICENSE.txt b/datafusion/pruning/LICENSE.txt
new file mode 120000
index 0000000000000..1ef648f64b34f
--- /dev/null
+++ b/datafusion/pruning/LICENSE.txt
@@ -0,0 +1 @@
+../../LICENSE.txt
\ No newline at end of file
diff --git a/datafusion/pruning/NOTICE.txt b/datafusion/pruning/NOTICE.txt
new file mode 120000
index 0000000000000..fb051c92b10b2
--- /dev/null
+++ b/datafusion/pruning/NOTICE.txt
@@ -0,0 +1 @@
+../../NOTICE.txt
\ No newline at end of file
diff --git a/datafusion/pruning/README.md b/datafusion/pruning/README.md
new file mode 100644
index 0000000000000..4db509193d172
--- /dev/null
+++ b/datafusion/pruning/README.md
@@ -0,0 +1,34 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache DataFusion Pruning Logic
+
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that contains pruning logic, to analyze filter expressions with
+statistics such as min/max values and null counts, proving files / large subsections of files can be skipped
+without reading the actual data.
+
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/pruning/src/file_pruner.rs b/datafusion/pruning/src/file_pruner.rs
new file mode 100644
index 0000000000000..f850e0c0114fb
--- /dev/null
+++ b/datafusion/pruning/src/file_pruner.rs
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! File-level pruning based on partition values and file-level statistics
+
+use std::sync::Arc;
+
+use arrow::datatypes::{FieldRef, SchemaRef};
+use datafusion_common::{Result, internal_datafusion_err, pruning::PrunableStatistics};
+use datafusion_datasource::PartitionedFile;
+use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, snapshot_generation};
+use datafusion_physical_plan::metrics::Count;
+use log::debug;
+
+use crate::build_pruning_predicate;
+
+/// Prune based on file-level statistics.
+///
+/// Note: Partition column pruning is handled earlier via `replace_columns_with_literals`
+/// which substitutes partition column references with their literal values before
+/// the predicate reaches this pruner.
+pub struct FilePruner {
+    predicate_generation: Option<u64>,
+    predicate: Arc<dyn PhysicalExpr>,
+    /// Schema used for pruning (the logical file schema).
+    file_schema: SchemaRef,
+    file_stats_pruning: PrunableStatistics,
+    predicate_creation_errors: Count,
+}
+
+impl FilePruner {
+    #[deprecated(
+        since = "52.0.0",
+        note = "Use `try_new` instead which returns None if no statistics are available"
+    )]
+    #[expect(clippy::needless_pass_by_value)]
+    pub fn new(
+        predicate: Arc<dyn PhysicalExpr>,
+        logical_file_schema: &SchemaRef,
+        _partition_fields: Vec<FieldRef>,
+        partitioned_file: PartitionedFile,
+        predicate_creation_errors: Count,
+    ) -> Result<Self> {
+        Self::try_new(
+            predicate,
+            logical_file_schema,
+            &partitioned_file,
+            predicate_creation_errors,
+        )
+        .ok_or_else(|| {
+            internal_datafusion_err!(
+                "FilePruner::new called on a file without statistics: {:?}",
+                partitioned_file
+            )
+        })
+    }
+
+    /// Create a new file pruner if statistics are available.
+    /// Returns None if this file does not have statistics.
+    pub fn try_new(
+        predicate: Arc<dyn PhysicalExpr>,
+        file_schema: &SchemaRef,
+        partitioned_file: &PartitionedFile,
+        predicate_creation_errors: Count,
+    ) -> Option<Self> {
+        let file_stats = partitioned_file.statistics.as_ref()?;
+        let file_stats_pruning =
+            PrunableStatistics::new(vec![file_stats.clone()], Arc::clone(file_schema));
+        Some(Self {
+            predicate_generation: None,
+            predicate,
+            file_schema: Arc::clone(file_schema),
+            file_stats_pruning,
+            predicate_creation_errors,
+        })
+    }
+
+    pub fn should_prune(&mut self) -> Result<bool> {
+        // Check if the predicate has changed since last invocation by tracking
+        // its "generation". Dynamic filter expressions can change their values
+        // during query execution, so we use generation tracking to detect when
+        // the predicate has been updated and needs to be rebuilt.
+        //
+        // If the generation hasn't changed, we can skip rebuilding the pruning
+        // predicate, which is an expensive operation involving expression analysis.
+        let new_generation = snapshot_generation(&self.predicate);
+        if let Some(current_generation) = self.predicate_generation.as_mut() {
+            if *current_generation == new_generation {
+                return Ok(false);
+            }
+            *current_generation = new_generation;
+        } else {
+            self.predicate_generation = Some(new_generation);
+        }
+        let pruning_predicate = build_pruning_predicate(
+            Arc::clone(&self.predicate),
+            &self.file_schema,
+            &self.predicate_creation_errors,
+        );
+        let Some(pruning_predicate) = pruning_predicate else {
+            return Ok(false);
+        };
+        match pruning_predicate.prune(&self.file_stats_pruning) {
+            Ok(values) => {
+                assert!(values.len() == 1);
+                // We expect a single container -> if all containers are false skip this file
+                if values.into_iter().all(|v| !v) {
+                    return Ok(true);
+                }
+            }
+            // Stats filter array could not be built, so we can't prune
+            Err(e) => {
+                debug!("Ignoring error building pruning predicate for file: {e}");
+                self.predicate_creation_errors.add(1);
+            }
+        }
+
+        Ok(false)
+    }
+}
diff --git a/datafusion/pruning/src/lib.rs b/datafusion/pruning/src/lib.rs
new file mode 100644
index 0000000000000..be17f29eaafa0
--- /dev/null
+++ b/datafusion/pruning/src/lib.rs
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
+
+mod file_pruner;
+mod pruning_predicate;
+
+pub use file_pruner::FilePruner;
+pub use pruning_predicate::{
+    PredicateRewriter, PruningPredicate, PruningStatistics, RequiredColumns,
+    UnhandledPredicateHook, build_pruning_predicate,
+};
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/pruning/src/pruning_predicate.rs
similarity index 91%
rename from datafusion/physical-optimizer/src/pruning.rs
rename to datafusion/pruning/src/pruning_predicate.rs
index 1beaa0eb00186..6f6b00e80abc2 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/pruning/src/pruning_predicate.rs
@@ -24,25 +24,28 @@ use std::sync::Arc;
 
 use arrow::array::AsArray;
 use arrow::{
-    array::{new_null_array, ArrayRef, BooleanArray},
+    array::{ArrayRef, BooleanArray, new_null_array},
     datatypes::{DataType, Field, Schema, SchemaRef},
     record_batch::{RecordBatch, RecordBatchOptions},
 };
-use datafusion_common::pruning::PruningStatistics;
+// pub use for backwards compatibility
+pub use datafusion_common::pruning::PruningStatistics;
+use datafusion_physical_expr::simplifier::PhysicalExprSimplifier;
+use datafusion_physical_plan::metrics::Count;
 use log::{debug, trace};
 
-use datafusion_common::error::{DataFusionError, Result};
-use datafusion_common::tree_node::TransformedResult;
+use datafusion_common::error::Result;
+use datafusion_common::tree_node::{TransformedResult, TreeNodeRecursion};
+use datafusion_common::{Column, DFSchema, assert_eq_or_internal_err};
 use datafusion_common::{
-    internal_err, plan_datafusion_err, plan_err,
+    ScalarValue, internal_datafusion_err, plan_datafusion_err, plan_err,
     tree_node::{Transformed, TreeNode},
-    ScalarValue,
 };
-use datafusion_common::{Column, DFSchema};
 use datafusion_expr_common::operator::Operator;
-use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee};
-use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef};
-use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr;
+use datafusion_physical_expr::expressions::CastColumnExpr;
+use datafusion_physical_expr::utils::{Guarantee, LiteralGuarantee};
+use datafusion_physical_expr::{PhysicalExprRef, expressions as phys_expr};
+use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr_opt;
 use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
 
 /// Used to prove that arbitrary predicates (boolean expression) can not
@@ -83,7 +86,7 @@ use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
 /// example of how to use `PruningPredicate` to prune files based on min/max
 /// values.
 ///
-/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/pruning.rs
+/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/pruning.rs
 ///
 /// Given an expression like `x = 5` and statistics for 3 containers (Row
 /// Groups, files, etc) `A`, `B`, and `C`:
@@ -235,7 +238,7 @@ use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
 /// Original Predicate | Rewritten Predicate
 /// ------------------ | --------------------
 /// `x = 5` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)`
-/// `x < 5` | `x_null_count != x_row_count THEN false (x_max < 5)`
+/// `x < 5` | `x_null_count != x_row_count AND (x_min < 5)`
 /// `x = 5 AND y = 10` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) AND y_null_count != y_row_count (y_min <= 10 AND 10 <= y_max)`
 /// `x IS NULL`  | `x_null_count > 0`
 /// `x IS NOT NULL`  | `x_null_count != row_count`
@@ -375,6 +378,30 @@ pub struct PruningPredicate {
     literal_guarantees: Vec<LiteralGuarantee>,
 }
 
+/// Build a pruning predicate from an optional predicate expression.
+/// If the predicate is None or the predicate cannot be converted to a pruning
+/// predicate, return None.
+/// If there is an error creating the pruning predicate it is recorded by incrementing
+/// the `predicate_creation_errors` counter.
+pub fn build_pruning_predicate(
+    predicate: Arc<dyn PhysicalExpr>,
+    file_schema: &SchemaRef,
+    predicate_creation_errors: &Count,
+) -> Option<Arc<PruningPredicate>> {
+    match PruningPredicate::try_new(predicate, Arc::clone(file_schema)) {
+        Ok(pruning_predicate) => {
+            if !pruning_predicate.always_true() {
+                return Some(Arc::new(pruning_predicate));
+            }
+        }
+        Err(e) => {
+            debug!("Could not create pruning predicate for: {e}");
+            predicate_creation_errors.add(1);
+        }
+    }
+    None
+}
+
 /// Rewrites predicates that [`PredicateRewriter`] can not handle, e.g. certain
 /// complex expressions or predicates that reference columns that are not in the
 /// schema.
@@ -428,21 +455,43 @@ impl PruningPredicate {
     ///
     /// See the struct level documentation on [`PruningPredicate`] for more
     /// details.
-    pub fn try_new(expr: Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Result<Self> {
-        // Get a (simpler) snapshot of the physical expr here to use with `PruningPredicate`
-        // which does not handle dynamic exprs  in general
-        let expr = snapshot_physical_expr(expr)?;
+    ///
+    /// Note that `PruningPredicate` does not attempt to normalize or simplify
+    /// the input expression unless calling [`snapshot_physical_expr_opt`]
+    /// returns a new expression.
+    /// It is recommended that you pass the expressions through [`PhysicalExprSimplifier`]
+    /// before calling this method to make sure the expressions can be used for pruning.
+    pub fn try_new(mut expr: Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Result<Self> {
+        // Get a (simpler) snapshot of the physical expr here to use with `PruningPredicate`.
+        // In particular this unravels any `DynamicFilterPhysicalExpr`s by snapshotting them
+        // so that PruningPredicate can work with a static expression.
+        let tf = snapshot_physical_expr_opt(expr)?;
+        if tf.transformed {
+            // If we had an expression such as Dynamic(part_col < 5 and col < 10)
+            // (this could come from something like `select * from t order by part_col, col, limit 10`)
+            // after snapshotting and because `DynamicFilterPhysicalExpr` applies child replacements to its
+            // children after snapshotting and previously `replace_columns_with_literals` may have been called with partition values
+            // the expression we have now is `8 < 5 and col < 10`.
+            // Thus we need as simplifier pass to get `false and col < 10` => `false` here.
+            let simplifier = PhysicalExprSimplifier::new(&schema);
+            expr = simplifier.simplify(tf.data)?;
+        } else {
+            expr = tf.data;
+        }
         let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _;
 
         // build predicate expression once
         let mut required_columns = RequiredColumns::new();
         let predicate_expr = build_predicate_expression(
             &expr,
-            schema.as_ref(),
+            &schema,
             &mut required_columns,
             &unhandled_hook,
         );
-
+        let predicate_schema = required_columns.schema();
+        // Simplify the newly created predicate to get rid of redundant casts, comparisons, etc.
+        let predicate_expr =
+            PhysicalExprSimplifier::new(&predicate_schema).simplify(predicate_expr)?;
         let literal_guarantees = LiteralGuarantee::analyze(&expr);
 
         Ok(Self {
@@ -468,7 +517,10 @@ impl PruningPredicate {
     /// simplified version `b`. See [`ExprSimplifier`] to simplify expressions.
     ///
     /// [`ExprSimplifier`]: https://docs.rs/datafusion/latest/datafusion/optimizer/simplify_expressions/struct.ExprSimplifier.html
-    pub fn prune<S: PruningStatistics>(&self, statistics: &S) -> Result<Vec<bool>> {
+    pub fn prune<S: PruningStatistics + ?Sized>(
+        &self,
+        statistics: &S,
+    ) -> Result<Vec<bool>> {
         let mut builder = BoolVecBuilder::new(statistics.num_containers());
 
         // Try to prove the predicate can't be true for the containers based on
@@ -486,9 +538,9 @@ impl PruningPredicate {
                     // If `contained` returns false, that means the column is
                     // not any of the values so we can prune the container
                     Guarantee::In => builder.combine_array(&results),
-                    // `NotIn` means the values in the column must must not be
+                    // `NotIn` means the values in the column must not be
                     // any of the values in the set for the predicate to
-                    // evaluate to true. If contained returns true, it means the
+                    // evaluate to true. If `contained` returns true, it means the
                     // column is only in the set of values so we can prune the
                     // container
                     Guarantee::NotIn => {
@@ -551,8 +603,6 @@ impl PruningPredicate {
         is_always_true(&self.predicate_expr) && self.literal_guarantees.is_empty()
     }
 
-    // this is only used by `parquet` feature right now
-    #[allow(dead_code)]
     pub fn required_columns(&self) -> &RequiredColumns {
         &self.required_columns
     }
@@ -691,8 +741,6 @@ impl RequiredColumns {
     /// * `a > 5 OR a < 10` returns `Some(a)`
     /// * `a > 5 OR b < 10` returns `None`
     /// * `true` returns None
-    #[allow(dead_code)]
-    // this fn is only used by `parquet` feature right now, thus the `allow(dead_code)`
     pub fn single_column(&self) -> Option<&phys_expr::Column> {
         if self.columns.windows(2).all(|w| {
             // check if all columns are the same (ignoring statistics and field)
@@ -706,6 +754,21 @@ impl RequiredColumns {
         }
     }
 
+    /// Returns a schema that describes the columns required to evaluate this
+    /// pruning predicate.
+    /// The schema contains the fields for each column in `self.columns` with
+    /// the appropriate data type for the statistics.
+    /// Order matters, this same order is used to evaluate the
+    /// pruning predicate.
+    fn schema(&self) -> Schema {
+        let fields = self
+            .columns
+            .iter()
+            .map(|(_c, _t, f)| f.clone())
+            .collect::<Vec<_>>();
+        Schema::new(fields)
+    }
+
     /// Returns an iterator over items in columns (see doc on
     /// `self.columns` for details)
     pub(crate) fn iter(
@@ -827,13 +890,13 @@ impl From<Vec<(phys_expr::Column, StatisticsType, Field)>> for RequiredColumns {
 
 /// Build a RecordBatch from a list of statistics, creating arrays,
 /// with one row for each PruningStatistics and columns specified in
-/// in the required_columns parameter.
+/// the required_columns parameter.
 ///
 /// For example, if the requested columns are
 /// ```text
 /// ("s1", Min, Field:s1_min)
 /// ("s2", Max, field:s2_max)
-///```
+/// ```
 ///
 /// And the input statistics had
 /// ```text
@@ -850,11 +913,10 @@ impl From<Vec<(phys_expr::Column, StatisticsType, Field)>> for RequiredColumns {
 /// -------+--------
 ///   5    | 1000
 /// ```
-fn build_statistics_record_batch<S: PruningStatistics>(
+fn build_statistics_record_batch<S: PruningStatistics + ?Sized>(
     statistics: &S,
     required_columns: &RequiredColumns,
 ) -> Result<RecordBatch> {
-    let mut fields = Vec::<Field>::new();
     let mut arrays = Vec::<ArrayRef>::new();
     // For each needed statistics column:
     for (column, statistics_type, stat_field) in required_columns.iter() {
@@ -871,23 +933,22 @@ fn build_statistics_record_batch<S: PruningStatistics>(
         };
         let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers));
 
-        if num_containers != array.len() {
-            return internal_err!(
-                "mismatched statistics length. Expected {}, got {}",
-                num_containers,
-                array.len()
-            );
-        }
+        assert_eq_or_internal_err!(
+            num_containers,
+            array.len(),
+            "mismatched statistics length. Expected {}, got {}",
+            num_containers,
+            array.len()
+        );
 
         // cast statistics array to required data type (e.g. parquet
         // provides timestamp statistics as "Int64")
         let array = arrow::compute::cast(&array, data_type)?;
 
-        fields.push(stat_field.clone());
         arrays.push(array);
     }
 
-    let schema = Arc::new(Schema::new(fields));
+    let schema = Arc::new(required_columns.schema());
     // provide the count in case there were no needed statistics
     let mut options = RecordBatchOptions::default();
     options.row_count = Some(statistics.num_containers());
@@ -912,33 +973,49 @@ impl<'a> PruningExpressionBuilder<'a> {
     fn try_new(
         left: &'a Arc<dyn PhysicalExpr>,
         right: &'a Arc<dyn PhysicalExpr>,
+        left_columns: ColumnReferenceCount,
+        right_columns: ColumnReferenceCount,
         op: Operator,
-        schema: &'a Schema,
+        schema: &'a SchemaRef,
         required_columns: &'a mut RequiredColumns,
     ) -> Result<Self> {
         // find column name; input could be a more complicated expression
-        let left_columns = collect_columns(left);
-        let right_columns = collect_columns(right);
-        let (column_expr, scalar_expr, columns, correct_operator) =
-            match (left_columns.len(), right_columns.len()) {
-                (1, 0) => (left, right, left_columns, op),
-                (0, 1) => (right, left, right_columns, reverse_operator(op)?),
-                _ => {
-                    // if more than one column used in expression - not supported
-                    return plan_err!(
-                        "Multi-column expressions are not currently supported"
-                    );
-                }
-            };
+        let (column_expr, scalar_expr, column, correct_operator) = match (
+            left_columns,
+            right_columns,
+        ) {
+            (ColumnReferenceCount::One(column), ColumnReferenceCount::Zero) => {
+                (left, right, column, op)
+            }
+            (ColumnReferenceCount::Zero, ColumnReferenceCount::One(column)) => {
+                (right, left, column, reverse_operator(op)?)
+            }
+            (ColumnReferenceCount::One(_), ColumnReferenceCount::One(_)) => {
+                // both sides have one column - not supported
+                return plan_err!(
+                    "Expression not supported for pruning: left has 1 column, right has 1 column"
+                );
+            }
+            (ColumnReferenceCount::Zero, ColumnReferenceCount::Zero) => {
+                // both sides are literals - should be handled before calling try_new
+                return plan_err!(
+                    "Pruning literal expressions is not supported, please call PhysicalExprSimplifier first"
+                );
+            }
+            (ColumnReferenceCount::Many, _) | (_, ColumnReferenceCount::Many) => {
+                return plan_err!(
+                    "Expression not supported for pruning: left or right has multiple columns"
+                );
+            }
+        };
 
-        let df_schema = DFSchema::try_from(schema.clone())?;
+        let df_schema = DFSchema::try_from(Arc::clone(schema))?;
         let (column_expr, correct_operator, scalar_expr) = rewrite_expr_to_prunable(
             column_expr,
             correct_operator,
             scalar_expr,
             df_schema,
         )?;
-        let column = columns.iter().next().unwrap().clone();
         let field = match schema.column_with_name(column.name()) {
             Some((_, f)) => f,
             _ => {
@@ -1047,8 +1124,8 @@ fn rewrite_expr_to_prunable(
         Ok((Arc::clone(column_expr), op, Arc::clone(scalar_expr)))
     } else if let Some(cast) = column_expr_any.downcast_ref::<phys_expr::CastExpr>() {
         // `cast(col) op lit()`
-        let arrow_schema: SchemaRef = schema.clone().into();
-        let from_type = cast.expr().data_type(&arrow_schema)?;
+        let arrow_schema = schema.as_arrow();
+        let from_type = cast.expr().data_type(arrow_schema)?;
         verify_support_type_for_prune(&from_type, cast.cast_type())?;
         let (left, op, right) =
             rewrite_expr_to_prunable(cast.expr(), op, scalar_expr, schema)?;
@@ -1058,12 +1135,26 @@ fn rewrite_expr_to_prunable(
             None,
         ));
         Ok((left, op, right))
+    } else if let Some(cast_col) = column_expr_any.downcast_ref::<CastColumnExpr>() {
+        // `cast_column(col) op lit()` - same as CastExpr but uses CastColumnExpr
+        let arrow_schema = schema.as_arrow();
+        let from_type = cast_col.expr().data_type(arrow_schema)?;
+        let to_type = cast_col.target_field().data_type();
+        verify_support_type_for_prune(&from_type, to_type)?;
+        let (left, op, right) =
+            rewrite_expr_to_prunable(cast_col.expr(), op, scalar_expr, schema)?;
+        // Predicate pruning / statistics generally don't support struct columns yet.
+        // In the future we may want to support pruning on nested fields, in which case we probably need to
+        // do something more sophisticated here.
+        // But for now since we don't support pruning on nested fields, we can just cast to the target type directly.
+        let left = Arc::new(phys_expr::CastExpr::new(left, to_type.clone(), None));
+        Ok((left, op, right))
     } else if let Some(try_cast) =
         column_expr_any.downcast_ref::<phys_expr::TryCastExpr>()
     {
         // `try_cast(col) op lit()`
-        let arrow_schema: SchemaRef = schema.clone().into();
-        let from_type = try_cast.expr().data_type(&arrow_schema)?;
+        let arrow_schema = schema.as_arrow();
+        let from_type = try_cast.expr().data_type(arrow_schema)?;
         verify_support_type_for_prune(&from_type, try_cast.cast_type())?;
         let (left, op, right) =
             rewrite_expr_to_prunable(try_cast.expr(), op, scalar_expr, schema)?;
@@ -1114,13 +1205,6 @@ fn is_compare_op(op: Operator) -> bool {
     )
 }
 
-fn is_string_type(data_type: &DataType) -> bool {
-    matches!(
-        data_type,
-        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
-    )
-}
-
 // The pruning logic is based on the comparing the min/max bounds.
 // Must make sure the two type has order.
 // For example, casts from string to numbers is not correct.
@@ -1129,20 +1213,20 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re
     // Dictionary casts are always supported as long as the value types are supported
     let from_type = match from_type {
         DataType::Dictionary(_, t) => {
-            return verify_support_type_for_prune(t.as_ref(), to_type)
+            return verify_support_type_for_prune(t.as_ref(), to_type);
         }
         _ => from_type,
     };
     let to_type = match to_type {
         DataType::Dictionary(_, t) => {
-            return verify_support_type_for_prune(from_type, t.as_ref())
+            return verify_support_type_for_prune(from_type, t.as_ref());
         }
         _ => to_type,
     };
     // If both types are strings or both are not strings (number, timestamp, etc)
     // then we can compare them.
     // PruningPredicate does not support casting of strings to numbers and such.
-    if is_string_type(from_type) == is_string_type(to_type) {
+    if from_type.is_string() == to_type.is_string() {
         Ok(())
     } else {
         plan_err!(
@@ -1158,10 +1242,10 @@ fn rewrite_column_expr(
     column_new: &phys_expr::Column,
 ) -> Result<Arc<dyn PhysicalExpr>> {
     e.transform(|expr| {
-        if let Some(column) = expr.as_any().downcast_ref::<phys_expr::Column>() {
-            if column == column_old {
-                return Ok(Transformed::yes(Arc::new(column_new.clone())));
-            }
+        if let Some(column) = expr.as_any().downcast_ref::<phys_expr::Column>()
+            && column == column_old
+        {
+            return Ok(Transformed::yes(Arc::new(column_new.clone())));
         }
 
         Ok(Transformed::no(expr))
@@ -1171,9 +1255,9 @@ fn rewrite_column_expr(
 
 fn reverse_operator(op: Operator) -> Result<Operator> {
     op.swap().ok_or_else(|| {
-        DataFusionError::Internal(format!(
+        internal_datafusion_err!(
             "Could not reverse operator {op} while building pruning predicate"
-        ))
+        )
     })
 }
 
@@ -1189,7 +1273,7 @@ fn build_single_column_expr(
 ) -> Option<Arc<dyn PhysicalExpr>> {
     let field = schema.field_with_name(column.name()).ok()?;
 
-    if matches!(field.data_type(), &DataType::Boolean) {
+    if *field.data_type() == DataType::Boolean {
         let col_ref = Arc::new(column.clone()) as _;
 
         let min = required_columns
@@ -1321,7 +1405,7 @@ impl PredicateRewriter {
         let mut required_columns = RequiredColumns::new();
         build_predicate_expression(
             expr,
-            schema,
+            &Arc::new(schema.clone()),
             &mut required_columns,
             &self.unhandled_hook,
         )
@@ -1339,7 +1423,7 @@ impl PredicateRewriter {
 /// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which will fall back to calling `unhandled_hook`
 fn build_predicate_expression(
     expr: &Arc<dyn PhysicalExpr>,
-    schema: &Schema,
+    schema: &SchemaRef,
     required_columns: &mut RequiredColumns,
     unhandled_hook: &Arc<dyn UnhandledPredicateHook>,
 ) -> Arc<dyn PhysicalExpr> {
@@ -1467,8 +1551,17 @@ fn build_predicate_expression(
         return expr;
     }
 
-    let expr_builder =
-        PruningExpressionBuilder::try_new(&left, &right, op, schema, required_columns);
+    let left_columns = ColumnReferenceCount::from_expression(&left);
+    let right_columns = ColumnReferenceCount::from_expression(&right);
+    let expr_builder = PruningExpressionBuilder::try_new(
+        &left,
+        &right,
+        left_columns,
+        right_columns,
+        op,
+        schema,
+        required_columns,
+    );
     let mut expr_builder = match expr_builder {
         Ok(builder) => builder,
         // allow partial failure in predicate expression generation
@@ -1483,6 +1576,50 @@ fn build_predicate_expression(
         .unwrap_or_else(|_| unhandled_hook.handle(expr))
 }
 
+/// Count of distinct column references in an expression.
+/// This is the same as [`collect_columns`] but optimized to stop counting
+/// once more than one distinct column is found.
+///
+/// For example, in expression `col1 + col2`, the count is `Many`.
+/// In expression `col1 + 5`, the count is `One`.
+/// In expression `5 + 10`, the count is `Zero`.
+///
+/// [`collect_columns`]: datafusion_physical_expr::utils::collect_columns
+#[derive(Debug, PartialEq, Eq)]
+enum ColumnReferenceCount {
+    /// no column references
+    Zero,
+    /// Only one column reference
+    One(phys_expr::Column),
+    /// More than one column reference
+    Many,
+}
+
+impl ColumnReferenceCount {
+    /// Count the number of distinct column references in an expression
+    fn from_expression(expr: &Arc<dyn PhysicalExpr>) -> Self {
+        let mut seen = HashSet::<phys_expr::Column>::new();
+        expr.apply(|expr| {
+            if let Some(column) = expr.as_any().downcast_ref::<phys_expr::Column>() {
+                seen.insert(column.clone());
+                if seen.len() > 1 {
+                    return Ok(TreeNodeRecursion::Stop);
+                }
+            }
+            Ok(TreeNodeRecursion::Continue)
+        })
+        // pre_visit always returns OK, so this will always too
+        .expect("no way to return error during recursion");
+        match seen.len() {
+            0 => ColumnReferenceCount::Zero,
+            1 => ColumnReferenceCount::One(
+                seen.into_iter().next().expect("just checked len==1"),
+            ),
+            _ => ColumnReferenceCount::Many,
+        }
+    }
+}
+
 fn build_statistics_expr(
     expr_builder: &mut PruningExpressionBuilder,
 ) -> Result<Arc<dyn PhysicalExpr>> {
@@ -1754,13 +1891,13 @@ fn increment_utf8(data: &str) -> Option<String> {
         let original = code_points[idx] as u32;
 
         // Try incrementing the code point
-        if let Some(next_char) = char::from_u32(original + 1) {
-            if is_valid_unicode(next_char) {
-                code_points[idx] = next_char;
-                // truncate the string to the current index
-                code_points.truncate(idx + 1);
-                return Some(code_points.into_iter().collect());
-            }
+        if let Some(next_char) = char::from_u32(original + 1)
+            && is_valid_unicode(next_char)
+        {
+            code_points[idx] = next_char;
+            // truncate the string to the current index
+            code_points.truncate(idx + 1);
+            return Some(code_points.into_iter().collect());
         }
     }
 
@@ -1822,6 +1959,7 @@ mod tests {
     use super::*;
     use datafusion_common::test_util::batches_to_string;
     use datafusion_expr::{and, col, lit, or};
+    use datafusion_physical_expr::utils::collect_columns;
     use insta::assert_snapshot;
 
     use arrow::array::Decimal128Array;
@@ -1830,10 +1968,13 @@ mod tests {
         datatypes::TimeUnit,
     };
     use datafusion_expr::expr::InList;
-    use datafusion_expr::{cast, is_null, try_cast, Expr};
+    use datafusion_expr::{Expr, cast, is_null, try_cast};
     use datafusion_functions_nested::expr_fn::{array_has, make_array};
-    use datafusion_physical_expr::expressions as phys_expr;
+    use datafusion_physical_expr::expressions::{
+        self as phys_expr, DynamicFilterPhysicalExpr,
+    };
     use datafusion_physical_expr::planner::logical2physical;
+    use itertools::Itertools;
 
     #[derive(Debug, Default)]
     /// Mock statistic provider for tests
@@ -2712,6 +2853,164 @@ mod tests {
         Ok(())
     }
 
+    /// Test that non-boolean literal expressions don't prune any containers and error gracefully by not pruning anything instead of e.g. panicking
+    #[test]
+    fn row_group_predicate_non_boolean() {
+        let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+        let statistics = TestStatistics::new()
+            .with("c1", ContainerStats::new_i32(vec![Some(0)], vec![Some(10)]));
+        let expected_ret = &[true];
+        prune_with_expr(lit(1), &schema, &statistics, expected_ret);
+    }
+
+    // Test that literal-to-literal comparisons are correctly evaluated.
+    // When both sides are constants, the expression should be evaluated directly
+    // and if it's false, all containers should be pruned.
+    #[test]
+    fn row_group_predicate_literal_false() {
+        // lit(1) = lit(2) is always false, so all containers should be pruned
+        let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+        let statistics = TestStatistics::new()
+            .with("c1", ContainerStats::new_i32(vec![Some(0)], vec![Some(10)]));
+        let expected_ret = &[false];
+        prune_with_simplified_expr(lit(1).eq(lit(2)), &schema, &statistics, expected_ret);
+    }
+
+    /// Test nested/complex literal expression trees.
+    /// This is an integration test that PhysicalExprSimplifier + PruningPredicate work together as expected.
+    #[test]
+    fn row_group_predicate_literal_true() {
+        // lit(1) = lit(1) is always true, so no containers should be pruned
+        let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+        let statistics = TestStatistics::new()
+            .with("c1", ContainerStats::new_i32(vec![Some(0)], vec![Some(10)]));
+        let expected_ret = &[true];
+        prune_with_simplified_expr(lit(1).eq(lit(1)), &schema, &statistics, expected_ret);
+    }
+
+    /// Test nested/complex literal expression trees.
+    /// This is an integration test that PhysicalExprSimplifier + PruningPredicate work together as expected.
+    #[test]
+    fn row_group_predicate_literal_null() {
+        // lit(1) = null is always null, so no containers should be pruned
+        let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+        let statistics = TestStatistics::new()
+            .with("c1", ContainerStats::new_i32(vec![Some(0)], vec![Some(10)]));
+        let expected_ret = &[true];
+        prune_with_simplified_expr(
+            lit(1).eq(lit(ScalarValue::Null)),
+            &schema,
+            &statistics,
+            expected_ret,
+        );
+    }
+
+    /// Test nested/complex literal expression trees.
+    /// This is an integration test that PhysicalExprSimplifier + PruningPredicate work together as expected.
+    #[test]
+    fn row_group_predicate_complex_literals() {
+        let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
+        let statistics = TestStatistics::new()
+            .with("c1", ContainerStats::new_i32(vec![Some(0)], vec![Some(10)]));
+
+        // (1 + 2) > 0 is always true
+        prune_with_simplified_expr(
+            (lit(1) + lit(2)).gt(lit(0)),
+            &schema,
+            &statistics,
+            &[true],
+        );
+
+        // (1 + 2) < 0 is always false
+        prune_with_simplified_expr(
+            (lit(1) + lit(2)).lt(lit(0)),
+            &schema,
+            &statistics,
+            &[false],
+        );
+
+        // Nested AND of literals: true AND false = false
+        prune_with_simplified_expr(
+            lit(true).and(lit(false)),
+            &schema,
+            &statistics,
+            &[false],
+        );
+
+        // Nested OR of literals: true OR false = true
+        prune_with_simplified_expr(
+            lit(true).or(lit(false)),
+            &schema,
+            &statistics,
+            &[true],
+        );
+
+        // Complex nested: (1 < 2) AND (3 > 1) = true AND true = true
+        prune_with_simplified_expr(
+            lit(1).lt(lit(2)).and(lit(3).gt(lit(1))),
+            &schema,
+            &statistics,
+            &[true],
+        );
+
+        // Complex nested: (1 > 2) OR (3 < 1) = false OR false = false
+        prune_with_simplified_expr(
+            lit(1).gt(lit(2)).or(lit(3).lt(lit(1))),
+            &schema,
+            &statistics,
+            &[false],
+        );
+    }
+
+    /// Integration test demonstrating that a dynamic filter with replaced children as literals will be snapshotted, simplified and then pruned correctly.
+    #[test]
+    fn row_group_predicate_dynamic_filter_with_literals() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Int32, true),
+            Field::new("part", DataType::Utf8, true),
+        ]));
+        let statistics = TestStatistics::new()
+            // Note that we have no stats, pruning can only happen via partition value pruning from the dynamic filter
+            .with_row_counts("c1", vec![Some(10)]);
+        let dynamic_filter_expr = col("c1").gt(lit(5)).and(col("part").eq(lit("B")));
+        let phys_expr = logical2physical(&dynamic_filter_expr, &schema);
+        let children = collect_columns(&phys_expr)
+            .iter()
+            .map(|c| Arc::new(c.clone()) as Arc<dyn PhysicalExpr>)
+            .collect_vec();
+        let dynamic_phys_expr =
+            Arc::new(DynamicFilterPhysicalExpr::new(children, phys_expr))
+                as Arc<dyn PhysicalExpr>;
+        // Simulate the partition value substitution that would happen in ParquetOpener
+        let remapped_expr = dynamic_phys_expr
+            .children()
+            .into_iter()
+            .map(|child_expr| {
+                let Some(col_expr) =
+                    child_expr.as_any().downcast_ref::<phys_expr::Column>()
+                else {
+                    return Arc::clone(child_expr);
+                };
+                if col_expr.name() == "part" {
+                    // simulate dynamic filter replacement with literal "A"
+                    Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+                        "A".to_string(),
+                    )))) as Arc<dyn PhysicalExpr>
+                } else {
+                    Arc::clone(child_expr)
+                }
+            })
+            .collect_vec();
+        let dynamic_filter_expr =
+            dynamic_phys_expr.with_new_children(remapped_expr).unwrap();
+        // After substitution the expression is c1 > 5 AND part = "B" which should prune the file since the partition value is "A"
+        let expected = &[false];
+        let p =
+            PruningPredicate::try_new(dynamic_filter_expr, Arc::clone(&schema)).unwrap();
+        let result = p.prune(&statistics).unwrap();
+        assert_eq!(result, expected);
+    }
+
     #[test]
     fn row_group_predicate_lt_bool() -> Result<()> {
         let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]);
@@ -2743,7 +3042,7 @@ mod tests {
             test_build_predicate_expression(&expr, &schema, &mut required_columns);
         assert_eq!(predicate_expr.to_string(), expected_expr);
         println!("required_columns: {required_columns:#?}"); // for debugging assertions below
-                                                             // c1 < 1 should add c1_min
+        // c1 < 1 should add c1_min
         let c1_min_field = Field::new("c1_min", DataType::Int32, false);
         assert_eq!(
             required_columns.columns[0],
@@ -4375,8 +4674,8 @@ mod tests {
             true,
             // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]  ==> some rows could pass (must keep)
             true,
-            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match. (min, max) maybe truncate 
-            // orignal (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]  ==> no row match. (min, max) maybe truncate
+            // original (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
             true,
         ];
         prune_with_expr(expr, &schema, &statistics, expected_ret);
@@ -5061,7 +5360,6 @@ mod tests {
     ///
     /// `expected` is a vector of bools, where true means the row group should
     /// be kept, and false means it should be pruned.
-    ///
     // TODO refactor other tests to use this to reduce boiler plate
     fn prune_with_expr(
         expr: Expr,
@@ -5076,6 +5374,21 @@ mod tests {
         assert_eq!(result, expected);
     }
 
+    fn prune_with_simplified_expr(
+        expr: Expr,
+        schema: &SchemaRef,
+        statistics: &TestStatistics,
+        expected: &[bool],
+    ) {
+        println!("Pruning with expr: {expr}");
+        let expr = logical2physical(&expr, schema);
+        let simplifier = PhysicalExprSimplifier::new(schema);
+        let expr = simplifier.simplify(expr).unwrap();
+        let p = PruningPredicate::try_new(expr, Arc::<Schema>::clone(schema)).unwrap();
+        let result = p.prune(statistics).unwrap();
+        assert_eq!(result, expected);
+    }
+
     fn test_build_predicate_expression(
         expr: &Expr,
         schema: &Schema,
@@ -5083,7 +5396,12 @@ mod tests {
     ) -> Arc<dyn PhysicalExpr> {
         let expr = logical2physical(expr, schema);
         let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _;
-        build_predicate_expression(&expr, schema, required_columns, &unhandled_hook)
+        build_predicate_expression(
+            &expr,
+            &Arc::new(schema.clone()),
+            required_columns,
+            &unhandled_hook,
+        )
     }
 
     #[test]
diff --git a/datafusion/session/Cargo.toml b/datafusion/session/Cargo.toml
index c6e268735a7b3..230e26d1fc9fc 100644
--- a/datafusion/session/Cargo.toml
+++ b/datafusion/session/Cargo.toml
@@ -18,11 +18,11 @@
 [package]
 name = "datafusion-session"
 description = "datafusion-session"
+readme = "README.md"
 authors.workspace = true
 edition.workspace = true
 homepage.workspace = true
 license.workspace = true
-readme.workspace = true
 repository.workspace = true
 rust-version.workspace = true
 version.workspace = true
@@ -31,22 +31,15 @@ version.workspace = true
 all-features = true
 
 [dependencies]
-arrow = { workspace = true }
 async-trait = { workspace = true }
-dashmap = { workspace = true }
 datafusion-common = { workspace = true }
-datafusion-common-runtime = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-physical-expr = { workspace = true }
 datafusion-physical-plan = { workspace = true }
-datafusion-sql = { workspace = true }
-futures = { workspace = true }
-itertools = { workspace = true }
-log = { workspace = true }
-object_store = { workspace = true }
 parking_lot = { workspace = true }
-tokio = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
diff --git a/datafusion/session/README.md b/datafusion/session/README.md
index 019f9f8892476..4bb605b1e199c 100644
--- a/datafusion/session/README.md
+++ b/datafusion/session/README.md
@@ -17,10 +17,16 @@
   under the License.
 -->
 
-# DataFusion Session
+# Apache DataFusion Session
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
 This crate provides **session-related abstractions** used in the DataFusion query engine. A _session_ represents the runtime context for query execution, including configuration, runtime environment, function registry, and planning.
 
-[df]: https://crates.io/crates/datafusion
+Most projects should use the [`datafusion`] crate directly, which re-exports
+this module. If you are already using the [`datafusion`] crate, there is no
+reason to use this crate directly in your project as well.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
diff --git a/datafusion/session/src/lib.rs b/datafusion/session/src/lib.rs
index a2e1d9ca3ae8f..11f734e757452 100644
--- a/datafusion/session/src/lib.rs
+++ b/datafusion/session/src/lib.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
+
 //! Session management for DataFusion query execution environment
 //!
 //! This module provides the core session management functionality for DataFusion,
diff --git a/datafusion/session/src/session.rs b/datafusion/session/src/session.rs
index de23dba491fd6..2593e8cd71f4c 100644
--- a/datafusion/session/src/session.rs
+++ b/datafusion/session/src/session.rs
@@ -18,9 +18,9 @@
 use async_trait::async_trait;
 use datafusion_common::config::{ConfigOptions, TableOptions};
 use datafusion_common::{DFSchema, Result};
+use datafusion_execution::TaskContext;
 use datafusion_execution::config::SessionConfig;
 use datafusion_execution::runtime_env::RuntimeEnv;
-use datafusion_execution::TaskContext;
 use datafusion_expr::execution_props::ExecutionProps;
 use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF};
 use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
@@ -57,9 +57,12 @@ use std::sync::{Arc, Weak};
 /// // Given a `Session` reference, get the concrete `SessionState` reference
 /// // Note: this may stop working in future versions,
 /// fn session_state_from_session(session: &dyn Session) -> Result<&SessionState> {
-///    session.as_any()
-///     .downcast_ref::<SessionState>()
-///     .ok_or_else(|| exec_datafusion_err!("Failed to downcast Session to SessionState"))
+///     session
+///         .as_any()
+///         .downcast_ref::<SessionState>()
+///         .ok_or_else(|| {
+///             exec_datafusion_err!("Failed to downcast Session to SessionState")
+///         })
 /// }
 /// ```
 ///
@@ -97,7 +100,7 @@ pub trait Session: Send + Sync {
     /// + 2` will not be simplified to `a = 3` as this is a more involved process.
     /// See the [expr_api] example for how to simplify expressions.
     ///
-    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
+    /// [expr_api]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs
     fn create_physical_expr(
         &self,
         expr: Expr,
diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml
index 1ded8c40aa4b3..162b6d814e804 100644
--- a/datafusion/spark/Cargo.toml
+++ b/datafusion/spark/Cargo.toml
@@ -22,13 +22,20 @@ version = { workspace = true }
 homepage = { workspace = true }
 repository = { workspace = true }
 authors = { workspace = true }
-readme = { workspace = true }
+readme = "README.md"
 license = { workspace = true }
 edition = { workspace = true }
 
 [package.metadata.docs.rs]
 all-features = true
 
+[features]
+default = []
+core = ["datafusion"]
+
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -37,10 +44,56 @@ name = "datafusion_spark"
 
 [dependencies]
 arrow = { workspace = true }
+bigdecimal = { workspace = true }
+chrono = { workspace = true }
+crc32fast = "1.4"
+# Optional dependency for SessionStateBuilderSpark extension trait
+datafusion = { workspace = true, optional = true, default-features = false }
 datafusion-catalog = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-functions = { workspace = true }
-datafusion-macros = { workspace = true }
+datafusion-functions = { workspace = true, features = ["crypto_expressions"] }
+datafusion-functions-aggregate = { workspace = true }
+datafusion-functions-nested = { workspace = true }
 log = { workspace = true }
+percent-encoding = "2.3.2"
+rand = { workspace = true }
+serde_json = { workspace = true }
+sha1 = "0.10"
+sha2 = { workspace = true }
+url = { workspace = true }
+
+[dev-dependencies]
+arrow = { workspace = true, features = ["test_utils"] }
+criterion = { workspace = true }
+# for SessionStateBuilderSpark tests
+datafusion = { workspace = true, default-features = false }
+
+[[bench]]
+harness = false
+name = "char"
+
+[[bench]]
+harness = false
+name = "space"
+
+[[bench]]
+harness = false
+name = "hex"
+
+[[bench]]
+harness = false
+name = "slice"
+
+[[bench]]
+harness = false
+name = "substring"
+
+[[bench]]
+harness = false
+name = "unhex"
+
+[[bench]]
+harness = false
+name = "sha2"
diff --git a/datafusion/spark/README.md b/datafusion/spark/README.md
index c92ada0ab4772..7cb24084cd228 100644
--- a/datafusion/spark/README.md
+++ b/datafusion/spark/README.md
@@ -17,9 +17,15 @@ specific language governing permissions and limitations
 under the License.
 -->
 
-# datafusion-spark: Spark-compatible Expressions
+# Apache DataFusion Spark-compatible Expressions
 
-This crate provides Apache Spark-compatible expressions for use with DataFusion.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that provides [Apache Spark] compatible expressions for use with DataFusion.
+
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[apache spark]: https://spark.apache.org/
 
 ## Testing Guide
 
@@ -29,12 +35,15 @@ or `coerce_types`) is not applied.
 Therefore, direct invocation tests should only be used to verify that the function is correctly implemented.
 
 Please be sure to add additional tests beyond direct invocation.
-For more detailed testing guidelines, refer to
-the [Spark SQLLogicTest README](../sqllogictest/test_files/spark/README.md).
+For more detailed testing guidelines, refer to the [Spark SQLLogicTest README].
 
 ## Implementation References
 
 When implementing Spark-compatible functions, you can check if there are existing implementations in
-the [Sail](https://github.com/lakehq/sail) or [Comet](https://github.com/apache/datafusion-comet) projects first.
+the [Sail] or [Comet] projects first.
 If you do port functionality from these sources, make sure to port over the corresponding tests too, to ensure
 correctness and compatibility.
+
+[spark sqllogictest readme]: ../sqllogictest/test_files/spark/README.md
+[sail]: https://github.com/lakehq/sail
+[comet]: https://github.com/apache/datafusion-comet
diff --git a/datafusion/spark/benches/char.rs b/datafusion/spark/benches/char.rs
new file mode 100644
index 0000000000000..38d9ebdeb4f5f
--- /dev/null
+++ b/datafusion/spark/benches/char.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field};
+use arrow::{array::PrimitiveArray, datatypes::Int64Type};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_spark::function::string::char;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Returns fixed seedable RNG
+pub fn seedable_rng() -> StdRng {
+    StdRng::seed_from_u64(42)
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let cot_fn = char();
+    let size = 1024;
+    let input: PrimitiveArray<Int64Type> = {
+        let null_density = 0.2;
+        let mut rng = StdRng::seed_from_u64(42);
+        (0..size)
+            .map(|_| {
+                if rng.random::<f32>() < null_density {
+                    None
+                } else {
+                    Some(rng.random_range::<i64, _>(1i64..10_000))
+                }
+            })
+            .collect()
+    };
+    let input = Arc::new(input);
+    let args = vec![ColumnarValue::Array(input)];
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function("char", |b| {
+        b.iter(|| {
+            black_box(
+                cot_fn
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Arc::new(Field::new("f", DataType::Utf8, true)),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/hex.rs b/datafusion/spark/benches/hex.rs
new file mode 100644
index 0000000000000..9785371cc5827
--- /dev/null
+++ b/datafusion/spark/benches/hex.rs
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::*;
+use arrow::datatypes::*;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_spark::function::math::hex::SparkHex;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn seedable_rng() -> StdRng {
+    StdRng::seed_from_u64(42)
+}
+
+fn generate_int64_data(size: usize, null_density: f32) -> PrimitiveArray<Int64Type> {
+    let mut rng = seedable_rng();
+    (0..size)
+        .map(|_| {
+            if rng.random::<f32>() < null_density {
+                None
+            } else {
+                Some(rng.random_range::<i64, _>(-999_999_999_999..999_999_999_999))
+            }
+        })
+        .collect()
+}
+
+fn generate_utf8_data(size: usize, null_density: f32) -> StringArray {
+    let mut rng = seedable_rng();
+    let mut builder = StringBuilder::new();
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(1..=100);
+            let s: String =
+                std::iter::repeat_with(|| rng.random_range(b'a'..=b'z') as char)
+                    .take(len)
+                    .collect();
+            builder.append_value(&s);
+        }
+    }
+    builder.finish()
+}
+
+fn generate_binary_data(size: usize, null_density: f32) -> BinaryArray {
+    let mut rng = seedable_rng();
+    let mut builder = BinaryBuilder::new();
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(1..=100);
+            let bytes: Vec<u8> = (0..len).map(|_| rng.random()).collect();
+            builder.append_value(&bytes);
+        }
+    }
+    builder.finish()
+}
+
+fn generate_int64_dict_data(
+    size: usize,
+    null_density: f32,
+) -> DictionaryArray<Int32Type> {
+    let mut rng = seedable_rng();
+    let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int64Type>::new();
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            builder.append_value(
+                rng.random_range::<i64, _>(-999_999_999_999..999_999_999_999),
+            );
+        }
+    }
+    builder.finish()
+}
+
+fn run_benchmark(c: &mut Criterion, name: &str, size: usize, array: Arc<dyn Array>) {
+    let hex_func = SparkHex::new();
+    let args = vec![ColumnarValue::Array(array)];
+    let arg_fields: Vec<_> = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function(&format!("{name}/size={size}"), |b| {
+        b.iter(|| {
+            black_box(
+                hex_func
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Arc::new(Field::new("f", DataType::Utf8, true)),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let sizes = vec![1024, 4096, 8192];
+    let null_density = 0.1;
+
+    for &size in &sizes {
+        let data = generate_int64_data(size, null_density);
+        run_benchmark(c, "hex_int64", size, Arc::new(data));
+    }
+
+    for &size in &sizes {
+        let data = generate_utf8_data(size, null_density);
+        run_benchmark(c, "hex_utf8", size, Arc::new(data));
+    }
+
+    for &size in &sizes {
+        let data = generate_binary_data(size, null_density);
+        run_benchmark(c, "hex_binary", size, Arc::new(data));
+    }
+
+    for &size in &sizes {
+        let data = generate_int64_dict_data(size, null_density);
+        run_benchmark(c, "hex_int64_dict", size, Arc::new(data));
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/sha2.rs b/datafusion/spark/benches/sha2.rs
new file mode 100644
index 0000000000000..6e835984703f0
--- /dev/null
+++ b/datafusion/spark/benches/sha2.rs
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::*;
+use arrow::datatypes::*;
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_spark::function::hash::sha2::SparkSha2;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn seedable_rng() -> StdRng {
+    StdRng::seed_from_u64(42)
+}
+
+fn generate_binary_data(size: usize, null_density: f32) -> BinaryArray {
+    let mut rng = seedable_rng();
+    let mut builder = BinaryBuilder::new();
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(1..=100);
+            let bytes: Vec<u8> = (0..len).map(|_| rng.random()).collect();
+            builder.append_value(&bytes);
+        }
+    }
+    builder.finish()
+}
+
+fn run_benchmark(c: &mut Criterion, name: &str, size: usize, args: &[ColumnarValue]) {
+    let sha2_func = SparkSha2::new();
+    let arg_fields: Vec<_> = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function(&format!("{name}/size={size}"), |b| {
+        b.iter(|| {
+            black_box(
+                sha2_func
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.to_vec(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Arc::new(Field::new("f", DataType::Utf8, true)),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // Scalar benchmark (avoid array expansion)
+    let scalar_args = vec![
+        ColumnarValue::Scalar(ScalarValue::Binary(Some(b"Spark".to_vec()))),
+        ColumnarValue::Scalar(ScalarValue::Int32(Some(256))),
+    ];
+    run_benchmark(c, "sha2/scalar", 1, &scalar_args);
+
+    let sizes = vec![1024, 4096, 8192];
+    let null_density = 0.1;
+
+    for &size in &sizes {
+        let values: ArrayRef = Arc::new(generate_binary_data(size, null_density));
+        let bit_lengths: ArrayRef = Arc::new(Int32Array::from(vec![256; size]));
+
+        let array_args = vec![
+            ColumnarValue::Array(Arc::clone(&values)),
+            ColumnarValue::Array(Arc::clone(&bit_lengths)),
+        ];
+        run_benchmark(c, "sha2/array_binary_256", size, &array_args);
+
+        let array_scalar_args = vec![
+            ColumnarValue::Array(Arc::clone(&values)),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(256))),
+        ];
+        run_benchmark(c, "sha2/array_scalar_binary_256", size, &array_scalar_args);
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/slice.rs b/datafusion/spark/benches/slice.rs
new file mode 100644
index 0000000000000..da392dc042f92
--- /dev/null
+++ b/datafusion/spark/benches/slice.rs
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Int64Array, ListArray, ListViewArray, NullBufferBuilder, PrimitiveArray,
+};
+use arrow::buffer::{OffsetBuffer, ScalarBuffer};
+use arrow::datatypes::{DataType, Field, Int64Type};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_spark::function::array::slice;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn create_inputs(
+    rng: &mut StdRng,
+    size: usize,
+    child_array_size: usize,
+    null_density: f32,
+) -> (ListArray, ListViewArray) {
+    let mut nulls_builder = NullBufferBuilder::new(size);
+    let mut sizes = Vec::with_capacity(size);
+
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            nulls_builder.append_null();
+        } else {
+            nulls_builder.append_non_null();
+        }
+        sizes.push(rng.random_range(1..child_array_size));
+    }
+    let nulls = nulls_builder.finish();
+
+    let length = sizes.iter().sum();
+    let values: PrimitiveArray<Int64Type> =
+        (0..length).map(|_| Some(rng.random())).collect();
+    let values = Arc::new(values);
+
+    let offsets = OffsetBuffer::from_lengths(sizes.clone());
+    let list_array = ListArray::new(
+        Arc::new(Field::new_list_field(DataType::Int64, true)),
+        offsets.clone(),
+        values.clone(),
+        nulls.clone(),
+    );
+
+    let offsets = ScalarBuffer::from(offsets.slice(0, size - 1));
+    let sizes = ScalarBuffer::from_iter(sizes.into_iter().map(|v| v as i32));
+    let list_view_array = ListViewArray::new(
+        Arc::new(Field::new_list_field(DataType::Int64, true)),
+        offsets,
+        sizes,
+        values,
+        nulls,
+    );
+
+    (list_array, list_view_array)
+}
+
+fn random_from_to(
+    rng: &mut StdRng,
+    size: i64,
+    null_density: f32,
+) -> (Option<i64>, Option<i64>) {
+    let from = if rng.random::<f32>() < null_density {
+        None
+    } else {
+        Some(rng.random_range(1..=size))
+    };
+
+    let to = if rng.random::<f32>() < null_density {
+        None
+    } else {
+        match from {
+            Some(from) => Some(rng.random_range(from..=size)),
+            None => Some(rng.random_range(1..=size)),
+        }
+    };
+
+    (from, to)
+}
+
+fn array_slice_benchmark(
+    name: &str,
+    input: ColumnarValue,
+    mut args: Vec<ColumnarValue>,
+    c: &mut Criterion,
+    size: usize,
+) {
+    args.insert(0, input);
+
+    let array_slice = slice();
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| {
+            <Arc<Field>>::from(Field::new(format!("arg_{idx}"), arg.data_type(), true))
+        })
+        .collect::<Vec<_>>();
+    c.bench_function(name, |b| {
+        b.iter(|| {
+            black_box(
+                array_slice
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Field::new_list_field(args[0].data_type(), true)
+                            .into(),
+                        config_options: Arc::new(ConfigOptions::default()),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let rng = &mut StdRng::seed_from_u64(42);
+    let size = 1_000_000;
+    let child_array_size = 100;
+    let null_density = 0.1;
+
+    let (list_array, list_view_array) =
+        create_inputs(rng, size, child_array_size, null_density);
+
+    let mut array_from = Vec::with_capacity(size);
+    let mut array_to = Vec::with_capacity(size);
+    for child_array_size in list_array.offsets().lengths() {
+        let (from, to) = random_from_to(rng, child_array_size as i64, null_density);
+        array_from.push(from);
+        array_to.push(to);
+    }
+
+    // input
+    let list_array = ColumnarValue::Array(Arc::new(list_array));
+    let list_view_array = ColumnarValue::Array(Arc::new(list_view_array));
+
+    // args
+    let array_from = ColumnarValue::Array(Arc::new(Int64Array::from(array_from)));
+    let array_to = ColumnarValue::Array(Arc::new(Int64Array::from(array_to)));
+    let scalar_from = ColumnarValue::Scalar(ScalarValue::from(1i64));
+    let scalar_to = ColumnarValue::Scalar(ScalarValue::from(child_array_size as i64 / 2));
+
+    for input in [list_array, list_view_array] {
+        let input_type = input.data_type().to_string();
+
+        array_slice_benchmark(
+            &format!("slice: input {input_type}, array args, no stride"),
+            input.clone(),
+            vec![array_from.clone(), array_to.clone()],
+            c,
+            size,
+        );
+
+        array_slice_benchmark(
+            &format!("slice: input {input_type}, scalar args, no stride"),
+            input.clone(),
+            vec![scalar_from.clone(), scalar_to.clone()],
+            c,
+            size,
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/space.rs b/datafusion/spark/benches/space.rs
new file mode 100644
index 0000000000000..bd9d370ca37fe
--- /dev/null
+++ b/datafusion/spark/benches/space.rs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::PrimitiveArray;
+use arrow::datatypes::{DataType, Field, Int32Type};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_spark::function::string::space;
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let space_func = space();
+    let size = 1024;
+    let input: PrimitiveArray<Int32Type> = {
+        let null_density = 0.2;
+        let mut rng = StdRng::seed_from_u64(42);
+        (0..size)
+            .map(|_| {
+                if rng.random::<f32>() < null_density {
+                    None
+                } else {
+                    Some(rng.random_range::<i32, _>(1i32..10))
+                }
+            })
+            .collect()
+    };
+    let input = Arc::new(input);
+    let args = vec![ColumnarValue::Array(input)];
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+    c.bench_function("space", |b| {
+        b.iter(|| {
+            black_box(
+                space_func
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Arc::new(Field::new("f", DataType::Utf8, true)),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/substring.rs b/datafusion/spark/benches/substring.rs
new file mode 100644
index 0000000000000..d6eac817c322f
--- /dev/null
+++ b/datafusion/spark/benches/substring.rs
@@ -0,0 +1,205 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::{
+    create_string_array_with_len, create_string_view_array_with_len,
+};
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_spark::function::string::substring;
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn create_args_without_count<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    start_half_way: bool,
+    force_view_types: bool,
+) -> Vec<ColumnarValue> {
+    let start_array = Arc::new(Int64Array::from(
+        (0..size)
+            .map(|_| {
+                if start_half_way {
+                    (str_len / 2) as i64
+                } else {
+                    1i64
+                }
+            })
+            .collect::<Vec<_>>(),
+    ));
+
+    if force_view_types {
+        let string_array =
+            Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(start_array),
+        ]
+    } else {
+        let string_array =
+            Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
+        ]
+    }
+}
+
+fn create_args_with_count<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    count_max: usize,
+    force_view_types: bool,
+) -> Vec<ColumnarValue> {
+    let start_array =
+        Arc::new(Int64Array::from((0..size).map(|_| 1).collect::<Vec<_>>()));
+    let count = count_max.min(str_len) as i64;
+    let count_array = Arc::new(Int64Array::from(
+        (0..size).map(|_| count).collect::<Vec<_>>(),
+    ));
+
+    if force_view_types {
+        let string_array =
+            Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(start_array),
+            ColumnarValue::Array(count_array),
+        ]
+    } else {
+        let string_array =
+            Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+        vec![
+            ColumnarValue::Array(string_array),
+            ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
+            ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef),
+        ]
+    }
+}
+
+#[expect(clippy::needless_pass_by_value)]
+fn invoke_substr_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    substring().invoke_with_args(ScalarFunctionArgs {
+        args: args.clone(),
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Utf8View, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        // string_len = 12, substring_len=6 (see `create_args_without_count`)
+        let len = 12;
+        let mut group = c.benchmark_group("SHORTER THAN 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_without_count::<i32>(size, len, true, true);
+        group.bench_function(
+            format!("substr_string_view [size={size}, strlen={len}]"),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        let args = create_args_without_count::<i32>(size, len, false, false);
+        group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
+            b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
+        });
+
+        let args = create_args_without_count::<i64>(size, len, true, false);
+        group.bench_function(
+            format!("substr_large_string [size={size}, strlen={len}]"),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        group.finish();
+
+        // string_len = 128, start=1, count=64, substring_len=64
+        let len = 128;
+        let count = 64;
+        let mut group = c.benchmark_group("LONGER THAN 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_with_count::<i32>(size, len, count, true);
+        group.bench_function(
+            format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        let args = create_args_with_count::<i32>(size, len, count, false);
+        group.bench_function(
+            format!("substr_string [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        let args = create_args_with_count::<i64>(size, len, count, false);
+        group.bench_function(
+            format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        group.finish();
+
+        // string_len = 128, start=1, count=6, substring_len=6
+        let len = 128;
+        let count = 6;
+        let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12");
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+
+        let args = create_args_with_count::<i32>(size, len, count, true);
+        group.bench_function(
+            format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        let args = create_args_with_count::<i32>(size, len, count, false);
+        group.bench_function(
+            format!("substr_string [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        let args = create_args_with_count::<i64>(size, len, count, false);
+        group.bench_function(
+            format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
+            |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
+        );
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/benches/unhex.rs b/datafusion/spark/benches/unhex.rs
new file mode 100644
index 0000000000000..7dce683485bc7
--- /dev/null
+++ b/datafusion/spark/benches/unhex.rs
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, LargeStringArray, LargeStringBuilder, StringArray, StringBuilder,
+    StringViewArray, StringViewBuilder,
+};
+use arrow::datatypes::{DataType, Field};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_spark::function::math::unhex::SparkUnhex;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+fn generate_hex_string_data(size: usize, null_density: f32) -> StringArray {
+    let mut rng = StdRng::seed_from_u64(42);
+    let mut builder = StringBuilder::with_capacity(size, 0);
+    let hex_chars = b"0123456789abcdefABCDEF";
+
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(2..=100);
+            let s: String = std::iter::repeat_with(|| {
+                hex_chars[rng.random_range(0..hex_chars.len())] as char
+            })
+            .take(len)
+            .collect();
+            builder.append_value(&s);
+        }
+    }
+    builder.finish()
+}
+
+fn generate_hex_large_string_data(size: usize, null_density: f32) -> LargeStringArray {
+    let mut rng = StdRng::seed_from_u64(42);
+    let mut builder = LargeStringBuilder::with_capacity(size, 0);
+    let hex_chars = b"0123456789abcdefABCDEF";
+
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(2..=100);
+            let s: String = std::iter::repeat_with(|| {
+                hex_chars[rng.random_range(0..hex_chars.len())] as char
+            })
+            .take(len)
+            .collect();
+            builder.append_value(&s);
+        }
+    }
+    builder.finish()
+}
+
+fn generate_hex_utf8view_data(size: usize, null_density: f32) -> StringViewArray {
+    let mut rng = StdRng::seed_from_u64(42);
+    let mut builder = StringViewBuilder::with_capacity(size);
+    let hex_chars = b"0123456789abcdefABCDEF";
+
+    for _ in 0..size {
+        if rng.random::<f32>() < null_density {
+            builder.append_null();
+        } else {
+            let len = rng.random_range::<usize, _>(2..=100);
+            let s: String = std::iter::repeat_with(|| {
+                hex_chars[rng.random_range(0..hex_chars.len())] as char
+            })
+            .take(len)
+            .collect();
+            builder.append_value(&s);
+        }
+    }
+    builder.finish()
+}
+
+fn run_benchmark(c: &mut Criterion, name: &str, size: usize, array: Arc<dyn Array>) {
+    let unhex_func = SparkUnhex::new();
+    let args = vec![ColumnarValue::Array(array)];
+    let arg_fields: Vec<_> = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    c.bench_function(&format!("{name}/size={size}"), |b| {
+        b.iter(|| {
+            black_box(
+                unhex_func
+                    .invoke_with_args(ScalarFunctionArgs {
+                        args: args.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: size,
+                        return_field: Arc::new(Field::new("f", DataType::Binary, true)),
+                        config_options: Arc::clone(&config_options),
+                    })
+                    .unwrap(),
+            )
+        })
+    });
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let sizes = vec![1024, 4096, 8192];
+    let null_density = 0.1;
+
+    // Benchmark with hex string
+    for &size in &sizes {
+        let data = generate_hex_string_data(size, null_density);
+        run_benchmark(c, "unhex_utf8", size, Arc::new(data));
+    }
+
+    // Benchmark with hex large string
+    for &size in &sizes {
+        let data = generate_hex_large_string_data(size, null_density);
+        run_benchmark(c, "unhex_large_utf8", size, Arc::new(data));
+    }
+
+    // Benchmark with hex Utf8View
+    for &size in &sizes {
+        let data = generate_hex_utf8view_data(size, null_density);
+        run_benchmark(c, "unhex_utf8view", size, Arc::new(data));
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/spark/src/function/aggregate/avg.rs b/datafusion/spark/src/function/aggregate/avg.rs
new file mode 100644
index 0000000000000..9ad712713d26b
--- /dev/null
+++ b/datafusion/spark/src/function/aggregate/avg.rs
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, ArrowNativeTypeOp, ArrowNumericType, Int64Array, PrimitiveArray,
+    builder::PrimitiveBuilder,
+    cast::AsArray,
+    types::{Float64Type, Int64Type},
+};
+use arrow::compute::sum;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::{NativeType, logical_float64};
+use datafusion_common::{Result, ScalarValue, not_impl_err};
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::utils::format_state_name;
+use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Coercion, EmitTo, GroupsAccumulator, ReversedUDAF,
+    Signature, TypeSignatureClass, Volatility,
+};
+use std::{any::Any, sync::Arc};
+
+/// AVG aggregate expression
+/// Spark average aggregate expression. Differs from standard DataFusion average aggregate
+/// in that it uses an `i64` for the count (DataFusion version uses `u64`); also there is ANSI mode
+/// support planned in the future for Spark version.
+
+// TODO: see if can deduplicate with DF version
+//       https://github.com/apache/datafusion/issues/17964
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct SparkAvg {
+    signature: Signature,
+}
+
+impl Default for SparkAvg {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkAvg {
+    /// Implement AVG aggregate function
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_implicit(
+                    TypeSignatureClass::Native(logical_float64()),
+                    vec![TypeSignatureClass::Numeric],
+                    NativeType::Float64,
+                )],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl AggregateUDFImpl for SparkAvg {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        if acc_args.is_distinct {
+            return not_impl_err!("DistinctAvgAccumulator");
+        }
+
+        let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
+
+        // instantiate specialized accumulator based for the type
+        match (&data_type, &acc_args.return_type()) {
+            (DataType::Float64, DataType::Float64) => {
+                Ok(Box::<AvgAccumulator>::default())
+            }
+            (dt, return_type) => {
+                not_impl_err!("AvgAccumulator for ({dt} --> {return_type})")
+            }
+        }
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        Ok(vec![
+            Arc::new(Field::new(
+                format_state_name(self.name(), "sum"),
+                args.input_fields[0].data_type().clone(),
+                true,
+            )),
+            Arc::new(Field::new(
+                format_state_name(self.name(), "count"),
+                DataType::Int64,
+                true,
+            )),
+        ])
+    }
+
+    fn name(&self) -> &str {
+        "avg"
+    }
+
+    fn reverse_expr(&self) -> ReversedUDAF {
+        ReversedUDAF::Identical
+    }
+
+    fn groups_accumulator_supported(&self, args: AccumulatorArgs) -> bool {
+        !args.is_distinct
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
+        let data_type = args.exprs[0].data_type(args.schema)?;
+
+        // instantiate specialized accumulator based for the type
+        match (&data_type, args.return_type()) {
+            (DataType::Float64, DataType::Float64) => {
+                Ok(Box::new(AvgGroupsAccumulator::<Float64Type, _>::new(
+                    args.return_field.data_type(),
+                    |sum: f64, count: i64| Ok(sum / count as f64),
+                )))
+            }
+            (dt, return_type) => {
+                not_impl_err!("AvgGroupsAccumulator for ({dt} --> {return_type})")
+            }
+        }
+    }
+
+    fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
+        Ok(ScalarValue::Float64(None))
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+}
+
+/// An accumulator to compute the average
+#[derive(Debug, Default)]
+pub struct AvgAccumulator {
+    sum: Option<f64>,
+    count: i64,
+}
+
+impl Accumulator for AvgAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![
+            ScalarValue::Float64(self.sum),
+            ScalarValue::from(self.count),
+        ])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let values = values[0].as_primitive::<Float64Type>();
+        self.count += (values.len() - values.null_count()) as i64;
+        let v = self.sum.get_or_insert(0.);
+        if let Some(x) = sum(values) {
+            *v += x;
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        // counts are summed
+        self.count += sum(states[1].as_primitive::<Int64Type>()).unwrap_or_default();
+
+        // sums are summed
+        if let Some(x) = sum(states[0].as_primitive::<Float64Type>()) {
+            let v = self.sum.get_or_insert(0.);
+            *v += x;
+        }
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.count == 0 {
+            // If all input are nulls, count will be 0 and we will get null after the division.
+            // This is consistent with Spark Average implementation.
+            Ok(ScalarValue::Float64(None))
+        } else {
+            Ok(ScalarValue::Float64(
+                self.sum.map(|f| f / self.count as f64),
+            ))
+        }
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
+}
+
+/// An accumulator to compute the average of `[PrimitiveArray<T>]`.
+/// Stores values as native types, and does overflow checking
+///
+/// F: Function that calculates the average value from a sum of
+/// T::Native and a total count
+#[derive(Debug)]
+struct AvgGroupsAccumulator<T, F>
+where
+    T: ArrowNumericType + Send,
+    F: Fn(T::Native, i64) -> Result<T::Native> + Send + 'static,
+{
+    /// The type of the returned average
+    return_data_type: DataType,
+
+    /// Count per group (use i64 to make Int64Array)
+    counts: Vec<i64>,
+
+    /// Sums per group, stored as the native type
+    sums: Vec<T::Native>,
+
+    /// Function that computes the final average (value / count)
+    avg_fn: F,
+}
+
+impl<T, F> AvgGroupsAccumulator<T, F>
+where
+    T: ArrowNumericType + Send,
+    F: Fn(T::Native, i64) -> Result<T::Native> + Send + 'static,
+{
+    pub fn new(return_data_type: &DataType, avg_fn: F) -> Self {
+        Self {
+            return_data_type: return_data_type.clone(),
+            counts: vec![],
+            sums: vec![],
+            avg_fn,
+        }
+    }
+}
+
+impl<T, F> GroupsAccumulator for AvgGroupsAccumulator<T, F>
+where
+    T: ArrowNumericType + Send,
+    F: Fn(T::Native, i64) -> Result<T::Native> + Send + 'static,
+{
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        _opt_filter: Option<&arrow::array::BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 1, "single argument to update_batch");
+        let values = values[0].as_primitive::<T>();
+        let data = values.values();
+
+        // increment counts, update sums
+        self.counts.resize(total_num_groups, 0);
+        self.sums.resize(total_num_groups, T::default_value());
+
+        let iter = group_indices.iter().zip(data.iter());
+        if values.null_count() == 0 {
+            for (&group_index, &value) in iter {
+                let sum = &mut self.sums[group_index];
+                *sum = (*sum).add_wrapping(value);
+                self.counts[group_index] += 1;
+            }
+        } else {
+            for (idx, (&group_index, &value)) in iter.enumerate() {
+                if values.is_null(idx) {
+                    continue;
+                }
+                let sum = &mut self.sums[group_index];
+                *sum = (*sum).add_wrapping(value);
+
+                self.counts[group_index] += 1;
+            }
+        }
+
+        Ok(())
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        _opt_filter: Option<&arrow::array::BooleanArray>,
+        total_num_groups: usize,
+    ) -> Result<()> {
+        assert_eq!(values.len(), 2, "two arguments to merge_batch");
+        // first batch is partial sums, second is counts
+        let partial_sums = values[0].as_primitive::<T>();
+        let partial_counts = values[1].as_primitive::<Int64Type>();
+        // update counts with partial counts
+        self.counts.resize(total_num_groups, 0);
+        let iter1 = group_indices.iter().zip(partial_counts.values().iter());
+        for (&group_index, &partial_count) in iter1 {
+            self.counts[group_index] += partial_count;
+        }
+
+        // update sums
+        self.sums.resize(total_num_groups, T::default_value());
+        let iter2 = group_indices.iter().zip(partial_sums.values().iter());
+        for (&group_index, &new_value) in iter2 {
+            let sum = &mut self.sums[group_index];
+            *sum = sum.add_wrapping(new_value);
+        }
+
+        Ok(())
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
+        let counts = emit_to.take_needed(&mut self.counts);
+        let sums = emit_to.take_needed(&mut self.sums);
+        let mut builder = PrimitiveBuilder::<T>::with_capacity(sums.len());
+        let iter = sums.into_iter().zip(counts);
+
+        for (sum, count) in iter {
+            if count != 0 {
+                builder.append_value((self.avg_fn)(sum, count)?)
+            } else {
+                builder.append_null();
+            }
+        }
+        let array: PrimitiveArray<T> = builder.finish();
+
+        Ok(Arc::new(array))
+    }
+
+    // return arrays for sums and counts
+    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        let counts = emit_to.take_needed(&mut self.counts);
+        let counts = Int64Array::new(counts.into(), None);
+
+        let sums = emit_to.take_needed(&mut self.sums);
+        let sums = PrimitiveArray::<T>::new(sums.into(), None)
+            .with_data_type(self.return_data_type.clone());
+
+        Ok(vec![
+            Arc::new(sums) as ArrayRef,
+            Arc::new(counts) as ArrayRef,
+        ])
+    }
+
+    fn size(&self) -> usize {
+        self.counts.capacity() * size_of::<i64>() + self.sums.capacity() * size_of::<T>()
+    }
+}
diff --git a/datafusion/spark/src/function/aggregate/collect.rs b/datafusion/spark/src/function/aggregate/collect.rs
new file mode 100644
index 0000000000000..50497e2826383
--- /dev/null
+++ b/datafusion/spark/src/function/aggregate/collect.rs
@@ -0,0 +1,200 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::utils::SingleRowListArrayBuilder;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::utils::format_state_name;
+use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use datafusion_functions_aggregate::array_agg::{
+    ArrayAggAccumulator, DistinctArrayAggAccumulator,
+};
+use std::{any::Any, sync::Arc};
+
+// Spark implementation of collect_list/collect_set aggregate function.
+// Differs from DataFusion ArrayAgg in the following ways:
+// - ignores NULL inputs
+// - returns an empty list when all inputs are NULL
+// - does not support ordering
+
+// <https://spark.apache.org/docs/latest/api/sql/index.html#collect_list>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkCollectList {
+    signature: Signature,
+}
+
+impl Default for SparkCollectList {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkCollectList {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::any(1, Volatility::Immutable),
+        }
+    }
+}
+
+impl AggregateUDFImpl for SparkCollectList {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "collect_list"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::List(Arc::new(Field::new_list_field(
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        Ok(vec![
+            Field::new_list(
+                format_state_name(args.name, "collect_list"),
+                Field::new_list_field(args.input_fields[0].data_type().clone(), true),
+                true,
+            )
+            .into(),
+        ])
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        let field = &acc_args.expr_fields[0];
+        let data_type = field.data_type().clone();
+        let ignore_nulls = true;
+        Ok(Box::new(NullToEmptyListAccumulator::new(
+            ArrayAggAccumulator::try_new(&data_type, ignore_nulls)?,
+            data_type,
+        )))
+    }
+}
+
+// <https://spark.apache.org/docs/latest/api/sql/index.html#collect_set>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkCollectSet {
+    signature: Signature,
+}
+
+impl Default for SparkCollectSet {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkCollectSet {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::any(1, Volatility::Immutable),
+        }
+    }
+}
+
+impl AggregateUDFImpl for SparkCollectSet {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "collect_set"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::List(Arc::new(Field::new_list_field(
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        Ok(vec![
+            Field::new_list(
+                format_state_name(args.name, "collect_set"),
+                Field::new_list_field(args.input_fields[0].data_type().clone(), true),
+                true,
+            )
+            .into(),
+        ])
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        let field = &acc_args.expr_fields[0];
+        let data_type = field.data_type().clone();
+        let ignore_nulls = true;
+        Ok(Box::new(NullToEmptyListAccumulator::new(
+            DistinctArrayAggAccumulator::try_new(&data_type, None, ignore_nulls)?,
+            data_type,
+        )))
+    }
+}
+
+/// Wrapper accumulator that returns an empty list instead of NULL when all inputs are NULL.
+/// This implements Spark's behavior for collect_list and collect_set.
+#[derive(Debug)]
+struct NullToEmptyListAccumulator<T: Accumulator> {
+    inner: T,
+    data_type: DataType,
+}
+
+impl<T: Accumulator> NullToEmptyListAccumulator<T> {
+    pub fn new(inner: T, data_type: DataType) -> Self {
+        Self { inner, data_type }
+    }
+}
+
+impl<T: Accumulator> Accumulator for NullToEmptyListAccumulator<T> {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        self.inner.update_batch(values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        self.inner.merge_batch(states)
+    }
+
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        self.inner.state()
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        let result = self.inner.evaluate()?;
+        if result.is_null() {
+            let empty_array = arrow::array::new_empty_array(&self.data_type);
+            Ok(SingleRowListArrayBuilder::new(empty_array).build_list_scalar())
+        } else {
+            Ok(result)
+        }
+    }
+
+    fn size(&self) -> usize {
+        self.inner.size() + self.data_type.size()
+    }
+}
diff --git a/datafusion/spark/src/function/aggregate/mod.rs b/datafusion/spark/src/function/aggregate/mod.rs
index 0856e2872d4f3..d6a2fe7a8503e 100644
--- a/datafusion/spark/src/function/aggregate/mod.rs
+++ b/datafusion/spark/src/function/aggregate/mod.rs
@@ -18,8 +18,45 @@
 use datafusion_expr::AggregateUDF;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+pub mod avg;
+pub mod collect;
+pub mod try_sum;
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((avg, "Returns the average value of a given column", arg1));
+    export_functions!((
+        try_sum,
+        "Returns the sum of values for a column, or NULL if overflow occurs",
+        arg1
+    ));
+    export_functions!((
+        collect_list,
+        "Returns a list created from the values in a column",
+        arg1
+    ));
+    export_functions!((
+        collect_set,
+        "Returns a set created from the values in a column",
+        arg1
+    ));
+}
+
+// TODO: try use something like datafusion_functions_aggregate::create_func!()
+pub fn avg() -> Arc<AggregateUDF> {
+    Arc::new(AggregateUDF::new_from_impl(avg::SparkAvg::new()))
+}
+pub fn try_sum() -> Arc<AggregateUDF> {
+    Arc::new(AggregateUDF::new_from_impl(try_sum::SparkTrySum::new()))
+}
+pub fn collect_list() -> Arc<AggregateUDF> {
+    Arc::new(AggregateUDF::new_from_impl(collect::SparkCollectList::new()))
+}
+pub fn collect_set() -> Arc<AggregateUDF> {
+    Arc::new(AggregateUDF::new_from_impl(collect::SparkCollectSet::new()))
+}
 
 pub fn functions() -> Vec<Arc<AggregateUDF>> {
-    vec![]
+    vec![avg(), try_sum(), collect_list(), collect_set()]
 }
diff --git a/datafusion/spark/src/function/aggregate/try_sum.rs b/datafusion/spark/src/function/aggregate/try_sum.rs
new file mode 100644
index 0000000000000..6509cea26b716
--- /dev/null
+++ b/datafusion/spark/src/function/aggregate/try_sum.rs
@@ -0,0 +1,660 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, ArrowNumericType, AsArray, BooleanArray, PrimitiveArray};
+use arrow::datatypes::{
+    DECIMAL128_MAX_PRECISION, DataType, Decimal128Type, Field, FieldRef, Float64Type,
+    Int64Type,
+};
+use datafusion_common::{Result, ScalarValue, downcast_value, exec_err, not_impl_err};
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::utils::format_state_name;
+use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::fmt::{Debug, Formatter};
+use std::mem::size_of_val;
+
+#[derive(PartialEq, Eq, Hash)]
+pub struct SparkTrySum {
+    signature: Signature,
+}
+
+impl Default for SparkTrySum {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkTrySum {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl Debug for SparkTrySum {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SparkTrySum")
+            .field("signature", &self.signature)
+            .finish()
+    }
+}
+
+/// Accumulator for try_sum that detects overflow
+struct TrySumAccumulator<T: ArrowNumericType> {
+    sum: Option<T::Native>,
+    data_type: DataType,
+    failed: bool,
+    // Only used if data_type is Decimal128(p, s)
+    dec_precision: Option<u8>,
+}
+
+impl<T: ArrowNumericType> Debug for TrySumAccumulator<T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "TrySumAccumulator({})", self.data_type)
+    }
+}
+
+impl<T: ArrowNumericType> TrySumAccumulator<T> {
+    fn new(data_type: DataType) -> Self {
+        let dec_precision = match &data_type {
+            DataType::Decimal128(p, _) => Some(*p),
+            _ => None,
+        };
+        Self {
+            sum: None,
+            data_type,
+            failed: false,
+            dec_precision,
+        }
+    }
+}
+
+impl<T: ArrowNumericType> Accumulator for TrySumAccumulator<T> {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        Ok(vec![
+            self.evaluate()?,
+            ScalarValue::Boolean(Some(self.failed)),
+        ])
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        update_batch_internal(self, values)
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        // Check if any partition has failed
+        if downcast_value!(states[1], BooleanArray)
+            .iter()
+            .flatten()
+            .any(|f| f)
+        {
+            self.failed = true;
+            return Ok(());
+        }
+
+        // Merge the sum values using the same logic as update_batch
+        update_batch_internal(self, states)
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        evaluate_internal(self)
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
+}
+
+// Specialized implementations for update_batch for each type
+
+fn update_batch_internal<T: ArrowNumericType>(
+    acc: &mut TrySumAccumulator<T>,
+    values: &[ArrayRef],
+) -> Result<()> {
+    if values.is_empty() || acc.failed {
+        return Ok(());
+    }
+
+    let array: &PrimitiveArray<T> = values[0].as_primitive::<T>();
+
+    match acc.data_type {
+        DataType::Int64 => update_int64(acc, array),
+        DataType::Float64 => update_float64(acc, array),
+        DataType::Decimal128(_, _) => update_decimal128(acc, array),
+        _ => exec_err!(
+            "try_sum: unsupported type in update_batch: {:?}",
+            acc.data_type
+        ),
+    }
+}
+
+fn update_int64<T: ArrowNumericType>(
+    acc: &mut TrySumAccumulator<T>,
+    array: &PrimitiveArray<T>,
+) -> Result<()> {
+    for v in array.iter().flatten() {
+        // Cast to i64 for checked_add
+        let v_i64 = unsafe { std::mem::transmute_copy::<T::Native, i64>(&v) };
+        let sum_i64 = acc
+            .sum
+            .map(|s| unsafe { std::mem::transmute_copy::<T::Native, i64>(&s) });
+
+        let new_sum = match sum_i64 {
+            None => v_i64,
+            Some(s) => match s.checked_add(v_i64) {
+                Some(result) => result,
+                None => {
+                    acc.failed = true;
+                    return Ok(());
+                }
+            },
+        };
+
+        acc.sum = Some(unsafe { std::mem::transmute_copy::<i64, T::Native>(&new_sum) });
+    }
+    Ok(())
+}
+
+fn update_float64<T: ArrowNumericType>(
+    acc: &mut TrySumAccumulator<T>,
+    array: &PrimitiveArray<T>,
+) -> Result<()> {
+    for v in array.iter().flatten() {
+        let v_f64 = unsafe { std::mem::transmute_copy::<T::Native, f64>(&v) };
+        let sum_f64 = acc
+            .sum
+            .map(|s| unsafe { std::mem::transmute_copy::<T::Native, f64>(&s) })
+            .unwrap_or(0.0);
+        let new_sum = sum_f64 + v_f64;
+        acc.sum = Some(unsafe { std::mem::transmute_copy::<f64, T::Native>(&new_sum) });
+    }
+    Ok(())
+}
+
+fn update_decimal128<T: ArrowNumericType>(
+    acc: &mut TrySumAccumulator<T>,
+    array: &PrimitiveArray<T>,
+) -> Result<()> {
+    let precision = acc.dec_precision.unwrap_or(38);
+
+    for v in array.iter().flatten() {
+        let v_i128 = unsafe { std::mem::transmute_copy::<T::Native, i128>(&v) };
+        let sum_i128 = acc
+            .sum
+            .map(|s| unsafe { std::mem::transmute_copy::<T::Native, i128>(&s) });
+
+        let new_sum = match sum_i128 {
+            None => v_i128,
+            Some(s) => match s.checked_add(v_i128) {
+                Some(result) => result,
+                None => {
+                    acc.failed = true;
+                    return Ok(());
+                }
+            },
+        };
+
+        if exceeds_decimal128_precision(new_sum, precision) {
+            acc.failed = true;
+            return Ok(());
+        }
+
+        acc.sum = Some(unsafe { std::mem::transmute_copy::<i128, T::Native>(&new_sum) });
+    }
+    Ok(())
+}
+
+fn evaluate_internal<T: ArrowNumericType>(
+    acc: &mut TrySumAccumulator<T>,
+) -> Result<ScalarValue> {
+    if acc.failed {
+        return ScalarValue::new_primitive::<T>(None, &acc.data_type);
+    }
+    ScalarValue::new_primitive::<T>(acc.sum, &acc.data_type)
+}
+
+// Helpers to determine if it exceeds decimal precision
+fn pow10_i128(p: u8) -> Option<i128> {
+    let mut v: i128 = 1;
+    for _ in 0..p {
+        v = v.checked_mul(10)?;
+    }
+    Some(v)
+}
+
+fn exceeds_decimal128_precision(sum: i128, p: u8) -> bool {
+    if let Some(max_plus_one) = pow10_i128(p) {
+        let max = max_plus_one - 1;
+        sum > max || sum < -max
+    } else {
+        true
+    }
+}
+
+impl AggregateUDFImpl for SparkTrySum {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "try_sum"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+
+        let dt = &arg_types[0];
+        let result_type = match dt {
+            Null => Float64,
+            Decimal128(p, s) => {
+                let new_precision = DECIMAL128_MAX_PRECISION.min(p + 10);
+                Decimal128(new_precision, *s)
+            }
+            Int8 | Int16 | Int32 | Int64 => Int64,
+            Float16 | Float32 | Float64 => Float64,
+
+            other => return exec_err!("try_sum: unsupported type: {other:?}"),
+        };
+
+        Ok(result_type)
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        macro_rules! helper {
+            ($t:ty, $dt:expr) => {
+                Ok(Box::new(TrySumAccumulator::<$t>::new($dt.clone())))
+            };
+        }
+
+        match acc_args.return_field.data_type() {
+            DataType::Int64 => helper!(Int64Type, acc_args.return_field.data_type()),
+            DataType::Float64 => helper!(Float64Type, acc_args.return_field.data_type()),
+            DataType::Decimal128(_, _) => {
+                helper!(Decimal128Type, acc_args.return_field.data_type())
+            }
+            _ => not_impl_err!(
+                "try_sum: unsupported type for accumulator: {}",
+                acc_args.return_field.data_type()
+            ),
+        }
+    }
+
+    fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<FieldRef>> {
+        let sum_dt = args.return_field.data_type().clone();
+        Ok(vec![
+            Field::new(format_state_name(args.name, "sum"), sum_dt, true).into(),
+            Field::new(
+                format_state_name(args.name, "failed"),
+                DataType::Boolean,
+                false,
+            )
+            .into(),
+        ])
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        use DataType::*;
+        if arg_types.len() != 1 {
+            return exec_err!(
+                "try_sum: exactly 1 argument expected, got {}",
+                arg_types.len()
+            );
+        }
+
+        let dt = &arg_types[0];
+        let coerced = match dt {
+            Null => Float64,
+            Decimal128(p, s) => Decimal128(*p, *s),
+            Int8 | Int16 | Int32 | Int64 => Int64,
+            Float16 | Float32 | Float64 => Float64,
+            other => return exec_err!("try_sum: unsupported type: {other:?}"),
+        };
+        Ok(vec![coerced])
+    }
+
+    fn default_value(&self, _data_type: &DataType) -> Result<ScalarValue> {
+        Ok(ScalarValue::Null)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{BooleanArray, Decimal128Array, Float64Array, Int64Array};
+    use datafusion_common::{DataFusionError, ScalarValue};
+    use std::sync::Arc;
+
+    use super::*;
+    // -------- Helpers --------
+
+    fn int64(values: Vec<Option<i64>>) -> ArrayRef {
+        Arc::new(Int64Array::from(values)) as ArrayRef
+    }
+
+    fn f64(values: Vec<Option<f64>>) -> ArrayRef {
+        Arc::new(Float64Array::from(values)) as ArrayRef
+    }
+
+    fn dec128(p: u8, s: i8, vals: Vec<Option<i128>>) -> Result<ArrayRef> {
+        let base = Decimal128Array::from(vals);
+        let arr = base.with_precision_and_scale(p, s).map_err(|e| {
+            DataFusionError::Execution(format!("invalid precision/scale ({p},{s}): {e}"))
+        })?;
+        Ok(Arc::new(arr) as ArrayRef)
+    }
+
+    // -------- update_batch + evaluate --------
+
+    #[test]
+    fn try_sum_int_basic() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc.update_batch(&[int64((0..10).map(Some).collect())])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Int64(Some(45)));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_int_with_nulls() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc.update_batch(&[int64(vec![None, Some(2), Some(3), None, Some(5)])])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Int64(Some(10)));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_float_basic() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Float64Type>::new(DataType::Float64);
+        acc.update_batch(&[f64(vec![Some(1.5), Some(2.5), None, Some(3.0)])])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Float64(Some(7.0)));
+        Ok(())
+    }
+
+    #[test]
+    fn float_overflow_behaves_like_spark_sum_infinite() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Float64Type>::new(DataType::Float64);
+        acc.update_batch(&[f64(vec![Some(1e308), Some(1e308)])])?;
+
+        let out = acc.evaluate()?;
+        assert!(
+            matches!(out, ScalarValue::Float64(Some(v)) if v.is_infinite() && v.is_sign_positive()),
+            "waiting +Infinity, got: {out:?}"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_float_negative_zero_normalizes_to_positive_zero() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Float64Type>::new(DataType::Float64);
+        // -0.0 + 0.0 should normalize to 0.0 (positive zero), not -0.0
+        acc.update_batch(&[f64(vec![Some(-0.0), Some(0.0)])])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Float64(Some(0.0)));
+        // Verify it's positive zero using is_sign_positive
+        if let ScalarValue::Float64(Some(v)) = out {
+            assert!(v.is_sign_positive() || v == 0.0);
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_decimal_basic() -> Result<()> {
+        let p = 10u8;
+        let s = 2i8;
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        acc.update_batch(&[dec128(p, s, vec![Some(123), Some(477)])?])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Decimal128(Some(600), p, s));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_decimal_with_nulls() -> Result<()> {
+        let p = 10u8;
+        let s = 2i8;
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        acc.update_batch(&[dec128(p, s, vec![Some(150), None, Some(200)])?])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Decimal128(Some(350), p, s));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_decimal_overflow_sets_failed() -> Result<()> {
+        let p = 5u8;
+        let s = 0i8;
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        acc.update_batch(&[dec128(p, s, vec![Some(90_000), Some(20_000)])?])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Decimal128(None, p, s));
+        assert!(acc.failed);
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_decimal_merge_ok_and_failure_propagation() -> Result<()> {
+        let p = 10u8;
+        let s = 2i8;
+
+        let mut p_ok =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        p_ok.update_batch(&[dec128(p, s, vec![Some(100), Some(200)])?])?;
+        let s_ok = p_ok
+            .state()?
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<Vec<_>>>()?;
+
+        let mut p_fail =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        p_fail.update_batch(&[dec128(p, s, vec![Some(i128::MAX), Some(1)])?])?;
+        let s_fail = p_fail
+            .state()?
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<Vec<_>>>()?;
+
+        let mut final_acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(p, s));
+        final_acc.merge_batch(&s_ok)?;
+        final_acc.merge_batch(&s_fail)?;
+
+        assert!(final_acc.failed);
+        assert_eq!(final_acc.evaluate()?, ScalarValue::Decimal128(None, p, s));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_int_overflow_sets_failed() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        // i64::MAX + 1 => overflow => failed => result NULL
+        acc.update_batch(&[int64(vec![Some(i64::MAX), Some(1)])])?;
+        let out = acc.evaluate()?;
+        assert_eq!(out, ScalarValue::Int64(None));
+        assert!(acc.failed);
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_int_negative_overflow_sets_failed() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        // i64::MIN - 1 → overflow negative
+        acc.update_batch(&[int64(vec![Some(i64::MIN), Some(-1)])])?;
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(None));
+        assert!(acc.failed);
+        Ok(())
+    }
+
+    // -------- state + merge_batch --------
+
+    #[test]
+    fn try_sum_state_two_fields_and_merge_ok() -> Result<()> {
+        // acumulador 1 [10, 5] -> sum=15
+        let mut acc1 = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc1.update_batch(&[int64(vec![Some(10), Some(5)])])?;
+        let state1 = acc1.state()?; // [sum, failed]
+        assert_eq!(state1.len(), 2);
+
+        // acumulador 2 [20, NULL] -> sum=20
+        let mut acc2 = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc2.update_batch(&[int64(vec![Some(20), None])])?;
+        let state2 = acc2.state()?; // [sum, failed]
+
+        let state1_arrays: Vec<ArrayRef> = state1
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<_>>()?;
+
+        let state2_arrays: Vec<ArrayRef> = state2
+            .into_iter()
+            .map(|sv| sv.to_array())
+            .collect::<Result<_>>()?;
+
+        // final accumulator
+        let mut final_acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+
+        final_acc.merge_batch(&state1_arrays)?;
+        final_acc.merge_batch(&state2_arrays)?;
+
+        // sum total = 15 + 20 = 35
+        assert!(!final_acc.failed);
+        assert_eq!(final_acc.evaluate()?, ScalarValue::Int64(Some(35)));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_merge_propagates_failure() -> Result<()> {
+        // sum=NULL, failed=true
+        let failed_sum = Arc::new(Int64Array::from(vec![None])) as ArrayRef;
+        let failed_flag = Arc::new(BooleanArray::from(vec![Some(true)])) as ArrayRef;
+
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc.merge_batch(&[failed_sum, failed_flag])?;
+
+        assert!(acc.failed);
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(None));
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_merge_empty_partition_is_not_failure() -> Result<()> {
+        // sum=NULL, failed=false
+        let empty_sum = Arc::new(Int64Array::from(vec![None])) as ArrayRef;
+        let ok_flag = Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef;
+
+        let mut acc = TrySumAccumulator::<Int64Type>::new(DataType::Int64);
+        acc.update_batch(&[int64(vec![Some(7), Some(8)])])?; // 15
+
+        acc.merge_batch(&[empty_sum, ok_flag])?;
+
+        assert!(!acc.failed);
+        assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(15)));
+        Ok(())
+    }
+
+    // -------- signature --------
+
+    #[test]
+    fn try_sum_return_type_matches_input() -> Result<()> {
+        let f = SparkTrySum::new();
+        assert_eq!(f.return_type(&[DataType::Int64])?, DataType::Int64);
+        assert_eq!(f.return_type(&[DataType::Float64])?, DataType::Float64);
+        Ok(())
+    }
+
+    #[test]
+    fn try_sum_state_and_evaluate_consistency() -> Result<()> {
+        let mut acc = TrySumAccumulator::<Float64Type>::new(DataType::Float64);
+        acc.update_batch(&[f64(vec![Some(1.0), Some(2.0)])])?;
+        let eval = acc.evaluate()?;
+        let state = acc.state()?;
+        assert_eq!(state[0], eval);
+        assert_eq!(state[1], ScalarValue::Boolean(Some(false)));
+        Ok(())
+    }
+
+    // -------------------------
+    // DECIMAL
+    // -------------------------
+
+    #[test]
+    fn decimal_10_2_sum_and_schema_widened() -> Result<()> {
+        // input: DECIMAL(10,2)  -> result: DECIMAL(20,2)
+        let f = SparkTrySum::new();
+        assert_eq!(
+            f.return_type(&[DataType::Decimal128(10, 2)])?,
+            DataType::Decimal128(20, 2),
+            "Spark needs +10 more digits of precision"
+        );
+
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(20, 2));
+        acc.update_batch(&[dec128(10, 2, vec![Some(123), Some(477)])?])?;
+        assert_eq!(acc.evaluate()?, ScalarValue::Decimal128(Some(600), 20, 2));
+        Ok(())
+    }
+
+    #[test]
+    fn decimal_5_0_fits_after_widening() -> Result<()> {
+        // input: DECIMAL(5,0) -> result: DECIMAL(15,0)
+        let f = SparkTrySum::new();
+        assert_eq!(
+            f.return_type(&[DataType::Decimal128(5, 0)])?,
+            DataType::Decimal128(15, 0)
+        );
+
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(15, 0));
+        acc.update_batch(&[dec128(5, 0, vec![Some(90_000), Some(20_000)])?])?;
+        assert_eq!(
+            acc.evaluate()?,
+            ScalarValue::Decimal128(Some(110_000), 15, 0)
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn decimal_38_0_max_precision_overflows_to_null() -> Result<()> {
+        let f = SparkTrySum::new();
+        assert_eq!(
+            f.return_type(&[DataType::Decimal128(38, 0)])?,
+            DataType::Decimal128(38, 0)
+        );
+        let ten_pow_38_minus_1 = {
+            let p10 = pow10_i128(38)
+                .ok_or_else(|| DataFusionError::Internal("10^38 overflow".into()))?;
+            p10 - 1
+        };
+        let mut acc =
+            TrySumAccumulator::<Decimal128Type>::new(DataType::Decimal128(38, 0));
+        acc.update_batch(&[dec128(38, 0, vec![Some(ten_pow_38_minus_1), Some(1)])?])?;
+
+        assert!(acc.failed, "need fail in overflow p=38");
+        assert_eq!(acc.evaluate()?, ScalarValue::Decimal128(None, 38, 0));
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/array/array_contains.rs b/datafusion/spark/src/function/array/array_contains.rs
new file mode 100644
index 0000000000000..2bc5d64d8bff8
--- /dev/null
+++ b/datafusion/spark/src/function/array/array_contains.rs
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, AsArray, BooleanArray, BooleanBufferBuilder, GenericListArray, OffsetSizeTrait,
+};
+use arrow::buffer::{BooleanBuffer, NullBuffer};
+use arrow::datatypes::DataType;
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions_nested::array_has::array_has_udf;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `array_contains` function.
+///
+/// Calls DataFusion's `array_has` and then applies Spark's null semantics:
+/// - If the result from `array_has` is `true`, return `true`.
+/// - If the result is `false` and the input array row contains any null elements,
+///   return `null` (because the element might have been the null).
+/// - If the result is `false` and the input array row has no null elements,
+///   return `false`.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkArrayContains {
+    signature: Signature,
+}
+
+impl Default for SparkArrayContains {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkArrayContains {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array_and_element(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkArrayContains {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "array_contains"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Boolean)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let haystack = args.args[0].clone();
+        let array_has_result = array_has_udf().invoke_with_args(args)?;
+
+        let result_array = array_has_result.to_array(1)?;
+        let patched = apply_spark_null_semantics(result_array.as_boolean(), &haystack)?;
+        Ok(ColumnarValue::Array(Arc::new(patched)))
+    }
+}
+
+/// For each row where `array_has` returned `false`, set the output to null
+/// if that row's input array contains any null elements.
+fn apply_spark_null_semantics(
+    result: &BooleanArray,
+    haystack_arg: &ColumnarValue,
+) -> Result<BooleanArray> {
+    // happy path
+    if result.false_count() == 0 || haystack_arg.data_type() == DataType::Null {
+        return Ok(result.clone());
+    }
+
+    let haystack = haystack_arg.to_array_of_size(result.len())?;
+
+    let row_has_nulls = compute_row_has_nulls(&haystack)?;
+
+    // A row keeps its validity when result is true OR the row has no nulls.
+    let keep_mask = result.values() | &!&row_has_nulls;
+    let new_validity = match result.nulls() {
+        Some(n) => n.inner() & &keep_mask,
+        None => keep_mask,
+    };
+
+    Ok(BooleanArray::new(
+        result.values().clone(),
+        Some(NullBuffer::new(new_validity)),
+    ))
+}
+
+/// Returns a per-row bitmap where bit i is set if row i's list contains any null element.
+fn compute_row_has_nulls(haystack: &dyn Array) -> Result<BooleanBuffer> {
+    match haystack.data_type() {
+        DataType::List(_) => generic_list_row_has_nulls(haystack.as_list::<i32>()),
+        DataType::LargeList(_) => generic_list_row_has_nulls(haystack.as_list::<i64>()),
+        DataType::FixedSizeList(_, _) => {
+            let list = haystack.as_fixed_size_list();
+            let buf = match list.values().nulls() {
+                Some(nulls) => {
+                    let validity = nulls.inner();
+                    let vl = list.value_length() as usize;
+                    let mut builder = BooleanBufferBuilder::new(list.len());
+                    for i in 0..list.len() {
+                        builder.append(validity.slice(i * vl, vl).count_set_bits() < vl);
+                    }
+                    builder.finish()
+                }
+                None => BooleanBuffer::new_unset(list.len()),
+            };
+            Ok(mask_with_list_nulls(buf, list.nulls()))
+        }
+        dt => exec_err!("compute_row_has_nulls: unsupported data type {dt}"),
+    }
+}
+
+/// Computes per-row null presence for `List` and `LargeList` arrays.
+fn generic_list_row_has_nulls<O: OffsetSizeTrait>(
+    list: &GenericListArray<O>,
+) -> Result<BooleanBuffer> {
+    let buf = match list.values().nulls() {
+        Some(nulls) => {
+            let validity = nulls.inner();
+            let offsets = list.offsets();
+            let mut builder = BooleanBufferBuilder::new(list.len());
+            for i in 0..list.len() {
+                let s = offsets[i].as_usize();
+                let len = offsets[i + 1].as_usize() - s;
+                builder.append(validity.slice(s, len).count_set_bits() < len);
+            }
+            builder.finish()
+        }
+        None => BooleanBuffer::new_unset(list.len()),
+    };
+    Ok(mask_with_list_nulls(buf, list.nulls()))
+}
+
+/// Rows where the list itself is null should not be marked as "has nulls".
+fn mask_with_list_nulls(
+    buf: BooleanBuffer,
+    list_nulls: Option<&NullBuffer>,
+) -> BooleanBuffer {
+    match list_nulls {
+        Some(n) => &buf & n.inner(),
+        None => buf,
+    }
+}
diff --git a/datafusion/spark/src/function/array/mod.rs b/datafusion/spark/src/function/array/mod.rs
index a87df9a2c87a0..6c16e05361641 100644
--- a/datafusion/spark/src/function/array/mod.rs
+++ b/datafusion/spark/src/function/array/mod.rs
@@ -15,11 +15,54 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod array_contains;
+pub mod repeat;
+pub mod shuffle;
+pub mod slice;
+pub mod spark_array;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(array_contains::SparkArrayContains, spark_array_contains);
+make_udf_function!(spark_array::SparkArray, array);
+make_udf_function!(shuffle::SparkShuffle, shuffle);
+make_udf_function!(repeat::SparkArrayRepeat, array_repeat);
+make_udf_function!(slice::SparkSlice, slice);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        spark_array_contains,
+        "Returns true if the array contains the element (Spark semantics).",
+        array element
+    ));
+    export_functions!((array, "Returns an array with the given elements.", args));
+    export_functions!((
+        shuffle,
+        "Returns a random permutation of the given array.",
+        args
+    ));
+    export_functions!((
+        array_repeat,
+        "returns an array containing element count times.",
+        element count
+    ));
+    export_functions!((
+        slice,
+        "Returns a slice of the array from the start index with the given length.",
+        array start length
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![
+        spark_array_contains(),
+        array(),
+        shuffle(),
+        array_repeat(),
+        slice(),
+    ]
 }
diff --git a/datafusion/spark/src/function/array/repeat.rs b/datafusion/spark/src/function/array/repeat.rs
new file mode 100644
index 0000000000000..7543300a91078
--- /dev/null
+++ b/datafusion/spark/src/function/array/repeat.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions_nested::repeat::ArrayRepeat;
+use std::any::Any;
+use std::sync::Arc;
+
+use crate::function::null_utils::{
+    NullMaskResolution, apply_null_mask, compute_null_mask,
+};
+
+/// Spark-compatible `array_repeat` expression. The difference with DataFusion's `array_repeat` is the handling of NULL inputs: in spark if any input is NULL, the result is NULL.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#array_repeat>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkArrayRepeat {
+    signature: Signature,
+}
+
+impl Default for SparkArrayRepeat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkArrayRepeat {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkArrayRepeat {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "array_repeat"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::List(Arc::new(Field::new_list_field(
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_array_repeat(args)
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        let [first_type, second_type] = take_function_args(self.name(), arg_types)?;
+
+        // Coerce the second argument to Int64/UInt64 if it's a numeric type
+        let second = match second_type {
+            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
+                DataType::Int64
+            }
+            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
+                DataType::UInt64
+            }
+            _ => return exec_err!("count must be an integer type"),
+        };
+
+        Ok(vec![first_type.clone(), second])
+    }
+}
+
+/// This is a Spark-specific wrapper around DataFusion's array_repeat that returns NULL
+/// if any argument is NULL (Spark behavior), whereas DataFusion's array_repeat ignores NULLs.
+fn spark_array_repeat(args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+    let ScalarFunctionArgs {
+        args: arg_values,
+        arg_fields,
+        number_rows,
+        return_field,
+        config_options,
+    } = args;
+    let return_type = return_field.data_type().clone();
+
+    // Step 1: Check for NULL mask in incoming args
+    let null_mask = compute_null_mask(&arg_values, number_rows)?;
+
+    // If any argument is null then return NULL immediately
+    if matches!(null_mask, NullMaskResolution::ReturnNull) {
+        return Ok(ColumnarValue::Scalar(ScalarValue::try_from(return_type)?));
+    }
+
+    // Step 2: Delegate to DataFusion's array_repeat
+    let array_repeat_func = ArrayRepeat::new();
+    let func_args = ScalarFunctionArgs {
+        args: arg_values,
+        arg_fields,
+        number_rows,
+        return_field,
+        config_options,
+    };
+    let result = array_repeat_func.invoke_with_args(func_args)?;
+
+    // Step 3: Apply NULL mask to result
+    apply_null_mask(result, null_mask, &return_type)
+}
diff --git a/datafusion/spark/src/function/array/shuffle.rs b/datafusion/spark/src/function/array/shuffle.rs
new file mode 100644
index 0000000000000..bb3b02449a1ca
--- /dev/null
+++ b/datafusion/spark/src/function/array/shuffle.rs
@@ -0,0 +1,320 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray, MutableArrayData,
+    OffsetSizeTrait,
+};
+use arrow::buffer::OffsetBuffer;
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::{FixedSizeList, LargeList, List, Null};
+use arrow::datatypes::FieldRef;
+use datafusion_common::cast::{
+    as_fixed_size_list_array, as_large_list_array, as_list_array,
+};
+use datafusion_common::{
+    Result, ScalarValue, exec_err, internal_err, utils::take_function_args,
+};
+use datafusion_expr::{
+    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignature, Volatility,
+};
+use rand::rng;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng, seq::SliceRandom};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkShuffle {
+    signature: Signature,
+}
+
+impl Default for SparkShuffle {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkShuffle {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature {
+                type_signature: TypeSignature::OneOf(vec![
+                    // Only array argument
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![ArrayFunctionArgument::Array],
+                        array_coercion: None,
+                    }),
+                    // Array + Index (seed) argument
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![
+                            ArrayFunctionArgument::Array,
+                            ArrayFunctionArgument::Index,
+                        ],
+                        array_coercion: None,
+                    }),
+                ]),
+                volatility: Volatility::Volatile,
+                parameter_names: None,
+            },
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkShuffle {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "shuffle"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        // Shuffle returns an array with the same type and nullability as the input
+        Ok(Arc::clone(&args.arg_fields[0]))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.is_empty() || args.args.len() > 2 {
+            return exec_err!("shuffle expects 1 or 2 argument(s)");
+        }
+
+        // Extract seed from second argument if present
+        let seed = if args.args.len() == 2 {
+            extract_seed(&args.args[1])?
+        } else {
+            None
+        };
+
+        // Convert arguments to arrays
+        let arrays = ColumnarValue::values_to_arrays(&args.args[..1])?;
+        array_shuffle_with_seed(&arrays, seed).map(ColumnarValue::Array)
+    }
+}
+
+/// Extract seed value from ColumnarValue
+fn extract_seed(seed_arg: &ColumnarValue) -> Result<Option<u64>> {
+    match seed_arg {
+        ColumnarValue::Scalar(scalar) => {
+            let seed = match scalar {
+                ScalarValue::Int64(Some(v)) => Some(*v as u64),
+                ScalarValue::Null | ScalarValue::Int64(None) => None,
+                _ => {
+                    return exec_err!(
+                        "shuffle seed must be Int64 type but got '{}'",
+                        scalar.data_type()
+                    );
+                }
+            };
+            Ok(seed)
+        }
+        ColumnarValue::Array(_) => {
+            exec_err!("shuffle seed must be a scalar value, not an array")
+        }
+    }
+}
+
+/// array_shuffle SQL function with optional seed
+fn array_shuffle_with_seed(arg: &[ArrayRef], seed: Option<u64>) -> Result<ArrayRef> {
+    let [input_array] = take_function_args("shuffle", arg)?;
+    match &input_array.data_type() {
+        List(field) => {
+            let array = as_list_array(input_array)?;
+            general_array_shuffle::<i32>(array, field, seed)
+        }
+        LargeList(field) => {
+            let array = as_large_list_array(input_array)?;
+            general_array_shuffle::<i64>(array, field, seed)
+        }
+        FixedSizeList(field, _) => {
+            let array = as_fixed_size_list_array(input_array)?;
+            fixed_size_array_shuffle(array, field, seed)
+        }
+        Null => Ok(Arc::clone(input_array)),
+        array_type => exec_err!(
+            "shuffle does not support type '{array_type}'; \
+        expected types: List, LargeList, FixedSizeList or Null."
+        ),
+    }
+}
+
+fn general_array_shuffle<O: OffsetSizeTrait>(
+    array: &GenericListArray<O>,
+    field: &FieldRef,
+    seed: Option<u64>,
+) -> Result<ArrayRef> {
+    let values = array.values();
+    let original_data = values.to_data();
+    let capacity = Capacities::Array(original_data.len());
+    let mut offsets = vec![O::usize_as(0)];
+    let mut nulls = vec![];
+    let mut mutable =
+        MutableArrayData::with_capacities(vec![&original_data], false, capacity);
+    let mut rng = if let Some(s) = seed {
+        StdRng::seed_from_u64(s)
+    } else {
+        // Use a random seed from the thread-local RNG
+        let seed = rng().random::<u64>();
+        StdRng::seed_from_u64(seed)
+    };
+
+    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
+        // skip the null value
+        if array.is_null(row_index) {
+            nulls.push(false);
+            offsets.push(offsets[row_index] + O::one());
+            mutable.extend(0, 0, 1);
+            continue;
+        }
+        nulls.push(true);
+        let start = offset_window[0];
+        let end = offset_window[1];
+        let length = (end - start).to_usize().unwrap();
+
+        // Create indices and shuffle them
+        let mut indices: Vec<usize> =
+            (start.to_usize().unwrap()..end.to_usize().unwrap()).collect();
+        indices.shuffle(&mut rng);
+
+        // Add shuffled elements
+        for &index in &indices {
+            mutable.extend(0, index, index + 1);
+        }
+
+        offsets.push(offsets[row_index] + O::usize_as(length));
+    }
+
+    let data = mutable.freeze();
+    Ok(Arc::new(GenericListArray::<O>::try_new(
+        Arc::clone(field),
+        OffsetBuffer::<O>::new(offsets.into()),
+        arrow::array::make_array(data),
+        Some(nulls.into()),
+    )?))
+}
+
+fn fixed_size_array_shuffle(
+    array: &FixedSizeListArray,
+    field: &FieldRef,
+    seed: Option<u64>,
+) -> Result<ArrayRef> {
+    let values = array.values();
+    let original_data = values.to_data();
+    let capacity = Capacities::Array(original_data.len());
+    let mut nulls = vec![];
+    let mut mutable =
+        MutableArrayData::with_capacities(vec![&original_data], false, capacity);
+    let value_length = array.value_length() as usize;
+    let mut rng = if let Some(s) = seed {
+        StdRng::seed_from_u64(s)
+    } else {
+        // Use a random seed from the thread-local RNG
+        let seed = rng().random::<u64>();
+        StdRng::seed_from_u64(seed)
+    };
+
+    for row_index in 0..array.len() {
+        // skip the null value
+        if array.is_null(row_index) {
+            nulls.push(false);
+            mutable.extend(0, 0, value_length);
+            continue;
+        }
+        nulls.push(true);
+
+        let start = row_index * value_length;
+        let end = start + value_length;
+
+        // Create indices and shuffle them
+        let mut indices: Vec<usize> = (start..end).collect();
+        indices.shuffle(&mut rng);
+
+        // Add shuffled elements
+        for &index in &indices {
+            mutable.extend(0, index, index + 1);
+        }
+    }
+
+    let data = mutable.freeze();
+    Ok(Arc::new(FixedSizeListArray::try_new(
+        Arc::clone(field),
+        array.value_length(),
+        arrow::array::make_array(data),
+        Some(nulls.into()),
+    )?))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_shuffle_nullability() {
+        let shuffle = SparkShuffle::new();
+
+        // Test with non-nullable array
+        let non_nullable_field = Arc::new(Field::new(
+            "arr",
+            List(Arc::new(Field::new("item", DataType::Int32, true))),
+            false, // not nullable
+        ));
+
+        let result = shuffle
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // The result should not be nullable (same as input)
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), non_nullable_field.data_type());
+
+        // Test with nullable array
+        let nullable_field = Arc::new(Field::new(
+            "arr",
+            List(Arc::new(Field::new("item", DataType::Int32, true))),
+            true, // nullable
+        ));
+
+        let result = shuffle
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // The result should be nullable (same as input)
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), nullable_field.data_type());
+    }
+}
diff --git a/datafusion/spark/src/function/array/slice.rs b/datafusion/spark/src/function/array/slice.rs
new file mode 100644
index 0000000000000..6c168a4f491b5
--- /dev/null
+++ b/datafusion/spark/src/function/array/slice.rs
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, Int64Builder};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::cast::{as_int64_array, as_list_array};
+use datafusion_common::utils::ListCoercion;
+use datafusion_common::{Result, exec_err, internal_err, utils::take_function_args};
+use datafusion_expr::{
+    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, ReturnFieldArgs,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
+};
+use datafusion_functions_nested::extract::array_slice_udf;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark slice function implementation
+/// Main difference from DataFusion's array_slice is that the third argument is the length of the slice and not the end index.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#slice>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSlice {
+    signature: Signature,
+}
+
+impl Default for SparkSlice {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSlice {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature {
+                type_signature: TypeSignature::ArraySignature(
+                    ArrayFunctionSignature::Array {
+                        arguments: vec![
+                            ArrayFunctionArgument::Array,
+                            ArrayFunctionArgument::Index,
+                            ArrayFunctionArgument::Index,
+                        ],
+                        array_coercion: Some(ListCoercion::FixedSizedListToList),
+                    },
+                ),
+                volatility: Volatility::Immutable,
+                parameter_names: None,
+            },
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSlice {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "slice"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            "slice",
+            args.arg_fields[0].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(
+        &self,
+        mut func_args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let array_len = func_args
+            .args
+            .iter()
+            .find_map(|arg| match arg {
+                ColumnarValue::Array(array) => Some(array.len()),
+                _ => None,
+            })
+            .unwrap_or(func_args.number_rows);
+
+        let arrays = func_args
+            .args
+            .iter()
+            .map(|arg| match arg {
+                ColumnarValue::Array(array) => Ok(Arc::clone(array)),
+                ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(array_len),
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let (start, end) = calculate_start_end(&arrays)?;
+
+        array_slice_udf().invoke_with_args(ScalarFunctionArgs {
+            args: vec![
+                func_args.args.swap_remove(0),
+                ColumnarValue::Array(start),
+                ColumnarValue::Array(end),
+            ],
+            arg_fields: func_args.arg_fields,
+            number_rows: func_args.number_rows,
+            return_field: func_args.return_field,
+            config_options: func_args.config_options,
+        })
+    }
+}
+
+fn calculate_start_end(args: &[ArrayRef]) -> Result<(ArrayRef, ArrayRef)> {
+    let [values, start, length] = take_function_args("slice", args)?;
+
+    let values_len = values.len();
+
+    let start = as_int64_array(&start)?;
+    let length = as_int64_array(&length)?;
+
+    let values = as_list_array(values)?;
+
+    let mut adjusted_start = Int64Builder::with_capacity(values_len);
+    let mut end = Int64Builder::with_capacity(values_len);
+
+    for row in 0..values_len {
+        if values.is_null(row) || start.is_null(row) || length.is_null(row) {
+            adjusted_start.append_null();
+            end.append_null();
+            continue;
+        }
+        let start = start.value(row);
+        let length = length.value(row);
+        let value_length = values.value(row).len() as i64;
+
+        if start == 0 {
+            return exec_err!("Start index must not be zero");
+        }
+        if length < 0 {
+            return exec_err!("Length must be non-negative, but got {}", length);
+        }
+
+        let adjusted_start_value = if start < 0 {
+            start + value_length + 1
+        } else {
+            start
+        };
+
+        adjusted_start.append_value(adjusted_start_value);
+        end.append_value(adjusted_start_value + (length - 1));
+    }
+
+    Ok((Arc::new(adjusted_start.finish()), Arc::new(end.finish())))
+}
diff --git a/datafusion/spark/src/function/array/spark_array.rs b/datafusion/spark/src/function/array/spark_array.rs
new file mode 100644
index 0000000000000..1ad0a394b8ca6
--- /dev/null
+++ b/datafusion/spark/src/function/array/spark_array.rs
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{any::Any, sync::Arc};
+
+use arrow::array::{Array, ArrayRef, new_null_array};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::utils::SingleRowListArrayBuilder;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_functions_nested::make_array::{array_array, coerce_types_inner};
+
+use crate::function::functions_nested_utils::make_scalar_function;
+
+const ARRAY_FIELD_DEFAULT_NAME: &str = "element";
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkArray {
+    signature: Signature,
+}
+
+impl Default for SparkArray {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkArray {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkArray {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "array"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let data_types = args
+            .arg_fields
+            .iter()
+            .map(|f| f.data_type())
+            .cloned()
+            .collect::<Vec<_>>();
+
+        let mut expr_type = DataType::Null;
+        for arg_type in &data_types {
+            if !arg_type.equals_datatype(&DataType::Null) {
+                expr_type = arg_type.clone();
+                break;
+            }
+        }
+
+        let return_type = DataType::List(Arc::new(Field::new(
+            ARRAY_FIELD_DEFAULT_NAME,
+            expr_type,
+            true,
+        )));
+
+        Ok(Arc::new(Field::new(
+            "this_field_name_is_irrelevant",
+            return_type,
+            false,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(make_array_inner)(args.as_slice())
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.is_empty() {
+            Ok(vec![])
+        } else {
+            coerce_types_inner(arg_types, self.name())
+        }
+    }
+}
+
+/// `make_array_inner` is the implementation of the `make_array` function.
+/// Constructs an array using the input `data` as `ArrayRef`.
+/// Returns a reference-counted `Array` instance result.
+pub fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
+    let mut data_type = DataType::Null;
+    for arg in arrays {
+        let arg_data_type = arg.data_type();
+        if !arg_data_type.equals_datatype(&DataType::Null) {
+            data_type = arg_data_type.clone();
+            break;
+        }
+    }
+
+    match data_type {
+        // Either an empty array or all nulls:
+        DataType::Null => {
+            let length = arrays.iter().map(|a| a.len()).sum();
+            // By default Int32
+            let array = new_null_array(&DataType::Null, length);
+            Ok(Arc::new(
+                SingleRowListArrayBuilder::new(array)
+                    .with_nullable(true)
+                    .with_field_name(Some(ARRAY_FIELD_DEFAULT_NAME.to_string()))
+                    .build_list_array(),
+            ))
+        }
+        _ => array_array::<i32>(arrays, data_type, ARRAY_FIELD_DEFAULT_NAME),
+    }
+}
diff --git a/datafusion/spark/src/function/bitmap/bitmap_bit_position.rs b/datafusion/spark/src/function/bitmap/bitmap_bit_position.rs
new file mode 100644
index 0000000000000..3871d00cc91d8
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_bit_position.rs
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, AsArray, Int64Array};
+use arrow::datatypes::Field;
+use arrow::datatypes::{DataType, FieldRef, Int8Type, Int16Type, Int32Type, Int64Type};
+use datafusion::logical_expr::{ColumnarValue, Signature, TypeSignature, Volatility};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `bitmap_bit_position` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#bitmap_bit_position>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapBitPosition {
+    signature: Signature,
+}
+
+impl Default for BitmapBitPosition {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BitmapBitPosition {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Int32]),
+                    TypeSignature::Exact(vec![DataType::Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for BitmapBitPosition {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bitmap_bit_position"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Int64,
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(bitmap_bit_position_inner, vec![])(&args.args)
+    }
+}
+
+pub fn bitmap_bit_position_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [array] = take_function_args("bitmap_bit_position", arg)?;
+    match &array.data_type() {
+        DataType::Int8 => {
+            let result: Int64Array = array
+                .as_primitive::<Int8Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bit_position(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int16 => {
+            let result: Int64Array = array
+                .as_primitive::<Int16Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bit_position(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int32 => {
+            let result: Int64Array = array
+                .as_primitive::<Int32Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bit_position(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int64 => {
+            let result: Int64Array = array
+                .as_primitive::<Int64Type>()
+                .iter()
+                .map(|opt| opt.map(bitmap_bit_position))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        data_type => {
+            internal_err!("bitmap_bit_position does not support {data_type}")
+        }
+    }
+}
+
+const NUM_BYTES: i64 = 4 * 1024;
+const NUM_BITS: i64 = NUM_BYTES * 8;
+
+fn bitmap_bit_position(value: i64) -> i64 {
+    if value > 0 {
+        (value - 1) % NUM_BITS
+    } else {
+        (value.wrapping_neg()) % NUM_BITS
+    }
+}
diff --git a/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs b/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs
new file mode 100644
index 0000000000000..fe72a4fe8ac3e
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, AsArray, Int64Array};
+use arrow::datatypes::Field;
+use arrow::datatypes::{DataType, FieldRef, Int8Type, Int16Type, Int32Type, Int64Type};
+use datafusion::logical_expr::{ColumnarValue, Signature, TypeSignature, Volatility};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `bitmap_bucket_number` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#bitmap_bucket_number>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapBucketNumber {
+    signature: Signature,
+}
+
+impl Default for BitmapBucketNumber {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BitmapBucketNumber {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Int32]),
+                    TypeSignature::Exact(vec![DataType::Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for BitmapBucketNumber {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bitmap_bucket_number"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Int64,
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(bitmap_bucket_number_inner, vec![])(&args.args)
+    }
+}
+
+pub fn bitmap_bucket_number_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [array] = take_function_args("bitmap_bucket_number", arg)?;
+    match &array.data_type() {
+        DataType::Int8 => {
+            let result: Int64Array = array
+                .as_primitive::<Int8Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bucket_number(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int16 => {
+            let result: Int64Array = array
+                .as_primitive::<Int16Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bucket_number(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int32 => {
+            let result: Int64Array = array
+                .as_primitive::<Int32Type>()
+                .iter()
+                .map(|opt| opt.map(|value| bitmap_bucket_number(value.into())))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int64 => {
+            let result: Int64Array = array
+                .as_primitive::<Int64Type>()
+                .iter()
+                .map(|opt| opt.map(bitmap_bucket_number))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        data_type => {
+            internal_err!("bitmap_bucket_number does not support {data_type}")
+        }
+    }
+}
+
+const NUM_BYTES: i64 = 4 * 1024;
+const NUM_BITS: i64 = NUM_BYTES * 8;
+
+fn bitmap_bucket_number(value: i64) -> i64 {
+    if value > 0 {
+        1 + (value - 1) / NUM_BITS
+    } else {
+        value / NUM_BITS
+    }
+}
diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs
new file mode 100644
index 0000000000000..e59bc5f529317
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array,
+    LargeBinaryArray, as_dictionary_array,
+};
+use arrow::datatypes::DataType::{
+    Binary, BinaryView, Dictionary, FixedSizeBinary, LargeBinary,
+};
+use arrow::datatypes::{DataType, FieldRef, Int8Type, Int16Type, Int32Type, Int64Type};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
+use datafusion_functions::downcast_arg;
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapCount {
+    signature: Signature,
+}
+
+impl Default for BitmapCount {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BitmapCount {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Binary)],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for BitmapCount {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bitmap_count"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        use arrow::datatypes::Field;
+        // bitmap_count returns Int64 with the same nullability as the input
+        Ok(Arc::new(Field::new(
+            args.arg_fields[0].name(),
+            DataType::Int64,
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(bitmap_count_inner, vec![])(&args.args)
+    }
+}
+
+fn binary_count_ones(opt: Option<&[u8]>) -> Option<i64> {
+    opt.map(|value| value.iter().map(|b| b.count_ones() as i64).sum())
+}
+
+macro_rules! downcast_and_count_ones {
+    ($input_array:expr, $array_type:ident) => {{
+        let arr = downcast_arg!($input_array, $array_type);
+        Ok(arr.iter().map(binary_count_ones).collect::<Int64Array>())
+    }};
+}
+
+macro_rules! downcast_dict_and_count_ones {
+    ($input_dict:expr, $key_array_type:ident) => {{
+        let dict_array = as_dictionary_array::<$key_array_type>($input_dict);
+        let array = dict_array.downcast_dict::<BinaryArray>().unwrap();
+        Ok(array
+            .into_iter()
+            .map(binary_count_ones)
+            .collect::<Int64Array>())
+    }};
+}
+
+pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input_array] = take_function_args("bitmap_count", arg)?;
+
+    let res: Result<Int64Array> = match &input_array.data_type() {
+        Binary => downcast_and_count_ones!(input_array, BinaryArray),
+        BinaryView => downcast_and_count_ones!(input_array, BinaryViewArray),
+        LargeBinary => downcast_and_count_ones!(input_array, LargeBinaryArray),
+        FixedSizeBinary(_size) => {
+            downcast_and_count_ones!(input_array, FixedSizeBinaryArray)
+        }
+        Dictionary(k, v) if v.as_ref() == &Binary => match k.as_ref() {
+            DataType::Int8 => downcast_dict_and_count_ones!(input_array, Int8Type),
+            DataType::Int16 => downcast_dict_and_count_ones!(input_array, Int16Type),
+            DataType::Int32 => downcast_dict_and_count_ones!(input_array, Int32Type),
+            DataType::Int64 => downcast_dict_and_count_ones!(input_array, Int64Type),
+            data_type => {
+                internal_err!(
+                    "bitmap_count does not support Dictionary({data_type}, Binary)"
+                )
+            }
+        },
+        data_type => {
+            internal_err!("bitmap_count does not support {data_type}")
+        }
+    };
+
+    Ok(Arc::new(res?))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::function::bitmap::bitmap_count::BitmapCount;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Int64Array};
+    use arrow::datatypes::DataType::Int64;
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::ColumnarValue::Scalar;
+    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+    use std::sync::Arc;
+
+    macro_rules! test_bitmap_count_binary_invoke {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Binary($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::LargeBinary($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::BinaryView($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::FixedSizeBinary(
+                    $INPUT.map(|a| a.len()).unwrap_or(0) as i32,
+                    $INPUT
+                ))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+        };
+    }
+
+    #[test]
+    fn test_bitmap_count_invoke() -> Result<()> {
+        test_bitmap_count_binary_invoke!(None::<Vec<u8>>, Ok(None));
+        test_bitmap_count_binary_invoke!(Some(vec![0x0Au8]), Ok(Some(2)));
+        test_bitmap_count_binary_invoke!(Some(vec![0xFFu8, 0xFFu8]), Ok(Some(16)));
+        test_bitmap_count_binary_invoke!(
+            Some(vec![0x0Au8, 0xB0u8, 0xCDu8]),
+            Ok(Some(10))
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_dictionary_encoded_bitmap_count_invoke() -> Result<()> {
+        let dict = Scalar(ScalarValue::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(ScalarValue::Binary(Some(vec![0xFFu8, 0xFFu8]))),
+        ));
+
+        let arg_fields = vec![
+            Field::new(
+                "a",
+                DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Binary),
+                ),
+                true,
+            )
+            .into(),
+        ];
+        let args = ScalarFunctionArgs {
+            args: vec![dict.clone()],
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", Int64, true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        let udf = BitmapCount::new();
+        let actual = udf.invoke_with_args(args)?;
+        let expect = Scalar(ScalarValue::Int64(Some(16)));
+        assert_eq!(*actual.into_array(1)?, *expect.into_array(1)?);
+        Ok(())
+    }
+
+    #[test]
+    fn test_bitmap_count_nullability() -> Result<()> {
+        use datafusion_expr::ReturnFieldArgs;
+
+        let bitmap_count = BitmapCount::new();
+
+        // Test with non-nullable binary field
+        let non_nullable_field = Arc::new(Field::new("bin", DataType::Binary, false));
+
+        let result = bitmap_count.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&non_nullable_field)],
+            scalar_arguments: &[None],
+        })?;
+
+        // The result should not be nullable (same as input)
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &Int64);
+
+        // Test with nullable binary field
+        let nullable_field = Arc::new(Field::new("bin", DataType::Binary, true));
+
+        let result = bitmap_count.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&nullable_field)],
+            scalar_arguments: &[None],
+        })?;
+
+        // The result should be nullable (same as input)
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &Int64);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/bitmap/mod.rs b/datafusion/spark/src/function/bitmap/mod.rs
new file mode 100644
index 0000000000000..4992992aeae8b
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/mod.rs
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod bitmap_bit_position;
+pub mod bitmap_bucket_number;
+pub mod bitmap_count;
+
+use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
+use std::sync::Arc;
+
+make_udf_function!(bitmap_count::BitmapCount, bitmap_count);
+make_udf_function!(bitmap_bit_position::BitmapBitPosition, bitmap_bit_position);
+make_udf_function!(
+    bitmap_bucket_number::BitmapBucketNumber,
+    bitmap_bucket_number
+);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        bitmap_count,
+        "Returns the number of set bits in the input bitmap.",
+        arg
+    ));
+    export_functions!((
+        bitmap_bit_position,
+        "Returns the bit position for the given input child expression.",
+        arg
+    ));
+    export_functions!((
+        bitmap_bucket_number,
+        "Returns the bucket number for the given input child expression.",
+        arg
+    ));
+}
+
+pub fn functions() -> Vec<Arc<ScalarUDF>> {
+    vec![
+        bitmap_count(),
+        bitmap_bit_position(),
+        bitmap_bucket_number(),
+    ]
+}
diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs
new file mode 100644
index 0000000000000..00170293dc391
--- /dev/null
+++ b/datafusion/spark/src/function/bitwise/bit_count.rs
@@ -0,0 +1,385 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray, Int32Array};
+use arrow::datatypes::{
+    DataType, FieldRef, Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type,
+    UInt32Type, UInt64Type,
+};
+use datafusion_common::cast::as_boolean_array;
+use datafusion_common::{Result, internal_err, plan_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkBitCount {
+    signature: Signature,
+}
+
+impl Default for SparkBitCount {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkBitCount {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Boolean]),
+                    TypeSignature::Exact(vec![DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Int32]),
+                    TypeSignature::Exact(vec![DataType::Int64]),
+                    TypeSignature::Exact(vec![DataType::UInt8]),
+                    TypeSignature::Exact(vec![DataType::UInt16]),
+                    TypeSignature::Exact(vec![DataType::UInt32]),
+                    TypeSignature::Exact(vec![DataType::UInt64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkBitCount {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bit_count"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        use arrow::datatypes::Field;
+        // bit_count returns Int32 with the same nullability as the input
+        Ok(Arc::new(Field::new(
+            args.arg_fields[0].name(),
+            DataType::Int32,
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.len() != 1 {
+            return plan_err!("bit_count expects exactly 1 argument");
+        }
+
+        make_scalar_function(spark_bit_count, vec![])(&args.args)
+    }
+}
+
+fn spark_bit_count(value_array: &[ArrayRef]) -> Result<ArrayRef> {
+    let value_array = value_array[0].as_ref();
+    match value_array.data_type() {
+        DataType::Boolean => {
+            let result: Int32Array = as_boolean_array(value_array)?
+                .iter()
+                .map(|x| x.map(|y| y as i32))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        DataType::Int8 => {
+            let result: Int32Array = value_array
+                .as_primitive::<Int8Type>()
+                .unary(|v| (v as i64).count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::Int16 => {
+            let result: Int32Array = value_array
+                .as_primitive::<Int16Type>()
+                .unary(|v| (v as i64).count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::Int32 => {
+            let result: Int32Array = value_array
+                .as_primitive::<Int32Type>()
+                .unary(|v| (v as i64).count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::Int64 => {
+            let result: Int32Array = value_array
+                .as_primitive::<Int64Type>()
+                .unary(|v| v.count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::UInt8 => {
+            let result: Int32Array = value_array
+                .as_primitive::<UInt8Type>()
+                .unary(|v| v.count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::UInt16 => {
+            let result: Int32Array = value_array
+                .as_primitive::<UInt16Type>()
+                .unary(|v| v.count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::UInt32 => {
+            let result: Int32Array = value_array
+                .as_primitive::<UInt32Type>()
+                .unary(|v| v.count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        DataType::UInt64 => {
+            let result: Int32Array = value_array
+                .as_primitive::<UInt64Type>()
+                .unary(|v| v.count_ones() as i32);
+            Ok(Arc::new(result))
+        }
+        _ => {
+            plan_err!(
+                "bit_count function does not support data type: {}",
+                value_array.data_type()
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{
+        Array, BooleanArray, Int8Array, Int16Array, Int32Array, Int64Array, UInt8Array,
+        UInt16Array, UInt32Array, UInt64Array,
+    };
+    use arrow::datatypes::{Field, Int32Type};
+
+    #[test]
+    fn test_bit_count_basic() {
+        // Test bit_count(0) - no bits set
+        let result = spark_bit_count(&[Arc::new(Int32Array::from(vec![0]))]).unwrap();
+
+        assert_eq!(result.as_primitive::<Int32Type>().value(0), 0);
+
+        // Test bit_count(1) - 1 bit set
+        let result = spark_bit_count(&[Arc::new(Int32Array::from(vec![1]))]).unwrap();
+
+        assert_eq!(result.as_primitive::<Int32Type>().value(0), 1);
+
+        // Test bit_count(7) - 7 = 111 in binary, 3 bits set
+        let result = spark_bit_count(&[Arc::new(Int32Array::from(vec![7]))]).unwrap();
+
+        assert_eq!(result.as_primitive::<Int32Type>().value(0), 3);
+
+        // Test bit_count(15) - 15 = 1111 in binary, 4 bits set
+        let result = spark_bit_count(&[Arc::new(Int32Array::from(vec![15]))]).unwrap();
+
+        assert_eq!(result.as_primitive::<Int32Type>().value(0), 4);
+    }
+
+    #[test]
+    fn test_bit_count_int8() {
+        // Test bit_count on Int8Array
+        let result =
+            spark_bit_count(&[Arc::new(Int8Array::from(vec![0i8, 1, 3, 7, 15, -1]))])
+                .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0);
+        assert_eq!(arr.value(1), 1);
+        assert_eq!(arr.value(2), 2);
+        assert_eq!(arr.value(3), 3);
+        assert_eq!(arr.value(4), 4);
+        assert_eq!(arr.value(5), 64);
+    }
+
+    #[test]
+    fn test_bit_count_boolean() {
+        // Test bit_count on BooleanArray
+        let result =
+            spark_bit_count(&[Arc::new(BooleanArray::from(vec![true, false]))]).unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 1);
+        assert_eq!(arr.value(1), 0);
+    }
+
+    #[test]
+    fn test_bit_count_int16() {
+        // Test bit_count on Int16Array
+        let result =
+            spark_bit_count(&[Arc::new(Int16Array::from(vec![0i16, 1, 255, 1023, -1]))])
+                .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0);
+        assert_eq!(arr.value(1), 1);
+        assert_eq!(arr.value(2), 8);
+        assert_eq!(arr.value(3), 10);
+        assert_eq!(arr.value(4), 64);
+    }
+
+    #[test]
+    fn test_bit_count_int32() {
+        // Test bit_count on Int32Array
+        let result =
+            spark_bit_count(&[Arc::new(Int32Array::from(vec![0i32, 1, 255, 1023, -1]))])
+                .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0); // 0b00000000000000000000000000000000 = 0
+        assert_eq!(arr.value(1), 1); // 0b00000000000000000000000000000001 = 1
+        assert_eq!(arr.value(2), 8); // 0b00000000000000000000000011111111 = 8
+        assert_eq!(arr.value(3), 10); // 0b00000000000000000000001111111111 = 10
+        assert_eq!(arr.value(4), 64); // -1 in two's complement = all 32 bits set
+    }
+
+    #[test]
+    fn test_bit_count_int64() {
+        // Test bit_count on Int64Array
+        let result =
+            spark_bit_count(&[Arc::new(Int64Array::from(vec![0i64, 1, 255, 1023, -1]))])
+                .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0); // 0b0000000000000000000000000000000000000000000000000000000000000000 = 0
+        assert_eq!(arr.value(1), 1); // 0b0000000000000000000000000000000000000000000000000000000000000001 = 1
+        assert_eq!(arr.value(2), 8); // 0b0000000000000000000000000000000000000000000000000000000011111111 = 8
+        assert_eq!(arr.value(3), 10); // 0b0000000000000000000000000000000000000000000000000000001111111111 = 10
+        assert_eq!(arr.value(4), 64); // -1 in two's complement = all 64 bits set
+    }
+
+    #[test]
+    fn test_bit_count_uint8() {
+        // Test bit_count on UInt8Array
+        let result =
+            spark_bit_count(&[Arc::new(UInt8Array::from(vec![0u8, 1, 255]))]).unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0); // 0b00000000 = 0
+        assert_eq!(arr.value(1), 1); // 0b00000001 = 1
+        assert_eq!(arr.value(2), 8); // 0b11111111 = 8
+    }
+
+    #[test]
+    fn test_bit_count_uint16() {
+        // Test bit_count on UInt16Array
+        let result =
+            spark_bit_count(&[Arc::new(UInt16Array::from(vec![0u16, 1, 255, 65535]))])
+                .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0); // 0b0000000000000000 = 0
+        assert_eq!(arr.value(1), 1); // 0b0000000000000001 = 1
+        assert_eq!(arr.value(2), 8); // 0b0000000011111111 = 8
+        assert_eq!(arr.value(3), 16); // 0b1111111111111111 = 16
+    }
+
+    #[test]
+    fn test_bit_count_uint32() {
+        // Test bit_count on UInt32Array
+        let result = spark_bit_count(&[Arc::new(UInt32Array::from(vec![
+            0u32, 1, 255, 4294967295,
+        ]))])
+        .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 0); // 0b00000000000000000000000000000000 = 0
+        assert_eq!(arr.value(1), 1); // 0b00000000000000000000000000000001 = 1
+        assert_eq!(arr.value(2), 8); // 0b00000000000000000000000011111111 = 8
+        assert_eq!(arr.value(3), 32); // 0b11111111111111111111111111111111 = 32
+    }
+
+    #[test]
+    fn test_bit_count_uint64() {
+        // Test bit_count on UInt64Array
+        let result = spark_bit_count(&[Arc::new(UInt64Array::from(vec![
+            0u64,
+            1,
+            255,
+            256,
+            u64::MAX,
+        ]))])
+        .unwrap();
+
+        let arr = result.as_primitive::<Int32Type>();
+        // 0b0 = 0
+        assert_eq!(arr.value(0), 0);
+        // 0b1 = 1
+        assert_eq!(arr.value(1), 1);
+        // 0b11111111 = 8
+        assert_eq!(arr.value(2), 8);
+        // 0b100000000 = 1
+        assert_eq!(arr.value(3), 1);
+        // u64::MAX = all 64 bits set
+        assert_eq!(arr.value(4), 64);
+    }
+
+    #[test]
+    fn test_bit_count_nulls() {
+        // Test bit_count with nulls
+        let arr = Int32Array::from(vec![Some(3), None, Some(7)]);
+        let result = spark_bit_count(&[Arc::new(arr)]).unwrap();
+        let arr = result.as_primitive::<Int32Type>();
+        assert_eq!(arr.value(0), 2); // 0b11
+        assert!(arr.is_null(1));
+        assert_eq!(arr.value(2), 3); // 0b111
+    }
+
+    #[test]
+    fn test_bit_count_nullability() -> Result<()> {
+        use datafusion_expr::ReturnFieldArgs;
+
+        let bit_count = SparkBitCount::new();
+
+        // Test with non-nullable Int32 field
+        let non_nullable_field = Arc::new(Field::new("num", DataType::Int32, false));
+
+        let result = bit_count.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&non_nullable_field)],
+            scalar_arguments: &[None],
+        })?;
+
+        // The result should not be nullable (same as input)
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int32);
+
+        // Test with nullable Int32 field
+        let nullable_field = Arc::new(Field::new("num", DataType::Int32, true));
+
+        let result = bit_count.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&nullable_field)],
+            scalar_arguments: &[None],
+        })?;
+
+        // The result should be nullable (same as input)
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int32);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/bitwise/bit_get.rs b/datafusion/spark/src/function/bitwise/bit_get.rs
new file mode 100644
index 0000000000000..3343c6c61de0b
--- /dev/null
+++ b/datafusion/spark/src/function/bitwise/bit_get.rs
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::mem::size_of;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, Int8Array, Int32Array, PrimitiveArray,
+    downcast_integer_array,
+};
+use arrow::compute::try_binary;
+use arrow::datatypes::{ArrowNativeType, DataType, Field, FieldRef, Int8Type, Int32Type};
+use datafusion_common::types::{NativeType, logical_int32};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkBitGet {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkBitGet {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkBitGet {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_exact(TypeSignatureClass::Integer),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_int32()),
+                        vec![TypeSignatureClass::Integer],
+                        NativeType::Int32,
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec!["getbit".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkBitGet {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bit_get"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        // Spark derives nullability for BinaryExpression from its children
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), DataType::Int8, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_bit_get, vec![])(&args.args)
+    }
+}
+
+fn spark_bit_get_inner<T: ArrowPrimitiveType>(
+    value: &PrimitiveArray<T>,
+    pos: &Int32Array,
+) -> Result<Int8Array> {
+    let bit_length = (size_of::<T::Native>() * 8) as i32;
+
+    let result: PrimitiveArray<Int8Type> = try_binary(value, pos, |value, pos| {
+        if pos < 0 || pos >= bit_length {
+            return Err(arrow::error::ArrowError::ComputeError(format!(
+                "bit_get: position {pos} is out of bounds. Expected pos < {bit_length} and pos >= 0"
+            )));
+        }
+        Ok(((value.to_i64().unwrap() >> pos) & 1) as i8)
+    })?;
+    Ok(result)
+}
+
+fn spark_bit_get(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [value, position] = take_function_args("bit_get", args)?;
+    let pos_arg = position.as_primitive::<Int32Type>();
+    let ret = downcast_integer_array!(
+        value => spark_bit_get_inner(value, pos_arg),
+        DataType::Null => Ok(Int8Array::new_null(value.len())),
+        d => internal_err!("Unsupported datatype for bit_get: {d}"),
+    )?;
+    Ok(Arc::new(ret))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+
+    #[test]
+    fn test_bit_get_nullability_non_nullable_inputs() {
+        let func = SparkBitGet::new();
+        let value_field = Arc::new(Field::new("value", DataType::Int32, false));
+        let pos_field = Arc::new(Field::new("pos", DataType::Int32, false));
+
+        let out_field = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[value_field, pos_field],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert_eq!(out_field.data_type(), &DataType::Int8);
+        assert!(!out_field.is_nullable());
+    }
+
+    #[test]
+    fn test_bit_get_nullability_nullable_inputs() {
+        let func = SparkBitGet::new();
+        let value_field = Arc::new(Field::new("value", DataType::Int32, true));
+        let pos_field = Arc::new(Field::new("pos", DataType::Int32, false));
+
+        let out_field = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[value_field, pos_field],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert_eq!(out_field.data_type(), &DataType::Int8);
+        assert!(out_field.is_nullable());
+    }
+}
diff --git a/datafusion/spark/src/function/bitwise/bit_shift.rs b/datafusion/spark/src/function/bitwise/bit_shift.rs
new file mode 100644
index 0000000000000..fc3df28e968a8
--- /dev/null
+++ b/datafusion/spark/src/function/bitwise/bit_shift.rs
@@ -0,0 +1,350 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, Int32Array, PrimitiveArray};
+use arrow::compute;
+use arrow::datatypes::{
+    ArrowNativeType, DataType, Field, FieldRef, Int32Type, Int64Type, UInt32Type,
+    UInt64Type,
+};
+use datafusion_common::types::{
+    NativeType, logical_int8, logical_int16, logical_int32, logical_int64, logical_uint8,
+    logical_uint16, logical_uint32, logical_uint64,
+};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// Bitwise left shift on elements in `value` by corresponding `shift` amount.
+/// The shift amount is normalized to the bit width of the type, matching Spark/Java
+/// semantics for negative and large shifts.
+fn shift_left<T>(
+    value: &PrimitiveArray<T>,
+    shift: &Int32Array,
+) -> Result<PrimitiveArray<T>>
+where
+    T: ArrowPrimitiveType,
+    T::Native: std::ops::Shl<i32, Output = T::Native>,
+{
+    let bit_num = (T::Native::get_byte_width() * 8) as i32;
+    let result = compute::binary::<_, Int32Type, _, _>(
+        value,
+        shift,
+        |value: T::Native, shift: i32| {
+            let shift = ((shift % bit_num) + bit_num) % bit_num;
+            value << shift
+        },
+    )?;
+    Ok(result)
+}
+
+/// Bitwise right shift on elements in `value` by corresponding `shift` amount.
+/// The shift amount is normalized to the bit width of the type, matching Spark/Java
+/// semantics for negative and large shifts.
+fn shift_right<T>(
+    value: &PrimitiveArray<T>,
+    shift: &Int32Array,
+) -> Result<PrimitiveArray<T>>
+where
+    T: ArrowPrimitiveType,
+    T::Native: std::ops::Shr<i32, Output = T::Native>,
+{
+    let bit_num = (T::Native::get_byte_width() * 8) as i32;
+    let result = compute::binary::<_, Int32Type, _, _>(
+        value,
+        shift,
+        |value: T::Native, shift: i32| {
+            let shift = ((shift % bit_num) + bit_num) % bit_num;
+            value >> shift
+        },
+    )?;
+    Ok(result)
+}
+
+/// Trait for performing an unsigned right shift (logical shift right).
+/// This is used to mimic Java's `>>>` operator, which does not exist in Rust.
+/// For unsigned types, this is just the normal right shift.
+/// For signed types, this casts to the unsigned type, shifts, then casts back.
+trait UShr {
+    fn ushr(self, rhs: i32) -> Self;
+}
+
+impl UShr for u32 {
+    fn ushr(self, rhs: i32) -> Self {
+        self >> rhs
+    }
+}
+
+impl UShr for u64 {
+    fn ushr(self, rhs: i32) -> Self {
+        self >> rhs
+    }
+}
+
+impl UShr for i32 {
+    fn ushr(self, rhs: i32) -> Self {
+        ((self as u32) >> rhs) as i32
+    }
+}
+
+impl UShr for i64 {
+    fn ushr(self, rhs: i32) -> Self {
+        ((self as u64) >> rhs) as i64
+    }
+}
+
+/// Bitwise unsigned right shift on elements in `value` by corresponding `shift`
+/// amount. The shift amount is normalized to the bit width of the type, matching
+/// Spark/Java semantics for negative and large shifts.
+fn shift_right_unsigned<T>(
+    value: &PrimitiveArray<T>,
+    shift: &Int32Array,
+) -> Result<PrimitiveArray<T>>
+where
+    T: ArrowPrimitiveType,
+    T::Native: UShr,
+{
+    let bit_num = (T::Native::get_byte_width() * 8) as i32;
+    let result = compute::binary::<_, Int32Type, _, _>(
+        value,
+        shift,
+        |value: T::Native, shift: i32| {
+            let shift = ((shift % bit_num) + bit_num) % bit_num;
+            value.ushr(shift)
+        },
+    )?;
+    Ok(result)
+}
+
+fn shift_inner(
+    arrays: &[ArrayRef],
+    name: &str,
+    bit_shift_type: BitShiftType,
+) -> Result<ArrayRef> {
+    let [value_array, shift_array] = take_function_args(name, arrays)?;
+    let shift_array = shift_array.as_primitive::<Int32Type>();
+
+    fn shift<T>(
+        value: &PrimitiveArray<T>,
+        shift: &Int32Array,
+        bit_shift_type: BitShiftType,
+    ) -> Result<PrimitiveArray<T>>
+    where
+        T: ArrowPrimitiveType,
+        T::Native: std::ops::Shl<i32, Output = T::Native>
+            + std::ops::Shr<i32, Output = T::Native>
+            + UShr,
+    {
+        match bit_shift_type {
+            BitShiftType::Left => shift_left(value, shift),
+            BitShiftType::Right => shift_right(value, shift),
+            BitShiftType::RightUnsigned => shift_right_unsigned(value, shift),
+        }
+    }
+
+    match value_array.data_type() {
+        DataType::Int32 => {
+            let value_array = value_array.as_primitive::<Int32Type>();
+            Ok(Arc::new(shift(value_array, shift_array, bit_shift_type)?))
+        }
+        DataType::Int64 => {
+            let value_array = value_array.as_primitive::<Int64Type>();
+            Ok(Arc::new(shift(value_array, shift_array, bit_shift_type)?))
+        }
+        DataType::UInt32 => {
+            let value_array = value_array.as_primitive::<UInt32Type>();
+            Ok(Arc::new(shift(value_array, shift_array, bit_shift_type)?))
+        }
+        DataType::UInt64 => {
+            let value_array = value_array.as_primitive::<UInt64Type>();
+            Ok(Arc::new(shift(value_array, shift_array, bit_shift_type)?))
+        }
+        dt => {
+            internal_err!("{name} function does not support data type: {dt}")
+        }
+    }
+}
+
+#[derive(Debug, Hash, Copy, Clone, Eq, PartialEq)]
+enum BitShiftType {
+    Left,
+    Right,
+    RightUnsigned,
+}
+
+#[derive(Debug, Hash, Eq, PartialEq)]
+pub struct SparkBitShift {
+    signature: Signature,
+    name: &'static str,
+    bit_shift_type: BitShiftType,
+}
+
+impl SparkBitShift {
+    fn new(name: &'static str, bit_shift_type: BitShiftType) -> Self {
+        let shift_amount = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int32,
+        );
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Upcast small ints to 32bit
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_implicit(
+                            TypeSignatureClass::Native(logical_int32()),
+                            vec![
+                                TypeSignatureClass::Native(logical_int8()),
+                                TypeSignatureClass::Native(logical_int16()),
+                            ],
+                            NativeType::Int32,
+                        ),
+                        shift_amount.clone(),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_implicit(
+                            TypeSignatureClass::Native(logical_uint32()),
+                            vec![
+                                TypeSignatureClass::Native(logical_uint8()),
+                                TypeSignatureClass::Native(logical_uint16()),
+                            ],
+                            NativeType::UInt32,
+                        ),
+                        shift_amount.clone(),
+                    ]),
+                    // Otherwise accept direct 64 bit integers
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_int64())),
+                        shift_amount.clone(),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_uint64())),
+                        shift_amount.clone(),
+                    ]),
+                ],
+                Volatility::Immutable,
+            ),
+            name,
+            bit_shift_type,
+        }
+    }
+
+    pub fn left() -> Self {
+        Self::new("shiftleft", BitShiftType::Left)
+    }
+
+    pub fn right() -> Self {
+        Self::new("shiftright", BitShiftType::Right)
+    }
+
+    pub fn right_unsigned() -> Self {
+        Self::new("shiftrightunsigned", BitShiftType::RightUnsigned)
+    }
+}
+
+impl ScalarUDFImpl for SparkBitShift {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        let data_type = args.arg_fields[0].data_type().clone();
+        Ok(Arc::new(Field::new(self.name(), data_type, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let inner = |arr: &[ArrayRef]| -> Result<ArrayRef> {
+            shift_inner(arr, self.name(), self.bit_shift_type)
+        };
+        make_scalar_function(inner, vec![])(&args.args)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_bit_shift_nullability() -> Result<()> {
+        let func = SparkBitShift::left();
+
+        let non_nullable_value: FieldRef =
+            Arc::new(Field::new("value", DataType::Int64, false));
+        let non_nullable_shift: FieldRef =
+            Arc::new(Field::new("shift", DataType::Int32, false));
+
+        let out = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[
+                Arc::clone(&non_nullable_value),
+                Arc::clone(&non_nullable_shift),
+            ],
+            scalar_arguments: &[None, None],
+        })?;
+
+        assert_eq!(out.data_type(), non_nullable_value.data_type());
+        assert!(
+            !out.is_nullable(),
+            "shift result should be non-nullable when both inputs are non-nullable"
+        );
+
+        let nullable_value: FieldRef =
+            Arc::new(Field::new("value", DataType::Int64, true));
+        let out_nullable_value = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&nullable_value), Arc::clone(&non_nullable_shift)],
+            scalar_arguments: &[None, None],
+        })?;
+        assert!(
+            out_nullable_value.is_nullable(),
+            "shift result should be nullable when value is nullable"
+        );
+
+        let nullable_shift: FieldRef =
+            Arc::new(Field::new("shift", DataType::Int32, true));
+        let out_nullable_shift = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[non_nullable_value, nullable_shift],
+            scalar_arguments: &[None, None],
+        })?;
+        assert!(
+            out_nullable_shift.is_nullable(),
+            "shift result should be nullable when shift is nullable"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/bitwise/bitwise_not.rs b/datafusion/spark/src/function/bitwise/bitwise_not.rs
new file mode 100644
index 0000000000000..e7285d4804950
--- /dev/null
+++ b/datafusion/spark/src/function/bitwise/bitwise_not.rs
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::*;
+use arrow::compute::kernels::bitwise;
+use arrow::datatypes::{
+    DataType, Field, FieldRef, Int8Type, Int16Type, Int32Type, Int64Type,
+};
+use datafusion_common::{Result, internal_err, plan_err};
+use datafusion_expr::{ColumnarValue, TypeSignature, Volatility};
+use datafusion_expr::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature};
+use datafusion_functions::utils::make_scalar_function;
+use std::{any::Any, sync::Arc};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkBitwiseNot {
+    signature: Signature,
+}
+
+impl Default for SparkBitwiseNot {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkBitwiseNot {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Int32]),
+                    TypeSignature::Exact(vec![DataType::Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkBitwiseNot {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bitwise_not"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!(
+            "SparkBitwiseNot: return_type() is not used; return_field_from_args() is implemented"
+        )
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[0].data_type().clone(),
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.len() != 1 {
+            return plan_err!("bitwise_not expects exactly 1 argument");
+        }
+        make_scalar_function(spark_bitwise_not, vec![])(&args.args)
+    }
+}
+
+pub fn spark_bitwise_not(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let array = args[0].as_ref();
+    match array.data_type() {
+        DataType::Int8 => {
+            let result: Int8Array =
+                bitwise::bitwise_not(array.as_primitive::<Int8Type>())?;
+            Ok(Arc::new(result))
+        }
+        DataType::Int16 => {
+            let result: Int16Array =
+                bitwise::bitwise_not(array.as_primitive::<Int16Type>())?;
+            Ok(Arc::new(result))
+        }
+        DataType::Int32 => {
+            let result: Int32Array =
+                bitwise::bitwise_not(array.as_primitive::<Int32Type>())?;
+            Ok(Arc::new(result))
+        }
+        DataType::Int64 => {
+            let result: Int64Array =
+                bitwise::bitwise_not(array.as_primitive::<Int64Type>())?;
+            Ok(Arc::new(result))
+        }
+        _ => {
+            plan_err!(
+                "bitwise_not function does not support data type: {}",
+                array.data_type()
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field};
+    use std::sync::Arc;
+
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_bitwise_not_nullability() {
+        let bitwise_not = SparkBitwiseNot::new();
+
+        // --- non-nullable Int32 input ---
+        let non_nullable_i32 = Arc::new(Field::new("c", DataType::Int32, false));
+        let out_non_null = bitwise_not
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_i32)],
+                // single-argument function -> one scalar_argument slot (None)
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // result should be non-nullable and the same DataType as input
+        assert!(!out_non_null.is_nullable());
+        assert_eq!(out_non_null.data_type(), &DataType::Int32);
+
+        // --- nullable Int32 input ---
+        let nullable_i32 = Arc::new(Field::new("c", DataType::Int32, true));
+        let out_nullable = bitwise_not
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_i32)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // result should be nullable and the same DataType as input
+        assert!(out_nullable.is_nullable());
+        assert_eq!(out_nullable.data_type(), &DataType::Int32);
+
+        // --- also test another integer type (Int64) for completeness ---
+        let non_nullable_i64 = Arc::new(Field::new("c", DataType::Int64, false));
+        let out_i64 = bitwise_not
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_i64)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        assert!(!out_i64.is_nullable());
+        assert_eq!(out_i64.data_type(), &DataType::Int64);
+
+        let nullable_i64 = Arc::new(Field::new("c", DataType::Int64, true));
+        let out_i64_null = bitwise_not
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_i64)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        assert!(out_i64_null.is_nullable());
+        assert_eq!(out_i64_null.data_type(), &DataType::Int64);
+    }
+}
diff --git a/datafusion/spark/src/function/bitwise/mod.rs b/datafusion/spark/src/function/bitwise/mod.rs
index a87df9a2c87a0..769ecf5c2fef5 100644
--- a/datafusion/spark/src/function/bitwise/mod.rs
+++ b/datafusion/spark/src/function/bitwise/mod.rs
@@ -15,11 +15,72 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod bit_count;
+pub mod bit_get;
+pub mod bit_shift;
+pub mod bitwise_not;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(
+    bit_shift::SparkBitShift,
+    shiftleft,
+    bit_shift::SparkBitShift::left
+);
+make_udf_function!(
+    bit_shift::SparkBitShift,
+    shiftright,
+    bit_shift::SparkBitShift::right
+);
+make_udf_function!(
+    bit_shift::SparkBitShift,
+    shiftrightunsigned,
+    bit_shift::SparkBitShift::right_unsigned
+);
+make_udf_function!(bit_get::SparkBitGet, bit_get);
+make_udf_function!(bit_count::SparkBitCount, bit_count);
+make_udf_function!(bitwise_not::SparkBitwiseNot, bitwise_not);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((bit_get, "Returns the value of the bit (0 or 1) at the specified position.", col pos));
+    export_functions!((
+        bit_count,
+        "Returns the number of bits set in the binary representation of the argument.",
+        col
+    ));
+    export_functions!((
+        bitwise_not,
+        "Returns the result of a bitwise negation operation on the argument, where each bit in the binary representation is flipped, following two's complement arithmetic for signed integers.",
+        col
+    ));
+    export_functions!((
+        shiftleft,
+        "Shifts the bits of the first argument left by the number of positions specified by the second argument. If the shift amount is negative or greater than or equal to the bit width, it is normalized to the bit width (i.e., pmod(shift, bit_width)).",
+        value shift
+    ));
+    export_functions!((
+        shiftright,
+        "Shifts the bits of the first argument right by the number of positions specified by the second argument (arithmetic/signed shift). If the shift amount is negative or greater than or equal to the bit width, it is normalized to the bit width (i.e., pmod(shift, bit_width)).",
+        value shift
+    ));
+    export_functions!((
+        shiftrightunsigned,
+        "Shifts the bits of the first argument right by the number of positions specified by the second argument (logical/unsigned shift). If the shift amount is negative or greater than or equal to the bit width, it is normalized to the bit width (i.e., pmod(shift, bit_width)).",
+        value shift
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![
+        bit_get(),
+        bit_count(),
+        bitwise_not(),
+        shiftleft(),
+        shiftright(),
+        shiftrightunsigned(),
+    ]
 }
diff --git a/datafusion/spark/src/function/collection/mod.rs b/datafusion/spark/src/function/collection/mod.rs
index a87df9a2c87a0..6871e3aba6469 100644
--- a/datafusion/spark/src/function/collection/mod.rs
+++ b/datafusion/spark/src/function/collection/mod.rs
@@ -15,11 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod size;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(size::SparkSize, size);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((size, "Return the size of an array or map.", arg));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![size()]
 }
diff --git a/datafusion/spark/src/function/collection/size.rs b/datafusion/spark/src/function/collection/size.rs
new file mode 100644
index 0000000000000..05b8ba315675c
--- /dev/null
+++ b/datafusion/spark/src/function/collection/size.rs
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Int32Array};
+use arrow::compute::kernels::length::length as arrow_length;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::{Result, plan_err};
+use datafusion_expr::{
+    ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, ReturnFieldArgs,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `size` function.
+///
+/// Returns the number of elements in an array or the number of key-value pairs in a map.
+/// Returns -1 for null input (Spark behavior).
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSize {
+    signature: Signature,
+}
+
+impl Default for SparkSize {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSize {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // Array Type
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
+                        arguments: vec![ArrayFunctionArgument::Array],
+                        array_coercion: None,
+                    }),
+                    // Map Type
+                    TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSize {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "size"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int32)
+    }
+
+    fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result<FieldRef> {
+        // nullable=false for legacy behavior (NULL -> -1); set to input nullability for null-on-null
+        Ok(Arc::new(Field::new(self.name(), DataType::Int32, false)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_size_inner, vec![])(&args.args)
+    }
+}
+
+fn spark_size_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let array = &args[0];
+
+    match array.data_type() {
+        DataType::List(_) => {
+            if array.null_count() == 0 {
+                Ok(arrow_length(array)?)
+            } else {
+                let list_array = array.as_list::<i32>();
+                let lengths: Vec<i32> = list_array
+                    .offsets()
+                    .lengths()
+                    .enumerate()
+                    .map(|(i, len)| if array.is_null(i) { -1 } else { len as i32 })
+                    .collect();
+                Ok(Arc::new(Int32Array::from(lengths)))
+            }
+        }
+        DataType::FixedSizeList(_, size) => {
+            if array.null_count() == 0 {
+                Ok(arrow_length(array)?)
+            } else {
+                let length: Vec<i32> = (0..array.len())
+                    .map(|i| if array.is_null(i) { -1 } else { *size })
+                    .collect();
+                Ok(Arc::new(Int32Array::from(length)))
+            }
+        }
+        DataType::LargeList(_) => {
+            // Arrow length kernel returns Int64 for LargeList
+            let list_array = array.as_list::<i64>();
+            if array.null_count() == 0 {
+                let lengths: Vec<i32> = list_array
+                    .offsets()
+                    .lengths()
+                    .map(|len| len as i32)
+                    .collect();
+                Ok(Arc::new(Int32Array::from(lengths)))
+            } else {
+                let lengths: Vec<i32> = list_array
+                    .offsets()
+                    .lengths()
+                    .enumerate()
+                    .map(|(i, len)| if array.is_null(i) { -1 } else { len as i32 })
+                    .collect();
+                Ok(Arc::new(Int32Array::from(lengths)))
+            }
+        }
+        DataType::Map(_, _) => {
+            let map_array = array.as_map();
+            let length: Vec<i32> = if array.null_count() == 0 {
+                map_array
+                    .offsets()
+                    .lengths()
+                    .map(|len| len as i32)
+                    .collect()
+            } else {
+                map_array
+                    .offsets()
+                    .lengths()
+                    .enumerate()
+                    .map(|(i, len)| if array.is_null(i) { -1 } else { len as i32 })
+                    .collect()
+            };
+            Ok(Arc::new(Int32Array::from(length)))
+        }
+        DataType::Null => Ok(Arc::new(Int32Array::from(vec![-1; array.len()]))),
+        dt => {
+            plan_err!("size function does not support type: {}", dt)
+        }
+    }
+}
diff --git a/datafusion/spark/src/function/conditional/if.rs b/datafusion/spark/src/function/conditional/if.rs
new file mode 100644
index 0000000000000..e423f8264ecca
--- /dev/null
+++ b/datafusion/spark/src/function/conditional/if.rs
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::DataType;
+use datafusion_common::{Result, internal_err, plan_err};
+use datafusion_expr::{
+    ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+    binary::try_type_union_resolution, simplify::ExprSimplifyResult, when,
+};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkIf {
+    signature: Signature,
+}
+
+impl Default for SparkIf {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkIf {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkIf {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "if"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.len() != 3 {
+            return plan_err!(
+                "Function 'if' expects 3 arguments but received {}",
+                arg_types.len()
+            );
+        }
+
+        if arg_types[0] != DataType::Boolean && arg_types[0] != DataType::Null {
+            return plan_err!(
+                "For function 'if' {} is not a boolean or null",
+                arg_types[0]
+            );
+        }
+
+        let target_types = try_type_union_resolution(&arg_types[1..])?;
+        let mut result = vec![DataType::Boolean];
+        result.extend(target_types);
+        Ok(result)
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(arg_types[1].clone())
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("if should have been simplified to case")
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &datafusion_expr::simplify::SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let condition = args[0].clone();
+        let then_expr = args[1].clone();
+        let else_expr = args[2].clone();
+
+        // Convert IF(condition, then_expr, else_expr) to
+        // CASE WHEN condition THEN then_expr ELSE else_expr END
+        let case_expr = when(condition, then_expr).otherwise(else_expr)?;
+
+        Ok(ExprSimplifyResult::Simplified(case_expr))
+    }
+}
diff --git a/datafusion/spark/src/function/conditional/mod.rs b/datafusion/spark/src/function/conditional/mod.rs
index a87df9a2c87a0..4301d7642b41d 100644
--- a/datafusion/spark/src/function/conditional/mod.rs
+++ b/datafusion/spark/src/function/conditional/mod.rs
@@ -16,10 +16,19 @@
 // under the License.
 
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+mod r#if;
+
+make_udf_function!(r#if::SparkIf, r#if);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((r#if, "If arg1 evaluates to true, then returns arg2; otherwise returns arg3", arg1 arg2 arg3));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![r#if()]
 }
diff --git a/datafusion/spark/src/function/datetime/add_months.rs b/datafusion/spark/src/function/datetime/add_months.rs
new file mode 100644
index 0000000000000..fa9f6fa8db945
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/add_months.rs
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::ops::Add;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef, IntervalUnit};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, Volatility,
+};
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#add_months>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkAddMonths {
+    signature: Signature,
+}
+
+impl Default for SparkAddMonths {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkAddMonths {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(
+                vec![DataType::Date32, DataType::Int32],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkAddMonths {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "add_months"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Date32,
+            nullable,
+        )))
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [date_arg, months_arg] = take_function_args("add_months", args)?;
+        let interval = months_arg
+            .cast_to(&DataType::Interval(IntervalUnit::YearMonth), info.schema())?;
+        Ok(ExprSimplifyResult::Simplified(date_arg.add(interval)))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("invoke should not be called on a simplified add_months() function")
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/date_add.rs b/datafusion/spark/src/function/datetime/date_add.rs
new file mode 100644
index 0000000000000..3745f77969f22
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/date_add.rs
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::ArrayRef;
+use arrow::compute;
+use arrow::datatypes::{DataType, Date32Type, Field, FieldRef};
+use datafusion_common::cast::{
+    as_date32_array, as_int8_array, as_int16_array, as_int32_array,
+};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkDateAdd {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkDateAdd {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkDateAdd {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec!["dateadd".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkDateAdd {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "date_add"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("Use return_field_from_args in this case instead.")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Date32,
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_date_add, vec![])(&args.args)
+    }
+}
+
+fn spark_date_add(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [date_arg, days_arg] = take_function_args("date_add", args)?;
+    let date_array = as_date32_array(date_arg)?;
+    let result = match days_arg.data_type() {
+        DataType::Int8 => {
+            let days_array = as_int8_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_add(days as i32),
+            )?
+        }
+        DataType::Int16 => {
+            let days_array = as_int16_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_add(days as i32),
+            )?
+        }
+        DataType::Int32 => {
+            let days_array = as_int32_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_add(days),
+            )?
+        }
+        _ => {
+            return internal_err!(
+                "Spark `date_add` function: argument must be int8, int16, int32, got {:?}",
+                days_arg.data_type()
+            );
+        }
+    };
+    Ok(Arc::new(result))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+
+    #[test]
+    fn test_date_add_non_nullable_inputs() {
+        let func = SparkDateAdd::new();
+        let args = &[
+            Arc::new(Field::new("date", DataType::Date32, false)),
+            Arc::new(Field::new("num", DataType::Int8, false)),
+        ];
+
+        let ret_field = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: args,
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert_eq!(ret_field.data_type(), &DataType::Date32);
+        assert!(!ret_field.is_nullable());
+    }
+
+    #[test]
+    fn test_date_add_nullable_inputs() {
+        let func = SparkDateAdd::new();
+        let args = &[
+            Arc::new(Field::new("date", DataType::Date32, false)),
+            Arc::new(Field::new("num", DataType::Int16, true)),
+        ];
+
+        let ret_field = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: args,
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert_eq!(ret_field.data_type(), &DataType::Date32);
+        assert!(ret_field.is_nullable());
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/date_diff.rs b/datafusion/spark/src/function/datetime/date_diff.rs
new file mode 100644
index 0000000000000..094c35eec56b5
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/date_diff.rs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::{NativeType, logical_date, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ExprSchemable, Operator, ReturnFieldArgs,
+    ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
+    binary_expr,
+};
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#date_diff>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkDateDiff {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkDateDiff {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkDateDiff {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_date()),
+                        vec![
+                            TypeSignatureClass::Native(logical_string()),
+                            TypeSignatureClass::Timestamp,
+                        ],
+                        NativeType::Date,
+                    ),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_date()),
+                        vec![
+                            TypeSignatureClass::Native(logical_string()),
+                            TypeSignatureClass::Timestamp,
+                        ],
+                        NativeType::Date,
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec!["datediff".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkDateDiff {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "date_diff"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable)))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!(
+            "Apache Spark `date_diff` should have been simplified to standard subtraction"
+        )
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [end, start] = take_function_args(self.name(), args)?;
+        let end = end.cast_to(&DataType::Date32, info.schema())?;
+        let start = start.cast_to(&DataType::Date32, info.schema())?;
+        Ok(ExprSimplifyResult::Simplified(
+            binary_expr(end, Operator::Minus, start)
+                .cast_to(&DataType::Int32, info.schema())?,
+        ))
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/date_part.rs b/datafusion/spark/src/function/datetime/date_part.rs
new file mode 100644
index 0000000000000..e30a162ef42db
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/date_part.rs
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::logical_date;
+use datafusion_common::{
+    Result, ScalarValue, internal_err, types::logical_string, utils::take_function_args,
+};
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use std::{any::Any, sync::Arc};
+
+/// Wrapper around datafusion date_part function to handle
+/// Spark behavior returning day of the week 1-indexed instead of 0-indexed and different part aliases.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#date_part>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkDatePart {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkDatePart {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkDatePart {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Timestamp),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_date())),
+                    ]),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("datepart")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkDatePart {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "date_part"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("Use return_field_from_args in this case instead.")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable)))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("spark date_part should have been simplified to standard date_part")
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [part_expr, date_expr] = take_function_args(self.name(), args)?;
+
+        let part = match part_expr.as_literal() {
+            Some(ScalarValue::Utf8(Some(v)))
+            | Some(ScalarValue::Utf8View(Some(v)))
+            | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
+            _ => {
+                return internal_err!(
+                    "First argument of `DATE_PART` must be non-null scalar Utf8"
+                );
+            }
+        };
+
+        // Map Spark-specific date part aliases to datafusion ones
+        let part = match part.as_str() {
+            "yearofweek" | "year_iso" => "isoyear",
+            "dayofweek" => "dow",
+            "dayofweek_iso" | "dow_iso" => "isodow",
+            other => other,
+        };
+
+        let part_expr = Expr::Literal(ScalarValue::new_utf8(part), None);
+
+        let date_part_expr = Expr::ScalarFunction(ScalarFunction::new_udf(
+            datafusion_functions::datetime::date_part(),
+            vec![part_expr, date_expr],
+        ));
+
+        match part {
+            // Add 1 for day-of-week parts to convert 0-indexed to 1-indexed
+            "dow" | "isodow" => Ok(ExprSimplifyResult::Simplified(
+                date_part_expr + Expr::Literal(ScalarValue::Int32(Some(1)), None),
+            )),
+            _ => Ok(ExprSimplifyResult::Simplified(date_part_expr)),
+        }
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/date_sub.rs b/datafusion/spark/src/function/datetime/date_sub.rs
new file mode 100644
index 0000000000000..af1b8d5a4e91e
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/date_sub.rs
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::ArrayRef;
+use arrow::compute;
+use arrow::datatypes::{DataType, Date32Type, Field, FieldRef};
+use datafusion_common::cast::{
+    as_date32_array, as_int8_array, as_int16_array, as_int32_array,
+};
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkDateSub {
+    signature: Signature,
+}
+
+impl Default for SparkDateSub {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkDateSub {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int8]),
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int16]),
+                    TypeSignature::Exact(vec![DataType::Date32, DataType::Int32]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkDateSub {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "date_sub"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Date32,
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_date_sub, vec![])(&args.args)
+    }
+}
+
+fn spark_date_sub(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [date_arg, days_arg] = args else {
+        return internal_err!(
+            "Spark `date_sub` function requires 2 arguments, got {}",
+            args.len()
+        );
+    };
+    let date_array = as_date32_array(date_arg)?;
+    let result = match days_arg.data_type() {
+        DataType::Int8 => {
+            let days_array = as_int8_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_sub(days as i32),
+            )?
+        }
+        DataType::Int16 => {
+            let days_array = as_int16_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_sub(days as i32),
+            )?
+        }
+        DataType::Int32 => {
+            let days_array = as_int32_array(days_arg)?;
+            compute::binary::<_, _, _, Date32Type>(
+                date_array,
+                days_array,
+                |date, days| date.wrapping_sub(days),
+            )?
+        }
+        _ => {
+            return internal_err!(
+                "Spark `date_sub` function: argument must be int8, int16, int32, got {:?}",
+                days_arg.data_type()
+            );
+        }
+    };
+    Ok(Arc::new(result))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_date_sub_nullability_non_nullable_args() {
+        let udf = SparkDateSub::new();
+        let date_field = Arc::new(Field::new("d", DataType::Date32, false));
+        let days_field = Arc::new(Field::new("n", DataType::Int32, false));
+
+        let result = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[date_field, days_field],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Date32);
+    }
+
+    #[test]
+    fn test_date_sub_nullability_nullable_arg() {
+        let udf = SparkDateSub::new();
+        let date_field = Arc::new(Field::new("d", DataType::Date32, false));
+        let nullable_days_field = Arc::new(Field::new("n", DataType::Int32, true));
+
+        let result = udf
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[date_field, nullable_days_field],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Date32);
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/date_trunc.rs b/datafusion/spark/src/function/datetime/date_trunc.rs
new file mode 100644
index 0000000000000..2199c90703b38
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/date_trunc.rs
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err, plan_err};
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
+};
+
+/// Spark date_trunc supports extra format aliases.
+/// It also handles timestamps with timezones by converting to session timezone first.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkDateTrunc {
+    signature: Signature,
+}
+
+impl Default for SparkDateTrunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkDateTrunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Timestamp,
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Timestamp(TimeUnit::Microsecond, None),
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkDateTrunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "date_trunc"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[1].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!(
+            "spark date_trunc should have been simplified to standard date_trunc"
+        )
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [fmt_expr, ts_expr] = take_function_args(self.name(), args)?;
+
+        let fmt = match fmt_expr.as_literal() {
+            Some(ScalarValue::Utf8(Some(v)))
+            | Some(ScalarValue::Utf8View(Some(v)))
+            | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
+            _ => {
+                return plan_err!(
+                    "First argument of `DATE_TRUNC` must be non-null scalar Utf8"
+                );
+            }
+        };
+
+        // Map Spark-specific fmt aliases to datafusion ones
+        let fmt = match fmt.as_str() {
+            "yy" | "yyyy" => "year",
+            "mm" | "mon" => "month",
+            "dd" => "day",
+            other => other,
+        };
+
+        let session_tz = info.config_options().execution.time_zone.clone();
+        let ts_type = ts_expr.get_type(info.schema())?;
+
+        // Spark interprets timestamps in the session timezone before truncating,
+        // then returns a timestamp at microsecond precision.
+        // See: https://github.com/apache/spark/blob/f310f4fcc95580a6824bc7d22b76006f79b8804a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala#L492
+        //
+        // For sub-second truncations (second, millisecond, microsecond), timezone
+        // adjustment is unnecessary since timezone offsets are whole seconds.
+        let ts_expr = match (&ts_type, fmt) {
+            // Sub-second truncations don't need timezone adjustment
+            (_, "second" | "millisecond" | "microsecond") => ts_expr,
+
+            // convert to session timezone, strip timezone and convert back to original timezone
+            (DataType::Timestamp(unit, tz), _) => {
+                let ts_expr = match &session_tz {
+                    Some(session_tz) => ts_expr.cast_to(
+                        &DataType::Timestamp(
+                            TimeUnit::Microsecond,
+                            Some(Arc::from(session_tz.as_str())),
+                        ),
+                        info.schema(),
+                    )?,
+                    None => ts_expr,
+                };
+                Expr::ScalarFunction(ScalarFunction::new_udf(
+                    datafusion_functions::datetime::to_local_time(),
+                    vec![ts_expr],
+                ))
+                .cast_to(&DataType::Timestamp(*unit, tz.clone()), info.schema())?
+            }
+
+            _ => {
+                return plan_err!(
+                    "Second argument of `DATE_TRUNC` must be Timestamp, got {}",
+                    ts_type
+                );
+            }
+        };
+
+        let fmt_expr = Expr::Literal(ScalarValue::new_utf8(fmt), None);
+
+        Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
+            ScalarFunction::new_udf(
+                datafusion_functions::datetime::date_trunc(),
+                vec![fmt_expr, ts_expr],
+            ),
+        )))
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/extract.rs b/datafusion/spark/src/function/datetime/extract.rs
new file mode 100644
index 0000000000000..4bf130a73f566
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/extract.rs
@@ -0,0 +1,268 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use arrow::array::ArrayRef;
+use arrow::compute::{DatePart, date_part};
+use arrow::datatypes::DataType;
+use datafusion_common::Result;
+use datafusion_common::utils::take_function_args;
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// Creates a signature for datetime extraction functions that accept timestamp types.
+fn extract_signature() -> Signature {
+    Signature::coercible(
+        vec![Coercion::new_exact(TypeSignatureClass::Timestamp)],
+        Volatility::Immutable,
+    )
+}
+
+// -----------------------------------------------------------------------------
+// SparkHour
+// -----------------------------------------------------------------------------
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkHour {
+    signature: Signature,
+}
+
+impl Default for SparkHour {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkHour {
+    pub fn new() -> Self {
+        Self {
+            signature: extract_signature(),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkHour {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "hour"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int32)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_hour, vec![])(&args.args)
+    }
+}
+
+fn spark_hour(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [ts_arg] = take_function_args("hour", args)?;
+    let result = date_part(ts_arg.as_ref(), DatePart::Hour)?;
+    Ok(result)
+}
+
+// -----------------------------------------------------------------------------
+// SparkMinute
+// -----------------------------------------------------------------------------
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkMinute {
+    signature: Signature,
+}
+
+impl Default for SparkMinute {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkMinute {
+    pub fn new() -> Self {
+        Self {
+            signature: extract_signature(),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkMinute {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "minute"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int32)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_minute, vec![])(&args.args)
+    }
+}
+
+fn spark_minute(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [ts_arg] = take_function_args("minute", args)?;
+    let result = date_part(ts_arg.as_ref(), DatePart::Minute)?;
+    Ok(result)
+}
+
+// -----------------------------------------------------------------------------
+// SparkSecond
+// -----------------------------------------------------------------------------
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSecond {
+    signature: Signature,
+}
+
+impl Default for SparkSecond {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSecond {
+    pub fn new() -> Self {
+        Self {
+            signature: extract_signature(),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSecond {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "second"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int32)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_second, vec![])(&args.args)
+    }
+}
+
+fn spark_second(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [ts_arg] = take_function_args("second", args)?;
+    let result = date_part(ts_arg.as_ref(), DatePart::Second)?;
+    Ok(result)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Array, Int32Array, TimestampMicrosecondArray};
+    use arrow::datatypes::TimeUnit;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_spark_hour() {
+        // Create a timestamp array: 2024-01-15 14:30:45 UTC (in microseconds)
+        // 14:30:45 -> hour = 14
+        let ts_micros = 1_705_329_045_000_000_i64; // 2024-01-15 14:30:45 UTC
+        let ts_array = TimestampMicrosecondArray::from(vec![Some(ts_micros), None]);
+        let ts_array = Arc::new(ts_array) as ArrayRef;
+
+        let result = spark_hour(&[ts_array]).unwrap();
+        let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+
+        assert_eq!(result.value(0), 14);
+        assert!(result.is_null(1));
+    }
+
+    #[test]
+    fn test_spark_minute() {
+        // 14:30:45 -> minute = 30
+        let ts_micros = 1_705_329_045_000_000_i64;
+        let ts_array = TimestampMicrosecondArray::from(vec![Some(ts_micros), None]);
+        let ts_array = Arc::new(ts_array) as ArrayRef;
+
+        let result = spark_minute(&[ts_array]).unwrap();
+        let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+
+        assert_eq!(result.value(0), 30);
+        assert!(result.is_null(1));
+    }
+
+    #[test]
+    fn test_spark_second() {
+        // 14:30:45 -> second = 45
+        let ts_micros = 1_705_329_045_000_000_i64;
+        let ts_array = TimestampMicrosecondArray::from(vec![Some(ts_micros), None]);
+        let ts_array = Arc::new(ts_array) as ArrayRef;
+
+        let result = spark_second(&[ts_array]).unwrap();
+        let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+
+        assert_eq!(result.value(0), 45);
+        assert!(result.is_null(1));
+    }
+
+    #[test]
+    fn test_hour_return_type() {
+        let func = SparkHour::new();
+        let result = func
+            .return_type(&[DataType::Timestamp(TimeUnit::Microsecond, None)])
+            .unwrap();
+        assert_eq!(result, DataType::Int32);
+    }
+
+    #[test]
+    fn test_minute_return_type() {
+        let func = SparkMinute::new();
+        let result = func
+            .return_type(&[DataType::Timestamp(TimeUnit::Microsecond, None)])
+            .unwrap();
+        assert_eq!(result, DataType::Int32);
+    }
+
+    #[test]
+    fn test_second_return_type() {
+        let func = SparkSecond::new();
+        let result = func
+            .return_type(&[DataType::Timestamp(TimeUnit::Microsecond, None)])
+            .unwrap();
+        assert_eq!(result, DataType::Int32);
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/from_utc_timestamp.rs b/datafusion/spark/src/function/datetime/from_utc_timestamp.rs
new file mode 100644
index 0000000000000..77cc66da5f37d
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/from_utc_timestamp.rs
@@ -0,0 +1,195 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::timezone::Tz;
+use arrow::array::{Array, ArrayRef, AsArray, PrimitiveBuilder, StringArrayType};
+use arrow::datatypes::TimeUnit;
+use arrow::datatypes::{
+    ArrowTimestampType, DataType, Field, FieldRef, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
+};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_datafusion_err, exec_err, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::datetime::to_local_time::adjust_to_local_time;
+use datafusion_functions::utils::make_scalar_function;
+
+/// Apache Spark `from_utc_timestamp` function.
+///
+/// Interprets the given timestamp as UTC and converts it to the given timezone.
+///
+/// Timestamp in Apache Spark represents number of microseconds from the Unix epoch, which is not
+/// timezone-agnostic. So in Apache Spark this function just shift the timestamp value from UTC timezone to
+/// the given timezone.
+///
+/// See <https://spark.apache.org/docs/latest/api/sql/index.html#from_utc_timestamp>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkFromUtcTimestamp {
+    signature: Signature,
+}
+
+impl Default for SparkFromUtcTimestamp {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkFromUtcTimestamp {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Timestamp,
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Timestamp(TimeUnit::Microsecond, None),
+                    ),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkFromUtcTimestamp {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "from_utc_timestamp"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[0].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_from_utc_timestamp, vec![])(&args.args)
+    }
+}
+
+fn spark_from_utc_timestamp(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [timestamp, timezone] = take_function_args("from_utc_timestamp", args)?;
+
+    match timestamp.data_type() {
+        DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampNanosecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampMicrosecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampMillisecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Second, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampSecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        ts_type => {
+            exec_err!("`from_utc_timestamp`: unsupported argument types: {ts_type}")
+        }
+    }
+}
+
+fn process_timestamp_with_tz_array<T: ArrowTimestampType>(
+    ts_array: &ArrayRef,
+    tz_array: &ArrayRef,
+    tz_opt: Option<Arc<str>>,
+) -> Result<ArrayRef> {
+    match tz_array.data_type() {
+        DataType::Utf8 => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i32>())
+        }
+        DataType::LargeUtf8 => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i64>())
+        }
+        DataType::Utf8View => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string_view())
+        }
+        other => {
+            exec_err!("`from_utc_timestamp`: timezone must be a string type, got {other}")
+        }
+    }
+}
+
+fn process_arrays<'a, T: ArrowTimestampType, S>(
+    return_tz_opt: Option<Arc<str>>,
+    ts_array: &ArrayRef,
+    tz_array: &'a S,
+) -> Result<ArrayRef>
+where
+    &'a S: StringArrayType<'a>,
+{
+    let ts_primitive = ts_array.as_primitive::<T>();
+    let mut builder = PrimitiveBuilder::<T>::with_capacity(ts_array.len());
+
+    for (ts_opt, tz_opt) in ts_primitive.iter().zip(tz_array.iter()) {
+        match (ts_opt, tz_opt) {
+            (Some(ts), Some(tz_str)) => {
+                let tz: Tz = tz_str.parse().map_err(|e| {
+                    exec_datafusion_err!(
+                        "`from_utc_timestamp`: invalid timezone '{tz_str}': {e}"
+                    )
+                })?;
+                let val = adjust_to_local_time::<T>(ts, tz)?;
+                builder.append_value(val);
+            }
+            _ => builder.append_null(),
+        }
+    }
+
+    builder = builder.with_timezone_opt(return_tz_opt);
+    Ok(Arc::new(builder.finish()))
+}
diff --git a/datafusion/spark/src/function/datetime/last_day.rs b/datafusion/spark/src/function/datetime/last_day.rs
new file mode 100644
index 0000000000000..4c6f731db18a6
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/last_day.rs
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray, Date32Array};
+use arrow::datatypes::{DataType, Date32Type, Field, FieldRef};
+use chrono::{Datelike, Duration, NaiveDate};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_datafusion_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkLastDay {
+    signature: Signature,
+}
+
+impl Default for SparkLastDay {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkLastDay {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(vec![DataType::Date32], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkLastDay {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "last_day"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let Some(field) = args.arg_fields.first() else {
+            return internal_err!("Spark `last_day` expects exactly one argument");
+        };
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Date32,
+            field.is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        let [arg] = take_function_args("last_day", args)?;
+        match arg {
+            ColumnarValue::Scalar(ScalarValue::Date32(days)) => {
+                if let Some(days) = days {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
+                        spark_last_day(days)?,
+                    ))))
+                } else {
+                    Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+                }
+            }
+            ColumnarValue::Array(array) => {
+                let result = match array.data_type() {
+                    DataType::Date32 => {
+                        let result: Date32Array = array
+                            .as_primitive::<Date32Type>()
+                            .try_unary(spark_last_day)?
+                            .with_data_type(DataType::Date32);
+                        Ok(Arc::new(result) as ArrayRef)
+                    }
+                    other => {
+                        internal_err!(
+                            "Unsupported data type {other:?} for Spark function `last_day`"
+                        )
+                    }
+                }?;
+                Ok(ColumnarValue::Array(result))
+            }
+            other => {
+                internal_err!("Unsupported arg {other:?} for Spark function `last_day")
+            }
+        }
+    }
+}
+
+fn spark_last_day(days: i32) -> Result<i32> {
+    let date = Date32Type::to_naive_date_opt(days).ok_or_else(|| {
+        exec_datafusion_err!(
+            "Spark `last_day`: Unable to convert days value {days} to date"
+        )
+    })?;
+
+    let (year, month) = (date.year(), date.month());
+    let (next_year, next_month) = if month == 12 {
+        (year + 1, 1)
+    } else {
+        (year, month + 1)
+    };
+
+    let first_day_next_month = NaiveDate::from_ymd_opt(next_year, next_month, 1)
+        .ok_or_else(|| {
+            exec_datafusion_err!(
+                "Spark `last_day`: Unable to parse date from {next_year}, {next_month}, 1"
+            )
+        })?;
+
+    Ok(Date32Type::from_naive_date(
+        first_day_next_month - Duration::days(1),
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Date32Array};
+    use arrow::datatypes::Field;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs};
+
+    #[test]
+    fn test_last_day_nullability_matches_input() {
+        let func = SparkLastDay::new();
+
+        let non_nullable_arg = Arc::new(Field::new("arg", DataType::Date32, false));
+        let nullable_arg = Arc::new(Field::new("arg", DataType::Date32, true));
+
+        let non_nullable_out = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_arg)],
+                scalar_arguments: &[None],
+            })
+            .expect("non-nullable arg should succeed");
+        assert_eq!(non_nullable_out.data_type(), &DataType::Date32);
+        assert!(!non_nullable_out.is_nullable());
+
+        let nullable_out = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_arg)],
+                scalar_arguments: &[None],
+            })
+            .expect("nullable arg should succeed");
+        assert_eq!(nullable_out.data_type(), &DataType::Date32);
+        assert!(nullable_out.is_nullable());
+    }
+
+    #[test]
+    fn test_last_day_scalar_evaluation() {
+        test_scalar_function!(
+            SparkLastDay::new(),
+            vec![ColumnarValue::Scalar(ScalarValue::Date32(Some(0)))],
+            Ok(Some(30)),
+            i32,
+            DataType::Date32,
+            Date32Array
+        );
+
+        test_scalar_function!(
+            SparkLastDay::new(),
+            vec![ColumnarValue::Scalar(ScalarValue::Date32(None))],
+            Ok(None),
+            i32,
+            DataType::Date32,
+            Date32Array
+        );
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/make_dt_interval.rs b/datafusion/spark/src/function/datetime/make_dt_interval.rs
new file mode 100644
index 0000000000000..f00b4c5804eca
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/make_dt_interval.rs
@@ -0,0 +1,590 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, AsArray, DurationMicrosecondBuilder, PrimitiveArray,
+};
+use arrow::datatypes::TimeUnit::Microsecond;
+use arrow::datatypes::{DataType, Field, FieldRef, Float64Type, Int32Type};
+use datafusion_common::types::{NativeType, logical_float64, logical_int32};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, internal_err, plan_datafusion_err,
+};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkMakeDtInterval {
+    signature: Signature,
+}
+
+impl Default for SparkMakeDtInterval {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkMakeDtInterval {
+    pub fn new() -> Self {
+        let int32 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int32,
+        );
+
+        let float64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
+
+        let variants = vec![
+            TypeSignature::Nullary,
+            // (days)
+            TypeSignature::Coercible(vec![int32.clone()]),
+            // (days, hours)
+            TypeSignature::Coercible(vec![int32.clone(), int32.clone()]),
+            // (days, hours, minutes)
+            TypeSignature::Coercible(vec![int32.clone(), int32.clone(), int32.clone()]),
+            // (days, hours, minutes, seconds)
+            TypeSignature::Coercible(vec![
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                float64,
+            ]),
+        ];
+
+        Self {
+            signature: Signature::one_of(variants, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkMakeDtInterval {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "make_dt_interval"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Note the return type is `DataType::Duration(TimeUnit::Microsecond)` and not `DataType::Interval(DayTime)` as you might expect.
+    /// This is because `DataType::Interval(DayTime)` has precision only to the millisecond, whilst Spark's `DayTimeIntervalType` has
+    /// precision to the microsecond. We use `DataType::Duration(TimeUnit::Microsecond)` in order to not lose any precision. See the
+    /// [Sail compatibility doc] for reference.
+    ///
+    /// [Sail compatibility doc]: https://github.com/lakehq/sail/blob/dc5368daa24d40a7758a299e1ba8fc985cb29108/docs/guide/dataframe/data-types/compatibility.md?plain=1#L260
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let has_non_finite_secs = args
+            .scalar_arguments
+            .get(3)
+            .and_then(|arg| {
+                arg.map(|scalar| match scalar {
+                    ScalarValue::Float64(Some(v)) => !v.is_finite(),
+                    ScalarValue::Float32(Some(v)) => !v.is_finite(),
+                    _ => false,
+                })
+            })
+            .unwrap_or(false);
+        let nullable =
+            has_non_finite_secs || args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Duration(Microsecond),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.is_empty() {
+            return Ok(ColumnarValue::Scalar(ScalarValue::DurationMicrosecond(
+                Some(0),
+            )));
+        }
+        if args.args.len() > 4 {
+            return Err(DataFusionError::Execution(format!(
+                "make_dt_interval expects between 0 and 4 arguments, got {}",
+                args.args.len()
+            )));
+        }
+        make_scalar_function(make_dt_interval_kernel, vec![])(&args.args)
+    }
+}
+
+fn make_dt_interval_kernel(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
+    let n_rows = args[0].len();
+    let days = args[0]
+        .as_primitive_opt::<Int32Type>()
+        .ok_or_else(|| plan_datafusion_err!("make_dt_interval arg[0] must be Int32"))?;
+    let hours: Option<&PrimitiveArray<Int32Type>> = args
+        .get(1)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[1] must be Int32")
+            })
+        })
+        .transpose()?;
+    let mins: Option<&PrimitiveArray<Int32Type>> = args
+        .get(2)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[2] must be Int32")
+            })
+        })
+        .transpose()?;
+    let secs: Option<&PrimitiveArray<Float64Type>> = args
+        .get(3)
+        .map(|a| {
+            a.as_primitive_opt::<Float64Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[3] must be Float64")
+            })
+        })
+        .transpose()?;
+    let mut builder = DurationMicrosecondBuilder::with_capacity(n_rows);
+
+    for i in 0..n_rows {
+        // if one column is NULL → result NULL
+        let any_null_present = days.is_null(i)
+            || hours.as_ref().is_some_and(|a| a.is_null(i))
+            || mins.as_ref().is_some_and(|a| a.is_null(i))
+            || secs
+                .as_ref()
+                .is_some_and(|a| a.is_null(i) || !a.value(i).is_finite());
+
+        if any_null_present {
+            builder.append_null();
+            continue;
+        }
+
+        // default values 0 or 0.0
+        let d = days.value(i);
+        let h = hours.as_ref().map_or(0, |a| a.value(i));
+        let mi = mins.as_ref().map_or(0, |a| a.value(i));
+        let s = secs.as_ref().map_or(0.0, |a| a.value(i));
+
+        match make_interval_dt_nano(d, h, mi, s) {
+            Some(v) => builder.append_value(v),
+            None => {
+                builder.append_null();
+                continue;
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
+fn make_interval_dt_nano(day: i32, hour: i32, min: i32, sec: f64) -> Option<i64> {
+    const HOURS_PER_DAY: i32 = 24;
+    const MINS_PER_HOUR: i32 = 60;
+    const SECS_PER_MINUTE: i64 = 60;
+    const MICROS_PER_SEC: i64 = 1_000_000;
+
+    let total_hours: i32 = day
+        .checked_mul(HOURS_PER_DAY)
+        .and_then(|v| v.checked_add(hour))?;
+
+    let total_mins: i32 = total_hours
+        .checked_mul(MINS_PER_HOUR)
+        .and_then(|v| v.checked_add(min))?;
+
+    let mut sec_whole: i64 = sec.trunc() as i64;
+    let sec_frac: f64 = sec - (sec_whole as f64);
+    let mut frac_us: i64 = (sec_frac * (MICROS_PER_SEC as f64)).round() as i64;
+
+    if frac_us.abs() >= MICROS_PER_SEC {
+        if frac_us > 0 {
+            frac_us -= MICROS_PER_SEC;
+            sec_whole = sec_whole.checked_add(1)?;
+        } else {
+            frac_us += MICROS_PER_SEC;
+            sec_whole = sec_whole.checked_sub(1)?;
+        }
+    }
+
+    let total_secs: i64 = (total_mins as i64)
+        .checked_mul(SECS_PER_MINUTE)
+        .and_then(|v| v.checked_add(sec_whole))?;
+
+    let total_us = total_secs
+        .checked_mul(MICROS_PER_SEC)
+        .and_then(|v| v.checked_add(frac_us))?;
+
+    Some(total_us)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::{DurationMicrosecondArray, Float64Array, Int32Array};
+    use arrow::datatypes::DataType::Duration;
+    use arrow::datatypes::{DataType, Field, TimeUnit::Microsecond};
+    use datafusion_common::{DataFusionError, Result, internal_datafusion_err};
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs};
+
+    use super::*;
+
+    fn run_make_dt_interval(arrs: Vec<ArrayRef>) -> Result<ArrayRef> {
+        make_dt_interval_kernel(&arrs)
+    }
+
+    #[test]
+    fn nulls_propagate_per_row() -> Result<()> {
+        let days = Arc::new(Int32Array::from(vec![
+            None,
+            Some(2),
+            Some(3),
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+        ])) as ArrayRef;
+
+        let hours = Arc::new(Int32Array::from(vec![
+            Some(1),
+            None,
+            Some(3),
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+        ])) as ArrayRef;
+
+        let mins = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            None,
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+        ])) as ArrayRef;
+
+        let secs = Arc::new(Float64Array::from(vec![
+            Some(1.0),
+            Some(2.0),
+            Some(3.0),
+            None,
+            Some(f64::NAN),
+            Some(f64::INFINITY),
+            Some(f64::NEG_INFINITY),
+        ])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![days, hours, mins, secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn return_field_respects_nullability() -> Result<()> {
+        let udf = SparkMakeDtInterval::new();
+
+        // All nullable inputs -> nullable output
+        let arg_fields = vec![
+            Arc::new(Field::new("days", DataType::Int32, true)),
+            Arc::new(Field::new("hours", DataType::Int32, true)),
+            Arc::new(Field::new("mins", DataType::Int32, true)),
+            Arc::new(Field::new("secs", DataType::Float64, true)),
+        ];
+
+        let out = udf.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &arg_fields,
+            scalar_arguments: &[None, None, None, None],
+        })?;
+        assert!(out.is_nullable());
+        assert_eq!(out.data_type(), &Duration(Microsecond));
+
+        // Non-nullable inputs -> non-nullable output
+        let non_nullable_arg_fields = vec![
+            Arc::new(Field::new("days", DataType::Int32, false)),
+            Arc::new(Field::new("hours", DataType::Int32, false)),
+            Arc::new(Field::new("mins", DataType::Int32, false)),
+            Arc::new(Field::new("secs", DataType::Float64, false)),
+        ];
+
+        let out = udf.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &non_nullable_arg_fields,
+            scalar_arguments: &[None, None, None, None],
+        })?;
+        assert!(!out.is_nullable());
+
+        // Non-finite secs scalar should force nullable even if fields are non-nullable
+        let scalar_values =
+            [None, None, None, Some(ScalarValue::Float64(Some(f64::NAN)))];
+        let scalar_refs = scalar_values.iter().map(|v| v.as_ref()).collect::<Vec<_>>();
+        let out = udf.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &non_nullable_arg_fields,
+            scalar_arguments: &scalar_refs,
+        })?;
+        assert!(out.is_nullable());
+
+        // Zero-arg call (defaults) should also be non-nullable
+        let out = udf.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[],
+            scalar_arguments: &[],
+        })?;
+        assert!(!out.is_nullable());
+
+        Ok(())
+    }
+
+    #[test]
+    fn error_months_overflow_should_be_null() -> Result<()> {
+        // months = year*12 + month → NULL
+
+        let days = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef;
+
+        let hours = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef;
+
+        let mins = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef;
+
+        let secs = Arc::new(Float64Array::from(vec![Some(1.0)])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![days, hours, mins, secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+
+        Ok(())
+    }
+
+    fn invoke_make_dt_interval_with_args(
+        args: Vec<ColumnarValue>,
+        number_rows: usize,
+    ) -> Result<ColumnarValue, DataFusionError> {
+        let arg_fields = args
+            .iter()
+            .map(|arg| Field::new("a", arg.data_type(), true).into())
+            .collect::<Vec<_>>();
+        let args = ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows,
+            return_field: Field::new("f", Duration(Microsecond), true).into(),
+            config_options: Arc::new(Default::default()),
+        };
+        SparkMakeDtInterval::new().invoke_with_args(args)
+    }
+
+    #[test]
+    fn zero_args_returns_zero_duration() -> Result<()> {
+        let number_rows: usize = 3;
+
+        let res: ColumnarValue = invoke_make_dt_interval_with_args(vec![], number_rows)?;
+        let arr = res.into_array(number_rows)?;
+        let arr = arr
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(arr.len(), number_rows);
+        for i in 0..number_rows {
+            assert!(!arr.is_null(i));
+            assert_eq!(arr.value(i), 0_i64);
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn one_day_minus_24_hours_equals_zero() -> Result<()> {
+        let arr_days = Arc::new(Int32Array::from(vec![Some(1), Some(-1)])) as ArrayRef;
+        let arr_hours = Arc::new(Int32Array::from(vec![Some(-24), Some(24)])) as ArrayRef;
+        let arr_mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let arr_secs =
+            Arc::new(Float64Array::from(vec![Some(0.0), Some(0.0)])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(out.len(), 2);
+        assert_eq!(out.null_count(), 0);
+        assert_eq!(out.value(0), 0_i64);
+        assert_eq!(out.value(1), 0_i64);
+        Ok(())
+    }
+
+    #[test]
+    fn one_hour_minus_60_mins_equals_zero() -> Result<()> {
+        let arr_days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let arr_hours = Arc::new(Int32Array::from(vec![Some(-1), Some(1)])) as ArrayRef;
+        let arr_mins = Arc::new(Int32Array::from(vec![Some(60), Some(-60)])) as ArrayRef;
+        let arr_secs =
+            Arc::new(Float64Array::from(vec![Some(0.0), Some(0.0)])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(out.len(), 2);
+        assert_eq!(out.null_count(), 0);
+        assert_eq!(out.value(0), 0_i64);
+        assert_eq!(out.value(1), 0_i64);
+        Ok(())
+    }
+
+    #[test]
+    fn one_mins_minus_60_secs_equals_zero() -> Result<()> {
+        let arr_days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let arr_hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let arr_mins = Arc::new(Int32Array::from(vec![Some(-1), Some(1)])) as ArrayRef;
+        let arr_secs =
+            Arc::new(Float64Array::from(vec![Some(60.0), Some(-60.0)])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![arr_days, arr_hours, arr_mins, arr_secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(out.len(), 2);
+        assert_eq!(out.null_count(), 0);
+        assert_eq!(out.value(0), 0_i64);
+        assert_eq!(out.value(1), 0_i64);
+        Ok(())
+    }
+
+    #[test]
+    fn frac_carries_up_to_next_second_positive() -> Result<()> {
+        // 0.9999995s → 1_000_000 µs (carry a +1s)
+        let days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let secs = Arc::new(Float64Array::from(vec![
+            Some(0.999_999_5),
+            Some(0.999_999_4),
+        ])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![days, hours, mins, secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(out.len(), 2);
+        assert_eq!(out.value(0), 1_000_000);
+        assert_eq!(out.value(1), 999_999);
+        Ok(())
+    }
+
+    #[test]
+    fn frac_carries_down_to_prev_second_negative() -> Result<()> {
+        // -0.9999995s → -1_000_000 µs (carry a −1s)
+        let days = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let hours = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let mins = Arc::new(Int32Array::from(vec![Some(0), Some(0)])) as ArrayRef;
+        let secs = Arc::new(Float64Array::from(vec![
+            Some(-0.999_999_5),
+            Some(-0.999_999_4),
+        ])) as ArrayRef;
+
+        let out = run_make_dt_interval(vec![days, hours, mins, secs])?;
+        let out = out
+            .as_any()
+            .downcast_ref::<DurationMicrosecondArray>()
+            .ok_or_else(|| {
+                internal_datafusion_err!("expected DurationMicrosecondArray")
+            })?;
+
+        assert_eq!(out.len(), 2);
+        assert_eq!(out.value(0), -1_000_000);
+        assert_eq!(out.value(1), -999_999);
+        Ok(())
+    }
+
+    #[test]
+    fn no_more_than_4_params() -> Result<()> {
+        let udf = SparkMakeDtInterval::new();
+
+        // Create args with 5 parameters (exceeds the limit of 4)
+        let args = vec![
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(2))),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(3))),
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(4.0))),
+            ColumnarValue::Scalar(ScalarValue::Int32(Some(5))),
+        ];
+
+        let arg_fields = args
+            .iter()
+            .map(|arg| Field::new("a", arg.data_type(), true).into())
+            .collect::<Vec<_>>();
+
+        let func_args = ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows: 1,
+            return_field: Field::new("f", Duration(Microsecond), true).into(),
+            config_options: Arc::new(Default::default()),
+        };
+
+        let res = udf.invoke_with_args(func_args);
+
+        assert!(
+            matches!(res, Err(DataFusionError::Execution(_))),
+            "make_dt_interval should return execution error for more than 4 arguments"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/make_interval.rs b/datafusion/spark/src/function/datetime/make_interval.rs
new file mode 100644
index 0000000000000..e4dd541793048
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/make_interval.rs
@@ -0,0 +1,609 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{Array, ArrayRef, IntervalMonthDayNanoBuilder, PrimitiveArray};
+use arrow::datatypes::DataType::Interval;
+use arrow::datatypes::IntervalUnit::MonthDayNano;
+use arrow::datatypes::{DataType, IntervalMonthDayNano};
+use datafusion_common::types::{NativeType, logical_float64, logical_int32};
+use datafusion_common::{DataFusionError, Result, ScalarValue, plan_datafusion_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkMakeInterval {
+    signature: Signature,
+}
+
+impl Default for SparkMakeInterval {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkMakeInterval {
+    pub fn new() -> Self {
+        let int32 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int32()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int32,
+        );
+
+        let float64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
+
+        let variants = vec![
+            TypeSignature::Nullary,
+            // year
+            TypeSignature::Coercible(vec![int32.clone()]),
+            // year, month
+            TypeSignature::Coercible(vec![int32.clone(), int32.clone()]),
+            // year, month, week
+            TypeSignature::Coercible(vec![int32.clone(), int32.clone(), int32.clone()]),
+            // year, month, week, day
+            TypeSignature::Coercible(vec![
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+            ]),
+            // year, month, week, day, hour
+            TypeSignature::Coercible(vec![
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+            ]),
+            // year, month, week, day, hour, minute
+            TypeSignature::Coercible(vec![
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+            ]),
+            // year, month, week, day, hour, minute, second
+            TypeSignature::Coercible(vec![
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                int32.clone(),
+                float64.clone(),
+            ]),
+        ];
+
+        Self {
+            signature: Signature::one_of(variants, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkMakeInterval {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "make_interval"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Interval(MonthDayNano))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.is_empty() {
+            return Ok(ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(
+                Some(IntervalMonthDayNano::new(0, 0, 0)),
+            )));
+        }
+        make_scalar_function(make_interval_kernel, vec![])(&args.args)
+    }
+}
+
+fn make_interval_kernel(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
+    use arrow::array::AsArray;
+    use arrow::datatypes::{Float64Type, Int32Type};
+
+    let n_rows = args[0].len();
+
+    let years = args[0]
+        .as_primitive_opt::<Int32Type>()
+        .ok_or_else(|| plan_datafusion_err!("make_interval arg[0] must be Int32"))?;
+    let months = args
+        .get(1)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[1] must be Int32")
+            })
+        })
+        .transpose()?;
+    let weeks = args
+        .get(2)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[2] must be Int32")
+            })
+        })
+        .transpose()?;
+    let days: Option<&PrimitiveArray<Int32Type>> = args
+        .get(3)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[3] must be Int32")
+            })
+        })
+        .transpose()?;
+    let hours: Option<&PrimitiveArray<Int32Type>> = args
+        .get(4)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[4] must be Int32")
+            })
+        })
+        .transpose()?;
+    let mins: Option<&PrimitiveArray<Int32Type>> = args
+        .get(5)
+        .map(|a| {
+            a.as_primitive_opt::<Int32Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[5] must be Int32")
+            })
+        })
+        .transpose()?;
+    let secs: Option<&PrimitiveArray<Float64Type>> = args
+        .get(6)
+        .map(|a| {
+            a.as_primitive_opt::<Float64Type>().ok_or_else(|| {
+                plan_datafusion_err!("make_dt_interval arg[6] must be Float64")
+            })
+        })
+        .transpose()?;
+
+    let mut builder = IntervalMonthDayNanoBuilder::with_capacity(n_rows);
+
+    for i in 0..n_rows {
+        // if one column is NULL → result NULL
+        let any_null_present = years.is_null(i)
+            || months.as_ref().is_some_and(|a| a.is_null(i))
+            || weeks.as_ref().is_some_and(|a| a.is_null(i))
+            || days.as_ref().is_some_and(|a| a.is_null(i))
+            || hours.as_ref().is_some_and(|a| a.is_null(i))
+            || mins.as_ref().is_some_and(|a| a.is_null(i))
+            || secs
+                .as_ref()
+                .is_some_and(|a| a.is_null(i) || !a.value(i).is_finite());
+
+        if any_null_present {
+            builder.append_null();
+            continue;
+        }
+
+        // default values 0 or 0.0
+        let y = years.value(i);
+        let mo = months.as_ref().map_or(0, |a| a.value(i));
+        let w = weeks.as_ref().map_or(0, |a| a.value(i));
+        let d = days.as_ref().map_or(0, |a| a.value(i));
+        let h = hours.as_ref().map_or(0, |a| a.value(i));
+        let mi = mins.as_ref().map_or(0, |a| a.value(i));
+        let s = secs.as_ref().map_or(0.0, |a| a.value(i));
+
+        match make_interval_month_day_nano(y, mo, w, d, h, mi, s) {
+            Some(v) => builder.append_value(v),
+            None => {
+                builder.append_null();
+                continue;
+            }
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
+
+fn make_interval_month_day_nano(
+    year: i32,
+    month: i32,
+    week: i32,
+    day: i32,
+    hour: i32,
+    min: i32,
+    sec: f64,
+) -> Option<IntervalMonthDayNano> {
+    // checks if overflow
+    let months = year.checked_mul(12).and_then(|v| v.checked_add(month))?;
+    let total_days = week.checked_mul(7).and_then(|v| v.checked_add(day))?;
+
+    let hours_nanos = (hour as i64).checked_mul(3_600_000_000_000)?;
+    let mins_nanos = (min as i64).checked_mul(60_000_000_000)?;
+
+    let sec_int = sec.trunc() as i64;
+    let frac = sec - sec.trunc();
+    let mut frac_nanos = (frac * 1_000_000_000.0).round() as i64;
+
+    if frac_nanos.abs() >= 1_000_000_000 {
+        if frac_nanos > 0 {
+            frac_nanos -= 1_000_000_000;
+        } else {
+            frac_nanos += 1_000_000_000;
+        }
+    }
+
+    let secs_nanos = sec_int.checked_mul(1_000_000_000)?;
+
+    let total_nanos = hours_nanos
+        .checked_add(mins_nanos)
+        .and_then(|v| v.checked_add(secs_nanos))
+        .and_then(|v| v.checked_add(frac_nanos))?;
+
+    Some(IntervalMonthDayNano::new(months, total_days, total_nanos))
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Float64Array, Int32Array, IntervalMonthDayNanoArray};
+    use arrow::datatypes::Field;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_common::{
+        Result, assert_eq_or_internal_err, internal_datafusion_err, internal_err,
+    };
+
+    use super::*;
+    fn run_make_interval_month_day_nano(arrs: Vec<ArrayRef>) -> Result<ArrayRef> {
+        make_interval_kernel(&arrs)
+    }
+
+    #[test]
+    fn nulls_propagate_per_row() {
+        let year = Arc::new(Int32Array::from(vec![
+            None,
+            Some(2),
+            Some(3),
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let month = Arc::new(Int32Array::from(vec![
+            Some(1),
+            None,
+            Some(3),
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let week = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            None,
+            Some(4),
+            Some(5),
+            Some(6),
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let day = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(3),
+            None,
+            Some(5),
+            Some(6),
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let hour = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(3),
+            Some(4),
+            None,
+            Some(6),
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let min = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(3),
+            Some(4),
+            Some(5),
+            None,
+            Some(7),
+            Some(8),
+            Some(9),
+        ]));
+        let sec = Arc::new(Float64Array::from(vec![
+            Some(1.0),
+            Some(2.0),
+            Some(3.0),
+            Some(4.0),
+            Some(5.0),
+            Some(6.0),
+            None,
+            Some(f64::INFINITY),
+            Some(f64::NEG_INFINITY),
+        ]));
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, min, sec,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .ok_or_else(|| internal_datafusion_err!("expected IntervalMonthDayNano"))
+            .unwrap();
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+    }
+
+    #[test]
+    fn error_months_overflow_should_be_null() {
+        // months = year*12 + month → NULL
+        let year = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, min, sec,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .ok_or_else(|| internal_datafusion_err!("expected IntervalMonthDayNano"))
+            .unwrap();
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+    }
+    #[test]
+    fn error_days_overflow_should_be_null() {
+        // months = year*12 + month →  NULL
+        let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, min, sec,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .ok_or_else(|| internal_datafusion_err!("expected IntervalMonthDayNano"))
+            .unwrap();
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+    }
+    #[test]
+    fn error_min_overflow_should_be_null() {
+        let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let min = Arc::new(Int32Array::from(vec![Some(i32::MAX)])) as ArrayRef;
+        let sec = Arc::new(Float64Array::from(vec![Some(0.0)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, min, sec,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .ok_or_else(|| internal_datafusion_err!("expected IntervalMonthDayNano"))
+            .unwrap();
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+    }
+    #[test]
+    fn error_sec_overflow_should_be_null() {
+        let year = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let min = Arc::new(Int32Array::from(vec![Some(0)])) as ArrayRef;
+        let sec = Arc::new(Float64Array::from(vec![Some(f64::MAX)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, min, sec,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .ok_or_else(|| internal_datafusion_err!("expected IntervalMonthDayNano"))
+            .unwrap();
+
+        for i in 0..out.len() {
+            assert!(out.is_null(i), "row {i} should be NULL");
+        }
+    }
+
+    #[test]
+    fn happy_path_all_present_single_row() {
+        // 1y 2m 3w 4d 5h 6m 7.25s
+        let year = Arc::new(Int32Array::from(vec![Some(1)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(2)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(3)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(4)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(5)])) as ArrayRef;
+        let mins = Arc::new(Int32Array::from(vec![Some(6)])) as ArrayRef;
+        let secs = Arc::new(Float64Array::from(vec![Some(7.25)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, mins, secs,
+        ])
+        .unwrap();
+        assert_eq!(out.data_type(), &Interval(MonthDayNano));
+
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .unwrap();
+        assert_eq!(out.len(), 1);
+        assert_eq!(out.null_count(), 0);
+
+        let v: IntervalMonthDayNano = out.value(0);
+        assert_eq!(v.months, 12 + 2); // 14
+        assert_eq!(v.days, 3 * 7 + 4); // 25
+        let expected_nanos = (5_i64 * 3600 + 6 * 60 + 7) * 1_000_000_000 + 250_000_000;
+        assert_eq!(v.nanoseconds, expected_nanos);
+    }
+
+    #[test]
+    fn negative_components_and_fractional_seconds() {
+        // -1y -2m  -1w -1d  -1h -1m  -1.5s
+        let year = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef;
+        let month = Arc::new(Int32Array::from(vec![Some(-2)])) as ArrayRef;
+        let week = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef;
+        let day = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef;
+        let hour = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef;
+        let mins = Arc::new(Int32Array::from(vec![Some(-1)])) as ArrayRef;
+        let secs = Arc::new(Float64Array::from(vec![Some(-1.5)])) as ArrayRef;
+
+        let out = run_make_interval_month_day_nano(vec![
+            year, month, week, day, hour, mins, secs,
+        ])
+        .unwrap();
+        let out = out
+            .as_any()
+            .downcast_ref::<IntervalMonthDayNanoArray>()
+            .unwrap();
+
+        assert_eq!(out.len(), 1);
+        assert_eq!(out.null_count(), 0);
+        let v = out.value(0);
+
+        assert_eq!(v.months, -12 + (-2)); // -14
+        assert_eq!(v.days, -7 + (-1)); // -8
+
+        // -(1h + 1m + 1.5s) en nanos
+        let expected_nanos = -((3600_i64 + 60 + 1) * 1_000_000_000 + 500_000_000);
+        assert_eq!(v.nanoseconds, expected_nanos);
+    }
+
+    fn invoke_make_interval_with_args(
+        args: Vec<ColumnarValue>,
+        number_rows: usize,
+    ) -> Result<ColumnarValue, DataFusionError> {
+        let arg_fields = args
+            .iter()
+            .map(|arg| Field::new("a", arg.data_type(), true).into())
+            .collect::<Vec<_>>();
+        let args = ScalarFunctionArgs {
+            args,
+            arg_fields,
+            number_rows,
+            return_field: Field::new("f", Interval(MonthDayNano), true).into(),
+            config_options: Arc::new(ConfigOptions::default()),
+        };
+        SparkMakeInterval::new().invoke_with_args(args)
+    }
+
+    #[test]
+    fn zero_args_returns_zero_seconds() -> Result<()> {
+        let number_rows = 2;
+        let res: ColumnarValue = invoke_make_interval_with_args(vec![], number_rows)?;
+
+        match res {
+            ColumnarValue::Array(arr) => {
+                let arr = arr
+                    .as_any()
+                    .downcast_ref::<IntervalMonthDayNanoArray>()
+                    .ok_or_else(|| {
+                        internal_datafusion_err!("expected IntervalMonthDayNanoArray")
+                    })?;
+                assert_eq_or_internal_err!(
+                    arr.len(),
+                    number_rows,
+                    "expected array length {number_rows}"
+                );
+                for i in 0..number_rows {
+                    let iv = arr.value(i);
+                    assert_eq_or_internal_err!(
+                        (iv.months, iv.days, iv.nanoseconds),
+                        (0, 0, 0),
+                        "row {i}: expected (0,0,0), got ({},{},{})",
+                        iv.months,
+                        iv.days,
+                        iv.nanoseconds
+                    );
+                }
+            }
+            ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(iv))) => {
+                assert_eq_or_internal_err!(
+                    (iv.months, iv.days, iv.nanoseconds),
+                    (0, 0, 0),
+                    "expected scalar 0s, got ({},{},{})",
+                    iv.months,
+                    iv.days,
+                    iv.nanoseconds
+                );
+            }
+            other => {
+                return internal_err!(
+                    "expected Array or Scalar IntervalMonthDayNano, got {other:?}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/mod.rs b/datafusion/spark/src/function/datetime/mod.rs
index a87df9a2c87a0..3133ed7337f25 100644
--- a/datafusion/spark/src/function/datetime/mod.rs
+++ b/datafusion/spark/src/function/datetime/mod.rs
@@ -15,11 +15,194 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod add_months;
+pub mod date_add;
+pub mod date_diff;
+pub mod date_part;
+pub mod date_sub;
+pub mod date_trunc;
+pub mod extract;
+pub mod from_utc_timestamp;
+pub mod last_day;
+pub mod make_dt_interval;
+pub mod make_interval;
+pub mod next_day;
+pub mod time_trunc;
+pub mod to_utc_timestamp;
+pub mod trunc;
+pub mod unix;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(add_months::SparkAddMonths, add_months);
+make_udf_function!(date_add::SparkDateAdd, date_add);
+make_udf_function!(date_diff::SparkDateDiff, date_diff);
+make_udf_function!(date_part::SparkDatePart, date_part);
+make_udf_function!(date_sub::SparkDateSub, date_sub);
+make_udf_function!(date_trunc::SparkDateTrunc, date_trunc);
+make_udf_function!(
+    from_utc_timestamp::SparkFromUtcTimestamp,
+    from_utc_timestamp
+);
+make_udf_function!(extract::SparkHour, hour);
+make_udf_function!(extract::SparkMinute, minute);
+make_udf_function!(extract::SparkSecond, second);
+make_udf_function!(last_day::SparkLastDay, last_day);
+make_udf_function!(make_dt_interval::SparkMakeDtInterval, make_dt_interval);
+make_udf_function!(make_interval::SparkMakeInterval, make_interval);
+make_udf_function!(next_day::SparkNextDay, next_day);
+make_udf_function!(time_trunc::SparkTimeTrunc, time_trunc);
+make_udf_function!(to_utc_timestamp::SparkToUtcTimestamp, to_utc_timestamp);
+make_udf_function!(trunc::SparkTrunc, trunc);
+make_udf_function!(unix::SparkUnixDate, unix_date);
+make_udf_function!(
+    unix::SparkUnixTimestamp,
+    unix_micros,
+    unix::SparkUnixTimestamp::microseconds
+);
+make_udf_function!(
+    unix::SparkUnixTimestamp,
+    unix_millis,
+    unix::SparkUnixTimestamp::milliseconds
+);
+make_udf_function!(
+    unix::SparkUnixTimestamp,
+    unix_seconds,
+    unix::SparkUnixTimestamp::seconds
+);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        add_months,
+        "Returns the date that is months months after start. The function returns NULL if at least one of the input parameters is NULL.",
+        arg1 arg2
+    ));
+    export_functions!((
+        date_add,
+        "Returns the date that is days days after start. The function returns NULL if at least one of the input parameters is NULL.",
+        arg1 arg2
+    ));
+    export_functions!((
+        date_sub,
+        "Returns the date that is days days before start. The function returns NULL if at least one of the input parameters is NULL.",
+        arg1 arg2
+    ));
+    export_functions!((hour, "Extracts the hour component of a timestamp.", arg1));
+    export_functions!((
+        minute,
+        "Extracts the minute component of a timestamp.",
+        arg1
+    ));
+    export_functions!((
+        second,
+        "Extracts the second component of a timestamp.",
+        arg1
+    ));
+    export_functions!((
+        last_day,
+        "Returns the last day of the month which the date belongs to.",
+        arg1
+    ));
+    export_functions!((
+        make_dt_interval,
+        "Make a day time interval from given days, hours, mins and secs (return type is actually a Duration(Microsecond))",
+         days hours mins secs
+    ));
+    export_functions!((
+        make_interval,
+        "Make interval from years, months, weeks, days, hours, mins and secs.",
+        years months weeks days hours mins secs
+    ));
+    // TODO: add once ANSI support is added:
+    // "When both of the input parameters are not NULL and day_of_week is an invalid input, the function throws SparkIllegalArgumentException if spark.sql.ansi.enabled is set to true, otherwise NULL."
+    export_functions!((
+        next_day,
+        "Returns the first date which is later than start_date and named as indicated. The function returns NULL if at least one of the input parameters is NULL.",
+        arg1 arg2
+    ));
+    export_functions!((
+        date_diff,
+        "Returns the number of days from start `start` to end `end`.",
+        end start
+    ));
+    export_functions!((
+        date_trunc,
+        "Truncates a timestamp `ts` to the unit specified by the format `fmt`.",
+        fmt ts
+    ));
+    export_functions!((
+        time_trunc,
+        "Truncates a time `t` to the unit specified by the format `fmt`.",
+        fmt t
+    ));
+    export_functions!((
+        trunc,
+        "Truncates a date `dt` to the unit specified by the format `fmt`.",
+        dt fmt
+    ));
+    export_functions!((
+        date_part,
+        "Extracts a part of the date or time from a date, time, or timestamp expression.",
+        arg1 arg2
+    ));
+    export_functions!((
+        from_utc_timestamp,
+        "Interpret a given timestamp `ts` in UTC timezone and then convert it to timezone `tz`.",
+        ts tz
+    ));
+    export_functions!((
+        to_utc_timestamp,
+        "Interpret a given timestamp `ts` in timezone `tz` and then convert it to UTC timezone.",
+        ts tz
+    ));
+    export_functions!((
+        unix_date,
+        "Returns the number of days since epoch (1970-01-01) for the given date `dt`.",
+        dt
+    ));
+    export_functions!((
+        unix_micros,
+        "Returns the number of microseconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp `ts`.",
+        ts
+    ));
+    export_functions!((
+        unix_millis,
+        "Returns the number of milliseconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp `ts`.",
+        ts
+    ));
+    export_functions!((
+        unix_seconds,
+        "Returns the number of seconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp `ts`.",
+        ts
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![
+        add_months(),
+        date_add(),
+        date_diff(),
+        date_part(),
+        date_sub(),
+        date_trunc(),
+        from_utc_timestamp(),
+        hour(),
+        last_day(),
+        make_dt_interval(),
+        make_interval(),
+        minute(),
+        next_day(),
+        second(),
+        time_trunc(),
+        to_utc_timestamp(),
+        trunc(),
+        unix_date(),
+        unix_micros(),
+        unix_millis(),
+        unix_seconds(),
+    ]
 }
diff --git a/datafusion/spark/src/function/datetime/next_day.rs b/datafusion/spark/src/function/datetime/next_day.rs
new file mode 100644
index 0000000000000..a456a78315970
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/next_day.rs
@@ -0,0 +1,288 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, AsArray, Date32Array, StringArrayType};
+use arrow::datatypes::{DataType, Date32Type, Field, FieldRef};
+use chrono::{Datelike, Duration, Weekday};
+use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#next_day>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkNextDay {
+    signature: Signature,
+}
+
+impl Default for SparkNextDay {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkNextDay {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(
+                vec![DataType::Date32, DataType::Utf8],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkNextDay {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "next_day"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result<FieldRef> {
+        // Spark marks next_day as always nullable because invalid day_of_week values
+        // can yield NULL even when inputs are non-null.
+        Ok(Arc::new(Field::new(self.name(), DataType::Date32, true)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        let [date, day_of_week] = args.as_slice() else {
+            return exec_err!(
+                "Spark `next_day` function requires 2 arguments, got {}",
+                args.len()
+            );
+        };
+
+        match (date, day_of_week) {
+            (ColumnarValue::Scalar(date), ColumnarValue::Scalar(day_of_week)) => {
+                match (date, day_of_week) {
+                    (
+                        ScalarValue::Date32(days),
+                        ScalarValue::Utf8(day_of_week)
+                        | ScalarValue::LargeUtf8(day_of_week)
+                        | ScalarValue::Utf8View(day_of_week),
+                    ) => {
+                        if let Some(days) = days {
+                            if let Some(day_of_week) = day_of_week {
+                                Ok(ColumnarValue::Scalar(ScalarValue::Date32(
+                                    spark_next_day(*days, day_of_week.as_str()),
+                                )))
+                            } else {
+                                // TODO: if spark.sql.ansi.enabled is false,
+                                //  returns NULL instead of an error for a malformed dayOfWeek.
+                                Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+                            }
+                        } else {
+                            Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+                        }
+                    }
+                    _ => exec_err!(
+                        "Spark `next_day` function: first arg must be date, second arg must be string. Got {args:?}"
+                    ),
+                }
+            }
+            (ColumnarValue::Array(date_array), ColumnarValue::Scalar(day_of_week)) => {
+                match (date_array.data_type(), day_of_week) {
+                    (
+                        DataType::Date32,
+                        ScalarValue::Utf8(day_of_week)
+                        | ScalarValue::LargeUtf8(day_of_week)
+                        | ScalarValue::Utf8View(day_of_week),
+                    ) => {
+                        if let Some(day_of_week) = day_of_week {
+                            let result: Date32Array = date_array
+                                .as_primitive::<Date32Type>()
+                                .unary_opt(|days| {
+                                    spark_next_day(days, day_of_week.as_str())
+                                })
+                                .with_data_type(DataType::Date32);
+                            Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
+                        } else {
+                            // TODO: if spark.sql.ansi.enabled is false,
+                            //  returns NULL instead of an error for a malformed dayOfWeek.
+                            Ok(ColumnarValue::Scalar(ScalarValue::Date32(None)))
+                        }
+                    }
+                    _ => exec_err!(
+                        "Spark `next_day` function: first arg must be date, second arg must be string. Got {args:?}"
+                    ),
+                }
+            }
+            (
+                ColumnarValue::Array(date_array),
+                ColumnarValue::Array(day_of_week_array),
+            ) => {
+                let result = match (date_array.data_type(), day_of_week_array.data_type())
+                {
+                    (
+                        DataType::Date32,
+                        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View,
+                    ) => {
+                        let date_array: &Date32Array =
+                            date_array.as_primitive::<Date32Type>();
+                        match day_of_week_array.data_type() {
+                            DataType::Utf8 => {
+                                let day_of_week_array =
+                                    day_of_week_array.as_string::<i32>();
+                                process_next_day_arrays(date_array, day_of_week_array)
+                            }
+                            DataType::LargeUtf8 => {
+                                let day_of_week_array =
+                                    day_of_week_array.as_string::<i64>();
+                                process_next_day_arrays(date_array, day_of_week_array)
+                            }
+                            DataType::Utf8View => {
+                                let day_of_week_array =
+                                    day_of_week_array.as_string_view();
+                                process_next_day_arrays(date_array, day_of_week_array)
+                            }
+                            other => {
+                                exec_err!(
+                                    "Spark `next_day` function: second arg must be string. Got {other:?}"
+                                )
+                            }
+                        }
+                    }
+                    (left, right) => {
+                        exec_err!(
+                            "Spark `next_day` function: first arg must be date, second arg must be string. Got {left:?}, {right:?}"
+                        )
+                    }
+                }?;
+                Ok(ColumnarValue::Array(result))
+            }
+            _ => exec_err!("Unsupported args {args:?} for Spark function `next_day`"),
+        }
+    }
+}
+
+fn process_next_day_arrays<'a, S>(
+    date_array: &Date32Array,
+    day_of_week_array: &'a S,
+) -> Result<ArrayRef>
+where
+    &'a S: StringArrayType<'a>,
+{
+    let result = date_array
+        .iter()
+        .zip(day_of_week_array.iter())
+        .map(|(days, day_of_week)| {
+            if let Some(days) = days {
+                if let Some(day_of_week) = day_of_week {
+                    spark_next_day(days, day_of_week)
+                } else {
+                    // TODO: if spark.sql.ansi.enabled is false,
+                    //  returns NULL instead of an error for a malformed dayOfWeek.
+                    None
+                }
+            } else {
+                None
+            }
+        })
+        .collect::<Date32Array>();
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+fn spark_next_day(days: i32, day_of_week: &str) -> Option<i32> {
+    let date = Date32Type::to_naive_date_opt(days)?;
+
+    let day_of_week = day_of_week.trim().to_uppercase();
+    let day_of_week = match day_of_week.as_str() {
+        "MO" | "MON" | "MONDAY" => Some("MONDAY"),
+        "TU" | "TUE" | "TUESDAY" => Some("TUESDAY"),
+        "WE" | "WED" | "WEDNESDAY" => Some("WEDNESDAY"),
+        "TH" | "THU" | "THURSDAY" => Some("THURSDAY"),
+        "FR" | "FRI" | "FRIDAY" => Some("FRIDAY"),
+        "SA" | "SAT" | "SATURDAY" => Some("SATURDAY"),
+        "SU" | "SUN" | "SUNDAY" => Some("SUNDAY"),
+        _ => {
+            // TODO: if spark.sql.ansi.enabled is false,
+            //  returns NULL instead of an error for a malformed dayOfWeek.
+            None
+        }
+    };
+
+    if let Some(day_of_week) = day_of_week {
+        let day_of_week = day_of_week.parse::<Weekday>();
+        match day_of_week {
+            Ok(day_of_week) => Some(Date32Type::from_naive_date(
+                date + Duration::days(
+                    (7 - date.weekday().days_since(day_of_week)) as i64,
+                ),
+            )),
+            Err(_) => {
+                // TODO: if spark.sql.ansi.enabled is false,
+                //  returns NULL instead of an error for a malformed dayOfWeek.
+                None
+            }
+        }
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn return_type_is_not_used() {
+        let func = SparkNextDay::new();
+        let err = func
+            .return_type(&[DataType::Date32, DataType::Utf8])
+            .unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("return_field_from_args should be used instead")
+        );
+    }
+
+    #[test]
+    fn next_day_is_always_nullable() {
+        let func = SparkNextDay::new();
+        let date_field: FieldRef =
+            Arc::new(Field::new("start_date", DataType::Date32, false));
+        let day_field: FieldRef =
+            Arc::new(Field::new("day_of_week", DataType::Utf8, false));
+
+        let field = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&date_field), Arc::clone(&day_field)],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        assert_eq!(field.data_type(), &DataType::Date32);
+        assert!(field.is_nullable());
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/time_trunc.rs b/datafusion/spark/src/function/datetime/time_trunc.rs
new file mode 100644
index 0000000000000..718502a05ee6d
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/time_trunc.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::logical_string;
+use datafusion_common::{Result, ScalarValue, internal_err, plan_err};
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+
+/// Spark time_trunc function only handles time inputs.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#time_trunc>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkTimeTrunc {
+    signature: Signature,
+}
+
+impl Default for SparkTimeTrunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkTimeTrunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    Coercion::new_exact(TypeSignatureClass::Time),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkTimeTrunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "time_trunc"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[1].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!(
+            "spark time_trunc should have been simplified to standard date_trunc"
+        )
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let fmt_expr = &args[0];
+
+        let fmt = match fmt_expr.as_literal() {
+            Some(ScalarValue::Utf8(Some(v)))
+            | Some(ScalarValue::Utf8View(Some(v)))
+            | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
+            _ => {
+                return plan_err!(
+                    "First argument of `TIME_TRUNC` must be non-null scalar Utf8"
+                );
+            }
+        };
+
+        if !matches!(
+            fmt.as_str(),
+            "hour" | "minute" | "second" | "millisecond" | "microsecond"
+        ) {
+            return plan_err!(
+                "The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond"
+            );
+        }
+
+        Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
+            ScalarFunction::new_udf(datafusion_functions::datetime::date_trunc(), args),
+        )))
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/to_utc_timestamp.rs b/datafusion/spark/src/function/datetime/to_utc_timestamp.rs
new file mode 100644
index 0000000000000..0e8c267a390e1
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/to_utc_timestamp.rs
@@ -0,0 +1,225 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::timezone::Tz;
+use arrow::array::{Array, ArrayRef, AsArray, PrimitiveBuilder, StringArrayType};
+use arrow::datatypes::TimeUnit;
+use arrow::datatypes::{
+    ArrowTimestampType, DataType, Field, FieldRef, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
+};
+use chrono::{DateTime, Offset, TimeZone};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{
+    Result, exec_datafusion_err, exec_err, internal_datafusion_err, internal_err,
+};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// Apache Spark `to_utc_timestamp` function.
+///
+/// Interprets the given timestamp in the provided timezone and then converts it to UTC.
+///
+/// Timestamp in Apache Spark represents number of microseconds from the Unix epoch, which is not
+/// timezone-agnostic. So in Apache Spark this function just shift the timestamp value from the given
+/// timezone to UTC timezone.
+///
+/// See <https://spark.apache.org/docs/latest/api/sql/index.html#to_utc_timestamp>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkToUtcTimestamp {
+    signature: Signature,
+}
+
+impl Default for SparkToUtcTimestamp {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkToUtcTimestamp {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Timestamp,
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Timestamp(TimeUnit::Microsecond, None),
+                    ),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkToUtcTimestamp {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "to_utc_timestamp"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[0].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(to_utc_timestamp, vec![])(&args.args)
+    }
+}
+
+fn to_utc_timestamp(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [timestamp, timezone] = take_function_args("to_utc_timestamp", args)?;
+
+    match timestamp.data_type() {
+        DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampNanosecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampMicrosecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampMillisecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        DataType::Timestamp(TimeUnit::Second, tz_opt) => {
+            process_timestamp_with_tz_array::<TimestampSecondType>(
+                timestamp,
+                timezone,
+                tz_opt.clone(),
+            )
+        }
+        ts_type => {
+            exec_err!("`to_utc_timestamp`: unsupported argument types: {ts_type}")
+        }
+    }
+}
+
+fn process_timestamp_with_tz_array<T: ArrowTimestampType>(
+    ts_array: &ArrayRef,
+    tz_array: &ArrayRef,
+    tz_opt: Option<Arc<str>>,
+) -> Result<ArrayRef> {
+    match tz_array.data_type() {
+        DataType::Utf8 => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i32>())
+        }
+        DataType::LargeUtf8 => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i64>())
+        }
+        DataType::Utf8View => {
+            process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string_view())
+        }
+        other => {
+            exec_err!("`to_utc_timestamp`: timezone must be a string type, got {other}")
+        }
+    }
+}
+
+fn process_arrays<'a, T: ArrowTimestampType, S>(
+    return_tz_opt: Option<Arc<str>>,
+    ts_array: &ArrayRef,
+    tz_array: &'a S,
+) -> Result<ArrayRef>
+where
+    &'a S: StringArrayType<'a>,
+{
+    let ts_primitive = ts_array.as_primitive::<T>();
+    let mut builder = PrimitiveBuilder::<T>::with_capacity(ts_array.len());
+
+    for (ts_opt, tz_opt) in ts_primitive.iter().zip(tz_array.iter()) {
+        match (ts_opt, tz_opt) {
+            (Some(ts), Some(tz_str)) => {
+                let tz: Tz = tz_str.parse().map_err(|e| {
+                    exec_datafusion_err!(
+                        "`to_utc_timestamp`: invalid timezone '{tz_str}': {e}"
+                    )
+                })?;
+                let val = adjust_to_utc_time::<T>(ts, tz)?;
+                builder.append_value(val);
+            }
+            _ => builder.append_null(),
+        }
+    }
+
+    builder = builder.with_timezone_opt(return_tz_opt);
+    Ok(Arc::new(builder.finish()))
+}
+
+fn adjust_to_utc_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
+    let dt = match T::UNIT {
+        TimeUnit::Nanosecond => Some(DateTime::from_timestamp_nanos(ts)),
+        TimeUnit::Microsecond => DateTime::from_timestamp_micros(ts),
+        TimeUnit::Millisecond => DateTime::from_timestamp_millis(ts),
+        TimeUnit::Second => DateTime::from_timestamp(ts, 0),
+    }
+    .ok_or_else(|| internal_datafusion_err!("Invalid timestamp"))?;
+    let naive_dt = dt.naive_utc();
+
+    let offset_seconds = tz
+        .offset_from_utc_datetime(&naive_dt)
+        .fix()
+        .local_minus_utc() as i64;
+
+    let offset_in_unit = match T::UNIT {
+        TimeUnit::Nanosecond => offset_seconds.checked_mul(1_000_000_000),
+        TimeUnit::Microsecond => offset_seconds.checked_mul(1_000_000),
+        TimeUnit::Millisecond => offset_seconds.checked_mul(1_000),
+        TimeUnit::Second => Some(offset_seconds),
+    }
+    .ok_or_else(|| internal_datafusion_err!("Offset overflow"))?;
+
+    ts.checked_sub(offset_in_unit).ok_or_else(|| {
+        internal_datafusion_err!("Timestamp overflow during timezone adjustment")
+    })
+}
diff --git a/datafusion/spark/src/function/datetime/trunc.rs b/datafusion/spark/src/function/datetime/trunc.rs
new file mode 100644
index 0000000000000..b584cc9a70d44
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/trunc.rs
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
+use datafusion_common::types::{NativeType, logical_date, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err, plan_err};
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
+};
+
+/// Spark trunc supports date inputs only and extra format aliases.
+/// Also spark trunc's argument order is (date, format).
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#trunc>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkTrunc {
+    signature: Signature,
+}
+
+impl Default for SparkTrunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkTrunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_date()),
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Date,
+                    ),
+                    Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkTrunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "trunc"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            args.arg_fields[0].data_type().clone(),
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("spark trunc should have been simplified to standard date_trunc")
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [dt_expr, fmt_expr] = take_function_args(self.name(), args)?;
+
+        let fmt = match fmt_expr.as_literal() {
+            Some(ScalarValue::Utf8(Some(v)))
+            | Some(ScalarValue::Utf8View(Some(v)))
+            | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
+            _ => {
+                return plan_err!(
+                    "Second argument of `TRUNC` must be non-null scalar Utf8"
+                );
+            }
+        };
+
+        // Map Spark-specific fmt aliases to datafusion ones
+        let fmt = match fmt.as_str() {
+            "yy" | "yyyy" => "year",
+            "mm" | "mon" => "month",
+            "year" | "month" | "day" | "week" | "quarter" => fmt.as_str(),
+            _ => {
+                return plan_err!(
+                    "The format argument of `TRUNC` must be one of: year, yy, yyyy, month, mm, mon, day, week, quarter."
+                );
+            }
+        };
+        let return_type = dt_expr.get_type(info.schema())?;
+
+        let fmt_expr = Expr::Literal(ScalarValue::new_utf8(fmt), None);
+
+        // Spark uses Dates so we need to cast to timestamp and back to work with datafusion's date_trunc
+        Ok(ExprSimplifyResult::Simplified(
+            Expr::ScalarFunction(ScalarFunction::new_udf(
+                datafusion_functions::datetime::date_trunc(),
+                vec![
+                    fmt_expr,
+                    dt_expr.cast_to(
+                        &DataType::Timestamp(TimeUnit::Nanosecond, None),
+                        info.schema(),
+                    )?,
+                ],
+            ))
+            .cast_to(&return_type, info.schema())?,
+        ))
+    }
+}
diff --git a/datafusion/spark/src/function/datetime/unix.rs b/datafusion/spark/src/function/datetime/unix.rs
new file mode 100644
index 0000000000000..4254b2ed85d58
--- /dev/null
+++ b/datafusion/spark/src/function/datetime/unix.rs
@@ -0,0 +1,174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
+use datafusion_common::types::logical_date;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{
+    Coercion, ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs,
+    ScalarUDFImpl, Signature, TypeSignatureClass, Volatility,
+};
+
+/// Returns the number of days since epoch (1970-01-01) for the given date.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#unix_date>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkUnixDate {
+    signature: Signature,
+}
+
+impl Default for SparkUnixDate {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkUnixDate {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Native(
+                    logical_date(),
+                ))],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkUnixDate {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "unix_date"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields[0].is_nullable();
+        Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable)))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("invoke_with_args should not be called on SparkUnixDate")
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [date] = take_function_args(self.name(), args)?;
+        Ok(ExprSimplifyResult::Simplified(
+            date.cast_to(&DataType::Date32, info.schema())?
+                .cast_to(&DataType::Int32, info.schema())?,
+        ))
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkUnixTimestamp {
+    time_unit: TimeUnit,
+    signature: Signature,
+    name: &'static str,
+}
+
+impl SparkUnixTimestamp {
+    pub fn new(name: &'static str, time_unit: TimeUnit) -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_exact(TypeSignatureClass::Timestamp)],
+                Volatility::Immutable,
+            ),
+            time_unit,
+            name,
+        }
+    }
+
+    /// Returns the number of microseconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp.
+    /// <https://spark.apache.org/docs/latest/api/sql/index.html#unix_micros>
+    pub fn microseconds() -> Self {
+        Self::new("unix_micros", TimeUnit::Microsecond)
+    }
+
+    /// Returns the number of milliseconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp.
+    /// <https://spark.apache.org/docs/latest/api/sql/index.html#unix_millis>
+    pub fn milliseconds() -> Self {
+        Self::new("unix_millis", TimeUnit::Millisecond)
+    }
+
+    /// Returns the number of seconds since epoch (1970-01-01 00:00:00 UTC) for the given timestamp.
+    /// <https://spark.apache.org/docs/latest/api/sql/index.html#unix_seconds>
+    pub fn seconds() -> Self {
+        Self::new("unix_seconds", TimeUnit::Second)
+    }
+}
+
+impl ScalarUDFImpl for SparkUnixTimestamp {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields[0].is_nullable();
+        Ok(Arc::new(Field::new(self.name(), DataType::Int64, nullable)))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        internal_err!("invoke_with_args should not be called on `{}`", self.name())
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [ts] = take_function_args(self.name(), args)?;
+        Ok(ExprSimplifyResult::Simplified(
+            ts.cast_to(
+                &DataType::Timestamp(self.time_unit, Some("UTC".into())),
+                info.schema(),
+            )?
+            .cast_to(&DataType::Int64, info.schema())?,
+        ))
+    }
+}
diff --git a/datafusion/spark/src/function/error_utils.rs b/datafusion/spark/src/function/error_utils.rs
index b972d64ed3e9a..362a32bcd0cc2 100644
--- a/datafusion/spark/src/function/error_utils.rs
+++ b/datafusion/spark/src/function/error_utils.rs
@@ -18,7 +18,7 @@
 // TODO: https://github.com/apache/spark/tree/master/common/utils/src/main/resources/error
 
 use arrow::datatypes::DataType;
-use datafusion_common::{exec_datafusion_err, internal_datafusion_err, DataFusionError};
+use datafusion_common::{DataFusionError, exec_datafusion_err, internal_datafusion_err};
 
 pub fn invalid_arg_count_exec_err(
     function_name: &str,
@@ -44,7 +44,9 @@ pub fn unsupported_data_type_exec_err(
     required: &str,
     provided: &DataType,
 ) -> DataFusionError {
-    exec_datafusion_err!("Unsupported Data Type: Spark `{function_name}` function expects {required}, got {provided}")
+    exec_datafusion_err!(
+        "Unsupported Data Type: Spark `{function_name}` function expects {required}, got {provided}"
+    )
 }
 
 pub fn unsupported_data_types_exec_err(
diff --git a/datafusion/spark/src/function/functions_nested_utils.rs b/datafusion/spark/src/function/functions_nested_utils.rs
new file mode 100644
index 0000000000000..b455ba735d749
--- /dev/null
+++ b/datafusion/spark/src/function/functions_nested_utils.rs
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::ColumnarValue;
+
+/// array function wrapper that differentiates between scalar (length 1) and array.
+pub(crate) fn make_scalar_function<F>(
+    inner: F,
+) -> impl Fn(&[ColumnarValue]) -> Result<ColumnarValue>
+where
+    F: Fn(&[ArrayRef]) -> Result<ArrayRef>,
+{
+    move |args: &[ColumnarValue]| {
+        // first, identify if any of the arguments is an Array. If yes, store its `len`,
+        // as any scalar will need to be converted to an array of len `len`.
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+
+        let args = ColumnarValue::values_to_arrays(args)?;
+
+        let result = (inner)(&args);
+
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+}
diff --git a/datafusion/spark/src/function/hash/crc32.rs b/datafusion/spark/src/function/hash/crc32.rs
new file mode 100644
index 0000000000000..f079d82f9bded
--- /dev/null
+++ b/datafusion/spark/src/function/hash/crc32.rs
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use crc32fast::Hasher;
+use datafusion_common::cast::{
+    as_binary_array, as_binary_view_array, as_fixed_size_binary_array,
+    as_large_binary_array,
+};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#crc32>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkCrc32 {
+    signature: Signature,
+}
+
+impl Default for SparkCrc32 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkCrc32 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_implicit(
+                    TypeSignatureClass::Binary,
+                    vec![TypeSignatureClass::Native(logical_string())],
+                    NativeType::Binary,
+                )],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkCrc32 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "crc32"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), DataType::Int64, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_crc32, vec![])(&args.args)
+    }
+}
+
+fn spark_crc32_digest(value: &[u8]) -> i64 {
+    let mut hasher = Hasher::new();
+    hasher.update(value);
+    hasher.finalize() as i64
+}
+
+fn spark_crc32_impl<'a>(input: impl Iterator<Item = Option<&'a [u8]>>) -> ArrayRef {
+    let result = input
+        .map(|value| value.map(spark_crc32_digest))
+        .collect::<Int64Array>();
+    Arc::new(result)
+}
+
+fn spark_crc32(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = take_function_args("crc32", args)?;
+
+    match input.data_type() {
+        DataType::Null => Ok(Arc::new(Int64Array::new_null(input.len()))),
+        DataType::Binary => {
+            let input = as_binary_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        DataType::LargeBinary => {
+            let input = as_large_binary_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        DataType::BinaryView => {
+            let input = as_binary_view_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        DataType::FixedSizeBinary(_) => {
+            let input = as_fixed_size_binary_array(input)?;
+            Ok(spark_crc32_impl(input.iter()))
+        }
+        dt => {
+            internal_err!("Unsupported data type for crc32: {dt}")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_crc32_nullability() -> Result<()> {
+        let crc32_func = SparkCrc32::new();
+
+        // non-nullable field should produce non-nullable output
+        let field_not_null = Arc::new(Field::new("data", DataType::Binary, false));
+        let result = crc32_func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: std::slice::from_ref(&field_not_null),
+            scalar_arguments: &[None],
+        })?;
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int64);
+
+        // nullable field should produce nullable output
+        let field_nullable = Arc::new(Field::new("data", DataType::Binary, true));
+        let result = crc32_func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[field_nullable],
+            scalar_arguments: &[None],
+        })?;
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int64);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/hash/mod.rs b/datafusion/spark/src/function/hash/mod.rs
index f31918e6a46b5..5860596ac70a3 100644
--- a/datafusion/spark/src/function/hash/mod.rs
+++ b/datafusion/spark/src/function/hash/mod.rs
@@ -15,19 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod crc32;
+pub mod sha1;
 pub mod sha2;
 
 use datafusion_expr::ScalarUDF;
 use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
+make_udf_function!(crc32::SparkCrc32, crc32);
+make_udf_function!(sha1::SparkSha1, sha1);
 make_udf_function!(sha2::SparkSha2, sha2);
 
 pub mod expr_fn {
     use datafusion_functions::export_functions;
-    export_functions!((sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2));
+    export_functions!(
+        (crc32, "crc32(expr) - Returns a cyclic redundancy check value of the expr as a bigint.", arg1),
+        (sha1, "sha1(expr) - Returns a SHA-1 hash value of the expr as a hex string.", arg1),
+        (sha2, "sha2(expr, bitLength) - Returns a checksum of SHA-2 family as a hex string of expr. SHA-224, SHA-256, SHA-384, and SHA-512 are supported. Bit length of 0 is equivalent to 256.", arg1 arg2)
+    );
 }
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![sha2()]
+    vec![crc32(), sha1(), sha2()]
 }
diff --git a/datafusion/spark/src/function/hash/sha1.rs b/datafusion/spark/src/function/hash/sha1.rs
new file mode 100644
index 0000000000000..605d2a9567c49
--- /dev/null
+++ b/datafusion/spark/src/function/hash/sha1.rs
@@ -0,0 +1,173 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, StringArray};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::cast::{
+    as_binary_array, as_binary_view_array, as_fixed_size_binary_array,
+    as_large_binary_array,
+};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use sha1::{Digest, Sha1};
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#sha1>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSha1 {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkSha1 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSha1 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_implicit(
+                    TypeSignatureClass::Binary,
+                    vec![TypeSignatureClass::Native(logical_string())],
+                    NativeType::Binary,
+                )],
+                Volatility::Immutable,
+            ),
+            aliases: vec!["sha".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSha1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "sha1"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), DataType::Utf8, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_sha1, vec![])(&args.args)
+    }
+}
+
+/// Hex encoding lookup table for fast byte-to-hex conversion
+const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
+
+#[inline]
+fn spark_sha1_digest(value: &[u8]) -> String {
+    let result = Sha1::digest(value);
+    let mut s = String::with_capacity(result.len() * 2);
+    for &b in result.as_slice() {
+        s.push(HEX_CHARS_LOWER[(b >> 4) as usize] as char);
+        s.push(HEX_CHARS_LOWER[(b & 0x0f) as usize] as char);
+    }
+    s
+}
+
+fn spark_sha1_impl<'a>(input: impl Iterator<Item = Option<&'a [u8]>>) -> ArrayRef {
+    let result = input
+        .map(|value| value.map(spark_sha1_digest))
+        .collect::<StringArray>();
+    Arc::new(result)
+}
+
+fn spark_sha1(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = take_function_args("sha1", args)?;
+
+    match input.data_type() {
+        DataType::Null => Ok(Arc::new(StringArray::new_null(input.len()))),
+        DataType::Binary => {
+            let input = as_binary_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        DataType::LargeBinary => {
+            let input = as_large_binary_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        DataType::BinaryView => {
+            let input = as_binary_view_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        DataType::FixedSizeBinary(_) => {
+            let input = as_fixed_size_binary_array(input)?;
+            Ok(spark_sha1_impl(input.iter()))
+        }
+        dt => {
+            internal_err!("Unsupported data type for sha1: {dt}")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_sha1_nullability() -> Result<()> {
+        let func = SparkSha1::new();
+
+        // Non-nullable input keeps output non-nullable
+        let non_nullable: FieldRef = Arc::new(Field::new("col", DataType::Binary, false));
+        let out = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&non_nullable)],
+            scalar_arguments: &[None],
+        })?;
+        assert!(!out.is_nullable());
+        assert_eq!(out.data_type(), &DataType::Utf8);
+
+        // Nullable input makes output nullable
+        let nullable: FieldRef = Arc::new(Field::new("col", DataType::Binary, true));
+        let out = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[Arc::clone(&nullable)],
+            scalar_arguments: &[None],
+        })?;
+        assert!(out.is_nullable());
+        assert_eq!(out.data_type(), &DataType::Utf8);
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/hash/sha2.rs b/datafusion/spark/src/function/hash/sha2.rs
index b4b29ef33478e..3fa41aba71b5b 100644
--- a/datafusion/spark/src/function/hash/sha2.rs
+++ b/datafusion/spark/src/function/hash/sha2.rs
@@ -15,26 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-extern crate datafusion_functions;
-
-use crate::function::error_utils::{
-    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+use arrow::array::{ArrayRef, AsArray, BinaryArrayType, Int32Array, StringArray};
+use arrow::datatypes::{DataType, Int32Type};
+use datafusion_common::types::{
+    NativeType, logical_binary, logical_int32, logical_string,
+};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
 };
-use crate::function::math::hex::spark_hex;
-use arrow::array::{ArrayRef, AsArray, StringArray};
-use arrow::datatypes::{DataType, UInt32Type};
-use datafusion_common::{exec_err, internal_datafusion_err, Result, ScalarValue};
-use datafusion_expr::Signature;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Volatility};
-pub use datafusion_functions::crypto::basic::{sha224, sha256, sha384, sha512};
+use datafusion_functions::utils::make_scalar_function;
+use sha2::{self, Digest};
 use std::any::Any;
 use std::sync::Arc;
 
+/// Differs from DataFusion version in allowing array input for bit lengths, and
+/// also hex encoding the output.
+///
 /// <https://spark.apache.org/docs/latest/api/sql/index.html#sha2>
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SparkSha2 {
     signature: Signature,
-    aliases: Vec<String>,
 }
 
 impl Default for SparkSha2 {
@@ -46,8 +49,21 @@ impl Default for SparkSha2 {
 impl SparkSha2 {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
+            signature: Signature::coercible(
+                vec![
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_binary()),
+                        vec![TypeSignatureClass::Native(logical_string())],
+                        NativeType::Binary,
+                    ),
+                    Coercion::new_implicit(
+                        TypeSignatureClass::Native(logical_int32()),
+                        vec![TypeSignatureClass::Integer],
+                        NativeType::Int32,
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
         }
     }
 }
@@ -65,153 +81,188 @@ impl ScalarUDFImpl for SparkSha2 {
         &self.signature
     }
 
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types[1].is_null() {
-            return Ok(DataType::Null);
-        }
-        Ok(match arg_types[0] {
-            DataType::Utf8View
-            | DataType::LargeUtf8
-            | DataType::Utf8
-            | DataType::Binary
-            | DataType::BinaryView
-            | DataType::LargeBinary => DataType::Utf8,
-            DataType::Null => DataType::Null,
-            _ => {
-                return exec_err!(
-                    "{} function can only accept strings or binary arrays.",
-                    self.name()
-                )
-            }
-        })
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        let args: [ColumnarValue; 2] = args.args.try_into().map_err(|_| {
-            internal_datafusion_err!("Expected 2 arguments for function sha2")
-        })?;
+        let [values, bit_lengths] = take_function_args(self.name(), args.args.iter())?;
 
-        sha2(args)
-    }
+        match (values, bit_lengths) {
+            (
+                ColumnarValue::Scalar(value_scalar),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(bit_length))),
+            ) => {
+                if value_scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));
+                }
 
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
+                // Accept both Binary and Utf8 scalars (depending on coercion)
+                let bytes = match value_scalar {
+                    ScalarValue::Binary(Some(b)) => b.as_slice(),
+                    ScalarValue::LargeBinary(Some(b)) => b.as_slice(),
+                    ScalarValue::BinaryView(Some(b)) => b.as_slice(),
+                    ScalarValue::Utf8(Some(s))
+                    | ScalarValue::LargeUtf8(Some(s))
+                    | ScalarValue::Utf8View(Some(s)) => s.as_bytes(),
+                    other => {
+                        return internal_err!(
+                            "Unsupported scalar datatype for sha2: {}",
+                            other.data_type()
+                        );
+                    }
+                };
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() != 2 {
-            return Err(invalid_arg_count_exec_err(
-                self.name(),
-                (2, 2),
-                arg_types.len(),
-            ));
+                let out = match bit_length {
+                    224 => {
+                        let mut digest = sha2::Sha224::default();
+                        digest.update(bytes);
+                        Some(hex_encode(digest.finalize()))
+                    }
+                    0 | 256 => {
+                        let mut digest = sha2::Sha256::default();
+                        digest.update(bytes);
+                        Some(hex_encode(digest.finalize()))
+                    }
+                    384 => {
+                        let mut digest = sha2::Sha384::default();
+                        digest.update(bytes);
+                        Some(hex_encode(digest.finalize()))
+                    }
+                    512 => {
+                        let mut digest = sha2::Sha512::default();
+                        digest.update(bytes);
+                        Some(hex_encode(digest.finalize()))
+                    }
+                    _ => None,
+                };
+
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(out)))
+            }
+            // Array values + scalar bit length (common case: sha2(col, 256))
+            (
+                ColumnarValue::Array(values_array),
+                ColumnarValue::Scalar(ScalarValue::Int32(Some(bit_length))),
+            ) => {
+                let output: ArrayRef = match values_array.data_type() {
+                    DataType::Binary => sha2_binary_scalar_bitlen(
+                        &values_array.as_binary::<i32>(),
+                        *bit_length,
+                    ),
+                    DataType::LargeBinary => sha2_binary_scalar_bitlen(
+                        &values_array.as_binary::<i64>(),
+                        *bit_length,
+                    ),
+                    DataType::BinaryView => sha2_binary_scalar_bitlen(
+                        &values_array.as_binary_view(),
+                        *bit_length,
+                    ),
+                    dt => return internal_err!("Unsupported datatype for sha2: {dt}"),
+                };
+                Ok(ColumnarValue::Array(output))
+            }
+            (
+                ColumnarValue::Scalar(_),
+                ColumnarValue::Scalar(ScalarValue::Int32(None)),
+            ) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
+            (
+                ColumnarValue::Array(_),
+                ColumnarValue::Scalar(ScalarValue::Int32(None)),
+            ) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
+            _ => {
+                // Fallback to existing behavior for any array/mixed cases
+                make_scalar_function(sha2_impl, vec![])(&args.args)
+            }
         }
-        let expr_type = match &arg_types[0] {
-            DataType::Utf8View
-            | DataType::LargeUtf8
-            | DataType::Utf8
-            | DataType::Binary
-            | DataType::BinaryView
-            | DataType::LargeBinary
-            | DataType::Null => Ok(arg_types[0].clone()),
-            _ => Err(unsupported_data_type_exec_err(
-                self.name(),
-                "String, Binary",
-                &arg_types[0],
-            )),
-        }?;
-        let bit_length_type = if arg_types[1].is_numeric() {
-            Ok(DataType::UInt32)
-        } else if arg_types[1].is_null() {
-            Ok(DataType::Null)
-        } else {
-            Err(unsupported_data_type_exec_err(
-                self.name(),
-                "Numeric Type",
-                &arg_types[1],
-            ))
-        }?;
-
-        Ok(vec![expr_type, bit_length_type])
     }
 }
 
-pub fn sha2(args: [ColumnarValue; 2]) -> Result<ColumnarValue> {
-    match args {
-        [ColumnarValue::Scalar(ScalarValue::Utf8(expr_arg)), ColumnarValue::Scalar(ScalarValue::UInt32(Some(bit_length_arg)))] => {
-            match bit_length_arg {
-                0 | 256 => sha256(&[ColumnarValue::from(ScalarValue::Utf8(expr_arg))]),
-                224 => sha224(&[ColumnarValue::from(ScalarValue::Utf8(expr_arg))]),
-                384 => sha384(&[ColumnarValue::from(ScalarValue::Utf8(expr_arg))]),
-                512 => sha512(&[ColumnarValue::from(ScalarValue::Utf8(expr_arg))]),
-                _ => exec_err!(
-                    "sha2 function only supports 224, 256, 384, and 512 bit lengths."
-                ),
-            }
-            .map(|hashed| spark_hex(&[hashed]).unwrap())
+fn sha2_impl(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [values, bit_lengths] = take_function_args("sha2", args)?;
+
+    let bit_lengths = bit_lengths.as_primitive::<Int32Type>();
+    let output = match values.data_type() {
+        DataType::Binary => sha2_binary_impl(&values.as_binary::<i32>(), bit_lengths),
+        DataType::LargeBinary => {
+            sha2_binary_impl(&values.as_binary::<i64>(), bit_lengths)
         }
-        [ColumnarValue::Array(expr_arg), ColumnarValue::Scalar(ScalarValue::UInt32(Some(bit_length_arg)))] => {
-            match bit_length_arg {
-                0 | 256 => sha256(&[ColumnarValue::from(expr_arg)]),
-                224 => sha224(&[ColumnarValue::from(expr_arg)]),
-                384 => sha384(&[ColumnarValue::from(expr_arg)]),
-                512 => sha512(&[ColumnarValue::from(expr_arg)]),
-                _ => exec_err!(
-                    "sha2 function only supports 224, 256, 384, and 512 bit lengths."
-                ),
+        DataType::BinaryView => sha2_binary_impl(&values.as_binary_view(), bit_lengths),
+        dt => return internal_err!("Unsupported datatype for sha2: {dt}"),
+    };
+    Ok(output)
+}
+
+fn sha2_binary_impl<'a, BinaryArrType>(
+    values: &BinaryArrType,
+    bit_lengths: &Int32Array,
+) -> ArrayRef
+where
+    BinaryArrType: BinaryArrayType<'a>,
+{
+    sha2_binary_bitlen_iter(values, bit_lengths.iter())
+}
+
+fn sha2_binary_scalar_bitlen<'a, BinaryArrType>(
+    values: &BinaryArrType,
+    bit_length: i32,
+) -> ArrayRef
+where
+    BinaryArrType: BinaryArrayType<'a>,
+{
+    sha2_binary_bitlen_iter(values, std::iter::repeat(Some(bit_length)))
+}
+
+fn sha2_binary_bitlen_iter<'a, BinaryArrType, I>(
+    values: &BinaryArrType,
+    bit_lengths: I,
+) -> ArrayRef
+where
+    BinaryArrType: BinaryArrayType<'a>,
+    I: Iterator<Item = Option<i32>>,
+{
+    let array = values
+        .iter()
+        .zip(bit_lengths)
+        .map(|(value, bit_length)| match (value, bit_length) {
+            (Some(value), Some(224)) => {
+                let mut digest = sha2::Sha224::default();
+                digest.update(value);
+                Some(hex_encode(digest.finalize()))
             }
-            .map(|hashed| spark_hex(&[hashed]).unwrap())
-        }
-        [ColumnarValue::Scalar(ScalarValue::Utf8(expr_arg)), ColumnarValue::Array(bit_length_arg)] =>
-        {
-            let arr: StringArray = bit_length_arg
-                .as_primitive::<UInt32Type>()
-                .iter()
-                .map(|bit_length| {
-                    match sha2([
-                        ColumnarValue::Scalar(ScalarValue::Utf8(expr_arg.clone())),
-                        ColumnarValue::Scalar(ScalarValue::UInt32(bit_length)),
-                    ])
-                    .unwrap()
-                    {
-                        ColumnarValue::Scalar(ScalarValue::Utf8(str)) => str,
-                        ColumnarValue::Array(arr) => arr
-                            .as_string::<i32>()
-                            .iter()
-                            .map(|str| str.unwrap().to_string())
-                            .next(), // first element
-                        _ => unreachable!(),
-                    }
-                })
-                .collect();
-            Ok(ColumnarValue::Array(Arc::new(arr) as ArrayRef))
-        }
-        [ColumnarValue::Array(expr_arg), ColumnarValue::Array(bit_length_arg)] => {
-            let expr_iter = expr_arg.as_string::<i32>().iter();
-            let bit_length_iter = bit_length_arg.as_primitive::<UInt32Type>().iter();
-            let arr: StringArray = expr_iter
-                .zip(bit_length_iter)
-                .map(|(expr, bit_length)| {
-                    match sha2([
-                        ColumnarValue::Scalar(ScalarValue::Utf8(Some(
-                            expr.unwrap().to_string(),
-                        ))),
-                        ColumnarValue::Scalar(ScalarValue::UInt32(bit_length)),
-                    ])
-                    .unwrap()
-                    {
-                        ColumnarValue::Scalar(ScalarValue::Utf8(str)) => str,
-                        ColumnarValue::Array(arr) => arr
-                            .as_string::<i32>()
-                            .iter()
-                            .map(|str| str.unwrap().to_string())
-                            .next(), // first element
-                        _ => unreachable!(),
-                    }
-                })
-                .collect();
-            Ok(ColumnarValue::Array(Arc::new(arr) as ArrayRef))
-        }
-        _ => exec_err!("Unsupported argument types for sha2 function"),
+            (Some(value), Some(0 | 256)) => {
+                let mut digest = sha2::Sha256::default();
+                digest.update(value);
+                Some(hex_encode(digest.finalize()))
+            }
+            (Some(value), Some(384)) => {
+                let mut digest = sha2::Sha384::default();
+                digest.update(value);
+                Some(hex_encode(digest.finalize()))
+            }
+            (Some(value), Some(512)) => {
+                let mut digest = sha2::Sha512::default();
+                digest.update(value);
+                Some(hex_encode(digest.finalize()))
+            }
+            // Unknown bit-lengths go to null, same as in Spark
+            _ => None,
+        })
+        .collect::<StringArray>();
+    Arc::new(array)
+}
+
+const HEX_CHARS: [u8; 16] = *b"0123456789abcdef";
+
+#[inline]
+fn hex_encode<T: AsRef<[u8]>>(data: T) -> String {
+    let bytes = data.as_ref();
+    let mut out = Vec::with_capacity(bytes.len() * 2);
+    for &b in bytes {
+        let hi = b >> 4;
+        let lo = b & 0x0F;
+        out.push(HEX_CHARS[hi as usize]);
+        out.push(HEX_CHARS[lo as usize]);
     }
+    // SAFETY: out contains only ASCII
+    unsafe { String::from_utf8_unchecked(out) }
 }
diff --git a/datafusion/spark/src/function/json/json_tuple.rs b/datafusion/spark/src/function/json/json_tuple.rs
new file mode 100644
index 0000000000000..f3ba7e91ac3da
--- /dev/null
+++ b/datafusion/spark/src/function/json/json_tuple.rs
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{Array, ArrayRef, NullBufferBuilder, StringBuilder, StructArray};
+use arrow::datatypes::{DataType, Field, FieldRef, Fields};
+use datafusion_common::cast::as_string_array;
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+
+/// Spark-compatible `json_tuple` expression
+///
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#json_tuple>
+///
+/// Extracts top-level fields from a JSON string and returns them as a struct.
+///
+/// `json_tuple(json_string, field1, field2, ...) -> Struct<c0: Utf8, c1: Utf8, ...>`
+///
+/// Note: In Spark, `json_tuple` is a Generator that produces multiple columns directly.
+/// In DataFusion, a ScalarUDF can only return one value per row, so the result is wrapped
+/// in a Struct. The caller (e.g. Comet) is expected to destructure the struct fields.
+///
+/// - Returns NULL for each field that is missing from the JSON object
+/// - Returns NULL for all fields if the input is NULL or not valid JSON
+/// - Non-string JSON values are converted to their JSON string representation
+/// - JSON `null` values are returned as NULL (not the string "null")
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct JsonTuple {
+    signature: Signature,
+}
+
+impl Default for JsonTuple {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl JsonTuple {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic(vec![DataType::Utf8], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for JsonTuple {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "json_tuple"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        if args.arg_fields.len() < 2 {
+            return exec_err!(
+                "json_tuple requires at least 2 arguments (json_string, field1), got {}",
+                args.arg_fields.len()
+            );
+        }
+
+        let num_fields = args.arg_fields.len() - 1;
+        let fields: Fields = (0..num_fields)
+            .map(|i| Field::new(format!("c{i}"), DataType::Utf8, true))
+            .collect::<Vec<_>>()
+            .into();
+
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Struct(fields),
+            true,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs {
+            args: arg_values,
+            return_field,
+            ..
+        } = args;
+        let arrays = ColumnarValue::values_to_arrays(&arg_values)?;
+        let result = json_tuple_inner(&arrays, return_field.data_type())?;
+
+        Ok(ColumnarValue::Array(result))
+    }
+}
+
+fn json_tuple_inner(args: &[ArrayRef], return_type: &DataType) -> Result<ArrayRef> {
+    let num_rows = args[0].len();
+    let num_fields = args.len() - 1;
+
+    let json_array = as_string_array(&args[0])?;
+
+    let field_arrays = args[1..]
+        .iter()
+        .map(|arg| as_string_array(arg))
+        .collect::<Result<Vec<_>>>()?;
+
+    let mut builders: Vec<StringBuilder> =
+        (0..num_fields).map(|_| StringBuilder::new()).collect();
+
+    let mut null_buffer = NullBufferBuilder::new(num_rows);
+
+    for row_idx in 0..num_rows {
+        if json_array.is_null(row_idx) {
+            for builder in &mut builders {
+                builder.append_null();
+            }
+            null_buffer.append_null();
+            continue;
+        }
+
+        let json_str = json_array.value(row_idx);
+        match serde_json::from_str::<serde_json::Value>(json_str) {
+            Ok(serde_json::Value::Object(map)) => {
+                null_buffer.append_non_null();
+                for (field_idx, builder) in builders.iter_mut().enumerate() {
+                    if field_arrays[field_idx].is_null(row_idx) {
+                        builder.append_null();
+                        continue;
+                    }
+                    let field_name = field_arrays[field_idx].value(row_idx);
+                    match map.get(field_name) {
+                        Some(serde_json::Value::Null) => {
+                            builder.append_null();
+                        }
+                        Some(serde_json::Value::String(s)) => {
+                            builder.append_value(s);
+                        }
+                        Some(other) => {
+                            builder.append_value(other.to_string());
+                        }
+                        None => {
+                            builder.append_null();
+                        }
+                    }
+                }
+            }
+            _ => {
+                for builder in &mut builders {
+                    builder.append_null();
+                }
+                null_buffer.append_null();
+            }
+        }
+    }
+
+    let struct_fields = match return_type {
+        DataType::Struct(fields) => fields.clone(),
+        _ => {
+            return internal_err!(
+                "json_tuple requires a Struct return type, got {:?}",
+                return_type
+            );
+        }
+    };
+
+    let arrays: Vec<ArrayRef> = builders
+        .into_iter()
+        .map(|mut builder| Arc::new(builder.finish()) as ArrayRef)
+        .collect();
+
+    let struct_array = StructArray::try_new(struct_fields, arrays, null_buffer.finish())?;
+
+    Ok(Arc::new(struct_array))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_return_field_shape() {
+        let func = JsonTuple::new();
+        let fields = vec![
+            Arc::new(Field::new("json", DataType::Utf8, false)),
+            Arc::new(Field::new("f1", DataType::Utf8, false)),
+            Arc::new(Field::new("f2", DataType::Utf8, false)),
+        ];
+        let result = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &fields,
+                scalar_arguments: &[None, None, None],
+            })
+            .unwrap();
+
+        match result.data_type() {
+            DataType::Struct(inner) => {
+                assert_eq!(inner.len(), 2);
+                assert_eq!(inner[0].name(), "c0");
+                assert_eq!(inner[1].name(), "c1");
+                assert_eq!(inner[0].data_type(), &DataType::Utf8);
+                assert!(inner[0].is_nullable());
+            }
+            other => panic!("Expected Struct, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_too_few_args() {
+        let func = JsonTuple::new();
+        let fields = vec![Arc::new(Field::new("json", DataType::Utf8, false))];
+        let result = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &fields,
+            scalar_arguments: &[None],
+        });
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("at least 2 arguments")
+        );
+    }
+}
diff --git a/datafusion/spark/src/function/json/mod.rs b/datafusion/spark/src/function/json/mod.rs
index a87df9a2c87a0..01378235d7c64 100644
--- a/datafusion/spark/src/function/json/mod.rs
+++ b/datafusion/spark/src/function/json/mod.rs
@@ -15,11 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod json_tuple;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(json_tuple::JsonTuple, json_tuple);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        json_tuple,
+        "Extracts top-level fields from a JSON string and returns them as a struct.",
+        args,
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![json_tuple()]
 }
diff --git a/datafusion/spark/src/function/map/map_from_arrays.rs b/datafusion/spark/src/function/map/map_from_arrays.rs
new file mode 100644
index 0000000000000..5aba42feb96f5
--- /dev/null
+++ b/datafusion/spark/src/function/map/map_from_arrays.rs
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use crate::function::map::utils::{
+    get_element_type, get_list_offsets, get_list_values,
+    map_from_keys_values_offsets_nulls, map_type_from_key_value_types,
+};
+use arrow::array::{Array, ArrayRef, NullArray};
+use arrow::compute::kernels::cast;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use std::sync::Arc;
+
+/// Spark-compatible `map_from_arrays` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#map_from_arrays>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct MapFromArrays {
+    signature: Signature,
+}
+
+impl Default for MapFromArrays {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MapFromArrays {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::any(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for MapFromArrays {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "map_from_arrays"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let [keys_field, values_field] = args.arg_fields else {
+            return internal_err!("map_from_arrays expects exactly 2 arguments");
+        };
+
+        let map_type = map_type_from_key_value_types(
+            get_element_type(keys_field.data_type())?,
+            get_element_type(values_field.data_type())?,
+        );
+        // Spark marks map_from_arrays as null intolerant, so the output is
+        // nullable if either input is nullable.
+        let nullable = keys_field.is_nullable() || values_field.is_nullable();
+        Ok(Arc::new(Field::new(self.name(), map_type, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(map_from_arrays_inner, vec![])(&args.args)
+    }
+}
+
+fn map_from_arrays_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [keys, values] = take_function_args("map_from_arrays", args)?;
+
+    if *keys.data_type() == DataType::Null || *values.data_type() == DataType::Null {
+        return Ok(cast(
+            &NullArray::new(keys.len()),
+            &map_type_from_key_value_types(
+                get_element_type(keys.data_type())?,
+                get_element_type(values.data_type())?,
+            ),
+        )?);
+    }
+
+    map_from_keys_values_offsets_nulls(
+        get_list_values(keys)?,
+        get_list_values(values)?,
+        &get_list_offsets(keys)?,
+        &get_list_offsets(values)?,
+        keys.nulls(),
+        values.nulls(),
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Field;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_map_from_arrays_nullability_and_type() {
+        let func = MapFromArrays::new();
+
+        let keys_field: FieldRef = Arc::new(Field::new(
+            "keys",
+            DataType::List(Arc::new(Field::new("item", DataType::Int32, false))),
+            false,
+        ));
+        let values_field: FieldRef = Arc::new(Field::new(
+            "values",
+            DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
+            false,
+        ));
+
+        let out = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&keys_field), Arc::clone(&values_field)],
+                scalar_arguments: &[None, None],
+            })
+            .expect("return_field_from_args should succeed");
+
+        let expected_type =
+            map_type_from_key_value_types(&DataType::Int32, &DataType::Utf8);
+        assert_eq!(out.data_type(), &expected_type);
+        assert!(
+            !out.is_nullable(),
+            "map_from_arrays should be non-nullable when both inputs are non-nullable"
+        );
+
+        let nullable_keys: FieldRef = Arc::new(Field::new(
+            "keys",
+            DataType::List(Arc::new(Field::new("item", DataType::Int32, false))),
+            true,
+        ));
+
+        let out_nullable = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[nullable_keys, values_field],
+                scalar_arguments: &[None, None],
+            })
+            .expect("return_field_from_args should succeed");
+
+        assert!(
+            out_nullable.is_nullable(),
+            "map_from_arrays should be nullable when any input is nullable"
+        );
+    }
+}
diff --git a/datafusion/spark/src/function/map/map_from_entries.rs b/datafusion/spark/src/function/map/map_from_entries.rs
new file mode 100644
index 0000000000000..364f1978306d0
--- /dev/null
+++ b/datafusion/spark/src/function/map/map_from_entries.rs
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use crate::function::map::utils::{
+    get_list_offsets, get_list_values, map_from_keys_values_offsets_nulls,
+    map_type_from_key_value_types,
+};
+use arrow::array::{Array, ArrayRef, NullBufferBuilder, StructArray};
+use arrow::buffer::NullBuffer;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// Spark-compatible `map_from_entries` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#map_from_entries>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct MapFromEntries {
+    signature: Signature,
+}
+
+impl Default for MapFromEntries {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MapFromEntries {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for MapFromEntries {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "map_from_entries"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let [entries_field] = args.arg_fields else {
+            return exec_err!("map_from_entries: expected one argument");
+        };
+
+        let (entries_element_field, entries_element_type) =
+            match entries_field.data_type() {
+                DataType::List(field)
+                | DataType::LargeList(field)
+                | DataType::FixedSizeList(field, _) => {
+                    Ok((field.as_ref(), field.data_type()))
+                }
+                wrong_type => exec_err!(
+                    "map_from_entries: expected array<struct<key, value>>, got {:?}",
+                    wrong_type
+                ),
+            }?;
+
+        let (keys_type, values_type) = match entries_element_type {
+            DataType::Struct(fields) if fields.len() == 2 => {
+                Ok((fields[0].data_type(), fields[1].data_type()))
+            }
+            wrong_type => exec_err!(
+                "map_from_entries: expected array<struct<key, value>>, got {:?}",
+                wrong_type
+            ),
+        }?;
+
+        let map_type = map_type_from_key_value_types(keys_type, values_type);
+        let nullable = entries_field.is_nullable() || entries_element_field.is_nullable();
+
+        Ok(Arc::new(Field::new(self.name(), map_type, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(map_from_entries_inner, vec![])(&args.args)
+    }
+}
+
+fn map_from_entries_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [entries] = take_function_args("map_from_entries", args)?;
+    let entries_offsets = get_list_offsets(entries)?;
+    let entries_values = get_list_values(entries)?;
+
+    let (flat_keys, flat_values) =
+        match entries_values.as_any().downcast_ref::<StructArray>() {
+            Some(a) => Ok((a.column(0), a.column(1))),
+            None => exec_err!(
+                "map_from_entries: expected array<struct<key, value>>, got {:?}",
+                entries_values.data_type()
+            ),
+        }?;
+
+    let entries_with_nulls = entries_values.nulls().and_then(|entries_inner_nulls| {
+        let mut builder = NullBufferBuilder::new_with_len(0);
+        let mut cur_offset = entries_offsets
+            .first()
+            .map(|offset| *offset as usize)
+            .unwrap_or(0);
+
+        for next_offset in entries_offsets.iter().skip(1) {
+            let num_entries = *next_offset as usize - cur_offset;
+            builder.append(
+                entries_inner_nulls
+                    .slice(cur_offset, num_entries)
+                    .null_count()
+                    == 0,
+            );
+            cur_offset = *next_offset as usize;
+        }
+        builder.finish()
+    });
+
+    let res_nulls = NullBuffer::union(entries.nulls(), entries_with_nulls.as_ref());
+
+    map_from_keys_values_offsets_nulls(
+        flat_keys,
+        flat_values,
+        &entries_offsets,
+        &entries_offsets,
+        None,
+        res_nulls.as_ref(),
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::Fields;
+    use datafusion_expr::ReturnFieldArgs;
+
+    fn make_entries_field(array_nullable: bool, element_nullable: bool) -> FieldRef {
+        let struct_type = DataType::Struct(Fields::from(vec![
+            Field::new("key", DataType::Int32, false),
+            Field::new("value", DataType::Utf8, true),
+        ]));
+        Arc::new(Field::new(
+            "entries",
+            DataType::List(Arc::new(Field::new("item", struct_type, element_nullable))),
+            array_nullable,
+        ))
+    }
+
+    #[test]
+    fn test_map_from_entries_nullability_matches_input() {
+        let func = MapFromEntries::new();
+        let expected_type =
+            map_type_from_key_value_types(&DataType::Int32, &DataType::Utf8);
+
+        // Non-nullable array and elements => non-nullable result
+        let non_nullable_field = make_entries_field(false, false);
+        let result = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .expect("should infer field");
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &expected_type);
+
+        // Nullable elements should make result nullable even if array is non-nullable
+        let element_nullable_field = make_entries_field(false, true);
+        let result = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&element_nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .expect("should infer field");
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &expected_type);
+
+        // Nullable array should also yield nullable result
+        let array_nullable_field = make_entries_field(true, false);
+        let result = func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&array_nullable_field)],
+                scalar_arguments: &[None],
+            })
+            .expect("should infer field");
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &expected_type);
+    }
+}
diff --git a/datafusion/spark/src/function/map/mod.rs b/datafusion/spark/src/function/map/mod.rs
index a87df9a2c87a0..c9ebed6f612e1 100644
--- a/datafusion/spark/src/function/map/mod.rs
+++ b/datafusion/spark/src/function/map/mod.rs
@@ -15,11 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod map_from_arrays;
+pub mod map_from_entries;
+pub mod str_to_map;
+mod utils;
+
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+make_udf_function!(map_from_arrays::MapFromArrays, map_from_arrays);
+make_udf_function!(map_from_entries::MapFromEntries, map_from_entries);
+make_udf_function!(str_to_map::SparkStrToMap, str_to_map);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        map_from_arrays,
+        "Creates a map from arrays of keys and values.",
+        keys values
+    ));
+
+    export_functions!((
+        map_from_entries,
+        "Creates a map from array<struct<key, value>>.",
+        arg1
+    ));
+
+    export_functions!((
+        str_to_map,
+        "Creates a map after splitting the text into key/value pairs using delimiters.",
+        text pair_delim key_value_delim
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![map_from_arrays(), map_from_entries(), str_to_map()]
 }
diff --git a/datafusion/spark/src/function/map/str_to_map.rs b/datafusion/spark/src/function/map/str_to_map.rs
new file mode 100644
index 0000000000000..b722fb7abd6b2
--- /dev/null
+++ b/datafusion/spark/src/function/map/str_to_map.rs
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, MapBuilder, MapFieldNames, StringArrayType, StringBuilder,
+};
+use arrow::buffer::NullBuffer;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, Volatility,
+};
+
+use crate::function::map::utils::map_type_from_key_value_types;
+
+const DEFAULT_PAIR_DELIM: &str = ",";
+const DEFAULT_KV_DELIM: &str = ":";
+
+/// Spark-compatible `str_to_map` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#str_to_map>
+///
+/// Creates a map from a string by splitting on delimiters.
+/// str_to_map(text[, pairDelim[, keyValueDelim]]) -> Map<String, String>
+///
+/// - text: The input string
+/// - pairDelim: Delimiter between key-value pairs (default: ',')
+/// - keyValueDelim: Delimiter between key and value (default: ':')
+///
+/// # Duplicate Key Handling
+/// Uses EXCEPTION behavior (Spark 3.0+ default): errors on duplicate keys.
+/// See `spark.sql.mapKeyDedupPolicy`:
+/// <https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4502-L4511>
+///
+/// TODO: Support configurable `spark.sql.mapKeyDedupPolicy` (LAST_WIN) in a follow-up PR.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkStrToMap {
+    signature: Signature,
+}
+
+impl Default for SparkStrToMap {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkStrToMap {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    // str_to_map(text)
+                    TypeSignature::String(1),
+                    // str_to_map(text, pairDelim)
+                    TypeSignature::String(2),
+                    // str_to_map(text, pairDelim, keyValueDelim)
+                    TypeSignature::String(3),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkStrToMap {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "str_to_map"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        let map_type = map_type_from_key_value_types(&DataType::Utf8, &DataType::Utf8);
+        Ok(Arc::new(Field::new(self.name(), map_type, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let arrays: Vec<ArrayRef> = ColumnarValue::values_to_arrays(&args.args)?;
+        let result = str_to_map_inner(&arrays)?;
+        Ok(ColumnarValue::Array(result))
+    }
+}
+
+fn str_to_map_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args.len() {
+        1 => match args[0].data_type() {
+            DataType::Utf8 => str_to_map_impl(as_string_array(&args[0])?, None, None),
+            DataType::LargeUtf8 => {
+                str_to_map_impl(as_large_string_array(&args[0])?, None, None)
+            }
+            DataType::Utf8View => {
+                str_to_map_impl(as_string_view_array(&args[0])?, None, None)
+            }
+            other => exec_err!(
+                "Unsupported data type {other:?} for str_to_map, \
+                expected Utf8, LargeUtf8, or Utf8View"
+            ),
+        },
+        2 => match (args[0].data_type(), args[1].data_type()) {
+            (DataType::Utf8, DataType::Utf8) => str_to_map_impl(
+                as_string_array(&args[0])?,
+                Some(as_string_array(&args[1])?),
+                None,
+            ),
+            (DataType::LargeUtf8, DataType::LargeUtf8) => str_to_map_impl(
+                as_large_string_array(&args[0])?,
+                Some(as_large_string_array(&args[1])?),
+                None,
+            ),
+            (DataType::Utf8View, DataType::Utf8View) => str_to_map_impl(
+                as_string_view_array(&args[0])?,
+                Some(as_string_view_array(&args[1])?),
+                None,
+            ),
+            (t1, t2) => exec_err!(
+                "Unsupported data types ({t1:?}, {t2:?}) for str_to_map, \
+                expected matching Utf8, LargeUtf8, or Utf8View"
+            ),
+        },
+        3 => match (
+            args[0].data_type(),
+            args[1].data_type(),
+            args[2].data_type(),
+        ) {
+            (DataType::Utf8, DataType::Utf8, DataType::Utf8) => str_to_map_impl(
+                as_string_array(&args[0])?,
+                Some(as_string_array(&args[1])?),
+                Some(as_string_array(&args[2])?),
+            ),
+            (DataType::LargeUtf8, DataType::LargeUtf8, DataType::LargeUtf8) => {
+                str_to_map_impl(
+                    as_large_string_array(&args[0])?,
+                    Some(as_large_string_array(&args[1])?),
+                    Some(as_large_string_array(&args[2])?),
+                )
+            }
+            (DataType::Utf8View, DataType::Utf8View, DataType::Utf8View) => {
+                str_to_map_impl(
+                    as_string_view_array(&args[0])?,
+                    Some(as_string_view_array(&args[1])?),
+                    Some(as_string_view_array(&args[2])?),
+                )
+            }
+            (t1, t2, t3) => exec_err!(
+                "Unsupported data types ({t1:?}, {t2:?}, {t3:?}) for str_to_map, \
+                expected matching Utf8, LargeUtf8, or Utf8View"
+            ),
+        },
+        n => exec_err!("str_to_map expects 1-3 arguments, got {n}"),
+    }
+}
+
+fn str_to_map_impl<'a, V: StringArrayType<'a> + Copy>(
+    text_array: V,
+    pair_delim_array: Option<V>,
+    kv_delim_array: Option<V>,
+) -> Result<ArrayRef> {
+    let num_rows = text_array.len();
+
+    // Precompute combined null buffer from all input arrays.
+    // NullBuffer::union performs a bitmap-level AND, which is more efficient
+    // than checking per-row nullability inline.
+    let text_nulls = text_array.nulls().cloned();
+    let pair_nulls = pair_delim_array.and_then(|a| a.nulls().cloned());
+    let kv_nulls = kv_delim_array.and_then(|a| a.nulls().cloned());
+    let combined_nulls = [text_nulls.as_ref(), pair_nulls.as_ref(), kv_nulls.as_ref()]
+        .into_iter()
+        .fold(None, |acc, nulls| NullBuffer::union(acc.as_ref(), nulls));
+
+    // Use field names matching map_type_from_key_value_types: "key" and "value"
+    let field_names = MapFieldNames {
+        entry: "entries".to_string(),
+        key: "key".to_string(),
+        value: "value".to_string(),
+    };
+    let mut map_builder = MapBuilder::new(
+        Some(field_names),
+        StringBuilder::new(),
+        StringBuilder::new(),
+    );
+
+    let mut seen_keys = HashSet::new();
+    for row_idx in 0..num_rows {
+        if combined_nulls.as_ref().is_some_and(|n| n.is_null(row_idx)) {
+            map_builder.append(false)?;
+            continue;
+        }
+
+        // Per-row delimiter extraction
+        let pair_delim =
+            pair_delim_array.map_or(DEFAULT_PAIR_DELIM, |a| a.value(row_idx));
+        let kv_delim = kv_delim_array.map_or(DEFAULT_KV_DELIM, |a| a.value(row_idx));
+
+        let text = text_array.value(row_idx);
+        if text.is_empty() {
+            // Empty string -> map with empty key and NULL value (Spark behavior)
+            map_builder.keys().append_value("");
+            map_builder.values().append_null();
+            map_builder.append(true)?;
+            continue;
+        }
+
+        seen_keys.clear();
+        for pair in text.split(pair_delim) {
+            if pair.is_empty() {
+                continue;
+            }
+
+            let mut kv_iter = pair.splitn(2, kv_delim);
+            let key = kv_iter.next().unwrap_or("");
+            let value = kv_iter.next();
+
+            // TODO: Support LAST_WIN policy via spark.sql.mapKeyDedupPolicy config
+            // EXCEPTION policy: error on duplicate keys (Spark 3.0+ default)
+            if !seen_keys.insert(key) {
+                return exec_err!(
+                    "Duplicate map key '{key}' was found, please check the input data. \
+                    If you want to remove the duplicated keys, you can set \
+                    spark.sql.mapKeyDedupPolicy to \"LAST_WIN\" so that the key \
+                    inserted at last takes precedence."
+                );
+            }
+
+            map_builder.keys().append_value(key);
+            match value {
+                Some(v) => map_builder.values().append_value(v),
+                None => map_builder.values().append_null(),
+            }
+        }
+        map_builder.append(true)?;
+    }
+
+    Ok(Arc::new(map_builder.finish()))
+}
diff --git a/datafusion/spark/src/function/map/utils.rs b/datafusion/spark/src/function/map/utils.rs
new file mode 100644
index 0000000000000..28fa3227fd628
--- /dev/null
+++ b/datafusion/spark/src/function/map/utils.rs
@@ -0,0 +1,234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::borrow::Cow;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use arrow::array::{Array, ArrayRef, AsArray, BooleanBuilder, MapArray, StructArray};
+use arrow::buffer::{NullBuffer, OffsetBuffer};
+use arrow::compute::filter;
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::{Result, ScalarValue, exec_err};
+
+/// Helper function to get element [`DataType`]
+/// from [`List`](DataType::List)/[`LargeList`](DataType::LargeList)/[`FixedSizeList`](DataType::FixedSizeList)<br>
+/// [`Null`](DataType::Null) can be coerced to `ListType`([`Null`](DataType::Null)), so [`Null`](DataType::Null) is returned<br>
+/// For all other types [`exec_err`] is raised
+pub fn get_element_type(data_type: &DataType) -> Result<&DataType> {
+    match data_type {
+        DataType::Null => Ok(data_type),
+        DataType::List(element)
+        | DataType::LargeList(element)
+        | DataType::FixedSizeList(element, _) => Ok(element.data_type()),
+        _ => exec_err!(
+            "get_element_type expects List/LargeList/FixedSizeList/Null as argument, got {data_type:?}"
+        ),
+    }
+}
+
+/// Helper function to get [`values`](arrow::array::ListArray::values)
+/// from [`ListArray`](arrow::array::ListArray)/[`LargeListArray`](arrow::array::LargeListArray)/[`FixedSizeListArray`](arrow::array::FixedSizeListArray)<br>
+/// [`NullArray`](arrow::array::NullArray) can be coerced to `ListType`([`Null`](DataType::Null)), so [`NullArray`](arrow::array::NullArray) is returned<br>
+/// For all other types [`exec_err`] is raised
+pub fn get_list_values(array: &ArrayRef) -> Result<&ArrayRef> {
+    match array.data_type() {
+        DataType::Null => Ok(array),
+        DataType::List(_) => Ok(array.as_list::<i32>().values()),
+        DataType::LargeList(_) => Ok(array.as_list::<i64>().values()),
+        DataType::FixedSizeList(..) => Ok(array.as_fixed_size_list().values()),
+        wrong_type => exec_err!(
+            "get_list_values expects List/LargeList/FixedSizeList/Null as argument, got {wrong_type:?}"
+        ),
+    }
+}
+
+/// Helper function to get [`offsets`](arrow::array::ListArray::offsets)
+/// from [`ListArray`](arrow::array::ListArray)/[`LargeListArray`](arrow::array::LargeListArray)/[`FixedSizeListArray`](arrow::array::FixedSizeListArray)<br>
+/// For all other types [`exec_err`] is raised
+pub fn get_list_offsets(array: &ArrayRef) -> Result<Cow<'_, [i32]>> {
+    match array.data_type() {
+        DataType::List(_) => Ok(Cow::Borrowed(array.as_list::<i32>().offsets().as_ref())),
+        DataType::LargeList(_) => Ok(Cow::Owned(
+            array
+                .as_list::<i64>()
+                .offsets()
+                .iter()
+                .map(|i| *i as i32)
+                .collect::<Vec<_>>(),
+        )),
+        DataType::FixedSizeList(_, size) => Ok(Cow::Owned(
+            (0..=array.len() as i32).map(|i| size * i).collect(),
+        )),
+        wrong_type => exec_err!(
+            "get_list_offsets expects List/LargeList/FixedSizeList as argument, got {wrong_type:?}"
+        ),
+    }
+}
+
+/// Helper function to construct [`MapType<K, V>`](DataType::Map) given K and V DataTypes for keys and values
+/// - Map keys are unsorted
+/// - Map keys are non-nullable
+/// - Map entries are non-nullable
+/// - Map values can be null
+pub fn map_type_from_key_value_types(
+    key_type: &DataType,
+    value_type: &DataType,
+) -> DataType {
+    DataType::Map(
+        Arc::new(Field::new(
+            "entries",
+            DataType::Struct(Fields::from(vec![
+                // the key must not be nullable
+                Field::new("key", key_type.clone(), false),
+                Field::new("value", value_type.clone(), true),
+            ])),
+            false, // the entry is not nullable
+        )),
+        false, // the keys are not sorted
+    )
+}
+
+/// Helper function to construct MapArray from flattened ListArrays and OffsetBuffer
+///
+/// Logic is close to `datafusion_functions_nested::map::make_map_array_internal`<br>
+/// But there are some core differences:
+/// 1. Input arrays are not [`ListArrays`](arrow::array::ListArray) itself, but their flattened [`values`](arrow::array::ListArray::values)<br>
+///    So the inputs can be [`ListArray`](`arrow::array::ListArray`)/[`LargeListArray`](`arrow::array::LargeListArray`)/[`FixedSizeListArray`](`arrow::array::FixedSizeListArray`)<br>
+///    To preserve the row info, [`offsets`](arrow::array::ListArray::offsets) and [`nulls`](arrow::array::ListArray::nulls) for both keys and values need to be provided<br>
+///    [`FixedSizeListArray`](`arrow::array::FixedSizeListArray`) has no `offsets`, so they can be generated as a cumulative sum of it's `Size`
+/// 2. Spark provides [spark.sql.mapKeyDedupPolicy](https://github.com/apache/spark/blob/cf3a34e19dfcf70e2d679217ff1ba21302212472/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4961)
+///    to handle duplicate keys<br>
+///    For now, configurable functions are not supported by Datafusion<br>
+///    So more permissive `LAST_WIN` option is used in this implementation (instead of `EXCEPTION`)<br>
+///    `EXCEPTION` behaviour can still be achieved externally in cost of performance:<br>
+///    `when(array_length(array_distinct(keys)) == array_length(keys), constructed_map)`<br>
+///    `.otherwise(raise_error("duplicate keys occurred during map construction"))`
+pub fn map_from_keys_values_offsets_nulls(
+    flat_keys: &ArrayRef,
+    flat_values: &ArrayRef,
+    keys_offsets: &[i32],
+    values_offsets: &[i32],
+    keys_nulls: Option<&NullBuffer>,
+    values_nulls: Option<&NullBuffer>,
+) -> Result<ArrayRef> {
+    let (keys, values, offsets) = map_deduplicate_keys(
+        flat_keys,
+        flat_values,
+        keys_offsets,
+        values_offsets,
+        keys_nulls,
+        values_nulls,
+    )?;
+    let nulls = NullBuffer::union(keys_nulls, values_nulls);
+
+    let fields = Fields::from(vec![
+        Field::new("key", flat_keys.data_type().clone(), false),
+        Field::new("value", flat_values.data_type().clone(), true),
+    ]);
+    let entries = StructArray::try_new(fields.clone(), vec![keys, values], None)?;
+    let field = Arc::new(Field::new("entries", DataType::Struct(fields), false));
+    Ok(Arc::new(MapArray::try_new(
+        field, offsets, entries, nulls, false,
+    )?))
+}
+
+fn map_deduplicate_keys(
+    flat_keys: &ArrayRef,
+    flat_values: &ArrayRef,
+    keys_offsets: &[i32],
+    values_offsets: &[i32],
+    keys_nulls: Option<&NullBuffer>,
+    values_nulls: Option<&NullBuffer>,
+) -> Result<(ArrayRef, ArrayRef, OffsetBuffer<i32>)> {
+    let offsets_len = keys_offsets.len();
+    let mut new_offsets = Vec::with_capacity(offsets_len);
+
+    let mut cur_keys_offset = keys_offsets
+        .first()
+        .map(|offset| *offset as usize)
+        .unwrap_or(0);
+    let mut cur_values_offset = values_offsets
+        .first()
+        .map(|offset| *offset as usize)
+        .unwrap_or(0);
+
+    let mut new_last_offset = 0;
+    new_offsets.push(new_last_offset);
+
+    let mut keys_mask_builder = BooleanBuilder::new();
+    let mut values_mask_builder = BooleanBuilder::new();
+    for (row_idx, (next_keys_offset, next_values_offset)) in keys_offsets
+        .iter()
+        .zip(values_offsets.iter())
+        .skip(1)
+        .enumerate()
+    {
+        let num_keys_entries = *next_keys_offset as usize - cur_keys_offset;
+        let num_values_entries = *next_values_offset as usize - cur_values_offset;
+
+        let mut keys_mask_one = vec![false; num_keys_entries];
+        let mut values_mask_one = vec![false; num_values_entries];
+
+        let key_is_valid = keys_nulls.is_none_or(|buf| buf.is_valid(row_idx));
+        let value_is_valid = values_nulls.is_none_or(|buf| buf.is_valid(row_idx));
+
+        if key_is_valid && value_is_valid {
+            if num_keys_entries != num_values_entries {
+                return exec_err!(
+                    "map_deduplicate_keys: keys and values lists in the same row must have equal lengths"
+                );
+            } else if num_keys_entries != 0 {
+                let mut seen_keys = HashSet::new();
+
+                for cur_entry_idx in (0..num_keys_entries).rev() {
+                    let key = ScalarValue::try_from_array(
+                        &flat_keys,
+                        cur_keys_offset + cur_entry_idx,
+                    )?
+                    .compacted();
+                    if seen_keys.contains(&key) {
+                        // TODO: implement configuration and logic for spark.sql.mapKeyDedupPolicy=EXCEPTION (this is default spark-config)
+                        // exec_err!("invalid argument: duplicate keys in map")
+                        // https://github.com/apache/spark/blob/cf3a34e19dfcf70e2d679217ff1ba21302212472/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L4961
+                    } else {
+                        // This code implements deduplication logic for spark.sql.mapKeyDedupPolicy=LAST_WIN (this is NOT default spark-config)
+                        keys_mask_one[cur_entry_idx] = true;
+                        values_mask_one[cur_entry_idx] = true;
+                        seen_keys.insert(key);
+                        new_last_offset += 1;
+                    }
+                }
+            }
+        } else {
+            // the result entry is NULL
+            // both current row offsets are skipped
+            // keys or values in the current row are marked false in the masks
+        }
+        keys_mask_builder.append_array(&keys_mask_one.into());
+        values_mask_builder.append_array(&values_mask_one.into());
+        new_offsets.push(new_last_offset);
+        cur_keys_offset += num_keys_entries;
+        cur_values_offset += num_values_entries;
+    }
+    let keys_mask = keys_mask_builder.finish();
+    let values_mask = values_mask_builder.finish();
+    let needed_keys = filter(&flat_keys, &keys_mask)?;
+    let needed_values = filter(&flat_values, &values_mask)?;
+    let offsets = OffsetBuffer::new(new_offsets.into());
+    Ok((needed_keys, needed_values, offsets))
+}
diff --git a/datafusion/spark/src/function/math/abs.rs b/datafusion/spark/src/function/math/abs.rs
new file mode 100644
index 0000000000000..5edb40ae8ae9b
--- /dev/null
+++ b/datafusion/spark/src/function/math/abs.rs
@@ -0,0 +1,574 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::*;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use arrow::error::ArrowError;
+use datafusion_common::{DataFusionError, Result, ScalarValue, internal_err};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_functions::{
+    downcast_named_arg, make_abs_function, make_try_abs_function,
+    make_wrapping_abs_function,
+};
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `abs` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#abs>
+///
+/// Returns the absolute value of input
+/// Returns NULL if input is NULL, returns NaN if input is NaN.
+///
+/// Differences with DataFusion abs:
+///  - Spark's ANSI-compliant dialect, when off (i.e. `spark.sql.ansi.enabled=false`), taking absolute value on the minimal value of a signed integer returns the value as is. DataFusion's abs throws "DataFusion error: Arrow error: Compute error" on arithmetic overflow
+///
+/// TODOs:
+///  - Spark's abs also supports ANSI interval types: YearMonthIntervalType and DayTimeIntervalType. DataFusion's abs doesn't.
+///
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkAbs {
+    signature: Signature,
+}
+
+impl Default for SparkAbs {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkAbs {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::numeric(1, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkAbs {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "abs"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!(
+            "SparkAbs: return_type() is not used; return_field_from_args() is implemented"
+        )
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let input_field = &args.arg_fields[0];
+        let out_dt = input_field.data_type().clone();
+        let out_nullable = input_field.is_nullable();
+
+        Ok(Arc::new(Field::new(self.name(), out_dt, out_nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_abs(&args.args, args.config_options.execution.enable_ansi_mode)
+    }
+}
+
+macro_rules! scalar_compute_op {
+    ($ENABLE_ANSI_MODE:expr, $INPUT:ident, $SCALAR_TYPE:ident) => {{
+        let result = if $ENABLE_ANSI_MODE {
+            $INPUT.checked_abs().ok_or_else(|| {
+                ArrowError::ComputeError(format!(
+                    "{} overflow on abs({:?})",
+                    stringify!($SCALAR_TYPE),
+                    $INPUT
+                ))
+            })?
+        } else {
+            $INPUT.wrapping_abs()
+        };
+        Ok(ColumnarValue::Scalar(ScalarValue::$SCALAR_TYPE(Some(
+            result,
+        ))))
+    }};
+    ($ENABLE_ANSI_MODE:expr, $INPUT:ident, $PRECISION:expr, $SCALE:expr, $SCALAR_TYPE:ident) => {{
+        let result = if $ENABLE_ANSI_MODE {
+            $INPUT.checked_abs().ok_or_else(|| {
+                ArrowError::ComputeError(format!(
+                    "{} overflow on abs({:?})",
+                    stringify!($SCALAR_TYPE),
+                    $INPUT
+                ))
+            })?
+        } else {
+            $INPUT.wrapping_abs()
+        };
+        Ok(ColumnarValue::Scalar(ScalarValue::$SCALAR_TYPE(
+            Some(result),
+            $PRECISION,
+            $SCALE,
+        )))
+    }};
+}
+
+pub fn spark_abs(
+    args: &[ColumnarValue],
+    enable_ansi_mode: bool,
+) -> Result<ColumnarValue, DataFusionError> {
+    if args.len() != 1 {
+        return internal_err!("abs takes exactly 1 argument, but got: {}", args.len());
+    }
+
+    match &args[0] {
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Null
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64 => Ok(args[0].clone()),
+            DataType::Int8 => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Int8Array)
+                } else {
+                    make_wrapping_abs_function!(Int8Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Int16 => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Int16Array)
+                } else {
+                    make_wrapping_abs_function!(Int16Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Int32 => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Int32Array)
+                } else {
+                    make_wrapping_abs_function!(Int32Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Int64 => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Int64Array)
+                } else {
+                    make_wrapping_abs_function!(Int64Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Float32 => {
+                let abs_fun = make_abs_function!(Float32Array);
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Float64 => {
+                let abs_fun = make_abs_function!(Float64Array);
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Decimal128(_, _) => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Decimal128Array)
+                } else {
+                    make_wrapping_abs_function!(Decimal128Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            DataType::Decimal256(_, _) => {
+                let abs_fun = if enable_ansi_mode {
+                    make_try_abs_function!(Decimal256Array)
+                } else {
+                    make_wrapping_abs_function!(Decimal256Array)
+                };
+                abs_fun(array).map(ColumnarValue::Array)
+            }
+            dt => internal_err!("Not supported datatype for Spark ABS: {dt}"),
+        },
+        ColumnarValue::Scalar(sv) => match sv {
+            ScalarValue::Null
+            | ScalarValue::UInt8(_)
+            | ScalarValue::UInt16(_)
+            | ScalarValue::UInt32(_)
+            | ScalarValue::UInt64(_) => Ok(args[0].clone()),
+            sv if sv.is_null() => Ok(args[0].clone()),
+            ScalarValue::Int8(Some(v)) => scalar_compute_op!(enable_ansi_mode, v, Int8),
+            ScalarValue::Int16(Some(v)) => scalar_compute_op!(enable_ansi_mode, v, Int16),
+            ScalarValue::Int32(Some(v)) => scalar_compute_op!(enable_ansi_mode, v, Int32),
+            ScalarValue::Int64(Some(v)) => scalar_compute_op!(enable_ansi_mode, v, Int64),
+            ScalarValue::Float32(Some(v)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Float32(Some(v.abs()))))
+            }
+            ScalarValue::Float64(Some(v)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(v.abs()))))
+            }
+            ScalarValue::Decimal128(Some(v), precision, scale) => {
+                scalar_compute_op!(enable_ansi_mode, v, *precision, *scale, Decimal128)
+            }
+            ScalarValue::Decimal256(Some(v), precision, scale) => {
+                scalar_compute_op!(enable_ansi_mode, v, *precision, *scale, Decimal256)
+            }
+            dt => internal_err!("Not supported datatype for Spark ABS: {dt}"),
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::i256;
+
+    macro_rules! eval_array_legacy_mode {
+        ($INPUT:expr, $OUTPUT:expr, $FUNC:ident) => {{
+            let input = $INPUT;
+            let args = ColumnarValue::Array(Arc::new(input));
+            let expected = $OUTPUT;
+            match spark_abs(&[args], false) {
+                Ok(ColumnarValue::Array(result)) => {
+                    let actual = datafusion_common::cast::$FUNC(&result).unwrap();
+                    assert_eq!(actual, &expected);
+                }
+                _ => unreachable!(),
+            }
+        }};
+    }
+
+    #[test]
+    fn test_abs_array_legacy_mode() {
+        eval_array_legacy_mode!(
+            Int8Array::from(vec![Some(-1), Some(i8::MIN), Some(i8::MAX), None]),
+            Int8Array::from(vec![Some(1), Some(i8::MIN), Some(i8::MAX), None]),
+            as_int8_array
+        );
+
+        eval_array_legacy_mode!(
+            Int16Array::from(vec![Some(-1), Some(i16::MIN), Some(i16::MAX), None]),
+            Int16Array::from(vec![Some(1), Some(i16::MIN), Some(i16::MAX), None]),
+            as_int16_array
+        );
+
+        eval_array_legacy_mode!(
+            Int32Array::from(vec![Some(-1), Some(i32::MIN), Some(i32::MAX), None]),
+            Int32Array::from(vec![Some(1), Some(i32::MIN), Some(i32::MAX), None]),
+            as_int32_array
+        );
+
+        eval_array_legacy_mode!(
+            Int64Array::from(vec![Some(-1), Some(i64::MIN), Some(i64::MAX), None]),
+            Int64Array::from(vec![Some(1), Some(i64::MIN), Some(i64::MAX), None]),
+            as_int64_array
+        );
+
+        eval_array_legacy_mode!(
+            Float32Array::from(vec![
+                Some(-1f32),
+                Some(f32::MIN),
+                Some(f32::MAX),
+                None,
+                Some(f32::NAN),
+                Some(f32::INFINITY),
+                Some(f32::NEG_INFINITY),
+                Some(0.0),
+                Some(-0.0),
+            ]),
+            Float32Array::from(vec![
+                Some(1f32),
+                Some(f32::MAX),
+                Some(f32::MAX),
+                None,
+                Some(f32::NAN),
+                Some(f32::INFINITY),
+                Some(f32::INFINITY),
+                Some(0.0),
+                Some(0.0),
+            ]),
+            as_float32_array
+        );
+
+        eval_array_legacy_mode!(
+            Float64Array::from(vec![
+                Some(-1f64),
+                Some(f64::MIN),
+                Some(f64::MAX),
+                None,
+                Some(f64::NAN),
+                Some(f64::INFINITY),
+                Some(f64::NEG_INFINITY),
+                Some(0.0),
+                Some(-0.0),
+            ]),
+            Float64Array::from(vec![
+                Some(1f64),
+                Some(f64::MAX),
+                Some(f64::MAX),
+                None,
+                Some(f64::NAN),
+                Some(f64::INFINITY),
+                Some(f64::INFINITY),
+                Some(0.0),
+                Some(0.0),
+            ]),
+            as_float64_array
+        );
+
+        eval_array_legacy_mode!(
+            Decimal128Array::from(vec![Some(i128::MIN), Some(i128::MIN + 1), None])
+                .with_precision_and_scale(38, 37)
+                .unwrap(),
+            Decimal128Array::from(vec![Some(i128::MIN), Some(i128::MAX), None])
+                .with_precision_and_scale(38, 37)
+                .unwrap(),
+            as_decimal128_array
+        );
+
+        eval_array_legacy_mode!(
+            Decimal256Array::from(vec![
+                Some(i256::MIN),
+                Some(i256::MINUS_ONE),
+                Some(i256::MIN + i256::from(1)),
+                None
+            ])
+            .with_precision_and_scale(5, 2)
+            .unwrap(),
+            Decimal256Array::from(vec![
+                Some(i256::MIN),
+                Some(i256::ONE),
+                Some(i256::MAX),
+                None
+            ])
+            .with_precision_and_scale(5, 2)
+            .unwrap(),
+            as_decimal256_array
+        );
+    }
+
+    macro_rules! eval_array_ansi_mode {
+        ($INPUT:expr) => {{
+            let input = $INPUT;
+            let args = ColumnarValue::Array(Arc::new(input));
+            match spark_abs(&[args], true) {
+                Err(e) => {
+                    assert!(
+                        e.to_string().contains("overflow on abs"),
+                        "Error message did not match. Actual message: {e}"
+                    );
+                }
+                _ => unreachable!(),
+            }
+        }};
+        ($INPUT:expr, $OUTPUT:expr, $FUNC:ident) => {{
+            let input = $INPUT;
+            let args = ColumnarValue::Array(Arc::new(input));
+            let expected = $OUTPUT;
+            match spark_abs(&[args], true) {
+                Ok(ColumnarValue::Array(result)) => {
+                    let actual = datafusion_common::cast::$FUNC(&result).unwrap();
+                    assert_eq!(actual, &expected);
+                }
+                _ => unreachable!(),
+            }
+        }};
+    }
+    #[test]
+    fn test_abs_array_ansi_mode() {
+        eval_array_ansi_mode!(
+            UInt64Array::from(vec![Some(u64::MIN), Some(u64::MAX), None]),
+            UInt64Array::from(vec![Some(u64::MIN), Some(u64::MAX), None]),
+            as_uint64_array
+        );
+
+        eval_array_ansi_mode!(Int8Array::from(vec![
+            Some(-1),
+            Some(i8::MIN),
+            Some(i8::MAX),
+            None
+        ]));
+        eval_array_ansi_mode!(Int16Array::from(vec![
+            Some(-1),
+            Some(i16::MIN),
+            Some(i16::MAX),
+            None
+        ]));
+        eval_array_ansi_mode!(Int32Array::from(vec![
+            Some(-1),
+            Some(i32::MIN),
+            Some(i32::MAX),
+            None
+        ]));
+        eval_array_ansi_mode!(Int64Array::from(vec![
+            Some(-1),
+            Some(i64::MIN),
+            Some(i64::MAX),
+            None
+        ]));
+        eval_array_ansi_mode!(
+            Float32Array::from(vec![
+                Some(-1f32),
+                Some(f32::MIN),
+                Some(f32::MAX),
+                None,
+                Some(f32::NAN),
+                Some(f32::INFINITY),
+                Some(f32::NEG_INFINITY),
+                Some(0.0),
+                Some(-0.0),
+            ]),
+            Float32Array::from(vec![
+                Some(1f32),
+                Some(f32::MAX),
+                Some(f32::MAX),
+                None,
+                Some(f32::NAN),
+                Some(f32::INFINITY),
+                Some(f32::INFINITY),
+                Some(0.0),
+                Some(0.0),
+            ]),
+            as_float32_array
+        );
+
+        eval_array_ansi_mode!(
+            Float64Array::from(vec![
+                Some(-1f64),
+                Some(f64::MIN),
+                Some(f64::MAX),
+                None,
+                Some(f64::NAN),
+                Some(f64::INFINITY),
+                Some(f64::NEG_INFINITY),
+                Some(0.0),
+                Some(-0.0),
+            ]),
+            Float64Array::from(vec![
+                Some(1f64),
+                Some(f64::MAX),
+                Some(f64::MAX),
+                None,
+                Some(f64::NAN),
+                Some(f64::INFINITY),
+                Some(f64::INFINITY),
+                Some(0.0),
+                Some(0.0),
+            ]),
+            as_float64_array
+        );
+
+        // decimal: no arithmetic overflow
+        eval_array_ansi_mode!(
+            Decimal128Array::from(vec![Some(-1), Some(-2), Some(i128::MIN + 1)])
+                .with_precision_and_scale(38, 37)
+                .unwrap(),
+            Decimal128Array::from(vec![Some(1), Some(2), Some(i128::MAX)])
+                .with_precision_and_scale(38, 37)
+                .unwrap(),
+            as_decimal128_array
+        );
+
+        eval_array_ansi_mode!(
+            Decimal256Array::from(vec![
+                Some(i256::MINUS_ONE),
+                Some(i256::from(-2)),
+                Some(i256::MIN + i256::from(1))
+            ])
+            .with_precision_and_scale(18, 7)
+            .unwrap(),
+            Decimal256Array::from(vec![
+                Some(i256::ONE),
+                Some(i256::from(2)),
+                Some(i256::MAX)
+            ])
+            .with_precision_and_scale(18, 7)
+            .unwrap(),
+            as_decimal256_array
+        );
+
+        // decimal: arithmetic overflow
+        eval_array_ansi_mode!(
+            Decimal128Array::from(vec![Some(i128::MIN), None])
+                .with_precision_and_scale(38, 37)
+                .unwrap()
+        );
+        eval_array_ansi_mode!(
+            Decimal256Array::from(vec![Some(i256::MIN), None])
+                .with_precision_and_scale(5, 2)
+                .unwrap()
+        );
+    }
+
+    #[test]
+    fn test_abs_nullability() {
+        use arrow::datatypes::{DataType, Field};
+        use datafusion_expr::ReturnFieldArgs;
+        use std::sync::Arc;
+
+        let abs = SparkAbs::new();
+
+        // --- non-nullable Int32 input ---
+        let non_nullable_i32 = Arc::new(Field::new("c", DataType::Int32, false));
+        let out_non_null = abs
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_i32)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // result should be non-nullable and the same DataType as input
+        assert!(!out_non_null.is_nullable());
+        assert_eq!(out_non_null.data_type(), &DataType::Int32);
+
+        // --- nullable Int32 input ---
+        let nullable_i32 = Arc::new(Field::new("c", DataType::Int32, true));
+        let out_nullable = abs
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_i32)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        // result should be nullable and the same DataType as input
+        assert!(out_nullable.is_nullable());
+        assert_eq!(out_nullable.data_type(), &DataType::Int32);
+
+        // --- non-nullable Float64 input ---
+        let non_nullable_f64 = Arc::new(Field::new("c", DataType::Float64, false));
+        let out_f64 = abs
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&non_nullable_f64)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        assert!(!out_f64.is_nullable());
+        assert_eq!(out_f64.data_type(), &DataType::Float64);
+
+        // --- nullable Float64 input ---
+        let nullable_f64 = Arc::new(Field::new("c", DataType::Float64, true));
+        let out_f64_null = abs
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_f64)],
+                scalar_arguments: &[None],
+            })
+            .unwrap();
+
+        assert!(out_f64_null.is_nullable());
+        assert_eq!(out_f64_null.data_type(), &DataType::Float64);
+    }
+}
diff --git a/datafusion/spark/src/function/math/bin.rs b/datafusion/spark/src/function/math/bin.rs
new file mode 100644
index 0000000000000..5d3ed0f77a4e1
--- /dev/null
+++ b/datafusion/spark/src/function/math/bin.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, AsArray, StringArray};
+use arrow::datatypes::{DataType, Field, FieldRef, Int64Type};
+use datafusion::logical_expr::{ColumnarValue, Signature, TypeSignature, Volatility};
+use datafusion_common::types::{NativeType, logical_int64};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{Coercion, ScalarFunctionArgs, ScalarUDFImpl, TypeSignatureClass};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `bin` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#bin>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkBin {
+    signature: Signature,
+}
+
+impl Default for SparkBin {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkBin {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![TypeSignature::Coercible(vec![Coercion::new_implicit(
+                    TypeSignatureClass::Native(logical_int64()),
+                    vec![TypeSignatureClass::Numeric],
+                    NativeType::Int64,
+                )])],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkBin {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bin"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: datafusion_expr::ReturnFieldArgs,
+    ) -> Result<FieldRef> {
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Utf8,
+            args.arg_fields[0].is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_bin_inner, vec![])(&args.args)
+    }
+}
+
+fn spark_bin_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [array] = take_function_args("bin", arg)?;
+    match &array.data_type() {
+        DataType::Int64 => {
+            let result: StringArray = array
+                .as_primitive::<Int64Type>()
+                .iter()
+                .map(|opt| opt.map(spark_bin))
+                .collect();
+            Ok(Arc::new(result))
+        }
+        data_type => {
+            internal_err!("bin does not support: {data_type}")
+        }
+    }
+}
+
+fn spark_bin(value: i64) -> String {
+    format!("{value:b}")
+}
diff --git a/datafusion/spark/src/function/math/expm1.rs b/datafusion/spark/src/function/math/expm1.rs
index 3a3a0c3835d37..b0b2b1a0865cd 100644
--- a/datafusion/spark/src/function/math/expm1.rs
+++ b/datafusion/spark/src/function/math/expm1.rs
@@ -15,11 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::function::error_utils::{
-    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
-};
+use crate::function::error_utils::unsupported_data_type_exec_err;
 use arrow::array::{ArrayRef, AsArray};
 use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::utils::take_function_args;
 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::{
     ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
@@ -28,10 +27,9 @@ use std::any::Any;
 use std::sync::Arc;
 
 /// <https://spark.apache.org/docs/latest/api/sql/index.html#expm1>
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SparkExpm1 {
     signature: Signature,
-    aliases: Vec<String>,
 }
 
 impl Default for SparkExpm1 {
@@ -43,8 +41,7 @@ impl Default for SparkExpm1 {
 impl SparkExpm1 {
     pub fn new() -> Self {
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
         }
     }
 }
@@ -67,10 +64,8 @@ impl ScalarUDFImpl for SparkExpm1 {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        if args.args.len() != 1 {
-            return Err(invalid_arg_count_exec_err("expm1", (1, 1), args.args.len()));
-        }
-        match &args.args[0] {
+        let [arg] = take_function_args(self.name(), args.args)?;
+        match arg {
             ColumnarValue::Scalar(ScalarValue::Float64(value)) => Ok(
                 ColumnarValue::Scalar(ScalarValue::Float64(value.map(|x| x.exp_m1()))),
             ),
@@ -94,52 +89,4 @@ impl ScalarUDFImpl for SparkExpm1 {
             )),
         }
     }
-
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() != 1 {
-            return Err(invalid_arg_count_exec_err("expm1", (1, 1), arg_types.len()));
-        }
-        if arg_types[0].is_numeric() {
-            Ok(vec![DataType::Float64])
-        } else {
-            Err(unsupported_data_type_exec_err(
-                "expm1",
-                "Numeric Type",
-                &arg_types[0],
-            ))
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::function::math::expm1::SparkExpm1;
-    use crate::function::utils::test::test_scalar_function;
-    use arrow::array::{Array, Float64Array};
-    use arrow::datatypes::DataType::Float64;
-    use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
-
-    macro_rules! test_expm1_float64_invoke {
-        ($INPUT:expr, $EXPECTED:expr) => {
-            test_scalar_function!(
-                SparkExpm1::new(),
-                vec![ColumnarValue::Scalar(ScalarValue::Float64($INPUT))],
-                $EXPECTED,
-                f64,
-                Float64,
-                Float64Array
-            );
-        };
-    }
-
-    #[test]
-    fn test_expm1_invoke() -> Result<()> {
-        test_expm1_float64_invoke!(Some(0f64), Ok(Some(0.0f64)));
-        Ok(())
-    }
 }
diff --git a/datafusion/spark/src/function/math/factorial.rs b/datafusion/spark/src/function/math/factorial.rs
new file mode 100644
index 0000000000000..439e79a9dd8b2
--- /dev/null
+++ b/datafusion/spark/src/function/math/factorial.rs
@@ -0,0 +1,194 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{Array, Int64Array};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::{Int32, Int64};
+use datafusion_common::cast::as_int32_array;
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, exec_err, utils::take_function_args,
+};
+use datafusion_expr::Signature;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Volatility};
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#factorial>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkFactorial {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkFactorial {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkFactorial {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(vec![Int32], Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkFactorial {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "factorial"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_factorial(&args.args)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+const FACTORIALS: [i64; 21] = [
+    1,
+    1,
+    2,
+    6,
+    24,
+    120,
+    720,
+    5040,
+    40320,
+    362880,
+    3628800,
+    39916800,
+    479001600,
+    6227020800,
+    87178291200,
+    1307674368000,
+    20922789888000,
+    355687428096000,
+    6402373705728000,
+    121645100408832000,
+    2432902008176640000,
+];
+
+pub fn spark_factorial(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
+    let [arg] = take_function_args("factorial", args)?;
+
+    match arg {
+        ColumnarValue::Scalar(ScalarValue::Int32(value)) => {
+            let result = compute_factorial(*value);
+            Ok(ColumnarValue::Scalar(ScalarValue::Int64(result)))
+        }
+        ColumnarValue::Scalar(other) => {
+            exec_err!("`factorial` got an unexpected scalar type: {}", other)
+        }
+        ColumnarValue::Array(array) => match array.data_type() {
+            Int32 => {
+                let array = as_int32_array(array)?;
+
+                let result: Int64Array = array.iter().map(compute_factorial).collect();
+
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            other => {
+                exec_err!("`factorial` got an unexpected argument type: {}", other)
+            }
+        },
+    }
+}
+
+#[inline]
+fn compute_factorial(num: Option<i32>) -> Option<i64> {
+    num.filter(|&v| (0..=20).contains(&v))
+        .map(|v| FACTORIALS[v as usize])
+}
+
+#[cfg(test)]
+mod test {
+    use crate::function::math::factorial::spark_factorial;
+    use arrow::array::{Int32Array, Int64Array};
+    use datafusion_common::ScalarValue;
+    use datafusion_common::cast::as_int64_array;
+    use datafusion_expr::ColumnarValue;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_spark_factorial_array() {
+        let input = Int32Array::from(vec![
+            Some(-1),
+            Some(0),
+            Some(1),
+            Some(2),
+            Some(4),
+            Some(20),
+            Some(21),
+            None,
+        ]);
+
+        let args = ColumnarValue::Array(Arc::new(input));
+        let result = spark_factorial(&[args]).unwrap();
+        let result = match result {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array"),
+        };
+
+        let actual = as_int64_array(&result).unwrap();
+        let expected = Int64Array::from(vec![
+            None,
+            Some(1),
+            Some(1),
+            Some(2),
+            Some(24),
+            Some(2432902008176640000),
+            None,
+            None,
+        ]);
+
+        assert_eq!(actual, &expected);
+    }
+
+    #[test]
+    fn test_spark_factorial_scalar() {
+        let input = ScalarValue::Int32(Some(5));
+
+        let args = ColumnarValue::Scalar(input);
+        let result = spark_factorial(&[args]).unwrap();
+        let result = match result {
+            ColumnarValue::Scalar(ScalarValue::Int64(val)) => val,
+            _ => panic!("Expected scalar"),
+        };
+        let actual = result.unwrap();
+        let expected = 120_i64;
+
+        assert_eq!(actual, expected);
+    }
+}
diff --git a/datafusion/spark/src/function/math/hex.rs b/datafusion/spark/src/function/math/hex.rs
index 74ec7641b38ff..06c77f37021bf 100644
--- a/datafusion/spark/src/function/math/hex.rs
+++ b/datafusion/spark/src/function/math/hex.rs
@@ -16,27 +16,30 @@
 // under the License.
 
 use std::any::Any;
+use std::str::from_utf8_unchecked;
 use std::sync::Arc;
 
-use crate::function::error_utils::{
-    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
-};
-use arrow::array::{Array, StringArray};
+use arrow::array::{Array, ArrayRef, StringBuilder};
 use arrow::datatypes::DataType;
 use arrow::{
     array::{as_dictionary_array, as_largestring_array, as_string_array},
     datatypes::Int32Type,
 };
+use datafusion_common::cast::as_large_binary_array;
+use datafusion_common::cast::as_string_view_array;
+use datafusion_common::types::{NativeType, logical_int64, logical_string};
+use datafusion_common::utils::take_function_args;
 use datafusion_common::{
+    DataFusionError,
     cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array},
-    exec_err, DataFusionError,
+    exec_err,
+};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    TypeSignatureClass, Volatility,
 };
-use datafusion_expr::Signature;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Volatility};
-use std::fmt::Write;
-
 /// <https://spark.apache.org/docs/latest/api/sql/index.html#hex>
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SparkHex {
     signature: Signature,
     aliases: Vec<String>,
@@ -50,8 +53,27 @@ impl Default for SparkHex {
 
 impl SparkHex {
     pub fn new() -> Self {
+        let int64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Int64,
+        );
+
+        let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
+
+        let binary = Coercion::new_exact(TypeSignatureClass::Binary);
+
+        let variants = vec![
+            // accepts numeric types
+            TypeSignature::Coercible(vec![int64]),
+            // accepts string types (Utf8, Utf8View, LargeUtf8)
+            TypeSignature::Coercible(vec![string]),
+            // accepts binary types (Binary, FixedSizeBinary, LargeBinary)
+            TypeSignature::Coercible(vec![binary]),
+        ];
+
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
+            signature: Signature::one_of(variants, Volatility::Immutable),
             aliases: vec![],
         }
     }
@@ -70,11 +92,13 @@ impl ScalarUDFImpl for SparkHex {
         &self.signature
     }
 
-    fn return_type(
-        &self,
-        _arg_types: &[DataType],
-    ) -> datafusion_common::Result<DataType> {
-        Ok(DataType::Utf8)
+    fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
+        Ok(match &arg_types[0] {
+            DataType::Dictionary(key_type, _) => {
+                DataType::Dictionary(key_type.clone(), Box::new(DataType::Utf8))
+            }
+            _ => DataType::Utf8,
+        })
     }
 
     fn invoke_with_args(
@@ -87,182 +111,217 @@ impl ScalarUDFImpl for SparkHex {
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
+}
 
-    fn coerce_types(
-        &self,
-        arg_types: &[DataType],
-    ) -> datafusion_common::Result<Vec<DataType>> {
-        if arg_types.len() != 1 {
-            return Err(invalid_arg_count_exec_err("hex", (1, 1), arg_types.len()));
-        }
-        match &arg_types[0] {
-            DataType::Int64
-            | DataType::Utf8
-            | DataType::LargeUtf8
-            | DataType::Binary
-            | DataType::LargeBinary => Ok(vec![arg_types[0].clone()]),
-            DataType::Dictionary(key_type, value_type) => match value_type.as_ref() {
-                DataType::Int64
-                | DataType::Utf8
-                | DataType::LargeUtf8
-                | DataType::Binary
-                | DataType::LargeBinary => Ok(vec![arg_types[0].clone()]),
-                other => {
-                    if other.is_numeric() {
-                        Ok(vec![DataType::Dictionary(
-                            key_type.clone(),
-                            Box::new(DataType::Int64),
-                        )])
-                    } else {
-                        Err(unsupported_data_type_exec_err(
-                            "hex",
-                            "Numeric, String, or Binary",
-                            &arg_types[0],
-                        ))
-                    }
-                }
-            },
-            other => {
-                if other.is_numeric() {
-                    Ok(vec![DataType::Int64])
-                } else {
-                    Err(unsupported_data_type_exec_err(
-                        "hex",
-                        "Numeric, String, or Binary",
-                        &arg_types[0],
-                    ))
-                }
-            }
-        }
+/// Hex encoding lookup tables for fast byte-to-hex conversion
+const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
+const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
+
+#[inline]
+fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
+    if num == 0 {
+        return b"0";
     }
-}
 
-fn hex_int64(num: i64) -> String {
-    format!("{num:X}")
+    let mut n = num as u64;
+    let mut i = 16;
+    while n != 0 {
+        i -= 1;
+        buffer[i] = HEX_CHARS_UPPER[(n & 0xF) as usize];
+        n >>= 4;
+    }
+    &buffer[i..]
 }
 
-#[inline(always)]
-fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool) -> String {
-    let mut s = String::with_capacity(data.as_ref().len() * 2);
-    if lower_case {
-        for b in data.as_ref() {
-            // Writing to a string never errors, so we can unwrap here.
-            write!(&mut s, "{b:02x}").unwrap();
-        }
+/// Generic hex encoding for byte array types
+fn hex_encode_bytes<'a, I, T>(
+    iter: I,
+    lowercase: bool,
+    len: usize,
+) -> Result<ArrayRef, DataFusionError>
+where
+    I: Iterator<Item = Option<T>>,
+    T: AsRef<[u8]> + 'a,
+{
+    let mut builder = StringBuilder::with_capacity(len, len * 64);
+    let mut buffer = Vec::with_capacity(64);
+    let hex_chars = if lowercase {
+        HEX_CHARS_LOWER
     } else {
-        for b in data.as_ref() {
-            // Writing to a string never errors, so we can unwrap here.
-            write!(&mut s, "{b:02X}").unwrap();
+        HEX_CHARS_UPPER
+    };
+
+    for v in iter {
+        if let Some(b) = v {
+            buffer.clear();
+            let bytes = b.as_ref();
+            for &byte in bytes {
+                buffer.push(hex_chars[(byte >> 4) as usize]);
+                buffer.push(hex_chars[(byte & 0x0f) as usize]);
+            }
+            // SAFETY: buffer contains only ASCII hex digests, which are valid UTF-8
+            unsafe {
+                builder.append_value(from_utf8_unchecked(&buffer));
+            }
+        } else {
+            builder.append_null();
         }
     }
-    s
+
+    Ok(Arc::new(builder.finish()))
 }
 
-#[inline(always)]
-fn hex_bytes<T: AsRef<[u8]>>(bytes: T) -> Result<String, std::fmt::Error> {
-    let hex_string = hex_encode(bytes, false);
-    Ok(hex_string)
+/// Generic hex encoding for int64 type
+fn hex_encode_int64(
+    iter: impl Iterator<Item = Option<i64>>,
+    len: usize,
+) -> Result<ArrayRef, DataFusionError> {
+    let mut builder = StringBuilder::with_capacity(len, len * 16);
+
+    for v in iter {
+        if let Some(num) = v {
+            let mut temp = [0u8; 16];
+            let slice = hex_int64(num, &mut temp);
+            // SAFETY: slice contains only ASCII hex digests, which are valid UTF-8
+            unsafe {
+                builder.append_value(from_utf8_unchecked(slice));
+            }
+        } else {
+            builder.append_null();
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
 }
 
 /// Spark-compatible `hex` function
 pub fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
-    if args.len() != 1 {
-        return Err(DataFusionError::Internal(
-            "hex expects exactly one argument".to_string(),
-        ));
-    }
+    compute_hex(args, false)
+}
 
-    let input = match &args[0] {
-        ColumnarValue::Scalar(value) => ColumnarValue::Array(value.to_array()?),
-        ColumnarValue::Array(_) => args[0].clone(),
+/// Spark-compatible `sha2` function
+pub fn spark_sha2_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
+    compute_hex(args, true)
+}
+
+pub fn compute_hex(
+    args: &[ColumnarValue],
+    lowercase: bool,
+) -> Result<ColumnarValue, DataFusionError> {
+    let input = match take_function_args("hex", args)? {
+        [ColumnarValue::Scalar(value)] => ColumnarValue::Array(value.to_array()?),
+        [ColumnarValue::Array(arr)] => ColumnarValue::Array(Arc::clone(arr)),
     };
 
     match &input {
         ColumnarValue::Array(array) => match array.data_type() {
             DataType::Int64 => {
                 let array = as_int64_array(array)?;
-
-                let hexed_array: StringArray =
-                    array.iter().map(|v| v.map(hex_int64)).collect();
-
-                Ok(ColumnarValue::Array(Arc::new(hexed_array)))
+                Ok(ColumnarValue::Array(hex_encode_int64(
+                    array.iter(),
+                    array.len(),
+                )?))
             }
             DataType::Utf8 => {
                 let array = as_string_array(array);
-
-                let hexed: StringArray = array
-                    .iter()
-                    .map(|v| v.map(hex_bytes).transpose())
-                    .collect::<Result<_, _>>()?;
-
-                Ok(ColumnarValue::Array(Arc::new(hexed)))
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
+            }
+            DataType::Utf8View => {
+                let array = as_string_view_array(array)?;
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
             }
             DataType::LargeUtf8 => {
                 let array = as_largestring_array(array);
-
-                let hexed: StringArray = array
-                    .iter()
-                    .map(|v| v.map(hex_bytes).transpose())
-                    .collect::<Result<_, _>>()?;
-
-                Ok(ColumnarValue::Array(Arc::new(hexed)))
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
             }
             DataType::Binary => {
                 let array = as_binary_array(array)?;
-
-                let hexed: StringArray = array
-                    .iter()
-                    .map(|v| v.map(hex_bytes).transpose())
-                    .collect::<Result<_, _>>()?;
-
-                Ok(ColumnarValue::Array(Arc::new(hexed)))
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
+            }
+            DataType::LargeBinary => {
+                let array = as_large_binary_array(array)?;
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
             }
             DataType::FixedSizeBinary(_) => {
                 let array = as_fixed_size_binary_array(array)?;
-
-                let hexed: StringArray = array
-                    .iter()
-                    .map(|v| v.map(hex_bytes).transpose())
-                    .collect::<Result<_, _>>()?;
-
-                Ok(ColumnarValue::Array(Arc::new(hexed)))
+                Ok(ColumnarValue::Array(hex_encode_bytes(
+                    array.iter(),
+                    lowercase,
+                    array.len(),
+                )?))
             }
-            DataType::Dictionary(_, value_type) => {
+            DataType::Dictionary(key_type, _) => {
+                if **key_type != DataType::Int32 {
+                    return exec_err!(
+                        "hex only supports Int32 dictionary keys, get: {}",
+                        key_type
+                    );
+                }
+
                 let dict = as_dictionary_array::<Int32Type>(&array);
+                let dict_values = dict.values();
 
-                let values = match **value_type {
-                    DataType::Int64 => as_int64_array(dict.values())?
-                        .iter()
-                        .map(|v| v.map(hex_int64))
-                        .collect::<Vec<_>>(),
-                    DataType::Utf8 => as_string_array(dict.values())
-                        .iter()
-                        .map(|v| v.map(hex_bytes).transpose())
-                        .collect::<Result<_, _>>()?,
-                    DataType::Binary => as_binary_array(dict.values())?
-                        .iter()
-                        .map(|v| v.map(hex_bytes).transpose())
-                        .collect::<Result<_, _>>()?,
-                    _ => exec_err!(
-                        "hex got an unexpected argument type: {:?}",
-                        array.data_type()
-                    )?,
+                let encoded_values = match dict_values.data_type() {
+                    DataType::Int64 => {
+                        let arr = as_int64_array(dict_values)?;
+                        hex_encode_int64(arr.iter(), arr.len())?
+                    }
+                    DataType::Utf8 => {
+                        let arr = as_string_array(dict_values);
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    DataType::LargeUtf8 => {
+                        let arr = as_largestring_array(dict_values);
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    DataType::Utf8View => {
+                        let arr = as_string_view_array(dict_values)?;
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    DataType::Binary => {
+                        let arr = as_binary_array(dict_values)?;
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    DataType::LargeBinary => {
+                        let arr = as_large_binary_array(dict_values)?;
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    DataType::FixedSizeBinary(_) => {
+                        let arr = as_fixed_size_binary_array(dict_values)?;
+                        hex_encode_bytes(arr.iter(), lowercase, arr.len())?
+                    }
+                    _ => {
+                        return exec_err!(
+                            "hex got an unexpected argument type: {}",
+                            dict_values.data_type()
+                        );
+                    }
                 };
 
-                let new_values: Vec<Option<String>> = dict
-                    .keys()
-                    .iter()
-                    .map(|key| key.map(|k| values[k as usize].clone()).unwrap_or(None))
-                    .collect();
-
-                let string_array_values = StringArray::from(new_values);
-
-                Ok(ColumnarValue::Array(Arc::new(string_array_values)))
+                let new_dict = dict.with_values(encoded_values);
+                Ok(ColumnarValue::Array(Arc::new(new_dict)))
             }
-            _ => exec_err!(
-                "hex got an unexpected argument type: {:?}",
-                array.data_type()
-            ),
+            _ => exec_err!("hex got an unexpected argument type: {}", array.data_type()),
         },
         _ => exec_err!("native hex does not support scalar values at this time"),
     }
@@ -270,16 +329,18 @@ pub fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionErro
 
 #[cfg(test)]
 mod test {
+    use std::str::from_utf8_unchecked;
     use std::sync::Arc;
 
-    use arrow::array::{Int64Array, StringArray};
+    use arrow::array::{DictionaryArray, Int32Array, Int64Array, StringArray};
     use arrow::{
         array::{
-            as_string_array, BinaryDictionaryBuilder, PrimitiveDictionaryBuilder,
-            StringBuilder, StringDictionaryBuilder,
+            BinaryDictionaryBuilder, PrimitiveDictionaryBuilder, StringDictionaryBuilder,
+            as_string_array,
         },
         datatypes::{Int32Type, Int64Type},
     };
+    use datafusion_common::cast::as_dictionary_array;
     use datafusion_expr::ColumnarValue;
 
     #[test]
@@ -291,12 +352,12 @@ mod test {
         input_builder.append_value("rust");
         let input = input_builder.finish();
 
-        let mut string_builder = StringBuilder::new();
-        string_builder.append_value("6869");
-        string_builder.append_value("627965");
-        string_builder.append_null();
-        string_builder.append_value("72757374");
-        let expected = string_builder.finish();
+        let mut expected_builder = StringDictionaryBuilder::<Int32Type>::new();
+        expected_builder.append_value("6869");
+        expected_builder.append_value("627965");
+        expected_builder.append_null();
+        expected_builder.append_value("72757374");
+        let expected = expected_builder.finish();
 
         let columnar_value = ColumnarValue::Array(Arc::new(input));
         let result = super::spark_hex(&[columnar_value]).unwrap();
@@ -306,7 +367,7 @@ mod test {
             _ => panic!("Expected array"),
         };
 
-        let result = as_string_array(&result);
+        let result = as_dictionary_array(&result).unwrap();
 
         assert_eq!(result, &expected);
     }
@@ -320,12 +381,12 @@ mod test {
         input_builder.append_value(3);
         let input = input_builder.finish();
 
-        let mut string_builder = StringBuilder::new();
-        string_builder.append_value("1");
-        string_builder.append_value("2");
-        string_builder.append_null();
-        string_builder.append_value("3");
-        let expected = string_builder.finish();
+        let mut expected_builder = StringDictionaryBuilder::<Int32Type>::new();
+        expected_builder.append_value("1");
+        expected_builder.append_value("2");
+        expected_builder.append_null();
+        expected_builder.append_value("3");
+        let expected = expected_builder.finish();
 
         let columnar_value = ColumnarValue::Array(Arc::new(input));
         let result = super::spark_hex(&[columnar_value]).unwrap();
@@ -335,7 +396,7 @@ mod test {
             _ => panic!("Expected array"),
         };
 
-        let result = as_string_array(&result);
+        let result = as_dictionary_array(&result).unwrap();
 
         assert_eq!(result, &expected);
     }
@@ -349,7 +410,7 @@ mod test {
         input_builder.append_value("3");
         let input = input_builder.finish();
 
-        let mut expected_builder = StringBuilder::new();
+        let mut expected_builder = StringDictionaryBuilder::<Int32Type>::new();
         expected_builder.append_value("31");
         expected_builder.append_value("6A");
         expected_builder.append_null();
@@ -364,20 +425,24 @@ mod test {
             _ => panic!("Expected array"),
         };
 
-        let result = as_string_array(&result);
+        let result = as_dictionary_array(&result).unwrap();
 
         assert_eq!(result, &expected);
     }
 
     #[test]
     fn test_hex_int64() {
-        let num = 1234;
-        let hexed = super::hex_int64(num);
-        assert_eq!(hexed, "4D2".to_string());
+        let test_cases = vec![(1234, "4D2"), (-1, "FFFFFFFFFFFFFFFF")];
 
-        let num = -1;
-        let hexed = super::hex_int64(num);
-        assert_eq!(hexed, "FFFFFFFFFFFFFFFF".to_string());
+        for (num, expected) in test_cases {
+            let mut cache = [0u8; 16];
+            let slice = super::hex_int64(num, &mut cache);
+
+            unsafe {
+                let result = from_utf8_unchecked(slice);
+                assert_eq!(expected, result);
+            }
+        }
     }
 
     #[test]
@@ -401,4 +466,28 @@ mod test {
 
         assert_eq!(string_array, &expected_array);
     }
+
+    #[test]
+    fn test_dict_values_null() {
+        let keys = Int32Array::from(vec![Some(0), None, Some(1)]);
+        let vals = Int64Array::from(vec![Some(32), None]);
+        // [32, null, null]
+        let dict = DictionaryArray::new(keys, Arc::new(vals));
+
+        let columnar_value = ColumnarValue::Array(Arc::new(dict));
+        let result = super::spark_hex(&[columnar_value]).unwrap();
+
+        let result = match result {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array"),
+        };
+
+        let result = as_dictionary_array(&result).unwrap();
+
+        let keys = Int32Array::from(vec![Some(0), None, Some(1)]);
+        let vals = StringArray::from(vec![Some("20"), None]);
+        let expected = DictionaryArray::new(keys, Arc::new(vals));
+
+        assert_eq!(&expected, result);
+    }
 }
diff --git a/datafusion/spark/src/function/math/mod.rs b/datafusion/spark/src/function/math/mod.rs
index 80bcdc39a41de..7f7d04e06b0be 100644
--- a/datafusion/spark/src/function/math/mod.rs
+++ b/datafusion/spark/src/function/math/mod.rs
@@ -15,23 +15,84 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod abs;
+pub mod bin;
 pub mod expm1;
+pub mod factorial;
 pub mod hex;
+pub mod modulus;
+pub mod negative;
+pub mod rint;
+pub mod trigonometry;
+pub mod unhex;
+pub mod width_bucket;
 
 use datafusion_expr::ScalarUDF;
 use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
+make_udf_function!(abs::SparkAbs, abs);
 make_udf_function!(expm1::SparkExpm1, expm1);
+make_udf_function!(factorial::SparkFactorial, factorial);
 make_udf_function!(hex::SparkHex, hex);
+make_udf_function!(modulus::SparkMod, modulus);
+make_udf_function!(modulus::SparkPmod, pmod);
+make_udf_function!(rint::SparkRint, rint);
+make_udf_function!(unhex::SparkUnhex, unhex);
+make_udf_function!(width_bucket::SparkWidthBucket, width_bucket);
+make_udf_function!(trigonometry::SparkCsc, csc);
+make_udf_function!(trigonometry::SparkSec, sec);
+make_udf_function!(negative::SparkNegative, negative);
+make_udf_function!(bin::SparkBin, bin);
 
 pub mod expr_fn {
     use datafusion_functions::export_functions;
 
+    export_functions!((abs, "Returns abs(expr)", arg1));
     export_functions!((expm1, "Returns exp(expr) - 1 as a Float64.", arg1));
+    export_functions!((
+        factorial,
+        "Returns the factorial of expr. expr is [0..20]. Otherwise, null.",
+        arg1
+    ));
     export_functions!((hex, "Computes hex value of the given column.", arg1));
+    export_functions!((modulus, "Returns the remainder of division of the first argument by the second argument.", arg1 arg2));
+    export_functions!((pmod, "Returns the positive remainder of division of the first argument by the second argument.", arg1 arg2));
+    export_functions!((
+        rint,
+        "Returns the double value that is closest in value to the argument and is equal to a mathematical integer.",
+        arg1
+    ));
+    export_functions!((unhex, "Converts hexadecimal string to binary.", arg1));
+    export_functions!((width_bucket, "Returns the bucket number into which the value of this expression would fall after being evaluated.", arg1 arg2 arg3 arg4));
+    export_functions!((csc, "Returns the cosecant of expr.", arg1));
+    export_functions!((sec, "Returns the secant of expr.", arg1));
+    export_functions!((
+        negative,
+        "Returns the negation of expr (unary minus).",
+        arg1
+    ));
+    export_functions!((
+        bin,
+        "Returns the string representation of the long value represented in binary.",
+        arg1
+    ));
 }
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![expm1(), hex()]
+    vec![
+        abs(),
+        expm1(),
+        factorial(),
+        hex(),
+        modulus(),
+        pmod(),
+        rint(),
+        unhex(),
+        width_bucket(),
+        csc(),
+        sec(),
+        negative(),
+        bin(),
+    ]
 }
diff --git a/datafusion/spark/src/function/math/modulus.rs b/datafusion/spark/src/function/math/modulus.rs
new file mode 100644
index 0000000000000..7a21aabbdf855
--- /dev/null
+++ b/datafusion/spark/src/function/math/modulus.rs
@@ -0,0 +1,693 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Scalar, new_null_array};
+use arrow::compute::kernels::numeric::add;
+use arrow::compute::kernels::{
+    cmp::{eq, lt},
+    numeric::rem,
+    zip::zip,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::{Result, ScalarValue, assert_eq_or_internal_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use std::any::Any;
+
+/// Attempts `rem(left, right)` with per-element divide-by-zero handling.
+/// In ANSI mode, any zero divisor causes an error.
+/// In legacy mode (ANSI off), positions where the divisor is zero return NULL
+/// while other positions compute normally.
+fn try_rem(
+    left: &arrow::array::ArrayRef,
+    right: &arrow::array::ArrayRef,
+    enable_ansi_mode: bool,
+) -> Result<arrow::array::ArrayRef> {
+    match rem(left, right) {
+        Ok(result) => Ok(result),
+        Err(arrow::error::ArrowError::DivideByZero) if !enable_ansi_mode => {
+            // Integer rem fails when ANY divisor element is zero.
+            // Handle per-element: null out zero divisors
+            let zero = ScalarValue::new_zero(right.data_type())?.to_array()?;
+            let zero = Scalar::new(zero);
+            let null = Scalar::new(new_null_array(right.data_type(), 1));
+            let is_zero = eq(right, &zero)?;
+            let safe_right = zip(&is_zero, &null, right)?;
+            Ok(rem(left, &safe_right)?)
+        }
+        Err(e) => Err(e.into()),
+    }
+}
+
+/// Spark-compatible `mod` function
+/// In ANSI mode, division by zero throws an error.
+/// In legacy mode, division by zero returns NULL (Spark behavior).
+pub fn spark_mod(
+    args: &[ColumnarValue],
+    enable_ansi_mode: bool,
+) -> Result<ColumnarValue> {
+    assert_eq_or_internal_err!(args.len(), 2, "mod expects exactly two arguments");
+    let args = ColumnarValue::values_to_arrays(args)?;
+    let result = try_rem(&args[0], &args[1], enable_ansi_mode)?;
+    Ok(ColumnarValue::Array(result))
+}
+
+/// Spark-compatible `pmod` function
+/// In ANSI mode, division by zero throws an error.
+/// In legacy mode, division by zero returns NULL (Spark behavior).
+pub fn spark_pmod(
+    args: &[ColumnarValue],
+    enable_ansi_mode: bool,
+) -> Result<ColumnarValue> {
+    assert_eq_or_internal_err!(args.len(), 2, "pmod expects exactly two arguments");
+    let args = ColumnarValue::values_to_arrays(args)?;
+    let left = &args[0];
+    let right = &args[1];
+    let zero = ScalarValue::new_zero(left.data_type())?.to_array_of_size(left.len())?;
+    let result = try_rem(left, right, enable_ansi_mode)?;
+    let neg = lt(&result, &zero)?;
+    let plus = zip(&neg, right, &zero)?;
+    let result = add(&plus, &result)?;
+    let result = try_rem(&result, right, enable_ansi_mode)?;
+    Ok(ColumnarValue::Array(result))
+}
+
+/// SparkMod implements the Spark-compatible modulo function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkMod {
+    signature: Signature,
+}
+
+impl Default for SparkMod {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkMod {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::numeric(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkMod {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "mod"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        assert_eq_or_internal_err!(
+            arg_types.len(),
+            2,
+            "mod expects exactly two arguments"
+        );
+
+        // Return the same type as the first argument for simplicity
+        // Arrow's rem function handles type promotion internally
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_mod(&args.args, args.config_options.execution.enable_ansi_mode)
+    }
+}
+
+/// SparkMod implements the Spark-compatible modulo function
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkPmod {
+    signature: Signature,
+}
+
+impl Default for SparkPmod {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkPmod {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::numeric(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkPmod {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "pmod"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        assert_eq_or_internal_err!(
+            arg_types.len(),
+            2,
+            "pmod expects exactly two arguments"
+        );
+
+        // Return the same type as the first argument for simplicity
+        // Arrow's rem function handles type promotion internally
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_pmod(&args.args, args.config_options.execution.enable_ansi_mode)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::sync::Arc;
+
+    use super::*;
+    use arrow::array::*;
+    use datafusion_common::ScalarValue;
+
+    #[test]
+    fn test_mod_int32() {
+        let left = Int32Array::from(vec![Some(10), Some(7), Some(15), None]);
+        let right = Int32Array::from(vec![Some(3), Some(2), Some(4), Some(5)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 1); // 10 % 3 = 1
+            assert_eq!(result_int32.value(1), 1); // 7 % 2 = 1
+            assert_eq!(result_int32.value(2), 3); // 15 % 4 = 3
+            assert!(result_int32.is_null(3)); // None % 5 = None
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_int64() {
+        let left = Int64Array::from(vec![Some(100), Some(50), Some(200)]);
+        let right = Int64Array::from(vec![Some(30), Some(25), Some(60)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int64 =
+                result_array.as_any().downcast_ref::<Int64Array>().unwrap();
+            assert_eq!(result_int64.value(0), 10); // 100 % 30 = 10
+            assert_eq!(result_int64.value(1), 0); // 50 % 25 = 0
+            assert_eq!(result_int64.value(2), 20); // 200 % 60 = 20
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_float64() {
+        let left = Float64Array::from(vec![
+            Some(10.5),
+            Some(7.2),
+            Some(15.8),
+            Some(f64::NAN),
+            Some(f64::INFINITY),
+            Some(5.0),
+            Some(5.0),
+            Some(f64::NAN),
+            Some(f64::INFINITY),
+        ]);
+        let right = Float64Array::from(vec![
+            Some(3.0),
+            Some(2.5),
+            Some(4.2),
+            Some(2.0),
+            Some(2.0),
+            Some(f64::NAN),
+            Some(f64::INFINITY),
+            Some(f64::INFINITY),
+            Some(f64::NAN),
+        ]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_float64 = result_array
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .unwrap();
+            // Regular cases
+            assert!((result_float64.value(0) - 1.5).abs() < f64::EPSILON); // 10.5 % 3.0 = 1.5
+            assert!((result_float64.value(1) - 2.2).abs() < f64::EPSILON); // 7.2 % 2.5 = 2.2
+            assert!((result_float64.value(2) - 3.2).abs() < f64::EPSILON); // 15.8 % 4.2 = 3.2
+            // nan % 2.0 = nan
+            assert!(result_float64.value(3).is_nan());
+            // inf % 2.0 = nan (IEEE 754)
+            assert!(result_float64.value(4).is_nan());
+            // 5.0 % nan = nan
+            assert!(result_float64.value(5).is_nan());
+            // 5.0 % inf = 5.0
+            assert!((result_float64.value(6) - 5.0).abs() < f64::EPSILON);
+            // nan % inf = nan
+            assert!(result_float64.value(7).is_nan());
+            // inf % nan = nan
+            assert!(result_float64.value(8).is_nan());
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_float32() {
+        let left = Float32Array::from(vec![
+            Some(10.5),
+            Some(7.2),
+            Some(15.8),
+            Some(f32::NAN),
+            Some(f32::INFINITY),
+            Some(5.0),
+            Some(5.0),
+            Some(f32::NAN),
+            Some(f32::INFINITY),
+        ]);
+        let right = Float32Array::from(vec![
+            Some(3.0),
+            Some(2.5),
+            Some(4.2),
+            Some(2.0),
+            Some(2.0),
+            Some(f32::NAN),
+            Some(f32::INFINITY),
+            Some(f32::INFINITY),
+            Some(f32::NAN),
+        ]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_float32 = result_array
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .unwrap();
+            // Regular cases
+            assert!((result_float32.value(0) - 1.5).abs() < f32::EPSILON); // 10.5 % 3.0 = 1.5
+            assert!((result_float32.value(1) - 2.2).abs() < f32::EPSILON * 3.0); // 7.2 % 2.5 = 2.2
+            assert!((result_float32.value(2) - 3.2).abs() < f32::EPSILON * 10.0); // 15.8 % 4.2 = 3.2
+            // nan % 2.0 = nan
+            assert!(result_float32.value(3).is_nan());
+            // inf % 2.0 = nan (IEEE 754)
+            assert!(result_float32.value(4).is_nan());
+            // 5.0 % nan = nan
+            assert!(result_float32.value(5).is_nan());
+            // 5.0 % inf = 5.0
+            assert!((result_float32.value(6) - 5.0).abs() < f32::EPSILON);
+            // nan % inf = nan
+            assert!(result_float32.value(7).is_nan());
+            // inf % nan = nan
+            assert!(result_float32.value(8).is_nan());
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_scalar() {
+        let left = Int32Array::from(vec![Some(10), Some(7), Some(15)]);
+        let right_value = ColumnarValue::Scalar(ScalarValue::Int32(Some(3)));
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 1); // 10 % 3 = 1
+            assert_eq!(result_int32.value(1), 1); // 7 % 3 = 1
+            assert_eq!(result_int32.value(2), 0); // 15 % 3 = 0
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_wrong_arg_count() {
+        let left = Int32Array::from(vec![Some(10)]);
+        let left_value = ColumnarValue::Array(Arc::new(left));
+
+        let result = spark_mod(&[left_value], false);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_mod_zero_division_legacy() {
+        // In legacy mode (ANSI off), division by zero returns NULL per-element
+        let left = Int32Array::from(vec![Some(10), Some(7), Some(15)]);
+        let right = Int32Array::from(vec![Some(0), Some(2), Some(4)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert!(result_int32.is_null(0)); // 10 % 0 = NULL
+            assert_eq!(result_int32.value(1), 1); // 7 % 2 = 1
+            assert_eq!(result_int32.value(2), 3); // 15 % 4 = 3
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_mod_zero_division_ansi() {
+        // In ANSI mode, division by zero should error
+        let left = Int32Array::from(vec![Some(10), Some(7), Some(15)]);
+        let right = Int32Array::from(vec![Some(0), Some(2), Some(4)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_mod(&[left_value, right_value], true);
+        assert!(result.is_err());
+    }
+
+    // PMOD tests
+    #[test]
+    fn test_pmod_int32() {
+        let left = Int32Array::from(vec![Some(10), Some(-7), Some(15), Some(-15), None]);
+        let right = Int32Array::from(vec![Some(3), Some(3), Some(4), Some(4), Some(5)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 1); // 10 pmod 3 = 1
+            assert_eq!(result_int32.value(1), 2); // -7 pmod 3 = 2 (positive remainder)
+            assert_eq!(result_int32.value(2), 3); // 15 pmod 4 = 3
+            assert_eq!(result_int32.value(3), 1); // -15 pmod 4 = 1 (positive remainder)
+            assert!(result_int32.is_null(4)); // None pmod 5 = None
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_int64() {
+        let left = Int64Array::from(vec![Some(100), Some(-50), Some(200), Some(-200)]);
+        let right = Int64Array::from(vec![Some(30), Some(30), Some(60), Some(60)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int64 =
+                result_array.as_any().downcast_ref::<Int64Array>().unwrap();
+            assert_eq!(result_int64.value(0), 10); // 100 pmod 30 = 10
+            assert_eq!(result_int64.value(1), 10); // -50 pmod 30 = 10 (positive remainder)
+            assert_eq!(result_int64.value(2), 20); // 200 pmod 60 = 20
+            assert_eq!(result_int64.value(3), 40); // -200 pmod 60 = 40 (positive remainder)
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_float64() {
+        let left = Float64Array::from(vec![
+            Some(10.5),
+            Some(-7.2),
+            Some(15.8),
+            Some(-15.8),
+            Some(f64::NAN),
+            Some(f64::INFINITY),
+            Some(5.0),
+            Some(-5.0),
+        ]);
+        let right = Float64Array::from(vec![
+            Some(3.0),
+            Some(3.0),
+            Some(4.2),
+            Some(4.2),
+            Some(2.0),
+            Some(2.0),
+            Some(f64::INFINITY),
+            Some(f64::INFINITY),
+        ]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_float64 = result_array
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .unwrap();
+            // Regular cases
+            assert!((result_float64.value(0) - 1.5).abs() < f64::EPSILON); // 10.5 pmod 3.0 = 1.5
+            assert!((result_float64.value(1) - 1.8).abs() < f64::EPSILON * 3.0); // -7.2 pmod 3.0 = 1.8 (positive)
+            assert!((result_float64.value(2) - 3.2).abs() < f64::EPSILON * 3.0); // 15.8 pmod 4.2 = 3.2
+            assert!((result_float64.value(3) - 1.0).abs() < f64::EPSILON * 3.0); // -15.8 pmod 4.2 = 1.0 (positive)
+            // nan pmod 2.0 = nan
+            assert!(result_float64.value(4).is_nan());
+            // inf pmod 2.0 = nan (IEEE 754)
+            assert!(result_float64.value(5).is_nan());
+            // 5.0 pmod inf = 5.0
+            assert!((result_float64.value(6) - 5.0).abs() < f64::EPSILON);
+            // -5.0 pmod inf = NaN
+            assert!(result_float64.value(7).is_nan());
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_float32() {
+        let left = Float32Array::from(vec![
+            Some(10.5),
+            Some(-7.2),
+            Some(15.8),
+            Some(-15.8),
+            Some(f32::NAN),
+            Some(f32::INFINITY),
+            Some(5.0),
+            Some(-5.0),
+        ]);
+        let right = Float32Array::from(vec![
+            Some(3.0),
+            Some(3.0),
+            Some(4.2),
+            Some(4.2),
+            Some(2.0),
+            Some(2.0),
+            Some(f32::INFINITY),
+            Some(f32::INFINITY),
+        ]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_float32 = result_array
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .unwrap();
+            // Regular cases
+            assert!((result_float32.value(0) - 1.5).abs() < f32::EPSILON); // 10.5 pmod 3.0 = 1.5
+            assert!((result_float32.value(1) - 1.8).abs() < f32::EPSILON * 3.0); // -7.2 pmod 3.0 = 1.8 (positive)
+            assert!((result_float32.value(2) - 3.2).abs() < f32::EPSILON * 10.0); // 15.8 pmod 4.2 = 3.2
+            assert!((result_float32.value(3) - 1.0).abs() < f32::EPSILON * 10.0); // -15.8 pmod 4.2 = 1.0 (positive)
+            // nan pmod 2.0 = nan
+            assert!(result_float32.value(4).is_nan());
+            // inf pmod 2.0 = nan (IEEE 754)
+            assert!(result_float32.value(5).is_nan());
+            // 5.0 pmod inf = 5.0
+            assert!((result_float32.value(6) - 5.0).abs() < f32::EPSILON * 10.0);
+            // -5.0 pmod inf = NaN
+            assert!(result_float32.value(7).is_nan());
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_scalar() {
+        let left = Int32Array::from(vec![Some(10), Some(-7), Some(15), Some(-15)]);
+        let right_value = ColumnarValue::Scalar(ScalarValue::Int32(Some(3)));
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 1); // 10 pmod 3 = 1
+            assert_eq!(result_int32.value(1), 2); // -7 pmod 3 = 2 (positive remainder)
+            assert_eq!(result_int32.value(2), 0); // 15 pmod 3 = 0
+            assert_eq!(result_int32.value(3), 0); // -15 pmod 3 = 0 (positive remainder)
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_wrong_arg_count() {
+        let left = Int32Array::from(vec![Some(10)]);
+        let left_value = ColumnarValue::Array(Arc::new(left));
+
+        let result = spark_pmod(&[left_value], false);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_pmod_zero_division_legacy() {
+        // In legacy mode (ANSI off), division by zero returns NULL per-element
+        let left = Int32Array::from(vec![Some(10), Some(-7), Some(15)]);
+        let right = Int32Array::from(vec![Some(0), Some(0), Some(4)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert!(result_int32.is_null(0)); // 10 pmod 0 = NULL
+            assert!(result_int32.is_null(1)); // -7 pmod 0 = NULL
+            assert_eq!(result_int32.value(2), 3); // 15 pmod 4 = 3
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_zero_division_ansi() {
+        // In ANSI mode, division by zero should error
+        let left = Int32Array::from(vec![Some(10), Some(-7), Some(15)]);
+        let right = Int32Array::from(vec![Some(0), Some(0), Some(4)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], true);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_pmod_negative_divisor() {
+        // PMOD with negative divisor should still work like regular mod
+        let left = Int32Array::from(vec![Some(10), Some(-7), Some(15)]);
+        let right = Int32Array::from(vec![Some(-3), Some(-3), Some(-4)]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 1); // 10 pmod -3 = 1
+            assert_eq!(result_int32.value(1), -1); // -7 pmod -3 = -1
+            assert_eq!(result_int32.value(2), 3); // 15 pmod -4 = 3
+        } else {
+            panic!("Expected array result");
+        }
+    }
+
+    #[test]
+    fn test_pmod_edge_cases() {
+        // Test edge cases for PMOD
+        let left = Int32Array::from(vec![
+            Some(0),  // 0 pmod 5 = 0
+            Some(-1), // -1 pmod 5 = 4
+            Some(1),  // 1 pmod 5 = 1
+            Some(-5), // -5 pmod 5 = 0
+            Some(5),  // 5 pmod 5 = 0
+            Some(-6), // -6 pmod 5 = 4
+            Some(6),  // 6 pmod 5 = 1
+        ]);
+        let right = Int32Array::from(vec![
+            Some(5),
+            Some(5),
+            Some(5),
+            Some(5),
+            Some(5),
+            Some(5),
+            Some(5),
+        ]);
+
+        let left_value = ColumnarValue::Array(Arc::new(left));
+        let right_value = ColumnarValue::Array(Arc::new(right));
+
+        let result = spark_pmod(&[left_value, right_value], false).unwrap();
+
+        if let ColumnarValue::Array(result_array) = result {
+            let result_int32 =
+                result_array.as_any().downcast_ref::<Int32Array>().unwrap();
+            assert_eq!(result_int32.value(0), 0); // 0 pmod 5 = 0
+            assert_eq!(result_int32.value(1), 4); // -1 pmod 5 = 4
+            assert_eq!(result_int32.value(2), 1); // 1 pmod 5 = 1
+            assert_eq!(result_int32.value(3), 0); // -5 pmod 5 = 0
+            assert_eq!(result_int32.value(4), 0); // 5 pmod 5 = 0
+            assert_eq!(result_int32.value(5), 4); // -6 pmod 5 = 4
+            assert_eq!(result_int32.value(6), 1); // 6 pmod 5 = 1
+        } else {
+            panic!("Expected array result");
+        }
+    }
+}
diff --git a/datafusion/spark/src/function/math/negative.rs b/datafusion/spark/src/function/math/negative.rs
new file mode 100644
index 0000000000000..2df71b709d8c4
--- /dev/null
+++ b/datafusion/spark/src/function/math/negative.rs
@@ -0,0 +1,477 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::types::*;
+use arrow::array::*;
+use arrow::datatypes::{DataType, IntervalDayTime, IntervalMonthDayNano, IntervalUnit};
+use bigdecimal::num_traits::WrappingNeg;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err, not_impl_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `negative` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#negative>
+///
+/// Returns the negation of input (equivalent to unary minus)
+/// Returns NULL if input is NULL, returns NaN if input is NaN.
+///
+/// ANSI mode support:
+///  - When ANSI mode is disabled (`spark.sql.ansi.enabled=false`), negating the minimal
+///    value of a signed integer wraps around. For example: negative(i32::MIN) returns
+///    i32::MIN (wraps instead of error).
+///  - When ANSI mode is enabled (`spark.sql.ansi.enabled=true`), overflow conditions
+///    throw an ARITHMETIC_OVERFLOW error instead of wrapping.
+///
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkNegative {
+    signature: Signature,
+}
+
+impl Default for SparkNegative {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkNegative {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature {
+                type_signature: TypeSignature::OneOf(vec![
+                    // Numeric types: signed integers, float, decimals
+                    TypeSignature::Numeric(1),
+                    // Interval types: YearMonth, DayTime, MonthDayNano
+                    TypeSignature::Uniform(
+                        1,
+                        vec![
+                            DataType::Interval(IntervalUnit::YearMonth),
+                            DataType::Interval(IntervalUnit::DayTime),
+                            DataType::Interval(IntervalUnit::MonthDayNano),
+                        ],
+                    ),
+                ]),
+                volatility: Volatility::Immutable,
+                parameter_names: None,
+            },
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkNegative {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "negative"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_negative(&args.args, args.config_options.execution.enable_ansi_mode)
+    }
+}
+
+/// Macro to implement negation for integer array types
+macro_rules! impl_integer_array_negative {
+    ($array:expr, $type:ty, $type_name:expr, $enable_ansi_mode:expr) => {{
+        let array = $array.as_primitive::<$type>();
+        let result: PrimitiveArray<$type> = if $enable_ansi_mode {
+            array.try_unary(|x| {
+                x.checked_neg().ok_or_else(|| {
+                    (exec_err!("{} overflow on negative({x})", $type_name)
+                        as Result<(), _>)
+                        .unwrap_err()
+                })
+            })?
+        } else {
+            array.unary(|x| x.wrapping_neg())
+        };
+        Ok(ColumnarValue::Array(Arc::new(result)))
+    }};
+}
+
+/// Macro to implement negation for float array types
+macro_rules! impl_float_array_negative {
+    ($array:expr, $type:ty) => {{
+        let array = $array.as_primitive::<$type>();
+        let result: PrimitiveArray<$type> = array.unary(|x| -x);
+        Ok(ColumnarValue::Array(Arc::new(result)))
+    }};
+}
+
+/// Macro to implement negation for decimal array types
+macro_rules! impl_decimal_array_negative {
+    ($array:expr, $type:ty, $type_name:expr, $enable_ansi_mode:expr) => {{
+        let array = $array.as_primitive::<$type>();
+        let result: PrimitiveArray<$type> = if $enable_ansi_mode {
+            array
+                .try_unary(|x| {
+                    x.checked_neg().ok_or_else(|| {
+                        (exec_err!("{} overflow on negative({x})", $type_name)
+                            as Result<(), _>)
+                            .unwrap_err()
+                    })
+                })?
+                .with_data_type(array.data_type().clone())
+        } else {
+            array.unary(|x| x.wrapping_neg())
+        };
+        Ok(ColumnarValue::Array(Arc::new(result)))
+    }};
+}
+
+/// Macro to implement negation for integer scalar types
+macro_rules! impl_integer_scalar_negative {
+    ($v:expr, $type_name:expr, $variant:ident, $enable_ansi_mode:expr) => {{
+        let result = if $enable_ansi_mode {
+            $v.checked_neg().ok_or_else(|| {
+                (exec_err!("{} overflow on negative({})", $type_name, $v)
+                    as Result<(), _>)
+                    .unwrap_err()
+            })?
+        } else {
+            $v.wrapping_neg()
+        };
+        Ok(ColumnarValue::Scalar(ScalarValue::$variant(Some(result))))
+    }};
+}
+
+/// Macro to implement negation for decimal scalar types
+macro_rules! impl_decimal_scalar_negative {
+    ($v:expr, $precision:expr, $scale:expr, $type_name:expr, $variant:ident, $enable_ansi_mode:expr) => {{
+        let result = if $enable_ansi_mode {
+            $v.checked_neg().ok_or_else(|| {
+                (exec_err!("{} overflow on negative({})", $type_name, $v)
+                    as Result<(), _>)
+                    .unwrap_err()
+            })?
+        } else {
+            $v.wrapping_neg()
+        };
+        Ok(ColumnarValue::Scalar(ScalarValue::$variant(
+            Some(result),
+            *$precision,
+            *$scale,
+        )))
+    }};
+}
+
+/// Core implementation of Spark's negative function
+fn spark_negative(
+    args: &[ColumnarValue],
+    enable_ansi_mode: bool,
+) -> Result<ColumnarValue> {
+    let [arg] = take_function_args("negative", args)?;
+
+    match arg {
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Null => Ok(arg.clone()),
+
+            // Signed integers - use checked negation in ANSI mode, wrapping in legacy mode
+            DataType::Int8 => {
+                impl_integer_array_negative!(array, Int8Type, "Int8", enable_ansi_mode)
+            }
+            DataType::Int16 => {
+                impl_integer_array_negative!(array, Int16Type, "Int16", enable_ansi_mode)
+            }
+            DataType::Int32 => {
+                impl_integer_array_negative!(array, Int32Type, "Int32", enable_ansi_mode)
+            }
+            DataType::Int64 => {
+                impl_integer_array_negative!(array, Int64Type, "Int64", enable_ansi_mode)
+            }
+
+            // Floating point - simple negation (no overflow possible)
+            DataType::Float16 => impl_float_array_negative!(array, Float16Type),
+            DataType::Float32 => impl_float_array_negative!(array, Float32Type),
+            DataType::Float64 => impl_float_array_negative!(array, Float64Type),
+
+            // Decimal types - use checked negation in ANSI mode, wrapping in legacy mode
+            DataType::Decimal32(_, _) => impl_decimal_array_negative!(
+                array,
+                Decimal32Type,
+                "Decimal32",
+                enable_ansi_mode
+            ),
+            DataType::Decimal64(_, _) => impl_decimal_array_negative!(
+                array,
+                Decimal64Type,
+                "Decimal64",
+                enable_ansi_mode
+            ),
+            DataType::Decimal128(_, _) => impl_decimal_array_negative!(
+                array,
+                Decimal128Type,
+                "Decimal128",
+                enable_ansi_mode
+            ),
+            DataType::Decimal256(_, _) => impl_decimal_array_negative!(
+                array,
+                Decimal256Type,
+                "Decimal256",
+                enable_ansi_mode
+            ),
+
+            // interval type - use checked negation in ANSI mode, wrapping in legacy mode
+            DataType::Interval(IntervalUnit::YearMonth) => {
+                impl_integer_array_negative!(
+                    array,
+                    IntervalYearMonthType,
+                    "IntervalYearMonth",
+                    enable_ansi_mode
+                )
+            }
+            DataType::Interval(IntervalUnit::DayTime) => {
+                let array = array.as_primitive::<IntervalDayTimeType>();
+                let result: PrimitiveArray<IntervalDayTimeType> = if enable_ansi_mode {
+                    array.try_unary(|x| {
+                        let days = x.days.checked_neg().ok_or_else(|| {
+                            (exec_err!(
+                                "IntervalDayTime overflow on negative (days: {})",
+                                x.days
+                            ) as Result<(), _>)
+                                .unwrap_err()
+                        })?;
+                        let milliseconds =
+                            x.milliseconds.checked_neg().ok_or_else(|| {
+                                (exec_err!(
+                                "IntervalDayTime overflow on negative (milliseconds: {})",
+                                x.milliseconds
+                            ) as Result<(), _>)
+                                .unwrap_err()
+                            })?;
+                        Ok::<_, arrow::error::ArrowError>(IntervalDayTime {
+                            days,
+                            milliseconds,
+                        })
+                    })?
+                } else {
+                    array.unary(|x| IntervalDayTime {
+                        days: x.days.wrapping_neg(),
+                        milliseconds: x.milliseconds.wrapping_neg(),
+                    })
+                };
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            DataType::Interval(IntervalUnit::MonthDayNano) => {
+                let array = array.as_primitive::<IntervalMonthDayNanoType>();
+                let result: PrimitiveArray<IntervalMonthDayNanoType> = if enable_ansi_mode
+                {
+                    array.try_unary(|x| {
+                        let months = x.months.checked_neg().ok_or_else(|| {
+                            (exec_err!(
+                                "IntervalMonthDayNano overflow on negative (months: {})",
+                                x.months
+                            ) as Result<(), _>)
+                                .unwrap_err()
+                        })?;
+                        let days = x.days.checked_neg().ok_or_else(|| {
+                            (exec_err!(
+                                "IntervalMonthDayNano overflow on negative (days: {})",
+                                x.days
+                            ) as Result<(), _>)
+                                .unwrap_err()
+                        })?;
+                        let nanoseconds = x.nanoseconds.checked_neg().ok_or_else(|| {
+                            (exec_err!(
+                                "IntervalMonthDayNano overflow on negative (nanoseconds: {})",
+                                x.nanoseconds
+                            ) as Result<(), _>)
+                                .unwrap_err()
+                        })?;
+                        Ok::<_, arrow::error::ArrowError>(IntervalMonthDayNano {
+                            months,
+                            days,
+                            nanoseconds,
+                        })
+                    })?
+                } else {
+                    array.unary(|x| IntervalMonthDayNano {
+                        months: x.months.wrapping_neg(),
+                        days: x.days.wrapping_neg(),
+                        nanoseconds: x.nanoseconds.wrapping_neg(),
+                    })
+                };
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+
+            dt => not_impl_err!("Not supported datatype for Spark negative(): {dt}"),
+        },
+        ColumnarValue::Scalar(sv) => match sv {
+            ScalarValue::Null => Ok(arg.clone()),
+            _ if sv.is_null() => Ok(arg.clone()),
+
+            // Signed integers - use checked negation in ANSI mode, wrapping in legacy mode
+            ScalarValue::Int8(Some(v)) => {
+                impl_integer_scalar_negative!(v, "Int8", Int8, enable_ansi_mode)
+            }
+            ScalarValue::Int16(Some(v)) => {
+                impl_integer_scalar_negative!(v, "Int16", Int16, enable_ansi_mode)
+            }
+            ScalarValue::Int32(Some(v)) => {
+                impl_integer_scalar_negative!(v, "Int32", Int32, enable_ansi_mode)
+            }
+            ScalarValue::Int64(Some(v)) => {
+                impl_integer_scalar_negative!(v, "Int64", Int64, enable_ansi_mode)
+            }
+
+            // Floating point - simple negation
+            ScalarValue::Float16(Some(v)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Float16(Some(-v))))
+            }
+            ScalarValue::Float32(Some(v)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Float32(Some(-v))))
+            }
+            ScalarValue::Float64(Some(v)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(-v))))
+            }
+
+            // Decimal types - use checked negation in ANSI mode, wrapping in legacy mode
+            ScalarValue::Decimal32(Some(v), precision, scale) => {
+                impl_decimal_scalar_negative!(
+                    v,
+                    precision,
+                    scale,
+                    "Decimal32",
+                    Decimal32,
+                    enable_ansi_mode
+                )
+            }
+            ScalarValue::Decimal64(Some(v), precision, scale) => {
+                impl_decimal_scalar_negative!(
+                    v,
+                    precision,
+                    scale,
+                    "Decimal64",
+                    Decimal64,
+                    enable_ansi_mode
+                )
+            }
+            ScalarValue::Decimal128(Some(v), precision, scale) => {
+                impl_decimal_scalar_negative!(
+                    v,
+                    precision,
+                    scale,
+                    "Decimal128",
+                    Decimal128,
+                    enable_ansi_mode
+                )
+            }
+            ScalarValue::Decimal256(Some(v), precision, scale) => {
+                impl_decimal_scalar_negative!(
+                    v,
+                    precision,
+                    scale,
+                    "Decimal256",
+                    Decimal256,
+                    enable_ansi_mode
+                )
+            }
+
+            //interval type - use checked negation in ANSI mode, wrapping in legacy mode
+            ScalarValue::IntervalYearMonth(Some(v)) => {
+                impl_integer_scalar_negative!(
+                    v,
+                    "IntervalYearMonth",
+                    IntervalYearMonth,
+                    enable_ansi_mode
+                )
+            }
+            ScalarValue::IntervalDayTime(Some(v)) => {
+                let result = if enable_ansi_mode {
+                    let days = v.days.checked_neg().ok_or_else(|| {
+                        (exec_err!(
+                            "IntervalDayTime overflow on negative (days: {})",
+                            v.days
+                        ) as Result<(), _>)
+                            .unwrap_err()
+                    })?;
+                    let milliseconds = v.milliseconds.checked_neg().ok_or_else(|| {
+                        (exec_err!(
+                            "IntervalDayTime overflow on negative (milliseconds: {})",
+                            v.milliseconds
+                        ) as Result<(), _>)
+                            .unwrap_err()
+                    })?;
+                    IntervalDayTime { days, milliseconds }
+                } else {
+                    IntervalDayTime {
+                        days: v.days.wrapping_neg(),
+                        milliseconds: v.milliseconds.wrapping_neg(),
+                    }
+                };
+                Ok(ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(
+                    result,
+                ))))
+            }
+            ScalarValue::IntervalMonthDayNano(Some(v)) => {
+                let result = if enable_ansi_mode {
+                    let months = v.months.checked_neg().ok_or_else(|| {
+                        (exec_err!(
+                            "IntervalMonthDayNano overflow on negative (months: {})",
+                            v.months
+                        ) as Result<(), _>)
+                            .unwrap_err()
+                    })?;
+                    let days = v.days.checked_neg().ok_or_else(|| {
+                        (exec_err!(
+                            "IntervalMonthDayNano overflow on negative (days: {})",
+                            v.days
+                        ) as Result<(), _>)
+                            .unwrap_err()
+                    })?;
+                    let nanoseconds = v.nanoseconds.checked_neg().ok_or_else(|| {
+                        (exec_err!(
+                            "IntervalMonthDayNano overflow on negative (nanoseconds: {})",
+                            v.nanoseconds
+                        ) as Result<(), _>)
+                            .unwrap_err()
+                    })?;
+                    IntervalMonthDayNano {
+                        months,
+                        days,
+                        nanoseconds,
+                    }
+                } else {
+                    IntervalMonthDayNano {
+                        months: v.months.wrapping_neg(),
+                        days: v.days.wrapping_neg(),
+                        nanoseconds: v.nanoseconds.wrapping_neg(),
+                    }
+                };
+                Ok(ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(
+                    Some(result),
+                )))
+            }
+
+            dt => not_impl_err!("Not supported datatype for Spark negative(): {dt}"),
+        },
+    }
+}
diff --git a/datafusion/spark/src/function/math/rint.rs b/datafusion/spark/src/function/math/rint.rs
new file mode 100644
index 0000000000000..ae1a25110ac89
--- /dev/null
+++ b/datafusion/spark/src/function/math/rint.rs
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{Array, ArrayRef, AsArray};
+use arrow::compute::cast;
+use arrow::datatypes::DataType::{
+    Float32, Float64, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64,
+};
+use arrow::datatypes::{DataType, Float32Type, Float64Type};
+use datafusion_common::{Result, assert_eq_or_internal_err, exec_err};
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkRint {
+    signature: Signature,
+}
+
+impl Default for SparkRint {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkRint {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::numeric(1, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkRint {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "rint"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Float64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_rint, vec![])(&args.args)
+    }
+
+    fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
+        // round preserves the order of the first argument
+        if input.len() == 1 {
+            let value = &input[0];
+            Ok(value.sort_properties)
+        } else {
+            Ok(SortProperties::default())
+        }
+    }
+}
+
+pub fn spark_rint(args: &[ArrayRef]) -> Result<ArrayRef> {
+    assert_eq_or_internal_err!(args.len(), 1, "`rint` expects exactly one argument");
+
+    let array: &dyn Array = args[0].as_ref();
+    match args[0].data_type() {
+        Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 => {
+            Ok(cast(array, &Float64)?)
+        }
+        Float64 => {
+            let array = array
+                .as_primitive::<Float64Type>()
+                .unary::<_, Float64Type>(|value: f64| value.round_ties_even());
+            Ok(Arc::new(array))
+        }
+        Float32 => {
+            let array = array
+                .as_primitive::<Float32Type>()
+                .unary::<_, Float64Type>(|value: f32| value.round_ties_even() as f64);
+            Ok(Arc::new(array))
+        }
+        _ => {
+            exec_err!(
+                "rint expects a numeric argument, got {}",
+                args[0].data_type()
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::Float64Array;
+
+    #[test]
+    fn test_rint_positive_decimals() {
+        // Test positive decimal rounding
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![12.3456]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![12.0]));
+
+        // Test rounding to nearest even (banker's rounding)
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![2.5]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![2.0]));
+
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![3.5]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![4.0]));
+    }
+
+    #[test]
+    fn test_rint_negative_decimals() {
+        // Test negative decimal rounding
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![-12.3456]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![-12.0]));
+
+        // Test negative rounding to nearest even
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![-2.5]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![-2.0]));
+    }
+
+    #[test]
+    fn test_rint_integers() {
+        // Test integer input (should return as float64)
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![42.0]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![42.0]));
+    }
+
+    #[test]
+    fn test_rint_null() {
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![None]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![None]));
+    }
+
+    #[test]
+    fn test_rint_zero() {
+        // Test zero
+        let result = spark_rint(&[Arc::new(Float64Array::from(vec![0.0]))]).unwrap();
+        assert_eq!(result.as_ref(), &Float64Array::from(vec![0.0]));
+    }
+}
diff --git a/datafusion/spark/src/function/math/trigonometry.rs b/datafusion/spark/src/function/math/trigonometry.rs
new file mode 100644
index 0000000000000..85b10f5b998c6
--- /dev/null
+++ b/datafusion/spark/src/function/math/trigonometry.rs
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::function::error_utils::unsupported_data_type_exec_err;
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use std::any::Any;
+use std::sync::Arc;
+
+static CSC_FUNCTION_NAME: &str = "csc";
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#csc>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkCsc {
+    signature: Signature,
+}
+
+impl Default for SparkCsc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkCsc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkCsc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        CSC_FUNCTION_NAME
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [arg] = take_function_args(self.name(), &args.args)?;
+        spark_csc(arg)
+    }
+}
+
+fn spark_csc(arg: &ColumnarValue) -> Result<ColumnarValue> {
+    match arg {
+        ColumnarValue::Scalar(ScalarValue::Float64(value)) => Ok(ColumnarValue::Scalar(
+            ScalarValue::Float64(value.map(|x| 1.0 / x.sin())),
+        )),
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Float64 => Ok(ColumnarValue::Array(Arc::new(
+                array
+                    .as_primitive::<Float64Type>()
+                    .unary::<_, Float64Type>(|x| 1.0 / x.sin()),
+            ) as ArrayRef)),
+            other => Err(unsupported_data_type_exec_err(
+                CSC_FUNCTION_NAME,
+                format!("{}", DataType::Float64).as_str(),
+                other,
+            )),
+        },
+        other => Err(unsupported_data_type_exec_err(
+            CSC_FUNCTION_NAME,
+            format!("{}", DataType::Float64).as_str(),
+            &other.data_type(),
+        )),
+    }
+}
+
+static SEC_FUNCTION_NAME: &str = "sec";
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#sec>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSec {
+    signature: Signature,
+}
+
+impl Default for SparkSec {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSec {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSec {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        SEC_FUNCTION_NAME
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [arg] = take_function_args(self.name(), &args.args)?;
+        spark_sec(arg)
+    }
+}
+
+fn spark_sec(arg: &ColumnarValue) -> Result<ColumnarValue> {
+    match arg {
+        ColumnarValue::Scalar(ScalarValue::Float64(value)) => Ok(ColumnarValue::Scalar(
+            ScalarValue::Float64(value.map(|x| 1.0 / x.cos())),
+        )),
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Float64 => Ok(ColumnarValue::Array(Arc::new(
+                array
+                    .as_primitive::<Float64Type>()
+                    .unary::<_, Float64Type>(|x| 1.0 / x.cos()),
+            ) as ArrayRef)),
+            other => Err(unsupported_data_type_exec_err(
+                SEC_FUNCTION_NAME,
+                format!("{}", DataType::Float64).as_str(),
+                other,
+            )),
+        },
+        other => Err(unsupported_data_type_exec_err(
+            SEC_FUNCTION_NAME,
+            format!("{}", DataType::Float64).as_str(),
+            &other.data_type(),
+        )),
+    }
+}
diff --git a/datafusion/spark/src/function/math/unhex.rs b/datafusion/spark/src/function/math/unhex.rs
new file mode 100644
index 0000000000000..dee532d818f83
--- /dev/null
+++ b/datafusion/spark/src/function/math/unhex.rs
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, BinaryBuilder};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::types::logical_string;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignatureClass, Volatility,
+};
+use std::any::Any;
+use std::sync::Arc;
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#unhex>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkUnhex {
+    signature: Signature,
+}
+
+impl Default for SparkUnhex {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkUnhex {
+    pub fn new() -> Self {
+        let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
+
+        Self {
+            signature: Signature::coercible(vec![string], Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkUnhex {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "unhex"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Binary)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_unhex(&args.args)
+    }
+}
+
+#[inline]
+fn hex_nibble(c: u8) -> Option<u8> {
+    match c {
+        b'0'..=b'9' => Some(c - b'0'),
+        b'a'..=b'f' => Some(c - b'a' + 10),
+        b'A'..=b'F' => Some(c - b'A' + 10),
+        _ => None,
+    }
+}
+
+/// Decodes a hex-encoded byte slice into binary data.
+/// Returns `true` if decoding succeeded, `false` if the input contains invalid hex characters.
+fn unhex_common(bytes: &[u8], out: &mut Vec<u8>) -> bool {
+    if bytes.is_empty() {
+        return true;
+    }
+
+    let mut i = 0usize;
+
+    // If the hex string length is odd, implicitly left-pad with '0'.
+    if (bytes.len() & 1) == 1 {
+        match hex_nibble(bytes[0]) {
+            // Equivalent to (0 << 4) | lo
+            Some(lo) => out.push(lo),
+            None => return false,
+        }
+        i = 1;
+    }
+
+    while i + 1 < bytes.len() {
+        match (hex_nibble(bytes[i]), hex_nibble(bytes[i + 1])) {
+            (Some(hi), Some(lo)) => out.push((hi << 4) | lo),
+            _ => return false,
+        }
+        i += 2;
+    }
+
+    true
+}
+
+/// Converts an iterator of hex strings to a binary array.
+fn unhex_array<I, T>(
+    iter: I,
+    len: usize,
+    capacity: usize,
+) -> Result<ArrayRef, DataFusionError>
+where
+    I: Iterator<Item = Option<T>>,
+    T: AsRef<str>,
+{
+    let mut builder = BinaryBuilder::with_capacity(len, capacity);
+    let mut buffer = Vec::new();
+
+    for v in iter {
+        if let Some(s) = v {
+            buffer.clear();
+            buffer.reserve(s.as_ref().len().div_ceil(2));
+            if unhex_common(s.as_ref().as_bytes(), &mut buffer) {
+                builder.append_value(&buffer);
+            } else {
+                builder.append_null();
+            }
+        } else {
+            builder.append_null();
+        }
+    }
+
+    Ok(Arc::new(builder.finish()))
+}
+
+/// Convert a single hex string to binary
+fn unhex_scalar(s: &str) -> Option<Vec<u8>> {
+    let mut buffer = Vec::with_capacity(s.len().div_ceil(2));
+    if unhex_common(s.as_bytes(), &mut buffer) {
+        Some(buffer)
+    } else {
+        None
+    }
+}
+
+fn spark_unhex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
+    let [args] = take_function_args("unhex", args)?;
+
+    match args {
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Utf8 => {
+                let array = as_string_array(array)?;
+                let capacity = array.values().len().div_ceil(2);
+                Ok(ColumnarValue::Array(unhex_array(
+                    array.iter(),
+                    array.len(),
+                    capacity,
+                )?))
+            }
+            DataType::Utf8View => {
+                let array = as_string_view_array(array)?;
+                // Estimate capacity since StringViewArray data can be scattered or inlined.
+                let capacity = array.len() * 32;
+                Ok(ColumnarValue::Array(unhex_array(
+                    array.iter(),
+                    array.len(),
+                    capacity,
+                )?))
+            }
+            DataType::LargeUtf8 => {
+                let array = as_large_string_array(array)?;
+                let capacity = array.values().len().div_ceil(2);
+                Ok(ColumnarValue::Array(unhex_array(
+                    array.iter(),
+                    array.len(),
+                    capacity,
+                )?))
+            }
+            _ => exec_err!(
+                "unhex only supports string argument, but got: {}",
+                array.data_type()
+            ),
+        },
+        ColumnarValue::Scalar(sv) => match sv {
+            ScalarValue::Utf8(None)
+            | ScalarValue::Utf8View(None)
+            | ScalarValue::LargeUtf8(None) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Binary(None)))
+            }
+            ScalarValue::Utf8(Some(s))
+            | ScalarValue::Utf8View(Some(s))
+            | ScalarValue::LargeUtf8(Some(s)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Binary(unhex_scalar(s))))
+            }
+            _ => {
+                exec_err!(
+                    "unhex only supports string argument, but got: {}",
+                    sv.data_type()
+                )
+            }
+        },
+    }
+}
diff --git a/datafusion/spark/src/function/math/width_bucket.rs b/datafusion/spark/src/function/math/width_bucket.rs
new file mode 100644
index 0000000000000..905c108197906
--- /dev/null
+++ b/datafusion/spark/src/function/math/width_bucket.rs
@@ -0,0 +1,787 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, DurationMicrosecondArray, Float64Array, IntervalMonthDayNanoArray,
+    IntervalYearMonthArray,
+};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::{Duration, Float64, Int32, Interval};
+use arrow::datatypes::IntervalUnit::{MonthDayNano, YearMonth};
+use datafusion_common::cast::{
+    as_duration_microsecond_array, as_float64_array, as_int64_array,
+    as_interval_mdn_array, as_interval_ym_array,
+};
+use datafusion_common::types::{
+    NativeType, logical_duration_microsecond, logical_float64, logical_int64,
+    logical_interval_mdn, logical_interval_year_month,
+};
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    TypeSignatureClass,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+use arrow::array::{Int32Array, Int32Builder, Int64Array};
+use arrow::datatypes::TimeUnit::Microsecond;
+use datafusion_expr::Coercion;
+use datafusion_expr::Volatility::Immutable;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkWidthBucket {
+    signature: Signature,
+}
+
+impl Default for SparkWidthBucket {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkWidthBucket {
+    pub fn new() -> Self {
+        let numeric = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_float64()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::Float64,
+        );
+        let duration = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_duration_microsecond()),
+            vec![TypeSignatureClass::Duration],
+            NativeType::Duration(Microsecond),
+        );
+        let interval_ym = Coercion::new_exact(TypeSignatureClass::Native(
+            logical_interval_year_month(),
+        ));
+        let interval_mdn =
+            Coercion::new_exact(TypeSignatureClass::Native(logical_interval_mdn()));
+        let bucket = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Integer],
+            NativeType::Int64,
+        );
+        let type_signature = Signature::one_of(
+            vec![
+                TypeSignature::Coercible(vec![
+                    numeric.clone(),
+                    numeric.clone(),
+                    numeric.clone(),
+                    bucket.clone(),
+                ]),
+                TypeSignature::Coercible(vec![
+                    duration.clone(),
+                    duration.clone(),
+                    duration.clone(),
+                    bucket.clone(),
+                ]),
+                TypeSignature::Coercible(vec![
+                    interval_ym.clone(),
+                    interval_ym.clone(),
+                    interval_ym.clone(),
+                    bucket.clone(),
+                ]),
+                TypeSignature::Coercible(vec![
+                    interval_mdn.clone(),
+                    interval_mdn.clone(),
+                    interval_mdn.clone(),
+                    bucket.clone(),
+                ]),
+            ],
+            Immutable,
+        )
+        .with_parameter_names(vec!["expr", "min", "max", "num_buckets"])
+        .expect("valid parameter names");
+        Self {
+            signature: type_signature,
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkWidthBucket {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "width_bucket"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int32)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(width_bucket_kern, vec![])(&args.args)
+    }
+
+    fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
+        if input.len() == 1 {
+            let value = &input[0];
+            Ok(value.sort_properties)
+        } else {
+            Ok(SortProperties::default())
+        }
+    }
+}
+
+fn width_bucket_kern(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [v, minv, maxv, nb] = args else {
+        return exec_err!(
+            "width_bucket expects exactly 4 argument, got {}",
+            args.len()
+        );
+    };
+
+    match v.data_type() {
+        Float64 => {
+            let v = as_float64_array(v)?;
+            let min = as_float64_array(minv)?;
+            let max = as_float64_array(maxv)?;
+            let n_bucket = as_int64_array(nb)?;
+            Ok(Arc::new(width_bucket_float64(v, min, max, n_bucket)))
+        }
+        Duration(Microsecond) => {
+            let v = as_duration_microsecond_array(v)?;
+            let min = as_duration_microsecond_array(minv)?;
+            let max = as_duration_microsecond_array(maxv)?;
+            let n_bucket = as_int64_array(nb)?;
+            Ok(Arc::new(width_bucket_i64_as_float(v, min, max, n_bucket)))
+        }
+        Interval(YearMonth) => {
+            let v = as_interval_ym_array(v)?;
+            let min = as_interval_ym_array(minv)?;
+            let max = as_interval_ym_array(maxv)?;
+            let n_bucket = as_int64_array(nb)?;
+            Ok(Arc::new(width_bucket_i32_as_float(v, min, max, n_bucket)))
+        }
+        Interval(MonthDayNano) => {
+            let v = as_interval_mdn_array(v)?;
+            let min = as_interval_mdn_array(minv)?;
+            let max = as_interval_mdn_array(maxv)?;
+            let n_bucket = as_int64_array(nb)?;
+            Ok(Arc::new(width_bucket_interval_mdn_exact(
+                v, min, max, n_bucket,
+            )))
+        }
+
+        other => internal_err!(
+            "width_bucket received unexpected data types: {:?}, {:?}, {:?}, {:?}",
+            other,
+            minv.data_type(),
+            maxv.data_type(),
+            nb.data_type()
+        ),
+    }
+}
+
+macro_rules! width_bucket_kernel_impl {
+    ($name:ident, $arr_ty:ty, $to_f64:expr, $check_nan:expr) => {
+        pub(crate) fn $name(
+            v: &$arr_ty,
+            min: &$arr_ty,
+            max: &$arr_ty,
+            n_bucket: &Int64Array,
+        ) -> Int32Array {
+            let len = v.len();
+            let mut b = Int32Builder::with_capacity(len);
+
+            for i in 0..len {
+                if v.is_null(i) || min.is_null(i) || max.is_null(i) || n_bucket.is_null(i)
+                {
+                    b.append_null();
+                    continue;
+                }
+                let x = ($to_f64)(v, i);
+                let l = ($to_f64)(min, i);
+                let h = ($to_f64)(max, i);
+                let buckets = n_bucket.value(i);
+
+                if buckets <= 0 {
+                    b.append_null();
+                    continue;
+                }
+                let next_bucket = (buckets + 1) as i32;
+                if $check_nan {
+                    if !x.is_finite() || !l.is_finite() || !h.is_finite() {
+                        b.append_null();
+                        continue;
+                    }
+                }
+
+                let ord = match l.partial_cmp(&h) {
+                    Some(o) => o,
+                    None => {
+                        b.append_null();
+                        continue;
+                    }
+                };
+                if ord == std::cmp::Ordering::Equal {
+                    b.append_null();
+                    continue;
+                }
+                let asc = ord == std::cmp::Ordering::Less;
+
+                if asc {
+                    if x < l {
+                        b.append_value(0);
+                        continue;
+                    }
+                    if x >= h {
+                        b.append_value(next_bucket);
+                        continue;
+                    }
+                } else {
+                    if x > l {
+                        b.append_value(0);
+                        continue;
+                    }
+                    if x <= h {
+                        b.append_value(next_bucket);
+                        continue;
+                    }
+                }
+
+                let width = (h - l) / (buckets as f64);
+                if width == 0.0 || !width.is_finite() {
+                    b.append_null();
+                    continue;
+                }
+                let mut bucket = ((x - l) / width).floor() as i32 + 1;
+                if bucket < 1 {
+                    bucket = 1;
+                }
+                if bucket > next_bucket {
+                    bucket = next_bucket;
+                }
+
+                b.append_value(bucket);
+            }
+
+            b.finish()
+        }
+    };
+}
+
+width_bucket_kernel_impl!(
+    width_bucket_float64,
+    Float64Array,
+    |arr: &Float64Array, i: usize| arr.value(i),
+    true
+);
+
+width_bucket_kernel_impl!(
+    width_bucket_i64_as_float,
+    DurationMicrosecondArray,
+    |arr: &DurationMicrosecondArray, i: usize| arr.value(i) as f64,
+    false
+);
+
+width_bucket_kernel_impl!(
+    width_bucket_i32_as_float,
+    IntervalYearMonthArray,
+    |arr: &IntervalYearMonthArray, i: usize| arr.value(i) as f64,
+    false
+);
+const NS_PER_DAY_I128: i128 = 86_400_000_000_000;
+pub(crate) fn width_bucket_interval_mdn_exact(
+    v: &IntervalMonthDayNanoArray,
+    lo: &IntervalMonthDayNanoArray,
+    hi: &IntervalMonthDayNanoArray,
+    n: &Int64Array,
+) -> Int32Array {
+    let len = v.len();
+    let mut b = Int32Builder::with_capacity(len);
+
+    for i in 0..len {
+        if v.is_null(i) || lo.is_null(i) || hi.is_null(i) || n.is_null(i) {
+            b.append_null();
+            continue;
+        }
+        let buckets = n.value(i);
+        if buckets <= 0 {
+            b.append_null();
+            continue;
+        }
+        let next_bucket = (buckets + 1) as i32;
+
+        let x = v.value(i);
+        let l = lo.value(i);
+        let h = hi.value(i);
+
+        // asc/desc
+        // Values of IntervalMonthDayNano are compared using their binary representation, which can lead to surprising results.
+        let asc = (l.months, l.days, l.nanoseconds) < (h.months, h.days, h.nanoseconds);
+        if (l.months, l.days, l.nanoseconds) == (h.months, h.days, h.nanoseconds) {
+            b.append_null();
+            continue;
+        }
+
+        // ------------------- only month -------------------
+        if l.days == h.days && l.nanoseconds == h.nanoseconds && l.months != h.months {
+            let x_m = x.months as f64;
+            let l_m = l.months as f64;
+            let h_m = h.months as f64;
+
+            if asc {
+                if x_m < l_m {
+                    b.append_value(0);
+                    continue;
+                }
+                if x_m >= h_m {
+                    b.append_value(next_bucket);
+                    continue;
+                }
+            } else {
+                if x_m > l_m {
+                    b.append_value(0);
+                    continue;
+                }
+                if x_m <= h_m {
+                    b.append_value(next_bucket);
+                    continue;
+                }
+            }
+
+            let width = (h_m - l_m) / (buckets as f64);
+            if width == 0.0 || !width.is_finite() {
+                b.append_null();
+                continue;
+            }
+
+            let mut bucket = ((x_m - l_m) / width).floor() as i32 + 1;
+            if bucket < 1 {
+                bucket = 1;
+            }
+            if bucket > next_bucket {
+                bucket = next_bucket;
+            }
+            b.append_value(bucket);
+            continue;
+        }
+
+        // ---------------  months equals -------------------
+        if l.months == h.months {
+            let base_days = l.days as i128;
+            let base_ns = l.nanoseconds as i128;
+
+            let xf = (x.days as i128 - base_days) * NS_PER_DAY_I128
+                + (x.nanoseconds as i128 - base_ns);
+            let hf = (h.days as i128 - base_days) * NS_PER_DAY_I128
+                + (h.nanoseconds as i128 - base_ns);
+
+            let x_f = xf as f64;
+            let l_f = 0.0;
+            let h_f = hf as f64;
+
+            if asc {
+                if x_f < l_f {
+                    b.append_value(0);
+                    continue;
+                }
+                if x_f >= h_f {
+                    b.append_value(next_bucket);
+                    continue;
+                }
+            } else {
+                if x_f > l_f {
+                    b.append_value(0);
+                    continue;
+                }
+                if x_f <= h_f {
+                    b.append_value(next_bucket);
+                    continue;
+                }
+            }
+
+            let width = (h_f - l_f) / (buckets as f64);
+            if width == 0.0 || !width.is_finite() {
+                b.append_null();
+                continue;
+            }
+
+            let mut bucket = ((x_f - l_f) / width).floor() as i32 + 1;
+            if bucket < 1 {
+                bucket = 1;
+            }
+            if bucket > next_bucket {
+                bucket = next_bucket;
+            }
+            b.append_value(bucket);
+            continue;
+        }
+
+        b.append_null();
+    }
+
+    b.finish()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+
+    use arrow::array::{
+        ArrayRef, DurationMicrosecondArray, Float64Array, Int32Array, Int64Array,
+        IntervalYearMonthArray,
+    };
+    use arrow::datatypes::IntervalMonthDayNano;
+
+    // --- Helpers -------------------------------------------------------------
+
+    fn i64_array_all(len: usize, val: i64) -> Arc<Int64Array> {
+        Arc::new(Int64Array::from(vec![val; len]))
+    }
+
+    fn f64_array(vals: &[f64]) -> Arc<Float64Array> {
+        Arc::new(Float64Array::from(vals.to_vec()))
+    }
+
+    fn f64_array_opt(vals: &[Option<f64>]) -> Arc<Float64Array> {
+        Arc::new(Float64Array::from(vals.to_vec()))
+    }
+
+    fn dur_us_array(vals: &[i64]) -> Arc<DurationMicrosecondArray> {
+        Arc::new(DurationMicrosecondArray::from(vals.to_vec()))
+    }
+
+    fn ym_array(vals: &[i32]) -> Arc<IntervalYearMonthArray> {
+        Arc::new(IntervalYearMonthArray::from(vals.to_vec()))
+    }
+
+    fn downcast_i32(arr: &ArrayRef) -> &Int32Array {
+        arr.as_any().downcast_ref::<Int32Array>().unwrap()
+    }
+
+    fn mdn_array(vals: &[(i32, i32, i64)]) -> Arc<IntervalMonthDayNanoArray> {
+        let data: Vec<IntervalMonthDayNano> = vals
+            .iter()
+            .map(|(m, d, ns)| IntervalMonthDayNano::new(*m, *d, *ns))
+            .collect();
+        Arc::new(IntervalMonthDayNanoArray::from(data))
+    }
+
+    // --- Float64 -------------------------------------------------------------
+
+    #[test]
+    fn test_width_bucket_f64_basic() {
+        let v = f64_array(&[0.5, 1.0, 9.9, -1.0, 10.0]);
+        let lo = f64_array(&[0.0, 0.0, 0.0, 0.0, 0.0]);
+        let hi = f64_array(&[10.0, 10.0, 10.0, 10.0, 10.0]);
+        let n = i64_array_all(5, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[1, 2, 10, 0, 11]);
+    }
+
+    #[test]
+    fn test_width_bucket_f64_descending_range() {
+        let v = f64_array(&[9.9, 10.0, 0.0, -0.1, 10.1]);
+        let lo = f64_array(&[10.0; 5]);
+        let hi = f64_array(&[0.0; 5]);
+        let n = i64_array_all(5, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+
+        assert_eq!(out.values(), &[1, 1, 11, 11, 0]);
+    }
+    #[test]
+    fn test_width_bucket_f64_bounds_inclusive_exclusive_asc() {
+        let v = f64_array(&[0.0, 9.999999999, 10.0]);
+        let lo = f64_array(&[0.0; 3]);
+        let hi = f64_array(&[10.0; 3]);
+        let n = i64_array_all(3, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[1, 10, 11]);
+    }
+
+    #[test]
+    fn test_width_bucket_f64_bounds_inclusive_exclusive_desc() {
+        let v = f64_array(&[10.0, 0.0, -0.000001]);
+        let lo = f64_array(&[10.0; 3]);
+        let hi = f64_array(&[0.0; 3]);
+        let n = i64_array_all(3, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[1, 11, 11]);
+    }
+
+    #[test]
+    fn test_width_bucket_f64_edge_cases() {
+        let v = f64_array(&[1.0, 5.0, 9.0]);
+        let lo = f64_array(&[0.0, 0.0, 0.0]);
+        let hi = f64_array(&[10.0, 10.0, 10.0]);
+        let n = Arc::new(Int64Array::from(vec![0, -1, 10]));
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+        assert!(out.is_null(1));
+        assert_eq!(out.value(2), 10);
+
+        let v = f64_array(&[1.0]);
+        let lo = f64_array(&[5.0]);
+        let hi = f64_array(&[5.0]);
+        let n = i64_array_all(1, 10);
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+
+        let v = f64_array_opt(&[Some(f64::NAN)]);
+        let lo = f64_array(&[0.0]);
+        let hi = f64_array(&[10.0]);
+        let n = i64_array_all(1, 10);
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+    }
+
+    #[test]
+    fn test_width_bucket_f64_nulls_propagate() {
+        let v = f64_array_opt(&[None, Some(1.0), Some(2.0), Some(3.0)]);
+        let lo = f64_array(&[0.0; 4]);
+        let hi = f64_array(&[10.0; 4]);
+        let n = i64_array_all(4, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+        assert_eq!(out.value(1), 2);
+        assert_eq!(out.value(2), 3);
+        assert_eq!(out.value(3), 4);
+
+        let v = f64_array(&[1.0]);
+        let lo = f64_array_opt(&[None]);
+        let hi = f64_array(&[10.0]);
+        let n = i64_array_all(1, 10);
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+    }
+
+    // --- Duration(Microsecond) ----------------------------------------------
+
+    #[test]
+    fn test_width_bucket_duration_us() {
+        let v = dur_us_array(&[1_000_000, 0, -1]);
+        let lo = dur_us_array(&[0, 0, 0]);
+        let hi = dur_us_array(&[2_000_000, 2_000_000, 2_000_000]);
+        let n = i64_array_all(3, 2);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[2, 1, 0]);
+    }
+
+    #[test]
+    fn test_width_bucket_duration_us_equal_bounds() {
+        let v = dur_us_array(&[0]);
+        let lo = dur_us_array(&[1]);
+        let hi = dur_us_array(&[1]);
+        let n = i64_array_all(1, 10);
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        assert!(downcast_i32(&out).is_null(0));
+    }
+
+    // --- Interval(YearMonth) ------------------------------------------------
+
+    #[test]
+    fn test_width_bucket_interval_ym_basic() {
+        let v = ym_array(&[0, 5, 11, 12, 13]);
+        let lo = ym_array(&[0; 5]);
+        let hi = ym_array(&[12; 5]);
+        let n = i64_array_all(5, 12);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[1, 6, 12, 13, 13]);
+    }
+
+    #[test]
+    fn test_width_bucket_interval_ym_desc() {
+        let v = ym_array(&[11, 12, 0, -1, 13]);
+        let lo = ym_array(&[12; 5]);
+        let hi = ym_array(&[0; 5]);
+        let n = i64_array_all(5, 12);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[2, 1, 13, 13, 0]);
+    }
+
+    // --- Interval(MonthDayNano) --------------------------------------------
+
+    #[test]
+    fn test_width_bucket_interval_mdn_months_only_basic() {
+        let v = mdn_array(&[(0, 0, 0), (5, 0, 0), (11, 0, 0), (12, 0, 0), (13, 0, 0)]);
+        let lo = mdn_array(&[(0, 0, 0); 5]);
+        let hi = mdn_array(&[(12, 0, 0); 5]);
+        let n = i64_array_all(5, 12);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert_eq!(out.values(), &[1, 6, 12, 13, 13]);
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_months_only_desc() {
+        let v = mdn_array(&[(11, 0, 0), (12, 0, 0), (0, 0, 0), (-1, 0, 0), (13, 0, 0)]);
+        let lo = mdn_array(&[(12, 0, 0); 5]);
+        let hi = mdn_array(&[(0, 0, 0); 5]);
+        let n = i64_array_all(5, 12);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        // Mismo patrón que YM descendente
+        assert_eq!(out.values(), &[2, 1, 13, 13, 0]);
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_day_nano_basic() {
+        let v = mdn_array(&[
+            (0, 0, 0),
+            (0, 5, 0),
+            (0, 9, 0),
+            (0, 10, 0),
+            (0, -1, 0),
+            (0, 11, 0),
+        ]);
+        let lo = mdn_array(&[(0, 0, 0); 6]);
+        let hi = mdn_array(&[(0, 10, 0); 6]);
+        let n = i64_array_all(6, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        // x==hi -> n+1, x<lo -> 0, x>hi -> n+1
+        assert_eq!(out.values(), &[1, 6, 10, 11, 0, 11]);
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_day_nano_desc() {
+        let v = mdn_array(&[(0, 9, 0), (0, 10, 0), (0, 0, 0), (0, -1, 0), (0, 11, 0)]);
+        let lo = mdn_array(&[(0, 10, 0); 5]);
+        let hi = mdn_array(&[(0, 0, 0); 5]);
+        let n = i64_array_all(5, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+
+        assert_eq!(out.values(), &[2, 1, 11, 11, 0]);
+    }
+    #[test]
+    fn test_width_bucket_interval_mdn_day_nano_desc_inside() {
+        let v = mdn_array(&[(0, 9, 1), (0, 10, 0), (0, 0, 0), (0, -1, 0), (0, 11, 0)]);
+        let lo = mdn_array(&[(0, 10, 0); 5]);
+        let hi = mdn_array(&[(0, 0, 0); 5]);
+        let n = i64_array_all(5, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+
+        assert_eq!(out.values(), &[1, 1, 11, 11, 0]);
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_mixed_months_and_days_is_null() {
+        let v = mdn_array(&[(0, 1, 0)]);
+        let lo = mdn_array(&[(0, 0, 0)]);
+        let hi = mdn_array(&[(1, 1, 0)]);
+        let n = i64_array_all(1, 4);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_equal_bounds_is_null() {
+        let v = mdn_array(&[(0, 0, 0)]);
+        let lo = mdn_array(&[(1, 2, 3)]);
+        let hi = mdn_array(&[(1, 2, 3)]); // lo == hi
+        let n = i64_array_all(1, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        assert!(downcast_i32(&out).is_null(0));
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_invalid_n_is_null() {
+        let v = mdn_array(&[(0, 0, 0)]);
+        let lo = mdn_array(&[(0, 0, 0)]);
+        let hi = mdn_array(&[(0, 10, 0)]);
+        let n = Arc::new(Int64Array::from(vec![0])); // n <= 0
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        assert!(downcast_i32(&out).is_null(0));
+    }
+
+    #[test]
+    fn test_width_bucket_interval_mdn_nulls_propagate() {
+        let v = Arc::new(IntervalMonthDayNanoArray::from(vec![
+            None,
+            Some(IntervalMonthDayNano::new(0, 5, 0)),
+        ]));
+        let lo = mdn_array(&[(0, 0, 0), (0, 0, 0)]);
+        let hi = mdn_array(&[(0, 10, 0), (0, 10, 0)]);
+        let n = i64_array_all(2, 10);
+
+        let out = width_bucket_kern(&[v, lo, hi, n]).unwrap();
+        let out = downcast_i32(&out);
+        assert!(out.is_null(0));
+        assert_eq!(out.value(1), 6);
+    }
+
+    // --- Errores -------------------------------------------------------------
+
+    #[test]
+    fn test_width_bucket_wrong_arg_count() {
+        let v = f64_array(&[1.0]);
+        let lo = f64_array(&[0.0]);
+        let hi = f64_array(&[10.0]);
+        let err = width_bucket_kern(&[v, lo, hi]).unwrap_err();
+        let msg = format!("{err}");
+        assert!(msg.contains("expects exactly 4"), "unexpected error: {msg}");
+    }
+
+    #[test]
+    fn test_width_bucket_unsupported_type() {
+        let v: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let lo = f64_array(&[0.0, 0.0, 0.0]);
+        let hi = f64_array(&[10.0, 10.0, 10.0]);
+        let n = i64_array_all(3, 10);
+
+        let err = width_bucket_kern(&[v, lo, hi, n]).unwrap_err();
+        let msg = format!("{err}");
+        assert!(
+            msg.contains("width_bucket received unexpected data types"),
+            "unexpected error: {msg}"
+        );
+    }
+}
diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/mod.rs
index dfdd94a040a9f..d5dd60c3545a5 100644
--- a/datafusion/spark/src/function/mod.rs
+++ b/datafusion/spark/src/function/mod.rs
@@ -17,6 +17,7 @@
 
 pub mod aggregate;
 pub mod array;
+pub mod bitmap;
 pub mod bitwise;
 pub mod collection;
 pub mod conditional;
@@ -24,6 +25,7 @@ pub mod conversion;
 pub mod csv;
 pub mod datetime;
 pub mod error_utils;
+pub mod functions_nested_utils;
 pub mod generator;
 pub mod hash;
 pub mod json;
@@ -31,6 +33,7 @@ pub mod lambda;
 pub mod map;
 pub mod math;
 pub mod misc;
+mod null_utils;
 pub mod predicate;
 pub mod string;
 pub mod r#struct;
diff --git a/datafusion/spark/src/function/null_utils.rs b/datafusion/spark/src/function/null_utils.rs
new file mode 100644
index 0000000000000..b25dc07d0e525
--- /dev/null
+++ b/datafusion/spark/src/function/null_utils.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::Array;
+use arrow::buffer::NullBuffer;
+use arrow::datatypes::DataType;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::ColumnarValue;
+use std::sync::Arc;
+
+pub(crate) enum NullMaskResolution {
+    /// Return NULL as the result (e.g., scalar inputs with at least one NULL)
+    ReturnNull,
+    /// No null mask needed (e.g., all scalar inputs are non-NULL)
+    NoMask,
+    /// Null mask to apply for arrays
+    Apply(NullBuffer),
+}
+
+/// Compute NULL mask for the arguments using NullBuffer::union
+pub(crate) fn compute_null_mask(
+    args: &[ColumnarValue],
+    number_rows: usize,
+) -> Result<NullMaskResolution> {
+    // Check if all arguments are scalars
+    let all_scalars = args
+        .iter()
+        .all(|arg| matches!(arg, ColumnarValue::Scalar(_)));
+
+    if all_scalars {
+        // For scalars, check if any is NULL
+        for arg in args {
+            if let ColumnarValue::Scalar(scalar) = arg
+                && scalar.is_null()
+            {
+                return Ok(NullMaskResolution::ReturnNull);
+            }
+        }
+        // No NULLs in scalars
+        Ok(NullMaskResolution::NoMask)
+    } else {
+        // For arrays, compute NULL mask for each row using NullBuffer::union
+        let array_len = args
+            .iter()
+            .find_map(|arg| match arg {
+                ColumnarValue::Array(array) => Some(array.len()),
+                _ => None,
+            })
+            .unwrap_or(number_rows);
+
+        // Convert all scalars to arrays for uniform processing
+        let arrays: Result<Vec<_>> = args
+            .iter()
+            .map(|arg| match arg {
+                ColumnarValue::Array(array) => Ok(Arc::clone(array)),
+                ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(array_len),
+            })
+            .collect();
+        let arrays = arrays?;
+
+        // Use NullBuffer::union to combine all null buffers
+        let combined_nulls = arrays
+            .iter()
+            .map(|arr| arr.nulls())
+            .fold(None, |acc, nulls| NullBuffer::union(acc.as_ref(), nulls));
+
+        match combined_nulls {
+            Some(nulls) => Ok(NullMaskResolution::Apply(nulls)),
+            None => Ok(NullMaskResolution::NoMask),
+        }
+    }
+}
+
+/// Apply NULL mask to the result using NullBuffer::union
+pub(crate) fn apply_null_mask(
+    result: ColumnarValue,
+    null_mask: NullMaskResolution,
+    return_type: &DataType,
+) -> Result<ColumnarValue> {
+    match (result, null_mask) {
+        // Scalar with ReturnNull mask means return NULL of the correct type
+        (ColumnarValue::Scalar(_), NullMaskResolution::ReturnNull) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::try_from(return_type)?))
+        }
+        // Scalar without mask, return as-is
+        (scalar @ ColumnarValue::Scalar(_), NullMaskResolution::NoMask) => Ok(scalar),
+        // Array with NULL mask - use NullBuffer::union to combine nulls
+        (ColumnarValue::Array(array), NullMaskResolution::Apply(null_mask)) => {
+            // Combine the result's existing nulls with our computed null mask
+            let combined_nulls = NullBuffer::union(array.nulls(), Some(&null_mask));
+
+            // Create new array with combined nulls
+            let new_array = array
+                .into_data()
+                .into_builder()
+                .nulls(combined_nulls)
+                .build()?;
+
+            Ok(ColumnarValue::Array(Arc::new(arrow::array::make_array(
+                new_array,
+            ))))
+        }
+        // Array without NULL mask, return as-is
+        (array @ ColumnarValue::Array(_), NullMaskResolution::NoMask) => Ok(array),
+        // Edge cases that shouldn't happen in practice
+        (scalar, _) => Ok(scalar),
+    }
+}
diff --git a/datafusion/spark/src/function/string/ascii.rs b/datafusion/spark/src/function/string/ascii.rs
index c05aa214ccc0c..44e3501b86adb 100644
--- a/datafusion/spark/src/function/string/ascii.rs
+++ b/datafusion/spark/src/function/string/ascii.rs
@@ -15,21 +15,28 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
-use arrow::datatypes::DataType;
-use arrow::error::ArrowError;
-use datafusion_common::{internal_err, plan_err, Result};
-use datafusion_expr::ColumnarValue;
-use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
+};
+use datafusion_functions::string::ascii::ascii;
 use datafusion_functions::utils::make_scalar_function;
 use std::any::Any;
-use std::sync::Arc;
 
-/// <https://spark.apache.org/docs/latest/api/sql/index.html#ascii>
-#[derive(Debug)]
+/// Spark compatible version of the [ascii] function. Differs from the [default ascii function]
+/// in that it is more permissive of input types, for example casting numeric input to string
+/// before executing the function (default version doesn't allow numeric input).
+///
+/// [ascii]: https://spark.apache.org/docs/latest/api/sql/index.html#ascii
+/// [default ascii function]: datafusion_functions::string::ascii::AsciiFunc
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct SparkAscii {
     signature: Signature,
-    aliases: Vec<String>,
 }
 
 impl Default for SparkAscii {
@@ -40,9 +47,17 @@ impl Default for SparkAscii {
 
 impl SparkAscii {
     pub fn new() -> Self {
+        // Spark's ascii uses ImplicitCastInputTypes with StringType,
+        // which allows numeric types to be implicitly cast to String.
+        // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+        let string_coercion = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_string()),
+            vec![TypeSignatureClass::Numeric],
+            NativeType::String,
+        );
+
         Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
+            signature: Signature::coercible(vec![string_coercion], Volatility::Immutable),
         }
     }
 }
@@ -61,114 +76,61 @@ impl ScalarUDFImpl for SparkAscii {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(DataType::Int32)
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        make_scalar_function(ascii, vec![])(&args.args)
-    }
-
-    fn aliases(&self) -> &[String] {
-        &self.aliases
+        internal_err!("return_field_from_args should be used instead")
     }
 
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        if arg_types.len() != 1 {
-            return plan_err!(
-                "The {} function requires 1 argument, but got {}.",
-                self.name(),
-                arg_types.len()
-            );
-        }
-        Ok(vec![DataType::Utf8])
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        // ascii returns an Int32 value
+        // The result is nullable only if any of the input arguments is nullable
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new("ascii", DataType::Int32, nullable)))
     }
-}
-
-fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
-where
-    V: ArrayAccessor<Item = &'a str>,
-{
-    let iter = ArrayIter::new(array);
-    let result = iter
-        .map(|string| {
-            string.map(|s| {
-                let mut chars = s.chars();
-                chars.next().map_or(0, |v| v as i32)
-            })
-        })
-        .collect::<Int32Array>();
 
-    Ok(Arc::new(result) as ArrayRef)
-}
-
-/// Returns the numeric code of the first character of the argument.
-pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        DataType::Utf8 => {
-            let string_array = args[0].as_string::<i32>();
-            Ok(calculate_ascii(string_array)?)
-        }
-        DataType::LargeUtf8 => {
-            let string_array = args[0].as_string::<i64>();
-            Ok(calculate_ascii(string_array)?)
-        }
-        DataType::Utf8View => {
-            let string_array = args[0].as_string_view();
-            Ok(calculate_ascii(string_array)?)
-        }
-        _ => internal_err!("Unsupported data type"),
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(ascii, vec![])(&args.args)
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use crate::function::string::ascii::SparkAscii;
-    use crate::function::utils::test::test_scalar_function;
-    use arrow::array::{Array, Int32Array};
-    use arrow::datatypes::DataType::Int32;
-    use datafusion_common::{Result, ScalarValue};
-    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
-
-    macro_rules! test_ascii_string_invoke {
-        ($INPUT:expr, $EXPECTED:expr) => {
-            test_scalar_function!(
-                SparkAscii::new(),
-                vec![ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
-                $EXPECTED,
-                i32,
-                Int32,
-                Int32Array
-            );
-
-            test_scalar_function!(
-                SparkAscii::new(),
-                vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
-                $EXPECTED,
-                i32,
-                Int32,
-                Int32Array
-            );
-
-            test_scalar_function!(
-                SparkAscii::new(),
-                vec![ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
-                $EXPECTED,
-                i32,
-                Int32,
-                Int32Array
-            );
-        };
+    use super::*;
+    use datafusion_expr::ReturnFieldArgs;
+
+    #[test]
+    fn test_return_field_nullable_input() {
+        let ascii_func = SparkAscii::new();
+        let nullable_field = Arc::new(Field::new("input", DataType::Utf8, true));
+
+        let result = ascii_func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[nullable_field],
+                scalar_arguments: &[],
+            })
+            .unwrap();
+
+        assert_eq!(result.data_type(), &DataType::Int32);
+        assert!(
+            result.is_nullable(),
+            "Output should be nullable when input is nullable"
+        );
     }
 
     #[test]
-    fn test_ascii_invoke() -> Result<()> {
-        test_ascii_string_invoke!(Some(String::from("x")), Ok(Some(120)));
-        test_ascii_string_invoke!(Some(String::from("a")), Ok(Some(97)));
-        test_ascii_string_invoke!(Some(String::from("")), Ok(Some(0)));
-        test_ascii_string_invoke!(Some(String::from("\n")), Ok(Some(10)));
-        test_ascii_string_invoke!(Some(String::from("\t")), Ok(Some(9)));
-        test_ascii_string_invoke!(None, Ok(None));
-
-        Ok(())
+    fn test_return_field_non_nullable_input() {
+        let ascii_func = SparkAscii::new();
+        let non_nullable_field = Arc::new(Field::new("input", DataType::Utf8, false));
+
+        let result = ascii_func
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[non_nullable_field],
+                scalar_arguments: &[],
+            })
+            .unwrap();
+
+        assert_eq!(result.data_type(), &DataType::Int32);
+        assert!(
+            !result.is_nullable(),
+            "Output should not be nullable when input is not nullable"
+        );
     }
 }
diff --git a/datafusion/spark/src/function/string/base64.rs b/datafusion/spark/src/function/string/base64.rs
new file mode 100644
index 0000000000000..a171d4823b0fa
--- /dev/null
+++ b/datafusion/spark/src/function/string/base64.rs
@@ -0,0 +1,183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::datatypes::DataType;
+use datafusion_common::arrow::datatypes::{Field, FieldRef};
+use datafusion_common::types::{NativeType, logical_string};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
+use datafusion_expr::{Coercion, Expr, ReturnFieldArgs, TypeSignatureClass, lit};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::expr_fn::{decode, encode};
+
+/// Apache Spark base64 uses padded base64 encoding.
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#base64>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkBase64 {
+    signature: Signature,
+}
+
+impl Default for SparkBase64 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkBase64 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_implicit(
+                    TypeSignatureClass::Binary,
+                    vec![TypeSignatureClass::Native(logical_string())],
+                    NativeType::Binary,
+                )],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkBase64 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "base64"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_type should not be called for {}", self.name())
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef> {
+        let [bin] = take_function_args(self.name(), args.arg_fields)?;
+        let return_type = match bin.data_type() {
+            DataType::LargeBinary => DataType::LargeUtf8,
+            _ => DataType::Utf8,
+        };
+        Ok(Arc::new(Field::new(
+            self.name(),
+            return_type,
+            bin.is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        exec_err!(
+            "invoke should not be called on a simplified {} function",
+            self.name()
+        )
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [bin] = take_function_args(self.name(), args)?;
+        Ok(ExprSimplifyResult::Simplified(encode(
+            bin,
+            lit("base64pad"),
+        )))
+    }
+}
+
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#unbase64>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkUnBase64 {
+    signature: Signature,
+}
+
+impl Default for SparkUnBase64 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkUnBase64 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::coercible(
+                vec![Coercion::new_implicit(
+                    TypeSignatureClass::Binary,
+                    vec![TypeSignatureClass::Native(logical_string())],
+                    NativeType::Binary,
+                )],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkUnBase64 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "unbase64"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_type should not be called for {}", self.name())
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef> {
+        let [str] = take_function_args(self.name(), args.arg_fields)?;
+        let return_type = match str.data_type() {
+            DataType::LargeBinary => DataType::LargeBinary,
+            _ => DataType::Binary,
+        };
+        Ok(Arc::new(Field::new(
+            self.name(),
+            return_type,
+            str.is_nullable(),
+        )))
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        exec_err!("{} should have been simplified", self.name())
+    }
+
+    fn simplify(
+        &self,
+        args: Vec<Expr>,
+        _info: &SimplifyContext,
+    ) -> Result<ExprSimplifyResult> {
+        let [bin] = take_function_args(self.name(), args)?;
+        Ok(ExprSimplifyResult::Simplified(decode(
+            bin,
+            lit("base64pad"),
+        )))
+    }
+}
diff --git a/datafusion/spark/src/function/string/char.rs b/datafusion/spark/src/function/string/char.rs
index dd6cdc83b30d4..16dfe0943565f 100644
--- a/datafusion/spark/src/function/string/char.rs
+++ b/datafusion/spark/src/function/string/char.rs
@@ -15,35 +15,33 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow::array::ArrayRef;
+use arrow::array::GenericStringBuilder;
+use arrow::datatypes::DataType::Int64;
+use arrow::datatypes::DataType::Utf8;
+use arrow::datatypes::{DataType, Field, FieldRef};
 use std::{any::Any, sync::Arc};
 
-use arrow::{
-    array::{ArrayRef, StringArray},
-    datatypes::{
-        DataType,
-        DataType::{Int64, Utf8},
-    },
-};
-
-use datafusion_common::{cast::as_int64_array, exec_err, Result, ScalarValue};
+use datafusion_common::{Result, ScalarValue, cast::as_int64_array, exec_err};
 use datafusion_expr::{
-    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 
 /// Spark-compatible `char` expression
 /// <https://spark.apache.org/docs/latest/api/sql/index.html#char>
-#[derive(Debug)]
-pub struct SparkChar {
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct CharFunc {
     signature: Signature,
 }
 
-impl Default for SparkChar {
+impl Default for CharFunc {
     fn default() -> Self {
         Self::new()
     }
 }
 
-impl SparkChar {
+impl CharFunc {
     pub fn new() -> Self {
         Self {
             signature: Signature::uniform(1, vec![Int64], Volatility::Immutable),
@@ -51,7 +49,7 @@ impl SparkChar {
     }
 }
 
-impl ScalarUDFImpl for SparkChar {
+impl ScalarUDFImpl for CharFunc {
     fn as_any(&self) -> &dyn Any {
         self
     }
@@ -65,12 +63,19 @@ impl ScalarUDFImpl for SparkChar {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(Utf8)
+        datafusion_common::internal_err!(
+            "return_type should not be called, use return_field_from_args instead"
+        )
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
         spark_chr(&args.args)
     }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(self.name(), Utf8, nullable)))
+    }
 }
 
 /// Returns the ASCII character having the binary equivalent to the input expression.
@@ -106,25 +111,75 @@ fn spark_chr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
 fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
     let integer_array = as_int64_array(&args[0])?;
 
-    // first map is the iterator, second is for the `Option<_>`
-    let result = integer_array
-        .iter()
-        .map(|integer: Option<i64>| {
-            integer
-                .map(|integer| {
-                    if integer < 0 {
-                        return Ok("".to_string()); // Return empty string for negative integers
-                    }
+    let mut builder = GenericStringBuilder::<i32>::with_capacity(
+        integer_array.len(),
+        integer_array.len(),
+    );
+
+    for integer_opt in integer_array {
+        match integer_opt {
+            Some(integer) => {
+                if integer < 0 {
+                    builder.append_value(""); // empty string for negative numbers.
+                } else {
                     match core::char::from_u32((integer % 256) as u32) {
-                        Some(ch) => Ok(ch.to_string()),
+                        Some(ch) => builder.append_value(ch.to_string()),
                         None => {
-                            exec_err!("requested character not compatible for encoding.")
+                            return exec_err!(
+                                "requested character not compatible for encoding."
+                            );
                         }
                     }
-                })
-                .transpose()
-        })
-        .collect::<Result<StringArray>>()?;
+                }
+            }
+            None => builder.append_null(),
+        }
+    }
+
+    Ok(Arc::new(builder.finish()) as ArrayRef)
+}
+
+#[test]
+fn test_char_nullability() -> Result<()> {
+    use arrow::datatypes::{DataType::Utf8, Field, FieldRef};
+    use datafusion_expr::ReturnFieldArgs;
+    use std::sync::Arc;
+
+    let func = CharFunc::new();
+
+    let nullable_field: FieldRef = Arc::new(Field::new("col", Int64, true));
+
+    let out_nullable = func.return_field_from_args(ReturnFieldArgs {
+        arg_fields: &[nullable_field],
+        scalar_arguments: &[None],
+    })?;
+
+    assert!(
+        out_nullable.is_nullable(),
+        "char(col) should be nullable when input column is nullable"
+    );
+    assert_eq!(
+        out_nullable.data_type(),
+        &Utf8,
+        "char always returns Utf8 regardless of input type"
+    );
+
+    let non_nullable_field: FieldRef = Arc::new(Field::new("col", Int64, false));
+
+    let out_non_nullable = func.return_field_from_args(ReturnFieldArgs {
+        arg_fields: &[non_nullable_field],
+        scalar_arguments: &[None],
+    })?;
+
+    assert!(
+        !out_non_nullable.is_nullable(),
+        "char(col) should NOT be nullable when input column is NOT nullable"
+    );
+    assert_eq!(
+        out_non_nullable.data_type(),
+        &Utf8,
+        "char always returns Utf8 regardless of input type"
+    );
 
-    Ok(Arc::new(result) as ArrayRef)
+    Ok(())
 }
diff --git a/datafusion/spark/src/function/string/concat.rs b/datafusion/spark/src/function/string/concat.rs
new file mode 100644
index 0000000000000..b2073690fc446
--- /dev/null
+++ b/datafusion/spark/src/function/string/concat.rs
@@ -0,0 +1,254 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::arrow::datatypes::FieldRef;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::ReturnFieldArgs;
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::string::concat::ConcatFunc;
+use std::any::Any;
+use std::sync::Arc;
+
+use crate::function::null_utils::{
+    NullMaskResolution, apply_null_mask, compute_null_mask,
+};
+
+/// Spark-compatible `concat` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#concat>
+///
+/// Concatenates multiple input strings into a single string.
+/// Returns NULL if any input is NULL.
+///
+/// Differences with DataFusion concat:
+/// - Support 0 arguments
+/// - Return NULL if any input is NULL
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkConcat {
+    signature: Signature,
+}
+
+impl Default for SparkConcat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkConcat {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkConcat {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "concat"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_concat(args)
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        // Accept any string types, including zero arguments
+        Ok(arg_types.to_vec())
+    }
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        datafusion_common::internal_err!(
+            "return_type should not be called for Spark concat"
+        )
+    }
+    fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef> {
+        use DataType::*;
+
+        // Spark semantics: concat returns NULL if ANY input is NULL
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        // Determine return type: Utf8View > LargeUtf8 > Utf8
+        let mut dt = &Utf8;
+        for field in args.arg_fields {
+            let data_type = field.data_type();
+            if data_type == &Utf8View || (data_type == &LargeUtf8 && dt != &Utf8View) {
+                dt = data_type;
+            }
+        }
+
+        Ok(Arc::new(Field::new("concat", dt.clone(), nullable)))
+    }
+}
+
+/// Concatenates strings, returning NULL if any input is NULL
+/// This is a Spark-specific wrapper around DataFusion's concat that returns NULL
+/// if any argument is NULL (Spark behavior), whereas DataFusion's concat ignores NULLs.
+fn spark_concat(args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+    let ScalarFunctionArgs {
+        args: arg_values,
+        arg_fields,
+        number_rows,
+        return_field,
+        config_options,
+    } = args;
+
+    // Handle zero-argument case: return empty string
+    if arg_values.is_empty() {
+        let return_type = return_field.data_type();
+        return match return_type {
+            DataType::Utf8View => Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
+                String::new(),
+            )))),
+            DataType::LargeUtf8 => Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(
+                Some(String::new()),
+            ))),
+            _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
+                Some(String::new()),
+            ))),
+        };
+    }
+
+    // Step 1: Check for NULL mask in incoming args
+    let null_mask = compute_null_mask(&arg_values, number_rows)?;
+
+    // If all scalars and any is NULL, return NULL immediately
+    if matches!(null_mask, NullMaskResolution::ReturnNull) {
+        let return_type = return_field.data_type();
+        return match return_type {
+            DataType::Utf8View => Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(None))),
+            DataType::LargeUtf8 => {
+                Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+            }
+            _ => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
+        };
+    }
+
+    // Step 2: Delegate to DataFusion's concat
+    let concat_func = ConcatFunc::new();
+    let return_type = return_field.data_type().clone();
+    let func_args = ScalarFunctionArgs {
+        args: arg_values,
+        arg_fields,
+        number_rows,
+        return_field,
+        config_options,
+    };
+    let result = concat_func.invoke_with_args(func_args)?;
+
+    // Step 3: Apply NULL mask to result
+    apply_null_mask(result, null_mask, &return_type)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, StringArray};
+    use arrow::datatypes::{DataType, Field};
+    use datafusion_common::Result;
+    use datafusion_expr::ReturnFieldArgs;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_concat_basic() -> Result<()> {
+        test_scalar_function!(
+            SparkConcat::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some("Spark".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some("SQL".to_string()))),
+            ],
+            Ok(Some("SparkSQL")),
+            &str,
+            DataType::Utf8,
+            StringArray
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_concat_with_null() -> Result<()> {
+        test_scalar_function!(
+            SparkConcat::new(),
+            vec![
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some("Spark".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some("SQL".to_string()))),
+                ColumnarValue::Scalar(ScalarValue::Utf8(None)),
+            ],
+            Ok(None),
+            &str,
+            DataType::Utf8,
+            StringArray
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_concat_return_field_non_nullable() -> Result<()> {
+        let func = SparkConcat::new();
+
+        let fields = vec![
+            Arc::new(Field::new("a", DataType::Utf8, false)),
+            Arc::new(Field::new("b", DataType::Utf8, false)),
+        ];
+
+        let args = ReturnFieldArgs {
+            arg_fields: &fields,
+            scalar_arguments: &[],
+        };
+
+        let field = func.return_field_from_args(args)?;
+
+        assert!(
+            !field.is_nullable(),
+            "Expected concat result to be non-nullable when all inputs are non-nullable"
+        );
+
+        Ok(())
+    }
+    #[test]
+    fn test_spark_concat_return_field_nullable() -> Result<()> {
+        let func = SparkConcat::new();
+
+        let fields = vec![
+            Arc::new(Field::new("a", DataType::Utf8, false)),
+            Arc::new(Field::new("b", DataType::Utf8, true)),
+        ];
+
+        let args = ReturnFieldArgs {
+            arg_fields: &fields,
+            scalar_arguments: &[],
+        };
+
+        let field = func.return_field_from_args(args)?;
+
+        assert!(
+            field.is_nullable(),
+            "Expected concat result to be nullable when any input is nullable"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/string/elt.rs b/datafusion/spark/src/function/string/elt.rs
new file mode 100644
index 0000000000000..80090d58641f4
--- /dev/null
+++ b/datafusion/spark/src/function/string/elt.rs
@@ -0,0 +1,251 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, AsArray, PrimitiveArray, StringArray, StringBuilder,
+};
+use arrow::compute::{can_cast_types, cast};
+use arrow::datatypes::DataType::{Int64, Utf8};
+use arrow::datatypes::{DataType, Int64Type};
+use datafusion_common::cast::as_string_array;
+use datafusion_common::{DataFusionError, Result, plan_datafusion_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkElt {
+    signature: Signature,
+}
+
+impl Default for SparkElt {
+    fn default() -> Self {
+        SparkElt::new()
+    }
+}
+
+impl SparkElt {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkElt {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "elt"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Utf8)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(elt, vec![])(&args.args)
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        let length = arg_types.len();
+        if length < 2 {
+            plan_datafusion_err!(
+                "ELT function expects at least 2 arguments: index, value1"
+            );
+        }
+
+        let idx_dt: &DataType = &arg_types[0];
+        if *idx_dt != Int64 && !can_cast_types(idx_dt, &Int64) {
+            return Err(DataFusionError::Plan(format!(
+                "ELT index must be Int64 (or castable to Int64), got {idx_dt:?}"
+            )));
+        }
+        let mut coerced = Vec::with_capacity(arg_types.len());
+        coerced.push(Int64);
+
+        for _ in 1..length {
+            coerced.push(Utf8);
+        }
+
+        Ok(coerced)
+    }
+}
+
+fn elt(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
+    let n_rows = args[0].len();
+
+    let idx: &PrimitiveArray<Int64Type> =
+        args[0].as_primitive_opt::<Int64Type>().ok_or_else(|| {
+            DataFusionError::Plan(format!(
+                "ELT function: first argument must be Int64 (got {:?})",
+                args[0].data_type()
+            ))
+        })?;
+
+    let num_values = args.len() - 1;
+    let mut cols: Vec<Arc<StringArray>> = Vec::with_capacity(num_values);
+    for a in args.iter().skip(1) {
+        let casted = cast(a, &Utf8)?;
+        let sa = as_string_array(&casted)?;
+        cols.push(Arc::new(sa.clone()));
+    }
+
+    let mut builder = StringBuilder::new();
+
+    for i in 0..n_rows {
+        if idx.is_null(i) {
+            builder.append_null();
+            continue;
+        }
+
+        let index = idx.value(i);
+
+        // TODO: if spark.sql.ansi.enabled is true,
+        //  throw ArrayIndexOutOfBoundsException for invalid indices;
+        //  if false, return NULL instead (current behavior).
+        if index < 1 || (index as usize) > num_values {
+            builder.append_null();
+            continue;
+        }
+
+        let value_idx = (index as usize) - 1;
+        let col = &cols[value_idx];
+
+        if col.is_null(i) {
+            builder.append_null();
+        } else {
+            builder.append_value(col.value(i));
+        }
+    }
+
+    Ok(Arc::new(builder.finish()) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::Int64Array;
+    use datafusion_common::Result;
+
+    use arrow::array::{ArrayRef, StringArray};
+    use datafusion_common::DataFusionError;
+    use std::sync::Arc;
+
+    fn run_elt_arrays(arrs: Vec<ArrayRef>) -> Result<Arc<StringArray>> {
+        let arr = elt(&arrs)?;
+        let string_array = arr
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| DataFusionError::Internal("expected Utf8".into()))?;
+        Ok(Arc::new(string_array.clone()))
+    }
+
+    #[test]
+    fn elt_utf8_basic() -> Result<()> {
+        let idx = Arc::new(Int64Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(3),
+            Some(4),
+            Some(0),
+            None,
+        ]));
+        let v1 = Arc::new(StringArray::from(vec![
+            Some("a1"),
+            Some("a2"),
+            Some("a3"),
+            Some("a4"),
+            Some("a5"),
+            Some("a6"),
+        ]));
+        let v2 = Arc::new(StringArray::from(vec![
+            Some("b1"),
+            Some("b2"),
+            None,
+            Some("b4"),
+            Some("b5"),
+            Some("b6"),
+        ]));
+        let v3 = Arc::new(StringArray::from(vec![
+            Some("c1"),
+            Some("c2"),
+            Some("c3"),
+            None,
+            Some("c5"),
+            Some("c6"),
+        ]));
+
+        let out = run_elt_arrays(vec![idx, v1, v2, v3])?;
+        assert_eq!(out.len(), 6);
+        assert_eq!(out.value(0), "a1");
+        assert_eq!(out.value(1), "b2");
+        assert_eq!(out.value(2), "c3");
+        assert!(out.is_null(3));
+        assert!(out.is_null(4));
+        assert!(out.is_null(5));
+        Ok(())
+    }
+
+    #[test]
+    fn elt_int64_basic() -> Result<()> {
+        let idx = Arc::new(Int64Array::from(vec![Some(2), Some(1), Some(2)]));
+        let v1 = Arc::new(Int64Array::from(vec![Some(10), Some(20), Some(30)]));
+        let v2 = Arc::new(Int64Array::from(vec![Some(100), None, Some(300)]));
+
+        let out = run_elt_arrays(vec![idx, v1, v2])?;
+        assert_eq!(out.len(), 3);
+        assert_eq!(out.value(0), "100");
+        assert_eq!(out.value(1), "20");
+        assert_eq!(out.value(2), "300");
+        Ok(())
+    }
+
+    #[test]
+    fn elt_out_of_range_all_null() -> Result<()> {
+        let idx = Arc::new(Int64Array::from(vec![Some(5), Some(-1), Some(0)]));
+        let v1 = Arc::new(StringArray::from(vec![Some("x"), Some("y"), Some("z")]));
+        let v2 = Arc::new(StringArray::from(vec![Some("a"), Some("b"), Some("c")]));
+
+        let out = run_elt_arrays(vec![idx, v1, v2])?;
+        assert!(out.is_null(0));
+        assert!(out.is_null(1));
+        assert!(out.is_null(2));
+        Ok(())
+    }
+
+    #[test]
+    fn elt_utf8_returns_utf8() -> Result<()> {
+        let idx = Arc::new(Int64Array::from(vec![Some(1)]));
+        let v1 = Arc::new(StringArray::from(vec![Some("scala")]));
+        let v2 = Arc::new(StringArray::from(vec![Some("java")]));
+
+        let out = run_elt_arrays(vec![idx, v1, v2])?;
+        assert_eq!(out.data_type(), &Utf8);
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/string/format_string.rs b/datafusion/spark/src/function/string/format_string.rs
new file mode 100644
index 0000000000000..3adf508895949
--- /dev/null
+++ b/datafusion/spark/src/function/string/format_string.rs
@@ -0,0 +1,2394 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::fmt::Write;
+use std::sync::Arc;
+
+use core::num::FpCategory;
+
+use arrow::{
+    array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray},
+    datatypes::{DataType, Field, FieldRef},
+};
+use bigdecimal::{
+    BigDecimal, ToPrimitive,
+    num_bigint::{BigInt, Sign},
+};
+use chrono::{DateTime, Datelike, Timelike, Utc};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, plan_err,
+};
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    TypeSignature, Volatility,
+};
+
+/// Spark-compatible `format_string` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#format_string>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct FormatStringFunc {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for FormatStringFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl FormatStringFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::new(TypeSignature::VariadicAny, Volatility::Immutable),
+            aliases: vec![String::from("printf")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for FormatStringFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "format_string"
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        datafusion_common::internal_err!(
+            "return_type should not be called, use return_field_from_args instead"
+        )
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        match args.arg_fields[0].data_type() {
+            DataType::Null => {
+                Ok(Arc::new(Field::new("format_string", DataType::Utf8, true)))
+            }
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+                Ok(Arc::clone(&args.arg_fields[0]))
+            }
+            _ => exec_err!(
+                "format_string expects the first argument to be Utf8, LargeUtf8 or Utf8View, got {} instead",
+                args.arg_fields[0].data_type()
+            ),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let len = args.args.iter().find_map(|arg| match arg {
+            ColumnarValue::Scalar(_) => None,
+            ColumnarValue::Array(a) => Some(a.len()),
+        });
+        let is_scalar = len.is_none();
+        let data_types = args.args[1..]
+            .iter()
+            .map(|arg| arg.data_type())
+            .collect::<Vec<_>>();
+        let fmt_type = args.args[0].data_type();
+
+        match &args.args[0] {
+            ColumnarValue::Scalar(ScalarValue::Null) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+            }
+            ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)))
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8View(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(None)))
+            }
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt)))
+            | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(fmt)))
+            | ColumnarValue::Scalar(ScalarValue::Utf8View(Some(fmt))) => {
+                let formatter = Formatter::parse(fmt, &data_types)?;
+                let mut result = Vec::with_capacity(len.unwrap_or(1));
+                for i in 0..len.unwrap_or(1) {
+                    let scalars = args.args[1..]
+                        .iter()
+                        .map(|arg| try_to_scalar(arg.clone(), i))
+                        .collect::<Result<Vec<_>>>()?;
+                    let formatted = formatter.format(&scalars)?;
+                    result.push(formatted);
+                }
+                if is_scalar {
+                    let scalar_result = result.pop().unwrap();
+                    match fmt_type {
+                        DataType::Utf8 => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
+                            Some(scalar_result),
+                        ))),
+                        DataType::LargeUtf8 => Ok(ColumnarValue::Scalar(
+                            ScalarValue::LargeUtf8(Some(scalar_result)),
+                        )),
+                        DataType::Utf8View => Ok(ColumnarValue::Scalar(
+                            ScalarValue::Utf8View(Some(scalar_result)),
+                        )),
+                        _ => unreachable!(),
+                    }
+                } else {
+                    let array: ArrayRef = match fmt_type {
+                        DataType::Utf8 => Arc::new(StringArray::from(result)),
+                        DataType::LargeUtf8 => Arc::new(LargeStringArray::from(result)),
+                        DataType::Utf8View => Arc::new(StringViewArray::from(result)),
+                        _ => unreachable!(),
+                    };
+                    Ok(ColumnarValue::Array(array))
+                }
+            }
+            ColumnarValue::Array(fmts) => {
+                let mut result = Vec::with_capacity(len.unwrap());
+                for i in 0..len.unwrap() {
+                    let fmt = ScalarValue::try_from_array(fmts, i)?;
+                    match fmt.try_as_str() {
+                        Some(Some(fmt)) => {
+                            let formatter = Formatter::parse(fmt, &data_types)?;
+                            let scalars = args.args[1..]
+                                .iter()
+                                .map(|arg| try_to_scalar(arg.clone(), i))
+                                .collect::<Result<Vec<_>>>()?;
+                            let formatted = formatter.format(&scalars)?;
+                            result.push(Some(formatted));
+                        }
+                        Some(None) => {
+                            result.push(None);
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+                let array: ArrayRef = match fmt_type {
+                    DataType::Utf8 => Arc::new(StringArray::from(result)),
+                    DataType::LargeUtf8 => Arc::new(LargeStringArray::from(result)),
+                    DataType::Utf8View => Arc::new(StringViewArray::from(result)),
+                    _ => unreachable!(),
+                };
+                Ok(ColumnarValue::Array(array))
+            }
+            _ => exec_err!(
+                "The format_string function expects the first argument to be a string"
+            ),
+        }
+    }
+}
+
+fn try_to_scalar(arg: ColumnarValue, index: usize) -> Result<ScalarValue> {
+    match arg {
+        ColumnarValue::Scalar(scalar) => Ok(scalar),
+        ColumnarValue::Array(array) => ScalarValue::try_from_array(&array, index),
+    }
+}
+
+/// Compatible with `java.util.Formatter`
+#[derive(Debug)]
+pub struct Formatter<'a> {
+    pub elements: Vec<FormatElement<'a>>,
+    pub arg_num: usize,
+}
+
+impl<'a> Formatter<'a> {
+    pub fn new(elements: Vec<FormatElement<'a>>) -> Self {
+        let arg_num = elements
+            .iter()
+            .map(|element| match element {
+                FormatElement::Format(spec) => spec.argument_index,
+                _ => 0,
+            })
+            .max()
+            .unwrap_or(0);
+        Self { elements, arg_num }
+    }
+
+    /// Parses a printf-style format string into a Formatter with validation.
+    ///
+    /// This method implements a comprehensive parser for Java `java.util.Formatter` syntax,
+    /// processing the format string character by character to identify and validate format
+    /// specifiers against the provided argument types.
+    ///
+    /// # Arguments
+    ///
+    /// * `fmt` - The format string containing literal text and format specifiers
+    /// * `arg_types` - Array of DataFusion DataTypes corresponding to the arguments
+    ///
+    /// # Parsing Process
+    ///
+    /// The parser operates in several phases:
+    ///
+    /// 1. **String Scanning**: Iterates through the format string looking for '%' characters
+    ///    that mark the beginning of format specifiers or special sequences.
+    ///
+    /// 2. **Special Sequence Handling**: Processes escape sequences:
+    ///    - `%%` becomes a literal '%' character
+    ///    - `%n` becomes a newline character
+    ///    - `%<` indicates reuse of the previous argument with a new format specifier
+    ///
+    /// 3. **Argument Index Resolution**: Determines which argument each format specifier refers to:
+    ///    - Sequential indexing: arguments are consumed in order (1, 2, 3, ...)
+    ///    - Positional indexing: explicit argument position using `%n$` syntax
+    ///    - Previous argument reuse: `%<` references the last used argument
+    ///
+    /// 4. **Format Specifier Parsing**: For each format specifier, extracts:
+    ///    - Flags (-, +, space, #, 0, ',', '(')
+    ///    - Width specification (minimum field width)
+    ///    - Precision specification (decimal places or maximum characters)
+    ///    - Conversion type (d, s, f, x, etc.)
+    ///
+    /// 5. **Type Validation**: Verifies that each format specifier's conversion type
+    ///    is compatible with the corresponding argument's DataType. For example:
+    ///    - Integer conversions (%d, %x, %o) require integer DataTypes
+    ///    - String conversions (%s, %S) accept any DataType
+    ///    - Float conversions (%f, %e, %g) require numeric DataTypes
+    ///
+    /// 6. **Element Construction**: Creates FormatElement instances for:
+    ///    - Verbatim text sections (copied directly to output)
+    ///    - Validated format specifiers with their parsed parameters
+    ///
+    /// # Internal State Management
+    ///
+    /// The parser maintains several state variables:
+    /// - `argument_index`: Tracks the current sequential argument position
+    /// - `prev`: Remembers the last used argument index for `%<` references
+    /// - `res`: Accumulates the parsed FormatElement instances
+    /// - `rem`: Points to the remaining unparsed portion of the format string
+    ///
+    /// # Validation and Error Handling
+    ///
+    /// The parser performs extensive validation including:
+    /// - Argument index bounds checking against the provided arg_types array
+    /// - Format specifier syntax validation
+    /// - Type compatibility verification between conversion types and DataTypes
+    /// - Detection of malformed numeric parameters and invalid flag combinations
+    ///
+    /// # Returns
+    ///
+    /// Returns a Formatter containing the parsed elements and the maximum argument
+    /// index encountered, enabling efficient argument validation during formatting.
+    pub fn parse(fmt: &'a str, arg_types: &[DataType]) -> Result<Self> {
+        // find the first %
+        let mut res = Vec::new();
+
+        let mut rem = fmt;
+        let mut argument_index = 0;
+
+        let mut prev: Option<usize> = None;
+
+        while !rem.is_empty() {
+            if let Some((verbatim_prefix, rest)) = rem.split_once('%') {
+                if !verbatim_prefix.is_empty() {
+                    res.push(FormatElement::Verbatim(verbatim_prefix));
+                }
+                if let Some(rest) = rest.strip_prefix('%') {
+                    res.push(FormatElement::Verbatim("%"));
+                    rem = rest;
+                    continue;
+                }
+                if let Some(rest) = rest.strip_prefix('n') {
+                    res.push(FormatElement::Verbatim("\n"));
+                    rem = rest;
+                    continue;
+                }
+                if let Some(rest) = rest.strip_prefix('<') {
+                    // %< means reuse the previous argument
+                    let Some(p) = prev else {
+                        return exec_err!("No previous argument to reference");
+                    };
+                    let (spec, rest) =
+                        take_conversion_specifier(rest, p, &arg_types[p - 1])?;
+                    res.push(FormatElement::Format(spec));
+                    rem = rest;
+                    continue;
+                }
+
+                let (current_argument_index, rest2) = take_numeric_param(rest, false);
+                let (current_argument_index, rest) =
+                    match (current_argument_index, rest2.starts_with('$')) {
+                        (NumericParam::Literal(index), true) => {
+                            (index as usize, &rest2[1..])
+                        }
+                        (NumericParam::FromArgument, true) => {
+                            return exec_err!("Invalid numeric parameter");
+                        }
+                        (_, false) => {
+                            argument_index += 1;
+                            (argument_index, rest)
+                        }
+                    };
+                if current_argument_index == 0 || current_argument_index > arg_types.len()
+                {
+                    return exec_err!(
+                        "Argument index {} is out of bounds",
+                        current_argument_index
+                    );
+                }
+
+                let (spec, rest) = take_conversion_specifier(
+                    rest,
+                    current_argument_index,
+                    &arg_types[current_argument_index - 1],
+                )
+                .map_err(|e| exec_datafusion_err!("{:?}, format string: {:?}", e, fmt))?;
+                res.push(FormatElement::Format(spec));
+                prev = Some(spec.argument_index);
+                rem = rest;
+            } else {
+                res.push(FormatElement::Verbatim(rem));
+                break;
+            }
+        }
+
+        Ok(Self::new(res))
+    }
+
+    pub fn format(&self, args: &[ScalarValue]) -> Result<String> {
+        if args.len() < self.arg_num {
+            return exec_err!(
+                "Expected at least {} arguments, got {}",
+                self.arg_num,
+                args.len()
+            );
+        }
+        let mut string = String::new();
+        for element in &self.elements {
+            match element {
+                FormatElement::Verbatim(text) => {
+                    string.push_str(text);
+                }
+                FormatElement::Format(spec) => {
+                    spec.format(&mut string, &args[spec.argument_index - 1])?;
+                }
+            }
+        }
+        Ok(string)
+    }
+}
+
+#[derive(Debug)]
+pub enum FormatElement<'a> {
+    /// Some characters that are copied to the output as-is
+    Verbatim(&'a str),
+    /// A format specifier
+    Format(ConversionSpecifier),
+}
+
+/// Parsed printf conversion specifier
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct ConversionSpecifier {
+    pub argument_index: usize,
+    /// flag `#`: use `0x`, etc?
+    pub alt_form: bool,
+    /// flag `0`: left-pad with zeros?
+    pub zero_pad: bool,
+    /// flag `-`: left-adjust (pad with spaces on the right)
+    pub left_adj: bool,
+    /// flag `' '` (space): indicate sign with a space?
+    pub space_sign: bool,
+    /// flag `+`: Always show sign? (for signed numbers)
+    pub force_sign: bool,
+    /// flag `,`: include locale-specific grouping separators
+    pub grouping_separator: bool,
+    /// flag `(`: enclose negative numbers in parentheses
+    pub negative_in_parentheses: bool,
+    /// field width
+    pub width: NumericParam,
+    /// floating point field precision
+    pub precision: NumericParam,
+    /// data type
+    pub conversion_type: ConversionType,
+}
+
+/// Width / precision parameter
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum NumericParam {
+    /// The literal width
+    Literal(i32),
+    /// Get the width from the previous argument
+    FromArgument,
+}
+
+/// Printf data type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ConversionType {
+    /// `B`
+    BooleanUpper,
+    /// `b`
+    BooleanLower,
+    /// Not implemented yet. Can be implemented after <https://github.com/apache/datafusion/pull/17093> is merged
+    /// `h`
+    HexHashLower,
+    /// `H`
+    HexHashUpper,
+    /// `d`
+    DecInt,
+    /// `o`
+    OctInt,
+    /// `x`
+    HexIntLower,
+    /// `X`
+    HexIntUpper,
+    /// `e`
+    SciFloatLower,
+    /// `E`
+    SciFloatUpper,
+    /// `f`
+    DecFloatLower,
+    /// `g`
+    CompactFloatLower,
+    /// `G`
+    CompactFloatUpper,
+    /// `a`
+    HexFloatLower,
+    /// `A`
+    HexFloatUpper,
+    /// `t`
+    TimeLower(TimeFormat),
+    /// `T`
+    TimeUpper(TimeFormat),
+    /// `c`
+    CharLower,
+    /// `C`
+    CharUpper,
+    /// `s`
+    StringLower,
+    /// `S`
+    StringUpper,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TimeFormat {
+    // Hour of the day for the 24-hour clock,
+    // formatted as two digits with a leading zero as necessary i.e. 00 - 23. 00 corresponds to midnight.
+    HUpper,
+    // Hour for the 12-hour clock,
+    // formatted as two digits with a leading zero as necessary, i.e. 01 - 12. 01 corresponds to one o'clock (either morning or afternoon).
+    IUpper,
+    // Hour of the day for the 24-hour clock,
+    // i.e. 0 - 23. 0 corresponds to midnight.
+    KLower,
+    // Hour for the 12-hour clock,
+    // i.e. 1 - 12. 1 corresponds to one o'clock (either morning or afternoon).
+    LLower,
+    // Minute within the hour formatted as two digits with a leading zero as necessary, i.e. 00 - 59.
+    MUpper,
+    // Seconds within the minute, formatted as two digits with a leading zero as necessary,
+    // i.e. 00 - 60 ("60" is a special value required to support leap seconds).
+    SUpper,
+    // Millisecond within the second formatted as three digits with leading zeros as necessary, i.e. 000 - 999.
+    LUpper,
+    // Nanosecond within the second, formatted as nine digits with leading zeros as necessary,
+    // i.e. 000000000 - 999999999. The precision of this value is limited by the resolution of the underlying operating system or hardware.
+    NUpper,
+    // Locale-specific morning or afternoon marker in lower case, e.g."am" or "pm".
+    // Use of the conversion prefix 'T' forces this output to upper case. (Note that 'p' produces lower-case output.
+    // This is different from GNU date and POSIX strftime(3c) which produce upper-case output.)
+    PLower,
+    // RFC 822 style numeric time zone offset from GMT,
+    // e.g. -0800. This value will be adjusted as necessary for Daylight Saving Time.
+    // For long, Long, and Date the time zone used is the default time zone for this instance of the Java virtual machine.
+    ZLower,
+    // A string representing the abbreviation for the time zone. This value will be adjusted as necessary for Daylight Saving Time.
+    // For long, Long, and Date the time zone used is the default time zone for this instance of the Java virtual machine.
+    // The Formatter's locale will supersede the locale of the argument (if any).
+    ZUpper,
+    // Seconds since the beginning of the epoch starting at 1 January 1970 00:00:00 UTC,
+    // i.e. Long.MIN_VALUE/1000 to Long.MAX_VALUE/1000.
+    SLower,
+    // Milliseconds since the beginning of the epoch starting at 1 January 1970 00:00:00 UTC,
+    // i.e. Long.MIN_VALUE to Long.MAX_VALUE. The precision of this value is limited by the resolution of the underlying operating system or hardware.
+    QUpper,
+    // Locale-specific full month name, e.g. "January", "February".
+    BUpper,
+    // Locale-specific abbreviated month name, e.g. "Jan", "Feb".
+    BLower,
+    // Locale-specific full weekday name, e.g. "Monday", "Tuesday".
+    AUpper,
+    // Locale-specific abbreviated weekday name, e.g. "Mon", "Tue".
+    ALower,
+    // Four-digit year divided by 100, formatted as two digits with leading zero as necessary, i.e. 00 - 99
+    CUpper,
+    // Year, formatted to at least four digits with leading zeros as necessary, e.g. 0092 equals 92 CE for the Gregorian calendar.
+    YUpper,
+    // Last two digits of the year, formatted with leading zeros as necessary, i.e. 00 - 99.
+    YLower,
+    // Day of year, formatted as three digits with leading zeros as necessary, e.g. 001 - 366 for the Gregorian calendar. 001 corresponds to the first day of the year.
+    JLower,
+    // Month, formatted as two digits with leading zeros as necessary, i.e. 01 - 13, where "01" is the first month of the year and ("13" is a special value required to support lunar calendars).
+    MLower,
+    // Day of month, formatted as two digits with leading zeros as necessary, i.e. 01 - 31, where "01" is the first day of the month.
+    DLower,
+    // Day of month, formatted as two digits, i.e. 1 - 31 where "1" is the first day of the month.
+    ELower,
+    // Time formatted for the 24-hour clock as "%tH:%tM"
+    RUpper,
+    // Time formatted for the 24-hour clock as "%tH:%tM:%tS"
+    TUpper,
+    // Time formatted for the 12-hour clock as "%tI:%tM:%tS %Tp". The location of the morning or afternoon marker ('%Tp') may be locale-dependent.
+    RLower,
+    // Date formatted as "%tm/%td/%ty"
+    DUpper,
+    // ISO 8601 complete date formatted as "%tY-%tm-%td"
+    FUpper,
+    // Date and time formatted as "%ta %tb %td %tT %tZ %tY", e.g. "Sun Jul 20 16:17:00 EDT 1969"
+    CLower,
+}
+
+impl TryFrom<char> for TimeFormat {
+    type Error = DataFusionError;
+    fn try_from(value: char) -> Result<Self, Self::Error> {
+        match value {
+            'H' => Ok(TimeFormat::HUpper),
+            'I' => Ok(TimeFormat::IUpper),
+            'k' => Ok(TimeFormat::KLower),
+            'l' => Ok(TimeFormat::LLower),
+            'M' => Ok(TimeFormat::MUpper),
+            'S' => Ok(TimeFormat::SUpper),
+            'L' => Ok(TimeFormat::LUpper),
+            'N' => Ok(TimeFormat::NUpper),
+            'p' => Ok(TimeFormat::PLower),
+            'z' => Ok(TimeFormat::ZLower),
+            'Z' => Ok(TimeFormat::ZUpper),
+            's' => Ok(TimeFormat::SLower),
+            'Q' => Ok(TimeFormat::QUpper),
+            'B' => Ok(TimeFormat::BUpper),
+            'b' | 'h' => Ok(TimeFormat::BLower),
+            'A' => Ok(TimeFormat::AUpper),
+            'a' => Ok(TimeFormat::ALower),
+            'C' => Ok(TimeFormat::CUpper),
+            'Y' => Ok(TimeFormat::YUpper),
+            'y' => Ok(TimeFormat::YLower),
+            'j' => Ok(TimeFormat::JLower),
+            'm' => Ok(TimeFormat::MLower),
+            'd' => Ok(TimeFormat::DLower),
+            'e' => Ok(TimeFormat::ELower),
+            'R' => Ok(TimeFormat::RUpper),
+            'T' => Ok(TimeFormat::TUpper),
+            'r' => Ok(TimeFormat::RLower),
+            'D' => Ok(TimeFormat::DUpper),
+            'F' => Ok(TimeFormat::FUpper),
+            'c' => Ok(TimeFormat::CLower),
+            _ => exec_err!("Invalid time format: {}", value),
+        }
+    }
+}
+
+impl ConversionType {
+    pub fn validate(&self, arg_type: &DataType) -> Result<()> {
+        match self {
+            ConversionType::BooleanLower | ConversionType::BooleanUpper => {
+                if *arg_type != DataType::Boolean {
+                    return exec_err!(
+                        "Invalid argument type for boolean conversion: {:?}",
+                        arg_type
+                    );
+                }
+            }
+            ConversionType::CharLower | ConversionType::CharUpper => {
+                if !matches!(
+                    arg_type,
+                    DataType::Int8
+                        | DataType::UInt8
+                        | DataType::Int16
+                        | DataType::UInt16
+                        | DataType::Int32
+                        | DataType::UInt32
+                        | DataType::Int64
+                        | DataType::UInt64
+                ) {
+                    return exec_err!(
+                        "Invalid argument type for char conversion: {:?}",
+                        arg_type
+                    );
+                }
+            }
+            ConversionType::DecInt
+            | ConversionType::OctInt
+            | ConversionType::HexIntLower
+            | ConversionType::HexIntUpper => {
+                if !arg_type.is_integer() {
+                    return exec_err!(
+                        "Invalid argument type for integer conversion: {:?}",
+                        arg_type
+                    );
+                }
+            }
+            ConversionType::SciFloatLower
+            | ConversionType::SciFloatUpper
+            | ConversionType::DecFloatLower
+            | ConversionType::CompactFloatLower
+            | ConversionType::CompactFloatUpper
+            | ConversionType::HexFloatLower
+            | ConversionType::HexFloatUpper => {
+                if !arg_type.is_numeric() {
+                    return exec_err!(
+                        "Invalid argument type for float conversion: {:?}",
+                        arg_type
+                    );
+                }
+            }
+            ConversionType::TimeLower(_) | ConversionType::TimeUpper(_) => {
+                if !arg_type.is_temporal() {
+                    return exec_err!(
+                        "Invalid argument type for time conversion: {:?}",
+                        arg_type
+                    );
+                }
+            }
+            _ => {}
+        }
+        Ok(())
+    }
+
+    fn supports_integer(&self) -> bool {
+        matches!(
+            self,
+            ConversionType::DecInt
+                | ConversionType::HexIntLower
+                | ConversionType::HexIntUpper
+                | ConversionType::OctInt
+                | ConversionType::CharLower
+                | ConversionType::CharUpper
+                | ConversionType::StringLower
+                | ConversionType::StringUpper
+        )
+    }
+
+    fn supports_float(&self) -> bool {
+        matches!(
+            self,
+            ConversionType::DecFloatLower
+                | ConversionType::SciFloatLower
+                | ConversionType::SciFloatUpper
+                | ConversionType::CompactFloatLower
+                | ConversionType::CompactFloatUpper
+                | ConversionType::StringLower
+                | ConversionType::StringUpper
+                | ConversionType::HexFloatLower
+                | ConversionType::HexFloatUpper
+        )
+    }
+
+    fn supports_decimal(&self) -> bool {
+        matches!(
+            self,
+            ConversionType::DecFloatLower
+                | ConversionType::SciFloatLower
+                | ConversionType::SciFloatUpper
+                | ConversionType::CompactFloatLower
+                | ConversionType::CompactFloatUpper
+                | ConversionType::StringLower
+                | ConversionType::StringUpper
+        )
+    }
+
+    fn supports_time(&self) -> bool {
+        matches!(
+            self,
+            ConversionType::TimeLower(_)
+                | ConversionType::TimeUpper(_)
+                | ConversionType::StringLower
+                | ConversionType::StringUpper
+        )
+    }
+
+    fn is_upper(&self) -> bool {
+        matches!(
+            self,
+            ConversionType::BooleanUpper
+                | ConversionType::HexHashUpper
+                | ConversionType::HexIntUpper
+                | ConversionType::SciFloatUpper
+                | ConversionType::CompactFloatUpper
+                | ConversionType::HexFloatUpper
+                | ConversionType::TimeUpper(_)
+                | ConversionType::CharUpper
+                | ConversionType::StringUpper
+        )
+    }
+}
+
+fn take_conversion_specifier<'a>(
+    mut s: &'a str,
+    argument_index: usize,
+    arg_type: &DataType,
+) -> Result<(ConversionSpecifier, &'a str)> {
+    let mut spec = ConversionSpecifier {
+        argument_index,
+        alt_form: false,
+        zero_pad: false,
+        left_adj: false,
+        space_sign: false,
+        force_sign: false,
+        grouping_separator: false,
+        negative_in_parentheses: false,
+        width: NumericParam::Literal(0),
+        precision: NumericParam::FromArgument, // Placeholder - must not be returned!
+        // ignore length modifier
+        conversion_type: ConversionType::DecInt,
+    };
+
+    // parse flags
+    loop {
+        match s.chars().next() {
+            Some('#') => {
+                spec.alt_form = true;
+            }
+            Some('0') => {
+                if spec.left_adj {
+                    return exec_err!("Invalid flag combination: '0' and '-'");
+                }
+                spec.zero_pad = true;
+            }
+            Some('-') => {
+                spec.left_adj = true;
+            }
+            Some(' ') => {
+                if spec.force_sign {
+                    return exec_err!("Invalid flag combination: '+' and ' '");
+                }
+                spec.space_sign = true;
+            }
+            Some('+') => {
+                if spec.space_sign {
+                    return exec_err!("Invalid flag combination: '+' and ' '");
+                }
+                spec.force_sign = true;
+            }
+            Some(',') => {
+                spec.grouping_separator = true;
+            }
+            Some('(') => {
+                spec.negative_in_parentheses = true;
+            }
+            _ => {
+                break;
+            }
+        }
+        s = &s[1..];
+    }
+    // parse width
+    let (w, mut s) = take_numeric_param(s, false);
+    spec.width = w;
+    // parse precision
+    if matches!(s.chars().next(), Some('.')) {
+        s = &s[1..];
+        let (p, s2) = take_numeric_param(s, true);
+        spec.precision = p;
+        s = s2;
+    }
+    let mut chars = s.chars();
+    let mut offset = 1;
+    // parse conversion type
+    spec.conversion_type = match chars.next() {
+        Some('b') => ConversionType::BooleanLower,
+        Some('B') => ConversionType::BooleanUpper,
+        Some('h') => ConversionType::HexHashLower,
+        Some('H') => ConversionType::HexHashUpper,
+        Some('s') => ConversionType::StringLower,
+        Some('S') => ConversionType::StringUpper,
+        Some('c') => ConversionType::CharLower,
+        Some('C') => ConversionType::CharUpper,
+        Some('d') => ConversionType::DecInt,
+        Some('o') => ConversionType::OctInt,
+        Some('x') => ConversionType::HexIntLower,
+        Some('X') => ConversionType::HexIntUpper,
+        Some('e') => ConversionType::SciFloatLower,
+        Some('E') => ConversionType::SciFloatUpper,
+        Some('f') => ConversionType::DecFloatLower,
+        Some('g') => ConversionType::CompactFloatLower,
+        Some('G') => ConversionType::CompactFloatUpper,
+        Some('a') => ConversionType::HexFloatLower,
+        Some('A') => ConversionType::HexFloatUpper,
+        Some('t') => {
+            let Some(chr) = chars.next() else {
+                return exec_err!("Invalid time format: {}", s);
+            };
+            offset += 1;
+            ConversionType::TimeLower(chr.try_into()?)
+        }
+        Some('T') => {
+            let Some(chr) = chars.next() else {
+                return exec_err!("Invalid time format: {}", s);
+            };
+            offset += 1;
+            ConversionType::TimeUpper(chr.try_into()?)
+        }
+        chr => {
+            return plan_err!("Invalid conversion type: {:?}", chr);
+        }
+    };
+
+    spec.conversion_type.validate(arg_type)?;
+    Ok((spec, &s[offset..]))
+}
+
+fn take_numeric_param(s: &str, zero: bool) -> (NumericParam, &str) {
+    match s.chars().next() {
+        Some(digit) if (if zero { '0'..='9' } else { '1'..='9' }).contains(&digit) => {
+            let mut s = s;
+            let mut w = 0;
+            loop {
+                match s.chars().next() {
+                    Some(digit) if digit.is_ascii_digit() => {
+                        w = 10 * w + (digit as i32 - '0' as i32);
+                    }
+                    _ => {
+                        break;
+                    }
+                }
+                s = &s[1..];
+            }
+            (NumericParam::Literal(w), s)
+        }
+        _ => (NumericParam::FromArgument, s),
+    }
+}
+
+impl ConversionSpecifier {
+    pub fn format(&self, string: &mut String, value: &ScalarValue) -> Result<()> {
+        match value {
+            ScalarValue::Boolean(value) => match self.conversion_type {
+                ConversionType::StringLower | ConversionType::StringUpper => {
+                    self.format_string(string, &value.unwrap_or(false).to_string())
+                }
+
+                _ => self.format_boolean(string, value),
+            },
+            ScalarValue::Int8(value) => match (self.conversion_type, value) {
+                (ConversionType::DecInt, Some(value)) => {
+                    self.format_signed(string, *value as i64)
+                }
+                (
+                    ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, (*value as u8) as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, *value as u8 as char)
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Int8",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Int16(value) => match (self.conversion_type, value) {
+                (ConversionType::DecInt, Some(value)) => {
+                    self.format_signed(string, *value as i64)
+                }
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(
+                        string,
+                        char::from_u32((*value as u16) as u32).unwrap(),
+                    )
+                }
+                (
+                    ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, (*value as u16) as u64),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Int16",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Int32(value) => match (self.conversion_type, value) {
+                (ConversionType::DecInt, Some(value)) => {
+                    self.format_signed(string, *value as i64)
+                }
+                (
+                    ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, (*value as u32) as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, char::from_u32(*value as u32).unwrap())
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Int32",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Int64(value) => match (self.conversion_type, value) {
+                (ConversionType::DecInt, Some(value)) => {
+                    self.format_signed(string, *value)
+                }
+                (
+                    ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, *value as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(
+                        string,
+                        char::from_u32((*value as u64) as u32).unwrap(),
+                    )
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Int64",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::UInt8(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecInt
+                    | ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, *value as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, *value as char)
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for UInt8",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::UInt16(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecInt
+                    | ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, *value as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, char::from_u32(*value as u32).unwrap())
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for UInt16",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::UInt32(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecInt
+                    | ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, *value as u64),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, char::from_u32(*value).unwrap())
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for UInt32",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::UInt64(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecInt
+                    | ConversionType::HexIntLower
+                    | ConversionType::HexIntUpper
+                    | ConversionType::OctInt,
+                    Some(value),
+                ) => self.format_unsigned(string, *value),
+                (ConversionType::CharLower | ConversionType::CharUpper, Some(value)) => {
+                    self.format_char(string, char::from_u32(*value as u32).unwrap())
+                }
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_integer() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for UInt64",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Float16(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecFloatLower
+                    | ConversionType::SciFloatLower
+                    | ConversionType::SciFloatUpper
+                    | ConversionType::CompactFloatLower
+                    | ConversionType::CompactFloatUpper,
+                    Some(value),
+                ) => self.format_float(string, value.to_f64().unwrap()),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_f32().unwrap().spark_string()),
+                (
+                    ConversionType::HexFloatLower | ConversionType::HexFloatUpper,
+                    Some(value),
+                ) => self.format_hex_float(string, value.to_f64().unwrap()),
+                (t, None) if t.supports_float() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Float16",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Float32(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecFloatLower
+                    | ConversionType::SciFloatLower
+                    | ConversionType::SciFloatUpper
+                    | ConversionType::CompactFloatLower
+                    | ConversionType::CompactFloatUpper,
+                    Some(value),
+                ) => self.format_float(string, *value as f64),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.spark_string()),
+                (
+                    ConversionType::HexFloatLower | ConversionType::HexFloatUpper,
+                    Some(value),
+                ) => self.format_hex_float(string, *value as f64),
+                (t, None) if t.supports_float() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Float32",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Float64(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::DecFloatLower
+                    | ConversionType::SciFloatLower
+                    | ConversionType::SciFloatUpper
+                    | ConversionType::CompactFloatLower
+                    | ConversionType::CompactFloatUpper,
+                    Some(value),
+                ) => self.format_float(string, *value),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.spark_string()),
+                (
+                    ConversionType::HexFloatLower | ConversionType::HexFloatUpper,
+                    Some(value),
+                ) => self.format_hex_float(string, *value),
+                (t, None) if t.supports_float() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Float64",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Utf8(value) => {
+                let value: &str = match value {
+                    Some(value) => value.as_str(),
+                    None => "null",
+                };
+                if matches!(
+                    self.conversion_type,
+                    ConversionType::StringLower | ConversionType::StringUpper
+                ) {
+                    self.format_string(string, value)
+                } else {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Utf8",
+                        self.conversion_type
+                    )
+                }
+            }
+            ScalarValue::LargeUtf8(value) => {
+                let value: &str = match value {
+                    Some(value) => value.as_str(),
+                    None => "null",
+                };
+                if matches!(
+                    self.conversion_type,
+                    ConversionType::StringLower | ConversionType::StringUpper
+                ) {
+                    self.format_string(string, value)
+                } else {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for LargeUtf8",
+                        self.conversion_type
+                    )
+                }
+            }
+            ScalarValue::Utf8View(value) => {
+                let value: &str = match value {
+                    Some(value) => value.as_str(),
+                    None => "null",
+                };
+                self.format_string(string, value)
+            }
+            ScalarValue::Decimal128(value, _, scale) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::DecFloatLower
+                        | ConversionType::SciFloatLower
+                        | ConversionType::SciFloatUpper
+                        | ConversionType::CompactFloatLower
+                        | ConversionType::CompactFloatUpper,
+                        Some(value),
+                    ) => self.format_decimal(string, &value.to_string(), *scale as i64),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_decimal() => {
+                        self.format_string(string, "null")
+                    }
+
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for Decimal128",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::Decimal256(value, _, scale) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::DecFloatLower
+                        | ConversionType::SciFloatLower
+                        | ConversionType::SciFloatUpper
+                        | ConversionType::CompactFloatLower
+                        | ConversionType::CompactFloatUpper,
+                        Some(value),
+                    ) => self.format_decimal(string, &value.to_string(), *scale as i64),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_decimal() => {
+                        self.format_string(string, "null")
+                    }
+
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for Decimal256",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+
+            ScalarValue::Time32Second(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                    Some(value),
+                ) => self.format_time(string, *value as i64 * 1000000000, &None),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_time() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Time32Second",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Time32Millisecond(value) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, *value as i64 * 1000000, &None),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for Time32Millisecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::Time64Microsecond(value) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, *value * 1000, &None),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for Time64Microsecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::Time64Nanosecond(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                    Some(value),
+                ) => self.format_time(string, *value, &None),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_time() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Time64Nanosecond",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::TimestampSecond(value, zone) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, value * 1000000000, zone),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for TimestampSecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::TimestampMillisecond(value, zone) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, *value * 1000000, zone),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for TimestampMillisecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::TimestampMicrosecond(value, zone) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, value * 1000, zone),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for timestampmicrosecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+
+            ScalarValue::TimestampNanosecond(value, zone) => {
+                match (self.conversion_type, value) {
+                    (
+                        ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                        Some(value),
+                    ) => self.format_time(string, *value, zone),
+                    (
+                        ConversionType::StringLower | ConversionType::StringUpper,
+                        Some(value),
+                    ) => self.format_string(string, &value.to_string()),
+                    (t, None) if t.supports_time() => self.format_string(string, "null"),
+                    _ => {
+                        exec_err!(
+                            "Invalid conversion type: {:?} for TimestampNanosecond",
+                            self.conversion_type
+                        )
+                    }
+                }
+            }
+            ScalarValue::Date32(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                    Some(value),
+                ) => self.format_date(string, *value as i64),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_time() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Date32",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Date64(value) => match (self.conversion_type, value) {
+                (
+                    ConversionType::TimeLower(_) | ConversionType::TimeUpper(_),
+                    Some(value),
+                ) => self.format_date(string, *value),
+                (
+                    ConversionType::StringLower | ConversionType::StringUpper,
+                    Some(value),
+                ) => self.format_string(string, &value.to_string()),
+                (t, None) if t.supports_time() => self.format_string(string, "null"),
+                _ => {
+                    exec_err!(
+                        "Invalid conversion type: {:?} for Date64",
+                        self.conversion_type
+                    )
+                }
+            },
+            ScalarValue::Null => {
+                let value = "null".to_string();
+                self.format_string(string, &value)
+            }
+            _ => exec_err!("Invalid scalar value: {value}"),
+        }
+    }
+
+    fn format_hex_float(&self, writer: &mut String, value: f64) -> Result<()> {
+        // Handle special cases first
+        let (sign, raw_exponent, mantissa) = value.to_parts();
+        let is_subnormal = raw_exponent == 0;
+
+        let precision = match self.precision {
+            NumericParam::FromArgument => None,
+            NumericParam::Literal(p) => Some(p),
+        };
+
+        // Determine if we need to normalize subnormal numbers
+        // Only normalize when precision is specified and less than full mantissa width
+        let mantissa_hex_digits = f64::MANTISSA_BITS.div_ceil(4); // 13 for f64
+        let should_normalize = is_subnormal
+            && precision.is_some()
+            && precision.unwrap() < mantissa_hex_digits as i32;
+
+        let (value, raw_exponent, mantissa) = if should_normalize {
+            let value = value * f64::SCALEUP;
+            let (_, raw_exponent, mantissa) = value.to_parts();
+            (value, raw_exponent, mantissa)
+        } else {
+            (value, raw_exponent, mantissa)
+        };
+
+        let mut temp = String::new();
+
+        let sign_char = if sign {
+            "-"
+        } else if self.force_sign {
+            "+"
+        } else if self.space_sign {
+            " "
+        } else {
+            ""
+        };
+        match value.category() {
+            FpCategory::Nan => {
+                write!(&mut temp, "NaN")?;
+            }
+            FpCategory::Infinite => {
+                write!(&mut temp, "{sign_char}Infinity")?;
+            }
+            FpCategory::Zero => {
+                write!(&mut temp, "{sign_char}0x0.0p0")?;
+            }
+            _ => {
+                let bias = i32::from(f64::EXPONENT_BIAS);
+                // Calculate actual exponent
+                // For subnormal numbers, the exponent is 1 - bias (not 0 - bias)
+                let exponent = if is_subnormal && !should_normalize {
+                    1 - bias
+                } else {
+                    raw_exponent as i32 - bias
+                };
+
+                // Handle precision for rounding
+                let final_mantissa = if let Some(p) = precision {
+                    if p == 0 {
+                        // For precision 0, we still need at least 1 hex digit
+                        // Round to the nearest integer mantissa value
+                        let shift_distance = f64::MANTISSA_BITS as i32 - 4; // Keep 1 hex digit (4 bits)
+                        let shifted = mantissa >> shift_distance;
+                        let rounding_bits = mantissa & ((1u64 << shift_distance) - 1);
+                        let round_bit = 1u64 << (shift_distance - 1);
+
+                        // Round to nearest, ties to even
+                        if rounding_bits > round_bit
+                            || (rounding_bits == round_bit && (shifted & 1) != 0)
+                        {
+                            (shifted + 1) << shift_distance
+                        } else {
+                            shifted << shift_distance
+                        }
+                    } else {
+                        // Apply rounding based on precision
+                        let precision_bits = p * 4; // Each hex digit is 4 bits
+                        let keep_bits = f64::MANTISSA_BITS as i32;
+                        let shift_distance = keep_bits - precision_bits;
+
+                        if shift_distance > 0 {
+                            let shifted = mantissa >> shift_distance;
+                            let rounding_bits = mantissa & ((1u64 << shift_distance) - 1);
+                            let round_bit = 1u64 << (shift_distance - 1);
+
+                            // Round to nearest, ties to even
+                            if rounding_bits > round_bit
+                                || (rounding_bits == round_bit && (shifted & 1) != 0)
+                            {
+                                (shifted + 1) << shift_distance
+                            } else {
+                                shifted << shift_distance
+                            }
+                        } else {
+                            mantissa
+                        }
+                    }
+                } else {
+                    mantissa
+                };
+
+                if is_subnormal && !should_normalize {
+                    // Original subnormal format: 0x0.xxxp-1022
+                    if precision.is_some() {
+                        // precision >= 13, show as subnormal
+                        let full_hex = format!(
+                            "{:0width$x}",
+                            final_mantissa,
+                            width = mantissa_hex_digits as usize
+                        );
+                        write!(&mut temp, "{sign_char}0x0.{full_hex}p{exponent}")?;
+                    } else {
+                        // No precision specified, show full subnormal
+                        let hex_digits = format!(
+                            "{:0width$x}",
+                            final_mantissa,
+                            width = mantissa_hex_digits as usize
+                        );
+                        write!(&mut temp, "{sign_char}0x0.{hex_digits}p{exponent}")?;
+                    }
+                } else {
+                    // Normal format or normalized subnormal: 0x1.xxxpN
+                    if let Some(p) = precision {
+                        let p = if p == 0 { 1 } else { p };
+                        let hex_digits = format!("{final_mantissa:x}");
+                        let formatted_digits = if p as usize >= hex_digits.len() {
+                            // Pad with zeros to match precision
+                            format!("{:0<width$}", hex_digits, width = p as usize)
+                        } else {
+                            hex_digits[..p as usize].to_string()
+                        };
+                        write!(
+                            &mut temp,
+                            "{sign_char}0x1.{formatted_digits}p{exponent}"
+                        )?;
+                    } else {
+                        // Default: show all significant digits
+                        let mut hex_digits = format!("{final_mantissa:x}");
+                        hex_digits = trim_trailing_0s_hex(&hex_digits).to_owned();
+                        if hex_digits.is_empty() {
+                            write!(&mut temp, "{sign_char}0x1.0p{exponent}")?;
+                        } else {
+                            write!(&mut temp, "{sign_char}0x1.{hex_digits}p{exponent}")?;
+                        }
+                    }
+                }
+                if should_normalize {
+                    let (prefix, exp) = temp.split_once('p').unwrap();
+                    let iexp = exp.parse::<i32>().unwrap() - f64::SCALEUP_POWER as i32;
+                    temp = format!("{prefix}p{iexp}");
+                }
+            }
+        };
+
+        if self.conversion_type.is_upper() {
+            temp = temp.to_ascii_uppercase();
+        }
+
+        let NumericParam::Literal(width) = self.width else {
+            writer.push_str(&temp);
+            return Ok(());
+        };
+        if self.left_adj {
+            writer.push_str(&temp);
+            for _ in temp.len()..width as usize {
+                writer.push(' ');
+            }
+        } else if self.zero_pad && value.is_finite() {
+            let delimiter = if self.conversion_type.is_upper() {
+                "0X"
+            } else {
+                "0x"
+            };
+            let (prefix, suffix) = temp.split_once(delimiter).unwrap();
+            writer.push_str(prefix);
+            writer.push_str(delimiter);
+            for _ in temp.len()..width as usize {
+                writer.push('0');
+            }
+            writer.push_str(suffix);
+        } else {
+            while temp.len() < width as usize {
+                temp = " ".to_owned() + &temp;
+            }
+            writer.push_str(&temp);
+        };
+        Ok(())
+    }
+
+    fn format_char(&self, writer: &mut String, value: char) -> Result<()> {
+        let upper = self.conversion_type.is_upper();
+        match self.conversion_type {
+            ConversionType::CharLower | ConversionType::CharUpper => {
+                let NumericParam::Literal(width) = self.width else {
+                    if upper {
+                        writer.push(value.to_ascii_uppercase());
+                    } else {
+                        writer.push(value);
+                    }
+                    return Ok(());
+                };
+
+                let start_len = writer.len();
+                if self.left_adj {
+                    if upper {
+                        writer.push(value.to_ascii_uppercase());
+                    } else {
+                        writer.push(value);
+                    }
+                    while writer.len() - start_len < width as usize {
+                        writer.push(' ');
+                    }
+                } else {
+                    while writer.len() - start_len + value.len_utf8() < width as usize {
+                        writer.push(' ');
+                    }
+                    if upper {
+                        writer.push(value.to_ascii_uppercase());
+                    } else {
+                        writer.push(value);
+                    }
+                }
+                Ok(())
+            }
+            _ => exec_err!(
+                "Invalid conversion type: {:?} for char",
+                self.conversion_type
+            ),
+        }
+    }
+
+    fn format_boolean(&self, writer: &mut String, value: &Option<bool>) -> Result<()> {
+        let value = value.unwrap_or(false);
+
+        let formatted = match self.conversion_type {
+            ConversionType::BooleanUpper => {
+                if value {
+                    "TRUE"
+                } else {
+                    "FALSE"
+                }
+            }
+            ConversionType::BooleanLower => {
+                if value {
+                    "true"
+                } else {
+                    "false"
+                }
+            }
+            _ => {
+                return exec_err!(
+                    "Invalid conversion type: {:?} for boolean array",
+                    self.conversion_type
+                );
+            }
+        };
+        self.format_str(writer, formatted)
+    }
+
+    fn format_float(&self, writer: &mut String, value: f64) -> Result<()> {
+        let mut prefix = String::new();
+        let mut suffix = String::new();
+        let mut number = String::new();
+        let upper = self.conversion_type.is_upper();
+
+        // set up the sign
+        if value.is_sign_negative() {
+            if self.negative_in_parentheses {
+                prefix.push('(');
+                suffix.push(')');
+            } else {
+                prefix.push('-');
+            }
+        } else if self.space_sign {
+            prefix.push(' ');
+        } else if self.force_sign {
+            prefix.push('+');
+        }
+
+        if value.is_finite() {
+            let mut use_scientific = false;
+            let mut strip_trailing_0s = false;
+            let mut abs = value.abs();
+            let mut exponent = abs.log10().floor() as i32;
+            let mut precision = match self.precision {
+                NumericParam::Literal(p) => p,
+                _ => 6,
+            };
+            match self.conversion_type {
+                ConversionType::DecFloatLower => {
+                    // default
+                }
+                ConversionType::SciFloatLower => {
+                    use_scientific = true;
+                }
+                ConversionType::SciFloatUpper => {
+                    use_scientific = true;
+                }
+                ConversionType::CompactFloatLower | ConversionType::CompactFloatUpper => {
+                    strip_trailing_0s = true;
+                    if precision == 0 {
+                        precision = 1;
+                    }
+                    // exponent signifies significant digits - we must round now
+                    // to (re)calculate the exponent
+                    let rounding_factor =
+                        10.0_f64.powf((precision - 1 - exponent) as f64);
+                    let rounded_fixed = (abs * rounding_factor).round();
+                    abs = rounded_fixed / rounding_factor;
+                    exponent = abs.log10().floor() as i32;
+                    if exponent < -4 || exponent >= precision {
+                        use_scientific = true;
+                        precision -= 1;
+                    } else {
+                        // precision specifies the number of significant digits
+                        precision -= 1 + exponent;
+                    }
+                }
+                _ => {
+                    return exec_err!(
+                        "Invalid conversion type: {:?} for float",
+                        self.conversion_type
+                    );
+                }
+            }
+
+            if use_scientific {
+                // Manual scientific notation formatting for uppercase E
+                let mantissa = abs / 10.0_f64.powf(exponent as f64);
+                let exp_char = if upper { 'E' } else { 'e' };
+                number = format!("{mantissa:.prec$}", prec = precision as usize);
+                if strip_trailing_0s {
+                    number = trim_trailing_0s(&number).to_owned();
+                }
+                number = format!("{number}{exp_char}{exponent:+03}");
+            } else {
+                number = format!("{abs:.prec$}", prec = precision as usize);
+                if strip_trailing_0s {
+                    number = trim_trailing_0s(&number).to_owned();
+                }
+            }
+            if self.alt_form && !number.contains('.') {
+                number += ".";
+            }
+        } else {
+            // not finite
+            match self.conversion_type {
+                ConversionType::DecFloatLower
+                | ConversionType::SciFloatLower
+                | ConversionType::CompactFloatLower => {
+                    if value.is_infinite() {
+                        number.push_str("Infinity")
+                    } else {
+                        number.push_str("NaN")
+                    }
+                }
+                ConversionType::SciFloatUpper | ConversionType::CompactFloatUpper => {
+                    if value.is_infinite() {
+                        number.push_str("INFINITY")
+                    } else {
+                        number.push_str("NAN")
+                    }
+                }
+                _ => {
+                    return exec_err!(
+                        "Invalid conversion type: {:?} for float",
+                        self.conversion_type
+                    );
+                }
+            }
+        }
+        // Take care of padding
+        let NumericParam::Literal(width) = self.width else {
+            writer.push_str(&prefix);
+            writer.push_str(&number);
+            writer.push_str(&suffix);
+            return Ok(());
+        };
+        if self.left_adj {
+            let mut full_num = prefix + &number + &suffix;
+            while full_num.len() < width as usize {
+                full_num.push(' ');
+            }
+            writer.push_str(&full_num);
+        } else if self.zero_pad && value.is_finite() {
+            while prefix.len() + number.len() + suffix.len() < width as usize {
+                prefix.push('0');
+            }
+            writer.push_str(&prefix);
+            writer.push_str(&number);
+            writer.push_str(&suffix);
+        } else {
+            let mut full_num = prefix + &number + &suffix;
+            while full_num.len() < width as usize {
+                full_num = " ".to_owned() + &full_num;
+            }
+            writer.push_str(&full_num);
+        };
+
+        Ok(())
+    }
+
+    fn format_signed(&self, writer: &mut String, value: i64) -> Result<()> {
+        let negative = value < 0;
+        let abs_val = value.abs();
+
+        let (sign_prefix, sign_suffix) = if negative && self.negative_in_parentheses {
+            ("(".to_owned(), ")".to_owned())
+        } else if negative {
+            ("-".to_owned(), "".to_owned())
+        } else if self.force_sign {
+            ("+".to_owned(), "".to_owned())
+        } else if self.space_sign {
+            (" ".to_owned(), "".to_owned())
+        } else {
+            ("".to_owned(), "".to_owned())
+        };
+
+        let mut mod_spec = *self;
+        mod_spec.width = match self.width {
+            NumericParam::Literal(w) => NumericParam::Literal(
+                w - sign_prefix.len() as i32 - sign_suffix.len() as i32,
+            ),
+            _ => NumericParam::FromArgument,
+        };
+        let mut formatted = String::new();
+        mod_spec.format_unsigned(&mut formatted, abs_val as u64)?;
+        // put the sign a after any leading spaces
+        let mut actual_number = &formatted[0..];
+        let mut leading_spaces = &formatted[0..0];
+        if let Some(first_non_space) = formatted.find(|c| c != ' ') {
+            actual_number = &formatted[first_non_space..];
+            leading_spaces = &formatted[0..first_non_space];
+        }
+        write!(
+            writer,
+            "{}{}{}{}",
+            leading_spaces.to_owned(),
+            sign_prefix,
+            actual_number,
+            sign_suffix
+        )
+        .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+        Ok(())
+    }
+
+    fn format_unsigned(&self, writer: &mut String, value: u64) -> Result<()> {
+        let mut s = String::new();
+        let mut alt_prefix = "";
+        match self.conversion_type {
+            ConversionType::DecInt => {
+                let num_str = format!("{value}");
+                if self.grouping_separator {
+                    // Add thousands separators
+                    let mut result = String::new();
+                    let chars: Vec<char> = num_str.chars().collect();
+                    for (i, c) in chars.iter().enumerate() {
+                        if i > 0 && (chars.len() - i).is_multiple_of(3) {
+                            result.push(',');
+                        }
+                        result.push(*c);
+                    }
+                    s = result;
+                } else {
+                    s = num_str;
+                }
+            }
+            ConversionType::HexIntLower => {
+                alt_prefix = "0x";
+                write!(&mut s, "{value:x}")
+                    .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+            }
+            ConversionType::HexIntUpper => {
+                alt_prefix = "0X";
+                write!(&mut s, "{value:X}")
+                    .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+            }
+            ConversionType::OctInt => {
+                alt_prefix = "0";
+                write!(&mut s, "{value:o}")
+                    .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+            }
+            _ => {
+                return exec_err!(
+                    "Invalid conversion type: {:?} for u64",
+                    self.conversion_type
+                );
+            }
+        }
+        let mut prefix = if self.alt_form {
+            alt_prefix.to_owned()
+        } else {
+            String::new()
+        };
+
+        let formatted = if let NumericParam::Literal(width) = self.width {
+            if self.left_adj {
+                let mut num_str = prefix + &s;
+                while num_str.len() < width as usize {
+                    num_str.push(' ');
+                }
+                num_str
+            } else if self.zero_pad {
+                while prefix.len() + s.len() < width as usize {
+                    prefix.push('0');
+                }
+                prefix + &s
+            } else {
+                let mut num_str = prefix + &s;
+                while num_str.len() < width as usize {
+                    num_str = " ".to_owned() + &num_str;
+                }
+                num_str
+            }
+        } else {
+            prefix + &s
+        };
+        write!(writer, "{formatted}")
+            .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+        Ok(())
+    }
+
+    fn format_str(&self, writer: &mut String, value: &str) -> Result<()> {
+        // Take care of precision, putting the truncated string in `content`
+        let precision: usize = match self.precision {
+            NumericParam::Literal(p) => p,
+            _ => i32::MAX,
+        }
+        .try_into()
+        .unwrap_or_default();
+        let content_len = {
+            let mut content_len = precision.min(value.len());
+            while !value.is_char_boundary(content_len) {
+                content_len -= 1;
+            }
+            content_len
+        };
+        let content = &value[..content_len];
+
+        // Pad to width if needed, putting the padded string in `s`
+
+        if let NumericParam::Literal(width) = self.width {
+            let start_len = writer.len();
+            if self.left_adj {
+                writer.push_str(content);
+                while writer.len() - start_len < width as usize {
+                    writer.push(' ');
+                }
+            } else {
+                while writer.len() - start_len + content.len() < width as usize {
+                    writer.push(' ');
+                }
+                writer.push_str(content);
+            }
+        } else {
+            writer.push_str(content);
+        }
+        Ok(())
+    }
+
+    fn format_string(&self, writer: &mut String, value: &str) -> Result<()> {
+        if self.conversion_type.is_upper() {
+            let upper = value.to_ascii_uppercase();
+            self.format_str(writer, &upper)
+        } else {
+            self.format_str(writer, value)
+        }
+    }
+
+    fn format_decimal(&self, writer: &mut String, value: &str, scale: i64) -> Result<()> {
+        let mut prefix = String::new();
+        let upper = self.conversion_type.is_upper();
+
+        // Parse as BigDecimal
+        let decimal = value
+            .parse::<BigInt>()
+            .map_err(|e| exec_datafusion_err!("Failed to parse decimal: {}", e))?;
+        let decimal = BigDecimal::from_bigint(decimal, scale);
+
+        // Handle sign
+        let is_negative = decimal.sign() == Sign::Minus;
+        let abs_decimal = decimal.abs();
+
+        if is_negative {
+            prefix.push('-');
+        } else if self.space_sign {
+            prefix.push(' ');
+        } else if self.force_sign {
+            prefix.push('+');
+        }
+
+        let exp_symb = if upper { 'E' } else { 'e' };
+        let mut strip_trailing_0s = false;
+
+        // Get precision setting
+        let mut precision = match self.precision {
+            NumericParam::Literal(p) => p,
+            _ => 6,
+        };
+
+        let number = match self.conversion_type {
+            ConversionType::DecFloatLower => {
+                // Format as fixed-point decimal
+                self.format_decimal_fixed(&abs_decimal, precision, strip_trailing_0s)?
+            }
+            ConversionType::SciFloatLower => self.format_decimal_scientific(
+                &abs_decimal,
+                precision,
+                'e',
+                strip_trailing_0s,
+            )?,
+            ConversionType::SciFloatUpper => self.format_decimal_scientific(
+                &abs_decimal,
+                precision,
+                'E',
+                strip_trailing_0s,
+            )?,
+            ConversionType::CompactFloatLower | ConversionType::CompactFloatUpper => {
+                strip_trailing_0s = true;
+                if precision == 0 {
+                    precision = 1;
+                }
+                // Determine if we should use scientific notation
+                let log10_val = abs_decimal.to_f64().map(|f| f.log10()).unwrap_or(0.0);
+                if log10_val < -4.0 || log10_val >= precision as f64 {
+                    self.format_decimal_scientific(
+                        &abs_decimal,
+                        precision - 1,
+                        exp_symb,
+                        strip_trailing_0s,
+                    )?
+                } else {
+                    self.format_decimal_fixed(
+                        &abs_decimal,
+                        precision - 1 - log10_val.floor() as i32,
+                        strip_trailing_0s,
+                    )?
+                }
+            }
+            _ => {
+                return exec_err!(
+                    "Invalid conversion type: {:?} for decimal",
+                    self.conversion_type
+                );
+            }
+        };
+
+        // Handle padding
+        let NumericParam::Literal(width) = self.width else {
+            writer.push_str(&prefix);
+            writer.push_str(&number);
+            return Ok(());
+        };
+
+        if self.left_adj {
+            let mut full_num = prefix + &number;
+            while full_num.len() < width as usize {
+                full_num.push(' ');
+            }
+            writer.push_str(&full_num);
+        } else if self.zero_pad {
+            while prefix.len() + number.len() < width as usize {
+                prefix.push('0');
+            }
+            writer.push_str(&prefix);
+            writer.push_str(&number);
+        } else {
+            let mut full_num = prefix + &number;
+            while full_num.len() < width as usize {
+                full_num = " ".to_owned() + &full_num;
+            }
+            writer.push_str(&full_num);
+        }
+
+        Ok(())
+    }
+
+    fn format_decimal_fixed(
+        &self,
+        decimal: &BigDecimal,
+        precision: i32,
+        strip_trailing_0s: bool,
+    ) -> Result<String> {
+        if precision <= 0 {
+            Ok(decimal.round(0).to_string())
+        } else {
+            // Use BigDecimal's with_scale method for precise decimal formatting
+            let scaled = decimal.round(precision as i64);
+            let mut number = scaled.to_string();
+            if strip_trailing_0s {
+                number = trim_trailing_0s(&number).to_owned();
+            }
+            Ok(number)
+        }
+    }
+
+    fn format_decimal_scientific(
+        &self,
+        decimal: &BigDecimal,
+        precision: i32,
+        exp_char: char,
+        strip_trailing_0s: bool,
+    ) -> Result<String> {
+        // Convert to f64 for scientific notation (may lose precision for very large numbers)
+        let float_val = decimal.to_f64().unwrap_or(0.0);
+        if float_val == 0.0 {
+            return Ok(format!("0{exp_char}+00"));
+        }
+
+        let abs_val = float_val.abs();
+        let exponent = abs_val.log10().floor() as i32;
+        let mantissa = abs_val / 10.0_f64.powf(exponent as f64);
+
+        let mut number = if precision <= 0 {
+            format!("{mantissa:.0}")
+        } else {
+            format!("{mantissa:.prec$}", prec = precision as usize)
+        };
+
+        if strip_trailing_0s {
+            number = trim_trailing_0s(&number).to_owned();
+        }
+
+        Ok(format!("{number}{exp_char}{exponent:+03}"))
+    }
+
+    fn format_time(
+        &self,
+        writer: &mut String,
+        timestamp_nanos: i64,
+        timezone: &Option<Arc<str>>,
+    ) -> Result<()> {
+        let upper = self.conversion_type.is_upper();
+        match &self.conversion_type {
+            ConversionType::TimeLower(time_format)
+            | ConversionType::TimeUpper(time_format) => {
+                let formatted =
+                    self.format_time_component(timestamp_nanos, *time_format, timezone)?;
+                let result = if upper {
+                    formatted.to_uppercase()
+                } else {
+                    formatted
+                };
+                write!(writer, "{result}")
+                    .map_err(|e| exec_datafusion_err!("Write error: {}", e))?;
+                Ok(())
+            }
+            _ => exec_err!(
+                "Invalid conversion type for time: {:?}",
+                self.conversion_type
+            ),
+        }
+    }
+
+    fn format_date(&self, writer: &mut String, date_days: i64) -> Result<()> {
+        // Convert days since epoch to timestamp in nanoseconds
+        let timestamp_nanos = date_days * 24 * 60 * 60 * 1_000_000_000;
+        self.format_time(writer, timestamp_nanos, &None)
+    }
+
+    fn format_time_component(
+        &self,
+        timestamp_nanos: i64,
+        time_format: TimeFormat,
+        _timezone: &Option<Arc<str>>,
+    ) -> Result<String> {
+        // Convert nanoseconds to seconds and nanoseconds remainder
+        let secs = timestamp_nanos / 1_000_000_000;
+        let nanos = (timestamp_nanos % 1_000_000_000) as u32;
+
+        // Create DateTime from timestamp
+        let dt = DateTime::<Utc>::from_timestamp(secs, nanos).ok_or_else(|| {
+            exec_datafusion_err!("Invalid timestamp: {}", timestamp_nanos)
+        })?;
+
+        match time_format {
+            TimeFormat::HUpper => Ok(format!("{:02}", dt.hour())),
+            TimeFormat::IUpper => {
+                let hour_12 = match dt.hour12() {
+                    (true, h) => h,  // PM
+                    (false, h) => h, // AM
+                };
+                Ok(format!("{hour_12:02}"))
+            }
+            TimeFormat::KLower => Ok(format!("{}", dt.hour())),
+            TimeFormat::LLower => {
+                let hour_12 = match dt.hour12() {
+                    (true, h) => h,  // PM
+                    (false, h) => h, // AM
+                };
+                Ok(format!("{hour_12}"))
+            }
+            TimeFormat::MUpper => Ok(format!("{:02}", dt.minute())),
+            TimeFormat::SUpper => Ok(format!("{:02}", dt.second())),
+            TimeFormat::LUpper => Ok(format!("{:03}", dt.timestamp_millis() % 1000)),
+            TimeFormat::NUpper => Ok(format!("{:09}", dt.nanosecond())),
+            TimeFormat::PLower => {
+                let (is_pm, _) = dt.hour12();
+                Ok(if is_pm {
+                    "pm".to_string()
+                } else {
+                    "am".to_string()
+                })
+            }
+            TimeFormat::ZLower => Ok("+0000".to_string()), // UTC timezone offset
+            TimeFormat::ZUpper => Ok("UTC".to_string()),   // UTC timezone name
+            TimeFormat::SLower => Ok(format!("{}", dt.timestamp())),
+            TimeFormat::QUpper => Ok(format!("{}", dt.timestamp_millis())),
+            TimeFormat::BUpper => Ok(dt.format("%B").to_string()), // Full month name
+            TimeFormat::BLower => Ok(dt.format("%b").to_string()), // Abbreviated month name
+            TimeFormat::AUpper => Ok(dt.format("%A").to_string()), // Full weekday name
+            TimeFormat::ALower => Ok(dt.format("%a").to_string()), // Abbreviated weekday name
+            TimeFormat::CUpper => Ok(format!("{:02}", dt.year() / 100)),
+            TimeFormat::YUpper => Ok(format!("{:04}", dt.year())),
+            TimeFormat::YLower => Ok(format!("{:02}", dt.year() % 100)),
+            TimeFormat::JLower => Ok(format!("{:03}", dt.ordinal())), // Day of year
+            TimeFormat::MLower => Ok(format!("{:02}", dt.month())),
+            TimeFormat::DLower => Ok(format!("{:02}", dt.day())),
+            TimeFormat::ELower => Ok(format!("{}", dt.day())),
+            TimeFormat::RUpper => Ok(dt.format("%H:%M").to_string()),
+            TimeFormat::TUpper => Ok(dt.format("%H:%M:%S").to_string()),
+            TimeFormat::RLower => {
+                let (is_pm, hour_12) = dt.hour12();
+                let am_pm = if is_pm { "PM" } else { "AM" };
+                Ok(format!(
+                    "{:02}:{:02}:{:02} {}",
+                    hour_12,
+                    dt.minute(),
+                    dt.second(),
+                    am_pm
+                ))
+            }
+            TimeFormat::DUpper => Ok(dt.format("%m/%d/%y").to_string()),
+            TimeFormat::FUpper => Ok(dt.format("%Y-%m-%d").to_string()),
+            TimeFormat::CLower => Ok(dt.format("%a %b %d %H:%M:%S UTC %Y").to_string()),
+        }
+    }
+}
+
+trait FloatFormattable: std::fmt::Display {
+    fn category(&self) -> FpCategory;
+
+    fn spark_string(&self) -> String {
+        match self.category() {
+            FpCategory::Nan => "NaN".to_string(),
+            FpCategory::Infinite => {
+                if self.negative() {
+                    "-Infinity".to_string()
+                } else {
+                    "Infinity".to_string()
+                }
+            }
+            _ => self.to_string(),
+        }
+    }
+    fn negative(&self) -> bool;
+}
+
+impl FloatFormattable for f32 {
+    fn category(&self) -> FpCategory {
+        self.classify()
+    }
+
+    fn negative(&self) -> bool {
+        self.is_sign_negative()
+    }
+}
+
+impl FloatFormattable for f64 {
+    fn category(&self) -> FpCategory {
+        self.classify()
+    }
+
+    fn negative(&self) -> bool {
+        self.is_sign_negative()
+    }
+}
+
+trait FloatBits: FloatFormattable {
+    const MANTISSA_BITS: u8;
+    const EXPONENT_BIAS: u16;
+    const SCALEUP_POWER: u8;
+    const SCALEUP: Self;
+
+    fn to_parts(&self) -> (bool, u16, u64);
+}
+
+impl FloatBits for f64 {
+    const MANTISSA_BITS: u8 = 52;
+    const EXPONENT_BIAS: u16 = 1023;
+    const SCALEUP_POWER: u8 = 54;
+    const SCALEUP: f64 = (1_i64 << Self::SCALEUP_POWER) as f64;
+
+    fn to_parts(&self) -> (bool, u16, u64) {
+        let bits = self.to_bits();
+        let sign: bool = (bits >> 63) == 1;
+        let exponent = ((bits >> 52) & 0x7FF) as u16;
+        let mantissa = bits & 0x000F_FFFF_FFFF_FFFF;
+        (sign, exponent, mantissa)
+    }
+}
+
+fn trim_trailing_0s(number: &str) -> &str {
+    if number.contains('.') {
+        for (i, c) in number.chars().rev().enumerate() {
+            if c != '0' {
+                return &number[..number.len() - i];
+            }
+        }
+    }
+    number
+}
+
+fn trim_trailing_0s_hex(number: &str) -> &str {
+    for (i, c) in number.chars().rev().enumerate() {
+        if c != '0' {
+            return &number[..number.len() - i];
+        }
+    }
+    number
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::DataType::Utf8;
+    use datafusion_common::Result;
+
+    #[test]
+    fn test_format_string_nullability() -> Result<()> {
+        let func = FormatStringFunc::new();
+        let nullable_format: FieldRef = Arc::new(Field::new("fmt", Utf8, true));
+
+        let out_nullable = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[nullable_format],
+            scalar_arguments: &[None],
+        })?;
+
+        assert!(
+            out_nullable.is_nullable(),
+            "format_string(fmt, ...) should be nullable when fmt is nullable"
+        );
+        let non_nullable_format: FieldRef = Arc::new(Field::new("fmt", Utf8, false));
+
+        let out_non_nullable = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[non_nullable_format],
+            scalar_arguments: &[None],
+        })?;
+
+        assert!(
+            !out_non_nullable.is_nullable(),
+            "format_string(fmt, ...) should NOT be nullable when fmt is NOT nullable"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/string/ilike.rs b/datafusion/spark/src/function/string/ilike.rs
new file mode 100644
index 0000000000000..0d90bd1694175
--- /dev/null
+++ b/datafusion/spark/src/function/string/ilike.rs
@@ -0,0 +1,251 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use arrow::compute::ilike;
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::ColumnarValue;
+use datafusion_expr::{
+    ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// ILIKE function for case-insensitive pattern matching
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#ilike>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkILike {
+    signature: Signature,
+}
+
+impl Default for SparkILike {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkILike {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::string(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkILike {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "ilike"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<Arc<Field>> {
+        // ILIKE returns a boolean value
+        // The result is nullable if any of the input arguments is nullable
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new("ilike", DataType::Boolean, nullable)))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_ilike, vec![])(&args.args)
+    }
+}
+
+/// Returns true if str matches pattern (case insensitive).
+pub fn spark_ilike(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 2 {
+        return exec_err!("ilike function requires exactly 2 arguments");
+    }
+
+    let result = ilike(&args[0], &args[1])?;
+    Ok(Arc::new(result))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, BooleanArray};
+    use arrow::datatypes::{DataType::Boolean, Field};
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarUDFImpl};
+
+    macro_rules! test_ilike_string_invoke {
+        ($INPUT1:expr, $INPUT2:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkILike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+
+            test_scalar_function!(
+                SparkILike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+
+            test_scalar_function!(
+                SparkILike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+        };
+    }
+
+    #[test]
+    fn test_ilike_invoke() -> Result<()> {
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("_park")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("_PARK")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("SPARK")),
+            Some(String::from("_park")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("sp%")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("SP%")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("%ARK")),
+            Ok(Some(true))
+        );
+        test_ilike_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("xyz")),
+            Ok(Some(false))
+        );
+        test_ilike_string_invoke!(None, Some(String::from("_park")), Ok(None));
+        test_ilike_string_invoke!(Some(String::from("Spark")), None, Ok(None));
+        test_ilike_string_invoke!(None, None, Ok(None));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ilike_nullability() {
+        let ilike = SparkILike::new();
+
+        // Test with non-nullable arguments
+        let non_nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, false));
+        let non_nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, false));
+
+        let result = ilike
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should not be nullable when both inputs are non-nullable
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &Boolean);
+
+        // Test with first argument nullable
+        let nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, true));
+
+        let result = ilike
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when first input is nullable
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &Boolean);
+
+        // Test with second argument nullable
+        let nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, true));
+
+        let result = ilike
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when second input is nullable
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &Boolean);
+
+        // Test with both arguments nullable
+        let result = ilike
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_field1), Arc::clone(&nullable_field2)],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when both inputs are nullable
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &Boolean);
+    }
+}
diff --git a/datafusion/spark/src/function/string/length.rs b/datafusion/spark/src/function/string/length.rs
new file mode 100644
index 0000000000000..078b294cac07d
--- /dev/null
+++ b/datafusion/spark/src/function/string/length.rs
@@ -0,0 +1,326 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, AsArray, BinaryArrayType, PrimitiveArray, StringArrayType,
+};
+use arrow::datatypes::{DataType, Field, FieldRef, Int32Type};
+use datafusion_common::exec_err;
+use datafusion_expr::{
+    ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use std::sync::Arc;
+
+/// Spark-compatible `length` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#length>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkLengthFunc {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkLengthFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkLengthFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::uniform(
+                1,
+                vec![
+                    DataType::Utf8View,
+                    DataType::Utf8,
+                    DataType::LargeUtf8,
+                    DataType::Binary,
+                    DataType::LargeBinary,
+                    DataType::BinaryView,
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec![
+                String::from("character_length"),
+                String::from("char_length"),
+                String::from("len"),
+            ],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkLengthFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "length"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _args: &[DataType]) -> datafusion_common::Result<DataType> {
+        datafusion_common::internal_err!(
+            "return_type should not be called, use return_field_from_args instead"
+        )
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> datafusion_common::Result<ColumnarValue> {
+        make_scalar_function(spark_length, vec![])(&args.args)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn return_field_from_args(
+        &self,
+        args: ReturnFieldArgs,
+    ) -> datafusion_common::Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        // spark length always returns Int32
+        Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable)))
+    }
+}
+
+fn spark_length(args: &[ArrayRef]) -> datafusion_common::Result<ArrayRef> {
+    match args[0].data_type() {
+        DataType::Utf8 => {
+            let string_array = args[0].as_string::<i32>();
+            character_length::<_>(&string_array)
+        }
+        DataType::LargeUtf8 => {
+            let string_array = args[0].as_string::<i64>();
+            character_length::<_>(&string_array)
+        }
+        DataType::Utf8View => {
+            let string_array = args[0].as_string_view();
+            character_length::<_>(&string_array)
+        }
+        DataType::Binary => {
+            let binary_array = args[0].as_binary::<i32>();
+            byte_length::<_>(&binary_array)
+        }
+        DataType::LargeBinary => {
+            let binary_array = args[0].as_binary::<i64>();
+            byte_length::<_>(&binary_array)
+        }
+        DataType::BinaryView => {
+            let binary_array = args[0].as_binary_view();
+            byte_length::<_>(&binary_array)
+        }
+        other => exec_err!("Unsupported data type {other:?} for function `length`"),
+    }
+}
+
+fn character_length<'a, V>(array: &V) -> datafusion_common::Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+{
+    // String characters are variable length encoded in UTF-8, counting the
+    // number of chars requires expensive decoding, however checking if the
+    // string is ASCII only is relatively cheap.
+    // If strings are ASCII only, count bytes instead.
+    let is_array_ascii_only = array.is_ascii();
+    let nulls = array.nulls().cloned();
+    let array = {
+        if is_array_ascii_only {
+            let values: Vec<_> = (0..array.len())
+                .map(|i| {
+                    // Safety: we are iterating with array.len() so the index is always valid
+                    let value = unsafe { array.value_unchecked(i) };
+                    value.len() as i32
+                })
+                .collect();
+            PrimitiveArray::<Int32Type>::new(values.into(), nulls)
+        } else {
+            let values: Vec<_> = (0..array.len())
+                .map(|i| {
+                    // Safety: we are iterating with array.len() so the index is always valid
+                    if array.is_null(i) {
+                        i32::default()
+                    } else {
+                        let value = unsafe { array.value_unchecked(i) };
+                        if value.is_empty() {
+                            i32::default()
+                        } else if value.is_ascii() {
+                            value.len() as i32
+                        } else {
+                            value.chars().count() as i32
+                        }
+                    }
+                })
+                .collect();
+            PrimitiveArray::<Int32Type>::new(values.into(), nulls)
+        }
+    };
+
+    Ok(Arc::new(array))
+}
+
+fn byte_length<'a, V>(array: &V) -> datafusion_common::Result<ArrayRef>
+where
+    V: BinaryArrayType<'a>,
+{
+    let nulls = array.nulls().cloned();
+    let values: Vec<_> = (0..array.len())
+        .map(|i| {
+            // Safety: we are iterating with array.len() so the index is always valid
+            let value = unsafe { array.value_unchecked(i) };
+            value.len() as i32
+        })
+        .collect();
+    Ok(Arc::new(PrimitiveArray::<Int32Type>::new(
+        values.into(),
+        nulls,
+    )))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Int32Array};
+    use arrow::datatypes::DataType::Int32;
+    use arrow::datatypes::{Field, FieldRef};
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarUDFImpl};
+
+    macro_rules! test_spark_length_string {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+        };
+    }
+
+    macro_rules! test_spark_length_binary {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Binary($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::LargeBinary($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_scalar_function!(
+                SparkLengthFunc::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::BinaryView($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+        };
+    }
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_spark_length_string!(Some(String::from("chars")), Ok(Some(5)));
+        test_spark_length_string!(Some(String::from("josé")), Ok(Some(4)));
+        // test long strings (more than 12 bytes for StringView)
+        test_spark_length_string!(Some(String::from("joséjoséjoséjosé")), Ok(Some(16)));
+        test_spark_length_string!(Some(String::from("")), Ok(Some(0)));
+        test_spark_length_string!(None, Ok(None));
+
+        test_spark_length_binary!(Some(String::from("chars").into_bytes()), Ok(Some(5)));
+        test_spark_length_binary!(Some(String::from("josé").into_bytes()), Ok(Some(5)));
+        // test long strings (more than 12 bytes for BinaryView)
+        test_spark_length_binary!(
+            Some(String::from("joséjoséjoséjosé").into_bytes()),
+            Ok(Some(20))
+        );
+        test_spark_length_binary!(Some(String::from("").into_bytes()), Ok(Some(0)));
+        test_spark_length_binary!(None, Ok(None));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_length_nullability() -> Result<()> {
+        let func = SparkLengthFunc::new();
+
+        let nullable_field: FieldRef = Arc::new(Field::new("col", DataType::Utf8, true));
+
+        let out_nullable = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[nullable_field],
+            scalar_arguments: &[None],
+        })?;
+
+        assert!(
+            out_nullable.is_nullable(),
+            "length(col) should be nullable when child is nullable"
+        );
+
+        let non_nullable_field: FieldRef =
+            Arc::new(Field::new("col", DataType::Utf8, false));
+
+        let out_non_nullable = func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[non_nullable_field],
+            scalar_arguments: &[None],
+        })?;
+
+        assert!(
+            !out_non_nullable.is_nullable(),
+            "length(col) should NOT be nullable when child is NOT nullable"
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/string/like.rs b/datafusion/spark/src/function/string/like.rs
new file mode 100644
index 0000000000000..ffa1737023b61
--- /dev/null
+++ b/datafusion/spark/src/function/string/like.rs
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrayRef;
+use arrow::compute::like;
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::{Result, exec_err, internal_err};
+use datafusion_expr::ColumnarValue;
+use datafusion_expr::{
+    ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// LIKE function for case-sensitive pattern matching
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#like>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkLike {
+    signature: Signature,
+}
+
+impl Default for SparkLike {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkLike {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::string(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkLike {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "like"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+        Ok(Arc::new(Field::new(
+            self.name(),
+            DataType::Boolean,
+            nullable,
+        )))
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_like, vec![])(&args.args)
+    }
+}
+
+/// Returns true if str matches pattern (case sensitive).
+pub fn spark_like(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 2 {
+        return exec_err!("like function requires exactly 2 arguments");
+    }
+
+    let result = like(&args[0], &args[1])?;
+    Ok(Arc::new(result))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, BooleanArray};
+    use arrow::datatypes::{DataType::Boolean, Field};
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarUDFImpl};
+
+    macro_rules! test_like_string_invoke {
+        ($INPUT1:expr, $INPUT2:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkLike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+
+            test_scalar_function!(
+                SparkLike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+
+            test_scalar_function!(
+                SparkLike::new(),
+                vec![
+                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT1)),
+                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT2))
+                ],
+                $EXPECTED,
+                bool,
+                Boolean,
+                BooleanArray
+            );
+        };
+    }
+
+    #[test]
+    fn test_like_invoke() -> Result<()> {
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("_park")),
+            Ok(Some(true))
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("_PARK")),
+            Ok(Some(false)) // case-sensitive
+        );
+        test_like_string_invoke!(
+            Some(String::from("SPARK")),
+            Some(String::from("_park")),
+            Ok(Some(false)) // case-sensitive
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("Sp%")),
+            Ok(Some(true))
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("SP%")),
+            Ok(Some(false)) // case-sensitive
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("%ark")),
+            Ok(Some(true))
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("%ARK")),
+            Ok(Some(false)) // case-sensitive
+        );
+        test_like_string_invoke!(
+            Some(String::from("Spark")),
+            Some(String::from("xyz")),
+            Ok(Some(false))
+        );
+        test_like_string_invoke!(None, Some(String::from("_park")), Ok(None));
+        test_like_string_invoke!(Some(String::from("Spark")), None, Ok(None));
+        test_like_string_invoke!(None, None, Ok(None));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_like_nullability() {
+        let like = SparkLike::new();
+
+        // Test with non-nullable arguments
+        let non_nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, false));
+        let non_nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, false));
+
+        let both_non_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should not be nullable when both inputs are non-nullable
+        assert!(!both_non_nullable.is_nullable());
+        assert_eq!(both_non_nullable.data_type(), &Boolean);
+
+        // Test with first argument nullable
+        let nullable_field1 = Arc::new(Field::new("str", DataType::Utf8, true));
+
+        let first_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&nullable_field1),
+                    Arc::clone(&non_nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when first input is nullable
+        assert!(first_nullable.is_nullable());
+        assert_eq!(first_nullable.data_type(), &Boolean);
+
+        // Test with second argument nullable
+        let nullable_field2 = Arc::new(Field::new("pattern", DataType::Utf8, true));
+
+        let second_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[
+                    Arc::clone(&non_nullable_field1),
+                    Arc::clone(&nullable_field2),
+                ],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when second input is nullable
+        assert!(second_nullable.is_nullable());
+        assert_eq!(second_nullable.data_type(), &Boolean);
+
+        // Test with both arguments nullable
+        let first_second_nullable = like
+            .return_field_from_args(ReturnFieldArgs {
+                arg_fields: &[Arc::clone(&nullable_field1), Arc::clone(&nullable_field2)],
+                scalar_arguments: &[None, None],
+            })
+            .unwrap();
+
+        // The result should be nullable when both inputs are nullable
+        assert!(first_second_nullable.is_nullable());
+        assert_eq!(first_second_nullable.data_type(), &Boolean);
+    }
+}
diff --git a/datafusion/spark/src/function/string/luhn_check.rs b/datafusion/spark/src/function/string/luhn_check.rs
new file mode 100644
index 0000000000000..dffd4fe0ae7e2
--- /dev/null
+++ b/datafusion/spark/src/function/string/luhn_check.rs
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{any::Any, sync::Arc};
+
+use arrow::array::{Array, AsArray, BooleanArray};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::Boolean;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+
+/// Spark-compatible `luhn_check` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#luhn_check>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkLuhnCheck {
+    signature: Signature,
+}
+
+impl Default for SparkLuhnCheck {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkLuhnCheck {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![DataType::Utf8]),
+                    TypeSignature::Exact(vec![DataType::Utf8View]),
+                    TypeSignature::Exact(vec![DataType::LargeUtf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkLuhnCheck {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "luhn_check"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Boolean)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let [array] = take_function_args(self.name(), &args.args)?;
+
+        match array {
+            ColumnarValue::Array(array) => match array.data_type() {
+                DataType::Utf8View => {
+                    let str_array = array.as_string_view();
+                    let values = str_array
+                        .iter()
+                        .map(|s| s.map(luhn_check_impl))
+                        .collect::<BooleanArray>();
+                    Ok(ColumnarValue::Array(Arc::new(values)))
+                }
+                DataType::Utf8 => {
+                    let str_array = array.as_string::<i32>();
+                    let values = str_array
+                        .iter()
+                        .map(|s| s.map(luhn_check_impl))
+                        .collect::<BooleanArray>();
+                    Ok(ColumnarValue::Array(Arc::new(values)))
+                }
+                DataType::LargeUtf8 => {
+                    let str_array = array.as_string::<i64>();
+                    let values = str_array
+                        .iter()
+                        .map(|s| s.map(luhn_check_impl))
+                        .collect::<BooleanArray>();
+                    Ok(ColumnarValue::Array(Arc::new(values)))
+                }
+                other => {
+                    exec_err!("Unsupported data type {other:?} for function `luhn_check`")
+                }
+            },
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(s)))
+            | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(s)))
+            | ColumnarValue::Scalar(ScalarValue::Utf8View(Some(s))) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Boolean(Some(luhn_check_impl(s)))),
+            ),
+            ColumnarValue::Scalar(ScalarValue::Utf8(None))
+            | ColumnarValue::Scalar(ScalarValue::LargeUtf8(None))
+            | ColumnarValue::Scalar(ScalarValue::Utf8View(None)) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)))
+            }
+            other => {
+                exec_err!("Unsupported data type {other:?} for function `luhn_check`")
+            }
+        }
+    }
+}
+
+/// Validates a string using the Luhn algorithm.
+/// Returns `true` if the input is a valid Luhn number.
+fn luhn_check_impl(input: &str) -> bool {
+    let mut sum = 0u32;
+    let mut alt = false;
+    let mut digits_processed = 0;
+
+    for b in input.as_bytes().iter().rev() {
+        let digit = match b {
+            b'0'..=b'9' => {
+                digits_processed += 1;
+                b - b'0'
+            }
+            _ => return false,
+        };
+
+        let mut val = digit as u32;
+        if alt {
+            val *= 2;
+            if val > 9 {
+                val -= 9;
+            }
+        }
+        sum += val;
+        alt = !alt;
+    }
+
+    digits_processed > 0 && sum.is_multiple_of(10)
+}
diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs
index 9d5fabe832e92..8859beca77996 100644
--- a/datafusion/spark/src/function/string/mod.rs
+++ b/datafusion/spark/src/function/string/mod.rs
@@ -16,14 +16,35 @@
 // under the License.
 
 pub mod ascii;
+pub mod base64;
 pub mod char;
+pub mod concat;
+pub mod elt;
+pub mod format_string;
+pub mod ilike;
+pub mod length;
+pub mod like;
+pub mod luhn_check;
+pub mod space;
+pub mod substring;
 
 use datafusion_expr::ScalarUDF;
 use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
 make_udf_function!(ascii::SparkAscii, ascii);
-make_udf_function!(char::SparkChar, char);
+make_udf_function!(base64::SparkBase64, base64);
+make_udf_function!(char::CharFunc, char);
+make_udf_function!(concat::SparkConcat, concat);
+make_udf_function!(ilike::SparkILike, ilike);
+make_udf_function!(length::SparkLengthFunc, length);
+make_udf_function!(elt::SparkElt, elt);
+make_udf_function!(like::SparkLike, like);
+make_udf_function!(luhn_check::SparkLuhnCheck, luhn_check);
+make_udf_function!(format_string::FormatStringFunc, format_string);
+make_udf_function!(space::SparkSpace, space);
+make_udf_function!(substring::SparkSubstring, substring);
+make_udf_function!(base64::SparkUnBase64, unbase64);
 
 pub mod expr_fn {
     use datafusion_functions::export_functions;
@@ -33,13 +54,78 @@ pub mod expr_fn {
         "Returns the ASCII code point of the first character of string.",
         arg1
     ));
+    export_functions!((
+        base64,
+        "Encodes the input binary `bin` into a base64 string.",
+        bin
+    ));
     export_functions!((
         char,
         "Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256).",
         arg1
     ));
+    export_functions!((
+        concat,
+        "Concatenates multiple input strings into a single string. Returns NULL if any input is NULL.",
+        args
+    ));
+    export_functions!((
+        elt,
+        "Returns the n-th input (1-indexed), e.g. returns 2nd input when n is 2. The function returns NULL if the index is 0 or exceeds the length of the array.",
+        select_col arg1 arg2 argn
+    ));
+    export_functions!((
+        ilike,
+        "Returns true if str matches pattern (case insensitive).",
+        str pattern
+    ));
+    export_functions!((
+        length,
+        "Returns the character length of string data or number of bytes of binary data. The length of string data includes the trailing spaces. The length of binary data includes binary zeros.",
+        arg1
+    ));
+    export_functions!((
+        like,
+        "Returns true if str matches pattern (case sensitive).",
+        str pattern
+    ));
+    export_functions!((
+        luhn_check,
+        "Returns whether the input string of digits is valid according to the Luhn algorithm.",
+        arg1
+    ));
+    export_functions!((
+        format_string,
+        "Returns a formatted string from printf-style format strings.",
+        strfmt args
+    ));
+    export_functions!((space, "Returns a string consisting of n spaces.", arg1));
+    export_functions!((
+        substring,
+        "Returns the substring from string `str` starting at position `pos` with length `length.",
+        str pos length
+    ));
+    export_functions!((
+        unbase64,
+        "Decodes the input string `str` from a base64 string into binary data.",
+        str
+    ));
 }
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![ascii(), char()]
+    vec![
+        ascii(),
+        base64(),
+        char(),
+        concat(),
+        elt(),
+        ilike(),
+        length(),
+        like(),
+        luhn_check(),
+        format_string(),
+        space(),
+        substring(),
+        unbase64(),
+    ]
 }
diff --git a/datafusion/spark/src/function/string/space.rs b/datafusion/spark/src/function/string/space.rs
new file mode 100644
index 0000000000000..77daff28ff1a1
--- /dev/null
+++ b/datafusion/spark/src/function/string/space.rs
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayRef, DictionaryArray, Int32Array, StringArray, StringBuilder,
+    as_dictionary_array,
+};
+use arrow::datatypes::{DataType, Int32Type};
+use datafusion_common::cast::as_int32_array;
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `space` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#space>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSpace {
+    signature: Signature,
+}
+
+impl Default for SparkSpace {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSpace {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::uniform(
+                1,
+                vec![
+                    DataType::Int32,
+                    DataType::Dictionary(
+                        Box::new(DataType::Int32),
+                        Box::new(DataType::Int32),
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSpace {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "space"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
+        let return_type = match &args[0] {
+            DataType::Dictionary(key_type, _) => {
+                DataType::Dictionary(key_type.clone(), Box::new(DataType::Utf8))
+            }
+            _ => DataType::Utf8,
+        };
+        Ok(return_type)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        spark_space(&args.args)
+    }
+}
+
+pub fn spark_space(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    if args.len() != 1 {
+        return exec_err!("space function takes exactly one argument");
+    }
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let result = spark_space_array(array)?;
+            Ok(ColumnarValue::Array(result))
+        }
+        ColumnarValue::Scalar(scalar) => {
+            let result = spark_space_scalar(scalar)?;
+            Ok(ColumnarValue::Scalar(result))
+        }
+    }
+}
+
+fn spark_space_array(array: &ArrayRef) -> Result<ArrayRef> {
+    match array.data_type() {
+        DataType::Int32 => {
+            let array = as_int32_array(array)?;
+            Ok(Arc::new(spark_space_array_inner(array)))
+        }
+        DataType::Dictionary(_, _) => {
+            let dict = as_dictionary_array::<Int32Type>(array);
+            let values = spark_space_array(dict.values())?;
+            let result = DictionaryArray::try_new(dict.keys().clone(), values)?;
+            Ok(Arc::new(result))
+        }
+        other => {
+            exec_err!("Unsupported data type {other:?} for function `space`")
+        }
+    }
+}
+
+fn spark_space_scalar(scalar: &ScalarValue) -> Result<ScalarValue> {
+    match scalar {
+        ScalarValue::Int32(value) => {
+            let result = value.map(|v| {
+                if v <= 0 {
+                    String::new()
+                } else {
+                    " ".repeat(v as usize)
+                }
+            });
+            Ok(ScalarValue::Utf8(result))
+        }
+        other => {
+            exec_err!("Unsupported data type {other:?} for function `space`")
+        }
+    }
+}
+
+fn spark_space_array_inner(array: &Int32Array) -> StringArray {
+    let mut builder = StringBuilder::with_capacity(array.len(), array.len() * 16);
+    let mut space_buf = String::new();
+    for value in array.iter() {
+        match value {
+            None => builder.append_null(),
+            Some(l) if l > 0 => {
+                let l = l as usize;
+                if space_buf.len() < l {
+                    space_buf = " ".repeat(l);
+                }
+                builder.append_value(&space_buf[..l]);
+            }
+            Some(_) => builder.append_value(""),
+        }
+    }
+    builder.finish()
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::function::string::space::spark_space;
+    use arrow::array::{Array, Int32Array, Int32DictionaryArray};
+    use arrow::datatypes::Int32Type;
+    use datafusion_common::cast::{as_dictionary_array, as_string_array};
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::ColumnarValue;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_spark_space_int32_array() -> Result<()> {
+        let int32_array = ColumnarValue::Array(Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(-3),
+            Some(0),
+            Some(5),
+            None,
+        ])));
+        let ColumnarValue::Array(result) = spark_space(&[int32_array])? else {
+            unreachable!()
+        };
+        let result = as_string_array(&result)?;
+
+        assert_eq!(result.value(0), " ");
+        assert_eq!(result.value(1), "");
+        assert_eq!(result.value(2), "");
+        assert_eq!(result.value(3), "     ");
+        assert!(result.is_null(4));
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_space_dictionary() -> Result<()> {
+        let dictionary = ColumnarValue::Array(Arc::new(Int32DictionaryArray::new(
+            Int32Array::from(vec![0, 1, 2, 3, 4]),
+            Arc::new(Int32Array::from(vec![
+                Some(1),
+                Some(-3),
+                Some(0),
+                Some(5),
+                None,
+            ])),
+        )));
+        let ColumnarValue::Array(result) = spark_space(&[dictionary])? else {
+            unreachable!()
+        };
+        let result =
+            as_string_array(as_dictionary_array::<Int32Type>(&result)?.values())?;
+        assert_eq!(result.value(0), " ");
+        assert_eq!(result.value(1), "");
+        assert_eq!(result.value(2), "");
+        assert_eq!(result.value(3), "     ");
+        assert!(result.is_null(4));
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_space_scalar() -> Result<()> {
+        let scalar = ColumnarValue::Scalar(ScalarValue::Int32(Some(-5)));
+        let ColumnarValue::Scalar(result) = spark_space(&[scalar])? else {
+            unreachable!()
+        };
+        match result {
+            ScalarValue::Utf8(Some(result)) => {
+                assert_eq!(result, "");
+            }
+            _ => unreachable!(),
+        }
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/string/substring.rs b/datafusion/spark/src/function/string/substring.rs
new file mode 100644
index 0000000000000..524262b12f193
--- /dev/null
+++ b/datafusion/spark/src/function/string/substring.rs
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    Array, ArrayBuilder, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
+    OffsetSizeTrait, StringArrayType, StringViewBuilder,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::arrow::datatypes::{Field, FieldRef};
+use datafusion_common::cast::as_int64_array;
+use datafusion_common::types::{
+    NativeType, logical_int32, logical_int64, logical_string,
+};
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{Coercion, ReturnFieldArgs, TypeSignatureClass};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use datafusion_functions::unicode::substr::{enable_ascii_fast_path, get_true_start_end};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `substring` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#substring>
+///
+/// Returns the substring from string starting at position pos with length len.
+/// Position is 1-indexed. If pos is negative, it counts from the end of the string.
+/// Returns NULL if any input is NULL.
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSubstring {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkSubstring {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkSubstring {
+    pub fn new() -> Self {
+        let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
+        let int64 = Coercion::new_implicit(
+            TypeSignatureClass::Native(logical_int64()),
+            vec![TypeSignatureClass::Native(logical_int32())],
+            NativeType::Int64,
+        );
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![string.clone(), int64.clone()]),
+                    TypeSignature::Coercible(vec![
+                        string.clone(),
+                        int64.clone(),
+                        int64.clone(),
+                    ]),
+                ],
+                Volatility::Immutable,
+            )
+            .with_parameter_names(vec![
+                "str".to_string(),
+                "pos".to_string(),
+                "length".to_string(),
+            ])
+            .expect("valid parameter names"),
+            aliases: vec![String::from("substr")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkSubstring {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "substring"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(spark_substring, vec![])(&args.args)
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        datafusion_common::internal_err!(
+            "return_type should not be called for Spark substring"
+        )
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result<FieldRef> {
+        // Spark semantics: substring returns NULL if ANY input is NULL
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
+
+        Ok(Arc::new(Field::new(
+            "substring",
+            args.arg_fields[0].data_type().clone(),
+            nullable,
+        )))
+    }
+}
+
+fn spark_substring(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let start_array = as_int64_array(&args[1])?;
+    let length_array = if args.len() > 2 {
+        Some(as_int64_array(&args[2])?)
+    } else {
+        None
+    };
+
+    match args[0].data_type() {
+        DataType::Utf8 => spark_substring_impl(
+            &args[0].as_string::<i32>(),
+            start_array,
+            length_array,
+            GenericStringBuilder::<i32>::new(),
+        ),
+        DataType::LargeUtf8 => spark_substring_impl(
+            &args[0].as_string::<i64>(),
+            start_array,
+            length_array,
+            GenericStringBuilder::<i64>::new(),
+        ),
+        DataType::Utf8View => spark_substring_impl(
+            &args[0].as_string_view(),
+            start_array,
+            length_array,
+            StringViewBuilder::new(),
+        ),
+        other => exec_err!(
+            "Unsupported data type {other:?} for function spark_substring, expected Utf8View, Utf8 or LargeUtf8."
+        ),
+    }
+}
+
+/// Convert Spark's start position to DataFusion's 1-based start position.
+///
+/// Spark semantics:
+/// - Positive start: 1-based index from beginning
+/// - Zero start: treated as 1
+/// - Negative start: counts from end of string
+///
+/// Returns the converted 1-based start position for use with `get_true_start_end`.
+#[inline]
+fn spark_start_to_datafusion_start(start: i64, len: usize) -> i64 {
+    if start >= 0 {
+        start.max(1)
+    } else {
+        let len_i64 = i64::try_from(len).unwrap_or(i64::MAX);
+        let start = start.saturating_add(len_i64).saturating_add(1);
+        start.max(1)
+    }
+}
+
+trait StringArrayBuilder: ArrayBuilder {
+    fn append_value(&mut self, val: &str);
+    fn append_null(&mut self);
+}
+
+impl<O: OffsetSizeTrait> StringArrayBuilder for GenericStringBuilder<O> {
+    fn append_value(&mut self, val: &str) {
+        GenericStringBuilder::append_value(self, val);
+    }
+    fn append_null(&mut self) {
+        GenericStringBuilder::append_null(self);
+    }
+}
+
+impl StringArrayBuilder for StringViewBuilder {
+    fn append_value(&mut self, val: &str) {
+        StringViewBuilder::append_value(self, val);
+    }
+    fn append_null(&mut self) {
+        StringViewBuilder::append_null(self);
+    }
+}
+
+fn spark_substring_impl<'a, V, B>(
+    string_array: &V,
+    start_array: &Int64Array,
+    length_array: Option<&Int64Array>,
+    mut builder: B,
+) -> Result<ArrayRef>
+where
+    V: StringArrayType<'a>,
+    B: StringArrayBuilder,
+{
+    let is_ascii = enable_ascii_fast_path(string_array, start_array, length_array);
+
+    for i in 0..string_array.len() {
+        if string_array.is_null(i) || start_array.is_null(i) {
+            builder.append_null();
+            continue;
+        }
+
+        if let Some(len_arr) = length_array
+            && len_arr.is_null(i)
+        {
+            builder.append_null();
+            continue;
+        }
+
+        let string = string_array.value(i);
+        let start = start_array.value(i);
+        let len_opt = length_array.map(|arr| arr.value(i));
+
+        // Spark: negative length returns empty string
+        if let Some(len) = len_opt
+            && len < 0
+        {
+            builder.append_value("");
+            continue;
+        }
+
+        let string_len = if is_ascii {
+            string.len()
+        } else {
+            string.chars().count()
+        };
+
+        let adjusted_start = spark_start_to_datafusion_start(start, string_len);
+
+        let (byte_start, byte_end) = get_true_start_end(
+            string,
+            adjusted_start,
+            len_opt.map(|l| l as u64),
+            is_ascii,
+        );
+        let substr = &string[byte_start..byte_end];
+        builder.append_value(substr);
+    }
+
+    Ok(builder.finish())
+}
diff --git a/datafusion/spark/src/function/url/mod.rs b/datafusion/spark/src/function/url/mod.rs
index a87df9a2c87a0..1313edaed5347 100644
--- a/datafusion/spark/src/function/url/mod.rs
+++ b/datafusion/spark/src/function/url/mod.rs
@@ -16,10 +16,57 @@
 // under the License.
 
 use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
 use std::sync::Arc;
 
-pub mod expr_fn {}
+pub mod parse_url;
+pub mod try_parse_url;
+pub mod try_url_decode;
+pub mod url_decode;
+pub mod url_encode;
+
+make_udf_function!(parse_url::ParseUrl, parse_url);
+make_udf_function!(try_parse_url::TryParseUrl, try_parse_url);
+make_udf_function!(try_url_decode::TryUrlDecode, try_url_decode);
+make_udf_function!(url_decode::UrlDecode, url_decode);
+make_udf_function!(url_encode::UrlEncode, url_encode);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        parse_url,
+        "Extracts a part from a URL, throwing an error if an invalid URL is provided.",
+        args
+    ));
+    export_functions!((
+        try_parse_url,
+        "Same as parse_url but returns NULL if an invalid URL is provided.",
+        args
+    ));
+    export_functions!((
+        url_decode,
+        "Decodes a URL-encoded string in ‘application/x-www-form-urlencoded’ format to its original format.",
+        args
+    ));
+    export_functions!((
+        try_url_decode,
+        "Same as url_decode but returns NULL if an invalid URL-encoded string is provided",
+        args
+    ));
+    export_functions!((
+        url_encode,
+        "Encodes a string into a URL-encoded string in ‘application/x-www-form-urlencoded’ format.",
+        args
+    ));
+}
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
-    vec![]
+    vec![
+        parse_url(),
+        try_parse_url(),
+        try_url_decode(),
+        url_decode(),
+        url_encode(),
+    ]
 }
diff --git a/datafusion/spark/src/function/url/parse_url.rs b/datafusion/spark/src/function/url/parse_url.rs
new file mode 100644
index 0000000000000..50591fb25e9d1
--- /dev/null
+++ b/datafusion/spark/src/function/url/parse_url.rs
@@ -0,0 +1,556 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, GenericStringBuilder, LargeStringArray, StringArray,
+    StringArrayType, StringViewArray,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{Result, exec_datafusion_err, exec_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use url::{ParseError, Url};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct ParseUrl {
+    signature: Signature,
+}
+
+impl Default for ParseUrl {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ParseUrl {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![TypeSignature::String(2), TypeSignature::String(3)],
+                Volatility::Immutable,
+            ),
+        }
+    }
+    /// Parses a URL and extracts the specified component.
+    ///
+    /// This function takes a URL string and extracts different parts of it based on the
+    /// `part` parameter. For query parameters, an optional `key` can be specified to
+    /// extract a specific query parameter value.
+    ///
+    /// # Arguments
+    ///
+    /// * `value` - The URL string to parse
+    /// * `part` - The component of the URL to extract. Valid values are:
+    ///   - `"HOST"` - The hostname (e.g., "example.com")
+    ///   - `"PATH"` - The path portion (e.g., "/path/to/resource")
+    ///   - `"QUERY"` - The query string or a specific query parameter
+    ///   - `"REF"` - The fragment/anchor (the part after #)
+    ///   - `"PROTOCOL"` - The URL scheme (e.g., "https", "http")
+    ///   - `"FILE"` - The path with query string (e.g., "/path?query=value")
+    ///   - `"AUTHORITY"` - The authority component (host:port)
+    ///   - `"USERINFO"` - The user information (username:password)
+    /// * `key` - Optional parameter used only with `"QUERY"`. When provided, extracts
+    ///   the value of the specific query parameter with this key name.
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(Some(String))` - The extracted URL component as a string
+    /// * `Ok(None)` - If the requested component doesn't exist or is empty
+    /// * `Err(DataFusionError)` - If the URL is malformed and cannot be parsed
+    fn parse(value: &str, part: &str, key: Option<&str>) -> Result<Option<String>> {
+        let url: std::result::Result<Url, ParseError> = Url::parse(value);
+        if let Err(ParseError::RelativeUrlWithoutBase) = url {
+            return if !value.contains("://") {
+                // Schemeless URLs are treated as relative URIs (like java.net.URI).
+                // Manually parse path, query, and fragment components.
+                let (without_fragment, fragment) = match value.split_once('#') {
+                    Some((before, frag)) => (before, Some(frag)),
+                    None => (value, None),
+                };
+                let (path, query) = match without_fragment.split_once('?') {
+                    Some((p, q)) => (p, Some(q)),
+                    None => (without_fragment, None),
+                };
+                Ok(match part {
+                    "PATH" => Some(path.to_string()),
+                    "QUERY" => match key {
+                        None => query.map(String::from),
+                        Some(key) => query.and_then(|q| {
+                            q.split('&')
+                                .filter_map(|pair| pair.split_once('='))
+                                .find(|(k, _)| *k == key)
+                                .map(|(_, v)| v.to_string())
+                        }),
+                    },
+                    "REF" => fragment.map(String::from),
+                    "FILE" => {
+                        // FILE = path + query (without fragment)
+                        Some(without_fragment.to_string())
+                    }
+                    // HOST, PROTOCOL, AUTHORITY, USERINFO → NULL
+                    _ => None,
+                })
+            } else {
+                Err(exec_datafusion_err!(
+                    "The url is invalid: {value}. Use `try_parse_url` to tolerate invalid URL and return NULL instead. SQLSTATE: 22P02"
+                ))
+            };
+        };
+        url.map_err(|e| exec_datafusion_err!("{e:?}"))
+            .map(|url| match part {
+                "HOST" => url.host_str().map(String::from),
+                "PATH" => {
+                    let path: String = url.path().to_string();
+                    let path: String = if path == "/" { "".to_string() } else { path };
+                    Some(path)
+                }
+                "QUERY" => match key {
+                    None => url.query().map(String::from),
+                    Some(key) => url
+                        .query_pairs()
+                        .find(|(k, _)| k == key)
+                        .map(|(_, v)| v.into_owned()),
+                },
+                "REF" => url.fragment().map(String::from),
+                "PROTOCOL" => Some(url.scheme().to_string()),
+                "FILE" => {
+                    let path = url.path();
+                    match url.query() {
+                        Some(query) => Some(format!("{path}?{query}")),
+                        None => Some(path.to_string()),
+                    }
+                }
+                "AUTHORITY" => Some(url.authority().to_string()),
+                "USERINFO" => {
+                    let username = url.username();
+                    if username.is_empty() {
+                        return None;
+                    }
+                    match url.password() {
+                        Some(password) => Some(format!("{username}:{password}")),
+                        None => Some(username.to_string()),
+                    }
+                }
+                _ => None,
+            })
+    }
+}
+
+impl ScalarUDFImpl for ParseUrl {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "parse_url"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(spark_parse_url, vec![])(&args)
+    }
+}
+
+/// Core implementation of URL parsing function.
+///
+/// # Arguments
+///
+/// * `args` - A slice of ArrayRef containing the input arrays:
+///   - `args[0]` - URL array: The URLs to parse
+///   - `args[1]` - Part array: The URL components to extract (HOST, PATH, QUERY, etc.)
+///   - `args[2]` - Key array (optional): For QUERY part, the specific parameter names to extract
+///
+/// # Return Value
+///
+/// Returns `Result<ArrayRef>` containing:
+/// - A string array with extracted URL components
+/// - `None` values where extraction failed or component doesn't exist
+/// - The output array type (StringArray or LargeStringArray) is determined by input types
+fn spark_parse_url(args: &[ArrayRef]) -> Result<ArrayRef> {
+    spark_handled_parse_url(args, |x| x)
+}
+
+pub fn spark_handled_parse_url(
+    args: &[ArrayRef],
+    handler_err: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+) -> Result<ArrayRef> {
+    if args.len() < 2 || args.len() > 3 {
+        return exec_err!(
+            "{} expects 2 or 3 arguments, but got {}",
+            "`parse_url`",
+            args.len()
+        );
+    }
+    // Required arguments
+    let url = &args[0];
+    let part = &args[1];
+
+    if args.len() == 3 {
+        // In this case, the 'key' argument is passed
+        let key = &args[2];
+
+        match (url.data_type(), part.data_type(), key.data_type()) {
+            (DataType::Utf8, DataType::Utf8, DataType::Utf8) => {
+                process_parse_url::<_, _, _, StringArray>(
+                    as_string_array(url)?,
+                    as_string_array(part)?,
+                    as_string_array(key)?,
+                    handler_err,
+                    true,
+                )
+            }
+            (DataType::Utf8View, DataType::Utf8View, DataType::Utf8View) => {
+                process_parse_url::<_, _, _, StringViewArray>(
+                    as_string_view_array(url)?,
+                    as_string_view_array(part)?,
+                    as_string_view_array(key)?,
+                    handler_err,
+                    true,
+                )
+            }
+            (DataType::LargeUtf8, DataType::LargeUtf8, DataType::LargeUtf8) => {
+                process_parse_url::<_, _, _, LargeStringArray>(
+                    as_large_string_array(url)?,
+                    as_large_string_array(part)?,
+                    as_large_string_array(key)?,
+                    handler_err,
+                    true,
+                )
+            }
+            _ => exec_err!(
+                "`parse_url` expects STRING arguments, got ({}, {}, {})",
+                url.data_type(),
+                part.data_type(),
+                key.data_type()
+            ),
+        }
+    } else {
+        // The 'key' argument is omitted, assume all values are null
+        // Create 'null' string array for 'key' argument
+        let mut builder: GenericStringBuilder<i32> = GenericStringBuilder::new();
+        for _ in 0..args[0].len() {
+            builder.append_null();
+        }
+        let key = builder.finish();
+
+        match (url.data_type(), part.data_type()) {
+            (DataType::Utf8, DataType::Utf8) => {
+                process_parse_url::<_, _, _, StringArray>(
+                    as_string_array(url)?,
+                    as_string_array(part)?,
+                    &key,
+                    handler_err,
+                    false,
+                )
+            }
+            (DataType::Utf8View, DataType::Utf8View) => {
+                process_parse_url::<_, _, _, StringViewArray>(
+                    as_string_view_array(url)?,
+                    as_string_view_array(part)?,
+                    &key,
+                    handler_err,
+                    false,
+                )
+            }
+            (DataType::LargeUtf8, DataType::LargeUtf8) => {
+                process_parse_url::<_, _, _, LargeStringArray>(
+                    as_large_string_array(url)?,
+                    as_large_string_array(part)?,
+                    &key,
+                    handler_err,
+                    false,
+                )
+            }
+            _ => exec_err!(
+                "`parse_url` expects STRING arguments, got ({}, {})",
+                url.data_type(),
+                part.data_type()
+            ),
+        }
+    }
+}
+
+fn process_parse_url<'a, A, B, C, T>(
+    url_array: &'a A,
+    part_array: &'a B,
+    key_array: &'a C,
+    handle: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+    has_key_arg: bool,
+) -> Result<ArrayRef>
+where
+    &'a A: StringArrayType<'a>,
+    &'a B: StringArrayType<'a>,
+    &'a C: StringArrayType<'a>,
+    T: Array + FromIterator<Option<String>> + 'static,
+{
+    url_array
+        .iter()
+        .zip(part_array.iter())
+        .zip(key_array.iter())
+        .map(|((url, part), key)| {
+            // Spark returns NULL when the third argument is explicitly NULL
+            if has_key_arg && key.is_none() {
+                return Ok(None);
+            }
+            if let (Some(url), Some(part)) = (url, part) {
+                handle(ParseUrl::parse(url, part, key))
+            } else {
+                Ok(None)
+            }
+        })
+        .collect::<Result<T>>()
+        .map(|array| Arc::new(array) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{ArrayRef, Int32Array, StringArray};
+    use datafusion_common::Result;
+    use std::array::from_ref;
+    use std::sync::Arc;
+
+    fn sa(vals: &[Option<&str>]) -> ArrayRef {
+        Arc::new(StringArray::from(vals.to_vec())) as ArrayRef
+    }
+
+    #[test]
+    fn test_parse_host() -> Result<()> {
+        let got = ParseUrl::parse("https://example.com/a?x=1", "HOST", None)?;
+        assert_eq!(got, Some("example.com".to_string()));
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_query_no_key_vs_with_key() -> Result<()> {
+        let got_all = ParseUrl::parse("https://ex.com/p?a=1&b=2", "QUERY", None)?;
+        assert_eq!(got_all, Some("a=1&b=2".to_string()));
+
+        let got_a = ParseUrl::parse("https://ex.com/p?a=1&b=2", "QUERY", Some("a"))?;
+        assert_eq!(got_a, Some("1".to_string()));
+
+        let got_c = ParseUrl::parse("https://ex.com/p?a=1&b=2", "QUERY", Some("c"))?;
+        assert_eq!(got_c, None);
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_ref_protocol_userinfo_file_authority() -> Result<()> {
+        let url = "ftp://user:pwd@ftp.example.com:21/files?x=1#frag";
+        assert_eq!(ParseUrl::parse(url, "REF", None)?, Some("frag".to_string()));
+        assert_eq!(
+            ParseUrl::parse(url, "PROTOCOL", None)?,
+            Some("ftp".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse(url, "USERINFO", None)?,
+            Some("user:pwd".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse(url, "FILE", None)?,
+            Some("/files?x=1".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse(url, "AUTHORITY", None)?,
+            Some("user:pwd@ftp.example.com".to_string())
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_path_root_is_empty_string() -> Result<()> {
+        let got = ParseUrl::parse("https://example.com/", "PATH", None)?;
+        assert_eq!(got, Some("".to_string()));
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_schemeless_url() -> Result<()> {
+        // Spark's java.net.URI treats schemeless strings as relative URIs.
+        // Simple schemeless string: no query, no fragment.
+        assert_eq!(
+            ParseUrl::parse("notaurl", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl", "FILE", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "AUTHORITY", None)?, None);
+        assert_eq!(ParseUrl::parse("notaurl", "USERINFO", None)?, None);
+
+        // Schemeless URL with query string
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "FILE", None)?,
+            Some("notaurl?key=value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", None)?,
+            Some("key=value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", Some("key"))?,
+            Some("value".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "QUERY", Some("missing"))?,
+            None
+        );
+        assert_eq!(ParseUrl::parse("notaurl?key=value", "HOST", None)?, None);
+        assert_eq!(
+            ParseUrl::parse("notaurl?key=value", "PROTOCOL", None)?,
+            None
+        );
+
+        // Schemeless URL with fragment
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "REF", None)?,
+            Some("reference".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl#reference", "FILE", None)?,
+            Some("notaurl".to_string())
+        );
+
+        // Schemeless URL with both query and fragment
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "PATH", None)?,
+            Some("notaurl".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", None)?,
+            Some("a=1&b=2".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", Some("b"))?,
+            Some("2".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "REF", None)?,
+            Some("frag".to_string())
+        );
+        assert_eq!(
+            ParseUrl::parse("notaurl?a=1&b=2#frag", "FILE", None)?,
+            Some("notaurl?a=1&b=2".to_string())
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_utf8_two_args() -> Result<()> {
+        let urls = sa(&[Some("https://example.com/a?x=1"), Some("https://ex.com/")]);
+        let parts = sa(&[Some("HOST"), Some("PATH")]);
+
+        let out = spark_handled_parse_url(&[urls, parts], |x| x)?;
+        let out_sa = out.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(out_sa.len(), 2);
+        assert_eq!(out_sa.value(0), "example.com");
+        assert_eq!(out_sa.value(1), "");
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_utf8_three_args_query_key() -> Result<()> {
+        let urls = sa(&[
+            Some("https://example.com/a?x=1&y=2"),
+            Some("https://ex.com/?a=1"),
+        ]);
+        let parts = sa(&[Some("QUERY"), Some("QUERY")]);
+        let keys = sa(&[Some("y"), Some("b")]);
+
+        let out = spark_handled_parse_url(&[urls, parts, keys], |x| x)?;
+        let out_sa = out.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(out_sa.len(), 2);
+        assert_eq!(out_sa.value(0), "2");
+        assert!(out_sa.is_null(1));
+        Ok(())
+    }
+
+    #[test]
+    fn test_spark_userinfo_and_nulls() -> Result<()> {
+        let urls = sa(&[
+            Some("ftp://user:pwd@ftp.example.com:21/files"),
+            Some("https://example.com"),
+            None,
+        ]);
+        let parts = sa(&[Some("USERINFO"), Some("USERINFO"), Some("USERINFO")]);
+
+        let out = spark_handled_parse_url(&[urls, parts], |x| x)?;
+        let out_sa = out.as_any().downcast_ref::<StringArray>().unwrap();
+
+        assert_eq!(out_sa.len(), 3);
+        assert_eq!(out_sa.value(0), "user:pwd");
+        assert!(out_sa.is_null(1));
+        assert!(out_sa.is_null(2));
+        Ok(())
+    }
+
+    #[test]
+    fn test_invalid_arg_count() {
+        let urls = sa(&[Some("https://example.com")]);
+        let err = spark_handled_parse_url(from_ref(&urls), |x| x).unwrap_err();
+        assert!(format!("{err}").contains("expects 2 or 3 arguments"));
+
+        let parts = sa(&[Some("HOST")]);
+        let keys = sa(&[Some("x")]);
+        let err =
+            spark_handled_parse_url(&[urls, parts, keys, sa(&[Some("extra")])], |x| x)
+                .unwrap_err();
+        assert!(format!("{err}").contains("expects 2 or 3 arguments"));
+    }
+
+    #[test]
+    fn test_non_string_types_error() {
+        let urls = sa(&[Some("https://example.com")]);
+        let bad_part = Arc::new(Int32Array::from(vec![1])) as ArrayRef;
+
+        let err = spark_handled_parse_url(&[urls, bad_part], |x| x).unwrap_err();
+        let msg = format!("{err}");
+        assert!(msg.contains("expects STRING arguments"));
+    }
+}
diff --git a/datafusion/spark/src/function/url/try_parse_url.rs b/datafusion/spark/src/function/url/try_parse_url.rs
new file mode 100644
index 0000000000000..4f6c5bb940fec
--- /dev/null
+++ b/datafusion/spark/src/function/url/try_parse_url.rs
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use crate::function::url::parse_url::{ParseUrl, spark_handled_parse_url};
+use arrow::array::ArrayRef;
+use arrow::datatypes::DataType;
+use datafusion_common::Result;
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+/// TRY_PARSE_URL function for tolerant URL component extraction (never errors; returns NULL on invalid or missing parts).
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#try_parse_url>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct TryParseUrl {
+    signature: Signature,
+}
+
+impl Default for TryParseUrl {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TryParseUrl {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![TypeSignature::String(2), TypeSignature::String(3)],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for TryParseUrl {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "try_parse_url"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let parse_url: ParseUrl = ParseUrl::new();
+        parse_url.return_type(arg_types)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(spark_try_parse_url, vec![])(&args)
+    }
+}
+
+fn spark_try_parse_url(args: &[ArrayRef]) -> Result<ArrayRef> {
+    spark_handled_parse_url(args, |x| match x {
+        Err(_) => Ok(None),
+        result => result,
+    })
+}
diff --git a/datafusion/spark/src/function/url/try_url_decode.rs b/datafusion/spark/src/function/url/try_url_decode.rs
new file mode 100644
index 0000000000000..58013236d5ec9
--- /dev/null
+++ b/datafusion/spark/src/function/url/try_url_decode.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::DataType;
+
+use datafusion_common::Result;
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+use crate::function::url::url_decode::{UrlDecode, spark_handled_url_decode};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct TryUrlDecode {
+    signature: Signature,
+    url_decoder: UrlDecode,
+}
+
+impl Default for TryUrlDecode {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TryUrlDecode {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::string(1, Volatility::Immutable),
+            url_decoder: UrlDecode::new(),
+        }
+    }
+}
+
+impl ScalarUDFImpl for TryUrlDecode {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "try_url_decode"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        self.url_decoder.return_type(arg_types)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(spark_try_url_decode, vec![])(&args)
+    }
+}
+
+fn spark_try_url_decode(args: &[ArrayRef]) -> Result<ArrayRef> {
+    spark_handled_url_decode(args, |x| match x {
+        Err(_) => Ok(None),
+        result => result,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::StringArray;
+    use datafusion_common::{Result, cast::as_string_array};
+
+    use super::*;
+
+    #[test]
+    fn test_try_decode_error_handled() -> Result<()> {
+        let input = Arc::new(StringArray::from(vec![
+            Some("http%3A%2F%2spark.apache.org"), // '%2s' is not a valid percent encoded character
+            // Valid cases
+            Some("https%3A%2F%2Fspark.apache.org"),
+            None,
+        ]));
+
+        let expected =
+            StringArray::from(vec![None, Some("https://spark.apache.org"), None]);
+
+        let result = spark_try_url_decode(&[input as ArrayRef])?;
+        let result = as_string_array(&result)?;
+
+        assert_eq!(&expected, result);
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/url/url_decode.rs b/datafusion/spark/src/function/url/url_decode.rs
new file mode 100644
index 0000000000000..e4a9cf6acd3e7
--- /dev/null
+++ b/datafusion/spark/src/function/url/url_decode.rs
@@ -0,0 +1,261 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::borrow::Cow;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{Result, exec_datafusion_err, exec_err, plan_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use percent_encoding::percent_decode;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct UrlDecode {
+    signature: Signature,
+}
+
+impl Default for UrlDecode {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl UrlDecode {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::string(1, Volatility::Immutable),
+        }
+    }
+
+    /// Decodes a URL-encoded string from application/x-www-form-urlencoded format.
+    /// Although the `url::form_urlencoded` support decoding, it does not return error when the string is malformed
+    ///     For example: "%2s" is not a valid percent-encoding, the `decode` function from `url::form_urlencoded`
+    ///                  will ignore this instead of return error
+    /// This function reproduce the same decoding process, plus an extra validation step
+    /// See <https://github.com/servo/rust-url/blob/b06048d70d4cc9cf4ffb277f06cfcebd53b2141e/form_urlencoded/src/lib.rs#L70-L76>
+    ///
+    /// # Arguments
+    ///
+    /// * `value` - The URL-encoded string to decode
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(String)` - The decoded string
+    /// * `Err(DataFusionError)` - If the input is malformed or contains invalid UTF-8
+    ///
+    fn decode(value: &str) -> Result<String> {
+        // Check if the string has valid percent encoding
+        Self::validate_percent_encoding(value)?;
+
+        let replaced = Self::replace_plus(value.as_bytes());
+        percent_decode(&replaced)
+            .decode_utf8()
+            .map_err(|e| exec_datafusion_err!("Invalid UTF-8 sequence: {e}"))
+            .map(|parsed| parsed.into_owned())
+    }
+
+    /// Replace b'+' with b' '
+    /// See: <https://github.com/servo/rust-url/blob/dbd526178ed9276176602dd039022eba89e8fc93/form_urlencoded/src/lib.rs#L79-L93>
+    fn replace_plus(input: &[u8]) -> Cow<'_, [u8]> {
+        match input.iter().position(|&b| b == b'+') {
+            None => Cow::Borrowed(input),
+            Some(first_position) => {
+                let mut replaced = input.to_owned();
+                replaced[first_position] = b' ';
+                for byte in &mut replaced[first_position + 1..] {
+                    if *byte == b'+' {
+                        *byte = b' ';
+                    }
+                }
+                Cow::Owned(replaced)
+            }
+        }
+    }
+
+    /// Validate percent-encoding of the string
+    fn validate_percent_encoding(value: &str) -> Result<()> {
+        let bytes = value.as_bytes();
+        let mut i = 0;
+
+        while i < bytes.len() {
+            if bytes[i] == b'%' {
+                // Check if we have at least 2 more characters
+                if i + 2 >= bytes.len() {
+                    return exec_err!(
+                        "Invalid percent-encoding: incomplete sequence at position {}",
+                        i
+                    );
+                }
+
+                let hex1 = bytes[i + 1];
+                let hex2 = bytes[i + 2];
+
+                if !hex1.is_ascii_hexdigit() || !hex2.is_ascii_hexdigit() {
+                    return exec_err!(
+                        "Invalid percent-encoding: invalid hex sequence '%{}{}' at position {}",
+                        hex1 as char,
+                        hex2 as char,
+                        i
+                    );
+                }
+                i += 3;
+            } else {
+                i += 1;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl ScalarUDFImpl for UrlDecode {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "url_decode"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.len() != 1 {
+            return plan_err!(
+                "{} expects 1 argument, but got {}",
+                self.name(),
+                arg_types.len()
+            );
+        }
+        // As the type signature is already checked, we can safely return the type of the first argument
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(spark_url_decode, vec![])(&args)
+    }
+}
+
+/// Core implementation of URL decoding function.
+///
+/// # Arguments
+///
+/// * `args` - A slice containing exactly one ArrayRef with the URL-encoded strings to decode
+///
+/// # Returns
+///
+/// * `Ok(ArrayRef)` - A new array of the same type containing decoded strings
+/// * `Err(DataFusionError)` - If validation fails or invalid arguments are provided
+///
+fn spark_url_decode(args: &[ArrayRef]) -> Result<ArrayRef> {
+    spark_handled_url_decode(args, |x| x)
+}
+
+pub fn spark_handled_url_decode(
+    args: &[ArrayRef],
+    err_handle_fn: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+) -> Result<ArrayRef> {
+    if args.len() != 1 {
+        return exec_err!("`url_decode` expects 1 argument");
+    }
+
+    match &args[0].data_type() {
+        DataType::Utf8 => as_string_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlDecode::decode).transpose())
+            .map(&err_handle_fn)
+            .collect::<Result<StringArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        DataType::LargeUtf8 => as_large_string_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlDecode::decode).transpose())
+            .map(&err_handle_fn)
+            .collect::<Result<LargeStringArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        DataType::Utf8View => as_string_view_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlDecode::decode).transpose())
+            .map(&err_handle_fn)
+            .collect::<Result<StringViewArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        other => exec_err!("`url_decode`: Expr must be STRING, got {other:?}"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::StringArray;
+    use datafusion_common::Result;
+
+    use super::*;
+
+    #[test]
+    fn test_decode() -> Result<()> {
+        let input = Arc::new(StringArray::from(vec![
+            Some("https%3A%2F%2Fspark.apache.org"),
+            Some("inva+lid://user:pass@host/file\\;param?query\\;p2"),
+            Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+            Some("%7E%21%40%23%24%25%5E%26%2A%28%29%5F%2B"),
+            Some("%E4%BD%A0%E5%A5%BD"),
+            Some(""),
+            None,
+        ]));
+        let expected = StringArray::from(vec![
+            Some("https://spark.apache.org"),
+            Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+            Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+            Some("~!@#$%^&*()_+"),
+            Some("你好"),
+            Some(""),
+            None,
+        ]);
+
+        let result = spark_url_decode(&[input as ArrayRef])?;
+        let result = as_string_array(&result)?;
+
+        assert_eq!(&expected, result);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_decode_error() -> Result<()> {
+        let input = Arc::new(StringArray::from(vec![
+            Some("http%3A%2F%2spark.apache.org"), // '%2s' is not a valid percent encoded character
+            // Valid cases
+            Some("https%3A%2F%2Fspark.apache.org"),
+            None,
+        ]));
+
+        let result = spark_url_decode(&[input]);
+        assert!(
+            result.is_err_and(|e| e.to_string().contains("Invalid percent-encoding"))
+        );
+
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/url/url_encode.rs b/datafusion/spark/src/function/url/url_encode.rs
new file mode 100644
index 0000000000000..7292eb530a6ae
--- /dev/null
+++ b/datafusion/spark/src/function/url/url_encode.rs
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+    as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{Result, exec_err, plan_err};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use url::form_urlencoded::byte_serialize;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct UrlEncode {
+    signature: Signature,
+}
+
+impl Default for UrlEncode {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl UrlEncode {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::string(1, Volatility::Immutable),
+        }
+    }
+
+    /// Encode a string to application/x-www-form-urlencoded format.
+    ///
+    /// # Arguments
+    ///
+    /// * `value` - The string to encode
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(String)` - The encoded string
+    ///
+    fn encode(value: &str) -> Result<String> {
+        Ok(byte_serialize(value.as_bytes()).collect::<String>())
+    }
+}
+
+impl ScalarUDFImpl for UrlEncode {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "url_encode"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.len() != 1 {
+            return plan_err!(
+                "{} expects 1 argument, but got {}",
+                self.name(),
+                arg_types.len()
+            );
+        }
+        // As the type signature is already checked, we can safely return the type of the first argument
+        Ok(arg_types[0].clone())
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let ScalarFunctionArgs { args, .. } = args;
+        make_scalar_function(spark_url_encode, vec![])(&args)
+    }
+}
+
+/// Core implementation of URL encoding function.
+///
+/// # Arguments
+///
+/// * `args` - A slice containing exactly one ArrayRef with the strings to encode
+///
+/// # Returns
+///
+/// * `Ok(ArrayRef)` - A new array of the same type containing encoded strings
+/// * `Err(DataFusionError)` - If invalid arguments are provided
+///
+fn spark_url_encode(args: &[ArrayRef]) -> Result<ArrayRef> {
+    if args.len() != 1 {
+        return exec_err!("`url_encode` expects 1 argument");
+    }
+
+    match &args[0].data_type() {
+        DataType::Utf8 => as_string_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlEncode::encode).transpose())
+            .collect::<Result<StringArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        DataType::LargeUtf8 => as_large_string_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlEncode::encode).transpose())
+            .collect::<Result<LargeStringArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        DataType::Utf8View => as_string_view_array(&args[0])?
+            .iter()
+            .map(|x| x.map(UrlEncode::encode).transpose())
+            .collect::<Result<StringViewArray>>()
+            .map(|array| Arc::new(array) as ArrayRef),
+        other => exec_err!("`url_encode`: Expr must be STRING, got {other:?}"),
+    }
+}
diff --git a/datafusion/spark/src/function/utils.rs b/datafusion/spark/src/function/utils.rs
index 85af4bb927ca5..e272d91d8a70e 100644
--- a/datafusion/spark/src/function/utils.rs
+++ b/datafusion/spark/src/function/utils.rs
@@ -23,8 +23,9 @@ pub mod test {
     /// $EXPECTED_TYPE is the expected value type
     /// $EXPECTED_DATA_TYPE is the expected result type
     /// $ARRAY_TYPE is the column type after function applied
+    /// $CONFIG_OPTIONS config options to pass to function
     macro_rules! test_scalar_function {
-        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
+        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident, $CONFIG_OPTIONS:expr) => {
             let expected: datafusion_common::Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
             let func = $FUNC;
 
@@ -33,13 +34,13 @@ pub mod test {
                 .enumerate()
                 .map(|(idx, arg)| {
 
-                let nullable = match arg {
-                    datafusion_expr::ColumnarValue::Scalar(scalar) => scalar.is_null(),
-                    datafusion_expr::ColumnarValue::Array(a) => a.null_count() > 0,
-                };
+                    let nullable = match arg {
+                        datafusion_expr::ColumnarValue::Scalar(scalar) => scalar.is_null(),
+                        datafusion_expr::ColumnarValue::Array(a) => a.null_count() > 0,
+                    };
 
                 std::sync::Arc::new(arrow::datatypes::Field::new(format!("arg_{idx}"), arg.data_type(), nullable))
-            })
+                })
                 .collect::<Vec<_>>();
 
             let cardinality = $ARGS
@@ -51,8 +52,8 @@ pub mod test {
                 .unwrap_or(1);
 
             let scalar_arguments = $ARGS.iter().map(|arg| match arg {
-                datafusion_expr::ColumnarValue::Scalar(scalar) => Some(scalar.clone()),
-                datafusion_expr::ColumnarValue::Array(_) => None,
+                    datafusion_expr::ColumnarValue::Scalar(scalar) => Some(scalar.clone()),
+                    datafusion_expr::ColumnarValue::Array(_) => None,
             }).collect::<Vec<_>>();
             let scalar_arguments_refs = scalar_arguments.iter().map(|arg| arg.as_ref()).collect::<Vec<_>>();
 
@@ -64,43 +65,58 @@ pub mod test {
 
             match expected {
                 Ok(expected) => {
-                    let return_field = return_field.unwrap();
-                    assert_eq!(return_field.data_type(), &$EXPECTED_DATA_TYPE);
+                    if let Ok(return_field) = return_field {
+                        assert_eq!(return_field.data_type(), &$EXPECTED_DATA_TYPE);
 
-                    let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
-                        args: $ARGS,
-                        number_rows: cardinality,
-                        return_field,
-                        arg_fields: arg_fields.clone(),
-                    });
-                    assert_eq!(result.is_ok(), true, "function returned an error: {}", result.unwrap_err());
-
-                    let result = result.unwrap().to_array(cardinality).expect("Failed to convert to array");
-                    let result = result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to type");
-                    assert_eq!(result.data_type(), &$EXPECTED_DATA_TYPE);
+                        match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
+                            args: $ARGS,
+                            number_rows: cardinality,
+                            return_field,
+                            arg_fields: arg_fields.clone(),
+                            config_options: $CONFIG_OPTIONS,
+                        }) {
+                            Ok(col_value) => {
+                                match col_value.to_array(cardinality) {
+                                    Ok(array) => {
+                                        let result = array
+                                            .as_any()
+                                            .downcast_ref::<$ARRAY_TYPE>()
+                                            .expect("Failed to convert to type");
+                                        assert_eq!(result.data_type(), &$EXPECTED_DATA_TYPE);
 
-                    // value is correct
-                    match expected {
-                        Some(v) => assert_eq!(result.value(0), v),
-                        None => assert!(result.is_null(0)),
-                    };
-                }
-                Err(expected_error) => {
-                    if return_field.is_err() {
-                        match return_field {
-                            Ok(_) => assert!(false, "expected error"),
-                            Err(error) => { datafusion_common::assert_contains!(expected_error.strip_backtrace(), error.strip_backtrace()); }
+                                       // value is correct
+                                        match expected {
+                                            Some(v) => assert_eq!(result.value(0), v),
+                                            None => assert!(result.is_null(0)),
+                                        };
+                                    }
+                                    Err(err) => {
+                                        panic!("Failed to convert to array: {err}");
+                                    }
+                                }
+                            }
+                            Err(err) => {
+                                panic!("function returned an error: {err}");
+                            }
                         }
+                    } else {
+                        panic!("Expected return_field to be Ok but got Err");
                     }
-                    else {
-                        let return_field = return_field.unwrap();
-
+                }
+                Err(expected_error) => {
+                    if let Err(error) = &return_field {
+                        datafusion_common::assert_contains!(
+                            expected_error.strip_backtrace(),
+                            error.strip_backtrace()
+                        );
+                    } else if let Ok(value) = return_field {
                         // invoke is expected error - cannot use .expect_err() due to Debug not being implemented
-                        match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
+                        match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs {
                             args: $ARGS,
                             number_rows: cardinality,
-                            return_field,
+                            return_field: value,
                             arg_fields,
+                            config_options: $CONFIG_OPTIONS,
                         }) {
                             Ok(_) => assert!(false, "expected error"),
                             Err(error) => {
@@ -111,6 +127,18 @@ pub mod test {
                 }
             };
         };
+
+        ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
+            test_scalar_function!(
+                $FUNC,
+                $ARGS,
+                $EXPECTED,
+                $EXPECTED_TYPE,
+                $EXPECTED_DATA_TYPE,
+                $ARRAY_TYPE,
+                std::sync::Arc::new(datafusion_common::config::ConfigOptions::default())
+            )
+        };
     }
 
     pub(crate) use test_scalar_function;
diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs
index 1fe5b6ecac8f4..9575f560b8d0e 100644
--- a/datafusion/spark/src/lib.rs
+++ b/datafusion/spark/src/lib.rs
@@ -19,29 +19,121 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Spark Expression packages for [DataFusion].
 //!
-//! This crate contains a collection of various Spark expression packages for DataFusion,
+//! This crate contains a collection of various Spark function packages for DataFusion,
 //! implemented using the extension API.
 //!
 //! [DataFusion]: https://crates.io/crates/datafusion
 //!
-//! # Available Packages
+//!
+//! # Available Function Packages
 //! See the list of [modules](#modules) in this crate for available packages.
 //!
-//! # Using A Package
-//! You can register all functions in all packages using the [`register_all`] function.
+//! # Example: using all function packages
+//!
+//! You can register all the functions in all packages using the [`register_all`]
+//! function as shown below. Any existing functions will be overwritten, with these
+//! Spark functions taking priority.
+//!
+//! ```
+//! # use datafusion_execution::FunctionRegistry;
+//! # use datafusion_expr::{ScalarUDF, AggregateUDF, WindowUDF};
+//! # use datafusion_expr::planner::ExprPlanner;
+//! # use datafusion_common::Result;
+//! # use std::collections::HashSet;
+//! # use std::sync::Arc;
+//! # // Note: We can't use a real SessionContext here because the
+//! # // `datafusion_spark` crate has no dependence on the DataFusion crate
+//! # // thus use a dummy SessionContext that has enough of the implementation
+//! # struct SessionContext {}
+//! # impl FunctionRegistry for SessionContext {
+//! #    fn register_udf(&mut self, _udf: Arc<ScalarUDF>) -> Result<Option<Arc<ScalarUDF>>> { Ok (None) }
+//! #    fn udfs(&self) -> HashSet<String> { unimplemented!() }
+//! #    fn udafs(&self) -> HashSet<String> { unimplemented!() }
+//! #    fn udwfs(&self) -> HashSet<String> { unimplemented!() }
+//! #    fn udf(&self, _name: &str) -> Result<Arc<ScalarUDF>> { unimplemented!() }
+//! #    fn udaf(&self, name: &str) -> Result<Arc<AggregateUDF>> {unimplemented!() }
+//! #    fn udwf(&self, name: &str) -> Result<Arc<WindowUDF>> { unimplemented!() }
+//! #    fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> { unimplemented!() }
+//! # }
+//! # impl SessionContext {
+//! #   fn new() -> Self { SessionContext {} }
+//! #   async fn sql(&mut self, _query: &str) -> Result<()> { Ok(()) }
+//! #  }
+//! #
+//! # async fn stub() -> Result<()> {
+//! // Create a new session context
+//! let mut ctx = SessionContext::new();
+//! // Register all Spark functions with the context
+//! datafusion_spark::register_all(&mut ctx)?;
+//! // Run a query using the `sha2` function which is now available and has Spark semantics
+//! let df = ctx.sql("SELECT sha2('The input String', 256)").await?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: calling a specific function in Rust
+//!
+//! Each package also exports an `expr_fn` submodule that create [`Expr`]s for
+//! invoking functions via rust using a fluent style. For example, to invoke the
+//! `sha2` function, you can use the following code:
+//!
+//! ```rust
+//! # use datafusion_expr::{col, lit};
+//! use datafusion_spark::expr_fn::sha2;
+//! // Create the expression `sha2(my_data, 256)`
+//! let expr = sha2(col("my_data"), lit(256));
+//! ```
+//!
+//! # Example: using the Spark expression planner
+//!
+//! The [`planner::SparkFunctionPlanner`] provides Spark-compatible expression
+//! planning, such as mapping SQL `EXTRACT` expressions to Spark's `date_part`
+//! function. To use it, register it with your session context:
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//! use datafusion::prelude::SessionContext;
+//! use datafusion_spark::planner::SparkFunctionPlanner;
 //!
-//! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke
-//! functions using a fluent style. For example:
+//! let mut ctx = SessionContext::new();
+//! // Register the Spark expression planner
+//! ctx.register_expr_planner(Arc::new(SparkFunctionPlanner))?;
+//! // Now EXTRACT expressions will use Spark semantics
+//! let df = ctx.sql("SELECT EXTRACT(YEAR FROM timestamp_col) FROM my_table").await?;
+//! ```
 //!
 //![`Expr`]: datafusion_expr::Expr
+//!
+//! # Example: enabling Apache Spark features with SessionStateBuilder
+//!
+//! The recommended way to enable Apache Spark compatibility is to use the
+//! `SessionStateBuilderSpark` extension trait. This registers all
+//! Apache Spark functions (scalar, aggregate, window, and table) as well as the Apache Spark
+//! expression planner.
+//!
+//! Enable the `core` feature in your `Cargo.toml`:
+//! ```toml
+//! datafusion-spark = { version = "X", features = ["core"] }
+//! ```
+//!
+//! Then use the extension trait - see [`SessionStateBuilderSpark::with_spark_features`]
+//! for an example.
 
 pub mod function;
+pub mod planner;
+
+#[cfg(feature = "core")]
+mod session_state;
+
+#[cfg(feature = "core")]
+pub use session_state::SessionStateBuilderSpark;
 
 use datafusion_catalog::TableFunction;
 use datafusion_common::Result;
@@ -51,10 +143,11 @@ use log::debug;
 use std::sync::Arc;
 
 /// Fluent-style API for creating `Expr`s
-#[allow(unused)]
+#[expect(unused_imports)]
 pub mod expr_fn {
     pub use super::function::aggregate::expr_fn::*;
     pub use super::function::array::expr_fn::*;
+    pub use super::function::bitmap::expr_fn::*;
     pub use super::function::bitwise::expr_fn::*;
     pub use super::function::collection::expr_fn::*;
     pub use super::function::conditional::expr_fn::*;
@@ -69,8 +162,8 @@ pub mod expr_fn {
     pub use super::function::math::expr_fn::*;
     pub use super::function::misc::expr_fn::*;
     pub use super::function::predicate::expr_fn::*;
-    pub use super::function::r#struct::expr_fn::*;
     pub use super::function::string::expr_fn::*;
+    pub use super::function::r#struct::expr_fn::*;
     pub use super::function::table::expr_fn::*;
     pub use super::function::url::expr_fn::*;
     pub use super::function::window::expr_fn::*;
@@ -81,6 +174,7 @@ pub mod expr_fn {
 pub fn all_default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
     function::array::functions()
         .into_iter()
+        .chain(function::bitmap::functions())
         .chain(function::bitwise::functions())
         .chain(function::collection::functions())
         .chain(function::conditional::functions())
@@ -117,7 +211,8 @@ pub fn all_default_table_functions() -> Vec<Arc<TableFunction>> {
     function::table::functions()
 }
 
-/// Registers all enabled packages with a [`FunctionRegistry`]
+/// Registers all enabled packages with a [`FunctionRegistry`], overriding any existing
+/// functions if there is a name clash.
 pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
     let scalar_functions: Vec<Arc<ScalarUDF>> = all_default_scalar_functions();
     scalar_functions.into_iter().try_for_each(|udf| {
diff --git a/datafusion/spark/src/planner.rs b/datafusion/spark/src/planner.rs
new file mode 100644
index 0000000000000..2dafbb1f9a570
--- /dev/null
+++ b/datafusion/spark/src/planner.rs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_expr::Expr;
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::planner::{ExprPlanner, PlannerResult};
+
+#[derive(Default, Debug)]
+pub struct SparkFunctionPlanner;
+
+impl ExprPlanner for SparkFunctionPlanner {
+    fn plan_extract(
+        &self,
+        args: Vec<Expr>,
+    ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+        Ok(PlannerResult::Planned(Expr::ScalarFunction(
+            ScalarFunction::new_udf(crate::function::datetime::date_part(), args),
+        )))
+    }
+
+    fn plan_substring(
+        &self,
+        args: Vec<Expr>,
+    ) -> datafusion_common::Result<PlannerResult<Vec<Expr>>> {
+        Ok(PlannerResult::Planned(Expr::ScalarFunction(
+            ScalarFunction::new_udf(crate::function::string::substring(), args),
+        )))
+    }
+}
diff --git a/datafusion/spark/src/session_state.rs b/datafusion/spark/src/session_state.rs
new file mode 100644
index 0000000000000..e39de3a5888ea
--- /dev/null
+++ b/datafusion/spark/src/session_state.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use datafusion::execution::SessionStateBuilder;
+
+use crate::planner::SparkFunctionPlanner;
+use crate::{
+    all_default_aggregate_functions, all_default_scalar_functions,
+    all_default_table_functions, all_default_window_functions,
+};
+
+/// Extension trait for adding Apache Spark features to [`SessionStateBuilder`].
+///
+/// This trait provides a convenient way to register all Apache Spark-compatible
+/// functions and planners with a DataFusion session.
+///
+/// # Example
+///
+/// ```rust
+/// use datafusion::execution::SessionStateBuilder;
+/// use datafusion_spark::SessionStateBuilderSpark;
+///
+/// // Create a SessionState with Apache Spark features enabled
+/// // note: the order matters here, `with_spark_features` should be
+/// // called after `with_default_features` to overwrite any existing functions
+/// let state = SessionStateBuilder::new()
+///     .with_default_features()
+///     .with_spark_features()
+///     .build();
+/// ```
+pub trait SessionStateBuilderSpark {
+    /// Adds all expr_planners, scalar, aggregate, window and table functions
+    /// compatible with Apache Spark.
+    ///
+    /// Note: This overwrites any previously registered items with the same name.
+    fn with_spark_features(self) -> Self;
+}
+
+impl SessionStateBuilderSpark for SessionStateBuilder {
+    fn with_spark_features(mut self) -> Self {
+        self.expr_planners()
+            .get_or_insert_with(Vec::new)
+            // planners are evaluated in order of insertion. Push Apache Spark function planner to the front
+            // to take precedence over others
+            .insert(0, Arc::new(SparkFunctionPlanner));
+
+        self.scalar_functions()
+            .get_or_insert_with(Vec::new)
+            .extend(all_default_scalar_functions());
+
+        self.aggregate_functions()
+            .get_or_insert_with(Vec::new)
+            .extend(all_default_aggregate_functions());
+
+        self.window_functions()
+            .get_or_insert_with(Vec::new)
+            .extend(all_default_window_functions());
+
+        self.table_functions()
+            .get_or_insert_with(HashMap::new)
+            .extend(
+                all_default_table_functions()
+                    .into_iter()
+                    .map(|f| (f.name().to_string(), f)),
+            );
+
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_session_state_with_spark_features() {
+        let state = SessionStateBuilder::new().with_spark_features().build();
+
+        assert!(
+            state.scalar_functions().contains_key("sha2"),
+            "Apache Spark scalar function 'sha2' should be registered"
+        );
+
+        assert!(
+            state.aggregate_functions().contains_key("try_sum"),
+            "Apache Spark aggregate function 'try_sum' should be registered"
+        );
+
+        assert!(
+            !state.expr_planners().is_empty(),
+            "Apache Spark expr planners should be registered"
+        );
+    }
+}
diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml
index b778db46769d0..cc299ce507099 100644
--- a/datafusion/sql/Cargo.toml
+++ b/datafusion/sql/Cargo.toml
@@ -31,6 +31,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -43,11 +46,17 @@ unicode_expressions = []
 unparser = []
 recursive_protection = ["dep:recursive"]
 
+# Note the sql planner should not depend directly on the datafusion-function packages
+# so that it can be used in a standalone manner with other function implementations.
+#
+# They are used for testing purposes only, so they are in the dev-dependencies section.
 [dependencies]
 arrow = { workspace = true }
 bigdecimal = { workspace = true }
-datafusion-common = { workspace = true, default-features = true }
-datafusion-expr = { workspace = true }
+chrono = { workspace = true }
+datafusion-common = { workspace = true, features = ["sql"] }
+datafusion-expr = { workspace = true, features = ["sql"] }
+datafusion-functions-nested = { workspace = true, features = ["sql"] }
 indexmap = { workspace = true }
 log = { workspace = true }
 recursive = { workspace = true, optional = true }
@@ -56,11 +65,12 @@ sqlparser = { workspace = true }
 
 [dev-dependencies]
 ctor = { workspace = true }
+# please do not move these dependencies to the main dependencies section
 datafusion-functions = { workspace = true, default-features = true }
 datafusion-functions-aggregate = { workspace = true }
-datafusion-functions-nested = { workspace = true }
+datafusion-functions-nested = { workspace = true, features = ["sql"] }
 datafusion-functions-window = { workspace = true }
 env_logger = { workspace = true }
 insta = { workspace = true }
-paste = "^1.0"
+itertools = { workspace = true }
 rstest = { workspace = true }
diff --git a/datafusion/sql/README.md b/datafusion/sql/README.md
index 98f3c4faa2ec0..d0e5e498e514c 100644
--- a/datafusion/sql/README.md
+++ b/datafusion/sql/README.md
@@ -17,17 +17,24 @@
   under the License.
 -->
 
-# DataFusion SQL Query Planner
+# Apache DataFusion SQL Query Planner
 
 This crate provides a general purpose SQL query planner that can parse SQL and translate queries into logical
-plans. Although this crate is used by the [DataFusion][df] query engine, it was designed to be easily usable from any
+plans. Although this crate is used by the [Apache DataFusion] query engine, it was designed to be easily usable from any
 project that requires a SQL query planner and does not make any assumptions about how the resulting logical plan
 will be translated to a physical plan. For example, there is no concept of row-based versus columnar execution in the
 logical plan.
 
+Note that the [`datafusion`] crate re-exports this module. If you are already
+using the [`datafusion`] crate in your project, there is no reason to use this
+crate directly in your project as well.
+
+[apache datafusion]: https://datafusion.apache.org/
+[`datafusion`]: https://crates.io/crates/datafusion
+
 ## Example Usage
 
-See the [examples](examples) directory for fully working examples.
+See the [examples] directory for fully working examples.
 
 Here is an example of producing a logical plan from a SQL string.
 
@@ -62,8 +69,8 @@ fn main() {
 ```
 
 This is the logical plan that is produced from this example. Note that this is an **unoptimized**
-logical plan. The [datafusion-optimizer](https://crates.io/crates/datafusion-optimizer) crate provides a query
-optimizer that can be applied to plans produced by this crate.
+logical plan. The [datafusion-optimizer] crate provides a query optimizer that can be applied to
+plans produced by this crate.
 
 ```
 Sort: state_tax DESC NULLS FIRST
@@ -80,4 +87,5 @@ Sort: state_tax DESC NULLS FIRST
             TableScan: orders
 ```
 
-[df]: https://crates.io/crates/datafusion
+[examples]: examples
+[datafusion-optimizer]: https://crates.io/crates/datafusion-optimizer
diff --git a/datafusion/sql/examples/sql.rs b/datafusion/sql/examples/sql.rs
index 2c0bb86cd8087..dbedaf3f15b8d 100644
--- a/datafusion/sql/examples/sql.rs
+++ b/datafusion/sql/examples/sql.rs
@@ -20,11 +20,11 @@ use std::{collections::HashMap, sync::Arc};
 use arrow::datatypes::{DataType, Field, Schema};
 
 use datafusion_common::config::ConfigOptions;
-use datafusion_common::{plan_err, Result, TableReference};
-use datafusion_expr::planner::ExprPlanner;
+use datafusion_common::{Result, TableReference, plan_err};
 use datafusion_expr::WindowUDF;
+use datafusion_expr::planner::ExprPlanner;
 use datafusion_expr::{
-    logical_plan::builder::LogicalTableSource, AggregateUDF, ScalarUDF, TableSource,
+    AggregateUDF, ScalarUDF, TableSource, logical_plan::builder::LogicalTableSource,
 };
 use datafusion_functions::core::planner::CoreFunctionPlanner;
 use datafusion_functions_aggregate::count::count_udaf;
diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs
index 3650aea9c3c20..18766d7056355 100644
--- a/datafusion/sql/src/cte.rs
+++ b/datafusion/sql/src/cte.rs
@@ -19,11 +19,9 @@ use std::sync::Arc;
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 
-use arrow::datatypes::Schema;
 use datafusion_common::{
-    not_impl_err, plan_err,
+    Result, not_impl_err, plan_err,
     tree_node::{TreeNode, TreeNodeRecursion},
-    Result,
 };
 use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, TableSource};
 use sqlparser::ast::{Query, SetExpr, SetOperator, With};
@@ -47,7 +45,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
             // Create a logical plan for the CTE
             let cte_plan = if is_recursive {
-                self.recursive_cte(cte_name.clone(), *cte.query, planner_context)?
+                self.recursive_cte(&cte_name, *cte.query, planner_context)?
             } else {
                 self.non_recursive_cte(*cte.query, planner_context)?
             };
@@ -71,7 +69,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
     fn recursive_cte(
         &self,
-        cte_name: String,
+        cte_name: &str,
         mut cte_query: Query,
         planner_context: &mut PlannerContext,
     ) -> Result<LogicalPlan> {
@@ -93,7 +91,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             } => (left, right, set_quantifier),
             other => {
                 // If the query is not a UNION, then it is not a recursive CTE
-                cte_query.body = Box::new(other);
+                *cte_query.body = other;
                 return self.non_recursive_cte(cte_query, planner_context);
             }
         };
@@ -135,10 +133,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
         // ---------- Step 2: Create a temporary relation ------------------
         // Step 2.1: Create a table source for the temporary relation
-        let work_table_source = self.context_provider.create_cte_work_table(
-            &cte_name,
-            Arc::new(Schema::from(static_plan.schema().as_ref())),
-        )?;
+        let work_table_source = self
+            .context_provider
+            .create_cte_work_table(cte_name, Arc::clone(static_plan.schema().inner()))?;
 
         // Step 2.2: Create a temporary relation logical plan that will be used
         // as the input to the recursive term
@@ -149,14 +146,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         )?
         .build()?;
 
-        let name = cte_name.clone();
+        let name = cte_name.to_string();
 
         // Step 2.3: Register the temporary relation in the planning context
         // For all the self references in the variadic term, we'll replace it
         // with the temporary relation we created above by temporarily registering
         // it as a CTE. This temporary relation in the planning context will be
         // replaced by the actual CTE plan once we're done with the planning.
-        planner_context.insert_cte(cte_name.clone(), work_table_plan);
+        planner_context.insert_cte(cte_name.to_string(), work_table_plan);
 
         // ---------- Step 3: Compile the recursive term ------------------
         // this uses the named_relation we inserted above to resolve the
@@ -168,7 +165,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         // if not, it is a non-recursive CTE
         if !has_work_table_reference(&recursive_plan, &work_table_source) {
             // Remove the work table plan from the context
-            planner_context.remove_cte(&cte_name);
+            planner_context.remove_cte(cte_name);
             // Compile it as a non-recursive CTE
             return self.set_operation_to_plan(
                 SetOperator::Union,
@@ -193,11 +190,11 @@ fn has_work_table_reference(
 ) -> bool {
     let mut has_reference = false;
     plan.apply(|node| {
-        if let LogicalPlan::TableScan(scan) = node {
-            if Arc::ptr_eq(&scan.source, work_table_source) {
-                has_reference = true;
-                return Ok(TreeNodeRecursion::Stop);
-            }
+        if let LogicalPlan::TableScan(scan) = node
+            && Arc::ptr_eq(&scan.source, work_table_source)
+        {
+            has_reference = true;
+            return Ok(TreeNodeRecursion::Stop);
         }
         Ok(TreeNodeRecursion::Continue)
     })
diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs
index 1c06f5ee926f9..4e9025e02e0c7 100644
--- a/datafusion/sql/src/expr/binary_op.rs
+++ b/datafusion/sql/src/expr/binary_op.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use crate::planner::{ContextProvider, SqlToRel};
-use datafusion_common::{not_impl_err, Result};
+use datafusion_common::{Result, not_impl_err};
 use datafusion_expr::Operator;
 use sqlparser::ast::BinaryOperator;
 
 impl<S: ContextProvider> SqlToRel<'_, S> {
-    pub(crate) fn parse_sql_binary_op(&self, op: BinaryOperator) -> Result<Operator> {
+    pub(crate) fn parse_sql_binary_op(&self, op: &BinaryOperator) -> Result<Operator> {
         match op {
             BinaryOperator::Gt => Ok(Operator::Gt),
             BinaryOperator::GtEq => Ok(Operator::GtEq),
@@ -68,6 +68,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             BinaryOperator::Question => Ok(Operator::Question),
             BinaryOperator::QuestionAnd => Ok(Operator::QuestionAnd),
             BinaryOperator::QuestionPipe => Ok(Operator::QuestionPipe),
+            BinaryOperator::Custom(s) if s == ":" => Ok(Operator::Colon),
             _ => not_impl_err!("Unsupported binary operator: {:?}", op),
         }
     }
diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs
index 97ff7bf199040..3ec699ae57624 100644
--- a/datafusion/sql/src/expr/function.rs
+++ b/datafusion/sql/src/expr/function.rs
@@ -19,18 +19,20 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 
 use arrow::datatypes::DataType;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err,
-    DFSchema, Dependency, Diagnostic, Result, Span,
+    DFSchema, Dependency, Diagnostic, Result, Span, internal_datafusion_err,
+    internal_err, not_impl_err, plan_datafusion_err, plan_err,
 };
-use datafusion_expr::expr::{ScalarFunction, Unnest, WildcardOptions};
-use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr};
 use datafusion_expr::{
-    expr, Expr, ExprFunctionExt, ExprSchemable, WindowFrame, WindowFunctionDefinition,
+    Expr, ExprSchemable, SortExpr, WindowFrame, WindowFunctionDefinition,
+    arguments::ArgumentName,
+    expr,
+    expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction},
+    planner::{PlannerResult, RawAggregateExpr, RawWindowExpr},
 };
 use sqlparser::ast::{
     DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg,
     FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments,
-    NullTreatment, ObjectName, OrderByExpr, Spanned, WindowType,
+    ObjectName, OrderByExpr, Spanned, WindowType,
 };
 
 /// Suggest a valid function based on an invalid input function name
@@ -93,6 +95,8 @@ struct FunctionArgs {
     distinct: bool,
     /// WITHIN GROUP clause, if any
     within_group: Vec<OrderByExpr>,
+    /// Was the function called without parenthesis, i.e. could this also be a column reference?
+    function_without_parentheses: bool,
 }
 
 impl FunctionArgs {
@@ -115,9 +119,10 @@ impl FunctionArgs {
                 order_by: vec![],
                 over,
                 filter,
-                null_treatment,
+                null_treatment: null_treatment.map(|v| v.into()),
                 distinct: false,
                 within_group,
+                function_without_parentheses: args == FunctionArguments::None,
             });
         };
 
@@ -148,36 +153,45 @@ impl FunctionArgs {
                 FunctionArgumentClause::OrderBy(oby) => {
                     if order_by.is_some() {
                         if !within_group.is_empty() {
-                            return plan_err!("ORDER BY clause is only permitted in WITHIN GROUP clause when a WITHIN GROUP is used");
+                            return plan_err!(
+                                "ORDER BY clause is only permitted in WITHIN GROUP clause when a WITHIN GROUP is used"
+                            );
                         }
-                        return not_impl_err!("Calling {name}: Duplicated ORDER BY clause in function arguments");
+                        return not_impl_err!(
+                            "Calling {name}: Duplicated ORDER BY clause in function arguments"
+                        );
                     }
                     order_by = Some(oby);
                 }
                 FunctionArgumentClause::Limit(limit) => {
                     return not_impl_err!(
                         "Calling {name}: LIMIT not supported in function arguments: {limit}"
-                    )
+                    );
                 }
                 FunctionArgumentClause::OnOverflow(overflow) => {
                     return not_impl_err!(
                         "Calling {name}: ON OVERFLOW not supported in function arguments: {overflow}"
-                    )
+                    );
                 }
                 FunctionArgumentClause::Having(having) => {
                     return not_impl_err!(
                         "Calling {name}: HAVING not supported in function arguments: {having}"
-                    )
+                    );
                 }
                 FunctionArgumentClause::Separator(sep) => {
                     return not_impl_err!(
                         "Calling {name}: SEPARATOR not supported in function arguments: {sep}"
-                    )
+                    );
                 }
                 FunctionArgumentClause::JsonNullClause(jn) => {
                     return not_impl_err!(
                         "Calling {name}: JSON NULL clause not supported in function arguments: {jn}"
-                    )
+                    );
+                }
+                FunctionArgumentClause::JsonReturningClause(jr) => {
+                    return not_impl_err!(
+                        "Calling {name}: JSON RETURNING clause not supported in function arguments: {jr}"
+                    );
                 }
             }
         }
@@ -196,13 +210,17 @@ impl FunctionArgs {
             order_by,
             over,
             filter,
-            null_treatment,
+            null_treatment: null_treatment.map(|v| v.into()),
             distinct,
             within_group,
+            function_without_parentheses: false,
         })
     }
 }
 
+// Helper type for extracting WITHIN GROUP ordering and prepended args
+type WithinGroupExtraction = (Vec<SortExpr>, Vec<Expr>, Vec<Option<ArgumentName>>);
+
 impl<S: ContextProvider> SqlToRel<'_, S> {
     pub(super) fn sql_function_to_expr(
         &self,
@@ -212,7 +230,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     ) -> Result<Expr> {
         let function_args = FunctionArgs::try_new(function)?;
         let FunctionArgs {
-            name,
+            name: object_name,
             args,
             order_by,
             over,
@@ -220,37 +238,81 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             null_treatment,
             distinct,
             within_group,
+            function_without_parentheses,
         } = function_args;
 
         if over.is_some() && !within_group.is_empty() {
-            return plan_err!("OVER and WITHIN GROUP clause are can not be used together. \
-                OVER is for window function, whereas WITHIN GROUP is for ordered set aggregate function");
+            return plan_err!(
+                "OVER and WITHIN GROUP clause cannot be used together. \
+                OVER is for window functions, whereas WITHIN GROUP is for ordered set aggregate functions"
+            );
+        }
+
+        if !order_by.is_empty() && !within_group.is_empty() {
+            return plan_err!(
+                "ORDER BY and WITHIN GROUP clauses cannot be used together in the same aggregate function"
+            );
         }
 
         // If function is a window function (it has an OVER clause),
         // it shouldn't have ordering requirement as function argument
         // required ordering should be defined in OVER clause.
         let is_function_window = over.is_some();
-        let sql_parser_span = name.0[0].span();
-        let name = if name.0.len() > 1 {
+        let sql_parser_span = object_name.0[0].span();
+        let name = if object_name.0.len() > 1 {
             // DF doesn't handle compound identifiers
             // (e.g. "foo.bar") for function names yet
-            name.to_string()
+            object_name.to_string()
         } else {
-            match name.0[0].as_ident() {
+            match object_name.0[0].as_ident() {
                 Some(ident) => crate::utils::normalize_ident(ident.clone()),
                 None => {
                     return plan_err!(
                         "Expected an identifier in function name, but found {:?}",
-                        name.0[0]
-                    )
+                        object_name.0[0]
+                    );
                 }
             }
         };
 
-        if name.eq("make_map") {
+        // handle make_map and map functions
+        // make_map always uses plan_make_map: make_map(k1, v1, k2, v2, ...)
+        // map has 2 syntaxes:
+        //     1. map([keys], [values]) - two arrays that get zipped
+        //     2. map(k1, v1, k2, v2, ...) - variadic pairs (uses plan_make_map)
+        let use_plan_make_map = match name.as_str() {
+            "make_map" => true,
+            "map" => {
+                // for map, check if this is the first syntax variant (two-array)
+                let args =
+                    self.function_args_to_expr(args.clone(), schema, planner_context)?;
+
+                let is_two_array_syntax = args.len() == 2
+                    && args.iter().all(|arg| {
+                        matches!(
+                            arg.get_type(schema),
+                            Ok(DataType::List(_))
+                                | Ok(DataType::LargeList(_))
+                                | Ok(DataType::FixedSizeList(_, _))
+                        )
+                    });
+
+                // map function with variadic syntax requires non-empty list of arguments
+                if !is_two_array_syntax && args.is_empty() {
+                    return plan_err!(
+                        "Function 'map' expected at least one argument but received 0"
+                    );
+                }
+
+                !is_two_array_syntax
+            }
+            _ => false,
+        };
+
+        if use_plan_make_map {
             let mut fn_args =
                 self.function_args_to_expr(args.clone(), schema, planner_context)?;
+
             for planner in self.context_provider.get_expr_planners().iter() {
                 match planner.plan_make_map(fn_args)? {
                     PlannerResult::Planned(expr) => return Ok(expr),
@@ -260,8 +322,45 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         }
         // User-defined function (UDF) should have precedence
         if let Some(fm) = self.context_provider.get_function_meta(&name) {
-            let args = self.function_args_to_expr(args, schema, planner_context)?;
-            return Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fm, args)));
+            let (args, arg_names) =
+                self.function_args_to_expr_with_names(args, schema, planner_context)?;
+
+            let resolved_args = if arg_names.iter().any(|name| name.is_some()) {
+                if let Some(param_names) = &fm.signature().parameter_names {
+                    datafusion_expr::arguments::resolve_function_arguments(
+                        param_names,
+                        args,
+                        arg_names,
+                    )?
+                } else {
+                    return plan_err!(
+                        "Function '{}' does not support named arguments",
+                        fm.name()
+                    );
+                }
+            } else {
+                args
+            };
+
+            // After resolution, all arguments are positional
+            let inner = ScalarFunction::new_udf(fm, resolved_args);
+
+            if name.eq_ignore_ascii_case(inner.name()) {
+                return Ok(Expr::ScalarFunction(inner));
+            } else {
+                // If the function is called by an alias, a verbose string representation is created
+                // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias`
+                // to ensure the output column name matches the user's query.
+                let arg_names = inner
+                    .args
+                    .iter()
+                    .map(|arg| arg.to_string())
+                    .collect::<Vec<_>>()
+                    .join(",");
+                let verbose_alias = format!("{name}({arg_names})");
+
+                return Ok(Expr::ScalarFunction(inner).alias(verbose_alias));
+            }
         }
 
         // Build Unnest expression
@@ -335,14 +434,46 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             };
 
             if let Ok(fun) = self.find_window_func(&name) {
-                let args = self.function_args_to_expr(args, schema, planner_context)?;
+                let (args, arg_names) =
+                    self.function_args_to_expr_with_names(args, schema, planner_context)?;
+
+                let resolved_args = if arg_names.iter().any(|name| name.is_some()) {
+                    let signature = match &fun {
+                        WindowFunctionDefinition::AggregateUDF(udaf) => udaf.signature(),
+                        WindowFunctionDefinition::WindowUDF(udwf) => udwf.signature(),
+                    };
+
+                    if let Some(param_names) = &signature.parameter_names {
+                        datafusion_expr::arguments::resolve_function_arguments(
+                            param_names,
+                            args,
+                            arg_names,
+                        )?
+                    } else {
+                        return plan_err!(
+                            "Window function '{}' does not support named arguments",
+                            name
+                        );
+                    }
+                } else {
+                    args
+                };
+
+                // Plan FILTER clause if present
+                let filter = filter
+                    .map(|e| self.sql_expr_to_logical_expr(*e, schema, planner_context))
+                    .transpose()?
+                    .map(Box::new);
+
                 let mut window_expr = RawWindowExpr {
                     func_def: fun,
-                    args,
+                    args: resolved_args,
                     partition_by,
                     order_by,
                     window_frame,
+                    filter,
                     null_treatment,
+                    distinct: function_args.distinct,
                 };
 
                 for planner in self.context_provider.get_expr_planners().iter() {
@@ -358,23 +489,45 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     partition_by,
                     order_by,
                     window_frame,
+                    filter,
                     null_treatment,
+                    distinct,
                 } = window_expr;
 
-                return Expr::WindowFunction(expr::WindowFunction::new(func_def, args))
-                    .partition_by(partition_by)
-                    .order_by(order_by)
-                    .window_frame(window_frame)
-                    .null_treatment(null_treatment)
-                    .build();
+                let inner = WindowFunction {
+                    fun: func_def,
+                    params: expr::WindowFunctionParams {
+                        args,
+                        partition_by,
+                        order_by,
+                        window_frame,
+                        filter,
+                        null_treatment,
+                        distinct,
+                    },
+                };
+
+                if name.eq_ignore_ascii_case(inner.fun.name()) {
+                    return Ok(Expr::WindowFunction(Box::new(inner)));
+                } else {
+                    // If the function is called by an alias, a verbose string representation is created
+                    // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias`
+                    // to ensure the output column name matches the user's query.
+                    let arg_names = inner
+                        .params
+                        .args
+                        .iter()
+                        .map(|arg| arg.to_string())
+                        .collect::<Vec<_>>()
+                        .join(",");
+                    let verbose_alias = format!("{name}({arg_names})");
+
+                    return Ok(Expr::WindowFunction(Box::new(inner)).alias(verbose_alias));
+                }
             }
         } else {
             // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function
             if let Some(fm) = self.context_provider.get_aggregate_meta(&name) {
-                if fm.is_ordered_set_aggregate() && within_group.is_empty() {
-                    return plan_err!("WITHIN GROUP clause is required when calling ordered set aggregate function({})", fm.name());
-                }
-
                 if null_treatment.is_some() && !fm.supports_null_handling_clause() {
                     return plan_err!(
                         "[IGNORE | RESPECT] NULLS are not permitted for {}",
@@ -382,36 +535,46 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     );
                 }
 
-                let mut args =
-                    self.function_args_to_expr(args, schema, planner_context)?;
+                let (mut args, mut arg_names) =
+                    self.function_args_to_expr_with_names(args, schema, planner_context)?;
 
-                let order_by = if fm.is_ordered_set_aggregate() {
-                    let within_group = self.order_by_to_sort_expr(
-                        within_group,
-                        schema,
-                        planner_context,
-                        false,
-                        None,
-                    )?;
-
-                    // add target column expression in within group clause to function arguments
-                    if !within_group.is_empty() {
-                        args = within_group
-                            .iter()
-                            .map(|sort| sort.expr.clone())
-                            .chain(args)
-                            .collect::<Vec<_>>();
-                    }
-                    (!within_group.is_empty()).then_some(within_group)
+                // UDAFs must opt-in via `supports_within_group_clause()` to
+                // accept a WITHIN GROUP clause.
+                let supports_within_group = fm.supports_within_group_clause();
+
+                if !within_group.is_empty() && !supports_within_group {
+                    return plan_err!(
+                        "WITHIN GROUP is only supported for ordered-set aggregate functions"
+                    );
+                }
+
+                // If the UDAF supports WITHIN GROUP, convert the ordering into
+                // sort expressions and prepend them as unnamed function args.
+                let order_by = if supports_within_group {
+                    let (within_group_sorts, new_args, new_arg_names) = self
+                        .extract_and_prepend_within_group_args(
+                            within_group,
+                            args,
+                            arg_names,
+                            schema,
+                            planner_context,
+                        )?;
+                    args = new_args;
+                    arg_names = new_arg_names;
+                    within_group_sorts
                 } else {
-                    let order_by = self.order_by_to_sort_expr(
+                    let order_by = if !order_by.is_empty() {
+                        order_by
+                    } else {
+                        within_group
+                    };
+                    self.order_by_to_sort_expr(
                         order_by,
                         schema,
                         planner_context,
                         true,
                         None,
-                    )?;
-                    (!order_by.is_empty()).then_some(order_by)
+                    )?
                 };
 
                 let filter: Option<Box<Expr>> = filter
@@ -419,9 +582,26 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     .transpose()?
                     .map(Box::new);
 
+                let resolved_args = if arg_names.iter().any(|name| name.is_some()) {
+                    if let Some(param_names) = &fm.signature().parameter_names {
+                        datafusion_expr::arguments::resolve_function_arguments(
+                            param_names,
+                            args,
+                            arg_names,
+                        )?
+                    } else {
+                        return plan_err!(
+                            "Aggregate function '{}' does not support named arguments",
+                            fm.name()
+                        );
+                    }
+                } else {
+                    args
+                };
+
                 let mut aggregate_expr = RawAggregateExpr {
                     func: fm,
-                    args,
+                    args: resolved_args,
                     distinct,
                     filter,
                     order_by,
@@ -443,16 +623,59 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     null_treatment,
                 } = aggregate_expr;
 
-                return Ok(Expr::AggregateFunction(expr::AggregateFunction::new_udf(
+                let inner = expr::AggregateFunction::new_udf(
                     func,
                     args,
                     distinct,
                     filter,
                     order_by,
                     null_treatment,
-                )));
+                );
+
+                if name.eq_ignore_ascii_case(inner.func.name()) {
+                    return Ok(Expr::AggregateFunction(inner));
+                } else {
+                    // If the function is called by an alias, a verbose string representation is created
+                    // (e.g., "my_alias(arg1, arg2)") and the expression is wrapped in an `Alias`
+                    // to ensure the output column name matches the user's query.
+                    let arg_names = inner
+                        .params
+                        .args
+                        .iter()
+                        .map(|arg| arg.to_string())
+                        .collect::<Vec<_>>()
+                        .join(",");
+                    let verbose_alias = format!("{name}({arg_names})");
+
+                    return Ok(Expr::AggregateFunction(inner).alias(verbose_alias));
+                }
             }
         }
+
+        // workaround for https://github.com/apache/datafusion-sqlparser-rs/issues/1909
+        if function_without_parentheses {
+            let maybe_ids = object_name
+                .0
+                .iter()
+                .map(|part| part.as_ident().cloned().ok_or(()))
+                .collect::<Result<Vec<_>, ()>>();
+            if let Ok(ids) = maybe_ids {
+                if ids.len() == 1 {
+                    return self.sql_identifier_to_expr(
+                        ids.into_iter().next().unwrap(),
+                        schema,
+                        planner_context,
+                    );
+                } else {
+                    return self.sql_compound_identifier_to_expr(
+                        ids,
+                        schema,
+                        planner_context,
+                    );
+                }
+            }
+        }
+
         // Could not find the relevant function, so return an error
         if let Some(suggested_func_name) =
             suggest_valid_function(&name, is_function_window, self.context_provider)
@@ -514,14 +737,32 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
+        let (expr, _) =
+            self.sql_fn_arg_to_logical_expr_with_name(sql, schema, planner_context)?;
+        Ok(expr)
+    }
+
+    fn sql_fn_arg_to_logical_expr_with_name(
+        &self,
+        sql: FunctionArg,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<(Expr, Option<ArgumentName>)> {
         match sql {
             FunctionArg::Named {
-                name: _,
+                name,
                 arg: FunctionArgExpr::Expr(arg),
                 operator: _,
-            } => self.sql_expr_to_logical_expr(arg, schema, planner_context),
+            } => {
+                let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?;
+                let arg_name = ArgumentName {
+                    value: name.value,
+                    is_quoted: name.quote_style.is_some(),
+                };
+                Ok((expr, Some(arg_name)))
+            }
             FunctionArg::Named {
-                name: _,
+                name,
                 arg: FunctionArgExpr::Wildcard,
                 operator: _,
             } => {
@@ -530,11 +771,15 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     qualifier: None,
                     options: Box::new(WildcardOptions::default()),
                 };
-
-                Ok(expr)
+                let arg_name = ArgumentName {
+                    value: name.value,
+                    is_quoted: name.quote_style.is_some(),
+                };
+                Ok((expr, Some(arg_name)))
             }
             FunctionArg::Unnamed(FunctionArgExpr::Expr(arg)) => {
-                self.sql_expr_to_logical_expr(arg, schema, planner_context)
+                let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?;
+                Ok((expr, None))
             }
             FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => {
                 #[expect(deprecated)]
@@ -542,8 +787,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     qualifier: None,
                     options: Box::new(WildcardOptions::default()),
                 };
-
-                Ok(expr)
+                Ok((expr, None))
             }
             FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(object_name)) => {
                 let qualifier = self.object_name_to_table_reference(object_name)?;
@@ -558,8 +802,36 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     qualifier: qualifier.into(),
                     options: Box::new(WildcardOptions::default()),
                 };
-
-                Ok(expr)
+                Ok((expr, None))
+            }
+            // PostgreSQL dialect uses ExprNamed variant with expression for name
+            FunctionArg::ExprNamed {
+                name: SQLExpr::Identifier(name),
+                arg: FunctionArgExpr::Expr(arg),
+                operator: _,
+            } => {
+                let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?;
+                let arg_name = ArgumentName {
+                    value: name.value,
+                    is_quoted: name.quote_style.is_some(),
+                };
+                Ok((expr, Some(arg_name)))
+            }
+            FunctionArg::ExprNamed {
+                name: SQLExpr::Identifier(name),
+                arg: FunctionArgExpr::Wildcard,
+                operator: _,
+            } => {
+                #[expect(deprecated)]
+                let expr = Expr::Wildcard {
+                    qualifier: None,
+                    options: Box::new(WildcardOptions::default()),
+                };
+                let arg_name = ArgumentName {
+                    value: name.value,
+                    is_quoted: name.quote_style.is_some(),
+                };
+                Ok((expr, Some(arg_name)))
             }
             _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"),
         }
@@ -576,12 +848,65 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             .collect::<Result<Vec<Expr>>>()
     }
 
+    pub(super) fn function_args_to_expr_with_names(
+        &self,
+        args: Vec<FunctionArg>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<(Vec<Expr>, Vec<Option<ArgumentName>>)> {
+        let results: Result<Vec<(Expr, Option<ArgumentName>)>> = args
+            .into_iter()
+            .map(|a| {
+                self.sql_fn_arg_to_logical_expr_with_name(a, schema, planner_context)
+            })
+            .collect();
+
+        let pairs = results?;
+        let (exprs, names): (Vec<Expr>, Vec<Option<ArgumentName>>) =
+            pairs.into_iter().unzip();
+        Ok((exprs, names))
+    }
+
+    fn extract_and_prepend_within_group_args(
+        &self,
+        within_group: Vec<OrderByExpr>,
+        mut args: Vec<Expr>,
+        mut arg_names: Vec<Option<ArgumentName>>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<WithinGroupExtraction> {
+        let within_group = self.order_by_to_sort_expr(
+            within_group,
+            schema,
+            planner_context,
+            false,
+            None,
+        )?;
+
+        if !within_group.is_empty() {
+            let within_group_count = within_group.len();
+            arg_names = std::iter::repeat_n(None, within_group_count)
+                .chain(arg_names)
+                .collect();
+
+            args = within_group
+                .iter()
+                .map(|sort| sort.expr.clone())
+                .chain(args)
+                .collect::<Vec<_>>();
+        }
+
+        Ok((within_group, args, arg_names))
+    }
+
     pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> {
         // Check argument type, array types are supported
         match arg.get_type(schema)? {
             DataType::List(_)
             | DataType::LargeList(_)
             | DataType::FixedSizeList(_, _)
+            | DataType::ListView(_)
+            | DataType::LargeListView(_)
             | DataType::Struct(_) => Ok(()),
             DataType::Null => {
                 not_impl_err!("unnest() does not support null yet")
diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs
index 7c276ce53e35d..cca09df0db027 100644
--- a/datafusion/sql/src/expr/identifier.rs
+++ b/datafusion/sql/src/expr/identifier.rs
@@ -15,14 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::Field;
+use arrow::datatypes::FieldRef;
+use datafusion_common::datatype::DataTypeExt;
 use datafusion_common::{
-    internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema,
-    DataFusionError, Result, Span, TableReference,
+    Column, DFSchema, Result, Span, TableReference, assert_or_internal_err,
+    exec_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err,
 };
 use datafusion_expr::planner::PlannerResult;
 use datafusion_expr::{Case, Expr};
 use sqlparser::ast::{CaseWhen, Expr as SQLExpr, Ident};
+use std::sync::Arc;
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 use datafusion_expr::UNNAMED_TABLE;
@@ -35,16 +37,21 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
         let id_span = id.span;
-        if id.value.starts_with('@') {
+        if id.value.starts_with('@') && id.quote_style.is_none() {
             // TODO: figure out if ScalarVariables should be insensitive.
             let var_names = vec![id.value];
-            let ty = self
+            let field = self
                 .context_provider
-                .get_variable_type(&var_names)
+                .get_variable_field(&var_names)
+                .or_else(|| {
+                    self.context_provider
+                        .get_variable_type(&var_names)
+                        .map(|ty| ty.into_nullable_field_ref())
+                })
                 .ok_or_else(|| {
                     plan_datafusion_err!("variable {var_names:?} has no type information")
                 })?;
-            Ok(Expr::ScalarVariable(ty, var_names))
+            Ok(Expr::ScalarVariable(field, var_names))
         } else {
             // Don't use `col()` here because it will try to
             // interpret names with '.' as if they were
@@ -60,22 +67,22 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(),
                     normalize_ident,
                 );
-                if self.options.collect_spans {
-                    if let Some(span) = Span::try_from_sqlparser_span(id_span) {
-                        column.spans_mut().add_span(span);
-                    }
+                if self.options.collect_spans
+                    && let Some(span) = Span::try_from_sqlparser_span(id_span)
+                {
+                    column.spans_mut().add_span(span);
                 }
                 return Ok(Expr::Column(column));
             }
 
             // Check the outer query schema
-            if let Some(outer) = planner_context.outer_query_schema() {
+            for outer in planner_context.outer_schemas_iter() {
                 if let Ok((qualifier, field)) =
                     outer.qualified_field_with_unqualified_name(normalize_ident.as_str())
                 {
                     // Found an exact match on a qualified name in the outer plan schema, so this is an outer reference column
                     return Ok(Expr::OuterReferenceColumn(
-                        field.data_type().clone(),
+                        Arc::clone(field),
                         Column::from((qualifier, field)),
                     ));
                 }
@@ -83,10 +90,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
             // Default case
             let mut column = Column::new_unqualified(normalize_ident);
-            if self.options.collect_spans {
-                if let Some(span) = Span::try_from_sqlparser_span(id_span) {
-                    column.spans_mut().add_span(span);
-                }
+            if self.options.collect_spans
+                && let Some(span) = Span::try_from_sqlparser_span(id_span)
+            {
+                column.spans_mut().add_span(span);
             }
             Ok(Expr::Column(column))
         }
@@ -98,29 +105,30 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
-        if ids.len() < 2 {
-            return internal_err!("Not a compound identifier: {ids:?}");
-        }
+        assert_or_internal_err!(ids.len() >= 2, "Not a compound identifier: {ids:?}");
 
         let ids_span = Span::union_iter(
             ids.iter()
                 .filter_map(|id| Span::try_from_sqlparser_span(id.span)),
         );
 
-        if ids[0].value.starts_with('@') {
+        if ids[0].value.starts_with('@') && ids[0].quote_style.is_none() {
             let var_names: Vec<_> = ids
                 .into_iter()
                 .map(|id| self.ident_normalizer.normalize(id))
                 .collect();
-            let ty = self
+            let field = self
                 .context_provider
-                .get_variable_type(&var_names)
+                .get_variable_field(&var_names)
+                .or_else(|| {
+                    self.context_provider
+                        .get_variable_type(&var_names)
+                        .map(|ty| ty.into_nullable_field_ref())
+                })
                 .ok_or_else(|| {
-                    DataFusionError::Execution(format!(
-                        "variable {var_names:?} has no type information"
-                    ))
+                    exec_datafusion_err!("variable {var_names:?} has no type information")
                 })?;
-            Ok(Expr::ScalarVariable(ty, var_names))
+            Ok(Expr::ScalarVariable(field, var_names))
         } else {
             let ids = ids
                 .into_iter()
@@ -151,10 +159,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 // Found matching field with no spare identifier(s)
                 Some((field, qualifier, _nested_names)) => {
                     let mut column = Column::from((qualifier, field));
-                    if self.options.collect_spans {
-                        if let Some(span) = ids_span {
-                            column.spans_mut().add_span(span);
-                        }
+                    if self.options.collect_spans
+                        && let Some(span) = ids_span
+                    {
+                        column.spans_mut().add_span(span);
                     }
                     Ok(Expr::Column(column))
                 }
@@ -165,48 +173,43 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         not_impl_err!("compound identifier: {ids:?}")
                     } else {
                         // Check the outer_query_schema and try to find a match
-                        if let Some(outer) = planner_context.outer_query_schema() {
+                        for outer in planner_context.outer_schemas_iter() {
                             let search_result = search_dfschema(&ids, outer);
-                            match search_result {
+                            let result = match search_result {
                                 // Found matching field with spare identifier(s) for nested field(s) in structure
                                 Some((field, qualifier, nested_names))
                                     if !nested_names.is_empty() =>
                                 {
-                                    // TODO: remove when can support nested identifiers for OuterReferenceColumn
+                                    // TODO: remove this when we have support for nested identifiers for OuterReferenceColumn
                                     not_impl_err!(
                                         "Nested identifiers are not yet supported for OuterReferenceColumn {}",
-                                        Column::from((qualifier, field)).quoted_flat_name()
+                                        Column::from((qualifier, field))
+                                            .quoted_flat_name()
                                     )
                                 }
                                 // Found matching field with no spare identifier(s)
                                 Some((field, qualifier, _nested_names)) => {
                                     // Found an exact match on a qualified name in the outer plan schema, so this is an outer reference column
                                     Ok(Expr::OuterReferenceColumn(
-                                        field.data_type().clone(),
+                                        Arc::clone(field),
                                         Column::from((qualifier, field)),
                                     ))
                                 }
                                 // Found no matching field, will return a default
-                                None => {
-                                    let s = &ids[0..ids.len()];
-                                    // safe unwrap as s can never be empty or exceed the bounds
-                                    let (relation, column_name) =
-                                        form_identifier(s).unwrap();
-                                    Ok(Expr::Column(Column::new(relation, column_name)))
-                                }
-                            }
-                        } else {
-                            let s = &ids[0..ids.len()];
-                            // Safe unwrap as s can never be empty or exceed the bounds
-                            let (relation, column_name) = form_identifier(s).unwrap();
-                            let mut column = Column::new(relation, column_name);
-                            if self.options.collect_spans {
-                                if let Some(span) = ids_span {
-                                    column.spans_mut().add_span(span);
-                                }
-                            }
-                            Ok(Expr::Column(column))
+                                None => continue,
+                            };
+                            return result;
+                        }
+                        // Safe unwrap as column name can never be empty or exceed the bounds
+                        let (relation, column_name) =
+                            form_identifier(&ids[0..ids.len()]).unwrap();
+                        let mut column = Column::new(relation, column_name);
+                        if self.options.collect_spans
+                            && let Some(span) = ids_span
+                        {
+                            column.spans_mut().add_span(span);
                         }
+                        Ok(Expr::Column(column))
                     }
                 }
             }
@@ -294,7 +297,7 @@ fn search_dfschema<'ids, 'schema>(
     ids: &'ids [String],
     schema: &'schema DFSchema,
 ) -> Option<(
-    &'schema Field,
+    &'schema FieldRef,
     Option<&'schema TableReference>,
     &'ids [String],
 )> {
@@ -459,8 +462,8 @@ mod test {
     fn test_form_identifier() -> Result<()> {
         let err = form_identifier(&[]).expect_err("empty identifiers didn't fail");
         let expected = "Internal error: Incorrect number of identifiers: 0.\n\
-        This was likely caused by a bug in DataFusion's code and we would \
-        welcome that you file an bug report in our issue tracker";
+         This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this \
+         by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues";
         assert!(expected.starts_with(&err.strip_backtrace()));
 
         let ids = vec!["a".to_string()];
@@ -497,8 +500,8 @@ mod test {
         ])
         .expect_err("too many identifiers didn't fail");
         let expected = "Internal error: Incorrect number of identifiers: 5.\n\
-        This was likely caused by a bug in DataFusion's code and we would \
-        welcome that you file an bug report in our issue tracker";
+         This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this \
+         by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues";
         assert!(expected.starts_with(&err.strip_backtrace()));
 
         Ok(())
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index d29ccdc6a7e9e..79d2bd6ad847a 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -20,24 +20,27 @@ use datafusion_expr::planner::{
     PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr,
 };
 use sqlparser::ast::{
-    AccessExpr, BinaryOperator, CastFormat, CastKind, DataType as SQLDataType,
-    DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry,
-    StructField, Subscript, TrimWhereField, Value, ValueWithSpan,
+    AccessExpr, BinaryOperator, CastFormat, CastKind, CeilFloorKind,
+    DataType as SQLDataType, DateTimeField, DictionaryField, Expr as SQLExpr,
+    ExprWithAlias as SQLExprWithAlias, JsonPath, MapEntry, StructField, Subscript,
+    TrimWhereField, TypedString, Value, ValueWithSpan,
 };
 
 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result,
-    ScalarValue,
+    DFSchema, Result, ScalarValue, internal_datafusion_err, internal_err, not_impl_err,
+    plan_err,
 };
 
 use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::expr::SetQuantifier;
 use datafusion_expr::expr::{InList, WildcardOptions};
 use datafusion_expr::{
-    lit, Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal,
-    Operator, TryCast,
+    Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal,
+    Operator, TryCast, lit,
 };
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
+use datafusion_functions_nested::expr_fn::array_has;
 
 mod binary_op;
 mod function;
@@ -139,7 +142,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let RawBinaryExpr { op, left, right } = binary_expr;
         Ok(Expr::BinaryExpr(BinaryExpr::new(
             Box::new(left),
-            self.parse_sql_binary_op(op)?,
+            self.parse_sql_binary_op(&op)?,
             Box::new(right),
         )))
     }
@@ -215,7 +218,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             }
             SQLExpr::Extract { field, expr, .. } => {
                 let mut extract_args = vec![
-                    Expr::Literal(ScalarValue::from(format!("{field}"))),
+                    Expr::Literal(ScalarValue::from(format!("{field}")), None),
                     self.sql_expr_to_logical_expr(*expr, schema, planner_context)?,
                 ];
 
@@ -254,6 +257,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 operand,
                 conditions,
                 else_result,
+                case_token: _,
+                end_token: _,
             } => self.sql_case_identifier_to_expr(
                 operand,
                 conditions,
@@ -262,36 +267,48 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 planner_context,
             ),
 
+            SQLExpr::Cast { array: true, .. } => {
+                not_impl_err!("`CAST(... AS type ARRAY`) not supported")
+            }
+
             SQLExpr::Cast {
                 kind: CastKind::Cast | CastKind::DoubleColon,
                 expr,
                 data_type,
                 format,
-            } => self.sql_cast_to_expr(*expr, data_type, format, schema, planner_context),
+                array: false,
+            } => {
+                self.sql_cast_to_expr(*expr, &data_type, format, schema, planner_context)
+            }
 
             SQLExpr::Cast {
                 kind: CastKind::TryCast | CastKind::SafeCast,
                 expr,
                 data_type,
                 format,
+                array: false,
             } => {
                 if let Some(format) = format {
                     return not_impl_err!("CAST with format is not supported: {format}");
                 }
 
-                Ok(Expr::TryCast(TryCast::new(
+                Ok(Expr::TryCast(TryCast::new_from_field(
                     Box::new(self.sql_expr_to_logical_expr(
                         *expr,
                         schema,
                         planner_context,
                     )?),
-                    self.convert_data_type(&data_type)?,
+                    self.convert_data_type_to_field(&data_type)?,
                 )))
             }
 
-            SQLExpr::TypedString { data_type, value } => Ok(Expr::Cast(Cast::new(
+            SQLExpr::TypedString(TypedString {
+                data_type,
+                value,
+                uses_odbc_syntax: _,
+            }) => Ok(Expr::Cast(Cast::new_from_field(
                 Box::new(lit(value.into_string().unwrap())),
-                self.convert_data_type(&data_type)?,
+                self.convert_data_type_to_field(&data_type)?,
             ))),
 
             SQLExpr::IsNull(expr) => Ok(Expr::IsNull(Box::new(
@@ -446,6 +463,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 substring_from,
                 substring_for,
                 special: _,
+                shorthand: _,
             } => self.sql_substring_to_expr(
                 expr,
                 substring_from,
@@ -487,14 +505,28 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 self.sql_grouping_sets_to_expr(exprs, schema, planner_context)
             }
 
-            SQLExpr::Floor {
-                expr,
-                field: _field,
-            } => self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context),
-            SQLExpr::Ceil {
-                expr,
-                field: _field,
-            } => self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context),
+            SQLExpr::Floor { expr, field } => match field {
+                CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => {
+                    self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context)
+                }
+                CeilFloorKind::DateTimeField(_) => {
+                    not_impl_err!("FLOOR with datetime is not supported")
+                }
+                CeilFloorKind::Scale(_) => {
+                    not_impl_err!("FLOOR with scale is not supported")
+                }
+            },
+            SQLExpr::Ceil { expr, field } => match field {
+                CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => {
+                    self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context)
+                }
+                CeilFloorKind::DateTimeField(_) => {
+                    not_impl_err!("CEIL with datetime is not supported")
+                }
+                CeilFloorKind::Scale(_) => {
+                    not_impl_err!("CEIL with scale is not supported")
+                }
+            },
             SQLExpr::Overlay {
                 expr,
                 overlay_what,
@@ -527,7 +559,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             }
 
             SQLExpr::Struct { values, fields } => {
-                self.parse_struct(schema, planner_context, values, fields)
+                self.parse_struct(schema, planner_context, values, &fields)
             }
             SQLExpr::Position { expr, r#in } => {
                 self.sql_position_to_expr(*expr, *r#in, schema, planner_context)
@@ -549,7 +581,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     _ => {
                         return not_impl_err!(
                             "Unsupported ast node in sqltorel: {time_zone:?}"
-                        )
+                        );
                     }
                 },
             ))),
@@ -566,32 +598,44 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 // ANY/SOME are equivalent, this field specifies which the user
                 // specified but it doesn't affect the plan so ignore the field
                 is_some: _,
-            } => {
-                let mut binary_expr = RawBinaryExpr {
-                    op: compare_op,
-                    left: self.sql_expr_to_logical_expr(
-                        *left,
-                        schema,
-                        planner_context,
-                    )?,
-                    right: self.sql_expr_to_logical_expr(
-                        *right,
-                        schema,
-                        planner_context,
-                    )?,
-                };
-                for planner in self.context_provider.get_expr_planners() {
-                    match planner.plan_any(binary_expr)? {
-                        PlannerResult::Planned(expr) => {
-                            return Ok(expr);
-                        }
-                        PlannerResult::Original(expr) => {
-                            binary_expr = expr;
-                        }
+            } => match *right {
+                SQLExpr::Subquery(subquery) => self.parse_set_comparison_subquery(
+                    *left,
+                    *subquery,
+                    &compare_op,
+                    SetQuantifier::Any,
+                    schema,
+                    planner_context,
+                ),
+                _ => {
+                    if compare_op != BinaryOperator::Eq {
+                        plan_err!(
+                            "Unsupported AnyOp: '{compare_op}', only '=' is supported"
+                        )
+                    } else {
+                        let left_expr =
+                            self.sql_to_expr(*left, schema, planner_context)?;
+                        let right_expr =
+                            self.sql_to_expr(*right, schema, planner_context)?;
+                        Ok(array_has(right_expr, left_expr))
                     }
                 }
-                not_impl_err!("AnyOp not supported by ExprPlanner: {binary_expr:?}")
-            }
+            },
+            SQLExpr::AllOp {
+                left,
+                compare_op,
+                right,
+            } => match *right {
+                SQLExpr::Subquery(subquery) => self.parse_set_comparison_subquery(
+                    *left,
+                    *subquery,
+                    &compare_op,
+                    SetQuantifier::All,
+                    schema,
+                    planner_context,
+                ),
+                _ => not_impl_err!("ALL only supports subquery comparison currently"),
+            },
             #[expect(deprecated)]
             SQLExpr::Wildcard(_token) => Ok(Expr::Wildcard {
                 qualifier: None,
@@ -603,17 +647,43 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 options: Box::new(WildcardOptions::default()),
             }),
             SQLExpr::Tuple(values) => self.parse_tuple(schema, planner_context, values),
+            SQLExpr::JsonAccess { value, path } => {
+                self.parse_json_access(schema, planner_context, value, &path)
+            }
             _ => not_impl_err!("Unsupported ast node in sqltorel: {sql:?}"),
         }
     }
 
+    fn parse_json_access(
+        &self,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+        value: Box<SQLExpr>,
+        path: &JsonPath,
+    ) -> Result<Expr> {
+        let json_path = path.to_string();
+        let json_path = if let Some(json_path) = json_path.strip_prefix(":") {
+            // sqlparser's JsonPath display adds an extra `:` at the beginning.
+            json_path.to_owned()
+        } else {
+            json_path
+        };
+        self.build_logical_expr(
+            BinaryOperator::Custom(":".to_owned()),
+            self.sql_to_expr(*value, schema, planner_context)?,
+            // pass json path as a string literal, let the impl parse it when needed.
+            Expr::Literal(ScalarValue::Utf8(Some(json_path)), None),
+            schema,
+        )
+    }
+
     /// Parses a struct(..) expression and plans it creation
     fn parse_struct(
         &self,
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
         values: Vec<SQLExpr>,
-        fields: Vec<StructField>,
+        fields: &[StructField],
     ) -> Result<Expr> {
         if !fields.is_empty() {
             return not_impl_err!("Struct fields are not supported yet");
@@ -644,8 +714,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         values: Vec<SQLExpr>,
     ) -> Result<Expr> {
         match values.first() {
-            Some(SQLExpr::Identifier(_)) | Some(SQLExpr::Value(_)) => {
-                self.parse_struct(schema, planner_context, values, vec![])
+            Some(SQLExpr::Identifier(_))
+            | Some(SQLExpr::Value(_))
+            | Some(SQLExpr::CompoundIdentifier(_)) => {
+                self.parse_struct(schema, planner_context, values, &[])
             }
             None => not_impl_err!("Empty tuple not supported yet"),
             _ => {
@@ -805,13 +877,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         )))
     }
 
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     fn sql_like_to_expr(
         &self,
         negated: bool,
         expr: SQLExpr,
         pattern: SQLExpr,
-        escape_char: Option<String>,
+        escape_char: Option<Value>,
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
         case_insensitive: bool,
@@ -821,13 +893,16 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             return not_impl_err!("ANY in LIKE expression");
         }
         let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?;
-        let escape_char = if let Some(char) = escape_char {
-            if char.len() != 1 {
-                return plan_err!("Invalid escape character in LIKE expression");
+        let escape_char = match escape_char {
+            Some(Value::SingleQuotedString(char)) if char.len() == 1 => {
+                Some(char.chars().next().unwrap())
             }
-            Some(char.chars().next().unwrap())
-        } else {
-            None
+            Some(value) => {
+                return plan_err!(
+                    "Invalid escape character in LIKE expression. Expected a single character wrapped with single quotes, got {value}"
+                );
+            }
+            None => None,
         };
         Ok(Expr::Like(Like::new(
             negated,
@@ -843,7 +918,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         negated: bool,
         expr: SQLExpr,
         pattern: SQLExpr,
-        escape_char: Option<String>,
+        escape_char: Option<Value>,
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
@@ -852,13 +927,16 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         if pattern_type != DataType::Utf8 && pattern_type != DataType::Null {
             return plan_err!("Invalid pattern in SIMILAR TO expression");
         }
-        let escape_char = if let Some(char) = escape_char {
-            if char.len() != 1 {
-                return plan_err!("Invalid escape character in SIMILAR TO expression");
+        let escape_char = match escape_char {
+            Some(Value::SingleQuotedString(char)) if char.len() == 1 => {
+                Some(char.chars().next().unwrap())
             }
-            Some(char.chars().next().unwrap())
-        } else {
-            None
+            Some(value) => {
+                return plan_err!(
+                    "Invalid escape character in SIMILAR TO expression. Expected a single character wrapped with single quotes, got {value}"
+                );
+            }
+            None => None,
         };
         Ok(Expr::SimilarTo(Like::new(
             negated,
@@ -953,7 +1031,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     fn sql_cast_to_expr(
         &self,
         expr: SQLExpr,
-        data_type: SQLDataType,
+        data_type: &SQLDataType,
         format: Option<CastFormat>,
         schema: &DFSchema,
         planner_context: &mut PlannerContext,
@@ -962,12 +1040,12 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             return not_impl_err!("CAST with format is not supported: {format}");
         }
 
-        let dt = self.convert_data_type(&data_type)?;
+        let dt = self.convert_data_type_to_field(data_type)?;
         let expr = self.sql_expr_to_logical_expr(expr, schema, planner_context)?;
 
         // numeric constants are treated as seconds (rather as nanoseconds)
         // to align with postgres / duckdb semantics
-        let expr = match &dt {
+        let expr = match dt.data_type() {
             DataType::Timestamp(TimeUnit::Nanosecond, tz)
                 if expr.get_type(schema)? == DataType::Int64 =>
             {
@@ -979,7 +1057,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             _ => expr,
         };
 
-        Ok(Expr::Cast(Cast::new(Box::new(expr), dt)))
+        Ok(Expr::Cast(Cast::new_from_field(Box::new(expr), dt)))
     }
 
     /// Extracts the root expression and access chain from a compound expression.
@@ -1173,8 +1251,8 @@ mod tests {
     use sqlparser::dialect::GenericDialect;
     use sqlparser::parser::Parser;
 
-    use datafusion_common::config::ConfigOptions;
     use datafusion_common::TableReference;
+    use datafusion_common::config::ConfigOptions;
     use datafusion_expr::logical_plan::builder::LogicalTableSource;
     use datafusion_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF};
 
@@ -1255,46 +1333,42 @@ mod tests {
     }
 
     macro_rules! test_stack_overflow {
-        ($num_expr:expr) => {
-            paste::item! {
-                #[test]
-                fn [<test_stack_overflow_ $num_expr>]() {
-                    let schema = DFSchema::empty();
-                    let mut planner_context = PlannerContext::default();
-
-                    let expr_str = (0..$num_expr)
-                        .map(|i| format!("column1 = 'value{:?}'", i))
-                        .collect::<Vec<String>>()
-                        .join(" OR ");
-
-                    let dialect = GenericDialect{};
-                    let mut parser = Parser::new(&dialect)
-                        .try_with_sql(expr_str.as_str())
-                        .unwrap();
-                    let sql_expr = parser.parse_expr().unwrap();
-
-                    let context_provider = TestContextProvider::new();
-                    let sql_to_rel = SqlToRel::new(&context_provider);
-
-                    // Should not stack overflow
-                    sql_to_rel.sql_expr_to_logical_expr(
-                        sql_expr,
-                        &schema,
-                        &mut planner_context,
-                    ).unwrap();
-                }
+        ($name:ident, $num_expr:expr) => {
+            #[test]
+            fn $name() {
+                let schema = DFSchema::empty();
+                let mut planner_context = PlannerContext::default();
+
+                let expr_str = (0..$num_expr)
+                    .map(|i| format!("column1 = 'value{:?}'", i))
+                    .collect::<Vec<String>>()
+                    .join(" OR ");
+
+                let dialect = GenericDialect {};
+                let mut parser = Parser::new(&dialect)
+                    .try_with_sql(expr_str.as_str())
+                    .unwrap();
+                let sql_expr = parser.parse_expr().unwrap();
+
+                let context_provider = TestContextProvider::new();
+                let sql_to_rel = SqlToRel::new(&context_provider);
+
+                // Should not stack overflow
+                sql_to_rel
+                    .sql_expr_to_logical_expr(sql_expr, &schema, &mut planner_context)
+                    .unwrap();
             }
         };
     }
 
-    test_stack_overflow!(64);
-    test_stack_overflow!(128);
-    test_stack_overflow!(256);
-    test_stack_overflow!(512);
-    test_stack_overflow!(1024);
-    test_stack_overflow!(2048);
-    test_stack_overflow!(4096);
-    test_stack_overflow!(8192);
+    test_stack_overflow!(test_stack_overflow_64, 64);
+    test_stack_overflow!(test_stack_overflow_128, 128);
+    test_stack_overflow!(test_stack_overflow_256, 256);
+    test_stack_overflow!(test_stack_overflow_512, 512);
+    test_stack_overflow!(test_stack_overflow_1024, 1024);
+    test_stack_overflow!(test_stack_overflow_2048, 2048);
+    test_stack_overflow!(test_stack_overflow_4096, 4096);
+    test_stack_overflow!(test_stack_overflow_8192, 8192);
     #[test]
     fn test_sql_to_expr_with_alias() {
         let schema = DFSchema::empty();
diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs
index d357c3753e13b..faecfbcfecc05 100644
--- a/datafusion/sql/src/expr/order_by.rs
+++ b/datafusion/sql/src/expr/order_by.rs
@@ -17,7 +17,7 @@
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 use datafusion_common::{
-    not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, Result,
+    Column, DFSchema, Result, not_impl_err, plan_datafusion_err, plan_err,
 };
 use datafusion_expr::expr::Sort;
 use datafusion_expr::{Expr, SortExpr};
@@ -63,14 +63,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
         let mut sort_expr_vec = Vec::with_capacity(order_by_exprs.len());
 
-        let make_sort_expr =
-            |expr: Expr, asc: Option<bool>, nulls_first: Option<bool>| {
-                let asc = asc.unwrap_or(true);
-                // When asc is true, by default nulls last to be consistent with postgres
-                // postgres rule: https://www.postgresql.org/docs/current/queries-order.html
-                let nulls_first = nulls_first.unwrap_or(!asc);
-                Sort::new(expr, asc, nulls_first)
-            };
+        let make_sort_expr = |expr: Expr,
+                              asc: Option<bool>,
+                              nulls_first: Option<bool>| {
+            let asc = asc.unwrap_or(true);
+            let nulls_first = nulls_first
+                .unwrap_or_else(|| self.options.default_null_ordering.nulls_first(asc));
+            Sort::new(expr, asc, nulls_first)
+        };
 
         for order_by_expr in order_by_exprs {
             let OrderByExpr {
diff --git a/datafusion/sql/src/expr/subquery.rs b/datafusion/sql/src/expr/subquery.rs
index 602d39233d587..662c44f6f2620 100644
--- a/datafusion/sql/src/expr/subquery.rs
+++ b/datafusion/sql/src/expr/subquery.rs
@@ -16,11 +16,11 @@
 // under the License.
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::{plan_err, DFSchema, Diagnostic, Result, Span, Spans};
-use datafusion_expr::expr::{Exists, InSubquery};
+use datafusion_common::{DFSchema, Diagnostic, Result, Span, Spans, plan_err};
+use datafusion_expr::expr::{Exists, InSubquery, SetComparison, SetQuantifier};
 use datafusion_expr::{Expr, LogicalPlan, Subquery};
 use sqlparser::ast::Expr as SQLExpr;
-use sqlparser::ast::{Query, SelectItem, SetExpr};
+use sqlparser::ast::{BinaryOperator, Query, SelectItem, SetExpr};
 use std::sync::Arc;
 
 impl<S: ContextProvider> SqlToRel<'_, S> {
@@ -31,11 +31,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         input_schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
-        let old_outer_query_schema =
-            planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
+        planner_context.append_outer_query_schema(input_schema.clone().into());
         let sub_plan = self.query_to_plan(subquery, planner_context)?;
         let outer_ref_columns = sub_plan.all_out_ref_exprs();
-        planner_context.set_outer_query_schema(old_outer_query_schema);
+        planner_context.pop_outer_query_schema();
         Ok(Expr::Exists(Exists {
             subquery: Subquery {
                 subquery: Arc::new(sub_plan),
@@ -54,27 +53,26 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         input_schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
-        let old_outer_query_schema =
-            planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
+        planner_context.append_outer_query_schema(Arc::new(input_schema.clone()));
 
         let mut spans = Spans::new();
-        if let SetExpr::Select(select) = subquery.body.as_ref() {
+        if let SetExpr::Select(select) = &subquery.body.as_ref() {
             for item in &select.projection {
-                if let SelectItem::UnnamedExpr(SQLExpr::Identifier(ident)) = item {
-                    if let Some(span) = Span::try_from_sqlparser_span(ident.span) {
-                        spans.add_span(span);
-                    }
+                if let SelectItem::UnnamedExpr(SQLExpr::Identifier(ident)) = item
+                    && let Some(span) = Span::try_from_sqlparser_span(ident.span)
+                {
+                    spans.add_span(span);
                 }
             }
         }
 
         let sub_plan = self.query_to_plan(subquery, planner_context)?;
         let outer_ref_columns = sub_plan.all_out_ref_exprs();
-        planner_context.set_outer_query_schema(old_outer_query_schema);
+        planner_context.pop_outer_query_schema();
 
         self.validate_single_column(
             &sub_plan,
-            spans.clone(),
+            &spans,
             "Too many columns! The subquery should only return one column",
             "Select only one column in the subquery",
         )?;
@@ -98,25 +96,24 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         input_schema: &DFSchema,
         planner_context: &mut PlannerContext,
     ) -> Result<Expr> {
-        let old_outer_query_schema =
-            planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
+        planner_context.append_outer_query_schema(Arc::new(input_schema.clone()));
         let mut spans = Spans::new();
         if let SetExpr::Select(select) = subquery.body.as_ref() {
             for item in &select.projection {
-                if let SelectItem::ExprWithAlias { alias, .. } = item {
-                    if let Some(span) = Span::try_from_sqlparser_span(alias.span) {
-                        spans.add_span(span);
-                    }
+                if let SelectItem::ExprWithAlias { alias, .. } = item
+                    && let Some(span) = Span::try_from_sqlparser_span(alias.span)
+                {
+                    spans.add_span(span);
                 }
             }
         }
         let sub_plan = self.query_to_plan(subquery, planner_context)?;
         let outer_ref_columns = sub_plan.all_out_ref_exprs();
-        planner_context.set_outer_query_schema(old_outer_query_schema);
+        planner_context.pop_outer_query_schema();
 
         self.validate_single_column(
             &sub_plan,
-            spans.clone(),
+            &spans,
             "Too many columns! The subquery should only return one column",
             "Select only one column in the subquery",
         )?;
@@ -131,7 +128,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     fn validate_single_column(
         &self,
         sub_plan: &LogicalPlan,
-        spans: Spans,
+        spans: &Spans,
         error_message: &str,
         help_message: &str,
     ) -> Result<()> {
@@ -148,7 +145,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
     fn build_multi_column_diagnostic(
         &self,
-        spans: Spans,
+        spans: &Spans,
         error_message: &str,
         help_message: &str,
     ) -> Diagnostic {
@@ -162,4 +159,50 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         diagnostic.add_help(help_message, None);
         diagnostic
     }
+
+    pub(super) fn parse_set_comparison_subquery(
+        &self,
+        left_expr: SQLExpr,
+        subquery: Query,
+        compare_op: &BinaryOperator,
+        quantifier: SetQuantifier,
+        input_schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<Expr> {
+        planner_context.append_outer_query_schema(Arc::new(input_schema.clone()));
+
+        let mut spans = Spans::new();
+        if let SetExpr::Select(select) = subquery.body.as_ref() {
+            for item in &select.projection {
+                if let SelectItem::ExprWithAlias { alias, .. } = item
+                    && let Some(span) = Span::try_from_sqlparser_span(alias.span)
+                {
+                    spans.add_span(span);
+                }
+            }
+        }
+
+        let sub_plan = self.query_to_plan(subquery, planner_context)?;
+        let outer_ref_columns = sub_plan.all_out_ref_exprs();
+        planner_context.pop_outer_query_schema();
+
+        self.validate_single_column(
+            &sub_plan,
+            &spans,
+            "Too many columns! The subquery should only return one column",
+            "Select only one column in the subquery",
+        )?;
+
+        let expr_obj = self.sql_to_expr(left_expr, input_schema, planner_context)?;
+        Ok(Expr::SetComparison(SetComparison::new(
+            Box::new(expr_obj),
+            Subquery {
+                subquery: Arc::new(sub_plan),
+                outer_ref_columns,
+                spans,
+            },
+            self.parse_sql_binary_op(compare_op)?,
+            quantifier,
+        )))
+    }
 }
diff --git a/datafusion/sql/src/expr/substring.rs b/datafusion/sql/src/expr/substring.rs
index 59c78bc713cc4..d3b56097c1f58 100644
--- a/datafusion/sql/src/expr/substring.rs
+++ b/datafusion/sql/src/expr/substring.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::{not_impl_err, plan_err};
 use datafusion_common::{DFSchema, Result, ScalarValue};
-use datafusion_expr::planner::PlannerResult;
-use datafusion_expr::Expr;
+use datafusion_common::{not_impl_err, plan_err};
+use datafusion_expr::{Expr, planner::PlannerResult};
+
 use sqlparser::ast::Expr as SQLExpr;
 
 impl<S: ContextProvider> SqlToRel<'_, S> {
@@ -51,7 +51,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             (None, Some(for_expr)) => {
                 let arg =
                     self.sql_expr_to_logical_expr(*expr, schema, planner_context)?;
-                let from_logic = Expr::Literal(ScalarValue::Int64(Some(1)));
+                let from_logic = Expr::Literal(ScalarValue::Int64(Some(1)), None);
                 let for_logic =
                     self.sql_expr_to_logical_expr(*for_expr, schema, planner_context)?;
                 vec![arg, from_logic, for_logic]
@@ -62,12 +62,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     substring_from: None,
                     substring_for: None,
                     special: false,
+                    shorthand: false,
                 };
 
                 return plan_err!("Substring without for/from is not valid {orig_sql:?}");
             }
         };
 
+        // Try to plan the substring expression using one of the registered planners
         for planner in self.context_provider.get_expr_planners() {
             match planner.plan_substring(substring_args)? {
                 PlannerResult::Planned(expr) => return Ok(expr),
@@ -78,7 +80,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         }
 
         not_impl_err!(
-            "Substring not supported by UserDefinedExtensionPlanners: {substring_args:?}"
+            "Substring could not be planned by registered expr planner. \
+                        Hint: Please try with `unicode_expressions` DataFusion feature enabled"
         )
     }
 }
diff --git a/datafusion/sql/src/expr/unary_op.rs b/datafusion/sql/src/expr/unary_op.rs
index e0c94543f6013..cd118c0fdd5c5 100644
--- a/datafusion/sql/src/expr/unary_op.rs
+++ b/datafusion/sql/src/expr/unary_op.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::{not_impl_err, plan_err, DFSchema, Diagnostic, Result};
+use datafusion_common::{DFSchema, Diagnostic, Result, not_impl_err, plan_err};
 use datafusion_expr::{
-    type_coercion::{is_interval, is_timestamp},
     Expr, ExprSchemable,
+    type_coercion::{is_interval, is_timestamp},
 };
 use sqlparser::ast::{Expr as SQLExpr, UnaryOperator, Value, ValueWithSpan};
 
@@ -38,10 +38,11 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             UnaryOperator::Plus => {
                 let operand =
                     self.sql_expr_to_logical_expr(expr, schema, planner_context)?;
-                let (data_type, _) = operand.data_type_and_nullable(schema)?;
+                let field = operand.to_field(schema)?.1;
+                let data_type = field.data_type();
                 if data_type.is_numeric()
-                    || is_interval(&data_type)
-                    || is_timestamp(&data_type)
+                    || is_interval(data_type)
+                    || is_timestamp(data_type)
                 {
                     Ok(operand)
                 } else {
diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs
index b77f5eaf45da2..bd75ac36306fb 100644
--- a/datafusion/sql/src/expr/value.rs
+++ b/datafusion/sql/src/expr/value.rs
@@ -17,20 +17,20 @@
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 use arrow::compute::kernels::cast_utils::{
-    parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit,
+    IntervalParseConfig, IntervalUnit, parse_interval_month_day_nano_config,
 };
 use arrow::datatypes::{
-    i256, DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
+    DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, FieldRef, i256,
 };
 use bigdecimal::num_bigint::BigInt;
 use bigdecimal::{BigDecimal, Signed, ToPrimitive};
 use datafusion_common::{
-    internal_datafusion_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result,
-    ScalarValue,
+    DFSchema, DataFusionError, Result, ScalarValue, internal_datafusion_err,
+    not_impl_err, plan_err,
 };
 use datafusion_expr::expr::{BinaryExpr, Placeholder};
 use datafusion_expr::planner::PlannerResult;
-use datafusion_expr::{lit, Expr, Operator};
+use datafusion_expr::{Expr, Operator, lit};
 use log::debug;
 use sqlparser::ast::{
     BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value, ValueWithSpan,
@@ -45,12 +45,12 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     pub(crate) fn parse_value(
         &self,
         value: Value,
-        param_data_types: &[DataType],
+        param_data_types: &[FieldRef],
     ) -> Result<Expr> {
         match value {
             Value::Number(n, _) => self.parse_sql_number(&n, false),
             Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => Ok(lit(s)),
-            Value::Null => Ok(Expr::Literal(ScalarValue::Null)),
+            Value::Null => Ok(Expr::Literal(ScalarValue::Null, None)),
             Value::Boolean(n) => Ok(lit(n)),
             Value::Placeholder(param) => {
                 Self::create_placeholder_expr(param, param_data_types)
@@ -86,10 +86,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             return Ok(lit(n));
         }
 
-        if !negative {
-            if let Ok(n) = unsigned_number.parse::<u64>() {
-                return Ok(lit(n));
-            }
+        if !negative && let Ok(n) = unsigned_number.parse::<u64>() {
+            return Ok(lit(n));
         }
 
         if self.options.parse_float_as_decimal {
@@ -104,13 +102,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     }
 
     /// Create a placeholder expression
-    /// This is the same as Postgres's prepare statement syntax in which a placeholder starts with `$` sign and then
-    /// number 1, 2, ... etc. For example, `$1` is the first placeholder; $2 is the second one and so on.
+    /// Both named (`$foo`) and positional (`$1`, `$2`, ...) placeholder styles are supported.
     fn create_placeholder_expr(
         param: String,
-        param_data_types: &[DataType],
+        param_data_types: &[FieldRef],
     ) -> Result<Expr> {
-        // Parse the placeholder as a number because it is the only support from sqlparser and postgres
+        // Try to parse the placeholder as a number. If the placeholder does not have a valid
+        // positional value, assume we have a named placeholder.
         let index = param[1..].parse::<usize>();
         let idx = match index {
             Ok(0) => {
@@ -121,19 +119,31 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             Ok(index) => index - 1,
             Err(_) => {
                 return if param_data_types.is_empty() {
-                    Ok(Expr::Placeholder(Placeholder::new(param, None)))
+                    Ok(Expr::Placeholder(Placeholder::new_with_field(param, None)))
                 } else {
-                    // when PREPARE Statement, param_data_types length is always 0
-                    plan_err!("Invalid placeholder, not a number: {param}")
+                    // FIXME: This branch is shared by params from PREPARE and CREATE FUNCTION, but
+                    // only CREATE FUNCTION currently supports named params. For now, we rewrite
+                    // these to positional params.
+                    let named_param_pos = param_data_types
+                        .iter()
+                        .position(|v| v.name() == &param[1..]);
+                    match named_param_pos {
+                        Some(pos) => Ok(Expr::Placeholder(Placeholder::new_with_field(
+                            format!("${}", pos + 1),
+                            param_data_types.get(pos).cloned(),
+                        ))),
+                        None => plan_err!("Unknown placeholder: {param}"),
+                    }
                 };
             }
         };
         // Check if the placeholder is in the parameter list
+        // FIXME: In the CREATE FUNCTION branch, param_type = None should raise an error
         let param_type = param_data_types.get(idx);
         // Data type of the parameter
         debug!("type of param {param} param_data_types[idx]: {param_type:?}");
 
-        Ok(Expr::Placeholder(Placeholder::new(
+        Ok(Expr::Placeholder(Placeholder::new_with_field(
             param,
             param_type.cloned(),
         )))
@@ -169,86 +179,91 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             }
         }
 
-        not_impl_err!("Could not plan array literal. Hint: Please try with `nested_expressions` DataFusion feature enabled")
+        not_impl_err!(
+            "Could not plan array literal. Hint: Please try with `nested_expressions` DataFusion feature enabled"
+        )
     }
 
     /// Convert a SQL interval expression to a DataFusion logical plan
     /// expression
-    #[allow(clippy::only_used_in_recursion)]
     pub(super) fn sql_interval_to_expr(
         &self,
         negative: bool,
         interval: Interval,
     ) -> Result<Expr> {
-        if interval.leading_precision.is_some() {
-            return not_impl_err!(
-                "Unsupported Interval Expression with leading_precision {:?}",
-                interval.leading_precision
-            );
-        }
-
-        if interval.last_field.is_some() {
-            return not_impl_err!(
-                "Unsupported Interval Expression with last_field {:?}",
-                interval.last_field
-            );
-        }
+        sql_interval_to_expr_impl(negative, interval)
+    }
+}
 
-        if interval.fractional_seconds_precision.is_some() {
-            return not_impl_err!(
-                "Unsupported Interval Expression with fractional_seconds_precision {:?}",
-                interval.fractional_seconds_precision
-            );
-        }
+fn sql_interval_to_expr_impl(negative: bool, interval: Interval) -> Result<Expr> {
+    if interval.leading_precision.is_some() {
+        return not_impl_err!(
+            "Unsupported Interval Expression with leading_precision {:?}",
+            interval.leading_precision
+        );
+    }
 
-        if let SQLExpr::BinaryOp { left, op, right } = *interval.value {
-            let df_op = match op {
-                BinaryOperator::Plus => Operator::Plus,
-                BinaryOperator::Minus => Operator::Minus,
-                _ => {
-                    return not_impl_err!("Unsupported interval operator: {op:?}");
-                }
-            };
-            let left_expr = self.sql_interval_to_expr(
-                negative,
-                Interval {
-                    value: left,
-                    leading_field: interval.leading_field.clone(),
-                    leading_precision: None,
-                    last_field: None,
-                    fractional_seconds_precision: None,
-                },
-            )?;
-            let right_expr = self.sql_interval_to_expr(
-                false,
-                Interval {
-                    value: right,
-                    leading_field: interval.leading_field,
-                    leading_precision: None,
-                    last_field: None,
-                    fractional_seconds_precision: None,
-                },
-            )?;
-            return Ok(Expr::BinaryExpr(BinaryExpr::new(
-                Box::new(left_expr),
-                df_op,
-                Box::new(right_expr),
-            )));
-        }
+    if interval.last_field.is_some() {
+        return not_impl_err!(
+            "Unsupported Interval Expression with last_field {:?}",
+            interval.last_field
+        );
+    }
 
-        let value = interval_literal(*interval.value, negative)?;
+    if interval.fractional_seconds_precision.is_some() {
+        return not_impl_err!(
+            "Unsupported Interval Expression with fractional_seconds_precision {:?}",
+            interval.fractional_seconds_precision
+        );
+    }
 
-        // leading_field really means the unit if specified
-        // For example, "month" in  `INTERVAL '5' month`
-        let value = match interval.leading_field.as_ref() {
-            Some(leading_field) => format!("{value} {leading_field}"),
-            None => value,
+    if let SQLExpr::BinaryOp { left, op, right } = *interval.value {
+        let df_op = match op {
+            BinaryOperator::Plus => Operator::Plus,
+            BinaryOperator::Minus => Operator::Minus,
+            _ => {
+                return not_impl_err!("Unsupported interval operator: {op:?}");
+            }
         };
-
-        let config = IntervalParseConfig::new(IntervalUnit::Second);
-        let val = parse_interval_month_day_nano_config(&value, config)?;
-        Ok(lit(ScalarValue::IntervalMonthDayNano(Some(val))))
+        let left_expr = sql_interval_to_expr_impl(
+            negative,
+            Interval {
+                value: left,
+                leading_field: interval.leading_field.clone(),
+                leading_precision: None,
+                last_field: None,
+                fractional_seconds_precision: None,
+            },
+        )?;
+        let right_expr = sql_interval_to_expr_impl(
+            false,
+            Interval {
+                value: right,
+                leading_field: interval.leading_field,
+                leading_precision: None,
+                last_field: None,
+                fractional_seconds_precision: None,
+            },
+        )?;
+        return Ok(Expr::BinaryExpr(BinaryExpr::new(
+            Box::new(left_expr),
+            df_op,
+            Box::new(right_expr),
+        )));
     }
+
+    let value = interval_literal(*interval.value, negative)?;
+
+    // leading_field really means the unit if specified
+    // For example, "month" in  `INTERVAL '5' month`
+    let value = match interval.leading_field.as_ref() {
+        Some(leading_field) => format!("{value} {leading_field}"),
+        None => value,
+    };
+
+    let config = IntervalParseConfig::new(IntervalUnit::Second);
+    let val = parse_interval_month_day_nano_config(&value, config)?;
+    Ok(lit(ScalarValue::IntervalMonthDayNano(Some(val))))
 }
 
 fn interval_literal(interval_value: SQLExpr, negative: bool) -> Result<String> {
@@ -282,14 +297,12 @@ fn interval_literal(interval_value: SQLExpr, negative: bool) -> Result<String> {
             interval_literal(*expr, negative)?
         }
         _ => {
-            return not_impl_err!("Unsupported interval argument. Expected string literal or number, got: {interval_value:?}");
+            return not_impl_err!(
+                "Unsupported interval argument. Expected string literal or number, got: {interval_value:?}"
+            );
         }
     };
-    if negative {
-        Ok(format!("-{s}"))
-    } else {
-        Ok(s)
-    }
+    if negative { Ok(format!("-{s}")) } else { Ok(s) }
 }
 
 /// Try to decode bytes from hex literal string.
@@ -380,11 +393,10 @@ fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
                 int_val
             )
         })?;
-        Ok(Expr::Literal(ScalarValue::Decimal128(
-            Some(val),
-            precision as u8,
-            scale as i8,
-        )))
+        Ok(Expr::Literal(
+            ScalarValue::Decimal128(Some(val), precision as u8, scale as i8),
+            None,
+        ))
     } else if precision <= DECIMAL256_MAX_PRECISION as u64 {
         let val = bigint_to_i256(&int_val).ok_or_else(|| {
             // Failures are unexpected here as we have already checked the precision
@@ -393,11 +405,10 @@ fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
                 int_val
             )
         })?;
-        Ok(Expr::Literal(ScalarValue::Decimal256(
-            Some(val),
-            precision as u8,
-            scale as i8,
-        )))
+        Ok(Expr::Literal(
+            ScalarValue::Decimal256(Some(val), precision as u8, scale as i8),
+            None,
+        ))
     } else {
         not_impl_err!(
             "Decimal precision {} exceeds the maximum supported precision: {}",
@@ -483,17 +494,18 @@ mod tests {
         ];
         for (input, expect) in cases {
             let output = parse_decimal(input, true).unwrap();
-            assert_eq!(output, Expr::Literal(expect.arithmetic_negate().unwrap()));
+            assert_eq!(
+                output,
+                Expr::Literal(expect.arithmetic_negate().unwrap(), None)
+            );
 
             let output = parse_decimal(input, false).unwrap();
-            assert_eq!(output, Expr::Literal(expect));
+            assert_eq!(output, Expr::Literal(expect, None));
         }
 
         // scale < i8::MIN
         assert_eq!(
-            parse_decimal("1e129", false)
-                .unwrap_err()
-                .strip_backtrace(),
+            parse_decimal("1e129", false).unwrap_err().strip_backtrace(),
             "This feature is not implemented: Decimal scale -129 exceeds the minimum supported scale: -128"
         );
 
diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs
index 7e11f160a3977..7fef670933f9a 100644
--- a/datafusion/sql/src/lib.rs
+++ b/datafusion/sql/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![deny(clippy::clone_on_ref_ptr)]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! This crate provides:
 //!
diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index 9731eebad167d..1ecf90b7947c3 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -20,17 +20,17 @@
 //! This parser implements DataFusion specific statements such as
 //! `CREATE EXTERNAL TABLE`
 
-use datafusion_common::config::SqlParserOptions;
 use datafusion_common::DataFusionError;
-use datafusion_common::{sql_err, Diagnostic, Span};
-use sqlparser::ast::{ExprWithAlias, OrderByOptions};
+use datafusion_common::config::SqlParserOptions;
+use datafusion_common::{Diagnostic, Span, sql_err};
+use sqlparser::ast::{ExprWithAlias, Ident, OrderByOptions};
 use sqlparser::tokenizer::TokenWithSpan;
 use sqlparser::{
     ast::{
         ColumnDef, ColumnOptionDef, ObjectName, OrderByExpr, Query,
         Statement as SQLStatement, TableConstraint, Value,
     },
-    dialect::{keywords::Keyword, Dialect, GenericDialect},
+    dialect::{Dialect, GenericDialect, keywords::Keyword},
     parser::{Parser, ParserError},
     tokenizer::{Token, Tokenizer, Word},
 };
@@ -58,7 +58,7 @@ fn parse_file_type(s: &str) -> Result<String, DataFusionError> {
 /// Syntax:
 /// ```sql
 /// EXPLAIN <ANALYZE> <VERBOSE> [FORMAT format] statement
-///```
+/// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ExplainStatement {
     /// `EXPLAIN ANALYZE ..`
@@ -188,7 +188,9 @@ pub(crate) type LexOrdering = Vec<OrderByExpr>;
 /// Syntax:
 ///
 /// ```text
-/// CREATE EXTERNAL TABLE
+/// CREATE
+/// [ OR REPLACE ]
+/// EXTERNAL TABLE
 /// [ IF NOT EXISTS ]
 /// <TABLE_NAME>[ (<column_definition>) ]
 /// STORED AS <file_type>
@@ -221,6 +223,8 @@ pub struct CreateExternalTable {
     pub order_exprs: Vec<LexOrdering>,
     /// Option to not error if table already exists
     pub if_not_exists: bool,
+    /// Option to replace table content if table already exists
+    pub or_replace: bool,
     /// Whether the table is a temporary table
     pub temporary: bool,
     /// Infinite streams?
@@ -239,7 +243,34 @@ impl fmt::Display for CreateExternalTable {
         }
         write!(f, "{} ", self.name)?;
         write!(f, "STORED AS {} ", self.file_type)?;
-        write!(f, "LOCATION {} ", self.location)
+        if !self.order_exprs.is_empty() {
+            write!(f, "WITH ORDER (")?;
+            let mut first = true;
+            for expr in self.order_exprs.iter().flatten() {
+                if !first {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{expr}")?;
+                first = false;
+            }
+            write!(f, ") ")?;
+        }
+        write!(f, "LOCATION {}", self.location)
+    }
+}
+
+/// DataFusion extension for `RESET`
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ResetStatement {
+    /// Reset a single configuration variable (stored as provided)
+    Variable(ObjectName),
+}
+
+impl fmt::Display for ResetStatement {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            ResetStatement::Variable(name) => write!(f, "RESET {name}"),
+        }
     }
 }
 
@@ -260,6 +291,8 @@ pub enum Statement {
     CopyTo(CopyToStatement),
     /// EXPLAIN for extensions
     Explain(ExplainStatement),
+    /// Extension: `RESET`
+    Reset(ResetStatement),
 }
 
 impl fmt::Display for Statement {
@@ -269,6 +302,7 @@ impl fmt::Display for Statement {
             Statement::CreateExternalTable(stmt) => write!(f, "{stmt}"),
             Statement::CopyTo(stmt) => write!(f, "{stmt}"),
             Statement::Explain(stmt) => write!(f, "{stmt}"),
+            Statement::Reset(stmt) => write!(f, "{stmt}"),
         }
     }
 }
@@ -304,8 +338,7 @@ const DEFAULT_DIALECT: GenericDialect = GenericDialect {};
 /// # use datafusion_sql::parser::DFParserBuilder;
 /// # use datafusion_common::Result;
 /// # fn test() -> Result<()> {
-/// let mut parser = DFParserBuilder::new("SELECT * FROM foo; SELECT 1 + 2")
-///   .build()?;
+/// let mut parser = DFParserBuilder::new("SELECT * FROM foo; SELECT 1 + 2").build()?;
 /// // parse the SQL into DFStatements
 /// let statements = parser.parse_statements()?;
 /// assert_eq!(statements.len(), 2);
@@ -320,38 +353,59 @@ const DEFAULT_DIALECT: GenericDialect = GenericDialect {};
 /// # use datafusion_sql::sqlparser::dialect::MySqlDialect;
 /// # use datafusion_sql::sqlparser::ast::Expr;
 /// # fn test() -> Result<()> {
-/// let dialect = MySqlDialect{}; // Parse using MySQL dialect
+/// let dialect = MySqlDialect {}; // Parse using MySQL dialect
 /// let mut parser = DFParserBuilder::new("1 + 2")
-///   .with_dialect(&dialect)
-///   .build()?;
+///     .with_dialect(&dialect)
+///     .build()?;
 /// // parse 1+2 into an sqlparser::ast::Expr
 /// let res = parser.parse_expr()?;
-/// assert!(matches!(res.expr, Expr::BinaryOp {..}));
+/// assert!(matches!(res.expr, Expr::BinaryOp { .. }));
 /// # Ok(())
 /// # }
 /// ```
-pub struct DFParserBuilder<'a> {
-    /// The SQL string to parse
-    sql: &'a str,
+pub struct DFParserBuilder<'a, 'b> {
+    /// Parser input: either raw SQL or tokens
+    input: ParserInput<'a>,
     /// The Dialect to use (defaults to [`GenericDialect`]
-    dialect: &'a dyn Dialect,
+    dialect: &'b dyn Dialect,
     /// The recursion limit while parsing
     recursion_limit: usize,
 }
 
-impl<'a> DFParserBuilder<'a> {
+/// Describes a possible input for parser
+pub enum ParserInput<'a> {
+    /// Raw SQL. Tokenization will be performed automatically as a
+    /// part of [`DFParserBuilder::build`]
+    Sql(&'a str),
+    /// Tokens
+    Tokens(Vec<TokenWithSpan>),
+}
+
+impl<'a> From<&'a str> for ParserInput<'a> {
+    fn from(sql: &'a str) -> Self {
+        Self::Sql(sql)
+    }
+}
+
+impl From<Vec<TokenWithSpan>> for ParserInput<'static> {
+    fn from(tokens: Vec<TokenWithSpan>) -> Self {
+        Self::Tokens(tokens)
+    }
+}
+
+impl<'a, 'b> DFParserBuilder<'a, 'b> {
     /// Create a new parser builder for the specified tokens using the
     /// [`GenericDialect`].
-    pub fn new(sql: &'a str) -> Self {
+    pub fn new(input: impl Into<ParserInput<'a>>) -> Self {
         Self {
-            sql,
+            input: input.into(),
             dialect: &DEFAULT_DIALECT,
             recursion_limit: DEFAULT_RECURSION_LIMIT,
         }
     }
 
     /// Adjust the parser builder's dialect. Defaults to [`GenericDialect`]
-    pub fn with_dialect(mut self, dialect: &'a dyn Dialect) -> Self {
+    pub fn with_dialect(mut self, dialect: &'b dyn Dialect) -> Self {
         self.dialect = dialect;
         self
     }
@@ -362,12 +416,18 @@ impl<'a> DFParserBuilder<'a> {
         self
     }
 
-    pub fn build(self) -> Result<DFParser<'a>, DataFusionError> {
-        let mut tokenizer = Tokenizer::new(self.dialect, self.sql);
-        // Convert TokenizerError -> ParserError
-        let tokens = tokenizer
-            .tokenize_with_location()
-            .map_err(ParserError::from)?;
+    /// Build resulting parser
+    pub fn build(self) -> Result<DFParser<'b>, DataFusionError> {
+        let tokens = match self.input {
+            ParserInput::Tokens(tokens) => tokens,
+            ParserInput::Sql(sql) => {
+                let mut tokenizer = Tokenizer::new(self.dialect, sql);
+                // Convert TokenizerError -> ParserError
+                tokenizer
+                    .tokenize_with_location()
+                    .map_err(ParserError::from)?
+            }
+        };
 
         Ok(DFParser {
             parser: Parser::new(self.dialect)
@@ -413,13 +473,18 @@ impl<'a> DFParser<'a> {
         parser.parse_statements()
     }
 
+    pub fn parse_sql_into_expr(sql: &str) -> Result<ExprWithAlias, DataFusionError> {
+        DFParserBuilder::new(sql).build()?.parse_into_expr()
+    }
+
     pub fn parse_sql_into_expr_with_dialect(
         sql: &str,
         dialect: &dyn Dialect,
     ) -> Result<ExprWithAlias, DataFusionError> {
-        let mut parser = DFParserBuilder::new(sql).with_dialect(dialect).build()?;
-
-        parser.parse_expr()
+        DFParserBuilder::new(sql)
+            .with_dialect(dialect)
+            .build()?
+            .parse_into_expr()
     }
 
     /// Parse a sql string into one or [`Statement`]s
@@ -436,7 +501,7 @@ impl<'a> DFParser<'a> {
                 break;
             }
             if expecting_statement_delimiter {
-                return self.expected("end of statement", self.parser.peek_token());
+                return self.expected("end of statement", &self.parser.peek_token());
             }
 
             let statement = self.parse_statement()?;
@@ -450,7 +515,7 @@ impl<'a> DFParser<'a> {
     fn expected<T>(
         &self,
         expected: &str,
-        found: TokenWithSpan,
+        found: &TokenWithSpan,
     ) -> Result<T, DataFusionError> {
         let sql_parser_span = found.span;
         let span = Span::try_from_sqlparser_span(sql_parser_span);
@@ -465,6 +530,19 @@ impl<'a> DFParser<'a> {
         )
     }
 
+    fn expect_token(
+        &mut self,
+        expected: &str,
+        token: &Token,
+    ) -> Result<(), DataFusionError> {
+        let next_token = self.parser.peek_token_ref();
+        if next_token.token != *token {
+            self.expected(expected, next_token)
+        } else {
+            Ok(())
+        }
+    }
+
     /// Parse a new expression
     pub fn parse_statement(&mut self) -> Result<Statement, DataFusionError> {
         match self.parser.peek_token().token {
@@ -488,6 +566,10 @@ impl<'a> DFParser<'a> {
                         self.parser.next_token(); // EXPLAIN
                         self.parse_explain()
                     }
+                    Keyword::RESET => {
+                        self.parser.next_token(); // RESET
+                        self.parse_reset()
+                    }
                     _ => {
                         // use sqlparser-rs parser
                         self.parse_and_handle_statement()
@@ -514,6 +596,16 @@ impl<'a> DFParser<'a> {
         Ok(self.parser.parse_expr_with_alias()?)
     }
 
+    /// Parses the entire SQL string into an expression.
+    ///
+    /// In contrast to [`DFParser::parse_expr`], this function will report an error if the input
+    /// contains any trailing, unparsed tokens.
+    pub fn parse_into_expr(&mut self) -> Result<ExprWithAlias, DataFusionError> {
+        let expr = self.parse_expr()?;
+        self.expect_token("end of expression", &Token::EOF)?;
+        Ok(expr)
+    }
+
     /// Helper method to parse a statement and handle errors consistently, especially for recursion limits
     fn parse_and_handle_statement(&mut self) -> Result<Statement, DataFusionError> {
         self.parser
@@ -521,13 +613,13 @@ impl<'a> DFParser<'a> {
             .map(|stmt| Statement::Statement(Box::from(stmt)))
             .map_err(|e| match e {
                 ParserError::RecursionLimitExceeded => DataFusionError::SQL(
-                    ParserError::RecursionLimitExceeded,
+                    Box::new(ParserError::RecursionLimitExceeded),
                     Some(format!(
                         " (current limit: {})",
                         self.options.recursion_limit
                     )),
                 ),
-                other => DataFusionError::SQL(other, None),
+                other => DataFusionError::SQL(Box::new(other), None),
             })
     }
 
@@ -575,7 +667,9 @@ impl<'a> DFParser<'a> {
                     Keyword::WITH => {
                         self.parser.expect_keyword(Keyword::HEADER)?;
                         self.parser.expect_keyword(Keyword::ROW)?;
-                        return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')")?;
+                        return parser_err!(
+                            "WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"
+                        )?;
                     }
                     Keyword::PARTITIONED => {
                         self.parser.expect_keyword(Keyword::BY)?;
@@ -591,11 +685,11 @@ impl<'a> DFParser<'a> {
                     }
                 }
             } else {
-                let token = self.parser.next_token();
+                let token = self.parser.peek_token();
                 if token == Token::EOF || token == Token::SemiColon {
                     break;
                 } else {
-                    return self.expected("end of statement or ;", token)?;
+                    return self.expected("end of statement or ;", &token)?;
                 }
             }
         }
@@ -632,7 +726,7 @@ impl<'a> DFParser<'a> {
                         // Unquoted namespaced keys have to conform to the syntax
                         // "<WORD>[\.<WORD>]*". If we have a key that breaks this
                         // pattern, error out:
-                        return self.expected("key name", next_token);
+                        return self.expected("key name", &next_token);
                     }
                 }
                 Ok(parts.join("."))
@@ -640,7 +734,7 @@ impl<'a> DFParser<'a> {
             Token::SingleQuotedString(s) => Ok(s),
             Token::DoubleQuotedString(s) => Ok(s),
             Token::EscapedStringLiteral(s) => Ok(s),
-            _ => self.expected("key name", next_token),
+            _ => self.expected("key name", &next_token),
         }
     }
 
@@ -659,7 +753,7 @@ impl<'a> DFParser<'a> {
             Token::DoubleQuotedString(s) => Ok(Value::DoubleQuotedString(s)),
             Token::EscapedStringLiteral(s) => Ok(Value::EscapedStringLiteral(s)),
             Token::Number(n, l) => Ok(Value::Number(n, l)),
-            _ => self.expected("string or numeric value", next_token),
+            _ => self.expected("string or numeric value", &next_token),
         }
     }
 
@@ -679,6 +773,47 @@ impl<'a> DFParser<'a> {
         }))
     }
 
+    /// Parse a SQL `RESET`
+    pub fn parse_reset(&mut self) -> Result<Statement, DataFusionError> {
+        let mut parts: Vec<String> = Vec::new();
+        let mut expecting_segment = true;
+
+        loop {
+            let next_token = self.parser.peek_token();
+            match &next_token.token {
+                Token::Word(word) => {
+                    self.parser.next_token();
+                    parts.push(word.value.clone());
+                    expecting_segment = false;
+                }
+                Token::SingleQuotedString(s)
+                | Token::DoubleQuotedString(s)
+                | Token::EscapedStringLiteral(s) => {
+                    self.parser.next_token();
+                    parts.push(s.clone());
+                    expecting_segment = false;
+                }
+                Token::Period => {
+                    self.parser.next_token();
+                    if expecting_segment || parts.is_empty() {
+                        return self.expected("configuration parameter", &next_token);
+                    }
+                    expecting_segment = true;
+                }
+                Token::EOF | Token::SemiColon => break,
+                _ => return self.expected("configuration parameter", &next_token),
+            }
+        }
+
+        if parts.is_empty() || expecting_segment {
+            return self.expected("configuration parameter", &self.parser.peek_token());
+        }
+
+        let idents: Vec<Ident> = parts.into_iter().map(Ident::new).collect();
+        let variable = ObjectName::from(idents);
+        Ok(Statement::Reset(ResetStatement::Variable(variable)))
+    }
+
     pub fn parse_explain_format(&mut self) -> Result<Option<String>, DataFusionError> {
         if !self.parser.parse_keyword(Keyword::FORMAT) {
             return Ok(None);
@@ -689,18 +824,33 @@ impl<'a> DFParser<'a> {
             Token::Word(w) => Ok(w.value),
             Token::SingleQuotedString(w) => Ok(w),
             Token::DoubleQuotedString(w) => Ok(w),
-            _ => self.expected("an explain format such as TREE", next_token),
+            _ => self.expected("an explain format such as TREE", &next_token),
         }?;
         Ok(Some(format))
     }
 
     /// Parse a SQL `CREATE` statement handling `CREATE EXTERNAL TABLE`
     pub fn parse_create(&mut self) -> Result<Statement, DataFusionError> {
-        if self.parser.parse_keyword(Keyword::EXTERNAL) {
-            self.parse_create_external_table(false)
-        } else if self.parser.parse_keyword(Keyword::UNBOUNDED) {
-            self.parser.expect_keyword(Keyword::EXTERNAL)?;
-            self.parse_create_external_table(true)
+        // TODO: Change sql parser to take in `or_replace: bool` inside parse_create()
+        if self
+            .parser
+            .parse_keywords(&[Keyword::OR, Keyword::REPLACE, Keyword::EXTERNAL])
+        {
+            self.parse_create_external_table(false, true)
+        } else if self.parser.parse_keywords(&[
+            Keyword::OR,
+            Keyword::REPLACE,
+            Keyword::UNBOUNDED,
+            Keyword::EXTERNAL,
+        ]) {
+            self.parse_create_external_table(true, true)
+        } else if self.parser.parse_keyword(Keyword::EXTERNAL) {
+            self.parse_create_external_table(false, false)
+        } else if self
+            .parser
+            .parse_keywords(&[Keyword::UNBOUNDED, Keyword::EXTERNAL])
+        {
+            self.parse_create_external_table(true, false)
         } else {
             Ok(Statement::Statement(Box::from(self.parser.parse_create()?)))
         }
@@ -719,7 +869,7 @@ impl<'a> DFParser<'a> {
                 let identifier = self.parser.parse_identifier()?;
                 partitions.push(identifier.to_string());
             } else {
-                return self.expected("partition name", self.parser.peek_token());
+                return self.expected("partition name", &self.parser.peek_token());
             }
             let comma = self.parser.consume_token(&Token::Comma);
             if self.parser.consume_token(&Token::RParen) {
@@ -728,7 +878,7 @@ impl<'a> DFParser<'a> {
             } else if !comma {
                 return self.expected(
                     "',' or ')' after partition definition",
-                    self.parser.peek_token(),
+                    &self.parser.peek_token(),
                 );
             }
         }
@@ -799,7 +949,7 @@ impl<'a> DFParser<'a> {
             } else {
                 return self.expected(
                     "column name or constraint definition",
-                    self.parser.peek_token(),
+                    &self.parser.peek_token(),
                 );
             }
             let comma = self.parser.consume_token(&Token::Comma);
@@ -809,7 +959,7 @@ impl<'a> DFParser<'a> {
             } else if !comma {
                 return self.expected(
                     "',' or ')' after column definition",
-                    self.parser.peek_token(),
+                    &self.parser.peek_token(),
                 );
             }
         }
@@ -829,7 +979,7 @@ impl<'a> DFParser<'a> {
                 } else {
                     return self.expected(
                         "constraint details after CONSTRAINT <name>",
-                        self.parser.peek_token(),
+                        &self.parser.peek_token(),
                     );
                 }
             } else if let Some(option) = self.parser.parse_optional_column_option()? {
@@ -848,15 +998,22 @@ impl<'a> DFParser<'a> {
     fn parse_create_external_table(
         &mut self,
         unbounded: bool,
+        or_replace: bool,
     ) -> Result<Statement, DataFusionError> {
         let temporary = self
             .parser
             .parse_one_of_keywords(&[Keyword::TEMP, Keyword::TEMPORARY])
             .is_some();
+
         self.parser.expect_keyword(Keyword::TABLE)?;
         let if_not_exists =
             self.parser
                 .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
+
+        if if_not_exists && or_replace {
+            return parser_err!("'IF NOT EXISTS' cannot coexist with 'REPLACE'");
+        }
+
         let table_name = self.parser.parse_object_name(true)?;
         let (mut columns, constraints) = self.parse_columns()?;
 
@@ -896,15 +1053,21 @@ impl<'a> DFParser<'a> {
                         } else {
                             self.parser.expect_keyword(Keyword::HEADER)?;
                             self.parser.expect_keyword(Keyword::ROW)?;
-                            return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS (format.has_header true)")?;
+                            return parser_err!(
+                                "WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS (format.has_header true)"
+                            )?;
                         }
                     }
                     Keyword::DELIMITER => {
-                        return parser_err!("DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS (format.delimiter ',')")?;
+                        return parser_err!(
+                            "DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS (format.delimiter ',')"
+                        )?;
                     }
                     Keyword::COMPRESSION => {
                         self.parser.expect_keyword(Keyword::TYPE)?;
-                        return parser_err!("COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS (format.compression gzip)")?;
+                        return parser_err!(
+                            "COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS (format.compression gzip)"
+                        )?;
                     }
                     Keyword::PARTITIONED => {
                         self.parser.expect_keyword(Keyword::BY)?;
@@ -943,11 +1106,11 @@ impl<'a> DFParser<'a> {
                     }
                 }
             } else {
-                let token = self.parser.next_token();
+                let token = self.parser.peek_token();
                 if token == Token::EOF || token == Token::SemiColon {
                     break;
                 } else {
-                    return self.expected("end of statement or ;", token)?;
+                    return self.expected("end of statement or ;", &token)?;
                 }
             }
         }
@@ -972,6 +1135,7 @@ impl<'a> DFParser<'a> {
             table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
             order_exprs: builder.order_exprs,
             if_not_exists,
+            or_replace,
             temporary,
             unbounded,
             options: builder.options.unwrap_or(Vec::new()),
@@ -985,7 +1149,7 @@ impl<'a> DFParser<'a> {
         let token = self.parser.next_token();
         match &token.token {
             Token::Word(w) => parse_file_type(&w.value),
-            _ => self.expected("one of ARROW, PARQUET, NDJSON, or CSV", token),
+            _ => self.expected("one of ARROW, PARQUET, NDJSON, or CSV", &token),
         }
     }
 
@@ -1008,7 +1172,7 @@ impl<'a> DFParser<'a> {
             } else if !comma {
                 return self.expected(
                     "',' or ')' after option definition",
-                    self.parser.peek_token(),
+                    &self.parser.peek_token(),
                 );
             }
         }
@@ -1021,9 +1185,11 @@ mod tests {
     use super::*;
     use datafusion_common::assert_contains;
     use sqlparser::ast::Expr::Identifier;
-    use sqlparser::ast::{BinaryOperator, DataType, Expr, Ident};
+    use sqlparser::ast::{
+        BinaryOperator, DataType, ExactNumberInfo, Expr, Ident, ValueWithSpan,
+    };
     use sqlparser::dialect::SnowflakeDialect;
-    use sqlparser::tokenizer::Span;
+    use sqlparser::tokenizer::{Location, Span, Whitespace};
 
     fn expect_parse_ok(sql: &str, expected: Statement) -> Result<(), DataFusionError> {
         let statements = DFParser::parse_sql(sql)?;
@@ -1080,6 +1246,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1097,6 +1264,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1115,6 +1283,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1133,6 +1302,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![(
@@ -1154,6 +1324,7 @@ mod tests {
             table_partition_cols: vec!["p1".to_string(), "p2".to_string()],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1182,6 +1353,7 @@ mod tests {
                 table_partition_cols: vec![],
                 order_exprs: vec![],
                 if_not_exists: false,
+                or_replace: false,
                 temporary: false,
                 unbounded: false,
                 options: vec![(
@@ -1203,6 +1375,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1220,6 +1393,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1237,6 +1411,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1245,8 +1420,7 @@ mod tests {
         expect_parse_ok(sql, expected)?;
 
         // positive case: it is ok for avro files not to have columns specified
-        let sql =
-            "CREATE EXTERNAL TABLE IF NOT EXISTS t STORED AS PARQUET LOCATION 'foo.parquet'";
+        let sql = "CREATE EXTERNAL TABLE IF NOT EXISTS t STORED AS PARQUET LOCATION 'foo.parquet'";
         let expected = Statement::CreateExternalTable(CreateExternalTable {
             name: name.clone(),
             columns: vec![],
@@ -1255,6 +1429,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: true,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1262,9 +1437,27 @@ mod tests {
         });
         expect_parse_ok(sql, expected)?;
 
-        // positive case: column definition allowed in 'partition by' clause
+        // positive case: or replace
         let sql =
-            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 int) LOCATION 'foo.csv'";
+            "CREATE OR REPLACE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet'";
+        let expected = Statement::CreateExternalTable(CreateExternalTable {
+            name: name.clone(),
+            columns: vec![],
+            file_type: "PARQUET".to_string(),
+            location: "foo.parquet".into(),
+            table_partition_cols: vec![],
+            order_exprs: vec![],
+            if_not_exists: false,
+            or_replace: true,
+            temporary: false,
+            unbounded: false,
+            options: vec![],
+            constraints: vec![],
+        });
+        expect_parse_ok(sql, expected)?;
+
+        // positive case: column definition allowed in 'partition by' clause
+        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 int) LOCATION 'foo.csv'";
         let expected = Statement::CreateExternalTable(CreateExternalTable {
             name: name.clone(),
             columns: vec![
@@ -1276,6 +1469,7 @@ mod tests {
             table_partition_cols: vec!["p1".to_string()],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1284,17 +1478,18 @@ mod tests {
         expect_parse_ok(sql, expected)?;
 
         // negative case: mixed column defs and column names in `PARTITIONED BY` clause
-        let sql =
-            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 int, c1) LOCATION 'foo.csv'";
+        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 int, c1) LOCATION 'foo.csv'";
         expect_parse_error(
             sql,
             "SQL error: ParserError(\"Expected: a data type name, found: ) at Line: 1, Column: 73\")",
         );
 
         // negative case: mixed column defs and column names in `PARTITIONED BY` clause
-        let sql =
-            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (c1, p1 int) LOCATION 'foo.csv'";
-        expect_parse_error(sql, "SQL error: ParserError(\"Expected: ',' or ')' after partition definition, found: int at Line: 1, Column: 70\")");
+        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (c1, p1 int) LOCATION 'foo.csv'";
+        expect_parse_error(
+            sql,
+            "SQL error: ParserError(\"Expected: ',' or ')' after partition definition, found: int at Line: 1, Column: 70\")",
+        );
 
         // positive case: additional options (one entry) can be specified
         let sql =
@@ -1307,6 +1502,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![("k1".into(), Value::SingleQuotedString("v1".into()))],
@@ -1315,8 +1511,7 @@ mod tests {
         expect_parse_ok(sql, expected)?;
 
         // positive case: additional options (multiple entries) can be specified
-        let sql =
-            "CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2) LOCATION 'blahblah'";
+        let sql = "CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2) LOCATION 'blahblah'";
         let expected = Statement::CreateExternalTable(CreateExternalTable {
             name: name.clone(),
             columns: vec![],
@@ -1325,6 +1520,7 @@ mod tests {
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![
@@ -1336,15 +1532,17 @@ mod tests {
         expect_parse_ok(sql, expected)?;
 
         // Ordered Col
-        let sqls = ["CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 NULLS FIRST) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 NULLS LAST) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC NULLS FIRST) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC NULLS LAST) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC NULLS FIRST) LOCATION 'foo.csv'",
-                        "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC NULLS LAST) LOCATION 'foo.csv'"];
+        let sqls = [
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 NULLS FIRST) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 NULLS LAST) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC NULLS FIRST) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 DESC NULLS LAST) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC NULLS FIRST) LOCATION 'foo.csv'",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 ASC NULLS LAST) LOCATION 'foo.csv'",
+        ];
         let expected = vec![
             (None, None),
             (None, Some(true)),
@@ -1373,6 +1571,7 @@ mod tests {
                     with_fill: None,
                 }]],
                 if_not_exists: false,
+                or_replace: false,
                 temporary: false,
                 unbounded: false,
                 options: vec![],
@@ -1420,6 +1619,7 @@ mod tests {
                 },
             ]],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1460,6 +1660,7 @@ mod tests {
                 with_fill: None,
             }]],
             if_not_exists: false,
+            or_replace: false,
             temporary: false,
             unbounded: false,
             options: vec![],
@@ -1467,7 +1668,7 @@ mod tests {
         });
         expect_parse_ok(sql, expected)?;
 
-        // Most complete CREATE EXTERNAL TABLE statement possible
+        // Most complete CREATE EXTERNAL TABLE statement possible (using IF NOT EXISTS)
         let sql = "
             CREATE UNBOUNDED EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float)
             STORED AS PARQUET
@@ -1483,7 +1684,7 @@ mod tests {
             name: name.clone(),
             columns: vec![
                 make_column_def("c1", DataType::Int(None)),
-                make_column_def("c2", DataType::Float(None)),
+                make_column_def("c2", DataType::Float(ExactNumberInfo::None)),
             ],
             file_type: "PARQUET".to_string(),
             location: "foo.parquet".into(),
@@ -1509,6 +1710,75 @@ mod tests {
                 with_fill: None,
             }]],
             if_not_exists: true,
+            or_replace: false,
+            temporary: false,
+            unbounded: true,
+            options: vec![
+                (
+                    "format.compression".into(),
+                    Value::SingleQuotedString("zstd".into()),
+                ),
+                (
+                    "format.delimiter".into(),
+                    Value::SingleQuotedString("*".into()),
+                ),
+                (
+                    "ROW_GROUP_SIZE".into(),
+                    Value::SingleQuotedString("1024".into()),
+                ),
+                ("TRUNCATE".into(), Value::SingleQuotedString("NO".into())),
+                (
+                    "format.has_header".into(),
+                    Value::SingleQuotedString("true".into()),
+                ),
+            ],
+            constraints: vec![],
+        });
+        expect_parse_ok(sql, expected)?;
+
+        // Most complete CREATE EXTERNAL TABLE statement possible (using OR REPLACE)
+        let sql = "
+            CREATE OR REPLACE UNBOUNDED EXTERNAL TABLE t (c1 int, c2 float)
+            STORED AS PARQUET
+            WITH ORDER (c1 - c2 ASC)
+            PARTITIONED BY (c1)
+            LOCATION 'foo.parquet'
+            OPTIONS ('format.compression' 'zstd',
+                     'format.delimiter' '*',
+                     'ROW_GROUP_SIZE' '1024',
+                     'TRUNCATE' 'NO',
+                     'format.has_header' 'true')";
+        let expected = Statement::CreateExternalTable(CreateExternalTable {
+            name: name.clone(),
+            columns: vec![
+                make_column_def("c1", DataType::Int(None)),
+                make_column_def("c2", DataType::Float(ExactNumberInfo::None)),
+            ],
+            file_type: "PARQUET".to_string(),
+            location: "foo.parquet".into(),
+            table_partition_cols: vec!["c1".into()],
+            order_exprs: vec![vec![OrderByExpr {
+                expr: Expr::BinaryOp {
+                    left: Box::new(Identifier(Ident {
+                        value: "c1".to_owned(),
+                        quote_style: None,
+                        span: Span::empty(),
+                    })),
+                    op: BinaryOperator::Minus,
+                    right: Box::new(Identifier(Ident {
+                        value: "c2".to_owned(),
+                        quote_style: None,
+                        span: Span::empty(),
+                    })),
+                },
+                options: OrderByOptions {
+                    asc: Some(true),
+                    nulls_first: None,
+                },
+                with_fill: None,
+            }]],
+            if_not_exists: false,
+            or_replace: true,
             temporary: false,
             unbounded: true,
             options: vec![
@@ -1683,8 +1953,7 @@ mod tests {
     #[test]
     fn copy_to_multi_options() -> Result<(), DataFusionError> {
         // order of options is preserved
-        let sql =
-            "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' 55, 'format.compression' snappy, 'execution.keep_partition_by_columns' true)";
+        let sql = "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' 55, 'format.compression' snappy, 'execution.keep_partition_by_columns' true)";
 
         let expected_options = vec![
             (
@@ -1783,4 +2052,155 @@ mod tests {
             "SQL error: RecursionLimitExceeded (current limit: 1)"
         );
     }
+
+    #[test]
+    fn test_multistatement() {
+        let sql = "COPY foo TO bar STORED AS CSV; \
+             CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv'; \
+             RESET var;";
+        let statements = DFParser::parse_sql(sql).unwrap();
+        assert_eq!(
+            statements,
+            vec![
+                Statement::CopyTo(CopyToStatement {
+                    source: object_name("foo"),
+                    target: "bar".to_string(),
+                    partitioned_by: vec![],
+                    stored_as: Some("CSV".to_owned()),
+                    options: vec![],
+                }),
+                {
+                    let name = ObjectName::from(vec![Ident::from("t")]);
+                    let display = None;
+                    Statement::CreateExternalTable(CreateExternalTable {
+                        name: name.clone(),
+                        columns: vec![make_column_def("c1", DataType::Int(display))],
+                        file_type: "CSV".to_string(),
+                        location: "foo.csv".into(),
+                        table_partition_cols: vec![],
+                        order_exprs: vec![],
+                        if_not_exists: false,
+                        or_replace: false,
+                        temporary: false,
+                        unbounded: false,
+                        options: vec![],
+                        constraints: vec![],
+                    })
+                },
+                {
+                    let name = ObjectName::from(vec![Ident::from("var")]);
+                    Statement::Reset(ResetStatement::Variable(name))
+                }
+            ]
+        );
+    }
+
+    #[test]
+    fn test_custom_tokens() {
+        // Span mock.
+        let span = Span {
+            start: Location { line: 0, column: 0 },
+            end: Location { line: 0, column: 0 },
+        };
+        let tokens = vec![
+            TokenWithSpan {
+                token: Token::make_keyword("SELECT"),
+                span,
+            },
+            TokenWithSpan {
+                token: Token::Whitespace(Whitespace::Space),
+                span,
+            },
+            TokenWithSpan {
+                token: Token::Placeholder("1".to_string()),
+                span,
+            },
+        ];
+
+        let statements = DFParserBuilder::new(tokens)
+            .build()
+            .unwrap()
+            .parse_statements()
+            .unwrap();
+        assert_eq!(statements.len(), 1);
+    }
+
+    fn expect_parse_expr_ok(sql: &str, expected: ExprWithAlias) {
+        let expr = DFParser::parse_sql_into_expr(sql).unwrap();
+        assert_eq!(expr, expected, "actual:\n{expr:#?}");
+    }
+
+    /// Parses sql and asserts that the expected error message was found
+    fn expect_parse_expr_error(sql: &str, expected_error: &str) {
+        match DFParser::parse_sql_into_expr(sql) {
+            Ok(expr) => {
+                panic!("Expected parse error for '{sql}', but was successful: {expr:#?}");
+            }
+            Err(e) => {
+                let error_message = e.to_string();
+                assert!(
+                    error_message.contains(expected_error),
+                    "Expected error '{expected_error}' not found in actual error '{error_message}'"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn literal() {
+        expect_parse_expr_ok(
+            "1234",
+            ExprWithAlias {
+                expr: Expr::Value(ValueWithSpan::from(Value::Number(
+                    "1234".to_string(),
+                    false,
+                ))),
+                alias: None,
+            },
+        )
+    }
+
+    #[test]
+    fn literal_with_alias() {
+        expect_parse_expr_ok(
+            "1234 as foo",
+            ExprWithAlias {
+                expr: Expr::Value(ValueWithSpan::from(Value::Number(
+                    "1234".to_string(),
+                    false,
+                ))),
+                alias: Some(Ident::from("foo")),
+            },
+        )
+    }
+
+    #[test]
+    fn literal_with_alias_and_trailing_tokens() {
+        expect_parse_expr_error(
+            "1234 as foo.bar",
+            "Expected: end of expression, found: .",
+        )
+    }
+
+    #[test]
+    fn literal_with_alias_and_trailing_whitespace() {
+        expect_parse_expr_ok(
+            "1234 as foo   ",
+            ExprWithAlias {
+                expr: Expr::Value(ValueWithSpan::from(Value::Number(
+                    "1234".to_string(),
+                    false,
+                ))),
+                alias: Some(Ident::from("foo")),
+            },
+        )
+    }
+
+    #[test]
+    fn literal_with_alias_and_trailing_whitespace_and_token() {
+        expect_parse_expr_error(
+            "1234 as foo    bar",
+            "Expected: end of expression, found: bar",
+        )
+    }
 }
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index 5a1f3cdf69c39..b7e270e4f0570 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -17,28 +17,29 @@
 
 //! [`SqlToRel`]: SQL Query Planner (produces [`LogicalPlan`] from SQL AST)
 use std::collections::HashMap;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::vec;
 
+use crate::utils::make_decimal_type;
 use arrow::datatypes::*;
+use datafusion_common::TableReference;
 use datafusion_common::config::SqlParserOptions;
+use datafusion_common::datatype::{DataTypeExt, FieldExt};
 use datafusion_common::error::add_possible_columns_to_diag;
-use datafusion_common::TableReference;
+use datafusion_common::{DFSchema, DataFusionError, Result, not_impl_err, plan_err};
 use datafusion_common::{
-    field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic,
-    SchemaError,
+    DFSchemaRef, Diagnostic, SchemaError, field_not_found, internal_err,
+    plan_datafusion_err,
 };
-use datafusion_common::{not_impl_err, plan_err, DFSchema, DataFusionError, Result};
 use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder};
+pub use datafusion_expr::planner::ContextProvider;
 use datafusion_expr::utils::find_column_exprs;
-use datafusion_expr::{col, Expr};
+use datafusion_expr::{Expr, col};
 use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo, TimezoneInfo};
 use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption};
 use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias};
 
-use crate::utils::make_decimal_type;
-pub use datafusion_expr::planner::ContextProvider;
-
 /// SQL parser options
 #[derive(Debug, Clone, Copy)]
 pub struct ParserOptions {
@@ -52,8 +53,10 @@ pub struct ParserOptions {
     pub enable_options_value_normalization: bool,
     /// Whether to collect spans
     pub collect_spans: bool,
-    /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning.
-    pub map_varchar_to_utf8view: bool,
+    /// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
+    pub map_string_types_to_utf8view: bool,
+    /// Default null ordering for sorting expressions.
+    pub default_null_ordering: NullOrdering,
 }
 
 impl ParserOptions {
@@ -72,9 +75,12 @@ impl ParserOptions {
             parse_float_as_decimal: false,
             enable_ident_normalization: true,
             support_varchar_with_length: true,
-            map_varchar_to_utf8view: true,
+            map_string_types_to_utf8view: true,
             enable_options_value_normalization: false,
             collect_spans: false,
+            // By default, `nulls_max` is used to follow Postgres's behavior.
+            // postgres rule: https://www.postgresql.org/docs/current/queries-order.html
+            default_null_ordering: NullOrdering::NullsMax,
         }
     }
 
@@ -112,9 +118,9 @@ impl ParserOptions {
         self
     }
 
-    /// Sets the `map_varchar_to_utf8view` option.
-    pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self {
-        self.map_varchar_to_utf8view = value;
+    /// Sets the `map_string_types_to_utf8view` option.
+    pub fn with_map_string_types_to_utf8view(mut self, value: bool) -> Self {
+        self.map_string_types_to_utf8view = value;
         self
     }
 
@@ -129,6 +135,12 @@ impl ParserOptions {
         self.collect_spans = value;
         self
     }
+
+    /// Sets the `default_null_ordering` option.
+    pub fn with_default_null_ordering(mut self, value: NullOrdering) -> Self {
+        self.default_null_ordering = value;
+        self
+    }
 }
 
 impl Default for ParserOptions {
@@ -143,14 +155,66 @@ impl From<&SqlParserOptions> for ParserOptions {
             parse_float_as_decimal: options.parse_float_as_decimal,
             enable_ident_normalization: options.enable_ident_normalization,
             support_varchar_with_length: options.support_varchar_with_length,
-            map_varchar_to_utf8view: options.map_varchar_to_utf8view,
+            map_string_types_to_utf8view: options.map_string_types_to_utf8view,
             enable_options_value_normalization: options
                 .enable_options_value_normalization,
             collect_spans: options.collect_spans,
+            default_null_ordering: options.default_null_ordering.as_str().into(),
+        }
+    }
+}
+
+/// Represents the null ordering for sorting expressions.
+#[derive(Debug, Clone, Copy)]
+pub enum NullOrdering {
+    /// Nulls appear last in ascending order.
+    NullsMax,
+    /// Nulls appear first in descending order.
+    NullsMin,
+    /// Nulls appear first.
+    NullsFirst,
+    /// Nulls appear last.
+    NullsLast,
+}
+
+impl NullOrdering {
+    /// Evaluates the null ordering based on the given ascending flag.
+    ///
+    /// # Returns
+    /// * `true` if nulls should appear first.
+    /// * `false` if nulls should appear last.
+    pub fn nulls_first(&self, asc: bool) -> bool {
+        match self {
+            Self::NullsMax => !asc,
+            Self::NullsMin => asc,
+            Self::NullsFirst => true,
+            Self::NullsLast => false,
         }
     }
 }
 
+impl FromStr for NullOrdering {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "nulls_max" => Ok(Self::NullsMax),
+            "nulls_min" => Ok(Self::NullsMin),
+            "nulls_first" => Ok(Self::NullsFirst),
+            "nulls_last" => Ok(Self::NullsLast),
+            _ => plan_err!(
+                "Unknown null ordering: Expected one of 'nulls_first', 'nulls_last', 'nulls_min' or 'nulls_max'. Got {s}"
+            ),
+        }
+    }
+}
+
+impl From<&str> for NullOrdering {
+    fn from(s: &str) -> Self {
+        Self::from_str(s).unwrap_or(Self::NullsMax)
+    }
+}
+
 /// Ident Normalizer
 #[derive(Debug)]
 pub struct IdentNormalizer {
@@ -189,17 +253,18 @@ impl IdentNormalizer {
 /// This helps resolve scoping issues of CTEs.
 /// By using cloning, a subquery can inherit CTEs from the outer query
 /// and can also define its own private CTEs without affecting the outer query.
-///
 #[derive(Debug, Clone)]
 pub struct PlannerContext {
     /// Data types for numbered parameters ($1, $2, etc), if supplied
     /// in `PREPARE` statement
-    prepare_param_data_types: Arc<Vec<DataType>>,
+    prepare_param_data_types: Arc<Vec<FieldRef>>,
     /// Map of CTE name to logical plan of the WITH clause.
     /// Use `Arc<LogicalPlan>` to allow cheap cloning
     ctes: HashMap<String, Arc<LogicalPlan>>,
-    /// The query schema of the outer query plan, used to resolve the columns in subquery
-    outer_query_schema: Option<DFSchemaRef>,
+
+    /// The queries schemas of outer query relations, used to resolve the outer referenced
+    /// columns in subquery (recursive aware)
+    outer_queries_schemas_stack: Vec<DFSchemaRef>,
     /// The joined schemas of all FROM clauses planned so far. When planning LATERAL
     /// FROM clauses, this should become a suffix of the `outer_query_schema`.
     outer_from_schema: Option<DFSchemaRef>,
@@ -219,7 +284,7 @@ impl PlannerContext {
         Self {
             prepare_param_data_types: Arc::new(vec![]),
             ctes: HashMap::new(),
-            outer_query_schema: None,
+            outer_queries_schemas_stack: vec![],
             outer_from_schema: None,
             create_table_schema: None,
         }
@@ -228,25 +293,48 @@ impl PlannerContext {
     /// Update the PlannerContext with provided prepare_param_data_types
     pub fn with_prepare_param_data_types(
         mut self,
-        prepare_param_data_types: Vec<DataType>,
+        prepare_param_data_types: Vec<FieldRef>,
     ) -> Self {
         self.prepare_param_data_types = prepare_param_data_types.into();
         self
     }
 
-    // Return a reference to the outer query's schema
-    pub fn outer_query_schema(&self) -> Option<&DFSchema> {
-        self.outer_query_schema.as_ref().map(|s| s.as_ref())
+    /// Return the stack of outer relations' schemas, the outer most
+    /// relation are at the first entry
+    pub fn outer_queries_schemas(&self) -> &[DFSchemaRef] {
+        &self.outer_queries_schemas_stack
+    }
+
+    /// Return an iterator of the subquery relations' schemas, innermost
+    /// relation is returned first.
+    ///
+    /// This order corresponds to the order of resolution when looking up column
+    /// references in subqueries, which start from the innermost relation and
+    /// then look up the outer relations one by one until a match is found or no
+    /// more outer relation exist.
+    ///
+    /// NOTE this is *REVERSED* order of [`Self::outer_queries_schemas`]
+    ///
+    /// This is useful to resolve the column reference in the subquery by
+    /// looking up the outer query schemas one by one.
+    pub fn outer_schemas_iter(&self) -> impl Iterator<Item = &DFSchemaRef> {
+        self.outer_queries_schemas_stack.iter().rev()
     }
 
     /// Sets the outer query schema, returning the existing one, if
     /// any
-    pub fn set_outer_query_schema(
-        &mut self,
-        mut schema: Option<DFSchemaRef>,
-    ) -> Option<DFSchemaRef> {
-        std::mem::swap(&mut self.outer_query_schema, &mut schema);
-        schema
+    pub fn append_outer_query_schema(&mut self, schema: DFSchemaRef) {
+        self.outer_queries_schemas_stack.push(schema);
+    }
+
+    /// The schema of the adjacent outer relation
+    pub fn latest_outer_query_schema(&self) -> Option<&DFSchemaRef> {
+        self.outer_queries_schemas_stack.last()
+    }
+
+    /// Remove the schema of the adjacent outer relation
+    pub fn pop_outer_query_schema(&mut self) -> Option<DFSchemaRef> {
+        self.outer_queries_schemas_stack.pop()
     }
 
     pub fn set_table_schema(
@@ -285,7 +373,7 @@ impl PlannerContext {
     }
 
     /// Return the types of parameters (`$1`, `$2`, etc) if known
-    pub fn prepare_param_data_types(&self) -> &[DataType] {
+    pub fn prepare_param_data_types(&self) -> &[FieldRef] {
         &self.prepare_param_data_types
     }
 
@@ -366,16 +454,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         let mut fields = Vec::with_capacity(columns.len());
 
         for column in columns {
-            let data_type = self.convert_data_type(&column.data_type)?;
+            let data_type = self.convert_data_type_to_field(&column.data_type)?;
             let not_nullable = column
                 .options
                 .iter()
                 .any(|x| x.option == ColumnOption::NotNull);
-            fields.push(Field::new(
-                self.ident_normalizer.normalize(column.name),
-                data_type,
-                !not_nullable,
-            ));
+            fields.push(
+                data_type
+                    .as_ref()
+                    .clone()
+                    .with_name(self.ident_normalizer.normalize(column.name))
+                    .with_nullable(!not_nullable),
+            );
         }
 
         Ok(Schema::new(fields))
@@ -391,7 +481,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         // Default expressions are restricted, column references are not allowed
         let empty_schema = DFSchema::empty();
         let error_desc = |e: DataFusionError| match e {
-            DataFusionError::SchemaError(SchemaError::FieldNotFound { .. }, _) => {
+            DataFusionError::SchemaError(ref err, _)
+                if matches!(**err, SchemaError::FieldNotFound { .. }) =>
+            {
                 plan_datafusion_err!(
                     "Column reference is not allowed in the DEFAULT expression : {}",
                     e
@@ -483,13 +575,19 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                     }
                 }
                 .map_err(|err: DataFusionError| match &err {
-                    DataFusionError::SchemaError(
-                        SchemaError::FieldNotFound {
+                    DataFusionError::SchemaError(inner, _)
+                        if matches!(
+                            inner.as_ref(),
+                            SchemaError::FieldNotFound { .. }
+                        ) =>
+                    {
+                        let SchemaError::FieldNotFound {
                             field,
                             valid_fields,
-                        },
-                        _,
-                    ) => {
+                        } = inner.as_ref()
+                        else {
+                            unreachable!()
+                        };
                         let mut diagnostic = if let Some(relation) = &col.relation {
                             Diagnostic::new_error(
                                 format!(
@@ -517,40 +615,45 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             })
     }
 
-    pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result<DataType> {
+    pub(crate) fn convert_data_type_to_field(
+        &self,
+        sql_type: &SQLDataType,
+    ) -> Result<FieldRef> {
         // First check if any of the registered type_planner can handle this type
-        if let Some(type_planner) = self.context_provider.get_type_planner() {
-            if let Some(data_type) = type_planner.plan_type(sql_type)? {
-                return Ok(data_type);
-            }
+        if let Some(type_planner) = self.context_provider.get_type_planner()
+            && let Some(data_type) = type_planner.plan_type_field(sql_type)?
+        {
+            return Ok(data_type);
         }
 
         // If no type_planner can handle this type, use the default conversion
         match sql_type {
             SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => {
                 // Arrays may be multi-dimensional.
-                let inner_data_type = self.convert_data_type(inner_sql_type)?;
-                Ok(DataType::new_list(inner_data_type, true))
+                Ok(self.convert_data_type_to_field(inner_sql_type)?.into_list())
             }
             SQLDataType::Array(ArrayElemTypeDef::SquareBracket(
                 inner_sql_type,
                 maybe_array_size,
             )) => {
-                let inner_data_type = self.convert_data_type(inner_sql_type)?;
+                let inner_field = self.convert_data_type_to_field(inner_sql_type)?;
                 if let Some(array_size) = maybe_array_size {
-                    Ok(DataType::new_fixed_size_list(
-                        inner_data_type,
-                        *array_size as i32,
-                        true,
-                    ))
+                    let array_size: i32 = (*array_size).try_into().map_err(|_| {
+                        plan_datafusion_err!(
+                            "Array size must be a positive 32 bit integer, got {array_size}"
+                        )
+                    })?;
+                    Ok(inner_field.into_fixed_size_list(array_size))
                 } else {
-                    Ok(DataType::new_list(inner_data_type, true))
+                    Ok(inner_field.into_list())
                 }
             }
             SQLDataType::Array(ArrayElemTypeDef::None) => {
                 not_impl_err!("Arrays with unspecified type is not supported")
             }
-            other => self.convert_simple_data_type(other),
+            other => Ok(self
+                .convert_simple_data_type(other)?
+                .into_nullable_field_ref()),
         }
     }
 
@@ -577,7 +680,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                     please set `support_varchar_with_length` to be true"
                     ),
                     _ => {
-                        if self.options.map_varchar_to_utf8view {
+                        if self.options.map_string_types_to_utf8view {
                             Ok(DataType::Utf8View)
                         } else {
                             Ok(DataType::Utf8)
@@ -601,13 +704,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                 )
             }
             SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => {
-                Ok(DataType::Utf8)
+                if self.options.map_string_types_to_utf8view {
+                    Ok(DataType::Utf8View)
+                } else {
+                    Ok(DataType::Utf8)
+                }
             }
             SQLDataType::Timestamp(precision, tz_info)
                 if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) =>
             {
-                let tz = if matches!(tz_info, TimezoneInfo::Tz)
-                    || matches!(tz_info, TimezoneInfo::WithTimeZone)
+                let tz = if *tz_info == TimezoneInfo::Tz
+                    || *tz_info == TimezoneInfo::WithTimeZone
                 {
                     // Timestamp With Time Zone
                     // INPUT : [SQLDataType]   TimestampTz + [Config] Time Zone
@@ -628,13 +735,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             }
             SQLDataType::Date => Ok(DataType::Date32),
             SQLDataType::Time(None, tz_info) => {
-                if matches!(tz_info, TimezoneInfo::None)
-                    || matches!(tz_info, TimezoneInfo::WithoutTimeZone)
+                if *tz_info == TimezoneInfo::None
+                    || *tz_info == TimezoneInfo::WithoutTimeZone
                 {
                     Ok(DataType::Time64(TimeUnit::Nanosecond))
                 } else {
                     // We don't support TIMETZ and TIME WITH TIME ZONE for now
-                    not_impl_err!("Unsupported SQL type {sql_type:?}")
+                    not_impl_err!("Unsupported SQL type {sql_type}")
                 }
             }
             SQLDataType::Numeric(exact_number_info)
@@ -646,25 +753,26 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                         (Some(precision), Some(scale))
                     }
                 };
-                make_decimal_type(precision, scale)
+                make_decimal_type(precision, scale.map(|s| s as u64))
             }
             SQLDataType::Bytea => Ok(DataType::Binary),
-            SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano)),
+            SQLDataType::Interval { fields, precision } => {
+                if fields.is_some() || precision.is_some() {
+                    return not_impl_err!("Unsupported SQL type {sql_type}");
+                }
+                Ok(DataType::Interval(IntervalUnit::MonthDayNano))
+            }
             SQLDataType::Struct(fields, _) => {
                 let fields = fields
                     .iter()
                     .enumerate()
-                    .map(|(idx, field)| {
-                        let data_type = self.convert_data_type(&field.field_type)?;
-                        let field_name = match &field.field_name {
+                    .map(|(idx, sql_struct_field)| {
+                        let field = self.convert_data_type_to_field(&sql_struct_field.field_type)?;
+                        let field_name = match &sql_struct_field.field_name {
                             Some(ident) => ident.clone(),
                             None => Ident::new(format!("c{idx}")),
                         };
-                        Ok(Arc::new(Field::new(
-                            self.ident_normalizer.normalize(field_name),
-                            data_type,
-                            true,
-                        )))
+                        Ok(field.as_ref().clone().with_name(self.ident_normalizer.normalize(field_name)))
                     })
                     .collect::<Result<Vec<_>>>()?;
                 Ok(DataType::Struct(Fields::from(fields)))
@@ -735,8 +843,24 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             | SQLDataType::AnyType
             | SQLDataType::Table(_)
             | SQLDataType::VarBit(_)
-            | SQLDataType::GeometricType(_) => {
-                not_impl_err!("Unsupported SQL type {sql_type:?}")
+            | SQLDataType::UTinyInt
+            | SQLDataType::USmallInt
+            | SQLDataType::HugeInt
+            | SQLDataType::UHugeInt
+            | SQLDataType::UBigInt
+            | SQLDataType::TimestampNtz{..}
+            | SQLDataType::NamedTable { .. }
+            | SQLDataType::TsVector
+            | SQLDataType::TsQuery
+            | SQLDataType::GeometricType(_)
+            | SQLDataType::DecimalUnsigned(_) // deprecated mysql type
+            | SQLDataType::FloatUnsigned(_) // deprecated mysql type
+            | SQLDataType::RealUnsigned // deprecated mysql type
+            | SQLDataType::DecUnsigned(_) // deprecated mysql type
+            | SQLDataType::DoubleUnsigned(_) // deprecated mysql type
+            | SQLDataType::DoublePrecisionUnsigned // deprecated mysql type
+            => {
+                not_impl_err!("Unsupported SQL type {sql_type}")
             }
         }
     }
diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs
index f42a3ad138c4a..1b7bb856a592b 100644
--- a/datafusion/sql/src/query.rs
+++ b/datafusion/sql/src/query.rs
@@ -20,15 +20,17 @@ use std::sync::Arc;
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 
 use crate::stack::StackGuard;
-use datafusion_common::{not_impl_err, Constraints, DFSchema, Result};
-use datafusion_expr::expr::Sort;
+use datafusion_common::{Constraints, DFSchema, Result, not_impl_err};
+use datafusion_expr::expr::{Sort, WildcardOptions};
 
+use datafusion_expr::select_expr::SelectExpr;
 use datafusion_expr::{
     CreateMemoryTable, DdlStatement, Distinct, Expr, LogicalPlan, LogicalPlanBuilder,
 };
 use sqlparser::ast::{
-    Expr as SQLExpr, Ident, Offset as SQLOffset, OrderBy, OrderByExpr, OrderByKind,
-    Query, SelectInto, SetExpr,
+    Expr as SQLExpr, ExprWithAliasAndOrderBy, Ident, LimitClause, Offset, OffsetRows,
+    OrderBy, OrderByExpr, OrderByKind, PipeOperator, Query, SelectInto, SetExpr,
+    SetOperator, SetQuantifier, TableAlias,
 };
 use sqlparser::tokenizer::Span;
 
@@ -44,18 +46,34 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let mut query_plan_context = outer_planner_context.clone();
         let planner_context = &mut query_plan_context;
 
-        if let Some(with) = query.with {
+        let Query {
+            with,
+            body,
+            order_by,
+            limit_clause,
+            fetch,
+            locks: _,
+            for_clause: _,
+            settings: _,
+            format_clause: _,
+            pipe_operators,
+        } = query;
+
+        if fetch.is_some() {
+            return not_impl_err!("FETCH clause is not supported yet");
+        }
+
+        if let Some(with) = with {
             self.plan_with_clause(with, planner_context)?;
         }
 
-        let set_expr = *query.body;
-        match set_expr {
+        let set_expr = *body;
+        let plan = match set_expr {
             SetExpr::Select(mut select) => {
                 let select_into = select.into.take();
                 let plan =
-                    self.select_to_plan(*select, query.order_by, planner_context)?;
-                let plan =
-                    self.limit(plan, query.offset, query.limit, planner_context)?;
+                    self.select_to_plan(*select, order_by.clone(), planner_context)?;
+                let plan = self.limit(plan, limit_clause.clone(), planner_context)?;
                 // Process the `SELECT INTO` after `LIMIT`.
                 self.select_into(plan, select_into)
             }
@@ -68,7 +86,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     let _guard = StackGuard::new(256 * 1024);
                     self.set_expr_to_plan(other, planner_context)
                 }?;
-                let oby_exprs = to_order_by_exprs(query.order_by)?;
+                let oby_exprs = to_order_by_exprs(order_by)?;
                 let order_by_rex = self.order_by_to_sort_expr(
                     oby_exprs,
                     plan.schema(),
@@ -77,32 +95,204 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     None,
                 )?;
                 let plan = self.order_by(plan, order_by_rex)?;
-                self.limit(plan, query.offset, query.limit, planner_context)
+                self.limit(plan, limit_clause, planner_context)
+            }
+        }?;
+
+        self.pipe_operators(plan, pipe_operators, planner_context)
+    }
+
+    /// Apply pipe operators to a plan
+    fn pipe_operators(
+        &self,
+        mut plan: LogicalPlan,
+        pipe_operators: Vec<PipeOperator>,
+        planner_context: &mut PlannerContext,
+    ) -> Result<LogicalPlan> {
+        for pipe_operator in pipe_operators {
+            plan = self.pipe_operator(plan, pipe_operator, planner_context)?;
+        }
+        Ok(plan)
+    }
+
+    /// Apply a pipe operator to a plan
+    fn pipe_operator(
+        &self,
+        plan: LogicalPlan,
+        pipe_operator: PipeOperator,
+        planner_context: &mut PlannerContext,
+    ) -> Result<LogicalPlan> {
+        match pipe_operator {
+            PipeOperator::Where { expr } => {
+                self.plan_selection(Some(expr), plan, planner_context)
+            }
+            PipeOperator::OrderBy { exprs } => {
+                let sort_exprs = self.order_by_to_sort_expr(
+                    exprs,
+                    plan.schema(),
+                    planner_context,
+                    true,
+                    None,
+                )?;
+                self.order_by(plan, sort_exprs)
             }
+            PipeOperator::Limit { expr, offset } => self.limit(
+                plan,
+                Some(LimitClause::LimitOffset {
+                    limit: Some(expr),
+                    offset: offset.map(|offset| Offset {
+                        value: offset,
+                        rows: OffsetRows::None,
+                    }),
+                    limit_by: vec![],
+                }),
+                planner_context,
+            ),
+            PipeOperator::Select { exprs } => {
+                let empty_from = matches!(plan, LogicalPlan::EmptyRelation(_));
+                let select_exprs =
+                    self.prepare_select_exprs(&plan, exprs, empty_from, planner_context)?;
+                self.project(plan, select_exprs)
+            }
+            PipeOperator::Extend { exprs } => {
+                let empty_from = matches!(plan, LogicalPlan::EmptyRelation(_));
+                let extend_exprs =
+                    self.prepare_select_exprs(&plan, exprs, empty_from, planner_context)?;
+                let all_exprs =
+                    std::iter::once(SelectExpr::Wildcard(WildcardOptions::default()))
+                        .chain(extend_exprs)
+                        .collect();
+                self.project(plan, all_exprs)
+            }
+            PipeOperator::As { alias } => self.apply_table_alias(
+                plan,
+                TableAlias {
+                    name: alias,
+                    // Apply to all fields
+                    columns: vec![],
+                    explicit: true,
+                },
+            ),
+            PipeOperator::Union {
+                set_quantifier,
+                queries,
+            } => self.pipe_operator_set(
+                plan,
+                SetOperator::Union,
+                set_quantifier,
+                queries,
+                planner_context,
+            ),
+            PipeOperator::Intersect {
+                set_quantifier,
+                queries,
+            } => self.pipe_operator_set(
+                plan,
+                SetOperator::Intersect,
+                set_quantifier,
+                queries,
+                planner_context,
+            ),
+            PipeOperator::Except {
+                set_quantifier,
+                queries,
+            } => self.pipe_operator_set(
+                plan,
+                SetOperator::Except,
+                set_quantifier,
+                queries,
+                planner_context,
+            ),
+            PipeOperator::Aggregate {
+                full_table_exprs,
+                group_by_expr,
+            } => self.pipe_operator_aggregate(
+                plan,
+                full_table_exprs,
+                group_by_expr,
+                planner_context,
+            ),
+            PipeOperator::Join(join) => {
+                self.parse_relation_join(plan, join, planner_context)
+            }
+
+            x => not_impl_err!("`{x}` pipe operator is not supported yet"),
+        }
+    }
+
+    /// Handle Union/Intersect/Except pipe operators
+    fn pipe_operator_set(
+        &self,
+        mut plan: LogicalPlan,
+        set_operator: SetOperator,
+        set_quantifier: SetQuantifier,
+        queries: Vec<Query>,
+        planner_context: &mut PlannerContext,
+    ) -> Result<LogicalPlan> {
+        for query in queries {
+            let right_plan = self.query_to_plan(query, planner_context)?;
+            plan = self.set_operation_to_plan(
+                set_operator,
+                plan,
+                right_plan,
+                set_quantifier,
+            )?;
         }
+
+        Ok(plan)
     }
 
     /// Wrap a plan in a limit
     fn limit(
         &self,
         input: LogicalPlan,
-        skip: Option<SQLOffset>,
-        fetch: Option<SQLExpr>,
+        limit_clause: Option<LimitClause>,
         planner_context: &mut PlannerContext,
     ) -> Result<LogicalPlan> {
-        if skip.is_none() && fetch.is_none() {
+        let Some(limit_clause) = limit_clause else {
             return Ok(input);
-        }
+        };
 
-        // skip and fetch expressions are not allowed to reference columns from the input plan
         let empty_schema = DFSchema::empty();
 
-        let skip = skip
-            .map(|o| self.sql_to_expr(o.value, &empty_schema, planner_context))
-            .transpose()?;
-        let fetch = fetch
-            .map(|e| self.sql_to_expr(e, &empty_schema, planner_context))
-            .transpose()?;
+        let (skip, fetch, limit_by_exprs) = match limit_clause {
+            LimitClause::LimitOffset {
+                limit,
+                offset,
+                limit_by,
+            } => {
+                let skip = offset
+                    .map(|o| self.sql_to_expr(o.value, &empty_schema, planner_context))
+                    .transpose()?;
+
+                let fetch = limit
+                    .map(|e| self.sql_to_expr(e, &empty_schema, planner_context))
+                    .transpose()?;
+
+                let limit_by_exprs = limit_by
+                    .into_iter()
+                    .map(|e| self.sql_to_expr(e, &empty_schema, planner_context))
+                    .collect::<Result<Vec<_>>>()?;
+
+                (skip, fetch, limit_by_exprs)
+            }
+            LimitClause::OffsetCommaLimit { offset, limit } => {
+                let skip =
+                    Some(self.sql_to_expr(offset, &empty_schema, planner_context)?);
+                let fetch =
+                    Some(self.sql_to_expr(limit, &empty_schema, planner_context)?);
+                (skip, fetch, vec![])
+            }
+        };
+
+        if !limit_by_exprs.is_empty() {
+            return not_impl_err!("LIMIT BY clause is not supported yet");
+        }
+
+        if skip.is_none() && fetch.is_none() {
+            return Ok(input);
+        }
+
         LogicalPlanBuilder::from(input)
             .limit_by_expr(skip, fetch)?
             .build()
@@ -128,6 +318,45 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         }
     }
 
+    /// Handle AGGREGATE pipe operator
+    fn pipe_operator_aggregate(
+        &self,
+        plan: LogicalPlan,
+        full_table_exprs: Vec<ExprWithAliasAndOrderBy>,
+        group_by_expr: Vec<ExprWithAliasAndOrderBy>,
+        planner_context: &mut PlannerContext,
+    ) -> Result<LogicalPlan> {
+        let plan_schema = plan.schema();
+        let process_expr =
+            |expr_with_alias_and_order_by: ExprWithAliasAndOrderBy,
+             planner_context: &mut PlannerContext| {
+                let expr_with_alias = expr_with_alias_and_order_by.expr;
+                let sql_expr = expr_with_alias.expr;
+                let alias = expr_with_alias.alias;
+
+                let df_expr = self.sql_to_expr(sql_expr, plan_schema, planner_context)?;
+
+                match alias {
+                    Some(alias_ident) => df_expr.alias_if_changed(alias_ident.value),
+                    None => Ok(df_expr),
+                }
+            };
+
+        let aggr_exprs: Vec<Expr> = full_table_exprs
+            .into_iter()
+            .map(|e| process_expr(e, planner_context))
+            .collect::<Result<Vec<_>>>()?;
+
+        let group_by_exprs: Vec<Expr> = group_by_expr
+            .into_iter()
+            .map(|e| process_expr(e, planner_context))
+            .collect::<Result<Vec<_>>>()?;
+
+        LogicalPlanBuilder::from(plan)
+            .aggregate(group_by_exprs, aggr_exprs)?
+            .build()
+    }
+
     /// Wrap the logical plan in a `SelectInto`
     fn select_into(
         &self,
@@ -138,7 +367,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             Some(into) => Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(
                 CreateMemoryTable {
                     name: self.object_name_to_table_reference(into.name)?,
-                    constraints: Constraints::empty(),
+                    constraints: Constraints::default(),
                     input: Arc::new(plan),
                     if_not_exists: false,
                     or_replace: false,
@@ -182,7 +411,7 @@ pub(crate) fn to_order_by_exprs_with_select(
                             quote_style: None,
                             span: Span::empty(),
                         }),
-                        options: order_by_options.clone(),
+                        options: order_by_options,
                         with_fill: None,
                     }),
                     // TODO: Support other types of expressions
diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs
index 8a3c20e3971b8..8e1a8817309f0 100644
--- a/datafusion/sql/src/relation/join.rs
+++ b/datafusion/sql/src/relation/join.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::{not_impl_err, plan_datafusion_err, Column, Result};
+use datafusion_common::{Column, Result, not_impl_err, plan_datafusion_err};
 use datafusion_expr::{JoinType, LogicalPlan, LogicalPlanBuilder};
 use sqlparser::ast::{
     Join, JoinConstraint, JoinOperator, ObjectName, TableFactor, TableWithJoins,
@@ -43,7 +43,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         Ok(left)
     }
 
-    fn parse_relation_join(
+    pub(crate) fn parse_relation_join(
         &self,
         left: LogicalPlan,
         join: Join,
@@ -95,7 +95,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             JoinOperator::FullOuter(constraint) => {
                 self.parse_join(left, right, constraint, JoinType::Full, planner_context)
             }
-            JoinOperator::CrossJoin => self.parse_cross_join(left, right),
+            JoinOperator::CrossJoin(JoinConstraint::None) => {
+                self.parse_cross_join(left, right)
+            }
             other => not_impl_err!("Unsupported JOIN operator {other:?}"),
         }
     }
@@ -142,7 +144,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                                         "Expected identifier in USING clause"
                                     )
                                 })
-                                .map(|ident| self.ident_normalizer.normalize(ident.clone()))
+                                .map(|ident| Column::from_name(self.ident_normalizer.normalize(ident.clone())))
                         }
                     })
                     .collect::<Result<Vec<_>>>()?;
diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs
index 88a32a218341d..6558763ca4e42 100644
--- a/datafusion/sql/src/relation/mod.rs
+++ b/datafusion/sql/src/relation/mod.rs
@@ -21,22 +21,125 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{
-    not_impl_err, plan_err, DFSchema, Diagnostic, Result, Span, Spans, TableReference,
+    DFSchema, Diagnostic, Result, Span, Spans, TableReference, not_impl_err, plan_err,
 };
 use datafusion_expr::builder::subquery_alias;
-use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder};
+use datafusion_expr::planner::{
+    PlannedRelation, RelationPlannerContext, RelationPlanning,
+};
+use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, expr::Unnest};
 use datafusion_expr::{Subquery, SubqueryAlias};
 use sqlparser::ast::{FunctionArg, FunctionArgExpr, Spanned, TableFactor};
 
 mod join;
 
+struct SqlToRelRelationContext<'a, 'b, S: ContextProvider> {
+    planner: &'a SqlToRel<'b, S>,
+    planner_context: &'a mut PlannerContext,
+}
+
+// Implement RelationPlannerContext
+impl<'a, 'b, S: ContextProvider> RelationPlannerContext
+    for SqlToRelRelationContext<'a, 'b, S>
+{
+    fn context_provider(&self) -> &dyn ContextProvider {
+        self.planner.context_provider
+    }
+
+    fn plan(&mut self, relation: TableFactor) -> Result<LogicalPlan> {
+        self.planner.create_relation(relation, self.planner_context)
+    }
+
+    fn sql_to_expr(
+        &mut self,
+        expr: sqlparser::ast::Expr,
+        schema: &DFSchema,
+    ) -> Result<Expr> {
+        self.planner.sql_to_expr(expr, schema, self.planner_context)
+    }
+
+    fn sql_expr_to_logical_expr(
+        &mut self,
+        expr: sqlparser::ast::Expr,
+        schema: &DFSchema,
+    ) -> Result<Expr> {
+        self.planner
+            .sql_expr_to_logical_expr(expr, schema, self.planner_context)
+    }
+
+    fn normalize_ident(&self, ident: sqlparser::ast::Ident) -> String {
+        self.planner.ident_normalizer.normalize(ident)
+    }
+
+    fn object_name_to_table_reference(
+        &self,
+        name: sqlparser::ast::ObjectName,
+    ) -> Result<TableReference> {
+        self.planner.object_name_to_table_reference(name)
+    }
+}
+
 impl<S: ContextProvider> SqlToRel<'_, S> {
-    /// Create a `LogicalPlan` that scans the named relation
+    /// Create a `LogicalPlan` that scans the named relation.
+    ///
+    /// First tries any registered extension planners. If no extension handles
+    /// the relation, falls back to the default planner.
     fn create_relation(
         &self,
         relation: TableFactor,
         planner_context: &mut PlannerContext,
     ) -> Result<LogicalPlan> {
+        let planned_relation =
+            match self.create_extension_relation(relation, planner_context)? {
+                RelationPlanning::Planned(planned) => planned,
+                RelationPlanning::Original(original) => {
+                    Box::new(self.create_default_relation(*original, planner_context)?)
+                }
+            };
+
+        let optimized_plan = optimize_subquery_sort(planned_relation.plan)?.data;
+        if let Some(alias) = planned_relation.alias {
+            self.apply_table_alias(optimized_plan, alias)
+        } else {
+            Ok(optimized_plan)
+        }
+    }
+
+    fn create_extension_relation(
+        &self,
+        relation: TableFactor,
+        planner_context: &mut PlannerContext,
+    ) -> Result<RelationPlanning> {
+        let planners = self.context_provider.get_relation_planners();
+        if planners.is_empty() {
+            return Ok(RelationPlanning::Original(Box::new(relation)));
+        }
+
+        let mut current_relation = relation;
+        for planner in planners.iter() {
+            let mut context = SqlToRelRelationContext {
+                planner: self,
+                planner_context,
+            };
+
+            match planner.plan_relation(current_relation, &mut context)? {
+                RelationPlanning::Planned(planned) => {
+                    return Ok(RelationPlanning::Planned(planned));
+                }
+                RelationPlanning::Original(original) => {
+                    current_relation = *original;
+                }
+            }
+        }
+
+        Ok(RelationPlanning::Original(Box::new(current_relation)))
+    }
+
+    fn create_default_relation(
+        &self,
+        relation: TableFactor,
+        planner_context: &mut PlannerContext,
+    ) -> Result<PlannedRelation> {
         let relation_span = relation.span();
         let (plan, alias) = match relation {
             TableFactor::Table {
@@ -57,7 +160,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                                     planner_context,
                                 )
                             } else {
-                                plan_err!("Unsupported function argument type: {:?}", arg)
+                                plan_err!("Unsupported function argument type: {}", arg)
                             }
                         })
                         .collect::<Vec<_>>();
@@ -66,7 +169,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         .get_table_function_source(&tbl_func_name, args)?;
                     let plan = LogicalPlanBuilder::scan(
                         TableReference::Bare {
-                            table: "tmp_table".into(),
+                            table: format!("{tbl_func_name}()").into(),
                         },
                         provider,
                         None,
@@ -154,6 +257,36 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     "UNNEST table factor with offset is not supported yet"
                 );
             }
+            TableFactor::Function {
+                name, args, alias, ..
+            } => {
+                let tbl_func_ref = self.object_name_to_table_reference(name)?;
+                let schema = planner_context
+                    .outer_queries_schemas()
+                    .last()
+                    .cloned()
+                    .unwrap_or_else(|| Arc::new(DFSchema::empty()));
+                let func_args = args
+                    .into_iter()
+                    .map(|arg| match arg {
+                        FunctionArg::Unnamed(FunctionArgExpr::Expr(expr))
+                        | FunctionArg::Named {
+                            arg: FunctionArgExpr::Expr(expr),
+                            ..
+                        } => {
+                            self.sql_expr_to_logical_expr(expr, &schema, planner_context)
+                        }
+                        _ => plan_err!("Unsupported function argument: {arg:?}"),
+                    })
+                    .collect::<Result<Vec<Expr>>>()?;
+                let provider = self
+                    .context_provider
+                    .get_table_function_source(tbl_func_ref.table(), func_args)?;
+                let plan =
+                    LogicalPlanBuilder::scan(tbl_func_ref.table(), provider, None)?
+                        .build()?;
+                (plan, alias)
+            }
             // @todo Support TableFactory::TableFunction?
             _ => {
                 return not_impl_err!(
@@ -161,13 +294,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 );
             }
         };
-
-        let optimized_plan = optimize_subquery_sort(plan)?.data;
-        if let Some(alias) = alias {
-            self.apply_table_alias(optimized_plan, alias)
-        } else {
-            Ok(optimized_plan)
-        }
+        Ok(PlannedRelation::new(plan, alias))
     }
 
     pub(crate) fn create_relation_subquery(
@@ -184,20 +311,24 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let old_from_schema = planner_context
             .set_outer_from_schema(None)
             .unwrap_or_else(|| Arc::new(DFSchema::empty()));
-        let new_query_schema = match planner_context.outer_query_schema() {
-            Some(old_query_schema) => {
+        let outer_query_schema = planner_context.pop_outer_query_schema();
+        let new_query_schema = match outer_query_schema {
+            Some(ref old_query_schema) => {
                 let mut new_query_schema = old_from_schema.as_ref().clone();
-                new_query_schema.merge(old_query_schema);
-                Some(Arc::new(new_query_schema))
+                new_query_schema.merge(old_query_schema.as_ref());
+                Arc::new(new_query_schema)
             }
-            None => Some(Arc::clone(&old_from_schema)),
+            None => Arc::clone(&old_from_schema),
         };
-        let old_query_schema = planner_context.set_outer_query_schema(new_query_schema);
+        planner_context.append_outer_query_schema(new_query_schema);
 
         let plan = self.create_relation(subquery, planner_context)?;
         let outer_ref_columns = plan.all_out_ref_exprs();
 
-        planner_context.set_outer_query_schema(old_query_schema);
+        planner_context.pop_outer_query_schema();
+        if let Some(schema) = outer_query_schema {
+            planner_context.append_outer_query_schema(schema);
+        }
         planner_context.set_outer_from_schema(Some(old_from_schema));
 
         // We can omit the subquery wrapper if there are no columns
@@ -234,7 +365,8 @@ fn optimize_subquery_sort(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>>
     // 2. RANK / ROW_NUMBER ... => Handled by a `WindowAggr` and its requirements.
     // 3. LIMIT => Handled by a `Sort`, so we need to search for it.
     let mut has_limit = false;
-    let new_plan = plan.transform_down(|c| {
+
+    plan.transform_down(|c| {
         if let LogicalPlan::Limit(_) = c {
             has_limit = true;
             return Ok(Transformed::no(c));
@@ -249,6 +381,5 @@ fn optimize_subquery_sort(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>>
             }
             _ => Ok(Transformed::no(c)),
         }
-    });
-    new_plan
+    })
 }
diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs
index 9e909f66fa97a..955dbb86602a3 100644
--- a/datafusion/sql/src/resolve.rs
+++ b/datafusion/sql/src/resolve.rs
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::TableReference;
 use std::collections::BTreeSet;
 use std::ops::ControlFlow;
 
+use datafusion_common::{DataFusionError, Result};
+
+use crate::TableReference;
 use crate::parser::{CopyToSource, CopyToStatement, Statement as DFStatement};
 use crate::planner::object_name_to_table_reference;
 use sqlparser::ast::*;
@@ -45,27 +47,40 @@ const INFORMATION_SCHEMA_TABLES: &[&str] = &[
     PARAMETERS,
 ];
 
+// Collect table/CTE references as `TableReference`s and normalize them during traversal.
+// This avoids a second normalization/conversion pass after visiting the AST.
 struct RelationVisitor {
-    relations: BTreeSet<ObjectName>,
-    all_ctes: BTreeSet<ObjectName>,
-    ctes_in_scope: Vec<ObjectName>,
+    relations: BTreeSet<TableReference>,
+    all_ctes: BTreeSet<TableReference>,
+    ctes_in_scope: Vec<TableReference>,
+    enable_ident_normalization: bool,
 }
 
 impl RelationVisitor {
     /// Record the reference to `relation`, if it's not a CTE reference.
-    fn insert_relation(&mut self, relation: &ObjectName) {
-        if !self.relations.contains(relation) && !self.ctes_in_scope.contains(relation) {
-            self.relations.insert(relation.clone());
+    fn insert_relation(&mut self, relation: &ObjectName) -> ControlFlow<DataFusionError> {
+        match object_name_to_table_reference(
+            relation.clone(),
+            self.enable_ident_normalization,
+        ) {
+            Ok(relation) => {
+                if !self.relations.contains(&relation)
+                    && !self.ctes_in_scope.contains(&relation)
+                {
+                    self.relations.insert(relation);
+                }
+                ControlFlow::Continue(())
+            }
+            Err(e) => ControlFlow::Break(e),
         }
     }
 }
 
 impl Visitor for RelationVisitor {
-    type Break = ();
+    type Break = DataFusionError;
 
-    fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<()> {
-        self.insert_relation(relation);
-        ControlFlow::Continue(())
+    fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<Self::Break> {
+        self.insert_relation(relation)
     }
 
     fn pre_visit_query(&mut self, q: &Query) -> ControlFlow<Self::Break> {
@@ -78,10 +93,16 @@ impl Visitor for RelationVisitor {
                 if !with.recursive {
                     // This is a bit hackish as the CTE will be visited again as part of visiting `q`,
                     // but thankfully `insert_relation` is idempotent.
-                    let _ = cte.visit(self);
+                    cte.visit(self)?;
+                }
+                let cte_name = ObjectName::from(vec![cte.alias.name.clone()]);
+                match object_name_to_table_reference(
+                    cte_name,
+                    self.enable_ident_normalization,
+                ) {
+                    Ok(cte_ref) => self.ctes_in_scope.push(cte_ref),
+                    Err(e) => return ControlFlow::Break(e),
                 }
-                self.ctes_in_scope
-                    .push(ObjectName::from(vec![cte.alias.name.clone()]));
             }
         }
         ControlFlow::Continue(())
@@ -97,13 +118,13 @@ impl Visitor for RelationVisitor {
         ControlFlow::Continue(())
     }
 
-    fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<()> {
+    fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<Self::Break> {
         if let Statement::ShowCreate {
             obj_type: ShowCreateObject::Table | ShowCreateObject::View,
             obj_name,
         } = statement
         {
-            self.insert_relation(obj_name)
+            self.insert_relation(obj_name)?;
         }
 
         // SHOW statements will later be rewritten into a SELECT from the information_schema
@@ -120,34 +141,53 @@ impl Visitor for RelationVisitor {
         );
         if requires_information_schema {
             for s in INFORMATION_SCHEMA_TABLES {
-                self.relations.insert(ObjectName::from(vec![
+                // Information schema references are synthesized here, so convert directly.
+                let obj = ObjectName::from(vec![
                     Ident::new(INFORMATION_SCHEMA),
                     Ident::new(*s),
-                ]));
+                ]);
+                match object_name_to_table_reference(obj, self.enable_ident_normalization)
+                {
+                    Ok(tbl_ref) => {
+                        self.relations.insert(tbl_ref);
+                    }
+                    Err(e) => return ControlFlow::Break(e),
+                }
             }
         }
         ControlFlow::Continue(())
     }
 }
 
-fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) {
+fn control_flow_to_result(flow: ControlFlow<DataFusionError>) -> Result<()> {
+    match flow {
+        ControlFlow::Continue(()) => Ok(()),
+        ControlFlow::Break(err) => Err(err),
+    }
+}
+
+fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) -> Result<()> {
     match statement {
         DFStatement::Statement(s) => {
-            let _ = s.as_ref().visit(visitor);
+            control_flow_to_result(s.as_ref().visit(visitor))?;
         }
         DFStatement::CreateExternalTable(table) => {
-            visitor.relations.insert(table.name.clone());
+            control_flow_to_result(visitor.insert_relation(&table.name))?;
         }
         DFStatement::CopyTo(CopyToStatement { source, .. }) => match source {
             CopyToSource::Relation(table_name) => {
-                visitor.insert_relation(table_name);
+                control_flow_to_result(visitor.insert_relation(table_name))?;
             }
             CopyToSource::Query(query) => {
-                let _ = query.visit(visitor);
+                control_flow_to_result(query.visit(visitor))?;
             }
         },
-        DFStatement::Explain(explain) => visit_statement(&explain.statement, visitor),
+        DFStatement::Explain(explain) => {
+            visit_statement(&explain.statement, visitor)?;
+        }
+        DFStatement::Reset(_) => {}
     }
+    Ok(())
 }
 
 /// Collects all tables and views referenced in the SQL statement. CTEs are collected separately.
@@ -175,38 +215,32 @@ fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) {
 /// ## Example with CTEs  
 ///  
 /// ```  
-/// # use datafusion_sql::parser::DFParser;  
+/// # use datafusion_sql::parser::DFParser;
 /// # use datafusion_sql::resolve::resolve_table_references;
-/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;";  
-/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();  
-/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();  
+/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;";
+/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
+/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
 /// assert_eq!(table_refs.len(), 0);
-/// assert_eq!(ctes.len(), 1);  
-/// assert_eq!(ctes[0].to_string(), "my_cte");  
+/// assert_eq!(ctes.len(), 1);
+/// assert_eq!(ctes[0].to_string(), "my_cte");
 /// ```
 pub fn resolve_table_references(
     statement: &crate::parser::Statement,
     enable_ident_normalization: bool,
-) -> datafusion_common::Result<(Vec<TableReference>, Vec<TableReference>)> {
+) -> Result<(Vec<TableReference>, Vec<TableReference>)> {
     let mut visitor = RelationVisitor {
         relations: BTreeSet::new(),
         all_ctes: BTreeSet::new(),
         ctes_in_scope: vec![],
+        enable_ident_normalization,
     };
 
-    visit_statement(statement, &mut visitor);
-
-    let table_refs = visitor
-        .relations
-        .into_iter()
-        .map(|x| object_name_to_table_reference(x, enable_ident_normalization))
-        .collect::<datafusion_common::Result<_>>()?;
-    let ctes = visitor
-        .all_ctes
-        .into_iter()
-        .map(|x| object_name_to_table_reference(x, enable_ident_normalization))
-        .collect::<datafusion_common::Result<_>>()?;
-    Ok((table_refs, ctes))
+    visit_statement(statement, &mut visitor)?;
+
+    Ok((
+        visitor.relations.into_iter().collect(),
+        visitor.all_ctes.into_iter().collect(),
+    ))
 }
 
 #[cfg(test)]
@@ -269,4 +303,57 @@ mod tests {
         assert_eq!(ctes.len(), 1);
         assert_eq!(ctes[0].to_string(), "nodes");
     }
+
+    #[test]
+    fn resolve_table_references_cte_with_quoted_reference() {
+        use crate::parser::DFParser;
+
+        let query = r#"with barbaz as (select 1) select * from "barbaz""#;
+        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
+        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
+        assert_eq!(ctes.len(), 1);
+        assert_eq!(ctes[0].to_string(), "barbaz");
+        // Quoted reference should still resolve to the CTE when normalization is on
+        assert_eq!(table_refs.len(), 0);
+    }
+
+    #[test]
+    fn resolve_table_references_cte_with_quoted_reference_normalization_off() {
+        use crate::parser::DFParser;
+
+        let query = r#"with barbaz as (select 1) select * from "barbaz""#;
+        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
+        let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
+        assert_eq!(ctes.len(), 1);
+        assert_eq!(ctes[0].to_string(), "barbaz");
+        // Even with normalization off, quoted reference matches same-case CTE name
+        assert_eq!(table_refs.len(), 0);
+    }
+
+    #[test]
+    fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_on() {
+        use crate::parser::DFParser;
+
+        let query = r#"with FOObar as (select 1) select * from "FOObar""#;
+        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
+        let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
+        // CTE name is normalized to lowercase, quoted reference preserves case, so they differ
+        assert_eq!(ctes.len(), 1);
+        assert_eq!(ctes[0].to_string(), "foobar");
+        assert_eq!(table_refs.len(), 1);
+        assert_eq!(table_refs[0].to_string(), "FOObar");
+    }
+
+    #[test]
+    fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_off() {
+        use crate::parser::DFParser;
+
+        let query = r#"with FOObar as (select 1) select * from "FOObar""#;
+        let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
+        let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
+        // Without normalization, cases match exactly, so quoted reference resolves to the CTE
+        assert_eq!(ctes.len(), 1);
+        assert_eq!(ctes[0].to_string(), "FOObar");
+        assert_eq!(table_refs.len(), 0);
+    }
 }
diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
index 9fad274b51c06..7e291afa04b6e 100644
--- a/datafusion/sql/src/select.rs
+++ b/datafusion/sql/src/select.rs
@@ -22,14 +22,14 @@ use std::sync::Arc;
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 use crate::query::to_order_by_exprs_with_select;
 use crate::utils::{
+    CheckColumnsMustReferenceAggregatePurpose, CheckColumnsSatisfyExprsPurpose,
     check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs,
     resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up,
-    CheckColumnsSatisfyExprsPurpose,
 };
 
 use datafusion_common::error::DataFusionErrorBuilder;
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
-use datafusion_common::{not_impl_err, plan_err, Result};
+use datafusion_common::{Column, DFSchema, Result, not_impl_err, plan_err};
 use datafusion_common::{RecursionUnnestOption, UnnestOptions};
 use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions};
 use datafusion_expr::expr_rewriter::{
@@ -41,16 +41,32 @@ use datafusion_expr::utils::{
 };
 use datafusion_expr::{
     Aggregate, Expr, Filter, GroupingSet, LogicalPlan, LogicalPlanBuilder,
-    LogicalPlanBuilderOptions, Partitioning,
+    LogicalPlanBuilderOptions, Partitioning, SortExpr,
 };
 
 use indexmap::IndexMap;
 use sqlparser::ast::{
-    visit_expressions_mut, Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr,
-    OrderBy, SelectItemQualifiedWildcardKind, WildcardAdditionalOptions, WindowType,
+    Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderBy,
+    SelectItemQualifiedWildcardKind, WildcardAdditionalOptions, WindowType,
+    visit_expressions_mut,
 };
 use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins};
 
+/// Result of the `aggregate` function, containing the aggregate plan and
+/// rewritten expressions that reference the aggregate output columns.
+struct AggregatePlanResult {
+    /// The aggregate logical plan
+    plan: LogicalPlan,
+    /// SELECT expressions rewritten to reference aggregate output columns
+    select_exprs: Vec<Expr>,
+    /// HAVING expression rewritten to reference aggregate output columns
+    having_expr: Option<Expr>,
+    /// QUALIFY expression rewritten to reference aggregate output columns
+    qualify_expr: Option<Expr>,
+    /// ORDER BY expressions rewritten to reference aggregate output columns
+    order_by_exprs: Vec<SortExpr>,
+}
+
 impl<S: ContextProvider> SqlToRel<'_, S> {
     /// Generate a logic plan from an SQL select
     pub(super) fn select_to_plan(
@@ -66,9 +82,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         if !select.lateral_views.is_empty() {
             return not_impl_err!("LATERAL VIEWS");
         }
-        if select.qualify.is_some() {
-            return not_impl_err!("QUALIFY");
-        }
+
         if select.top.is_some() {
             return not_impl_err!("TOP");
         }
@@ -86,6 +100,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         // Handle named windows before processing the projection expression
         check_conflicting_windows(&select.named_window)?;
         self.match_window_definitions(&mut select.projection, &select.named_window)?;
+
         // Process the SELECT expressions
         let select_exprs = self.prepare_select_exprs(
             &base_plan,
@@ -148,12 +163,6 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             })
             .transpose()?;
 
-        // The outer expressions we will search through for aggregates.
-        // Aggregates may be sourced from the SELECT list or from the HAVING expression.
-        let aggr_expr_haystack = select_exprs.iter().chain(having_expr_opt.iter());
-        // All of the aggregate expressions (deduplicated).
-        let aggr_exprs = find_aggregate_exprs(aggr_expr_haystack);
-
         // All of the group by expressions
         let group_by_exprs = if let GroupByExpr::Expressions(exprs, _) = select.group_by {
             exprs
@@ -198,22 +207,85 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 .collect()
         };
 
+        // Optionally the QUALIFY expression.
+        let qualify_expr_opt = select
+            .qualify
+            .map::<Result<Expr>, _>(|qualify_expr| {
+                let qualify_expr = self.sql_expr_to_logical_expr(
+                    qualify_expr,
+                    &combined_schema,
+                    planner_context,
+                )?;
+                // This step "dereferences" any aliases in the QUALIFY clause.
+                //
+                // This is how we support queries with QUALIFY expressions that
+                // refer to aliased columns.
+                //
+                // For example:
+                //
+                //   select row_number() over (PARTITION BY id) as rk from users qualify rk > 1;
+                //
+                // are rewritten as, respectively:
+                //
+                //   select row_number() over (PARTITION BY id) as rk from users qualify row_number() over (PARTITION BY id) > 1;
+                //
+                let qualify_expr = resolve_aliases_to_exprs(qualify_expr, &alias_map)?;
+                normalize_col(qualify_expr, &projected_plan)
+            })
+            .transpose()?;
+
+        // The outer expressions we will search through for aggregates.
+        // First, find aggregates in SELECT, HAVING, and QUALIFY
+        let select_having_qualify_aggrs = find_aggregate_exprs(
+            select_exprs
+                .iter()
+                .chain(having_expr_opt.iter())
+                .chain(qualify_expr_opt.iter()),
+        );
+
+        // Find aggregates in ORDER BY
+        let order_by_aggrs = find_aggregate_exprs(order_by_rex.iter().map(|s| &s.expr));
+
+        // Combine: all aggregates from SELECT/HAVING/QUALIFY, plus ORDER BY aggregates
+        // that aren't already in SELECT/HAVING/QUALIFY
+        let mut aggr_exprs = select_having_qualify_aggrs;
+        for order_by_aggr in order_by_aggrs {
+            if !aggr_exprs.iter().any(|e| e == &order_by_aggr) {
+                aggr_exprs.push(order_by_aggr);
+            }
+        }
+
         // Process group by, aggregation or having
-        let (plan, mut select_exprs_post_aggr, having_expr_post_aggr) = if !group_by_exprs
-            .is_empty()
-            || !aggr_exprs.is_empty()
-        {
+        let AggregatePlanResult {
+            plan,
+            select_exprs: mut select_exprs_post_aggr,
+            having_expr: having_expr_post_aggr,
+            qualify_expr: qualify_expr_post_aggr,
+            order_by_exprs: order_by_rex,
+        } = if !group_by_exprs.is_empty() || !aggr_exprs.is_empty() {
             self.aggregate(
                 &base_plan,
                 &select_exprs,
                 having_expr_opt.as_ref(),
+                qualify_expr_opt.as_ref(),
+                &order_by_rex,
                 &group_by_exprs,
                 &aggr_exprs,
             )?
         } else {
             match having_expr_opt {
-                Some(having_expr) => return plan_err!("HAVING clause references: {having_expr} must appear in the GROUP BY clause or be used in an aggregate function"),
-                None => (base_plan.clone(), select_exprs.clone(), having_expr_opt)
+                Some(having_expr) => {
+                    return plan_err!(
+                        "HAVING clause references: {having_expr} must appear in the GROUP BY clause or be used in an aggregate function"
+                    );
+                }
+                None => AggregatePlanResult {
+                    plan: base_plan.clone(),
+                    select_exprs: select_exprs.clone(),
+                    having_expr: having_expr_opt,
+                    qualify_expr: qualify_expr_opt,
+                    order_by_exprs: order_by_rex,
+                },
             }
         };
 
@@ -225,9 +297,17 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             plan
         };
 
-        // Process window function
-        let window_func_exprs = find_window_exprs(&select_exprs_post_aggr);
+        // The outer expressions we will search through for window functions.
+        // Window functions may be sourced from the SELECT list or from the QUALIFY expression.
+        let windows_expr_haystack = select_exprs_post_aggr
+            .iter()
+            .chain(qualify_expr_post_aggr.iter());
+        // All of the window expressions (deduplicated and rewritten to reference aggregates as
+        // columns from input).
+        let window_func_exprs = find_window_exprs(windows_expr_haystack);
 
+        // Process window functions after aggregation as they can reference
+        // aggregate functions in their body
         let plan = if window_func_exprs.is_empty() {
             plan
         } else {
@@ -242,12 +322,46 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             plan
         };
 
+        // Process QUALIFY clause after window functions
+        // QUALIFY filters the results of window functions, similar to how HAVING filters aggregates
+        let plan = if let Some(qualify_expr) = qualify_expr_post_aggr {
+            // Validate that QUALIFY is used with window functions
+            if window_func_exprs.is_empty() {
+                return plan_err!(
+                    "QUALIFY clause requires window functions in the SELECT list or QUALIFY clause"
+                );
+            }
+
+            // now attempt to resolve columns and replace with fully-qualified columns
+            let windows_projection_exprs = window_func_exprs
+                .iter()
+                .map(|expr| resolve_columns(expr, &plan))
+                .collect::<Result<Vec<Expr>>>()?;
+
+            // Rewrite the qualify expression to reference columns from the window plan
+            let qualify_expr_post_window =
+                rebase_expr(&qualify_expr, &windows_projection_exprs, &plan)?;
+
+            // Validate that the qualify expression can be resolved from the window plan schema
+            self.validate_schema_satisfies_exprs(
+                plan.schema(),
+                std::slice::from_ref(&qualify_expr_post_window),
+            )?;
+
+            LogicalPlanBuilder::from(plan)
+                .filter(qualify_expr_post_window)?
+                .build()?
+        } else {
+            plan
+        };
+
         // Try processing unnest expression or do the final projection
         let plan = self.try_process_unnest(plan, select_exprs_post_aggr)?;
 
         // Process distinct clause
         let plan = match select.distinct {
             None => Ok(plan),
+            Some(Distinct::All) => Ok(plan),
             Some(Distinct::Distinct) => {
                 LogicalPlanBuilder::from(plan).distinct()?.build()
             }
@@ -256,7 +370,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     || !group_by_exprs.is_empty()
                     || !window_func_exprs.is_empty()
                 {
-                    return not_impl_err!("DISTINCT ON expressions with GROUP BY, aggregation or window functions are not supported ");
+                    return not_impl_err!(
+                        "DISTINCT ON expressions with GROUP BY, aggregation or window functions are not supported "
+                    );
                 }
 
                 let on_expr = on_expr
@@ -293,7 +409,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             plan
         };
 
-        self.order_by(plan, order_by_rex)
+        let plan = self.order_by(plan, order_by_rex)?;
+        Ok(plan)
     }
 
     /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection
@@ -307,6 +424,15 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
         let mut intermediate_plan = input;
         let mut intermediate_select_exprs = select_exprs;
+        // Fast path: If there is are no unnests in the select_exprs, wrap the plan in a projection
+        if !intermediate_select_exprs
+            .iter()
+            .any(has_unnest_expr_recursively)
+        {
+            return LogicalPlanBuilder::from(intermediate_plan)
+                .project(intermediate_select_exprs)?
+                .build();
+        }
 
         // Each expr in select_exprs can contains multiple unnest stage
         // The transformation happen bottom up, one at a time for each iteration
@@ -374,6 +500,12 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
     fn try_process_aggregate_unnest(&self, input: LogicalPlan) -> Result<LogicalPlan> {
         match input {
+            // Fast path if there are no unnest in group by
+            LogicalPlan::Aggregate(ref agg)
+                if !&agg.group_expr.iter().any(has_unnest_expr_recursively) =>
+            {
+                Ok(input)
+            }
             LogicalPlan::Aggregate(agg) => {
                 let agg_expr = agg.aggr_expr.clone();
                 let (new_input, new_group_by_exprs) =
@@ -497,7 +629,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         Ok((intermediate_plan, intermediate_select_exprs))
     }
 
-    fn plan_selection(
+    pub(crate) fn plan_selection(
         &self,
         selection: Option<SQLExpr>,
         plan: LogicalPlan,
@@ -506,11 +638,6 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         match selection {
             Some(predicate_expr) => {
                 let fallback_schemas = plan.fallback_normalize_schemas();
-                let outer_query_schema = planner_context.outer_query_schema().cloned();
-                let outer_query_schema_vec = outer_query_schema
-                    .as_ref()
-                    .map(|schema| vec![schema])
-                    .unwrap_or_else(Vec::new);
 
                 let filter_expr =
                     self.sql_to_expr(predicate_expr, plan.schema(), planner_context)?;
@@ -526,9 +653,19 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
                 let mut using_columns = HashSet::new();
                 expr_to_columns(&filter_expr, &mut using_columns)?;
+                let mut schema_stack: Vec<Vec<&DFSchema>> =
+                    vec![vec![plan.schema()], fallback_schemas];
+                for sc in planner_context.outer_schemas_iter() {
+                    schema_stack.push(vec![sc.as_ref()]);
+                }
+
                 let filter_expr = normalize_col_with_schemas_and_ambiguity_check(
                     filter_expr,
-                    &[&[plan.schema()], &fallback_schemas, &outer_query_schema_vec],
+                    schema_stack
+                        .iter()
+                        .map(|sc| sc.as_slice())
+                        .collect::<Vec<&[&DFSchema]>>()
+                        .as_slice(),
                     &[using_columns],
                 )?;
 
@@ -578,7 +715,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     }
 
     /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions.
-    fn prepare_select_exprs(
+    pub(crate) fn prepare_select_exprs(
         &self,
         plan: &LogicalPlan,
         projection: Vec<SelectItem>,
@@ -587,6 +724,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     ) -> Result<Vec<SelectExpr>> {
         let mut prepared_select_exprs = vec![];
         let mut error_builder = DataFusionErrorBuilder::new();
+
         for expr in projection {
             match self.sql_select_to_rex(expr, plan, empty_from, planner_context) {
                 Ok(expr) => prepared_select_exprs.push(expr),
@@ -655,7 +793,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     SelectItemQualifiedWildcardKind::Expr(_) => {
                         return plan_err!(
                             "Qualified wildcard with expression not supported"
-                        )
+                        );
                     }
                 };
                 let qualifier = self.object_name_to_table_reference(object_name)?;
@@ -737,7 +875,11 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     }
 
     /// Wrap a plan in a projection
-    fn project(&self, input: LogicalPlan, expr: Vec<SelectExpr>) -> Result<LogicalPlan> {
+    pub(crate) fn project(
+        &self,
+        input: LogicalPlan,
+        expr: Vec<SelectExpr>,
+    ) -> Result<LogicalPlan> {
         // convert to Expr for validate_schema_satisfies_exprs
         let exprs = expr
             .iter()
@@ -753,8 +895,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
     /// Create an aggregate plan.
     ///
-    /// An aggregate plan consists of grouping expressions, aggregate expressions, and an
-    /// optional HAVING expression (which is a filter on the output of the aggregate).
+    /// An aggregate plan consists of grouping expressions, aggregate expressions, an
+    /// optional HAVING expression (which is a filter on the output of the aggregate),
+    /// and an optional QUALIFY clause which may reference aggregates.
     ///
     /// # Arguments
     ///
@@ -762,27 +905,35 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     ///   "having" expressions must all be resolvable from this plan.
     /// * `select_exprs`    - The projection expressions from the SELECT clause.
     /// * `having_expr_opt` - Optional HAVING clause.
+    /// * `qualify_expr_opt` - Optional QUALIFY clause.
     /// * `group_by_exprs`  - Grouping expressions from the GROUP BY clause. These can be column
     ///   references or more complex expressions.
     /// * `aggr_exprs`      - Aggregate expressions, such as `SUM(a)` or `COUNT(1)`.
     ///
     /// # Return
     ///
-    /// The return value is a triplet of the following items:
+    /// The return value is a quadruplet of the following items:
     ///
     /// * `plan`                   - A [LogicalPlan::Aggregate] plan for the newly created aggregate.
     /// * `select_exprs_post_aggr` - The projection expressions rewritten to reference columns from
     ///   the aggregate
     /// * `having_expr_post_aggr`  - The "having" expression rewritten to reference a column from
     ///   the aggregate
+    /// * `qualify_expr_post_aggr`  - The "qualify" expression rewritten to reference a column from
+    ///   the aggregate
+    /// * `order_by_post_aggr`     - The ORDER BY expressions rewritten to reference columns from
+    ///   the aggregate
+    #[expect(clippy::too_many_arguments)]
     fn aggregate(
         &self,
         input: &LogicalPlan,
         select_exprs: &[Expr],
         having_expr_opt: Option<&Expr>,
+        qualify_expr_opt: Option<&Expr>,
+        order_by_exprs: &[SortExpr],
         group_by_exprs: &[Expr],
         aggr_exprs: &[Expr],
-    ) -> Result<(LogicalPlan, Vec<Expr>, Option<Expr>)> {
+    ) -> Result<AggregatePlanResult> {
         // create the aggregate plan
         let options =
             LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true);
@@ -846,7 +997,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         check_columns_satisfy_exprs(
             &column_exprs_post_aggr,
             &select_exprs_post_aggr,
-            CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate,
+            CheckColumnsSatisfyExprsPurpose::Aggregate(
+                CheckColumnsMustReferenceAggregatePurpose::Projection,
+            ),
         )?;
 
         // Rewrite the HAVING expression to use the columns produced by the
@@ -858,7 +1011,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             check_columns_satisfy_exprs(
                 &column_exprs_post_aggr,
                 std::slice::from_ref(&having_expr_post_aggr),
-                CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate,
+                CheckColumnsSatisfyExprsPurpose::Aggregate(
+                    CheckColumnsMustReferenceAggregatePurpose::Having,
+                ),
             )?;
 
             Some(having_expr_post_aggr)
@@ -866,7 +1021,89 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             None
         };
 
-        Ok((plan, select_exprs_post_aggr, having_expr_post_aggr))
+        // Rewrite the QUALIFY expression to use the columns produced by the
+        // aggregation.
+        let qualify_expr_post_aggr = if let Some(qualify_expr) = qualify_expr_opt {
+            let qualify_expr_post_aggr =
+                rebase_expr(qualify_expr, &aggr_projection_exprs, input)?;
+
+            check_columns_satisfy_exprs(
+                &column_exprs_post_aggr,
+                std::slice::from_ref(&qualify_expr_post_aggr),
+                CheckColumnsSatisfyExprsPurpose::Aggregate(
+                    CheckColumnsMustReferenceAggregatePurpose::Qualify,
+                ),
+            )?;
+
+            Some(qualify_expr_post_aggr)
+        } else {
+            None
+        };
+
+        // Rewrite the ORDER BY expressions to use the columns produced by the
+        // aggregation. If an ORDER BY expression matches a SELECT expression
+        // (ignoring aliases), use the SELECT's output column name to avoid
+        // duplication when the SELECT expression has an alias.
+        let order_by_post_aggr = order_by_exprs
+            .iter()
+            .map(|sort_expr| {
+                let rewritten_expr =
+                    rebase_expr(&sort_expr.expr, &aggr_projection_exprs, input)?;
+
+                // Check if this ORDER BY expression matches any aliased SELECT expression
+                // If so, use the SELECT's alias instead of the raw expression
+                let final_expr = select_exprs_post_aggr
+                    .iter()
+                    .find_map(|select_expr| {
+                        // Only consider aliased expressions
+                        if let Expr::Alias(alias) = select_expr {
+                            let rewritten_unaliased = match &rewritten_expr {
+                                Expr::Alias(a) => a.expr.as_ref(),
+                                other => other,
+                            };
+                            if alias.expr.as_ref() == rewritten_unaliased {
+                                return Some(Expr::Column(Column::new_unqualified(
+                                    alias.name.clone(),
+                                )));
+                            }
+                        }
+                        None
+                    })
+                    .unwrap_or(rewritten_expr);
+
+                Ok(sort_expr.with_expr(final_expr))
+            })
+            .collect::<Result<Vec<SortExpr>>>()?;
+
+        let all_valid_exprs: Vec<Expr> = column_exprs_post_aggr
+            .iter()
+            .cloned()
+            .chain(select_exprs_post_aggr.iter().filter_map(|e| {
+                if let Expr::Alias(alias) = e {
+                    Some(Expr::Column(Column::new_unqualified(alias.name.clone())))
+                } else {
+                    None
+                }
+            }))
+            .collect();
+
+        let order_by_exprs_only: Vec<Expr> =
+            order_by_post_aggr.iter().map(|s| s.expr.clone()).collect();
+        check_columns_satisfy_exprs(
+            &all_valid_exprs,
+            &order_by_exprs_only,
+            CheckColumnsSatisfyExprsPurpose::Aggregate(
+                CheckColumnsMustReferenceAggregatePurpose::OrderBy,
+            ),
+        )?;
+
+        Ok(AggregatePlanResult {
+            plan,
+            select_exprs: select_exprs_post_aggr,
+            having_expr: having_expr_post_aggr,
+            qualify_expr: qualify_expr_post_aggr,
+            order_by_exprs: order_by_post_aggr,
+        })
     }
 
     // If the projection is done over a named window, that window
@@ -886,33 +1123,32 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             {
                 let mut err = None;
                 let _ = visit_expressions_mut(expr, |expr| {
-                    if let SQLExpr::Function(f) = expr {
-                        if let Some(WindowType::NamedWindow(ident)) = &f.over {
-                            let normalized_ident =
-                                self.ident_normalizer.normalize(ident.clone());
-                            for (
-                                NamedWindowDefinition(_, window_expr),
-                                normalized_window_ident,
-                            ) in named_windows.iter()
-                            {
-                                if normalized_ident.eq(normalized_window_ident) {
-                                    f.over = Some(match window_expr {
-                                        NamedWindowExpr::NamedWindow(ident) => {
-                                            WindowType::NamedWindow(ident.clone())
-                                        }
-                                        NamedWindowExpr::WindowSpec(spec) => {
-                                            WindowType::WindowSpec(spec.clone())
-                                        }
-                                    })
-                                }
-                            }
-                            // All named windows must be defined with a WindowSpec.
-                            if let Some(WindowType::NamedWindow(ident)) = &f.over {
-                                err =
-                                    Some(plan_err!("The window {ident} is not defined!"));
-                                return ControlFlow::Break(());
+                    if let SQLExpr::Function(f) = expr
+                        && let Some(WindowType::NamedWindow(ident)) = &f.over
+                    {
+                        let normalized_ident =
+                            self.ident_normalizer.normalize(ident.clone());
+                        for (
+                            NamedWindowDefinition(_, window_expr),
+                            normalized_window_ident,
+                        ) in named_windows.iter()
+                        {
+                            if normalized_ident.eq(normalized_window_ident) {
+                                f.over = Some(match window_expr {
+                                    NamedWindowExpr::NamedWindow(ident) => {
+                                        WindowType::NamedWindow(ident.clone())
+                                    }
+                                    NamedWindowExpr::WindowSpec(spec) => {
+                                        WindowType::WindowSpec(spec.clone())
+                                    }
+                                })
                             }
                         }
+                        // All named windows must be defined with a WindowSpec.
+                        if let Some(WindowType::NamedWindow(ident)) = &f.over {
+                            err = Some(plan_err!("The window {ident} is not defined!"));
+                            return ControlFlow::Break(());
+                        }
                     }
                     ControlFlow::Continue(())
                 });
@@ -939,3 +1175,17 @@ fn check_conflicting_windows(window_defs: &[NamedWindowDefinition]) -> Result<()
     }
     Ok(())
 }
+
+/// Returns true if the expression recursively contains an `Expr::Unnest` expression
+fn has_unnest_expr_recursively(expr: &Expr) -> bool {
+    let mut has_unnest = false;
+    let _ = expr.apply(|e| {
+        if let Expr::Unnest(_) = e {
+            has_unnest = true;
+            Ok(TreeNodeRecursion::Stop)
+        } else {
+            Ok(TreeNodeRecursion::Continue)
+        }
+    });
+    has_unnest
+}
diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs
index 5b65e1c045bdc..d4e771cb48585 100644
--- a/datafusion/sql/src/set_expr.rs
+++ b/datafusion/sql/src/set_expr.rs
@@ -17,7 +17,7 @@
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
 use datafusion_common::{
-    not_impl_err, plan_err, DataFusionError, Diagnostic, Result, Span,
+    DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err,
 };
 use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};
 use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned};
diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs
index 458b3ac132179..b91e38e53776a 100644
--- a/datafusion/sql/src/statement.rs
+++ b/datafusion/sql/src/statement.rs
@@ -22,49 +22,50 @@ use std::sync::Arc;
 
 use crate::parser::{
     CopyToSource, CopyToStatement, CreateExternalTable, DFParser, ExplainStatement,
-    LexOrdering, Statement as DFStatement,
+    LexOrdering, ResetStatement, Statement as DFStatement,
 };
 use crate::planner::{
-    object_name_to_qualifier, ContextProvider, PlannerContext, SqlToRel,
+    ContextProvider, PlannerContext, SqlToRel, object_name_to_qualifier,
 };
 use crate::utils::normalize_ident;
 
-use arrow::datatypes::{DataType, Fields};
+use arrow::datatypes::{Field, FieldRef, Fields};
 use datafusion_common::error::_plan_err;
 use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::{
-    exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, schema_err,
-    unqualified_field_not_found, Column, Constraint, Constraints, DFSchema, DFSchemaRef,
-    DataFusionError, Result, ScalarValue, SchemaError, SchemaReference, TableReference,
-    ToDFSchema,
+    Column, Constraint, Constraints, DFSchema, DFSchemaRef, DataFusionError, Result,
+    ScalarValue, SchemaError, SchemaReference, TableReference, ToDFSchema, exec_err,
+    internal_err, not_impl_err, plan_datafusion_err, plan_err, schema_err,
+    unqualified_field_not_found,
 };
 use datafusion_expr::dml::{CopyTo, InsertOp};
 use datafusion_expr::expr_rewriter::normalize_col_with_schemas_and_ambiguity_check;
-use datafusion_expr::logical_plan::builder::project;
 use datafusion_expr::logical_plan::DdlStatement;
+use datafusion_expr::logical_plan::builder::project;
 use datafusion_expr::utils::expr_to_columns;
 use datafusion_expr::{
-    cast, col, Analyze, CreateCatalog, CreateCatalogSchema,
+    Analyze, CreateCatalog, CreateCatalogSchema,
     CreateExternalTable as PlanCreateExternalTable, CreateFunction, CreateFunctionBody,
     CreateIndex as PlanCreateIndex, CreateMemoryTable, CreateView, Deallocate,
     DescribeTable, DmlStatement, DropCatalogSchema, DropFunction, DropTable, DropView,
     EmptyRelation, Execute, Explain, ExplainFormat, Expr, ExprSchemable, Filter,
-    LogicalPlan, LogicalPlanBuilder, OperateFunctionArg, PlanType, Prepare, SetVariable,
-    SortExpr, Statement as PlanStatement, ToStringifiedPlan, TransactionAccessMode,
-    TransactionConclusion, TransactionEnd, TransactionIsolationLevel, TransactionStart,
-    Volatility, WriteOp,
+    LogicalPlan, LogicalPlanBuilder, OperateFunctionArg, PlanType, Prepare,
+    ResetVariable, SetVariable, SortExpr, Statement as PlanStatement, ToStringifiedPlan,
+    TransactionAccessMode, TransactionConclusion, TransactionEnd,
+    TransactionIsolationLevel, TransactionStart, Volatility, WriteOp, cast, col,
 };
 use sqlparser::ast::{
-    self, BeginTransactionKind, NullsDistinctOption, ShowStatementIn,
-    ShowStatementOptions, SqliteOnConflict, TableObject, UpdateTableFromKind,
-    ValueWithSpan,
+    self, BeginTransactionKind, CheckConstraint, ForeignKeyConstraint, IndexColumn,
+    IndexType, NullsDistinctOption, OrderByExpr, OrderByOptions, PrimaryKeyConstraint,
+    Set, ShowStatementIn, ShowStatementOptions, SqliteOnConflict, TableObject,
+    UniqueConstraint, Update, UpdateTableFromKind, ValueWithSpan,
 };
 use sqlparser::ast::{
     Assignment, AssignmentTarget, ColumnDef, CreateIndex, CreateTable,
     CreateTableOptions, Delete, DescribeAlias, Expr as SQLExpr, FromTable, Ident, Insert,
-    ObjectName, ObjectType, OneOrManyWithParens, Query, SchemaName, SetExpr,
-    ShowCreateObject, ShowStatementFilter, Statement, TableConstraint, TableFactor,
-    TableWithJoins, TransactionMode, UnaryOperator, Value,
+    ObjectName, ObjectType, Query, SchemaName, SetExpr, ShowCreateObject,
+    ShowStatementFilter, Statement, TableConstraint, TableFactor, TableWithJoins,
+    TransactionMode, UnaryOperator, Value,
 };
 use sqlparser::parser::ParserError::ParserError;
 
@@ -102,56 +103,98 @@ fn get_schema_name(schema_name: &SchemaName) -> String {
 /// Construct `TableConstraint`(s) for the given columns by iterating over
 /// `columns` and extracting individual inline constraint definitions.
 fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec<TableConstraint> {
-    let mut constraints = vec![];
+    let mut constraints: Vec<TableConstraint> = vec![];
     for column in columns {
         for ast::ColumnOptionDef { name, option } in &column.options {
             match option {
-                ast::ColumnOption::Unique {
-                    is_primary: false,
+                ast::ColumnOption::Unique(UniqueConstraint {
                     characteristics,
-                } => constraints.push(TableConstraint::Unique {
+                    name,
+                    index_name: _index_name,
+                    index_type_display: _index_type_display,
+                    index_type: _index_type,
+                    columns: _column,
+                    index_options: _index_options,
+                    nulls_distinct: _nulls_distinct,
+                }) => constraints.push(TableConstraint::Unique(UniqueConstraint {
                     name: name.clone(),
-                    columns: vec![column.name.clone()],
-                    characteristics: *characteristics,
                     index_name: None,
                     index_type_display: ast::KeyOrIndexDisplay::None,
                     index_type: None,
+                    columns: vec![IndexColumn {
+                        column: OrderByExpr {
+                            expr: SQLExpr::Identifier(column.name.clone()),
+                            options: OrderByOptions {
+                                asc: None,
+                                nulls_first: None,
+                            },
+                            with_fill: None,
+                        },
+                        operator_class: None,
+                    }],
                     index_options: vec![],
+                    characteristics: *characteristics,
                     nulls_distinct: NullsDistinctOption::None,
-                }),
-                ast::ColumnOption::Unique {
-                    is_primary: true,
+                })),
+                ast::ColumnOption::PrimaryKey(PrimaryKeyConstraint {
                     characteristics,
-                } => constraints.push(TableConstraint::PrimaryKey {
-                    name: name.clone(),
-                    columns: vec![column.name.clone()],
-                    characteristics: *characteristics,
-                    index_name: None,
-                    index_type: None,
-                    index_options: vec![],
-                }),
-                ast::ColumnOption::ForeignKey {
+                    name: _name,
+                    index_name: _index_name,
+                    index_type: _index_type,
+                    columns: _columns,
+                    index_options: _index_options,
+                }) => {
+                    constraints.push(TableConstraint::PrimaryKey(PrimaryKeyConstraint {
+                        name: name.clone(),
+                        index_name: None,
+                        index_type: None,
+                        columns: vec![IndexColumn {
+                            column: OrderByExpr {
+                                expr: SQLExpr::Identifier(column.name.clone()),
+                                options: OrderByOptions {
+                                    asc: None,
+                                    nulls_first: None,
+                                },
+                                with_fill: None,
+                            },
+                            operator_class: None,
+                        }],
+                        index_options: vec![],
+                        characteristics: *characteristics,
+                    }))
+                }
+                ast::ColumnOption::ForeignKey(ForeignKeyConstraint {
                     foreign_table,
                     referred_columns,
                     on_delete,
                     on_update,
                     characteristics,
-                } => constraints.push(TableConstraint::ForeignKey {
-                    name: name.clone(),
-                    columns: vec![],
-                    foreign_table: foreign_table.clone(),
-                    referred_columns: referred_columns.to_vec(),
-                    on_delete: *on_delete,
-                    on_update: *on_update,
-                    characteristics: *characteristics,
-                }),
-                ast::ColumnOption::Check(expr) => {
-                    constraints.push(TableConstraint::Check {
+                    name: _name,
+                    index_name: _index_name,
+                    columns: _columns,
+                    match_kind: _match_kind,
+                }) => {
+                    constraints.push(TableConstraint::ForeignKey(ForeignKeyConstraint {
                         name: name.clone(),
-                        expr: Box::new(expr.clone()),
-                    })
-                }
-                // Other options are not constraint related.
+                        index_name: None,
+                        columns: vec![],
+                        foreign_table: foreign_table.clone(),
+                        referred_columns: referred_columns.clone(),
+                        on_delete: *on_delete,
+                        on_update: *on_update,
+                        match_kind: None,
+                        characteristics: *characteristics,
+                    }))
+                }
+                ast::ColumnOption::Check(CheckConstraint {
+                    name,
+                    expr,
+                    enforced: _enforced,
+                }) => constraints.push(TableConstraint::Check(CheckConstraint {
+                    name: name.clone(),
+                    expr: expr.clone(),
+                    enforced: None,
+                })),
                 ast::ColumnOption::Default(_)
                 | ast::ColumnOption::Null
                 | ast::ColumnOption::NotNull
@@ -168,7 +211,9 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec<TableConst
                 | ast::ColumnOption::Policy(_)
                 | ast::ColumnOption::Tags(_)
                 | ast::ColumnOption::Alias(_)
-                | ast::ColumnOption::Collation(_) => {}
+                | ast::ColumnOption::Srid(_)
+                | ast::ColumnOption::Collation(_)
+                | ast::ColumnOption::Invisible => {}
             }
         }
     }
@@ -188,6 +233,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 format,
                 statement,
             }) => self.explain_to_plan(verbose, analyze, format, *statement),
+            DFStatement::Reset(statement) => self.reset_statement_to_plan(statement),
         }
     }
 
@@ -215,10 +261,20 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     ) -> Result<LogicalPlan> {
         match statement {
             Statement::ExplainTable {
-                describe_alias: DescribeAlias::Describe, // only parse 'DESCRIBE table_name' and not 'EXPLAIN table_name'
+                describe_alias: DescribeAlias::Describe | DescribeAlias::Desc, // only parse 'DESCRIBE table_name' or 'DESC table_name' and not 'EXPLAIN table_name'
                 table_name,
                 ..
             } => self.describe_table_to_plan(table_name),
+            Statement::Explain {
+                describe_alias: DescribeAlias::Describe | DescribeAlias::Desc, // only parse 'DESCRIBE statement' or 'DESC statement' and not 'EXPLAIN statement'
+                statement,
+                ..
+            } => match *statement {
+                Statement::Query(query) => self.describe_query_to_plan(*query),
+                _ => {
+                    not_impl_err!("Describing statements other than SELECT not supported")
+                }
+            },
             Statement::Explain {
                 verbose,
                 statement,
@@ -233,13 +289,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             }
             Statement::Query(query) => self.query_to_plan(*query, planner_context),
             Statement::ShowVariable { variable } => self.show_variable_to_plan(&variable),
-            Statement::SetVariable {
-                local,
-                hivevar,
-                variables,
-                value,
-            } => self.set_variable_to_plan(local, hivevar, &variables, value),
-
+            Statement::Set(statement) => self.set_statement_to_plan(statement),
             Statement::CreateTable(CreateTable {
                 temporary,
                 external,
@@ -254,18 +304,12 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 name,
                 columns,
                 constraints,
-                table_properties,
-                with_options,
                 if_not_exists,
                 or_replace,
                 without_rowid,
                 like,
                 clone,
-                engine,
                 comment,
-                auto_increment_offset,
-                default_charset,
-                collation,
                 on_commit,
                 on_cluster,
                 primary_key,
@@ -273,7 +317,6 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 partition_by,
                 cluster_by,
                 clustered_by,
-                options,
                 strict,
                 copy_grants,
                 enable_schema_evolution,
@@ -290,145 +333,170 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 catalog,
                 catalog_sync,
                 storage_serialization_policy,
-            }) if table_properties.is_empty() && with_options.is_empty() => {
+                inherits,
+                table_options: CreateTableOptions::None,
+                dynamic,
+                version,
+                target_lag,
+                warehouse,
+                refresh_mode,
+                initialize,
+                require_user,
+                partition_of,
+                for_values,
+            }) => {
                 if temporary {
-                    return not_impl_err!("Temporary tables not supported")?;
+                    return not_impl_err!("Temporary tables not supported");
                 }
                 if external {
-                    return not_impl_err!("External tables not supported")?;
+                    return not_impl_err!("External tables not supported");
                 }
                 if global.is_some() {
-                    return not_impl_err!("Global tables not supported")?;
+                    return not_impl_err!("Global tables not supported");
                 }
                 if transient {
-                    return not_impl_err!("Transient tables not supported")?;
+                    return not_impl_err!("Transient tables not supported");
                 }
                 if volatile {
-                    return not_impl_err!("Volatile tables not supported")?;
+                    return not_impl_err!("Volatile tables not supported");
                 }
                 if hive_distribution != ast::HiveDistributionStyle::NONE {
                     return not_impl_err!(
                         "Hive distribution not supported: {hive_distribution:?}"
-                    )?;
+                    );
                 }
-                if !matches!(
-                    hive_formats,
-                    Some(ast::HiveFormat {
-                        row_format: None,
-                        serde_properties: None,
-                        storage: None,
-                        location: None,
-                    })
-                ) {
-                    return not_impl_err!(
-                        "Hive formats not supported: {hive_formats:?}"
-                    )?;
+                if hive_formats.is_some()
+                    && !matches!(
+                        hive_formats,
+                        Some(ast::HiveFormat {
+                            row_format: None,
+                            serde_properties: None,
+                            storage: None,
+                            location: None,
+                        })
+                    )
+                {
+                    return not_impl_err!("Hive formats not supported: {hive_formats:?}");
                 }
                 if file_format.is_some() {
-                    return not_impl_err!("File format not supported")?;
+                    return not_impl_err!("File format not supported");
                 }
                 if location.is_some() {
-                    return not_impl_err!("Location not supported")?;
+                    return not_impl_err!("Location not supported");
                 }
                 if without_rowid {
-                    return not_impl_err!("Without rowid not supported")?;
+                    return not_impl_err!("Without rowid not supported");
                 }
                 if like.is_some() {
-                    return not_impl_err!("Like not supported")?;
+                    return not_impl_err!("Like not supported");
                 }
                 if clone.is_some() {
-                    return not_impl_err!("Clone not supported")?;
-                }
-                if engine.is_some() {
-                    return not_impl_err!("Engine not supported")?;
+                    return not_impl_err!("Clone not supported");
                 }
                 if comment.is_some() {
-                    return not_impl_err!("Comment not supported")?;
-                }
-                if auto_increment_offset.is_some() {
-                    return not_impl_err!("Auto increment offset not supported")?;
-                }
-                if default_charset.is_some() {
-                    return not_impl_err!("Default charset not supported")?;
-                }
-                if collation.is_some() {
-                    return not_impl_err!("Collation not supported")?;
+                    return not_impl_err!("Comment not supported");
                 }
                 if on_commit.is_some() {
-                    return not_impl_err!("On commit not supported")?;
+                    return not_impl_err!("On commit not supported");
                 }
                 if on_cluster.is_some() {
-                    return not_impl_err!("On cluster not supported")?;
+                    return not_impl_err!("On cluster not supported");
                 }
                 if primary_key.is_some() {
-                    return not_impl_err!("Primary key not supported")?;
+                    return not_impl_err!("Primary key not supported");
                 }
                 if order_by.is_some() {
-                    return not_impl_err!("Order by not supported")?;
+                    return not_impl_err!("Order by not supported");
                 }
                 if partition_by.is_some() {
-                    return not_impl_err!("Partition by not supported")?;
+                    return not_impl_err!("Partition by not supported");
                 }
                 if cluster_by.is_some() {
-                    return not_impl_err!("Cluster by not supported")?;
+                    return not_impl_err!("Cluster by not supported");
                 }
                 if clustered_by.is_some() {
-                    return not_impl_err!("Clustered by not supported")?;
-                }
-                if options.is_some() {
-                    return not_impl_err!("Options not supported")?;
+                    return not_impl_err!("Clustered by not supported");
                 }
                 if strict {
-                    return not_impl_err!("Strict not supported")?;
+                    return not_impl_err!("Strict not supported");
                 }
                 if copy_grants {
-                    return not_impl_err!("Copy grants not supported")?;
+                    return not_impl_err!("Copy grants not supported");
                 }
                 if enable_schema_evolution.is_some() {
-                    return not_impl_err!("Enable schema evolution not supported")?;
+                    return not_impl_err!("Enable schema evolution not supported");
                 }
                 if change_tracking.is_some() {
-                    return not_impl_err!("Change tracking not supported")?;
+                    return not_impl_err!("Change tracking not supported");
                 }
                 if data_retention_time_in_days.is_some() {
-                    return not_impl_err!("Data retention time in days not supported")?;
+                    return not_impl_err!("Data retention time in days not supported");
                 }
                 if max_data_extension_time_in_days.is_some() {
                     return not_impl_err!(
                         "Max data extension time in days not supported"
-                    )?;
+                    );
                 }
                 if default_ddl_collation.is_some() {
-                    return not_impl_err!("Default DDL collation not supported")?;
+                    return not_impl_err!("Default DDL collation not supported");
                 }
                 if with_aggregation_policy.is_some() {
-                    return not_impl_err!("With aggregation policy not supported")?;
+                    return not_impl_err!("With aggregation policy not supported");
                 }
                 if with_row_access_policy.is_some() {
-                    return not_impl_err!("With row access policy not supported")?;
+                    return not_impl_err!("With row access policy not supported");
                 }
                 if with_tags.is_some() {
-                    return not_impl_err!("With tags not supported")?;
+                    return not_impl_err!("With tags not supported");
                 }
                 if iceberg {
-                    return not_impl_err!("Iceberg not supported")?;
+                    return not_impl_err!("Iceberg not supported");
                 }
                 if external_volume.is_some() {
-                    return not_impl_err!("External volume not supported")?;
+                    return not_impl_err!("External volume not supported");
                 }
                 if base_location.is_some() {
-                    return not_impl_err!("Base location not supported")?;
+                    return not_impl_err!("Base location not supported");
                 }
                 if catalog.is_some() {
-                    return not_impl_err!("Catalog not supported")?;
+                    return not_impl_err!("Catalog not supported");
                 }
                 if catalog_sync.is_some() {
-                    return not_impl_err!("Catalog sync not supported")?;
+                    return not_impl_err!("Catalog sync not supported");
                 }
                 if storage_serialization_policy.is_some() {
-                    return not_impl_err!("Storage serialization policy not supported")?;
+                    return not_impl_err!("Storage serialization policy not supported");
+                }
+                if inherits.is_some() {
+                    return not_impl_err!("Table inheritance not supported");
+                }
+                if dynamic {
+                    return not_impl_err!("Dynamic tables not supported");
+                }
+                if version.is_some() {
+                    return not_impl_err!("Version not supported");
+                }
+                if target_lag.is_some() {
+                    return not_impl_err!("Target lag not supported");
+                }
+                if warehouse.is_some() {
+                    return not_impl_err!("Warehouse not supported");
+                }
+                if refresh_mode.is_some() {
+                    return not_impl_err!("Refresh mode not supported");
+                }
+                if initialize.is_some() {
+                    return not_impl_err!("Initialize not supported");
+                }
+                if require_user {
+                    return not_impl_err!("Require user not supported");
+                }
+                if partition_of.is_some() {
+                    return not_impl_err!("PARTITION OF not supported");
+                }
+                if for_values.is_some() {
+                    return not_impl_err!("PARTITION OF .. FOR VALUES .. not supported");
                 }
-
                 // Merge inline constraints and existing constraints
                 let mut all_constraints = constraints;
                 let inline_constraints = calc_inline_constraints_from_columns(&columns);
@@ -451,10 +519,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         let plan = if has_columns {
                             if schema.fields().len() != input_schema.fields().len() {
                                 return plan_err!(
-                            "Mismatch: {} columns specified, but result has {} columns",
-                            schema.fields().len(),
-                            input_schema.fields().len()
-                        );
+                                    "Mismatch: {} columns specified, but result has {} columns",
+                                    schema.fields().len(),
+                                    input_schema.fields().len()
+                                );
                             }
                             let input_fields = input_schema.fields();
                             let project_exprs = schema
@@ -519,8 +587,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     }
                 }
             }
-
-            Statement::CreateView {
+            Statement::CreateView(ast::CreateView {
                 or_replace,
                 materialized,
                 name,
@@ -534,7 +601,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 temporary,
                 to,
                 params,
-            } => {
+                or_alter,
+                secure,
+                name_before_not_exists,
+            }) => {
                 if materialized {
                     return not_impl_err!("Materialized views not supported")?;
                 }
@@ -556,7 +626,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
                 // put the statement back together temporarily to get the SQL
                 // string representation
-                let stmt = Statement::CreateView {
+                let stmt = Statement::CreateView(ast::CreateView {
                     or_replace,
                     materialized,
                     name,
@@ -570,16 +640,19 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     temporary,
                     to,
                     params,
-                };
+                    or_alter,
+                    secure,
+                    name_before_not_exists,
+                });
                 let sql = stmt.to_string();
-                let Statement::CreateView {
+                let Statement::CreateView(ast::CreateView {
                     name,
                     columns,
                     query,
                     or_replace,
                     temporary,
                     ..
-                } = stmt
+                }) = stmt
                 else {
                     return internal_err!("Unreachable code in create view");
                 };
@@ -617,6 +690,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             Statement::CreateSchema {
                 schema_name,
                 if_not_exists,
+                ..
             } => Ok(LogicalPlan::Ddl(DdlStatement::CreateCatalogSchema(
                 CreateCatalogSchema {
                     schema_name: get_schema_name(&schema_name),
@@ -643,6 +717,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 restrict: _,
                 purge: _,
                 temporary: _,
+                table: _,
             } => {
                 // We don't support cascade and purge for now.
                 // nor do we support multiple object names
@@ -672,18 +747,31 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     }
                     ObjectType::Schema => {
                         let name = match name {
-                            TableReference::Bare { table } => Ok(SchemaReference::Bare { schema: table }),
-                            TableReference::Partial { schema, table } => Ok(SchemaReference::Full { schema: table, catalog: schema }),
-                            TableReference::Full { catalog: _, schema: _, table: _ } => {
-                                Err(ParserError("Invalid schema specifier (has 3 parts)".to_string()))
+                            TableReference::Bare { table } => {
+                                Ok(SchemaReference::Bare { schema: table })
+                            }
+                            TableReference::Partial { schema, table } => {
+                                Ok(SchemaReference::Full {
+                                    schema: table,
+                                    catalog: schema,
+                                })
                             }
+                            TableReference::Full {
+                                catalog: _,
+                                schema: _,
+                                table: _,
+                            } => Err(ParserError(
+                                "Invalid schema specifier (has 3 parts)".to_string(),
+                            )),
                         }?;
-                        Ok(LogicalPlan::Ddl(DdlStatement::DropCatalogSchema(DropCatalogSchema {
-                            name,
-                            if_exists,
-                            cascade,
-                            schema: DFSchemaRef::new(DFSchema::empty()),
-                        })))
+                        Ok(LogicalPlan::Ddl(DdlStatement::DropCatalogSchema(
+                            DropCatalogSchema {
+                                name,
+                                if_exists,
+                                cascade,
+                                schema: DFSchemaRef::new(DFSchema::empty()),
+                            },
+                        )))
                     }
                     _ => not_impl_err!(
                         "Only `DROP TABLE/VIEW/SCHEMA  ...` statement is supported currently"
@@ -696,14 +784,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 statement,
             } => {
                 // Convert parser data types to DataFusion data types
-                let mut data_types: Vec<DataType> = data_types
+                let mut fields: Vec<FieldRef> = data_types
                     .into_iter()
-                    .map(|t| self.convert_data_type(&t))
+                    .map(|t| self.convert_data_type_to_field(&t))
                     .collect::<Result<_>>()?;
 
                 // Create planner context with parameters
-                let mut planner_context = PlannerContext::new()
-                    .with_prepare_param_data_types(data_types.clone());
+                let mut planner_context =
+                    PlannerContext::new().with_prepare_param_data_types(fields.clone());
 
                 // Build logical plan for inner statement of the prepare statement
                 let plan = self.sql_statement_to_plan_with_context_impl(
@@ -711,21 +799,21 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     &mut planner_context,
                 )?;
 
-                if data_types.is_empty() {
-                    let map_types = plan.get_parameter_types()?;
+                if fields.is_empty() {
+                    let map_types = plan.get_parameter_fields()?;
                     let param_types: Vec<_> = (1..=map_types.len())
                         .filter_map(|i| {
                             let key = format!("${i}");
                             map_types.get(&key).and_then(|opt| opt.clone())
                         })
                         .collect();
-                    data_types.extend(param_types.iter().cloned());
+                    fields.extend(param_types.iter().cloned());
                     planner_context.with_prepare_param_data_types(param_types);
                 }
 
                 Ok(LogicalPlan::Statement(PlanStatement::Prepare(Prepare {
                     name: ident_to_string(&name),
-                    data_types,
+                    fields,
                     input: Arc::new(plan),
                 })))
             }
@@ -738,6 +826,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 has_parentheses: _,
                 immediate,
                 into,
+                output,
+                default,
             } => {
                 // `USING` is a MySQL-specific syntax and currently not supported.
                 if !using.is_empty() {
@@ -753,6 +843,16 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 if !into.is_empty() {
                     return not_impl_err!("Execute statement with INTO is not supported");
                 }
+                if output {
+                    return not_impl_err!(
+                        "Execute statement with OUTPUT is not supported"
+                    );
+                }
+                if default {
+                    return not_impl_err!(
+                        "Execute statement with DEFAULT is not supported"
+                    );
+                }
                 let empty_schema = DFSchema::empty();
                 let parameters = parameters
                     .into_iter()
@@ -895,11 +995,15 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 has_table_keyword,
                 settings,
                 format_clause,
+                insert_token: _, // record the location the `INSERT` token
+                optimizer_hint,
             }) => {
                 let table_name = match table {
                     TableObject::TableName(table_name) => table_name,
                     TableObject::TableFunction(_) => {
-                        return not_impl_err!("INSERT INTO Table functions not supported")
+                        return not_impl_err!(
+                            "INSERT INTO Table functions not supported"
+                        );
                     }
                 };
                 if let Some(or) = or {
@@ -948,36 +1052,57 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 if format_clause.is_some() {
                     plan_err!("Inserts with format clause not supported")?;
                 }
+                if optimizer_hint.is_some() {
+                    plan_err!("Optimizer hints not supported")?;
+                }
                 // optional keywords don't change behavior
                 let _ = into;
                 let _ = has_table_keyword;
                 self.insert_to_plan(table_name, columns, source, overwrite, replace_into)
             }
-            Statement::Update {
+            Statement::Update(Update {
                 table,
                 assignments,
                 from,
                 selection,
                 returning,
                 or,
-            } => {
-                let froms =
+                limit,
+                update_token: _,
+                optimizer_hint,
+            }) => {
+                let from_clauses =
                     from.map(|update_table_from_kind| match update_table_from_kind {
-                        UpdateTableFromKind::BeforeSet(froms) => froms,
-                        UpdateTableFromKind::AfterSet(froms) => froms,
+                        UpdateTableFromKind::BeforeSet(from_clauses) => from_clauses,
+                        UpdateTableFromKind::AfterSet(from_clauses) => from_clauses,
                     });
                 // TODO: support multiple tables in UPDATE SET FROM
-                if froms.as_ref().is_some_and(|f| f.len() > 1) {
-                    plan_err!("Multiple tables in UPDATE SET FROM not yet supported")?;
+                if from_clauses.as_ref().is_some_and(|f| f.len() > 1) {
+                    not_impl_err!(
+                        "Multiple tables in UPDATE SET FROM not yet supported"
+                    )?;
                 }
-                let update_from = froms.and_then(|mut f| f.pop());
+                let update_from = from_clauses.and_then(|mut f| f.pop());
+
+                // UPDATE ... FROM is currently not working
+                // TODO fix https://github.com/apache/datafusion/issues/19950
+                if update_from.is_some() {
+                    return not_impl_err!("UPDATE ... FROM is not supported");
+                }
+
                 if returning.is_some() {
                     plan_err!("Update-returning clause not yet supported")?;
                 }
                 if or.is_some() {
                     plan_err!("ON conflict not supported")?;
                 }
-                self.update_to_plan(table, assignments, update_from, selection)
+                if limit.is_some() {
+                    return not_impl_err!("Update-limit clause not supported")?;
+                }
+                if optimizer_hint.is_some() {
+                    plan_err!("Optimizer hints not supported")?;
+                }
+                self.update_to_plan(table, &assignments, update_from, selection)
             }
 
             Statement::Delete(Delete {
@@ -988,6 +1113,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 from,
                 order_by,
                 limit,
+                delete_token: _,
+                optimizer_hint,
             }) => {
                 if !tables.is_empty() {
                     plan_err!("DELETE <TABLE> not supported")?;
@@ -1005,12 +1132,12 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     plan_err!("Delete-order-by clause not yet supported")?;
                 }
 
-                if limit.is_some() {
-                    plan_err!("Delete-limit clause not yet supported")?;
+                if optimizer_hint.is_some() {
+                    plan_err!("Optimizer hints not supported")?;
                 }
 
                 let table_name = self.get_delete_target(from)?;
-                self.delete_to_plan(table_name, selection)
+                self.delete_to_plan(&table_name, selection, limit)
             }
 
             Statement::StartTransaction {
@@ -1019,8 +1146,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 modifier,
                 transaction,
                 statements,
-                exception_statements,
                 has_end_keyword,
+                exception,
             } => {
                 if let Some(modifier) = modifier {
                     return not_impl_err!(
@@ -1032,7 +1159,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         "Transaction with multiple statements not supported"
                     );
                 }
-                if exception_statements.is_some() {
+                if exception.is_some() {
                     return not_impl_err!(
                         "Transaction with exception statements not supported"
                     );
@@ -1040,7 +1167,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 if has_end_keyword {
                     return not_impl_err!("Transaction with END keyword not supported");
                 }
-                self.validate_transaction_kind(transaction)?;
+                self.validate_transaction_kind(transaction.as_ref())?;
                 let isolation_level: ast::TransactionIsolationLevel = modes
                     .iter()
                     .filter_map(|m: &TransactionMode| match m {
@@ -1129,7 +1256,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 ..
             }) => {
                 let return_type = match return_type {
-                    Some(t) => Some(self.convert_data_type(&t)?),
+                    Some(t) => Some(self.convert_data_type_to_field(&t)?),
                     None => None,
                 };
                 let mut planner_context = PlannerContext::new();
@@ -1140,7 +1267,8 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         let function_args = function_args
                             .into_iter()
                             .map(|arg| {
-                                let data_type = self.convert_data_type(&arg.data_type)?;
+                                let data_type =
+                                    self.convert_data_type_to_field(&arg.data_type)?;
 
                                 let default_expr = match arg.default_expr {
                                     Some(expr) => Some(self.sql_to_expr(
@@ -1153,7 +1281,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                                 Ok(OperateFunctionArg {
                                     name: arg.name,
                                     default_expr,
-                                    data_type,
+                                    data_type: data_type.data_type().clone(),
                                 })
                             })
                             .collect::<Result<Vec<OperateFunctionArg>>>();
@@ -1161,6 +1289,27 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     }
                     None => None,
                 };
+                // Validate default arguments
+                let first_default = match args.as_ref() {
+                    Some(arg) => arg.iter().position(|t| t.default_expr.is_some()),
+                    None => None,
+                };
+                let last_non_default = match args.as_ref() {
+                    Some(arg) => arg
+                        .iter()
+                        .rev()
+                        .position(|t| t.default_expr.is_none())
+                        .map(|reverse_pos| arg.len() - reverse_pos - 1),
+                    None => None,
+                };
+                if let (Some(pos_default), Some(pos_non_default)) =
+                    (first_default, last_non_default)
+                    && pos_non_default > pos_default
+                {
+                    return plan_err!(
+                        "Non-default arguments cannot follow default arguments."
+                    );
+                }
                 // At the moment functions can't be qualified `schema.name`
                 let name = match &name.0[..] {
                     [] => exec_err!("Function should have name")?,
@@ -1171,17 +1320,47 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 // Convert resulting expression to data fusion expression
                 //
                 let arg_types = args.as_ref().map(|arg| {
-                    arg.iter().map(|t| t.data_type.clone()).collect::<Vec<_>>()
+                    arg.iter()
+                        .map(|t| {
+                            let name = match t.name.clone() {
+                                Some(name) => name.value,
+                                None => "".to_string(),
+                            };
+                            Arc::new(Field::new(name, t.data_type.clone(), true))
+                        })
+                        .collect::<Vec<_>>()
                 });
+                // Validate parameter style
+                if let Some(ref fields) = arg_types {
+                    let count_positional =
+                        fields.iter().filter(|f| f.name() == "").count();
+                    if !(count_positional == 0 || count_positional == fields.len()) {
+                        return plan_err!(
+                            "All function arguments must use either named or positional style."
+                        );
+                    }
+                }
                 let mut planner_context = PlannerContext::new()
                     .with_prepare_param_data_types(arg_types.unwrap_or_default());
 
                 let function_body = match function_body {
                     Some(r) => Some(self.sql_to_expr(
                         match r {
-                            ast::CreateFunctionBody::AsBeforeOptions(expr) => expr,
+                            // `link_symbol` indicates if the primary expression contains the name of shared library file.
+                            ast::CreateFunctionBody::AsBeforeOptions{body: expr, link_symbol: _link_symbol} => expr,
                             ast::CreateFunctionBody::AsAfterOptions(expr) => expr,
                             ast::CreateFunctionBody::Return(expr) => expr,
+                            ast::CreateFunctionBody::AsBeginEnd(_) => {
+                                return not_impl_err!(
+                                    "BEGIN/END enclosed function body syntax is not supported"
+                                )?;
+                            }
+                            ast::CreateFunctionBody::AsReturnExpr(_)
+                            | ast::CreateFunctionBody::AsReturnSelect(_) => {
+                                return not_impl_err!(
+                                    "AS RETURN function syntax is not supported"
+                                )?
+                            }
                         },
                         &DFSchema::empty(),
                         &mut planner_context,
@@ -1203,7 +1382,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     or_replace,
                     temporary,
                     name,
-                    return_type,
+                    return_type: return_type.map(|f| f.data_type().clone()),
                     args,
                     params,
                     schema: DFSchemaRef::new(DFSchema::empty()),
@@ -1211,11 +1390,11 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
 
                 Ok(LogicalPlan::Ddl(statement))
             }
-            Statement::DropFunction {
+            Statement::DropFunction(ast::DropFunction {
                 if_exists,
                 func_desc,
-                ..
-            } => {
+                drop_behavior: _,
+            }) => {
                 // According to postgresql documentation it can be only one function
                 // specified in drop statement
                 if let Some(desc) = func_desc.first() {
@@ -1235,6 +1414,60 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     exec_err!("Function name not provided")
                 }
             }
+            Statement::Truncate(ast::Truncate {
+                table_names,
+                partitions,
+                identity,
+                cascade,
+                on_cluster,
+                table,
+                if_exists,
+            }) => {
+                let _ = table; // Support TRUNCATE TABLE and TRUNCATE syntax
+                if table_names.len() != 1 {
+                    return not_impl_err!(
+                        "TRUNCATE with multiple tables is not supported"
+                    );
+                }
+
+                let target = &table_names[0];
+                if target.only {
+                    return not_impl_err!("TRUNCATE with ONLY is not supported");
+                }
+                if partitions.is_some() {
+                    return not_impl_err!("TRUNCATE with PARTITION is not supported");
+                }
+                if identity.is_some() {
+                    return not_impl_err!(
+                        "TRUNCATE with RESTART/CONTINUE IDENTITY is not supported"
+                    );
+                }
+                if cascade.is_some() {
+                    return not_impl_err!(
+                        "TRUNCATE with CASCADE/RESTRICT is not supported"
+                    );
+                }
+                if on_cluster.is_some() {
+                    return not_impl_err!("TRUNCATE with ON CLUSTER is not supported");
+                }
+                if if_exists {
+                    return not_impl_err!("TRUNCATE .. with IF EXISTS is not supported");
+                }
+                let table = self.object_name_to_table_reference(target.name.clone())?;
+                let source = self.context_provider.get_table_source(table.clone())?;
+
+                // TRUNCATE does not operate on input rows. The EmptyRelation is a logical placeholder
+                // since the real operation is executed directly by the TableProvider's truncate() hook.
+                Ok(LogicalPlan::Dml(DmlStatement::new(
+                    table.clone(),
+                    source,
+                    WriteOp::Truncate,
+                    Arc::new(LogicalPlan::EmptyRelation(EmptyRelation {
+                        produce_one_row: false,
+                        schema: DFSchemaRef::new(DFSchema::empty()),
+                    })),
+                )))
+            }
             Statement::CreateIndex(CreateIndex {
                 name,
                 table_name,
@@ -1251,9 +1484,15 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     .get_table_source(table.clone())?
                     .schema()
                     .to_dfschema_ref()?;
-                let using: Option<String> = using.as_ref().map(ident_to_string);
+                let using: Option<String> =
+                    using.as_ref().map(|index_type| match index_type {
+                        IndexType::Custom(ident) => ident_to_string(ident),
+                        _ => index_type.to_string().to_ascii_lowercase(),
+                    });
+                let order_by_exprs: Vec<OrderByExpr> =
+                    columns.into_iter().map(|col| col.column).collect();
                 let columns = self.order_by_to_sort_expr(
-                    columns,
+                    order_by_exprs,
                     &table_schema,
                     planner_context,
                     false,
@@ -1329,6 +1568,19 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         }))
     }
 
+    fn describe_query_to_plan(&self, query: Query) -> Result<LogicalPlan> {
+        let plan = self.query_to_plan(query, &mut PlannerContext::new())?;
+
+        let schema = Arc::new(plan.schema().as_arrow().clone());
+
+        let output_schema = DFSchema::try_from(LogicalPlan::describe_schema()).unwrap();
+
+        Ok(LogicalPlan::DescribeTable(DescribeTable {
+            schema,
+            output_schema: Arc::new(output_schema),
+        }))
+    }
+
     fn copy_to_plan(&self, statement: CopyToStatement) -> Result<LogicalPlan> {
         // Determine if source is table or query and handle accordingly
         let copy_source = statement.source;
@@ -1388,13 +1640,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             .map(|f| f.name().to_owned())
             .collect();
 
-        Ok(LogicalPlan::Copy(CopyTo {
-            input: Arc::new(input),
-            output_url: statement.target,
-            file_type,
+        Ok(LogicalPlan::Copy(CopyTo::new(
+            Arc::new(input),
+            statement.target,
             partition_by,
-            options: options_map,
-        }))
+            file_type,
+            options_map,
+        )))
     }
 
     fn build_order_by(
@@ -1412,23 +1664,23 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         .map(|order_by_expr| {
                             let ordered_expr = &order_by_expr.expr;
                             let ordered_expr = ordered_expr.to_owned();
-                            let ordered_expr = self
-                                .sql_expr_to_logical_expr(
-                                    ordered_expr,
-                                    schema,
-                                    planner_context,
-                                )
-                                .unwrap();
+                            let ordered_expr = self.sql_expr_to_logical_expr(
+                                ordered_expr,
+                                schema,
+                                planner_context,
+                            )?;
                             let asc = order_by_expr.options.asc.unwrap_or(true);
                             let nulls_first =
-                                order_by_expr.options.nulls_first.unwrap_or(!asc);
+                                order_by_expr.options.nulls_first.unwrap_or_else(|| {
+                                    self.options.default_null_ordering.nulls_first(asc)
+                                });
 
-                            SortExpr::new(ordered_expr, asc, nulls_first)
+                            Ok(SortExpr::new(ordered_expr, asc, nulls_first))
                         })
-                        .collect::<Vec<SortExpr>>();
-                    result
+                        .collect::<Result<Vec<SortExpr>>>()?;
+                    Ok(result)
                 })
-                .collect::<Vec<Vec<SortExpr>>>();
+                .collect::<Result<Vec<Vec<SortExpr>>>>()?;
 
             return Ok(results);
         }
@@ -1471,6 +1723,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             unbounded,
             options,
             constraints,
+            or_replace,
         } = statement;
 
         // Merge inline constraints and existing constraints
@@ -1512,21 +1765,18 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let constraints =
             self.new_constraint_from_table_constraints(&all_constraints, &df_schema)?;
         Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable(
-            PlanCreateExternalTable {
-                schema: df_schema,
-                name,
-                location,
-                file_type,
-                table_partition_cols,
-                if_not_exists,
-                temporary,
-                definition,
-                order_exprs: ordered_exprs,
-                unbounded,
-                options: options_map,
-                constraints,
-                column_defaults,
-            },
+            PlanCreateExternalTable::builder(name, location, file_type, df_schema)
+                .with_partition_cols(table_partition_cols)
+                .with_if_not_exists(if_not_exists)
+                .with_or_replace(or_replace)
+                .with_temporary(temporary)
+                .with_definition(definition)
+                .with_order_exprs(ordered_exprs)
+                .with_unbounded(unbounded)
+                .with_options(options_map)
+                .with_constraints(constraints)
+                .with_column_defaults(column_defaults)
+                .build(),
         )))
     }
 
@@ -1535,13 +1785,21 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     fn get_constraint_column_indices(
         &self,
         df_schema: &DFSchemaRef,
-        columns: &[Ident],
+        columns: &[IndexColumn],
         constraint_name: &str,
     ) -> Result<Vec<usize>> {
         let field_names = df_schema.field_names();
         columns
             .iter()
-            .map(|ident| {
+            .map(|index_column| {
+                let expr = &index_column.column.expr;
+                let ident = if let SQLExpr::Identifier(ident) = expr {
+                    ident
+                } else {
+                    return Err(plan_datafusion_err!(
+                        "Column name for {constraint_name} must be an identifier: {expr}"
+                    ));
+                };
                 let column = self.ident_normalizer.normalize(ident.clone());
                 field_names
                     .iter()
@@ -1564,8 +1822,17 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let constraints = constraints
             .iter()
             .map(|c: &TableConstraint| match c {
-                TableConstraint::Unique { name, columns, .. } => {
-                    let constraint_name = match name {
+                TableConstraint::Unique(UniqueConstraint {
+                    name,
+                    index_name: _,
+                    index_type_display: _,
+                    index_type: _,
+                    columns,
+                    index_options: _,
+                    characteristics: _,
+                    nulls_distinct: _,
+                }) => {
+                    let constraint_name = match &name {
                         Some(name) => &format!("unique constraint with name '{name}'"),
                         None => "unique constraint",
                     };
@@ -1577,7 +1844,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     )?;
                     Ok(Constraint::Unique(indices))
                 }
-                TableConstraint::PrimaryKey { columns, .. } => {
+                TableConstraint::PrimaryKey(PrimaryKeyConstraint {
+                    name: _,
+                    index_name: _,
+                    index_type: _,
+                    columns,
+                    index_options: _,
+                    characteristics: _,
+                }) => {
                     // Get primary key indices in the schema
                     let indices = self.get_constraint_column_indices(
                         df_schema,
@@ -1670,10 +1944,15 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 vec![plan.to_stringified(PlanType::InitialLogicalPlan)];
 
             // default to configuration value
+            // verbose mode only supports indent format
             let options = self.context_provider.options();
-            let format = format.as_ref().unwrap_or(&options.explain.format);
-
-            let format: ExplainFormat = format.parse()?;
+            let format = if verbose {
+                ExplainFormat::Indent
+            } else if let Some(format) = format {
+                ExplainFormat::from_str(&format)?
+            } else {
+                options.explain.format.clone()
+            };
 
             Ok(LogicalPlan::Explain(Explain {
                 verbose,
@@ -1724,7 +2003,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 .iter()
                 .any(|opt| opt.key == variable);
 
-            if !is_valid_variable {
+            // Check if it's a runtime variable
+            let is_runtime_variable = variable.starts_with("datafusion.runtime.");
+
+            if !is_valid_variable && !is_runtime_variable {
                 return plan_err!(
                     "'{variable}' is not a variable which can be viewed with 'SHOW'"
                 );
@@ -1739,70 +2021,86 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         self.statement_to_plan(rewrite.pop_front().unwrap())
     }
 
-    fn set_variable_to_plan(
-        &self,
-        local: bool,
-        hivevar: bool,
-        variables: &OneOrManyWithParens<ObjectName>,
-        value: Vec<SQLExpr>,
-    ) -> Result<LogicalPlan> {
-        if local {
-            return not_impl_err!("LOCAL is not supported");
-        }
-
-        if hivevar {
-            return not_impl_err!("HIVEVAR is not supported");
-        }
+    fn set_statement_to_plan(&self, statement: Set) -> Result<LogicalPlan> {
+        match statement {
+            Set::SingleAssignment {
+                scope,
+                hivevar,
+                variable,
+                values,
+            } => {
+                if scope.is_some() {
+                    return not_impl_err!("SET with scope modifiers is not supported");
+                }
 
-        let variable = match variables {
-            OneOrManyWithParens::One(v) => object_name_to_string(v),
-            OneOrManyWithParens::Many(vs) => {
-                return not_impl_err!(
-                    "SET only supports single variable assignment: {vs:?}"
-                );
-            }
-        };
-        let mut variable_lower = variable.to_lowercase();
+                if hivevar {
+                    return not_impl_err!("SET HIVEVAR is not supported");
+                }
 
-        if variable_lower == "timezone" || variable_lower == "time.zone" {
-            // We could introduce alias in OptionDefinition if this string matching thing grows
-            variable_lower = "datafusion.execution.time_zone".to_string();
-        }
+                let variable = object_name_to_string(&variable);
+                let mut variable_lower = variable.to_lowercase();
 
-        // Parse value string from Expr
-        let value_string = match &value[0] {
-            SQLExpr::Identifier(i) => ident_to_string(i),
-            SQLExpr::Value(v) => match crate::utils::value_to_string(&v.value) {
-                None => {
-                    return plan_err!("Unsupported Value {}", value[0]);
+                // Map PostgreSQL "timezone" and MySQL "time.zone" aliases to DataFusion's canonical name
+                if variable_lower == "timezone" || variable_lower == "time.zone" {
+                    variable_lower = "datafusion.execution.time_zone".to_string();
                 }
-                Some(v) => v,
-            },
-            // For capture signed number e.g. +8, -8
-            SQLExpr::UnaryOp { op, expr } => match op {
-                UnaryOperator::Plus => format!("+{expr}"),
-                UnaryOperator::Minus => format!("-{expr}"),
-                _ => {
-                    return plan_err!("Unsupported Value {}", value[0]);
+
+                if values.len() != 1 {
+                    return plan_err!("SET only supports single value assignment");
                 }
-            },
-            _ => {
-                return plan_err!("Unsupported Value {}", value[0]);
+
+                let value_string = match &values[0] {
+                    SQLExpr::Identifier(i) => ident_to_string(i),
+                    SQLExpr::Value(v) => match crate::utils::value_to_string(&v.value) {
+                        None => {
+                            return plan_err!("Unsupported value {:?}", v.value);
+                        }
+                        Some(s) => s,
+                    },
+                    SQLExpr::UnaryOp { op, expr } => match op {
+                        UnaryOperator::Plus => format!("+{expr}"),
+                        UnaryOperator::Minus => format!("-{expr}"),
+                        _ => return plan_err!("Unsupported unary op {:?}", op),
+                    },
+                    _ => return plan_err!("Unsupported expr {:?}", values[0]),
+                };
+
+                Ok(LogicalPlan::Statement(PlanStatement::SetVariable(
+                    SetVariable {
+                        variable: variable_lower,
+                        value: value_string,
+                    },
+                )))
             }
-        };
+            other => not_impl_err!("SET variant not implemented yet: {other:?}"),
+        }
+    }
+
+    fn reset_statement_to_plan(&self, statement: ResetStatement) -> Result<LogicalPlan> {
+        match statement {
+            ResetStatement::Variable(variable) => {
+                let variable = object_name_to_string(&variable);
+                let mut variable_lower = variable.to_lowercase();
 
-        let statement = PlanStatement::SetVariable(SetVariable {
-            variable: variable_lower,
-            value: value_string,
-        });
+                // Map PostgreSQL "timezone" and MySQL "time.zone" aliases to DataFusion's canonical name
+                if variable_lower == "timezone" || variable_lower == "time.zone" {
+                    variable_lower = "datafusion.execution.time_zone".to_string();
+                }
 
-        Ok(LogicalPlan::Statement(statement))
+                Ok(LogicalPlan::Statement(PlanStatement::ResetVariable(
+                    ResetVariable {
+                        variable: variable_lower,
+                    },
+                )))
+            }
+        }
     }
 
     fn delete_to_plan(
         &self,
-        table_name: ObjectName,
+        table_name: &ObjectName,
         predicate_expr: Option<SQLExpr>,
+        limit: Option<SQLExpr>,
     ) -> Result<LogicalPlan> {
         // Do a table lookup to verify the table exists
         let table_ref = self.object_name_to_table_reference(table_name.clone())?;
@@ -1816,7 +2114,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 .build()?;
         let mut planner_context = PlannerContext::new();
 
-        let source = match predicate_expr {
+        let mut source = match predicate_expr {
             None => scan,
             Some(predicate_expr) => {
                 let filter_expr =
@@ -1833,6 +2131,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             }
         };
 
+        if let Some(limit) = limit {
+            let empty_schema = DFSchema::empty();
+            let limit = self.sql_to_expr(limit, &empty_schema, &mut planner_context)?;
+            source = LogicalPlanBuilder::from(source)
+                .limit_by_expr(None, Some(limit))?
+                .build()?
+        }
+
         let plan = LogicalPlan::Dml(DmlStatement::new(
             table_ref,
             table_source,
@@ -1845,7 +2151,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
     fn update_to_plan(
         &self,
         table: TableWithJoins,
-        assignments: Vec<Assignment>,
+        assignments: &[Assignment],
         from: Option<TableWithJoins>,
         predicate_expr: Option<SQLExpr>,
     ) -> Result<LogicalPlan> {
@@ -1922,10 +2228,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         )?;
                         // Update placeholder's datatype to the type of the target column
                         if let Expr::Placeholder(placeholder) = &mut expr {
-                            placeholder.data_type = placeholder
-                                .data_type
+                            placeholder.field = placeholder
+                                .field
                                 .take()
-                                .or_else(|| Some(field.data_type().clone()));
+                                .or_else(|| Some(Arc::clone(field)));
                         }
                         // Cast to target column type, if necessary
                         expr.cast_to(field.data_type(), source.schema())?
@@ -1968,8 +2274,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         // Do a table lookup to verify the table exists
         let table_name = self.object_name_to_table_reference(table_name)?;
         let table_source = self.context_provider.get_table_source(table_name.clone())?;
-        let arrow_schema = (*table_source.schema()).clone();
-        let table_schema = DFSchema::try_from(arrow_schema)?;
+        let table_schema = DFSchema::try_from(table_source.schema())?;
 
         // Get insert fields and target table's value indices
         //
@@ -1990,9 +2295,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             let mut value_indices = vec![None; table_schema.fields().len()];
             let fields = columns
                 .into_iter()
-                .map(|c| self.ident_normalizer.normalize(c))
                 .enumerate()
                 .map(|(i, c)| {
+                    let c = self.ident_normalizer.normalize(c);
                     let column_index = table_schema
                         .index_of_column_by_name(None, &c)
                         .ok_or_else(|| unqualified_field_not_found(&c, &table_schema))?;
@@ -2004,7 +2309,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                     } else {
                         value_indices[column_index] = Some(i);
                     }
-                    Ok(table_schema.field(column_index).clone())
+                    Ok(Arc::clone(table_schema.field(column_index)))
                 })
                 .collect::<Result<Vec<_>>>()?;
             (Fields::from(fields), value_indices)
@@ -2030,8 +2335,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                                 idx + 1
                             )
                         })?;
-                        let dt = field.data_type().clone();
-                        let _ = prepare_param_data_types.insert(name, dt);
+                        let _ = prepare_param_data_types.insert(name, Arc::clone(field));
                     }
                 }
             }
@@ -2065,7 +2369,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                         .cloned()
                         .unwrap_or_else(|| {
                             // If there is no default for the column, then the default is NULL
-                            Expr::Literal(ScalarValue::Null)
+                            Expr::Literal(ScalarValue::Null, None)
                         })
                         .cast_to(target_field.data_type(), &DFSchema::empty())?,
                 };
@@ -2078,7 +2382,9 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             (false, false) => InsertOp::Append,
             (true, false) => InsertOp::Overwrite,
             (false, true) => InsertOp::Replace,
-            (true, true) => plan_err!("Conflicting insert operations: `overwrite` and `replace_into` cannot both be true")?,
+            (true, true) => plan_err!(
+                "Conflicting insert operations: `overwrite` and `replace_into` cannot both be true"
+            )?,
         };
 
         let plan = LogicalPlan::Dml(DmlStatement::new(
@@ -2253,7 +2559,7 @@ ON p.function_name = r.routine_name
 
     fn validate_transaction_kind(
         &self,
-        kind: Option<BeginTransactionKind>,
+        kind: Option<&BeginTransactionKind>,
     ) -> Result<()> {
         match kind {
             // BEGIN
diff --git a/datafusion/sql/src/unparser/ast.rs b/datafusion/sql/src/unparser/ast.rs
index d9ade822aa005..8446a44b07e35 100644
--- a/datafusion/sql/src/unparser/ast.rs
+++ b/datafusion/sql/src/unparser/ast.rs
@@ -19,7 +19,9 @@ use core::fmt;
 use std::ops::ControlFlow;
 
 use sqlparser::ast::helpers::attached_token::AttachedToken;
-use sqlparser::ast::{self, visit_expressions_mut, OrderByKind, SelectFlavor};
+use sqlparser::ast::{
+    self, LimitClause, OrderByKind, SelectFlavor, visit_expressions_mut,
+};
 
 #[derive(Clone)]
 pub struct QueryBuilder {
@@ -36,7 +38,6 @@ pub struct QueryBuilder {
     distinct_union: bool,
 }
 
-#[allow(dead_code)]
 impl QueryBuilder {
     pub fn with(&mut self, value: Option<ast::With>) -> &mut Self {
         self.with = value;
@@ -100,14 +101,17 @@ impl QueryBuilder {
                 None => return Err(Into::into(UninitializedFieldError::from("body"))),
             },
             order_by,
-            limit: self.limit.clone(),
-            limit_by: self.limit_by.clone(),
-            offset: self.offset.clone(),
+            limit_clause: Some(LimitClause::LimitOffset {
+                limit: self.limit.clone(),
+                offset: self.offset.clone(),
+                limit_by: self.limit_by.clone(),
+            }),
             fetch: self.fetch.clone(),
             locks: self.locks.clone(),
             for_clause: self.for_clause.clone(),
             settings: None,
             format_clause: None,
+            pipe_operators: vec![],
         })
     }
     fn create_empty() -> Self {
@@ -135,7 +139,16 @@ impl Default for QueryBuilder {
 pub struct SelectBuilder {
     distinct: Option<ast::Distinct>,
     top: Option<ast::Top>,
-    projection: Vec<ast::SelectItem>,
+    /// Projection items for the SELECT clause.
+    ///
+    /// This field uses `Option` to distinguish between three distinct states:
+    /// - `None`: No projection has been set (not yet initialized)
+    /// - `Some(vec![])`: Empty projection explicitly set (generates `SELECT FROM ...` or `SELECT 1 FROM ...`)
+    /// - `Some(vec![SelectItem::Wildcard(...)])`: Wildcard projection (generates `SELECT * FROM ...`)
+    /// - `Some(vec![...])`: Non-empty projection with specific columns/expressions
+    ///
+    /// Use `projection()` to set this field and `already_projected()` to check if it has been set.
+    projection: Option<Vec<ast::SelectItem>>,
     into: Option<ast::SelectInto>,
     from: Vec<TableWithJoinsBuilder>,
     lateral_views: Vec<ast::LateralView>,
@@ -143,7 +156,7 @@ pub struct SelectBuilder {
     group_by: Option<ast::GroupByExpr>,
     cluster_by: Vec<ast::Expr>,
     distribute_by: Vec<ast::Expr>,
-    sort_by: Vec<ast::Expr>,
+    sort_by: Vec<ast::OrderByExpr>,
     having: Option<ast::Expr>,
     named_window: Vec<ast::NamedWindowDefinition>,
     qualify: Option<ast::Expr>,
@@ -151,7 +164,6 @@ pub struct SelectBuilder {
     flavor: Option<SelectFlavor>,
 }
 
-#[allow(dead_code)]
 impl SelectBuilder {
     pub fn distinct(&mut self, value: Option<ast::Distinct>) -> &mut Self {
         self.distinct = value;
@@ -162,16 +174,37 @@ impl SelectBuilder {
         self
     }
     pub fn projection(&mut self, value: Vec<ast::SelectItem>) -> &mut Self {
-        self.projection = value;
+        self.projection = Some(value);
         self
     }
     pub fn pop_projections(&mut self) -> Vec<ast::SelectItem> {
-        let ret = self.projection.clone();
-        self.projection.clear();
-        ret
+        self.projection.take().unwrap_or_default()
     }
+    /// Returns true if a projection has been explicitly set via `projection()`.
+    ///
+    /// This method is used to determine whether the SELECT clause has already been
+    /// defined, which helps avoid creating duplicate projection nodes during query
+    /// unparsing. It returns `true` for both empty and non-empty projections.
+    ///
+    /// # Returns
+    ///
+    /// - `true` if `projection()` has been called (regardless of whether it was empty or not)
+    /// - `false` if no projection has been set yet
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// let mut builder = SelectBuilder::default();
+    /// assert!(!builder.already_projected());
+    ///
+    /// builder.projection(vec![]);
+    /// assert!(builder.already_projected()); // true even for empty projection
+    ///
+    /// builder.projection(vec![SelectItem::Wildcard(...)]);
+    /// assert!(builder.already_projected()); // true for non-empty projection
+    /// ```
     pub fn already_projected(&self) -> bool {
-        !self.projection.is_empty()
+        self.projection.is_some()
     }
     pub fn into(&mut self, value: Option<ast::SelectInto>) -> &mut Self {
         self.into = value;
@@ -260,7 +293,7 @@ impl SelectBuilder {
         self.distribute_by = value;
         self
     }
-    pub fn sort_by(&mut self, value: Vec<ast::Expr>) -> &mut Self {
+    pub fn sort_by(&mut self, value: Vec<ast::OrderByExpr>) -> &mut Self {
         self.sort_by = value;
         self
     }
@@ -282,10 +315,12 @@ impl SelectBuilder {
     }
     pub fn build(&self) -> Result<ast::Select, BuilderError> {
         Ok(ast::Select {
+            optimizer_hint: None,
             distinct: self.distinct.clone(),
+            select_modifiers: None,
             top_before_distinct: false,
             top: self.top.clone(),
-            projection: self.projection.clone(),
+            projection: self.projection.clone().unwrap_or_default(),
             into: self.into.clone(),
             from: self
                 .from
@@ -297,7 +332,7 @@ impl SelectBuilder {
             group_by: match self.group_by {
                 Some(ref value) => value.clone(),
                 None => {
-                    return Err(Into::into(UninitializedFieldError::from("group_by")))
+                    return Err(Into::into(UninitializedFieldError::from("group_by")));
                 }
             },
             cluster_by: self.cluster_by.clone(),
@@ -307,21 +342,22 @@ impl SelectBuilder {
             named_window: self.named_window.clone(),
             qualify: self.qualify.clone(),
             value_table_mode: self.value_table_mode,
-            connect_by: None,
+            connect_by: Vec::new(),
             window_before_qualify: false,
             prewhere: None,
             select_token: AttachedToken::empty(),
             flavor: match self.flavor {
-                Some(ref value) => value.clone(),
+                Some(ref value) => *value,
                 None => return Err(Into::into(UninitializedFieldError::from("flavor"))),
             },
+            exclude: None,
         })
     }
     fn create_empty() -> Self {
         Self {
             distinct: Default::default(),
             top: Default::default(),
-            projection: Default::default(),
+            projection: None,
             into: Default::default(),
             from: Default::default(),
             lateral_views: Default::default(),
@@ -350,7 +386,6 @@ pub struct TableWithJoinsBuilder {
     joins: Vec<ast::Join>,
 }
 
-#[allow(dead_code)]
 impl TableWithJoinsBuilder {
     pub fn relation(&mut self, value: RelationBuilder) -> &mut Self {
         self.relation = Some(value);
@@ -396,9 +431,8 @@ pub struct RelationBuilder {
     relation: Option<TableFactorBuilder>,
 }
 
-#[allow(dead_code)]
 #[derive(Clone)]
-#[allow(clippy::large_enum_variant)]
+#[expect(clippy::large_enum_variant)]
 enum TableFactorBuilder {
     Table(TableRelationBuilder),
     Derived(DerivedRelationBuilder),
@@ -406,7 +440,6 @@ enum TableFactorBuilder {
     Empty,
 }
 
-#[allow(dead_code)]
 impl RelationBuilder {
     pub fn has_relation(&self) -> bool {
         self.relation.is_some()
@@ -478,7 +511,6 @@ pub struct TableRelationBuilder {
     index_hints: Vec<ast::TableIndexHints>,
 }
 
-#[allow(dead_code)]
 impl TableRelationBuilder {
     pub fn name(&mut self, value: ast::ObjectName) -> &mut Self {
         self.name = Some(value);
@@ -552,7 +584,6 @@ pub struct DerivedRelationBuilder {
     alias: Option<ast::TableAlias>,
 }
 
-#[allow(dead_code)]
 impl DerivedRelationBuilder {
     pub fn lateral(&mut self, value: bool) -> &mut Self {
         self.lateral = Some(value);
@@ -575,10 +606,11 @@ impl DerivedRelationBuilder {
             subquery: match self.subquery {
                 Some(ref value) => value.clone(),
                 None => {
-                    return Err(Into::into(UninitializedFieldError::from("subquery")))
+                    return Err(Into::into(UninitializedFieldError::from("subquery")));
                 }
             },
             alias: self.alias.clone(),
+            sample: None,
         })
     }
     fn create_empty() -> Self {
@@ -604,7 +636,6 @@ pub struct UnnestRelationBuilder {
     with_ordinality: bool,
 }
 
-#[allow(dead_code)]
 impl UnnestRelationBuilder {
     pub fn alias(&mut self, value: Option<ast::TableAlias>) -> &mut Self {
         self.alias = value;
@@ -705,10 +736,10 @@ impl From<String> for BuilderError {
 impl fmt::Display for BuilderError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            Self::UninitializedField(ref field) => {
+            Self::UninitializedField(field) => {
                 write!(f, "`{field}` must be initialized")
             }
-            Self::ValidationError(ref error) => write!(f, "{error}"),
+            Self::ValidationError(error) => write!(f, "{error}"),
         }
     }
 }
diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs
index a7bde967f2fa4..fe278a0e1edc0 100644
--- a/datafusion/sql/src/unparser/dialect.rs
+++ b/datafusion/sql/src/unparser/dialect.rs
@@ -18,10 +18,12 @@
 use std::{collections::HashMap, sync::Arc};
 
 use super::{
-    utils::character_length_to_sql, utils::date_part_to_sql,
-    utils::sqlite_date_trunc_to_sql, utils::sqlite_from_unixtime_to_sql, Unparser,
+    Unparser, utils::character_length_to_sql, utils::date_part_to_sql,
+    utils::sqlite_date_trunc_to_sql, utils::sqlite_from_unixtime_to_sql,
 };
+use arrow::array::timezone::Tz;
 use arrow::datatypes::TimeUnit;
+use chrono::DateTime;
 use datafusion_common::Result;
 use datafusion_expr::Expr;
 use regex::Regex;
@@ -197,6 +199,66 @@ pub trait Dialect: Send + Sync {
     fn unnest_as_table_factor(&self) -> bool {
         false
     }
+
+    /// Allows the dialect to override column alias unparsing if the dialect has specific rules.
+    /// Returns None if the default unparsing should be used, or Some(String) if there is
+    /// a custom implementation for the alias.
+    fn col_alias_overrides(&self, _alias: &str) -> Result<Option<String>> {
+        Ok(None)
+    }
+
+    /// Allows the dialect to support the QUALIFY clause
+    ///
+    /// Some dialects, like Postgres, do not support the QUALIFY clause
+    fn supports_qualify(&self) -> bool {
+        true
+    }
+
+    /// Allows the dialect to override logic of formatting datetime with tz into string.
+    fn timestamp_with_tz_to_string(&self, dt: DateTime<Tz>, _unit: TimeUnit) -> String {
+        dt.to_string()
+    }
+
+    /// Whether the dialect supports an empty select list such as `SELECT FROM table`.
+    ///
+    /// An empty select list returns rows without any column data, which is useful for:
+    /// - Counting rows: `SELECT FROM users WHERE active = true` (combined with `COUNT(*)`)
+    /// - Testing row existence without retrieving column data
+    /// - Performance optimization when only row counts or existence checks are needed
+    ///
+    /// # Default
+    ///
+    /// Returns `false` for maximum compatibility across SQL dialects. When `false`,
+    /// the unparser falls back to `SELECT 1 FROM table`.
+    ///
+    /// # Implementation Note
+    ///
+    /// Specific dialects should override this method to return `true` if they support
+    /// the empty select list syntax (e.g., PostgreSQL).
+    ///
+    /// # Example SQL Output
+    ///
+    /// ```sql
+    /// -- When supported:
+    /// SELECT FROM users WHERE active = true;
+    ///
+    /// -- Fallback when unsupported:
+    /// SELECT 1 FROM users WHERE active = true;
+    /// ```
+    fn supports_empty_select_list(&self) -> bool {
+        false
+    }
+
+    /// Override the default string literal unparsing.
+    ///
+    /// Returns `Some(ast::Expr)` to replace the default single-quoted string,
+    /// or `None` to use the default behavior.
+    ///
+    /// For example, MSSQL requires non-ASCII strings to use national string
+    /// literal syntax (`N'datafusion資料融合'`).
+    fn string_literal_to_sql(&self, _s: &str) -> Option<ast::Expr> {
+        None
+    }
 }
 
 /// `IntervalStyle` to use for unparsing
@@ -247,19 +309,30 @@ impl Dialect for DefaultDialect {
         let id_upper = identifier.to_uppercase();
         // Special case ignore "ID", see https://github.com/sqlparser-rs/sqlparser-rs/issues/1382
         // ID is a keyword in ClickHouse, but we don't want to quote it when unparsing SQL here
-        if (id_upper != "ID" && ALL_KEYWORDS.contains(&id_upper.as_str()))
+        // Also quote identifiers with uppercase letters since unquoted identifiers are
+        // normalized to lowercase by the SQL parser, which would break case-sensitive schemas
+        let needs_quote = (id_upper != "ID" && ALL_KEYWORDS.contains(&id_upper.as_str()))
             || !identifier_regex.is_match(identifier)
-        {
-            Some('"')
-        } else {
-            None
-        }
+            || identifier.chars().any(|c| c.is_ascii_uppercase());
+        if needs_quote { Some('"') } else { None }
     }
 }
 
 pub struct PostgreSqlDialect {}
 
 impl Dialect for PostgreSqlDialect {
+    fn supports_qualify(&self) -> bool {
+        false
+    }
+
+    fn requires_derived_table_alias(&self) -> bool {
+        true
+    }
+
+    fn supports_empty_select_list(&self) -> bool {
+        true
+    }
+
     fn identifier_quote_style(&self, _: &str) -> Option<char> {
         Some('"')
     }
@@ -310,6 +383,7 @@ impl PostgreSqlDialect {
                     kind: ast::CastKind::Cast,
                     expr: Box::new(expr.clone()),
                     data_type: ast::DataType::Numeric(ast::ExactNumberInfo::None),
+                    array: false,
                     format: None,
                 };
             }
@@ -394,11 +468,26 @@ impl Dialect for DuckDBDialect {
 
         Ok(None)
     }
+
+    fn timestamp_with_tz_to_string(&self, dt: DateTime<Tz>, unit: TimeUnit) -> String {
+        let format = match unit {
+            TimeUnit::Second => "%Y-%m-%d %H:%M:%S%:z",
+            TimeUnit::Millisecond => "%Y-%m-%d %H:%M:%S%.3f%:z",
+            TimeUnit::Microsecond => "%Y-%m-%d %H:%M:%S%.6f%:z",
+            TimeUnit::Nanosecond => "%Y-%m-%d %H:%M:%S%.9f%:z",
+        };
+
+        dt.format(format).to_string()
+    }
 }
 
 pub struct MySqlDialect {}
 
 impl Dialect for MySqlDialect {
+    fn supports_qualify(&self) -> bool {
+        false
+    }
+
     fn identifier_quote_style(&self, _: &str) -> Option<char> {
         Some('`')
     }
@@ -460,6 +549,10 @@ impl Dialect for MySqlDialect {
 pub struct SqliteDialect {}
 
 impl Dialect for SqliteDialect {
+    fn supports_qualify(&self) -> bool {
+        false
+    }
+
     fn identifier_quote_style(&self, _: &str) -> Option<char> {
         Some('`')
     }
@@ -480,6 +573,14 @@ impl Dialect for SqliteDialect {
         false
     }
 
+    fn timestamp_cast_dtype(
+        &self,
+        _time_unit: &TimeUnit,
+        _tz: &Option<Arc<str>>,
+    ) -> ast::DataType {
+        ast::DataType::Text
+    }
+
     fn scalar_function_to_sql_overrides(
         &self,
         unparser: &Unparser,
@@ -500,6 +601,49 @@ impl Dialect for SqliteDialect {
     }
 }
 
+#[derive(Default)]
+pub struct BigQueryDialect {}
+
+impl Dialect for BigQueryDialect {
+    fn identifier_quote_style(&self, _: &str) -> Option<char> {
+        Some('`')
+    }
+
+    fn col_alias_overrides(&self, alias: &str) -> Result<Option<String>> {
+        // Check if alias contains any special characters not supported by BigQuery col names
+        // https://cloud.google.com/bigquery/docs/schemas#flexible-column-names
+        let special_chars: [char; 20] = [
+            '!', '"', '$', '(', ')', '*', ',', '.', '/', ';', '?', '@', '[', '\\', ']',
+            '^', '`', '{', '}', '~',
+        ];
+
+        if alias.chars().any(|c| special_chars.contains(&c)) {
+            let mut encoded_name = String::new();
+            for c in alias.chars() {
+                if special_chars.contains(&c) {
+                    encoded_name.push_str(&format!("_{}", c as u32));
+                } else {
+                    encoded_name.push(c);
+                }
+            }
+            Ok(Some(encoded_name))
+        } else {
+            Ok(Some(alias.to_string()))
+        }
+    }
+
+    fn unnest_as_table_factor(&self) -> bool {
+        true
+    }
+}
+
+impl BigQueryDialect {
+    #[must_use]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
 pub struct CustomDialect {
     identifier_quote_style: Option<char>,
     supports_nulls_first_in_sort: bool,
@@ -553,17 +697,6 @@ impl Default for CustomDialect {
     }
 }
 
-impl CustomDialect {
-    // Create a CustomDialect
-    #[deprecated(since = "41.0.0", note = "please use `CustomDialectBuilder` instead")]
-    pub fn new(identifier_quote_style: Option<char>) -> Self {
-        Self {
-            identifier_quote_style,
-            ..Default::default()
-        }
-    }
-}
-
 impl Dialect for CustomDialect {
     fn identifier_quote_style(&self, _: &str) -> Option<char> {
         self.identifier_quote_style
diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs
index 41cb811d19d9c..54c8eeb1252d9 100644
--- a/datafusion/sql/src/unparser/expr.rs
+++ b/datafusion/sql/src/unparser/expr.rs
@@ -15,34 +15,39 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use datafusion_common::datatype::DataTypeExt;
 use datafusion_expr::expr::{AggregateFunctionParams, Unnest, WindowFunctionParams};
 use sqlparser::ast::Value::SingleQuotedString;
 use sqlparser::ast::{
-    self, Array, BinaryOperator, CaseWhen, Expr as AstExpr, Function, Ident, Interval,
-    ObjectName, OrderByOptions, Subscript, TimezoneInfo, UnaryOperator, ValueWithSpan,
+    self, Array, BinaryOperator, CaseWhen, DuplicateTreatment, Expr as AstExpr, Function,
+    Ident, Interval, ObjectName, OrderByOptions, Subscript, TimezoneInfo, UnaryOperator,
+    ValueWithSpan,
 };
 use std::sync::Arc;
 use std::vec;
 
-use super::dialect::IntervalStyle;
 use super::Unparser;
+use super::dialect::IntervalStyle;
 use arrow::array::{
+    ArrayRef, Date32Array, Date64Array, PrimitiveArray,
     types::{
         ArrowTemporalType, Time32MillisecondType, Time32SecondType,
         Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
         TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
     },
-    ArrayRef, Date32Array, Date64Array, PrimitiveArray,
 };
-use arrow::datatypes::{DataType, Decimal128Type, Decimal256Type, DecimalType};
+use arrow::datatypes::{
+    DataType, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, DecimalType,
+    FieldRef,
+};
 use arrow::util::display::array_value_to_string;
 use datafusion_common::{
-    internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, Result,
-    ScalarValue,
+    Column, Result, ScalarValue, assert_eq_or_internal_err, assert_or_internal_err,
+    internal_datafusion_err, internal_err, not_impl_err, plan_err,
 };
 use datafusion_expr::{
-    expr::{Alias, Exists, InList, ScalarFunction, Sort, WindowFunction},
     Between, BinaryExpr, Case, Cast, Expr, GroupingSet, Like, Operator, TryCast,
+    expr::{Alias, Exists, InList, ScalarFunction, SetQuantifier, Sort, WindowFunction},
 };
 use sqlparser::ast::helpers::attached_token::AttachedToken;
 use sqlparser::tokenizer::Span;
@@ -67,9 +72,8 @@ use sqlparser::tokenizer::Span;
 /// use datafusion_expr::{col, lit};
 /// use datafusion_sql::unparser::expr_to_sql;
 /// let expr = col("a").gt(lit(4)); // form an expression `a > 4`
-/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr
-/// // use the Display impl to convert to SQL text
-/// assert_eq!(sql.to_string(), "(a > 4)")
+/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr, using
+/// assert_eq!(sql.to_string(), "(a > 4)"); // use Display impl for SQL text
 /// ```
 ///
 /// [`SqlToRel::sql_to_expr`]: crate::planner::SqlToRel::sql_to_expr
@@ -182,24 +186,27 @@ impl Unparser<'_> {
                     operand,
                     conditions,
                     else_result,
+                    case_token: AttachedToken::empty(),
+                    end_token: AttachedToken::empty(),
                 })
             }
-            Expr::Cast(Cast { expr, data_type }) => {
-                Ok(self.cast_to_sql(expr, data_type)?)
-            }
-            Expr::Literal(value) => Ok(self.scalar_to_sql(value)?),
+            Expr::Cast(Cast { expr, field }) => Ok(self.cast_to_sql(expr, field)?),
+            Expr::Literal(value, _) => Ok(self.scalar_to_sql(value)?),
             Expr::Alias(Alias { expr, name: _, .. }) => self.expr_to_sql_inner(expr),
-            Expr::WindowFunction(WindowFunction {
-                fun,
-                params:
-                    WindowFunctionParams {
-                        args,
-                        partition_by,
-                        order_by,
-                        window_frame,
-                        ..
-                    },
-            }) => {
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction {
+                    fun,
+                    params:
+                        WindowFunctionParams {
+                            args,
+                            partition_by,
+                            order_by,
+                            window_frame,
+                            filter,
+                            distinct,
+                            ..
+                        },
+                } = window_fun.as_ref();
                 let func_name = fun.name();
 
                 let args = self.function_args_to_sql(args)?;
@@ -255,11 +262,15 @@ impl Unparser<'_> {
                         span: Span::empty(),
                     }]),
                     args: ast::FunctionArguments::List(ast::FunctionArgumentList {
-                        duplicate_treatment: None,
+                        duplicate_treatment: distinct
+                            .then_some(DuplicateTreatment::Distinct),
                         args,
                         clauses: vec![],
                     }),
-                    filter: None,
+                    filter: filter
+                        .as_ref()
+                        .map(|f| self.expr_to_sql_inner(f).map(Box::new))
+                        .transpose()?,
                     null_treatment: None,
                     over,
                     within_group: vec![],
@@ -277,7 +288,7 @@ impl Unparser<'_> {
                 negated: *negated,
                 expr: Box::new(self.expr_to_sql_inner(expr)?),
                 pattern: Box::new(self.expr_to_sql_inner(pattern)?),
-                escape_char: escape_char.map(|c| c.to_string()),
+                escape_char: escape_char.map(|c| SingleQuotedString(c.to_string())),
                 any: false,
             }),
             Expr::Like(Like {
@@ -292,7 +303,8 @@ impl Unparser<'_> {
                         negated: *negated,
                         expr: Box::new(self.expr_to_sql_inner(expr)?),
                         pattern: Box::new(self.expr_to_sql_inner(pattern)?),
-                        escape_char: escape_char.map(|c| c.to_string()),
+                        escape_char: escape_char
+                            .map(|c| SingleQuotedString(c.to_string())),
                         any: false,
                     })
                 } else {
@@ -300,7 +312,8 @@ impl Unparser<'_> {
                         negated: *negated,
                         expr: Box::new(self.expr_to_sql_inner(expr)?),
                         pattern: Box::new(self.expr_to_sql_inner(pattern)?),
-                        escape_char: escape_char.map(|c| c.to_string()),
+                        escape_char: escape_char
+                            .map(|c| SingleQuotedString(c.to_string())),
                         any: false,
                     })
                 }
@@ -321,16 +334,15 @@ impl Unparser<'_> {
                     Some(filter) => Some(Box::new(self.expr_to_sql_inner(filter)?)),
                     None => None,
                 };
-                let within_group = if agg.func.is_ordered_set_aggregate() {
-                    order_by
-                        .as_ref()
-                        .unwrap_or(&Vec::new())
-                        .iter()
-                        .map(|sort_expr| self.sort_to_sql(sort_expr))
-                        .collect::<Result<Vec<_>>>()?
-                } else {
-                    Vec::new()
-                };
+                let within_group: Vec<ast::OrderByExpr> =
+                    if agg.func.supports_within_group_clause() {
+                        order_by
+                            .iter()
+                            .map(|sort_expr| self.sort_to_sql(sort_expr))
+                            .collect::<Result<Vec<ast::OrderByExpr>>>()?
+                    } else {
+                        Vec::new()
+                    };
                 Ok(ast::Expr::Function(Function {
                     name: ObjectName::from(vec![Ident {
                         value: func_name.to_string(),
@@ -339,7 +351,7 @@ impl Unparser<'_> {
                     }]),
                     args: ast::FunctionArguments::List(ast::FunctionArgumentList {
                         duplicate_treatment: distinct
-                            .then_some(ast::DuplicateTreatment::Distinct),
+                            .then_some(DuplicateTreatment::Distinct),
                         args,
                         clauses: vec![],
                     }),
@@ -381,6 +393,33 @@ impl Unparser<'_> {
                     negated: insubq.negated,
                 })
             }
+            Expr::SetComparison(set_cmp) => {
+                let left = Box::new(self.expr_to_sql_inner(set_cmp.expr.as_ref())?);
+                let sub_statement =
+                    self.plan_to_sql(set_cmp.subquery.subquery.as_ref())?;
+                let sub_query = if let ast::Statement::Query(inner_query) = sub_statement
+                {
+                    inner_query
+                } else {
+                    return plan_err!(
+                        "Subquery must be a Query, but found {sub_statement:?}"
+                    );
+                };
+                let compare_op = self.op_to_sql(&set_cmp.op)?;
+                match set_cmp.quantifier {
+                    SetQuantifier::Any => Ok(ast::Expr::AnyOp {
+                        left,
+                        compare_op,
+                        right: Box::new(ast::Expr::Subquery(sub_query)),
+                        is_some: false,
+                    }),
+                    SetQuantifier::All => Ok(ast::Expr::AllOp {
+                        left,
+                        compare_op,
+                        right: Box::new(ast::Expr::Subquery(sub_query)),
+                    }),
+                }
+            }
             Expr::Exists(Exists { subquery, negated }) => {
                 let sub_statement = self.plan_to_sql(subquery.subquery.as_ref())?;
                 let sub_query = if let ast::Statement::Query(inner_query) = sub_statement
@@ -435,9 +474,7 @@ impl Unparser<'_> {
                 })
             }
             Expr::ScalarVariable(_, ids) => {
-                if ids.is_empty() {
-                    return internal_err!("Not a valid ScalarVariable");
-                }
+                assert_or_internal_err!(!ids.is_empty(), "Not a valid ScalarVariable");
 
                 Ok(if ids.len() == 1 {
                     ast::Expr::Identifier(
@@ -451,12 +488,13 @@ impl Unparser<'_> {
                     )
                 })
             }
-            Expr::TryCast(TryCast { expr, data_type }) => {
+            Expr::TryCast(TryCast { expr, field }) => {
                 let inner_expr = self.expr_to_sql_inner(expr)?;
                 Ok(ast::Expr::Cast {
                     kind: ast::CastKind::TryCast,
                     expr: Box::new(inner_expr),
-                    data_type: self.arrow_dtype_to_ast_dtype(data_type)?,
+                    data_type: self.arrow_dtype_to_ast_dtype(field)?,
+                    array: false,
                     format: None,
                 })
             }
@@ -581,9 +619,11 @@ impl Unparser<'_> {
     }
 
     fn array_element_to_sql(&self, args: &[Expr]) -> Result<ast::Expr> {
-        if args.len() != 2 {
-            return internal_err!("array_element must have exactly 2 arguments");
-        }
+        assert_eq_or_internal_err!(
+            args.len(),
+            2,
+            "array_element must have exactly 2 arguments"
+        );
         let array = self.expr_to_sql(&args[0])?;
         let index = self.expr_to_sql(&args[1])?;
         Ok(ast::Expr::CompoundFieldAccess {
@@ -593,15 +633,16 @@ impl Unparser<'_> {
     }
 
     fn named_struct_to_sql(&self, args: &[Expr]) -> Result<ast::Expr> {
-        if args.len() % 2 != 0 {
-            return internal_err!("named_struct must have an even number of arguments");
-        }
+        assert_or_internal_err!(
+            args.len().is_multiple_of(2),
+            "named_struct must have an even number of arguments"
+        );
 
         let args = args
             .chunks_exact(2)
             .map(|chunk| {
                 let key = match &chunk[0] {
-                    Expr::Literal(ScalarValue::Utf8(Some(s))) => self.new_ident_quoted_if_needs(s.to_string()),
+                    Expr::Literal(ScalarValue::Utf8(Some(s)), _) => self.new_ident_quoted_if_needs(s.to_string()),
                     _ => return internal_err!("named_struct expects even arguments to be strings, but received: {:?}", &chunk[0])
                 };
 
@@ -616,37 +657,66 @@ impl Unparser<'_> {
     }
 
     fn get_field_to_sql(&self, args: &[Expr]) -> Result<ast::Expr> {
-        if args.len() != 2 {
-            return internal_err!("get_field must have exactly 2 arguments");
+        if args.len() < 2 {
+            return internal_err!(
+                "get_field must have at least 2 arguments, got {}",
+                args.len()
+            );
         }
 
-        let mut id = match &args[0] {
-            Expr::Column(col) => match self.col_to_sql(col)? {
-                ast::Expr::Identifier(ident) => vec![ident],
-                ast::Expr::CompoundIdentifier(idents) => idents,
-                other => return internal_err!("expected col_to_sql to return an Identifier or CompoundIdentifier, but received: {:?}", other),
-            },
-            _ => return internal_err!("get_field expects first argument to be column, but received: {:?}", &args[0]),
-        };
+        // Extract all field names (args[1..])
+        let mut fields = Vec::with_capacity(args.len() - 1);
+        for arg in &args[1..] {
+            let field = match arg {
+                Expr::Literal(lit, _) => self.new_ident_quoted_if_needs(lit.to_string()),
+                _ => {
+                    return internal_err!(
+                        "get_field expects field arguments to be strings, but received: {:?}",
+                        arg
+                    );
+                }
+            };
+            fields.push(field);
+        }
 
-        let field = match &args[1] {
-            Expr::Literal(lit) => self.new_ident_quoted_if_needs(lit.to_string()),
+        match &args[0] {
+            Expr::Column(col) => {
+                let mut id = match self.col_to_sql(col)? {
+                    ast::Expr::Identifier(ident) => vec![ident],
+                    ast::Expr::CompoundIdentifier(idents) => idents,
+                    other => {
+                        return internal_err!(
+                            "expected col_to_sql to return an Identifier or CompoundIdentifier, but received: {:?}",
+                            other
+                        );
+                    }
+                };
+                id.extend(fields);
+                Ok(ast::Expr::CompoundIdentifier(id))
+            }
+            Expr::ScalarFunction(struct_expr) => {
+                let root = self
+                    .scalar_function_to_sql(struct_expr.func.name(), &struct_expr.args)?;
+                let access_chain = fields
+                    .into_iter()
+                    .map(|field| ast::AccessExpr::Dot(ast::Expr::Identifier(field)))
+                    .collect();
+                Ok(ast::Expr::CompoundFieldAccess {
+                    root: Box::new(root),
+                    access_chain,
+                })
+            }
             _ => {
-                return internal_err!(
-                "get_field expects second argument to be a string, but received: {:?}",
-                &args[0]
-            )
+                internal_err!(
+                    "get_field expects first argument to be column or scalar function, but received: {:?}",
+                    &args[0]
+                )
             }
-        };
-        id.push(field);
-
-        Ok(ast::Expr::CompoundIdentifier(id))
+        }
     }
 
     fn map_to_sql(&self, args: &[Expr]) -> Result<ast::Expr> {
-        if args.len() != 2 {
-            return internal_err!("map must have exactly 2 arguments");
-        }
+        assert_eq_or_internal_err!(args.len(), 2, "map must have exactly 2 arguments");
 
         let ast::Expr::Array(Array { elem: keys, .. }) = self.expr_to_sql(&args[0])?
         else {
@@ -709,13 +779,21 @@ impl Unparser<'_> {
     }
 
     pub fn col_to_sql(&self, col: &Column) -> Result<ast::Expr> {
+        // Replace the column name if the dialect has an override
+        let col_name =
+            if let Some(rewritten_name) = self.dialect.col_alias_overrides(&col.name)? {
+                rewritten_name
+            } else {
+                col.name.to_string()
+            };
+
         if let Some(table_ref) = &col.relation {
             let mut id = if self.dialect.full_qualified_col() {
                 table_ref.to_vec()
             } else {
                 vec![table_ref.table().to_string()]
             };
-            id.push(col.name.to_string());
+            id.push(col_name);
             return Ok(ast::Expr::CompoundIdentifier(
                 id.iter()
                     .map(|i| self.new_ident_quoted_if_needs(i.to_string()))
@@ -723,7 +801,7 @@ impl Unparser<'_> {
             ));
         }
         Ok(ast::Expr::Identifier(
-            self.new_ident_quoted_if_needs(col.name.to_string()),
+            self.new_ident_quoted_if_needs(col_name),
         ))
     }
 
@@ -1016,6 +1094,7 @@ impl Unparser<'_> {
             Operator::Question => Ok(BinaryOperator::Question),
             Operator::QuestionAnd => Ok(BinaryOperator::QuestionAnd),
             Operator::QuestionPipe => Ok(BinaryOperator::QuestionPipe),
+            Operator::Colon => Ok(BinaryOperator::Custom(":".to_owned())),
         }
     }
 
@@ -1027,8 +1106,19 @@ impl Unparser<'_> {
     where
         i64: From<T::Native>,
     {
+        let time_unit = match T::DATA_TYPE {
+            DataType::Timestamp(unit, _) => unit,
+            _ => {
+                return Err(internal_datafusion_err!(
+                    "Expected Timestamp, got {:?}",
+                    T::DATA_TYPE
+                ));
+            }
+        };
+
         let ts = if let Some(tz) = tz {
-            v.to_array()?
+            let dt = v
+                .to_array()?
                 .as_any()
                 .downcast_ref::<PrimitiveArray<T>>()
                 .ok_or(internal_datafusion_err!(
@@ -1037,8 +1127,8 @@ impl Unparser<'_> {
                 .value_as_datetime_with_tz(0, tz.parse()?)
                 .ok_or(internal_datafusion_err!(
                     "Unable to convert {v:?} to DateTime"
-                ))?
-                .to_string()
+                ))?;
+            self.dialect.timestamp_with_tz_to_string(dt, time_unit)
         } else {
             v.to_array()?
                 .as_any()
@@ -1053,20 +1143,11 @@ impl Unparser<'_> {
                 .to_string()
         };
 
-        let time_unit = match T::DATA_TYPE {
-            DataType::Timestamp(unit, _) => unit,
-            _ => {
-                return Err(internal_datafusion_err!(
-                    "Expected Timestamp, got {:?}",
-                    T::DATA_TYPE
-                ))
-            }
-        };
-
         Ok(ast::Expr::Cast {
             kind: ast::CastKind::Cast,
             expr: Box::new(ast::Expr::value(SingleQuotedString(ts))),
             data_type: self.dialect.timestamp_cast_dtype(&time_unit, &None),
+            array: false,
             format: None,
         })
     }
@@ -1089,30 +1170,36 @@ impl Unparser<'_> {
             kind: ast::CastKind::Cast,
             expr: Box::new(ast::Expr::value(SingleQuotedString(time))),
             data_type: ast::DataType::Time(None, TimezoneInfo::None),
+            array: false,
             format: None,
         })
     }
 
     // Explicit type cast on ast::Expr::Value is not needed by underlying engine for certain types
     // For example: CAST(Utf8("binary_value") AS Binary) and  CAST(Utf8("dictionary_value") AS Dictionary)
-    fn cast_to_sql(&self, expr: &Expr, data_type: &DataType) -> Result<ast::Expr> {
+    fn cast_to_sql(&self, expr: &Expr, field: &FieldRef) -> Result<ast::Expr> {
         let inner_expr = self.expr_to_sql_inner(expr)?;
+        let data_type = field.data_type();
         match inner_expr {
             ast::Expr::Value(_) => match data_type {
-                DataType::Dictionary(_, _) | DataType::Binary | DataType::BinaryView => {
+                DataType::Dictionary(_, _) | DataType::Binary | DataType::BinaryView
+                    if field.metadata().is_empty() =>
+                {
                     Ok(inner_expr)
                 }
                 _ => Ok(ast::Expr::Cast {
                     kind: ast::CastKind::Cast,
                     expr: Box::new(inner_expr),
-                    data_type: self.arrow_dtype_to_ast_dtype(data_type)?,
+                    data_type: self.arrow_dtype_to_ast_dtype(field)?,
+                    array: false,
                     format: None,
                 }),
             },
             _ => Ok(ast::Expr::Cast {
                 kind: ast::CastKind::Cast,
                 expr: Box::new(inner_expr),
-                data_type: self.arrow_dtype_to_ast_dtype(data_type)?,
+                data_type: self.arrow_dtype_to_ast_dtype(field)?,
+                array: false,
                 format: None,
             }),
         }
@@ -1147,6 +1234,20 @@ impl Unparser<'_> {
                 Ok(ast::Expr::value(ast::Value::Number(f_val, false)))
             }
             ScalarValue::Float64(None) => Ok(ast::Expr::value(ast::Value::Null)),
+            ScalarValue::Decimal32(Some(value), precision, scale) => {
+                Ok(ast::Expr::value(ast::Value::Number(
+                    Decimal32Type::format_decimal(*value, *precision, *scale),
+                    false,
+                )))
+            }
+            ScalarValue::Decimal32(None, ..) => Ok(ast::Expr::value(ast::Value::Null)),
+            ScalarValue::Decimal64(Some(value), precision, scale) => {
+                Ok(ast::Expr::value(ast::Value::Number(
+                    Decimal64Type::format_decimal(*value, *precision, *scale),
+                    false,
+                )))
+            }
+            ScalarValue::Decimal64(None, ..) => Ok(ast::Expr::value(ast::Value::Null)),
             ScalarValue::Decimal128(Some(value), precision, scale) => {
                 Ok(ast::Expr::value(ast::Value::Number(
                     Decimal128Type::format_decimal(*value, *precision, *scale),
@@ -1193,18 +1294,17 @@ impl Unparser<'_> {
                 Ok(ast::Expr::value(ast::Value::Number(ui.to_string(), false)))
             }
             ScalarValue::UInt64(None) => Ok(ast::Expr::value(ast::Value::Null)),
-            ScalarValue::Utf8(Some(str)) => {
-                Ok(ast::Expr::value(SingleQuotedString(str.to_string())))
-            }
-            ScalarValue::Utf8(None) => Ok(ast::Expr::value(ast::Value::Null)),
-            ScalarValue::Utf8View(Some(str)) => {
-                Ok(ast::Expr::value(SingleQuotedString(str.to_string())))
-            }
-            ScalarValue::Utf8View(None) => Ok(ast::Expr::value(ast::Value::Null)),
-            ScalarValue::LargeUtf8(Some(str)) => {
+            ScalarValue::Utf8(Some(str))
+            | ScalarValue::Utf8View(Some(str))
+            | ScalarValue::LargeUtf8(Some(str)) => {
+                if let Some(expr) = self.dialect.string_literal_to_sql(str) {
+                    return Ok(expr);
+                }
                 Ok(ast::Expr::value(SingleQuotedString(str.to_string())))
             }
-            ScalarValue::LargeUtf8(None) => Ok(ast::Expr::value(ast::Value::Null)),
+            ScalarValue::Utf8(None)
+            | ScalarValue::Utf8View(None)
+            | ScalarValue::LargeUtf8(None) => Ok(ast::Expr::value(ast::Value::Null)),
             ScalarValue::Binary(Some(_)) => not_impl_err!("Unsupported scalar: {v:?}"),
             ScalarValue::Binary(None) => Ok(ast::Expr::value(ast::Value::Null)),
             ScalarValue::BinaryView(Some(_)) => {
@@ -1240,6 +1340,7 @@ impl Unparser<'_> {
                         date.to_string(),
                     ))),
                     data_type: ast::DataType::Date,
+                    array: false,
                     format: None,
                 })
             }
@@ -1263,6 +1364,7 @@ impl Unparser<'_> {
                         datetime.to_string(),
                     ))),
                     data_type: self.ast_type_for_date64_in_cast(),
+                    array: false,
                     format: None,
                 })
             }
@@ -1349,6 +1451,7 @@ impl Unparser<'_> {
             ScalarValue::Map(_) => not_impl_err!("Unsupported scalar: {v:?}"),
             ScalarValue::Union(..) => not_impl_err!("Unsupported scalar: {v:?}"),
             ScalarValue::Dictionary(_k, v) => self.scalar_to_sql(v),
+            ScalarValue::RunEndEncoded(_, _, v) => self.scalar_to_sql(v),
         }
     }
 
@@ -1378,7 +1481,9 @@ impl Unparser<'_> {
             };
             return Ok(ast::Expr::Interval(interval));
         } else if months != 0 {
-            return not_impl_err!("Unsupported Interval scalar with both Month and DayTime for IntervalStyle::MySQL");
+            return not_impl_err!(
+                "Unsupported Interval scalar with both Month and DayTime for IntervalStyle::MySQL"
+            );
         }
 
         // DAY only
@@ -1566,7 +1671,9 @@ impl Unparser<'_> {
                         };
                         Ok(ast::Expr::Interval(interval))
                     } else {
-                        not_impl_err!("Unsupported IntervalMonthDayNano scalar with both Month and DayTime for IntervalStyle::SQLStandard")
+                        not_impl_err!(
+                            "Unsupported IntervalMonthDayNano scalar with both Month and DayTime for IntervalStyle::SQLStandard"
+                        )
                     }
                 }
                 _ => not_impl_err!(
@@ -1620,10 +1727,11 @@ impl Unparser<'_> {
         }))
     }
 
-    fn arrow_dtype_to_ast_dtype(&self, data_type: &DataType) -> Result<ast::DataType> {
+    fn arrow_dtype_to_ast_dtype(&self, field: &FieldRef) -> Result<ast::DataType> {
+        let data_type = field.data_type();
         match data_type {
             DataType::Null => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Boolean => Ok(ast::DataType::Bool),
             DataType::Int8 => Ok(ast::DataType::TinyInt(None)),
@@ -1635,9 +1743,9 @@ impl Unparser<'_> {
             DataType::UInt32 => Ok(ast::DataType::IntegerUnsigned(None)),
             DataType::UInt64 => Ok(ast::DataType::BigIntUnsigned(None)),
             DataType::Float16 => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
-            DataType::Float32 => Ok(ast::DataType::Float(None)),
+            DataType::Float32 => Ok(ast::DataType::Float(ast::ExactNumberInfo::None)),
             DataType::Float64 => Ok(self.dialect.float64_ast_dtype()),
             DataType::Timestamp(time_unit, tz) => {
                 Ok(self.dialect.timestamp_cast_dtype(time_unit, tz))
@@ -1645,53 +1753,61 @@ impl Unparser<'_> {
             DataType::Date32 => Ok(self.dialect.date32_cast_dtype()),
             DataType::Date64 => Ok(self.ast_type_for_date64_in_cast()),
             DataType::Time32(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Time64(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Duration(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
-            DataType::Interval(_) => Ok(ast::DataType::Interval),
+            DataType::Interval(_) => Ok(ast::DataType::Interval {
+                fields: None,
+                precision: None,
+            }),
             DataType::Binary => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::FixedSizeBinary(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::LargeBinary => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::BinaryView => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Utf8 => Ok(self.dialect.utf8_cast_dtype()),
             DataType::LargeUtf8 => Ok(self.dialect.large_utf8_cast_dtype()),
             DataType::Utf8View => Ok(self.dialect.utf8_cast_dtype()),
             DataType::List(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::FixedSizeList(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::LargeList(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::ListView(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::LargeListView(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Struct(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
             DataType::Union(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
+            }
+            DataType::Dictionary(_, val) => {
+                self.arrow_dtype_to_ast_dtype(&val.clone().into_nullable_field_ref())
             }
-            DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val),
-            DataType::Decimal128(precision, scale)
+            DataType::RunEndEncoded(_, val) => self.arrow_dtype_to_ast_dtype(val),
+            DataType::Decimal32(precision, scale)
+            | DataType::Decimal64(precision, scale)
+            | DataType::Decimal128(precision, scale)
             | DataType::Decimal256(precision, scale) => {
                 let mut new_precision = *precision as u64;
                 let mut new_scale = *scale as u64;
@@ -1701,14 +1817,14 @@ impl Unparser<'_> {
                 }
 
                 Ok(ast::DataType::Decimal(
-                    ast::ExactNumberInfo::PrecisionAndScale(new_precision, new_scale),
+                    ast::ExactNumberInfo::PrecisionAndScale(
+                        new_precision,
+                        new_scale as i64,
+                    ),
                 ))
             }
             DataType::Map(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::RunEndEncoded(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
+                not_impl_err!("Unsupported DataType: conversion: {data_type}")
             }
         }
     }
@@ -1723,15 +1839,16 @@ mod tests {
     use arrow::array::{LargeListArray, ListArray};
     use arrow::datatypes::{DataType::Int8, Field, Int32Type, Schema, TimeUnit};
     use ast::ObjectName;
+    use datafusion_common::datatype::DataTypeExt;
     use datafusion_common::{Spans, TableReference};
     use datafusion_expr::expr::WildcardOptions;
     use datafusion_expr::{
-        case, cast, col, cube, exists, grouping_set, interval_datetime_lit,
-        interval_year_month_lit, lit, not, not_exists, out_ref_col, placeholder, rollup,
-        table_scan, try_cast, when, ColumnarValue, ScalarFunctionArgs, ScalarUDF,
-        ScalarUDFImpl, Signature, Volatility, WindowFrame, WindowFunctionDefinition,
+        ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
+        Volatility, WindowFrame, WindowFunctionDefinition, case, cast, col, cube, exists,
+        grouping_set, interval_datetime_lit, interval_year_month_lit, lit, not,
+        not_exists, out_ref_col, placeholder, rollup, table_scan, try_cast, when,
     };
-    use datafusion_expr::{interval_month_day_nano_lit, ExprFunctionExt};
+    use datafusion_expr::{ExprFunctionExt, interval_month_day_nano_lit};
     use datafusion_functions::datetime::from_unixtime::FromUnixtimeFunc;
     use datafusion_functions::expr_fn::{get_field, named_struct};
     use datafusion_functions_aggregate::count::count_udaf;
@@ -1750,7 +1867,7 @@ mod tests {
     use super::*;
 
     /// Mocked UDF
-    #[derive(Debug)]
+    #[derive(Debug, PartialEq, Eq, Hash)]
     struct DummyUDF {
         signature: Signature,
     }
@@ -1825,34 +1942,25 @@ mod tests {
                 r#"CASE WHEN a IS NOT NULL THEN true ELSE false END"#,
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(col("a")),
-                    data_type: DataType::Date64,
-                }),
+                Expr::Cast(Cast::new(Box::new(col("a")), DataType::Date64)),
                 r#"CAST(a AS DATETIME)"#,
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(col("a")),
-                    data_type: DataType::Timestamp(
-                        TimeUnit::Nanosecond,
-                        Some("+08:00".into()),
-                    ),
-                }),
+                Expr::Cast(Cast::new(
+                    Box::new(col("a")),
+                    DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
+                )),
                 r#"CAST(a AS TIMESTAMP WITH TIME ZONE)"#,
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(col("a")),
-                    data_type: DataType::Timestamp(TimeUnit::Millisecond, None),
-                }),
+                Expr::Cast(Cast::new(
+                    Box::new(col("a")),
+                    DataType::Timestamp(TimeUnit::Millisecond, None),
+                )),
                 r#"CAST(a AS TIMESTAMP)"#,
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(col("a")),
-                    data_type: DataType::UInt32,
-                }),
+                Expr::Cast(Cast::new(Box::new(col("a")), DataType::UInt32)),
                 r#"CAST(a AS INTEGER UNSIGNED)"#,
             ),
             (
@@ -1910,87 +2018,87 @@ mod tests {
                 r#"a LIKE 'foo' ESCAPE 'o'"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date64(Some(0))),
+                Expr::Literal(ScalarValue::Date64(Some(0)), None),
                 r#"CAST('1970-01-01 00:00:00' AS DATETIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date64(Some(10000))),
+                Expr::Literal(ScalarValue::Date64(Some(10000)), None),
                 r#"CAST('1970-01-01 00:00:10' AS DATETIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date64(Some(-10000))),
+                Expr::Literal(ScalarValue::Date64(Some(-10000)), None),
                 r#"CAST('1969-12-31 23:59:50' AS DATETIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date32(Some(0))),
+                Expr::Literal(ScalarValue::Date32(Some(0)), None),
                 r#"CAST('1970-01-01' AS DATE)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date32(Some(10))),
+                Expr::Literal(ScalarValue::Date32(Some(10)), None),
                 r#"CAST('1970-01-11' AS DATE)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Date32(Some(-1))),
+                Expr::Literal(ScalarValue::Date32(Some(-1)), None),
                 r#"CAST('1969-12-31' AS DATE)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampSecond(Some(10001), None)),
+                Expr::Literal(ScalarValue::TimestampSecond(Some(10001), None), None),
                 r#"CAST('1970-01-01 02:46:41' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampSecond(
-                    Some(10001),
-                    Some("+08:00".into()),
-                )),
+                Expr::Literal(
+                    ScalarValue::TimestampSecond(Some(10001), Some("+08:00".into())),
+                    None,
+                ),
                 r#"CAST('1970-01-01 10:46:41 +08:00' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampMillisecond(Some(10001), None)),
+                Expr::Literal(ScalarValue::TimestampMillisecond(Some(10001), None), None),
                 r#"CAST('1970-01-01 00:00:10.001' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampMillisecond(
-                    Some(10001),
-                    Some("+08:00".into()),
-                )),
+                Expr::Literal(
+                    ScalarValue::TimestampMillisecond(Some(10001), Some("+08:00".into())),
+                    None,
+                ),
                 r#"CAST('1970-01-01 08:00:10.001 +08:00' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampMicrosecond(Some(10001), None)),
+                Expr::Literal(ScalarValue::TimestampMicrosecond(Some(10001), None), None),
                 r#"CAST('1970-01-01 00:00:00.010001' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampMicrosecond(
-                    Some(10001),
-                    Some("+08:00".into()),
-                )),
+                Expr::Literal(
+                    ScalarValue::TimestampMicrosecond(Some(10001), Some("+08:00".into())),
+                    None,
+                ),
                 r#"CAST('1970-01-01 08:00:00.010001 +08:00' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampNanosecond(Some(10001), None)),
+                Expr::Literal(ScalarValue::TimestampNanosecond(Some(10001), None), None),
                 r#"CAST('1970-01-01 00:00:00.000010001' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::TimestampNanosecond(
-                    Some(10001),
-                    Some("+08:00".into()),
-                )),
+                Expr::Literal(
+                    ScalarValue::TimestampNanosecond(Some(10001), Some("+08:00".into())),
+                    None,
+                ),
                 r#"CAST('1970-01-01 08:00:00.000010001 +08:00' AS TIMESTAMP)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Time32Second(Some(10001))),
+                Expr::Literal(ScalarValue::Time32Second(Some(10001)), None),
                 r#"CAST('02:46:41' AS TIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Time32Millisecond(Some(10001))),
+                Expr::Literal(ScalarValue::Time32Millisecond(Some(10001)), None),
                 r#"CAST('00:00:10.001' AS TIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Time64Microsecond(Some(10001))),
+                Expr::Literal(ScalarValue::Time64Microsecond(Some(10001)), None),
                 r#"CAST('00:00:00.010001' AS TIME)"#,
             ),
             (
-                Expr::Literal(ScalarValue::Time64Nanosecond(Some(10001))),
+                Expr::Literal(ScalarValue::Time64Nanosecond(Some(10001)), None),
                 r#"CAST('00:00:00.000010001' AS TIME)"#,
             ),
             (sum(col("a")), r#"sum(a)"#),
@@ -2019,7 +2127,7 @@ mod tests {
                 "count(*) FILTER (WHERE true)",
             ),
             (
-                Expr::WindowFunction(WindowFunction {
+                Expr::from(WindowFunction {
                     fun: WindowFunctionDefinition::WindowUDF(row_number_udwf()),
                     params: WindowFunctionParams {
                         args: vec![col("col")],
@@ -2027,13 +2135,15 @@ mod tests {
                         order_by: vec![],
                         window_frame: WindowFrame::new(None),
                         null_treatment: None,
+                        distinct: false,
+                        filter: None,
                     },
                 }),
                 r#"row_number(col) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)"#,
             ),
             (
                 #[expect(deprecated)]
-                Expr::WindowFunction(WindowFunction {
+                Expr::from(WindowFunction {
                     fun: WindowFunctionDefinition::AggregateUDF(count_udaf()),
                     params: WindowFunctionParams {
                         args: vec![Expr::Wildcard {
@@ -2052,9 +2162,11 @@ mod tests {
                             ),
                         ),
                         null_treatment: None,
+                        distinct: false,
+                        filter: Some(Box::new(col("a").gt(lit(100)))),
                     },
                 }),
-                r#"count(*) OVER (ORDER BY a DESC NULLS FIRST RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING)"#,
+                r#"count(*) FILTER (WHERE (a > 100)) OVER (ORDER BY a DESC NULLS FIRST RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING)"#,
             ),
             (col("a").is_not_null(), r#"a IS NOT NULL"#),
             (col("a").is_null(), r#"a IS NULL"#),
@@ -2105,12 +2217,15 @@ mod tests {
                 r#"TRY_CAST(a AS INTEGER UNSIGNED)"#,
             ),
             (
-                Expr::ScalarVariable(Int8, vec![String::from("@a")]),
+                Expr::ScalarVariable(
+                    Int8.into_nullable_field_ref(),
+                    vec![String::from("@a")],
+                ),
                 r#"@a"#,
             ),
             (
                 Expr::ScalarVariable(
-                    Int8,
+                    Int8.into_nullable_field_ref(),
                     vec![String::from("@root"), String::from("foo")],
                 ),
                 r#"@root.foo"#,
@@ -2135,26 +2250,35 @@ mod tests {
             (col("need quoted").eq(lit(1)), r#"("need quoted" = 1)"#),
             // See test_interval_scalar_to_expr for interval literals
             (
-                (col("a") + col("b")).gt(Expr::Literal(ScalarValue::Decimal128(
-                    Some(100123),
-                    28,
-                    3,
-                ))),
+                (col("a") + col("b")).gt(Expr::Literal(
+                    ScalarValue::Decimal32(Some(1123), 4, 3),
+                    None,
+                )),
+                r#"((a + b) > 1.123)"#,
+            ),
+            (
+                (col("a") + col("b")).gt(Expr::Literal(
+                    ScalarValue::Decimal64(Some(1123), 4, 3),
+                    None,
+                )),
+                r#"((a + b) > 1.123)"#,
+            ),
+            (
+                (col("a") + col("b")).gt(Expr::Literal(
+                    ScalarValue::Decimal128(Some(100123), 28, 3),
+                    None,
+                )),
                 r#"((a + b) > 100.123)"#,
             ),
             (
-                (col("a") + col("b")).gt(Expr::Literal(ScalarValue::Decimal256(
-                    Some(100123.into()),
-                    28,
-                    3,
-                ))),
+                (col("a") + col("b")).gt(Expr::Literal(
+                    ScalarValue::Decimal256(Some(100123.into()), 28, 3),
+                    None,
+                )),
                 r#"((a + b) > 100.123)"#,
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(col("a")),
-                    data_type: DataType::Decimal128(10, -2),
-                }),
+                Expr::Cast(Cast::new(Box::new(col("a")), DataType::Decimal128(10, -2))),
                 r#"CAST(a AS DECIMAL(12,0))"#,
             ),
             (
@@ -2183,28 +2307,50 @@ mod tests {
                 "MAP {'a': 1, 'b': 2}",
             ),
             (
-                Expr::Literal(ScalarValue::Dictionary(
-                    Box::new(DataType::Int32),
-                    Box::new(ScalarValue::Utf8(Some("foo".into()))),
-                )),
+                Expr::Literal(
+                    ScalarValue::Dictionary(
+                        Box::new(DataType::Int32),
+                        Box::new(ScalarValue::Utf8(Some("foo".into()))),
+                    ),
+                    None,
+                ),
                 "'foo'",
             ),
             (
-                Expr::Literal(ScalarValue::List(Arc::new(
-                    ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
+                Expr::Literal(
+                    ScalarValue::RunEndEncoded(
+                        Field::new("run_ends", DataType::Int32, false).into(),
+                        Field::new("values", DataType::Utf8, true).into(),
+                        Box::new(ScalarValue::Utf8(Some("foo".into()))),
+                    ),
+                    None,
+                ),
+                "'foo'",
+            ),
+            (
+                Expr::Literal(
+                    ScalarValue::List(Arc::new(ListArray::from_iter_primitive::<
+                        Int32Type,
+                        _,
+                        _,
+                    >(vec![Some(vec![
                         Some(1),
                         Some(2),
                         Some(3),
-                    ])]),
-                ))),
+                    ])]))),
+                    None,
+                ),
                 "[1, 2, 3]",
             ),
             (
-                Expr::Literal(ScalarValue::LargeList(Arc::new(
-                    LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(
-                        vec![Some(1), Some(2), Some(3)],
-                    )]),
-                ))),
+                Expr::Literal(
+                    ScalarValue::LargeList(Arc::new(
+                        LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+                            Some(vec![Some(1), Some(2), Some(3)]),
+                        ]),
+                    )),
+                    None,
+                ),
                 "[1, 2, 3]",
             ),
             (
@@ -2250,7 +2396,6 @@ mod tests {
 
         let expected = r#"('a' > 4)"#;
         assert_eq!(actual, expected);
-
         Ok(())
     }
 
@@ -2280,10 +2425,7 @@ mod tests {
                 .build();
             let unparser = Unparser::new(&dialect);
 
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type: DataType::Date64,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), DataType::Date64));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2305,10 +2447,7 @@ mod tests {
                 .build();
             let unparser = Unparser::new(&dialect);
 
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type: DataType::Float64,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2509,11 +2648,17 @@ mod tests {
     #[test]
     fn test_float_scalar_to_expr() {
         let tests = [
-            (Expr::Literal(ScalarValue::Float64(Some(3f64))), "3.0"),
-            (Expr::Literal(ScalarValue::Float64(Some(3.1f64))), "3.1"),
-            (Expr::Literal(ScalarValue::Float32(Some(-2f32))), "-2.0"),
+            (Expr::Literal(ScalarValue::Float64(Some(3f64)), None), "3.0"),
             (
-                Expr::Literal(ScalarValue::Float32(Some(-2.989f32))),
+                Expr::Literal(ScalarValue::Float64(Some(3.1f64)), None),
+                "3.1",
+            ),
+            (
+                Expr::Literal(ScalarValue::Float32(Some(-2f32)), None),
+                "-2.0",
+            ),
+            (
+                Expr::Literal(ScalarValue::Float32(Some(-2.989f32)), None),
                 "-2.989",
             ),
         ];
@@ -2532,21 +2677,23 @@ mod tests {
     fn test_cast_value_to_binary_expr() {
         let tests = [
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some(
-                        "blah".to_string(),
-                    )))),
-                    data_type: DataType::Binary,
-                }),
+                Expr::Cast(Cast::new(
+                    Box::new(Expr::Literal(
+                        ScalarValue::Utf8(Some("blah".to_string())),
+                        None,
+                    )),
+                    DataType::Binary,
+                )),
                 "'blah'",
             ),
             (
-                Expr::Cast(Cast {
-                    expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some(
-                        "blah".to_string(),
-                    )))),
-                    data_type: DataType::BinaryView,
-                }),
+                Expr::Cast(Cast::new(
+                    Box::new(Expr::Literal(
+                        ScalarValue::Utf8(Some("blah".to_string())),
+                        None,
+                    )),
+                    DataType::BinaryView,
+                )),
                 "'blah'",
             ),
         ];
@@ -2577,10 +2724,7 @@ mod tests {
         ] {
             let unparser = Unparser::new(dialect);
 
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), data_type));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2636,7 +2780,10 @@ mod tests {
             let expr = ScalarUDF::new_from_impl(
                 datafusion_functions::datetime::date_part::DatePartFunc::new(),
             )
-            .call(vec![Expr::Literal(ScalarValue::new_utf8(unit)), col("x")]);
+            .call(vec![
+                Expr::Literal(ScalarValue::new_utf8(unit), None),
+                col("x"),
+            ]);
 
             let ast = unparser.expr_to_sql(&expr)?;
             let actual = format!("{ast}");
@@ -2660,10 +2807,7 @@ mod tests {
             [(default_dialect, "BIGINT"), (mysql_dialect, "SIGNED")]
         {
             let unparser = Unparser::new(&dialect);
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type: DataType::Int64,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), DataType::Int64));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2688,10 +2832,7 @@ mod tests {
             [(default_dialect, "INTEGER"), (mysql_dialect, "SIGNED")]
         {
             let unparser = Unparser::new(&dialect);
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type: DataType::Int32,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), DataType::Int32));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2727,10 +2868,7 @@ mod tests {
             (&mysql_dialect, &timestamp_with_tz, "DATETIME"),
         ] {
             let unparser = Unparser::new(dialect);
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type: data_type.clone(),
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), data_type.clone()));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2756,10 +2894,10 @@ mod tests {
             (&mysql_dialect, "DATETIME"),
         ] {
             let unparser = Unparser::new(dialect);
-            let expr = Expr::Literal(ScalarValue::TimestampMillisecond(
-                Some(1738285549123),
+            let expr = Expr::Literal(
+                ScalarValue::TimestampMillisecond(Some(1738285549123), None),
                 None,
-            ));
+            );
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2783,10 +2921,7 @@ mod tests {
         ] {
             let unparser = Unparser::new(dialect);
 
-            let expr = Expr::Cast(Cast {
-                expr: Box::new(col("a")),
-                data_type,
-            });
+            let expr = Expr::Cast(Cast::new(Box::new(col("a")), data_type));
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = format!("{ast}");
@@ -2823,15 +2958,80 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_mssql_dialect_national_literal() -> Result<()> {
+        struct MsSqlDialect;
+
+        impl Dialect for MsSqlDialect {
+            fn identifier_quote_style(&self, _identifier: &str) -> Option<char> {
+                Some('[')
+            }
+
+            fn string_literal_to_sql(&self, s: &str) -> Option<ast::Expr> {
+                if !s.is_ascii() {
+                    Some(ast::Expr::value(ast::Value::NationalStringLiteral(
+                        s.to_string(),
+                    )))
+                } else {
+                    None
+                }
+            }
+        }
+
+        let dialect = MsSqlDialect;
+        let unparser = Unparser::new(&dialect);
+
+        // Get nation string literal for the custom mssql dialect
+        for (s, expected) in [
+            ("national string", "'national string'"),
+            ("datafusion資料融合", "N'datafusion資料融合'"),
+        ] {
+            let expr = Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+
+            let expr = Expr::Literal(ScalarValue::Utf8View(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+
+            let expr = Expr::Literal(ScalarValue::LargeUtf8(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+        }
+
+        let dialect = DefaultDialect {};
+        let unparser = Unparser::new(&dialect);
+
+        // Get normal string literal for default dialect
+        for (s, expected) in [
+            ("national string", "'national string'"),
+            ("datafusion資料融合", "'datafusion資料融合'"),
+        ] {
+            let expr = Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+
+            let expr = Expr::Literal(ScalarValue::Utf8View(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+
+            let expr = Expr::Literal(ScalarValue::LargeUtf8(Some(s.to_string())), None);
+            let ast = unparser.expr_to_sql(&expr)?;
+            assert_eq!(ast.to_string(), expected);
+        }
+        Ok(())
+    }
+
     #[test]
     fn test_cast_value_to_dict_expr() {
         let tests = [(
-            Expr::Cast(Cast {
-                expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some(
-                    "variation".to_string(),
-                )))),
-                data_type: DataType::Dictionary(Box::new(Int8), Box::new(DataType::Utf8)),
-            }),
+            Expr::Cast(Cast::new(
+                Box::new(Expr::Literal(
+                    ScalarValue::Utf8(Some("variation".to_string())),
+                    None,
+                )),
+                DataType::Dictionary(Box::new(Int8), Box::new(DataType::Utf8)),
+            )),
             "'variation'",
         )];
         for (value, expected) in tests {
@@ -2863,11 +3063,8 @@ mod tests {
                     datafusion_functions::math::round::RoundFunc::new(),
                 )),
                 args: vec![
-                    Expr::Cast(Cast {
-                        expr: Box::new(col("a")),
-                        data_type: DataType::Float64,
-                    }),
-                    Expr::Literal(ScalarValue::Int64(Some(2))),
+                    Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64)),
+                    Expr::Literal(ScalarValue::Int64(Some(2)), None),
                 ],
             });
             let ast = unparser.expr_to_sql(&expr)?;
@@ -2902,7 +3099,7 @@ mod tests {
             let func = WindowFunctionDefinition::WindowUDF(rank_udwf());
             let mut window_func = WindowFunction::new(func, vec![]);
             window_func.params.order_by = vec![Sort::new(col("a"), true, true)];
-            let expr = Expr::WindowFunction(window_func);
+            let expr = Expr::from(window_func);
             let ast = unparser.expr_to_sql(&expr)?;
 
             let actual = ast.to_string();
@@ -3007,7 +3204,7 @@ mod tests {
                     datafusion_functions::datetime::date_trunc::DateTruncFunc::new(),
                 )),
                 args: vec![
-                    Expr::Literal(ScalarValue::Utf8(Some(precision.to_string()))),
+                    Expr::Literal(ScalarValue::Utf8(Some(precision.to_string())), None),
                     col("date_col"),
                 ],
             });
@@ -3028,10 +3225,31 @@ mod tests {
 
         let unparser = Unparser::new(&dialect);
 
-        let ast_dtype = unparser.arrow_dtype_to_ast_dtype(&DataType::Dictionary(
-            Box::new(DataType::Int32),
-            Box::new(DataType::Utf8),
-        ))?;
+        let arrow_field = Arc::new(Field::new(
+            "",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+        ));
+        let ast_dtype = unparser.arrow_dtype_to_ast_dtype(&arrow_field)?;
+
+        assert_eq!(ast_dtype, ast::DataType::Varchar(None));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_run_end_encoded_to_sql() -> Result<()> {
+        let dialect = CustomDialectBuilder::new().build();
+
+        let unparser = Unparser::new(&dialect);
+
+        let ast_dtype = unparser.arrow_dtype_to_ast_dtype(
+            &DataType::RunEndEncoded(
+                Field::new("run_ends", DataType::Int32, false).into(),
+                Field::new("values", DataType::Utf8, true).into(),
+            )
+            .into_nullable_field_ref(),
+        )?;
 
         assert_eq!(ast_dtype, ast::DataType::Varchar(None));
 
@@ -3045,7 +3263,8 @@ mod tests {
             .build();
         let unparser = Unparser::new(&dialect);
 
-        let ast_dtype = unparser.arrow_dtype_to_ast_dtype(&DataType::Utf8View)?;
+        let arrow_field = Arc::new(Field::new("", DataType::Utf8View, true));
+        let ast_dtype = unparser.arrow_dtype_to_ast_dtype(&arrow_field)?;
 
         assert_eq!(ast_dtype, ast::DataType::Char(None));
 
@@ -3107,4 +3326,101 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_cast_timestamp_sqlite() -> Result<()> {
+        let dialect: Arc<dyn Dialect> = Arc::new(SqliteDialect {});
+
+        let unparser = Unparser::new(dialect.as_ref());
+        let expr = Expr::Cast(Cast::new(
+            Box::new(col("a")),
+            DataType::Timestamp(TimeUnit::Nanosecond, None),
+        ));
+
+        let ast = unparser.expr_to_sql(&expr)?;
+
+        let actual = ast.to_string();
+        let expected = "CAST(`a` AS TEXT)".to_string();
+
+        assert_eq!(actual, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_timestamp_with_tz_format() -> Result<()> {
+        let default_dialect: Arc<dyn Dialect> =
+            Arc::new(CustomDialectBuilder::new().build());
+
+        let duckdb_dialect: Arc<dyn Dialect> = Arc::new(DuckDBDialect::new());
+
+        for (dialect, scalar, expected) in [
+            (
+                Arc::clone(&default_dialect),
+                ScalarValue::TimestampSecond(Some(1757934000), Some("+00:00".into())),
+                "CAST('2025-09-15 11:00:00 +00:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&default_dialect),
+                ScalarValue::TimestampMillisecond(
+                    Some(1757934000123),
+                    Some("+01:00".into()),
+                ),
+                "CAST('2025-09-15 12:00:00.123 +01:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&default_dialect),
+                ScalarValue::TimestampMicrosecond(
+                    Some(1757934000123456),
+                    Some("-01:00".into()),
+                ),
+                "CAST('2025-09-15 10:00:00.123456 -01:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&default_dialect),
+                ScalarValue::TimestampNanosecond(
+                    Some(1757934000123456789),
+                    Some("+00:00".into()),
+                ),
+                "CAST('2025-09-15 11:00:00.123456789 +00:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&duckdb_dialect),
+                ScalarValue::TimestampSecond(Some(1757934000), Some("+00:00".into())),
+                "CAST('2025-09-15 11:00:00+00:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&duckdb_dialect),
+                ScalarValue::TimestampMillisecond(
+                    Some(1757934000123),
+                    Some("+01:00".into()),
+                ),
+                "CAST('2025-09-15 12:00:00.123+01:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&duckdb_dialect),
+                ScalarValue::TimestampMicrosecond(
+                    Some(1757934000123456),
+                    Some("-01:00".into()),
+                ),
+                "CAST('2025-09-15 10:00:00.123456-01:00' AS TIMESTAMP)",
+            ),
+            (
+                Arc::clone(&duckdb_dialect),
+                ScalarValue::TimestampNanosecond(
+                    Some(1757934000123456789),
+                    Some("+00:00".into()),
+                ),
+                "CAST('2025-09-15 11:00:00.123456789+00:00' AS TIMESTAMP)",
+            ),
+        ] {
+            let unparser = Unparser::new(dialect.as_ref());
+
+            let expr = Expr::Literal(scalar, None);
+
+            let actual = format!("{}", unparser.expr_to_sql(&expr)?);
+            assert_eq!(actual, expected);
+        }
+        Ok(())
+    }
 }
diff --git a/datafusion/sql/src/unparser/extension_unparser.rs b/datafusion/sql/src/unparser/extension_unparser.rs
index b778130ca5a27..6633b38cf27cc 100644
--- a/datafusion/sql/src/unparser/extension_unparser.rs
+++ b/datafusion/sql/src/unparser/extension_unparser.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::unparser::ast::{QueryBuilder, RelationBuilder, SelectBuilder};
 use crate::unparser::Unparser;
+use crate::unparser::ast::{QueryBuilder, RelationBuilder, SelectBuilder};
 use datafusion_expr::UserDefinedLogicalNode;
 use sqlparser::ast::Statement;
 
@@ -64,7 +64,7 @@ pub enum UnparseWithinStatementResult {
 }
 
 /// The result of unparsing a custom logical node to a statement.
-#[allow(clippy::large_enum_variant)]
+#[expect(clippy::large_enum_variant)]
 pub enum UnparseToStatementResult {
     /// If the custom logical node was successfully unparsed to a statement.
     Modified(Statement),
diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs
index e89e25ddb15ac..9f770f9f45e1d 100644
--- a/datafusion/sql/src/unparser/plan.rs
+++ b/datafusion/sql/src/unparser/plan.rs
@@ -16,38 +16,38 @@
 // under the License.
 
 use super::{
+    Unparser,
     ast::{
         BuilderError, DerivedRelationBuilder, QueryBuilder, RelationBuilder,
         SelectBuilder, TableRelationBuilder, TableWithJoinsBuilder,
     },
     rewrite::{
-        inject_column_aliases_into_subquery, normalize_union_schema,
+        TableAliasRewriter, inject_column_aliases_into_subquery, normalize_union_schema,
         rewrite_plan_for_sort_on_non_projected_fields,
-        subquery_alias_inner_query_and_columns, TableAliasRewriter,
+        subquery_alias_inner_query_and_columns,
     },
     utils::{
         find_agg_node_within_select, find_unnest_node_within_select,
         find_window_nodes_within_select, try_transform_to_simple_table_scan_with_filters,
         unproject_sort_expr, unproject_unnest_expr, unproject_window_exprs,
     },
-    Unparser,
 };
-use crate::unparser::ast::UnnestRelationBuilder;
 use crate::unparser::extension_unparser::{
     UnparseToStatementResult, UnparseWithinStatementResult,
 };
 use crate::unparser::utils::{find_unnest_node_until_relation, unproject_agg_exprs};
+use crate::unparser::{ast::UnnestRelationBuilder, rewrite::rewrite_qualify};
 use crate::utils::UNNEST_PLACEHOLDER;
 use datafusion_common::{
+    Column, DataFusionError, Result, ScalarValue, TableReference, assert_or_internal_err,
     internal_err, not_impl_err,
     tree_node::{TransformedResult, TreeNode},
-    Column, DataFusionError, Result, ScalarValue, TableReference,
 };
 use datafusion_expr::expr::OUTER_REFERENCE_COLUMN_PREFIX;
 use datafusion_expr::{
-    expr::Alias, BinaryExpr, Distinct, Expr, JoinConstraint, JoinType, LogicalPlan,
+    BinaryExpr, Distinct, Expr, JoinConstraint, JoinType, LogicalPlan,
     LogicalPlanBuilder, Operator, Projection, SortExpr, TableScan, Unnest,
-    UserDefinedLogicalNode,
+    UserDefinedLogicalNode, expr::Alias,
 };
 use sqlparser::ast::{self, Ident, OrderByKind, SetExpr, TableAliasColumnDef};
 use std::{sync::Arc, vec};
@@ -81,9 +81,13 @@ use std::{sync::Arc, vec};
 ///     .unwrap()
 ///     .build()
 ///     .unwrap();
-/// let sql = plan_to_sql(&plan).unwrap(); // convert to AST
+/// // convert to AST
+/// let sql = plan_to_sql(&plan).unwrap();
 /// // use the Display impl to convert to SQL text
-/// assert_eq!(sql.to_string(), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"")
+/// assert_eq!(
+///     sql.to_string(),
+///     "SELECT \"table\".id, \"table\".\"value\" FROM \"table\""
+/// )
 /// ```
 ///
 /// [`SqlToRel::sql_statement_to_plan`]: crate::planner::SqlToRel::sql_statement_to_plan
@@ -95,7 +99,10 @@ pub fn plan_to_sql(plan: &LogicalPlan) -> Result<ast::Statement> {
 
 impl Unparser<'_> {
     pub fn plan_to_sql(&self, plan: &LogicalPlan) -> Result<ast::Statement> {
-        let plan = normalize_union_schema(plan)?;
+        let mut plan = normalize_union_schema(plan)?;
+        if !self.dialect.supports_qualify() {
+            plan = rewrite_qualify(plan)?;
+        }
 
         match plan {
             LogicalPlan::Projection(_)
@@ -333,7 +340,7 @@ impl Unparser<'_> {
     ) -> Result<()> {
         match plan {
             LogicalPlan::TableScan(scan) => {
-                if let Some(unparsed_table_scan) = Self::unparse_table_scan_pushdown(
+                if let Some(unparsed_table_scan) = self.unparse_table_scan_pushdown(
                     plan,
                     None,
                     select.already_projected(),
@@ -377,20 +384,19 @@ impl Unparser<'_> {
                 } else {
                     None
                 };
-                if self.dialect.unnest_as_table_factor() && unnest_input_type.is_some() {
-                    if let LogicalPlan::Unnest(unnest) = &p.input.as_ref() {
-                        if let Some(unnest_relation) =
-                            self.try_unnest_to_table_factor_sql(unnest)?
-                        {
-                            relation.unnest(unnest_relation);
-                            return self.select_to_sql_recursively(
-                                p.input.as_ref(),
-                                query,
-                                select,
-                                relation,
-                            );
-                        }
-                    }
+                if self.dialect.unnest_as_table_factor()
+                    && unnest_input_type.is_some()
+                    && let LogicalPlan::Unnest(unnest) = &p.input.as_ref()
+                    && let Some(unnest_relation) =
+                        self.try_unnest_to_table_factor_sql(unnest)?
+                {
+                    relation.unnest(unnest_relation);
+                    return self.select_to_sql_recursively(
+                        p.input.as_ref(),
+                        query,
+                        select,
+                        relation,
+                    );
                 }
 
                 // If it's a unnest projection, we should provide the table column alias
@@ -428,6 +434,18 @@ impl Unparser<'_> {
                         unproject_agg_exprs(filter.predicate.clone(), agg, None)?;
                     let filter_expr = self.expr_to_sql(&unprojected)?;
                     select.having(Some(filter_expr));
+                } else if let (Some(window), true) = (
+                    find_window_nodes_within_select(
+                        plan,
+                        None,
+                        select.already_projected(),
+                    ),
+                    self.dialect.supports_qualify(),
+                ) {
+                    let unprojected =
+                        unproject_window_exprs(filter.predicate.clone(), &window)?;
+                    let filter_expr = self.expr_to_sql(&unprojected)?;
+                    select.qualify(Some(filter_expr));
                 } else {
                     let filter_expr = self.expr_to_sql(&filter.predicate)?;
                     select.selection(Some(filter_expr));
@@ -466,6 +484,7 @@ impl Unparser<'_> {
                             "Offset operator only valid in a statement context."
                         );
                     };
+
                     query.offset(Some(ast::Offset {
                         rows: ast::OffsetRows::None,
                         value: self.expr_to_sql(skip)?,
@@ -480,16 +499,6 @@ impl Unparser<'_> {
                 )
             }
             LogicalPlan::Sort(sort) => {
-                // Sort can be top-level plan for derived table
-                if select.already_projected() {
-                    return self.derive_with_dialect_alias(
-                        "derived_sort",
-                        plan,
-                        relation,
-                        false,
-                        vec![],
-                    );
-                }
                 let Some(query_ref) = query else {
                     return internal_err!(
                         "Sort operator only valid in a statement context."
@@ -565,18 +574,17 @@ impl Unparser<'_> {
 
                 // If this distinct is the parent of a Union and we're in a query context,
                 // then we need to unparse as a `UNION` rather than a `UNION ALL`.
-                if let Distinct::All(input) = distinct {
-                    if matches!(input.as_ref(), LogicalPlan::Union(_)) {
-                        if let Some(query_mut) = query.as_mut() {
-                            query_mut.distinct_union();
-                            return self.select_to_sql_recursively(
-                                input.as_ref(),
-                                query,
-                                select,
-                                relation,
-                            );
-                        }
-                    }
+                if let Distinct::All(input) = distinct
+                    && matches!(input.as_ref(), LogicalPlan::Union(_))
+                    && let Some(query_mut) = query.as_mut()
+                {
+                    query_mut.distinct_union();
+                    return self.select_to_sql_recursively(
+                        input.as_ref(),
+                        query,
+                        select,
+                        relation,
+                    );
                 }
 
                 let (select_distinct, input) = match distinct {
@@ -695,13 +703,6 @@ impl Unparser<'_> {
                     join_filters.as_ref(),
                 )?;
 
-                self.select_to_sql_recursively(
-                    right_plan.as_ref(),
-                    query,
-                    select,
-                    &mut right_relation,
-                )?;
-
                 let right_projection: Option<Vec<ast::SelectItem>> = if !already_projected
                 {
                     Some(select.pop_projections())
@@ -714,7 +715,8 @@ impl Unparser<'_> {
                     | JoinType::LeftAnti
                     | JoinType::LeftMark
                     | JoinType::RightSemi
-                    | JoinType::RightAnti => {
+                    | JoinType::RightAnti
+                    | JoinType::RightMark => {
                         let mut query_builder = QueryBuilder::default();
                         let mut from = TableWithJoinsBuilder::default();
                         let mut exists_select: SelectBuilder = SelectBuilder::default();
@@ -738,7 +740,8 @@ impl Unparser<'_> {
                         let negated = match join.join_type {
                             JoinType::LeftSemi
                             | JoinType::RightSemi
-                            | JoinType::LeftMark => false,
+                            | JoinType::LeftMark
+                            | JoinType::RightMark => false,
                             JoinType::LeftAnti | JoinType::RightAnti => true,
                             _ => unreachable!(),
                         };
@@ -746,13 +749,25 @@ impl Unparser<'_> {
                             subquery: Box::new(query_builder.build()?),
                             negated,
                         };
-                        if join.join_type == JoinType::LeftMark {
-                            let (table_ref, _) = right_plan.schema().qualified_field(0);
-                            let column = self
-                                .col_to_sql(&Column::new(table_ref.cloned(), "mark"))?;
-                            select.replace_mark(&column, &exists_expr);
-                        } else {
-                            select.selection(Some(exists_expr));
+
+                        match join.join_type {
+                            JoinType::LeftMark | JoinType::RightMark => {
+                                let source_schema =
+                                    if join.join_type == JoinType::LeftMark {
+                                        right_plan.schema()
+                                    } else {
+                                        left_plan.schema()
+                                    };
+                                let (table_ref, _) = source_schema.qualified_field(0);
+                                let column = self.col_to_sql(&Column::new(
+                                    table_ref.cloned(),
+                                    "mark",
+                                ))?;
+                                select.replace_mark(&column, &exists_expr);
+                            }
+                            _ => {
+                                select.selection(Some(exists_expr));
+                            }
                         }
                         if let Some(projection) = left_projection {
                             select.projection(projection);
@@ -785,7 +800,7 @@ impl Unparser<'_> {
 
                             let projection = left_projection
                                 .into_iter()
-                                .chain(right_projection.into_iter())
+                                .chain(right_projection)
                                 .collect();
                             select.projection(projection);
                         }
@@ -797,7 +812,7 @@ impl Unparser<'_> {
             LogicalPlan::SubqueryAlias(plan_alias) => {
                 let (plan, mut columns) =
                     subquery_alias_inner_query_and_columns(plan_alias);
-                let unparsed_table_scan = Self::unparse_table_scan_pushdown(
+                let unparsed_table_scan = self.unparse_table_scan_pushdown(
                     plan,
                     Some(plan_alias.alias.clone()),
                     select.already_projected(),
@@ -820,7 +835,7 @@ impl Unparser<'_> {
                             Err(e) => {
                                 return internal_err!(
                                     "Failed to transform SubqueryAlias plan: {e}"
-                                )
+                                );
                             }
                         };
 
@@ -860,9 +875,10 @@ impl Unparser<'_> {
                     .map(|input| self.select_to_sql_expr(input, query))
                     .collect::<Result<Vec<_>>>()?;
 
-                if input_exprs.len() < 2 {
-                    return internal_err!("UNION operator requires at least 2 inputs");
-                }
+                assert_or_internal_err!(
+                    input_exprs.len() >= 2,
+                    "UNION operator requires at least 2 inputs"
+                );
 
                 let set_quantifier =
                     if query.as_ref().is_some_and(|q| q.is_distinct_union()) {
@@ -930,12 +946,11 @@ impl Unparser<'_> {
                 }
             }
             LogicalPlan::Unnest(unnest) => {
-                if !unnest.struct_type_columns.is_empty() {
-                    return internal_err!(
-                        "Struct type columns are not currently supported in UNNEST: {:?}",
-                        unnest.struct_type_columns
-                    );
-                }
+                assert_or_internal_err!(
+                    unnest.struct_type_columns.is_empty(),
+                    "Struct type columns are not currently supported in UNNEST: {:?}",
+                    unnest.struct_type_columns
+                );
 
                 // In the case of UNNEST, the Unnest node is followed by a duplicate Projection node that we should skip.
                 // Otherwise, there will be a duplicate SELECT clause.
@@ -988,15 +1003,14 @@ impl Unparser<'_> {
     ///
     /// `outer_ref` is the display result of [Expr::OuterReferenceColumn]
     fn check_unnest_placeholder_with_outer_ref(expr: &Expr) -> Option<UnnestInputType> {
-        if let Expr::Alias(Alias { expr, .. }) = expr {
-            if let Expr::Column(Column { name, .. }) = expr.as_ref() {
-                if let Some(prefix) = name.strip_prefix(UNNEST_PLACEHOLDER) {
-                    if prefix.starts_with(&format!("({OUTER_REFERENCE_COLUMN_PREFIX}(")) {
-                        return Some(UnnestInputType::OuterReference);
-                    }
-                    return Some(UnnestInputType::Scalar);
-                }
+        if let Expr::Alias(Alias { expr, .. }) = expr
+            && let Expr::Column(Column { name, .. }) = expr.as_ref()
+            && let Some(prefix) = name.strip_prefix(UNNEST_PLACEHOLDER)
+        {
+            if prefix.starts_with(&format!("({OUTER_REFERENCE_COLUMN_PREFIX}(")) {
+                return Some(UnnestInputType::OuterReference);
             }
+            return Some(UnnestInputType::Scalar);
         }
         None
     }
@@ -1038,6 +1052,7 @@ impl Unparser<'_> {
     /// Try to unparse a table scan with pushdown operations into a new subquery plan.
     /// If the table scan is without any pushdown operations, return None.
     fn unparse_table_scan_pushdown(
+        &self,
         plan: &LogicalPlan,
         alias: Option<TableReference>,
         already_projected: bool,
@@ -1064,41 +1079,37 @@ impl Unparser<'_> {
                 //
                 // Example:
                 //   select t1.c1 from t1 where t1.c1 > 1 -> select a.c1 from t1 as a where a.c1 > 1
-                if let Some(ref alias) = alias {
-                    if table_scan.projection.is_some() || !table_scan.filters.is_empty() {
-                        builder = builder.alias(alias.clone())?;
-                    }
+                if let Some(ref alias) = alias
+                    && (table_scan.projection.is_some() || !table_scan.filters.is_empty())
+                {
+                    builder = builder.alias(alias.clone())?;
                 }
 
                 // Avoid creating a duplicate Projection node, which would result in an additional subquery if a projection already exists.
                 // For example, if the `optimize_projection` rule is applied, there will be a Projection node, and duplicate projection
                 // information included in the TableScan node.
-                if !already_projected {
-                    if let Some(project_vec) = &table_scan.projection {
-                        if project_vec.is_empty() {
-                            builder = builder.project(vec![Expr::Literal(
-                                ScalarValue::Int64(Some(1)),
-                            )])?;
-                        } else {
-                            let project_columns = project_vec
-                                .iter()
-                                .cloned()
-                                .map(|i| {
-                                    let schema = table_scan.source.schema();
-                                    let field = schema.field(i);
-                                    if alias.is_some() {
-                                        Column::new(alias.clone(), field.name().clone())
-                                    } else {
-                                        Column::new(
-                                            Some(table_scan.table_name.clone()),
-                                            field.name().clone(),
-                                        )
-                                    }
-                                })
-                                .collect::<Vec<_>>();
-                            builder = builder.project(project_columns)?;
-                        };
-                    }
+                if !already_projected && let Some(project_vec) = &table_scan.projection {
+                    if project_vec.is_empty() {
+                        builder = builder.project(self.empty_projection_fallback())?;
+                    } else {
+                        let project_columns = project_vec
+                            .iter()
+                            .cloned()
+                            .map(|i| {
+                                let schema = table_scan.source.schema();
+                                let field = schema.field(i);
+                                if alias.is_some() {
+                                    Column::new(alias.clone(), field.name().clone())
+                                } else {
+                                    Column::new(
+                                        Some(table_scan.table_name.clone()),
+                                        field.name().clone(),
+                                    )
+                                }
+                            })
+                            .collect::<Vec<_>>();
+                        builder = builder.project(project_columns)?;
+                    };
                 }
 
                 let filter_expr: Result<Option<Expr>> = table_scan
@@ -1131,32 +1142,33 @@ impl Unparser<'_> {
                 // So we will append the alias to this subquery.
                 // Example:
                 //   select * from t1 limit 10 -> (select * from t1 limit 10) as a
-                if let Some(alias) = alias {
-                    if table_scan.projection.is_none() && table_scan.filters.is_empty() {
-                        builder = builder.alias(alias)?;
-                    }
+                if let Some(alias) = alias
+                    && table_scan.projection.is_none()
+                    && table_scan.filters.is_empty()
+                {
+                    builder = builder.alias(alias)?;
                 }
 
                 Ok(Some(builder.build()?))
             }
             LogicalPlan::SubqueryAlias(subquery_alias) => {
-                let ret = Self::unparse_table_scan_pushdown(
+                let ret = self.unparse_table_scan_pushdown(
                     &subquery_alias.input,
                     Some(subquery_alias.alias.clone()),
                     already_projected,
                 )?;
-                if let Some(alias) = alias {
-                    if let Some(plan) = ret {
-                        let plan = LogicalPlanBuilder::new(plan).alias(alias)?.build()?;
-                        return Ok(Some(plan));
-                    }
+                if let Some(alias) = alias
+                    && let Some(plan) = ret
+                {
+                    let plan = LogicalPlanBuilder::new(plan).alias(alias)?.build()?;
+                    return Ok(Some(plan));
                 }
                 Ok(ret)
             }
             // SubqueryAlias could be rewritten to a plan with a projection as the top node by [rewrite::subquery_alias_inner_query_and_columns].
             // The inner table scan could be a scan with pushdown operations.
             LogicalPlan::Projection(projection) => {
-                if let Some(plan) = Self::unparse_table_scan_pushdown(
+                if let Some(plan) = self.unparse_table_scan_pushdown(
                     &projection.input,
                     alias.clone(),
                     already_projected,
@@ -1198,9 +1210,18 @@ impl Unparser<'_> {
             Expr::Alias(Alias { expr, name, .. }) => {
                 let inner = self.expr_to_sql(expr)?;
 
+                // Determine the alias name to use
+                let col_name = if let Some(rewritten_name) =
+                    self.dialect.col_alias_overrides(name)?
+                {
+                    rewritten_name.to_string()
+                } else {
+                    name.to_string()
+                };
+
                 Ok(ast::SelectItem::ExprWithAlias {
                     expr: inner,
-                    alias: self.new_ident_quoted_if_needs(name.to_string()),
+                    alias: self.new_ident_quoted_if_needs(col_name),
                 })
             }
             _ => {
@@ -1233,7 +1254,7 @@ impl Unparser<'_> {
                 ast::JoinConstraint::None => {
                     // Inner joins with no conditions or filters are not valid SQL in most systems,
                     // return a CROSS JOIN instead
-                    ast::JoinOperator::CrossJoin
+                    ast::JoinOperator::CrossJoin(constraint)
                 }
             },
             JoinType::Left => ast::JoinOperator::LeftOuter(constraint),
@@ -1243,7 +1264,9 @@ impl Unparser<'_> {
             JoinType::LeftSemi => ast::JoinOperator::LeftSemi(constraint),
             JoinType::RightAnti => ast::JoinOperator::RightAnti(constraint),
             JoinType::RightSemi => ast::JoinOperator::RightSemi(constraint),
-            JoinType::LeftMark => unimplemented!("Unparsing of Left Mark join type"),
+            JoinType::LeftMark | JoinType::RightMark => {
+                unimplemented!("Unparsing of Mark join type")
+            }
         })
     }
 
@@ -1372,12 +1395,24 @@ impl Unparser<'_> {
         ast::TableAlias {
             name: self.new_ident_quoted_if_needs(alias),
             columns,
+            explicit: true,
         }
     }
 
     fn dml_to_sql(&self, plan: &LogicalPlan) -> Result<ast::Statement> {
         not_impl_err!("Unsupported plan: {plan:?}")
     }
+
+    /// Generates appropriate projection expression for empty projection lists.
+    /// Returns an empty vec for dialects supporting empty select lists,
+    /// or a dummy literal `1` for other dialects.
+    fn empty_projection_fallback(&self) -> Vec<Expr> {
+        if self.dialect.supports_empty_select_list() {
+            Vec::new()
+        } else {
+            vec![Expr::Literal(ScalarValue::Int64(Some(1)), None)]
+        }
+    }
 }
 
 impl From<BuilderError> for DataFusionError {
diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs
index aa480cf4fff92..ec1b17cd28a91 100644
--- a/datafusion/sql/src/unparser/rewrite.rs
+++ b/datafusion/sql/src/unparser/rewrite.rs
@@ -20,8 +20,8 @@ use std::{collections::HashSet, sync::Arc};
 use arrow::datatypes::Schema;
 use datafusion_common::tree_node::TreeNodeContainer;
 use datafusion_common::{
-    tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter},
     Column, HashMap, Result, TableReference,
+    tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter},
 };
 use datafusion_expr::expr::{Alias, UNNEST_COLUMN_PREFIX};
 use datafusion_expr::{Expr, LogicalPlan, Projection, Sort, SortExpr};
@@ -100,6 +100,71 @@ fn rewrite_sort_expr_for_union(exprs: Vec<SortExpr>) -> Result<Vec<SortExpr>> {
     Ok(sort_exprs)
 }
 
+/// Rewrite Filter plans that have a Window as their input by inserting a SubqueryAlias.
+///
+/// When a Filter directly operates on a Window plan, it can cause issues during SQL unparsing
+/// because window functions in a WHERE clause are not valid SQL. The solution is to wrap
+/// the Window plan in a SubqueryAlias, effectively creating a derived table.
+///
+/// Example transformation:
+///
+/// Filter: condition
+///   Window: window_function
+///     TableScan: table
+///
+/// becomes:
+///
+/// Filter: condition
+///   SubqueryAlias: __qualify_subquery
+///     Projection: table.column1, table.column2
+///       Window: window_function
+///         TableScan: table
+pub(super) fn rewrite_qualify(plan: LogicalPlan) -> Result<LogicalPlan> {
+    let transformed_plan = plan.transform_up(|plan| match plan {
+        // Check if the filter's input is a Window plan
+        LogicalPlan::Filter(mut filter) => {
+            if matches!(&*filter.input, LogicalPlan::Window(_)) {
+                // Create a SubqueryAlias around the Window plan
+                let qualifier = filter
+                    .input
+                    .schema()
+                    .iter()
+                    .find_map(|(q, _)| q)
+                    .map(|q| q.to_string())
+                    .unwrap_or_else(|| "__qualify_subquery".to_string());
+
+                // for Postgres, name of column for 'rank() over (...)' is 'rank'
+                // but in Datafusion, it is 'rank() over (...)'
+                // without projection, it's still an invalid sql in Postgres
+
+                let project_exprs = filter
+                    .input
+                    .schema()
+                    .iter()
+                    .map(|(_, f)| datafusion_expr::col(f.name()).alias(f.name()))
+                    .collect::<Vec<_>>();
+
+                let input =
+                    datafusion_expr::LogicalPlanBuilder::from(Arc::clone(&filter.input))
+                        .project(project_exprs)?
+                        .build()?;
+
+                let subquery_alias =
+                    datafusion_expr::SubqueryAlias::try_new(Arc::new(input), qualifier)?;
+
+                filter.input = Arc::new(LogicalPlan::SubqueryAlias(subquery_alias));
+                Ok(Transformed::yes(LogicalPlan::Filter(filter)))
+            } else {
+                Ok(Transformed::no(LogicalPlan::Filter(filter)))
+            }
+        }
+
+        _ => Ok(Transformed::no(plan)),
+    });
+
+    transformed_plan.data()
+}
+
 /// Rewrite logic plan for query that order by columns are not in projections
 /// Plan before rewrite:
 ///
@@ -246,7 +311,7 @@ pub(super) fn subquery_alias_inner_query_and_columns(
     //     Projection: j1.j1_id AS id
     //       Projection: j1.j1_id
     for (i, inner_expr) in inner_projection.expr.iter().enumerate() {
-        let Expr::Alias(ref outer_alias) = &outer_projections.expr[i] else {
+        let Expr::Alias(outer_alias) = &outer_projections.expr[i] else {
             return (plan, vec![]);
         };
 
@@ -295,15 +360,14 @@ pub(super) fn find_unnest_column_alias(
         if projection.expr.len() != 1 {
             return (plan, None);
         }
-        if let Some(Expr::Alias(alias)) = projection.expr.first() {
-            if alias
+        if let Some(Expr::Alias(alias)) = projection.expr.first()
+            && alias
                 .expr
                 .schema_name()
                 .to_string()
                 .starts_with(&format!("{UNNEST_COLUMN_PREFIX}("))
-            {
-                return (projection.input.as_ref(), Some(alias.name.clone()));
-            }
+        {
+            return (projection.input.as_ref(), Some(alias.name.clone()));
         }
     }
     (plan, None)
diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs
index c36ffbfe5ecfb..f539c0ddc1e87 100644
--- a/datafusion/sql/src/unparser/utils.rs
+++ b/datafusion/sql/src/unparser/utils.rs
@@ -18,17 +18,17 @@
 use std::{cmp::Ordering, sync::Arc, vec};
 
 use super::{
-    dialect::CharacterLengthStyle, dialect::DateFieldExtractStyle,
-    rewrite::TableAliasRewriter, Unparser,
+    Unparser, dialect::CharacterLengthStyle, dialect::DateFieldExtractStyle,
+    rewrite::TableAliasRewriter,
 };
 use datafusion_common::{
+    Column, DataFusionError, Result, ScalarValue, assert_eq_or_internal_err,
     internal_err,
     tree_node::{Transformed, TransformedResult, TreeNode},
-    Column, DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::{
-    expr, utils::grouping_set_to_exprlist, Aggregate, Expr, LogicalPlan,
-    LogicalPlanBuilder, Projection, SortExpr, Unnest, Window,
+    Aggregate, Expr, LogicalPlan, LogicalPlanBuilder, Projection, SortExpr, Unnest,
+    Window, expr, utils::grouping_set_to_exprlist,
 };
 
 use indexmap::IndexSet;
@@ -166,14 +166,12 @@ pub(crate) fn unproject_unnest_expr(expr: Expr, unnest: &Unnest) -> Result<Expr>
                 // Check if the column is among the columns to run unnest on. 
                 // Currently, only List/Array columns (defined in `list_type_columns`) are supported for unnesting. 
                 if unnest.list_type_columns.iter().any(|e| e.1.output_column.name == col_ref.name) {
-                    if let Ok(idx) = unnest.schema.index_of_column(col_ref) {
-                        if let LogicalPlan::Projection(Projection { expr, .. }) = unnest.input.as_ref() {
-                            if let Some(unprojected_expr) = expr.get(idx) {
+                    if let Ok(idx) = unnest.schema.index_of_column(col_ref)
+                        && let LogicalPlan::Projection(Projection { expr, .. }) = unnest.input.as_ref()
+                            && let Some(unprojected_expr) = expr.get(idx) {
                                 let unnest_expr = Expr::Unnest(expr::Unnest::new(unprojected_expr.clone()));
                                 return Ok(Transformed::yes(unnest_expr));
                             }
-                        }
-                    }
                     return internal_err!(
                         "Tried to unproject unnest expr for column '{}' that was not found in the provided Unnest!", &col_ref.name
                     );
@@ -203,7 +201,7 @@ pub(crate) fn unproject_agg_exprs(
                     windows.and_then(|w| find_window_expr(w, &c.name).cloned())
                 {
                     // Window function can contain an aggregation columns, e.g., 'avg(sum(ss_sales_price)) over ...' that needs to be unprojected
-                    return Ok(Transformed::yes(unproject_agg_exprs(unprojected_expr, agg, None)?));
+                    Ok(Transformed::yes(unproject_agg_exprs(unprojected_expr, agg, None)?))
                 } else {
                     internal_err!(
                         "Tried to unproject agg expr for column '{}' that was not found in the provided Aggregate!", &c.name
@@ -291,14 +289,14 @@ pub(crate) fn unproject_sort_expr(
                     }
 
                     // In case of aggregation there could be columns containing aggregation functions we need to unproject
-                    if let Some(agg) = agg {
-                        if agg.schema.is_column_from_schema(&col) {
-                            return Ok(Transformed::yes(unproject_agg_exprs(
-                                Expr::Column(col),
-                                agg,
-                                None,
-                            )?));
-                        }
+                    if let Some(agg) = agg
+                        && agg.schema.is_column_from_schema(&col)
+                    {
+                        return Ok(Transformed::yes(unproject_agg_exprs(
+                            Expr::Column(col),
+                            agg,
+                            None,
+                        )?));
                     }
 
                     // If SELECT and ORDER BY contain the same expression with a scalar function, the ORDER BY expression will
@@ -306,14 +304,12 @@ pub(crate) fn unproject_sort_expr(
                     // to transform it back to the actual expression.
                     if let LogicalPlan::Projection(Projection { expr, schema, .. }) =
                         input
+                        && let Ok(idx) = schema.index_of_column(&col)
+                        && let Some(Expr::ScalarFunction(scalar_fn)) = expr.get(idx)
                     {
-                        if let Ok(idx) = schema.index_of_column(&col) {
-                            if let Some(Expr::ScalarFunction(scalar_fn)) = expr.get(idx) {
-                                return Ok(Transformed::yes(Expr::ScalarFunction(
-                                    scalar_fn.clone(),
-                                )));
-                            }
-                        }
+                        return Ok(Transformed::yes(Expr::ScalarFunction(
+                            scalar_fn.clone(),
+                        )));
                     }
 
                     Ok(Transformed::no(Expr::Column(col)))
@@ -422,7 +418,7 @@ pub(crate) fn date_part_to_sql(
     match (style, date_part_args.len()) {
         (DateFieldExtractStyle::Extract, 2) => {
             let date_expr = unparser.expr_to_sql(&date_part_args[1])?;
-            if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] {
+            if let Expr::Literal(ScalarValue::Utf8(Some(field)), _) = &date_part_args[0] {
                 let field = match field.to_lowercase().as_str() {
                     "year" => ast::DateTimeField::Year,
                     "month" => ast::DateTimeField::Month,
@@ -443,7 +439,7 @@ pub(crate) fn date_part_to_sql(
         (DateFieldExtractStyle::Strftime, 2) => {
             let column = unparser.expr_to_sql(&date_part_args[1])?;
 
-            if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] {
+            if let Expr::Literal(ScalarValue::Utf8(Some(field)), _) = &date_part_args[0] {
                 let field = match field.to_lowercase().as_str() {
                     "year" => "%Y",
                     "month" => "%m",
@@ -520,18 +516,18 @@ pub(crate) fn sqlite_from_unixtime_to_sql(
     unparser: &Unparser,
     from_unixtime_args: &[Expr],
 ) -> Result<Option<ast::Expr>> {
-    if from_unixtime_args.len() != 1 {
-        return internal_err!(
-            "from_unixtime for SQLite expects 1 argument, found {}",
-            from_unixtime_args.len()
-        );
-    }
+    assert_eq_or_internal_err!(
+        from_unixtime_args.len(),
+        1,
+        "from_unixtime for SQLite expects 1 argument, found {}",
+        from_unixtime_args.len()
+    );
 
     Ok(Some(unparser.scalar_function_to_sql(
         "datetime",
         &[
             from_unixtime_args[0].clone(),
-            Expr::Literal(ScalarValue::Utf8(Some("unixepoch".to_string()))),
+            Expr::Literal(ScalarValue::Utf8(Some("unixepoch".to_string())), None),
         ],
     )?))
 }
@@ -547,14 +543,14 @@ pub(crate) fn sqlite_date_trunc_to_sql(
     unparser: &Unparser,
     date_trunc_args: &[Expr],
 ) -> Result<Option<ast::Expr>> {
-    if date_trunc_args.len() != 2 {
-        return internal_err!(
-            "date_trunc for SQLite expects 2 arguments, found {}",
-            date_trunc_args.len()
-        );
-    }
-
-    if let Expr::Literal(ScalarValue::Utf8(Some(unit))) = &date_trunc_args[0] {
+    assert_eq_or_internal_err!(
+        date_trunc_args.len(),
+        2,
+        "date_trunc for SQLite expects 2 arguments, found {}",
+        date_trunc_args.len()
+    );
+
+    if let Expr::Literal(ScalarValue::Utf8(Some(unit)), _) = &date_trunc_args[0] {
         let format = match unit.to_lowercase().as_str() {
             "year" => "%Y",
             "month" => "%Y-%m",
@@ -568,7 +564,7 @@ pub(crate) fn sqlite_date_trunc_to_sql(
         return Ok(Some(unparser.scalar_function_to_sql(
             "strftime",
             &[
-                Expr::Literal(ScalarValue::Utf8(Some(format.to_string()))),
+                Expr::Literal(ScalarValue::Utf8(Some(format.to_string())), None),
                 date_trunc_args[1].clone(),
             ],
         )?));
diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs
index 8496be1d7f9aa..1a76dd69f46c5 100644
--- a/datafusion/sql/src/utils.rs
+++ b/datafusion/sql/src/utils.rs
@@ -20,14 +20,14 @@
 use std::vec;
 
 use arrow::datatypes::{
-    DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE,
+    DECIMAL_DEFAULT_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DataType,
 };
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
 };
 use datafusion_common::{
-    exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, Diagnostic,
-    HashMap, Result, ScalarValue,
+    Column, DFSchemaRef, Diagnostic, HashMap, Result, ScalarValue,
+    assert_or_internal_err, exec_datafusion_err, exec_err, internal_err, plan_err,
 };
 use datafusion_expr::builder::get_struct_unnested_columns;
 use datafusion_expr::expr::{
@@ -35,7 +35,7 @@ use datafusion_expr::expr::{
 };
 use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs};
 use datafusion_expr::{
-    col, expr_vec_fmt, ColumnUnnestList, Expr, ExprSchemable, LogicalPlan,
+    ColumnUnnestList, Expr, ExprSchemable, LogicalPlan, col, expr_vec_fmt,
 };
 
 use indexmap::IndexMap;
@@ -92,26 +92,41 @@ pub(crate) fn rebase_expr(
         .data()
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum CheckColumnsMustReferenceAggregatePurpose {
+    Projection,
+    Having,
+    Qualify,
+    OrderBy,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub(crate) enum CheckColumnsSatisfyExprsPurpose {
-    ProjectionMustReferenceAggregate,
-    HavingMustReferenceAggregate,
+    Aggregate(CheckColumnsMustReferenceAggregatePurpose),
 }
 
 impl CheckColumnsSatisfyExprsPurpose {
     fn message_prefix(&self) -> &'static str {
         match self {
-            CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => {
+            Self::Aggregate(CheckColumnsMustReferenceAggregatePurpose::Projection) => {
                 "Column in SELECT must be in GROUP BY or an aggregate function"
             }
-            CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => {
+            Self::Aggregate(CheckColumnsMustReferenceAggregatePurpose::Having) => {
                 "Column in HAVING must be in GROUP BY or an aggregate function"
             }
+            Self::Aggregate(CheckColumnsMustReferenceAggregatePurpose::Qualify) => {
+                "Column in QUALIFY must be in GROUP BY or an aggregate function"
+            }
+            Self::Aggregate(CheckColumnsMustReferenceAggregatePurpose::OrderBy) => {
+                "Column in ORDER BY must be in GROUP BY or an aggregate function"
+            }
         }
     }
 
     fn diagnostic_message(&self, expr: &Expr) -> String {
-        format!("'{expr}' must appear in GROUP BY clause because it's not an aggregate expression")
+        format!(
+            "'{expr}' must appear in GROUP BY clause because it's not an aggregate expression"
+        )
     }
 }
 
@@ -162,7 +177,7 @@ fn check_column_satisfies_expr(
             purpose.diagnostic_message(expr),
             expr.spans().and_then(|spans| spans.first()),
         )
-        .with_help(format!("Either add '{expr}' to GROUP BY clause, or use an aggregare function like ANY_VALUE({expr})"), None);
+        .with_help(format!("Either add '{expr}' to GROUP BY clause, or use an aggregate function like ANY_VALUE({expr})"), None);
 
         return plan_err!(
             "{}: While expanding wildcard, column \"{}\" must appear in the GROUP BY clause or must be part of an aggregate function, currently only \"{}\" appears in the SELECT clause satisfies this requirement",
@@ -198,7 +213,7 @@ pub(crate) fn resolve_positions_to_exprs(
     match expr {
         // sql_expr_to_logical_expr maps number to i64
         // https://github.com/apache/datafusion/blob/8d175c759e17190980f270b5894348dc4cff9bbf/datafusion/src/sql/planner.rs#L882-L887
-        Expr::Literal(ScalarValue::Int64(Some(position)))
+        Expr::Literal(ScalarValue::Int64(Some(position)), _)
             if position > 0_i64 && position <= select_exprs.len() as i64 =>
         {
             let index = (position - 1) as usize;
@@ -208,9 +223,10 @@ pub(crate) fn resolve_positions_to_exprs(
                 _ => select_expr.clone(),
             })
         }
-        Expr::Literal(ScalarValue::Int64(Some(position))) => plan_err!(
+        Expr::Literal(ScalarValue::Int64(Some(position)), _) => plan_err!(
             "Cannot find column with position {} in SELECT clause. Valid columns: 1 to {}",
-            position, select_exprs.len()
+            position,
+            select_exprs.len()
         ),
         _ => Ok(expr),
     }
@@ -241,15 +257,21 @@ pub fn window_expr_common_partition_keys(window_exprs: &[Expr]) -> Result<&[Expr
     let all_partition_keys = window_exprs
         .iter()
         .map(|expr| match expr {
-            Expr::WindowFunction(WindowFunction {
-                params: WindowFunctionParams { partition_by, .. },
-                ..
-            }) => Ok(partition_by),
-            Expr::Alias(Alias { expr, .. }) => match expr.as_ref() {
-                Expr::WindowFunction(WindowFunction {
+            Expr::WindowFunction(window_fun) => {
+                let WindowFunction {
                     params: WindowFunctionParams { partition_by, .. },
                     ..
-                }) => Ok(partition_by),
+                } = window_fun.as_ref();
+                Ok(partition_by)
+            }
+            Expr::Alias(Alias { expr, .. }) => match expr.as_ref() {
+                Expr::WindowFunction(window_fun) => {
+                    let WindowFunction {
+                        params: WindowFunctionParams { partition_by, .. },
+                        ..
+                    } = window_fun.as_ref();
+                    Ok(partition_by)
+                }
                 expr => exec_err!("Impossibly got non-window expr {expr:?}"),
             },
             expr => exec_err!("Impossibly got non-window expr {expr:?}"),
@@ -258,9 +280,7 @@ pub fn window_expr_common_partition_keys(window_exprs: &[Expr]) -> Result<&[Expr
     let result = all_partition_keys
         .iter()
         .min_by_key(|s| s.len())
-        .ok_or_else(|| {
-            DataFusionError::Execution("No window expressions found".to_owned())
-        })?;
+        .ok_or_else(|| exec_datafusion_err!("No window expressions found"))?;
     Ok(result)
 }
 
@@ -275,7 +295,7 @@ pub(crate) fn make_decimal_type(
         (Some(p), Some(s)) => (p as u8, s as i8),
         (Some(p), None) => (p as u8, 0),
         (None, Some(_)) => {
-            return plan_err!("Cannot specify only scale for decimal data type")
+            return plan_err!("Cannot specify only scale for decimal data type");
         }
         (None, None) => (DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE),
     };
@@ -311,6 +331,8 @@ pub(crate) fn value_to_string(value: &Value) -> Option<String> {
         Value::Number(_, _) | Value::Boolean(_) => Some(value.to_string()),
         Value::UnicodeStringLiteral(s) => Some(s.to_string()),
         Value::EscapedStringLiteral(s) => Some(s.to_string()),
+        Value::QuoteDelimitedStringLiteral(s)
+        | Value::NationalQuoteDelimitedStringLiteral(s) => Some(s.value.to_string()),
         Value::DoubleQuotedString(_)
         | Value::NationalStringLiteral(_)
         | Value::SingleQuotedByteStringLiteral(_)
@@ -354,7 +376,7 @@ pub(crate) fn rewrite_recursive_unnests_bottom_up(
 pub const UNNEST_PLACEHOLDER: &str = "__unnest_placeholder";
 
 /*
-This is only usedful when used with transform down up
+This is only useful when used with transform down up
 A full example of how the transformation works:
  */
 struct RecursiveUnnestRewriter<'a> {
@@ -386,6 +408,24 @@ impl RecursiveUnnestRewriter<'_> {
             .collect()
     }
 
+    /// Check if the current expression is at the root level for struct unnest purposes.
+    /// This is true if:
+    /// 1. The expression IS the root expression, OR
+    /// 2. The root expression is an Alias wrapping this expression
+    ///
+    /// This allows `unnest(struct_col) AS alias` to work, where the alias is simply
+    /// ignored for struct unnest (matching DuckDB behavior).
+    fn is_at_struct_allowed_root(&self, expr: &Expr) -> bool {
+        if expr == self.root_expr {
+            return true;
+        }
+        // Allow struct unnest when root is an alias wrapping the unnest
+        if let Expr::Alias(Alias { expr: inner, .. }) = self.root_expr {
+            return inner.as_ref() == expr;
+        }
+        false
+    }
+
     fn transform(
         &mut self,
         level: usize,
@@ -404,30 +444,31 @@ impl RecursiveUnnestRewriter<'_> {
         // This is due to the fact that unnest transformation should keep the original
         // column name as is, to comply with group by and order by
         let placeholder_column = Column::from_name(placeholder_name.clone());
-
-        let (data_type, _) = expr_in_unnest.data_type_and_nullable(self.input_schema)?;
+        let field = expr_in_unnest.to_field(self.input_schema)?.1;
+        let data_type = field.data_type();
 
         match data_type {
             DataType::Struct(inner_fields) => {
-                if !struct_allowed {
-                    return internal_err!("unnest on struct can only be applied at the root level of select expression");
-                }
+                assert_or_internal_err!(
+                    struct_allowed,
+                    "unnest on struct can only be applied at the root level of select expression"
+                );
                 push_projection_dedupl(
                     self.inner_projection_exprs,
                     expr_in_unnest.clone().alias(placeholder_name.clone()),
                 );
                 self.columns_unnestings
                     .insert(Column::from_name(placeholder_name.clone()), None);
-                Ok(
-                    get_struct_unnested_columns(&placeholder_name, &inner_fields)
-                        .into_iter()
-                        .map(Expr::Column)
-                        .collect(),
-                )
+                Ok(get_struct_unnested_columns(&placeholder_name, inner_fields)
+                    .into_iter()
+                    .map(Expr::Column)
+                    .collect())
             }
             DataType::List(_)
             | DataType::FixedSizeList(_, _)
-            | DataType::LargeList(_) => {
+            | DataType::LargeList(_)
+            | DataType::ListView(_)
+            | DataType::LargeListView(_) => {
                 push_projection_dedupl(
                     self.inner_projection_exprs,
                     expr_in_unnest.clone().alias(placeholder_name.clone()),
@@ -459,13 +500,13 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> {
     type Node = Expr;
 
     /// This downward traversal needs to keep track of:
-    /// - Whether or not some unnest expr has been visited from the top util the current node
+    /// - Whether or not some unnest expr has been visited from the top until the current node
     /// - If some unnest expr has been visited, maintain a stack of such information, this
     ///   is used to detect if some recursive unnest expr exists (e.g **unnest(unnest(unnest(3d column))))**
     fn f_down(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
         if let Expr::Unnest(ref unnest_expr) = expr {
-            let (data_type, _) =
-                unnest_expr.expr.data_type_and_nullable(self.input_schema)?;
+            let field = unnest_expr.expr.to_field(self.input_schema)?.1;
+            let data_type = field.data_type();
             self.consecutive_unnest.push(Some(unnest_expr.clone()));
             // if expr inside unnest is a struct, do not consider
             // the next unnest as consecutive unnest (if any)
@@ -518,7 +559,6 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> {
     ///                          / /
     ///                       column2
     /// ```
-    ///
     fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
         if let Expr::Unnest(ref traversing_unnest) = expr {
             if traversing_unnest == self.top_most_unnest.as_ref().unwrap() {
@@ -542,13 +582,14 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> {
                 let most_inner = unnest_stack.first().unwrap();
                 let inner_expr = most_inner.expr.as_ref();
                 // unnest(unnest(struct_arr_col)) is not allow to be done recursively
-                // it needs to be splitted into multiple unnest logical plan
+                // it needs to be split into multiple unnest logical plan
                 // unnest(struct_arr)
                 //  unnest(struct_arr_col) as struct_arr
                 // instead of unnest(struct_arr_col, depth = 2)
 
                 let unnest_recursion = unnest_stack.len();
-                let struct_allowed = (&expr == self.root_expr) && unnest_recursion == 1;
+                let struct_allowed =
+                    self.is_at_struct_allowed_root(&expr) && unnest_recursion == 1;
 
                 let mut transformed_exprs = self.transform(
                     unnest_recursion,
@@ -556,7 +597,9 @@ impl TreeNodeRewriter for RecursiveUnnestRewriter<'_> {
                     inner_expr,
                     struct_allowed,
                 )?;
-                if struct_allowed {
+                // Only set transformed_root_exprs for struct unnest (which returns multiple expressions).
+                // For list unnest (single expression), we let the normal rewrite handle the alias.
+                if struct_allowed && transformed_exprs.len() > 1 {
                     self.transformed_root_exprs = Some(transformed_exprs.clone());
                 }
                 return Ok(Transformed::new(
@@ -660,7 +703,7 @@ mod tests {
     use arrow::datatypes::{DataType as ArrowDataType, Field, Fields, Schema};
     use datafusion_common::{Column, DFSchema, Result};
     use datafusion_expr::{
-        col, lit, unnest, ColumnUnnestList, EmptyRelation, LogicalPlan,
+        ColumnUnnestList, EmptyRelation, LogicalPlan, col, lit, unnest,
     };
     use datafusion_functions::core::expr_ext::FieldAccessor;
     use datafusion_functions_aggregate::expr_fn::count;
@@ -686,7 +729,7 @@ mod tests {
                 ),
             })
             .collect();
-        let l_formatted: Vec<String> = l.iter().map(|i| i.to_string()).collect();
+        let l_formatted: Vec<String> = l.iter().map(|i| (*i).to_string()).collect();
         assert_eq!(l_formatted, r_formatted);
     }
 
@@ -732,13 +775,15 @@ mod tests {
         // Only the bottom most unnest exprs are transformed
         assert_eq!(
             transformed_exprs,
-            vec![col("__unnest_placeholder(3d_col,depth=2)")
-                .alias("UNNEST(UNNEST(3d_col))")
-                .add(
-                    col("__unnest_placeholder(3d_col,depth=2)")
-                        .alias("UNNEST(UNNEST(3d_col))")
-                )
-                .add(col("i64_col"))]
+            vec![
+                col("__unnest_placeholder(3d_col,depth=2)")
+                    .alias("UNNEST(UNNEST(3d_col))")
+                    .add(
+                        col("__unnest_placeholder(3d_col,depth=2)")
+                            .alias("UNNEST(UNNEST(3d_col))")
+                    )
+                    .add(col("i64_col"))
+            ]
         );
         column_unnests_eq(
             vec![
@@ -774,7 +819,9 @@ mod tests {
             ]
         );
         column_unnests_eq(
-            vec!["__unnest_placeholder(3d_col)=>[__unnest_placeholder(3d_col,depth=2)|depth=2, __unnest_placeholder(3d_col,depth=1)|depth=1]"],
+            vec![
+                "__unnest_placeholder(3d_col)=>[__unnest_placeholder(3d_col,depth=2)|depth=2, __unnest_placeholder(3d_col,depth=1)|depth=1]",
+            ],
             &unnest_placeholder_columns,
         );
         // Still reference struct_col in original schema but with alias,
@@ -866,9 +913,11 @@ mod tests {
         // Only transform the unnest children
         assert_eq!(
             transformed_exprs,
-            vec![col("__unnest_placeholder(array_col,depth=1)")
-                .alias("UNNEST(array_col)")
-                .add(lit(1i64))]
+            vec![
+                col("__unnest_placeholder(array_col,depth=1)")
+                    .alias("UNNEST(array_col)")
+                    .add(lit(1i64))
+            ]
         );
 
         // Keep appending to the current vector
diff --git a/datafusion/sql/src/values.rs b/datafusion/sql/src/values.rs
index dd8957c95470d..c8cdf1254f33f 100644
--- a/datafusion/sql/src/values.rs
+++ b/datafusion/sql/src/values.rs
@@ -18,7 +18,7 @@
 use std::sync::Arc;
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
-use datafusion_common::{DFSchema, Result};
+use datafusion_common::{DFSchema, Result, not_impl_err};
 use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};
 use sqlparser::ast::Values as SQLValues;
 
@@ -31,7 +31,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         let SQLValues {
             explicit_row: _,
             rows,
+            value_keyword,
         } = values;
+        if value_keyword {
+            return not_impl_err!(
+                "`VALUE` keyword not supported. Did you mean `VALUES`?"
+            )?;
+        }
 
         let empty_schema = Arc::new(DFSchema::empty());
         let values = rows
diff --git a/datafusion/sql/tests/cases/collection.rs b/datafusion/sql/tests/cases/collection.rs
index 59704d6445b35..06a876dcfc9eb 100644
--- a/datafusion/sql/tests/cases/collection.rs
+++ b/datafusion/sql/tests/cases/collection.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion_common::{assert_contains, DataFusionError};
+use datafusion_common::{DataFusionError, assert_contains};
 use datafusion_sql::planner::SqlToRel;
 use sqlparser::{dialect::GenericDialect, parser::Parser};
 
@@ -42,9 +42,11 @@ fn test_collect_select_items() {
     let error = do_query(query);
     let errors = error.iter().collect::<Vec<_>>();
     assert_eq!(errors.len(), 2);
-    assert!(errors[0]
-        .to_string()
-        .contains("No field named first_namex."));
+    assert!(
+        errors[0]
+            .to_string()
+            .contains("No field named first_namex.")
+    );
     assert_contains!(errors[1].to_string(), "No field named last_namex.");
 }
 
diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs
index b3fc5dea9efff..7a729739469d3 100644
--- a/datafusion/sql/tests/cases/diagnostic.rs
+++ b/datafusion/sql/tests/cases/diagnostic.rs
@@ -69,10 +69,12 @@ fn do_query(sql: &'static str) -> Diagnostic {
 /// ## Example
 ///
 /// ```rust
-/// let spans = get_spans("SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars");
-/// // whole is                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
-/// // left is                                  ^^^^^
-/// // right is                                                          ^^
+/// let spans = get_spans(
+///     "SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars",
+///     // whole is           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+///     // left is            ^^^^^
+///     // right is                                    ^^
+/// );
 /// dbg!(&spans["whole"]);
 /// dbg!(&spans["left"]);
 /// dbg!(&spans["right"]);
@@ -184,7 +186,7 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> {
     let diag = do_query(query);
     assert_snapshot!(diag.message, @"'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression");
     assert_eq!(diag.span, Some(spans["a"]));
-    assert_snapshot!(diag.helps[0].message, @"Either add 'person.first_name' to GROUP BY clause, or use an aggregare function like ANY_VALUE(person.first_name)");
+    assert_snapshot!(diag.helps[0].message, @"Either add 'person.first_name' to GROUP BY clause, or use an aggregate function like ANY_VALUE(person.first_name)");
     Ok(())
 }
 
@@ -202,8 +204,7 @@ fn test_ambiguous_reference() -> Result<()> {
 
 #[test]
 fn test_incompatible_types_binary_arithmetic() -> Result<()> {
-    let query =
-        "SELECT /*whole+left*/id/*left*/ + /*right*/first_name/*right+whole*/ FROM person";
+    let query = "SELECT /*whole+left*/id/*left*/ + /*right*/first_name/*right+whole*/ FROM person";
     let spans = get_spans(query);
     let diag = do_query(query);
     assert_snapshot!(diag.message, @"expressions have incompatible types");
diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs
index b3cc49c310718..396f619400c74 100644
--- a/datafusion/sql/tests/cases/params.rs
+++ b/datafusion/sql/tests/cases/params.rs
@@ -16,10 +16,14 @@
 // under the License.
 
 use crate::logical_plan;
-use arrow::datatypes::DataType;
-use datafusion_common::{assert_contains, ParamValues, ScalarValue};
+use arrow::datatypes::{DataType, Field, FieldRef};
+use datafusion_common::{
+    ParamValues, ScalarValue, assert_contains,
+    metadata::{ScalarAndMetadata, format_type_and_metadata},
+};
 use datafusion_expr::{LogicalPlan, Prepare, Statement};
 use insta::assert_snapshot;
+use itertools::Itertools as _;
 use std::collections::HashMap;
 
 pub struct ParameterTest<'a> {
@@ -36,7 +40,7 @@ impl ParameterTest<'_> {
         let expected_types: HashMap<String, Option<DataType>> = self
             .expected_types
             .iter()
-            .map(|(k, v)| (k.to_string(), v.clone()))
+            .map(|(k, v)| ((*k).to_string(), v.clone()))
             .collect();
 
         assert_eq!(actual_types, expected_types);
@@ -50,12 +54,42 @@ impl ParameterTest<'_> {
     }
 }
 
+pub struct ParameterTestWithMetadata<'a> {
+    pub sql: &'a str,
+    pub expected_types: Vec<(&'a str, Option<FieldRef>)>,
+    pub param_values: Vec<ScalarAndMetadata>,
+}
+
+impl ParameterTestWithMetadata<'_> {
+    pub fn run(&self) -> String {
+        let plan = logical_plan(self.sql).unwrap();
+
+        let actual_types = plan.get_parameter_fields().unwrap();
+        let expected_types: HashMap<String, Option<FieldRef>> = self
+            .expected_types
+            .iter()
+            .map(|(k, v)| ((*k).to_string(), v.clone()))
+            .collect();
+
+        assert_eq!(actual_types, expected_types);
+
+        let plan_with_params = plan
+            .clone()
+            .with_param_values(ParamValues::List(self.param_values.clone()))
+            .unwrap();
+
+        format!("** Initial Plan:\n{plan}\n** Final Plan:\n{plan_with_params}")
+    }
+}
+
 fn generate_prepare_stmt_and_data_types(sql: &str) -> (LogicalPlan, String) {
     let plan = logical_plan(sql).unwrap();
     let data_types = match &plan {
-        LogicalPlan::Statement(Statement::Prepare(Prepare { data_types, .. })) => {
-            format!("{data_types:?}")
-        }
+        LogicalPlan::Statement(Statement::Prepare(Prepare { fields, .. })) => fields
+            .iter()
+            .map(|f| format_type_and_metadata(f.data_type(), Some(f.metadata())))
+            .join(", ")
+            .to_string(),
         _ => panic!("Expected a Prepare statement"),
     };
     (plan, data_types)
@@ -69,9 +103,7 @@ fn test_prepare_statement_to_plan_panic_param_format() {
 
     assert_snapshot!(
         logical_plan(sql).unwrap_err().strip_backtrace(),
-        @r###"
-        Error during planning: Invalid placeholder, not a number: $foo
-        "###
+        @"Error during planning: Unknown placeholder: $foo"
     );
 }
 
@@ -83,9 +115,7 @@ fn test_prepare_statement_to_plan_panic_param_zero() {
 
     assert_snapshot!(
         logical_plan(sql).unwrap_err().strip_backtrace(),
-        @r###"
-        Error during planning: Invalid placeholder, zero is not a valid index: $0
-        "###
+        @"Error during planning: Invalid placeholder, zero is not a valid index: $0"
     );
 }
 
@@ -94,10 +124,12 @@ fn test_prepare_statement_to_plan_panic_prepare_wrong_syntax() {
     // param is not number following the $ sign
     // panic due to error returned from the parser
     let sql = "PREPARE AS SELECT id, age  FROM person WHERE age = $foo";
-    assert!(logical_plan(sql)
-        .unwrap_err()
-        .strip_backtrace()
-        .contains("Expected: AS, found: SELECT"))
+    assert!(
+        logical_plan(sql)
+            .unwrap_err()
+            .strip_backtrace()
+            .contains("Expected: AS, found: SELECT")
+    )
 }
 
 #[test]
@@ -107,7 +139,7 @@ fn test_prepare_statement_to_plan_panic_no_relation_and_constant_param() {
     let plan = logical_plan(sql).unwrap_err().strip_backtrace();
     assert_snapshot!(
         plan,
-        @r"Schema error: No field named id."
+        @"Schema error: No field named id."
     );
 }
 
@@ -160,7 +192,7 @@ fn test_prepare_statement_to_plan_no_param() {
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32]"#);
+    assert_snapshot!(dt, @"Int32");
 
     ///////////////////
     // replace params with values
@@ -188,7 +220,7 @@ fn test_prepare_statement_to_plan_no_param() {
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[]"#);
+    assert_snapshot!(dt, @"");
 
     ///////////////////
     // replace params with values
@@ -216,9 +248,7 @@ fn test_prepare_statement_to_plan_one_param_no_value_panic() {
         plan.with_param_values(param_values)
         .unwrap_err()
         .strip_backtrace(),
-        @r###"
-        Error during planning: Expected 1 parameters, got 0
-        "###);
+        @"Error during planning: Expected 1 parameters, got 0");
 }
 
 #[test]
@@ -233,9 +263,7 @@ fn test_prepare_statement_to_plan_one_param_one_value_different_type_panic() {
         plan.with_param_values(param_values)
             .unwrap_err()
             .strip_backtrace(),
-        @r###"
-        Error during planning: Expected parameter of type Int32, got Float64 at index 0
-        "###
+        @"Error during planning: Expected parameter of type Int32, got Float64 at index 0"
     );
 }
 
@@ -251,9 +279,7 @@ fn test_prepare_statement_to_plan_no_param_on_value_panic() {
         plan.with_param_values(param_values)
             .unwrap_err()
             .strip_backtrace(),
-        @r###"
-        Error during planning: Expected 0 parameters, got 1
-        "###
+        @"Error during planning: Expected 0 parameters, got 1"
     );
 }
 
@@ -266,10 +292,10 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         @r#"
     Prepare: "my_plan" [Int32]
       Projection: $1
-        EmptyRelation
+        EmptyRelation: rows=1
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32]"#);
+    assert_snapshot!(dt, @"Int32");
 
     ///////////////////
     // replace params with values
@@ -279,7 +305,7 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         plan_with_params,
         @r"
     Projection: Int32(10) AS $1
-      EmptyRelation
+      EmptyRelation: rows=1
     "
     );
 
@@ -291,10 +317,10 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         @r#"
     Prepare: "my_plan" [Int32]
       Projection: Int64(1) + $1
-        EmptyRelation
+        EmptyRelation: rows=1
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32]"#);
+    assert_snapshot!(dt, @"Int32");
 
     ///////////////////
     // replace params with values
@@ -304,7 +330,7 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         plan_with_params,
         @r"
     Projection: Int64(1) + Int32(10) AS Int64(1) + $1
-      EmptyRelation
+      EmptyRelation: rows=1
     "
     );
 
@@ -316,10 +342,10 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         @r#"
     Prepare: "my_plan" [Int32, Float64]
       Projection: Int64(1) + $1 + $2
-        EmptyRelation
+        EmptyRelation: rows=1
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32, Float64]"#);
+    assert_snapshot!(dt, @"Int32, Float64");
 
     ///////////////////
     // replace params with values
@@ -332,7 +358,7 @@ fn test_prepare_statement_to_plan_params_as_constants() {
         plan_with_params,
         @r"
     Projection: Int64(1) + Int32(10) + Float64(10) AS Int64(1) + $1 + $2
-      EmptyRelation
+      EmptyRelation: rows=1
     "
     );
 }
@@ -340,8 +366,7 @@ fn test_prepare_statement_to_plan_params_as_constants() {
 #[test]
 fn test_infer_types_from_join() {
     let test = ParameterTest {
-        sql:
-            "SELECT id, order_id FROM person JOIN orders ON id = customer_id and age = $1",
+        sql: "SELECT id, order_id FROM person JOIN orders ON id = customer_id and age = $1",
         expected_types: vec![("$1", Some(DataType::Int32))],
         param_values: vec![ScalarValue::Int32(Some(10))],
     };
@@ -368,7 +393,7 @@ fn test_prepare_statement_infer_types_from_join() {
     let test = ParameterTest {
         sql: "PREPARE my_plan AS SELECT id, order_id FROM person JOIN orders ON id = customer_id and age = $1",
         expected_types: vec![("$1", Some(DataType::Int32))],
-        param_values: vec![ScalarValue::Int32(Some(10))]
+        param_values: vec![ScalarValue::Int32(Some(10))],
     };
 
     assert_snapshot!(
@@ -492,7 +517,7 @@ fn test_infer_types_subquery() {
     let test = ParameterTest {
         sql: "SELECT id, age FROM person WHERE age = (select max(age) from person where id = $1)",
         expected_types: vec![("$1", Some(DataType::UInt32))],
-        param_values: vec![ScalarValue::UInt32(Some(10))]
+        param_values: vec![ScalarValue::UInt32(Some(10))],
     };
 
     assert_snapshot!(
@@ -525,7 +550,7 @@ fn test_prepare_statement_infer_types_subquery() {
     let test = ParameterTest {
         sql: "PREPARE my_plan AS SELECT id, age FROM person WHERE age = (select max(age) from person where id = $1)",
         expected_types: vec![("$1", Some(DataType::UInt32))],
-        param_values: vec![ScalarValue::UInt32(Some(10))]
+        param_values: vec![ScalarValue::UInt32(Some(10))],
     };
 
     assert_snapshot!(
@@ -632,11 +657,11 @@ fn test_insert_infer() {
         @r#"
     ** Initial Plan:
     Dml: op=[Insert Into] table=[person]
-      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀
         Values: ($1, $2, $3)
     ** Final Plan:
     Dml: op=[Insert Into] table=[person]
-      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀
         Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3)
     "#
     );
@@ -655,7 +680,7 @@ fn test_prepare_statement_insert_infer() {
             ScalarValue::UInt32(Some(1)),
             ScalarValue::from("Alan"),
             ScalarValue::from("Turing"),
-        ]
+        ],
     };
     assert_snapshot!(
         test.run(),
@@ -663,11 +688,11 @@ fn test_prepare_statement_insert_infer() {
     ** Initial Plan:
     Prepare: "my_plan" [UInt32, Utf8, Utf8]
       Dml: op=[Insert Into] table=[person]
-        Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀
+        Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀
           Values: ($1, $2, $3)
     ** Final Plan:
     Dml: op=[Insert Into] table=[person]
-      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀
         Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3)
     "#
     );
@@ -686,7 +711,7 @@ fn test_prepare_statement_to_plan_one_param() {
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32]"#);
+    assert_snapshot!(dt, @"Int32");
 
     ///////////////////
     // replace params with values
@@ -703,6 +728,147 @@ fn test_prepare_statement_to_plan_one_param() {
     );
 }
 
+#[test]
+fn test_update_infer_with_metadata() {
+    // Here the uuid field is inferred as nullable because it appears in the filter
+    // (and not in the update values, where its nullability would be inferred)
+    let uuid_field = Field::new("", DataType::FixedSizeBinary(16), true).with_metadata(
+        [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())].into(),
+    );
+    let uuid_bytes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+    let expected_types = vec![
+        (
+            "$1",
+            Some(Field::new("last_name", DataType::Utf8, false).into()),
+        ),
+        ("$2", Some(uuid_field.clone().with_name("id").into())),
+    ];
+    let param_values = vec![
+        ScalarAndMetadata::from(ScalarValue::from("Turing")),
+        ScalarAndMetadata::new(
+            ScalarValue::FixedSizeBinary(16, Some(uuid_bytes)),
+            Some(uuid_field.metadata().into()),
+        ),
+    ];
+
+    // Check a normal update
+    let test = ParameterTestWithMetadata {
+        sql: "update person_with_uuid_extension set last_name=$1 where id=$2",
+        expected_types: expected_types.clone(),
+        param_values: param_values.clone(),
+    };
+
+    assert_snapshot!(
+        test.run(),
+        @r#"
+    ** Initial Plan:
+    Dml: op=[Update] table=[person_with_uuid_extension]
+      Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, $1 AS last_name
+        Filter: person_with_uuid_extension.id = $2
+          TableScan: person_with_uuid_extension
+    ** Final Plan:
+    Dml: op=[Update] table=[person_with_uuid_extension]
+      Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, Utf8("Turing") AS last_name
+        Filter: person_with_uuid_extension.id = FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} }
+          TableScan: person_with_uuid_extension
+    "#
+    );
+
+    // Check a prepared update
+    let test = ParameterTestWithMetadata {
+        sql: "PREPARE my_plan AS update person_with_uuid_extension set last_name=$1 where id=$2",
+        expected_types,
+        param_values,
+    };
+
+    assert_snapshot!(
+        test.run(),
+        @r#"
+    ** Initial Plan:
+    Prepare: "my_plan" [Utf8, FixedSizeBinary(16)<{"ARROW:extension:name": "arrow.uuid"}>]
+      Dml: op=[Update] table=[person_with_uuid_extension]
+        Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, $1 AS last_name
+          Filter: person_with_uuid_extension.id = $2
+            TableScan: person_with_uuid_extension
+    ** Final Plan:
+    Dml: op=[Update] table=[person_with_uuid_extension]
+      Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, Utf8("Turing") AS last_name
+        Filter: person_with_uuid_extension.id = FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} }
+          TableScan: person_with_uuid_extension
+    "#
+    );
+}
+
+#[test]
+fn test_insert_infer_with_metadata() {
+    let uuid_field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata(
+        [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())].into(),
+    );
+    let uuid_bytes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+    let expected_types = vec![
+        ("$1", Some(uuid_field.clone().with_name("id").into())),
+        (
+            "$2",
+            Some(Field::new("first_name", DataType::Utf8, false).into()),
+        ),
+        (
+            "$3",
+            Some(Field::new("last_name", DataType::Utf8, false).into()),
+        ),
+    ];
+    let param_values = vec![
+        ScalarAndMetadata::new(
+            ScalarValue::FixedSizeBinary(16, Some(uuid_bytes)),
+            Some(uuid_field.metadata().into()),
+        ),
+        ScalarAndMetadata::from(ScalarValue::from("Alan")),
+        ScalarAndMetadata::from(ScalarValue::from("Turing")),
+    ];
+
+    // Check a normal insert
+    let test = ParameterTestWithMetadata {
+        sql: "insert into person_with_uuid_extension (id, first_name, last_name) values ($1, $2, $3)",
+        expected_types: expected_types.clone(),
+        param_values: param_values.clone(),
+    };
+
+    assert_snapshot!(
+        test.run(),
+        @r#"
+    ** Initial Plan:
+    Dml: op=[Insert Into] table=[person_with_uuid_extension]
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name
+        Values: ($1, $2, $3)
+    ** Final Plan:
+    Dml: op=[Insert Into] table=[person_with_uuid_extension]
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name
+        Values: (FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3)
+    "#
+    );
+
+    // Check a prepared insert
+    let test = ParameterTestWithMetadata {
+        sql: "PREPARE my_plan AS insert into person_with_uuid_extension (id, first_name, last_name) values ($1, $2, $3)",
+        expected_types,
+        param_values,
+    };
+
+    assert_snapshot!(
+        test.run(),
+        @r#"
+    ** Initial Plan:
+    Prepare: "my_plan" [FixedSizeBinary(16)<{"ARROW:extension:name": "arrow.uuid"}>, Utf8, Utf8]
+      Dml: op=[Insert Into] table=[person_with_uuid_extension]
+        Projection: column1 AS id, column2 AS first_name, column3 AS last_name
+          Values: ($1, $2, $3)
+    ** Final Plan:
+    Dml: op=[Insert Into] table=[person_with_uuid_extension]
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name
+        Values: (FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3)
+    "#
+    );
+}
+
 #[test]
 fn test_prepare_statement_to_plan_data_type() {
     let sql = "PREPARE my_plan(DOUBLE) AS SELECT id, age  FROM person WHERE age = $1";
@@ -719,7 +885,7 @@ fn test_prepare_statement_to_plan_data_type() {
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Float64]"#);
+    assert_snapshot!(dt, @"Float64");
 
     ///////////////////
     // replace params with values still succeed and use Float64
@@ -746,31 +912,31 @@ fn test_prepare_statement_to_plan_multi_params() {
     assert_snapshot!(
         plan,
         @r#"
-    Prepare: "my_plan" [Int32, Utf8, Float64, Int32, Float64, Utf8]
+    Prepare: "my_plan" [Int32, Utf8View, Float64, Int32, Float64, Utf8View]
       Projection: person.id, person.age, $6
         Filter: person.age IN ([$1, $4]) AND person.salary > $3 AND person.salary < $5 OR person.first_name < $2
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32, Utf8, Float64, Int32, Float64, Utf8]"#);
+    assert_snapshot!(dt, @"Int32, Utf8View, Float64, Int32, Float64, Utf8View");
 
     ///////////////////
     // replace params with values
     let param_values = vec![
         ScalarValue::Int32(Some(10)),
-        ScalarValue::from("abc"),
+        ScalarValue::Utf8View(Some("abc".into())),
         ScalarValue::Float64(Some(100.0)),
         ScalarValue::Int32(Some(20)),
         ScalarValue::Float64(Some(200.0)),
-        ScalarValue::from("xyz"),
+        ScalarValue::Utf8View(Some("xyz".into())),
     ];
 
     let plan_with_params = plan.with_param_values(param_values).unwrap();
     assert_snapshot!(
         plan_with_params,
         @r#"
-    Projection: person.id, person.age, Utf8("xyz") AS $6
-      Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8("abc")
+    Projection: person.id, person.age, Utf8View("xyz") AS $6
+      Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8View("abc")
         TableScan: person
     "#
     );
@@ -797,7 +963,7 @@ fn test_prepare_statement_to_plan_having() {
               TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Int32, Float64, Float64, Float64]"#);
+    assert_snapshot!(dt, @"Int32, Float64, Float64, Float64");
 
     ///////////////////
     // replace params with values
@@ -811,13 +977,13 @@ fn test_prepare_statement_to_plan_having() {
     let plan_with_params = plan.with_param_values(param_values).unwrap();
     assert_snapshot!(
         plan_with_params,
-        @r#"
+        @r"
     Projection: person.id, sum(person.age)
       Filter: sum(person.age) < Int32(10) AND sum(person.age) > Int64(10) OR sum(person.age) IN ([Float64(200), Float64(300)])
         Aggregate: groupBy=[[person.id]], aggr=[[sum(person.age)]]
           Filter: person.salary > Float64(100)
             TableScan: person
-    "#
+    "
     );
 }
 
@@ -836,18 +1002,18 @@ fn test_prepare_statement_to_plan_limit() {
           TableScan: person
     "#
     );
-    assert_snapshot!(dt, @r#"[Int64, Int64]"#);
+    assert_snapshot!(dt, @"Int64, Int64");
 
     // replace params with values
     let param_values = vec![ScalarValue::Int64(Some(10)), ScalarValue::Int64(Some(200))];
     let plan_with_params = plan.with_param_values(param_values).unwrap();
     assert_snapshot!(
         plan_with_params,
-        @r#"
+        @r"
     Limit: skip=10, fetch=200
       Projection: person.id
         TableScan: person
-    "#
+    "
     );
 }
 
@@ -882,5 +1048,8 @@ fn test_prepare_statement_bad_list_idx() {
     let param_values = ParamValues::List(vec![]);
 
     let err = plan.replace_params_with_values(&param_values).unwrap_err();
-    assert_contains!(err.to_string(), "Error during planning: Failed to parse placeholder id: invalid digit found in string");
+    assert_contains!(
+        err.to_string(),
+        "Error during planning: Failed to parse placeholder id: invalid digit found in string"
+    );
 }
diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs
index b4697c2fe473b..be110ab07e6a1 100644
--- a/datafusion/sql/tests/cases/plan_to_sql.rs
+++ b/datafusion/sql/tests/cases/plan_to_sql.rs
@@ -16,16 +16,19 @@
 // under the License.
 
 use arrow::datatypes::{DataType, Field, Schema};
+
 use datafusion_common::{
-    assert_contains, Column, DFSchema, DFSchemaRef, DataFusionError, Result,
-    TableReference,
+    Column, DFSchema, DFSchemaRef, DataFusionError, Result, TableReference,
+    assert_contains,
 };
+use datafusion_expr::expr::{WindowFunction, WindowFunctionParams};
 use datafusion_expr::test::function_stub::{
     count_udaf, max_udaf, min_udaf, sum, sum_udaf,
 };
 use datafusion_expr::{
-    cast, col, lit, table_scan, wildcard, EmptyRelation, Expr, Extension, LogicalPlan,
-    LogicalPlanBuilder, Union, UserDefinedLogicalNode, UserDefinedLogicalNodeCore,
+    EmptyRelation, Expr, Extension, LogicalPlan, LogicalPlanBuilder, Union,
+    UserDefinedLogicalNode, UserDefinedLogicalNodeCore, WindowFrame,
+    WindowFunctionDefinition, cast, col, lit, table_scan, wildcard,
 };
 use datafusion_functions::unicode;
 use datafusion_functions_aggregate::grouping::grouping_udaf;
@@ -34,11 +37,11 @@ use datafusion_functions_nested::map::map_udf;
 use datafusion_functions_window::rank::rank_udwf;
 use datafusion_sql::planner::{ContextProvider, PlannerContext, SqlToRel};
 use datafusion_sql::unparser::dialect::{
-    CustomDialectBuilder, DefaultDialect as UnparserDefaultDialect, DefaultDialect,
-    Dialect as UnparserDialect, MySqlDialect as UnparserMySqlDialect,
+    BigQueryDialect, CustomDialectBuilder, DefaultDialect as UnparserDefaultDialect,
+    DefaultDialect, Dialect as UnparserDialect, MySqlDialect as UnparserMySqlDialect,
     PostgreSqlDialect as UnparserPostgreSqlDialect, SqliteDialect,
 };
-use datafusion_sql::unparser::{expr_to_sql, plan_to_sql, Unparser};
+use datafusion_sql::unparser::{Unparser, expr_to_sql, plan_to_sql};
 use insta::assert_snapshot;
 use sqlparser::ast::Statement;
 use std::hash::Hash;
@@ -51,6 +54,7 @@ use datafusion_expr::builder::{
     project, subquery_alias, table_scan_with_filter_and_fetch, table_scan_with_filters,
 };
 use datafusion_functions::core::planner::CoreFunctionPlanner;
+use datafusion_functions::unicode::planner::UnicodeFunctionPlanner;
 use datafusion_functions_nested::extract::array_element_udf;
 use datafusion_functions_nested::planner::{FieldAccessPlanner, NestedFunctionPlanner};
 use datafusion_sql::unparser::ast::{
@@ -66,26 +70,26 @@ use sqlparser::parser::Parser;
 #[test]
 fn test_roundtrip_expr_1() {
     let expr = roundtrip_expr(TableReference::bare("person"), "age > 35").unwrap();
-    assert_snapshot!(expr, @r#"(age > 35)"#);
+    assert_snapshot!(expr, @"(age > 35)");
 }
 
 #[test]
 fn test_roundtrip_expr_2() {
     let expr = roundtrip_expr(TableReference::bare("person"), "id = '10'").unwrap();
-    assert_snapshot!(expr, @r#"(id = '10')"#);
+    assert_snapshot!(expr, @"(id = '10')");
 }
 
 #[test]
 fn test_roundtrip_expr_3() {
     let expr =
         roundtrip_expr(TableReference::bare("person"), "CAST(id AS VARCHAR)").unwrap();
-    assert_snapshot!(expr, @r#"CAST(id AS VARCHAR)"#);
+    assert_snapshot!(expr, @"CAST(id AS VARCHAR)");
 }
 
 #[test]
 fn test_roundtrip_expr_4() {
     let expr = roundtrip_expr(TableReference::bare("person"), "sum((age * 2))").unwrap();
-    assert_snapshot!(expr, @r#"sum((age * 2))"#);
+    assert_snapshot!(expr, @"sum((age * 2))");
 }
 
 fn roundtrip_expr(table: TableReference, sql: &str) -> Result<String> {
@@ -94,7 +98,7 @@ fn roundtrip_expr(table: TableReference, sql: &str) -> Result<String> {
     let state = MockSessionState::default().with_aggregate_function(sum_udaf());
     let context = MockContextProvider { state };
     let schema = context.get_table_source(table)?.schema();
-    let df_schema = DFSchema::try_from(schema.as_ref().clone())?;
+    let df_schema = DFSchema::try_from(schema)?;
     let sql_to_rel = SqlToRel::new(&context);
     let expr =
         sql_to_rel.sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new())?;
@@ -282,7 +286,7 @@ fn roundtrip_crossjoin() -> Result<()> {
         plan_roundtrip,
         @r"
     Projection: j1.j1_id, j2.j2_string
-      Cross Join: 
+      Cross Join:
         TableScan: j1
         TableScan: j2
     "
@@ -307,7 +311,8 @@ macro_rules! roundtrip_statement_with_dialect_helper {
             .with_aggregate_function(max_udaf())
             .with_aggregate_function(min_udaf())
             .with_expr_planner(Arc::new(CoreFunctionPlanner::default()))
-            .with_expr_planner(Arc::new(NestedFunctionPlanner));
+            .with_expr_planner(Arc::new(NestedFunctionPlanner))
+            .with_expr_planner(Arc::new(FieldAccessPlanner));
 
         let context = MockContextProvider { state };
         let sql_to_rel = SqlToRel::new(&context);
@@ -329,9 +334,7 @@ fn roundtrip_statement_with_dialect_1() -> Result<(), DataFusionError> {
         sql: "select min(ta.j1_id) as j1_min from j1 ta order by min(ta.j1_id) limit 10;",
         parser_dialect: MySqlDialect {},
         unparser_dialect: UnparserMySqlDialect {},
-        // top projection sort gets derived into a subquery
-        // for MySQL, this subquery needs an alias
-        expected: @"SELECT `j1_min` FROM (SELECT min(`ta`.`j1_id`) AS `j1_min`, min(`ta`.`j1_id`) FROM `j1` AS `ta` ORDER BY min(`ta`.`j1_id`) ASC) AS `derived_sort` LIMIT 10",
+        expected: @"SELECT min(`ta`.`j1_id`) AS `j1_min` FROM `j1` AS `ta` ORDER BY `j1_min` ASC LIMIT 10",
     );
     Ok(())
 }
@@ -342,9 +345,7 @@ fn roundtrip_statement_with_dialect_2() -> Result<(), DataFusionError> {
         sql: "select min(ta.j1_id) as j1_min from j1 ta order by min(ta.j1_id) limit 10;",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        // top projection sort still gets derived into a subquery in default dialect
-        // except for the default dialect, the subquery is left non-aliased
-        expected: @"SELECT j1_min FROM (SELECT min(ta.j1_id) AS j1_min, min(ta.j1_id) FROM j1 AS ta ORDER BY min(ta.j1_id) ASC NULLS LAST) LIMIT 10",
+        expected: @"SELECT min(ta.j1_id) AS j1_min FROM j1 AS ta ORDER BY j1_min ASC NULLS LAST LIMIT 10",
     );
     Ok(())
 }
@@ -355,7 +356,7 @@ fn roundtrip_statement_with_dialect_3() -> Result<(), DataFusionError> {
         sql: "select min(ta.j1_id) as j1_min, max(tb.j1_max) from j1 ta, (select distinct max(ta.j1_id) as j1_max from j1 ta order by max(ta.j1_id)) tb order by min(ta.j1_id) limit 10;",
         parser_dialect: MySqlDialect {},
         unparser_dialect: UnparserMySqlDialect {},
-        expected: @"SELECT `j1_min`, `max(tb.j1_max)` FROM (SELECT min(`ta`.`j1_id`) AS `j1_min`, max(`tb`.`j1_max`), min(`ta`.`j1_id`) FROM `j1` AS `ta` CROSS JOIN (SELECT `j1_max` FROM (SELECT DISTINCT max(`ta`.`j1_id`) AS `j1_max` FROM `j1` AS `ta`) AS `derived_distinct`) AS `tb` ORDER BY min(`ta`.`j1_id`) ASC) AS `derived_sort` LIMIT 10",
+        expected: @"SELECT min(`ta`.`j1_id`) AS `j1_min`, max(`tb`.`j1_max`) FROM `j1` AS `ta` CROSS JOIN (SELECT DISTINCT max(`ta`.`j1_id`) AS `j1_max` FROM `j1` AS `ta`) AS `tb` ORDER BY `j1_min` ASC LIMIT 10",
     );
     Ok(())
 }
@@ -399,7 +400,7 @@ fn roundtrip_statement_with_dialect_7() -> Result<(), DataFusionError> {
         sql: "select ta.j1_id from j1 ta order by j1_id limit 10;",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT ta.j1_id FROM j1 AS ta ORDER BY ta.j1_id ASC NULLS LAST LIMIT 10"#,
+        expected: @"SELECT ta.j1_id FROM j1 AS ta ORDER BY ta.j1_id ASC NULLS LAST LIMIT 10",
     );
     Ok(())
 }
@@ -414,7 +415,7 @@ fn roundtrip_statement_with_dialect_8() -> Result<(), DataFusionError> {
                   LIMIT 10;",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT j1.j1_id FROM j1 UNION ALL SELECT tb.j2_id AS j1_id FROM j2 AS tb ORDER BY j1_id ASC NULLS LAST LIMIT 10"#,
+        expected: @"SELECT j1.j1_id FROM j1 UNION ALL SELECT tb.j2_id AS j1_id FROM j2 AS tb ORDER BY j1_id ASC NULLS LAST LIMIT 10",
     );
     Ok(())
 }
@@ -426,7 +427,7 @@ fn roundtrip_statement_with_dialect_9() -> Result<(), DataFusionError> {
         sql: "SELECT j1_string from j1 order by j1_id",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT j1.j1_string FROM j1 ORDER BY j1.j1_id ASC NULLS LAST"#,
+        expected: @"SELECT j1.j1_string FROM j1 ORDER BY j1.j1_id ASC NULLS LAST",
     );
     Ok(())
 }
@@ -437,7 +438,7 @@ fn roundtrip_statement_with_dialect_10() -> Result<(), DataFusionError> {
         sql: "SELECT j1_string AS a from j1 order by j1_id",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT j1.j1_string AS a FROM j1 ORDER BY j1.j1_id ASC NULLS LAST"#,
+        expected: @"SELECT j1.j1_string AS a FROM j1 ORDER BY j1.j1_id ASC NULLS LAST",
     );
     Ok(())
 }
@@ -448,7 +449,7 @@ fn roundtrip_statement_with_dialect_11() -> Result<(), DataFusionError> {
         sql: "SELECT j1_string from j1 join j2 on j1.j1_id = j2.j2_id order by j1_id",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT j1.j1_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id ASC NULLS LAST"#,
+        expected: @"SELECT j1.j1_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id ASC NULLS LAST",
     );
     Ok(())
 }
@@ -478,7 +479,7 @@ fn roundtrip_statement_with_dialect_12() -> Result<(), DataFusionError> {
                   abc.j2_string",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT DISTINCT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#,
+        expected: @"SELECT abc.j1_string, abc.j2_string FROM (SELECT DISTINCT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST",
     );
     Ok(())
 }
@@ -500,7 +501,7 @@ fn roundtrip_statement_with_dialect_13() -> Result<(), DataFusionError> {
             ",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT agg.string_count FROM (SELECT j1.j1_id, min(j2.j2_string) FROM j1 LEFT OUTER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id) AS agg (id, string_count)"#,
+        expected: @"SELECT agg.string_count FROM (SELECT j1.j1_id, min(j2.j2_string) FROM j1 LEFT OUTER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id) AS agg (id, string_count)",
     );
     Ok(())
 }
@@ -534,7 +535,7 @@ fn roundtrip_statement_with_dialect_14() -> Result<(), DataFusionError> {
                   abc.j2_string",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT abc.j1_string, abc.j2_string FROM (SELECT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id, j1.j1_string, j2.j2_string ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#,
+        expected: @"SELECT abc.j1_string, abc.j2_string FROM (SELECT j1.j1_id, j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) GROUP BY j1.j1_id, j1.j1_string, j2.j2_string ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST",
     );
     Ok(())
 }
@@ -564,7 +565,7 @@ fn roundtrip_statement_with_dialect_15() -> Result<(), DataFusionError> {
                   j2_string",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT abc.j1_string FROM (SELECT j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST, j2.j2_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST"#,
+        expected: @"SELECT abc.j1_string FROM (SELECT j1.j1_string, j2.j2_string FROM j1 INNER JOIN j2 ON (j1.j1_id = j2.j2_id) ORDER BY j1.j1_id DESC NULLS FIRST, j2.j2_id DESC NULLS FIRST LIMIT 10) AS abc ORDER BY abc.j2_string ASC NULLS LAST",
     );
     Ok(())
 }
@@ -575,7 +576,7 @@ fn roundtrip_statement_with_dialect_16() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT j1_id from j1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT j1.j1_id FROM j1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT j1.j1_id FROM j1) AS c (id)",
     );
     Ok(())
 }
@@ -586,7 +587,7 @@ fn roundtrip_statement_with_dialect_17() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT j1_id as id from j1) AS c",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT j1.j1_id AS id FROM j1) AS c"#,
+        expected: @"SELECT c.id FROM (SELECT j1.j1_id AS id FROM j1) AS c",
     );
     Ok(())
 }
@@ -598,7 +599,7 @@ fn roundtrip_statement_with_dialect_18() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT j1_id + 1 * 3 from j1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT (j1.j1_id + (1 * 3)) FROM j1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT (j1.j1_id + (1 * 3)) FROM j1) AS c (id)",
     );
     Ok(())
 }
@@ -610,7 +611,7 @@ fn roundtrip_statement_with_dialect_19() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT distinct (j1_id + 1 * 3) FROM j1 LIMIT 1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT DISTINCT (j1.j1_id + (1 * 3)) FROM j1 LIMIT 1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT DISTINCT (j1.j1_id + (1 * 3)) FROM j1 LIMIT 1) AS c (id)",
     );
     Ok(())
 }
@@ -621,7 +622,7 @@ fn roundtrip_statement_with_dialect_20() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT j1_id + 1 FROM j1 ORDER BY j1_id DESC LIMIT 1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT (j1.j1_id + 1) FROM j1 ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT (j1.j1_id + 1) FROM j1 ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 1) AS c (id)",
     );
     Ok(())
 }
@@ -632,7 +633,7 @@ fn roundtrip_statement_with_dialect_21() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT CAST((CAST(j1_id as BIGINT) + 1) as int) * 10 FROM j1 LIMIT 1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT (CAST((CAST(j1.j1_id AS BIGINT) + 1) AS INTEGER) * 10) FROM j1 LIMIT 1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT (CAST((CAST(j1.j1_id AS BIGINT) + 1) AS INTEGER) * 10) FROM j1 LIMIT 1) AS c (id)",
     );
     Ok(())
 }
@@ -643,7 +644,7 @@ fn roundtrip_statement_with_dialect_22() -> Result<(), DataFusionError> {
         sql: "SELECT id FROM (SELECT CAST(j1_id as BIGINT) + 1 FROM j1 ORDER BY j1_id LIMIT 1) AS c (id)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT c.id FROM (SELECT (CAST(j1.j1_id AS BIGINT) + 1) FROM j1 ORDER BY j1.j1_id ASC NULLS LAST LIMIT 1) AS c (id)"#,
+        expected: @"SELECT c.id FROM (SELECT (CAST(j1.j1_id AS BIGINT) + 1) FROM j1 ORDER BY j1.j1_id ASC NULLS LAST LIMIT 1) AS c (id)",
     );
     Ok(())
 }
@@ -654,7 +655,7 @@ fn roundtrip_statement_with_dialect_23() -> Result<(), DataFusionError> {
         sql: "SELECT temp_j.id2 FROM (SELECT j1_id, j1_string FROM j1) AS temp_j(id2, string2)",
         parser_dialect: GenericDialect {},
         unparser_dialect: UnparserDefaultDialect {},
-        expected: @r#"SELECT temp_j.id2 FROM (SELECT j1.j1_id, j1.j1_string FROM j1) AS temp_j (id2, string2)"#,
+        expected: @"SELECT temp_j.id2 FROM (SELECT j1.j1_id, j1.j1_string FROM j1) AS temp_j (id2, string2)",
     );
     Ok(())
 }
@@ -665,7 +666,7 @@ fn roundtrip_statement_with_dialect_24() -> Result<(), DataFusionError> {
         sql: "SELECT temp_j.id2 FROM (SELECT j1_id, j1_string FROM j1) AS temp_j(id2, string2)",
         parser_dialect: GenericDialect {},
         unparser_dialect: SqliteDialect {},
-        expected: @r#"SELECT `temp_j`.`id2` FROM (SELECT `j1`.`j1_id` AS `id2`, `j1`.`j1_string` AS `string2` FROM `j1`) AS `temp_j`"#,
+        expected: @"SELECT `temp_j`.`id2` FROM (SELECT `j1`.`j1_id` AS `id2`, `j1`.`j1_string` AS `string2` FROM `j1`) AS `temp_j`",
     );
     Ok(())
 }
@@ -676,7 +677,7 @@ fn roundtrip_statement_with_dialect_25() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM (SELECT j1_id + 1 FROM j1) AS temp_j(id2)",
         parser_dialect: GenericDialect {},
         unparser_dialect: SqliteDialect {},
-        expected: @r#"SELECT `temp_j`.`id2` FROM (SELECT (`j1`.`j1_id` + 1) AS `id2` FROM `j1`) AS `temp_j`"#,
+        expected: @"SELECT `temp_j`.`id2` FROM (SELECT (`j1`.`j1_id` + 1) AS `id2` FROM `j1`) AS `temp_j`",
     );
     Ok(())
 }
@@ -687,7 +688,7 @@ fn roundtrip_statement_with_dialect_26() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM (SELECT j1_id FROM j1 LIMIT 1) AS temp_j(id2)",
         parser_dialect: GenericDialect {},
         unparser_dialect: SqliteDialect {},
-        expected: @r#"SELECT `temp_j`.`id2` FROM (SELECT `j1`.`j1_id` AS `id2` FROM `j1` LIMIT 1) AS `temp_j`"#,
+        expected: @"SELECT `temp_j`.`id2` FROM (SELECT `j1`.`j1_id` AS `id2` FROM `j1` LIMIT 1) AS `temp_j`",
     );
     Ok(())
 }
@@ -756,7 +757,7 @@ fn roundtrip_statement_with_dialect_32() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM UNNEST([1,2,3])",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))) FROM UNNEST([1, 2, 3])"#,
+        expected: @"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))) FROM UNNEST([1, 2, 3])",
     );
     Ok(())
 }
@@ -781,7 +782,7 @@ fn roundtrip_statement_with_dialect_34() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM UNNEST([1,2,3]) AS t1 (c1)",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT t1.c1 FROM UNNEST([1, 2, 3]) AS t1 (c1)"#,
+        expected: @"SELECT t1.c1 FROM UNNEST([1, 2, 3]) AS t1 (c1)",
     );
     Ok(())
 }
@@ -795,7 +796,7 @@ fn roundtrip_statement_with_dialect_35() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM UNNEST([1,2,3]), j1",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))), j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) CROSS JOIN j1"#,
+        expected: @"SELECT UNNEST(make_array(Int64(1),Int64(2),Int64(3))), j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) CROSS JOIN j1",
     );
     Ok(())
 }
@@ -809,7 +810,7 @@ fn roundtrip_statement_with_dialect_36() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) JOIN j1 ON u.c1 = j1.j1_id",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT u.c1, j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) AS u (c1) INNER JOIN j1 ON (u.c1 = j1.j1_id)"#,
+        expected: @"SELECT u.c1, j1.j1_id, j1.j1_string FROM UNNEST([1, 2, 3]) AS u (c1) INNER JOIN j1 ON (u.c1 = j1.j1_id)",
     );
     Ok(())
 }
@@ -823,7 +824,7 @@ fn roundtrip_statement_with_dialect_37() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM UNNEST([1,2,3]) u(c1) UNION ALL SELECT * FROM UNNEST([4,5,6]) u(c1)",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT u.c1 FROM UNNEST([1, 2, 3]) AS u (c1) UNION ALL SELECT u.c1 FROM UNNEST([4, 5, 6]) AS u (c1)"#,
+        expected: @"SELECT u.c1 FROM UNNEST([1, 2, 3]) AS u (c1) UNION ALL SELECT u.c1 FROM UNNEST([4, 5, 6]) AS u (c1)",
     );
     Ok(())
 }
@@ -837,7 +838,7 @@ fn roundtrip_statement_with_dialect_38() -> Result<(), DataFusionError> {
         sql: "SELECT UNNEST([1,2,3])",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT * FROM UNNEST([1, 2, 3])"#,
+        expected: @"SELECT * FROM UNNEST([1, 2, 3])",
     );
     Ok(())
 }
@@ -851,7 +852,7 @@ fn roundtrip_statement_with_dialect_39() -> Result<(), DataFusionError> {
         sql: "SELECT UNNEST([1,2,3]) as c1",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT UNNEST([1, 2, 3]) AS c1"#,
+        expected: @"SELECT UNNEST([1, 2, 3]) AS c1",
     );
     Ok(())
 }
@@ -865,7 +866,7 @@ fn roundtrip_statement_with_dialect_40() -> Result<(), DataFusionError> {
         sql: "SELECT UNNEST([1,2,3]), 1",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3))), Int64(1)"#,
+        expected: @"SELECT UNNEST([1, 2, 3]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3))), Int64(1)",
     );
     Ok(())
 }
@@ -879,7 +880,7 @@ fn roundtrip_statement_with_dialect_41() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col)",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT u.array_col, u.struct_col, UNNEST(outer_ref(u.array_col)) FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col)"#,
+        expected: @"SELECT u.array_col, u.struct_col, UNNEST(outer_ref(u.array_col)) FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col)",
     );
     Ok(())
 }
@@ -893,7 +894,7 @@ fn roundtrip_statement_with_dialect_42() -> Result<(), DataFusionError> {
         sql: "SELECT * FROM unnest_table u, UNNEST(u.array_col) AS t1 (c1)",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT u.array_col, u.struct_col, t1.c1 FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS t1 (c1)"#,
+        expected: @"SELECT u.array_col, u.struct_col, t1.c1 FROM unnest_table AS u CROSS JOIN UNNEST(u.array_col) AS t1 (c1)",
     );
     Ok(())
 }
@@ -907,7 +908,7 @@ fn roundtrip_statement_with_dialect_43() -> Result<(), DataFusionError> {
         sql: "SELECT unnest([1, 2, 3, 4]) from unnest([1, 2, 3]);",
         parser_dialect: GenericDialect {},
         unparser_dialect: unparser,
-        expected: @r#"SELECT UNNEST([1, 2, 3, 4]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3),Int64(4))) FROM UNNEST([1, 2, 3])"#,
+        expected: @"SELECT UNNEST([1, 2, 3, 4]) AS UNNEST(make_array(Int64(1),Int64(2),Int64(3),Int64(4))) FROM UNNEST([1, 2, 3])",
     );
     Ok(())
 }
@@ -923,6 +924,41 @@ fn roundtrip_statement_with_dialect_45() -> Result<(), DataFusionError> {
     Ok(())
 }
 
+#[test]
+fn roundtrip_statement_with_dialect_special_char_alias() -> Result<(), DataFusionError> {
+    roundtrip_statement_with_dialect_helper!(
+        sql: "select min(a) as \"min(a)\" from (select 1 as a)",
+        parser_dialect: GenericDialect {},
+        unparser_dialect: BigQueryDialect {},
+        expected: @"SELECT min(`a`) AS `min_40a_41` FROM (SELECT 1 AS `a`)",
+    );
+    roundtrip_statement_with_dialect_helper!(
+        sql: "select a as \"a*\", b as \"b@\" from (select 1 as a , 2 as b)",
+        parser_dialect: GenericDialect {},
+        unparser_dialect: BigQueryDialect {},
+        expected: @"SELECT `a` AS `a_42`, `b` AS `b_64` FROM (SELECT 1 AS `a`, 2 AS `b`)",
+    );
+    roundtrip_statement_with_dialect_helper!(
+        sql: "select a as \"a*\", b , c as \"c@\" from (select 1 as a , 2 as b, 3 as c)",
+        parser_dialect: GenericDialect {},
+        unparser_dialect: BigQueryDialect {},
+        expected: @"SELECT `a` AS `a_42`, `b`, `c` AS `c_64` FROM (SELECT 1 AS `a`, 2 AS `b`, 3 AS `c`)",
+    );
+    roundtrip_statement_with_dialect_helper!(
+        sql: "select * from (select a as \"a*\", b as \"b@\" from (select 1 as a , 2 as b)) where \"a*\" = 1",
+        parser_dialect: GenericDialect {},
+        unparser_dialect: BigQueryDialect {},
+        expected: @"SELECT `a_42`, `b_64` FROM (SELECT `a` AS `a_42`, `b` AS `b_64` FROM (SELECT 1 AS `a`, 2 AS `b`)) WHERE (`a_42` = 1)",
+    );
+    roundtrip_statement_with_dialect_helper!(
+        sql: "select * from (select a as \"a*\", b as \"b@\" from (select 1 as a , 2 as b)) where \"a*\" = 1",
+        parser_dialect: GenericDialect {},
+        unparser_dialect: UnparserDefaultDialect {},
+        expected: @r#"SELECT "a*", "b@" FROM (SELECT a AS "a*", b AS "b@" FROM (SELECT 1 AS a, 2 AS b)) WHERE ("a*" = 1)"#,
+    );
+    Ok(())
+}
+
 #[test]
 fn test_unnest_logical_plan() -> Result<()> {
     let query = "select unnest(struct_col), unnest(array_col), struct_col, array_col from unnest_table";
@@ -939,11 +975,12 @@ fn test_unnest_logical_plan() -> Result<()> {
     let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: __unnest_placeholder(unnest_table.struct_col).field1, __unnest_placeholder(unnest_table.struct_col).field2, __unnest_placeholder(unnest_table.array_col,depth=1) AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
-  Unnest: lists[__unnest_placeholder(unnest_table.array_col)|depth=1] structs[__unnest_placeholder(unnest_table.struct_col)]
-    Projection: unnest_table.struct_col AS __unnest_placeholder(unnest_table.struct_col), unnest_table.array_col AS __unnest_placeholder(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
-      TableScan: unnest_table"#
+        @r"
+    Projection: __unnest_placeholder(unnest_table.struct_col).field1, __unnest_placeholder(unnest_table.struct_col).field2, __unnest_placeholder(unnest_table.array_col,depth=1) AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
+      Unnest: lists[__unnest_placeholder(unnest_table.array_col)|depth=1] structs[__unnest_placeholder(unnest_table.struct_col)]
+        Projection: unnest_table.struct_col AS __unnest_placeholder(unnest_table.struct_col), unnest_table.array_col AS __unnest_placeholder(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
+          TableScan: unnest_table
+    "
     );
 
     Ok(())
@@ -1197,6 +1234,84 @@ fn test_table_scan_with_empty_projection_in_plan_to_sql_3() {
     );
 }
 
+#[test]
+fn test_table_scan_with_empty_projection_in_plan_to_sql_postgres() {
+    let schema = test_schema();
+    let table_name = "table";
+    let plan = table_scan_with_empty_projection_and_none_projection_helper(
+        table_name,
+        schema,
+        Some(vec![]),
+    );
+    let unparser = Unparser::new(&UnparserPostgreSqlDialect {});
+    let sql = unparser.plan_to_sql(&plan).unwrap();
+    assert_snapshot!(
+        sql,
+        @r#"SELECT FROM "table""#
+    );
+}
+
+#[test]
+fn test_table_scan_with_empty_projection_in_plan_to_sql_default_dialect() {
+    let schema = test_schema();
+    let table_name = "table";
+    let plan = table_scan_with_empty_projection_and_none_projection_helper(
+        table_name,
+        schema,
+        Some(vec![]),
+    );
+    let unparser = Unparser::new(&UnparserDefaultDialect {});
+    let sql = unparser.plan_to_sql(&plan).unwrap();
+    assert_snapshot!(
+        sql,
+        @r#"SELECT 1 FROM "table""#
+    );
+}
+
+#[test]
+fn test_table_scan_with_empty_projection_and_filter_postgres() {
+    let schema = test_schema();
+    let table_name = "table";
+    let plan = table_scan_with_filter_and_fetch(
+        Some(table_name),
+        &schema,
+        Some(vec![]),
+        vec![col("id").gt(lit(10))],
+        None,
+    )
+    .unwrap()
+    .build()
+    .unwrap();
+    let unparser = Unparser::new(&UnparserPostgreSqlDialect {});
+    let sql = unparser.plan_to_sql(&plan).unwrap();
+    assert_snapshot!(
+        sql,
+        @r#"SELECT FROM "table" WHERE ("table"."id" > 10)"#
+    );
+}
+
+#[test]
+fn test_table_scan_with_empty_projection_and_filter_default_dialect() {
+    let schema = test_schema();
+    let table_name = "table";
+    let plan = table_scan_with_filter_and_fetch(
+        Some(table_name),
+        &schema,
+        Some(vec![]),
+        vec![col("id").gt(lit(10))],
+        None,
+    )
+    .unwrap()
+    .build()
+    .unwrap();
+    let unparser = Unparser::new(&UnparserDefaultDialect {});
+    let sql = unparser.plan_to_sql(&plan).unwrap();
+    assert_snapshot!(
+        sql,
+        @r#"SELECT 1 FROM "table" WHERE ("table".id > 10)"#
+    );
+}
+
 fn table_scan_with_empty_projection_and_none_projection_helper(
     table_name: &str,
     table_schema: Schema,
@@ -1266,7 +1381,7 @@ fn test_pretty_roundtrip() -> Result<()> {
         let expr =
             sql_to_rel.sql_to_expr(sql_expr, &df_schema, &mut PlannerContext::new())?;
         let round_trip_sql = unparser.expr_to_sql(&expr)?.to_string();
-        assert_eq!(pretty.to_string(), round_trip_sql);
+        assert_eq!((*pretty).to_string(), round_trip_sql);
 
         // verify that the pretty string parses to the same underlying Expr
         let pretty_sql_expr = Parser::new(&GenericDialect {})
@@ -1302,7 +1417,11 @@ where
             .with_aggregate_function(grouping_udaf())
             .with_window_function(rank_udwf())
             .with_scalar_function(Arc::new(unicode::substr().as_ref().clone()))
-            .with_scalar_function(make_array_udf()),
+            .with_scalar_function(make_array_udf())
+            .with_expr_planner(Arc::new(CoreFunctionPlanner::default()))
+            .with_expr_planner(Arc::new(UnicodeFunctionPlanner))
+            .with_expr_planner(Arc::new(NestedFunctionPlanner))
+            .with_expr_planner(Arc::new(FieldAccessPlanner)),
     };
     let sql_to_rel = SqlToRel::new(&context);
     let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap();
@@ -1346,7 +1465,7 @@ fn test_table_scan_alias() -> Result<()> {
     let sql = plan_to_sql(&plan)?;
     assert_snapshot!(
         sql,
-        @r#"SELECT * FROM (SELECT t1.id FROM t1 WHERE (t1.id > 5)) AS a"#
+        @"SELECT * FROM (SELECT t1.id FROM t1 WHERE (t1.id > 5)) AS a"
     );
 
     let table_scan_with_two_filter = table_scan_with_filters(
@@ -1361,7 +1480,7 @@ fn test_table_scan_alias() -> Result<()> {
     let table_scan_with_two_filter = plan_to_sql(&table_scan_with_two_filter)?;
     assert_snapshot!(
         table_scan_with_two_filter,
-        @r#"SELECT a.id FROM t1 AS a WHERE ((a.id > 1) AND (a.age < 2))"#
+        @"SELECT a.id FROM t1 AS a WHERE ((a.id > 1) AND (a.age < 2))"
     );
 
     let table_scan_with_fetch =
@@ -1372,7 +1491,7 @@ fn test_table_scan_alias() -> Result<()> {
     let table_scan_with_fetch = plan_to_sql(&table_scan_with_fetch)?;
     assert_snapshot!(
         table_scan_with_fetch,
-        @r#"SELECT a.id FROM (SELECT * FROM t1 LIMIT 10) AS a"#
+        @"SELECT a.id FROM (SELECT * FROM t1 LIMIT 10) AS a"
     );
 
     let table_scan_with_pushdown_all = table_scan_with_filter_and_fetch(
@@ -1388,7 +1507,7 @@ fn test_table_scan_alias() -> Result<()> {
     let table_scan_with_pushdown_all = plan_to_sql(&table_scan_with_pushdown_all)?;
     assert_snapshot!(
         table_scan_with_pushdown_all,
-        @r#"SELECT a.id FROM (SELECT a.id, a.age FROM t1 AS a WHERE (a.id > 1) LIMIT 10) AS a"#
+        @"SELECT a.id FROM (SELECT a.id, a.age FROM t1 AS a WHERE (a.id > 1) LIMIT 10) AS a"
     );
     Ok(())
 }
@@ -1404,21 +1523,21 @@ fn test_table_scan_pushdown() -> Result<()> {
     let scan_with_projection = plan_to_sql(&scan_with_projection)?;
     assert_snapshot!(
         scan_with_projection,
-        @r#"SELECT t1.id, t1.age FROM t1"#
+        @"SELECT t1.id, t1.age FROM t1"
     );
 
     let scan_with_projection = table_scan(Some("t1"), &schema, Some(vec![1]))?.build()?;
     let scan_with_projection = plan_to_sql(&scan_with_projection)?;
     assert_snapshot!(
         scan_with_projection,
-        @r#"SELECT t1.age FROM t1"#
+        @"SELECT t1.age FROM t1"
     );
 
     let scan_with_no_projection = table_scan(Some("t1"), &schema, None)?.build()?;
     let scan_with_no_projection = plan_to_sql(&scan_with_no_projection)?;
     assert_snapshot!(
         scan_with_no_projection,
-        @r#"SELECT * FROM t1"#
+        @"SELECT * FROM t1"
     );
 
     let table_scan_with_projection_alias =
@@ -1429,7 +1548,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_projection_alias)?;
     assert_snapshot!(
         table_scan_with_projection_alias,
-        @r#"SELECT ta.id, ta.age FROM t1 AS ta"#
+        @"SELECT ta.id, ta.age FROM t1 AS ta"
     );
 
     let table_scan_with_projection_alias =
@@ -1440,7 +1559,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_projection_alias)?;
     assert_snapshot!(
         table_scan_with_projection_alias,
-        @r#"SELECT ta.age FROM t1 AS ta"#
+        @"SELECT ta.age FROM t1 AS ta"
     );
 
     let table_scan_with_no_projection_alias = table_scan(Some("t1"), &schema, None)?
@@ -1450,7 +1569,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_no_projection_alias)?;
     assert_snapshot!(
         table_scan_with_no_projection_alias,
-        @r#"SELECT * FROM t1 AS ta"#
+        @"SELECT * FROM t1 AS ta"
     );
 
     let query_from_table_scan_with_projection = LogicalPlanBuilder::from(
@@ -1462,7 +1581,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&query_from_table_scan_with_projection)?;
     assert_snapshot!(
         query_from_table_scan_with_projection,
-        @r#"SELECT t1.id, t1.age FROM t1"#
+        @"SELECT t1.id, t1.age FROM t1"
     );
 
     let query_from_table_scan_with_two_projections = LogicalPlanBuilder::from(
@@ -1475,7 +1594,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&query_from_table_scan_with_two_projections)?;
     assert_snapshot!(
         query_from_table_scan_with_two_projections,
-        @r#"SELECT t1.id, t1.age FROM (SELECT t1.id, t1.age FROM t1)"#
+        @"SELECT t1.id, t1.age FROM (SELECT t1.id, t1.age FROM t1)"
     );
 
     let table_scan_with_filter = table_scan_with_filters(
@@ -1488,7 +1607,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_filter = plan_to_sql(&table_scan_with_filter)?;
     assert_snapshot!(
         table_scan_with_filter,
-        @r#"SELECT * FROM t1 WHERE (t1.id > t1.age)"#
+        @"SELECT * FROM t1 WHERE (t1.id > t1.age)"
     );
 
     let table_scan_with_two_filter = table_scan_with_filters(
@@ -1501,7 +1620,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_two_filter = plan_to_sql(&table_scan_with_two_filter)?;
     assert_snapshot!(
         table_scan_with_two_filter,
-        @r#"SELECT * FROM t1 WHERE ((t1.id > 1) AND (t1.age < 2))"#
+        @"SELECT * FROM t1 WHERE ((t1.id > 1) AND (t1.age < 2))"
     );
 
     let table_scan_with_filter_alias = table_scan_with_filters(
@@ -1515,7 +1634,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_filter_alias = plan_to_sql(&table_scan_with_filter_alias)?;
     assert_snapshot!(
         table_scan_with_filter_alias,
-        @r#"SELECT * FROM t1 AS ta WHERE (ta.id > ta.age)"#
+        @"SELECT * FROM t1 AS ta WHERE (ta.id > ta.age)"
     );
 
     let table_scan_with_projection_and_filter = table_scan_with_filters(
@@ -1529,7 +1648,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_projection_and_filter)?;
     assert_snapshot!(
         table_scan_with_projection_and_filter,
-        @r#"SELECT t1.id, t1.age FROM t1 WHERE (t1.id > t1.age)"#
+        @"SELECT t1.id, t1.age FROM t1 WHERE (t1.id > t1.age)"
     );
 
     let table_scan_with_projection_and_filter = table_scan_with_filters(
@@ -1543,7 +1662,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_projection_and_filter)?;
     assert_snapshot!(
         table_scan_with_projection_and_filter,
-        @r#"SELECT t1.age FROM t1 WHERE (t1.id > t1.age)"#
+        @"SELECT t1.age FROM t1 WHERE (t1.id > t1.age)"
     );
 
     let table_scan_with_inline_fetch =
@@ -1552,7 +1671,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_inline_fetch = plan_to_sql(&table_scan_with_inline_fetch)?;
     assert_snapshot!(
         table_scan_with_inline_fetch,
-        @r#"SELECT * FROM t1 LIMIT 10"#
+        @"SELECT * FROM t1 LIMIT 10"
     );
 
     let table_scan_with_projection_and_inline_fetch = table_scan_with_filter_and_fetch(
@@ -1567,7 +1686,7 @@ fn test_table_scan_pushdown() -> Result<()> {
         plan_to_sql(&table_scan_with_projection_and_inline_fetch)?;
     assert_snapshot!(
         table_scan_with_projection_and_inline_fetch,
-        @r#"SELECT t1.id, t1.age FROM t1 LIMIT 10"#
+        @"SELECT t1.id, t1.age FROM t1 LIMIT 10"
     );
 
     let table_scan_with_all = table_scan_with_filter_and_fetch(
@@ -1581,7 +1700,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_all = plan_to_sql(&table_scan_with_all)?;
     assert_snapshot!(
         table_scan_with_all,
-        @r#"SELECT t1.id, t1.age FROM t1 WHERE (t1.id > t1.age) LIMIT 10"#
+        @"SELECT t1.id, t1.age FROM t1 WHERE (t1.id > t1.age) LIMIT 10"
     );
 
     let table_scan_with_additional_filter = table_scan_with_filters(
@@ -1595,7 +1714,7 @@ fn test_table_scan_pushdown() -> Result<()> {
     let table_scan_with_filter = plan_to_sql(&table_scan_with_additional_filter)?;
     assert_snapshot!(
         table_scan_with_filter,
-        @r#"SELECT * FROM t1 WHERE (t1.id = 5) AND (t1.id > t1.age)"#
+        @"SELECT * FROM t1 WHERE (t1.id = 5) AND (t1.id > t1.age)"
     );
 
     Ok(())
@@ -1616,7 +1735,7 @@ fn test_sort_with_push_down_fetch() -> Result<()> {
     let sql = plan_to_sql(&plan)?;
     assert_snapshot!(
         sql,
-        @r#"SELECT t1.id, t1.age FROM t1 ORDER BY t1.age ASC NULLS FIRST LIMIT 10"#
+        @"SELECT t1.id, t1.age FROM t1 ORDER BY t1.age ASC NULLS FIRST LIMIT 10"
     );
     Ok(())
 }
@@ -1746,7 +1865,7 @@ fn test_interval_lhs_eq() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT (INTERVAL '2.000000000 SECS' = INTERVAL '2.000000000 SECS')"#
+        @"SELECT (INTERVAL '2.000000000 SECS' = INTERVAL '2.000000000 SECS')"
     )
 }
 
@@ -1758,7 +1877,7 @@ fn test_interval_lhs_lt() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT (INTERVAL '2.000000000 SECS' < INTERVAL '2.000000000 SECS')"#
+        @"SELECT (INTERVAL '2.000000000 SECS' < INTERVAL '2.000000000 SECS')"
     )
 }
 
@@ -1767,7 +1886,7 @@ fn test_without_offset() {
     let statement = generate_round_trip_statement(MySqlDialect {}, "select 1");
     assert_snapshot!(
         statement,
-        @r#"SELECT 1"#
+        @"SELECT 1"
     )
 }
 
@@ -1776,7 +1895,7 @@ fn test_with_offset0() {
     let statement = generate_round_trip_statement(MySqlDialect {}, "select 1 offset 0");
     assert_snapshot!(
         statement,
-        @r#"SELECT 1 OFFSET 0"#
+        @"SELECT 1 OFFSET 0"
     )
 }
 
@@ -1785,7 +1904,7 @@ fn test_with_offset95() {
     let statement = generate_round_trip_statement(MySqlDialect {}, "select 1 offset 95");
     assert_snapshot!(
         statement,
-        @r#"SELECT 1 OFFSET 95"#
+        @"SELECT 1 OFFSET 95"
     )
 }
 
@@ -1798,7 +1917,7 @@ fn test_order_by_to_sql_1() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT person.id, person.first_name, sum(person.id) FROM person GROUP BY person.id, person.first_name ORDER BY sum(person.id) ASC NULLS LAST, person.first_name DESC NULLS FIRST, person.id ASC NULLS LAST, person.first_name ASC NULLS LAST LIMIT 10"#
+        @"SELECT person.id, person.first_name, sum(person.id) FROM person GROUP BY person.id, person.first_name ORDER BY sum(person.id) ASC NULLS LAST, person.first_name DESC NULLS FIRST, person.id ASC NULLS LAST, person.first_name ASC NULLS LAST LIMIT 10"
     );
 }
 
@@ -1811,7 +1930,7 @@ fn test_order_by_to_sql_2() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT person.id, person.first_name, sum(person.id) AS total_sum FROM person GROUP BY person.id, person.first_name ORDER BY total_sum ASC NULLS LAST, person.first_name DESC NULLS FIRST, person.id ASC NULLS LAST, person.first_name ASC NULLS LAST LIMIT 10"#
+        @"SELECT person.id, person.first_name, sum(person.id) AS total_sum FROM person GROUP BY person.id, person.first_name ORDER BY total_sum ASC NULLS LAST, person.first_name DESC NULLS FIRST, person.id ASC NULLS LAST, person.first_name ASC NULLS LAST LIMIT 10"
     );
 }
 
@@ -1823,7 +1942,7 @@ fn test_order_by_to_sql_3() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT person.id, person.first_name, substr(person.first_name, 0, 5) FROM person ORDER BY person.id ASC NULLS LAST, substr(person.first_name, 0, 5) ASC NULLS LAST"#
+        @"SELECT person.id, person.first_name, substr(person.first_name, 0, 5) FROM person ORDER BY person.id ASC NULLS LAST, substr(person.first_name, 0, 5) ASC NULLS LAST"
     );
 }
 
@@ -1865,7 +1984,7 @@ fn test_complex_order_by_with_grouping() -> Result<()> {
     }, {
         assert_snapshot!(
             sql,
-            @r#"SELECT j1.j1_id, j1.j1_string, lochierarchy FROM (SELECT j1.j1_id, j1.j1_string, (grouping(j1.j1_id) + grouping(j1.j1_string)) AS lochierarchy, grouping(j1.j1_string), grouping(j1.j1_id) FROM j1 GROUP BY ROLLUP (j1.j1_id, j1.j1_string) ORDER BY (grouping(j1.j1_id) + grouping(j1.j1_string)) DESC NULLS FIRST, CASE WHEN ((grouping(j1.j1_id) + grouping(j1.j1_string)) = 0) THEN j1.j1_id END ASC NULLS LAST) LIMIT 100"#
+            @"SELECT j1.j1_id, j1.j1_string, (grouping(j1.j1_id) + grouping(j1.j1_string)) AS lochierarchy FROM j1 GROUP BY ROLLUP (j1.j1_id, j1.j1_string) ORDER BY lochierarchy DESC NULLS FIRST, CASE WHEN (lochierarchy = 0) THEN j1.j1_id END ASC NULLS LAST LIMIT 100"
         );
     });
 
@@ -1877,6 +1996,7 @@ fn test_aggregation_to_sql() {
     let sql = r#"SELECT id, first_name,
         SUM(id) AS total_sum,
         SUM(id) OVER (PARTITION BY first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS moving_sum,
+        SUM(id) FILTER (WHERE id > 50 AND first_name = 'John') OVER (PARTITION BY first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS filtered_sum,
         MAX(SUM(id)) OVER (PARTITION BY first_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS max_total,
         rank() OVER (PARTITION BY grouping(id) + grouping(age), CASE WHEN grouping(age) = 0 THEN id END ORDER BY sum(id) DESC) AS rank_within_parent_1,
         rank() OVER (PARTITION BY grouping(age) + grouping(id), CASE WHEN (CAST(grouping(age) AS BIGINT) = 0) THEN id END ORDER BY sum(id) DESC) AS rank_within_parent_2
@@ -1885,7 +2005,7 @@ fn test_aggregation_to_sql() {
     let statement = generate_round_trip_statement(GenericDialect {}, sql);
     assert_snapshot!(
         statement,
-        @"SELECT person.id, person.first_name, sum(person.id) AS total_sum, sum(person.id) OVER (PARTITION BY person.first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS moving_sum, max(sum(person.id)) OVER (PARTITION BY person.first_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS max_total, rank() OVER (PARTITION BY (grouping(person.id) + grouping(person.age)), CASE WHEN (grouping(person.age) = 0) THEN person.id END ORDER BY sum(person.id) DESC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank_within_parent_1, rank() OVER (PARTITION BY (grouping(person.age) + grouping(person.id)), CASE WHEN (CAST(grouping(person.age) AS BIGINT) = 0) THEN person.id END ORDER BY sum(person.id) DESC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank_within_parent_2 FROM person GROUP BY person.id, person.first_name",
+        @"SELECT person.id, person.first_name, sum(person.id) AS total_sum, sum(person.id) OVER (PARTITION BY person.first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS moving_sum, sum(person.id) FILTER (WHERE ((person.id > 50) AND (person.first_name = 'John'))) OVER (PARTITION BY person.first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS filtered_sum, max(sum(person.id)) OVER (PARTITION BY person.first_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS max_total, rank() OVER (PARTITION BY (grouping(person.id) + grouping(person.age)), CASE WHEN (grouping(person.age) = 0) THEN person.id END ORDER BY sum(person.id) DESC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank_within_parent_1, rank() OVER (PARTITION BY (grouping(person.age) + grouping(person.id)), CASE WHEN (CAST(grouping(person.age) AS BIGINT) = 0) THEN person.id END ORDER BY sum(person.id) DESC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank_within_parent_2 FROM person GROUP BY person.id, person.first_name",
     );
 }
 
@@ -1897,7 +2017,7 @@ fn test_unnest_to_sql_1() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT UNNEST(unnest_table.array_col) AS u1, unnest_table.struct_col, unnest_table.array_col FROM unnest_table WHERE (unnest_table.array_col <> NULL) ORDER BY unnest_table.struct_col ASC NULLS LAST, unnest_table.array_col ASC NULLS LAST"#
+        @"SELECT UNNEST(unnest_table.array_col) AS u1, unnest_table.struct_col, unnest_table.array_col FROM unnest_table WHERE (unnest_table.array_col <> NULL) ORDER BY unnest_table.struct_col ASC NULLS LAST, unnest_table.array_col ASC NULLS LAST"
     );
 }
 
@@ -1909,7 +2029,7 @@ fn test_unnest_to_sql_2() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT UNNEST([1, 2, 2, 5, NULL]) AS u1"#
+        @"SELECT UNNEST([1, 2, 2, 5, NULL]) AS u1"
     );
 }
 
@@ -1921,7 +2041,7 @@ fn test_join_with_no_conditions() {
     );
     assert_snapshot!(
         statement,
-        @r#"SELECT j1.j1_id, j1.j1_string FROM j1 CROSS JOIN j2"#
+        @"SELECT j1.j1_id, j1.j1_string FROM j1 CROSS JOIN j2"
     );
 }
 
@@ -2024,13 +2144,14 @@ fn test_unparse_extension_to_statement() -> Result<()> {
     let sql = unparser.plan_to_sql(&extension)?;
     assert_snapshot!(
         sql,
-        @r#"SELECT j1.j1_id, j1.j1_string FROM j1"#
+        @"SELECT j1.j1_id, j1.j1_string FROM j1"
     );
 
     if let Some(err) = plan_to_sql(&extension).err() {
         assert_contains!(
             err.to_string(),
-            "This feature is not implemented: Unsupported extension node: MockUserDefinedLogicalPlan");
+            "This feature is not implemented: Unsupported extension node: MockUserDefinedLogicalPlan"
+        );
     } else {
         panic!("Expected error");
     }
@@ -2089,7 +2210,7 @@ fn test_unparse_extension_to_sql() -> Result<()> {
     let sql = unparser.plan_to_sql(&plan)?;
     assert_snapshot!(
         sql,
-        @r#"SELECT j1.j1_id AS user_id FROM (SELECT j1.j1_id, j1.j1_string FROM j1)"#
+        @"SELECT j1.j1_id AS user_id FROM (SELECT j1.j1_id, j1.j1_string FROM j1)"
     );
 
     if let Some(err) = plan_to_sql(&plan).err() {
@@ -2130,15 +2251,13 @@ fn test_unparse_optimized_multi_union() -> Result<()> {
     });
     assert_snapshot!(
         unparser.plan_to_sql(&plan)?,
-        @r#"SELECT 1 AS x, 'a' AS y UNION ALL SELECT 1 AS x, 'b' AS y UNION ALL SELECT 2 AS x, 'a' AS y UNION ALL SELECT 2 AS x, 'c' AS y"#
+        @"SELECT 1 AS x, 'a' AS y UNION ALL SELECT 1 AS x, 'b' AS y UNION ALL SELECT 2 AS x, 'a' AS y UNION ALL SELECT 2 AS x, 'c' AS y"
     );
 
     let plan = LogicalPlan::Union(Union {
-        inputs: vec![project(
-            empty.clone(),
-            vec![lit(1).alias("x"), lit("a").alias("y")],
-        )?
-        .into()],
+        inputs: vec![
+            project(empty.clone(), vec![lit(1).alias("x"), lit("a").alias("y")])?.into(),
+        ],
         schema: dfschema.clone(),
     });
 
@@ -2211,7 +2330,7 @@ fn test_unparse_subquery_alias_with_table_pushdown() -> Result<()> {
     let sql = unparser.plan_to_sql(&plan)?;
     assert_snapshot!(
         sql,
-        @r#"SELECT customer_view.c_custkey, customer_view.c_name, customer_view.custkey_plus FROM (SELECT customer.c_custkey, (CAST(customer.c_custkey AS BIGINT) + 1) AS custkey_plus, customer.c_name FROM (SELECT customer.c_custkey, customer.c_name FROM customer AS customer) AS customer) AS customer_view"#
+        @"SELECT customer_view.c_custkey, customer_view.c_name, customer_view.custkey_plus FROM (SELECT customer.c_custkey, (CAST(customer.c_custkey AS BIGINT) + 1) AS custkey_plus, customer.c_name FROM (SELECT customer.c_custkey, customer.c_name FROM customer AS customer) AS customer) AS customer_view"
     );
     Ok(())
 }
@@ -2478,6 +2597,90 @@ fn test_unparse_left_semi_join_with_table_scan_projection() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn test_unparse_window() -> Result<()> {
+    // SubqueryAlias: t
+    // Projection: t.k, t.v, rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r
+    //     Filter: rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = UInt64(1)
+    //     WindowAggr: windowExpr=[[rank() PARTITION BY [t.k] ORDER BY [t.v ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+    //         TableScan: t projection=[k, v]
+
+    let schema = Schema::new(vec![
+        Field::new("k", DataType::Int32, false),
+        Field::new("v", DataType::Int32, false),
+    ]);
+    let window_expr = Expr::WindowFunction(Box::new(WindowFunction {
+        fun: WindowFunctionDefinition::WindowUDF(rank_udwf()),
+        params: WindowFunctionParams {
+            args: vec![],
+            partition_by: vec![col("k")],
+            order_by: vec![col("v").sort(true, true)],
+            window_frame: WindowFrame::new(None),
+            null_treatment: None,
+            distinct: false,
+            filter: None,
+        },
+    }));
+    let table = table_scan(Some("test"), &schema, Some(vec![0, 1]))?.build()?;
+    let plan = LogicalPlanBuilder::window_plan(table, vec![window_expr.clone()])?;
+
+    let name = plan.schema().fields().last().unwrap().name().clone();
+    let plan = LogicalPlanBuilder::from(plan)
+        .filter(col(name.clone()).eq(lit(1i64)))?
+        .project(vec![col("k"), col("v"), col(name)])?
+        .build()?;
+
+    let unparser = Unparser::new(&UnparserPostgreSqlDialect {});
+    let sql = unparser.plan_to_sql(&plan)?;
+    assert_snapshot!(
+        sql,
+        @r#"SELECT "test"."k", "test"."v", "rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "test"."k" AS "k", "test"."v" AS "v", rank() OVER (PARTITION BY "test"."k" ORDER BY "test"."v" ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM "test") AS "test" WHERE ("rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" = 1)"#
+    );
+
+    let unparser = Unparser::new(&UnparserMySqlDialect {});
+    let sql = unparser.plan_to_sql(&plan)?;
+    assert_snapshot!(
+        sql,
+        @"SELECT `test`.`k`, `test`.`v`, `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM (SELECT `test`.`k` AS `k`, `test`.`v` AS `v`, rank() OVER (PARTITION BY `test`.`k` ORDER BY `test`.`v` ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM `test`) AS `test` WHERE (`rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` = 1)"
+    );
+
+    let unparser = Unparser::new(&SqliteDialect {});
+    let sql = unparser.plan_to_sql(&plan)?;
+    assert_snapshot!(
+        sql,
+        @"SELECT `test`.`k`, `test`.`v`, `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM (SELECT `test`.`k` AS `k`, `test`.`v` AS `v`, rank() OVER (PARTITION BY `test`.`k` ORDER BY `test`.`v` ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` FROM `test`) AS `test` WHERE (`rank() PARTITION BY [test.k] ORDER BY [test.v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` = 1)"
+    );
+
+    let unparser = Unparser::new(&DefaultDialect {});
+    let sql = unparser.plan_to_sql(&plan)?;
+    assert_snapshot!(
+        sql,
+        @"SELECT test.k, test.v, rank() OVER (PARTITION BY test.k ORDER BY test.v ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) FROM test QUALIFY (rank() OVER (PARTITION BY test.k ORDER BY test.v ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) = 1)"
+    );
+
+    // without table qualifier
+    let table = table_scan(Some("test"), &schema, Some(vec![0, 1]))?.build()?;
+    let table = LogicalPlanBuilder::from(table)
+        .project(vec![col("k").alias("k"), col("v").alias("v")])?
+        .build()?;
+    let plan = LogicalPlanBuilder::window_plan(table, vec![window_expr])?;
+
+    let name = plan.schema().fields().last().unwrap().name().clone();
+    let plan = LogicalPlanBuilder::from(plan)
+        .filter(col(name.clone()).eq(lit(1i64)))?
+        .project(vec![col("k"), col("v"), col(name)])?
+        .build()?;
+
+    let unparser = Unparser::new(&UnparserPostgreSqlDialect {});
+    let sql = unparser.plan_to_sql(&plan)?;
+    assert_snapshot!(
+        sql,
+        @r#"SELECT "k", "v", "rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "k" AS "k", "v" AS "v", rank() OVER (PARTITION BY "k" ORDER BY "v" ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" FROM (SELECT "test"."k" AS "k", "test"."v" AS "v" FROM "test") AS "derived_projection") AS "__qualify_subquery" WHERE ("rank() PARTITION BY [k] ORDER BY [v ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" = 1)"#
+    );
+
+    Ok(())
+}
+
 #[test]
 fn test_like_filter() {
     let statement = generate_round_trip_statement(
@@ -2561,3 +2764,96 @@ fn test_not_ilike_filter_with_escape() {
         @"SELECT person.first_name FROM person WHERE person.first_name NOT ILIKE 'A!_%' ESCAPE '!'"
     );
 }
+
+#[test]
+fn test_struct_expr() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"WITH test AS (SELECT STRUCT(STRUCT('Product Name' as name) as product) AS metadata) SELECT metadata.product FROM test WHERE metadata.product.name  = 'Product Name'"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT test."metadata".product FROM (SELECT {product: {"name": 'Product Name'}} AS "metadata") AS test WHERE (test."metadata".product."name" = 'Product Name')"#
+    );
+
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"WITH test AS (SELECT STRUCT(STRUCT('Product Name' as name) as product) AS metadata) SELECT metadata.product FROM test WHERE metadata['product']['name']  = 'Product Name'"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT test."metadata".product FROM (SELECT {product: {"name": 'Product Name'}} AS "metadata") AS test WHERE (test."metadata".product."name" = 'Product Name')"#
+    );
+}
+
+#[test]
+fn test_struct_expr2() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"SELECT STRUCT(STRUCT('Product Name' as name) as product)['product']['name']  = 'Product Name';"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT ({product: {"name": 'Product Name'}}.product."name" = 'Product Name')"#
+    );
+}
+
+#[test]
+fn test_struct_expr3() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"WITH
+                test AS (
+                    SELECT
+                        STRUCT (
+                            STRUCT (
+                                STRUCT ('Product Name' as name) as product
+                            ) AS metadata
+                        ) AS c1
+                )
+            SELECT
+                c1.metadata.product.name
+            FROM
+                test"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT test.c1."metadata".product."name" FROM (SELECT {"metadata": {product: {"name": 'Product Name'}}} AS c1) AS test"#
+    );
+}
+
+#[test]
+fn test_json_access_1() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"SELECT j1_string:field FROM j1"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT (j1.j1_string : 'field') FROM j1"#
+    );
+}
+
+#[test]
+fn test_json_access_2() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"SELECT j1_string:field[0] FROM j1"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT (j1.j1_string : 'field[0]') FROM j1"#
+    );
+}
+
+#[test]
+fn test_json_access_3() {
+    let statement = generate_round_trip_statement(
+        GenericDialect {},
+        r#"SELECT j1_string:field.inner1['inner2'] FROM j1"#,
+    );
+    assert_snapshot!(
+        statement,
+        @r#"SELECT (j1.j1_string : 'field.inner1[''inner2'']') FROM j1"#
+    );
+}
diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs
index ee1b761970def..5caade300290f 100644
--- a/datafusion/sql/tests/common/mod.rs
+++ b/datafusion/sql/tests/common/mod.rs
@@ -23,8 +23,9 @@ use std::{sync::Arc, vec};
 
 use arrow::datatypes::*;
 use datafusion_common::config::ConfigOptions;
+use datafusion_common::datatype::DataTypeExt;
 use datafusion_common::file_options::file_type::FileType;
-use datafusion_common::{plan_err, DFSchema, GetExt, Result, TableReference};
+use datafusion_common::{DFSchema, GetExt, Result, TableReference, plan_err};
 use datafusion_expr::planner::{ExprPlanner, PlannerResult, TypePlanner};
 use datafusion_expr::{AggregateUDF, Expr, ScalarUDF, TableSource, WindowUDF};
 use datafusion_functions_nested::expr_fn::make_array;
@@ -151,14 +152,36 @@ impl ContextProvider for MockContextProvider {
                 ),
                 Field::new("😀", DataType::Int32, false),
             ])),
+            "person_with_uuid_extension" => Ok(Schema::new(vec![
+                Field::new("id", DataType::FixedSizeBinary(16), false).with_metadata(
+                    [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())]
+                        .into(),
+                ),
+                Field::new("first_name", DataType::Utf8, false),
+                Field::new("last_name", DataType::Utf8, false),
+            ])),
             "orders" => Ok(Schema::new(vec![
                 Field::new("order_id", DataType::UInt32, false),
+                Field::new("o_orderkey", DataType::UInt32, false),
+                Field::new("o_custkey", DataType::UInt32, false),
+                Field::new("o_orderstatus", DataType::Utf8, false),
                 Field::new("customer_id", DataType::UInt32, false),
+                Field::new("o_totalprice", DataType::Decimal128(15, 2), false),
                 Field::new("o_item_id", DataType::Utf8, false),
                 Field::new("qty", DataType::Int32, false),
                 Field::new("price", DataType::Float64, false),
                 Field::new("delivered", DataType::Boolean, false),
             ])),
+            "customer" => Ok(Schema::new(vec![
+                Field::new("c_custkey", DataType::UInt32, false),
+                Field::new("c_name", DataType::Utf8, false),
+                Field::new("c_address", DataType::Utf8, false),
+                Field::new("c_nationkey", DataType::UInt32, false),
+                Field::new("c_phone", DataType::Utf8, false),
+                Field::new("c_acctbal", DataType::Float64, false),
+                Field::new("c_mktsegment", DataType::Utf8, false),
+                Field::new("c_comment", DataType::Utf8, false),
+            ])),
             "array" => Ok(Schema::new(vec![
                 Field::new(
                     "left",
@@ -178,8 +201,10 @@ impl ContextProvider for MockContextProvider {
                 ),
             ])),
             "lineitem" => Ok(Schema::new(vec![
+                Field::new("l_orderkey", DataType::UInt32, false),
                 Field::new("l_item_id", DataType::UInt32, false),
                 Field::new("l_description", DataType::Utf8, false),
+                Field::new("l_extendedprice", DataType::Decimal128(15, 2), false),
                 Field::new("price", DataType::Float64, false),
             ])),
             "aggregate_test_100" => Ok(Schema::new(vec![
@@ -219,6 +244,11 @@ impl ContextProvider for MockContextProvider {
                     false,
                 ),
             ])),
+            "@quoted_identifier_names_table" => Ok(Schema::new(vec![Field::new(
+                "@column",
+                DataType::UInt32,
+                false,
+            )])),
             _ => plan_err!("No table named: {} found", name.table()),
         };
 
@@ -236,8 +266,11 @@ impl ContextProvider for MockContextProvider {
         self.state.aggregate_functions.get(name).cloned()
     }
 
-    fn get_variable_type(&self, _: &[String]) -> Option<DataType> {
-        unimplemented!()
+    fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType> {
+        match variable_names {
+            [var] if var == "@variable" => Some(DataType::Date32),
+            _ => unimplemented!(),
+        }
     }
 
     fn get_window_meta(&self, name: &str) -> Option<Arc<WindowUDF>> {
@@ -309,8 +342,17 @@ impl TableSource for EmptyTable {
 pub struct CustomTypePlanner {}
 
 impl TypePlanner for CustomTypePlanner {
-    fn plan_type(&self, sql_type: &sqlparser::ast::DataType) -> Result<Option<DataType>> {
+    fn plan_type_field(
+        &self,
+        sql_type: &sqlparser::ast::DataType,
+    ) -> Result<Option<FieldRef>> {
         match sql_type {
+            sqlparser::ast::DataType::Uuid => Ok(Some(Arc::new(
+                Field::new("", DataType::FixedSizeBinary(16), true).with_metadata(
+                    [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())]
+                        .into(),
+                ),
+            ))),
             sqlparser::ast::DataType::Datetime(precision) => {
                 let precision = match precision {
                     Some(0) => TimeUnit::Second,
@@ -319,7 +361,9 @@ impl TypePlanner for CustomTypePlanner {
                     None | Some(9) => TimeUnit::Nanosecond,
                     _ => unreachable!(),
                 };
-                Ok(Some(DataType::Timestamp(precision, None)))
+                Ok(Some(
+                    DataType::Timestamp(precision, None).into_nullable_field_ref(),
+                ))
             }
             _ => Ok(None),
         }
diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs
index 4be7953aefc0a..29c17be69ce5f 100644
--- a/datafusion/sql/tests/sql_integration.rs
+++ b/datafusion/sql/tests/sql_integration.rs
@@ -15,34 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
+// This lint violation is acceptable for tests, so suppress for now
+// Issue: <https://github.com/apache/datafusion/issues/18503>
+#![expect(clippy::needless_pass_by_value)]
+
 use std::any::Any;
+use std::hash::Hash;
 #[cfg(test)]
 use std::sync::Arc;
 use std::vec;
 
 use arrow::datatypes::{TimeUnit::Nanosecond, *};
 use common::MockContextProvider;
-use datafusion_common::{assert_contains, DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, assert_contains};
 use datafusion_expr::{
-    col, logical_plan::LogicalPlan, test::function_stub::sum_udaf, ColumnarValue,
-    CreateIndex, DdlStatement, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
-    Volatility,
+    ColumnarValue, CreateIndex, DdlStatement, ScalarFunctionArgs, ScalarUDF,
+    ScalarUDFImpl, Signature, Volatility, col, logical_plan::LogicalPlan,
+    test::function_stub::sum_udaf,
 };
 use datafusion_functions::{string, unicode};
 use datafusion_sql::{
     parser::DFParser,
-    planner::{ParserOptions, SqlToRel},
+    planner::{NullOrdering, ParserOptions, SqlToRel},
 };
 
 use crate::common::{CustomExprPlanner, CustomTypePlanner, MockSessionState};
 use datafusion_functions::core::planner::CoreFunctionPlanner;
 use datafusion_functions_aggregate::{
-    approx_median::approx_median_udaf, count::count_udaf, min_max::max_udaf,
-    min_max::min_udaf,
+    approx_median::approx_median_udaf,
+    average::avg_udaf,
+    count::count_udaf,
+    grouping::grouping_udaf,
+    min_max::{max_udaf, min_udaf},
 };
-use datafusion_functions_aggregate::{average::avg_udaf, grouping::grouping_udaf};
 use datafusion_functions_nested::make_array::make_array_udf;
-use datafusion_functions_window::rank::rank_udwf;
+use datafusion_functions_window::{rank::rank_udwf, row_number::row_number_udwf};
 use insta::{allow_duplicates, assert_snapshot};
 use rstest::rstest;
 use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect};
@@ -57,10 +64,10 @@ fn parse_decimals_1() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Int64(1)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Int64(1)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -71,10 +78,10 @@ fn parse_decimals_2() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Int64(1)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Int64(1)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -85,10 +92,10 @@ fn parse_decimals_3() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(1),1,1)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(1),1,1)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -99,10 +106,10 @@ fn parse_decimals_4() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(1),2,2)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(1),2,2)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -113,10 +120,10 @@ fn parse_decimals_5() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(10),2,1)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(10),2,1)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -127,10 +134,10 @@ fn parse_decimals_6() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(1001),4,2)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(1001),4,2)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -141,10 +148,10 @@ fn parse_decimals_7() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(1000000000000000000000),22,2)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(1000000000000000000000),22,2)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -155,10 +162,10 @@ fn parse_decimals_8() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: UInt64(18446744073709551615)
-          EmptyRelation
-        "#
+        @r"
+    Projection: UInt64(18446744073709551615)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -169,10 +176,10 @@ fn parse_decimals_9() {
     let plan = logical_plan_with_options(sql, options).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Decimal128(Some(18446744073709551616),20,0)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Decimal128(Some(18446744073709551616),20,0)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -184,9 +191,9 @@ fn parse_ident_normalization_1() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: character_length(Utf8("str"))
-          EmptyRelation
-        "#
+    Projection: character_length(Utf8("str"))
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -198,9 +205,9 @@ fn parse_ident_normalization_2() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: concat(Utf8("Hello"), Utf8("World"))
-          EmptyRelation
-        "#
+    Projection: concat(Utf8("Hello"), Utf8("World"))
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -211,10 +218,10 @@ fn parse_ident_normalization_3() {
     let plan = logical_plan_with_options(sql, parser_option).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age
-          TableScan: person
-        "#
+        @r"
+    Projection: person.age
+      TableScan: person
+    "
     );
 }
 
@@ -225,10 +232,26 @@ fn parse_ident_normalization_4() {
     let plan = logical_plan_with_options(sql, parser_option).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age
-          TableScan: person
-        "#
+        @r"
+    Projection: person.age
+      TableScan: person
+    "
+    );
+}
+
+#[test]
+fn within_group_rejected_for_non_ordered_set_udaf() {
+    // MIN is order-sensitive by nature but does not implement the
+    // ordered-set `WITHIN GROUP` opt-in. The planner must reject
+    // explicit `WITHIN GROUP` syntax for functions that do not
+    // advertise `supports_within_group_clause()`.
+    let sql = "SELECT min(c1) WITHIN GROUP (ORDER BY c1) FROM person";
+    let err = logical_plan(sql)
+        .expect_err("expected planning to fail for MIN WITHIN GROUP")
+        .to_string();
+    assert_contains!(
+        err,
+        "WITHIN GROUP is only supported for ordered-set aggregate functions"
     );
 }
 
@@ -241,9 +264,7 @@ fn parse_ident_normalization_5() {
         .strip_backtrace();
     assert_snapshot!(
         plan,
-        @r#"
-        Error during planning: No table named: PERSON found
-        "#
+        @"Error during planning: No table named: PERSON found"
     );
 }
 
@@ -254,10 +275,10 @@ fn parse_ident_normalization_6() {
     let plan = logical_plan_with_options(sql, parser_option).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: UPPERCASE_test.Id
-          TableScan: UPPERCASE_test
-        "#
+        @r"
+    Projection: UPPERCASE_test.Id
+      TableScan: UPPERCASE_test
+    "
     );
 }
 
@@ -268,10 +289,10 @@ fn parse_ident_normalization_7() {
     let plan = logical_plan_with_options(sql, parser_option).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: UPPERCASE_test.Id, UPPERCASE_test.lower
-          TableScan: UPPERCASE_test
-        "#
+        @r"
+    Projection: UPPERCASE_test.Id, UPPERCASE_test.lower
+      TableScan: UPPERCASE_test
+    "
     );
 }
 
@@ -280,10 +301,10 @@ fn select_no_relation() {
     let plan = logical_plan("SELECT 1").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: Int64(1)
-          EmptyRelation
-        "#
+        @r"
+    Projection: Int64(1)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -292,10 +313,10 @@ fn test_real_f32() {
     let plan = logical_plan("SELECT CAST(1.1 AS REAL)").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(Float64(1.1) AS Float32)
-          EmptyRelation
-        "#
+        @r"
+    Projection: CAST(Float64(1.1) AS Float32)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -304,10 +325,10 @@ fn test_int_decimal_default() {
     let plan = logical_plan("SELECT CAST(10 AS DECIMAL)").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(Int64(10) AS Decimal128(38, 10))
-          EmptyRelation
-        "#
+        @r"
+    Projection: CAST(Int64(10) AS Decimal128(38, 10))
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -316,10 +337,10 @@ fn test_int_decimal_no_scale() {
     let plan = logical_plan("SELECT CAST(10 AS DECIMAL(5))").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(Int64(10) AS Decimal128(5, 0))
-          EmptyRelation
-        "#
+        @r"
+    Projection: CAST(Int64(10) AS Decimal128(5, 0))
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -328,10 +349,10 @@ fn test_tinyint() {
     let plan = logical_plan("SELECT CAST(6 AS TINYINT)").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(Int64(6) AS Int8)
-          EmptyRelation
-        "#
+        @r"
+    Projection: CAST(Int64(6) AS Int8)
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -340,11 +361,11 @@ fn cast_from_subquery() {
     let plan = logical_plan("SELECT CAST (a AS FLOAT) FROM (SELECT 1 AS a)").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(a AS Float32)
-          Projection: Int64(1) AS a
-            EmptyRelation
-        "#
+        @r"
+    Projection: CAST(a AS Float32)
+      Projection: Int64(1) AS a
+        EmptyRelation: rows=1
+    "
     );
 }
 
@@ -353,11 +374,11 @@ fn try_cast_from_aggregation() {
     let plan = logical_plan("SELECT TRY_CAST(sum(age) AS FLOAT) FROM person").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: TRY_CAST(sum(person.age) AS Float32)
-          Aggregate: groupBy=[[]], aggr=[[sum(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: TRY_CAST(sum(person.age) AS Float32)
+      Aggregate: groupBy=[[]], aggr=[[sum(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -369,7 +390,7 @@ fn cast_to_invalid_decimal_type_precision_0() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Error during planning: Decimal(precision = 0, scale = 0) should satisfy `0 < precision <= 76`, and `scale <= precision`."
+        @"Error during planning: Decimal(precision = 0, scale = 0) should satisfy `0 < precision <= 76`, and `scale <= precision`."
     );
 }
 
@@ -380,10 +401,10 @@ fn cast_to_invalid_decimal_type_precision_gt_38() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: CAST(Int64(10) AS Decimal256(39, 0))
-          EmptyRelation
-        "#
+        @r"
+    Projection: CAST(Int64(10) AS Decimal256(39, 0))
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -395,7 +416,7 @@ fn cast_to_invalid_decimal_type_precision_gt_76() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Error during planning: Decimal(precision = 79, scale = 0) should satisfy `0 < precision <= 76`, and `scale <= precision`."
+        @"Error during planning: Decimal(precision = 79, scale = 0) should satisfy `0 < precision <= 76`, and `scale <= precision`."
     );
 }
 
@@ -407,7 +428,7 @@ fn cast_to_invalid_decimal_type_precision_lt_scale() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Error during planning: Decimal(precision = 5, scale = 10) should satisfy `0 < precision <= 76`, and `scale <= precision`."
+        @"Error during planning: Decimal(precision = 5, scale = 10) should satisfy `0 < precision <= 76`, and `scale <= precision`."
     );
 }
 
@@ -418,9 +439,9 @@ fn plan_create_table_with_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0])]
+      EmptyRelation: rows=0
+    "#
     );
 
     let sql = "create table person (id int primary key, name string)";
@@ -428,9 +449,9 @@ fn plan_create_table_with_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0])]
+      EmptyRelation: rows=0
+    "#
     );
 
     let sql =
@@ -439,9 +460,9 @@ fn plan_create_table_with_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0]), Unique([1])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0]), Unique([1])]
+      EmptyRelation: rows=0
+    "#
     );
 
     let sql = "create table person (id int, name varchar,  primary key(name,  id));";
@@ -449,9 +470,9 @@ fn plan_create_table_with_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([1, 0])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([1, 0])]
+      EmptyRelation: rows=0
+    "#
     );
 }
 
@@ -462,9 +483,9 @@ fn plan_create_table_with_multi_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0]), PrimaryKey([1])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[PrimaryKey([0]), PrimaryKey([1])]
+      EmptyRelation: rows=0
+    "#
     );
 }
 
@@ -475,9 +496,9 @@ fn plan_create_table_with_unique() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[Unique([0])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[Unique([0])]
+      EmptyRelation: rows=0
+    "#
     );
 }
 
@@ -488,9 +509,9 @@ fn plan_create_table_no_pk() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" }
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" }
+      EmptyRelation: rows=0
+    "#
     );
 }
 
@@ -501,9 +522,9 @@ fn plan_create_table_check_constraint() {
     assert_snapshot!(
         plan,
         @r#"
-        CreateMemoryTable: Bare { table: "person" } constraints=[Unique([0])]
-          EmptyRelation
-        "#
+    CreateMemoryTable: Bare { table: "person" } constraints=[Unique([0])]
+      EmptyRelation: rows=0
+    "#
     );
 }
 
@@ -513,9 +534,7 @@ fn plan_start_transaction() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionStart: ReadWrite Serializable
-        "#
+        @"TransactionStart: ReadWrite Serializable"
     );
 }
 
@@ -525,9 +544,7 @@ fn plan_start_transaction_isolation() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionStart: ReadWrite ReadCommitted
-        "#
+        @"TransactionStart: ReadWrite ReadCommitted"
     );
 }
 
@@ -537,9 +554,7 @@ fn plan_start_transaction_read_only() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionStart: ReadOnly Serializable
-        "#
+        @"TransactionStart: ReadOnly Serializable"
     );
 }
 
@@ -549,9 +564,7 @@ fn plan_start_transaction_fully_qualified() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionStart: ReadOnly ReadCommitted
-        "#
+        @"TransactionStart: ReadOnly ReadCommitted"
     );
 }
 
@@ -565,9 +578,7 @@ isolation level repeatable read
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionStart: ReadOnly RepeatableRead
-        "#
+        @"TransactionStart: ReadOnly RepeatableRead"
     );
 }
 
@@ -577,9 +588,7 @@ fn plan_commit_transaction() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionEnd: Commit chain:=false
-        "#
+        @"TransactionEnd: Commit chain:=false"
     );
 }
 
@@ -589,9 +598,7 @@ fn plan_commit_transaction_chained() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionEnd: Commit chain:=true
-        "#
+        @"TransactionEnd: Commit chain:=true"
     );
 }
 
@@ -601,9 +608,7 @@ fn plan_rollback_transaction() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionEnd: Rollback chain:=false
-        "#
+        @"TransactionEnd: Rollback chain:=false"
     );
 }
 
@@ -613,9 +618,7 @@ fn plan_rollback_transaction_chained() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        TransactionEnd: Rollback chain:=true
-        "#
+        @"TransactionEnd: Rollback chain:=true"
     );
 }
 
@@ -625,10 +628,10 @@ fn plan_copy_to() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        CopyTo: format=csv output_url=output.csv options: ()
-          TableScan: test_decimal
-        "#
+        @r"
+    CopyTo: format=csv output_url=output.csv options: ()
+      TableScan: test_decimal
+    "
     );
 }
 
@@ -638,11 +641,11 @@ fn plan_explain_copy_to() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Explain
-          CopyTo: format=csv output_url=output.csv options: ()
-            TableScan: test_decimal
-        "#
+        @r"
+    Explain
+      CopyTo: format=csv output_url=output.csv options: ()
+        TableScan: test_decimal
+    "
     );
 }
 
@@ -652,11 +655,11 @@ fn plan_explain_copy_to_format() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Explain
-          CopyTo: format=csv output_url=output.tbl options: ()
-            TableScan: test_decimal
-        "#
+        @r"
+    Explain
+      CopyTo: format=csv output_url=output.tbl options: ()
+        TableScan: test_decimal
+    "
     );
 }
 
@@ -668,10 +671,10 @@ fn plan_insert() {
     assert_snapshot!(
         plan,
         @r#"
-        Dml: op=[Insert Into] table=[person]
-          Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀
-            Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing"))
-        "#
+    Dml: op=[Insert Into] table=[person]
+      Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀
+        Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing"))
+    "#
     );
 }
 
@@ -681,11 +684,11 @@ fn plan_insert_no_target_columns() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Dml: op=[Insert Into] table=[test_decimal]
-          Projection: column1 AS id, column2 AS price
-            Values: (CAST(Int64(1) AS Int32), CAST(Int64(2) AS Decimal128(10, 2))), (CAST(Int64(3) AS Int32), CAST(Int64(4) AS Decimal128(10, 2)))
-        "#
+        @r"
+    Dml: op=[Insert Into] table=[test_decimal]
+      Projection: column1 AS id, column2 AS price
+        Values: (CAST(Int64(1) AS Int32), CAST(Int64(2) AS Decimal128(10, 2))), (CAST(Int64(3) AS Int32), CAST(Int64(4) AS Decimal128(10, 2)))
+    "
     );
 }
 
@@ -728,17 +731,17 @@ fn plan_update() {
     assert_snapshot!(
         plan,
         @r#"
-        Dml: op=[Update] table=[person]
-          Projection: person.id AS id, person.first_name AS first_name, Utf8("Kay") AS last_name, person.age AS age, person.state AS state, person.salary AS salary, person.birth_date AS birth_date, person.😀 AS 😀
-            Filter: person.id = Int64(1)
-              TableScan: person
-        "#
+    Dml: op=[Update] table=[person]
+      Projection: person.id AS id, person.first_name AS first_name, Utf8("Kay") AS last_name, person.age AS age, person.state AS state, person.salary AS salary, person.birth_date AS birth_date, person.😀 AS 😀
+        Filter: person.id = Int64(1)
+          TableScan: person
+    "#
     );
 }
 
 #[rstest]
-#[case::missing_assignement_target("UPDATE person SET doesnotexist = true")]
-#[case::missing_assignement_expression("UPDATE person SET age = doesnotexist + 42")]
+#[case::missing_assignment_target("UPDATE person SET doesnotexist = true")]
+#[case::missing_assignment_expression("UPDATE person SET age = doesnotexist + 42")]
 #[case::missing_selection_expression(
     "UPDATE person SET age = 42 WHERE doesnotexist = true"
 )]
@@ -754,11 +757,11 @@ fn plan_delete() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Dml: op=[Delete] table=[person]
-          Filter: person.id = Int64(1)
-            TableScan: person
-        "#
+        @r"
+    Dml: op=[Delete] table=[person]
+      Filter: person.id = Int64(1)
+        TableScan: person
+    "
     );
 }
 
@@ -769,11 +772,11 @@ fn plan_delete_quoted_identifier_case_sensitive() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Dml: op=[Delete] table=[SomeCatalog.SomeSchema.UPPERCASE_test]
-          Filter: SomeCatalog.SomeSchema.UPPERCASE_test.Id = Int64(1)
-            TableScan: SomeCatalog.SomeSchema.UPPERCASE_test
-        "#
+        @r"
+    Dml: op=[Delete] table=[SomeCatalog.SomeSchema.UPPERCASE_test]
+      Filter: SomeCatalog.SomeSchema.UPPERCASE_test.Id = Int64(1)
+        TableScan: SomeCatalog.SomeSchema.UPPERCASE_test
+    "
     );
 }
 
@@ -791,9 +794,7 @@ fn select_repeated_column() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Projections require unique expression names but the expression "person.age" at position 0 and "person.age" at position 1 have the same name. Consider aliasing ("AS") one of them.
-        "#
+        @r#"Error during planning: Projections require unique expression names but the expression "person.age" at position 0 and "person.age" at position 1 have the same name. Consider aliasing ("AS") one of them."#
     );
 }
 
@@ -802,10 +803,10 @@ fn select_scalar_func_with_literal_no_relation() {
     let plan = logical_plan("SELECT sqrt(9)").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: sqrt(Int64(9))
-          EmptyRelation
-        "#
+        @r"
+    Projection: sqrt(Int64(9))
+      EmptyRelation: rows=1
+    "
     );
 }
 
@@ -817,10 +818,10 @@ fn select_simple_filter() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.id, person.first_name, person.last_name
-          Filter: person.state = Utf8("CO")
-            TableScan: person
-        "#
+    Projection: person.id, person.first_name, person.last_name
+      Filter: person.state = Utf8("CO")
+        TableScan: person
+    "#
     );
 }
 
@@ -845,11 +846,11 @@ fn select_neg_filter() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.id, person.first_name, person.last_name
-          Filter: NOT person.state
-            TableScan: person
-        "#
+        @r"
+    Projection: person.id, person.first_name, person.last_name
+      Filter: NOT person.state
+        TableScan: person
+    "
     );
 }
 
@@ -861,10 +862,10 @@ fn select_compound_filter() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.id, person.first_name, person.last_name
-          Filter: person.state = Utf8("CO") AND person.age >= Int64(21) AND person.age <= Int64(65)
-            TableScan: person
-        "#
+    Projection: person.id, person.first_name, person.last_name
+      Filter: person.state = Utf8("CO") AND person.age >= Int64(21) AND person.age <= Int64(65)
+        TableScan: person
+    "#
     );
 }
 
@@ -874,11 +875,11 @@ fn test_timestamp_filter() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state
-          Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(Second, None)) AS Timestamp(Nanosecond, None))
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state
+      Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(s)) AS Timestamp(ns))
+        TableScan: person
+    "
     );
 }
 
@@ -889,10 +890,10 @@ fn test_date_filter() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.state
-          Filter: person.birth_date < CAST(Utf8("2020-01-01") AS Date32)
-            TableScan: person
-        "#
+    Projection: person.state
+      Filter: person.birth_date < CAST(Utf8("2020-01-01") AS Date32)
+        TableScan: person
+    "#
     );
 }
 
@@ -909,11 +910,11 @@ fn select_all_boolean_operators() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age, person.first_name, person.last_name
-          Filter: person.age = Int64(21) AND person.age != Int64(21) AND person.age > Int64(21) AND person.age >= Int64(21) AND person.age < Int64(65) AND person.age <= Int64(65)
-            TableScan: person
-        "#
+        @r"
+    Projection: person.age, person.first_name, person.last_name
+      Filter: person.age = Int64(21) AND person.age != Int64(21) AND person.age > Int64(21) AND person.age >= Int64(21) AND person.age < Int64(65) AND person.age <= Int64(65)
+        TableScan: person
+    "
     );
 }
 
@@ -923,11 +924,11 @@ fn select_between() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state
-          Filter: person.age BETWEEN Int64(21) AND Int64(65)
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state
+      Filter: person.age BETWEEN Int64(21) AND Int64(65)
+        TableScan: person
+    "
     );
 }
 
@@ -937,11 +938,11 @@ fn select_between_negated() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state
-          Filter: person.age NOT BETWEEN Int64(21) AND Int64(65)
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state
+      Filter: person.age NOT BETWEEN Int64(21) AND Int64(65)
+        TableScan: person
+    "
     );
 }
 
@@ -958,14 +959,14 @@ fn select_nested() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: b.fn2, b.last_name
-          SubqueryAlias: b
-            Projection: a.fn1 AS fn2, a.last_name, a.birth_date
-              SubqueryAlias: a
-                Projection: person.first_name AS fn1, person.last_name, person.birth_date, person.age
-                  TableScan: person
-        "#
+        @r"
+    Projection: b.fn2, b.last_name
+      SubqueryAlias: b
+        Projection: a.fn1 AS fn2, a.last_name, a.birth_date
+          SubqueryAlias: a
+            Projection: person.first_name AS fn1, person.last_name, person.birth_date, person.age
+              TableScan: person
+    "
     );
 }
 
@@ -982,29 +983,29 @@ fn select_nested_with_filters() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: a.fn1, a.age
-          Filter: a.fn1 = Utf8("X") AND a.age < Int64(30)
-            SubqueryAlias: a
-              Projection: person.first_name AS fn1, person.age
-                Filter: person.age > Int64(20)
-                  TableScan: person
-        "#
+    Projection: a.fn1, a.age
+      Filter: a.fn1 = Utf8("X") AND a.age < Int64(30)
+        SubqueryAlias: a
+          Projection: person.first_name AS fn1, person.age
+            Filter: person.age > Int64(20)
+              TableScan: person
+    "#
     );
 }
 
 #[test]
 fn table_with_column_alias() {
-    let sql = "SELECT a, b, c
-                   FROM lineitem l (a, b, c)";
+    let sql = "SELECT a, b, c, d, e
+                   FROM lineitem l (a, b, c, d, e)";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: l.a, l.b, l.c
-          SubqueryAlias: l
-            Projection: lineitem.l_item_id AS a, lineitem.l_description AS b, lineitem.price AS c
-              TableScan: lineitem
-        "#
+        @r"
+    Projection: l.a, l.b, l.c, l.d, l.e
+      SubqueryAlias: l
+        Projection: lineitem.l_orderkey AS a, lineitem.l_item_id AS b, lineitem.l_description AS c, lineitem.l_extendedprice AS d, lineitem.price AS e
+          TableScan: lineitem
+    "
     );
 }
 
@@ -1016,7 +1017,7 @@ fn table_with_column_alias_number_cols() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Error during planning: Source table contains 3 columns but only 2 names given as column alias"
+        @"Error during planning: Source table contains 5 columns but only 2 names given as column alias"
     );
 }
 
@@ -1027,7 +1028,7 @@ fn select_with_ambiguous_column() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Schema error: Ambiguous reference to unqualified field id"
+        @"Schema error: Ambiguous reference to unqualified field id"
     );
 }
 
@@ -1038,14 +1039,14 @@ fn join_with_ambiguous_column() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: a.id
-          Inner Join: Using a.id = b.id
-            SubqueryAlias: a
-              TableScan: person
-            SubqueryAlias: b
-              TableScan: person
-        "#
+        @r"
+    Projection: a.id
+      Inner Join: Using a.id = b.id
+        SubqueryAlias: a
+          TableScan: person
+        SubqueryAlias: b
+          TableScan: person
+    "
     );
 }
 
@@ -1055,14 +1056,14 @@ fn natural_left_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: a.l_item_id
-          Left Join: Using a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.price = b.price
-            SubqueryAlias: a
-              TableScan: lineitem
-            SubqueryAlias: b
-              TableScan: lineitem
-        "#
+        @r"
+    Projection: a.l_item_id
+      Left Join: Using a.l_orderkey = b.l_orderkey, a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.l_extendedprice = b.l_extendedprice, a.price = b.price
+        SubqueryAlias: a
+          TableScan: lineitem
+        SubqueryAlias: b
+          TableScan: lineitem
+    "
     );
 }
 
@@ -1072,14 +1073,14 @@ fn natural_right_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: a.l_item_id
-          Right Join: Using a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.price = b.price
-            SubqueryAlias: a
-              TableScan: lineitem
-            SubqueryAlias: b
-              TableScan: lineitem
-        "#
+        @r"
+    Projection: a.l_item_id
+      Right Join: Using a.l_orderkey = b.l_orderkey, a.l_item_id = b.l_item_id, a.l_description = b.l_description, a.l_extendedprice = b.l_extendedprice, a.price = b.price
+        SubqueryAlias: a
+          TableScan: lineitem
+        SubqueryAlias: b
+          TableScan: lineitem
+    "
     );
 }
 
@@ -1092,7 +1093,7 @@ fn select_with_having() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r"Error during planning: HAVING clause references: person.age > Int64(100) AND person.age < Int64(200) must appear in the GROUP BY clause or be used in an aggregate function"
+        @"Error during planning: HAVING clause references: person.age > Int64(100) AND person.age < Int64(200) must appear in the GROUP BY clause or be used in an aggregate function"
     );
 }
 
@@ -1105,9 +1106,7 @@ fn select_with_having_referencing_column_not_in_select() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: HAVING clause references: person.first_name = Utf8("M") must appear in the GROUP BY clause or be used in an aggregate function
-        "#
+        @r#"Error during planning: HAVING clause references: person.first_name = Utf8("M") must appear in the GROUP BY clause or be used in an aggregate function"#
     );
 }
 
@@ -1121,9 +1120,7 @@ fn select_with_having_refers_to_invalid_column() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.first_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.id, max(person.age)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.first_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.id, max(person.age)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -1136,9 +1133,7 @@ fn select_with_having_referencing_column_nested_in_select_expression() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: HAVING clause references: person.age > Int64(100) must appear in the GROUP BY clause or be used in an aggregate function
-        "#
+        @"Error during planning: HAVING clause references: person.age > Int64(100) must appear in the GROUP BY clause or be used in an aggregate function"
     );
 }
 
@@ -1163,12 +1158,12 @@ fn select_aggregate_with_having_that_reuses_aggregate() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: max(person.age)
-          Filter: max(person.age) < Int64(30)
-            Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: max(person.age)
+      Filter: max(person.age) < Int64(30)
+        Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1181,11 +1176,11 @@ fn select_aggregate_with_having_with_aggregate_not_in_select() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: max(person.age)
-          Filter: max(person.first_name) > Utf8("M")
-            Aggregate: groupBy=[[]], aggr=[[max(person.age), max(person.first_name)]]
-              TableScan: person
-        "#
+    Projection: max(person.age)
+      Filter: max(person.first_name) > Utf8("M")
+        Aggregate: groupBy=[[]], aggr=[[max(person.age), max(person.first_name)]]
+          TableScan: person
+    "#
     );
 }
 
@@ -1198,9 +1193,7 @@ fn select_aggregate_with_having_referencing_column_not_in_select() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.first_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "count(*)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.first_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "count(*)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -1213,12 +1206,12 @@ fn select_aggregate_aliased_with_having_referencing_aggregate_by_its_alias() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: max(person.age) AS max_age
-          Filter: max(person.age) < Int64(30)
-            Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: max(person.age) AS max_age
+      Filter: max(person.age) < Int64(30)
+        Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1230,12 +1223,12 @@ fn select_aggregate_aliased_with_having_that_reuses_aggregate_but_not_by_its_ali
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: max(person.age) AS max_age
-          Filter: max(person.age) < Int64(30)
-            Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: max(person.age) AS max_age
+      Filter: max(person.age) < Int64(30)
+        Aggregate: groupBy=[[]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1249,11 +1242,11 @@ fn select_aggregate_with_group_by_with_having() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: person.first_name = Utf8("M")
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+    Projection: person.first_name, max(person.age)
+      Filter: person.first_name = Utf8("M")
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "#
     );
 }
 
@@ -1267,13 +1260,13 @@ fn select_aggregate_with_group_by_with_having_and_where() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) < Int64(100)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              Filter: person.id > Int64(5)
-                TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) < Int64(100)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          Filter: person.id > Int64(5)
+            TableScan: person
+    "
     );
 }
 
@@ -1287,13 +1280,13 @@ fn select_aggregate_with_group_by_with_having_and_where_filtering_on_aggregate_c
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) < Int64(100)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              Filter: person.id > Int64(5) AND person.age > Int64(18)
-                TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) < Int64(100)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          Filter: person.id > Int64(5) AND person.age > Int64(18)
+            TableScan: person
+    "
     );
 }
 
@@ -1307,17 +1300,17 @@ fn select_aggregate_with_group_by_with_having_using_column_by_alias() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.first_name AS fn, max(person.age)
-          Filter: max(person.age) > Int64(2) AND person.first_name = Utf8("M")
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+    Projection: person.first_name AS fn, max(person.age)
+      Filter: max(person.age) > Int64(2) AND person.first_name = Utf8("M")
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "#
     );
 }
 
 #[test]
-fn select_aggregate_with_group_by_with_having_using_columns_with_and_without_their_aliases(
-) {
+fn select_aggregate_with_group_by_with_having_using_columns_with_and_without_their_aliases()
+ {
     let sql = "SELECT first_name AS fn, MAX(age) AS max_age
                    FROM person
                    GROUP BY first_name
@@ -1326,11 +1319,11 @@ fn select_aggregate_with_group_by_with_having_using_columns_with_and_without_the
     assert_snapshot!(
         plan,
         @r#"
-        Projection: person.first_name AS fn, max(person.age) AS max_age
-          Filter: max(person.age) > Int64(2) AND max(person.age) < Int64(5) AND person.first_name = Utf8("M") AND person.first_name = Utf8("N")
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+    Projection: person.first_name AS fn, max(person.age) AS max_age
+      Filter: max(person.age) > Int64(2) AND max(person.age) < Int64(5) AND person.first_name = Utf8("M") AND person.first_name = Utf8("N")
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "#
     );
 }
 
@@ -1343,12 +1336,12 @@ fn select_aggregate_with_group_by_with_having_that_reuses_aggregate() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) > Int64(100)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) > Int64(100)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1362,9 +1355,7 @@ fn select_aggregate_with_group_by_with_having_referencing_column_not_in_group_by
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.last_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.first_name, max(person.age)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in HAVING must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.last_name" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.first_name, max(person.age)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -1377,12 +1368,12 @@ fn select_aggregate_with_group_by_with_having_that_reuses_aggregate_multiple_tim
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) > Int64(100) AND max(person.age) < Int64(200)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) > Int64(100) AND max(person.age) < Int64(200)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1395,12 +1386,12 @@ fn select_aggregate_with_group_by_with_having_using_aggregate_not_in_select() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) > Int64(100) AND min(person.id) < Int64(50)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), min(person.id)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) > Int64(100) AND min(person.id) < Int64(50)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), min(person.id)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1414,18 +1405,18 @@ fn select_aggregate_aliased_with_group_by_with_having_referencing_aggregate_by_i
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age) AS max_age
-          Filter: max(person.age) > Int64(100)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age) AS max_age
+      Filter: max(person.age) > Int64(100)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
 #[test]
-fn select_aggregate_compound_aliased_with_group_by_with_having_referencing_compound_aggregate_by_its_alias(
-) {
+fn select_aggregate_compound_aliased_with_group_by_with_having_referencing_compound_aggregate_by_its_alias()
+ {
     let sql = "SELECT first_name, MAX(age) + 1 AS max_age_plus_one
                    FROM person
                    GROUP BY first_name
@@ -1433,18 +1424,18 @@ fn select_aggregate_compound_aliased_with_group_by_with_having_referencing_compo
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age) + Int64(1) AS max_age_plus_one
-          Filter: max(person.age) + Int64(1) > Int64(100)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age) + Int64(1) AS max_age_plus_one
+      Filter: max(person.age) + Int64(1) > Int64(100)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age)]]
+          TableScan: person
+    "
     );
 }
 
 #[test]
-fn select_aggregate_with_group_by_with_having_using_derived_column_aggregate_not_in_select(
-) {
+fn select_aggregate_with_group_by_with_having_using_derived_column_aggregate_not_in_select()
+ {
     let sql = "SELECT first_name, MAX(age)
                    FROM person
                    GROUP BY first_name
@@ -1452,12 +1443,12 @@ fn select_aggregate_with_group_by_with_having_using_derived_column_aggregate_not
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) > Int64(100) AND min(person.id - Int64(2)) < Int64(50)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), min(person.id - Int64(2))]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) > Int64(100) AND min(person.id - Int64(2)) < Int64(50)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), min(person.id - Int64(2))]]
+          TableScan: person
+    "
     );
 }
 
@@ -1470,12 +1461,12 @@ fn select_aggregate_with_group_by_with_having_using_count_star_not_in_select() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.first_name, max(person.age)
-          Filter: max(person.age) > Int64(100) AND count(*) < Int64(50)
-            Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), count(*)]]
-              TableScan: person
-        "#
+        @r"
+    Projection: person.first_name, max(person.age)
+      Filter: max(person.age) > Int64(100) AND count(*) < Int64(50)
+        Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.age), count(*)]]
+          TableScan: person
+    "
     );
 }
 
@@ -1485,10 +1476,10 @@ fn select_binary_expr() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age + person.salary
-          TableScan: person
-        "#
+        @r"
+    Projection: person.age + person.salary
+      TableScan: person
+    "
     );
 }
 
@@ -1498,10 +1489,10 @@ fn select_binary_expr_nested() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: (person.age + person.salary) / Int64(2)
-          TableScan: person
-        "#
+        @r"
+    Projection: (person.age + person.salary) / Int64(2)
+      TableScan: person
+    "
     );
 }
 
@@ -1510,11 +1501,11 @@ fn select_simple_aggregate() {
     let plan = logical_plan("SELECT MIN(age) FROM person").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.age)
-          Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.age)
+      Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1523,11 +1514,11 @@ fn test_sum_aggregate() {
     let plan = logical_plan("SELECT sum(age) from person").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: sum(person.age)
-          Aggregate: groupBy=[[]], aggr=[[sum(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: sum(person.age)
+      Aggregate: groupBy=[[]], aggr=[[sum(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1545,9 +1536,7 @@ fn select_simple_aggregate_repeated_aggregate() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 0 and "min(person.age)" at position 1 have the same name. Consider aliasing ("AS") one of them.
-        "#
+        @r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 0 and "min(person.age)" at position 1 have the same name. Consider aliasing ("AS") one of them."#
     );
 }
 
@@ -1556,11 +1545,11 @@ fn select_simple_aggregate_repeated_aggregate_with_single_alias() {
     let plan = logical_plan("SELECT MIN(age), MIN(age) AS a FROM person").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.age), min(person.age) AS a
-          Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.age), min(person.age) AS a
+      Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1569,11 +1558,11 @@ fn select_simple_aggregate_repeated_aggregate_with_unique_aliases() {
     let plan = logical_plan("SELECT MIN(age) AS a, MIN(age) AS b FROM person").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.age) AS a, min(person.age) AS b
-          Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.age) AS a, min(person.age) AS b
+      Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1585,11 +1574,11 @@ fn select_from_typed_string_values() {
     assert_snapshot!(
         plan,
         @r#"
-        Projection: t.col1, t.col2
-          SubqueryAlias: t
-            Projection: column1 AS col1, column2 AS col2
-              Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(Nanosecond, None)), CAST(Utf8("2004-04-09") AS Date32))
-        "#
+    Projection: t.col1, t.col2
+      SubqueryAlias: t
+        Projection: column1 AS col1, column2 AS col2
+          Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(ns)), CAST(Utf8("2004-04-09") AS Date32))
+    "#
     );
 }
 
@@ -1600,9 +1589,7 @@ fn select_simple_aggregate_repeated_aggregate_with_repeated_aliases() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Projections require unique expression names but the expression "min(person.age) AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them.
-        "#
+        @r#"Error during planning: Projections require unique expression names but the expression "min(person.age) AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."#
     );
 }
 
@@ -1613,11 +1600,11 @@ fn select_simple_aggregate_with_groupby() {
             .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, min(person.age), max(person.age)
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age), max(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, min(person.age), max(person.age)
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age), max(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1628,11 +1615,11 @@ fn select_simple_aggregate_with_groupby_with_aliases() {
             .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state AS a, min(person.age) AS b
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state AS a, min(person.age) AS b
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1643,9 +1630,7 @@ fn select_simple_aggregate_with_groupby_with_aliases_repeated() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Projections require unique expression names but the expression "person.state AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them.
-        "#
+        @r#"Error during planning: Projections require unique expression names but the expression "person.state AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."#
     );
 }
 
@@ -1655,11 +1640,11 @@ fn select_simple_aggregate_with_groupby_column_unselected() {
         logical_plan("SELECT MIN(age), MAX(age) FROM person GROUP BY state").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.age), max(person.age)
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age), max(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.age), max(person.age)
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age), max(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1670,9 +1655,7 @@ fn select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist()
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Schema error: No field named doesnotexist. Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀".
-        "#
+        @r#"Schema error: No field named doesnotexist. Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀"."#
     );
 }
 
@@ -1690,9 +1673,7 @@ fn select_interval_out_of_range() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Arrow error: Invalid argument error: Unable to represent 100000000000000000 days in a signed 32-bit integer
-        "#
+        @"Arrow error: Invalid argument error: Unable to represent 100000000000000000 days in a signed 32-bit integer"
     );
 }
 
@@ -1702,11 +1683,11 @@ fn select_simple_aggregate_with_groupby_and_column_is_in_aggregate_and_groupby()
         logical_plan("SELECT MAX(first_name) FROM person GROUP BY first_name").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: max(person.first_name)
-          Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: max(person.first_name)
+      Aggregate: groupBy=[[person.first_name]], aggr=[[max(person.first_name)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1716,21 +1697,21 @@ fn select_simple_aggregate_with_groupby_can_use_positions() {
         .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, person.age AS b, count(Int64(1))
-          Aggregate: groupBy=[[person.state, person.age]], aggr=[[count(Int64(1))]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, person.age AS b, count(Int64(1))
+      Aggregate: groupBy=[[person.state, person.age]], aggr=[[count(Int64(1))]]
+        TableScan: person
+    "
     );
     let plan = logical_plan("SELECT state, age AS b, count(1) FROM person GROUP BY 2, 1")
         .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, person.age AS b, count(Int64(1))
-          Aggregate: groupBy=[[person.age, person.state]], aggr=[[count(Int64(1))]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, person.age AS b, count(Int64(1))
+      Aggregate: groupBy=[[person.age, person.state]], aggr=[[count(Int64(1))]]
+        TableScan: person
+    "
     );
 }
 
@@ -1741,9 +1722,7 @@ fn select_simple_aggregate_with_groupby_position_out_of_range() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Cannot find column with position 0 in SELECT clause. Valid columns: 1 to 2
-        "#
+        @"Error during planning: Cannot find column with position 0 in SELECT clause. Valid columns: 1 to 2"
     );
 
     let sql2 = "SELECT state, MIN(age) FROM person GROUP BY 5";
@@ -1751,9 +1730,7 @@ fn select_simple_aggregate_with_groupby_position_out_of_range() {
 
     assert_snapshot!(
         err2.strip_backtrace(),
-        @r#"
-        Error during planning: Cannot find column with position 5 in SELECT clause. Valid columns: 1 to 2
-        "#
+        @"Error during planning: Cannot find column with position 5 in SELECT clause. Valid columns: 1 to 2"
     );
 }
 
@@ -1763,11 +1740,11 @@ fn select_simple_aggregate_with_groupby_can_use_alias() {
         logical_plan("SELECT state AS a, MIN(age) AS b FROM person GROUP BY a").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state AS a, min(person.age) AS b
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state AS a, min(person.age) AS b
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1778,9 +1755,7 @@ fn select_simple_aggregate_with_groupby_aggregate_repeated() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 1 and "min(person.age)" at position 2 have the same name. Consider aliasing ("AS") one of them.
-        "#
+        @r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 1 and "min(person.age)" at position 2 have the same name. Consider aliasing ("AS") one of them."#
     );
 }
 
@@ -1791,11 +1766,11 @@ fn select_simple_aggregate_with_groupby_aggregate_repeated_and_one_has_alias() {
             .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, min(person.age), min(person.age) AS ma
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, min(person.age), min(person.age) AS ma
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1805,11 +1780,11 @@ fn select_simple_aggregate_with_groupby_non_column_expression_unselected() {
         logical_plan("SELECT MIN(first_name) FROM person GROUP BY age + 1").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.first_name)
-          Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.first_name)
+      Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1820,22 +1795,22 @@ fn select_simple_aggregate_with_groupby_non_column_expression_selected_and_resol
             .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age + Int64(1), min(person.first_name)
-          Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.age + Int64(1), min(person.first_name)
+      Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
+        TableScan: person
+    "
     );
     let plan =
         logical_plan("SELECT MIN(first_name), age + 1 FROM person GROUP BY age + 1")
             .unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: min(person.first_name), person.age + Int64(1)
-          Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: min(person.first_name), person.age + Int64(1)
+      Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1846,11 +1821,11 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_resolva
     ).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age + Int64(1) / Int64(2) * person.age + Int64(1), min(person.first_name)
-          Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.age + Int64(1) / Int64(2) * person.age + Int64(1), min(person.first_name)
+      Aggregate: groupBy=[[person.age + Int64(1)]], aggr=[[min(person.first_name)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1863,9 +1838,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_nested_and_not_res
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.age" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.age + Int64(1), min(person.first_name)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.age" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.age + Int64(1), min(person.first_name)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -1876,9 +1849,7 @@ fn select_simple_aggregate_with_groupby_non_column_expression_and_its_column_sel
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.age" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.age + Int64(1), min(person.first_name)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.age" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.age + Int64(1), min(person.first_name)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -1888,11 +1859,11 @@ fn select_simple_aggregate_nested_in_binary_expr_with_groupby() {
         logical_plan("SELECT state, MIN(age) < 10 FROM person GROUP BY state").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, min(person.age) < Int64(10)
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, min(person.age) < Int64(10)
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1902,11 +1873,11 @@ fn select_simple_aggregate_and_nested_groupby_column() {
         logical_plan("SELECT MAX(first_name), age + 1 FROM person GROUP BY age").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: max(person.first_name), person.age + Int64(1)
-          Aggregate: groupBy=[[person.age]], aggr=[[max(person.first_name)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: max(person.first_name), person.age + Int64(1)
+      Aggregate: groupBy=[[person.age]], aggr=[[max(person.first_name)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1915,11 +1886,11 @@ fn select_aggregate_compounded_with_groupby_column() {
     let plan = logical_plan("SELECT age + MIN(salary) FROM person GROUP BY age").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.age + min(person.salary)
-          Aggregate: groupBy=[[person.age]], aggr=[[min(person.salary)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.age + min(person.salary)
+      Aggregate: groupBy=[[person.age]], aggr=[[min(person.salary)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1929,11 +1900,11 @@ fn select_aggregate_with_non_column_inner_expression_with_groupby() {
         logical_plan("SELECT state, MIN(age + 1) FROM person GROUP BY state").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: person.state, min(person.age + Int64(1))
-          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age + Int64(1))]]
-            TableScan: person
-        "#
+        @r"
+    Projection: person.state, min(person.age + Int64(1))
+      Aggregate: groupBy=[[person.state]], aggr=[[min(person.age + Int64(1))]]
+        TableScan: person
+    "
     );
 }
 
@@ -1943,11 +1914,11 @@ fn select_count_one() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: count(Int64(1))
-  Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
-    TableScan: person
-"#
+        @r"
+    Projection: count(Int64(1))
+      Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+        TableScan: person
+    "
     );
 }
 
@@ -1957,11 +1928,11 @@ fn select_count_column() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: count(person.id)
-  Aggregate: groupBy=[[]], aggr=[[count(person.id)]]
-    TableScan: person
-"#
+        @r"
+    Projection: count(person.id)
+      Aggregate: groupBy=[[]], aggr=[[count(person.id)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1971,11 +1942,11 @@ fn select_approx_median() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: approx_median(person.age)
-  Aggregate: groupBy=[[]], aggr=[[approx_median(person.age)]]
-    TableScan: person
-"#
+        @r"
+    Projection: approx_median(person.age)
+      Aggregate: groupBy=[[]], aggr=[[approx_median(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -1985,10 +1956,10 @@ fn select_scalar_func() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: sqrt(person.age)
-  TableScan: person
-"#
+        @r"
+    Projection: sqrt(person.age)
+      TableScan: person
+    "
     );
 }
 
@@ -1998,10 +1969,10 @@ fn select_aliased_scalar_func() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: sqrt(person.age) AS square_people
-  TableScan: person
-"#
+        @r"
+    Projection: sqrt(person.age) AS square_people
+      TableScan: person
+    "
     );
 }
 
@@ -2012,11 +1983,11 @@ fn select_where_nullif_division() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: aggregate_test_100.c3 / (aggregate_test_100.c4 + aggregate_test_100.c5)
-  Filter: aggregate_test_100.c3 / nullif(aggregate_test_100.c4 + aggregate_test_100.c5, Int64(0)) > Float64(0.1)
-    TableScan: aggregate_test_100
-"#
+        @r"
+    Projection: aggregate_test_100.c3 / (aggregate_test_100.c4 + aggregate_test_100.c5)
+      Filter: aggregate_test_100.c3 / nullif(aggregate_test_100.c4 + aggregate_test_100.c5, Int64(0)) > Float64(0.1)
+        TableScan: aggregate_test_100
+    "
     );
 }
 
@@ -2026,11 +1997,11 @@ fn select_where_with_negative_operator() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: aggregate_test_100.c3
-  Filter: aggregate_test_100.c3 > Float64(-0.1) AND (- aggregate_test_100.c4) > Int64(0)
-    TableScan: aggregate_test_100
-"#
+        @r"
+    Projection: aggregate_test_100.c3
+      Filter: aggregate_test_100.c3 > Float64(-0.1) AND (- aggregate_test_100.c4) > Int64(0)
+        TableScan: aggregate_test_100
+    "
     );
 }
 
@@ -2040,11 +2011,11 @@ fn select_where_with_positive_operator() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: aggregate_test_100.c3
-  Filter: aggregate_test_100.c3 > Float64(0.1) AND aggregate_test_100.c4 > Int64(0)
-    TableScan: aggregate_test_100
-"#
+        @r"
+    Projection: aggregate_test_100.c3
+      Filter: aggregate_test_100.c3 > Float64(0.1) AND aggregate_test_100.c4 > Int64(0)
+        TableScan: aggregate_test_100
+    "
     );
 }
 
@@ -2056,11 +2027,11 @@ fn select_where_compound_identifiers() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: public.aggregate_test_100.c3
-  Filter: public.aggregate_test_100.c3 > Float64(0.1)
-    TableScan: public.aggregate_test_100
-"#
+        @r"
+    Projection: public.aggregate_test_100.c3
+      Filter: public.aggregate_test_100.c3 > Float64(0.1)
+        TableScan: public.aggregate_test_100
+    "
     );
 }
 
@@ -2070,11 +2041,11 @@ fn select_order_by_index() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id ASC NULLS LAST
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Sort: person.id ASC NULLS LAST
+      Projection: person.id
+        TableScan: person
+    "
     );
 }
 
@@ -2084,11 +2055,11 @@ fn select_order_by_multiple_index() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id ASC NULLS LAST, person.age ASC NULLS LAST
-  Projection: person.id, person.state, person.age
-    TableScan: person
-"#
+        @r"
+    Sort: person.id ASC NULLS LAST, person.age ASC NULLS LAST
+      Projection: person.id, person.state, person.age
+        TableScan: person
+    "
     );
 }
 
@@ -2101,9 +2072,7 @@ fn select_order_by_index_of_0() {
 
     assert_snapshot!(
         err,
-        @r#"
-        Error during planning: Order by index starts at 1 for column indexes
-        "#
+        @"Error during planning: Order by index starts at 1 for column indexes"
     );
 }
 
@@ -2116,9 +2085,7 @@ fn select_order_by_index_oob() {
 
     assert_snapshot!(
         err,
-        @r#"
-        Error during planning: Order by column out of bounds, specified: 2, max: 1
-        "#
+        @"Error during planning: Order by column out of bounds, specified: 2, max: 1"
     );
 }
 
@@ -2128,11 +2095,11 @@ fn select_with_order_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id ASC NULLS LAST
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Sort: person.id ASC NULLS LAST
+      Projection: person.id
+        TableScan: person
+    "
     );
 }
 
@@ -2142,11 +2109,11 @@ fn select_order_by_desc() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id DESC NULLS FIRST
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Sort: person.id DESC NULLS FIRST
+      Projection: person.id
+        TableScan: person
+    "
     );
 }
 
@@ -2155,21 +2122,21 @@ fn select_order_by_nulls_last() {
     let plan = logical_plan("SELECT id FROM person ORDER BY id DESC NULLS LAST").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id DESC NULLS LAST
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Sort: person.id DESC NULLS LAST
+      Projection: person.id
+        TableScan: person
+    "
     );
 
     let plan = logical_plan("SELECT id FROM person ORDER BY id NULLS LAST").unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: person.id ASC NULLS LAST
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Sort: person.id ASC NULLS LAST
+      Projection: person.id
+        TableScan: person
+    "
     );
 }
 
@@ -2179,11 +2146,11 @@ fn select_group_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.state
-  Aggregate: groupBy=[[person.state]], aggr=[[]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.state
+      Aggregate: groupBy=[[person.state]], aggr=[[]]
+        TableScan: person
+    "
     );
 }
 
@@ -2193,11 +2160,11 @@ fn select_group_by_columns_not_in_select() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: max(person.age)
-  Aggregate: groupBy=[[person.state]], aggr=[[max(person.age)]]
-    TableScan: person
-"#
+        @r"
+    Projection: max(person.age)
+      Aggregate: groupBy=[[person.state]], aggr=[[max(person.age)]]
+        TableScan: person
+    "
     );
 }
 
@@ -2207,11 +2174,11 @@ fn select_group_by_count_star() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.state, count(*)
-  Aggregate: groupBy=[[person.state]], aggr=[[count(*)]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.state, count(*)
+      Aggregate: groupBy=[[person.state]], aggr=[[count(*)]]
+        TableScan: person
+    "
     );
 }
 
@@ -2221,11 +2188,11 @@ fn select_group_by_needs_projection() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-        Projection: count(person.state), person.state
-          Aggregate: groupBy=[[person.state]], aggr=[[count(person.state)]]
-            TableScan: person
-        "#
+        @r"
+    Projection: count(person.state), person.state
+      Aggregate: groupBy=[[person.state]], aggr=[[count(person.state)]]
+        TableScan: person
+    "
     );
 }
 
@@ -2235,11 +2202,11 @@ fn select_7480_1() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: aggregate_test_100.c1, min(aggregate_test_100.c12)
-  Aggregate: groupBy=[[aggregate_test_100.c1, aggregate_test_100.c13]], aggr=[[min(aggregate_test_100.c12)]]
-    TableScan: aggregate_test_100
-"#
+        @r"
+    Projection: aggregate_test_100.c1, min(aggregate_test_100.c12)
+      Aggregate: groupBy=[[aggregate_test_100.c1, aggregate_test_100.c13]], aggr=[[min(aggregate_test_100.c12)]]
+        TableScan: aggregate_test_100
+    "
     );
 }
 
@@ -2250,9 +2217,7 @@ fn select_7480_2() {
 
     assert_snapshot!(
         err.strip_backtrace(),
-        @r#"
-        Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "aggregate_test_100.c13" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "aggregate_test_100.c1, min(aggregate_test_100.c12)" appears in the SELECT clause satisfies this requirement
-        "#
+        @r#"Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "aggregate_test_100.c13" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "aggregate_test_100.c1, min(aggregate_test_100.c12)" appears in the SELECT clause satisfies this requirement"#
     );
 }
 
@@ -2262,9 +2227,7 @@ fn create_external_table_csv() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "t" }
-"#
+        @r#"CreateExternalTable: Bare { table: "t" }"#
     );
 }
 
@@ -2274,9 +2237,7 @@ fn create_external_table_with_pk() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "t" } constraints=[PrimaryKey([0])]
-    "#
+        @r#"CreateExternalTable: Bare { table: "t" } constraints=[PrimaryKey([0])]"#
     );
 }
 
@@ -2286,9 +2247,7 @@ fn create_external_table_wih_schema() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Partial { schema: "staging", table: "foo" }
-"#
+        @r#"CreateExternalTable: Partial { schema: "staging", table: "foo" }"#
     );
 }
 
@@ -2298,9 +2257,7 @@ fn create_schema_with_quoted_name() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateCatalogSchema: "quoted_schema_name"
-"#
+        @r#"CreateCatalogSchema: "quoted_schema_name""#
     );
 }
 
@@ -2310,9 +2267,7 @@ fn create_schema_with_quoted_unnormalized_name() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateCatalogSchema: "Foo"
-"#
+        @r#"CreateCatalogSchema: "Foo""#
     );
 }
 
@@ -2322,9 +2277,7 @@ fn create_schema_with_unquoted_normalized_name() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateCatalogSchema: "foo"
-"#
+        @r#"CreateCatalogSchema: "foo""#
     );
 }
 
@@ -2334,9 +2287,7 @@ fn create_external_table_custom() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "dt" }
-"#
+        @r#"CreateExternalTable: Bare { table: "dt" }"#
     );
 }
 
@@ -2346,9 +2297,7 @@ fn create_external_table_csv_no_schema() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "t" }
-"#
+        @r#"CreateExternalTable: Bare { table: "t" }"#
     );
 }
 
@@ -2361,16 +2310,14 @@ fn create_external_table_with_compression_type() {
         "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.gz' OPTIONS ('format.compression' 'gzip')",
         "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.bz2' OPTIONS ('format.compression' 'bzip2')",
         "CREATE EXTERNAL TABLE t(c1 int) STORED AS NONSTANDARD LOCATION 'foo.unk' OPTIONS ('format.compression' 'gzip')",
-         ];
+    ];
 
     allow_duplicates! {
         for sql in sqls {
             let plan = logical_plan(sql).unwrap();
             assert_snapshot!(
                 plan,
-                @r#"
-                CreateExternalTable: Bare { table: "t" }
-                "#
+                @r#"CreateExternalTable: Bare { table: "t" }"#
             );
         }
 
@@ -2392,9 +2339,7 @@ fn create_external_table_with_compression_type() {
 
             assert_snapshot!(
                 err.strip_backtrace(),
-                @r#"
-                Error during planning: File compression type cannot be set for PARQUET, AVRO, or ARROW files.
-                "#
+                @"Error during planning: File compression type cannot be set for PARQUET, AVRO, or ARROW files."
             );
 
         }
@@ -2407,9 +2352,7 @@ fn create_external_table_parquet() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "t" }
-"#
+        @r#"CreateExternalTable: Bare { table: "t" }"#
     );
 }
 
@@ -2419,9 +2362,7 @@ fn create_external_table_parquet_sort_order() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "foo" }
-"#
+        @r#"CreateExternalTable: Bare { table: "foo" }"#
     );
 }
 
@@ -2441,9 +2382,7 @@ fn create_external_table_parquet_no_schema_sort_order() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-CreateExternalTable: Bare { table: "t" }
-"#
+        @r#"CreateExternalTable: Bare { table: "t" }"#
     );
 }
 
@@ -2456,12 +2395,12 @@ fn equijoin_explicit_syntax() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2474,12 +2413,12 @@ fn equijoin_with_condition() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id AND orders.order_id > Int64(1)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id AND orders.order_id > Int64(1)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2492,12 +2431,12 @@ fn left_equijoin_with_conditions() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Left Join:  Filter: person.id = orders.customer_id AND orders.order_id > Int64(1) AND person.age < Int64(30)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Left Join:  Filter: person.id = orders.customer_id AND orders.order_id > Int64(1) AND person.age < Int64(30)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2510,12 +2449,12 @@ fn right_equijoin_with_conditions() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Right Join:  Filter: person.id = orders.customer_id AND person.id > Int64(1) AND orders.order_id < Int64(100)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Right Join:  Filter: person.id = orders.customer_id AND person.id > Int64(1) AND orders.order_id < Int64(100)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2528,12 +2467,12 @@ fn full_equijoin_with_conditions() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Full Join:  Filter: person.id = orders.customer_id AND person.id > Int64(1) AND orders.order_id < Int64(100)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Full Join:  Filter: person.id = orders.customer_id AND person.id > Int64(1) AND orders.order_id < Int64(100)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2546,12 +2485,12 @@ fn join_with_table_name() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -2564,13 +2503,13 @@ fn join_with_using() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.first_name, person.id
-  Inner Join: Using person.id = person2.id
-    TableScan: person
-    SubqueryAlias: person2
-      TableScan: person
-"#
+        @r"
+    Projection: person.first_name, person.id
+      Inner Join: Using person.id = person2.id
+        TableScan: person
+        SubqueryAlias: person2
+          TableScan: person
+    "
     );
 }
 
@@ -2583,14 +2522,14 @@ fn equijoin_explicit_syntax_3_tables() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id, lineitem.l_description
-  Inner Join:  Filter: orders.o_item_id = lineitem.l_item_id
-    Inner Join:  Filter: person.id = orders.customer_id
-      TableScan: person
-      TableScan: orders
-    TableScan: lineitem
-"#
+        @r"
+    Projection: person.id, orders.order_id, lineitem.l_description
+      Inner Join:  Filter: orders.o_item_id = lineitem.l_item_id
+        Inner Join:  Filter: person.id = orders.customer_id
+          TableScan: person
+          TableScan: orders
+        TableScan: lineitem
+    "
     );
 }
 
@@ -2602,11 +2541,11 @@ fn boolean_literal_in_condition_expression() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id
-  Filter: orders.delivered = Boolean(false) OR orders.delivered = Boolean(true)
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id
+      Filter: orders.delivered = Boolean(false) OR orders.delivered = Boolean(true)
+        TableScan: orders
+    "
     );
 }
 
@@ -2616,14 +2555,14 @@ fn union() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Distinct:
-  Union
-    Projection: orders.order_id
-      TableScan: orders
-    Projection: orders.order_id
-      TableScan: orders
-"#
+        @r"
+    Distinct:
+      Union
+        Projection: orders.order_id
+          TableScan: orders
+        Projection: orders.order_id
+          TableScan: orders
+    "
     );
 }
 
@@ -2633,16 +2572,16 @@ fn union_by_name_different_columns() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Distinct:
-  Union
-    Projection: order_id, NULL AS Int64(1)
-      Projection: orders.order_id
-        TableScan: orders
-    Projection: order_id, Int64(1)
-      Projection: orders.order_id, Int64(1)
-        TableScan: orders
-"#
+        @r"
+    Distinct:
+      Union
+        Projection: order_id, NULL AS Int64(1)
+          Projection: orders.order_id
+            TableScan: orders
+        Projection: order_id, Int64(1)
+          Projection: orders.order_id, Int64(1)
+            TableScan: orders
+    "
     );
 }
 
@@ -2652,14 +2591,14 @@ fn union_by_name_same_column_names() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Distinct:
-  Union
-    Projection: orders.order_id
-      TableScan: orders
-    Projection: orders.order_id
-      TableScan: orders
-"#
+        @r"
+    Distinct:
+      Union
+        Projection: orders.order_id
+          TableScan: orders
+        Projection: orders.order_id
+          TableScan: orders
+    "
     );
 }
 
@@ -2669,13 +2608,13 @@ fn union_all() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Union
-  Projection: orders.order_id
-    TableScan: orders
-  Projection: orders.order_id
-    TableScan: orders
-"#
+        @r"
+    Union
+      Projection: orders.order_id
+        TableScan: orders
+      Projection: orders.order_id
+        TableScan: orders
+    "
     );
 }
 
@@ -2686,15 +2625,15 @@ fn union_all_by_name_different_columns() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Union
-  Projection: order_id, NULL AS Int64(1)
-    Projection: orders.order_id
-      TableScan: orders
-  Projection: order_id, Int64(1)
-    Projection: orders.order_id, Int64(1)
-      TableScan: orders
-"#
+        @r"
+    Union
+      Projection: order_id, NULL AS Int64(1)
+        Projection: orders.order_id
+          TableScan: orders
+      Projection: order_id, Int64(1)
+        Projection: orders.order_id, Int64(1)
+          TableScan: orders
+    "
     );
 }
 
@@ -2704,15 +2643,15 @@ fn union_all_by_name_same_column_names() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Union
-  Projection: order_id
-    Projection: orders.order_id
-      TableScan: orders
-  Projection: order_id
-    Projection: orders.order_id
-      TableScan: orders
-"#
+        @r"
+    Union
+      Projection: order_id
+        Projection: orders.order_id
+          TableScan: orders
+      Projection: order_id
+        Projection: orders.order_id
+          TableScan: orders
+    "
     );
 }
 
@@ -2722,11 +2661,11 @@ fn empty_over() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2736,11 +2675,11 @@ fn empty_over_with_alias() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid
-  WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid
+      WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2750,11 +2689,11 @@ fn empty_over_dup_with_alias() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid_dup
-  WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max_oid_dup
+      WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2764,12 +2703,12 @@ fn empty_over_dup_with_different_sort() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, max(orders.order_id) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    WindowAggr: windowExpr=[[max(orders.order_id) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id AS oid, max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, max(orders.order_id) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.order_id) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        WindowAggr: windowExpr=[[max(orders.order_id) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2779,11 +2718,11 @@ fn empty_over_plus() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty * Float64(1.1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[max(orders.qty * Float64(1.1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty * Float64(1.1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[max(orders.qty * Float64(1.1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2793,11 +2732,11 @@ fn empty_over_multiple() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, avg(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[max(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, avg(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, avg(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[max(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, avg(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2816,11 +2755,11 @@ fn over_partition_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -2842,12 +2781,12 @@ fn over_order_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2857,12 +2796,12 @@ fn over_order_by_with_window_frame_double_end() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]
-    WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]
+        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2872,12 +2811,12 @@ fn over_order_by_with_window_frame_single_end() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2887,12 +2826,12 @@ fn over_order_by_with_window_frame_single_end_groups() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2914,12 +2853,12 @@ fn over_order_by_two_sort_keys() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id + Int64(1) ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id + Int64(1) ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) ORDER BY [orders.order_id + Int64(1) ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id + Int64(1) ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -2942,13 +2881,13 @@ fn over_order_by_sort_keys_sorting() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-        TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            TableScan: orders
+    "
     );
 }
 
@@ -2969,13 +2908,13 @@ fn over_order_by_sort_keys_sorting_prefix_compacting() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-        TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            TableScan: orders
+    "
     );
 }
 
@@ -3001,14 +2940,14 @@ fn over_order_by_sort_keys_sorting_global_order_compacting() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: orders.order_id ASC NULLS LAST
-  Projection: orders.order_id, max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-    WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-      WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-        WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-          TableScan: orders
-"#
+        @r"
+    Sort: orders.order_id ASC NULLS LAST
+      Projection: orders.order_id, max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+        WindowAggr: windowExpr=[[sum(orders.qty) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+          WindowAggr: windowExpr=[[max(orders.qty) ORDER BY [orders.qty ASC NULLS LAST, orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            WindowAggr: windowExpr=[[min(orders.qty) ORDER BY [orders.order_id ASC NULLS LAST, orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+              TableScan: orders
+    "
     );
 }
 
@@ -3028,11 +2967,11 @@ fn over_partition_by_order_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        TableScan: orders
+    "
     );
 }
 
@@ -3047,16 +2986,15 @@ Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ORDE
 /// ```
 #[test]
 fn over_partition_by_order_by_no_dup() {
-    let sql =
-        "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders";
+    let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        TableScan: orders
+    "
     );
 }
 
@@ -3074,17 +3012,16 @@ Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orde
 /// ```
 #[test]
 fn over_partition_by_order_by_mix_up() {
-    let sql =
-            "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders";
+    let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) PARTITION BY [orders.qty] ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[min(orders.qty) PARTITION BY [orders.qty] ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) PARTITION BY [orders.qty] ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[min(orders.qty) PARTITION BY [orders.qty] ORDER BY [orders.order_id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -3101,17 +3038,16 @@ Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id, orde
 /// FIXME: for now we are not detecting prefix of sorting keys in order to save one sort exec phase
 #[test]
 fn over_partition_by_order_by_mix_up_prefix() {
-    let sql =
-            "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders";
+    let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.price ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-  WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    WindowAggr: windowExpr=[[min(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.price ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-      TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.price ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+      WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ORDER BY [orders.qty ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        WindowAggr: windowExpr=[[min(orders.qty) PARTITION BY [orders.order_id, orders.qty] ORDER BY [orders.price ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: orders
+    "
     );
 }
 
@@ -3122,11 +3058,11 @@ fn approx_median_window() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, approx_median(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[approx_median(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, approx_median(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[approx_median(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -3137,9 +3073,9 @@ fn select_typed_date_string() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: CAST(Utf8("2020-12-10") AS Date32) AS date
-  EmptyRelation
-"#
+    Projection: CAST(Utf8("2020-12-10") AS Date32) AS date
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -3150,9 +3086,9 @@ fn select_typed_time_string() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: CAST(Utf8("08:09:10.123") AS Time64(Nanosecond)) AS time
-  EmptyRelation
-"#
+    Projection: CAST(Utf8("08:09:10.123") AS Time64(ns)) AS time
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -3162,10 +3098,10 @@ fn select_multibyte_column() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.😀
-  TableScan: person
-"#
+        @r"
+    Projection: person.😀
+      TableScan: person
+    "
     );
 }
 
@@ -3210,11 +3146,11 @@ fn select_groupby_orderby() {
                 plan,
                 // expect that this is not an ambiguous reference
                 @r#"
-        Sort: birth_date ASC NULLS LAST
-          Projection: avg(person.age) AS value, date_trunc(Utf8("month"), person.birth_date) AS birth_date
-            Aggregate: groupBy=[[person.birth_date]], aggr=[[avg(person.age)]]
-              TableScan: person
-        "#
+            Sort: birth_date ASC NULLS LAST
+              Projection: avg(person.age) AS value, date_trunc(Utf8("month"), person.birth_date) AS birth_date
+                Aggregate: groupBy=[[person.birth_date]], aggr=[[avg(person.age)]]
+                  TableScan: person
+            "#
             );
         }
     }
@@ -3230,11 +3166,60 @@ fn select_groupby_orderby() {
     assert_snapshot!(
         plan,
         @r#"
-Sort: avg(person.age) + avg(person.age) ASC NULLS LAST
-  Projection: avg(person.age) + avg(person.age), date_trunc(Utf8("month"), person.birth_date) AS birth_date
-    Aggregate: groupBy=[[person.birth_date]], aggr=[[avg(person.age)]]
-      TableScan: person
-"#
+    Sort: avg(person.age) + avg(person.age) ASC NULLS LAST
+      Projection: avg(person.age) + avg(person.age), date_trunc(Utf8("month"), person.birth_date) AS birth_date
+        Aggregate: groupBy=[[person.birth_date]], aggr=[[avg(person.age)]]
+          TableScan: person
+    "#
+    );
+}
+
+#[test]
+fn select_groupby_orderby_aggregate_on_non_selected_column() {
+    let sql = "SELECT state FROM person GROUP BY state ORDER BY MIN(age)";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.state
+      Sort: min(person.age) ASC NULLS LAST
+        Projection: person.state, min(person.age)
+          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
+            TableScan: person
+    "
+    );
+}
+
+#[test]
+fn select_groupby_orderby_multiple_aggregates_on_non_selected_columns() {
+    let sql =
+        "SELECT state FROM person GROUP BY state ORDER BY MIN(age), MAX(salary) DESC";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.state
+      Sort: min(person.age) ASC NULLS LAST, max(person.salary) DESC NULLS FIRST
+        Projection: person.state, min(person.age), max(person.salary)
+          Aggregate: groupBy=[[person.state]], aggr=[[min(person.age), max(person.salary)]]
+            TableScan: person
+    "
+    );
+}
+
+#[test]
+fn select_groupby_orderby_aggregate_on_non_selected_column_original_issue() {
+    let sql = "SELECT id FROM person GROUP BY id ORDER BY min(age)";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.id
+      Sort: min(person.age) ASC NULLS LAST
+        Projection: person.id, min(person.age)
+          Aggregate: groupBy=[[person.id]], aggr=[[min(person.age)]]
+            TableScan: person
+    "
     );
 }
 
@@ -3297,6 +3282,7 @@ fn logical_plan_with_dialect_and_options(
         .with_aggregate_function(max_udaf())
         .with_aggregate_function(grouping_udaf())
         .with_window_function(rank_udwf())
+        .with_window_function(row_number_udwf())
         .with_expr_planner(Arc::new(CoreFunctionPlanner::default()));
 
     let context = MockContextProvider { state };
@@ -3311,7 +3297,7 @@ fn make_udf(name: &'static str, args: Vec<DataType>, return_type: DataType) -> S
 }
 
 /// Mocked UDF
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct DummyUDF {
     name: &'static str,
     signature: Signature,
@@ -3355,9 +3341,10 @@ fn parse_decimals_parser_options() -> ParserOptions {
         parse_float_as_decimal: true,
         enable_ident_normalization: false,
         support_varchar_with_length: false,
-        map_varchar_to_utf8view: true,
+        map_string_types_to_utf8view: true,
         enable_options_value_normalization: false,
         collect_spans: false,
+        default_null_ordering: NullOrdering::NullsMax,
     }
 }
 
@@ -3366,9 +3353,10 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions
         parse_float_as_decimal: true,
         enable_ident_normalization: false,
         support_varchar_with_length: false,
-        map_varchar_to_utf8view: true,
+        map_string_types_to_utf8view: true,
         enable_options_value_normalization: false,
         collect_spans: false,
+        default_null_ordering: NullOrdering::NullsMax,
     }
 }
 
@@ -3377,9 +3365,10 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions {
         parse_float_as_decimal: true,
         enable_ident_normalization: true,
         support_varchar_with_length: false,
-        map_varchar_to_utf8view: true,
+        map_string_types_to_utf8view: true,
         enable_options_value_normalization: false,
         collect_spans: false,
+        default_null_ordering: NullOrdering::NullsMax,
     }
 }
 
@@ -3389,10 +3378,10 @@ fn select_partially_qualified_column() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: public.person.first_name
-  TableScan: public.person
-"#
+        @r"
+    Projection: public.person.first_name
+      TableScan: public.person
+    "
     );
 }
 
@@ -3403,15 +3392,15 @@ fn cross_join_not_to_inner_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id
-  Filter: person.id = person.age
-    Cross Join: 
-      Cross Join: 
-        TableScan: person
-        TableScan: orders
-      TableScan: lineitem
-"#
+        @r"
+    Projection: person.id
+      Filter: person.id = person.age
+        Cross Join:
+          Cross Join:
+            TableScan: person
+            TableScan: orders
+          TableScan: lineitem
+    "
     );
 }
 
@@ -3421,14 +3410,14 @@ fn join_with_aliases() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: peeps.id, folks.first_name
-  Inner Join:  Filter: peeps.id = folks.id
-    SubqueryAlias: peeps
-      TableScan: person
-    SubqueryAlias: folks
-      TableScan: person
-"#
+        @r"
+    Projection: peeps.id, folks.first_name
+      Inner Join:  Filter: peeps.id = folks.id
+        SubqueryAlias: peeps
+          TableScan: person
+        SubqueryAlias: folks
+          TableScan: person
+    "
     );
 }
 
@@ -3439,9 +3428,9 @@ fn negative_interval_plus_interval_in_projection() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
-  EmptyRelation
-"#
+    Projection: IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -3452,9 +3441,9 @@ fn complex_interval_expression_in_projection() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -3, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
-  EmptyRelation
-"#
+    Projection: IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: -3, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -3465,9 +3454,9 @@ fn negative_sum_intervals_in_projection() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: (- IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") + (- IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 4, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 7, nanoseconds: 0 }")))
-  EmptyRelation
-"#
+    Projection: (- IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 2, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") + (- IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 4, nanoseconds: 0 }") + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 7, nanoseconds: 0 }")))
+      EmptyRelation: rows=1
+    "#
     );
 }
 
@@ -3478,9 +3467,9 @@ fn date_plus_interval_in_projection() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: test.t_date32 + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
-  TableScan: test
-"#
+    Projection: test.t_date32 + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }")
+      TableScan: test
+    "#
     );
 }
 
@@ -3494,10 +3483,10 @@ fn date_plus_interval_in_filter() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: test.t_date64
-  Filter: test.t_date64 BETWEEN CAST(Utf8("1999-12-31") AS Date32) AND CAST(Utf8("1999-12-31") AS Date32) + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 30, nanoseconds: 0 }")
-    TableScan: test
-"#
+    Projection: test.t_date64
+      Filter: test.t_date64 BETWEEN CAST(Utf8("1999-12-31") AS Date32) AND CAST(Utf8("1999-12-31") AS Date32) + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 30, nanoseconds: 0 }")
+        TableScan: test
+    "#
     );
 }
 
@@ -3510,16 +3499,16 @@ fn exists_subquery() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: p.id
-  Filter: EXISTS (<subquery>)
-    Subquery:
-      Projection: person.first_name
-        Filter: person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)
+        @r"
+    Projection: p.id
+      Filter: EXISTS (<subquery>)
+        Subquery:
+          Projection: person.first_name
+            Filter: person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)
+              TableScan: person
+        SubqueryAlias: p
           TableScan: person
-    SubqueryAlias: p
-      TableScan: person
-"#
+    "
     );
 }
 
@@ -3535,21 +3524,21 @@ fn exists_subquery_schema_outer_schema_overlap() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id
-  Filter: person.id = p.id AND EXISTS (<subquery>)
-    Subquery:
-      Projection: person.first_name
-        Filter: person.id = p2.id AND person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)
-          Cross Join: 
+        @r"
+    Projection: person.id
+      Filter: person.id = p.id AND EXISTS (<subquery>)
+        Subquery:
+          Projection: person.first_name
+            Filter: person.id = p2.id AND person.last_name = outer_ref(p.last_name) AND person.state = outer_ref(p.state)
+              Cross Join:
+                TableScan: person
+                SubqueryAlias: p2
+                  TableScan: person
+        Cross Join:
+          TableScan: person
+          SubqueryAlias: p
             TableScan: person
-            SubqueryAlias: p2
-              TableScan: person
-    Cross Join: 
-      TableScan: person
-      SubqueryAlias: p
-        TableScan: person
-"#
+    "
     );
 }
 
@@ -3560,15 +3549,15 @@ fn in_subquery_uncorrelated() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: p.id
-  Filter: p.id IN (<subquery>)
-    Subquery:
-      Projection: person.id
-        TableScan: person
-    SubqueryAlias: p
-      TableScan: person
-"#
+        @r"
+    Projection: p.id
+      Filter: p.id IN (<subquery>)
+        Subquery:
+          Projection: person.id
+            TableScan: person
+        SubqueryAlias: p
+          TableScan: person
+    "
     );
 }
 
@@ -3580,35 +3569,34 @@ fn not_in_subquery_correlated() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: p.id
-  Filter: p.id NOT IN (<subquery>)
-    Subquery:
-      Projection: person.id
-        Filter: person.last_name = outer_ref(p.last_name) AND person.state = Utf8("CO")
+    Projection: p.id
+      Filter: p.id NOT IN (<subquery>)
+        Subquery:
+          Projection: person.id
+            Filter: person.last_name = outer_ref(p.last_name) AND person.state = Utf8("CO")
+              TableScan: person
+        SubqueryAlias: p
           TableScan: person
-    SubqueryAlias: p
-      TableScan: person
-"#
+    "#
     );
 }
 
 #[test]
 fn scalar_subquery() {
-    let sql =
-        "SELECT p.id, (SELECT MAX(id) FROM person WHERE last_name = p.last_name) FROM person p";
+    let sql = "SELECT p.id, (SELECT MAX(id) FROM person WHERE last_name = p.last_name) FROM person p";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: p.id, (<subquery>)
-  Subquery:
-    Projection: max(person.id)
-      Aggregate: groupBy=[[]], aggr=[[max(person.id)]]
-        Filter: person.last_name = outer_ref(p.last_name)
-          TableScan: person
-  SubqueryAlias: p
-    TableScan: person
-"#
+        @r"
+    Projection: p.id, (<subquery>)
+      Subquery:
+        Projection: max(person.id)
+          Aggregate: groupBy=[[]], aggr=[[max(person.id)]]
+            Filter: person.last_name = outer_ref(p.last_name)
+              TableScan: person
+      SubqueryAlias: p
+        TableScan: person
+    "
     );
 }
 
@@ -3624,20 +3612,20 @@ fn scalar_subquery_reference_outer_field() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: j1.j1_string, j2.j2_string
-  Filter: j1.j1_id = j2.j2_id - Int64(1) AND j2.j2_id < (<subquery>)
-    Subquery:
-      Projection: count(*)
-        Aggregate: groupBy=[[]], aggr=[[count(*)]]
-          Filter: outer_ref(j2.j2_id) = j1.j1_id AND j1.j1_id = j3.j3_id
-            Cross Join: 
-              TableScan: j1
-              TableScan: j3
-    Cross Join: 
-      TableScan: j1
-      TableScan: j2
-"#
+        @r"
+    Projection: j1.j1_string, j2.j2_string
+      Filter: j1.j1_id = j2.j2_id - Int64(1) AND j2.j2_id < (<subquery>)
+        Subquery:
+          Projection: count(*)
+            Aggregate: groupBy=[[]], aggr=[[count(*)]]
+              Filter: outer_ref(j2.j2_id) = j1.j1_id AND j1.j1_id = j3.j3_id
+                Cross Join:
+                  TableScan: j1
+                  TableScan: j3
+        Cross Join:
+          TableScan: j1
+          TableScan: j2
+    "
     );
 }
 
@@ -3648,11 +3636,11 @@ fn aggregate_with_rollup() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.state, person.age, count(*)
-  Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.state, person.age))]], aggr=[[count(*)]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.state, person.age, count(*)
+      Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.state, person.age))]], aggr=[[count(*)]]
+        TableScan: person
+    "
     );
 }
 
@@ -3663,11 +3651,11 @@ fn aggregate_with_rollup_with_grouping() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.state, person.age, grouping(person.state), grouping(person.age), grouping(person.state) + grouping(person.age), count(*)
-  Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.state, person.age))]], aggr=[[grouping(person.state), grouping(person.age), count(*)]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.state, person.age, grouping(person.state), grouping(person.age), grouping(person.state) + grouping(person.age), count(*)
+      Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.state, person.age))]], aggr=[[grouping(person.state), grouping(person.age), count(*)]]
+        TableScan: person
+    "
     );
 }
 
@@ -3689,12 +3677,12 @@ fn rank_partition_grouping() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: sum(person.age) AS total_sum, person.state, person.last_name, grouping(person.state) + grouping(person.last_name) AS x, rank() PARTITION BY [grouping(person.state) + grouping(person.last_name), CASE WHEN grouping(person.last_name) = Int64(0) THEN person.state END] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS the_rank
-  WindowAggr: windowExpr=[[rank() PARTITION BY [grouping(person.state) + grouping(person.last_name), CASE WHEN grouping(person.last_name) = Int64(0) THEN person.state END] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-    Aggregate: groupBy=[[ROLLUP (person.state, person.last_name)]], aggr=[[sum(person.age), grouping(person.state), grouping(person.last_name)]]
-      TableScan: person
-"#
+        @r"
+    Projection: sum(person.age) AS total_sum, person.state, person.last_name, grouping(person.state) + grouping(person.last_name) AS x, rank() PARTITION BY [grouping(person.state) + grouping(person.last_name), CASE WHEN grouping(person.last_name) = Int64(0) THEN person.state END] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS the_rank
+      WindowAggr: windowExpr=[[rank() PARTITION BY [grouping(person.state) + grouping(person.last_name), CASE WHEN grouping(person.last_name) = Int64(0) THEN person.state END] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+        Aggregate: groupBy=[[ROLLUP (person.state, person.last_name)]], aggr=[[sum(person.age), grouping(person.state), grouping(person.last_name)]]
+          TableScan: person
+    "
     );
 }
 
@@ -3705,11 +3693,11 @@ fn aggregate_with_cube() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.state, person.age, count(*)
-  Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.age), (person.id, person.state, person.age))]], aggr=[[count(*)]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.state, person.age, count(*)
+      Aggregate: groupBy=[[GROUPING SETS ((person.id), (person.id, person.state), (person.id, person.age), (person.id, person.state, person.age))]], aggr=[[count(*)]]
+        TableScan: person
+    "
     );
 }
 
@@ -3719,10 +3707,10 @@ fn round_decimal() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: round(test_decimal.price / Int64(3), Int64(2))
-  TableScan: test_decimal
-"#
+        @r"
+    Projection: round(test_decimal.price / Int64(3), Int64(2))
+      TableScan: test_decimal
+    "
     );
 }
 
@@ -3732,11 +3720,11 @@ fn aggregate_with_grouping_sets() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.state, person.age, count(*)
-  Aggregate: groupBy=[[GROUPING SETS ((person.id, person.state), (person.id, person.state, person.age), (person.id, person.id, person.state))]], aggr=[[count(*)]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.state, person.age, count(*)
+      Aggregate: groupBy=[[GROUPING SETS ((person.id, person.state), (person.id, person.state, person.age), (person.id, person.id, person.state))]], aggr=[[count(*)]]
+        TableScan: person
+    "
     );
 }
 
@@ -3748,12 +3736,12 @@ fn join_on_disjunction_condition() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id OR person.age > Int64(30)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id OR person.age > Int64(30)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -3766,11 +3754,11 @@ fn join_on_complex_condition() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id AND (person.age > Int64(30) OR person.last_name = Utf8("X"))
-    TableScan: person
-    TableScan: orders
-"#
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id AND (person.age > Int64(30) OR person.last_name = Utf8("X"))
+        TableScan: person
+        TableScan: orders
+    "#
     );
 }
 
@@ -3782,11 +3770,11 @@ fn hive_aggregate_with_filter() -> Result<()> {
 
     assert_snapshot!(
         plan,
-        @r##"
-        Projection: sum(person.age) FILTER (WHERE person.age > Int64(4))
-          Aggregate: groupBy=[[]], aggr=[[sum(person.age) FILTER (WHERE person.age > Int64(4))]]
-            TableScan: person
-        "##
+        @r"
+    Projection: sum(person.age) FILTER (WHERE person.age > Int64(4))
+      Aggregate: groupBy=[[]], aggr=[[sum(person.age) FILTER (WHERE person.age > Int64(4))]]
+        TableScan: person
+    "
     );
 
     Ok(())
@@ -3802,14 +3790,13 @@ fn order_by_unaliased_name() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: z, q
-  Sort: p.state ASC NULLS LAST
-    Projection: p.state AS z, sum(p.age) AS q, p.state
-      Aggregate: groupBy=[[p.state]], aggr=[[sum(p.age)]]
-        SubqueryAlias: p
-          TableScan: person
-"#
+        @r"
+    Sort: z ASC NULLS LAST
+      Projection: p.state AS z, sum(p.age) AS q
+        Aggregate: groupBy=[[p.state]], aggr=[[sum(p.age)]]
+          SubqueryAlias: p
+            TableScan: person
+    "
     );
 }
 
@@ -3820,9 +3807,7 @@ fn order_by_ambiguous_name() {
 
     assert_snapshot!(
         err,
-        @r###"
-        Schema error: Ambiguous reference to unqualified field age
-        "###
+        @"Schema error: Ambiguous reference to unqualified field age"
     );
 }
 
@@ -3833,9 +3818,7 @@ fn group_by_ambiguous_name() {
 
     assert_snapshot!(
         err,
-        @r###"
-        Schema error: Ambiguous reference to unqualified field age
-        "###
+        @"Schema error: Ambiguous reference to unqualified field age"
     );
 }
 
@@ -3845,24 +3828,24 @@ fn test_zero_offset_with_limit() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Limit: skip=0, fetch=5
-  Projection: person.id
-    Filter: person.id > Int64(100)
-      TableScan: person
-"#
+        @r"
+    Limit: skip=0, fetch=5
+      Projection: person.id
+        Filter: person.id > Int64(100)
+          TableScan: person
+    "
     );
     // Flip the order of LIMIT and OFFSET in the query. Plan should remain the same.
     let sql = "SELECT id FROM person WHERE person.id > 100 OFFSET 0 LIMIT 5;";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Limit: skip=0, fetch=5
-  Projection: person.id
-    Filter: person.id > Int64(100)
-      TableScan: person
-"#
+        @r"
+    Limit: skip=0, fetch=5
+      Projection: person.id
+        Filter: person.id > Int64(100)
+          TableScan: person
+    "
     );
 }
 
@@ -3872,12 +3855,12 @@ fn test_offset_no_limit() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Limit: skip=5, fetch=None
-  Projection: person.id
-    Filter: person.id > Int64(100)
-      TableScan: person
-"#
+        @r"
+    Limit: skip=5, fetch=None
+      Projection: person.id
+        Filter: person.id > Int64(100)
+          TableScan: person
+    "
     );
 }
 
@@ -3887,27 +3870,34 @@ fn test_offset_after_limit() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Limit: skip=3, fetch=5
-  Projection: person.id
-    Filter: person.id > Int64(100)
-      TableScan: person
-"#
+        @r"
+    Limit: skip=3, fetch=5
+      Projection: person.id
+        Filter: person.id > Int64(100)
+          TableScan: person
+    "
     );
 }
 
+#[test]
+fn fetch_clause_is_not_supported() {
+    let sql = "SELECT 1 FETCH NEXT 1 ROW ONLY";
+    let err = logical_plan(sql).unwrap_err();
+    assert_contains!(err.to_string(), "FETCH clause is not supported yet");
+}
+
 #[test]
 fn test_offset_before_limit() {
     let sql = "select id from person where person.id > 100 OFFSET 3 LIMIT 5;";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Limit: skip=3, fetch=5
-  Projection: person.id
-    Filter: person.id > Int64(100)
-      TableScan: person
-"#
+        @r"
+    Limit: skip=3, fetch=5
+      Projection: person.id
+        Filter: person.id > Int64(100)
+          TableScan: person
+    "
     );
 }
 
@@ -3917,11 +3907,11 @@ fn test_distribute_by() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Repartition: DistributeBy(person.state)
-  Projection: person.id
-    TableScan: person
-"#
+        @r"
+    Repartition: DistributeBy(person.state)
+      Projection: person.id
+        TableScan: person
+    "
     );
 }
 
@@ -3953,12 +3943,12 @@ fn test_constant_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -3971,12 +3961,12 @@ fn test_right_left_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: orders.customer_id * Int64(2) = person.id + Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: orders.customer_id * Int64(2) = person.id + Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -3989,12 +3979,12 @@ fn test_single_column_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id + Int64(10) = orders.customer_id * Int64(2)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id + Int64(10) = orders.customer_id * Int64(2)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4007,12 +3997,12 @@ fn test_multiple_column_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id + person.age + Int64(10) = orders.customer_id * Int64(2) - orders.price
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id + person.age + Int64(10) = orders.customer_id * Int64(2) - orders.price
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4025,12 +4015,12 @@ fn test_left_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id + person.age + Int64(10) = orders.customer_id
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id + person.age + Int64(10) = orders.customer_id
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4043,12 +4033,12 @@ fn test_right_expr_eq_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Inner Join:  Filter: person.id = orders.customer_id * Int64(2) - orders.price
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Inner Join:  Filter: person.id = orders.customer_id * Int64(2) - orders.price
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4061,12 +4051,12 @@ fn test_noneq_with_filter_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.first_name
-  Inner Join:  Filter: person.age > Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.first_name
+      Inner Join:  Filter: person.age > Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
     // left join
     let sql = "SELECT person.id, person.first_name \
@@ -4075,12 +4065,12 @@ Projection: person.id, person.first_name
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.first_name
-  Left Join:  Filter: person.age > Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.first_name
+      Left Join:  Filter: person.age > Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
     // right join
     let sql = "SELECT person.id, person.first_name \
@@ -4089,12 +4079,12 @@ Projection: person.id, person.first_name
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.first_name
-  Right Join:  Filter: person.age > Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.first_name
+      Right Join:  Filter: person.age > Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
     // full join
     let sql = "SELECT person.id, person.first_name \
@@ -4103,12 +4093,12 @@ Projection: person.id, person.first_name
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.first_name
-  Full Join:  Filter: person.age > Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.first_name
+      Full Join:  Filter: person.age > Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4123,12 +4113,12 @@ fn test_one_side_constant_full_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, orders.order_id
-  Full Join:  Filter: person.id = Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, orders.order_id
+      Full Join:  Filter: person.id = Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4141,12 +4131,12 @@ fn test_select_join_key_inner_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.customer_id * Int64(2), person.id + Int64(10)
-  Inner Join:  Filter: orders.customer_id * Int64(2) = person.id + Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.customer_id * Int64(2), person.id + Int64(10)
+      Inner Join:  Filter: orders.customer_id * Int64(2) = person.id + Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4157,11 +4147,11 @@ fn test_select_order_by() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: Utf8("1")
-  Sort: person.id ASC NULLS LAST
-    Projection: Utf8("1"), person.id
-      TableScan: person
-"#
+    Projection: Utf8("1")
+      Sort: person.id ASC NULLS LAST
+        Projection: Utf8("1"), person.id
+          TableScan: person
+    "#
     );
 }
 
@@ -4176,9 +4166,109 @@ fn test_select_distinct_order_by() {
 
     assert_snapshot!(
         err,
-        @r###"
-        Error during planning: For SELECT DISTINCT, ORDER BY expressions person.id must appear in select list
-        "###
+        @"Error during planning: For SELECT DISTINCT, ORDER BY expressions person.id must appear in select list"
+    );
+}
+
+#[test]
+fn test_select_qualify_basic() {
+    let sql = "SELECT person.id, ROW_NUMBER() OVER (PARTITION BY person.age ORDER BY person.id) as rn FROM person QUALIFY rn = 1";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.id, row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+      Filter: row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = Int64(1)
+        WindowAggr: windowExpr=[[row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: person
+    "
+    );
+}
+
+#[test]
+fn test_select_qualify_aggregate_reference() {
+    let sql = "
+        SELECT
+            person.id,
+            ROW_NUMBER() OVER (PARTITION BY person.id ORDER BY person.id) as rn
+        FROM person
+        GROUP BY
+            person.id
+        QUALIFY rn = 1 AND SUM(person.age) > 0";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.id, row_number() PARTITION BY [person.id] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+      Filter: row_number() PARTITION BY [person.id] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = Int64(1) AND sum(person.age) > Int64(0)
+        WindowAggr: windowExpr=[[row_number() PARTITION BY [person.id] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          Aggregate: groupBy=[[person.id]], aggr=[[sum(person.age)]]
+            TableScan: person
+    "
+    );
+}
+
+#[test]
+fn test_select_qualify_aggregate_reference_within_window_function() {
+    let sql = "
+        SELECT
+            person.id
+        FROM person
+        GROUP BY
+            person.id
+        QUALIFY ROW_NUMBER() OVER (PARTITION BY person.id ORDER BY SUM(person.age) DESC) = 1";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.id
+      Filter: row_number() PARTITION BY [person.id] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = Int64(1)
+        WindowAggr: windowExpr=[[row_number() PARTITION BY [person.id] ORDER BY [sum(person.age) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          Aggregate: groupBy=[[person.id]], aggr=[[sum(person.age)]]
+            TableScan: person
+    "
+    );
+}
+
+#[test]
+fn test_select_qualify_aggregate_invalid_column_reference() {
+    let sql = "
+        SELECT
+            person.id
+        FROM person
+        GROUP BY
+            person.id
+        QUALIFY ROW_NUMBER() OVER (PARTITION BY person.id ORDER BY person.age DESC) = 1";
+    let err = logical_plan(sql).unwrap_err();
+    assert_snapshot!(
+        err.strip_backtrace(),
+        @r#"Error during planning: Column in QUALIFY must be in GROUP BY or an aggregate function: While expanding wildcard, column "person.age" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "person.id" appears in the SELECT clause satisfies this requirement"#
+    );
+}
+
+#[test]
+fn test_select_qualify_without_window_function() {
+    let sql = "SELECT person.id FROM person QUALIFY person.id > 1";
+    let err = logical_plan(sql).unwrap_err();
+    assert_eq!(
+        err.strip_backtrace(),
+        "Error during planning: QUALIFY clause requires window functions in the SELECT list or QUALIFY clause"
+    );
+}
+
+#[test]
+fn test_select_qualify_complex_condition() {
+    let sql = "SELECT person.id, person.age, ROW_NUMBER() OVER (PARTITION BY person.age ORDER BY person.id) as rn, RANK() OVER (ORDER BY person.salary) as rank FROM person QUALIFY rn <= 2 AND rank <= 5";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r"
+    Projection: person.id, person.age, row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn, rank() ORDER BY [person.salary ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rank
+      Filter: row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(2) AND rank() ORDER BY [person.salary ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(5)
+        WindowAggr: windowExpr=[[rank() ORDER BY [person.salary ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          WindowAggr: windowExpr=[[row_number() PARTITION BY [person.age] ORDER BY [person.id ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            TableScan: person
+    "
     );
 }
 
@@ -4191,10 +4281,6 @@ fn test_select_distinct_order_by() {
     "SELECT id, number FROM person LATERAL VIEW explode(numbers) exploded_table AS number",
     "This feature is not implemented: LATERAL VIEWS"
 )]
-#[case::select_qualify_unsupported(
-    "SELECT i, p, o FROM person QUALIFY ROW_NUMBER() OVER (PARTITION BY p ORDER BY o) = 1",
-    "This feature is not implemented: QUALIFY"
-)]
 #[case::select_top_unsupported(
     "SELECT TOP (5) * FROM person",
     "This feature is not implemented: TOP"
@@ -4211,17 +4297,16 @@ fn test_select_unsupported_syntax_errors(#[case] sql: &str, #[case] error: &str)
 
 #[test]
 fn select_order_by_with_cast() {
-    let sql =
-        "SELECT first_name AS first_name FROM (SELECT first_name AS first_name FROM person) ORDER BY CAST(first_name as INT)";
+    let sql = "SELECT first_name AS first_name FROM (SELECT first_name AS first_name FROM person) ORDER BY CAST(first_name as INT)";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Sort: CAST(person.first_name AS Int32) ASC NULLS LAST
-  Projection: person.first_name
-    Projection: person.first_name
-      TableScan: person
-"#
+        @r"
+    Sort: CAST(person.first_name AS Int32) ASC NULLS LAST
+      Projection: person.first_name
+        Projection: person.first_name
+          TableScan: person
+    "
     );
 }
 
@@ -4246,12 +4331,12 @@ fn test_duplicated_left_join_key_inner_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.age
-  Inner Join:  Filter: person.id * Int64(2) = orders.customer_id + Int64(10) AND person.id * Int64(2) = orders.order_id
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.age
+      Inner Join:  Filter: person.id * Int64(2) = orders.customer_id + Int64(10) AND person.id * Int64(2) = orders.order_id
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4265,12 +4350,12 @@ fn test_duplicated_right_join_key_inner_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.age
-  Inner Join:  Filter: person.id * Int64(2) = orders.customer_id + Int64(10) AND person.id = orders.customer_id + Int64(10)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.age
+      Inner Join:  Filter: person.id * Int64(2) = orders.customer_id + Int64(10) AND person.id = orders.customer_id + Int64(10)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4288,9 +4373,7 @@ fn test_ambiguous_column_references_in_on_join() {
 
     assert_snapshot!(
         err,
-        @r###"
-        Schema error: Ambiguous reference to unqualified field id
-        "###
+        @"Schema error: Ambiguous reference to unqualified field id"
     );
 }
 
@@ -4303,14 +4386,14 @@ fn test_ambiguous_column_references_with_in_using_join() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: p1.id, p1.age, p2.id
-  Inner Join: Using p1.id = p2.id
-    SubqueryAlias: p1
-      TableScan: person
-    SubqueryAlias: p2
-      TableScan: person
-"#
+        @r"
+    Projection: p1.id, p1.age, p2.id
+      Inner Join: Using p1.id = p2.id
+        SubqueryAlias: p1
+          TableScan: person
+        SubqueryAlias: p2
+          TableScan: person
+    "
     );
 }
 
@@ -4323,12 +4406,12 @@ fn test_inner_join_with_cast_key() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.age
-  Inner Join:  Filter: CAST(person.id AS Int32) = CAST(orders.customer_id AS Int32)
-    TableScan: person
-    TableScan: orders
-"#
+        @r"
+    Projection: person.id, person.age
+      Inner Join:  Filter: CAST(person.id AS Int32) = CAST(orders.customer_id AS Int32)
+        TableScan: person
+        TableScan: orders
+    "
     );
 }
 
@@ -4342,11 +4425,11 @@ fn test_multi_grouping_sets() {
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.age
-  Aggregate: groupBy=[[GROUPING SETS ((person.id, person.age, person.salary), (person.id, person.age))]], aggr=[[]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.age
+      Aggregate: groupBy=[[GROUPING SETS ((person.id, person.age, person.salary), (person.id, person.age))]], aggr=[[]]
+        TableScan: person
+    "
     );
     let sql = "SELECT person.id, person.age
             FROM person
@@ -4357,11 +4440,11 @@ Projection: person.id, person.age
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: person.id, person.age
-  Aggregate: groupBy=[[GROUPING SETS ((person.id, person.age, person.salary), (person.id, person.age, person.salary, person.state), (person.id, person.age, person.salary, person.state, person.birth_date), (person.id, person.age), (person.id, person.age, person.state), (person.id, person.age, person.state, person.birth_date))]], aggr=[[]]
-    TableScan: person
-"#
+        @r"
+    Projection: person.id, person.age
+      Aggregate: groupBy=[[GROUPING SETS ((person.id, person.age, person.salary), (person.id, person.age, person.salary, person.state), (person.id, person.age, person.salary, person.state, person.birth_date), (person.id, person.age), (person.id, person.age, person.state), (person.id, person.age, person.state, person.birth_date))]], aggr=[[]]
+        TableScan: person
+    "
     );
 }
 
@@ -4374,9 +4457,7 @@ fn test_field_not_found_window_function() {
 
     assert_snapshot!(
         order_by_err,
-        @r###"
-        Schema error: No field named a.
-        "###
+        @"Schema error: No field named a."
     );
 
     let partition_by_sql = "SELECT count() OVER (PARTITION BY a);";
@@ -4386,20 +4467,18 @@ fn test_field_not_found_window_function() {
 
     assert_snapshot!(
         partition_by_err,
-        @r###"
-        Schema error: No field named a.
-        "###
+        @"Schema error: No field named a."
     );
 
     let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY orders.order_id) from orders";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @r#"
-Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
-  WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: orders
-"#
+        @r"
+    Projection: orders.order_id, max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+      WindowAggr: windowExpr=[[max(orders.qty) PARTITION BY [orders.order_id] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+        TableScan: orders
+    "
     );
 }
 
@@ -4411,7 +4490,7 @@ fn test_parse_escaped_string_literal_value() {
         plan,
         @r#"
     Projection: character_length(Utf8("\r\n")) AS len
-      EmptyRelation
+      EmptyRelation: rows=1
     "#
     );
     let sql = "SELECT character_length(E'\r\n') AS len";
@@ -4419,26 +4498,64 @@ fn test_parse_escaped_string_literal_value() {
     assert_snapshot!(
         plan,
         @r#"
-Projection: character_length(Utf8("
-")) AS len
-  EmptyRelation
-"#
+    Projection: character_length(Utf8("
+    ")) AS len
+      EmptyRelation: rows=1
+    "#
     );
     let sql =
         r"SELECT character_length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode";
     let plan = logical_plan(sql).unwrap();
     assert_snapshot!(
         plan,
-        @"Projection: character_length(Utf8(\"%\")) AS len, Utf8(\"K\") AS hex, Utf8(\"\u{1}\") AS unicode\n  EmptyRelation"
+        @r#"
+    Projection: character_length(Utf8("%")) AS len, Utf8("K") AS hex, Utf8("") AS unicode
+      EmptyRelation: rows=1
+    "#
     );
 
     let sql = r"SELECT character_length(E'\000') AS len";
 
     assert_snapshot!(
         logical_plan(sql).unwrap_err(),
-        @r###"
-        SQL error: TokenizerError("Unterminated encoded string literal at Line: 1, Column: 25")
-        "###
+        @r#"SQL error: TokenizerError("Unterminated encoded string literal at Line: 1, Column: 25")"#
+    );
+}
+
+#[test]
+fn test_parse_quoted_column_name_with_at_sign() {
+    let sql = r"SELECT `@column` FROM `@quoted_identifier_names_table`";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: @quoted_identifier_names_table.@column
+      TableScan: @quoted_identifier_names_table
+    "#
+    );
+
+    let sql = r"SELECT `@quoted_identifier_names_table`.`@column` FROM `@quoted_identifier_names_table`";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: @quoted_identifier_names_table.@column
+      TableScan: @quoted_identifier_names_table
+    "#
+    );
+}
+
+#[test]
+fn test_variable_identifier() {
+    let sql = r"SELECT t_date32 FROM test WHERE t_date32 = @variable";
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: test.t_date32
+      Filter: test.t_date32 = @variable
+        TableScan: test
+    "#
     );
 }
 
@@ -4484,7 +4601,7 @@ fn assert_field_not_found(mut err: DataFusionError, name: &str) {
         }
     };
     match err {
-        DataFusionError::SchemaError { .. } => {
+        DataFusionError::SchemaError(_, _) => {
             let msg = format!("{err}");
             let expected = format!("Schema error: No field named {name}.");
             if !msg.starts_with(&expected) {
@@ -4522,6 +4639,30 @@ fn test_no_functions_registered() {
     );
 }
 
+#[test]
+fn test_no_substring_registered() {
+    // substring requires an expression planner
+    let sql = "SELECT SUBSTRING(foo, bar, baz) FROM person";
+    let err = logical_plan(sql).expect_err("query should have failed");
+
+    assert_snapshot!(
+        err.strip_backtrace(),
+        @"This feature is not implemented: Substring could not be planned by registered expr planner. Hint: Please try with `unicode_expressions` DataFusion feature enabled"
+    );
+}
+
+#[test]
+fn test_no_substring_registered_alt_syntax() {
+    // Alternate syntax for substring
+    let sql = "SELECT SUBSTRING(foo FROM bar) FROM person";
+    let err = logical_plan(sql).expect_err("query should have failed");
+
+    assert_snapshot!(
+        err.strip_backtrace(),
+        @"This feature is not implemented: Substring could not be planned by registered expr planner. Hint: Please try with `unicode_expressions` DataFusion feature enabled"
+    );
+}
+
 #[test]
 fn test_custom_type_plan() -> Result<()> {
     let sql = "SELECT DATETIME '2001-01-01 18:00:00'";
@@ -4537,7 +4678,7 @@ fn test_custom_type_plan() -> Result<()> {
     let err = planner.statement_to_plan(ast.pop_front().unwrap());
     assert_contains!(
         err.unwrap_err().to_string(),
-        "This feature is not implemented: Unsupported SQL type Datetime(None)"
+        "This feature is not implemented: Unsupported SQL type DATETIME"
     );
 
     fn plan_sql(sql: &str) -> LogicalPlan {
@@ -4558,20 +4699,20 @@ fn test_custom_type_plan() -> Result<()> {
 
     assert_snapshot!(
         plan,
-        @r###"
-        Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None))
-          EmptyRelation
-        "###
+        @r#"
+    Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns))
+      EmptyRelation: rows=1
+    "#
     );
 
     let plan = plan_sql("SELECT CAST(TIMESTAMP '2001-01-01 18:00:00' AS DATETIME)");
 
     assert_snapshot!(
         plan,
-        @r###"
-        Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)) AS Timestamp(Nanosecond, None))
-          EmptyRelation
-        "###
+        @r#"
+    Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)) AS Timestamp(ns))
+      EmptyRelation: rows=1
+    "#
     );
 
     let plan = plan_sql(
@@ -4580,65 +4721,52 @@ fn test_custom_type_plan() -> Result<()> {
 
     assert_snapshot!(
         plan,
-        @r###"
-        Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(Nanosecond, None)))
-          EmptyRelation
-        "###
+        @r#"
+    Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(ns)))
+      EmptyRelation: rows=1
+    "#
     );
 
+    let plan = plan_sql("SELECT UUID '00010203-0405-0607-0809-000102030506'");
+    assert_snapshot!(
+        plan,
+        @r#"
+    Projection: CAST(Utf8("00010203-0405-0607-0809-000102030506") AS FixedSizeBinary(16)<{"ARROW:extension:name": "arrow.uuid"}>)
+      EmptyRelation: rows=1
+    "#
+    );
     Ok(())
 }
 
-fn error_message_test(sql: &str, err_msg_starts_with: &str) {
+fn error_message(sql: &str) -> String {
     let err = logical_plan(sql).expect_err("query should have failed");
-    assert!(
-        err.strip_backtrace().starts_with(err_msg_starts_with),
-        "Expected error to start with '{}', but got: '{}'",
-        err_msg_starts_with,
-        err.strip_backtrace(),
-    );
+    err.strip_backtrace()
 }
 
 #[test]
 fn test_error_message_invalid_scalar_function_signature() {
-    error_message_test(
-        "select sqrt()",
-        "Error during planning: 'sqrt' does not support zero arguments",
-    );
-    error_message_test(
-        "select sqrt(1, 2)",
-        "Error during planning: Failed to coerce arguments",
+    assert!(
+        error_message("select sqrt()").starts_with(
+            r"Error during planning: 'sqrt' does not support zero arguments"
+        )
     );
+    assert!(error_message("select sqrt(1, 2)").starts_with(r"Error during planning: Failed to coerce arguments to satisfy a call to 'sqrt' function: coercion from Int64, Int64 to the signature Exact(Int64) failed"));
 }
 
 #[test]
 fn test_error_message_invalid_aggregate_function_signature() {
-    error_message_test(
-        "select sum()",
-        "Error during planning: Execution error: Function 'sum' user-defined coercion failed with \"Execution error: sum function requires 1 argument, got 0\"",
-    );
-    // We keep two different prefixes because they clarify each other.
-    // It might be incorrect, and we should consider keeping only one.
-    error_message_test(
-        "select max(9, 3)",
-        "Error during planning: Execution error: Function 'max' user-defined coercion failed",
-    );
+    assert!(error_message("select sum()").starts_with(r"Error during planning: Execution error: Function 'sum' user-defined coercion failed with: Execution error: sum function requires 1 argument, got 0"));
+    assert!(error_message("select max(9, 3)").starts_with(r"Error during planning: Execution error: Function 'max' user-defined coercion failed with: Execution error: min/max was called with 2 arguments. It requires only 1"));
 }
 
 #[test]
 fn test_error_message_invalid_window_function_signature() {
-    error_message_test(
-        "select rank(1) over()",
-        "Error during planning: The function 'rank' expected zero argument but received 1",
-    );
+    assert!(error_message("select rank(1) over()").starts_with(r"Error during planning: The function 'rank' expected zero argument but received 1"));
 }
 
 #[test]
 fn test_error_message_invalid_window_aggregate_function_signature() {
-    error_message_test(
-        "select sum() over()",
-        "Error during planning: Execution error: Function 'sum' user-defined coercion failed with \"Execution error: sum function requires 1 argument, got 0\"",
-    );
+    assert!(error_message("select sum() over()").starts_with(r"Error during planning: Execution error: Function 'sum' user-defined coercion failed with: Execution error: sum function requires 1 argument, got 0"));
 }
 
 // Test issue: https://github.com/apache/datafusion/issues/14058
@@ -4660,7 +4788,11 @@ fn test_using_join_wildcard_schema() {
     // Only columns from one join side should be present
     let expected_fields = vec![
         "o1.order_id".to_string(),
+        "o1.o_orderkey".to_string(),
+        "o1.o_custkey".to_string(),
+        "o1.o_orderstatus".to_string(),
         "o1.customer_id".to_string(),
+        "o1.o_totalprice".to_string(),
         "o1.o_item_id".to_string(),
         "o1.qty".to_string(),
         "o1.price".to_string(),
@@ -4714,3 +4846,70 @@ fn test_using_join_wildcard_schema() {
         ]
     );
 }
+
+#[test]
+fn test_2_nested_lateral_join_with_the_deepest_join_referencing_the_outer_most_relation()
+{
+    let sql = "SELECT * FROM j1 j1_outer, LATERAL (
+    SELECT * FROM j1 j1_inner, LATERAL (
+        SELECT * FROM j2 WHERE j1_inner.j1_id = j2_id and j1_outer.j1_id=j2_id
+    ) as j2
+) as j2";
+
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+         plan,
+         @r#"
+Projection: j1_outer.j1_id, j1_outer.j1_string, j2.j1_id, j2.j1_string, j2.j2_id, j2.j2_string
+  Cross Join:
+    SubqueryAlias: j1_outer
+      TableScan: j1
+    SubqueryAlias: j2
+      Subquery:
+        Projection: j1_inner.j1_id, j1_inner.j1_string, j2.j2_id, j2.j2_string
+          Cross Join:
+            SubqueryAlias: j1_inner
+              TableScan: j1
+            SubqueryAlias: j2
+              Subquery:
+                Projection: j2.j2_id, j2.j2_string
+                  Filter: outer_ref(j1_inner.j1_id) = j2.j2_id AND outer_ref(j1_outer.j1_id) = j2.j2_id
+                    TableScan: j2
+"#
+    );
+}
+
+#[test]
+fn test_correlated_recursive_scalar_subquery_with_level_3_scalar_subquery_referencing_level1_relation()
+ {
+    let sql = "select c_custkey from customer
+            where c_acctbal < (
+            select sum(o_totalprice) from orders
+            where o_custkey = c_custkey
+            and o_totalprice < (
+            select sum(l_extendedprice) as price from lineitem where l_orderkey = o_orderkey
+            and l_extendedprice < c_acctbal
+        )
+        ) order by c_custkey";
+
+    let plan = logical_plan(sql).unwrap();
+    assert_snapshot!(
+         plan,
+         @r#"
+Sort: customer.c_custkey ASC NULLS LAST
+  Projection: customer.c_custkey
+    Filter: customer.c_acctbal < (<subquery>)
+      Subquery:
+        Projection: sum(orders.o_totalprice)
+          Aggregate: groupBy=[[]], aggr=[[sum(orders.o_totalprice)]]
+            Filter: orders.o_custkey = outer_ref(customer.c_custkey) AND orders.o_totalprice < (<subquery>)
+              Subquery:
+                Projection: sum(lineitem.l_extendedprice) AS price
+                  Aggregate: groupBy=[[]], aggr=[[sum(lineitem.l_extendedprice)]]
+                    Filter: lineitem.l_orderkey = outer_ref(orders.o_orderkey) AND lineitem.l_extendedprice < outer_ref(customer.c_acctbal)
+                      TableScan: lineitem
+              TableScan: orders
+      TableScan: customer
+"#
+    );
+}
diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml
index 4e7b60cdd8f37..b00fbe466728e 100644
--- a/datafusion/sqllogictest/Cargo.toml
+++ b/datafusion/sqllogictest/Cargo.toml
@@ -30,6 +30,9 @@ version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -42,28 +45,26 @@ async-trait = { workspace = true }
 bigdecimal = { workspace = true }
 bytes = { workspace = true, optional = true }
 chrono = { workspace = true, optional = true }
-clap = { version = "4.5.39", features = ["derive", "env"] }
+clap = { version = "4.5.60", features = ["derive", "env"] }
 datafusion = { workspace = true, default-features = true, features = ["avro"] }
-datafusion-spark = { workspace = true, default-features = true }
+datafusion-spark = { workspace = true, features = ["core"] }
+datafusion-substrait = { workspace = true, default-features = true }
 futures = { workspace = true }
 half = { workspace = true, default-features = true }
-indicatif = "0.17"
+indicatif = "0.18"
 itertools = { workspace = true }
 log = { workspace = true }
 object_store = { workspace = true }
-postgres-protocol = { version = "0.6.7", optional = true }
-postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true }
-rust_decimal = { version = "1.37.1", features = ["tokio-pg"] }
+postgres-types = { version = "0.2.12", features = ["derive", "with-chrono-0_4"], optional = true }
 # When updating the following dependency verify that sqlite test file regeneration works correctly
 # by running the regenerate_sqlite_files.sh script.
-sqllogictest = "0.28.2"
+sqllogictest = "0.29.1"
 sqlparser = { workspace = true }
 tempfile = { workspace = true }
-testcontainers = { version = "0.24", features = ["default"], optional = true }
-testcontainers-modules = { version = "0.12", features = ["postgres"], optional = true }
-thiserror = "2.0.12"
+testcontainers-modules = { workspace = true, features = ["postgres"], optional = true }
+thiserror = "2.0.18"
 tokio = { workspace = true }
-tokio-postgres = { version = "0.7.12", optional = true }
+tokio-postgres = { version = "0.7.16", optional = true }
 
 [features]
 avro = ["datafusion/avro"]
@@ -72,14 +73,16 @@ postgres = [
     "bytes",
     "chrono",
     "postgres-types",
-    "postgres-protocol",
-    "testcontainers",
     "testcontainers-modules",
     "tokio-postgres",
 ]
+parquet_encryption = [
+    "datafusion/parquet_encryption",
+]
 
 [dev-dependencies]
 env_logger = { workspace = true }
+regex = { workspace = true }
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 
 [[test]]
diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md
index a71f920a4279c..7d84ad23d5905 100644
--- a/datafusion/sqllogictest/README.md
+++ b/datafusion/sqllogictest/README.md
@@ -17,23 +17,29 @@
   under the License.
 -->
 
-# DataFusion sqllogictest
+# Apache DataFusion sqllogictest
 
-[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
 
-This crate is a submodule of DataFusion that contains an implementation of [sqllogictest](https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki).
+This crate is a submodule of DataFusion that contains an implementation of [sqllogictest].
 
-[df]: https://crates.io/crates/datafusion
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
+[sqllogictest]: https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki
 
 ## Overview
 
-This crate uses [sqllogictest-rs](https://github.com/risinglightdb/sqllogictest-rs) to parse and run `.slt` files in the
-[`test_files`](test_files) directory of this crate or the [`data/sqlite`](https://github.com/apache/datafusion-testing/tree/main/data/sqlite)
-directory of the [datafusion-testing](https://github.com/apache/datafusion-testing) crate.
+This crate uses [sqllogictest-rs] to parse and run `.slt` files in the [`test_files`] directory of
+this crate or the [`data/sqlite`] directory of the [datafusion-testing] repository.
+
+[sqllogictest-rs]: https://github.com/risinglightdb/sqllogictest-rs
+[`test_files`]: test_files
+[`data/sqlite`]: https://github.com/apache/datafusion-testing/tree/main/data/sqlite
+[datafusion-testing]: https://github.com/apache/datafusion-testing
 
 ## Testing setup
 
-1. `rustup update stable` DataFusion uses the latest stable release of rust
+1. `rustup update stable` DataFusion uses the latest stable release of Rust
 2. `git submodule init`
 3. `git submodule update --init --remote --recursive`
 
@@ -64,6 +70,36 @@ cargo test --test sqllogictests -- ddl --complete
 RUST_LOG=debug cargo test --test sqllogictests -- ddl
 ```
 
+### Per-file timing summary
+
+The sqllogictest runner can emit deterministic per-file elapsed timings to help
+identify slow test files.
+
+By default (`--timing-summary auto`), timing summary output is disabled in local
+TTY runs and shows a top-slowest summary in non-TTY/CI runs.
+
+`--timing-top-n` / `SLT_TIMING_TOP_N` must be a positive integer (`>= 1`).
+
+```shell
+# Show top 10 slowest files (good for CI)
+cargo test --test sqllogictests -- --timing-summary top --timing-top-n 10
+```
+
+```shell
+# Show full per-file timing table
+cargo test --test sqllogictests -- --timing-summary full
+```
+
+```shell
+# Same controls via environment variables
+SLT_TIMING_SUMMARY=top SLT_TIMING_TOP_N=15 cargo test --test sqllogictests
+```
+
+```shell
+# Optional debug logging for per-task slow files (>30s), disabled by default
+SLT_TIMING_DEBUG_SLOW_FILES=1 cargo test --test sqllogictests
+```
+
 ## Cookbook: Adding Tests
 
 1. Add queries
@@ -136,6 +172,17 @@ select substr('Andrew Lamb', 1, 6), '|'
 Andrew |
 ```
 
+## Cookbook: Ignoring volatile output
+
+Sometimes parts of a result change every run (timestamps, counters, etc.). To keep the rest of the snapshot checked in, replace those fragments with the `<slt:ignore>` marker inside the expected block. During validation the marker acts like a wildcard, so only the surrounding text must match.
+
+```text
+query TT
+EXPLAIN ANALYZE SELECT * FROM generate_series(100);
+----
+Plan with Metrics LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=0, end=100, batch_size=8192], metrics=[output_rows=101, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>]
+```
+
 # Reference
 
 ## Running tests: Validation Mode
@@ -291,6 +338,27 @@ Tests that need to write temporary files should write (only) to this
 directory to ensure they do not interfere with others concurrently
 running tests.
 
+## Running tests: Substrait round-trip mode
+
+This mode will run all the .slt test files in validation mode, adding a Substrait conversion round-trip for each
+generated DataFusion logical plan (SQL statement → DF logical → Substrait → DF logical → DF physical → execute).
+
+Not all statements will be round-tripped, some statements like CREATE, INSERT, SET or EXPLAIN statements will be
+issued as is, but any other statement will be round-tripped to/from Substrait.
+
+_WARNING_: as there are still a lot of failures in this mode (https://github.com/apache/datafusion/issues/16248),
+it is not enforced in the CI, instead, it needs to be run manually with the following command:
+
+```shell
+cargo test --test sqllogictests -- --substrait-round-trip
+```
+
+For focusing on one specific failing test, a file:line filter can be used:
+
+```shell
+cargo test --test sqllogictests -- --substrait-round-trip binary.slt:23
+```
+
 ## `.slt` file format
 
 [`sqllogictest`] was originally written for SQLite to verify the
diff --git a/datafusion/sqllogictest/bin/postgres_container.rs b/datafusion/sqllogictest/bin/postgres_container.rs
index 411562a7ccc74..fde5937760074 100644
--- a/datafusion/sqllogictest/bin/postgres_container.rs
+++ b/datafusion/sqllogictest/bin/postgres_container.rs
@@ -16,19 +16,19 @@
 // under the License.
 
 use crate::Options;
+use ContainerCommands::{FetchHost, FetchPort};
 use datafusion::common::Result;
 use log::info;
 use std::env::set_var;
 use std::future::Future;
 use std::sync::LazyLock;
 use std::{env, thread};
-use testcontainers::core::IntoContainerPort;
-use testcontainers::runners::AsyncRunner;
-use testcontainers::ImageExt;
 use testcontainers_modules::postgres;
+use testcontainers_modules::testcontainers::ImageExt;
+use testcontainers_modules::testcontainers::core::IntoContainerPort;
+use testcontainers_modules::testcontainers::runners::AsyncRunner;
 use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
-use tokio::sync::{mpsc, Mutex};
-use ContainerCommands::{FetchHost, FetchPort};
+use tokio::sync::{Mutex, mpsc};
 
 #[derive(Debug)]
 pub enum ContainerCommands {
@@ -86,7 +86,9 @@ pub async fn initialize_postgres_container(options: &Options) -> Result<()> {
         let pg_uri = format!("postgresql://postgres:postgres@{db_host}:{db_port}/test");
         info!("Postgres uri is {pg_uri}");
 
-        set_var("PG_URI", pg_uri);
+        unsafe {
+            set_var("PG_URI", pg_uri);
+        }
     } else {
         // close receiver
         POSTGRES_IN.rx.lock().await.close();
diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs
index 4d6bce208887b..38a763504282f 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -15,13 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use clap::Parser;
+use clap::{ColorChoice, Parser, ValueEnum};
 use datafusion::common::instant::Instant;
 use datafusion::common::utils::get_available_parallelism;
-use datafusion::common::{exec_err, DataFusionError, Result};
+use datafusion::common::{DataFusionError, Result, exec_datafusion_err, exec_err};
+use datafusion_sqllogictest::TestFile;
 use datafusion_sqllogictest::{
-    df_value_validator, read_dir_recursive, setup_scratch_dir, should_skip_file,
-    should_skip_record, value_normalizer, DataFusion, Filter, TestContext,
+    CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip, Filter,
+    TestContext, df_value_validator, read_dir_recursive, setup_scratch_dir,
+    should_skip_file, should_skip_record, value_normalizer,
 };
 use futures::stream::StreamExt;
 use indicatif::{
@@ -31,8 +33,8 @@ use itertools::Itertools;
 use log::Level::Info;
 use log::{info, log_enabled};
 use sqllogictest::{
-    parse_file, strict_column_validator, AsyncDB, Condition, MakeConnection, Normalizer,
-    Record, Validator,
+    AsyncDB, Condition, MakeConnection, Normalizer, Record, Validator, parse_file,
+    strict_column_validator,
 };
 
 #[cfg(feature = "postgres")]
@@ -40,8 +42,14 @@ use crate::postgres_container::{
     initialize_postgres_container, terminate_postgres_container,
 };
 use datafusion::common::runtime::SpawnedTask;
-use std::ffi::OsStr;
+use futures::FutureExt;
+use std::fs;
+use std::io::{IsTerminal, stderr, stdout};
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
 
 #[cfg(feature = "postgres")]
 mod postgres_container;
@@ -49,8 +57,24 @@ mod postgres_container;
 const TEST_DIRECTORY: &str = "test_files/";
 const DATAFUSION_TESTING_TEST_DIRECTORY: &str = "../../datafusion-testing/data/";
 const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_";
+const TPCH_PREFIX: &str = "tpch";
 const SQLITE_PREFIX: &str = "sqlite";
 const ERRS_PER_FILE_LIMIT: usize = 10;
+const TIMING_DEBUG_SLOW_FILES_ENV: &str = "SLT_TIMING_DEBUG_SLOW_FILES";
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
+enum TimingSummaryMode {
+    Auto,
+    Off,
+    Top,
+    Full,
+}
+
+#[derive(Debug)]
+struct FileTiming {
+    relative_path: PathBuf,
+    elapsed: Duration,
+}
 
 pub fn main() -> Result<()> {
     tokio::runtime::Builder::new_multi_thread()
@@ -93,6 +117,7 @@ async fn run_tests() -> Result<()> {
     env_logger::init();
 
     let options: Options = Parser::parse();
+    let timing_debug_slow_files = is_env_truthy(TIMING_DEBUG_SLOW_FILES_ENV);
     if options.list {
         // nextest parses stdout, so print messages to stderr
         eprintln!("NOTICE: --list option unsupported, quitting");
@@ -102,8 +127,16 @@ async fn run_tests() -> Result<()> {
         // to stdout and return OK so they can continue listing other tests.
         return Ok(());
     }
+
     options.warn_on_ignored();
 
+    // Print parallelism info for debugging CI performance
+    eprintln!(
+        "Running with {} test threads (available parallelism: {})",
+        options.test_threads,
+        get_available_parallelism()
+    );
+
     #[cfg(feature = "postgres")]
     initialize_postgres_container(&options).await?;
 
@@ -119,11 +152,33 @@ async fn run_tests() -> Result<()> {
     .unwrap()
     .progress_chars("##-");
 
+    let colored_output = options.is_colored();
+
     let start = Instant::now();
 
     let test_files = read_test_files(&options)?;
+
+    // Perform scratch file sanity check
+    let scratch_errors = scratch_file_check(&test_files)?;
+    if !scratch_errors.is_empty() {
+        eprintln!("Scratch file sanity check failed:");
+        for error in &scratch_errors {
+            eprintln!("  {error}");
+        }
+
+        eprintln!(
+            "\nTemporary file check failed. Please ensure that within each test file, any scratch file created is placed under a folder with the same name as the test file (without extension).\nExample: inside `join.slt`, temporary files must be created under `.../scratch/join/`\n"
+        );
+
+        return exec_err!("sqllogictests scratch file check failed");
+    }
+
     let num_tests = test_files.len();
-    let errors: Vec<_> = futures::stream::iter(test_files)
+    // For CI environments without TTY, print progress periodically
+    let is_ci = !stderr().is_terminal();
+    let completed_count = Arc::new(AtomicUsize::new(0));
+
+    let file_results: Vec<_> = futures::stream::iter(test_files)
         .map(|test_file| {
             let validator = if options.include_sqlite
                 && test_file.relative_path.starts_with(SQLITE_PREFIX)
@@ -137,59 +192,176 @@ async fn run_tests() -> Result<()> {
             let m_style_clone = m_style.clone();
             let filters = options.filters.clone();
 
+            let relative_path = test_file.relative_path.clone();
+            let relative_path_for_timing = test_file.relative_path.clone();
+
+            let currently_running_sql_tracker = CurrentlyExecutingSqlTracker::new();
+            let currently_running_sql_tracker_clone =
+                currently_running_sql_tracker.clone();
+            let file_start = Instant::now();
             SpawnedTask::spawn(async move {
-                match (options.postgres_runner, options.complete) {
-                    (false, false) => {
+                let result = match (
+                    options.postgres_runner,
+                    options.complete,
+                    options.substrait_round_trip,
+                ) {
+                    (_, _, true) => {
+                        run_test_file_substrait_round_trip(
+                            test_file,
+                            validator,
+                            m_clone,
+                            m_style_clone,
+                            filters.as_ref(),
+                            currently_running_sql_tracker_clone,
+                            colored_output,
+                        )
+                        .await
+                    }
+                    (false, false, _) => {
                         run_test_file(
                             test_file,
                             validator,
                             m_clone,
                             m_style_clone,
                             filters.as_ref(),
+                            currently_running_sql_tracker_clone,
+                            colored_output,
                         )
-                        .await?
+                        .await
                     }
-                    (false, true) => {
-                        run_complete_file(test_file, validator, m_clone, m_style_clone)
-                            .await?
+                    (false, true, _) => {
+                        run_complete_file(
+                            test_file,
+                            validator,
+                            m_clone,
+                            m_style_clone,
+                            currently_running_sql_tracker_clone,
+                        )
+                        .await
                     }
-                    (true, false) => {
+                    (true, false, _) => {
                         run_test_file_with_postgres(
                             test_file,
                             validator,
                             m_clone,
                             m_style_clone,
                             filters.as_ref(),
+                            currently_running_sql_tracker_clone,
                         )
-                        .await?
+                        .await
                     }
-                    (true, true) => {
+                    (true, true, _) => {
                         run_complete_file_with_postgres(
                             test_file,
                             validator,
                             m_clone,
                             m_style_clone,
+                            currently_running_sql_tracker_clone,
                         )
-                        .await?
+                        .await
                     }
+                };
+
+                let elapsed = file_start.elapsed();
+                if timing_debug_slow_files && elapsed.as_secs() > 30 {
+                    eprintln!(
+                        "Slow file: {} took {:.1}s",
+                        relative_path_for_timing.display(),
+                        elapsed.as_secs_f64()
+                    );
                 }
-                Ok(()) as Result<()>
+
+                (result, elapsed)
             })
             .join()
+            .map(move |result| {
+                let elapsed = match &result {
+                    Ok((_, elapsed)) => *elapsed,
+                    Err(_) => Duration::ZERO,
+                };
+
+                (
+                    result.map(|(thread_result, _)| thread_result),
+                    relative_path,
+                    currently_running_sql_tracker,
+                    elapsed,
+                )
+            })
         })
         // run up to num_cpus streams in parallel
-        .buffer_unordered(get_available_parallelism())
-        .flat_map(|result| {
-            // Filter out any Ok() leaving only the DataFusionErrors
-            futures::stream::iter(match result {
-                // Tokio panic error
-                Err(e) => Some(DataFusionError::External(Box::new(e))),
-                Ok(thread_result) => thread_result.err(),
-            })
+        .buffer_unordered(options.test_threads)
+        .inspect({
+            let completed_count = Arc::clone(&completed_count);
+            move |_| {
+                let completed = completed_count.fetch_add(1, Ordering::Relaxed) + 1;
+                // In CI (no TTY), print progress every 10% or every 50 files
+                if is_ci && (completed.is_multiple_of(50) || completed == num_tests) {
+                    eprintln!(
+                        "Progress: {}/{} files completed ({:.0}%)",
+                        completed,
+                        num_tests,
+                        (completed as f64 / num_tests as f64) * 100.0
+                    );
+                }
+            }
         })
         .collect()
         .await;
 
+    let mut file_timings: Vec<FileTiming> = file_results
+        .iter()
+        .map(|(_, path, _, elapsed)| FileTiming {
+            relative_path: path.clone(),
+            elapsed: *elapsed,
+        })
+        .collect();
+
+    file_timings.sort_by(|a, b| {
+        b.elapsed
+            .cmp(&a.elapsed)
+            .then_with(|| a.relative_path.cmp(&b.relative_path))
+    });
+
+    print_timing_summary(&options, &m, is_ci, &file_timings)?;
+
+    let errors: Vec<_> = file_results
+        .into_iter()
+        .filter_map(|(result, test_file_path, current_sql, _)| {
+            // Filter out any Ok() leaving only the DataFusionErrors
+            match result {
+                Err(e) => {
+                    let error = DataFusionError::External(Box::new(e));
+                    let current_sql = current_sql.get_currently_running_sqls();
+
+                    if current_sql.is_empty() {
+                        Some(error.context(format!(
+                            "failure in {} with no currently running sql tracked",
+                            test_file_path.display()
+                        )))
+                    } else if current_sql.len() == 1 {
+                        let sql = &current_sql[0];
+                        Some(error.context(format!(
+                            "failure in {} for sql {sql}",
+                            test_file_path.display()
+                        )))
+                    } else {
+                        let sqls = current_sql
+                            .iter()
+                            .enumerate()
+                            .map(|(i, sql)| format!("\n[{}]: {}", i + 1, sql))
+                            .collect::<String>();
+                        Some(error.context(format!(
+                            "failure in {} for multiple currently running sqls: {}",
+                            test_file_path.display(),
+                            sqls
+                        )))
+                    }
+                }
+                Ok(thread_result) => thread_result.err(),
+            }
+        })
+        .collect();
+
     m.println(format!(
         "Completed {} test files in {}",
         num_tests,
@@ -210,12 +382,119 @@ async fn run_tests() -> Result<()> {
     }
 }
 
+fn print_timing_summary(
+    options: &Options,
+    progress: &MultiProgress,
+    is_ci: bool,
+    file_timings: &[FileTiming],
+) -> Result<()> {
+    let mode = options.timing_summary_mode(is_ci);
+    if mode == TimingSummaryMode::Off || file_timings.is_empty() {
+        return Ok(());
+    }
+
+    let top_n = options.timing_top_n;
+    debug_assert!(matches!(
+        mode,
+        TimingSummaryMode::Top | TimingSummaryMode::Full
+    ));
+    let count = if mode == TimingSummaryMode::Full {
+        file_timings.len()
+    } else {
+        top_n
+    };
+
+    progress.println("Per-file elapsed summary (deterministic):")?;
+    for (idx, timing) in file_timings.iter().take(count).enumerate() {
+        progress.println(format!(
+            "{:>3}. {:>8.3}s  {}",
+            idx + 1,
+            timing.elapsed.as_secs_f64(),
+            timing.relative_path.display()
+        ))?;
+    }
+
+    if mode != TimingSummaryMode::Full && file_timings.len() > count {
+        progress.println(format!(
+            "... {} more files omitted (use --timing-summary full to show all)",
+            file_timings.len() - count
+        ))?;
+    }
+
+    Ok(())
+}
+
+fn is_env_truthy(name: &str) -> bool {
+    std::env::var_os(name)
+        .and_then(|value| value.into_string().ok())
+        .is_some_and(|value| {
+            matches!(
+                value.trim().to_ascii_lowercase().as_str(),
+                "1" | "true" | "yes" | "on"
+            )
+        })
+}
+
+fn parse_timing_top_n(arg: &str) -> std::result::Result<usize, String> {
+    let parsed = arg
+        .parse::<usize>()
+        .map_err(|error| format!("invalid value '{arg}': {error}"))?;
+    if parsed == 0 {
+        return Err("must be >= 1".to_string());
+    }
+    Ok(parsed)
+}
+
+async fn run_test_file_substrait_round_trip(
+    test_file: TestFile,
+    validator: Validator,
+    mp: MultiProgress,
+    mp_style: ProgressStyle,
+    filters: &[Filter],
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    colored_output: bool,
+) -> Result<()> {
+    let TestFile {
+        path,
+        relative_path,
+    } = test_file;
+    let Some(test_ctx) = TestContext::try_new_for_test_file(&relative_path).await else {
+        info!("Skipping: {}", path.display());
+        return Ok(());
+    };
+    setup_scratch_dir(&relative_path)?;
+
+    let count: u64 = get_record_count(&path, "DatafusionSubstraitRoundTrip".to_string());
+    let pb = mp.add(ProgressBar::new(count));
+
+    pb.set_style(mp_style);
+    pb.set_message(format!("{:?}", &relative_path));
+
+    let mut runner = sqllogictest::Runner::new(|| async {
+        Ok(DataFusionSubstraitRoundTrip::new(
+            test_ctx.session_ctx().clone(),
+            relative_path.clone(),
+            pb.clone(),
+        )
+        .with_currently_executing_sql_tracker(currently_executing_sql_tracker.clone()))
+    });
+    runner.add_label("DatafusionSubstraitRoundTrip");
+    runner.with_column_validator(strict_column_validator);
+    runner.with_normalizer(value_normalizer);
+    runner.with_validator(validator);
+    let res = run_file_in_runner(path, runner, filters, colored_output).await;
+    pb.finish_and_clear();
+    res
+}
+
 async fn run_test_file(
     test_file: TestFile,
     validator: Validator,
     mp: MultiProgress,
     mp_style: ProgressStyle,
     filters: &[Filter],
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    colored_output: bool,
 ) -> Result<()> {
     let TestFile {
         path,
@@ -238,13 +517,14 @@ async fn run_test_file(
             test_ctx.session_ctx().clone(),
             relative_path.clone(),
             pb.clone(),
-        ))
+        )
+        .with_currently_executing_sql_tracker(currently_executing_sql_tracker.clone()))
     });
     runner.add_label("Datafusion");
     runner.with_column_validator(strict_column_validator);
     runner.with_normalizer(value_normalizer);
     runner.with_validator(validator);
-    let result = run_file_in_runner(path, runner, filters).await;
+    let result = run_file_in_runner(path, runner, filters, colored_output).await;
     pb.finish_and_clear();
     result
 }
@@ -253,6 +533,7 @@ async fn run_file_in_runner<D: AsyncDB, M: MakeConnection<Conn = D>>(
     path: PathBuf,
     mut runner: sqllogictest::Runner<D, M>,
     filters: &[Filter],
+    colored_output: bool,
 ) -> Result<()> {
     let path = path.canonicalize()?;
     let records =
@@ -266,7 +547,11 @@ async fn run_file_in_runner<D: AsyncDB, M: MakeConnection<Conn = D>>(
             continue;
         }
         if let Err(err) = runner.run_async(record).await {
-            errs.push(format!("{err}"));
+            if colored_output {
+                errs.push(format!("{}", err.display(true)));
+            } else {
+                errs.push(format!("{err}"));
+            }
         }
     }
 
@@ -289,6 +574,7 @@ async fn run_file_in_runner<D: AsyncDB, M: MakeConnection<Conn = D>>(
     Ok(())
 }
 
+#[expect(clippy::needless_pass_by_value)]
 fn get_record_count(path: &PathBuf, label: String) -> u64 {
     let records: Vec<Record<<DataFusion as AsyncDB>::ColumnType>> =
         parse_file(path).unwrap();
@@ -332,6 +618,7 @@ async fn run_test_file_with_postgres(
     mp: MultiProgress,
     mp_style: ProgressStyle,
     filters: &[Filter],
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 ) -> Result<()> {
     use datafusion_sqllogictest::Postgres;
     let TestFile {
@@ -347,13 +634,17 @@ async fn run_test_file_with_postgres(
     pb.set_message(format!("{:?}", &relative_path));
 
     let mut runner = sqllogictest::Runner::new(|| {
-        Postgres::connect(relative_path.clone(), pb.clone())
+        Postgres::connect_with_tracked_sql(
+            relative_path.clone(),
+            pb.clone(),
+            currently_executing_sql_tracker.clone(),
+        )
     });
     runner.add_label("postgres");
     runner.with_column_validator(strict_column_validator);
     runner.with_normalizer(value_normalizer);
     runner.with_validator(validator);
-    let result = run_file_in_runner(path, runner, filters).await;
+    let result = run_file_in_runner(path, runner, filters, false).await;
     pb.finish_and_clear();
     result
 }
@@ -365,6 +656,7 @@ async fn run_test_file_with_postgres(
     _mp: MultiProgress,
     _mp_style: ProgressStyle,
     _filters: &[Filter],
+    _currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 ) -> Result<()> {
     use datafusion::common::plan_err;
     plan_err!("Can not run with postgres as postgres feature is not enabled")
@@ -375,6 +667,7 @@ async fn run_complete_file(
     validator: Validator,
     mp: MultiProgress,
     mp_style: ProgressStyle,
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 ) -> Result<()> {
     let TestFile {
         path,
@@ -400,7 +693,8 @@ async fn run_complete_file(
             test_ctx.session_ctx().clone(),
             relative_path.clone(),
             pb.clone(),
-        ))
+        )
+        .with_currently_executing_sql_tracker(currently_executing_sql_tracker.clone()))
     });
 
     let col_separator = " ";
@@ -414,9 +708,7 @@ async fn run_complete_file(
         )
         .await
         // Can't use e directly because it isn't marked Send, so turn it into a string.
-        .map_err(|e| {
-            DataFusionError::Execution(format!("Error completing {relative_path:?}: {e}"))
-        });
+        .map_err(|e| exec_datafusion_err!("Error completing {relative_path:?}: {e}"));
 
     pb.finish_and_clear();
 
@@ -429,6 +721,7 @@ async fn run_complete_file_with_postgres(
     validator: Validator,
     mp: MultiProgress,
     mp_style: ProgressStyle,
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 ) -> Result<()> {
     use datafusion_sqllogictest::Postgres;
     let TestFile {
@@ -448,7 +741,11 @@ async fn run_complete_file_with_postgres(
     pb.set_message(format!("{:?}", &relative_path));
 
     let mut runner = sqllogictest::Runner::new(|| {
-        Postgres::connect(relative_path.clone(), pb.clone())
+        Postgres::connect_with_tracked_sql(
+            relative_path.clone(),
+            pb.clone(),
+            currently_executing_sql_tracker.clone(),
+        )
     });
     runner.add_label("postgres");
     runner.with_column_validator(strict_column_validator);
@@ -466,9 +763,7 @@ async fn run_complete_file_with_postgres(
         )
         .await
         // Can't use e directly because it isn't marked Send, so turn it into a string.
-        .map_err(|e| {
-            DataFusionError::Execution(format!("Error completing {relative_path:?}: {e}"))
-        });
+        .map_err(|e| exec_datafusion_err!("Error completing {relative_path:?}: {e}"));
 
     pb.finish_and_clear();
 
@@ -481,81 +776,40 @@ async fn run_complete_file_with_postgres(
     _validator: Validator,
     _mp: MultiProgress,
     _mp_style: ProgressStyle,
+    _currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 ) -> Result<()> {
     use datafusion::common::plan_err;
     plan_err!("Can not run with postgres as postgres feature is not enabled")
 }
 
-/// Represents a parsed test file
-#[derive(Debug)]
-struct TestFile {
-    /// The absolute path to the file
-    pub path: PathBuf,
-    /// The relative path of the file (used for display)
-    pub relative_path: PathBuf,
-}
-
-impl TestFile {
-    fn new(path: PathBuf) -> Self {
-        let p = path.to_string_lossy();
-        let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) {
-            p.strip_prefix(TEST_DIRECTORY).unwrap()
-        } else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) {
-            p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap()
-        } else {
-            ""
-        });
-
-        Self {
-            path,
-            relative_path,
-        }
-    }
-
-    fn is_slt_file(&self) -> bool {
-        self.path.extension() == Some(OsStr::new("slt"))
-    }
-
-    fn check_sqlite(&self, options: &Options) -> bool {
-        if !self.relative_path.starts_with(SQLITE_PREFIX) {
-            return true;
-        }
-
-        options.include_sqlite
-    }
-
-    fn check_tpch(&self, options: &Options) -> bool {
-        if !self.relative_path.starts_with("tpch") {
-            return true;
-        }
+fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
+    let prefixes: &[&str] = if options.include_sqlite {
+        &[TEST_DIRECTORY, DATAFUSION_TESTING_TEST_DIRECTORY]
+    } else {
+        &[TEST_DIRECTORY]
+    };
 
-        options.include_tpch
-    }
-}
+    let directories = prefixes
+        .iter()
+        .map(|prefix| {
+            read_dir_recursive(prefix).map_err(|e| {
+                exec_datafusion_err!("Error reading test directory {prefix}: {e}")
+            })
+        })
+        .collect::<Result<Vec<_>>>()?;
 
-fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
-    let mut paths = read_dir_recursive(TEST_DIRECTORY)?
+    let mut paths = directories
         .into_iter()
-        .map(TestFile::new)
+        .flatten()
+        .map(|p| TestFile::new(p, prefixes))
         .filter(|f| options.check_test_file(&f.path))
         .filter(|f| f.is_slt_file())
-        .filter(|f| f.check_tpch(options))
-        .filter(|f| f.check_sqlite(options))
+        .filter(|f| !f.relative_path_starts_with(TPCH_PREFIX) || options.include_tpch)
+        .filter(|f| !f.relative_path_starts_with(SQLITE_PREFIX) || options.include_sqlite)
         .filter(|f| options.check_pg_compat_file(f.path.as_path()))
         .collect::<Vec<_>>();
-    if options.include_sqlite {
-        let mut sqlite_paths = read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)?
-            .into_iter()
-            .map(TestFile::new)
-            .filter(|f| options.check_test_file(&f.path))
-            .filter(|f| f.is_slt_file())
-            .filter(|f| f.check_sqlite(options))
-            .filter(|f| options.check_pg_compat_file(f.path.as_path()))
-            .collect::<Vec<_>>();
-
-        paths.append(&mut sqlite_paths)
-    }
 
+    paths.sort_unstable();
     Ok(paths)
 }
 
@@ -578,6 +832,14 @@ struct Options {
     )]
     postgres_runner: bool,
 
+    #[clap(
+        long,
+        conflicts_with = "complete",
+        conflicts_with = "postgres_runner",
+        help = "Before executing each query, convert its logical plan to Substrait and from Substrait back to its logical plan"
+    )]
+    substrait_round_trip: bool,
+
     #[clap(long, env = "INCLUDE_SQLITE", help = "Include sqlite files")]
     include_sqlite: bool,
 
@@ -626,9 +888,55 @@ struct Options {
         help = "IGNORED (for compatibility with built-in rust test runner)"
     )]
     nocapture: bool,
+
+    #[clap(
+        long,
+        help = "Number of threads used for running tests in parallel",
+        default_value_t = get_available_parallelism()
+    )]
+    test_threads: usize,
+
+    #[clap(
+        long,
+        env = "SLT_TIMING_SUMMARY",
+        value_enum,
+        default_value_t = TimingSummaryMode::Auto,
+        help = "Per-file timing summary mode: auto|off|top|full"
+    )]
+    timing_summary: TimingSummaryMode,
+
+    #[clap(
+        long,
+        env = "SLT_TIMING_TOP_N",
+        default_value_t = 10,
+        value_parser = parse_timing_top_n,
+        help = "Number of files to show when timing summary mode is auto/top (must be >= 1)"
+    )]
+    timing_top_n: usize,
+
+    #[clap(
+        long,
+        value_name = "MODE",
+        help = "Control colored output",
+        default_value_t = ColorChoice::Auto
+    )]
+    color: ColorChoice,
 }
 
 impl Options {
+    fn timing_summary_mode(&self, is_ci: bool) -> TimingSummaryMode {
+        match self.timing_summary {
+            TimingSummaryMode::Auto => {
+                if is_ci {
+                    TimingSummaryMode::Top
+                } else {
+                    TimingSummaryMode::Off
+                }
+            }
+            mode => mode,
+        }
+    }
+
     /// Because this test can be run as a cargo test, commands like
     ///
     /// ```shell
@@ -667,4 +975,99 @@ impl Options {
             eprintln!("WARNING: Ignoring `--show-output` compatibility option");
         }
     }
+
+    /// Determine if colour output should be enabled, respecting --color, NO_COLOR, CARGO_TERM_COLOR, and terminal detection
+    fn is_colored(&self) -> bool {
+        // NO_COLOR takes precedence
+        if std::env::var_os("NO_COLOR").is_some() {
+            return false;
+        }
+
+        match self.color {
+            ColorChoice::Always => true,
+            ColorChoice::Never => false,
+            ColorChoice::Auto => {
+                // CARGO_TERM_COLOR takes precedence over auto-detection
+                let cargo_term_color = <ColorChoice as FromStr>::from_str(
+                    &std::env::var("CARGO_TERM_COLOR")
+                        .unwrap_or_else(|_| "auto".to_string()),
+                )
+                .unwrap_or(ColorChoice::Auto);
+                match cargo_term_color {
+                    ColorChoice::Always => true,
+                    ColorChoice::Never => false,
+                    ColorChoice::Auto => {
+                        // Auto for both CLI argument and CARGO_TERM_COLOR,
+                        // then use colors by default for non-dumb terminals
+                        stdout().is_terminal()
+                            && std::env::var("TERM").unwrap_or_default() != "dumb"
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Performs scratch file check for all test files.
+///
+/// Scratch file rule: In each .slt test file, the temporary file created must
+/// be under a folder that is has the same name as the test file.
+/// e.g. In `join.slt`, temporary files must be created under `.../scratch/join/`
+///
+/// See: <https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest#running-tests-scratchdir>
+///
+/// This function searches for `scratch/[target]/...` patterns and verifies
+/// that the target matches the file name.
+///
+/// Returns a vector of error strings for incorrectly created scratch files.
+fn scratch_file_check(test_files: &[TestFile]) -> Result<Vec<String>> {
+    let mut errors = Vec::new();
+
+    // Search for any scratch/[target]/... patterns and check if they match the file name
+    let scratch_pattern = regex::Regex::new(r"scratch/([^/]+)/").unwrap();
+
+    for test_file in test_files {
+        // Get the file content
+        let content = match fs::read_to_string(&test_file.path) {
+            Ok(content) => content,
+            Err(e) => {
+                errors.push(format!(
+                    "Failed to read file {}: {}",
+                    test_file.path.display(),
+                    e
+                ));
+                continue;
+            }
+        };
+
+        // Get the expected target name (file name without extension)
+        let expected_target = match test_file.path.file_stem() {
+            Some(stem) => stem.to_string_lossy().to_string(),
+            None => {
+                errors.push(format!("File {} has no stem", test_file.path.display()));
+                continue;
+            }
+        };
+
+        let lines: Vec<&str> = content.lines().collect();
+
+        for (line_num, line) in lines.iter().enumerate() {
+            if let Some(captures) = scratch_pattern.captures(line)
+                && let Some(found_target) = captures.get(1)
+            {
+                let found_target = found_target.as_str();
+                if found_target != expected_target {
+                    errors.push(format!(
+                        "File {}:{}: scratch target '{}' does not match file name '{}'",
+                        test_file.path.display(),
+                        line_num + 1,
+                        found_target,
+                        expected_target
+                    ));
+                }
+            }
+        }
+    }
+
+    Ok(errors)
 }
diff --git a/datafusion/sqllogictest/data/composite_order.csv b/datafusion/sqllogictest/data/composite_order.csv
new file mode 100644
index 0000000000000..b2c5e881bd605
--- /dev/null
+++ b/datafusion/sqllogictest/data/composite_order.csv
@@ -0,0 +1,8 @@
+a,b
+1,0
+0,2
+1,2
+0,4
+5,0
+3,3
+4,3
diff --git a/datafusion/sqllogictest/regenerate/sqllogictests.rs b/datafusion/sqllogictest/regenerate/sqllogictests.rs
index edad16bc84b1c..a50c4ae1cb7b1 100644
--- a/datafusion/sqllogictest/regenerate/sqllogictests.rs
+++ b/datafusion/sqllogictest/regenerate/sqllogictests.rs
@@ -497,7 +497,7 @@ async fn run_complete_file_with_postgres(
         .await
         // Can't use e directly because it isn't marked Send, so turn it into a string.
         .map_err(|e| {
-            DataFusionError::Execution(format!("Error completing {relative_path:?}: {e}"))
+            exec_datafusion_err!("Failed to complete test file {relative_path:?}: {e}")
         });
 
     pb.finish_and_clear();
diff --git a/datafusion/sqllogictest/src/engines/conversion.rs b/datafusion/sqllogictest/src/engines/conversion.rs
index 92ab64059bbda..3e519042f4ee0 100644
--- a/datafusion/sqllogictest/src/engines/conversion.rs
+++ b/datafusion/sqllogictest/src/engines/conversion.rs
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::{i256, Decimal128Type, Decimal256Type, DecimalType};
+use arrow::datatypes::{Decimal128Type, Decimal256Type, DecimalType, i256};
 use bigdecimal::BigDecimal;
 use half::f16;
-use rust_decimal::prelude::*;
+use std::str::FromStr;
 
 /// Represents a constant for NULL string in your database.
 pub const NULL_STR: &str = "NULL";
@@ -35,7 +35,8 @@ pub(crate) fn varchar_to_str(value: &str) -> String {
     if value.is_empty() {
         "(empty)".to_string()
     } else {
-        value.trim_end_matches('\n').to_string()
+        // Escape nulls so that github renders them correctly in the webui
+        value.trim_end_matches('\n').replace("\u{0000}", "\\0")
     }
 }
 
@@ -114,13 +115,14 @@ pub(crate) fn decimal_256_to_str(value: i256, scale: i8) -> String {
 }
 
 #[cfg(feature = "postgres")]
-pub(crate) fn decimal_to_str(value: Decimal) -> String {
-    big_decimal_to_str(BigDecimal::from_str(&value.to_string()).unwrap(), None)
+pub(crate) fn decimal_to_str(value: BigDecimal) -> String {
+    big_decimal_to_str(value, None)
 }
 
 /// Converts a `BigDecimal` to its plain string representation, optionally rounding to a specified number of decimal places.
 ///
 /// If `round_digits` is `None`, the value is rounded to 12 decimal places by default.
+#[expect(clippy::needless_pass_by_value)]
 pub(crate) fn big_decimal_to_str(value: BigDecimal, round_digits: Option<i64>) -> String {
     // Round the value to limit the number of decimal places
     let value = value.round(round_digits.unwrap_or(12)).normalized();
@@ -131,7 +133,7 @@ pub(crate) fn big_decimal_to_str(value: BigDecimal, round_digits: Option<i64>) -
 #[cfg(test)]
 mod tests {
     use super::big_decimal_to_str;
-    use bigdecimal::{num_bigint::BigInt, BigDecimal};
+    use bigdecimal::{BigDecimal, num_bigint::BigInt};
 
     macro_rules! assert_decimal_str_eq {
         ($integer:expr, $scale:expr, $round_digits:expr, $expected:expr) => {
diff --git a/datafusion/sqllogictest/src/engines/currently_executed_sql.rs b/datafusion/sqllogictest/src/engines/currently_executed_sql.rs
new file mode 100644
index 0000000000000..5b1979b4ee9a9
--- /dev/null
+++ b/datafusion/sqllogictest/src/engines/currently_executed_sql.rs
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::sync::atomic::AtomicUsize;
+use std::sync::{Arc, Mutex};
+
+/// Hold the currently executed SQL statements.
+/// This is used to save the currently running SQLs in case of a crash.
+#[derive(Clone)]
+pub struct CurrentlyExecutingSqlTracker {
+    /// The index of the SQL statement.
+    /// Used to uniquely identify each SQL statement even if they are the same.
+    sql_index: Arc<AtomicUsize>,
+    /// Lock to store the currently executed SQL statement.
+    /// It DOES NOT hold the lock for the duration of query execution and only execute the lock
+    /// when updating the currently executed SQL statement to allow for saving the last executed SQL
+    /// in case of a crash.
+    currently_executed_sqls: Arc<Mutex<HashMap<usize, String>>>,
+}
+
+impl Default for CurrentlyExecutingSqlTracker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl CurrentlyExecutingSqlTracker {
+    pub fn new() -> Self {
+        Self {
+            sql_index: Arc::new(AtomicUsize::new(0)),
+            currently_executed_sqls: Arc::new(Mutex::new(HashMap::new())),
+        }
+    }
+
+    /// Set the currently executed SQL statement.
+    ///
+    /// Returns a key to use to remove the SQL statement when done.
+    ///
+    /// We are not returning a guard that will automatically remove the SQL statement when dropped.
+    /// as on panic the drop can be called, and it will remove the SQL statement before we can log it.
+    #[must_use = "The returned index must be used to remove the SQL statement when done."]
+    pub fn set_sql(&self, sql: impl Into<String>) -> usize {
+        let index = self
+            .sql_index
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        self.currently_executed_sqls
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .insert(index, sql.into());
+        index
+    }
+
+    /// Remove the currently executed SQL statement by the provided key that was returned by [`Self::set_sql`].
+    pub fn remove_sql(&self, index: usize) {
+        self.currently_executed_sqls
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .remove(&index);
+    }
+
+    /// Get the currently executed SQL statements.
+    pub fn get_currently_running_sqls(&self) -> Vec<String> {
+        self.currently_executed_sqls
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .values()
+            .cloned()
+            .collect()
+    }
+}
diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/error.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/error.rs
index a60ae1012f9cf..f4e1a967e4834 100644
--- a/datafusion/sqllogictest/src/engines/datafusion_engine/error.rs
+++ b/datafusion/sqllogictest/src/engines/datafusion_engine/error.rs
@@ -28,7 +28,7 @@ pub type Result<T, E = DFSqlLogicTestError> = std::result::Result<T, E>;
 pub enum DFSqlLogicTestError {
     /// Error from sqllogictest-rs
     #[error("SqlLogicTest error(from sqllogictest-rs crate): {0}")]
-    SqlLogicTest(#[from] TestError),
+    SqlLogicTest(#[from] Box<TestError>),
     /// Error from datafusion
     #[error("DataFusion error: {}", .0.strip_backtrace())]
     DataFusion(#[from] DataFusionError),
diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
index 0d832bb3062dd..bad9a1dd3fc48 100644
--- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
+++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
@@ -19,53 +19,47 @@ use super::super::conversion::*;
 use super::error::{DFSqlLogicTestError, Result};
 use crate::engines::output::DFColumnType;
 use arrow::array::{Array, AsArray};
-use arrow::datatypes::Fields;
+use arrow::datatypes::{Fields, Schema};
 use arrow::util::display::ArrayFormatter;
 use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
-use datafusion::common::DataFusionError;
+use datafusion::common::internal_datafusion_err;
 use datafusion::config::ConfigField;
 use std::path::PathBuf;
 use std::sync::LazyLock;
 
 /// Converts `batches` to a result as expected by sqllogictest.
 pub fn convert_batches(
+    schema: &Schema,
     batches: Vec<RecordBatch>,
     is_spark_path: bool,
 ) -> Result<Vec<Vec<String>>> {
-    if batches.is_empty() {
-        Ok(vec![])
-    } else {
-        let schema = batches[0].schema();
-        let mut rows = vec![];
-        for batch in batches {
-            // Verify schema
-            if !schema.contains(&batch.schema()) {
-                return Err(DFSqlLogicTestError::DataFusion(DataFusionError::Internal(
-                    format!(
-                        "Schema mismatch. Previously had\n{:#?}\n\nGot:\n{:#?}",
-                        &schema,
-                        batch.schema()
-                    ),
-                )));
-            }
-
-            // Convert a single batch to a `Vec<Vec<String>>` for comparison, flatten expanded rows, and normalize each.
-            let new_rows = (0..batch.num_rows())
-                .map(|row| {
-                    batch
-                        .columns()
-                        .iter()
-                        .map(|col| cell_to_string(col, row, is_spark_path))
-                        .collect::<Result<Vec<String>>>()
-                })
-                .collect::<Result<Vec<Vec<String>>>>()?
-                .into_iter()
-                .flat_map(expand_row)
-                .map(normalize_paths);
-            rows.extend(new_rows);
+    let mut rows = vec![];
+    for batch in batches {
+        // Verify schema
+        if !schema.contains(&batch.schema()) {
+            return Err(DFSqlLogicTestError::DataFusion(internal_datafusion_err!(
+                "Schema mismatch. Previously had\n{:#?}\n\nGot:\n{:#?}",
+                &schema,
+                batch.schema()
+            )));
         }
-        Ok(rows)
+
+        // Convert a single batch to a `Vec<Vec<String>>` for comparison, flatten expanded rows, and normalize each.
+        let new_rows = (0..batch.num_rows())
+            .map(|row| {
+                batch
+                    .columns()
+                    .iter()
+                    .map(|col| cell_to_string(col, row, is_spark_path))
+                    .collect::<Result<Vec<String>>>()
+            })
+            .collect::<Result<Vec<Vec<String>>>>()?
+            .into_iter()
+            .flat_map(expand_row)
+            .map(normalize_paths);
+        rows.extend(new_rows);
     }
+    Ok(rows)
 }
 
 /// special case rows that have newlines in them (like explain plans)
@@ -191,9 +185,8 @@ macro_rules! get_row_value {
 /// [NULL Values and empty strings]: https://duckdb.org/dev/sqllogictest/result_verification#null-values-and-empty-strings
 ///
 /// Floating numbers are rounded to have a consistent representation with the Postgres runner.
-///
 pub fn cell_to_string(col: &ArrayRef, row: usize, is_spark_path: bool) -> Result<String> {
-    if !col.is_valid(row) {
+    if col.is_null(row) {
         // represent any null value with the string "NULL"
         Ok(NULL_STR.to_string())
     } else {
diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs
index a01ac7e2f9855..c682d081f88f4 100644
--- a/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs
+++ b/datafusion/sqllogictest/src/engines/datafusion_engine/runner.rs
@@ -15,10 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashMap;
 use std::sync::Arc;
 use std::{path::PathBuf, time::Duration};
 
-use super::{error::Result, normalize, DFSqlLogicTestError};
+use super::{DFSqlLogicTestError, error::Result, normalize};
+use crate::engines::currently_executed_sql::CurrentlyExecutingSqlTracker;
+use crate::engines::output::{DFColumnType, DFOutput};
+use crate::is_spark_path;
 use arrow::record_batch::RecordBatch;
 use async_trait::async_trait;
 use datafusion::physical_plan::common::collect;
@@ -30,24 +34,45 @@ use log::{debug, log_enabled, warn};
 use sqllogictest::DBOutput;
 use tokio::time::Instant;
 
-use crate::engines::output::{DFColumnType, DFOutput};
-use crate::is_spark_path;
-
 pub struct DataFusion {
     ctx: SessionContext,
     relative_path: PathBuf,
     pb: ProgressBar,
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    default_config: HashMap<String, Option<String>>,
 }
 
 impl DataFusion {
     pub fn new(ctx: SessionContext, relative_path: PathBuf, pb: ProgressBar) -> Self {
+        let default_config = ctx
+            .state()
+            .config()
+            .options()
+            .entries()
+            .iter()
+            .map(|e| (e.key.clone(), e.value.clone()))
+            .collect();
+
         Self {
             ctx,
             relative_path,
             pb,
+            currently_executing_sql_tracker: CurrentlyExecutingSqlTracker::default(),
+            default_config,
         }
     }
 
+    /// Add a tracker that will track the currently executed SQL statement.
+    ///
+    /// This is useful for logging and debugging purposes.
+    pub fn with_currently_executing_sql_tracker(
+        mut self,
+        currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    ) -> Self {
+        self.currently_executing_sql_tracker = currently_executing_sql_tracker;
+        self
+    }
+
     fn update_slow_count(&self) {
         let msg = self.pb.message();
         let split: Vec<&str> = msg.split(" ").collect();
@@ -79,10 +104,14 @@ impl sqllogictest::AsyncDB for DataFusion {
             );
         }
 
+        let tracked_sql = self.currently_executing_sql_tracker.set_sql(sql);
+
         let start = Instant::now();
         let result = run_query(&self.ctx, is_spark_path(&self.relative_path), sql).await;
         let duration = start.elapsed();
 
+        self.currently_executing_sql_tracker.remove_sql(tracked_sql);
+
         if duration.gt(&Duration::from_millis(500)) {
             self.update_slow_count();
         }
@@ -116,6 +145,49 @@ impl sqllogictest::AsyncDB for DataFusion {
     async fn shutdown(&mut self) {}
 }
 
+impl Drop for DataFusion {
+    fn drop(&mut self) {
+        let mut changed = false;
+
+        for e in self.ctx.state().config().options().entries() {
+            let default_entry = self.default_config.remove(&e.key);
+
+            if let Some(default_entry) = default_entry
+                && default_entry.as_ref() != e.value.as_ref()
+            {
+                if !changed {
+                    changed = true;
+                    self.pb.println(format!(
+                        "SLT file {} left modified configuration",
+                        self.relative_path.display()
+                    ));
+                }
+
+                let default = default_entry.as_deref().unwrap_or("NULL");
+                let current = e.value.as_deref().unwrap_or("NULL");
+
+                self.pb
+                    .println(format!("  {}: {} -> {}", e.key, default, current));
+            }
+        }
+
+        // Any remaining entries were present initially but removed during execution
+        for (key, value) in &self.default_config {
+            if !changed {
+                changed = true;
+                self.pb.println(format!(
+                    "SLT file {} left modified configuration",
+                    self.relative_path.display()
+                ));
+            }
+
+            let default = value.as_deref().unwrap_or("NULL");
+
+            self.pb.println(format!("  {key}: {default} -> NULL"));
+        }
+    }
+}
+
 async fn run_query(
     ctx: &SessionContext,
     is_spark_path: bool,
@@ -124,11 +196,12 @@ async fn run_query(
     let df = ctx.sql(sql.into().as_str()).await?;
     let task_ctx = Arc::new(df.task_ctx());
     let plan = df.create_physical_plan().await?;
+    let schema = plan.schema();
 
     let stream = execute_stream(plan, task_ctx)?;
     let types = normalize::convert_schema_to_types(stream.schema().fields());
     let results: Vec<RecordBatch> = collect(stream).await?;
-    let rows = normalize::convert_batches(results, is_spark_path)?;
+    let rows = normalize::convert_batches(&schema, results, is_spark_path)?;
 
     if rows.is_empty() && types.is_empty() {
         Ok(DBOutput::StatementComplete(0))
diff --git a/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/mod.rs b/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/mod.rs
new file mode 100644
index 0000000000000..9ff077c67d8c1
--- /dev/null
+++ b/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/mod.rs
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod runner;
+
+pub use runner::*;
diff --git a/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/runner.rs b/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/runner.rs
new file mode 100644
index 0000000000000..d4b4377e30875
--- /dev/null
+++ b/datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/runner.rs
@@ -0,0 +1,176 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+use std::{path::PathBuf, time::Duration};
+
+use crate::engines::currently_executed_sql::CurrentlyExecutingSqlTracker;
+use crate::engines::datafusion_engine::Result;
+use crate::engines::output::{DFColumnType, DFOutput};
+use crate::{DFSqlLogicTestError, convert_batches, convert_schema_to_types};
+use arrow::record_batch::RecordBatch;
+use async_trait::async_trait;
+use datafusion::logical_expr::LogicalPlan;
+use datafusion::physical_plan::common::collect;
+use datafusion::physical_plan::execute_stream;
+use datafusion::prelude::SessionContext;
+use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
+use datafusion_substrait::logical_plan::producer::to_substrait_plan;
+use indicatif::ProgressBar;
+use log::Level::{Debug, Info};
+use log::{debug, log_enabled, warn};
+use sqllogictest::DBOutput;
+use tokio::time::Instant;
+
+pub struct DataFusionSubstraitRoundTrip {
+    ctx: SessionContext,
+    relative_path: PathBuf,
+    pb: ProgressBar,
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+}
+
+impl DataFusionSubstraitRoundTrip {
+    pub fn new(ctx: SessionContext, relative_path: PathBuf, pb: ProgressBar) -> Self {
+        Self {
+            ctx,
+            relative_path,
+            pb,
+            currently_executing_sql_tracker: CurrentlyExecutingSqlTracker::default(),
+        }
+    }
+
+    /// Add a tracker that will track the currently executed SQL statement.
+    ///
+    /// This is useful for logging and debugging purposes.
+    pub fn with_currently_executing_sql_tracker(
+        self,
+        currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    ) -> Self {
+        Self {
+            currently_executing_sql_tracker,
+            ..self
+        }
+    }
+
+    fn update_slow_count(&self) {
+        let msg = self.pb.message();
+        let split: Vec<&str> = msg.split(" ").collect();
+        let mut current_count = 0;
+
+        if split.len() > 2 {
+            // third match will be current slow count
+            current_count = split[2].parse::<i32>().unwrap();
+        }
+
+        current_count += 1;
+
+        self.pb
+            .set_message(format!("{} - {} took > 500 ms", split[0], current_count));
+    }
+}
+
+#[async_trait]
+impl sqllogictest::AsyncDB for DataFusionSubstraitRoundTrip {
+    type Error = DFSqlLogicTestError;
+    type ColumnType = DFColumnType;
+
+    async fn run(&mut self, sql: &str) -> Result<DFOutput> {
+        if log_enabled!(Debug) {
+            debug!(
+                "[{}] Running query: \"{}\"",
+                self.relative_path.display(),
+                sql
+            );
+        }
+
+        let tracked_sql = self.currently_executing_sql_tracker.set_sql(sql);
+
+        let start = Instant::now();
+        let result = run_query_substrait_round_trip(&self.ctx, sql).await;
+        let duration = start.elapsed();
+
+        self.currently_executing_sql_tracker.remove_sql(tracked_sql);
+
+        if duration.gt(&Duration::from_millis(500)) {
+            self.update_slow_count();
+        }
+
+        self.pb.inc(1);
+
+        if log_enabled!(Info) && duration.gt(&Duration::from_secs(2)) {
+            warn!(
+                "[{}] Running query took more than 2 sec ({duration:?}): \"{sql}\"",
+                self.relative_path.display()
+            );
+        }
+
+        result
+    }
+
+    /// Engine name of current database.
+    fn engine_name(&self) -> &str {
+        "DataFusionSubstraitRoundTrip"
+    }
+
+    /// `DataFusion` calls this function to perform sleep.
+    ///
+    /// The default implementation is `std::thread::sleep`, which is universal to any async runtime
+    /// but would block the current thread. If you are running in tokio runtime, you should override
+    /// this by `tokio::time::sleep`.
+    async fn sleep(dur: Duration) {
+        tokio::time::sleep(dur).await;
+    }
+
+    async fn shutdown(&mut self) {}
+}
+
+async fn run_query_substrait_round_trip(
+    ctx: &SessionContext,
+    sql: impl Into<String>,
+) -> Result<DFOutput> {
+    let df = ctx.sql(sql.into().as_str()).await?;
+    let task_ctx = Arc::new(df.task_ctx());
+
+    let state = ctx.state();
+    let round_tripped_plan = match df.logical_plan() {
+        // Substrait does not handle these plans
+        LogicalPlan::Ddl(_)
+        | LogicalPlan::Explain(_)
+        | LogicalPlan::Dml(_)
+        | LogicalPlan::Copy(_)
+        | LogicalPlan::DescribeTable(_)
+        | LogicalPlan::Statement(_) => df.logical_plan().clone(),
+        // For any other plan, convert to Substrait
+        logical_plan => {
+            let plan = to_substrait_plan(logical_plan, &state)?;
+            from_substrait_plan(&state, &plan).await?
+        }
+    };
+
+    let physical_plan = state.create_physical_plan(&round_tripped_plan).await?;
+    let schema = physical_plan.schema();
+    let stream = execute_stream(physical_plan, task_ctx)?;
+    let types = convert_schema_to_types(stream.schema().fields());
+    let results: Vec<RecordBatch> = collect(stream).await?;
+    let rows = convert_batches(&schema, results, false)?;
+
+    if rows.is_empty() && types.is_empty() {
+        Ok(DBOutput::StatementComplete(0))
+    } else {
+        Ok(DBOutput::Rows { types, rows })
+    }
+}
diff --git a/datafusion/sqllogictest/src/engines/mod.rs b/datafusion/sqllogictest/src/engines/mod.rs
index 3569dea701761..ee2987db07593 100644
--- a/datafusion/sqllogictest/src/engines/mod.rs
+++ b/datafusion/sqllogictest/src/engines/mod.rs
@@ -17,16 +17,21 @@
 
 /// Implementation of sqllogictest for datafusion.
 mod conversion;
+mod currently_executed_sql;
 mod datafusion_engine;
+mod datafusion_substrait_roundtrip_engine;
 mod output;
 
-pub use datafusion_engine::convert_batches;
-pub use datafusion_engine::convert_schema_to_types;
 pub use datafusion_engine::DFSqlLogicTestError;
 pub use datafusion_engine::DataFusion;
+pub use datafusion_engine::convert_batches;
+pub use datafusion_engine::convert_schema_to_types;
+pub use datafusion_substrait_roundtrip_engine::DataFusionSubstraitRoundTrip;
 pub use output::DFColumnType;
 pub use output::DFOutput;
 
+pub use currently_executed_sql::CurrentlyExecutingSqlTracker;
+
 #[cfg(feature = "postgres")]
 mod postgres_engine;
 
diff --git a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs
index 68816626bf672..c3f266dcd1b62 100644
--- a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs
+++ b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs
@@ -16,6 +16,7 @@
 // under the License.
 
 use async_trait::async_trait;
+use bigdecimal::BigDecimal;
 use bytes::Bytes;
 use datafusion::common::runtime::SpawnedTask;
 use futures::{SinkExt, StreamExt};
@@ -27,16 +28,13 @@ use std::str::FromStr;
 use std::time::Duration;
 
 use super::conversion::*;
+use crate::engines::currently_executed_sql::CurrentlyExecutingSqlTracker;
 use crate::engines::output::{DFColumnType, DFOutput};
 use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
 use indicatif::ProgressBar;
 use postgres_types::Type;
-use rust_decimal::Decimal;
 use tokio::time::Instant;
-use tokio_postgres::{Column, Row};
-use types::PgRegtype;
-
-mod types;
+use tokio_postgres::{SimpleQueryMessage, SimpleQueryRow};
 
 // default connect string, can be overridden by the `PG_URL` environment variable
 const PG_URI: &str = "postgresql://postgres@127.0.0.1/test";
@@ -59,6 +57,7 @@ pub struct Postgres {
     /// Relative test file path
     relative_path: PathBuf,
     pb: ProgressBar,
+    currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
 }
 
 impl Postgres {
@@ -76,8 +75,8 @@ impl Postgres {
     ///
     /// See https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html#url for format
     pub async fn connect(relative_path: PathBuf, pb: ProgressBar) -> Result<Self> {
-        let uri =
-            std::env::var("PG_URI").map_or(PG_URI.to_string(), std::convert::identity);
+        let uri = std::env::var("PG_URI")
+            .map_or_else(|_| PG_URI.to_string(), std::convert::identity);
 
         info!("Using postgres connection string: {uri}");
 
@@ -93,7 +92,7 @@ impl Postgres {
 
         let spawned_task = SpawnedTask::spawn(async move {
             if let Err(e) = connection.await {
-                log::error!("Postgres connection error: {:?}", e);
+                log::error!("Postgres connection error: {e:?}");
             }
         });
 
@@ -118,9 +117,34 @@ impl Postgres {
             spawned_task: Some(spawned_task),
             relative_path,
             pb,
+            currently_executing_sql_tracker: CurrentlyExecutingSqlTracker::default(),
         })
     }
 
+    /// Creates a runner for executing queries against an existing postgres connection
+    /// with a tracker for currently executing SQL statements.
+    pub async fn connect_with_tracked_sql(
+        relative_path: PathBuf,
+        pb: ProgressBar,
+        currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    ) -> Result<Self> {
+        let conn = Self::connect(relative_path, pb).await?;
+        Ok(conn.with_currently_executing_sql_tracker(currently_executing_sql_tracker))
+    }
+
+    /// Add a tracker that will track the currently executed SQL statement.
+    ///
+    /// This is useful for logging and debugging purposes.
+    pub fn with_currently_executing_sql_tracker(
+        self,
+        currently_executing_sql_tracker: CurrentlyExecutingSqlTracker,
+    ) -> Self {
+        Self {
+            currently_executing_sql_tracker,
+            ..self
+        }
+    }
+
     fn get_client(&mut self) -> &mut tokio_postgres::Client {
         self.client.as_mut().expect("client is shutdown")
     }
@@ -242,6 +266,8 @@ impl sqllogictest::AsyncDB for Postgres {
             sql
         );
 
+        let tracked_sql = self.currently_executing_sql_tracker.set_sql(sql);
+
         let lower_sql = sql.trim_start().to_ascii_lowercase();
 
         let is_query_sql = {
@@ -258,16 +284,32 @@ impl sqllogictest::AsyncDB for Postgres {
 
         if lower_sql.starts_with("copy") {
             self.pb.inc(1);
-            return self.run_copy_command(sql).await;
+            let result = self.run_copy_command(sql).await;
+            self.currently_executing_sql_tracker.remove_sql(tracked_sql);
+
+            return result;
         }
 
         if !is_query_sql {
             self.get_client().execute(sql, &[]).await?;
+            self.currently_executing_sql_tracker.remove_sql(tracked_sql);
             self.pb.inc(1);
             return Ok(DBOutput::StatementComplete(0));
         }
+        // Use a prepared statement to get the output column types
+        let statement = self.get_client().prepare(sql).await?;
+        let types: Vec<Type> = statement
+            .columns()
+            .iter()
+            .map(|c| c.type_().clone())
+            .collect();
+
+        // Run the actual query using the "simple query" protocol that returns all
+        // rows as text. Doing this avoids having to convert values from the binary
+        // format to strings, which is somewhat tricky for numeric types.
+        // See https://github.com/apache/datafusion/pull/19666#discussion_r2668090587
         let start = Instant::now();
-        let rows = self.get_client().query(sql, &[]).await?;
+        let messages = self.get_client().simple_query(sql).await?;
         let duration = start.elapsed();
 
         if duration.gt(&Duration::from_millis(500)) {
@@ -276,28 +318,16 @@ impl sqllogictest::AsyncDB for Postgres {
 
         self.pb.inc(1);
 
-        let types: Vec<Type> = if rows.is_empty() {
-            self.get_client()
-                .prepare(sql)
-                .await?
-                .columns()
-                .iter()
-                .map(|c| c.type_().clone())
-                .collect()
-        } else {
-            rows[0]
-                .columns()
-                .iter()
-                .map(|c| c.type_().clone())
-                .collect()
-        };
+        self.currently_executing_sql_tracker.remove_sql(tracked_sql);
+
+        let rows = convert_rows(&types, &messages);
 
         if rows.is_empty() && types.is_empty() {
             Ok(DBOutput::StatementComplete(0))
         } else {
             Ok(DBOutput::Rows {
                 types: convert_types(types),
-                rows: convert_rows(rows),
+                rows,
             })
         }
     }
@@ -316,58 +346,68 @@ impl sqllogictest::AsyncDB for Postgres {
     }
 }
 
-fn convert_rows(rows: Vec<Row>) -> Vec<Vec<String>> {
-    rows.iter()
+fn convert_rows(types: &[Type], messages: &[SimpleQueryMessage]) -> Vec<Vec<String>> {
+    messages
+        .iter()
+        .filter_map(|message| match message {
+            SimpleQueryMessage::Row(row) => Some(row),
+            _ => None,
+        })
         .map(|row| {
-            row.columns()
+            types
                 .iter()
                 .enumerate()
-                .map(|(idx, column)| cell_to_string(row, column, idx))
+                .map(|(idx, column_type)| cell_to_string(row, column_type, idx))
                 .collect::<Vec<String>>()
         })
         .collect::<Vec<_>>()
 }
 
-macro_rules! make_string {
-    ($row:ident, $idx:ident, $t:ty) => {{
-        let value: Option<$t> = $row.get($idx);
-        match value {
-            Some(value) => value.to_string(),
-            None => NULL_STR.to_string(),
+fn cell_to_string(row: &SimpleQueryRow, column_type: &Type, idx: usize) -> String {
+    // simple_query returns text values, so we parse by Postgres type to keep
+    // normalization aligned with the DataFusion engine output.
+    let value = row.get(idx);
+    match (column_type, value) {
+        (_, None) => NULL_STR.to_string(),
+        (&Type::CHAR, Some(value)) => value
+            .as_bytes()
+            .first()
+            .map(|byte| (*byte as i8).to_string())
+            .unwrap_or_else(|| NULL_STR.to_string()),
+        (&Type::INT2, Some(value)) => value.parse::<i16>().unwrap().to_string(),
+        (&Type::INT4, Some(value)) => value.parse::<i32>().unwrap().to_string(),
+        (&Type::INT8, Some(value)) => value.parse::<i64>().unwrap().to_string(),
+        (&Type::NUMERIC, Some(value)) => {
+            decimal_to_str(BigDecimal::from_str(value).unwrap())
         }
-    }};
-    ($row:ident, $idx:ident, $t:ty, $convert:ident) => {{
-        let value: Option<$t> = $row.get($idx);
-        match value {
-            Some(value) => $convert(value).to_string(),
-            None => NULL_STR.to_string(),
+        // Parse date/time strings explicitly to avoid locale-specific formatting.
+        (&Type::DATE, Some(value)) => NaiveDate::parse_from_str(value, "%Y-%m-%d")
+            .unwrap()
+            .to_string(),
+        (&Type::TIME, Some(value)) => NaiveTime::parse_from_str(value, "%H:%M:%S%.f")
+            .unwrap()
+            .to_string(),
+        (&Type::TIMESTAMP, Some(value)) => {
+            let parsed = NaiveDateTime::parse_from_str(value, "%Y-%m-%d %H:%M:%S%.f")
+                .or_else(|_| NaiveDateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%S%.f"))
+                .unwrap();
+            format!("{parsed:?}")
         }
-    }};
-}
-
-fn cell_to_string(row: &Row, column: &Column, idx: usize) -> String {
-    match column.type_().clone() {
-        Type::CHAR => make_string!(row, idx, i8),
-        Type::INT2 => make_string!(row, idx, i16),
-        Type::INT4 => make_string!(row, idx, i32),
-        Type::INT8 => make_string!(row, idx, i64),
-        Type::NUMERIC => make_string!(row, idx, Decimal, decimal_to_str),
-        Type::DATE => make_string!(row, idx, NaiveDate),
-        Type::TIME => make_string!(row, idx, NaiveTime),
-        Type::TIMESTAMP => {
-            let value: Option<NaiveDateTime> = row.get(idx);
-            value
-                .map(|d| format!("{d:?}"))
-                .unwrap_or_else(|| "NULL".to_string())
+        (&Type::BOOL, Some(value)) => {
+            let parsed = match value {
+                "t" | "true" | "TRUE" => true,
+                "f" | "false" | "FALSE" => false,
+                _ => panic!("Unsupported boolean value: {value}"),
+            };
+            bool_to_str(parsed)
         }
-        Type::BOOL => make_string!(row, idx, bool, bool_to_str),
-        Type::BPCHAR | Type::VARCHAR | Type::TEXT => {
-            make_string!(row, idx, &str, varchar_to_str)
+        (&Type::BPCHAR | &Type::VARCHAR | &Type::TEXT, Some(value)) => {
+            varchar_to_str(value)
         }
-        Type::FLOAT4 => make_string!(row, idx, f32, f32_to_str),
-        Type::FLOAT8 => make_string!(row, idx, f64, f64_to_str),
-        Type::REGTYPE => make_string!(row, idx, PgRegtype),
-        _ => unimplemented!("Unsupported type: {}", column.type_().name()),
+        (&Type::FLOAT4, Some(value)) => f32_to_str(value.parse::<f32>().unwrap()),
+        (&Type::FLOAT8, Some(value)) => f64_to_str(value.parse::<f64>().unwrap()),
+        (&Type::REGTYPE, Some(value)) => value.to_string(),
+        _ => unimplemented!("Unsupported type: {}", column_type.name()),
     }
 }
 
diff --git a/datafusion/sqllogictest/src/filters.rs b/datafusion/sqllogictest/src/filters.rs
index 44482236f7c5b..568fa3f66676e 100644
--- a/datafusion/sqllogictest/src/filters.rs
+++ b/datafusion/sqllogictest/src/filters.rs
@@ -120,10 +120,10 @@ pub fn should_skip_record<D: AsyncDB>(
         if !loc.file().contains(&filter.file_substring) {
             continue;
         }
-        if let Some(line_num) = filter.line_number {
-            if loc.line() != line_num {
-                continue;
-            }
+        if let Some(line_num) = filter.line_number
+            && loc.line() != line_num
+        {
+            continue;
         }
 
         // This filter matches both file name substring and the exact
@@ -142,12 +142,11 @@ fn statement_is_skippable(statement: &Statement) -> bool {
 
     // Cannot skip SELECT INTO statements, as they can also create tables
     // that further test cases will use.
-    if let SqlStatement::Query(v) = sql_stmt.as_ref() {
-        if let SetExpr::Select(v) = v.body.as_ref() {
-            if v.into.is_some() {
-                return false;
-            }
-        }
+    if let SqlStatement::Query(v) = sql_stmt.as_ref()
+        && let SetExpr::Select(v) = v.body.as_ref()
+        && v.into.is_some()
+    {
+        return false;
     }
 
     // Only SELECT and EXPLAIN statements can be skipped, as any other
diff --git a/datafusion/sqllogictest/src/lib.rs b/datafusion/sqllogictest/src/lib.rs
index cf190f1a93cad..bb12c58bdcc20 100644
--- a/datafusion/sqllogictest/src/lib.rs
+++ b/datafusion/sqllogictest/src/lib.rs
@@ -19,7 +19,7 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
@@ -27,13 +27,16 @@
 //! DataFusion sqllogictest driver
 
 mod engines;
+mod test_file;
 
-pub use engines::convert_batches;
-pub use engines::convert_schema_to_types;
+pub use engines::CurrentlyExecutingSqlTracker;
 pub use engines::DFColumnType;
 pub use engines::DFOutput;
 pub use engines::DFSqlLogicTestError;
 pub use engines::DataFusion;
+pub use engines::DataFusionSubstraitRoundTrip;
+pub use engines::convert_batches;
+pub use engines::convert_schema_to_types;
 
 #[cfg(feature = "postgres")]
 pub use engines::Postgres;
@@ -44,4 +47,5 @@ mod util;
 
 pub use filters::*;
 pub use test_context::TestContext;
+pub use test_file::TestFile;
 pub use util::*;
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index 143e3ef1a89ba..8bd0cabcb05b0 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -15,11 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::any::Any;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::Write;
 use std::path::Path;
 use std::sync::Arc;
+use std::vec;
 
 use arrow::array::{
     Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray,
@@ -29,22 +31,28 @@ use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit, UnionFields};
 use arrow::record_batch::RecordBatch;
 use datafusion::catalog::{
-    CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session,
+    CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, Session,
+};
+use datafusion::common::{DataFusionError, Result, not_impl_err};
+use datafusion::functions::math::abs;
+use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
+use datafusion::logical_expr::{
+    ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
+    Volatility, create_udf,
 };
-use datafusion::common::DataFusionError;
-use datafusion::logical_expr::{create_udf, ColumnarValue, Expr, ScalarUDF, Volatility};
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
 use datafusion::{
     datasource::{MemTable, TableProvider, TableType},
     prelude::{CsvReadOptions, SessionContext},
 };
+use datafusion_spark::SessionStateBuilderSpark;
 
 use crate::is_spark_path;
 use async_trait::async_trait;
 use datafusion::common::cast::as_float64_array;
-use datafusion::execution::runtime_env::RuntimeEnv;
 use datafusion::execution::SessionStateBuilder;
+use datafusion::execution::runtime_env::RuntimeEnv;
 use log::info;
 use tempfile::TempDir;
 
@@ -74,22 +82,26 @@ impl TestContext {
             // hardcode target partitions so plans are deterministic
             .with_target_partitions(4);
         let runtime = Arc::new(RuntimeEnv::default());
-        let mut state = SessionStateBuilder::new()
+
+        let mut state_builder = SessionStateBuilder::new()
             .with_config(config)
             .with_runtime_env(runtime)
-            .with_default_features()
-            .build();
+            .with_default_features();
 
         if is_spark_path(relative_path) {
-            info!("Registering Spark functions");
-            datafusion_spark::register_all(&mut state)
-                .expect("Can not register Spark functions");
+            state_builder = state_builder.with_spark_features();
         }
 
+        let state = state_builder.build();
+
         let mut test_ctx = TestContext::new(SessionContext::new_with_state(state));
 
         let file_name = relative_path.file_name().unwrap().to_str().unwrap();
         match file_name {
+            "cte_quoted_reference.slt" => {
+                info!("Registering strict catalog provider for CTE tests");
+                register_strict_orders_catalog(test_ctx.session_ctx());
+            }
             "information_schema_table_types.slt" => {
                 info!("Registering local temporary table");
                 register_temp_table(test_ctx.session_ctx()).await;
@@ -133,6 +145,10 @@ impl TestContext {
                 info!("Registering table with union column");
                 register_union_table(test_ctx.session_ctx())
             }
+            "async_udf.slt" => {
+                info!("Registering dummy async udf");
+                register_async_abs_udf(test_ctx.session_ctx())
+            }
             _ => {
                 info!("Using default SessionContext");
             }
@@ -161,6 +177,104 @@ impl TestContext {
     }
 }
 
+// ==============================================================================
+// Strict Catalog / Schema Provider (sqllogictest-only)
+// ==============================================================================
+//
+// The goal of `cte_quoted_reference.slt` is to exercise end-to-end query planning
+// while detecting *unexpected* catalog lookups.
+//
+// Specifically, if DataFusion incorrectly treats a CTE reference (e.g. `"barbaz"`)
+// as a real table reference, the planner will attempt to resolve it through the
+// schema provider. The types below deliberately `panic!` on any lookup other than
+// the one table we expect (`orders`).
+//
+// This makes the "extra provider lookup" bug observable in an end-to-end test,
+// rather than being silently ignored by default providers that return `Ok(None)`
+// for unknown tables.
+
+#[derive(Debug)]
+struct StrictOrdersCatalog {
+    schema: Arc<dyn SchemaProvider>,
+}
+
+impl CatalogProvider for StrictOrdersCatalog {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema_names(&self) -> Vec<String> {
+        vec!["public".to_string()]
+    }
+
+    fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
+        (name == "public").then(|| Arc::clone(&self.schema))
+    }
+}
+
+#[derive(Debug)]
+struct StrictOrdersSchema {
+    orders: Arc<dyn TableProvider>,
+}
+
+#[async_trait]
+impl SchemaProvider for StrictOrdersSchema {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn table_names(&self) -> Vec<String> {
+        vec!["orders".to_string()]
+    }
+
+    async fn table(
+        &self,
+        name: &str,
+    ) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError> {
+        match name {
+            "orders" => Ok(Some(Arc::clone(&self.orders))),
+            other => panic!(
+                "unexpected table lookup: {other}. This maybe indicates a CTE reference was \
+                 incorrectly treated as a catalog table reference."
+            ),
+        }
+    }
+
+    fn table_exist(&self, name: &str) -> bool {
+        name == "orders"
+    }
+}
+
+fn register_strict_orders_catalog(ctx: &SessionContext) {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "order_id",
+        DataType::Int32,
+        false,
+    )]));
+
+    let batch = RecordBatch::try_new(
+        Arc::clone(&schema),
+        vec![Arc::new(Int32Array::from(vec![1, 2]))],
+    )
+    .expect("record batch should be valid");
+
+    let orders =
+        MemTable::try_new(schema, vec![vec![batch]]).expect("memtable should be valid");
+
+    let schema_provider: Arc<dyn SchemaProvider> = Arc::new(StrictOrdersSchema {
+        orders: Arc::new(orders),
+    });
+
+    // Override the default "datafusion" catalog for this test file so that any
+    // unexpected lookup is caught immediately.
+    ctx.register_catalog(
+        "datafusion",
+        Arc::new(StrictOrdersCatalog {
+            schema: schema_provider,
+        }),
+    );
+}
+
 #[cfg(feature = "avro")]
 pub async fn register_avro_tables(ctx: &mut TestContext) {
     use datafusion::prelude::AvroReadOptions;
@@ -235,7 +349,7 @@ pub async fn register_temp_table(ctx: &SessionContext) {
 
     #[async_trait]
     impl TableProvider for TestTable {
-        fn as_any(&self) -> &dyn std::any::Any {
+        fn as_any(&self) -> &dyn Any {
             self
         }
 
@@ -426,14 +540,15 @@ fn create_example_udf() -> ScalarUDF {
 
 fn register_union_table(ctx: &SessionContext) {
     let union = UnionArray::try_new(
-        UnionFields::new(
+        UnionFields::try_new(
             // typeids: 3 for int, 1 for string
             vec![3, 1],
             vec![
                 Field::new("int", DataType::Int32, false),
                 Field::new("string", DataType::Utf8, false),
             ],
-        ),
+        )
+        .unwrap(),
         ScalarBuffer::from(vec![3, 1, 3]),
         None,
         vec![
@@ -458,3 +573,48 @@ fn register_union_table(ctx: &SessionContext) {
 
     ctx.register_batch("union_table", batch).unwrap();
 }
+
+fn register_async_abs_udf(ctx: &SessionContext) {
+    #[derive(Debug, PartialEq, Eq, Hash)]
+    struct AsyncAbs {
+        inner_abs: Arc<ScalarUDF>,
+    }
+    impl AsyncAbs {
+        fn new() -> Self {
+            AsyncAbs { inner_abs: abs() }
+        }
+    }
+    impl ScalarUDFImpl for AsyncAbs {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn name(&self) -> &str {
+            "async_abs"
+        }
+
+        fn signature(&self) -> &Signature {
+            self.inner_abs.signature()
+        }
+
+        fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+            self.inner_abs.return_type(arg_types)
+        }
+
+        fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+            not_impl_err!("{} can only be called from async contexts", self.name())
+        }
+    }
+    #[async_trait]
+    impl AsyncScalarUDFImpl for AsyncAbs {
+        async fn invoke_async_with_args(
+            &self,
+            args: ScalarFunctionArgs,
+        ) -> Result<ColumnarValue> {
+            return self.inner_abs.invoke_with_args(args);
+        }
+    }
+    let async_abs = AsyncAbs::new();
+    let udf = AsyncScalarUDF::new(Arc::new(async_abs));
+    ctx.register_udf(udf.into_scalar_udf());
+}
diff --git a/datafusion/sqllogictest/src/test_file.rs b/datafusion/sqllogictest/src/test_file.rs
new file mode 100644
index 0000000000000..c44cae133639b
--- /dev/null
+++ b/datafusion/sqllogictest/src/test_file.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::ffi::OsStr;
+use std::path::{Path, PathBuf};
+use std::sync::LazyLock;
+
+/// Represents a parsed test file
+///
+/// Note there is a custom Ord implementation that sorts test files by:
+/// 1. Hard coded test priority (lower runs first),
+/// 2. Relative path as deterministic tie-breaker.
+#[derive(Debug, PartialEq, Eq)]
+pub struct TestFile {
+    /// The absolute path to the file
+    pub path: PathBuf,
+    /// The relative path of the file (used for display)
+    pub relative_path: PathBuf,
+}
+
+impl TestFile {
+    /// Create a new [`TestFile`] from the given path, stripping any of the
+    /// known test directory prefixes for the relative path.
+    pub fn new(path: PathBuf, prefixes: &[&str]) -> Self {
+        let p = path.to_string_lossy();
+        for prefix in prefixes {
+            if p.starts_with(prefix) {
+                let relative_path = PathBuf::from(p.strip_prefix(prefix).unwrap());
+                return Self {
+                    path,
+                    relative_path,
+                };
+            }
+        }
+        let relative_path = PathBuf::from("");
+
+        Self {
+            path,
+            relative_path,
+        }
+    }
+
+    /// Returns true if the file has a .slt extension, indicating it is a sqllogictest file.
+    pub fn is_slt_file(&self) -> bool {
+        self.path.extension() == Some(OsStr::new("slt"))
+    }
+
+    /// Returns true if the relative path starts with the given prefix, which
+    /// can be used to filter tests by subdirectory or filename patterns.
+    pub fn relative_path_starts_with(&self, prefix: impl AsRef<Path>) -> bool {
+        self.relative_path.starts_with(prefix)
+    }
+}
+
+impl PartialOrd for TestFile {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for TestFile {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let self_path = &self.relative_path;
+        let other_path = &other.relative_path;
+
+        let priority_self = TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY);
+        let priority_other = TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY);
+
+        priority_self
+            .cmp(priority_other)
+            .then_with(|| self_path.cmp(other_path)) // Tie-breaker: lexicographic order of relative paths.
+            // Final tie-breaker keeps Ord consistent with Eq when relative paths collide.
+            .then_with(|| self.path.cmp(&other.path))
+    }
+}
+
+/// TEST PRIORITY
+///
+/// Heuristically prioritize some test to run earlier.
+///
+/// Prioritizes test to run earlier if they are known to be long running (as
+/// each test file itself is run sequentially, but multiple test files are run
+/// in parallel.
+///
+/// Tests not listed here will run after the listed tests in deterministic
+/// lexicographic order by relative path.
+///
+/// You can find the top longest running tests by running `--timing-summary`
+/// mode. For example
+///
+/// ```shell
+/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
+/// ...
+/// Per-file elapsed summary (deterministic):
+/// 1.    3.568s  aggregate.slt
+/// 2.    3.464s  joins.slt
+/// 3.    3.336s  imdb.slt
+/// 4.    3.085s  push_down_filter_regression.slt
+/// 5.    2.926s  aggregate_skip_partial.slt
+/// 6.    2.453s  array.slt
+/// 7.    2.399s  window.slt
+/// 8.    2.198s  group_by.slt
+/// 9.    1.281s  clickbench.slt
+/// 10.    1.058s  datetime/timestamps.slt
+/// ```
+const TEST_PRIORITY_ENTRIES: &[&str] = &[
+    "aggregate.slt", //  longest-running files go first
+    "joins.slt",
+    "imdb.slt",
+    "push_down_filter_regression.slt",
+    "aggregate_skip_partial.slt",
+    "array.slt",
+    "window.slt",
+    "group_by.slt",
+    "clickbench.slt",
+    "datetime/timestamps.slt",
+];
+
+/// Default priority for tests not in the priority map. Tests with lower
+/// priority values run first.
+const DEFAULT_PRIORITY: usize = 100;
+
+static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
+    TEST_PRIORITY_ENTRIES
+        .iter()
+        .enumerate()
+        .map(|(priority, path)| (PathBuf::from(path), priority))
+        .collect()
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn prioritized_files_are_first() {
+        let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"];
+        input.extend(TEST_PRIORITY_ENTRIES.iter());
+        input.push("q_unlisted.slt");
+
+        let mut sorted = to_test_files(input);
+        sorted.sort_unstable();
+
+        println!("Sorted input: {sorted:?}");
+
+        // the prioritized files should be first, in the order specified by TEST_PRIORITY_ENTRIES
+        for file in sorted.iter().take(TEST_PRIORITY_ENTRIES.len()) {
+            assert!(
+                TEST_PRIORITY.contains_key(&file.relative_path),
+                "Expected prioritized file {file:?} not found in input {sorted:?}"
+            );
+        }
+        // last three files should be the unlisted ones in deterministic order
+        let expected_files =
+            to_test_files(["a_unlisted.slt", "q_unlisted.slt", "z_unlisted.slt"]);
+        assert!(
+            sorted.ends_with(&expected_files),
+            "Expected unlisted files {expected_files:?} at the end in deterministic order of {sorted:?}"
+        );
+    }
+
+    fn to_test_files<'a>(files: impl IntoIterator<Item = &'a str>) -> Vec<TestFile> {
+        files
+            .into_iter()
+            .map(|f| TestFile {
+                path: PathBuf::from(f),
+                relative_path: PathBuf::from(f),
+            })
+            .collect()
+    }
+}
diff --git a/datafusion/sqllogictest/src/util.rs b/datafusion/sqllogictest/src/util.rs
index 695fe463fa676..b0cf32266ea31 100644
--- a/datafusion/sqllogictest/src/util.rs
+++ b/datafusion/sqllogictest/src/util.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::common::{exec_datafusion_err, Result};
+use datafusion::common::{Result, exec_datafusion_err};
 use itertools::Itertools;
 use log::Level::Warn;
 use log::{info, log_enabled, warn};
@@ -44,7 +44,7 @@ pub fn setup_scratch_dir(name: &Path) -> Result<()> {
 /// Trailing whitespace from lines in SLT will typically be removed, but do not fail if it is not
 /// If particular test wants to cover trailing whitespace on a value,
 /// it should project additional non-whitespace column on the right.
-#[allow(clippy::ptr_arg)]
+#[expect(clippy::ptr_arg)]
 pub fn value_normalizer(s: &String) -> String {
     s.trim_end().to_string()
 }
@@ -82,6 +82,10 @@ pub fn df_value_validator(
     actual: &[Vec<String>],
     expected: &[String],
 ) -> bool {
+    // Support ignore marker <slt:ignore> to skip volatile parts of output.
+    const IGNORE_MARKER: &str = "<slt:ignore>";
+    let contains_ignore_marker = expected.iter().any(|line| line.contains(IGNORE_MARKER));
+
     let normalized_expected = expected.iter().map(normalizer).collect::<Vec<_>>();
     let normalized_actual = actual
         .iter()
@@ -89,13 +93,39 @@ pub fn df_value_validator(
         .map(|str| str.trim_end().to_string())
         .collect_vec();
 
+    // If ignore marker present, perform fragment-based matching on the full snapshot.
+    if contains_ignore_marker {
+        let expected_snapshot = normalized_expected.join("\n");
+        let actual_snapshot = normalized_actual.join("\n");
+        let fragments: Vec<&str> = expected_snapshot.split(IGNORE_MARKER).collect();
+        let mut pos = 0;
+        for (i, frag) in fragments.iter().enumerate() {
+            if frag.is_empty() {
+                continue;
+            }
+            if let Some(idx) = actual_snapshot[pos..].find(frag) {
+                // Edge case: The following example is expected to fail
+                // Actual - 'foo bar baz'
+                // Expected - 'bar <slt:ignore>'
+                if (i == 0) && (idx != 0) {
+                    return false;
+                }
+
+                pos += idx + frag.len();
+            } else {
+                return false;
+            }
+        }
+        return true;
+    }
+
     if log_enabled!(Warn) && normalized_actual != normalized_expected {
         warn!("df validation failed. actual vs expected:");
         for i in 0..normalized_actual.len() {
             warn!("[{i}] {}<eol>", normalized_actual[i]);
             warn!(
                 "[{i}] {}<eol>",
-                if normalized_expected.len() >= i {
+                if normalized_expected.len() > i {
                     &normalized_expected[i]
                 } else {
                     "No more results"
@@ -110,3 +140,20 @@ pub fn df_value_validator(
 pub fn is_spark_path(relative_path: &Path) -> bool {
     relative_path.starts_with("spark/")
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Validation should fail for the below case:
+    // Actual - 'foo bar baz'
+    // Expected - 'bar <slt:ignore>'
+    #[test]
+    fn ignore_marker_does_not_skip_leading_text() {
+        // Actual snapshot contains unexpected prefix before the expected fragment.
+        let actual = vec![vec!["foo bar baz".to_string()]];
+        let expected = vec!["bar <slt:ignore>".to_string()];
+
+        assert!(!df_value_validator(value_normalizer, &actual, &expected));
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/agg_func_substitute.slt b/datafusion/sqllogictest/test_files/agg_func_substitute.slt
index 9aeaaacb10718..2b33452184bc0 100644
--- a/datafusion/sqllogictest/test_files/agg_func_substitute.slt
+++ b/datafusion/sqllogictest/test_files/agg_func_substitute.slt
@@ -46,11 +46,10 @@ physical_plan
 01)ProjectionExec: expr=[a@0 as a, nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result]
 02)--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
-06)----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
-07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
+04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 
 query TT
@@ -66,11 +65,10 @@ physical_plan
 01)ProjectionExec: expr=[a@0 as a, nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result]
 02)--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
-06)----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
-07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
+04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT a, ARRAY_AGG(c ORDER BY c)[1 + 100] as result
@@ -85,11 +83,10 @@ physical_plan
 01)ProjectionExec: expr=[a@0 as a, nth_value(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result]
 02)--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
-06)----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
-07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
+04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[nth_value(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query II
 SELECT a, ARRAY_AGG(c ORDER BY c)[1] as result
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
index 52b1e1c22fdf5..cf894a494ad90 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -32,10 +32,12 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c10 BIGINT UNSIGNED NOT NULL,
   c11 FLOAT NOT NULL,
   c12 DOUBLE NOT NULL,
-  c13 VARCHAR NOT NULL
+  c13 VARCHAR NOT NULL,
+  c14 DATE NOT NULL,
+  c15 TIMESTAMP NOT NULL,
 )
 STORED AS CSV
-LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv'
 OPTIONS ('format.has_header' 'true');
 
 statement ok
@@ -127,55 +129,49 @@ CREATE TABLE group_median_table_nullable (
 # Error tests
 #######
 
+statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT SUM(c2) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+
+# WITHIN GROUP rejected for non-ordered-set UDAF
+# MIN does not implement ordered-set semantics (`supports_within_group_clause()`),
+# so the planner should reject the WITHIN GROUP syntax.
+statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT MIN(c) WITHIN GROUP (ORDER BY c) FROM (VALUES (1),(2)) as t(c);
+
+
 # https://github.com/apache/datafusion/issues/3353
 statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)"
 SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
 
 # csv_query_approx_percentile_cont_with_weight
-statement error
+statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function
 SELECT approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c1) FROM aggregate_test_100
-----
-DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from [Utf8View, Int8, Float64] to the signature OneOf([Exact([Int8, Int8, Float64]), Exact([Int16, Int16, Float64]), Exact([Int32, Int32, Float64]), Exact([Int64, Int64, Float64]), Exact([UInt8, UInt8, Float64]), Exact([UInt16, UInt16, Float64]), Exact([UInt32, UInt32, Float64]), Exact([UInt64, UInt64, Float64]), Exact([Float32, Float32, Float64]), Exact([Float64, Float64, Float64])]) failed
-
 
-statement error
+statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function
 SELECT approx_percentile_cont_with_weight(c1, 0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
-----
-DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from [Int16, Utf8View, Float64] to the signature OneOf([Exact([Int8, Int8, Float64]), Exact([Int16, Int16, Float64]), Exact([Int32, Int32, Float64]), Exact([Int64, Int64, Float64]), Exact([UInt8, UInt8, Float64]), Exact([UInt16, UInt16, Float64]), Exact([UInt32, UInt32, Float64]), Exact([UInt64, UInt64, Float64]), Exact([Float32, Float32, Float64]), Exact([Float64, Float64, Float64])]) failed
 
-
-statement error
+statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function
 SELECT approx_percentile_cont_with_weight(c2, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
-----
-DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont_with_weight' function: coercion from [Int16, Int8, Utf8View] to the signature OneOf([Exact([Int8, Int8, Float64]), Exact([Int16, Int16, Float64]), Exact([Int32, Int32, Float64]), Exact([Int64, Int64, Float64]), Exact([UInt8, UInt8, Float64]), Exact([UInt16, UInt16, Float64]), Exact([UInt32, UInt32, Float64]), Exact([UInt64, UInt64, Float64]), Exact([Float32, Float32, Float64]), Exact([Float64, Float64, Float64])]) failed
-
 
 # csv_query_approx_percentile_cont_with_histogram_bins
-statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\.
+statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\.
 SELECT c1, approx_percentile_cont(0.95, -1000) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
 
-statement error
+statement error Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function
 SELECT approx_percentile_cont(0.95, c1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
-----
-DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from [Int16, Float64, Utf8View] to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed
-
-
 
-statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Int16, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)*
+statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int16, Float64, Float64 to the signature OneOf(.*) failed(.|\n)*
 SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
 
-statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from \[Float64, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)*
+statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Float64, Float64, Float64 to the signature OneOf(.*) failed(.|\n)*
 SELECT approx_percentile_cont(0.95, 111.1) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100
 
-statement error DataFusion error: This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal
+statement error DataFusion error: Error during planning: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal
 SELECT approx_percentile_cont(c12) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100
 
-statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal
+statement error DataFusion error: Error during planning: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal
 SELECT approx_percentile_cont(0.95, c5) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100
 
-statement error DataFusion error: This feature is not implemented: Conflicting ordering requirements in aggregate functions is not supported
-SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5), approx_percentile_cont(0.2) WITHIN GROUP (ORDER BY c12) FROM aggregate_test_100
-
 statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for approx_percentile_cont
 SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5) IGNORE NULLS FROM aggregate_test_100
 
@@ -186,7 +182,7 @@ statement error DataFusion error: This feature is not implemented: Only a single
 SELECT approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c5, c12) FROM aggregate_test_100
 
 # Not supported over sliding windows
-query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause are can not be used together. OVER is for window function, whereas WITHIN GROUP is for ordered set aggregate function
+query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together. OVER is for window functions, whereas WITHIN GROUP is for ordered set aggregate functions
 SELECT approx_percentile_cont(0.5)
 WITHIN GROUP (ORDER BY c3)
 OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW)
@@ -222,6 +218,56 @@ query error Execution error: In an aggregate with DISTINCT, ORDER BY expressions
 SELECT array_agg(DISTINCT c13 ORDER BY c13, c12)
 FROM aggregate_test_100
 
+query ?? rowsort
+with tbl as (SELECT * FROM (VALUES ('xxx', 'yyy'), ('xxx', 'yyy'), ('xxx2', 'yyy2')) AS t(x, y))
+select
+    array_agg(x order by x) as x_agg,
+    array_agg(y order by y) as y_agg
+from tbl
+group by all
+----
+[xxx, xxx, xxx2] [yyy, yyy, yyy2]
+
+query ??
+SELECT
+    (SELECT array_agg(c12 ORDER BY c12) FROM aggregate_test_100),
+    (SELECT array_agg(c13 ORDER BY c13) FROM aggregate_test_100)
+----
+[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8]
+
+query ??
+SELECT
+    array_agg(c12 ORDER BY c12),
+    array_agg(c13 ORDER BY c13)
+FROM aggregate_test_100
+----
+[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8]
+
+query ?? rowsort
+with tbl as (SELECT * FROM (VALUES ('xxx', 'yyy'), ('xxx', 'yyy'), ('xxx2', 'yyy2')) AS t(x, y))
+select
+    array_agg(distinct x order by x) as x_agg,
+    array_agg(distinct y order by y) as y_agg
+from tbl
+group by all
+----
+[xxx, xxx2] [yyy, yyy2]
+
+query ??
+SELECT
+    (SELECT array_agg(DISTINCT c12 ORDER BY c12) FROM aggregate_test_100),
+    (SELECT array_agg(DISTINCT c13 ORDER BY c13) FROM aggregate_test_100)
+----
+[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8]
+
+query ??
+SELECT
+    array_agg(DISTINCT c12 ORDER BY c12),
+    array_agg(DISTINCT c13 ORDER BY c13)
+FROM aggregate_test_100
+----
+[0.01479305307777301, 0.02182578039211991, 0.03968347085780355, 0.04429073092078406, 0.047343434291126085, 0.04893135681998029, 0.0494924465469434, 0.05573662213439634, 0.05636955101974106, 0.061029375346466685, 0.07260475960924484, 0.09465635123783445, 0.12357539988406441, 0.152498292971736, 0.16301110515739792, 0.1640882545084913, 0.1754261586710173, 0.17592486905979987, 0.17909035118828576, 0.18628859265874176, 0.19113293583306745, 0.2145232647388039, 0.21535402343780985, 0.24899794314659673, 0.2537253407987472, 0.2667177795079635, 0.27159190516490006, 0.2739938529235548, 0.28534428578703896, 0.2944158618048994, 0.296036538664718, 0.3051364088814128, 0.30585375151301186, 0.3114712539863804, 0.3231750610081745, 0.32869374687050157, 0.33639590659276175, 0.3600766362333053, 0.36936304600612724, 0.38870280983958583, 0.39144436569161134, 0.40342283197779727, 0.4094218353587008, 0.40975383525297016, 0.42073125331890115, 0.4273123318932347, 0.42950521730777025, 0.4830878559436823, 0.5081765563442366, 0.5437595540422571, 0.5590205548347534, 0.5593249815276734, 0.5603062368164834, 0.560333188635217, 0.5614503754617461, 0.565352842229935, 0.574210838214554, 0.5759450483859969, 0.5773498217058918, 0.5991138115095911, 0.6009475544728957, 0.6108938307533, 0.6316565296547284, 0.6404495093354053, 0.6405262429561641, 0.6425694115212065, 0.658671129040488, 0.6668423897406515, 0.6864391962767343, 0.7035635283169166, 0.7325106678655877, 0.7328050041291218, 0.7614304100703713, 0.7631239070049998, 0.7670021786149205, 0.7697753383420857, 0.7764360990307122, 0.7784918983501654, 0.7973920072996036, 0.819715865079681, 0.8506721053047003, 0.8813167497816289, 0.8824879447595726, 0.9185813970744787, 0.9231889896940375, 0.9237877978193884, 0.9255031346434324, 0.9293883502480845, 0.9294097332465232, 0.9463098243875633, 0.946325164889271, 0.9491397432856566, 0.9567595541247681, 0.9706712283358269, 0.9723580396501548, 0.9748360509016578, 0.9800193410444061, 0.980809631269599, 0.991517828651004, 0.9965400387585364] [0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8, 3BEOHQsMEFZ58VcNTOJYShTBpAPzbt, 4HX6feIvmNXBN7XGqgO4YVBkhu8GDI, 4JznSdBajNWhu4hRQwjV1FjTTxY68i, 52mKlRE3aHCBZtjECq6sY9OqVf8Dze, 56MZa5O1hVtX4c5sbnCfxuX5kDChqI, 6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ, 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW, 6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE, 6x93sxYioWuq5c9Kkk8oTAAORM7cH0, 802bgTGl6Bk5TlkPYYTxp5JkKyaYUA, 8LIh0b6jmDGm87BmIyjdxNIpX4ugjD, 90gAtmGEeIqUTbo1ZrxCvWtsseukXC, 9UbObCsVkmYpJGcGrgfK90qOnwb2Lj, AFGCj7OWlEB5QfniEFgonMq90Tq5uH, ALuRhobVWbnQTTWZdSOk0iVe8oYFhW, Amn2K87Db5Es3dFQO9cw9cvpAM6h35, AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz, BJqx5WokrmrrezZA0dUbleMYkG5U2O, BPtQMxnuSPpxMExYV9YkDa6cAN7GP3, BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE, C2GT5KVyOPZpgKVl110TyZO0NcJ434, DuJNG8tufSqW0ZstHqWj3aGvFLMg4A, EcCuckwsF3gV1Ecgmh5v4KM8g1ozif, ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU, F7NSTjWvQJyBburN7CXRUlbgp2dIrA, Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u, H5j5ZHy1FGesOAHjkQEDYCucbpKWRu, HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g, IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr, IZTkHMLvIKuiLjhDjYMmIHxh166we4, Ig1QcuKsjHXkproePdERo2w0mYzIqd, JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ, JN0VclewmjwYlSl8386MlWv5rEhWCz, JafwVLSVk5AVoXFuzclesQ000EE2k1, KJFcmTVjdkCMv94wYCtfHMFhzyRsmH, Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn, Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV, LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW, MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ, MeSTAXq8gVxVjbEjgkvU9YLte0X9uE, NEhyk8uIx4kEULJGa8qIyFjjBcP2G6, O66j6PaYuZhEUtqV6fuU7TyjM2WxC5, OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh, OPwBqCEK5PWTjWaiOyL45u2NLTaDWv, Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0, Ow5PGpfTm4dXCfTDsXAOTatXRoAydR, QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv, QJYm7YRA3YetcBHI5wkMZeLXVmfuNy, QYlaIAnJA6r8rlAb6f59wcxvcPcWFf, RilTlL1tKkPOUFuzmLydHAVZwv1OGl, Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH, TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX, TtDKUZxzVxsq758G6AWPSYuZgVgbcl, VDhtJkYjAYPykCgOU9x3v7v3t4SO1a, VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4, Vp3gmWunM5A7wOC9YW2JroFqTWjvTi, WHmjWk2AY4c6m7DA4GitUx6nmb1yYS, XemNcT1xp61xcM1Qz3wZ1VECCnq06O, Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK, aDxBtor7Icd9C5hnTvvw5NrIre740e, akiiY5N0I44CMwEnBL6RTBk7BRkxEj, b3b9esRhTzFEawbs6XhpKnD9ojutHB, bgK1r6v3BCTh0aejJUhkA1Hn6idXGp, cBGc0kSm32ylBDnxogG727C0uhZEYZ, cq4WSAIFwx3wwTUS5bp1wCe71R6U5I, dVdvo6nUD5FgCgsbOZLds28RyGTpnx, e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG, f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX, fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG, gTpyQnEODMcpsPnJMZC66gh33i3m0b, gpo8K5qtYePve6jyPt6xgJx4YOVjms, gxfHWUF8XgY2KdFxigxvNEXe2V2XMl, i6RQVXKUh7MzuGMDaNclUYnFUAireU, ioEncce3mPOXD2hWhpZpCPWGATG6GU, jQimhdepw3GKmioWUlVSWeBVRKFkY3, l7uwDoTepWwnAP0ufqtHJS3CRi7RfP, lqhzgLsXZ8JhtpeeUWWNbMz8PHI705, m6jD0LBIQWaMfenwRCTANI9eOdyyto, mhjME0zBHbrK6NMkytMTQzOssOa1gF, mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS, nYVJnVicpGRqKZibHyBAmtmzBXAFfT, oHJMNvWuunsIMIWFnYG31RCfkOo2V7, oLZ21P2JEDooxV1pU31cIxQHEeeoLu, okOkcWflkNXIy4R8LzmySyY1EC3sYd, pLk3i59bZwd5KBZrI1FiweYTd5hteG, pTeu0WMjBRTaNRT15rLCuEh3tBJVc5, qnPOOmslCJaT45buUisMRnM0rc77EK, t6fQUjJejPcjc04wHvHTPe55S65B4V, ukOiFGGFnQJDHFgZxHMpvhD3zybF0M, ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8, waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs, wwXqSGKLyBQyPkonlzBNYUJTCo4LRS, xipQ93429ksjNcXPX5326VSg1xJZcW, y7C453hRWd4E7ImjNDWlpexB8nUqjh, ydkwycaISlYSlEq3TlkS2m15I2pcp8]
+
 statement ok
 CREATE EXTERNAL TABLE agg_order (
 c1 INT NOT NULL,
@@ -333,6 +379,59 @@ select array_sort(c1), array_sort(c2) from (
 statement ok
 drop table array_agg_distinct_list_table;
 
+# Test array_agg with DISTINCT and IGNORE NULLS (regression test for issue #19735)
+query ?
+SELECT array_sort(ARRAY_AGG(DISTINCT x IGNORE NULLS)) as result
+FROM (VALUES (1), (2), (NULL), (2), (NULL), (1)) AS t(x);
+----
+[1, 2]
+
+# Test that non-DISTINCT aggregates also preserve IGNORE NULLS when mixed with DISTINCT
+# This tests the two-phase aggregation rewrite in SingleDistinctToGroupBy
+query I?
+SELECT
+  COUNT(DISTINCT x) as distinct_count,
+  array_sort(ARRAY_AGG(y IGNORE NULLS)) as y_agg
+FROM (VALUES
+  (1, 10),
+  (1, 20),
+  (2, 30),
+  (3, NULL),
+  (3, 40),
+  (NULL, 50)
+) AS t(x, y)
+----
+3 [10, 20, 30, 40, 50]
+
+# Test that FILTER clause is preserved in two-phase aggregation rewrite
+query II
+SELECT
+  COUNT(DISTINCT x) as distinct_count,
+  SUM(y) FILTER (WHERE y > 15) as filtered_sum
+FROM (VALUES
+  (1, 10),
+  (1, 20),
+  (2, 5),
+  (2, 30),
+  (3, 25)
+) AS t(x, y)
+----
+3 75
+
+# Test that ORDER BY is preserved in two-phase aggregation rewrite
+query I?
+SELECT
+  COUNT(DISTINCT x) as distinct_count,
+  ARRAY_AGG(y ORDER BY y DESC) as ordered_agg
+FROM (VALUES
+  (1, 10),
+  (1, 30),
+  (2, 20),
+  (2, 40)
+) AS t(x, y)
+----
+2 [40, 30, 20, 10]
+
 statement error This feature is not implemented: Calling array_agg: LIMIT not supported in function arguments: 1
 SELECT array_agg(c13 LIMIT 1) FROM aggregate_test_100
 
@@ -376,32 +475,31 @@ logical_plan
 04)------SubqueryAlias: a
 05)--------Union
 06)----------Projection: Int64(1) AS id, Int64(2) AS foo
-07)------------EmptyRelation
+07)------------EmptyRelation: rows=1
 08)----------Projection: Int64(1) AS id, Int64(NULL) AS foo
-09)------------EmptyRelation
+09)------------EmptyRelation: rows=1
 10)----------Projection: Int64(1) AS id, Int64(NULL) AS foo
-11)------------EmptyRelation
+11)------------EmptyRelation: rows=1
 12)----------Projection: Int64(1) AS id, Int64(3) AS foo
-13)------------EmptyRelation
+13)------------EmptyRelation: rows=1
 14)----------Projection: Int64(1) AS id, Int64(2) AS foo
-15)------------EmptyRelation
+15)------------EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[array_length(array_agg(DISTINCT a.foo)@1) as array_length(array_agg(DISTINCT a.foo)), sum(DISTINCT Int64(1))@2 as sum(DISTINCT Int64(1))]
 02)--AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))], ordering_mode=Sorted
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5
-05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))], ordering_mode=Sorted
-06)----------UnionExec
-07)------------ProjectionExec: expr=[1 as id, 2 as foo]
-08)--------------PlaceholderRowExec
-09)------------ProjectionExec: expr=[1 as id, NULL as foo]
-10)--------------PlaceholderRowExec
-11)------------ProjectionExec: expr=[1 as id, NULL as foo]
-12)--------------PlaceholderRowExec
-13)------------ProjectionExec: expr=[1 as id, 3 as foo]
-14)--------------PlaceholderRowExec
-15)------------ProjectionExec: expr=[1 as id, 2 as foo]
-16)--------------PlaceholderRowExec
+03)----RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5
+04)------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))], ordering_mode=Sorted
+05)--------UnionExec
+06)----------ProjectionExec: expr=[1 as id, 2 as foo]
+07)------------PlaceholderRowExec
+08)----------ProjectionExec: expr=[1 as id, NULL as foo]
+09)------------PlaceholderRowExec
+10)----------ProjectionExec: expr=[1 as id, NULL as foo]
+11)------------PlaceholderRowExec
+12)----------ProjectionExec: expr=[1 as id, 3 as foo]
+13)------------PlaceholderRowExec
+14)----------ProjectionExec: expr=[1 as id, 2 as foo]
+15)------------PlaceholderRowExec
 
 
 # FIX: custom absolute values
@@ -455,6 +553,12 @@ SELECT bit_xor(distinct c5 % 2) FROM aggregate_test_100
 ----
 -2
 
+# edge case for null accumulator state fields
+query ???I
+SELECT bit_and(NULL), bit_or(NULL), bit_xor(NULL), approx_distinct(NULL) from aggregate_test_100
+----
+NULL NULL NULL 0
+
 # csv_query_covariance_1
 query R
 SELECT covar_pop(c2, c12) FROM aggregate_test_100
@@ -467,6 +571,16 @@ SELECT covar(c2, c12) FROM aggregate_test_100
 ----
 -0.079969012479
 
+query R
+SELECT covar_pop(arrow_cast(c2, 'Float16'), arrow_cast(c12, 'Float16')) FROM aggregate_test_100
+----
+-0.079163311005
+
+query R
+SELECT covar(arrow_cast(c2, 'Float16'), arrow_cast(c12, 'Float16')) FROM aggregate_test_100
+----
+-0.079962940409
+
 # single_row_query_covar_1
 query R
 select covar_samp(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq
@@ -521,7 +635,7 @@ SELECT corr(c2, c12) FROM aggregate_test_100
 query R
 select corr(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq
 ----
-0
+NULL
 
 # all_nulls_query_correlation
 query R
@@ -555,6 +669,70 @@ from data
 ----
 1
 
+# group correlation_query_with_nans_f32
+query IR
+select id, corr(f, b)
+from values
+    (1, 1, 'nan'::float),
+    (2, 'nan'::float, 1),
+    (3, 'nan'::float, null),
+    (4, null, 'nan'::float),
+    (5, 'nan'::float, 'nan'::float),
+    (5, 1, 1),
+    (5, 2, 2),
+    (6, 'nan'::float, 'nan'::float) t(id, f, b)
+group by id
+order by id
+----
+1 NULL
+2 NULL
+3 NULL
+4 NULL
+5 NaN
+6 NaN
+
+# correlation_query_with_nans_f32
+query RR
+with data as (
+    select 'nan'::float as f, 'nan'::float as b
+)
+select corr(f, b), corr('nan'::float, 'nan'::float)
+from data
+----
+NaN NaN
+
+# group correlation_query_with_nans_f64
+query IR
+select id, corr(f, b)
+from values
+    (1, 1, 'nan'::double),
+    (2, 'nan'::double, 1),
+    (3, 'nan'::double, null),
+    (4, null, 'nan'::float),
+    (5, 'nan'::double, 'nan'::double),
+    (5, 1, 1),
+    (5, 2, 2),
+    (6, 'nan'::double, 'nan'::double) t(id, f, b)
+group by id
+order by id
+----
+1 NULL
+2 NULL
+3 NULL
+4 NULL
+5 NaN
+6 NaN
+
+# correlation_query_with_nans_f64
+query RR
+with data as (
+    select 'nan'::double as f, 'nan'::double as b
+)
+select corr(f, b), corr('nan'::double, 'nan'::double)
+from data
+----
+NaN NaN
+
 # csv_query_variance_1
 query R
 SELECT var_pop(c2) FROM aggregate_test_100
@@ -585,8 +763,10 @@ SELECT var(distinct c2) FROM aggregate_test_100
 ----
 2.5
 
-statement error DataFusion error: This feature is not implemented: VAR\(DISTINCT\) aggregations are not available
+query RR
 SELECT var(c2), var(distinct c2) FROM aggregate_test_100
+----
+1.886363636364 2.5
 
 # csv_query_distinct_variance_population
 query R
@@ -594,8 +774,10 @@ SELECT var_pop(distinct c2) FROM aggregate_test_100
 ----
 2
 
-statement error DataFusion error: This feature is not implemented: VAR_POP\(DISTINCT\) aggregations are not available
+query RR
 SELECT var_pop(c2), var_pop(distinct c2) FROM aggregate_test_100
+----
+1.8675 2
 
 # csv_query_variance_5
 query R
@@ -689,10 +871,6 @@ SELECT c2, var_samp(c12) FROM aggregate_test_100 WHERE c12 > 0.90 GROUP BY c2 OR
 4 NULL
 5 0.000269544643
 
-# Use PostgresSQL dialect
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 # csv_query_stddev_12
 query IR
 SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.95) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
@@ -703,9 +881,30 @@ SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.95) FROM aggregate_test_100 GROUP
 4 NULL
 5 NULL
 
-# Restore the default dialect
 statement ok
-set datafusion.sql_parser.dialect = 'Generic';
+CREATE TABLE t (
+  a DOUBLE,
+  b BIGINT,
+  c INT
+) AS VALUES
+(1.0, 10, -5),
+(2.0, 20, -5),
+(3.0, 20, 4);
+
+# https://github.com/apache/datafusion/issues/15291
+query III
+WITH s AS (
+    SELECT
+        COUNT(a) FILTER (WHERE (b * b) - 3600 <= b),
+	COUNT(a) FILTER (WHERE (b * b) - 3000 <= b AND (c >= 0)),
+	COUNT(a) FILTER (WHERE (b * b) - 3000 <= b AND (c >= 0) AND (c >= 0))
+    FROM t
+) SELECT * FROM s
+----
+3 1 1
+
+statement ok
+DROP TABLE t
 
 # csv_query_stddev_13
 query IR
@@ -785,6 +984,13 @@ SELECT approx_median(distinct col_i8) FROM median_table
 statement error DataFusion error: This feature is not implemented: APPROX_MEDIAN\(DISTINCT\) aggregations are not available
 SELECT approx_median(col_i8), approx_median(distinct col_i8) FROM median_table
 
+# null handling clauses not supported
+query error DataFusion error: Error during planning: \[IGNORE \| RESPECT\] NULLS are not permitted for median
+SELECT median(c2) IGNORE NULLS FROM aggregate_test_100
+
+query error DataFusion error: Error during planning: \[IGNORE \| RESPECT\] NULLS are not permitted for median
+SELECT median(c2) RESPECT NULLS FROM aggregate_test_100
+
 # median_i16
 query I
 SELECT median(col_i16) FROM median_table
@@ -851,6 +1057,295 @@ SELECT approx_median(col_f64_nan) FROM median_table
 ----
 NaN
 
+
+# median_i8_overflow_negative
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(-85, 'Int8')), (arrow_cast(-56, 'Int8'))) AS t(v);
+----
+-70
+
+# median_i8_overflow_positive
+# Test overflow with positive values: 100 + 120 = 220 > 127 (max i8)
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(100, 'Int8')), (arrow_cast(120, 'Int8'))) AS t(v);
+----
+110
+
+# median_u8_overflow
+# Test unsigned overflow: 200 + 250 = 450 > 255 (max u8)
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(200, 'UInt8')), (arrow_cast(250, 'UInt8'))) AS t(v);
+----
+225
+
+# median_i8_no_overflow_normal_case
+# Normal case that doesn't overflow for comparison
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(4, 'Int8')), (arrow_cast(5, 'Int8'))) AS t(v);
+----
+4
+
+# median_i8_max_values
+# Test with both i8::MAX values: 127 + 127 = 254 > 127, overflow
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(127, 'Int8')), (arrow_cast(127, 'Int8'))) AS t(v);
+----
+127
+
+# median_i8_min_values
+# Test with both i8::MIN values: -128 + -128 = -256 < -128, underflow
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(-128, 'Int8')), (arrow_cast(-128, 'Int8'))) AS t(v);
+----
+-128
+
+# median_i8_min_max_values
+# Test with i8::MIN and i8::MAX: -128 + 127 = -1, no overflow, median = 0 (truncated from -0.5)
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(-128, 'Int8')), (arrow_cast(127, 'Int8'))) AS t(v);
+----
+0
+
+# median_u8_max_values
+# Test with both u8::MAX values: 255 + 255 = 510 > 255, overflow
+query I
+SELECT median(v) FROM (VALUES (arrow_cast(255, 'UInt8')), (arrow_cast(255, 'UInt8'))) AS t(v);
+----
+255
+
+# median_sliding_window
+statement ok
+CREATE TABLE median_window_test (
+    timestamp INT,
+    tags VARCHAR,
+    value DOUBLE
+);
+
+statement ok
+INSERT INTO median_window_test (timestamp, tags, value) VALUES
+(1, 'tag1', 10.0),
+(2, 'tag1', 20.0),
+(3, 'tag1', 30.0),
+(4, 'tag1', 40.0),
+(5, 'tag1', 50.0),
+(1, 'tag2', 60.0),
+(2, 'tag2', 70.0),
+(3, 'tag2', 80.0),
+(4, 'tag2', 90.0),
+(5, 'tag2', 100.0);
+
+query ITRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    median(value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
+    ) AS value_median_3
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 15
+2 tag1 20 20
+3 tag1 30 30
+4 tag1 40 40
+5 tag1 50 45
+1 tag2 60 65
+2 tag2 70 70
+3 tag2 80 80
+4 tag2 90 90
+5 tag2 100 95
+
+# median_non_sliding_window
+query ITRRRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    median(value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS value_median_unbounded_preceding,
+    median(value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+    ) AS value_median_unbounded_both,
+    median(value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+    ) AS value_median_unbounded_following
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10 30 30
+2 tag1 20 15 30 35
+3 tag1 30 20 30 40
+4 tag1 40 25 30 45
+5 tag1 50 30 30 50
+1 tag2 60 60 80 80
+2 tag2 70 65 80 85
+3 tag2 80 70 80 90
+4 tag2 90 75 80 95
+5 tag2 100 80 80 100
+
+###########
+# Issue #19612: Test that percentile_cont produces correct results
+# in window frame queries. Previously percentile_cont consumed its internal state
+# during evaluate(), causing incorrect results when called multiple times.
+###########
+
+# Test percentile_cont sliding window (same as median)
+query ITRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    percentile_cont(value, 0.5) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
+    ) AS value_percentile_50
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 15
+2 tag1 20 20
+3 tag1 30 30
+4 tag1 40 40
+5 tag1 50 45
+1 tag2 60 65
+2 tag2 70 70
+3 tag2 80 80
+4 tag2 90 90
+5 tag2 100 95
+
+# Test percentile_cont non-sliding window
+query ITRRRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    percentile_cont(value, 0.5) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS value_percentile_unbounded_preceding,
+    percentile_cont(value, 0.5) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+    ) AS value_percentile_unbounded_both,
+    percentile_cont(value, 0.5) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+    ) AS value_percentile_unbounded_following
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10 30 30
+2 tag1 20 15 30 35
+3 tag1 30 20 30 40
+4 tag1 40 25 30 45
+5 tag1 50 30 30 50
+1 tag2 60 60 80 80
+2 tag2 70 65 80 85
+3 tag2 80 70 80 90
+4 tag2 90 75 80 95
+5 tag2 100 80 80 100
+
+# Test percentile_cont with different percentile values
+query ITRRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    percentile_cont(value, 0.25) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS p25,
+    percentile_cont(value, 0.75) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS p75
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10 10
+2 tag1 20 12.5 17.5
+3 tag1 30 15 25
+4 tag1 40 17.5 32.5
+5 tag1 50 20 40
+1 tag2 60 60 60
+2 tag2 70 62.5 67.5
+3 tag2 80 65 75
+4 tag2 90 67.5 82.5
+5 tag2 100 70 90
+
+
+# Test distinct median non-sliding window
+query ITRR
+SELECT
+    timestamp,
+    tags,
+    value,
+    median(DISTINCT value) OVER (
+        PARTITION BY tags
+        ORDER BY timestamp
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS distinct_median
+FROM median_window_test
+ORDER BY tags, timestamp;
+----
+1 tag1 10 10
+2 tag1 20 15
+3 tag1 30 20
+4 tag1 40 25
+5 tag1 50 30
+1 tag2 60 60
+2 tag2 70 65
+3 tag2 80 70
+4 tag2 90 75
+5 tag2 100 80
+
+statement ok
+DROP TABLE median_window_test;
+
+query RT
+select approx_median(arrow_cast(col_f32, 'Float16')), arrow_typeof(approx_median(arrow_cast(col_f32, 'Float16'))) from median_table;
+----
+2.75 Float16
+
+# This shouldn't be NaN, see:
+# https://github.com/apache/datafusion/issues/18945
+query RT
+select
+  percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16')),
+  arrow_typeof(percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16')))
+from median_table;
+----
+2.75 Float16
+
+query RT
+select
+  approx_percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16')),
+  arrow_typeof(approx_percentile_cont(0.5) within group (order by arrow_cast(col_f32, 'Float16')))
+from median_table;
+----
+2.75 Float16
+
+query ?T
+select approx_median(NULL), arrow_typeof(approx_median(NULL)) from median_table;
+----
+NULL Null
+
 # median decimal
 statement ok
 create table t(c decimal(10, 4)) as values (0.0001), (0.0002), (0.0003), (0.0004), (0.0005), (0.0006);
@@ -1126,11 +1621,9 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[median(alias1)]
 05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=4
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------AggregateExec: mode=Partial, gby=[c@0 as alias1], aggr=[]
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[c@0 as alias1], aggr=[]
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 drop table t;
@@ -1273,11 +1766,23 @@ SELECT COUNT(2) FROM aggregate_test_100
 # ----
 # 100 99
 
+# csv_query_approx_count_literal_null
+query I
+SELECT approx_distinct(null)
+----
+0
+
 # csv_query_approx_count_dupe_expr_aliased
 query II
 SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS b FROM aggregate_test_100
 ----
-100 100
+99 99
+
+# csv_query_approx_count_date_timestamp
+query IIIII
+SELECT approx_distinct(c14) AS a, approx_distinct(c15) AS b, approx_distinct(arrow_cast(c15, 'Date64')), approx_distinct(arrow_cast(c15, 'Time32(Second)')) as c, approx_distinct(arrow_cast(c15, 'Time64(Nanosecond)')) AS d FROM aggregate_test_100
+----
+18 60 60 60 60
 
 ## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
 ## data, asserting the estimated quantiles are ±5% their actual values.
@@ -1303,7 +1808,7 @@ SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS b FROM aggregate_test_10
 ## Column `c12` is omitted due to a large relative error (~10%) due to the small
 ## float values.
 
-#csv_query_approx_percentile_cont (c2)
+# csv_query_approx_percentile_cont (c2)
 query B
 SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c2) AS DOUBLE) / 1.0) < 0.05) AS q FROM aggregate_test_100
 ----
@@ -1319,6 +1824,23 @@ SELECT (ABS(1 - CAST(approx_percentile_cont(0.9) WITHIN GROUP (ORDER BY c2) AS D
 ----
 true
 
+
+# csv_query_approx_percentile_cont (c2, alternate syntax, should be the same as above)
+query B
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query B
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
+query B
+SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) AS q FROM aggregate_test_100
+----
+true
+
 # csv_query_approx_percentile_cont (c3)
 query B
 SELECT (ABS(1 - CAST(approx_percentile_cont(0.1) WITHIN GROUP (ORDER BY c3) AS DOUBLE) / -95.3) < 0.05) AS q FROM aggregate_test_100
@@ -1475,6 +1997,19 @@ SELECT APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(
 ----
 NULL
 
+# percentile_cont_with_weight_with_nulls
+query I
+SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(w, 0.5) WITHIN GROUP (ORDER BY v)
+FROM (VALUES (1, 1), (2, 1), (3, 1), (4, NULL), (NULL, 1), (NULL, NULL)) as t (v, w);
+----
+2
+
+# percentile_cont_with_weight_nulls_only
+query I
+SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(1, 0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v);
+----
+NULL
+
 #
 # percentile_cont edge cases
 #
@@ -1522,11 +2057,12 @@ statement ok
 INSERT INTO t1 VALUES (TRUE);
 
 # ISSUE: https://github.com/apache/datafusion/issues/12716
-# This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN' and returns 'inf'
+# This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN'
+# With weight=0, the data point does not contribute, so result is NULL
 query R
 SELECT approx_percentile_cont_with_weight(0, 0) WITHIN GROUP (ORDER BY 'NaN'::DOUBLE) FROM t1 WHERE t1.v1;
 ----
-Infinity
+NULL
 
 statement ok
 DROP TABLE t1;
@@ -1759,6 +2295,40 @@ c 122
 d 124
 e 115
 
+
+# csv_query_approx_percentile_cont_with_weight (should be the same as above)
+query TI
+SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 73
+b 68
+c 122
+d 124
+e 115
+
+
+# using approx_percentile_cont on 2 columns with same signature
+query TII
+SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.95) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 5 73
+b 5 68
+c 5 122
+d 5 124
+e 5 115
+
+# error is unique to this UDAF
+query TRR
+SELECT c1, avg(c2) AS c2, avg(c3) AS c3 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 2.857142857143 -18.333333333333
+b 3.263157894737 -5.842105263158
+c 2.666666666667 -1.333333333333
+d 2.444444444444 25.444444444444
+e 3 40.333333333333
+
+
+
 query TI
 SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
 ----
@@ -1778,6 +2348,17 @@ c 122
 d 124
 e 115
 
+# csv_query_approx_percentile_cont_with_weight alternate syntax
+query TI
+SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 73
+b 68
+c 122
+d 124
+e 115
+
+
 query TI
 SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 DESC) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
 ----
@@ -1800,11 +2381,21 @@ e 115
 query TI
 SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
 ----
-a 74
+a 65
 b 68
-c 123
-d 124
-e 115
+c 122
+d 123
+e 110
+
+# approx_percentile_cont_with_weight with centroids
+query TI
+SELECT c1, approx_percentile_cont_with_weight(c2, 0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 65
+b 68
+c 122
+d 123
+e 110
 
 # csv_query_sum_crossjoin
 query TTI
@@ -2284,7 +2875,7 @@ drop table t;
 
 # test count with largeutf8
 statement ok
-create table t (c string) as values 
+create table t (c string) as values
   (arrow_cast('a', 'LargeUtf8')),
   (arrow_cast('b', 'LargeUtf8')),
   (arrow_cast(null, 'LargeUtf8')),
@@ -2294,10 +2885,10 @@ create table t (c string) as values
 query T
 select arrow_typeof(c) from t;
 ----
-Utf8
-Utf8
-Utf8
-Utf8
+Utf8View
+Utf8View
+Utf8View
+Utf8View
 
 query IT
 select count(c), arrow_typeof(count(c)) from t;
@@ -2564,33 +3155,143 @@ select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t;
 statement ok
 drop table t;
 
-# covariance_f64_4
+# correlation_f64_1
 statement ok
-create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0);
+create table t (c1 double, c2 double) as values (1, 4), (2, 5), (3, 6);
 
-query RT
-select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t;
+query RT rowsort
+select corr(c1, c2), arrow_typeof(corr(c1, c2)) from t;
 ----
-0.903333333333 Float64
-
-statement ok
-drop table t;
+1 Float64
 
-# covariance_f64_5
+# correlation with different numeric types (create test data)
 statement ok
-create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0);
+CREATE OR REPLACE TABLE corr_test(
+  int8_col TINYINT,
+  int16_col SMALLINT,
+  int32_col INT,
+  int64_col BIGINT,
+  uint32_col INT UNSIGNED,
+  float32_col FLOAT,
+  float64_col DOUBLE
+) as VALUES
+(1, 10, 100, 1000, 10000, 1.1, 10.1),
+(2, 20, 200, 2000, 20000, 2.2, 20.2),
+(3, 30, 300, 3000, 30000, 3.3, 30.3),
+(4, 40, 400, 4000, 40000, 4.4, 40.4),
+(5, 50, 500, 5000, 50000, 5.5, 50.5);
 
-query RT
-select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t;
+# correlation using int32 and float64
+query R
+SELECT corr(int32_col, float64_col) FROM corr_test;
 ----
-0.602222222222 Float64
+1
 
-statement ok
-drop table t;
+# correlation using int64 and int32
+query R
+SELECT corr(int64_col, int32_col) FROM corr_test;
+----
+1
 
-# covariance_f64_6
-statement ok
-create table t (c1 double, c2 double) as values (1.0, 4.0), (2.0, 5.0), (3.0, 6.0), (1.1, 4.4), (2.2, 5.5), (3.3, 6.6);
+# correlation using float32 and int8
+query R
+SELECT corr(float32_col, int8_col) FROM corr_test;
+----
+1
+
+# correlation using uint32 and int16
+query R
+SELECT corr(uint32_col, int16_col) FROM corr_test;
+----
+1
+
+# correlation with nulls
+statement ok
+CREATE OR REPLACE TABLE corr_nulls(
+  x INT,
+  y DOUBLE
+) as VALUES
+(1, 10.0),
+(2, 20.0),
+(NULL, 30.0),
+(4, NULL),
+(5, 50.0);
+
+# correlation with some nulls (should skip null pairs)
+query R
+SELECT corr(x, y) FROM corr_nulls;
+----
+1
+
+# correlation with single row (should return NULL)
+statement ok
+CREATE OR REPLACE TABLE corr_single_row(
+  x INT,
+  y DOUBLE
+) as VALUES
+(1, 10.0);
+
+query R
+SELECT corr(x, y) FROM corr_single_row;
+----
+NULL
+
+# correlation with all nulls
+statement ok
+CREATE OR REPLACE TABLE corr_all_nulls(
+  x INT,
+  y DOUBLE
+) as VALUES
+(NULL, NULL),
+(NULL, NULL);
+
+query R
+SELECT corr(x, y) FROM corr_all_nulls;
+----
+NULL
+
+statement ok
+drop table corr_test;
+
+statement ok
+drop table corr_nulls;
+
+statement ok
+drop table corr_single_row;
+
+statement ok
+drop table corr_all_nulls;
+
+# covariance_f64_4
+statement ok
+drop table if exists t;
+
+statement ok
+create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0);
+
+query RT
+select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t;
+----
+0.903333333333 Float64
+
+statement ok
+drop table t;
+
+# covariance_f64_5
+statement ok
+create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0);
+
+query RT
+select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t;
+----
+0.602222222222 Float64
+
+statement ok
+drop table t;
+
+# covariance_f64_6
+statement ok
+create table t (c1 double, c2 double) as values (1.0, 4.0), (2.0, 5.0), (3.0, 6.0), (1.1, 4.4), (2.2, 5.5), (3.3, 6.6);
 
 query RT
 select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t;
@@ -3096,6 +3797,395 @@ c 4
 d 4
 e 4
 
+#####################
+## percentile_cont tests (exact percentile calculation)
+#####################
+
+# Test error conditions for percentile_cont
+statement error DataFusion error: Error during planning: Percentile value must be between 0.0 and 1.0 inclusive
+SELECT percentile_cont(1.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: Percentile value must be between 0.0 and 1.0 inclusive
+SELECT percentile_cont(-0.1) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: Percentile value for 'PERCENTILE_CONT' must be a literal
+SELECT percentile_cont(c2) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for percentile_cont
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) IGNORE NULLS FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: \[IGNORE | RESPECT\] NULLS are not permitted for percentile_cont
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) RESPECT NULLS FROM aggregate_test_100
+
+statement error DataFusion error: This feature is not implemented: Only a single ordering expression is permitted in a WITHIN GROUP clause
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3, c2) FROM aggregate_test_100
+
+# Not supported over sliding windows
+query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together
+SELECT percentile_cont(0.5)
+WITHIN GROUP (ORDER BY c3)
+OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW)
+FROM aggregate_test_100
+
+# Test basic percentile_cont with WITHIN GROUP syntax
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+3
+
+query R
+SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+1
+
+query R
+SELECT percentile_cont(1.0) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+5
+
+# Ensure percentile_cont simplification rewrites to min/max plans
+query TT
+EXPLAIN SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[min(CAST(aggregate_test_100.c2 AS Float64)) AS percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 ASC NULLS LAST]]]
+02)--TableScan: aggregate_test_100 projection=[c2]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 ASC NULLS LAST]]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 ASC NULLS LAST]]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2], file_type=csv, has_header=true
+
+query TT
+EXPLAIN SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY c2 DESC) FROM aggregate_test_100;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[max(CAST(aggregate_test_100.c2 AS Float64)) AS percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 DESC NULLS FIRST]]]
+02)--TableScan: aggregate_test_100 projection=[c2]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 DESC NULLS FIRST]]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[percentile_cont(Float64(0)) WITHIN GROUP [aggregate_test_100.c2 DESC NULLS FIRST]]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2], file_type=csv, has_header=true
+
+query TT
+EXPLAIN SELECT percentile_cont(c2, 0.0) FROM aggregate_test_100;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[min(CAST(aggregate_test_100.c2 AS Float64)) AS percentile_cont(aggregate_test_100.c2,Float64(0))]]
+02)--TableScan: aggregate_test_100 projection=[c2]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[percentile_cont(aggregate_test_100.c2,Float64(0))]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[percentile_cont(aggregate_test_100.c2,Float64(0))]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2], file_type=csv, has_header=true
+
+query TT
+EXPLAIN SELECT percentile_cont(c2, 1.0) FROM aggregate_test_100;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[max(CAST(aggregate_test_100.c2 AS Float64)) AS percentile_cont(aggregate_test_100.c2,Float64(1))]]
+02)--TableScan: aggregate_test_100 projection=[c2]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[percentile_cont(aggregate_test_100.c2,Float64(1))]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[percentile_cont(aggregate_test_100.c2,Float64(1))]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2], file_type=csv, has_header=true
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+2
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+4
+
+# Test that percentile_cont(0.5) equals median
+query I
+SELECT median(c2) FROM aggregate_test_100
+----
+3
+
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+3
+
+# Test with descending order
+query R
+SELECT percentile_cont(0.95) WITHIN GROUP (ORDER BY c3 DESC) FROM aggregate_test_100
+----
+-101.25
+
+query R
+SELECT percentile_cont(0.05) WITHIN GROUP (ORDER BY c3 DESC) FROM aggregate_test_100
+----
+118.099998
+
+# Test with GROUP BY
+query TR
+SELECT c1, percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1
+----
+a -25
+b 17
+c 1
+d 46.5
+e 64
+
+query TR
+SELECT c1, percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1
+----
+a 65
+b 68
+c 118
+d 123.299998
+e 112
+
+# Test with NULLs
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v)
+----
+2
+
+# Test with all NULLs
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (CAST(NULL as INT))) as t (v)
+----
+NULL
+
+# Test with empty set
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1)) as t (v) WHERE v > 10
+----
+NULL
+
+# Test with single value
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (42)) as t (v)
+----
+42
+
+# Test with float values for interpolation
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v)
+----
+2.5
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v)
+----
+1.75
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1.0), (2.0), (3.0), (4.0)) as t (v)
+----
+3.25
+
+# Test with various numeric types
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c7) FROM aggregate_test_100
+----
+134.5
+
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c8) FROM aggregate_test_100
+----
+30634
+
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c11) FROM aggregate_test_100
+----
+0.4906719
+
+# Test edge case with two values (tests interpolation)
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v)
+----
+15
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v)
+----
+12.5
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (10.0), (20.0)) as t (v)
+----
+17.5
+
+# Test integer inputs requiring interpolation (should return float)
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v)
+----
+2.5
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v)
+----
+1.75
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4)) as t (v)
+----
+3.25
+
+# Test with exact percentile values (no interpolation needed)
+query R
+SELECT percentile_cont(0.0) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+1
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+2
+
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+3
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+4
+
+query R
+SELECT percentile_cont(1.0) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+5
+
+# Test with negative numbers
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v)
+----
+0
+
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v)
+----
+-5
+
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v) FROM (VALUES (-10), (-5), (0), (5), (10)) as t (v)
+----
+5
+
+# Test comparison: percentile_cont should give exact results
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
+----
+15.5
+
+# Compare with approx_percentile_cont (should be close but may not be exact)
+query B
+SELECT ABS(percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) - approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY c3)) < 5 FROM aggregate_test_100
+----
+true
+
+# Test percentile_cont without WITHIN GROUP clause (alternate syntax)
+query R
+SELECT percentile_cont(c2, 0.5) FROM aggregate_test_100
+----
+3
+
+query R
+SELECT percentile_cont(c2, 0.0) FROM aggregate_test_100
+----
+1
+
+query R
+SELECT percentile_cont(c2, 1.0) FROM aggregate_test_100
+----
+5
+
+query R
+SELECT percentile_cont(c2, 0.25) FROM aggregate_test_100
+----
+2
+
+query R
+SELECT percentile_cont(c2, 0.75) FROM aggregate_test_100
+----
+4
+
+# Verify alternate syntax gives same results as WITHIN GROUP syntax
+query B
+SELECT percentile_cont(c2, 0.5) = percentile_cont(0.5) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+----
+true
+
+query B
+SELECT percentile_cont(c3, 0.5) = percentile_cont(0.5) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100
+----
+true
+
+# Test alternate syntax with GROUP BY
+query TR
+SELECT c1, percentile_cont(c3, 0.5) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1
+----
+a -25
+b 17
+c 1
+d 46.5
+e 64
+
+# Verify alternate syntax with GROUP BY gives same results as WITHIN GROUP
+query TB
+SELECT c1, percentile_cont(c3, 0.95) = percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1
+----
+a true
+b true
+c true
+d true
+e true
+
+# Test ascending vs descending equivalence: percentile_cont(0.4) ASC should equal percentile_cont(0.6) DESC
+# This tests the mathematical property that the pth percentile ascending = (1-p)th percentile descending
+# Using a simple controlled dataset to demonstrate the property
+
+# Show 0.4 ascending
+query R
+SELECT percentile_cont(0.4) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+2.6
+
+# Show 0.6 descending (should be same as 0.4 ascending)
+query R
+SELECT percentile_cont(0.6) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (1), (2), (3), (4), (5)) as t (v)
+----
+2.6
+
+# Show 0.3 ascending
+query R
+SELECT percentile_cont(0.3) WITHIN GROUP (ORDER BY v) FROM (VALUES (10), (20), (30), (40), (50)) as t (v)
+----
+21.99999
+
+# Show 0.7 descending (should be same as 0.3 ascending)
+query R
+SELECT percentile_cont(0.7) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (10), (20), (30), (40), (50)) as t (v)
+----
+22
+
+# Show 0.25 ascending on larger dataset
+query R
+SELECT percentile_cont(0.25) WITHIN GROUP (ORDER BY v) FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) as t (v)
+----
+2.75
+
+# Show 0.75 descending (should be same as 0.25 ascending)
+query R
+SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY v DESC) FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) as t (v)
+----
+2.75
+
 # array_agg_zero
 query ?
 SELECT ARRAY_AGG([])
@@ -3934,12 +5024,22 @@ as values
 query ????
 SELECT min(column1), min(column2), min(column3), min(column4) FROM d;
 ----
-0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 0.002 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000004 secs
+0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 0.002 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000004 secs
+
+query ????
+SELECT max(column1), max(column2), max(column3), max(column4) FROM d;
+----
+0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs
+
+query ????
+SELECT avg(column1), avg(column2), avg(column3), avg(column4) FROM d;
+----
+0 days 0 hours 0 mins 6 secs 0 days 0 hours 0 mins 0.012 secs 0 days 0 hours 0 mins 0.000018 secs 0 days 0 hours 0 mins 0.000000024 secs
 
 query ????
-SELECT max(column1), max(column2), max(column3), max(column4) FROM d;
+SELECT sum(column1), sum(column2), sum(column3), sum(column4) FROM d;
 ----
-0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs
+0 days 0 hours 0 mins 12 secs 0 days 0 hours 0 mins 0.024 secs 0 days 0 hours 0 mins 0.000036 secs 0 days 0 hours 0 mins 0.000000048 secs
 
 # GROUP BY follows a different code path
 query ????I
@@ -3952,6 +5052,16 @@ SELECT max(column1), max(column2), max(column3), max(column4), column5 FROM d GR
 ----
 0 days 0 hours 0 mins 11 secs 0 days 0 hours 0 mins 0.022 secs 0 days 0 hours 0 mins 0.000033 secs 0 days 0 hours 0 mins 0.000000044 secs 1
 
+query ????I
+SELECT avg(column1), avg(column2), avg(column3), avg(column4), column5 FROM d GROUP BY column5;
+----
+0 days 0 hours 0 mins 6 secs 0 days 0 hours 0 mins 0.012 secs 0 days 0 hours 0 mins 0.000018 secs 0 days 0 hours 0 mins 0.000000024 secs 1
+
+query ????I
+SELECT sum(column1), sum(column2), sum(column3), sum(column4), column5 FROM d GROUP BY column5;
+----
+0 days 0 hours 0 mins 12 secs 0 days 0 hours 0 mins 0.024 secs 0 days 0 hours 0 mins 0.000036 secs 0 days 0 hours 0 mins 0.000000048 secs 1
+
 statement ok
 INSERT INTO d VALUES
   (arrow_cast(3, 'Duration(Second)'), arrow_cast(1, 'Duration(Millisecond)'), arrow_cast(7, 'Duration(Microsecond)'), arrow_cast(2, 'Duration(Nanosecond)'), 1),
@@ -3967,6 +5077,16 @@ SELECT min(column1), min(column2), min(column3), min(column4), column5 FROM d GR
 ----
 0 days 0 hours 0 mins 0 secs 0 days 0 hours 0 mins 0.001 secs 0 days 0 hours 0 mins 0.000003 secs 0 days 0 hours 0 mins 0.000000002 secs 1
 
+query ????I
+SELECT avg(column1), avg(column2), avg(column3), avg(column4), column5 FROM d GROUP BY column5 ORDER BY column5;
+----
+0 days 0 hours 0 mins 3 secs 0 days 0 hours 0 mins 0.008 secs 0 days 0 hours 0 mins 0.000012 secs 0 days 0 hours 0 mins 0.000000014 secs 1
+
+query ????I
+SELECT sum(column1), sum(column2), sum(column3), sum(column4), column5 FROM d GROUP BY column5 ORDER BY column5;
+----
+0 days 0 hours 0 mins 15 secs 0 days 0 hours 0 mins 0.034 secs 0 days 0 hours 0 mins 0.000048 secs 0 days 0 hours 0 mins 0.000000058 secs 1
+
 statement ok
 drop table d;
 
@@ -4257,6 +5377,50 @@ DROP VIEW binary_views
 statement ok
 DROP TABLE strings;
 
+############ FixedSizeBinary ############
+
+statement ok
+CREATE TABLE binaries
+AS VALUES
+ (X'000103', 1),
+ (X'000104', 1),
+ (X'000101', 3),
+ (X'000103', 1),
+ (X'000102', 1),
+ (NULL, 1),
+ (NULL, 4),
+ (X'000104', 1),
+ (X'000109', 2),
+ (X'000103', 1),
+ (X'000101', 2);
+
+statement ok
+CREATE VIEW fixed_size_binary_views
+AS SELECT arrow_cast(column1, 'FixedSizeBinary(3)') as value, column2 as id FROM binaries;
+
+query I?
+SELECT id, MIN(value) FROM fixed_size_binary_views GROUP BY id ORDER BY id;
+----
+1 000102
+2 000101
+3 000101
+4 NULL
+
+query I?
+SELECT id, MAX(value) FROM fixed_size_binary_views GROUP BY id ORDER BY id;
+----
+1 000104
+2 000109
+3 000101
+4 NULL
+
+statement ok
+DROP VIEW fixed_size_binary_views;
+
+statement ok
+DROP TABLE binaries;
+
+
 #################
 # End min_max on strings/binary with null values and groups
 #################
@@ -4366,10 +5530,10 @@ as values
 statement ok
 create table t as
 select
-  arrow_cast(column1, 'Timestamp(Nanosecond, None)') as nanos,
-  arrow_cast(column1, 'Timestamp(Microsecond, None)') as micros,
-  arrow_cast(column1, 'Timestamp(Millisecond, None)') as millis,
-  arrow_cast(column1, 'Timestamp(Second, None)') as secs,
+  arrow_cast(column1, 'Timestamp(ns)') as nanos,
+  arrow_cast(column1, 'Timestamp(µs)') as micros,
+  arrow_cast(column1, 'Timestamp(ms)') as millis,
+  arrow_cast(column1, 'Timestamp(s)') as secs,
   arrow_cast(column1, 'Timestamp(Nanosecond, Some("UTC"))') as nanos_utc,
   arrow_cast(column1, 'Timestamp(Microsecond, Some("UTC"))') as micros_utc,
   arrow_cast(column1, 'Timestamp(Millisecond, Some("UTC"))') as millis_utc,
@@ -4452,7 +5616,7 @@ SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag
 
 # aggregate_duration_array_agg
 query T?
-SELECT tag, array_agg(millis - arrow_cast(secs, 'Timestamp(Millisecond, None)')) FROM t GROUP BY tag ORDER BY tag;
+SELECT tag, array_agg(millis - arrow_cast(secs, 'Timestamp(ms)')) FROM t GROUP BY tag ORDER BY tag;
 ----
 X [0 days 0 hours 0 mins 0.011 secs, 0 days 0 hours 0 mins 0.123 secs]
 Y [NULL, 0 days 0 hours 0 mins 0.432 secs]
@@ -4482,9 +5646,7 @@ statement ok
 create table t as
 select
   arrow_cast(column1, 'Date32') as date32,
-  -- Workaround https://github.com/apache/arrow-rs/issues/4512 is fixed, can use this
-  -- arrow_cast(column1, 'Date64') as date64,
-  arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64,
+  arrow_cast(column1, 'Date64') as date64,
   column2 as names,
   column3 as tag
 from t_source;
@@ -4810,7 +5972,7 @@ statement ok
 create table t (c1 decimal(10, 0), c2 int) as values (null, null), (null, null), (null, null);
 
 query RTIT
-select 
+select
   sum(c1), arrow_typeof(sum(c1)),
   sum(c2), arrow_typeof(sum(c2))
 from t;
@@ -4887,10 +6049,6 @@ select c2, count(DISTINCT cast(c1 AS DECIMAL(10, 2))) from d_table GROUP BY c2 O
 A 2
 B 2
 
-# Use PostgresSQL dialect
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 # Creating the table
 statement ok
 CREATE TABLE test_table (c1 INT, c2 INT, c3 INT)
@@ -5020,33 +6178,45 @@ select c3, count(c2), avg(c2), sum(c2), min(c2), max(c2), count(c4), sum(c4) fro
 700.1 2 15.15 30.3 10.1 20.2 0 NULL
 NULL 1 10.1 10.1 10.1 10.1 0 NULL
 
-# Restore the default dialect
-statement ok
-set datafusion.sql_parser.dialect = 'Generic';
-
 ## Multiple distinct aggregates and dictionaries
 statement ok
-create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));
+create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')), (1, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));
 
 query IT
-select * from dict_test;
+select * from dict_test order by column1, column2;
 ----
+1 bar
+1 foo
 1 foo
 2 bar
 
 query II
-select count(distinct column1), count(distinct column2) from dict_test group by column1;
+select count(distinct column1), count(distinct column2) from dict_test group by column1 order by column1;
 ----
-1 1
+1 2
 1 1
 
 statement ok
 drop table dict_test;
 
+## count distinct dictionary with null values
+statement ok
+create table dict_null_test as
+    select arrow_cast(NULL, 'Dictionary(Int32, Utf8)') as d
+    from (values (1), (2), (3), (4), (5));
+
+query I
+select count(distinct d) from dict_null_test;
+----
+0
+
+statement ok
+drop table dict_null_test;
+
 # avg_duration
 
 statement ok
-create table d as values 
+create table d as values
   (arrow_cast(1, 'Duration(Second)'), arrow_cast(2, 'Duration(Millisecond)'), arrow_cast(3, 'Duration(Microsecond)'), arrow_cast(4, 'Duration(Nanosecond)'), 1),
   (arrow_cast(11, 'Duration(Second)'), arrow_cast(22, 'Duration(Millisecond)'), arrow_cast(33, 'Duration(Microsecond)'), arrow_cast(44, 'Duration(Nanosecond)'), 1);
 
@@ -5100,7 +6270,7 @@ FROM d WHERE column1 IS NOT NULL;
 
 # Centered average window function
 query I??
-SELECT column5, column1, avg(column1) OVER (ORDER BY column5 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as centered_avg 
+SELECT column5, column1, avg(column1) OVER (ORDER BY column5 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as centered_avg
 FROM d WHERE column1 IS NOT NULL;
 ----
 1 0 days 0 hours 0 mins 1 secs 0 days 0 hours 0 mins 6 secs
@@ -5168,8 +6338,10 @@ select avg(distinct x_dict) from value_dict;
 ----
 3
 
-query error
+query RR
 select avg(x_dict), avg(distinct x_dict) from value_dict;
+----
+2.625 3
 
 query I
 select min(x_dict) from value_dict;
@@ -5285,16 +6457,12 @@ logical_plan
 04)------TableScan: aggregate_test_100 projection=[c1, c3]
 physical_plan
 01)CoalescePartitionsExec: fetch=5
-02)--AggregateExec: mode=FinalPartitioned, gby=[c3@0 as c3, min(aggregate_test_100.c1)@1 as min(aggregate_test_100.c1)], aggr=[], lim=[5]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([c3@0, min(aggregate_test_100.c1)@1], 4), input_partitions=4
-05)--------AggregateExec: mode=Partial, gby=[c3@0 as c3, min(aggregate_test_100.c1)@1 as min(aggregate_test_100.c1)], aggr=[], lim=[5]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[c3@0 as c3], aggr=[min(aggregate_test_100.c1)]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([c3@0], 4), input_partitions=4
-09)----------------AggregateExec: mode=Partial, gby=[c3@1 as c3], aggr=[min(aggregate_test_100.c1)]
-10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], file_type=csv, has_header=true
+02)--AggregateExec: mode=SinglePartitioned, gby=[c3@0 as c3, min(aggregate_test_100.c1)@1 as min(aggregate_test_100.c1)], aggr=[], lim=[5]
+03)----AggregateExec: mode=FinalPartitioned, gby=[c3@0 as c3], aggr=[min(aggregate_test_100.c1)]
+04)------RepartitionExec: partitioning=Hash([c3@0], 4), input_partitions=4
+05)--------AggregateExec: mode=Partial, gby=[c3@1 as c3], aggr=[min(aggregate_test_100.c1)]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c1, c3], file_type=csv, has_header=true
 
 
 #
@@ -5319,7 +6487,7 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[5]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], file_type=csv, has_header=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c3], file_type=csv, has_header=true
 
 query I
 SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 order by c3 limit 5;
@@ -5343,7 +6511,7 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
 
 query II
 SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 order by c2, c3 limit 5 offset 4;
@@ -5375,10 +6543,9 @@ physical_plan
 07)------------AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
 08)--------------CoalescePartitionsExec
 09)----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------FilterExec: c3@1 >= 10 AND c3@1 <= 20
-12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+10)------------------FilterExec: c3@1 >= 10 AND c3@1 <= 20
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
 
 query I
 SELECT DISTINCT c3 FROM aggregate_test_100 WHERE c3 between 10 and 20 group by c3 order by c3 limit 4;
@@ -5404,7 +6571,7 @@ physical_plan
 04)------CoalescePartitionsExec
 05)--------AggregateExec: mode=Partial, gby=[c2@1 as c2, c3@2 as c3], aggr=[max(aggregate_test_100.c1)]
 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
 
 # TODO(msirek): Extend checking in LimitedDistinctAggregation equal groupings to ignore the order of columns
 # in the group-by column lists, so the limit could be pushed to the lowest AggregateExec in this case
@@ -5428,7 +6595,7 @@ physical_plan
 08)--------------CoalescePartitionsExec
 09)----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
 10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
 
 query II
 SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c3, c2 order by c3, c2 limit 3 offset 10;
@@ -5452,7 +6619,7 @@ physical_plan
 04)------CoalescePartitionsExec
 05)--------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[]
 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
 
 query II
 SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3;
@@ -5479,7 +6646,7 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], file_type=csv, has_header=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c3], file_type=csv, has_header=true
 
 statement ok
 set datafusion.optimizer.enable_distinct_aggregation_soft_limit = true;
@@ -5551,7 +6718,7 @@ NULL NULL 3 NULL 1 4 0 8 0
 
 # regr_*() basic tests
 query RRIRRRRRR
-select 
+select
     regr_slope(column2, column1),
     regr_intercept(column2, column1),
     regr_count(column2, column1),
@@ -5566,7 +6733,7 @@ from (values (1,2), (2,4), (3,6));
 2 0 3 1 2 4 2 8 4
 
 query RRIRRRRRR
-select 
+select
     regr_slope(c12, c11),
     regr_intercept(c12, c11),
     regr_count(c12, c11),
@@ -5580,11 +6747,16 @@ from aggregate_test_100;
 ----
 0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695
 
-
+query R
+select
+    regr_slope(arrow_cast(c12, 'Float16'), arrow_cast(c11, 'Float16'))
+from aggregate_test_100;
+----
+0.051477733249
 
 # regr_*() functions ignore NULLs
 query RRIRRRRRR
-select 
+select
     regr_slope(column2, column1),
     regr_intercept(column2, column1),
     regr_count(column2, column1),
@@ -5599,7 +6771,7 @@ from (values (1,NULL), (2,4), (3,6));
 2 0 2 1 2.5 5 0.5 2 1
 
 query RRIRRRRRR
-select 
+select
     regr_slope(column2, column1),
     regr_intercept(column2, column1),
     regr_count(column2, column1),
@@ -5614,7 +6786,7 @@ from (values (1,NULL), (NULL,4), (3,6));
 NULL NULL 1 NULL 3 6 0 0 0
 
 query RRIRRRRRR
-select 
+select
     regr_slope(column2, column1),
     regr_intercept(column2, column1),
     regr_count(column2, column1),
@@ -5629,8 +6801,8 @@ from (values (1,NULL), (NULL,4), (NULL,NULL));
 NULL NULL 0 NULL NULL NULL NULL NULL NULL
 
 query TRRIRRRRRR rowsort
-select 
-    column3, 
+select
+    column3,
     regr_slope(column2, column1),
     regr_intercept(column2, column1),
     regr_count(column2, column1),
@@ -5654,7 +6826,7 @@ statement ok
 set datafusion.execution.batch_size = 1;
 
 query RRIRRRRRR
-select 
+select
     regr_slope(c12, c11),
     regr_intercept(c12, c11),
     regr_count(c12, c11),
@@ -5672,7 +6844,7 @@ statement ok
 set datafusion.execution.batch_size = 2;
 
 query RRIRRRRRR
-select 
+select
     regr_slope(c12, c11),
     regr_intercept(c12, c11),
     regr_count(c12, c11),
@@ -5690,7 +6862,7 @@ statement ok
 set datafusion.execution.batch_size = 3;
 
 query RRIRRRRRR
-select 
+select
     regr_slope(c12, c11),
     regr_intercept(c12, c11),
     regr_count(c12, c11),
@@ -5874,6 +7046,136 @@ GROUP BY dummy
 ----
 text1
 
+
+# Test string_agg with ORDER BY clasuses (issue #17011)
+statement ok
+create table t (k varchar, v int);
+
+statement ok
+insert into t values ('a', 2), ('b', 3), ('c', 1), ('d', null);
+
+query T
+select string_agg(k, ',' order by k) from t;
+----
+a,b,c,d
+
+query T
+select string_agg(k, ',' order by k desc) from t;
+----
+d,c,b,a
+
+query T
+select string_agg(k, ',' order by v) from t;
+----
+c,a,b,d
+
+query T
+select string_agg(k, ',' order by v nulls first) from t;
+----
+d,c,a,b
+
+query T
+select string_agg(k, ',' order by v desc) from t;
+----
+d,b,a,c
+
+query T
+select string_agg(k, ',' order by v desc nulls last) from t;
+----
+b,a,c,d
+
+query T
+-- odd indexes should appear first, ties solved by v
+select string_agg(k, ',' order by v % 2 == 0, v) from t;
+----
+c,b,a,d
+
+query T
+-- odd indexes should appear first, ties solved by v desc
+select string_agg(k, ',' order by v % 2 == 0, v desc) from t;
+----
+b,c,a,d
+
+query T
+select string_agg(k, ',' order by
+  case
+    when k = 'a' then 3
+    when k = 'b' then 0
+    when k = 'c' then 2
+    when k = 'd' then 1
+  end)
+from t;
+----
+b,d,c,a
+
+query T
+select string_agg(k, ',' order by
+  case
+    when k = 'a' then 3
+    when k = 'b' then 0
+    when k = 'c' then 2
+    when k = 'd' then 1
+  end desc)
+from t;
+----
+a,c,d,b
+
+# Test explain / reverse_expr for string_agg
+query TT
+explain select string_agg(k, ',' order by v) from t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v ASC NULLS LAST]]]
+02)--TableScan: t projection=[k, v]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[string_agg(t.k,Utf8(",")) ORDER BY [t.v ASC NULLS LAST]]
+02)--SortExec: expr=[v@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query T
+select string_agg(k, ',' order by v) from t;
+----
+c,a,b,d
+
+query TT
+explain select string_agg(k, ',' order by v desc) from t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]]
+02)--TableScan: t projection=[k, v]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[string_agg(t.k,Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]
+02)--SortExec: expr=[v@1 DESC], preserve_partitioning=[false]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query T
+select string_agg(k, ',' order by v desc) from t;
+----
+d,b,a,c
+
+# Call string_agg with both ASC and DESC orderings, and expect only one sort
+# (because the aggregate can handle reversed inputs)
+query TT
+explain select string_agg(k, ',' order by v asc), string_agg(k, ',' order by v desc) from t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[string_agg(t.k, Utf8(",")) ORDER BY [t.v ASC NULLS LAST], string_agg(t.k, Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]]
+02)--TableScan: t projection=[k, v]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[string_agg(t.k,Utf8(",")) ORDER BY [t.v ASC NULLS LAST], string_agg(t.k,Utf8(",")) ORDER BY [t.v DESC NULLS FIRST]]
+02)--SortExec: expr=[v@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+select string_agg(k, ',' order by v asc), string_agg(k, ',' order by v desc) from t;
+----
+c,a,b,d d,b,a,c
+
+
+statement ok
+drop table t;
+
+
 # Tests for aggregating with NaN values
 statement ok
 CREATE TABLE float_table (
@@ -5888,7 +7190,7 @@ CREATE TABLE float_table (
 
 # Test string_agg with largeutf8
 statement ok
-create table string_agg_large_utf8 (c string) as values 
+create table string_agg_large_utf8 (c string) as values
   (arrow_cast('a', 'LargeUtf8')),
   (arrow_cast('b', 'LargeUtf8')),
   (arrow_cast('c', 'LargeUtf8'))
@@ -5943,7 +7245,7 @@ select count(*) from (select count(*) a, count(*) b from (select 1));
 
 # UTF8 string matters for string to &[u8] conversion, add it to prevent regression
 statement ok
-create table distinct_count_string_table as values 
+create table distinct_count_string_table as values
     (1, 'a', 'longstringtest_a', '台灣'),
     (2, 'b', 'longstringtest_b1', '日本'),
     (2, 'b', 'longstringtest_b2', '中國'),
@@ -6307,7 +7609,7 @@ physical_plan
 01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]
 02)--CoalescePartitionsExec
 03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]]
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c3], output_orderings=[[c1@0 ASC NULLS LAST], [c3@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 # test last to first
@@ -6321,7 +7623,7 @@ physical_plan
 01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]
 02)--CoalescePartitionsExec
 03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]]
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c2], output_orderings=[[c1@0 ASC NULLS LAST], [c2@1 DESC]], file_type=csv, has_header=true
 
 # test building plan with aggreagte sum
@@ -6356,11 +7658,16 @@ statement ok
 drop table employee_csv;
 
 # test null literal handling in supported aggregate functions
-query I??III?T
+query I??????T
 select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
 ----
 0 NULL NULL NULL NULL NULL NULL NULL
 
+query TTT
+SELECT arrow_typeof(bit_and(NULL)), arrow_typeof(bit_or(NULL)), arrow_typeof(bit_xor(NULL))
+----
+Null Null Null
+
 statement ok
 create table having_test(v1 int, v2 int)
 
@@ -6388,14 +7695,11 @@ logical_plan
 03)----Aggregate: groupBy=[[having_test.v1, having_test.v2]], aggr=[[max(having_test.v1)]]
 04)------TableScan: having_test projection=[v1, v2]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: max(having_test.v1)@2 = 3, projection=[v1@0, v2@1]
-03)----AggregateExec: mode=FinalPartitioned, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([v1@0, v2@1], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: max(having_test.v1)@2 = 3, projection=[v1@0, v2@1]
+02)--AggregateExec: mode=FinalPartitioned, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+03)----RepartitionExec: partitioning=Hash([v1@0, v2@1], 4), input_partitions=1
+04)------AggregateExec: mode=Partial, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 query error
@@ -6528,32 +7832,31 @@ logical_plan
 04)------SubqueryAlias: a
 05)--------Union
 06)----------Projection: Int64(1) AS id, Int64(2) AS foo
-07)------------EmptyRelation
+07)------------EmptyRelation: rows=1
 08)----------Projection: Int64(1) AS id, Int64(4) AS foo
-09)------------EmptyRelation
+09)------------EmptyRelation: rows=1
 10)----------Projection: Int64(1) AS id, Int64(5) AS foo
-11)------------EmptyRelation
+11)------------EmptyRelation: rows=1
 12)----------Projection: Int64(1) AS id, Int64(3) AS foo
-13)------------EmptyRelation
+13)------------EmptyRelation: rows=1
 14)----------Projection: Int64(1) AS id, Int64(2) AS foo
-15)------------EmptyRelation
+15)------------EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST]@1 as last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))@2 as sum(DISTINCT Int64(1))]
 02)--AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))], ordering_mode=Sorted
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5
-05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))], ordering_mode=Sorted
-06)----------UnionExec
-07)------------ProjectionExec: expr=[1 as id, 2 as foo]
-08)--------------PlaceholderRowExec
-09)------------ProjectionExec: expr=[1 as id, 4 as foo]
-10)--------------PlaceholderRowExec
-11)------------ProjectionExec: expr=[1 as id, 5 as foo]
-12)--------------PlaceholderRowExec
-13)------------ProjectionExec: expr=[1 as id, 3 as foo]
-14)--------------PlaceholderRowExec
-15)------------ProjectionExec: expr=[1 as id, 2 as foo]
-16)--------------PlaceholderRowExec
+03)----RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5
+04)------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))], ordering_mode=Sorted
+05)--------UnionExec
+06)----------ProjectionExec: expr=[1 as id, 2 as foo]
+07)------------PlaceholderRowExec
+08)----------ProjectionExec: expr=[1 as id, 4 as foo]
+09)------------PlaceholderRowExec
+10)----------ProjectionExec: expr=[1 as id, 5 as foo]
+11)------------PlaceholderRowExec
+12)----------ProjectionExec: expr=[1 as id, 3 as foo]
+13)------------PlaceholderRowExec
+14)----------ProjectionExec: expr=[1 as id, 2 as foo]
+15)------------PlaceholderRowExec
 
 # SortExec is removed if it is coming after one-row producing AggregateExec's having an empty group by expression
 query TT
@@ -6570,7 +7873,7 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(aggregate_test_100.c5)]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[c5], file_type=csv, has_header=true
 
 statement count 0
 drop table aggregate_test_100;
@@ -6682,8 +7985,9 @@ logical_plan
 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
 03)----TableScan: t projection=[]
 physical_plan
-01)ProjectionExec: expr=[2 as count(Int64(1)), 2 as count()]
-02)--PlaceholderRowExec
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(Int64(1)), count(Int64(1))@0 as count()]
+02)--ProjectionExec: expr=[2 as count(Int64(1))]
+03)----PlaceholderRowExec
 
 query II
 select count(1), count(*) from t;
@@ -6698,8 +8002,9 @@ logical_plan
 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
 03)----TableScan: t projection=[]
 physical_plan
-01)ProjectionExec: expr=[2 as count(Int64(1)), 2 as count(*)]
-02)--PlaceholderRowExec
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(Int64(1)), count(Int64(1))@0 as count(*)]
+02)--ProjectionExec: expr=[2 as count(Int64(1))]
+03)----PlaceholderRowExec
 
 query II
 select count(), count(*) from t;
@@ -6714,8 +8019,9 @@ logical_plan
 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
 03)----TableScan: t projection=[]
 physical_plan
-01)ProjectionExec: expr=[2 as count(), 2 as count(*)]
-02)--PlaceholderRowExec
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(), count(Int64(1))@0 as count(*)]
+02)--ProjectionExec: expr=[2 as count(Int64(1))]
+03)----PlaceholderRowExec
 
 query TT
 explain select count(1) * count(2) from t;
@@ -6744,21 +8050,21 @@ group0 -14
 group1 100
 
 # group median i16 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_i16) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 -16334
 group1 100
 
 # group median i32 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_i32) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 -1073741774
 group1 100
 
 # group median i64 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_i64) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 -4611686018427387854
@@ -6772,56 +8078,56 @@ group0 50
 group1 100
 
 # group median u16 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_u16) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 50
 group1 100
 
 # group median u32 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_u32) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 50
 group1 100
 
 # group median u64 non-nullable
-query TI
+query TI rowsort
 SELECT col_group, median(col_u64) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 50
 group1 100
 
 # group median f32 non-nullable
-query TR
+query TR rowsort
 SELECT col_group, median(col_f32) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 2.75
 group1 3.2
 
 # group median f64 non-nullable
-query TR
+query TR rowsort
 SELECT col_group, median(col_f64) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 2.75
 group1 3.3
 
 # group median f64_nan non-nullable
-query TR
+query TR rowsort
 SELECT col_group, median(col_f64_nan) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 NaN
 group1 NaN
 
 # group median decimal128 non-nullable
-query TR
+query TR rowsort
 SELECT col_group, median(col_decimal128) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 0.0002
 group1 0.0003
 
 # group median decimal256 non-nullable
-query TR
+query TR rowsort
 SELECT col_group, median(col_decimal256) FROM group_median_table_non_nullable GROUP BY col_group
 ----
 group0 0.0002
@@ -6937,6 +8243,38 @@ SELECT a, median(b), arrow_typeof(median(b)) FROM group_median_all_nulls GROUP B
 group0 NULL Int32
 group1 NULL Int32
 
+statement ok
+create table t_decimal (c decimal(10, 4)) as values (100.00), (125.00), (175.00), (200.00), (200.00), (300.00), (null), (null);
+
+# Test avg_distinct for Decimal128
+query RT
+select avg(distinct c), arrow_typeof(avg(distinct c)) from t_decimal;
+----
+180 Decimal128(14, 8)
+
+statement ok
+drop table t_decimal;
+
+# Test avg_distinct for Decimal256
+statement ok
+create table t_decimal256 (c decimal(50, 2)) as values
+  (100.00),
+  (125.00),
+  (175.00),
+  (200.00),
+  (200.00),
+  (300.00),
+  (null),
+  (null);
+
+query RT
+select avg(distinct c), arrow_typeof(avg(distinct c)) from t_decimal256;
+----
+180 Decimal256(54, 6)
+
+statement ok
+drop table t_decimal256;
+
 query I
 with test AS (SELECT i as c1, i + 1 as c2 FROM generate_series(1, 10) t(i))
 select count(*) from test WHERE 1 = 1;
@@ -7031,3 +8369,418 @@ VALUES
 );
 ----
 {a: 1, b: 2, c: 3} {a: 1, b: 2, c: 4}
+
+query TI
+SELECT column1, COUNT(DISTINCT column2) FROM (
+VALUES
+  ('x', arrow_cast('NAN','Float64')),
+  ('x', arrow_cast('NAN','Float64'))
+) GROUP BY 1 ORDER BY 1;
+----
+x 1
+
+query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT array_agg(a_varchar) WITHIN GROUP (ORDER BY a_varchar)
+FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
+
+
+query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT array_agg(DISTINCT a_varchar) WITHIN GROUP (ORDER BY a_varchar)
+FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
+
+
+query error Error during planning: ORDER BY and WITHIN GROUP clauses cannot be used together in the same aggregate function
+SELECT array_agg(a_varchar order by a_varchar) WITHIN GROUP (ORDER BY a_varchar)
+FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
+
+# distinct average
+statement ok
+create table distinct_avg (a int, b double, c decimal(10, 4), d decimal(50, 2)) as values
+  (3, null, 100.2562, 90251.21),
+  (2, null, 100.2562, null),
+  (5, 100.5, null, 10000000.11),
+  (5, 1.0, 100.2563, -1.0),
+  (5, 44.112, -132.12, null),
+  (null, 1.0, 100.2562, 90251.21),
+  (5, 100.5, -100.2562, -10000000.11),
+  (1, 4.09, 4222.124, 0.0),
+  (5, 100.5, null, 10000000.11),
+  (5, 100.5, 1.1, 1.0),
+  (4, null, 4222.124, null),
+  (null, null, null, null)
+;
+
+# Need two columns to ensure single_distinct_to_group_by rule doesn't kick in, so we know our actual avg(distinct) code is being tested
+query RTRTRTRTRRRR
+select
+    avg(distinct a),
+    arrow_typeof(avg(distinct a)),
+    avg(distinct b),
+    arrow_typeof(avg(distinct b)),
+    avg(distinct c),
+    arrow_typeof(avg(distinct c)),
+    avg(distinct d),
+    arrow_typeof(avg(distinct d)),
+    avg(a),
+	avg(b),
+    avg(c),
+	avg(d)
+from distinct_avg;
+----
+3 Float64 37.4255 Float64 698.56005 Decimal128(14, 8) 15041.868333 Decimal256(54, 6) 4 56.52525 957.11074444 1272562.81625
+
+query RRRR rowsort
+select
+    avg(distinct a),
+    avg(distinct b),
+    avg(distinct c),
+    avg(distinct d)
+from distinct_avg
+group by b;
+----
+1 4.09 4222.124 0
+3 NULL 2161.1901 90251.21
+5 1 100.25625 45125.105
+5 100.5 -49.5781 0.333333
+5 44.112 -132.12 NULL
+
+query RRRR
+select
+    avg(distinct a),
+    avg(distinct b),
+    avg(distinct c),
+    avg(distinct d)
+from distinct_avg
+where a is null and b is null and c is null and d is null;
+----
+NULL NULL NULL NULL
+
+statement ok
+drop table distinct_avg;
+
+query R
+select percentile_cont(null, 0.5);
+----
+NULL
+
+# Test string_agg window frame behavior (fix for issue #19612)
+statement ok
+CREATE TABLE string_agg_window_test (
+    id INT,
+    grp VARCHAR,
+    val VARCHAR
+);
+
+statement ok
+INSERT INTO string_agg_window_test (id, grp, val) VALUES
+(1, 'A', 'a'),
+(2, 'A', 'b'),
+(3, 'A', 'c'),
+(1, 'B', 'x'),
+(2, 'B', 'y'),
+(3, 'B', 'z');
+
+# Test string_agg with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+# The function should maintain state correctly across multiple evaluate() calls
+query ITT
+SELECT
+    id,
+    grp,
+    string_agg(val, ',') OVER (
+        PARTITION BY grp
+        ORDER BY id
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS cumulative_string
+FROM string_agg_window_test
+ORDER BY grp, id;
+----
+1 A a
+2 A a,b
+3 A a,b,c
+1 B x
+2 B x,y
+3 B x,y,z
+
+statement ok
+DROP TABLE string_agg_window_test;
+
+# Enable streaming aggregation by limiting partitions and ensuring sorted input
+statement ok
+set datafusion.execution.target_partitions = 1;
+
+# Setup data
+statement ok
+CREATE TABLE stream_test (
+    g INT,
+    x DOUBLE,
+    y DOUBLE,
+    i INT,
+    b BOOLEAN,
+    s VARCHAR
+) AS VALUES
+(1, 1.0, 1.0, 1, true, 'a'), (1, 2.0, 2.0, 2, true, 'b'),
+(2, 1.0, 5.0, 3, false, 'c'), (2, 2.0, 5.0, 4, true, 'd'),
+(3, 1.0, 1.0, 7, false, 'e'), (3, 2.0, 2.0, 8, false, 'f');
+
+# Test comprehensive aggregates with streaming
+# This verifies that CORR and other aggregates work together in a streaming plan (ordering_mode=Sorted)
+
+# Basic Aggregates
+query TT
+EXPLAIN SELECT
+  g,
+  COUNT(*),
+  SUM(x),
+  AVG(x),
+  MEAN(x),
+  MIN(x),
+  MAX(y),
+  BIT_AND(i),
+  BIT_OR(i),
+  BIT_XOR(i),
+  BOOL_AND(b),
+  BOOL_OR(b),
+  MEDIAN(x),
+  GROUPING(g),
+  VAR(x),
+  VAR_SAMP(x),
+  VAR_POP(x),
+  VAR_SAMPLE(x),
+  VAR_POPULATION(x),
+  STDDEV(x),
+  STDDEV_SAMP(x),
+  STDDEV_POP(x)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+logical_plan
+01)Sort: stream_test.g ASC NULLS LAST
+02)--Projection: stream_test.g, count(Int64(1)) AS count(*), sum(stream_test.x), avg(stream_test.x), avg(stream_test.x) AS mean(stream_test.x), min(stream_test.x), max(stream_test.y), bit_and(stream_test.i), bit_or(stream_test.i), bit_xor(stream_test.i), bool_and(stream_test.b), bool_or(stream_test.b), median(stream_test.x), Int32(0) AS grouping(stream_test.g), var(stream_test.x), var(stream_test.x) AS var_samp(stream_test.x), var_pop(stream_test.x), var(stream_test.x) AS var_sample(stream_test.x), var_pop(stream_test.x) AS var_population(stream_test.x), stddev(stream_test.x), stddev(stream_test.x) AS stddev_samp(stream_test.x), stddev_pop(stream_test.x)
+03)----Aggregate: groupBy=[[stream_test.g]], aggr=[[count(Int64(1)), sum(stream_test.x), avg(stream_test.x), min(stream_test.x), max(stream_test.y), bit_and(stream_test.i), bit_or(stream_test.i), bit_xor(stream_test.i), bool_and(stream_test.b), bool_or(stream_test.b), median(stream_test.x), var(stream_test.x), var_pop(stream_test.x), stddev(stream_test.x), stddev_pop(stream_test.x)]]
+04)------Sort: stream_test.g ASC NULLS LAST, fetch=10000
+05)--------TableScan: stream_test projection=[g, x, y, i, b]
+physical_plan
+01)ProjectionExec: expr=[g@0 as g, count(Int64(1))@1 as count(*), sum(stream_test.x)@2 as sum(stream_test.x), avg(stream_test.x)@3 as avg(stream_test.x), avg(stream_test.x)@3 as mean(stream_test.x), min(stream_test.x)@4 as min(stream_test.x), max(stream_test.y)@5 as max(stream_test.y), bit_and(stream_test.i)@6 as bit_and(stream_test.i), bit_or(stream_test.i)@7 as bit_or(stream_test.i), bit_xor(stream_test.i)@8 as bit_xor(stream_test.i), bool_and(stream_test.b)@9 as bool_and(stream_test.b), bool_or(stream_test.b)@10 as bool_or(stream_test.b), median(stream_test.x)@11 as median(stream_test.x), 0 as grouping(stream_test.g), var(stream_test.x)@12 as var(stream_test.x), var(stream_test.x)@12 as var_samp(stream_test.x), var_pop(stream_test.x)@13 as var_pop(stream_test.x), var(stream_test.x)@12 as var_sample(stream_test.x), var_pop(stream_test.x)@13 as var_population(stream_test.x), stddev(stream_test.x)@14 as stddev(stream_test.x), stddev(stream_test.x)@14 as stddev_samp(stream_test.x), stddev_pop(stream_test.x)@15 as stddev_pop(stream_test.x)]
+02)--AggregateExec: mode=Single, gby=[g@0 as g], aggr=[count(Int64(1)), sum(stream_test.x), avg(stream_test.x), min(stream_test.x), max(stream_test.y), bit_and(stream_test.i), bit_or(stream_test.i), bit_xor(stream_test.i), bool_and(stream_test.b), bool_or(stream_test.b), median(stream_test.x), var(stream_test.x), var_pop(stream_test.x), stddev(stream_test.x), stddev_pop(stream_test.x)], ordering_mode=Sorted
+03)----SortExec: TopK(fetch=10000), expr=[g@0 ASC NULLS LAST], preserve_partitioning=[false]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query IIRRRRRIIIBBRIRRRRRRRR
+SELECT
+  g,
+  COUNT(*),
+  SUM(x),
+  AVG(x),
+  MEAN(x),
+  MIN(x),
+  MAX(y),
+  BIT_AND(i),
+  BIT_OR(i),
+  BIT_XOR(i),
+  BOOL_AND(b),
+  BOOL_OR(b),
+  MEDIAN(x),
+  GROUPING(g),
+  VAR(x),
+  VAR_SAMP(x),
+  VAR_POP(x),
+  VAR_SAMPLE(x),
+  VAR_POPULATION(x),
+  STDDEV(x),
+  STDDEV_SAMP(x),
+  STDDEV_POP(x)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+1 2 3 1.5 1.5 1 2 0 3 3 true true 1.5 0 0.5 0.5 0.25 0.5 0.25 0.707106781187 0.707106781187 0.5
+2 2 3 1.5 1.5 1 5 0 7 7 false true 1.5 0 0.5 0.5 0.25 0.5 0.25 0.707106781187 0.707106781187 0.5
+3 2 3 1.5 1.5 1 2 0 15 15 false false 1.5 0 0.5 0.5 0.25 0.5 0.25 0.707106781187 0.707106781187 0.5
+
+# Ordered Aggregates (by x)
+query TT
+EXPLAIN SELECT
+  g,
+  ARRAY_AGG(x ORDER BY x),
+  ARRAY_AGG(DISTINCT x ORDER BY x),
+  FIRST_VALUE(x ORDER BY x),
+  LAST_VALUE(x ORDER BY x),
+  NTH_VALUE(x, 1 ORDER BY x)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+logical_plan
+01)Sort: stream_test.g ASC NULLS LAST
+02)--Aggregate: groupBy=[[stream_test.g]], aggr=[[array_agg(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], array_agg(DISTINCT stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], first_value(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], last_value(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], nth_value(stream_test.x, Int64(1)) ORDER BY [stream_test.x ASC NULLS LAST]]]
+03)----Sort: stream_test.g ASC NULLS LAST, fetch=10000
+04)------TableScan: stream_test projection=[g, x]
+physical_plan
+01)AggregateExec: mode=Single, gby=[g@0 as g], aggr=[array_agg(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], array_agg(DISTINCT stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], first_value(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], last_value(stream_test.x) ORDER BY [stream_test.x ASC NULLS LAST], nth_value(stream_test.x,Int64(1)) ORDER BY [stream_test.x ASC NULLS LAST]], ordering_mode=Sorted
+02)--SortExec: TopK(fetch=10000), expr=[g@0 ASC NULLS LAST, x@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I??RRR
+SELECT
+  g,
+  ARRAY_AGG(x ORDER BY x),
+  ARRAY_AGG(DISTINCT x ORDER BY x),
+  FIRST_VALUE(x ORDER BY x),
+  LAST_VALUE(x ORDER BY x),
+  NTH_VALUE(x, 1 ORDER BY x)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+1 [1.0, 2.0] [1.0, 2.0] 1 2 1
+2 [1.0, 2.0] [1.0, 2.0] 1 2 1
+3 [1.0, 2.0] [1.0, 2.0] 1 2 1
+
+# Ordered Aggregates (by s)
+query TT
+EXPLAIN SELECT
+  g,
+  ARRAY_AGG(s ORDER BY s),
+  STRING_AGG(s, '|' ORDER BY s),
+  STRING_AGG(DISTINCT s, '|' ORDER BY s)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+logical_plan
+01)Sort: stream_test.g ASC NULLS LAST
+02)--Aggregate: groupBy=[[stream_test.g]], aggr=[[array_agg(stream_test.s) ORDER BY [stream_test.s ASC NULLS LAST], string_agg(stream_test.s, Utf8("|")) ORDER BY [stream_test.s ASC NULLS LAST], string_agg(DISTINCT stream_test.s, Utf8("|")) ORDER BY [stream_test.s ASC NULLS LAST]]]
+03)----Sort: stream_test.g ASC NULLS LAST, fetch=10000
+04)------TableScan: stream_test projection=[g, s]
+physical_plan
+01)AggregateExec: mode=Single, gby=[g@0 as g], aggr=[array_agg(stream_test.s) ORDER BY [stream_test.s ASC NULLS LAST], string_agg(stream_test.s,Utf8("|")) ORDER BY [stream_test.s ASC NULLS LAST], string_agg(DISTINCT stream_test.s,Utf8("|")) ORDER BY [stream_test.s ASC NULLS LAST]], ordering_mode=Sorted
+02)--SortExec: TopK(fetch=10000), expr=[g@0 ASC NULLS LAST, s@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I?TT
+SELECT
+  g,
+  ARRAY_AGG(s ORDER BY s),
+  STRING_AGG(s, '|' ORDER BY s),
+  STRING_AGG(DISTINCT s, '|' ORDER BY s)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+1 [a, b] a|b a|b
+2 [c, d] c|d c|d
+3 [e, f] e|f e|f
+
+# Statistical & Regression Aggregates
+query TT
+EXPLAIN SELECT
+  g,
+  CORR(x, y),
+  COVAR(x, y),
+  COVAR_SAMP(x, y),
+  COVAR_POP(x, y),
+  REGR_SXX(x, y),
+  REGR_SXY(x, y),
+  REGR_SYY(x, y),
+  REGR_AVGX(x, y),
+  REGR_AVGY(x, y),
+  REGR_COUNT(x, y),
+  REGR_SLOPE(x, y),
+  REGR_INTERCEPT(x, y),
+  REGR_R2(x, y)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+logical_plan
+01)Sort: stream_test.g ASC NULLS LAST
+02)--Projection: stream_test.g, corr(stream_test.x,stream_test.y), covar_samp(stream_test.x,stream_test.y) AS covar(stream_test.x,stream_test.y), covar_samp(stream_test.x,stream_test.y), covar_pop(stream_test.x,stream_test.y), regr_sxx(stream_test.x,stream_test.y), regr_sxy(stream_test.x,stream_test.y), regr_syy(stream_test.x,stream_test.y), regr_avgx(stream_test.x,stream_test.y), regr_avgy(stream_test.x,stream_test.y), regr_count(stream_test.x,stream_test.y), regr_slope(stream_test.x,stream_test.y), regr_intercept(stream_test.x,stream_test.y), regr_r2(stream_test.x,stream_test.y)
+03)----Aggregate: groupBy=[[stream_test.g]], aggr=[[corr(stream_test.x, stream_test.y), covar_samp(stream_test.x, stream_test.y), covar_pop(stream_test.x, stream_test.y), regr_sxx(stream_test.x, stream_test.y), regr_sxy(stream_test.x, stream_test.y), regr_syy(stream_test.x, stream_test.y), regr_avgx(stream_test.x, stream_test.y), regr_avgy(stream_test.x, stream_test.y), regr_count(stream_test.x, stream_test.y), regr_slope(stream_test.x, stream_test.y), regr_intercept(stream_test.x, stream_test.y), regr_r2(stream_test.x, stream_test.y)]]
+04)------Sort: stream_test.g ASC NULLS LAST, fetch=10000
+05)--------TableScan: stream_test projection=[g, x, y]
+physical_plan
+01)ProjectionExec: expr=[g@0 as g, corr(stream_test.x,stream_test.y)@1 as corr(stream_test.x,stream_test.y), covar_samp(stream_test.x,stream_test.y)@2 as covar(stream_test.x,stream_test.y), covar_samp(stream_test.x,stream_test.y)@2 as covar_samp(stream_test.x,stream_test.y), covar_pop(stream_test.x,stream_test.y)@3 as covar_pop(stream_test.x,stream_test.y), regr_sxx(stream_test.x,stream_test.y)@4 as regr_sxx(stream_test.x,stream_test.y), regr_sxy(stream_test.x,stream_test.y)@5 as regr_sxy(stream_test.x,stream_test.y), regr_syy(stream_test.x,stream_test.y)@6 as regr_syy(stream_test.x,stream_test.y), regr_avgx(stream_test.x,stream_test.y)@7 as regr_avgx(stream_test.x,stream_test.y), regr_avgy(stream_test.x,stream_test.y)@8 as regr_avgy(stream_test.x,stream_test.y), regr_count(stream_test.x,stream_test.y)@9 as regr_count(stream_test.x,stream_test.y), regr_slope(stream_test.x,stream_test.y)@10 as regr_slope(stream_test.x,stream_test.y), regr_intercept(stream_test.x,stream_test.y)@11 as regr_intercept(stream_test.x,stream_test.y), regr_r2(stream_test.x,stream_test.y)@12 as regr_r2(stream_test.x,stream_test.y)]
+02)--AggregateExec: mode=Single, gby=[g@0 as g], aggr=[corr(stream_test.x,stream_test.y), covar_samp(stream_test.x,stream_test.y), covar_pop(stream_test.x,stream_test.y), regr_sxx(stream_test.x,stream_test.y), regr_sxy(stream_test.x,stream_test.y), regr_syy(stream_test.x,stream_test.y), regr_avgx(stream_test.x,stream_test.y), regr_avgy(stream_test.x,stream_test.y), regr_count(stream_test.x,stream_test.y), regr_slope(stream_test.x,stream_test.y), regr_intercept(stream_test.x,stream_test.y), regr_r2(stream_test.x,stream_test.y)], ordering_mode=Sorted
+03)----SortExec: TopK(fetch=10000), expr=[g@0 ASC NULLS LAST], preserve_partitioning=[false]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query IRRRRRRRRRIRRR
+SELECT
+  g,
+  CORR(x, y),
+  COVAR(x, y),
+  COVAR_SAMP(x, y),
+  COVAR_POP(x, y),
+  REGR_SXX(x, y),
+  REGR_SXY(x, y),
+  REGR_SYY(x, y),
+  REGR_AVGX(x, y),
+  REGR_AVGY(x, y),
+  REGR_COUNT(x, y),
+  REGR_SLOPE(x, y),
+  REGR_INTERCEPT(x, y),
+  REGR_R2(x, y)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+1 1 0.5 0.5 0.25 0.5 0.5 0.5 1.5 1.5 2 1 0 1
+2 NULL 0 0 0 0 0 0.5 5 1.5 2 NULL NULL NULL
+3 1 0.5 0.5 0.25 0.5 0.5 0.5 1.5 1.5 2 1 0 1
+
+# Approximate and Ordered-Set Aggregates
+query TT
+EXPLAIN SELECT
+  g,
+  APPROX_DISTINCT(i),
+  APPROX_MEDIAN(x),
+  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  QUANTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  APPROX_PERCENTILE_CONT_WITH_WEIGHT(1.0, 0.5) WITHIN GROUP (ORDER BY x),
+  PERCENTILE_CONT(x, 0.5),
+  APPROX_PERCENTILE_CONT(x, 0.5),
+  APPROX_PERCENTILE_CONT_WITH_WEIGHT(x, 1.0, 0.5)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+logical_plan
+01)Sort: stream_test.g ASC NULLS LAST
+02)--Projection: stream_test.g, approx_distinct(stream_test.i), approx_median(stream_test.x), percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST] AS quantile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], approx_percentile_cont_with_weight(Float64(1),Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont_with_weight(stream_test.x,Float64(1),Float64(0.5))
+03)----Aggregate: groupBy=[[stream_test.g]], aggr=[[approx_distinct(stream_test.i), approx_median(stream_test.x), percentile_cont(stream_test.x, Float64(0.5)) ORDER BY [stream_test.x ASC NULLS LAST], approx_percentile_cont(stream_test.x, Float64(0.5)) ORDER BY [stream_test.x ASC NULLS LAST], approx_percentile_cont_with_weight(stream_test.x, Float64(1), Float64(0.5)) ORDER BY [stream_test.x ASC NULLS LAST], percentile_cont(stream_test.x, Float64(0.5)), approx_percentile_cont(stream_test.x, Float64(0.5)), approx_percentile_cont_with_weight(stream_test.x, Float64(1), Float64(0.5))]]
+04)------Sort: stream_test.g ASC NULLS LAST, fetch=10000
+05)--------TableScan: stream_test projection=[g, x, i]
+physical_plan
+01)ProjectionExec: expr=[g@0 as g, approx_distinct(stream_test.i)@1 as approx_distinct(stream_test.i), approx_median(stream_test.x)@2 as approx_median(stream_test.x), percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST]@3 as percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST]@3 as quantile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST]@4 as approx_percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], approx_percentile_cont_with_weight(Float64(1),Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST]@5 as approx_percentile_cont_with_weight(Float64(1),Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], percentile_cont(stream_test.x,Float64(0.5))@6 as percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont(stream_test.x,Float64(0.5))@7 as approx_percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont_with_weight(stream_test.x,Float64(1),Float64(0.5))@8 as approx_percentile_cont_with_weight(stream_test.x,Float64(1),Float64(0.5))]
+02)--AggregateExec: mode=Single, gby=[g@0 as g], aggr=[approx_distinct(stream_test.i), approx_median(stream_test.x), percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], approx_percentile_cont(Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], approx_percentile_cont_with_weight(Float64(1),Float64(0.5)) WITHIN GROUP [stream_test.x ASC NULLS LAST], percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont(stream_test.x,Float64(0.5)), approx_percentile_cont_with_weight(stream_test.x,Float64(1),Float64(0.5))], ordering_mode=Sorted
+03)----SortExec: TopK(fetch=10000), expr=[g@0 ASC NULLS LAST], preserve_partitioning=[false]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query IIRRRRRRRR
+SELECT
+  g,
+  APPROX_DISTINCT(i),
+  APPROX_MEDIAN(x),
+  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  QUANTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x),
+  APPROX_PERCENTILE_CONT_WITH_WEIGHT(1.0, 0.5) WITHIN GROUP (ORDER BY x),
+  PERCENTILE_CONT(x, 0.5),
+  APPROX_PERCENTILE_CONT(x, 0.5),
+  APPROX_PERCENTILE_CONT_WITH_WEIGHT(x, 1.0, 0.5)
+FROM (SELECT * FROM stream_test ORDER BY g LIMIT 10000)
+GROUP BY g
+ORDER BY g;
+----
+1 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
+2 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
+3 2 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.5
+
+statement ok
+DROP TABLE stream_test;
+
+# Restore default target partitions
+statement ok
+set datafusion.execution.target_partitions = 4;
diff --git a/datafusion/sqllogictest/test_files/aggregate_repartition.slt b/datafusion/sqllogictest/test_files/aggregate_repartition.slt
new file mode 100644
index 0000000000000..eeece7862341b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/aggregate_repartition.slt
@@ -0,0 +1,130 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Reproducer for https://github.com/apache/datafusion/issues/18341
+# Tests for aggregate repartition behavior
+# Comparing CSV vs Parquet execution plans for GROUP BY queries
+
+# Create CSV version of the dimension data
+query I
+COPY (
+  SELECT * FROM (VALUES 
+    ('prod', 100, 'A'),
+    ('dev', 200, 'B'),
+    ('test', 150, 'A'),
+    ('prod', 300, 'C'),
+    ('dev', 250, 'B')
+  ) AS t(env, value, category)
+)
+TO 'test_files/scratch/aggregate_repartition/dim.csv'
+STORED AS CSV
+OPTIONS ('format.has_header' 'true');
+----
+5
+
+# Create Parquet version of the dimension data
+query I
+COPY (
+  SELECT * FROM (VALUES 
+    ('prod', 100, 'A'),
+    ('dev', 200, 'B'),
+    ('test', 150, 'A'),
+    ('prod', 300, 'C'),
+    ('dev', 250, 'B')
+  ) AS t(env, value, category)
+)
+TO 'test_files/scratch/aggregate_repartition/dim.parquet'
+STORED AS PARQUET;
+----
+5
+
+# Create external table for CSV
+statement ok
+CREATE EXTERNAL TABLE dim_csv
+STORED AS CSV 
+LOCATION 'test_files/scratch/aggregate_repartition/dim.csv'
+OPTIONS ('format.has_header' 'true');
+
+# Create external table for Parquet
+statement ok
+CREATE EXTERNAL TABLE dim_parquet
+STORED AS PARQUET 
+LOCATION 'test_files/scratch/aggregate_repartition/dim.parquet';
+
+# Test 1: EXPLAIN query for CSV table with GROUP BY
+# This plans looks reasonable
+query TT
+EXPLAIN SELECT env, count(*) FROM dim_csv GROUP BY env;
+----
+logical_plan
+01)Projection: dim_csv.env, count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[dim_csv.env]], aggr=[[count(Int64(1))]]
+03)----TableScan: dim_csv projection=[env]
+physical_plan
+01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[env@0 as env], aggr=[count(Int64(1))]
+03)----RepartitionExec: partitioning=Hash([env@0], 4), input_partitions=4
+04)------AggregateExec: mode=Partial, gby=[env@0 as env], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.csv]]}, projection=[env], file_type=csv, has_header=true
+
+# Test 2: EXPLAIN query for Parquet table with GROUP BY
+
+query TT
+EXPLAIN SELECT env, count(*) FROM dim_parquet GROUP BY env;
+----
+logical_plan
+01)Projection: dim_parquet.env, count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[dim_parquet.env]], aggr=[[count(Int64(1))]]
+03)----TableScan: dim_parquet projection=[env]
+physical_plan
+01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[env@0 as env], aggr=[count(Int64(1))]
+03)----RepartitionExec: partitioning=Hash([env@0], 4), input_partitions=1
+04)------AggregateExec: mode=Partial, gby=[env@0 as env], aggr=[count(Int64(1))]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.parquet]]}, projection=[env], file_type=parquet
+
+# Verify the queries actually work and return the same results
+query TI rowsort
+SELECT env, count(*) FROM dim_csv GROUP BY env;
+----
+dev 2
+prod 2
+test 1
+
+query TI rowsort
+SELECT env, count(*) FROM dim_parquet GROUP BY env;
+----
+dev 2
+prod 2
+test 1
+
+# Test 3: Change target partitions to 1 to have single-aggregate plan
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+query TT
+EXPLAIN SELECT env, count(*) FROM dim_parquet GROUP BY env;
+----
+logical_plan
+01)Projection: dim_parquet.env, count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[dim_parquet.env]], aggr=[[count(Int64(1))]]
+03)----TableScan: dim_parquet projection=[env]
+physical_plan
+01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)]
+02)--AggregateExec: mode=Single, gby=[env@0 as env], aggr=[count(Int64(1))]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.parquet]]}, projection=[env], file_type=parquet
diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
index 8755918cd16c2..c16a6f442427f 100644
--- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
+++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
@@ -69,9 +69,6 @@ set datafusion.execution.target_partitions = 2;
 statement ok
 set datafusion.execution.batch_size = 1;
 
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 # Grouping by unique fields allows to check all accumulators
 query ITIIII
 SELECT c5, c1,
@@ -178,18 +175,32 @@ GROUP BY 1, 2 ORDER BY 1 LIMIT 5;
 -2117946883 d 1 0 0 0
 -2098805236 c 1 0 0 0
 
-# FIXME: add bool_and(v3) column when issue fixed
-# ISSUE https://github.com/apache/datafusion/issues/11846
-query TBBB rowsort
-select v1, bool_or(v2), bool_and(v2), bool_or(v3)
+query IT????
+SELECT c5, c1,
+       ARRAY_AGG(c3),
+       ARRAY_AGG(CASE WHEN c1 = 'a' THEN c3 ELSE NULL END),
+       ARRAY_AGG(c3) FILTER (WHERE c1 = 'b'),
+       ARRAY_AGG(CASE WHEN c1 = 'a' THEN c3 ELSE NULL END) FILTER (WHERE c1 = 'b')
+FROM aggregate_test_100
+GROUP BY 1, 2 ORDER BY 1 LIMIT 5;
+----
+-2141999138 c [-2] [NULL] NULL NULL
+-2141451704 a [-72] [-72] NULL NULL
+-2138770630 b [63] [NULL] [63] [NULL]
+-2117946883 d [-59] [NULL] NULL NULL
+-2098805236 c [22] [NULL] NULL NULL
+
+# Regression test for https://github.com/apache/datafusion/issues/11846
+query TBBBB rowsort
+select v1, bool_or(v2), bool_and(v2), bool_or(v3), bool_and(v3)
 from aggregate_test_100_bool
 group by v1
 ----
-a true false true
-b true false true
-c true false false
-d true false false
-e true false NULL
+a true false true true
+b true false true true
+c true false false false
+d true false false false
+e true false NULL NULL
 
 query TBBB rowsort
 select v1,
@@ -248,6 +259,19 @@ SELECT c2, count(c1), count(c5), count(c11) FROM aggregate_test_100 GROUP BY c2
 4 23 23 23
 5 14 14 14
 
+# Test array_agg; we sort the output to ensure deterministic results
+query I??
+SELECT c2,
+       array_sort(array_agg(c5)),
+       array_sort(array_agg(c3) FILTER (WHERE c3 > 0))
+FROM aggregate_test_100 GROUP BY c2 ORDER BY c2;
+----
+1 [-1991133944, -1882293856, -1448995523, -1383162419, -1339586153, -1331533190, -1176490478, -1143802338, -928766616, -644225469, -335410409, 383352709, 431378678, 794623392, 994303988, 1171968280, 1188089983, 1213926989, 1325868318, 1413111008, 2106705285, 2143473091] [12, 29, 36, 38, 41, 54, 57, 70, 71, 83, 103, 120, 125]
+2 [-2138770630, -1927628110, -1908480893, -1899175111, -1808210365, -1660426473, -1222533990, -1090239422, -1011669561, -800561771, -587831330, -537142430, -168758331, -108973366, 49866617, 370975815, 439738328, 715235348, 1354539333, 1593800404, 2033001162, 2053379412] [1, 29, 31, 45, 49, 52, 52, 63, 68, 93, 97, 113, 122]
+3 [-2141999138, -2141451704, -2098805236, -1302295658, -903316089, -421042466, -382483011, -346989627, 141218956, 240273900, 397430452, 670497898, 912707948, 1299719633, 1337043149, 1436496767, 1489733240, 1738331255, 2030965207] [13, 13, 14, 17, 17, 22, 71, 73, 77, 97, 104, 112, 123]
+4 [-1885422396, -1813935549, -1009656194, -673237643, -237425046, -4229382, 61035129, 427197269, 434021400, 659422734, 702611616, 762932956, 852509237, 1282464673, 1423957796, 1544188174, 1579876740, 1902023838, 1991172974, 1993193190, 2047637360, 2051224722, 2064155045] [3, 5, 17, 30, 47, 55, 65, 73, 74, 96, 97, 102, 123]
+5 [-2117946883, -842693467, -629486480, -467659022, -134213907, 41423756, 586844478, 623103518, 706441268, 1188285940, 1689098844, 1824882165, 1955646088, 2025611582] [36, 62, 64, 68, 118]
+
 # Test min / max for int / float
 query IIIRR
 SELECT c2, min(c5), max(c5), min(c11), max(c11) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2;
@@ -393,19 +417,6 @@ c 2.666666666667 0.425241138254
 d 2.444444444444 0.541519476308
 e 3 0.505440263521
 
-# FIXME: add bool_and(v3) column when issue fixed
-# ISSUE https://github.com/apache/datafusion/issues/11846
-query TBBB rowsort
-select v1, bool_or(v2), bool_and(v2), bool_or(v3)
-from aggregate_test_100_bool
-group by v1
-----
-a true false true
-b true false true
-c true false false
-d true false false
-e true false NULL
-
 query TBBB rowsort
 select v1,
       bool_or(v2) FILTER (WHERE v1 = 'a' OR v1 = 'c' OR v1 = 'e'),
@@ -420,10 +431,6 @@ c true false NULL
 d NULL false NULL
 e true false NULL
 
-# Enabling PG dialect for filtered aggregates tests
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 # Test count with filter
 query III
 SELECT
diff --git a/datafusion/sqllogictest/test_files/aggregates_simplify.slt b/datafusion/sqllogictest/test_files/aggregates_simplify.slt
new file mode 100644
index 0000000000000..9aa3ecf7a29f8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/aggregates_simplify.slt
@@ -0,0 +1,358 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#######
+# Tests for aggregate optimizations / simplifications
+#######
+
+statement ok
+CREATE TABLE sum_simplify_t AS VALUES (1, 100), (1, 200), (2, 100), (NULL, NULL);
+
+# Baseline SUM of an expression
+query I
+SELECT SUM(column1 + 1) FROM sum_simplify_t;
+----
+7
+
+query TT
+EXPLAIN SELECT SUM(column1 + 1) FROM sum_simplify_t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]]
+02)--TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+
+# Mixed aggregate expressions with type validation
+query TI
+SELECT arrow_typeof(SUM(column1)), SUM(column1 + 1) FROM sum_simplify_t;
+----
+Int64 7
+
+query TT
+EXPLAIN SELECT arrow_typeof(SUM(column1)), SUM(column1), SUM(column1 + 1) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))
+02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]]
+03)----TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[arrow_typeof(sum(sum_simplify_t.column1)@0) as arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1)@0 as sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))@1 as sum(sum_simplify_t.column1 + Int64(1))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Duplicate aggregate expressions
+query II
+SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t;
+----
+7 7
+
+query TT
+EXPLAIN SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_b
+02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]]
+03)----TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_b]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+
+# constant aggregate expressions
+query II
+SELECT SUM(2+1), SUM(3) FROM sum_simplify_t;
+----
+12 12
+
+query TT
+EXPLAIN SELECT SUM(2+1), SUM(3) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: __common_expr_1 AS sum(Int64(2) + Int64(1)), __common_expr_1 AS sum(Int64(3))
+02)--Aggregate: groupBy=[[]], aggr=[[sum(Int64(3)) AS __common_expr_1]]
+03)----TableScan: sum_simplify_t projection=[]
+physical_plan
+01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(2) + Int64(1)), __common_expr_1@0 as sum(Int64(3))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[__common_expr_1]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+
+# Duplicated expression across multiple aggregate arguments.
+query II
+SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t;
+----
+7 10
+
+
+query TT
+EXPLAIN SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2))
+02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1)
+03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]]
+04)------TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Reordered expressions that still compute the same thing
+query II
+SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t;
+----
+7 10
+
+query TT
+EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(sum_simplify_t.column1) + __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2))
+02)--Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum(sum_simplify_t.column1)
+03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]]
+04)------TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1)@0 + 2 * count(sum_simplify_t.column1)@1 as sum(sum_simplify_t.column1 + Int64(2))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# DISTINCT aggregates with different arguments
+query II
+SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t;
+----
+5 7
+
+query TT
+EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]]
+02)--TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# DISTINCT and non-DISTINCT aggregates
+query II
+SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t;
+----
+5 7
+
+query TT
+EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(alias1) AS sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2) AS sum(sum_simplify_t.column1 + Int64(1))
+02)--Aggregate: groupBy=[[]], aggr=[[sum(alias1), sum(alias2)]]
+03)----Aggregate: groupBy=[[__common_expr_1 AS alias1]], aggr=[[sum(__common_expr_1) AS alias2]]
+04)------Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1
+05)--------TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[sum(alias1)@0 as sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2)@1 as sum(sum_simplify_t.column1 + Int64(1))]
+02)--AggregateExec: mode=Final, gby=[], aggr=[sum(alias1), sum(alias2)]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(alias1), sum(alias2)]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[alias2]
+06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as alias1], aggr=[alias2]
+08)--------------ProjectionExec: expr=[column1@0 + 1 as __common_expr_1]
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# FILTER clauses with different aggregate arguments
+query II
+SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t;
+----
+3 NULL
+
+query TT
+EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]]
+02)--TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# FILTER clauses with the same aggregate argument
+query II
+SELECT
+    SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a,
+    SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b
+FROM sum_simplify_t;
+----
+3 3
+
+query TT
+EXPLAIN SELECT
+    SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a,
+    SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b
+FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_b
+02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]]
+03)----TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_b]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Same aggregate argument with different FILTER predicates
+query II
+SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t;
+----
+3 7
+
+query TT
+EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]]
+02)--Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1, sum_simplify_t.column1
+03)----TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]
+02)--ProjectionExec: expr=[column1@0 + 1 as __common_expr_1, column1@0 as column1]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# volatile aggregate arguments
+query B
+SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t;
+----
+true
+
+query TT
+EXPLAIN SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: sum(random() + Int64(2)) > sum(random() + Int64(1)) AS sum(random() + Int64(1)) < sum(random() + Int64(2))
+02)--Aggregate: groupBy=[[]], aggr=[[sum(random() + Float64(1)) AS sum(random() + Int64(1)), sum(random() + Float64(2)) AS sum(random() + Int64(2))]]
+03)----TableScan: sum_simplify_t projection=[]
+physical_plan
+01)ProjectionExec: expr=[sum(random() + Int64(2))@1 > sum(random() + Int64(1))@0 as sum(random() + Int64(1)) < sum(random() + Int64(2))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(random() + Int64(1)), sum(random() + Int64(2))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Checks grouped aggregates with explicit ORDER BY return deterministic row order.
+query III
+SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST;
+----
+200 2 3
+100 5 7
+NULL NULL NULL
+
+query TT
+EXPLAIN SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST;
+----
+logical_plan
+01)Sort: sum_simplify_t.column2 DESC NULLS LAST
+02)--Projection: sum_simplify_t.column2, sum(sum_simplify_t.column1) + __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1) + Int64(2) * __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(2))
+03)----Projection: CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1, sum_simplify_t.column2, sum(sum_simplify_t.column1)
+04)------Aggregate: groupBy=[[sum_simplify_t.column2]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]]
+05)--------TableScan: sum_simplify_t projection=[column1, column2]
+physical_plan
+01)SortPreservingMergeExec: [column2@0 DESC NULLS LAST]
+02)--SortExec: expr=[column2@0 DESC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[column2@0 as column2, sum(sum_simplify_t.column1)@1 + count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1)@1 + 2 * count(sum_simplify_t.column1)@2 as sum(sum_simplify_t.column1 + Int64(2))]
+04)------AggregateExec: mode=FinalPartitioned, gby=[column2@0 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]
+05)--------RepartitionExec: partitioning=Hash([column2@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[column2@1 as column2], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Checks commutative forms of equivalent aggregate arguments are simplified consistently.
+query II
+SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t;
+----
+7 7
+
+query TT
+EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t;
+----
+logical_plan
+01)Projection: __common_expr_1 AS sum(Int64(1) + sum_simplify_t.column1), __common_expr_1 AS sum(sum_simplify_t.column1 + Int64(1))
+02)--Projection: sum(sum_simplify_t.column1) + CAST(count(sum_simplify_t.column1) AS Int64) AS __common_expr_1
+03)----Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]]
+04)------TableScan: sum_simplify_t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(1) + sum_simplify_t.column1), __common_expr_1@0 as sum(sum_simplify_t.column1 + Int64(1))]
+02)--ProjectionExec: expr=[sum(sum_simplify_t.column1)@0 + count(sum_simplify_t.column1)@1 as __common_expr_1]
+03)----AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), count(sum_simplify_t.column1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Checks unsigned overflow edge case from PR discussion using transformed SUM arguments.
+statement ok
+CREATE TABLE IF NOT EXISTS tbl (val INTEGER UNSIGNED);
+
+statement ok
+INSERT INTO tbl VALUES (4294967295);
+
+statement ok
+INSERT INTO tbl VALUES (4294967295);
+
+# Checks transformed SUM results for unsigned max values are preserved.
+query TII
+SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl;
+----
+Int64 8589934592 8589934594
+
+query TT
+EXPLAIN SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl;
+----
+logical_plan
+01)Projection: arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))
+02)--Projection: sum(tbl.val) + __common_expr_1 AS sum(tbl.val + Int64(1)), sum(tbl.val) + Int64(2) * __common_expr_1 AS sum(tbl.val + Int64(2))
+03)----Projection: CAST(count(tbl.val) AS Int64) AS __common_expr_1, sum(tbl.val)
+04)------Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_2 AS tbl.val), count(__common_expr_2 AS tbl.val)]]
+05)--------Projection: CAST(tbl.val AS Int64) AS __common_expr_2
+06)----------TableScan: tbl projection=[val]
+physical_plan
+01)ProjectionExec: expr=[arrow_typeof(sum(tbl.val + Int64(1))@0) as arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1))@0 as sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))@1 as sum(tbl.val + Int64(2))]
+02)--ProjectionExec: expr=[sum(tbl.val)@0 + count(tbl.val)@1 as sum(tbl.val + Int64(1)), sum(tbl.val)@0 + 2 * count(tbl.val)@1 as sum(tbl.val + Int64(2))]
+03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)]
+04)------ProjectionExec: expr=[CAST(val@0 AS Int64) as __common_expr_2]
+05)--------DataSourceExec: partitions=1, partition_sizes=[2]
+
+# Checks equivalent rewritten form (SUM + COUNT terms) matches transformed SUM semantics.
+query RR
+SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl;
+----
+8589934592 8589934594
+
+query TT
+EXPLAIN SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl;
+----
+logical_plan
+01)Projection: __common_expr_1 + CAST(count(tbl.val) AS Decimal128(20, 0)) AS sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1 AS sum(tbl.val) + CAST(Int64(2) * count(tbl.val) AS Decimal128(20, 0))
+02)--Projection: CAST(sum(tbl.val) AS Decimal128(20, 0)) AS __common_expr_1, count(tbl.val)
+03)----Aggregate: groupBy=[[]], aggr=[[sum(CAST(tbl.val AS UInt64)), count(tbl.val)]]
+04)------TableScan: tbl projection=[val]
+physical_plan
+01)ProjectionExec: expr=[__common_expr_1@0 + CAST(count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1@0 + CAST(2 * count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(2) * count(tbl.val)]
+02)--ProjectionExec: expr=[CAST(sum(tbl.val)@0 AS Decimal128(20, 0)) as __common_expr_1, count(tbl.val)@1 as count(tbl.val)]
+03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)]
+04)------DataSourceExec: partitions=1, partition_sizes=[2]
+
+statement ok
+DROP TABLE IF EXISTS tbl;
+
+statement ok
+DROP TABLE sum_simplify_t;
diff --git a/datafusion/sqllogictest/test_files/aggregates_topk.slt b/datafusion/sqllogictest/test_files/aggregates_topk.slt
index cc1693843848a..19ead8965ed01 100644
--- a/datafusion/sqllogictest/test_files/aggregates_topk.slt
+++ b/datafusion/sqllogictest/test_files/aggregates_topk.slt
@@ -46,11 +46,9 @@ physical_plan
 01)SortPreservingMergeExec: [max(traces.timestamp)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(traces.timestamp)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TI
 select * from (select trace_id, MAX(timestamp) max_ts from traces t group by trace_id) where trace_id != 'b' order by max_ts desc limit 3;
@@ -110,11 +108,9 @@ physical_plan
 01)SortPreservingMergeExec: [max(traces.timestamp)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(traces.timestamp)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)], lim=[4]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)], lim=[4]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)], lim=[4]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 explain select trace_id, MIN(timestamp) from traces group by trace_id order by MIN(timestamp) desc limit 4;
@@ -127,11 +123,9 @@ physical_plan
 01)SortPreservingMergeExec: [min(traces.timestamp)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[min(traces.timestamp)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[min(traces.timestamp)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[min(traces.timestamp)]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[min(traces.timestamp)]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 explain select trace_id, MAX(timestamp) from traces group by trace_id order by MAX(timestamp) asc limit 4;
@@ -144,11 +138,9 @@ physical_plan
 01)SortPreservingMergeExec: [max(traces.timestamp)@1 ASC NULLS LAST], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(traces.timestamp)@1 ASC NULLS LAST], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 explain select trace_id, MAX(timestamp) from traces group by trace_id order by trace_id asc limit 4;
@@ -161,11 +153,9 @@ physical_plan
 01)SortPreservingMergeExec: [trace_id@0 ASC NULLS LAST], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[trace_id@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.timestamp)]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TI
 select trace_id, max(timestamp) from traces group by trace_id order by MAX(timestamp) desc limit 4;
@@ -205,6 +195,70 @@ a -1 -1
 NULL 0 0
 a 1 1
 
+statement ok
+CREATE TABLE string_topk(category varchar, val varchar) AS VALUES
+('x', 'apple'),
+('x', 'zebra'),
+('y', 'banana'),
+('y', 'apricot'),
+('z', 'mango');
+
+statement ok
+CREATE VIEW string_topk_view AS
+SELECT
+  arrow_cast(category, 'Utf8View') AS category,
+  arrow_cast(val, 'Utf8View') AS val
+FROM
+  string_topk;
+
+query TT
+select category, max(val) from string_topk group by category order by max(val) desc limit 2;
+----
+x zebra
+z mango
+
+query TT
+explain select category, max(val) max_val from string_topk group by category order by max_val desc limit 2;
+----
+logical_plan
+01)Sort: max_val DESC NULLS FIRST, fetch=2
+02)--Projection: string_topk.category, max(string_topk.val) AS max_val
+03)----Aggregate: groupBy=[[string_topk.category]], aggr=[[max(string_topk.val)]]
+04)------TableScan: string_topk projection=[category, val]
+physical_plan
+01)SortPreservingMergeExec: [max_val@1 DESC], fetch=2
+02)--SortExec: TopK(fetch=2), expr=[max_val@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[category@0 as category, max(string_topk.val)@1 as max_val]
+04)------AggregateExec: mode=FinalPartitioned, gby=[category@0 as category], aggr=[max(string_topk.val)], lim=[2]
+05)--------RepartitionExec: partitioning=Hash([category@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[category@0 as category], aggr=[max(string_topk.val)], lim=[2]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+select category, max(val) from string_topk_view group by category order by max(val) desc limit 2;
+----
+x zebra
+z mango
+
+query TT
+explain select category, max(val) max_val from string_topk_view group by category order by max_val desc limit 2;
+----
+logical_plan
+01)Sort: max_val DESC NULLS FIRST, fetch=2
+02)--Projection: string_topk_view.category, max(string_topk_view.val) AS max_val
+03)----Aggregate: groupBy=[[string_topk_view.category]], aggr=[[max(string_topk_view.val)]]
+04)------SubqueryAlias: string_topk_view
+05)--------Projection: string_topk.category AS category, string_topk.val AS val
+06)----------TableScan: string_topk projection=[category, val]
+physical_plan
+01)SortPreservingMergeExec: [max_val@1 DESC], fetch=2
+02)--SortExec: TopK(fetch=2), expr=[max_val@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[category@0 as category, max(string_topk_view.val)@1 as max_val]
+04)------AggregateExec: mode=FinalPartitioned, gby=[category@0 as category], aggr=[max(string_topk_view.val)], lim=[2]
+05)--------RepartitionExec: partitioning=Hash([category@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[category@0 as category], aggr=[max(string_topk_view.val)], lim=[2]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
 query TII
 select trace_id, min(other), MIN(timestamp) from traces group by trace_id order by MIN(timestamp), MIN(other) limit 4;
 ----
@@ -213,6 +267,30 @@ a -1 -1
 NULL 0 0
 c 1 2
 
+# Regression tests for string max with ORDER BY ... LIMIT to ensure schema stability
+query TT
+select trace_id, max(trace_id) as max_trace from traces group by trace_id order by max_trace desc limit 2;
+----
+c c
+b b
+
+query TT
+explain select trace_id, max(trace_id) as max_trace from traces group by trace_id order by max_trace desc limit 2;
+----
+logical_plan
+01)Sort: max_trace DESC NULLS FIRST, fetch=2
+02)--Projection: traces.trace_id, max(traces.trace_id) AS max_trace
+03)----Aggregate: groupBy=[[traces.trace_id]], aggr=[[max(traces.trace_id)]]
+04)------TableScan: traces projection=[trace_id]
+physical_plan
+01)SortPreservingMergeExec: [max_trace@1 DESC], fetch=2
+02)--SortExec: TopK(fetch=2), expr=[max_trace@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[trace_id@0 as trace_id, max(traces.trace_id)@1 as max_trace]
+04)------AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces.trace_id)], lim=[2]
+05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces.trace_id)], lim=[2]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
 
 # Setting to map varchar to utf8view, to test PR https://github.com/apache/datafusion/pull/15152
 # Before the PR, the test case would not work because the Utf8View will not be supported by the TopK aggregation
@@ -235,11 +313,9 @@ physical_plan
 01)SortPreservingMergeExec: [max(traces_utf8view.timestamp)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(traces_utf8view.timestamp)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces_utf8view.timestamp)], lim=[4]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_utf8view.timestamp)], lim=[4]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_utf8view.timestamp)], lim=[4]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 # Also add LargeUtf8 to test PR https://github.com/apache/datafusion/pull/15152
@@ -263,12 +339,128 @@ physical_plan
 01)SortPreservingMergeExec: [max(traces_largeutf8.timestamp)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(traces_largeutf8.timestamp)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[trace_id@0 as trace_id], aggr=[max(traces_largeutf8.timestamp)], lim=[4]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=4
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_largeutf8.timestamp)], lim=[4]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([trace_id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[trace_id@0 as trace_id], aggr=[max(traces_largeutf8.timestamp)], lim=[4]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 
+## Test GROUP BY with ORDER BY on the same column (no aggregate functions)
+statement ok
+CREATE TABLE ids(id int, value int) AS VALUES
+(1, 10),
+(2, 20),
+(3, 30),
+(4, 40),
+(1, 50),
+(2, 60),
+(5, 70);
+
+query TT
+explain select id from ids group by id order by id desc limit 3;
+----
+logical_plan
+01)Sort: ids.id DESC NULLS FIRST, fetch=3
+02)--Aggregate: groupBy=[[ids.id]], aggr=[[]]
+03)----TableScan: ids projection=[id]
+physical_plan
+01)SortPreservingMergeExec: [id@0 DESC], fetch=3
+02)--SortExec: TopK(fetch=3), expr=[id@0 DESC], preserve_partitioning=[true]
+03)----AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[], lim=[3]
+04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], lim=[3]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+select id from ids group by id order by id desc limit 3;
+----
+5
+4
+3
+
+query TT
+explain select id from ids group by id order by id asc limit 2;
+----
+logical_plan
+01)Sort: ids.id ASC NULLS LAST, fetch=2
+02)--Aggregate: groupBy=[[ids.id]], aggr=[[]]
+03)----TableScan: ids projection=[id]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=2
+02)--SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[], lim=[2]
+04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[], lim=[2]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+select id from ids group by id order by id asc limit 2;
+----
+1
+2
+
+# Test with larger limit than distinct values
+query I
+select id from ids group by id order by id desc limit 100;
+----
+5
+4
+3
+2
+1
+
+# Test with bigint group by
+statement ok
+CREATE TABLE values_table (value INT, category BIGINT) AS VALUES
+(10, 100),
+(20, 200),
+(30, 300),
+(40, 400),
+(50, 500),
+(20, 200),
+(10, 100),
+(40, 400);
+
+query TT
+explain select category from values_table group by category order by category desc limit 3;
+----
+logical_plan
+01)Sort: values_table.category DESC NULLS FIRST, fetch=3
+02)--Aggregate: groupBy=[[values_table.category]], aggr=[[]]
+03)----TableScan: values_table projection=[category]
+physical_plan
+01)SortPreservingMergeExec: [category@0 DESC], fetch=3
+02)--SortExec: TopK(fetch=3), expr=[category@0 DESC], preserve_partitioning=[true]
+03)----AggregateExec: mode=FinalPartitioned, gby=[category@0 as category], aggr=[], lim=[3]
+04)------RepartitionExec: partitioning=Hash([category@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[category@0 as category], aggr=[], lim=[3]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+select category from values_table group by category order by category desc limit 3;
+----
+500
+400
+300
+
+# Test with integer group by
+query I
+select value from values_table group by value order by value asc limit 3;
+----
+10
+20
+30
+
+# Test DISTINCT semantics are preserved
+query I
+select count(*) from (select category from values_table group by category order by category desc limit 3);
+----
+3
+
+statement ok
+drop table values_table;
+
+statement ok
+drop table ids;
+
 statement ok
 drop table traces;
diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt
index d89ba600d7a6b..7e6050d8e62f6 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -310,7 +310,7 @@ AS VALUES
 statement ok
 CREATE TABLE fixed_size_array_has_table_2D
 AS VALUES
-  (arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(1,3), 'FixedSizeList(2, Int64)'), arrow_cast(make_array([1,2,3], [4,5], [6,7]), 'FixedSizeList(3, List(Int64))'), arrow_cast(make_array([4,5], [6,7], [1,2]), 'FixedSizeList(3, List(Int64))')),
+  (arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(1,3), 'FixedSizeList(2, Int64)'), arrow_cast(make_array([1,2,3], [4,5], [6,7]), 'FixedSizeList(3, List(Int64))'), arrow_cast(make_array([4,5], [6,7], [1,2,3]), 'FixedSizeList(3, List(Int64))')),
   (arrow_cast(make_array([3,4], [5]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(5, 3), 'FixedSizeList(2, Int64)'), arrow_cast(make_array([1,2,3,4], [5,6,7], [8,9,10]), 'FixedSizeList(3, List(Int64))'), arrow_cast(make_array([1,2,3], [5,6,7], [8,9,10]), 'FixedSizeList(3, List(Int64))'))
 ;
 
@@ -362,6 +362,14 @@ AS VALUES
   (make_array(NULL, NULL, NULL), 2)
 ;
 
+statement ok
+CREATE TABLE array_has_table_empty
+AS VALUES
+  (make_array(1, 3, 5), 1),
+  (make_array(), 1),
+  (NULL, 1)
+;
+
 statement ok
 CREATE TABLE array_distinct_table_1D
 AS VALUES
@@ -495,6 +503,17 @@ AS
 FROM array_intersect_table_1D_UTF8
 ;
 
+statement ok
+CREATE TABLE array_intersect_table_1D_NULL
+AS VALUES
+  ([1, 2, 2, 3], [2, 3, 4]),
+  ([2, 3, 3], [3]),
+  ([3], [3, 3, 4]),
+  (null, [3, 4]),
+  ([1, 2], null),
+  (null, null)
+;
+
 statement ok
 CREATE TABLE array_intersect_table_2D
 AS VALUES
@@ -687,7 +706,7 @@ SELECT array_length([now()])
 query ?
 select [abs(-1.2), sin(-1), log(2), ceil(3.141)]
 ----
-[1.2, -0.8414709848078965, 0.3010299801826477, 4.0]
+[1.2, -0.8414709848078965, 0.30102999566398114, 4.0]
 
 ## array literal with nested types
 query ???
@@ -702,13 +721,13 @@ select
 query TTT
 select arrow_typeof(column1), arrow_typeof(column2), arrow_typeof(column3) from arrays;
 ----
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
+List(List(Int64)) List(Float64) List(Utf8)
 
 # arrays table
 query ???
@@ -1174,7 +1193,7 @@ select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'))
 query T
 select arrow_typeof(make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)')));
 ----
-List(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(LargeList(Int64))
 
 
 query ???
@@ -1513,6 +1532,11 @@ select input, array_max(input) from (select make_array(d - 1, d, d + 1) input fr
 [29, 30, 31] 31
 [NULL, NULL, NULL] NULL
 
+query II
+select array_max(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array_max(arrow_cast(make_array(1), 'LargeList(Int64)'));
+----
+3 1
+
 query II
 select array_max(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_max(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)'));
 ----
@@ -1527,6 +1551,96 @@ NULL
 query error DataFusion error: Error during planning: 'array_max' does not support zero arguments
 select array_max();
 
+## array_min
+
+query I
+select array_min(make_array(5, 3, 6, 4));
+----
+3
+
+query I
+select array_min(make_array(5, 3, 4, NULL, 6, NULL));
+----
+3
+
+query ?
+select array_min(make_array(NULL, NULL));
+----
+NULL
+
+query T
+select array_min(make_array('h', 'e', 'o', 'l', 'l'));
+----
+e
+
+query T
+select array_min(make_array('h', 'e', 'l', NULL, 'l', 'o', NULL));
+----
+e
+
+query B
+select array_min(make_array(false, true, false, true));
+----
+false
+
+query B
+select array_min(make_array(false, true, NULL, false, true));
+----
+false
+
+query D
+select array_min(make_array(DATE '1992-09-01', DATE '1993-03-01', DATE '1999-05-01', DATE '1985-11-01'));
+----
+1985-11-01
+
+query D
+select array_min(make_array(DATE '1995-09-01', DATE '1999-05-01', DATE '1993-03-01', NULL));
+----
+1993-03-01
+
+query P
+select array_min(make_array(TIMESTAMP '1992-09-01', TIMESTAMP '1995-06-01', TIMESTAMP '1984-10-01'));
+----
+1984-10-01T00:00:00
+
+query P
+select array_min(make_array(NULL, TIMESTAMP '1996-10-01', TIMESTAMP '1995-06-01'));
+----
+1995-06-01T00:00:00
+
+query R
+select array_min(make_array(5.1, -3.2, 6.3, 4.9));
+----
+-3.2
+
+query ?I
+select input, array_min(input) from (select make_array(d - 1, d, d + 1) input from (values (0), (10), (20), (30), (NULL)) t(d))
+----
+[-1, 0, 1] -1
+[9, 10, 11] 9
+[19, 20, 21] 19
+[29, 30, 31] 29
+[NULL, NULL, NULL] NULL
+
+query II
+select array_min(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array_min(arrow_cast(make_array(1), 'LargeList(Int64)'));
+----
+1 1
+
+query II
+select array_min(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_min(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)'));
+----
+1 1
+
+query ?
+select array_min(make_array());
+----
+NULL
+
+# Testing with empty arguments should result in an error
+query error DataFusion error: Error during planning: 'array_min' does not support zero arguments
+select array_min();
+
 
 ## array_pop_back (aliases: `list_pop_back`)
 
@@ -1840,11 +1954,30 @@ select array_slice(make_array(1, 2, 3, 4, 5), 5, 1, -2), array_slice(make_array(
 ----
 [5, 3, 1] [o, l, h]
 
+# Test NULL stride
+query ??
+select array_slice(make_array(1, 2, 3, 4, 5), 1, 5, NULL), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 1, 5, NULL);
+----
+NULL NULL
+
+# Test NULL stride
+query ??
+select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 1, 5, NULL),
+       array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 1, 5, NULL);
+----
+NULL NULL
+
 query ??
 select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 2, 4), array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 1, 2);
 ----
 [2, 3, 4] [h, e]
 
+query ??
+select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 2, 4),
+       array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 1, 2);
+----
+[2, 3, 4] [h, e]
+
 # array_slice scalar function #2 (with positive indexes; full array)
 query ??
 select array_slice(make_array(1, 2, 3, 4, 5), 0, 6), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 0, 5);
@@ -1856,6 +1989,20 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 0,
 ----
 [1, 2, 3, 4, 5] [h, e, l, l, o]
 
+# TODO make error message nicer: https://github.com/apache/datafusion/issues/19004
+# Expected output (once supported):
+# ----
+# [1, 2, 3, 4, 5] [h, e, l, l, o]
+query error Failed to coerce arguments to satisfy a call to 'array_slice' function:
+select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'ListView(Int64)'), 0, 6),
+       array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'ListView(Utf8)'), 0, 5);
+
+query ??
+select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 0, 6),
+       array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 0, 5);
+----
+[1, 2, 3, 4, 5] [h, e, l, l, o]
+
 # array_slice scalar function #3 (with positive indexes; first index = second index)
 query ??
 select array_slice(make_array(1, 2, 3, 4, 5), 4, 4), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 3, 3);
@@ -1889,6 +2036,15 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 2,
 ----
 [2, 3, 4, 5] [l, l, o]
 
+# TODO: Enable once array_slice supports LargeListView types.
+# Expected output (once supported):
+# ----
+# [2, 3, 4, 5] [l, l, o]
+query error Failed to coerce arguments to satisfy a call to 'array_slice' function:
+select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeListView(Int64)'), 2, 6),
+       array_slice(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeListView(Utf8)'), 3, 7);
+
+
 # array_slice scalar function #6 (with positive indexes; nested array)
 query ?
 select array_slice(make_array(make_array(1, 2, 3, 4, 5), make_array(6, 7, 8, 9, 10)), 1, 1);
@@ -2330,6 +2486,20 @@ select array_sort(make_array(1, 3, null, 5, NULL, -5)), array_sort(make_array(1,
 ----
 [NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1]
 
+query ???
+select array_sort(arrow_cast(make_array(1, 3, null, 5, NULL, -5), 'LargeList(Int64)')),
+       array_sort(arrow_cast(make_array(1, 3, null, 2), 'LargeList(Int64)'), 'ASC'),
+       array_sort(arrow_cast(make_array(1, 3, null, 2), 'LargeList(Int64)'), 'desc', 'NULLS FIRST');
+----
+[NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1]
+
+query ???
+select array_sort(arrow_cast(make_array(1, 3, null, 5, NULL, -5), 'FixedSizeList(6, Int64)')),
+       array_sort(arrow_cast(make_array(1, 3, null, 2), 'FixedSizeList(4, Int64)'), 'ASC'),
+       array_sort(arrow_cast(make_array(1, 3, null, 2), 'FixedSizeList(4, Int64)'), 'desc', 'NULLS FIRST');
+----
+[NULL, NULL, -5, 1, 3, 5] [NULL, 1, 2, 3] [NULL, 3, 2, 1]
+
 query ?
 select array_sort(column1, 'DESC', 'NULLS LAST') from arrays_values;
 ----
@@ -2407,6 +2577,31 @@ NULL NULL
 NULL NULL
 NULL NULL
 
+# maintains inner nullability
+query ?T
+select array_sort(column1), arrow_typeof(array_sort(column1))
+from values
+  (arrow_cast([], 'List(non-null Int32)')),
+  (arrow_cast(NULL, 'List(non-null Int32)')),
+  (arrow_cast([1, 3, 5, -5], 'List(non-null Int32)'))
+;
+----
+[] List(non-null Int32)
+NULL List(non-null Int32)
+[-5, 1, 3, 5] List(non-null Int32)
+
+query ?T
+select column1, arrow_typeof(column1)
+from values (array_sort(arrow_cast([1, 3, 5, -5], 'LargeList(non-null Int32)')));
+----
+[-5, 1, 3, 5] LargeList(non-null Int32)
+
+query ?T
+select column1, arrow_typeof(column1)
+from values (array_sort(arrow_cast([1, 3, 5, -5], 'FixedSizeList(4 x non-null Int32)')));
+----
+[-5, 1, 3, 5] List(non-null Int32)
+
 query ?
 select array_sort([struct('foo', 3), struct('foo', 1), struct('bar', 1)])
 ----
@@ -2695,7 +2890,6 @@ select array_append(column1, arrow_cast(make_array(1, 11, 111), 'FixedSizeList(3
 
 # DuckDB: [4]
 # ClickHouse: Null
-# Since they dont have the same result, we just follow Postgres, return error
 query ?
 select array_prepend(4, NULL);
 ----
@@ -3062,6 +3256,99 @@ drop table array_repeat_table;
 statement ok
 drop table large_array_repeat_table;
 
+# array_repeat: arrays with NULL counts
+statement ok
+create table array_repeat_null_count_table
+as values
+(1, 2),
+(2, null),
+(3, 1),
+(4, -1),
+(null, null);
+
+query I?
+select column1, array_repeat(column1, column2) from array_repeat_null_count_table;
+----
+1 [1, 1]
+2 NULL
+3 [3]
+4 []
+NULL NULL
+
+statement ok
+drop table array_repeat_null_count_table
+
+# array_repeat: nested arrays with NULL counts
+statement ok
+create table array_repeat_nested_null_count_table
+as values
+([[1, 2], [3, 4]], 2),
+([[5, 6], [7, 8]], null),
+([[null, null], [9, 10]], 1),
+(null, 3),
+([[11, 12]], -1);
+
+query ??
+select column1, array_repeat(column1, column2) from array_repeat_nested_null_count_table;
+----
+[[1, 2], [3, 4]] [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]
+[[5, 6], [7, 8]] NULL
+[[NULL, NULL], [9, 10]] [[[NULL, NULL], [9, 10]]]
+NULL [NULL, NULL, NULL]
+[[11, 12]] []
+
+statement ok
+drop table array_repeat_nested_null_count_table
+
+# array_repeat edge cases: empty arrays
+query ???
+select array_repeat([], 3), array_repeat([], 0), array_repeat([], null);
+----
+[[], [], []] [] NULL
+
+query ??
+select array_repeat(null::int, 0), array_repeat(null::int, null);
+----
+[] NULL
+
+# array_repeat LargeList with NULL count
+statement ok
+create table array_repeat_large_list_null_table
+as values
+(arrow_cast([1, 2, 3], 'LargeList(Int64)'), 2),
+(arrow_cast([4, 5], 'LargeList(Int64)'), null),
+(arrow_cast(null, 'LargeList(Int64)'), 3);
+
+query ??
+select column1, array_repeat(column1, column2) from array_repeat_large_list_null_table;
+----
+[1, 2, 3] [[1, 2, 3], [1, 2, 3]]
+[4, 5] NULL
+NULL [NULL, NULL, NULL]
+
+statement ok
+drop table array_repeat_large_list_null_table
+
+# array_repeat edge cases: LargeList nested with NULL count
+statement ok
+create table array_repeat_large_nested_null_table
+as values
+(arrow_cast([[1, 2], [3, 4]], 'LargeList(List(Int64))'), 2),
+(arrow_cast([[5, 6], [7, 8]], 'LargeList(List(Int64))'), null),
+(arrow_cast([[null, null]], 'LargeList(List(Int64))'), 1),
+(null, 3);
+
+query ??
+select column1, array_repeat(column1, column2) from array_repeat_large_nested_null_table;
+----
+[[1, 2], [3, 4]] [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]
+[[5, 6], [7, 8]] NULL
+[[NULL, NULL]] [[[NULL, NULL]]]
+NULL [NULL, NULL, NULL]
+
+statement ok
+drop table array_repeat_large_nested_null_table
+
 ## array_concat (aliases: `array_cat`, `list_concat`, `list_cat`)
 
 # test with empty array
@@ -3070,6 +3357,42 @@ select array_concat([]);
 ----
 []
 
+# test with NULL array
+query ?
+select array_concat(NULL::integer[]);
+----
+NULL
+
+# test with multiple NULL arrays
+query ?
+select array_concat(NULL::integer[], NULL::integer[]);
+----
+NULL
+
+# test with NULL LargeList
+query ?
+select array_concat(arrow_cast(NULL::string[], 'LargeList(Utf8)'));
+----
+NULL
+
+# test with NULL FixedSizeList
+query ?
+select array_concat(arrow_cast(NULL::string[], 'FixedSizeList(2, Utf8)'));
+----
+NULL
+
+# test with mix of NULL and empty arrays
+query ?
+select array_concat(NULL::integer[], []);
+----
+[]
+
+# test with mix of NULL and non-empty arrays
+query ?
+select array_concat(NULL::integer[], [1, 2, 3]);
+----
+[1, 2, 3]
+
 # Concatenating strings arrays
 query ?
 select array_concat(
@@ -3079,6 +3402,22 @@ select array_concat(
 ----
 [1, 2, 3]
 
+query ?
+select array_concat(
+  arrow_cast(['1', '2'], 'LargeList(Utf8)'),
+  arrow_cast(['3'], 'LargeList(Utf8)')
+);
+----
+[1, 2, 3]
+
+query ?
+select array_concat(
+  arrow_cast(['1', '2'], 'FixedSizeList(2, Utf8)'),
+  arrow_cast(['3'], 'FixedSizeList(1, Utf8)')
+);
+----
+[1, 2, 3]
+
 # Concatenating string arrays
 query ?
 select array_concat(
@@ -3112,10 +3451,26 @@ select
     array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')]),
     arrow_typeof(array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')]));
 ----
-[1, 2, 3] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+[1, 2, 3] List(Utf8View)
+
+# array_concat with NULL elements inside arrays
+query ?
+select array_concat([1, NULL, 3], [NULL, 5]);
+----
+[1, NULL, 3, NULL, 5]
+
+query ?
+select array_concat([NULL, NULL], [1, 2], [NULL]);
+----
+[NULL, NULL, 1, 2, NULL]
+
+query ?
+select array_concat([NULL, NULL], [NULL, NULL]);
+----
+[NULL, NULL, NULL, NULL]
 
 # array_concat error
-query error DataFusion error: Error during planning: Execution error: Function 'array_concat' user-defined coercion failed with "Error during planning: array_concat does not support type Int64"
+query error DataFusion error: Error during planning: Execution error: Function 'array_concat' user-defined coercion failed with: Error during planning: array_concat does not support type Int64
 select array_concat(1, 2);
 
 # array_concat scalar function #1
@@ -3367,10 +3722,16 @@ select array_concat(make_array(column3), column1, column2) from arrays_values_v2
 ## array_position (aliases: `list_position`, `array_indexof`, `list_indexof`)
 
 ## array_position with NULL (follow PostgreSQL)
-#query I
-#select array_position([1, 2, 3, 4, 5], null), array_position(NULL, 1);
-#----
-#NULL NULL
+query II
+select array_position([1, 2, 3, 4, 5], arrow_cast(NULL, 'Int64')), array_position(arrow_cast(NULL, 'List(Int64)'), 1);
+----
+NULL NULL
+
+# array_position with no match (incl. empty array) returns NULL
+query II
+select array_position([], 1), array_position([2], 1);
+----
+NULL NULL
 
 # array_position scalar function #1
 query III
@@ -3383,6 +3744,11 @@ select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'),
 ----
 3 5 1
 
+query III
+select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l'), array_position(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5), array_position(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1);
+----
+3 5 1
+
 # array_position scalar function #2 (with optional argument)
 query III
 select array_position(['h', 'e', 'l', 'l', 'o'], 'l', 4), array_position([1, 2, 5, 4, 5], 5, 4), array_position([1, 1, 1], 1, 2);
@@ -3394,6 +3760,11 @@ select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'),
 ----
 4 5 2
 
+query III
+select array_position(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l', 4), array_position(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5, 4), array_position(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1, 2);
+----
+4 5 2
+
 # array_position scalar function #3 (element is list)
 query II
 select array_position(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), [4, 5, 6]), array_position(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), [2, 3, 4]);
@@ -3525,47 +3896,158 @@ select array_position(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [11, 12, 13]),
 NULL 6 4
 NULL 1 NULL
 
-## array_positions (aliases: `list_positions`)
-
-query ?
-select array_positions([1, 2, 3, 4, 5], null);
+# array_position with NULL element in haystack array (NULL = NULL semantics)
+query III
+select array_position([1, NULL, 3], arrow_cast(NULL, 'Int64')), array_position([NULL, 2, 3], arrow_cast(NULL, 'Int64')), array_position([1, 2, NULL], arrow_cast(NULL, 'Int64'));
 ----
-[]
-
-#TODO: https://github.com/apache/datafusion/issues/7142
-# array_positions with NULL (follow PostgreSQL)
-#query ?
-#select array_positions(null, 1);
-#----
-#NULL
+2 1 3
 
-# array_positions scalar function #1
-query ???
-select array_positions(['h', 'e', 'l', 'l', 'o'], 'l'), array_positions([1, 2, 3, 4, 5], 5), array_positions([1, 1, 1], 1);
+query I
+select array_position(arrow_cast([1, NULL, 3], 'LargeList(Int64)'), arrow_cast(NULL, 'Int64'));
 ----
-[3, 4] [5] [1, 2, 3]
+2
 
-query ???
-select array_positions(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'), 'l'), array_positions(arrow_cast([1, 2, 3, 4, 5], 'LargeList(Int64)'), 5), array_positions(arrow_cast([1, 1, 1], 'LargeList(Int64)'), 1);
+# array_position with NULL element in array and start_from
+query II
+select array_position([NULL, 1, NULL, 2], arrow_cast(NULL, 'Int64'), 2), array_position([NULL, 1, NULL, 2], arrow_cast(NULL, 'Int64'), 1);
 ----
-[3, 4] [5] [1, 2, 3]
+3 1
 
-query ???
-select array_positions(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l'), array_positions(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5), array_positions(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1);
+# array_position with column array and scalar element
+query IIII
+select array_position(column1, 3), array_position(column1, 10), array_position(column1, 20), array_position(column1, 999) from arrays_values_without_nulls;
 ----
-[3, 4] [5] [1, 2, 3]
+3 10 NULL NULL
+NULL NULL 10 NULL
+NULL NULL NULL NULL
+NULL NULL NULL NULL
 
-# array_positions scalar function #2 (element is list)
-query ?
-select array_positions(make_array([1, 2, 3], [2, 1, 3], [1, 5, 6], [2, 1, 3], [4, 5, 6]), [2, 1, 3]);
+query II
+select array_position(column1, 3), array_position(column1, 20) from large_arrays_values_without_nulls;
 ----
-[2, 4]
+3 NULL
+NULL 10
+NULL NULL
+NULL NULL
 
-query ?
-select array_positions(arrow_cast(make_array([1, 2, 3], [2, 1, 3], [1, 5, 6], [2, 1, 3], [4, 5, 6]), 'LargeList(List(Int64))'), [2, 1, 3]);
+query II
+select array_position(column1, 3), array_position(column1, 20) from fixed_size_arrays_values_without_nulls;
 ----
-[2, 4]
-
+3 NULL
+NULL 10
+NULL NULL
+NULL NULL
+
+# array_position with column array, scalar element, and scalar start_from
+query II
+select array_position(column1, 3, 1), array_position(column1, 3, 4) from arrays_values_without_nulls;
+----
+3 NULL
+NULL NULL
+NULL NULL
+NULL NULL
+
+query II
+select array_position(column1, 3, 1), array_position(column1, 3, 4) from large_arrays_values_without_nulls;
+----
+3 NULL
+NULL NULL
+NULL NULL
+NULL NULL
+
+# array_position with column array, scalar element, and column start_from
+query I
+select array_position(column1, 3, column3) from arrays_values_without_nulls;
+----
+3
+NULL
+NULL
+NULL
+
+# array_position with scalar haystack, scalar element, and column start_from
+query I
+select array_position([1, 2, 1, 2], 2, column3) from arrays_values_without_nulls;
+----
+2
+2
+4
+4
+
+# array_position start_from boundary cases
+query IIII
+select array_position([1, 2, 3], 3, 3), array_position([1, 2, 3], 1, 2), array_position([1, 2, 3], 1, 1), array_position([1, 2, 3], 3, 4);
+----
+3 NULL 1 NULL
+
+query II
+select array_position([1, 2, 3], 3, 4), array_position([1], 1, 2);
+----
+NULL NULL
+
+# array_position with empty array in various contexts
+query II
+select array_position(arrow_cast(make_array(), 'List(Int64)'), 1), array_position(arrow_cast(make_array(), 'LargeList(Int64)'), 1);
+----
+NULL NULL
+
+# FixedSizeList with start_from
+query II
+select array_position(arrow_cast([1, 2, 3, 1, 2], 'FixedSizeList(5, Int64)'), 1, 2), array_position(arrow_cast([1, 2, 3, 1, 2], 'FixedSizeList(5, Int64)'), 2, 4);
+----
+4 5
+
+query I
+select array_position(arrow_cast(['a', 'b', 'c', 'b'], 'FixedSizeList(4, Utf8)'), 'b', 3);
+----
+4
+
+## array_positions (aliases: `list_positions`)
+
+# array_positions with empty array
+query ?
+select array_positions(arrow_cast(make_array(), 'List(Int64)'), 1);
+----
+[]
+
+query ?
+select array_positions([1, 2, 3, 4, 5], null);
+----
+[]
+
+#TODO: https://github.com/apache/datafusion/issues/7142
+# array_positions with NULL (follow PostgreSQL)
+#query ?
+#select array_positions(null, 1);
+#----
+#NULL
+
+# array_positions scalar function #1
+query ???
+select array_positions(['h', 'e', 'l', 'l', 'o'], 'l'), array_positions([1, 2, 3, 4, 5], 5), array_positions([1, 1, 1], 1);
+----
+[3, 4] [5] [1, 2, 3]
+
+query ???
+select array_positions(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'), 'l'), array_positions(arrow_cast([1, 2, 3, 4, 5], 'LargeList(Int64)'), 5), array_positions(arrow_cast([1, 1, 1], 'LargeList(Int64)'), 1);
+----
+[3, 4] [5] [1, 2, 3]
+
+query ???
+select array_positions(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), 'l'), array_positions(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), 5), array_positions(arrow_cast([1, 1, 1], 'FixedSizeList(3, Int64)'), 1);
+----
+[3, 4] [5] [1, 2, 3]
+
+# array_positions scalar function #2 (element is list)
+query ?
+select array_positions(make_array([1, 2, 3], [2, 1, 3], [1, 5, 6], [2, 1, 3], [4, 5, 6]), [2, 1, 3]);
+----
+[2, 4]
+
+query ?
+select array_positions(arrow_cast(make_array([1, 2, 3], [2, 1, 3], [1, 5, 6], [2, 1, 3], [4, 5, 6]), 'LargeList(List(Int64))'), [2, 1, 3]);
+----
+[2, 4]
+
 query ?
 select array_positions(arrow_cast(make_array([1, 2, 3], [2, 1, 3], [1, 5, 6], [2, 1, 3], [4, 5, 6]), 'FixedSizeList(5, List(Int64))'), [2, 1, 3]);
 ----
@@ -3696,6 +4178,14 @@ select
 ----
 [1, 3, 3, 4] [1, 0, 4, 5, 4, 6, 7] [1, 2, 3]
 
+query ???
+select
+  array_replace(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3),
+  array_replace(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0),
+  array_replace(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0);
+----
+[1, 3, 3, 4] [1, 0, 4, 5, 4, 6, 7] [1, 2, 3]
+
 # array_replace scalar function #2 (element is list)
 query ??
 select
@@ -3727,6 +4217,21 @@ select
 ----
 [[1, 2, 3], [1, 1, 1], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select
+  array_replace(
+    arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+    [4, 5, 6],
+    [1, 1, 1]
+  ),
+  array_replace(
+    arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+    [2, 3, 4],
+    [3, 1, 4]
+  );
+----
+[[1, 2, 3], [1, 1, 1], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]]
+
 # list_replace scalar function #3 (function alias `list_replace`)
 query ???
 select list_replace(
@@ -3868,6 +4373,14 @@ select
 ----
 [1, 3, 3, 4] [1, 0, 0, 5, 4, 6, 7] [1, 2, 3]
 
+query ???
+select
+  array_replace_n(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3, 2),
+  array_replace_n(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0, 2),
+  array_replace_n(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0, 3);
+----
+[1, 3, 3, 4] [1, 0, 0, 5, 4, 6, 7] [1, 2, 3]
+
 # array_replace_n scalar function #2 (element is list)
 query ??
 select
@@ -3903,6 +4416,23 @@ select
 ----
 [[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select
+  array_replace_n(
+    arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+      [4, 5, 6],
+      [1, 1, 1],
+      2
+    ),
+  array_replace_n(
+    arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+    [2, 3, 4],
+    [3, 1, 4],
+    2
+  );
+----
+[[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]]
+
 # list_replace_n scalar function #3 (function alias `array_replace_n`)
 query ???
 select
@@ -4059,6 +4589,14 @@ select
 ----
 [1, 3, 3, 4] [1, 0, 0, 5, 0, 6, 7] [1, 2, 3]
 
+query ???
+select
+  array_replace_all(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int64)'), 2, 3),
+  array_replace_all(arrow_cast(make_array(1, 4, 4, 5, 4, 6, 7), 'FixedSizeList(7, Int64)'), 4, 0),
+  array_replace_all(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4, 0);
+----
+[1, 3, 3, 4] [1, 0, 0, 5, 0, 6, 7] [1, 2, 3]
+
 # array_replace_all scalar function #2 (element is list)
 query ??
 select
@@ -4090,6 +4628,21 @@ select
 ----
 [[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select
+  array_replace_all(
+    arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+    [4, 5, 6],
+    [1, 1, 1]
+  ),
+  array_replace_all(
+    arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'),
+    [2, 3, 4],
+    [3, 1, 4]
+  );
+----
+[[1, 2, 3], [1, 1, 1], [5, 5, 5], [1, 1, 1], [7, 8, 9]] [[1, 3, 2], [3, 1, 4], [3, 1, 4], [5, 3, 1], [1, 3, 2]]
+
 # list_replace_all scalar function #3 (function alias `array_replace_all`)
 query ???
 select
@@ -4318,7 +4871,7 @@ NULL [baz] baz
 query T
 SELECT arrow_typeof(make_array(arrow_cast('a', 'Utf8View'), 'b', 'c', 'd'));
 ----
-List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(Utf8View)
 
 # expect a,b,c,d. make_array forces all types to be of a common type (see above)
 query T
@@ -4355,6 +4908,16 @@ select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5,
 ----
 [1, 2, 3, 4, 5, 6]
 
+query ?
+select array_union(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6, 3, 4], 'FixedSizeList(4, Int64)'));
+----
+[1, 2, 3, 4, 5, 6]
+
+query ?
+select array_union(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6], 'FixedSizeList(2, Int64)'));
+----
+[1, 2, 3, 4, 5, 6]
+
 # array_union scalar function #2
 query ?
 select array_union([1, 2, 3, 4], [5, 6, 7, 8]);
@@ -4429,10 +4992,11 @@ select array_union(arrow_cast([], 'LargeList(Int64)'), arrow_cast([], 'LargeList
 []
 
 # array_union scalar function #7
-query ?
-select array_union([[null]], []);
-----
-[[]]
+# re-enable when https://github.com/apache/arrow-rs/issues/9227 is fixed
+# query ?
+# select array_union([[null]], []);
+# ----
+# [[]]
 
 query error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'array_union' function:
 select array_union(arrow_cast([[null]], 'LargeList(List(Int64))'), arrow_cast([], 'LargeList(Int64)'));
@@ -4452,12 +5016,12 @@ select array_union(arrow_cast([[null]], 'LargeList(List(Int64))'), arrow_cast([[
 query ?
 select array_union(null, []);
 ----
-[]
+NULL
 
 query ?
 select array_union(null, arrow_cast([], 'LargeList(Int64)'));
 ----
-[]
+NULL
 
 # array_union scalar function #10
 query ?
@@ -4469,23 +5033,23 @@ NULL
 query ?
 select array_union([1, 1, 2, 2, 3, 3], null);
 ----
-[1, 2, 3]
+NULL
 
 query ?
 select array_union(arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'), null);
 ----
-[1, 2, 3]
+NULL
 
 # array_union scalar function #12
 query ?
 select array_union(null, [1, 1, 2, 2, 3, 3]);
 ----
-[1, 2, 3]
+NULL
 
 query ?
 select array_union(null, arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'));
 ----
-[1, 2, 3]
+NULL
 
 # array_union scalar function #13
 query ?
@@ -4509,6 +5073,46 @@ select array_union(arrow_cast(['hello'], 'LargeList(Utf8)'), arrow_cast(['hello'
 ----
 [hello, datafusion]
 
+query ?
+select array_union(column1, column2)
+from array_intersect_table_1D_NULL;
+----
+[1, 2, 3, 4]
+[2, 3]
+[3, 4]
+NULL
+NULL
+NULL
+
+query ?
+select array_union(arrow_cast(null, 'List(Int64)'), [1, 2]);
+----
+NULL
+
+query ?
+select array_union([1, 2], arrow_cast(null, 'List(Int64)'));
+----
+NULL
+
+query ?
+select array_intersect(arrow_cast(null, 'List(Int64)'), [1, 2]);
+----
+NULL
+
+query ?
+select array_intersect([1, 2], arrow_cast(null, 'List(Int64)'));
+----
+NULL
+
+query ?
+select array_except(arrow_cast(null, 'List(Int64)'), [1, 2]);
+----
+NULL
+
+query ?
+select array_except([1, 2], arrow_cast(null, 'List(Int64)'));
+----
+NULL
 
 # list_to_string scalar function #4 (function alias `array_to_string`)
 query TTT
@@ -4554,6 +5158,11 @@ select array_to_string(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'LargeList(Utf8)'),
 ----
 h,e,l,l,o 1-2-3-4-5 1|2|3
 
+query TTT
+select array_to_string(arrow_cast(['h', 'e', 'l', 'l', 'o'], 'FixedSizeList(5, Utf8)'), ','), array_to_string(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)'), '-'), array_to_string(arrow_cast([1.0, 2.0, 3.0], 'FixedSizeList(3, Float64)'), '|');
+----
+h,e,l,l,o 1-2-3-4-5 1|2|3
+
 # array_to_string scalar function with nulls #2
 query TTT
 select array_to_string(make_array('h', NULL, NULL, NULL, 'o'), ',', '-'), array_to_string(make_array(NULL, 2, NULL, 4, 5), '-', 'nil'), array_to_string(make_array(1.0, NULL, 3.0), '|', '0');
@@ -4565,6 +5174,43 @@ select array_to_string(arrow_cast(make_array('h', NULL, NULL, NULL, 'o'), 'Large
 ----
 h,-,-,-,o nil-2-nil-4-5 1|0|3
 
+query TTT
+select array_to_string(arrow_cast(make_array('h', NULL, NULL, NULL, 'o'), 'FixedSizeList(5, Utf8)'), ',', '-'), array_to_string(arrow_cast(make_array(NULL, 2, NULL, 4, 5), 'FixedSizeList(5, Int64)'), '-', 'nil'), array_to_string(arrow_cast(make_array(1.0, NULL, 3.0), 'FixedSizeList(3, Float64)'), '|', '0');
+----
+h,-,-,-,o nil-2-nil-4-5 1|0|3
+
+# array_to_string float formatting: special values and longer decimals
+query TTT
+select
+  array_to_string(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), '|'),
+  array_to_string(arrow_cast(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), 'LargeList(Float64)'), '|'),
+  array_to_string(arrow_cast(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), 'FixedSizeList(5, Float64)'), '|');
+----
+NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567
+
+# array_to_string float formatting: scientific-notation inputs
+query T
+select array_to_string(
+  make_array(
+    CAST('1E20' AS DOUBLE),
+    CAST('-1e+20' AS DOUBLE),
+    CAST('6.02214076e23' AS DOUBLE),
+    CAST('1.2345e6' AS DOUBLE),
+    CAST('1e-5' AS DOUBLE),
+    CAST('-1e-5' AS DOUBLE),
+    CAST('9.1093837015e-31' AS DOUBLE),
+    CAST('-2.5e-4' AS DOUBLE)
+  ),
+  '|'
+);
+----
+100000000000000000000|-100000000000000000000|602214076000000000000000|1234500|0.00001|-0.00001|0.00000000000000000000000000000091093837015|-0.00025
+
+query T
+select array_to_string(arrow_cast([arrow_cast([NULL, 'a'], 'FixedSizeList(2, Utf8)'), NULL], 'FixedSizeList(2, FixedSizeList(2, Utf8))'), ',', '-');
+----
+-,a,-
+
 # array_to_string with columns #1
 
 # For reference
@@ -4651,6 +5297,87 @@ NULL 1.2.3
 51_52_*_54_55_56_57_58_59_60 1.2.3
 61_62_63_64_65_66_67_68_69_70 1.2.3
 
+# array_to_string with per-row null_string column
+statement ok
+CREATE TABLE test_null_str_col AS VALUES
+  (make_array(1, NULL, 3), ',', 'N/A'),
+  (make_array(NULL, 5, NULL), ',', 'MISSING'),
+  (make_array(10, NULL, 12), '-', 'X'),
+  (make_array(20, NULL, 21), '-', NULL);
+
+query T
+SELECT array_to_string(column1, column2, column3) FROM test_null_str_col;
+----
+1,N/A,3
+MISSING,5,MISSING
+10-X-12
+20-21
+
+statement ok
+DROP TABLE test_null_str_col;
+
+# array_to_string with decimal values
+query T
+select array_to_string(arrow_cast(make_array(1.5, NULL, 3.14), 'List(Decimal128(10, 2))'), ',', 'N');
+----
+1.50,N,3.14
+
+# array_to_string with date values
+query T
+select array_to_string(arrow_cast(make_array('2024-01-15', '2024-06-30', '2024-12-25'), 'List(Date32)'), ',');
+----
+2024-01-15,2024-06-30,2024-12-25
+
+query T
+select array_to_string(arrow_cast(make_array('2024-01-15', NULL, '2024-12-25'), 'List(Date32)'), ',', 'N');
+----
+2024-01-15,N,2024-12-25
+
+# array_to_string with timestamp values
+query T
+select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Second, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Second, None)')), '|');
+----
+2024-01-15T10:30:00|2024-06-30T15:45:00
+
+query T
+select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Millisecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Millisecond, None)')), '|');
+----
+2024-01-15T10:30:00|2024-06-30T15:45:00
+
+query T
+select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Microsecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Microsecond, None)')), '|');
+----
+2024-01-15T10:30:00|2024-06-30T15:45:00
+
+query T
+select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Nanosecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Nanosecond, None)')), '|');
+----
+2024-01-15T10:30:00|2024-06-30T15:45:00
+
+# array_to_string with time values
+query T
+select array_to_string(make_array(arrow_cast('10:30:00', 'Time32(Second)'), arrow_cast('15:45:00', 'Time32(Second)')), ',');
+----
+10:30:00,15:45:00
+
+query T
+select array_to_string(make_array(arrow_cast('10:30:00', 'Time64(Microsecond)'), arrow_cast('15:45:00', 'Time64(Microsecond)')), ',');
+----
+10:30:00,15:45:00
+
+# array_to_string with interval values
+query T
+select array_to_string(make_array(interval '1 year 2 months', interval '3 days 4 hours'), ',');
+----
+14 mons,3 days 4 hours
+
+# array_to_string with duration values
+query T
+select array_to_string(make_array(arrow_cast(1000, 'Duration(Millisecond)'), arrow_cast(2000, 'Duration(Millisecond)')), ',');
+----
+PT1S,PT2S
+
+
 ## cardinality
 
 # cardinality scalar function
@@ -4689,12 +5416,17 @@ select cardinality(arrow_cast([[1, 2], [3, 4], [5, 6]], 'FixedSizeList(3, List(I
 query II
 select cardinality(make_array()), cardinality(make_array(make_array()))
 ----
-NULL 0
+0 0
+
+query II
+select cardinality([]), cardinality([]::int[]) as with_cast
+----
+0 0
 
 query II
 select cardinality(arrow_cast(make_array(), 'LargeList(Int64)')), cardinality(arrow_cast(make_array(make_array()), 'LargeList(List(Int64))'))
 ----
-NULL 0
+0 0
 
 #TODO
 #https://github.com/apache/datafusion/issues/9158
@@ -4703,6 +5435,12 @@ NULL 0
 #----
 #NULL 0
 
+# cardinality of NULL arrays should return NULL
+query II
+select cardinality(NULL), cardinality(arrow_cast(NULL, 'LargeList(Int64)'))
+----
+NULL NULL
+
 # cardinality with columns
 query III
 select cardinality(column1), cardinality(column2), cardinality(column3) from arrays;
@@ -4796,21 +5534,47 @@ select
  array_remove(make_array(1, null, 2), null),
  array_remove(make_array(1, null, 2, null), null);
 ----
-[1, 2] [1, 2, NULL]
+NULL NULL
 
 query ??
 select
  array_remove(arrow_cast(make_array(1, null, 2), 'LargeList(Int64)'), null),
  array_remove(arrow_cast(make_array(1, null, 2, null), 'LargeList(Int64)'), null);
 ----
-[1, 2] [1, 2, NULL]
+NULL NULL
 
 query ??
 select
  array_remove(arrow_cast(make_array(1, null, 2), 'FixedSizeList(3, Int64)'), null),
  array_remove(arrow_cast(make_array(1, null, 2, null), 'FixedSizeList(4, Int64)'), null);
 ----
-[1, 2] [1, 2, NULL]
+NULL NULL
+
+# array_remove with null element from column
+query ?
+select array_remove(column1, column2) from (values
+  (make_array(1, 2, 3), 2),
+  (make_array(4, 5, 6), null),
+  (make_array(7, 8, 9), 8),
+  (null, 1)
+) as t(column1, column2);
+----
+[1, 3]
+NULL
+[7, 9]
+NULL
+
+# array_remove with null element from column (LargeList)
+query ?
+select array_remove(column1, column2) from (values
+  (arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 2),
+  (arrow_cast(make_array(4, 5, 6), 'LargeList(Int64)'), null),
+  (arrow_cast(make_array(7, 8, 9), 'LargeList(Int64)'), 8)
+) as t(column1, column2);
+----
+[1, 3]
+NULL
+[7, 9]
 
 # array_remove scalar function #2 (element is list)
 query ??
@@ -4830,6 +5594,12 @@ select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5
 ----
 [[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6]),
+       array_remove(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4]);
+----
+[[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]]
+
 # list_remove scalar function #3 (function alias `array_remove`)
 query ???
 select list_remove(make_array(1, 2, 2, 1, 1), 2), list_remove(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0), list_remove(make_array('h', 'e', 'l', 'l', 'o'), 'l');
@@ -4947,18 +5717,84 @@ select array_remove(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [1
 
 ## array_remove_n (aliases: `list_remove_n`)
 
+# array_remove_n with null element scalar
+query ??
+select array_remove_n(make_array(1, 2, 2, 1, 1), NULL, 2),
+       array_remove_n(make_array(1, 2, 2, 1, 1), 2, 2);
+----
+NULL [1, 1, 1]
+
+# array_remove_n with null element scalar (LargeList)
+query ??
+select array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), NULL, 2),
+       array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2, 2);
+----
+NULL [1, 1, 1]
+
+# array_remove_n with null element from column
+query ?
+select array_remove_n(column1, column2, column3) from (values
+  (make_array(1, 2, 2, 1, 1), 2, 2),
+  (make_array(3, 4, 4, 3, 3), null, 2),
+  (make_array(5, 6, 6, 5, 5), 6, 1),
+  (null, 1, 1)
+) as t(column1, column2, column3);
+----
+[1, 1, 1]
+NULL
+[5, 6, 5, 5]
+NULL
+
+# array_remove_n with null element from column (LargeList)
+query ?
+select array_remove_n(column1, column2, column3) from (values
+  (arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2, 2),
+  (arrow_cast(make_array(3, 4, 4, 3, 3), 'LargeList(Int64)'), null, 2),
+  (arrow_cast(make_array(5, 6, 6, 5, 5), 'LargeList(Int64)'), 6, 1)
+) as t(column1, column2, column3);
+----
+[1, 1, 1]
+NULL
+[5, 6, 5, 5]
+
 # array_remove_n scalar function #1
 query ???
 select array_remove_n(make_array(1, 2, 2, 1, 1), 2, 2), array_remove_n(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0, 2), array_remove_n(make_array('h', 'e', 'l', 'l', 'o'), 'l', 3);
 ----
 [1, 1, 1] [2.0, 2.0, 1.0] [h, e, o]
 
+query ???
+select array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int32)'), 2, 2),
+       array_remove_n(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'LargeList(Float32)'), 1.0, 2),
+       array_remove_n(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 'l', 3);
+----
+[1, 1, 1] [2.0, 2.0, 1.0] [h, e, o]
+
+query ???
+select array_remove_n(arrow_cast(make_array(1, 2, 2, 1, 1), 'FixedSizeList(5, Int32)'), 2, 2),
+       array_remove_n(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'FixedSizeList(5, Float32)'), 1.0, 2),
+       array_remove_n(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 'l', 3);
+----
+[1, 1, 1] [2.0, 2.0, 1.0] [h, e, o]
+
 # array_remove_n scalar function #2 (element is list)
 query ??
 select array_remove_n(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), [4, 5, 6], 2), array_remove_n(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), [2, 3, 4], 2);
 ----
 [[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select array_remove_n(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'LargeList(List(Int64))'), [4, 5, 6], 2),
+       array_remove_n(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'LargeList(List(Int64))'), [2, 3, 4], 2);
+----
+[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]]
+
+query ??
+select array_remove_n(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6], 2),
+       array_remove_n(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4], 2);
+----
+[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]]
+
 # list_remove_n scalar function #3 (function alias `array_remove_n`)
 query ???
 select list_remove_n(make_array(1, 2, 2, 1, 1), 2, 2), list_remove_n(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0, 2), list_remove_n(make_array('h', 'e', 'l', 'l', 'o'), 'l', 3);
@@ -5013,7 +5849,33 @@ select array_remove_n(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12],
 query ?
 select array_remove_all(make_array(1, 2, 2, 1, 1), NULL);
 ----
-[1, 2, 2, 1, 1]
+NULL
+
+# array_remove_all with null element from column
+query ?
+select array_remove_all(column1, column2) from (values
+  (make_array(1, 2, 2, 1, 1), 2),
+  (make_array(3, 4, 4, 3, 3), null),
+  (make_array(5, 6, 6, 5, 5), 6),
+  (null, 1)
+) as t(column1, column2);
+----
+[1, 1, 1]
+NULL
+[5, 5, 5]
+NULL
+
+# array_remove_all with null element from column (LargeList)
+query ?
+select array_remove_all(column1, column2) from (values
+  (arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2),
+  (arrow_cast(make_array(3, 4, 4, 3, 3), 'LargeList(Int64)'), null),
+  (arrow_cast(make_array(5, 6, 6, 5, 5), 'LargeList(Int64)'), 6)
+) as t(column1, column2);
+----
+[1, 1, 1]
+NULL
+[5, 5, 5]
 
 # array_remove_all scalar function #1
 query ???
@@ -5021,6 +5883,13 @@ select array_remove_all(make_array(1, 2, 2, 1, 1), 2), array_remove_all(make_arr
 ----
 [1, 1, 1] [2.0, 2.0] [h, e, o]
 
+query ???
+select array_remove_all(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2),
+       array_remove_all(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'LargeList(Float64)'), 1.0),
+       array_remove_all(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 'l');
+----
+[1, 1, 1] [2.0, 2.0] [h, e, o]
+
 query ???
 select array_remove_all(arrow_cast(make_array(1, 2, 2, 1, 1), 'FixedSizeList(5, Int64)'), 2), array_remove_all(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'FixedSizeList(5, Float64)'), 1.0), array_remove_all(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 'l');
 ----
@@ -5038,6 +5907,12 @@ select array_remove_all(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [
 ----
 [[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]]
 
+query ??
+select array_remove_all(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, FixedSizeList(3, Int64))'), [4, 5, 6]),
+       array_remove_all(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]),  'FixedSizeList(5, FixedSizeList(3, Int64))'), [2, 3, 4]);
+----
+[[1, 2, 3], [5, 5, 5], [7, 8, 9]] [[1, 3, 2], [5, 3, 1], [1, 3, 2]]
+
 # list_remove_all scalar function #3 (function alias `array_remove_all`)
 query ???
 select list_remove_all(make_array(1, 2, 2, 1, 1), 2), list_remove_all(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 1.0), list_remove_all(make_array('h', 'e', 'l', 'l', 'o'), 'l');
@@ -5700,6 +6575,30 @@ false
 false
 false
 
+# array_has([1, 3, 5], 1) -> true (array contains element)
+# array_has([], 1) -> false (empty array, not null)
+# array_has(null, 1) -> null (null array)
+query BB
+select array_has(column1, column2), array_has(null, column2)
+from array_has_table_empty;
+----
+true NULL
+false NULL
+NULL NULL
+
+# Test for issue: array_has should return false for empty arrays, not null
+# This test demonstrates the correct behavior with COALESCE to show the distinction
+# array_has([1, 3, 5], 1) -> 'true'
+# array_has([], 1) -> 'false' (empty array should return false)
+# array_has(null, 1) -> 'null' (null array should return null)
+query ?T
+SELECT column1, COALESCE(CAST(array_has(column1, column2) AS VARCHAR), 'null')
+from array_has_table_empty;
+----
+[1, 3, 5] true
+[] false
+NULL null
+
 query B
 select array_has(column1, column2)
 from fixed_size_array_has_table_1D;
@@ -5707,14 +6606,13 @@ from fixed_size_array_has_table_1D;
 true
 false
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query BB
-#select array_has_all(column3, column4),
-#       array_has_any(column5, column6)
-#from fixed_size_array_has_table_1D;
-#----
-#true true
-#false false
+query BB
+select array_has_all(column3, column4),
+       array_has_any(column5, column6)
+from fixed_size_array_has_table_1D;
+----
+true true
+false false
 
 query BBB
 select array_has(column1, column2),
@@ -5741,14 +6639,13 @@ from fixed_size_array_has_table_1D_Float;
 true
 false
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query BB
-#select array_has_all(column3, column4),
-#       array_has_any(column5, column6)
-#from fixed_size_array_has_table_1D_Float;
-#----
-#true true
-#false true
+query BB
+select array_has_all(column3, column4),
+       array_has_any(column5, column6)
+from fixed_size_array_has_table_1D_Float;
+----
+true true
+false true
 
 query BBB
 select array_has(column1, column2),
@@ -5775,14 +6672,27 @@ from fixed_size_array_has_table_1D_Boolean;
 false
 true
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query BB
-#select array_has_all(column3, column4),
-#       array_has_any(column5, column6)
-#from fixed_size_array_has_table_1D_Boolean;
-#----
-#true true
-#true true
+query BB
+select array_has_all(column3, column4),
+       array_has_any(column5, column6)
+from fixed_size_array_has_table_1D_Boolean;
+----
+true true
+true true
+
+query BBBBBBBB
+select array_has_all(column3, arrow_cast(column4,'LargeList(Boolean)')),
+       array_has_any(column5, arrow_cast(column6,'LargeList(Boolean)')),
+       array_has_all(column3, arrow_cast(column4,'List(Boolean)')),
+       array_has_any(column5, arrow_cast(column6,'List(Boolean)')),
+       array_has_all(arrow_cast(column3, 'LargeList(Boolean)'), column4),
+       array_has_any(arrow_cast(column5, 'LargeList(Boolean)'), column6),
+       array_has_all(arrow_cast(column3, 'List(Boolean)'), column4),
+       array_has_any(arrow_cast(column5, 'List(Boolean)'), column6)
+from fixed_size_array_has_table_1D_Boolean;
+----
+true true true true true true true true
+true true true true true true true true
 
 query BBB
 select array_has(column1, column2),
@@ -5832,13 +6742,12 @@ from fixed_size_array_has_table_2D;
 false
 false
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query B
-#select array_has_all(arrow_cast(column3, 'LargeList(List(Int64))'), arrow_cast(column4, 'LargeList(List(Int64))'))
-#from fixed_size_array_has_table_2D;
-#----
-#true
-#false
+query B
+select array_has_all(arrow_cast(column3, 'LargeList(List(Int64))'), arrow_cast(column4, 'LargeList(List(Int64))'))
+from fixed_size_array_has_table_2D;
+----
+true
+false
 
 query B
 select array_has_all(column1, column2)
@@ -5854,13 +6763,12 @@ from array_has_table_2D_float;
 true
 false
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query B
-#select array_has_all(column1, column2)
-#from fixed_size_array_has_table_2D_float;
-#----
-#false
-#false
+query B
+select array_has_all(column1, column2)
+from fixed_size_array_has_table_2D_float;
+----
+false
+false
 
 query B
 select array_has(column1, column2) from array_has_table_3D;
@@ -5925,6 +6833,13 @@ NULL NULL false false
 false false NULL false
 false false false NULL
 
+# Row 1: [[NULL,2],[3,NULL]], [1.1,2.2,3.3], ['L','o','r','e','m']
+# Row 2: [[3,4],[5,6]], [NULL,5.5,6.6], ['i','p',NULL,'u','m']
+# Row 3: [[5,6],[7,8]], [7.7,8.8,9.9], ['d',NULL,'l','o','r']
+# Row 4: [[7,NULL],[9,10]], [10.1,NULL,12.2], ['s','i','t','a','b']
+# Row 5: NULL, [13.3,14.4,15.5], ['a','m','e','t','x']
+# Row 6: [[11,12],[13,14]], NULL, [',','a','b','c','d']
+# Row 7: [[15,16],[NULL,18]], [16.6,17.7,18.8], NULL
 query BBBB
 select array_has(column1, make_array(5, 6)),
        array_has(column1, make_array(7, NULL)),
@@ -5936,9 +6851,9 @@ false false false true
 true false true false
 true false false true
 false true false false
-false false false false
-false false false false
-false false false false
+NULL NULL false false
+false false NULL false
+false false false NULL
 
 query BBBB
 select array_has_all(make_array(1,2,3), []),
@@ -5985,25 +6900,24 @@ select array_has_all(arrow_cast(make_array(1,2,3), 'LargeList(Int64)'), arrow_ca
 ----
 true false true false false false true true false false true false true
 
-#TODO: array_has_all and array_has_any cannot handle FixedSizeList
-#query BBBBBBBBBBBBB
-#select array_has_all(arrow_cast(make_array(1,2,3), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(1, 3), 'FixedSizeList(2, Int64)')),
-#       array_has_all(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(1, 4), 'FixedSizeList(2, Int64)')),
-#       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,2]), 'FixedSizeList(1, List(Int64))')),
-#       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,3]), 'FixedSizeList(1, List(Int64))')),
-#       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,2], [3,4], [5,6]), 'FixedSizeList(3, List(Int64))')),
-#       array_has_all(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1]]), 'FixedSizeList(1, List(List(Int64)))')),
-#       array_has_all(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))')),
-#       array_has_any(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(1,10,100), 'FixedSizeList(3, Int64)')),
-#       array_has_any(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(10, 100),'FixedSizeList(2, Int64)')),
-#       array_has_any(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,10], [10,4]), 'FixedSizeList(2, List(Int64))')),
-#       array_has_any(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([10,20], [3,4]), 'FixedSizeList(2, List(Int64))')),
-#       array_has_any(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3], [4,5,6]]), 'FixedSizeList(1, List(List(Int64)))')),
-#       array_has_any(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3]], [[4,5,6]]), 'FixedSizeList(2, List(List(Int64)))'))
-#;
-#----
-#true false true false false false true true false false true false true
-
+query BBBBBBBBBBBBB
+select array_has_all(arrow_cast(make_array(1,2,3), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(1, 3), 'FixedSizeList(2, Int64)')),
+       array_has_all(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(1, 4), 'FixedSizeList(2, Int64)')),
+       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,2]), 'FixedSizeList(1, List(Int64))')),
+       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,3]), 'FixedSizeList(1, List(Int64))')),
+       array_has_all(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,2], [3,4], [5,6]), 'FixedSizeList(3, List(Int64))')),
+       array_has_all(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1]]), 'FixedSizeList(1, List(List(Int64)))')),
+       array_has_all(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))')),
+       array_has_any(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(1,10,100), 'FixedSizeList(3, Int64)')),
+       array_has_any(arrow_cast(make_array(1,2,3),'FixedSizeList(3, Int64)'), arrow_cast(make_array(10, 100),'FixedSizeList(2, Int64)')),
+       array_has_any(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([1,10], [10,4]), 'FixedSizeList(2, List(Int64))')),
+       array_has_any(arrow_cast(make_array([1,2], [3,4]), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array([10,20], [3,4]), 'FixedSizeList(2, List(Int64))')),
+       array_has_any(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3], [4,5,6]]), 'FixedSizeList(1, List(List(Int64)))')),
+       array_has_any(arrow_cast(make_array([[1,2,3]]), 'FixedSizeList(1, List(List(Int64)))'), arrow_cast(make_array([[1,2,3]], [[4,5,6]]), 'FixedSizeList(2, List(List(Int64)))'))
+;
+----
+true false true false false false true true false false true false true
+
 # rewrite various array_has operations to InList where the haystack is a literal list
 # NB that `col in (a, b, c)` is simplified to OR if there are <= 3 elements, so we make 4-element haystack lists
 
@@ -6023,18 +6937,16 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
 query I
 with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
@@ -6052,18 +6964,16 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
 query I
 with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
@@ -6081,27 +6991,23 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
-# FIXME: due to rewrite below not working, this is _extremely_ slow to evaluate
-# query I
-# with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
-# select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
-# ----
-# 1
+query I
+with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
+select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
+----
+1
 
-# FIXME: array_has with large list haystack not currently rewritten to InList
 query TT
 explain with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
 select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
@@ -6112,18 +7018,16 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)))
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32))
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
 query I
 with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
@@ -6141,18 +7045,16 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
 query I
 with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
@@ -6172,18 +7074,16 @@ logical_plan
 03)----SubqueryAlias: test
 04)------SubqueryAlias: t
 05)--------Projection:
-06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL)
-07)------------TableScan: tmp_table projection=[value]
+06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL)
+07)------------TableScan: generate_series() projection=[value]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-05)--------ProjectionExec: expr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
+05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IS NOT NULL OR NULL, projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
 
 # any operator
 query ?
@@ -6270,7 +7170,7 @@ from array_distinct_table_2D;
 ----
 [[1, 2], [3, 4], [5, 6]]
 [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
-[NULL, [5, 6]]
+[[5, 6], NULL]
 
 query ?
 select array_distinct(column1)
@@ -6302,7 +7202,273 @@ from array_distinct_table_2D_fixed;
 ----
 [[1, 2], [3, 4], [5, 6]]
 [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
-[NULL, [5, 6]]
+[[5, 6], NULL]
+
+## arrays_zip (aliases: `list_zip`)
+
+# Spark example: arrays_zip(array(1, 2, 3), array(2, 3, 4))
+query ?
+select arrays_zip([1, 2, 3], [2, 3, 4]);
+----
+[{1: 1, 2: 2}, {1: 2, 2: 3}, {1: 3, 2: 4}]
+
+# Spark example: arrays_zip(array(1, 2), array(2, 3), array(3, 4))
+query ?
+select arrays_zip([1, 2], [2, 3], [3, 4]);
+----
+[{1: 1, 2: 2, 3: 3}, {1: 2, 2: 3, 3: 4}]
+
+# basic: two integer arrays of equal length
+query ?
+select arrays_zip([1, 2, 3], [10, 20, 30]);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: 3, 2: 30}]
+
+# basic: two arrays with different element types (int + string)
+query ?
+select arrays_zip([1, 2, 3], ['a', 'b', 'c']);
+----
+[{1: 1, 2: a}, {1: 2, 2: b}, {1: 3, 2: c}]
+
+# three arrays of equal length
+query ?
+select arrays_zip([1, 2, 3], [10, 20, 30], [100, 200, 300]);
+----
+[{1: 1, 2: 10, 3: 100}, {1: 2, 2: 20, 3: 200}, {1: 3, 2: 30, 3: 300}]
+
+# four arrays of equal length
+query ?
+select arrays_zip([1], [2], [3], [4]);
+----
+[{1: 1, 2: 2, 3: 3, 4: 4}]
+
+# mixed element types: float + boolean
+query ?
+select arrays_zip([1.5, 2.5], [true, false]);
+----
+[{1: 1.5, 2: true}, {1: 2.5, 2: false}]
+
+# different length arrays: shorter array padded with NULLs
+query ?
+select arrays_zip([1, 2], [3, 4, 5]);
+----
+[{1: 1, 2: 3}, {1: 2, 2: 4}, {1: NULL, 2: 5}]
+
+# different length arrays: first longer
+query ?
+select arrays_zip([1, 2, 3], [10]);
+----
+[{1: 1, 2: 10}, {1: 2, 2: NULL}, {1: 3, 2: NULL}]
+
+# different length: one single element, other three elements
+query ?
+select arrays_zip([1], ['a', 'b', 'c']);
+----
+[{1: 1, 2: a}, {1: NULL, 2: b}, {1: NULL, 2: c}]
+
+# empty arrays
+query ?
+select arrays_zip([], []);
+----
+[]
+
+# one empty, one non-empty
+query ?
+select arrays_zip([], [1, 2, 3]);
+----
+[{1: NULL, 2: 1}, {1: NULL, 2: 2}, {1: NULL, 2: 3}]
+
+# NULL elements inside arrays
+query ?
+select arrays_zip([1, NULL, 3], ['a', 'b', 'c']);
+----
+[{1: 1, 2: a}, {1: NULL, 2: b}, {1: 3, 2: c}]
+
+# all NULL elements
+query ?
+select arrays_zip([NULL::int, NULL, NULL], [NULL::text, NULL, NULL]);
+----
+[{1: NULL, 2: NULL}, {1: NULL, 2: NULL}, {1: NULL, 2: NULL}]
+
+# both args are NULL (entire list null)
+query ?
+select arrays_zip(NULL::int[], NULL::int[]);
+----
+NULL
+
+# one arg is NULL list, other is real array
+query ?
+select arrays_zip(NULL::int[], [1, 2, 3]);
+----
+[{1: NULL, 2: 1}, {1: NULL, 2: 2}, {1: NULL, 2: 3}]
+
+# real array + NULL list
+query ?
+select arrays_zip([1, 2], NULL::text[]);
+----
+[{1: 1, 2: NULL}, {1: 2, 2: NULL}]
+
+# column-level test with multiple rows
+query ?
+select arrays_zip(a, b) from (values ([1, 2], [10, 20]), ([3, 4, 5], [30]), ([6], [60, 70])) as t(a, b);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}]
+[{1: 3, 2: 30}, {1: 4, 2: NULL}, {1: 5, 2: NULL}]
+[{1: 6, 2: 60}, {1: NULL, 2: 70}]
+
+# column-level test with NULL rows
+query ?
+select arrays_zip(a, b) from (values ([1, 2], [10, 20]), (null, [30, 40]), ([5, 6], null)) as t(a, b);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}]
+[{1: NULL, 2: 30}, {1: NULL, 2: 40}]
+[{1: 5, 2: NULL}, {1: 6, 2: NULL}]
+
+# column-level test with single argument
+query ?
+select arrays_zip(a) from (values ([1, 2], [10, 20]), (null, [30, 40]), ([5, 6], null)) as t(a, b);
+----
+[{1: 1}, {1: 2}]
+NULL
+[{1: 5}, {1: 6}]
+
+query ?
+select arrays_zip(b) from (values ([1, 2], [10, 20]), (null, [30, 40]), ([5, 6], null)) as t(a, b);
+----
+[{1: 10}, {1: 20}]
+[{1: 30}, {1: 40}]
+NULL
+
+# No input
+query error Error during planning: 'arrays_zip' does not support zero arguments
+select arrays_zip();
+
+# Non-array input
+query error DataFusion error: Execution error: arrays_zip expects array arguments, got Int64
+select arrays_zip(1, 2);
+
+# null input
+query ?
+select arrays_zip(null)
+----
+NULL
+
+# single empty array
+query ?
+select arrays_zip([])
+----
+[]
+
+
+# single array of null
+query ?
+select arrays_zip([null])
+----
+[{1: NULL}]
+
+query ?
+select arrays_zip([NULL::int])
+----
+[{1: NULL}]
+
+query ?
+select arrays_zip([NULL::int[]])
+----
+[{1: NULL}]
+
+# alias: list_zip
+query ?
+select list_zip([1, 2], [3, 4]);
+----
+[{1: 1, 2: 3}, {1: 2, 2: 4}]
+
+# column test: total values equal (3 each) but per-row lengths differ
+# a: [1]     b: [10, 20]   → row 0: a has 1, b has 2
+# a: [2, 3]  b: [30]       → row 1: a has 2, b has 1
+# total a values = 3, total b values = 3 (same!) but rows are misaligned
+query ?
+select arrays_zip(a, b) from (values ([1], [10, 20]), ([2, 3], [30])) as t(a, b);
+----
+[{1: 1, 2: 10}, {1: NULL, 2: 20}]
+[{1: 2, 2: 30}, {1: 3, 2: NULL}]
+
+# single element arrays
+query ?
+select arrays_zip([42], ['hello']);
+----
+[{1: 42, 2: hello}]
+
+# single argument
+query ?
+select arrays_zip([1, 2, 3]);
+----
+[{1: 1}, {1: 2}, {1: 3}]
+
+# arrays_zip with LargeList inputs
+query ?
+select arrays_zip(
+    arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'),
+    arrow_cast(make_array(10, 20, 30), 'LargeList(Int64)')
+);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: 3, 2: 30}]
+
+# arrays_zip with LargeList different lengths (padding)
+query ?
+select arrays_zip(
+    arrow_cast(make_array(1, 2), 'LargeList(Int64)'),
+    arrow_cast(make_array(10, 20, 30), 'LargeList(Int64)')
+);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: NULL, 2: 30}]
+
+# single argument from LargeList
+query ?
+select arrays_zip(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'));
+----
+[{1: 1}, {1: 2}, {1: 3}]
+
+# arrays_zip with FixedSizeList inputs
+query ?
+select arrays_zip(
+    arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'),
+    arrow_cast(make_array(10, 20, 30), 'FixedSizeList(3, Int64)')
+);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: 3, 2: 30}]
+
+# single argument from FixedSizeList
+query ?
+select arrays_zip(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'));
+----
+[{1: 1}, {1: 2}, {1: 3}]
+
+# arrays_zip mixing List and LargeList
+query ?
+select arrays_zip(
+    [1, 2, 3],
+    arrow_cast(make_array(10, 20, 30), 'LargeList(Int64)')
+);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: 3, 2: 30}]
+
+# arrays_zip mixing List and FixedSizeList with different lengths (padding)
+query ?
+select arrays_zip(
+    [1, 2, 3],
+    arrow_cast(make_array(10, 20), 'FixedSizeList(2, Int64)')
+);
+----
+[{1: 1, 2: 10}, {1: 2, 2: 20}, {1: 3, 2: NULL}]
+
+# arrays_zip with LargeList and FixedSizeList mixed types
+query ?
+select arrays_zip(
+    arrow_cast(make_array(1, 2), 'LargeList(Int64)'),
+    arrow_cast(make_array('a', 'b'), 'FixedSizeList(2, Utf8)')
+);
+----
+[{1: 1, 2: a}, {1: 2, 2: b}]
 
 query ???
 select array_intersect(column1, column2),
@@ -6337,7 +7503,7 @@ select array_intersect(column1, column2),
        array_intersect(column5, column6)
 from array_intersect_table_1D_Boolean;
 ----
-[] [false, true] [false]
+[] [true, false] [false]
 [false] [true] [true]
 
 query ???
@@ -6346,7 +7512,7 @@ select array_intersect(column1, column2),
        array_intersect(column5, column6)
 from large_array_intersect_table_1D_Boolean;
 ----
-[] [false, true] [false]
+[] [true, false] [false]
 [false] [true] [true]
 
 query ???
@@ -6355,8 +7521,8 @@ select array_intersect(column1, column2),
        array_intersect(column5, column6)
 from array_intersect_table_1D_UTF8;
 ----
-[bc] [arrow, rust] []
-[] [arrow, datafusion, rust] [arrow, rust]
+[bc] [rust, arrow] []
+[] [datafusion, rust, arrow] [rust, arrow]
 
 query ???
 select array_intersect(column1, column2),
@@ -6364,8 +7530,19 @@ select array_intersect(column1, column2),
        array_intersect(column5, column6)
 from large_array_intersect_table_1D_UTF8;
 ----
-[bc] [arrow, rust] []
-[] [arrow, datafusion, rust] [arrow, rust]
+[bc] [rust, arrow] []
+[] [datafusion, rust, arrow] [rust, arrow]
+
+query ?
+select array_intersect(column1, column2)
+from array_intersect_table_1D_NULL;
+----
+[2, 3]
+[3]
+[3]
+NULL
+NULL
+NULL
 
 query ??
 select array_intersect(column1, column2),
@@ -6434,6 +7611,17 @@ SELECT  array_intersect(arrow_cast(make_array(1,2,3), 'LargeList(Int64)'), arrow
 ----
 [2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]]
 
+query ??????
+SELECT  array_intersect(arrow_cast(make_array(1,2,3), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(2,3,4), 'FixedSizeList(3, Int64)')),
+        array_intersect(arrow_cast(make_array(1,3,5), 'FixedSizeList(3, Int64)'), arrow_cast(make_array(2,4,6), 'FixedSizeList(3, Int64)')),
+        array_intersect(arrow_cast(make_array('aa','bb','cc'), 'FixedSizeList(3, Utf8)'), arrow_cast(make_array('cc','aa','dd'), 'FixedSizeList(3, Utf8)')),
+        array_intersect(arrow_cast(make_array(true, false), 'FixedSizeList(2, Boolean)'), arrow_cast(make_array(true), 'FixedSizeList(1, Boolean)')),
+        array_intersect(arrow_cast(make_array(1.1, 2.2, 3.3), 'FixedSizeList(3, Float64)'), arrow_cast(make_array(2.2, 3.3, 4.4), 'FixedSizeList(3, Float64)')),
+        array_intersect(arrow_cast(make_array([1, 1], [2, 2], [3, 3]), 'FixedSizeList(3, List(Int64))'), arrow_cast(make_array([2, 2], [3, 3], [4, 4]), 'FixedSizeList(3, List(Int64))'))
+;
+----
+[2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]]
+
 query ?
 select array_intersect([], []);
 ----
@@ -6447,27 +7635,27 @@ select array_intersect(arrow_cast([], 'LargeList(Int64)'), arrow_cast([], 'Large
 query ?
 select array_intersect([1, 1, 2, 2, 3, 3], null);
 ----
-[]
+NULL
 
 query ?
 select array_intersect(arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'), null);
 ----
-[]
+NULL
 
 query ?
 select array_intersect(null, [1, 1, 2, 2, 3, 3]);
 ----
-[]
+NULL
 
 query ?
 select array_intersect(null, arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'));
 ----
-[]
+NULL
 
 query ?
 select array_intersect([], null);
 ----
-[]
+NULL
 
 query ?
 select array_intersect([[1,2,3]], [[]]);
@@ -6482,17 +7670,17 @@ select array_intersect([[null]], [[]]);
 query ?
 select array_intersect(arrow_cast([], 'LargeList(Int64)'), null);
 ----
-[]
+NULL
 
 query ?
 select array_intersect(null, []);
 ----
-[]
+NULL
 
 query ?
 select array_intersect(null, arrow_cast([], 'LargeList(Int64)'));
 ----
-[]
+NULL
 
 query ?
 select array_intersect(null, null);
@@ -6564,6 +7752,23 @@ select range(5),
 ----
 [0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] []
 
+# Ensure can coerce from other valid types
+query ???????????
+select range(5),
+       range(2, 5),
+       range(2, 10, 3),
+       range(10, 2, -3),
+       range(arrow_cast(1, 'Int8'), 5, -1),
+       range(arrow_cast(1, 'Int16'), arrow_cast(-5, 'Int8'), 1),
+       range(arrow_cast(1, 'Int32'), arrow_cast(-5, 'Int16'), arrow_cast(-1, 'Int8')),
+       range(DATE '1992-09-01', DATE '1993-03-01', arrow_cast('1 MONTH', 'Interval(YearMonth)')),
+       range(DATE '1993-02-01', arrow_cast(DATE '1993-01-01', 'Date64'), INTERVAL '-1' DAY),
+       range(arrow_cast(DATE '1989-04-01', 'Date64'), DATE '1993-03-01', INTERVAL '1' YEAR),
+       range(arrow_cast(DATE '1993-03-01', 'Date64'), arrow_cast(DATE '1989-04-01', 'Date64'), INTERVAL '1' YEAR)
+;
+----
+[0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] []
+
 # Test range with zero step
 query error DataFusion error: Execution error: step can't be 0 for function range\(start \[, stop, step\]\)
 select range(1, 1, 0);
@@ -6729,6 +7934,17 @@ select generate_series('2021-01-01'::timestamp, '2021-01-01T15:00:00'::timestamp
 ----
 [2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00]
 
+# Other timestamp types are coerced to nanosecond
+query ?
+select generate_series(arrow_cast('2021-01-01'::timestamp, 'Timestamp(s)'), '2021-01-01T15:00:00'::timestamp, INTERVAL '1' HOUR);
+----
+[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00]
+
+query ?
+select generate_series('2021-01-01'::timestamp, arrow_cast('2021-01-01T15:00:00'::timestamp, 'Timestamp(µs)'), INTERVAL '1' HOUR);
+----
+[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00]
+
 query ?
 select generate_series('2021-01-01T00:00:00EST'::timestamp, '2021-01-01T15:00:00-12:00'::timestamp, INTERVAL '1' HOUR);
 ----
@@ -6746,9 +7962,18 @@ select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond,
 [2021-01-01T00:00:00-05:00, 2021-01-01T01:29:54.500-05:00, 2021-01-01T02:59:49-05:00, 2021-01-01T04:29:43.500-05:00, 2021-01-01T05:59:38-05:00]
 
 ## mixing types for timestamps is not supported
-query error DataFusion error: Internal error: Unexpected argument type for GENERATE_SERIES : Date32
+query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature
 select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, Some("-05:00"))'), DATE '2021-01-02', INTERVAL '1' HOUR);
 
+## mixing types not allowed even if an argument is null
+query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature
+select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL);
+
+query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature
+select generate_series(1, '2024-01-01', '2025-01-02');
+
+query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature
+select generate_series('2024-01-01'::timestamp, '2025-01-02', interval '1 day');
 
 ## should return NULL
 query ?
@@ -6767,11 +7992,6 @@ select generate_series(DATE '1992-09-01', DATE '1993-03-01', NULL);
 ----
 NULL
 
-query ?
-select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL);
-----
-NULL
-
 query ?
 select generate_series(NULL, DATE '1993-03-01', INTERVAL '1' YEAR);
 ----
@@ -6854,7 +8074,7 @@ query error DataFusion error: Execution error: step can't be 0 for function gene
 select generate_series(1, 1, 0);
 
 # Test generate_series with zero step
-query error DataFusion error: Execution error: Interval argument to GENERATE_SERIES must not be 0
+query error DataFusion error: Execution error: Interval argument to generate_series must not be 0
 select generate_series(TIMESTAMP '2000-01-02', TIMESTAMP '2000-01-01', INTERVAL '0' MINUTE);
 
 # Test generate_series with big steps
@@ -7003,7 +8223,7 @@ select array_except(column1, column2) from array_except_table;
 [2]
 []
 NULL
-[1, 2]
+NULL
 NULL
 
 statement ok
@@ -7024,7 +8244,7 @@ select array_except(column1, column2) from array_except_nested_list_table;
 ----
 [[1, 2]]
 [[3]]
-[[1, 2], [3]]
+NULL
 NULL
 []
 
@@ -7063,7 +8283,7 @@ select array_except(column1, column2) from array_except_table_ut8;
 ----
 [b, c]
 [a, bc]
-[a, bc, def]
+NULL
 NULL
 
 statement ok
@@ -7085,7 +8305,7 @@ select array_except(column1, column2) from array_except_table_bool;
 [true]
 [true]
 [false]
-[true, false]
+NULL
 NULL
 
 statement ok
@@ -7094,7 +8314,7 @@ drop table array_except_table_bool;
 query ?
 select array_except([], null);
 ----
-[]
+NULL
 
 query ?
 select array_except([], []);
@@ -7111,6 +8331,16 @@ select array_except(null, null)
 ----
 NULL
 
+query ?
+select array_except(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, 6, 3, 4], 'LargeList(Int64)'));
+----
+[1, 2]
+
+query ?
+select array_except(arrow_cast([1, 2, 3, 4], 'FixedSizeList(4, Int64)'), arrow_cast([5, 6, 3, 4], 'FixedSizeList(4, Int64)'));
+----
+[1, 2]
+
 ### Array operators tests
 
 
@@ -7193,7 +8423,7 @@ explain select [1,2,3] @> [1,3];
 ----
 logical_plan
 01)Projection: Boolean(true) AS array_has_all(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(1),Int64(3)))
-02)--EmptyRelation
+02)--EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[true as array_has_all(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(1),Int64(3)))]
 02)--PlaceholderRowExec
@@ -7216,7 +8446,7 @@ explain select [1,3] <@ [1,2,3];
 ----
 logical_plan
 01)Projection: Boolean(true) AS array_has_all(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(1),Int64(3)))
-02)--EmptyRelation
+02)--EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[true as array_has_all(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(1),Int64(3)))]
 02)--PlaceholderRowExec
@@ -7258,8 +8488,8 @@ CREATE EXTERNAL TABLE fixed_size_list_array STORED AS PARQUET LOCATION '../core/
 query T
 select arrow_typeof(f0) from fixed_size_list_array;
 ----
-FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2)
-FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2)
+FixedSizeList(2 x Int64)
+FixedSizeList(2 x Int64)
 
 query ?
 select * from fixed_size_list_array;
@@ -7288,8 +8518,8 @@ select make_array(arrow_cast(f0, 'List(Int64)')) from fixed_size_list_array
 query T
 select arrow_typeof(make_array(arrow_cast(f0, 'List(Int64)'))) from fixed_size_list_array
 ----
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(List(Int64))
+List(List(Int64))
 
 query ?
 select make_array(f0) from fixed_size_list_array
@@ -7300,8 +8530,8 @@ select make_array(f0) from fixed_size_list_array
 query T
 select arrow_typeof(make_array(f0)) from fixed_size_list_array
 ----
-List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
-List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(FixedSizeList(2 x Int64))
+List(FixedSizeList(2 x Int64))
 
 query ?
 select array_concat(column1, [7]) from arrays_values_v2;
@@ -7330,7 +8560,7 @@ select flatten(make_array(1, 2, 1, 3, 2)),
 
 query ???
 select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'LargeList(Int64)')),
-       flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'LargeList(LargeList(Int64))')),
+       flatten(arrow_cast(make_array([1], null, [2, 3], [null], make_array(4, null, 5)), 'LargeList(LargeList(Int64))')),
        flatten(arrow_cast(make_array([[1.1]], [[2.2]], [[3.3], [4.4]]), 'LargeList(LargeList(LargeList(Float64)))'));
 ----
 [1, 2, 1, 3, 2] [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]]
@@ -7342,6 +8572,14 @@ select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'FixedSizeList(5, Int64)'))
 ----
 [1, 2, 1, 3, 2] [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]]
 
+query ??TT
+select flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, LargeList(Int64))')),
+       flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'List(LargeList(FixedSizeList(1, Float64)))')),
+       arrow_typeof(flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, LargeList(Int64))'))),
+       arrow_typeof(flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'List(LargeList(FixedSizeList(1, Float64)))')));
+----
+[1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]] LargeList(Int64) LargeList(FixedSizeList(1 x Float64))
+
 # flatten with column values
 query ????
 select flatten(column1),
@@ -7674,6 +8912,11 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 1);
 ----
 [1]
 
+query ?
+select array_resize(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 1);
+----
+[1]
+
 # array_resize scalar function #2
 query ?
 select array_resize(make_array(1, 2, 3), 5);
@@ -7685,6 +8928,11 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 5);
 ----
 [1, 2, 3, NULL, NULL]
 
+query ?
+select array_resize(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 5);
+----
+[1, 2, 3, NULL, NULL]
+
 # array_resize scalar function #3
 query ?
 select array_resize(make_array(1, 2, 3), 5, 4);
@@ -7803,11 +9051,13 @@ select array_reverse(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array
 ----
 [3, 2, 1] [1]
 
-#TODO: support after FixedSizeList type coercion
-#query ??
-#select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_reverse(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)'));
-#----
-#[3, 2, 1] [1]
+query ????
+select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')),
+  array_reverse(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')),
+  array_reverse(arrow_cast(make_array(1, NULL, 3), 'FixedSizeList(3, Int64)')),
+  array_reverse(arrow_cast(make_array(NULL, NULL, NULL), 'FixedSizeList(3, Int64)'));
+----
+[3, 2, 1] [1] [3, NULL, 1] [NULL, NULL, NULL]
 
 query ??
 select array_reverse(NULL), array_reverse([]);
@@ -7826,6 +9076,23 @@ NULL NULL
 [60, 59, 58, 57, 56, 55, 54, NULL, 52, 51] [51, 52, NULL, 54, 55, 56, 57, 58, 59, 60]
 [70, 69, 68, 67, 66, 65, 64, 63, 62, 61] [61, 62, 63, 64, 65, 66, 67, 68, 69, 70]
 
+statement ok
+CREATE TABLE test_reverse_fixed_size AS VALUES
+  (arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)')),
+  (arrow_cast([4, 5, 6], 'FixedSizeList(3, Int64)')),
+  (arrow_cast([NULL, 8, 9], 'FixedSizeList(3, Int64)')),
+  (NULL);
+
+query ?
+SELECT array_reverse(column1) FROM test_reverse_fixed_size;
+----
+[3, 2, 1]
+[6, 5, 4]
+[9, 8, NULL]
+NULL
+
+statement ok
+DROP TABLE test_reverse_fixed_size;
 
 # Test defining a table with array columns
 statement ok
@@ -7851,19 +9118,19 @@ select * from test_create_array_table;
 query T
 select arrow_typeof(a) from test_create_array_table;
 ----
-List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(Int32)
 
 query T
 select arrow_typeof(c) from test_create_array_table;
 ----
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(List(Int32))
 
 # Test casting to array types
 # issue: https://github.com/apache/datafusion/issues/9440
 query ??T
 select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]);
 ----
-[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+[1, 2, 3] [[1]] List(Utf8View)
 
 # test empty arrays return length
 # issue: https://github.com/apache/datafusion/pull/12459
@@ -7883,12 +9150,49 @@ create table fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5,6]);
 query T
 select arrow_typeof(a) from fixed_size_col_table;
 ----
-FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3)
-FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3)
+FixedSizeList(3 x Int32)
+FixedSizeList(3 x Int32)
 
-statement error
+query ? rowsort
+SELECT DISTINCT a FROM fixed_size_col_table
+----
+[1, 2, 3]
+[4, 5, 6]
+
+query ?I rowsort
+SELECT a, count(*) FROM fixed_size_col_table GROUP BY a
+----
+[1, 2, 3] 1
+[4, 5, 6] 1
+
+statement error Cast error: Cannot cast to FixedSizeList\(3\): value at index 0 has length 2
 create table varying_fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5]);
 
+# https://github.com/apache/datafusion/issues/16187
+# should be NULL in case of out of bounds for Null Type
+query ?
+select [named_struct('a', 1, 'b', null)][-2];
+----
+NULL
+
+statement ok
+COPY (select [[true, false], [false, true]] a, [false, true] b union select [[null, null]], null) to 'test_files/scratch/array/array_has/single_file.parquet' stored as parquet;
+
+statement ok
+CREATE EXTERNAL TABLE array_has STORED AS PARQUET location 'test_files/scratch/array/array_has/single_file.parquet';
+
+query B
+select array_contains(a, b) from array_has order by 1 nulls last;
+----
+true
+NULL
+
+# Expected output (once supported):
+# ----
+# [5, 4, 3, 2, 1]
+query error
+select array_reverse(arrow_cast(make_array(1, 2, 3, 4, 5), 'ListView(Int64)'));
+
 ### Delete tables
 
 statement ok
@@ -8067,3 +9371,6 @@ drop table values_all_empty;
 
 statement ok
 drop table fixed_size_col_table;
+
+statement ok
+drop table array_has;
diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt
index 30f322cf98fcd..c3bc967bafb9e 100644
--- a/datafusion/sqllogictest/test_files/arrow_files.slt
+++ b/datafusion/sqllogictest/test_files/arrow_files.slt
@@ -19,12 +19,17 @@
 ## Arrow Files Format support
 #############
 
+# We using fixed arrow file to test for sqllogictests, and this arrow field is writing with arrow-ipc utf8,
+# so when we decode to read it's also loading utf8.
+# Currently, so we disable the map_string_types_to_utf8view
+statement ok
+set datafusion.sql_parser.map_string_types_to_utf8view = false;
 
 statement ok
 
 CREATE EXTERNAL TABLE arrow_simple
 STORED AS ARROW
-LOCATION '../core/tests/data/example.arrow';
+LOCATION '../datasource-arrow/tests/data/example.arrow';
 
 
 # physical plan
@@ -32,7 +37,7 @@ query TT
 EXPLAIN SELECT * FROM arrow_simple
 ----
 logical_plan TableScan: arrow_simple projection=[f0, f1, f2]
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow]]}, projection=[f0, f1, f2], file_type=arrow
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow]]}, projection=[f0, f1, f2], file_type=arrow
 
 # correct content
 query ITB
@@ -45,8 +50,8 @@ SELECT * FROM arrow_simple
 
 # Ensure that local files can not be read by default (a potential security issue)
 # (url table is only supported when DynamicFileCatalog is enabled)
-statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/example.arrow' not found
-SELECT * FROM '../core/tests/data/example.arrow';
+statement error DataFusion error: Error during planning: table 'datafusion.public.../datasource-arrow/tests/data/example.arrow' not found
+SELECT * FROM '../datasource-arrow/tests/data/example.arrow';
 
 # ARROW partitioned table
 statement ok
@@ -123,3 +128,263 @@ physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/
 # Errors in partition filters should be reported
 query error Divide by zero error
 SELECT f0 FROM arrow_partitioned WHERE CASE WHEN true THEN 1 / 0 ELSE part END = 1;
+
+#############
+## Arrow IPC stream format support
+#############
+
+# Test CREATE EXTERNAL TABLE with stream format
+statement ok
+CREATE EXTERNAL TABLE arrow_stream
+STORED AS ARROW
+LOCATION '../datasource-arrow/tests/data/example_stream.arrow';
+
+# physical plan for stream format
+query TT
+EXPLAIN SELECT * FROM arrow_stream
+----
+logical_plan TableScan: arrow_stream projection=[f0, f1, f2]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example_stream.arrow]]}, projection=[f0, f1, f2], file_type=arrow_stream
+
+# stream format should return same data as file format
+query ITB
+SELECT * FROM arrow_stream
+----
+1 foo true
+2 bar NULL
+3 baz false
+4 NULL true
+
+# Verify both file and stream formats return identical results
+query ITB
+SELECT * FROM arrow_simple ORDER BY f0
+----
+1 foo true
+2 bar NULL
+3 baz false
+4 NULL true
+
+query ITB
+SELECT * FROM arrow_stream ORDER BY f0
+----
+1 foo true
+2 bar NULL
+3 baz false
+4 NULL true
+
+# Both formats should support projection pushdown
+query IT
+SELECT f0, f1 FROM arrow_simple ORDER BY f0
+----
+1 foo
+2 bar
+3 baz
+4 NULL
+
+query IT
+SELECT f0, f1 FROM arrow_stream ORDER BY f0
+----
+1 foo
+2 bar
+3 baz
+4 NULL
+
+# Both formats should support filtering
+query ITB
+SELECT * FROM arrow_simple WHERE f0 > 2 ORDER BY f0
+----
+3 baz false
+4 NULL true
+
+query ITB
+SELECT * FROM arrow_stream WHERE f0 > 2 ORDER BY f0
+----
+3 baz false
+4 NULL true
+
+# Test aggregations on stream format
+query I
+SELECT COUNT(*) FROM arrow_stream
+----
+4
+
+query I
+SELECT SUM(f0) FROM arrow_stream
+----
+10
+
+query I
+SELECT MAX(f0) FROM arrow_stream
+----
+4
+
+query I
+SELECT MIN(f0) FROM arrow_stream WHERE f0 IS NOT NULL
+----
+1
+
+# Test aggregations on file format for comparison
+query I
+SELECT COUNT(*) FROM arrow_simple
+----
+4
+
+query I
+SELECT SUM(f0) FROM arrow_simple
+----
+10
+
+# Test joins between file and stream formats
+query ITBITB
+SELECT a.f0, a.f1, a.f2, b.f0, b.f1, b.f2
+FROM arrow_simple a
+JOIN arrow_stream b ON a.f0 = b.f0
+WHERE a.f0 <= 2
+ORDER BY a.f0
+----
+1 foo true 1 foo true
+2 bar NULL 2 bar NULL
+
+# Test that both formats work in UNION
+query ITB
+SELECT * FROM arrow_simple WHERE f0 = 1
+UNION ALL
+SELECT * FROM arrow_stream WHERE f0 = 2
+ORDER BY f0
+----
+1 foo true
+2 bar NULL
+
+# Test GROUP BY on stream format
+query BI
+SELECT f2, COUNT(*) as cnt FROM arrow_stream GROUP BY f2 ORDER BY f2
+----
+false 1
+true 2
+NULL 1
+
+# Test DISTINCT on stream format
+query B
+SELECT DISTINCT f2 FROM arrow_stream ORDER BY f2
+----
+false
+true
+NULL
+
+# Test subquery with stream format
+query I
+SELECT f0 FROM arrow_simple WHERE f0 IN (SELECT f0 FROM arrow_stream WHERE f0 < 3) ORDER BY f0
+----
+1
+2
+
+# ARROW partitioned table (stream format)
+statement ok
+CREATE EXTERNAL TABLE arrow_partitioned_stream (
+    part Int,
+    f0 Bigint,
+    f1 String,
+    f2 Boolean
+)
+STORED AS ARROW
+LOCATION '../core/tests/data/partitioned_table_arrow_stream/'
+PARTITIONED BY (part);
+
+# select wildcard
+query ITBI
+SELECT * FROM arrow_partitioned_stream ORDER BY f0;
+----
+1 foo true 123
+2 bar false 123
+3 baz true 456
+4 NULL NULL 456
+
+# select all fields
+query IITB
+SELECT part, f0, f1, f2 FROM arrow_partitioned_stream ORDER BY f0;
+----
+123 1 foo true
+123 2 bar false
+456 3 baz true
+456 4 NULL NULL
+
+# select without partition column
+query IB
+SELECT f0, f2 FROM arrow_partitioned_stream ORDER BY f0
+----
+1 true
+2 false
+3 true
+4 NULL
+
+# select only partition column
+query I
+SELECT part FROM arrow_partitioned_stream ORDER BY part
+----
+123
+123
+456
+456
+
+# select without any table-related columns in projection
+query I
+SELECT 1 FROM arrow_partitioned_stream
+----
+1
+1
+1
+1
+
+# select with partition filter
+query I
+SELECT f0 FROM arrow_partitioned_stream WHERE part = 123 ORDER BY f0
+----
+1
+2
+
+# select with partition filter should scan only one directory
+query TT
+EXPLAIN SELECT f0 FROM arrow_partitioned_stream WHERE part = 456
+----
+logical_plan TableScan: arrow_partitioned_stream projection=[f0], full_filters=[arrow_partitioned_stream.part = Int32(456)]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_table_arrow_stream/part=456/data.arrow]]}, projection=[f0], file_type=arrow_stream
+
+
+# Errors in partition filters should be reported
+query error Divide by zero error
+SELECT f0 FROM arrow_partitioned_stream WHERE CASE WHEN true THEN 1 / 0 ELSE part END = 1;
+
+# Test CREATE EXTERNAL TABLE with empty stream format
+statement ok
+CREATE EXTERNAL TABLE arrow_stream_empty
+STORED AS ARROW
+LOCATION '../datasource-arrow/tests/data/example_stream_empty.arrow'; 
+
+# physical plan for empty stream format
+query TT
+EXPLAIN SELECT * FROM arrow_stream_empty
+----
+logical_plan TableScan: arrow_stream_empty projection=[f0, f1, f2]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example_stream_empty.arrow]]}, projection=[f0, f1, f2], file_type=arrow_stream
+
+# stream format should return same data as file format
+query ITB
+SELECT * FROM arrow_stream_empty
+----
+
+# Test CREATE EXTERNAL TABLE with corrupted stream format
+statement ok
+CREATE EXTERNAL TABLE arrow_stream_corrupted_metadata_length
+STORED AS ARROW
+LOCATION '../datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow'; 
+
+# physical plan for corrupted stream format
+query TT
+EXPLAIN SELECT * FROM arrow_stream_corrupted_metadata_length
+----
+logical_plan TableScan: arrow_stream_corrupted_metadata_length projection=[f0, f1, f2]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example_stream_corrupted_metadata_length.arrow]]}, projection=[f0, f1, f2], file_type=arrow_stream
+
+# querying corrupted stream format should result in error
+query error DataFusion error: Arrow error: Parser error: Unsupported message header type in IPC stream: 'NONE'
+SELECT * FROM arrow_stream_corrupted_metadata_length
diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt
index 654218531f1db..e00909ad5fc59 100644
--- a/datafusion/sqllogictest/test_files/arrow_typeof.slt
+++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt
@@ -61,13 +61,13 @@ Decimal128(38, 10)
 query T
 SELECT arrow_typeof(now()::timestamp)
 ----
-Timestamp(Nanosecond, None)
+Timestamp(ns)
 
 # arrow_typeof_timestamp_utc
 query T
 SELECT arrow_typeof(now())
 ----
-Timestamp(Nanosecond, Some("+00:00"))
+Timestamp(ns)
 
 # arrow_typeof_timestamp_date32(
 query T
@@ -95,10 +95,13 @@ SELECT arrow_cast('1', 'Int16')
 query error
 SELECT arrow_cast('1')
 
-query error DataFusion error: Execution error: arrow_cast requires its second argument to be a non\-empty constant string
+query error DataFusion error: Error during planning: Function 'arrow_cast' requires String, but received Int64 \(DataType: Int64\)
 SELECT arrow_cast('1', 43)
 
-query error Error unrecognized word: unknown
+query error DataFusion error: Execution error: arrow_cast requires its second argument to be a non\-empty constant string
+SELECT arrow_cast('1', arrow_cast('Utf8', 'Utf8'))
+
+query error DataFusion error: Execution error: Unsupported type 'unknown'\. Must be a supported arrow type name such as 'Int32' or 'Timestamp\(ns\)'\. Error unknown token: unknown
 SELECT arrow_cast('1', 'unknown')
 
 # Round Trip tests:
@@ -120,17 +123,17 @@ SELECT
   arrow_typeof(arrow_cast('foo', 'Utf8View')) as col_utf8_view,
   arrow_typeof(arrow_cast('foo', 'Binary')) as col_binary,
   arrow_typeof(arrow_cast('foo', 'LargeBinary')) as col_large_binary,
-  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)')) as col_ts_s,
-  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, None)')) as col_ts_ms,
-  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, None)')) as col_ts_us,
-  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, None)')) as col_ts_ns,
+  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(s)')) as col_ts_s,
+  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(ms)')) as col_ts_ms,
+  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(µs)')) as col_ts_us,
+  arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(ns)')) as col_ts_ns,
   arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, Some("+08:00"))')) as col_tstz_s,
   arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, Some("+08:00"))')) as col_tstz_ms,
   arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, Some("+08:00"))')) as col_tstz_us,
   arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, Some("+08:00"))')) as col_tstz_ns,
   arrow_typeof(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) as col_dict
 ----
-Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Timestamp(Second, Some("+08:00")) Timestamp(Millisecond, Some("+08:00")) Timestamp(Microsecond, Some("+08:00")) Timestamp(Nanosecond, Some("+08:00")) Dictionary(Int32, Utf8)
+Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns) Timestamp(s, "+08:00") Timestamp(ms, "+08:00") Timestamp(µs, "+08:00") Timestamp(ns, "+08:00") Dictionary(Int32, Utf8)
 
 
 
@@ -239,10 +242,10 @@ drop table foo
 
 statement ok
 create table foo as select
-  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)') as col_ts_s,
-  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, None)') as col_ts_ms,
-  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, None)') as col_ts_us,
-  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, None)') as col_ts_ns
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(s)') as col_ts_s,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(ms)') as col_ts_ms,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(µs)') as col_ts_us,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(ns)') as col_ts_ns
 ;
 
 ## Ensure each column in the table has the expected type
@@ -255,7 +258,7 @@ SELECT
   arrow_typeof(col_ts_ns)
   FROM foo;
 ----
-Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None)
+Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns)
 
 
 statement ok
@@ -316,7 +319,7 @@ select arrow_cast(interval '30 minutes', 'Duration(Second)');
 ----
 0 days 0 hours 30 mins 0 secs
 
-query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to Duration\(Second\)
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*This feature is not implemented: Unsupported CAST from Utf8 to Duration\(s\)
 select arrow_cast('30 minutes', 'Duration(Second)');
 
 
@@ -337,7 +340,7 @@ select arrow_cast(timestamp '2000-01-01T00:00:00Z', 'Timestamp(Nanosecond, Some(
 ----
 2000-01-01T00:00:00+08:00
 
-statement error DataFusion error: Arrow error: Parser error: Invalid timezone "\+25:00": failed to parse timezone
+statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Parser error: Invalid timezone "\+25:00": failed to parse timezone
 select arrow_cast(timestamp '2000-01-01T00:00:00', 'Timestamp(Nanosecond, Some( "+25:00" ))');
 
 
@@ -357,12 +360,12 @@ select arrow_cast(make_array(1, 2, 3), 'List(Int64)');
 query T
 select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'List(Int64)'));
 ----
-List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(Int64)
 
 query T
 select arrow_typeof(arrow_cast(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))'), 'List(List(Int64))'));
 ----
-List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(List(Int64))
 
 ## LargeList
 
@@ -380,12 +383,12 @@ select arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)');
 query T
 select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'));
 ----
-LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+LargeList(Int64)
 
 query T
 select arrow_typeof(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))'));
 ----
-LargeList(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+LargeList(LargeList(Int64))
 
 ## FixedSizeList
 
@@ -406,7 +409,7 @@ select arrow_cast([1], 'FixedSizeList(1, Int64)');
 ----
 [1]
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast to FixedSizeList\(4\): value at index 0 has length 3
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast to FixedSizeList\(4\): value at index 0 has length 3
 select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(4, Int64)');
 
 query ?
@@ -417,7 +420,7 @@ select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)');
 query T
 select arrow_typeof(arrow_cast(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 'FixedSizeList(3, Int64)'));
 ----
-FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3)
+FixedSizeList(3 x Int64)
 
 query ?
 select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)');
diff --git a/datafusion/sqllogictest/test_files/async_udf.slt b/datafusion/sqllogictest/test_files/async_udf.slt
new file mode 100644
index 0000000000000..0708b59e519a0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/async_udf.slt
@@ -0,0 +1,101 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+create table data(x int) as values (-10), (2);
+
+# Async udf can be used in aggregation
+query I
+select min(async_abs(x)) from data;
+----
+2
+
+query TT
+explain select min(async_abs(x)) from data;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[min(async_abs(data.x))]]
+02)--TableScan: data projection=[x]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[min(async_abs(data.x))]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[min(async_abs(data.x))]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Async udf can be used in aggregation with group by
+query I rowsort
+select min(async_abs(x)) from data group by async_abs(x);
+----
+10
+2
+
+query TT
+explain select min(async_abs(x)) from data group by async_abs(x);
+----
+logical_plan
+01)Projection: min(async_abs(data.x))
+02)--Aggregate: groupBy=[[__common_expr_1 AS async_abs(data.x)]], aggr=[[min(__common_expr_1 AS async_abs(data.x))]]
+03)----Projection: async_abs(data.x) AS __common_expr_1
+04)------TableScan: data projection=[x]
+physical_plan
+01)ProjectionExec: expr=[min(async_abs(data.x))@1 as min(async_abs(data.x))]
+02)--AggregateExec: mode=FinalPartitioned, gby=[async_abs(data.x)@0 as async_abs(data.x)], aggr=[min(async_abs(data.x))]
+03)----RepartitionExec: partitioning=Hash([async_abs(data.x)@0], 4), input_partitions=4
+04)------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as async_abs(data.x)], aggr=[min(async_abs(data.x))]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------ProjectionExec: expr=[__async_fn_0@1 as __common_expr_1]
+07)------------AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))]
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Async udf can be used in filter
+query I
+select * from data where async_abs(x) < 5;
+----
+2
+
+query TT
+explain select * from data where async_abs(x) < 5;
+----
+logical_plan
+01)Filter: async_abs(data.x) < Int32(5)
+02)--TableScan: data projection=[x]
+physical_plan
+01)FilterExec: __async_fn_0@1 < 5, projection=[x@0]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Async udf can be used in projection
+query I rowsort
+select async_abs(x) from data;
+----
+10
+2
+
+query TT
+explain select async_abs(x) from data;
+----
+logical_plan
+01)Projection: async_abs(data.x)
+02)--TableScan: data projection=[x]
+physical_plan
+01)ProjectionExec: expr=[__async_fn_0@1 as async_abs(data.x)]
+02)--AsyncFuncExec: async_expr=[async_expr(name=__async_fn_0, expr=async_abs(x@0))]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt
index 4573af1d59b1b..2ad60c0082e87 100644
--- a/datafusion/sqllogictest/test_files/avro.slt
+++ b/datafusion/sqllogictest/test_files/avro.slt
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Currently, the avro not support Utf8View type, so we disable the map_varchar_to_utf8view
+# Currently, the avro not support Utf8View type, so we disable the map_string_types_to_utf8view
 # After https://github.com/apache/arrow-rs/issues/7262 released, we can remove this setting
 statement ok
-set datafusion.sql_parser.map_varchar_to_utf8view = false;
+set datafusion.sql_parser.map_string_types_to_utf8view = false;
 
 statement ok
 CREATE EXTERNAL TABLE alltypes_plain (
diff --git a/datafusion/sqllogictest/test_files/binary.slt b/datafusion/sqllogictest/test_files/binary.slt
index 1077c32e46f35..54ac51d9e780d 100644
--- a/datafusion/sqllogictest/test_files/binary.slt
+++ b/datafusion/sqllogictest/test_files/binary.slt
@@ -311,3 +311,13 @@ Foo foo Foo foo
 NULL NULL NULL NULL
 Bar Bar Bar Bar
 FooBar fooBar FooBar fooBar
+
+# show helpful error msg when Binary type is used with string functions
+query error DataFusion error: Error during planning: Function 'split_part' requires String, but received Binary \(DataType: Binary\)\.\n\nHint: Binary types are not automatically coerced to String\. Use CAST\(column AS VARCHAR\) to convert Binary data to String\.
+SELECT split_part(binary, '~', 2) FROM t WHERE binary IS NOT NULL LIMIT 1;
+
+# ensure the suggested CAST workaround works
+query T
+SELECT split_part(CAST(binary AS VARCHAR), 'o', 2) FROM t WHERE binary = X'466f6f';
+----
+(empty)
diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt
index 21913005e26ba..3953878ceb666 100644
--- a/datafusion/sqllogictest/test_files/case.slt
+++ b/datafusion/sqllogictest/test_files/case.slt
@@ -383,9 +383,10 @@ SELECT column2, column3, column4  FROM t;
 ----
 {foo: a, xxx: b} {xxx: c, foo: d} {xxx: e}
 
-# coerce structs with different field orders,
-# (note the *value*s are from column2 but the field name is 'xxx', as the coerced
-# type takes the field name from the last argument (column3)
+# coerce structs with different field orders
+# With name-based struct coercion, matching fields by name:
+# column2={foo:a, xxx:b} unified with column3={xxx:c, foo:d}
+# Result uses the THEN branch's field order (when executed): {xxx: b, foo: a}
 query ?
 SELECT
   case
@@ -394,9 +395,10 @@ SELECT
   end
 FROM t;
 ----
-{xxx: a, foo: b}
+{xxx: b, foo: a}
 
 # coerce structs with different field orders
+# When ELSE branch executes, uses its field order: {xxx: c, foo: d}
 query ?
 SELECT
   case
@@ -407,8 +409,9 @@ FROM t;
 ----
 {xxx: c, foo: d}
 
-# coerce structs with subset of fields
-query error Failed to coerce then
+# coerce structs with subset of fields - field count mismatch causes type coercion failure
+# column3 has 2 fields but column4 has only 1 field
+query error DataFusion error: type_coercion\ncaused by\nError during planning: Failed to coerce then .* and else .* to common types in CASE WHEN expression
 SELECT
   case
     when column1 > 0 then column3
@@ -467,6 +470,7 @@ FROM t;
 ----
 [{foo: blarg}]
 
+# mix of then and else
 query II
 SELECT v, CASE WHEN v != 0 THEN 10/v ELSE 42 END FROM (VALUES (0), (1), (2)) t(v)
 ----
@@ -474,11 +478,418 @@ SELECT v, CASE WHEN v != 0 THEN 10/v ELSE 42 END FROM (VALUES (0), (1), (2)) t(v
 1 10
 2 5
 
+# when expressions is always false, then branch should never be evaluated
 query II
 SELECT v, CASE WHEN v < 0 THEN 10/0 ELSE 1 END FROM (VALUES (1), (2)) t(v)
 ----
 1 1
 2 1
 
+# when expressions is always true, else branch should never be evaluated
+query II
+SELECT v, CASE WHEN v > 0 THEN 1 ELSE 10/0 END FROM (VALUES (1), (2)) t(v)
+----
+1 1
+2 1
+
+
+# lazy evaluation of multiple when branches, else branch should never be evaluated
+query II
+SELECT v, CASE WHEN v == 1 THEN -1 WHEN v == 2 THEN -2 WHEN v == 3 THEN -3  ELSE 10/0 END FROM (VALUES (1), (2), (3)) t(v)
+----
+1 -1
+2 -2
+3 -3
+
+# covers the InfallibleExprOrNull evaluation strategy
+query II
+SELECT v, CASE WHEN v THEN 1 END FROM (VALUES (1), (2), (3), (NULL)) t(v)
+----
+1 1
+2 1
+3 1
+NULL NULL
+
 statement ok
 drop table t
+
+query I
+SELECT case when true then 1 / 1 else 1 / 0 end;
+----
+1
+
+query I
+SELECT case when false then 1 / 0 else 1 / 1 end;
+----
+1
+
+# Else branch evaluation with case expression, 1 when branch, null input
+query I
+SELECT CASE a WHEN 'a' THEN 0 ELSE 1 END FROM (VALUES (NULL)) t(a)
+----
+1
+
+# Else branch evaluation with case expression, 2 when branches, null input
+query I
+SELECT CASE a WHEN 'a' THEN 0 WHEN 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL)) t(a)
+----
+2
+
+# Else branch evaluation without case expression, 1 when branch, null input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 ELSE 1 END FROM (VALUES (NULL)) t(a)
+----
+1
+
+# Else branch evaluation without case expression, 2 when branches, null input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL)) t(a)
+----
+2
+
+# Else branch evaluation with case expression, 1 when branch, non-null input
+query I
+SELECT CASE a WHEN 'a' THEN 0 ELSE 1 END FROM (VALUES ('z')) t(a)
+----
+1
+
+# Else branch evaluation with case expression, 2 when branches, non-null input
+query I
+SELECT CASE a WHEN 'a' THEN 0 WHEN 'b' THEN 1 ELSE 2 END FROM (VALUES ('z')) t(a)
+----
+2
+
+# Else branch evaluation without case expression, 1 when branch, non-null input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 ELSE 1 END FROM (VALUES ('z')) t(a)
+----
+1
+
+# Else branch evaluation without case expression, 2 when branches, non-null input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES ('z')) t(a)
+----
+2
+
+# Else branch evaluation with case expression, 1 when branch, mixed input
+query I
+SELECT CASE a WHEN 'a' THEN 0 ELSE 1 END FROM (VALUES (NULL), ('z')) t(a)
+----
+1
+1
+
+# Else branch evaluation with case expression, 2 when branches, mixed input
+query I
+SELECT CASE a WHEN 'a' THEN 0 WHEN 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL), ('z')) t(a)
+----
+2
+2
+
+# Else branch evaluation without case expression, 1 when branch, mixed input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 ELSE 1 END FROM (VALUES (NULL), ('z')) t(a)
+----
+1
+1
+
+# Else branch evaluation without case expression, 2 when branches, mixed input
+query I
+SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL), ('z')) t(a)
+----
+2
+2
+
+# The `WHEN 1/0` is not effectively reachable in this query and should never be executed
+query T
+SELECT CASE a WHEN 1 THEN 'a' WHEN 2 THEN 'b' WHEN 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a)
+----
+a
+b
+
+# The `WHEN 1/0` is not effectively reachable in this query and should never be executed
+query T
+SELECT CASE WHEN a = 1 THEN 'a' WHEN a = 2 THEN 'b' WHEN a = 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a)
+----
+a
+b
+
+# The `WHEN 1/0` is not effectively reachable in this query and should never be executed
+query T
+SELECT CASE WHEN a = 0 THEN 'a' WHEN 1 / a = 1 THEN 'b' ELSE 'c' END FROM (VALUES (0), (1), (2)) t(a)
+----
+a
+b
+c
+
+query I
+SELECT CASE WHEN d != 0 THEN n / d ELSE NULL END FROM (VALUES (1, 1), (1, 0), (1, -1)) t(n,d)
+----
+1
+NULL
+-1
+
+query I
+SELECT CASE WHEN d > 0 THEN n / d ELSE NULL END FROM (VALUES (1, 1), (1, 0), (1, -1)) t(n,d)
+----
+1
+NULL
+NULL
+
+query I
+SELECT CASE WHEN d < 0 THEN n / d ELSE NULL END FROM (VALUES (1, 1), (1, 0), (1, -1)) t(n,d)
+----
+NULL
+NULL
+-1
+
+# single WHEN, no ELSE (absent)
+query I
+SELECT CASE WHEN a > 0 THEN b END
+FROM (VALUES (1, 10), (0, 20)) AS t(a, b);
+----
+10
+NULL
+
+# single WHEN, explicit ELSE NULL
+query I
+SELECT CASE WHEN a > 0 THEN b ELSE NULL END
+FROM (VALUES (1, 10), (0, 20)) AS t(a, b);
+----
+10
+NULL
+
+# fallible THEN expression should only be evaluated on true rows
+query I
+SELECT CASE WHEN a > 0 THEN 10 / a END
+FROM (VALUES (1), (0)) AS t(a);
+----
+10
+NULL
+
+# all-false path returns typed NULLs
+query I
+SELECT CASE WHEN a < 0 THEN b END
+FROM (VALUES (1, 10), (2, 20)) AS t(a, b);
+----
+NULL
+NULL
+
+# EvalMethod::WithExpression using subset of all selected columns in case expression
+query III
+SELECT CASE a1 WHEN 1 THEN a1 WHEN 2 THEN a2 WHEN 3 THEN b END, b, c
+FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1 10 100
+2 20 200
+30 30 300
+
+# EvalMethod::NoExpression using subset of all selected columns in case expression
+query III
+SELECT CASE WHEN a1 = 1 THEN a2 WHEN a2 = 2 THEN a1 WHEN 3 THEN b END, b, c
+FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1 10 100
+2 20 200
+30 30 300
+
+# EvalMethod::ExpressionOrExpression using subset of all selected columns in case expression
+query III
+SELECT CASE WHEN a1 = 1 THEN a2 ELSE b END, b, c
+FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1 10 100
+20 20 200
+30 30 300
+
+# EvalMethod::WithExpression using all selected columns in case expression
+query I
+SELECT CASE a1 WHEN 1 THEN a1 WHEN 2 THEN a2 WHEN 3 THEN NULL END
+FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1
+2
+NULL
+
+# EvalMethod::NoExpression using all selected columns in case expression
+query I
+SELECT CASE WHEN a1 = 1 THEN a2 WHEN a2 = 2 THEN a1 WHEN 3 THEN NULL END
+FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1
+2
+NULL
+
+# EvalMethod::ExpressionOrExpression using all selected columns in case expression
+query I
+SELECT CASE WHEN a1 = 1 THEN a2 ELSE NULL END
+FROM (SELECT a as a1, a as a2 FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c));
+----
+1
+NULL
+NULL
+
+# Nested case with projection
+query III
+SELECT CASE WHEN a = -1 THEN b WHEN a = -2 THEN -b END, b, c
+FROM (
+  SELECT b, c, CASE WHEN a1 = 1 THEN -a2 WHEN a1 = 2 THEN -a1 END as a
+  FROM (SELECT a as a1, a as a2, b, c FROM (VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)) t(a, b, c))
+);
+----
+10 10 100
+-20 20 200
+NULL 30 300
+
+# Case-with-expression that was incorrectly classified as not-nullable, but evaluates to null
+query I
+SELECT CASE 0 WHEN 0 THEN NULL WHEN SUM(1) + COUNT(*) THEN 10 ELSE 20 END
+----
+NULL
+
+query TT
+EXPLAIN SELECT CASE WHEN CASE WHEN a IS NOT NULL THEN a ELSE 1 END IS NOT NULL THEN a ELSE 1 END FROM (
+    VALUES (10), (20), (30)
+  ) t(a);
+----
+logical_plan
+01)Projection: t.a AS CASE WHEN CASE WHEN t.a IS NOT NULL THEN t.a ELSE Int64(1) END IS NOT NULL THEN t.a ELSE Int64(1) END
+02)--SubqueryAlias: t
+03)----Projection: column1 AS a
+04)------Values: (Int64(10)), (Int64(20)), (Int64(30))
+physical_plan
+01)ProjectionExec: expr=[column1@0 as CASE WHEN CASE WHEN t.a IS NOT NULL THEN t.a ELSE Int64(1) END IS NOT NULL THEN t.a ELSE Int64(1) END]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+#####
+# CASE with literal characters (to test lookup table CASE optimization)
+#####
+statement ok
+create table source (letter varchar) as values ('a'), ('b'), (NULL), ('c'), ('a'), ('c'), ('d');
+
+# Table with different string types
+statement ok
+create table letters as
+select
+  arrow_cast(letter, 'Utf8') as letter_utf8,
+  arrow_cast(letter, 'LargeUtf8') as letter_large_utf8,
+  arrow_cast(letter, 'Utf8View') as letter_utf8_view,
+  arrow_cast(letter, 'Dictionary(Int32, Utf8)') as letter_string_dict,
+from source;
+
+
+query TIIIII
+select
+  letter_utf8 as letter
+  ,CASE letter_utf8       WHEN 'b' THEN 1 WHEN 'a' THEN 2 WHEN 'd' THEN 3 ELSE 0 END as utf8
+  ,CASE letter_large_utf8 WHEN 'b' THEN 1 WHEN 'a' THEN 2 WHEN 'd' THEN 3 ELSE 0 END as large_utf8
+  ,CASE letter_utf8_view  WHEN 'b' THEN 1 WHEN 'a' THEN 2 WHEN 'd' THEN 3 ELSE 0 END as utf8_view
+  ,CASE letter_string_dict WHEN 'b' THEN 1 WHEN 'a' THEN 2 WHEN 'd' THEN 3 ELSE 0 END as string_dict
+  ,CASE letter_utf8       WHEN 'b' THEN 1 WHEN NULL THEN 2 WHEN 'd' THEN 3 ELSE 0 END as utf8_with_null
+FROM letters;
+----
+a 2 2 2 2 0
+b 1 1 1 1 1
+NULL 0 0 0 0 0
+c 0 0 0 0 0
+a 2 2 2 2 0
+c 0 0 0 0 0
+d 3 3 3 3 3
+
+statement ok
+create table letters_binary as
+select
+  arrow_cast(letter, 'Binary') as letter_binary,
+  arrow_cast(letter, 'LargeBinary') as letter_large_binary,
+  arrow_cast(letter, 'BinaryView') as letter_binary_view,
+  arrow_cast(letter, 'Dictionary(Int32, Binary)') as letter_binary_dict,
+  arrow_cast(arrow_cast(letter, 'Binary'), 'FixedSizeBinary(1)') as letter_fsb
+from source;
+
+query ?IIIII
+select
+    letter_binary as letter
+    ,CASE letter_binary       WHEN X'62' THEN 1 WHEN X'61' THEN 2 WHEN X'64' THEN 3 ELSE 0 END as binary
+    ,CASE letter_large_binary WHEN X'62' THEN 1 WHEN X'61' THEN 2 WHEN X'64' THEN 3 ELSE 0 END as large_binary
+    ,CASE letter_binary_view  WHEN X'62' THEN 1 WHEN X'61' THEN 2 WHEN X'64' THEN 3 ELSE 0 END as binary_view
+    ,CASE letter_binary_dict  WHEN X'62' THEN 1 WHEN X'61' THEN 2 WHEN X'64' THEN 3 ELSE 0 END as binary_dict
+    ,CASE letter_fsb          WHEN X'62' THEN 1 WHEN X'61' THEN 2 WHEN X'64' THEN 3 ELSE 0 END as fsb
+FROM letters_binary;
+----
+61 2 2 2 2 2
+62 1 1 1 1 1
+NULL 0 0 0 0 0
+63 0 0 0 0 0
+61 2 2 2 2 2
+63 0 0 0 0 0
+64 3 3 3 3 3
+
+statement ok
+drop table source;
+
+
+statement ok
+drop table letters;
+
+statement ok
+drop table letters_binary;
+
+# Tests for CASE with boolean expressions
+statement ok
+create table booleans (b boolean) as values (true), (false), (null), (true), (null), (false);
+
+query BIII
+select
+    b as boolean_value
+    ,CASE b WHEN true THEN 1 WHEN false THEN 2 ELSE 0 END as boolean_case
+    ,CASE b WHEN false THEN 1 WHEN true THEN 2 ELSE 0 END as boolean_case_rev
+    ,CASE b WHEN true THEN 1 WHEN NULL THEN 2 WHEN false THEN 3 ELSE 0 END as boolean_with_nulls
+FROM booleans;
+----
+true 1 2 1
+false 2 1 3
+NULL 0 0 0
+true 1 2 1
+NULL 0 0 0
+false 2 1 3
+
+statement ok
+drop table booleans;
+
+# Tests for CASE with floating point literals
+statement ok
+create table float_source (f float) as values (1.0), (2.0), (null), (3.5), (2.0), (null);
+
+statement ok
+create table floats as
+select
+    arrow_cast(f, 'Float16') as f16,
+    arrow_cast(f, 'Float32') as f32,
+    arrow_cast(f, 'Float64') as f64,
+    arrow_cast(f, 'Dictionary(Int32, Float32)') as f32_dict,
+from float_source;
+
+query RTTTT
+select
+    f32 as float_value
+    ,CASE f16      WHEN 1.0 THEN 'one'  WHEN 3.5 THEN 'three_point_five' WHEN 2.0 THEN 'two' ELSE 'N/A' END as f16_case
+    ,CASE f32      WHEN 1.0 THEN 'one'  WHEN 3.5 THEN 'three_point_five' WHEN 2.0 THEN 'two' ELSE 'N/A' END as f32_case
+    ,CASE f64      WHEN 1.0 THEN 'one'  WHEN 3.5 THEN 'three_point_five' WHEN 2.0 THEN 'two' ELSE 'N/A' END as f64_case
+    ,CASE f32_dict WHEN 1.0 THEN 'one'  WHEN 3.5 THEN 'three_point_five' WHEN 2.0 THEN 'two' ELSE 'N/A' END as f32_dict_case
+FROM floats;
+----
+1 one one one one
+2 two two two two
+NULL N/A N/A N/A N/A
+3.5 three_point_five three_point_five three_point_five three_point_five
+2 two two two two
+NULL N/A N/A N/A N/A
+
+statement ok
+drop table float_source;
+
+statement ok
+drop table floats;
+
+#####
+# End of lookup table CASE tests
+#####
diff --git a/datafusion/sqllogictest/test_files/cast.slt b/datafusion/sqllogictest/test_files/cast.slt
index 3466354e54d71..916895b8be1eb 100644
--- a/datafusion/sqllogictest/test_files/cast.slt
+++ b/datafusion/sqllogictest/test_files/cast.slt
@@ -89,3 +89,39 @@ select * from t0 where v0<1e100;
 
 statement ok
 drop table t0;
+
+
+# ensure that automatically casting with "datafusion.optimizer.expand_views_at_output" does not
+# change the column name
+
+statement ok
+create table t(a int, b varchar);
+
+statement ok
+set datafusion.optimizer.expand_views_at_output = true;
+
+query TT
+explain select * from t;
+----
+logical_plan
+01)Projection: t.a, CAST(t.b AS LargeUtf8) AS b
+02)--TableScan: t projection=[a, b]
+physical_plan
+01)ProjectionExec: expr=[a@0 as a, CAST(b@1 AS LargeUtf8) as b]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+
+query TT
+explain select b from t;
+----
+logical_plan
+01)Projection: CAST(t.b AS LargeUtf8) AS b
+02)--TableScan: t projection=[b]
+physical_plan
+01)ProjectionExec: expr=[CAST(b@0 AS LargeUtf8) as b]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+
+statement ok
+set datafusion.optimizer.expand_views_at_output = false;
+
+statement ok
+drop table t;
diff --git a/datafusion/sqllogictest/test_files/clickbench.slt b/datafusion/sqllogictest/test_files/clickbench.slt
index 4c60a4365ee26..4e9849e3650a8 100644
--- a/datafusion/sqllogictest/test_files/clickbench.slt
+++ b/datafusion/sqllogictest/test_files/clickbench.slt
@@ -15,10 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 
-
-# This file contains the clickbench schema and queries
-# and the first 10 rows of data. Since ClickBench contains case sensitive queries
-# this is also a good test of that usecase too
+## Notes: This file contains the ClickBench schema and queries and the first 10
+## rows of data. Since ClickBench contains case sensitive identifiers (e.g.
+## "EventDate") this is also a good test of that usecase too
 
 # create.sql came from
 # https://github.com/ClickHouse/ClickBench/blob/8b9e3aa05ea18afa427f14909ddc678b8ef0d5e6/datafusion/create.sql
@@ -26,53 +25,230 @@
 # COPY (SELECT * FROM 'hits.parquet' LIMIT 10) TO 'clickbench_hits_10.parquet' (FORMAT PARQUET);
 
 statement ok
-CREATE EXTERNAL TABLE hits
+CREATE EXTERNAL TABLE hits_raw
 STORED AS PARQUET
 LOCATION '../core/tests/data/clickbench_hits_10.parquet';
 
+# ClickBench encodes EventDate as UInt16 days since epoch.
+# So we define this view to convert it to the correct DATE type (this is done
+# in the ClickBench runner as well, see https://github.com/ClickHouse/ClickBench/pull/803
+statement ok
+CREATE VIEW hits AS
+SELECT * EXCEPT ("EventDate"),
+       CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
+FROM hits_raw;
+
+# Verify EventDate transformation from UInt16 to DATE
+
+query D
+SELECT "EventDate" FROM hits LIMIT 1;
+----
+2013-07-15
+
+# Verify the raw value is still UInt16 in hits_raw
+query I
+SELECT "EventDate" FROM hits_raw LIMIT 1;
+----
+15901
 
 # queries.sql came from
 # https://github.com/ClickHouse/ClickBench/blob/8b9e3aa05ea18afa427f14909ddc678b8ef0d5e6/datafusion/queries.sql
 
+## Q0
+query TT
+EXPLAIN SELECT COUNT(*) FROM hits;
+----
+logical_plan
+01)Projection: count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+03)----SubqueryAlias: hits
+04)------TableScan: hits_raw projection=[]
+physical_plan
+01)ProjectionExec: expr=[10 as count(*)]
+02)--PlaceholderRowExec
+
 query I
 SELECT COUNT(*) FROM hits;
 ----
 10
 
+## Q1
+query TT
+EXPLAIN SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
+----
+logical_plan
+01)Projection: count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+03)----SubqueryAlias: hits
+04)------Projection:
+05)--------Filter: hits_raw.AdvEngineID != Int16(0)
+06)----------TableScan: hits_raw projection=[AdvEngineID], partial_filters=[hits_raw.AdvEngineID != Int16(0)]
+physical_plan
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
+02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
+05)--------FilterExec: AdvEngineID@0 != 0, projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@40 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), required_guarantees=[AdvEngineID not in (0)]
+
 query I
 SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
 ----
 0
 
+query TT
+EXPLAIN SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
+----
+logical_plan
+01)Projection: sum(hits.AdvEngineID), count(Int64(1)) AS count(*), avg(hits.ResolutionWidth)
+02)--Aggregate: groupBy=[[]], aggr=[[sum(CAST(hits.AdvEngineID AS Int64)), count(Int64(1)), avg(CAST(hits.ResolutionWidth AS Float64))]]
+03)----SubqueryAlias: hits
+04)------TableScan: hits_raw projection=[ResolutionWidth, AdvEngineID]
+physical_plan
+01)ProjectionExec: expr=[sum(hits.AdvEngineID)@0 as sum(hits.AdvEngineID), count(Int64(1))@1 as count(*), avg(hits.ResolutionWidth)@2 as avg(hits.ResolutionWidth)]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(hits.AdvEngineID), count(Int64(1)), avg(hits.ResolutionWidth)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[ResolutionWidth, AdvEngineID], file_type=parquet
+
 query IIR
 SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
 ----
 0 10 0
 
+## Q3
+query TT
+EXPLAIN SELECT AVG("UserID") FROM hits;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[avg(CAST(hits.UserID AS Float64))]]
+02)--SubqueryAlias: hits
+03)----TableScan: hits_raw projection=[UserID]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[avg(hits.UserID)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID], file_type=parquet
+
 query R
 SELECT AVG("UserID") FROM hits;
 ----
 -304548765855551740
 
+## Q4
+query TT
+EXPLAIN SELECT COUNT(DISTINCT "UserID") FROM hits;
+----
+logical_plan
+01)Projection: count(alias1) AS count(DISTINCT hits.UserID)
+02)--Aggregate: groupBy=[[]], aggr=[[count(alias1)]]
+03)----Aggregate: groupBy=[[hits.UserID AS alias1]], aggr=[[]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[UserID]
+physical_plan
+01)ProjectionExec: expr=[count(alias1)@0 as count(DISTINCT hits.UserID)]
+02)--AggregateExec: mode=Final, gby=[], aggr=[count(alias1)]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[count(alias1)]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[]
+06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[UserID@0 as alias1], aggr=[]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID], file_type=parquet
+
 query I
 SELECT COUNT(DISTINCT "UserID") FROM hits;
 ----
 5
 
+## Q5
+query TT
+EXPLAIN SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
+----
+logical_plan
+01)Projection: count(alias1) AS count(DISTINCT hits.SearchPhrase)
+02)--Aggregate: groupBy=[[]], aggr=[[count(alias1)]]
+03)----Aggregate: groupBy=[[hits.SearchPhrase AS alias1]], aggr=[[]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[SearchPhrase]
+physical_plan
+01)ProjectionExec: expr=[count(alias1)@0 as count(DISTINCT hits.SearchPhrase)]
+02)--AggregateExec: mode=Final, gby=[], aggr=[count(alias1)]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[count(alias1)]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[]
+06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[SearchPhrase@0 as alias1], aggr=[]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[SearchPhrase], file_type=parquet
+
 query I
 SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
 ----
 1
 
-query II
+## Q6
+query TT
+EXPLAIN SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[min(hits.EventDate), max(hits.EventDate)]]
+02)--SubqueryAlias: hits
+03)----Projection: CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) AS EventDate
+04)------TableScan: hits_raw projection=[EventDate]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[min(hits.EventDate), max(hits.EventDate)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[CAST(CAST(EventDate@5 AS Int32) AS Date32) as EventDate], file_type=parquet
+
+query DD
 SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
 ----
-15901 15901
+2013-07-15 2013-07-15
+
+## Q7
+query TT
+EXPLAIN SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
+----
+logical_plan
+01)Sort: count(*) DESC NULLS FIRST
+02)--Projection: hits.AdvEngineID, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[hits.AdvEngineID]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.AdvEngineID != Int16(0)
+06)----------TableScan: hits_raw projection=[AdvEngineID], partial_filters=[hits_raw.AdvEngineID != Int16(0)]
+physical_plan
+01)SortPreservingMergeExec: [count(*)@1 DESC]
+02)--SortExec: expr=[count(*)@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[AdvEngineID@0 as AdvEngineID, count(Int64(1))@1 as count(*)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([AdvEngineID@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[AdvEngineID@0 as AdvEngineID], aggr=[count(Int64(1))]
+07)------------FilterExec: AdvEngineID@0 != 0
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[AdvEngineID], file_type=parquet, predicate=AdvEngineID@40 != 0, pruning_predicate=AdvEngineID_null_count@2 != row_count@3 AND (AdvEngineID_min@0 != 0 OR 0 != AdvEngineID_max@1), required_guarantees=[AdvEngineID not in (0)]
 
 query II
 SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
 ----
 
+## Q8
+query TT
+EXPLAIN SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
+----
+logical_plan
+01)Sort: u DESC NULLS FIRST, fetch=10
+02)--Projection: hits.RegionID, count(alias1) AS u
+03)----Aggregate: groupBy=[[hits.RegionID]], aggr=[[count(alias1)]]
+04)------Aggregate: groupBy=[[hits.RegionID, hits.UserID AS alias1]], aggr=[[]]
+05)--------SubqueryAlias: hits
+06)----------TableScan: hits_raw projection=[RegionID, UserID]
+physical_plan
+01)SortPreservingMergeExec: [u@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[u@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[RegionID@0 as RegionID, count(alias1)@1 as u]
+04)------AggregateExec: mode=FinalPartitioned, gby=[RegionID@0 as RegionID], aggr=[count(alias1)]
+05)--------RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[count(alias1)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[RegionID@0 as RegionID, alias1@1 as alias1], aggr=[]
+08)--------------RepartitionExec: partitioning=Hash([RegionID@0, alias1@1], 4), input_partitions=1
+09)----------------AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID, UserID@1 as alias1], aggr=[]
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[RegionID, UserID], file_type=parquet
+
 query II rowsort
 SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
 ----
@@ -81,6 +257,25 @@ SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" O
 39 1
 839 2
 
+## Q9
+query TT
+EXPLAIN SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.RegionID, sum(hits.AdvEngineID), count(Int64(1)) AS count(*) AS c, avg(hits.ResolutionWidth), count(DISTINCT hits.UserID)
+03)----Aggregate: groupBy=[[hits.RegionID]], aggr=[[sum(CAST(hits.AdvEngineID AS Int64)), count(Int64(1)), avg(CAST(hits.ResolutionWidth AS Float64)), count(DISTINCT hits.UserID)]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[RegionID, UserID, ResolutionWidth, AdvEngineID]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[RegionID@0 as RegionID, sum(hits.AdvEngineID)@1 as sum(hits.AdvEngineID), count(Int64(1))@2 as c, avg(hits.ResolutionWidth)@3 as avg(hits.ResolutionWidth), count(DISTINCT hits.UserID)@4 as count(DISTINCT hits.UserID)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[RegionID@0 as RegionID], aggr=[sum(hits.AdvEngineID), count(Int64(1)), avg(hits.ResolutionWidth), count(DISTINCT hits.UserID)]
+05)--------RepartitionExec: partitioning=Hash([RegionID@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[RegionID@0 as RegionID], aggr=[sum(hits.AdvEngineID), count(Int64(1)), avg(hits.ResolutionWidth), count(DISTINCT hits.UserID)]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[RegionID, UserID, ResolutionWidth, AdvEngineID], file_type=parquet
+
 query IIIRI rowsort
 SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
 ----
@@ -89,26 +284,167 @@ SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), CO
 39 0 1 0 1
 839 0 6 0 2
 
+## Q10
+query TT
+EXPLAIN SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
+----
+logical_plan
+01)Sort: u DESC NULLS FIRST, fetch=10
+02)--Projection: hits.MobilePhoneModel, count(alias1) AS u
+03)----Aggregate: groupBy=[[hits.MobilePhoneModel]], aggr=[[count(alias1)]]
+04)------Aggregate: groupBy=[[hits.MobilePhoneModel, hits.UserID AS alias1]], aggr=[[]]
+05)--------SubqueryAlias: hits
+06)----------Filter: hits_raw.MobilePhoneModel != Utf8View("")
+07)------------TableScan: hits_raw projection=[UserID, MobilePhoneModel], partial_filters=[hits_raw.MobilePhoneModel != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [u@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[u@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[MobilePhoneModel@0 as MobilePhoneModel, count(alias1)@1 as u]
+04)------AggregateExec: mode=FinalPartitioned, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[count(alias1)]
+05)--------RepartitionExec: partitioning=Hash([MobilePhoneModel@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[MobilePhoneModel@0 as MobilePhoneModel], aggr=[count(alias1)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[MobilePhoneModel@0 as MobilePhoneModel, alias1@1 as alias1], aggr=[]
+08)--------------RepartitionExec: partitioning=Hash([MobilePhoneModel@0, alias1@1], 4), input_partitions=4
+09)----------------AggregateExec: mode=Partial, gby=[MobilePhoneModel@1 as MobilePhoneModel, UserID@0 as alias1], aggr=[]
+10)------------------FilterExec: MobilePhoneModel@1 != 
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID, MobilePhoneModel], file_type=parquet, predicate=MobilePhoneModel@34 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 !=  OR  != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()]
+
 query TI
 SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
 ----
 
+## Q11
+query TT
+EXPLAIN SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
+----
+logical_plan
+01)Sort: u DESC NULLS FIRST, fetch=10
+02)--Projection: hits.MobilePhone, hits.MobilePhoneModel, count(alias1) AS u
+03)----Aggregate: groupBy=[[hits.MobilePhone, hits.MobilePhoneModel]], aggr=[[count(alias1)]]
+04)------Aggregate: groupBy=[[hits.MobilePhone, hits.MobilePhoneModel, hits.UserID AS alias1]], aggr=[[]]
+05)--------SubqueryAlias: hits
+06)----------Filter: hits_raw.MobilePhoneModel != Utf8View("")
+07)------------TableScan: hits_raw projection=[UserID, MobilePhone, MobilePhoneModel], partial_filters=[hits_raw.MobilePhoneModel != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [u@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[u@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel, count(alias1)@2 as u]
+04)------AggregateExec: mode=FinalPartitioned, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[count(alias1)]
+05)--------RepartitionExec: partitioning=Hash([MobilePhone@0, MobilePhoneModel@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel], aggr=[count(alias1)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[MobilePhone@0 as MobilePhone, MobilePhoneModel@1 as MobilePhoneModel, alias1@2 as alias1], aggr=[]
+08)--------------RepartitionExec: partitioning=Hash([MobilePhone@0, MobilePhoneModel@1, alias1@2], 4), input_partitions=4
+09)----------------AggregateExec: mode=Partial, gby=[MobilePhone@1 as MobilePhone, MobilePhoneModel@2 as MobilePhoneModel, UserID@0 as alias1], aggr=[]
+10)------------------FilterExec: MobilePhoneModel@2 != 
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID, MobilePhone, MobilePhoneModel], file_type=parquet, predicate=MobilePhoneModel@34 != , pruning_predicate=MobilePhoneModel_null_count@2 != row_count@3 AND (MobilePhoneModel_min@0 !=  OR  != MobilePhoneModel_max@1), required_guarantees=[MobilePhoneModel not in ()]
+
 query ITI
 SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
 ----
 
+## Q12
+query TT
+EXPLAIN SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchPhrase, count(Int64(1)) AS count(*) AS c
+03)----Aggregate: groupBy=[[hits.SearchPhrase]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(Int64(1))@1 as c]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(Int64(1))]
+07)------------FilterExec: SearchPhrase@0 != 
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query TI
 SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
 ----
 
+## Q13
+query TT
+EXPLAIN SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
+----
+logical_plan
+01)Sort: u DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchPhrase, count(alias1) AS u
+03)----Aggregate: groupBy=[[hits.SearchPhrase]], aggr=[[count(alias1)]]
+04)------Aggregate: groupBy=[[hits.SearchPhrase, hits.UserID AS alias1]], aggr=[[]]
+05)--------SubqueryAlias: hits
+06)----------Filter: hits_raw.SearchPhrase != Utf8View("")
+07)------------TableScan: hits_raw projection=[UserID, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [u@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[u@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, count(alias1)@1 as u]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(alias1)]
+05)--------RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchPhrase@0 as SearchPhrase], aggr=[count(alias1)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase, alias1@1 as alias1], aggr=[]
+08)--------------RepartitionExec: partitioning=Hash([SearchPhrase@0, alias1@1], 4), input_partitions=4
+09)----------------AggregateExec: mode=Partial, gby=[SearchPhrase@1 as SearchPhrase, UserID@0 as alias1], aggr=[]
+10)------------------FilterExec: SearchPhrase@1 != 
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query TI
 SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
 ----
 
+## Q14
+query TT
+EXPLAIN SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchEngineID, hits.SearchPhrase, count(Int64(1)) AS count(*) AS c
+03)----Aggregate: groupBy=[[hits.SearchEngineID, hits.SearchPhrase]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[SearchEngineID, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase, count(Int64(1))@2 as c]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([SearchEngineID@0, SearchPhrase@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchEngineID@0 as SearchEngineID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+07)------------FilterExec: SearchPhrase@1 != 
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query ITI
 SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
 ----
 
+## Q15
+query TT
+EXPLAIN SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
+----
+logical_plan
+01)Sort: count(*) DESC NULLS FIRST, fetch=10
+02)--Projection: hits.UserID, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[hits.UserID]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[UserID]
+physical_plan
+01)SortPreservingMergeExec: [count(*)@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[count(*)@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[UserID@0 as UserID, count(Int64(1))@1 as count(*)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[UserID@0 as UserID], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([UserID@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[UserID@0 as UserID], aggr=[count(Int64(1))]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID], file_type=parquet
+
 query II rowsort
 SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
 ----
@@ -118,6 +454,25 @@ SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIM
 519640690937130534 2
 7418527520126366595 1
 
+## Q16
+query TT
+EXPLAIN SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
+----
+logical_plan
+01)Sort: count(*) DESC NULLS FIRST, fetch=10
+02)--Projection: hits.UserID, hits.SearchPhrase, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[hits.UserID, hits.SearchPhrase]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[UserID, SearchPhrase]
+physical_plan
+01)SortPreservingMergeExec: [count(*)@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[count(*)@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))@2 as count(*)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID, SearchPhrase], file_type=parquet
+
 query ITI rowsort
 SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
 ----
@@ -127,6 +482,24 @@ SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPh
 519640690937130534 (empty) 2
 7418527520126366595 (empty) 1
 
+## Q17
+query TT
+EXPLAIN SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
+----
+logical_plan
+01)Projection: hits.UserID, hits.SearchPhrase, count(Int64(1)) AS count(*)
+02)--Limit: skip=0, fetch=10
+03)----Aggregate: groupBy=[[hits.UserID, hits.SearchPhrase]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[UserID, SearchPhrase]
+physical_plan
+01)ProjectionExec: expr=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase, count(Int64(1))@2 as count(*)]
+02)--CoalescePartitionsExec: fetch=10
+03)----AggregateExec: mode=FinalPartitioned, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+04)------RepartitionExec: partitioning=Hash([UserID@0, SearchPhrase@1], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[UserID@0 as UserID, SearchPhrase@1 as SearchPhrase], aggr=[count(Int64(1))]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID, SearchPhrase], file_type=parquet
+
 query ITI rowsort
 SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
 ----
@@ -136,6 +509,25 @@ SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPh
 519640690937130534 (empty) 2
 7418527520126366595 (empty) 1
 
+## Q18
+query TT
+EXPLAIN SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
+----
+logical_plan
+01)Sort: count(*) DESC NULLS FIRST, fetch=10
+02)--Projection: hits.UserID, date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime)) AS m, hits.SearchPhrase, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[hits.UserID, date_part(Utf8("MINUTE"), to_timestamp_seconds(hits.EventTime)), hits.SearchPhrase]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[EventTime, UserID, SearchPhrase]
+physical_plan
+01)SortPreservingMergeExec: [count(*)@3 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[count(*)@3 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[UserID@0 as UserID, date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime))@1 as m, SearchPhrase@2 as SearchPhrase, count(Int64(1))@3 as count(*)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[UserID@0 as UserID, date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime))@1 as date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime)), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([UserID@0, date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime))@1, SearchPhrase@2], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[UserID@1 as UserID, date_part(MINUTE, to_timestamp_seconds(EventTime@0)) as date_part(Utf8("MINUTE"),to_timestamp_seconds(hits.EventTime)), SearchPhrase@2 as SearchPhrase], aggr=[count(Int64(1))]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventTime, UserID, SearchPhrase], file_type=parquet
+
 query IITI rowsort
 SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
 ----
@@ -150,60 +542,340 @@ SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "S
 519640690937130534 36 (empty) 1
 7418527520126366595 18 (empty) 1
 
+## Q19
+query TT
+EXPLAIN SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
+----
+logical_plan
+01)SubqueryAlias: hits
+02)--Filter: hits_raw.UserID = Int64(435090932899640449)
+03)----TableScan: hits_raw projection=[UserID], partial_filters=[hits_raw.UserID = Int64(435090932899640449)]
+physical_plan
+01)FilterExec: UserID@0 = 435090932899640449
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[UserID], file_type=parquet, predicate=UserID@9 = 435090932899640449, pruning_predicate=UserID_null_count@2 != row_count@3 AND UserID_min@0 <= 435090932899640449 AND 435090932899640449 <= UserID_max@1, required_guarantees=[UserID in (435090932899640449)]
+
 query I
 SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
 ----
 
+## Q20
+query TT
+EXPLAIN SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
+----
+logical_plan
+01)Projection: count(Int64(1)) AS count(*)
+02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
+03)----SubqueryAlias: hits
+04)------Projection:
+05)--------Filter: hits_raw.URL LIKE Utf8View("%google%")
+06)----------TableScan: hits_raw projection=[URL], partial_filters=[hits_raw.URL LIKE Utf8View("%google%")]
+physical_plan
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
+02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
+05)--------FilterExec: URL@0 LIKE %google%, projection=[]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[URL], file_type=parquet, predicate=URL@13 LIKE %google%
+
 query I
 SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
 ----
 0
 
+## Q21
+query TT
+EXPLAIN SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchPhrase, min(hits.URL), count(Int64(1)) AS count(*) AS c
+03)----Aggregate: groupBy=[[hits.SearchPhrase]], aggr=[[min(hits.URL), count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.URL LIKE Utf8View("%google%") AND hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[URL, SearchPhrase], partial_filters=[hits_raw.URL LIKE Utf8View("%google%"), hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, min(hits.URL)@1 as min(hits.URL), count(Int64(1))@2 as c]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[min(hits.URL), count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchPhrase@1 as SearchPhrase], aggr=[min(hits.URL), count(Int64(1))]
+07)------------FilterExec: URL@0 LIKE %google% AND SearchPhrase@1 != 
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[URL, SearchPhrase], file_type=parquet, predicate=URL@13 LIKE %google% AND SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@4 != row_count@5 AND (SearchPhrase_min@2 !=  OR  != SearchPhrase_max@3), required_guarantees=[SearchPhrase not in ()]
+
 query TTI
 SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
 ----
 
+## Q22
+query TT
+EXPLAIN SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchPhrase, min(hits.URL), min(hits.Title), count(Int64(1)) AS count(*) AS c, count(DISTINCT hits.UserID)
+03)----Aggregate: groupBy=[[hits.SearchPhrase]], aggr=[[min(hits.URL), min(hits.Title), count(Int64(1)), count(DISTINCT hits.UserID)]]
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.Title LIKE Utf8View("%Google%") AND hits_raw.URL NOT LIKE Utf8View("%.google.%") AND hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[Title, UserID, URL, SearchPhrase], partial_filters=[hits_raw.Title LIKE Utf8View("%Google%"), hits_raw.URL NOT LIKE Utf8View("%.google.%"), hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@3 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@3 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase, min(hits.URL)@1 as min(hits.URL), min(hits.Title)@2 as min(hits.Title), count(Int64(1))@3 as c, count(DISTINCT hits.UserID)@4 as count(DISTINCT hits.UserID)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchPhrase@0 as SearchPhrase], aggr=[min(hits.URL), min(hits.Title), count(Int64(1)), count(DISTINCT hits.UserID)]
+05)--------RepartitionExec: partitioning=Hash([SearchPhrase@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchPhrase@3 as SearchPhrase], aggr=[min(hits.URL), min(hits.Title), count(Int64(1)), count(DISTINCT hits.UserID)]
+07)------------FilterExec: Title@0 LIKE %Google% AND URL@2 NOT LIKE %.google.% AND SearchPhrase@3 != 
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[Title, UserID, URL, SearchPhrase], file_type=parquet, predicate=Title@2 LIKE %Google% AND URL@13 NOT LIKE %.google.% AND SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@6 != row_count@7 AND (SearchPhrase_min@4 !=  OR  != SearchPhrase_max@5), required_guarantees=[SearchPhrase not in ()]
+
 query TTTII
 SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
 ----
 
-query IITIIIIIIIIIITTIIIIIIIIIITIIITIIIITTIIITIIIIIIIIIITIIIIITIIIIIITIIIIIIIIIITTTTIIIIIIIITITTITTTTTTTTTTIIII
+## Q23
+query TT
+EXPLAIN SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
+----
+logical_plan
+01)Sort: hits.EventTime ASC NULLS LAST, fetch=10
+02)--SubqueryAlias: hits
+03)----Projection: hits_raw.WatchID, hits_raw.JavaEnable, hits_raw.Title, hits_raw.GoodEvent, hits_raw.EventTime, hits_raw.CounterID, hits_raw.ClientIP, hits_raw.RegionID, hits_raw.UserID, hits_raw.CounterClass, hits_raw.OS, hits_raw.UserAgent, hits_raw.URL, hits_raw.Referer, hits_raw.IsRefresh, hits_raw.RefererCategoryID, hits_raw.RefererRegionID, hits_raw.URLCategoryID, hits_raw.URLRegionID, hits_raw.ResolutionWidth, hits_raw.ResolutionHeight, hits_raw.ResolutionDepth, hits_raw.FlashMajor, hits_raw.FlashMinor, hits_raw.FlashMinor2, hits_raw.NetMajor, hits_raw.NetMinor, hits_raw.UserAgentMajor, hits_raw.UserAgentMinor, hits_raw.CookieEnable, hits_raw.JavascriptEnable, hits_raw.IsMobile, hits_raw.MobilePhone, hits_raw.MobilePhoneModel, hits_raw.Params, hits_raw.IPNetworkID, hits_raw.TraficSourceID, hits_raw.SearchEngineID, hits_raw.SearchPhrase, hits_raw.AdvEngineID, hits_raw.IsArtifical, hits_raw.WindowClientWidth, hits_raw.WindowClientHeight, hits_raw.ClientTimeZone, hits_raw.ClientEventTime, hits_raw.SilverlightVersion1, hits_raw.SilverlightVersion2, hits_raw.SilverlightVersion3, hits_raw.SilverlightVersion4, hits_raw.PageCharset, hits_raw.CodeVersion, hits_raw.IsLink, hits_raw.IsDownload, hits_raw.IsNotBounce, hits_raw.FUniqID, hits_raw.OriginalURL, hits_raw.HID, hits_raw.IsOldCounter, hits_raw.IsEvent, hits_raw.IsParameter, hits_raw.DontCountHits, hits_raw.WithHash, hits_raw.HitColor, hits_raw.LocalEventTime, hits_raw.Age, hits_raw.Sex, hits_raw.Income, hits_raw.Interests, hits_raw.Robotness, hits_raw.RemoteIP, hits_raw.WindowName, hits_raw.OpenerName, hits_raw.HistoryLength, hits_raw.BrowserLanguage, hits_raw.BrowserCountry, hits_raw.SocialNetwork, hits_raw.SocialAction, hits_raw.HTTPError, hits_raw.SendTiming, hits_raw.DNSTiming, hits_raw.ConnectTiming, hits_raw.ResponseStartTiming, hits_raw.ResponseEndTiming, hits_raw.FetchTiming, hits_raw.SocialSourceNetworkID, hits_raw.SocialSourcePage, hits_raw.ParamPrice, hits_raw.ParamOrderID, hits_raw.ParamCurrency, hits_raw.ParamCurrencyID, hits_raw.OpenstatServiceName, hits_raw.OpenstatCampaignID, hits_raw.OpenstatAdID, hits_raw.OpenstatSourceID, hits_raw.UTMSource, hits_raw.UTMMedium, hits_raw.UTMCampaign, hits_raw.UTMContent, hits_raw.UTMTerm, hits_raw.FromTag, hits_raw.HasGCLID, hits_raw.RefererHash, hits_raw.URLHash, hits_raw.CLID, CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) AS EventDate
+04)------Filter: hits_raw.URL LIKE Utf8View("%google%")
+05)--------TableScan: hits_raw projection=[WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID], partial_filters=[hits_raw.URL LIKE Utf8View("%google%")]
+physical_plan
+01)SortPreservingMergeExec: [EventTime@4 ASC NULLS LAST], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[EventTime@4 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[WatchID@0 as WatchID, JavaEnable@1 as JavaEnable, Title@2 as Title, GoodEvent@3 as GoodEvent, EventTime@4 as EventTime, CounterID@6 as CounterID, ClientIP@7 as ClientIP, RegionID@8 as RegionID, UserID@9 as UserID, CounterClass@10 as CounterClass, OS@11 as OS, UserAgent@12 as UserAgent, URL@13 as URL, Referer@14 as Referer, IsRefresh@15 as IsRefresh, RefererCategoryID@16 as RefererCategoryID, RefererRegionID@17 as RefererRegionID, URLCategoryID@18 as URLCategoryID, URLRegionID@19 as URLRegionID, ResolutionWidth@20 as ResolutionWidth, ResolutionHeight@21 as ResolutionHeight, ResolutionDepth@22 as ResolutionDepth, FlashMajor@23 as FlashMajor, FlashMinor@24 as FlashMinor, FlashMinor2@25 as FlashMinor2, NetMajor@26 as NetMajor, NetMinor@27 as NetMinor, UserAgentMajor@28 as UserAgentMajor, UserAgentMinor@29 as UserAgentMinor, CookieEnable@30 as CookieEnable, JavascriptEnable@31 as JavascriptEnable, IsMobile@32 as IsMobile, MobilePhone@33 as MobilePhone, MobilePhoneModel@34 as MobilePhoneModel, Params@35 as Params, IPNetworkID@36 as IPNetworkID, TraficSourceID@37 as TraficSourceID, SearchEngineID@38 as SearchEngineID, SearchPhrase@39 as SearchPhrase, AdvEngineID@40 as AdvEngineID, IsArtifical@41 as IsArtifical, WindowClientWidth@42 as WindowClientWidth, WindowClientHeight@43 as WindowClientHeight, ClientTimeZone@44 as ClientTimeZone, ClientEventTime@45 as ClientEventTime, SilverlightVersion1@46 as SilverlightVersion1, SilverlightVersion2@47 as SilverlightVersion2, SilverlightVersion3@48 as SilverlightVersion3, SilverlightVersion4@49 as SilverlightVersion4, PageCharset@50 as PageCharset, CodeVersion@51 as CodeVersion, IsLink@52 as IsLink, IsDownload@53 as IsDownload, IsNotBounce@54 as IsNotBounce, FUniqID@55 as FUniqID, OriginalURL@56 as OriginalURL, HID@57 as HID, IsOldCounter@58 as IsOldCounter, IsEvent@59 as IsEvent, IsParameter@60 as IsParameter, DontCountHits@61 as DontCountHits, WithHash@62 as WithHash, HitColor@63 as HitColor, LocalEventTime@64 as LocalEventTime, Age@65 as Age, Sex@66 as Sex, Income@67 as Income, Interests@68 as Interests, Robotness@69 as Robotness, RemoteIP@70 as RemoteIP, WindowName@71 as WindowName, OpenerName@72 as OpenerName, HistoryLength@73 as HistoryLength, BrowserLanguage@74 as BrowserLanguage, BrowserCountry@75 as BrowserCountry, SocialNetwork@76 as SocialNetwork, SocialAction@77 as SocialAction, HTTPError@78 as HTTPError, SendTiming@79 as SendTiming, DNSTiming@80 as DNSTiming, ConnectTiming@81 as ConnectTiming, ResponseStartTiming@82 as ResponseStartTiming, ResponseEndTiming@83 as ResponseEndTiming, FetchTiming@84 as FetchTiming, SocialSourceNetworkID@85 as SocialSourceNetworkID, SocialSourcePage@86 as SocialSourcePage, ParamPrice@87 as ParamPrice, ParamOrderID@88 as ParamOrderID, ParamCurrency@89 as ParamCurrency, ParamCurrencyID@90 as ParamCurrencyID, OpenstatServiceName@91 as OpenstatServiceName, OpenstatCampaignID@92 as OpenstatCampaignID, OpenstatAdID@93 as OpenstatAdID, OpenstatSourceID@94 as OpenstatSourceID, UTMSource@95 as UTMSource, UTMMedium@96 as UTMMedium, UTMCampaign@97 as UTMCampaign, UTMContent@98 as UTMContent, UTMTerm@99 as UTMTerm, FromTag@100 as FromTag, HasGCLID@101 as HasGCLID, RefererHash@102 as RefererHash, URLHash@103 as URLHash, CLID@104 as CLID, CAST(CAST(EventDate@5 AS Int32) AS Date32) as EventDate]
+04)------FilterExec: URL@13 LIKE %google%
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID], file_type=parquet, predicate=URL@13 LIKE %google% AND DynamicFilter [ empty ]
+
+query IITIIIIIIIIITTIIIIIIIIIITIIITIIIITTIIITIIIIIIIIIITIIIIITIIIIIITIIIIIIIIIITTTTIIIIIIIITITTITTTTTTTTTTIIIID
 SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
 ----
 
+## Q24
+query TT
+EXPLAIN SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
+----
+logical_plan
+01)Projection: hits.SearchPhrase
+02)--Sort: hits.EventTime ASC NULLS LAST, fetch=10
+03)----Projection: hits.SearchPhrase, hits.EventTime
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[EventTime, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase]
+02)--SortPreservingMergeExec: [EventTime@1 ASC NULLS LAST], fetch=10
+03)----SortExec: TopK(fetch=10), expr=[EventTime@1 ASC NULLS LAST], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[SearchPhrase@1 as SearchPhrase, EventTime@0 as EventTime]
+05)--------FilterExec: SearchPhrase@1 != 
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventTime, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 !=  AND DynamicFilter [ empty ], pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query T
 SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
 ----
 
+## Q25
+query TT
+EXPLAIN SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
+----
+logical_plan
+01)Sort: hits.SearchPhrase ASC NULLS LAST, fetch=10
+02)--SubqueryAlias: hits
+03)----Filter: hits_raw.SearchPhrase != Utf8View("")
+04)------TableScan: hits_raw projection=[SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [SearchPhrase@0 ASC NULLS LAST], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[SearchPhrase@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----FilterExec: SearchPhrase@0 != 
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 !=  AND DynamicFilter [ empty ], pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query T
 SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
 ----
 
+## Q26
+query TT
+EXPLAIN SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
+----
+logical_plan
+01)Projection: hits.SearchPhrase
+02)--Sort: hits.EventTime ASC NULLS LAST, hits.SearchPhrase ASC NULLS LAST, fetch=10
+03)----Projection: hits.SearchPhrase, hits.EventTime
+04)------SubqueryAlias: hits
+05)--------Filter: hits_raw.SearchPhrase != Utf8View("")
+06)----------TableScan: hits_raw projection=[EventTime, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)ProjectionExec: expr=[SearchPhrase@0 as SearchPhrase]
+02)--SortPreservingMergeExec: [EventTime@1 ASC NULLS LAST, SearchPhrase@0 ASC NULLS LAST], fetch=10
+03)----SortExec: TopK(fetch=10), expr=[EventTime@1 ASC NULLS LAST, SearchPhrase@0 ASC NULLS LAST], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[SearchPhrase@1 as SearchPhrase, EventTime@0 as EventTime]
+05)--------FilterExec: SearchPhrase@1 != 
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventTime, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 !=  AND DynamicFilter [ empty ], pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query T
 SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
 ----
 
+## Q27
+query TT
+EXPLAIN SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+----
+logical_plan
+01)Sort: l DESC NULLS FIRST, fetch=25
+02)--Projection: hits.CounterID, avg(length(hits.URL)) AS l, count(Int64(1)) AS count(*) AS c
+03)----Filter: count(Int64(1)) > Int64(100000)
+04)------Aggregate: groupBy=[[hits.CounterID]], aggr=[[avg(CAST(character_length(hits.URL) AS length(hits.URL) AS Float64)), count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Filter: hits_raw.URL != Utf8View("")
+07)------------TableScan: hits_raw projection=[CounterID, URL], partial_filters=[hits_raw.URL != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [l@1 DESC], fetch=25
+02)--SortExec: TopK(fetch=25), expr=[l@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[CounterID@0 as CounterID, avg(length(hits.URL))@1 as l, count(Int64(1))@2 as c]
+04)------FilterExec: count(Int64(1))@2 > 100000
+05)--------AggregateExec: mode=FinalPartitioned, gby=[CounterID@0 as CounterID], aggr=[avg(length(hits.URL)), count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([CounterID@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[CounterID@0 as CounterID], aggr=[avg(length(hits.URL)), count(Int64(1))]
+08)--------------FilterExec: URL@1 != 
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[CounterID, URL], file_type=parquet, predicate=URL@13 != , pruning_predicate=URL_null_count@2 != row_count@3 AND (URL_min@0 !=  OR  != URL_max@1), required_guarantees=[URL not in ()]
+
 query IRI
 SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
 ----
 
+## Q28
+query TT
+EXPLAIN SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+----
+logical_plan
+01)Sort: l DESC NULLS FIRST, fetch=25
+02)--Projection: regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1")) AS k, avg(length(hits.Referer)) AS l, count(Int64(1)) AS count(*) AS c, min(hits.Referer)
+03)----Filter: count(Int64(1)) > Int64(100000)
+04)------Aggregate: groupBy=[[regexp_replace(hits.Referer, Utf8View("^https?://(?:www\.)?([^/]+)/.*$"), Utf8View("\1")) AS regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))]], aggr=[[avg(CAST(character_length(hits.Referer) AS length(hits.Referer) AS Float64)), count(Int64(1)), min(hits.Referer)]]
+05)--------SubqueryAlias: hits
+06)----------Filter: hits_raw.Referer != Utf8View("")
+07)------------TableScan: hits_raw projection=[Referer], partial_filters=[hits_raw.Referer != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [l@1 DESC], fetch=25
+02)--SortExec: TopK(fetch=25), expr=[l@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))@0 as k, avg(length(hits.Referer))@1 as l, count(Int64(1))@2 as c, min(hits.Referer)@3 as min(hits.Referer)]
+04)------FilterExec: count(Int64(1))@2 > 100000
+05)--------AggregateExec: mode=FinalPartitioned, gby=[regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))@0 as regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))], aggr=[avg(length(hits.Referer)), count(Int64(1)), min(hits.Referer)]
+06)----------RepartitionExec: partitioning=Hash([regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[regexp_replace(Referer@0, ^https?://(?:www\.)?([^/]+)/.*$, \1) as regexp_replace(hits.Referer,Utf8("^https?://(?:www\.)?([^/]+)/.*$"),Utf8("\1"))], aggr=[avg(length(hits.Referer)), count(Int64(1)), min(hits.Referer)]
+08)--------------FilterExec: Referer@0 != 
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[Referer], file_type=parquet, predicate=Referer@14 != , pruning_predicate=Referer_null_count@2 != row_count@3 AND (Referer_min@0 !=  OR  != Referer_max@1), required_guarantees=[Referer not in ()]
+
 query TRIT
 SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
 ----
 
+## Q29
+query TT
+EXPLAIN SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
+----
+logical_plan
+01)Projection: sum(hits.ResolutionWidth), sum(hits.ResolutionWidth) + __common_expr_1 AS sum(hits.ResolutionWidth + Int64(1)), sum(hits.ResolutionWidth) + Int64(2) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(2)), sum(hits.ResolutionWidth) + Int64(3) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(3)), sum(hits.ResolutionWidth) + Int64(4) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(4)), sum(hits.ResolutionWidth) + Int64(5) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(5)), sum(hits.ResolutionWidth) + Int64(6) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(6)), sum(hits.ResolutionWidth) + Int64(7) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(7)), sum(hits.ResolutionWidth) + Int64(8) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(8)), sum(hits.ResolutionWidth) + Int64(9) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(9)), sum(hits.ResolutionWidth) + Int64(10) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(10)), sum(hits.ResolutionWidth) + Int64(11) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(11)), sum(hits.ResolutionWidth) + Int64(12) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(12)), sum(hits.ResolutionWidth) + Int64(13) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(13)), sum(hits.ResolutionWidth) + Int64(14) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(14)), sum(hits.ResolutionWidth) + Int64(15) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(15)), sum(hits.ResolutionWidth) + Int64(16) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(16)), sum(hits.ResolutionWidth) + Int64(17) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(17)), sum(hits.ResolutionWidth) + Int64(18) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(18)), sum(hits.ResolutionWidth) + Int64(19) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(19)), sum(hits.ResolutionWidth) + Int64(20) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(20)), sum(hits.ResolutionWidth) + Int64(21) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(21)), sum(hits.ResolutionWidth) + Int64(22) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(22)), sum(hits.ResolutionWidth) + Int64(23) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(23)), sum(hits.ResolutionWidth) + Int64(24) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(24)), sum(hits.ResolutionWidth) + Int64(25) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(25)), sum(hits.ResolutionWidth) + Int64(26) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(26)), sum(hits.ResolutionWidth) + Int64(27) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(27)), sum(hits.ResolutionWidth) + Int64(28) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(28)), sum(hits.ResolutionWidth) + Int64(29) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(29)), sum(hits.ResolutionWidth) + Int64(30) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(30)), sum(hits.ResolutionWidth) + Int64(31) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(31)), sum(hits.ResolutionWidth) + Int64(32) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(32)), sum(hits.ResolutionWidth) + Int64(33) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(33)), sum(hits.ResolutionWidth) + Int64(34) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(34)), sum(hits.ResolutionWidth) + Int64(35) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(35)), sum(hits.ResolutionWidth) + Int64(36) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(36)), sum(hits.ResolutionWidth) + Int64(37) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(37)), sum(hits.ResolutionWidth) + Int64(38) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(38)), sum(hits.ResolutionWidth) + Int64(39) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(39)), sum(hits.ResolutionWidth) + Int64(40) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(40)), sum(hits.ResolutionWidth) + Int64(41) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(41)), sum(hits.ResolutionWidth) + Int64(42) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(42)), sum(hits.ResolutionWidth) + Int64(43) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(43)), sum(hits.ResolutionWidth) + Int64(44) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(44)), sum(hits.ResolutionWidth) + Int64(45) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(45)), sum(hits.ResolutionWidth) + Int64(46) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(46)), sum(hits.ResolutionWidth) + Int64(47) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(47)), sum(hits.ResolutionWidth) + Int64(48) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(48)), sum(hits.ResolutionWidth) + Int64(49) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(49)), sum(hits.ResolutionWidth) + Int64(50) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(50)), sum(hits.ResolutionWidth) + Int64(51) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(51)), sum(hits.ResolutionWidth) + Int64(52) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(52)), sum(hits.ResolutionWidth) + Int64(53) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(53)), sum(hits.ResolutionWidth) + Int64(54) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(54)), sum(hits.ResolutionWidth) + Int64(55) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(55)), sum(hits.ResolutionWidth) + Int64(56) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(56)), sum(hits.ResolutionWidth) + Int64(57) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(57)), sum(hits.ResolutionWidth) + Int64(58) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(58)), sum(hits.ResolutionWidth) + Int64(59) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(59)), sum(hits.ResolutionWidth) + Int64(60) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(60)), sum(hits.ResolutionWidth) + Int64(61) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(61)), sum(hits.ResolutionWidth) + Int64(62) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(62)), sum(hits.ResolutionWidth) + Int64(63) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(63)), sum(hits.ResolutionWidth) + Int64(64) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(64)), sum(hits.ResolutionWidth) + Int64(65) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(65)), sum(hits.ResolutionWidth) + Int64(66) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(66)), sum(hits.ResolutionWidth) + Int64(67) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(67)), sum(hits.ResolutionWidth) + Int64(68) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(68)), sum(hits.ResolutionWidth) + Int64(69) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(69)), sum(hits.ResolutionWidth) + Int64(70) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(70)), sum(hits.ResolutionWidth) + Int64(71) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(71)), sum(hits.ResolutionWidth) + Int64(72) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(72)), sum(hits.ResolutionWidth) + Int64(73) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(73)), sum(hits.ResolutionWidth) + Int64(74) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(74)), sum(hits.ResolutionWidth) + Int64(75) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(75)), sum(hits.ResolutionWidth) + Int64(76) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(76)), sum(hits.ResolutionWidth) + Int64(77) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(77)), sum(hits.ResolutionWidth) + Int64(78) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(78)), sum(hits.ResolutionWidth) + Int64(79) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(79)), sum(hits.ResolutionWidth) + Int64(80) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(80)), sum(hits.ResolutionWidth) + Int64(81) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(81)), sum(hits.ResolutionWidth) + Int64(82) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(82)), sum(hits.ResolutionWidth) + Int64(83) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(83)), sum(hits.ResolutionWidth) + Int64(84) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(84)), sum(hits.ResolutionWidth) + Int64(85) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(85)), sum(hits.ResolutionWidth) + Int64(86) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(86)), sum(hits.ResolutionWidth) + Int64(87) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(87)), sum(hits.ResolutionWidth) + Int64(88) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(88)), sum(hits.ResolutionWidth) + Int64(89) * __common_expr_1 AS sum(hits.ResolutionWidth + Int64(89))
+02)--Projection: CAST(count(hits.ResolutionWidth) AS Int64) AS __common_expr_1, sum(hits.ResolutionWidth)
+03)----Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_2 AS hits.ResolutionWidth), count(__common_expr_2 AS hits.ResolutionWidth)]]
+04)------Projection: CAST(hits.ResolutionWidth AS Int64) AS __common_expr_2
+05)--------SubqueryAlias: hits
+06)----------TableScan: hits_raw projection=[ResolutionWidth]
+physical_plan
+01)ProjectionExec: expr=[sum(hits.ResolutionWidth)@0 as sum(hits.ResolutionWidth), sum(hits.ResolutionWidth)@0 + count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(1)), sum(hits.ResolutionWidth)@0 + 2 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(2)), sum(hits.ResolutionWidth)@0 + 3 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(3)), sum(hits.ResolutionWidth)@0 + 4 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(4)), sum(hits.ResolutionWidth)@0 + 5 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(5)), sum(hits.ResolutionWidth)@0 + 6 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(6)), sum(hits.ResolutionWidth)@0 + 7 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(7)), sum(hits.ResolutionWidth)@0 + 8 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(8)), sum(hits.ResolutionWidth)@0 + 9 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(9)), sum(hits.ResolutionWidth)@0 + 10 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(10)), sum(hits.ResolutionWidth)@0 + 11 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(11)), sum(hits.ResolutionWidth)@0 + 12 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(12)), sum(hits.ResolutionWidth)@0 + 13 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(13)), sum(hits.ResolutionWidth)@0 + 14 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(14)), sum(hits.ResolutionWidth)@0 + 15 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(15)), sum(hits.ResolutionWidth)@0 + 16 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(16)), sum(hits.ResolutionWidth)@0 + 17 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(17)), sum(hits.ResolutionWidth)@0 + 18 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(18)), sum(hits.ResolutionWidth)@0 + 19 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(19)), sum(hits.ResolutionWidth)@0 + 20 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(20)), sum(hits.ResolutionWidth)@0 + 21 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(21)), sum(hits.ResolutionWidth)@0 + 22 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(22)), sum(hits.ResolutionWidth)@0 + 23 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(23)), sum(hits.ResolutionWidth)@0 + 24 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(24)), sum(hits.ResolutionWidth)@0 + 25 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(25)), sum(hits.ResolutionWidth)@0 + 26 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(26)), sum(hits.ResolutionWidth)@0 + 27 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(27)), sum(hits.ResolutionWidth)@0 + 28 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(28)), sum(hits.ResolutionWidth)@0 + 29 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(29)), sum(hits.ResolutionWidth)@0 + 30 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(30)), sum(hits.ResolutionWidth)@0 + 31 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(31)), sum(hits.ResolutionWidth)@0 + 32 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(32)), sum(hits.ResolutionWidth)@0 + 33 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(33)), sum(hits.ResolutionWidth)@0 + 34 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(34)), sum(hits.ResolutionWidth)@0 + 35 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(35)), sum(hits.ResolutionWidth)@0 + 36 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(36)), sum(hits.ResolutionWidth)@0 + 37 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(37)), sum(hits.ResolutionWidth)@0 + 38 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(38)), sum(hits.ResolutionWidth)@0 + 39 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(39)), sum(hits.ResolutionWidth)@0 + 40 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(40)), sum(hits.ResolutionWidth)@0 + 41 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(41)), sum(hits.ResolutionWidth)@0 + 42 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(42)), sum(hits.ResolutionWidth)@0 + 43 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(43)), sum(hits.ResolutionWidth)@0 + 44 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(44)), sum(hits.ResolutionWidth)@0 + 45 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(45)), sum(hits.ResolutionWidth)@0 + 46 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(46)), sum(hits.ResolutionWidth)@0 + 47 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(47)), sum(hits.ResolutionWidth)@0 + 48 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(48)), sum(hits.ResolutionWidth)@0 + 49 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(49)), sum(hits.ResolutionWidth)@0 + 50 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(50)), sum(hits.ResolutionWidth)@0 + 51 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(51)), sum(hits.ResolutionWidth)@0 + 52 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(52)), sum(hits.ResolutionWidth)@0 + 53 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(53)), sum(hits.ResolutionWidth)@0 + 54 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(54)), sum(hits.ResolutionWidth)@0 + 55 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(55)), sum(hits.ResolutionWidth)@0 + 56 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(56)), sum(hits.ResolutionWidth)@0 + 57 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(57)), sum(hits.ResolutionWidth)@0 + 58 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(58)), sum(hits.ResolutionWidth)@0 + 59 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(59)), sum(hits.ResolutionWidth)@0 + 60 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(60)), sum(hits.ResolutionWidth)@0 + 61 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(61)), sum(hits.ResolutionWidth)@0 + 62 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(62)), sum(hits.ResolutionWidth)@0 + 63 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(63)), sum(hits.ResolutionWidth)@0 + 64 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(64)), sum(hits.ResolutionWidth)@0 + 65 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(65)), sum(hits.ResolutionWidth)@0 + 66 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(66)), sum(hits.ResolutionWidth)@0 + 67 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(67)), sum(hits.ResolutionWidth)@0 + 68 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(68)), sum(hits.ResolutionWidth)@0 + 69 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(69)), sum(hits.ResolutionWidth)@0 + 70 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(70)), sum(hits.ResolutionWidth)@0 + 71 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(71)), sum(hits.ResolutionWidth)@0 + 72 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(72)), sum(hits.ResolutionWidth)@0 + 73 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(73)), sum(hits.ResolutionWidth)@0 + 74 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(74)), sum(hits.ResolutionWidth)@0 + 75 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(75)), sum(hits.ResolutionWidth)@0 + 76 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(76)), sum(hits.ResolutionWidth)@0 + 77 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(77)), sum(hits.ResolutionWidth)@0 + 78 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(78)), sum(hits.ResolutionWidth)@0 + 79 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(79)), sum(hits.ResolutionWidth)@0 + 80 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(80)), sum(hits.ResolutionWidth)@0 + 81 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(81)), sum(hits.ResolutionWidth)@0 + 82 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(82)), sum(hits.ResolutionWidth)@0 + 83 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(83)), sum(hits.ResolutionWidth)@0 + 84 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(84)), sum(hits.ResolutionWidth)@0 + 85 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(85)), sum(hits.ResolutionWidth)@0 + 86 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(86)), sum(hits.ResolutionWidth)@0 + 87 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(87)), sum(hits.ResolutionWidth)@0 + 88 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(88)), sum(hits.ResolutionWidth)@0 + 89 * count(hits.ResolutionWidth)@1 as sum(hits.ResolutionWidth + Int64(89))]
+02)--AggregateExec: mode=Single, gby=[], aggr=[sum(hits.ResolutionWidth), count(hits.ResolutionWidth)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[CAST(ResolutionWidth@20 AS Int64) as __common_expr_2], file_type=parquet
+
 query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
 SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
 ----
 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750 760 770 780 790 800 810 820 830 840 850 860 870 880 890
 
+## Q30
+query TT
+EXPLAIN SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.SearchEngineID, hits.ClientIP, count(Int64(1)) AS count(*) AS c, sum(hits.IsRefresh), avg(hits.ResolutionWidth)
+03)----Aggregate: groupBy=[[hits.SearchEngineID, hits.ClientIP]], aggr=[[count(Int64(1)), sum(CAST(hits.IsRefresh AS Int64)), avg(CAST(hits.ResolutionWidth AS Float64))]]
+04)------SubqueryAlias: hits
+05)--------Projection: hits_raw.ClientIP, hits_raw.IsRefresh, hits_raw.ResolutionWidth, hits_raw.SearchEngineID
+06)----------Filter: hits_raw.SearchPhrase != Utf8View("")
+07)------------TableScan: hits_raw projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP, count(Int64(1))@2 as c, sum(hits.IsRefresh)@3 as sum(hits.IsRefresh), avg(hits.ResolutionWidth)@4 as avg(hits.ResolutionWidth)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[SearchEngineID@0 as SearchEngineID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+05)--------RepartitionExec: partitioning=Hash([SearchEngineID@0, ClientIP@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[SearchEngineID@3 as SearchEngineID, ClientIP@0 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+07)------------FilterExec: SearchPhrase@4 != , projection=[ClientIP@0, IsRefresh@1, ResolutionWidth@2, SearchEngineID@3]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[ClientIP, IsRefresh, ResolutionWidth, SearchEngineID, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query IIIIR
 SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
 ----
 
+## Q31
+query TT
+EXPLAIN SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.WatchID, hits.ClientIP, count(Int64(1)) AS count(*) AS c, sum(hits.IsRefresh), avg(hits.ResolutionWidth)
+03)----Aggregate: groupBy=[[hits.WatchID, hits.ClientIP]], aggr=[[count(Int64(1)), sum(CAST(hits.IsRefresh AS Int64)), avg(CAST(hits.ResolutionWidth AS Float64))]]
+04)------SubqueryAlias: hits
+05)--------Projection: hits_raw.WatchID, hits_raw.ClientIP, hits_raw.IsRefresh, hits_raw.ResolutionWidth
+06)----------Filter: hits_raw.SearchPhrase != Utf8View("")
+07)------------TableScan: hits_raw projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth, SearchPhrase], partial_filters=[hits_raw.SearchPhrase != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))@2 as c, sum(hits.IsRefresh)@3 as sum(hits.IsRefresh), avg(hits.ResolutionWidth)@4 as avg(hits.ResolutionWidth)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+05)--------RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+07)------------FilterExec: SearchPhrase@4 != , projection=[WatchID@0, ClientIP@1, IsRefresh@2, ResolutionWidth@3]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth, SearchPhrase], file_type=parquet, predicate=SearchPhrase@39 != , pruning_predicate=SearchPhrase_null_count@2 != row_count@3 AND (SearchPhrase_min@0 !=  OR  != SearchPhrase_max@1), required_guarantees=[SearchPhrase not in ()]
+
 query IIIIR
 SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
 ----
 
+## Q32
+query TT
+EXPLAIN SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.WatchID, hits.ClientIP, count(Int64(1)) AS count(*) AS c, sum(hits.IsRefresh), avg(hits.ResolutionWidth)
+03)----Aggregate: groupBy=[[hits.WatchID, hits.ClientIP]], aggr=[[count(Int64(1)), sum(CAST(hits.IsRefresh AS Int64)), avg(CAST(hits.ResolutionWidth AS Float64))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[WatchID@0 as WatchID, ClientIP@1 as ClientIP, count(Int64(1))@2 as c, sum(hits.IsRefresh)@3 as sum(hits.IsRefresh), avg(hits.ResolutionWidth)@4 as avg(hits.ResolutionWidth)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+05)--------RepartitionExec: partitioning=Hash([WatchID@0, ClientIP@1], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[WatchID@0 as WatchID, ClientIP@1 as ClientIP], aggr=[count(Int64(1)), sum(hits.IsRefresh), avg(hits.ResolutionWidth)]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[WatchID, ClientIP, IsRefresh, ResolutionWidth], file_type=parquet
+
 query IIIIR rowsort
 SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
 ----
@@ -218,6 +890,25 @@ SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWi
 8924809397503602651 -1216690514 1 0 0
 9110818468285196899 -1216690514 1 0 0
 
+## Q33
+query TT
+EXPLAIN SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.URL, count(Int64(1)) AS count(*) AS c
+03)----Aggregate: groupBy=[[hits.URL]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[URL]
+physical_plan
+01)SortPreservingMergeExec: [c@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[URL@0 as URL, count(Int64(1))@1 as c]
+04)------AggregateExec: mode=FinalPartitioned, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[URL], file_type=parquet
+
 query TI rowsort
 SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
 ----
@@ -228,6 +919,25 @@ http://bonprix.ru/index.ru/cinema/art/A00387,3797); ru)&bL 1
 http://holodilnik.ru/russia/05jul2013&model=0 1
 http://tours/Ekategoriya%2F&sr=http://slovareniye 1
 
+## Q34
+query TT
+EXPLAIN SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: Int64(1), hits.URL, count(Int64(1)) AS c
+03)----Aggregate: groupBy=[[hits.URL]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------TableScan: hits_raw projection=[URL]
+physical_plan
+01)SortPreservingMergeExec: [c@2 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@2 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[1 as Int64(1), URL@0 as URL, count(Int64(1))@1 as c]
+04)------AggregateExec: mode=FinalPartitioned, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[URL], file_type=parquet
+
 query ITI rowsort
 SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
 ----
@@ -238,6 +948,27 @@ SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT
 1 http://holodilnik.ru/russia/05jul2013&model=0 1
 1 http://tours/Ekategoriya%2F&sr=http://slovareniye 1
 
+## Q35
+query TT
+EXPLAIN SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
+----
+logical_plan
+01)Sort: c DESC NULLS FIRST, fetch=10
+02)--Projection: hits.ClientIP, __common_expr_1 - Int64(1) AS hits.ClientIP - Int64(1), __common_expr_1 - Int64(2) AS hits.ClientIP - Int64(2), __common_expr_1 - Int64(3) AS hits.ClientIP - Int64(3), count(Int64(1)) AS c
+03)----Projection: CAST(hits.ClientIP AS Int64) AS __common_expr_1, hits.ClientIP, count(Int64(1))
+04)------Aggregate: groupBy=[[hits.ClientIP]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------TableScan: hits_raw projection=[ClientIP]
+physical_plan
+01)SortPreservingMergeExec: [c@4 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[c@4 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[ClientIP@1 as ClientIP, __common_expr_1@0 - 1 as hits.ClientIP - Int64(1), __common_expr_1@0 - 2 as hits.ClientIP - Int64(2), __common_expr_1@0 - 3 as hits.ClientIP - Int64(3), count(Int64(1))@2 as c]
+04)------ProjectionExec: expr=[CAST(ClientIP@0 AS Int64) as __common_expr_1, ClientIP@0 as ClientIP, count(Int64(1))@1 as count(Int64(1))]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[ClientIP@0 as ClientIP], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([ClientIP@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[ClientIP@0 as ClientIP], aggr=[count(Int64(1))]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[ClientIP], file_type=parquet
+
 query IIIII rowsort
 SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
 ----
@@ -246,46 +977,256 @@ SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c
 1568366281 1568366280 1568366279 1568366278 2
 1615432634 1615432633 1615432632 1615432631 1
 
+## Q36
+query TT
+EXPLAIN SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
+----
+logical_plan
+01)Sort: pageviews DESC NULLS FIRST, fetch=10
+02)--Projection: hits.URL, count(Int64(1)) AS count(*) AS pageviews
+03)----Aggregate: groupBy=[[hits.URL]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Projection: hits_raw.URL
+06)----------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.DontCountHits = Int16(0) AND hits_raw.IsRefresh = Int16(0) AND hits_raw.URL != Utf8View("")
+07)------------TableScan: hits_raw projection=[EventDate, CounterID, URL, IsRefresh, DontCountHits], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.DontCountHits = Int16(0), hits_raw.IsRefresh = Int16(0), hits_raw.URL != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [pageviews@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[pageviews@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[URL@0 as URL, count(Int64(1))@1 as pageviews]
+04)------AggregateExec: mode=FinalPartitioned, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+07)------------FilterExec: CounterID@1 = 62 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits@4 = 0 AND IsRefresh@3 = 0 AND URL@2 != , projection=[URL@2]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventDate, CounterID, URL, IsRefresh, DontCountHits], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits@61 = 0 AND IsRefresh@15 = 0 AND URL@13 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND URL_null_count@15 != row_count@3 AND (URL_min@13 !=  OR  != URL_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URL not in ()]
+
 query TI
 SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
 ----
 
+## Q37
+query TT
+EXPLAIN SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
+----
+logical_plan
+01)Sort: pageviews DESC NULLS FIRST, fetch=10
+02)--Projection: hits.Title, count(Int64(1)) AS count(*) AS pageviews
+03)----Aggregate: groupBy=[[hits.Title]], aggr=[[count(Int64(1))]]
+04)------SubqueryAlias: hits
+05)--------Projection: hits_raw.Title
+06)----------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.DontCountHits = Int16(0) AND hits_raw.IsRefresh = Int16(0) AND hits_raw.Title != Utf8View("")
+07)------------TableScan: hits_raw projection=[Title, EventDate, CounterID, IsRefresh, DontCountHits], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.DontCountHits = Int16(0), hits_raw.IsRefresh = Int16(0), hits_raw.Title != Utf8View("")]
+physical_plan
+01)SortPreservingMergeExec: [pageviews@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[pageviews@1 DESC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[Title@0 as Title, count(Int64(1))@1 as pageviews]
+04)------AggregateExec: mode=FinalPartitioned, gby=[Title@0 as Title], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=Hash([Title@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[Title@0 as Title], aggr=[count(Int64(1))]
+07)------------FilterExec: CounterID@2 = 62 AND CAST(CAST(EventDate@1 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@1 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits@4 = 0 AND IsRefresh@3 = 0 AND Title@0 != , projection=[Title@0]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[Title, EventDate, CounterID, IsRefresh, DontCountHits], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits@61 = 0 AND IsRefresh@15 = 0 AND Title@2 != , pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND DontCountHits_null_count@9 != row_count@3 AND DontCountHits_min@7 <= 0 AND 0 <= DontCountHits_max@8 AND IsRefresh_null_count@12 != row_count@3 AND IsRefresh_min@10 <= 0 AND 0 <= IsRefresh_max@11 AND Title_null_count@15 != row_count@3 AND (Title_min@13 !=  OR  != Title_max@14), required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), Title not in ()]
+
 query TI
 SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
 ----
 
+## Q38
+query TT
+EXPLAIN SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+----
+logical_plan
+01)Limit: skip=1000, fetch=10
+02)--Sort: pageviews DESC NULLS FIRST, fetch=1010
+03)----Projection: hits.URL, count(Int64(1)) AS count(*) AS pageviews
+04)------Aggregate: groupBy=[[hits.URL]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Projection: hits_raw.URL
+07)------------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.IsRefresh = Int16(0) AND hits_raw.IsLink != Int16(0) AND hits_raw.IsDownload = Int16(0)
+08)--------------TableScan: hits_raw projection=[EventDate, CounterID, URL, IsRefresh, IsLink, IsDownload], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.IsRefresh = Int16(0), hits_raw.IsLink != Int16(0), hits_raw.IsDownload = Int16(0)]
+physical_plan
+01)GlobalLimitExec: skip=1000, fetch=10
+02)--SortPreservingMergeExec: [pageviews@1 DESC], fetch=1010
+03)----SortExec: TopK(fetch=1010), expr=[pageviews@1 DESC], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[URL@0 as URL, count(Int64(1))@1 as pageviews]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([URL@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[URL@0 as URL], aggr=[count(Int64(1))]
+08)--------------FilterExec: CounterID@1 = 62 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@3 = 0 AND IsLink@4 != 0 AND IsDownload@5 = 0, projection=[URL@2]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventDate, CounterID, URL, IsRefresh, IsLink, IsDownload], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@15 = 0 AND IsLink@52 != 0 AND IsDownload@53 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND IsLink_null_count@12 != row_count@3 AND (IsLink_min@10 != 0 OR 0 != IsLink_max@11) AND IsDownload_null_count@15 != row_count@3 AND IsDownload_min@13 <= 0 AND 0 <= IsDownload_max@14, required_guarantees=[CounterID in (62), IsDownload in (0), IsLink not in (0), IsRefresh in (0)]
+
 query TI
 SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
 ----
 
+## Q39
+query TT
+EXPLAIN SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+----
+logical_plan
+01)Limit: skip=1000, fetch=10
+02)--Sort: pageviews DESC NULLS FIRST, fetch=1010
+03)----Projection: hits.TraficSourceID, hits.SearchEngineID, hits.AdvEngineID, CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END AS src, hits.URL AS dst, count(Int64(1)) AS count(*) AS pageviews
+04)------Aggregate: groupBy=[[hits.TraficSourceID, hits.SearchEngineID, hits.AdvEngineID, CASE WHEN hits.SearchEngineID = Int16(0) AND hits.AdvEngineID = Int16(0) THEN hits.Referer ELSE Utf8View("") END AS CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END, hits.URL]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Projection: hits_raw.URL, hits_raw.Referer, hits_raw.TraficSourceID, hits_raw.SearchEngineID, hits_raw.AdvEngineID
+07)------------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.IsRefresh = Int16(0)
+08)--------------TableScan: hits_raw projection=[EventDate, CounterID, URL, Referer, IsRefresh, TraficSourceID, SearchEngineID, AdvEngineID], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.IsRefresh = Int16(0)]
+physical_plan
+01)GlobalLimitExec: skip=1000, fetch=10
+02)--SortPreservingMergeExec: [pageviews@5 DESC], fetch=1010
+03)----SortExec: TopK(fetch=1010), expr=[pageviews@5 DESC], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END@3 as src, URL@4 as dst, count(Int64(1))@5 as pageviews]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[TraficSourceID@0 as TraficSourceID, SearchEngineID@1 as SearchEngineID, AdvEngineID@2 as AdvEngineID, CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END@3 as CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END, URL@4 as URL], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([TraficSourceID@0, SearchEngineID@1, AdvEngineID@2, CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END@3, URL@4], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[TraficSourceID@2 as TraficSourceID, SearchEngineID@3 as SearchEngineID, AdvEngineID@4 as AdvEngineID, CASE WHEN SearchEngineID@3 = 0 AND AdvEngineID@4 = 0 THEN Referer@1 ELSE  END as CASE WHEN hits.SearchEngineID = Int64(0) AND hits.AdvEngineID = Int64(0) THEN hits.Referer ELSE Utf8("") END, URL@0 as URL], aggr=[count(Int64(1))]
+08)--------------FilterExec: CounterID@1 = 62 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@4 = 0, projection=[URL@2, Referer@3, TraficSourceID@5, SearchEngineID@6, AdvEngineID@7]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventDate, CounterID, URL, Referer, IsRefresh, TraficSourceID, SearchEngineID, AdvEngineID], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@15 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8, required_guarantees=[CounterID in (62), IsRefresh in (0)]
+
 query IIITTI
 SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
 ----
 
-query III
+## Q40
+query TT
+EXPLAIN SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+----
+logical_plan
+01)Limit: skip=100, fetch=10
+02)--Sort: pageviews DESC NULLS FIRST, fetch=110
+03)----Projection: hits.URLHash, hits.EventDate, count(Int64(1)) AS count(*) AS pageviews
+04)------Aggregate: groupBy=[[hits.URLHash, hits.EventDate]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Projection: hits_raw.URLHash, CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) AS EventDate
+07)------------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.IsRefresh = Int16(0) AND (hits_raw.TraficSourceID = Int16(-1) OR hits_raw.TraficSourceID = Int16(6)) AND hits_raw.RefererHash = Int64(3594120000172545465)
+08)--------------TableScan: hits_raw projection=[EventDate, CounterID, IsRefresh, TraficSourceID, RefererHash, URLHash], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.IsRefresh = Int16(0), hits_raw.TraficSourceID = Int16(-1) OR hits_raw.TraficSourceID = Int16(6), hits_raw.RefererHash = Int64(3594120000172545465)]
+physical_plan
+01)GlobalLimitExec: skip=100, fetch=10
+02)--SortPreservingMergeExec: [pageviews@2 DESC], fetch=110
+03)----SortExec: TopK(fetch=110), expr=[pageviews@2 DESC], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[URLHash@0 as URLHash, EventDate@1 as EventDate, count(Int64(1))@2 as pageviews]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([URLHash@0, EventDate@1], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[URLHash@0 as URLHash, EventDate@1 as EventDate], aggr=[count(Int64(1))]
+08)--------------ProjectionExec: expr=[URLHash@1 as URLHash, CAST(CAST(EventDate@0 AS Int32) AS Date32) as EventDate]
+09)----------------FilterExec: CounterID@1 = 62 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@2 = 0 AND (TraficSourceID@3 = -1 OR TraficSourceID@3 = 6) AND RefererHash@4 = 3594120000172545465, projection=[EventDate@0, URLHash@5]
+10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventDate, CounterID, IsRefresh, TraficSourceID, RefererHash, URLHash], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@15 = 0 AND (TraficSourceID@37 = -1 OR TraficSourceID@37 = 6) AND RefererHash@102 = 3594120000172545465, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND (TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= -1 AND -1 <= TraficSourceID_max@11 OR TraficSourceID_null_count@12 != row_count@3 AND TraficSourceID_min@10 <= 6 AND 6 <= TraficSourceID_max@11) AND RefererHash_null_count@15 != row_count@3 AND RefererHash_min@13 <= 3594120000172545465 AND 3594120000172545465 <= RefererHash_max@14, required_guarantees=[CounterID in (62), IsRefresh in (0), RefererHash in (3594120000172545465), TraficSourceID in (-1, 6)]
+
+query IDI
 SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
 ----
 
+## Q41
+query TT
+EXPLAIN SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+----
+logical_plan
+01)Limit: skip=10000, fetch=10
+02)--Sort: pageviews DESC NULLS FIRST, fetch=10010
+03)----Projection: hits.WindowClientWidth, hits.WindowClientHeight, count(Int64(1)) AS count(*) AS pageviews
+04)------Aggregate: groupBy=[[hits.WindowClientWidth, hits.WindowClientHeight]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Projection: hits_raw.WindowClientWidth, hits_raw.WindowClientHeight
+07)------------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31") AND hits_raw.IsRefresh = Int16(0) AND hits_raw.DontCountHits = Int16(0) AND hits_raw.URLHash = Int64(2868770270353813622)
+08)--------------TableScan: hits_raw projection=[EventDate, CounterID, IsRefresh, WindowClientWidth, WindowClientHeight, DontCountHits, URLHash], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-01"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-31"), hits_raw.IsRefresh = Int16(0), hits_raw.DontCountHits = Int16(0), hits_raw.URLHash = Int64(2868770270353813622)]
+physical_plan
+01)GlobalLimitExec: skip=10000, fetch=10
+02)--SortPreservingMergeExec: [pageviews@2 DESC], fetch=10010
+03)----SortExec: TopK(fetch=10010), expr=[pageviews@2 DESC], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight, count(Int64(1))@2 as pageviews]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([WindowClientWidth@0, WindowClientHeight@1], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[WindowClientWidth@0 as WindowClientWidth, WindowClientHeight@1 as WindowClientHeight], aggr=[count(Int64(1))]
+08)--------------FilterExec: CounterID@1 = 62 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@0 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@2 = 0 AND DontCountHits@5 = 0 AND URLHash@6 = 2868770270353813622, projection=[WindowClientWidth@3, WindowClientHeight@4]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventDate, CounterID, IsRefresh, WindowClientWidth, WindowClientHeight, DontCountHits, URLHash], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-01 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh@15 = 0 AND DontCountHits@61 = 0 AND URLHash@103 = 2868770270353813622, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-01 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-31 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11 AND URLHash_null_count@15 != row_count@3 AND URLHash_min@13 <= 2868770270353813622 AND 2868770270353813622 <= URLHash_max@14, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0), URLHash in (2868770270353813622)]
+
 query III
 SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
 ----
 
+## Q42
+query TT
+EXPLAIN SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
+----
+logical_plan
+01)Limit: skip=1000, fetch=10
+02)--Sort: date_trunc(Utf8("minute"), m) ASC NULLS LAST, fetch=1010
+03)----Projection: date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime)) AS m, count(Int64(1)) AS count(*) AS pageviews
+04)------Aggregate: groupBy=[[date_trunc(Utf8("minute"), to_timestamp_seconds(hits.EventTime))]], aggr=[[count(Int64(1))]]
+05)--------SubqueryAlias: hits
+06)----------Projection: hits_raw.EventTime
+07)------------Filter: hits_raw.CounterID = Int32(62) AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-14") AND CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-15") AND hits_raw.IsRefresh = Int16(0) AND hits_raw.DontCountHits = Int16(0)
+08)--------------TableScan: hits_raw projection=[EventTime, EventDate, CounterID, IsRefresh, DontCountHits], partial_filters=[hits_raw.CounterID = Int32(62), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) >= Date32("2013-07-14"), CAST(CAST(hits_raw.EventDate AS Int32) AS Date32) <= Date32("2013-07-15"), hits_raw.IsRefresh = Int16(0), hits_raw.DontCountHits = Int16(0)]
+physical_plan
+01)GlobalLimitExec: skip=1000, fetch=10
+02)--SortPreservingMergeExec: [date_trunc(minute, m@0) ASC NULLS LAST], fetch=1010
+03)----SortExec: TopK(fetch=1010), expr=[date_trunc(minute, m@0) ASC NULLS LAST], preserve_partitioning=[true]
+04)------ProjectionExec: expr=[date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime))@0 as m, count(Int64(1))@1 as pageviews]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime))@0 as date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime))], aggr=[count(Int64(1))]
+06)----------RepartitionExec: partitioning=Hash([date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime))@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[date_trunc(minute, to_timestamp_seconds(EventTime@0)) as date_trunc(Utf8("minute"),to_timestamp_seconds(hits.EventTime))], aggr=[count(Int64(1))]
+08)--------------FilterExec: CounterID@2 = 62 AND CAST(CAST(EventDate@1 AS Int32) AS Date32) >= 2013-07-14 AND CAST(CAST(EventDate@1 AS Int32) AS Date32) <= 2013-07-15 AND IsRefresh@3 = 0 AND DontCountHits@4 = 0, projection=[EventTime@0]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[EventTime, EventDate, CounterID, IsRefresh, DontCountHits], file_type=parquet, predicate=CounterID@6 = 62 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) >= 2013-07-14 AND CAST(CAST(EventDate@5 AS Int32) AS Date32) <= 2013-07-15 AND IsRefresh@15 = 0 AND DontCountHits@61 = 0, pruning_predicate=CounterID_null_count@2 != row_count@3 AND CounterID_min@0 <= 62 AND 62 <= CounterID_max@1 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_max@4 AS Int32) AS Date32) >= 2013-07-14 AND EventDate_null_count@5 != row_count@3 AND CAST(CAST(EventDate_min@6 AS Int32) AS Date32) <= 2013-07-15 AND IsRefresh_null_count@9 != row_count@3 AND IsRefresh_min@7 <= 0 AND 0 <= IsRefresh_max@8 AND DontCountHits_null_count@12 != row_count@3 AND DontCountHits_min@10 <= 0 AND 0 <= DontCountHits_max@11, required_guarantees=[CounterID in (62), DontCountHits in (0), IsRefresh in (0)]
+
 query PI
 SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
 ----
 
 # Clickbench "Extended" queries that test count distinct
 
+query TT
+EXPLAIN SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT hits.SearchPhrase), count(DISTINCT hits.MobilePhone), count(DISTINCT hits.MobilePhoneModel)]]
+02)--SubqueryAlias: hits
+03)----TableScan: hits_raw projection=[MobilePhone, MobilePhoneModel, SearchPhrase]
+physical_plan
+01)AggregateExec: mode=Single, gby=[], aggr=[count(DISTINCT hits.SearchPhrase), count(DISTINCT hits.MobilePhone), count(DISTINCT hits.MobilePhoneModel)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[MobilePhone, MobilePhoneModel, SearchPhrase], file_type=parquet
+
 query III
 SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
 ----
 1 1 1
 
+query TT
+EXPLAIN SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage")  FROM hits;
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT hits.HitColor), count(DISTINCT hits.BrowserCountry), count(DISTINCT hits.BrowserLanguage)]]
+02)--SubqueryAlias: hits
+03)----TableScan: hits_raw projection=[HitColor, BrowserLanguage, BrowserCountry]
+physical_plan
+01)ProjectionExec: expr=[1 as count(DISTINCT hits.HitColor), 1 as count(DISTINCT hits.BrowserCountry), 1 as count(DISTINCT hits.BrowserLanguage)]
+02)--PlaceholderRowExec
+
 query III
 SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage")  FROM hits;
 ----
 1 1 1
 
+query TT
+EXPLAIN SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
+----
+logical_plan
+01)Sort: count(DISTINCT hits.SocialNetwork) DESC NULLS FIRST, fetch=10
+02)--Aggregate: groupBy=[[hits.BrowserCountry]], aggr=[[count(DISTINCT hits.SocialNetwork), count(DISTINCT hits.HitColor), count(DISTINCT hits.BrowserLanguage), count(DISTINCT hits.SocialAction)]]
+03)----SubqueryAlias: hits
+04)------TableScan: hits_raw projection=[HitColor, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction]
+physical_plan
+01)SortPreservingMergeExec: [count(DISTINCT hits.SocialNetwork)@1 DESC], fetch=10
+02)--SortExec: TopK(fetch=10), expr=[count(DISTINCT hits.SocialNetwork)@1 DESC], preserve_partitioning=[true]
+03)----AggregateExec: mode=FinalPartitioned, gby=[BrowserCountry@0 as BrowserCountry], aggr=[count(DISTINCT hits.SocialNetwork), count(DISTINCT hits.HitColor), count(DISTINCT hits.BrowserLanguage), count(DISTINCT hits.SocialAction)]
+04)------RepartitionExec: partitioning=Hash([BrowserCountry@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[BrowserCountry@2 as BrowserCountry], aggr=[count(DISTINCT hits.SocialNetwork), count(DISTINCT hits.HitColor), count(DISTINCT hits.BrowserLanguage), count(DISTINCT hits.SocialAction)]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/clickbench_hits_10.parquet]]}, projection=[HitColor, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction], file_type=parquet
+
 query TIIII
 SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
 ----
@@ -293,4 +1234,7 @@ SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitCo
 
 
 statement ok
-drop table hits;
+drop view hits;
+
+statement ok
+drop table hits_raw;
diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt
index 9740bade5e27b..9e5b71b871299 100644
--- a/datafusion/sqllogictest/test_files/coalesce.slt
+++ b/datafusion/sqllogictest/test_files/coalesce.slt
@@ -199,14 +199,14 @@ select
   coalesce(array[1, 2], array[3, 4]),
   arrow_typeof(coalesce(array[1, 2], array[3, 4]));
 ----
-[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+[1, 2] List(Int64)
 
 query ?T
 select
   coalesce(null, array[3, 4]),
   arrow_typeof(coalesce(array[1, 2], array[3, 4]));
 ----
-[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+[3, 4] List(Int64)
 
 # coalesce with array
 query ?T
@@ -214,7 +214,7 @@ select
   coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]),
   arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]));
 ----
-[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+[1, 2] List(Int64)
 
 # test dict(int32, utf8)
 statement ok
diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt
index 5eeb05e814ace..402ac8e8512bf 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -200,6 +200,17 @@ physical_plan
 01)DataSinkExec: sink=ParquetSink(file_groups=[])
 02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
+# Verify ParquetSink exposes rows_written, bytes_written, and elapsed_compute metrics
+# Use a query with Sort and Projection to verify metrics across all operators
+query TT
+EXPLAIN ANALYZE COPY (SELECT col1, upper(col2) AS col2_upper FROM source_table ORDER BY col1) TO 'test_files/scratch/copy/table_metrics/' STORED AS PARQUET;
+----
+Plan with Metrics
+01)DataSinkExec: sink=ParquetSink(file_groups=[]), metrics=[elapsed_compute=<slt:ignore>, bytes_written=<slt:ignore>, rows_written=2]
+02)--SortExec: expr=[col1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=2, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=<slt:ignore>, spill_count=0, spilled_bytes=0.0 B, spilled_rows=0]
+03)----ProjectionExec: expr=[col1@0 as col1, upper(col2@1) as col2_upper], metrics=[output_rows=2, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=1, expr_0_eval_time=<slt:ignore>, expr_1_eval_time=<slt:ignore>]
+04)------DataSourceExec: partitions=1, partition_sizes=[1], metrics=[]
+
 # Copy to directory as partitioned files with keep_partition_by_columns enabled
 query I
 COPY (values ('1', 'a'), ('2', 'b'), ('3', 'c')) TO 'test_files/scratch/copy/partitioned_table4/' STORED AS parquet PARTITIONED BY (column1)
@@ -332,7 +343,6 @@ OPTIONS (
 'format.dictionary_enabled' false,
 'format.statistics_enabled' page,
 'format.statistics_enabled::col2' none,
-'format.max_statistics_size' 123,
 'format.bloom_filter_fpp' 0.001,
 'format.bloom_filter_ndv' 100,
 'format.metadata::key' 'value'
@@ -427,6 +437,21 @@ select * from validate_parquet_single;
 1 Foo
 2 Bar
 
+# copy 0 rows to a single parquet file output
+query I
+COPY (SELECT 1 AS id WHERE FALSE) TO 'test_files/scratch/copy/table_no_rows.parquet';
+----
+0
+
+statement ok
+CREATE EXTERNAL TABLE validate_parquet_single_no_rows STORED AS PARQUET LOCATION 'test_files/scratch/copy/table_no_rows.parquet';
+
+# validate the parquet file contains 0 rows.
+query I
+SELECT count(id) FROM validate_parquet_single_no_rows;
+----
+0
+
 # copy from table to folder of compressed json files
 query I
 COPY source_table  to 'test_files/scratch/copy/table_json_gz' STORED AS JSON OPTIONS ('format.compression' gzip);
diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt
index d38d3490fed47..a1c0e6303a765 100644
--- a/datafusion/sqllogictest/test_files/count_star_rule.slt
+++ b/datafusion/sqllogictest/test_files/count_star_rule.slt
@@ -1,4 +1,4 @@
-# Licensed to the Apache Software Foundation (ASF) under one
+# Licensed to the Apache Software Foundation (ASF) under onecount_star
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
@@ -34,7 +34,7 @@ logical_plan
 01)Projection: count(Int64(1)) AS count()
 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
 03)----SubqueryAlias: t
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[1 as count()]
 02)--PlaceholderRowExec
@@ -49,11 +49,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[a@0 as a, count(Int64(1))@1 as count()]
 02)--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1
+04)------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 EXPLAIN SELECT t1.a, COUNT() AS cnt FROM t1 GROUP BY t1.a HAVING COUNT() > 0;
@@ -65,14 +63,11 @@ logical_plan
 04)------TableScan: t1 projection=[a]
 physical_plan
 01)ProjectionExec: expr=[a@0 as a, count(Int64(1))@1 as cnt]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: count(Int64(1))@1 > 0
-04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
-07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-08)--------------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--FilterExec: count(Int64(1))@1 > 0
+03)----AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]
+04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query II
 SELECT t1.a, COUNT() AS cnt FROM t1 GROUP BY t1.a HAVING COUNT() > 1;
@@ -88,7 +83,7 @@ logical_plan
 03)----TableScan: t1 projection=[a]
 physical_plan
 01)ProjectionExec: expr=[a@0 as a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a]
-02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
 04)------DataSourceExec: partitions=1, partition_sizes=[1]
 
diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt
index 03cb5edb5fcce..0b15a7f8ec5dd 100644
--- a/datafusion/sqllogictest/test_files/create_external_table.slt
+++ b/datafusion/sqllogictest/test_files/create_external_table.slt
@@ -264,7 +264,7 @@ logical_plan
 02)--TableScan: t projection=[id]
 physical_plan
 01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id], file_type=parquet, reverse_row_groups=true
 
 statement ok
 DROP TABLE t;
@@ -297,3 +297,9 @@ CREATE EXTERNAL TABLE staging.foo STORED AS parquet LOCATION '../../parquet-test
 # Create external table with qualified name, but no schema should error
 statement error DataFusion error: Error during planning: failed to resolve schema: release
 CREATE EXTERNAL TABLE release.bar STORED AS parquet LOCATION '../../parquet-testing/data/alltypes_plain.parquet';
+
+# Cannot create external table alongside `if_not_exists` and `or_replace`
+statement error DataFusion error: SQL error: ParserError\("'IF NOT EXISTS' cannot coexist with 'REPLACE'"\)
+CREATE OR REPLACE EXTERNAL TABLE IF NOT EXISTS t_conflict(c1 int)
+STORED AS CSV
+LOCATION 'foo.csv';
diff --git a/datafusion/sqllogictest/test_files/create_function.slt b/datafusion/sqllogictest/test_files/create_function.slt
index 4f0c53c36ca1a..4e82c0866ee23 100644
--- a/datafusion/sqllogictest/test_files/create_function.slt
+++ b/datafusion/sqllogictest/test_files/create_function.slt
@@ -21,11 +21,6 @@
 ## Note that DataFusion provides a pluggable system for creating functions
 ## but has no built in support for doing so.
 
-# Use PostgresSQL dialect (until we upgrade to sqlparser 0.44, where CREATE FUNCTION)
-# is supported in the Generic dialect (the default)
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 # Create function will fail unless a user supplied function factory is supplied
 statement error DataFusion error: Invalid or Unsupported Configuration: Function factory has not been configured
 CREATE FUNCTION foo (DOUBLE) RETURNS DOUBLE RETURN $1 + $2;
diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt
index 32320a06f4fb0..65f694eb370fb 100644
--- a/datafusion/sqllogictest/test_files/cte.slt
+++ b/datafusion/sqllogictest/test_files/cte.slt
@@ -33,7 +33,7 @@ EXPLAIN WITH "NUMBERS" AS (SELECT 1 as a, 2 as b, 3 as c) SELECT "NUMBERS".* FRO
 logical_plan
 01)SubqueryAlias: NUMBERS
 02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c]
 02)--PlaceholderRowExec
@@ -42,6 +42,63 @@ physical_plan
 statement error DataFusion error: Error during planning: WITH query name "a" specified more than once
 WITH a AS (SELECT 1), a AS (SELECT 2) SELECT * FROM a;
 
+statement ok
+CREATE TABLE orders AS VALUES (1), (2);
+
+##########
+## CTE Reference Resolution
+##########
+
+# These tests exercise CTE reference resolution with and without identifier
+# normalization. The session is configured with a strict catalog/schema provider
+# (see `datafusion/sqllogictest/src/test_context.rs`) that only provides the
+# `orders` table and panics on any unexpected table lookup.
+#
+# This makes it observable if DataFusion incorrectly treats a CTE reference as a
+# catalog lookup.
+#
+# Refs: https://github.com/apache/datafusion/issues/18932
+#
+# NOTE: This test relies on a strict catalog/schema provider registered in
+# `datafusion/sqllogictest/src/test_context.rs` that provides only the `orders`
+# table and panics on unexpected lookups.
+
+statement ok
+set datafusion.sql_parser.enable_ident_normalization = true;
+
+query I
+with barbaz as (select * from orders) select * from "barbaz";
+----
+1
+2
+
+query I
+with BarBaz as (select * from orders) select * from "barbaz";
+----
+1
+2
+
+query I
+with barbaz as (select * from orders) select * from barbaz;
+----
+1
+2
+
+statement ok
+set datafusion.sql_parser.enable_ident_normalization = false;
+
+query I
+with barbaz as (select * from orders) select * from "barbaz";
+----
+1
+2
+
+query I
+with barbaz as (select * from orders) select * from barbaz;
+----
+1
+2
+
 # Test disabling recursive CTE
 statement ok
 set datafusion.execution.enable_recursive_ctes = false;
@@ -58,18 +115,6 @@ WITH RECURSIVE nodes AS (
 statement ok
 set datafusion.execution.enable_recursive_ctes = true;
 
-
-# DISTINCT UNION is not supported
-query error DataFusion error: This feature is not implemented: Recursive queries with a distinct 'UNION' \(in which the previous iteration's results will be de\-duplicated\) is not supported
-WITH RECURSIVE nodes AS (
-    SELECT 1 as id
-    UNION
-    SELECT id + 1 as id
-    FROM nodes
-    WHERE id < 3
-) SELECT * FROM nodes
-
-
 # trivial recursive CTE works
 query I rowsort
 WITH RECURSIVE nodes AS (
@@ -107,20 +152,54 @@ logical_plan
 01)SubqueryAlias: nodes
 02)--RecursiveQuery: is_distinct=false
 03)----Projection: Int64(1) AS id
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 05)----Projection: nodes.id + Int64(1) AS id
 06)------Filter: nodes.id < Int64(10)
-07)--------TableScan: nodes
+07)--------TableScan: nodes projection=[id]
 physical_plan
 01)RecursiveQueryExec: name=nodes, is_distinct=false
 02)--ProjectionExec: expr=[1 as id]
 03)----PlaceholderRowExec
 04)--CoalescePartitionsExec
 05)----ProjectionExec: expr=[id@0 + 1 as id]
-06)------CoalesceBatchesExec: target_batch_size=8192
-07)--------FilterExec: id@0 < 10
-08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)------------WorkTableExec: name=nodes
+06)------FilterExec: id@0 < 10
+07)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)----------WorkTableExec: name=nodes
+
+# simple deduplicating recursive CTE works
+query I
+WITH RECURSIVE nodes AS (
+    SELECT id from (VALUES (1), (2)) nodes(id)
+    UNION
+    SELECT id + 1 as id
+    FROM nodes
+    WHERE id < 4
+)
+SELECT * FROM nodes
+----
+1
+2
+3
+4
+
+# deduplicating recursive CTE with two variables works
+query II
+WITH RECURSIVE ranges AS (
+    SELECT min, max from (VALUES (1, 1), (2, 2)) ranges(min, max)
+    UNION
+    SELECT min, max + 1 as max
+    FROM ranges
+    WHERE max < 4
+)
+SELECT * FROM ranges
+----
+1 1
+2 2
+1 2
+2 3
+1 3
+2 4
+1 4
 
 # setup
 statement ok
@@ -152,21 +231,19 @@ logical_plan
 01)Sort: balances.time ASC NULLS LAST, balances.name ASC NULLS LAST, balances.account_balance ASC NULLS LAST
 02)--SubqueryAlias: balances
 03)----RecursiveQuery: is_distinct=false
-04)------Projection: balance.time, balance.name, balance.account_balance
-05)--------TableScan: balance
-06)------Projection: balances.time + Int64(1) AS time, balances.name, balances.account_balance + Int64(10) AS account_balance
-07)--------Filter: balances.time < Int64(10)
-08)----------TableScan: balances
+04)------TableScan: balance projection=[time, name, account_balance]
+05)------Projection: balances.time + Int64(1) AS time, balances.name, balances.account_balance + Int64(10) AS account_balance
+06)--------Filter: balances.time < Int64(10)
+07)----------TableScan: balances projection=[time, name, account_balance]
 physical_plan
 01)SortExec: expr=[time@0 ASC NULLS LAST, name@1 ASC NULLS LAST, account_balance@2 ASC NULLS LAST], preserve_partitioning=[false]
 02)--RecursiveQueryExec: name=balances, is_distinct=false
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/recursive_cte/balance.csv]]}, projection=[time, name, account_balance], file_type=csv, has_header=true
 04)----CoalescePartitionsExec
 05)------ProjectionExec: expr=[time@0 + 1 as time, name@1 as name, account_balance@2 + 10 as account_balance]
-06)--------CoalesceBatchesExec: target_batch_size=2
-07)----------FilterExec: time@0 < 10
-08)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)--------------WorkTableExec: name=balances
+06)--------FilterExec: time@0 < 10
+07)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)------------WorkTableExec: name=balances
 
 # recursive CTE with static term derived from table works
 # note that this is run with batch size set to 2. This should produce multiple batches per iteration since the input
@@ -646,21 +723,51 @@ ORDER BY
 3 1400 1
 1 2700 2
 
-#expect error from recursive CTE with nested recursive terms
-query error DataFusion error: This feature is not implemented: Recursive queries cannot be nested
+#nested recursive ctes
+query I
 WITH RECURSIVE outer_cte AS (
     SELECT 1 as a
     UNION ALL (
-        WITH  RECURSIVE nested_cte AS (
+        WITH RECURSIVE nested_cte AS (
            SELECT 1 as a
            UNION ALL
-           SELECT a+2 as a
-	   FROM nested_cte where a < 3
-         )
-    SELECT outer_cte.a +2
-    FROM outer_cte JOIN nested_cte USING(a)
-    WHERE nested_cte.a < 4
-   )
+           SELECT a + 2 as a
+           FROM nested_cte where a < 3
+        )
+        SELECT outer_cte.a + 2 as a
+        FROM outer_cte JOIN nested_cte USING(a)
+        WHERE nested_cte.a < 4
+    )
+)
+SELECT a FROM outer_cte;
+----
+1
+3
+5
+
+# Check that CTE name shadowing is returning an error
+query error DataFusion error: Error during planning: WITH query name "outer_cte" specified more than once
+WITH RECURSIVE outer_cte AS (
+    SELECT 1 as a
+    UNION ALL (
+        WITH RECURSIVE nested_cte AS (
+           SELECT 1 as a
+           UNION ALL (
+               WITH RECURSIVE outer_cte AS (
+                    SELECT 1 as a
+                    UNION ALL
+                    SELECT a + 2 as a
+                    FROM outer_cte where a < 3
+               )
+               SELECT nested_cte.a + outer_cte.a as a
+               FROM nested_cte JOIN outer_cte USING(a)
+               WHERE outer_cte_cte.a < 8
+           )
+        )
+        SELECT outer_cte.a + nested_cte.a as a
+        FROM outer_cte JOIN nested_cte USING(a)
+        WHERE nested_cte.a < 8
+    )
 )
 SELECT a FROM outer_cte;
 
@@ -720,14 +827,14 @@ logical_plan
 01)SubqueryAlias: recursive_cte
 02)--RecursiveQuery: is_distinct=false
 03)----Projection: Int64(1) AS val
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 05)----Projection: Int64(2) AS val
 06)------Cross Join: 
 07)--------Filter: recursive_cte.val < Int64(2)
 08)----------TableScan: recursive_cte
 09)--------SubqueryAlias: sub_cte
 10)----------Projection: Int64(2) AS val
-11)------------EmptyRelation
+11)------------EmptyRelation: rows=1
 physical_plan
 01)RecursiveQueryExec: name=recursive_cte, is_distinct=false
 02)--ProjectionExec: expr=[1 as val]
@@ -735,12 +842,11 @@ physical_plan
 04)--ProjectionExec: expr=[2 as val]
 05)----CrossJoinExec
 06)------CoalescePartitionsExec
-07)--------CoalesceBatchesExec: target_batch_size=8182
-08)----------FilterExec: val@0 < 2
-09)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)--------------WorkTableExec: name=recursive_cte
-11)------ProjectionExec: expr=[2 as val]
-12)--------PlaceholderRowExec
+07)--------FilterExec: val@0 < 2
+08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)------------WorkTableExec: name=recursive_cte
+10)------ProjectionExec: expr=[2 as val]
+11)--------PlaceholderRowExec
 
 # Test issue: https://github.com/apache/datafusion/issues/9794
 # Non-recursive term and recursive term have different types
@@ -765,7 +871,7 @@ WITH RECURSIVE my_cte AS (
 
 # Test issue: https://github.com/apache/datafusion/issues/9794
 # Non-recursive term and recursive term have different types, and cannot be casted
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'abc' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'abc' to value of Int64 type
 WITH RECURSIVE my_cte AS (
     SELECT 1 AS a
     UNION ALL
@@ -869,7 +975,7 @@ explain with numbers(a,b,c) as (select 1 as x, 2 as y, 3 as z) select * from num
 logical_plan
 01)SubqueryAlias: numbers
 02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c]
 02)--PlaceholderRowExec
@@ -880,7 +986,7 @@ explain with numbers(a,b,c) as (select 1,2,3) select * from numbers;
 logical_plan
 01)SubqueryAlias: numbers
 02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c]
 02)--PlaceholderRowExec
@@ -891,7 +997,7 @@ explain with numbers as (select 1 as a, 2 as b, 3 as c) select * from numbers;
 logical_plan
 01)SubqueryAlias: numbers
 02)--Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[1 as a, 2 as b, 3 as c]
 02)--PlaceholderRowExec
@@ -909,10 +1015,9 @@ logical_plan
 04)----SubqueryAlias: cte
 05)------TableScan: person projection=[id]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8182
-02)--HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(id@0, id@0)]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(id@0, id@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 statement count 0
 drop table person;
@@ -931,7 +1036,7 @@ logical_plan
 02)--TableScan: j1 projection=[a]
 03)--SubqueryAlias: j2
 04)----Projection: Int64(1)
-05)------EmptyRelation
+05)------EmptyRelation: rows=1
 physical_plan
 01)CrossJoinExec
 02)--DataSourceExec: partitions=1, partition_sizes=[0]
@@ -948,53 +1053,162 @@ query TT
 explain WITH RECURSIVE numbers AS (
   select 1 as n
   UNION ALL
-  select n + 1 FROM numbers WHERE N < 10
+  select n + 1 FROM numbers WHERE n < 10
 ) select * from numbers;
 ----
 logical_plan
 01)SubqueryAlias: numbers
 02)--RecursiveQuery: is_distinct=false
 03)----Projection: Int64(1) AS n
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 05)----Projection: numbers.n + Int64(1)
 06)------Filter: numbers.n < Int64(10)
-07)--------TableScan: numbers
+07)--------TableScan: numbers projection=[n]
 physical_plan
 01)RecursiveQueryExec: name=numbers, is_distinct=false
 02)--ProjectionExec: expr=[1 as n]
 03)----PlaceholderRowExec
 04)--CoalescePartitionsExec
 05)----ProjectionExec: expr=[n@0 + 1 as numbers.n + Int64(1)]
-06)------CoalesceBatchesExec: target_batch_size=8182
-07)--------FilterExec: n@0 < 10
-08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)------------WorkTableExec: name=numbers
+06)------FilterExec: n@0 < 10
+07)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)----------WorkTableExec: name=numbers
 
 query TT
 explain WITH RECURSIVE numbers AS (
   select 1 as n
   UNION ALL
-  select n + 1 FROM numbers WHERE N < 10
+  select n + 1 FROM numbers WHERE n < 10
 ) select * from numbers;
 ----
 logical_plan
 01)SubqueryAlias: numbers
 02)--RecursiveQuery: is_distinct=false
 03)----Projection: Int64(1) AS n
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 05)----Projection: numbers.n + Int64(1)
 06)------Filter: numbers.n < Int64(10)
-07)--------TableScan: numbers
+07)--------TableScan: numbers projection=[n]
 physical_plan
 01)RecursiveQueryExec: name=numbers, is_distinct=false
 02)--ProjectionExec: expr=[1 as n]
 03)----PlaceholderRowExec
 04)--CoalescePartitionsExec
 05)----ProjectionExec: expr=[n@0 + 1 as numbers.n + Int64(1)]
-06)------CoalesceBatchesExec: target_batch_size=8182
-07)--------FilterExec: n@0 < 10
-08)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)------------WorkTableExec: name=numbers
+06)------FilterExec: n@0 < 10
+07)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)----------WorkTableExec: name=numbers
+
+# Test for issue #16998: SortExec shares DynamicFilterPhysicalExpr across multiple executions
+query II
+with recursive r as (
+  select 0 as k, 0 as v
+  union all
+  (
+    select *
+    from r
+    order by v
+    limit 1
+  )
+)
+select *
+from r
+limit 5;
+----
+0 0
+0 0
+0 0
+0 0
+0 0
+
+query TT
+explain
+with recursive r as (
+  select 0 as k, 0 as v
+  union all
+  (
+    select *
+    from r
+    order by v
+    limit 1
+  )
+)
+select *
+from r
+limit 5;
+----
+logical_plan
+01)SubqueryAlias: r
+02)--Limit: skip=0, fetch=5
+03)----RecursiveQuery: is_distinct=false
+04)------Projection: Int64(0) AS k, Int64(0) AS v
+05)--------EmptyRelation: rows=1
+06)------Sort: r.v ASC NULLS LAST, fetch=1
+07)--------TableScan: r projection=[k, v]
+physical_plan
+01)GlobalLimitExec: skip=0, fetch=5
+02)--RecursiveQueryExec: name=r, is_distinct=false
+03)----ProjectionExec: expr=[0 as k, 0 as v]
+04)------PlaceholderRowExec
+05)----SortExec: TopK(fetch=1), expr=[v@1 ASC NULLS LAST], preserve_partitioning=[false]
+06)------WorkTableExec: name=r
+
+# setup
+statement ok
+CREATE EXTERNAL TABLE closure STORED as CSV LOCATION '../core/tests/data/recursive_cte/closure.csv' OPTIONS ('format.has_header' 'true');
+
+# transitive closure with loop
+query II
+WITH RECURSIVE trans AS (
+    SELECT * FROM closure
+    UNION
+    SELECT l.start, r.end
+    FROM trans as l, closure AS r
+    WHERE l.end = r.start
+) SELECT * FROM trans ORDER BY start, end
+----
+1 1
+1 2
+1 3
+1 4
+2 1
+2 2
+2 3
+2 4
+4 1
+4 2
+4 3
+4 4
+
+query TT
+EXPLAIN WITH RECURSIVE trans AS (
+    SELECT * FROM closure
+    UNION
+    SELECT l.start, r.end
+    FROM trans as l, closure AS r
+    WHERE l.end = r.start
+) SELECT * FROM trans
+----
+logical_plan
+01)SubqueryAlias: trans
+02)--RecursiveQuery: is_distinct=true
+03)----Projection: closure.start, closure.end
+04)------TableScan: closure
+05)----Projection: l.start, r.end
+06)------Inner Join: l.end = r.start
+07)--------SubqueryAlias: l
+08)----------TableScan: trans
+09)--------SubqueryAlias: r
+10)----------TableScan: closure
+physical_plan
+01)RecursiveQueryExec: name=trans, is_distinct=true
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/recursive_cte/closure.csv]]}, projection=[start, end], file_type=csv, has_header=true
+03)--CoalescePartitionsExec
+04)----HashJoinExec: mode=Partitioned, join_type=Inner, on=[(end@1, start@0)], projection=[start@0, end@3]
+05)------RepartitionExec: partitioning=Hash([end@1], 4), input_partitions=1
+06)--------WorkTableExec: name=trans
+07)------RepartitionExec: partitioning=Hash([start@0], 4), input_partitions=1
+08)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/recursive_cte/closure.csv]]}, projection=[start, end], file_type=csv, has_header=true
 
 statement count 0
 set datafusion.execution.enable_recursive_ctes = false;
@@ -1003,5 +1217,5 @@ query error DataFusion error: This feature is not implemented: Recursive CTEs ar
 explain WITH RECURSIVE numbers AS (
   select 1 as n
   UNION ALL
-  select n + 1 FROM numbers WHERE N < 10
+  select n + 1 FROM numbers WHERE n < 10
 ) select * from numbers;
diff --git a/datafusion/sqllogictest/test_files/date_bin_errors.slt b/datafusion/sqllogictest/test_files/date_bin_errors.slt
new file mode 100644
index 0000000000000..b6cda471d7afa
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/date_bin_errors.slt
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DATE_BIN error handling with out-of-range values
+
+# Test case from issue #20219 - should return NULL instead of panicking
+query P
+select date_bin(interval '1637426858 months', to_timestamp_millis(1040292460), timestamp '1984-01-07 00:00:00');
+----
+NULL
+
+# Negative timestamp with month interval - should return NULL instead of panicking  
+query P
+select date_bin(interval '1 month', to_timestamp_millis(-1040292460), timestamp '1984-01-07 00:00:00');
+----
+NULL
+
+# Large stride causing overflow - should return NULL
+query P
+select date_bin(
+  interval '1637426858 months',
+  timestamp '1969-12-31 00:00:00',
+  timestamp '1984-01-07 00:00:00'
+);
+----
+NULL
+
+# Another large stride test
+query P
+select date_bin(
+  interval '1637426858 months',
+  to_timestamp_millis(-1040292000),
+  timestamp '1984-01-07 00:00:00'
+) as b;
+----
+NULL
+
+# Test with 1900-01-01 timestamp
+query P
+select date_bin(
+  interval '1637426858 months',
+  to_timestamp_millis(-2208988800000),
+  timestamp '1984-01-07 00:00:00'
+) as b;
+----
+NULL
\ No newline at end of file
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_date_date.slt b/datafusion/sqllogictest/test_files/datetime/arith_date_date.slt
new file mode 100644
index 0000000000000..8eb5cc176f365
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_date_date.slt
@@ -0,0 +1,15 @@
+# date - date → integer
+# Subtract dates, producing the number of days elapsed
+# date '2001-10-01' - date '2001-09-28' → 3
+# This aligns with PostgreSQL, DuckDB, and MySQL behavior
+# Resolved by: https://github.com/apache/datafusion/issues/19528
+
+query I
+SELECT '2001-10-01'::date - '2001-09-28'::date
+----
+3
+
+query T
+SELECT arrow_typeof('2001-10-01'::date - '2001-09-28'::date)
+----
+Int64
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_date_integer.slt b/datafusion/sqllogictest/test_files/datetime/arith_date_integer.slt
new file mode 100644
index 0000000000000..512c507d9478c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_date_integer.slt
@@ -0,0 +1,89 @@
+# date + integer → date
+# Add a number of days to a date
+# date '2001-09-28' + 7 → 2001-10-05
+
+query D
+SELECT '2001-09-28'::date + 7
+----
+2001-10-05
+
+query D
+SELECT 7 + '2001-09-28'::date
+----
+2001-10-05
+
+query T
+SELECT arrow_typeof('2001-09-28'::date + 7)
+----
+Date32
+
+query D
+SELECT arrow_cast('2001-09-28', 'Date64') + 7
+----
+2001-10-05T00:00:00
+
+query D
+SELECT 7::smallint + '2001-09-28'::date
+----
+2001-10-05
+
+query D
+SELECT 7::smallint unsigned + '2001-09-28'::date
+----
+2001-10-05
+
+query D
+SELECT 7::int unsigned + '2001-09-28'::date
+----
+2001-10-05
+
+query D
+SELECT 7::bigint + '2001-09-28'::date
+----
+2001-10-05
+
+query D
+SELECT 7::bigint unsigned + '2001-09-28'::date
+----
+2001-10-05
+
+query D
+SELECT 7 + arrow_cast('2001-09-28', 'Date64')
+----
+2001-10-05T00:00:00
+
+query T
+SELECT arrow_typeof(arrow_cast('2001-09-28', 'Date64') + 7)
+----
+Date64
+
+# date - integer → date
+# Subtract a number of days from a date
+# date '2001-10-01' - 7 → 2001-09-24
+
+query D
+SELECT '2001-10-01'::date - 7
+----
+2001-09-24
+
+query D
+SELECT arrow_cast('2001-10-01', 'Date64') - 7
+----
+2001-09-24T00:00:00
+
+query T
+SELECT arrow_typeof('2001-10-01'::date - 7)
+----
+Date32
+
+query error Invalid arithmetic operation
+SELECT 7 - '2001-10-01'::date
+
+query error Invalid date arithmetic operation
+SELECT '2001-10-01'::date * 7
+
+query error Invalid date arithmetic operation
+SELECT '2001-10-01'::date / 7
+
+query error Invalid date arithmetic operation
+SELECT '2001-10-01'::date % 7
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt
new file mode 100644
index 0000000000000..ad2e7ed496f79
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt
@@ -0,0 +1,37 @@
+# postgresql behavior
+#
+# date + interval → timestamp
+# Add an interval to a date
+# date '2001-09-28' + interval '1 hour' → 2001-09-28 01:00:00
+#
+# note that while the above reflects what postgresql does
+# in the case of datafusion/arrow that is not the case. The
+# result will be date32/date64
+#
+# Tracking issue: https://github.com/apache/datafusion/issues/19527
+
+query D
+SELECT '2001-09-28'::date + interval '1 hour'
+----
+2001-09-28
+
+query T
+SELECT arrow_typeof('2001-09-28'::date + interval '1 hour')
+----
+Date32
+
+# postgresql behavior
+#
+# date - interval → timestamp
+# Subtract an interval from a date
+# date '2001-09-28' - interval '1 hour' → 2001-09-27 23:00:00
+
+query D
+SELECT '2001-09-28'::date - interval '25 hour'
+----
+2001-09-27
+
+query T
+SELECT arrow_typeof('2001-09-28'::date - interval '25 hour')
+----
+Date32
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_date_time.slt b/datafusion/sqllogictest/test_files/datetime/arith_date_time.slt
new file mode 100644
index 0000000000000..8e85c8f90580e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_date_time.slt
@@ -0,0 +1,115 @@
+# date + time → timestamp
+# Add a time-of-day to a date
+# date '2001-09-28' + time '03:00' → 2001-09-28 03:00:00
+
+query P
+SELECT '2001-09-28'::date + '03:00'::time
+----
+2001-09-28T03:00:00
+
+query P
+SELECT '03:00'::time + '2001-09-28'::date
+----
+2001-09-28T03:00:00
+
+query T
+SELECT arrow_typeof('2001-09-28'::date + '03:00'::time)
+----
+Timestamp(ns)
+
+query P
+SELECT '2001-09-28'::date - '03:00'::time
+----
+2001-09-27T21:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date32') + arrow_cast('03:00', 'Time32(Second)')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date32') + arrow_cast('03:00', 'Time32(Millisecond)')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date32') + arrow_cast('03:00', 'Time64(Microsecond)')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date32') + arrow_cast('03:00', 'Time64(Nanosecond)')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date64') + arrow_cast('03:00', 'Time32(Second)')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date64') + arrow_cast('03:00:00.123', 'Time32(Millisecond)')
+----
+2001-09-28T03:00:00.123
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date64') + arrow_cast('03:00:00.123456', 'Time64(Microsecond)')
+----
+2001-09-28T03:00:00.123456
+
+query P
+SELECT arrow_cast('2001-09-28', 'Date64') + arrow_cast('03:00:00.001234567', 'Time64(Nanosecond)')
+----
+2001-09-28T03:00:00.001234567
+
+query P
+SELECT arrow_cast('03:00', 'Time32(Second)') + arrow_cast('2001-09-28', 'Date32')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('03:00', 'Time32(Millisecond)') + arrow_cast('2001-09-28', 'Date32')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('03:00', 'Time64(Microsecond)') + arrow_cast('2001-09-28', 'Date32')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('03:00', 'Time64(Nanosecond)') + arrow_cast('2001-09-28', 'Date32')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('03:00', 'Time32(Second)') + arrow_cast('2001-09-28', 'Date64')
+----
+2001-09-28T03:00:00
+
+query P
+SELECT arrow_cast('03:00:00.123', 'Time32(Millisecond)') + arrow_cast('2001-09-28', 'Date64')
+----
+2001-09-28T03:00:00.123
+
+query P
+SELECT arrow_cast('03:00:00.123456', 'Time64(Microsecond)') + arrow_cast('2001-09-28', 'Date64')
+----
+2001-09-28T03:00:00.123456
+
+query P
+SELECT arrow_cast('03:00:00.001234567', 'Time64(Nanosecond)') + arrow_cast('2001-09-28', 'Date64')
+----
+2001-09-28T03:00:00.001234567
+
+query error Invalid arithmetic operation
+SELECT '03:00'::time - '2001-09-28'::date
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28'::date * '03:00'::time
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28'::date / '03:00'::time
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28'::date % '03:00'::time
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_interval_double.slt b/datafusion/sqllogictest/test_files/datetime/arith_interval_double.slt
new file mode 100644
index 0000000000000..d48d2b59c8bee
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_interval_double.slt
@@ -0,0 +1,41 @@
+# interval * double precision → interval
+# Multiply an interval by a scalar
+# interval '1 second' * 900 → 00:15:00
+# interval '1 day' * 21 → 21 days
+# interval '1 hour' * 3.5 → 03:30:00
+
+# these currently do not work - https://github.com/apache/arrow-rs/issues/9030
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT interval '1 second' * 900
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT 900 * interval '1 second'
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT interval '1 day' * 21
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT interval '1 hour' * 3.5
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT 3.5 * interval '1 hour'
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) \* Interval\(MonthDayNano\)
+SELECT arrow_typeof(interval '1 second' * 900)
+
+# interval / double precision → interval
+# Divide an interval by a scalar
+# interval '1 hour' / 1.5 → 00:40:00
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) / Interval\(MonthDayNano\)
+SELECT interval '1 hour' / 1.5
+
+
+query error Invalid interval arithmetic operation: Interval\(MonthDayNano\) / Interval\(MonthDayNano\)
+SELECT arrow_typeof(interval '1 hour' / 1.5)
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_interval_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_interval_interval.slt
new file mode 100644
index 0000000000000..d8a701356b6e3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_interval_interval.slt
@@ -0,0 +1,27 @@
+# interval + interval → interval
+# Add intervals
+# interval '1 day' + interval '1 hour' → 1 day 01:00:00
+
+query ?
+SELECT interval '1 day' + interval '1 hour'
+----
+1 days 1 hours
+
+query T
+SELECT arrow_typeof(interval '1 day' + interval '1 hour')
+----
+Interval(MonthDayNano)
+
+# interval - interval → interval
+# Subtract intervals
+# interval '1 day' - interval '1 hour' → 1 day -01:00:00
+
+query ?
+SELECT interval '1 day' - interval '1 hour'
+----
+1 days -1 hours
+
+query T
+SELECT arrow_typeof(interval '1 day' - interval '1 hour')
+----
+Interval(MonthDayNano)
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_negate_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_negate_interval.slt
new file mode 100644
index 0000000000000..52ef046bf22da
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_negate_interval.slt
@@ -0,0 +1,13 @@
+# - interval → interval
+# Negate an interval
+# - interval '23 hours' → -23:00:00
+
+query ?
+SELECT - interval '23 hours'
+----
+-23 hours
+
+query T
+SELECT arrow_typeof(- interval '23 hours')
+----
+Interval(MonthDayNano)
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_time_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_time_interval.slt
new file mode 100644
index 0000000000000..997eae9b1bd8b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_time_interval.slt
@@ -0,0 +1,70 @@
+# postgresql behavior
+#
+# time + interval → time
+# Add an interval to a time
+# time '01:00' + interval '3 hours' → 04:00:00
+#
+# note that while the above reflects what postgresql does
+# in the case of datafusion/arrow that is not the case. The
+# result will be an interval, not a time.
+
+query ?
+SELECT '01:00'::time + interval '3 hours'
+----
+4 hours
+
+query T
+SELECT arrow_typeof('01:00'::time + interval '3 hours')
+----
+Interval(MonthDayNano)
+
+query ?
+SELECT '22:00'::time + interval '3 hours'
+----
+25 hours
+
+query ?
+SELECT interval '3 hours' + '22:00'::time
+----
+25 hours
+
+query ?
+SELECT arrow_cast('22:00', 'Time32(Second)') + interval '3 hours'
+----
+25 hours
+
+query ?
+SELECT arrow_cast('22:00', 'Time32(Millisecond)') + interval '3 hours'
+----
+25 hours
+
+query ?
+SELECT arrow_cast('22:00', 'Time64(Microsecond)') + interval '3 hours'
+----
+25 hours
+
+query ?
+SELECT arrow_cast('22:00', 'Time64(Nanosecond)') + interval '3 hours'
+----
+25 hours
+
+# postgresql behavior
+#
+# time - interval → time
+# Subtract an interval from a time
+# time '05:00' - interval '2 hours' → 03:00:00
+
+query ?
+SELECT '05:00'::time - interval '2 hours'
+----
+3 hours
+
+query T
+SELECT arrow_typeof('05:00'::time - interval '2 hours')
+----
+Interval(MonthDayNano)
+
+query ?
+SELECT '02:00'::time - interval '3 hours'
+----
+-1 hours
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_time_time.slt b/datafusion/sqllogictest/test_files/datetime/arith_time_time.slt
new file mode 100644
index 0000000000000..4cf081970e2f9
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_time_time.slt
@@ -0,0 +1,47 @@
+# time - time → interval
+# Subtract times
+# time '05:00' - time '03:00' → 02:00:00
+
+query ?
+SELECT '05:00'::time - '03:00'::time
+----
+2 hours
+
+query T
+SELECT arrow_typeof('05:00'::time - '03:00'::time)
+----
+Interval(MonthDayNano)
+
+query ?
+SELECT '05:00'::time + '03:00'::time
+----
+8 hours
+
+query ?
+SELECT arrow_cast('05:00', 'Time32(Second)') - arrow_cast('03:00', 'Time32(Millisecond)')
+----
+2 hours
+
+query ?
+SELECT arrow_cast('05:00', 'Time32(Second)') - arrow_cast('03:00', 'Time64(Microsecond)')
+----
+2 hours
+
+query ?
+SELECT arrow_cast('05:00', 'Time64(Microsecond)') - arrow_cast('03:00', 'Time32(Millisecond)')
+----
+2 hours
+
+query ?
+SELECT arrow_cast('05:00', 'Time64(Nanosecond)') - arrow_cast('03:00', 'Time32(Second)')
+----
+2 hours
+
+query error Invalid interval arithmetic operation
+SELECT '05:00'::time * '03:00'::time
+
+query error Invalid interval arithmetic operation
+SELECT '05:00'::time / '03:00'::time
+
+query error Invalid interval arithmetic operation
+SELECT '05:00'::time % '03:00'::time
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_timestamp_duration.slt b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_duration.slt
new file mode 100644
index 0000000000000..aeeebe73db701
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_duration.slt
@@ -0,0 +1,147 @@
+# timestamp + duration → timestamp
+# Add an duration to a timestamp
+# timestamp '2001-09-28 01:00' + arrow_cast(12345000000000, 'Duration(Nanosecond)') → 2001-09-29 00:00:00
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp + arrow_cast(12345, 'Duration(Second)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp - arrow_cast(12345, 'Duration(Second)');
+----
+2001-09-27T21:34:15
+
+query P
+SELECT arrow_cast(12345, 'Duration(Second)') + '2001-09-28T01:00:00'::timestamp;
+----
+2001-09-28T04:25:45
+
+query T
+SELECT arrow_typeof('2001-09-28T01:00:00'::timestamp + arrow_cast(12345, 'Duration(Second)'))
+----
+Timestamp(ns)
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp + arrow_cast(12345000, 'Duration(Millisecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp - arrow_cast(12345000, 'Duration(Millisecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof('2001-09-28T01:00:00'::timestamp + arrow_cast(12345000, 'Duration(Millisecond)'))
+----
+Timestamp(ns)
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp + arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp - arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof('2001-09-28T01:00:00'::timestamp + arrow_cast(12345000000, 'Duration(Microsecond)'))
+----
+Timestamp(ns)
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp + arrow_cast(12345000000999, 'Duration(Nanosecond)');
+----
+2001-09-28T04:25:45.000000999
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp - arrow_cast(12345000000999, 'Duration(Nanosecond)');
+----
+2001-09-27T21:34:14.999999001
+
+query T
+SELECT arrow_typeof('2001-09-28T01:00:00'::timestamp + arrow_cast(12345000000999, 'Duration(Nanosecond)'))
+----
+Timestamp(ns)
+
+# test with other timestamp timeunits beyond the default ns
+
+# second +/- millisecond
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') + arrow_cast(12345000, 'Duration(Millisecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') - arrow_cast(12345000, 'Duration(Millisecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof(arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') + arrow_cast(12345000, 'Duration(Millisecond)'))
+----
+Timestamp(s)
+
+# second +/- microsecond
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') + arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') - arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof(arrow_cast('2001-09-28T01:00:00', 'Timestamp(Second)') + arrow_cast(12345000000, 'Duration(Microsecond)'))
+----
+Timestamp(s)
+
+# millisecond +/- nanosecond
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') + arrow_cast(12345000000999, 'Duration(Nanosecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') - arrow_cast(12345000000999, 'Duration(Nanosecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof(arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') + arrow_cast(12345000000999, 'Duration(Nanosecond)'))
+----
+Timestamp(ms)
+
+# millisecond +/- microsecond
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') + arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-28T04:25:45
+
+query P
+SELECT arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') - arrow_cast(12345000000, 'Duration(Microsecond)');
+----
+2001-09-27T21:34:15
+
+query T
+SELECT arrow_typeof(arrow_cast('2001-09-28T01:00:00', 'Timestamp(Millisecond)') + arrow_cast(12345000000, 'Duration(Microsecond)'))
+----
+Timestamp(ms)
+
+# while timestamp + duration makes sense, duration - timestamp does not
+query error Invalid arithmetic operation: Duration\(ns\) - Timestamp\(ns\)
+SELECT arrow_cast(12345, 'Duration(Second)') - '2001-09-28T01:00:00'::timestamp;
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28T01:00:00'::timestamp * arrow_cast(12345, 'Duration(Second)');
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28T01:00:00'::timestamp % arrow_cast(12345, 'Duration(Second)');
+
+query error Invalid timestamp arithmetic operation
+SELECT '2001-09-28T01:00:00'::timestamp / arrow_cast(12345, 'Duration(Second)');
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_timestamp_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_interval.slt
new file mode 100644
index 0000000000000..aaf629f1f07da
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_interval.slt
@@ -0,0 +1,36 @@
+# timestamp + interval → timestamp
+# Add an interval to a timestamp
+# timestamp '2001-09-28 01:00' + interval '23 hours' → 2001-09-29 00:00:00
+
+query P
+SELECT '2001-09-28T01:00:00'::timestamp + interval '23 hours'
+----
+2001-09-29T00:00:00
+
+query T
+SELECT arrow_typeof('2001-09-28T01:00:00'::timestamp + interval '23 hours')
+----
+Timestamp(ns)
+
+# timestamp - interval → timestamp
+# Subtract an interval from a timestamp
+# timestamp '2001-09-28 23:00' - interval '23 hours' → 2001-09-28 00:00:00
+
+query P
+SELECT '2001-09-28T23:00:00'::timestamp - interval '23 hours'
+----
+2001-09-28T00:00:00
+
+query T
+SELECT arrow_typeof('2001-09-28T23:00:00'::timestamp - interval '23 hours')
+----
+Timestamp(ns)
+
+query error Cannot coerce arithmetic expression Timestamp\(ns\) \* Interval\(MonthDayNano\) to valid types
+SELECT '2001-09-28T23:00:00'::timestamp * interval '23 hours'
+
+query error Cannot coerce arithmetic expression Timestamp\(ns\) / Interval\(MonthDayNano\) to valid types
+SELECT '2001-09-28T23:00:00'::timestamp / interval '23 hours'
+
+query error Cannot coerce arithmetic expression Timestamp\(ns\) % Interval\(MonthDayNano\) to valid types
+SELECT '2001-09-28T23:00:00'::timestamp % interval '23 hours'
diff --git a/datafusion/sqllogictest/test_files/datetime/arith_timestamp_timestamp.slt b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_timestamp.slt
new file mode 100644
index 0000000000000..975365ae22ebe
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/arith_timestamp_timestamp.slt
@@ -0,0 +1,13 @@
+# timestamp - timestamp → interval
+# Subtract timestamps (converting 24-hour intervals into days, similarly to justify_hours())
+# timestamp '2001-09-29 03:00' - timestamp '2001-07-27 12:00' → 63 days 15:00:00
+
+query ?
+SELECT '2001-09-29T03:00:00'::timestamp - '2001-07-27T12:00:00'::timestamp
+----
+63 days 15 hours 0 mins 0.000000000 secs
+
+query T
+SELECT arrow_typeof('2001-09-29T03:00:00'::timestamp - '2001-07-27T12:00:00'::timestamp)
+----
+Duration(ns)
diff --git a/datafusion/sqllogictest/test_files/datetime/current_date_timezone.slt b/datafusion/sqllogictest/test_files/datetime/current_date_timezone.slt
new file mode 100644
index 0000000000000..1b9c3cddeecec
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/current_date_timezone.slt
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## current_date with timezone tests
+##########
+
+# Test 1: Verify current_date is consistent within the same query (default UTC)
+query B
+SELECT current_date() = current_date();
+----
+true
+
+# Test 2: Verify alias 'today' works the same as current_date
+query B
+SELECT current_date() = today();
+----
+true
+
+# Test 3: Set timezone to +05:00 and verify current_date is still stable
+statement ok
+SET datafusion.execution.time_zone = '+05:00';
+
+query B
+SELECT current_date() = current_date();
+----
+true
+
+#Test 4: Verify current_date matches cast(now() as date) in the same timezone
+query B
+SELECT current_date() = cast(now() as date);
+----
+true
+
+# Test 5: Test with negative offset timezone
+statement ok
+SET datafusion.execution.time_zone = '-08:00';
+
+query B
+SELECT current_date() = today();
+----
+true
+
+# Test 6: Test with named timezone (America/New_York)
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+query B
+SELECT current_date() = current_date();
+----
+true
+
+# Test 7: Verify date type is preserved
+query T
+SELECT arrow_typeof(current_date());
+----
+Date32
+
+# Test 8: Reset to UTC
+statement ok
+SET datafusion.execution.time_zone = '+00:00';
+
+query B
+SELECT current_date() = today();
+----
+true
diff --git a/datafusion/sqllogictest/test_files/datetime/current_time_timezone.slt b/datafusion/sqllogictest/test_files/datetime/current_time_timezone.slt
new file mode 100644
index 0000000000000..c80c4b51d5ac8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/datetime/current_time_timezone.slt
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## current_time with timezone tests
+##########
+
+# Test 1: Verify current_time is consistent within the same query (default UTC)
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 2: Verify data type is correct
+query T
+SELECT arrow_typeof(current_time());
+----
+Time64(ns)
+
+# Test 3: Set timezone to +08:00 and verify current_time is still stable
+statement ok
+SET datafusion.execution.time_zone = '+08:00';
+
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 4: Verify current_time returns Time64 type in different timezone
+query T
+SELECT arrow_typeof(current_time());
+----
+Time64(ns)
+
+# Test 5: Test with negative offset timezone
+statement ok
+SET datafusion.execution.time_zone = '-05:00';
+
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 6: Test with named timezone (America/New_York)
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 7: Verify current_time is stable within a query
+query B
+SELECT 
+  current_time() = current_time() AND
+  current_time() = current_time();
+----
+true
+
+# Test 8: Reset to UTC
+statement ok
+SET datafusion.execution.time_zone = '+00:00';
+
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 9: Verify current_time with Asia/Tokyo timezone
+statement ok
+SET datafusion.execution.time_zone = 'Asia/Tokyo';
+
+query B
+SELECT current_time() = current_time();
+----
+true
+
+# Test 10: Verify current_time with Europe/London timezone
+statement ok
+SET datafusion.execution.time_zone = 'Europe/London';
+
+query B
+SELECT current_time() = current_time();
+----
+true
diff --git a/datafusion/sqllogictest/test_files/expr/date_part.slt b/datafusion/sqllogictest/test_files/datetime/date_part.slt
similarity index 50%
rename from datafusion/sqllogictest/test_files/expr/date_part.slt
rename to datafusion/sqllogictest/test_files/datetime/date_part.slt
index 39c42cbe1e97f..07dc1302b9ece 100644
--- a/datafusion/sqllogictest/test_files/expr/date_part.slt
+++ b/datafusion/sqllogictest/test_files/datetime/date_part.slt
@@ -19,7 +19,7 @@
 # for the same function).
 
 
-## Begin tests fo rdate_part with columns and timestamp's with timezones
+## Begin tests for date_part with columns and timestamp's with timezones
 
 # Source data table has
 # timestamps with millisecond (very common timestamp precision) and nanosecond (maximum precision) timestamps
@@ -40,30 +40,32 @@ with t as (values
 )
 SELECT
   -- nanoseconds, with no, utc, and local timezone
-  arrow_cast(column1, 'Timestamp(Nanosecond, None)') as ts_nano_no_tz,
+  arrow_cast(column1, 'Timestamp(ns)') as ts_nano_no_tz,
+  arrow_cast(column1, 'Timestamp(Nanosecond, None)') as ts_nano_no_tz_old_format,
   arrow_cast(column1, 'Timestamp(Nanosecond, Some("UTC"))') as ts_nano_utc,
   arrow_cast(column1, 'Timestamp(Nanosecond, Some("America/New_York"))') as ts_nano_eastern,
   -- milliseconds, with no, utc, and local timezone
-  arrow_cast(column1, 'Timestamp(Millisecond, None)') as ts_milli_no_tz,
+  arrow_cast(column1, 'Timestamp(ms)') as ts_milli_no_tz,
+  arrow_cast(column1, 'Timestamp(Millisecond, None)') as ts_milli_no_tz_old_format,
   arrow_cast(column1, 'Timestamp(Millisecond, Some("UTC"))') as ts_milli_utc,
   arrow_cast(column1, 'Timestamp(Millisecond, Some("America/New_York"))') as ts_milli_eastern
 FROM t;
 
 
-query PPPPPP
+query PPPPPPPP
 SELECT * FROM source_ts;
 ----
-2020-01-01T00:00:00 2020-01-01T00:00:00Z 2019-12-31T19:00:00-05:00 2020-01-01T00:00:00 2020-01-01T00:00:00Z 2019-12-31T19:00:00-05:00
-2021-01-01T00:00:00 2021-01-01T00:00:00Z 2020-12-31T19:00:00-05:00 2021-01-01T00:00:00 2021-01-01T00:00:00Z 2020-12-31T19:00:00-05:00
-2020-09-01T00:00:00 2020-09-01T00:00:00Z 2020-08-31T20:00:00-04:00 2020-09-01T00:00:00 2020-09-01T00:00:00Z 2020-08-31T20:00:00-04:00
-2020-01-25T00:00:00 2020-01-25T00:00:00Z 2020-01-24T19:00:00-05:00 2020-01-25T00:00:00 2020-01-25T00:00:00Z 2020-01-24T19:00:00-05:00
-2020-01-24T00:00:00 2020-01-24T00:00:00Z 2020-01-23T19:00:00-05:00 2020-01-24T00:00:00 2020-01-24T00:00:00Z 2020-01-23T19:00:00-05:00
-2020-01-01T12:00:00 2020-01-01T12:00:00Z 2020-01-01T07:00:00-05:00 2020-01-01T12:00:00 2020-01-01T12:00:00Z 2020-01-01T07:00:00-05:00
-2020-01-01T00:30:00 2020-01-01T00:30:00Z 2019-12-31T19:30:00-05:00 2020-01-01T00:30:00 2020-01-01T00:30:00Z 2019-12-31T19:30:00-05:00
-2020-01-01T00:00:30 2020-01-01T00:00:30Z 2019-12-31T19:00:30-05:00 2020-01-01T00:00:30 2020-01-01T00:00:30Z 2019-12-31T19:00:30-05:00
-2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
-2020-01-01T00:00:00.123456 2020-01-01T00:00:00.123456Z 2019-12-31T19:00:00.123456-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
-2020-01-01T00:00:00.123456789 2020-01-01T00:00:00.123456789Z 2019-12-31T19:00:00.123456789-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
+2020-01-01T00:00:00 2020-01-01T00:00:00 2020-01-01T00:00:00Z 2019-12-31T19:00:00-05:00 2020-01-01T00:00:00 2020-01-01T00:00:00 2020-01-01T00:00:00Z 2019-12-31T19:00:00-05:00
+2021-01-01T00:00:00 2021-01-01T00:00:00 2021-01-01T00:00:00Z 2020-12-31T19:00:00-05:00 2021-01-01T00:00:00 2021-01-01T00:00:00 2021-01-01T00:00:00Z 2020-12-31T19:00:00-05:00
+2020-09-01T00:00:00 2020-09-01T00:00:00 2020-09-01T00:00:00Z 2020-08-31T20:00:00-04:00 2020-09-01T00:00:00 2020-09-01T00:00:00 2020-09-01T00:00:00Z 2020-08-31T20:00:00-04:00
+2020-01-25T00:00:00 2020-01-25T00:00:00 2020-01-25T00:00:00Z 2020-01-24T19:00:00-05:00 2020-01-25T00:00:00 2020-01-25T00:00:00 2020-01-25T00:00:00Z 2020-01-24T19:00:00-05:00
+2020-01-24T00:00:00 2020-01-24T00:00:00 2020-01-24T00:00:00Z 2020-01-23T19:00:00-05:00 2020-01-24T00:00:00 2020-01-24T00:00:00 2020-01-24T00:00:00Z 2020-01-23T19:00:00-05:00
+2020-01-01T12:00:00 2020-01-01T12:00:00 2020-01-01T12:00:00Z 2020-01-01T07:00:00-05:00 2020-01-01T12:00:00 2020-01-01T12:00:00 2020-01-01T12:00:00Z 2020-01-01T07:00:00-05:00
+2020-01-01T00:30:00 2020-01-01T00:30:00 2020-01-01T00:30:00Z 2019-12-31T19:30:00-05:00 2020-01-01T00:30:00 2020-01-01T00:30:00 2020-01-01T00:30:00Z 2019-12-31T19:30:00-05:00
+2020-01-01T00:00:30 2020-01-01T00:00:30 2020-01-01T00:00:30Z 2019-12-31T19:00:30-05:00 2020-01-01T00:00:30 2020-01-01T00:00:30 2020-01-01T00:00:30Z 2019-12-31T19:00:30-05:00
+2020-01-01T00:00:00.123 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
+2020-01-01T00:00:00.123456 2020-01-01T00:00:00.123456 2020-01-01T00:00:00.123456Z 2019-12-31T19:00:00.123456-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
+2020-01-01T00:00:00.123456789 2020-01-01T00:00:00.123456789 2020-01-01T00:00:00.123456789Z 2019-12-31T19:00:00.123456789-05:00 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123 2020-01-01T00:00:00.123Z 2019-12-31T19:00:00.123-05:00
 
 # date_part (year) with columns and explicit timestamp
 query IIIIII
@@ -81,6 +83,23 @@ SELECT date_part('year', ts_nano_no_tz), date_part('year', ts_nano_utc), date_pa
 2020 2020 2019 2020 2020 2019
 2020 2020 2019 2020 2020 2019
 
+# date_part (isoyear) with columns and explicit timestamp
+query IIIIII
+SELECT date_part('isoyear', ts_nano_no_tz), date_part('isoyear', ts_nano_utc), date_part('isoyear', ts_nano_eastern), date_part('isoyear', ts_milli_no_tz), date_part('isoyear', ts_milli_utc), date_part('isoyear', ts_milli_eastern)  FROM source_ts;
+----
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+2020 2020 2020 2020 2020 2020
+
+
 # date_part (month)
 query IIIIII
 SELECT date_part('month', ts_nano_no_tz), date_part('month', ts_nano_utc), date_part('month', ts_nano_eastern), date_part('month', ts_milli_no_tz), date_part('month', ts_milli_utc), date_part('month', ts_milli_eastern)  FROM source_ts;
@@ -193,6 +212,22 @@ SELECT date_part('microsecond', ts_nano_no_tz), date_part('microsecond', ts_nano
 123456 123456 123456 123000 123000 123000
 123456 123456 123456 123000 123000 123000
 
+# date_part (nanosecond)
+query IIIIII
+SELECT date_part('nanosecond', ts_nano_no_tz), date_part('nanosecond', ts_nano_utc), date_part('nanosecond', ts_nano_eastern), date_part('nanosecond', ts_milli_no_tz), date_part('nanosecond', ts_milli_utc), date_part('nanosecond', ts_milli_eastern)  FROM source_ts;
+----
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+30000000000 30000000000 30000000000 30000000000 30000000000 30000000000
+123000000 123000000 123000000 123000000 123000000 123000000
+123456000 123456000 123456000 123000000 123000000 123000000
+123456789 123456789 123456789 123000000 123000000 123000000
+
 ### Cleanup
 statement ok
 drop table source_ts;
@@ -228,6 +263,26 @@ SELECT EXTRACT('year' FROM  timestamp '2020-09-08T12:00:00+00:00')
 ----
 2020
 
+query I
+SELECT date_part('ISOYEAR', CAST('2000-01-01' AS DATE))
+----
+1999
+
+query I
+SELECT EXTRACT(isoyear FROM  timestamp '2020-09-08T12:00:00+00:00')
+----
+2020
+
+query I
+SELECT EXTRACT("isoyear" FROM  timestamp '2020-09-08T12:00:00+00:00')
+----
+2020
+
+query I
+SELECT EXTRACT('isoyear' FROM  timestamp '2020-09-08T12:00:00+00:00')
+----
+2020
+
 query I
 SELECT date_part('QUARTER', CAST('2000-01-01' AS DATE))
 ----
@@ -394,6 +449,12 @@ SELECT arrow_typeof(date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00'
 ----
 Int32
 
+# nanosecond can exceed Int32 and returns Int64
+query T
+SELECT arrow_typeof(date_part('nanosecond', to_timestamp('2020-09-08T12:12:00+00:00')))
+----
+Int64
+
 query I
 SELECT EXTRACT(second FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
 ----
@@ -409,8 +470,11 @@ SELECT EXTRACT(microsecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
 ----
 12123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+# note the output is more than Int32 can store
+query I
 SELECT EXTRACT(nanosecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
+----
+12123456780
 
 query I
 SELECT EXTRACT("second" FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
@@ -427,8 +491,10 @@ SELECT EXTRACT("microsecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00'
 ----
 12123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT EXTRACT("nanosecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
+----
+12123456780
 
 query I
 SELECT EXTRACT('second' FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
@@ -445,9 +511,10 @@ SELECT EXTRACT('microsecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00'
 ----
 12123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT EXTRACT('nanosecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00')
-
+----
+12123456780
 
 # Keep precision when coercing Utf8 to Timestamp
 query I
@@ -465,9 +532,10 @@ SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00')
 ----
 12123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00')
-
+----
+12123456780
 
 query I
 SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00')
@@ -484,8 +552,30 @@ SELECT date_part('microsecond', '2020-09-08T12:00:12.12345678+00:00')
 ----
 12123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00')
+----
+12123456780
+
+query I
+SELECT EXTRACT(nanosecond FROM ts)
+FROM (VALUES
+  (timestamp '2020-09-08T12:00:12.12345678+00:00'),
+  (NULL::timestamp)
+) AS t(ts)
+----
+12123456780
+NULL
+
+query I
+SELECT date_part('nanosecond', ts)
+FROM (VALUES
+  (timestamp '2020-09-08T12:00:12.12345678+00:00'),
+  (NULL::timestamp)
+) AS t(ts)
+----
+12123456780
+NULL
 
 # test_date_part_time
 
@@ -540,8 +630,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50'::time, 'Time32(Second)'))
 ----
 50000000
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT extract(nanosecond from arrow_cast('23:32:50'::time, 'Time32(Second)'))
+----
+50000000000
 
 query R
 SELECT date_part('epoch', arrow_cast('23:32:50'::time, 'Time32(Second)'))
@@ -604,8 +696,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123'::time, 'Time32(Millise
 ----
 50123000
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT extract(nanosecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)'))
+----
+50123000000
 
 query R
 SELECT date_part('epoch', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)'))
@@ -668,8 +762,10 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123456'::time, 'Time64(Micr
 ----
 50123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT extract(nanosecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)'))
+----
+50123456000
 
 query R
 SELECT date_part('epoch', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)'))
@@ -758,8 +854,10 @@ SELECT extract(us from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond
 ----
 50123456
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query I
 SELECT date_part('nanosecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)'))
+----
+50123456789
 
 query R
 SELECT date_part('epoch', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)'))
@@ -865,9 +963,15 @@ SELECT extract(month from arrow_cast('20 months', 'Interval(YearMonth)'))
 ----
 8
 
+query error DataFusion error: Arrow error: Compute error: YearISO does not support: Interval\(YearMonth\)
+SELECT extract(isoyear from arrow_cast('10 years', 'Interval(YearMonth)'))
+
 query error DataFusion error: Arrow error: Compute error: Year does not support: Interval\(DayTime\)
 SELECT extract(year from arrow_cast('10 days', 'Interval(DayTime)'))
 
+query error DataFusion error: Arrow error: Compute error: YearISO does not support: Interval\(DayTime\)
+SELECT extract(isoyear from arrow_cast('10 days', 'Interval(DayTime)'))
+
 query error DataFusion error: Arrow error: Compute error: Month does not support: Interval\(DayTime\)
 SELECT extract(month from arrow_cast('10 days', 'Interval(DayTime)'))
 
@@ -936,6 +1040,57 @@ SELECT extract(second from arrow_cast(NULL, 'Interval(MonthDayNano)'))
 ----
 NULL
 
+# extract epoch from intervals
+query R
+SELECT extract(epoch from interval '15 minutes')
+----
+900
+
+query R
+SELECT extract(epoch from interval '1 hour')
+----
+3600
+
+query R
+SELECT extract(epoch from interval '1 day')
+----
+86400
+
+query R
+SELECT extract(epoch from interval '1 month')
+----
+2592000
+
+query R
+SELECT extract(epoch from arrow_cast('3 days', 'Interval(DayTime)'))
+----
+259200
+
+query R
+SELECT extract(epoch from arrow_cast('100 milliseconds', 'Interval(MonthDayNano)'))
+----
+0.1
+
+query R
+SELECT extract(epoch from arrow_cast('500 microseconds', 'Interval(MonthDayNano)'))
+----
+0.0005
+
+query R
+SELECT extract(epoch from arrow_cast('2500 nanoseconds', 'Interval(MonthDayNano)'))
+----
+0.0000025
+
+query R
+SELECT extract(epoch from arrow_cast('1 month 2 days 500 milliseconds', 'Interval(MonthDayNano)'))
+----
+2764800.5
+
+query R
+SELECT extract(epoch from arrow_cast('2 months', 'Interval(YearMonth)'))
+----
+5184000
+
 statement ok
 create table t (id int, i interval) as values
   (0, interval '5 months 1 day 10 nanoseconds'),
@@ -1005,12 +1160,15 @@ SELECT extract(day from arrow_cast(864000, 'Duration(Second)'))
 ----
 10
 
-query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(Second\)
+query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(s\)
 SELECT extract(month from arrow_cast(864000, 'Duration(Second)'))
 
-query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(Second\)
+query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(s\)
 SELECT extract(year from arrow_cast(864000, 'Duration(Second)'))
 
+query error DataFusion error: Arrow error: Compute error: YearISO does not support: Duration\(s\)
+SELECT extract(isoyear from arrow_cast(864000, 'Duration(Second)'))
+
 query I
 SELECT extract(day from arrow_cast(NULL, 'Duration(Second)'))
 ----
@@ -1023,6 +1181,11 @@ SELECT (date_part('year', now()) = EXTRACT(year FROM now()))
 ----
 true
 
+query B
+SELECT (date_part('isoyear', now()) = EXTRACT(isoyear FROM now()))
+----
+true
+
 query B
 SELECT (date_part('quarter', now()) = EXTRACT(quarter FROM now()))
 ----
@@ -1068,5 +1231,588 @@ SELECT (date_part('microsecond', now()) = EXTRACT(microsecond FROM now()))
 ----
 true
 
-query error DataFusion error: This feature is not implemented: Date part Nanosecond not supported
+query B
 SELECT (date_part('nanosecond', now()) = EXTRACT(nanosecond FROM now()))
+----
+true
+
+
+query I
+SELECT date_part('ISODOW', CAST('2000-01-01' AS DATE))
+----
+5
+
+query I
+SELECT EXTRACT(isodow FROM to_timestamp('2020-09-08T12:00:00+00:00'))
+----
+1
+
+query I
+SELECT EXTRACT("isodow" FROM to_timestamp('2020-09-08T12:00:00+00:00'))
+----
+1
+
+query I
+SELECT EXTRACT('isodow' FROM to_timestamp('2020-09-08T12:00:00+00:00'))
+----
+1
+
+## Preimage tests
+
+statement ok
+create table t1(c DATE) as VALUES (NULL), ('1990-01-01'), ('2024-01-01'), ('2030-01-01');
+
+# Simple optimizations, col on LHS
+
+query D
+select c from t1 where extract(year from c) = 2024;
+----
+2024-01-01
+
+query D
+select c from t1 where extract(year from c) <> 2024;
+----
+1990-01-01
+2030-01-01
+
+query D
+select c from t1 where extract(year from c) > 2024;
+----
+2030-01-01
+
+query D
+select c from t1 where extract(year from c) < 2024;
+----
+1990-01-01
+
+query D
+select c from t1 where extract(year from c) >= 2024;
+----
+2024-01-01
+2030-01-01
+
+query D
+select c from t1 where extract(year from c) <= 2024;
+----
+1990-01-01
+2024-01-01
+
+query D
+select c from t1 where extract(year from c) is not distinct from 2024
+----
+2024-01-01
+
+query D
+select c from t1 where extract(year from c) is distinct from 2024
+----
+NULL
+1990-01-01
+2030-01-01
+
+# IN list optimization
+query D
+select c from t1 where extract(year from c) in (1990, 2024);
+----
+1990-01-01
+2024-01-01
+
+# NOT IN list optimization (NULL does not satisfy NOT IN)
+query D
+select c from t1 where extract(year from c) not in (1990, 2024);
+----
+2030-01-01
+
+# Check that date_part is not in the explain statements
+
+query TT
+explain select c from t1 where extract (year from c) = 2024
+----
+logical_plan
+01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) <> 2024
+----
+logical_plan
+01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) > 2024
+----
+logical_plan
+01)Filter: t1.c >= Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 >= 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) < 2024
+----
+logical_plan
+01)Filter: t1.c < Date32("2024-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 < 2024-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) >= 2024
+----
+logical_plan
+01)Filter: t1.c >= Date32("2024-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 >= 2024-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) <= 2024
+----
+logical_plan
+01)Filter: t1.c < Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) is not distinct from 2024
+----
+logical_plan
+01)Filter: t1.c IS NOT NULL AND t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 IS NOT NULL AND c@0 >= 2024-01-01 AND c@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) is distinct from 2024
+----
+logical_plan
+01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") OR t1.c IS NULL
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 OR c@0 IS NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (year from c) in (1990, 2024)
+----
+logical_plan
+01)Filter: t1.c >= Date32("1990-01-01") AND t1.c < Date32("1991-01-01") OR t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 >= 1990-01-01 AND c@0 < 1991-01-01 OR c@0 >= 2024-01-01 AND c@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Simple optimizations, column on RHS
+
+query D
+select c from t1 where 2024 = extract(year from c);
+----
+2024-01-01
+
+query D
+select c from t1 where 2024 <> extract(year from c);
+----
+1990-01-01
+2030-01-01
+
+query D
+select c from t1 where 2024 < extract(year from c);
+----
+2030-01-01
+
+query D
+select c from t1 where 2024 > extract(year from c);
+----
+1990-01-01
+
+query D
+select c from t1 where 2024 <= extract(year from c);
+----
+2024-01-01
+2030-01-01
+
+query D
+select c from t1 where 2024 >= extract(year from c);
+----
+1990-01-01
+2024-01-01
+
+query D
+select c from t1 where 2024 is not distinct from extract(year from c);
+----
+2024-01-01
+
+query D
+select c from t1 where 2024 is distinct from extract(year from c);
+----
+NULL
+1990-01-01
+2030-01-01
+
+# Check explain statements for optimizations for other interval types
+
+query TT
+explain select c from t1 where extract (quarter from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("QUARTER"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(QUARTER, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (month from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("MONTH"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(MONTH, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (week from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("WEEK"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(WEEK, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (day from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("DAY"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(DAY, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (hour from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("HOUR"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(HOUR, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (minute from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("MINUTE"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(MINUTE, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (second from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("SECOND"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(SECOND, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (millisecond from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("MILLISECOND"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(MILLISECOND, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (microsecond from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("MICROSECOND"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(MICROSECOND, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (nanosecond from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int64(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(NANOSECOND, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (dow from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("DOW"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(DOW, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (doy from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("DOY"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(DOY, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (epoch from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("EPOCH"), t1.c) = Float64(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(EPOCH, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c from t1 where extract (isodow from c) = 2024
+----
+logical_plan
+01)Filter: date_part(Utf8("ISODOW"), t1.c) = Int32(2024)
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: date_part(ISODOW, c@0) = 2024
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Simple optimize different datatypes
+
+statement ok
+create table t2(
+    c1_date32 DATE,
+    c2_ts_sec timestamp,
+    c3_ts_mili timestamp,
+    c4_ts_micro timestamp,
+    c5_ts_nano timestamp
+) as VALUES
+    (NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL),
+    ('1990-05-20',
+     '1990-05-20T00:00:10'::timestamp,
+     '1990-05-20T00:00:10.987'::timestamp,
+     '1990-05-20T00:00:10.987654'::timestamp,
+     '1990-05-20T00:00:10.987654321'::timestamp),
+    ('2024-01-01',
+     '2024-01-01T00:00:00'::timestamp,
+     '2024-01-01T00:00:00.123'::timestamp,
+     '2024-01-01T00:00:00.123456'::timestamp,
+     '2024-01-01T00:00:00.123456789'::timestamp),
+    ('2030-12-31',
+     '2030-12-31T23:59:59'::timestamp,
+     '2030-12-31T23:59:59.001'::timestamp,
+     '2030-12-31T23:59:59.001234'::timestamp,
+     '2030-12-31T23:59:59.001234567'::timestamp)
+;
+
+query D
+select c1_date32 from t2 where extract(year from c1_date32) = 2024;
+----
+2024-01-01
+
+query D
+select c1_date32 from t2 where extract(year from c1_date32) <> 2024;
+----
+1990-05-20
+2030-12-31
+
+query P
+select c2_ts_sec from t2 where extract(year from c2_ts_sec) > 2024;
+----
+2030-12-31T23:59:59
+
+query P
+select c3_ts_mili from t2 where extract(year from c3_ts_mili) < 2024;
+----
+1990-05-20T00:00:10.987
+
+query P
+select c4_ts_micro from t2 where extract(year from c4_ts_micro) >= 2024;
+----
+2024-01-01T00:00:00.123456
+2030-12-31T23:59:59.001234
+
+query P
+select c5_ts_nano from t2 where extract(year from c5_ts_nano) <= 2024;
+----
+1990-05-20T00:00:10.987654321
+2024-01-01T00:00:00.123456789
+
+query D
+select c1_date32 from t2 where extract(year from c1_date32) is not distinct from 2024
+----
+2024-01-01
+
+query D
+select c1_date32 from t2 where extract(year from c1_date32) is distinct from 2024
+----
+NULL
+1990-05-20
+2030-12-31
+
+# Check that date_part is not in the explain statements for other datatypes
+
+query TT
+explain select c1_date32 from t2 where extract (year from c1_date32) = 2024
+----
+logical_plan
+01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01")
+02)--TableScan: t2 projection=[c1_date32]
+physical_plan
+01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c1_date32 from t2 where extract (year from c1_date32) <> 2024
+----
+logical_plan
+01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01")
+02)--TableScan: t2 projection=[c1_date32]
+physical_plan
+01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c2_ts_sec from t2 where extract (year from c2_ts_sec) > 2024
+----
+logical_plan
+01)Filter: t2.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None)
+02)--TableScan: t2 projection=[c2_ts_sec]
+physical_plan
+01)FilterExec: c2_ts_sec@0 >= 1735689600000000000
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c3_ts_mili from t2 where extract (year from c3_ts_mili) < 2024
+----
+logical_plan
+01)Filter: t2.c3_ts_mili < TimestampNanosecond(1704067200000000000, None)
+02)--TableScan: t2 projection=[c3_ts_mili]
+physical_plan
+01)FilterExec: c3_ts_mili@0 < 1704067200000000000
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c4_ts_micro from t2 where extract (year from c4_ts_micro) >= 2024
+----
+logical_plan
+01)Filter: t2.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None)
+02)--TableScan: t2 projection=[c4_ts_micro]
+physical_plan
+01)FilterExec: c4_ts_micro@0 >= 1704067200000000000
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c5_ts_nano from t2 where extract (year from c5_ts_nano) <= 2024
+----
+logical_plan
+01)Filter: t2.c5_ts_nano < TimestampNanosecond(1735689600000000000, None)
+02)--TableScan: t2 projection=[c5_ts_nano]
+physical_plan
+01)FilterExec: c5_ts_nano@0 < 1735689600000000000
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024
+----
+logical_plan
+01)Filter: t2.c1_date32 IS NOT NULL AND t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01")
+02)--TableScan: t2 projection=[c1_date32]
+physical_plan
+01)FilterExec: c1_date32@0 IS NOT NULL AND c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+explain select c1_date32 from t2 where extract (year from c1_date32) is distinct from 2024
+----
+logical_plan
+01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") OR t2.c1_date32 IS NULL
+02)--TableScan: t2 projection=[c1_date32]
+physical_plan
+01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 OR c1_date32@0 IS NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Preimage with timestamp with America/New_York timezone
+
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+statement ok
+create table t3(
+    c1_ts_tz timestamptz
+) as VALUES
+    (NULL),
+    ('2024-01-01T04:59:59Z'::timestamptz), -- local 2023-12-31 23:59:59 -05
+    ('2024-01-01T05:00:00Z'::timestamptz), -- local 2024-01-01 00:00:00 -05
+    ('2025-01-01T04:59:59Z'::timestamptz), -- local 2024-12-31 23:59:59 -05
+    ('2025-01-01T05:00:00Z'::timestamptz)  -- local 2025-01-01 00:00:00 -05
+;
+
+query P
+select c1_ts_tz
+from t3
+where extract(year from c1_ts_tz) = 2024
+order by c1_ts_tz
+----
+2024-01-01T00:00:00-05:00
+2024-12-31T23:59:59-05:00
+
+query TT
+explain select c1_ts_tz from t3 where extract(year from c1_ts_tz) = 2024
+----
+logical_plan
+01)Filter: t3.c1_ts_tz >= TimestampNanosecond(1704085200000000000, Some("America/New_York")) AND t3.c1_ts_tz < TimestampNanosecond(1735707600000000000, Some("America/New_York"))
+02)--TableScan: t3 projection=[c1_ts_tz]
+physical_plan
+01)FilterExec: c1_ts_tz@0 >= 1704085200000000000 AND c1_ts_tz@0 < 1735707600000000000
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+RESET datafusion.execution.time_zone;
+
+# Test non-Int32 rhs argument
+
+query D
+select c from t1 where extract(year from c) = cast(2024 as bigint);
+----
+2024-01-01
+
+query TT
+explain select c from t1 where extract (year from c) = cast(2024 as bigint)
+----
+logical_plan
+01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01")
+02)--TableScan: t1 projection=[c]
+physical_plan
+01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/datetime/dates.slt
similarity index 62%
rename from datafusion/sqllogictest/test_files/dates.slt
rename to datafusion/sqllogictest/test_files/datetime/dates.slt
index 148f0dfe64bb7..d2a7360b120c6 100644
--- a/datafusion/sqllogictest/test_files/dates.slt
+++ b/datafusion/sqllogictest/test_files/datetime/dates.slt
@@ -85,16 +85,14 @@ g
 h
 
 ## Plan error when compare Utf8 and timestamp in where clause
-statement error DataFusion error: type_coercion\ncaused by\nError during planning: Cannot coerce arithmetic expression Timestamp\(Nanosecond, Some\("\+00:00"\)\) \+ Utf8 to valid types
+statement error
 select i_item_desc from test
 where d3_date > now() + '5 days';
-
-# DATE minus DATE
-# https://github.com/apache/arrow-rs/issues/4383
-query ?
-SELECT DATE '2023-04-09' - DATE '2023-04-02';
 ----
-7 days 0 hours 0 mins 0 secs
+DataFusion error: type_coercion
+caused by
+Error during planning: Cannot coerce arithmetic expression Timestamp(ns) + Utf8 to valid types
+
 
 # DATE minus Timestamp
 query ?
@@ -108,6 +106,18 @@ SELECT '2023-01-01T00:00:00'::timestamp - DATE '2021-01-01';
 ----
 730 days 0 hours 0 mins 0.000000000 secs
 
+# NULL with DATE arithmetic should yield NULL (but Int64 type)
+query I
+SELECT NULL - DATE '1984-02-28';
+----
+NULL
+
+query I
+SELECT DATE '1984-02-28' - NULL
+----
+NULL
+
+
 # to_date_test
 statement ok
 create table to_date_t1(ts bigint) as VALUES
@@ -148,12 +158,114 @@ SELECT to_date('21311111');
 statement error DataFusion error: Arrow error:
 SELECT to_date('213111111');
 
+# verify date cast with tinyint input
+query DDDDDD
+SELECT to_date(null::tinyint), to_date(0::tinyint), to_date(19::tinyint), to_date(1::tinyint), to_date(-1::tinyint), to_date((0-1)::tinyint)
+----
+NULL 1970-01-01 1970-01-20 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with smallint input
+query DDDDDD
+SELECT to_date(null::smallint), to_date(0::smallint), to_date(19234::smallint), to_date(1::smallint), to_date(-1::smallint), to_date((0-1)::smallint)
+----
+NULL 1970-01-01 2022-08-30 1970-01-02 1969-12-31 1969-12-31
+
 # verify date cast with integer input
 query DDDDDD
 SELECT to_date(null), to_date(0), to_date(19266320), to_date(1), to_date(-1), to_date(0-1)
 ----
 NULL 1970-01-01 +54719-05-25 1970-01-02 1969-12-31 1969-12-31
 
+# verify date cast with bigint input
+query DDDDDD
+SELECT to_date(null::bigint), to_date(0::bigint), to_date(191234::bigint), to_date(1::bigint), to_date(-1::bigint), to_date((0-1)::bigint)
+----
+NULL 1970-01-01 2493-07-31 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with unsigned tinyint input
+query DDDD
+SELECT to_date(null::tinyint unsigned), to_date(0::tinyint unsigned), to_date(192::tinyint unsigned), to_date(1::tinyint unsigned)
+----
+NULL 1970-01-01 1970-07-12 1970-01-02
+
+# verify date cast with unsigned smallint input
+query DDDD
+SELECT to_date(null::smallint unsigned), to_date(0::smallint unsigned), to_date(19260::smallint unsigned), to_date(1::smallint unsigned)
+----
+NULL 1970-01-01 2022-09-25 1970-01-02
+
+# verify date cast with unsigned int input
+query DDDD
+SELECT to_date(null::int unsigned), to_date(0::int unsigned), to_date(19260::int unsigned), to_date(1::int unsigned)
+----
+NULL 1970-01-01 2022-09-25 1970-01-02
+
+# verify date cast with unsigned bigint input
+query DDDD
+SELECT to_date(null::bigint unsigned), to_date(0::bigint unsigned), to_date(19260000::bigint unsigned), to_date(1::bigint unsigned)
+----
+NULL 1970-01-01 +54702-02-03 1970-01-02
+
+# verify date cast with real input (float32)
+query DDDDDD
+SELECT to_date(null::real), to_date(0.0::real), to_date(19260.1::real), to_date(1.1::real), to_date(-1.1::real), to_date(0-1.1::real)
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with double input (float64)
+query DDDDDD
+SELECT to_date(null::double), to_date(0.0::double), to_date(19260.1::double), to_date(1.1::double), to_date(-1.1::double), to_date(0-1.1::double)
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with decimal32 input (Decimal32)
+query DDDDDD
+SELECT to_date(arrow_cast(null, 'Decimal32(8,2)')), to_date(arrow_cast(0.0, 'Decimal32(8,2)')), to_date(arrow_cast(19260.1, 'Decimal32(8,2)')), to_date(arrow_cast(1.1, 'Decimal32(8,2)')), to_date(arrow_cast(-1.1, 'Decimal32(8,2)')), to_date(0-arrow_cast(1.1, 'Decimal32(8,2)'))
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with Decimal64 input
+query DDDDDD
+SELECT to_date(arrow_cast(null, 'Decimal64(8,2)')), to_date(arrow_cast(0.0, 'Decimal64(8,2)')), to_date(arrow_cast(19260.1, 'Decimal64(8,2)')), to_date(arrow_cast(1.1, 'Decimal64(8,2)')), to_date(arrow_cast(-1.1, 'Decimal64(8,2)')), to_date(0-arrow_cast(1.1, 'Decimal64(8,2)'))
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with Decimal128 input
+query DDDDDD
+SELECT to_date(arrow_cast(null, 'Decimal128(8,2)')), to_date(arrow_cast(0.0, 'Decimal128(8,2)')), to_date(arrow_cast(19260.1, 'Decimal128(8,2)')), to_date(arrow_cast(1.1, 'Decimal128(8,2)')), to_date(arrow_cast(-1.1, 'Decimal128(8,2)')), to_date(0-arrow_cast(1.1, 'Decimal128(8,2)'))
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with decimal input (Decimal128)
+query DDDDDD
+SELECT to_date(null::decimal(10,2)), to_date(0.0::decimal(10,2)), to_date(19260.1::decimal(10,2)), to_date(1.1::decimal(10,2)), to_date(-1.1::decimal(10,2)), to_date(0-1.1::decimal(10,2))
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with Decimal256 input
+query DDDDDD
+SELECT to_date(arrow_cast(null, 'Decimal256(8,2)')), to_date(arrow_cast(0.0, 'Decimal256(8,2)')), to_date(arrow_cast(19260.1, 'Decimal256(8,2)')), to_date(arrow_cast(1.1, 'Decimal256(8,2)')), to_date(arrow_cast(-1.1, 'Decimal256(8,2)')), to_date(0-arrow_cast(1.1, 'Decimal256(8,2)'))
+----
+NULL 1970-01-01 2022-09-25 1970-01-02 1969-12-31 1969-12-31
+
+# verify date cast with date input
+query DDDD
+SELECT to_date('2024-12-1'::date), to_date('1920-01-12'::date), to_date('1970-01-01'::date), to_date('-0200-07-22'::date)
+----
+2024-12-01 1920-01-12 1970-01-01 -0200-07-22
+
+# verify date cast with date64 input
+query DDDD
+SELECT to_date(arrow_cast('2024-12-1', 'Date64')), to_date(arrow_cast('1920-01-12', 'Date64')), to_date(arrow_cast('1970-01-01', 'Date64')), to_date(arrow_cast(-863999913600000, 'Date64'))
+----
+2024-12-01 1920-01-12 1970-01-01 -25410-12-07
+
+# verify date cast with timestamp input
+query DD
+SELECT to_date('2024-12-01T00:32:45'::timestamp), to_date('1677-12-01T00:32:45'::timestamp)
+----
+2024-12-01 1677-12-01
+
 # verify date output types
 query TTT
 SELECT arrow_typeof(to_date(1)), arrow_typeof(to_date(null)), arrow_typeof(to_date('2023-01-10 12:34:56.000'))
@@ -300,6 +412,14 @@ select to_date('2022-01-23', '%Y-%m-%d');
 ----
 2022-01-23
 
+# invalid date_trunc format
+query error DataFusion error: Execution error: Unsupported date_trunc granularity: ''. Supported values are: microsecond, millisecond, second, minute, hour, day, week, month, quarter, year
+SELECT date_trunc('', to_date('2022-02-23', '%Y-%m-%d'))
+
+# invalid date_trunc format
+query error DataFusion error: Execution error: Unsupported date_trunc granularity: 'invalid'. Supported values are: microsecond, millisecond, second, minute, hour, day, week, month, quarter, year
+SELECT date_trunc('invalid', to_date('2022-02-23', '%Y-%m-%d'))
+
 query PPPP
 select
     date_trunc('YEAR', to_date('2022-02-23', '%Y-%m-%d')),
diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/datetime/interval.slt
similarity index 99%
rename from datafusion/sqllogictest/test_files/interval.slt
rename to datafusion/sqllogictest/test_files/datetime/interval.slt
index 1ef3048ddc66a..8c5a4382ed2c8 100644
--- a/datafusion/sqllogictest/test_files/interval.slt
+++ b/datafusion/sqllogictest/test_files/datetime/interval.slt
@@ -444,7 +444,7 @@ select '1 month'::interval + '1980-01-01T12:00:00'::timestamp;
 query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types
 select '1 month'::interval - '1980-01-01'::date;
 
-query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types
+query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types
 select '1 month'::interval - '1980-01-01T12:00:00'::timestamp;
 
 # interval (array) + date / timestamp (array)
@@ -466,7 +466,7 @@ select i + ts from t;
 query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types
 select i - d from t;
 
-query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types
+query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types
 select i - ts from t;
 
 # interval unit abreiviation and plurals
@@ -530,7 +530,7 @@ SELECT interval '5 day' hour
 query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types
 select '1 month'::interval - d from t;
 
-query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types
+query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types
 select '1 month'::interval - ts from t;
 
 # interval + date
diff --git a/datafusion/sqllogictest/test_files/interval_mysql.slt b/datafusion/sqllogictest/test_files/datetime/interval_mysql.slt
similarity index 100%
rename from datafusion/sqllogictest/test_files/interval_mysql.slt
rename to datafusion/sqllogictest/test_files/datetime/interval_mysql.slt
diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
similarity index 62%
rename from datafusion/sqllogictest/test_files/timestamps.slt
rename to datafusion/sqllogictest/test_files/datetime/timestamps.slt
index 44d0f1f97d4d5..875d7aa4c478d 100644
--- a/datafusion/sqllogictest/test_files/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
@@ -19,10 +19,10 @@
 ## Common timestamp data
 #
 # ts_data:        Int64 nanoseconds
-# ts_data_nanos:  Timestamp(Nanosecond, None)
-# ts_data_micros: Timestamp(Microsecond, None)
-# ts_data_millis: Timestamp(Millisecond, None)
-# ts_data_secs:   Timestamp(Second, None)
+# ts_data_nanos:  Timestamp(ns)
+# ts_data_micros: Timestamp(µs)
+# ts_data_millis: Timestamp(ms)
+# ts_data_secs:   Timestamp(s)
 ##########
 
 # Create timestamp tables with different precisions but the same logical values
@@ -34,16 +34,19 @@ create table ts_data(ts bigint, value int) as values
   (1599565349190855123, 3);
 
 statement ok
-create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(Nanosecond, None)') as ts, value from ts_data;
+create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(ns)') as ts, value from ts_data;
 
 statement ok
-create table ts_data_micros as select arrow_cast(ts / 1000, 'Timestamp(Microsecond, None)') as ts, value from ts_data;
+create table ts_data_micros as select arrow_cast(ts / 1000, 'Timestamp(µs)') as ts, value from ts_data;
 
 statement ok
-create table ts_data_millis as select arrow_cast(ts / 1000000, 'Timestamp(Millisecond, None)') as ts, value from ts_data;
+create table ts_data_millis as select arrow_cast(ts / 1000000, 'Timestamp(ms)') as ts, value from ts_data;
 
 statement ok
-create table ts_data_secs as select arrow_cast(ts / 1000000000, 'Timestamp(Second, None)') as ts, value from ts_data;
+create table ts_data_secs as select arrow_cast(ts / 1000000000, 'Timestamp(s)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_micros_kolkata as select arrow_cast(ts / 1000, 'Timestamp(Microsecond, Some("Asia/Kolkata"))') as ts, value from ts_data;
 
 
 ##########
@@ -73,6 +76,21 @@ true
 ##########
 ## Current time Tests
 ##########
+statement ok
+SET TIME ZONE = '+08'
+
+query T
+select arrow_typeof(now());
+----
+Timestamp(ns, "+08")
+
+query I
+SELECT count(1) result FROM (SELECT now() as n) a WHERE n > '2000-01-01'::date;
+----
+1
+
+statement ok
+SET TIME ZONE = '+00'
 
 query B
 select cast(now() as time) = current_time();
@@ -175,6 +193,117 @@ SELECT TIMESTAMPTZ '2000-01-01T01:01:01'
 ----
 2000-01-01T01:01:01Z
 
+statement ok
+RESET datafusion.execution.time_zone
+
+##########
+## cast tests
+##########
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1 AS float) AS timestamp(0))) AS t1,
+    (SELECT CAST(CAST(one AS float) AS timestamp(0)) FROM (SELECT 1 AS one)) AS t2,
+    (SELECT CAST(CAST(one AS float) AS timestamp(0)) FROM (VALUES (1)) t(one)) AS t3,
+    (SELECT CAST(CAST(1 AS double) AS timestamp(0))) AS t4,
+    (SELECT CAST(CAST(one AS double) AS timestamp(0)) FROM (SELECT 1 AS one)) AS t5,
+    (SELECT CAST(CAST(one AS double) AS timestamp(0)) FROM (VALUES (1)) t(one)) AS t6
+)
+----
+true 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1 AS float) AS timestamp(3))) AS t1,
+    (SELECT CAST(CAST(one AS float) AS timestamp(3)) FROM (SELECT 1 AS one)) AS t2,
+    (SELECT CAST(CAST(one AS float) AS timestamp(3)) FROM (VALUES (1)) t(one)) AS t3,
+    (SELECT CAST(CAST(1 AS double) AS timestamp(3))) AS t4,
+    (SELECT CAST(CAST(one AS double) AS timestamp(3)) FROM (SELECT 1 AS one)) AS t5,
+    (SELECT CAST(CAST(one AS double) AS timestamp(3)) FROM (VALUES (1)) t(one)) AS t6
+)
+----
+true 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1 AS float) AS timestamp(6))) AS t1,
+    (SELECT CAST(CAST(one AS float) AS timestamp(6)) FROM (SELECT 1 AS one)) AS t2,
+    (SELECT CAST(CAST(one AS float) AS timestamp(6)) FROM (VALUES (1)) t(one)) AS t3,
+    (SELECT CAST(CAST(1 AS double) AS timestamp(6))) AS t4,
+    (SELECT CAST(CAST(one AS double) AS timestamp(6)) FROM (SELECT 1 AS one)) AS t5,
+    (SELECT CAST(CAST(one AS double) AS timestamp(6)) FROM (VALUES (1)) t(one)) AS t6
+)
+----
+true 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1 AS float) AS timestamp(9))) AS t1,
+    (SELECT CAST(CAST(one AS float) AS timestamp(9)) FROM (SELECT 1 AS one)) AS t2,
+    (SELECT CAST(CAST(one AS float) AS timestamp(9)) FROM (VALUES (1)) t(one)) AS t3,
+    (SELECT CAST(CAST(1 AS double) AS timestamp(9))) AS t4,
+    (SELECT CAST(CAST(one AS double) AS timestamp(9)) FROM (SELECT 1 AS one)) AS t5,
+    (SELECT CAST(CAST(one AS double) AS timestamp(9)) FROM (VALUES (1)) t(one)) AS t6
+)
+----
+true 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1.125 AS float) AS timestamp(0))) AS t1,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(0)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t2,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(0)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t3,
+    (SELECT CAST(CAST(1.125 AS double) AS timestamp(0))) AS t4,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(0)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t5,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(0)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t6
+)
+----
+true 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01 1970-01-01T00:00:01
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1.125 AS float) AS timestamp(3))) AS t1,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(3)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t2,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(3)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t3,
+    (SELECT CAST(CAST(1.125 AS double) AS timestamp(3))) AS t4,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(3)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t5,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(3)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t6
+)
+----
+true 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001 1970-01-01T00:00:00.001
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1.125 AS float) AS timestamp(6))) AS t1,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(6)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t2,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(6)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t3,
+    (SELECT CAST(CAST(1.125 AS double) AS timestamp(6))) AS t4,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(6)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t5,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(6)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t6
+)
+----
+true 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000001
+
+query BPPPPPP
+SELECT t1 = t2 AND t1 = t3 AND t1 = t4 AND t1 = t5 AND t1 = t6, *
+FROM (SELECT
+    (SELECT CAST(CAST(1.125 AS float) AS timestamp(9))) AS t1,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(9)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t2,
+    (SELECT CAST(CAST(one_and_a_bit AS float) AS timestamp(9)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t3,
+    (SELECT CAST(CAST(1.125 AS double) AS timestamp(9))) AS t4,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(9)) FROM (SELECT 1.125 AS one_and_a_bit)) AS t5,
+    (SELECT CAST(CAST(one_and_a_bit AS double) AS timestamp(9)) FROM (VALUES (1.125)) t(one_and_a_bit)) AS t6
+)
+----
+true 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
+
 
 ##########
 ## to_timestamp tests
@@ -394,12 +523,12 @@ SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08 12
 query PPP
 SELECT to_timestamp(1.1) as c1, cast(1.1 as timestamp) as c2, 1.1::timestamp as c3;
 ----
-1970-01-01T00:00:01.100 1970-01-01T00:00:01.100 1970-01-01T00:00:01.100
+1970-01-01T00:00:01.100 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
 
 query PPP
 SELECT to_timestamp(-1.1) as c1, cast(-1.1 as timestamp) as c2, (-1.1)::timestamp as c3;
 ----
-1969-12-31T23:59:58.900 1969-12-31T23:59:58.900 1969-12-31T23:59:58.900
+1969-12-31T23:59:58.900 1969-12-31T23:59:59.999999999 1969-12-31T23:59:59.999999999
 
 query PPP
 SELECT to_timestamp(0.0) as c1, cast(0.0 as timestamp) as c2, 0.0::timestamp as c3;
@@ -409,24 +538,47 @@ SELECT to_timestamp(0.0) as c1, cast(0.0 as timestamp) as c2, 0.0::timestamp as
 query PPP
 SELECT to_timestamp(1.23456789) as c1, cast(1.23456789 as timestamp) as c2, 1.23456789::timestamp as c3;
 ----
-1970-01-01T00:00:01.234567890 1970-01-01T00:00:01.234567890 1970-01-01T00:00:01.234567890
+1970-01-01T00:00:01.234567890 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
 
 query PPP
 SELECT to_timestamp(123456789.123456789) as c1, cast(123456789.123456789 as timestamp) as c2, 123456789.123456789::timestamp as c3;
 ----
-1973-11-29T21:33:09.123456784 1973-11-29T21:33:09.123456784 1973-11-29T21:33:09.123456784
+1973-11-29T21:33:09.123456784 1970-01-01T00:00:00.123456789 1970-01-01T00:00:00.123456789
+
+## to_timestamp float vectorized inputs
+query PPP
+SELECT
+  to_timestamp(x) as c1,
+  cast(x as timestamp) as c2,
+  x::timestamp as c3
+FROM (
+  VALUES
+    (1.1),
+    (-1.1),
+    (0.0),
+    (1.23456789),
+    (123456789.123456789),
+    (NULL)
+) t(x);
+----
+1970-01-01T00:00:01.100 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
+1969-12-31T23:59:58.900 1969-12-31T23:59:59.999999999 1969-12-31T23:59:59.999999999
+1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00
+1970-01-01T00:00:01.234567890 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
+1973-11-29T21:33:09.123456784 1970-01-01T00:00:00.123456789 1970-01-01T00:00:00.123456789
+NULL NULL NULL
 
 # to_timestamp Decimal128 inputs
 
 query PPP
 SELECT to_timestamp(arrow_cast(1.1, 'Decimal128(2,1)')) as c1, cast(arrow_cast(1.1, 'Decimal128(2,1)') as timestamp) as c2, arrow_cast(1.1, 'Decimal128(2,1)')::timestamp as c3;
 ----
-1970-01-01T00:00:01.100 1970-01-01T00:00:01.100 1970-01-01T00:00:01.100
+1970-01-01T00:00:01.100 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
 
 query PPP
 SELECT to_timestamp(arrow_cast(-1.1, 'Decimal128(2,1)')) as c1, cast(arrow_cast(-1.1, 'Decimal128(2,1)') as timestamp) as c2, arrow_cast(-1.1, 'Decimal128(2,1)')::timestamp as c3;
 ----
-1969-12-31T23:59:58.900 1969-12-31T23:59:58.900 1969-12-31T23:59:58.900
+1969-12-31T23:59:58.900 1969-12-31T23:59:59.999999999 1969-12-31T23:59:59.999999999
 
 query PPP
 SELECT to_timestamp(arrow_cast(0.0, 'Decimal128(2,1)')) as c1, cast(arrow_cast(0.0, 'Decimal128(2,1)') as timestamp) as c2, arrow_cast(0.0, 'Decimal128(2,1)')::timestamp as c3;
@@ -436,12 +588,12 @@ SELECT to_timestamp(arrow_cast(0.0, 'Decimal128(2,1)')) as c1, cast(arrow_cast(0
 query PPP
 SELECT to_timestamp(arrow_cast(1.23456789, 'Decimal128(9,8)')) as c1, cast(arrow_cast(1.23456789, 'Decimal128(9,8)') as timestamp) as c2, arrow_cast(1.23456789, 'Decimal128(9,8)')::timestamp as c3;
 ----
-1970-01-01T00:00:01.234567890 1970-01-01T00:00:01.234567890 1970-01-01T00:00:01.234567890
+1970-01-01T00:00:01.234567890 1970-01-01T00:00:00.000000001 1970-01-01T00:00:00.000000001
 
 query PPP
 SELECT to_timestamp(arrow_cast(123456789.123456789, 'Decimal128(18,9)')) as c1, cast(arrow_cast(123456789.123456789, 'Decimal128(18,9)') as timestamp) as c2, arrow_cast(123456789.123456789, 'Decimal128(18,9)')::timestamp as c3;
 ----
-1973-11-29T21:33:09.123456784 1973-11-29T21:33:09.123456784 1973-11-29T21:33:09.123456784
+1973-11-29T21:33:09.123456784 1970-01-01T00:00:00.123456789 1970-01-01T00:00:00.123456789
 
 
 # from_unixtime
@@ -534,11 +686,7 @@ select date '1994-01-01' - interval '1' day as date;
 ----
 1993-12-31
 
-
 # cast_string_to_time()
-statement ok
-set datafusion.optimizer.skip_failed_rules = false
-
 query DDDD
 select
         time '08:09:10.123456789' as time_nano,
@@ -548,21 +696,17 @@ select
 ----
 08:09:10.123456789 13:14:15.123456 13:14:15.123 13:14:15
 
-query error Cannot cast string 'not a time' to value of Time64\(Nanosecond\) type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'not a time' to value of Time64\(ns\) type
 SELECT TIME 'not a time' as time;
 
 # invalid time
-query error Cannot cast string '24:01:02' to value of Time64\(Nanosecond\) type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string '24:01:02' to value of Time64\(ns\) type
 SELECT TIME '24:01:02' as time;
 
 # invalid timezone
 query error Arrow error: Parser error: Invalid timezone "ZZ": failed to parse timezone
 SELECT TIMESTAMP '2023-12-05T21:58:10.45ZZ';
 
-statement ok
-set datafusion.optimizer.skip_failed_rules = true
-
-
 # cast_to_timestamp_twice
 query P
 select to_timestamp(a) from (select to_timestamp(1) as a) A;
@@ -627,6 +771,18 @@ select to_timestamp_seconds(cast (1 as int));
 ## test date_bin function
 ##########
 
+# NULL stride should return NULL, not a planning error
+query P
+SELECT date_bin(NULL, TIMESTAMP '2023-01-01 12:30:00', TIMESTAMP '2023-01-01 12:00:00')
+----
+NULL
+
+# NULL stride should return NULL, not a planning error
+query P
+SELECT date_bin(NULL, TIMESTAMP '2023-01-01 12:30:00')
+----
+NULL
+
 # invalid second arg type
 query error
 SELECT DATE_BIN(INTERVAL '0 second', 25, TIMESTAMP '1970-01-01T00:00:00Z')
@@ -649,6 +805,81 @@ FROM (
   ) as t (time, val)
 group by time;
 
+query D
+SELECT DATE_BIN(INTERVAL '15 minutes', TIME '14:38:50', TIME '00:00:00')
+----
+14:30:00
+
+# Supports Month-Day-Nano nanosecond interval
+query D
+SELECT DATE_BIN(INTERVAL '10 nanoseconds', TIME '14:38:50.000000016', TIME '00:00:00')
+----
+14:38:50.000000010
+
+# Supports Month-Day-Nano nanosecond interval via fractions
+query D
+SELECT DATE_BIN(INTERVAL '0.000000010 seconds', TIME '14:38:50.000000016', TIME '00:00:00')
+----
+14:38:50.000000010
+
+# Supports Month-Day-Nano microsecond interval
+query D
+SELECT DATE_BIN(INTERVAL '5 microseconds', TIME '14:38:50.000006', TIME '00:00:00')
+----
+14:38:50.000005
+
+# stride by 7 days
+query error DataFusion error: Execution error: DATE_BIN stride for TIME input must be less than 1 day
+SELECT DATE_BIN(INTERVAL '7 days', TIME '14:38:50', TIME '00:00:00')
+
+# stride by 25 hours
+query error DataFusion error: Execution error: DATE_BIN stride for TIME input must be less than 1 day
+SELECT DATE_BIN(INTERVAL '25 hours', TIME '14:38:50', TIME '00:00:00')
+
+# stride by 23 hours, 59 minutes 59 seconds
+query D
+SELECT DATE_BIN(INTERVAL '23 hours 59 minutes 59 seconds', TIME '14:38:50', TIME '00:00:00')
+----
+00:00:00
+
+# mixed types
+query error Failed to coerce arguments to satisfy a call to 'date_bin' function:
+SELECT DATE_BIN(INTERVAL '23 hours', TIME '14:38:50', TIMESTAMP '2022-08-03 14:38:50.000000006Z')
+
+# mixed types
+query error Failed to coerce arguments to satisfy a call to 'date_bin' function:
+SELECT DATE_BIN(INTERVAL '23 hours', TIMESTAMP '2022-08-03 14:38:50.000000006Z', TIME '14:38:50')
+
+# Can coerce all string arguments
+query D
+SELECT DATE_BIN('15 minutes', '14:38:50'::time, '00:00:00'::time)
+----
+14:30:00
+
+# Call in two arguments (should be the same as the above query)
+query B
+SELECT DATE_BIN('15 minutes', '14:38:50'::time) = DATE_BIN('15 minutes', '14:38:50'::time, '00:00:00'::time)
+----
+true
+
+# Shift forward by 5 minutes
+query D
+SELECT DATE_BIN(INTERVAL '15 minutes', TIME '14:38:50', TIME '00:05:00')
+----
+14:35:00
+
+# Shift backward by 5 minutes
+query D
+SELECT DATE_BIN(INTERVAL '15 minutes', TIME '14:38:50', TIME '23:55:00')
+----
+14:25:00
+
+# origin after source, TIME in previous bucket
+query D
+SELECT DATE_BIN(INTERVAL '15 minutes', TIME '14:38:50', TIME '14:40:00')
+----
+14:25:00
+
 query P
 SELECT DATE_BIN(INTERVAL '15 minutes', TIMESTAMP '2022-08-03 14:38:50Z', TIMESTAMP '1970-01-01T00:00:00Z')
 ----
@@ -769,7 +1000,7 @@ from (values
 query T
 SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_micros(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z'))
 ----
-Timestamp(Microsecond, None)
+Timestamp(µs)
 
 query P
 SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')
@@ -787,7 +1018,7 @@ from (values
 query T
 SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z'))
 ----
-Timestamp(Millisecond, None)
+Timestamp(ms)
 
 query P
 SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')
@@ -805,7 +1036,7 @@ from (values
 query T
 SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z'))
 ----
-Timestamp(Second, None)
+Timestamp(s)
 
 # month interval with INTERVAL keyword in date_bin with default start time
 query P
@@ -1360,13 +1591,13 @@ second 2020-09-08T13:42:29
 
 # test date trunc on different timestamp scalar types and ensure they are consistent
 query P rowsort
-SELECT DATE_TRUNC('second', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Second, None)')) as ts
+SELECT DATE_TRUNC('second', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(s)')) as ts
   UNION ALL
-SELECT DATE_TRUNC('second', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Nanosecond, None)')) as ts
+SELECT DATE_TRUNC('second', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(ns)')) as ts
   UNION ALL
-SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Microsecond, None)')) as ts
+SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(µs)')) as ts
   UNION ALL
-SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Millisecond, None)')) as ts
+SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(ms)')) as ts
 ----
 2023-08-03T00:00:00
 2023-08-03T00:00:00
@@ -1401,24 +1632,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to berlin
 query PT
 select ts, arrow_typeof(ts) from timestamp_utc order by ts;
 ----
-2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T00:00:00Z Timestamp(ns, "UTC")
+2024-10-27T00:30:00Z Timestamp(ns, "UTC")
+2024-10-27T01:30:00Z Timestamp(ns, "UTC")
+2024-10-27T02:00:00Z Timestamp(ns, "UTC")
+2024-10-27T02:30:00Z Timestamp(ns, "UTC")
+2024-10-27T03:00:00Z Timestamp(ns, "UTC")
+2024-10-27T03:30:00Z Timestamp(ns, "UTC")
 
 query PT
 select ts, arrow_typeof(ts) from timestamp_berlin order by ts;
 ----
-2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
-2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T02:00:00+02:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T02:30:00+02:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T02:30:00+01:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T03:00:00+01:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T03:30:00+01:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T04:00:00+01:00 Timestamp(ns, "Europe/Berlin")
+2024-10-27T04:30:00+01:00 Timestamp(ns, "Europe/Berlin")
 
 #  date trunc in utc with DST
 query PPPP
@@ -1485,24 +1716,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo
 query PT
 select ts, arrow_typeof(ts) from timestamp_utc order by ts;
 ----
-2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC"))
-2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T01:00:00Z Timestamp(ns, "UTC")
+2018-11-04T01:30:00Z Timestamp(ns, "UTC")
+2018-11-04T02:30:00Z Timestamp(ns, "UTC")
+2018-11-04T03:00:00Z Timestamp(ns, "UTC")
+2018-11-04T03:30:00Z Timestamp(ns, "UTC")
+2018-11-04T04:00:00Z Timestamp(ns, "UTC")
+2018-11-04T04:30:00Z Timestamp(ns, "UTC")
 
 query PT
 select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts;
 ----
-2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
-2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-03T22:00:00-03:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-03T22:30:00-03:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-03T23:30:00-03:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-04T01:00:00-02:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-04T01:30:00-02:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-04T02:00:00-02:00 Timestamp(ns, "America/Sao_Paulo")
+2018-11-04T02:30:00-02:00 Timestamp(ns, "America/Sao_Paulo")
 
 #  date trunc in utc with DST
 query PPPP
@@ -1548,6 +1779,30 @@ SELECT DATE_TRUNC('second', '2022-08-03 14:38:50Z');
 ----
 2022-08-03T14:38:50
 
+# DATE_TRUNC handling of times before the unix epoch (issue 18334)
+query PPPPPPPPPPP
+SELECT
+    d,
+    DATE_TRUNC('year', d),
+    DATE_TRUNC('quarter', d),
+    DATE_TRUNC('month', d),
+    DATE_TRUNC('week', d),
+    DATE_TRUNC('day', d),
+    DATE_TRUNC('hour', d),
+    DATE_TRUNC('minute', d),
+    DATE_TRUNC('second', d),
+    DATE_TRUNC('millisecond', d),
+    DATE_TRUNC('microsecond', d),
+FROM (VALUES
+    (TIMESTAMP '1900-06-15 07:09:00'),
+    (TIMESTAMP '1970-01-01 00:00:00'),
+    (TIMESTAMP '2024-12-31 23:39:01.123456789')
+) AS t(d);
+----
+1900-06-15T07:09:00 1900-01-01T00:00:00 1900-04-01T00:00:00 1900-06-01T00:00:00 1900-06-11T00:00:00 1900-06-15T00:00:00 1900-06-15T07:00:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00
+1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1969-12-29T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00
+2024-12-31T23:39:01.123456789 2024-01-01T00:00:00 2024-10-01T00:00:00 2024-12-01T00:00:00 2024-12-30T00:00:00 2024-12-31T00:00:00 2024-12-31T23:00:00 2024-12-31T23:39:00 2024-12-31T23:39:01 2024-12-31T23:39:01.123 2024-12-31T23:39:01.123456
+
 # Test that interval can add a timestamp
 query P
 SELECT timestamp '2013-07-01 12:00:00' + INTERVAL '8' DAY;
@@ -1658,7 +1913,7 @@ SELECT ts1 + i FROM foo;
 2003-07-12T01:31:15.000123463
 
 # Timestamp + Timestamp => error
-query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\)
+query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(ns\) \+ Timestamp\(ns\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(ns\) \+ Timestamp\(ns\)
 SELECT ts1 + ts2
 FROM foo;
 
@@ -1710,27 +1965,6 @@ true false true true
 
 
 
-##########
-## Common timestamp data
-##########
-
-statement ok
-drop table ts_data
-
-statement ok
-drop table ts_data_nanos
-
-statement ok
-drop table ts_data_micros
-
-statement ok
-drop table ts_data_millis
-
-statement ok
-drop table ts_data_secs
-
-
-
 ##########
 ## Timezone impact on scalar functions
 #
@@ -2105,19 +2339,19 @@ SET TIME ZONE = '+05:00'
 
 statement ok
 CREATE TABLE foo (time TIMESTAMPTZ) AS VALUES
-    ('2020-01-01T00:00:00+05:00'), 
+    ('2020-01-01T00:00:00+05:00'),
     ('2020-01-01T01:00:00+05:00'),
     ('2020-01-01T02:00:00+05:00'),
     ('2020-01-01T03:00:00+05:00')
 
 statement ok
-SET TIME ZONE = '+00'
+RESET datafusion.execution.time_zone
 
 # verify column type
 query T
 SELECT arrow_typeof(time) FROM foo LIMIT 1
 ----
-Timestamp(Nanosecond, Some("+05:00"))
+Timestamp(ns, "+05:00")
 
 # check date_trunc
 query P
@@ -2132,27 +2366,80 @@ SELECT date_trunc('day', time) FROM foo
 query T
 SELECT arrow_typeof(date_trunc('day', time)) FROM foo LIMIT 1
 ----
-Timestamp(Nanosecond, Some("+05:00"))
+Timestamp(ns, "+05:00")
 
 query T
 select arrow_typeof(date_trunc('minute', to_timestamp_seconds(61)))
 ----
-Timestamp(Second, None)
+Timestamp(s)
 
 query T
 select arrow_typeof(date_trunc('second', to_timestamp_millis(61)))
 ----
-Timestamp(Millisecond, None)
+Timestamp(ms)
 
 query T
 select arrow_typeof(date_trunc('millisecond', to_timestamp_micros(61)))
 ----
-Timestamp(Microsecond, None)
+Timestamp(µs)
 
 query T
 select arrow_typeof(date_trunc('microsecond', to_timestamp(61)))
 ----
-Timestamp(Nanosecond, None)  
+Timestamp(ns)
+
+##########
+## date_trunc with Time types
+##########
+
+# Truncate time to hour
+query D
+SELECT date_trunc('hour', TIME '14:30:45');
+----
+14:00:00
+
+# Truncate time to minute
+query D
+SELECT date_trunc('minute', TIME '14:30:45');
+----
+14:30:00
+
+# Truncate time to second (removes fractional seconds)
+query D
+SELECT date_trunc('second', TIME '14:30:45.123456789');
+----
+14:30:45
+
+# Truncate time to millisecond
+query D
+SELECT date_trunc('millisecond', TIME '14:30:45.123456789');
+----
+14:30:45.123
+
+# Truncate time to microsecond
+query D
+SELECT date_trunc('microsecond', TIME '14:30:45.123456789');
+----
+14:30:45.123456
+
+# Return type should be Time64(ns)
+query T
+SELECT arrow_typeof(date_trunc('hour', TIME '14:30:45'));
+----
+Time64(ns)
+
+# Error for granularities not valid for Time types
+query error date_trunc does not support 'day' granularity for Time types
+SELECT date_trunc('day', TIME '14:30:45');
+
+query error date_trunc does not support 'week' granularity for Time types
+SELECT date_trunc('week', TIME '14:30:45');
+
+query error date_trunc does not support 'month' granularity for Time types
+SELECT date_trunc('month', TIME '14:30:45');
+
+query error date_trunc does not support 'year' granularity for Time types
+SELECT date_trunc('year', TIME '14:30:45');
 
 # check date_bin
 query P
@@ -2167,7 +2454,7 @@ SELECT date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00') FROM foo
 query T
 SELECT arrow_typeof(date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00')) FROM foo LIMIT 1
 ----
-Timestamp(Nanosecond, Some("+05:00"))
+Timestamp(ns, "+05:00")
 
 
 # timestamp comparison with and without timezone
@@ -2190,17 +2477,17 @@ NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:5
 # verify timestamp syntax styles are consistent
 query BBBBBBBBBBBBB
 SELECT to_timestamp(null) is null as c1,
-       null::timestamp is null as c2, 
-       cast(null as timestamp) is null as c3, 
-       to_timestamp(0) = 0::timestamp as c4, 
-       to_timestamp(1926632005) = 1926632005::timestamp as c5, 
-       to_timestamp(1) = 1::timestamp as c6, 
-       to_timestamp(-1) = -1::timestamp as c7, 
+       null::timestamp is null as c2,
+       cast(null as timestamp) is null as c3,
+       to_timestamp(0) = 0::timestamp as c4,
+       to_timestamp(1926632005) = 1926632005::timestamp as c5,
+       to_timestamp(1) = 1::timestamp as c6,
+       to_timestamp(-1) = -1::timestamp as c7,
        to_timestamp(0-1) = (0-1)::timestamp as c8,
-       to_timestamp(0) = cast(0 as timestamp) as c9, 
-       to_timestamp(1926632005) = cast(1926632005 as timestamp) as c10, 
-       to_timestamp(1) = cast(1 as timestamp) as c11, 
-       to_timestamp(-1) = cast(-1 as timestamp) as c12, 
+       to_timestamp(0) = cast(0 as timestamp) as c9,
+       to_timestamp(1926632005) = cast(1926632005 as timestamp) as c10,
+       to_timestamp(1) = cast(1 as timestamp) as c11,
+       to_timestamp(-1) = cast(-1 as timestamp) as c12,
        to_timestamp(0-1) = cast(0-1 as timestamp) as c13
 ----
 true true true true true true true true true true true true true
@@ -2209,14 +2496,14 @@ true true true true true true true true true true true true true
 query TTT
 SELECT arrow_typeof(to_timestamp(1)), arrow_typeof(to_timestamp(null)), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000'))
 ----
-Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None)
+Timestamp(ns) Timestamp(ns) Timestamp(ns)
 
 # verify timestamp output types using timestamp literal syntax
 query BBBBBB
-SELECT arrow_typeof(to_timestamp(1)) = arrow_typeof(1::timestamp) as c1, 
+SELECT arrow_typeof(to_timestamp(1)) = arrow_typeof(1::timestamp) as c1,
        arrow_typeof(to_timestamp(null)) = arrow_typeof(null::timestamp) as c2,
        arrow_typeof(to_timestamp('2023-01-10 12:34:56.000')) = arrow_typeof('2023-01-10 12:34:56.000'::timestamp) as c3,
-       arrow_typeof(to_timestamp(1)) = arrow_typeof(cast(1 as timestamp)) as c4, 
+       arrow_typeof(to_timestamp(1)) = arrow_typeof(cast(1 as timestamp)) as c4,
        arrow_typeof(to_timestamp(null)) = arrow_typeof(cast(null as timestamp)) as c5,
        arrow_typeof(to_timestamp('2023-01-10 12:34:56.000')) = arrow_typeof(cast('2023-01-10 12:34:56.000' as timestamp)) as c6
 ----
@@ -2225,7 +2512,7 @@ true true true true true true
 # known issues. currently overflows (expects default precision to be microsecond instead of nanoseconds. Work pending)
 #verify extreme values
 #query PPPPPPPP
-#SELECT to_timestamp(-62125747200), to_timestamp(1926632005177), -62125747200::timestamp, 1926632005177::timestamp, cast(-62125747200 as timestamp), cast(1926632005177 as timestamp)
+#SELECT to_timestamp(-62125747200), to_timestamp(1926632005177), -62125747200::timestamp as t1, 1926632005177::timestamp, cast(-62125747200 as timestamp), cast(1926632005177 as timestamp) as t2
 #----
 #0001-04-25T00:00:00 +63022-07-16T12:59:37 0001-04-25T00:00:00 +63022-07-16T12:59:37 0001-04-25T00:00:00 +63022-07-16T12:59:37
 
@@ -2245,7 +2532,7 @@ NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:5
 query TTT
 SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f'))
 ----
-Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None)
+Timestamp(ns) Timestamp(ns) Timestamp(ns)
 
 # to_timestamp with invalid formatting
 query error input contains invalid characters
@@ -2431,7 +2718,7 @@ drop table ts_utf8_data
 ##########
 
 query B
-select arrow_cast(now(), 'Date64') < arrow_cast('2022-02-02 02:02:02', 'Timestamp(Nanosecond, None)');
+select arrow_cast(now(), 'Date64') < arrow_cast('2022-02-02 02:02:02', 'Timestamp(ns)');
 ----
 false
 
@@ -2460,13 +2747,13 @@ drop table table_a
 ##########
 
 statement ok
-create table table_a (ts timestamp) as values 
-    ('2020-09-08T11:42:29Z'::timestamp), 
+create table table_a (ts timestamp) as values
+    ('2020-09-08T11:42:29Z'::timestamp),
     ('2020-09-08T12:42:29Z'::timestamp),
     ('2020-09-08T13:42:29Z'::timestamp)
 
 statement ok
-create table table_b (ts timestamp) as values 
+create table table_b (ts timestamp) as values
     ('2020-09-08T11:42:29.190Z'::timestamp),
     ('2020-09-08T13:42:29.190Z'::timestamp),
     ('2020-09-08T12:42:29.190Z'::timestamp)
@@ -2551,8 +2838,8 @@ SELECT t1.ts, t1.ts + INTERVAL '1' SECOND FROM t1;
 query PT
 SELECT t1.ts::timestamptz, arrow_typeof(t1.ts::timestamptz) FROM t1;
 ----
-2018-07-01T06:00:00Z Timestamp(Nanosecond, Some("+00"))
-2018-07-01T07:00:00Z Timestamp(Nanosecond, Some("+00"))
+2018-07-01T06:00:00 Timestamp(ns)
+2018-07-01T07:00:00 Timestamp(ns)
 
 query D
 SELECT 0::TIME
@@ -2588,8 +2875,8 @@ statement ok
 drop table t1
 
 statement ok
-create table table_a (val int, ts1 timestamp, ts2 timestamp) as values 
-    (1, '2018-07-01T06:00:00'::timestamp, '2018-07-01T07:00:00'::timestamp), 
+create table table_a (val int, ts1 timestamp, ts2 timestamp) as values
+    (1, '2018-07-01T06:00:00'::timestamp, '2018-07-01T07:00:00'::timestamp),
     (2, '2018-07-01T07:00:00'::timestamp, '2018-07-01T08:00:00'::timestamp)
 
 query I?
@@ -2702,8 +2989,12 @@ select make_date(t.year, t.month, '4') from table_nums t;
 statement ok
 insert into table_nums values (2024, null, 23);
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 0, 23
+query D
 select make_date(t.year, t.month, t.day) from table_nums t;
+----
+2024-01-23
+2023-11-30
+NULL
 
 statement ok
 drop table table_nums;
@@ -2722,701 +3013,2381 @@ select make_date(t.year, t.month, t.day) from table_strings t;
 statement ok
 insert into table_strings values (2024, null, 23);
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 0, 23
+query D
 select make_date(t.year, t.month, t.day) from table_strings t;
+----
+2024-01-23
+2023-11-30
+NULL
 
 statement ok
 drop table table_strings;
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 13, 23
+query error DataFusion error: Execution error: Month value '13' is out of range
 select make_date(2024, 13, 23);
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 1, 32
-select make_date(2024, 01, 32);
+query error DataFusion error: Execution error: Day value '32' is out of range
+select make_date(2024, 1, 32);
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 0, 23
+query error DataFusion error: Execution error: Month value '0' is out of range
 select make_date(2024, 0, 23);
 
 query error DataFusion error: Execution error: Month value '\-1' is out of range
 select make_date(2024, -1, 23);
 
-query error DataFusion error: Execution error: Unable to parse date from 2024, 12, 0
+query error DataFusion error: Execution error: Day value '0' is out of range
 select make_date(2024, 12, 0);
 
-query error DataFusion error: Execution error: Day value '\-1' is out of range
+query error DataFusion error: Execution error: Month value '13' is out of range
 select make_date(2024, 13, -1);
 
-query error DataFusion error: Execution error: Unable to parse date from null/empty value
+query D
 select make_date(null, 1, 23);
+----
+NULL
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string '' to value of Int32 type
+query error Cast error: Cannot cast string '' to value of Int32 type
 select make_date('', 1, 23);
 
-query error DataFusion error: Execution error: Unable to parse date from null/empty value
+query D
 select make_date(2024, null, 23);
+----
+NULL
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string '' to value of Int32 type
+query error Arrow error: Cast error: Cannot cast string '' to value of Int32 type
 select make_date(2024, '', 27);
 
-query error DataFusion error: Execution error: Unable to parse date from null/empty value
+query D
 select make_date(2024, 1, null);
+----
+NULL
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string '' to value of Int32 type
+query error Arrow error: Cast error: Cannot cast string '' to value of Int32 type
 select make_date(2024, 1, '');
 
+query error DataFusion error: Execution error: Unable to parse date from 2024, 11, 31
+select make_date(2024, 11, 31);
+
+query D
+select make_date(null, 1::bigint, 14::bigint unsigned);
+----
+NULL
+
+query error DataFusion error: Error during planning: Function 'make_date' expects 3 arguments but received 1
+select make_date(1);
+
+query error DataFusion error: Error during planning: Function 'make_date' requires Int32, but received Interval\(MonthDayNano\) \(DataType: Interval\(MonthDayNano\)\).
+select make_date(interval '1 day', '2001-05-21'::timestamp, '2001-05-21'::timestamp);
 
 ##########
-## to_char tests
+## make time tests
 ##########
 
-statement ok
-create table formats (
-    dates date,
-    times time,
-    timestamps timestamp,
-    date_format varchar,
-    time_format varchar,
-    timestamp_format varchar)
-as values
-    ('2000-01-01'::date, '23:45:01'::time, '2024-01-01 06:00:00'::timestamp, '%d:%m:%Y', '%H-%M-%S', '%d:%m:%Y %H-%M-%S'),
-    ('2003-04-05'::date, '04:56:32'::time, '2025-01-01 23:59:58'::timestamp, '%d:%m:%Y', '%H::%M::%S', '%d:%m:%Y %H-%M-%S');
+query D
+select make_time(22, 1, 27);
+----
+22:01:27
 
+query D
+select make_time(22, 0, 0);
+----
+22:00:00
 
-query T
-select to_char(dates, date_format) from formats;
+query D
+select make_time(0, 0, 0);
 ----
-01:01:2000
-05:04:2003
+00:00:00
 
-query T
-select date_format(dates, date_format) from formats;
+query D
+select make_time(22, 2, 29);
 ----
-01:01:2000
-05:04:2003
+22:02:29
 
-query T
-select to_char(times, time_format) from formats;
+query D
+select make_time(null, null, null);
 ----
-23-45-01
-04::56::32
+NULL
 
-query T
-select to_char(timestamps, date_format) from formats;
+query D
+select make_time(null, 1, 23);
 ----
-01:01:2024
-01:01:2025
+NULL
 
-query T
-select to_char(timestamps, timestamp_format) from formats;
+query D
+select make_time(22, null, 23);
 ----
-01:01:2024 06-00-00
-01:01:2025 23-59-58
+NULL
 
-query T
-select to_char('2000-02-03'::date, '%Y:%d:%m');
+query D
+select make_time(22, 1, null);
 ----
-2000:03:02
+NULL
 
-query T
-select to_char(arrow_cast(12345::int, 'Time32(Second)'), '%H-%M-%S')
+query D
+select make_time('22', '01', '27');
 ----
-03-25-45
+22:01:27
 
-query T
-select to_char(arrow_cast(12344567::int, 'Time32(Millisecond)'), '%H-%M-%S %f')
+query D
+select make_time(12 + 11, '01', '27');
 ----
-03-25-44 567000000
+23:01:27
 
-query T
-select to_char(arrow_cast(12344567000, 'Time64(Microsecond)'), '%H-%M-%S %f')
+query D
+select make_time(22::tinyint, 01::tinyint, 27::tinyint);
 ----
-03-25-44 567000000
+22:01:27
 
-query T
-select to_char(arrow_cast(12344567890000, 'Time64(Nanosecond)'), '%H-%M-%S %f')
+query D
+select make_time(22::smallint, 01::smallint, 27::smallint);
 ----
-03-25-44 567890000
+22:01:27
 
-query T
-select to_char(arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Second, None)'), '%d-%m-%Y %H-%M-%S')
+query D
+select make_time(22::int, 01::int, 27::int);
 ----
-03-08-2023 14-38-50
+22:01:27
 
-query T
-select to_char(arrow_cast('2023-09-04'::date, 'Timestamp(Second, Some("UTC"))'), '%Y-%m-%dT%H:%M:%S%.3f');
+query D
+select make_time(22::bigint, 01::bigint, 27::bigint);
 ----
-2023-09-04T00:00:00.000
+22:01:27
 
-query T
-select to_char(arrow_cast(123456, 'Duration(Second)'), 'pretty');
+query D
+select make_time(22::tinyint unsigned, 01::tinyint unsigned, 27::tinyint unsigned);
 ----
-1 days 10 hours 17 mins 36 secs
+22:01:27
 
-query T
-select to_char(arrow_cast(123456, 'Duration(Second)'), 'iso8601');
+query D
+select make_time(22::smallint unsigned, 01::smallint unsigned, 27::smallint unsigned);
 ----
-PT123456S
+22:01:27
 
-query T
-select to_char(arrow_cast(123456, 'Duration(Second)'), null);
+query D
+select make_time(22::int unsigned, 01::int unsigned, 27::int unsigned);
 ----
-NULL
+22:01:27
 
-query error DataFusion error: Execution error: Cast error: Format error
-SELECT to_char(timestamps, '%X%K') from formats;
+query D
+select make_time(22::bigint unsigned, 01::bigint unsigned, 27::bigint unsigned);
+----
+22:01:27
 
-query error DataFusion error: Execution error: Cast error: Format error
-SELECT to_char('2000-02-03'::date, '%X%K');
+query D
+select make_time(arrow_cast(22, 'Int32'), arrow_cast(1, 'Int32'), arrow_cast(27, 'Int32'));
+----
+22:01:27
 
-query T
-SELECT to_char(timestamps, null) from formats;
+query D
+select make_time(arrow_cast(22, 'Int64'), arrow_cast(1, 'Int64'), arrow_cast(27, 'Int64'));
 ----
-NULL
-NULL
+22:01:27
 
+query D
+select make_time(arrow_cast('22', 'Utf8'), arrow_cast('1', 'Utf8'), arrow_cast('27', 'Utf8'));
+----
+22:01:27
+
+query D
+select make_time(arrow_cast('22', 'Utf8View'), arrow_cast('1', 'Utf8View'), arrow_cast('27', 'Utf8View'));
+----
+22:01:27
+
+query D
+select make_time(arrow_cast('22', 'LargeUtf8'), arrow_cast('1', 'LargeUtf8'), arrow_cast('27', 'LargeUtf8'));
+----
+22:01:27
+
+query D
+select make_time(22, arrow_cast('1', 'Int64'), arrow_cast('27', 'UInt32'));
+----
+22:01:27
+
+query D
+select make_time(22, arrow_cast('1', 'UInt64'), arrow_cast('27', 'UInt32'));
+----
+22:01:27
+
+query D
+select make_time(arrow_cast('22', 'Utf8'), arrow_cast('1', 'LargeUtf8'), arrow_cast('27', 'Utf8'));
+----
+22:01:27
+
+query D
+select make_time(22, arrow_cast('1', 'LargeUtf8'), arrow_cast('27', 'Utf8'));
+----
+22:01:27
+
+query error Can't cast value 18446744073709551615 to type Int32
+select make_time(22, 18446744073709551615, 27);
+
+query T
+select arrow_typeof(make_time(22, 1, 27));
+----
+Time32(s)
+
+statement ok
+create table table_nums (hour int, minute int, second int) as values
+    (22, 1, 23),
+    (20, 11, 30);
+
+query D
+select make_time(t.hour, t.minute, t.second) from table_nums t;
+----
+22:01:23
+20:11:30
+
+query D
+select make_time(21, t.minute, t.second) from table_nums t;
+----
+21:01:23
+21:11:30
+
+query D
+select make_time(t.hour, 3, t.second) from table_nums t;
+----
+22:03:23
+20:03:30
+
+query D
+select make_time(t.hour, t.minute, 4) from table_nums t;
+----
+22:01:04
+20:11:04
+
+query D
+select make_time('21', t.minute, t.second) from table_nums t;
+----
+21:01:23
+21:11:30
+
+query D
+select make_time(t.hour, '3', t.second) from table_nums t;
+----
+22:03:23
+20:03:30
+
+query D
+select make_time(t.hour, t.minute, '4') from table_nums t;
+----
+22:01:04
+20:11:04
+
+statement ok
+insert into table_nums values (25, null, 77);
+
+query D
+select make_time(t.hour, t.minute, t.second) from table_nums t;
+----
+22:01:23
+20:11:30
+NULL
+
+statement ok
+drop table table_nums;
+
+statement ok
+create table table_strings (hour varchar(4), minute varchar(2), second varchar(2)) as values
+    ('22', '1', '23'),
+    ('23', '11', '30');
+
+query D
+select make_time(t.hour, t.minute, t.second) from table_strings t;
+----
+22:01:23
+23:11:30
+
+statement ok
+insert into table_strings values ('33', null, '23');
+
+query D
+select make_time(t.hour, t.minute, t.second) from table_strings t;
+----
+22:01:23
+23:11:30
+NULL
+
+statement ok
+insert into table_strings values ('33', '12', '23');
+
+query error DataFusion error: Execution error: Hour value '33' is out of range
+select make_time(t.hour, t.minute, t.second) from table_strings t;
+
+statement ok
+drop table table_strings;
+
+query error Function 'make_time' expects 3 arguments but received 1
+select make_time(22);
+
+query error Function 'make_time' expects 3 arguments but received 2
+select make_time(22, 22);
+
+query error DataFusion error: Execution error: Hour value '26' is out of range
+select make_time(26, 13, 23);
+
+query error DataFusion error: Execution error: Second value '62' is out of range
+select make_time(22, 01, 62);
+
+query error DataFusion error: Execution error: Minute value '64' is out of range
+select make_time(22, 64, 23);
+
+query error DataFusion error: Execution error: Hour value '-1' is out of range
+select make_time(-1, 12, 0);
+
+query error DataFusion error: Execution error: Minute value '-1' is out of range
+select make_time(22, -1, 23);
+
+query error DataFusion error: Execution error: Second value '-1' is out of range
+select make_time(22, 13, -1);
+
+query error Cannot cast string '' to value of Int32 type
+select make_time('', 1, 23);
+
+query error Cannot cast string '' to value of Int32 type
+select make_time(22, '', 27);
+
+query error Cannot cast string '' to value of Int32 type
+select make_time(22, 1, '');
+
+query error DataFusion error: Error during planning: Function 'make_time' requires Int32, but received Float64 \(DataType: Float64\)
+select make_time(arrow_cast(22, 'Float64'), 1, '');
+
+##########
+## to_time tests
+##########
+
+# Basic time parsing
+
+query D
+select to_time('12:30:45');
+----
+12:30:45
+
+query D
+select to_time('00:00:00');
+----
+00:00:00
+
+query D
+select to_time('23:59:59');
+----
+23:59:59
+
+query D
+select to_time('08:15:30');
+----
+08:15:30
+
+# Time with fractional seconds
+
+query D
+select to_time('12:30:45.123');
+----
+12:30:45.123
+
+query D
+select to_time('12:30:45.123456789');
+----
+12:30:45.123456789
+
+# Time with custom format
+
+query D
+select to_time('12-30-45', '%H-%M-%S');
+----
+12:30:45
+
+query D
+select to_time('14/25/30', '%H/%M/%S');
+----
+14:25:30
+
+query D
+select to_time('02:30:45 PM', '%I:%M:%S %p');
+----
+14:30:45
+
+# Null handling
+
+query D
+select to_time(null);
+----
+NULL
+
+# Return type check
+
+query T
+select arrow_typeof(to_time('12:30:45'));
+----
+Time64(ns)
+
+# Table input
+
+statement ok
+create table time_strings (time_str varchar) as values
+    ('12:30:45'),
+    ('23:59:59'),
+    ('00:00:00');
+
+query D
+select to_time(time_str) from time_strings;
+----
+12:30:45
+23:59:59
+00:00:00
+
+statement ok
+drop table time_strings;
+
+# Error cases
+
+query error Error parsing 'not_a_time' as time
+select to_time('not_a_time');
+
+query error Error parsing '25:00:00' as time
+select to_time('25:00:00');
+
+# Out of range minutes
+query error Error parsing '12:60:00' as time
+select to_time('12:60:00');
+
+# Out of range seconds (61 is invalid, 60 is allowed as leap second)
+query error Error parsing '12:30:61' as time
+select to_time('12:30:61');
+
+query error does not support zero arguments
+select to_time();
+
+# StringView type support
+
+query D
+select to_time(arrow_cast('08:15:30', 'Utf8View'));
+----
+08:15:30
+
+# LargeUtf8 type support
+
+query D
+select to_time(arrow_cast('14:45:00', 'LargeUtf8'));
+----
+14:45:00
+
+# HH:MM default parsing (no seconds)
+
+query D
+select to_time('14:30');
+----
+14:30:00
+
+query D
+select to_time('09:05');
+----
+09:05:00
+
+query D
+select to_time('00:00');
+----
+00:00:00
+
+# Timestamp input - extract time portion
+
+query D
+select to_time(to_timestamp('2024-01-15 14:30:45'));
+----
+14:30:45
+
+query D
+select to_time('2024-03-20 09:15:30'::timestamp);
+----
+09:15:30
+
+query D
+select to_time('2024-06-15 23:59:59.123456789'::timestamp);
+----
+23:59:59.123456789
+
+query D
+select to_time('2024-01-01 00:00:00'::timestamp);
+----
+00:00:00
+
+# Timestamp with timezone
+
+query D
+select to_time(to_timestamp('2024-01-15T14:30:45+00:00'));
+----
+14:30:45
+
+# Null timestamp
+
+query D
+select to_time(null::timestamp);
+----
+NULL
+
+# Return type check with timestamp input
+
+query T
+select arrow_typeof(to_time(to_timestamp('2024-01-15 12:30:45')));
+----
+Time64(ns)
+
+# Timestamp with timezone offset - to_timestamp parses and normalizes to UTC
+# 14:30:45 in UTC-5 = 19:30:45 in UTC, and Arrow stores as UTC internally
+
+query D
+select to_time(to_timestamp('2024-01-15T14:30:45-05:00'));
+----
+19:30:45
+
+# Timestamp without timezone - time portion is extracted as-is
+
+query D
+select to_time('2024-03-20 09:15:30'::timestamp);
+----
+09:15:30
+
+# Timestamp with timezone (timestamptz) - time is extracted as stored
+# Note: AT TIME ZONE labels the naive timestamp but doesn't convert the time value
+
+query D
+select to_time('2024-03-20 09:15:30'::timestamp AT TIME ZONE 'America/Los_Angeles');
+----
+09:15:30
+
+# Timestamp before epoch (1969-12-31 23:30:00 UTC)
+
+query D
+select to_time(to_timestamp('1969-12-31T23:30:00+00:00'));
+----
+23:30:00
+
+query D
+select to_time(to_timestamp('1960-06-15T08:45:30+00:00'));
+----
+08:45:30
+
+##########
+## to_char tests
+##########
+
+statement ok
+create table formats (
+    dates date,
+    times time,
+    timestamps timestamp,
+    date_format varchar,
+    time_format varchar,
+    timestamp_format varchar)
+as values
+    ('2000-01-01'::date, '23:45:01'::time, '2024-01-01 06:00:00'::timestamp, '%d:%m:%Y', '%H-%M-%S', '%d:%m:%Y %H-%M-%S'),
+    ('2003-04-05'::date, '04:56:32'::time, '2025-01-01 23:59:58'::timestamp, '%d:%m:%Y', '%H::%M::%S', '%d:%m:%Y %H-%M-%S');
+
+
+query T
+select to_char(dates, date_format) from formats;
+----
+01:01:2000
+05:04:2003
+
+query T
+select date_format(dates, date_format) from formats;
+----
+01:01:2000
+05:04:2003
+
+query T
+select date_format(dates, time_format) from formats;
+----
+00-00-00
+00::00::00
+
+query T
+select date_format(dates, timestamp_format) from formats;
+----
+01:01:2000 00-00-00
+05:04:2003 00-00-00
+
+query T
+select to_char(times, time_format) from formats;
+----
+23-45-01
+04::56::32
+
+query T
+select to_char(timestamps, date_format) from formats;
+----
+01:01:2024
+01:01:2025
+
+query T
+select to_char(timestamps, timestamp_format) from formats;
+----
+01:01:2024 06-00-00
+01:01:2025 23-59-58
+
+query T
+select to_char('2000-02-03'::date, '%Y:%d:%m');
+----
+2000:03:02
+
+query T
+select to_char(arrow_cast(12345::int, 'Time32(Second)'), '%H-%M-%S')
+----
+03-25-45
+
+query T
+select to_char(arrow_cast(12344567::int, 'Time32(Millisecond)'), '%H-%M-%S %f')
+----
+03-25-44 567000000
+
+query T
+select to_char(arrow_cast(12344567000, 'Time64(Microsecond)'), '%H-%M-%S %f')
+----
+03-25-44 567000000
+
+query T
+select to_char(arrow_cast(12344567890000, 'Time64(Nanosecond)'), '%H-%M-%S %f')
+----
+03-25-44 567890000
+
+query T
+select to_char(arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(s)'), '%d-%m-%Y %H-%M-%S')
+----
+03-08-2023 14-38-50
+
+query T
+select to_char(arrow_cast('2023-09-04'::date, 'Timestamp(Second, Some("UTC"))'), '%Y-%m-%dT%H:%M:%S%.3f');
+----
+2023-09-04T00:00:00.000
+
+query T
+select to_char(arrow_cast(123456, 'Duration(Second)'), 'pretty');
+----
+1 days 10 hours 17 mins 36 secs
+
+query T
+select to_char(arrow_cast(123456, 'Duration(Second)'), 'iso8601');
+----
+PT123456S
+
+query T
+select to_char(arrow_cast(123456, 'Duration(Second)'), null);
+----
+NULL
+
+query error DataFusion error: Arrow error: Cast error: Format error
+SELECT to_char(timestamps, '%X%K') from formats;
+
+query error DataFusion error: Arrow error: Cast error: Format error
+SELECT to_char('2000-02-03'::date, '%X%K');
+
+query T
+SELECT to_char(timestamps, null) from formats;
+----
+NULL
+NULL
+
+query T
+SELECT to_char(null, '%d-%m-%Y');
+----
+NULL
+
+query T
+SELECT to_char(date_column, '%Y-%m-%d')
+FROM (VALUES
+    (DATE '2020-09-01'),
+    (NULL)
+) AS t(date_column);
+----
+2020-09-01
+NULL
+
+query T
+SELECT to_char(date_column, '%Y-%m-%d')
+FROM (VALUES
+    (NULL),
+    (DATE '2020-09-01')
+) AS t(date_column);
+----
+NULL
+2020-09-01
+
+query T
+SELECT to_char(column1, column2)
+FROM
+(VALUES ('2024-01-01 06:00:00'::timestamp, null), ('2025-01-01 23:59:58'::timestamp, '%d:%m:%Y %H-%M-%S'));
+----
+NULL
+01:01:2025 23-59-58
+
+query T
+select to_char('2020-01-01 00:10:20.123'::timestamp at time zone 'America/New_York', '%Y-%m-%d %H:%M:%S.%3f');
+----
+2020-01-01 00:10:20.123
+
+# Null values with array format
+query T
+SELECT to_char(column1, column2)
+FROM (VALUES
+    (DATE '2020-09-01', '%Y-%m-%d'),
+    (NULL, '%Y-%m-%d'),
+    (DATE '2020-09-02', NULL),
+    (NULL, NULL)
+);
+----
+2020-09-01
+NULL
+NULL
+NULL
+
+statement ok
+drop table formats;
+
+##########
+## to_unixtime tests
+##########
+
+query I
+select to_unixtime('2020-09-08T12:00:00+00:00');
+----
+1599566400
+
+query I
+select to_unixtime(arrow_cast(to_timestamp('2023-01-14T01:01:30'), 'Timestamp(Second, Some("+05:30"))'));
+----
+1673638290
+
+query I
+select to_unixtime(arrow_cast(to_timestamp('2023-01-14T01:01:30'), 'Timestamp(ms)'));
+----
+1673658090
+
+query I
+select to_unixtime('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z');
+----
+1673638290
+
+query I
+select to_unixtime('03:59:00.123456789 05-17-2023', '%c', '%+', '%H:%M:%S%.f %m-%d-%Y');
+----
+1684295940
+
+query I
+select to_unixtime(arrow_cast('2020-09-08T12:00:00+00:00', 'Date64'));
+----
+1599566400
+
+query I
+select to_unixtime(arrow_cast('2020-09-08', 'Date32'));
+----
+1599523200
+
+query I
+select to_unixtime(to_timestamp('2020-09-08'));
+----
+1599523200
+
+query I
+select to_unixtime(to_timestamp_seconds('2020-09-08'));
+----
+1599523200
+
+query I
+select to_unixtime(to_timestamp_millis('2020-09-08'));
+----
+1599523200
+
+query I
+select to_unixtime(to_timestamp_micros('2020-09-08'));
+----
+1599523200
+
+query I
+select to_unixtime(to_timestamp_nanos('2020-09-08'));
+----
+1599523200
+
+query I
+select to_unixtime(arrow_cast(1599523200, 'Int32'));
+----
+1599523200
+
+query I
+select to_unixtime(arrow_cast(1599523200, 'Int64'));
+----
+1599523200
+
+query I
+select to_unixtime(arrow_cast(1599523200.414, 'Float64'));
+----
+1599523200
+
+query I
+select to_unixtime(arrow_cast(-1, 'Int8'));
+----
+-1
+
+query I
+select to_unixtime(arrow_cast(null, 'Int8'));
+----
+NULL
+
+query I
+select to_unixtime(arrow_cast(1000, 'Int16'));
+----
+1000
+
+query I
+select to_unixtime(arrow_cast(255, 'UInt8'));
+----
+255
+
+query I
+select to_unixtime(arrow_cast(65535, 'UInt16'));
+----
+65535
+
+query I
+select to_unixtime(arrow_cast(1599523200, 'UInt32'));
+----
+1599523200
+
+query I
+select to_unixtime(arrow_cast(1599523200, 'UInt64'));
+----
+1599523200
+
+query error DataFusion error: Arrow error: Cast error: Can't cast value 18446744073709551615 to type Int64
+select to_unixtime(arrow_cast(18446744073709551615, 'UInt64'));
+
+query I
+select to_unixtime(arrow_cast(1000.12, 'Float16'));
+----
+1000
+
+query I
+select to_unixtime(arrow_cast(1000.414, 'Float32'));
+----
+1000
+
+query I
+select to_unixtime(arrow_cast('2020-09-08T12:00:00+00:00', 'Utf8View'));
+----
+1599566400
+
+query I
+select to_unixtime(arrow_cast('2020-09-08T12:00:00+00:00', 'LargeUtf8'));
+----
+1599566400
+
+##########
+## Tests for the "AT TIME ZONE" clause
+##########
+
+query P
+SELECT '2000-12-01 04:04:12' AT TIME ZONE 'UTC';
+----
+2000-12-01T04:04:12Z
+
+query P
+SELECT '2000-12-01 04:04:12' AT TIME ZONE 'America/New_York';
+----
+2000-12-01T04:04:12-05:00
+
+query P
+SELECT '2024-03-30 00:00:20' AT TIME ZONE 'Europe/Brussels';
+----
+2024-03-30T00:00:20+01:00
+
+query P
+SELECT '2024-03-30 00:00:20'::timestamp AT TIME ZONE 'Europe/Brussels';
+----
+2024-03-30T00:00:20+01:00
+
+query P
+SELECT '2024-03-30 00:00:20Z' AT TIME ZONE 'Europe/Brussels';
+----
+2024-03-30T01:00:20+01:00
+
+query P
+SELECT '2024-03-30 00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels';
+----
+2024-03-30T00:00:20+01:00
+
+## date-time strings that already have a explicit timezone can be used with AT TIME ZONE
+
+# same time zone as provided date-time
+query P
+SELECT '2000-12-01T04:04:12-05:00' AT TIME ZONE 'America/New_York';
+----
+2000-12-01T04:04:12-05:00
+
+# different time zone than provided date-time
+query P
+SELECT '2000-12-01T04:04:12-05:00' AT TIME ZONE 'Europe/Berlin';
+----
+2000-12-01T10:04:12+01:00
+
+# longform timezones need whitespace converted to underscore
+statement error
+SELECT '2000-12-01 04:04:12' AT TIME ZONE 'America/New York';
+
+# abbreviated timezone is not supported
+statement error
+SELECT '2023-03-12 02:00:00' AT TIME ZONE 'EDT';
+
+# Test current_time without parentheses
+query B
+select current_time = current_time;
+----
+true
+
+# Test temporal coercion for UTC
+query ?
+select arrow_cast('2024-06-17T11:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("UTC"))');
+----
+0 days -1 hours 0 mins 0.000000 secs
+
+query ?
+select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("+00:00"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("UTC"))');
+----
+0 days 1 hours 0 mins 0.000000 secs
+
+query ?
+select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+00:00"))');
+----
+0 days 1 hours 0 mins 0.000000 secs
+
+# not supported: coercion across timezones
+query error
+select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+01:00"))');
+
+query error
+select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("+00:00"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+01:00"))');
+
+##########
+## Test to_local_time function
+##########
+
+# invalid number of arguments -- no argument
+statement error
+select to_local_time();
+
+# invalid number of arguments -- more than 1 argument
+statement error
+select to_local_time('2024-04-01T00:00:20Z'::timestamp, 'some string');
+
+# invalid argument data type
+statement error DataFusion error: Error during planning: Function 'to_local_time' requires Timestamp, but received String \(DataType: Utf8\)
+select to_local_time('2024-04-01T00:00:20Z');
+
+# invalid timezone
+statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Parser error: Invalid timezone "Europe/timezone": failed to parse timezone
+select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/timezone');
+
+# valid query
+query P
+select to_local_time('2024-04-01T00:00:20Z'::timestamp);
+----
+2024-04-01T00:00:20
+
+query P
+select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE '+05:00');
+----
+2024-04-01T00:00:20
+
+query P
+select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels');
+----
+2024-04-01T00:00:20
+
+query P
+select to_local_time(NULL);
+----
+NULL
+
+query PT
+select
+  to_local_time(arrow_cast(null, 'Timestamp(s, "Asia/Tokyo")')),
+  arrow_typeof(to_local_time(arrow_cast(null, 'Timestamp(s, "Asia/Tokyo")')));
+----
+NULL Timestamp(s)
+
+query PTPT
+select
+  time,
+  arrow_typeof(time) as type,
+  to_local_time(time) as to_local_time,
+  arrow_typeof(to_local_time(time)) as to_local_time_type
+from (
+  select '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' as time
+);
+----
+2024-04-01T00:00:20+02:00 Timestamp(ns, "Europe/Brussels") 2024-04-01T00:00:20 Timestamp(ns)
+
+# use to_local_time() in date_bin()
+query P
+select date_bin(interval '1 day', to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels'));
+----
+2024-04-01T00:00:00
+
+query P
+select date_bin(interval '1 day', to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels')) AT TIME ZONE 'Europe/Brussels';
+----
+2024-04-01T00:00:00+02:00
+
+# test using to_local_time() on array values
+statement ok
+create table t AS
+VALUES
+  (NULL),
+  ('2024-01-01T00:00:01Z'),
+  ('2024-02-01T00:00:01Z'),
+  ('2024-03-01T00:00:01Z'),
+  ('2024-04-01T00:00:01Z'),
+  ('2024-05-01T00:00:01Z'),
+  ('2024-06-01T00:00:01Z'),
+  ('2024-07-01T00:00:01Z'),
+  ('2024-08-01T00:00:01Z'),
+  ('2024-09-01T00:00:01Z'),
+  ('2024-10-01T00:00:01Z'),
+  ('2024-11-01T00:00:01Z'),
+  ('2024-12-01T00:00:01Z')
+;
+
+statement ok
+create view t_utc as
+select column1::timestamp AT TIME ZONE 'UTC' as "column1"
+from t;
+
+statement ok
+create view t_timezone as
+select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
+from t;
+
+query PPT
+select column1, to_local_time(column1::timestamp), arrow_typeof(to_local_time(column1::timestamp)) from t_utc;
+----
+NULL NULL Timestamp(ns)
+2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns)
+2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns)
+2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns)
+2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns)
+2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns)
+2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns)
+2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns)
+2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns)
+2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns)
+2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns)
+2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns)
+2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns)
+
+query PPT
+select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_utc;
+----
+NULL NULL Timestamp(ns)
+2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns)
+2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns)
+2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns)
+2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns)
+2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns)
+2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns)
+2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns)
+2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns)
+2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns)
+2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns)
+2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns)
+2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns)
+
+query PPT
+select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_timezone;
+----
+NULL NULL Timestamp(ns)
+2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(ns)
+2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(ns)
+2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(ns)
+2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(ns)
+2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(ns)
+2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(ns)
+2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(ns)
+2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(ns)
+2024-09-01T00:00:01+02:00 2024-09-01T00:00:01 Timestamp(ns)
+2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(ns)
+2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(ns)
+2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(ns)
+
+# combine to_local_time() with date_bin()
+query P
+select date_bin(interval '1 day', to_local_time(column1)) AT TIME ZONE 'Europe/Brussels' as date_bin from t_utc;
+----
+NULL
+2024-01-01T00:00:00+01:00
+2024-02-01T00:00:00+01:00
+2024-03-01T00:00:00+01:00
+2024-04-01T00:00:00+02:00
+2024-05-01T00:00:00+02:00
+2024-06-01T00:00:00+02:00
+2024-07-01T00:00:00+02:00
+2024-08-01T00:00:00+02:00
+2024-09-01T00:00:00+02:00
+2024-10-01T00:00:00+02:00
+2024-11-01T00:00:00+01:00
+2024-12-01T00:00:00+01:00
+
+query P
+select date_bin(interval '1 day', to_local_time(column1)) AT TIME ZONE 'Europe/Brussels' as date_bin from t_timezone;
+----
+NULL
+2024-01-01T00:00:00+01:00
+2024-02-01T00:00:00+01:00
+2024-03-01T00:00:00+01:00
+2024-04-01T00:00:00+02:00
+2024-05-01T00:00:00+02:00
+2024-06-01T00:00:00+02:00
+2024-07-01T00:00:00+02:00
+2024-08-01T00:00:00+02:00
+2024-09-01T00:00:00+02:00
+2024-10-01T00:00:00+02:00
+2024-11-01T00:00:00+01:00
+2024-12-01T00:00:00+01:00
+
+statement ok
+drop table t;
+
+statement ok
+drop view t_utc;
+
+statement ok
+drop view t_timezone;
+
+# test comparisons across timestamps
+statement ok
+create table t AS
+VALUES
+  ('2024-01-01T00:00:01Z'),
+  ('2024-02-01T00:00:01Z'),
+  ('2024-03-01T00:00:01Z')
+;
+
+statement ok
+create view t_utc as
+select column1::timestamp AT TIME ZONE 'UTC' as "column1"
+from t;
+
+statement ok
+create view t_europe as
+select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
+from t;
+
+query P
+SELECT column1 FROM t_utc WHERE column1 < '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
+----
+2024-01-01T00:00:01Z
+2024-02-01T00:00:01Z
+
+query P
+SELECT column1 FROM t_europe WHERE column1 = '2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles';
+----
+2024-02-01T00:00:01+01:00
+
+query P
+SELECT column1 FROM t_europe WHERE column1 BETWEEN '2020-01-01T00:00:00' AT TIME ZONE 'Australia/Brisbane' AND '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
+----
+2024-01-01T00:00:01+01:00
+2024-02-01T00:00:01+01:00
+
+query P
+SELECT column1 FROM t_utc WHERE column1 IN ('2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles');
+----
+2024-02-01T00:00:01Z
+
+query P
+SELECT column1 as u from t_utc UNION SELECT column1 from t_europe ORDER BY u;
+----
+2023-12-31T23:00:01Z
+2024-01-01T00:00:01Z
+2024-01-31T23:00:01Z
+2024-02-01T00:00:01Z
+2024-02-29T23:00:01Z
+2024-03-01T00:00:01Z
+
+query P
+SELECT column1 as e from t_europe UNION SELECT column1 from t_utc ORDER BY e;
+----
+2024-01-01T00:00:01+01:00
+2024-01-01T01:00:01+01:00
+2024-02-01T00:00:01+01:00
+2024-02-01T01:00:01+01:00
+2024-03-01T00:00:01+01:00
+2024-03-01T01:00:01+01:00
+
+query P
+SELECT nvl2(null, '2020-01-01T00:00:00-04:00'::timestamp, '2021-02-03T04:05:06Z'::timestamp)
+----
+2021-02-03T04:05:06
+
+query ?
+SELECT make_array('2020-01-01T00:00:00-04:00'::timestamp, '2021-01-01T01:02:03Z'::timestamp);
+----
+[2020-01-01T04:00:00, 2021-01-01T01:02:03]
+
+query P
+SELECT * FROM VALUES
+ ('2023-12-31T23:00:00Z' AT TIME ZONE 'UTC'),
+ ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles');
+----
+2023-12-31T15:00:00-08:00
+2024-02-01T00:00:00-08:00
+
+query P
+SELECT * FROM VALUES
+ ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'),
+ ('2023-12-31T23:00:00' AT TIME ZONE 'UTC');
+----
+2024-02-01T08:00:00Z
+2023-12-31T23:00:00Z
+
+# interval vs. duration comparison
+query B
+select (now() - now()) < interval '1 seconds';
+----
+true
+
+query B
+select (now() - now()) <= interval '1 seconds';
+----
+true
+
+query B
+select (now() - now()) = interval '0 seconds';
+----
+true
+
+query B
+select (now() - now()) != interval '1 seconds';
+----
+true
+
+query B
+select (now() - now()) > interval '-1 seconds';
+----
+true
+
+query B
+select (now() - now()) >= interval '-1 seconds';
+----
+true
+
+query B
+select arrow_cast(123, 'Duration(Nanosecond)') < interval '200 nanoseconds';
+----
+true
+
+query B
+select arrow_cast(123, 'Duration(Nanosecond)') < interval '100 nanoseconds';
+----
+false
+
+query B
+select arrow_cast(123, 'Duration(Nanosecond)') < interval '1 seconds';
+----
+true
+
+query B
+select interval '1 seconds' < arrow_cast(123, 'Duration(Nanosecond)')
+----
+false
+
+# interval as LHS
+query B
+select interval '2 seconds' = interval '2 seconds';
+----
+true
+
+query B
+select interval '1 seconds' < interval '2 seconds';
+----
+true
+
+statement ok
+drop table t;
+
+statement ok
+drop view t_utc;
+
+statement ok
+drop view t_europe;
+
+# TODO: In Postgres, '-1' is unknown type and interpreted to float8 so they don't fail on this query
+query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from '\-1': timestamp must contain at least 10 characters
+select to_timestamp('-1');
+
+query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from '\-1': timestamp must contain at least 10 characters
+select to_timestamp(arrow_cast('-1', 'Utf8'));
+
+query P
+SELECT CAST(CAST(1   AS decimal(17,2)) AS timestamp(3)) AS a UNION ALL
+SELECT CAST(CAST(one AS decimal(17,2)) AS timestamp(3)) AS a FROM (VALUES (1)) t(one);
+----
+1970-01-01T00:00:00.001
+1970-01-01T00:00:00.001
+
+query P
+SELECT arrow_cast(CAST(1   AS decimal(17,2)), 'Timestamp(ns)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,2)), 'Timestamp(ns)') AS a FROM (VALUES (1)) t(one);
+----
+1970-01-01T00:00:00.000000001
+1970-01-01T00:00:00.000000001
+
+query P
+SELECT arrow_cast(CAST(1   AS decimal(17,2)), 'Timestamp(µs)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,2)), 'Timestamp(µs)') AS a FROM (VALUES (1)) t(one);
+----
+1970-01-01T00:00:00.000001
+1970-01-01T00:00:00.000001
+
+query P
+SELECT arrow_cast(CAST(1   AS decimal(17,2)), 'Timestamp(ms)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,2)), 'Timestamp(ms)') AS a FROM (VALUES (1)) t(one);
+----
+1970-01-01T00:00:00.001
+1970-01-01T00:00:00.001
+
+query P
+SELECT arrow_cast(CAST(1   AS decimal(17,2)), 'Timestamp(s)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,2)), 'Timestamp(s)') AS a FROM (VALUES (1)) t(one);
+----
+1970-01-01T00:00:01
+1970-01-01T00:00:01
+
+
+query P
+SELECT arrow_cast(CAST(1.123 AS decimal(17,3)), 'Timestamp(ns)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,3)), 'Timestamp(ns)') AS a FROM (VALUES (1.123)) t(one);
+----
+1970-01-01T00:00:00.000000001
+1970-01-01T00:00:00.000000001
+
+query P
+SELECT arrow_cast(CAST(1.123 AS decimal(17,3)), 'Timestamp(µs)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,3)), 'Timestamp(µs)') AS a FROM (VALUES (1.123)) t(one);
+----
+1970-01-01T00:00:00.000001
+1970-01-01T00:00:00.000001
+
+query P
+SELECT arrow_cast(CAST(1.123 AS decimal(17,3)), 'Timestamp(ms)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,3)), 'Timestamp(ms)') AS a FROM (VALUES (1.123)) t(one);
+----
+1970-01-01T00:00:00.001
+1970-01-01T00:00:00.001
+
+query P
+SELECT arrow_cast(CAST(1.123 AS decimal(17,3)), 'Timestamp(s)') AS a UNION ALL
+SELECT arrow_cast(CAST(one AS decimal(17,3)), 'Timestamp(s)') AS a FROM (VALUES (1.123)) t(one);
+----
+1970-01-01T00:00:01
+1970-01-01T00:00:01
+
+query TTTTT
+SELECT
+    arrow_typeof(a),
+    CAST(a AS varchar),
+    arrow_cast(a, 'Utf8'),
+    arrow_cast(a, 'Utf8View'),
+    arrow_cast(a, 'LargeUtf8')
+FROM (SELECT DATE '2005-09-10' AS a)
+----
+Date32 2005-09-10 2005-09-10 2005-09-10 2005-09-10
+
+query TTTTT
+SELECT
+    arrow_typeof(a),
+    CAST(a AS varchar),
+    arrow_cast(a, 'Utf8'),
+    arrow_cast(a, 'Utf8View'),
+    arrow_cast(a, 'LargeUtf8')
+FROM (SELECT TIMESTAMP '2005-09-10 13:31:00' AS a)
+----
+Timestamp(ns) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00
+
+query TTTTT
+SELECT
+    arrow_typeof(a),
+    CAST(a AS varchar),
+    arrow_cast(a, 'Utf8'),
+    arrow_cast(a, 'Utf8View'),
+    arrow_cast(a, 'LargeUtf8')
+FROM (SELECT CAST('2005-09-10 13:31:00 +02:00' AS timestamp with time zone) AS a)
+----
+Timestamp(ns) 2005-09-10T11:31:00 2005-09-10T11:31:00 2005-09-10T11:31:00 2005-09-10T11:31:00
+
+query P
+SELECT
+    date_trunc('millisecond', ts)
+FROM ts_data_micros_kolkata
+----
+2020-09-08T19:12:29.190+05:30
+2020-09-08T18:12:29.190+05:30
+2020-09-08T17:12:29.190+05:30
+
+
+##########
+## Casting between timestamp with and without timezone
+##########
+
+# Test casting from Timestamp(Nanosecond, Some("UTC")) to Timestamp(ns)
+# Verifies that the underlying nanosecond values are preserved when removing timezone
+
+# Verify input type
 query T
-SELECT to_char(null, '%d-%m-%Y');
+SELECT arrow_typeof(arrow_cast(1, 'Timestamp(Nanosecond, Some("UTC"))'));
+----
+Timestamp(ns, "UTC")
+
+# Verify output type after casting
+query T
+SELECT arrow_typeof(arrow_cast(arrow_cast(1, 'Timestamp(Nanosecond, Some("UTC"))'), 'Timestamp(ns)'));
+----
+Timestamp(ns)
+
+# Verify values are preserved when casting from timestamp with timezone to timestamp without timezone
+query P rowsort
+SELECT arrow_cast(column1, 'Timestamp(ns)')
+FROM (VALUES
+  (arrow_cast(1, 'Timestamp(Nanosecond, Some("UTC"))')),
+  (arrow_cast(2, 'Timestamp(Nanosecond, Some("UTC"))')),
+  (arrow_cast(3, 'Timestamp(Nanosecond, Some("UTC"))')),
+  (arrow_cast(4, 'Timestamp(Nanosecond, Some("UTC"))')),
+  (arrow_cast(5, 'Timestamp(Nanosecond, Some("UTC"))'))
+) t;
+----
+1970-01-01T00:00:00.000000001
+1970-01-01T00:00:00.000000002
+1970-01-01T00:00:00.000000003
+1970-01-01T00:00:00.000000004
+1970-01-01T00:00:00.000000005
+
+# Test casting from Timestamp(ns) to Timestamp(Nanosecond, Some("UTC"))
+# Verifies that the underlying nanosecond values are preserved when adding timezone
+
+# Verify input type
+query T
+SELECT arrow_typeof(arrow_cast(1, 'Timestamp(ns)'));
+----
+Timestamp(ns)
+
+# Verify output type after casting
+query T
+SELECT arrow_typeof(arrow_cast(arrow_cast(1, 'Timestamp(ns)'), 'Timestamp(Nanosecond, Some("UTC"))'));
+----
+Timestamp(ns, "UTC")
+
+# Verify values are preserved when casting from timestamp without timezone to timestamp with timezone
+query P rowsort
+SELECT arrow_cast(column1, 'Timestamp(Nanosecond, Some("UTC"))')
+FROM (VALUES
+  (arrow_cast(1, 'Timestamp(ns)')),
+  (arrow_cast(2, 'Timestamp(ns)')),
+  (arrow_cast(3, 'Timestamp(ns)')),
+  (arrow_cast(4, 'Timestamp(ns)')),
+  (arrow_cast(5, 'Timestamp(ns)'))
+) t;
+----
+1970-01-01T00:00:00.000000001Z
+1970-01-01T00:00:00.000000002Z
+1970-01-01T00:00:00.000000003Z
+1970-01-01T00:00:00.000000004Z
+1970-01-01T00:00:00.000000005Z
+
+
+##########
+## to_timestamp functions with all numeric types
+##########
+
+# Test to_timestamp with all integer types
+# Int8
+query P
+SELECT to_timestamp(arrow_cast(0, 'Int8'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(100, 'Int8'));
+----
+1970-01-01T00:01:40
+
+# Int16
+query P
+SELECT to_timestamp(arrow_cast(0, 'Int16'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(1000, 'Int16'));
+----
+1970-01-01T00:16:40
+
+# Int32
+query P
+SELECT to_timestamp(arrow_cast(0, 'Int32'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(86400, 'Int32'));
+----
+1970-01-02T00:00:00
+
+# Int64
+query P
+SELECT to_timestamp(arrow_cast(0, 'Int64'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(86400, 'Int64'));
+----
+1970-01-02T00:00:00
+
+# UInt8
+query P
+SELECT to_timestamp(arrow_cast(0, 'UInt8'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(100, 'UInt8'));
+----
+1970-01-01T00:01:40
+
+# UInt16
+query P
+SELECT to_timestamp(arrow_cast(0, 'UInt16'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(1000, 'UInt16'));
+----
+1970-01-01T00:16:40
+
+# UInt32
+query P
+SELECT to_timestamp(arrow_cast(0, 'UInt32'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(86400, 'UInt32'));
+----
+1970-01-02T00:00:00
+
+# UInt64
+query P
+SELECT to_timestamp(arrow_cast(0, 'UInt64'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(86400, 'UInt64'));
+----
+1970-01-02T00:00:00
+
+# Float16
+query P
+SELECT to_timestamp(arrow_cast(0.0, 'Float16'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(1.5, 'Float16'));
+----
+1970-01-01T00:00:01.500
+
+# Float32
+query P
+SELECT to_timestamp(arrow_cast(0.0, 'Float32'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(1.5, 'Float32'));
+----
+1970-01-01T00:00:01.500
+
+# Float64
+query P
+SELECT to_timestamp(arrow_cast(0.0, 'Float64'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp(arrow_cast(1.5, 'Float64'));
+----
+1970-01-01T00:00:01.500
+
+# Test to_timestamp_seconds with all integer types
+# Int8
+query P
+SELECT to_timestamp_seconds(arrow_cast(0, 'Int8'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp_seconds(arrow_cast(100, 'Int8'));
+----
+1970-01-01T00:01:40
+
+# Int16
+query P
+SELECT to_timestamp_seconds(arrow_cast(1000, 'Int16'));
+----
+1970-01-01T00:16:40
+
+# Int32
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Int32'));
+----
+1970-01-02T00:00:00
+
+# Int64
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Int64'));
+----
+1970-01-02T00:00:00
+
+# UInt8
+query P
+SELECT to_timestamp_seconds(arrow_cast(100, 'UInt8'));
+----
+1970-01-01T00:01:40
+
+# UInt16
+query P
+SELECT to_timestamp_seconds(arrow_cast(1000, 'UInt16'));
+----
+1970-01-01T00:16:40
+
+# UInt32
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'UInt32'));
+----
+1970-01-02T00:00:00
+
+# UInt64
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'UInt64'));
+----
+1970-01-02T00:00:00
+
+# Float16
+query P
+SELECT to_timestamp_seconds(arrow_cast(1.9, 'Float16'));
+----
+1970-01-01T00:00:01
+
+# Float32
+query P
+SELECT to_timestamp_seconds(arrow_cast(1.9, 'Float32'));
+----
+1970-01-01T00:00:01
+
+# Float64
+query P
+SELECT to_timestamp_seconds(arrow_cast(1.9, 'Float64'));
+----
+1970-01-01T00:00:01
+
+# Test to_timestamp_millis with all integer types
+# Int8
+query P
+SELECT to_timestamp_millis(arrow_cast(0, 'Int8'));
+----
+1970-01-01T00:00:00
+
+query P
+SELECT to_timestamp_millis(arrow_cast(100, 'Int8'));
+----
+1970-01-01T00:00:00.100
+
+# Int16
+query P
+SELECT to_timestamp_millis(arrow_cast(1000, 'Int16'));
+----
+1970-01-01T00:00:01
+
+# Int32
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'Int32'));
+----
+1970-01-02T00:00:00
+
+# Int64
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'Int64'));
 ----
-NULL
+1970-01-02T00:00:00
 
-query T
-SELECT to_char(date_column, '%Y-%m-%d')
-FROM (VALUES 
-    (DATE '2020-09-01'),
-    (NULL)
-) AS t(date_column);
+# UInt8
+query P
+SELECT to_timestamp_millis(arrow_cast(100, 'UInt8'));
 ----
-2020-09-01
-NULL
+1970-01-01T00:00:00.100
 
-query T
-SELECT to_char(date_column, '%Y-%m-%d')
-FROM (VALUES 
-    (NULL),
-    (DATE '2020-09-01')
-) AS t(date_column);
+# UInt16
+query P
+SELECT to_timestamp_millis(arrow_cast(1000, 'UInt16'));
 ----
-NULL
-2020-09-01
+1970-01-01T00:00:01
 
-query T
-SELECT to_char(column1, column2)
-FROM
-(VALUES ('2024-01-01 06:00:00'::timestamp, null), ('2025-01-01 23:59:58'::timestamp, '%d:%m:%Y %H-%M-%S'));
+# UInt32
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'UInt32'));
 ----
-NULL
-01:01:2025 23-59-58
+1970-01-02T00:00:00
 
-query T
-select to_char('2020-01-01 00:10:20.123'::timestamp at time zone 'America/New_York', '%Y-%m-%d %H:%M:%S.%3f');
+# UInt64
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'UInt64'));
 ----
-2020-01-01 00:10:20.123
+1970-01-02T00:00:00
 
-statement ok
-drop table formats;
+# Float16
+query P
+SELECT to_timestamp_millis(arrow_cast(1000, 'Float16'));
+----
+1970-01-01T00:00:01
 
-##########
-## to_unixtime tests
-##########
+# Float32
+query P
+SELECT to_timestamp_millis(arrow_cast(1000.9, 'Float32'));
+----
+1970-01-01T00:00:01
 
-query I
-select to_unixtime('2020-09-08T12:00:00+00:00');
+# Float64
+query P
+SELECT to_timestamp_millis(arrow_cast(1000.9, 'Float64'));
 ----
-1599566400
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(arrow_cast(to_timestamp('2023-01-14T01:01:30'), 'Timestamp(Second, Some("+05:30"))'));
+# Test to_timestamp_micros with all integer types
+# Int8
+query P
+SELECT to_timestamp_micros(arrow_cast(0, 'Int8'));
 ----
-1673638290
+1970-01-01T00:00:00
 
-query I
-select to_unixtime(arrow_cast(to_timestamp('2023-01-14T01:01:30'), 'Timestamp(Millisecond, None)'));
+query P
+SELECT to_timestamp_micros(arrow_cast(100, 'Int8'));
 ----
-1673658090
+1970-01-01T00:00:00.000100
 
-query I
-select to_unixtime('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z');
+# Int16
+query P
+SELECT to_timestamp_micros(arrow_cast(1000, 'Int16'));
 ----
-1673638290
+1970-01-01T00:00:00.001
 
-query I
-select to_unixtime('03:59:00.123456789 05-17-2023', '%c', '%+', '%H:%M:%S%.f %m-%d-%Y');
+# Int32
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000, 'Int32'));
 ----
-1684295940
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(arrow_cast('2020-09-08T12:00:00+00:00', 'Date64'));
+# Int64
+query P
+SELECT to_timestamp_micros(arrow_cast(86400000000, 'Int64'));
 ----
-1599566400
+1970-01-02T00:00:00
 
-query I
-select to_unixtime(arrow_cast('2020-09-08', 'Date32'));
+# UInt8
+query P
+SELECT to_timestamp_micros(arrow_cast(100, 'UInt8'));
 ----
-1599523200
+1970-01-01T00:00:00.000100
 
-query I
-select to_unixtime(to_timestamp('2020-09-08'));
+# UInt16
+query P
+SELECT to_timestamp_micros(arrow_cast(1000, 'UInt16'));
 ----
-1599523200
+1970-01-01T00:00:00.001
 
-query I
-select to_unixtime(to_timestamp_seconds('2020-09-08'));
+# UInt32
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000, 'UInt32'));
 ----
-1599523200
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(to_timestamp_millis('2020-09-08'));
+# UInt64
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000, 'UInt64'));
 ----
-1599523200
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(to_timestamp_micros('2020-09-08'));
+# Float16
+query P
+SELECT to_timestamp_micros(arrow_cast(1000, 'Float16'));
 ----
-1599523200
+1970-01-01T00:00:00.001
 
-query I
-select to_unixtime(to_timestamp_nanos('2020-09-08'));
+# Float32
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000.9, 'Float32'));
 ----
-1599523200
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(arrow_cast(1599523200, 'Int32'));
+# Float64
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000.9, 'Float64'));
 ----
-1599523200
+1970-01-01T00:00:01
 
-query I
-select to_unixtime(arrow_cast(1599523200, 'Int64'));
+# Test to_timestamp_nanos with all integer types
+# Int8
+query P
+SELECT to_timestamp_nanos(arrow_cast(0, 'Int8'));
 ----
-1599523200
+1970-01-01T00:00:00
 
-query I
-select to_unixtime(arrow_cast(1599523200.414, 'Float64'));
+query P
+SELECT to_timestamp_nanos(arrow_cast(100, 'Int8'));
 ----
-1599523200
+1970-01-01T00:00:00.000000100
 
-##########
-## Tests for the "AT TIME ZONE" clause
-##########
+# Int16
+query P
+SELECT to_timestamp_nanos(arrow_cast(1000, 'Int16'));
+----
+1970-01-01T00:00:00.000001
 
+# Int32
 query P
-SELECT '2000-12-01 04:04:12' AT TIME ZONE 'UTC';
+SELECT to_timestamp_nanos(arrow_cast(1000000000, 'Int32'));
 ----
-2000-12-01T04:04:12Z
+1970-01-01T00:00:01
 
+# Int64
 query P
-SELECT '2000-12-01 04:04:12' AT TIME ZONE 'America/New_York';
+SELECT to_timestamp_nanos(arrow_cast(86400000000000, 'Int64'));
 ----
-2000-12-01T04:04:12-05:00
+1970-01-02T00:00:00
 
+# UInt8
 query P
-SELECT '2024-03-30 00:00:20' AT TIME ZONE 'Europe/Brussels';
+SELECT to_timestamp_nanos(arrow_cast(100, 'UInt8'));
 ----
-2024-03-30T00:00:20+01:00
+1970-01-01T00:00:00.000000100
 
+# UInt16
 query P
-SELECT '2024-03-30 00:00:20'::timestamp AT TIME ZONE 'Europe/Brussels';
+SELECT to_timestamp_nanos(arrow_cast(1000, 'UInt16'));
 ----
-2024-03-30T00:00:20+01:00
+1970-01-01T00:00:00.000001
 
+# UInt32
 query P
-SELECT '2024-03-30 00:00:20Z' AT TIME ZONE 'Europe/Brussels';
+SELECT to_timestamp_nanos(arrow_cast(1000000000, 'UInt32'));
 ----
-2024-03-30T01:00:20+01:00
+1970-01-01T00:00:01
 
+# UInt64
 query P
-SELECT '2024-03-30 00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels';
+SELECT to_timestamp_nanos(arrow_cast(1000000000, 'UInt64'));
 ----
-2024-03-30T00:00:20+01:00
+1970-01-01T00:00:01
 
-## date-time strings that already have a explicit timezone can be used with AT TIME ZONE
+# Float16
+query P
+SELECT to_timestamp_nanos(arrow_cast(1000, 'Float16'));
+----
+1970-01-01T00:00:00.000001
 
-# same time zone as provided date-time
+# Float32
 query P
-SELECT '2000-12-01T04:04:12-05:00' AT TIME ZONE 'America/New_York';
+SELECT to_timestamp_nanos(arrow_cast(1000000000.9, 'Float32'));
 ----
-2000-12-01T04:04:12-05:00
+1970-01-01T00:00:01
 
-# different time zone than provided date-time
+# Float64
 query P
-SELECT '2000-12-01T04:04:12-05:00' AT TIME ZONE 'Europe/Berlin';
+SELECT to_timestamp_nanos(arrow_cast(1000000000.9, 'Float64'));
 ----
-2000-12-01T10:04:12+01:00
+1970-01-01T00:00:01
 
-# longform timezones need whitespace converted to underscore
-statement error
-SELECT '2000-12-01 04:04:12' AT TIME ZONE 'America/New York';
+# Verify arrow_typeof for all to_timestamp functions with various input types
+query T
+SELECT arrow_typeof(to_timestamp(arrow_cast(0, 'Int8')));
+----
+Timestamp(ns)
 
-# abbreviated timezone is not supported
-statement error
-SELECT '2023-03-12 02:00:00' AT TIME ZONE 'EDT';
+query T
+SELECT arrow_typeof(to_timestamp(arrow_cast(0, 'UInt64')));
+----
+Timestamp(ns)
 
-# Test current_time without parentheses
-query B
-select current_time = current_time;
+query T
+SELECT arrow_typeof(to_timestamp(arrow_cast(0.0, 'Float32')));
 ----
-true
+Timestamp(ns)
 
-# Test temporal coercion for UTC
-query ?
-select arrow_cast('2024-06-17T11:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("UTC"))');
+query T
+SELECT arrow_typeof(to_timestamp_seconds(arrow_cast(0, 'Int8')));
 ----
-0 days -1 hours 0 mins 0.000000 secs
+Timestamp(s)
 
-query ?
-select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("+00:00"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("UTC"))');
+query T
+SELECT arrow_typeof(to_timestamp_seconds(arrow_cast(0, 'UInt64')));
 ----
-0 days 1 hours 0 mins 0.000000 secs
+Timestamp(s)
 
-query ?
-select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+00:00"))');
+query T
+SELECT arrow_typeof(to_timestamp_seconds(arrow_cast(0.0, 'Float32')));
 ----
-0 days 1 hours 0 mins 0.000000 secs
+Timestamp(s)
 
-# not supported: coercion across timezones
-query error
-select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("UTC"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+01:00"))');
+query T
+SELECT arrow_typeof(to_timestamp_millis(arrow_cast(0, 'Int8')));
+----
+Timestamp(ms)
 
-query error
-select arrow_cast('2024-06-17T13:00:00', 'Timestamp(Nanosecond, Some("+00:00"))') - arrow_cast('2024-06-17T12:00:00', 'Timestamp(Microsecond, Some("+01:00"))');
+query T
+SELECT arrow_typeof(to_timestamp_millis(arrow_cast(0, 'UInt64')));
+----
+Timestamp(ms)
 
-##########
-## Test to_local_time function
-##########
+query T
+SELECT arrow_typeof(to_timestamp_millis(arrow_cast(0.0, 'Float32')));
+----
+Timestamp(ms)
 
-# invalid number of arguments -- no argument
-statement error
-select to_local_time();
+query T
+SELECT arrow_typeof(to_timestamp_micros(arrow_cast(0, 'Int8')));
+----
+Timestamp(µs)
 
-# invalid number of arguments -- more than 1 argument
-statement error
-select to_local_time('2024-04-01T00:00:20Z'::timestamp, 'some string');
+query T
+SELECT arrow_typeof(to_timestamp_micros(arrow_cast(0, 'UInt64')));
+----
+Timestamp(µs)
 
-# invalid argument data type
-statement error The to_local_time function can only accept Timestamp as the arg got Utf8
-select to_local_time('2024-04-01T00:00:20Z');
+query T
+SELECT arrow_typeof(to_timestamp_micros(arrow_cast(0.0, 'Float32')));
+----
+Timestamp(µs)
 
-# invalid timezone
-statement error DataFusion error: Arrow error: Parser error: Invalid timezone "Europe/timezone": failed to parse timezone
-select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/timezone');
+query T
+SELECT arrow_typeof(to_timestamp_nanos(arrow_cast(0, 'Int8')));
+----
+Timestamp(ns)
 
-# valid query
+query T
+SELECT arrow_typeof(to_timestamp_nanos(arrow_cast(0, 'UInt64')));
+----
+Timestamp(ns)
+
+query T
+SELECT arrow_typeof(to_timestamp_nanos(arrow_cast(0.0, 'Float32')));
+----
+Timestamp(ns)
+
+# Test decimal type support for all to_timestamp functions
+# Decimal32
 query P
-select to_local_time('2024-04-01T00:00:20Z'::timestamp);
+SELECT to_timestamp(arrow_cast(1.5, 'Decimal32(5,1)'));
 ----
-2024-04-01T00:00:20
+1970-01-01T00:00:01.500
 
 query P
-select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE '+05:00');
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Decimal32(9,0)'));
 ----
-2024-04-01T00:00:20
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_millis(arrow_cast(1000, 'Decimal32(9,0)'));
+----
+1970-01-01T00:00:01
+
+query P
+SELECT to_timestamp_micros(arrow_cast(1000000, 'Decimal32(9,0)'));
+----
+1970-01-01T00:00:01
+
+query P
+SELECT to_timestamp_nanos(arrow_cast(1000000, 'Decimal32(9,0)'));
+----
+1970-01-01T00:00:00.001
+
+# Decimal64
+query P
+SELECT to_timestamp(arrow_cast(1.5, 'Decimal64(10,1)'));
+----
+1970-01-01T00:00:01.500
+
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Decimal64(18,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'Decimal64(18,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_micros(arrow_cast(86400000000, 'Decimal64(18,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_nanos(arrow_cast(86400000000000, 'Decimal64(18,0)'));
+----
+1970-01-02T00:00:00
+
+# Decimal128
+query P
+SELECT to_timestamp(arrow_cast(1.5, 'Decimal128(10,1)'));
+----
+1970-01-01T00:00:01.500
+
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Decimal128(10,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_millis(arrow_cast(86400000, 'Decimal128(15,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_micros(arrow_cast(86400000000, 'Decimal128(15,0)'));
+----
+1970-01-02T00:00:00
+
+query P
+SELECT to_timestamp_nanos(arrow_cast(86400000000000, 'Decimal128(20,0)'));
+----
+1970-01-02T00:00:00
 
+# Decimal256
 query P
-select to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels');
+SELECT to_timestamp(arrow_cast(1.5, 'Decimal256(10,1)'));
 ----
-2024-04-01T00:00:20
+1970-01-01T00:00:01.500
 
-query PTPT
-select
-  time,
-  arrow_typeof(time) as type,
-  to_local_time(time) as to_local_time,
-  arrow_typeof(to_local_time(time)) as to_local_time_type
-from (
-  select '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' as time
-);
+query P
+SELECT to_timestamp_seconds(arrow_cast(86400, 'Decimal256(38,0)'));
 ----
-2024-04-01T00:00:20+02:00 Timestamp(Nanosecond, Some("Europe/Brussels")) 2024-04-01T00:00:20 Timestamp(Nanosecond, None)
+1970-01-02T00:00:00
 
-# use to_local_time() in date_bin()
 query P
-select date_bin(interval '1 day', to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels'));
+SELECT to_timestamp_millis(arrow_cast(86400000, 'Decimal256(38,0)'));
 ----
-2024-04-01T00:00:00
+1970-01-02T00:00:00
 
 query P
-select date_bin(interval '1 day', to_local_time('2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels')) AT TIME ZONE 'Europe/Brussels';
+SELECT to_timestamp_micros(arrow_cast(86400000000, 'Decimal256(38,0)'));
 ----
-2024-04-01T00:00:00+02:00
+1970-01-02T00:00:00
 
-# test using to_local_time() on array values
-statement ok
-create table t AS
-VALUES
-  ('2024-01-01T00:00:01Z'),
-  ('2024-02-01T00:00:01Z'),
-  ('2024-03-01T00:00:01Z'),
-  ('2024-04-01T00:00:01Z'),
-  ('2024-05-01T00:00:01Z'),
-  ('2024-06-01T00:00:01Z'),
-  ('2024-07-01T00:00:01Z'),
-  ('2024-08-01T00:00:01Z'),
-  ('2024-09-01T00:00:01Z'),
-  ('2024-10-01T00:00:01Z'),
-  ('2024-11-01T00:00:01Z'),
-  ('2024-12-01T00:00:01Z')
-;
+query P
+SELECT to_timestamp_nanos(arrow_cast(86400000000000, 'Decimal256(38,0)'));
+----
+1970-01-02T00:00:00
 
-statement ok
-create view t_utc as
-select column1::timestamp AT TIME ZONE 'UTC' as "column1"
-from t;
+# Verify arrow_typeof for decimal inputs
+query T
+SELECT arrow_typeof(to_timestamp(arrow_cast(0, 'Decimal128(10,0)')));
+----
+Timestamp(ns)
 
-statement ok
-create view t_timezone as
-select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
-from t;
+query T
+SELECT arrow_typeof(to_timestamp_seconds(arrow_cast(0, 'Decimal128(10,0)')));
+----
+Timestamp(s)
 
-query PPT
-select column1, to_local_time(column1::timestamp), arrow_typeof(to_local_time(column1::timestamp)) from t_utc;
+query T
+SELECT arrow_typeof(to_timestamp_millis(arrow_cast(0, 'Decimal128(10,0)')));
 ----
-2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None)
-2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None)
-2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None)
-2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(Nanosecond, None)
-2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None)
-2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None)
-2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(Nanosecond, None)
-2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None)
-2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None)
-2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None)
-2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None)
-2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None)
+Timestamp(ms)
 
-query PPT
-select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_utc;
+query T
+SELECT arrow_typeof(to_timestamp_micros(arrow_cast(0, 'Decimal128(10,0)')));
 ----
-2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None)
-2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None)
-2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None)
-2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(Nanosecond, None)
-2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None)
-2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None)
-2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(Nanosecond, None)
-2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None)
-2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None)
-2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None)
-2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None)
-2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None)
+Timestamp(µs)
 
-query PPT
-select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_timezone;
+query T
+SELECT arrow_typeof(to_timestamp_nanos(arrow_cast(0, 'Decimal128(10,0)')));
 ----
-2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(Nanosecond, None)
-2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(Nanosecond, None)
-2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(Nanosecond, None)
-2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(Nanosecond, None)
-2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(Nanosecond, None)
-2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(Nanosecond, None)
-2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(Nanosecond, None)
-2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(Nanosecond, None)
-2024-09-01T00:00:01+02:00 2024-09-01T00:00:01 Timestamp(Nanosecond, None)
-2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(Nanosecond, None)
-2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(Nanosecond, None)
-2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(Nanosecond, None)
+Timestamp(ns)
+
+# Test decimal array inputs for to_timestamp
+statement ok
+CREATE TABLE test_decimal_timestamps (
+    d128 DECIMAL(20, 9),
+    d256 DECIMAL(40, 9)
+) AS VALUES
+    (1.5, 1.5),
+    (86400.123456789, 86400.123456789),
+    (0.0, 0.0),
+    (NULL, NULL);
 
-# combine to_local_time() with date_bin()
 query P
-select date_bin(interval '1 day', to_local_time(column1)) AT TIME ZONE 'Europe/Brussels' as date_bin from t_utc;
+SELECT to_timestamp(d128) FROM test_decimal_timestamps ORDER BY d128 NULLS LAST;
 ----
-2024-01-01T00:00:00+01:00
-2024-02-01T00:00:00+01:00
-2024-03-01T00:00:00+01:00
-2024-04-01T00:00:00+02:00
-2024-05-01T00:00:00+02:00
-2024-06-01T00:00:00+02:00
-2024-07-01T00:00:00+02:00
-2024-08-01T00:00:00+02:00
-2024-09-01T00:00:00+02:00
-2024-10-01T00:00:00+02:00
-2024-11-01T00:00:00+01:00
-2024-12-01T00:00:00+01:00
+1970-01-01T00:00:00
+1970-01-01T00:00:01.500
+1970-01-02T00:00:00.123456789
+NULL
 
 query P
-select date_bin(interval '1 day', to_local_time(column1)) AT TIME ZONE 'Europe/Brussels' as date_bin from t_timezone;
+SELECT to_timestamp(d256) FROM test_decimal_timestamps ORDER BY d256 NULLS LAST;
 ----
-2024-01-01T00:00:00+01:00
-2024-02-01T00:00:00+01:00
-2024-03-01T00:00:00+01:00
-2024-04-01T00:00:00+02:00
-2024-05-01T00:00:00+02:00
-2024-06-01T00:00:00+02:00
-2024-07-01T00:00:00+02:00
-2024-08-01T00:00:00+02:00
-2024-09-01T00:00:00+02:00
-2024-10-01T00:00:00+02:00
-2024-11-01T00:00:00+01:00
-2024-12-01T00:00:00+01:00
+1970-01-01T00:00:00
+1970-01-01T00:00:01.500
+1970-01-02T00:00:00.123456789
+NULL
 
 statement ok
-drop table t;
+DROP TABLE test_decimal_timestamps;
 
-statement ok
-drop view t_utc;
+# Test negative values
+# to_timestamp with negative seconds
+# Int8
+query P
+SELECT to_timestamp(arrow_cast(-1, 'Int8'));
+----
+1969-12-31T23:59:59
 
-statement ok
-drop view t_timezone;
+# Int16
+query P
+SELECT to_timestamp(arrow_cast(-1, 'Int16'));
+----
+1969-12-31T23:59:59
 
-# test comparisons across timestamps
-statement ok
-create table t AS
-VALUES
-  ('2024-01-01T00:00:01Z'),
-  ('2024-02-01T00:00:01Z'),
-  ('2024-03-01T00:00:01Z')
-;
+# Int32
+query P
+SELECT to_timestamp(arrow_cast(-86400, 'Int32'));
+----
+1969-12-31T00:00:00
 
-statement ok
-create view t_utc as
-select column1::timestamp AT TIME ZONE 'UTC' as "column1"
-from t;
+# Int64
+query P
+SELECT to_timestamp(arrow_cast(-1, 'Int64'));
+----
+1969-12-31T23:59:59
 
-statement ok
-create view t_europe as
-select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
-from t;
+# Float64
+query P
+SELECT to_timestamp(arrow_cast(-0.5, 'Float64'));
+----
+1969-12-31T23:59:59.500
 
+# to_timestamp_seconds with negative values
+# Int8
 query P
-SELECT column1 FROM t_utc WHERE column1 < '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
+SELECT to_timestamp_seconds(arrow_cast(-1, 'Int8'));
 ----
-2024-01-01T00:00:01Z
-2024-02-01T00:00:01Z
+1969-12-31T23:59:59
 
+# Int16
 query P
-SELECT column1 FROM t_europe WHERE column1 = '2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles';
+SELECT to_timestamp_seconds(arrow_cast(-1, 'Int16'));
 ----
-2024-02-01T00:00:01+01:00
+1969-12-31T23:59:59
 
+# Int32
 query P
-SELECT column1 FROM t_europe WHERE column1 BETWEEN '2020-01-01T00:00:00' AT TIME ZONE 'Australia/Brisbane' AND '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
+SELECT to_timestamp_seconds(arrow_cast(-86400, 'Int32'));
 ----
-2024-01-01T00:00:01+01:00
-2024-02-01T00:00:01+01:00
+1969-12-31T00:00:00
 
+# Int64
 query P
-SELECT column1 FROM t_utc WHERE column1 IN ('2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles');
+SELECT to_timestamp_seconds(arrow_cast(-1, 'Int64'));
 ----
-2024-02-01T00:00:01Z
+1969-12-31T23:59:59
 
+# to_timestamp_millis with negative values
+# Int8
 query P
-SELECT column1 as u from t_utc UNION SELECT column1 from t_europe ORDER BY u;
+SELECT to_timestamp_millis(arrow_cast(-1, 'Int8'));
 ----
-2023-12-31T23:00:01Z
-2024-01-01T00:00:01Z
-2024-01-31T23:00:01Z
-2024-02-01T00:00:01Z
-2024-02-29T23:00:01Z
-2024-03-01T00:00:01Z
+1969-12-31T23:59:59.999
 
+# Int16
 query P
-SELECT column1 as e from t_europe UNION SELECT column1 from t_utc ORDER BY e;
+SELECT to_timestamp_millis(arrow_cast(-1, 'Int16'));
 ----
-2024-01-01T00:00:01+01:00
-2024-01-01T01:00:01+01:00
-2024-02-01T00:00:01+01:00
-2024-02-01T01:00:01+01:00
-2024-03-01T00:00:01+01:00
-2024-03-01T01:00:01+01:00
+1969-12-31T23:59:59.999
 
+# Int32
 query P
-SELECT nvl2(null, '2020-01-01T00:00:00-04:00'::timestamp, '2021-02-03T04:05:06Z'::timestamp)
+SELECT to_timestamp_millis(arrow_cast(-1000, 'Int32'));
 ----
-2021-02-03T04:05:06
+1969-12-31T23:59:59
 
-query ?
-SELECT make_array('2020-01-01T00:00:00-04:00'::timestamp, '2021-01-01T01:02:03Z'::timestamp);
+# Int64
+query P
+SELECT to_timestamp_millis(arrow_cast(-1, 'Int64'));
 ----
-[2020-01-01T04:00:00, 2021-01-01T01:02:03]
+1969-12-31T23:59:59.999
 
+# to_timestamp_micros with negative values
+# Int8
 query P
-SELECT * FROM VALUES
- ('2023-12-31T23:00:00Z' AT TIME ZONE 'UTC'),
- ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles');
+SELECT to_timestamp_micros(arrow_cast(-1, 'Int8'));
 ----
-2023-12-31T15:00:00-08:00
-2024-02-01T00:00:00-08:00
+1969-12-31T23:59:59.999999
 
+# Int16
 query P
-SELECT * FROM VALUES
- ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'),
- ('2023-12-31T23:00:00' AT TIME ZONE 'UTC');
+SELECT to_timestamp_micros(arrow_cast(-1, 'Int16'));
 ----
-2024-02-01T08:00:00Z
-2023-12-31T23:00:00Z
+1969-12-31T23:59:59.999999
 
-# interval vs. duration comparison
-query B
-select (now() - now()) < interval '1 seconds';
+# Int32
+query P
+SELECT to_timestamp_micros(arrow_cast(-1000000, 'Int32'));
 ----
-true
+1969-12-31T23:59:59
 
-query B
-select (now() - now()) <= interval '1 seconds';
+# Int64
+query P
+SELECT to_timestamp_micros(arrow_cast(-1, 'Int64'));
 ----
-true
+1969-12-31T23:59:59.999999
 
-query B
-select (now() - now()) = interval '0 seconds';
+# to_timestamp_nanos with negative values
+# Int8
+query P
+SELECT to_timestamp_nanos(arrow_cast(-1, 'Int8'));
 ----
-true
+1969-12-31T23:59:59.999999999
 
-query B
-select (now() - now()) != interval '1 seconds';
+# Int16
+query P
+SELECT to_timestamp_nanos(arrow_cast(-1, 'Int16'));
 ----
-true
+1969-12-31T23:59:59.999999999
 
-query B
-select (now() - now()) > interval '-1 seconds';
+# Int32
+query P
+SELECT to_timestamp_nanos(arrow_cast(-1000000000, 'Int32'));
 ----
-true
+1969-12-31T23:59:59
 
-query B
-select (now() - now()) >= interval '-1 seconds';
+# Int64
+query P
+SELECT to_timestamp_nanos(arrow_cast(-1000000000, 'Int64'));
 ----
-true
+1969-12-31T23:59:59
 
-query B
-select arrow_cast(123, 'Duration(Nanosecond)') < interval '200 nanoseconds';
+query P
+SELECT to_timestamp_nanos(arrow_cast(-1, 'Int64'));
 ----
-true
+1969-12-31T23:59:59.999999999
 
-query B
-select arrow_cast(123, 'Duration(Nanosecond)') < interval '100 nanoseconds';
+# Test large unsigned values
+query P
+SELECT to_timestamp_seconds(arrow_cast(4294967295, 'UInt64'));
 ----
-false
+2106-02-07T06:28:15
 
-query B
-select arrow_cast(123, 'Duration(Nanosecond)') < interval '1 seconds';
+# Large UInt64 value for milliseconds
+query P
+SELECT to_timestamp_millis(arrow_cast(4294967295000, 'UInt64'));
 ----
-true
+2106-02-07T06:28:15
 
-query B
-select interval '1 seconds' < arrow_cast(123, 'Duration(Nanosecond)')
+# Test UInt64 value larger than i64::MAX (9223372036854775808 = i64::MAX + 1)
+query error Cast error: Can't cast value 9223372036854775808 to type Int64
+SELECT to_timestamp_nanos(arrow_cast(9223372036854775808, 'UInt64'));
+
+# Test boundary values for to_timestamp
+query P
+SELECT to_timestamp(arrow_cast(9223372036, 'Int64'));
 ----
-false
+2262-04-11T23:47:16
 
-# interval as LHS
-query B
-select interval '2 seconds' = interval '2 seconds';
+# Minimum value for to_timestamp
+query P
+SELECT to_timestamp(arrow_cast(-9223372036, 'Int64'));
 ----
-true
+1677-09-21T00:12:44
 
-query B
-select interval '1 seconds' < interval '2 seconds';
+# Overflow error when value exceeds valid range
+query error Arithmetic overflow
+SELECT to_timestamp(arrow_cast(9223372037, 'Int64'));
+
+# Float truncation behavior
+query P
+SELECT to_timestamp_seconds(arrow_cast(-1.9, 'Float64'));
 ----
-true
+1969-12-31T23:59:59
+
+query P
+SELECT to_timestamp_millis(arrow_cast(-1.9, 'Float64'));
+----
+1969-12-31T23:59:59.999
+
+
+##########
+## Common timestamp data
+##########
 
 statement ok
-drop table t;
+drop table ts_data
 
 statement ok
-drop view t_utc;
+drop table ts_data_nanos
 
 statement ok
-drop view t_europe;
+drop table ts_data_micros
 
-# TODO: In Postgres, '-1' is unknown type and interpreted to float8 so they don't fail on this query
-query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from '\-1': timestamp must contain at least 10 characters
-select to_timestamp('-1');
+statement ok
+drop table ts_data_millis
 
-query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from '\-1': timestamp must contain at least 10 characters
-select to_timestamp(arrow_cast('-1', 'Utf8'));
+statement ok
+drop table ts_data_secs
+
+statement ok
+drop table ts_data_micros_kolkata
+
+##########
+## Test to_timestamp with scalar float inputs
+##########
+
+statement ok
+create table test_to_timestamp_scalar(id int, name varchar) as values
+  (1, 'foo'),
+  (2, 'bar');
+
+query P
+SELECT to_timestamp(123.5, name) FROM test_to_timestamp_scalar ORDER BY id;
+----
+1970-01-01T00:02:03.500
+1970-01-01T00:02:03.500
+
+query P
+SELECT to_timestamp(456.789::float, name) FROM test_to_timestamp_scalar ORDER BY id;
+----
+1970-01-01T00:07:36.789001464
+1970-01-01T00:07:36.789001464
+
+query P
+SELECT to_timestamp(arrow_cast(100.5, 'Float16'), name) FROM test_to_timestamp_scalar ORDER BY id;
+----
+1970-01-01T00:01:40.500
+1970-01-01T00:01:40.500
+
+statement ok
+drop table test_to_timestamp_scalar
+
+# date_bin with NULL interval should return NULL, not a planning error
+query P
+SELECT date_bin(NULL, TIMESTAMP '2023-01-01 12:30:00', TIMESTAMP '2023-01-01 12:00:00')
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt
index 1e95e426f3e08..0579659832feb 100644
--- a/datafusion/sqllogictest/test_files/ddl.slt
+++ b/datafusion/sqllogictest/test_files/ddl.slt
@@ -272,7 +272,7 @@ drop table my_table
 
 # select_into
 statement ok
-SELECT* INTO my_table FROM (SELECT * FROM aggregate_simple)
+SELECT * INTO my_table FROM (SELECT * FROM aggregate_simple)
 
 query RRB rowsort
 SELECT * FROM my_table order by c1 LIMIT 1
@@ -312,7 +312,7 @@ DROP TABLE aggregate_simple
 
 # Arrow format
 statement ok
-CREATE external table arrow_simple STORED as ARROW LOCATION '../core/tests/data/example.arrow';
+CREATE external table arrow_simple STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow';
 
 query ITB rowsort
 SELECT * FROM arrow_simple order by f1 LIMIT 1
@@ -587,7 +587,7 @@ statement ok
 CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true');
 
 # Should not recreate the same EXTERNAL table
-statement error Execution error: Table 'aggregate_simple' already exists
+statement error Execution error: External table 'aggregate_simple' already exists
 CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true');
 
 statement ok
@@ -607,6 +607,55 @@ CREATE TABLE table_without_values(field1 BIGINT, field2 BIGINT);
 statement error Execution error: 'IF NOT EXISTS' cannot coexist with 'REPLACE'
 CREATE OR REPLACE TABLE IF NOT EXISTS table_without_values(field1 BIGINT, field2 BIGINT);
 
+# CREATE OR REPLACE
+statement ok
+CREATE OR REPLACE EXTERNAL TABLE aggregate_simple_repl
+STORED AS CSV
+LOCATION '../core/tests/data/aggregate_simple.csv'
+OPTIONS ('format.has_header' 'true');
+
+statement ok
+CREATE OR REPLACE EXTERNAL TABLE aggregate_simple_repl
+STORED AS CSV
+LOCATION '../core/tests/data/aggregate_simple.csv'
+OPTIONS ('format.has_header' 'true');
+
+# Create replacement table for table that doesn't already exist
+statement ok
+DROP TABLE IF EXISTS aggregate_table;
+
+statement ok
+CREATE OR REPLACE EXTERNAL TABLE aggregate_table
+STORED AS CSV
+LOCATION '../core/tests/data/aggregate_simple.csv'
+OPTIONS ('format.has_header' 'true');
+
+query TTT
+DESCRIBE aggregate_table;
+----
+c1 Float64 YES
+c2 Float64 YES
+c3 Boolean YES
+
+# Create replacement table with different format for table that doesn't already exist
+query I
+COPY (SELECT * FROM (VALUES (1),(2),(3)) AS t(id))
+TO 'test_files/scratch/ddl/test_table'
+STORED AS PARQUET;
+----
+3
+
+statement ok
+CREATE OR REPLACE EXTERNAL TABLE aggregate_table
+STORED AS PARQUET
+LOCATION 'test_files/scratch/ddl/test_table';
+
+
+query TTT
+DESCRIBE aggregate_table;
+----
+id Int64 YES
+
 # Should insert into an empty table
 statement ok
 insert into table_without_values values (1, 2), (2, 3), (2, 4);
@@ -658,9 +707,9 @@ CREATE EXTERNAL TABLE empty STORED AS CSV LOCATION '../core/tests/data/empty.csv
 query TTI
 select column_name, data_type, ordinal_position from information_schema.columns where table_name='empty';;
 ----
-c1 Utf8 0
-c2 Utf8 1
-c3 Utf8 2
+c1 Null 0
+c2 Null 1
+c3 Null 2
 
 
 ## should allow any type of exprs as values
@@ -747,7 +796,7 @@ logical_plan
 02)--Values: (Int64(1), Int64(2), Int64(3))
 
 query TT
-explain CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../core/tests/data/example.arrow';
+explain CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow';
 ----
 logical_plan CreateExternalTable: Bare { table: "tty" }
 
@@ -755,7 +804,7 @@ statement ok
 set datafusion.explain.logical_plan_only=false;
 
 statement error DataFusion error: This feature is not implemented: Temporary tables not supported
-CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../core/tests/data/example.arrow';
+CREATE EXTERNAL TEMPORARY TABLE tty STORED as ARROW LOCATION '../datasource-arrow/tests/data/example.arrow';
 
 statement error DataFusion error: This feature is not implemented: Temporary views not supported
 CREATE TEMPORARY VIEW y AS VALUES (1,2,3);
@@ -818,7 +867,7 @@ query TTTTTT
 show columns FROM table_with_pk;
 ----
 datafusion public table_with_pk sn Int32 NO
-datafusion public table_with_pk ts Timestamp(Nanosecond, Some("+00:00")) NO
+datafusion public table_with_pk ts Timestamp(ns) NO
 datafusion public table_with_pk currency Utf8View NO
 datafusion public table_with_pk amount Float32 YES
 
@@ -828,7 +877,7 @@ drop table table_with_pk;
 statement ok
 set datafusion.catalog.information_schema = false;
 
-# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_varchar_to_utf8view to true
+# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_string_types_to_utf8view to true
 statement ok
 CREATE TABLE t1(c1 VARCHAR(10) NOT NULL, c2 VARCHAR);
 
@@ -839,7 +888,7 @@ c1 Utf8View NO
 c2 Utf8View YES
 
 statement ok
-set datafusion.sql_parser.map_varchar_to_utf8view = true;
+set datafusion.sql_parser.map_string_types_to_utf8view = true;
 
 statement ok
 CREATE TABLE t2(c1 VARCHAR(10) NOT NULL, c2 VARCHAR);
diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt
index 089910785ad9d..5485a5fd30141 100644
--- a/datafusion/sqllogictest/test_files/decimal.slt
+++ b/datafusion/sqllogictest/test_files/decimal.slt
@@ -614,22 +614,11 @@ select a / b from foo;
 ----
 0.2
 
-statement ok
-create table t as values (arrow_cast(123, 'Decimal256(5,2)'));
-
-# make sure query below runs in single partition
-# otherwise error message may not be deterministic
-statement ok
-set datafusion.execution.target_partitions = 1;
-
 query R
-select AVG(column1) from t;
+select AVG(column1) from values (arrow_cast(123, 'Decimal256(5,2)'));
 ----
 123
 
-statement ok
-drop table t;
-
 statement ok
 CREATE EXTERNAL TABLE decimal256_simple (
 c1 DECIMAL(50,6) NOT NULL,
@@ -747,3 +736,502 @@ SELECT
     cast(cast('5.20' as decimal(4,2)) as decimal(3,2))
 ----
 0 5.2
+
+query RR
+SELECT
+    arrow_cast(1.23,'Decimal128(3,2)') -  arrow_cast(123, 'UInt64') as subtration_uint,
+    arrow_cast(1.23,'Decimal128(3,2)') -  arrow_cast(123, 'Int64') as subtration_int
+----
+-121.77 -121.77
+
+query RR
+SELECT
+    arrow_cast(1.23,'Decimal128(3,2)') +  arrow_cast(123, 'UInt64') as addition_uint,
+    arrow_cast(1.23,'Decimal128(3,2)') +  arrow_cast(123, 'Int64') as addition_int
+----
+124.23 124.23
+
+query RR
+SELECT
+    arrow_cast(1.23,'Decimal128(3,2)') *  arrow_cast(123, 'UInt64') as mulitplication_uint,
+    arrow_cast(1.23,'Decimal128(3,2)') *  arrow_cast(123, 'Int64') as multiplication_int
+----
+151.29 151.29
+
+query RR
+SELECT
+    arrow_cast(1.23,'Decimal128(3,2)') /  arrow_cast(123, 'UInt64') as divison_uint,
+    arrow_cast(1.23,'Decimal128(3,2)') /  arrow_cast(123, 'Int64') as divison_int
+----
+0.01 0.01
+
+query TR
+with tt as (
+    select arrow_cast(133333333333333333333333333333333333333333333.34, 'Decimal256(50, 2)') as v1
+) select arrow_typeof(v1 + 1.5), v1 + 1.5 from tt;
+----
+Float64 133333333333333330000000000000000000000000000
+
+# Following tests only make sense if numbers are parsed as decimals
+# Remove when `parse_float_as_decimal` is true by default (#14612)
+statement ok
+set datafusion.sql_parser.parse_float_as_decimal = true;
+
+# round should keep decimals when parse_float_as_decimal is enabled
+query TR
+select arrow_typeof(round(173975140545.855, 2)),
+       round(173975140545.855, 2);
+----
+Decimal128(15, 2) 173975140545.86
+
+# smoke test for decimal parsing
+query RT
+select 100000000000000000000000000000000000::decimal(38,0), arrow_typeof(100000000000000000000000000000000000::decimal(38,0));
+----
+100000000000000000000000000000000000 Decimal128(38, 0)
+
+# log for small decimal32
+query R
+select log(arrow_cast(100, 'Decimal32(9, 0)'));
+----
+2
+
+query R
+select log(arrow_cast(100, 'Decimal32(9, 2)'));
+----
+2
+
+query R
+select log(2.0, arrow_cast(12345.67, 'Decimal32(9, 2)'));
+----
+13.591717513272
+
+# log for small decimal64
+query R
+select log(arrow_cast(100, 'Decimal64(18, 0)'));
+----
+2
+
+query R
+select log(arrow_cast(100, 'Decimal64(18, 2)'));
+----
+2
+
+query R
+select log(2.0, arrow_cast(12345.6789, 'Decimal64(15, 4)'));
+----
+13.591718553311
+
+
+# log for small decimal128
+query R
+select log(arrow_cast(100, 'Decimal128(38, 0)'));
+----
+2
+
+query R
+select log(arrow_cast(100, 'Decimal128(38, 2)'));
+----
+2
+
+# log for small decimal256
+query R
+select log(arrow_cast(100, 'Decimal256(76, 0)'));
+----
+2
+
+query R
+select log(arrow_cast(100, 'Decimal256(76, 2)'));
+----
+2
+
+# log(10^21) for large decimal128
+query R
+select log(10, 1000000000000000000000::decimal(38,0));
+----
+21
+
+# log(10^35) for large decimal128
+# Must be 35 if parsed as decimal; 34 for floats
+query R
+select log(100000000000000000000000000000000000::decimal(38,0))
+----
+35
+
+# Decimal overflow for 10^38
+query error Arrow error: Invalid argument error: .* is too large to store in a Decimal128 of precision 38. Max is
+select log(100000000000000000000000000000000000000::decimal(38,0))
+
+# log(10^35) for decimal256 for a value able to fit i128
+query R
+select log(100000000000000000000000000000000000::decimal(76,0));
+----
+35
+
+# log(10^50) for decimal256 for a value larger than i128 (uses f64 fallback)
+query R
+select log(100000000000000000000000000000000000000000000000000::decimal(76,0));
+----
+50
+
+# log(10^35) for decimal128 with explicit base
+query R
+select log(10, 100000000000000000000000000000000000::decimal(38,0));
+----
+35
+
+# log(10^35) for decimal256 with explicit base - only float as a base
+query R
+select log(10.0, 100000000000000000000000000000000000::decimal(76,0));
+----
+35
+
+# log(10^35) for decimal128 with explicit decimal base
+query R
+select log(10::decimal(38, 0), 100000000000000000000000000000000000::decimal(38,0));
+----
+35
+
+# log(10^35) for decimal128 with another base
+query R
+select log(2, 100000000000000000000000000000000000::decimal(38,0));
+----
+116.267483321058
+
+# log(10^35) for decimal128 with another base
+query R
+select log(2.0, 100000000000000000000000000000000000::decimal(38,0));
+----
+116.267483321058
+
+# log with non-integer base (fallback to f64)
+query R
+select log(2.5, 100::decimal(38,0));
+----
+5.025883189464
+
+# null cases
+query R
+select log(null, 100);
+----
+NULL
+
+query R
+select log(null, 100000000000000000000000000000000000::decimal(38,0));
+----
+NULL
+
+query R
+select log(null);
+----
+NULL
+
+query R
+select log(2.0, null);
+----
+NULL
+
+# log with negative scale decimals
+# Using scientific notation to create decimals with negative scales
+# 1e4 = 10000 with scale -4, log10(10000) = 4.0
+query R
+select log(1e4);
+----
+4
+
+# log with negative scale and explicit base 10
+query R
+select log(10, 1e4);
+----
+4
+
+# log with negative scale and base 2
+# 8e1 = 80 with scale -1, log2(80) ≈ 6.321928
+query R
+select log(2.0, 8e1);
+----
+6.321928094887
+
+# log with negative scale and base 2 (another value)
+# 16e1 = 160 with scale -1, log2(160) ≈ 7.321928
+query R
+select log(2.0, 16e1);
+----
+7.321928094887
+
+# log with negative scale -3
+# 5e3 = 5000 with scale -3, log10(5000) ≈ 3.69897
+query R
+select log(5e3);
+----
+3.698970004336
+
+# log with negative scale array values
+query R rowsort
+select log(value) from (values (1e3), (1e4), (1e5)) as t(value);
+----
+3
+4
+5
+
+# log with negative scale and different bases
+query R rowsort
+select log(base, 1e4) from (values (10.0), (2.0), (3.0)) as t(base);
+----
+13.287712379549
+4
+8.383613097158
+
+# log(decimal32) with negative scale
+# 1e4 = 10000 with scale -4
+query R
+select log(CAST(1e4 AS DECIMAL(9, -4)));
+----
+4
+
+# log(decimal32) with negative scale and base 2
+# 8e1 = 80, log2(80) ≈ 6.321928
+query R
+select log(2.0, CAST(8e1 AS DECIMAL(9, -1)));
+----
+6.321928094887
+
+
+# log(decimal64) with negative scale
+# 5e3 = 5000, log10(5000) ≈ 3.69897
+query R
+select log(CAST(5e3 AS DECIMAL(18, -3)));
+----
+3.698970004336
+
+# log(decimal64) with negative scale and different bases
+query R rowsort
+select log(base, CAST(1e4 AS DECIMAL(18, -4)))
+from (values (10.0), (2.0), (3.0)) as t(base);
+----
+13.287712379549
+4
+8.383613097158
+
+# log(decimal128) with negative scale and base 2
+# 8e1 = 80, log2(80) ≈ 6.321928
+query R
+select log(2.0, CAST(8e1 AS DECIMAL(38, -1)));
+----
+6.321928094887
+
+
+# log(decimal128) with negative scale and different bases
+query R rowsort
+select log(base, CAST(1e4 AS DECIMAL(38, -4)))
+from (values (10.0), (2.0), (3.0)) as t(base);
+----
+13.287712379549
+4
+8.383613097158
+
+# Test log of a decimal value between 0 and 1 (e.g., 0.5)
+query R
+SELECT log(10, arrow_cast(0.5, 'Decimal32(5, 1)'))
+----
+-0.301029995664
+
+query R
+SELECT log(10, arrow_cast(1 , 'Decimal32(5, 1)'))
+----
+0
+
+# power with decimals
+
+query RT
+SELECT power(2::decimal(38, 0), 4), arrow_typeof(power(2::decimal(38, 0), 4));
+----
+16 Decimal128(38, 0)
+
+query RT
+SELECT power(10000000000::decimal(38, 0), 2), arrow_typeof(power(10000000000::decimal(38, 0), 2));
+----
+100000000000000000000 Decimal128(38, 0)
+
+query R
+SELECT power(2.5, 4)
+----
+39
+
+query R
+SELECT power(2.5, 1)
+----
+2.5
+
+query R
+SELECT power(2.5, 0)
+----
+1
+
+query R
+SELECT power(1e4, 2)
+----
+100000000
+
+# int64 base with decimal exponent (coerced to float computation)
+query R
+SELECT power(10, -2.0)
+----
+0.01
+
+query R
+SELECT power(2, -0.5)
+----
+0.707106781187
+
+# query error Unsupported data type Decimal128\(2, 1\) for power function
+# SELECT power(2.5, 4.0)
+
+# power() with very large exponent returns infinity (Float64 behavior)
+query R
+SELECT power(2, 100000000000)
+----
+Infinity
+
+# Negative exponent now works (fallback to f64)
+query RT
+SELECT power(2::decimal(38, 0), -5), arrow_typeof(power(2::decimal(38, 0), -5));
+----
+0 Decimal128(38, 0)
+
+# Negative exponent with scale preserves decimal places
+query RT
+SELECT power(4::decimal(38, 5), -1), arrow_typeof(power(4::decimal(38, 5), -1));
+----
+0.25 Decimal128(38, 5)
+
+# Expected to have `16 Decimal128(38, 0)`
+# Due to type coericion, it becomes Float -> Float -> Float
+query RT
+SELECT power(2::decimal(38, 0), 4), arrow_typeof(power(2::decimal(38, 0), 4));
+----
+16 Decimal128(38, 0)
+
+# Arbitrary scale
+query RT
+SELECT power(2.5::decimal(38, 3), 4), arrow_typeof(power(2.5::decimal(38, 3), 4));
+----
+39.062 Decimal128(38, 3)
+
+query RT
+SELECT power(2.5, 4.0), arrow_typeof(power(2.5, 4.0));
+----
+39 Decimal128(2, 1)
+
+# Non-integer exponent now works (fallback to f64)
+query RT
+SELECT power(2.5, 4.2), arrow_typeof(power(2.5, 4.2));
+----
+46.9 Decimal128(2, 1)
+
+query error Compute error: Cannot use non-finite exp: NaN
+SELECT power(2::decimal(38, 0), arrow_cast('NaN','Float64'))
+
+query error Compute error: Cannot use non-finite exp: inf
+SELECT power(2::decimal(38, 0), arrow_cast('INF','Float64'))
+
+# Floating above u32::max now works (fallback to f64, returns infinity which is an error)
+query error Arrow error: Arithmetic overflow: Result of 2\^5000000000.1 is not finite
+SELECT power(2::decimal(38, 0), 5000000000.1)
+
+# Integer Above u32::max - still goes through integer path which fails
+query error Arrow error: Arithmetic overflow: Unsupported exp value
+SELECT power(2::decimal(38, 0), 5000000000)
+
+query ?T
+SELECT power(arrow_cast(2, 'Decimal32(5, 0)'), 4), arrow_typeof(power(arrow_cast(2, 'Decimal32(5, 0)'), 4));
+----
+16 Decimal32(5, 0)
+
+query ?T
+SELECT power(arrow_cast(2, 'Decimal64(5, 0)'), 4), arrow_typeof(power(arrow_cast(2, 'Decimal64(5, 0)'), 4));
+----
+16 Decimal64(5, 0)
+
+query RT
+SELECT power(2::decimal(76, 0), 4), arrow_typeof(power(2::decimal(76, 0), 4));
+----
+16 Decimal256(76, 0)
+
+query R
+SELECT power(2.0, null)
+----
+NULL
+
+# Array variants of power function
+query RR rowsort
+SELECT distinct c1*100000, power(c1*100000, 2) from decimal_simple;
+----
+1 1
+2 4
+3 9
+4 16
+5 25
+
+query RR rowsort
+SELECT distinct c1*100000, power(c1*100000, 2.0) from decimal_simple;
+----
+1 1
+2 4
+3 9
+4 16
+5 25
+
+# Set parse_float_as_decimal to false to test float parsing
+statement ok
+set datafusion.sql_parser.parse_float_as_decimal = false;
+
+# smoke test for decimal parsing
+query R
+select 100000000000000000000000000000000000::decimal(38,0)
+----
+99999999999999996863366107917975552
+
+# log(10^35) for decimal128 with explicit decimal base
+# Float parsing is rounding down, but log uses float computation so result rounds to 35
+query R
+select log(10, 100000000000000000000000000000000000::decimal(38,0));
+----
+35
+
+# log(10^35) for large decimal128 if parsed as float
+# Float parsing is rounding down, but log uses float computation so result rounds to 35
+query R
+select log(100000000000000000000000000000000000::decimal(38,0))
+----
+35
+
+# Result is decimal since argument is decimal regardless decimals-as-floats parsing
+query R
+SELECT power(10000000000::decimal(38, 0), 2);
+----
+100000000000000000000
+
+query RT
+SELECT power(10000000000::decimal(38, 0), 2),
+       arrow_typeof(power(10000000000::decimal(38, 0), 2));
+----
+100000000000000000000 Decimal128(38, 0)
+
+query R
+SELECT power(2.5, 4.0)
+----
+39.0625
+
+query R
+SELECT power(2.5, 4)
+----
+39.0625
+
+query R
+SELECT power(2, null)
+----
+NULL
+
+query error Arrow error: Invalid argument error: 1.10 is too large to store in a Decimal128 of precision 2. Max is 0.99
+select cast(1.1 as decimal(2, 2)) + 1;
diff --git a/datafusion/sqllogictest/test_files/delete.slt b/datafusion/sqllogictest/test_files/delete.slt
index 258318f09423c..b01eb6f5e9ec7 100644
--- a/datafusion/sqllogictest/test_files/delete.slt
+++ b/datafusion/sqllogictest/test_files/delete.slt
@@ -34,7 +34,9 @@ explain delete from t1;
 logical_plan
 01)Dml: op=[Delete] table=[t1]
 02)--TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Delete)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 
 # Filtered by existing columns
@@ -45,7 +47,9 @@ logical_plan
 01)Dml: op=[Delete] table=[t1]
 02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4)
 03)----TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Delete)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 
 # Filtered by existing columns, using qualified and unqualified names
@@ -56,7 +60,9 @@ logical_plan
 01)Dml: op=[Delete] table=[t1]
 02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4)
 03)----TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Delete)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 
 # Filtered by a mix of columns and literal predicates
@@ -67,7 +73,9 @@ logical_plan
 01)Dml: op=[Delete] table=[t1]
 02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND Int64(1) = Int64(1) AND Boolean(true)
 03)----TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Delete)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 
 # Deleting by columns that do not exist returns an error
@@ -105,3 +113,30 @@ logical_plan
 05)--------TableScan: t2
 06)----TableScan: t1
 physical_plan_error This feature is not implemented: Physical plan does not support logical expression InSubquery(InSubquery { expr: Column(Column { relation: Some(Bare { table: "t1" }), name: "a" }), subquery: <subquery>, negated: false })
+
+
+# Delete with limit
+
+query TT
+explain delete from t1 limit 10
+----
+logical_plan
+01)Dml: op=[Delete] table=[t1]
+02)--Limit: skip=0, fetch=10
+03)----TableScan: t1
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
+
+
+query TT
+explain delete from t1 where a = 1 and b = '2' limit 10
+----
+logical_plan
+01)Dml: op=[Delete] table=[t1]
+02)--Limit: skip=0, fetch=10
+03)----Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Utf8("2") AS Utf8View)
+04)------TableScan: t1
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt
index e4cb30628eec5..88347965c67a5 100644
--- a/datafusion/sqllogictest/test_files/describe.slt
+++ b/datafusion/sqllogictest/test_files/describe.slt
@@ -83,6 +83,62 @@ float_col Float32 YES
 double_col Float64 YES
 date_string_col Utf8View YES
 string_col Utf8View YES
-timestamp_col Timestamp(Nanosecond, None) YES
+timestamp_col Timestamp(ns) YES
 year Int32 YES
 month Int32 YES
+
+# Test DESC alias functionality
+statement ok
+CREATE TABLE test_desc_table (id INT, name VARCHAR);
+
+# Test DESC works the same as DESCRIBE
+query TTT
+DESC test_desc_table;
+----
+id Int32 YES
+name Utf8View YES
+
+query TTT
+DESCRIBE test_desc_table;
+----
+id Int32 YES
+name Utf8View YES
+
+# Test with qualified table names
+statement ok
+CREATE TABLE public.test_qualified (col1 INT);
+
+query TTT
+DESC public.test_qualified;
+----
+col1 Int32 YES
+
+# Test error cases
+statement error
+DESC nonexistent_table;
+
+##########
+# Describe statement
+##########
+
+# Test describing the schema of a simple statement
+query TTT
+DESCRIBE SELECT 1;
+----
+Int64(1) Int64 NO
+
+# Insert some data into the existing test table...
+statement ok
+INSERT INTO test_desc_table (id, name) VALUES (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, 'Alice');
+
+# ... and describe the schema of a more complex query
+query TTT
+DESCRIBE SELECT name, COUNT(*) AS name_count FROM test_desc_table
+    GROUP BY name HAVING COUNT(*) > 1 ORDER BY name_count DESC;
+----
+name Utf8View YES
+name_count Int64 NO
+
+# Describing a statement that's not a query is not supported
+statement error Describing statements other than SELECT not supported
+DESCRIBE CREATE TABLE test_desc_table (id INT, name VARCHAR);
diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt
index d241e61f33ffd..511061cf82f06 100644
--- a/datafusion/sqllogictest/test_files/dictionary.slt
+++ b/datafusion/sqllogictest/test_files/dictionary.slt
@@ -36,7 +36,7 @@ SELECT
     arrow_cast(column3, 'Utf8') as f2,
     arrow_cast(column4, 'Utf8') as f3,
     arrow_cast(column5, 'Float64') as f4,
-    arrow_cast(column6, 'Timestamp(Nanosecond, None)') as time
+    arrow_cast(column6, 'Timestamp(ns)') as time
 FROM (
     VALUES
     -- equivalent to the following line protocol data
@@ -85,7 +85,7 @@ f1 Float64 YES
 f2 Utf8 YES
 f3 Utf8 YES
 f4 Float64 YES
-time Timestamp(Nanosecond, None) YES
+time Timestamp(ns) YES
 
 # in list with dictionary input
 query BBB
@@ -111,7 +111,7 @@ SELECT
     arrow_cast(column1, 'Dictionary(Int32, Utf8)') as type,
     arrow_cast(column2, 'Dictionary(Int32, Utf8)') as tag_id,
     arrow_cast(column3, 'Float64') as f5,
-    arrow_cast(column4, 'Timestamp(Nanosecond, None)') as time
+    arrow_cast(column4, 'Timestamp(ns)') as time
 FROM (
     VALUES
     -- equivalent to the following line protocol data
@@ -157,7 +157,7 @@ DESCRIBE m2;
 type Dictionary(Int32, Utf8) YES
 tag_id Dictionary(Int32, Utf8) YES
 f5 Float64 YES
-time Timestamp(Nanosecond, None) YES
+time Timestamp(ns) YES
 
 query I
 select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00';
@@ -410,9 +410,8 @@ logical_plan
 01)Filter: test.column2 = Dictionary(Int32, Utf8("1"))
 02)--TableScan: test projection=[column1, column2]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column2@1 = 1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: column2@1 = 1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # try literal = col to verify order doesn't matter
 # filter should not cast column2
@@ -423,9 +422,8 @@ logical_plan
 01)Filter: test.column2 = Dictionary(Int32, Utf8("1"))
 02)--TableScan: test projection=[column1, column2]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column2@1 = 1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: column2@1 = 1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 # Now query using an integer which must be coerced into a dictionary string
@@ -441,9 +439,8 @@ logical_plan
 01)Filter: test.column2 = Dictionary(Int32, Utf8("1"))
 02)--TableScan: test projection=[column1, column2]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column2@1 = 1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: column2@1 = 1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Window Functions
 query I
@@ -456,4 +453,4 @@ statement ok
 CREATE TABLE test0 AS VALUES ('foo',1), ('bar',2), ('foo',3);
 
 statement ok
-COPY (SELECT arrow_cast(column1, 'Dictionary(Int32, Utf8)') AS column1, column2 FROM test0) TO 'test_files/scratch/copy/part_dict_test' STORED AS PARQUET PARTITIONED BY (column1);
+COPY (SELECT arrow_cast(column1, 'Dictionary(Int32, Utf8)') AS column1, column2 FROM test0) TO 'test_files/scratch/dictionary/part_dict_test' STORED AS PARQUET PARTITIONED BY (column1);
diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt
index b4a491619e893..23a269b525f38 100644
--- a/datafusion/sqllogictest/test_files/distinct_on.slt
+++ b/datafusion/sqllogictest/test_files/distinct_on.slt
@@ -98,11 +98,10 @@ physical_plan
 02)--SortPreservingMergeExec: [c1@0 ASC NULLS LAST]
 03)----SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true]
 04)------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
+05)--------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
 
 # ON expressions are not a sub-set of the ORDER BY expressions
 query error SELECT DISTINCT ON expressions must match initial ORDER BY expressions
diff --git a/datafusion/sqllogictest/test_files/dml_delete.slt b/datafusion/sqllogictest/test_files/dml_delete.slt
new file mode 100644
index 0000000000000..3dae431ada377
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/dml_delete.slt
@@ -0,0 +1,202 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## DELETE tests for MemTable
+##########
+
+# Test basic DELETE with WHERE clause
+statement ok
+CREATE TABLE test_delete AS VALUES (1, 'a'), (2, 'b'), (3, 'c');
+
+query I
+DELETE FROM test_delete WHERE column1 > 1;
+----
+2
+
+query IT rowsort
+SELECT * FROM test_delete;
+----
+1 a
+
+statement ok
+DROP TABLE test_delete;
+
+# Test DELETE all rows (no WHERE clause)
+statement ok
+CREATE TABLE test_delete_all AS VALUES (1, 'x'), (2, 'y'), (3, 'z');
+
+query I
+DELETE FROM test_delete_all;
+----
+3
+
+query I
+SELECT COUNT(*) FROM test_delete_all;
+----
+0
+
+statement ok
+DROP TABLE test_delete_all;
+
+# Test DELETE with compound predicate (AND)
+statement ok
+CREATE TABLE test_delete_compound AS VALUES (1, 10), (2, 20), (3, 30), (4, 40);
+
+query I
+DELETE FROM test_delete_compound WHERE column1 > 1 AND column2 < 40;
+----
+2
+
+query II rowsort
+SELECT * FROM test_delete_compound;
+----
+1 10
+4 40
+
+statement ok
+DROP TABLE test_delete_compound;
+
+# Test DELETE with OR predicate
+statement ok
+CREATE TABLE test_delete_or AS VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd');
+
+query I
+DELETE FROM test_delete_or WHERE column1 = 1 OR column1 = 4;
+----
+2
+
+query IT rowsort
+SELECT * FROM test_delete_or;
+----
+2 b
+3 c
+
+statement ok
+DROP TABLE test_delete_or;
+
+# Test DELETE with no matching rows
+statement ok
+CREATE TABLE test_delete_nomatch AS VALUES (1, 'a'), (2, 'b');
+
+query I
+DELETE FROM test_delete_nomatch WHERE column1 > 100;
+----
+0
+
+query IT rowsort
+SELECT * FROM test_delete_nomatch;
+----
+1 a
+2 b
+
+statement ok
+DROP TABLE test_delete_nomatch;
+
+# Test DELETE with IS NULL predicate
+statement ok
+CREATE TABLE test_delete_null(id INT, name VARCHAR);
+
+statement ok
+INSERT INTO test_delete_null VALUES (1, 'one'), (2, NULL), (3, 'three');
+
+query I
+DELETE FROM test_delete_null WHERE name IS NULL;
+----
+1
+
+query IT rowsort
+SELECT * FROM test_delete_null;
+----
+1 one
+3 three
+
+statement ok
+DROP TABLE test_delete_null;
+
+# Test DELETE with NULL predicate (SQL three-valued logic)
+# When comparing with NULL, the predicate evaluates to NULL, not true/false
+# Rows where predicate is NULL should NOT be deleted
+statement ok
+CREATE TABLE test_delete_null_pred(id INT, value INT);
+
+statement ok
+INSERT INTO test_delete_null_pred VALUES (1, 10), (2, NULL), (3, 30);
+
+# This predicate evaluates to NULL for row with id=2 (because NULL > 15 is NULL)
+# Only row with id=3 should be deleted (30 > 15 is true)
+# Row with id=2 should be kept (NULL > 15 is NULL, not true)
+query I
+DELETE FROM test_delete_null_pred WHERE value > 15;
+----
+1
+
+query II rowsort
+SELECT * FROM test_delete_null_pred;
+----
+1 10
+2 NULL
+
+statement ok
+DROP TABLE test_delete_null_pred;
+
+# Test multiple DELETEs on same table (state persistence)
+statement ok
+CREATE TABLE test_multi_delete AS VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e');
+
+query I
+DELETE FROM test_multi_delete WHERE column1 = 1;
+----
+1
+
+query I
+DELETE FROM test_multi_delete WHERE column1 = 3;
+----
+1
+
+query I
+DELETE FROM test_multi_delete WHERE column1 = 5;
+----
+1
+
+query IT rowsort
+SELECT * FROM test_multi_delete;
+----
+2 b
+4 d
+
+statement ok
+DROP TABLE test_multi_delete;
+
+# Test DELETE with IN predicate
+statement ok
+CREATE TABLE test_delete_in AS VALUES (1), (2), (3), (4), (5);
+
+query I
+DELETE FROM test_delete_in WHERE column1 IN (2, 4);
+----
+2
+
+query I rowsort
+SELECT * FROM test_delete_in;
+----
+1
+3
+5
+
+statement ok
+DROP TABLE test_delete_in;
diff --git a/datafusion/sqllogictest/test_files/dml_update.slt b/datafusion/sqllogictest/test_files/dml_update.slt
new file mode 100644
index 0000000000000..10f74ae3970da
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/dml_update.slt
@@ -0,0 +1,286 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## UPDATE tests for MemTable
+##########
+
+# Test basic UPDATE with WHERE clause
+statement ok
+CREATE TABLE test_update AS VALUES (1, 'a'), (2, 'b'), (3, 'c');
+
+query I
+UPDATE test_update SET column2 = 'updated' WHERE column1 = 2;
+----
+1
+
+query IT rowsort
+SELECT * FROM test_update;
+----
+1 a
+2 updated
+3 c
+
+statement ok
+DROP TABLE test_update;
+
+# Test UPDATE all rows (no WHERE clause)
+statement ok
+CREATE TABLE test_update_all AS VALUES (1, 'x'), (2, 'y'), (3, 'z');
+
+query I
+UPDATE test_update_all SET column2 = 'changed';
+----
+3
+
+query IT rowsort
+SELECT * FROM test_update_all;
+----
+1 changed
+2 changed
+3 changed
+
+statement ok
+DROP TABLE test_update_all;
+
+# Test UPDATE multiple columns
+statement ok
+CREATE TABLE test_update_multi(id INT, name VARCHAR, value INT);
+
+statement ok
+INSERT INTO test_update_multi VALUES (1, 'one', 10), (2, 'two', 20), (3, 'three', 30);
+
+query I
+UPDATE test_update_multi SET name = 'updated', value = 99 WHERE id = 2;
+----
+1
+
+query ITI rowsort
+SELECT * FROM test_update_multi;
+----
+1 one 10
+2 updated 99
+3 three 30
+
+statement ok
+DROP TABLE test_update_multi;
+
+# Test UPDATE with compound predicate (AND)
+statement ok
+CREATE TABLE test_update_compound AS VALUES (1, 10), (2, 20), (3, 30), (4, 40);
+
+query I
+UPDATE test_update_compound SET column2 = 0 WHERE column1 > 1 AND column2 < 40;
+----
+2
+
+query II rowsort
+SELECT * FROM test_update_compound;
+----
+1 10
+2 0
+3 0
+4 40
+
+statement ok
+DROP TABLE test_update_compound;
+
+# Test UPDATE with OR predicate
+statement ok
+CREATE TABLE test_update_or AS VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd');
+
+query I
+UPDATE test_update_or SET column2 = 'modified' WHERE column1 = 1 OR column1 = 4;
+----
+2
+
+query IT rowsort
+SELECT * FROM test_update_or;
+----
+1 modified
+2 b
+3 c
+4 modified
+
+statement ok
+DROP TABLE test_update_or;
+
+# Test UPDATE with no matching rows
+statement ok
+CREATE TABLE test_update_nomatch AS VALUES (1, 'a'), (2, 'b');
+
+query I
+UPDATE test_update_nomatch SET column2 = 'new' WHERE column1 > 100;
+----
+0
+
+query IT rowsort
+SELECT * FROM test_update_nomatch;
+----
+1 a
+2 b
+
+statement ok
+DROP TABLE test_update_nomatch;
+
+# Test UPDATE with NULL predicate (SQL three-valued logic)
+# Rows where predicate is NULL should NOT be updated
+statement ok
+CREATE TABLE test_update_null_pred(id INT, value INT, name VARCHAR);
+
+statement ok
+INSERT INTO test_update_null_pred VALUES (1, 10, 'a'), (2, NULL, 'b'), (3, 30, 'c');
+
+# This predicate evaluates to NULL for row with id=2 (because NULL > 15 is NULL)
+# Only row with id=3 should be updated (30 > 15 is true)
+# Row with id=2 should keep its original name 'b' (NULL > 15 is NULL, not true)
+query I
+UPDATE test_update_null_pred SET name = 'updated' WHERE value > 15;
+----
+1
+
+query IIT rowsort
+SELECT * FROM test_update_null_pred;
+----
+1 10 a
+2 NULL b
+3 30 updated
+
+statement ok
+DROP TABLE test_update_null_pred;
+
+# Test UPDATE with arithmetic expression (SET column = column * 2)
+statement ok
+CREATE TABLE test_update_expr AS VALUES (1, 10), (2, 20), (3, 30);
+
+query I
+UPDATE test_update_expr SET column2 = column2 * 2 WHERE column1 = 2;
+----
+1
+
+query II rowsort
+SELECT * FROM test_update_expr;
+----
+1 10
+2 40
+3 30
+
+statement ok
+DROP TABLE test_update_expr;
+
+# Test UPDATE setting column to NULL
+statement ok
+CREATE TABLE test_update_null(id INT, name VARCHAR);
+
+statement ok
+INSERT INTO test_update_null VALUES (1, 'one'), (2, 'two'), (3, 'three');
+
+query I
+UPDATE test_update_null SET name = NULL WHERE id = 2;
+----
+1
+
+query IT rowsort
+SELECT * FROM test_update_null;
+----
+1 one
+2 NULL
+3 three
+
+statement ok
+DROP TABLE test_update_null;
+
+# Test UPDATE with CASE expression in SET
+statement ok
+CREATE TABLE test_update_case AS VALUES (1, 'low'), (50, 'medium'), (100, 'high');
+
+query I
+UPDATE test_update_case SET column2 = CASE
+    WHEN column1 < 25 THEN 'small'
+    WHEN column1 < 75 THEN 'medium'
+    ELSE 'large'
+END;
+----
+3
+
+query IT rowsort
+SELECT * FROM test_update_case;
+----
+1 small
+100 large
+50 medium
+
+statement ok
+DROP TABLE test_update_case;
+
+# Test UPDATE with column reference (SET a = b)
+statement ok
+CREATE TABLE test_update_col_ref AS VALUES (1, 10, 100), (2, 20, 200);
+
+query I
+UPDATE test_update_col_ref SET column1 = column2 WHERE column3 = 100;
+----
+1
+
+query III rowsort
+SELECT * FROM test_update_col_ref;
+----
+10 10 100
+2 20 200
+
+statement ok
+DROP TABLE test_update_col_ref;
+
+# Test UPDATE with invalid column name (error case)
+statement ok
+CREATE TABLE test_update_error(id INT, name VARCHAR);
+
+statement ok
+INSERT INTO test_update_error VALUES (1, 'test');
+
+statement error No field named nonexistent
+UPDATE test_update_error SET nonexistent = 'value';
+
+statement ok
+DROP TABLE test_update_error;
+
+# Test UPDATE with expression that would error on non-matching rows
+# Regression test: expressions should only be evaluated on rows that match
+# the WHERE clause, not all rows. This prevents divide-by-zero errors
+# on rows that won't be updated.
+statement ok
+CREATE TABLE test_update_div(id INT, divisor INT, result INT);
+
+statement ok
+INSERT INTO test_update_div VALUES (1, 0, 0), (2, 2, 0), (3, 5, 0);
+
+# This should succeed: 1/divisor is only evaluated where divisor != 0
+# Row 1 (divisor=0) is excluded by WHERE clause and expression is NOT evaluated
+query I
+UPDATE test_update_div SET result = 100 / divisor WHERE divisor != 0;
+----
+2
+
+query III rowsort
+SELECT * FROM test_update_div;
+----
+1 0 0
+2 2 50
+3 5 20
+
+statement ok
+DROP TABLE test_update_div;
diff --git a/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt
new file mode 100644
index 0000000000000..d5202a1d9570d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/dynamic_filter_pushdown_config.slt
@@ -0,0 +1,827 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for dynamic filter pushdown configuration options
+# - enable_topk_dynamic_filter_pushdown (for TopK dynamic filters)
+# - enable_join_dynamic_filter_pushdown (for Join dynamic filters)
+# - enable_aggregate_dynamic_filter_pushdown (for Aggregate dynamic filters)
+# - enable_dynamic_filter_pushdown (controls all three)
+
+# Setup: Create parquet test files
+statement ok
+CREATE TABLE test_data(id INT, value INT, name VARCHAR) AS VALUES
+(1, 100, 'a'),
+(2, 200, 'b'),
+(3, 300, 'c'),
+(4, 400, 'd'),
+(5, 500, 'e'),
+(6, 600, 'f'),
+(7, 700, 'g'),
+(8, 800, 'h'),
+(9, 900, 'i'),
+(10, 1000, 'j');
+
+statement ok
+CREATE TABLE join_left(id INT, data VARCHAR) AS VALUES
+(1, 'left1'),
+(2, 'left2'),
+(3, 'left3'),
+(4, 'left4'),
+(5, 'left5');
+
+statement ok
+CREATE TABLE join_right(id INT, info VARCHAR) AS VALUES
+(1, 'right1'),
+(3, 'right3'),
+(5, 'right5');
+
+# Copy data to parquet files
+query I
+COPY test_data TO 'test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet' STORED AS PARQUET;
+----
+10
+
+query I
+COPY join_left TO 'test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet' STORED AS PARQUET;
+----
+5
+
+query I
+COPY join_right TO 'test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet' STORED AS PARQUET;
+----
+3
+
+# Create external tables from parquet files
+statement ok
+CREATE EXTERNAL TABLE test_parquet(id INT, value INT, name VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE left_parquet(id INT, data VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE right_parquet(id INT, info VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet';
+
+# Test 1: TopK dynamic filter pushdown with Parquet
+query TT
+EXPLAIN SELECT * FROM test_parquet ORDER BY value DESC LIMIT 3;
+----
+logical_plan
+01)Sort: test_parquet.value DESC NULLS FIRST, fetch=3
+02)--TableScan: test_parquet projection=[id, value, name]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[value@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+statement ok
+set datafusion.explain.analyze_level = summary;
+
+query TT
+EXPLAIN ANALYZE SELECT id, value AS v, value + id as name FROM test_parquet where value > 3 ORDER BY v DESC LIMIT 3;
+----
+Plan with Metrics
+01)SortPreservingMergeExec: [v@1 DESC], fetch=3, metrics=[output_rows=3, <slt:ignore>]
+02)--SortExec: TopK(fetch=3), expr=[v@1 DESC], preserve_partitioning=[true], filter=[v@1 IS NULL OR v@1 > 800], metrics=[output_rows=3, <slt:ignore>]
+03)----ProjectionExec: expr=[id@0 as id, value@1 as v, value@1 + id@0 as name], metrics=[output_rows=10, <slt:ignore>]
+04)------FilterExec: value@1 > 3, metrics=[output_rows=10, <slt:ignore>, selectivity=100% (10/10)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, metrics=[output_rows=10, <slt:ignore>]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value], file_type=parquet, predicate=value@1 > 3 AND DynamicFilter [ value@1 IS NULL OR value@1 > 800 ], pruning_predicate=value_null_count@1 != row_count@2 AND value_max@0 > 3 AND (value_null_count@1 > 0 OR value_null_count@1 != row_count@2 AND value_max@0 > 800), required_guarantees=[], metrics=[output_rows=10, elapsed_compute=1ns, output_bytes=80.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched -> 1 fully matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=210, metadata_load_time=<slt:ignore>, scan_efficiency_ratio=18% (210/1.15 K)]
+
+statement ok
+set datafusion.explain.analyze_level = dev;
+
+query III
+SELECT id, value AS v, value + id as name FROM test_parquet where value > 3 ORDER BY v DESC LIMIT 3;
+----
+10 1000 1010
+9 900 909
+8 800 808
+
+# Disable TopK dynamic filter pushdown
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = false;
+
+query TT
+EXPLAIN SELECT * FROM test_parquet ORDER BY value DESC LIMIT 3;
+----
+logical_plan
+01)Sort: test_parquet.value DESC NULLS FIRST, fetch=3
+02)--TableScan: test_parquet projection=[id, value, name]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[value@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value, name], file_type=parquet
+
+query IIT
+SELECT id, value AS v, name FROM (SELECT * FROM test_parquet UNION ALL SELECT * FROM test_parquet) ORDER BY v DESC LIMIT 3;
+----
+10 1000 j
+10 1000 j
+9 900 i
+
+# Re-enable for next tests
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true;
+
+# Test 2: Join dynamic filter pushdown with Parquet
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Disable Join dynamic filter pushdown
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = false;
+
+# Without Join filter, HashJoin should NOT have filter=DynamicFilter
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+
+# Re-enable for next tests
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true;
+
+# Test 2b: Dynamic filter pushdown for non-inner join types
+# LEFT JOIN: optimizer swaps to physical Right join (build=right_parquet, probe=left_parquet).
+# Dynamic filter is NOT pushed because Right join needs all probe rows in output.
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+LEFT JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Left Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+
+# LEFT JOIN correctness: all left rows appear, unmatched right rows produce NULLs
+query ITT
+SELECT l.id, l.data, r.info
+FROM left_parquet l
+LEFT JOIN right_parquet r ON l.id = r.id
+ORDER BY l.id;
+----
+1 left1 right1
+2 left2 NULL
+3 left3 right3
+4 left4 NULL
+5 left5 right5
+
+# RIGHT JOIN: optimizer swaps to physical Left join (build=right_parquet, probe=left_parquet).
+# Physical Left join generates a self-generated dynamic filter on the probe side.
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+RIGHT JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Right Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# RIGHT JOIN correctness: all right rows appear, unmatched left rows produce NULLs
+query ITT
+SELECT l.id, l.data, r.info
+FROM left_parquet l
+RIGHT JOIN right_parquet r ON l.id = r.id
+ORDER BY r.id;
+----
+1 left1 right1
+3 left3 right3
+5 left5 right5
+
+# FULL JOIN: dynamic filter should NOT be pushed (both sides must preserve all rows)
+query TT
+EXPLAIN SELECT l.id, r.id as rid, l.data, r.info
+FROM left_parquet l
+FULL JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, r.id AS rid, l.data, r.info
+02)--Full Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@2 as id, id@0 as rid, data@3 as data, info@1 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(id@0, id@0)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+
+# LEFT SEMI JOIN: optimizer swaps to RightSemi (build=right_parquet, probe=left_parquet)
+# and pushes the self-generated filter to the right side (left parquet).
+query TT
+EXPLAIN SELECT l.*
+FROM left_parquet l
+WHERE l.id IN (SELECT r.id FROM right_parquet r);
+----
+logical_plan
+01)LeftSemi Join: l.id = __correlated_sq_1.id
+02)--SubqueryAlias: l
+03)----TableScan: left_parquet projection=[id, data]
+04)--SubqueryAlias: __correlated_sq_1
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# LEFT SEMI JOIN (physical LeftSemi): reverse table roles so optimizer keeps LeftSemi
+# (right_parquet has 3 rows < left_parquet has 5 rows, so no swap occurs).
+# Physical LeftSemi generates a self-generated dynamic filter on the probe side.
+query TT
+EXPLAIN SELECT r.*
+FROM right_parquet r
+WHERE r.id IN (SELECT l.id FROM left_parquet l);
+----
+logical_plan
+01)LeftSemi Join: r.id = __correlated_sq_1.id
+02)--SubqueryAlias: r
+03)----TableScan: right_parquet projection=[id, info]
+04)--SubqueryAlias: __correlated_sq_1
+05)----SubqueryAlias: l
+06)------TableScan: left_parquet projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(id@0, id@0)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# LEFT SEMI (physical LeftSemi) correctness: only right rows with matching left ids
+query IT rowsort
+SELECT r.*
+FROM right_parquet r
+WHERE r.id IN (SELECT l.id FROM left_parquet l);
+----
+1 right1
+3 right3
+5 right5
+
+# LEFT ANTI JOIN: both self generated and parent filters can push to the
+# preserved (left/build) side.
+query TT
+EXPLAIN SELECT l.*
+FROM left_parquet l
+WHERE l.id NOT IN (SELECT r.id FROM right_parquet r);
+----
+logical_plan
+01)LeftAnti Join: l.id = __correlated_sq_1.id
+02)--SubqueryAlias: l
+03)----TableScan: left_parquet projection=[id, data]
+04)--SubqueryAlias: __correlated_sq_1
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# LEFT MARK JOIN: the OR prevents decorrelation to LeftSemi, so the optimizer
+# uses LeftMark. Self-generated dynamic filter pushes to the probe side.
+query TT
+EXPLAIN SELECT r.id, r.info
+FROM right_parquet r
+WHERE EXISTS (SELECT 1 FROM left_parquet l WHERE r.id = l.id)
+   OR r.id = 999;
+----
+logical_plan
+01)Projection: r.id, r.info
+02)--Filter: __correlated_sq_1.mark OR r.id = Int32(999)
+03)----LeftMark Join: r.id = __correlated_sq_1.id
+04)------SubqueryAlias: r
+05)--------TableScan: right_parquet projection=[id, info]
+06)------SubqueryAlias: __correlated_sq_1
+07)--------SubqueryAlias: l
+08)----------TableScan: left_parquet projection=[id]
+physical_plan
+01)FilterExec: mark@2 OR id@0 = 999, projection=[id@0, info@1]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# LEFT MARK correctness: all right rows match EXISTS, so all 3 appear
+query IT rowsort
+SELECT r.id, r.info
+FROM right_parquet r
+WHERE EXISTS (SELECT 1 FROM left_parquet l WHERE r.id = l.id)
+   OR r.id = 999;
+----
+1 right1
+3 right3
+5 right5
+
+# Test 2c: Parent dynamic filter (from TopK) pushed through semi/anti joins
+# Sort on the join key (id) so the TopK dynamic filter pushes to BOTH sides.
+
+# SEMI JOIN with TopK parent: TopK generates a dynamic filter on `id` (join
+# key) that pushes through the RightSemi join to both the build and probe sides
+# as well as the HashJoinExec pushing the self-generated filter to the
+# right-hand side of the join.
+query TT
+EXPLAIN SELECT l.*
+FROM left_parquet l
+WHERE l.id IN (SELECT r.id FROM right_parquet r)
+ORDER BY l.id LIMIT 2;
+----
+logical_plan
+01)Sort: l.id ASC NULLS LAST, fetch=2
+02)--LeftSemi Join: l.id = __correlated_sq_1.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: __correlated_sq_1
+06)------SubqueryAlias: r
+07)--------TableScan: right_parquet projection=[id]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(id@0, id@0)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ]
+
+# Correctness check
+query IT
+SELECT l.*
+FROM left_parquet l
+WHERE l.id IN (SELECT r.id FROM right_parquet r)
+ORDER BY l.id LIMIT 2;
+----
+1 left1
+3 left3
+
+# ANTI JOIN with TopK parent: TopK generates a dynamic filter on `id` (join
+# key) that pushes through the LeftAnti join to both the preserved and
+# non-preserved sides. The HashJoin pushes the self-generated filter to the
+# right hand side of the LeftAnti join.
+query TT
+EXPLAIN SELECT l.*
+FROM left_parquet l
+WHERE l.id NOT IN (SELECT r.id FROM right_parquet r)
+ORDER BY l.id LIMIT 2;
+----
+logical_plan
+01)Sort: l.id ASC NULLS LAST, fetch=2
+02)--LeftAnti Join: l.id = __correlated_sq_1.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: __correlated_sq_1
+06)------SubqueryAlias: r
+07)--------TableScan: right_parquet projection=[id]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] AND DynamicFilter [ empty ]
+
+# Correctness check
+query IT
+SELECT l.*
+FROM left_parquet l
+WHERE l.id NOT IN (SELECT r.id FROM right_parquet r)
+ORDER BY l.id LIMIT 2;
+----
+2 left2
+4 left4
+
+# Test 3: Test independent control
+
+# Disable TopK, keep Join enabled
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = false;
+
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true;
+
+# Join should still have dynamic filter
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Enable TopK, disable Join
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = false;
+
+# Join should NOT have dynamic filter
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+
+# Test 4: Aggregate dynamic filter pushdown
+
+# Prepare aggregate-specific parquet data without statistics so aggregate statistics optimizer
+# doesn't pre-compute results.
+statement ok
+CREATE TABLE agg_source(category VARCHAR, score INT) AS VALUES
+('alpha', 10),
+('alpha', 25),
+('beta', 5),
+('beta', 12),
+('gamma', 42),
+('gamma', 8);
+
+statement ok
+SET datafusion.execution.parquet.statistics_enabled = 'none';
+
+statement ok
+COPY agg_source TO 'test_files/scratch/dynamic_filter_pushdown_config/agg_data.parquet' STORED AS PARQUET;
+
+statement ok
+SET datafusion.execution.parquet.statistics_enabled = 'page';
+
+statement ok
+CREATE EXTERNAL TABLE agg_parquet(category VARCHAR, score INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/agg_data.parquet';
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+# Aggregate dynamic filter should be pushed into the scan when enabled
+# Expecting a `DynamicFilter` inside parquet scanner's predicate
+query TT
+EXPLAIN SELECT MAX(score) FROM agg_parquet WHERE category = 'alpha'
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[max(agg_parquet.score)]]
+02)--Projection: agg_parquet.score
+03)----Filter: agg_parquet.category = Utf8View("alpha")
+04)------TableScan: agg_parquet projection=[category, score], partial_filters=[agg_parquet.category = Utf8View("alpha")]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[max(agg_parquet.score)]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[max(agg_parquet.score)]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/agg_data.parquet]]}, projection=[score], file_type=parquet, predicate=category@0 = alpha AND DynamicFilter [ empty ], pruning_predicate=category_null_count@2 != row_count@3 AND category_min@0 <= alpha AND alpha <= category_max@1, required_guarantees=[category in (alpha)]
+
+# Test 4b: COUNT + MAX — DynamicFilter should NOT appear here in mixed aggregates
+
+query TT
+EXPLAIN SELECT COUNT(*), MAX(score) FROM agg_parquet WHERE category = 'alpha';
+----
+logical_plan
+01)Projection: count(Int64(1)) AS count(*), max(agg_parquet.score)
+02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1)), max(agg_parquet.score)]]
+03)----Projection: agg_parquet.score
+04)------Filter: agg_parquet.category = Utf8View("alpha")
+05)--------TableScan: agg_parquet projection=[category, score], partial_filters=[agg_parquet.category = Utf8View("alpha")]
+physical_plan
+01)ProjectionExec: expr=[count(Int64(1))@0 as count(*), max(agg_parquet.score)@1 as max(agg_parquet.score)]
+02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1)), max(agg_parquet.score)]
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1)), max(agg_parquet.score)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/agg_data.parquet]]}, projection=[score], file_type=parquet, predicate=category@0 = alpha, pruning_predicate=category_null_count@2 != row_count@3 AND category_min@0 <= alpha AND alpha <= category_max@1, required_guarantees=[category in (alpha)]
+
+# Disable aggregate dynamic filters only
+statement ok
+SET datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown = false;
+
+# Expecting no `DynamicFilter` inside parquet scanner's predicate
+query TT
+EXPLAIN SELECT MAX(score) FROM agg_parquet WHERE category = 'alpha'
+----
+logical_plan
+01)Aggregate: groupBy=[[]], aggr=[[max(agg_parquet.score)]]
+02)--Projection: agg_parquet.score
+03)----Filter: agg_parquet.category = Utf8View("alpha")
+04)------TableScan: agg_parquet projection=[category, score], partial_filters=[agg_parquet.category = Utf8View("alpha")]
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[max(agg_parquet.score)]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[max(agg_parquet.score)]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/agg_data.parquet]]}, projection=[score], file_type=parquet, predicate=category@0 = alpha, pruning_predicate=category_null_count@2 != row_count@3 AND category_min@0 <= alpha AND alpha <= category_max@1, required_guarantees=[category in (alpha)]
+
+statement ok
+SET datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = false;
+
+# Test 5: Backward compatibility
+
+# First, set both new configs to specific values
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true;
+
+statement ok
+set datafusion.catalog.information_schema = true
+
+# Setting the config should override both
+statement ok
+SET datafusion.optimizer.enable_dynamic_filter_pushdown = false;
+
+# Verify all configs are now false
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_topk_dynamic_filter_pushdown';
+----
+false
+
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_join_dynamic_filter_pushdown';
+----
+false
+
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown';
+----
+false
+
+statement ok
+set datafusion.catalog.information_schema = false
+
+# Join should NOT have dynamic filter
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet
+
+# Re-enable
+statement ok
+SET datafusion.optimizer.enable_dynamic_filter_pushdown = true;
+
+statement ok
+set datafusion.catalog.information_schema = true
+
+# Verify all configs are now true
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_topk_dynamic_filter_pushdown';
+----
+true
+
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_join_dynamic_filter_pushdown';
+----
+true
+
+query T
+SELECT value FROM information_schema.df_settings
+WHERE name = 'datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown';
+----
+true
+
+statement ok
+set datafusion.catalog.information_schema = false
+
+# Join should have dynamic filter again
+query TT
+EXPLAIN SELECT l.*, r.info
+FROM left_parquet l
+INNER JOIN right_parquet r ON l.id = r.id;
+----
+logical_plan
+01)Projection: l.id, l.data, r.info
+02)--Inner Join: l.id = r.id
+03)----SubqueryAlias: l
+04)------TableScan: left_parquet projection=[id, data]
+05)----SubqueryAlias: r
+06)------TableScan: right_parquet projection=[id, info]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, data@2 as data, info@0 as info]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[info@1, id@2, data@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_right.parquet]]}, projection=[id, info], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/join_left.parquet]]}, projection=[id, data], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Test 6: Regression test for issue #20213 - dynamic filter applied to wrong table
+# when subquery join has same column names on both sides.
+#
+# The bug: when an outer join pushes a DynamicFilter for column "k" through an
+# inner join where both sides have a column named "k", the name-based routing
+# incorrectly pushed the filter to BOTH sides instead of only the correct one.
+# This caused wrong results (0 rows instead of expected).
+
+# Create tables with same column names (k, v) on both sides
+statement ok
+CREATE TABLE issue_20213_t1(k INT, v INT) AS
+SELECT i as k, i as v FROM generate_series(1, 1000) t(i);
+
+statement ok
+CREATE TABLE issue_20213_t2(k INT, v INT) AS
+SELECT i + 100 as k, i as v FROM generate_series(1, 100) t(i);
+
+# Use small row groups to make statistics-based pruning more likely to manifest the bug
+statement ok
+SET datafusion.execution.parquet.max_row_group_size = 10;
+
+query I
+COPY issue_20213_t1 TO 'test_files/scratch/dynamic_filter_pushdown_config/issue_20213_t1.parquet' STORED AS PARQUET;
+----
+1000
+
+query I
+COPY issue_20213_t2 TO 'test_files/scratch/dynamic_filter_pushdown_config/issue_20213_t2.parquet' STORED AS PARQUET;
+----
+100
+
+# Reset row group size
+statement ok
+SET datafusion.execution.parquet.max_row_group_size = 1000000;
+
+statement ok
+CREATE EXTERNAL TABLE t1_20213(k INT, v INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/issue_20213_t1.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t2_20213(k INT, v INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/dynamic_filter_pushdown_config/issue_20213_t2.parquet';
+
+# The query from issue #20213: subquery joins t1 and t2 on v, then outer
+# join uses t2's k column. The dynamic filter on k from the outer join
+# must only apply to t2 (k range 101-200), NOT to t1 (k range 1-1000).
+query I
+SELECT count(*) FROM (
+    SELECT t2_20213.k as k, t1_20213.k as k2
+    FROM t1_20213
+    JOIN t2_20213 ON t1_20213.v = t2_20213.v
+) a
+JOIN t2_20213 b ON a.k = b.k
+WHERE b.v < 10;
+----
+9
+
+# Also verify with SELECT * to catch row-level correctness
+query IIII rowsort
+SELECT * FROM (
+    SELECT t2_20213.k as k, t1_20213.k as k2
+    FROM t1_20213
+    JOIN t2_20213 ON t1_20213.v = t2_20213.v
+) a
+JOIN t2_20213 b ON a.k = b.k
+WHERE b.v < 10;
+----
+101 1 101 1
+102 2 102 2
+103 3 103 3
+104 4 104 4
+105 5 105 5
+106 6 106 6
+107 7 107 7
+108 8 108 8
+109 9 109 9
+
+statement ok
+DROP TABLE issue_20213_t1;
+
+statement ok
+DROP TABLE issue_20213_t2;
+
+statement ok
+DROP TABLE t1_20213;
+
+statement ok
+DROP TABLE t2_20213;
+
+# Cleanup
+
+statement ok
+DROP TABLE test_data;
+
+statement ok
+DROP TABLE join_left;
+
+statement ok
+DROP TABLE join_right;
+
+statement ok
+DROP TABLE test_parquet;
+
+statement ok
+DROP TABLE left_parquet;
+
+statement ok
+DROP TABLE right_parquet;
+
+statement ok
+DROP TABLE agg_source;
+
+statement ok
+DROP TABLE agg_parquet;
+
+# Reset configs to defaults
+statement ok
+SET datafusion.optimizer.enable_topk_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.optimizer.enable_join_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown = true;
+
+statement ok
+SET datafusion.optimizer.enable_dynamic_filter_pushdown = true;
diff --git a/datafusion/sqllogictest/test_files/encoding.slt b/datafusion/sqllogictest/test_files/encoding.slt
index 960e81f4d14c5..c68d59819ea63 100644
--- a/datafusion/sqllogictest/test_files/encoding.slt
+++ b/datafusion/sqllogictest/test_files/encoding.slt
@@ -15,6 +15,52 @@
 # specific language governing permissions and limitations
 # under the License.
 
+query T
+SELECT encode(arrow_cast('tom', 'Utf8View'),'base64');
+----
+dG9t
+
+query T
+SELECT encode(arrow_cast('tommy', 'Utf8View'),'base64pad');
+----
+dG9tbXk=
+
+query T
+SELECT arrow_cast(decode(arrow_cast('dG9t', 'Utf8View'),'base64'), 'Utf8');
+----
+tom
+
+query T
+SELECT arrow_cast(decode(arrow_cast('dG9tbXk=', 'Utf8View'),'base64pad'), 'Utf8');
+----
+tommy
+
+query T
+SELECT encode(arrow_cast('tom', 'BinaryView'),'base64');
+----
+dG9t
+
+query T
+SELECT encode(arrow_cast('tommy', 'BinaryView'),'base64pad');
+----
+dG9tbXk=
+
+query T
+SELECT arrow_cast(decode(arrow_cast('dG9t', 'BinaryView'),'base64'), 'Utf8');
+----
+tom
+
+query T
+SELECT arrow_cast(decode(arrow_cast('dG9tbXk=', 'BinaryView'),'base64pad'), 'Utf8');
+----
+tommy
+
+# test for hex digest
+query T
+select encode(digest('hello', 'sha256'), 'hex');
+----
+2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
+
 statement ok
 CREATE TABLE test(
   num INT,
@@ -23,52 +69,65 @@ CREATE TABLE test(
   hex_field TEXT
 ) as VALUES
   (0, 'abc',  encode('abc', 'base64'), encode('abc', 'hex')),
-  (1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
+  (1, 'qweqw', encode('qweqw', 'base64') || '=', encode('qweqw', 'hex')),
   (2, NULL, NULL, NULL),
   (3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex'))
 ;
 
 # errors
-query error 1st argument should be Utf8 or Binary or Null, got Int64
+query error DataFusion error: Error during planning: Function 'encode' requires Binary, but received Int64 \(DataType: Int64\)
 select encode(12, 'hex');
 
-query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, hex
-select encode(bin_field, 'non_encoding') from test;
-
-query error 1st argument should be Utf8 or Binary or Null, got Int64
+query error DataFusion error: Error during planning: Function 'decode' requires Binary, but received Int64 \(DataType: Int64\)
 select decode(12, 'hex');
 
-query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, hex
-select decode(hex_field, 'non_encoding') from test;
+query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, base64pad, hex
+select encode('', 'non_encoding');
+
+query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, base64pad, hex
+select decode('', 'non_encoding');
 
-query error
+query error DataFusion error: Execution error: Encoding must be a non-null string
+select decode('', null) from test;
+
+query error DataFusion error: This feature is not implemented: Encoding must be a scalar; array specified encoding is not yet supported
+select decode('', hex_field) from test;
+
+query error DataFusion error: Error during planning: Function 'to_hex' requires Integer, but received String \(DataType: Utf8View\)
 select to_hex(hex_field) from test;
 
-query error
-select arrow_cast(decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), 'Utf8');
+query error DataFusion error: Execution error: Failed to decode value using base64
+select decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64');
 
 # Arrays tests
 query T
 SELECT encode(bin_field, 'hex') FROM test ORDER BY num;
 ----
 616263
-717765717765
+7177657177
 NULL
 8f50d3f60eae370ddbf85c86219c55108a350165
 
-query T
-SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num;
+query TTTTTT
+SELECT
+  arrow_cast(decode(arrow_cast(base64_field, 'Utf8'), 'base64'), 'Utf8'),
+  arrow_cast(decode(arrow_cast(base64_field, 'LargeUtf8'), 'base64'), 'Utf8'),
+  arrow_cast(decode(arrow_cast(base64_field, 'Utf8View'), 'base64'), 'Utf8'),
+  arrow_cast(decode(arrow_cast(base64_field, 'Binary'), 'base64'), 'Utf8'),
+  arrow_cast(decode(arrow_cast(base64_field, 'LargeBinary'), 'base64'), 'Utf8'),
+  arrow_cast(decode(arrow_cast(base64_field, 'BinaryView'), 'base64'), 'Utf8')
+FROM test ORDER BY num;
 ----
-abc
-qweqwe
-NULL
-8f50d3f60eae370ddbf85c86219c55108a350165
+abc abc abc abc abc abc
+qweqw qweqw qweqw qweqw qweqw qweqw
+NULL NULL NULL NULL NULL NULL
+8f50d3f60eae370ddbf85c86219c55108a350165 8f50d3f60eae370ddbf85c86219c55108a350165 8f50d3f60eae370ddbf85c86219c55108a350165 8f50d3f60eae370ddbf85c86219c55108a350165 8f50d3f60eae370ddbf85c86219c55108a350165 8f50d3f60eae370ddbf85c86219c55108a350165
 
 query T
 SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num;
 ----
 abc
-qweqwe
+qweqw
 NULL
 8f50d3f60eae370ddbf85c86219c55108a350165
 
@@ -85,44 +144,115 @@ select encode(bin_field, 'base64') FROM test WHERE num = 3;
 ----
 j1DT9g6uNw3b+FyGIZxVEIo1AWU
 
+query T
+select encode(bin_field, 'base64pad') FROM test WHERE num = 3;
+----
+j1DT9g6uNw3b+FyGIZxVEIo1AWU=
+
 query B
 select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3;
 ----
 true
 
-# test for Utf8View support for encode
+query B
+select decode(encode(bin_field, 'base64pad'), 'base64pad') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3;
+----
+true
+
 statement ok
-CREATE TABLE test_source AS VALUES
-    ('Andrew', 'X'),
-    ('Xiangpeng', 'Xiangpeng'),
-    ('Raphael', 'R'),
-    (NULL, 'R');
+drop table test
 
+# test for Utf8View support for encode
 statement ok
 CREATE TABLE test_utf8view AS
 select
   arrow_cast(column1, 'Utf8View') AS column1_utf8view,
   arrow_cast(column2, 'Utf8View') AS column2_utf8view
-FROM test_source;
+FROM VALUES
+    ('Andrew', 'X'),
+    ('Xiangpeng', 'Xiangpeng'),
+    ('Raphael', 'R'),
+    (NULL, 'R');
 
 query TTTTTT
 SELECT
-  column1_utf8view,
   encode(column1_utf8view, 'base64') AS column1_base64,
+  encode(column1_utf8view, 'base64pad') AS column1_base64pad,
   encode(column1_utf8view, 'hex') AS column1_hex,
-  
-  column2_utf8view,
   encode(column2_utf8view, 'base64') AS column2_base64,
+  encode(column2_utf8view, 'base64pad') AS column2_base64pad,
   encode(column2_utf8view, 'hex') AS column2_hex
 FROM test_utf8view;
 ----
-Andrew QW5kcmV3 416e64726577 X WA 58
-Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
-Raphael UmFwaGFlbA 5261706861656c R Ug 52
-NULL NULL NULL R Ug 52
+QW5kcmV3 QW5kcmV3 416e64726577 WA WA== 58
+WGlhbmdwZW5n WGlhbmdwZW5n 5869616e6770656e67 WGlhbmdwZW5n WGlhbmdwZW5n 5869616e6770656e67
+UmFwaGFlbA UmFwaGFlbA== 5261706861656c Ug Ug== 52
+NULL NULL NULL Ug Ug== 52
 
-# test for hex digest
-query T
-select encode(digest('hello', 'sha256'), 'hex');
+query TTTTTT
+SELECT
+  encode(arrow_cast(column1_utf8view, 'Utf8'), 'base64'),
+  encode(arrow_cast(column1_utf8view, 'LargeUtf8'), 'base64'),
+  encode(arrow_cast(column1_utf8view, 'Utf8View'), 'base64'),
+  encode(arrow_cast(column1_utf8view, 'Binary'), 'base64'),
+  encode(arrow_cast(column1_utf8view, 'LargeBinary'), 'base64'),
+  encode(arrow_cast(column1_utf8view, 'BinaryView'), 'base64')
+FROM test_utf8view;
 ----
-2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
+QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3
+WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n
+UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA
+NULL NULL NULL NULL NULL NULL
+
+
+query TTTTTT
+SELECT
+  encode(arrow_cast(column1_utf8view, 'Utf8'), 'base64pad'),
+  encode(arrow_cast(column1_utf8view, 'LargeUtf8'), 'base64pad'),
+  encode(arrow_cast(column1_utf8view, 'Utf8View'), 'base64pad'),
+  encode(arrow_cast(column1_utf8view, 'Binary'), 'base64pad'),
+  encode(arrow_cast(column1_utf8view, 'LargeBinary'), 'base64pad'),
+  encode(arrow_cast(column1_utf8view, 'BinaryView'), 'base64pad')
+FROM test_utf8view;
+----
+QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3
+WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n
+UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA==
+NULL NULL NULL NULL NULL NULL
+
+statement ok
+drop table test_utf8view
+
+# FixedSizeBinary support
+statement ok
+CREATE TABLE test_fsb AS
+SELECT arrow_cast(X'0123456789ABCDEF', 'FixedSizeBinary(8)') as fsb_col;
+
+query ???
+SELECT
+  decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'base64'), 'base64'),
+  decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'base64pad'), 'base64pad'),
+  decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'hex'), 'hex');
+----
+0123456789abcdef 0123456789abcdef 0123456789abcdef
+
+query ???
+SELECT
+  decode(encode(column1, 'base64'), 'base64'),
+  decode(encode(column1, 'base64pad'), 'base64pad'),
+  decode(encode(column1, 'hex'), 'hex')
+FROM values
+  (arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)')),
+  (arrow_cast(X'ffffffffffffffff', 'FixedSizeBinary(8)'));
+----
+0123456789abcdef 0123456789abcdef 0123456789abcdef
+ffffffffffffffff ffffffffffffffff ffffffffffffffff
+
+query error DataFusion error: Execution error: Failed to decode value using base64
+select decode('invalid', 'base64');
+
+query error DataFusion error: Execution error: Failed to decode value using base64pad
+select decode('invalid', 'base64pad');
+
+query error DataFusion error: Execution error: Failed to decode value using hex
+select decode('invalid', 'hex');
diff --git a/datafusion/sqllogictest/test_files/encrypted_parquet.slt b/datafusion/sqllogictest/test_files/encrypted_parquet.slt
new file mode 100644
index 0000000000000..d580b7d1ad2b8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/encrypted_parquet.slt
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test parquet encryption and decryption in DataFusion SQL.
+# See datafusion/common/src/config.rs for equivalent rust code
+
+statement count 0
+CREATE EXTERNAL TABLE encrypted_parquet_table
+(
+double_field double,
+float_field float
+)
+STORED AS PARQUET LOCATION 'test_files/scratch/encrypted_parquet/' OPTIONS (
+    -- Configure encryption for reading and writing Parquet files
+    -- Encryption properties
+    'format.crypto.file_encryption.encrypt_footer' 'true',
+    'format.crypto.file_encryption.footer_key_as_hex' '30313233343536373839303132333435',  -- b"0123456789012345"
+    'format.crypto.file_encryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450" 
+    'format.crypto.file_encryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451" 
+    -- Decryption properties
+    'format.crypto.file_decryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345" 
+    'format.crypto.file_decryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450" 
+    'format.crypto.file_decryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+)
+
+statement count 0
+CREATE TABLE temp_table (
+    double_field double,
+    float_field float
+)
+
+query I
+INSERT INTO temp_table VALUES(-1.0, -1.0)
+----
+1
+
+query I
+INSERT INTO temp_table VALUES(1.0, 2.0)
+----
+1
+
+query I
+INSERT INTO temp_table VALUES(3.0, 4.0)
+----
+1
+
+query I
+INSERT INTO temp_table VALUES(5.0, 6.0)
+----
+1
+
+query I
+INSERT INTO TABLE encrypted_parquet_table(double_field, float_field) SELECT * FROM temp_table
+----
+4
+
+query RR
+SELECT * FROM encrypted_parquet_table
+WHERE double_field > 0.0 AND float_field > 0.0
+ORDER BY double_field
+----
+1 2
+3 4
+5 6
+
+statement count 0
+CREATE EXTERNAL TABLE parquet_table
+(
+double_field double,
+float_field float
+)
+STORED AS PARQUET LOCATION 'test_files/scratch/encrypted_parquet/'
+
+query error DataFusion error: Parquet error: Parquet error: Parquet file has an encrypted footer but decryption properties were not provided
+SELECT * FROM parquet_table
diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt
index dc7a53adf889d..20c1db5cb1511 100644
--- a/datafusion/sqllogictest/test_files/errors.slt
+++ b/datafusion/sqllogictest/test_files/errors.slt
@@ -61,7 +61,7 @@ WITH t AS (WITH u as (SELECT 1) SELECT 1) SELECT * from u
 
 # select_wildcard_without_table
 statement error Error during planning: SELECT \* with no tables specified is not valid
-SELECT * 
+SELECT *
 
 # invalid_qualified_table_references
 statement error Error during planning: table 'datafusion\.nonexistentschema\.aggregate_test_100' not found
@@ -74,6 +74,11 @@ statement error DataFusion error: Error during planning: Unsupported compound id
 SELECT COUNT(*) FROM way.too.many.namespaces.as.ident.prefixes.aggregate_test_100
 
 
+# fetch_clause_not_supported
+statement error FETCH clause is not supported yet
+SELECT 1 FETCH NEXT 1 ROW ONLY
+
+
 
 #
 # Wrong scalar function signature
@@ -120,7 +125,7 @@ from aggregate_test_100
 order by c9
 
 # WindowFunction wrong signature
-statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to 'nth_value' function: coercion from \[Int32, Int64, Int64\] to the signature OneOf\(\[Any\(0\), Any\(1\), Any\(2\)\]\) failed
+statement error DataFusion error: Error during planning: Internal error: Function 'nth_value' failed to match any signature
 select
 c9,
 nth_value(c5, 2, 3) over (order by c9) as nv1
@@ -145,10 +150,10 @@ SELECT
    LIMIT 5;
 
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'foo' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'foo' to value of Int64 type
 create table foo as values (1), ('foo');
 
-query error user-defined coercion failed
+query error DataFusion error: Error during planning: Substring without for/from is not valid
 select 1 group by substr('');
 
 # Error in filter should be reported
@@ -168,8 +173,9 @@ CREATE TABLE tab0(col0 INTEGER, col1 INTEGER, col2 INTEGER);
 statement ok
 INSERT INTO tab0 VALUES(83,0,38);
 
-query error DataFusion error: Arrow error: Divide by zero error
+query I
 SELECT DISTINCT - 84 FROM tab0 AS cor0 WHERE NOT + 96 / + col1 <= NULL GROUP BY col1, col0;
+----
 
 statement ok
 create table a(timestamp int, birthday int, ts int, tokens int, amp int, staamp int);
diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt
index 2df8a9dfbae45..9916892058569 100644
--- a/datafusion/sqllogictest/test_files/explain.slt
+++ b/datafusion/sqllogictest/test_files/explain.slt
@@ -43,10 +43,9 @@ logical_plan
 02)--Filter: aggregate_test_100.c2 > Int8(10)
 03)----TableScan: aggregate_test_100 projection=[c1, c2], partial_filters=[aggregate_test_100.c2 > Int8(10)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: c2@1 > 10, projection=[c1@0]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2], file_type=csv, has_header=true
+01)FilterExec: c2@1 > 10, projection=[c1@0]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2], file_type=csv, has_header=true
 
 # explain_csv_exec_scan_config
 
@@ -177,7 +176,8 @@ initial_logical_plan
 logical_plan after resolve_grouping_function SAME TEXT AS ABOVE
 logical_plan after type_coercion SAME TEXT AS ABOVE
 analyzed_logical_plan SAME TEXT AS ABOVE
-logical_plan after eliminate_nested_union SAME TEXT AS ABOVE
+logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE
+logical_plan after optimize_unions SAME TEXT AS ABOVE
 logical_plan after simplify_expressions SAME TEXT AS ABOVE
 logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
 logical_plan after eliminate_join SAME TEXT AS ABOVE
@@ -190,7 +190,6 @@ logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
 logical_plan after eliminate_limit SAME TEXT AS ABOVE
 logical_plan after propagate_empty_relation SAME TEXT AS ABOVE
-logical_plan after eliminate_one_union SAME TEXT AS ABOVE
 logical_plan after filter_null_join_keys SAME TEXT AS ABOVE
 logical_plan after eliminate_outer_join SAME TEXT AS ABOVE
 logical_plan after push_down_limit SAME TEXT AS ABOVE
@@ -198,8 +197,11 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE
 logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE
 logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE
 logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE
+logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE
+logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE
 logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c]
-logical_plan after eliminate_nested_union SAME TEXT AS ABOVE
+logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE
+logical_plan after optimize_unions SAME TEXT AS ABOVE
 logical_plan after simplify_expressions SAME TEXT AS ABOVE
 logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
 logical_plan after eliminate_join SAME TEXT AS ABOVE
@@ -212,7 +214,6 @@ logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
 logical_plan after eliminate_limit SAME TEXT AS ABOVE
 logical_plan after propagate_empty_relation SAME TEXT AS ABOVE
-logical_plan after eliminate_one_union SAME TEXT AS ABOVE
 logical_plan after filter_null_join_keys SAME TEXT AS ABOVE
 logical_plan after eliminate_outer_join SAME TEXT AS ABOVE
 logical_plan after push_down_limit SAME TEXT AS ABOVE
@@ -220,13 +221,15 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE
 logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE
 logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE
 logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE
+logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE
+logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE
 logical_plan after optimize_projections SAME TEXT AS ABOVE
 logical_plan TableScan: simple_explain_test projection=[a, b, c]
 initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
 initial_physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
 initial_physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, schema=[a:Int32;N, b:Int32;N, c:Int32;N]
 physical_plan after OutputRequirements
-01)OutputRequirementExec
+01)OutputRequirementExec: order_by=[], dist_by=Unspecified
 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
 physical_plan after aggregate_statistics SAME TEXT AS ABOVE
 physical_plan after join_selection SAME TEXT AS ABOVE
@@ -237,11 +240,16 @@ physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE
 physical_plan after EnforceSorting SAME TEXT AS ABOVE
 physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
-physical_plan after coalesce_batches SAME TEXT AS ABOVE
 physical_plan after OutputRequirements DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
 physical_plan after LimitAggregation SAME TEXT AS ABOVE
+physical_plan after LimitPushPastWindows SAME TEXT AS ABOVE
+physical_plan after HashJoinBuffering SAME TEXT AS ABOVE
 physical_plan after LimitPushdown SAME TEXT AS ABOVE
+physical_plan after TopKRepartition SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
+physical_plan after PushdownSort SAME TEXT AS ABOVE
+physical_plan after EnsureCooperative SAME TEXT AS ABOVE
+physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE
 physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
 physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
 physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
@@ -287,22 +295,22 @@ CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-t
 query TT
 EXPLAIN SELECT * FROM alltypes_plain limit 10;
 ----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
 
 # explain verbose with both collect & show statistics on
 query TT
 EXPLAIN VERBOSE SELECT * FROM alltypes_plain limit 10;
 ----
 initial_physical_plan
-01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
 initial_physical_plan_with_schema
-01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
 physical_plan after OutputRequirements
-01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+01)OutputRequirementExec: order_by=[], dist_by=Unspecified, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
 physical_plan after aggregate_statistics SAME TEXT AS ABOVE
 physical_plan after join_selection SAME TEXT AS ABOVE
 physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE
@@ -312,16 +320,21 @@ physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE
 physical_plan after EnforceSorting SAME TEXT AS ABOVE
 physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
-physical_plan after coalesce_batches SAME TEXT AS ABOVE
 physical_plan after OutputRequirements
-01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
 physical_plan after LimitAggregation SAME TEXT AS ABOVE
-physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+physical_plan after LimitPushPastWindows SAME TEXT AS ABOVE
+physical_plan after HashJoinBuffering SAME TEXT AS ABOVE
+physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+physical_plan after TopKRepartition SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
+physical_plan after PushdownSort SAME TEXT AS ABOVE
+physical_plan after EnsureCooperative SAME TEXT AS ABOVE
+physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE
 physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
 
 
 statement ok
@@ -335,13 +348,13 @@ initial_physical_plan
 01)GlobalLimitExec: skip=0, fetch=10
 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet
 initial_physical_plan_with_stats
-01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
 initial_physical_plan_with_schema
-01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
 physical_plan after OutputRequirements
-01)OutputRequirementExec
+01)OutputRequirementExec: order_by=[], dist_by=Unspecified
 02)--GlobalLimitExec: skip=0, fetch=10
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet
 physical_plan after aggregate_statistics SAME TEXT AS ABOVE
@@ -353,17 +366,22 @@ physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE
 physical_plan after EnforceSorting SAME TEXT AS ABOVE
 physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
-physical_plan after coalesce_batches SAME TEXT AS ABOVE
 physical_plan after OutputRequirements
 01)GlobalLimitExec: skip=0, fetch=10
 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet
 physical_plan after LimitAggregation SAME TEXT AS ABOVE
+physical_plan after LimitPushPastWindows SAME TEXT AS ABOVE
+physical_plan after HashJoinBuffering SAME TEXT AS ABOVE
 physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet
+physical_plan after TopKRepartition SAME TEXT AS ABOVE
 physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
+physical_plan after PushdownSort SAME TEXT AS ABOVE
+physical_plan after EnsureCooperative SAME TEXT AS ABOVE
+physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE
 physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
 physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet
-physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
+physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: ScanBytes=Exact(32)),(Col[1]: ScanBytes=Inexact(24)),(Col[2]: ScanBytes=Exact(32)),(Col[3]: ScanBytes=Exact(32)),(Col[4]: ScanBytes=Exact(32)),(Col[5]: ScanBytes=Exact(64)),(Col[6]: ScanBytes=Exact(32)),(Col[7]: ScanBytes=Exact(64)),(Col[8]: ScanBytes=Inexact(88)),(Col[9]: ScanBytes=Inexact(49)),(Col[10]: ScanBytes=Exact(64))]]
+physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(ns);N]
 
 
 statement ok
@@ -379,7 +397,7 @@ explain select make_array(make_array(1, 2, 3), make_array(4, 5, 6));
 ----
 logical_plan
 01)Projection: List([[1, 2, 3], [4, 5, 6]]) AS make_array(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(4),Int64(5),Int64(6)))
-02)--EmptyRelation
+02)--EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[[[1, 2, 3], [4, 5, 6]] as make_array(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(4),Int64(5),Int64(6)))]
 02)--PlaceholderRowExec
@@ -389,7 +407,7 @@ explain select [[1, 2, 3], [4, 5, 6]];
 ----
 logical_plan
 01)Projection: List([[1, 2, 3], [4, 5, 6]]) AS make_array(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(4),Int64(5),Int64(6)))
-02)--EmptyRelation
+02)--EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[[[1, 2, 3], [4, 5, 6]] as make_array(make_array(Int64(1),Int64(2),Int64(3)),make_array(Int64(4),Int64(5),Int64(6)))]
 02)--PlaceholderRowExec
@@ -401,7 +419,7 @@ explain select struct(1, 2.3, 'abc');
 ----
 logical_plan
 01)Projection: Struct({c0:1,c1:2.3,c2:abc}) AS struct(Int64(1),Float64(2.3),Utf8("abc"))
-02)--EmptyRelation
+02)--EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[{c0:1,c1:2.3,c2:abc} as struct(Int64(1),Float64(2.3),Utf8("abc"))]
 02)--PlaceholderRowExec
@@ -420,14 +438,11 @@ logical_plan
 01)LeftSemi Join: 
 02)--TableScan: t1 projection=[a]
 03)--SubqueryAlias: __correlated_sq_1
-04)----Projection:
-05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
-06)--------TableScan: t2 projection=[]
+04)----EmptyRelation: rows=1
 physical_plan
 01)NestedLoopJoinExec: join_type=LeftSemi
 02)--DataSourceExec: partitions=1, partition_sizes=[0]
-03)--ProjectionExec: expr=[]
-04)----PlaceholderRowExec
+03)--PlaceholderRowExec
 
 statement ok
 drop table t1;
@@ -514,11 +529,112 @@ explain format 123 select * from values (1);
 query error DataFusion error: Error during planning: EXPLAIN VERBOSE with FORMAT is not supported
 explain verbose format tree select * from values (1);
 
+# valid explain format
+query error DataFusion error: Invalid or Unsupported Configuration: Invalid explain format. Expected 'indent', 'tree', 'pgjson' or 'graphviz'. Got 'xxx'
+set datafusion.explain.format = "xxx";
+
+# verbose uses indent mode even when a different mode (e.g tree) is set
+
+statement ok
+set datafusion.explain.format = "tree";
+
+query TT
+EXPLAIN VERBOSE SELECT a, b, c FROM simple_explain_test
+----
+initial_logical_plan
+01)Projection: simple_explain_test.a, simple_explain_test.b, simple_explain_test.c
+02)--TableScan: simple_explain_test
+logical_plan after resolve_grouping_function SAME TEXT AS ABOVE
+logical_plan after type_coercion SAME TEXT AS ABOVE
+analyzed_logical_plan SAME TEXT AS ABOVE
+logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE
+logical_plan after optimize_unions SAME TEXT AS ABOVE
+logical_plan after simplify_expressions SAME TEXT AS ABOVE
+logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
+logical_plan after eliminate_join SAME TEXT AS ABOVE
+logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
+logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
+logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
+logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
+logical_plan after eliminate_filter SAME TEXT AS ABOVE
+logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
+logical_plan after eliminate_limit SAME TEXT AS ABOVE
+logical_plan after propagate_empty_relation SAME TEXT AS ABOVE
+logical_plan after filter_null_join_keys SAME TEXT AS ABOVE
+logical_plan after eliminate_outer_join SAME TEXT AS ABOVE
+logical_plan after push_down_limit SAME TEXT AS ABOVE
+logical_plan after push_down_filter SAME TEXT AS ABOVE
+logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE
+logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE
+logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE
+logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE
+logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE
+logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c]
+logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE
+logical_plan after optimize_unions SAME TEXT AS ABOVE
+logical_plan after simplify_expressions SAME TEXT AS ABOVE
+logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
+logical_plan after eliminate_join SAME TEXT AS ABOVE
+logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
+logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
+logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
+logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
+logical_plan after eliminate_filter SAME TEXT AS ABOVE
+logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
+logical_plan after eliminate_limit SAME TEXT AS ABOVE
+logical_plan after propagate_empty_relation SAME TEXT AS ABOVE
+logical_plan after filter_null_join_keys SAME TEXT AS ABOVE
+logical_plan after eliminate_outer_join SAME TEXT AS ABOVE
+logical_plan after push_down_limit SAME TEXT AS ABOVE
+logical_plan after push_down_filter SAME TEXT AS ABOVE
+logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE
+logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE
+logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE
+logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE
+logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE
+logical_plan after optimize_projections SAME TEXT AS ABOVE
+logical_plan TableScan: simple_explain_test projection=[a, b, c]
+initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
+initial_physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
+initial_physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, schema=[a:Int32;N, b:Int32;N, c:Int32;N]
+physical_plan after OutputRequirements
+01)OutputRequirementExec: order_by=[], dist_by=Unspecified
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
+physical_plan after aggregate_statistics SAME TEXT AS ABOVE
+physical_plan after join_selection SAME TEXT AS ABOVE
+physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE
+physical_plan after FilterPushdown SAME TEXT AS ABOVE
+physical_plan after EnforceDistribution SAME TEXT AS ABOVE
+physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE
+physical_plan after EnforceSorting SAME TEXT AS ABOVE
+physical_plan after OptimizeAggregateOrder SAME TEXT AS ABOVE
+physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
+physical_plan after OutputRequirements DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
+physical_plan after LimitAggregation SAME TEXT AS ABOVE
+physical_plan after LimitPushPastWindows SAME TEXT AS ABOVE
+physical_plan after HashJoinBuffering SAME TEXT AS ABOVE
+physical_plan after LimitPushdown SAME TEXT AS ABOVE
+physical_plan after TopKRepartition SAME TEXT AS ABOVE
+physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
+physical_plan after PushdownSort SAME TEXT AS ABOVE
+physical_plan after EnsureCooperative SAME TEXT AS ABOVE
+physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE
+physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true
+physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
+physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, schema=[a:Int32;N, b:Int32;N, c:Int32;N]
+
+# Set back to original default value
+statement ok
+set datafusion.explain.format = "indent";
+
 # no such thing as json mode
-query error DataFusion error: Error during planning: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'json'
+query error DataFusion error: Invalid or Unsupported Configuration: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'json'
 explain format json select * from values (1);
 
-query error DataFusion error: Error during planning: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'foo'
+query error DataFusion error: Invalid or Unsupported Configuration: Invalid explain format\. Expected 'indent', 'tree', 'pgjson' or 'graphviz'\. Got 'foo'
 explain format foo select * from values (1);
 
 # pgjson mode
@@ -530,11 +646,11 @@ logical_plan
 02)--{
 03)----"Plan": {
 04)------"Node Type": "Values",
-05)------"Output": [
-06)--------"column1"
-07)------],
-08)------"Plans": [],
-09)------"Values": "(Int64(1))"
+05)------"Values": "(Int64(1))",
+06)------"Plans": [],
+07)------"Output": [
+08)--------"column1"
+09)------]
 10)----}
 11)--}
 12)]
diff --git a/datafusion/sqllogictest/test_files/explain_analyze.slt b/datafusion/sqllogictest/test_files/explain_analyze.slt
new file mode 100644
index 0000000000000..e109b32a95ed1
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/explain_analyze.slt
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.explain.analyze_level = summary;
+
+query TT
+EXPLAIN ANALYZE SELECT * FROM generate_series(100);
+----
+Plan with Metrics LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=0, end=100, batch_size=8192], metrics=[output_rows=101, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>]
+
+# --------------------------------------------
+# Test ProjectionExec's per-expression metrics
+# --------------------------------------------
+
+statement ok
+set datafusion.explain.analyze_level = dev;
+
+# 1 expr
+# Expect metric `expr_0_eval_time` exists in ProjectionExec
+query TT
+EXPLAIN ANALYZE
+SELECT a
+FROM generate_series(1, 100) as t1(a);
+----
+Plan with Metrics
+01)ProjectionExec: expr=[value@0 as a], metrics=[output_rows=100, elapsed_compute=<slt:ignore>, output_bytes=64.0 KB, output_batches=1, expr_0_eval_time=<slt:ignore>]
+<slt:ignore>
+
+# 2 exprs
+# Expect metrics `expr_0_eval_time` and `expr_1_eval_time` exist in ProjectionExec
+query TT
+EXPLAIN ANALYZE
+SELECT a+1, pow(a,2)
+FROM generate_series(1, 100) as t1(a);
+----
+Plan with Metrics
+01)ProjectionExec: expr=[a@0 + 1 as t1.a + Int64(1), power(CAST(a@0 AS Float64), 2) as pow(t1.a,Int64(2))], metrics=[output_rows=100, elapsed_compute=<slt:ignore>, output_bytes=1632.0 B, output_batches=1, expr_0_eval_time=<slt:ignore>, expr_1_eval_time=<slt:ignore>]
+<slt:ignore>
+
+# common expressions
+# Expect metrics `expr_0_eval_time` and `expr_1_eval_time` exist in ProjectionExec
+query TT
+EXPLAIN ANALYZE
+SELECT a+1, a+1 as another_a_plus_one
+FROM generate_series(1, 100) as t1(a);
+----
+Plan with Metrics
+01)ProjectionExec: expr=[__common_expr_1@0 as t1.a + Int64(1), __common_expr_1@0 as another_a_plus_one], metrics=[output_rows=100, elapsed_compute=<slt:ignore>, output_bytes=800.0 B, output_batches=1, expr_0_eval_time=<slt:ignore>, expr_1_eval_time=<slt:ignore>]
+02)--ProjectionExec: expr=[a@0 + 1 as __common_expr_1], metrics=[output_rows=100, elapsed_compute=<slt:ignore>, output_bytes=800.0 B, output_batches=1, expr_0_eval_time=<slt:ignore>]
+<slt:ignore>
+
+statement ok
+reset datafusion.explain.analyze_level;
diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt
index 15bf615765713..3a183a7357430 100644
--- a/datafusion/sqllogictest/test_files/explain_tree.slt
+++ b/datafusion/sqllogictest/test_files/explain_tree.slt
@@ -166,32 +166,26 @@ explain SELECT int_col FROM table1 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│        format: csv        │
-27)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│        format: csv        │
+21)└───────────────────────────┘
 
 # Aggregate
 query TT
@@ -210,44 +204,38 @@ physical_plan
 10)│      FinalPartitioned     │
 11)└─────────────┬─────────────┘
 12)┌─────────────┴─────────────┐
-13)│    CoalesceBatchesExec    │
+13)│      RepartitionExec      │
 14)│    --------------------   │
-15)│     target_batch_size:    │
-16)│            8192           │
-17)└─────────────┬─────────────┘
-18)┌─────────────┴─────────────┐
-19)│      RepartitionExec      │
-20)│    --------------------   │
-21)│ partition_count(in->out): │
-22)│           4 -> 4          │
-23)│                           │
-24)│    partitioning_scheme:   │
-25)│  Hash([string_col@0], 4)  │
-26)└─────────────┬─────────────┘
-27)┌─────────────┴─────────────┐
-28)│       AggregateExec       │
-29)│    --------------------   │
-30)│           aggr:           │
-31)│   sum(table1.bigint_col)  │
-32)│                           │
-33)│    group_by: string_col   │
-34)│       mode: Partial       │
-35)└─────────────┬─────────────┘
-36)┌─────────────┴─────────────┐
-37)│      RepartitionExec      │
-38)│    --------------------   │
-39)│ partition_count(in->out): │
-40)│           1 -> 4          │
-41)│                           │
-42)│    partitioning_scheme:   │
-43)│     RoundRobinBatch(4)    │
-44)└─────────────┬─────────────┘
-45)┌─────────────┴─────────────┐
-46)│       DataSourceExec      │
-47)│    --------------------   │
-48)│          files: 1         │
-49)│        format: csv        │
-50)└───────────────────────────┘
+15)│ partition_count(in->out): │
+16)│           4 -> 4          │
+17)│                           │
+18)│    partitioning_scheme:   │
+19)│  Hash([string_col@0], 4)  │
+20)└─────────────┬─────────────┘
+21)┌─────────────┴─────────────┐
+22)│       AggregateExec       │
+23)│    --------------------   │
+24)│           aggr:           │
+25)│   sum(table1.bigint_col)  │
+26)│                           │
+27)│    group_by: string_col   │
+28)│       mode: Partial       │
+29)└─────────────┬─────────────┘
+30)┌─────────────┴─────────────┐
+31)│      RepartitionExec      │
+32)│    --------------------   │
+33)│ partition_count(in->out): │
+34)│           1 -> 4          │
+35)│                           │
+36)│    partitioning_scheme:   │
+37)│     RoundRobinBatch(4)    │
+38)└─────────────┬─────────────┘
+39)┌─────────────┴─────────────┐
+40)│       DataSourceExec      │
+41)│    --------------------   │
+42)│          files: 1         │
+43)│        format: csv        │
+44)└───────────────────────────┘
 
 
 # Limit
@@ -280,58 +268,54 @@ physical_plan
 06)┌─────────────┴─────────────┐
 07)│       DataSourceExec      │
 08)│    --------------------   │
-09)│        bytes: 3120        │
+09)│        bytes: 1024        │
 10)│       format: memory      │
 11)│          rows: 2          │
 12)└───────────────────────────┘
 
 # 2 Joins
 query TT
-explain SELECT table1.string_col, table2.date_col FROM table1 JOIN table2 ON table1.int_col = table2.int_col;
+EXPLAIN SELECT table1.string_col, table2.date_col
+FROM table1 
+JOIN table2 
+ON 
+  (table1.int_col = table2.int_col)
+  AND (((table1.int_col + table2.int_col) % 2) = 0)
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│       ProjectionExec      │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│        HashJoinExec       │
-09)│    --------------------   │
-10)│            on:            ├──────────────┐
-11)│    (int_col = int_col)    │              │
-12)└─────────────┬─────────────┘              │
-13)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-14)│    CoalesceBatchesExec    ││    CoalesceBatchesExec    │
-15)│    --------------------   ││    --------------------   │
-16)│     target_batch_size:    ││     target_batch_size:    │
-17)│            8192           ││            8192           │
-18)└─────────────┬─────────────┘└─────────────┬─────────────┘
+04)│     date_col: date_col    │
+05)│                           │
+06)│        string_col:        │
+07)│         string_col        │
+08)└─────────────┬─────────────┘
+09)┌─────────────┴─────────────┐
+10)│        HashJoinExec       │
+11)│    --------------------   │
+12)│          filter:          │
+13)│ CAST(int_col + int_col AS │
+14)│       Int64) % 2 = 0      ├──────────────┐
+15)│                           │              │
+16)│            on:            │              │
+17)│    (int_col = int_col)    │              │
+18)└─────────────┬─────────────┘              │
 19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-20)│      RepartitionExec      ││      RepartitionExec      │
+20)│       DataSourceExec      ││      RepartitionExec      │
 21)│    --------------------   ││    --------------------   │
-22)│ partition_count(in->out): ││ partition_count(in->out): │
-23)│           4 -> 4          ││           4 -> 4          │
+22)│          files: 1         ││ partition_count(in->out): │
+23)│      format: parquet      ││           1 -> 4          │
 24)│                           ││                           │
-25)│    partitioning_scheme:   ││    partitioning_scheme:   │
-26)│    Hash([int_col@0], 4)   ││    Hash([int_col@0], 4)   │
-27)└─────────────┬─────────────┘└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-29)│      RepartitionExec      ││      RepartitionExec      │
-30)│    --------------------   ││    --------------------   │
-31)│ partition_count(in->out): ││ partition_count(in->out): │
-32)│           1 -> 4          ││           1 -> 4          │
-33)│                           ││                           │
-34)│    partitioning_scheme:   ││    partitioning_scheme:   │
-35)│     RoundRobinBatch(4)    ││     RoundRobinBatch(4)    │
-36)└─────────────┬─────────────┘└─────────────┬─────────────┘
-37)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-38)│       DataSourceExec      ││       DataSourceExec      │
-39)│    --------------------   ││    --------------------   │
-40)│          files: 1         ││          files: 1         │
-41)│        format: csv        ││      format: parquet      │
-42)└───────────────────────────┘└───────────────────────────┘
+25)│                           ││    partitioning_scheme:   │
+26)│                           ││     RoundRobinBatch(4)    │
+27)└───────────────────────────┘└─────────────┬─────────────┘
+28)-----------------------------┌─────────────┴─────────────┐
+29)-----------------------------│       DataSourceExec      │
+30)-----------------------------│    --------------------   │
+31)-----------------------------│          files: 1         │
+32)-----------------------------│        format: csv        │
+33)-----------------------------└───────────────────────────┘
 
 # 3 Joins
 query TT
@@ -353,60 +337,41 @@ physical_plan
 07)│         string_col        │
 08)└─────────────┬─────────────┘
 09)┌─────────────┴─────────────┐
-10)│    CoalesceBatchesExec    │
+10)│        HashJoinExec       │
 11)│    --------------------   │
-12)│     target_batch_size:    │
-13)│            8192           │
-14)└─────────────┬─────────────┘
-15)┌─────────────┴─────────────┐
-16)│        HashJoinExec       │
-17)│    --------------------   │
-18)│            on:            ├──────────────┐
-19)│    (int_col = int_col)    │              │
-20)└─────────────┬─────────────┘              │
-21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-22)│       DataSourceExec      ││    CoalesceBatchesExec    │
-23)│    --------------------   ││    --------------------   │
-24)│        bytes: 1560        ││     target_batch_size:    │
-25)│       format: memory      ││            8192           │
-26)│          rows: 1          ││                           │
-27)└───────────────────────────┘└─────────────┬─────────────┘
-28)-----------------------------┌─────────────┴─────────────┐
-29)-----------------------------│        HashJoinExec       │
-30)-----------------------------│    --------------------   │
-31)-----------------------------│            on:            ├──────────────┐
-32)-----------------------------│    (int_col = int_col)    │              │
-33)-----------------------------└─────────────┬─────────────┘              │
-34)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-35)-----------------------------│    CoalesceBatchesExec    ││    CoalesceBatchesExec    │
-36)-----------------------------│    --------------------   ││    --------------------   │
-37)-----------------------------│     target_batch_size:    ││     target_batch_size:    │
-38)-----------------------------│            8192           ││            8192           │
-39)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-40)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-41)-----------------------------│      RepartitionExec      ││      RepartitionExec      │
-42)-----------------------------│    --------------------   ││    --------------------   │
-43)-----------------------------│ partition_count(in->out): ││ partition_count(in->out): │
-44)-----------------------------│           4 -> 4          ││           4 -> 4          │
-45)-----------------------------│                           ││                           │
-46)-----------------------------│    partitioning_scheme:   ││    partitioning_scheme:   │
-47)-----------------------------│    Hash([int_col@0], 4)   ││    Hash([int_col@0], 4)   │
-48)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-49)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-50)-----------------------------│      RepartitionExec      ││      RepartitionExec      │
-51)-----------------------------│    --------------------   ││    --------------------   │
-52)-----------------------------│ partition_count(in->out): ││ partition_count(in->out): │
-53)-----------------------------│           1 -> 4          ││           1 -> 4          │
-54)-----------------------------│                           ││                           │
-55)-----------------------------│    partitioning_scheme:   ││    partitioning_scheme:   │
-56)-----------------------------│     RoundRobinBatch(4)    ││     RoundRobinBatch(4)    │
-57)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-58)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-59)-----------------------------│       DataSourceExec      ││       DataSourceExec      │
-60)-----------------------------│    --------------------   ││    --------------------   │
-61)-----------------------------│          files: 1         ││          files: 1         │
-62)-----------------------------│        format: csv        ││      format: parquet      │
-63)-----------------------------└───────────────────────────┘└───────────────────────────┘
+12)│            on:            ├──────────────┐
+13)│    (int_col = int_col)    │              │
+14)└─────────────┬─────────────┘              │
+15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+16)│       DataSourceExec      ││       ProjectionExec      │
+17)│    --------------------   ││    --------------------   │
+18)│         bytes: 512        ││     date_col: date_col    │
+19)│       format: memory      ││      int_col: int_col     │
+20)│          rows: 1          ││                           │
+21)│                           ││        string_col:        │
+22)│                           ││         string_col        │
+23)└───────────────────────────┘└─────────────┬─────────────┘
+24)-----------------------------┌─────────────┴─────────────┐
+25)-----------------------------│        HashJoinExec       │
+26)-----------------------------│    --------------------   │
+27)-----------------------------│            on:            ├──────────────┐
+28)-----------------------------│    (int_col = int_col)    │              │
+29)-----------------------------└─────────────┬─────────────┘              │
+30)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+31)-----------------------------│       DataSourceExec      ││      RepartitionExec      │
+32)-----------------------------│    --------------------   ││    --------------------   │
+33)-----------------------------│          files: 1         ││ partition_count(in->out): │
+34)-----------------------------│      format: parquet      ││           1 -> 4          │
+35)-----------------------------│                           ││                           │
+36)-----------------------------│         predicate:        ││    partitioning_scheme:   │
+37)-----------------------------│  DynamicFilter [ empty ]  ││     RoundRobinBatch(4)    │
+38)-----------------------------└───────────────────────────┘└─────────────┬─────────────┘
+39)----------------------------------------------------------┌─────────────┴─────────────┐
+40)----------------------------------------------------------│       DataSourceExec      │
+41)----------------------------------------------------------│    --------------------   │
+42)----------------------------------------------------------│          files: 1         │
+43)----------------------------------------------------------│        format: csv        │
+44)----------------------------------------------------------└───────────────────────────┘
 
 # Long Filter (demonstrate what happens with wrapping)
 query TT
@@ -416,36 +381,30 @@ WHERE string_col != 'foo' AND string_col != 'bar' AND string_col != 'a really lo
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
-09)│    --------------------   │
-10)│         predicate:        │
-11)│   string_col != foo AND   │
-12)│      string_col != bar    │
-13)│     AND string_col != a   │
-14)│     really long string    │
-15)│          constant         │
-16)└─────────────┬─────────────┘
-17)┌─────────────┴─────────────┐
-18)│      RepartitionExec      │
-19)│    --------------------   │
-20)│ partition_count(in->out): │
-21)│           1 -> 4          │
-22)│                           │
-23)│    partitioning_scheme:   │
-24)│     RoundRobinBatch(4)    │
-25)└─────────────┬─────────────┘
-26)┌─────────────┴─────────────┐
-27)│       DataSourceExec      │
-28)│    --------------------   │
-29)│          files: 1         │
-30)│        format: csv        │
-31)└───────────────────────────┘
+04)│         predicate:        │
+05)│   string_col != foo AND   │
+06)│      string_col != bar    │
+07)│     AND string_col != a   │
+08)│     really long string    │
+09)│          constant         │
+10)└─────────────┬─────────────┘
+11)┌─────────────┴─────────────┐
+12)│      RepartitionExec      │
+13)│    --------------------   │
+14)│ partition_count(in->out): │
+15)│           1 -> 4          │
+16)│                           │
+17)│    partitioning_scheme:   │
+18)│     RoundRobinBatch(4)    │
+19)└─────────────┬─────────────┘
+20)┌─────────────┴─────────────┐
+21)│       DataSourceExec      │
+22)│    --------------------   │
+23)│          files: 1         │
+24)│        format: csv        │
+25)└───────────────────────────┘
 
 # Check maximum line limit.
 query TT
@@ -454,17 +413,17 @@ WHERE string_col != 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
-09)│    --------------------   │
-10)│         predicate:        │
-11)│       string_col !=       │
-12)│        aaaaaaaaaaaa       │
+04)│         predicate:        │
+05)│       string_col !=       │
+06)│        aaaaaaaaaaaa       │
+07)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
+08)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
+09)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
+10)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
+11)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
+12)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
 13)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
 14)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
 15)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
@@ -485,29 +444,23 @@ physical_plan
 30)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
 31)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
 32)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-33)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-34)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-35)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-36)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-37)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-38)│aaaaaaaaaaaaaaaaaaaaaaaaaaa│
-39)│            ...            │
-40)└─────────────┬─────────────┘
-41)┌─────────────┴─────────────┐
-42)│      RepartitionExec      │
-43)│    --------------------   │
-44)│ partition_count(in->out): │
-45)│           1 -> 4          │
-46)│                           │
-47)│    partitioning_scheme:   │
-48)│     RoundRobinBatch(4)    │
-49)└─────────────┬─────────────┘
-50)┌─────────────┴─────────────┐
-51)│       DataSourceExec      │
-52)│    --------------------   │
-53)│          files: 1         │
-54)│        format: csv        │
-55)└───────────────────────────┘
+33)│            ...            │
+34)└─────────────┬─────────────┘
+35)┌─────────────┴─────────────┐
+36)│      RepartitionExec      │
+37)│    --------------------   │
+38)│ partition_count(in->out): │
+39)│           1 -> 4          │
+40)│                           │
+41)│    partitioning_scheme:   │
+42)│     RoundRobinBatch(4)    │
+43)└─────────────┬─────────────┘
+44)┌─────────────┴─────────────┐
+45)│       DataSourceExec      │
+46)│    --------------------   │
+47)│          files: 1         │
+48)│        format: csv        │
+49)└───────────────────────────┘
 
 # Check exactly the render width.
 query TT
@@ -516,32 +469,26 @@ WHERE string_col != 'aaaaaaaaaaaaa';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│string_col != aaaaaaaaaaaaa│
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│string_col != aaaaaaaaaaaaa│
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│        format: csv        │
-27)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│        format: csv        │
+21)└───────────────────────────┘
 
 # Check with the render witdth + 1.
 query TT
@@ -550,34 +497,28 @@ WHERE string_col != 'aaaaaaaaaaaaaaa';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
-09)│    --------------------   │
-10)│         predicate:        │
-11)│       string_col !=       │
-12)│        aaaaaaaaaaaa       │
-13)│            aaa            │
-14)└─────────────┬─────────────┘
-15)┌─────────────┴─────────────┐
-16)│      RepartitionExec      │
-17)│    --------------------   │
-18)│ partition_count(in->out): │
-19)│           1 -> 4          │
-20)│                           │
-21)│    partitioning_scheme:   │
-22)│     RoundRobinBatch(4)    │
-23)└─────────────┬─────────────┘
-24)┌─────────────┴─────────────┐
-25)│       DataSourceExec      │
-26)│    --------------------   │
-27)│          files: 1         │
-28)│        format: csv        │
-29)└───────────────────────────┘
+04)│         predicate:        │
+05)│       string_col !=       │
+06)│        aaaaaaaaaaaa       │
+07)│            aaa            │
+08)└─────────────┬─────────────┘
+09)┌─────────────┴─────────────┐
+10)│      RepartitionExec      │
+11)│    --------------------   │
+12)│ partition_count(in->out): │
+13)│           1 -> 4          │
+14)│                           │
+15)│    partitioning_scheme:   │
+16)│     RoundRobinBatch(4)    │
+17)└─────────────┬─────────────┘
+18)┌─────────────┴─────────────┐
+19)│       DataSourceExec      │
+20)│    --------------------   │
+21)│          files: 1         │
+22)│        format: csv        │
+23)└───────────────────────────┘
 
 # Query with filter on csv
 query TT
@@ -585,32 +526,26 @@ explain SELECT int_col FROM table1 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│        format: csv        │
-27)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│        format: csv        │
+21)└───────────────────────────┘
 
 
 # Query with filter on parquet
@@ -619,35 +554,29 @@ explain SELECT int_col FROM table2 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│      format: parquet      │
-27)│                           │
-28)│         predicate:        │
-29)│     string_col != foo     │
-30)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│      format: parquet      │
+21)│                           │
+22)│         predicate:        │
+23)│     string_col != foo     │
+24)└───────────────────────────┘
 
 # Query with filter on memory
 query TT
@@ -655,24 +584,18 @@ explain SELECT int_col FROM table3 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│       DataSourceExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│       DataSourceExec      │
-15)│    --------------------   │
-16)│        bytes: 1560        │
-17)│       format: memory      │
-18)│          rows: 1          │
-19)└───────────────────────────┘
+10)│         bytes: 512        │
+11)│       format: memory      │
+12)│          rows: 1          │
+13)└───────────────────────────┘
 
 # Query with filter on json
 query TT
@@ -680,32 +603,26 @@ explain SELECT int_col FROM table4 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│        format: json       │
-27)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│        format: json       │
+21)└───────────────────────────┘
 
 # Query with filter on arrow
 query TT
@@ -713,32 +630,26 @@ explain SELECT int_col FROM table5 WHERE string_col != 'foo';
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│         FilterExec        │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
+04)│         predicate:        │
+05)│     string_col != foo     │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│         FilterExec        │
+08)│      RepartitionExec      │
 09)│    --------------------   │
-10)│         predicate:        │
-11)│     string_col != foo     │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│       format: arrow       │
-27)└───────────────────────────┘
+10)│ partition_count(in->out): │
+11)│           1 -> 4          │
+12)│                           │
+13)│    partitioning_scheme:   │
+14)│     RoundRobinBatch(4)    │
+15)└─────────────┬─────────────┘
+16)┌─────────────┴─────────────┐
+17)│       DataSourceExec      │
+18)│    --------------------   │
+19)│          files: 1         │
+20)│       format: arrow       │
+21)└───────────────────────────┘
 
 
 # Query with window agg.
@@ -1017,33 +928,11 @@ explain SELECT int_col, bigint_col, int_col+bigint_col AS sum_col FROM table2;
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│       ProjectionExec      │
+02)│       DataSourceExec      │
 03)│    --------------------   │
-04)│        bigint_col:        │
-05)│         bigint_col        │
-06)│                           │
-07)│      int_col: int_col     │
-08)│                           │
-09)│          sum_col:         │
-10)│  CAST(int_col AS Int64) + │
-11)│         bigint_col        │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│      RepartitionExec      │
-15)│    --------------------   │
-16)│ partition_count(in->out): │
-17)│           1 -> 4          │
-18)│                           │
-19)│    partitioning_scheme:   │
-20)│     RoundRobinBatch(4)    │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│       DataSourceExec      │
-24)│    --------------------   │
-25)│          files: 1         │
-26)│      format: parquet      │
-27)└───────────────────────────┘
-
+04)│          files: 1         │
+05)│      format: parquet      │
+06)└───────────────────────────┘
 
 # Query with projection on memory
 query TT
@@ -1065,7 +954,7 @@ physical_plan
 13)┌─────────────┴─────────────┐
 14)│       DataSourceExec      │
 15)│    --------------------   │
-16)│        bytes: 1560        │
+16)│         bytes: 512        │
 17)│       format: memory      │
 18)│          rows: 1          │
 19)└───────────────────────────┘
@@ -1186,69 +1075,40 @@ explain select * from table1 inner join table2 on table1.int_col = table2.int_co
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│       ProjectionExec      │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│        HashJoinExec       │
-09)│    --------------------   │
-10)│            on:            │
-11)│ (int_col = int_col), (CAST├──────────────┐
-12)│   (table1.string_col AS   │              │
-13)│         Utf8View) =       │              │
-14)│         string_col)       │              │
-15)└─────────────┬─────────────┘              │
-16)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-17)│    CoalesceBatchesExec    ││    CoalesceBatchesExec    │
-18)│    --------------------   ││    --------------------   │
-19)│     target_batch_size:    ││     target_batch_size:    │
-20)│            8192           ││            8192           │
-21)└─────────────┬─────────────┘└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-23)│      RepartitionExec      ││      RepartitionExec      │
-24)│    --------------------   ││    --------------------   │
-25)│ partition_count(in->out): ││ partition_count(in->out): │
-26)│           4 -> 4          ││           4 -> 4          │
-27)│                           ││                           │
-28)│    partitioning_scheme:   ││    partitioning_scheme:   │
-29)│   Hash([int_col@0, CAST   ││      Hash([int_col@0,     │
-30)│     (table1.string_col    ││       string_col@1],      │
-31)│     AS Utf8View)@4], 4)   ││             4)            │
-32)└─────────────┬─────────────┘└─────────────┬─────────────┘
-33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-34)│       ProjectionExec      ││      RepartitionExec      │
-35)│    --------------------   ││    --------------------   │
-36)│ CAST(table1.string_col AS ││ partition_count(in->out): │
-37)│         Utf8View):        ││           1 -> 4          │
-38)│     CAST(string_col AS    ││                           │
-39)│          Utf8View)        ││    partitioning_scheme:   │
-40)│                           ││     RoundRobinBatch(4)    │
-41)│        bigint_col:        ││                           │
-42)│         bigint_col        ││                           │
-43)│                           ││                           │
-44)│     date_col: date_col    ││                           │
-45)│      int_col: int_col     ││                           │
-46)│                           ││                           │
-47)│        string_col:        ││                           │
-48)│         string_col        ││                           │
-49)└─────────────┬─────────────┘└─────────────┬─────────────┘
-50)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-51)│      RepartitionExec      ││       DataSourceExec      │
-52)│    --------------------   ││    --------------------   │
-53)│ partition_count(in->out): ││          files: 1         │
-54)│           1 -> 4          ││      format: parquet      │
-55)│                           ││                           │
-56)│    partitioning_scheme:   ││                           │
-57)│     RoundRobinBatch(4)    ││                           │
-58)└─────────────┬─────────────┘└───────────────────────────┘
-59)┌─────────────┴─────────────┐
-60)│       DataSourceExec      │
-61)│    --------------------   │
-62)│          files: 1         │
-63)│        format: csv        │
-64)└───────────────────────────┘
+04)│        bigint_col:        │
+05)│         bigint_col        │
+06)│                           │
+07)│     date_col: date_col    │
+08)│      int_col: int_col     │
+09)│                           │
+10)│        string_col:        │
+11)│         string_col        │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│        HashJoinExec       │
+15)│    --------------------   │
+16)│            on:            │
+17)│   (int_col = int_col),    ├──────────────┐
+18)│       (string_col =       │              │
+19)│         string_col)       │              │
+20)└─────────────┬─────────────┘              │
+21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+22)│       DataSourceExec      ││      RepartitionExec      │
+23)│    --------------------   ││    --------------------   │
+24)│          files: 1         ││ partition_count(in->out): │
+25)│      format: parquet      ││           1 -> 4          │
+26)│                           ││                           │
+27)│                           ││    partitioning_scheme:   │
+28)│                           ││     RoundRobinBatch(4)    │
+29)└───────────────────────────┘└─────────────┬─────────────┘
+30)-----------------------------┌─────────────┴─────────────┐
+31)-----------------------------│       DataSourceExec      │
+32)-----------------------------│    --------------------   │
+33)-----------------------------│          files: 1         │
+34)-----------------------------│        format: csv        │
+35)-----------------------------└───────────────────────────┘
 
 # Query with outer hash join.
 query TT
@@ -1256,71 +1116,42 @@ explain select * from table1 left outer join table2 on table1.int_col = table2.i
 ----
 physical_plan
 01)┌───────────────────────────┐
-02)│    CoalesceBatchesExec    │
+02)│       ProjectionExec      │
 03)│    --------------------   │
-04)│     target_batch_size:    │
-05)│            8192           │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│        HashJoinExec       │
-09)│    --------------------   │
-10)│      join_type: Left      │
-11)│                           │
-12)│            on:            ├──────────────┐
-13)│ (int_col = int_col), (CAST│              │
-14)│   (table1.string_col AS   │              │
-15)│         Utf8View) =       │              │
-16)│         string_col)       │              │
-17)└─────────────┬─────────────┘              │
-18)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-19)│    CoalesceBatchesExec    ││    CoalesceBatchesExec    │
-20)│    --------------------   ││    --------------------   │
-21)│     target_batch_size:    ││     target_batch_size:    │
-22)│            8192           ││            8192           │
-23)└─────────────┬─────────────┘└─────────────┬─────────────┘
-24)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-25)│      RepartitionExec      ││      RepartitionExec      │
-26)│    --------------------   ││    --------------------   │
-27)│ partition_count(in->out): ││ partition_count(in->out): │
-28)│           4 -> 4          ││           4 -> 4          │
-29)│                           ││                           │
-30)│    partitioning_scheme:   ││    partitioning_scheme:   │
-31)│   Hash([int_col@0, CAST   ││      Hash([int_col@0,     │
-32)│     (table1.string_col    ││       string_col@1],      │
-33)│     AS Utf8View)@4], 4)   ││             4)            │
-34)└─────────────┬─────────────┘└─────────────┬─────────────┘
-35)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-36)│       ProjectionExec      ││      RepartitionExec      │
-37)│    --------------------   ││    --------------------   │
-38)│ CAST(table1.string_col AS ││ partition_count(in->out): │
-39)│         Utf8View):        ││           1 -> 4          │
-40)│     CAST(string_col AS    ││                           │
-41)│          Utf8View)        ││    partitioning_scheme:   │
-42)│                           ││     RoundRobinBatch(4)    │
-43)│        bigint_col:        ││                           │
-44)│         bigint_col        ││                           │
-45)│                           ││                           │
-46)│     date_col: date_col    ││                           │
-47)│      int_col: int_col     ││                           │
-48)│                           ││                           │
-49)│        string_col:        ││                           │
-50)│         string_col        ││                           │
-51)└─────────────┬─────────────┘└─────────────┬─────────────┘
-52)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-53)│      RepartitionExec      ││       DataSourceExec      │
-54)│    --------------------   ││    --------------------   │
-55)│ partition_count(in->out): ││          files: 1         │
-56)│           1 -> 4          ││      format: parquet      │
-57)│                           ││                           │
-58)│    partitioning_scheme:   ││                           │
-59)│     RoundRobinBatch(4)    ││                           │
-60)└─────────────┬─────────────┘└───────────────────────────┘
-61)┌─────────────┴─────────────┐
-62)│       DataSourceExec      │
-63)│    --------------------   │
-64)│          files: 1         │
-65)│        format: csv        │
-66)└───────────────────────────┘
+04)│        bigint_col:        │
+05)│         bigint_col        │
+06)│                           │
+07)│     date_col: date_col    │
+08)│      int_col: int_col     │
+09)│                           │
+10)│        string_col:        │
+11)│         string_col        │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│        HashJoinExec       │
+15)│    --------------------   │
+16)│      join_type: Right     │
+17)│                           │
+18)│            on:            ├──────────────┐
+19)│   (int_col = int_col),    │              │
+20)│       (string_col =       │              │
+21)│         string_col)       │              │
+22)└─────────────┬─────────────┘              │
+23)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+24)│       DataSourceExec      ││      RepartitionExec      │
+25)│    --------------------   ││    --------------------   │
+26)│          files: 1         ││ partition_count(in->out): │
+27)│      format: parquet      ││           1 -> 4          │
+28)│                           ││                           │
+29)│                           ││    partitioning_scheme:   │
+30)│                           ││     RoundRobinBatch(4)    │
+31)└───────────────────────────┘└─────────────┬─────────────┘
+32)-----------------------------┌─────────────┴─────────────┐
+33)-----------------------------│       DataSourceExec      │
+34)-----------------------------│    --------------------   │
+35)-----------------------------│          files: 1         │
+36)-----------------------------│        format: csv        │
+37)-----------------------------└───────────────────────────┘
 
 # Query with nested loop join.
 query TT
@@ -1333,41 +1164,11 @@ physical_plan
 04)│    join_type: LeftSemi    │              │
 05)└─────────────┬─────────────┘              │
 06)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-07)│       DataSourceExec      ││       ProjectionExec      │
+07)│       DataSourceExec      ││     PlaceholderRowExec    │
 08)│    --------------------   ││                           │
 09)│          files: 1         ││                           │
 10)│        format: csv        ││                           │
-11)└───────────────────────────┘└─────────────┬─────────────┘
-12)-----------------------------┌─────────────┴─────────────┐
-13)-----------------------------│       AggregateExec       │
-14)-----------------------------│    --------------------   │
-15)-----------------------------│       aggr: count(1)      │
-16)-----------------------------│        mode: Final        │
-17)-----------------------------└─────────────┬─────────────┘
-18)-----------------------------┌─────────────┴─────────────┐
-19)-----------------------------│   CoalescePartitionsExec  │
-20)-----------------------------└─────────────┬─────────────┘
-21)-----------------------------┌─────────────┴─────────────┐
-22)-----------------------------│       AggregateExec       │
-23)-----------------------------│    --------------------   │
-24)-----------------------------│       aggr: count(1)      │
-25)-----------------------------│       mode: Partial       │
-26)-----------------------------└─────────────┬─────────────┘
-27)-----------------------------┌─────────────┴─────────────┐
-28)-----------------------------│      RepartitionExec      │
-29)-----------------------------│    --------------------   │
-30)-----------------------------│ partition_count(in->out): │
-31)-----------------------------│           1 -> 4          │
-32)-----------------------------│                           │
-33)-----------------------------│    partitioning_scheme:   │
-34)-----------------------------│     RoundRobinBatch(4)    │
-35)-----------------------------└─────────────┬─────────────┘
-36)-----------------------------┌─────────────┴─────────────┐
-37)-----------------------------│       DataSourceExec      │
-38)-----------------------------│    --------------------   │
-39)-----------------------------│          files: 1         │
-40)-----------------------------│      format: parquet      │
-41)-----------------------------└───────────────────────────┘
+11)└───────────────────────────┘└───────────────────────────┘
 
 # Query with cross join.
 query TT
@@ -1378,21 +1179,11 @@ physical_plan
 02)│       CrossJoinExec       ├──────────────┐
 03)└─────────────┬─────────────┘              │
 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-05)│       DataSourceExec      ││      RepartitionExec      │
+05)│       DataSourceExec      ││       DataSourceExec      │
 06)│    --------------------   ││    --------------------   │
-07)│          files: 1         ││ partition_count(in->out): │
-08)│        format: csv        ││           1 -> 4          │
-09)│                           ││                           │
-10)│                           ││    partitioning_scheme:   │
-11)│                           ││     RoundRobinBatch(4)    │
-12)└───────────────────────────┘└─────────────┬─────────────┘
-13)-----------------------------┌─────────────┴─────────────┐
-14)-----------------------------│       DataSourceExec      │
-15)-----------------------------│    --------------------   │
-16)-----------------------------│          files: 1         │
-17)-----------------------------│      format: parquet      │
-18)-----------------------------└───────────────────────────┘
-
+07)│          files: 1         ││          files: 1         │
+08)│        format: csv        ││      format: parquet      │
+09)└───────────────────────────┘└───────────────────────────┘
 
 # Query with sort merge join.
 statement ok
@@ -1415,7 +1206,7 @@ physical_plan
 11)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
 12)│       DataSourceExec      ││       DataSourceExec      │
 13)│    --------------------   ││    --------------------   │
-14)│        bytes: 6040        ││        bytes: 6040        │
+14)│        bytes: 5932        ││        bytes: 5932        │
 15)│       format: memory      ││       format: memory      │
 16)│          rows: 1          ││          rows: 1          │
 17)└───────────────────────────┘└───────────────────────────┘
@@ -1497,42 +1288,27 @@ physical_plan
 25)│      FinalPartitioned     ││      FinalPartitioned     │
 26)└─────────────┬─────────────┘└─────────────┬─────────────┘
 27)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-28)│    CoalesceBatchesExec    ││    CoalesceBatchesExec    │
+28)│      RepartitionExec      ││      RepartitionExec      │
 29)│    --------------------   ││    --------------------   │
-30)│     target_batch_size:    ││     target_batch_size:    │
-31)│            8192           ││            8192           │
-32)└─────────────┬─────────────┘└─────────────┬─────────────┘
-33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-34)│      RepartitionExec      ││      RepartitionExec      │
-35)│    --------------------   ││    --------------------   │
-36)│ partition_count(in->out): ││ partition_count(in->out): │
-37)│           4 -> 4          ││           4 -> 4          │
-38)│                           ││                           │
-39)│    partitioning_scheme:   ││    partitioning_scheme:   │
-40)│     Hash([name@0], 4)     ││     Hash([name@0], 4)     │
+30)│ partition_count(in->out): ││ partition_count(in->out): │
+31)│           1 -> 4          ││           1 -> 4          │
+32)│                           ││                           │
+33)│    partitioning_scheme:   ││    partitioning_scheme:   │
+34)│     Hash([name@0], 4)     ││     Hash([name@0], 4)     │
+35)└─────────────┬─────────────┘└─────────────┬─────────────┘
+36)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+37)│       AggregateExec       ││       AggregateExec       │
+38)│    --------------------   ││    --------------------   │
+39)│       group_by: name      ││       group_by: name      │
+40)│       mode: Partial       ││       mode: Partial       │
 41)└─────────────┬─────────────┘└─────────────┬─────────────┘
 42)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-43)│      RepartitionExec      ││      RepartitionExec      │
+43)│       DataSourceExec      ││       DataSourceExec      │
 44)│    --------------------   ││    --------------------   │
-45)│ partition_count(in->out): ││ partition_count(in->out): │
-46)│           1 -> 4          ││           1 -> 4          │
-47)│                           ││                           │
-48)│    partitioning_scheme:   ││    partitioning_scheme:   │
-49)│     RoundRobinBatch(4)    ││     RoundRobinBatch(4)    │
-50)└─────────────┬─────────────┘└─────────────┬─────────────┘
-51)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-52)│       AggregateExec       ││       AggregateExec       │
-53)│    --------------------   ││    --------------------   │
-54)│       group_by: name      ││       group_by: name      │
-55)│       mode: Partial       ││       mode: Partial       │
-56)└─────────────┬─────────────┘└─────────────┬─────────────┘
-57)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-58)│       DataSourceExec      ││       DataSourceExec      │
-59)│    --------------------   ││    --------------------   │
-60)│        bytes: 1320        ││        bytes: 1312        │
-61)│       format: memory      ││       format: memory      │
-62)│          rows: 1          ││          rows: 1          │
-63)└───────────────────────────┘└───────────────────────────┘
+45)│         bytes: 288        ││         bytes: 280        │
+46)│       format: memory      ││       format: memory      │
+47)│          rows: 1          ││          rows: 1          │
+48)└───────────────────────────┘└───────────────────────────┘
 
 # Test explain tree for UnionExec
 query TT
@@ -1548,14 +1324,14 @@ physical_plan
 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
 05)│       DataSourceExec      ││       ProjectionExec      │
 06)│    --------------------   ││    --------------------   │
-07)│        bytes: 1320        ││   id: CAST(id AS Int32)   │
+07)│         bytes: 288        ││   id: CAST(id AS Int32)   │
 08)│       format: memory      ││         name: name        │
 09)│          rows: 1          ││                           │
 10)└───────────────────────────┘└─────────────┬─────────────┘
 11)-----------------------------┌─────────────┴─────────────┐
 12)-----------------------------│       DataSourceExec      │
 13)-----------------------------│    --------------------   │
-14)-----------------------------│        bytes: 1312        │
+14)-----------------------------│         bytes: 280        │
 15)-----------------------------│       format: memory      │
 16)-----------------------------│          rows: 1          │
 17)-----------------------------└───────────────────────────┘
@@ -1593,31 +1369,25 @@ physical_plan
 05)│       ASC NULLS LAST      │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│    CoalesceBatchesExec    │
+08)│         FilterExec        │
 09)│    --------------------   │
-10)│     target_batch_size:    │
-11)│            8192           │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│         FilterExec        │
-15)│    --------------------   │
-16)│   predicate: ticker = A   │
-17)└─────────────┬─────────────┘
-18)┌─────────────┴─────────────┐
-19)│      RepartitionExec      │
-20)│    --------------------   │
-21)│ partition_count(in->out): │
-22)│           1 -> 4          │
-23)│                           │
-24)│    partitioning_scheme:   │
-25)│     RoundRobinBatch(4)    │
-26)└─────────────┬─────────────┘
-27)┌─────────────┴─────────────┐
-28)│     StreamingTableExec    │
-29)│    --------------------   │
-30)│       infinite: true      │
-31)│        limit: None        │
-32)└───────────────────────────┘
+10)│   predicate: ticker = A   │
+11)└─────────────┬─────────────┘
+12)┌─────────────┴─────────────┐
+13)│      RepartitionExec      │
+14)│    --------------------   │
+15)│ partition_count(in->out): │
+16)│           1 -> 4          │
+17)│                           │
+18)│    partitioning_scheme:   │
+19)│     RoundRobinBatch(4)    │
+20)└─────────────┬─────────────┘
+21)┌─────────────┴─────────────┐
+22)│     StreamingTableExec    │
+23)│    --------------------   │
+24)│       infinite: true      │
+25)│        limit: None        │
+26)└───────────────────────────┘
 
 
 # constant ticker, CAST(time AS DATE) = time, order by time
@@ -1633,33 +1403,27 @@ physical_plan
 04)│    time ASC NULLS LAST    │
 05)└─────────────┬─────────────┘
 06)┌─────────────┴─────────────┐
-07)│    CoalesceBatchesExec    │
+07)│         FilterExec        │
 08)│    --------------------   │
-09)│     target_batch_size:    │
-10)│            8192           │
-11)└─────────────┬─────────────┘
-12)┌─────────────┴─────────────┐
-13)│         FilterExec        │
-14)│    --------------------   │
-15)│         predicate:        │
-16)│  ticker = A AND CAST(time │
-17)│      AS Date32) = date    │
-18)└─────────────┬─────────────┘
-19)┌─────────────┴─────────────┐
-20)│      RepartitionExec      │
-21)│    --------------------   │
-22)│ partition_count(in->out): │
-23)│           1 -> 4          │
-24)│                           │
-25)│    partitioning_scheme:   │
-26)│     RoundRobinBatch(4)    │
-27)└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐
-29)│     StreamingTableExec    │
-30)│    --------------------   │
-31)│       infinite: true      │
-32)│        limit: None        │
-33)└───────────────────────────┘
+09)│         predicate:        │
+10)│  ticker = A AND CAST(time │
+11)│      AS Date32) = date    │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│      RepartitionExec      │
+15)│    --------------------   │
+16)│ partition_count(in->out): │
+17)│           1 -> 4          │
+18)│                           │
+19)│    partitioning_scheme:   │
+20)│     RoundRobinBatch(4)    │
+21)└─────────────┬─────────────┘
+22)┌─────────────┴─────────────┐
+23)│     StreamingTableExec    │
+24)│    --------------------   │
+25)│       infinite: true      │
+26)│        limit: None        │
+27)└───────────────────────────┘
 
 # same thing but order by date
 query TT
@@ -1674,33 +1438,27 @@ physical_plan
 04)│    date ASC NULLS LAST    │
 05)└─────────────┬─────────────┘
 06)┌─────────────┴─────────────┐
-07)│    CoalesceBatchesExec    │
+07)│         FilterExec        │
 08)│    --------------------   │
-09)│     target_batch_size:    │
-10)│            8192           │
-11)└─────────────┬─────────────┘
-12)┌─────────────┴─────────────┐
-13)│         FilterExec        │
-14)│    --------------------   │
-15)│         predicate:        │
-16)│  ticker = A AND CAST(time │
-17)│      AS Date32) = date    │
-18)└─────────────┬─────────────┘
-19)┌─────────────┴─────────────┐
-20)│      RepartitionExec      │
-21)│    --------------------   │
-22)│ partition_count(in->out): │
-23)│           1 -> 4          │
-24)│                           │
-25)│    partitioning_scheme:   │
-26)│     RoundRobinBatch(4)    │
-27)└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐
-29)│     StreamingTableExec    │
-30)│    --------------------   │
-31)│       infinite: true      │
-32)│        limit: None        │
-33)└───────────────────────────┘
+09)│         predicate:        │
+10)│  ticker = A AND CAST(time │
+11)│      AS Date32) = date    │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│      RepartitionExec      │
+15)│    --------------------   │
+16)│ partition_count(in->out): │
+17)│           1 -> 4          │
+18)│                           │
+19)│    partitioning_scheme:   │
+20)│     RoundRobinBatch(4)    │
+21)└─────────────┬─────────────┘
+22)┌─────────────┴─────────────┐
+23)│     StreamingTableExec    │
+24)│    --------------------   │
+25)│       infinite: true      │
+26)│        limit: None        │
+27)└───────────────────────────┘
 
 # same thing but order by ticker
 query TT
@@ -1713,33 +1471,27 @@ physical_plan
 02)│   CoalescePartitionsExec  │
 03)└─────────────┬─────────────┘
 04)┌─────────────┴─────────────┐
-05)│    CoalesceBatchesExec    │
+05)│         FilterExec        │
 06)│    --------------------   │
-07)│     target_batch_size:    │
-08)│            8192           │
-09)└─────────────┬─────────────┘
-10)┌─────────────┴─────────────┐
-11)│         FilterExec        │
-12)│    --------------------   │
-13)│         predicate:        │
-14)│  ticker = A AND CAST(time │
-15)│      AS Date32) = date    │
-16)└─────────────┬─────────────┘
-17)┌─────────────┴─────────────┐
-18)│      RepartitionExec      │
-19)│    --------------------   │
-20)│ partition_count(in->out): │
-21)│           1 -> 4          │
-22)│                           │
-23)│    partitioning_scheme:   │
-24)│     RoundRobinBatch(4)    │
-25)└─────────────┬─────────────┘
-26)┌─────────────┴─────────────┐
-27)│     StreamingTableExec    │
-28)│    --------------------   │
-29)│       infinite: true      │
-30)│        limit: None        │
-31)└───────────────────────────┘
+07)│         predicate:        │
+08)│  ticker = A AND CAST(time │
+09)│      AS Date32) = date    │
+10)└─────────────┬─────────────┘
+11)┌─────────────┴─────────────┐
+12)│      RepartitionExec      │
+13)│    --------------------   │
+14)│ partition_count(in->out): │
+15)│           1 -> 4          │
+16)│                           │
+17)│    partitioning_scheme:   │
+18)│     RoundRobinBatch(4)    │
+19)└─────────────┬─────────────┘
+20)┌─────────────┴─────────────┐
+21)│     StreamingTableExec    │
+22)│    --------------------   │
+23)│       infinite: true      │
+24)│        limit: None        │
+25)└───────────────────────────┘
 
 
 # same thing but order by time, date
@@ -1756,33 +1508,67 @@ physical_plan
 05)│       ASC NULLS LAST      │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│    CoalesceBatchesExec    │
+08)│         FilterExec        │
 09)│    --------------------   │
-10)│     target_batch_size:    │
-11)│            8192           │
-12)└─────────────┬─────────────┘
-13)┌─────────────┴─────────────┐
-14)│         FilterExec        │
-15)│    --------------------   │
-16)│         predicate:        │
-17)│  ticker = A AND CAST(time │
-18)│      AS Date32) = date    │
-19)└─────────────┬─────────────┘
-20)┌─────────────┴─────────────┐
-21)│      RepartitionExec      │
-22)│    --------------------   │
-23)│ partition_count(in->out): │
-24)│           1 -> 4          │
-25)│                           │
-26)│    partitioning_scheme:   │
-27)│     RoundRobinBatch(4)    │
-28)└─────────────┬─────────────┘
-29)┌─────────────┴─────────────┐
-30)│     StreamingTableExec    │
-31)│    --------------------   │
-32)│       infinite: true      │
-33)│        limit: None        │
-34)└───────────────────────────┘
+10)│         predicate:        │
+11)│  ticker = A AND CAST(time │
+12)│      AS Date32) = date    │
+13)└─────────────┬─────────────┘
+14)┌─────────────┴─────────────┐
+15)│      RepartitionExec      │
+16)│    --------------------   │
+17)│ partition_count(in->out): │
+18)│           1 -> 4          │
+19)│                           │
+20)│    partitioning_scheme:   │
+21)│     RoundRobinBatch(4)    │
+22)└─────────────┬─────────────┘
+23)┌─────────────┴─────────────┐
+24)│     StreamingTableExec    │
+25)│    --------------------   │
+26)│       infinite: true      │
+27)│        limit: None        │
+28)└───────────────────────────┘
+
+
+
+# query
+query TT
+explain SELECT * FROM data 
+WHERE date = '2006-01-02'
+ORDER BY "ticker", "time"
+LIMIT 5;
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│  SortPreservingMergeExec  │
+03)│    --------------------   │
+04)│          limit: 5         │
+05)│                           │
+06)│   ticker ASC NULLS LAST,  │
+07)│     time ASC NULLS LAST   │
+08)└─────────────┬─────────────┘
+09)┌─────────────┴─────────────┐
+10)│         FilterExec        │
+11)│    --------------------   │
+12)│         predicate:        │
+13)│     date = 2006-01-02     │
+14)└─────────────┬─────────────┘
+15)┌─────────────┴─────────────┐
+16)│      RepartitionExec      │
+17)│    --------------------   │
+18)│ partition_count(in->out): │
+19)│           1 -> 4          │
+20)│                           │
+21)│    partitioning_scheme:   │
+22)│     RoundRobinBatch(4)    │
+23)└─────────────┬─────────────┘
+24)┌─────────────┴─────────────┐
+25)│     StreamingTableExec    │
+26)│    --------------------   │
+27)│       infinite: true      │
+28)│        limit: None        │
+29)└───────────────────────────┘
 
 
 
@@ -1801,32 +1587,26 @@ physical_plan
 05)│     time ASC NULLS LAST   │
 06)└─────────────┬─────────────┘
 07)┌─────────────┴─────────────┐
-08)│    CoalesceBatchesExec    │
+08)│         FilterExec        │
 09)│    --------------------   │
-10)│     target_batch_size:    │
-11)│            8192           │
+10)│         predicate:        │
+11)│     date = 2006-01-02     │
 12)└─────────────┬─────────────┘
 13)┌─────────────┴─────────────┐
-14)│         FilterExec        │
+14)│      RepartitionExec      │
 15)│    --------------------   │
-16)│         predicate:        │
-17)│     date = 2006-01-02     │
-18)└─────────────┬─────────────┘
-19)┌─────────────┴─────────────┐
-20)│      RepartitionExec      │
-21)│    --------------------   │
-22)│ partition_count(in->out): │
-23)│           1 -> 4          │
-24)│                           │
-25)│    partitioning_scheme:   │
-26)│     RoundRobinBatch(4)    │
-27)└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐
-29)│     StreamingTableExec    │
-30)│    --------------------   │
-31)│       infinite: true      │
-32)│        limit: None        │
-33)└───────────────────────────┘
+16)│ partition_count(in->out): │
+17)│           1 -> 4          │
+18)│                           │
+19)│    partitioning_scheme:   │
+20)│     RoundRobinBatch(4)    │
+21)└─────────────┬─────────────┘
+22)┌─────────────┴─────────────┐
+23)│     StreamingTableExec    │
+24)│    --------------------   │
+25)│       infinite: true      │
+26)│        limit: None        │
+27)└───────────────────────────┘
 
 
 
@@ -1856,30 +1636,24 @@ physical_plan
 12)│                           ││         id: id + 1        │
 13)└───────────────────────────┘└─────────────┬─────────────┘
 14)-----------------------------┌─────────────┴─────────────┐
-15)-----------------------------│    CoalesceBatchesExec    │
+15)-----------------------------│         FilterExec        │
 16)-----------------------------│    --------------------   │
-17)-----------------------------│     target_batch_size:    │
-18)-----------------------------│            8192           │
-19)-----------------------------└─────────────┬─────────────┘
-20)-----------------------------┌─────────────┴─────────────┐
-21)-----------------------------│         FilterExec        │
-22)-----------------------------│    --------------------   │
-23)-----------------------------│     predicate: id < 10    │
-24)-----------------------------└─────────────┬─────────────┘
-25)-----------------------------┌─────────────┴─────────────┐
-26)-----------------------------│      RepartitionExec      │
-27)-----------------------------│    --------------------   │
-28)-----------------------------│ partition_count(in->out): │
-29)-----------------------------│           1 -> 4          │
-30)-----------------------------│                           │
-31)-----------------------------│    partitioning_scheme:   │
-32)-----------------------------│     RoundRobinBatch(4)    │
-33)-----------------------------└─────────────┬─────────────┘
-34)-----------------------------┌─────────────┴─────────────┐
-35)-----------------------------│       WorkTableExec       │
-36)-----------------------------│    --------------------   │
-37)-----------------------------│        name: nodes        │
-38)-----------------------------└───────────────────────────┘
+17)-----------------------------│     predicate: id < 10    │
+18)-----------------------------└─────────────┬─────────────┘
+19)-----------------------------┌─────────────┴─────────────┐
+20)-----------------------------│      RepartitionExec      │
+21)-----------------------------│    --------------------   │
+22)-----------------------------│ partition_count(in->out): │
+23)-----------------------------│           1 -> 4          │
+24)-----------------------------│                           │
+25)-----------------------------│    partitioning_scheme:   │
+26)-----------------------------│     RoundRobinBatch(4)    │
+27)-----------------------------└─────────────┬─────────────┘
+28)-----------------------------┌─────────────┴─────────────┐
+29)-----------------------------│       WorkTableExec       │
+30)-----------------------------│    --------------------   │
+31)-----------------------------│        name: nodes        │
+32)-----------------------------└───────────────────────────┘
 
 query TT
 explain COPY (VALUES (1, 'foo', 1, '2023-01-01'), (2, 'bar', 2, '2023-01-02'), (3, 'baz', 3, '2023-01-03'))
@@ -1899,7 +1673,7 @@ physical_plan
 11)┌─────────────┴─────────────┐
 12)│       DataSourceExec      │
 13)│    --------------------   │
-14)│        bytes: 2672        │
+14)│        bytes: 2576        │
 15)│       format: memory      │
 16)│          rows: 1          │
 17)└───────────────────────────┘
@@ -1922,7 +1696,7 @@ physical_plan
 11)┌─────────────┴─────────────┐
 12)│       DataSourceExec      │
 13)│    --------------------   │
-14)│        bytes: 2672        │
+14)│        bytes: 2576        │
 15)│       format: memory      │
 16)│          rows: 1          │
 17)└───────────────────────────┘
@@ -1945,7 +1719,7 @@ physical_plan
 11)┌─────────────┴─────────────┐
 12)│       DataSourceExec      │
 13)│    --------------------   │
-14)│        bytes: 2672        │
+14)│        bytes: 2576        │
 15)│       format: memory      │
 16)│          rows: 1          │
 17)└───────────────────────────┘
@@ -1999,25 +1773,17 @@ physical_plan
 38)│          skip: 6          │
 39)└─────────────┬─────────────┘
 40)┌─────────────┴─────────────┐
-41)│    CoalesceBatchesExec    │
+41)│         FilterExec        │
 42)│    --------------------   │
-43)│          limit: 9         │
-44)│                           │
-45)│     target_batch_size:    │
-46)│            8192           │
-47)└─────────────┬─────────────┘
-48)┌─────────────┴─────────────┐
-49)│         FilterExec        │
-50)│    --------------------   │
-51)│      predicate: a > 3     │
-52)└─────────────┬─────────────┘
-53)┌─────────────┴─────────────┐
-54)│       DataSourceExec      │
-55)│    --------------------   │
-56)│         bytes: 160        │
-57)│       format: memory      │
-58)│          rows: 1          │
-59)└───────────────────────────┘
+43)│      predicate: a > 3     │
+44)└─────────────┬─────────────┘
+45)┌─────────────┴─────────────┐
+46)│       DataSourceExec      │
+47)│    --------------------   │
+48)│         bytes: 160        │
+49)│       format: memory      │
+50)│          rows: 1          │
+51)└───────────────────────────┘
 
 # clean up
 statement ok
@@ -2047,33 +1813,25 @@ physical_plan
 04)│          limit: 5         │
 05)└─────────────┬─────────────┘
 06)┌─────────────┴─────────────┐
-07)│    CoalesceBatchesExec    │
+07)│         FilterExec        │
 08)│    --------------------   │
-09)│          limit: 5         │
-10)│                           │
-11)│     target_batch_size:    │
-12)│            8192           │
-13)└─────────────┬─────────────┘
-14)┌─────────────┴─────────────┐
-15)│         FilterExec        │
-16)│    --------------------   │
-17)│     predicate: c3 > 0     │
-18)└─────────────┬─────────────┘
-19)┌─────────────┴─────────────┐
-20)│      RepartitionExec      │
-21)│    --------------------   │
-22)│ partition_count(in->out): │
-23)│           1 -> 4          │
-24)│                           │
-25)│    partitioning_scheme:   │
-26)│     RoundRobinBatch(4)    │
-27)└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐
-29)│     StreamingTableExec    │
-30)│    --------------------   │
-31)│       infinite: true      │
-32)│        limit: None        │
-33)└───────────────────────────┘
+09)│     predicate: c3 > 0     │
+10)└─────────────┬─────────────┘
+11)┌─────────────┴─────────────┐
+12)│      RepartitionExec      │
+13)│    --------------------   │
+14)│ partition_count(in->out): │
+15)│           1 -> 4          │
+16)│                           │
+17)│    partitioning_scheme:   │
+18)│     RoundRobinBatch(4)    │
+19)└─────────────┬─────────────┘
+20)┌─────────────┴─────────────┐
+21)│     StreamingTableExec    │
+22)│    --------------------   │
+23)│       infinite: true      │
+24)│        limit: None        │
+25)└───────────────────────────┘
 
 # Test explain tree for PlaceholderRowExec
 query TT
@@ -2088,3 +1846,234 @@ physical_plan
 06)┌─────────────┴─────────────┐
 07)│     PlaceholderRowExec    │
 08)└───────────────────────────┘
+
+
+# Test explain for large plans
+
+statement ok
+CREATE TABLE t (k int)
+
+# By default, the plan of this large query is cropped
+query TT
+EXPLAIN SELECT * FROM t t1, t t2, t t3, t t4, t t5, t t6, t t7, t t8, t t9, t t10 
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│       CrossJoinExec       ├────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
+03)└─────────────┬─────────────┘                                                                                                                                                                                                                                        
+04)┌─────────────┴─────────────┐                                                                                                                                                                                                                                        
+05)│       CrossJoinExec       │                                                                                                                                                                                                                                        
+06)│                           │                                                                                                                                                                                                                                        
+07)│                           ├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              
+08)│                           │                                                                                                                                                                                                                         │              
+09)│                           │                                                                                                                                                                                                                         │              
+10)└─────────────┬─────────────┘                                                                                                                                                                                                                         │              
+11)┌─────────────┴─────────────┐                                                                                                                                                                                                           ┌─────────────┴─────────────┐
+12)│       CrossJoinExec       │                                                                                                                                                                                                           │       DataSourceExec      │
+13)│                           │                                                                                                                                                                                                           │    --------------------   │
+14)│                           ├────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+15)│                           │                                                                                                                                                                                            │              │       format: memory      │
+16)│                           │                                                                                                                                                                                            │              │          rows: 0          │
+17)└─────────────┬─────────────┘                                                                                                                                                                                            │              └───────────────────────────┘
+18)┌─────────────┴─────────────┐                                                                                                                                                                              ┌─────────────┴─────────────┐
+19)│       CrossJoinExec       │                                                                                                                                                                              │       DataSourceExec      │
+20)│                           │                                                                                                                                                                              │    --------------------   │
+21)│                           ├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+22)│                           │                                                                                                                                                               │              │       format: memory      │
+23)│                           │                                                                                                                                                               │              │          rows: 0          │
+24)└─────────────┬─────────────┘                                                                                                                                                               │              └───────────────────────────┘
+25)┌─────────────┴─────────────┐                                                                                                                                                 ┌─────────────┴─────────────┐
+26)│       CrossJoinExec       │                                                                                                                                                 │       DataSourceExec      │
+27)│                           │                                                                                                                                                 │    --------------------   │
+28)│                           ├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+29)│                           │                                                                                                                                  │              │       format: memory      │
+30)│                           │                                                                                                                                  │              │          rows: 0          │
+31)└─────────────┬─────────────┘                                                                                                                                  │              └───────────────────────────┘
+32)┌─────────────┴─────────────┐                                                                                                                    ┌─────────────┴─────────────┐
+33)│       CrossJoinExec       │                                                                                                                    │       DataSourceExec      │
+34)│                           │                                                                                                                    │    --------------------   │
+35)│                           ├─────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+36)│                           │                                                                                                     │              │       format: memory      │
+37)│                           │                                                                                                     │              │          rows: 0          │
+38)└─────────────┬─────────────┘                                                                                                     │              └───────────────────────────┘
+39)┌─────────────┴─────────────┐                                                                                       ┌─────────────┴─────────────┐
+40)│       CrossJoinExec       │                                                                                       │       DataSourceExec      │
+41)│                           │                                                                                       │    --------------------   │
+42)│                           ├────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+43)│                           │                                                                        │              │       format: memory      │
+44)│                           │                                                                        │              │          rows: 0          │
+45)└─────────────┬─────────────┘                                                                        │              └───────────────────────────┘
+46)┌─────────────┴─────────────┐                                                          ┌─────────────┴─────────────┐
+47)│       CrossJoinExec       │                                                          │       DataSourceExec      │
+48)│                           │                                                          │    --------------------   │
+49)│                           ├───────────────────────────────────────────┐              │          bytes: 0         │
+50)│                           │                                           │              │       format: memory      │
+51)│                           │                                           │              │          rows: 0          │
+52)└─────────────┬─────────────┘                                           │              └───────────────────────────┘
+53)┌─────────────┴─────────────┐                             ┌─────────────┴─────────────┐
+54)│       CrossJoinExec       │                             │       DataSourceExec      │
+55)│                           │                             │    --------------------   │
+56)│                           ├──────────────┐              │          bytes: 0         │
+57)│                           │              │              │       format: memory      │
+58)│                           │              │              │          rows: 0          │
+59)└─────────────┬─────────────┘              │              └───────────────────────────┘
+60)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+61)│       DataSourceExec      ││       DataSourceExec      │
+62)│    --------------------   ││    --------------------   │
+63)│          bytes: 0         ││          bytes: 0         │
+64)│       format: memory      ││       format: memory      │
+65)│          rows: 0          ││          rows: 0          │
+66)└───────────────────────────┘└───────────────────────────┘
+
+# Setting the tree_maximum_render_size to 0 will allow the entire plan to be rendered
+statement ok
+SET datafusion.explain.tree_maximum_render_width = 0
+
+query TT
+EXPLAIN SELECT * FROM t t1, t t2, t t3, t t4, t t5, t t6, t t7, t t8, t t9, t t10 
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│       CrossJoinExec       ├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+03)└─────────────┬─────────────┘                                                                                                                                                                                                                                                      │
+04)┌─────────────┴─────────────┐                                                                                                                                                                                                                                        ┌─────────────┴─────────────┐
+05)│       CrossJoinExec       │                                                                                                                                                                                                                                        │       DataSourceExec      │
+06)│                           │                                                                                                                                                                                                                                        │    --------------------   │
+07)│                           ├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+08)│                           │                                                                                                                                                                                                                         │              │       format: memory      │
+09)│                           │                                                                                                                                                                                                                         │              │          rows: 0          │
+10)└─────────────┬─────────────┘                                                                                                                                                                                                                         │              └───────────────────────────┘
+11)┌─────────────┴─────────────┐                                                                                                                                                                                                           ┌─────────────┴─────────────┐
+12)│       CrossJoinExec       │                                                                                                                                                                                                           │       DataSourceExec      │
+13)│                           │                                                                                                                                                                                                           │    --------------------   │
+14)│                           ├────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+15)│                           │                                                                                                                                                                                            │              │       format: memory      │
+16)│                           │                                                                                                                                                                                            │              │          rows: 0          │
+17)└─────────────┬─────────────┘                                                                                                                                                                                            │              └───────────────────────────┘
+18)┌─────────────┴─────────────┐                                                                                                                                                                              ┌─────────────┴─────────────┐
+19)│       CrossJoinExec       │                                                                                                                                                                              │       DataSourceExec      │
+20)│                           │                                                                                                                                                                              │    --------------------   │
+21)│                           ├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+22)│                           │                                                                                                                                                               │              │       format: memory      │
+23)│                           │                                                                                                                                                               │              │          rows: 0          │
+24)└─────────────┬─────────────┘                                                                                                                                                               │              └───────────────────────────┘
+25)┌─────────────┴─────────────┐                                                                                                                                                 ┌─────────────┴─────────────┐
+26)│       CrossJoinExec       │                                                                                                                                                 │       DataSourceExec      │
+27)│                           │                                                                                                                                                 │    --------------------   │
+28)│                           ├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+29)│                           │                                                                                                                                  │              │       format: memory      │
+30)│                           │                                                                                                                                  │              │          rows: 0          │
+31)└─────────────┬─────────────┘                                                                                                                                  │              └───────────────────────────┘
+32)┌─────────────┴─────────────┐                                                                                                                    ┌─────────────┴─────────────┐
+33)│       CrossJoinExec       │                                                                                                                    │       DataSourceExec      │
+34)│                           │                                                                                                                    │    --------------------   │
+35)│                           ├─────────────────────────────────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+36)│                           │                                                                                                     │              │       format: memory      │
+37)│                           │                                                                                                     │              │          rows: 0          │
+38)└─────────────┬─────────────┘                                                                                                     │              └───────────────────────────┘
+39)┌─────────────┴─────────────┐                                                                                       ┌─────────────┴─────────────┐
+40)│       CrossJoinExec       │                                                                                       │       DataSourceExec      │
+41)│                           │                                                                                       │    --------------------   │
+42)│                           ├────────────────────────────────────────────────────────────────────────┐              │          bytes: 0         │
+43)│                           │                                                                        │              │       format: memory      │
+44)│                           │                                                                        │              │          rows: 0          │
+45)└─────────────┬─────────────┘                                                                        │              └───────────────────────────┘
+46)┌─────────────┴─────────────┐                                                          ┌─────────────┴─────────────┐
+47)│       CrossJoinExec       │                                                          │       DataSourceExec      │
+48)│                           │                                                          │    --------------------   │
+49)│                           ├───────────────────────────────────────────┐              │          bytes: 0         │
+50)│                           │                                           │              │       format: memory      │
+51)│                           │                                           │              │          rows: 0          │
+52)└─────────────┬─────────────┘                                           │              └───────────────────────────┘
+53)┌─────────────┴─────────────┐                             ┌─────────────┴─────────────┐
+54)│       CrossJoinExec       │                             │       DataSourceExec      │
+55)│                           │                             │    --------------------   │
+56)│                           ├──────────────┐              │          bytes: 0         │
+57)│                           │              │              │       format: memory      │
+58)│                           │              │              │          rows: 0          │
+59)└─────────────┬─────────────┘              │              └───────────────────────────┘
+60)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+61)│       DataSourceExec      ││       DataSourceExec      │
+62)│    --------------------   ││    --------------------   │
+63)│          bytes: 0         ││          bytes: 0         │
+64)│       format: memory      ││       format: memory      │
+65)│          rows: 0          ││          rows: 0          │
+66)└───────────────────────────┘└───────────────────────────┘
+
+# Setting the tree_maximum_render_size to a smaller size
+statement ok
+SET datafusion.explain.tree_maximum_render_width = 60
+
+query TT
+EXPLAIN SELECT * FROM t t1, t t2, t t3, t t4, t t5, t t6, t t7, t t8, t t9, t t10 
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│       CrossJoinExec       ├──────────────────────────────────────────────────────────        
+03)└─────────────┬─────────────┘
+04)┌─────────────┴─────────────┐
+05)│       CrossJoinExec       │
+06)│                           │
+07)│                           ├──────────────────────────────────────────────────────────        
+08)│                           │
+09)│                           │
+10)└─────────────┬─────────────┘
+11)┌─────────────┴─────────────┐
+12)│       CrossJoinExec       │
+13)│                           │
+14)│                           ├──────────────────────────────────────────────────────────        
+15)│                           │
+16)│                           │
+17)└─────────────┬─────────────┘
+18)┌─────────────┴─────────────┐
+19)│       CrossJoinExec       │
+20)│                           │
+21)│                           ├──────────────────────────────────────────────────────────        
+22)│                           │
+23)│                           │
+24)└─────────────┬─────────────┘
+25)┌─────────────┴─────────────┐
+26)│       CrossJoinExec       │
+27)│                           │
+28)│                           ├──────────────────────────────────────────────────────────        
+29)│                           │
+30)│                           │
+31)└─────────────┬─────────────┘
+32)┌─────────────┴─────────────┐
+33)│       CrossJoinExec       │
+34)│                           │
+35)│                           ├──────────────────────────────────────────────────────────        
+36)│                           │
+37)│                           │
+38)└─────────────┬─────────────┘
+39)┌─────────────┴─────────────┐
+40)│       CrossJoinExec       │
+41)│                           │
+42)│                           ├──────────────────────────────────────────────────────────        
+43)│                           │
+44)│                           │
+45)└─────────────┬─────────────┘
+46)┌─────────────┴─────────────┐
+47)│       CrossJoinExec       │
+48)│                           │
+49)│                           ├───────────────────────────────────────────┐
+50)│                           │                                           │
+51)│                           │                                           │
+52)└─────────────┬─────────────┘                                           │
+53)┌─────────────┴─────────────┐                             ┌─────────────┴─────────────┐        
+54)│       CrossJoinExec       │                             │       DataSourceExec      │        
+55)│                           │                             │    --------------------   │        
+56)│                           ├──────────────┐              │          bytes: 0         │        
+57)│                           │              │              │       format: memory      │        
+58)│                           │              │              │          rows: 0          │        
+59)└─────────────┬─────────────┘              │              └───────────────────────────┘        
+60)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+61)│       DataSourceExec      ││       DataSourceExec      │
+62)│    --------------------   ││    --------------------   │
+63)│          bytes: 0         ││          bytes: 0         │
+64)│       format: memory      ││       format: memory      │
+65)│          rows: 0          ││          rows: 0          │
+66)└───────────────────────────┘└───────────────────────────┘
+
+statement ok
+DROP TABLE t
diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt
index e4d0b72338569..a6341bc686f74 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -22,7 +22,7 @@ SELECT true, false, false = false, true = false
 true false true false
 
 # test_mathematical_expressions_with_null
-query RRRRRRRRRRRRRRRRRRRRRRRRIIIRRRRRRBB
+query RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRBB
 SELECT
     sqrt(NULL),
     cbrt(NULL),
@@ -60,7 +60,7 @@ SELECT
     isnan(NULL),
     iszero(NULL)
 ----
-NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
+NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL
 
 # test_array_cast_invalid_timezone_will_panic
 statement error Parser error: Invalid timezone "Foo": failed to parse timezone
@@ -424,12 +424,24 @@ SELECT chr(CAST(NULL AS int))
 ----
 NULL
 
-statement error DataFusion error: Execution error: null character not permitted.
+query T
 SELECT chr(CAST(0 AS int))
+----
+\0
 
-statement error DataFusion error: Execution error: requested character too large for encoding.
+statement error DataFusion error: Execution error: invalid Unicode scalar value: 9223372036854775807
 SELECT chr(CAST(9223372036854775807 AS bigint))
 
+statement error DataFusion error: Execution error: invalid Unicode scalar value: 1114112
+SELECT chr(CAST(1114112 AS bigint))
+
+statement error DataFusion error: Execution error: invalid Unicode scalar value: -1
+SELECT chr(CAST(-1 AS bigint))
+
+# surrogate code point (invalid scalar value)
+statement error DataFusion error: Execution error: invalid Unicode scalar value: 55297
+SELECT chr(CAST(55297 AS bigint))
+
 query T
 SELECT concat('a','b','c')
 ----
@@ -492,6 +504,25 @@ abc
 statement ok
 drop table foo
 
+# concat_ws with a Utf8View column as separator
+statement ok
+create table test_concat_ws_sep (sep varchar, val1 varchar, val2 varchar) as values (',', 'foo', 'bar'), ('|', 'a', 'b');
+
+query T
+SELECT concat_ws(arrow_cast(sep, 'Utf8View'), val1, val2) FROM test_concat_ws_sep ORDER BY val1
+----
+a|b
+foo,bar
+
+query T
+SELECT concat_ws(arrow_cast(sep, 'LargeUtf8'), val1, val2) FROM test_concat_ws_sep ORDER BY val1
+----
+a|b
+foo,bar
+
+statement ok
+drop table test_concat_ws_sep
+
 query T
 SELECT initcap('')
 ----
@@ -587,7 +618,7 @@ select repeat('-1.2', arrow_cast(3, 'Int32'));
 ----
 -1.2-1.2-1.2
 
-query error DataFusion error: Error during planning: Internal error: Expect TypeSignatureClass::Native\(LogicalType\(Native\(Int64\), Int64\)\) but received NativeType::Float64, DataType: Float64
+query error DataFusion error: Error during planning: Function 'repeat' requires Int64, but received Float64 \(DataType: Float64\)
 select repeat('-1.2', 3.2);
 
 query T
@@ -670,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
 ----
 (empty)
 
+query T
+SELECT split_part('a,b', '', 1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', -1)
+----
+a,b
+
+query T
+SELECT split_part('a,b', '', 2)
+----
+(empty)
+
+query T
+SELECT split_part('a,b', '', -2)
+----
+(empty)
+
 statement error DataFusion error: Execution error: field position must not be zero
 SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
 
@@ -698,6 +749,11 @@ SELECT to_hex(2147483647)
 ----
 7fffffff
 
+query T
+SELECT to_hex(CAST(2147483647 as BIGINT UNSIGNED))
+----
+7fffffff
+
 query T
 SELECT to_hex(9223372036854775807)
 ----
@@ -708,6 +764,27 @@ SELECT to_hex(CAST(NULL AS int))
 ----
 NULL
 
+query T
+SELECT to_hex(0)
+----
+0
+
+# negative values (two's complement encoding)
+query T
+SELECT to_hex(-1)
+----
+ffffffffffffffff
+
+query T
+SELECT to_hex(CAST(-1 AS INT))
+----
+ffffffffffffffff
+
+query T
+SELECT to_hex(CAST(255 AS TINYINT UNSIGNED))
+----
+ff
+
 query T
 SELECT trim(' tom ')
 ----
@@ -1059,6 +1136,213 @@ SELECT '2' NOT IN ('a','b',NULL,1)
 ----
 NULL
 
+# ========================================================================
+# Comprehensive IN LIST tests with NULL handling
+# These tests validate SQL three-valued logic for IN operations
+# ========================================================================
+
+# test_in_list_null_literals
+# NULL IN (any_list) should always return NULL per SQL three-valued logic
+
+query B
+SELECT NULL IN (1, 1)
+----
+NULL
+
+query B
+SELECT NULL IN (NULL, 1)
+----
+NULL
+
+query B
+SELECT NULL IN (NULL, NULL)
+----
+NULL
+
+# test_in_list_with_columns
+# Create test table for column-based IN LIST tests
+
+statement ok
+CREATE OR REPLACE TABLE in_list_test(b INT) AS VALUES (1), (2), (3), (4), (NULL);
+
+# Test: b IN (1, 2) with various values
+
+query B
+SELECT b IN (1, 2) FROM in_list_test WHERE b = 1;
+----
+true
+
+query IB
+SELECT b, b IN (1, 2) FROM in_list_test WHERE b IN (1, 2) ORDER BY b;
+----
+1 true
+2 true
+
+query IB
+SELECT b, b IN (1, 2) FROM in_list_test WHERE b IN (3, 4) ORDER BY b;
+----
+3 false
+4 false
+
+query B
+SELECT b IN (1, 2) FROM in_list_test WHERE b = 1;
+----
+true
+
+query B
+SELECT b IN (1, 2) FROM in_list_test WHERE b = 3;
+----
+false
+
+query B
+SELECT b IN (1, 2) FROM in_list_test WHERE b IS NULL;
+----
+NULL
+
+# Test: b IN (NULL, 1) - list contains NULL
+
+query B
+SELECT b IN (NULL, 1) FROM in_list_test WHERE b = 1;
+----
+true
+
+query B
+SELECT b IN (NULL, 1) FROM in_list_test WHERE b = 2;
+----
+NULL
+
+query B
+SELECT b IN (NULL, 1) FROM in_list_test WHERE b IS NULL;
+----
+NULL
+
+# Test: b IN (NULL, NULL) - list contains only NULLs
+
+query B
+SELECT b IN (NULL, NULL) FROM in_list_test WHERE b = 1;
+----
+NULL
+
+query B
+SELECT b IN (NULL, NULL) FROM in_list_test WHERE b IS NULL;
+----
+NULL
+
+# Test: literal IN (list_with_column) - column appears in the list
+
+statement ok
+CREATE OR REPLACE TABLE in_list_col_test(b INT) AS VALUES (1), (3), (NULL);
+
+query B
+SELECT 1 IN (2, b) FROM in_list_col_test WHERE b = 1;
+----
+true
+
+query B
+SELECT 1 IN (2, b) FROM in_list_col_test WHERE b = 3;
+----
+false
+
+query B
+SELECT 1 IN (2, b) FROM in_list_col_test WHERE b IS NULL;
+----
+NULL
+
+# Test: b IN (1, b) - column references itself in list
+
+query B
+SELECT b IN (1, b) FROM in_list_col_test WHERE b = 1;
+----
+true
+
+query B
+SELECT b IN (1, b) FROM in_list_col_test WHERE b = 3;
+----
+true
+
+query B
+SELECT b IN (1, b) FROM in_list_col_test WHERE b IS NULL;
+----
+NULL
+
+# test_in_list_tuples
+# Test tuple/row-wise IN comparisons using struct syntax
+# Note: Using arrow_cast for precise type control
+
+# (NULL, NULL) IN ((1, 2)) => FALSE
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(1, 2))
+----
+false
+
+# (NULL, NULL) IN ((NULL, 1)) => FALSE
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(arrow_cast(NULL, 'Int32'), 1))
+----
+false
+
+# (NULL, NULL) IN ((NULL, NULL)) => TRUE (exact match)
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')) IN (struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')))
+----
+true
+
+# (NULL, 1) IN ((1, 2)) => FALSE
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(1, 2))
+----
+false
+
+# (NULL, 1) IN ((NULL, 1)) => TRUE (exact match)
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(arrow_cast(NULL, 'Int32'), 1))
+----
+true
+
+# (NULL, 1) IN ((NULL, NULL)) => FALSE
+query B
+SELECT struct(arrow_cast(NULL, 'Int32'), 1) IN (struct(arrow_cast(NULL, 'Int32'), arrow_cast(NULL, 'Int32')))
+----
+false
+
+# (1, 2) IN ((1, 2)) => TRUE
+query B
+SELECT struct(1, 2) IN (struct(1, 2))
+----
+true
+
+# (1, 3) IN ((1, 2)) => FALSE
+query B
+SELECT struct(1, 3) IN (struct(1, 2))
+----
+false
+
+# (4, 4) IN ((1, 2)) => FALSE
+query B
+SELECT struct(4, 4) IN (struct(1, 2))
+----
+false
+
+# (1, 1) IN ((NULL, 1)) => FALSE
+query B
+SELECT struct(1, 1) IN (struct(NULL, 1))
+----
+false
+
+# (1, 1) IN ((NULL, NULL)) => FALSE
+query B
+SELECT struct(1, 1) IN (struct(NULL, NULL))
+----
+false
+
+# Cleanup test tables
+
+statement ok
+DROP TABLE in_list_test;
+
+statement ok
+DROP TABLE in_list_col_test;
+
 query T
 SELECT encode('tom','base64');
 ----
@@ -1184,6 +1468,11 @@ SELECT md5('tom');
 ----
 34b7da764b21d298ef307d04d8152dc5
 
+query T
+SELECT md5(arrow_cast('tom', 'Dictionary(Int32, Utf8)'));
+----
+34b7da764b21d298ef307d04d8152dc5
+
 query ?
 SELECT digest('tom','md5');
 ----
@@ -2072,9 +2361,6 @@ host1 1.1 101
 host2 2.2 202
 host3 3.3 303
 
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
-
 statement ok
 create table t (a float) as values (1), (2), (3);
 
@@ -2094,9 +2380,6 @@ physical_plan
 statement ok
 drop table t;
 
-statement ok
-set datafusion.sql_parser.dialect = 'Generic';
-
 # test between expression with null
 query I
 select 1 where null between null and null;
@@ -2126,3 +2409,26 @@ query T
 select E'foo\t\tbar';
 ----
 foo		bar
+
+statement ok
+create table t (a float) as values (1), (null), (3);
+
+# https://github.com/apache/datafusion/issues/17055
+# is not null did not correctly infer as boolean in udf argument position
+query B
+select greatest(a is not null, false) from t;
+----
+true
+false
+true
+
+# same for is null
+query B
+select greatest(a is null, false) from t;
+----
+false
+true
+false
+
+statement ok
+drop table t;
diff --git a/datafusion/sqllogictest/test_files/filter_without_sort_exec.slt b/datafusion/sqllogictest/test_files/filter_without_sort_exec.slt
index a09d8ce26ddfb..633e19f7915db 100644
--- a/datafusion/sqllogictest/test_files/filter_without_sort_exec.slt
+++ b/datafusion/sqllogictest/test_files/filter_without_sort_exec.slt
@@ -38,10 +38,9 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)SortPreservingMergeExec: [date@0 ASC NULLS LAST, time@2 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: ticker@1 = A
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: ticker@1 = A
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
 
 # constant ticker, CAST(time AS DATE) = time, order by time
 query TT
@@ -55,10 +54,9 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)SortPreservingMergeExec: [time@2 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
 
 # same thing but order by date
 query TT
@@ -72,10 +70,9 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)SortPreservingMergeExec: [date@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
 
 # same thing but order by ticker
 query TT
@@ -89,10 +86,9 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)CoalescePartitionsExec
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
 
 # same thing but order by time, date
 query TT
@@ -106,10 +102,9 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)SortPreservingMergeExec: [time@2 ASC NULLS LAST, date@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: ticker@1 = A AND CAST(time@2 AS Date32) = date@0
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
 
 # CAST(time AS DATE) <> date (should require a sort)
 # no physical plan due to sort breaking pipeline
@@ -147,7 +142,50 @@ logical_plan
 03)----TableScan: data projection=[date, ticker, time]
 physical_plan
 01)SortPreservingMergeExec: [ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: date@0 = 2006-01-02
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+02)--FilterExec: date@0 = 2006-01-02
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------StreamingTableExec: partition_sizes=1, projection=[date, ticker, time], infinite_source=true, output_ordering=[date@0 ASC NULLS LAST, ticker@1 ASC NULLS LAST, time@2 ASC NULLS LAST]
+
+# schema adaptation cast should preserve ordering (regression test for cast properties)
+statement ok
+COPY (
+  SELECT arrow_cast(column1, 'Int32') AS b
+  FROM (VALUES (1), (2), (3), (4))
+) TO 'test_files/scratch/filter_without_sort_exec/cast_ordering.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE cast_ordered (
+    b BIGINT
+)
+STORED AS PARQUET
+WITH ORDER (b)
+LOCATION 'test_files/scratch/filter_without_sort_exec/';
+
+statement ok
+CREATE EXTERNAL TABLE cast_physical
+STORED AS PARQUET
+LOCATION 'test_files/scratch/filter_without_sort_exec/';
+
+query T
+SELECT DISTINCT arrow_typeof(b) FROM cast_physical;
+----
+Int32
+
+query T
+SELECT DISTINCT arrow_typeof(b) FROM cast_ordered;
+----
+Int64
+
+query TT
+EXPLAIN SELECT b FROM cast_ordered WHERE b > 1 ORDER BY b;
+----
+logical_plan
+01)Sort: cast_ordered.b ASC NULLS LAST
+02)--Filter: cast_ordered.b > Int64(1)
+03)----TableScan: cast_ordered projection=[b], partial_filters=[cast_ordered.b > Int64(1)]
+physical_plan
+01)SortPreservingMergeExec: [b@0 ASC NULLS LAST]
+02)--FilterExec: b@0 > 1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/filter_without_sort_exec/cast_ordering.parquet]]}, projection=[b], output_ordering=[b@0 ASC NULLS LAST], file_type=parquet, predicate=b@0 > 1, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 1, required_guarantees=[]
diff --git a/datafusion/sqllogictest/test_files/float16.slt b/datafusion/sqllogictest/test_files/float16.slt
index 5e59c730f0787..699eb81844a40 100644
--- a/datafusion/sqllogictest/test_files/float16.slt
+++ b/datafusion/sqllogictest/test_files/float16.slt
@@ -51,13 +51,14 @@ NULL NULL NULL NULL NULL NULL
 NaN NaN NaN NaN NaN NaN
 
 # Try coercing with literal NULL
-query error
+query R
 select column1 + NULL from float16s;
 ----
-DataFusion error: type_coercion
-caused by
-Error during planning: Cannot automatically convert Null to Float16
-
+NULL
+NULL
+NULL
+NULL
+NULL
 
 # Test coercions with equality
 query BBBBBB
@@ -78,11 +79,14 @@ false false false false false false
 
 
 # Try coercing with literal NULL
-query error
+query B
 select column1 = NULL from float16s;
 ----
-DataFusion error: Error during planning: Cannot infer common argument type for comparison operation Float16 = Null
-
+NULL
+NULL
+NULL
+NULL
+NULL
 
 # Cleanup
 statement ok
diff --git a/datafusion/sqllogictest/test_files/floor_preimage.slt b/datafusion/sqllogictest/test_files/floor_preimage.slt
new file mode 100644
index 0000000000000..93302b3d7a2f6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/floor_preimage.slt
@@ -0,0 +1,308 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## Floor Preimage Tests
+##
+## Tests for floor function preimage optimization:
+## floor(col) = N transforms to col >= N AND col < N + 1
+##
+## Uses representative types only (Float64, Int32, Decimal128).
+## Unit tests cover all type variants.
+##########
+
+# Setup: Single table with representative types
+statement ok
+CREATE TABLE test_data (
+    id INT,
+    float_val DOUBLE,
+    int_val INT,
+    decimal_val DECIMAL(10,2)
+) AS VALUES
+    (1, 5.3,   100, 100.00),
+    (2, 5.7,   101, 100.50),
+    (3, 6.0,   102, 101.00),
+    (4, 6.5,   -5,  101.99),
+    (5, 7.0,   0,   102.00),
+    (6, NULL,  NULL, NULL);
+
+##########
+## Data Correctness Tests
+##########
+
+# Float64: floor(x) = 5 matches values in [5.0, 6.0)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) = arrow_cast(5, 'Float64');
+----
+1
+2
+
+# Int32: floor(x) = 100 matches values in [100, 101)
+query I rowsort
+SELECT id FROM test_data WHERE floor(int_val) = 100;
+----
+1
+
+# Decimal128: floor(x) = 100 matches values in [100.00, 101.00)
+query I rowsort
+SELECT id FROM test_data WHERE floor(decimal_val) = arrow_cast(100, 'Decimal128(10,2)');
+----
+1
+2
+
+# Negative value: floor(x) = -5 matches values in [-5, -4)
+query I rowsort
+SELECT id FROM test_data WHERE floor(int_val) = -5;
+----
+4
+
+# Zero value: floor(x) = 0 matches values in [0, 1)
+query I rowsort
+SELECT id FROM test_data WHERE floor(int_val) = 0;
+----
+5
+
+# Column on RHS (same result as LHS)
+query I rowsort
+SELECT id FROM test_data WHERE arrow_cast(5, 'Float64') = floor(float_val);
+----
+1
+2
+
+# IS NOT DISTINCT FROM (excludes NULLs)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) IS NOT DISTINCT FROM arrow_cast(5, 'Float64');
+----
+1
+2
+
+# IS DISTINCT FROM (includes NULLs)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) IS DISTINCT FROM arrow_cast(5, 'Float64');
+----
+3
+4
+5
+6
+
+# Non-integer literal (empty result - floor returns integers)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) = arrow_cast(5.5, 'Float64');
+----
+
+# IN list: floor(x) IN (5, 7) matches [5.0, 6.0) and [7.0, 8.0)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) IN (arrow_cast(5, 'Float64'), arrow_cast(7, 'Float64'));
+----
+1
+2
+5
+
+# NOT IN list: floor(x) NOT IN (5, 7) excludes matching ranges and NULLs
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) NOT IN (arrow_cast(5, 'Float64'), arrow_cast(7, 'Float64'));
+----
+3
+4
+
+##########
+## EXPLAIN Tests - Plan Optimization
+##########
+
+statement ok
+set datafusion.explain.logical_plan_only = true;
+
+# 1. Basic: Float64 - floor(col) = N transforms to col >= N AND col < N+1
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) = arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val >= Float64(5) AND test_data.float_val < Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 2. Basic: Int32 - transformed (coerced to Float64)
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(int_val) = 100;
+----
+logical_plan
+01)Projection: test_data.id, test_data.float_val, test_data.int_val, test_data.decimal_val
+02)--Filter: __common_expr_3 >= Float64(100) AND __common_expr_3 < Float64(101)
+03)----Projection: CAST(test_data.int_val AS Float64) AS __common_expr_3, test_data.id, test_data.float_val, test_data.int_val, test_data.decimal_val
+04)------TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 3. Basic: Decimal128 - same transformation
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(decimal_val) = arrow_cast(100, 'Decimal128(10,2)');
+----
+logical_plan
+01)Filter: test_data.decimal_val >= Decimal128(Some(10000),10,2) AND test_data.decimal_val < Decimal128(Some(10100),10,2)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 4. Column on RHS - same transformation
+query TT
+EXPLAIN SELECT * FROM test_data WHERE arrow_cast(5, 'Float64') = floor(float_val);
+----
+logical_plan
+01)Filter: test_data.float_val >= Float64(5) AND test_data.float_val < Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 5. IS NOT DISTINCT FROM - adds IS NOT NULL
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) IS NOT DISTINCT FROM arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val IS NOT NULL AND test_data.float_val >= Float64(5) AND test_data.float_val < Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 6. IS DISTINCT FROM - includes NULL check
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) IS DISTINCT FROM arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val < Float64(5) OR test_data.float_val >= Float64(6) OR test_data.float_val IS NULL
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 7. Non-optimizable: non-integer literal (original predicate preserved)
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) = arrow_cast(5.5, 'Float64');
+----
+logical_plan
+01)Filter: floor(test_data.float_val) = Float64(5.5)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 8. Non-optimizable: extreme float literal (2^53) where n+1 loses precision, so preimage returns None
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) = 9007199254740992;
+----
+logical_plan
+01)Filter: floor(test_data.float_val) = Float64(9007199254740992)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# 9. IN list: each list item is rewritten with preimage and OR-ed together
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) IN (arrow_cast(5, 'Float64'), arrow_cast(7, 'Float64'));
+----
+logical_plan
+01)Filter: test_data.float_val >= Float64(5) AND test_data.float_val < Float64(6) OR test_data.float_val >= Float64(7) AND test_data.float_val < Float64(8)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# Data correctness: floor(col) = 2^53 returns no rows (no value in test_data has floor exactly 2^53)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) = 9007199254740992;
+----
+
+##########
+## Other Comparison Operators
+##
+## The preimage framework automatically handles all comparison operators:
+##   floor(x) <> N  ->  x < N OR x >= N+1
+##   floor(x) > N   ->  x >= N+1
+##   floor(x) < N   ->  x < N
+##   floor(x) >= N  ->  x >= N
+##   floor(x) <= N  ->  x < N+1
+##########
+
+# Data correctness tests for other operators
+
+# Not equals: floor(x) <> 5 matches values outside [5.0, 6.0)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) <> arrow_cast(5, 'Float64');
+----
+3
+4
+5
+
+# Greater than: floor(x) > 5 matches values in [6.0, inf)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) > arrow_cast(5, 'Float64');
+----
+3
+4
+5
+
+# Less than: floor(x) < 6 matches values in (-inf, 6.0)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) < arrow_cast(6, 'Float64');
+----
+1
+2
+
+# Greater than or equal: floor(x) >= 5 matches values in [5.0, inf)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) >= arrow_cast(5, 'Float64');
+----
+1
+2
+3
+4
+5
+
+# Less than or equal: floor(x) <= 5 matches values in (-inf, 6.0)
+query I rowsort
+SELECT id FROM test_data WHERE floor(float_val) <= arrow_cast(5, 'Float64');
+----
+1
+2
+
+# EXPLAIN tests showing optimized transformations
+
+# Not equals: floor(x) <> 5 -> x < 5 OR x >= 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) <> arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val < Float64(5) OR test_data.float_val >= Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# Greater than: floor(x) > 5 -> x >= 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) > arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val >= Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# Less than: floor(x) < 6 -> x < 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) < arrow_cast(6, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val < Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# Greater than or equal: floor(x) >= 5 -> x >= 5
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) >= arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val >= Float64(5)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+# Less than or equal: floor(x) <= 5 -> x < 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE floor(float_val) <= arrow_cast(5, 'Float64');
+----
+logical_plan
+01)Filter: test_data.float_val < Float64(6)
+02)--TableScan: test_data projection=[id, float_val, int_val, decimal_val]
+
+##########
+## Cleanup
+##########
+
+statement ok
+DROP TABLE test_data;
diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt
index 20f79622a62c6..ee11dc973bbd7 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -193,10 +193,25 @@ SELECT substr('alphabet', 3, CAST(NULL AS int))
 ----
 NULL
 
-statement error The first argument of the substr function can only be a string, but got Int64
+query T
+SELECT substr(NULL, 1, 2)
+----
+NULL
+
+query T
+SELECT substr('alphabet', 1, NULL)
+----
+NULL
+
+query T
+SELECT substr('alphabet', NULL, 2)
+----
+NULL
+
+statement error Function 'substr' failed to match any signature
 SELECT substr(1, 3)
 
-statement error The first argument of the substr function can only be a string, but got Int64
+statement error Function 'substr' failed to match any signature
 SELECT substr(1, 3, 4)
 
 query T
@@ -224,6 +239,11 @@ SELECT translate('12345', '143', NULL)
 ----
 NULL
 
+query T
+SELECT translate(arrow_cast('12345', 'LargeUtf8'), '143', 'ax')
+----
+a2x5
+
 statement ok
 CREATE TABLE test(
   c1 VARCHAR
@@ -415,6 +435,11 @@ SELECT upper(arrow_cast('foo', 'Dictionary(Int32, Utf8)'))
 ----
 FOO
 
+query T
+SELECT upper(arrow_cast(arrow_cast('foo', 'Dictionary(Int32, Utf8)'), 'Dictionary(Int32, Utf8View)'))
+----
+FOO
+
 query T
 SELECT upper('árvore ação αβγ')
 ----
@@ -425,6 +450,26 @@ SELECT upper(arrow_cast('árvore ação αβγ', 'Dictionary(Int32, Utf8)'))
 ----
 ÁRVORE AÇÃO ΑΒΓ
 
+query T
+SELECT arrow_typeof(upper('foo'))
+----
+Utf8
+
+query T
+SELECT arrow_typeof(upper(arrow_cast('foo', 'LargeUtf8')))
+----
+LargeUtf8
+
+query T
+SELECT arrow_typeof(upper(arrow_cast('foo', 'Utf8View')))
+----
+Utf8View
+
+query T
+SELECT arrow_typeof(upper(arrow_cast(arrow_cast('foo', 'Dictionary(Int32, Utf8)'), 'Dictionary(Int32, Utf8View)')))
+----
+Utf8View
+
 query T
 SELECT btrim('   foo  ')
 ----
@@ -470,6 +515,11 @@ SELECT lower(arrow_cast('FOObar', 'Dictionary(Int32, Utf8)'))
 ----
 foobar
 
+query T
+SELECT lower(arrow_cast(arrow_cast('FOObar', 'Dictionary(Int32, Utf8)'), 'Dictionary(Int32, Utf8View)'))
+----
+foobar
+
 query T
 SELECT lower('ÁRVORE AÇÃO ΑΒΓ')
 ----
@@ -480,6 +530,26 @@ SELECT lower(arrow_cast('ÁRVORE AÇÃO ΑΒΓ', 'Dictionary(Int32, Utf8)'))
 ----
 árvore ação αβγ
 
+query T
+SELECT arrow_typeof(lower('FOObar'))
+----
+Utf8
+
+query T
+SELECT arrow_typeof(lower(arrow_cast('FOObar', 'LargeUtf8')))
+----
+LargeUtf8
+
+query T
+SELECT arrow_typeof(lower(arrow_cast('FOObar', 'Utf8View')))
+----
+Utf8View
+
+query T
+SELECT arrow_typeof(lower(arrow_cast(arrow_cast('FOObar', 'Dictionary(Int32, Utf8)'), 'Dictionary(Int32, Utf8View)')))
+----
+Utf8View
+
 query T
 SELECT ltrim('   foo')
 ----
@@ -521,6 +591,15 @@ SELECT trim(arrow_cast('  foo  ', 'Dictionary(Int32, Utf8)'))
 ----
 foo
 
+# Verify that trim, ltrim, and rtrim only strip spaces by default,
+# not other whitespace characters (tabs, newlines, etc.)
+query III
+SELECT length(trim(chr(9) || 'foo' || chr(10))),
+       length(ltrim(chr(9) || 'foo')),
+       length(rtrim('foo' || chr(10)))
+----
+5 4 4
+
 query I
 SELECT bit_length('foo')
 ----
@@ -858,7 +937,7 @@ SELECT greatest(-1, 1, 2.3, 123456789, 3 + 5, -(-4), abs(-9.0))
 123456789
 
 
-query error Function 'greatest' user-defined coercion failed with "Error during planning: greatest was called without any arguments. It requires at least 1."
+query error Function 'greatest' user-defined coercion failed with: Error during planning: greatest was called without any arguments. It requires at least 1.
 SELECT greatest()
 
 query I
@@ -1056,7 +1135,7 @@ SELECT least(-1, 1, 2.3, 123456789, 3 + 5, -(-4), abs(-9.0))
 -1
 
 
-query error Function 'least' user-defined coercion failed with "Error during planning: least was called without any arguments. It requires at least 1."
+query error Function 'least' user-defined coercion failed with: Error during planning: least was called without any arguments. It requires at least 1.
 SELECT least()
 
 query I
diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt
index 9e67018ecd0b9..294841552a66d 100644
--- a/datafusion/sqllogictest/test_files/group_by.slt
+++ b/datafusion/sqllogictest/test_files/group_by.slt
@@ -2017,15 +2017,13 @@ physical_plan
 02)--SortExec: expr=[col0@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[col0@0 as col0, last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]@3 as last_col1]
 04)------AggregateExec: mode=FinalPartitioned, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)]
-12)----------------------DataSourceExec: partitions=1, partition_sizes=[3]
-13)----------------------DataSourceExec: partitions=1, partition_sizes=[3]
+05)--------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1]
+09)----------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(col0@0, col0@0)]
+10)------------------DataSourceExec: partitions=1, partition_sizes=[3]
+11)------------------DataSourceExec: partitions=1, partition_sizes=[3]
 
 # Columns in the table are a,b,c,d. Source is DataSourceExec which is ordered by
 # a,b,c column. Column a has cardinality 2, column b has cardinality 4.
@@ -2506,12 +2504,16 @@ TUR [100.0, 75.0] 175
 # test_ordering_sensitive_aggregation3
 # When different aggregators have conflicting requirements, we cannot satisfy all of them in current implementation.
 # test below should raise Plan Error.
-statement error DataFusion error: This feature is not implemented: Conflicting ordering requirements in aggregate functions is not supported
+query ??? rowsort
 SELECT ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts,
     ARRAY_AGG(s.amount ORDER BY s.amount ASC) AS amounts2,
     ARRAY_AGG(s.amount ORDER BY s.sn ASC) AS amounts3
   FROM sales_global AS s
   GROUP BY s.country
+----
+[100.0, 75.0] [75.0, 100.0] [75.0, 100.0]
+[200.0, 50.0] [50.0, 200.0] [50.0, 200.0]
+[80.0, 30.0] [30.0, 80.0] [30.0, 80.0]
 
 # test_ordering_sensitive_aggregation4
 # If aggregators can work with bounded memory (Sorted or PartiallySorted mode), we should append requirement to
@@ -2940,10 +2942,9 @@ physical_plan
 02)--ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]@5 as last_rate]
 03)----AggregateExec: mode=Single, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency], aggr=[last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]]
 04)------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITIPTR rowsort
 SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate
@@ -2984,11 +2985,9 @@ physical_plan
 02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@2 as fv2]
 04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
-07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]]
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TRR
 SELECT country, FIRST_VALUE(amount ORDER BY ts ASC) AS fv1,
@@ -3020,11 +3019,9 @@ physical_plan
 02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[country@0 as country, first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST]@1 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]@2 as fv2]
 04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
-07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-08)--------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=1
+06)----------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[first_value(sales_global.amount) ORDER BY [sales_global.ts ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.ts DESC NULLS FIRST]]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 query TRR
@@ -3184,12 +3181,11 @@ physical_plan
 02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@1 as array_agg1]
 04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]
-05)--------CoalesceBatchesExec: target_batch_size=4
-06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]
-08)--------------SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[true]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]]
+07)------------SortExec: expr=[amount@1 ASC NULLS LAST], preserve_partitioning=[true]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query T?
 SELECT country, ARRAY_AGG(amount ORDER BY amount ASC) AS array_agg1
@@ -3220,12 +3216,11 @@ physical_plan
 02)--SortExec: expr=[country@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[country@0 as country, array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@1 as amounts, first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST]@2 as fv1, last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]@3 as fv2]
 04)------AggregateExec: mode=FinalPartitioned, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], first_value(sales_global.amount) ORDER BY [sales_global.amount ASC NULLS LAST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]
-05)--------CoalesceBatchesExec: target_batch_size=4
-06)----------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]
-08)--------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([country@0], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[country@0 as country], aggr=[array_agg(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST], last_value(sales_global.amount) ORDER BY [sales_global.amount DESC NULLS FIRST]]
+07)------------SortExec: expr=[amount@1 DESC], preserve_partitioning=[true]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query T?RR
 SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts,
@@ -3421,11 +3416,10 @@ physical_plan
 02)--SortExec: expr=[sn@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[sn@0 as sn, amount@1 as amount, 2 * CAST(sn@0 AS Int64) as Int64(2) * s.sn]
 04)------AggregateExec: mode=FinalPartitioned, gby=[sn@0 as sn, amount@1 as amount], aggr=[]
-05)--------CoalesceBatchesExec: target_batch_size=4
-06)----------RepartitionExec: partitioning=Hash([sn@0, amount@1], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[sn@0 as sn, amount@1 as amount], aggr=[]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([sn@0, amount@1], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[sn@0 as sn, amount@1 as amount], aggr=[]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+08)--------------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query IRI
 SELECT s.sn, s.amount, 2*s.sn
@@ -3490,13 +3484,12 @@ physical_plan
 02)--SortExec: expr=[sn@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[sn@0 as sn, sum(l.amount)@2 as sum(l.amount), amount@1 as amount]
 04)------AggregateExec: mode=FinalPartitioned, gby=[sn@0 as sn, amount@1 as amount], aggr=[sum(l.amount)]
-05)--------CoalesceBatchesExec: target_batch_size=4
-06)----------RepartitionExec: partitioning=Hash([sn@0, amount@1], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[sn@1 as sn, amount@2 as amount], aggr=[sum(l.amount)]
-08)--------------NestedLoopJoinExec: join_type=Inner, filter=sn@0 >= sn@1, projection=[amount@1, sn@2, amount@3]
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
-10)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-11)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([sn@0, amount@1], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[sn@1 as sn, amount@2 as amount], aggr=[sum(l.amount)]
+07)------------NestedLoopJoinExec: join_type=Inner, filter=sn@0 >= sn@1, projection=[amount@1, sn@2, amount@3]
+08)--------------DataSourceExec: partitions=1, partition_sizes=[2]
+09)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+10)----------------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query IRR
 SELECT r.sn, SUM(l.amount), r.amount
@@ -3637,13 +3630,12 @@ physical_plan
 02)--SortExec: expr=[sn@2 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount]
 04)------AggregateExec: mode=FinalPartitioned, gby=[sn@0 as sn, zip_code@1 as zip_code, country@2 as country, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount], aggr=[]
-05)--------CoalesceBatchesExec: target_batch_size=4
-06)----------RepartitionExec: partitioning=Hash([sn@0, zip_code@1, country@2, ts@3, currency@4, amount@5, sum_amount@6], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount], aggr=[]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-09)----------------ProjectionExec: expr=[zip_code@0 as zip_code, country@1 as country, sn@2 as sn, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@6 as sum_amount]
-10)------------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([sn@0, zip_code@1, country@2, ts@3, currency@4, amount@5, sum_amount@6], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount], aggr=[]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+08)--------------ProjectionExec: expr=[zip_code@0 as zip_code, country@1 as country, sn@2 as sn, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@6 as sum_amount]
+09)----------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+10)------------------DataSourceExec: partitions=1, partition_sizes=[2]
 
 
 query ITIPTRR
@@ -3865,11 +3857,10 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST]@1 as first_a, last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]@2 as last_c]
 02)--AggregateExec: mode=FinalPartitioned, gby=[d@0 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], last_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST]]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([d@0], 8), input_partitions=8
-05)--------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
+03)----RepartitionExec: partitioning=Hash([d@0], 8), input_partitions=8
+04)------AggregateExec: mode=Partial, gby=[d@2 as d], aggr=[first_value(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.a ASC NULLS LAST], first_value(multiple_ordered_table.c) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query II rowsort
 SELECT FIRST_VALUE(a ORDER BY a ASC) as first_a,
@@ -3935,12 +3926,11 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]@1 as amount_usd]
 02)--AggregateExec: mode=Single, gby=[row_n@2 as row_n], aggr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]], ordering_mode=Sorted
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
-06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n]
-07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+05)------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n]
+06)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+07)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 # reset partition number to 8.
 statement ok
@@ -3977,11 +3967,10 @@ logical_plan
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[c@0 as c, b@1 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
 02)--SortExec: expr=[c@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([c@0, b@1], 8), input_partitions=8
-05)--------AggregateExec: mode=Partial, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
-06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
+03)----RepartitionExec: partitioning=Hash([c@0, b@1], 8), input_partitions=8
+04)------AggregateExec: mode=Partial, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
+05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
 
 # drop table multiple_ordered_table_with_pk
 statement ok
@@ -4018,11 +4007,10 @@ logical_plan
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[c@0 as c, b@1 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
 02)--SortExec: expr=[c@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([c@0, b@1], 8), input_partitions=8
-05)--------AggregateExec: mode=Partial, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
-06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
+03)----RepartitionExec: partitioning=Hash([c@0, b@1], 8), input_partitions=8
+04)------AggregateExec: mode=Partial, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
+05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
 
 statement ok
 set datafusion.execution.target_partitions = 1;
@@ -4061,7 +4049,7 @@ logical_plan
 05)--------TableScan: multiple_ordered_table_with_pk projection=[b, c, d]
 physical_plan
 01)ProjectionExec: expr=[c@0 as c, sum1@2 as sum1, sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as sumb]
-02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 03)----ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1]
 04)------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
@@ -4091,14 +4079,13 @@ logical_plan
 10)----------TableScan: multiple_ordered_table_with_pk projection=[b, c, d]
 physical_plan
 01)ProjectionExec: expr=[c@0 as c, c@2 as c, sum1@1 as sum1, sum1@3 as sum1]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(b@1, b@1)], projection=[c@0, sum1@2, c@3, sum1@5]
-04)------ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1]
-05)--------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
-07)------ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1]
-08)--------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
-09)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(b@1, b@1)], projection=[c@0, sum1@2, c@3, sum1@5]
+03)----ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1]
+04)------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
+06)----ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1]
+07)------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0])
+08)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT lhs.c, rhs.c, lhs.sum1, rhs.sum1
@@ -4240,11 +4227,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[sum(DISTINCT t1.x)@1 as sum(DISTINCT t1.x), max(DISTINCT t1.x)@2 as max(DISTINCT t1.x)]
 02)--AggregateExec: mode=FinalPartitioned, gby=[y@0 as y], aggr=[sum(DISTINCT t1.x), max(DISTINCT t1.x)]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([y@0], 8), input_partitions=8
-05)--------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-06)----------AggregateExec: mode=Partial, gby=[y@1 as y], aggr=[sum(DISTINCT t1.x), max(DISTINCT t1.x)]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----RepartitionExec: partitioning=Hash([y@0], 8), input_partitions=1
+04)------AggregateExec: mode=Partial, gby=[y@1 as y], aggr=[sum(DISTINCT t1.x), max(DISTINCT t1.x)]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 EXPLAIN SELECT SUM(DISTINCT CAST(x AS DOUBLE)), MAX(DISTINCT CAST(x AS DOUBLE)) FROM t1 GROUP BY y;
@@ -4257,15 +4242,12 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[sum(alias1)@1 as sum(DISTINCT t1.x), max(alias1)@2 as max(DISTINCT t1.x)]
 02)--AggregateExec: mode=FinalPartitioned, gby=[y@0 as y], aggr=[sum(alias1), max(alias1)]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([y@0], 8), input_partitions=8
-05)--------AggregateExec: mode=Partial, gby=[y@0 as y], aggr=[sum(alias1), max(alias1)]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[y@0 as y, alias1@1 as alias1], aggr=[]
-07)------------CoalesceBatchesExec: target_batch_size=2
-08)--------------RepartitionExec: partitioning=Hash([y@0, alias1@1], 8), input_partitions=8
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------AggregateExec: mode=Partial, gby=[y@1 as y, CAST(x@0 AS Float64) as alias1], aggr=[]
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----RepartitionExec: partitioning=Hash([y@0], 8), input_partitions=8
+04)------AggregateExec: mode=Partial, gby=[y@0 as y], aggr=[sum(alias1), max(alias1)]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[y@0 as y, alias1@1 as alias1], aggr=[]
+06)----------RepartitionExec: partitioning=Hash([y@0, alias1@1], 8), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[y@1 as y, CAST(x@0 AS Float64) as alias1], aggr=[]
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # create an unbounded table that contains ordered timestamp.
 statement ok
@@ -4294,11 +4276,10 @@ physical_plan
 01)SortPreservingMergeExec: [time_chunks@0 DESC], fetch=5
 02)--ProjectionExec: expr=[date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 as time_chunks]
 03)----AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 as date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)], aggr=[], ordering_mode=Sorted
-04)------CoalesceBatchesExec: target_batch_size=2
-05)--------RepartitionExec: partitioning=Hash([date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0], 8), input_partitions=8, preserve_order=true, sort_exprs=date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 DESC
-06)----------AggregateExec: mode=Partial, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 900000000000 }, ts@0) as date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)], aggr=[], ordering_mode=Sorted
-07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-08)--------------StreamingTableExec: partition_sizes=1, projection=[ts], infinite_source=true, output_ordering=[ts@0 DESC]
+04)------RepartitionExec: partitioning=Hash([date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0], 8), input_partitions=8, preserve_order=true, sort_exprs=date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 DESC
+05)--------AggregateExec: mode=Partial, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 900000000000 }, ts@0) as date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)], aggr=[], ordering_mode=Sorted
+06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+07)------------StreamingTableExec: partition_sizes=1, projection=[ts], infinite_source=true, output_ordering=[ts@0 DESC]
 
 query P
 SELECT date_bin('15 minutes', ts) as time_chunks
@@ -4348,12 +4329,11 @@ physical_plan
 01)SortPreservingMergeExec: [months@0 DESC], fetch=5
 02)--SortExec: TopK(fetch=5), expr=[months@0 DESC], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as months]
-04)------AggregateExec: mode=FinalPartitioned, gby=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------RepartitionExec: partitioning=Hash([date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[date_part(MONTH, ts@0) as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 DESC], file_type=csv, has_header=false
+04)------AggregateExec: mode=FinalPartitioned, gby=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[], lim=[5]
+05)--------RepartitionExec: partitioning=Hash([date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[date_part(MONTH, ts@0) as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[], lim=[5]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 DESC], file_type=csv, has_header=false
 
 query I
 SELECT extract(month from ts) as months
@@ -4392,7 +4372,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [name@0 DESC, time_chunks@1 DESC], fetch=5
 02)--ProjectionExec: expr=[name@0 as name, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 900000000000 }, ts@1) as time_chunks]
-03)----RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1, maintains_sort_order=true
 04)------StreamingTableExec: partition_sizes=1, projection=[name, ts], infinite_source=true, output_ordering=[name@0 DESC, ts@1 DESC]
 
 statement ok
@@ -4461,19 +4441,13 @@ physical_plan
 02)--SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[c1@0 as c1, count(alias1)@1 as count(DISTINCT aggregate_test_100.c2), min(alias1)@2 as min(DISTINCT aggregate_test_100.c2), sum(alias2)@3 as sum(aggregate_test_100.c3), max(alias3)@4 as max(aggregate_test_100.c4)]
 04)------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[count(alias1), min(alias1), sum(alias2), max(alias3)]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[count(alias1), min(alias1), sum(alias2), max(alias3)]
-08)--------------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1, alias1@1 as alias1], aggr=[alias2, alias3]
-09)----------------CoalesceBatchesExec: target_batch_size=2
-10)------------------RepartitionExec: partitioning=Hash([c1@0, alias1@1], 8), input_partitions=8
-11)--------------------AggregateExec: mode=Partial, gby=[c1@0 as c1, c2@1 as alias1], aggr=[alias2, alias3]
-12)----------------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4], file_type=csv, has_header=true
-
-# Use PostgreSQL dialect
-statement ok
-set datafusion.sql_parser.dialect = 'Postgres';
+05)--------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
+06)----------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[count(alias1), min(alias1), sum(alias2), max(alias3)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1, alias1@1 as alias1], aggr=[alias2, alias3]
+08)--------------RepartitionExec: partitioning=Hash([c1@0, alias1@1], 8), input_partitions=8
+09)----------------AggregateExec: mode=Partial, gby=[c1@0 as c1, c2@1 as alias1], aggr=[alias2, alias3]
+10)------------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4], file_type=csv, has_header=true
 
 query II
 SELECT c2, count(distinct c3) FILTER (WHERE c1 != 'a') FROM aggregate_test_100 GROUP BY c2 ORDER BY c2;
@@ -4493,10 +4467,6 @@ SELECT c2, count(distinct c3) FILTER (WHERE c1 != 'a'), count(c5) FILTER (WHERE
 4 19 18
 5 11 9
 
-# Restore the default dialect
-statement ok
-set datafusion.sql_parser.dialect = 'Generic';
-
 statement ok
 drop table aggregate_test_100;
 
@@ -4535,19 +4505,20 @@ LIMIT 5
 query ITIPTR rowsort
 SELECT r.*
 FROM sales_global_with_pk as l, sales_global_with_pk as r
+ORDER BY 1, 2, 3, 4, 5, 6
 LIMIT 5
 ----
 0 GRC 0 2022-01-01T06:00:00 EUR 30
-1 FRA 1 2022-01-01T08:00:00 EUR 50
-1 FRA 3 2022-01-02T12:00:00 EUR 200
-1 TUR 2 2022-01-01T11:30:00 TRY 75
-1 TUR 4 2022-01-03T10:00:00 TRY 100
+0 GRC 0 2022-01-01T06:00:00 EUR 30
+0 GRC 0 2022-01-01T06:00:00 EUR 30
+0 GRC 0 2022-01-01T06:00:00 EUR 30
+0 GRC 0 2022-01-01T06:00:00 EUR 30
 
 # Create a table with timestamp data
 statement ok
 CREATE TABLE src_table (
-	t1 TIMESTAMP,
-	c2 INT
+  t1 TIMESTAMP,
+  c2 INT
 ) AS VALUES
 ('2020-12-10T00:00:00.00Z', 0),
 ('2020-12-11T00:00:00.00Z', 1),
@@ -4592,8 +4563,8 @@ STORED AS CSV;
 # Create a table from the generated CSV files:
 statement ok
 CREATE EXTERNAL TABLE timestamp_table (
-	t1 TIMESTAMP,
-	c2 INT,
+  t1 TIMESTAMP,
+  c2 INT,
 )
 STORED AS CSV
 LOCATION 'test_files/scratch/group_by/timestamp_table'
@@ -4638,11 +4609,10 @@ physical_plan
 01)SortPreservingMergeExec: [max(timestamp_table.t1)@1 DESC], fetch=4
 02)--SortExec: TopK(fetch=4), expr=[max(timestamp_table.t1)@1 DESC], preserve_partitioning=[true]
 03)----AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[max(timestamp_table.t1)], lim=[4]
-04)------CoalesceBatchesExec: target_batch_size=2
-05)--------RepartitionExec: partitioning=Hash([c2@0], 8), input_partitions=8
-06)----------AggregateExec: mode=Partial, gby=[c2@1 as c2], aggr=[max(timestamp_table.t1)], lim=[4]
-07)------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4
-08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/0.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/2.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/3.csv]]}, projection=[t1, c2], file_type=csv, has_header=true
+04)------RepartitionExec: partitioning=Hash([c2@0], 8), input_partitions=8
+05)--------AggregateExec: mode=Partial, gby=[c2@1 as c2], aggr=[max(timestamp_table.t1)], lim=[4]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4
+07)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/0.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/2.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/group_by/timestamp_table/3.csv]]}, projection=[t1, c2], file_type=csv, has_header=true
 
 # Clean up
 statement ok
@@ -5174,10 +5144,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 }"),keywords_stream.ts,Utf8("2000-01-01"))@0 as ts_chunk, count(keywords_stream.keyword)@1 as alert_keyword_count]
 02)--AggregateExec: mode=Single, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 }, ts@0, 946684800000000000) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 }"),keywords_stream.ts,Utf8("2000-01-01"))], aggr=[count(keywords_stream.keyword)]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(keyword@0, keyword@1)]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)--------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(keyword@0, keyword@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[3]
+05)------DataSourceExec: partitions=1, partition_sizes=[3]
 
 query PI
 SELECT
@@ -5219,17 +5188,17 @@ statement ok
 create table t(a int, b bytea) as values (1, 0xa), (1, 0xa), (2, null), (null, 0xb), (null, 0xb);
 
 query I?I
-select a, b, count(*) from t group by grouping sets ((a, b), (a), (b));
+select a, b, count(*) from t group by grouping sets ((a, b), (a), (b)) order by a, b;
 ----
 1 0a 2
-2 NULL 1
-NULL 0b 2
 1 NULL 2
 2 NULL 1
-NULL NULL 2
+2 NULL 1
 NULL 0a 2
-NULL NULL 1
 NULL 0b 2
+NULL 0b 2
+NULL NULL 2
+NULL NULL 1
 
 statement ok
 drop table t;
@@ -5239,13 +5208,13 @@ statement ok
 create table t(a int, b bytea) as values (1, 0xa), (1, 0xa), (2, 0xb), (3, 0xb), (3, 0xb);
 
 query I?I
-select a, b, count(*) from t group by grouping sets ((a, b), (a), (b));
+select a, b, count(*) from t group by grouping sets ((a, b), (a), (b)) order by a, b;
 ----
 1 0a 2
-2 0b 1
-3 0b 2
 1 NULL 2
+2 0b 1
 2 NULL 1
+3 0b 2
 3 NULL 2
 NULL 0a 2
 NULL 0b 3
@@ -5509,7 +5478,7 @@ create table source as values
 ;
 
 statement ok
-create view t as select column1 as a, arrow_cast(column2, 'Timestamp(Nanosecond, None)') as b from source;
+create view t as select column1 as a, arrow_cast(column2, 'Timestamp(ns)') as b from source;
 
 query IPI
 select a, b, count(*) from t group by a, b order by a, b;
@@ -5559,7 +5528,7 @@ SELECT
     arrow_cast('2024-01-01T00:00:00Z'::timestamptz, 'Timestamp(Second, Some("+08:00"))') AS ts
 GROUP BY ts, text
 ----
-foo 2024-01-01T08:00:00+08:00
+foo 2024-01-01T00:00:00+08:00
 
 # Test multi group by int + Decimal128
 statement ok
diff --git a/datafusion/sqllogictest/test_files/grouping.slt b/datafusion/sqllogictest/test_files/grouping.slt
index 64d040d012f99..3d38576bdbf5f 100644
--- a/datafusion/sqllogictest/test_files/grouping.slt
+++ b/datafusion/sqllogictest/test_files/grouping.slt
@@ -212,3 +212,15 @@ select c1, grouping(c1, c2) from test group by CUBE(c1);
 
 statement error zero arguments
 select c1, grouping() from test group by CUBE(c1);
+
+# grouping_sets_with_empty_set
+query I
+SELECT COUNT(*) FROM test GROUP BY GROUPING SETS (());
+----
+2
+
+# grouping_sets_with_empty_set
+query I
+SELECT SUM(v1) FROM generate_series(10) AS t1(v1) GROUP BY GROUPING SETS(())
+----
+55
diff --git a/datafusion/sqllogictest/test_files/grouping_set_repartition.slt b/datafusion/sqllogictest/test_files/grouping_set_repartition.slt
new file mode 100644
index 0000000000000..16ab90651c8b3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/grouping_set_repartition.slt
@@ -0,0 +1,246 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+# Tests for ROLLUP/CUBE/GROUPING SETS with multiple partitions
+#
+# This tests the fix for https://github.com/apache/datafusion/issues/19849
+# where ROLLUP queries produced incorrect results with multiple partitions
+# because subset partitioning satisfaction was incorrectly applied.
+#
+# The bug manifests when:
+# 1. UNION ALL of subqueries each with hash-partitioned aggregates
+# 2. Outer ROLLUP groups by more columns than inner hash partitioning
+# 3. InterleaveExec preserves the inner hash partitioning
+# 4. Optimizer incorrectly uses subset satisfaction, skipping necessary repartition
+#
+# The fix ensures that when hash partitioning includes __grouping_id,
+# subset satisfaction is disabled and proper RepartitionExec is inserted.
+##########
+
+##########
+# SETUP: Create partitioned parquet files to simulate distributed data
+##########
+
+statement ok
+set datafusion.execution.target_partitions = 4;
+
+statement ok
+set datafusion.optimizer.repartition_aggregations = true;
+
+# Create partition 1
+statement ok
+COPY (SELECT column1 as channel, column2 as brand, column3 as amount FROM (VALUES
+    ('store', 'nike', 100),
+    ('store', 'nike', 200),
+    ('store', 'adidas', 150)
+))
+TO 'test_files/scratch/grouping_set_repartition/part=1/data.parquet'
+STORED AS PARQUET;
+
+# Create partition 2
+statement ok
+COPY (SELECT column1 as channel, column2 as brand, column3 as amount FROM (VALUES
+    ('store', 'adidas', 250),
+    ('web', 'nike', 300),
+    ('web', 'nike', 400)
+))
+TO 'test_files/scratch/grouping_set_repartition/part=2/data.parquet'
+STORED AS PARQUET;
+
+# Create partition 3
+statement ok
+COPY (SELECT column1 as channel, column2 as brand, column3 as amount FROM (VALUES
+    ('web', 'adidas', 350),
+    ('web', 'adidas', 450),
+    ('catalog', 'nike', 500)
+))
+TO 'test_files/scratch/grouping_set_repartition/part=3/data.parquet'
+STORED AS PARQUET;
+
+# Create partition 4
+statement ok
+COPY (SELECT column1 as channel, column2 as brand, column3 as amount FROM (VALUES
+    ('catalog', 'nike', 600),
+    ('catalog', 'adidas', 550),
+    ('catalog', 'adidas', 650)
+))
+TO 'test_files/scratch/grouping_set_repartition/part=4/data.parquet'
+STORED AS PARQUET;
+
+# Create external table pointing to the partitioned data
+statement ok
+CREATE EXTERNAL TABLE sales (channel VARCHAR, brand VARCHAR, amount INT)
+STORED AS PARQUET
+PARTITIONED BY (part INT)
+LOCATION 'test_files/scratch/grouping_set_repartition/';
+
+##########
+# TEST 1: UNION ALL + ROLLUP pattern (similar to TPC-DS q14)
+# This query pattern triggers the subset satisfaction bug because:
+# - Each UNION ALL branch has hash partitioning on (brand)
+# - The outer ROLLUP requires hash partitioning on (channel, brand, __grouping_id)
+# - Without the fix, subset satisfaction incorrectly skips repartition
+#
+# Verify the physical plan includes RepartitionExec with __grouping_id
+##########
+
+query TT
+EXPLAIN SELECT channel, brand, SUM(total) as grand_total
+FROM (
+    SELECT 'store' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'store'
+    GROUP BY brand
+    UNION ALL
+    SELECT 'web' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'web'
+    GROUP BY brand
+    UNION ALL
+    SELECT 'catalog' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'catalog'
+    GROUP BY brand
+) sub
+GROUP BY ROLLUP(channel, brand)
+ORDER BY channel NULLS FIRST, brand NULLS FIRST;
+----
+logical_plan
+01)Sort: sub.channel ASC NULLS FIRST, sub.brand ASC NULLS FIRST
+02)--Projection: sub.channel, sub.brand, sum(sub.total) AS grand_total
+03)----Aggregate: groupBy=[[ROLLUP (sub.channel, sub.brand)]], aggr=[[sum(sub.total)]]
+04)------SubqueryAlias: sub
+05)--------Union
+06)----------Projection: Utf8("store") AS channel, sales.brand, sum(sales.amount) AS total
+07)------------Aggregate: groupBy=[[sales.brand]], aggr=[[sum(CAST(sales.amount AS Int64))]]
+08)--------------Projection: sales.brand, sales.amount
+09)----------------Filter: sales.channel = Utf8View("store")
+10)------------------TableScan: sales projection=[channel, brand, amount], partial_filters=[sales.channel = Utf8View("store")]
+11)----------Projection: Utf8("web") AS channel, sales.brand, sum(sales.amount) AS total
+12)------------Aggregate: groupBy=[[sales.brand]], aggr=[[sum(CAST(sales.amount AS Int64))]]
+13)--------------Projection: sales.brand, sales.amount
+14)----------------Filter: sales.channel = Utf8View("web")
+15)------------------TableScan: sales projection=[channel, brand, amount], partial_filters=[sales.channel = Utf8View("web")]
+16)----------Projection: Utf8("catalog") AS channel, sales.brand, sum(sales.amount) AS total
+17)------------Aggregate: groupBy=[[sales.brand]], aggr=[[sum(CAST(sales.amount AS Int64))]]
+18)--------------Projection: sales.brand, sales.amount
+19)----------------Filter: sales.channel = Utf8View("catalog")
+20)------------------TableScan: sales projection=[channel, brand, amount], partial_filters=[sales.channel = Utf8View("catalog")]
+physical_plan
+01)SortPreservingMergeExec: [channel@0 ASC, brand@1 ASC]
+02)--SortExec: expr=[channel@0 ASC, brand@1 ASC], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[channel@0 as channel, brand@1 as brand, sum(sub.total)@3 as grand_total]
+04)------AggregateExec: mode=FinalPartitioned, gby=[channel@0 as channel, brand@1 as brand, __grouping_id@2 as __grouping_id], aggr=[sum(sub.total)]
+05)--------RepartitionExec: partitioning=Hash([channel@0, brand@1, __grouping_id@2], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[(NULL as channel, NULL as brand), (channel@0 as channel, NULL as brand), (channel@0 as channel, brand@1 as brand)], aggr=[sum(sub.total)]
+07)------------InterleaveExec
+08)--------------ProjectionExec: expr=[store as channel, brand@0 as brand, sum(sales.amount)@1 as total]
+09)----------------AggregateExec: mode=FinalPartitioned, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+10)------------------RepartitionExec: partitioning=Hash([brand@0], 4), input_partitions=4
+11)--------------------AggregateExec: mode=Partial, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+12)----------------------FilterExec: channel@0 = store, projection=[brand@1, amount@2]
+13)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=1/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=2/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=3/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=4/data.parquet]]}, projection=[channel, brand, amount], file_type=parquet, predicate=channel@0 = store, pruning_predicate=channel_null_count@2 != row_count@3 AND channel_min@0 <= store AND store <= channel_max@1, required_guarantees=[channel in (store)]
+14)--------------ProjectionExec: expr=[web as channel, brand@0 as brand, sum(sales.amount)@1 as total]
+15)----------------AggregateExec: mode=FinalPartitioned, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+16)------------------RepartitionExec: partitioning=Hash([brand@0], 4), input_partitions=4
+17)--------------------AggregateExec: mode=Partial, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+18)----------------------FilterExec: channel@0 = web, projection=[brand@1, amount@2]
+19)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=1/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=2/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=3/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=4/data.parquet]]}, projection=[channel, brand, amount], file_type=parquet, predicate=channel@0 = web, pruning_predicate=channel_null_count@2 != row_count@3 AND channel_min@0 <= web AND web <= channel_max@1, required_guarantees=[channel in (web)]
+20)--------------ProjectionExec: expr=[catalog as channel, brand@0 as brand, sum(sales.amount)@1 as total]
+21)----------------AggregateExec: mode=FinalPartitioned, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+22)------------------RepartitionExec: partitioning=Hash([brand@0], 4), input_partitions=4
+23)--------------------AggregateExec: mode=Partial, gby=[brand@0 as brand], aggr=[sum(sales.amount)]
+24)----------------------FilterExec: channel@0 = catalog, projection=[brand@1, amount@2]
+25)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=1/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=2/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=3/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/grouping_set_repartition/part=4/data.parquet]]}, projection=[channel, brand, amount], file_type=parquet, predicate=channel@0 = catalog, pruning_predicate=channel_null_count@2 != row_count@3 AND channel_min@0 <= catalog AND catalog <= channel_max@1, required_guarantees=[channel in (catalog)]
+
+query TTI rowsort
+SELECT channel, brand, SUM(total) as grand_total
+FROM (
+    SELECT 'store' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'store'
+    GROUP BY brand
+    UNION ALL
+    SELECT 'web' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'web'
+    GROUP BY brand
+    UNION ALL
+    SELECT 'catalog' as channel, brand, SUM(amount) as total
+    FROM sales WHERE channel = 'catalog'
+    GROUP BY brand
+) sub
+GROUP BY ROLLUP(channel, brand)
+ORDER BY channel NULLS FIRST, brand NULLS FIRST;
+----
+NULL NULL 4500
+catalog NULL 2300
+catalog adidas 1200
+catalog nike 1100
+store NULL 700
+store adidas 400
+store nike 300
+web NULL 1500
+web adidas 800
+web nike 700
+
+##########
+# TEST 2: Simple ROLLUP (baseline test)
+##########
+
+query TTI rowsort
+SELECT channel, brand, SUM(amount) as total
+FROM sales
+GROUP BY ROLLUP(channel, brand)
+ORDER BY channel NULLS FIRST, brand NULLS FIRST;
+----
+NULL NULL 4500
+catalog NULL 2300
+catalog adidas 1200
+catalog nike 1100
+store NULL 700
+store adidas 400
+store nike 300
+web NULL 1500
+web adidas 800
+web nike 700
+
+##########
+# TEST 3: Verify CUBE also works correctly
+##########
+
+query TTI rowsort
+SELECT channel, brand, SUM(amount) as total
+FROM sales
+GROUP BY CUBE(channel, brand)
+ORDER BY channel NULLS FIRST, brand NULLS FIRST;
+----
+NULL NULL 4500
+NULL adidas 2400
+NULL nike 2100
+catalog NULL 2300
+catalog adidas 1200
+catalog nike 1100
+store NULL 700
+store adidas 400
+store nike 300
+web NULL 1500
+web adidas 800
+web nike 700
+
+##########
+# CLEANUP
+##########
+
+statement ok
+DROP TABLE sales;
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 108c844f20b4c..aeeb3481c76b9 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -216,12 +216,16 @@ datafusion.catalog.location NULL
 datafusion.catalog.newlines_in_values false
 datafusion.execution.batch_size 8192
 datafusion.execution.coalesce_batches true
-datafusion.execution.collect_statistics false
+datafusion.execution.collect_statistics true
+datafusion.execution.enable_ansi_mode false
 datafusion.execution.enable_recursive_ctes true
 datafusion.execution.enforce_batch_size_in_joins false
+datafusion.execution.hash_join_buffering_capacity 0
 datafusion.execution.keep_partition_by_columns false
+datafusion.execution.listing_table_factory_infer_partitions true
 datafusion.execution.listing_table_ignore_subdirectory true
 datafusion.execution.max_buffered_batches_per_output_file 2
+datafusion.execution.max_spill_file_size_bytes 134217728
 datafusion.execution.meta_fetch_concurrency 32
 datafusion.execution.minimum_parallel_output_files 4
 datafusion.execution.objectstore_writer_buffer_size 10485760
@@ -241,11 +245,12 @@ datafusion.execution.parquet.dictionary_enabled true
 datafusion.execution.parquet.dictionary_page_size_limit 1048576
 datafusion.execution.parquet.enable_page_index true
 datafusion.execution.parquet.encoding NULL
+datafusion.execution.parquet.force_filter_selections false
+datafusion.execution.parquet.max_predicate_cache_size NULL
 datafusion.execution.parquet.max_row_group_size 1048576
-datafusion.execution.parquet.max_statistics_size 4096
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1
-datafusion.execution.parquet.metadata_size_hint NULL
+datafusion.execution.parquet.metadata_size_hint 524288
 datafusion.execution.parquet.pruning true
 datafusion.execution.parquet.pushdown_filters false
 datafusion.execution.parquet.reorder_filters false
@@ -253,9 +258,11 @@ datafusion.execution.parquet.schema_force_view_types true
 datafusion.execution.parquet.skip_arrow_metadata false
 datafusion.execution.parquet.skip_metadata true
 datafusion.execution.parquet.statistics_enabled page
-datafusion.execution.parquet.statistics_truncate_length NULL
+datafusion.execution.parquet.statistics_truncate_length 64
 datafusion.execution.parquet.write_batch_size 1024
 datafusion.execution.parquet.writer_version 1.0
+datafusion.execution.perfect_hash_join_min_key_density 0.15
+datafusion.execution.perfect_hash_join_small_build_threshold 1024
 datafusion.execution.planning_concurrency 13
 datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8
 datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000
@@ -263,16 +270,19 @@ datafusion.execution.skip_physical_aggregate_schema_check false
 datafusion.execution.soft_max_rows_per_output_file 50000000
 datafusion.execution.sort_in_place_threshold_bytes 1048576
 datafusion.execution.sort_spill_reservation_bytes 10485760
+datafusion.execution.spill_compression uncompressed
 datafusion.execution.split_file_groups_by_statistics false
 datafusion.execution.target_partitions 7
-datafusion.execution.time_zone +00:00
+datafusion.execution.time_zone NULL
 datafusion.execution.use_row_number_estimates_to_optimize_partitioning false
+datafusion.explain.analyze_level dev
 datafusion.explain.format indent
 datafusion.explain.logical_plan_only false
 datafusion.explain.physical_plan_only false
 datafusion.explain.show_schema false
 datafusion.explain.show_sizes true
 datafusion.explain.show_statistics false
+datafusion.explain.tree_maximum_render_width 240
 datafusion.format.date_format %Y-%m-%d
 datafusion.format.datetime_format %Y-%m-%dT%H:%M:%S%.f
 datafusion.format.duration_format pretty
@@ -284,17 +294,29 @@ datafusion.format.timestamp_tz_format NULL
 datafusion.format.types_info false
 datafusion.optimizer.allow_symmetric_joins_without_pruning true
 datafusion.optimizer.default_filter_selectivity 20
+datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown true
 datafusion.optimizer.enable_distinct_aggregation_soft_limit true
+datafusion.optimizer.enable_dynamic_filter_pushdown true
+datafusion.optimizer.enable_join_dynamic_filter_pushdown true
+datafusion.optimizer.enable_leaf_expression_pushdown true
+datafusion.optimizer.enable_piecewise_merge_join false
 datafusion.optimizer.enable_round_robin_repartition true
+datafusion.optimizer.enable_sort_pushdown true
 datafusion.optimizer.enable_topk_aggregation true
+datafusion.optimizer.enable_topk_dynamic_filter_pushdown true
+datafusion.optimizer.enable_topk_repartition true
+datafusion.optimizer.enable_window_limits true
 datafusion.optimizer.expand_views_at_output false
 datafusion.optimizer.filter_null_join_keys false
+datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values 150
+datafusion.optimizer.hash_join_inlist_pushdown_max_size 131072
 datafusion.optimizer.hash_join_single_partition_threshold 1048576
 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072
 datafusion.optimizer.max_passes 3
 datafusion.optimizer.prefer_existing_sort false
 datafusion.optimizer.prefer_existing_union false
 datafusion.optimizer.prefer_hash_join true
+datafusion.optimizer.preserve_file_partitions 0
 datafusion.optimizer.repartition_aggregations true
 datafusion.optimizer.repartition_file_min_size 10485760
 datafusion.optimizer.repartition_file_scans true
@@ -302,12 +324,20 @@ datafusion.optimizer.repartition_joins true
 datafusion.optimizer.repartition_sorts true
 datafusion.optimizer.repartition_windows true
 datafusion.optimizer.skip_failed_rules false
+datafusion.optimizer.subset_repartition_threshold 4
 datafusion.optimizer.top_down_join_key_reordering true
+datafusion.runtime.list_files_cache_limit 1M
+datafusion.runtime.list_files_cache_ttl NULL
+datafusion.runtime.max_temp_directory_size 100G
+datafusion.runtime.memory_limit unlimited
+datafusion.runtime.metadata_cache_limit 50M
+datafusion.runtime.temp_directory NULL
 datafusion.sql_parser.collect_spans false
+datafusion.sql_parser.default_null_ordering nulls_max
 datafusion.sql_parser.dialect generic
 datafusion.sql_parser.enable_ident_normalization true
 datafusion.sql_parser.enable_options_value_normalization false
-datafusion.sql_parser.map_varchar_to_utf8view true
+datafusion.sql_parser.map_string_types_to_utf8view true
 datafusion.sql_parser.parse_float_as_decimal false
 datafusion.sql_parser.recursion_limit 50
 datafusion.sql_parser.support_varchar_with_length true
@@ -326,12 +356,16 @@ datafusion.catalog.location NULL Location scanned to load tables for `default` s
 datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
 datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
 datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
-datafusion.execution.collect_statistics false Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false.
+datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true.
+datafusion.execution.enable_ansi_mode false Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero,   numeric overflow, or invalid casts raise runtime errors rather than returning   `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum   representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default.
 datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs
 datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.
+datafusion.execution.hash_join_buffering_capacity 0 How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it.
 datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches
+datafusion.execution.listing_table_factory_infer_partitions true Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema).
 datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`).
 datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption
+datafusion.execution.max_spill_file_size_bytes 134217728 Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB
 datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics
 datafusion.execution.minimum_parallel_output_files 4 Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached.
 datafusion.execution.objectstore_writer_buffer_size 10485760 Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point.
@@ -343,7 +377,7 @@ datafusion.execution.parquet.bloom_filter_on_read true (reading) Use any availab
 datafusion.execution.parquet.bloom_filter_on_write false (writing) Write bloom filters for all columns when creating parquet files
 datafusion.execution.parquet.coerce_int96 NULL (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution.
 datafusion.execution.parquet.column_index_truncate_length 64 (writing) Sets column index truncate length
-datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting.
+datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting.
 datafusion.execution.parquet.created_by datafusion (writing) Sets "created by" property
 datafusion.execution.parquet.data_page_row_count_limit 20000 (writing) Sets best effort maximum number of rows in data page
 datafusion.execution.parquet.data_pagesize_limit 1048576 (writing) Sets best effort maximum size of data page in bytes
@@ -351,11 +385,12 @@ datafusion.execution.parquet.dictionary_enabled true (writing) Sets if dictionar
 datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets best effort maximum dictionary page size, in bytes
 datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL (writing)  Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.force_filter_selections false (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows.
+datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching.
 datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
-datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
-datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
+datafusion.execution.parquet.metadata_size_hint 524288 (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed.
 datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
 datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
 datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
@@ -363,9 +398,11 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par
 datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>
 datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
 datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
-datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statictics truncate length. If NULL, uses default parquet writer setting
-datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes
+datafusion.execution.parquet.statistics_truncate_length 64 (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in rows
 datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0"
+datafusion.execution.perfect_hash_join_min_key_density 0.15 The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future.
+datafusion.execution.perfect_hash_join_small_build_threshold 1024 A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future.
 datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
 datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8 Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input
 datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode
@@ -373,16 +410,19 @@ datafusion.execution.skip_physical_aggregate_schema_check false When set to true
 datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max
 datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.
 datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).
+datafusion.execution.spill_compression uncompressed Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed.
 datafusion.execution.split_file_groups_by_statistics false Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental
 datafusion.execution.target_partitions 7 Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system
-datafusion.execution.time_zone +00:00 The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour
+datafusion.execution.time_zone NULL The default time zone Some functions, e.g. `now` return timestamps in this time zone
 datafusion.execution.use_row_number_estimates_to_optimize_partitioning false Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future.
+datafusion.explain.analyze_level dev Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers.
 datafusion.explain.format indent Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format.
 datafusion.explain.logical_plan_only false When set to true, the explain statement will only print logical plans
 datafusion.explain.physical_plan_only false When set to true, the explain statement will only print physical plans
 datafusion.explain.show_schema false When set to true, the explain statement will print schema information
 datafusion.explain.show_sizes true When set to true, the explain statement will print the partition sizes
 datafusion.explain.show_statistics false When set to true, the explain statement will print operator statistics for physical plans
+datafusion.explain.tree_maximum_render_width 240 (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit.
 datafusion.format.date_format %Y-%m-%d Date format for date arrays
 datafusion.format.datetime_format %Y-%m-%dT%H:%M:%S%.f Format for DateTime arrays
 datafusion.format.duration_format pretty Duration format. Can be either `"pretty"` or `"ISO8601"`
@@ -394,17 +434,29 @@ datafusion.format.timestamp_tz_format NULL Timestamp format for timestamp with t
 datafusion.format.types_info false Show types in visual representation batches
 datafusion.optimizer.allow_symmetric_joins_without_pruning true Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors.
 datafusion.optimizer.default_filter_selectivity 20 The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected).
+datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase.
 datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read.
+datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.
+datafusion.optimizer.enable_join_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase.
+datafusion.optimizer.enable_leaf_expression_pushdown true When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes.
+datafusion.optimizer.enable_piecewise_merge_join false When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter.
 datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores
+datafusion.optimizer.enable_sort_pushdown true Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true
 datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible
+datafusion.optimizer.enable_topk_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase.
+datafusion.optimizer.enable_topk_repartition true When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle.
+datafusion.optimizer.enable_window_limits true When set to true, the optimizer will attempt to push limit operations past window functions, if possible
 datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.
 datafusion.optimizer.filter_null_join_keys false When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.
+datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values 150 Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds>
+datafusion.optimizer.hash_join_inlist_pushdown_max_size 131072 Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins.
 datafusion.optimizer.hash_join_single_partition_threshold 1048576 The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition
 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition
 datafusion.optimizer.max_passes 3 Number of times that the optimizer will attempt to optimize the plan
 datafusion.optimizer.prefer_existing_sort false When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec`  and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.
 datafusion.optimizer.prefer_existing_union false When set to true, the optimizer will not attempt to convert Union to Interleave
 datafusion.optimizer.prefer_hash_join true When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory
+datafusion.optimizer.preserve_file_partitions 0 Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used:     - preserve_file_partitions=0: Disable it.     - preserve_file_partitions=1: Always enable it.     - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N.     This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions.
 datafusion.optimizer.repartition_aggregations true Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level
 datafusion.optimizer.repartition_file_min_size 10485760 Minimum total files size in bytes to perform file scan repartitioning.
 datafusion.optimizer.repartition_file_scans true When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation.
@@ -412,12 +464,20 @@ datafusion.optimizer.repartition_joins true Should DataFusion repartition data u
 datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below ```text      "SortExec: [a@0 ASC]",      "  CoalescePartitionsExec",      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ``` would turn into the plan below which performs better in multithreaded environments ```text      "SortPreservingMergeExec: [a@0 ASC]",      "  SortExec: [a@0 ASC]",      "    RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ```
 datafusion.optimizer.repartition_windows true Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level
 datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail
+datafusion.optimizer.subset_repartition_threshold 4 Partition count threshold for subset satisfaction optimization. When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): ```text     Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a])     If current partitions (3) < threshold (4), repartition:     AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)]       RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3         AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)]           DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3)     If current partitions (8) >= threshold (4), use subset satisfaction:     AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)]       DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ```
 datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys
+datafusion.runtime.list_files_cache_limit 1M Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.
+datafusion.runtime.list_files_cache_ttl NULL TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes.
+datafusion.runtime.max_temp_directory_size 100G Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.
+datafusion.runtime.memory_limit unlimited Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.
+datafusion.runtime.metadata_cache_limit 50M Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.
+datafusion.runtime.temp_directory NULL The path to the temporary file directory.
 datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes.
+datafusion.sql_parser.default_null_ordering nulls_max Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: <https://www.postgresql.org/docs/current/queries-order.html>
 datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
 datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
 datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.
-datafusion.sql_parser.map_varchar_to_utf8view true If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8`  during SQL planning. Default is false.
+datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true.
 datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type
 datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries
 datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.
@@ -439,14 +499,14 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch
 query TT
 SHOW TIME ZONE
 ----
-datafusion.execution.time_zone +00:00
+datafusion.execution.time_zone NULL
 
 # show_timezone_default_utc
 # https://github.com/apache/datafusion/issues/3255
 query TT
 SHOW TIMEZONE
 ----
-datafusion.execution.time_zone +00:00
+datafusion.execution.time_zone NULL
 
 
 # show_time_zone_default_utc_verbose
@@ -454,14 +514,14 @@ datafusion.execution.time_zone +00:00
 query TTT
 SHOW TIME ZONE VERBOSE
 ----
-datafusion.execution.time_zone +00:00 The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour
+datafusion.execution.time_zone NULL The default time zone Some functions, e.g. `now` return timestamps in this time zone
 
 # show_timezone_default_utc
 # https://github.com/apache/datafusion/issues/3255
 query TTT
 SHOW TIMEZONE VERBOSE
 ----
-datafusion.execution.time_zone +00:00 The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour
+datafusion.execution.time_zone NULL The default time zone Some functions, e.g. `now` return timestamps in this time zone
 
 
 # show empty verbose
@@ -685,6 +745,54 @@ SHOW CREATE TABLE abc;
 ----
 datafusion public abc CREATE EXTERNAL TABLE abc STORED AS CSV LOCATION ../../testing/data/csv/aggregate_test_100.csv
 
+# show_external_create_table_with_order
+statement ok
+CREATE EXTERNAL TABLE abc_ordered
+STORED AS CSV
+WITH ORDER (c1)
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');
+
+query TTTT
+SHOW CREATE TABLE abc_ordered;
+----
+datafusion public abc_ordered CREATE EXTERNAL TABLE abc_ordered STORED AS CSV WITH ORDER (c1) LOCATION ../../testing/data/csv/aggregate_test_100.csv
+
+statement ok
+DROP TABLE abc_ordered;
+
+# show_external_create_table_with_multiple_order_columns
+statement ok
+CREATE EXTERNAL TABLE abc_multi_order
+STORED AS CSV
+WITH ORDER (c1, c2 DESC)
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');
+
+query TTTT
+SHOW CREATE TABLE abc_multi_order;
+----
+datafusion public abc_multi_order CREATE EXTERNAL TABLE abc_multi_order STORED AS CSV WITH ORDER (c1, c2 DESC) LOCATION ../../testing/data/csv/aggregate_test_100.csv
+
+statement ok
+DROP TABLE abc_multi_order;
+
+# show_external_create_table_with_order_nulls
+statement ok
+CREATE EXTERNAL TABLE abc_order_nulls
+STORED AS CSV
+WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST)
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');
+
+query TTTT
+SHOW CREATE TABLE abc_order_nulls;
+----
+datafusion public abc_order_nulls CREATE EXTERNAL TABLE abc_order_nulls STORED AS CSV WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST) LOCATION ../../testing/data/csv/aggregate_test_100.csv
+
+statement ok
+DROP TABLE abc_order_nulls;
+
 # string_agg has different arg_types but same return type. Test avoiding duplicate entries for the same function.
 query TTT
 select routine_name, data_type, function_type from information_schema.routines where routine_name = 'string_agg';
@@ -695,14 +803,11 @@ string_agg String AGGREGATE
 query TTTTTTTBTTTT rowsort
 select * from information_schema.routines where routine_name = 'date_trunc' OR routine_name = 'string_agg' OR routine_name = 'rank' ORDER BY routine_name
 ----
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Microsecond, None) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Microsecond, Some("+TZ")) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Millisecond, None) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Millisecond, Some("+TZ")) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Nanosecond, None) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Nanosecond, Some("+TZ")) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Second, None) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(Second, Some("+TZ")) SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
+datafusion public date_trunc datafusion public date_trunc FUNCTION true Date SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+datafusion public date_trunc datafusion public date_trunc FUNCTION true String SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+datafusion public date_trunc datafusion public date_trunc FUNCTION true Time(ns) SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(ns) SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+datafusion public date_trunc datafusion public date_trunc FUNCTION true Timestamp(ns, "+TZ") SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
 datafusion public rank datafusion public rank FUNCTION true NULL WINDOW Returns the rank of the current row within its partition, allowing gaps between ranks. This function provides a ranking similar to `row_number`, but skips ranks for identical values. rank()
 datafusion public string_agg datafusion public string_agg FUNCTION true String AGGREGATE Concatenates the values of string expressions and places separator values between them. If ordering is required, strings are concatenated in the specified order. This aggregation function can only mix DISTINCT and ORDER BY if the ordering expression is exactly the same as the first argument expression. string_agg([DISTINCT] expression, delimiter [ORDER BY expression])
 
@@ -715,30 +820,21 @@ false
 query TTTITTTTBI
 select * from information_schema.parameters where specific_name = 'date_trunc' OR specific_name = 'string_agg' OR specific_name = 'rank' ORDER BY specific_name, rid, data_type;
 ----
+datafusion public date_trunc 1 OUT NULL Date NULL false 0
+datafusion public date_trunc 2 IN expression Date NULL false 0
 datafusion public date_trunc 1 IN precision String NULL false 0
-datafusion public date_trunc 2 IN expression Timestamp(Microsecond, None) NULL false 0
-datafusion public date_trunc 1 OUT NULL Timestamp(Microsecond, None) NULL false 0
 datafusion public date_trunc 1 IN precision String NULL false 1
-datafusion public date_trunc 2 IN expression Timestamp(Microsecond, Some("+TZ")) NULL false 1
-datafusion public date_trunc 1 OUT NULL Timestamp(Microsecond, Some("+TZ")) NULL false 1
+datafusion public date_trunc 2 IN expression String NULL false 1
+datafusion public date_trunc 1 OUT NULL String NULL false 1
 datafusion public date_trunc 1 IN precision String NULL false 2
-datafusion public date_trunc 2 IN expression Timestamp(Millisecond, None) NULL false 2
-datafusion public date_trunc 1 OUT NULL Timestamp(Millisecond, None) NULL false 2
+datafusion public date_trunc 2 IN expression Time(ns) NULL false 2
+datafusion public date_trunc 1 OUT NULL Time(ns) NULL false 2
 datafusion public date_trunc 1 IN precision String NULL false 3
-datafusion public date_trunc 2 IN expression Timestamp(Millisecond, Some("+TZ")) NULL false 3
-datafusion public date_trunc 1 OUT NULL Timestamp(Millisecond, Some("+TZ")) NULL false 3
+datafusion public date_trunc 2 IN expression Timestamp(ns) NULL false 3
+datafusion public date_trunc 1 OUT NULL Timestamp(ns) NULL false 3
 datafusion public date_trunc 1 IN precision String NULL false 4
-datafusion public date_trunc 2 IN expression Timestamp(Nanosecond, None) NULL false 4
-datafusion public date_trunc 1 OUT NULL Timestamp(Nanosecond, None) NULL false 4
-datafusion public date_trunc 1 IN precision String NULL false 5
-datafusion public date_trunc 2 IN expression Timestamp(Nanosecond, Some("+TZ")) NULL false 5
-datafusion public date_trunc 1 OUT NULL Timestamp(Nanosecond, Some("+TZ")) NULL false 5
-datafusion public date_trunc 1 IN precision String NULL false 6
-datafusion public date_trunc 2 IN expression Timestamp(Second, None) NULL false 6
-datafusion public date_trunc 1 OUT NULL Timestamp(Second, None) NULL false 6
-datafusion public date_trunc 1 IN precision String NULL false 7
-datafusion public date_trunc 2 IN expression Timestamp(Second, Some("+TZ")) NULL false 7
-datafusion public date_trunc 1 OUT NULL Timestamp(Second, Some("+TZ")) NULL false 7
+datafusion public date_trunc 2 IN expression Timestamp(ns, "+TZ") NULL false 4
+datafusion public date_trunc 1 OUT NULL Timestamp(ns, "+TZ") NULL false 4
 datafusion public string_agg 2 IN delimiter Null NULL false 0
 datafusion public string_agg 1 IN expression String NULL false 0
 datafusion public string_agg 1 OUT NULL String NULL false 0
@@ -764,14 +860,11 @@ repeat String 1 OUT 0
 query TT??TTT rowsort
 show functions like 'date_trunc';
 ----
-date_trunc Timestamp(Microsecond, None) [precision, expression] [String, Timestamp(Microsecond, None)] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Microsecond, Some("+TZ")) [precision, expression] [String, Timestamp(Microsecond, Some("+TZ"))] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Millisecond, None) [precision, expression] [String, Timestamp(Millisecond, None)] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Millisecond, Some("+TZ")) [precision, expression] [String, Timestamp(Millisecond, Some("+TZ"))] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Nanosecond, None) [precision, expression] [String, Timestamp(Nanosecond, None)] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Nanosecond, Some("+TZ")) [precision, expression] [String, Timestamp(Nanosecond, Some("+TZ"))] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Second, None) [precision, expression] [String, Timestamp(Second, None)] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
-date_trunc Timestamp(Second, Some("+TZ")) [precision, expression] [String, Timestamp(Second, Some("+TZ"))] SCALAR Truncates a timestamp value to a specified precision. date_trunc(precision, expression)
+date_trunc Date [precision, expression] [String, Date] SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+date_trunc String [precision, expression] [String, String] SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+date_trunc Time(ns) [precision, expression] [String, Time(ns)] SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+date_trunc Timestamp(ns) [precision, expression] [String, Timestamp(ns)] SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
+date_trunc Timestamp(ns, "+TZ") [precision, expression] [String, Timestamp(ns, "+TZ")] SCALAR Truncates a timestamp or time value to a specified precision. date_trunc(precision, expression)
 
 statement ok
 show functions
diff --git a/datafusion/sqllogictest/test_files/information_schema_columns.slt b/datafusion/sqllogictest/test_files/information_schema_columns.slt
index d348a764fa85f..c733b3baa7a47 100644
--- a/datafusion/sqllogictest/test_files/information_schema_columns.slt
+++ b/datafusion/sqllogictest/test_files/information_schema_columns.slt
@@ -42,7 +42,7 @@ my_catalog my_schema table_with_many_types float64_col 1 NULL YES Float64 NULL N
 my_catalog my_schema table_with_many_types int32_col 0 NULL NO Int32 NULL NULL 32 2 NULL NULL NULL
 my_catalog my_schema table_with_many_types large_binary_col 5 NULL NO LargeBinary NULL 9223372036854775807 NULL NULL NULL NULL NULL
 my_catalog my_schema table_with_many_types large_utf8_col 3 NULL NO LargeUtf8 NULL 9223372036854775807 NULL NULL NULL NULL NULL
-my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(Nanosecond, None) NULL NULL NULL NULL NULL NULL NULL
+my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(ns) NULL NULL NULL NULL NULL NULL NULL
 my_catalog my_schema table_with_many_types utf8_col 2 NULL YES Utf8 NULL 2147483647 NULL NULL NULL NULL NULL
 
 # Cleanup
diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt
index 8a9c01d36308d..e7b9e77dfef58 100644
--- a/datafusion/sqllogictest/test_files/insert.slt
+++ b/datafusion/sqllogictest/test_files/insert.slt
@@ -68,12 +68,10 @@ physical_plan
 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2]
 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST]
 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1]
-05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
+07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
 
 query I
 INSERT INTO table_without_values SELECT
@@ -128,12 +126,10 @@ physical_plan
 01)DataSinkExec: sink=MemoryTable (partitions=1)
 02)--CoalescePartitionsExec
 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
+06)----------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
 
 
 
@@ -169,7 +165,7 @@ ORDER BY c1
 ----
 logical_plan
 01)Dml: op=[Insert Into] table=[table_without_values]
-02)--Projection: a1 AS a1, a2 AS a2
+02)--Projection: a1, a2
 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST
 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1
 05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]]
@@ -179,12 +175,10 @@ physical_plan
 02)--ProjectionExec: expr=[a1@0 as a1, a2@1 as a2]
 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST]
 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as a2, c1@0 as c1]
-05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
+07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
 
 
 query I
diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt
index 24982dfc28a75..4702b0b9ca3b6 100644
--- a/datafusion/sqllogictest/test_files/insert_to_external.slt
+++ b/datafusion/sqllogictest/test_files/insert_to_external.slt
@@ -175,6 +175,34 @@ select * from partitioned_insert_test order by a,b,c
 1 20 200
 2 20 200
 
+statement count 0
+CREATE EXTERNAL TABLE
+partitioned_insert_test_readback
+STORED AS csv
+LOCATION 'test_files/scratch/insert_to_external/insert_to_partitioned/';
+
+query TTT
+describe partitioned_insert_test_readback;
+----
+c Int64 YES
+a Dictionary(UInt16, Utf8) NO
+b Dictionary(UInt16, Utf8) NO
+
+query ITT
+select * from partitioned_insert_test_readback order by a,b,c;
+----
+1 10 100
+1 10 200
+1 20 100
+2 20 100
+1 20 200
+2 20 200
+
+query I
+select count(*) from partitioned_insert_test_readback where b=100;
+----
+3
+
 statement ok
 CREATE EXTERNAL TABLE
 partitioned_insert_test_verify(c bigint)
@@ -333,6 +361,41 @@ select * from directory_test;
 1 2
 3 4
 
+statement count 0
+CREATE EXTERNAL TABLE
+directory_with_dots_test(a bigint, b bigint)
+STORED AS parquet
+LOCATION 'test_files/scratch/insert_to_external/external_versioned_parquet_table.v0/';
+
+query I
+INSERT INTO directory_with_dots_test values (1, 2), (3, 4);
+----
+2
+
+query II
+select * from directory_with_dots_test;
+----
+1 2
+3 4
+
+statement count 0
+CREATE EXTERNAL TABLE
+directory_with_dots_readback
+STORED AS parquet
+LOCATION 'test_files/scratch/insert_to_external/external_versioned_parquet_table.v0/';
+
+query TTT
+describe directory_with_dots_readback
+----
+a Int64 YES
+b Int64 YES
+
+query II
+select * from directory_with_dots_readback
+----
+1 2
+3 4
+
 statement ok
 CREATE EXTERNAL TABLE
 table_without_values(field1 BIGINT NULL, field2 BIGINT NULL)
@@ -359,12 +422,10 @@ physical_plan
 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2]
 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST]
 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1]
-05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
+07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
 
 query I
 INSERT INTO table_without_values SELECT
@@ -420,12 +481,10 @@ physical_plan
 01)DataSinkExec: sink=ParquetSink(file_groups=[])
 02)--CoalescePartitionsExec
 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
+06)----------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c4, c9], file_type=csv, has_header=true
 
 
 
diff --git a/datafusion/sqllogictest/test_files/issue_17138.slt b/datafusion/sqllogictest/test_files/issue_17138.slt
new file mode 100644
index 0000000000000..de9cb4bcf77bb
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/issue_17138.slt
@@ -0,0 +1,36 @@
+statement ok
+CREATE TABLE tab1(col0 INTEGER, col1 INTEGER, col2 INTEGER)
+
+statement ok
+INSERT INTO tab1 VALUES(51,14,96)
+
+query R
+SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1
+----
+NULL
+
+query TT
+EXPLAIN SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1
+----
+logical_plan
+01)Projection: Float64(NULL) AS col0
+02)--EmptyRelation: rows=1
+physical_plan
+01)ProjectionExec: expr=[NULL as col0]
+02)--PlaceholderRowExec
+
+# Similar, with a few more arithmetic operations
+query R
+SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1
+----
+NULL
+
+query TT
+EXPLAIN SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1
+----
+logical_plan
+01)Projection: Float64(NULL) AS col0
+02)--EmptyRelation: rows=1
+physical_plan
+01)ProjectionExec: expr=[NULL as col0]
+02)--PlaceholderRowExec
diff --git a/datafusion/sqllogictest/test_files/join.slt.part b/datafusion/sqllogictest/test_files/join.slt.part
index 19763ab0083f8..c0a838c97d552 100644
--- a/datafusion/sqllogictest/test_files/join.slt.part
+++ b/datafusion/sqllogictest/test_files/join.slt.part
@@ -681,7 +681,7 @@ select col2, col4 from t1 full outer join t2 on col1 = col3
 query TT
 explain select * from t1 join t2 on false;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 physical_plan EmptyExec
 
 # Make batch size smaller than table row number. to introduce parallelism to the plan.
@@ -776,10 +776,9 @@ logical_plan
 03)--SubqueryAlias: t2
 04)----TableScan: t1 projection=[a, b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Reset the configs to old values
 statement ok
@@ -849,51 +848,52 @@ logical_plan
 05)----TableScan: department projection=[dept_name]
 physical_plan
 01)ProjectionExec: expr=[emp_id@1 as emp_id, name@2 as name, dept_name@0 as dept_name]
-02)--NestedLoopJoinExec: join_type=Right, filter=name@0 = Alice OR name@0 = Bob
+02)--NestedLoopJoinExec: join_type=Right, filter=join_proj_push_down_1@0, projection=[dept_name@0, emp_id@1, name@2]
 03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----ProjectionExec: expr=[emp_id@0 as emp_id, name@1 as name, name@1 = Alice OR name@1 = Bob as join_proj_push_down_1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
 
-query ITT
+query ITT rowsort
 SELECT e.emp_id, e.name, d.dept_name
 FROM employees AS e
 LEFT JOIN department AS d
 ON (e.name = 'Alice' OR e.name = 'Bob');
 ----
-1 Alice HR
 1 Alice Engineering
+1 Alice HR
 1 Alice Sales
-2 Bob HR
 2 Bob Engineering
+2 Bob HR
 2 Bob Sales
 3 Carol NULL
 
 # neither RIGHT OUTER JOIN
-query ITT
+query ITT rowsort
 SELECT e.emp_id, e.name, d.dept_name
 FROM department AS d
 RIGHT JOIN employees AS e
 ON (e.name = 'Alice' OR e.name = 'Bob');
 ----
-1 Alice HR
 1 Alice Engineering
+1 Alice HR
 1 Alice Sales
-2 Bob HR
 2 Bob Engineering
+2 Bob HR
 2 Bob Sales
 3 Carol NULL
 
 # neither FULL OUTER JOIN
-query ITT
+query ITT rowsort
 SELECT e.emp_id, e.name, d.dept_name
 FROM department AS d
 FULL JOIN employees AS e
 ON (e.name = 'Alice' OR e.name = 'Bob');
 ----
-1 Alice HR
 1 Alice Engineering
+1 Alice HR
 1 Alice Sales
-2 Bob HR
 2 Bob Engineering
+2 Bob HR
 2 Bob Sales
 3 Carol NULL
 
@@ -935,10 +935,9 @@ logical_plan
 06)----TableScan: department projection=[dept_name]
 physical_plan
 01)CrossJoinExec
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: name@1 = Alice OR name@1 = Bob
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)--DataSourceExec: partitions=1, partition_sizes=[1]
+02)--FilterExec: name@1 = Alice OR name@1 = Bob
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # expect no row for Carol
 query ITT
@@ -974,24 +973,21 @@ ON e.emp_id = d.emp_id
 WHERE ((dept_name != 'Engineering' AND e.name = 'Alice') OR (name != 'Alice' AND e.name = 'Carol'));
 ----
 logical_plan
-01)Filter: d.dept_name != Utf8View("Engineering") AND e.name = Utf8View("Alice") OR e.name != Utf8View("Alice") AND e.name = Utf8View("Carol")
+01)Filter: d.dept_name != Utf8View("Engineering") AND e.name = Utf8View("Alice") OR e.name = Utf8View("Carol")
 02)--Projection: e.emp_id, e.name, d.dept_name
 03)----Left Join: e.emp_id = d.emp_id
 04)------SubqueryAlias: e
-05)--------Filter: employees.name = Utf8View("Alice") OR employees.name != Utf8View("Alice") AND employees.name = Utf8View("Carol")
+05)--------Filter: employees.name = Utf8View("Alice") OR employees.name = Utf8View("Carol")
 06)----------TableScan: employees projection=[emp_id, name]
 07)------SubqueryAlias: d
 08)--------TableScan: department projection=[emp_id, dept_name]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: dept_name@2 != Engineering AND name@1 = Alice OR name@1 != Alice AND name@1 = Carol
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------HashJoinExec: mode=CollectLeft, join_type=Left, on=[(emp_id@0, emp_id@0)], projection=[emp_id@0, name@1, dept_name@3]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: name@1 = Alice OR name@1 != Alice AND name@1 = Carol
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)----------DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: dept_name@2 != Engineering AND name@1 = Alice OR name@1 = Carol
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----HashJoinExec: mode=CollectLeft, join_type=Left, on=[(emp_id@0, emp_id@0)], projection=[emp_id@0, name@1, dept_name@3]
+04)------FilterExec: name@1 = Alice OR name@1 = Carol
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITT
 SELECT e.emp_id, e.name, d.dept_name
@@ -1179,16 +1175,14 @@ logical_plan
 06)--------TableScan: t5 projection=[v0, v1, v2, v3, v4]
 07)----TableScan: t0 projection=[v0, v1]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(t1.v0 AS Float64)@6, v1@1)], filter=v1@1 + CAST(v0@0 AS Float64) > 0, projection=[v0@0, v1@1, v2@3, v3@4, v4@5, v0@7, v1@8]
-03)----CoalescePartitionsExec
-04)------ProjectionExec: expr=[v0@0 as v0, v1@1 as v1, v0@2 as v0, v2@3 as v2, v3@4 as v3, v4@5 as v4, CAST(v0@0 AS Float64) as CAST(t1.v0 AS Float64)]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0), (v1@1, v1@1)], projection=[v0@0, v1@1, v0@2, v2@4, v3@5, v4@6]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[0]
-09)--------------DataSourceExec: partitions=1, partition_sizes=[0]
-10)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(t1.v0 AS Float64)@6, v1@1)], filter=v1@1 + CAST(v0@0 AS Float64) > 0, projection=[v0@0, v1@1, v2@3, v3@4, v4@5, v0@7, v1@8]
+02)--CoalescePartitionsExec
+03)----ProjectionExec: expr=[v0@0 as v0, v1@1 as v1, v0@2 as v0, v2@3 as v2, v3@4 as v3, v4@5 as v4, CAST(v0@0 AS Float64) as CAST(t1.v0 AS Float64)]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0), (v1@1, v1@1)], projection=[v0@0, v1@1, v0@2, v2@4, v3@5, v4@6]
+06)----------DataSourceExec: partitions=1, partition_sizes=[0]
+07)----------DataSourceExec: partitions=1, partition_sizes=[0]
+08)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 
 
@@ -1374,20 +1368,17 @@ logical_plan
 07)--TableScan: s projection=[b]
 physical_plan
 01)ProjectionExec: expr=[col0@1 as col0, col1@2 as col1, a@3 as a, b@0 as b]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(s.b AS Int64)@1, col1@1)], projection=[b@0, col0@2, col1@3, a@4]
-04)------ProjectionExec: expr=[b@0 as b, CAST(b@0 AS Int64) as CAST(s.b AS Int64)]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)------ProjectionExec: expr=[col0@1 as col0, col1@2 as col1, a@0 as a]
-07)--------CoalesceBatchesExec: target_batch_size=8192
-08)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(f.a AS Int64)@1, col0@0)], projection=[a@0, col0@2, col1@3]
-09)------------ProjectionExec: expr=[a@0 as a, CAST(a@0 AS Int64) as CAST(f.a AS Int64)]
-10)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-11)------------ProjectionExec: expr=[CAST(x@0 AS Int64) + 1 as col0, CAST(y@1 AS Int64) + 1 as col1]
-12)--------------RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
-13)----------------CoalesceBatchesExec: target_batch_size=8192
-14)------------------FilterExec: y@1 = x@0
-15)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(s.b AS Int64)@1, col1@1)], projection=[b@0, col0@2, col1@3, a@4]
+03)----ProjectionExec: expr=[b@0 as b, CAST(b@0 AS Int64) as CAST(s.b AS Int64)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)----ProjectionExec: expr=[col0@1 as col0, col1@2 as col1, a@0 as a]
+06)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(f.a AS Int64)@1, col0@0)], projection=[a@0, col0@2, col1@3]
+07)--------ProjectionExec: expr=[a@0 as a, CAST(a@0 AS Int64) as CAST(f.a AS Int64)]
+08)----------DataSourceExec: partitions=1, partition_sizes=[1]
+09)--------ProjectionExec: expr=[CAST(x@0 AS Int64) + 1 as col0, CAST(y@1 AS Int64) + 1 as col1]
+10)----------RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
+11)------------FilterExec: y@1 = x@0
+12)--------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 drop table pairs;
@@ -1431,17 +1422,14 @@ logical_plan
 06)--------TableScan: t1 projection=[v0, v1]
 physical_plan
 01)ProjectionExec: expr=[v0@1 as v0, v1@2 as v1, sum(t1.v1)@0 as sum(t1.v1)]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(v0@1, v0@0)], projection=[sum(t1.v1)@0, v0@2, v1@3]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[sum(t1.v1)@1 as sum(t1.v1), v0@0 as v0]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[v0@0 as v0], aggr=[sum(t1.v1)]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([v0@0], 4), input_partitions=4
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)------------------AggregateExec: mode=Partial, gby=[v0@0 as v0], aggr=[sum(t1.v1)]
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(v0@1, v0@0)], projection=[sum(t1.v1)@0, v0@2, v1@3]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[sum(t1.v1)@1 as sum(t1.v1), v0@0 as v0]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[v0@0 as v0], aggr=[sum(t1.v1)]
+06)----------RepartitionExec: partitioning=Hash([v0@0], 4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[v0@0 as v0], aggr=[sum(t1.v1)]
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+09)----DataSourceExec: partitions=1, partition_sizes=[1]
 
 query III
 SELECT *
@@ -1461,10 +1449,9 @@ logical_plan
 02)--TableScan: t0 projection=[v0, v1]
 03)--TableScan: t1 projection=[v0, v1]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0)]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IIII
 SELECT * FROM t0, LATERAL (SELECT * FROM t1 WHERE t0.v0 = t1.v0);
@@ -1503,3 +1490,13 @@ drop table t1;
 
 statement ok
 drop table t0;
+
+# SQLancer fuzzed query (https://github.com/apache/datafusion/issues/14015)
+statement ok
+create table t1(v1 int, v2 int);
+
+query error DataFusion error: Error during planning: Column in ORDER BY must be in GROUP BY or an aggregate function
+select v1 from t1 as tt1 natural join t1 as tt2 group by v1 order by v2;
+
+statement ok
+drop table t1;
diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt
index a1efc1317b4aa..59f3d8285af49 100644
--- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt
+++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt
@@ -55,11 +55,10 @@ logical_plan
 07)--------TableScan: annotated_data projection=[a, c]
 physical_plan
 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST], fetch=5
-02)--CoalesceBatchesExec: target_batch_size=8192, fetch=5
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@0, c@1)], projection=[a@1]
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], file_type=csv, has_header=true
-05)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@0, c@1)], projection=[a@1], fetch=5
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], file_type=csv, has_header=true
+04)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 # preserve_inner_join
 query IIII nosort
@@ -89,20 +88,22 @@ logical_plan
 02)--Projection: t2.a AS a2, t2.b
 03)----RightSemi Join: t1.d = t2.d, t1.c = t2.c
 04)------SubqueryAlias: t1
-05)--------TableScan: annotated_data projection=[c, d]
-06)------SubqueryAlias: t2
-07)--------Filter: annotated_data.d = Int32(3)
-08)----------TableScan: annotated_data projection=[a, b, c, d], partial_filters=[annotated_data.d = Int32(3)]
+05)--------Filter: annotated_data.d = Int32(3)
+06)----------TableScan: annotated_data projection=[c, d], partial_filters=[annotated_data.d = Int32(3)]
+07)------SubqueryAlias: t2
+08)--------Filter: annotated_data.d = Int32(3)
+09)----------TableScan: annotated_data projection=[a, b, c, d], partial_filters=[annotated_data.d = Int32(3)]
 physical_plan
 01)SortPreservingMergeExec: [a2@0 ASC NULLS LAST, b@1 ASC NULLS LAST], fetch=10
 02)--ProjectionExec: expr=[a@0 as a2, b@1 as b]
-03)----CoalesceBatchesExec: target_batch_size=8192, fetch=10
-04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(d@1, d@3), (c@0, c@2)], projection=[a@0, b@1]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], file_type=csv, has_header=true
-06)--------CoalesceBatchesExec: target_batch_size=8192
-07)----------FilterExec: d@3 = 3
-08)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
+03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(d@1, d@3), (c@0, c@2)], projection=[a@0, b@1], fetch=10
+04)------CoalescePartitionsExec
+05)--------FilterExec: d@1 = 3
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], file_type=csv, has_header=true
+08)------FilterExec: d@3 = 3
+09)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+10)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
 
 # preserve_right_semi_join
 query II nosort
diff --git a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
new file mode 100644
index 0000000000000..8246f489c446d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
@@ -0,0 +1,307 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test IS NOT DISTINCT FROM join functionality
+# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins
+# to equijoins with proper null equality handling
+
+statement ok
+CREATE TABLE t0 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t1 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t2 (
+    id INT,
+    val INT
+)
+
+statement ok
+INSERT INTO t0 VALUES
+(1, 10),
+(2, NULL),
+(5, 50)
+
+statement ok
+INSERT INTO t1 VALUES
+(1, 10),
+(2, NULL),
+(3, 30),
+(6, NULL)
+
+statement ok
+INSERT INTO t2 VALUES
+(1, 10),
+(2, NULL),
+(4, 40),
+(6, 6)
+
+# Test basic IS NOT DISTINCT FROM join functionality
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+1 1 10 10
+2 2 NULL NULL
+6 2 NULL NULL
+
+# Test that IS NOT DISTINCT FROM join produces HashJoin when used alone
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.val = t2.val
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)], NullsEqual: true
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+set datafusion.explain.format = "tree";
+
+# Tree explain should highlight null equality semantics
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│       ProjectionExec      │
+03)│    --------------------   │
+04)│         t1_id: id         │
+05)│         t2_id: id         │
+06)│          val: val         │
+07)└─────────────┬─────────────┘
+08)┌─────────────┴─────────────┐
+09)│        HashJoinExec       │
+10)│    --------------------   │
+11)│      NullsEqual: true     ├──────────────┐
+12)│                           │              │
+13)│      on: (val = val)      │              │
+14)└─────────────┬─────────────┘              │
+15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+16)│       DataSourceExec      ││       DataSourceExec      │
+17)│    --------------------   ││    --------------------   │
+18)│         bytes: 288        ││         bytes: 288        │
+19)│       format: memory      ││       format: memory      │
+20)│          rows: 1          ││          rows: 1          │
+21)└───────────────────────────┘└───────────────────────────┘
+
+statement ok
+set datafusion.explain.format = "indent";
+
+# For nested expression comparision, it should still able to be converted to Hash Join
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS NOT DISTINCT FROM 11);
+----
+1 1 10 10
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS NOT DISTINCT FROM 11);
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + Int64(1)
+03)----Filter: CAST(t1.val AS Int64) + Int64(1) IS NOT DISTINCT FROM Int64(11)
+04)------TableScan: t1 projection=[id, val]
+05)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4], NullsEqual: true
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t1.val + Int64(1)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------FilterExec: CAST(val@1 AS Int64) + 1 IS NOT DISTINCT FROM 11
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+08)----ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t2.val + Int64(1)]
+09)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Mixed join predicate with `IS DISTINCT FROM` and `IS NOT DISTINCT FROM`
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS DISTINCT FROM (t2.val % 3));
+----
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS DISTINCT FROM (t2.val % 3));
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + Int64(1) Filter: CAST(t1.val AS Int64) % Int64(3) IS DISTINCT FROM CAST(t2.val AS Int64) % Int64(3)
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + Int64(1)@2, t2.val + Int64(1)@2)], filter=CAST(val@0 AS Int64) % 3 IS DISTINCT FROM CAST(val@1 AS Int64) % 3, projection=[id@0, val@1, id@3, val@4], NullsEqual: true
+03)----ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t1.val + Int64(1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)----ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) + 1 as t2.val + Int64(1)]
+06)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Test mixed equal and IS NOT DISTINCT FROM conditions
+# The `IS NOT DISTINCT FROM` expr should NOT in HashJoin's `on` predicate
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.id = t2.id AND t1.val IS NOT DISTINCT FROM t2.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.id = t2.id Filter: t1.val IS NOT DISTINCT FROM t2.val
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], filter=val@0 IS NOT DISTINCT FROM val@1
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Test the mixed condition join result
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.id = t2.id AND t1.val IS NOT DISTINCT FROM t2.val
+----
+1 1 10 10
+2 2 NULL NULL
+
+# Test 3 table join
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+JOIN t0 ON t1.val IS NOT DISTINCT FROM t0.val
+----
+1 1 10 10
+2 2 NULL NULL
+6 2 NULL NULL
+
+# Ensure there is HashJoin in the plan
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+JOIN t0 ON t1.val IS NOT DISTINCT FROM t0.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.val = t0.val
+03)----Inner Join: t1.val = t2.val
+04)------TableScan: t1 projection=[id, val]
+05)------TableScan: t2 projection=[id, val]
+06)----TableScan: t0 projection=[val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as val]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@0, val@1)], projection=[id@1, val@2, id@3, val@4], NullsEqual: true
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)], NullsEqual: true
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
+07)--------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Test IS NOT DISTINCT FROM with multiple columns
+statement ok
+CREATE TABLE t3 (
+    id INT,
+    val1 INT,
+    val2 INT
+)
+
+statement ok
+CREATE TABLE t4 (
+    id INT,
+    val1 INT,
+    val2 INT
+)
+
+statement ok
+INSERT INTO t3 VALUES
+(1, 10, 100),
+(2, NULL, 200),
+(3, 30, NULL)
+
+statement ok
+INSERT INTO t4 VALUES
+(1, 10, 100),
+(2, NULL, 200),
+(3, 30, NULL)
+
+# Test multiple IS NOT DISTINCT FROM conditions - should produce HashJoin
+query TT rowsort
+EXPLAIN SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2
+FROM t3
+JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT FROM t4.val2)
+----
+01)Projection: t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2
+01)ProjectionExec: expr=[id@0 as t3_id, id@3 as t4_id, val1@1 as val1, val1@4 as val1, val2@2 as val2, val2@5 as val2]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val1@1, val1@1), (val2@2, val2@2)], NullsEqual: true
+02)--Inner Join: t3.val1 = t4.val1, t3.val2 = t4.val2
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+03)----TableScan: t3 projection=[id, val1, val2]
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----TableScan: t4 projection=[id, val1, val2]
+logical_plan
+physical_plan
+
+# Test the multiple IS NOT DISTINCT FROM join result
+query IIIIII
+SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2
+FROM t3
+JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT FROM t4.val2)
+----
+1 1 10 10 100 100
+2 2 NULL NULL 200 200
+3 3 30 30 NULL NULL
+
+statement ok
+drop table t0;
+
+statement ok
+drop table t1;
+
+statement ok
+drop table t2;
+
+statement ok
+drop table t3;
+
+statement ok
+drop table t4;
diff --git a/datafusion/sqllogictest/test_files/join_limit_pushdown.slt b/datafusion/sqllogictest/test_files/join_limit_pushdown.slt
new file mode 100644
index 0000000000000..6bb23c1b4c243
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/join_limit_pushdown.slt
@@ -0,0 +1,269 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for limit pushdown into joins
+
+# need to use a single partition for deterministic results
+statement ok
+set datafusion.execution.target_partitions = 1;
+
+statement ok
+set datafusion.explain.logical_plan_only = false;
+
+statement ok
+set datafusion.optimizer.prefer_hash_join = true;
+
+# Create test tables
+statement ok
+CREATE TABLE t1 (a INT, b VARCHAR) AS VALUES
+  (1, 'one'),
+  (2, 'two'),
+  (3, 'three'),
+  (4, 'four'),
+  (5, 'five');
+
+statement ok
+CREATE TABLE t2 (x INT, y VARCHAR) AS VALUES
+  (1, 'alpha'),
+  (2, 'beta'),
+  (3, 'gamma'),
+  (6, 'delta'),
+  (7, 'epsilon');
+
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 2;
+----
+logical_plan
+01)Limit: skip=0, fetch=2
+02)--Inner Join: t1.a = t2.x
+03)----TableScan: t1 projection=[a]
+04)----TableScan: t2 projection=[x]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, x@0)], fetch=2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 2;
+----
+1 1
+2 2
+
+# Right join is converted to Left join with projection - fetch pushdown is supported
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 RIGHT JOIN t2 ON t1.a = t2.x LIMIT 3;
+----
+logical_plan
+01)Limit: skip=0, fetch=3
+02)--Right Join: t1.a = t2.x
+03)----TableScan: t1 projection=[a]
+04)----Limit: skip=0, fetch=3
+05)------TableScan: t2 projection=[x], fetch=3
+physical_plan
+01)ProjectionExec: expr=[a@1 as a, x@0 as x]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(x@0, a@0)], fetch=3
+03)----DataSourceExec: partitions=1, partition_sizes=[1], fetch=3
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.a, t2.x FROM t1 RIGHT JOIN t2 ON t1.a = t2.x LIMIT 3;
+----
+1 1
+2 2
+3 3
+
+# Left join supports fetch pushdown
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 LEFT JOIN t2 ON t1.a = t2.x LIMIT 3;
+----
+logical_plan
+01)Limit: skip=0, fetch=3
+02)--Left Join: t1.a = t2.x
+03)----Limit: skip=0, fetch=3
+04)------TableScan: t1 projection=[a], fetch=3
+05)----TableScan: t2 projection=[x]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, x@0)], fetch=3
+02)--DataSourceExec: partitions=1, partition_sizes=[1], fetch=3
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.a, t2.x FROM t1 LEFT JOIN t2 ON t1.a = t2.x LIMIT 3;
+----
+1 1
+2 2
+3 3
+
+
+# Full join supports fetch pushdown
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 FULL OUTER JOIN t2 ON t1.a = t2.x LIMIT 4;
+----
+logical_plan
+01)Limit: skip=0, fetch=4
+02)--Full Join: t1.a = t2.x
+03)----TableScan: t1 projection=[a]
+04)----TableScan: t2 projection=[x]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Full, on=[(a@0, x@0)], fetch=4
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Note: FULL OUTER JOIN order is not deterministic, so we just check count
+query I
+SELECT COUNT(*) FROM (SELECT t1.a, t2.x FROM t1 FULL OUTER JOIN t2 ON t1.a = t2.x LIMIT 4);
+----
+4
+
+# EXISTS becomes left semi join - fetch pushdown is supported
+query TT
+EXPLAIN SELECT t2.x FROM t2 WHERE EXISTS (SELECT 1 FROM t1 WHERE t1.a = t2.x) LIMIT 2;
+----
+logical_plan
+01)Limit: skip=0, fetch=2
+02)--LeftSemi Join: t2.x = __correlated_sq_1.a
+03)----TableScan: t2 projection=[x]
+04)----SubqueryAlias: __correlated_sq_1
+05)------TableScan: t1 projection=[a]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(x@0, a@0)], fetch=2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+SELECT t2.x FROM t2 WHERE EXISTS (SELECT 1 FROM t1 WHERE t1.a = t2.x) LIMIT 2;
+----
+1
+2
+
+# NOT EXISTS becomes LeftAnti - fetch pushdown is supported
+query TT
+EXPLAIN SELECT t2.x FROM t2 WHERE NOT EXISTS (SELECT 1 FROM t1 WHERE t1.a = t2.x) LIMIT 1;
+----
+logical_plan
+01)Limit: skip=0, fetch=1
+02)--LeftAnti Join: t2.x = __correlated_sq_1.a
+03)----TableScan: t2 projection=[x]
+04)----SubqueryAlias: __correlated_sq_1
+05)------TableScan: t1 projection=[a]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(x@0, a@0)], fetch=1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+SELECT t2.x FROM t2 WHERE NOT EXISTS (SELECT 1 FROM t1 WHERE t1.a = t2.x) LIMIT 1;
+----
+6
+
+# Inner join should push
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 1 OFFSET 1;
+----
+logical_plan
+01)Limit: skip=1, fetch=1
+02)--Inner Join: t1.a = t2.x
+03)----TableScan: t1 projection=[a]
+04)----TableScan: t2 projection=[x]
+physical_plan
+01)GlobalLimitExec: skip=1, fetch=1
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, x@0)], fetch=2
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 1 OFFSET 1;
+----
+2 2
+
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 0;
+----
+logical_plan EmptyRelation: rows=0
+physical_plan EmptyExec
+
+query II
+SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 0;
+----
+
+statement ok
+CREATE TABLE t3 (p INT, q VARCHAR) AS VALUES
+  (1, 'foo'),
+  (2, 'bar'),
+  (3, 'baz');
+
+query TT
+EXPLAIN SELECT t1.a, t2.x, t3.p 
+FROM t1 
+INNER JOIN t2 ON t1.a = t2.x 
+INNER JOIN t3 ON t2.x = t3.p 
+LIMIT 2;
+----
+logical_plan
+01)Limit: skip=0, fetch=2
+02)--Inner Join: t2.x = t3.p
+03)----Inner Join: t1.a = t2.x
+04)------TableScan: t1 projection=[a]
+05)------TableScan: t2 projection=[x]
+06)----TableScan: t3 projection=[p]
+physical_plan
+01)ProjectionExec: expr=[a@1 as a, x@2 as x, p@0 as p]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(p@0, x@1)], fetch=2
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, x@0)]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+06)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query III
+SELECT t1.a, t2.x, t3.p 
+FROM t1 
+INNER JOIN t2 ON t1.a = t2.x 
+INNER JOIN t3 ON t2.x = t3.p 
+LIMIT 2;
+----
+1 1 1
+2 2 2
+
+# Try larger limit
+query TT
+EXPLAIN SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 100;
+----
+logical_plan
+01)Limit: skip=0, fetch=100
+02)--Inner Join: t1.a = t2.x
+03)----TableScan: t1 projection=[a]
+04)----TableScan: t2 projection=[x]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, x@0)], fetch=100
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.a, t2.x FROM t1 INNER JOIN t2 ON t1.a = t2.x LIMIT 100;
+----
+1 1
+2 2
+3 3
+
+statement ok
+DROP TABLE t1;
+
+statement ok
+DROP TABLE t2;
+
+statement ok
+DROP TABLE t3;
diff --git a/datafusion/sqllogictest/test_files/join_lists.slt b/datafusion/sqllogictest/test_files/join_lists.slt
new file mode 100644
index 0000000000000..0a48a4f9203ec
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/join_lists.slt
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+
+## Ensure test coverage for NLJ using joining on LISTS
+
+## Reproducer for https://github.com/apache/datafusion/issues/18070
+
+statement ok
+CREATE TABLE categories_raw
+AS SELECT arrow_cast('cat_' || value, 'Utf8View')  AS category_id FROM generate_series(1, 5);
+
+statement ok
+CREATE TABLE places
+AS SELECT column1 as id, column2 as fsq_category_ids, column3 as date_refreshed
+FROM VALUES
+    (1, ['cat_1', 'cat_2', 'cat_3'], DATE '2023-05-10'),
+    (2, ['cat_4', 'cat_5'], DATE '2021-12-01'),
+    (3, ['cat_6', 'cat_7', 'cat_8', 'cat_9'], DATE '2024-01-15'); --> NOTE these categories do not exist in categories_raw
+
+
+query I
+WITH categories_arr AS (
+    SELECT array_agg(category_id) AS category_ids FROM categories_raw LIMIT 500
+)
+SELECT COUNT(*)
+    FROM places p
+    WHERE array_has_any(p.fsq_category_ids, (SELECT category_ids FROM categories_arr));
+----
+2
+
+query I
+WITH categories_arr AS (
+    SELECT array_agg(category_id) AS category_ids FROM categories_raw LIMIT 500
+)
+SELECT COUNT(*)
+    FROM places p
+    WHERE id <> 1 AND array_has_any(p.fsq_category_ids, (SELECT category_ids FROM categories_arr));
+----
+1
+
+# cleanup
+statement ok
+DROP TABLE categories_raw;
+
+statement ok
+DROP TABLE places;
diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt
index ccecb9494331b..136a68573562a 100644
--- a/datafusion/sqllogictest/test_files/joins.slt
+++ b/datafusion/sqllogictest/test_files/joins.slt
@@ -24,7 +24,7 @@ statement ok
 set datafusion.execution.target_partitions = 2;
 
 statement ok
-set datafusion.execution.batch_size = 2;
+set datafusion.execution.batch_size = 8192;
 
 statement ok
 set datafusion.explain.logical_plan_only = true;
@@ -57,15 +57,15 @@ statement ok
 CREATE TABLE join_t3(s3 struct<id INT>)
   AS VALUES
   (NULL),
-  (struct(1)),
-  (struct(2));
+    ({id: 1}),
+    ({id: 2});
 
 statement ok
 CREATE TABLE join_t4(s4 struct<id INT>)
   AS VALUES
   (NULL),
-  (struct(2)),
-  (struct(3));
+    ({id: 2}),
+    ({id: 3});
 
 # Left semi anti join
 
@@ -146,10 +146,10 @@ AS VALUES
 statement ok
 CREATE TABLE test_timestamps_table as
 SELECT
-  arrow_cast(ts::timestamp::bigint, 'Timestamp(Nanosecond, None)') as nanos,
-  arrow_cast(ts::timestamp::bigint / 1000, 'Timestamp(Microsecond, None)') as micros,
-  arrow_cast(ts::timestamp::bigint / 1000000, 'Timestamp(Millisecond, None)') as millis,
-  arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(Second, None)') as secs,
+  arrow_cast(ts::timestamp::bigint, 'Timestamp(ns)') as nanos,
+  arrow_cast(ts::timestamp::bigint / 1000, 'Timestamp(µs)') as micros,
+  arrow_cast(ts::timestamp::bigint / 1000000, 'Timestamp(ms)') as millis,
+  arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(s)') as secs,
   names
 FROM
   test_timestamps_table_source;
@@ -549,64 +549,64 @@ statement ok
 set datafusion.optimizer.repartition_joins = true
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1, t2 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1, t2 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE 1=1 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE 1=1 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN t2 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN t2 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITITI rowsort
 SELECT * FROM (SELECT t1_id, t1_name FROM t1 UNION ALL SELECT t1_id, t1_name FROM t1) AS t1 CROSS JOIN t2
@@ -685,64 +685,64 @@ statement ok
 set datafusion.optimizer.repartition_joins = false
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1, t2 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1, t2 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE 1=1 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE 1=1 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITT nosort
-SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN t2 ORDER BY t1_id
+SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN t2 ORDER BY t1_id, t1_name, t2_name
 ----
-11 a z
-11 a y
-11 a x
 11 a w
-22 b z
-22 b y
-22 b x
+11 a x
+11 a y
+11 a z
 22 b w
-33 c z
-33 c y
-33 c x
+22 b x
+22 b y
+22 b z
 33 c w
-44 d z
-44 d y
-44 d x
+33 c x
+33 c y
+33 c z
 44 d w
+44 d x
+44 d y
+44 d z
 
 query ITITI rowsort
 SELECT * FROM (SELECT t1_id, t1_name FROM t1 UNION ALL SELECT t1_id, t1_name FROM t1) AS t1 CROSS JOIN t2
@@ -1339,14 +1339,12 @@ logical_plan
 05)------TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[t1_id@0 as t1_id], aggr=[]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2
-04)------AggregateExec: mode=Partial, gby=[t1_id@0 as t1_id], aggr=[]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-09)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2
+03)----AggregateExec: mode=Partial, gby=[t1_id@0 as t1_id], aggr=[]
+04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+07)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Join on struct
 query TT
@@ -1359,11 +1357,10 @@ logical_plan
 02)--TableScan: join_t3 projection=[s3]
 03)--TableScan: join_t4 projection=[s4]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(s3@0, s4@0)]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)------DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(s3@0, s4@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ??
 select join_t3.s3, join_t4.s4
@@ -1373,7 +1370,7 @@ inner join join_t4 on join_t3.s3 = join_t4.s4
 {id: 2} {id: 2}
 
 # join with struct key and nulls
-# Note that intersect or except applies `null_equals_null` as true for Join.
+# Note that intersect or except applies `null_equality` as `NullEquality::NullEqualsNull` for Join.
 query ?
 SELECT * FROM join_t3
 EXCEPT
@@ -1397,14 +1394,12 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@1 as count(*)]
 02)--AggregateExec: mode=FinalPartitioned, gby=[t1_id@0 as t1_id], aggr=[count(Int64(1))]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2
-05)--------AggregateExec: mode=Partial, gby=[t1_id@0 as t1_id], aggr=[count(Int64(1))]
-06)----------CoalesceBatchesExec: target_batch_size=2
-07)------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2
+04)------AggregateExec: mode=Partial, gby=[t1_id@0 as t1_id], aggr=[count(Int64(1))]
+05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+08)------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 EXPLAIN
@@ -1426,14 +1421,12 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(alias1)]
 05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[]
-06)----------CoalesceBatchesExec: target_batch_size=2
-07)------------RepartitionExec: partitioning=Hash([alias1@0], 2), input_partitions=2
-08)--------------AggregateExec: mode=Partial, gby=[t1_id@0 as alias1], aggr=[]
-09)----------------CoalesceBatchesExec: target_batch_size=2
-10)------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)--------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-13)----------------------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------RepartitionExec: partitioning=Hash([alias1@0], 2), input_partitions=2
+07)------------AggregateExec: mode=Partial, gby=[t1_id@0 as alias1], aggr=[]
+08)--------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0]
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+10)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+11)------------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.explain.logical_plan_only = true;
@@ -1492,15 +1485,14 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t1.t1_id + Int64(11)@3, CAST(join_t2.t2_id AS Int64)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@4, t2_name@5, t2_int@6]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)]
-09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t1.t1_id + Int64(11)@3, CAST(join_t2.t2_id AS Int64)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@4, t2_name@5, t2_int@6]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)]
+08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+09)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.optimizer.repartition_joins = true;
@@ -1518,15 +1510,14 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t1.t1_id + Int64(11)@3, CAST(join_t2.t2_id AS Int64)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@4, t2_name@5, t2_int@6]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)]
-09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t1.t1_id + Int64(11)@3, CAST(join_t2.t2_id AS Int64)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@4, t2_name@5, t2_int@6]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)]
+08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+09)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Both side expr key inner join
 
@@ -1546,15 +1537,14 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id + UInt32(1)@1, join_t1.t1_id + UInt32(12)@2)], projection=[t2_id@0, t1_id@2, t1_name@3]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)]
-09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id + UInt32(1)@1, join_t1.t1_id + UInt32(12)@2)], projection=[t2_id@0, t1_id@2, t1_name@3]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)]
+08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+09)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.optimizer.repartition_joins = true;
@@ -1572,15 +1562,14 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id + UInt32(1)@1, join_t1.t1_id + UInt32(12)@2)], projection=[t2_id@0, t1_id@2, t1_name@3]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)]
-09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id + UInt32(1)@1, join_t1.t1_id + UInt32(12)@2)], projection=[t2_id@0, t1_id@2, t1_name@3]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)]
+08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+09)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Left side expr key inner join
 
@@ -1601,12 +1590,11 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], projection=[t2_id@0, t1_id@1, t1_name@2]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], projection=[t2_id@0, t1_id@1, t1_name@2]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.optimizer.repartition_joins = true;
@@ -1625,12 +1613,11 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], projection=[t2_id@0, t1_id@1, t1_name@2]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], projection=[t2_id@0, t1_id@1, t1_name@2]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Right side expr key inner join
 
@@ -1651,14 +1638,13 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id - UInt32(11)@1, t1_id@0)], projection=[t2_id@0, t1_id@2, t1_name@3]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-09)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id - UInt32(11)@1, t1_id@0)], projection=[t2_id@0, t1_id@2, t1_name@3]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+08)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.optimizer.repartition_joins = true;
@@ -1677,14 +1663,13 @@ logical_plan
 04)----TableScan: join_t2 projection=[t2_id]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id - UInt32(11)@1, t1_id@0)], projection=[t2_id@0, t1_id@2, t1_name@3]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-09)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id - UInt32(11)@1, t1_id@0)], projection=[t2_id@0, t1_id@2, t1_name@3]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+08)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Select wildcard with expr key inner join
 
@@ -1703,12 +1688,11 @@ logical_plan
 02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int]
 03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
-05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)--------DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
+04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 set datafusion.optimizer.repartition_joins = true;
@@ -1725,12 +1709,11 @@ logical_plan
 02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int]
 03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
-05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)--------DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)]
+04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 #####
 # Config teardown
@@ -2052,20 +2035,19 @@ physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id]
 02)--NestedLoopJoinExec: join_type=Inner, filter=t1_id@0 > t2_id@1
 03)----CoalescePartitionsExec
-04)------CoalesceBatchesExec: target_batch_size=2
-05)--------FilterExec: t2_int@1 > 1, projection=[t2_id@0]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)----CoalesceBatchesExec: target_batch_size=2
-09)------FilterExec: t1_id@0 > 10
-10)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)----------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------FilterExec: t2_int@1 > 1, projection=[t2_id@0]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----FilterExec: t1_id@0 > 10
+08)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query II
 SELECT join_t1.t1_id, join_t2.t2_id
 FROM join_t1
 INNER JOIN join_t2 ON join_t1.t1_id > join_t2.t2_id
 WHERE join_t1.t1_id > 10 AND join_t2.t2_int > 1
+ORDER BY 1
 ----
 22 11
 33 11
@@ -2091,20 +2073,19 @@ logical_plan
 physical_plan
 01)NestedLoopJoinExec: join_type=Right, filter=t1_id@0 < t2_id@1
 02)--CoalescePartitionsExec
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------FilterExec: t1_id@0 > 22
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: partitions=1, partition_sizes=[1]
-07)--CoalesceBatchesExec: target_batch_size=2
-08)----FilterExec: t2_id@0 > 11
-09)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)--------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----FilterExec: t1_id@0 > 22
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)--FilterExec: t2_id@0 > 11
+07)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query II
 SELECT join_t1.t1_id, join_t2.t2_id
 FROM (select t1_id from join_t1 where join_t1.t1_id > 22) as join_t1
 RIGHT JOIN (select t2_id from join_t2 where join_t2.t2_id > 11) as join_t2
         ON join_t1.t1_id < join_t2.t2_id
+ORDER BY 1, 2
 ----
 33 44
 33 55
@@ -2151,6 +2132,7 @@ WHERE EXISTS (
     FROM join_t2
     WHERE join_t1.t1_id + 1 > join_t2.t2_id * 2
 )
+ORDER BY 1
 ----
 22 b 2
 33 c 3
@@ -2167,6 +2149,7 @@ WHERE EXISTS (
     FROM join_t2
     WHERE join_t1.t1_id + 1 > join_t2.t2_id * 2
 )
+ORDER BY 1
 ----
 22 b 2
 33 c 3
@@ -2567,11 +2550,10 @@ logical_plan
 04)--SubqueryAlias: t2
 05)----TableScan: test_timestamps_tz_table projection=[nanos, micros, millis, secs, names]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(millis@2, millis@2)]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)------DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(millis@2, millis@2)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+04)----DataSourceExec: partitions=1, partition_sizes=[1]
 
 # left_join_using_2
 query II
@@ -2739,17 +2721,13 @@ logical_plan
 04)--SubqueryAlias: t2
 05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]
 physical_plan
-01)SortMergeJoin: join_type=Inner, on=[(c1@0, c1@0)]
+01)SortMergeJoinExec: join_type=Inner, on=[(c1@0, c1@0)]
 02)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------DataSourceExec: partitions=1, partition_sizes=[1]
-07)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true]
-08)----CoalesceBatchesExec: target_batch_size=2
-09)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-10)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-11)----------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true]
+06)----RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+07)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # sort_merge_join_on_date32 inner sort merge join on data type (Date32)
 query DDRTDDRT rowsort
@@ -2770,18 +2748,15 @@ logical_plan
 05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]
 physical_plan
 01)ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4]
-02)--SortMergeJoin: join_type=Right, on=[(CAST(t1.c3 AS Decimal128(10, 2))@4, c3@2)]
+02)--SortMergeJoinExec: join_type=Right, on=[(CAST(t1.c3 AS Decimal128(10, 2))@4, c3@2)]
 03)----SortExec: expr=[CAST(t1.c3 AS Decimal128(10, 2))@4 ASC], preserve_partitioning=[true]
-04)------CoalesceBatchesExec: target_batch_size=2
-05)--------RepartitionExec: partitioning=Hash([CAST(t1.c3 AS Decimal128(10, 2))@4], 2), input_partitions=2
-06)----------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)----SortExec: expr=[c3@2 ASC], preserve_partitioning=[true]
-10)------CoalesceBatchesExec: target_batch_size=2
-11)--------RepartitionExec: partitioning=Hash([c3@2], 2), input_partitions=2
-12)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-13)------------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------RepartitionExec: partitioning=Hash([CAST(t1.c3 AS Decimal128(10, 2))@4], 2), input_partitions=2
+05)--------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+08)----SortExec: expr=[c3@2 ASC], preserve_partitioning=[true]
+09)------RepartitionExec: partitioning=Hash([c3@2], 2), input_partitions=1
+10)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # sort_merge_join_on_decimal right join on data type (Decimal)
 query DDRTDDRT rowsort
@@ -2833,12 +2808,11 @@ explain SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 WHERE t1_id I
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IT rowsort
 SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 WHERE t1_id IN (SELECT t2_id FROM left_semi_anti_join_table_t2 t2) ORDER BY t1_id
@@ -2869,12 +2843,11 @@ explain SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOI
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IT
 SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOIN left_semi_anti_join_table_t2 t2 ON (t1_id = t2_id) ORDER BY t1_id
@@ -2926,12 +2899,11 @@ explain SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 WHERE t1_id I
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IT rowsort
 SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 WHERE t1_id IN (SELECT t2_id FROM left_semi_anti_join_table_t2 t2) ORDER BY t1_id
@@ -2962,12 +2934,11 @@ explain SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOI
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IT
 SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOIN left_semi_anti_join_table_t2 t2 ON (t1_id = t2_id) ORDER BY t1_id
@@ -3020,12 +2991,11 @@ explain SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHER
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITI rowsort
 SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHERE EXISTS (SELECT * FROM right_semi_anti_join_table_t2 t2 where t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id
@@ -3037,12 +3007,11 @@ explain SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGH
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITI rowsort
 SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGHT SEMI JOIN right_semi_anti_join_table_t1 t1 on (t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id
@@ -3092,12 +3061,11 @@ explain SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHER
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITI rowsort
 SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHERE EXISTS (SELECT * FROM right_semi_anti_join_table_t2 t2 where t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id
@@ -3109,12 +3077,11 @@ explain SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGH
 ----
 physical_plan
 01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query ITI rowsort
 SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGHT SEMI JOIN right_semi_anti_join_table_t1 t1 on (t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id
@@ -3190,17 +3157,13 @@ logical_plan
 08)------TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
 01)SortPreservingMergeExec: [rn1@5 ASC NULLS LAST]
-02)--SortMergeJoin: join_type=Inner, on=[(a@1, a@1)]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-09)----CoalesceBatchesExec: target_batch_size=2
-10)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST
-11)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-12)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--SortMergeJoinExec: join_type=Inner, on=[(a@1, a@1)]
+03)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+04)------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+05)--------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+07)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+08)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # sort merge join should propagate ordering equivalence of the right side
 # for right join. Hence final requirement rn1 ASC is already satisfied at
@@ -3224,22 +3187,18 @@ logical_plan
 08)----------TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
 01)SortPreservingMergeExec: [rn1@10 ASC NULLS LAST]
-02)--SortMergeJoin: join_type=Right, on=[(a@1, a@1)]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-07)----CoalesceBatchesExec: target_batch_size=2
-08)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST
-09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-12)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--SortMergeJoinExec: join_type=Right, on=[(a@1, a@1)]
+03)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+05)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+06)------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+07)--------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 statement ok
 set datafusion.optimizer.prefer_existing_sort = false;
 
-# SortMergeJoin should add ordering equivalences of
+# SortMergeJoinExec should add ordering equivalences of
 # right table as lexicographical append to the global ordering
 # below query shouldn't add any SortExec for order by clause.
 # since its requirement is already satisfied at the output of SortMergeJoinExec
@@ -3265,22 +3224,15 @@ logical_plan
 10)----------TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
 01)SortPreservingMergeExec: [a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@11 ASC NULLS LAST]
-02)--SortExec: expr=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@11 ASC NULLS LAST], preserve_partitioning=[true]
-03)----SortMergeJoin: join_type=Inner, on=[(a@1, a@1)]
-04)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-11)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true]
-12)--------CoalesceBatchesExec: target_batch_size=2
-13)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2
-14)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-15)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-17)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--SortMergeJoinExec: join_type=Inner, on=[(a@1, a@1)]
+03)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+04)------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+05)--------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+07)----RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=1, maintains_sort_order=true
+08)------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+09)--------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+10)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 statement ok
 set datafusion.optimizer.prefer_hash_join = true;
@@ -3310,12 +3262,11 @@ logical_plan
 07)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 08)----------TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@1, a@1)]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@1, a@1)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+03)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+04)----BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # hash join should propagate ordering equivalence of the right side for RIGHT ANTI join.
 # Hence final requirement rn1 ASC is already satisfied at the end of HashJoinExec.
@@ -3337,12 +3288,31 @@ logical_plan
 07)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 08)----------TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, a@1)]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], file_type=csv, has_header=true
-04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
-05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, a@1)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], file_type=csv, has_header=true
+03)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1]
+04)----BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+
+# Test ordering preservation for RIGHT join
+query TT
+EXPLAIN SELECT *
+FROM annotated_data as l_table
+RIGHT JOIN (SELECT * FROM annotated_data) as r_table
+ON l_table.b = r_table.b
+ORDER BY r_table.a ASC NULLS FIRST, r_table.b, r_table.c, l_table.a ASC NULLS FIRST;
+----
+logical_plan
+01)Sort: r_table.a ASC NULLS FIRST, r_table.b ASC NULLS LAST, r_table.c ASC NULLS LAST, l_table.a ASC NULLS FIRST
+02)--Right Join: l_table.b = r_table.b
+03)----SubqueryAlias: l_table
+04)------TableScan: annotated_data projection=[a0, a, b, c, d]
+05)----SubqueryAlias: r_table
+06)------TableScan: annotated_data projection=[a0, a, b, c, d]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Right, on=[(b@2, b@2)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT l.a, LAST_VALUE(r.b ORDER BY r.a ASC NULLS FIRST) as last_col1
@@ -3364,10 +3334,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[a@0 as a, last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]@3 as last_col1]
 02)--AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0])
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
-06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # create a table where there more than one valid ordering
 # that describes table.
@@ -3412,12 +3381,11 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]@1 as amount_usd]
 02)--AggregateExec: mode=Single, gby=[row_n@2 as row_n], aggr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]], ordering_mode=Sorted
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
-06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n]
-07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+05)------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n]
+06)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+07)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 # run query above in multiple partitions
 statement ok
@@ -3446,22 +3414,15 @@ logical_plan
 08)----------TableScan: annotated_data projection=[a, b]
 physical_plan
 01)SortPreservingMergeExec: [a@0 ASC]
-02)--SortExec: expr=[a@0 ASC], preserve_partitioning=[true]
-03)----ProjectionExec: expr=[a@0 as a, last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]@3 as last_col1]
-04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 2), input_partitions=2
-07)------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]]
-08)--------------CoalesceBatchesExec: target_batch_size=2
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0)]
-10)------------------CoalesceBatchesExec: target_batch_size=2
-11)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
-12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
-14)------------------CoalesceBatchesExec: target_batch_size=2
-15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
-16)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-17)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true
+02)--ProjectionExec: expr=[a@0 as a, last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]@3 as last_col1]
+03)----AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0])
+04)------RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC
+05)--------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0])
+06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0)]
+07)------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=1, maintains_sort_order=true
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
+09)------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=1, maintains_sort_order=true
+10)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT *
@@ -3477,7 +3438,7 @@ logical_plan
 physical_plan
 01)NestedLoopJoinExec: join_type=Inner, filter=a@1 < a@0
 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # Currently datafusion can pushdown filter conditions with scalar UDF into
@@ -3494,10 +3455,11 @@ logical_plan
 04)--SubqueryAlias: t2
 05)----TableScan: annotated_data projection=[a0, a, b, c, d]
 physical_plan
-01)NestedLoopJoinExec: join_type=Inner, filter=example(CAST(a@0 AS Float64), CAST(a@1 AS Float64)) > 3
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
-03)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+01)NestedLoopJoinExec: join_type=Inner, filter=example(join_proj_push_down_1@0, join_proj_push_down_2@1) > 3, projection=[a0@0, a@1, b@2, c@3, d@4, a0@6, a@7, b@8, c@9, d@10]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d, CAST(a@1 AS Float64) as join_proj_push_down_1], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+03)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_2]
+04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 ####
 # Config teardown
@@ -3554,7 +3516,6 @@ AS VALUES
 query IT
 SELECT t1_id, t1_name FROM join_test_left WHERE t1_id NOT IN (SELECT t2_id FROM join_test_right) ORDER BY t1_id;
 ----
-NULL e
 
 ####
 # join_partitioned_test
@@ -3604,23 +3565,22 @@ logical_plan
 02)--SubqueryAlias: a
 03)----Union
 04)------Projection: Int64(1) AS c, Int64(2) AS d
-05)--------EmptyRelation
+05)--------EmptyRelation: rows=1
 06)------Projection: Int64(1) AS c, Int64(3) AS d
-07)--------EmptyRelation
+07)--------EmptyRelation: rows=1
 08)--SubqueryAlias: rhs
 09)----Projection: Int64(1) AS e, Int64(3) AS f
-10)------EmptyRelation
+10)------EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[c@2 as c, d@3 as d, e@0 as e, f@1 as f]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Full, on=[(e@0, c@0)]
-04)------ProjectionExec: expr=[1 as e, 3 as f]
-05)--------PlaceholderRowExec
-06)------UnionExec
-07)--------ProjectionExec: expr=[1 as c, 2 as d]
-08)----------PlaceholderRowExec
-09)--------ProjectionExec: expr=[1 as c, 3 as d]
-10)----------PlaceholderRowExec
+02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(e@0, c@0)]
+03)----ProjectionExec: expr=[1 as e, 3 as f]
+04)------PlaceholderRowExec
+05)----UnionExec
+06)------ProjectionExec: expr=[1 as c, 2 as d]
+07)--------PlaceholderRowExec
+08)------ProjectionExec: expr=[1 as c, 3 as d]
+09)--------PlaceholderRowExec
 
 query IIII rowsort
 SELECT * FROM (
@@ -3647,23 +3607,22 @@ logical_plan
 02)--SubqueryAlias: a
 03)----Union
 04)------Projection: Int64(1) AS c, Int64(2) AS d
-05)--------EmptyRelation
+05)--------EmptyRelation: rows=1
 06)------Projection: Int64(1) AS c, Int64(3) AS d
-07)--------EmptyRelation
+07)--------EmptyRelation: rows=1
 08)--SubqueryAlias: rhs
 09)----Projection: Int64(1) AS e, Int64(3) AS f
-10)------EmptyRelation
+10)------EmptyRelation: rows=1
 physical_plan
 01)ProjectionExec: expr=[c@2 as c, d@3 as d, e@0 as e, f@1 as f]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Full, on=[(e@0, c@0)]
-04)------ProjectionExec: expr=[1 as e, 3 as f]
-05)--------PlaceholderRowExec
-06)------UnionExec
-07)--------ProjectionExec: expr=[1 as c, 2 as d]
-08)----------PlaceholderRowExec
-09)--------ProjectionExec: expr=[1 as c, 3 as d]
-10)----------PlaceholderRowExec
+02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(e@0, c@0)]
+03)----ProjectionExec: expr=[1 as e, 3 as f]
+04)------PlaceholderRowExec
+05)----UnionExec
+06)------ProjectionExec: expr=[1 as c, 2 as d]
+07)--------PlaceholderRowExec
+08)------ProjectionExec: expr=[1 as c, 3 as d]
+09)--------PlaceholderRowExec
 
 query IIII rowsort
 SELECT * FROM (
@@ -3687,7 +3646,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 as a WHERE 1=0
 ) AS a INNER JOIN (SELECT 1 as a) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Inner join with empty right table
 query TT
@@ -3695,7 +3654,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a
 ) AS a INNER JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left join with empty left table
 query TT
@@ -3703,7 +3662,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 as a WHERE 1=0
 ) AS a LEFT JOIN (SELECT 1 as a) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left join with empty left and empty right table
 query TT
@@ -3711,7 +3670,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 as a WHERE 1=0
 ) AS a LEFT JOIN (SELECT 1 as a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Right join with empty right table
 query TT
@@ -3719,7 +3678,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a
 ) AS a RIGHT JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Right join with empty right and empty left table
 query TT
@@ -3727,7 +3686,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 as a WHERE 1=0
 ) AS a RIGHT JOIN (SELECT 1 as a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left SEMI join with empty left table
 query TT
@@ -3735,7 +3694,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a
 ) AS a LEFT SEMI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left SEMI join with empty right table
 query TT
@@ -3743,7 +3702,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a WHERE 1=0
 ) AS a LEFT SEMI JOIN (SELECT 1 AS a) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Right SEMI join with empty left table
 query TT
@@ -3751,7 +3710,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a WHERE 1=0
 ) AS a RIGHT SEMI JOIN (SELECT 1 AS a) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Right SEMI join with empty right table
 query TT
@@ -3759,7 +3718,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a
 ) AS a RIGHT SEMI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left ANTI join with empty left table
 query TT
@@ -3767,7 +3726,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a WHERE 1=0
 ) AS a LEFT ANTI JOIN (SELECT 1 AS a) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Right ANTI join with empty right table
 query TT
@@ -3775,7 +3734,7 @@ EXPLAIN SELECT * FROM (
     SELECT 1 AS a
 ) AS a RIGHT ANTI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # FULL OUTER join with empty left and empty right table
 query TT
@@ -3783,7 +3742,7 @@ EXPLAIN SELECT * FROM (
 	SELECT 1 as a WHERE 1=0
 ) AS a FULL JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a;
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 # Left ANTI join with empty right table
 query TT
@@ -3794,7 +3753,7 @@ EXPLAIN SELECT * FROM (
 logical_plan
 01)SubqueryAlias: a
 02)--Projection: Int64(1) AS a
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 
 # Right ANTI join with empty left table
 query TT
@@ -3805,7 +3764,7 @@ EXPLAIN SELECT * FROM (
 logical_plan
 01)SubqueryAlias: b
 02)--Projection: Int64(1) AS a
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 
 
 statement ok
@@ -3868,11 +3827,10 @@ logical_plan
 06)------TableScan: right_table_no_nulls projection=[a, b]
 physical_plan
 01)ProjectionExec: expr=[a@2 as a, b@3 as b, a@0 as a, b@1 as b]
-02)--CoalesceBatchesExec: target_batch_size=3
-03)----HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
-04)------SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
+03)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false]
+04)------DataSourceExec: partitions=1, partition_sizes=[2]
+05)----DataSourceExec: partitions=1, partition_sizes=[2]
 
 
 
@@ -3926,10 +3884,9 @@ logical_plan
 05)----TableScan: right_table_no_nulls projection=[a, b]
 physical_plan
 01)ProjectionExec: expr=[a@2 as a, b@3 as b, a@0 as a, b@1 as b]
-02)--CoalesceBatchesExec: target_batch_size=3
-03)----HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
-05)------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
+03)----DataSourceExec: partitions=1, partition_sizes=[2]
+04)----DataSourceExec: partitions=1, partition_sizes=[2]
 
 
 # Null build indices:
@@ -3986,11 +3943,10 @@ logical_plan
 06)------TableScan: right_table_no_nulls projection=[a, b]
 physical_plan
 01)ProjectionExec: expr=[a@2 as a, b@3 as b, a@0 as a, b@1 as b]
-02)--CoalesceBatchesExec: target_batch_size=3
-03)----HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
-04)------SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(b@1, b@1)]
+03)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false]
+04)------DataSourceExec: partitions=1, partition_sizes=[2]
+05)----DataSourceExec: partitions=1, partition_sizes=[2]
 
 
 # Test CROSS JOIN LATERAL syntax (planning)
@@ -3998,7 +3954,7 @@ query TT
 explain select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i);
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--SubqueryAlias: t1
 03)----TableScan: join_t1 projection=[t1_id, t1_name]
 04)--SubqueryAlias: series
@@ -4008,13 +3964,13 @@ logical_plan
 08)----------Projection: __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)),depth=1) AS UNNEST(generate_series(Int64(1),outer_ref(t1.t1_int)))
 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))|depth=1] structs[]
 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t1.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))
-11)----------------EmptyRelation
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" })
+11)----------------EmptyRelation: rows=1
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" })
 
 
 # Test CROSS JOIN LATERAL syntax (execution)
 # TODO: https://github.com/apache/datafusion/issues/10048
-query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\)
+query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\)
 select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i);
 
 
@@ -4033,13 +3989,13 @@ logical_plan
 08)----------Projection: __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)),depth=1) AS UNNEST(generate_series(Int64(1),outer_ref(t2.t1_int)))
 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))|depth=1] structs[]
 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t2.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))
-11)----------------EmptyRelation
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" })
+11)----------------EmptyRelation: rows=1
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" })
 
 
 # Test INNER JOIN LATERAL syntax (execution)
 # TODO: https://github.com/apache/datafusion/issues/10048
-query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\)
+query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\)
 select t1_id, t1_name, i from join_t1 t2 inner join lateral (select * from unnest(generate_series(1, t1_int))) as series(i) on(t1_id > i);
 
 # Test RIGHT JOIN LATERAL syntax (unsupported)
@@ -4093,12 +4049,10 @@ physical_plan
 01)SortExec: expr=[sn@1 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[ts@1 as ts, sn@0 as sn, amount@2 as amount, currency@3 as currency, CAST(amount@2 AS Float32) * last_value(e.rate)@4 as amount_usd]
 03)----AggregateExec: mode=Single, gby=[sn@1 as sn, ts@0 as ts, amount@2 as amount, currency@3 as currency], aggr=[last_value(e.rate)]
-04)------CoalesceBatchesExec: target_batch_size=3
-05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@3, currency_from@1)], filter=ts@0 >= ts@1, projection=[ts@0, sn@1, amount@2, currency@3, rate@6]
-06)----------DataSourceExec: partitions=1, partition_sizes=[0]
-07)----------CoalesceBatchesExec: target_batch_size=3
-08)------------FilterExec: currency_to@2 = USD, projection=[ts@0, currency_from@1, rate@3]
-09)--------------DataSourceExec: partitions=1, partition_sizes=[0]
+04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@3, currency_from@1)], filter=ts@0 >= ts@1, projection=[ts@0, sn@1, amount@2, currency@3, rate@6]
+05)--------DataSourceExec: partitions=1, partition_sizes=[0]
+06)--------FilterExec: currency_to@2 = USD, projection=[ts@0, currency_from@1, rate@3]
+07)----------DataSourceExec: partitions=1, partition_sizes=[0]
 
 statement ok
 DROP TABLE sales_global;
@@ -4123,9 +4077,9 @@ logical_plan
 03)----TableScan: left_table projection=[a, b, c]
 04)----TableScan: right_table projection=[x, y, z]
 physical_plan
-01)NestedLoopJoinExec: join_type=Inner, filter=a@0 < x@1
-02)--DataSourceExec: partitions=1, partition_sizes=[0]
-03)--SortExec: expr=[x@0 ASC NULLS LAST], preserve_partitioning=[false]
+01)SortExec: expr=[x@3 ASC NULLS LAST], preserve_partitioning=[false]
+02)--NestedLoopJoinExec: join_type=Inner, filter=a@0 < x@1
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
 04)----DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
@@ -4137,11 +4091,10 @@ logical_plan
 03)----TableScan: left_table projection=[a, b, c]
 04)----TableScan: right_table projection=[x, y, z]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(b@1, y@1)], filter=a@0 < x@1
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----SortExec: expr=[x@0 ASC NULLS LAST], preserve_partitioning=[false]
-05)------DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(b@1, y@1)], filter=a@0 < x@1
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--SortExec: expr=[x@0 ASC NULLS LAST], preserve_partitioning=[false]
+04)----DataSourceExec: partitions=1, partition_sizes=[0]
 
 # Test full join with limit
 statement ok
@@ -4160,23 +4113,43 @@ AS VALUES
 (3, 3, true),
 (3, 3, false);
 
-query IIIIB
-SELECT * FROM t0 FULL JOIN t1 ON t0.c1 = t1.c1 LIMIT 2;
+query IIIIB rowsort
+-- Note: using LIMIT value higher than cardinality before LIMIT to avoid query non-determinism
+SELECT * FROM t0 FULL JOIN t1 ON t0.c1 = t1.c1 LIMIT 20;
 ----
-2 2 2 2 true
+1 1 NULL NULL NULL
 2 2 2 2 false
-
-query IIIIB
-SELECT * FROM t0 FULL JOIN t1 ON t0.c2 >= t1.c2 LIMIT 2;
-----
 2 2 2 2 true
-3 3 2 2 true
+3 3 3 3 false
+3 3 3 3 true
+4 4 NULL NULL NULL
 
-query IIIIB
-SELECT * FROM t0 FULL JOIN t1 ON t0.c1 = t1.c1 AND t0.c2 >= t1.c2 LIMIT 2;
+query IIIIB rowsort
+-- Note: using LIMIT value higher than cardinality before LIMIT to avoid query non-determinism
+SELECT * FROM t0 FULL JOIN t1 ON t0.c2 >= t1.c2 LIMIT 20;
 ----
+1 1 NULL NULL NULL
+2 2 2 2 false
 2 2 2 2 true
+3 3 2 2 false
+3 3 2 2 true
+3 3 3 3 false
+3 3 3 3 true
+4 4 2 2 false
+4 4 2 2 true
+4 4 3 3 false
+4 4 3 3 true
+
+query IIIIB rowsort
+-- Note: using LIMIT value higher than cardinality before LIMIT to avoid query non-determinism
+SELECT * FROM t0 FULL JOIN t1 ON t0.c1 = t1.c1 AND t0.c2 >= t1.c2 LIMIT 20;
+----
+1 1 NULL NULL NULL
 2 2 2 2 false
+2 2 2 2 true
+3 3 3 3 false
+3 3 3 3 true
+4 4 NULL NULL NULL
 
 ## Test !join.on.is_empty() && join.filter.is_none()
 query TT
@@ -4188,12 +4161,11 @@ logical_plan
 03)----TableScan: t0 projection=[c1, c2]
 04)----TableScan: t1 projection=[c1, c2, c3]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3, fetch=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(c1@0, c1@0)]
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Full, on=[(c1@0, c1@0)], fetch=2
+02)--DataSourceExec: partitions=1, partition_sizes=[2]
+03)--DataSourceExec: partitions=1, partition_sizes=[2]
 
-## Test join.on.is_empty() && join.filter.is_some()
+## Test join.on.is_empty() && join.filter.is_some() -> single filter now a PWMJ
 query TT
 EXPLAIN SELECT * FROM t0 FULL JOIN t1 ON t0.c2 >= t1.c2 LIMIT 2;
 ----
@@ -4205,8 +4177,8 @@ logical_plan
 physical_plan
 01)GlobalLimitExec: skip=0, fetch=2
 02)--NestedLoopJoinExec: join_type=Full, filter=c2@0 >= c2@1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+03)----DataSourceExec: partitions=1, partition_sizes=[2]
+04)----DataSourceExec: partitions=1, partition_sizes=[2]
 
 ## Test !join.on.is_empty() && join.filter.is_some()
 query TT
@@ -4218,10 +4190,9 @@ logical_plan
 03)----TableScan: t0 projection=[c1, c2]
 04)----TableScan: t1 projection=[c1, c2, c3]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3, fetch=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(c1@0, c1@0)], filter=c2@0 >= c2@1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
-04)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)HashJoinExec: mode=CollectLeft, join_type=Full, on=[(c1@0, c1@0)], filter=c2@0 >= c2@1, fetch=2
+02)--DataSourceExec: partitions=1, partition_sizes=[2]
+03)--DataSourceExec: partitions=1, partition_sizes=[2]
 
 ## Add more test cases for join limit pushdown
 statement ok
@@ -4236,23 +4207,23 @@ set datafusion.execution.target_partitions = 1;
 # Note we use csv as MemoryExec does not support limit push down (so doesn't manifest
 # bugs if limits are improperly pushed down)
 query I
-COPY (values (1), (2), (3), (4), (5))  TO 'test_files/scratch/limit/t1.csv'
+COPY (values (1), (2), (3), (4), (5))  TO 'test_files/scratch/joins/t1.csv'
 STORED AS CSV
 ----
 5
 
 # store t2 in different order so the top N rows are not the same as the top N rows of t1
 query I
-COPY (values (5), (4), (3), (2), (1))  TO 'test_files/scratch/limit/t2.csv'
+COPY (values (5), (4), (3), (2), (1))  TO 'test_files/scratch/joins/t2.csv'
 STORED AS CSV
 ----
 5
 
 statement ok
-create external table t1(a int) stored as CSV location 'test_files/scratch/limit/t1.csv';
+create external table t1(a int) stored as CSV location 'test_files/scratch/joins/t1.csv';
 
 statement ok
-create external table t2(b int) stored as CSV location 'test_files/scratch/limit/t2.csv';
+create external table t2(b int) stored as CSV location 'test_files/scratch/joins/t2.csv';
 
 ######
 ## LEFT JOIN w/ LIMIT
@@ -4272,6 +4243,7 @@ select * from t1 LEFT JOIN t2 ON t1.a = t2.b LIMIT 2;
 1 1
 
 # can only push down to t1 (preserved side)
+# limit pushdown supported for left join - both to join and probe side
 query TT
 explain select * from t1 LEFT JOIN t2 ON t1.a = t2.b LIMIT 2;
 ----
@@ -4282,10 +4254,9 @@ logical_plan
 04)------TableScan: t1 projection=[a], fetch=2
 05)----TableScan: t2 projection=[b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3, fetch=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, b@0)]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t1.csv]]}, projection=[a], limit=2, file_type=csv, has_header=true
-04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t2.csv]]}, projection=[b], file_type=csv, has_header=true
+01)HashJoinExec: mode=CollectLeft, join_type=Left, on=[(a@0, b@0)], fetch=2
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t1.csv]]}, projection=[a], limit=2, file_type=csv, has_header=true
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t2.csv]]}, projection=[b], file_type=csv, has_header=true
 
 ######
 ## RIGHT JOIN w/ LIMIT
@@ -4316,10 +4287,9 @@ logical_plan
 04)----Limit: skip=0, fetch=2
 05)------TableScan: t2 projection=[b], fetch=2
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3, fetch=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@0, b@0)]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t1.csv]]}, projection=[a], file_type=csv, has_header=true
-04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t2.csv]]}, projection=[b], limit=2, file_type=csv, has_header=true
+01)HashJoinExec: mode=CollectLeft, join_type=Right, on=[(a@0, b@0)], fetch=2
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t1.csv]]}, projection=[a], file_type=csv, has_header=true
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t2.csv]]}, projection=[b], limit=2, file_type=csv, has_header=true
 
 ######
 ## FULL JOIN w/ LIMIT
@@ -4343,7 +4313,7 @@ select * from t1 FULL JOIN t2 ON t1.a = t2.b LIMIT 2;
 4 4
 
 
-# can't push limit for full outer join
+# full outer join supports fetch pushdown
 query TT
 explain select * from t1 FULL JOIN t2 ON t1.a = t2.b LIMIT 2;
 ----
@@ -4353,10 +4323,9 @@ logical_plan
 03)----TableScan: t1 projection=[a]
 04)----TableScan: t2 projection=[b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3, fetch=2
-02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(a@0, b@0)]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t1.csv]]}, projection=[a], file_type=csv, has_header=true
-04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/t2.csv]]}, projection=[b], file_type=csv, has_header=true
+01)HashJoinExec: mode=CollectLeft, join_type=Full, on=[(a@0, b@0)], fetch=2
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t1.csv]]}, projection=[a], file_type=csv, has_header=true
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/joins/t2.csv]]}, projection=[b], file_type=csv, has_header=true
 
 statement ok
 drop table t1;
@@ -4394,11 +4363,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
 02)--AggregateExec: mode=Single, gby=[], aggr=[count(Int64(1))]
-03)----ProjectionExec: expr=[]
-04)------CoalesceBatchesExec: target_batch_size=3
-05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(binary_col@0, binary_col@0)]
-06)----------DataSourceExec: partitions=1, partition_sizes=[1]
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(binary_col@0, binary_col@0)], projection=[]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Test hash join sort push down
 # Issue: https://github.com/apache/datafusion/issues/13559
@@ -4424,16 +4391,12 @@ logical_plan
 07)----------TableScan: test projection=[a, b]
 physical_plan
 01)SortPreservingMergeExec: [c@2 DESC]
-02)--CoalesceBatchesExec: target_batch_size=3
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, a@0)]
-04)------CoalescePartitionsExec
-05)--------CoalesceBatchesExec: target_batch_size=3
-06)----------FilterExec: b@1 > 3, projection=[a@0]
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)------SortExec: expr=[c@2 DESC], preserve_partitioning=[true]
-10)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-11)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, a@0)]
+03)----CoalescePartitionsExec
+04)------FilterExec: b@1 > 3, projection=[a@0]
+05)--------DataSourceExec: partitions=2, partition_sizes=[1, 1]
+06)----SortExec: expr=[c@2 DESC], preserve_partitioning=[true]
+07)------DataSourceExec: partitions=2, partition_sizes=[1, 1]
 
 query TT
 explain select * from test where a in (select a from test where b > 3) order by c desc nulls last;
@@ -4448,16 +4411,12 @@ logical_plan
 07)----------TableScan: test projection=[a, b]
 physical_plan
 01)SortPreservingMergeExec: [c@2 DESC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=3
-03)----HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, a@0)]
-04)------CoalescePartitionsExec
-05)--------CoalesceBatchesExec: target_batch_size=3
-06)----------FilterExec: b@1 > 3, projection=[a@0]
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)------SortExec: expr=[c@2 DESC NULLS LAST], preserve_partitioning=[true]
-10)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-11)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(a@0, a@0)]
+03)----CoalescePartitionsExec
+04)------FilterExec: b@1 > 3, projection=[a@0]
+05)--------DataSourceExec: partitions=2, partition_sizes=[1, 1]
+06)----SortExec: expr=[c@2 DESC NULLS LAST], preserve_partitioning=[true]
+07)------DataSourceExec: partitions=2, partition_sizes=[1, 1]
 
 query III
 select * from test where a in (select a from test where b > 3) order by c desc nulls first;
@@ -4495,10 +4454,9 @@ logical_plan
 05)----SubqueryAlias: b
 06)------TableScan: person projection=[id, age, state]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2, state@5]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2, state@5]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
 explain SELECT age FROM (SELECT * FROM person a join person b using (id, age, state));
@@ -4511,10 +4469,9 @@ logical_plan
 05)----SubqueryAlias: b
 06)------TableScan: person projection=[id, age, state]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[age@1]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[age@1]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
 explain SELECT a.* FROM person a join person b using (id, age);
@@ -4527,10 +4484,9 @@ logical_plan
 05)----SubqueryAlias: b
 06)------TableScan: person projection=[id, age]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)], projection=[id@0, age@1, state@2]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
 explain SELECT a.*, b.* FROM person a join person b using (id, age);
@@ -4542,10 +4498,9 @@ logical_plan
 04)--SubqueryAlias: b
 05)----TableScan: person projection=[id, age, state]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
 explain SELECT * FROM person a join person b using (id, age, state) join person c using (id, age, state);
@@ -4562,19 +4517,17 @@ logical_plan
 09)----SubqueryAlias: c
 10)------TableScan: person projection=[id, age, state]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[id@0, age@1, state@2]
 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[id@0, age@1, state@2]
-03)----CoalesceBatchesExec: target_batch_size=3
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0), (age@1, age@1), (state@2, state@2)], projection=[id@0, age@1, state@2]
-05)--------DataSourceExec: partitions=1, partition_sizes=[0]
-06)--------DataSourceExec: partitions=1, partition_sizes=[0]
-07)----DataSourceExec: partitions=1, partition_sizes=[0]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[0]
+05)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 query TT
 explain SELECT * FROM person a NATURAL JOIN lineitem b;
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--SubqueryAlias: a
 03)----TableScan: person projection=[id, age, state]
 04)--SubqueryAlias: b
@@ -4594,10 +4547,9 @@ logical_plan
 04)----SubqueryAlias: lineitem2
 05)------TableScan: lineitem projection=[c1]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c1@0, c1@0)], projection=[c1@0]
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
-04)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c1@0, c1@0)], projection=[c1@0]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 statement count 0
 drop table person;
@@ -4621,20 +4573,20 @@ query TT
 explain SELECT j1_string, j2_string FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2;
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--TableScan: j1 projection=[j1_string]
 03)--SubqueryAlias: j2
 04)----Projection: j2.j2_string
 05)------Subquery:
 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id
 07)----------TableScan: j2 projection=[j2_string, j2_id]
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
 
 query TT
 explain SELECT * FROM j1 JOIN (j2 JOIN j3 ON(j2_id = j3_id - 2)) ON(j1_id = j2_id), LATERAL (SELECT * FROM j3 WHERE j3_string = j2_string) as j4
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--Inner Join: CAST(j2.j2_id AS Int64) = CAST(j3.j3_id AS Int64) - Int64(2)
 03)----Inner Join: j1.j1_id = j2.j2_id
 04)------TableScan: j1 projection=[j1_string, j1_id]
@@ -4644,60 +4596,60 @@ logical_plan
 08)----Subquery:
 09)------Filter: j3.j3_string = outer_ref(j2.j2_string)
 10)--------TableScan: j3 projection=[j3_string, j3_id]
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Utf8View, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" })
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j2_string", data_type: Utf8View, nullable: true }, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" })
 
 query TT
 explain SELECT * FROM j1, LATERAL (SELECT * FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id = j2_id) as j2) as j2;
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--TableScan: j1 projection=[j1_string, j1_id]
 03)--SubqueryAlias: j2
 04)----Subquery:
-05)------Cross Join: 
+05)------Cross Join:
 06)--------TableScan: j1 projection=[j1_string, j1_id]
 07)--------SubqueryAlias: j2
 08)----------Subquery:
 09)------------Filter: outer_ref(j1.j1_id) = j2.j2_id
 10)--------------TableScan: j2 projection=[j2_string, j2_id]
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
 
 query TT
 explain SELECT j1_string, j2_string FROM j1 LEFT JOIN LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2 ON(true);
 ----
 logical_plan
-01)Left Join: 
+01)Left Join:
 02)--TableScan: j1 projection=[j1_string]
 03)--SubqueryAlias: j2
 04)----Projection: j2.j2_string
 05)------Subquery:
 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id
 07)----------TableScan: j2 projection=[j2_string, j2_id]
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
 
 query TT
 explain SELECT * FROM j1, (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true));
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--TableScan: j1 projection=[j1_string, j1_id]
-03)--Left Join: 
+03)--Left Join:
 04)----TableScan: j2 projection=[j2_string, j2_id]
 05)----SubqueryAlias: j3
 06)------Subquery:
 07)--------Filter: outer_ref(j1.j1_id) + outer_ref(j2.j2_id) = j3.j3_id
 08)----------TableScan: j3 projection=[j3_string, j3_id]
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Int32, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" })
 
 query TT
 explain SELECT * FROM j1, LATERAL (SELECT 1) AS j2;
 ----
 logical_plan
-01)Cross Join: 
+01)Cross Join:
 02)--TableScan: j1 projection=[j1_string, j1_id]
 03)--SubqueryAlias: j2
 04)----Projection: Int64(1)
-05)------EmptyRelation
+05)------EmptyRelation: rows=1
 physical_plan
 01)CrossJoinExec
 02)--DataSourceExec: partitions=1, partition_sizes=[0]
@@ -4730,12 +4682,11 @@ logical_plan
 02)--TableScan: person projection=[id]
 03)--TableScan: orders projection=[customer_id]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=3
-02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(person.id + Int64(10)@1, orders.customer_id * Int64(2)@1)], projection=[id@0, customer_id@2]
-03)----ProjectionExec: expr=[id@0 as id, CAST(id@0 AS Int64) + 10 as person.id + Int64(10)]
-04)------DataSourceExec: partitions=1, partition_sizes=[0]
-05)----ProjectionExec: expr=[customer_id@0 as customer_id, CAST(customer_id@0 AS Int64) * 2 as orders.customer_id * Int64(2)]
-06)------DataSourceExec: partitions=1, partition_sizes=[0]
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(person.id + Int64(10)@1, orders.customer_id * Int64(2)@1)], projection=[id@0, customer_id@2]
+02)--ProjectionExec: expr=[id@0 as id, CAST(id@0 AS Int64) + 10 as person.id + Int64(10)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)--ProjectionExec: expr=[customer_id@0 as customer_id, CAST(customer_id@0 AS Int64) * 2 as orders.customer_id * Int64(2)]
+05)----DataSourceExec: partitions=1, partition_sizes=[0]
 
 statement count 0
 drop table person;
@@ -4790,3 +4741,666 @@ DROP TABLE compound_field_table_t;
 
 statement ok
 DROP TABLE compound_field_table_u;
+
+
+statement ok
+CREATE TABLE t1 (k INT, v INT);
+
+statement ok
+CREATE TABLE t2 (k INT, v INT);
+
+statement ok
+INSERT INTO t1
+  SELECT value AS k, value AS v
+  FROM range(1, 10001) AS t(value);
+
+statement ok
+INSERT INTO t2 VALUES (1, 1);
+
+## The TopK(Sort with fetch) should not be pushed down to the hash join
+query TT
+explain
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON t1.k = t2.k
+ORDER BY t1.k
+LIMIT 2;
+----
+logical_plan
+01)Sort: t1.k ASC NULLS LAST, fetch=2
+02)--LeftAnti Join: t1.k = t2.k
+03)----TableScan: t1 projection=[k, v]
+04)----TableScan: t2 projection=[k]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[k@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k@0, k@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)----DataSourceExec: partitions=1, partition_sizes=[3334]
+
+
+query II
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON t1.k = t2.k
+ORDER BY t1.k
+LIMIT 2;
+----
+2 2
+3 3
+
+
+## Test left anti join without limit, we should support push down sort to the left side
+query TT
+explain
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON t1.k = t2.k
+ORDER BY t1.k;
+----
+logical_plan
+01)Sort: t1.k ASC NULLS LAST
+02)--LeftAnti Join: t1.k = t2.k
+03)----TableScan: t1 projection=[k, v]
+04)----TableScan: t2 projection=[k]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k@0, k@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--SortExec: expr=[k@0 ASC NULLS LAST], preserve_partitioning=[false]
+04)----DataSourceExec: partitions=1, partition_sizes=[3334]
+
+statement ok
+DROP TABLE t1;
+
+statement ok
+DROP TABLE t2;
+
+
+# Test hash joins with an empty build relation (empty build relation optimization)
+
+statement ok
+CREATE TABLE t1 (k1 int, v1 int);
+
+statement ok
+CREATE TABLE t2 (k2 int, v2 int);
+
+statement ok
+INSERT INTO t1 SELECT i AS k, 1 FROM generate_series(1, 30000) t(i);
+
+statement ok
+set datafusion.explain.physical_plan_only = true;
+
+# INNER JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+JOIN t2 ON k1 = k2
+----
+physical_plan
+01)ProjectionExec: expr=[k1@2 as k1, v1@3 as v1, k2@0 as k2, v2@1 as v2]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(k2@0, k1@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query IIII
+SELECT sum(k1), sum(v1), sum(k2), sum(v2)
+FROM t1
+JOIN t2 ON k1 = k2
+----
+NULL NULL NULL NULL
+
+# LEFT JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+LEFT JOIN t2 ON k1 = k2
+----
+physical_plan
+01)ProjectionExec: expr=[k1@2 as k1, v1@3 as v1, k2@0 as k2, v2@1 as v2]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(k2@0, k1@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query IIII
+SELECT sum(k1), sum(v1), sum(k2), sum(v2)
+FROM t1
+LEFT JOIN t2 ON k1 = k2
+----
+450015000 30000 NULL NULL
+
+# RIGHT JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+RIGHT JOIN t2 ON k1 = k2
+----
+physical_plan
+01)ProjectionExec: expr=[k1@2 as k1, v1@3 as v1, k2@0 as k2, v2@1 as v2]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(k2@0, k1@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query IIII
+SELECT sum(k1), sum(v1), sum(k2), sum(v2)
+FROM t1
+RIGHT JOIN t2 ON k1 = k2
+----
+NULL NULL NULL NULL
+
+# LEFT SEMI JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+LEFT SEMI JOIN t2 ON k1 = k2
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(k2@0, k1@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT sum(k1), sum(v1)
+FROM t1
+LEFT SEMI JOIN t2 ON k1 = k2
+----
+NULL NULL
+
+# RIGHT SEMI JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+RIGHT SEMI JOIN t2 ON k1 = k2
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftSemi, on=[(k2@0, k1@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT sum(k2), sum(v2)
+FROM t1
+RIGHT SEMI JOIN t2 ON k1 = k2
+----
+NULL NULL
+
+# LEFT ANTI JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k2@0, k1@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT sum(k1), sum(v1)
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+----
+450015000 30000
+
+# RIGHT ANTI JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+RIGHT ANTI JOIN t2 ON k1 = k2
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(k2@0, k1@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
+03)--DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT sum(k2), sum(v2)
+FROM t1
+RIGHT ANTI JOIN t2 ON k1 = k2
+----
+NULL NULL
+
+# FULL JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+FULL JOIN t2 ON k1 = k2
+----
+physical_plan
+01)ProjectionExec: expr=[k1@2 as k1, v1@3 as v1, k2@0 as k2, v2@1 as v2]
+02)--HashJoinExec: mode=CollectLeft, join_type=Full, on=[(k2@0, k1@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query IIII
+SELECT sum(k1), sum(v1), sum(k2), sum(v2)
+FROM t1
+FULL JOIN t2 ON k1 = k2
+----
+450015000 30000 NULL NULL
+
+# LEFT MARK JOIN
+query TT
+EXPLAIN
+SELECT *
+FROM t2
+WHERE k2 > 0
+    OR EXISTS (
+        SELECT *
+        FROM t1
+        WHERE k2 = k1
+    )
+----
+physical_plan
+01)FilterExec: k2@0 > 0 OR mark@2, projection=[k2@0, v2@1]
+02)--HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(k2@0, k1@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT *
+FROM t2
+WHERE k2 > 0
+    OR EXISTS (
+        SELECT *
+        FROM t1
+        WHERE k2 = k1
+    )
+----
+
+# Projection inside the join (changes the output schema)
+query TT
+EXPLAIN
+SELECT distinct(v1)
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+----
+physical_plan
+01)AggregateExec: mode=Single, gby=[v1@0 as v1], aggr=[]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k2@0, k1@0)], projection=[v1@1]
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query I
+SELECT distinct(v1)
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+----
+1
+
+# Both sides empty
+query TT
+EXPLAIN
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+WHERE k1 < 0
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k2@0, k1@0)]
+02)--FilterExec: k2@0 < 0
+03)----DataSourceExec: partitions=1, partition_sizes=[0]
+04)--FilterExec: k1@0 < 0
+05)----DataSourceExec: partitions=1, partition_sizes=[10000]
+
+query II
+SELECT *
+FROM t1
+LEFT ANTI JOIN t2 ON k1 = k2
+WHERE k1 < 0
+----
+
+# Also check that the reported number of output rows/batches are correct in the "empty build side"
+# optimization.
+# Issue: https://github.com/apache/datafusion/issues/20809
+query TT
+EXPLAIN ANALYZE
+WITH t1 (k) AS (
+    VALUES (1), (2)
+), t2 (k) AS (
+    VALUES (1)
+)
+SELECT *
+FROM t1
+LEFT ANTI JOIN (
+    SELECT *
+    FROM t2
+    WHERE k <> 1
+) t2 ON t1.k = t2.k;
+----
+Plan with Metrics
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(k@0, k@0)], metrics=[output_rows=2, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=1, array_map_created_count=0, build_input_batches=0, build_input_rows=0, input_batches=1, input_rows=2, build_mem_used=<slt:ignore>, build_time=<slt:ignore>, join_time=<slt:ignore>, avg_fanout=N/A (0/0), probe_hit_rate=0% (0/2)]     
+02)--ProjectionExec: expr=[column1@0 as k], metrics=[output_rows=0, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=0, expr_0_eval_time=<slt:ignore>]
+03)----FilterExec: column1@0 != 1, metrics=[output_rows=0, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=0, selectivity=0% (0/1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1], metrics=[]
+05)--ProjectionExec: expr=[column1@0 as k], metrics=[output_rows=2, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, output_batches=1, expr_0_eval_time=<slt:ignore>]
+06)----DataSourceExec: partitions=1, partition_sizes=[1], metrics=[]
+
+query I
+WITH t1 (k) AS (
+    VALUES (1), (2)
+), t2 (k) AS (
+    VALUES (1)
+)
+SELECT *
+FROM t1
+LEFT ANTI JOIN (
+    SELECT *
+    FROM t2
+    WHERE k <> 1
+) t2 ON t1.k = t2.k;
+----
+1
+2
+
+# Mark testing
+statement ok
+CREATE OR REPLACE TABLE t1(b INT, c INT, d INT);
+
+statement ok
+INSERT INTO t1 VALUES
+  (10, 5, 3),
+  ( 1, 7, 8),
+  ( 2, 9, 7),
+  ( 3, 8,10),
+  ( 5, 6, 6),
+  ( 0, 4, 9),
+  ( 4, 8, 7),
+  (100,6, 5);
+
+query I rowsort
+SELECT c
+  FROM t1
+ WHERE c > d
+    OR EXISTS(SELECT 1 FROM t1 AS x WHERE x.b<t1.b)
+    OR (c <= d-2 OR c >= d+2)
+----
+4
+5
+6
+6
+7
+8
+8
+9
+
+# PiecewiseMergeJoin Test
+statement ok
+set datafusion.optimizer.enable_piecewise_merge_join = true;
+
+query II
+SELECT join_t1.t1_id, join_t2.t2_id
+FROM join_t1
+INNER JOIN join_t2 ON join_t1.t1_id > join_t2.t2_id
+WHERE join_t1.t1_id > 10 AND join_t2.t2_int > 1
+ORDER BY 1
+----
+22 11
+33 11
+44 11
+
+query TT
+EXPLAIN
+SELECT join_t1.t1_id, join_t2.t2_id
+FROM join_t1
+INNER JOIN join_t2 ON join_t1.t1_id > join_t2.t2_id
+WHERE join_t1.t1_id > 10 AND join_t2.t2_int > 1
+ORDER BY 1
+----
+physical_plan
+01)SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--PiecewiseMergeJoin: operator=Gt, join_type=Inner, on=(t1_id > t2_id)
+03)----SortExec: expr=[t1_id@0 ASC], preserve_partitioning=[false]
+04)------FilterExec: t1_id@0 > 10
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----FilterExec: t2_int@1 > 1, projection=[t2_id@0]
+07)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+DROP TABLE t1;
+
+statement ok
+DROP TABLE t2;
+
+statement ok
+set datafusion.explain.physical_plan_only = false;
+
+statement ok
+set datafusion.optimizer.enable_piecewise_merge_join = false;
+
+# Test hash join with columns named c0, c1, c2
+# These names match the internal naming pattern in inlist_builder.rs
+# Regression test for https://github.com/apache/datafusion/pull/18393#discussion_r2601145291
+
+statement ok
+CREATE TABLE t1_c_source(c0 INT, c1 VARCHAR, c2 INT) AS VALUES
+(1, 'a', 100),
+(2, 'b', 200),
+(3, 'c', 300);
+
+statement ok
+CREATE TABLE t2_c_source(c0 INT, c1 VARCHAR) AS VALUES
+(1, 'x'),
+(3, 'z');
+
+query I
+COPY t1_c_source TO 'test_files/scratch/joins/t1_c.parquet' STORED AS PARQUET;
+----
+3
+
+query I
+COPY t2_c_source TO 'test_files/scratch/joins/t2_c.parquet' STORED AS PARQUET;
+----
+2
+
+statement ok
+CREATE EXTERNAL TABLE t1_c(c0 INT, c1 VARCHAR, c2 INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/joins/t1_c.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t2_c(c0 INT, c1 VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/joins/t2_c.parquet';
+
+# Test single-column join with column named c0
+query ITI rowsort
+SELECT t1.c0, t1.c1, t1.c2
+FROM t1_c t1
+INNER JOIN t2_c t2 ON t1.c0 = t2.c0;
+----
+1 a 100
+3 c 300
+
+# Test multi-column join with columns named c0, c1
+query ITI rowsort
+SELECT t1.c0, t1.c1, t1.c2
+FROM t1_c t1
+INNER JOIN t2_c t2 ON t1.c0 = t2.c0 AND t1.c1 = t2.c1;
+----
+
+statement ok
+DROP TABLE t1_c_source;
+
+statement ok
+DROP TABLE t2_c_source;
+
+statement ok
+DROP TABLE t1_c;
+
+statement ok
+DROP TABLE t2_c;
+
+# Reproducer of https://github.com/apache/datafusion/issues/19067
+statement count 0
+set datafusion.explain.physical_plan_only = true;
+
+# Setup Left Table with FixedSizeBinary(4)
+statement count 0
+CREATE TABLE issue_19067_left AS
+SELECT
+  column1 as id,
+  arrow_cast(decode(column2, 'hex'), 'FixedSizeBinary(4)') as join_key
+FROM (VALUES
+  (1, 'AAAAAAAA'),
+  (2, 'BBBBBBBB'),
+  (3, 'CCCCCCCC')
+);
+
+# Setup Right Table with FixedSizeBinary(4)
+statement count 0
+CREATE TABLE issue_19067_right AS
+SELECT
+  arrow_cast(decode(column1, 'hex'), 'FixedSizeBinary(4)') as join_key,
+  column2 as value
+FROM (VALUES
+  ('AAAAAAAA', 1000),
+  ('BBBBBBBB', 2000)
+);
+
+# Perform Left Join. Third row should contain NULL in `right_key`.
+query I??I
+SELECT
+  l.id,
+  l.join_key as left_key,
+  r.join_key as right_key,
+  r.value
+FROM issue_19067_left l
+LEFT JOIN issue_19067_right r ON l.join_key = r.join_key
+ORDER BY l.id;
+----
+1 aaaaaaaa aaaaaaaa 1000
+2 bbbbbbbb bbbbbbbb 2000
+3 cccccccc NULL NULL
+
+# Ensure usage of HashJoinExec
+query TT
+EXPLAIN
+SELECT
+  l.id,
+  l.join_key as left_key,
+  r.join_key as right_key,
+  r.value
+FROM issue_19067_left l
+LEFT JOIN issue_19067_right r ON l.join_key = r.join_key
+ORDER BY l.id;
+----
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@2 as id, join_key@3 as left_key, join_key@0 as right_key, value@1 as value]
+03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(join_key@0, join_key@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement count 0
+set datafusion.explain.physical_plan_only = false;
+
+statement count 0
+DROP TABLE issue_19067_left;
+
+statement count 0
+DROP TABLE issue_19067_right;
+
+# Test that empty projections pushed into joins produce correct row counts at runtime.
+# When count(1) is used over a RIGHT/FULL JOIN, the optimizer embeds an empty projection
+# (projection=[]) into the HashJoinExec. This validates that the runtime batch construction
+# handles zero-column output correctly, preserving the correct number of rows.
+
+statement ok
+CREATE TABLE empty_proj_left AS VALUES (1, 'a'), (2, 'b'), (3, 'c');
+
+statement ok
+CREATE TABLE empty_proj_right AS VALUES (1, 'x'), (2, 'y'), (4, 'z');
+
+query I
+SELECT count(1) FROM empty_proj_left RIGHT JOIN empty_proj_right ON empty_proj_left.column1 = empty_proj_right.column1;
+----
+3
+
+query I
+SELECT count(1) FROM empty_proj_left FULL JOIN empty_proj_right ON empty_proj_left.column1 = empty_proj_right.column1;
+----
+4
+
+statement count 0
+DROP TABLE empty_proj_left;
+
+statement count 0
+DROP TABLE empty_proj_right;
+
+# Issue #20437: HashJoin panic with dictionary-encoded columns in multi-key joins
+# https://github.com/apache/datafusion/issues/20437
+
+statement ok
+CREATE TABLE issue_20437_small AS
+SELECT id, arrow_cast(region, 'Dictionary(Int32, Utf8)') AS region
+FROM (VALUES (1, 'west'), (2, 'west')) AS t(id, region);
+
+statement ok
+CREATE TABLE issue_20437_large AS
+SELECT id, region, value
+FROM (VALUES (1, 'west', 100), (2, 'west', 200), (3, 'east', 300)) AS t(id, region, value);
+
+query ITI
+SELECT s.id, s.region, l.value
+FROM issue_20437_small s
+JOIN issue_20437_large l ON s.id = l.id AND s.region = l.region
+ORDER BY s.id;
+----
+1 west 100
+2 west 200
+
+statement count 0
+DROP TABLE issue_20437_small;
+
+statement count 0
+DROP TABLE issue_20437_large;
+
+# Test count(*) with right semi/anti joins returns correct row counts
+# issue: https://github.com/apache/datafusion/issues/20669 
+
+statement ok
+CREATE TABLE t1 (k INT, v INT);
+
+statement ok
+CREATE TABLE t2 (k INT, v INT);
+
+statement ok
+INSERT INTO t1 SELECT i AS k, i AS v FROM generate_series(1, 100) t(i);
+
+statement ok
+INSERT INTO t2 VALUES (1, 1);
+
+query I
+WITH t AS (
+    SELECT *
+    FROM t1
+    LEFT ANTI JOIN t2 ON t1.k = t2.k
+)
+SELECT count(*)
+FROM t;
+----
+99
+
+query I
+WITH t AS (
+    SELECT *
+    FROM t1
+    LEFT SEMI JOIN t2 ON t1.k = t2.k
+)
+SELECT count(*)
+FROM t;
+----
+1
+
+statement count 0
+DROP TABLE t1;
+
+statement count 0
+DROP TABLE t2;
diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt
index b46b8c49d6623..60bec4213db02 100644
--- a/datafusion/sqllogictest/test_files/json.slt
+++ b/datafusion/sqllogictest/test_files/json.slt
@@ -146,3 +146,31 @@ EXPLAIN SELECT id FROM json_partitioned_test WHERE part = 2
 ----
 logical_plan TableScan: json_partitioned_test projection=[id], full_filters=[json_partitioned_test.part = Int32(2)]
 physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_table_json/part=2/data.json]]}, projection=[id], file_type=json
+
+##########
+## JSON Array Format Tests
+##########
+
+# Test reading JSON array format file with newline_delimited=false
+statement ok
+CREATE EXTERNAL TABLE json_array_test
+STORED AS JSON
+OPTIONS ('format.newline_delimited' 'false')
+LOCATION '../core/tests/data/json_array.json';
+
+query IT rowsort
+SELECT a, b FROM json_array_test
+----
+1 hello
+2 world
+3 test
+
+statement ok
+DROP TABLE json_array_test;
+
+# Test that reading JSON array format WITHOUT newline_delimited option fails
+# (default is newline_delimited=true which can't parse array format correctly)
+statement error Not valid JSON
+CREATE EXTERNAL TABLE json_array_as_ndjson
+STORED AS JSON
+LOCATION '../core/tests/data/json_array.json';
diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt
index 2f8944f462a18..ff3c49485a286 100644
--- a/datafusion/sqllogictest/test_files/limit.slt
+++ b/datafusion/sqllogictest/test_files/limit.slt
@@ -377,9 +377,8 @@ physical_plan
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 06)----------ProjectionExec: expr=[]
 07)------------GlobalLimitExec: skip=6, fetch=3
-08)--------------CoalesceBatchesExec: target_batch_size=8192, fetch=9
-09)----------------FilterExec: a@0 > 3
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+08)--------------FilterExec: a@0 > 3, fetch=9
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query I
 SELECT COUNT(*) FROM (SELECT a FROM t1 WHERE a > 3 LIMIT 3 OFFSET 6);
@@ -405,11 +404,9 @@ logical_plan
 02)--TableScan: t1000 projection=[i]
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[i@0 as i], aggr=[]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----RepartitionExec: partitioning=Hash([i@0], 4), input_partitions=4
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------AggregateExec: mode=Partial, gby=[i@0 as i], aggr=[]
-06)----------DataSourceExec: partitions=1
+02)--RepartitionExec: partitioning=Hash([i@0], 4), input_partitions=1
+03)----AggregateExec: mode=Partial, gby=[i@0 as i], aggr=[]
+04)------DataSourceExec: partitions=1
 
 statement ok
 set datafusion.explain.show_sizes = true;
@@ -637,11 +634,10 @@ physical_plan
 02)--SortPreservingMergeExec: [b@0 DESC], fetch=3
 03)----SortExec: TopK(fetch=3), expr=[b@0 DESC], preserve_partitioning=[true]
 04)------AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[sum(ordered_table.a)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(ordered_table.a)]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], file_type=csv, has_header=true
+05)--------RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(ordered_table.a)]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], file_type=csv, has_header=true
 
 # Applying offset & limit when multiple streams from union
 # the plan must still have a global limit to apply the offset
@@ -663,15 +659,14 @@ logical_plan
 physical_plan
 01)GlobalLimitExec: skip=4, fetch=10
 02)--SortPreservingMergeExec: [c@0 DESC], fetch=14
-03)----UnionExec
-04)------SortExec: TopK(fetch=14), expr=[c@0 DESC], preserve_partitioning=[true]
+03)----SortExec: TopK(fetch=14), expr=[c@0 DESC], preserve_partitioning=[true]
+04)------UnionExec
 05)--------ProjectionExec: expr=[CAST(c@0 AS Int64) as c]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
-08)------SortExec: TopK(fetch=14), expr=[c@0 DESC], preserve_partitioning=[true]
-09)--------ProjectionExec: expr=[CAST(d@0 AS Int64) as c]
-10)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[d], file_type=csv, has_header=true
+08)--------ProjectionExec: expr=[CAST(d@0 AS Int64) as c]
+09)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[d], file_type=csv, has_header=true
 
 # Applying LIMIT & OFFSET to subquery.
 query III
@@ -684,19 +679,19 @@ ON t1.b = t2.b
 ORDER BY t1.b desc, c desc, c2 desc;
 ----
 3 98 96
-3 98 89
+3 98 87
 3 98 82
 3 98 79
 3 97 96
-3 97 89
+3 97 87
 3 97 82
 3 97 79
 3 96 96
-3 96 89
+3 96 87
 3 96 82
 3 96 79
 3 95 96
-3 95 89
+3 95 87
 3 95 82
 3 95 79
 
@@ -711,8 +706,8 @@ ON t1.b = t2.b
 ORDER BY t1.b desc, c desc, c2 desc
 OFFSET 3 LIMIT 2;
 ----
-3 99 82
-3 99 79
+3 98 79
+3 97 96
 
 statement ok
 drop table ordered_table;
@@ -800,7 +795,7 @@ CREATE TABLE src_table (
 # File 1:
 query I
 COPY (SELECT * FROM src_table where part_key = 1)
-TO 'test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet'
+TO 'test_files/scratch/limit/test_limit_with_partitions/part-0.parquet'
 STORED AS PARQUET;
 ----
 3
@@ -808,7 +803,7 @@ STORED AS PARQUET;
 # File 2:
 query I
 COPY (SELECT * FROM src_table where part_key = 2)
-TO 'test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet'
+TO 'test_files/scratch/limit/test_limit_with_partitions/part-1.parquet'
 STORED AS PARQUET;
 ----
 4
@@ -816,7 +811,7 @@ STORED AS PARQUET;
 # File 3:
 query I
 COPY (SELECT * FROM src_table where part_key = 3)
-TO 'test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet'
+TO 'test_files/scratch/limit/test_limit_with_partitions/part-2.parquet'
 STORED AS PARQUET;
 ----
 3
@@ -828,13 +823,14 @@ CREATE EXTERNAL TABLE test_limit_with_partitions
   value INT
 )
 STORED AS PARQUET
-LOCATION 'test_files/scratch/parquet/test_limit_with_partitions/';
+LOCATION 'test_files/scratch/limit/test_limit_with_partitions/';
 
 query TT
 explain
 with selection as (
     select *
     from test_limit_with_partitions
+    order by part_key
     limit 1
 )
 select 1 as foo
@@ -847,19 +843,19 @@ logical_plan
 02)--Sort: selection.part_key ASC NULLS LAST, fetch=1000
 03)----Projection: Int64(1) AS foo, selection.part_key
 04)------SubqueryAlias: selection
-05)--------Limit: skip=0, fetch=1
-06)----------TableScan: test_limit_with_partitions projection=[part_key], fetch=1
+05)--------Sort: test_limit_with_partitions.part_key ASC NULLS LAST, fetch=1
+06)----------TableScan: test_limit_with_partitions projection=[part_key]
 physical_plan
-01)ProjectionExec: expr=[foo@0 as foo]
-02)--SortExec: TopK(fetch=1000), expr=[part_key@1 ASC NULLS LAST], preserve_partitioning=[false]
-03)----ProjectionExec: expr=[1 as foo, part_key@0 as part_key]
-04)------CoalescePartitionsExec: fetch=1
-05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], limit=1, file_type=parquet
+01)ProjectionExec: expr=[1 as foo]
+02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1
+03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true]
+04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 query I
 with selection as (
     select *
     from test_limit_with_partitions
+    order by part_key
     limit 1
 )
 select 1 as foo
diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt
new file mode 100644
index 0000000000000..72672b707d4f5
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/limit_pruning.slt
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+
+statement ok
+CREATE TABLE tracking_data AS VALUES
+-- ***** Row Group 0 *****
+  ('Anow Vole', 7),
+  ('Brown Bear', 133),
+  ('Gray Wolf', 82),
+-- ***** Row Group 1 *****
+  ('Lynx', 71),
+  ('Red Fox', 40),
+  ('Alpine Bat', 6),
+-- ***** Row Group 2 *****
+  ('Nlpine Ibex', 101),
+  ('Nlpine Goat', 76),
+  ('Nlpine Sheep', 83),
+-- ***** Row Group 3 *****
+  ('Europ. Mole', 4),
+  ('Polecat', 16),
+  ('Alpine Ibex', 97);
+
+statement ok
+COPY (SELECT column1 as species, column2 as s FROM tracking_data)
+TO 'test_files/scratch/limit_pruning/data.parquet'
+STORED AS PARQUET
+OPTIONS (
+  'format.max_row_group_size' '3'
+);
+
+statement ok
+drop table tracking_data;
+
+statement ok
+CREATE EXTERNAL TABLE tracking_data
+STORED AS PARQUET
+LOCATION 'test_files/scratch/limit_pruning/data.parquet';
+
+
+statement ok
+set datafusion.explain.analyze_level = summary;
+
+# row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched
+# limit_pruned_row_groups=2 total → 0 matched
+query TT
+explain analyze select * from tracking_data where species > 'M' AND s >= 50 limit 3;
+----
+Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_pages_pruned=2 total → 2 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=<slt:ignore>, metadata_load_time=<slt:ignore>, scan_efficiency_ratio=<slt:ignore> (171/2.35 K)]
+
+# limit_pruned_row_groups=0 total → 0 matched
+# because of order by, scan needs to preserve sort, so limit pruning is disabled
+query TT
+explain analyze select * from tracking_data where species > 'M' AND s >= 50 order by species limit 3;
+----
+Plan with Metrics
+01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=<slt:ignore>, output_bytes=<slt:ignore>, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_pages_pruned=6 total → 6 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=<slt:ignore>, metadata_load_time=<slt:ignore>, scan_efficiency_ratio=<slt:ignore> (521/2.35 K)]
+
+statement ok
+drop table tracking_data;
+
+statement ok
+reset datafusion.explain.analyze_level;
diff --git a/datafusion/sqllogictest/test_files/limit_single_row_batches.slt b/datafusion/sqllogictest/test_files/limit_single_row_batches.slt
new file mode 100644
index 0000000000000..9f626816e2146
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/limit_single_row_batches.slt
@@ -0,0 +1,22 @@
+
+#  minimize batch size to 1 in order to trigger different code paths
+statement ok
+set datafusion.execution.batch_size = '1';
+
+# ----
+# tests with target partition set to 1
+# ----
+statement ok
+set datafusion.execution.target_partitions = '1';
+
+
+statement ok
+CREATE TABLE filter_limit (i INT) as values (1), (2);
+
+query I
+SELECT COUNT(*) FROM (SELECT i FROM filter_limit WHERE i <> 0 LIMIT 1);
+----
+1
+
+statement ok
+DROP TABLE filter_limit;
diff --git a/datafusion/sqllogictest/test_files/listing_table_partitions.slt b/datafusion/sqllogictest/test_files/listing_table_partitions.slt
new file mode 100644
index 0000000000000..52433429cfe80
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/listing_table_partitions.slt
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+copy (values('foo'), ('bar'))
+to 'test_files/scratch/listing_table_partitions/single_part/a=1/file1.parquet';
+----
+2
+
+query I
+copy (values('baz'))
+to 'test_files/scratch/listing_table_partitions/single_part/a=1/file2.parquet';
+----
+1
+
+statement count 0
+create external table single_part
+stored as parquet location 'test_files/scratch/listing_table_partitions/single_part/';
+
+query TT
+select * from single_part order by (column1);
+----
+bar 1
+baz 1
+foo 1
+
+query I
+copy (values('foo'), ('bar')) to 'test_files/scratch/listing_table_partitions/multi_part/a=1/b=100/file1.parquet';
+----
+2
+
+query I
+copy (values('baz')) to 'test_files/scratch/listing_table_partitions/multi_part/a=1/b=200/file1.parquet';
+----
+1
+
+statement count 0
+create external table multi_part
+stored as parquet location 'test_files/scratch/listing_table_partitions/multi_part/';
+
+query TTT
+select * from multi_part where b=200; 
+----
+baz 1 200
+
+statement count 0
+set datafusion.execution.listing_table_factory_infer_partitions = false;
+
+statement count 0
+create external table infer_disabled
+stored as parquet location 'test_files/scratch/listing_table_partitions/multi_part/';
+
+query T
+select * from infer_disabled order by (column1);
+----
+bar
+baz
+foo
+
+statement count 0
+set datafusion.execution.listing_table_factory_infer_partitions = true;
diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt
index 890d1f2e9250e..4298320d4aaba 100644
--- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt
+++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt
@@ -17,10 +17,10 @@
 
 # Test file with different schema order but generating correct statistics for table
 statement ok
-COPY (SELECT * FROM values (1, 'a'), (2, 'b') t(int_col, str_col)) to 'test_files/scratch/table/1.parquet';
+COPY (SELECT * FROM values (1, 'a'), (2, 'b') t(int_col, str_col)) to 'test_files/scratch/listing_table_statistics/1.parquet';
 
 statement ok
-COPY (SELECT * FROM values ('c', 3), ('d', -1) t(str_col, int_col)) to 'test_files/scratch/table/2.parquet';
+COPY (SELECT * FROM values ('c', 3), ('d', -1) t(str_col, int_col)) to 'test_files/scratch/listing_table_statistics/2.parquet';
 
 statement ok
 set datafusion.execution.collect_statistics = true;
@@ -29,13 +29,13 @@ statement ok
 set datafusion.explain.show_statistics = true;
 
 statement ok
-create external table t stored as parquet location 'test_files/scratch/table';
+create external table t stored as parquet location 'test_files/scratch/listing_table_statistics';
 
 query TT
 explain format indent select * from t;
 ----
 logical_plan TableScan: t projection=[int_col, str_col]
-physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]]
+physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Absent, [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0) ScanBytes=Exact(32)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0) ScanBytes=Inexact(100))]]
 
 statement ok
 drop table t;
diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt
index 42a4ba6218016..7ea54464d3e99 100644
--- a/datafusion/sqllogictest/test_files/map.slt
+++ b/datafusion/sqllogictest/test_files/map.slt
@@ -43,8 +43,8 @@ LOCATION '../core/tests/data/parquet_map.parquet';
 query TTT
 describe data;
 ----
-ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
-strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
+ints Map("entries": non-null Struct("key": non-null Utf8, "value": non-null Int64), unsorted) NO
+strings Map("entries": non-null Struct("key": non-null Utf8, "value": non-null Utf8), unsorted) NO
 timestamp Utf8View NO
 
 query ??T
@@ -113,9 +113,8 @@ logical_plan
 01)Filter: table_with_map.int_field > Int64(0)
 02)--TableScan: table_with_map projection=[int_field, map_field]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: int_field@0 > 0
-03)----DataSourceExec: partitions=1, partition_sizes=[0]
+01)FilterExec: int_field@0 > 0
+02)--DataSourceExec: partitions=1, partition_sizes=[0]
 
 statement ok
 drop table table_with_map;
@@ -155,7 +154,7 @@ SELECT MAKE_MAP('POST', 41, 'HEAD', 53, 'PATCH', 30);
 ----
 {POST: 41, HEAD: 53, PATCH: 30}
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'ab' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'ab' to value of Int64 type
 SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30);
 
 # Map keys can not be NULL
@@ -175,6 +174,16 @@ SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, 30]);
 ----
 {POST: 41, HEAD: 33, PATCH: 30}
 
+query ?
+SELECT MAP('type', 'test');
+----
+{type: test}
+
+query ?
+SELECT MAP('a', 2, 'b', 3);
+----
+{a: 2, b: 3}
+
 query ?
 SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]);
 ----
@@ -188,10 +197,10 @@ SELECT MAP([[1,2], [3,4]], ['a', 'b']);
 query error
 SELECT MAP()
 
-query error DataFusion error: Execution error: map function requires 2 arguments, got 1
+query error DataFusion error: Error during planning: make_map requires an even number of arguments
 SELECT MAP(['POST', 'HEAD'])
 
-query error DataFusion error: Execution error: Expected list, large_list or fixed_size_list, got Null
+query error DataFusion error: Execution error: map key cannot be null
 SELECT MAP(null, [41, 33, 30]);
 
 query error DataFusion error: Execution error: map requires key and value lists to have the same length
@@ -233,7 +242,7 @@ SELECT map(column5, column6) FROM duplicate_keys_table;
 
 # key is a nested type
 query error DataFusion error: Execution error: map key must be unique, duplicate key found: \[1, 2\]
-SELECT MAP([[1,2], [1,2], [NULL]], [41, 33, null]);
+SELECT MAP([[1,2], [1,2]], [41, 33]);
 
 query error DataFusion error: Execution error: map key must be unique, duplicate key found: \[\{1:1\}\]
 SELECT MAP([Map {1:'1'}, Map {1:'1'}, Map {2:'2'}], [41, 33, null]);
@@ -281,8 +290,12 @@ SELECT map(column8, column9) FROM t;
 {[4]: b}
 {[1, 2]: c}
 
-query error
+query ?
 SELECT map(column6, column7) FROM t;
+----
+{[1, 2]: POST}
+{[3]: PUT}
+{[5]: NULL}
 
 query ?
 select Map {column6: column7} from t;
@@ -523,26 +536,46 @@ SELECT MAP { 'a': 1, 'b': 3 };
 ----
 {a: 1, b: 3}
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
 SELECT MAP { 'a': 1, 2: 3 };
 
-# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key
-# query ?
-# SELECT MAP { 1: 'a', 2: 'b', 3: 'c' }[1];
-# ----
-# a
+# accessing map with non-string key
+query T
+SELECT MAP { 1: 'a', 2: 'b', 3: 'c' }[1];
+----
+a
 
-# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key
-# query ?
-# SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {1:'a', 2:'b'}];
-# ----
-# 1
+# accessing map with string key
+query I
+SELECT MAP { 'a': 1, 'b': 2, 'c': 3 }['a'];
+----
+1
+
+# accessing map with non-string key in case expression
+query I
+SELECT (CASE WHEN 1 > 0 THEN MAP {'x': 100} ELSE MAP {'y': 200} END)['x']; 
+----
+100
 
-# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key
-# query ?
-# SELECT MAKE_MAP(1, null, 2, 33, 3, null)[2];
-# ----
-# 33
+# fix accessing map with nested key
+query I
+SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {1:'a', 2:'b'}];
+----
+1
+
+query I
+SELECT MAP { MAP {1:'a', 2:'b'}:1, MAP {1:'c', 2:'d'}:2 }[MAP {2:'b', 1:'a'}];
+----
+NULL
+
+# TODO(https://github.com/apache/datafusion/pull/18394): Test accessing map with empty map as key
+# TODO(https://github.com/apache/datafusion/pull/18394): Test accessing map with null map as key
+
+# accessing map with non-string key
+query I
+SELECT MAKE_MAP(1, null, 2, 33, 3, null)[2];
+----
+33
 
 ## cardinality
 
@@ -651,6 +684,57 @@ select map_extract(column1, 1), map_extract(column1, 5), map_extract(column1, 7)
 [NULL] [[4, NULL, 6]] [NULL]
 [NULL] [NULL] [[1, NULL, 3]]
 
+# Tests for map_entries
+
+query ?
+SELECT map_entries(MAP { 'a': 1, 'b': 3 });
+----
+[{key: a, value: 1}, {key: b, value: 3}]
+
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
+SELECT map_entries(MAP { 'a': 1, 2: 3 });
+
+query ?
+SELECT map_entries(MAP {'a':1, 'b':2, 'c':3 }) FROM t;
+----
+[{key: a, value: 1}, {key: b, value: 2}, {key: c, value: 3}]
+[{key: a, value: 1}, {key: b, value: 2}, {key: c, value: 3}]
+[{key: a, value: 1}, {key: b, value: 2}, {key: c, value: 3}]
+
+query ?
+SELECT map_entries(Map{column1: column2, column3: column4}) FROM t;
+----
+[{key: a, value: 1}, {key: k1, value: 10}]
+[{key: b, value: 2}, {key: k3, value: 30}]
+[{key: d, value: 4}, {key: k5, value: 50}]
+
+query ?
+SELECT map_entries(map(column5, column6)) FROM t;
+----
+[{key: k1, value: 1}, {key: k2, value: 2}]
+[{key: k3, value: 3}]
+[{key: k5, value: 5}]
+
+query ?
+SELECT map_entries(map(column8, column9)) FROM t;
+----
+[{key: [1, 2, 3], value: a}]
+[{key: [4], value: b}]
+[{key: [1, 2], value: c}]
+
+query ?
+SELECT map_entries(Map{});
+----
+[]
+
+query ?
+SELECT map_entries(column1) from map_array_table_1;
+----
+[{key: 1, value: [1, NULL, 3]}, {key: 2, value: [4, NULL, 6]}, {key: 3, value: [7, 8, 9]}]
+[{key: 4, value: [1, NULL, 3]}, {key: 5, value: [4, NULL, 6]}, {key: 6, value: [7, 8, 9]}]
+[{key: 7, value: [1, NULL, 3]}, {key: 8, value: [9, NULL, 6]}, {key: 9, value: [7, 8, 9]}]
+NULL
+
 # Tests for map_keys
 
 query ?
@@ -658,7 +742,7 @@ SELECT map_keys(MAP { 'a': 1, 'b': 3 });
 ----
 [a, b]
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
 SELECT map_keys(MAP { 'a': 1, 2: 3 });
 
 query ?
@@ -705,7 +789,7 @@ NULL
 
 # Tests for map_values
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
 SELECT map_values(MAP { 'a': 1, 2: 3 });
 
 query ?
@@ -782,5 +866,12 @@ select column3[true] from tt;
 ----
 3
 
+# https://github.com/apache/datafusion/issues/16187
+# should be NULL in case of out of bounds for Null Type
+query ?
+select map_values(map([named_struct('a', 1, 'b', null)], [named_struct('a', 1, 'b', null)]))[0] as a;
+----
+NULL
+
 statement ok
 drop table tt;
diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt
index e206aa16b8a99..d571fcd947134 100644
--- a/datafusion/sqllogictest/test_files/math.slt
+++ b/datafusion/sqllogictest/test_files/math.slt
@@ -111,12 +111,44 @@ SELECT isnan(1.0::FLOAT), isnan('NaN'::FLOAT), isnan(-'NaN'::FLOAT), isnan(NULL:
 ----
 false true true NULL
 
+# isnan: non-float numeric inputs are never NaN
+query BBBB
+SELECT isnan(1::INT), isnan(0::INT), isnan(NULL::INT), isnan(123::BIGINT)
+----
+false false NULL false
+
+query BBBB
+SELECT isnan(1::INT UNSIGNED), isnan(0::INT UNSIGNED), isnan(NULL::INT UNSIGNED), isnan(255::TINYINT UNSIGNED)
+----
+false false NULL false
+
+query BBBB
+SELECT isnan(1::DECIMAL(10,2)), isnan(0::DECIMAL(10,2)), isnan(NULL::DECIMAL(10,2)), isnan(-1::DECIMAL(10,2))
+----
+false false NULL false
+
 # iszero
 query BBBB
 SELECT iszero(1.0), iszero(0.0), iszero(-0.0), iszero(NULL)
 ----
 false true true NULL
 
+# iszero: integers / unsigned / decimals
+query BBBB
+SELECT iszero(1::INT), iszero(0::INT), iszero(NULL::INT), iszero(-1::INT)
+----
+false true NULL false
+
+query BBBB
+SELECT iszero(1::INT UNSIGNED), iszero(0::INT UNSIGNED), iszero(NULL::INT UNSIGNED), iszero(255::TINYINT UNSIGNED)
+----
+false true NULL false
+
+query BBBB
+SELECT iszero(1::DECIMAL(10,2)), iszero(0::DECIMAL(10,2)), iszero(NULL::DECIMAL(10,2)), iszero(-1::DECIMAL(10,2))
+----
+false true NULL false
+
 # abs: empty argument
 statement error
 SELECT abs();
@@ -126,29 +158,29 @@ statement error
 SELECT abs(1, 2);
 
 # abs: unsupported argument type
-query error DataFusion error: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String
+query error DataFusion error: Error during planning: Function 'abs' expects Numeric but received String
 SELECT abs('foo');
 
 # abs: numeric string
 # TODO: In Postgres, '-1.2' is unknown type and interpreted to float8 so they don't fail on this query
-query error DataFusion error: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String
+query error DataFusion error: Error during planning: Function 'abs' expects Numeric but received String
 select abs('-1.2');
 
-query error DataFusion error: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String
+query error DataFusion error: Error during planning: Function 'abs' expects Numeric but received String
 select abs(arrow_cast('-1.2', 'Utf8'));
 
 statement ok
 CREATE TABLE test_nullable_integer(
-    c1 TINYINT, 
-    c2 SMALLINT, 
-    c3 INT, 
-    c4 BIGINT, 
-    c5 TINYINT UNSIGNED, 
-    c6 SMALLINT UNSIGNED, 
-    c7 INT UNSIGNED, 
-    c8 BIGINT UNSIGNED, 
+    c1 TINYINT,
+    c2 SMALLINT,
+    c3 INT,
+    c4 BIGINT,
+    c5 TINYINT UNSIGNED,
+    c6 SMALLINT UNSIGNED,
+    c7 INT UNSIGNED,
+    c8 BIGINT UNSIGNED,
     dataset TEXT
-    ) 
+    )
     AS VALUES
     (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'nulls'),
     (0, 0, 0, 0, 0, 0, 0, 0, 'zeros'),
@@ -237,7 +269,7 @@ SELECT c8%0 FROM test_nullable_integer
 
 # abs: return type
 query TTTTTTTT rowsort
-select 
+select
    arrow_typeof(abs(c1)), arrow_typeof(abs(c2)), arrow_typeof(abs(c3)), arrow_typeof(abs(c4)),
    arrow_typeof(abs(c5)), arrow_typeof(abs(c6)), arrow_typeof(abs(c7)), arrow_typeof(abs(c8))
 from test_nullable_integer limit 1
@@ -285,13 +317,13 @@ drop table test_nullable_integer
 
 statement ok
 CREATE TABLE test_non_nullable_integer(
-    c1 TINYINT NOT NULL, 
-    c2 SMALLINT NOT NULL, 
-    c3 INT NOT NULL, 
-    c4 BIGINT NOT NULL, 
-    c5 TINYINT UNSIGNED NOT NULL, 
-    c6 SMALLINT UNSIGNED NOT NULL, 
-    c7 INT UNSIGNED NOT NULL, 
+    c1 TINYINT NOT NULL,
+    c2 SMALLINT NOT NULL,
+    c3 INT NOT NULL,
+    c4 BIGINT NOT NULL,
+    c5 TINYINT UNSIGNED NOT NULL,
+    c6 SMALLINT UNSIGNED NOT NULL,
+    c7 INT UNSIGNED NOT NULL,
     c8 BIGINT UNSIGNED NOT NULL
     );
 
@@ -363,7 +395,7 @@ CREATE TABLE test_nullable_float(
     c2 double
     ) AS VALUES
     (-1.0, -1.0),
-    (1.0, 1.0), 
+    (1.0, 1.0),
     (NULL, NULL),
     (0., 0.),
     ('NaN'::double, 'NaN'::double);
@@ -412,7 +444,7 @@ Float32 Float64
 
 # abs: floats
 query RR rowsort
-SELECT abs(c1), abs(c2) from test_nullable_float 
+SELECT abs(c1), abs(c2) from test_nullable_float
 ----
 0 0
 1 1
@@ -420,6 +452,17 @@ SELECT abs(c1), abs(c2) from test_nullable_float
 NULL NULL
 NaN NaN
 
+# f16
+query TR rowsort
+SELECT arrow_typeof(abs(arrow_cast(c1, 'Float16'))), abs(arrow_cast(c1, 'Float16'))
+FROM test_nullable_float
+----
+Float16 0
+Float16 1
+Float16 1
+Float16 NULL
+Float16 NaN
+
 statement ok
 drop table test_nullable_float
 
@@ -428,7 +471,7 @@ statement ok
 CREATE TABLE test_non_nullable_float(
     c1 float NOT NULL,
     c2 double NOT NULL
-    ); 
+    );
 
 query I
 INSERT INTO test_non_nullable_float VALUES
@@ -478,27 +521,27 @@ drop table test_non_nullable_float
 statement ok
 CREATE TABLE test_nullable_decimal(
     c1 DECIMAL(10, 2),    /* Decimal128 */
-    c2 DECIMAL(38, 10),   /* Decimal128 with max precision */ 
+    c2 DECIMAL(38, 10),   /* Decimal128 with max precision */
     c3 DECIMAL(40, 2),    /* Decimal256 */
-    c4 DECIMAL(76, 10)    /* Decimal256 with max precision */ 
- ) AS VALUES 
-    (0, 0, 0, 0), 
+    c4 DECIMAL(76, 10)    /* Decimal256 with max precision */
+ ) AS VALUES
+    (0, 0, 0, 0),
     (NULL, NULL, NULL, NULL);
 
 query I
 INSERT into test_nullable_decimal values
     (
-        -99999999.99, 
-        '-9999999999999999999999999999.9999999999', 
-        '-99999999999999999999999999999999999999.99', 
+        -99999999.99,
+        '-9999999999999999999999999999.9999999999',
+        '-99999999999999999999999999999999999999.99',
         '-999999999999999999999999999999999999999999999999999999999999999999.9999999999'
-    ), 
+    ),
     (
-        99999999.99, 
-        '9999999999999999999999999999.9999999999', 
-        '99999999999999999999999999999999999999.99', 
+        99999999.99,
+        '9999999999999999999999999999.9999999999',
+        '99999999999999999999999999999999999999.99',
         '999999999999999999999999999999999999999999999999999999999999999999.9999999999'
-    ) 
+    )
 ----
 2
 
@@ -533,9 +576,9 @@ SELECT c1%0 FROM test_nullable_decimal WHERE c1 IS NOT NULL;
 
 # abs: return type
 query TTTT
-SELECT 
-    arrow_typeof(abs(c1)), 
-    arrow_typeof(abs(c2)), 
+SELECT
+    arrow_typeof(abs(c1)),
+    arrow_typeof(abs(c2)),
     arrow_typeof(abs(c3)),
     arrow_typeof(abs(c4))
 FROM test_nullable_decimal limit 1
@@ -552,11 +595,11 @@ SELECT abs(c1), abs(c2), abs(c3), abs(c4) FROM test_nullable_decimal
 NULL NULL NULL NULL
 
 statement ok
-drop table test_nullable_decimal  
+drop table test_nullable_decimal
 
 
 statement ok
-CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL); 
+CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL);
 
 query I
 INSERT INTO test_non_nullable_decimal VALUES(1)
@@ -569,13 +612,13 @@ SELECT c1*0 FROM test_non_nullable_decimal
 0
 
 query error DataFusion error: Arrow error: Divide by zero error
-SELECT c1/0 FROM test_non_nullable_decimal 
+SELECT c1/0 FROM test_non_nullable_decimal
 
 query error DataFusion error: Arrow error: Divide by zero error
-SELECT c1%0 FROM test_non_nullable_decimal 
+SELECT c1%0 FROM test_non_nullable_decimal
 
 statement ok
-drop table test_non_nullable_decimal 
+drop table test_non_nullable_decimal
 
 statement ok
 CREATE TABLE signed_integers(
@@ -615,7 +658,7 @@ NULL NULL NULL
 
 # scalar maxes and/or negative 1
 query III
-select 
+select
   gcd(9223372036854775807, -9223372036854775808), -- i64::MAX, i64::MIN
   gcd(9223372036854775807, -1), -- i64::MAX, -1
   gcd(-9223372036854775808, -1); -- i64::MIN, -1
@@ -685,12 +728,132 @@ query error DataFusion error: Arrow error: Compute error: Signed integer overflo
 select lcm(2, 9223372036854775803);
 
 
-query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 2107754225 \^ 1221660777
+## pow/power
+
+# pow() with integer base and negative float exponent (verifies type coercion)
+query R
+SELECT pow(2, -0.5)
+----
+0.707106781187
+
+# pow() with negative integer base and negative float exponent (returns NaN)
+query R
+SELECT pow(-2, -0.5)
+----
+NaN
+
+# pow() with zero base and negative exponent (returns Infinity)
+query R
+SELECT pow(0, -0.5)
+----
+Infinity
+
+# pow() with integer base of 1 and negative exponent
+query R
+SELECT pow(1, -0.5)
+----
+1
+
+# pow() with large integer base and small negative exponent
+query R
+SELECT pow(1000, -0.1)
+----
+0.501187233627
+
+# pow() with integer base and negative integer exponent returns float (like PostgreSQL)
+query R
+SELECT pow(2, -2)
+----
+0.25
+
+# power() with very large exponent returns infinity (Float64 behavior)
+query R
 select power(2107754225, 1221660777);
+----
+Infinity
+
+query R rowsort
+select power(base::double, exponent::double)
+from values
+  (2.0, 2.0),
+  (5.0, 4.0),
+  (2.0, 3.0),
+  (3.0, 4.0) as t(base, exponent);
+----
+4
+625
+8
+81
+
+query R rowsort
+select power(base::bigint, exponent::bigint)
+from values
+  (2, 2),
+  (5, 4),
+  (2, 3),
+  (3, 4),
+  (2, NULL) as t(base, exponent);
+----
+4
+625
+8
+81
+NULL
+
+query RT rowsort
+select
+  power(base::decimal(38, 0), exponent::decimal(38, 0)),
+  arrow_typeof(power(base::decimal(38, 0), exponent::decimal(38, 0)))
+from values
+  (0, 4),
+  (5, 0),
+  (2, 2),
+  (5, 4),
+  (2, 3),
+  (3, 4) as t(base, exponent);
+----
+0 Decimal128(38, 0)
+1 Decimal128(38, 0)
+4 Decimal128(38, 0)
+625 Decimal128(38, 0)
+8 Decimal128(38, 0)
+81 Decimal128(38, 0)
+
+query RT
+select
+  pow(2.5::decimal(2, 1), 4::bigint),
+  arrow_typeof(pow(2.5::decimal(2, 1), 4::bigint));
+----
+39 Decimal128(2, 1)
 
 # factorial overflow
-query error DataFusion error: Arrow error: Compute error: Overflow happened on FACTORIAL\(350943270\)
+query error DataFusion error: Execution error: Overflow happened on FACTORIAL\(350943270\)
 select FACTORIAL(350943270);
 
 statement ok
 drop table signed_integers
+
+# Null propagation for log
+query TT
+EXPLAIN SELECT log(NULL, c2) from aggregate_simple;
+----
+logical_plan
+01)Projection: Float64(NULL) AS log(NULL,aggregate_simple.c2)
+02)--TableScan: aggregate_simple projection=[]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_simple.csv]]}, projection=[NULL as log(NULL,aggregate_simple.c2)], file_type=csv, has_header=true
+
+# Float 16/32/64 for log
+query RT
+SELECT log(2.5, arrow_cast(10.9, 'Float16')), arrow_typeof(log(2.5, arrow_cast(10.9, 'Float16')));
+----
+2.6074219 Float16
+
+query RT
+SELECT log(2.5, 10.9::float), arrow_typeof(log(2.5, 10.9::float));
+----
+2.606992 Float32
+
+query RT
+SELECT log(2.5, 10.9::double), arrow_typeof(log(2.5, 10.9::double));
+----
+2.606992198152 Float64
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
index 7252c84caf141..6ed461debb3b4 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -24,6 +24,22 @@
 ## in the test harness as there is no way to define schema
 ## with metadata in SQL.
 
+query ITTPT
+select * from table_with_metadata;
+----
+1 NULL NULL 2020-09-08T13:42:29.190855123 no_foo
+NULL bar l_bar 2020-09-08T13:42:29.190855123 no_bar
+3 baz l_baz 2020-09-08T13:42:29.190855123 no_baz
+
+query TTT
+describe table_with_metadata;
+----
+id Int32 YES
+name Utf8 YES
+l_name Utf8 YES
+ts Timestamp(ns) NO
+nonnull_name Utf8 NO
+
 query IT
 select id, name from table_with_metadata;
 ----
@@ -165,7 +181,7 @@ GROUP BY ts
 ORDER BY ts
 LIMIT 1;
 ----
-2020-09-08T13:42:29.190855123Z
+2020-09-08T13:42:29.190855123
 
 
 
@@ -235,7 +251,84 @@ order by 1 asc nulls last;
 3 1
 NULL 1
 
+# Reproducer for https://github.com/apache/datafusion/issues/18337
+# this query should not get an internal error
+query TI
+SELECT
+  'foo' AS name,
+  COUNT(
+    CASE
+      WHEN prev_value = 'no_bar' AND value = 'no_baz' THEN 1
+      ELSE NULL
+      END
+     ) AS count_rises
+FROM
+  (
+    SELECT
+      nonnull_name as value,
+      LAG(nonnull_name) OVER (ORDER BY ts) AS prev_value
+    FROM
+      table_with_metadata
+);
+----
+foo 1
 
+# Regression test: first_value should preserve metadata
+query IT
+select first_value(id order by id asc nulls last), arrow_metadata(first_value(id order by id asc nulls last), 'metadata_key')
+from table_with_metadata;
+----
+1 the id field
+
+# Regression test: last_value should preserve metadata
+query IT
+select last_value(id order by id asc nulls first), arrow_metadata(last_value(id order by id asc nulls first), 'metadata_key')
+from table_with_metadata;
+----
+3 the id field
+
+# Regression test: DISTINCT ON should preserve metadata (uses first_value internally)
+query ITTT
+select distinct on (id) id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
+from table_with_metadata order by id asc nulls last;
+----
+1 the id field NULL the name field
+3 the id field baz the name field
+NULL the id field bar the name field
+
+# Regression test: DISTINCT should preserve metadata
+query ITTT
+with res AS (
+  select distinct id, name from table_with_metadata
+)
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
+from res
+order by id asc nulls last;
+----
+1 the id field NULL the name field
+3 the id field baz the name field
+NULL the id field bar the name field
+
+# Regression test: grouped columns should preserve metadata
+query ITTT
+with res AS (
+  select name, count(*), id
+  from table_with_metadata
+  group by id, name
+)
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
+from res
+order by id asc nulls last, name asc nulls last
+----
+1 the id field NULL the name field
+3 the id field baz the name field
+NULL the id field bar the name field
+
+# Test arrow_metadata with single argument (returns Map)
+query ?
+select arrow_metadata(id) from table_with_metadata limit 1;
+----
+{metadata_key: the id field}
 
 statement ok
 drop table table_with_metadata;
diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt
index e8700b1fea275..7feefc169fcab 100644
--- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt
+++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt
@@ -46,7 +46,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [a_big@0 ASC NULLS LAST, b@1 ASC NULLS LAST]
 02)--ProjectionExec: expr=[CAST(a@0 AS Int64) as a_big, b@1 as b]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
@@ -62,7 +62,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[a@0 as a, CAST(a@0 AS Int64) as a_big, b@1 as b]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # Cast to larger types as well as preserving ordering
@@ -83,7 +83,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [a_big@1 ASC NULLS LAST, b@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[a@0 as a, CAST(a@0 AS Int64) as a_big, b@1 as b]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # test for common rename
@@ -97,9 +97,7 @@ logical_plan
 01)Sort: a_big ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST
 02)--Projection: multiple_ordered_table.a, multiple_ordered_table.a AS a_big, multiple_ordered_table.b
 03)----TableScan: multiple_ordered_table projection=[a, b]
-physical_plan
-01)ProjectionExec: expr=[a@0 as a, a@0 as a_big, b@1 as b]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, a@1 as a_big, b], output_ordering=[a@0 ASC NULLS LAST, b@2 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 EXPLAIN
@@ -111,9 +109,7 @@ logical_plan
 01)Sort: multiple_ordered_table.a ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST
 02)--Projection: multiple_ordered_table.a, multiple_ordered_table.a AS a_big, multiple_ordered_table.b
 03)----TableScan: multiple_ordered_table projection=[a, b]
-physical_plan
-01)ProjectionExec: expr=[a@0 as a, a@0 as a_big, b@1 as b]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, a@1 as a_big, b], output_ordering=[a@0 ASC NULLS LAST, b@2 ASC NULLS LAST], file_type=csv, has_header=true
 
 
 # test for cast Utf8
@@ -129,13 +125,13 @@ ORDER BY a_str ASC, b ASC;
 ----
 logical_plan
 01)Sort: a_str ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST
-02)--Projection: CAST(multiple_ordered_table.a AS Utf8) AS a_str, multiple_ordered_table.b
+02)--Projection: CAST(multiple_ordered_table.a AS Utf8View) AS a_str, multiple_ordered_table.b
 03)----TableScan: multiple_ordered_table projection=[a, b]
 physical_plan
 01)SortPreservingMergeExec: [a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST]
 02)--SortExec: expr=[a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true]
-03)----ProjectionExec: expr=[CAST(a@0 AS Utf8) as a_str, b@1 as b]
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----ProjectionExec: expr=[CAST(a@0 AS Utf8View) as a_str, b@1 as b]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # We cannot determine a+b is ordered from the
@@ -170,5 +166,5 @@ physical_plan
 01)SortPreservingMergeExec: [sum_expr@0 ASC NULLS LAST]
 02)--SortExec: expr=[sum_expr@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[CAST(a@0 + b@1 AS Int64) as sum_expr, a@0 as a, b@1 as b]
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
diff --git a/datafusion/sqllogictest/test_files/named_arguments.slt b/datafusion/sqllogictest/test_files/named_arguments.slt
new file mode 100644
index 0000000000000..07b6cc6a79a0c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/named_arguments.slt
@@ -0,0 +1,272 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#############
+## Tests for Named Arguments (PostgreSQL-style param => value syntax)
+#############
+
+# Test positional arguments still work (baseline)
+query T
+SELECT substr('hello world', 7, 5);
+----
+world
+
+# Test named arguments in order
+query T
+SELECT substr(str => 'hello world', start_pos => 7, length => 5);
+----
+world
+
+# Test named arguments out of order
+query T
+SELECT substr(length => 5, str => 'hello world', start_pos => 7);
+----
+world
+
+# Test mixed positional and named arguments
+query T
+SELECT substr('hello world', start_pos => 7, length => 5);
+----
+world
+
+# Test with only 2 parameters (length optional)
+query T
+SELECT substr(str => 'hello world', start_pos => 7);
+----
+world
+
+# Test all parameters named with substring alias
+query T
+SELECT substring(str => 'hello', start_pos => 1, length => 3);
+----
+hel
+
+# Error: positional argument after named argument
+query error DataFusion error: Error during planning: Positional argument.*follows named argument
+SELECT substr(str => 'hello', 1, 3);
+
+# Error: unknown parameter name
+query error DataFusion error: Error during planning: Unknown parameter name 'invalid'
+SELECT substr(invalid => 'hello', start_pos => 1, length => 3);
+
+# Error: duplicate parameter name
+query error DataFusion error: Error during planning: Parameter 'str' specified multiple times
+SELECT substr(str => 'hello', str => 'world', start_pos => 1);
+
+# Test case-insensitive parameter names (unquoted identifiers)
+query T
+SELECT substr(STR => 'hello world', START_POS => 7, LENGTH => 5);
+----
+world
+
+# Test case-insensitive with mixed case
+query T
+SELECT substr(Str => 'hello world', Start_Pos => 7);
+----
+world
+
+# Error: quoted identifiers are case-sensitive per SQL standards
+# "STR" does not match parameter "str" (wrong case)
+query error DataFusion error: Error during planning: Unknown parameter name 'STR'
+SELECT substr("STR" => 'hello world', "start_pos" => 7);
+
+# Error: wrong number of arguments
+# This query provides only 1 argument but substr requires 2 or 3
+query error Function 'substr' failed to match any signature
+SELECT substr(str => 'hello world');
+
+#############
+## PostgreSQL Dialect Tests (uses ExprNamed variant)
+#############
+
+statement ok
+set datafusion.sql_parser.dialect = 'PostgreSQL';
+
+# Test named arguments in order
+query T
+SELECT substr(str => 'hello world', start_pos => 7, length => 5);
+----
+world
+
+# Test named arguments out of order
+query T
+SELECT substr(length => 5, str => 'hello world', start_pos => 7);
+----
+world
+
+# Test mixed positional and named arguments
+query T
+SELECT substr('hello world', start_pos => 7, length => 5);
+----
+world
+
+# Test with only 2 parameters (length optional)
+query T
+SELECT substr(str => 'hello world', start_pos => 7);
+----
+world
+
+# Reset to default dialect
+statement ok
+set datafusion.sql_parser.dialect = 'Generic';
+
+#############
+## MsSQL Dialect Tests (does NOT support => operator)
+#############
+
+statement ok
+set datafusion.sql_parser.dialect = 'MsSQL';
+
+# Error: MsSQL dialect does not support => operator
+query error DataFusion error: SQL error: ParserError\("Expected: \), found: => at Line: 1, Column: 19"\)
+SELECT substr(str => 'hello world', start_pos => 7, length => 5);
+
+# Reset to default dialect
+statement ok
+set datafusion.sql_parser.dialect = 'Generic';
+
+#############
+## Aggregate UDF Tests - using corr(y, x) function
+#############
+
+# Setup test data
+statement ok
+CREATE TABLE correlation_test(col1 DOUBLE, col2 DOUBLE) AS VALUES
+  (1.0, 2.0),
+  (2.0, 4.0),
+  (3.0, 6.0),
+  (4.0, 8.0);
+
+# Test positional arguments (baseline)
+query R
+SELECT corr(col1, col2) FROM correlation_test;
+----
+1
+
+# Test named arguments out of order (proves named args work for aggregates)
+query R
+SELECT corr(x => col2, y => col1) FROM correlation_test;
+----
+1
+
+# Error: function doesn't support named arguments (count has no parameter names)
+query error DataFusion error: Error during planning: Aggregate function 'count' does not support named arguments
+SELECT count(value => col1) FROM correlation_test;
+
+# Cleanup
+statement ok
+DROP TABLE correlation_test;
+
+#############
+## Aggregate UDF with WITHIN GROUP Tests - using percentile_cont(expression, percentile)
+## This tests the special handling where WITHIN GROUP ORDER BY expressions are prepended to args
+#############
+
+# Setup test data
+statement ok
+CREATE TABLE percentile_test(salary DOUBLE) AS VALUES
+  (50000.0),
+  (60000.0),
+  (70000.0),
+  (80000.0),
+  (90000.0);
+
+# Test positional arguments (baseline) - standard call without WITHIN GROUP
+query R
+SELECT percentile_cont(salary, 0.5) FROM percentile_test;
+----
+70000
+
+# Test WITHIN GROUP with positional argument
+query R
+SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) FROM percentile_test;
+----
+70000
+
+# Test WITHIN GROUP with named argument for percentile
+# The ORDER BY expression (salary) is prepended internally, becoming: percentile_cont(salary, 0.5)
+# We use named argument for percentile, which should work correctly
+query R
+SELECT percentile_cont(percentile => 0.5) WITHIN GROUP (ORDER BY salary) FROM percentile_test;
+----
+70000
+
+# Verify the WITHIN GROUP prepending logic with different percentile value
+query R
+SELECT percentile_cont(percentile => 0.25) WITHIN GROUP (ORDER BY salary) FROM percentile_test;
+----
+60000
+
+# Cleanup
+statement ok
+DROP TABLE percentile_test;
+
+#############
+## Window UDF Tests - using lead(expression, offset, default) function
+#############
+
+# Setup test data
+statement ok
+CREATE TABLE window_test(id INT, value INT) AS VALUES
+  (1, 10),
+  (2, 20),
+  (3, 30),
+  (4, 40);
+
+# Test positional arguments (baseline)
+query II
+SELECT id, lead(value, 1, 0) OVER (ORDER BY id) FROM window_test ORDER BY id;
+----
+1 20
+2 30
+3 40
+4 0
+
+# Test named arguments out of order (proves named args work for window functions)
+query II
+SELECT id, lead(default => 0, offset => 1, expr => value) OVER (ORDER BY id) FROM window_test ORDER BY id;
+----
+1 20
+2 30
+3 40
+4 0
+
+# Test with 1 argument (offset and default use defaults)
+query II
+SELECT id, lead(expr => value) OVER (ORDER BY id) FROM window_test ORDER BY id;
+----
+1 20
+2 30
+3 40
+4 NULL
+
+# Test with 2 arguments (default uses default)
+query II
+SELECT id, lead(expr => value, offset => 2) OVER (ORDER BY id) FROM window_test ORDER BY id;
+----
+1 30
+2 40
+3 NULL
+4 NULL
+
+# Error: function doesn't support named arguments (row_number has no parameter names)
+query error DataFusion error: Error during planning: Window function 'row_number' does not support named arguments
+SELECT row_number(value => 1) OVER (ORDER BY id) FROM window_test;
+
+# Cleanup
+statement ok
+DROP TABLE window_test;
diff --git a/datafusion/sqllogictest/test_files/null_aware_anti_join.slt b/datafusion/sqllogictest/test_files/null_aware_anti_join.slt
new file mode 100644
index 0000000000000..5907a85a9b923
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/null_aware_anti_join.slt
@@ -0,0 +1,453 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#############
+## Null-Aware Anti Join Tests
+## Tests for automatic null-aware semantics in NOT IN subqueries
+#############
+
+statement ok
+CREATE TABLE outer_table(id INT, value TEXT) AS VALUES
+(1, 'a'),
+(2, 'b'),
+(3, 'c'),
+(4, 'd'),
+(NULL, 'e');
+
+statement ok
+CREATE TABLE inner_table_no_null(id INT, value TEXT) AS VALUES
+(2, 'x'),
+(4, 'y');
+
+statement ok
+CREATE TABLE inner_table_with_null(id INT, value TEXT) AS VALUES
+(2, 'x'),
+(NULL, 'y');
+
+#############
+## Test 1: NOT IN with no NULLs - should behave like regular anti join
+#############
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_no_null);
+----
+1 a
+3 c
+
+# Verify the plan uses LeftAnti join
+query TT
+EXPLAIN SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_no_null);
+----
+logical_plan
+01)LeftAnti Join: outer_table.id = __correlated_sq_1.id
+02)--TableScan: outer_table projection=[id, value]
+03)--SubqueryAlias: __correlated_sq_1
+04)----TableScan: inner_table_no_null projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+#############
+## Test 2: NOT IN with NULL in subquery - should return 0 rows (null-aware semantics)
+#############
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_with_null);
+----
+
+# Verify the result is empty even though there are rows in outer_table
+# that don't match the non-NULL value (2) in the subquery.
+# This is correct null-aware behavior: if subquery contains NULL, result is unknown.
+
+#############
+## Test 3: NOT IN with NULL in outer table but not in subquery
+## NULL rows from outer should not appear in output
+#############
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_no_null) AND id IS NOT NULL;
+----
+1 a
+3 c
+
+#############
+## Test 4: Test with all NULL subquery
+#############
+
+statement ok
+CREATE TABLE all_null_table(id INT) AS VALUES (NULL), (NULL);
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM all_null_table);
+----
+
+#############
+## Test 5: Test with empty subquery - should return all rows
+#############
+
+statement ok
+CREATE TABLE empty_table(id INT, value TEXT);
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM empty_table);
+----
+1 a
+2 b
+3 c
+4 d
+NULL e
+
+#############
+## Test 6: NOT IN with complex expression
+#############
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id + 1 NOT IN (SELECT id FROM inner_table_no_null);
+----
+2 b
+4 d
+
+#############
+## Test 7: NOT IN with complex expression and NULL in subquery
+#############
+
+query IT rowsort
+SELECT * FROM outer_table WHERE id + 1 NOT IN (SELECT id FROM inner_table_with_null);
+----
+
+#############
+## Test 8: Multiple NOT IN conditions (AND)
+#############
+
+statement ok
+CREATE TABLE inner_table2(id INT) AS VALUES (1), (3);
+
+query IT rowsort
+SELECT * FROM outer_table
+WHERE id NOT IN (SELECT id FROM inner_table_no_null)
+  AND id NOT IN (SELECT id FROM inner_table2);
+----
+
+#############
+## Test 9: Multiple NOT IN conditions (OR)
+#############
+
+# KNOWN LIMITATION: Mark joins used for OR conditions don't support null-aware semantics.
+# The NULL row is incorrectly returned here. According to SQL semantics:
+# - NULL NOT IN (2, 4) = UNKNOWN
+# - NULL NOT IN (1, 3) = UNKNOWN
+# - UNKNOWN OR UNKNOWN = UNKNOWN (should be filtered out)
+# But mark joins treat NULL keys as non-matching (FALSE), so:
+# - NULL mark column = FALSE
+# - NOT FALSE OR NOT FALSE = TRUE OR TRUE = TRUE (incorrectly included)
+# TODO: Implement null-aware support for mark joins to fix this
+
+query IT rowsort
+SELECT * FROM outer_table
+WHERE id NOT IN (SELECT id FROM inner_table_no_null)
+   OR id NOT IN (SELECT id FROM inner_table2);
+----
+1 a
+2 b
+3 c
+4 d
+NULL e
+
+#############
+## Test 10: NOT IN with WHERE clause in subquery
+#############
+
+query IT rowsort
+SELECT * FROM outer_table
+WHERE id NOT IN (SELECT id FROM inner_table_with_null WHERE value = 'x');
+----
+1 a
+3 c
+4 d
+
+# Note: The NULL row from inner_table_with_null is filtered out by WHERE clause,
+# so this behaves like regular anti join (not null-aware)
+
+#############
+## Test 11: Verify NULL-aware flag is set for LeftAnti joins
+#############
+
+# Check that the physical plan shows null-aware anti join
+# Note: The exact format may vary, but we should see LeftAnti join type
+query TT
+EXPLAIN SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_with_null);
+----
+logical_plan
+01)LeftAnti Join: outer_table.id = __correlated_sq_1.id
+02)--TableScan: outer_table projection=[id, value]
+03)--SubqueryAlias: __correlated_sq_1
+04)----TableScan: inner_table_with_null projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, id@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+#############
+## Test 12: Correlated NOT IN subquery with NULL
+#############
+
+statement ok
+CREATE TABLE orders(order_id INT, customer_id INT) AS VALUES
+(1, 100),
+(2, 200),
+(3, 300);
+
+statement ok
+CREATE TABLE payments(payment_id INT, order_id INT) AS VALUES
+(1, 1),
+(2, NULL);
+
+# Find orders that don't have payments
+# Should return empty because there's a NULL in payments.order_id
+query I rowsort
+SELECT order_id FROM orders
+WHERE order_id NOT IN (SELECT order_id FROM payments);
+----
+
+#############
+## Test 13: NOT IN with DISTINCT in subquery
+#############
+
+statement ok
+CREATE TABLE duplicates_with_null(id INT) AS VALUES
+(2),
+(2),
+(NULL),
+(NULL);
+
+query IT rowsort
+SELECT * FROM outer_table
+WHERE id NOT IN (SELECT DISTINCT id FROM duplicates_with_null);
+----
+
+#############
+## Test 14: NOT EXISTS vs NOT IN - Demonstrating the difference
+#############
+
+# NOT EXISTS should NOT use null-aware semantics
+# It uses two-valued logic (TRUE/FALSE), not three-valued logic (TRUE/FALSE/UNKNOWN)
+
+# Setup tables for comparison
+statement ok
+CREATE TABLE customers(id INT, name TEXT) AS VALUES
+(1, 'Alice'),
+(2, 'Bob'),
+(3, 'Charlie'),
+(NULL, 'Dave');
+
+statement ok
+CREATE TABLE banned(id INT) AS VALUES
+(2),
+(NULL);
+
+# Test 14a: NOT IN with NULL in subquery - Returns EMPTY (null-aware)
+query IT rowsort
+SELECT * FROM customers WHERE id NOT IN (SELECT id FROM banned);
+----
+
+# Test 14b: NOT EXISTS with NULL in subquery - Returns rows (NOT null-aware)
+# This should return (1, 'Alice'), (3, 'Charlie'), (NULL, 'Dave')
+# Because NOT EXISTS uses two-valued logic: NULL = NULL is FALSE, so no match found
+query IT rowsort
+SELECT * FROM customers c
+WHERE NOT EXISTS (SELECT 1 FROM banned b WHERE c.id = b.id);
+----
+1 Alice
+3 Charlie
+NULL Dave
+
+# Test 14c: Verify with EXPLAIN that NOT EXISTS doesn't use null-aware
+query TT
+EXPLAIN SELECT * FROM customers c
+WHERE NOT EXISTS (SELECT 1 FROM banned b WHERE c.id = b.id);
+----
+logical_plan
+01)LeftAnti Join: c.id = __correlated_sq_1.id
+02)--SubqueryAlias: c
+03)----TableScan: customers projection=[id, name]
+04)--SubqueryAlias: __correlated_sq_1
+05)----SubqueryAlias: b
+06)------TableScan: banned projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(id@0, id@0)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+03)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+#############
+## Test 15: NOT EXISTS - No NULLs
+#############
+
+statement ok
+CREATE TABLE active_customers(id INT) AS VALUES (1), (3);
+
+# Should return only Bob (id=2) and Dave (id=NULL)
+query IT rowsort
+SELECT * FROM customers c
+WHERE NOT EXISTS (SELECT 1 FROM active_customers a WHERE c.id = a.id);
+----
+2 Bob
+NULL Dave
+
+#############
+## Test 16: NOT EXISTS - Correlated subquery
+#############
+
+statement ok
+CREATE TABLE orders_test(order_id INT, customer_id INT) AS VALUES
+(1, 100),
+(2, 200),
+(3, NULL);
+
+statement ok
+CREATE TABLE customers_test(customer_id INT, name TEXT) AS VALUES
+(100, 'Alice'),
+(200, 'Bob'),
+(300, 'Charlie'),
+(NULL, 'Unknown');
+
+# Find customers with no orders
+# Should return Charlie (300) and Unknown (NULL)
+query IT rowsort
+SELECT * FROM customers_test c
+WHERE NOT EXISTS (
+    SELECT 1 FROM orders_test o WHERE o.customer_id = c.customer_id
+);
+----
+300 Charlie
+NULL Unknown
+
+#############
+## Test 17: NOT EXISTS with all NULL subquery
+#############
+
+statement ok
+CREATE TABLE all_null_banned(id INT) AS VALUES (NULL), (NULL);
+
+# NOT EXISTS should return all rows because NULL = NULL is FALSE (no matches)
+query IT rowsort
+SELECT * FROM customers c
+WHERE NOT EXISTS (SELECT 1 FROM all_null_banned b WHERE c.id = b.id);
+----
+1 Alice
+2 Bob
+3 Charlie
+NULL Dave
+
+# Compare with NOT IN which returns empty
+query IT rowsort
+SELECT * FROM customers WHERE id NOT IN (SELECT id FROM all_null_banned);
+----
+
+#############
+## Test 18: Nested NOT EXISTS and NOT IN
+#############
+
+# NOT EXISTS outside, NOT IN inside - should work correctly
+query IT rowsort
+SELECT * FROM customers c
+WHERE NOT EXISTS (
+    SELECT 1 FROM banned b
+    WHERE c.id = b.id
+    AND b.id NOT IN (SELECT id FROM active_customers)
+);
+----
+1 Alice
+3 Charlie
+NULL Dave
+
+#############
+## Test from GitHub issue #10583
+## Tests NOT IN with NULL in subquery result - should return empty result
+#############
+
+statement ok
+CREATE TABLE test_table(c1 INT, c2 INT) AS VALUES
+(1, 1),
+(2, 2),
+(3, 3),
+(4, NULL),
+(NULL, 0);
+
+# When subquery contains NULL, NOT IN should return empty result
+# because NULL NOT IN (values including NULL) is UNKNOWN for all rows
+query II rowsort
+SELECT * FROM test_table WHERE (c1 NOT IN (SELECT c2 FROM test_table)) = true;
+----
+
+# NOTE: The correlated subquery version from issue #10583:
+# SELECT * FROM test_table t1 WHERE c1 NOT IN (SELECT c2 FROM test_table t2 WHERE t1.c1 = t2.c1)
+# is not yet supported because it creates a multi-column join (correlation + NOT IN condition).
+# This is a known limitation - currently only supports single column null-aware anti joins.
+# This will be addressed in next Phase (multi-column support).
+
+#############
+## Cleanup
+#############
+
+statement ok
+DROP TABLE test_table;
+
+statement ok
+DROP TABLE outer_table;
+
+statement ok
+DROP TABLE inner_table_no_null;
+
+statement ok
+DROP TABLE inner_table_with_null;
+
+statement ok
+DROP TABLE all_null_table;
+
+statement ok
+DROP TABLE empty_table;
+
+statement ok
+DROP TABLE inner_table2;
+
+statement ok
+DROP TABLE orders;
+
+statement ok
+DROP TABLE payments;
+
+statement ok
+DROP TABLE duplicates_with_null;
+
+statement ok
+DROP TABLE customers;
+
+statement ok
+DROP TABLE banned;
+
+statement ok
+DROP TABLE active_customers;
+
+statement ok
+DROP TABLE orders_test;
+
+statement ok
+DROP TABLE customers_test;
+
+statement ok
+DROP TABLE all_null_banned;
diff --git a/datafusion/sqllogictest/test_files/nullif.slt b/datafusion/sqllogictest/test_files/nullif.slt
index 18642f6971ca8..7b4c59b263947 100644
--- a/datafusion/sqllogictest/test_files/nullif.slt
+++ b/datafusion/sqllogictest/test_files/nullif.slt
@@ -112,7 +112,7 @@ select nullif(1.0, 2);
 ----
 1
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type
 select nullif(2, 'a');
 
 query T
@@ -174,3 +174,39 @@ query T
 SELECT NULLIF(arrow_cast('a', 'Utf8View'), null);
 ----
 a
+
+# Test with dictionary-encoded strings
+# This tests the fix for: "Dictionary(UInt32, Utf8) and Utf8 is not comparable"
+statement ok
+CREATE TABLE dict_test_base(
+  col1 TEXT,
+  col2 TEXT
+) as VALUES
+  ('foo', 'bar'),
+  ('bar', 'bar'),
+  ('baz', 'bar')
+;
+
+# Dictionary cast with string literal
+query T rowsort
+SELECT NULLIF(arrow_cast(col1, 'Dictionary(Int32, Utf8)'), 'bar') FROM dict_test_base;
+----
+NULL
+baz
+foo
+
+# String with dictionary cast
+query T rowsort
+SELECT NULLIF(col2, arrow_cast(col1, 'Dictionary(Int32, Utf8)')) FROM dict_test_base;
+----
+NULL
+bar
+bar
+
+# Both as dictionaries
+query T rowsort
+SELECT NULLIF(arrow_cast(col1, 'Dictionary(Int32, Utf8)'), arrow_cast('bar', 'Dictionary(Int32, Utf8)')) FROM dict_test_base;
+----
+NULL
+baz
+foo
diff --git a/datafusion/sqllogictest/test_files/nvl.slt b/datafusion/sqllogictest/test_files/nvl.slt
index daab54307cc20..7f78b02baccdb 100644
--- a/datafusion/sqllogictest/test_files/nvl.slt
+++ b/datafusion/sqllogictest/test_files/nvl.slt
@@ -114,7 +114,7 @@ SELECT NVL(1, 3);
 ----
 1
 
-query I
+query B
 SELECT NVL(NULL, NULL);
 ----
 NULL
@@ -148,3 +148,38 @@ query T
 SELECT NVL(arrow_cast('a', 'Utf8View'), NULL);
 ----
 a
+
+# nvl is implemented as a case, and short-circuits evaluation
+# so the following query should not error
+query I
+SELECT NVL(1, 1/0);
+----
+1
+
+# but this one should
+query error DataFusion error: Arrow error: Divide by zero error
+SELECT NVL(NULL, 1/0);
+
+# Expect the query plan to show nvl as a case expression
+query I
+select NVL(int_field, 9999) FROM test;
+----
+1
+2
+3
+9999
+4
+9999
+
+# Expect the query plan to show nvl as a case expression
+query TT
+EXPLAIN select NVL(int_field, 9999) FROM test;
+----
+logical_plan
+01)Projection: CASE WHEN __common_expr_1 IS NOT NULL THEN __common_expr_1 ELSE Int64(9999) END AS nvl(test.int_field,Int64(9999))
+02)--Projection: CAST(test.int_field AS Int64) AS __common_expr_1
+03)----TableScan: test projection=[int_field]
+physical_plan
+01)ProjectionExec: expr=[CASE WHEN __common_expr_1@0 IS NOT NULL THEN __common_expr_1@0 ELSE 9999 END as nvl(test.int_field,Int64(9999))]
+02)--ProjectionExec: expr=[CAST(int_field@0 AS Int64) as __common_expr_1]
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
diff --git a/datafusion/sqllogictest/test_files/operator.slt b/datafusion/sqllogictest/test_files/operator.slt
index a651eda99684b..e50fa721c8850 100644
--- a/datafusion/sqllogictest/test_files/operator.slt
+++ b/datafusion/sqllogictest/test_files/operator.slt
@@ -262,6 +262,15 @@ from numeric_types;
 ----
 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64 Float64
 
+############### NULL arithmetic ###############
+
+# select both nulls with basic arithmetic operations
+query IIIII
+select null + null,  null - null, null * null, null / null, null % null;
+----
+NULL NULL NULL NULL NULL
+
+
 ###############
 # Test for comparison with constants uses efficient types
 # Expect the physical plans to compare with constants of the same type
@@ -278,9 +287,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 < 5 AND uint64 < 5 AND float64 < 5 AND decimal < 5;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: int64@3 < 5 AND uint64@7 < 5 AND float64@9 < 5 AND decimal@10 < Some(500),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: int64@3 < 5 AND uint64@7 < 5 AND float64@9 < 5 AND decimal@10 < Some(500),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## < negative  integer (expect no casts)
 query TT
@@ -288,9 +296,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 < -5 AND uint64 < -5 AND float64 < -5 AND decimal < -5;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: int64@3 < -5 AND CAST(uint64@7 AS Decimal128(20, 0)) < Some(-5),20,0 AND float64@9 < -5 AND decimal@10 < Some(-500),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: int64@3 < -5 AND CAST(uint64@7 AS Decimal128(20, 0)) < Some(-5),20,0 AND float64@9 < -5 AND decimal@10 < Some(-500),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## < decimal (expect casts for integers to float)
 query TT
@@ -298,9 +305,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 < 5.1 AND uint64 < 5.1 AND float64 < 5.1 AND decimal < 5.1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(int64@3 AS Float64) < 5.1 AND CAST(uint64@7 AS Float64) < 5.1 AND float64@9 < 5.1 AND decimal@10 < Some(510),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: CAST(int64@3 AS Float64) < 5.1 AND CAST(uint64@7 AS Float64) < 5.1 AND float64@9 < 5.1 AND decimal@10 < Some(510),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## < negative decimal (expect casts for integers to float)
 query TT
@@ -308,9 +314,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 < -5.1 AND uint64 < -5.1 AND float64 < -5.1 AND decimal < -5.1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(int64@3 AS Float64) < -5.1 AND CAST(uint64@7 AS Float64) < -5.1 AND float64@9 < -5.1 AND decimal@10 < Some(-510),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: CAST(int64@3 AS Float64) < -5.1 AND CAST(uint64@7 AS Float64) < -5.1 AND float64@9 < -5.1 AND decimal@10 < Some(-510),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 ############### Equality ###############
@@ -321,9 +326,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 = 5 AND uint64 = 5 AND float64 = 5 AND decimal = 5;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: int64@3 = 5 AND uint64@7 = 5 AND float64@9 = 5 AND decimal@10 = Some(500),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: int64@3 = 5 AND uint64@7 = 5 AND float64@9 = 5 AND decimal@10 = Some(500),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## = negative  integer (expect no casts)
 query TT
@@ -331,9 +335,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 = -5 AND uint64 = -5 AND float64 = -5 AND decimal = -5;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: int64@3 = -5 AND CAST(uint64@7 AS Decimal128(20, 0)) = Some(-5),20,0 AND float64@9 = -5 AND decimal@10 = Some(-500),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: int64@3 = -5 AND CAST(uint64@7 AS Decimal128(20, 0)) = Some(-5),20,0 AND float64@9 = -5 AND decimal@10 = Some(-500),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## = decimal (expect casts for integers to float)
 query TT
@@ -341,9 +344,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 = 5.1 AND uint64 = 5.1 AND float64 = 5.1 AND decimal = 5.1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(int64@3 AS Float64) = 5.1 AND CAST(uint64@7 AS Float64) = 5.1 AND float64@9 = 5.1 AND decimal@10 = Some(510),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: CAST(int64@3 AS Float64) = 5.1 AND CAST(uint64@7 AS Float64) = 5.1 AND float64@9 = 5.1 AND decimal@10 = Some(510),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ## = negative decimal (expect casts for integers to float)
 query TT
@@ -351,9 +353,8 @@ EXPLAIN SELECT * FROM numeric_types
 WHERE  int64 = -5.1 AND uint64 = -5.1 AND float64 = -5.1 AND decimal = -5.1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(int64@3 AS Float64) = -5.1 AND CAST(uint64@7 AS Float64) = -5.1 AND float64@9 = -5.1 AND decimal@10 = Some(-510),5,2
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: CAST(int64@3 AS Float64) = -5.1 AND CAST(uint64@7 AS Float64) = -5.1 AND float64@9 = -5.1 AND decimal@10 = Some(-510),5,2
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 statement ok
diff --git a/datafusion/sqllogictest/test_files/options.slt b/datafusion/sqllogictest/test_files/options.slt
index 71ff12e8cc507..0d1583dbc0086 100644
--- a/datafusion/sqllogictest/test_files/options.slt
+++ b/datafusion/sqllogictest/test_files/options.slt
@@ -23,7 +23,6 @@
 statement ok
 create table a(c0 int) as values (1), (2);
 
-# Expect coalesce and default batch size
 query TT
 explain SELECT * FROM a WHERE c0 < 1;
 ----
@@ -31,9 +30,8 @@ logical_plan
 01)Filter: a.c0 < Int32(1)
 02)--TableScan: a projection=[c0]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: c0@0 < 1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: c0@0 < 1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 ##
 # test_disable_coalesce
@@ -72,9 +70,8 @@ logical_plan
 01)Filter: a.c0 < Int32(1)
 02)--TableScan: a projection=[c0]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=1234
-02)--FilterExec: c0@0 < 1
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: c0@0 < 1
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 statement ok
diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt
index 3fc90a6459f27..892a42ad61443 100644
--- a/datafusion/sqllogictest/test_files/order.slt
+++ b/datafusion/sqllogictest/test_files/order.slt
@@ -94,6 +94,98 @@ NULL three
 1 one
 2 two
 
+statement ok
+set datafusion.sql_parser.default_null_ordering = 'nulls_min';
+
+# test asc with `nulls_min` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num
+----
+NULL three
+1 one
+2 two
+
+# test desc with `nulls_min` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC
+----
+2 two
+1 one
+NULL three
+
+statement ok
+set datafusion.sql_parser.default_null_ordering = 'nulls_first';
+
+# test asc with `nulls_first` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num
+----
+NULL three
+1 one
+2 two
+
+# test desc with `nulls_first` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC
+----
+NULL three
+2 two
+1 one
+
+
+statement ok
+set datafusion.sql_parser.default_null_ordering = 'nulls_last';
+
+# test asc with `nulls_last` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num
+----
+1 one
+2 two
+NULL three
+
+# test desc with `nulls_last` null ordering
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC
+----
+2 two
+1 one
+NULL three
+
+statement ok
+set datafusion.sql_parser.default_null_ordering = '';
+
+# test asc with an empty `default_null_ordering`. Expected to use the default null ordering which is `nulls_max`
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num
+----
+1 one
+2 two
+NULL three
+
+# test desc with an empty `default_null_ordering`. Expected to use the default null ordering which is `nulls_max`
+
+query IT
+SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (null, 'three')) AS t (num,letter) ORDER BY num DESC
+----
+NULL three
+2 two
+1 one
+
+statement error DataFusion error: Error during planning: Unsupported value Null
+set datafusion.sql_parser.default_null_ordering = null;
+
+# reset to default null ordering
+statement ok
+set datafusion.sql_parser.default_null_ordering = 'nulls_max';
+
 # sort
 
 statement ok
@@ -327,6 +419,42 @@ select column1 + column2 from foo group by column1, column2 ORDER BY column2 des
 7
 3
 
+# Test ordering by aggregate on non-selected column (issue #18683)
+# Previously failed with "Schema error: No field named foo.column2"
+query I
+select column1 from foo group by column1 order by min(column2);
+----
+1
+3
+5
+
+# Test ordering by aggregate expression on non-selected columns
+query I
+select column1 from foo group by column1 order by min(column2) + max(column2);
+----
+1
+3
+5
+
+# Test ordering by multiple aggregates on non-selected columns
+query I
+select column1 from foo group by column1 order by min(column2), max(column2);
+----
+1
+3
+5
+
+# Test GROUP BY alias with ORDER BY column index
+# Regression test: GROUP BY an aliased column, ORDER BY using column index
+query TI
+with t as (select 'foo' as x)
+select x, count(*) as "Count"
+from t
+group by x
+order by 2 desc;
+----
+foo 1
+
 # Test issue: https://github.com/apache/datafusion/issues/11549
 query I
 select column1 from foo order by log(column2);
@@ -343,6 +471,54 @@ select column1 from foo order by column2 % 2, column2;
 3
 5
 
+# ORDER BY aggregate expression that is aliased in SELECT
+query II
+select column1, min(column2) as min_val from foo group by column1 order by min(column2);
+----
+1 2
+3 4
+5 6
+
+# ORDER BY aggregate with alias, using DESC
+query II rowsort
+select column1, count(*) as cnt from foo group by column1 order by count(*) desc;
+----
+1 1
+3 1
+5 1
+
+# ORDER BY aggregate not in SELECT, while other aggregates in SELECT are aliased
+query I
+select column1 from foo group by column1 order by max(column2);
+----
+1
+3
+5
+
+# SELECT has composite expression containing the aggregate, plus standalone alias
+query III
+select column1, min(column2) + max(column2) as range_val, min(column2) as min_val from foo group by column1 order by min(column2);
+----
+1 4 2
+3 8 4
+5 12 6
+
+# ORDER BY aggregate that matches multiple aliased SELECT expressions
+query III
+select column1, min(column2) as first_min, min(column2) as second_min from foo group by column1 order by min(column2);
+----
+1 2 2
+3 4 4
+5 6 6
+
+# ORDER BY with CAST on aliased aggregate
+query II
+select column1, min(column2) as min_val from foo group by column1 order by CAST(min(column2) AS BIGINT);
+----
+1 2
+3 4
+5 6
+
 # Cleanup
 statement ok
 drop table foo;
@@ -469,7 +645,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [result@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[b@1 + a@0 + c@2 as result]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_orderings=[[a@0 ASC NULLS LAST], [b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true
 
 statement ok
@@ -500,7 +676,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [db15@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 900000000000 }, ts@0, 1659537600000000000) as db15]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=false
 
 query TT
@@ -515,7 +691,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [dt_day@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[date_trunc(DAY, ts@0) as dt_day]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=false
 
 statement ok
@@ -558,7 +734,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [atan_c11@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[atan(c11@0) as atan_c11]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11], output_ordering=[c11@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
@@ -573,7 +749,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [ceil_c11@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[ceil(c11@0) as ceil_c11]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11], output_ordering=[c11@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
@@ -583,12 +759,12 @@ query TT
 ----
 logical_plan
 01)Sort: log_c11_base_c12 ASC NULLS LAST
-02)--Projection: log(aggregate_test_100.c12, CAST(aggregate_test_100.c11 AS Float64)) AS log_c11_base_c12
+02)--Projection: log(aggregate_test_100.c12, aggregate_test_100.c11) AS log_c11_base_c12
 03)----TableScan: aggregate_test_100 projection=[c11, c12]
 physical_plan
 01)SortPreservingMergeExec: [log_c11_base_c12@0 ASC NULLS LAST]
-02)--ProjectionExec: expr=[log(c12@1, CAST(c11@0 AS Float64)) as log_c11_base_c12]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+02)--ProjectionExec: expr=[log(c12@1, c11@0) as log_c11_base_c12]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11, c12], output_orderings=[[c11@0 ASC NULLS LAST], [c12@1 DESC NULLS LAST]], file_type=csv, has_header=true
 
 query TT
@@ -598,12 +774,12 @@ ORDER BY log_c12_base_c11 DESC NULLS LAST;
 ----
 logical_plan
 01)Sort: log_c12_base_c11 DESC NULLS LAST
-02)--Projection: log(CAST(aggregate_test_100.c11 AS Float64), aggregate_test_100.c12) AS log_c12_base_c11
+02)--Projection: log(aggregate_test_100.c11, aggregate_test_100.c12) AS log_c12_base_c11
 03)----TableScan: aggregate_test_100 projection=[c11, c12]
 physical_plan
 01)SortPreservingMergeExec: [log_c12_base_c11@0 DESC NULLS LAST]
-02)--ProjectionExec: expr=[log(CAST(c11@0 AS Float64), c12@1) as log_c12_base_c11]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+02)--ProjectionExec: expr=[log(c11@0, c12@1) as log_c12_base_c11]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c11, c12], output_orderings=[[c11@0 ASC NULLS LAST], [c12@1 DESC NULLS LAST]], file_type=csv, has_header=true
 
 statement ok
@@ -674,6 +850,13 @@ physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/
 query error DataFusion error: Error during planning: Column a is not in schema
 CREATE EXTERNAL TABLE dt (a_id integer, a_str string, a_bool boolean) STORED AS CSV WITH ORDER (a ASC) LOCATION 'file://path/to/table';
 
+
+# Create external table with order column expression that can't be planned
+# This is currently expected to fail, but should not panic
+query error DataFusion error: Schema error: No field named a\.
+CREATE EXTERNAL TABLE dt STORED AS CSV WITH ORDER (a || b) LOCATION 'file://path/to/table';
+
+
 # Sort with duplicate sort expressions
 # Table is sorted multiple times on the same column name and should not fail
 statement ok
@@ -794,20 +977,16 @@ physical_plan
 03)----InterleaveExec
 04)------ProjectionExec: expr=[0 as m, t@0 as t]
 05)--------AggregateExec: mode=FinalPartitioned, gby=[t@0 as t], aggr=[]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([t@0], 2), input_partitions=2
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-09)----------------AggregateExec: mode=Partial, gby=[t@0 as t], aggr=[]
-10)------------------ProjectionExec: expr=[column1@0 as t]
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)------ProjectionExec: expr=[1 as m, t@0 as t]
-13)--------AggregateExec: mode=FinalPartitioned, gby=[t@0 as t], aggr=[]
-14)----------CoalesceBatchesExec: target_batch_size=8192
-15)------------RepartitionExec: partitioning=Hash([t@0], 2), input_partitions=2
-16)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-17)----------------AggregateExec: mode=Partial, gby=[t@0 as t], aggr=[]
-18)------------------ProjectionExec: expr=[column1@0 as t]
-19)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------RepartitionExec: partitioning=Hash([t@0], 2), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[t@0 as t], aggr=[]
+08)--------------ProjectionExec: expr=[column1@0 as t]
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+10)------ProjectionExec: expr=[1 as m, t@0 as t]
+11)--------AggregateExec: mode=FinalPartitioned, gby=[t@0 as t], aggr=[]
+12)----------RepartitionExec: partitioning=Hash([t@0], 2), input_partitions=1
+13)------------AggregateExec: mode=Partial, gby=[t@0 as t], aggr=[]
+14)--------------ProjectionExec: expr=[column1@0 as t]
+15)----------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 #####
 # Multi column sorting with lists
@@ -925,10 +1104,10 @@ ORDER BY SUM(column1)
 
 # ORDER BY with a GROUP BY clause
 query I
-SELECT SUM(column1) 
-  FROM foo 
-GROUP BY column2 
-ORDER BY SUM(column1) 
+SELECT SUM(column1)
+  FROM foo
+GROUP BY column2
+ORDER BY SUM(column1)
 ----
 0
 2
@@ -940,12 +1119,12 @@ ORDER BY SUM(column1)
 
 # ORDER BY with a GROUP BY clause and a HAVING clause
 query I
-SELECT 
-  SUM(column1) 
-FROM foo 
-GROUP BY column2 
-HAVING SUM(column1) < 3 
-ORDER BY SUM(column1) 
+SELECT
+  SUM(column1)
+FROM foo
+GROUP BY column2
+HAVING SUM(column1) < 3
+ORDER BY SUM(column1)
 ----
 0
 2
@@ -958,8 +1137,8 @@ SELECT SUM(column1) FROM foo ORDER BY SUM(column1)
 ----
 16
 
-# Order by unprojected aggregate expressions is not supported
-query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression AggregateFunction
+# Order by unprojected aggregate expressions requires GROUP BY
+query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function
 SELECT column2 FROM foo ORDER BY SUM(column1)
 
 statement ok
@@ -1046,7 +1225,7 @@ physical_plan
 01)SortPreservingMergeExec: [c_str@0 ASC NULLS LAST], fetch=5
 02)--SortExec: TopK(fetch=5), expr=[c_str@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[CAST(c@0 AS Utf8View) as c_str]
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 
@@ -1076,11 +1255,11 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c_bigint@0 ASC NULLS LAST], fetch=5
 02)--ProjectionExec: expr=[CAST(c@0 AS Int64) as c_bigint]
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 statement ok
-drop table ordered_table; 
+drop table ordered_table;
 
 
 # ABS(x) breaks the ordering if x's range contains both negative and positive values.
@@ -1112,11 +1291,11 @@ physical_plan
 01)SortPreservingMergeExec: [abs_c@0 ASC NULLS LAST], fetch=5
 02)--SortExec: TopK(fetch=5), expr=[abs_c@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[abs(c@0) as abs_c]
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 statement ok
-drop table ordered_table; 
+drop table ordered_table;
 
 # ABS(x) preserves the ordering if x's range falls into positive values.
 # Since x is defined as INT UNSIGNED, its range is assumed to be from 0 to INF.
@@ -1146,7 +1325,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [abs_c@0 ASC NULLS LAST], fetch=5
 02)--ProjectionExec: expr=[abs(c@0) as abs_c]
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 # Boolean to integer casts preserve the order.
@@ -1172,7 +1351,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[CAST(inc_col@0 > desc_col@1 AS Int32) as c]
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[inc_col, desc_col], output_orderings=[[inc_col@0 ASC NULLS LAST], [desc_col@1 DESC]], file_type=csv, has_header=true
 
 # Union a query with the actual data and one with a constant
@@ -1195,7 +1374,7 @@ logical_plan
 03)----TableScan: ordered_table projection=[a, b]
 physical_plan
 01)ProjectionExec: expr=[a@0 + b@1 as sum1]
-02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 03)----SortExec: TopK(fetch=1), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], file_type=csv, has_header=true
 
@@ -1234,7 +1413,7 @@ logical_plan
 03)----TableScan: ordered_table projection=[a, b]
 physical_plan
 01)ProjectionExec: expr=[a@0 + b@1 as sum1]
-02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 03)----SortExec: TopK(fetch=1), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], file_type=csv, has_header=true
 
@@ -1258,13 +1437,10 @@ logical_plan
 08)--------TableScan: ordered_table projection=[a0, b, c, d]
 physical_plan
 01)SortPreservingMergeExec: [d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a@2 ASC NULLS LAST, a0@3 ASC NULLS LAST, b@0 ASC NULLS LAST], fetch=2
-02)--UnionExec
-03)----SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a@2 ASC NULLS LAST, b@0 ASC NULLS LAST], preserve_partitioning=[false]
-04)------ProjectionExec: expr=[b@1 as b, c@2 as c, a@0 as a, NULL as a0, d@3 as d]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[c@2 ASC NULLS LAST], file_type=csv, has_header=true
-06)----SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a0@3 ASC NULLS LAST, b@0 ASC NULLS LAST], preserve_partitioning=[false]
-07)------ProjectionExec: expr=[b@1 as b, c@2 as c, NULL as a, a0@0 as a0, d@3 as d]
-08)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, b, c, d], output_ordering=[c@2 ASC NULLS LAST], file_type=csv, has_header=true
+02)--SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a@2 ASC NULLS LAST, a0@3 ASC NULLS LAST, b@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----UnionExec
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, a, NULL as a0, d], output_ordering=[c@1 ASC NULLS LAST], file_type=csv, has_header=true
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, NULL as a, a0, d], output_ordering=[c@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # Test: run the query from above
 query IIIII
@@ -1299,9 +1475,9 @@ logical_plan
 02)--Projection: CASE WHEN name = Utf8("name1") THEN Float64(0) WHEN name = Utf8("name2") THEN Float64(0.5) END AS a
 03)----Union
 04)------Projection: Utf8("name1") AS name
-05)--------EmptyRelation
+05)--------EmptyRelation: rows=1
 06)------Projection: Utf8("name2") AS name
-07)--------EmptyRelation
+07)--------EmptyRelation: rows=1
 physical_plan
 01)SortPreservingMergeExec: [a@0 DESC]
 02)--ProjectionExec: expr=[CASE WHEN name@0 = name1 THEN 0 WHEN name@0 = name2 THEN 0.5 END as a]
@@ -1419,3 +1595,40 @@ SELECT address, zip FROM addresses ORDER BY ALL;
 111 Duck Duck Goose Ln 11111
 111 Duck Duck Goose Ln 11111-0001
 123 Quack Blvd 11111
+
+# Create a table with an order clause that's not a simple column reference
+statement ok
+CREATE EXTERNAL TABLE ordered (
+  a  BIGINT NOT NULL,
+  b  BIGINT NOT NULL
+)
+STORED AS CSV
+LOCATION 'data/composite_order.csv'
+OPTIONS ('format.has_header' 'true')
+WITH ORDER (a + b);
+
+# Simple query should be just a table scan
+query TT
+EXPLAIN SELECT * from ordered;
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[a, b], output_ordering=[a@0 + b@1 ASC NULLS LAST], file_type=csv, has_header=true
+
+# Query ordered by the declared order should be just a table scan
+query TT
+EXPLAIN SELECT * from ordered ORDER BY (a + b);
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[a, b], output_ordering=[a@0 + b@1 ASC NULLS LAST], file_type=csv, has_header=true
+
+# Order equivalence handling should make this query a simple table scan
+query TT
+EXPLAIN SELECT * from ordered ORDER BY -(a + b) desc nulls last;
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[a, b], output_ordering=[a@0 + b@1 ASC NULLS LAST], file_type=csv, has_header=true
+
+# Ordering by another column requires a sort
+query TT
+EXPLAIN SELECT * from ordered ORDER BY a;
+----
+physical_plan
+01)SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/data/composite_order.csv]]}, projection=[a, b], output_ordering=[a@0 + b@1 ASC NULLS LAST], file_type=csv, has_header=true
diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt
index abc6fdab3c8a0..be713b963b451 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -21,6 +21,10 @@
 statement ok
 set datafusion.execution.target_partitions = 2;
 
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
 # Create a table as a data source
 statement ok
 CREATE TABLE src_table (
@@ -130,8 +134,7 @@ STORED AS PARQUET;
 ----
 3
 
-# Check output plan again, expect no "output_ordering" clause in the physical_plan -> ParquetExec,
-# due to there being more files than partitions:
+# Check output plan again
 query TT
 EXPLAIN SELECT int_col, string_col
 FROM test_table
@@ -142,8 +145,7 @@ logical_plan
 02)--TableScan: test_table projection=[int_col, string_col]
 physical_plan
 01)SortPreservingMergeExec: [string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST]
-02)--SortExec: expr=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], file_type=parquet
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}, projection=[int_col, string_col], output_ordering=[string_col@1 ASC NULLS LAST, int_col@0 ASC NULLS LAST], file_type=parquet
 
 
 # Perform queries using MIN and MAX
@@ -270,7 +272,7 @@ FROM (
 ) t
 GROUP BY 1
 ----
-Timestamp(Millisecond, Some("UTC")) 2014-08-27T14:00:00Z 131072
+Timestamp(ms, "UTC") 2014-08-27T14:00:00Z 131072
 
 # Test config listing_table_ignore_subdirectory:
 
@@ -304,6 +306,54 @@ select count(*) from listing_table;
 ----
 12
 
+# Test table pointing to the folder with parquet files(ends with /)
+statement ok
+CREATE EXTERNAL TABLE listing_table_folder_0
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/test_table/';
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = true;
+
+# scan file: 0.parquet 1.parquet 2.parquet
+query I
+select count(*) from listing_table_folder_0;
+----
+9
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = false;
+
+# scan file: 0.parquet 1.parquet 2.parquet 3.parquet
+query I
+select count(*) from listing_table_folder_0;
+----
+12
+
+# Test table pointing to the folder with parquet files(doesn't end with /)
+statement ok
+CREATE EXTERNAL TABLE listing_table_folder_1
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/test_table';
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = true;
+
+# scan file: 0.parquet 1.parquet 2.parquet
+query I
+select count(*) from listing_table_folder_1;
+----
+9
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = false;
+
+# scan file: 0.parquet 1.parquet 2.parquet 3.parquet
+query I
+select count(*) from listing_table_folder_1;
+----
+12
+
 # Clean up
 statement ok
 DROP TABLE timestamp_with_tz;
@@ -408,10 +458,9 @@ logical_plan
 01)Filter: CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%") AND CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_default projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[CAST(binary_as_string_default.binary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.largebinary_col AS Utf8View) LIKE Utf8View("%a%"), CAST(binary_as_string_default.binaryview_col AS Utf8View) LIKE Utf8View("%a%")]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
+01)FilterExec: CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=CAST(binary_col@0 AS Utf8View) LIKE %a% AND CAST(largebinary_col@1 AS Utf8View) LIKE %a% AND CAST(binaryview_col@2 AS Utf8View) LIKE %a%
 
 
 statement ok
@@ -456,10 +505,9 @@ logical_plan
 01)Filter: binary_as_string_option.binary_col LIKE Utf8View("%a%") AND binary_as_string_option.largebinary_col LIKE Utf8View("%a%") AND binary_as_string_option.binaryview_col LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_option projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_option.binary_col LIKE Utf8View("%a%"), binary_as_string_option.largebinary_col LIKE Utf8View("%a%"), binary_as_string_option.binaryview_col LIKE Utf8View("%a%")]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+01)FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
 
 
 statement ok
@@ -507,10 +555,9 @@ logical_plan
 01)Filter: binary_as_string_both.binary_col LIKE Utf8View("%a%") AND binary_as_string_both.largebinary_col LIKE Utf8View("%a%") AND binary_as_string_both.binaryview_col LIKE Utf8View("%a%")
 02)--TableScan: binary_as_string_both projection=[binary_col, largebinary_col, binaryview_col], partial_filters=[binary_as_string_both.binary_col LIKE Utf8View("%a%"), binary_as_string_both.largebinary_col LIKE Utf8View("%a%"), binary_as_string_both.binaryview_col LIKE Utf8View("%a%")]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+01)FilterExec: binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/binary_as_string.parquet]]}, projection=[binary_col, largebinary_col, binaryview_col], file_type=parquet, predicate=binary_col@0 LIKE %a% AND largebinary_col@1 LIKE %a% AND binaryview_col@2 LIKE %a%
 
 
 statement ok
@@ -622,10 +669,9 @@ logical_plan
 01)Filter: foo.column1 LIKE Utf8View("f%")
 02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 LIKE f%
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
+01)FilterExec: column1@0 LIKE f%
+02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]
 
 statement ok
 drop table foo
@@ -643,7 +689,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet';
 query TTT
 describe int96_from_spark
 ----
-a Timestamp(Nanosecond, None) YES
+a Timestamp(ns) YES
 
 # Note that the values are read as nanosecond precision
 query P
@@ -672,7 +718,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet';
 query TTT
 describe int96_from_spark;
 ----
-a Timestamp(Millisecond, None) YES
+a Timestamp(ms) YES
 
 # Per https://github.com/apache/parquet-testing/blob/6e851ddd768d6af741c7b15dc594874399fc3cff/data/int96_from_spark.md?plain=1#L37
 # these values should be
@@ -696,7 +742,7 @@ select * from int96_from_spark
 9999-12-31T03:00:00
 2024-12-30T23:00:00
 NULL
-ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(Millisecond, None)
+ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(ms)
 
 # Cleanup / reset default setting
 statement ok
@@ -704,3 +750,142 @@ drop table int96_from_spark;
 
 statement ok
 set datafusion.execution.parquet.coerce_int96 = ns;
+
+
+### Tests for metadata caching
+
+# Create temporary data
+query I
+COPY (
+  SELECT 'k-' || i as k, i as v
+  FROM generate_series(1, 20000) t(i)
+  ORDER BY k
+)
+TO 'test_files/scratch/parquet/cache_metadata.parquet'
+OPTIONS (MAX_ROW_GROUP_SIZE 4096, DATA_PAGE_ROW_COUNT_LIMIT 2048);
+----
+20000
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/cache_metadata.parquet';
+
+query TI
+select * from t where k = 'k-1000' or k = 'k-9999' order by k
+----
+k-1000 1000
+k-9999 9999
+
+query IT
+select v, k from t where (v between 1 and 2) or (v between 9999 and 10000) order by v
+----
+1 k-1
+2 k-2
+9999 k-9999
+10000 k-10000
+
+# Updating the file should invalidate the cache. Otherwise, the following queries would fail
+# (e.g., with "Arrow: Parquet argument error: External: incomplete frame").
+query I
+COPY (
+  SELECT 'k-' || i as k, 20000 - i as v
+  FROM generate_series(1, 20000) t(i)
+  ORDER BY k
+)
+TO 'test_files/scratch/parquet/cache_metadata.parquet'
+OPTIONS (MAX_ROW_GROUP_SIZE 4096, DATA_PAGE_ROW_COUNT_LIMIT 2048);
+----
+20000
+
+query TI
+select * from t where k = 'k-1000' or k = 'k-9999' order by k
+----
+k-1000 19000
+k-9999 10001
+
+query IT
+select v, k from t where (v between 1 and 2) or (v between 9999 and 10000) order by v
+----
+1 k-19999
+2 k-19998
+9999 k-10001
+10000 k-10000
+
+statement ok
+DROP TABLE t;
+
+# Partitioned files should be independently cached. Otherwise, the following queries might fail.
+statement ok
+COPY (
+  SELECT i % 10 as part, 'k-' || i as k, i as v 
+  FROM generate_series(0, 9) t(i)
+  ORDER BY k
+)
+TO 'test_files/scratch/parquet/cache_metadata_partitioned.parquet'
+PARTITIONED BY (part);
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+PARTITIONED BY (part)
+LOCATION 'test_files/scratch/parquet/cache_metadata_partitioned.parquet';
+
+query TTI
+select part, k, v from t where k = 'k-0'
+----
+0 k-0 0
+
+query TTI
+select part, k, v from t where k = 'k-5'
+----
+5 k-5 5
+
+query TTI
+select part, k, v from t where k = 'k-9'
+----
+9 k-9 9
+
+query TTI
+select part, k, v from t order by k
+----
+0 k-0 0
+1 k-1 1
+2 k-2 2
+3 k-3 3
+4 k-4 4
+5 k-5 5
+6 k-6 6
+7 k-7 7
+8 k-8 8
+9 k-9 9
+
+statement ok
+DROP TABLE t;
+
+# Regression test for files with stats on some columns and not others
+# See https://github.com/apache/datafusion/pull/18276
+
+query I
+COPY (SELECT 1::int AS a, 2::int as b)
+TO 'test_files/scratch/parquet/mixed_stats.parquet'
+STORED AS PARQUET OPTIONS (
+  'STATISTICS_ENABLED::b' 'none'
+);
+----
+1
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/mixed_stats.parquet';
+
+query I
+SELECT b
+FROM t
+WHERE b = 2;
+----
+2
+
+statement ok
+DROP TABLE t;
diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
index 1b6ae13fbe771..85f9549357138 100644
--- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
+++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
@@ -75,6 +75,30 @@ NULL
 NULL
 NULL
 
+query T
+select a from t_pushdown where b > 2 ORDER BY a;
+----
+baz
+foo
+NULL
+NULL
+NULL
+
+query TT
+EXPLAIN select a from t where b > 2 ORDER BY a;
+----
+logical_plan
+01)Sort: t.a ASC NULLS LAST
+02)--Projection: t.a
+03)----Filter: t.b > Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b > Int32(2)]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----FilterExec: b@1 > 2, projection=[a@0]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+05)--------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
 query TT
 EXPLAIN select a from t_pushdown where b > 2 ORDER BY a;
 ----
@@ -88,6 +112,209 @@ physical_plan
 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
 
+query T
+select a from t where b = 2 ORDER BY b;
+----
+bar
+
+query T
+select a from t_pushdown where b = 2 ORDER BY b;
+----
+bar
+
+query TT
+EXPLAIN select a from t where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t.a
+02)--Sort: t.b ASC NULLS LAST
+03)----Filter: t.b = Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--FilterExec: b@1 = 2, projection=[a@0]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
+
+query TT
+EXPLAIN select a from t_pushdown where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t_pushdown.a
+02)--Sort: t_pushdown.b ASC NULLS LAST
+03)----Filter: t_pushdown.b = Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
+
+# If we set the setting to `true` it override's the table's setting
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+query T
+select a from t where b > 2 ORDER BY a;
+----
+baz
+foo
+NULL
+NULL
+NULL
+
+query T
+select a from t_pushdown where b > 2 ORDER BY a;
+----
+baz
+foo
+NULL
+NULL
+NULL
+
+query TT
+EXPLAIN select a from t where b > 2 ORDER BY a;
+----
+logical_plan
+01)Sort: t.a ASC NULLS LAST
+02)--Projection: t.a
+03)----Filter: t.b > Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b > Int32(2)]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+query TT
+EXPLAIN select a from t_pushdown where b > 2 ORDER BY a;
+----
+logical_plan
+01)Sort: t_pushdown.a ASC NULLS LAST
+02)--Projection: t_pushdown.a
+03)----Filter: t_pushdown.b > Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b > Int32(2)]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+query T
+select a from t where b = 2 ORDER BY b;
+----
+bar
+
+query T
+select a from t_pushdown where b = 2 ORDER BY b;
+----
+bar
+
+query TT
+EXPLAIN select a from t where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t.a
+02)--Sort: t.b ASC NULLS LAST
+03)----Filter: t.b = Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
+
+query TT
+EXPLAIN select a from t_pushdown where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t_pushdown.a
+02)--Sort: t_pushdown.b ASC NULLS LAST
+03)----Filter: t_pushdown.b = Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
+
+# If we reset the default the table created without pushdown goes back to disabling it
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+query T
+select a from t where b > 2 ORDER BY a;
+----
+baz
+foo
+NULL
+NULL
+NULL
+
+query T
+select a from t_pushdown where b > 2 ORDER BY a;
+----
+baz
+foo
+NULL
+NULL
+NULL
+
+query TT
+EXPLAIN select a from t where b > 2 ORDER BY a;
+----
+logical_plan
+01)Sort: t.a ASC NULLS LAST
+02)--Projection: t.a
+03)----Filter: t.b > Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b > Int32(2)]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----FilterExec: b@1 > 2, projection=[a@0]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+05)--------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+query TT
+EXPLAIN select a from t_pushdown where b > 2 ORDER BY a;
+----
+logical_plan
+01)Sort: t_pushdown.a ASC NULLS LAST
+02)--Projection: t_pushdown.a
+03)----Filter: t_pushdown.b > Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b > Int32(2)]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+query T
+select a from t where b = 2 ORDER BY b;
+----
+bar
+
+query T
+select a from t_pushdown where b = 2 ORDER BY b;
+----
+bar
+
+query TT
+EXPLAIN select a from t where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t.a
+02)--Sort: t.b ASC NULLS LAST
+03)----Filter: t.b = Int32(2)
+04)------TableScan: t projection=[a, b], partial_filters=[t.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--FilterExec: b@1 = 2, projection=[a@0]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
+
+query TT
+EXPLAIN select a from t_pushdown where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t_pushdown.a
+02)--Sort: t_pushdown.b ASC NULLS LAST
+03)----Filter: t_pushdown.b = Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
 
 # When filter pushdown *is* enabled, ParquetExec can filter exactly,
 # not just metadata, so we expect to see no FilterExec
@@ -111,10 +338,26 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------FilterExec: b@1 > 2, projection=[a@0]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
-06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+03)----FilterExec: b@1 > 2, projection=[a@0]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+05)--------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+query T
+select a from t_pushdown where b = 2 ORDER BY b;
+----
+bar
+
+query TT
+EXPLAIN select a from t_pushdown where b = 2 ORDER BY b;
+----
+logical_plan
+01)Projection: t_pushdown.a
+02)--Sort: t_pushdown.b ASC NULLS LAST
+03)----Filter: t_pushdown.b = Int32(2)
+04)------TableScan: t_pushdown projection=[a, b], partial_filters=[t_pushdown.b = Int32(2)]
+physical_plan
+01)CoalescePartitionsExec
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 = 2, pruning_predicate=b_null_count@2 != row_count@3 AND b_min@0 <= 2 AND 2 <= b_max@1, required_guarantees=[b in (2)]
 
 # also test querying on columns that are not in all the files
 query T
@@ -155,6 +398,20 @@ physical_plan
 02)--SortExec: expr=[b@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[b], file_type=parquet, predicate=a@0 = bar, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)]
 
+
+# should not push down volatile predicates such as RANDOM
+# expect that the random predicate is evaluated after the scan
+query TT
+EXPLAIN select a from t_pushdown where b > random();
+----
+logical_plan
+01)Projection: t_pushdown.a
+02)--Filter: CAST(t_pushdown.b AS Float64) > random()
+03)----TableScan: t_pushdown projection=[a, b]
+physical_plan
+01)FilterExec: CAST(b@1 AS Float64) > random(), projection=[a@0]
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet
+
 ## cleanup
 statement ok
 DROP TABLE t;
@@ -209,17 +466,13 @@ EXPLAIN select * from t_pushdown where part != val
 logical_plan
 01)Filter: t_pushdown.val != t_pushdown.part
 02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != t_pushdown.part]
-physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: val@0 != part@1
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
-04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
+physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != part@1
 
-# If we reference only a partition column it gets evaluted during the listing phase
+# If we reference only a partition column it gets evaluated during the listing phase
 query TT
 EXPLAIN select * from t_pushdown where part != 'a';
 ----
-logical_plan TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8("a")]
+logical_plan TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8View("a")]
 physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
 
 # And if we reference only a file column it gets pushed down
@@ -227,8 +480,8 @@ query TT
 EXPLAIN select * from t_pushdown where val != 'c';
 ----
 logical_plan
-01)Filter: t_pushdown.val != Utf8("c")
-02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8("c")]
+01)Filter: t_pushdown.val != Utf8View("c")
+02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8View("c")]
 physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)]
 
 # If we have a mix of filters:
@@ -239,10 +492,451 @@ query TT
 EXPLAIN select * from t_pushdown where val != 'd' AND val != 'c' AND part = 'a' AND part != val;
 ----
 logical_plan
-01)Filter: t_pushdown.val != Utf8("d") AND t_pushdown.val != Utf8("c") AND t_pushdown.val != t_pushdown.part
-02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val != Utf8("d"), t_pushdown.val != Utf8("c"), t_pushdown.val != t_pushdown.part]
+01)Filter: t_pushdown.val != Utf8View("d") AND t_pushdown.val != Utf8View("c") AND t_pushdown.val != t_pushdown.part
+02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val != Utf8View("d"), t_pushdown.val != Utf8View("c"), t_pushdown.val != t_pushdown.part]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c AND val@0 != part@1, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)]
+
+# The order of filters should not matter
+query TT
+EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val;
+----
+logical_plan
+01)Filter: t_pushdown.val = t_pushdown.part
+02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1
+
+query TT
+select val, part from t_pushdown where part = 'a' AND part = val;
+----
+a a
+
+query TT
+EXPLAIN select val, part from t_pushdown where part = val AND part = 'a';
+----
+logical_plan
+01)Filter: t_pushdown.val = t_pushdown.part
+02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1
+
+query TT
+select val, part from t_pushdown where part = val AND part = 'a';
+----
+a a
+
+statement ok
+COPY (
+    SELECT
+        '00000000000000000000000000000001' AS trace_id,
+        '2023-10-01 00:00:00'::timestamptz AS start_timestamp,
+        'prod' as deployment_environment
+)
+TO 'test_files/scratch/parquet_filter_pushdown/data/1.parquet';
+
+statement ok
+COPY (
+    SELECT
+        '00000000000000000000000000000002' AS trace_id,
+        '2024-10-01 00:00:00'::timestamptz AS start_timestamp,
+        'staging' as deployment_environment
+)
+TO 'test_files/scratch/parquet_filter_pushdown/data/2.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t1 STORED AS PARQUET LOCATION 'test_files/scratch/parquet_filter_pushdown/data/';
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+query T
+SELECT deployment_environment
+FROM t1
+WHERE trace_id = '00000000000000000000000000000002'
+ORDER BY start_timestamp, trace_id;
+----
+staging
+
+query P
+SELECT start_timestamp
+FROM t1
+WHERE trace_id = '00000000000000000000000000000002' AND deployment_environment = 'staging'
+ORDER BY start_timestamp, trace_id
+LIMIT 1;
+----
+2024-10-01T00:00:00
+
+###
+# Array function predicate pushdown tests
+# These tests verify that array_has, array_has_all, and array_has_any predicates
+# are correctly pushed down to the DataSourceExec node
+###
+
+# Create test data with array columns
+statement ok
+COPY (
+    SELECT 1 as id, ['rust', 'performance'] as tags
+    UNION ALL
+    SELECT 2 as id, ['python', 'javascript'] as tags
+    UNION ALL
+    SELECT 3 as id, ['rust', 'webassembly'] as tags
+)
+TO 'test_files/scratch/parquet_filter_pushdown/array_data/data.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE array_test STORED AS PARQUET LOCATION 'test_files/scratch/parquet_filter_pushdown/array_data/';
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+# Test array_has predicate pushdown
+query I?
+SELECT id, tags FROM array_test WHERE array_has(tags, 'rust') ORDER BY id;
+----
+1 [rust, performance]
+3 [rust, webassembly]
+
+query TT
+EXPLAIN SELECT id, tags FROM array_test WHERE array_has(tags, 'rust') ORDER BY id;
+----
+logical_plan
+01)Sort: array_test.id ASC NULLS LAST
+02)--Filter: array_has(array_test.tags, Utf8("rust"))
+03)----TableScan: array_test projection=[id, tags], partial_filters=[array_has(array_test.tags, Utf8("rust"))]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/array_data/data.parquet]]}, projection=[id, tags], file_type=parquet, predicate=array_has(tags@1, rust)
+
+# Test array_has_all predicate pushdown
+query I?
+SELECT id, tags FROM array_test WHERE array_has_all(tags, ['rust', 'performance']) ORDER BY id;
+----
+1 [rust, performance]
+
+query TT
+EXPLAIN SELECT id, tags FROM array_test WHERE array_has_all(tags, ['rust', 'performance']) ORDER BY id;
+----
+logical_plan
+01)Sort: array_test.id ASC NULLS LAST
+02)--Filter: array_has_all(array_test.tags, List([rust, performance]))
+03)----TableScan: array_test projection=[id, tags], partial_filters=[array_has_all(array_test.tags, List([rust, performance]))]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/array_data/data.parquet]]}, projection=[id, tags], file_type=parquet, predicate=array_has_all(tags@1, [rust, performance])
+
+# Test array_has_any predicate pushdown
+query I?
+SELECT id, tags FROM array_test WHERE array_has_any(tags, ['python', 'go']) ORDER BY id;
+----
+2 [python, javascript]
+
+query TT
+EXPLAIN SELECT id, tags FROM array_test WHERE array_has_any(tags, ['python', 'go']) ORDER BY id;
+----
+logical_plan
+01)Sort: array_test.id ASC NULLS LAST
+02)--Filter: array_has_any(array_test.tags, List([python, go]))
+03)----TableScan: array_test projection=[id, tags], partial_filters=[array_has_any(array_test.tags, List([python, go]))]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/array_data/data.parquet]]}, projection=[id, tags], file_type=parquet, predicate=array_has_any(tags@1, [python, go])
+
+# Test complex predicate with OR
+query I?
+SELECT id, tags FROM array_test WHERE array_has_all(tags, ['rust']) OR array_has_any(tags, ['python', 'go']) ORDER BY id;
+----
+1 [rust, performance]
+2 [python, javascript]
+3 [rust, webassembly]
+
+query TT
+EXPLAIN SELECT id, tags FROM array_test WHERE array_has_all(tags, ['rust']) OR array_has_any(tags, ['python', 'go']) ORDER BY id;
+----
+logical_plan
+01)Sort: array_test.id ASC NULLS LAST
+02)--Filter: array_has_all(array_test.tags, List([rust])) OR array_has_any(array_test.tags, List([python, go]))
+03)----TableScan: array_test projection=[id, tags], partial_filters=[array_has_all(array_test.tags, List([rust])) OR array_has_any(array_test.tags, List([python, go]))]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: val@0 != part@1
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)]
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/array_data/data.parquet]]}, projection=[id, tags], file_type=parquet, predicate=array_has_all(tags@1, [rust]) OR array_has_any(tags@1, [python, go])
+
+# Test array function with other predicates
+query I?
+SELECT id, tags FROM array_test WHERE id > 1 AND array_has(tags, 'rust') ORDER BY id;
+----
+3 [rust, webassembly]
+
+query TT
+EXPLAIN SELECT id, tags FROM array_test WHERE id > 1 AND array_has(tags, 'rust') ORDER BY id;
+----
+logical_plan
+01)Sort: array_test.id ASC NULLS LAST
+02)--Filter: array_test.id > Int64(1) AND array_has(array_test.tags, Utf8("rust"))
+03)----TableScan: array_test projection=[id, tags], partial_filters=[array_test.id > Int64(1), array_has(array_test.tags, Utf8("rust"))]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/array_data/data.parquet]]}, projection=[id, tags], file_type=parquet, predicate=id@0 > 1 AND array_has(tags@1, rust), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+###
+# Test filter pushdown through UNION with mixed support
+# This tests the case where one child supports filter pushdown (parquet) and one doesn't (memory table)
+###
+
+# enable filter pushdown
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+statement ok
+set datafusion.optimizer.max_passes = 0;
+
+# Create memory table with matching schema (a: VARCHAR, b: BIGINT)
+statement ok
+CREATE TABLE t_union_mem(a VARCHAR, b BIGINT) AS VALUES ('qux', 4), ('quux', 5);
+
+# Create parquet table with matching schema
+statement ok
+CREATE EXTERNAL TABLE t_union_parquet(a VARCHAR, b BIGINT) STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet';
+
+# Query results combining memory table and Parquet with filter
+query I rowsort
+SELECT b FROM (
+  SELECT a, b FROM t_union_mem
+  UNION ALL
+  SELECT a, b FROM t_union_parquet
+) WHERE b > 2;
+----
+3
+4
+5
+50
+
+# Explain the union query - filter should be pushed to parquet but not memory table
+query TT
+EXPLAIN SELECT b FROM (
+  SELECT a, b FROM t_union_mem
+  UNION ALL
+  SELECT a, b FROM t_union_parquet
+) WHERE b > 2;
+----
+logical_plan
+01)Projection: b
+02)--Filter: b > Int64(2)
+03)----Union
+04)------Projection: t_union_mem.a, t_union_mem.b
+05)--------TableScan: t_union_mem
+06)------Projection: t_union_parquet.a, t_union_parquet.b
+07)--------TableScan: t_union_parquet
+physical_plan
+01)UnionExec
+02)--FilterExec: b@0 > 2
+03)----DataSourceExec: partitions=1, partition_sizes=[1]
+04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet]]}, projection=[b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[]
+
+# Clean up union test tables
+statement ok
+DROP TABLE t_union_mem;
+
+statement ok
+DROP TABLE t_union_parquet;
+
+# Cleanup settings
+statement ok
+set datafusion.optimizer.max_passes = 3;
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+
+# Regression test for https://github.com/apache/datafusion/issues/20696
+# Multi-column INNER JOIN with dictionary fails
+# when parquet pushdown filters are enabled.
+
+
+statement ok
+COPY (
+  SELECT
+    to_timestamp_nanos(time_ns) AS time,
+    arrow_cast(state, 'Dictionary(Int32, Utf8)') AS state,
+    arrow_cast(city, 'Dictionary(Int32, Utf8)') AS city,
+    temp
+  FROM (
+    VALUES
+      (200, 'CA', 'LA', 90.0),
+      (250, 'MA', 'Boston', 72.4),
+      (100, 'MA', 'Boston', 70.4),
+      (350, 'CA', 'LA', 90.0)
+  ) AS t(time_ns, state, city, temp)
+)
+TO 'test_files/scratch/parquet_filter_pushdown/issue_20696/h2o/data.parquet';
+
+statement ok
+COPY (
+  SELECT
+    to_timestamp_nanos(time_ns) AS time,
+    arrow_cast(state, 'Dictionary(Int32, Utf8)') AS state,
+    arrow_cast(city, 'Dictionary(Int32, Utf8)') AS city,
+    temp,
+    reading
+  FROM (
+    VALUES
+      (250, 'MA', 'Boston', 53.4, 51.0),
+      (100, 'MA', 'Boston', 50.4, 50.0)
+  ) AS t(time_ns, state, city, temp, reading)
+)
+TO 'test_files/scratch/parquet_filter_pushdown/issue_20696/o2/data.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE h2o_parquet_20696 STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet_filter_pushdown/issue_20696/h2o/';
+
+statement ok
+CREATE EXTERNAL TABLE o2_parquet_20696 STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet_filter_pushdown/issue_20696/o2/';
+
+# Query should work both with and without filters
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+query RRR
+SELECT
+  h2o_parquet_20696.temp AS h2o_temp,
+  o2_parquet_20696.temp AS o2_temp,
+  o2_parquet_20696.reading
+FROM h2o_parquet_20696
+INNER JOIN o2_parquet_20696
+  ON h2o_parquet_20696.time = o2_parquet_20696.time
+  AND h2o_parquet_20696.state = o2_parquet_20696.state
+  AND h2o_parquet_20696.city = o2_parquet_20696.city
+WHERE h2o_parquet_20696.time >= '1970-01-01T00:00:00.000000050Z'
+  AND h2o_parquet_20696.time <= '1970-01-01T00:00:00.000000300Z';
+----
+72.4 53.4 51
+70.4 50.4 50
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+query RRR
+SELECT
+  h2o_parquet_20696.temp AS h2o_temp,
+  o2_parquet_20696.temp AS o2_temp,
+  o2_parquet_20696.reading
+FROM h2o_parquet_20696
+INNER JOIN o2_parquet_20696
+  ON h2o_parquet_20696.time = o2_parquet_20696.time
+  AND h2o_parquet_20696.state = o2_parquet_20696.state
+  AND h2o_parquet_20696.city = o2_parquet_20696.city
+WHERE h2o_parquet_20696.time >= '1970-01-01T00:00:00.000000050Z'
+  AND h2o_parquet_20696.time <= '1970-01-01T00:00:00.000000300Z';
+----
+72.4 53.4 51
+70.4 50.4 50
+
+# Cleanup
+statement ok
+DROP TABLE h2o_parquet_20696;
+
+statement ok
+DROP TABLE o2_parquet_20696;
+
+# Cleanup settings
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+##########
+# Regression test: filter pushdown with Struct columns in schema
+#
+# When a schema has Struct columns, Arrow field indices diverge from Parquet
+# leaf indices (Struct children become separate leaves). A filter on a
+# primitive column *after* a Struct must use the correct Parquet leaf index.
+#
+# Schema:
+#   Arrow:   col_a=0  struct_col=1              col_b=2
+#   Parquet: col_a=0  struct_col.x=1  struct_col.y=2  col_b=3
+##########
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+statement ok
+COPY (
+  SELECT
+    column1 as col_a,
+    column2 as struct_col,
+    column3 as col_b
+  FROM VALUES
+    (1, {x: 10, y: 100}, 'aaa'),
+    (2, {x: 20, y: 200}, 'target'),
+    (3, {x: 30, y: 300}, 'zzz')
+) TO 'test_files/scratch/parquet_filter_pushdown/struct_filter.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE t_struct_filter
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet_filter_pushdown/struct_filter.parquet';
+
+# Filter on col_b (the primitive column after the struct).
+# Before the fix, this returned 0 rows because the filter read struct_col.y
+# (Parquet leaf 2) instead of col_b (Parquet leaf 3).
+query IT
+SELECT col_a, col_b FROM t_struct_filter WHERE col_b = 'target';
+----
+2 target
+
+# Clean up
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+statement ok
+DROP TABLE t_struct_filter;
+
+##########
+# Regression test for https://github.com/apache/datafusion/issues/20937
+#
+# Dynamic filter pushdown fails when joining VALUES against
+# Dictionary-encoded Parquet columns. The InListExpr's ArrayStaticFilter
+# unwraps the needle Dictionary but not the stored in_array, causing a
+# make_comparator(Utf8, Dictionary) type mismatch.
+##########
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+statement ok
+set datafusion.execution.parquet.reorder_filters = true;
+
+statement ok
+COPY (
+  SELECT
+    arrow_cast(chr(65 + (row_num % 26)), 'Dictionary(Int32, Utf8)') as tag1,
+    row_num * 1.0 as value
+  FROM (SELECT unnest(range(0, 10000)) as row_num)
+) TO 'test_files/scratch/parquet_filter_pushdown/dict_filter_bug.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE dict_filter_bug
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet_filter_pushdown/dict_filter_bug.parquet';
+
+query TR
+SELECT t.tag1, t.value
+FROM dict_filter_bug t
+JOIN (VALUES ('A'), ('B')) AS v(c1)
+ON t.tag1 = v.c1
+ORDER BY t.tag1, t.value
+LIMIT 4;
+----
+A 0
+A 26
+A 52
+A 78
+
+# Cleanup
+statement ok
+set datafusion.execution.parquet.pushdown_filters = false;
+
+statement ok
+set datafusion.execution.parquet.reorder_filters = false;
+
+statement ok
+DROP TABLE dict_filter_bug;
diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
index a10243f627209..fd3a40ca17079 100644
--- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
@@ -38,20 +38,22 @@ CREATE TABLE src_table (
   bigint_col BIGINT,
   date_col DATE,
   overlapping_col INT,
-  constant_col INT
+  constant_col INT,
+  nulls_first_col INT,
+  nulls_last_col INT
 ) AS VALUES
 -- first file
-(1, 3, 'aaa', 100, 1, 0, 0),
-(2, 2, 'bbb', 200, 2, 1, 0),
-(3, 1, 'ccc', 300, 3, 2, 0),
+(1, 3, 'aaa', 100, 1, 0, 0, NULL, 1),
+(2, 2, 'bbb', 200, 2, 1, 0, NULL, 2),
+(3, 1, 'ccc', 300, 3, 2, 0, 1, 3),
 -- second file
-(4, 6, 'ddd', 400, 4, 0, 0),
-(5, 5, 'eee', 500, 5, 1, 0),
-(6, 4, 'fff', 600, 6, 2, 0),
+(4, 6, 'ddd', 400, 4, 0, 0, 2, 4),
+(5, 5, 'eee', 500, 5, 1, 0, 3, 5),
+(6, 4, 'fff', 600, 6, 2, 0, 4, 6),
 -- third file
-(7, 9, 'ggg', 700, 7, 3, 0),
-(8, 8, 'hhh', 800, 8, 4, 0),
-(9, 7, 'iii', 900, 9, 5, 0);
+(7, 9, 'ggg', 700, 7, 3, 0, 5, 7),
+(8, 8, 'hhh', 800, 8, 4, 0, 6, NULL),
+(9, 7, 'iii', 900, 9, 5, 0, 7, NULL);
 
 # Setup 3 files, in particular more files than there are partitions
 
@@ -90,11 +92,18 @@ CREATE EXTERNAL TABLE test_table (
   bigint_col BIGINT NOT NULL,
   date_col DATE NOT NULL,
   overlapping_col INT NOT NULL,
-  constant_col INT NOT NULL
+  constant_col INT NOT NULL,
+  nulls_first_col INT,
+  nulls_last_col INT
 )
 STORED AS PARQUET
 PARTITIONED BY (partition_col)
-WITH ORDER (int_col ASC NULLS LAST, bigint_col ASC NULLS LAST)
+WITH ORDER (
+    int_col ASC NULLS LAST,
+    bigint_col ASC NULLS LAST,
+    nulls_first_col ASC NULLS FIRST,
+    nulls_last_col ASC NULLS LAST
+)
 LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table';
 
 # Order by numeric columns
@@ -102,33 +111,33 @@ LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table';
 # DataFusion doesn't currently support string column statistics
 # This should not require a sort.
 query TT
-EXPLAIN SELECT int_col, bigint_col
+EXPLAIN SELECT int_col, bigint_col, nulls_first_col, nulls_last_col
 FROM test_table
-ORDER BY int_col, bigint_col;
+ORDER BY int_col, bigint_col, nulls_first_col NULLS FIRST, nulls_last_col NULLS LAST;
 ----
 logical_plan
-01)Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST
-02)--TableScan: test_table projection=[int_col, bigint_col]
+01)Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST, test_table.nulls_first_col ASC NULLS FIRST, test_table.nulls_last_col ASC NULLS LAST
+02)--TableScan: test_table projection=[int_col, bigint_col, nulls_first_col, nulls_last_col]
 physical_plan
-01)SortPreservingMergeExec: [int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST]
-02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet]]}, projection=[int_col, bigint_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST], file_type=parquet
+01)SortPreservingMergeExec: [int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST, nulls_first_col@2 ASC, nulls_last_col@3 ASC NULLS LAST]
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet]]}, projection=[int_col, bigint_col, nulls_first_col, nulls_last_col], output_ordering=[int_col@0 ASC NULLS LAST, bigint_col@1 ASC NULLS LAST, nulls_first_col@2 ASC, nulls_last_col@3 ASC NULLS LAST], file_type=parquet
 
 # Another planning test, but project on a column with unsupported statistics
 # We should be able to ignore this and look at only the relevant statistics
 query TT
 EXPLAIN SELECT string_col
 FROM test_table
-ORDER BY int_col, bigint_col;
+ORDER BY int_col, bigint_col, nulls_first_col NULLS FIRST, nulls_last_col NULLS LAST;
 ----
 logical_plan
 01)Projection: test_table.string_col
-02)--Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST
-03)----Projection: test_table.string_col, test_table.int_col, test_table.bigint_col
-04)------TableScan: test_table projection=[int_col, string_col, bigint_col]
+02)--Sort: test_table.int_col ASC NULLS LAST, test_table.bigint_col ASC NULLS LAST, test_table.nulls_first_col ASC NULLS FIRST, test_table.nulls_last_col ASC NULLS LAST
+03)----Projection: test_table.string_col, test_table.int_col, test_table.bigint_col, test_table.nulls_first_col, test_table.nulls_last_col
+04)------TableScan: test_table projection=[int_col, string_col, bigint_col, nulls_first_col, nulls_last_col]
 physical_plan
 01)ProjectionExec: expr=[string_col@0 as string_col]
-02)--SortPreservingMergeExec: [int_col@1 ASC NULLS LAST, bigint_col@2 ASC NULLS LAST]
-03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet]]}, projection=[string_col, int_col, bigint_col], output_ordering=[int_col@1 ASC NULLS LAST, bigint_col@2 ASC NULLS LAST], file_type=parquet
+02)--SortPreservingMergeExec: [int_col@1 ASC NULLS LAST, bigint_col@2 ASC NULLS LAST, nulls_first_col@3 ASC, nulls_last_col@4 ASC NULLS LAST]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet]]}, projection=[string_col, int_col, bigint_col, nulls_first_col, nulls_last_col], output_ordering=[int_col@1 ASC NULLS LAST, bigint_col@2 ASC NULLS LAST, nulls_first_col@3 ASC, nulls_last_col@4 ASC NULLS LAST], file_type=parquet
 
 # Clean up & recreate but sort on descending column
 statement ok
@@ -265,5 +274,4 @@ logical_plan
 02)--TableScan: test_table projection=[constant_col]
 physical_plan
 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST]
-02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], file_type=parquet
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], output_ordering=[constant_col@0 ASC NULLS LAST], file_type=parquet
diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt
index c707b9f5bbd54..8c77fb96ba75c 100644
--- a/datafusion/sqllogictest/test_files/parquet_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt
@@ -46,7 +46,7 @@ statement ok
 set datafusion.explain.show_statistics = true;
 
 ######
-# By default, the statistics are not gathered
+# By default, the statistics are gathered
 ######
 
 # Recreate the table to pick up the current setting
@@ -59,18 +59,16 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]
 
 # cleanup
 statement ok
 DROP TABLE test_table;
 
 ######
-# When the setting is true, the statistics are gathered
+# When the setting is true, statistics are gathered
 ######
 
 statement ok
@@ -86,11 +84,9 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
+01)FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(10), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(40), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0) ScanBytes=Inexact(40))]]
 
 # cleanup
 statement ok
@@ -114,11 +110,9 @@ query TT
 EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
 ----
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
-04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+01)FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
 
 # cleanup
 statement ok
diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt
index d14b6ca81f67e..fcc12226e47c5 100644
--- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt
+++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt
@@ -48,7 +48,7 @@ COPY aggregate_test_100_by_sql
 ###
 ## Setup test for datafusion
 ###
-onlyif DataFusion
+skipif postgres
 statement ok
 CREATE EXTERNAL TABLE aggregate_test_100_by_sql (
   c1  VARCHAR NOT NULL,
diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt
index 25b4924715caa..4453aa1489a1b 100644
--- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt
+++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt
@@ -49,7 +49,7 @@ COPY aggregate_test_100_by_sql
 ###
 ## Setup test for datafusion
 ###
-onlyif DataFusion
+skipif postgres
 statement ok
 CREATE EXTERNAL TABLE aggregate_test_100_by_sql (
   c1  VARCHAR NOT NULL,
diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt
index e02c19016790d..f8e0770271309 100644
--- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt
+++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt
@@ -46,7 +46,7 @@ COPY aggregate_test_100_by_sql
 ###
 ## Setup test for datafusion
 ###
-onlyif DataFusion
+skipif postgres
 statement ok
 CREATE EXTERNAL TABLE aggregate_test_100_by_sql (
   c1  VARCHAR NOT NULL,
diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt
index edad3747a2030..f967d79a6d952 100644
--- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt
+++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt
@@ -46,7 +46,7 @@ COPY aggregate_test_100_by_sql
 ###
 ## Setup test for datafusion
 ###
-onlyif DataFusion
+skipif postgres
 statement ok
 CREATE EXTERNAL TABLE aggregate_test_100_by_sql (
   c1  VARCHAR NOT NULL,
diff --git a/datafusion/sqllogictest/test_files/pipe_operator.slt b/datafusion/sqllogictest/test_files/pipe_operator.slt
new file mode 100644
index 0000000000000..5908b3d6b2a4d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/pipe_operator.slt
@@ -0,0 +1,197 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# BigQuery supports the pipe operator syntax
+# TODO: Make the Generic dialect support the pipe operator syntax
+statement ok
+set datafusion.sql_parser.dialect = 'BigQuery';
+
+statement ok
+CREATE TABLE test(
+    a INT,
+    b FLOAT,
+    c VARCHAR,
+    n VARCHAR
+) AS VALUES
+  (1, 1.1, 'a', NULL),
+  (2, 2.2, 'b', NULL),
+  (3, 3.3, 'c', NULL)
+;
+
+# WHERE pipe
+query IRTT
+SELECT *
+FROM test
+|> WHERE a > 1
+----
+2 2.2 b NULL
+3 3.3 c NULL
+
+# ORDER BY pipe
+query IRTT
+SELECT *
+FROM test
+|> ORDER BY a DESC
+----
+3 3.3 c NULL
+2 2.2 b NULL
+1 1.1 a NULL
+
+# ORDER BY pipe, limit
+query IRTT
+SELECT *
+FROM test
+|> ORDER BY a DESC
+|> LIMIT 1
+----
+3 3.3 c NULL
+
+# SELECT pipe
+query I
+SELECT *
+FROM test
+|> SELECT a
+----
+1
+2
+3
+
+# EXTEND pipe
+query IRR
+SELECT *
+FROM test
+|> SELECT a, b
+|> EXTEND a + b AS a_plus_b
+----
+1 1.1 2.1
+2 2.2 4.2
+3 3.3 6.3
+
+query IRR
+SELECT *
+FROM test
+|> SELECT a, b
+|> where a = 1
+|> EXTEND a + b AS a_plus_b
+----
+1 1.1 2.1
+
+# AS pipe
+query I
+SELECT *
+FROM test
+|> as test_pipe
+|> select test_pipe.a
+----
+1
+2
+3
+
+# UNION pipe
+query I
+SELECT *
+FROM test
+|> select a
+|> UNION ALL (
+  SELECT a FROM test
+);
+----
+1
+2
+3
+1
+2
+3
+
+# INTERSECT pipe
+query I rowsort
+SELECT * FROM range(0,3)
+|> INTERSECT DISTINCT
+    (SELECT * FROM range(1,3));
+----
+1
+2
+
+# EXCEPT pipe
+query I rowsort
+select * from range(0,10)
+|> EXCEPT DISTINCT (select * from range(5,10));
+----
+0
+1
+2
+3
+4
+
+# AGGREGATE pipe
+query II
+(
+  SELECT 'apples' AS item, 2 AS sales
+  UNION ALL
+  SELECT 'bananas' AS item, 5 AS sales
+  UNION ALL
+  SELECT 'apples' AS item, 7 AS sales
+)
+|> AGGREGATE COUNT(*) AS num_items, SUM(sales) AS total_sales;
+----
+3 14
+
+query TII rowsort
+(
+  SELECT 'apples' AS item, 2 AS sales
+  UNION ALL
+  SELECT 'bananas' AS item, 5 AS sales
+  UNION ALL
+  SELECT 'apples' AS item, 7 AS sales
+)
+|> AGGREGATE COUNT(*) AS num_items, SUM(sales) AS total_sales
+   GROUP BY item;
+----
+apples 2 9
+bananas 1 5
+
+query TII rowsort
+(
+  SELECT 'apples' AS item, 2 AS sales
+  UNION ALL
+  SELECT 'bananas' AS item, 5 AS sales
+  UNION ALL
+  SELECT 'apples' AS item, 7 AS sales
+)
+|> AGGREGATE COUNT(*) AS num_items, SUM(sales) AS total_sales
+   GROUP BY item
+|> WHERE num_items > 1;
+----
+apples 2 9
+
+# JOIN pipe
+query TII
+(
+  SELECT 'apples' AS item, 2 AS sales
+  UNION ALL
+  SELECT 'bananas' AS item, 5 AS sales
+)
+|> AS produce_sales
+|> LEFT JOIN
+     (
+       SELECT "apples" AS item, 123 AS id
+     ) AS produce_data
+   ON produce_sales.item = produce_data.item
+|> SELECT produce_sales.item, sales, id;
+----
+apples 2 123
+bananas 5 NULL
diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt
index b4b31fa78a692..7d33814b8bdbf 100644
--- a/datafusion/sqllogictest/test_files/predicates.slt
+++ b/datafusion/sqllogictest/test_files/predicates.slt
@@ -668,20 +668,15 @@ logical_plan
 05)----Filter: (part.p_brand = Utf8View("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8View("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8View("Brand#34") AND part.p_size <= Int32(15)) AND part.p_size >= Int32(1)
 06)------TableScan: part projection=[p_partkey, p_brand, p_size], partial_filters=[part.p_size >= Int32(1), part.p_brand = Utf8View("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8View("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8View("Brand#34") AND part.p_size <= Int32(15)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_partkey@0]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------FilterExec: l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2
-07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_quantity], file_type=csv, has_header=true
-09)----CoalesceBatchesExec: target_batch_size=8192
-10)------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-11)--------CoalesceBatchesExec: target_batch_size=8192
-12)----------FilterExec: (p_brand@1 = Brand#12 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_size@2 <= 15) AND p_size@2 >= 1
-13)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-14)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand, p_size], file_type=csv, has_header=true
+01)HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_partkey@0]
+02)--RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
+03)----FilterExec: l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_quantity], file_type=csv, has_header=true
+06)--RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+07)----FilterExec: (p_brand@1 = Brand#12 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_size@2 <= 15) AND p_size@2 >= 1
+08)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand, p_size], file_type=csv, has_header=true
 
 ########
 # TPCH Q19 - Pull predicates to inner join (simplified)
@@ -761,21 +756,59 @@ logical_plan
 physical_plan
 01)AggregateExec: mode=SinglePartitioned, gby=[p_partkey@2 as p_partkey], aggr=[sum(lineitem.l_extendedprice), avg(lineitem.l_discount), count(DISTINCT partsupp.ps_suppkey)]
 02)--ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, p_partkey@3 as p_partkey, ps_suppkey@0 as ps_suppkey]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(ps_partkey@0, p_partkey@2)], projection=[ps_suppkey@1, l_extendedprice@2, l_discount@3, p_partkey@4]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)--------CoalesceBatchesExec: target_batch_size=8192
-07)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_partkey@3]
-08)------------CoalesceBatchesExec: target_batch_size=8192
-09)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-10)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_extendedprice, l_discount], file_type=csv, has_header=true
-12)------------CoalesceBatchesExec: target_batch_size=8192
-13)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-14)----------------CoalesceBatchesExec: target_batch_size=8192
-15)------------------FilterExec: p_brand@1 = Brand#12 OR p_brand@1 = Brand#23, projection=[p_partkey@0]
-16)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-17)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand], file_type=csv, has_header=true
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(ps_partkey@0, p_partkey@2)], projection=[ps_suppkey@1, l_extendedprice@2, l_discount@3, p_partkey@4]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_partkey@3]
+06)--------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=1
+07)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_extendedprice, l_discount], file_type=csv, has_header=true
+08)--------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+09)----------FilterExec: p_brand@1 = Brand#12 OR p_brand@1 = Brand#23, projection=[p_partkey@0]
+10)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand], file_type=csv, has_header=true
+
+# Simplification of a binary operator with a NULL value
+
+statement ok
+create table t(x int) as values (1), (2), (3);
+
+query TT
+EXPLAIN FORMAT INDENT SELECT x > NULL FROM t;
+----
+logical_plan
+01)Projection: Boolean(NULL) AS t.x > NULL
+02)--TableScan: t projection=[]
+physical_plan
+01)ProjectionExec: expr=[NULL as t.x > NULL]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+EXPLAIN FORMAT INDENT SELECT * FROM t WHERE x > NULL;
+----
+logical_plan EmptyRelation: rows=0
+physical_plan EmptyExec
+
+query TT
+EXPLAIN FORMAT INDENT SELECT * FROM t WHERE x < 5 AND (10 * NULL < x);
+----
+logical_plan
+01)Filter: t.x < Int32(5) AND Boolean(NULL)
+02)--TableScan: t projection=[x]
+physical_plan
+01)FilterExec: x@0 < 5 AND NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TT
+EXPLAIN FORMAT INDENT SELECT * FROM t WHERE x < 5 OR (10 * NULL < x);
+----
+logical_plan
+01)Filter: t.x < Int32(5) OR Boolean(NULL)
+02)--TableScan: t projection=[x]
+physical_plan
+01)FilterExec: x@0 < 5 OR NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+drop table t;
 
 # Inlist simplification
 
@@ -785,7 +818,7 @@ create table t(x int) as values (1), (2), (3);
 query TT
 explain select x from t where x IN (1,2,3) AND x IN (4,5);
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 physical_plan EmptyExec
 
 query TT
@@ -801,15 +834,35 @@ logical_plan
 01)Filter: t.x = Int32(5)
 02)--TableScan: t projection=[x]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: x@0 = 5
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: x@0 = 5
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 explain select x from t where x NOT IN (1,2,3,4,5) AND x IN (1,2,3);
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 physical_plan EmptyExec
 
+query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression InSubquery\(InSubquery \{ expr: Literal\(Int64\(NULL\), None\), subquery: <subquery>, negated: false \}\)
+WITH empty AS (SELECT 10 WHERE false)
+SELECT
+    NULL IN (SELECT * FROM empty), -- should be false, as the right side is empty relation
+    NULL NOT IN (SELECT * FROM empty) -- should be true, as the right side is empty relation
+FROM (SELECT 1) t;
+
+query I
+WITH empty AS (SELECT 10 WHERE false)
+SELECT * FROM (SELECT 1) t
+WHERE NOT (NULL IN (SELECT * FROM empty)); -- all rows should be returned
+----
+1
+
+query I
+WITH empty AS (SELECT 10 WHERE false)
+SELECT * FROM (SELECT 1) t
+WHERE NULL NOT IN (SELECT * FROM empty); -- all rows should be returned
+----
+1
+
 statement ok
 drop table t;
diff --git a/datafusion/sqllogictest/test_files/prepare.slt b/datafusion/sqllogictest/test_files/prepare.slt
index d61603ae65588..8e8b1cd8e6ad0 100644
--- a/datafusion/sqllogictest/test_files/prepare.slt
+++ b/datafusion/sqllogictest/test_files/prepare.slt
@@ -34,7 +34,7 @@ statement error DataFusion error: SQL error: ParserError
 PREPARE AS SELECT id, age  FROM person WHERE age = $foo;
 
 # param following a non-number, $foo, not supported
-statement error Invalid placeholder, not a number: \$foo
+statement error Unknown placeholder: \$foo
 PREPARE my_plan(INT) AS SELECT id, age  FROM person WHERE age = $foo;
 
 # not specify table hence cannot specify columns
@@ -204,9 +204,11 @@ EXECUTE my_plan6(20.0);
 statement error Cast error: Cannot cast string 'foo' to value of Int32 type
 EXECUTE my_plan6('foo');
 
-# TODO: support non-literal expressions
-statement error Unsupported parameter type
-EXECUTE my_plan6(10 + 20);
+# support non-literal expressions
+query II
+EXECUTE my_plan6(10 + 10);
+----
+1 20
 
 statement ok
 DEALLOCATE my_plan6;
@@ -327,3 +329,47 @@ EXECUTE my_plan('a', 'b');
 ----
 1 a
 2 b
+
+statement ok
+SET datafusion.explain.logical_plan_only=false;
+
+statement ok
+DEALLOCATE my_plan
+
+statement ok
+SET datafusion.explain.logical_plan_only=true;
+
+# Prepare with alias
+query TT
+EXPLAIN PREPARE my_plan(INT, INT) AS SELECT $1 AS one, $2 AS two;
+----
+logical_plan
+01)Prepare: "my_plan" [Int32, Int32]
+02)--Projection: $1 AS one, $2 AS two
+03)----EmptyRelation: rows=1
+
+statement ok
+PREPARE my_plan(INT, INT) AS SELECT $1 AS one, $2 AS two;
+
+query II
+EXECUTE my_plan(1, 2)
+----
+1 2
+
+statement ok
+SET datafusion.explain.logical_plan_only=false;
+
+statement ok
+DEALLOCATE my_plan
+
+
+statement ok
+PREPARE my_plan AS SELECT a, b FROM (VALUES ($1, $2)) AS t(a, b);
+
+query II
+EXECUTE my_plan(1, 2)
+----
+1 2
+
+statement ok
+DEALLOCATE my_plan
diff --git a/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt b/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt
new file mode 100644
index 0000000000000..297094fab16e7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/preserve_file_partitioning.slt
@@ -0,0 +1,724 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+# Tests for preserve_file_partitions optimization
+#
+# Data Model:
+# - Fact table: Hive-partitioned by f_dkey, sorted by f_dkey, timestamp
+#   Schema: timestamp TIMESTAMP, value FLOAT64, partition column: f_dkey STRING
+#
+# - Dimension table: Single file for CollectLeft joins
+#   Schema: d_dkey STRING, env STRING, service STRING, host STRING
+#
+# Key benefits demonstrated:
+# - Eliminates RepartitionExec for aggregates/joins/windows on partition columns
+# - Eliminates SortExec when data is already sorted by partition + order columns
+# - Uses SinglePartitioned aggregation mode
+##########
+
+##########
+# SETUP: Configuration and Data Generation
+##########
+
+statement ok
+set datafusion.execution.target_partitions = 3;
+
+# Create fact table partitioned by f_dkey
+# Each partition has data sorted by timestamp
+# Partition: f_dkey=A
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 95.5),
+    (TIMESTAMP '2023-01-01T09:00:10', 102.3),
+    (TIMESTAMP '2023-01-01T09:00:20', 98.7),
+    (TIMESTAMP '2023-01-01T09:12:20', 105.1),
+    (TIMESTAMP '2023-01-01T09:12:30', 100.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 150.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 120.8)
+))
+TO 'test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet'
+STORED AS PARQUET;
+----
+7
+
+# Partition: f_dkey=B
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 75.2),
+    (TIMESTAMP '2023-01-01T09:00:10', 82.4),
+    (TIMESTAMP '2023-01-01T09:00:20', 78.9),
+    (TIMESTAMP '2023-01-01T09:00:30', 85.6),
+    (TIMESTAMP '2023-01-01T09:12:30', 80.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 120.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 92.3)
+))
+TO 'test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet'
+STORED AS PARQUET;
+----
+7
+
+# Partition: f_dkey=C
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 300.5),
+    (TIMESTAMP '2023-01-01T09:00:10', 285.7),
+    (TIMESTAMP '2023-01-01T09:00:20', 310.2),
+    (TIMESTAMP '2023-01-01T09:00:30', 295.8),
+    (TIMESTAMP '2023-01-01T09:00:40', 300.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 250.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 275.4)
+))
+TO 'test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet'
+STORED AS PARQUET;
+----
+7
+
+# Create dimension table (single file for CollectLeft joins)
+query I
+COPY (SELECT column1 as d_dkey, column2 as env, column3 as service, column4 as host FROM (VALUES
+    ('A', 'dev', 'log', 'ma'),
+    ('B', 'prod', 'log', 'ma'),
+    ('C', 'prod', 'log', 'vim'),
+    ('D', 'prod', 'trace', 'vim')
+))
+TO 'test_files/scratch/preserve_file_partitioning/dimension/data.parquet'
+STORED AS PARQUET;
+----
+4
+
+# Create hive-partitioned dimension table (3 partitions matching fact_table)
+# For testing Partitioned joins with matching partition counts
+query I
+COPY (SELECT 'dev' as env, 'log' as service)
+TO 'test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=A/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT 'prod' as env, 'log' as service)
+TO 'test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=B/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT 'prod' as env, 'log' as service)
+TO 'test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=C/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+# Create high-cardinality fact table (5 partitions > 3 target_partitions)
+# For testing partition merging with consistent hashing
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 100.0)
+))
+TO 'test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=A/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 200.0)
+))
+TO 'test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=B/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 300.0)
+))
+TO 'test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=C/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 400.0)
+))
+TO 'test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=D/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 500.0)
+))
+TO 'test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=E/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+##########
+# TABLE DECLARATIONS
+##########
+
+# Fact table without ordering (for basic aggregate tests)
+statement ok
+CREATE EXTERNAL TABLE fact_table (timestamp TIMESTAMP, value DOUBLE)
+STORED AS PARQUET
+PARTITIONED BY (f_dkey STRING)
+LOCATION 'test_files/scratch/preserve_file_partitioning/fact/';
+
+# Fact table with ordering (for sort elimination tests)
+statement ok
+CREATE EXTERNAL TABLE fact_table_ordered (timestamp TIMESTAMP, value DOUBLE)
+STORED AS PARQUET
+PARTITIONED BY (f_dkey STRING)
+WITH ORDER (f_dkey ASC, timestamp ASC)
+LOCATION 'test_files/scratch/preserve_file_partitioning/fact/';
+
+# Dimension table (for join tests)
+statement ok
+CREATE EXTERNAL TABLE dimension_table (d_dkey STRING, env STRING, service STRING, host STRING)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/preserve_file_partitioning/dimension/';
+
+# Hive-partitioned dimension table (3 partitions matching fact_table for Partitioned join tests)
+statement ok
+CREATE EXTERNAL TABLE dimension_table_partitioned (env STRING, service STRING)
+STORED AS PARQUET
+PARTITIONED BY (d_dkey STRING)
+LOCATION 'test_files/scratch/preserve_file_partitioning/dimension_partitioned/';
+
+# 'High'-cardinality fact table (5 partitions > 3 target_partitions)
+statement ok
+CREATE EXTERNAL TABLE high_cardinality_table (timestamp TIMESTAMP, value DOUBLE)
+STORED AS PARQUET
+PARTITIONED BY (f_dkey STRING)
+LOCATION 'test_files/scratch/preserve_file_partitioning/high_cardinality/';
+
+##########
+# TEST 1: Basic Aggregate - Without Optimization
+# Shows RepartitionExec and two-phase aggregation
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+query TT
+EXPLAIN SELECT f_dkey, count(*), sum(value) FROM fact_table GROUP BY f_dkey;
+----
+logical_plan
+01)Projection: fact_table.f_dkey, count(Int64(1)) AS count(*), sum(fact_table.value)
+02)--Aggregate: groupBy=[[fact_table.f_dkey]], aggr=[[count(Int64(1)), sum(fact_table.value)]]
+03)----TableScan: fact_table projection=[value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), sum(fact_table.value)@2 as sum(fact_table.value)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey], aggr=[count(Int64(1)), sum(fact_table.value)]
+03)----RepartitionExec: partitioning=Hash([f_dkey@0], 3), input_partitions=3
+04)------AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), sum(fact_table.value)]
+05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet
+
+# Verify results without optimization
+query TIR rowsort
+SELECT f_dkey, count(*), sum(value) FROM fact_table GROUP BY f_dkey;
+----
+A 7 772.4
+B 7 614.4
+C 7 2017.6
+
+##########
+# TEST 2: Basic Aggregate - With Optimization
+# Shows SinglePartitioned mode, no RepartitionExec
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+query TT
+EXPLAIN SELECT f_dkey, count(*), sum(value) FROM fact_table GROUP BY f_dkey;
+----
+logical_plan
+01)Projection: fact_table.f_dkey, count(Int64(1)) AS count(*), sum(fact_table.value)
+02)--Aggregate: groupBy=[[fact_table.f_dkey]], aggr=[[count(Int64(1)), sum(fact_table.value)]]
+03)----TableScan: fact_table projection=[value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), sum(fact_table.value)@2 as sum(fact_table.value)]
+02)--AggregateExec: mode=SinglePartitioned, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), sum(fact_table.value)]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet
+
+# Verify results with optimization match results without optimization
+query TIR rowsort
+SELECT f_dkey, count(*), sum(value) FROM fact_table GROUP BY f_dkey;
+----
+A 7 772.4
+B 7 614.4
+C 7 2017.6
+
+##########
+# TEST 3: Aggregate with ORDER BY - Without Optimization
+# Shows SortExec and RepartitionExec
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+query TT
+EXPLAIN SELECT f_dkey, count(*), avg(value) FROM fact_table_ordered GROUP BY f_dkey ORDER BY f_dkey;
+----
+logical_plan
+01)Sort: fact_table_ordered.f_dkey ASC NULLS LAST
+02)--Projection: fact_table_ordered.f_dkey, count(Int64(1)) AS count(*), avg(fact_table_ordered.value)
+03)----Aggregate: groupBy=[[fact_table_ordered.f_dkey]], aggr=[[count(Int64(1)), avg(fact_table_ordered.value)]]
+04)------TableScan: fact_table_ordered projection=[value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), avg(fact_table_ordered.value)@2 as avg(fact_table_ordered.value)]
+03)----AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+04)------SortExec: expr=[f_dkey@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([f_dkey@0], 3), input_partitions=3
+06)----------AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+07)------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet
+
+# Verify results without optimization
+query TIR
+SELECT f_dkey, count(*), avg(value) FROM fact_table_ordered GROUP BY f_dkey ORDER BY f_dkey;
+----
+A 7 110.342857142857
+B 7 87.771428571429
+C 7 288.228571428571
+
+##########
+# TEST 4: Aggregate with ORDER BY - With Optimization
+# No SortExec, no RepartitionExec, just SortPreservingMergeExec
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+query TT
+EXPLAIN SELECT f_dkey, count(*), avg(value) FROM fact_table_ordered GROUP BY f_dkey ORDER BY f_dkey;
+----
+logical_plan
+01)Sort: fact_table_ordered.f_dkey ASC NULLS LAST
+02)--Projection: fact_table_ordered.f_dkey, count(Int64(1)) AS count(*), avg(fact_table_ordered.value)
+03)----Aggregate: groupBy=[[fact_table_ordered.f_dkey]], aggr=[[count(Int64(1)), avg(fact_table_ordered.value)]]
+04)------TableScan: fact_table_ordered projection=[value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), avg(fact_table_ordered.value)@2 as avg(fact_table_ordered.value)]
+03)----AggregateExec: mode=SinglePartitioned, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet
+
+query TIR
+SELECT f_dkey, count(*), avg(value) FROM fact_table_ordered GROUP BY f_dkey ORDER BY f_dkey;
+----
+A 7 110.342857142857
+B 7 87.771428571429
+C 7 288.228571428571
+
+##########
+# TEST 5: Join with Hash Partitioning Propagation - Without Optimization
+# CollectLeft join followed by RepartitionExec and SortExec for aggregate
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+query TT
+EXPLAIN SELECT f.f_dkey, MAX(d.env), MAX(d.service), count(*), sum(f.value)
+FROM fact_table_ordered f
+INNER JOIN dimension_table d ON f.f_dkey = d.d_dkey
+WHERE d.service = 'log'
+GROUP BY f.f_dkey
+ORDER BY f.f_dkey;
+----
+logical_plan
+01)Sort: f.f_dkey ASC NULLS LAST
+02)--Projection: f.f_dkey, max(d.env), max(d.service), count(Int64(1)) AS count(*), sum(f.value)
+03)----Aggregate: groupBy=[[f.f_dkey]], aggr=[[max(d.env), max(d.service), count(Int64(1)), sum(f.value)]]
+04)------Projection: f.value, f.f_dkey, d.env, d.service
+05)--------Inner Join: f.f_dkey = d.d_dkey
+06)----------SubqueryAlias: f
+07)------------TableScan: fact_table_ordered projection=[value, f_dkey]
+08)----------SubqueryAlias: d
+09)------------Filter: dimension_table.service = Utf8View("log")
+10)--------------TableScan: dimension_table projection=[d_dkey, env, service], partial_filters=[dimension_table.service = Utf8View("log")]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, max(d.env)@1 as max(d.env), max(d.service)@2 as max(d.service), count(Int64(1))@3 as count(*), sum(f.value)@4 as sum(f.value)]
+03)----AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey], aggr=[max(d.env), max(d.service), count(Int64(1)), sum(f.value)], ordering_mode=Sorted
+04)------SortExec: expr=[f_dkey@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([f_dkey@0], 3), input_partitions=3
+06)----------AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey], aggr=[max(d.env), max(d.service), count(Int64(1)), sum(f.value)], ordering_mode=Sorted
+07)------------ProjectionExec: expr=[value@2 as value, f_dkey@3 as f_dkey, env@0 as env, service@1 as service]
+08)--------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d_dkey@0, f_dkey@1)], projection=[env@1, service@2, value@3, f_dkey@4]
+09)----------------CoalescePartitionsExec
+10)------------------FilterExec: service@2 = log
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1
+12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension/data.parquet]]}, projection=[d_dkey, env, service], file_type=parquet, predicate=service@2 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)]
+13)----------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify results without optimization
+query TTTIR rowsort
+SELECT f.f_dkey, MAX(d.env), MAX(d.service), count(*), sum(f.value)
+FROM fact_table_ordered f
+INNER JOIN dimension_table d ON f.f_dkey = d.d_dkey
+WHERE d.service = 'log'
+GROUP BY f.f_dkey
+ORDER BY f.f_dkey;
+----
+A dev log 7 772.4
+B prod log 7 614.4
+C prod log 7 2017.6
+
+##########
+# TEST 6: Join with Hash Partitioning Propagation - With Optimization
+# Hash partitioning propagates through join, no RepartitionExec/SortExec after join
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+query TT
+EXPLAIN SELECT f.f_dkey, MAX(d.env), MAX(d.service), count(*), sum(f.value)
+FROM fact_table_ordered f
+INNER JOIN dimension_table d ON f.f_dkey = d.d_dkey
+WHERE d.service = 'log'
+GROUP BY f.f_dkey
+ORDER BY f.f_dkey;
+----
+logical_plan
+01)Sort: f.f_dkey ASC NULLS LAST
+02)--Projection: f.f_dkey, max(d.env), max(d.service), count(Int64(1)) AS count(*), sum(f.value)
+03)----Aggregate: groupBy=[[f.f_dkey]], aggr=[[max(d.env), max(d.service), count(Int64(1)), sum(f.value)]]
+04)------Projection: f.value, f.f_dkey, d.env, d.service
+05)--------Inner Join: f.f_dkey = d.d_dkey
+06)----------SubqueryAlias: f
+07)------------TableScan: fact_table_ordered projection=[value, f_dkey]
+08)----------SubqueryAlias: d
+09)------------Filter: dimension_table.service = Utf8View("log")
+10)--------------TableScan: dimension_table projection=[d_dkey, env, service], partial_filters=[dimension_table.service = Utf8View("log")]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, max(d.env)@1 as max(d.env), max(d.service)@2 as max(d.service), count(Int64(1))@3 as count(*), sum(f.value)@4 as sum(f.value)]
+03)----AggregateExec: mode=SinglePartitioned, gby=[f_dkey@1 as f_dkey], aggr=[max(d.env), max(d.service), count(Int64(1)), sum(f.value)], ordering_mode=Sorted
+04)------ProjectionExec: expr=[value@2 as value, f_dkey@3 as f_dkey, env@0 as env, service@1 as service]
+05)--------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d_dkey@0, f_dkey@1)], projection=[env@1, service@2, value@3, f_dkey@4]
+06)----------CoalescePartitionsExec
+07)------------FilterExec: service@2 = log
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension/data.parquet]]}, projection=[d_dkey, env, service], file_type=parquet, predicate=service@2 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)]
+10)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], output_ordering=[f_dkey@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+query TTTIR rowsort
+SELECT f.f_dkey, MAX(d.env), MAX(d.service), count(*), sum(f.value)
+FROM fact_table_ordered f
+INNER JOIN dimension_table d ON f.f_dkey = d.d_dkey
+WHERE d.service = 'log'
+GROUP BY f.f_dkey
+ORDER BY f.f_dkey;
+----
+A dev log 7 772.4
+B prod log 7 614.4
+C prod log 7 2017.6
+
+##########
+# TEST 7: Window Function - Without Optimization
+# Shows RepartitionExec and SortExec (hash repartition destroys ordering)
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+query TT
+EXPLAIN SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+FROM fact_table_ordered;
+----
+logical_plan
+01)Projection: fact_table_ordered.f_dkey, fact_table_ordered.timestamp, fact_table_ordered.value, row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+02)--WindowAggr: windowExpr=[[row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+03)----TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@2 as f_dkey, timestamp@0 as timestamp, value@1 as value, row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rn]
+02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----SortExec: expr=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], preserve_partitioning=[true]
+04)------RepartitionExec: partitioning=Hash([f_dkey@2], 3), input_partitions=3
+05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+# Verify results without optimization (limited for readability)
+query TPRI rowsort
+SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+FROM fact_table_ordered
+WHERE timestamp < TIMESTAMP '2023-01-01T09:00:30';
+----
+A 2023-01-01T09:00:00 95.5 1
+A 2023-01-01T09:00:10 102.3 2
+A 2023-01-01T09:00:20 98.7 3
+B 2023-01-01T09:00:00 75.2 1
+B 2023-01-01T09:00:10 82.4 2
+B 2023-01-01T09:00:20 78.9 3
+C 2023-01-01T09:00:00 300.5 1
+C 2023-01-01T09:00:10 285.7 2
+C 2023-01-01T09:00:20 310.2 3
+
+##########
+# TEST 8: Window Function - With Optimization
+# No RepartitionExec, no SortExec (data already sorted by f_dkey, timestamp)
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+query TT
+EXPLAIN SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+FROM fact_table_ordered;
+----
+logical_plan
+01)Projection: fact_table_ordered.f_dkey, fact_table_ordered.timestamp, fact_table_ordered.value, row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+02)--WindowAggr: windowExpr=[[row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+03)----TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@2 as f_dkey, timestamp@0 as timestamp, value@1 as value, row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rn]
+02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [fact_table_ordered.f_dkey] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+query TPRI rowsort
+SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (PARTITION BY f_dkey ORDER BY timestamp) as rn
+FROM fact_table_ordered
+WHERE timestamp < TIMESTAMP '2023-01-01T09:00:30';
+----
+A 2023-01-01T09:00:00 95.5 1
+A 2023-01-01T09:00:10 102.3 2
+A 2023-01-01T09:00:20 98.7 3
+B 2023-01-01T09:00:00 75.2 1
+B 2023-01-01T09:00:10 82.4 2
+B 2023-01-01T09:00:20 78.9 3
+C 2023-01-01T09:00:00 300.5 1
+C 2023-01-01T09:00:10 285.7 2
+C 2023-01-01T09:00:20 310.2 3
+
+##########
+# TEST 9: High-Cardinality Partitions (more partitions than target_partitions)
+# Since num_partitions > target_partitions (5 > 3), files are merged using
+# round-robin assignment to ensure exactly target_partitions groups are created.
+##########
+
+# First verify results without optimization
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+query TIR rowsort
+SELECT f_dkey, count(*), sum(value)
+FROM high_cardinality_table
+GROUP BY f_dkey;
+----
+A 1 100
+B 1 200
+C 1 300
+D 1 400
+E 1 500
+
+# Now with optimization - verify plan shows SinglePartitioned mode and no RepartitionExec
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+# Verify the plan uses SinglePartitioned mode with no RepartitionExec
+# The 5 partitions are merged into 3 file groups using round-robin assignment
+query TT
+EXPLAIN SELECT f_dkey, count(*), sum(value) FROM high_cardinality_table GROUP BY f_dkey;
+----
+logical_plan
+01)Projection: high_cardinality_table.f_dkey, count(Int64(1)) AS count(*), sum(high_cardinality_table.value)
+02)--Aggregate: groupBy=[[high_cardinality_table.f_dkey]], aggr=[[count(Int64(1)), sum(high_cardinality_table.value)]]
+03)----TableScan: high_cardinality_table projection=[value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), sum(high_cardinality_table.value)@2 as sum(high_cardinality_table.value)]
+02)--AggregateExec: mode=SinglePartitioned, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), sum(high_cardinality_table.value)]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=A/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=B/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=E/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet
+
+# Verify results with optimization match results without optimization
+query TIR rowsort
+SELECT f_dkey, count(*), sum(value)
+FROM high_cardinality_table
+GROUP BY f_dkey;
+----
+A 1 100
+B 1 200
+C 1 300
+D 1 400
+E 1 500
+
+query R
+SELECT sum(value) FROM high_cardinality_table;
+----
+1500
+
+##########
+# Test 10: Threshold higher than distinct partition count
+##########
+# When preserve_file_partitions threshold is higher than the number of distinct
+# partition values, the optimization should NOT apply and we fall back to split_files.
+# The high_cardinality_table has 5 distinct partition values (A, B, C, D, E).
+# Setting threshold to 10 means we need at least 10 distinct partitions to enable
+# Hash partitioning, so this should show RepartitionExec in the plan.
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 10;
+
+# Verify the plan falls back to regular aggregation with RepartitionExec
+query TT
+EXPLAIN SELECT f_dkey, count(*), sum(value) FROM high_cardinality_table GROUP BY f_dkey;
+----
+logical_plan
+01)Projection: high_cardinality_table.f_dkey, count(Int64(1)) AS count(*), sum(high_cardinality_table.value)
+02)--Aggregate: groupBy=[[high_cardinality_table.f_dkey]], aggr=[[count(Int64(1)), sum(high_cardinality_table.value)]]
+03)----TableScan: high_cardinality_table projection=[value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@0 as f_dkey, count(Int64(1))@1 as count(*), sum(high_cardinality_table.value)@2 as sum(high_cardinality_table.value)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey], aggr=[count(Int64(1)), sum(high_cardinality_table.value)]
+03)----RepartitionExec: partitioning=Hash([f_dkey@0], 3), input_partitions=3
+04)------AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey], aggr=[count(Int64(1)), sum(high_cardinality_table.value)]
+05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=A/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=C/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/high_cardinality/f_dkey=E/data.parquet]]}, projection=[value, f_dkey], file_type=parquet
+
+query TIR rowsort
+SELECT f_dkey, count(*), sum(value)
+FROM high_cardinality_table
+GROUP BY f_dkey;
+----
+A 1 100
+B 1 200
+C 1 300
+D 1 400
+E 1 500
+
+##########
+# TEST 11: Partitioned Join with Matching Partition Counts - Without Optimization
+# fact_table (3 partitions) joins dimension_table_partitioned (3 partitions)
+# Shows RepartitionExec added when preserve_file_partitions is disabled
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 0;
+
+# Force Partitioned join mode (not CollectLeft)
+statement ok
+set datafusion.optimizer.hash_join_single_partition_threshold = 0;
+
+statement ok
+set datafusion.optimizer.hash_join_single_partition_threshold_rows = 0;
+
+query TT
+EXPLAIN SELECT f.f_dkey, d.env, sum(f.value)
+FROM fact_table f
+INNER JOIN dimension_table_partitioned d ON f.f_dkey = d.d_dkey
+GROUP BY f.f_dkey, d.env;
+----
+logical_plan
+01)Aggregate: groupBy=[[f.f_dkey, d.env]], aggr=[[sum(f.value)]]
+02)--Projection: f.value, f.f_dkey, d.env
+03)----Inner Join: f.f_dkey = d.d_dkey
+04)------SubqueryAlias: f
+05)--------TableScan: fact_table projection=[value, f_dkey]
+06)------SubqueryAlias: d
+07)--------TableScan: dimension_table_partitioned projection=[env, d_dkey]
+physical_plan
+01)AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey, env@1 as env], aggr=[sum(f.value)]
+02)--RepartitionExec: partitioning=Hash([f_dkey@0, env@1], 3), input_partitions=3
+03)----AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey, env@2 as env], aggr=[sum(f.value)]
+04)------ProjectionExec: expr=[value@1 as value, f_dkey@2 as f_dkey, env@0 as env]
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@1)], projection=[env@0, value@2, f_dkey@3]
+06)----------RepartitionExec: partitioning=Hash([d_dkey@1], 3), input_partitions=3
+07)------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=C/data.parquet]]}, projection=[env, d_dkey], file_type=parquet
+08)----------RepartitionExec: partitioning=Hash([f_dkey@1], 3), input_partitions=3
+09)------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+query TTR rowsort
+SELECT f.f_dkey, d.env, sum(f.value)
+FROM fact_table f
+INNER JOIN dimension_table_partitioned d ON f.f_dkey = d.d_dkey
+GROUP BY f.f_dkey, d.env;
+----
+A dev 772.4
+B prod 614.4
+C prod 2017.6
+
+##########
+# TEST 12: Partitioned Join with Matching Partition Counts - With Optimization
+# Both tables have 3 partitions matching target_partitions=3
+# No RepartitionExec needed for join - partitions already satisfy the requirement
+# Dynamic filter pushdown is disabled in this mode because preserve_file_partitions
+# reports Hash partitioning for Hive-style file groups, which are not hash-routed.
+##########
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+query TT
+EXPLAIN SELECT f.f_dkey, d.env, sum(f.value)
+FROM fact_table f
+INNER JOIN dimension_table_partitioned d ON f.f_dkey = d.d_dkey
+GROUP BY f.f_dkey, d.env;
+----
+logical_plan
+01)Aggregate: groupBy=[[f.f_dkey, d.env]], aggr=[[sum(f.value)]]
+02)--Projection: f.value, f.f_dkey, d.env
+03)----Inner Join: f.f_dkey = d.d_dkey
+04)------SubqueryAlias: f
+05)--------TableScan: fact_table projection=[value, f_dkey]
+06)------SubqueryAlias: d
+07)--------TableScan: dimension_table_partitioned projection=[env, d_dkey]
+physical_plan
+01)AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey, env@1 as env], aggr=[sum(f.value)]
+02)--RepartitionExec: partitioning=Hash([f_dkey@0, env@1], 3), input_partitions=3
+03)----AggregateExec: mode=Partial, gby=[f_dkey@1 as f_dkey, env@2 as env], aggr=[sum(f.value)]
+04)------ProjectionExec: expr=[value@1 as value, f_dkey@2 as f_dkey, env@0 as env]
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@1)], projection=[env@0, value@2, f_dkey@3]
+06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/dimension_partitioned/d_dkey=C/data.parquet]]}, projection=[env, d_dkey], file_type=parquet
+07)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/preserve_file_partitioning/fact/f_dkey=C/data.parquet]]}, projection=[value, f_dkey], file_type=parquet
+
+query TTR rowsort
+SELECT f.f_dkey, d.env, sum(f.value)
+FROM fact_table f
+INNER JOIN dimension_table_partitioned d ON f.f_dkey = d.d_dkey
+GROUP BY f.f_dkey, d.env;
+----
+A dev 772.4
+B prod 614.4
+C prod 2017.6
+
+##########
+# CLEANUP
+##########
+
+statement ok
+DROP TABLE fact_table;
+
+statement ok
+DROP TABLE fact_table_ordered;
+
+statement ok
+DROP TABLE dimension_table;
+
+statement ok
+DROP TABLE dimension_table_partitioned;
+
+statement ok
+DROP TABLE high_cardinality_table;
diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt
index 0f0cbac1fa323..e18114bc51ca8 100644
--- a/datafusion/sqllogictest/test_files/projection.slt
+++ b/datafusion/sqllogictest/test_files/projection.slt
@@ -167,12 +167,12 @@ set datafusion.explain.logical_plan_only = false
 
 # project cast dictionary
 query T
-SELECT 
-    CASE 
+SELECT
+    CASE
         WHEN cpu_load_short.host IS NULL THEN ''
         ELSE cpu_load_short.host
     END AS host
-FROM 
+FROM
     cpu_load_short;
 ----
 host1
@@ -252,3 +252,29 @@ physical_plan
 
 statement ok
 drop table t;
+
+# Regression test for
+# https://github.com/apache/datafusion/issues/17513
+
+query I
+COPY (select 1 as a, 2 as b)
+TO 'test_files/scratch/projection/17513.parquet'
+STORED AS PARQUET;
+----
+1
+
+statement ok
+create external table t1 stored as parquet location 'test_files/scratch/projection/17513.parquet';
+
+query TT
+explain format indent
+select from t1 where t1.a > 1;
+----
+logical_plan
+01)Projection:
+02)--Filter: t1.a > Int64(1)
+03)----TableScan: t1 projection=[a], partial_filters=[t1.a > Int64(1)]
+physical_plan
+01)FilterExec: a@0 > 1, projection=[]
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection/17513.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 > 1, pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > 1, required_guarantees=[]
diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt
new file mode 100644
index 0000000000000..0161bf4118f38
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt
@@ -0,0 +1,1992 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+# Tests for projection pushdown behavior with get_field expressions
+#
+# This file tests the ExtractTrivialProjections optimizer rule and
+# physical projection pushdown for:
+# - get_field expressions (struct field access like s['foo'])
+# - Pushdown through Filter, Sort, and TopK operators
+# - Multi-partition scenarios with SortPreservingMergeExec
+##########
+
+#####################
+# Section 1: Setup - Single Partition Tests
+#####################
+
+# Set target_partitions = 1 for deterministic plan output
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+# Create parquet file with struct column containing value and label fields
+statement ok
+COPY (
+    SELECT
+        column1 as id,
+        column2 as s
+    FROM VALUES
+        (1, {value: 100, label: 'alpha'}),
+        (2, {value: 200, label: 'beta'}),
+        (3, {value: 150, label: 'gamma'}),
+        (4, {value: 300, label: 'delta'}),
+        (5, {value: 250, label: 'epsilon'})
+) TO 'test_files/scratch/projection_pushdown/simple.parquet'
+STORED AS PARQUET;
+
+# Create table for simple struct tests
+statement ok
+CREATE EXTERNAL TABLE simple_struct STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/simple.parquet';
+
+# Create parquet file with nested struct column
+statement ok
+COPY (
+    SELECT
+        column1 as id,
+        column2 as nested
+    FROM VALUES
+        (1, {outer: {inner: 10, name: 'one'}, extra: 'x'}),
+        (2, {outer: {inner: 20, name: 'two'}, extra: 'y'}),
+        (3, {outer: {inner: 30, name: 'three'}, extra: 'z'})
+) TO 'test_files/scratch/projection_pushdown/nested.parquet'
+STORED AS PARQUET;
+
+# Create table for nested struct tests
+statement ok
+CREATE EXTERNAL TABLE nested_struct STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/nested.parquet';
+
+# Create parquet file with nullable struct column
+statement ok
+COPY (
+    SELECT
+        column1 as id,
+        column2 as s
+    FROM VALUES
+        (1, {value: 100, label: 'alpha'}),
+        (2, NULL),
+        (3, {value: 150, label: 'gamma'}),
+        (4, NULL),
+        (5, {value: 250, label: 'epsilon'})
+) TO 'test_files/scratch/projection_pushdown/nullable.parquet'
+STORED AS PARQUET;
+
+# Create table for nullable struct tests
+statement ok
+CREATE EXTERNAL TABLE nullable_struct STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/nullable.parquet';
+
+
+#####################
+# Section 2: Basic get_field Pushdown (Projection above scan)
+#####################
+
+###
+# Test 2.1: Simple s['value']
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct;
+----
+logical_plan
+01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value"))
+02)--TableScan: simple_struct projection=[id, s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY id;
+----
+1 100
+2 200
+3 150
+4 300
+5 250
+
+query TT
+EXPLAIN SELECT s['label'] FROM simple_struct;
+----
+logical_plan
+01)Projection: get_field(simple_struct.s, Utf8("label"))
+02)--TableScan: simple_struct projection=[s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as simple_struct.s[label]], file_type=parquet
+
+# Verify correctness
+query T
+SELECT s['label'] FROM simple_struct ORDER BY s['label'];
+----
+alpha
+beta
+delta
+epsilon
+gamma
+
+###
+# Test 2.2: Multiple get_field expressions
+###
+
+query TT
+EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct;
+----
+logical_plan
+01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label"))
+02)--TableScan: simple_struct projection=[id, s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet
+
+# Verify correctness
+query IIT
+SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id;
+----
+1 100 alpha
+2 200 beta
+3 150 gamma
+4 300 delta
+5 250 epsilon
+
+###
+# Test 2.3: Nested s['outer']['inner']
+###
+
+query TT
+EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct;
+----
+logical_plan
+01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner"))
+02)--TableScan: nested_struct projection=[id, nested]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id;
+----
+1 10
+2 20
+3 30
+
+###
+# Test 2.4: s['value'] + 1
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct;
+----
+logical_plan
+01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1)
+02)--TableScan: simple_struct projection=[id, s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id;
+----
+1 101
+2 201
+3 151
+4 301
+5 251
+
+###
+# Test 2.5: s['label'] || '_suffix'
+###
+
+query TT
+EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct;
+----
+logical_plan
+01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix")
+02)--TableScan: simple_struct projection=[id, s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet
+
+# Verify correctness
+query IT
+SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id;
+----
+1 alpha_suffix
+2 beta_suffix
+3 gamma_suffix
+4 delta_suffix
+5 epsilon_suffix
+
+
+#####################
+# Section 3: Projection Pushdown Through FilterExec
+#####################
+
+###
+# Test 3.1: Simple get_field through Filter
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value]
+02)--Filter: simple_struct.id > Int64(2)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]]
+02)--FilterExec: id@1 > 2
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct WHERE id > 2 ORDER BY id;
+----
+3 150
+4 300
+5 250
+
+###
+# Test 3.2: s['value'] + 1 through Filter
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1)
+02)--Filter: simple_struct.id > Int64(2)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)]
+02)--FilterExec: id@1 > 2
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2 ORDER BY id;
+----
+3 151
+4 301
+5 251
+
+###
+# Test 3.3: Filter on get_field expression
+###
+
+query TT
+EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[label]
+02)--Filter: __datafusion_extracted_1 > Int64(150)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as simple_struct.s[label]]
+02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1, __datafusion_extracted_2@2]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet, predicate=get_field(s@1, value) > 150
+
+# Verify correctness
+query IT
+SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150 ORDER BY id;
+----
+2 beta
+4 delta
+5 epsilon
+
+
+#####################
+# Section 4: Projection Pushdown Through SortExec (no LIMIT)
+#####################
+
+###
+# Test 4.1: Simple get_field through Sort
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY id;
+----
+1 100
+2 200
+3 150
+4 300
+5 250
+
+###
+# Test 4.2: s['value'] + 1 through Sort - split projection
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1)
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id;
+----
+1 101
+2 201
+3 151
+4 301
+5 251
+
+###
+# Test 4.3: Sort by get_field expression
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value'];
+----
+logical_plan
+01)Sort: simple_struct.s[value] ASC NULLS LAST
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY s['value'];
+----
+1 100
+3 150
+2 200
+5 250
+4 300
+
+###
+# Test 4.4: Projection with duplicate column through Sort
+# The projection expands the number of columns from 3 to 4 by introducing `col_b_dup`
+###
+
+statement ok
+COPY (
+    SELECT
+        column1 as col_a,
+        column2 as col_b,
+        column3 as col_c
+    FROM VALUES
+        (1, 2, 3),
+        (4, 5, 6),
+        (7, 8, 9)
+) TO 'test_files/scratch/projection_pushdown/three_cols.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE three_cols STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/three_cols.parquet';
+
+query TT
+EXPLAIN SELECT col_a, col_b, col_c, col_b as col_b_dup FROM three_cols ORDER BY col_a;
+----
+logical_plan
+01)Sort: three_cols.col_a ASC NULLS LAST
+02)--Projection: three_cols.col_a, three_cols.col_b, three_cols.col_c, three_cols.col_b AS col_b_dup
+03)----TableScan: three_cols projection=[col_a, col_b, col_c]
+physical_plan
+01)SortExec: expr=[col_a@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/three_cols.parquet]]}, projection=[col_a, col_b, col_c, col_b@1 as col_b_dup], file_type=parquet
+
+# Verify correctness
+query IIII
+SELECT col_a, col_b, col_c, col_b as col_b_dup FROM three_cols ORDER BY col_a DESC;
+----
+7 8 9 8
+4 5 6 5
+1 2 3 2
+
+statement ok
+DROP TABLE three_cols;
+
+
+#####################
+# Section 5: Projection Pushdown Through TopK (ORDER BY + LIMIT)
+#####################
+
+###
+# Test 5.1: Simple get_field through TopK
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 100
+2 200
+3 150
+
+###
+# Test 5.2: s['value'] + 1 through TopK
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1)
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 101
+2 201
+3 151
+
+###
+# Test 5.3: Multiple get_field through TopK
+###
+
+query TT
+EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query IIT
+SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 100 alpha
+2 200 beta
+3 150 gamma
+
+###
+# Test 5.4: Nested get_field through TopK
+###
+
+query TT
+EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2;
+----
+logical_plan
+01)Sort: nested_struct.id ASC NULLS LAST, fetch=2
+02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner"))
+03)----TableScan: nested_struct projection=[id, nested]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT 2;
+----
+1 10
+2 20
+
+###
+# Test 5.5: String concat through TopK
+###
+
+query TT
+EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix")
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query IT
+SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 alpha_suffix
+2 beta_suffix
+3 gamma_suffix
+
+
+#####################
+# Section 6: Combined Operators (Filter + Sort/TopK)
+#####################
+
+###
+# Test 6.1: Filter + Sort + get_field
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'];
+----
+logical_plan
+01)Sort: simple_struct.s[value] ASC NULLS LAST
+02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value]
+03)----Filter: simple_struct.id > Int64(1)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]]
+03)----FilterExec: id@1 > 1
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'];
+----
+3 150
+2 200
+5 250
+4 300
+
+###
+# Test 6.2: Filter + TopK + get_field
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2;
+----
+logical_plan
+01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2
+02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value]
+03)----Filter: simple_struct.id > Int64(1)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]]
+03)----FilterExec: id@1 > 1
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value'] LIMIT 2;
+----
+3 150
+2 200
+
+###
+# Test 6.3: Filter + TopK + get_field with arithmetic
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=2
+02)--Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1)
+03)----Filter: simple_struct.id > Int64(1)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)]
+03)----FilterExec: id@1 > 1
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2;
+----
+2 201
+3 151
+
+
+#####################
+# Section 7: Multi-Partition Tests
+#####################
+
+# Set target_partitions = 4 for parallel execution
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+# Create 5 parquet files (more than partitions) for parallel tests
+statement ok
+COPY (SELECT 1 as id, {value: 100, label: 'alpha'} as s)
+TO 'test_files/scratch/projection_pushdown/multi/part1.parquet'
+STORED AS PARQUET;
+
+statement ok
+COPY (SELECT 2 as id, {value: 200, label: 'beta'} as s)
+TO 'test_files/scratch/projection_pushdown/multi/part2.parquet'
+STORED AS PARQUET;
+
+statement ok
+COPY (SELECT 3 as id, {value: 150, label: 'gamma'} as s)
+TO 'test_files/scratch/projection_pushdown/multi/part3.parquet'
+STORED AS PARQUET;
+
+statement ok
+COPY (SELECT 4 as id, {value: 300, label: 'delta'} as s)
+TO 'test_files/scratch/projection_pushdown/multi/part4.parquet'
+STORED AS PARQUET;
+
+statement ok
+COPY (SELECT 5 as id, {value: 250, label: 'epsilon'} as s)
+TO 'test_files/scratch/projection_pushdown/multi/part5.parquet'
+STORED AS PARQUET;
+
+# Create table from multiple parquet files
+statement ok
+CREATE EXTERNAL TABLE multi_struct STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/multi/';
+
+###
+# Test 7.1: Multi-partition Sort with get_field
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id;
+----
+logical_plan
+01)Sort: multi_struct.id ASC NULLS LAST
+02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value"))
+03)----TableScan: multi_struct projection=[id, s]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM multi_struct ORDER BY id;
+----
+1 100
+2 200
+3 150
+4 300
+5 250
+
+###
+# Test 7.2: Multi-partition TopK with get_field
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: multi_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value"))
+03)----TableScan: multi_struct projection=[id, s]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3
+02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) as multi_struct.s[value]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3;
+----
+1 100
+2 200
+3 150
+
+###
+# Test 7.3: Multi-partition TopK with arithmetic (non-trivial stays above merge)
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: multi_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1)
+03)----TableScan: multi_struct projection=[id, s]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3
+02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, get_field(s@1, value) + 1 as multi_struct.s[value] + Int64(1)], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3;
+----
+1 101
+2 201
+3 151
+
+###
+# Test 7.4: Multi-partition Filter with get_field
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id;
+----
+logical_plan
+01)Sort: multi_struct.id ASC NULLS LAST
+02)--Projection: multi_struct.id, __datafusion_extracted_1 AS multi_struct.s[value]
+03)----Filter: multi_struct.id > Int64(2)
+04)------Projection: get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_1, multi_struct.id
+05)--------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as multi_struct.s[value]]
+04)------FilterExec: id@1 > 2
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
+06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id;
+----
+3 150
+4 300
+5 250
+
+###
+# Test 7.5: Aggregation with get_field (CoalescePartitions)
+###
+
+query TT
+EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label'];
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS multi_struct.s[label], sum(__datafusion_extracted_2) AS sum(multi_struct.s[value])
+02)--Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[sum(__datafusion_extracted_2)]]
+03)----Projection: get_field(multi_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_2
+04)------TableScan: multi_struct projection=[s]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as multi_struct.s[label], sum(__datafusion_extracted_2)@1 as sum(multi_struct.s[value])]
+02)--AggregateExec: mode=FinalPartitioned, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)]
+03)----RepartitionExec: partitioning=Hash([__datafusion_extracted_1@0], 4), input_partitions=3
+04)------AggregateExec: mode=Partial, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)]
+05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet
+
+# Verify correctness
+query TI
+SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label'] ORDER BY s['label'];
+----
+alpha 100
+beta 200
+delta 300
+epsilon 250
+gamma 150
+
+
+#####################
+# Section 8: Edge Cases
+#####################
+
+# Reset to single partition for edge case tests
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+###
+# Test 8.1: get_field on nullable struct column
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM nullable_struct;
+----
+logical_plan
+01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value"))
+02)--TableScan: nullable_struct projection=[id, s]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet
+
+# Verify correctness (NULL struct returns NULL field)
+query II
+SELECT id, s['value'] FROM nullable_struct ORDER BY id;
+----
+1 100
+2 NULL
+3 150
+4 NULL
+5 250
+
+###
+# Test 8.2: get_field returning NULL values
+###
+
+query TT
+EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL;
+----
+logical_plan
+01)Projection: nullable_struct.id, __datafusion_extracted_2 AS nullable_struct.s[label]
+02)--Filter: __datafusion_extracted_1 IS NOT NULL
+03)----Projection: get_field(nullable_struct.s, Utf8("value")) AS __datafusion_extracted_1, nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) AS __datafusion_extracted_2
+04)------TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as nullable_struct.s[label]]
+02)--FilterExec: __datafusion_extracted_1@0 IS NOT NULL, projection=[id@1, __datafusion_extracted_2@2]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet, predicate=get_field(s@1, value) IS NOT NULL
+
+# Verify correctness
+query IT
+SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL ORDER BY id;
+----
+1 alpha
+3 gamma
+5 epsilon
+
+###
+# Test 8.3: Mixed trivial and non-trivial in same projection
+###
+
+query TT
+EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("value")) + Int64(10), get_field(simple_struct.s, Utf8("label"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, value) + 10 as simple_struct.s[value] + Int64(10), get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query IIIT
+SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 100 110 alpha
+2 200 210 beta
+3 150 160 gamma
+
+###
+# Test 8.4: Literal projection through TopK
+###
+
+query TT
+EXPLAIN SELECT id, 42 as constant FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, Int64(42) AS constant
+03)----TableScan: simple_struct projection=[id]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, 42 as constant], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, 42 as constant FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 42
+2 42
+3 42
+
+###
+# Test 8.5: Simple column through TopK (baseline comparison)
+###
+
+query TT
+EXPLAIN SELECT id FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--TableScan: simple_struct projection=[id]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query I
+SELECT id FROM simple_struct ORDER BY id LIMIT 3;
+----
+1
+2
+3
+
+
+#####################
+# Section 9: Coverage Tests - Edge Cases for Uncovered Code Paths
+#####################
+
+###
+# Test 9.1: TopK with computed projection
+###
+
+query TT
+EXPLAIN SELECT id, id + 100 as computed FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, simple_struct.id + Int64(100) AS computed
+03)----TableScan: simple_struct projection=[id]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, id@0 + 100 as computed], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, id + 100 as computed FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 101
+2 102
+3 103
+
+###
+# Test 9.2: Duplicate get_field expressions (same expression referenced twice)
+# Common subexpression elimination happens in the logical plan, and the physical
+# plan extracts the shared get_field for efficient computation
+###
+
+query TT
+EXPLAIN SELECT (id + s['value']) * (id + s['value']) as id_and_value FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: __common_expr_1 * __common_expr_1 AS id_and_value
+02)--Projection: simple_struct.id + __datafusion_extracted_2 AS __common_expr_1
+03)----Filter: simple_struct.id > Int64(2)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__common_expr_1@0 * __common_expr_1@0 as id_and_value]
+02)--ProjectionExec: expr=[id@1 + __datafusion_extracted_2@0 as __common_expr_1]
+03)----FilterExec: id@1 > 2
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+
+query TT
+EXPLAIN SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 + __datafusion_extracted_1 AS doubled
+02)--Filter: simple_struct.id > Int64(2)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 + __datafusion_extracted_1@0 as doubled]
+02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query I
+SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2 ORDER BY doubled;
+----
+300
+500
+600
+
+###
+# Test 9.3: Projection with only get_field expressions through Filter
+###
+
+query TT
+EXPLAIN SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label]
+02)--Filter: simple_struct.id > Int64(2)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label]]
+02)--FilterExec: id@2 > 2, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query IT
+SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2 ORDER BY s['value'];
+----
+150 gamma
+250 epsilon
+300 delta
+
+###
+# Test 9.4: Mixed column reference with get_field in expression through TopK
+# Tests column remapping in finalize_outer_exprs when outer expr references both extracted and original columns
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + id as combined FROM simple_struct ORDER BY id LIMIT 3;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=3
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + simple_struct.id AS combined
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + id@0 as combined], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT id, s['value'] + id as combined FROM simple_struct ORDER BY id LIMIT 3;
+----
+1 101
+2 202
+3 153
+
+###
+# Test 9.5: Multiple get_field from same struct in expression through Filter
+# Tests extraction when base struct is shared across multiple get_field calls
+###
+
+query TT
+EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS Int64) AS score
+02)--Filter: simple_struct.id > Int64(1)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 * 2 + CAST(character_length(__datafusion_extracted_2@1) AS Int64) as score]
+02)--FilterExec: id@2 > 1, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness
+query I
+SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1 ORDER BY score;
+----
+305
+404
+507
+605
+
+
+#####################
+# Section 10: Literal with get_field Expressions
+#####################
+
+###
+# Test 10.1: Literal constant + get_field in same projection
+# Tests projection with both trivial (literal) and get_field expressions
+###
+
+query TT
+EXPLAIN SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY id LIMIT 2;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=2
+02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, 42 as answer, get_field(s@1, label) as simple_struct.s[label]], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query IIT
+SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY id LIMIT 2;
+----
+1 42 alpha
+2 42 beta
+
+###
+# Test 10.2: Multiple non-trivial get_field expressions together
+# Tests arithmetic on one field and string concat on another in same projection
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct ORDER BY id LIMIT 2;
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, fetch=2
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test")
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 100 as simple_struct.s[value] + Int64(100), get_field(s@1, label) || _test as simple_struct.s[label] || Utf8("_test")], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query IIT
+SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct ORDER BY id LIMIT 2;
+----
+1 200 alpha_test
+2 300 beta_test
+
+#####################
+# Section 11: FilterExec Projection Pushdown - Handling Predicate Column Requirements
+#####################
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value]
+02)--Filter: simple_struct.id > Int64(1)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]]
+02)--FilterExec: id@1 > 1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 2;
+----
+2 200
+3 150
+
+query TT
+EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5);
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS simple_struct.s[value]
+02)--Filter: simple_struct.id > Int64(1) AND (simple_struct.id < Int64(4) OR simple_struct.id = Int64(5))
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]]
+02)--FilterExec: id@1 > 1 AND (id@1 < 4 OR id@1 = 5), projection=[__datafusion_extracted_1@0]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[]
+
+# Verify correctness - should return rows where (id > 1) AND ((id < 4) OR (id = 5))
+# That's: id=2,3 (1<id<4) and id=5 (id=5 from second branch)
+query I
+SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5) ORDER BY s['value'];
+----
+150
+200
+250
+
+query TT
+EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND id < 5;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS simple_struct.s[value]
+02)--Filter: simple_struct.id > Int64(1) AND simple_struct.id < Int64(5)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]]
+02)--FilterExec: id@1 > 1 AND id@1 < 5, projection=[__datafusion_extracted_1@0]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[]
+
+# Verify correctness - should return rows where 1 < id < 5 (id=2,3,4)
+query I
+SELECT s['value'] FROM simple_struct WHERE id > 1 AND id < 5 ORDER BY s['value'];
+----
+150
+200
+300
+
+query TT
+EXPLAIN SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label], simple_struct.id
+02)--Filter: simple_struct.id > Int64(1)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label], id@2 as id]
+02)--FilterExec: id@2 > 1
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+# Verify correctness - note that id is now at index 2 in the augmented projection
+query ITI
+SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1 ORDER BY id LIMIT 3;
+----
+200 beta 2
+150 gamma 3
+300 delta 4
+
+query TT
+EXPLAIN SELECT s['value'] FROM simple_struct WHERE length(s['label']) > 4;
+----
+logical_plan
+01)Projection: __datafusion_extracted_2 AS simple_struct.s[value]
+02)--Filter: character_length(__datafusion_extracted_1) > Int32(4)
+03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2
+04)------TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_2@0 as simple_struct.s[value]]
+02)--FilterExec: character_length(__datafusion_extracted_1@0) > 4, projection=[__datafusion_extracted_2@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet, predicate=character_length(get_field(s@1, label)) > 4
+
+# Verify correctness - filter on rows where label length > 4 (all have length 5, except 'one' has 3)
+# Wait, from the data: alpha(5), beta(4), gamma(5), delta(5), epsilon(7)
+# So: alpha, gamma, delta, epsilon (not beta which has 4 characters)
+query I
+SELECT s['value'] FROM simple_struct WHERE length(s['label']) > 4 ORDER BY s['value'];
+----
+100
+150
+250
+300
+
+#####################
+# Section 11a: ProjectionExec on top of a SortExec with missing Sort Columns
+#####################
+
+###
+# Test 11a.1: Sort by dropped column
+# Selects only id, drops s entirely, but sorts by s['value']
+###
+
+query TT
+EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'];
+----
+logical_plan
+01)Projection: simple_struct.id
+02)--Sort: __datafusion_extracted_1 ASC NULLS LAST
+03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1
+04)------TableScan: simple_struct projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id]
+02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet
+
+# Verify correctness
+query I
+SELECT id FROM simple_struct ORDER BY s['value'];
+----
+1
+3
+2
+5
+4
+
+###
+# Test 11a.2: Multiple sort columns with partial selection
+# Selects only id and s['value'], but sorts by id and s['label']
+# One sort column (s['label']) is not selected but needed for ordering
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label'];
+----
+logical_plan
+01)Projection: simple_struct.id, simple_struct.s[value]
+02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST
+03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1
+04)------TableScan: simple_struct projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]]
+02)--SortExec: expr=[id@0 ASC NULLS LAST, __datafusion_extracted_1@2 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as __datafusion_extracted_1], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label'];
+----
+1 100
+2 200
+3 150
+4 300
+5 250
+
+
+###
+# Test 11a.3: TopK with dropped sort column
+# Same as test 11a.1 but with LIMIT
+###
+
+query TT
+EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2;
+----
+logical_plan
+01)Projection: simple_struct.id
+02)--Sort: __datafusion_extracted_1 ASC NULLS LAST, fetch=2
+03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1
+04)------TableScan: simple_struct projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id]
+02)--SortExec: TopK(fetch=2), expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query I
+SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2;
+----
+1
+3
+
+###
+# Test 11a.4: Sort by derived expression with dropped column
+# Projects only id, sorts by s['value'] * 2 (derived expression)
+# Sort column is computed but requires base columns not in projection
+###
+
+query TT
+EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] * 2;
+----
+logical_plan
+01)Projection: simple_struct.id
+02)--Sort: __datafusion_extracted_1 * Int64(2) ASC NULLS LAST
+03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1
+04)------TableScan: simple_struct projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[id@0 as id]
+02)--SortExec: expr=[__datafusion_extracted_1@1 * 2 ASC NULLS LAST], preserve_partitioning=[false]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet
+
+# Verify correctness
+query I
+SELECT id FROM simple_struct ORDER BY s['value'] * 2;
+----
+1
+3
+2
+5
+4
+
+###
+# Test 11a.5: All sort columns selected
+# All columns needed for sorting are included in projection
+###
+
+query TT
+EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value'];
+----
+logical_plan
+01)Sort: simple_struct.id ASC NULLS LAST, simple_struct.s[value] ASC NULLS LAST
+02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value"))
+03)----TableScan: simple_struct projection=[id, s]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST, simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet
+
+# Verify correctness
+query II
+SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value'];
+----
+1 100
+2 200
+3 150
+4 300
+5 250
+
+#####################
+# Section 12: Join Tests - get_field Extraction from Join Nodes
+#####################
+
+# Create a second table for join tests
+statement ok
+COPY (
+    SELECT
+        column1 as id,
+        column2 as s
+    FROM VALUES
+        (1, {role: 'admin', level: 10}),
+        (2, {role: 'user', level: 5}),
+        (3, {role: 'guest', level: 1}),
+        (4, {role: 'admin', level: 8}),
+        (5, {role: 'user', level: 3})
+) TO 'test_files/scratch/projection_pushdown/join_right.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE join_right STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/join_right.parquet';
+
+###
+# Test 12.1: Join with get_field in equijoin condition
+# Tests extraction from join ON clause - get_field on each side routed appropriately
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10;
+----
+logical_plan
+01)Projection: simple_struct.id, join_right.id
+02)--Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 * Int64(10)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s]
+05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id
+06)------TableScan: join_right projection=[id, s]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness - value = level * 10
+# simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250)
+# join_right: (1,10), (2,5), (3,1), (4,8), (5,3)
+# Matches: simple_struct.value=100 matches join_right.level*10=100 (level=10, id=1)
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10
+ORDER BY simple_struct.id;
+----
+1 1
+
+###
+# Test 12.2: Join with get_field in non-equi filter
+# Tests extraction from join filter expression - left side only
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > 150;
+----
+logical_plan
+01)Inner Join: simple_struct.id = join_right.id
+02)--Projection: simple_struct.id
+03)----Filter: __datafusion_extracted_1 > Int64(150)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)]
+06)--TableScan: join_right projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)]
+02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=get_field(s@1, value) > 150
+04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness - id matches and value > 150
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > 150
+ORDER BY simple_struct.id;
+----
+2 2
+4 4
+5 5
+
+###
+# Test 12.3: Join with get_field from both sides in filter
+# Tests extraction routing to both left and right inputs
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3;
+----
+logical_plan
+01)Inner Join: simple_struct.id = join_right.id
+02)--Projection: simple_struct.id
+03)----Filter: __datafusion_extracted_1 > Int64(100)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)]
+06)--Projection: join_right.id
+07)----Filter: __datafusion_extracted_2 > Int64(3)
+08)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id
+09)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)]
+02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=get_field(s@1, value) > 100
+04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1]
+05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=get_field(s@1, level) > 3 AND DynamicFilter [ empty ]
+
+# Verify correctness - id matches, value > 100, and level > 3
+# Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250)
+# Of those, level > 3: 2(5), 4(8), 5(3) -> only 2 and 4
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3
+ORDER BY simple_struct.id;
+----
+2 2
+4 4
+
+###
+# Test 12.4: Join with get_field in SELECT projection
+# Tests that get_field in output columns pushes down through the join
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, simple_struct.s['label'], join_right.s['role']
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[label], __datafusion_extracted_2 AS join_right.s[role]
+02)--Inner Join: simple_struct.id = join_right.id
+03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s]
+05)----Projection: get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_2, join_right.id
+06)------TableScan: join_right projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[__datafusion_extracted_1@0, id@1, __datafusion_extracted_2@2]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query ITT
+SELECT simple_struct.id, simple_struct.s['label'], join_right.s['role']
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+ORDER BY simple_struct.id;
+----
+1 alpha admin
+2 beta user
+3 gamma guest
+4 delta admin
+5 epsilon user
+
+###
+# Test 12.5: Join without get_field (baseline - no extraction needed)
+# Verifies no unnecessary projections are added when there's nothing to extract
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id;
+----
+logical_plan
+01)Inner Join: simple_struct.id = join_right.id
+02)--TableScan: simple_struct projection=[id]
+03)--TableScan: join_right projection=[id]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+ORDER BY simple_struct.id;
+----
+1 1
+2 2
+3 3
+4 4
+5 5
+
+###
+# Test 12.6: Left Join with get_field extraction
+# Tests extraction works correctly with outer joins
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, simple_struct.s['value'], join_right.s['level']
+FROM simple_struct
+LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5;
+----
+logical_plan
+01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[value], __datafusion_extracted_3 AS join_right.s[level]
+02)--Left Join: simple_struct.id = join_right.id
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s]
+05)----Projection: join_right.id, __datafusion_extracted_3
+06)------Filter: __datafusion_extracted_1 > Int64(5)
+07)--------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_1, join_right.id, get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_3
+08)----------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)]
+physical_plan
+01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_2@0 as simple_struct.s[value], __datafusion_extracted_3@2 as join_right.s[level]]
+02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[__datafusion_extracted_2@0, id@1, __datafusion_extracted_3@3]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet
+04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2]
+05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet, predicate=get_field(s@1, level) > 5 AND DynamicFilter [ empty ]
+
+# Verify correctness - left join with level > 5 condition
+# Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8)
+query III
+SELECT simple_struct.id, simple_struct.s['value'], join_right.s['level']
+FROM simple_struct
+LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5
+ORDER BY simple_struct.id;
+----
+1 100 10
+2 200 NULL
+3 150 NULL
+4 300 8
+5 250 NULL
+
+#####################
+# Section 13: RepartitionExec tests
+#####################
+
+# Set target partitions to 32 -> this forces a RepartitionExec
+statement ok
+SET datafusion.execution.target_partitions = 32;
+
+query TT
+EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 2;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS simple_struct.s[value]
+02)--Filter: simple_struct.id > Int64(2)
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]]
+02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0]
+03)----RepartitionExec: partitioning=RoundRobinBatch(32), input_partitions=1
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+#####################
+# Section 14: SubqueryAlias tests
+#####################
+
+# Reset target partitions
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+# get_field pushdown through subquery alias with filter
+query TT
+EXPLAIN SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS t.s[value]
+02)--SubqueryAlias: t
+03)----Projection: __datafusion_extracted_1
+04)------Filter: simple_struct.id > Int64(2)
+05)--------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]]
+02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query I
+SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2 ORDER BY t.id;
+----
+150
+300
+250
+
+# Multiple get_field through subquery alias with sort
+query TT
+EXPLAIN SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t ORDER BY t.s['value'];
+----
+logical_plan
+01)Sort: t.s[value] ASC NULLS LAST
+02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label]
+03)----SubqueryAlias: t
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2
+05)--------TableScan: simple_struct projection=[s]
+physical_plan
+01)SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as t.s[value], get_field(s@1, label) as t.s[label]], file_type=parquet
+
+# Verify correctness
+query IT
+SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t ORDER BY t.s['value'];
+----
+100 alpha
+150 gamma
+200 beta
+250 epsilon
+300 delta
+
+# Nested subquery aliases
+query TT
+EXPLAIN SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS u.s[value]
+02)--SubqueryAlias: u
+03)----SubqueryAlias: t
+04)------Projection: __datafusion_extracted_1
+05)--------Filter: simple_struct.id > Int64(2)
+06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as u.s[value]]
+02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[]
+
+# Verify correctness
+query I
+SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2 ORDER BY u.id;
+----
+150
+300
+250
+
+# get_field in filter through subquery alias
+query TT
+EXPLAIN SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 200;
+----
+logical_plan
+01)SubqueryAlias: t
+02)--Projection: simple_struct.id
+03)----Filter: __datafusion_extracted_1 > Int64(200)
+04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)]
+physical_plan
+01)FilterExec: __datafusion_extracted_1@0 > 200, projection=[id@1]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=get_field(s@1, value) > 200
+
+# Verify correctness
+query I
+SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 200 ORDER BY t.id;
+----
+4
+5
+
+#####################
+# Section 15: UNION ALL tests
+#####################
+
+# get_field on UNION ALL result
+query TT
+EXPLAIN SELECT s['value'] FROM (
+    SELECT s FROM simple_struct WHERE id <= 3
+    UNION ALL
+    SELECT s FROM simple_struct WHERE id > 3
+) t;
+----
+logical_plan
+01)Projection: __datafusion_extracted_1 AS t.s[value]
+02)--SubqueryAlias: t
+03)----Union
+04)------Projection: __datafusion_extracted_1
+05)--------Filter: simple_struct.id <= Int64(3)
+06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)]
+08)------Projection: __datafusion_extracted_1
+09)--------Filter: simple_struct.id > Int64(3)
+10)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+11)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]]
+02)--UnionExec
+03)----FilterExec: id@1 <= 3, projection=[__datafusion_extracted_1@0]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[]
+05)----FilterExec: id@1 > 3, projection=[__datafusion_extracted_1@0]
+06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[]
+
+# Verify correctness
+query I
+SELECT s['value'] FROM (
+    SELECT s FROM simple_struct WHERE id <= 3
+    UNION ALL
+    SELECT s FROM simple_struct WHERE id > 3
+) t ORDER BY s['value'];
+----
+100
+150
+200
+250
+300
+
+# Multiple get_field on UNION ALL with ORDER BY
+query TT
+EXPLAIN SELECT s['value'], s['label'] FROM (
+    SELECT s FROM simple_struct WHERE id <= 3
+    UNION ALL
+    SELECT s FROM simple_struct WHERE id > 3
+) t ORDER BY s['value'];
+----
+logical_plan
+01)Sort: t.s[value] ASC NULLS LAST
+02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label]
+03)----SubqueryAlias: t
+04)------Union
+05)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2
+06)----------Filter: simple_struct.id <= Int64(3)
+07)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id
+08)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)]
+09)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2
+10)----------Filter: simple_struct.id > Int64(3)
+11)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id
+12)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)]
+physical_plan
+01)SortPreservingMergeExec: [t.s[value]@0 ASC NULLS LAST]
+02)--SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value], __datafusion_extracted_2@1 as t.s[label]]
+04)------UnionExec
+05)--------FilterExec: id@2 <= 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[]
+07)--------FilterExec: id@2 > 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1]
+08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[]
+
+# Verify correctness
+query IT
+SELECT s['value'], s['label'] FROM (
+    SELECT s FROM simple_struct WHERE id <= 3
+    UNION ALL
+    SELECT s FROM simple_struct WHERE id > 3
+) t ORDER BY s['value'];
+----
+100 alpha
+150 gamma
+200 beta
+250 epsilon
+300 delta
+
+#####################
+# Section 16: Aggregate / Join edge-case tests
+# Translated from unit tests in extract_leaf_expressions.rs
+#####################
+
+###
+# Test 16.1: Projection with get_field above Aggregate
+# Aggregate blocks pushdown, so the get_field stays in the top projection.
+# (mirrors test_projection_with_leaf_expr_above_aggregate)
+###
+
+query TT
+EXPLAIN SELECT s['label'] IS NOT NULL AS has_label, COUNT(1)
+FROM simple_struct GROUP BY s;
+----
+logical_plan
+01)Projection: get_field(simple_struct.s, Utf8("label")) IS NOT NULL AS has_label, count(Int64(1))
+02)--Aggregate: groupBy=[[simple_struct.s]], aggr=[[count(Int64(1))]]
+03)----TableScan: simple_struct projection=[s]
+physical_plan
+01)ProjectionExec: expr=[get_field(s@0, label) IS NOT NULL as has_label, count(Int64(1))@1 as count(Int64(1))]
+02)--AggregateExec: mode=Single, gby=[s@0 as s], aggr=[count(Int64(1))]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet
+
+# Verify correctness - all labels are non-null
+query BI
+SELECT s['label'] IS NOT NULL AS has_label, COUNT(1)
+FROM simple_struct GROUP BY s ORDER BY COUNT(1);
+----
+true 1
+true 1
+true 1
+true 1
+true 1
+
+###
+# Test 16.2: Join with get_field filter on qualified right side
+# The get_field on join_right.s['role'] must be routed to the right input only.
+# (mirrors test_extract_from_join_qualified_right_side)
+###
+
+query TT
+EXPLAIN
+SELECT s.s['value'], j.s['role']
+FROM join_right j
+INNER JOIN simple_struct s ON s.id = j.id
+WHERE s.s['value'] > j.s['level'];
+----
+logical_plan
+01)Projection: __datafusion_extracted_3 AS s.s[value], __datafusion_extracted_4 AS j.s[role]
+02)--Inner Join: j.id = s.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2
+03)----SubqueryAlias: j
+04)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_4, join_right.id
+05)--------TableScan: join_right projection=[id, s]
+06)----SubqueryAlias: s
+07)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_3, simple_struct.id
+08)--------TableScan: simple_struct projection=[id, s]
+physical_plan
+01)ProjectionExec: expr=[__datafusion_extracted_3@1 as s.s[value], __datafusion_extracted_4@0 as j.s[role]]
+02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@2, id@2)], filter=__datafusion_extracted_1@1 > __datafusion_extracted_2@0, projection=[__datafusion_extracted_4@1, __datafusion_extracted_3@4]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, get_field(s@1, role) as __datafusion_extracted_4, id], file_type=parquet
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_3, id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness - only admin roles match (ids 1 and 4)
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right
+  ON simple_struct.id = join_right.id
+  AND join_right.s['role'] = 'admin'
+ORDER BY simple_struct.id;
+----
+1 1
+4 4
+
+###
+# Test 16.3: Join with cross-input get_field comparison in WHERE
+# get_field from each side is extracted and routed to its respective input independently.
+# (mirrors test_extract_from_join_cross_input_expression)
+###
+
+query TT
+EXPLAIN SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > join_right.s['level'];
+----
+logical_plan
+01)Projection: simple_struct.id, join_right.id
+02)--Inner Join: simple_struct.id = join_right.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2
+03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id
+04)------TableScan: simple_struct projection=[id, s]
+05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id
+06)------TableScan: join_right projection=[id, s]
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], filter=__datafusion_extracted_1@0 > __datafusion_extracted_2@1, projection=[id@1, id@3]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet
+03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify correctness - all rows match since value >> level for all ids
+# simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250)
+# join_right:    (1,10),  (2,5),   (3,1),   (4,8),   (5,3)
+query II
+SELECT simple_struct.id, join_right.id
+FROM simple_struct
+INNER JOIN join_right ON simple_struct.id = join_right.id
+WHERE simple_struct.s['value'] > join_right.s['level']
+ORDER BY simple_struct.id;
+----
+1 1
+2 2
+3 3
+4 4
+5 5
+
+# =========================================================================
+# Regression: user-provided __datafusion_extracted aliases must not
+# collide with optimizer-generated ones
+# (https://github.com/apache/datafusion/issues/20430)
+# =========================================================================
+
+statement ok
+COPY ( select {f1: 1, f2: 2} as s
+) TO 'test_files/scratch/projection_pushdown/test.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/projection_pushdown/test.parquet';
+
+# Verify that the user-provided __datafusion_extracted_2 alias is preserved
+# and the optimizer skips to _3 and _4 for its generated aliases.
+query TT
+EXPLAIN SELECT
+    get_field(s, 'f1') AS __datafusion_extracted_2
+FROM t
+WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
+----
+logical_plan
+01)Projection: __datafusion_extracted_2
+02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN __datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1)
+03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, Utf8("f1")) AS __datafusion_extracted_2
+04)------TableScan: t projection=[s], partial_filters=[CASE WHEN get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE get_field(t.s, Utf8("f2")) END = Int64(1)]
+physical_plan
+01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN __datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, projection=[__datafusion_extracted_2@2]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]}, projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], file_type=parquet, predicate=CASE WHEN get_field(s@0, f1) IS NOT NULL THEN get_field(s@0, f1) ELSE get_field(s@0, f2) END = 1
+
+query I
+SELECT
+    get_field(s, 'f1') AS __datafusion_extracted_2
+FROM t
+WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
+----
+1
diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt
deleted file mode 100644
index ed948dd11439a..0000000000000
--- a/datafusion/sqllogictest/test_files/push_down_filter.slt
+++ /dev/null
@@ -1,272 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-
-#   http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Test push down filter
-
-statement ok
-set datafusion.explain.physical_plan_only = true;
-
-statement ok
-CREATE TABLE IF NOT EXISTS v AS VALUES(1,[1,2,3]),(2,[3,4,5]);
-
-query I
-select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2;
-----
-3
-4
-5
-
-# test push down filter for unnest with filter on non-unnest column
-# filter plan is pushed down into projection plan
-query TT
-explain select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2;
-----
-physical_plan
-01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2]
-02)--UnnestExec
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------FilterExec: column1@0 = 2
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-
-query I
-select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3;
-----
-4
-5
-
-# test push down filter for unnest with filter on unnest column
-query TT
-explain select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3;
-----
-physical_plan
-01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as __unnest_placeholder(v.column2,depth=1)]
-06)----------UnnestExec
-07)------------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
-
-query II
-select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2;
-----
-4 2
-5 2
-
-# Could push the filter (column1 = 2) down below unnest
-query TT
-explain select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2;
-----
-physical_plan
-01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2, column1@1 as column1]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3
-04)------UnnestExec
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------FilterExec: column1@0 = 2
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
-
-query II
-select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2;
-----
-3 2
-4 2
-5 2
-
-# only non-unnest filter in AND clause could be pushed down
-query TT
-explain select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2;
-----
-physical_plan
-01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2, column1@1 as column1]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3 OR column1@1 = 2
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------UnnestExec
-06)----------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-
-statement ok
-drop table v;
-
-# test with unnest struct, should not push down filter
-statement ok
-CREATE TABLE d AS VALUES(1,[named_struct('a', 1, 'b', 2)]),(2,[named_struct('a', 3, 'b', 4), named_struct('a', 5, 'b', 6)]);
-
-query I?
-select * from (select column1, unnest(column2) as o from d) where o['a'] = 1;
-----
-1 {a: 1, b: 2}
-
-query TT
-explain select * from (select column1, unnest(column2) as o from d) where o['a'] = 1;
-----
-physical_plan
-01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: get_field(__unnest_placeholder(d.column2,depth=1)@1, a) = 1
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-05)--------UnnestExec
-06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)]
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-
-
-
-statement ok
-drop table d;
-
-
-# Test push down filter with limit for parquet
-statement ok
-set datafusion.execution.parquet.pushdown_filters = true;
-
-# this one is also required to make DF skip second file due to "sufficient" amount of rows
-statement ok
-set datafusion.execution.collect_statistics = true;
-
-# Create a table as a data source
-statement ok
-CREATE TABLE src_table (
-    part_key INT,
-    value INT
-) AS VALUES(1, 0), (1, 1), (1, 100), (2, 0), (2, 2), (2, 2), (2, 100), (3, 4), (3, 5), (3, 6);
-
-
-# There will be more than 2 records filtered from the table to check that `limit 1` actually applied.
-# Setup 3 files, i.e., as many as there are partitions:
-
-# File 1:
-query I
-COPY (SELECT * FROM src_table where part_key = 1)
-TO 'test_files/scratch/parquet/test_filter_with_limit/part-0.parquet'
-STORED AS PARQUET;
-----
-3
-
-# File 2:
-query I
-COPY (SELECT * FROM src_table where part_key = 2)
-TO 'test_files/scratch/parquet/test_filter_with_limit/part-1.parquet'
-STORED AS PARQUET;
-----
-4
-
-# File 3:
-query I
-COPY (SELECT * FROM src_table where part_key = 3)
-TO 'test_files/scratch/parquet/test_filter_with_limit/part-2.parquet'
-STORED AS PARQUET;
-----
-3
-
-statement ok
-CREATE EXTERNAL TABLE test_filter_with_limit
-(
-  part_key INT,
-  value INT
-)
-STORED AS PARQUET
-LOCATION 'test_files/scratch/parquet/test_filter_with_limit/';
-
-query TT
-explain select * from test_filter_with_limit where value = 2 limit 1;
-----
-physical_plan
-01)CoalescePartitionsExec: fetch=1
-02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], limit=1, file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)]
-
-query II
-select * from test_filter_with_limit where value = 2 limit 1;
-----
-2 2
-
-
-# Tear down test_filter_with_limit table:
-statement ok
-DROP TABLE test_filter_with_limit;
-
-# Tear down src_table table:
-statement ok
-DROP TABLE src_table;
-
-
-query I
-COPY (VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10))
-TO 'test_files/scratch/push_down_filter/t.parquet'
-STORED AS PARQUET;
-----
-10
-
-statement ok
-CREATE EXTERNAL TABLE t
-(
-  a INT
-)
-STORED AS PARQUET
-LOCATION 'test_files/scratch/push_down_filter/t.parquet';
-
-
-# The predicate should not have a column cast  when the value is a valid i32
-query TT
-explain select a from t where a = '100';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)]
-
-# The predicate should not have a column cast  when the value is a valid i32
-query TT
-explain select a from t where a != '100';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)]
-
-# The predicate should still have the column cast when the value is a NOT valid i32
-query TT
-explain select a from t where a = '99999999999';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999
-
-# The predicate should still have the column cast when the value is a NOT valid i32
-query TT
-explain select a from t where a = '99.99';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99
-
-# The predicate should still have the column cast when the value is a NOT valid i32
-query TT
-explain select a from t where a = '';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 
-
-# The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information.
-query TT
-explain select a from t where cast(a as string) = '100';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)]
-
-# The predicate should still have the column cast when the literal alters its string representation after round-trip casting (leading zero lost).
-query TT
-explain select a from t where CAST(a AS string) = '0123';
-----
-physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123
-
-
-statement ok
-drop table t;
diff --git a/datafusion/sqllogictest/test_files/push_down_filter_outer_joins.slt b/datafusion/sqllogictest/test_files/push_down_filter_outer_joins.slt
new file mode 100644
index 0000000000000..2e5f7c317fd43
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/push_down_filter_outer_joins.slt
@@ -0,0 +1,264 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test push down filter
+
+# check LEFT/RIGHT joins with filter pushdown to both relations (when possible)
+
+statement ok
+create table t1(k int, v int);
+
+statement ok
+create table t2(k int, v int);
+
+statement ok
+insert into t1 values
+  (1, 10),
+  (2, 20),
+  (3, 30),
+  (null, 40),
+  (50, null),
+  (null, null);
+
+statement ok
+insert into t2 values
+  (1, 11),
+  (2, 21),
+  (2, 22),
+  (null, 41),
+  (51, null),
+  (null, null);
+
+statement ok
+set datafusion.explain.physical_plan_only = false;
+
+statement ok
+set datafusion.explain.logical_plan_only = true;
+
+
+# left join + filter on join key -> pushed
+query TT
+explain select * from t1 left join t2 on t1.k = t2.k where t1.k > 1;
+----
+logical_plan
+01)Left Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--Filter: t2.k > Int32(1)
+05)----TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 left join t2 on t1.k = t2.k where t1.k > 1;
+----
+2 20 2 21
+2 20 2 22
+3 30 NULL NULL
+50 NULL NULL NULL
+
+# left join + filter on another column -> not pushed
+query TT
+explain select * from t1 left join t2 on t1.k = t2.k where t1.v > 1;
+----
+logical_plan
+01)Left Join: t1.k = t2.k
+02)--Filter: t1.v > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 left join t2 on t1.k = t2.k where t1.v > 1;
+----
+1 10 1 11
+2 20 2 21
+2 20 2 22
+3 30 NULL NULL
+NULL 40 NULL NULL
+
+# left join + or + filter on another column -> not pushed
+query TT
+explain select * from t1 left join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+logical_plan
+01)Left Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(3) OR t1.v > Int32(20)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 left join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+3 30 NULL NULL
+50 NULL NULL NULL
+NULL 40 NULL NULL
+
+
+# right join + filter on join key -> pushed
+query TT
+explain select * from t1 right join t2 on t1.k = t2.k where t1.k > 1;
+----
+logical_plan
+01)Inner Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--Filter: t2.k > Int32(1)
+05)----TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 right join t2 on t1.k = t2.k where t1.k > 1;
+----
+2 20 2 21
+2 20 2 22
+
+# right join + filter on another column -> not pushed
+query TT
+explain select * from t1 right join t2 on t1.k = t2.k where t1.v > 1;
+----
+logical_plan
+01)Inner Join: t1.k = t2.k
+02)--Filter: t1.v > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 right join t2 on t1.k = t2.k where t1.v > 1;
+----
+1 10 1 11
+2 20 2 21
+2 20 2 22
+
+# right join + or + filter on another column -> not pushed
+query TT
+explain select * from t1 right join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+logical_plan
+01)Inner Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(3) OR t1.v > Int32(20)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k, v]
+
+query IIII rowsort
+select * from t1 right join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+
+
+# left anti join + filter on join key -> pushed
+query TT
+explain select * from t1 left anti join t2 on t1.k = t2.k where t1.k > 1;
+----
+logical_plan
+01)LeftAnti Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--Filter: t2.k > Int32(1)
+05)----TableScan: t2 projection=[k]
+
+query II rowsort
+select * from t1 left anti join t2 on t1.k = t2.k where t1.k > 1;
+----
+3 30
+50 NULL
+
+# left anti join + filter on another column -> not pushed
+query TT
+explain select * from t1 left anti join t2 on t1.k = t2.k where t1.v > 1;
+----
+logical_plan
+01)LeftAnti Join: t1.k = t2.k
+02)--Filter: t1.v > Int32(1)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k]
+
+query II rowsort
+select * from t1 left anti join t2 on t1.k = t2.k where t1.v > 1;
+----
+3 30
+NULL 40
+
+# left anti join + or + filter on another column -> not pushed
+query TT
+explain select * from t1 left anti join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+logical_plan
+01)LeftAnti Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(3) OR t1.v > Int32(20)
+03)----TableScan: t1 projection=[k, v]
+04)--TableScan: t2 projection=[k]
+
+query II rowsort
+select * from t1 left anti join t2 on t1.k = t2.k where t1.k > 3 or t1.v > 20;
+----
+3 30
+50 NULL
+NULL 40
+
+
+# right anti join + filter on join key -> pushed
+query TT
+explain select * from t1 right anti join t2 on t1.k = t2.k where t2.k > 1;
+----
+logical_plan
+01)RightAnti Join: t1.k = t2.k
+02)--Filter: t1.k > Int32(1)
+03)----TableScan: t1 projection=[k]
+04)--Filter: t2.k > Int32(1)
+05)----TableScan: t2 projection=[k, v]
+
+query II rowsort
+select * from t1 right anti join t2 on t1.k = t2.k where t2.k > 1;
+----
+51 NULL
+
+# right anti join + filter on another column -> not pushed
+query TT
+explain select * from t1 right anti join t2 on t1.k = t2.k where t2.v > 1;
+----
+logical_plan
+01)RightAnti Join: t1.k = t2.k
+02)--TableScan: t1 projection=[k]
+03)--Filter: t2.v > Int32(1)
+04)----TableScan: t2 projection=[k, v]
+
+query II rowsort
+select * from t1 right anti join t2 on t1.k = t2.k where t2.v > 1;
+----
+NULL 41
+
+# right anti join + or + filter on another column -> not pushed
+query TT
+explain select * from t1 right anti join t2 on t1.k = t2.k where t2.k > 3 or t2.v > 20;
+----
+logical_plan
+01)RightAnti Join: t1.k = t2.k
+02)--TableScan: t1 projection=[k]
+03)--Filter: t2.k > Int32(3) OR t2.v > Int32(20)
+04)----TableScan: t2 projection=[k, v]
+
+query II rowsort
+select * from t1 right anti join t2 on t1.k = t2.k where t2.k > 3 or t2.v > 20;
+----
+51 NULL
+NULL 41
+
+
+statement ok
+set datafusion.explain.logical_plan_only = false;
+
+statement ok
+drop table t1;
+
+statement ok
+drop table t2;
diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt
new file mode 100644
index 0000000000000..e1c83c8c330d8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt
@@ -0,0 +1,188 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test push down filter
+
+statement ok
+set datafusion.explain.physical_plan_only = true;
+
+# Test push down filter with limit for parquet
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+# this one is also required to make DF skip second file due to "sufficient" amount of rows
+statement ok
+set datafusion.execution.collect_statistics = true;
+
+# Create a table as a data source
+statement ok
+CREATE TABLE src_table (
+    part_key INT,
+    value INT
+) AS VALUES(1, 0), (1, 1), (1, 100), (2, 0), (2, 2), (2, 2), (2, 100), (3, 4), (3, 5), (3, 6);
+
+
+# There will be more than 2 records filtered from the table to check that `limit 1` actually applied.
+# Setup 3 files, i.e., as many as there are partitions:
+
+# File 1:
+query I
+COPY (SELECT * FROM src_table where part_key = 1)
+TO 'test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-0.parquet'
+STORED AS PARQUET;
+----
+3
+
+# File 2:
+query I
+COPY (SELECT * FROM src_table where part_key = 2)
+TO 'test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-1.parquet'
+STORED AS PARQUET;
+----
+4
+
+# File 3:
+query I
+COPY (SELECT * FROM src_table where part_key = 3)
+TO 'test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-2.parquet'
+STORED AS PARQUET;
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE test_filter_with_limit
+(
+  part_key INT,
+  value INT
+)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/push_down_filter_parquet/test_filter_with_limit/';
+
+query TT
+explain select * from test_filter_with_limit where value = 2 limit 1;
+----
+physical_plan
+01)CoalescePartitionsExec: fetch=1
+02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], limit=1, file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)]
+
+query II
+select * from test_filter_with_limit where value = 2 limit 1;
+----
+2 2
+
+
+# Tear down test_filter_with_limit table:
+statement ok
+DROP TABLE test_filter_with_limit;
+
+# Tear down src_table table:
+statement ok
+DROP TABLE src_table;
+
+
+query I
+COPY (VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10))
+TO 'test_files/scratch/push_down_filter_parquet/t.parquet'
+STORED AS PARQUET;
+----
+10
+
+statement ok
+CREATE EXTERNAL TABLE t
+(
+  a INT
+)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/push_down_filter_parquet/t.parquet';
+
+
+# The predicate should not have a column cast  when the value is a valid i32
+query TT
+explain select a from t where a = '100';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)]
+
+# The predicate should not have a column cast  when the value is a valid i32
+query TT
+explain select a from t where a != '100';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)]
+
+# The predicate should still have the column cast when the value is a NOT valid i32
+query TT
+explain select a from t where a = '99999999999';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999
+
+# The predicate should still have the column cast when the value is a NOT valid i32
+query TT
+explain select a from t where a = '99.99';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99
+
+# The predicate should still have the column cast when the value is a NOT valid i32
+query TT
+explain select a from t where a = '';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 
+
+# The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information.
+query TT
+explain select a from t where cast(a as string) = '100';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)]
+
+# The predicate should still have the column cast when the literal alters its string representation after round-trip casting (leading zero lost).
+query TT
+explain select a from t where CAST(a AS string) = '0123';
+----
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8View) = 0123
+
+
+# Test dynamic filter pushdown with swapped join inputs (issue #17196)
+# Create tables with different sizes to force join input swapping
+statement ok
+copy (select i as k from generate_series(1, 100) t(i)) to 'test_files/scratch/push_down_filter_parquet/small_table.parquet';
+
+statement ok
+copy (select i as k, i as v from generate_series(1, 1000) t(i)) to 'test_files/scratch/push_down_filter_parquet/large_table.parquet';
+
+statement ok
+create external table small_table stored as parquet location 'test_files/scratch/push_down_filter_parquet/small_table.parquet';
+
+statement ok
+create external table large_table stored as parquet location 'test_files/scratch/push_down_filter_parquet/large_table.parquet';
+
+# Test that dynamic filter is applied to the correct table after join input swapping
+# The small_table should be the build side, large_table should be the probe side with dynamic filter
+query TT
+explain select * from small_table join large_table on small_table.k = large_table.k where large_table.v >= 50;
+----
+physical_plan
+01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(k@0, k@0)]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/small_table.parquet]]}, projection=[k], file_type=parquet
+03)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/large_table.parquet]]}, projection=[k, v], file_type=parquet, predicate=v@1 >= 50 AND DynamicFilter [ empty ], pruning_predicate=v_null_count@1 != row_count@2 AND v_max@0 >= 50, required_guarantees=[]
+
+statement ok
+drop table small_table;
+
+statement ok
+drop table large_table;
+
+statement ok
+drop table t;
diff --git a/datafusion/sqllogictest/test_files/push_down_filter_regression.slt b/datafusion/sqllogictest/test_files/push_down_filter_regression.slt
new file mode 100644
index 0000000000000..dd652d6721798
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/push_down_filter_regression.slt
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test push down filter
+
+# Regression test for https://github.com/apache/datafusion/issues/17188
+query I
+COPY (select i as k, i as v from generate_series(1, 10000000) as t(i))
+TO 'test_files/scratch/push_down_filter_regression/t2.parquet'
+STORED AS PARQUET
+OPTIONS ('format.compression' 'uncompressed');
+----
+10000000
+
+statement ok
+create external table t2 stored as parquet location 'test_files/scratch/push_down_filter_regression/t2.parquet';
+
+statement ok
+create external table t1 (k bigint not null) stored as parquet location 'test_files/scratch/push_down_filter_regression/t2.parquet';
+
+# The failure before https://github.com/apache/datafusion/pull/17197 was non-deterministic and random
+# So we'll run the same query a couple of times just to have more certainty it's fixed
+# Sorry about the spam in this slt test...
+
+query III rowsort
+select *
+from t1
+join t2 on t1.k = t2.k
+where v = 1 or v = 10000000
+order by t1.k, t2.v;
+----
+1 1 1
+10000000 10000000 10000000
+
+query III rowsort
+select *
+from t1
+join t2 on t1.k = t2.k
+where v = 1 or v = 10000000
+order by t1.k, t2.v;
+----
+1 1 1
+10000000 10000000 10000000
+
+query III rowsort
+select *
+from t1
+join t2 on t1.k = t2.k
+where v = 1 or v = 10000000
+order by t1.k, t2.v;
+----
+1 1 1
+10000000 10000000 10000000
+
+query III rowsort
+select *
+from t1
+join t2 on t1.k = t2.k
+where v = 1 or v = 10000000
+order by t1.k, t2.v;
+----
+1 1 1
+10000000 10000000 10000000
+
+query III rowsort
+select *
+from t1
+join t2 on t1.k = t2.k
+where v = 1 or v = 10000000
+order by t1.k, t2.v;
+----
+1 1 1
+10000000 10000000 10000000
+
+# Regression test for https://github.com/apache/datafusion/issues/17512
+
+query I
+COPY (
+    SELECT arrow_cast('2025-01-01T00:00:00Z'::timestamptz, 'Timestamp(Microsecond, Some("UTC"))') AS start_timestamp
+)
+TO 'test_files/scratch/push_down_filter_regression/17512.parquet'
+STORED AS PARQUET
+OPTIONS ('format.compression' 'uncompressed');
+----
+1
+
+statement ok
+CREATE EXTERNAL TABLE records STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter_regression/17512.parquet';
+
+query I
+SELECT 1
+FROM (
+    SELECT start_timestamp
+    FROM records
+    WHERE start_timestamp <= '2025-01-01T00:00:00Z'::timestamptz
+) AS t
+WHERE t.start_timestamp::time < '00:00:01'::time;
+----
+1
+
+# Test aggregate dynamic filter pushdown
+# Note: most of the test coverage lives in `datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs`
+# , to compare dynamic filter content easier. Here the tests are simple end-to-end
+# exercises.
+
+statement ok
+set datafusion.explain.format = 'indent';
+
+statement ok
+set datafusion.explain.physical_plan_only = true;
+
+statement ok
+set datafusion.execution.target_partitions = 2;
+
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+statement ok
+set datafusion.optimizer.enable_dynamic_filter_pushdown = true;
+
+statement ok
+set datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown = true;
+
+statement ok
+create external table agg_dyn_test stored as parquet location '../core/tests/data/test_statistics_per_partition';
+
+# Expect dynamic filter available inside data source
+query TT
+explain select max(id) from agg_dyn_test where id > 1;
+----
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[max(agg_dyn_test.id)]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[max(agg_dyn_test.id)]
+04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[]
+
+query I
+select max(id) from agg_dyn_test where id > 1;
+----
+4
+
+# Expect dynamic filter available inside data source
+query TT
+explain select max(id) from agg_dyn_test where (id+1) > 1;
+----
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[max(agg_dyn_test.id)]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[max(agg_dyn_test.id)]
+04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id], file_type=parquet, predicate=CAST(id@0 AS Int64) + 1 > 1 AND DynamicFilter [ empty ]
+
+# Expect dynamic filter available inside data source
+query TT
+explain select max(id), min(id) from agg_dyn_test where id < 10;
+----
+physical_plan
+01)AggregateExec: mode=Final, gby=[], aggr=[max(agg_dyn_test.id), min(agg_dyn_test.id)]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[max(agg_dyn_test.id), min(agg_dyn_test.id)]
+04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 < 10 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 < 10, required_guarantees=[]
+
+# Dynamic filter should not be available for grouping sets
+query TT
+explain select max(id) from agg_dyn_test where id < 10
+group by grouping sets ((), (id))
+----
+physical_plan
+01)ProjectionExec: expr=[max(agg_dyn_test.id)@2 as max(agg_dyn_test.id)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, __grouping_id@1 as __grouping_id], aggr=[max(agg_dyn_test.id)]
+03)----RepartitionExec: partitioning=Hash([id@0, __grouping_id@1], 2), input_partitions=2
+04)------AggregateExec: mode=Partial, gby=[(NULL as id), (id@0 as id)], aggr=[max(agg_dyn_test.id)]
+05)--------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 < 10, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 < 10, required_guarantees=[]
+
+statement ok
+drop table agg_dyn_test;
+
+statement ok
+drop table t1;
+
+statement ok
+drop table t2;
diff --git a/datafusion/sqllogictest/test_files/push_down_filter_unnest.slt b/datafusion/sqllogictest/test_files/push_down_filter_unnest.slt
new file mode 100644
index 0000000000000..58fe24e2e2ccd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/push_down_filter_unnest.slt
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test push down filter
+
+statement ok
+set datafusion.explain.physical_plan_only = true;
+
+statement ok
+CREATE TABLE IF NOT EXISTS v AS VALUES(1,[1,2,3]),(2,[3,4,5]);
+
+query I
+select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2;
+----
+3
+4
+5
+
+# test push down filter for unnest with filter on non-unnest column
+# filter plan is pushed down into projection plan
+query TT
+explain select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2;
+----
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2]
+02)--UnnestExec
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------ProjectionExec: expr=[column2@0 as __unnest_placeholder(v.column2)]
+05)--------FilterExec: column1@0 = 2, projection=[column2@1]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query I
+select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3;
+----
+4
+5
+
+# test push down filter for unnest with filter on unnest column
+query TT
+explain select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3;
+----
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2]
+02)--FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------UnnestExec
+05)--------ProjectionExec: expr=[column2@0 as __unnest_placeholder(v.column2)]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2;
+----
+4 2
+5 2
+
+# Could push the filter (column1 = 2) down below unnest
+query TT
+explain select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2;
+----
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2, column1@1 as column1]
+02)--FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3
+03)----UnnestExec
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
+06)----------FilterExec: column1@0 = 2
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2;
+----
+3 2
+4 2
+5 2
+
+# only non-unnest filter in AND clause could be pushed down
+query TT
+explain select uc2, column1 from  (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2;
+----
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(v.column2,depth=1)@0 as uc2, column1@1 as column1]
+02)--FilterExec: __unnest_placeholder(v.column2,depth=1)@0 > 3 OR column1@1 = 2
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------UnnestExec
+05)--------ProjectionExec: expr=[column2@1 as __unnest_placeholder(v.column2), column1@0 as column1]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+drop table v;
+
+# test with unnest struct, should not push down filter
+statement ok
+CREATE TABLE d AS VALUES(1,[named_struct('a', 1, 'b', 2)]),(2,[named_struct('a', 3, 'b', 4), named_struct('a', 5, 'b', 6)]);
+
+query I?
+select * from (select column1, unnest(column2) as o from d) where o['a'] = 1;
+----
+1 {a: 1, b: 2}
+
+query TT
+explain select * from (select column1, unnest(column2) as o from d) where o['a'] = 1;
+----
+physical_plan
+01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o]
+02)--FilterExec: __datafusion_extracted_1@0 = 1, projection=[column1@1, __unnest_placeholder(d.column2,depth=1)@2]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------ProjectionExec: expr=[get_field(__unnest_placeholder(d.column2,depth=1)@1, a) as __datafusion_extracted_1, column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as __unnest_placeholder(d.column2,depth=1)]
+05)--------UnnestExec
+06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)]
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+drop table d;
+
+statement ok
+CREATE TABLE d AS VALUES (named_struct('a', 1, 'b', 2)), (named_struct('a', 3, 'b', 4)), (named_struct('a', 5, 'b', 6));
+
+query II
+select * from (select unnest(column1) from d) where "__unnest_placeholder(d.column1).b" > 5;
+----
+5 6
+
+query TT
+explain select * from (select unnest(column1) from d) where "__unnest_placeholder(d.column1).b" > 5;
+----
+physical_plan
+01)FilterExec: __unnest_placeholder(d.column1).b@1 > 5
+02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----UnnestExec
+04)------ProjectionExec: expr=[column1@0 as __unnest_placeholder(d.column1)]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+drop table d;
diff --git a/datafusion/sqllogictest/test_files/pwmj.slt b/datafusion/sqllogictest/test_files/pwmj.slt
new file mode 100644
index 0000000000000..295eb94318ee5
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/pwmj.slt
@@ -0,0 +1,346 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+statement ok
+set datafusion.optimizer.enable_piecewise_merge_join = true;
+
+statement ok
+CREATE TABLE join_t1 (t1_id INT);
+
+statement ok
+CREATE TABLE join_t2 (t2_id INT, t2_name TEXT, t2_int INT);
+
+statement ok
+INSERT INTO join_t1 VALUES (11), (22), (33), (44);
+
+statement ok
+INSERT INTO join_t2 VALUES
+  (11, 'z', 3),
+  (22, 'y', 1),
+  (44, 'x', 3),
+  (55, 'w', 3);
+
+query II
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id > t2.t2_id          
+WHERE t1.t1_id > 10              
+  AND t2.t2_int > 1               
+ORDER BY 1;
+----
+22 11
+33 11
+44 11
+
+# Checking `SELECT *`
+query IITI
+SELECT *
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id > t2.t2_id          
+WHERE t1.t1_id > 10              
+  AND t2.t2_int > 1               
+ORDER BY 1;
+----
+22 11 z 3
+33 11 z 3
+44 11 z 3
+
+query TT
+EXPLAIN
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id > t2.t2_id          
+WHERE t1.t1_id > 10              
+  AND t2.t2_int > 1               
+ORDER BY 1;
+----
+logical_plan
+01)Sort: t1.t1_id ASC NULLS LAST
+02)--Inner Join:  Filter: t1.t1_id > t2.t2_id
+03)----SubqueryAlias: t1
+04)------Filter: join_t1.t1_id > Int32(10)
+05)--------TableScan: join_t1 projection=[t1_id]
+06)----SubqueryAlias: t2
+07)------Projection: join_t2.t2_id
+08)--------Filter: join_t2.t2_int > Int32(1)
+09)----------TableScan: join_t2 projection=[t2_id, t2_int]
+physical_plan
+01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST]
+02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----PiecewiseMergeJoin: operator=Gt, join_type=Inner, on=(t1_id > t2_id)
+04)------SortExec: expr=[t1_id@0 ASC], preserve_partitioning=[false]
+05)--------FilterExec: t1_id@0 > 10
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------FilterExec: t2_int@1 > 1, projection=[t2_id@0]
+09)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id >= t2.t2_id
+WHERE t1.t1_id >= 22
+  AND t2.t2_int = 3
+ORDER BY 1,2;
+----
+22 11
+33 11
+44 11
+44 44
+
+query TT
+EXPLAIN
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id >= t2.t2_id
+WHERE t1.t1_id >= 22
+  AND t2.t2_int = 3
+ORDER BY 1,2;
+----
+logical_plan
+01)Sort: t1.t1_id ASC NULLS LAST, t2.t2_id ASC NULLS LAST
+02)--Inner Join:  Filter: t1.t1_id >= t2.t2_id
+03)----SubqueryAlias: t1
+04)------Filter: join_t1.t1_id >= Int32(22)
+05)--------TableScan: join_t1 projection=[t1_id]
+06)----SubqueryAlias: t2
+07)------Projection: join_t2.t2_id
+08)--------Filter: join_t2.t2_int = Int32(3)
+09)----------TableScan: join_t2 projection=[t2_id, t2_int]
+physical_plan
+01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST, t2_id@1 ASC NULLS LAST]
+02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----PiecewiseMergeJoin: operator=GtEq, join_type=Inner, on=(t1_id >= t2_id)
+04)------SortExec: expr=[t1_id@0 ASC], preserve_partitioning=[false]
+05)--------FilterExec: t1_id@0 >= 22
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------FilterExec: t2_int@1 = 3, projection=[t2_id@0]
+09)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id < t2.t2_id
+WHERE t2.t2_int >= 3
+ORDER BY 1,2;
+----
+11 55
+11 44
+22 55
+22 44
+33 55
+33 44
+44 55
+
+query TT
+EXPLAIN
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id < t2.t2_id
+WHERE t2.t2_int >= 3
+ORDER BY 1,2;
+----
+logical_plan
+01)Sort: t1.t1_id ASC NULLS LAST, t2.t2_id ASC NULLS LAST
+02)--Inner Join:  Filter: t1.t1_id < t2.t2_id
+03)----SubqueryAlias: t1
+04)------TableScan: join_t1 projection=[t1_id]
+05)----SubqueryAlias: t2
+06)------Projection: join_t2.t2_id
+07)--------Filter: join_t2.t2_int >= Int32(3)
+08)----------TableScan: join_t2 projection=[t2_id, t2_int]
+physical_plan
+01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST, t2_id@1 ASC NULLS LAST]
+02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----PiecewiseMergeJoin: operator=Lt, join_type=Inner, on=(t1_id < t2_id)
+04)------SortExec: expr=[t1_id@0 DESC], preserve_partitioning=[false]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)--------FilterExec: t2_int@1 >= 3, projection=[t2_id@0]
+08)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+
+query II
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id < (t2.t2_id + 1)
+WHERE t2.t2_int >= 3
+ORDER BY 1,2;
+----
+11 11
+11 44
+11 55
+22 44
+22 55
+33 44
+33 55
+44 44
+44 55
+
+query TT
+EXPLAIN
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id < (t2.t2_id + 1)
+WHERE t2.t2_int >= 3
+ORDER BY 1,2;
+----
+logical_plan
+01)Sort: t1.t1_id ASC NULLS LAST, t2.t2_id ASC NULLS LAST
+02)--Inner Join:  Filter: CAST(t1.t1_id AS Int64) < CAST(t2.t2_id AS Int64) + Int64(1)
+03)----SubqueryAlias: t1
+04)------TableScan: join_t1 projection=[t1_id]
+05)----SubqueryAlias: t2
+06)------Projection: join_t2.t2_id
+07)--------Filter: join_t2.t2_int >= Int32(3)
+08)----------TableScan: join_t2 projection=[t2_id, t2_int]
+physical_plan
+01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST, t2_id@1 ASC NULLS LAST]
+02)--SortExec: expr=[t1_id@0 ASC NULLS LAST, t2_id@1 ASC NULLS LAST], preserve_partitioning=[true]
+03)----PiecewiseMergeJoin: operator=Lt, join_type=Inner, on=(CAST(t1_id AS Int64) < CAST(t2_id AS Int64) + 1)
+04)------SortExec: expr=[CAST(t1_id@0 AS Int64) DESC], preserve_partitioning=[false]
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)--------FilterExec: t2_int@1 >= 3, projection=[t2_id@0]
+08)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id <= t2.t2_id
+WHERE t1.t1_id IN (11, 44)
+  AND t2.t2_name <> 'y'
+ORDER BY 1,2;
+----
+11 55
+11 44
+11 11
+44 55
+44 44
+
+query TT
+EXPLAIN
+SELECT t1.t1_id, t2.t2_id
+FROM join_t1 t1
+JOIN join_t2 t2
+  ON t1.t1_id <= t2.t2_id
+WHERE t1.t1_id IN (11, 44)
+  AND t2.t2_name <> 'y'
+ORDER BY 1,2;
+----
+logical_plan
+01)Sort: t1.t1_id ASC NULLS LAST, t2.t2_id ASC NULLS LAST
+02)--Inner Join:  Filter: t1.t1_id <= t2.t2_id
+03)----SubqueryAlias: t1
+04)------Filter: join_t1.t1_id = Int32(11) OR join_t1.t1_id = Int32(44)
+05)--------TableScan: join_t1 projection=[t1_id]
+06)----SubqueryAlias: t2
+07)------Projection: join_t2.t2_id
+08)--------Filter: join_t2.t2_name != Utf8View("y")
+09)----------TableScan: join_t2 projection=[t2_id, t2_name]
+physical_plan
+01)SortPreservingMergeExec: [t1_id@0 ASC NULLS LAST, t2_id@1 ASC NULLS LAST]
+02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----PiecewiseMergeJoin: operator=LtEq, join_type=Inner, on=(t1_id <= t2_id)
+04)------SortExec: expr=[t1_id@0 DESC], preserve_partitioning=[false]
+05)--------FilterExec: t1_id@0 = 11 OR t1_id@0 = 44
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------FilterExec: t2_name@1 != y, projection=[t2_id@0]
+09)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+CREATE TABLE null_join_t1 (id INT);
+
+statement ok
+CREATE TABLE null_join_t2 (id INT);
+
+statement ok
+INSERT INTO null_join_t1 VALUES (1), (2), (NULL);
+
+statement ok
+INSERT INTO null_join_t2 VALUES (1), (NULL), (3);
+
+query II
+SELECT t1.id AS left_id, t2.id AS right_id
+FROM null_join_t1 t1
+JOIN null_join_t2 t2
+  ON t1.id > t2.id
+ORDER BY 1,2;
+----
+2 1
+
+# Verify this will offload this query to Nested Loop Join
+query II
+SELECT t1.id AS left_id, t2.id AS right_id
+FROM null_join_t1 t1
+JOIN null_join_t2 t2
+  ON t1.id < (t1.id + t2.id)
+ORDER BY 1,2;
+----
+1 1
+1 3
+2 1
+2 3
+
+query TT
+EXPLAIN
+SELECT t1.id AS left_id, t2.id AS right_id
+FROM null_join_t1 t1
+JOIN null_join_t2 t2
+  ON t1.id < (t1.id + t2.id)
+ORDER BY 1,2;
+----
+logical_plan
+01)Sort: left_id ASC NULLS LAST, right_id ASC NULLS LAST
+02)--Projection: t1.id AS left_id, t2.id AS right_id
+03)----Inner Join:  Filter: t1.id < t1.id + t2.id
+04)------SubqueryAlias: t1
+05)--------TableScan: null_join_t1 projection=[id]
+06)------SubqueryAlias: t2
+07)--------TableScan: null_join_t2 projection=[id]
+physical_plan
+01)SortExec: expr=[left_id@0 ASC NULLS LAST, right_id@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@0 as left_id, id@1 as right_id]
+03)----NestedLoopJoinExec: join_type=Inner, filter=id@0 < id@0 + id@1
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+query II
+SELECT t1.id AS left_id, t2.id AS right_id
+FROM null_join_t1 t1
+JOIN null_join_t2 t2
+  ON t1.id < t2.id
+ORDER BY 1,2;
+----
+1 3 
+2 3 
+
+statement ok
+set datafusion.optimizer.enable_piecewise_merge_join = false;
diff --git a/datafusion/sqllogictest/test_files/qualify.slt b/datafusion/sqllogictest/test_files/qualify.slt
new file mode 100644
index 0000000000000..ce58e3998cf57
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/qualify.slt
@@ -0,0 +1,363 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## QUALIFY Clause Tests
+##########
+
+# Create test data
+statement ok
+CREATE TABLE users (
+  id INT,
+  name VARCHAR,
+  age INT,
+  salary DECIMAL(10,2),
+  dept VARCHAR
+) AS VALUES
+(1, 'Alice', 25, 50000.00, 'Engineering'),
+(2, 'Bob', 30, 60000.00, 'Engineering'),
+(3, 'Charlie', 25, 55000.00, 'Engineering'),
+(4, 'Diana', 35, 70000.00, 'Marketing'),
+(5, 'Eve', 30, 65000.00, 'Marketing'),
+(6, 'Frank', 25, 52000.00, 'Engineering'),
+(7, 'Grace', 35, 75000.00, 'Marketing'),
+(8, 'Henry', 30, 62000.00, 'Engineering');
+
+# Basic QUALIFY with ROW_NUMBER
+query ITI
+SELECT id, name, ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) as rn 
+FROM users 
+QUALIFY rn = 1
+ORDER BY dept, id;
+----
+8 Henry 1
+7 Grace 1
+
+# QUALIFY with RANK
+query ITI
+SELECT id, name, RANK() OVER (ORDER BY salary DESC) as rank 
+FROM users 
+QUALIFY rank <= 3
+ORDER BY rank, id;
+----
+7 Grace 1
+4 Diana 2
+5 Eve 3
+
+# QUALIFY with DENSE_RANK
+query ITI
+SELECT id, name, DENSE_RANK() OVER (PARTITION BY dept ORDER BY age) as dense_rank 
+FROM users 
+QUALIFY dense_rank <= 2
+ORDER BY dept, dense_rank, id;
+----
+1 Alice 1
+3 Charlie 1
+6 Frank 1
+2 Bob 2
+8 Henry 2
+5 Eve 1
+4 Diana 2
+7 Grace 2
+
+# QUALIFY with complex condition
+query ITII
+SELECT id, name, ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) as rn,
+       RANK() OVER (ORDER BY age) as age_rank
+FROM users 
+QUALIFY rn <= 2 AND age_rank <= 5
+ORDER BY dept, rn, id;
+----
+8 Henry 1 4
+2 Bob 2 4
+
+# QUALIFY with LAG function
+query ITRR
+SELECT id, name, salary, LAG(salary) OVER (PARTITION BY dept ORDER BY id) as prev_salary
+FROM users 
+QUALIFY prev_salary IS NOT NULL AND salary > prev_salary
+ORDER BY dept, id;
+----
+2 Bob 60000 50000
+8 Henry 62000 52000
+7 Grace 75000 65000
+
+# QUALIFY with LEAD function
+query ITRR
+SELECT id, name, salary, LEAD(salary) OVER (PARTITION BY dept ORDER BY id) as next_salary
+FROM users 
+QUALIFY next_salary IS NOT NULL AND salary < next_salary
+ORDER BY dept, id;
+----
+1 Alice 50000 60000
+6 Frank 52000 62000
+5 Eve 65000 75000
+
+# QUALIFY with NTILE
+query ITI
+SELECT id, name, NTILE(3) OVER (PARTITION BY dept ORDER BY salary DESC) as tile
+FROM users 
+QUALIFY tile = 1
+ORDER BY dept, id;
+----
+2 Bob 1
+8 Henry 1
+7 Grace 1
+
+# QUALIFY with PERCENT_RANK
+query ITR
+SELECT id, name, PERCENT_RANK() OVER (PARTITION BY dept ORDER BY salary) as pct_rank
+FROM users 
+QUALIFY pct_rank >= 0.5
+ORDER BY dept, pct_rank, id;
+----
+3 Charlie 0.5
+2 Bob 0.75
+8 Henry 1
+4 Diana 0.5
+7 Grace 1
+
+# QUALIFY with CUME_DIST
+query ITR
+SELECT id, name, CUME_DIST() OVER (PARTITION BY dept ORDER BY age) as cume_dist
+FROM users 
+QUALIFY cume_dist >= 0.75
+ORDER BY dept, cume_dist, id;
+----
+2 Bob 1
+8 Henry 1
+4 Diana 1
+7 Grace 1
+
+# QUALIFY with multiple window functions
+query ITIII
+SELECT id, name, 
+       ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) as rn,
+       RANK() OVER (ORDER BY age) as age_rank,
+       DENSE_RANK() OVER (PARTITION BY dept ORDER BY age) as dept_age_rank
+FROM users 
+QUALIFY rn <= 2 AND age_rank <= 4 AND dept_age_rank <= 2
+ORDER BY dept, rn, id;
+----
+8 Henry 1 4 2
+2 Bob 2 4 2
+
+# QUALIFY with arithmetic expressions
+query ITRI
+SELECT id, name, salary, 
+       ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) as rn
+FROM users 
+QUALIFY rn = 1 AND salary > 60000
+ORDER BY dept, id;
+----
+8 Henry 62000 1
+7 Grace 75000 1
+
+# QUALIFY with string functions
+query ITI
+SELECT id, name, 
+       ROW_NUMBER() OVER (PARTITION BY dept ORDER BY name) as rn
+FROM users 
+QUALIFY rn = 1
+ORDER BY dept, id;
+----
+1 Alice 1
+4 Diana 1
+
+# window function with aggregate function
+query ITI
+SELECT id, name, COUNT(*) OVER (PARTITION BY dept) as cnt
+FROM users 
+QUALIFY cnt > 4
+ORDER BY dept, id;
+----
+1 Alice 5
+2 Bob 5
+3 Charlie 5
+6 Frank 5
+8 Henry 5
+
+# QUALIFY with HAVING
+query TR
+SELECT dept, AVG(salary) OVER (PARTITION BY dept) as r
+FROM users
+WHERE salary > 5000
+GROUP BY dept, salary
+HAVING SUM(salary) > 20000
+QUALIFY r > 60000 
+----
+Marketing 70000
+Marketing 70000
+Marketing 70000
+
+# QUALIFY with aggregate function reference from projection
+query TR
+SELECT dept, SUM(salary) AS s
+FROM users
+GROUP BY dept
+QUALIFY RANK() OVER (ORDER BY dept DESC) = 1 AND s > 1000
+ORDER BY dept;
+----
+Marketing 210000
+
+# QUALIFY with aggregate function
+query T
+SELECT dept
+FROM users
+GROUP BY dept
+QUALIFY RANK() OVER (ORDER BY dept DESC) = 1 AND SUM(salary) > 1000
+ORDER BY dept;
+----
+Marketing
+
+# QUALIFY with aggregate function within window function
+query TR
+SELECT dept, SUM(salary) AS s
+FROM users
+GROUP BY dept
+QUALIFY RANK() OVER (ORDER BY SUM(salary) DESC) = 1
+ORDER BY dept;
+----
+Engineering 279000
+
+# QUALIFY with aggregate function reference from projection within window function
+query TR
+SELECT dept, SUM(salary) AS s
+FROM users
+GROUP BY dept
+QUALIFY RANK() OVER (ORDER BY s DESC) = 1
+ORDER BY dept;
+----
+Engineering 279000
+
+# Error: QUALIFY without window functions
+query error
+SELECT id, name FROM users QUALIFY id > 1;
+
+# Window function in QUALIFY
+query IT
+SELECT id, name FROM users QUALIFY COUNT(*) OVER () > 1 ORDER BY id;
+----
+1 Alice
+2 Bob
+3 Charlie
+4 Diana
+5 Eve
+6 Frank
+7 Grace
+8 Henry
+
+# verify the logical plan and physical plan
+query TT
+EXPLAIN SELECT id, name FROM users QUALIFY COUNT(*) OVER () > 1 ORDER BY id;
+----
+logical_plan
+01)Sort: users.id ASC NULLS LAST
+02)--Projection: users.id, users.name
+03)----Filter: count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING > Int64(1)
+04)------WindowAggr: windowExpr=[[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+05)--------TableScan: users projection=[id, name]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--FilterExec: count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 > 1, projection=[id@0, name@1]
+03)----WindowAggExec: wdw=[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# plan row_number()
+query TT
+explain select row_number() over (PARTITION BY dept) as rk from users qualify rk > 1;
+----
+logical_plan
+01)Projection: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rk
+02)--Filter: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING > UInt64(1)
+03)----Projection: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+04)------WindowAggr: windowExpr=[[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+05)--------TableScan: users projection=[dept]
+physical_plan
+01)ProjectionExec: expr=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@0 as rk]
+02)--FilterExec: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@0 > 1
+03)----ProjectionExec: expr=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
+04)------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+05)--------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[false]
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# plan with window function and group by
+query TT
+EXPLAIN SELECT dept, AVG(salary) OVER (PARTITION BY dept) as r
+FROM users
+WHERE salary > 5000
+GROUP BY dept, salary
+HAVING SUM(salary) > 20000
+QUALIFY r > 60000
+----
+logical_plan
+01)Projection: users.dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS r
+02)--Filter: avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING > Decimal128(Some(60000000000),14,6)
+03)----Projection: users.dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+04)------WindowAggr: windowExpr=[[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+05)--------Projection: users.dept, users.salary
+06)----------Filter: sum(users.salary) > Decimal128(Some(2000000),20,2)
+07)------------Aggregate: groupBy=[[users.dept, users.salary]], aggr=[[sum(users.salary)]]
+08)--------------Filter: users.salary > Decimal128(Some(500000),10,2)
+09)----------------TableScan: users projection=[salary, dept]
+physical_plan
+01)ProjectionExec: expr=[dept@0 as dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as r]
+02)--FilterExec: avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 > Some(60000000000),14,6
+03)----ProjectionExec: expr=[dept@0 as dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
+04)------WindowAggExec: wdw=[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Decimal128(14, 6), nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+05)--------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------RepartitionExec: partitioning=Hash([dept@0], 4), input_partitions=4
+07)------------FilterExec: sum(users.salary)@2 > Some(2000000),20,2, projection=[dept@0, salary@1]
+08)--------------AggregateExec: mode=FinalPartitioned, gby=[dept@0 as dept, salary@1 as salary], aggr=[sum(users.salary)]
+09)----------------RepartitionExec: partitioning=Hash([dept@0, salary@1], 4), input_partitions=4
+10)------------------AggregateExec: mode=Partial, gby=[dept@1 as dept, salary@0 as salary], aggr=[sum(users.salary)]
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------------FilterExec: salary@0 > Some(500000),10,2
+13)------------------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# plan with aggregate function
+query TT
+EXPLAIN SELECT dept, SUM(salary) AS s
+FROM users
+GROUP BY dept
+QUALIFY RANK() OVER (ORDER BY s DESC) = 1
+ORDER BY dept;
+----
+logical_plan
+01)Sort: users.dept ASC NULLS LAST
+02)--Projection: users.dept, sum(users.salary) AS s
+03)----Filter: rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW = UInt64(1)
+04)------WindowAggr: windowExpr=[[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+05)--------Aggregate: groupBy=[[users.dept]], aggr=[[sum(users.salary)]]
+06)----------TableScan: users projection=[salary, dept]
+physical_plan
+01)SortPreservingMergeExec: [dept@0 ASC NULLS LAST]
+02)--SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[dept@0 as dept, sum(users.salary)@1 as s]
+04)------FilterExec: rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 = 1, projection=[dept@0, sum(users.salary)@1]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+06)----------BoundedWindowAggExec: wdw=[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+07)------------SortPreservingMergeExec: [sum(users.salary)@1 DESC]
+08)--------------SortExec: expr=[sum(users.salary)@1 DESC], preserve_partitioning=[true]
+09)----------------AggregateExec: mode=FinalPartitioned, gby=[dept@0 as dept], aggr=[sum(users.salary)]
+10)------------------RepartitionExec: partitioning=Hash([dept@0], 4), input_partitions=1
+11)--------------------AggregateExec: mode=Partial, gby=[dept@1 as dept], aggr=[sum(users.salary)]
+12)----------------------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Clean up
+statement ok
+DROP TABLE users; 
diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_instr.slt b/datafusion/sqllogictest/test_files/regexp/regexp_instr.slt
new file mode 100644
index 0000000000000..d4e98e6431678
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_instr.slt
@@ -0,0 +1,196 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Import common test data
+include ./init_data.slt.part
+
+query I
+SELECT regexp_instr('123123123123123', '(12)3');
+----
+1
+
+query I
+SELECT regexp_instr('123123123123', '123', 1);
+----
+1
+
+query I
+SELECT regexp_instr('123123123123', '123', 3);
+----
+4
+
+query I
+SELECT regexp_instr('123123123123', '123', 33);
+----
+0
+
+query I
+SELECT regexp_instr('ABCABCABCABC', 'Abc', 1, 2, '');
+----
+0
+
+query I
+SELECT regexp_instr('ABCABCABCABC', 'Abc', 1, 2, 'i');
+----
+4
+
+query I
+SELECT
+    regexp_instr(
+        'The quick brown fox jumps over the lazy dog.',
+        ' (quick) (brown) (fox)',
+        1,
+        1,
+        'i',
+        2   -- subexpression_number (2 for second group)
+    );
+----
+11
+
+statement error
+External error: query failed: DataFusion error: Arrow error: Compute error: regexp_instr() requires start to be 1 based
+SELECT regexp_instr('123123123123', '123', 0);
+
+statement error
+External error: query failed: DataFusion error: Arrow error: Compute error: regexp_instr() requires start to be 1 based
+SELECT regexp_instr('123123123123', '123', -3);
+
+query I
+SELECT regexp_instr(str, pattern) FROM regexp_test_data;
+----
+NULL
+1
+1
+0
+0
+0
+0
+1
+1
+1
+1
+1
+
+query I
+SELECT regexp_instr(str, pattern, start) FROM regexp_test_data;
+----
+NULL
+1
+1
+0
+0
+0
+0
+0
+3
+4
+1
+2
+
+
+statement ok
+CREATE TABLE t_stringview AS
+SELECT
+  arrow_cast(str, 'Utf8View') AS str,
+  arrow_cast(pattern, 'Utf8View') AS pattern,
+  arrow_cast(start, 'Int64') AS start
+FROM regexp_test_data;
+
+query I
+SELECT regexp_instr(str, pattern, start) FROM t_stringview;
+----
+NULL
+1
+1
+0
+0
+0
+0
+0
+3
+4
+1
+2
+
+query I
+SELECT regexp_instr(
+  arrow_cast(str, 'Utf8'),
+  arrow_cast(pattern, 'LargeUtf8'),
+  arrow_cast(start, 'Int32')
+) FROM t_stringview;
+----
+NULL
+1
+1
+0
+0
+0
+0
+0
+3
+4
+1
+2
+
+query I
+SELECT regexp_instr(NULL, NULL);
+----
+NULL
+
+query I
+SELECT regexp_instr(NULL, 'a');
+----
+NULL
+
+query I
+SELECT regexp_instr('a', NULL);
+----
+NULL
+
+query I
+SELECT regexp_instr('😀abcdef', 'abc');
+----
+2
+
+
+statement ok
+CREATE TABLE empty_table (str varchar, pattern varchar, start int);
+
+query I
+SELECT regexp_instr(str, pattern, start) FROM empty_table;
+----
+
+statement ok
+INSERT INTO empty_table VALUES
+  ('a', NULL, 1),
+  (NULL, 'a', 1),
+  (NULL, NULL, 1),
+  (NULL, NULL, NULL);
+
+query I
+SELECT regexp_instr(str, pattern, start) FROM empty_table;
+----
+NULL
+NULL
+NULL
+NULL
+
+statement ok
+DROP TABLE t_stringview;
+
+statement ok
+DROP TABLE empty_table;
diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
index 8c407ea2e7608..2b304c8de1a3c 100644
--- a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
@@ -169,8 +169,7 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
 true
 
 statement error
-Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata
-: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: false, metadata: {} })
+Error during planning: Cannot infer common argument type for regex operation List(Field { name: "item", data_type: Int64, nullable: true, metadata: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, metadata: {} })
 select [1,2] ~ [3];
 
 query B
@@ -252,9 +251,8 @@ logical_plan
 01)Filter: dict_table.column1 LIKE Utf8("%oo%")
 02)--TableScan: dict_table projection=[column1]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 LIKE %oo%
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: column1@0 LIKE %oo%
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Ensure casting / coercion works for all operators
 # (there should be no casts to Utf8)
@@ -278,3 +276,68 @@ drop table strings
 
 statement ok
 drop table dict_table
+
+# Ensure that regexp_like is rewritten to use the (more optimized) regex operators
+statement ok
+create table regexp_test as values
+  ('foobar', 'i'),
+  ('Foo', 'i'),
+  ('bar', 'mi') ;
+
+# Expressions that can be rewritten to use the ~ operator (which is more optimized)
+# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function)
+query TT
+explain select
+  regexp_like(column1, 'fo.*'),
+  regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~* Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))
+02)--TableScan: regexp_test projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[column1@0 ~ fo.* as regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+  regexp_like(column1, 'fo.*'),
+  regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+true true
+false true
+false false
+
+# Expressions that can not be rewritten to use the ~ / ~* operators
+# (expect the plans to use the REGEXP_LIKE function)
+query TT
+explain select
+  regexp_like(column1, 'f.*r', 'mi'), -- args
+  regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")), regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2)
+02)--TableScan: regexp_test projection=[column1, column2]
+physical_plan
+01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")), regexp_like(column1@0, f.*r, column2@1) as regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+  regexp_like(column1, 'f.*r', 'mi'), -- args
+  regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+true true
+false false
+false false
+
+query TT
+select * from regexp_test where regexp_like('f', regexp_replace((('v\r') like ('f_*sP6H1*')), '339629555', '-1459539013'));
+----
+
+
+statement ok
+drop table if exists dict_table;
diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt
index 70666346e2cab..54e445f212422 100644
--- a/datafusion/sqllogictest/test_files/repartition.slt
+++ b/datafusion/sqllogictest/test_files/repartition.slt
@@ -44,11 +44,9 @@ logical_plan
 02)--TableScan: parquet_table projection=[column1, column2]
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4
-04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
+02)--RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=1
+03)----AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
 
 # disable round robin repartitioning
 statement ok
@@ -62,10 +60,9 @@ logical_plan
 02)--TableScan: parquet_table projection=[column1, column2]
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=1
-04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
+02)--RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=1
+03)----AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
 
 
 # Cleanup
@@ -122,10 +119,9 @@ logical_plan
 03)----TableScan: sink_table projection=[c1, c2, c3]
 physical_plan
 01)CoalescePartitionsExec: fetch=5
-02)--CoalesceBatchesExec: target_batch_size=8192, fetch=5
-03)----FilterExec: c3@2 > 0
-04)------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1
-05)--------StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+02)--FilterExec: c3@2 > 0, fetch=5
+03)----RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1
+04)------StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
 
 # Start repratition on empty column test.
 # See https://github.com/apache/datafusion/issues/12057
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt
index 2b30de572c8cc..c9c2f91257081 100644
--- a/datafusion/sqllogictest/test_files/repartition_scan.slt
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -27,6 +27,10 @@ set datafusion.execution.target_partitions = 4;
 statement ok
 set datafusion.optimizer.repartition_file_min_size = 1;
 
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
 ###################
 ### Parquet tests
 ###################
@@ -59,9 +63,8 @@ logical_plan
 01)Filter: parquet_table.column1 != Int32(42)
 02)--TableScan: parquet_table projection=[column1], partial_filters=[parquet_table.column1 != Int32(42)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 != 42
-03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..137], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:137..274], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:274..411], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:411..547]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
+01)FilterExec: column1@0 != 42
+02)--DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
 
 # disable round robin repartitioning
 statement ok
@@ -75,9 +78,8 @@ logical_plan
 01)Filter: parquet_table.column1 != Int32(42)
 02)--TableScan: parquet_table projection=[column1], partial_filters=[parquet_table.column1 != Int32(42)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 != 42
-03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..137], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:137..274], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:274..411], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:411..547]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
+01)FilterExec: column1@0 != 42
+02)--DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
 
 # enable round robin repartitioning again
 statement ok
@@ -100,9 +102,8 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST]
 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------FilterExec: column1@0 != 42
-05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..272], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:272..538, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..278], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:278..547]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
+03)----FilterExec: column1@0 != 42
+04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..266], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:266..526, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..272], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:272..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
 
 
 ## Read the files as though they are ordered
@@ -136,9 +137,8 @@ logical_plan
 03)----TableScan: parquet_table_with_order projection=[column1], partial_filters=[parquet_table_with_order.column1 != Int32(42)]
 physical_plan
 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: column1@0 != 42
-04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..269], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..273], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:273..547], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:269..538]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
+02)--FilterExec: column1@0 != 42
+03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..263], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..268], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:268..537], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:263..526]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)]
 
 # Cleanup
 statement ok
@@ -183,9 +183,8 @@ logical_plan
 01)Filter: csv_table.column1 != Int32(42)
 02)--TableScan: csv_table projection=[column1], partial_filters=[csv_table.column1 != Int32(42)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 != 42
-03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:0..5], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:5..10], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:10..15], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:15..18]]}, projection=[column1], file_type=csv, has_header=true
+01)FilterExec: column1@0 != 42
+02)--DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:0..5], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:5..10], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:10..15], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:15..18]]}, projection=[column1], file_type=csv, has_header=true
 
 # Cleanup
 statement ok
@@ -226,9 +225,8 @@ logical_plan
 01)Filter: json_table.column1 != Int32(42)
 02)--TableScan: json_table projection=[column1], partial_filters=[json_table.column1 != Int32(42)]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: column1@0 != 42
-03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:0..18], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:18..36], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:36..54], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:54..70]]}, projection=[column1], file_type=json
+01)FilterExec: column1@0 != 42
+02)--DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:0..18], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:18..36], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:36..54], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json:54..70]]}, projection=[column1], file_type=json
 
 # Cleanup
 statement ok
@@ -244,7 +242,7 @@ DROP TABLE json_table;
 statement ok
 CREATE EXTERNAL TABLE arrow_table
 STORED AS ARROW
-LOCATION '../core/tests/data/example.arrow';
+LOCATION '../datasource-arrow/tests/data/example.arrow';
 
 
 # It would be great to see the file read as "4" groups with even sizes (offsets) eventually
@@ -253,7 +251,7 @@ query TT
 EXPLAIN SELECT * FROM arrow_table
 ----
 logical_plan TableScan: arrow_table projection=[f0, f1, f2]
-physical_plan DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:0..461], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:461..922], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:922..1383], [WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow:1383..1842]]}, projection=[f0, f1, f2], file_type=arrow
+physical_plan DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:0..461], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:461..922], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:922..1383], [WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example.arrow:1383..1842]]}, projection=[f0, f1, f2], file_type=arrow
 
 # correct content
 query ITB
diff --git a/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt b/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt
new file mode 100644
index 0000000000000..e2c9fa4237939
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/repartition_subset_satisfaction.slt
@@ -0,0 +1,526 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+# Tests for Subset Partitioning Optimization
+#
+# Subset partitioning allows Hash([a]) to satisfy Hash([a, b]) requirements
+# when the required partitioning expressions are a strict subset of the
+# current partitioning expressions.
+##########
+
+##########
+# SETUP: Configuration and Data Generation
+##########
+
+statement ok
+set datafusion.optimizer.enable_round_robin_repartition = false;
+
+statement ok
+set datafusion.execution.target_partitions = 3;
+
+statement ok
+set datafusion.optimizer.preserve_file_partitions = 1;
+
+# Create fact table partitioned by f_dkey (3 partitions)
+# Each partition has data sorted by timestamp
+# Partition: f_dkey=A
+statement ok
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 95.5),
+    (TIMESTAMP '2023-01-01T09:00:10', 102.3),
+    (TIMESTAMP '2023-01-01T09:00:20', 98.7),
+    (TIMESTAMP '2023-01-01T09:12:20', 105.1),
+    (TIMESTAMP '2023-01-01T09:12:30', 100.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 150.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 120.8)
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet'
+STORED AS PARQUET;
+
+# Partition: f_dkey=B
+statement ok
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 75.2),
+    (TIMESTAMP '2023-01-01T09:00:10', 82.4),
+    (TIMESTAMP '2023-01-01T09:00:20', 78.9),
+    (TIMESTAMP '2023-01-01T09:00:30', 85.6),
+    (TIMESTAMP '2023-01-01T09:12:30', 80.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 120.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 92.3)
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet'
+STORED AS PARQUET;
+
+# Partition: f_dkey=C
+statement ok
+COPY (SELECT column1 as timestamp, column2 as value FROM (VALUES
+    (TIMESTAMP '2023-01-01T09:00:00', 300.5),
+    (TIMESTAMP '2023-01-01T09:00:10', 285.7),
+    (TIMESTAMP '2023-01-01T09:00:20', 310.2),
+    (TIMESTAMP '2023-01-01T09:00:30', 295.8),
+    (TIMESTAMP '2023-01-01T09:00:40', 300.0),
+    (TIMESTAMP '2023-01-01T09:12:40', 250.0),
+    (TIMESTAMP '2023-01-01T09:12:50', 275.4)
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet'
+STORED AS PARQUET;
+
+# Create dimension table partitioned by d_dkey (4 partitions)
+query I
+COPY (SELECT column1 as env, column2 as service, column3 as host FROM (VALUES
+    ('dev', 'log', 'ma')
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=A/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as env, column2 as service, column3 as host FROM (VALUES
+    ('prod', 'log', 'ma')
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=B/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as env, column2 as service, column3 as host FROM (VALUES
+    ('prod', 'log', 'vim')
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=C/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+query I
+COPY (SELECT column1 as env, column2 as service, column3 as host FROM (VALUES
+    ('prod', 'trace', 'vim')
+))
+TO 'test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=D/data.parquet'
+STORED AS PARQUET;
+----
+1
+
+##########
+# TABLE DECLARATIONS
+##########
+
+# Fact table with ordering
+statement ok
+CREATE EXTERNAL TABLE fact_table_ordered (timestamp TIMESTAMP, value DOUBLE)
+STORED AS PARQUET
+PARTITIONED BY (f_dkey STRING)
+WITH ORDER (f_dkey ASC, timestamp ASC)
+LOCATION 'test_files/scratch/repartition_subset_satisfaction/fact/';
+
+# Dimension table (for join tests)
+statement ok
+CREATE EXTERNAL TABLE dimension_table (env STRING, service STRING, host STRING)
+STORED AS PARQUET
+PARTITIONED BY (d_dkey STRING)
+LOCATION 'test_files/scratch/repartition_subset_satisfaction/dimension/';
+
+##########
+# TEST 1: Basic Aggregate with Subset Partitioning
+# Demonstrates that GROUP BY [f_dkey, time_bin] can use
+# file partitioning on just [f_dkey]
+##########
+
+# With subset repartitioning forced (disables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 4;
+
+query TT
+EXPLAIN SELECT f_dkey, date_bin(INTERVAL '30 seconds', timestamp) as time_bin,
+       COUNT(*), AVG(value)
+FROM fact_table_ordered
+GROUP BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+ORDER BY f_dkey, time_bin;
+----
+logical_plan
+01)Sort: fact_table_ordered.f_dkey ASC NULLS LAST, time_bin ASC NULLS LAST
+02)--Projection: fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp) AS time_bin, count(Int64(1)) AS count(*), avg(fact_table_ordered.value)
+03)----Aggregate: groupBy=[[fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), fact_table_ordered.timestamp)]], aggr=[[count(Int64(1)), avg(fact_table_ordered.value)]]
+04)------TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)@1 as time_bin, count(Int64(1))@2 as count(*), avg(fact_table_ordered.value)@3 as avg(fact_table_ordered.value)]
+03)----AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)@1 as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+04)------SortExec: expr=[f_dkey@0 ASC NULLS LAST, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)@1 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([f_dkey@0, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)@1], 3), input_partitions=3
+06)----------AggregateExec: mode=Partial, gby=[f_dkey@2 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@0) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+07)------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+# Verify results without subset satisfaction
+query TPIR rowsort
+SELECT f_dkey, date_bin(INTERVAL '30 seconds', timestamp) as time_bin,
+       COUNT(*), AVG(value)
+FROM fact_table_ordered
+GROUP BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+ORDER BY f_dkey, time_bin;
+----
+A 2023-01-01T09:00:00 3 98.833333333333
+A 2023-01-01T09:12:00 1 105.1
+A 2023-01-01T09:12:30 3 123.6
+B 2023-01-01T09:00:00 3 78.833333333333
+B 2023-01-01T09:00:30 1 85.6
+B 2023-01-01T09:12:30 3 97.433333333333
+C 2023-01-01T09:00:00 3 298.8
+C 2023-01-01T09:00:30 2 297.9
+C 2023-01-01T09:12:30 2 262.7
+
+# With subset logic enabled (default - enables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 1;
+
+query TT
+EXPLAIN SELECT f_dkey, date_bin(INTERVAL '30 seconds', timestamp) as time_bin,
+       COUNT(*), AVG(value)
+FROM fact_table_ordered
+GROUP BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+ORDER BY f_dkey, time_bin;
+----
+logical_plan
+01)Sort: fact_table_ordered.f_dkey ASC NULLS LAST, time_bin ASC NULLS LAST
+02)--Projection: fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp) AS time_bin, count(Int64(1)) AS count(*), avg(fact_table_ordered.value)
+03)----Aggregate: groupBy=[[fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), fact_table_ordered.timestamp)]], aggr=[[count(Int64(1)), avg(fact_table_ordered.value)]]
+04)------TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST]
+02)--ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)@1 as time_bin, count(Int64(1))@2 as count(*), avg(fact_table_ordered.value)@3 as avg(fact_table_ordered.value)]
+03)----AggregateExec: mode=SinglePartitioned, gby=[f_dkey@2 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@0) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)], aggr=[count(Int64(1)), avg(fact_table_ordered.value)], ordering_mode=Sorted
+04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+# Verify results match with subset satisfaction
+query TPIR rowsort
+SELECT f_dkey, date_bin(INTERVAL '30 seconds', timestamp) as time_bin,
+       COUNT(*), AVG(value)
+FROM fact_table_ordered
+GROUP BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+ORDER BY f_dkey, time_bin;
+----
+A 2023-01-01T09:00:00 3 98.833333333333
+A 2023-01-01T09:12:00 1 105.1
+A 2023-01-01T09:12:30 3 123.6
+B 2023-01-01T09:00:00 3 78.833333333333
+B 2023-01-01T09:00:30 1 85.6
+B 2023-01-01T09:12:30 3 97.433333333333
+C 2023-01-01T09:00:00 3 298.8
+C 2023-01-01T09:00:30 2 297.9
+C 2023-01-01T09:12:30 2 262.7
+
+##########
+# TEST 2: Window Functions with Subset Partitioning
+# Demonstrates that PARTITION BY [f_dkey, time_bin] can use
+# file partitioning on just [f_dkey]
+##########
+
+# With subset repartitioning forced (disables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 4;
+
+query TT
+EXPLAIN SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (
+           PARTITION BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+           ORDER BY timestamp
+       ) as rn
+FROM fact_table_ordered;
+----
+logical_plan
+01)Projection: fact_table_ordered.f_dkey, fact_table_ordered.timestamp, fact_table_ordered.value, row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+02)--WindowAggr: windowExpr=[[row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+03)----TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@2 as f_dkey, timestamp@0 as timestamp, value@1 as value, row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rn]
+02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }\"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----SortExec: expr=[f_dkey@2 ASC NULLS LAST, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@0) ASC NULLS LAST, timestamp@0 ASC NULLS LAST], preserve_partitioning=[true]
+04)------RepartitionExec: partitioning=Hash([f_dkey@2, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@0)], 3), input_partitions=3
+05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+# Verify results without subset satisfaction
+query TPRI rowsort
+SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (
+           PARTITION BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+           ORDER BY timestamp
+       ) as rn
+FROM fact_table_ordered
+WHERE timestamp < TIMESTAMP '2023-01-01T09:00:30';
+----
+A 2023-01-01T09:00:00 95.5 1
+A 2023-01-01T09:00:10 102.3 2
+A 2023-01-01T09:00:20 98.7 3
+B 2023-01-01T09:00:00 75.2 1
+B 2023-01-01T09:00:10 82.4 2
+B 2023-01-01T09:00:20 78.9 3
+C 2023-01-01T09:00:00 300.5 1
+C 2023-01-01T09:00:10 285.7 2
+C 2023-01-01T09:00:20 310.2 3
+
+# With subset logic enabled (default - enables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 1;
+
+query TT
+EXPLAIN SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (
+           PARTITION BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+           ORDER BY timestamp
+       ) as rn
+FROM fact_table_ordered;
+----
+logical_plan
+01)Projection: fact_table_ordered.f_dkey, fact_table_ordered.timestamp, fact_table_ordered.value, row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn
+02)--WindowAggr: windowExpr=[[row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+03)----TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)ProjectionExec: expr=[f_dkey@2 as f_dkey, timestamp@0 as timestamp, value@1 as value, row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rn]
+02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [fact_table_ordered.f_dkey, date_bin(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }\"),fact_table_ordered.timestamp)] ORDER BY [fact_table_ordered.timestamp ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet
+
+# Verify results match with subset satisfaction
+query TPRI rowsort
+SELECT f_dkey, timestamp, value,
+       ROW_NUMBER() OVER (
+           PARTITION BY f_dkey, date_bin(INTERVAL '30 seconds', timestamp)
+           ORDER BY timestamp
+       ) as rn
+FROM fact_table_ordered
+WHERE timestamp < TIMESTAMP '2023-01-01T09:00:30';
+----
+A 2023-01-01T09:00:00 95.5 1
+A 2023-01-01T09:00:10 102.3 2
+A 2023-01-01T09:00:20 98.7 3
+B 2023-01-01T09:00:00 75.2 1
+B 2023-01-01T09:00:10 82.4 2
+B 2023-01-01T09:00:20 78.9 3
+C 2023-01-01T09:00:00 300.5 1
+C 2023-01-01T09:00:10 285.7 2
+C 2023-01-01T09:00:20 310.2 3
+
+##########
+# TEST 3: Complex Join and Aggregate with Subset Partitioning
+# Demonstrates subset partitioning with joins and nested aggregations
+##########
+
+# With subset repartitioning forced (disables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 4;
+
+query TT
+EXPLAIN SELECT env, time_bin, AVG(max_bin_value) AS avg_max_value
+FROM
+(
+    SELECT  f_dkey,
+            date_bin(INTERVAL '30 seconds', timestamp) AS time_bin,
+            env,
+            MAX(value) AS max_bin_value
+    FROM
+        (
+        SELECT
+            f.f_dkey,
+            d.env,
+            d.service,
+            d.host,
+            f.timestamp,
+            f.value
+        FROM dimension_table d
+        INNER JOIN fact_table_ordered f ON d.d_dkey = f.f_dkey
+        WHERE service = 'log'
+        ) AS j
+    GROUP BY f_dkey, time_bin, env
+) AS a
+GROUP BY env, time_bin
+ORDER BY env, time_bin;
+----
+logical_plan
+01)Sort: a.env ASC NULLS LAST, a.time_bin ASC NULLS LAST
+02)--Projection: a.env, a.time_bin, avg(a.max_bin_value) AS avg_max_value
+03)----Aggregate: groupBy=[[a.env, a.time_bin]], aggr=[[avg(a.max_bin_value)]]
+04)------SubqueryAlias: a
+05)--------Projection: date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp) AS time_bin, j.env, max(j.value) AS max_bin_value
+06)----------Aggregate: groupBy=[[j.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), j.timestamp), j.env]], aggr=[[max(j.value)]]
+07)------------SubqueryAlias: j
+08)--------------Projection: f.f_dkey, d.env, f.timestamp, f.value
+09)----------------Inner Join: d.d_dkey = f.f_dkey
+10)------------------SubqueryAlias: d
+11)--------------------Projection: dimension_table.env, dimension_table.d_dkey
+12)----------------------Filter: dimension_table.service = Utf8View("log")
+13)------------------------TableScan: dimension_table projection=[env, service, d_dkey], partial_filters=[dimension_table.service = Utf8View("log")]
+14)------------------SubqueryAlias: f
+15)--------------------TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST]
+02)--SortExec: expr=[env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[env@0 as env, time_bin@1 as time_bin, avg(a.max_bin_value)@2 as avg_max_value]
+04)------AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, time_bin@1 as time_bin], aggr=[avg(a.max_bin_value)]
+05)--------RepartitionExec: partitioning=Hash([env@0, time_bin@1], 3), input_partitions=3
+06)----------AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)]
+07)------------ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value]
+08)--------------AggregateExec: mode=FinalPartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@2 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1])
+09)----------------SortExec: expr=[f_dkey@0 ASC NULLS LAST, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 ASC NULLS LAST], preserve_partitioning=[true]
+10)------------------RepartitionExec: partitioning=Hash([f_dkey@0, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1, env@2], 3), input_partitions=3
+11)--------------------AggregateExec: mode=Partial, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1])
+12)----------------------ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value]
+13)------------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4]
+14)--------------------------CoalescePartitionsExec
+15)----------------------------FilterExec: service@1 = log, projection=[env@0, d_dkey@2]
+16)------------------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=A/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=C/data.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)]
+17)--------------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify results without subset satisfaction
+query TPR rowsort
+SELECT env, time_bin, AVG(max_bin_value) AS avg_max_value
+FROM
+(
+    SELECT  f_dkey,
+            date_bin(INTERVAL '30 seconds', timestamp) AS time_bin,
+            env,
+            MAX(value) AS max_bin_value
+    FROM
+        (
+        SELECT
+            f.f_dkey,
+            d.env,
+            d.service,
+            d.host,
+            f.timestamp,
+            f.value
+        FROM dimension_table d
+        INNER JOIN fact_table_ordered f ON d.d_dkey = f.f_dkey
+        WHERE service = 'log'
+        ) AS j
+    GROUP BY f_dkey, time_bin, env
+) AS a
+GROUP BY env, time_bin
+ORDER BY env, time_bin;
+----
+dev 2023-01-01T09:00:00 102.3
+dev 2023-01-01T09:12:00 105.1
+dev 2023-01-01T09:12:30 150
+prod 2023-01-01T09:00:00 196.3
+prod 2023-01-01T09:00:30 192.8
+prod 2023-01-01T09:12:30 197.7
+
+# With subset logic enabled (default - enables subset optimization)
+statement ok
+set datafusion.optimizer.subset_repartition_threshold = 1;
+
+query TT
+EXPLAIN SELECT env, time_bin, AVG(max_bin_value) AS avg_max_value
+FROM
+(
+    SELECT  f_dkey,
+            date_bin(INTERVAL '30 seconds', timestamp) AS time_bin,
+            env,
+            MAX(value) AS max_bin_value
+    FROM
+        (
+        SELECT
+            f.f_dkey,
+            d.env,
+            d.service,
+            d.host,
+            f.timestamp,
+            f.value
+        FROM dimension_table d
+        INNER JOIN fact_table_ordered f ON d.d_dkey = f.f_dkey
+        WHERE service = 'log'
+        ) AS j
+    GROUP BY f_dkey, time_bin, env
+) AS a
+GROUP BY env, time_bin
+ORDER BY env, time_bin;
+----
+logical_plan
+01)Sort: a.env ASC NULLS LAST, a.time_bin ASC NULLS LAST
+02)--Projection: a.env, a.time_bin, avg(a.max_bin_value) AS avg_max_value
+03)----Aggregate: groupBy=[[a.env, a.time_bin]], aggr=[[avg(a.max_bin_value)]]
+04)------SubqueryAlias: a
+05)--------Projection: date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp) AS time_bin, j.env, max(j.value) AS max_bin_value
+06)----------Aggregate: groupBy=[[j.f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"), j.timestamp), j.env]], aggr=[[max(j.value)]]
+07)------------SubqueryAlias: j
+08)--------------Projection: f.f_dkey, d.env, f.timestamp, f.value
+09)----------------Inner Join: d.d_dkey = f.f_dkey
+10)------------------SubqueryAlias: d
+11)--------------------Projection: dimension_table.env, dimension_table.d_dkey
+12)----------------------Filter: dimension_table.service = Utf8View("log")
+13)------------------------TableScan: dimension_table projection=[env, service, d_dkey], partial_filters=[dimension_table.service = Utf8View("log")]
+14)------------------SubqueryAlias: f
+15)--------------------TableScan: fact_table_ordered projection=[timestamp, value, f_dkey]
+physical_plan
+01)SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST]
+02)--SortExec: expr=[env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[env@0 as env, time_bin@1 as time_bin, avg(a.max_bin_value)@2 as avg_max_value]
+04)------AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, time_bin@1 as time_bin], aggr=[avg(a.max_bin_value)]
+05)--------RepartitionExec: partitioning=Hash([env@0, time_bin@1], 3), input_partitions=3
+06)----------AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)]
+07)------------ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value]
+08)--------------AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1])
+09)----------------ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value]
+10)------------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4]
+11)--------------------CoalescePartitionsExec
+12)----------------------FilterExec: service@1 = log, projection=[env@0, d_dkey@2]
+13)------------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=A/data.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=D/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/dimension/d_dkey=C/data.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)]
+14)--------------------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=A/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=B/data.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_subset_satisfaction/fact/f_dkey=C/data.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Verify results match with subset satisfaction
+query TPR rowsort
+SELECT env, time_bin, AVG(max_bin_value) AS avg_max_value
+FROM
+(
+    SELECT  f_dkey,
+            date_bin(INTERVAL '30 seconds', timestamp) AS time_bin,
+            env,
+            MAX(value) AS max_bin_value
+    FROM
+        (
+        SELECT
+            f.f_dkey,
+            d.env,
+            d.service,
+            d.host,
+            f.timestamp,
+            f.value
+        FROM dimension_table d
+        INNER JOIN fact_table_ordered f ON d.d_dkey = f.f_dkey
+        WHERE service = 'log'
+        ) AS j
+    GROUP BY f_dkey, time_bin, env
+) AS a
+GROUP BY env, time_bin
+ORDER BY env, time_bin;
+----
+dev 2023-01-01T09:00:00 102.3
+dev 2023-01-01T09:12:00 105.1
+dev 2023-01-01T09:12:30 150
+prod 2023-01-01T09:00:00 196.3
+prod 2023-01-01T09:00:30 192.8
+prod 2023-01-01T09:12:30 197.7
+
+##########
+# CLEANUP
+##########
+
+statement ok
+DROP TABLE fact_table_ordered;
+
+statement ok
+DROP TABLE dimension_table;
diff --git a/datafusion/sqllogictest/test_files/run_end_encoded.slt b/datafusion/sqllogictest/test_files/run_end_encoded.slt
new file mode 100644
index 0000000000000..1f0a9b4eb3fd8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/run_end_encoded.slt
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for Run-End Encoded (REE) array support in aggregations
+# This tests that REE arrays can be used as GROUP BY keys (requires proper hashing support)
+
+# Create a table with REE-encoded sensor IDs using arrow_cast
+# First create primitive arrays, then cast to REE in a second step
+statement ok
+CREATE TABLE sensor_readings AS
+WITH raw_data AS (
+  SELECT * FROM (
+    VALUES
+      ('sensor_A', 22),
+      ('sensor_A', 23),
+      ('sensor_B', 20),
+      ('sensor_A', 24)
+  ) AS t(sensor_id, temperature)
+)
+SELECT
+  arrow_cast(sensor_id, 'RunEndEncoded("run_ends": non-null Int32, "values": Utf8)') AS sensor_id,
+  temperature
+FROM raw_data;
+
+# Test basic aggregation with REE column as GROUP BY key
+query ?RI rowsort
+SELECT
+    sensor_id,
+    AVG(temperature) AS avg_temp,
+    COUNT(*) AS reading_count
+FROM sensor_readings
+GROUP BY sensor_id;
+----
+sensor_A 23 3
+sensor_B 20 1
+
+# Test DISTINCT with REE column
+query ? rowsort
+SELECT DISTINCT sensor_id
+FROM sensor_readings;
+----
+sensor_A
+sensor_B
diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt
index f583d659fd4f5..e91ec1cb848ba 100644
--- a/datafusion/sqllogictest/test_files/scalar.slt
+++ b/datafusion/sqllogictest/test_files/scalar.slt
@@ -309,6 +309,62 @@ select ceil(a), ceil(b), ceil(c) from small_floats;
 1 0 0
 1 0 1
 
+# ceil with scale parameter(Scale not supported)
+query error DataFusion error: This feature is not implemented: CEIL with scale is not supported
+select ceil(100.1234, 1)
+
+# ceil with datetime parameter (not supported)
+query error DataFusion error: This feature is not implemented: CEIL with datetime is not supported
+select ceil(100.1234 to year)
+
+# ceil with decimal argument
+query RRRR
+select
+  ceil(arrow_cast(1.23,'Decimal128(10,2)')),
+  ceil(arrow_cast(-1.23,'Decimal128(10,2)')),
+  ceil(arrow_cast(123.00,'Decimal128(10,2)')),
+  ceil(arrow_cast(-123.00,'Decimal128(10,2)'));
+----
+2 -1 123 -123
+
+# ceil overflow with limited precision
+query error Decimal overflow while applying ceil
+select ceil(arrow_cast(9.23,'Decimal128(3,2)'));
+
+# ceil with decimal32 argument (ensure decimal output)
+query TTTTTTTT
+select
+  arrow_typeof(ceil(arrow_cast(9.01,'Decimal32(7,2)'))),
+  arrow_cast(ceil(arrow_cast(9.01,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(ceil(arrow_cast(-9.01,'Decimal32(7,2)'))),
+  arrow_cast(ceil(arrow_cast(-9.01,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(ceil(arrow_cast(10.00,'Decimal32(7,2)'))),
+  arrow_cast(ceil(arrow_cast(10.00,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(ceil(arrow_cast(-0.99,'Decimal32(7,2)'))),
+  arrow_cast(ceil(arrow_cast(-0.99,'Decimal32(7,2)')), 'Utf8');
+----
+Decimal32(7, 2) 10.00 Decimal32(7, 2) -9.00 Decimal32(7, 2) 10.00 Decimal32(7, 2) 0.00
+
+# ceil with decimal64 zero scale
+query TTTT
+select
+  arrow_typeof(ceil(arrow_cast(123456789,'Decimal64(18,0)'))),
+  arrow_cast(ceil(arrow_cast(123456789,'Decimal64(18,0)')), 'Utf8'),
+  arrow_typeof(ceil(arrow_cast(-987654321,'Decimal64(18,0)'))),
+  arrow_cast(ceil(arrow_cast(-987654321,'Decimal64(18,0)')), 'Utf8');
+----
+Decimal64(18, 0) 123456789 Decimal64(18, 0) -987654321
+
+# ceil with decimal256 argument
+query TTTT
+select
+  arrow_typeof(ceil(arrow_cast('9999999999999999999999999999999999.01','Decimal256(38,2)'))),
+  arrow_cast(ceil(arrow_cast('9999999999999999999999999999999999.01','Decimal256(38,2)')), 'Utf8'),
+  arrow_typeof(ceil(arrow_cast('-9999999999999999999999999999999999.01','Decimal256(38,2)'))),
+  arrow_cast(ceil(arrow_cast('-9999999999999999999999999999999999.01','Decimal256(38,2)')), 'Utf8');
+----
+Decimal256(38, 2) 10000000000000000000000000000000000.00 Decimal256(38, 2) -9999999999999999999999999999999999.00
+
 ## degrees
 
 # degrees scalar function
@@ -448,6 +504,62 @@ select floor(a), floor(b), floor(c) from signed_integers;
 2 -1000 123
 4 NULL NULL
 
+# floor with scale parameter(Scale not supported)
+query error DataFusion error: This feature is not implemented: FLOOR with scale is not supported
+select floor(a, 1)
+
+# floor with datetime parameter ( not supported)
+query error DataFusion error: This feature is not implemented: FLOOR with datetime is not supported
+select floor(a to year)
+
+# floor with decimal argument
+query RRRR
+select
+  floor(arrow_cast(1.23,'Decimal128(10,2)')),
+  floor(arrow_cast(-1.23,'Decimal128(10,2)')),
+  floor(arrow_cast(123.00,'Decimal128(10,2)')),
+  floor(arrow_cast(-123.00,'Decimal128(10,2)'));
+----
+1 -2 123 -123
+
+# floor overflow with limited precision
+query error Decimal overflow while applying floor
+select floor(arrow_cast(-9.23,'Decimal128(3,2)'));
+
+# floor with decimal32 argument (ensure decimal output)
+query TTTTTTTT
+select
+  arrow_typeof(floor(arrow_cast(9.99,'Decimal32(7,2)'))),
+  arrow_cast(floor(arrow_cast(9.99,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(floor(arrow_cast(-9.01,'Decimal32(7,2)'))),
+  arrow_cast(floor(arrow_cast(-9.01,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(floor(arrow_cast(10.00,'Decimal32(7,2)'))),
+  arrow_cast(floor(arrow_cast(10.00,'Decimal32(7,2)')), 'Utf8'),
+  arrow_typeof(floor(arrow_cast(-0.01,'Decimal32(7,2)'))),
+  arrow_cast(floor(arrow_cast(-0.01,'Decimal32(7,2)')), 'Utf8');
+----
+Decimal32(7, 2) 9.00 Decimal32(7, 2) -10.00 Decimal32(7, 2) 10.00 Decimal32(7, 2) -1.00
+
+# floor with decimal64 zero scale
+query TTTT
+select
+  arrow_typeof(floor(arrow_cast(123456789,'Decimal64(18,0)'))),
+  arrow_cast(floor(arrow_cast(123456789,'Decimal64(18,0)')), 'Utf8'),
+  arrow_typeof(floor(arrow_cast(-987654321,'Decimal64(18,0)'))),
+  arrow_cast(floor(arrow_cast(-987654321,'Decimal64(18,0)')), 'Utf8');
+----
+Decimal64(18, 0) 123456789 Decimal64(18, 0) -987654321
+
+# floor with decimal256 argument
+query TTTT
+select
+  arrow_typeof(floor(arrow_cast('9999999999999999999999999999999999.99','Decimal256(38,2)'))),
+  arrow_cast(floor(arrow_cast('9999999999999999999999999999999999.99','Decimal256(38,2)')), 'Utf8'),
+  arrow_typeof(floor(arrow_cast('-9999999999999999999999999999999999.99','Decimal256(38,2)'))),
+  arrow_cast(floor(arrow_cast('-9999999999999999999999999999999999.99','Decimal256(38,2)')), 'Utf8');
+----
+Decimal256(38, 2) 9999999999999999999999999999999999.00 Decimal256(38, 2) -10000000000000000000000000000000000.00
+
 ## ln
 
 # ln scalar function
@@ -523,7 +635,7 @@ query RRR rowsort
 select log(a, 64) a, log(b), log(10, b) from unsigned_integers;
 ----
 3 NULL NULL
-3.7855785 4 4
+3.785578521429 4 4
 6 3 3
 Infinity 2 2
 
@@ -653,11 +765,11 @@ select nanvl(null, 64);
 ----
 NULL
 
-# nanvl scalar nulls #1
+# nanvl scalar nulls #1 - x is not NaN, so return x even if y is NULL
 query R rowsort
 select nanvl(2, null);
 ----
-NULL
+2
 
 # nanvl scalar nulls #2
 query R rowsort
@@ -730,26 +842,26 @@ select pi(), pi() / 2, pi() / 3;
 
 ## power
 
-# power scalar function
-query III rowsort
+# power scalar function (always returns Float64, like PostgreSQL)
+query RRR rowsort
 select power(2, 0), power(2, 1), power(2, 2);
 ----
 1 2 4
 
 # power scalar nulls
-query I rowsort
+query R rowsort
 select power(null, 64);
 ----
 NULL
 
 # power scalar nulls #1
-query I rowsort
+query R rowsort
 select power(2, null);
 ----
 NULL
 
 # power scalar nulls #2
-query I rowsort
+query R rowsort
 select power(null, null);
 ----
 NULL
@@ -811,13 +923,103 @@ select round(a), round(b), round(c) from small_floats;
 
 # round with too large
 #  max Int32 is 2147483647
-query error DataFusion error: Execution error: Invalid values for decimal places: Cast error: Can't cast value 2147483648 to type Int32
+query error round decimal_places 2147483648 is out of supported i32 range
 select round(3.14, 2147483648);
 
 # with array
-query error DataFusion error: Execution error: Invalid values for decimal places: Cast error: Can't cast value 2147483649 to type Int32
+query error Arrow error: Cast error: Can't cast value 2147483649 to type Int32
 select round(column1, column2) from values (3.14, 2), (3.14, 3), (3.14, 2147483649);
 
+# round decimal should not cast to float
+# scale reduces to match decimal_places
+query TR
+select arrow_typeof(round('173975140545.855'::decimal(38,10), 2)),
+       round('173975140545.855'::decimal(38,10), 2);
+----
+Decimal128(38, 2) 173975140545.86
+
+# round decimal ties away from zero
+query RRRR
+select round('1.5'::decimal(2,1), 0),
+       round('-1.5'::decimal(2,1), 0),
+       round('2.5'::decimal(2,1), 0),
+       round('-2.5'::decimal(2,1), 0);
+----
+2 -2 3 -3
+
+# round decimal negative places (left of decimal)
+query TR
+select arrow_typeof(round('12345.55'::decimal(10,2), -1)),
+       round('12345.55'::decimal(10,2), -1);
+----
+Decimal128(10, 0) 12350
+
+# round decimal scale 0 negative places (carry can require extra precision)
+query TR
+select arrow_typeof(round('99'::decimal(2,0), -1)),
+       round('99'::decimal(2,0), -1);
+----
+Decimal128(3, 0) 100
+
+# round decimal256 keeps decimals
+query TR
+select arrow_typeof(round('1234.5678'::decimal(50,4), 2)),
+       round('1234.5678'::decimal(50,4), 2);
+----
+Decimal256(50, 2) 1234.57
+
+# round decimal with carry-over (reduce scale)
+# Scale reduces from 1 to 0, allowing extra digit for carry-over
+query TRRR
+select arrow_typeof(round('999.9'::decimal(4,1))),
+       round('999.9'::decimal(4,1)),
+       round('-999.9'::decimal(4,1)),
+       round('99.99'::decimal(4,2));
+----
+Decimal128(4, 0) 1000 -1000 100
+
+# round decimal with carry-over and non-literal decimal_places (increase precision)
+# Scale can't be reduced when decimal_places isn't a constant, so precision increases.
+query TR
+select arrow_typeof(round(val, dp)), round(val, dp)
+from (values (cast('999.9' as decimal(4,1)), 0)) as t(val, dp);
+----
+Decimal128(5, 1) 1000
+
+# round decimal at max precision now works (scale reduction handles overflow)
+query TR
+select arrow_typeof(round('9999999999999999999999999999999999999.9'::decimal(38,1))),
+       round('9999999999999999999999999999999999999.9'::decimal(38,1));
+----
+Decimal128(38, 0) 10000000000000000000000000000000000000
+
+# round decimal at max precision with non-literal decimal_places can overflow
+query error Decimal overflow: rounded value exceeds precision 38
+select round(val, dp)
+from (values (cast('9999999999999999999999999999999999999.9' as decimal(38,1)), 0)) as t(val, dp);
+
+# round decimal with negative scale
+query TRRR
+select arrow_typeof(round(cast(500 as decimal(10,-2)), -3)),
+       round(cast(500 as decimal(10,-2)), -3),
+       round(cast(400 as decimal(10,-2)), -3),
+       round(cast(-500 as decimal(10,-2)), -3);
+----
+Decimal128(10, -3) 1000 0 -1000
+
+# round decimal with negative scale and carry-over
+query TR
+select arrow_typeof(round(cast(999999999900 as decimal(10,-2)), -3)),
+       round(cast(999999999900 as decimal(10,-2)), -3);
+----
+Decimal128(10, -3) 1000000000000
+
+# round decimal with very small decimal_places (i32::MIN) should not error
+query TR
+select arrow_typeof(round('123.45'::decimal(5,2), -2147483648)),
+       round('123.45'::decimal(5,2), -2147483648);
+----
+Decimal128(5, 0) 0
 
 ## signum
 
@@ -1023,7 +1225,7 @@ from small_floats;
 ----
 0.447 0.4 0.447
 0.707 0.7 0.707
-0.837 0.8 0.837
+0.836 0.8 0.836
 1 1 1
 
 ## bitwise and
@@ -1169,6 +1371,14 @@ select a << b, c << d, e << f from signed_integers;
 33554432 123 10485760
 NULL NULL NULL
 
+## bitwise operations should reject non-integer types
+
+query error DataFusion error: Error during planning: Cannot infer common type for bitwise operation Float32 & Float32
+select arrow_cast(1, 'Float32') & arrow_cast(2, 'Float32');
+
+query error DataFusion error: Error during planning: Cannot infer common type for bitwise operation Date32 & Date32
+select arrow_cast(1, 'Date32') & arrow_cast(2, 'Date32');
+
 statement ok
 drop table unsigned_integers;
 
@@ -1759,7 +1969,7 @@ CREATE TABLE test(
 (-14, -14, -14.5, -14.5),
 (NULL, NULL, NULL, NULL);
 
-query IRRRIR rowsort
+query RRRRRR rowsort
 SELECT power(i32, exp_i) as power_i32,
        power(i64, exp_f) as power_i64,
        pow(f32, exp_i) as power_f32,
@@ -1832,7 +2042,7 @@ query TT
 EXPLAIN SELECT letter, letter = LEFT('APACHE', 1) FROM simple_string;
 ----
 logical_plan
-01)Projection: simple_string.letter, simple_string.letter = Utf8("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1))
+01)Projection: simple_string.letter, simple_string.letter = Utf8View("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1))
 02)--TableScan: simple_string projection=[letter]
 physical_plan
 01)ProjectionExec: expr=[letter@0 as letter, letter@0 = A as simple_string.letter = left(Utf8("APACHE"),Int64(1))]
@@ -1867,9 +2077,9 @@ D false
 
 # test string_temporal_coercion
 query BBBBBBBBBB
-select 
-  arrow_cast(to_timestamp('2020-01-01 01:01:11.1234567890Z'), 'Timestamp(Second, None)') == '2020-01-01T01:01:11',
-  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, None)') == arrow_cast('2020-01-02T01:01:11', 'LargeUtf8'),
+select
+  arrow_cast(to_timestamp('2020-01-01 01:01:11.1234567890Z'), 'Timestamp(s)') == '2020-01-01T01:01:11',
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(s)') == arrow_cast('2020-01-02T01:01:11', 'LargeUtf8'),
   arrow_cast(to_timestamp('2020-01-03 01:01:11.1234567890Z'), 'Time32(Second)') == '01:01:11',
   arrow_cast(to_timestamp('2020-01-04 01:01:11.1234567890Z'), 'Time32(Second)') == arrow_cast('01:01:11', 'LargeUtf8'),
   arrow_cast(to_timestamp('2020-01-05 01:01:11.1234567890Z'), 'Time64(Microsecond)') == '01:01:11.123456',
@@ -1927,7 +2137,7 @@ select position('' in '')
 ----
 1
 
-query error DataFusion error: Error during planning: Internal error: Expect TypeSignatureClass::Native\(LogicalType\(Native\(String\), String\)\) but received NativeType::Int64, DataType: Int64
+query error DataFusion error: Error during planning: Function 'strpos' requires String, but received Int64 \(DataType: Int64\).
 select position(1 in 1)
 
 query I
diff --git a/datafusion/sqllogictest/test_files/schema_evolution.slt b/datafusion/sqllogictest/test_files/schema_evolution.slt
index 5572c4a5ffef3..e29aa14f13e92 100644
--- a/datafusion/sqllogictest/test_files/schema_evolution.slt
+++ b/datafusion/sqllogictest/test_files/schema_evolution.slt
@@ -138,3 +138,147 @@ select * from parquet_table where c > 11.0;
 ----
 bzz 300 13.7
 foo 200 12.6
+
+##########
+# Projection tests - selecting subset of columns
+# These tests verify column reordering and projection work correctly
+# with schema evolution (addresses E2E column reordering concern)
+##########
+
+# Select only column a
+query T rowsort
+select a from parquet_table;
+----
+NULL
+bzz
+foo
+foo
+foo
+foo
+foo
+foo
+foo
+
+# Select columns in different order than table schema (c, a instead of a, b, c)
+query RT rowsort
+select c, a from parquet_table;
+----
+10.5 foo
+12.6 foo
+13.7 bzz
+NULL NULL
+NULL foo
+NULL foo
+NULL foo
+NULL foo
+NULL foo
+
+# Select single column that's missing in some files
+query I rowsort
+select b from parquet_table;
+----
+1
+10
+100
+2
+200
+3
+300
+NULL
+NULL
+
+##########
+# Projection with filter tests
+##########
+
+# Projection with equality filter
+query TI rowsort
+select a, b from parquet_table where a = 'foo';
+----
+foo 1
+foo 100
+foo 2
+foo 200
+foo 3
+foo NULL
+foo NULL
+
+# Projection with range filter on projected column
+query IR rowsort
+select b, c from parquet_table where b > 5;
+----
+10 NULL
+100 10.5
+200 12.6
+300 13.7
+
+# Projection excluding filtered column (filter on c, project a)
+query T rowsort
+select a from parquet_table where c > 11.0;
+----
+bzz
+foo
+
+##########
+# Complex filter tests - OR combinations and IS NOT NULL
+##########
+
+# OR combination
+query TIR rowsort
+select * from parquet_table where a = 'foo' OR b > 100;
+----
+bzz 300 13.7
+foo 1 NULL
+foo 100 10.5
+foo 2 NULL
+foo 200 12.6
+foo 3 NULL
+foo NULL NULL
+foo NULL NULL
+
+# IS NOT NULL on column a
+query TIR rowsort
+select * from parquet_table where a IS NOT NULL;
+----
+bzz 300 13.7
+foo 1 NULL
+foo 100 10.5
+foo 2 NULL
+foo 200 12.6
+foo 3 NULL
+foo NULL NULL
+foo NULL NULL
+
+# IS NOT NULL on column c (missing in most files)
+query TIR rowsort
+select * from parquet_table where c IS NOT NULL;
+----
+bzz 300 13.7
+foo 100 10.5
+foo 200 12.6
+
+# Combined conditions with NULL checks
+query TIR rowsort
+select * from parquet_table where a IS NULL OR (b IS NOT NULL AND b > 5);
+----
+NULL 10 NULL
+bzz 300 13.7
+foo 100 10.5
+foo 200 12.6
+
+##########
+# Multi-column predicates
+##########
+
+# AND across columns with different availability
+query TIR rowsort
+select * from parquet_table where a = 'foo' AND b > 50;
+----
+foo 100 10.5
+foo 200 12.6
+
+# Filter on multiple columns from reordered file (File4 has b, a, c order)
+query TIR rowsort
+select * from parquet_table where b = 100 AND c = 10.5;
+----
+foo 100 10.5
diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt
index aa14faf984e40..553ccb74dedb1 100644
--- a/datafusion/sqllogictest/test_files/select.slt
+++ b/datafusion/sqllogictest/test_files/select.slt
@@ -408,7 +408,7 @@ VALUES (1,2,3,4,5,6,7,8,9,10,11,12,13,NULL,'F',3.5)
 
 # Test non-literal expressions in VALUES
 query II
-VALUES (1, CASE WHEN RANDOM() > 0.5 THEN 1 ELSE 1 END), 
+VALUES (1, CASE WHEN RANDOM() > 0.5 THEN 1 ELSE 1 END),
        (2, CASE WHEN RANDOM() > 0.5 THEN 2 ELSE 2 END);
 ----
 1 1
@@ -558,7 +558,7 @@ EXPLAIN SELECT * FROM ((SELECT column1 FROM foo) "T1" CROSS JOIN (SELECT column2
 ----
 logical_plan
 01)SubqueryAlias: F
-02)--Cross Join: 
+02)--Cross Join:
 03)----SubqueryAlias: T1
 04)------TableScan: foo projection=[column1]
 05)----SubqueryAlias: T2
@@ -820,7 +820,7 @@ SELECT ALL c1 FROM aggregate_simple order by c1
 0.00005
 0.00005
 
-# select distinct
+# SELECT DISTINCT
 query RRB rowsort
 SELECT DISTINCT * FROM aggregate_simple
 ----
@@ -830,6 +830,31 @@ SELECT DISTINCT * FROM aggregate_simple
 0.00004 0.000000000004 false
 0.00005 0.000000000005 true
 
+# select ALL (inverse of distinct)
+query RRB rowsort
+SELECT ALL * FROM aggregate_simple;
+----
+0.00001 0.000000000001 true
+0.00002 0.000000000002 false
+0.00002 0.000000000002 false
+0.00003 0.000000000003 true
+0.00003 0.000000000003 true
+0.00003 0.000000000003 true
+0.00004 0.000000000004 false
+0.00004 0.000000000004 false
+0.00004 0.000000000004 false
+0.00004 0.000000000004 false
+0.00005 0.000000000005 true
+0.00005 0.000000000005 true
+0.00005 0.000000000005 true
+0.00005 0.000000000005 true
+0.00005 0.000000000005 true
+
+
+# select distinct all (
+query error DataFusion error: SQL error: ParserError\("Cannot specify DISTINCT then ALL at Line: 1, Column: 8"\)
+SELECT DISTINCT ALL * FROM aggregate_simple
+
 # select distinct with projection and order by
 query R
 SELECT DISTINCT c1 FROM aggregate_simple order by c1
@@ -1404,7 +1429,7 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[a@0 as a, a@0 + b@1 as annotated_data_finite2.a + annotated_data_finite2.b]
-03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # since query below doesn't computation
@@ -1420,9 +1445,7 @@ logical_plan
 01)Sort: annotated_data_finite2.a ASC NULLS LAST
 02)--Projection: annotated_data_finite2.a, annotated_data_finite2.b, Int64(2)
 03)----TableScan: annotated_data_finite2 projection=[a, b]
-physical_plan
-01)ProjectionExec: expr=[a@0 as a, b@1 as b, 2 as Int64(2)]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, 2 as Int64(2)], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true
 
 # source is ordered by a,b,c
 # when filter result is constant for column a
@@ -1440,10 +1463,9 @@ logical_plan
 03)----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0)]
 physical_plan
 01)SortPreservingMergeExec: [b@2 ASC NULLS LAST, c@3 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: a@1 = 0
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--FilterExec: a@1 = 0
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # source is ordered by a,b,c
 # when filter result is constant for column a and b
@@ -1461,10 +1483,9 @@ logical_plan
 03)----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)]
 physical_plan
 01)SortPreservingMergeExec: [c@3 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: a@1 = 0 AND b@2 = 0
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--FilterExec: a@1 = 0 AND b@2 = 0
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # source is ordered by a,b,c
 # when filter result is constant for column a and b
@@ -1482,10 +1503,9 @@ logical_plan
 03)----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)]
 physical_plan
 01)SortPreservingMergeExec: [b@2 ASC NULLS LAST, c@3 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: a@1 = 0 AND b@2 = 0
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--FilterExec: a@1 = 0 AND b@2 = 0
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # source is ordered by a,b,c
 # when filter result is constant for column a and b
@@ -1503,10 +1523,9 @@ logical_plan
 03)----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)]
 physical_plan
 01)SortPreservingMergeExec: [a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: a@1 = 0 AND b@2 = 0
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--FilterExec: a@1 = 0 AND b@2 = 0
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # source is ordered by a,b,c
 # when filter result is when filter contains or
@@ -1525,10 +1544,9 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c@3 ASC NULLS LAST]
 02)--SortExec: expr=[c@3 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------FilterExec: a@1 = 0 OR b@2 = 0
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+03)----FilterExec: a@1 = 0 OR b@2 = 0
+04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 # When ordering lost during projection, we shouldn't keep the SortExec.
 # in the final physical plan.
@@ -1550,13 +1568,12 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c2@0 as c2, count(Int64(1))@1 as count(*)]
 02)--AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[count(Int64(1))]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------RepartitionExec: partitioning=Hash([c2@0], 2), input_partitions=2
-05)--------AggregateExec: mode=Partial, gby=[c2@0 as c2], aggr=[count(Int64(1))]
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------ProjectionExec: expr=[c2@0 as c2]
-08)--------------SortExec: TopK(fetch=4), expr=[c1@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[false]
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c1], file_type=csv, has_header=true
+03)----RepartitionExec: partitioning=Hash([c2@0], 2), input_partitions=2
+04)------AggregateExec: mode=Partial, gby=[c2@0 as c2], aggr=[count(Int64(1))]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+06)----------ProjectionExec: expr=[c2@0 as c2]
+07)------------SortExec: TopK(fetch=4), expr=[c1@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[false]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c1], file_type=csv, has_header=true
 
 # FilterExec can track equality of non-column expressions.
 # plan below shouldn't have a SortExec because given column 'a' is ordered.
@@ -1573,10 +1590,9 @@ logical_plan
 03)----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[CAST(round(CAST(annotated_data_finite2.b AS Float64)) AS Int32) = annotated_data_finite2.a]
 physical_plan
 01)SortPreservingMergeExec: [CAST(round(CAST(b@2 AS Float64)) AS Int32) ASC NULLS LAST]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----FilterExec: CAST(round(CAST(b@2 AS Float64)) AS Int32) = a@1
-04)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
+02)--FilterExec: CAST(round(CAST(b@2 AS Float64)) AS Int32) = a@1
+03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 
 statement ok
@@ -1641,7 +1657,7 @@ query II
 SELECT
 CASE WHEN B.x > 0 THEN A.x / B.x ELSE 0 END AS value1,
 CASE WHEN B.x > 0 AND B.y > 0 THEN A.x / B.x ELSE 0 END AS value3
-FROM t AS A, (SELECT * FROM t WHERE x = 0) AS B; 
+FROM t AS A, (SELECT * FROM t WHERE x = 0) AS B;
 ----
 0 0
 0 0
@@ -1656,10 +1672,10 @@ query TT
 explain select coalesce(1, y/x), coalesce(2, y/x) from t;
 ----
 logical_plan
-01)Projection: coalesce(Int64(1), CAST(t.y / t.x AS Int64)), coalesce(Int64(2), CAST(t.y / t.x AS Int64))
-02)--TableScan: t projection=[x, y]
+01)Projection: Int64(1) AS coalesce(Int64(1),t.y / t.x), Int64(2) AS coalesce(Int64(2),t.y / t.x)
+02)--TableScan: t projection=[]
 physical_plan
-01)ProjectionExec: expr=[coalesce(1, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(1),t.y / t.x), coalesce(2, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(2),t.y / t.x)]
+01)ProjectionExec: expr=[1 as coalesce(Int64(1),t.y / t.x), 2 as coalesce(Int64(2),t.y / t.x)]
 02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
@@ -1686,11 +1702,17 @@ physical_plan
 02)--ProjectionExec: expr=[y@1 = 0 as __common_expr_1, x@0 as x, y@1 as y]
 03)----DataSourceExec: partitions=1, partition_sizes=[1]
 
-# due to the reason describe in https://github.com/apache/datafusion/issues/8927,
-# the following queries will fail
-query error
+query II
 select coalesce(1, y/x), coalesce(2, y/x) from t;
+----
+1 2
+1 2
+1 2
+1 2
+1 2
 
+# due to the reason describe in https://github.com/apache/datafusion/issues/8927,
+# the following queries will fail
 query error
 SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t;
 
@@ -1769,7 +1791,7 @@ DROP TABLE test;
 query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from 'I AM NOT A TIMESTAMP': error parsing date
 SELECT to_timestamp('I AM NOT A TIMESTAMP');
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string '' to value of Int32 type
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string '' to value of Int32 type
 SELECT CAST('' AS int);
 
 # See issue: https://github.com/apache/datafusion/issues/8978
@@ -1871,3 +1893,70 @@ select *, count(*) over() as ta from t;
 
 statement count 0
 drop table t;
+
+# test "user" column
+# See https://github.com/apache/datafusion/issues/14141
+statement count 0
+create table t_with_user(a int, user text) as values (1,'test'), (2,null), (3,'foo');
+
+query T
+select t_with_user.user from t_with_user;
+----
+test
+NULL
+foo
+
+query IT
+select * from t_with_user where t_with_user.user = 'foo';
+----
+3 foo
+
+query T
+select user from t_with_user;
+----
+test
+NULL
+foo
+
+query IT
+select * from t_with_user where user = 'foo';
+----
+3 foo
+
+# test "current_time" column
+# See https://github.com/apache/datafusion/issues/14141
+statement count 0
+create table t_with_current_time(a int, current_time text) as values (1,'now'), (2,null), (3,'later');
+
+# here it's clear the column was meant
+query B
+select t_with_current_time.current_time is not null from t_with_current_time;
+----
+true
+false
+true
+
+# here it's the function
+query B
+select current_time is not null from t_with_current_time;
+----
+true
+true
+true
+
+# and here it's the column again
+query B
+select "current_time" is not null from t_with_current_time;
+----
+true
+false
+true
+
+# https://github.com/apache/datafusion/issues/20215
+statement count 0
+CREATE TABLE t0;
+
+query I
+SELECT COUNT(*) FROM t0 AS tt0 WHERE (4==(3/0));
+----
+0
diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt
index bb4ac920d0327..375d34925114e 100644
--- a/datafusion/sqllogictest/test_files/set_variable.slt
+++ b/datafusion/sqllogictest/test_files/set_variable.slt
@@ -244,3 +244,239 @@ SET TIME ZONE = 'Asia/Taipei2'
 
 statement error Arrow error: Parser error: Invalid timezone "Asia/Taipei2": failed to parse timezone
 SELECT '2000-01-01T00:00:00'::TIMESTAMP::TIMESTAMPTZ
+
+# reset variable restores default
+statement ok
+set datafusion.catalog.information_schema = true
+
+statement ok
+SET datafusion.execution.batch_size = 1024
+
+query TT
+SHOW datafusion.execution.batch_size
+----
+datafusion.execution.batch_size 1024
+
+statement ok
+RESET datafusion.execution.batch_size
+
+query TT
+SHOW datafusion.execution.batch_size
+----
+datafusion.execution.batch_size 8192
+
+# reset variable with NULL default
+statement ok
+set datafusion.catalog.information_schema = true
+
+statement ok
+SET datafusion.execution.parquet.max_predicate_cache_size = '123'
+
+query TT
+SHOW datafusion.execution.parquet.max_predicate_cache_size
+----
+datafusion.execution.parquet.max_predicate_cache_size 123
+
+statement ok
+RESET datafusion.execution.parquet.max_predicate_cache_size
+
+query TT
+SHOW datafusion.execution.parquet.max_predicate_cache_size
+----
+datafusion.execution.parquet.max_predicate_cache_size NULL
+
+# reset time zone via aliases
+statement ok
+set datafusion.catalog.information_schema = true
+
+statement ok
+SET TIMEZONE = '-03:00'
+
+statement ok
+RESET TIMEZONE
+
+query TT
+SHOW TIMEZONE
+----
+datafusion.execution.time_zone NULL
+
+statement ok
+SET TIME ZONE = '+09:00'
+
+statement ok
+RESET timezone
+
+query TT
+SHOW TIME ZONE
+----
+datafusion.execution.time_zone NULL
+
+# reset runtime variables
+statement ok
+SET datafusion.runtime.memory_limit = '1M'
+
+statement ok
+RESET datafusion.runtime.memory_limit
+
+statement ok
+SET datafusion.runtime.max_temp_directory_size = '1M'
+
+statement ok
+RESET datafusion.runtime.max_temp_directory_size
+
+statement ok
+SET datafusion.runtime.metadata_cache_limit = '1M'
+
+statement ok
+RESET datafusion.runtime.metadata_cache_limit
+
+statement ok
+SET datafusion.runtime.temp_directory = './'
+
+statement ok
+RESET datafusion.runtime.temp_directory
+
+# test memory limit effect
+statement ok
+SET datafusion.runtime.memory_limit = '1K'
+
+# This query should fail with low memory
+statement error Not enough memory to continue external sort
+EXPLAIN ANALYZE SELECT * FROM generate_series(1, 1000) AS t1(v1) ORDER BY v1
+
+statement ok
+RESET datafusion.runtime.memory_limit
+
+# This query should succeed after resetting memory limit
+statement ok
+EXPLAIN ANALYZE SELECT * FROM generate_series(1, 1000) AS t1(v1) ORDER BY v1
+
+statement ok
+SET datafusion.runtime.list_files_cache_limit = '1K'
+
+statement ok
+RESET datafusion.runtime.list_files_cache_limit
+
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '1m'
+
+statement ok
+RESET datafusion.runtime.list_files_cache_ttl
+
+# reset invalid variable - typo in namespace
+statement error DataFusion error: Invalid or Unsupported Configuration: Could not find config namespace "dataexplosion"
+RESET dataexplosion.execution.batch_size
+
+# reset invalid variable - wrong namespace prefix
+statement error DataFusion error: Invalid or Unsupported Configuration: Config value "exec" not found on ConfigOptions
+RESET datafusion.exec.batch_size
+
+# reset invalid variable - typo in field name
+statement error DataFusion error: Invalid or Unsupported Configuration: Config value "batches_size" not found on ExecutionOptions
+RESET datafusion.execution.batches_size
+
+# reset invalid variable - extra suffix on valid field
+statement error DataFusion error: Invalid or Unsupported Configuration: Config field is a scalar usize and does not have nested field "bar"
+RESET datafusion.execution.batch_size.bar
+
+############
+## Test runtime configuration variables
+############
+
+# Test SHOW runtime.memory_limit (default value)
+query TT
+SHOW datafusion.runtime.memory_limit
+----
+datafusion.runtime.memory_limit unlimited
+
+# Test SET and SHOW runtime.memory_limit
+statement ok
+SET datafusion.runtime.memory_limit = '100M'
+
+query TT
+SHOW datafusion.runtime.memory_limit
+----
+datafusion.runtime.memory_limit 100M
+
+# Test SET and SHOW runtime.max_temp_directory_size
+statement ok
+SET datafusion.runtime.max_temp_directory_size = '10G'
+
+query TT
+SHOW datafusion.runtime.max_temp_directory_size
+----
+datafusion.runtime.max_temp_directory_size 10G
+
+# Test SET and SHOW runtime.metadata_cache_limit
+statement ok
+SET datafusion.runtime.metadata_cache_limit = '200M'
+
+query TT
+SHOW datafusion.runtime.metadata_cache_limit
+----
+datafusion.runtime.metadata_cache_limit 200M
+
+# Test SET and SHOW runtime.list_files_cache_limit
+statement ok
+SET datafusion.runtime.list_files_cache_limit = '2M'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_limit
+----
+datafusion.runtime.list_files_cache_limit 2M
+
+# Test SET and SHOW runtime.list_files_cache_ttl
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '90s'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_ttl
+----
+datafusion.runtime.list_files_cache_ttl 1m30s
+
+# Note: runtime.temp_directory shows the actual temp directory path with a unique suffix,
+# so we cannot test the exact value. We verify it exists in information_schema instead.
+
+# Test that all runtime variables appear in information_schema.df_settings
+query T
+SELECT name FROM information_schema.df_settings WHERE name LIKE 'datafusion.runtime.%' ORDER BY name
+----
+datafusion.runtime.list_files_cache_limit
+datafusion.runtime.list_files_cache_ttl
+datafusion.runtime.max_temp_directory_size
+datafusion.runtime.memory_limit
+datafusion.runtime.metadata_cache_limit
+datafusion.runtime.temp_directory
+
+statement error DataFusion error: Error during planning: Unsupported value Null
+SET datafusion.runtime.memory_limit = NULL
+
+statement error DataFusion error: Error during planning: Unsupported value Null
+SET datafusion.runtime.list_files_cache_ttl = NULL
+
+statement error DataFusion error: Error during planning: Duration should not be empty or blank for 'datafusion.runtime.list_files_cache_ttl'
+SET datafusion.runtime.list_files_cache_ttl = ' '
+
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '18446744073709551615s'
+
+statement error DataFusion error: Error during planning: Failed to parse number from duration '18446744073709551616s' for 'datafusion.runtime.list_files_cache_ttl'
+SET datafusion.runtime.list_files_cache_ttl = '18446744073709551616s'
+
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '307445734561825860m'
+
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '307445734561825860m10s'
+
+statement error DataFusion error: Error during planning: Duration has overflowed allowed maximum limit due to 'mins \* 60' when setting 'datafusion\.runtime\.list_files_cache_ttl'
+SET datafusion.runtime.list_files_cache_ttl = '307445734561825861m'
+
+statement error DataFusion error: Error during planning: Duration has overflowed allowed maximum limit due to 'mins \* 60 \+ secs' when setting 'datafusion\.runtime\.list_files_cache_ttl'
+SET datafusion.runtime.list_files_cache_ttl = '307445734561825860m60s'
+
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '1m18446744073709551555s'
+
+statement error DataFusion error: Error during planning: Duration has overflowed allowed maximum limit due to 'mins \* 60 \+ secs' when setting 'datafusion\.runtime\.list_files_cache_ttl'
+SET datafusion.runtime.list_files_cache_ttl = '1m18446744073709551556s'
diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt
index 075ccafcfd2e0..58ec7a1b262c3 100644
--- a/datafusion/sqllogictest/test_files/simplify_expr.slt
+++ b/datafusion/sqllogictest/test_files/simplify_expr.slt
@@ -26,9 +26,8 @@ logical_plan
 01)Filter: t.a = Int32(3)
 02)--TableScan: t projection=[a]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: a@0 = 3
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: a@0 = 3
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 # test regex exprs
 query TT
@@ -38,20 +37,33 @@ logical_plan
 01)Filter: t.b IS NOT NULL
 02)--TableScan: t projection=[b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: b@0 IS NOT NULL
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: b@0 IS NOT NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 query TT
 explain select b from t where b !~ '.*'
 ----
 logical_plan
-01)Filter: t.b = Utf8("")
+01)Filter: t.b IS NULL AND Boolean(NULL)
 02)--TableScan: t projection=[b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: b@0 = 
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: b@0 IS NULL AND NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query TB
+WITH vals(id, col) AS (
+    VALUES
+      (1, 'foo'::text),
+      (2, ''::text),
+      (3, NULL::text)
+)
+SELECT col, col !~ '.*'
+FROM vals
+ORDER BY id
+----
+foo false
+(empty) false
+NULL NULL
 
 query T
 select b from t where b ~ '.*'
@@ -70,9 +82,8 @@ logical_plan
 01)Filter: t.a IS NOT NULL OR Boolean(NULL)
 02)--TableScan: t projection=[a, b]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192
-02)--FilterExec: a@0 IS NOT NULL OR NULL
-03)----DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: a@0 IS NOT NULL OR NULL
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 drop table t;
@@ -107,3 +118,31 @@ query B
 SELECT a / NULL::DECIMAL(4,3) > 1.2::decimal(2,1) FROM VALUES (1) AS t(a);
 ----
 NULL
+
+query TT
+explain SELECT CASE WHEN 1 > 0 THEN MAP {'x': 100} ELSE MAP {'y': 200} END AS a;
+----
+logical_plan
+01)Projection: Map([{"x":"100"}]) AS a
+02)--EmptyRelation: rows=1
+physical_plan
+01)ProjectionExec: expr=[[{x:100}] as a]
+02)--PlaceholderRowExec
+
+# Simplify expr = L1 AND expr != L2 to expr = L1 when L1 != L2
+query TT
+EXPLAIN SELECT
+    v = 1 AND v != 0 as opt1,
+    v = 2 AND v != 2 as noopt1,
+    v != 3 AND v = 4 as opt2,
+    v != 5 AND v = 5 as noopt2
+FROM (VALUES (0), (1), (2)) t(v)
+----
+logical_plan
+01)Projection: t.v = Int64(1) AS opt1, t.v = Int64(2) AND t.v != Int64(2) AS noopt1, t.v = Int64(4) AS opt2, t.v != Int64(5) AND t.v = Int64(5) AS noopt2
+02)--SubqueryAlias: t
+03)----Projection: column1 AS v
+04)------Values: (Int64(0)), (Int64(1)), (Int64(2))
+physical_plan
+01)ProjectionExec: expr=[column1@0 = 1 as opt1, column1@0 = 2 AND column1@0 != 2 as noopt1, column1@0 = 4 as opt2, column1@0 != 5 AND column1@0 = 5 as noopt2]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
diff --git a/datafusion/sqllogictest/test_files/simplify_predicates.slt b/datafusion/sqllogictest/test_files/simplify_predicates.slt
new file mode 100644
index 0000000000000..c2a21ea7103c3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/simplify_predicates.slt
@@ -0,0 +1,246 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test cases for predicate simplification feature
+# Basic redundant comparison simplification
+
+statement ok
+set datafusion.explain.logical_plan_only=true;
+
+statement ok
+CREATE TABLE test_data (
+    int_col INT,
+    float_col FLOAT,
+    str_col VARCHAR,
+    date_col DATE,
+    bool_col BOOLEAN
+);
+
+# x > 5 AND x > 6 should simplify to x > 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > 5 AND int_col > 6;
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(6)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x > 5 AND x >= 6 should simplify to x >= 6
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > 5 AND int_col >= 6;
+----
+logical_plan
+01)Filter: test_data.int_col >= Int32(6)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x < 10 AND x <= 8 should simplify to x <= 8
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col < 10 AND int_col <= 8;
+----
+logical_plan
+01)Filter: test_data.int_col <= Int32(8)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x > 5 AND x > 6 AND x > 7 should simplify to x > 7
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > 5 AND int_col > 6 AND int_col > 7;
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(7)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x > 5 AND y < 10 AND x > 6 AND y < 8 should simplify to x > 6 AND y < 8
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > 5 AND float_col < 10 AND int_col > 6 AND float_col < 8;
+----
+logical_plan
+01)Filter: test_data.float_col < Float32(8) AND test_data.int_col > Int32(6)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x = 7 AND x = 7 should simplify to x = 7
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col = 7 AND int_col = 7;
+----
+logical_plan
+01)Filter: test_data.int_col = Int32(7)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x = 7 AND x = 6 should simplify to false
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col = 7 AND int_col = 6;
+----
+logical_plan EmptyRelation: rows=0
+
+# TODO: x = 7 AND x < 2 should simplify to false
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col = 7 AND int_col < 2;
+----
+logical_plan
+01)Filter: test_data.int_col = Int32(7) AND test_data.int_col < Int32(2)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+
+# TODO: x = 7 AND x > 5 should simplify to x = 7
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col = 7 AND int_col > 5;
+----
+logical_plan
+01)Filter: test_data.int_col = Int32(7) AND test_data.int_col > Int32(5)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# str_col > 'apple' AND str_col > 'banana' should simplify to str_col > 'banana'
+query TT
+EXPLAIN SELECT * FROM test_data WHERE str_col > 'apple' AND str_col > 'banana';
+----
+logical_plan
+01)Filter: test_data.str_col > Utf8View("banana")
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# date_col > '2023-01-01' AND date_col > '2023-02-01' should simplify to date_col > '2023-02-01'
+query TT
+EXPLAIN SELECT * FROM test_data WHERE date_col > '2023-01-01' AND date_col > '2023-02-01';
+----
+logical_plan
+01)Filter: test_data.date_col > Date32("2023-02-01")
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+query TT
+EXPLAIN SELECT * FROM test_data WHERE bool_col = true AND bool_col = false;
+----
+logical_plan
+01)Filter: test_data.bool_col AND NOT test_data.bool_col
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+
+# This shouldn't be simplified since they're different relationships
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > float_col AND int_col > 5;
+----
+logical_plan
+01)Filter: CAST(test_data.int_col AS Float32) > test_data.float_col AND test_data.int_col > Int32(5)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# Should simplify the int_col predicates but preserve the others
+query TT
+EXPLAIN SELECT * FROM test_data
+WHERE int_col > 5
+  AND int_col > 10
+  AND str_col LIKE 'A%'
+  AND float_col BETWEEN 1 AND 100;
+----
+logical_plan
+01)Filter: test_data.str_col LIKE Utf8View("A%") AND test_data.float_col >= Float32(1) AND test_data.float_col <= Float32(100) AND test_data.int_col > Int32(10)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+statement ok
+CREATE TABLE test_data2 (
+    id INT,
+    value INT
+);
+
+query TT
+EXPLAIN SELECT t1.int_col, t2.value
+FROM test_data t1
+JOIN test_data2 t2 ON t1.int_col = t2.id
+WHERE t1.int_col > 5
+  AND t1.int_col > 10
+  AND t2.value < 100
+  AND t2.value < 50;
+----
+logical_plan
+01)Projection: t1.int_col, t2.value
+02)--Inner Join: t1.int_col = t2.id
+03)----SubqueryAlias: t1
+04)------Filter: test_data.int_col > Int32(10)
+05)--------TableScan: test_data projection=[int_col]
+06)----SubqueryAlias: t2
+07)------Filter: test_data2.value < Int32(50) AND test_data2.id > Int32(10)
+08)--------TableScan: test_data2 projection=[id, value]
+
+# Handling negated predicates
+# NOT (x < 10) AND NOT (x < 5) should simplify to NOT (x < 10)
+query TT
+EXPLAIN SELECT * FROM test_data WHERE NOT (int_col < 10) AND NOT (int_col < 5);
+----
+logical_plan
+01)Filter: test_data.int_col >= Int32(10)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x > 5 AND x < 10 should be preserved (can't be simplified)
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col > 5 AND int_col < 10;
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(5) AND test_data.int_col < Int32(10)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# 5 < x AND 3 < x should simplify to 5 < x
+query TT
+EXPLAIN SELECT * FROM test_data WHERE 5 < int_col AND 3 < int_col;
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(5)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# CAST(x AS FLOAT) > 5.0 AND CAST(x AS FLOAT) > 6.0 should simplify
+query TT
+EXPLAIN SELECT * FROM test_data WHERE CAST(int_col AS FLOAT) > 5.0 AND CAST(int_col AS FLOAT) > 6.0;
+----
+logical_plan
+01)Filter: CAST(CAST(test_data.int_col AS Float32) AS Float64) > Float64(6)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# x = 5 AND x = 6 (logically impossible)
+query TT
+EXPLAIN SELECT * FROM test_data WHERE int_col = 5 AND int_col = 6;
+----
+logical_plan EmptyRelation: rows=0
+
+# (x > 5 OR y < 10) AND (x > 6 OR y < 8)
+# This is more complex but could still benefit from some simplification
+query TT
+EXPLAIN SELECT * FROM test_data
+WHERE (int_col > 5 OR float_col < 10)
+  AND (int_col > 6 OR float_col < 8);
+----
+logical_plan
+01)Filter: (test_data.int_col > Int32(5) OR test_data.float_col < Float32(10)) AND (test_data.int_col > Int32(6) OR test_data.float_col < Float32(8))
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+# Combination of AND and OR with simplifiable predicates
+query TT
+EXPLAIN SELECT * FROM test_data
+WHERE (int_col > 5 AND int_col > 6)
+   OR (float_col < 10 AND float_col < 8);
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(5) AND test_data.int_col > Int32(6) OR test_data.float_col < Float32(10) AND test_data.float_col < Float32(8)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+
+query TT
+EXPLAIN SELECT * FROM (
+  SELECT * FROM test_data 
+  WHERE int_col > 1 AND int_col < 10
+) WHERE int_col >= 1 AND int_col <= 10;
+----
+logical_plan
+01)Filter: test_data.int_col > Int32(1) AND test_data.int_col < Int32(10)
+02)--TableScan: test_data projection=[int_col, float_col, str_col, date_col, bool_col]
+
+
+statement ok
+set datafusion.explain.logical_plan_only=false;
diff --git a/datafusion/sqllogictest/test_files/slt_features.slt b/datafusion/sqllogictest/test_files/slt_features.slt
new file mode 100644
index 0000000000000..5075ed10eae9a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/slt_features.slt
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# =================================
+# Test sqllogictest runner features
+# =================================
+
+# --------------------------
+# Test `<slt:ignore>` marker
+# --------------------------
+query T
+select 'DataFusion'
+----
+<slt:ignore>
+
+query T
+select 'DataFusion'
+----
+Data<slt:ignore>
+
+query T
+select 'DataFusion'
+----
+<slt:ignore>Fusion
+
+query T
+select 'Apache DataFusion';
+----
+<slt:ignore>Data<slt:ignore>
+
+query T
+select 'DataFusion'
+----
+DataFusion<slt:ignore>
+
+query T
+select 'DataFusion'
+----
+<slt:ignore>DataFusion
+
+query T
+select 'DataFusion'
+----
+<slt:ignore>DataFusion<slt:ignore>
+
+query I
+select * from generate_series(3);
+----
+0
+1
+<slt:ignore>
+3
+
+query I
+select * from generate_series(3);
+----
+<slt:ignore>
+1
+<slt:ignore>
+<slt:ignore>
diff --git a/datafusion/sqllogictest/test_files/sort_merge_join.slt b/datafusion/sqllogictest/test_files/sort_merge_join.slt
index c17fe8dfc7e6f..d2fa37ef76da8 100644
--- a/datafusion/sqllogictest/test_files/sort_merge_join.slt
+++ b/datafusion/sqllogictest/test_files/sort_merge_join.slt
@@ -37,7 +37,7 @@ logical_plan
 02)--TableScan: t1 projection=[a, b]
 03)--TableScan: t2 projection=[a, b]
 physical_plan
-01)SortMergeJoin: join_type=Inner, on=[(a@0, a@0)], filter=CAST(b@1 AS Int64) * 50 <= CAST(b@0 AS Int64)
+01)SortMergeJoinExec: join_type=Inner, on=[(a@0, a@0)], filter=CAST(b@1 AS Int64) * 50 <= CAST(b@0 AS Int64)
 02)--SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
 03)----DataSourceExec: partitions=1, partition_sizes=[1]
 04)--SortExec: expr=[a@0 ASC], preserve_partitioning=[false]
@@ -833,9 +833,110 @@ t2 as (
 11 14
 12 15
 
+statement ok
+set datafusion.execution.batch_size = 8192;
+
+
+######
+## Tests for Binary, LargeBinary, BinaryView, FixedSizeBinary join keys
+######
+statement ok
+create table t1(x varchar, id1 int) as values ('aa', 1), ('bb', 2), ('aa', 3), (null, 4), ('ee', 5);
+
+statement ok
+create table t2(y varchar, id2 int) as values ('ee', 10), ('bb', 20), ('cc', 30), ('cc', 40), (null, 50);
+
+# Binary join keys
+query ?I?I
+with t1 as (select arrow_cast(x, 'Binary') as x, id1 from t1),
+     t2 as (select arrow_cast(y, 'Binary') as y, id2 from t2)
+select * from t1 join t2 on t1.x = t2.y order by id1, id2
+----
+6262 2 6262 20
+6565 5 6565 10
+
+# LargeBinary join keys
+query ?I?I
+with t1 as (select arrow_cast(x, 'LargeBinary') as x, id1 from t1),
+     t2 as (select arrow_cast(y, 'LargeBinary') as y, id2 from t2)
+select * from t1 join t2 on t1.x = t2.y order by id1, id2
+----
+6262 2 6262 20
+6565 5 6565 10
+
+# BinaryView join keys
+query ?I?I
+with t1 as (select arrow_cast(x, 'BinaryView') as x, id1 from t1),
+     t2 as (select arrow_cast(y, 'BinaryView') as y, id2 from t2)
+select * from t1 join t2 on t1.x = t2.y order by id1, id2
+----
+6262 2 6262 20
+6565 5 6565 10
+
+# FixedSizeBinary join keys
+query ?I?I
+with t1 as (select arrow_cast(arrow_cast(x, 'Binary'), 'FixedSizeBinary(2)') as x, id1 from t1),
+     t2 as (select arrow_cast(arrow_cast(y, 'Binary'), 'FixedSizeBinary(2)') as y, id2 from t2)
+select * from t1 join t2 on t1.x = t2.y order by id1, id2
+----
+6262 2 6262 20
+6565 5 6565 10
+
+statement ok
+drop table t1;
+
+statement ok
+drop table t2;
+
 # return sql params back to default values
 statement ok
 set datafusion.optimizer.prefer_hash_join = true;
 
+##########
+## Tests for equijoins with different column counts
+##########
+
 statement ok
-set datafusion.execution.batch_size = 8192;
+set datafusion.optimizer.prefer_hash_join = false;
+
+statement ok
+DROP TABLE IF EXISTS t1;
+
+statement ok
+CREATE TABLE t1(a int, b int) AS VALUES (1, 100), (2, 200), (3, 300);
+
+statement ok
+DROP TABLE IF EXISTS t2;
+
+statement ok
+CREATE TABLE t2(a int, b int, c int) AS VALUES (4, 101, 1001), (3, 201, 2001), (2, 250, 3001);
+
+statement ok
+DROP TABLE IF EXISTS t3;
+
+statement ok
+CREATE TABLE t3(x int) AS VALUES (1);
+
+query IIIII
+SELECT * FROM t2 RIGHT JOIN t1 on t1.a = t2.a AND t1.b < t2.b
+----
+NULL NULL NULL 1 100
+2 250 3001 2 200
+NULL NULL NULL 3 300
+
+query IIIII
+SELECT * FROM t1 LEFT JOIN t2 on t1.a = t2.a AND t1.b < t2.b
+----
+1 100 NULL NULL NULL
+2 200 2 250 3001
+3 300 NULL NULL NULL
+
+# Small table for LeftMark
+
+# LeftMark equijoin with different columns count
+query III rowsort
+SELECT t2.a, t2.b, t2.c
+FROM t2
+WHERE t2.a > 3 OR t2.a IN (SELECT t3.x FROM t3 WHERE t2.b < 150)
+----
+4 101 1001
diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt
new file mode 100644
index 0000000000000..99f26b66d458b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt
@@ -0,0 +1,1634 @@
+#Sort Pushdown for ordered Parquet files
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+# Test 1: Sort Pushdown for ordered Parquet files
+# Create a sorted dataset
+statement ok
+CREATE TABLE sorted_data(id INT, value INT, name VARCHAR) AS VALUES
+(1, 100, 'a'),
+(2, 200, 'b'),
+(3, 300, 'c'),
+(4, 400, 'd'),
+(5, 500, 'e'),
+(6, 600, 'f'),
+(7, 700, 'g'),
+(8, 800, 'h'),
+(9, 900, 'i'),
+(10, 1000, 'j');
+
+# Copy to parquet with sorting
+query I
+COPY (SELECT * FROM sorted_data ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/sorted_data.parquet';
+----
+10
+
+statement ok
+CREATE EXTERNAL TABLE sorted_parquet(id INT, value INT, name VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/sorted_data.parquet'
+WITH ORDER (id ASC);
+
+# Test 1.1: Sort pushdown with DESC (opposite of ASC)
+# Should show reverse_row_groups=true
+query TT
+EXPLAIN SELECT * FROM sorted_parquet ORDER BY id DESC LIMIT 3;
+----
+logical_plan
+01)Sort: sorted_parquet.id DESC NULLS FIRST, fetch=3
+02)--TableScan: sorted_parquet projection=[id, value, name]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+# Test 1.2: Verify results are correct
+query IIT
+SELECT * FROM sorted_parquet ORDER BY id DESC LIMIT 3;
+----
+10 1000 j
+9 900 i
+8 800 h
+
+# Test 1.3: Should NOT apply for ASC (same direction)
+query TT
+EXPLAIN SELECT * FROM sorted_parquet ORDER BY id ASC LIMIT 3;
+----
+logical_plan
+01)Sort: sorted_parquet.id ASC NULLS LAST, fetch=3
+02)--TableScan: sorted_parquet projection=[id, value, name]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_data.parquet]]}, projection=[id, value, name], limit=3, output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 1.4: Disable sort pushdown
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = false;
+
+query TT
+EXPLAIN SELECT * FROM sorted_parquet ORDER BY id DESC LIMIT 3;
+----
+logical_plan
+01)Sort: sorted_parquet.id DESC NULLS FIRST, fetch=3
+02)--TableScan: sorted_parquet projection=[id, value, name]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_data.parquet]]}, projection=[id, value, name], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Re-enable
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+# Test 1.5: With OFFSET
+query TT
+EXPLAIN SELECT * FROM sorted_parquet ORDER BY id DESC LIMIT 3 OFFSET 2;
+----
+logical_plan
+01)Limit: skip=2, fetch=3
+02)--Sort: sorted_parquet.id DESC NULLS FIRST, fetch=5
+03)----TableScan: sorted_parquet projection=[id, value, name]
+physical_plan
+01)GlobalLimitExec: skip=2, fetch=3
+02)--SortExec: TopK(fetch=5), expr=[id@0 DESC], preserve_partitioning=[false]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_data.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+query IIT
+SELECT * FROM sorted_parquet ORDER BY id DESC LIMIT 3 OFFSET 2;
+----
+8 800 h
+7 700 g
+6 600 f
+
+# Test 1.6: Reverse scan with row selection (page index pruning)
+# This tests that when reverse_row_groups=true, the RowSelection is also properly reversed
+
+# Create a dataset with multiple row groups and enable page index
+statement ok
+CREATE TABLE multi_rg_data(id INT, category VARCHAR, value INT) AS VALUES
+(1, 'alpha', 10),
+(2, 'alpha', 20),
+(3, 'beta', 30),
+(4, 'beta', 40),
+(5, 'gamma', 50),
+(6, 'gamma', 60),
+(7, 'delta', 70),
+(8, 'delta', 80);
+
+# Write with small row groups (2 rows each = 4 row groups)
+statement ok
+SET datafusion.execution.parquet.max_row_group_size = 2;
+
+query I
+COPY (SELECT * FROM multi_rg_data ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_rg_sorted.parquet';
+----
+8
+
+# Reset row group size
+statement ok
+SET datafusion.execution.parquet.max_row_group_size = 1048576;
+
+statement ok
+CREATE EXTERNAL TABLE multi_rg_sorted(id INT, category VARCHAR, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_rg_sorted.parquet'
+WITH ORDER (id ASC);
+
+# Enable page index for better pruning
+statement ok
+SET datafusion.execution.parquet.enable_page_index = true;
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+# Test with reverse scan and filter that prunes some row groups
+# This will create a RowSelection with partial row group scans
+query TT
+EXPLAIN SELECT * FROM multi_rg_sorted
+WHERE category IN ('alpha', 'gamma')
+ORDER BY id DESC LIMIT 5;
+----
+logical_plan
+01)Sort: multi_rg_sorted.id DESC NULLS FIRST, fetch=5
+02)--Filter: multi_rg_sorted.category = Utf8View("alpha") OR multi_rg_sorted.category = Utf8View("gamma")
+03)----TableScan: multi_rg_sorted projection=[id, category, value], partial_filters=[multi_rg_sorted.category = Utf8View("alpha") OR multi_rg_sorted.category = Utf8View("gamma")]
+physical_plan
+01)SortExec: TopK(fetch=5), expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_rg_sorted.parquet]]}, projection=[id, category, value], file_type=parquet, predicate=(category@1 = alpha OR category@1 = gamma) AND DynamicFilter [ empty ], reverse_row_groups=true, pruning_predicate=category_null_count@2 != row_count@3 AND category_min@0 <= alpha AND alpha <= category_max@1 OR category_null_count@2 != row_count@3 AND category_min@0 <= gamma AND gamma <= category_max@1, required_guarantees=[category in (alpha, gamma)]
+
+# Verify the results are correct despite reverse scanning with row selection
+# Expected: gamma values (6, 5) then alpha values (2, 1), in DESC order by id
+query ITI
+SELECT * FROM multi_rg_sorted
+WHERE category IN ('alpha', 'gamma')
+ORDER BY id DESC LIMIT 5;
+----
+6 gamma 60
+5 gamma 50
+2 alpha 20
+1 alpha 10
+
+# Test with more complex selection pattern
+query ITI
+SELECT * FROM multi_rg_sorted
+WHERE category IN ('beta', 'delta')
+ORDER BY id DESC;
+----
+8 delta 80
+7 delta 70
+4 beta 40
+3 beta 30
+
+# Test forward scan for comparison (should give same logical results in ASC order)
+query ITI
+SELECT * FROM multi_rg_sorted
+WHERE category IN ('alpha', 'gamma')
+ORDER BY id ASC;
+----
+1 alpha 10
+2 alpha 20
+5 gamma 50
+6 gamma 60
+
+# Disable reverse scan and verify it still works
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = false;
+
+query ITI
+SELECT * FROM multi_rg_sorted
+WHERE category IN ('alpha', 'gamma')
+ORDER BY id DESC LIMIT 5;
+----
+6 gamma 60
+5 gamma 50
+2 alpha 20
+1 alpha 10
+
+# Re-enable
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+# Test 1.7: Sort pushdown with more than one partition
+# Create multiple parquet files to trigger it
+
+# Split data into multiple files
+statement ok
+CREATE TABLE sorted_data_part1(id INT, value INT, name VARCHAR) AS VALUES
+(1, 100, 'a'),
+(2, 200, 'b'),
+(3, 300, 'c');
+
+statement ok
+CREATE TABLE sorted_data_part2(id INT, value INT, name VARCHAR) AS VALUES
+(4, 400, 'd'),
+(5, 500, 'e'),
+(6, 600, 'f');
+
+statement ok
+CREATE TABLE sorted_data_part3(id INT, value INT, name VARCHAR) AS VALUES
+(7, 700, 'g'),
+(8, 800, 'h'),
+(9, 900, 'i'),
+(10, 1000, 'j');
+
+# Create directory for multi-file parquet
+query I
+COPY (SELECT * FROM sorted_data_part1 ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/sorted_multi/part1.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM sorted_data_part2 ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/sorted_multi/part2.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM sorted_data_part3 ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/sorted_multi/part3.parquet';
+----
+4
+
+# Create external table pointing to directory with multiple files
+statement ok
+CREATE EXTERNAL TABLE sorted_parquet_multi(id INT, value INT, name VARCHAR)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/sorted_multi/'
+WITH ORDER (id ASC);
+
+# Enable multiple partitions
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+# Now we should see RepartitionExec because we have 3 input partitions (3 files)
+query TT
+EXPLAIN SELECT * FROM sorted_parquet_multi ORDER BY id DESC LIMIT 3;
+----
+logical_plan
+01)Sort: sorted_parquet_multi.id DESC NULLS FIRST, fetch=3
+02)--TableScan: sorted_parquet_multi projection=[id, value, name]
+physical_plan
+01)SortPreservingMergeExec: [id@0 DESC], fetch=3
+02)--SortExec: TopK(fetch=3), expr=[id@0 DESC], preserve_partitioning=[true]
+03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_multi/part1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/sorted_multi/part3.parquet]]}, projection=[id, value, name], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+# Verify correctness with repartitioning and multiple files
+query IIT
+SELECT * FROM sorted_parquet_multi ORDER BY id DESC LIMIT 3;
+----
+10 1000 j
+9 900 i
+8 800 h
+
+# Test ASC order (should not trigger reverse scan)
+query IIT
+SELECT * FROM sorted_parquet_multi ORDER BY id ASC LIMIT 3;
+----
+1 100 a
+2 200 b
+3 300 c
+
+# Cleanup
+statement ok
+DROP TABLE sorted_data_part1;
+
+statement ok
+DROP TABLE sorted_data_part2;
+
+statement ok
+DROP TABLE sorted_data_part3;
+
+statement ok
+DROP TABLE sorted_parquet_multi;
+
+# Reset to default
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+# Cleanup
+statement ok
+DROP TABLE multi_rg_data;
+
+statement ok
+DROP TABLE multi_rg_sorted;
+
+statement ok
+SET datafusion.execution.parquet.enable_page_index = false;
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+# Cleanup
+statement ok
+DROP TABLE sorted_data;
+
+statement ok
+DROP TABLE sorted_parquet;
+
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+
+# Test 2: Sort pushdown with constant column filtering
+# This tests the case where a leading sort column becomes constant through WHERE filtering
+
+# Create a multi-column sorted dataset (like time-series data)
+statement ok
+CREATE TABLE timeseries_data(timeframe VARCHAR, period_end INT, value DOUBLE) AS VALUES
+('daily', 1, 100.0),
+('daily', 2, 150.0),
+('daily', 3, 200.0),
+('weekly', 1, 500.0),
+('weekly', 2, 600.0),
+('weekly', 3, 700.0),
+('monthly', 1, 1000.0),
+('monthly', 2, 1100.0),
+('monthly', 3, 1200.0),
+('quarterly', 1, 5000.0),
+('quarterly', 2, 5500.0),
+('quarterly', 3, 6000.0);
+
+# Copy to parquet with multi-column sorting (timeframe ASC, period_end ASC)
+query I
+COPY (SELECT * FROM timeseries_data ORDER BY timeframe ASC, period_end ASC)
+TO 'test_files/scratch/sort_pushdown/timeseries_sorted.parquet';
+----
+12
+
+statement ok
+CREATE EXTERNAL TABLE timeseries_parquet(timeframe VARCHAR, period_end INT, value DOUBLE)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/timeseries_sorted.parquet'
+WITH ORDER (timeframe ASC, period_end ASC);
+
+# Test 2.1: Query with constant prefix filter and DESC on remaining column
+# WHERE timeframe='quarterly' makes the first sort column constant
+# ORDER BY period_end DESC should trigger reverse scan because:
+# File ordering: [timeframe ASC, period_end ASC]
+# After filtering timeframe='quarterly': effectively [period_end ASC]
+# Request: [period_end DESC] -> exact reverse!
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: timeseries_parquet.period_end DESC NULLS FIRST, fetch=2
+02)--Filter: timeseries_parquet.timeframe = Utf8View("quarterly")
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.timeframe = Utf8View("quarterly")]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[period_end@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], file_type=parquet, predicate=timeframe@0 = quarterly AND DynamicFilter [ empty ], reverse_row_groups=true, pruning_predicate=timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= quarterly AND quarterly <= timeframe_max@1, required_guarantees=[timeframe in (quarterly)]
+
+# Test 2.2: Verify the results are correct
+query TIR
+SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end DESC
+LIMIT 2;
+----
+quarterly 3 6000
+quarterly 2 5500
+
+# Test 2.3: Same filter but ASC order (should not trigger reverse scan, ordering already satisfied)
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end ASC
+LIMIT 2;
+----
+logical_plan
+01)Sort: timeseries_parquet.period_end ASC NULLS LAST, fetch=2
+02)--Filter: timeseries_parquet.timeframe = Utf8View("quarterly")
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.timeframe = Utf8View("quarterly")]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], limit=2, output_ordering=[timeframe@0 ASC NULLS LAST, period_end@1 ASC NULLS LAST], file_type=parquet, predicate=timeframe@0 = quarterly, pruning_predicate=timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= quarterly AND quarterly <= timeframe_max@1, required_guarantees=[timeframe in (quarterly)]
+
+# Test 2.4: Verify ASC results
+query TIR
+SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end ASC
+LIMIT 2;
+----
+quarterly 1 5000
+quarterly 2 5500
+
+# Test 2.5: Test with different constant value
+query TIR
+SELECT * FROM timeseries_parquet
+WHERE timeframe = 'weekly'
+ORDER BY period_end DESC;
+----
+weekly 3 700
+weekly 2 600
+weekly 1 500
+
+# Test 2.6: Test without constant filter (no reverse scan because need both columns)
+# Request: [timeframe ASC, period_end DESC]
+# File has: [timeframe ASC, period_end ASC]
+# These are NOT reverse of each other - only second column is reversed
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+ORDER BY timeframe ASC, period_end DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: timeseries_parquet.timeframe ASC NULLS LAST, timeseries_parquet.period_end DESC NULLS FIRST, fetch=3
+02)--TableScan: timeseries_parquet projection=[timeframe, period_end, value]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[timeframe@0 ASC NULLS LAST, period_end@1 DESC], preserve_partitioning=[false], sort_prefix=[timeframe@0 ASC NULLS LAST]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], output_ordering=[timeframe@0 ASC NULLS LAST, period_end@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Test 2.7: Disable sort pushdown and verify filter still works
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = false;
+
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: timeseries_parquet.period_end DESC NULLS FIRST, fetch=2
+02)--Filter: timeseries_parquet.timeframe = Utf8View("quarterly")
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.timeframe = Utf8View("quarterly")]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[period_end@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], output_ordering=[timeframe@0 ASC NULLS LAST, period_end@1 ASC NULLS LAST], file_type=parquet, predicate=timeframe@0 = quarterly AND DynamicFilter [ empty ], pruning_predicate=timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= quarterly AND quarterly <= timeframe_max@1, required_guarantees=[timeframe in (quarterly)]
+
+# Results should still be correct
+query TIR
+SELECT * FROM timeseries_parquet
+WHERE timeframe = 'quarterly'
+ORDER BY period_end DESC
+LIMIT 2;
+----
+quarterly 3 6000
+quarterly 2 5500
+
+# Re-enable
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+# Test 2.8: Test with IN clause (multiple constant values)
+# Note: IN clause with multiple values means timeframe is NOT constant
+# (could be 'daily' or 'weekly'), so the first sort column cannot be eliminated.
+# Without a constant first column, we cannot reverse scan based on just period_end DESC.
+# The physical plan should NOT show reverse_row_groups=true
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE timeframe IN ('daily', 'weekly')
+ORDER BY period_end DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: timeseries_parquet.period_end DESC NULLS FIRST, fetch=3
+02)--Filter: timeseries_parquet.timeframe = Utf8View("daily") OR timeseries_parquet.timeframe = Utf8View("weekly")
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.timeframe = Utf8View("daily") OR timeseries_parquet.timeframe = Utf8View("weekly")]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[period_end@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], output_ordering=[timeframe@0 ASC NULLS LAST, period_end@1 ASC NULLS LAST], file_type=parquet, predicate=(timeframe@0 = daily OR timeframe@0 = weekly) AND DynamicFilter [ empty ], pruning_predicate=timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= daily AND daily <= timeframe_max@1 OR timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= weekly AND weekly <= timeframe_max@1, required_guarantees=[timeframe in (daily, weekly)]
+
+# Test 2.9: Complex case - literal constant in sort expression itself
+# The literal 'constant' is ignored in sort analysis
+# After stripping: ORDER BY period_end DESC
+# With WHERE timeframe='monthly' making first column constant
+# File: [period_end ASC] (after constant column removal)
+# Request: [period_end DESC] -> exact reverse, triggers reverse scan
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE timeframe = 'monthly'
+ORDER BY 'constant', period_end DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: Utf8("constant") ASC NULLS LAST, timeseries_parquet.period_end DESC NULLS FIRST, fetch=2
+02)--Filter: timeseries_parquet.timeframe = Utf8View("monthly")
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.timeframe = Utf8View("monthly")]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[period_end@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], file_type=parquet, predicate=timeframe@0 = monthly AND DynamicFilter [ empty ], reverse_row_groups=true, pruning_predicate=timeframe_null_count@2 != row_count@3 AND timeframe_min@0 <= monthly AND monthly <= timeframe_max@1, required_guarantees=[timeframe in (monthly)]
+
+# Verify results
+query TIR
+SELECT * FROM timeseries_parquet
+WHERE timeframe = 'monthly'
+ORDER BY period_end DESC
+LIMIT 2;
+----
+monthly 3 1200
+monthly 2 1100
+
+# Test 2.10: Filter on non-leading sort column
+# File order: [timeframe ASC, period_end ASC]
+# Filter: period_end = 2 (makes second column constant)
+# Request: [timeframe DESC]
+# After constant column removal: File has [timeframe ASC], Request wants [timeframe DESC]
+# This is exact reverse -> triggers reverse scan
+query TT
+EXPLAIN SELECT * FROM timeseries_parquet
+WHERE period_end = 2
+ORDER BY timeframe DESC;
+----
+logical_plan
+01)Sort: timeseries_parquet.timeframe DESC NULLS FIRST
+02)--Filter: timeseries_parquet.period_end = Int32(2)
+03)----TableScan: timeseries_parquet projection=[timeframe, period_end, value], partial_filters=[timeseries_parquet.period_end = Int32(2)]
+physical_plan
+01)SortExec: expr=[timeframe@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timeseries_sorted.parquet]]}, projection=[timeframe, period_end, value], file_type=parquet, predicate=period_end@1 = 2, reverse_row_groups=true, pruning_predicate=period_end_null_count@2 != row_count@3 AND period_end_min@0 <= 2 AND 2 <= period_end_max@1, required_guarantees=[period_end in (2)]
+
+# Cleanup
+statement ok
+DROP TABLE timeseries_data;
+
+statement ok
+DROP TABLE timeseries_parquet;
+
+# Reset to default
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
+
+
+# Test 3: Sort pushdown with monotonic functions
+# This tests that reverse scan works when sort expressions involve monotonic functions
+
+# Create test data with timestamp column
+statement ok
+CREATE TABLE timestamp_data(id INT, ts TIMESTAMP, volume BIGINT, price DOUBLE) AS VALUES
+(1, TIMESTAMP '2024-01-15 10:00:00', 1000, 100.0),
+(2, TIMESTAMP '2024-01-20 11:00:00', 1500, 105.0),
+(3, TIMESTAMP '2024-01-25 12:00:00', 2000, 110.0),
+(4, TIMESTAMP '2024-02-05 09:00:00', 1200, 108.0),
+(5, TIMESTAMP '2024-02-15 14:00:00', 1800, 112.0),
+(6, TIMESTAMP '2024-02-25 15:00:00', 2200, 115.0),
+(7, TIMESTAMP '2024-03-10 09:00:00', 1300, 113.0),
+(8, TIMESTAMP '2024-03-18 14:00:00', 1900, 118.0),
+(9, TIMESTAMP '2024-03-28 15:00:00', 2300, 120.0);
+
+# Copy to parquet with sorting by timestamp ASC
+query I
+COPY (SELECT * FROM timestamp_data ORDER BY ts ASC)
+TO 'test_files/scratch/sort_pushdown/timestamp_sorted.parquet';
+----
+9
+
+# Test 3.1: Simple monotonic function - date_trunc
+# Create external table with file ordering that conceptually includes date_trunc
+# File is actually sorted by [ts ASC], but conceptually [date_trunc('month', ts) ASC, ts ASC]
+statement ok
+CREATE EXTERNAL TABLE timestamp_parquet(id INT, ts TIMESTAMP, volume BIGINT, price DOUBLE)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/timestamp_sorted.parquet'
+WITH ORDER (ts ASC);
+
+# Query with ORDER BY ts DESC
+# File ordering: [ts ASC]
+# Request: [ts DESC]
+# This should trigger reverse_row_groups=true
+query TT
+EXPLAIN SELECT * FROM timestamp_parquet
+ORDER BY ts DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: timestamp_parquet.ts DESC NULLS FIRST, fetch=3
+02)--TableScan: timestamp_parquet projection=[id, ts, volume, price]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[ts@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timestamp_sorted.parquet]]}, projection=[id, ts, volume, price], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+# Verify results
+query IPIR
+SELECT * FROM timestamp_parquet
+ORDER BY ts DESC
+LIMIT 3;
+----
+9 2024-03-28T15:00:00 2300 120
+8 2024-03-18T14:00:00 1900 118
+7 2024-03-10T09:00:00 1300 113
+
+# Test 3.2: Monotonic function in ORDER BY - date_trunc DESC
+# File ordering: [ts ASC]
+# Request: [date_trunc('day', ts) DESC]
+# Since date_trunc is monotonic with ts, reversed file ordering [ts DESC] satisfies [date_trunc DESC]
+query TT
+EXPLAIN SELECT * FROM timestamp_parquet
+ORDER BY date_trunc('day', ts) DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: date_trunc(Utf8("day"), timestamp_parquet.ts) DESC NULLS FIRST, fetch=3
+02)--TableScan: timestamp_parquet projection=[id, ts, volume, price]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[date_trunc(day, ts@1) DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/timestamp_sorted.parquet]]}, projection=[id, ts, volume, price], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+# Verify results (descending day)
+query IPIR
+SELECT * FROM timestamp_parquet
+ORDER BY date_trunc('day', ts) DESC
+LIMIT 3;
+----
+9 2024-03-28T15:00:00 2300 120
+8 2024-03-18T14:00:00 1900 118
+7 2024-03-10T09:00:00 1300 113
+
+# Test 3.3: Multi-column scenario with explicit monotonic function in file ordering
+# Create a table where we explicitly declare the ordering includes the monotonic function
+# This simulates files that are partitioned/sorted by [date_trunc('month', ts) ASC, ts ASC]
+
+# Create a new parquet file sorted by [ts ASC] (which implies date_trunc ordering)
+statement ok
+CREATE TABLE multi_month_data(id INT, ts TIMESTAMP, volume BIGINT, price DOUBLE) AS VALUES
+-- January 2024
+(1, TIMESTAMP '2024-01-05 09:30:00', 1000, 100.0),
+(2, TIMESTAMP '2024-01-15 14:30:00', 1500, 105.0),
+(3, TIMESTAMP '2024-01-25 15:59:00', 2000, 110.0),
+-- February 2024
+(4, TIMESTAMP '2024-02-03 09:30:00', 1200, 108.0),
+(5, TIMESTAMP '2024-02-14 12:00:00', 1800, 112.0),
+(6, TIMESTAMP '2024-02-28 15:59:00', 2200, 115.0),
+-- March 2024
+(7, TIMESTAMP '2024-03-01 09:30:00', 1300, 113.0),
+(8, TIMESTAMP '2024-03-15 14:00:00', 1900, 118.0),
+(9, TIMESTAMP '2024-03-29 15:59:00', 2300, 120.0);
+
+query I
+COPY (SELECT * FROM multi_month_data ORDER BY ts ASC)
+TO 'test_files/scratch/sort_pushdown/multi_month_sorted.parquet';
+----
+9
+
+# Declare the file has ordering [ts ASC]
+# Conceptually this means [date_trunc('month', ts) ASC, ts ASC] due to monotonicity
+statement ok
+CREATE EXTERNAL TABLE multi_month_parquet(id INT, ts TIMESTAMP, volume BIGINT, price DOUBLE)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_month_sorted.parquet'
+WITH ORDER (ts ASC);
+
+# Test 3.3a: Request ORDER BY ts DESC (opposite direction)
+# File: [ts ASC]
+# Request: [ts DESC]
+# Should trigger reverse_row_groups=true
+query TT
+EXPLAIN SELECT * FROM multi_month_parquet
+ORDER BY ts DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: multi_month_parquet.ts DESC NULLS FIRST, fetch=2
+02)--TableScan: multi_month_parquet projection=[id, ts, volume, price]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_month_sorted.parquet]]}, projection=[id, ts, volume, price], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+query IPIR
+SELECT * FROM multi_month_parquet
+ORDER BY ts DESC
+LIMIT 2;
+----
+9 2024-03-29T15:59:00 2300 120
+8 2024-03-15T14:00:00 1900 118
+
+# Test 3.3b: Request ORDER BY date_trunc('month', ts) DESC, ts DESC
+# File: [ts ASC] (which implies [date_trunc('month', ts) ASC, ts ASC])
+# Request: [date_trunc('month', ts) DESC, ts DESC]
+# The reversed file ordering [ts DESC] satisfies this because:
+# - date_trunc is monotonic with ts
+# - So [ts DESC] implies [date_trunc('month', ts) DESC, ts DESC]
+query TT
+EXPLAIN SELECT * FROM multi_month_parquet
+ORDER BY date_trunc('month', ts) DESC, ts DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: date_trunc(Utf8("month"), multi_month_parquet.ts) DESC NULLS FIRST, multi_month_parquet.ts DESC NULLS FIRST, fetch=2
+02)--TableScan: multi_month_parquet projection=[id, ts, volume, price]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[date_trunc(month, ts@1) DESC, ts@1 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_month_sorted.parquet]]}, projection=[id, ts, volume, price], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+query IPIR
+SELECT * FROM multi_month_parquet
+ORDER BY date_trunc('month', ts) DESC, ts DESC
+LIMIT 2;
+----
+9 2024-03-29T15:59:00 2300 120
+8 2024-03-15T14:00:00 1900 118
+
+# Test 3.4: CAST as a monotonic function
+statement ok
+CREATE TABLE int_data(id INT, small_val SMALLINT, big_val BIGINT) AS VALUES
+(1, 10, 100),
+(2, 20, 200),
+(3, 30, 300),
+(4, 40, 400),
+(5, 50, 500);
+
+query I
+COPY (SELECT * FROM int_data ORDER BY small_val ASC)
+TO 'test_files/scratch/sort_pushdown/int_sorted.parquet';
+----
+5
+
+statement ok
+CREATE EXTERNAL TABLE int_parquet(id INT, small_val SMALLINT, big_val BIGINT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/int_sorted.parquet'
+WITH ORDER (small_val ASC);
+
+# CAST preserves ordering: CAST(small_val AS BIGINT) is monotonic with small_val
+query TT
+EXPLAIN SELECT * FROM int_parquet
+ORDER BY CAST(small_val AS BIGINT) DESC
+LIMIT 2;
+----
+logical_plan
+01)Sort: CAST(int_parquet.small_val AS Int64) DESC NULLS FIRST, fetch=2
+02)--TableScan: int_parquet projection=[id, small_val, big_val]
+physical_plan
+01)SortExec: TopK(fetch=2), expr=[CAST(small_val@1 AS Int64) DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/int_sorted.parquet]]}, projection=[id, small_val, big_val], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+query III
+SELECT * FROM int_parquet
+ORDER BY CAST(small_val AS BIGINT) DESC
+LIMIT 2;
+----
+5 50 500
+4 40 400
+
+# Test 3.5: CEIL as a monotonic function
+statement ok
+CREATE TABLE float_data(id INT, value DOUBLE) AS VALUES
+(1, 1.1),
+(2, 2.3),
+(3, 3.5),
+(4, 4.7),
+(5, 5.9);
+
+query I
+COPY (SELECT * FROM float_data ORDER BY value ASC)
+TO 'test_files/scratch/sort_pushdown/float_sorted.parquet';
+----
+5
+
+statement ok
+CREATE EXTERNAL TABLE float_parquet(id INT, value DOUBLE)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/float_sorted.parquet'
+WITH ORDER (value ASC);
+
+# CEIL is monotonic increasing
+query TT
+EXPLAIN SELECT * FROM float_parquet
+ORDER BY CEIL(value) DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: ceil(float_parquet.value) DESC NULLS FIRST, fetch=3
+02)--TableScan: float_parquet projection=[id, value]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[ceil(value@1) DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/float_sorted.parquet]]}, projection=[id, value], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
+
+query IR
+SELECT * FROM float_parquet
+ORDER BY CEIL(value) DESC
+LIMIT 3;
+----
+5 5.9
+4 4.7
+3 3.5
+
+# Test 3.6: Negative case - ABS is NOT monotonic over mixed positive/negative range
+statement ok
+CREATE TABLE signed_data(id INT, value DOUBLE) AS VALUES
+(1, -5.0),
+(2, -3.0),
+(3, -1.0),
+(4, 2.0),
+(5, 4.0);
+
+query I
+COPY (SELECT * FROM signed_data ORDER BY value ASC)
+TO 'test_files/scratch/sort_pushdown/signed_sorted.parquet';
+----
+5
+
+statement ok
+CREATE EXTERNAL TABLE signed_parquet(id INT, value DOUBLE)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/signed_sorted.parquet'
+WITH ORDER (value ASC);
+
+# ABS is NOT monotonic over the full range [-5, 4], so should NOT trigger reverse scan
+query TT
+EXPLAIN SELECT * FROM signed_parquet
+ORDER BY ABS(value) DESC
+LIMIT 3;
+----
+logical_plan
+01)Sort: abs(signed_parquet.value) DESC NULLS FIRST, fetch=3
+02)--TableScan: signed_parquet projection=[id, value]
+physical_plan
+01)SortExec: TopK(fetch=3), expr=[abs(value@1) DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/signed_sorted.parquet]]}, projection=[id, value], output_ordering=[value@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
+
+# Results should still be correct (no optimization applied)
+query IR
+SELECT * FROM signed_parquet
+ORDER BY ABS(value) DESC
+LIMIT 3;
+----
+1 -5
+5 4
+2 -3
+
+# Test 3.7: Aggregate ORDER BY expression should keep SortExec
+# Source pattern declared on parquet scan: [x ASC, y ASC].
+# Requested pattern in ORDER BY: [x ASC, CAST(y AS BIGINT) % 2 ASC].
+# Example for x=1 input y order 1,2,3 gives bucket order 1,0,1, which does not
+# match requested bucket ASC order. SortExec is required above AggregateExec.
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+statement ok
+CREATE TABLE agg_expr_data(x INT, y INT, v INT) AS VALUES
+(1, 1, 10),
+(1, 2, 20),
+(1, 3, 30),
+(2, 1, 40),
+(2, 2, 50),
+(2, 3, 60);
+
+query I
+COPY (SELECT * FROM agg_expr_data ORDER BY x, y)
+TO 'test_files/scratch/sort_pushdown/agg_expr_sorted.parquet';
+----
+6
+
+statement ok
+CREATE EXTERNAL TABLE agg_expr_parquet(x INT, y INT, v INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/agg_expr_sorted.parquet'
+WITH ORDER (x ASC, y ASC);
+
+query TT
+EXPLAIN SELECT
+  x,
+  CAST(y AS BIGINT) % 2,
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT) % 2
+ORDER BY x, CAST(y AS BIGINT) % 2;
+----
+logical_plan
+01)Sort: agg_expr_parquet.x ASC NULLS LAST, agg_expr_parquet.y % Int64(2) ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x, CAST(agg_expr_parquet.y AS Int64) % Int64(2)]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, y, v]
+physical_plan
+01)SortExec: expr=[x@0 ASC NULLS LAST, agg_expr_parquet.y % Int64(2)@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x, CAST(y@1 AS Int64) % 2 as agg_expr_parquet.y % Int64(2)], aggr=[sum(agg_expr_parquet.v)], ordering_mode=PartiallySorted([0])
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, y, v], output_ordering=[x@0 ASC NULLS LAST, y@1 ASC NULLS LAST], file_type=parquet
+
+# Expected output pattern from ORDER BY [x, bucket]:
+# rows grouped by x, and within each x bucket appears as 0 then 1.
+query III
+SELECT
+  x,
+  CAST(y AS BIGINT) % 2,
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT) % 2
+ORDER BY x, CAST(y AS BIGINT) % 2;
+----
+1 0 20
+1 1 40
+2 0 50
+2 1 100
+
+# Test 3.8: Aggregate ORDER BY monotonic expression can push down (no SortExec)
+query TT
+EXPLAIN SELECT
+  x,
+  CAST(y AS BIGINT),
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT)
+ORDER BY x, CAST(y AS BIGINT);
+----
+logical_plan
+01)Sort: agg_expr_parquet.x ASC NULLS LAST, agg_expr_parquet.y ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x, CAST(agg_expr_parquet.y AS Int64)]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, y, v]
+physical_plan
+01)AggregateExec: mode=Single, gby=[x@0 as x, CAST(y@1 AS Int64) as agg_expr_parquet.y], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, y, v], output_ordering=[x@0 ASC NULLS LAST, y@1 ASC NULLS LAST], file_type=parquet
+
+query III
+SELECT
+  x,
+  CAST(y AS BIGINT),
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT)
+ORDER BY x, CAST(y AS BIGINT);
+----
+1 1 10
+1 2 20
+1 3 30
+2 1 40
+2 2 50
+2 3 60
+
+# Test 3.9: Aggregate ORDER BY aggregate output should keep SortExec
+query TT
+EXPLAIN SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY SUM(v);
+----
+logical_plan
+01)Sort: sum(agg_expr_parquet.v) ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, v]
+physical_plan
+01)SortExec: expr=[sum(agg_expr_parquet.v)@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, v], output_ordering=[x@0 ASC NULLS LAST], file_type=parquet
+
+query II
+SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY SUM(v);
+----
+1 60
+2 150
+
+# Test 3.10: Aggregate with non-preserved input order should keep SortExec
+# v is not part of the order by
+query TT
+EXPLAIN SELECT v, SUM(y)
+FROM agg_expr_parquet
+GROUP BY v
+ORDER BY v;
+----
+logical_plan
+01)Sort: agg_expr_parquet.v ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.v]], aggr=[[sum(CAST(agg_expr_parquet.y AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[y, v]
+physical_plan
+01)SortExec: expr=[v@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[v@1 as v], aggr=[sum(agg_expr_parquet.y)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[y, v], file_type=parquet
+
+query II
+SELECT v, SUM(y)
+FROM agg_expr_parquet
+GROUP BY v
+ORDER BY v;
+----
+10 1
+20 2
+30 3
+40 1
+50 2
+60 3
+
+# Test 3.11: Aggregate ORDER BY non-column expression (unsatisfied) keeps SortExec
+# (though note in theory DataFusion could figure out that data sorted by x will also be sorted by x+1)
+query TT
+EXPLAIN SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY x + 1 DESC;
+----
+logical_plan
+01)Sort: CAST(agg_expr_parquet.x AS Int64) + Int64(1) DESC NULLS FIRST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, v]
+physical_plan
+01)SortExec: expr=[CAST(x@0 AS Int64) + 1 DESC], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, v], output_ordering=[x@0 ASC NULLS LAST], file_type=parquet
+
+query II
+SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY x + 1 DESC;
+----
+2 150
+1 60
+
+# Test 3.12: Aggregate ORDER BY non-column expression (unsatisfied) keeps SortExec
+# (though note in theory DataFusion could figure out that data sorted by x will also be sorted by x+1)
+query TT
+EXPLAIN SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY 2 * x ASC;
+----
+logical_plan
+01)Sort: Int64(2) * CAST(agg_expr_parquet.x AS Int64) ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, v]
+physical_plan
+01)SortExec: expr=[2 * CAST(x@0 AS Int64) ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, v], output_ordering=[x@0 ASC NULLS LAST], file_type=parquet
+
+query II
+SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY 2 * x ASC;
+----
+1 60
+2 150
+
+# Test 4: Reversed filesystem order with inferred ordering
+# Create 3 parquet files with non-overlapping id ranges, named so filesystem
+# order is OPPOSITE to data order. Each file is internally sorted by id ASC.
+# Force target_partitions=1 so all files end up in one file group, which is
+# where the inter-file ordering bug manifests.
+# Without inter-file validation, the optimizer would incorrectly trust the
+# inferred ordering and remove SortExec.
+
+# Save current target_partitions and set to 1 to force single file group
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+statement ok
+CREATE TABLE reversed_high(id INT, value INT) AS VALUES (7, 700), (8, 800), (9, 900);
+
+statement ok
+CREATE TABLE reversed_mid(id INT, value INT) AS VALUES (4, 400), (5, 500), (6, 600);
+
+statement ok
+CREATE TABLE reversed_low(id INT, value INT) AS VALUES (1, 100), (2, 200), (3, 300);
+
+query I
+COPY (SELECT * FROM reversed_high ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/a_high.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM reversed_mid ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/b_mid.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM reversed_low ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/c_low.parquet';
+----
+3
+
+# External table with NO "WITH ORDER" — relies on inferred ordering from parquet metadata
+statement ok
+CREATE EXTERNAL TABLE reversed_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/';
+
+# Test 4.1: SortExec must be present because files are not in inter-file order
+query TT
+EXPLAIN SELECT * FROM reversed_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: reversed_parquet.id ASC NULLS LAST
+02)--TableScan: reversed_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 4.2: Results must be correct
+query II
+SELECT * FROM reversed_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 5: Overlapping files with inferred ordering
+# Create files with overlapping id ranges
+
+statement ok
+CREATE TABLE overlap_x(id INT, value INT) AS VALUES (1, 100), (3, 300), (5, 500);
+
+statement ok
+CREATE TABLE overlap_y(id INT, value INT) AS VALUES (2, 200), (4, 400), (6, 600);
+
+query I
+COPY (SELECT * FROM overlap_x ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/overlap/file_x.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM overlap_y ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/overlap/file_y.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE overlap_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/overlap/';
+
+# Test 5.1: SortExec must be present because files have overlapping ranges
+query TT
+EXPLAIN SELECT * FROM overlap_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: overlap_parquet.id ASC NULLS LAST
+02)--TableScan: overlap_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/overlap/file_x.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/overlap/file_y.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 5.2: Results must be correct
+query II
+SELECT * FROM overlap_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+
+# Test 6: WITH ORDER + reversed filesystem order
+# Same file setup as Test 4 but explicitly declaring ordering via WITH ORDER.
+# Even with WITH ORDER, the optimizer should detect that inter-file order is wrong
+# and keep SortExec.
+
+statement ok
+CREATE EXTERNAL TABLE reversed_with_order_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/'
+WITH ORDER (id ASC);
+
+# Test 6.1: SortExec must be present despite WITH ORDER
+query TT
+EXPLAIN SELECT * FROM reversed_with_order_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: reversed_with_order_parquet.id ASC NULLS LAST
+02)--TableScan: reversed_with_order_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 6.2: Results must be correct
+query II
+SELECT * FROM reversed_with_order_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 7: Correctly ordered multi-file single group (positive case)
+# Files are in CORRECT inter-file order within a single group.
+# The validation should PASS and SortExec should be eliminated.
+
+statement ok
+CREATE TABLE correct_low(id INT, value INT) AS VALUES (1, 100), (2, 200), (3, 300);
+
+statement ok
+CREATE TABLE correct_mid(id INT, value INT) AS VALUES (4, 400), (5, 500), (6, 600);
+
+statement ok
+CREATE TABLE correct_high(id INT, value INT) AS VALUES (7, 700), (8, 800), (9, 900);
+
+query I
+COPY (SELECT * FROM correct_low ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/a_low.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM correct_mid ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/b_mid.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM correct_high ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/c_high.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE correct_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/correct/'
+WITH ORDER (id ASC);
+
+# Test 7.1: SortExec should be ELIMINATED — files are in correct inter-file order
+query TT
+EXPLAIN SELECT * FROM correct_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: correct_parquet.id ASC NULLS LAST
+02)--TableScan: correct_parquet projection=[id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 7.2: Results must be correct
+query II
+SELECT * FROM correct_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 7.3: DESC query on correctly ordered ASC files should still use SortExec
+# Note: reverse_row_groups=true reverses the file list in the plan display
+query TT
+EXPLAIN SELECT * FROM correct_parquet ORDER BY id DESC;
+----
+logical_plan
+01)Sort: correct_parquet.id DESC NULLS FIRST
+02)--TableScan: correct_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet]]}, projection=[id, value], file_type=parquet, reverse_row_groups=true
+
+query II
+SELECT * FROM correct_parquet ORDER BY id DESC;
+----
+9 900
+8 800
+7 700
+6 600
+5 500
+4 400
+3 300
+2 200
+1 100
+
+# Test 8: DESC ordering with files in wrong inter-file DESC order
+# Create files internally sorted by id DESC, but named so filesystem order
+# is WRONG for DESC ordering (low values first in filesystem order).
+
+statement ok
+CREATE TABLE desc_low(id INT, value INT) AS VALUES (3, 300), (2, 200), (1, 100);
+
+statement ok
+CREATE TABLE desc_high(id INT, value INT) AS VALUES (9, 900), (8, 800), (7, 700);
+
+query I
+COPY (SELECT * FROM desc_low ORDER BY id DESC)
+TO 'test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM desc_high ORDER BY id DESC)
+TO 'test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE desc_reversed_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/desc_reversed/'
+WITH ORDER (id DESC);
+
+# Test 8.1: SortExec must be present — files are in wrong inter-file DESC order
+# (a_low has 1-3, b_high has 7-9; for DESC, b_high should come first)
+query TT
+EXPLAIN SELECT * FROM desc_reversed_parquet ORDER BY id DESC;
+----
+logical_plan
+01)Sort: desc_reversed_parquet.id DESC NULLS FIRST
+02)--TableScan: desc_reversed_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 8.2: Results must be correct
+query II
+SELECT * FROM desc_reversed_parquet ORDER BY id DESC;
+----
+9 900
+8 800
+7 700
+3 300
+2 200
+1 100
+
+# Test 9: Multi-column sort key validation
+# Files have (category, id) ordering. Files share a boundary value on category='B'
+# so column-level min/max statistics overlap on the primary key column.
+# The validation conservatively rejects this because column-level stats can't
+# precisely represent row-level boundaries for multi-column keys.
+
+statement ok
+CREATE TABLE multi_col_a(category VARCHAR, id INT, value INT) AS VALUES
+('A', 1, 10), ('A', 2, 20), ('B', 1, 30);
+
+statement ok
+CREATE TABLE multi_col_b(category VARCHAR, id INT, value INT) AS VALUES
+('B', 2, 40), ('C', 1, 50), ('C', 2, 60);
+
+query I
+COPY (SELECT * FROM multi_col_a ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col/a_first.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM multi_col_b ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col/b_second.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE multi_col_parquet(category VARCHAR, id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_col/'
+WITH ORDER (category ASC, id ASC);
+
+# Test 9.1: SortExec is present — validation conservatively rejects because
+# column-level stats overlap on category='B' across both files
+query TT
+EXPLAIN SELECT * FROM multi_col_parquet ORDER BY category ASC, id ASC;
+----
+logical_plan
+01)Sort: multi_col_parquet.category ASC NULLS LAST, multi_col_parquet.id ASC NULLS LAST
+02)--TableScan: multi_col_parquet projection=[category, id, value]
+physical_plan
+01)SortExec: expr=[category@0 ASC NULLS LAST, id@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col/a_first.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col/b_second.parquet]]}, projection=[category, id, value], file_type=parquet
+
+# Test 9.2: Results must be correct
+query TII
+SELECT * FROM multi_col_parquet ORDER BY category ASC, id ASC;
+----
+A 1 10
+A 2 20
+B 1 30
+B 2 40
+C 1 50
+C 2 60
+
+# Test 9.3: Multi-column sort with non-overlapping primary key across files
+# When files don't overlap on the primary column, validation succeeds.
+
+statement ok
+CREATE TABLE multi_col_x(category VARCHAR, id INT, value INT) AS VALUES
+('A', 1, 10), ('A', 2, 20);
+
+statement ok
+CREATE TABLE multi_col_y(category VARCHAR, id INT, value INT) AS VALUES
+('B', 1, 30), ('B', 2, 40);
+
+query I
+COPY (SELECT * FROM multi_col_x ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col_clean/x_first.parquet';
+----
+2
+
+query I
+COPY (SELECT * FROM multi_col_y ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col_clean/y_second.parquet';
+----
+2
+
+statement ok
+CREATE EXTERNAL TABLE multi_col_clean_parquet(category VARCHAR, id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_col_clean/'
+WITH ORDER (category ASC, id ASC);
+
+# Test 9.3a: SortExec should be eliminated — non-overlapping primary column
+query TT
+EXPLAIN SELECT * FROM multi_col_clean_parquet ORDER BY category ASC, id ASC;
+----
+logical_plan
+01)Sort: multi_col_clean_parquet.category ASC NULLS LAST, multi_col_clean_parquet.id ASC NULLS LAST
+02)--TableScan: multi_col_clean_parquet projection=[category, id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col_clean/x_first.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col_clean/y_second.parquet]]}, projection=[category, id, value], output_ordering=[category@0 ASC NULLS LAST, id@1 ASC NULLS LAST], file_type=parquet
+
+# Test 9.3b: Results must be correct
+query TII
+SELECT * FROM multi_col_clean_parquet ORDER BY category ASC, id ASC;
+----
+A 1 10
+A 2 20
+B 1 30
+B 2 40
+
+# Test 10: Correctly ordered files WITH ORDER (positive counterpart to Test 6)
+# Files in correct_parquet are in correct ASC order — WITH ORDER should pass validation
+# and SortExec should be eliminated.
+
+statement ok
+CREATE EXTERNAL TABLE correct_with_order_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/correct/'
+WITH ORDER (id ASC);
+
+# Test 10.1: SortExec should be ELIMINATED — files are in correct order
+query TT
+EXPLAIN SELECT * FROM correct_with_order_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: correct_with_order_parquet.id ASC NULLS LAST
+02)--TableScan: correct_with_order_parquet projection=[id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 10.2: Results must be correct
+query II
+SELECT * FROM correct_with_order_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 11: Multiple file groups (target_partitions > 1) — each group has one file
+# When files are spread across separate partitions (one file per group), each
+# partition is trivially sorted and SortPreservingMergeExec handles the merge.
+
+# Restore higher target_partitions so files go into separate groups
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+statement ok
+CREATE EXTERNAL TABLE multi_partition_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/'
+WITH ORDER (id ASC);
+
+# Test 11.1: With separate partitions, each file is trivially sorted.
+# SortPreservingMergeExec merges, no SortExec needed per-partition.
+query TT
+EXPLAIN SELECT * FROM multi_partition_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: multi_partition_parquet.id ASC NULLS LAST
+02)--TableScan: multi_partition_parquet projection=[id, value]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 11.2: Results must be correct
+query II
+SELECT * FROM multi_partition_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Restore target_partitions to 1 for remaining cleanup
+statement ok
+SET datafusion.execution.target_partitions = 2;
+
+# Cleanup
+statement ok
+DROP TABLE reversed_high;
+
+statement ok
+DROP TABLE reversed_mid;
+
+statement ok
+DROP TABLE reversed_low;
+
+statement ok
+DROP TABLE reversed_parquet;
+
+statement ok
+DROP TABLE overlap_x;
+
+statement ok
+DROP TABLE overlap_y;
+
+statement ok
+DROP TABLE overlap_parquet;
+
+statement ok
+DROP TABLE reversed_with_order_parquet;
+
+statement ok
+DROP TABLE correct_low;
+
+statement ok
+DROP TABLE correct_mid;
+
+statement ok
+DROP TABLE correct_high;
+
+statement ok
+DROP TABLE correct_parquet;
+
+statement ok
+DROP TABLE desc_low;
+
+statement ok
+DROP TABLE desc_high;
+
+statement ok
+DROP TABLE desc_reversed_parquet;
+
+statement ok
+DROP TABLE multi_col_a;
+
+statement ok
+DROP TABLE multi_col_b;
+
+statement ok
+DROP TABLE multi_col_parquet;
+
+statement ok
+DROP TABLE multi_col_x;
+
+statement ok
+DROP TABLE multi_col_y;
+
+statement ok
+DROP TABLE multi_col_clean_parquet;
+
+statement ok
+DROP TABLE correct_with_order_parquet;
+
+statement ok
+DROP TABLE multi_partition_parquet;
+
+statement ok
+DROP TABLE timestamp_data;
+
+statement ok
+DROP TABLE timestamp_parquet;
+
+statement ok
+DROP TABLE multi_month_data;
+
+statement ok
+DROP TABLE multi_month_parquet;
+
+statement ok
+DROP TABLE int_data;
+
+statement ok
+DROP TABLE int_parquet;
+
+statement ok
+DROP TABLE float_data;
+
+statement ok
+DROP TABLE float_parquet;
+
+statement ok
+DROP TABLE signed_data;
+
+statement ok
+DROP TABLE signed_parquet;
+
+statement ok
+DROP TABLE agg_expr_data;
+
+statement ok
+DROP TABLE agg_expr_parquet;
+
+statement ok
+SET datafusion.optimizer.enable_sort_pushdown = true;
diff --git a/datafusion/sqllogictest/test_files/spark/README.md b/datafusion/sqllogictest/test_files/spark/README.md
index 0a7bb92371b58..e61001c6e42e5 100644
--- a/datafusion/sqllogictest/test_files/spark/README.md
+++ b/datafusion/sqllogictest/test_files/spark/README.md
@@ -21,6 +21,16 @@
 
 This directory contains test files for the `spark` test suite.
 
+## RoadMap
+
+Implementing the `datafusion-spark` compatible functions project is still a work in progress.
+Many of the tests in this directory are commented out and are waiting for help with implementation.
+
+For more information please see:
+
+- [The `datafusion-spark` Epic](https://github.com/apache/datafusion/issues/15914)
+- [Spark Test Generation Script] (https://github.com/apache/datafusion/pull/16409#issuecomment-2972618052)
+
 ## Testing Guide
 
 When testing Spark functions:
@@ -29,6 +39,18 @@ When testing Spark functions:
 - Test cases should only contain `SELECT` statements with the function being tested
 - Add explicit casts to input values to ensure the correct data type is used (e.g., `0::INT`)
   - Explicit casting is necessary because DataFusion and Spark do not infer data types in the same way
+- If the Spark built-in function under test behaves differently in ANSI SQL mode, please wrap your test cases like this example:
+
+```sql
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+# Functions under test
+select abs((-128)::TINYINT)
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+```
 
 ### Finding Test Cases
 
diff --git a/datafusion/sqllogictest/test_files/spark/aggregate/avg.slt b/datafusion/sqllogictest/test_files/spark/aggregate/avg.slt
new file mode 100644
index 0000000000000..6ae647989aee9
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/aggregate/avg.slt
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query R
+SELECT avg(a) FROM (VALUES (10::INT), (20::INT), (30::INT), (40::INT), (50::INT)) AS t(a);
+----
+30
+
+query R
+SELECT avg(a) FROM (VALUES (40::INT), (23::INT), (17::INT), (40::INT), (NULL)) AS t(a);
+----
+30
+
+query R
+SELECT avg(a) FROM (VALUES (0::INT), (0::INT)) AS t(a);
+----
+0
+
+query IR
+SELECT a % 2 AS g, avg(a)
+FROM (VALUES (40), (23), (17), (40), (30)) AS t(a)
+GROUP BY g
+ORDER BY g;
+----
+0 36.666666666666664
+1 20
+
+query IR
+SELECT a % 2 AS g, avg(a)
+FROM (VALUES (10::INT), (20::INT), (30::INT), (40::INT), (50::INT)) AS t(a)
+GROUP BY g
+ORDER BY g;
+----
+0 30
+
+query IR
+SELECT a, avg(a)
+FROM (VALUES (0::INT), (0::INT)) AS t(a)
+GROUP BY a
+ORDER BY a;
+----
+0 0
diff --git a/datafusion/sqllogictest/test_files/spark/aggregate/collect.slt b/datafusion/sqllogictest/test_files/spark/aggregate/collect.slt
new file mode 100644
index 0000000000000..2bd80e2e13283
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/aggregate/collect.slt
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query ?
+SELECT collect_list(a) FROM (VALUES (1), (2), (3)) AS t(a);
+----
+[1, 2, 3]
+
+query ?
+SELECT collect_list(a) FROM (VALUES (1), (2), (2), (3), (1)) AS t(a);
+----
+[1, 2, 2, 3, 1]
+
+query ?
+SELECT collect_list(a) FROM (VALUES (1), (NULL), (3)) AS t(a);
+----
+[1, 3]
+
+query ?
+SELECT collect_list(a) FROM (VALUES (CAST(NULL AS INT)), (NULL), (NULL)) AS t(a);
+----
+[]
+
+query I?
+SELECT g, collect_list(a)
+FROM (VALUES (1, 10), (1, 20), (2, 30), (2, 30), (1, 10)) AS t(g, a)
+GROUP BY g
+ORDER BY g;
+----
+1 [10, 20, 10]
+2 [30, 30]
+
+query I?
+SELECT g, collect_list(a)
+FROM (VALUES (1, 10), (1, NULL), (2, 20), (2, NULL)) AS t(g, a)
+GROUP BY g
+ORDER BY g;
+----
+1 [10]
+2 [20]
+
+# we need to wrap collect_set with array_sort to have consistent outputs
+query ?
+SELECT array_sort(collect_set(a)) FROM (VALUES (1), (2), (3)) AS t(a);
+----
+[1, 2, 3]
+
+query ?
+SELECT array_sort(collect_set(a)) FROM (VALUES (1), (2), (2), (3), (1)) AS t(a);
+----
+[1, 2, 3]
+
+query ?
+SELECT array_sort(collect_set(a)) FROM (VALUES (1), (NULL), (3)) AS t(a);
+----
+[1, 3]
+
+query ?
+SELECT array_sort(collect_set(a)) FROM (VALUES (CAST(NULL AS INT)), (NULL), (NULL)) AS t(a);
+----
+[]
+
+query I?
+SELECT g, array_sort(collect_set(a))
+FROM (VALUES (1, 10), (1, 20), (2, 30), (2, 30), (1, 10)) AS t(g, a)
+GROUP BY g
+ORDER BY g;
+----
+1 [10, 20]
+2 [30]
+
+query I?
+SELECT g, array_sort(collect_set(a))
+FROM (VALUES (1, 10), (1, NULL), (1, NULL), (2, 20), (2, NULL)) AS t(g, a)
+GROUP BY g
+ORDER BY g;
+----
+1 [10]
+2 [20]
diff --git a/datafusion/sqllogictest/test_files/spark/aggregate/try_sum.slt b/datafusion/sqllogictest/test_files/spark/aggregate/try_sum.slt
new file mode 100644
index 0000000000000..0f440a97dd1cc
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/aggregate/try_sum.slt
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT try_sum(x) AS sum_x FROM VALUES (1), (2), (3) AS tab(x);
+----
+6
+
+query I
+SELECT try_sum(x) AS sum_x FROM VALUES (NULL), (2), (NULL) AS tab(x);
+----
+2
+
+query I
+SELECT try_sum(x) AS sum_x FROM VALUES (CAST(9223372036854775807 AS BIGINT)), (1) AS tab(x);
+----
+NULL
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (1.5), (2.5), (3.0) AS tab(x);
+----
+7
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (1e308), (1e308) AS tab(x);
+----
+Infinity
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (CAST('NaN' AS DOUBLE)), (1.0) AS tab(x);
+----
+NaN
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (CAST('Infinity' AS DOUBLE)), (1.0) AS tab(x);
+----
+Infinity
+
+# Decimal
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (DECIMAL(10,2) '1.23'), (DECIMAL(10,2) '4.77') AS tab(x);
+----
+6
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (DECIMAL(10,2) '1.00'), (NULL), (DECIMAL(10,2) '2.50') AS tab(x);
+----
+3.5
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (DECIMAL(5,0) '90000'), (DECIMAL(5,0) '20000') AS tab(x);
+----
+110000
+
+query R
+SELECT try_sum(x) AS sum_x FROM VALUES (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111'),
+                                      (DECIMAL(38,0) '11111111111111111111111111111111111111') AS tab(x);
+----
+NULL
+
+#Group By
+query TI
+SELECT g, try_sum(x) AS sum_x
+FROM VALUES
+  ('bad', CAST(9223372036854775807 AS BIGINT)),
+  ('bad', 1),
+  ('ok', 10),
+  ('ok', NULL),
+  ('ok', 5) AS tab(g, x)
+GROUP BY g
+ORDER BY g;
+----
+bad NULL
+ok 15
+
+query R
+SELECT try_sum(col) FROM VALUES (NULL), (NULL) AS tab(col);
+----
+NULL
+
+
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (CAST('-Infinity' AS DOUBLE)), (CAST('Infinity' AS DOUBLE)) AS tab(col);
+----
+NaN
+
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (CAST('-Infinity' AS DOUBLE)), (CAST('-Infinity' AS DOUBLE)) AS tab(col);
+----
+-Infinity
+
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (CAST('Infinity' AS FLOAT)), (CAST(1.0 AS FLOAT)) AS tab(col);
+----
+Infinity
+
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (-0.0), (0.0) AS tab(col);
+----
+0
+
+# need be 0.0
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (CAST(-0.0 AS DOUBLE)), (CAST(0.0 AS DOUBLE)) AS tab(col);
+----
+0
+
+query R
+SELECT try_sum(col) AS sum_x FROM VALUES (CAST(-5.5 AS DECIMAL(10,2))), (CAST(5.5 AS DECIMAL(10,2))) AS tab(col);
+----
+0
+
+# Compare double 0.0 vs decimal 0.00
+query RR
+SELECT 0.0 AS double_zero, CAST(0.0 AS DECIMAL(10,2)) AS decimal_zero;
+----
+0 0
diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt
new file mode 100644
index 0000000000000..79dca1c10a7d0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/array.slt
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query ?
+SELECT array(1, 2, 3);
+----
+[1, 2, 3]
+
+
+query ?
+SELECT array('a', 'b');
+----
+[a, b]
+
+
+query ?
+SELECT array();
+----
+[]
+
+query ??
+SELECT array(), array(array());
+----
+[] [[]]
+
+
+query ?
+SELECT array(null);
+----
+[NULL]
+
+
+query ?
+SELECT array(1, NULL, 3);
+----
+[1, NULL, 3]
+
+
+query ?
+SELECT array['hello', '', null, 'nULl', 'nULlx', 'aa"bb', 'mm\nn', 'uu,vv', 'yy zz'];
+----
+[hello, , NULL, nULl, nULlx, aa"bb, mm\nn, uu,vv, yy zz]
+
+query ?
+SELECT array(array(1,2),array(3,4));
+----
+[[1, 2], [3, 4]]
+
+
+query ?
+SELECT array(array(1), array(2,3,4));
+----
+[[1], [2, 3, 4]]
+
+query ?
+SELECT array(array(1,2));
+----
+[[1, 2]]
+
+query ?
+SELECT array(arrow_cast(array(1), 'LargeList(Int64)'));
+----
+[[1]]
+
+query ?
+SELECT array(arrow_cast(array(1), 'LargeList(Int64)'), arrow_cast(array(), 'LargeList(Int64)'));
+----
+[[1], []]
+
+query ?
+SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3));
+----
+[[1, 2], [3]]
diff --git a/datafusion/sqllogictest/test_files/spark/array/array_contains.slt b/datafusion/sqllogictest/test_files/spark/array/array_contains.slt
new file mode 100644
index 0000000000000..db9ac6b122e3f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/array_contains.slt
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for Spark-compatible array_contains function.
+# Spark semantics: if element is found -> true; if not found and array has nulls -> null; if not found and no nulls -> false.
+
+###
+### Scalar tests
+###
+
+# Element found in array
+query B
+SELECT array_contains(array(1, 2, 3), 2);
+----
+true
+
+# Element not found, no nulls in array
+query B
+SELECT array_contains(array(1, 2, 3), 4);
+----
+false
+
+# Element not found, array has null elements -> null
+query B
+SELECT array_contains(array(1, NULL, 3), 2);
+----
+NULL
+
+# Element found, array has null elements -> true (nulls don't matter)
+query B
+SELECT array_contains(array(1, NULL, 3), 1);
+----
+true
+
+# Element found at the end, array has null elements -> true
+query B
+SELECT array_contains(array(1, NULL, 3), 3);
+----
+true
+
+# Null array -> null
+query B
+SELECT array_contains(NULL, 1);
+----
+NULL
+
+# Null element -> null
+query B
+SELECT array_contains(array(1, 2, 3), NULL);
+----
+NULL
+
+# Empty array, element not found -> false
+query B
+SELECT array_contains(array(), 1);
+----
+false
+
+# Array with only nulls, element not found -> null
+query B
+SELECT array_contains(array(NULL, NULL), 1);
+----
+NULL
+
+# String array, element found
+query B
+SELECT array_contains(array('a', 'b', 'c'), 'b');
+----
+true
+
+# String array, element not found, no nulls
+query B
+SELECT array_contains(array('a', 'b', 'c'), 'd');
+----
+false
+
+# String array, element not found, has null
+query B
+SELECT array_contains(array('a', NULL, 'c'), 'd');
+----
+NULL
+
+###
+### Columnar tests with a table
+###
+
+statement ok
+CREATE TABLE test_arrays AS VALUES
+  (1, make_array(1, 2, 3),       10),
+  (2, make_array(4, NULL, 6),    5),
+  (3, make_array(7, 8, 9),       10),
+  (4, NULL,                      1),
+  (5, make_array(10, NULL, NULL), 10);
+
+# Column needle against column array
+query IBB
+SELECT column1,
+       array_contains(column2, column3),
+       array_contains(column2, 10)
+FROM test_arrays
+ORDER BY column1;
+----
+1 false false
+2 NULL NULL
+3 false false
+4 NULL NULL
+5 true true
+
+statement ok
+DROP TABLE test_arrays;
+
+###
+### Nested array tests
+###
+
+# Nested array element found
+query B
+SELECT array_contains(array(array(1, 2), array(3, 4)), array(3, 4));
+----
+true
+
+# Nested array element not found, no nulls
+query B
+SELECT array_contains(array(array(1, 2), array(3, 4)), array(5, 6));
+----
+false
diff --git a/datafusion/sqllogictest/test_files/spark/array/array_repeat.slt b/datafusion/sqllogictest/test_files/spark/array/array_repeat.slt
new file mode 100644
index 0000000000000..19181aae0fc55
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/array_repeat.slt
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query ?
+SELECT array_repeat('123', 2);
+----
+[123, 123]
+
+query ?
+SELECT array_repeat('123', 0);
+----
+[]
+
+query ?
+SELECT array_repeat('123', -1);
+----
+[]
+
+query ?
+SELECT array_repeat('123', CAST('2' AS INT));
+----
+[123, 123]
+
+query ?
+SELECT array_repeat(123, 3);
+----
+[123, 123, 123]
+
+query ?
+SELECT array_repeat('2001-09-28T01:00:00'::timestamp, 2);
+----
+[2001-09-28T01:00:00, 2001-09-28T01:00:00]
+
+query ?
+SELECT array_repeat(array_repeat('123', CAST('2' AS INT)), CAST('3' AS INT));
+----
+[[123, 123], [123, 123], [123, 123]]
+
+query ?
+SELECT array_repeat(['123'], 2);
+----
+[[123], [123]]
+
+query ?
+SELECT array_repeat(NULL, 2);
+----
+NULL
+
+query ?
+SELECT array_repeat([NULL], 2);
+----
+[[NULL], [NULL]]
+
+query ?
+SELECT array_repeat(['123', NULL], 2);
+----
+[[123, NULL], [123, NULL]]
+
+query ?
+SELECT array_repeat('123', CAST(NULL AS INT));
+----
+NULL
+
+query ?
+SELECT array_repeat(column1, column2)
+FROM VALUES
+('123', 2),
+('123', 0),
+('123', -1),
+(NULL, 1),
+('123', NULL);
+----
+[123, 123]
+[]
+[]
+NULL
+NULL
+
+
+query ?
+SELECT array_repeat(column1, column2)
+FROM VALUES
+(['123'], 2),
+([], 2),
+([NULL], 2);
+----
+[[123], [123]]
+[[], []]
+[[NULL], [NULL]]
diff --git a/datafusion/sqllogictest/test_files/spark/array/sequence.slt b/datafusion/sqllogictest/test_files/spark/array/sequence.slt
new file mode 100644
index 0000000000000..bb4aa06bfd257
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/sequence.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sequence(1, 5);
+## PySpark 3.5.5 Result: {'sequence(1, 5)': [1, 2, 3, 4, 5], 'typeof(sequence(1, 5))': 'array<int>', 'typeof(1)': 'int', 'typeof(5)': 'int'}
+#query
+#SELECT sequence(1::int, 5::int);
+
+## Original Query: SELECT sequence(5, 1);
+## PySpark 3.5.5 Result: {'sequence(5, 1)': [5, 4, 3, 2, 1], 'typeof(sequence(5, 1))': 'array<int>', 'typeof(5)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT sequence(5::int, 1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt
new file mode 100644
index 0000000000000..01d319b619dab
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test shuffle function with simple arrays
+query ?
+SELECT shuffle([1, 2, 3, 4, 5, NULL], 1);
+----
+[1, 4, NULL, 2, 5, 3]
+
+# Test shuffle function with string arrays
+query ?
+SELECT shuffle(['a', 'b', 'c', 'd', 'e', 'f'], 1);
+----
+[a, d, f, b, e, c]
+
+# Test shuffle function with empty array
+query ?
+SELECT shuffle([]);
+----
+[]
+
+# Test shuffle function with single element
+query ?
+SELECT shuffle([42]);
+----
+[42]
+
+# Test shuffle function with null array
+query ?
+SELECT shuffle(NULL);
+----
+NULL
+
+# Test shuffle function with fixed size list arrays
+query ?
+SELECT shuffle(arrow_cast([1, 2, NULL, 3, 4, 5], 'FixedSizeList(6, Int64)'), 1);
+----
+[1, 3, 5, 2, 4, NULL]
+
+# Test shuffle on table data with different list types
+statement ok
+CREATE TABLE test_shuffle_list_types AS VALUES
+  ([1, 2, 3, 4]),
+  ([5, 6, 7, 8, 9]),
+  ([10]),
+  (NULL),
+  ([]);
+
+# Test shuffle with large list from table
+query ?
+SELECT shuffle(column1, 1) FROM test_shuffle_list_types;
+----
+[1, 4, 3, 2]
+[8, 9, 6, 5, 7]
+[10]
+NULL
+[]
+
+# Test fixed size list table
+statement ok
+CREATE TABLE test_shuffle_fixed_size AS VALUES
+  (arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)')),
+  (arrow_cast([4, 5, 6], 'FixedSizeList(3, Int64)')),
+  (arrow_cast([NULL, 8, 9], 'FixedSizeList(3, Int64)')),
+  (NULL);
+
+# Test shuffle with fixed size list from table
+query ?
+SELECT shuffle(column1, 1) FROM test_shuffle_fixed_size;
+----
+[1, 2, 3]
+[4, 6, 5]
+[9, NULL, 8]
+NULL
+
+query ?
+SELECT shuffle(['2001-09-28T01:00:00'::timestamp, '2001-08-28T01:00:00'::timestamp, '2001-07-28T01:00:00'::timestamp, '2001-06-28T01:00:00'::timestamp, '2001-05-28T01:00:00'::timestamp], 1);
+----
+[2001-09-28T01:00:00, 2001-06-28T01:00:00, 2001-07-28T01:00:00, 2001-08-28T01:00:00, 2001-05-28T01:00:00]
+
+query ?
+SELECT shuffle(shuffle([1, 20, NULL, 3, 100, NULL, 98, 99], 1), 1);
+----
+[1, 99, NULL, 98, 100, NULL, 3, 20]
+
+query ?
+SELECT shuffle([' ', NULL, 'abc'], 1);
+----
+[ , NULL, abc]
+
+query ?
+SELECT shuffle([1, 2, 3, 4], CAST('2' AS INT));
+----
+[1, 4, 2, 3]
+
+query ?
+SELECT shuffle(['ab'], NULL);
+----
+[ab]
+
+query ?
+SELECT shuffle(shuffle([3, 3], NULL), NULL);
+----
+[3, 3]
+
+# Clean up
+statement ok
+DROP TABLE test_shuffle_list_types;
+
+statement ok
+DROP TABLE test_shuffle_fixed_size;
diff --git a/datafusion/sqllogictest/test_files/spark/array/slice.slt b/datafusion/sqllogictest/test_files/spark/array/slice.slt
new file mode 100644
index 0000000000000..4aba076aba6ba
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/array/slice.slt
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query ?
+SELECT slice([], 2, 2);
+----
+[]
+
+query ?
+SELECT slice([1, 2, 3, 4], 2, 2);
+----
+[2, 3]
+
+query ?
+SELECT slice([1, 2, 3, 4], 1, 100);
+----
+[1, 2, 3, 4]
+
+query ?
+SELECT slice([1, 2, 3, 4], -2, 2);
+----
+[3, 4]
+
+query ?
+SELECT slice([1, 2, 3, 4], 100, 2);
+----
+[]
+
+query ?
+SELECT slice([1, 2, 3, 4], -200, 2);
+----
+[]
+
+query error DataFusion error: Execution error: Length must be non-negative, but got -2
+SELECT slice([1, 2, 3, 4], 2, -2);
+
+query error DataFusion error: Execution error: Length must be non-negative, but got -2
+SELECT slice([1, 2, 3, 4], -2, -2);
+
+query error DataFusion error: Execution error: Start index must not be zero
+SELECT slice([1, 2, 3, 4], 0, -2);
+
+query ?
+SELECT slice([NULL, NULL, NULL, NULL, NULL], 2, 2);
+----
+[NULL, NULL]
+
+query ?
+SELECT slice(arrow_cast(NULL, 'FixedSizeList(1, Int64)'), 2, 2);
+----
+NULL
+
+query ?
+SELECT slice([1, 2, 3, 4], NULL, 2);
+----
+NULL
+
+query ?
+SELECT slice([1, 2, 3, 4], 2, NULL);
+----
+NULL
+
+
+query ?
+SELECT slice(column1, column2, column3)
+FROM VALUES
+([1, 2, 3, 4], 2, 2),
+([1, 2, 3, 4], 1, 100),
+([1, 2, 3, 4], -2, 2),
+([], 2, 2),
+([1, 2, 3, 4], 100, 2),
+([1, 2, 3, 4], -200, 2),
+([NULL, NULL, NULL, NULL, NULL], 2, 2),
+(arrow_cast(NULL, 'FixedSizeList(1, Int64)'), 2, 2),
+([1, 2, 3, 4], NULL, 2),
+([1, 2, 3, 4], 2, NULL);
+----
+[2, 3]
+[1, 2, 3, 4]
+[3, 4]
+[]
+[]
+[]
+[NULL, NULL]
+NULL
+NULL
+NULL
+
+query ?
+SELECT slice(['2001-09-28T01:00:00'::timestamp, '2001-08-28T01:00:00'::timestamp, '2001-07-28T01:00:00'::timestamp, '2001-06-28T01:00:00'::timestamp, '2001-05-28T01:00:00'::timestamp], 1, 3);
+----
+[2001-09-28T01:00:00, 2001-08-28T01:00:00, 2001-07-28T01:00:00]
+
+query ?
+SELECT slice(slice([1, 2, 3, 4], 1, 3), 1, 2);
+----
+[1, 2]
+
+query ?
+SELECT slice([1, 2, 3, 4], CAST('2' AS INT), 4);
+----
+[2, 3, 4]
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bit_position.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bit_position.slt
new file mode 100644
index 0000000000000..4af3193a5db31
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bit_position.slt
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query I
+SELECT bitmap_bit_position(arrow_cast(1, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bit_position(arrow_cast(3, 'Int8'));
+----
+2
+
+query I
+SELECT bitmap_bit_position(arrow_cast(7, 'Int8'));
+----
+6
+
+query I
+SELECT bitmap_bit_position(arrow_cast(15, 'Int8'));
+----
+14
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-1, 'Int8'));
+----
+1
+
+query I
+SELECT bitmap_bit_position(arrow_cast(256, 'Int16'));
+----
+255
+
+query I
+SELECT bitmap_bit_position(arrow_cast(1024, 'Int16'));
+----
+1023
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-32768, 'Int16'));
+----
+0
+
+query I
+SELECT bitmap_bit_position(arrow_cast(16384, 'Int16'));
+----
+16383
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-1, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bit_position(arrow_cast(65536, 'Int32'));
+----
+32767
+
+query I
+SELECT bitmap_bit_position(arrow_cast(1048576, 'Int32'));
+----
+32767
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-2147483648, 'Int32'));
+----
+0
+
+query I
+SELECT bitmap_bit_position(arrow_cast(1073741824, 'Int32'));
+----
+32767
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-1, 'Int32'));
+----
+1
+
+query I
+SELECT bitmap_bit_position(arrow_cast(4294967296, 'Int64'));
+----
+32767
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-1, 'Int64'));
+----
+1
+
+query I
+SELECT bitmap_bit_position(arrow_cast(-9223372036854775808, 'Int64'));
+----
+0
+
+query I
+SELECT bitmap_bit_position(arrow_cast(9223372036854775807, 'Int64'));
+----
+32766
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt
new file mode 100644
index 0000000000000..2a6e190b31eab
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int8'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(127, 'Int8'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-64, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-65, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(257, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(32767, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int16'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-256, 'Int16'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int32'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(65537, 'Int32'));
+----
+3
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(2147483647, 'Int32'));
+----
+65536
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int32'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-65536, 'Int32'));
+----
+-2
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int64'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(4294967297, 'Int64'));
+----
+131073
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(9223372036854775807, 'Int64'));
+----
+281474976710656
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int64'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-4294967296, 'Int64'));
+----
+-131072
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-9223372036854775808, 'Int64'));
+----
+-281474976710656
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
new file mode 100644
index 0000000000000..39dca512226b2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT bitmap_count(X'1010');
+----
+2
+
+query I
+SELECT bitmap_count(X'FFFF');
+----
+16
+
+query I
+SELECT bitmap_count(X'0');
+----
+0
+
+query I
+SELECT bitmap_count(a) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+# Tests with different binary types
+query I
+SELECT bitmap_count(arrow_cast(a, 'LargeBinary')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'BinaryView')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int32, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int8, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int16, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int64, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt b/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt
new file mode 100644
index 0000000000000..8ec886d02e78f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt
@@ -0,0 +1,242 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT bit_count(0);
+## PySpark 3.5.5 Result: {'bit_count(0)': 0, 'typeof(bit_count(0))': 'int', 'typeof(0)': 'int'}
+
+# Basic tests with different integer types
+query I
+SELECT bit_count(0::int);
+----
+0
+
+query I
+SELECT bit_count(1::int);
+----
+1
+
+query I
+SELECT bit_count(7::int);
+----
+3
+
+query I
+SELECT bit_count(15::int);
+----
+4
+
+query I
+SELECT bit_count(255::int);
+----
+8
+
+query I
+SELECT bit_count(1023::int);
+----
+10
+
+# Tests with negative numbers (two's complement)
+query I
+SELECT bit_count(-1::int);
+----
+64
+
+query I
+SELECT bit_count(-2::int);
+----
+63
+
+query I
+SELECT bit_count(-3::int);
+----
+63
+
+# Tests with different integer types
+query I
+SELECT bit_count(arrow_cast(0, 'Int8'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(15, 'Int8'));
+----
+4
+
+query I
+SELECT bit_count(arrow_cast(-1, 'Int8'));
+----
+64
+
+query I
+SELECT bit_count(arrow_cast(0, 'Int16'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(255, 'Int16'));
+----
+8
+
+query I
+SELECT bit_count(arrow_cast(-1, 'Int16'));
+----
+64
+
+query I
+SELECT bit_count(arrow_cast(0, 'Int64'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(255, 'Int64'));
+----
+8
+
+query I
+SELECT bit_count(arrow_cast(-1, 'Int64'));
+----
+64
+
+# Tests with unsigned integer types
+query I
+SELECT bit_count(arrow_cast(0, 'UInt8'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(255, 'UInt8'));
+----
+8
+
+query I
+SELECT bit_count(arrow_cast(0, 'UInt16'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(65535, 'UInt16'));
+----
+16
+
+query I
+SELECT bit_count(arrow_cast(0, 'UInt32'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(4294967295, 'UInt32'));
+----
+32
+
+query I
+SELECT bit_count(arrow_cast(0, 'UInt64'));
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(18446744073709551615, 'UInt64'));
+----
+64
+
+# Tests with NULL values
+query I
+SELECT bit_count(arrow_cast(NULL, 'Int32'));
+----
+NULL
+
+query I
+SELECT bit_count(arrow_cast(NULL, 'Int8'));
+----
+NULL
+
+query I
+SELECT bit_count(arrow_cast(NULL, 'UInt64'));
+----
+NULL
+
+# Tests with edge cases
+query I
+SELECT bit_count(arrow_cast(0, 'Int32')) as zero_count;
+----
+0
+
+query I
+SELECT bit_count(arrow_cast(1, 'Int32')) as one_count;
+----
+1
+
+query I
+SELECT bit_count(arrow_cast(2, 'Int32')) as two_count;
+----
+1
+
+query I
+SELECT bit_count(arrow_cast(3, 'Int32')) as three_count;
+----
+2
+
+query I
+SELECT bit_count(arrow_cast(4, 'Int32')) as four_count;
+----
+1
+
+query I
+SELECT bit_count(arrow_cast(5, 'Int32')) as five_count;
+----
+2
+
+# Tests with large numbers
+query I
+SELECT bit_count(arrow_cast(2147483647, 'Int32'));
+----
+31
+
+query I
+SELECT bit_count(arrow_cast(-2147483648, 'Int32'));
+----
+33
+
+query I
+SELECT bit_count(arrow_cast(9223372036854775807, 'Int64'));
+----
+63
+
+query I
+SELECT bit_count(arrow_cast(-9223372036854775808, 'Int64'));
+----
+1
+
+query I
+SELECT bit_count(true);
+----
+1
+
+query I
+SELECT bit_count(false);
+----
+0
+
+query I
+SELECT bit_count(cast(null as boolean));
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/bit_get.slt b/datafusion/sqllogictest/test_files/spark/bitwise/bit_get.slt
new file mode 100644
index 0000000000000..faba0b66c4f20
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/bit_get.slt
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT bit_get(11, 0);
+## PySpark 3.5.5 Result: {'bit_get(11, 0)': 1, 'typeof(bit_get(11, 0))': 'tinyint', 'typeof(11)': 'int', 'typeof(0)': 'int'}
+query I
+SELECT bit_get(11, 0);
+----
+1
+
+## Original Query: SELECT bit_get(11, 2);
+## PySpark 3.5.5 Result: {'bit_get(11, 2)': 0, 'typeof(bit_get(11, 2))': 'tinyint', 'typeof(11)': 'int', 'typeof(2)': 'int'}
+query I
+SELECT bit_get(11, 2);
+----
+0
+
+## Test additional cases
+query I
+SELECT bit_get(11, 3);
+----
+1
+
+query I
+SELECT bit_get(255, 7);
+----
+1
+
+query I
+SELECT bit_get(255, 8);
+----
+0
+
+query I
+SELECT bit_get(0, 0);
+----
+0
+
+## Test edge cases
+statement error DataFusion error: Arrow error: Compute error: bit_get: position -1 is out of bounds. Expected pos < 64 and pos >= 0
+SELECT bit_get(11, -1);
+
+statement error DataFusion error: Arrow error: Compute error: bit_get: position 64 is out of bounds. Expected pos < 64 and pos >= 0
+SELECT bit_get(11, 64);
+
+## Test null inputs
+query I
+SELECT bit_get(NULL, 0);
+----
+NULL
+
+query I
+SELECT bit_get(NULL::int, 0);
+----
+NULL
+
+query I
+SELECT bit_get(11, NULL);
+----
+NULL
+
+query I
+SELECT bit_get(11, NULL::int);
+----
+NULL
+
+query I
+SELECT bit_get(11::tinyint, 0);
+----
+1
+
+query I
+SELECT bit_get(11::bigint, 0);
+----
+1
+
+query I
+SELECT bit_get(11, 3::bigint);
+----
+1
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/bitwise_not.slt b/datafusion/sqllogictest/test_files/spark/bitwise/bitwise_not.slt
new file mode 100644
index 0000000000000..5f51cd68ef94f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/bitwise_not.slt
@@ -0,0 +1,201 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT bitwise_not(0);
+## PySpark 3.5.5 Result: {'bitwise_not(0)': -1, 'typeof(bitwise_not(0))': 'int', 'typeof(0)': 'int'}
+
+# Basic tests with different integer types
+query I
+SELECT bitwise_not(0::int);
+----
+-1
+
+query I
+SELECT bitwise_not(1::int);
+----
+-2
+
+query I
+SELECT bitwise_not(7::int);
+----
+-8
+
+query I
+SELECT bitwise_not(15::int);
+----
+-16
+
+query I
+SELECT bitwise_not(255::int);
+----
+-256
+
+query I
+SELECT bitwise_not(1023::int);
+----
+-1024
+
+# Tests with negative numbers (two's complement)
+query I
+SELECT bitwise_not(-1::int);
+----
+0
+
+query I
+SELECT bitwise_not(-2::int);
+----
+1
+
+query I
+SELECT bitwise_not(-3::int);
+----
+2
+
+# Tests with different integer types
+query I
+SELECT bitwise_not(arrow_cast(0, 'Int8'));
+----
+-1
+
+query I
+SELECT bitwise_not(arrow_cast(15, 'Int8'));
+----
+-16
+
+query I
+SELECT bitwise_not(arrow_cast(-1, 'Int8'));
+----
+0
+
+query I
+SELECT bitwise_not(arrow_cast(0, 'Int16'));
+----
+-1
+
+query I
+SELECT bitwise_not(arrow_cast(255, 'Int16'));
+----
+-256
+
+query I
+SELECT bitwise_not(arrow_cast(-1, 'Int16'));
+----
+0
+
+query I
+SELECT bitwise_not(arrow_cast(0, 'Int32'));
+----
+-1
+
+query I
+SELECT bitwise_not(arrow_cast(255, 'Int32'));
+----
+-256
+
+query I
+SELECT bitwise_not(arrow_cast(-1, 'Int32'));
+----
+0
+
+query I
+SELECT bitwise_not(arrow_cast(0, 'Int64'));
+----
+-1
+
+query I
+SELECT bitwise_not(arrow_cast(255, 'Int64'));
+----
+-256
+
+query I
+SELECT bitwise_not(arrow_cast(-1, 'Int64'));
+----
+0
+
+# Tests with NULL values
+query I
+SELECT bitwise_not(arrow_cast(NULL, 'Int32'));
+----
+NULL
+
+query I
+SELECT bitwise_not(arrow_cast(NULL, 'Int8'));
+----
+NULL
+
+query I
+SELECT bitwise_not(arrow_cast(NULL, 'Int64'));
+----
+NULL
+
+# Tests with edge cases
+query I
+SELECT bitwise_not(arrow_cast(0, 'Int32')) as zero_not;
+----
+-1
+
+query I
+SELECT bitwise_not(arrow_cast(1, 'Int32')) as one_not;
+----
+-2
+
+query I
+SELECT bitwise_not(arrow_cast(2, 'Int32')) as two_not;
+----
+-3
+
+query I
+SELECT bitwise_not(arrow_cast(3, 'Int32')) as three_not;
+----
+-4
+
+query I
+SELECT bitwise_not(arrow_cast(4, 'Int32')) as four_not;
+----
+-5
+
+query I
+SELECT bitwise_not(arrow_cast(5, 'Int32')) as five_not;
+----
+-6
+
+# Tests with large numbers
+query I
+SELECT bitwise_not(arrow_cast(2147483647, 'Int32'));
+----
+-2147483648
+
+query I
+SELECT bitwise_not(arrow_cast(-2147483648, 'Int32'));
+----
+2147483647
+
+query I
+SELECT bitwise_not(arrow_cast(9223372036854775807, 'Int64'));
+----
+-9223372036854775808
+
+query I
+SELECT bitwise_not(arrow_cast(-9223372036854775808, 'Int64'));
+----
+9223372036854775807
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt b/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt
new file mode 100644
index 0000000000000..7cfdfe8257277
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT getbit(11, 0);
+## PySpark 3.5.5 Result: {'getbit(11, 0)': 1, 'typeof(getbit(11, 0))': 'tinyint', 'typeof(11)': 'int', 'typeof(0)': 'int'}
+query I
+SELECT getbit(11, 0);
+----
+1
+
+## Original Query: SELECT getbit(11, 2);
+## PySpark 3.5.5 Result: {'getbit(11, 2)': 0, 'typeof(getbit(11, 2))': 'tinyint', 'typeof(11)': 'int', 'typeof(2)': 'int'}
+query I
+SELECT getbit(11, 2);
+----
+0
+
+## Test additional cases
+query I
+SELECT getbit(11, 3);
+----
+1
+
+query I
+SELECT getbit(255, 7);
+----
+1
+
+query I
+SELECT getbit(255, 8);
+----
+0
+
+query I
+SELECT getbit(0, 0);
+----
+0
+
+## Test edge cases
+statement error DataFusion error: Arrow error: Compute error: bit_get: position -1 is out of bounds. Expected pos < 64 and pos >= 0
+SELECT getbit(11, -1);
+
+statement error DataFusion error: Arrow error: Compute error: bit_get: position 64 is out of bounds. Expected pos < 64 and pos >= 0
+SELECT getbit(11, 64);
+
+## Test null inputs
+query I
+SELECT getbit(NULL, 0);
+----
+NULL
+
+query I
+SELECT getbit(11, NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/shiftright.slt b/datafusion/sqllogictest/test_files/spark/bitwise/shiftright.slt
new file mode 100644
index 0000000000000..3dd43509b1769
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/shiftright.slt
@@ -0,0 +1,262 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT shiftright(4, 1);
+## PySpark 3.5.5 Result: {'shiftright(4, 1)': 2, 'typeof(shiftright(4, 1))': 'int', 'typeof(4)': 'int', 'typeof(1)': 'int'}
+
+# Basic shiftright tests
+query I
+SELECT shiftright(4::int, 1::int);
+----
+2
+
+query I
+SELECT shiftright(8::int, 2::int);
+----
+2
+
+query I
+SELECT shiftright(16::int, 3::int);
+----
+2
+
+# Different data types
+query I
+SELECT shiftright(4::bigint, 1::int);
+----
+2
+
+query I
+SELECT shiftright(8::bigint, 2::int);
+----
+2
+
+query I
+SELECT shiftright(4::int, 1::bigint);
+----
+2
+
+# Large shifts (should handle modulo correctly)
+query I
+SELECT shiftright(1::int, 32::int);
+----
+1
+
+query I
+SELECT shiftright(2::int, 33::int);
+----
+1
+
+query I
+SELECT shiftright(3::int, 64::int);
+----
+3
+
+# Negative shifts
+query I
+SELECT shiftright(4::int, -1::int);
+----
+0
+
+query I
+SELECT shiftright(8::int, -2::int);
+----
+0
+
+query I
+SELECT shiftright(16::int, -3::int);
+----
+0
+
+# Zero shifts
+query I
+SELECT shiftright(5::int, 0::int);
+----
+5
+
+query I
+SELECT shiftright(0::int, 5::int);
+----
+0
+
+# Edge cases - signed right shift preserves sign
+query I
+SELECT shiftright(-4::int, 1::int);
+----
+-2
+
+query I
+SELECT shiftright(-8::int, 2::int);
+----
+-2
+
+query I
+SELECT shiftright(-16::int, 3::int);
+----
+-2
+
+query I
+SELECT shiftright(2147483647::int, 1::int);
+----
+1073741823
+
+# Null handling
+query I
+SELECT shiftright(NULL::int, 1::int);
+----
+NULL
+
+query I
+SELECT shiftright(1::int, NULL::int);
+----
+NULL
+
+query I
+SELECT shiftright(NULL::int, NULL::int);
+----
+NULL
+
+query I
+select shiftright(3::int,-31);
+----
+1
+
+query I
+select shiftright(3::int,-32);
+----
+3
+
+# i32 + nulls
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(4, 1),
+(8, 2),
+(16, 3),
+(32, 4),
+(null, 2),
+(8, null)
+t(value, shift)
+----
+2 Int32
+2 Int32
+2 Int32
+2 Int32
+NULL Int32
+NULL Int32
+
+# big shifts
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(1, 32),
+(2, 33),
+(3, 64)
+t(value, shift)
+----
+1 Int32
+1 Int32
+3 Int32
+
+# negative shift
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(4, -1),
+(8, -2),
+(16, -3)
+t(value, shift)
+----
+0 Int32
+0 Int32
+0 Int32
+
+# For signed integers, right shift preserves sign bit
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(-4, 1),
+(-8, 2),
+(-16, 3)
+t(value, shift)
+----
+-2 Int32
+-2 Int32
+-2 Int32
+
+# i64 value
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'Int64'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'Int64'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 Int64
+2 Int64
+2 Int64
+
+# u32 value
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'UInt32'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'UInt32'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 UInt32
+2 UInt32
+2 UInt32
+
+# u64 value
+query IT
+SELECT
+	shiftright(arrow_cast(value, 'UInt64'), shift),
+	arrow_typeof(shiftright(arrow_cast(value, 'UInt64'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 UInt64
+2 UInt64
+2 UInt64
+
+# pure null handling
+query IT
+SELECT shiftright(null, 1), arrow_typeof(shiftright(null, 1));
+----
+NULL Int32
+
+query IT
+SELECT shiftright(null, null), arrow_typeof(shiftright(null, null));
+----
+NULL Int32
+
+query IT
+SELECT shiftright(1::bigint, null), arrow_typeof(shiftright(1::bigint, null));
+----
+NULL Int64
diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/shiftrightunsigned.slt b/datafusion/sqllogictest/test_files/spark/bitwise/shiftrightunsigned.slt
new file mode 100644
index 0000000000000..b9ef530b36238
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitwise/shiftrightunsigned.slt
@@ -0,0 +1,251 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT shiftrightunsigned(4, 1);
+## PySpark 3.5.5 Result: {'shiftrightunsigned(4, 1)': 2, 'typeof(shiftrightunsigned(4, 1))': 'int', 'typeof(4)': 'int', 'typeof(1)': 'int'}
+
+# Basic shiftrightunsigned tests
+query I
+SELECT shiftrightunsigned(4::int, 1::int);
+----
+2
+
+query I
+SELECT shiftrightunsigned(8::int, 2::int);
+----
+2
+
+query I
+SELECT shiftrightunsigned(16::int, 3::int);
+----
+2
+
+# Different data types
+query I
+SELECT shiftrightunsigned(4::bigint, 1::int);
+----
+2
+
+query I
+SELECT shiftrightunsigned(8::bigint, 2::int);
+----
+2
+
+query I
+SELECT shiftrightunsigned(4::int, 1::bigint);
+----
+2
+
+# Large shifts (should handle modulo correctly)
+query I
+SELECT shiftrightunsigned(1::int, 32::int);
+----
+1
+
+query I
+SELECT shiftrightunsigned(2::int, 33::int);
+----
+1
+
+query I
+SELECT shiftrightunsigned(3::int, 64::int);
+----
+3
+
+# Negative shifts
+query I
+SELECT shiftrightunsigned(4::int, -1::int);
+----
+0
+
+query I
+SELECT shiftrightunsigned(8::int, -2::int);
+----
+0
+
+query I
+SELECT shiftrightunsigned(16::int, -3::int);
+----
+0
+
+# Zero shifts
+query I
+SELECT shiftrightunsigned(5::int, 0::int);
+----
+5
+
+query I
+SELECT shiftrightunsigned(0::int, 5::int);
+----
+0
+
+# Edge cases - unsigned right shift treats negative values as large positive
+query I
+SELECT shiftrightunsigned(-4::int, 1::int);
+----
+2147483646
+
+query I
+SELECT shiftrightunsigned(-8::int, 2::int);
+----
+1073741822
+
+query I
+SELECT shiftrightunsigned(-16::int, 3::int);
+----
+536870910
+
+query I
+SELECT shiftrightunsigned(2147483647::int, 1::int);
+----
+1073741823
+
+
+# Null handling
+query I
+SELECT shiftrightunsigned(NULL::int, 1::int);
+----
+NULL
+
+query I
+SELECT shiftrightunsigned(1::int, NULL::int);
+----
+NULL
+
+query I
+SELECT shiftrightunsigned(NULL::int, NULL::int);
+----
+NULL
+
+query I
+select shiftrightunsigned(3::int,-31);
+----
+1
+
+query I
+select shiftrightunsigned(3::int,-32);
+----
+3
+
+# i32 + nulls
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(4, 1),
+(8, 2),
+(16, 3),
+(32, 4),
+(null, 2),
+(8, null)
+t(value, shift)
+----
+2 Int32
+2 Int32
+2 Int32
+2 Int32
+NULL Int32
+NULL Int32
+
+# negative shift
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(4, -1),
+(8, -2),
+(16, -3)
+t(value, shift)
+----
+0 Int32
+0 Int32
+0 Int32
+
+# Negative values are treated as large positive values
+# -4 as u32 = 4294967292, 4294967292 >>> 1 = 2147483646
+# -8 as u32 = 4294967288, 4294967288 >>> 2 = 1073741822
+# -16 as u32 = 4294967280, 4294967280 >>> 3 = 536870910
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(-4, 1),
+(-8, 2),
+(-16, 3)
+t(value, shift)
+----
+2147483646 Int32
+1073741822 Int32
+536870910 Int32
+
+# i64 value
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'Int64'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'Int64'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 Int64
+2 Int64
+2 Int64
+
+# u32 value
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'UInt32'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'UInt32'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 UInt32
+2 UInt32
+2 UInt32
+
+# u64 value
+query IT
+SELECT
+	shiftrightunsigned(arrow_cast(value, 'UInt64'), shift),
+	arrow_typeof(shiftrightunsigned(arrow_cast(value, 'UInt64'), shift))
+FROM VALUES (4, 1), (8, 2), (16, 3) t(value, shift)
+----
+2 UInt64
+2 UInt64
+2 UInt64
+
+# pure null handling
+query IT
+SELECT shiftrightunsigned(null, 1), arrow_typeof(shiftrightunsigned(null, 1));
+----
+NULL Int32
+
+query IT
+SELECT shiftrightunsigned(null, null), arrow_typeof(shiftrightunsigned(null, null));
+----
+NULL Int32
+
+query IT
+SELECT shiftrightunsigned(1::bigint, null), arrow_typeof(shiftrightunsigned(1::bigint, null));
+----
+NULL Int64
diff --git a/datafusion/sqllogictest/test_files/spark/collection/concat.slt b/datafusion/sqllogictest/test_files/spark/collection/concat.slt
new file mode 100644
index 0000000000000..911975d9c72d9
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/collection/concat.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT concat('Spark', 'SQL');
+## PySpark 3.5.5 Result: {'concat(Spark, SQL)': 'SparkSQL', 'typeof(concat(Spark, SQL))': 'string', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT concat('Spark'::string, 'SQL'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/collection/reverse.slt b/datafusion/sqllogictest/test_files/spark/collection/reverse.slt
new file mode 100644
index 0000000000000..f49c7c2a8c2b0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/collection/reverse.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT reverse('Spark SQL');
+## PySpark 3.5.5 Result: {'reverse(Spark SQL)': 'LQS krapS', 'typeof(reverse(Spark SQL))': 'string', 'typeof(Spark SQL)': 'string'}
+#query
+#SELECT reverse('Spark SQL'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/collection/size.slt b/datafusion/sqllogictest/test_files/spark/collection/size.slt
new file mode 100644
index 0000000000000..106760eebfe42
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/collection/size.slt
@@ -0,0 +1,131 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT size(array(1, 2, 3));
+## PySpark 3.5.5 Result: {'size(array(1, 2, 3))': 3}
+
+# Basic array
+query I
+SELECT size(make_array(1, 2, 3));
+----
+3
+
+# Nested array
+query I
+SELECT size(make_array(make_array(1, 2), make_array(3, 4, 5)));
+----
+2
+
+# LargeList tests
+query I
+SELECT size(arrow_cast(make_array(1, 2, 3), 'LargeList(Int32)'));
+----
+3
+
+query I
+SELECT size(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'));
+----
+5
+
+# FixedSizeList tests
+query I
+SELECT size(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int32)'));
+----
+3
+
+query I
+SELECT size(arrow_cast(make_array(1, 2, 3, 4), 'FixedSizeList(4, Int32)'));
+----
+4
+
+# Map size tests
+query I
+SELECT size(map(make_array('a', 'b', 'c'), make_array(1, 2, 3)));
+----
+3
+
+query I
+SELECT size(map(make_array('a'), make_array(1)));
+----
+1
+
+# Empty array
+query I
+SELECT size(arrow_cast(make_array(), 'List(Int32)'));
+----
+0
+
+
+# Array with NULL elements (size counts elements including NULLs)
+query I
+SELECT size(make_array(1, NULL, 3));
+----
+3
+
+# NULL array returns -1 (Spark behavior)
+query I
+SELECT size(NULL::int[]);
+----
+-1
+
+
+# Empty map
+query I
+SELECT size(map(arrow_cast(make_array(), 'List(Utf8)'), arrow_cast(make_array(), 'List(Int32)')));
+----
+0
+
+# String array
+query I
+SELECT size(make_array('hello', 'world'));
+----
+2
+
+# Boolean array
+query I
+SELECT size(make_array(true, false, true));
+----
+3
+
+# Float array
+query I
+SELECT size(make_array(1.5, 2.5, 3.5, 4.5));
+----
+4
+
+# Array column tests (with NULL values)
+query I
+SELECT size(column1) FROM VALUES ([1]), ([1,2]), ([]), (NULL);
+----
+1
+2
+0
+-1
+
+# Map column tests (with NULL values)
+query I
+SELECT size(column1) FROM VALUES (map(['a'], [1])), (map(['a','b'], [1,2])), (NULL);
+----
+1
+2
+-1
diff --git a/datafusion/sqllogictest/test_files/spark/conditional/coalesce.slt b/datafusion/sqllogictest/test_files/spark/conditional/coalesce.slt
new file mode 100644
index 0000000000000..3af8110ad6f38
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/conditional/coalesce.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT coalesce(NULL, 1, NULL);
+## PySpark 3.5.5 Result: {'coalesce(NULL, 1, NULL)': 1, 'typeof(coalesce(NULL, 1, NULL))': 'int', 'typeof(NULL)': 'void', 'typeof(1)': 'int'}
+#query
+#SELECT coalesce(NULL::void, 1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/conditional/if.slt b/datafusion/sqllogictest/test_files/spark/conditional/if.slt
new file mode 100644
index 0000000000000..b4380e065b987
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/conditional/if.slt
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Basic IF function tests
+
+# Test basic true condition
+query T
+SELECT if(true, 'yes', 'no');
+----
+yes
+
+# Test basic false condition
+query T
+SELECT if(false, 'yes', 'no');
+----
+no
+
+# Test with comparison operators
+query T
+SELECT if(1 < 2, 'a', 'b');
+----
+a
+
+query T
+SELECT if(1 > 2, 'a', 'b');
+----
+b
+
+
+## Numeric type tests
+
+# Test with integers
+query I
+SELECT if(true, 10, 20);
+----
+10
+
+query I
+SELECT if(false, 10, 20);
+----
+20
+
+# Test with different integer types
+query I
+SELECT if(true, 100, 200);
+----
+100
+
+## Float type tests
+
+# Test with floating point numbers
+query R
+SELECT if(true, 1.5, 2.5);
+----
+1.5
+
+query R
+SELECT if(false, 1.5, 2.5);
+----
+2.5
+
+## String type tests
+
+# Test with different string values
+query T
+SELECT if(true, 'hello', 'world');
+----
+hello
+
+query T
+SELECT if(false, 'hello', 'world');
+----
+world
+
+## NULL handling tests
+
+# Test with NULL condition
+query T
+SELECT if(NULL, 'yes', 'no');
+----
+no
+
+query T
+SELECT if(NOT NULL, 'yes', 'no');
+----
+no
+
+# Test with NULL true value
+query T
+SELECT if(true, NULL, 'no');
+----
+NULL
+
+# Test with NULL false value
+query T
+SELECT if(false, 'yes', NULL);
+----
+NULL
+
+# Test with all NULL
+query ?
+SELECT if(true, NULL, NULL);
+----
+NULL
+
+## Type coercion tests
+
+# Test integer to float coercion
+query R
+SELECT if(true, 10, 20.5);
+----
+10
+
+query R
+SELECT if(false, 10, 20.5);
+----
+20.5
+
+# Test float to integer coercion
+query R
+SELECT if(true, 10.5, 20);
+----
+10.5
+
+query R
+SELECT if(false, 10.5, 20);
+----
+20
+
+statement error Int64 is not a boolean or null
+SELECT if(1, 10.5, 20);
+
+
+statement error Utf8 is not a boolean or null
+SELECT if('x', 10.5, 20);
+
+query II
+SELECT v, IF(v < 0, 10/0, 1) FROM (VALUES (1), (2)) t(v)
+----
+1 1
+2 1
+
+query I
+SELECT IF(true, 1 / 1, 1 / 0);
+----
+1
diff --git a/datafusion/sqllogictest/test_files/spark/conditional/nullif.slt b/datafusion/sqllogictest/test_files/spark/conditional/nullif.slt
new file mode 100644
index 0000000000000..1a4c80e3baaeb
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/conditional/nullif.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT nullif(2, 2);
+## PySpark 3.5.5 Result: {'nullif(2, 2)': None, 'typeof(nullif(2, 2))': 'int', 'typeof(2)': 'int'}
+#query
+#SELECT nullif(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/conditional/nvl2.slt b/datafusion/sqllogictest/test_files/spark/conditional/nvl2.slt
new file mode 100644
index 0000000000000..c5ea2f8f1f360
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/conditional/nvl2.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT nvl2(NULL, 2, 1);
+## PySpark 3.5.5 Result: {'nvl2(NULL, 2, 1)': 1, 'typeof(nvl2(NULL, 2, 1))': 'int', 'typeof(NULL)': 'void', 'typeof(2)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT nvl2(NULL::void, 2::int, 1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/csv/schema_of_csv.slt b/datafusion/sqllogictest/test_files/spark/csv/schema_of_csv.slt
new file mode 100644
index 0000000000000..eaa31c9d5c9cb
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/csv/schema_of_csv.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT schema_of_csv('1,abc');
+## PySpark 3.5.5 Result: {'schema_of_csv(1,abc)': 'STRUCT<_c0: INT, _c1: STRING>', 'typeof(schema_of_csv(1,abc))': 'string', 'typeof(1,abc)': 'string'}
+#query
+#SELECT schema_of_csv('1,abc'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/add_months.slt b/datafusion/sqllogictest/test_files/spark/datetime/add_months.slt
new file mode 100644
index 0000000000000..55a493ffefe26
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/add_months.slt
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query D
+SELECT add_months('2016-07-30'::date, 1::int);
+----
+2016-08-30
+
+query D
+SELECT add_months('2016-07-30'::date, 0::int);
+----
+2016-07-30
+
+query D
+SELECT add_months('2016-07-30'::date, 10000::int);
+----
+2849-11-30
+
+# Test integer overflow
+# TODO: Enable with next arrow upgrade (>=58.0.0)
+# query D
+# SELECT add_months('2016-07-30'::date, 2147483647::int);
+# ----
+# NULL
+
+query D
+SELECT add_months('2016-07-30'::date, -5::int);
+----
+2016-02-29
+
+# Test with NULL values
+query D
+SELECT add_months(NULL::date, 1::int);
+----
+NULL
+
+query D
+SELECT add_months('2016-07-30'::date, NULL::int);
+----
+NULL
+
+query D
+SELECT add_months(NULL::date, NULL::int);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/convert_timezone.slt b/datafusion/sqllogictest/test_files/spark/datetime/convert_timezone.slt
new file mode 100644
index 0000000000000..54c9e616cf05e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/convert_timezone.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT convert_timezone('Europe/Brussels', 'America/Los_Angeles', timestamp_ntz'2021-12-06 00:00:00');
+## PySpark 3.5.5 Result: {"convert_timezone(Europe/Brussels, America/Los_Angeles, TIMESTAMP_NTZ '2021-12-06 00:00:00')": datetime.datetime(2021, 12, 5, 15, 0), "typeof(convert_timezone(Europe/Brussels, America/Los_Angeles, TIMESTAMP_NTZ '2021-12-06 00:00:00'))": 'timestamp_ntz', 'typeof(Europe/Brussels)': 'string', 'typeof(America/Los_Angeles)': 'string', "typeof(TIMESTAMP_NTZ '2021-12-06 00:00:00')": 'timestamp_ntz'}
+#query
+#SELECT convert_timezone('Europe/Brussels'::string, 'America/Los_Angeles'::string, TIMESTAMP_NTZ '2021-12-06 00:00:00'::timestamp_ntz);
+
+## Original Query: SELECT convert_timezone('Europe/Brussels', timestamp_ntz'2021-12-05 15:00:00');
+## PySpark 3.5.5 Result: {"convert_timezone(current_timezone(), Europe/Brussels, TIMESTAMP_NTZ '2021-12-05 15:00:00')": datetime.datetime(2021, 12, 6, 0, 0), "typeof(convert_timezone(current_timezone(), Europe/Brussels, TIMESTAMP_NTZ '2021-12-05 15:00:00'))": 'timestamp_ntz', 'typeof(Europe/Brussels)': 'string', "typeof(TIMESTAMP_NTZ '2021-12-05 15:00:00')": 'timestamp_ntz'}
+#query
+#SELECT convert_timezone('Europe/Brussels'::string, TIMESTAMP_NTZ '2021-12-05 15:00:00'::timestamp_ntz);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/curdate.slt b/datafusion/sqllogictest/test_files/spark/datetime/curdate.slt
new file mode 100644
index 0000000000000..21ec4c0305aa0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/curdate.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT curdate();
+## PySpark 3.5.5 Result: {'current_date()': datetime.date(2025, 6, 14), 'typeof(current_date())': 'date'}
+#query
+#SELECT curdate();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/current_date.slt b/datafusion/sqllogictest/test_files/spark/datetime/current_date.slt
new file mode 100644
index 0000000000000..cd187901777f4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/current_date.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_date();
+## PySpark 3.5.5 Result: {'current_date()': datetime.date(2025, 6, 14), 'typeof(current_date())': 'date'}
+#query
+#SELECT current_date();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/current_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/current_timestamp.slt
new file mode 100644
index 0000000000000..f3e4f5856aca6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/current_timestamp.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_timestamp();
+## PySpark 3.5.5 Result: {'current_timestamp()': datetime.datetime(2025, 6, 14, 23, 57, 38, 948981), 'typeof(current_timestamp())': 'timestamp'}
+#query
+#SELECT current_timestamp();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/current_timezone.slt b/datafusion/sqllogictest/test_files/spark/datetime/current_timezone.slt
new file mode 100644
index 0000000000000..db3d8d40742d7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/current_timezone.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_timezone();
+## PySpark 3.5.5 Result: {'current_timezone()': 'America/Los_Angeles', 'typeof(current_timezone())': 'string'}
+#query
+#SELECT current_timezone();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_add.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_add.slt
new file mode 100644
index 0000000000000..cb407a6453696
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_add.slt
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT date_add('2016-07-30', 1);
+## PySpark 3.5.5 Result: {'date_add(2016-07-30, 1)': datetime.date(2016, 7, 31), 'typeof(date_add(2016-07-30, 1))': 'date', 'typeof(2016-07-30)': 'string', 'typeof(1)': 'int'}
+
+# Basic date_add tests
+query D
+SELECT date_add('2016-07-30'::date, 1::int);
+----
+2016-07-31
+
+query D
+SELECT date_add('2016-07-30'::date, arrow_cast(1, 'Int8'));
+----
+2016-07-31
+
+query D
+SELECT date_add('2016-07-30'::date, arrow_cast(1, 'Int8'));
+----
+2016-07-31
+
+query D
+SELECT date_add('2016-07-30'::date, 0::int);
+----
+2016-07-30
+
+query I
+SELECT date_add('2016-07-30'::date, 2147483647::int)::int;
+----
+-2147466637
+
+query I
+SELECT date_add('1969-01-01'::date, 2147483647::int)::int;
+----
+2147483282
+
+query D
+SELECT date_add('2016-07-30'::date, 100000::int);
+----
+2290-05-15
+
+# Test with negative day values (should subtract days)
+query D
+SELECT date_add('2016-07-30'::date, -5::int);
+----
+2016-07-25
+
+# Test with NULL values
+query D
+SELECT date_add(NULL::date, 1::int);
+----
+NULL
+
+query D
+SELECT date_add('2016-07-30'::date, NULL::int);
+----
+NULL
+
+query D
+SELECT date_add(NULL::date, NULL::int);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt
new file mode 100644
index 0000000000000..b0952d6a43510
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# date input
+query I
+SELECT date_diff('2009-07-30'::date, '2009-07-31'::date);
+----
+-1
+
+query I
+SELECT date_diff('2009-07-31'::date, '2009-07-30'::date);
+----
+1
+
+query I
+SELECT date_diff('2009-07-31'::string, '2009-07-30'::date);
+----
+1
+
+query I
+SELECT date_diff('2009-07-31'::timestamp, '2009-07-30'::date);
+----
+1
+
+# Date64 input
+query I
+SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), arrow_cast('2009-07-30', 'Date64'));
+----
+1
+
+query I
+SELECT date_diff(arrow_cast('2009-07-30', 'Date64'), arrow_cast('2009-07-31', 'Date64'));
+----
+-1
+
+# Mixed Date32 and Date64 input
+query I
+SELECT date_diff('2009-07-31'::date, arrow_cast('2009-07-30', 'Date64'));
+----
+1
+
+query I
+SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), '2009-07-30'::date);
+----
+1
+
+
+# Same date returns 0
+query I
+SELECT date_diff('2009-07-30'::date, '2009-07-30'::date);
+----
+0
+
+# Large difference
+query I
+SELECT date_diff('2020-01-01'::date, '1970-01-01'::date);
+----
+18262
+
+# timestamp input
+query I
+SELECT date_diff('2009-07-30 12:34:56'::timestamp, '2009-07-31 23:45:01'::timestamp);
+----
+-1
+
+query I
+SELECT date_diff('2009-07-31 23:45:01'::timestamp, '2009-07-30 12:34:56'::timestamp);
+----
+1
+
+query I
+SELECT date_diff('2009-07-31 23:45:01'::string, '2009-07-30 12:34:56'::timestamp);
+----
+1
+
+# string input
+query I
+SELECT date_diff('2009-07-30', '2009-07-31');
+----
+-1
+
+query I
+SELECT date_diff('2009-07-31', '2009-07-30');
+----
+1
+
+# NULL handling
+query I
+SELECT date_diff(NULL::date, '2009-07-30'::date);
+----
+NULL
+
+query I
+SELECT date_diff('2009-07-31'::date, NULL::date);
+----
+NULL
+
+query I
+SELECT date_diff(NULL::date, NULL::date);
+----
+NULL
+
+query I
+SELECT date_diff(column1, column2)
+FROM VALUES
+('2009-07-30'::date, '2009-07-31'::date),
+('2009-07-31'::date, '2009-07-30'::date),
+(NULL::date, '2009-07-30'::date),
+('2009-07-31'::date, NULL::date),
+(NULL::date, NULL::date);
+----
+-1
+1
+NULL
+NULL
+NULL
+
+
+# Alias datediff
+query I
+SELECT datediff('2009-07-30'::date, '2009-07-31'::date);
+----
+-1
+
+query I
+SELECT datediff(column1, column2)
+FROM VALUES
+('2009-07-30'::date, '2009-07-31'::date),
+('2009-07-31'::date, '2009-07-30'::date),
+(NULL::date, '2009-07-30'::date),
+('2009-07-31'::date, NULL::date),
+(NULL::date, NULL::date);
+----
+-1
+1
+NULL
+NULL
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_format.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_format.slt
new file mode 100644
index 0000000000000..1242518dee3f5
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_format.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT date_format('2016-04-08', 'y');
+## PySpark 3.5.5 Result: {'date_format(2016-04-08, y)': '2016', 'typeof(date_format(2016-04-08, y))': 'string', 'typeof(2016-04-08)': 'string', 'typeof(y)': 'string'}
+#query
+#SELECT date_format('2016-04-08'::string, 'y'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_part.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_part.slt
new file mode 100644
index 0000000000000..48216bd551692
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_part.slt
@@ -0,0 +1,276 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# YEAR
+query I
+SELECT date_part('YEAR'::string, '2000-01-01'::date);
+----
+2000
+
+query I
+SELECT date_part('YEARS'::string, '2000-01-01'::date);
+----
+2000
+
+query I
+SELECT date_part('Y'::string, '2000-01-01'::date);
+----
+2000
+
+query I
+SELECT date_part('YR'::string, '2000-01-01'::date);
+----
+2000
+
+query I
+SELECT date_part('YRS'::string, '2000-01-01'::date);
+----
+2000
+
+# YEAROFWEEK
+query I
+SELECT date_part('YEAROFWEEK'::string, '2000-01-01'::date);
+----
+1999
+
+# QUARTER
+query I
+SELECT date_part('QUARTER'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('QTR'::string, '2000-01-01'::date);
+----
+1
+
+# MONTH
+query I
+SELECT date_part('MONTH'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('MON'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('MONS'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('MONTHS'::string, '2000-01-01'::date);
+----
+1
+
+# WEEK
+query I
+SELECT date_part('WEEK'::string, '2000-01-01'::date);
+----
+52
+
+query I
+SELECT date_part('WEEKS'::string, '2000-01-01'::date);
+----
+52
+
+query I
+SELECT date_part('W'::string, '2000-01-01'::date);
+----
+52
+
+# DAYS
+query I
+SELECT date_part('DAY'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('D'::string, '2000-01-01'::date);
+----
+1
+
+query I
+SELECT date_part('DAYS'::string, '2000-01-01'::date);
+----
+1
+
+# DAYOFWEEK
+query I
+SELECT date_part('DAYOFWEEK'::string, '2000-01-01'::date);
+----
+7
+
+query I
+SELECT date_part('DOW'::string, '2000-01-01'::date);
+----
+7
+
+# DAYOFWEEK_ISO
+query I
+SELECT date_part('DAYOFWEEK_ISO'::string, '2000-01-01'::date);
+----
+6
+
+query I
+SELECT date_part('DOW_ISO'::string, '2000-01-01'::date);
+----
+6
+
+# DOY
+query I
+SELECT date_part('DOY'::string, '2000-01-01'::date);
+----
+1
+
+# HOUR
+query I
+SELECT date_part('HOUR'::string, '2000-01-01 12:30:45'::timestamp);
+----
+12
+
+query I
+SELECT date_part('H'::string, '2000-01-01 12:30:45'::timestamp);
+----
+12
+
+query I
+SELECT date_part('HOURS'::string, '2000-01-01 12:30:45'::timestamp);
+----
+12
+
+query I
+SELECT date_part('HR'::string, '2000-01-01 12:30:45'::timestamp);
+----
+12
+
+query I
+SELECT date_part('HRS'::string, '2000-01-01 12:30:45'::timestamp);
+----
+12
+
+# MINUTE
+query I
+SELECT date_part('MINUTE'::string, '2000-01-01 12:30:45'::timestamp);
+----
+30
+
+query I
+SELECT date_part('M'::string, '2000-01-01 12:30:45'::timestamp);
+----
+30
+
+query I
+SELECT date_part('MIN'::string, '2000-01-01 12:30:45'::timestamp);
+----
+30
+
+query I
+SELECT date_part('MINS'::string, '2000-01-01 12:30:45'::timestamp);
+----
+30
+
+query I
+SELECT date_part('MINUTES'::string, '2000-01-01 12:30:45'::timestamp);
+----
+30
+
+# SECOND
+query I
+SELECT date_part('SECOND'::string, '2000-01-01 12:30:45'::timestamp);
+----
+45
+
+query I
+SELECT date_part('S'::string, '2000-01-01 12:30:45'::timestamp);
+----
+45
+
+query I
+SELECT date_part('SEC'::string, '2000-01-01 12:30:45'::timestamp);
+----
+45
+
+query I
+SELECT date_part('SECONDS'::string, '2000-01-01 12:30:45'::timestamp);
+----
+45
+
+query I
+SELECT date_part('SECS'::string, '2000-01-01 12:30:45'::timestamp);
+----
+45
+
+# NULL input
+query I
+SELECT date_part('year'::string, NULL::timestamp);
+----
+NULL
+
+query error Internal error: First argument of `DATE_PART` must be non-null scalar Utf8
+SELECT date_part(NULL::string, '2000-01-01'::date);
+
+# Invalid part
+query error DataFusion error: Execution error: Date part 'test' not supported
+SELECT date_part('test'::string, '2000-01-01'::date);
+
+query I
+SELECT date_part('year', column1)
+FROM VALUES
+('2022-03-15'::date),
+('1999-12-31'::date),
+('2000-01-01'::date),
+(NULL::date);
+----
+2022
+1999
+2000
+NULL
+
+query I
+SELECT date_part('minutes', column1)
+FROM VALUES
+('2022-03-15 12:30:45'::timestamp),
+('1999-12-31 12:32:45'::timestamp),
+('2000-01-01 12:00:45'::timestamp),
+(NULL::timestamp);
+----
+30
+32
+0
+NULL
+
+# alias datepart
+query I
+SELECT datepart('YEAR'::string, '2000-01-01'::date);
+----
+2000
+
+query I
+SELECT datepart('year', column1)
+FROM VALUES
+('2022-03-15'::date),
+('1999-12-31'::date),
+('2000-01-01'::date),
+(NULL::date);
+----
+2022
+1999
+2000
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_sub.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_sub.slt
new file mode 100644
index 0000000000000..bf36ebd867d19
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_sub.slt
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT date_sub('2016-07-30', 1);
+## PySpark 3.5.5 Result: {'date_sub(2016-07-30, 1)': datetime.date(2016, 7, 29), 'typeof(date_sub(2016-07-30, 1))': 'date', 'typeof(2016-07-30)': 'string', 'typeof(1)': 'int'}
+
+# Basic date_sub tests
+query D
+SELECT date_sub('2016-07-30'::date, 1::int);
+----
+2016-07-29
+
+query D
+SELECT date_sub('2016-07-30'::date, arrow_cast(1, 'Int8'));
+----
+2016-07-29
+
+query D
+SELECT date_sub('2016-07-30'::date, arrow_cast(1, 'Int16'));
+----
+2016-07-29
+
+query D
+SELECT date_sub('2016-07-30'::date, 0::int);
+----
+2016-07-30
+
+query I
+SELECT date_sub('1969-01-01'::date, 2147483647::int)::int;
+----
+2147483284
+
+query D
+SELECT date_sub('2016-07-30'::date, 100000::int);
+----
+1742-10-15
+
+# Test with negative day values (should add days)
+query D
+SELECT date_sub('2016-07-30'::date, -1::int);
+----
+2016-07-31
+
+query D
+SELECT date_sub('2016-07-30'::date, -5::int);
+----
+2016-08-04
+
+# Test with NULL values
+query D
+SELECT date_sub(NULL::date, 1::int);
+----
+NULL
+
+query D
+SELECT date_sub('2016-07-30'::date, NULL::int);
+----
+NULL
+
+query D
+SELECT date_sub(NULL::date, NULL::int);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_trunc.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_trunc.slt
new file mode 100644
index 0000000000000..7fc1583bb9310
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/date_trunc.slt
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# YEAR - truncate to first date of year, time zeroed
+query P
+SELECT date_trunc('YEAR', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-01-01T00:00:00
+
+query P
+SELECT date_trunc('YYYY', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-01-01T00:00:00
+
+query P
+SELECT date_trunc('YY', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-01-01T00:00:00
+
+# QUARTER - truncate to first date of quarter, time zeroed
+query P
+SELECT date_trunc('QUARTER', '2015-05-05T09:32:05.123456'::timestamp);
+----
+2015-04-01T00:00:00
+
+# MONTH - truncate to first date of month, time zeroed
+query P
+SELECT date_trunc('MONTH', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-01T00:00:00
+
+query P
+SELECT date_trunc('MM', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-01T00:00:00
+
+query P
+SELECT date_trunc('MON', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-01T00:00:00
+
+# WEEK - truncate to Monday of the week, time zeroed
+query P
+SELECT date_trunc('WEEK', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-02T00:00:00
+
+# DAY - zero out time part
+query P
+SELECT date_trunc('DAY', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T00:00:00
+
+query P
+SELECT date_trunc('DD', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T00:00:00
+
+# HOUR - zero out minute and second with fraction
+query P
+SELECT date_trunc('HOUR', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T09:00:00
+
+# MINUTE - zero out second with fraction
+query P
+SELECT date_trunc('MINUTE', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T09:32:00
+
+# SECOND - zero out fraction
+query P
+SELECT date_trunc('SECOND', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T09:32:05
+
+# MILLISECOND - zero out microseconds
+query P
+SELECT date_trunc('MILLISECOND', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T09:32:05.123
+
+# MICROSECOND - everything remains
+query P
+SELECT date_trunc('MICROSECOND', '2015-03-05T09:32:05.123456'::timestamp);
+----
+2015-03-05T09:32:05.123456
+
+query P
+SELECT date_trunc('YEAR', column1)
+FROM VALUES
+('2015-03-05T09:32:05.123456'::timestamp),
+('2020-11-15T22:45:30.654321'::timestamp),
+('1999-07-20T14:20:10.000001'::timestamp),
+(NULL::timestamp);
+----
+2015-01-01T00:00:00
+2020-01-01T00:00:00
+1999-01-01T00:00:00
+NULL
+
+# String input
+query P
+SELECT date_trunc('YEAR', '2015-03-05T09:32:05.123456');
+----
+2015-01-01T00:00:00
+
+# Null handling
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed\ncaused by\nError during planning: First argument of `DATE_TRUNC` must be non-null scalar Utf8
+SELECT date_trunc(NULL, '2015-03-05T09:32:05.123456');
+
+query P
+SELECT date_trunc('YEAR', NULL::timestamp);
+----
+NULL
+
+# incorrect format
+query error DataFusion error: Execution error: Unsupported date_trunc granularity: 'test'. Supported values are: microsecond, millisecond, second, minute, hour, day, week, month, quarter, year
+SELECT date_trunc('test', '2015-03-05T09:32:05.123456');
+
+# Timezone handling - Spark-compatible behavior
+# Spark converts timestamps to session timezone before truncating for coarse granularities
+
+query P
+SELECT date_trunc('DAY', arrow_cast(timestamp '2024-07-15T03:30:00', 'Timestamp(Microsecond, Some("UTC"))'));
+----
+2024-07-15T00:00:00Z
+
+query P
+SELECT date_trunc('DAY', arrow_cast(timestamp '2024-07-15T03:30:00', 'Timestamp(Microsecond, None)'));
+----
+2024-07-15T00:00:00
+
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+# This timestamp is 03:30 UTC = 23:30 EDT (previous day) on July 14
+# With session timezone, truncation happens in America/New_York timezone
+query P
+SELECT date_trunc('DAY', arrow_cast(timestamp '2024-07-15T03:30:00', 'Timestamp(Microsecond, Some("UTC"))'));
+----
+2024-07-14T00:00:00Z
+
+query P
+SELECT date_trunc('DAY', arrow_cast(timestamp '2024-07-15T03:30:00', 'Timestamp(Microsecond, None)'));
+----
+2024-07-15T00:00:00
+
+statement ok
+RESET datafusion.execution.time_zone;
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/dateadd.slt b/datafusion/sqllogictest/test_files/spark/datetime/dateadd.slt
new file mode 100644
index 0000000000000..c369989616f6c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/dateadd.slt
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT dateadd('2016-07-30', 1);
+## PySpark 3.5.5 Result: {'date_add(2016-07-30, 1)': datetime.date(2016, 7, 31), 'typeof(date_add(2016-07-30, 1))': 'date', 'typeof(2016-07-30)': 'string', 'typeof(1)': 'int'}
+
+# Basic dateadd tests (alias for date_add)
+query D
+SELECT dateadd('2016-07-30'::date, 1::int);
+----
+2016-07-31
+
+query D
+SELECT dateadd('2016-07-30'::date, 0::int);
+----
+2016-07-30
+
+# Test with negative day values (should subtract days)
+
+query D
+SELECT dateadd('2016-07-30'::date, -5::int);
+----
+2016-07-25
+
+# Test with NULL values
+query D
+SELECT dateadd(NULL::date, 1::int);
+----
+NULL
+
+query D
+SELECT dateadd('2016-07-30'::date, NULL::int);
+----
+NULL
+
+query D
+SELECT dateadd(NULL::date, NULL::int);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/day.slt b/datafusion/sqllogictest/test_files/spark/datetime/day.slt
new file mode 100644
index 0000000000000..35b73d67f5fd1
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/day.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT day('2009-07-30');
+## PySpark 3.5.5 Result: {'day(2009-07-30)': 30, 'typeof(day(2009-07-30))': 'int', 'typeof(2009-07-30)': 'string'}
+#query
+#SELECT day('2009-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/dayofmonth.slt b/datafusion/sqllogictest/test_files/spark/datetime/dayofmonth.slt
new file mode 100644
index 0000000000000..4e4e9ff4a23b3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/dayofmonth.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT dayofmonth('2009-07-30');
+## PySpark 3.5.5 Result: {'dayofmonth(2009-07-30)': 30, 'typeof(dayofmonth(2009-07-30))': 'int', 'typeof(2009-07-30)': 'string'}
+#query
+#SELECT dayofmonth('2009-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/dayofweek.slt b/datafusion/sqllogictest/test_files/spark/datetime/dayofweek.slt
new file mode 100644
index 0000000000000..cc885818f62ff
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/dayofweek.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT dayofweek('2009-07-30');
+## PySpark 3.5.5 Result: {'dayofweek(2009-07-30)': 5, 'typeof(dayofweek(2009-07-30))': 'int', 'typeof(2009-07-30)': 'string'}
+#query
+#SELECT dayofweek('2009-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/dayofyear.slt b/datafusion/sqllogictest/test_files/spark/datetime/dayofyear.slt
new file mode 100644
index 0000000000000..7ffab98dac84a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/dayofyear.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT dayofyear('2016-04-09');
+## PySpark 3.5.5 Result: {'dayofyear(2016-04-09)': 100, 'typeof(dayofyear(2016-04-09))': 'int', 'typeof(2016-04-09)': 'string'}
+#query
+#SELECT dayofyear('2016-04-09'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/from_utc_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/from_utc_timestamp.slt
new file mode 100644
index 0000000000000..5a39bda0a651b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/from_utc_timestamp.slt
@@ -0,0 +1,156 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# String inputs
+query P
+SELECT from_utc_timestamp('2016-08-31'::string, 'UTC'::string);
+----
+2016-08-31T00:00:00
+
+query P
+SELECT from_utc_timestamp('2016-08-31'::string, 'Asia/Seoul'::string);
+----
+2016-08-31T09:00:00
+
+query P
+SELECT from_utc_timestamp('2016-08-31'::string, 'America/New_York'::string);
+----
+2016-08-30T20:00:00
+
+# String inputs with offsets
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'UTC'::string);
+----
+2018-03-13T04:18:23
+
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'Asia/Seoul'::string);
+----
+2018-03-13T13:18:23
+
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'America/New_York'::string);
+----
+2018-03-13T00:18:23
+
+# Timestamp inputs
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string);
+----
+2018-03-13T04:18:23
+
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string);
+----
+2018-03-13T13:18:23
+
+query P
+SELECT from_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string);
+----
+2018-03-13T00:18:23
+
+# Null inputs
+query P
+SELECT from_utc_timestamp(NULL::string, 'Asia/Seoul'::string);
+----
+NULL
+
+query P
+SELECT from_utc_timestamp(NULL::timestamp, 'Asia/Seoul'::string);
+----
+NULL
+
+query P
+SELECT from_utc_timestamp('2016-08-31'::string, NULL::string);
+----
+NULL
+
+query P
+SELECT from_utc_timestamp(column1, column2)
+FROM VALUES
+('2016-08-31'::string, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::string, 'Asia/Seoul'::string),
+('2016-08-31'::string, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::string, 'UTC'::string),
+('2016-08-31'::string, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::string, 'America/New_York'::string),
+(NULL::string, 'Asia/Seoul'::string),
+('2016-08-31'::string, NULL::string);
+----
+2016-08-31T09:00:00
+2018-03-13T13:18:23
+2016-08-31T00:00:00
+2018-03-13T04:18:23
+2016-08-30T20:00:00
+2018-03-13T00:18:23
+NULL
+NULL
+
+query P
+SELECT from_utc_timestamp(column1, column2)
+FROM VALUES
+('2016-08-31'::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string),
+('2016-08-31'::timestamp, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string),
+('2016-08-31'::timestamp, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string),
+(NULL::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+00:00'::timestamp, NULL::string);
+----
+2016-08-31T09:00:00
+2018-03-13T13:18:23
+2016-08-31T00:00:00
+2018-03-13T04:18:23
+2016-08-30T20:00:00
+2018-03-13T00:18:23
+NULL
+NULL
+
+query P
+SELECT from_utc_timestamp(arrow_cast(column1, 'Timestamp(Microsecond, Some("Asia/Seoul"))'), column2)
+FROM VALUES
+('2016-08-31'::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string),
+('2016-08-31'::timestamp, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string),
+('2016-08-31'::timestamp, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string),
+(NULL::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+00:00'::timestamp, NULL::string);
+----
+2016-08-31T09:00:00+09:00
+2018-03-13T13:18:23+09:00
+2016-08-31T00:00:00+09:00
+2018-03-13T04:18:23+09:00
+2016-08-30T20:00:00+09:00
+2018-03-13T00:18:23+09:00
+NULL
+NULL
+
+
+# DST edge cases
+query P
+SELECT from_utc_timestamp('2020-03-31T13:40:00'::timestamp, 'America/New_York'::string);
+----
+2020-03-31T09:40:00
+
+
+query P
+SELECT from_utc_timestamp('2020-11-04T14:06:40'::timestamp, 'America/New_York'::string);
+----
+2020-11-04T09:06:40
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/hour.slt b/datafusion/sqllogictest/test_files/spark/datetime/hour.slt
new file mode 100644
index 0000000000000..b595a98b579c8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/hour.slt
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT hour('2009-07-30 12:58:59');
+## PySpark 3.5.5 Result: {'hour(2009-07-30 12:58:59)': 12, 'typeof(hour(2009-07-30 12:58:59))': 'int', 'typeof(2009-07-30 12:58:59)': 'string'}
+query I
+SELECT hour('2009-07-30 12:58:59'::timestamp);
+----
+12
+
+# Test with different hours
+query I
+SELECT hour('2009-07-30 00:00:00'::timestamp);
+----
+0
+
+query I
+SELECT hour('2009-07-30 23:59:59'::timestamp);
+----
+23
+
+# Test with NULL
+query I
+SELECT hour(NULL::timestamp);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/last_day.slt b/datafusion/sqllogictest/test_files/spark/datetime/last_day.slt
new file mode 100644
index 0000000000000..6dee48de9555d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/last_day.slt
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+query D
+SELECT last_day('2009-01-12'::DATE);
+----
+2009-01-31
+
+
+query D
+SELECT last_day('2015-02-28'::DATE);
+----
+2015-02-28
+
+query D
+SELECT last_day('2015-03-27'::DATE);
+----
+2015-03-31
+
+query D
+SELECT last_day('2015-04-26'::DATE);
+----
+2015-04-30
+
+query D
+SELECT last_day('2015-05-25'::DATE);
+----
+2015-05-31
+
+query D
+SELECT last_day('2015-06-24'::DATE);
+----
+2015-06-30
+
+query D
+SELECT last_day('2015-07-23'::DATE);
+----
+2015-07-31
+
+query D
+SELECT last_day('2015-08-01'::DATE);
+----
+2015-08-31
+
+query D
+SELECT last_day('2015-09-02'::DATE);
+----
+2015-09-30
+
+query D
+SELECT last_day('2015-10-03'::DATE);
+----
+2015-10-31
+
+query D
+SELECT last_day('2015-11-04'::DATE);
+----
+2015-11-30
+
+query D
+SELECT last_day('2015-12-05'::DATE);
+----
+2015-12-31
+
+
+query D
+SELECT last_day('2016-01-06'::DATE);
+----
+2016-01-31
+
+query D
+SELECT last_day('2016-02-07'::DATE);
+----
+2016-02-29
+
+
+query D
+SELECT last_day(null::DATE);
+----
+NULL
+
+
+statement error Failed to coerce arguments to satisfy a call to 'last_day' function
+select last_day('foo');
+
+
+statement error Failed to coerce arguments to satisfy a call to 'last_day' function
+select last_day(123);
+
+
+statement error 'last_day' does not support zero arguments
+select last_day();
+
+statement error Failed to coerce arguments to satisfy a call to 'last_day' function
+select last_day(last_day('2016-02-07'::string, 'foo'));
+
+statement error Failed to coerce arguments to satisfy a call to 'last_day' function
+select last_day(last_day('2016-02-31'::string));
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/localtimestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/localtimestamp.slt
new file mode 100644
index 0000000000000..36fd451382d04
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/localtimestamp.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT localtimestamp();
+## PySpark 3.5.5 Result: {'localtimestamp()': datetime.datetime(2025, 6, 14, 23, 57, 39, 529742), 'typeof(localtimestamp())': 'timestamp_ntz'}
+#query
+#SELECT localtimestamp();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_date.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_date.slt
new file mode 100644
index 0000000000000..b95347f976e95
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_date.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_date(2013, 7, 15);
+## PySpark 3.5.5 Result: {'make_date(2013, 7, 15)': datetime.date(2013, 7, 15), 'typeof(make_date(2013, 7, 15))': 'date', 'typeof(2013)': 'int', 'typeof(7)': 'int', 'typeof(15)': 'int'}
+#query
+#SELECT make_date(2013::int, 7::int, 15::int);
+
+## Original Query: SELECT make_date(2019, 7, NULL);
+## PySpark 3.5.5 Result: {'make_date(2019, 7, NULL)': None, 'typeof(make_date(2019, 7, NULL))': 'date', 'typeof(2019)': 'int', 'typeof(7)': 'int', 'typeof(NULL)': 'void'}
+#query
+#SELECT make_date(2019::int, 7::int, NULL::void);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt
new file mode 100644
index 0000000000000..1223b777d1d63
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_dt_interval.slt
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_dt_interval(1, 12, 30, 01.001001);
+## PySpark 3.5.5 Result: {'make_dt_interval(1, 12, 30, 1.001001)': datetime.timedelta(days=1, seconds=45001, microseconds=1001), 'typeof(make_dt_interval(1, 12, 30, 1.001001))': 'interval day to second', 'typeof(1)': 'int', 'typeof(12)': 'int', 'typeof(30)': 'int', 'typeof(1.001001)': 'decimal(7,6)'}
+query ?
+SELECT make_dt_interval(1::int, 12::int, 30::int, 1.001001::decimal(7,6));
+----
+1 days 12 hours 30 mins 1.001001 secs
+
+## Original Query: SELECT make_dt_interval(100, null, 3);
+## PySpark 3.5.5 Result: {'make_dt_interval(100, NULL, 3, 0.000000)': None, 'typeof(make_dt_interval(100, NULL, 3, 0.000000))': 'interval day to second', 'typeof(100)': 'int', 'typeof(NULL)': 'void', 'typeof(3)': 'int'}
+query ?
+SELECT make_dt_interval(100::int, NULL, 3::int);
+----
+NULL
+
+## Original Query: SELECT make_dt_interval(2);
+## PySpark 3.5.5 Result: {'make_dt_interval(2, 0, 0, 0.000000)': datetime.timedelta(days=2), 'typeof(make_dt_interval(2, 0, 0, 0.000000))': 'interval day to second', 'typeof(2)': 'int'}
+query ?
+SELECT make_dt_interval(2::int);
+----
+2 days 0 hours 0 mins 0.000000 secs
+
+# null
+query ?
+SELECT (make_dt_interval(null, 0, 0, 0))
+----
+NULL
+
+query ?
+SELECT (make_dt_interval(0, null, 0, 0))
+----
+NULL
+
+query ?
+SELECT (make_dt_interval(0, 0, null, 0))
+----
+NULL
+
+query ?
+SELECT (make_dt_interval(0, 0, 0, null))
+----
+NULL
+
+# zero arguments - returns default zero duration
+query ?
+SELECT (make_dt_interval()) AS make_dt_interval
+----
+0 days 0 hours 0 mins 0.000000 secs
+
+
+query ?
+SELECT (make_dt_interval(1)) AS make_dt_interval
+----
+1 days 0 hours 0 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(1, 1)) AS make_dt_interval
+----
+1 days 1 hours 0 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(1, 1, 1)) AS make_dt_interval
+----
+1 days 1 hours 1 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(1, 1, 1, 1)) AS make_dt_interval
+----
+1 days 1 hours 1 mins 1.000000 secs
+
+
+# all 0 values
+query ?
+SELECT (make_dt_interval(0, 0, 0, 0))
+----
+0 days 0 hours 0 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(-1, 24, 0, 0)) df
+----
+0 days 0 hours 0 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(1, -24, 0, 0)) dt
+----
+0 days 0 hours 0 mins 0.000000 secs
+
+query ?
+SELECT (make_dt_interval(0, 0, 0, 0.1))
+----
+0 days 0 hours 0 mins 0.100000 secs
+
+
+# doctest https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.make_dt_interval.html
+# extract only the value make_dt_interval
+
+query ?
+SELECT MAKE_DT_INTERVAL(day) AS interval_val
+FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec);
+----
+1 days 0 hours 0 mins 0.000000 secs
+
+query ?
+SELECT MAKE_DT_INTERVAL(day, hour) AS interval_val
+FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec);
+----
+1 days 12 hours 0 mins 0.000000 secs
+
+query ?
+SELECT MAKE_DT_INTERVAL(day, hour, min) AS interval_val
+FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec);
+----
+1 days 12 hours 30 mins 0.000000 secs
+
+query ?
+SELECT MAKE_DT_INTERVAL(day, hour, min, sec) AS interval_val
+FROM VALUES (1, 12, 30, 1.001001) AS t(day, hour, min, sec);
+----
+1 days 12 hours 30 mins 1.001001 secs
+
+query ?
+SELECT MAKE_DT_INTERVAL(1, 12, 30, 1.001001)
+----
+1 days 12 hours 30 mins 1.001001 secs
+
+query ?
+SELECT MAKE_DT_INTERVAL(1, 12, 30, 1.001001);
+----
+1 days 12 hours 30 mins 1.001001 secs
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt
new file mode 100644
index 0000000000000..a796094979d97
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_interval.slt
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+query IIIIIIR?
+SELECT
+  y, m, w, d, h, mi, s,
+  make_interval(y, m, w, d, h, mi, s) AS interval
+FROM VALUES
+  (NULL,2,   3,   4,   5,   6,   7.5),
+  (1,   NULL,3,   4,   5,   6,   7.5),
+  (1,   2,   NULL,4,   5,   6,   7.5),
+  (1,   2,   3,   NULL,5,   6,   7.5),
+  (1,   2,   3,   4,   NULL,6,   7.5),
+  (1,   2,   3,   4,   5,   NULL,7.5),
+  (1,   2,   3,   4,   5,   6,   CAST(NULL AS DOUBLE)),
+  (1,   1,   1,   1,   1,   1,   1.0)
+AS v(y, m, w, d, h, mi, s);
+----
+NULL 2 3 4 5 6 7.5 NULL
+1 NULL 3 4 5 6 7.5 NULL
+1 2 NULL 4 5 6 7.5 NULL
+1 2 3 NULL 5 6 7.5 NULL
+1 2 3 4 NULL 6 7.5 NULL
+1 2 3 4 5 NULL 7.5 NULL
+1 2 3 4 5 6 NULL NULL
+1 1 1 1 1 1 1 13 mons 8 days 1 hours 1 mins 1.000000000 secs
+
+query IIIIIIR?
+SELECT
+  y, m, w, d, h, mi, s,
+  make_interval(y, m, w, d, h, mi, s) AS interval
+FROM VALUES
+  (0,   0,   0,   0,   0,   0,   arrow_cast('NaN','Float64'))
+AS v(y, m, w, d, h, mi, s);
+----
+0 0 0 0 0 0 NaN NULL
+
+query IIIIIIR?
+SELECT
+  y, m, w, d, h, mi, s,
+  make_interval(y, m, w, d, h, mi, s) AS interval
+FROM VALUES
+  (0,   0,   0,   0,   0,   0,   CAST('Infinity' AS DOUBLE))
+AS v(y, m, w, d, h, mi, s);
+----
+0 0 0 0 0 0 Infinity NULL
+
+query IIIIIIR?
+SELECT
+  y, m, w, d, h, mi, s,
+  make_interval(y, m, w, d, h, mi, s) AS interval
+FROM VALUES
+  (0,   0,   0,   0,   0,   0,   CAST('-Infinity' AS DOUBLE))
+AS v(y, m, w, d, h, mi, s);
+----
+0 0 0 0 0 0 -Infinity NULL
+
+query ?
+SELECT make_interval(2147483647, 1, 0, 0, 0, 0, 0.0);
+----
+NULL
+
+query ?
+SELECT make_interval(0, 0, 2147483647, 1, 0, 0, 0.0);
+----
+NULL
+
+query ?
+SELECT make_interval(0, 0, 0, 0, 2147483647, 1, 0.0);
+----
+NULL
+
+query T
+SELECT make_interval(0, 0, 0, 0, 0, 0, 0.0) || '';
+----
+0 secs
+
+query T
+SELECT make_interval() || '';
+----
+0 secs
+
+query ?
+SELECT INTERVAL '1' SECOND AS iv;
+----
+1.000000000 secs
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp.slt
new file mode 100644
index 0000000000000..262154186c8e0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887);
+## PySpark 3.5.5 Result: {'make_timestamp(2014, 12, 28, 6, 30, 45.887)': datetime.datetime(2014, 12, 28, 6, 30, 45, 887000), 'typeof(make_timestamp(2014, 12, 28, 6, 30, 45.887))': 'timestamp', 'typeof(2014)': 'int', 'typeof(12)': 'int', 'typeof(28)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(45.887)': 'decimal(5,3)'}
+#query
+#SELECT make_timestamp(2014::int, 12::int, 28::int, 6::int, 30::int, 45.887::decimal(5,3));
+
+## Original Query: SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887, 'CET');
+## PySpark 3.5.5 Result: {'make_timestamp(2014, 12, 28, 6, 30, 45.887, CET)': datetime.datetime(2014, 12, 27, 21, 30, 45, 887000), 'typeof(make_timestamp(2014, 12, 28, 6, 30, 45.887, CET))': 'timestamp', 'typeof(2014)': 'int', 'typeof(12)': 'int', 'typeof(28)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(45.887)': 'decimal(5,3)', 'typeof(CET)': 'string'}
+#query
+#SELECT make_timestamp(2014::int, 12::int, 28::int, 6::int, 30::int, 45.887::decimal(5,3), 'CET'::string);
+
+## Original Query: SELECT make_timestamp(2019, 6, 30, 23, 59, 1);
+## PySpark 3.5.5 Result: {'make_timestamp(2019, 6, 30, 23, 59, 1)': datetime.datetime(2019, 6, 30, 23, 59, 1), 'typeof(make_timestamp(2019, 6, 30, 23, 59, 1))': 'timestamp', 'typeof(2019)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(23)': 'int', 'typeof(59)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT make_timestamp(2019::int, 6::int, 30::int, 23::int, 59::int, 1::int);
+
+## Original Query: SELECT make_timestamp(2019, 6, 30, 23, 59, 60);
+## PySpark 3.5.5 Result: {'make_timestamp(2019, 6, 30, 23, 59, 60)': datetime.datetime(2019, 7, 1, 0, 0), 'typeof(make_timestamp(2019, 6, 30, 23, 59, 60))': 'timestamp', 'typeof(2019)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(23)': 'int', 'typeof(59)': 'int', 'typeof(60)': 'int'}
+#query
+#SELECT make_timestamp(2019::int, 6::int, 30::int, 23::int, 59::int, 60::int);
+
+## Original Query: SELECT make_timestamp(null, 7, 22, 15, 30, 0);
+## PySpark 3.5.5 Result: {'make_timestamp(NULL, 7, 22, 15, 30, 0)': None, 'typeof(make_timestamp(NULL, 7, 22, 15, 30, 0))': 'timestamp', 'typeof(NULL)': 'void', 'typeof(7)': 'int', 'typeof(22)': 'int', 'typeof(15)': 'int', 'typeof(30)': 'int', 'typeof(0)': 'int'}
+#query
+#SELECT make_timestamp(NULL::void, 7::int, 22::int, 15::int, 30::int, 0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ltz.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ltz.slt
new file mode 100644
index 0000000000000..ce5e07f663c4d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ltz.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887);
+## PySpark 3.5.5 Result: {'make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887)': datetime.datetime(2014, 12, 28, 6, 30, 45, 887000), 'typeof(make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887))': 'timestamp', 'typeof(2014)': 'int', 'typeof(12)': 'int', 'typeof(28)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(45.887)': 'decimal(5,3)'}
+#query
+#SELECT make_timestamp_ltz(2014::int, 12::int, 28::int, 6::int, 30::int, 45.887::decimal(5,3));
+
+## Original Query: SELECT make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887, 'CET');
+## PySpark 3.5.5 Result: {'make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887, CET)': datetime.datetime(2014, 12, 27, 21, 30, 45, 887000), 'typeof(make_timestamp_ltz(2014, 12, 28, 6, 30, 45.887, CET))': 'timestamp', 'typeof(2014)': 'int', 'typeof(12)': 'int', 'typeof(28)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(45.887)': 'decimal(5,3)', 'typeof(CET)': 'string'}
+#query
+#SELECT make_timestamp_ltz(2014::int, 12::int, 28::int, 6::int, 30::int, 45.887::decimal(5,3), 'CET'::string);
+
+## Original Query: SELECT make_timestamp_ltz(2019, 6, 30, 23, 59, 60);
+## PySpark 3.5.5 Result: {'make_timestamp_ltz(2019, 6, 30, 23, 59, 60)': datetime.datetime(2019, 7, 1, 0, 0), 'typeof(make_timestamp_ltz(2019, 6, 30, 23, 59, 60))': 'timestamp', 'typeof(2019)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(23)': 'int', 'typeof(59)': 'int', 'typeof(60)': 'int'}
+#query
+#SELECT make_timestamp_ltz(2019::int, 6::int, 30::int, 23::int, 59::int, 60::int);
+
+## Original Query: SELECT make_timestamp_ltz(null, 7, 22, 15, 30, 0);
+## PySpark 3.5.5 Result: {'make_timestamp_ltz(NULL, 7, 22, 15, 30, 0)': None, 'typeof(make_timestamp_ltz(NULL, 7, 22, 15, 30, 0))': 'timestamp', 'typeof(NULL)': 'void', 'typeof(7)': 'int', 'typeof(22)': 'int', 'typeof(15)': 'int', 'typeof(30)': 'int', 'typeof(0)': 'int'}
+#query
+#SELECT make_timestamp_ltz(NULL::void, 7::int, 22::int, 15::int, 30::int, 0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ntz.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ntz.slt
new file mode 100644
index 0000000000000..fbbe37655eb7a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_timestamp_ntz.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_timestamp_ntz(2014, 12, 28, 6, 30, 45.887);
+## PySpark 3.5.5 Result: {'make_timestamp_ntz(2014, 12, 28, 6, 30, 45.887)': datetime.datetime(2014, 12, 28, 6, 30, 45, 887000), 'typeof(make_timestamp_ntz(2014, 12, 28, 6, 30, 45.887))': 'timestamp_ntz', 'typeof(2014)': 'int', 'typeof(12)': 'int', 'typeof(28)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(45.887)': 'decimal(5,3)'}
+#query
+#SELECT make_timestamp_ntz(2014::int, 12::int, 28::int, 6::int, 30::int, 45.887::decimal(5,3));
+
+## Original Query: SELECT make_timestamp_ntz(2019, 6, 30, 23, 59, 60);
+## PySpark 3.5.5 Result: {'make_timestamp_ntz(2019, 6, 30, 23, 59, 60)': datetime.datetime(2019, 7, 1, 0, 0), 'typeof(make_timestamp_ntz(2019, 6, 30, 23, 59, 60))': 'timestamp_ntz', 'typeof(2019)': 'int', 'typeof(6)': 'int', 'typeof(30)': 'int', 'typeof(23)': 'int', 'typeof(59)': 'int', 'typeof(60)': 'int'}
+#query
+#SELECT make_timestamp_ntz(2019::int, 6::int, 30::int, 23::int, 59::int, 60::int);
+
+## Original Query: SELECT make_timestamp_ntz(null, 7, 22, 15, 30, 0);
+## PySpark 3.5.5 Result: {'make_timestamp_ntz(NULL, 7, 22, 15, 30, 0)': None, 'typeof(make_timestamp_ntz(NULL, 7, 22, 15, 30, 0))': 'timestamp_ntz', 'typeof(NULL)': 'void', 'typeof(7)': 'int', 'typeof(22)': 'int', 'typeof(15)': 'int', 'typeof(30)': 'int', 'typeof(0)': 'int'}
+#query
+#SELECT make_timestamp_ntz(NULL::void, 7::int, 22::int, 15::int, 30::int, 0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/make_ym_interval.slt b/datafusion/sqllogictest/test_files/spark/datetime/make_ym_interval.slt
new file mode 100644
index 0000000000000..9429a3a5306ed
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/make_ym_interval.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT make_ym_interval(-1, 1);
+## PySpark 3.5.5 Result: {'make_ym_interval(-1, 1)': -11, 'typeof(make_ym_interval(-1, 1))': 'interval year to month', 'typeof(-1)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT make_ym_interval(-1::int, 1::int);
+
+## Original Query: SELECT make_ym_interval(1, 0);
+## PySpark 3.5.5 Result: {'make_ym_interval(1, 0)': 12, 'typeof(make_ym_interval(1, 0))': 'interval year to month', 'typeof(1)': 'int', 'typeof(0)': 'int'}
+#query
+#SELECT make_ym_interval(1::int, 0::int);
+
+## Original Query: SELECT make_ym_interval(1, 2);
+## PySpark 3.5.5 Result: {'make_ym_interval(1, 2)': 14, 'typeof(make_ym_interval(1, 2))': 'interval year to month', 'typeof(1)': 'int', 'typeof(2)': 'int'}
+#query
+#SELECT make_ym_interval(1::int, 2::int);
+
+## Original Query: SELECT make_ym_interval(2);
+## PySpark 3.5.5 Result: {'make_ym_interval(2, 0)': 24, 'typeof(make_ym_interval(2, 0))': 'interval year to month', 'typeof(2)': 'int'}
+#query
+#SELECT make_ym_interval(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/minute.slt b/datafusion/sqllogictest/test_files/spark/datetime/minute.slt
new file mode 100644
index 0000000000000..8792c544736d0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/minute.slt
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT minute('2009-07-30 12:58:59');
+## PySpark 3.5.5 Result: {'minute(2009-07-30 12:58:59)': 58, 'typeof(minute(2009-07-30 12:58:59))': 'int', 'typeof(2009-07-30 12:58:59)': 'string'}
+query I
+SELECT minute('2009-07-30 12:58:59'::timestamp);
+----
+58
+
+# Test with different minutes
+query I
+SELECT minute('2009-07-30 12:00:00'::timestamp);
+----
+0
+
+query I
+SELECT minute('2009-07-30 12:59:59'::timestamp);
+----
+59
+
+# Test with NULL
+query I
+SELECT minute(NULL::timestamp);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/month.slt b/datafusion/sqllogictest/test_files/spark/datetime/month.slt
new file mode 100644
index 0000000000000..17a34352d16f3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/month.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT month('2016-07-30');
+## PySpark 3.5.5 Result: {'month(2016-07-30)': 7, 'typeof(month(2016-07-30))': 'int', 'typeof(2016-07-30)': 'string'}
+#query
+#SELECT month('2016-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/months_between.slt b/datafusion/sqllogictest/test_files/spark/datetime/months_between.slt
new file mode 100644
index 0000000000000..c2526761655db
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/months_between.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT months_between('1997-02-28 10:30:00', '1996-10-30');
+## PySpark 3.5.5 Result: {'months_between(1997-02-28 10:30:00, 1996-10-30, true)': 3.94959677, 'typeof(months_between(1997-02-28 10:30:00, 1996-10-30, true))': 'double', 'typeof(1997-02-28 10:30:00)': 'string', 'typeof(1996-10-30)': 'string'}
+#query
+#SELECT months_between('1997-02-28 10:30:00'::string, '1996-10-30'::string);
+
+## Original Query: SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false);
+## PySpark 3.5.5 Result: {'months_between(1997-02-28 10:30:00, 1996-10-30, false)': 3.9495967741935485, 'typeof(months_between(1997-02-28 10:30:00, 1996-10-30, false))': 'double', 'typeof(1997-02-28 10:30:00)': 'string', 'typeof(1996-10-30)': 'string', 'typeof(false)': 'boolean'}
+#query
+#SELECT months_between('1997-02-28 10:30:00'::string, '1996-10-30'::string, false::boolean);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt b/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt
new file mode 100644
index 0000000000000..872d1f2b58eb6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+query D
+SELECT next_day('2015-01-14'::DATE, 'TU'::string);
+----
+2015-01-20
+
+query D
+SELECT next_day('2015-07-27'::DATE, 'Sun'::string);
+----
+2015-08-02
+
+query D
+SELECT next_day('2015-07-27'::DATE, 'Sat'::string);
+----
+2015-08-01
+
+query error Failed to coerce arguments to satisfy a call to 'next_day' function
+SELECT next_day('2015-07-27'::DATE);
+
+query error Failed to coerce arguments to satisfy a call to 'next_day' function
+SELECT next_day('Sun'::string);
+
+query error 'next_day' does not support zero arguments
+SELECT next_day();
+
+query error Failed to coerce arguments to satisfy a call to 'next_day' function
+SELECT next_day(1::int, 'Sun'::string);
+
+query error Failed to coerce arguments to satisfy a call to 'next_day' function
+SELECT next_day('2015-07-27'::DATE, 'Sat'::string, 'Sun'::string);
+
+query error Failed to coerce arguments to satisfy a call to 'next_day' function
+SELECT next_day('invalid_date'::string, 'Mon'::string);
+
+query D
+SELECT next_day('2000-01-01'::DATE, 2.0::float);
+----
+NULL
+
+query D
+SELECT next_day('2020-01-01'::DATE, 'invalid_day'::string);
+----
+NULL
+
+query error Cast error: Cannot cast string '2015-13-32' to value of Date32 type
+SELECT next_day('2015-13-32'::DATE, 'Sun'::string);
+
+query D
+SELECT next_day(a, b)
+FROM VALUES
+    ('2000-01-01'::DATE, 'Mon'::string),
+    (NULL::DATE, NULL::string),
+    (NULL::DATE, 'Mon'::string),
+    ('2015-01-14'::DATE, NULL::string) as t(a, b);
+----
+2000-01-03
+NULL
+NULL
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/now.slt b/datafusion/sqllogictest/test_files/spark/datetime/now.slt
new file mode 100644
index 0000000000000..985140c1ac442
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/now.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT now();
+## PySpark 3.5.5 Result: {'now()': datetime.datetime(2025, 6, 14, 23, 57, 39, 982956), 'typeof(now())': 'timestamp'}
+#query
+#SELECT now();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/quarter.slt b/datafusion/sqllogictest/test_files/spark/datetime/quarter.slt
new file mode 100644
index 0000000000000..27b6728b0b7bb
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/quarter.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT quarter('2016-08-31');
+## PySpark 3.5.5 Result: {'quarter(2016-08-31)': 3, 'typeof(quarter(2016-08-31))': 'int', 'typeof(2016-08-31)': 'string'}
+#query
+#SELECT quarter('2016-08-31'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/second.slt b/datafusion/sqllogictest/test_files/spark/datetime/second.slt
new file mode 100644
index 0000000000000..7a99dd8967b02
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/second.slt
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT second('2009-07-30 12:58:59');
+## PySpark 3.5.5 Result: {'second(2009-07-30 12:58:59)': 59, 'typeof(second(2009-07-30 12:58:59))': 'int', 'typeof(2009-07-30 12:58:59)': 'string'}
+query I
+SELECT second('2009-07-30 12:58:59'::timestamp);
+----
+59
+
+# Test with different seconds
+query I
+SELECT second('2009-07-30 12:58:00'::timestamp);
+----
+0
+
+query I
+SELECT second('2009-07-30 12:58:30'::timestamp);
+----
+30
+
+# Test with NULL
+query I
+SELECT second(NULL::timestamp);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/time_trunc.slt b/datafusion/sqllogictest/test_files/spark/datetime/time_trunc.slt
new file mode 100644
index 0000000000000..35ffa483bb068
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/time_trunc.slt
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# HOUR - zero out minute and second with fraction
+query D
+SELECT time_trunc('HOUR', '09:32:05.123456'::time);
+----
+09:00:00
+
+# MINUTE - zero out second with fraction
+query D
+SELECT time_trunc('MINUTE', '09:32:05.123456'::time);
+----
+09:32:00
+
+# SECOND - zero out fraction
+query D
+SELECT time_trunc('SECOND', '09:32:05.123456'::time);
+----
+09:32:05
+
+# MILLISECOND - zero out microseconds
+query D
+SELECT time_trunc('MILLISECOND', '09:32:05.123456'::time);
+----
+09:32:05.123
+
+# MICROSECOND - everything remains
+query D
+SELECT time_trunc('MICROSECOND', '09:32:05.123456'::time);
+----
+09:32:05.123456
+
+query D
+SELECT time_trunc('HOUR', column1)
+FROM VALUES
+('09:32:05.123456'::time),
+('22:45:30.654321'::time),
+('14:20:10.000001'::time),
+(NULL::time);
+----
+09:00:00
+22:00:00
+14:00:00
+NULL
+
+
+# Null handling
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed\ncaused by\nError during planning: First argument of `TIME_TRUNC` must be non-null scalar Utf8
+SELECT time_trunc(NULL, '09:32:05.123456'::time);
+
+query D
+SELECT time_trunc('HOUR', NULL::time);
+----
+NULL
+
+# incorrect format
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed\ncaused by\nError during planning: The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond
+SELECT time_trunc('test', '09:32:05.123456'::time);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/timestamp_micros.slt b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_micros.slt
new file mode 100644
index 0000000000000..19a52c981075f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_micros.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT timestamp_micros(1230219000123123);
+## PySpark 3.5.5 Result: {'timestamp_micros(1230219000123123)': datetime.datetime(2008, 12, 25, 7, 30, 0, 123123), 'typeof(timestamp_micros(1230219000123123))': 'timestamp', 'typeof(1230219000123123)': 'bigint'}
+#query
+#SELECT timestamp_micros(1230219000123123::bigint);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/timestamp_millis.slt b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_millis.slt
new file mode 100644
index 0000000000000..7dc092549fffa
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_millis.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT timestamp_millis(1230219000123);
+## PySpark 3.5.5 Result: {'timestamp_millis(1230219000123)': datetime.datetime(2008, 12, 25, 7, 30, 0, 123000), 'typeof(timestamp_millis(1230219000123))': 'timestamp', 'typeof(1230219000123)': 'bigint'}
+#query
+#SELECT timestamp_millis(1230219000123::bigint);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/timestamp_seconds.slt b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_seconds.slt
new file mode 100644
index 0000000000000..8e14c1dfe1f2b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/timestamp_seconds.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT timestamp_seconds(1230219000);
+## PySpark 3.5.5 Result: {'timestamp_seconds(1230219000)': datetime.datetime(2008, 12, 25, 7, 30), 'typeof(timestamp_seconds(1230219000))': 'timestamp', 'typeof(1230219000)': 'int'}
+#query
+#SELECT timestamp_seconds(1230219000::int);
+
+## Original Query: SELECT timestamp_seconds(1230219000.123);
+## PySpark 3.5.5 Result: {'timestamp_seconds(1230219000.123)': datetime.datetime(2008, 12, 25, 7, 30, 0, 123000), 'typeof(timestamp_seconds(1230219000.123))': 'timestamp', 'typeof(1230219000.123)': 'decimal(13,3)'}
+#query
+#SELECT timestamp_seconds(1230219000.123::decimal(13,3));
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_date.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_date.slt
new file mode 100644
index 0000000000000..3863cfb2baae7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_date.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_date('2009-07-30 04:17:52');
+## PySpark 3.5.5 Result: {'to_date(2009-07-30 04:17:52)': datetime.date(2009, 7, 30), 'typeof(to_date(2009-07-30 04:17:52))': 'date', 'typeof(2009-07-30 04:17:52)': 'string'}
+#query
+#SELECT to_date('2009-07-30 04:17:52'::string);
+
+## Original Query: SELECT to_date('2016-12-31', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'to_date(2016-12-31, yyyy-MM-dd)': datetime.date(2016, 12, 31), 'typeof(to_date(2016-12-31, yyyy-MM-dd))': 'date', 'typeof(2016-12-31)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT to_date('2016-12-31'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp.slt
new file mode 100644
index 0000000000000..39f77620fa771
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_timestamp('2016-12-31 00:12:00');
+## PySpark 3.5.5 Result: {'to_timestamp(2016-12-31 00:12:00)': datetime.datetime(2016, 12, 31, 0, 12), 'typeof(to_timestamp(2016-12-31 00:12:00))': 'timestamp', 'typeof(2016-12-31 00:12:00)': 'string'}
+#query
+#SELECT to_timestamp('2016-12-31 00:12:00'::string);
+
+## Original Query: SELECT to_timestamp('2016-12-31', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'to_timestamp(2016-12-31, yyyy-MM-dd)': datetime.datetime(2016, 12, 31, 0, 0), 'typeof(to_timestamp(2016-12-31, yyyy-MM-dd))': 'timestamp', 'typeof(2016-12-31)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT to_timestamp('2016-12-31'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ltz.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ltz.slt
new file mode 100644
index 0000000000000..c7c43a2bcc56d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ltz.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_timestamp_ltz('2016-12-31 00:12:00');
+## PySpark 3.5.5 Result: {'to_timestamp_ltz(2016-12-31 00:12:00)': datetime.datetime(2016, 12, 31, 0, 12), 'typeof(to_timestamp_ltz(2016-12-31 00:12:00))': 'timestamp', 'typeof(2016-12-31 00:12:00)': 'string'}
+#query
+#SELECT to_timestamp_ltz('2016-12-31 00:12:00'::string);
+
+## Original Query: SELECT to_timestamp_ltz('2016-12-31', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'to_timestamp_ltz(2016-12-31, yyyy-MM-dd)': datetime.datetime(2016, 12, 31, 0, 0), 'typeof(to_timestamp_ltz(2016-12-31, yyyy-MM-dd))': 'timestamp', 'typeof(2016-12-31)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT to_timestamp_ltz('2016-12-31'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ntz.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ntz.slt
new file mode 100644
index 0000000000000..11c4e4cbe257f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_timestamp_ntz.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_timestamp_ntz('2016-12-31 00:12:00');
+## PySpark 3.5.5 Result: {'to_timestamp_ntz(2016-12-31 00:12:00)': datetime.datetime(2016, 12, 31, 0, 12), 'typeof(to_timestamp_ntz(2016-12-31 00:12:00))': 'timestamp_ntz', 'typeof(2016-12-31 00:12:00)': 'string'}
+#query
+#SELECT to_timestamp_ntz('2016-12-31 00:12:00'::string);
+
+## Original Query: SELECT to_timestamp_ntz('2016-12-31', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'to_timestamp_ntz(2016-12-31, yyyy-MM-dd)': datetime.datetime(2016, 12, 31, 0, 0), 'typeof(to_timestamp_ntz(2016-12-31, yyyy-MM-dd))': 'timestamp_ntz', 'typeof(2016-12-31)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT to_timestamp_ntz('2016-12-31'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_unix_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_unix_timestamp.slt
new file mode 100644
index 0000000000000..53c1902094a50
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_unix_timestamp.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_unix_timestamp('2016-04-08', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'to_unix_timestamp(2016-04-08, yyyy-MM-dd)': 1460098800, 'typeof(to_unix_timestamp(2016-04-08, yyyy-MM-dd))': 'bigint', 'typeof(2016-04-08)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT to_unix_timestamp('2016-04-08'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/to_utc_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/to_utc_timestamp.slt
new file mode 100644
index 0000000000000..086716e5bcd0e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/to_utc_timestamp.slt
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# String inputs
+query P
+SELECT to_utc_timestamp('2016-08-31'::string, 'UTC'::string);
+----
+2016-08-31T00:00:00
+
+query P
+SELECT to_utc_timestamp('2016-08-31'::string, 'Asia/Seoul'::string);
+----
+2016-08-30T15:00:00
+
+query P
+SELECT to_utc_timestamp('2016-08-31'::string, 'America/New_York'::string);
+----
+2016-08-31T04:00:00
+
+# String inputs with offsets
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'UTC'::string);
+----
+2018-03-13T04:18:23
+
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'Asia/Seoul'::string);
+----
+2018-03-12T19:18:23
+
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::string, 'America/New_York'::string);
+----
+2018-03-13T08:18:23
+
+# Timestamp inputs
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string);
+----
+2018-03-13T04:18:23
+
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string);
+----
+2018-03-12T19:18:23
+
+query P
+SELECT to_utc_timestamp('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string);
+----
+2018-03-13T08:18:23
+
+# Null inputs
+query P
+SELECT to_utc_timestamp(NULL::string, 'Asia/Seoul'::string);
+----
+NULL
+
+query P
+SELECT to_utc_timestamp(NULL::timestamp, 'Asia/Seoul'::string);
+----
+NULL
+
+query P
+SELECT to_utc_timestamp('2016-08-31'::string, NULL::string);
+----
+NULL
+
+query P
+SELECT to_utc_timestamp(column1, column2)
+FROM VALUES
+('2016-08-31'::string, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::string, 'Asia/Seoul'::string),
+('2016-08-31'::string, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::string, 'UTC'::string),
+('2016-08-31'::string, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::string, 'America/New_York'::string),
+(NULL::string, 'Asia/Seoul'::string),
+('2016-08-31'::string, NULL::string);
+----
+2016-08-30T15:00:00
+2018-03-12T19:18:23
+2016-08-31T00:00:00
+2018-03-13T04:18:23
+2016-08-31T04:00:00
+2018-03-13T08:18:23
+NULL
+NULL
+
+query P
+SELECT to_utc_timestamp(column1, column2)
+FROM VALUES
+('2016-08-31'::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string),
+('2016-08-31'::timestamp, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string),
+('2016-08-31'::timestamp, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string),
+(NULL::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+00:00'::timestamp, NULL::string);
+----
+2016-08-30T15:00:00
+2018-03-12T19:18:23
+2016-08-31T00:00:00
+2018-03-13T04:18:23
+2016-08-31T04:00:00
+2018-03-13T08:18:23
+NULL
+NULL
+
+query P
+SELECT to_utc_timestamp(arrow_cast(column1, 'Timestamp(Microsecond, Some("Asia/Seoul"))'), column2)
+FROM VALUES
+('2016-08-31'::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'Asia/Seoul'::string),
+('2016-08-31'::timestamp, 'UTC'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'UTC'::string),
+('2016-08-31'::timestamp, 'America/New_York'::string),
+('2018-03-13T06:18:23+02:00'::timestamp, 'America/New_York'::string),
+(NULL::timestamp, 'Asia/Seoul'::string),
+('2018-03-13T06:18:23+00:00'::timestamp, NULL::string);
+----
+2016-08-30T15:00:00+09:00
+2018-03-12T19:18:23+09:00
+2016-08-31T00:00:00+09:00
+2018-03-13T04:18:23+09:00
+2016-08-31T04:00:00+09:00
+2018-03-13T08:18:23+09:00
+NULL
+NULL
+
+
+# DST edge cases
+query P
+SELECT to_utc_timestamp('2020-03-31T13:40:00'::timestamp, 'America/New_York'::string);
+----
+2020-03-31T17:40:00
+
+
+query P
+SELECT to_utc_timestamp('2020-11-04T14:06:40'::timestamp, 'America/New_York'::string);
+----
+2020-11-04T19:06:40
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/trunc.slt b/datafusion/sqllogictest/test_files/spark/datetime/trunc.slt
new file mode 100644
index 0000000000000..aa26d7bd0ef06
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/trunc.slt
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# YEAR - truncate to first date of year
+query D
+SELECT trunc('2009-02-12'::date, 'YEAR'::string);
+----
+2009-01-01
+
+query D
+SELECT trunc('2009-02-12'::date, 'YYYY'::string);
+----
+2009-01-01
+
+query D
+SELECT trunc('2009-02-12'::date, 'YY'::string);
+----
+2009-01-01
+
+# QUARTER - truncate to first date of quarter
+query D
+SELECT trunc('2009-02-12'::date, 'QUARTER'::string);
+----
+2009-01-01
+
+# MONTH - truncate to first date of month
+query D
+SELECT trunc('2009-02-12'::date, 'MONTH'::string);
+----
+2009-02-01
+
+query D
+SELECT trunc('2009-02-12'::date, 'MM'::string);
+----
+2009-02-01
+
+query D
+SELECT trunc('2009-02-12'::date, 'MON'::string);
+----
+2009-02-01
+
+# WEEK - truncate to Monday of the week
+query D
+SELECT trunc('2009-02-12'::date, 'WEEK'::string);
+----
+2009-02-09
+
+# string input
+query D
+SELECT trunc('2009-02-12'::string, 'YEAR'::string);
+----
+2009-01-01
+
+query D
+SELECT trunc(column1, 'YEAR'::string)
+FROM VALUES
+('2009-02-12'::date),
+('2000-02-12'::date),
+('2042-02-12'::date),
+(NULL::date);
+----
+2009-01-01
+2000-01-01
+2042-01-01
+NULL
+
+# Null handling
+query D
+SELECT trunc(NULL::date, 'YEAR'::string);
+----
+NULL
+
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed\ncaused by\nError during planning: Second argument of `TRUNC` must be non-null scalar Utf8
+SELECT trunc('2009-02-12'::date, NULL::string);
+
+# incorrect format
+query error DataFusion error: Optimizer rule 'simplify_expressions' failed\ncaused by\nError during planning: The format argument of `TRUNC` must be one of: year, yy, yyyy, month, mm, mon, day, week, quarter.
+SELECT trunc('2009-02-12'::date, 'test'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/try_to_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/try_to_timestamp.slt
new file mode 100644
index 0000000000000..23b788125ed0e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/try_to_timestamp.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_to_timestamp('2016-12-31 00:12:00');
+## PySpark 3.5.5 Result: {'try_to_timestamp(2016-12-31 00:12:00)': datetime.datetime(2016, 12, 31, 0, 12), 'typeof(try_to_timestamp(2016-12-31 00:12:00))': 'timestamp', 'typeof(2016-12-31 00:12:00)': 'string'}
+#query
+#SELECT try_to_timestamp('2016-12-31 00:12:00'::string);
+
+## Original Query: SELECT try_to_timestamp('2016-12-31', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'try_to_timestamp(2016-12-31, yyyy-MM-dd)': datetime.datetime(2016, 12, 31, 0, 0), 'typeof(try_to_timestamp(2016-12-31, yyyy-MM-dd))': 'timestamp', 'typeof(2016-12-31)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT try_to_timestamp('2016-12-31'::string, 'yyyy-MM-dd'::string);
+
+## Original Query: SELECT try_to_timestamp('foo', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'try_to_timestamp(foo, yyyy-MM-dd)': None, 'typeof(try_to_timestamp(foo, yyyy-MM-dd))': 'timestamp', 'typeof(foo)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT try_to_timestamp('foo'::string, 'yyyy-MM-dd'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/unix.slt b/datafusion/sqllogictest/test_files/spark/datetime/unix.slt
new file mode 100644
index 0000000000000..9dd39acd7f1de
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/unix.slt
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Unix Date tests
+
+query I
+SELECT unix_date('1970-01-02'::date);
+----
+1
+
+query I
+SELECT unix_date('1900-01-02'::date);
+----
+-25566
+
+
+query I
+SELECT unix_date(arrow_cast('1970-01-02', 'Date64'));
+----
+1
+
+query I
+SELECT unix_date(NULL::date);
+----
+NULL
+
+query error Function 'unix_date' requires Date, but received String \(DataType: Utf8View\)
+SELECT unix_date('1970-01-02'::string);
+
+# Unix Micro Tests
+
+query I
+SELECT unix_micros('1970-01-01 00:00:01Z'::timestamp);
+----
+1000000
+
+query I
+SELECT unix_micros('1900-01-01 00:00:01Z'::timestamp);
+----
+-2208988799000000
+
+query I
+SELECT unix_micros(arrow_cast('1970-01-01 00:00:01+02:00', 'Timestamp(Microsecond, None)'));
+----
+-7199000000
+
+query I
+SELECT unix_micros(arrow_cast('1970-01-01 00:00:01Z', 'Timestamp(Second, None)'));
+----
+1000000
+
+query I
+SELECT unix_micros(NULL::timestamp);
+----
+NULL
+
+query error Function 'unix_micros' requires Timestamp, but received String \(DataType: Utf8View\)
+SELECT unix_micros('1970-01-01 00:00:01Z'::string);
+
+
+# Unix Millis Tests
+
+query I
+SELECT unix_millis('1970-01-01 00:00:01Z'::timestamp);
+----
+1000
+
+query I
+SELECT unix_millis('1900-01-01 00:00:01Z'::timestamp);
+----
+-2208988799000
+
+query I
+SELECT unix_millis(arrow_cast('1970-01-01 00:00:01+02:00', 'Timestamp(Microsecond, None)'));
+----
+-7199000
+
+query I
+SELECT unix_millis(arrow_cast('1970-01-01 00:00:01Z', 'Timestamp(Second, None)'));
+----
+1000
+
+query I
+SELECT unix_millis(NULL::timestamp);
+----
+NULL
+
+query error Function 'unix_millis' requires Timestamp, but received String \(DataType: Utf8View\)
+SELECT unix_millis('1970-01-01 00:00:01Z'::string);
+
+
+# Unix Seconds Tests
+
+query I
+SELECT unix_seconds('1970-01-01 00:00:01Z'::timestamp);
+----
+1
+
+query I
+SELECT unix_seconds('1900-01-01 00:00:01Z'::timestamp);
+----
+-2208988799
+
+query I
+SELECT unix_seconds(arrow_cast('1970-01-01 00:00:01+02:00', 'Timestamp(Microsecond, None)'));
+----
+-7199
+
+query I
+SELECT unix_seconds(arrow_cast('1970-01-01 00:00:01Z', 'Timestamp(Second, None)'));
+----
+1
+
+query I
+SELECT unix_seconds(NULL::timestamp);
+----
+NULL
+
+query error Function 'unix_seconds' requires Timestamp, but received String \(DataType: Utf8View\)
+SELECT unix_seconds('1970-01-01 00:00:01Z'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/unix_timestamp.slt b/datafusion/sqllogictest/test_files/spark/datetime/unix_timestamp.slt
new file mode 100644
index 0000000000000..bc597912bc85b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/unix_timestamp.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT unix_timestamp('2016-04-08', 'yyyy-MM-dd');
+## PySpark 3.5.5 Result: {'unix_timestamp(2016-04-08, yyyy-MM-dd)': 1460098800, 'typeof(unix_timestamp(2016-04-08, yyyy-MM-dd))': 'bigint', 'typeof(2016-04-08)': 'string', 'typeof(yyyy-MM-dd)': 'string'}
+#query
+#SELECT unix_timestamp('2016-04-08'::string, 'yyyy-MM-dd'::string);
+
+## Original Query: SELECT unix_timestamp();
+## PySpark 3.5.5 Result: {'unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)': 1749970660, 'typeof(unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss))': 'bigint'}
+#query
+#SELECT unix_timestamp();
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/weekday.slt b/datafusion/sqllogictest/test_files/spark/datetime/weekday.slt
new file mode 100644
index 0000000000000..b4f5444e8a2da
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/weekday.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT weekday('2009-07-30');
+## PySpark 3.5.5 Result: {'weekday(2009-07-30)': 3, 'typeof(weekday(2009-07-30))': 'int', 'typeof(2009-07-30)': 'string'}
+#query
+#SELECT weekday('2009-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/weekofyear.slt b/datafusion/sqllogictest/test_files/spark/datetime/weekofyear.slt
new file mode 100644
index 0000000000000..30e69341d97d1
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/weekofyear.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT weekofyear('2008-02-20');
+## PySpark 3.5.5 Result: {'weekofyear(2008-02-20)': 8, 'typeof(weekofyear(2008-02-20))': 'int', 'typeof(2008-02-20)': 'string'}
+#query
+#SELECT weekofyear('2008-02-20'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/year.slt b/datafusion/sqllogictest/test_files/spark/datetime/year.slt
new file mode 100644
index 0000000000000..6577522736c07
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/datetime/year.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT year('2016-07-30');
+## PySpark 3.5.5 Result: {'year(2016-07-30)': 2016, 'typeof(year(2016-07-30))': 'int', 'typeof(2016-07-30)': 'string'}
+#query
+#SELECT year('2016-07-30'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/hash/crc32.slt b/datafusion/sqllogictest/test_files/spark/hash/crc32.slt
new file mode 100644
index 0000000000000..df5588c75837d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/hash/crc32.slt
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT crc32('Spark');
+## PySpark 3.5.5 Result: {'crc32(Spark)': 1557323817, 'typeof(crc32(Spark))': 'bigint', 'typeof(Spark)': 'string'}
+
+# Basic crc32 tests
+query I
+SELECT crc32('Spark');
+----
+1557323817
+
+query I
+SELECT crc32('');
+----
+0
+
+query I
+SELECT crc32(arrow_cast('', 'Binary'));
+----
+0
+
+# Test with different types
+query I
+SELECT crc32(NULL);
+----
+NULL
+
+query I
+SELECT crc32(arrow_cast('Spark', 'LargeUtf8'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast('Spark', 'Utf8View'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast('Spark', 'Utf8'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast('Spark', 'Binary'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast(arrow_cast('Spark', 'Binary'), 'FixedSizeBinary(5)'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast('Spark', 'LargeBinary'));
+----
+1557323817
+
+query I
+SELECT crc32(arrow_cast('Spark', 'BinaryView'));
+----
+1557323817
+
+query I
+select crc32(arrow_cast(null, 'Dictionary(Int32, Utf8)'))
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/hash/md5.slt b/datafusion/sqllogictest/test_files/spark/hash/md5.slt
new file mode 100644
index 0000000000000..f1a4b82e291ad
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/hash/md5.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT md5('Spark');
+## PySpark 3.5.5 Result: {'md5(Spark)': '8cde774d6f7333752ed72cacddb05126', 'typeof(md5(Spark))': 'string', 'typeof(Spark)': 'string'}
+#query
+#SELECT md5('Spark'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/hash/sha.slt b/datafusion/sqllogictest/test_files/spark/hash/sha.slt
new file mode 100644
index 0000000000000..c7710aa6a763f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/hash/sha.slt
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sha('Spark');
+## PySpark 3.5.5 Result: {'sha(Spark)': '85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c', 'typeof(sha(Spark))': 'string', 'typeof(Spark)': 'string'}
+
+# Basic sha tests
+query T
+SELECT sha('Spark');
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha(NULL);
+----
+NULL
+
+query T
+SELECT sha('');
+----
+da39a3ee5e6b4b0d3255bfef95601890afd80709
+
+# Test with LargeUtf8 (using CAST to ensure type)
+query T
+SELECT sha(arrow_cast('Spark', 'LargeUtf8'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+# Test with Utf8View (using CAST to ensure type)
+query T
+SELECT sha(arrow_cast('Spark', 'Utf8View'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+# Test with Binary
+query T
+SELECT sha(arrow_cast('Spark', 'Binary'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+# Test with LargeBinary
+query T
+SELECT sha(arrow_cast('Spark', 'LargeBinary'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+# Test with BinaryView
+query T
+SELECT sha(arrow_cast('Spark', 'BinaryView'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
diff --git a/datafusion/sqllogictest/test_files/spark/hash/sha1.slt b/datafusion/sqllogictest/test_files/spark/hash/sha1.slt
new file mode 100644
index 0000000000000..5185c45d090bd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/hash/sha1.slt
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sha1('Spark');
+## PySpark 3.5.5 Result: {'sha1(Spark)': '85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c', 'typeof(sha1(Spark))': 'string', 'typeof(Spark)': 'string'}
+
+# Basic sha1 tests
+query T
+SELECT sha1('Spark');
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha('Spark');
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1('');
+----
+da39a3ee5e6b4b0d3255bfef95601890afd80709
+
+# Test with different types
+query T
+SELECT sha1(NULL);
+----
+NULL
+
+query T
+SELECT sha1(arrow_cast('Spark', 'LargeUtf8'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast('Spark', 'Utf8View'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast('Spark', 'Utf8'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast('Spark', 'Binary'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast(arrow_cast('Spark', 'Binary'), 'FixedSizeBinary(5)'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast('Spark', 'LargeBinary'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
+
+query T
+SELECT sha1(arrow_cast('Spark', 'BinaryView'));
+----
+85f5955f4b27a9a4c2aab6ffe5d7189fc298b92c
diff --git a/datafusion/sqllogictest/test_files/spark/hash/sha2.slt b/datafusion/sqllogictest/test_files/spark/hash/sha2.slt
index e2341df164baa..07f70947fe926 100644
--- a/datafusion/sqllogictest/test_files/spark/hash/sha2.slt
+++ b/datafusion/sqllogictest/test_files/spark/hash/sha2.slt
@@ -18,48 +18,115 @@
 query T
 SELECT sha2('Spark', 0::INT);
 ----
-529BC3B07127ECB7E53A4DCF1991D9152C24537D919178022B2C42657F79A26B
+529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b
 
 query T
 SELECT sha2('Spark', 256::INT);
 ----
-529BC3B07127ECB7E53A4DCF1991D9152C24537D919178022B2C42657F79A26B
+529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b
 
 query T
 SELECT sha2('Spark', 224::INT);
 ----
-DBEAB94971678D36AF2195851C0F7485775A2A7C60073D62FC04549C
+dbeab94971678d36af2195851c0f7485775a2a7c60073d62fc04549c
 
 query T
 SELECT sha2('Spark', 384::INT);
 ----
-1E40B8D06C248A1CC32428C22582B6219D072283078FA140D9AD297ECADF2CABEFC341B857AD36226AA8D6D79F2AB67D
+1e40b8d06c248a1cc32428c22582b6219d072283078fa140d9ad297ecadf2cabefc341b857ad36226aa8d6d79f2ab67d
 
 query T
 SELECT sha2('Spark', 512::INT);
 ----
-44844A586C54C9A212DA1DBFE05C5F1705DE1AF5FDA1F0D36297623249B279FD8F0CCEC03F888F4FB13BF7CD83FDAD58591C797F81121A23CFDD5E0897795238
+44844a586c54c9a212da1dbfe05c5f1705de1af5fda1f0d36297623249b279fd8f0ccec03f888f4fb13bf7cd83fdad58591c797f81121a23cfdd5e0897795238
+
+query T
+SELECT sha2('Spark', 128::INT);
+----
+NULL
 
 query T
 SELECT sha2(expr, 256::INT) FROM VALUES ('foo'), ('bar') AS t(expr);
 ----
-2C26B46B68FFC68FF99B453C1D30413413422D706483BFA0F98A5E886266E7AE
-FCDE2B2EDBA56BF408601FB721FE9B5C338D10EE429EA04FAE5511B68FBF8FB9
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9
+
+query T
+SELECT sha2(expr, 128::INT) FROM VALUES ('foo'), ('bar') AS t(expr);
+----
+NULL
+NULL
+
+query T
+SELECT sha2('foo', bit_length) FROM VALUES (0::INT), (256::INT), (224::INT), (384::INT), (512::INT), (128::INT) AS t(bit_length);
+----
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+98c11ffdfdd540676b1a137cb1a22b2a70350c9a44171d6b1180c6be5cbb2ee3f79d532c8a1dd9ef2e8e08e752a3babb
+f7fbba6e0636f890e56fbbf3283e524c6fa3204ae298382d624741d0dc6638326e282c41be5e4254d8820772c5518a2c5a8c0c7f7eda19594a7eb539453e1ed7
+NULL
 
 query T
-SELECT sha2('foo', bit_length) FROM VALUES (0::INT), (256::INT), (224::INT), (384::INT), (512::INT) AS t(bit_length);
+SELECT sha2(expr, bit_length) FROM VALUES ('foo',0::INT), ('bar',224::INT), ('baz',384::INT), ('qux',512::INT), ('qux',128::INT) AS t(expr, bit_length);
 ----
-2C26B46B68FFC68FF99B453C1D30413413422D706483BFA0F98A5E886266E7AE
-2C26B46B68FFC68FF99B453C1D30413413422D706483BFA0F98A5E886266E7AE
-0808F64E60D58979FCB676C96EC938270DEA42445AEEFCD3A4E6F8DB
-98C11FFDFDD540676B1A137CB1A22B2A70350C9A44171D6B1180C6BE5CBB2EE3F79D532C8A1DD9EF2E8E08E752A3BABB
-F7FBBA6E0636F890E56FBBF3283E524C6FA3204AE298382D624741D0DC6638326E282C41BE5E4254D8820772C5518A2C5A8C0C7F7EDA19594A7EB539453E1ED7
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+07daf010de7f7f0d8d76a76eb8d1eb40182c8d1e7a3877a6686c9bf0
+967004d25de4abc1bd6a7c9a216254a5ac0733e8ad96dc9f1ea0fad9619da7c32d654ec8ad8ba2f9b5728fed6633bd91
+8c6be9ed448a34883a13a13f4ead4aefa036b67dcda59020c01e57ea075ea8a4792d428f2c6fd0c09d1c49994d6c22789336e062188df29572ed07e7f9779c52
+NULL
 
+# All string types
+query T
+SELECT sha2(arrow_cast('foo', 'Utf8'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+query T
+SELECT sha2(arrow_cast('foo', 'LargeUtf8'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+query T
+SELECT sha2(arrow_cast('foo', 'Utf8View'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+# All binary types
+query T
+SELECT sha2(arrow_cast('foo', 'Binary'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+query T
+SELECT sha2(arrow_cast('foo', 'LargeBinary'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+query T
+SELECT sha2(arrow_cast('foo', 'BinaryView'), bit_length) FROM VALUES (224::INT), (256::INT) AS t(bit_length);
+----
+0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db
+2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae
+
+
+# Null cases
+query T
+select sha2(null, 0);
+----
+NULL
+
+query T
+select sha2('a', null);
+----
+NULL
 
 query T
-SELECT sha2(expr, bit_length) FROM VALUES ('foo',0::INT), ('bar',224::INT), ('baz',384::INT), ('qux',512::INT) AS t(expr, bit_length);
+select sha2('a', null::int);
 ----
-2C26B46B68FFC68FF99B453C1D30413413422D706483BFA0F98A5E886266E7AE
-07DAF010DE7F7F0D8D76A76EB8D1EB40182C8D1E7A3877A6686C9BF0
-967004D25DE4ABC1BD6A7C9A216254A5AC0733E8AD96DC9F1EA0FAD9619DA7C32D654EC8AD8BA2F9B5728FED6633BD91
-8C6BE9ED448A34883A13A13F4EAD4AEFA036B67DCDA59020C01E57EA075EA8A4792D428F2C6FD0C09D1C49994D6C22789336E062188DF29572ED07E7F9779C52
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/json/get_json_object.slt b/datafusion/sqllogictest/test_files/spark/json/get_json_object.slt
new file mode 100644
index 0000000000000..7917ee1168766
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/json/get_json_object.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT get_json_object('{"a":"b"}', '$.a');
+## PySpark 3.5.5 Result: {'get_json_object({"a":"b"}, $.a)': 'b', 'typeof(get_json_object({"a":"b"}, $.a))': 'string', 'typeof({"a":"b"})': 'string', 'typeof($.a)': 'string'}
+#query
+#SELECT get_json_object('{"a":"b"}'::string, '$.a'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/json/json_object_keys.slt b/datafusion/sqllogictest/test_files/spark/json/json_object_keys.slt
new file mode 100644
index 0000000000000..ce399c5820a27
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/json/json_object_keys.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}');
+## PySpark 3.5.5 Result: {'json_object_keys({"f1":"abc","f2":{"f3":"a", "f4":"b"}})': ['f1', 'f2'], 'typeof(json_object_keys({"f1":"abc","f2":{"f3":"a", "f4":"b"}}))': 'array<string>', 'typeof({"f1":"abc","f2":{"f3":"a", "f4":"b"}})': 'string'}
+#query
+#SELECT json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}'::string);
+
+## Original Query: SELECT json_object_keys('{"key": "value"}');
+## PySpark 3.5.5 Result: {'json_object_keys({"key": "value"})': ['key'], 'typeof(json_object_keys({"key": "value"}))': 'array<string>', 'typeof({"key": "value"})': 'string'}
+#query
+#SELECT json_object_keys('{"key": "value"}'::string);
+
+## Original Query: SELECT json_object_keys('{}');
+## PySpark 3.5.5 Result: {'json_object_keys({})': [], 'typeof(json_object_keys({}))': 'array<string>', 'typeof({})': 'string'}
+#query
+#SELECT json_object_keys('{}'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt
new file mode 100644
index 0000000000000..c0c424946709f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/json/json_tuple.slt
@@ -0,0 +1,154 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for Spark-compatible json_tuple function
+# https://spark.apache.org/docs/latest/api/sql/index.html#json_tuple
+#
+# Test cases derived from Spark JsonExpressionsSuite:
+# https://github.com/apache/spark/blob/master/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+
+# Scalar: hive key 1
+query ?
+SELECT json_tuple('{"f1":"value1","f2":"value2","f3":3,"f5":5.23}'::STRING, 'f1'::STRING, 'f2'::STRING, 'f3'::STRING, 'f4'::STRING, 'f5'::STRING);
+----
+{c0: value1, c1: value2, c2: 3, c3: NULL, c4: 5.23}
+
+# Scalar: hive key 2
+query ?
+SELECT json_tuple('{"f1":"value12","f3":"value3","f2":2,"f4":4.01}'::STRING, 'f1'::STRING, 'f2'::STRING, 'f3'::STRING, 'f4'::STRING, 'f5'::STRING);
+----
+{c0: value12, c1: 2, c2: value3, c3: 4.01, c4: NULL}
+
+# Scalar: hive key 3
+query ?
+SELECT json_tuple('{"f1":"value13","f4":"value44","f3":"value33","f2":2,"f5":5.01}'::STRING, 'f1'::STRING, 'f2'::STRING, 'f3'::STRING, 'f4'::STRING, 'f5'::STRING);
+----
+{c0: value13, c1: 2, c2: value33, c3: value44, c4: 5.01}
+
+# Scalar: null JSON input
+query ?
+SELECT json_tuple(NULL::STRING, 'f1'::STRING, 'f2'::STRING, 'f3'::STRING, 'f4'::STRING, 'f5'::STRING);
+----
+NULL
+
+# Scalar: null and empty values
+query ?
+SELECT json_tuple('{"f1":"","f5":null}'::STRING, 'f1'::STRING, 'f2'::STRING, 'f3'::STRING, 'f4'::STRING, 'f5'::STRING);
+----
+{c0: , c1: NULL, c2: NULL, c3: NULL, c4: NULL}
+
+# Scalar: invalid JSON (array)
+query ?
+SELECT json_tuple('[invalid JSON string]'::STRING, 'f1'::STRING);
+----
+NULL
+
+# Scalar: invalid JSON (start only)
+query ?
+SELECT json_tuple('{'::STRING, 'f1'::STRING);
+----
+NULL
+
+# Scalar: invalid JSON (no closing brace)
+query ?
+SELECT json_tuple('{"foo":"bar"'::STRING, 'f1'::STRING);
+----
+NULL
+
+# Scalar: invalid JSON (backslash)
+query ?
+SELECT json_tuple('\'::STRING, 'f1'::STRING);
+----
+NULL
+
+# Scalar: invalid JSON (quoted string, not an object)
+query ?
+SELECT json_tuple('"quote'::STRING, '"quote'::STRING);
+----
+NULL
+
+# Scalar: empty JSON object
+query ?
+SELECT json_tuple('{}'::STRING, 'a'::STRING);
+----
+{c0: NULL}
+
+# Array: multi-row test
+query ?
+SELECT json_tuple(col, 'f1'::STRING, 'f2'::STRING) FROM (VALUES
+    ('{"f1":"a","f2":"b"}'::STRING),
+    (NULL::STRING),
+    ('{"f1":"c"}'::STRING),
+    ('invalid'::STRING)
+) AS t(col);
+----
+{c0: a, c1: b}
+NULL
+{c0: c, c1: NULL}
+NULL
+
+# Array: SPARK-21677 null field key
+query ?
+SELECT json_tuple(col1, col2, col3, col4) FROM (VALUES
+    ('{"f1":1,"f2":2}'::STRING, 'f1'::STRING, NULL::STRING, 'f2'::STRING)
+) AS t(col1, col2, col3, col4);
+----
+{c0: 1, c1: NULL, c2: 2}
+
+# Array: SPARK-21804 repeated field
+query ?
+SELECT json_tuple(col1, col2, col3, col4) FROM (VALUES
+    ('{"f1":1,"f2":2}'::STRING, 'f1'::STRING, NULL::STRING, 'f1'::STRING)
+) AS t(col1, col2, col3, col4);
+----
+{c0: 1, c1: NULL, c2: 1}
+
+# Edge case: both json and field key are null
+query ?
+SELECT json_tuple(NULL::STRING, NULL::STRING);
+----
+NULL
+
+# Edge case: empty string json and empty string key
+query ?
+SELECT json_tuple(''::STRING, ''::STRING);
+----
+NULL
+
+# Edge case: mixed upper/lower case keys
+query ?
+SELECT json_tuple('{"Name":"Alice","name":"bob","NAME":"Charlie"}'::STRING, 'Name'::STRING, 'name'::STRING, 'NAME'::STRING);
+----
+{c0: Alice, c1: bob, c2: Charlie}
+
+# Edge case: UTF-8 Chinese characters
+query ?
+SELECT json_tuple('{"姓名":"小明","城市":"台北"}'::STRING, '姓名'::STRING, '城市'::STRING);
+----
+{c0: 小明, c1: 台北}
+
+# Edge case: UTF-8 Cyrillic characters
+query ?
+SELECT json_tuple('{"имя":"Иван","город":"Москва"}'::STRING, 'имя'::STRING, 'город'::STRING);
+----
+{c0: Иван, c1: Москва}
+
+# Verify return type with arrow_typeof
+query T
+SELECT arrow_typeof(json_tuple('{"a":1}'::STRING, 'a'::STRING));
+----
+Struct("c0": Utf8)
diff --git a/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt b/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt
new file mode 100644
index 0000000000000..a26b0435c9291
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/map/map_from_arrays.slt
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Spark doctests
+query ?
+SELECT map_from_arrays(array(1.0, 3.0), array('2', '4'));
+----
+{1.0: 2, 3.0: 4}
+
+query ?
+SELECT map_from_arrays(array(2, 5), array('a', 'b'));
+----
+{2: a, 5: b}
+
+query ?
+SELECT map_from_arrays(array(1, 2), array('a', NULL));
+----
+{1: a, 2: NULL}
+
+query ?
+SELECT map_from_arrays(cast(array() as array<int>), cast(array() as array<string>));
+----
+{}
+
+# Tests with DataType:Null input arrays
+query ?
+SELECT map_from_arrays(NULL, NULL);
+----
+NULL
+
+query ?
+SELECT map_from_arrays(array(1), NULL);
+----
+NULL
+
+query ?
+SELECT map_from_arrays(NULL, array(1));
+----
+NULL
+
+# Tests with different inner lists lengths
+query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths
+SELECT map_from_arrays(array(1, 2, 3), array('a', 'b'));
+
+query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths
+SELECT map_from_arrays(array(), array('a', 'b'));
+
+query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths
+SELECT map_from_arrays(array(1, 2, 3), array());
+
+query error DataFusion error: Execution error: map_deduplicate_keys: keys and values lists in the same row must have equal lengths
+select map_from_arrays(a, b)
+from values 
+    (array[1], array[1]),
+    (array[2, 3, 4], array[2, 3]),
+    (array[5], array[4])
+as tab(a, b);
+
+#Test with multiple rows: good, empty and nullable
+query ?
+select map_from_arrays(a, b)
+from values 
+    (array[1], array['a']), 
+    (NULL, NULL),
+    (array[1,2,3], NULL),
+    (NULL, array['b', 'c']), 
+    (array[4, 5], array['d', 'e']), 
+    (array[], array[]),
+    (array[6, 7, 8], array['f', 'g', 'h']) 
+as tab(a, b);
+----
+{1: a}
+NULL
+NULL
+NULL
+{4: d, 5: e}
+{}
+{6: f, 7: g, 8: h}
+
+# Test with complex types
+query ?
+SELECT map_from_arrays(array(array('a', 'b'), array('c', 'd')), array(struct(1, 2, 3), struct(4, 5, 6)));
+----
+{[a, b]: {c0: 1, c1: 2, c2: 3}, [c, d]: {c0: 4, c1: 5, c2: 6}}
+
+# Test with nested function calls
+query ?
+SELECT
+    map_from_arrays(
+        array['outer_key1', 'outer_key2'],
+        array[
+            -- value for outer_key1: a map itself
+            map_from_arrays(
+                array['inner_a', 'inner_b'],
+                array[1, 2]
+            ),
+            -- value for outer_key2: another map
+            map_from_arrays(
+                array['inner_x', 'inner_y', 'inner_z'],
+                array[10, 20, 30]
+            )
+        ]
+    ) AS nested_map;
+----
+{outer_key1: {inner_a: 1, inner_b: 2}, outer_key2: {inner_x: 10, inner_y: 20, inner_z: 30}}
+
+# Test with duplicate keys
+query ?
+SELECT map_from_arrays(array(true, false, true), array('a', NULL, 'b'));
+----
+{false: NULL, true: b}
+
+# Tests with different list types
+query ?
+SELECT map_from_arrays(arrow_cast(array(2, 5), 'LargeList(Int32)'), arrow_cast(array('a', 'b'), 'FixedSizeList(2, Utf8)'));
+----
+{2: a, 5: b}
+
+query ?
+SELECT map_from_arrays(arrow_cast(array('a', 'b', 'c'), 'FixedSizeList(3, Utf8)'), arrow_cast(array(1, 2, 3), 'LargeList(Int32)'));
+----
+{a: 1, b: 2, c: 3}
diff --git a/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt b/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt
new file mode 100644
index 0000000000000..19b46886a027e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/map/map_from_entries.slt
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Spark doctests
+query ?
+SELECT map_from_entries(array[struct(1, 'a'), struct(2, 'b')]);
+----
+{1: a, 2: b}
+
+query ?
+SELECT map_from_entries(array[struct(1, cast(null as string)), struct(2, 'b')]);
+----
+{1: NULL, 2: b}
+
+query ?
+SELECT map_from_entries(data) 
+from values 
+    (array[struct(1, 'a'), struct(2, 'b')]), 
+    (array[struct(3, 'c')]) 
+as tab(data);
+----
+{1: a, 2: b}
+{3: c}
+
+# Tests with NULL and empty input structarrays
+query ?
+SELECT map_from_entries(data) 
+from values 
+    (cast(array[] as array<struct<int, string>>)), 
+    (cast(NULL as array<struct<int, string>>)) 
+as tab(data);
+----
+{}
+NULL
+
+# Test with NULL key, should fail
+query error DataFusion error: Arrow error: Invalid argument error: Found unmasked nulls for non-nullable StructArray field "key"
+SELECT map_from_entries(array[struct(NULL, 1)]);
+
+# Tests with NULL and array of Null type, should fail
+query error DataFusion error: Execution error: map_from_entries: expected array<struct<key, value>>, got Null
+SELECT map_from_entries(NULL);
+
+query error DataFusion error: Execution error: map_from_entries: expected array<struct<key, value>>, got Null
+SELECT map_from_entries(array[NULL]);
+
+# Test with NULL array and NULL entries in arrays
+# output is NULL if any entry is NULL
+query ?
+SELECT map_from_entries(data)
+from values
+    (
+        array[
+            struct(1 as a, 'a' as b), 
+            cast(NULL as struct<a int, b string>), 
+            cast(NULL as struct<a int, b string>)
+        ]
+    ),
+    (NULL),
+    (
+        array[
+            struct(2 as a, 'b' as b), 
+            struct(3 as a, 'c' as b)
+        ]
+    ),
+    (
+        array[
+            struct(4 as a, 'd' as b),
+            cast(NULL as struct<a int, b string>),
+            struct(5 as a, 'e' as b),
+            struct(6 as a, 'f' as b)
+        ]
+    )
+as tab(data);
+----
+NULL
+NULL
+{2: b, 3: c}
+NULL
+
+#Test with multiple rows: good, empty and nullable
+query ?
+SELECT map_from_entries(data) 
+from values 
+    (NULL), 
+    (array[
+        struct(1 as a, 'b' as b), 
+        struct(2 as a, cast(NULL as string) as b), 
+        struct(3 as a, 'd' as b)
+    ]), 
+    (array[]),
+    (NULL) 
+as tab(data);
+----
+NULL
+{1: b, 2: NULL, 3: d}
+{}
+NULL
+
+# Test with complex types
+query ?
+SELECT map_from_entries(array[
+    struct(array('a', 'b'), struct(1, 2, 3)), 
+    struct(array('c', 'd'), struct(4, 5, 6))
+]);
+----
+{[a, b]: {c0: 1, c1: 2, c2: 3}, [c, d]: {c0: 4, c1: 5, c2: 6}}
+
+# Test with nested function calls
+query ?
+SELECT
+    map_from_entries(
+        array[
+            struct(
+                'outer_key1', 
+                -- value for outer_key1: a map itself
+                map_from_entries(
+                    array[
+                        struct('inner_a', 1), 
+                        struct('inner_b', 2)
+                    ]
+                )
+            ),
+            struct(
+                'outer_key2',
+                -- value for outer_key2: another map
+                map_from_entries(
+                    array[
+                        struct('inner_x', 10), 
+                        struct('inner_y', 20), 
+                        struct('inner_z', 30)
+                    ]
+                )
+            )
+        ]
+    ) AS nested_map;
+----
+{outer_key1: {inner_a: 1, inner_b: 2}, outer_key2: {inner_x: 10, inner_y: 20, inner_z: 30}}
+
+# Test with duplicate keys
+query ?
+SELECT map_from_entries(array(
+    struct(true, 'a'), 
+    struct(false, 'b'), 
+    struct(true, 'c'),
+    struct(false, cast(NULL as string)), 
+    struct(true, 'd')
+));
+----
+{false: NULL, true: d}
diff --git a/datafusion/sqllogictest/test_files/spark/map/str_to_map.slt b/datafusion/sqllogictest/test_files/spark/map/str_to_map.slt
new file mode 100644
index 0000000000000..30d1672aef0ae
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/map/str_to_map.slt
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for Spark-compatible str_to_map function
+# https://spark.apache.org/docs/latest/api/sql/index.html#str_to_map
+#
+# Test cases derived from Spark test("StringToMap"):
+# https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala#L525-L618
+
+# s0: Basic test with default delimiters
+query ?
+SELECT str_to_map('a:1,b:2,c:3');
+----
+{a: 1, b: 2, c: 3}
+
+# s1: Preserve spaces in values
+query ?
+SELECT str_to_map('a: ,b:2');
+----
+{a:  , b: 2}
+
+# s2: Custom key-value delimiter '='
+query ?
+SELECT str_to_map('a=1,b=2,c=3', ',', '=');
+----
+{a: 1, b: 2, c: 3}
+
+# s3: Empty string returns map with empty key and NULL value
+query ?
+SELECT str_to_map('', ',', '=');
+----
+{: NULL}
+
+# s4: Custom pair delimiter '_'
+query ?
+SELECT str_to_map('a:1_b:2_c:3', '_', ':');
+----
+{a: 1, b: 2, c: 3}
+
+# s5: Single key without value returns NULL value
+query ?
+SELECT str_to_map('a');
+----
+{a: NULL}
+
+# s6: Custom delimiters '&' and '='
+query ?
+SELECT str_to_map('a=1&b=2&c=3', '&', '=');
+----
+{a: 1, b: 2, c: 3}
+
+# Duplicate keys: EXCEPTION policy (Spark 3.0+ default)
+# TODO: Add LAST_WIN policy tests when spark.sql.mapKeyDedupPolicy config is supported
+statement error
+Duplicate map key
+SELECT str_to_map('a:1,b:2,a:3');
+
+# Additional tests (DataFusion-specific)
+
+# NULL input returns NULL
+query ?
+SELECT str_to_map(NULL, ',', ':');
+----
+NULL
+
+# Explicit 3-arg form
+query ?
+SELECT str_to_map('a:1,b:2,c:3', ',', ':');
+----
+{a: 1, b: 2, c: 3}
+
+# Missing key-value delimiter results in NULL value
+query ?
+SELECT str_to_map('a,b:2', ',', ':');
+----
+{a: NULL, b: 2}
+
+# Multi-row test
+query ?
+SELECT str_to_map(col) FROM (VALUES ('a:1,b:2'), ('x:9'), (NULL)) AS t(col);
+----
+{a: 1, b: 2}
+{x: 9}
+NULL
+
+# Multi-row with custom delimiter
+query ?
+SELECT str_to_map(col, ',', '=') FROM (VALUES ('a=1,b=2'), ('x=9'), (NULL)) AS t(col);
+----
+{a: 1, b: 2}
+{x: 9}
+NULL
+
+# Per-row delimiters: each row can have different delimiters
+query ?
+SELECT str_to_map(col1, col2, col3) FROM (VALUES ('a=1,b=2', ',', '='), ('x#9', ',', '#'), (NULL, ',', '=')) AS t(col1, col2, col3);
+----
+{a: 1, b: 2}
+{x: 9}
+NULL
\ No newline at end of file
diff --git a/datafusion/sqllogictest/test_files/spark/math/abs.slt b/datafusion/sqllogictest/test_files/spark/math/abs.slt
new file mode 100644
index 0000000000000..94092caab9854
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/abs.slt
@@ -0,0 +1,213 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT abs(-1);
+## PySpark 3.5.5 Result: {'abs(-1)': 1, 'typeof(abs(-1))': 'int', 'typeof(-1)': 'int'}
+
+# Scalar input
+## Scalar input: signed int and NULL
+query IIIIR
+SELECT abs(-127::TINYINT), abs(-32767::SMALLINT), abs(-2147483647::INT), abs(-9223372036854775807::BIGINT), abs(NULL);
+----
+127 32767 2147483647 9223372036854775807 NULL
+
+## Scalar input: signed int minimal values
+## See https://github.com/apache/datafusion/issues/18794 for operator precedence
+query IIII
+select abs((-128)::TINYINT), abs((-32768)::SMALLINT), abs((-2147483648)::INT), abs((-9223372036854775808)::BIGINT);
+----
+-128 -32768 -2147483648 -9223372036854775808
+
+## Scalar input: Spark ANSI mode, signed int minimal values
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+query error DataFusion error: Arrow error: Compute error: Int8 overflow on abs\(\-128\)
+select abs((-128)::TINYINT);
+
+query error DataFusion error: Arrow error: Compute error: Int16 overflow on abs\(\-32768\)
+select abs((-32768)::SMALLINT);
+
+query error DataFusion error: Arrow error: Compute error: Int32 overflow on abs\(\-2147483648\)
+select abs((-2147483648)::INT);
+
+query error DataFusion error: Arrow error: Compute error: Int64 overflow on abs\(\-9223372036854775808\)
+select abs((-9223372036854775808)::BIGINT);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+
+## Scalar input: float, NULL, NaN, -0, infinity, -infinity
+query RRRRRRRRRRRR
+SELECT abs(-1.0::FLOAT), abs(0.::FLOAT), abs(-0.::FLOAT), abs(-0::FLOAT), abs(NULL::FLOAT), abs('NaN'::FLOAT), abs('inf'::FLOAT), abs('+inf'::FLOAT), abs('-inf'::FLOAT), abs('infinity'::FLOAT), abs('+infinity'::FLOAT), abs('-infinity'::FLOAT);
+----
+1 0 0 0 NULL NaN Infinity Infinity Infinity Infinity Infinity Infinity
+
+## Scalar input: double, NULL, NaN, -0, infinity, -infinity
+query RRRRRRRRRRRR
+SELECT abs(-1.0::DOUBLE), abs(0.::DOUBLE), abs(-0.::DOUBLE), abs(-0::DOUBLE), abs(NULL::DOUBLE), abs('NaN'::DOUBLE), abs('inf'::DOUBLE), abs('+inf'::DOUBLE), abs('-inf'::DOUBLE), abs('infinity'::DOUBLE), abs('+infinity'::DOUBLE), abs('-infinity'::DOUBLE);
+----
+1 0 0 0 NULL NaN Infinity Infinity Infinity Infinity Infinity Infinity
+
+## Scalar input: decimal128
+query RRR
+SELECT abs(('-99999999.99')::DECIMAL(10, 2)), abs(0::DECIMAL(10, 2)), abs(NULL::DECIMAL(10, 2));
+----
+99999999.99 0 NULL
+
+query RRR
+SELECT abs(('-9999999999999999999999999999.9999999999')::DECIMAL(38, 10)), abs(0::DECIMAL(38, 10)), abs(NULL::DECIMAL(38, 10));
+----
+9999999999999999999999999999.9999999999 0 NULL
+
+## Scalar input: decimal256
+query RRR
+SELECT abs(('-99999999999999999999999999999999999999.99')::DECIMAL(40, 2)), abs(0::DECIMAL(40, 2)), abs(NULL::DECIMAL(40, 2));
+----
+99999999999999999999999999999999999999.99 0 NULL
+
+query RRR
+SELECT abs(('-999999999999999999999999999999999999999999999999999999999999999999.9999999999')::DECIMAL(76, 10)), abs(0::DECIMAL(76, 10)), abs(NULL::DECIMAL(76, 10));
+----
+999999999999999999999999999999999999999999999999999999999999999999.9999999999 0 NULL
+
+
+# Array input
+## Array input: signed int, signed int minimal values and NULL
+query I
+SELECT abs(a) FROM (VALUES (-127::TINYINT), ((-128)::TINYINT), (NULL)) AS t(a);
+----
+127
+-128
+NULL
+
+query I
+select abs(a) FROM (VALUES (-32767::SMALLINT), ((-32768)::SMALLINT), (NULL)) AS t(a);
+----
+32767
+-32768
+NULL
+
+query I
+select abs(a) FROM (VALUES (-2147483647::INT), ((-2147483648)::INT), (NULL)) AS t(a);
+----
+2147483647
+-2147483648
+NULL
+
+query I
+select abs(a) FROM (VALUES (-9223372036854775807::BIGINT), ((-9223372036854775808)::BIGINT), (NULL)) AS t(a);
+----
+9223372036854775807
+-9223372036854775808
+NULL
+
+## Array Input: Spark ANSI mode, signed int minimal values
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+query error DataFusion error: Arrow error: Compute error: Int8Array overflow on abs\(\-128\)
+SELECT abs(a) FROM (VALUES (-127::TINYINT), ((-128)::TINYINT)) AS t(a);
+
+query error DataFusion error: Arrow error: Compute error: Int16Array overflow on abs\(\-32768\)
+select abs(a) FROM (VALUES (-32767::SMALLINT), ((-32768)::SMALLINT)) AS t(a);
+
+query error DataFusion error: Arrow error: Compute error: Int32Array overflow on abs\(\-2147483648\)
+select abs(a) FROM (VALUES (-2147483647::INT), ((-2147483648)::INT)) AS t(a);
+
+query error DataFusion error: Arrow error: Compute error: Int64Array overflow on abs\(\-9223372036854775808\)
+select abs(a) FROM (VALUES (-9223372036854775807::BIGINT), ((-9223372036854775808)::BIGINT)) AS t(a);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+
+## Array input: float, NULL, NaN, -0, infinity, -infinity
+query R
+SELECT abs(a) FROM (VALUES (-1.0::FLOAT), (0.::FLOAT), (-0.::FLOAT), (-0::FLOAT), (NULL::FLOAT), ('NaN'::FLOAT), ('inf'::FLOAT), ('+inf'::FLOAT), ('-inf'::FLOAT), ('infinity'::FLOAT), ('+infinity'::FLOAT), ('-infinity'::FLOAT)) AS t(a);
+----
+1
+0
+0
+0
+NULL
+NaN
+Infinity
+Infinity
+Infinity
+Infinity
+Infinity
+Infinity
+
+
+## Array input: double, NULL, NaN, -0, infinity, -infinity
+query R
+SELECT abs(a) FROM (VALUES (-1.0::DOUBLE), (0.::DOUBLE), (-0.::DOUBLE), (-0::DOUBLE), (NULL::DOUBLE), ('NaN'::DOUBLE), ('inf'::DOUBLE), ('+inf'::DOUBLE), ('-inf'::DOUBLE), ('infinity'::DOUBLE), ('+infinity'::DOUBLE), ('-infinity'::DOUBLE)) AS t(a);
+----
+1
+0
+0
+0
+NULL
+NaN
+Infinity
+Infinity
+Infinity
+Infinity
+Infinity
+Infinity
+
+## Array input: decimal128
+query R
+SELECT abs(a) FROM (VALUES (('-99999999.99')::DECIMAL(10, 2)), (0::DECIMAL(10, 2)), (NULL::DECIMAL(10, 2))) AS t(a);
+----
+99999999.99
+0
+NULL
+
+query R
+SELECT abs(a) FROM (VALUES (('-9999999999999999999999999999.9999999999')::DECIMAL(38, 10)), (0::DECIMAL(38, 10)), (NULL::DECIMAL(38, 10))) AS t(a);
+----
+9999999999999999999999999999.9999999999
+0
+NULL
+
+## Array input: decimal256
+query R
+SELECT abs(a) FROM (VALUES (('-99999999999999999999999999999999999999.99')::DECIMAL(40, 2)), (0::DECIMAL(40, 2)), (NULL::DECIMAL(40, 2))) AS t(a);
+----
+99999999999999999999999999999999999999.99
+0
+NULL
+
+query R
+SELECT abs(a) FROM (VALUES (('-999999999999999999999999999999999999999999999999999999999999999999.9999999999')::DECIMAL(76, 10)), (0::DECIMAL(76, 10)), (NULL::DECIMAL(76, 10))) AS t(a);
+----
+999999999999999999999999999999999999999999999999999999999999999999.9999999999
+0
+NULL
+
+## Original Query: SELECT abs(INTERVAL -'1-1' YEAR TO MONTH);
+## PySpark 3.5.5 Result: {"abs(INTERVAL '-1-1' YEAR TO MONTH)": 13, "typeof(abs(INTERVAL '-1-1' YEAR TO MONTH))": 'interval year to month', "typeof(INTERVAL '-1-1' YEAR TO MONTH)": 'interval year to month'}
+#query
+#SELECT abs(INTERVAL '-1-1' YEAR TO MONTH::interval year to month);
+# See GitHub issue for ANSI interval support: https://github.com/apache/datafusion/issues/18793
diff --git a/datafusion/sqllogictest/test_files/spark/math/acos.slt b/datafusion/sqllogictest/test_files/spark/math/acos.slt
new file mode 100644
index 0000000000000..76ee5694254b8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/acos.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT acos(1);
+## PySpark 3.5.5 Result: {'ACOS(1)': 0.0, 'typeof(ACOS(1))': 'double', 'typeof(1)': 'int'}
+#query
+#SELECT acos(1::int);
+
+## Original Query: SELECT acos(2);
+## PySpark 3.5.5 Result: {'ACOS(2)': nan, 'typeof(ACOS(2))': 'double', 'typeof(2)': 'int'}
+#query
+#SELECT acos(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/acosh.slt b/datafusion/sqllogictest/test_files/spark/math/acosh.slt
new file mode 100644
index 0000000000000..45b4537419ea6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/acosh.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT acosh(0);
+## PySpark 3.5.5 Result: {'ACOSH(0)': nan, 'typeof(ACOSH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT acosh(0::int);
+
+## Original Query: SELECT acosh(1);
+## PySpark 3.5.5 Result: {'ACOSH(1)': 0.0, 'typeof(ACOSH(1))': 'double', 'typeof(1)': 'int'}
+#query
+#SELECT acosh(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/asin.slt b/datafusion/sqllogictest/test_files/spark/math/asin.slt
new file mode 100644
index 0000000000000..5c6d265ff036e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/asin.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT asin(0);
+## PySpark 3.5.5 Result: {'ASIN(0)': 0.0, 'typeof(ASIN(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT asin(0::int);
+
+## Original Query: SELECT asin(2);
+## PySpark 3.5.5 Result: {'ASIN(2)': nan, 'typeof(ASIN(2))': 'double', 'typeof(2)': 'int'}
+#query
+#SELECT asin(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/asinh.slt b/datafusion/sqllogictest/test_files/spark/math/asinh.slt
new file mode 100644
index 0000000000000..7d965dea2bd77
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/asinh.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT asinh(0);
+## PySpark 3.5.5 Result: {'ASINH(0)': 0.0, 'typeof(ASINH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT asinh(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/atan.slt b/datafusion/sqllogictest/test_files/spark/math/atan.slt
new file mode 100644
index 0000000000000..b5817b08049ce
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/atan.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT atan(0);
+## PySpark 3.5.5 Result: {'ATAN(0)': 0.0, 'typeof(ATAN(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT atan(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/atan2.slt b/datafusion/sqllogictest/test_files/spark/math/atan2.slt
new file mode 100644
index 0000000000000..eb644854c402d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/atan2.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT atan2(0, 0);
+## PySpark 3.5.5 Result: {'ATAN2(0, 0)': 0.0, 'typeof(ATAN2(0, 0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT atan2(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/atanh.slt b/datafusion/sqllogictest/test_files/spark/math/atanh.slt
new file mode 100644
index 0000000000000..7e79f8c7bee58
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/atanh.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT atanh(0);
+## PySpark 3.5.5 Result: {'ATANH(0)': 0.0, 'typeof(ATANH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT atanh(0::int);
+
+## Original Query: SELECT atanh(2);
+## PySpark 3.5.5 Result: {'ATANH(2)': nan, 'typeof(ATANH(2))': 'double', 'typeof(2)': 'int'}
+#query
+#SELECT atanh(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/bin.slt b/datafusion/sqllogictest/test_files/spark/math/bin.slt
new file mode 100644
index 0000000000000..b2e2aadde44b6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/bin.slt
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT bin(arrow_cast(NULL, 'Int8'));
+----
+NULL
+
+query T
+SELECT bin(arrow_cast(0, 'Int8'));
+----
+0
+
+query T
+SELECT bin(arrow_cast(13, 'Int8'));
+----
+1101
+
+query T
+SELECT bin(arrow_cast(13.36, 'Float16'));
+----
+1101
+
+query T
+SELECT bin(13.3::decimal(3,1));
+----
+1101
+
+query T
+SELECT bin(arrow_cast(-13, 'Int8'));
+----
+1111111111111111111111111111111111111111111111111111111111110011
+
+query T
+SELECT bin(arrow_cast(256, 'Int16'));
+----
+100000000
+
+query T
+SELECT bin(arrow_cast(-32768, 'Int16'));
+----
+1111111111111111111111111111111111111111111111111000000000000000
+
+query T
+SELECT bin(arrow_cast(-2147483648, 'Int32'));
+----
+1111111111111111111111111111111110000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(1073741824, 'Int32'));
+----
+1000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(-9223372036854775808, 'Int64'));
+----
+1000000000000000000000000000000000000000000000000000000000000000
+
+query T
+SELECT bin(arrow_cast(9223372036854775807, 'Int64'));
+----
+111111111111111111111111111111111111111111111111111111111111111
diff --git a/datafusion/sqllogictest/test_files/spark/math/bround.slt b/datafusion/sqllogictest/test_files/spark/math/bround.slt
new file mode 100644
index 0000000000000..afdc9c635c9a7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/bround.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT bround(2.5, 0);
+## PySpark 3.5.5 Result: {'bround(2.5, 0)': Decimal('2'), 'typeof(bround(2.5, 0))': 'decimal(2,0)', 'typeof(2.5)': 'decimal(2,1)', 'typeof(0)': 'int'}
+#query
+#SELECT bround(2.5::decimal(2,1), 0::int);
+
+## Original Query: SELECT bround(25, -1);
+## PySpark 3.5.5 Result: {'bround(25, -1)': 20, 'typeof(bround(25, -1))': 'int', 'typeof(25)': 'int', 'typeof(-1)': 'int'}
+#query
+#SELECT bround(25::int, -1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/cbrt.slt b/datafusion/sqllogictest/test_files/spark/math/cbrt.slt
new file mode 100644
index 0000000000000..f0aea17ff0b9b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/cbrt.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT cbrt(27.0);
+## PySpark 3.5.5 Result: {'CBRT(27.0)': 3.0, 'typeof(CBRT(27.0))': 'double', 'typeof(27.0)': 'decimal(3,1)'}
+#query
+#SELECT cbrt(27.0::decimal(3,1));
diff --git a/datafusion/sqllogictest/test_files/spark/math/ceil.slt b/datafusion/sqllogictest/test_files/spark/math/ceil.slt
new file mode 100644
index 0000000000000..c87a29b61fd49
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/ceil.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ceil(-0.1);
+## PySpark 3.5.5 Result: {'CEIL(-0.1)': Decimal('0'), 'typeof(CEIL(-0.1))': 'decimal(1,0)', 'typeof(-0.1)': 'decimal(1,1)'}
+#query
+#SELECT ceil(-0.1::decimal(1,1));
+
+## Original Query: SELECT ceil(3.1411, -3);
+## PySpark 3.5.5 Result: {'ceil(3.1411, -3)': Decimal('1000'), 'typeof(ceil(3.1411, -3))': 'decimal(4,0)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(-3)': 'int'}
+#query
+#SELECT ceil(3.1411::decimal(5,4), -3::int);
+
+## Original Query: SELECT ceil(3.1411, 3);
+## PySpark 3.5.5 Result: {'ceil(3.1411, 3)': Decimal('3.142'), 'typeof(ceil(3.1411, 3))': 'decimal(5,3)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(3)': 'int'}
+#query
+#SELECT ceil(3.1411::decimal(5,4), 3::int);
+
+## Original Query: SELECT ceil(5);
+## PySpark 3.5.5 Result: {'CEIL(5)': 5, 'typeof(CEIL(5))': 'bigint', 'typeof(5)': 'int'}
+#query
+#SELECT ceil(5::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/ceiling.slt b/datafusion/sqllogictest/test_files/spark/math/ceiling.slt
new file mode 100644
index 0000000000000..2b761faef47df
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/ceiling.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ceiling(-0.1);
+## PySpark 3.5.5 Result: {'ceiling(-0.1)': Decimal('0'), 'typeof(ceiling(-0.1))': 'decimal(1,0)', 'typeof(-0.1)': 'decimal(1,1)'}
+#query
+#SELECT ceiling(-0.1::decimal(1,1));
+
+## Original Query: SELECT ceiling(3.1411, -3);
+## PySpark 3.5.5 Result: {'ceiling(3.1411, -3)': Decimal('1000'), 'typeof(ceiling(3.1411, -3))': 'decimal(4,0)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(-3)': 'int'}
+#query
+#SELECT ceiling(3.1411::decimal(5,4), -3::int);
+
+## Original Query: SELECT ceiling(3.1411, 3);
+## PySpark 3.5.5 Result: {'ceiling(3.1411, 3)': Decimal('3.142'), 'typeof(ceiling(3.1411, 3))': 'decimal(5,3)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(3)': 'int'}
+#query
+#SELECT ceiling(3.1411::decimal(5,4), 3::int);
+
+## Original Query: SELECT ceiling(5);
+## PySpark 3.5.5 Result: {'ceiling(5)': 5, 'typeof(ceiling(5))': 'bigint', 'typeof(5)': 'int'}
+#query
+#SELECT ceiling(5::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/conv.slt b/datafusion/sqllogictest/test_files/spark/math/conv.slt
new file mode 100644
index 0000000000000..371fd3e746bd3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/conv.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT conv('100', 2, 10);
+## PySpark 3.5.5 Result: {'conv(100, 2, 10)': '4', 'typeof(conv(100, 2, 10))': 'string', 'typeof(100)': 'string', 'typeof(2)': 'int', 'typeof(10)': 'int'}
+#query
+#SELECT conv('100'::string, 2::int, 10::int);
+
+## Original Query: SELECT conv(-10, 16, -10);
+## PySpark 3.5.5 Result: {'conv(-10, 16, -10)': '-16', 'typeof(conv(-10, 16, -10))': 'string', 'typeof(-10)': 'int', 'typeof(16)': 'int'}
+#query
+#SELECT conv(-10::int, 16::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/cos.slt b/datafusion/sqllogictest/test_files/spark/math/cos.slt
new file mode 100644
index 0000000000000..a473c257553b1
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/cos.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT cos(0);
+## PySpark 3.5.5 Result: {'COS(0)': 1.0, 'typeof(COS(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT cos(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/cosh.slt b/datafusion/sqllogictest/test_files/spark/math/cosh.slt
new file mode 100644
index 0000000000000..97b3a2eb01cb8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/cosh.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT cosh(0);
+## PySpark 3.5.5 Result: {'COSH(0)': 1.0, 'typeof(COSH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT cosh(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/cot.slt b/datafusion/sqllogictest/test_files/spark/math/cot.slt
new file mode 100644
index 0000000000000..5bb010337addf
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/cot.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT cot(1);
+## PySpark 3.5.5 Result: {'COT(1)': 0.6420926159343306, 'typeof(COT(1))': 'double', 'typeof(1)': 'int'}
+#query
+#SELECT cot(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/csc.slt b/datafusion/sqllogictest/test_files/spark/math/csc.slt
new file mode 100644
index 0000000000000..837704113da4c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/csc.slt
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT csc(1);
+## PySpark 3.5.5 Result: {'CSC(1)': 1.1883951057781212, 'typeof(CSC(1))': 'double', 'typeof(1)': 'int'}
+
+query R
+SELECT csc(1::INT);
+----
+1.188395105778121
+
+query R
+SELECT csc(a) FROM (VALUES (0::INT), (1::INT), (-1::INT), (null)) AS t(a);
+----
+Infinity
+1.188395105778121
+-1.188395105778121
+NULL
+
+query R
+SELECT csc(a) FROM (VALUES (pi()), (-pi()), (pi()/2) , (arrow_cast('NAN','Float32'))) AS t(a);
+----
+8165619676597685
+-8165619676597685
+1
+NaN
diff --git a/datafusion/sqllogictest/test_files/spark/math/degrees.slt b/datafusion/sqllogictest/test_files/spark/math/degrees.slt
new file mode 100644
index 0000000000000..5ca7bacb8a6a6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/degrees.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT degrees(3.141592653589793);
+## PySpark 3.5.5 Result: {'DEGREES(3.141592653589793)': 180.0, 'typeof(DEGREES(3.141592653589793))': 'double', 'typeof(3.141592653589793)': 'decimal(16,15)'}
+#query
+#SELECT degrees(3.141592653589793::decimal(16,15));
diff --git a/datafusion/sqllogictest/test_files/spark/math/e.slt b/datafusion/sqllogictest/test_files/spark/math/e.slt
new file mode 100644
index 0000000000000..c8e23d3b0900b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/e.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT e();
+## PySpark 3.5.5 Result: {'E()': 2.718281828459045, 'typeof(E())': 'double'}
+#query
+#SELECT e();
diff --git a/datafusion/sqllogictest/test_files/spark/math/exp.slt b/datafusion/sqllogictest/test_files/spark/math/exp.slt
new file mode 100644
index 0000000000000..671684f9855da
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/exp.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT exp(0);
+## PySpark 3.5.5 Result: {'EXP(0)': 1.0, 'typeof(EXP(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT exp(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/expm1.slt b/datafusion/sqllogictest/test_files/spark/math/expm1.slt
index 96d4abb0414b3..647a5ba341d0a 100644
--- a/datafusion/sqllogictest/test_files/spark/math/expm1.slt
+++ b/datafusion/sqllogictest/test_files/spark/math/expm1.slt
@@ -30,3 +30,8 @@ SELECT expm1(a) FROM (VALUES (0::INT), (1::INT)) AS t(a);
 ----
 0
 1.718281828459045
+
+query R
+SELECT expm1(0.0::double);
+----
+0
diff --git a/datafusion/sqllogictest/test_files/spark/math/factorial.slt b/datafusion/sqllogictest/test_files/spark/math/factorial.slt
new file mode 100644
index 0000000000000..f8eae5d95ab85
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/factorial.slt
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT factorial(5);
+## PySpark 3.5.5 Result: {'factorial(5)': 120, 'typeof(factorial(5))': 'bigint', 'typeof(5)': 'int'}
+query I
+SELECT factorial(5::INT);
+----
+120
+
+query I
+SELECT factorial(a)
+FROM VALUES
+    (-1::INT),
+    (0::INT), (1::INT), (2::INT), (3::INT), (4::INT), (5::INT), (6::INT), (7::INT), (8::INT), (9::INT), (10::INT),
+    (11::INT), (12::INT), (13::INT), (14::INT), (15::INT), (16::INT), (17::INT), (18::INT), (19::INT), (20::INT),
+    (21::INT),
+    (NULL) AS t(a);
+----
+NULL
+1
+1
+2
+6
+24
+120
+720
+5040
+40320
+362880
+3628800
+39916800
+479001600
+6227020800
+87178291200
+1307674368000
+20922789888000
+355687428096000
+6402373705728000
+121645100408832000
+2432902008176640000
+NULL
+NULL
+
+query error Error during planning: Failed to coerce arguments to satisfy a call to 'factorial' function
+SELECT factorial(5::BIGINT);
diff --git a/datafusion/sqllogictest/test_files/spark/math/floor.slt b/datafusion/sqllogictest/test_files/spark/math/floor.slt
new file mode 100644
index 0000000000000..d39d47ab1fee8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/floor.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT floor(-0.1);
+## PySpark 3.5.5 Result: {'FLOOR(-0.1)': Decimal('-1'), 'typeof(FLOOR(-0.1))': 'decimal(1,0)', 'typeof(-0.1)': 'decimal(1,1)'}
+#query
+#SELECT floor(-0.1::decimal(1,1));
+
+## Original Query: SELECT floor(3.1411, -3);
+## PySpark 3.5.5 Result: {'floor(3.1411, -3)': Decimal('0'), 'typeof(floor(3.1411, -3))': 'decimal(4,0)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(-3)': 'int'}
+#query
+#SELECT floor(3.1411::decimal(5,4), -3::int);
+
+## Original Query: SELECT floor(3.1411, 3);
+## PySpark 3.5.5 Result: {'floor(3.1411, 3)': Decimal('3.141'), 'typeof(floor(3.1411, 3))': 'decimal(5,3)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(3)': 'int'}
+#query
+#SELECT floor(3.1411::decimal(5,4), 3::int);
+
+## Original Query: SELECT floor(5);
+## PySpark 3.5.5 Result: {'FLOOR(5)': 5, 'typeof(FLOOR(5))': 'bigint', 'typeof(5)': 'int'}
+#query
+#SELECT floor(5::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/greatest.slt b/datafusion/sqllogictest/test_files/spark/math/greatest.slt
new file mode 100644
index 0000000000000..ff1143d5fcafa
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/greatest.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT greatest(10, 9, 2, 4, 3);
+## PySpark 3.5.5 Result: {'greatest(10, 9, 2, 4, 3)': 10, 'typeof(greatest(10, 9, 2, 4, 3))': 'int', 'typeof(10)': 'int', 'typeof(9)': 'int', 'typeof(2)': 'int', 'typeof(4)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT greatest(10::int, 9::int, 2::int, 4::int, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/hex.slt b/datafusion/sqllogictest/test_files/spark/math/hex.slt
index 24db1a318358a..17e9ff432890d 100644
--- a/datafusion/sqllogictest/test_files/spark/math/hex.slt
+++ b/datafusion/sqllogictest/test_files/spark/math/hex.slt
@@ -38,3 +38,48 @@ SELECT hex(a) from VALUES ('foo'), (NULL), ('foobarbaz') AS t(a);
 666F6F
 NULL
 666F6F62617262617A
+
+statement ok
+CREATE TABLE t_utf8view as VALUES (arrow_cast('foo', 'Utf8View')), (NULL), (arrow_cast('foobarbaz', 'Utf8View'));
+
+query T
+SELECT hex(column1) FROM t_utf8view;
+----
+666F6F
+NULL
+666F6F62617262617A
+
+query T
+SELECT hex(column1) FROM VALUES (arrow_cast('hello', 'LargeBinary')), (NULL), (arrow_cast('world', 'LargeBinary'));
+----
+68656C6C6F
+NULL
+776F726C64
+
+statement error Function 'hex' expects 1 arguments but received 2
+SELECT hex(1, 2);
+
+query T
+SELECT hex(arrow_cast('test', 'LargeBinary')) as lar_b;
+----
+74657374
+
+statement ok
+CREATE TABLE t_dict_binary AS
+SELECT arrow_cast(column1, 'Dictionary(Int32, Binary)') as dict_col
+FROM VALUES ('foo'), ('bar'), ('foo'), (NULL), ('baz'), ('bar');
+
+query T
+SELECT hex(dict_col) FROM t_dict_binary;
+----
+666F6F
+626172
+666F6F
+NULL
+62617A
+626172
+
+query T
+SELECT arrow_typeof(hex(dict_col)) FROM t_dict_binary LIMIT 1;
+----
+Dictionary(Int32, Utf8)
diff --git a/datafusion/sqllogictest/test_files/spark/math/hypot.slt b/datafusion/sqllogictest/test_files/spark/math/hypot.slt
new file mode 100644
index 0000000000000..1349be0a95ee7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/hypot.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT hypot(3, 4);
+## PySpark 3.5.5 Result: {'HYPOT(3, 4)': 5.0, 'typeof(HYPOT(3, 4))': 'double', 'typeof(3)': 'int', 'typeof(4)': 'int'}
+#query
+#SELECT hypot(3::int, 4::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/least.slt b/datafusion/sqllogictest/test_files/spark/math/least.slt
new file mode 100644
index 0000000000000..f17bc2aed9885
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/least.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT least(10, 9, 2, 4, 3);
+## PySpark 3.5.5 Result: {'least(10, 9, 2, 4, 3)': 2, 'typeof(least(10, 9, 2, 4, 3))': 'int', 'typeof(10)': 'int', 'typeof(9)': 'int', 'typeof(2)': 'int', 'typeof(4)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT least(10::int, 9::int, 2::int, 4::int, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/ln.slt b/datafusion/sqllogictest/test_files/spark/math/ln.slt
new file mode 100644
index 0000000000000..d3245f76736e7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/ln.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ln(1);
+## PySpark 3.5.5 Result: {'ln(1)': 0.0, 'typeof(ln(1))': 'double', 'typeof(1)': 'int'}
+#query
+#SELECT ln(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/log.slt b/datafusion/sqllogictest/test_files/spark/math/log.slt
new file mode 100644
index 0000000000000..0ea3de7f1bf0d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/log.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT log(10, 100);
+## PySpark 3.5.5 Result: {'LOG(10, 100)': 2.0, 'typeof(LOG(10, 100))': 'double', 'typeof(10)': 'int', 'typeof(100)': 'int'}
+#query
+#SELECT log(10::int, 100::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/log10.slt b/datafusion/sqllogictest/test_files/spark/math/log10.slt
new file mode 100644
index 0000000000000..95e518f2eb804
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/log10.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT log10(10);
+## PySpark 3.5.5 Result: {'LOG10(10)': 1.0, 'typeof(LOG10(10))': 'double', 'typeof(10)': 'int'}
+#query
+#SELECT log10(10::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/log1p.slt b/datafusion/sqllogictest/test_files/spark/math/log1p.slt
new file mode 100644
index 0000000000000..359051c62120e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/log1p.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT log1p(0);
+## PySpark 3.5.5 Result: {'LOG1P(0)': 0.0, 'typeof(LOG1P(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT log1p(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/log2.slt b/datafusion/sqllogictest/test_files/spark/math/log2.slt
new file mode 100644
index 0000000000000..2706c0fad4bdd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/log2.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT log2(2);
+## PySpark 3.5.5 Result: {'LOG2(2)': 1.0, 'typeof(LOG2(2))': 'double', 'typeof(2)': 'int'}
+#query
+#SELECT log2(2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/mod.slt b/datafusion/sqllogictest/test_files/spark/math/mod.slt
new file mode 100644
index 0000000000000..68c0f59f48125
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/mod.slt
@@ -0,0 +1,273 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT MOD(2, 1.8);
+## PySpark 3.5.5 Result: {'mod(2, 1.8)': Decimal('0.2'), 'typeof(mod(2, 1.8))': 'decimal(2,1)', 'typeof(2)': 'int', 'typeof(1.8)': 'decimal(2,1)'}
+query R
+SELECT MOD(2::int, 1.8::decimal(2,1));
+----
+0.2
+
+# Basic integer modulo operations
+query I
+SELECT MOD(10::int, 3::int) as mod_1;
+----
+1
+
+query I
+SELECT MOD(7::int, 2::int) as mod_2;
+----
+1
+
+query I
+SELECT MOD(15::int, 4::int) as mod_3;
+----
+3
+
+query I
+SELECT MOD(100::int, 30::int) as mod_4;
+----
+10
+
+query I
+SELECT MOD(50::int, 25::int) as mod_5;
+----
+0
+
+query I
+SELECT MOD(200::int, 60::int) as mod_6;
+----
+20
+
+# Float modulo operations
+query R
+SELECT MOD(10.5::float8, 3.0::float8) as mod_float_1;
+----
+1.5
+
+query R
+SELECT MOD(7.2::float8, 2.5::float8) as mod_float_2;
+----
+2.2
+
+query R
+SELECT MOD(15.8::float8, 4.2::float8) as mod_float_3;
+----
+3.2
+
+# Mixed type operations
+query R
+SELECT MOD(10::int, 3.0::float8) as mod_mixed_1;
+----
+1
+
+query R
+SELECT MOD(10.5::float8, 3::int) as mod_mixed_2;
+----
+1.5
+
+# NULL value handling
+query I
+SELECT MOD(NULL::int, 3::int) as mod_null_1;
+----
+NULL
+
+query I
+SELECT MOD(10::int, NULL::int) as mod_null_2;
+----
+NULL
+
+query I
+SELECT MOD(NULL::int, NULL::int) as mod_null_3;
+----
+NULL
+
+# Special values: NaN and Infinity
+query R
+SELECT MOD(5.0::float8, 'NaN'::float8) as mod_nan_1;
+----
+NaN
+
+query R
+SELECT MOD('NaN'::float8, 2.0::float8) as mod_nan_2;
+----
+NaN
+
+query R
+SELECT MOD('NaN'::float8, 'Infinity'::float8) as mod_nan_3;
+----
+NaN
+
+query R
+SELECT MOD('Infinity'::float8, 'NaN'::float8) as mod_nan_4;
+----
+NaN
+
+query R
+SELECT MOD(5.0::float8, 'Infinity'::float8) as mod_inf_1;
+----
+5
+
+query R
+SELECT MOD('Infinity'::float8, 2.0::float8) as mod_inf_2;
+----
+NaN
+
+# Decimal operations
+query R
+SELECT MOD(2.5::decimal(3,1), 1.2::decimal(2,1)) as mod_decimal_1;
+----
+0.1
+
+query R
+SELECT MOD(10.0::decimal(3,1), 3.0::decimal(2,1)) as mod_decimal_2;
+----
+1
+
+# Division by zero returns NULL in legacy mode (ANSI off)
+query I
+SELECT MOD(10::int, 0::int) as mod_div_zero_1;
+----
+NULL
+
+query I
+SELECT MOD(-7::int, 0::int) as mod_div_zero_2;
+----
+NULL
+
+query R
+SELECT MOD(10.5::float8, 0.0::float8) as mod_div_zero_float;
+----
+NaN
+
+# Division by zero errors in ANSI mode
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT MOD(10::int, 0::int);
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT MOD(-7::int, 0::int);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+
+# Edge cases
+query I
+SELECT MOD(0::int, 5::int) as mod_zero_1;
+----
+0
+
+query I
+SELECT MOD(5::int, 1::int) as mod_zero_2;
+----
+0
+
+query I
+SELECT MOD(-10::int, 3::int) as mod_negative_1;
+----
+-1
+
+query I
+SELECT MOD(10::int, -3::int) as mod_negative_2;
+----
+1
+
+query I
+SELECT MOD(-10::int, -3::int) as mod_negative_3;
+----
+-1
+
+# Multiple MOD operations
+query I
+SELECT MOD(MOD(100::int, 30::int), 5::int) as mod_nested_1;
+----
+0
+
+query I
+SELECT MOD(10::int, MOD(7::int, 3::int)) as mod_nested_2;
+----
+0
+
+# MOD with different data types
+query I
+SELECT MOD(10::int8, 3::int8) as mod_int8;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'Int16'), arrow_cast(3, 'Int16')) as mod_int16;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'Int32'), arrow_cast(3, 'Int32')) as mod_int32;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'Int64'), arrow_cast(3, 'Int64')) as mod_int64;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'UInt16'), arrow_cast(3, 'UInt16')) as mod_int16;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'UInt32'), arrow_cast(3, 'UInt32')) as mod_int32;
+----
+1
+
+query I
+SELECT MOD(arrow_cast(10, 'UInt64'), arrow_cast(3, 'UInt64')) as mod_int64;
+----
+1
+
+query R
+SELECT MOD(10::float4, 3::float4) as mod_float4;
+----
+1
+
+query R
+SELECT MOD(10::float8, 3::float8) as mod_float8;
+----
+1
+
+# MOD in expressions
+query I
+SELECT MOD(10::int + 5::int, 3::int) as mod_expr_1;
+----
+0
+
+query I
+SELECT MOD(10::int, 2::int + 1::int) as mod_expr_2;
+----
+1
+
+query I
+SELECT MOD(10::int * 2::int, 5::int) as mod_expr_3;
+----
+0
diff --git a/datafusion/sqllogictest/test_files/spark/math/negative.slt b/datafusion/sqllogictest/test_files/spark/math/negative.slt
new file mode 100644
index 0000000000000..40bfaf791fe81
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/negative.slt
@@ -0,0 +1,331 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT negative(1);
+## PySpark 3.5.5 Result: {'negative(1)': -1, 'typeof(negative(1))': 'int', 'typeof(1)': 'int'}
+
+# Test negative with integer
+query I
+SELECT negative(1::int);
+----
+-1
+
+# Test negative with positive integer
+query I
+SELECT negative(42::int);
+----
+-42
+
+# Test negative with negative integer
+query I
+SELECT negative(-10::int);
+----
+10
+
+# Test negative with zero
+query I
+SELECT negative(0::int);
+----
+0
+
+# Test negative with bigint
+query I
+SELECT negative(9223372036854775807::bigint);
+----
+-9223372036854775807
+
+# Test negative with negative bigint
+query I
+SELECT negative(-100::bigint);
+----
+100
+
+# Test negative with smallint
+query I
+SELECT negative(32767::smallint);
+----
+-32767
+
+# Test negative with float
+query R
+SELECT negative(3.14::float);
+----
+-3.14
+
+# Test negative with negative float
+query R
+SELECT negative(-2.5::float);
+----
+2.5
+
+# Test negative with double
+query R
+SELECT negative(3.14159265358979::double);
+----
+-3.14159265358979
+
+# Test negative with negative double
+query R
+SELECT negative(-1.5::double);
+----
+1.5
+
+# Test negative with decimal
+query R
+SELECT negative(123.456::decimal(10,3));
+----
+-123.456
+
+# Test negative with negative decimal
+query R
+SELECT negative(-99.99::decimal(10,2));
+----
+99.99
+
+# Test negative with NULL
+query I
+SELECT negative(NULL::int);
+----
+NULL
+
+# Test negative with column values
+statement ok
+CREATE TABLE test_negative (id int, value int) AS VALUES (1, 10), (2, -20), (3, 0), (4, NULL);
+
+query II rowsort
+SELECT id, negative(value) FROM test_negative;
+----
+1 -10
+2 20
+3 0
+4 NULL
+
+statement ok
+DROP TABLE test_negative;
+
+# Test negative in expressions
+query I
+SELECT negative(5) + 3;
+----
+-2
+
+# Test nested negative
+query I
+SELECT negative(negative(7));
+----
+7
+
+# Test negative with large numbers
+query R
+SELECT negative(1234567890.123456::double);
+----
+-1234567890.123456
+
+# Test wrap-around: negative of minimum int (should wrap to same value)
+# Using table to avoid constant folding overflow during optimization
+statement ok
+CREATE TABLE min_values_int AS VALUES (-2147483648);
+
+query I
+SELECT negative(column1::int) FROM min_values_int;
+----
+-2147483648
+
+statement ok
+DROP TABLE min_values_int;
+
+# Test wrap-around: negative of minimum bigint (should wrap to same value)
+statement ok
+CREATE TABLE min_values_bigint AS VALUES (-9223372036854775808);
+
+query I
+SELECT negative(column1::bigint) FROM min_values_bigint;
+----
+-9223372036854775808
+
+statement ok
+DROP TABLE min_values_bigint;
+
+# Test wrap-around: negative of minimum smallint (should wrap to same value)
+statement ok
+CREATE TABLE min_values_smallint AS VALUES (-32768);
+
+query I
+SELECT negative(column1::smallint) FROM min_values_smallint;
+----
+-32768
+
+statement ok
+DROP TABLE min_values_smallint;
+
+# Test wrap-around: negative of minimum tinyint (should wrap to same value)
+statement ok
+CREATE TABLE min_values_tinyint AS VALUES (-128);
+
+query I
+SELECT negative(column1::tinyint) FROM min_values_tinyint;
+----
+-128
+
+statement ok
+DROP TABLE min_values_tinyint;
+
+# Test overflow: negative of positive infinity (float)
+query R
+SELECT negative('Infinity'::float);
+----
+-Infinity
+
+# Test overflow: negative of negative infinity (float)
+query R
+SELECT negative('-Infinity'::float);
+----
+Infinity
+
+# Test overflow: negative of positive infinity (double)
+query R
+SELECT negative('Infinity'::double);
+----
+-Infinity
+
+# Test overflow: negative of negative infinity (double)
+query R
+SELECT negative('-Infinity'::double);
+----
+Infinity
+
+# Test overflow: negative of NaN (float)
+query R
+SELECT negative('NaN'::float);
+----
+NaN
+
+# Test overflow: negative of NaN (double)
+query R
+SELECT negative('NaN'::double);
+----
+NaN
+
+# Test overflow: negative of maximum float value
+query R
+SELECT negative(3.4028235e38::float);
+----
+-340282350000000000000000000000000000000
+
+# Test overflow: negative of minimum float value
+query R
+SELECT negative(-3.4028235e38::float);
+----
+340282350000000000000000000000000000000
+
+# Test overflow: negative of maximum double value
+query R
+SELECT negative(1.7976931348623157e308::double);
+----
+-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+
+# Test overflow: negative of minimum double value
+query R
+SELECT negative(-1.7976931348623157e308::double);
+----
+179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+
+# Test negative with CalendarIntervalType (IntervalMonthDayNano)
+# Spark make_interval creates CalendarInterval
+query ?
+SELECT negative(make_interval(1, 2, 3, 4, 5, 6, 7.5));
+----
+-14 mons -25 days -5 hours -6 mins -7.500000000 secs
+
+# Test negative with negative CalendarIntervalType
+query ?
+SELECT negative(make_interval(-2, -5, -1, -10, -3, -30, -15.25));
+----
+29 mons 17 days 3 hours 30 mins 15.250000000 secs
+
+# Test negative with CalendarInterval from table
+statement ok
+CREATE TABLE interval_test AS VALUES
+  (make_interval(1, 2, 0, 5, 0, 0, 0.0)),
+  (make_interval(-3, -1, 0, -2, 0, 0, 0.0));
+
+query ? rowsort
+SELECT negative(column1) FROM interval_test;
+----
+-14 mons -5 days
+37 mons 2 days
+
+statement ok
+DROP TABLE interval_test;
+
+## ANSI mode tests: overflow detection
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+# Test ANSI mode: negative of minimum values should error (overflow)
+query error DataFusion error: Execution error: Int8 overflow on negative\(\-128\)
+SELECT negative((-128)::tinyint);
+
+query error DataFusion error: Execution error: Int16 overflow on negative\(\-32768\)
+SELECT negative((-32768)::smallint);
+
+query error DataFusion error: Execution error: Int32 overflow on negative\(\-2147483648\)
+SELECT negative((-2147483648)::int);
+
+query error DataFusion error: Execution error: Int64 overflow on negative\(\-9223372036854775808\)
+SELECT negative((-9223372036854775808)::bigint);
+
+# Test ANSI mode: negative of (MIN+1) should succeed (boundary test)
+query I
+SELECT negative((-127)::tinyint);
+----
+127
+
+query I
+SELECT negative((-32767)::smallint);
+----
+32767
+
+query I
+SELECT negative((-2147483647)::int);
+----
+2147483647
+
+query I
+SELECT negative((-9223372036854775807)::bigint);
+----
+9223372036854775807
+
+# Test ANSI mode: array with MIN value should error
+statement ok
+CREATE TABLE min_values_ansi AS VALUES (-2147483648);
+
+query error DataFusion error: Execution error: Int32 overflow on negative\(\-2147483648\)
+SELECT negative(column1::int) FROM min_values_ansi;
+
+statement ok
+DROP TABLE min_values_ansi;
+
+# Reset ANSI mode to false
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
diff --git a/datafusion/sqllogictest/test_files/spark/math/pi.slt b/datafusion/sqllogictest/test_files/spark/math/pi.slt
new file mode 100644
index 0000000000000..4b94e09bc9383
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/pi.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT pi();
+## PySpark 3.5.5 Result: {'PI()': 3.141592653589793, 'typeof(PI())': 'double'}
+#query
+#SELECT pi();
diff --git a/datafusion/sqllogictest/test_files/spark/math/pmod.slt b/datafusion/sqllogictest/test_files/spark/math/pmod.slt
new file mode 100644
index 0000000000000..aa4a197ba470f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/pmod.slt
@@ -0,0 +1,354 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+# Basic PMOD tests with positive integers
+query I
+SELECT pmod(10::int, 3::int) as pmod_1;
+----
+1
+
+query I
+SELECT pmod(7::int, 2::int) as pmod_2;
+----
+1
+
+query I
+SELECT pmod(15::int, 4::int) as pmod_3;
+----
+3
+
+# PMOD tests with negative integers (should return positive remainder)
+query I
+SELECT pmod(-10::int, 3::int) as pmod_negative_1;
+----
+2
+
+query I
+SELECT pmod(-7::int, 3::int) as pmod_negative_2;
+----
+2
+
+query I
+SELECT pmod(-15::int, 4::int) as pmod_negative_3;
+----
+1
+
+query I
+SELECT pmod(-5::int, 5::int) as pmod_negative_4;
+----
+0
+
+# PMOD tests with zero
+query I
+SELECT pmod(0::int, 5::int) as pmod_zero_1;
+----
+0
+
+query I
+SELECT pmod(10::int, 0::int) as pmod_zero_2;
+----
+NULL
+
+query I
+SELECT pmod(-7::int, 0::int) as pmod_zero_3;
+----
+NULL
+
+# Division by zero errors in ANSI mode
+statement ok
+set datafusion.execution.enable_ansi_mode = true;
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT pmod(10::int, 0::int);
+
+statement error DataFusion error: Arrow error: Divide by zero error
+SELECT pmod(-7::int, 0::int);
+
+statement ok
+set datafusion.execution.enable_ansi_mode = false;
+
+# PMOD tests with NULL values
+query I
+SELECT pmod(NULL::int, 3::int) as pmod_null_1;
+----
+NULL
+
+query I
+SELECT pmod(10::int, NULL::int) as pmod_null_2;
+----
+NULL
+
+query I
+SELECT pmod(NULL::int, NULL::int) as pmod_null_3;
+----
+NULL
+
+# PMOD tests with large integers
+query I
+SELECT pmod(100::int, 30::int) as pmod_large_1;
+----
+10
+
+query I
+SELECT pmod(-100::int, 30::int) as pmod_large_2;
+----
+20
+
+query I
+SELECT pmod(200::int, 60::int) as pmod_large_3;
+----
+20
+
+query I
+SELECT pmod(-200::int, 60::int) as pmod_large_4;
+----
+40
+
+# PMOD tests with edge cases
+query I
+SELECT pmod(-1::int, 5::int) as pmod_edge_1;
+----
+4
+
+query I
+SELECT pmod(1::int, 5::int) as pmod_edge_2;
+----
+1
+
+query I
+SELECT pmod(-5::int, 5::int) as pmod_edge_3;
+----
+0
+
+query I
+SELECT pmod(5::int, 5::int) as pmod_edge_4;
+----
+0
+
+query I
+SELECT pmod(-6::int, 5::int) as pmod_edge_5;
+----
+4
+
+query I
+SELECT pmod(6::int, 5::int) as pmod_edge_6;
+----
+1
+
+# PMOD tests with negative divisors
+query I
+SELECT pmod(10::int, -3::int) as pmod_neg_div_1;
+----
+1
+
+query I
+SELECT pmod(-7::int, -3::int) as pmod_neg_div_2;
+----
+-1
+
+query I
+SELECT pmod(15::int, -4::int) as pmod_neg_div_3;
+----
+3
+
+# PMOD tests with floating point numbers
+query R
+SELECT pmod(10.5::float8, 3.0::float8) as pmod_float_1;
+----
+1.5
+
+query R
+SELECT pmod(-7.2::float8, 3.0::float8) as pmod_float_2;
+----
+1.8
+
+query R
+SELECT pmod(15.8::float8, 4.2::float8) as pmod_float_3;
+----
+3.2
+
+query R
+SELECT pmod(-15.8::float8, 4.2::float8) as pmod_float_4;
+----
+1
+
+query R
+SELECT pmod(5.0::float8, 2.5::float8) as pmod_float_5;
+----
+0
+
+query R
+SELECT pmod(-5.0::float8, 2.5::float8) as pmod_float_6;
+----
+0
+
+# PMOD tests with float32
+query R
+SELECT pmod(10.5::float4, 3.0::float4) as pmod_float32_1;
+----
+1.5
+
+query R
+SELECT CAST(pmod(CAST(-7.2 AS float4), CAST(3.0 AS float4)) AS DECIMAL(3,1)) as pmod_float32_2;
+----
+1.8
+
+query R
+SELECT CAST(pmod(15.8::float4, 4.2::float4) AS DECIMAL(3,1)) as pmod_float32_3;
+----
+3.2
+
+query R
+SELECT CAST(pmod(-15.8::float4, 4.2::float4) AS DECIMAL(3,1)) as pmod_float32_4;
+----
+1
+
+# PMOD tests with special float values
+query R
+SELECT pmod('NaN'::float8, 2.0::float8) as pmod_nan_1;
+----
+NaN
+
+query R
+SELECT pmod(5.0::float8, 'NaN'::float8) as pmod_nan_2;
+----
+NaN
+
+query R
+SELECT pmod('Infinity'::float8, 2.0::float8) as pmod_inf_1;
+----
+NaN
+
+query R
+SELECT pmod(5.0::float8, 'Infinity'::float8) as pmod_inf_2;
+----
+5
+
+query R
+SELECT pmod(-5.0::float8, 'Infinity'::float8) as pmod_inf_3;
+----
+NaN
+
+query R
+SELECT pmod('NaN'::float8, 'Infinity'::float8) as pmod_nan_inf_1;
+----
+NaN
+
+query R
+SELECT pmod('Infinity'::float8, 'NaN'::float8) as pmod_inf_nan_1;
+----
+NaN
+
+# PMOD tests with decimal types
+query R
+SELECT pmod(2.5::decimal(3,1), 1.2::decimal(2,1)) as pmod_decimal_1;
+----
+0.1
+
+query R
+SELECT pmod(-2.5::decimal(3,1), 1.2::decimal(2,1)) as pmod_decimal_2;
+----
+1.1
+
+query R
+SELECT pmod(10.0::decimal(3,1), 3.0::decimal(2,1)) as pmod_decimal_3;
+----
+1
+
+query R
+SELECT pmod(-10.0::decimal(3,1), 3.0::decimal(2,1)) as pmod_decimal_4;
+----
+2
+
+# PMOD tests with different integer types
+query I
+SELECT pmod(10::int8, 3::int8) as pmod_int8_1;
+----
+1
+
+query I
+SELECT pmod(-10::int8, 3::int8) as pmod_int8_2;
+----
+2
+
+query I
+SELECT pmod(arrow_cast(10, 'Int16'), arrow_cast(3, 'Int16')) as pmod_int16_1;
+----
+1
+
+query I
+SELECT pmod(arrow_cast(-10, 'Int16'), arrow_cast(3, 'Int16')) as pmod_int16_2;
+----
+2
+
+query I
+SELECT pmod(arrow_cast(10, 'Int64'), arrow_cast(3, 'Int64')) as pmod_int64_1;
+----
+1
+
+query I
+SELECT pmod(arrow_cast(-10, 'Int64'), arrow_cast(3, 'Int64')) as pmod_int64_2;
+----
+2
+
+# PMOD tests with unsigned integers
+query I
+SELECT pmod(arrow_cast(10, 'UInt8'), arrow_cast(3, 'UInt8')) as pmod_uint8_1;
+----
+1
+
+query I
+SELECT pmod(arrow_cast(10, 'UInt16'), arrow_cast(3, 'UInt16')) as pmod_uint16_1;
+----
+1
+
+query I
+SELECT pmod(arrow_cast(10, 'UInt32'), arrow_cast(3, 'UInt32')) as pmod_uint32_1;
+----
+1
+
+query I
+SELECT pmod(arrow_cast(10, 'UInt64'), arrow_cast(3, 'UInt64')) as pmod_uint64_1;
+----
+1
+
+# PMOD tests with scalar values
+query I
+SELECT pmod(10, 3) as pmod_scalar_1;
+----
+1
+
+query I
+SELECT pmod(-10, 3) as pmod_scalar_2;
+----
+2
+
+query R
+SELECT pmod(10.5, 3.0) as pmod_scalar_3;
+----
+1.5
+
+query R
+SELECT pmod(-7.2, 3.0) as pmod_scalar_4;
+----
+1.8
diff --git a/datafusion/sqllogictest/test_files/spark/math/positive.slt b/datafusion/sqllogictest/test_files/spark/math/positive.slt
new file mode 100644
index 0000000000000..5e1be0f4b4678
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/positive.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT positive(1);
+## PySpark 3.5.5 Result: {'(+ 1)': 1, 'typeof((+ 1))': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT positive(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/pow.slt b/datafusion/sqllogictest/test_files/spark/math/pow.slt
new file mode 100644
index 0000000000000..55b6f65b81235
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/pow.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT pow(2, 3);
+## PySpark 3.5.5 Result: {'pow(2, 3)': 8.0, 'typeof(pow(2, 3))': 'double', 'typeof(2)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT pow(2::int, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/power.slt b/datafusion/sqllogictest/test_files/spark/math/power.slt
new file mode 100644
index 0000000000000..f82056c6d941b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/power.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT power(2, 3);
+## PySpark 3.5.5 Result: {'POWER(2, 3)': 8.0, 'typeof(POWER(2, 3))': 'double', 'typeof(2)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT power(2::int, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/radians.slt b/datafusion/sqllogictest/test_files/spark/math/radians.slt
new file mode 100644
index 0000000000000..bccda62c542ff
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/radians.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT radians(180);
+## PySpark 3.5.5 Result: {'RADIANS(180)': 3.141592653589793, 'typeof(RADIANS(180))': 'double', 'typeof(180)': 'int'}
+#query
+#SELECT radians(180::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/rand.slt b/datafusion/sqllogictest/test_files/spark/math/rand.slt
new file mode 100644
index 0000000000000..53b4c6f822218
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/rand.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT rand();
+## PySpark 3.5.5 Result: {'rand()': 0.949892358232337, 'typeof(rand())': 'double'}
+#query
+#SELECT rand();
+
+## Original Query: SELECT rand(0);
+## PySpark 3.5.5 Result: {'rand(0)': 0.7604953758285915, 'typeof(rand(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT rand(0::int);
+
+## Original Query: SELECT rand(null);
+## PySpark 3.5.5 Result: {'rand(NULL)': 0.7604953758285915, 'typeof(rand(NULL))': 'double', 'typeof(NULL)': 'void'}
+#query
+#SELECT rand(NULL::void);
diff --git a/datafusion/sqllogictest/test_files/spark/math/randn.slt b/datafusion/sqllogictest/test_files/spark/math/randn.slt
new file mode 100644
index 0000000000000..daf81babd02c4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/randn.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT randn();
+## PySpark 3.5.5 Result: {'randn()': 1.498983714060803, 'typeof(randn())': 'double'}
+#query
+#SELECT randn();
+
+## Original Query: SELECT randn(0);
+## PySpark 3.5.5 Result: {'randn(0)': 1.6034991609278433, 'typeof(randn(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT randn(0::int);
+
+## Original Query: SELECT randn(null);
+## PySpark 3.5.5 Result: {'randn(NULL)': 1.6034991609278433, 'typeof(randn(NULL))': 'double', 'typeof(NULL)': 'void'}
+#query
+#SELECT randn(NULL::void);
diff --git a/datafusion/sqllogictest/test_files/spark/math/random.slt b/datafusion/sqllogictest/test_files/spark/math/random.slt
new file mode 100644
index 0000000000000..280a81b8888c0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/random.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT random();
+## PySpark 3.5.5 Result: {'rand()': 0.7460731389309176, 'typeof(rand())': 'double'}
+#query
+#SELECT random();
+
+## Original Query: SELECT random(0);
+## PySpark 3.5.5 Result: {'rand(0)': 0.7604953758285915, 'typeof(rand(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT random(0::int);
+
+## Original Query: SELECT random(null);
+## PySpark 3.5.5 Result: {'rand(NULL)': 0.7604953758285915, 'typeof(rand(NULL))': 'double', 'typeof(NULL)': 'void'}
+#query
+#SELECT random(NULL::void);
diff --git a/datafusion/sqllogictest/test_files/spark/math/rint.slt b/datafusion/sqllogictest/test_files/spark/math/rint.slt
new file mode 100644
index 0000000000000..2cae3cbf58fd3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/rint.slt
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT rint(12.3456);
+## PySpark 3.5.5 Result: {'rint(12.3456)': 12.0, 'typeof(rint(12.3456))': 'double', 'typeof(12.3456)': 'decimal(6,4)'}
+query R
+SELECT rint(12.3456);
+----
+12
+
+## Test additional cases
+query R
+SELECT rint(-12.3456);
+----
+-12
+
+query R
+SELECT rint(arrow_cast(-12.3456, 'Float32'));
+----
+-12
+
+## Test int
+query R
+SELECT rint(arrow_cast(12, 'UInt8'));
+----
+12
+
+query R
+SELECT rint(arrow_cast(-12, 'Int8'));
+----
+-12
+
+query R
+SELECT rint(arrow_cast(12, 'UInt16'));
+----
+12
+
+query R
+SELECT rint(arrow_cast(-12, 'Int16'));
+----
+-12
+
+query R
+SELECT rint(arrow_cast(12, 'UInt32'));
+----
+12
+
+query R
+SELECT rint(arrow_cast(-12, 'Int32'));
+----
+-12
+
+query R
+SELECT rint(arrow_cast(12, 'UInt64'));
+----
+12
+
+query R
+SELECT rint(arrow_cast(-12, 'Int64'));
+----
+-12
+
+query R
+SELECT rint(2.5);
+----
+2
+
+query R
+SELECT rint(3.5);
+----
+4
+
+query R
+SELECT rint(-2.5);
+----
+-2
+
+query R
+SELECT rint(-3.5);
+----
+-4
+
+query R
+SELECT rint(0.0);
+----
+0
+
+query R
+SELECT rint(42);
+----
+42
+
+## Test with null
+query R
+SELECT rint(NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/math/round.slt b/datafusion/sqllogictest/test_files/spark/math/round.slt
new file mode 100644
index 0000000000000..bc1f6b72247a0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/round.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT round(2.5, 0);
+## PySpark 3.5.5 Result: {'round(2.5, 0)': Decimal('3'), 'typeof(round(2.5, 0))': 'decimal(2,0)', 'typeof(2.5)': 'decimal(2,1)', 'typeof(0)': 'int'}
+#query
+#SELECT round(2.5::decimal(2,1), 0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/sec.slt b/datafusion/sqllogictest/test_files/spark/math/sec.slt
new file mode 100644
index 0000000000000..c95d583ce9154
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/sec.slt
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sec(0);
+## PySpark 3.5.5 Result: {'SEC(0)': 1.0, 'typeof(SEC(0))': 'double', 'typeof(0)': 'int'}
+query R
+SELECT sec(0::int);
+----
+1
+
+query R
+SELECT sec(a) FROM (VALUES (0::INT), (1::INT), (-1::INT), (null)) AS t(a);
+----
+1
+1.850815717680926
+1.850815717680926
+NULL
+
+query R
+SELECT sec(a) FROM (VALUES (pi()), (3 * pi()/2), (pi()/2) , (arrow_cast('NAN','Float32'))) AS t(a);
+----
+-1
+-5443746451065123
+16331239353195370
+NaN
diff --git a/datafusion/sqllogictest/test_files/spark/math/shiftleft.slt b/datafusion/sqllogictest/test_files/spark/math/shiftleft.slt
new file mode 100644
index 0000000000000..c8ddeb6740871
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/shiftleft.slt
@@ -0,0 +1,246 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT shiftleft(2, 1);
+## PySpark 3.5.5 Result: {'shiftleft(2, 1)': 4, 'typeof(shiftleft(2, 1))': 'int', 'typeof(2)': 'int', 'typeof(1)': 'int'}
+
+# Basic shiftleft tests
+query I
+SELECT shiftleft(2::int, 1::int);
+----
+4
+
+query I
+SELECT shiftleft(1::int, 2::int);
+----
+4
+
+query I
+SELECT shiftleft(3::int, 3::int);
+----
+24
+
+# Different data types
+query I
+SELECT shiftleft(2::bigint, 1::int);
+----
+4
+
+query I
+SELECT shiftleft(1::bigint, 2::int);
+----
+4
+
+query I
+SELECT shiftleft(2::int, 1::bigint);
+----
+4
+
+# Large shifts (should handle modulo correctly)
+query I
+SELECT shiftleft(1::int, 32::int);
+----
+1
+
+query I
+SELECT shiftleft(2::int, 33::int);
+----
+4
+
+query I
+SELECT shiftleft(3::int, 64::int);
+----
+3
+
+# Negative shifts
+query I
+SELECT shiftleft(4::int, -1::int);
+----
+0
+
+query I
+SELECT shiftleft(8::int, -2::int);
+----
+0
+
+query I
+SELECT shiftleft(16::int, -3::int);
+----
+0
+
+# Zero shifts
+query I
+SELECT shiftleft(5::int, 0::int);
+----
+5
+
+query I
+SELECT shiftleft(0::int, 5::int);
+----
+0
+
+# Edge cases
+query I
+SELECT shiftleft(2147483647::int, 1::int);
+----
+-2
+
+query I
+SELECT shiftleft(-1::int, 1::int);
+----
+-2
+
+# Multiple values in a table
+query I
+SELECT shiftleft(value, shift) FROM (VALUES (1, 1), (2, 2), (3, 3), (4, 4)) AS t(value, shift);
+----
+2
+8
+24
+64
+
+# Null handling
+query I
+SELECT shiftleft(NULL::int, 1::int);
+----
+NULL
+
+query I
+SELECT shiftleft(1::int, NULL::int);
+----
+NULL
+
+query I
+SELECT shiftleft(NULL::int, NULL::int);
+----
+NULL
+
+query I
+select shiftleft(3::int,-31);
+----
+6
+
+query I
+select shiftleft(3::int,-32);
+----
+3
+
+# i32 + nulls
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(1, 1),
+(2, 2),
+(3, 3),
+(4, 4),
+(null, 2),
+(8, null)
+t(value, shift)
+----
+2 Int32
+8 Int32
+24 Int32
+64 Int32
+NULL Int32
+NULL Int32
+
+# big shifts
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(1, 32),
+(2, 33),
+(3, 64)
+t(value, shift)
+----
+1 Int32
+4 Int32
+3 Int32
+
+# negative shift
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'Int32'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'Int32'), shift))
+FROM VALUES
+(4, -1),
+(8, -2),
+(16, -3)
+t(value, shift)
+----
+0 Int32
+0 Int32
+0 Int32
+
+# i64 value
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'Int64'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'Int64'), shift))
+FROM VALUES (1, 1), (2, 2), (3, 3) t(value, shift)
+----
+2 Int64
+8 Int64
+24 Int64
+
+# u32 value
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'UInt32'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'UInt32'), shift))
+FROM VALUES (1, 1), (2, 2), (3, 3) t(value, shift)
+----
+2 UInt32
+8 UInt32
+24 UInt32
+
+# u64 value
+query IT
+SELECT
+	shiftleft(arrow_cast(value, 'UInt64'), shift),
+	arrow_typeof(shiftleft(arrow_cast(value, 'UInt64'), shift))
+FROM VALUES (1, 1), (2, 2), (3, 3) t(value, shift)
+----
+2 UInt64
+8 UInt64
+24 UInt64
+
+# pure null handling
+query IT
+SELECT shiftleft(null, 1), arrow_typeof(shiftleft(null, 1));
+----
+NULL Int32
+
+query IT
+SELECT shiftleft(null, null), arrow_typeof(shiftleft(null, null));
+----
+NULL Int32
+
+query IT
+SELECT shiftleft(1::bigint, null), arrow_typeof(shiftleft(1::bigint, null));
+----
+NULL Int64
diff --git a/datafusion/sqllogictest/test_files/spark/math/sign.slt b/datafusion/sqllogictest/test_files/spark/math/sign.slt
new file mode 100644
index 0000000000000..e135f4b13d063
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/sign.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sign(40);
+## PySpark 3.5.5 Result: {'sign(40)': 1.0, 'typeof(sign(40))': 'double', 'typeof(40)': 'int'}
+#query
+#SELECT sign(40::int);
+
+## Original Query: SELECT sign(INTERVAL -'100' YEAR);
+## PySpark 3.5.5 Result: {"sign(INTERVAL '-100' YEAR)": -1.0, "typeof(sign(INTERVAL '-100' YEAR))": 'double', "typeof(INTERVAL '-100' YEAR)": 'interval year'}
+#query
+#SELECT sign(INTERVAL '-100' YEAR::interval year);
diff --git a/datafusion/sqllogictest/test_files/spark/math/signum.slt b/datafusion/sqllogictest/test_files/spark/math/signum.slt
new file mode 100644
index 0000000000000..5557f5fe32721
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/signum.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT signum(40);
+## PySpark 3.5.5 Result: {'SIGNUM(40)': 1.0, 'typeof(SIGNUM(40))': 'double', 'typeof(40)': 'int'}
+#query
+#SELECT signum(40::int);
+
+## Original Query: SELECT signum(INTERVAL -'100' YEAR);
+## PySpark 3.5.5 Result: {"SIGNUM(INTERVAL '-100' YEAR)": -1.0, "typeof(SIGNUM(INTERVAL '-100' YEAR))": 'double', "typeof(INTERVAL '-100' YEAR)": 'interval year'}
+#query
+#SELECT signum(INTERVAL '-100' YEAR::interval year);
diff --git a/datafusion/sqllogictest/test_files/spark/math/sin.slt b/datafusion/sqllogictest/test_files/spark/math/sin.slt
new file mode 100644
index 0000000000000..418a6fafdff8d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/sin.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sin(0);
+## PySpark 3.5.5 Result: {'SIN(0)': 0.0, 'typeof(SIN(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT sin(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/sinh.slt b/datafusion/sqllogictest/test_files/spark/math/sinh.slt
new file mode 100644
index 0000000000000..6d24d387e210c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/sinh.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sinh(0);
+## PySpark 3.5.5 Result: {'SINH(0)': 0.0, 'typeof(SINH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT sinh(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/sqrt.slt b/datafusion/sqllogictest/test_files/spark/math/sqrt.slt
new file mode 100644
index 0000000000000..10b896eec9651
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/sqrt.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sqrt(4);
+## PySpark 3.5.5 Result: {'SQRT(4)': 2.0, 'typeof(SQRT(4))': 'double', 'typeof(4)': 'int'}
+#query
+#SELECT sqrt(4::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/tan.slt b/datafusion/sqllogictest/test_files/spark/math/tan.slt
new file mode 100644
index 0000000000000..4699893d2bd59
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/tan.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT tan(0);
+## PySpark 3.5.5 Result: {'TAN(0)': 0.0, 'typeof(TAN(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT tan(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/tanh.slt b/datafusion/sqllogictest/test_files/spark/math/tanh.slt
new file mode 100644
index 0000000000000..1511adb5b3724
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/tanh.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT tanh(0);
+## PySpark 3.5.5 Result: {'TANH(0)': 0.0, 'typeof(TANH(0))': 'double', 'typeof(0)': 'int'}
+#query
+#SELECT tanh(0::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/try_add.slt b/datafusion/sqllogictest/test_files/spark/math/try_add.slt
new file mode 100644
index 0000000000000..f3f83158289fa
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/try_add.slt
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_add(1, 2);
+## PySpark 3.5.5 Result: {'try_add(1, 2)': 3, 'typeof(try_add(1, 2))': 'int', 'typeof(1)': 'int', 'typeof(2)': 'int'}
+#query
+#SELECT try_add(1::int, 2::int);
+
+## Original Query: SELECT try_add(2147483647, 1);
+## PySpark 3.5.5 Result: {'try_add(2147483647, 1)': None, 'typeof(try_add(2147483647, 1))': 'int', 'typeof(2147483647)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT try_add(2147483647::int, 1::int);
+
+## Original Query: SELECT try_add(date'2021-01-01', 1);
+## PySpark 3.5.5 Result: {"try_add(DATE '2021-01-01', 1)": datetime.date(2021, 1, 2), "typeof(try_add(DATE '2021-01-01', 1))": 'date', "typeof(DATE '2021-01-01')": 'date', 'typeof(1)': 'int'}
+#query
+#SELECT try_add(DATE '2021-01-01'::date, 1::int);
+
+## Original Query: SELECT try_add(date'2021-01-01', interval 1 year);
+## PySpark 3.5.5 Result: {"try_add(DATE '2021-01-01', INTERVAL '1' YEAR)": datetime.date(2022, 1, 1), "typeof(try_add(DATE '2021-01-01', INTERVAL '1' YEAR))": 'date', "typeof(DATE '2021-01-01')": 'date', "typeof(INTERVAL '1' YEAR)": 'interval year'}
+#query
+#SELECT try_add(DATE '2021-01-01'::date, INTERVAL '1' YEAR::interval year);
+
+## Original Query: SELECT try_add(interval 1 year, interval 2 year);
+## PySpark 3.5.5 Result: {"try_add(INTERVAL '1' YEAR, INTERVAL '2' YEAR)": 36, "typeof(try_add(INTERVAL '1' YEAR, INTERVAL '2' YEAR))": 'interval year', "typeof(INTERVAL '1' YEAR)": 'interval year', "typeof(INTERVAL '2' YEAR)": 'interval year'}
+#query
+#SELECT try_add(INTERVAL '1' YEAR::interval year, INTERVAL '2' YEAR::interval year);
+
+## Original Query: SELECT try_add(timestamp'2021-01-01 00:00:00', interval 1 day);
+## PySpark 3.5.5 Result: {"try_add(TIMESTAMP '2021-01-01 00:00:00', INTERVAL '1' DAY)": datetime.datetime(2021, 1, 2, 0, 0), "typeof(try_add(TIMESTAMP '2021-01-01 00:00:00', INTERVAL '1' DAY))": 'timestamp', "typeof(TIMESTAMP '2021-01-01 00:00:00')": 'timestamp', "typeof(INTERVAL '1' DAY)": 'interval day'}
+#query
+#SELECT try_add(TIMESTAMP '2021-01-01 00:00:00'::timestamp, INTERVAL '1' DAY::interval day);
diff --git a/datafusion/sqllogictest/test_files/spark/math/try_divide.slt b/datafusion/sqllogictest/test_files/spark/math/try_divide.slt
new file mode 100644
index 0000000000000..405872f9ca0f8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/try_divide.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_divide(1, 0);
+## PySpark 3.5.5 Result: {'try_divide(1, 0)': None, 'typeof(try_divide(1, 0))': 'double', 'typeof(1)': 'int', 'typeof(0)': 'int'}
+#query
+#SELECT try_divide(1::int, 0::int);
+
+## Original Query: SELECT try_divide(2L, 2L);
+## PySpark 3.5.5 Result: {'try_divide(2, 2)': 1.0, 'typeof(try_divide(2, 2))': 'double', 'typeof(2)': 'bigint'}
+#query
+#SELECT try_divide(2::bigint);
+
+## Original Query: SELECT try_divide(3, 2);
+## PySpark 3.5.5 Result: {'try_divide(3, 2)': 1.5, 'typeof(try_divide(3, 2))': 'double', 'typeof(3)': 'int', 'typeof(2)': 'int'}
+#query
+#SELECT try_divide(3::int, 2::int);
+
+## Original Query: SELECT try_divide(interval 2 month, 0);
+## PySpark 3.5.5 Result: {"try_divide(INTERVAL '2' MONTH, 0)": None, "typeof(try_divide(INTERVAL '2' MONTH, 0))": 'interval year to month', "typeof(INTERVAL '2' MONTH)": 'interval month', 'typeof(0)': 'int'}
+#query
+#SELECT try_divide(INTERVAL '2' MONTH::interval month, 0::int);
+
+## Original Query: SELECT try_divide(interval 2 month, 2);
+## PySpark 3.5.5 Result: {"try_divide(INTERVAL '2' MONTH, 2)": 1, "typeof(try_divide(INTERVAL '2' MONTH, 2))": 'interval year to month', "typeof(INTERVAL '2' MONTH)": 'interval month', 'typeof(2)': 'int'}
+#query
+#SELECT try_divide(INTERVAL '2' MONTH::interval month, 2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/try_multiply.slt b/datafusion/sqllogictest/test_files/spark/math/try_multiply.slt
new file mode 100644
index 0000000000000..c495a758e2346
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/try_multiply.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_multiply(-2147483648, 10);
+## PySpark 3.5.5 Result: {'try_multiply(-2147483648, 10)': None, 'typeof(try_multiply(-2147483648, 10))': 'int', 'typeof(-2147483648)': 'int', 'typeof(10)': 'int'}
+#query
+#SELECT try_multiply(-2147483648::int, 10::int);
+
+## Original Query: SELECT try_multiply(2, 3);
+## PySpark 3.5.5 Result: {'try_multiply(2, 3)': 6, 'typeof(try_multiply(2, 3))': 'int', 'typeof(2)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT try_multiply(2::int, 3::int);
+
+## Original Query: SELECT try_multiply(interval 2 year, 3);
+## PySpark 3.5.5 Result: {"try_multiply(INTERVAL '2' YEAR, 3)": 72, "typeof(try_multiply(INTERVAL '2' YEAR, 3))": 'interval year to month', "typeof(INTERVAL '2' YEAR)": 'interval year', 'typeof(3)': 'int'}
+#query
+#SELECT try_multiply(INTERVAL '2' YEAR::interval year, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/math/try_subtract.slt b/datafusion/sqllogictest/test_files/spark/math/try_subtract.slt
new file mode 100644
index 0000000000000..4ce4c480b91c2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/try_subtract.slt
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_subtract(-2147483648, 1);
+## PySpark 3.5.5 Result: {'try_subtract(-2147483648, 1)': None, 'typeof(try_subtract(-2147483648, 1))': 'int', 'typeof(-2147483648)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT try_subtract(-2147483648::int, 1::int);
+
+## Original Query: SELECT try_subtract(2, 1);
+## PySpark 3.5.5 Result: {'try_subtract(2, 1)': 1, 'typeof(try_subtract(2, 1))': 'int', 'typeof(2)': 'int', 'typeof(1)': 'int'}
+#query
+#SELECT try_subtract(2::int, 1::int);
+
+## Original Query: SELECT try_subtract(date'2021-01-01', interval 1 year);
+## PySpark 3.5.5 Result: {"try_subtract(DATE '2021-01-01', INTERVAL '1' YEAR)": datetime.date(2020, 1, 1), "typeof(try_subtract(DATE '2021-01-01', INTERVAL '1' YEAR))": 'date', "typeof(DATE '2021-01-01')": 'date', "typeof(INTERVAL '1' YEAR)": 'interval year'}
+#query
+#SELECT try_subtract(DATE '2021-01-01'::date, INTERVAL '1' YEAR::interval year);
+
+## Original Query: SELECT try_subtract(date'2021-01-02', 1);
+## PySpark 3.5.5 Result: {"try_subtract(DATE '2021-01-02', 1)": datetime.date(2021, 1, 1), "typeof(try_subtract(DATE '2021-01-02', 1))": 'date', "typeof(DATE '2021-01-02')": 'date', 'typeof(1)': 'int'}
+#query
+#SELECT try_subtract(DATE '2021-01-02'::date, 1::int);
+
+## Original Query: SELECT try_subtract(interval 2 year, interval 1 year);
+## PySpark 3.5.5 Result: {"try_subtract(INTERVAL '2' YEAR, INTERVAL '1' YEAR)": 12, "typeof(try_subtract(INTERVAL '2' YEAR, INTERVAL '1' YEAR))": 'interval year', "typeof(INTERVAL '2' YEAR)": 'interval year', "typeof(INTERVAL '1' YEAR)": 'interval year'}
+#query
+#SELECT try_subtract(INTERVAL '2' YEAR::interval year, INTERVAL '1' YEAR::interval year);
+
+## Original Query: SELECT try_subtract(timestamp'2021-01-02 00:00:00', interval 1 day);
+## PySpark 3.5.5 Result: {"try_subtract(TIMESTAMP '2021-01-02 00:00:00', INTERVAL '1' DAY)": datetime.datetime(2021, 1, 1, 0, 0), "typeof(try_subtract(TIMESTAMP '2021-01-02 00:00:00', INTERVAL '1' DAY))": 'timestamp', "typeof(TIMESTAMP '2021-01-02 00:00:00')": 'timestamp', "typeof(INTERVAL '1' DAY)": 'interval day'}
+#query
+#SELECT try_subtract(TIMESTAMP '2021-01-02 00:00:00'::timestamp, INTERVAL '1' DAY::interval day);
diff --git a/datafusion/sqllogictest/test_files/spark/math/unhex.slt b/datafusion/sqllogictest/test_files/spark/math/unhex.slt
new file mode 100644
index 0000000000000..051d8826c8a6c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/unhex.slt
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Basic hex string
+query ?
+SELECT unhex('537061726B2053514C');
+----
+537061726b2053514c
+
+query T
+SELECT arrow_cast(unhex('537061726B2053514C'), 'Utf8');
+----
+Spark SQL
+
+# Lowercase hex
+query ?
+SELECT unhex('616263');
+----
+616263
+
+query T
+SELECT arrow_cast(unhex('616263'), 'Utf8');
+----
+abc
+
+# Odd length hex (left pad with 0)
+query ?
+SELECT unhex(a) FROM VALUES ('1A2B3'), ('1'), ('ABC'), ('123') AS t(a);
+----
+01a2b3
+01
+0abc
+0123
+
+# Null input
+query ?
+SELECT unhex(NULL);
+----
+NULL
+
+# Invalid hex characters
+query ?
+SELECT unhex('GGHH');
+----
+NULL
+
+# Empty hex string
+query T
+SELECT arrow_cast(unhex(''), 'Utf8');
+----
+(empty)
+
+# Array with mixed case
+query ?
+SELECT unhex(a) FROM VALUES ('4a4B4c'), ('F'), ('A'), ('AbCdEf'), ('123abc'), ('41 42'), ('00'), ('FF') AS t(a);
+----
+4a4b4c
+0f
+0a
+abcdef
+123abc
+NULL
+00
+ff
+
+# LargeUtf8 type
+statement ok
+CREATE TABLE t_large_utf8 AS VALUES (arrow_cast('414243', 'LargeUtf8')), (NULL);
+
+query ?
+SELECT unhex(column1) FROM t_large_utf8;
+----
+414243
+NULL
+
+# Utf8View type
+statement ok
+CREATE TABLE t_utf8view AS VALUES (arrow_cast('414243', 'Utf8View')), (NULL);
+
+query ?
+SELECT unhex(column1) FROM t_utf8view;
+----
+414243
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/math/width_bucket.slt b/datafusion/sqllogictest/test_files/spark/math/width_bucket.slt
new file mode 100644
index 0000000000000..d2661ceb9d3bb
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/math/width_bucket.slt
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+query I
+SELECT width_bucket(-0.9, 5.2, 0.5, 2)
+----
+3
+
+query I
+SELECT width_bucket(-2.1, 1.3, 3.4, 3)
+----
+0
+
+query I
+SELECT width_bucket(5.3, 0.2, 10.6, 5)
+----
+3
+
+query I
+SELECT width_bucket(8.1, 0.0, 5.7, 4)
+----
+5
+
+query I
+SELECT width_bucket(INTERVAL '0' DAY, INTERVAL '0' DAY, INTERVAL '10' DAY, 10)
+----
+1
+
+query I
+SELECT width_bucket(INTERVAL '0' YEAR, INTERVAL '0' YEAR, INTERVAL '10' YEAR, 10)
+----
+1
+
+query I
+SELECT width_bucket(INTERVAL '1' DAY, INTERVAL '0' DAY, INTERVAL '10' DAY, 10)
+----
+2
+
+query I
+SELECT width_bucket(INTERVAL '1' YEAR, INTERVAL '0' YEAR, INTERVAL '10' YEAR, 10)
+----
+2
+
+# test of sail
+query I
+SELECT width_bucket(0.0, 10.0, 0.0, 5)
+----
+6
+
+query I
+SELECT width_bucket(10.0, 0.0, 10.0, 5)
+----
+6
+
+query I
+SELECT width_bucket(10.0, 0.0, 0.0, 5)
+----
+NULL
+
+# lo == hi
+query I
+SELECT width_bucket(10.0, 0.0, 0.0, 5);
+----
+NULL
+
+# n <= 0
+query I
+SELECT width_bucket(5.0, 0.0, 10.0, 0);
+----
+NULL
+
+query I
+SELECT width_bucket(arrow_cast('NaN','Float64'),5.0, 0.0, 5)
+----
+NULL
+
+query I
+SELECT width_bucket(5.0, arrow_cast('NaN','Float64'), 0.0, 5)
+----
+NULL
+
+query I
+SELECT width_bucket(5.0, 0.0, arrow_cast('NaN','Float64'), 5)
+----
+NULL
+
+query I
+SELECT width_bucket(INTERVAL '1' YEAR, INTERVAL '5' YEAR, INTERVAL '5' YEAR, 10)
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/misc/assert_true.slt b/datafusion/sqllogictest/test_files/spark/misc/assert_true.slt
new file mode 100644
index 0000000000000..99330233aabdd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/assert_true.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT assert_true(0 < 1);
+## PySpark 3.5.5 Result: {"assert_true((0 < 1), '(0 < 1)' is not true!)": None, "typeof(assert_true((0 < 1), '(0 < 1)' is not true!))": 'void', 'typeof((0 < 1))': 'boolean'}
+#query
+#SELECT assert_true((0 < 1)::boolean);
diff --git a/datafusion/sqllogictest/test_files/spark/misc/current_catalog.slt b/datafusion/sqllogictest/test_files/spark/misc/current_catalog.slt
new file mode 100644
index 0000000000000..b0cb488233c93
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/current_catalog.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_catalog();
+## PySpark 3.5.5 Result: {'current_catalog()': 'spark_catalog', 'typeof(current_catalog())': 'string'}
+#query
+#SELECT current_catalog();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/current_database.slt b/datafusion/sqllogictest/test_files/spark/misc/current_database.slt
new file mode 100644
index 0000000000000..0883db29a0a64
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/current_database.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_database();
+## PySpark 3.5.5 Result: {'current_database()': 'default', 'typeof(current_database())': 'string'}
+#query
+#SELECT current_database();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/current_schema.slt b/datafusion/sqllogictest/test_files/spark/misc/current_schema.slt
new file mode 100644
index 0000000000000..630734431df35
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/current_schema.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_schema();
+## PySpark 3.5.5 Result: {'current_database()': 'default', 'typeof(current_database())': 'string'}
+#query
+#SELECT current_schema();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/current_user.slt b/datafusion/sqllogictest/test_files/spark/misc/current_user.slt
new file mode 100644
index 0000000000000..17cfbd292e1db
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/current_user.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT current_user();
+## PySpark 3.5.5 Result: {'current_user()': 'r', 'typeof(current_user())': 'string'}
+#query
+#SELECT current_user();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/equal_null.slt b/datafusion/sqllogictest/test_files/spark/misc/equal_null.slt
new file mode 100644
index 0000000000000..88999d997d2db
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/equal_null.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT equal_null(1, '11');
+## PySpark 3.5.5 Result: {'equal_null(1, 11)': False, 'typeof(equal_null(1, 11))': 'boolean', 'typeof(1)': 'int', 'typeof(11)': 'string'}
+#query
+#SELECT equal_null(1::int, '11'::string);
+
+## Original Query: SELECT equal_null(3, 3);
+## PySpark 3.5.5 Result: {'equal_null(3, 3)': True, 'typeof(equal_null(3, 3))': 'boolean', 'typeof(3)': 'int'}
+#query
+#SELECT equal_null(3::int);
+
+## Original Query: SELECT equal_null(NULL, 'abc');
+## PySpark 3.5.5 Result: {'equal_null(NULL, abc)': False, 'typeof(equal_null(NULL, abc))': 'boolean', 'typeof(NULL)': 'void', 'typeof(abc)': 'string'}
+#query
+#SELECT equal_null(NULL::void, 'abc'::string);
+
+## Original Query: SELECT equal_null(NULL, NULL);
+## PySpark 3.5.5 Result: {'equal_null(NULL, NULL)': True, 'typeof(equal_null(NULL, NULL))': 'boolean', 'typeof(NULL)': 'void'}
+#query
+#SELECT equal_null(NULL::void);
+
+## Original Query: SELECT equal_null(true, NULL);
+## PySpark 3.5.5 Result: {'equal_null(true, NULL)': False, 'typeof(equal_null(true, NULL))': 'boolean', 'typeof(true)': 'boolean', 'typeof(NULL)': 'void'}
+#query
+#SELECT equal_null(true::boolean, NULL::void);
diff --git a/datafusion/sqllogictest/test_files/spark/misc/input_file_block_length.slt b/datafusion/sqllogictest/test_files/spark/misc/input_file_block_length.slt
new file mode 100644
index 0000000000000..4f227d7c4d779
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/input_file_block_length.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT input_file_block_length();
+## PySpark 3.5.5 Result: {'input_file_block_length()': -1, 'typeof(input_file_block_length())': 'bigint'}
+#query
+#SELECT input_file_block_length();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/input_file_block_start.slt b/datafusion/sqllogictest/test_files/spark/misc/input_file_block_start.slt
new file mode 100644
index 0000000000000..c60c616328b57
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/input_file_block_start.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT input_file_block_start();
+## PySpark 3.5.5 Result: {'input_file_block_start()': -1, 'typeof(input_file_block_start())': 'bigint'}
+#query
+#SELECT input_file_block_start();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/input_file_name.slt b/datafusion/sqllogictest/test_files/spark/misc/input_file_name.slt
new file mode 100644
index 0000000000000..0379d6d0f5db8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/input_file_name.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT input_file_name();
+## PySpark 3.5.5 Result: {'input_file_name()': '', 'typeof(input_file_name())': 'string'}
+#query
+#SELECT input_file_name();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/java_method.slt b/datafusion/sqllogictest/test_files/spark/misc/java_method.slt
new file mode 100644
index 0000000000000..bb6db98de7e9b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/java_method.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT java_method('java.util.UUID', 'randomUUID');
+## PySpark 3.5.5 Result: {'java_method(java.util.UUID, randomUUID)': 'e0d43859-1003-4f43-bfff-f2e3c34981e2', 'typeof(java_method(java.util.UUID, randomUUID))': 'string', 'typeof(java.util.UUID)': 'string', 'typeof(randomUUID)': 'string'}
+#query
+#SELECT java_method('java.util.UUID'::string, 'randomUUID'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/misc/monotonically_increasing_id.slt b/datafusion/sqllogictest/test_files/spark/misc/monotonically_increasing_id.slt
new file mode 100644
index 0000000000000..00f6b4a1192ad
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/monotonically_increasing_id.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT monotonically_increasing_id();
+## PySpark 3.5.5 Result: {'monotonically_increasing_id()': 0, 'typeof(monotonically_increasing_id())': 'bigint'}
+#query
+#SELECT monotonically_increasing_id();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/reflect.slt b/datafusion/sqllogictest/test_files/spark/misc/reflect.slt
new file mode 100644
index 0000000000000..223f692f7abda
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/reflect.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT reflect('java.util.UUID', 'randomUUID');
+## PySpark 3.5.5 Result: {'reflect(java.util.UUID, randomUUID)': 'bcf8f6e4-0d46-41a1-bc3c-9f793c8f8aa8', 'typeof(reflect(java.util.UUID, randomUUID))': 'string', 'typeof(java.util.UUID)': 'string', 'typeof(randomUUID)': 'string'}
+#query
+#SELECT reflect('java.util.UUID'::string, 'randomUUID'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/misc/spark_partition_id.slt b/datafusion/sqllogictest/test_files/spark/misc/spark_partition_id.slt
new file mode 100644
index 0000000000000..57993103f8c4b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/spark_partition_id.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT spark_partition_id();
+## PySpark 3.5.5 Result: {'SPARK_PARTITION_ID()': 0, 'typeof(SPARK_PARTITION_ID())': 'int'}
+#query
+#SELECT spark_partition_id();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/typeof.slt b/datafusion/sqllogictest/test_files/spark/misc/typeof.slt
new file mode 100644
index 0000000000000..e930b65baa052
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/typeof.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT typeof(1);
+## PySpark 3.5.5 Result: {'typeof(1)': 'int', 'typeof(typeof(1))': 'string'}
+#query
+#SELECT typeof(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/misc/user.slt b/datafusion/sqllogictest/test_files/spark/misc/user.slt
new file mode 100644
index 0000000000000..fc63c6108536a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/user.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT user();
+## PySpark 3.5.5 Result: {'current_user()': 'r', 'typeof(current_user())': 'string'}
+#query
+#SELECT user();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/uuid.slt b/datafusion/sqllogictest/test_files/spark/misc/uuid.slt
new file mode 100644
index 0000000000000..223bd71447ca0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/uuid.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT uuid();
+## PySpark 3.5.5 Result: {'uuid()': '96981e67-62f6-49bc-a6f4-2f9bc676edda', 'typeof(uuid())': 'string'}
+#query
+#SELECT uuid();
diff --git a/datafusion/sqllogictest/test_files/spark/misc/version.slt b/datafusion/sqllogictest/test_files/spark/misc/version.slt
new file mode 100644
index 0000000000000..d01e0c9d962d6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/misc/version.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT version();
+## PySpark 3.5.5 Result: {'version()': '3.5.5 7c29c664cdc9321205a98a14858aaf8daaa19db2', 'typeof(version())': 'string'}
+#query
+#SELECT version();
diff --git a/datafusion/sqllogictest/test_files/spark/predicate/ilike.slt b/datafusion/sqllogictest/test_files/spark/predicate/ilike.slt
new file mode 100644
index 0000000000000..68e8b1c59aeb6
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/predicate/ilike.slt
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ilike('Spark', '_Park');
+## PySpark 3.5.5 Result: {'ilike(Spark, _Park)': True, 'typeof(ilike(Spark, _Park))': 'boolean', 'typeof(Spark)': 'string', 'typeof(_Park)': 'string'}
+query B
+SELECT ilike('Spark'::string, '_Park'::string);
+----
+true
+
+query B
+SELECT ilike('Spark',  arrow_cast('_Park', 'LargeUtf8'));
+----
+true
+
+query B
+SELECT ilike(arrow_cast('Spark', 'Utf8View'),  arrow_cast('_Park', 'LargeUtf8'));
+----
+true
+
+query B
+SELECT ilike('Spark'::string, '_park'::string);
+----
+true
+
+query B
+SELECT ilike('SPARK'::string, '_park'::string);
+----
+true
+
+query B
+SELECT ilike('Spark'::string, 'SP%'::string);
+----
+true
+
+query B
+SELECT ilike('Spark'::string, '%ARK'::string);
+----
+true
+
+query B
+SELECT ilike('Spark'::string, 'xyz'::string);
+----
+false
+
+query B
+SELECT ilike(NULL::string, '_park'::string);
+----
+NULL
+
+query B
+SELECT ilike('Spark'::string, NULL::string);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/predicate/isnotnull.slt b/datafusion/sqllogictest/test_files/spark/predicate/isnotnull.slt
new file mode 100644
index 0000000000000..3fd5d6cea0719
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/predicate/isnotnull.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT isnotnull(1);
+## PySpark 3.5.5 Result: {'(1 IS NOT NULL)': True, 'typeof((1 IS NOT NULL))': 'boolean', 'typeof(1)': 'int'}
+#query
+#SELECT isnotnull(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/predicate/isnull.slt b/datafusion/sqllogictest/test_files/spark/predicate/isnull.slt
new file mode 100644
index 0000000000000..7c2290fa3d026
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/predicate/isnull.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT isnull(1);
+## PySpark 3.5.5 Result: {'(1 IS NULL)': False, 'typeof((1 IS NULL))': 'boolean', 'typeof(1)': 'int'}
+#query
+#SELECT isnull(1::int);
diff --git a/datafusion/sqllogictest/test_files/spark/predicate/like.slt b/datafusion/sqllogictest/test_files/spark/predicate/like.slt
new file mode 100644
index 0000000000000..35cd8a4eaf3ed
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/predicate/like.slt
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT like('Spark', '_park');
+## PySpark 3.5.5 Result: {'Spark LIKE _park': True, 'typeof(Spark LIKE _park)': 'boolean', 'typeof(Spark)': 'string', 'typeof(_park)': 'string'}
+query B
+SELECT like('Spark'::string, '_park'::string);
+----
+true
+
+query B
+SELECT ilike('Spark',  arrow_cast('_park', 'LargeUtf8'));
+----
+true
+
+query B
+SELECT ilike(arrow_cast('Spark', 'Utf8View'),  arrow_cast('_park', 'LargeUtf8'));
+----
+true
+
+query B
+SELECT like('Spark'::string, '_Park'::string);
+----
+false
+
+query B
+SELECT like('SPARK'::string, '_park'::string);
+----
+false
+
+query B
+SELECT like('Spark'::string, 'Sp%'::string);
+----
+true
+
+query B
+SELECT like('Spark'::string, 'SP%'::string);
+----
+false
+
+query B
+SELECT like('Spark'::string, '%ark'::string);
+----
+true
+
+query B
+SELECT like('Spark'::string, '%ARK'::string);
+----
+false
+
+query B
+SELECT like('Spark'::string, 'xyz'::string);
+----
+false
+
+query B
+SELECT like(NULL::string, '_park'::string);
+----
+NULL
+
+query B
+SELECT like('Spark'::string, NULL::string);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/string/base64.slt b/datafusion/sqllogictest/test_files/spark/string/base64.slt
new file mode 100644
index 0000000000000..dbd266f65a132
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/base64.slt
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT base64('Spark SQL'::string);
+----
+U3BhcmsgU1FM
+
+query T
+SELECT base64('Spark SQ'::string);
+----
+U3BhcmsgU1E=
+
+query T
+SELECT base64('Spark S'::string);
+----
+U3BhcmsgUw==
+
+query T
+SELECT base64('Spark SQL'::bytea);
+----
+U3BhcmsgU1FM
+
+query T
+SELECT base64(NULL::string);
+----
+NULL
+
+query T
+SELECT base64(NULL::bytea);
+----
+NULL
+
+query T
+SELECT base64(column1)
+FROM VALUES
+('Spark SQL'::bytea),
+('Spark SQ'::bytea),
+('Spark S'::bytea),
+(NULL::bytea);
+----
+U3BhcmsgU1FM
+U3BhcmsgU1E=
+U3BhcmsgUw==
+NULL
+
+query error Error during planning: Function 'base64' requires Binary, but received Int32 \(DataType: Int32\)
+SELECT base64(12::integer);
+
+
+query T
+SELECT arrow_cast(unbase64('U3BhcmsgU1FM'::string), 'Utf8');
+----
+Spark SQL
+
+query T
+SELECT arrow_cast(unbase64('U3BhcmsgU1E='::string), 'Utf8');
+----
+Spark SQ
+
+query T
+SELECT arrow_cast(unbase64('U3BhcmsgUw=='::string), 'Utf8');
+----
+Spark S
+
+query T
+SELECT arrow_cast(unbase64('U3BhcmsgU1FM'::bytea), 'Utf8');
+----
+Spark SQL
+
+query ?
+SELECT unbase64(NULL::string);
+----
+NULL
+
+query ?
+SELECT unbase64(NULL::bytea);
+----
+NULL
+
+query T
+SELECT arrow_cast(unbase64(column1), 'Utf8')
+FROM VALUES
+('U3BhcmsgU1FM'::string),
+('U3BhcmsgU1E='::string),
+('U3BhcmsgUw=='::string),
+(NULL::string);
+----
+Spark SQL
+Spark SQ
+Spark S
+NULL
+
+query error Failed to decode value using base64
+SELECT unbase64('123'::string);
+
+query error Failed to decode value using base64
+SELECT unbase64('123'::bytea);
+
+query error Error during planning: Function 'unbase64' requires Binary, but received Int32 \(DataType: Int32\)
+SELECT unbase64(12::integer);
diff --git a/datafusion/sqllogictest/test_files/spark/string/bit_length.slt b/datafusion/sqllogictest/test_files/spark/string/bit_length.slt
new file mode 100644
index 0000000000000..457d8cf034719
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/bit_length.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT bit_length('Spark SQL');
+## PySpark 3.5.5 Result: {'bit_length(Spark SQL)': 72, 'typeof(bit_length(Spark SQL))': 'int', 'typeof(Spark SQL)': 'string'}
+#query
+#SELECT bit_length('Spark SQL'::string);
+
+## Original Query: SELECT bit_length(x'537061726b2053514c');
+## PySpark 3.5.5 Result: {"bit_length(X'537061726B2053514C')": 72, "typeof(bit_length(X'537061726B2053514C'))": 'int', "typeof(X'537061726B2053514C')": 'binary'}
+#query
+#SELECT bit_length(X'537061726B2053514C'::binary);
diff --git a/datafusion/sqllogictest/test_files/spark/string/btrim.slt b/datafusion/sqllogictest/test_files/spark/string/btrim.slt
new file mode 100644
index 0000000000000..bf25bd652c81e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/btrim.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT btrim('    SparkSQL   ');
+## PySpark 3.5.5 Result: {'btrim(    SparkSQL   )': 'SparkSQL', 'typeof(btrim(    SparkSQL   ))': 'string', 'typeof(    SparkSQL   )': 'string'}
+#query
+#SELECT btrim('    SparkSQL   '::string);
+
+## Original Query: SELECT btrim('SSparkSQLS', 'SL');
+## PySpark 3.5.5 Result: {'btrim(SSparkSQLS, SL)': 'parkSQ', 'typeof(btrim(SSparkSQLS, SL))': 'string', 'typeof(SSparkSQLS)': 'string', 'typeof(SL)': 'string'}
+#query
+#SELECT btrim('SSparkSQLS'::string, 'SL'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/char.slt b/datafusion/sqllogictest/test_files/spark/string/char.slt
index d8fc11f6d5127..299e2a04136ed 100644
Binary files a/datafusion/sqllogictest/test_files/spark/string/char.slt and b/datafusion/sqllogictest/test_files/spark/string/char.slt differ
diff --git a/datafusion/sqllogictest/test_files/spark/string/char_length.slt b/datafusion/sqllogictest/test_files/spark/string/char_length.slt
new file mode 100644
index 0000000000000..d9f86d45d291d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/char_length.slt
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT CHAR_LENGTH('Spark SQL ');
+----
+10
+
+query I
+SELECT char_length('Spark SQL ');
+----
+10
+
+query I
+SELECT char_length(x'537061726b2053514c');
+----
+9
diff --git a/datafusion/sqllogictest/test_files/spark/string/character_length.slt b/datafusion/sqllogictest/test_files/spark/string/character_length.slt
new file mode 100644
index 0000000000000..644741416e53c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/character_length.slt
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT CHARACTER_LENGTH('Spark SQL ');
+----
+10
+
+query I
+SELECT character_length('Spark SQL ');
+----
+10
+
+query I
+SELECT character_length(x'537061726b2053514c');
+----
+9
diff --git a/datafusion/sqllogictest/test_files/spark/string/chr.slt b/datafusion/sqllogictest/test_files/spark/string/chr.slt
new file mode 100644
index 0000000000000..69ec4fca394b2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/chr.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT chr(65);
+## PySpark 3.5.5 Result: {'chr(65)': 'A', 'typeof(chr(65))': 'string', 'typeof(65)': 'int'}
+#query
+#SELECT chr(65::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt
new file mode 100644
index 0000000000000..97e7b57f7d06e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT concat('Spark', 'SQL');
+----
+SparkSQL
+
+# Test two Utf8View inputs: value and return type
+query TT
+SELECT concat(arrow_cast('Spark', 'Utf8View'), arrow_cast('SQL', 'Utf8View')), arrow_typeof(concat(arrow_cast('Spark', 'Utf8View'), arrow_cast('SQL', 'Utf8View')));
+----
+SparkSQL Utf8View
+
+query T
+SELECT concat('Spark', 'SQL', NULL);
+----
+NULL
+
+query T
+SELECT concat('', '1', '', '2');
+----
+12
+
+query T
+SELECT concat();
+----
+(empty)
+
+query T
+SELECT concat('');
+----
+(empty)
+
+
+query T
+SELECT concat(a, b, c) from (select 'a' a, 'b' b, 'c' c union all select null a, 'b', 'c') order by 1 nulls last;
+----
+abc
+NULL
+
+# Test mixed types: Utf8View + Utf8
+query TT
+SELECT concat(arrow_cast('hello', 'Utf8View'), ' world'), arrow_typeof(concat(arrow_cast('hello', 'Utf8View'), ' world'));
+----
+hello world Utf8View
+
+# Test Utf8 + LargeUtf8 => return type LargeUtf8
+query TT
+SELECT concat('a', arrow_cast('b', 'LargeUtf8')), arrow_typeof(concat('a', arrow_cast('b', 'LargeUtf8')));
+----
+ab LargeUtf8
+
+# Test all three types mixed together
+query TT
+SELECT concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View')), arrow_typeof(concat('a', arrow_cast('b', 'LargeUtf8'), arrow_cast('c', 'Utf8View')));
+----
+abc Utf8View
diff --git a/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt b/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt
new file mode 100644
index 0000000000000..62df636bba9ce
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT concat_ws(' ', 'Spark', 'SQL');
+## PySpark 3.5.5 Result: {'concat_ws( , Spark, SQL)': 'Spark SQL', 'typeof(concat_ws( , Spark, SQL))': 'string', 'typeof( )': 'string', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT concat_ws(' '::string, 'Spark'::string, 'SQL'::string);
+
+## Original Query: SELECT concat_ws('/', 'foo', null, 'bar');
+## PySpark 3.5.5 Result: {'concat_ws(/, foo, NULL, bar)': 'foo/bar', 'typeof(concat_ws(/, foo, NULL, bar))': 'string', 'typeof(/)': 'string', 'typeof(foo)': 'string', 'typeof(NULL)': 'void', 'typeof(bar)': 'string'}
+#query
+#SELECT concat_ws('/'::string, 'foo'::string, NULL::void, 'bar'::string);
+
+## Original Query: SELECT concat_ws('s');
+## PySpark 3.5.5 Result: {'concat_ws(s)': '', 'typeof(concat_ws(s))': 'string', 'typeof(s)': 'string'}
+#query
+#SELECT concat_ws('s'::string);
+
+## Original Query: SELECT concat_ws(null, 'Spark', 'SQL');
+## PySpark 3.5.5 Result: {'concat_ws(NULL, Spark, SQL)': None, 'typeof(concat_ws(NULL, Spark, SQL))': 'string', 'typeof(NULL)': 'void', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT concat_ws(NULL::void, 'Spark'::string, 'SQL'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/contains.slt b/datafusion/sqllogictest/test_files/spark/string/contains.slt
new file mode 100644
index 0000000000000..1bfb61fc00e37
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/contains.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT contains('Spark SQL', 'SPARK');
+## PySpark 3.5.5 Result: {'contains(Spark SQL, SPARK)': False, 'typeof(contains(Spark SQL, SPARK))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(SPARK)': 'string'}
+#query
+#SELECT contains('Spark SQL'::string, 'SPARK'::string);
+
+## Original Query: SELECT contains('Spark SQL', 'Spark');
+## PySpark 3.5.5 Result: {'contains(Spark SQL, Spark)': True, 'typeof(contains(Spark SQL, Spark))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(Spark)': 'string'}
+#query
+#SELECT contains('Spark SQL'::string, 'Spark'::string);
+
+## Original Query: SELECT contains('Spark SQL', null);
+## PySpark 3.5.5 Result: {'contains(Spark SQL, NULL)': None, 'typeof(contains(Spark SQL, NULL))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(NULL)': 'void'}
+#query
+#SELECT contains('Spark SQL'::string, NULL::void);
+
+## Original Query: SELECT contains(x'537061726b2053514c', x'537061726b');
+## PySpark 3.5.5 Result: {"contains(X'537061726B2053514C', X'537061726B')": True, "typeof(contains(X'537061726B2053514C', X'537061726B'))": 'boolean', "typeof(X'537061726B2053514C')": 'binary', "typeof(X'537061726B')": 'binary'}
+#query
+#SELECT contains(X'537061726B2053514C'::binary, X'537061726B'::binary);
diff --git a/datafusion/sqllogictest/test_files/spark/string/decode.slt b/datafusion/sqllogictest/test_files/spark/string/decode.slt
new file mode 100644
index 0000000000000..a427fe40389e8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/decode.slt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
+## PySpark 3.5.5 Result: {'decode(2, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle, Non domestic)': 'San Francisco', 'typeof(decode(2, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle, Non domestic))': 'string', 'typeof(2)': 'int', 'typeof(1)': 'int', 'typeof(Southlake)': 'string', 'typeof(San Francisco)': 'string', 'typeof(3)': 'int', 'typeof(New Jersey)': 'string', 'typeof(4)': 'int', 'typeof(Seattle)': 'string', 'typeof(Non domestic)': 'string'}
+#query
+#SELECT decode(2::int, 1::int, 'Southlake'::string, 'San Francisco'::string, 3::int, 'New Jersey'::string, 4::int, 'Seattle'::string, 'Non domestic'::string);
+
+## Original Query: SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle');
+## PySpark 3.5.5 Result: {'decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle)': None, 'typeof(decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle))': 'string', 'typeof(6)': 'int', 'typeof(1)': 'int', 'typeof(Southlake)': 'string', 'typeof(2)': 'int', 'typeof(San Francisco)': 'string', 'typeof(3)': 'int', 'typeof(New Jersey)': 'string', 'typeof(4)': 'int', 'typeof(Seattle)': 'string'}
+#query
+#SELECT decode(6::int, 1::int, 'Southlake'::string, 2::int, 'San Francisco'::string, 3::int, 'New Jersey'::string, 4::int, 'Seattle'::string);
+
+## Original Query: SELECT decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
+## PySpark 3.5.5 Result: {'decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle, Non domestic)': 'Non domestic', 'typeof(decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle, Non domestic))': 'string', 'typeof(6)': 'int', 'typeof(1)': 'int', 'typeof(Southlake)': 'string', 'typeof(2)': 'int', 'typeof(San Francisco)': 'string', 'typeof(3)': 'int', 'typeof(New Jersey)': 'string', 'typeof(4)': 'int', 'typeof(Seattle)': 'string', 'typeof(Non domestic)': 'string'}
+#query
+#SELECT decode(6::int, 1::int, 'Southlake'::string, 2::int, 'San Francisco'::string, 3::int, 'New Jersey'::string, 4::int, 'Seattle'::string, 'Non domestic'::string);
+
+## Original Query: SELECT decode(null, 6, 'Spark', NULL, 'SQL', 4, 'rocks');
+## PySpark 3.5.5 Result: {'decode(NULL, 6, Spark, NULL, SQL, 4, rocks)': 'SQL', 'typeof(decode(NULL, 6, Spark, NULL, SQL, 4, rocks))': 'string', 'typeof(NULL)': 'void', 'typeof(6)': 'int', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string', 'typeof(4)': 'int', 'typeof(rocks)': 'string'}
+#query
+#SELECT decode(NULL::void, 6::int, 'Spark'::string, 'SQL'::string, 4::int, 'rocks'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/elt.slt b/datafusion/sqllogictest/test_files/spark/string/elt.slt
new file mode 100644
index 0000000000000..12917d17e1e47
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/elt.slt
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT elt(1, 'scala', 'java');
+## PySpark 3.5.5 Result: {'elt(1, scala, java)': 'scala', 'typeof(elt(1, scala, java))': 'string', 'typeof(1)': 'int', 'typeof(scala)': 'string', 'typeof(java)': 'string'}
+query T
+SELECT elt(1::int, 'scala'::string, 'java'::string);
+----
+scala
+
+## Original Query: SELECT elt(2, 'a', 1);
+## PySpark 3.5.5 Result: {'elt(2, a, 1)': '1', 'typeof(elt(2, a, 1))': 'string', 'typeof(2)': 'int', 'typeof(a)': 'string', 'typeof(1)': 'int'}
+query T
+SELECT elt(2::int, 'a'::string, 1::int);
+----
+1
+
+query T
+SELECT elt(11::int, 10, 20)
+----
+NULL
+
+query T
+SELECT elt(1::int, 10, 20)
+----
+10
+
+query T
+SELECT elt(1::int, null, 20)
+----
+NULL
+
+query T
+SELECT elt(1::int, 10, null)
+----
+10
+
+query T
+SELECT elt(1, 10, null)
+----
+10
diff --git a/datafusion/sqllogictest/test_files/spark/string/encode.slt b/datafusion/sqllogictest/test_files/spark/string/encode.slt
new file mode 100644
index 0000000000000..4ad02316f4f3f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/encode.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT encode('abc', 'utf-8');
+## PySpark 3.5.5 Result: {'encode(abc, utf-8)': bytearray(b'abc'), 'typeof(encode(abc, utf-8))': 'binary', 'typeof(abc)': 'string', 'typeof(utf-8)': 'string'}
+#query
+#SELECT encode('abc'::string, 'utf-8'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/endswith.slt b/datafusion/sqllogictest/test_files/spark/string/endswith.slt
new file mode 100644
index 0000000000000..35ada546f8bf4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/endswith.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT endswith('Spark SQL', 'SQL');
+## PySpark 3.5.5 Result: {'endswith(Spark SQL, SQL)': True, 'typeof(endswith(Spark SQL, SQL))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT endswith('Spark SQL'::string, 'SQL'::string);
+
+## Original Query: SELECT endswith('Spark SQL', 'Spark');
+## PySpark 3.5.5 Result: {'endswith(Spark SQL, Spark)': False, 'typeof(endswith(Spark SQL, Spark))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(Spark)': 'string'}
+#query
+#SELECT endswith('Spark SQL'::string, 'Spark'::string);
+
+## Original Query: SELECT endswith('Spark SQL', null);
+## PySpark 3.5.5 Result: {'endswith(Spark SQL, NULL)': None, 'typeof(endswith(Spark SQL, NULL))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(NULL)': 'void'}
+#query
+#SELECT endswith('Spark SQL'::string, NULL::void);
+
+## Original Query: SELECT endswith(x'537061726b2053514c', x'53514c');
+## PySpark 3.5.5 Result: {"endswith(X'537061726B2053514C', X'53514C')": True, "typeof(endswith(X'537061726B2053514C', X'53514C'))": 'boolean', "typeof(X'537061726B2053514C')": 'binary', "typeof(X'53514C')": 'binary'}
+#query
+#SELECT endswith(X'537061726B2053514C'::binary, X'53514C'::binary);
+
+## Original Query: SELECT endswith(x'537061726b2053514c', x'537061726b');
+## PySpark 3.5.5 Result: {"endswith(X'537061726B2053514C', X'537061726B')": False, "typeof(endswith(X'537061726B2053514C', X'537061726B'))": 'boolean', "typeof(X'537061726B2053514C')": 'binary', "typeof(X'537061726B')": 'binary'}
+#query
+#SELECT endswith(X'537061726B2053514C'::binary, X'537061726B'::binary);
diff --git a/datafusion/sqllogictest/test_files/spark/string/find_in_set.slt b/datafusion/sqllogictest/test_files/spark/string/find_in_set.slt
new file mode 100644
index 0000000000000..690d03ffa475f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/find_in_set.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT find_in_set('ab','abc,b,ab,c,def');
+## PySpark 3.5.5 Result: {'find_in_set(ab, abc,b,ab,c,def)': 3, 'typeof(find_in_set(ab, abc,b,ab,c,def))': 'int', 'typeof(ab)': 'string', 'typeof(abc,b,ab,c,def)': 'string'}
+#query
+#SELECT find_in_set('ab'::string, 'abc,b,ab,c,def'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/format_number.slt b/datafusion/sqllogictest/test_files/spark/string/format_number.slt
new file mode 100644
index 0000000000000..a56b8d004c912
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/format_number.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT format_number(12332.123456, '##################.###');
+## PySpark 3.5.5 Result: {'format_number(12332.123456, ##################.###)': '12332.123', 'typeof(format_number(12332.123456, ##################.###))': 'string', 'typeof(12332.123456)': 'decimal(11,6)', 'typeof(##################.###)': 'string'}
+#query
+#SELECT format_number(12332.123456::decimal(11,6), '##################.###'::string);
+
+## Original Query: SELECT format_number(12332.123456, 4);
+## PySpark 3.5.5 Result: {'format_number(12332.123456, 4)': '12,332.1235', 'typeof(format_number(12332.123456, 4))': 'string', 'typeof(12332.123456)': 'decimal(11,6)', 'typeof(4)': 'int'}
+#query
+#SELECT format_number(12332.123456::decimal(11,6), 4::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/format_string.slt b/datafusion/sqllogictest/test_files/spark/string/format_string.slt
new file mode 100644
index 0000000000000..8ba3cfc951cdc
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/format_string.slt
@@ -0,0 +1,2315 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+# ================================
+# Basic format_string tests
+# ================================
+
+## Basic string formatting
+query T
+SELECT format_string('Hello World %s', 'DataFusion');
+----
+Hello World DataFusion
+
+query T
+SELECT format_string(arrow_cast('Hello World %s', 'LargeUtf8'), 'DataFusion');
+----
+Hello World DataFusion
+
+query T
+SELECT format_string(arrow_cast('Hello World %s', 'Utf8View'), 'DataFusion');
+----
+Hello World DataFusion
+
+## Basic integer formatting
+query T
+SELECT format_string('Hello World %d %s', 100, 'days');
+----
+Hello World 100 days
+
+## Multiple string arguments
+query T
+SELECT format_string('%s %s %s', 'Hello', 'World', 'Test');
+----
+Hello World Test
+
+## Format without arguments (just return the format string)
+query T
+SELECT format_string('Hello World');
+----
+Hello World
+
+# ================================
+# Integer formatting tests
+# ================================
+
+## Decimal integer formatting
+query T
+SELECT format_string('Value: %d', 42);
+----
+Value: 42
+
+## Hexadecimal integer formatting (lowercase)
+query T
+SELECT format_string('Hex: %x', 255);
+----
+Hex: ff
+
+## Hexadecimal integer formatting (uppercase)
+query T
+SELECT format_string('Hex: %X', 255);
+----
+Hex: FF
+
+## Octal integer formatting
+query T
+SELECT format_string('Octal: %o', 64);
+----
+Octal: 100
+
+## Integer with width padding
+query T
+SELECT format_string('Padded: %5d', 42);
+----
+Padded:    42
+
+## Integer with zero padding
+query T
+SELECT format_string('Zero padded: %05d', 42);
+----
+Zero padded: 00042
+
+## Left-aligned integer
+query T
+SELECT format_string('Left: %-5d|', 42);
+----
+Left: 42   |
+
+## Integer with force sign
+query T
+SELECT format_string('Signed: %+d', 42);
+----
+Signed: +42
+
+## Negative integer
+query T
+SELECT format_string('Negative: %d', -42);
+----
+Negative: -42
+
+# ================================
+# Float formatting tests
+# ================================
+
+## Basic float formatting
+query T
+SELECT format_string('Float: %f', 3.14159);
+----
+Float: 3.141590
+
+query T
+SELECT format_string('Float: %f', 30.0);
+----
+Float: 30.000000
+
+## Float with precision
+query T
+SELECT format_string('Precision: %.2f', 3.14159);
+----
+Precision: 3.14
+
+## Scientific notation (lowercase)
+query T
+SELECT format_string('Scientific: %e', 1234.5);
+----
+Scientific: 1.234500e+03
+
+## Scientific notation (uppercase)
+query T
+SELECT format_string('Scientific: %E', 1234.5);
+----
+Scientific: 1.234500E+03
+
+## Compact float (lowercase)
+query T
+SELECT format_string('Compact: %g', 1234.5);
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('Compact: %g', CAST(123456789.1 AS DOUBLE));
+----
+Compact: 1.23457e+08
+
+## Compact float (uppercase)
+query T
+SELECT format_string('Compact: %G', 1234.5);
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('Compact: %G', CAST(123456789.1 AS DOUBLE));
+----
+Compact: 1.23457E+08
+
+## Float with width and precision
+query T
+SELECT format_string('Formatted: %10.2f', 3.14159);
+----
+Formatted:       3.14
+
+## Float zero padding
+query T
+SELECT format_string('Zero: %08.2f', 3.14);
+----
+Zero: 00003.14
+
+## Float with left alignment
+query T
+SELECT format_string('Left: %-10.2f|', 3.14);
+----
+Left: 3.14      |
+
+## Float with space sign (positive)
+query T
+SELECT format_string('Space: % .2f', 3.14);
+----
+Space:  3.14
+
+## Float with space sign (negative)
+query T
+SELECT format_string('Space: % .2f', -3.14);
+----
+Space: -3.14
+
+## Float with force sign (positive)
+query T
+SELECT format_string('Force: %+.2f', 3.14);
+----
+Force: +3.14
+
+## Float with force sign (negative)
+query T
+SELECT format_string('Force: %+.2f', -3.14);
+----
+Force: -3.14
+
+## Float with precision 0
+query T
+SELECT format_string('Precision 0: %.0f', 3.14);
+----
+Precision 0: 3
+
+## Float with precision 0 (rounds up)
+query T
+SELECT format_string('Precision 0: %.0f', 3.6);
+----
+Precision 0: 4
+
+## Float with precision 0 and alternate form
+query T
+SELECT format_string('Alt form: %#.0f', 3.14);
+----
+Alt form: 3.
+
+## Scientific notation with precision 0
+query T
+SELECT format_string('Sci: %.0e', 1234.5);
+----
+Sci: 1e+03
+
+## Compact format with precision 0
+query T
+SELECT format_string('Compact: %.0g', 1234.5);
+----
+Compact: 1e+03
+
+# ================================
+# Boolean formatting tests
+# ================================
+
+## Boolean lowercase
+query T
+SELECT format_string('Bool: %b', true);
+----
+Bool: true
+
+## Boolean uppercase
+query T
+SELECT format_string('Bool: %B', false);
+----
+Bool: FALSE
+
+## Boolean with width
+query T
+SELECT format_string('Bool: %6b', true);
+----
+Bool:   true
+
+## Boolean with invalid ARGUMENT
+statement error
+SELECT format_string('Bool: %6b', 1)
+
+# ================================
+# String formatting tests
+# ================================
+
+## String formatting
+query T
+SELECT format_string('String: %s', 'DataFusion');
+----
+String: DataFusion
+
+## String with width
+query T
+SELECT format_string('Padded: %10s|', 'test');
+----
+Padded:       test|
+
+## String left-aligned
+query T
+SELECT format_string('Left: %-10s|', 'test');
+----
+Left: test      |
+
+## String with precision (truncation)
+query T
+SELECT format_string('Truncated: %.3s', 'DataFusion');
+----
+Truncated: Dat
+
+## String uppercase conversion
+query T
+SELECT format_string('Upper: %S', 'datafusion');
+----
+Upper: DATAFUSION
+
+# ================================
+# Character formatting tests
+# ================================
+
+## Character formatting from integer
+query T
+SELECT format_string('Char: %c', 97);
+----
+Char: a
+
+## Character uppercase
+query T
+SELECT format_string('Char: %C', 97);
+----
+Char: A
+
+## Character with width padding
+query T
+SELECT format_string('Char: %5c', 65);
+----
+Char:     A
+
+## Character with left alignment
+query T
+SELECT format_string('Char: %-5c|', 65);
+----
+Char: A    |
+
+## Character uppercase with width
+query T
+SELECT format_string('Char: %5C', 97);
+----
+Char:     A
+
+## Character uppercase with left alignment
+query T
+SELECT format_string('Char: %-5C|', 97);
+----
+Char: A    |
+
+## Character with invalid ARGUMENT
+statement error
+SELECT format_string('Char: %5c', true);
+
+# ================================
+# Time formatting tests
+# ================================
+
+## Hour formatting (24-hour)
+query T
+SELECT format_string('Hour: %tH', TIMESTAMP '2023-12-25 14:30:45');
+----
+Hour: 14
+
+## Hour formatting (12-hour)
+query T
+SELECT format_string('Hour: %tI', TIMESTAMP '2023-12-25 14:30:45');
+----
+Hour: 02
+
+## Minute formatting
+query T
+SELECT format_string('Minute: %tM', TIMESTAMP '2023-12-25 14:30:45');
+----
+Minute: 30
+
+## Second formatting
+query T
+SELECT format_string('Second: %tS', TIMESTAMP '2023-12-25 14:30:45');
+----
+Second: 45
+
+## AM/PM marker
+query T
+SELECT format_string('AM/PM: %tp', TIMESTAMP '2023-12-25 14:30:45');
+----
+AM/PM: pm
+
+## AM/PM marker uppercase
+query T
+SELECT format_string('AM/PM: %Tp', TIMESTAMP '2023-12-25 14:30:45');
+----
+AM/PM: PM
+
+## AM/PM marker uppercase (morning)
+query T
+SELECT format_string('AM/PM: %Tp', TIMESTAMP '2023-12-25 09:30:45');
+----
+AM/PM: AM
+
+## Year formatting
+query T
+SELECT format_string('Year: %tY', TIMESTAMP '2023-12-25 14:30:45');
+----
+Year: 2023
+
+## Year formatting uppercase
+query T
+SELECT format_string('Year: %TY', TIMESTAMP '2023-12-25 14:30:45');
+----
+Year: 2023
+
+## Month formatting
+query T
+SELECT format_string('Month: %tm', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: 12
+
+## Day formatting
+query T
+SELECT format_string('Day: %td', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day: 25
+
+## Time formatting (HH:MM)
+query T
+SELECT format_string('Time: %tR', TIMESTAMP '2023-12-25 14:30:45');
+----
+Time: 14:30
+
+## Time formatting (HH:MM:SS)
+query T
+SELECT format_string('Time: %tT', TIMESTAMP '2023-12-25 14:30:45');
+----
+Time: 14:30:45
+
+## Date formatting (MM/DD/YY)
+query T
+SELECT format_string('Date: %tD', TIMESTAMP '2023-12-25 14:30:45');
+----
+Date: 12/25/23
+
+## ISO date formatting (YYYY-MM-DD)
+query T
+SELECT format_string('ISO Date: %tF', TIMESTAMP '2023-12-25 14:30:45');
+----
+ISO Date: 2023-12-25
+
+## Complex date formatting (Sun Jul 20 16:17:00 EDT 1969)
+query B
+SELECT format_string('Date: %tc', TIMESTAMP '1969-07-20 16:17:00') LIKE 'Date: Sun Jul 20 16:17:00 % 1969';
+----
+true
+
+
+## Hour formatting (24-hour no padding)
+query T
+SELECT format_string('Hour: %tk', TIMESTAMP '2023-12-25 04:30:45');
+----
+Hour: 4
+
+## Hour formatting (12-hour no padding)
+query T
+SELECT format_string('Hour: %tl', TIMESTAMP '2023-12-25 14:30:45');
+----
+Hour: 2
+
+## Milliseconds formatting
+query T
+SELECT format_string('Milliseconds: %tL', TIMESTAMP '2023-12-25 14:30:45.123');
+----
+Milliseconds: 123
+
+## Nanoseconds formatting
+query T
+SELECT format_string('Nanoseconds: %tN', TIMESTAMP '2023-12-25 14:30:45.123456789');
+----
+Nanoseconds: 123456789
+
+## Timezone offset (RFC 822)
+query T
+SELECT format_string('Timezone: %tz', TIMESTAMP '2023-12-25 14:30:45');
+----
+Timezone: +0000
+
+## Timezone abbreviation
+query T
+SELECT format_string('Timezone: %tZ', from_unixtime(1599572549, 'America/New_York'));
+----
+Timezone: UTC
+
+## Seconds since epoch
+query T
+SELECT format_string('Epoch seconds: %ts', TIMESTAMP '1970-01-01 00:00:01');
+----
+Epoch seconds: 1
+
+## Milliseconds since epoch
+query T
+SELECT format_string('Epoch millis: %tQ', TIMESTAMP '1970-01-01 00:00:01');
+----
+Epoch millis: 1000
+
+## Full month name
+query T
+SELECT format_string('Month: %tB', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: December
+
+## Full month name uppercase
+query T
+SELECT format_string('Month: %TB', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: DECEMBER
+
+## Abbreviated month name
+query T
+SELECT format_string('Month: %tb', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: Dec
+
+## Abbreviated month name uppercase
+query T
+SELECT format_string('Month: %Tb', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: DEC
+
+## Same as %tb
+query T
+SELECT format_string('Month: %th', TIMESTAMP '2023-12-25 14:30:45');
+----
+Month: Dec
+
+## Full day of week
+query T
+SELECT format_string('Day: %tA', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day: Monday
+
+## Full day of week uppercase
+query T
+SELECT format_string('Day: %TA', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day: MONDAY
+
+## Abbreviated day of week
+query T
+SELECT format_string('Day: %ta', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day: Mon
+
+## Abbreviated day of week uppercase
+query T
+SELECT format_string('Day: %Ta', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day: MON
+
+## Century (year/100)
+query T
+SELECT format_string('Century: %tC', TIMESTAMP '2023-12-25 14:30:45');
+----
+Century: 20
+
+## Two-digit year
+query T
+SELECT format_string('Year: %ty', TIMESTAMP '2023-12-25 14:30:45');
+----
+Year: 23
+
+## Day of year
+query T
+SELECT format_string('Day of year: %tj', TIMESTAMP '2023-12-25 14:30:45');
+----
+Day of year: 359
+
+## Day of month (no padding)
+query T
+SELECT format_string('Day: %te', TIMESTAMP '2023-12-05 14:30:45');
+----
+Day: 5
+
+## 12-hour time with AM/PM
+query T
+SELECT format_string('Time: %tr', TIMESTAMP '2023-12-25 14:30:45');
+----
+Time: 02:30:45 PM
+
+statement error
+SELECT format_string('Time: %t', TIMESTAMP '2023-12-25 14:30:45');
+
+statement error
+SELECT format_string('Time: %T', TIMESTAMP '2023-12-25 14:30:45');
+
+
+statement error
+SELECT format_string('Time: %tx', TIMESTAMP '2023-12-25 14:30:45');
+
+statement error
+SELECT format_string('Time: %Tx', TIMESTAMP '2023-12-25 14:30:45');
+
+
+
+# ================================
+# Decimal formatting tests
+# ================================
+
+## Decimal formatting
+query T
+SELECT format_string('Decimal: %f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Decimal: 123.456000
+
+## Decimal with precision
+query T
+SELECT format_string('Decimal: %.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Decimal: 123.46
+
+## Decimal scientific notation
+query T
+SELECT format_string('Scientific: %e', CAST(1234.5 AS DECIMAL(10,2)));
+----
+Scientific: 1.234500e+03
+
+## Decimal with width padding
+query T
+SELECT format_string('Padded: %10.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Padded:     123.46
+
+## Decimal with zero padding
+query T
+SELECT format_string('Zero padded: %010.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Zero padded: 0000123.46
+
+## Decimal with left adjustment
+query T
+SELECT format_string('Left: %-10.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Left: 123.46
+
+## Decimal with plus sign
+query T
+SELECT format_string('Plus: %+.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Plus: +123.46
+
+## Decimal with space sign
+query T
+SELECT format_string('Space: % .2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Space:  123.46
+
+## Negative decimal with plus sign
+query T
+SELECT format_string('Negative: %+.2f', CAST(-123.456 AS DECIMAL(10,3)));
+----
+Negative: -123.46
+
+## Negative decimal with space sign
+query T
+SELECT format_string('Negative: % .2f', CAST(-123.456 AS DECIMAL(10,3)));
+----
+Negative: -123.46
+
+## Decimal with width and plus sign
+query T
+SELECT format_string('Width+Plus: %+10.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Width+Plus:    +123.46
+
+## Decimal with zero padding and plus sign
+query T
+SELECT format_string('Zero+Plus: %+010.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Zero+Plus: +000123.46
+
+## Decimal with left adjustment and plus sign
+query T
+SELECT format_string('Left+Plus: %-+10.2f', CAST(123.456 AS DECIMAL(10,3)));
+----
+Left+Plus: +123.46
+
+## Decimal scientific notation with width
+query T
+SELECT format_string('Sci Width: %15.2e', CAST(1234.5 AS DECIMAL(10,2)));
+----
+Sci Width:        1.23e+03
+
+## Decimal scientific notation with zero padding
+query T
+SELECT format_string('Sci Zero: %015.2e', CAST(1234.5 AS DECIMAL(10,2)));
+----
+Sci Zero: 00000001.23e+03
+
+## Decimal scientific notation with plus sign
+query T
+SELECT format_string('Sci Plus: %+.2e', CAST(1234.5 AS DECIMAL(10,2)));
+----
+Sci Plus: +1.23e+03
+
+## Decimal compact format with width
+query T
+SELECT format_string('Compact: %10.2g', CAST(123.456 AS DECIMAL(10,3)));
+----
+Compact:    1.2e+02
+
+## Decimal compact format with plus sign
+query T
+SELECT format_string('Compact+: %+.2g', CAST(123.456 AS DECIMAL(10,3)));
+----
+Compact+: +1.2e+02
+
+statement error
+SELECT format_string('Compact+: %+.2g', 1);
+
+# ================================
+# Special cases and edge cases
+# ================================
+
+## Literal percent sign
+query T
+SELECT format_string('Percent: %%');
+----
+Percent: %
+
+## Newline character
+query T
+SELECT format_string('Line1%nLine2');
+----
+ 
+01)Line1
+02)Line2
+
+## Multiple format specifiers
+query T
+SELECT format_string('String: %s, Integer: %d, Float: %.2f', 'test', 42, 3.14159);
+----
+String: test, Integer: 42, Float: 3.14
+
+## Mixed width and precision
+query T
+SELECT format_string('Mixed: %10s %5d %.2f', 'hello', 123, 45.678);
+----
+Mixed:      hello   123 45.68
+
+# ================================
+# NULL handling tests
+# ================================
+
+## NULL format string
+query T
+SELECT format_string(NULL, 'test');
+----
+NULL
+
+query T
+SELECT format_string(arrow_cast(NULL, 'Utf8'), 'test');
+----
+NULL
+
+query T
+SELECT format_string(arrow_cast(NULL, 'LargeUtf8'), 'test');
+----
+NULL
+
+query T
+SELECT format_string(arrow_cast(NULL, 'Utf8View'), 'test');
+----
+NULL
+
+## NULL argument with string format
+query T
+SELECT format_string('Value: %s', NULL);
+----
+Value: null
+
+## NULL with string format (uppercase)
+query T
+SELECT format_string('Upper: %S', NULL);
+----
+Upper: NULL
+
+
+## NULL argument with string format
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Utf8'));
+----
+Value: null
+
+## NULL with string format (uppercase)
+query T
+SELECT format_string('Upper: %S', arrow_cast(NULL, 'Utf8'));
+----
+Upper: NULL
+
+## NULL argument with string format
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'LargeUtf8'));
+----
+Value: null
+
+## NULL with string format (uppercase)
+query T
+SELECT format_string('Upper: %S', arrow_cast(NULL, 'LargeUtf8'));
+----
+Upper: NULL
+
+## NULL argument with string format
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Utf8View'));
+----
+Value: null
+
+## NULL with string format (uppercase)
+query T
+SELECT format_string('Upper: %S', arrow_cast(NULL, 'Utf8View'));
+----
+Upper: NULL
+
+## NULL with integer format using arrow_cast
+query T
+SELECT format_string('Integer: %d', arrow_cast(NULL, 'Int32'));
+----
+Integer: null
+
+## NULL with hex format (lowercase) using arrow_cast
+query T
+SELECT format_string('Hex: %x', arrow_cast(NULL, 'Int32'));
+----
+Hex: null
+
+## NULL with hex format (uppercase) using arrow_cast
+query T
+SELECT format_string('Hex: %X', arrow_cast(NULL, 'Int32'));
+----
+Hex: NULL
+
+## NULL with octal format using arrow_cast
+query T
+SELECT format_string('Octal: %o', arrow_cast(NULL, 'Int32'));
+----
+Octal: null
+
+## NULL with float format using arrow_cast
+query T
+SELECT format_string('Float: %f', arrow_cast(NULL, 'Float64'));
+----
+Float: null
+
+## NULL with float and precision using arrow_cast
+query T
+SELECT format_string('Float: %.2f', arrow_cast(NULL, 'Float64'));
+----
+Float: nu
+
+## NULL with scientific notation (lowercase) using arrow_cast
+query T
+SELECT format_string('Scientific: %e', arrow_cast(NULL, 'Float64'));
+----
+Scientific: null
+
+## NULL with scientific notation (uppercase) using arrow_cast
+query T
+SELECT format_string('Scientific: %E', arrow_cast(NULL, 'Float64'));
+----
+Scientific: NULL
+
+## NULL with compact float (lowercase) using arrow_cast
+query T
+SELECT format_string('Compact: %g', arrow_cast(NULL, 'Float64'));
+----
+Compact: null
+
+## NULL with compact float and precision (lowercase) using arrow_cast
+query T
+SELECT format_string('Float: %.3g', arrow_cast(NULL, 'Float64'));
+----
+Float: nul
+
+## NULL with compact float (uppercase) using arrow_cast
+query T
+SELECT format_string('Compact: %G', arrow_cast(NULL, 'Float64'));
+----
+Compact: NULL
+
+## NULL with compact float and precision (uppercase) using arrow_cast
+query T
+SELECT format_string('Float: %.3G', arrow_cast(NULL, 'Float64'));
+----
+Float: NUL
+
+## NULL with hex float (lowercase) using arrow_cast
+query T
+SELECT format_string('Hex float: %a', arrow_cast(NULL, 'Float64'));
+----
+Hex float: null
+
+## NULL with hex float (uppercase) using arrow_cast
+query T
+SELECT format_string('Hex float: %A', arrow_cast(NULL, 'Float64'));
+----
+Hex float: NULL
+
+# ## NULL with float and precision using arrow_cast
+# query T
+# SELECT format_string('Float: %.2f', arrow_cast(NULL, 'Float16'));
+# ----
+# Float: nu
+
+## NULL with boolean format (lowercase) using arrow_cast
+query T
+SELECT format_string('Bool: %b', arrow_cast(NULL, 'Boolean'));
+----
+Bool: false
+
+## NULL with boolean format (uppercase) using arrow_cast
+query T
+SELECT format_string('Bool: %B', arrow_cast(NULL, 'Boolean'));
+----
+Bool: FALSE
+
+## NULL with character format (lowercase) using arrow_cast
+query T
+SELECT format_string('Char: %c', arrow_cast(NULL, 'Int32'));
+----
+Char: null
+
+## NULL with character format (uppercase) using arrow_cast
+query T
+SELECT format_string('Char: %C', arrow_cast(NULL, 'Int32'));
+----
+Char: NULL
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Hour: %tH', arrow_cast(NULL, 'Timestamp(ns)'));
+----
+Hour: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Timestamp(ns)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Time32(Second)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Time32(Millisecond)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Time64(Microsecond)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Time64(Nanosecond)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Timestamp(s)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Timestamp(ms)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Timestamp(µs)'));
+----
+Month: null
+
+## NULL with timestamp format using arrow_cast
+query T
+SELECT format_string('Month: %tB', arrow_cast(NULL, 'Timestamp(ns)'));
+----
+Month: null
+
+## NULL with decimal format using arrow_cast
+query T
+SELECT format_string('Decimal: %f', arrow_cast(NULL, 'Decimal128(10, 2)'));
+----
+Decimal: null
+
+## NULL Int8 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Int8'));
+----
+Value: null
+
+## NULL Int16 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Int16'));
+----
+Value: null
+
+## NULL Int64 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Int64'));
+----
+Value: null
+
+## NULL UInt8 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'UInt8'));
+----
+Value: null
+
+## NULL UInt16 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'UInt16'));
+----
+Value: null
+
+## NULL UInt32 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'UInt32'));
+----
+Value: null
+
+## NULL UInt64 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'UInt64'));
+----
+Value: null
+
+## NULL Float32 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Float32'));
+----
+Value: null
+
+## NULL Float64 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Float64'));
+----
+Value: null
+
+## NULL Timestamp with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Timestamp(ns)'));
+----
+Value: null
+
+## NULL Date32 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Date32'));
+----
+Value: null
+
+## NULL Date64 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Date64'));
+----
+Value: null
+
+## NULL Decimal128 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Decimal128(10, 2)'));
+----
+Value: null
+
+## NULL Decimal256 with string format using arrow_cast
+query T
+SELECT format_string('Value: %s', arrow_cast(NULL, 'Decimal256(20, 3)'));
+----
+Value: null
+
+
+# ================================
+# Error cases (should fail)
+# ================================
+
+## Format string expects arguments but none provided
+statement error
+SELECT format_string('Value: %d');
+
+statement error
+SELECT format_string(1);
+
+## Too few arguments for format specifiers
+statement error
+SELECT format_string('Values: %d %s', 42);
+
+## Invalid conversion for data type
+statement error
+SELECT format_string('Value: %d', 'not_a_number');
+
+statement error
+SELECT format_string('Value: %k', 'string');
+
+# ================================
+# Positional argument tests
+# ================================
+
+## Positional arguments
+query T
+SELECT format_string('%2$s %1$d', 42, 'test');
+----
+test 42
+
+## Reuse positional arguments
+query T
+SELECT format_string('%1$s %1$s', 'repeat');
+----
+repeat repeat
+
+## Mixed positional and sequential
+query T
+SELECT format_string('%2$s %s %1$d', 42, 'middle', 'end');
+----
+middle 42 42
+
+statement error
+SELECT format_string('%$s', 'test');
+
+# ================================
+# Flag combination tests
+# ================================
+
+## Alternate form with hex
+query T
+SELECT format_string('Hex: %#x', 255);
+----
+Hex: 0xff
+
+## Alternate form with octal
+query T
+SELECT format_string('Octal: %#o', 64);
+----
+Octal: 0100
+
+## Space sign with positive number
+query T
+SELECT format_string('Space: % d', 42);
+----
+Space:  42
+
+## Grouping separator (if supported)
+query T
+SELECT format_string('Grouped: %,d', 1234567);
+----
+Grouped: 1,234,567
+
+## Parentheses for negative numbers
+query T
+SELECT format_string('Negative: %(d', -42);
+----
+Negative: (42)
+
+# ================================
+# Array/Column tests
+# ================================
+
+## Test with array values
+statement ok
+CREATE TABLE test_format(fmt STRING, val1 STRING, val2 INT) AS VALUES 
+  ('Hello %s %d', 'World', 1),
+  ('Float: %2$d %1$s', '3.14159', 2),
+  (NULL, '3.14159', 3);
+
+query T
+SELECT format_string(arrow_cast(fmt, 'Utf8'), val1, val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(arrow_cast(fmt, 'LargeUtf8'), val1, val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(arrow_cast(fmt, 'Utf8View'), val1, val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(fmt, arrow_cast(val1, 'LargeUtf8'), val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(fmt, arrow_cast(val1, 'Utf8'), val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(fmt, arrow_cast(val1, 'Utf8View'), val2) FROM test_format;
+----
+Hello World 1
+Float: 2 3.14159
+NULL
+
+query T
+SELECT format_string(arrow_cast('Hello %s %d', 'Utf8'), val1, val2) FROM test_format;
+----
+Hello World 1
+Hello 3.14159 2
+Hello 3.14159 3
+
+query T
+SELECT format_string(arrow_cast('Hello %s %d', 'LargeUtf8'), val1, val2) FROM test_format;
+----
+Hello World 1
+Hello 3.14159 2
+Hello 3.14159 3
+
+query T
+SELECT format_string(arrow_cast('Hello %s %d', 'Utf8View'), val1, val2) FROM test_format;
+----
+Hello World 1
+Hello 3.14159 2
+Hello 3.14159 3
+
+statement ok
+DROP TABLE test_format;
+
+# ================================
+# Type-specific conversion tests
+# ================================
+
+## Boolean with string formats
+query T
+SELECT format_string('Value: %s', arrow_cast(true, 'Boolean'));
+----
+Value: true
+
+query T
+SELECT format_string('Value: %S', arrow_cast(false, 'Boolean'));
+----
+Value: FALSE
+
+## Int8 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(127, 'Int8'));
+----
+Decimal: 127
+
+query T
+SELECT format_string('Hex: %x', arrow_cast(127, 'Int8'));
+----
+Hex: 7f
+
+query T
+SELECT format_string('Hex: %X', arrow_cast(127, 'Int8'));
+----
+Hex: 7F
+
+query T
+SELECT format_string('Octal: %o', arrow_cast(127, 'Int8'));
+----
+Octal: 177
+
+query T
+SELECT format_string('Char: %c', arrow_cast(65, 'Int8'));
+----
+Char: A
+
+query T
+SELECT format_string('Char: %C', arrow_cast(97, 'Int8'));
+----
+Char: A
+
+query T
+SELECT format_string('Char: %c', arrow_cast(65, 'UInt32'));
+----
+Char: A
+
+query T
+SELECT format_string('Char: %C', arrow_cast(97, 'UInt32'));
+----
+Char: A
+
+query T
+SELECT format_string('Char: %c', arrow_cast(65, 'UInt64'));
+----
+Char: A
+
+query T
+SELECT format_string('Char: %C', arrow_cast(97, 'UInt64'));
+----
+Char: A
+
+query T
+SELECT format_string('String: %s', arrow_cast(127, 'Int8'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %S', arrow_cast(127, 'Int8'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %s', arrow_cast(127, 'UInt8'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %S', arrow_cast(127, 'UInt8'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %s', arrow_cast(127, 'UInt16'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %S', arrow_cast(127, 'UInt16'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %s', arrow_cast(127, 'Int32'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %S', arrow_cast(127, 'Int32'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %s', arrow_cast(127, 'UInt64'));
+----
+String: 127
+
+query T
+SELECT format_string('String: %S', arrow_cast(127, 'UInt64'));
+----
+String: 127
+
+## Int16 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(32767, 'Int16'));
+----
+Decimal: 32767
+
+query T
+SELECT format_string('Hex: %x', arrow_cast(32767, 'Int16'));
+----
+Hex: 7fff
+
+query T
+SELECT format_string('Hex: %X', arrow_cast(32767, 'Int16'));
+----
+Hex: 7FFF
+
+query T
+SELECT format_string('Octal: %o', arrow_cast(32767, 'Int16'));
+----
+Octal: 77777
+
+query T
+SELECT format_string('Char: %c', arrow_cast(8364, 'Int16'));
+----
+Char: €
+
+query T
+SELECT format_string('String: %s', arrow_cast(32767, 'Int16'));
+----
+String: 32767
+
+query T
+SELECT format_string('NaN: %s', CAST('NaN' AS DOUBLE));
+----
+NaN: NaN
+
+query T
+SELECT format_string('Infinity: %s', CAST('+Inf' AS DOUBLE));
+----
+Infinity: Infinity
+
+query T
+SELECT format_string('Infinity: %s', CAST('-Inf' AS DOUBLE));
+----
+Infinity: -Infinity
+
+query T
+SELECT format_string('NaN: %S', CAST('NaN' AS DOUBLE));
+----
+NaN: NAN
+
+query T
+SELECT format_string('Infinity: %S', CAST('+Inf' AS DOUBLE));
+----
+Infinity: INFINITY
+
+query T
+SELECT format_string('Infinity: %S', CAST('-Inf' AS DOUBLE));
+----
+Infinity: -INFINITY
+
+## Int32 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(2147483647, 'Int32'));
+----
+Decimal: 2147483647
+
+query T
+SELECT format_string('Hex: %x', arrow_cast(255, 'Int32'));
+----
+Hex: ff
+
+query T
+SELECT format_string('Octal: %o', arrow_cast(511, 'Int32'));
+----
+Octal: 777
+
+query T
+SELECT format_string('Char: %c', arrow_cast(128512, 'Int32'));
+----
+Char: 😀
+
+## UInt8 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(255, 'UInt8'));
+----
+Decimal: 255
+
+query T
+SELECT format_string('Hex: %x', arrow_cast(255, 'UInt8'));
+----
+Hex: ff
+
+query T
+SELECT format_string('Octal: %o', arrow_cast(255, 'UInt8'));
+----
+Octal: 377
+
+query T
+SELECT format_string('Char: %c', arrow_cast(65, 'UInt8'));
+----
+Char: A
+
+## UInt16 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(65535, 'UInt16'));
+----
+Decimal: 65535
+
+query T
+SELECT format_string('Hex: %X', arrow_cast(65535, 'UInt16'));
+----
+Hex: FFFF
+
+query T
+SELECT format_string('Char: %c', arrow_cast(9733, 'UInt16'));
+----
+Char: ★
+
+## UInt32 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(4294967295, 'UInt32'));
+----
+Decimal: 4294967295
+
+query T
+SELECT format_string('Hex: %x', arrow_cast(4294967295, 'UInt32'));
+----
+Hex: ffffffff
+
+query T
+SELECT format_string('String: %s', arrow_cast(4294967295, 'UInt32'));
+----
+String: 4294967295
+
+## UInt64 with various formats
+query T
+SELECT format_string('Decimal: %d', arrow_cast(18446744073709551615, 'UInt64'));
+----
+Decimal: 18446744073709551615
+
+query T
+SELECT format_string('Hex: %X', arrow_cast(18446744073709551615, 'UInt64'));
+----
+Hex: FFFFFFFFFFFFFFFF
+
+## Float16 with various formats
+query T
+SELECT format_string('Float: %f', arrow_cast(3.14, 'Float16'));
+----
+Float: 3.140625
+
+query T
+SELECT format_string('Scientific: %e', arrow_cast(3.14, 'Float16'));
+----
+Scientific: 3.140625e+00
+
+query T
+SELECT format_string('Scientific: %E', arrow_cast(3.14, 'Float16'));
+----
+Scientific: 3.140625E+00
+
+query T
+SELECT format_string('Compact: %g', arrow_cast(3.14, 'Float16'));
+----
+Compact: 3.14063
+
+query T
+SELECT format_string('Compact: %G', arrow_cast(3.14, 'Float16'));
+----
+Compact: 3.14063
+
+query T
+SELECT format_string('String: %s', arrow_cast(3.14, 'Float16'));
+----
+String: 3.140625
+
+query T
+SELECT format_string('String: %S', arrow_cast(3.14, 'Float16'));
+----
+String: 3.140625
+
+query T
+SELECT format_string('Hex float: %a', arrow_cast(3.14, 'Float16'));
+----
+Hex float: 0x1.92p1
+
+query T
+SELECT format_string('Hex float: %A', arrow_cast(3.14, 'Float16'));
+----
+Hex float: 0X1.92P1
+
+## Float32 with various formats
+query T
+SELECT format_string('Float: %f', arrow_cast(3.14159, 'Float32'));
+----
+Float: 3.141590
+
+query T
+SELECT format_string('Scientific: %e', arrow_cast(1234.5, 'Float32'));
+----
+Scientific: 1.234500e+03
+
+query T
+SELECT format_string('Compact: %g', arrow_cast(1234.5, 'Float32'));
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('String: %s', arrow_cast(3.14159, 'Float32'));
+----
+String: 3.14159
+
+query T
+SELECT format_string('Hex float: %a', arrow_cast(3.14, 'Float32'));
+----
+Hex float: 0x1.91eb86p1
+
+query T
+SELECT format_string('Hex float: %A', arrow_cast(3.14, 'Float32'));
+----
+Hex float: 0X1.91EB86P1
+
+## Float64 with various formats
+
+query T
+SELECT format_string('String: %s', arrow_cast(3.14159, 'Float64'));
+----
+String: 3.14159
+
+query T
+SELECT format_string('String: %S', arrow_cast(3.14159, 'Float64'));
+----
+String: 3.14159
+
+## Decimal128 with various formats
+query T
+SELECT format_string('Float: %f', arrow_cast(123.456, 'Decimal128(10, 3)'));
+----
+Float: 123.456000
+
+query T
+SELECT format_string('Scientific: %e', arrow_cast(1234.5, 'Decimal128(10, 2)'));
+----
+Scientific: 1.234500e+03
+
+query T
+SELECT format_string('Scientific: %E', arrow_cast(1234.5, 'Decimal128(10, 2)'));
+----
+Scientific: 1.234500E+03
+
+query T
+SELECT format_string('Compact: %g', arrow_cast(1234.5, 'Decimal128(10, 2)'));
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('Compact: %G', arrow_cast(1234.5, 'Decimal128(10, 2)'));
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('String: %s', arrow_cast(123.456, 'Decimal128(10, 3)'));
+----
+String: 123456
+
+query T
+SELECT format_string('String: %S', arrow_cast(123.456, 'Decimal128(10, 3)'));
+----
+String: 123456
+
+## Decimal256 with various formats
+query T
+SELECT format_string('Float: %f', arrow_cast(123.456, 'Decimal256(20, 3)'));
+----
+Float: 123.456000
+
+query T
+SELECT format_string('Scientific: %e', arrow_cast(1234.5, 'Decimal256(20, 2)'));
+----
+Scientific: 1.234500e+03
+
+query T
+SELECT format_string('Compact: %g', arrow_cast(1234.5, 'Decimal256(20, 2)'));
+----
+Compact: 1234.5
+
+query T
+SELECT format_string('String: %s', arrow_cast(123.456, 'Decimal256(20, 3)'));
+----
+String: 123456
+
+## Time32Second with time formats
+query T
+SELECT format_string('Hour: %tH', arrow_cast(52245::int, 'Time32(Second)'));
+----
+Hour: 14
+
+query T
+SELECT format_string('Minute: %tM', arrow_cast(52245::int, 'Time32(Second)'));
+----
+Minute: 30
+
+query T
+SELECT format_string('String: %s', arrow_cast(52245::int, 'Time32(Second)'));
+----
+String: 52245
+
+query T
+SELECT format_string('String: %S', arrow_cast(52245::int, 'Time32(Second)'));
+----
+String: 52245
+
+## Time32Millisecond with time formats
+query T
+SELECT format_string('Hour: %tH', arrow_cast(52245000::int, 'Time32(Millisecond)'));
+----
+Hour: 14
+
+query T
+SELECT format_string('Second: %tS', arrow_cast(52245000::int, 'Time32(Millisecond)'));
+----
+Second: 45
+
+query T
+SELECT format_string('String: %s', arrow_cast(52245000::int, 'Time32(Millisecond)'));
+----
+String: 52245000
+
+## Time64Microsecond with time formats
+query T
+SELECT format_string('Hour: %tH', arrow_cast(52245000000, 'Time64(Microsecond)'));
+----
+Hour: 14
+
+query T
+SELECT format_string('Time: %tT', arrow_cast(52245000000, 'Time64(Microsecond)'));
+----
+Time: 14:30:45
+
+query T
+SELECT format_string('String: %s', arrow_cast(52245000000, 'Time64(Microsecond)'));
+----
+String: 52245000000
+
+## Time64Nanosecond with time formats
+query T
+SELECT format_string('Hour: %tH', arrow_cast(52245000000000, 'Time64(Nanosecond)'));
+----
+Hour: 14
+
+query T
+SELECT format_string('AM/PM: %tp', arrow_cast(52245000000000, 'Time64(Nanosecond)'));
+----
+AM/PM: pm
+
+query T
+SELECT format_string('String: %s', arrow_cast(52245000000000, 'Time64(Nanosecond)'));
+----
+String: 52245000000000
+
+## TimestampSecond with time formats
+query T
+SELECT format_string('Year: %tY', arrow_cast(1703512245, 'Timestamp(s)'));
+----
+Year: 2023
+
+query T
+SELECT format_string('Month: %tm', arrow_cast(1703512245, 'Timestamp(s)'));
+----
+Month: 12
+
+query T
+SELECT format_string('String: %s', arrow_cast(1703512245, 'Timestamp(s)'));
+----
+String: 1703512245
+
+query T
+SELECT format_string('String: %S', arrow_cast(1703512245, 'Timestamp(s)'));
+----
+String: 1703512245
+
+## TimestampMillisecond with time formats
+query T
+SELECT format_string('ISO Date: %tF', arrow_cast(1703512245000, 'Timestamp(ms)'));
+----
+ISO Date: 2023-12-25
+
+query T
+SELECT format_string('String: %s', arrow_cast(1703512245000, 'Timestamp(ms)'));
+----
+String: 1703512245000
+
+## TimestampMicrosecond with time formats
+query T
+SELECT format_string('Date: %tD', arrow_cast(1703512245000000, 'Timestamp(µs)'));
+----
+Date: 12/25/23
+
+query T
+SELECT format_string('String: %s', arrow_cast(1703512245000000, 'Timestamp(µs)'));
+----
+String: 1703512245000000
+
+query T
+SELECT format_string('String: %s', arrow_cast('2020-01-02 01:01:11.1234567890Z', 'Timestamp(ns)'));
+----
+String: 1577926871123456789
+
+## Date32 with time formats
+query T
+SELECT format_string('Year: %tY', arrow_cast(19716, 'Date32'));
+----
+Year: 2023
+
+query T
+SELECT format_string('Month: %tB', arrow_cast(19716, 'Date32'));
+----
+Month: December
+
+query T
+SELECT format_string('String: %s', arrow_cast(19716, 'Date32'));
+----
+String: 19716
+
+query T
+SELECT format_string('String: %S', arrow_cast(19716, 'Date32'));
+----
+String: 19716
+
+## Date64 with time formats
+query T
+SELECT format_string('Year: %tY', arrow_cast(19716, 'Date64'));
+----
+Year: 2023
+
+query T
+SELECT format_string('Month: %tB', arrow_cast(19716, 'Date64'));
+----
+Month: December
+
+query T
+SELECT format_string('String: %s', arrow_cast(19716, 'Date64'));
+----
+String: 19716
+
+query T
+SELECT format_string('String: %S', arrow_cast(19716, 'Date64'));
+----
+String: 19716
+
+## Date64 with invalid ARGUMENT
+statement error
+SELECT format_string('String: %tY', true);
+
+# ================================
+# General formatting tests (%h, %H)
+# ================================
+
+# Not implemented yet. Can be implemented after https://github.com/apache/datafusion/pull/17093 is merged
+## Hash value formatting (lowercase)
+statement error
+SELECT format_string('Hash: %h', 'test');
+# ----
+# Hash: ec06e15a
+
+## Hash value formatting (uppercase)
+statement error
+SELECT format_string('Hash: %H', 'test');
+# ----
+# Hash: EC06E15A
+
+## Hash with width
+statement error
+SELECT format_string('Hash: %10h', 'test');
+# ----
+# Hash:   ec06e15a
+
+# ================================
+# Hexadecimal floating point tests
+# ================================
+
+## Hexadecimal float (lowercase)
+query T
+SELECT format_string('Hex float: %a', 15.9375);
+----
+Hex float: 0x1.fep3
+
+## Hexadecimal float (uppercase)
+query T
+SELECT format_string('Hex float: %A', 15.9375);
+----
+Hex float: 0X1.FEP3
+
+## Hexadecimal float with precision
+query T
+SELECT format_string('Hex float: %.10a', 15.9375);
+----
+Hex float: 0x1.fe00000000p3
+
+query T
+SELECT format_string('%a', 12.3456);
+----
+0x1.8b0f27bb2fec5p3
+
+## Hexadecimal float with zero
+query T
+SELECT format_string('Hex float: %a', 0.0);
+----
+Hex float: 0x0.0p0
+
+## Hexadecimal float with negative value
+query T
+SELECT format_string('Hex float: %a', -15.9375);
+----
+Hex float: -0x1.fep3
+
+## Hexadecimal float with very small value
+query T
+SELECT format_string('Hex float: %a', 0.0000152587890625);
+----
+Hex float: 0x1.0p-16
+
+## Hexadecimal float with force sign
+query T
+SELECT format_string('Hex float: %+a', 15.9375);
+----
+Hex float: +0x1.fep3
+
+## Hexadecimal float with space sign (positive)
+query T
+SELECT format_string('Hex float: % a', 15.9375);
+----
+Hex float:  0x1.fep3
+
+## Hexadecimal float with space sign (negative)
+query T
+SELECT format_string('Hex float: % a', -15.9375);
+----
+Hex float: -0x1.fep3
+
+## Hexadecimal float uppercase with space sign
+query T
+SELECT format_string('Hex float: % A', 15.9375);
+----
+Hex float:  0X1.FEP3
+
+## Hexadecimal float with width
+query T
+SELECT format_string('Hex float: %20a', 15.9375);
+----
+Hex float:             0x1.fep3
+
+## Hexadecimal float with zero padding
+query T
+SELECT format_string('Hex float: %020a', 15.9375);
+----
+Hex float: 0x0000000000001.fep3
+
+## Hexadecimal float with alternate form and precision
+query T
+SELECT format_string('Hex float: %#.5a', 1.0);
+----
+Hex float: 0x1.00000p0
+
+## Hexadecimal float uppercase with force sign
+query T
+SELECT format_string('Hex float: %+A', -15.9375);
+----
+Hex float: -0X1.FEP3
+
+## Hexadecimal float with left alignment
+query T
+SELECT format_string('Hex float: %-20a', 15.9375);
+----
+Hex float: 0x1.fep3
+
+## Hexadecimal float with subnormal number (Float64)
+query T
+SELECT format_string('Hex float: %a', 2.2250738585072014e-308);
+----
+Hex float: 0x1.0p-1022
+
+## Hexadecimal float with smallest subnormal (Float64)
+query T
+SELECT format_string('Hex float: %a', 5.0e-324);
+----
+Hex float: 0x0.0000000000001p-1022
+
+## Hexadecimal float uppercase with subnormal
+query T
+SELECT format_string('Hex float: %A', 5.0e-324);
+----
+Hex float: 0X0.0000000000001P-1022
+
+## Hexadecimal float with subnormal and precision
+query T
+SELECT format_string('Hex float: %.20a', 2.2250738585072014e-308);
+----
+Hex float: 0x1.00000000000000000000p-1022
+
+## Hexadecimal float with negative subnormal
+query T
+SELECT format_string('Hex float: %a', -5.0e-324);
+----
+Hex float: -0x0.0000000000001p-1022
+
+## Hexadecimal float with subnormal and precision 5
+query T
+SELECT format_string('Hex float: %.5a', 5.0e-324);
+----
+Hex float: 0x1.00000p-1074
+
+## Hexadecimal float with subnormal and precision 10
+query T
+SELECT format_string('Hex float: %.10a', 5.0e-324);
+----
+Hex float: 0x1.0000000000p-1074
+
+## Hexadecimal float with subnormal and precision 13 (full)
+query T
+SELECT format_string('Hex float: %.13a', 5.0e-324);
+----
+Hex float: 0x0.0000000000001p-1022
+
+## Hexadecimal float with larger subnormal and precision
+query T
+SELECT format_string('Hex float: %.5a', 2.225e-308);
+----
+Hex float: 0x1.fffbap-1023
+
+## Hexadecimal float with subnormal and precision 0
+query T
+SELECT format_string('Hex float: %.0a', 5.0e-324);
+----
+Hex float: 0x1.0p-1074
+
+query T
+SELECT format_string('Hex float: %.2a', 5.0e-324);
+----
+Hex float: 0x1.00p-1074
+
+
+query T
+SELECT format_string('Hex float: %.2a', 5.0e-323);
+----
+Hex float: 0x1.40p-1071
+
+query T
+SELECT format_string('Hex float: %.0a', 5.0e-323);
+----
+Hex float: 0x1.4p-1071
+
+# ================================
+# Relative indexing tests
+# ================================
+
+## Relative indexing with <
+query T
+SELECT format_string('%s %<s %<s', 'repeat');
+----
+repeat repeat repeat
+
+## Mixed relative and positional indexing
+query T
+SELECT format_string('%2$s %<s %1$d', 42, 'test');
+----
+test test 42
+
+statement error
+SELECT format_string('%<s %<s', 'repeat');
+
+# ================================
+# Extended flag combination tests
+# ================================
+
+## Combine + and 0 flags
+query T
+SELECT format_string('Signed zero-padded: %+08d', 42);
+----
+Signed zero-padded: +0000042
+
+## Combine space and width
+query T
+SELECT format_string('Space padded: % 8d', 42);
+----
+Space padded:       42
+
+## Combine # flag with float
+query T
+SELECT format_string('Always decimal: %#.0f', 42.0);
+----
+Always decimal: 42.
+
+## Combine # flag with float
+query T
+SELECT format_string('Always decimal: %#.0f', 40.0);
+----
+Always decimal: 40.
+
+## Combine multiple flags with hex
+query T
+SELECT format_string('Hex: %#08x', 255);
+----
+Hex: 0x0000ff
+
+# ================================
+# Special numeric values
+# ================================
+
+## Float NaN
+query T
+SELECT format_string('NaN: %f', CAST('NaN' AS DOUBLE));
+----
+NaN: NaN
+
+## Float positive infinity
+query T
+SELECT format_string('Infinity: %f', CAST('+Inf' AS DOUBLE));
+----
+Infinity: Infinity
+
+## Float negative infinity
+query T
+SELECT format_string('Negative Infinity: %f', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -Infinity
+
+## Float negative infinity with parentheses
+query T
+SELECT format_string('Negative Infinity: %(f', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: (Infinity)
+
+## NaN with scientific notation (lowercase)
+query T
+SELECT format_string('NaN: %e', CAST('NaN' AS DOUBLE));
+----
+NaN: NaN
+
+## NaN with scientific notation (uppercase)
+query T
+SELECT format_string('NaN: %E', CAST('NaN' AS DOUBLE));
+----
+NaN: NAN
+
+## NaN with compact format (lowercase)
+query T
+SELECT format_string('NaN: %g', CAST('NaN' AS DOUBLE));
+----
+NaN: NaN
+
+## NaN with compact format (uppercase)
+query T
+SELECT format_string('NaN: %G', CAST('NaN' AS DOUBLE));
+----
+NaN: NAN
+
+## NaN with hex float (lowercase)
+query T
+SELECT format_string('NaN: %a', CAST('NaN' AS DOUBLE));
+----
+NaN: NaN
+
+## NaN with hex float (uppercase)
+query T
+SELECT format_string('NaN: %A', CAST('NaN' AS DOUBLE));
+----
+NaN: NAN
+
+## Positive infinity with scientific notation (lowercase)
+query T
+SELECT format_string('Infinity: %e', CAST('+Inf' AS DOUBLE));
+----
+Infinity: Infinity
+
+## Positive infinity with scientific notation (uppercase)
+query T
+SELECT format_string('Infinity: %E', CAST('+Inf' AS DOUBLE));
+----
+Infinity: INFINITY
+
+## Positive infinity with compact format (lowercase)
+query T
+SELECT format_string('Infinity: %g', CAST('+Inf' AS DOUBLE));
+----
+Infinity: Infinity
+
+## Positive infinity with compact format (uppercase)
+query T
+SELECT format_string('Infinity: %G', CAST('+Inf' AS DOUBLE));
+----
+Infinity: INFINITY
+
+## Positive infinity with hex float (lowercase)
+query T
+SELECT format_string('Infinity: %a', CAST('+Inf' AS DOUBLE));
+----
+Infinity: Infinity
+
+## Positive infinity with hex float (uppercase)
+query T
+SELECT format_string('Infinity: %A', CAST('+Inf' AS DOUBLE));
+----
+Infinity: INFINITY
+
+## Negative infinity with scientific notation (lowercase)
+query T
+SELECT format_string('Negative Infinity: %e', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -Infinity
+
+## Negative infinity with scientific notation (uppercase)
+query T
+SELECT format_string('Negative Infinity: %E', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -INFINITY
+
+## Negative infinity with compact format (lowercase)
+query T
+SELECT format_string('Negative Infinity: %g', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -Infinity
+
+## Negative infinity with compact format (uppercase)
+query T
+SELECT format_string('Negative Infinity: %G', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -INFINITY
+
+## Negative infinity with hex float (lowercase)
+query T
+SELECT format_string('Negative Infinity: %a', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -Infinity
+
+## Negative infinity with hex float (uppercase)
+query T
+SELECT format_string('Negative Infinity: %A', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: -INFINITY
+
+## Positive infinity with force sign
+query T
+SELECT format_string('Infinity: %+f', CAST('+Inf' AS DOUBLE));
+----
+Infinity: +Infinity
+
+## Negative infinity with scientific notation and parentheses
+query T
+SELECT format_string('Negative Infinity: %(e', CAST('-Inf' AS DOUBLE));
+----
+Negative Infinity: (Infinity)
+
+# ================================
+# Large number tests
+# ================================
+
+## Large integer
+query T
+SELECT format_string('Large: %d', 9223372036854775807);
+----
+Large: 9223372036854775807
+
+## Very large integer with grouping
+query T
+SELECT format_string('Grouped large: %,d', 9223372036854775807);
+----
+Grouped large: 9,223,372,036,854,775,807
+
+# ================================
+# Precision edge cases
+# ================================
+
+## Zero precision with decimal
+query T
+SELECT format_string('Zero precision: %.0f', 3.14159);
+----
+Zero precision: 3
+
+## High precision float
+query T
+SELECT format_string('High precision: %.10f', 3.14159);
+----
+High precision: 3.1415900000
+
+## Scientific notation with precision
+query T
+SELECT format_string('Scientific: %.3e', 1234.56789);
+----
+Scientific: 1.235e+03
+
+# ================================
+# Width edge cases
+# ================================
+
+## Very wide formatting
+query T
+SELECT format_string('Wide: %20s|', 'test');
+----
+Wide:                 test|
+
+## Wide with left alignment
+query T
+SELECT format_string('Wide left: %-20s|', 'test');
+----
+Wide left: test                |
+
+# ================================
+# Character encoding tests
+# ================================
+
+## Unicode character
+query T
+SELECT format_string('Unicode: %c', 8364);
+----
+Unicode: €
+
+## High Unicode codepoint
+query T
+SELECT format_string('Emoji: %c', 128512);
+----
+Emoji: 😀
+
+# ================================
+# Advanced error cases
+# ================================
+
+## Invalid flag combination (+ and space)
+statement error
+SELECT format_string('%+ d', 42);
+
+## Invalid flag combination (+ and space)
+statement error
+SELECT format_string('% +d', 42);
+
+## Invalid flag combination (- and 0)
+statement error
+SELECT format_string('%-0d', 42);
+
+## Width without value for line separator
+statement error
+SELECT format_string('%5n');
+
+# ================================
+# BigInteger-like tests
+# ================================
+
+## Very large number in different bases
+query T
+SELECT format_string('Large hex: %x', 9223372036854775807::BIGINT);
+----
+Large hex: 7fffffffffffffff
+
+## Large octal
+query T
+SELECT format_string('Large octal: %o', 9223372036854775807::BIGINT);
+----
+Large octal: 777777777777777777777
diff --git a/datafusion/sqllogictest/test_files/spark/string/initcap.slt b/datafusion/sqllogictest/test_files/spark/string/initcap.slt
new file mode 100644
index 0000000000000..eb4fe04632b43
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/initcap.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT initcap('sPark sql');
+## PySpark 3.5.5 Result: {'initcap(sPark sql)': 'Spark Sql', 'typeof(initcap(sPark sql))': 'string', 'typeof(sPark sql)': 'string'}
+#query
+#SELECT initcap('sPark sql'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/instr.slt b/datafusion/sqllogictest/test_files/spark/string/instr.slt
new file mode 100644
index 0000000000000..dd332ad4a4dc2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/instr.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT instr('SparkSQL', 'SQL');
+## PySpark 3.5.5 Result: {'instr(SparkSQL, SQL)': 6, 'typeof(instr(SparkSQL, SQL))': 'int', 'typeof(SparkSQL)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT instr('SparkSQL'::string, 'SQL'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/lcase.slt b/datafusion/sqllogictest/test_files/spark/string/lcase.slt
new file mode 100644
index 0000000000000..45093aae64822
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/lcase.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT lcase('SparkSql');
+## PySpark 3.5.5 Result: {'lcase(SparkSql)': 'sparksql', 'typeof(lcase(SparkSql))': 'string', 'typeof(SparkSql)': 'string'}
+#query
+#SELECT lcase('SparkSql'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/left.slt b/datafusion/sqllogictest/test_files/spark/string/left.slt
new file mode 100644
index 0000000000000..a6413b763397a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/left.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT left('Spark SQL', 3);
+## PySpark 3.5.5 Result: {'left(Spark SQL, 3)': 'Spa', 'typeof(left(Spark SQL, 3))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(3)': 'int'}
+#query
+#SELECT left('Spark SQL'::string, 3::int);
diff --git a/benchmarks/requirements.txt b/datafusion/sqllogictest/test_files/spark/string/len.slt
similarity index 89%
rename from benchmarks/requirements.txt
rename to datafusion/sqllogictest/test_files/spark/string/len.slt
index 20a5a2bddbf20..3dd359f4a7213 100644
--- a/benchmarks/requirements.txt
+++ b/datafusion/sqllogictest/test_files/spark/string/len.slt
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-#
+
 #   http://www.apache.org/licenses/LICENSE-2.0
-#
+
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -15,4 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
-rich
+query I
+SELECT len('Spark SQL ');
+----
+10
+
+query I
+SELECT len(x'537061726b2053514c');
+----
+9
diff --git a/docs/requirements.txt b/datafusion/sqllogictest/test_files/spark/string/length.slt
similarity index 88%
rename from docs/requirements.txt
rename to datafusion/sqllogictest/test_files/spark/string/length.slt
index bd030fb670446..be453b0c4ca78 100644
--- a/docs/requirements.txt
+++ b/datafusion/sqllogictest/test_files/spark/string/length.slt
@@ -5,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-#
+
 #   http://www.apache.org/licenses/LICENSE-2.0
-#
+
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -15,9 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
-sphinx
-pydata-sphinx-theme==0.8.0
-myst-parser
-maturin
-jinja2
-setuptools>=48.0.0
+query I
+SELECT length('Spark SQL ');
+----
+10
+
+query I
+SELECT length(x'537061726b2053514c');
+----
+9
diff --git a/datafusion/sqllogictest/test_files/spark/string/levenshtein.slt b/datafusion/sqllogictest/test_files/spark/string/levenshtein.slt
new file mode 100644
index 0000000000000..086f03642e574
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/levenshtein.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT levenshtein('kitten', 'sitting');
+## PySpark 3.5.5 Result: {'levenshtein(kitten, sitting)': 3, 'typeof(levenshtein(kitten, sitting))': 'int', 'typeof(kitten)': 'string', 'typeof(sitting)': 'string'}
+#query
+#SELECT levenshtein('kitten'::string, 'sitting'::string);
+
+## Original Query: SELECT levenshtein('kitten', 'sitting', 2);
+## PySpark 3.5.5 Result: {'levenshtein(kitten, sitting, 2)': -1, 'typeof(levenshtein(kitten, sitting, 2))': 'int', 'typeof(kitten)': 'string', 'typeof(sitting)': 'string', 'typeof(2)': 'int'}
+#query
+#SELECT levenshtein('kitten'::string, 'sitting'::string, 2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/locate.slt b/datafusion/sqllogictest/test_files/spark/string/locate.slt
new file mode 100644
index 0000000000000..369d505ac9c64
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/locate.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT locate('bar', 'foobarbar');
+## PySpark 3.5.5 Result: {'locate(bar, foobarbar, 1)': 4, 'typeof(locate(bar, foobarbar, 1))': 'int', 'typeof(bar)': 'string', 'typeof(foobarbar)': 'string'}
+#query
+#SELECT locate('bar'::string, 'foobarbar'::string);
+
+## Original Query: SELECT locate('bar', 'foobarbar', 5);
+## PySpark 3.5.5 Result: {'locate(bar, foobarbar, 5)': 7, 'typeof(locate(bar, foobarbar, 5))': 'int', 'typeof(bar)': 'string', 'typeof(foobarbar)': 'string', 'typeof(5)': 'int'}
+#query
+#SELECT locate('bar'::string, 'foobarbar'::string, 5::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/lower.slt b/datafusion/sqllogictest/test_files/spark/string/lower.slt
new file mode 100644
index 0000000000000..c59f380a99bc2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/lower.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT lower('SparkSql');
+## PySpark 3.5.5 Result: {'lower(SparkSql)': 'sparksql', 'typeof(lower(SparkSql))': 'string', 'typeof(SparkSql)': 'string'}
+#query
+#SELECT lower('SparkSql'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/lpad.slt b/datafusion/sqllogictest/test_files/spark/string/lpad.slt
new file mode 100644
index 0000000000000..858ae8e2215b8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/lpad.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT lpad('hi', 1, '??');
+## PySpark 3.5.5 Result: {'lpad(hi, 1, ??)': 'h', 'typeof(lpad(hi, 1, ??))': 'string', 'typeof(hi)': 'string', 'typeof(1)': 'int', 'typeof(??)': 'string'}
+#query
+#SELECT lpad('hi'::string, 1::int, '??'::string);
+
+## Original Query: SELECT lpad('hi', 5);
+## PySpark 3.5.5 Result: {'lpad(hi, 5,  )': '   hi', 'typeof(lpad(hi, 5,  ))': 'string', 'typeof(hi)': 'string', 'typeof(5)': 'int'}
+#query
+#SELECT lpad('hi'::string, 5::int);
+
+## Original Query: SELECT lpad('hi', 5, '??');
+## PySpark 3.5.5 Result: {'lpad(hi, 5, ??)': '???hi', 'typeof(lpad(hi, 5, ??))': 'string', 'typeof(hi)': 'string', 'typeof(5)': 'int', 'typeof(??)': 'string'}
+#query
+#SELECT lpad('hi'::string, 5::int, '??'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/ltrim.slt b/datafusion/sqllogictest/test_files/spark/string/ltrim.slt
new file mode 100644
index 0000000000000..a190eae9141b8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/ltrim.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ltrim('    SparkSQL   ');
+## PySpark 3.5.5 Result: {'ltrim(    SparkSQL   )': 'SparkSQL   ', 'typeof(ltrim(    SparkSQL   ))': 'string', 'typeof(    SparkSQL   )': 'string'}
+#query
+#SELECT ltrim('    SparkSQL   '::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/luhn_check.slt b/datafusion/sqllogictest/test_files/spark/string/luhn_check.slt
new file mode 100644
index 0000000000000..ccb17323b24dc
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/luhn_check.slt
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query B
+SELECT luhn_check('79927398713'::string);
+----
+true
+
+
+query B
+SELECT luhn_check('79927398714'::string);
+----
+false
+
+
+query B
+SELECT luhn_check('8112189876'::string);
+----
+true
+
+query B
+select luhn_check('4111111111111111'::string);
+----
+true
+
+query B
+select luhn_check('5500000000000004'::string);
+----
+true
+
+query B
+select luhn_check('340000000000009'::string);
+----
+true
+
+query B
+select luhn_check('6011000000000004'::string);
+----
+true
+
+
+query B
+select luhn_check('6011000000000005'::string);
+----
+false
+
+
+query B
+select luhn_check('378282246310006'::string);
+----
+false
+
+
+query B
+select luhn_check('0'::string);
+----
+true
+
+
+query B
+select luhn_check('79927398713'::string)
+----
+true
+
+query B
+select luhn_check('4417123456789113'::string)
+----
+true
+
+query B
+select luhn_check('7992 7398 714'::string)
+----
+false
+
+query B
+select luhn_check('79927398714'::string)
+----
+false
+
+query B
+select luhn_check('4111111111111111    '::string)
+----
+false
+
+
+query B
+select luhn_check('4111111 111111111'::string)
+----
+false
+
+query B
+select luhn_check(' 4111111111111111'::string)
+----
+false
+
+query B
+select luhn_check(''::string)
+----
+false
+
+query B
+select luhn_check('  ')
+----
+false
+
+
+query B
+select luhn_check('510B105105105106'::string)
+----
+false
+
+
+query B
+select luhn_check('ABCDED'::string)
+----
+false
+
+query B
+select luhn_check(null);
+----
+NULL
+
+query B
+select luhn_check(6011111111111117::BIGINT)
+----
+true
+
+
+query B
+select luhn_check(6011111111111118::BIGINT)
+----
+false
+
+
+query B
+select luhn_check(123.456::decimal(6,3))
+----
+false
+
+query B
+SELECT luhn_check(a) FROM (VALUES ('79927398713'::string), ('79927398714'::string)) AS t(a);
+----
+true
+false
diff --git a/datafusion/sqllogictest/test_files/spark/string/mask.slt b/datafusion/sqllogictest/test_files/spark/string/mask.slt
new file mode 100644
index 0000000000000..45ea093d36eae
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/mask.slt
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT mask('AbCD123-@$#');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, X, x, n, NULL)': 'XxXXnnn-@$#', 'typeof(mask(AbCD123-@$#, X, x, n, NULL))': 'string', 'typeof(AbCD123-@$#)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', 'Q');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, Q, x, n, NULL)': 'QxQQnnn-@$#', 'typeof(mask(AbCD123-@$#, Q, x, n, NULL))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(Q)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, 'Q'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', 'Q', 'q');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, Q, q, n, NULL)': 'QqQQnnn-@$#', 'typeof(mask(AbCD123-@$#, Q, q, n, NULL))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(Q)': 'string', 'typeof(q)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, 'Q'::string, 'q'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', 'Q', 'q', 'd');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, Q, q, d, NULL)': 'QqQQddd-@$#', 'typeof(mask(AbCD123-@$#, Q, q, d, NULL))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(Q)': 'string', 'typeof(q)': 'string', 'typeof(d)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, 'Q'::string, 'q'::string, 'd'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', 'Q', 'q', 'd', 'o');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, Q, q, d, o)': 'QqQQdddoooo', 'typeof(mask(AbCD123-@$#, Q, q, d, o))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(Q)': 'string', 'typeof(q)': 'string', 'typeof(d)': 'string', 'typeof(o)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, 'Q'::string, 'q'::string, 'd'::string, 'o'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', NULL, 'q', 'd', 'o');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, NULL, q, d, o)': 'AqCDdddoooo', 'typeof(mask(AbCD123-@$#, NULL, q, d, o))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(NULL)': 'void', 'typeof(q)': 'string', 'typeof(d)': 'string', 'typeof(o)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, NULL::void, 'q'::string, 'd'::string, 'o'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', NULL, NULL, 'd', 'o');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, NULL, NULL, d, o)': 'AbCDdddoooo', 'typeof(mask(AbCD123-@$#, NULL, NULL, d, o))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(NULL)': 'void', 'typeof(d)': 'string', 'typeof(o)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, NULL::void, 'd'::string, 'o'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', NULL, NULL, NULL, 'o');
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, NULL, NULL, NULL, o)': 'AbCD123oooo', 'typeof(mask(AbCD123-@$#, NULL, NULL, NULL, o))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(NULL)': 'void', 'typeof(o)': 'string'}
+#query
+#SELECT mask('AbCD123-@$#'::string, NULL::void, 'o'::string);
+
+## Original Query: SELECT mask('AbCD123-@$#', NULL, NULL, NULL, NULL);
+## PySpark 3.5.5 Result: {'mask(AbCD123-@$#, NULL, NULL, NULL, NULL)': 'AbCD123-@$#', 'typeof(mask(AbCD123-@$#, NULL, NULL, NULL, NULL))': 'string', 'typeof(AbCD123-@$#)': 'string', 'typeof(NULL)': 'void'}
+#query
+#SELECT mask('AbCD123-@$#'::string, NULL::void);
+
+## Original Query: SELECT mask('abcd-EFGH-8765-4321');
+## PySpark 3.5.5 Result: {'mask(abcd-EFGH-8765-4321, X, x, n, NULL)': 'xxxx-XXXX-nnnn-nnnn', 'typeof(mask(abcd-EFGH-8765-4321, X, x, n, NULL))': 'string', 'typeof(abcd-EFGH-8765-4321)': 'string'}
+#query
+#SELECT mask('abcd-EFGH-8765-4321'::string);
+
+## Original Query: SELECT mask('abcd-EFGH-8765-4321', 'Q');
+## PySpark 3.5.5 Result: {'mask(abcd-EFGH-8765-4321, Q, x, n, NULL)': 'xxxx-QQQQ-nnnn-nnnn', 'typeof(mask(abcd-EFGH-8765-4321, Q, x, n, NULL))': 'string', 'typeof(abcd-EFGH-8765-4321)': 'string', 'typeof(Q)': 'string'}
+#query
+#SELECT mask('abcd-EFGH-8765-4321'::string, 'Q'::string);
+
+## Original Query: SELECT mask(NULL);
+## PySpark 3.5.5 Result: {'mask(NULL, X, x, n, NULL)': None, 'typeof(mask(NULL, X, x, n, NULL))': 'string', 'typeof(NULL)': 'void'}
+#query
+#SELECT mask(NULL::void);
+
+## Original Query: SELECT mask(NULL, NULL, NULL, NULL, 'o');
+## PySpark 3.5.5 Result: {'mask(NULL, NULL, NULL, NULL, o)': None, 'typeof(mask(NULL, NULL, NULL, NULL, o))': 'string', 'typeof(NULL)': 'void', 'typeof(o)': 'string'}
+#query
+#SELECT mask(NULL::void, 'o'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/octet_length.slt b/datafusion/sqllogictest/test_files/spark/string/octet_length.slt
new file mode 100644
index 0000000000000..5042581aef2e3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/octet_length.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT octet_length('Spark SQL');
+## PySpark 3.5.5 Result: {'octet_length(Spark SQL)': 9, 'typeof(octet_length(Spark SQL))': 'int', 'typeof(Spark SQL)': 'string'}
+#query
+#SELECT octet_length('Spark SQL'::string);
+
+## Original Query: SELECT octet_length(x'537061726b2053514c');
+## PySpark 3.5.5 Result: {"octet_length(X'537061726B2053514C')": 9, "typeof(octet_length(X'537061726B2053514C'))": 'int', "typeof(X'537061726B2053514C')": 'binary'}
+#query
+#SELECT octet_length(X'537061726B2053514C'::binary);
diff --git a/datafusion/sqllogictest/test_files/spark/string/position.slt b/datafusion/sqllogictest/test_files/spark/string/position.slt
new file mode 100644
index 0000000000000..76ce29e4706dd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/position.slt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT position('bar', 'foobarbar');
+## PySpark 3.5.5 Result: {'position(bar, foobarbar, 1)': 4, 'typeof(position(bar, foobarbar, 1))': 'int', 'typeof(bar)': 'string', 'typeof(foobarbar)': 'string'}
+#query
+#SELECT position('bar'::string, 'foobarbar'::string);
+
+## Original Query: SELECT position('bar', 'foobarbar', 5);
+## PySpark 3.5.5 Result: {'position(bar, foobarbar, 5)': 7, 'typeof(position(bar, foobarbar, 5))': 'int', 'typeof(bar)': 'string', 'typeof(foobarbar)': 'string', 'typeof(5)': 'int'}
+#query
+#SELECT position('bar'::string, 'foobarbar'::string, 5::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/printf.slt b/datafusion/sqllogictest/test_files/spark/string/printf.slt
new file mode 100644
index 0000000000000..f248c4434b59c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/printf.slt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT printf("Hello World %d %s", 100, "days");
+## PySpark 3.5.5 Result: {'printf(Hello World %d %s, 100, days)': 'Hello World 100 days', 'typeof(printf(Hello World %d %s, 100, days))': 'string', 'typeof(Hello World %d %s)': 'string', 'typeof(100)': 'int', 'typeof(days)': 'string'}
+
+query T
+SELECT printf('Hello World %d %s'::string, 100::int, 'days'::string);
+----
+Hello World 100 days
diff --git a/datafusion/sqllogictest/test_files/spark/string/repeat.slt b/datafusion/sqllogictest/test_files/spark/string/repeat.slt
new file mode 100644
index 0000000000000..5ca4166f9f4e0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/repeat.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT repeat('123', 2);
+## PySpark 3.5.5 Result: {'repeat(123, 2)': '123123', 'typeof(repeat(123, 2))': 'string', 'typeof(123)': 'string', 'typeof(2)': 'int'}
+#query
+#SELECT repeat('123'::string, 2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/replace.slt b/datafusion/sqllogictest/test_files/spark/string/replace.slt
new file mode 100644
index 0000000000000..a5430febb7cf4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/replace.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT replace('ABCabc', 'abc', 'DEF');
+## PySpark 3.5.5 Result: {'replace(ABCabc, abc, DEF)': 'ABCDEF', 'typeof(replace(ABCabc, abc, DEF))': 'string', 'typeof(ABCabc)': 'string', 'typeof(abc)': 'string', 'typeof(DEF)': 'string'}
+#query
+#SELECT replace('ABCabc'::string, 'abc'::string, 'DEF'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/right.slt b/datafusion/sqllogictest/test_files/spark/string/right.slt
new file mode 100644
index 0000000000000..a65a18b4906ce
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/right.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT right('Spark SQL', 3);
+## PySpark 3.5.5 Result: {'right(Spark SQL, 3)': 'SQL', 'typeof(right(Spark SQL, 3))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(3)': 'int'}
+#query
+#SELECT right('Spark SQL'::string, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/rpad.slt b/datafusion/sqllogictest/test_files/spark/string/rpad.slt
new file mode 100644
index 0000000000000..f0451ade900d0
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/rpad.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT rpad('hi', 1, '??');
+## PySpark 3.5.5 Result: {'rpad(hi, 1, ??)': 'h', 'typeof(rpad(hi, 1, ??))': 'string', 'typeof(hi)': 'string', 'typeof(1)': 'int', 'typeof(??)': 'string'}
+#query
+#SELECT rpad('hi'::string, 1::int, '??'::string);
+
+## Original Query: SELECT rpad('hi', 5);
+## PySpark 3.5.5 Result: {'rpad(hi, 5,  )': 'hi   ', 'typeof(rpad(hi, 5,  ))': 'string', 'typeof(hi)': 'string', 'typeof(5)': 'int'}
+#query
+#SELECT rpad('hi'::string, 5::int);
+
+## Original Query: SELECT rpad('hi', 5, '??');
+## PySpark 3.5.5 Result: {'rpad(hi, 5, ??)': 'hi???', 'typeof(rpad(hi, 5, ??))': 'string', 'typeof(hi)': 'string', 'typeof(5)': 'int', 'typeof(??)': 'string'}
+#query
+#SELECT rpad('hi'::string, 5::int, '??'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/rtrim.slt b/datafusion/sqllogictest/test_files/spark/string/rtrim.slt
new file mode 100644
index 0000000000000..b6d3b10abdb5f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/rtrim.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT rtrim('    SparkSQL   ');
+## PySpark 3.5.5 Result: {'rtrim(    SparkSQL   )': '    SparkSQL', 'typeof(rtrim(    SparkSQL   ))': 'string', 'typeof(    SparkSQL   )': 'string'}
+#query
+#SELECT rtrim('    SparkSQL   '::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/sentences.slt b/datafusion/sqllogictest/test_files/spark/string/sentences.slt
new file mode 100644
index 0000000000000..317dca0e47139
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/sentences.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT sentences('Hi there! Good morning.');
+## PySpark 3.5.5 Result: {'sentences(Hi there! Good morning., , )': [['Hi', 'there'], ['Good', 'morning']], 'typeof(sentences(Hi there! Good morning., , ))': 'array<array<string>>', 'typeof(Hi there! Good morning.)': 'string'}
+#query
+#SELECT sentences('Hi there! Good morning.'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
new file mode 100644
index 0000000000000..f0c46e10fd1de
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT soundex('Miller');
+## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'}
+#query
+#SELECT soundex('Miller'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/space.slt b/datafusion/sqllogictest/test_files/spark/string/space.slt
new file mode 100644
index 0000000000000..388f679c4da73
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/space.slt
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT concat(space(1::INT), 'Spark');
+----
+ Spark
+
+query T
+SELECT concat(space(5::INT), 'Spark');
+----
+     Spark
+
+query T
+SELECT space(0::INT);
+----
+(empty)
+
+query T
+SELECT space(-1::INT);
+----
+(empty)
+
+query T
+SELECT space(NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/string/split_part.slt b/datafusion/sqllogictest/test_files/spark/string/split_part.slt
new file mode 100644
index 0000000000000..0561a03ecf75d
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/split_part.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT split_part('11.12.13', '.', 3);
+## PySpark 3.5.5 Result: {'split_part(11.12.13, ., 3)': '13', 'typeof(split_part(11.12.13, ., 3))': 'string', 'typeof(11.12.13)': 'string', 'typeof(.)': 'string', 'typeof(3)': 'int'}
+#query
+#SELECT split_part('11.12.13'::string, '.'::string, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/startswith.slt b/datafusion/sqllogictest/test_files/spark/string/startswith.slt
new file mode 100644
index 0000000000000..f75f9d080dfac
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/startswith.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT startswith('Spark SQL', 'SQL');
+## PySpark 3.5.5 Result: {'startswith(Spark SQL, SQL)': False, 'typeof(startswith(Spark SQL, SQL))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(SQL)': 'string'}
+#query
+#SELECT startswith('Spark SQL'::string, 'SQL'::string);
+
+## Original Query: SELECT startswith('Spark SQL', 'Spark');
+## PySpark 3.5.5 Result: {'startswith(Spark SQL, Spark)': True, 'typeof(startswith(Spark SQL, Spark))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(Spark)': 'string'}
+#query
+#SELECT startswith('Spark SQL'::string, 'Spark'::string);
+
+## Original Query: SELECT startswith('Spark SQL', null);
+## PySpark 3.5.5 Result: {'startswith(Spark SQL, NULL)': None, 'typeof(startswith(Spark SQL, NULL))': 'boolean', 'typeof(Spark SQL)': 'string', 'typeof(NULL)': 'void'}
+#query
+#SELECT startswith('Spark SQL'::string, NULL::void);
+
+## Original Query: SELECT startswith(x'537061726b2053514c', x'53514c');
+## PySpark 3.5.5 Result: {"startswith(X'537061726B2053514C', X'53514C')": False, "typeof(startswith(X'537061726B2053514C', X'53514C'))": 'boolean', "typeof(X'537061726B2053514C')": 'binary', "typeof(X'53514C')": 'binary'}
+#query
+#SELECT startswith(X'537061726B2053514C'::binary, X'53514C'::binary);
+
+## Original Query: SELECT startswith(x'537061726b2053514c', x'537061726b');
+## PySpark 3.5.5 Result: {"startswith(X'537061726B2053514C', X'537061726B')": True, "typeof(startswith(X'537061726B2053514C', X'537061726B'))": 'boolean', "typeof(X'537061726B2053514C')": 'binary', "typeof(X'537061726B')": 'binary'}
+#query
+#SELECT startswith(X'537061726B2053514C'::binary, X'537061726B'::binary);
diff --git a/datafusion/sqllogictest/test_files/spark/string/substring.slt b/datafusion/sqllogictest/test_files/spark/string/substring.slt
new file mode 100644
index 0000000000000..5bf2fdf2fb954
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/substring.slt
@@ -0,0 +1,203 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query T
+SELECT substring('Spark SQL'::string, 0::int);
+----
+Spark SQL
+
+query T
+SELECT substring('Spark SQL'::string, 5::int);
+----
+k SQL
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, 1::int);
+----
+a
+
+# Test negative start
+query T
+SELECT substring('Spark SQL'::string, -3::int);
+----
+SQL
+
+query T
+SELECT substring('Spark SQL'::string, -3::int, 2::int);
+----
+SQ
+
+# Test length exceeding string length
+query T
+SELECT substring('Spark SQL'::string, 2::int, 700::int);
+----
+park SQL
+
+# Test start position beyond string length
+query T
+SELECT substring('Spark SQL'::string, 30::int);
+----
+(empty)
+
+query T
+SELECT substring('Spark SQL'::string, -30::int);
+----
+Spark SQL
+
+# Test negative length
+query T
+SELECT substring('Spark SQL'::string, 3::int, -1::int);
+----
+(empty)
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, 0::int);
+----
+(empty)
+
+# Test unicode strings
+query T
+SELECT substring('joséésoj'::string, 5::int);
+----
+ésoj
+
+query T
+SELECT substring('joséésoj'::string, 5::int, 2::int);
+----
+és
+
+# NULL handling
+query T
+SELECT substring('Spark SQL'::string, NULL::int);
+----
+NULL
+
+query T
+SELECT substring(NULL::string, 5::int);
+----
+NULL
+
+query T
+SELECT substring(NULL::string, 3::int, 1::int);
+----
+NULL
+
+query T
+SELECT substring('Spark SQL'::string, NULL::int, 1::int);
+----
+NULL
+
+query T
+SELECT substring('Spark SQL'::string, 3::int, NULL::int);
+----
+NULL
+
+query T
+SELECT substring(column1, column2)
+FROM VALUES
+('Spark SQL'::string, 0::int),
+('Spark SQL'::string, 5::int),
+('Spark SQL'::string, -3::int),
+('Spark SQL'::string, 500::int),
+('Spark SQL'::string, -300::int),
+(NULL::string, 5::int),
+('Spark SQL'::string, NULL::int);
+----
+Spark SQL
+k SQL
+SQL
+(empty)
+Spark SQL
+NULL
+NULL
+
+query T
+SELECT substring(column1, column2, column3)
+FROM VALUES
+('Spark SQL'::string, -3::int, 2::int),
+('Spark SQL'::string, 3::int, 1::int),
+('Spark SQL'::string, 3::int, 700::int),
+('Spark SQL'::string, 3::int, -1::int),
+('Spark SQL'::string, 3::int, 0::int),
+('Spark SQL'::string, 300::int, 3::int),
+('Spark SQL'::string, -300::int, 3::int),
+(NULL::string, 3::int, 1::int),
+('Spark SQL'::string, NULL::int, 1::int),
+('Spark SQL'::string, 3::int, NULL::int);
+----
+SQ
+a
+ark SQL
+(empty)
+(empty)
+(empty)
+Spa
+NULL
+NULL
+NULL
+
+# alias substr
+
+query T
+SELECT substr('Spark SQL'::string, 0::int);
+----
+Spark SQL
+
+query T
+SELECT substr(column1, column2)
+FROM VALUES
+('Spark SQL'::string, 0::int),
+('Spark SQL'::string, 5::int),
+('Spark SQL'::string, -3::int),
+('Spark SQL'::string, 500::int),
+('Spark SQL'::string, -300::int),
+(NULL::string, 5::int),
+('Spark SQL'::string, NULL::int);
+----
+Spark SQL
+k SQL
+SQL
+(empty)
+Spark SQL
+NULL
+NULL
+
+query T
+SELECT substr(column1, column2, column3)
+FROM VALUES
+('Spark SQL'::string, -3::int, 2::int),
+('Spark SQL'::string, 3::int, 1::int),
+('Spark SQL'::string, 3::int, 700::int),
+('Spark SQL'::string, 3::int, -1::int),
+('Spark SQL'::string, 3::int, 0::int),
+('Spark SQL'::string, 300::int, 3::int),
+('Spark SQL'::string, -300::int, 3::int),
+(NULL::string, 3::int, 1::int),
+('Spark SQL'::string, NULL::int, 1::int),
+('Spark SQL'::string, 3::int, NULL::int);
+----
+SQ
+a
+ark SQL
+(empty)
+(empty)
+(empty)
+Spa
+NULL
+NULL
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/string/substring_index.slt b/datafusion/sqllogictest/test_files/spark/string/substring_index.slt
new file mode 100644
index 0000000000000..b434d9fa5edc4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/substring_index.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT substring_index('www.apache.org', '.', 2);
+## PySpark 3.5.5 Result: {'substring_index(www.apache.org, ., 2)': 'www.apache', 'typeof(substring_index(www.apache.org, ., 2))': 'string', 'typeof(www.apache.org)': 'string', 'typeof(.)': 'string', 'typeof(2)': 'int'}
+#query
+#SELECT substring_index('www.apache.org'::string, '.'::string, 2::int);
diff --git a/datafusion/sqllogictest/test_files/spark/string/to_binary.slt b/datafusion/sqllogictest/test_files/spark/string/to_binary.slt
new file mode 100644
index 0000000000000..d8efa323f2c52
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/to_binary.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_binary('abc', 'utf-8');
+## PySpark 3.5.5 Result: {'to_binary(abc, utf-8)': bytearray(b'abc'), 'typeof(to_binary(abc, utf-8))': 'binary', 'typeof(abc)': 'string', 'typeof(utf-8)': 'string'}
+#query
+#SELECT to_binary('abc'::string, 'utf-8'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/to_char.slt b/datafusion/sqllogictest/test_files/spark/string/to_char.slt
new file mode 100644
index 0000000000000..88d88bbb8ad9f
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/to_char.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_char(-12454.8, '99G999D9S');
+## PySpark 3.5.5 Result: {'to_char(-12454.8, 99G999D9S)': '12,454.8-', 'typeof(to_char(-12454.8, 99G999D9S))': 'string', 'typeof(-12454.8)': 'decimal(6,1)', 'typeof(99G999D9S)': 'string'}
+#query
+#SELECT to_char(-12454.8::decimal(6,1), '99G999D9S'::string);
+
+## Original Query: SELECT to_char(12454, '99G999');
+## PySpark 3.5.5 Result: {'to_char(12454, 99G999)': '12,454', 'typeof(to_char(12454, 99G999))': 'string', 'typeof(12454)': 'int', 'typeof(99G999)': 'string'}
+#query
+#SELECT to_char(12454::int, '99G999'::string);
+
+## Original Query: SELECT to_char(454, '999');
+## PySpark 3.5.5 Result: {'to_char(454, 999)': '454', 'typeof(to_char(454, 999))': 'string', 'typeof(454)': 'int', 'typeof(999)': 'string'}
+#query
+#SELECT to_char(454::int, '999'::string);
+
+## Original Query: SELECT to_char(454.00, '000D00');
+## PySpark 3.5.5 Result: {'to_char(454.00, 000D00)': '454.00', 'typeof(to_char(454.00, 000D00))': 'string', 'typeof(454.00)': 'decimal(5,2)', 'typeof(000D00)': 'string'}
+#query
+#SELECT to_char(454.00::decimal(5,2), '000D00'::string);
+
+## Original Query: SELECT to_char(78.12, '$99.99');
+## PySpark 3.5.5 Result: {'to_char(78.12, $99.99)': '$78.12', 'typeof(to_char(78.12, $99.99))': 'string', 'typeof(78.12)': 'decimal(4,2)', 'typeof($99.99)': 'string'}
+#query
+#SELECT to_char(78.12::decimal(4,2), '$99.99'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/to_number.slt b/datafusion/sqllogictest/test_files/spark/string/to_number.slt
new file mode 100644
index 0000000000000..ffbee15aca4d2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/to_number.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_number('$78.12', '$99.99');
+## PySpark 3.5.5 Result: {'to_number($78.12, $99.99)': Decimal('78.12'), 'typeof(to_number($78.12, $99.99))': 'decimal(4,2)', 'typeof($78.12)': 'string', 'typeof($99.99)': 'string'}
+#query
+#SELECT to_number('$78.12'::string, '$99.99'::string);
+
+## Original Query: SELECT to_number('12,454', '99,999');
+## PySpark 3.5.5 Result: {'to_number(12,454, 99,999)': Decimal('12454'), 'typeof(to_number(12,454, 99,999))': 'decimal(5,0)', 'typeof(12,454)': 'string', 'typeof(99,999)': 'string'}
+#query
+#SELECT to_number('12,454'::string, '99,999'::string);
+
+## Original Query: SELECT to_number('12,454.8-', '99,999.9S');
+## PySpark 3.5.5 Result: {'to_number(12,454.8-, 99,999.9S)': Decimal('-12454.8'), 'typeof(to_number(12,454.8-, 99,999.9S))': 'decimal(6,1)', 'typeof(12,454.8-)': 'string', 'typeof(99,999.9S)': 'string'}
+#query
+#SELECT to_number('12,454.8-'::string, '99,999.9S'::string);
+
+## Original Query: SELECT to_number('454', '999');
+## PySpark 3.5.5 Result: {'to_number(454, 999)': Decimal('454'), 'typeof(to_number(454, 999))': 'decimal(3,0)', 'typeof(454)': 'string', 'typeof(999)': 'string'}
+#query
+#SELECT to_number('454'::string, '999'::string);
+
+## Original Query: SELECT to_number('454.00', '000.00');
+## PySpark 3.5.5 Result: {'to_number(454.00, 000.00)': Decimal('454.00'), 'typeof(to_number(454.00, 000.00))': 'decimal(5,2)', 'typeof(454.00)': 'string', 'typeof(000.00)': 'string'}
+#query
+#SELECT to_number('454.00'::string, '000.00'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/to_varchar.slt b/datafusion/sqllogictest/test_files/spark/string/to_varchar.slt
new file mode 100644
index 0000000000000..51662b89e5580
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/to_varchar.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT to_varchar(-12454.8, '99G999D9S');
+## PySpark 3.5.5 Result: {'to_char(-12454.8, 99G999D9S)': '12,454.8-', 'typeof(to_char(-12454.8, 99G999D9S))': 'string', 'typeof(-12454.8)': 'decimal(6,1)', 'typeof(99G999D9S)': 'string'}
+#query
+#SELECT to_varchar(-12454.8::decimal(6,1), '99G999D9S'::string);
+
+## Original Query: SELECT to_varchar(12454, '99G999');
+## PySpark 3.5.5 Result: {'to_char(12454, 99G999)': '12,454', 'typeof(to_char(12454, 99G999))': 'string', 'typeof(12454)': 'int', 'typeof(99G999)': 'string'}
+#query
+#SELECT to_varchar(12454::int, '99G999'::string);
+
+## Original Query: SELECT to_varchar(454, '999');
+## PySpark 3.5.5 Result: {'to_char(454, 999)': '454', 'typeof(to_char(454, 999))': 'string', 'typeof(454)': 'int', 'typeof(999)': 'string'}
+#query
+#SELECT to_varchar(454::int, '999'::string);
+
+## Original Query: SELECT to_varchar(454.00, '000D00');
+## PySpark 3.5.5 Result: {'to_char(454.00, 000D00)': '454.00', 'typeof(to_char(454.00, 000D00))': 'string', 'typeof(454.00)': 'decimal(5,2)', 'typeof(000D00)': 'string'}
+#query
+#SELECT to_varchar(454.00::decimal(5,2), '000D00'::string);
+
+## Original Query: SELECT to_varchar(78.12, '$99.99');
+## PySpark 3.5.5 Result: {'to_char(78.12, $99.99)': '$78.12', 'typeof(to_char(78.12, $99.99))': 'string', 'typeof(78.12)': 'decimal(4,2)', 'typeof($99.99)': 'string'}
+#query
+#SELECT to_varchar(78.12::decimal(4,2), '$99.99'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/translate.slt b/datafusion/sqllogictest/test_files/spark/string/translate.slt
new file mode 100644
index 0000000000000..53ea41a7ac31e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/translate.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT translate('AaBbCc', 'abc', '123');
+## PySpark 3.5.5 Result: {'translate(AaBbCc, abc, 123)': 'A1B2C3', 'typeof(translate(AaBbCc, abc, 123))': 'string', 'typeof(AaBbCc)': 'string', 'typeof(abc)': 'string', 'typeof(123)': 'string'}
+#query
+#SELECT translate('AaBbCc'::string, 'abc'::string, '123'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/trim.slt b/datafusion/sqllogictest/test_files/spark/string/trim.slt
new file mode 100644
index 0000000000000..725bab5e69623
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/trim.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT trim('    SparkSQL   ');
+## PySpark 3.5.5 Result: {'trim(    SparkSQL   )': 'SparkSQL', 'typeof(trim(    SparkSQL   ))': 'string', 'typeof(    SparkSQL   )': 'string'}
+#query
+#SELECT trim('    SparkSQL   '::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/try_to_binary.slt b/datafusion/sqllogictest/test_files/spark/string/try_to_binary.slt
new file mode 100644
index 0000000000000..211520be1e48b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/try_to_binary.slt
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_to_binary('abc', 'utf-8');
+## PySpark 3.5.5 Result: {'try_to_binary(abc, utf-8)': bytearray(b'abc'), 'typeof(try_to_binary(abc, utf-8))': 'binary', 'typeof(abc)': 'string', 'typeof(utf-8)': 'string'}
+#query
+#SELECT try_to_binary('abc'::string, 'utf-8'::string);
+
+## Original Query: select try_to_binary('a!', 'base64');
+## PySpark 3.5.5 Result: {'try_to_binary(a!, base64)': None, 'typeof(try_to_binary(a!, base64))': 'binary', 'typeof(a!)': 'string', 'typeof(base64)': 'string'}
+#query
+#SELECT try_to_binary('a!'::string, 'base64'::string);
+
+## Original Query: select try_to_binary('abc', 'invalidFormat');
+## PySpark 3.5.5 Result: {'try_to_binary(abc, invalidFormat)': None, 'typeof(try_to_binary(abc, invalidFormat))': 'binary', 'typeof(abc)': 'string', 'typeof(invalidFormat)': 'string'}
+#query
+#SELECT try_to_binary('abc'::string, 'invalidFormat'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/try_to_number.slt b/datafusion/sqllogictest/test_files/spark/string/try_to_number.slt
new file mode 100644
index 0000000000000..10be9e2180be8
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/try_to_number.slt
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT try_to_number('$78.12', '$99.99');
+## PySpark 3.5.5 Result: {'try_to_number($78.12, $99.99)': Decimal('78.12'), 'typeof(try_to_number($78.12, $99.99))': 'decimal(4,2)', 'typeof($78.12)': 'string', 'typeof($99.99)': 'string'}
+#query
+#SELECT try_to_number('$78.12'::string, '$99.99'::string);
+
+## Original Query: SELECT try_to_number('12,454', '99,999');
+## PySpark 3.5.5 Result: {'try_to_number(12,454, 99,999)': Decimal('12454'), 'typeof(try_to_number(12,454, 99,999))': 'decimal(5,0)', 'typeof(12,454)': 'string', 'typeof(99,999)': 'string'}
+#query
+#SELECT try_to_number('12,454'::string, '99,999'::string);
+
+## Original Query: SELECT try_to_number('12,454.8-', '99,999.9S');
+## PySpark 3.5.5 Result: {'try_to_number(12,454.8-, 99,999.9S)': Decimal('-12454.8'), 'typeof(try_to_number(12,454.8-, 99,999.9S))': 'decimal(6,1)', 'typeof(12,454.8-)': 'string', 'typeof(99,999.9S)': 'string'}
+#query
+#SELECT try_to_number('12,454.8-'::string, '99,999.9S'::string);
+
+## Original Query: SELECT try_to_number('454', '999');
+## PySpark 3.5.5 Result: {'try_to_number(454, 999)': Decimal('454'), 'typeof(try_to_number(454, 999))': 'decimal(3,0)', 'typeof(454)': 'string', 'typeof(999)': 'string'}
+#query
+#SELECT try_to_number('454'::string, '999'::string);
+
+## Original Query: SELECT try_to_number('454.00', '000.00');
+## PySpark 3.5.5 Result: {'try_to_number(454.00, 000.00)': Decimal('454.00'), 'typeof(try_to_number(454.00, 000.00))': 'decimal(5,2)', 'typeof(454.00)': 'string', 'typeof(000.00)': 'string'}
+#query
+#SELECT try_to_number('454.00'::string, '000.00'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/ucase.slt b/datafusion/sqllogictest/test_files/spark/string/ucase.slt
new file mode 100644
index 0000000000000..00860c697399e
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/ucase.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT ucase('SparkSql');
+## PySpark 3.5.5 Result: {'ucase(SparkSql)': 'SPARKSQL', 'typeof(ucase(SparkSql))': 'string', 'typeof(SparkSql)': 'string'}
+#query
+#SELECT ucase('SparkSql'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/string/upper.slt b/datafusion/sqllogictest/test_files/spark/string/upper.slt
new file mode 100644
index 0000000000000..91c92940332a7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/string/upper.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT upper('SparkSql');
+## PySpark 3.5.5 Result: {'upper(SparkSql)': 'SPARKSQL', 'typeof(upper(SparkSql))': 'string', 'typeof(SparkSql)': 'string'}
+#query
+#SELECT upper('SparkSql'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/struct/named_struct.slt b/datafusion/sqllogictest/test_files/spark/struct/named_struct.slt
new file mode 100644
index 0000000000000..83b24f6d041f2
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/struct/named_struct.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT named_struct("a", 1, "b", 2, "c", 3);
+## PySpark 3.5.5 Result: {'named_struct(a, 1, b, 2, c, 3)': Row(a=1, b=2, c=3), 'typeof(named_struct(a, 1, b, 2, c, 3))': 'struct<a:int,b:int,c:int>', 'typeof(a)': 'string', 'typeof(1)': 'int', 'typeof(b)': 'string', 'typeof(2)': 'int', 'typeof(c)': 'string', 'typeof(3)': 'int'}
+#query
+#SELECT named_struct('a'::string, 1::int, 'b'::string, 2::int, 'c'::string, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/struct/struct.slt b/datafusion/sqllogictest/test_files/spark/struct/struct.slt
new file mode 100644
index 0000000000000..fe23e249701f5
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/struct/struct.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT struct(1, 2, 3);
+## PySpark 3.5.5 Result: {'struct(1, 2, 3)': Row(col1=1, col2=2, col3=3), 'typeof(struct(1, 2, 3))': 'struct<col1:int,col2:int,col3:int>', 'typeof(1)': 'int', 'typeof(2)': 'int', 'typeof(3)': 'int'}
+#query
+#SELECT struct(1::int, 2::int, 3::int);
diff --git a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
new file mode 100644
index 0000000000000..7a5051d50e2ce
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt
@@ -0,0 +1,310 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT parse_url('http://spark.apache.org/path?query=1'::string, 'HOST'::string);
+----
+spark.apache.org
+
+query T
+SELECT parse_url('http://spark.apache.org/path?query=1'::string, 'QUERY'::string);
+----
+query=1
+
+query T
+SELECT parse_url('http://spark.apache.org/path?query=1'::string, 'QUERY'::string, 'query'::string);
+----
+1
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'HOST'::string);
+----
+spark.apache.org
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'PATH'::string);
+----
+/path
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'QUERY'::string);
+----
+query=1
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'REF'::string);
+----
+Ref
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'PROTOCOL'::string);
+----
+http
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'FILE'::string);
+----
+/path?query=1
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'AUTHORITY'::string);
+----
+userinfo@spark.apache.org
+
+query T
+SELECT parse_url('http://userinfo@spark.apache.org/path?query=1#Ref'::string, 'USERINFO'::string);
+----
+userinfo
+
+query T
+SELECT parse_url('https://example.com/a?x=1', 'QUERY', 'x');
+----
+1
+
+query T
+SELECT parse_url('https://example.com/a?x=1', 'query', 'x');
+----
+NULL
+
+query T
+SELECT parse_url('www.example.com/path?x=1', 'HOST');
+----
+NULL
+
+query T
+SELECT parse_url('www.example.com/path?x=1', 'host');
+----
+NULL
+
+query T
+SELECT parse_url('https://example.com/?a=1', 'QUERY', 'b');
+----
+NULL
+
+query T
+SELECT parse_url('https://example.com/?a=1', 'query', 'b');
+----
+NULL
+
+query T
+SELECT parse_url('https://example.com/path#frag', 'REF');
+----
+frag
+
+query T
+SELECT parse_url('https://example.com/path#frag', 'ref');
+----
+NULL
+
+query T
+SELECT parse_url('ftp://user:pwd@ftp.example.com:21/files', 'USERINFO');
+----
+user:pwd
+
+query T
+SELECT parse_url('ftp://user:pwd@ftp.example.com:21/files', 'userinfo');
+----
+NULL
+
+query T
+SELECT parse_url('http://[2001:db8::2]:8080/index.html?ok=1', 'HOST');
+----
+[2001:db8::2]
+
+query T
+SELECT parse_url('http://[2001:db8::2]:8080/index.html?ok=1', 'host');
+----
+NULL
+
+query T
+SELECT parse_url('notaurl', 'HOST');
+----
+NULL
+
+query T
+SELECT parse_url('notaurl', 'host');
+----
+NULL
+
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
+query T
+SELECT parse_url('https://example.com', 'PATH');
+----
+(empty)
+
+query T
+SELECT parse_url('https://example.com', 'path');
+----
+NULL
+
+query T
+SELECT parse_url('https://example.com/a/b?x=1&y=2#frag', 'PROTOCOL');
+----
+https
+
+query T
+SELECT parse_url('https://example.com/a/b?x=1&y=2#frag', 'protocol');
+----
+NULL
+
+query T
+SELECT parse_url('https://ex.com/?Tag=ok', 'QUERY', 'tag');
+----
+NULL
+
+query T
+SELECT parse_url('https://ex.com/?Tag=ok', 'query', 'tag');
+----
+NULL
+
+statement error 'parse_url' does not support zero arguments
+SELECT parse_url();
+
+query error DataFusion error: Execution error: The url is invalid: inva lid://spark\.apache\.org/path\?query=1\. Use `try_parse_url` to tolerate invalid URL and return NULL instead\. SQLSTATE: 22P02
+SELECT parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT parse_url(NULL, NULL, NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
new file mode 100644
index 0000000000000..a0e42a16483f3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
@@ -0,0 +1,260 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/blob/b6095cc7fccaf016b47f009ba93b2357dc781a7d/python/pysail/tests/spark/function/test_try_parse_url.txt
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+query T
+SELECT try_parse_url('https://example.com/a?x=1', 'QUERY', 'x');
+----
+1
+
+query T
+SELECT try_parse_url('https://example.com/a?x=1', 'query', 'x');
+----
+NULL
+
+query T
+SELECT try_parse_url('www.example.com/path?x=1', 'HOST');
+----
+NULL
+
+query T
+SELECT try_parse_url('www.example.com/path?x=1', 'host');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://example.com/?a=1', 'QUERY', 'b');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://example.com/?a=1', 'query', 'b');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://example.com/path#frag', 'REF');
+----
+frag
+
+query T
+SELECT try_parse_url('https://example.com/path#frag', 'ref');
+----
+NULL
+
+query T
+SELECT try_parse_url('ftp://user:pwd@ftp.example.com:21/files', 'USERINFO');
+----
+user:pwd
+
+query T
+SELECT try_parse_url('ftp://user:pwd@ftp.example.com:21/files', 'userinfo');
+----
+NULL
+
+query T
+SELECT try_parse_url('http://[2001:db8::2]:8080/index.html?ok=1', 'HOST');
+----
+[2001:db8::2]
+
+query T
+SELECT try_parse_url('http://[2001:db8::2]:8080/index.html?ok=1', 'host');
+----
+NULL
+
+query T
+SELECT try_parse_url('notaurl', 'HOST');
+----
+NULL
+
+query T
+SELECT try_parse_url('notaurl', 'host');
+----
+NULL
+
+# Schemeless URLs: Spark java.net.URI behavior
+# Simple schemeless string
+query T
+SELECT try_parse_url('notaurl', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'FILE');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl', 'PROTOCOL');
+----
+NULL
+
+query T
+SELECT try_parse_url('notaurl', 'QUERY');
+----
+NULL
+
+# Schemeless URL with query string
+query T
+SELECT try_parse_url('notaurl?key=value', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'FILE');
+----
+notaurl?key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY');
+----
+key=value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'QUERY', 'key');
+----
+value
+
+query T
+SELECT try_parse_url('notaurl?key=value', 'HOST');
+----
+NULL
+
+# Schemeless URL with fragment
+query T
+SELECT try_parse_url('notaurl#reference', 'REF');
+----
+reference
+
+query T
+SELECT try_parse_url('notaurl#reference', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl#reference', 'FILE');
+----
+notaurl
+
+# Schemeless URL with both query and fragment
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'PATH');
+----
+notaurl
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY');
+----
+a=1&b=2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
+----
+2
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'REF');
+----
+frag
+
+query T
+SELECT try_parse_url('notaurl?a=1&b=2#frag', 'FILE');
+----
+notaurl?a=1&b=2
+
+query T
+SELECT try_parse_url('https://example.com', 'PATH');
+----
+(empty)
+
+query T
+SELECT try_parse_url('https://example.com', 'path');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://example.com/a/b?x=1&y=2#frag', 'PROTOCOL');
+----
+https
+
+query T
+SELECT try_parse_url('https://example.com/a/b?x=1&y=2#frag', 'protocol');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://ex.com/?Tag=ok', 'QUERY', 'tag');
+----
+NULL
+
+query T
+SELECT try_parse_url('https://ex.com/?Tag=ok', 'query', 'tag');
+----
+NULL
+
+query T
+SELECT try_parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
+----
+NULL
+
+# NULL argument handling (Sail PR #1393)
+# NULL URL should return NULL
+query T
+SELECT try_parse_url(NULL, 'HOST');
+----
+NULL
+
+# NULL part should return NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL);
+----
+NULL
+
+# Both NULL should return NULL
+query T
+SELECT try_parse_url(NULL, NULL);
+----
+NULL
+
+# NULL URL with 3 args
+query T
+SELECT try_parse_url(NULL, 'QUERY', 'key');
+----
+NULL
+
+# NULL part with 3 args
+query T
+SELECT try_parse_url('https://example.com/path?query=1', NULL, 'key');
+----
+NULL
+
+# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
+query T
+SELECT try_parse_url('https://example.com/path?query=1', 'QUERY', NULL);
+----
+NULL
+
+# All three NULL
+query T
+SELECT try_parse_url(NULL, NULL, NULL);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt b/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt
new file mode 100644
index 0000000000000..559c77af97e9a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT try_url_decode('https%3A%2F%2Fspark.apache.org');
+----
+https://spark.apache.org
+
+# Test with LargeUtf8
+query T
+SELECT try_url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'LargeUtf8'));
+----
+https://spark.apache.org
+
+# Test with Utf8View
+query T
+SELECT try_url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'Utf8View'));
+----
+https://spark.apache.org
+
+# Non-ASCII string
+query T
+SELECT try_url_decode('%E4%BD%A0%E5%A5%BD')
+----
+你好
+
+# Empty string
+query T
+SELECT try_url_decode('');
+----
+(empty)
+
+# Null value
+query T
+SELECT try_url_decode(NULL::string);
+----
+NULL
+
+# Roundtrip with url_encode
+query T
+SELECT try_url_decode(url_encode('Spark SQL ~!@#$%^&*()'));
+----
+Spark SQL ~!@#$%^&*()
+
+# Plus replacement
+query T
+SELECT try_url_decode('Spark+SQL%21');
+----
+Spark SQL!
+
+# Handled invalid percent encoding error
+query T
+SELECT try_url_decode('https%3%2F%2Fspark.apache.org'::string);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/url_decode.slt b/datafusion/sqllogictest/test_files/spark/url/url_decode.slt
new file mode 100644
index 0000000000000..61399aa0ef2e7
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/url_decode.slt
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT url_decode('https%3A%2F%2Fspark.apache.org');
+----
+https://spark.apache.org
+
+# Test with LargeUtf8
+query T
+SELECT url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'LargeUtf8'));
+----
+https://spark.apache.org
+
+# Test with Utf8View
+query T
+SELECT url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'Utf8View'));
+----
+https://spark.apache.org
+
+# Non-ASCII string
+query T
+SELECT url_decode('%E4%BD%A0%E5%A5%BD')
+----
+你好
+
+# Empty string
+query T
+SELECT url_decode('');
+----
+(empty)
+
+# Null value
+query T
+SELECT url_decode(NULL::string);
+----
+NULL
+
+# Roundtrip with url_encode
+query T
+SELECT url_decode(url_encode('Spark SQL ~!@#$%^&*()'));
+----
+Spark SQL ~!@#$%^&*()
+
+# Plus replacement
+query T
+SELECT url_decode('Spark+SQL%21');
+----
+Spark SQL!
+
+# Invalid percent encoding case
+query error DataFusion error: Execution error: Invalid percent\-encoding: invalid hex sequence '%3%' at position 5
+SELECT url_decode('https%3%2F%2Fspark.apache.org'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/url/url_encode.slt b/datafusion/sqllogictest/test_files/spark/url/url_encode.slt
new file mode 100644
index 0000000000000..3d7a42f19384b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/url_encode.slt
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT url_encode('https://spark.apache.org');
+----
+https%3A%2F%2Fspark.apache.org
+
+# Test with LargeUtf8
+query T
+SELECT url_encode(arrow_cast('https://spark.apache.org', 'LargeUtf8'));
+----
+https%3A%2F%2Fspark.apache.org
+
+# Test with Utf8View
+query T
+SELECT url_encode(arrow_cast('https://spark.apache.org', 'Utf8View'));
+----
+https%3A%2F%2Fspark.apache.org
diff --git a/datafusion/sqllogictest/test_files/spark/xml/xpath.slt b/datafusion/sqllogictest/test_files/spark/xml/xpath.slt
new file mode 100644
index 0000000000000..d1ff9239216c9
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/xml/xpath.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>','a/b');
+## PySpark 3.5.5 Result: {'xpath(<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>, a/b)': [None, None, None], 'typeof(xpath(<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>, a/b))': 'array<string>', 'typeof(<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>)': 'string', 'typeof(a/b)': 'string'}
+#query
+#SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>'::string, 'a/b'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/xml/xpath_boolean.slt b/datafusion/sqllogictest/test_files/spark/xml/xpath_boolean.slt
new file mode 100644
index 0000000000000..8a5dc693eb893
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/xml/xpath_boolean.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT xpath_boolean('<a><b>1</b></a>','a/b');
+## PySpark 3.5.5 Result: {'xpath_boolean(<a><b>1</b></a>, a/b)': True, 'typeof(xpath_boolean(<a><b>1</b></a>, a/b))': 'boolean', 'typeof(<a><b>1</b></a>)': 'string', 'typeof(a/b)': 'string'}
+#query
+#SELECT xpath_boolean('<a><b>1</b></a>'::string, 'a/b'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/xml/xpath_string.slt b/datafusion/sqllogictest/test_files/spark/xml/xpath_string.slt
new file mode 100644
index 0000000000000..cfabf467edfaa
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/xml/xpath_string.slt
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file was originally created by a porting script from:
+#   https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
+# This file is part of the implementation of the datafusion-spark function library.
+# For more information, please see:
+#   https://github.com/apache/datafusion/issues/15914
+
+## Original Query: SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c');
+## PySpark 3.5.5 Result: {'xpath_string(<a><b>b</b><c>cc</c></a>, a/c)': 'cc', 'typeof(xpath_string(<a><b>b</b><c>cc</c></a>, a/c))': 'string', 'typeof(<a><b>b</b><c>cc</c></a>)': 'string', 'typeof(a/c)': 'string'}
+#query
+#SELECT xpath_string('<a><b>b</b><c>cc</c></a>'::string, 'a/c'::string);
diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt
index 79b783f89a614..a07eab3357141 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -132,10 +132,10 @@ SELECT substr('Hello🌏世界', 5, 3)
 ----
 o🌏世
 
-statement error The first argument of the substr function can only be a string, but got Int64
+statement error Function 'substr' failed to match any signature
 SELECT substr(1, 3)
 
-statement error The first argument of the substr function can only be a string, but got Int64
+statement error Function 'substr' failed to match any signature
 SELECT substr(1, 3, 4)
 
 statement error Execution error: negative substring length not allowed
@@ -207,6 +207,25 @@ SELECT ends_with('foobar', 'foo')
 ----
 false
 
+query B
+SELECT ends_with(a, '%bar') from (values ('foobar'), ('foo%bar')) as t(a);
+----
+false
+true
+
+query B
+SELECT ends_with(a, '_bar') from (values ('foobar'), ('foo_bar')) as t(a);
+----
+false
+true
+
+query B
+SELECT ends_with(a, '\_bar') from (values ('foobar'), ('foo\\bar'), ('foo\_bar')) as t(a);
+----
+false
+false
+true
+
 query I
 SELECT levenshtein('kitten', 'sitting')
 ----
@@ -303,6 +322,26 @@ SELECT regexp_replace(arrow_cast('foobar', 'Dictionary(Int32, Utf8)'), 'bar', 'x
 ----
 fooxx
 
+query T
+SELECT regexp_replace(arrow_cast('foobar', 'LargeUtf8'), 'bar', 'xx', 'gi')
+----
+fooxx
+
+query T
+SELECT regexp_replace(arrow_cast('foobar', 'Utf8View'), 'bar', 'xx', 'gi')
+----
+fooxx
+
+query T
+SELECT regexp_replace('foobar', arrow_cast('bar', 'LargeUtf8'), 'xx', 'gi')
+----
+fooxx
+
+query T
+SELECT regexp_replace('foobar', arrow_cast('bar', 'Utf8View'), 'xx', 'gi')
+----
+fooxx
+
 query T
 SELECT repeat('foo', 3)
 ----
@@ -350,6 +389,21 @@ SELECT reverse(arrow_cast('abcde', 'Utf8View'))
 ----
 edcba
 
+query T
+SELECT arrow_typeof(reverse('abcde'))
+----
+Utf8
+
+query T
+SELECT arrow_typeof(reverse(arrow_cast('abcde', 'LargeUtf8')))
+----
+LargeUtf8
+
+query T
+SELECT arrow_typeof(reverse(arrow_cast('abcde', 'Utf8View')))
+----
+Utf8View
+
 query T
 SELECT reverse(arrow_cast('abcde', 'Dictionary(Int32, Utf8)'))
 ----
@@ -826,6 +880,26 @@ SELECT starts_with('foobar', 'bar')
 ----
 false
 
+
+query B
+SELECT starts_with(a, 'foo%') from (values ('foobar'), ('foo%bar')) as t(a);
+----
+false
+true
+
+query B
+SELECT starts_with(a, 'foo\_') from (values ('foobar'), ('foo\\_bar'), ('foo\_bar')) as t(a);
+----
+false
+false
+true
+
+query B
+SELECT starts_with(a, 'foo_') from (values ('foobar'), ('foo_bar')) as t(a);
+----
+false
+true
+
 query TT
 select '   ', '|'
 ----
@@ -1709,3 +1783,25 @@ SELECT
 ;
 ----
 48 176 32 40
+
+# translate preserves input string type
+
+query T
+SELECT translate(arrow_cast('12345', 'Utf8View'), '143', 'ax')
+----
+a2x5
+
+query T
+SELECT arrow_typeof(translate('12345', '143', 'ax'))
+----
+Utf8
+
+query T
+SELECT arrow_typeof(translate(arrow_cast('12345', 'LargeUtf8'), '143', 'ax'))
+----
+LargeUtf8
+
+query T
+SELECT arrow_typeof(translate(arrow_cast('12345', 'Utf8View'), '143', 'ax'))
+----
+Utf8View
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index 6268c4ccdb1a5..2884c3518610d 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -528,7 +528,7 @@ FROM test_basic_operator;
 Andrew
 Xiangpeng
 Raphael
-under_scrre
+under_scare
 percent
 (empty)
 (empty)
@@ -542,10 +542,10 @@ SELECT
   TRANSLATE(unicode_1, 'foo', 'bar') as c
 FROM test_basic_operator;
 ----
-databusirn📊🔥
-databusirn数据融合
-databusirnДатаФусион
-un iść crre
+databusian📊🔥
+databusian数据融合
+databusianДатаФусион
+un iść care
 pan Tadeusz ma iść w kąt
 (empty)
 (empty)
@@ -993,25 +993,27 @@ NULL NULL NULL NULL
 # Test FIND_IN_SET
 # --------------------------------------
 
-query IIII
+query IIIIII
 SELECT
   FIND_IN_SET(ascii_1, 'a,b,c,d'),
   FIND_IN_SET(ascii_1, 'Andrew,Xiangpeng,Raphael'),
   FIND_IN_SET(unicode_1, 'a,b,c,d'),
-  FIND_IN_SET(unicode_1, 'datafusion📊🔥,datafusion数据融合,datafusionДатаФусион')
+  FIND_IN_SET(unicode_1, 'datafusion📊🔥,datafusion数据融合,datafusionДатаФусион'),
+  FIND_IN_SET(NULL, unicode_1),
+  FIND_IN_SET(unicode_1, NULL)
 FROM test_basic_operator;
 ----
-0 1 0 1
-0 2 0 2
-0 3 0 3
-0 0 0 0
-0 0 0 0
-0 0 0 0
-0 0 0 0
-0 0 0 0
-0 0 0 0
-NULL NULL NULL NULL
-NULL NULL NULL NULL
+0 1 0 1 NULL NULL
+0 2 0 2 NULL NULL
+0 3 0 3 NULL NULL
+0 0 0 0 NULL NULL
+0 0 0 0 NULL NULL
+0 0 0 0 NULL NULL
+0 0 0 0 NULL NULL
+0 0 0 0 NULL NULL
+0 0 0 0 NULL NULL
+NULL NULL NULL NULL NULL NULL
+NULL NULL NULL NULL NULL NULL
 
 # --------------------------------------
 # Test || operator
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt
index a72c8f5744849..4639103f9e8e3 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -370,7 +370,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6
+01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f\_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f\_o%") AS c5, test.column1_utf8view LIKE Utf8View("f\_o%") AS c6
 02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]
 
 ## Test STARTS_WITH works with column arguments
@@ -642,7 +642,17 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: concat_ws(Utf8(", "), test.column1_utf8view, test.column2_utf8view) AS c
+01)Projection: concat_ws(Utf8View(", "), test.column1_utf8view, test.column2_utf8view) AS c
+02)--TableScan: test projection=[column1_utf8view, column2_utf8view]
+
+## Ensure CONCAT_WS simplification preserves Utf8View for merged literals
+query TT
+EXPLAIN SELECT
+  concat_ws(', ', column1_utf8view, 'foo', 'bar', column2_utf8view) as c
+FROM test;
+----
+logical_plan
+01)Projection: concat_ws(Utf8View(", "), test.column1_utf8view, Utf8View("foo, bar"), test.column2_utf8view) AS c
 02)--TableScan: test projection=[column1_utf8view, column2_utf8view]
 
 ## Ensure no casts for CONTAINS
@@ -784,7 +794,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: regexp_like(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k
+01)Projection: test.column1_utf8view ~ Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k
 02)--TableScan: test projection=[column1_utf8view]
 
 ## Ensure no casts for REGEXP_MATCH
@@ -804,7 +814,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k
+01)Projection: regexp_replace(test.column1_utf8view, Utf8View("^https?://(?:www\.)?([^/]+)/.*$"), Utf8View("\1")) AS k
 02)--TableScan: test projection=[column1_utf8view]
 
 ## Ensure no casts for REPEAT
@@ -988,7 +998,7 @@ query TT
 EXPLAIN SELECT NVL(column1_utf8view, 'a') as c2 FROM test;
 ----
 logical_plan
-01)Projection: nvl(test.column1_utf8view, Utf8View("a")) AS c2
+01)Projection: CASE WHEN test.column1_utf8view IS NOT NULL THEN test.column1_utf8view ELSE Utf8View("a") END AS c2
 02)--TableScan: test projection=[column1_utf8view]
 
 ## Ensure no casts for nullif
@@ -1100,7 +1110,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
+01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1
 02)--TableScan: test projection=[column1_utf8view]
 
 # `~*` operator (regex match case-insensitive)
diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt
index 46e15a4d6d10d..53a1bb4ec6751 100644
--- a/datafusion/sqllogictest/test_files/struct.slt
+++ b/datafusion/sqllogictest/test_files/struct.slt
@@ -38,9 +38,9 @@ CREATE TABLE struct_values (
     s1 struct<INT>,
     s2 struct<a INT,b VARCHAR>
 ) AS VALUES
-  (struct(1), struct(1, 'string1')),
-  (struct(2), struct(2, 'string2')),
-  (struct(3), struct(3, 'string3'))
+  (struct(1), struct(1 AS a, 'string1' AS b)),
+  (struct(2), struct(2 AS a, 'string2' AS b)),
+  (struct(3), struct(3 AS a, 'string3' AS b))
 ;
 
 query ??
@@ -53,9 +53,9 @@ select * from struct_values;
 query TT
 select arrow_typeof(s1), arrow_typeof(s2) from struct_values;
 ----
-Struct(c0 Int32) Struct(a Int32, b Utf8View)
-Struct(c0 Int32) Struct(a Int32, b Utf8View)
-Struct(c0 Int32) Struct(a Int32, b Utf8View)
+Struct("c0": Int32) Struct("a": Int32, "b": Utf8View)
+Struct("c0": Int32) Struct("a": Int32, "b": Utf8View)
+Struct("c0": Int32) Struct("a": Int32, "b": Utf8View)
 
 
 # struct[i]
@@ -229,12 +229,12 @@ select named_struct('field_a', 1, 'field_b', 2);
 query T
 select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3));
 ----
-Struct(first Int64, second Int64, third Int64)
+Struct("first": Int64, "second": Int64, "third": Int64)
 
 query T
 select arrow_typeof({'first': 1, 'second': 2, 'third': 3});
 ----
-Struct(first Int64, second Int64, third Int64)
+Struct("first": Int64, "second": Int64, "third": Int64)
 
 # test nested struct literal
 query ?
@@ -271,12 +271,33 @@ select a from values where (a, c) = (1, 'a');
 ----
 1
 
+query I
+select a from values as v where (v.a, v.c) = (1, 'a');
+----
+1
+
+query I
+select a from values as v where (v.a, v.c) != (1, 'a');
+----
+2
+3
+
+query I
+select a from values as v where (v.a, v.c) = (1, 'b');
+----
+
 query I
 select a from values where (a, c) IN ((1, 'a'), (2, 'b'));
 ----
 1
 2
 
+query I
+select a from values as v where (v.a, v.c) IN ((1, 'a'), (2, 'b'));
+----
+1
+2
+
 statement ok
 drop table values;
 
@@ -376,7 +397,8 @@ drop view complex_view;
 
 # struct with different keys r1 and r2 is not valid
 statement ok
-create table t(a struct<r1 varchar, c int>, b struct<r2 varchar, c float>) as values (struct('red', 1), struct('blue', 2.3));
+create table t(a struct<r1 varchar, c int>, b struct<r2 varchar, c float>) as values
+  (struct('red' AS r1, 1 AS c), struct('blue' AS r2, 2.3 AS c));
 
 # Expect same keys for struct type but got mismatched pair r1,c and r2,c
 query error
@@ -387,12 +409,13 @@ drop table t;
 
 # struct with the same key
 statement ok
-create table t(a struct<r varchar, c int>, b struct<r varchar, c float>) as values (struct('red', 1), struct('blue', 2.3));
+create table t(a struct<r varchar, c int>, b struct<r varchar, c float>) as values
+  (struct('red' AS r, 1 AS c), struct('blue' AS r, 2.3 AS c));
 
 query T
 select arrow_typeof([a, b]) from t;
 ----
-List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(Struct("r": Utf8View, "c": Float32))
 
 query ?
 select [a, b] from t;
@@ -421,9 +444,9 @@ CREATE TABLE struct_values (
     s1 struct(a int, b varchar),
     s2 struct(a int, b varchar)
 ) AS VALUES
-  (row(1, 'red'), row(1, 'string1')),
-  (row(2, 'blue'), row(2, 'string2')),
-  (row(3, 'green'), row(3, 'string3'))
+  ({a: 1, b: 'red'}, {a: 1, b: 'string1'}),
+  ({a: 2, b: 'blue'}, {a: 2, b: 'string2'}),
+  ({a: 3, b: 'green'}, {a: 3, b: 'string3'})
 ;
 
 statement ok
@@ -431,8 +454,8 @@ drop table struct_values;
 
 statement ok
 create table t (c1 struct(r varchar, b int), c2 struct(r varchar, b float)) as values (
-    row('red', 2),
-    row('blue', 2.3)
+    {r: 'red', b: 2},
+    {r: 'blue', b: 2.3}
 );
 
 query ??
@@ -443,12 +466,12 @@ select * from t;
 query T
 select arrow_typeof(c1) from t;
 ----
-Struct(r Utf8View, b Int32)
+Struct("r": Utf8View, "b": Int32)
 
 query T
 select arrow_typeof(c2) from t;
 ----
-Struct(r Utf8View, b Float32)
+Struct("r": Utf8View, "b": Float32)
 
 statement ok
 drop table t;
@@ -465,15 +488,12 @@ select * from t;
 query T
 select arrow_typeof(column1) from t;
 ----
-Struct(r Utf8, c Float64)
-Struct(r Utf8, c Float64)
+Struct("r": Utf8, "c": Float64)
+Struct("r": Utf8, "c": Float64)
 
 statement ok
 drop table t;
 
-query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Float64 type
-create table t as values({r: 'a', c: 1}), ({c: 2.3, r: 'b'});
-
 ##################################
 ## Test Coalesce with Struct
 ##################################
@@ -483,9 +503,9 @@ CREATE TABLE t (
     s1 struct(a int, b varchar),
     s2 struct(a float, b varchar)
 ) AS VALUES
-  (row(1, 'red'), row(1.1, 'string1')),
-  (row(2, 'blue'), row(2.2, 'string2')),
-  (row(3, 'green'), row(33.2, 'string3'))
+  ({a: 1, b: 'red'}, {a: 1.1, b: 'string1'}),
+  ({a: 2, b: 'blue'}, {a: 2.2, b: 'string2'}),
+  ({a: 3, b: 'green'}, {a: 33.2, b: 'string3'})
 ;
 
 query ?
@@ -498,9 +518,9 @@ select coalesce(s1) from t;
 query T
 select arrow_typeof(coalesce(s1, s2)) from t;
 ----
-Struct(a Float32, b Utf8View)
-Struct(a Float32, b Utf8View)
-Struct(a Float32, b Utf8View)
+Struct("a": Float32, "b": Utf8View)
+Struct("a": Float32, "b": Utf8View)
+Struct("a": Float32, "b": Utf8View)
 
 statement ok
 drop table t;
@@ -510,9 +530,9 @@ CREATE TABLE t (
     s1 struct(a int, b varchar),
     s2 struct(a float, b varchar)
 ) AS VALUES
-  (row(1, 'red'), row(1.1, 'string1')),
-  (null, row(2.2, 'string2')),
-  (row(3, 'green'), row(33.2, 'string3'))
+  ({a: 1, b: 'red'}, {a: 1.1, b: 'string1'}),
+  (null, {a: 2.2, b: 'string2'}),
+  ({a: 3, b: 'green'}, {a: 33.2, b: 'string3'})
 ;
 
 query ?
@@ -525,23 +545,19 @@ select coalesce(s1, s2) from t;
 query T
 select arrow_typeof(coalesce(s1, s2)) from t;
 ----
-Struct(a Float32, b Utf8View)
-Struct(a Float32, b Utf8View)
-Struct(a Float32, b Utf8View)
+Struct("a": Float32, "b": Utf8View)
+Struct("a": Float32, "b": Utf8View)
+Struct("a": Float32, "b": Utf8View)
 
 statement ok
 drop table t;
 
-# row() with incorrect order
-statement error DataFusion error: Arrow error: Cast error: Cannot cast string 'blue' to value of Float32 type
-create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values 
-    (row('red', 1), row(2.3, 'blue')),
-    (row('purple', 1), row('green', 2.3));
+# row() with incorrect order - row() is positional, not name-based
+statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'blue' to value of Float32 type
+create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values
+  ({r: 'red', c: 1}, {r: 2.3, c: 'blue'}),
+  ({r: 'purple', c: 1}, {r: 'green', c: 2.3});
 
-# out of order struct literal
-# TODO: This query should not fail
-statement error DataFusion error: Arrow error: Cast error: Cannot cast string 'b' to value of Int32 type
-create table t(a struct(r varchar, c int)) as values ({r: 'a', c: 1}), ({c: 2, r: 'b'});
 
 ##################################
 ## Test Array of Struct
@@ -552,46 +568,33 @@ select [{r: 'a', c: 1}, {r: 'b', c: 2}];
 ----
 [{r: a, c: 1}, {r: b, c: 2}]
 
-# Can't create a list of struct with different field types
-query error
-select [{r: 'a', c: 1}, {c: 2, r: 'b'}];
 
 statement ok
-create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values (row('a', 1), row('b', 2.3));
+create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values ({r: 'a', c: 1}, {r: 'b', c: 2.3});
 
 query T
 select arrow_typeof([a, b]) from t;
 ----
-List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })
+List(Struct("r": Utf8View, "c": Float32))
 
 statement ok
 drop table t;
 
-# create table with different struct type is fine
-statement ok
-create table t(a struct(r varchar, c int), b struct(c float, r varchar)) as values (row('a', 1), row(2.3, 'b'));
-
-# create array with different struct type is not valid
-query error
-select arrow_typeof([a, b]) from t;
-
-statement ok
-drop table t;
 
 statement ok
-create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values (row('a', 1, 2.3), row('b', 2.3, 2));
+create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values ({r: 'a', c: 1, g: 2.3}, {r: 'b', c: 2.3, g: 2});
 
-# type of each column should not coerced but perserve as it is
+# type of each column should not coerced but preserve as it is
 query T
 select arrow_typeof(a) from t;
 ----
-Struct(r Utf8View, c Int32, g Float32)
+Struct("r": Utf8View, "c": Int32, "g": Float32)
 
-# type of each column should not coerced but perserve as it is
+# type of each column should not coerced but preserve as it is
 query T
 select arrow_typeof(b) from t;
 ----
-Struct(r Utf8View, c Float32, g Int32)
+Struct("r": Utf8View, "c": Float32, "g": Int32)
 
 statement ok
 drop table t;
@@ -601,7 +604,7 @@ drop table t;
 # This tests accessing struct fields using the subscript notation with string literals
 
 statement ok
-create table test (struct_field struct(substruct int)) as values (struct(1));
+create table test (struct_field struct(substruct int)) as values ({substruct: 1});
 
 query ??
 select *
@@ -614,7 +617,7 @@ statement ok
 DROP TABLE test;
 
 statement ok
-create table test (struct_field struct(substruct struct(subsubstruct int))) as values (struct(struct(1)));
+create table test (struct_field struct(substruct struct(subsubstruct int))) as values ({substruct: {subsubstruct: 1}});
 
 query ??
 select *
@@ -632,3 +635,1035 @@ test1.struct_field['substruct']['subsubstruct'] = test2.struct_field['substruct'
 
 statement ok
 drop table test;
+
+# Test nested get_field with multiple arguments
+query I
+select get_field({'a': {'b': 1}}, 'a', 'b');
+----
+1
+
+# Test nested get_field with three levels
+query I
+select get_field({'a': {'b': {'c': 42}}}, 'a', 'b', 'c');
+----
+42
+
+# Test type validation error - accessing field on non-struct
+query error Cannot access field at argument 2: type Int64 is not Struct, Map, or Null
+select get_field({'a': 1}, 'a', 'b');
+
+# Test that bracket syntax produces a single get_field call (not nested)
+# We use a table column to prevent constant folding
+statement ok
+create table explain_test (s struct(a struct(b int))) as values ({'a': {'b': 1}});
+
+query TT
+explain select s['a']['b'] from explain_test;
+----
+logical_plan
+01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b"))
+02)--TableScan: explain_test projection=[s]
+physical_plan
+01)ProjectionExec: expr=[get_field(s@0, a, b) as explain_test.s[a][b]]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement ok
+drop table explain_test;
+
+# Test with nested struct in table
+statement ok
+create table nested_test (s struct(inner struct(val int))) as values ({'inner': {'val': 100}});
+
+query I
+select s['inner']['val'] from nested_test;
+----
+100
+
+query I
+select get_field(s, 'inner', 'val') from nested_test;
+----
+100
+
+statement ok
+drop table nested_test;
+
+# Test mixed struct and map access
+query I
+select get_field({'m': map(['k'], [42])}, 'm', 'k');
+----
+42
+
+# Test nested map access
+query I
+select {'m': map(['outer'], [map(['inner'], [99])])}['m']['outer']['inner'];
+----
+99
+
+###############################################
+# Additional tests for nested get_field support
+###############################################
+
+# Backwards compatibility: original 2-argument form still works
+query I
+select get_field({'a': 1}, 'a');
+----
+1
+
+# Error: get_field with no arguments
+query error get_field requires at least 2 arguments, got 0
+select get_field();
+
+# Error: get_field with only 1 argument
+query error get_field requires at least 2 arguments, got 1
+select get_field({'a': 1});
+
+# Null handling: null at base should return null
+query I
+select get_field(CAST(NULL AS STRUCT(a STRUCT(b INT))), 'a', 'b');
+----
+NULL
+
+# Null handling: null in middle of chain
+statement ok
+create table null_mid_test (s STRUCT(a STRUCT(b INT)));
+
+statement ok
+insert into null_mid_test values ({'a': NULL});
+
+query I
+select s['a']['b'] from null_mid_test;
+----
+NULL
+
+query I
+select get_field(s, 'a', 'b') from null_mid_test;
+----
+NULL
+
+statement ok
+drop table null_mid_test;
+
+# Type validation error at argument 3
+query error Cannot access field at argument 3: type Int64 is not Struct, Map, or Null
+select get_field({'a': {'b': 2}}, 'a', 'b', 'c');
+
+# Type validation error at argument 4
+query error Cannot access field at argument 4: type Int64 is not Struct, Map, or Null
+select get_field({'a': {'b': {'c': 3}}}, 'a', 'b', 'c', 'd');
+
+# Non-existent field at first level
+query error Field x not found in struct
+select get_field({'a': 1}, 'x');
+
+# Non-existent field at second level
+query error Field x not found in struct
+select get_field({'a': {'b': 1}}, 'a', 'x');
+
+# Deep nesting: 5-level access
+query I
+select get_field({'l1': {'l2': {'l3': {'l4': {'l5': 42}}}}}, 'l1', 'l2', 'l3', 'l4', 'l5');
+----
+42
+
+# Deep nesting: 5-level access via bracket syntax
+query I
+select {'l1': {'l2': {'l3': {'l4': {'l5': 99}}}}}['l1']['l2']['l3']['l4']['l5'];
+----
+99
+
+# Mixed array and struct access: array index should break the batching
+statement ok
+create table mixed_access_test (data STRUCT(items STRUCT(name VARCHAR)[]) );
+
+statement ok
+insert into mixed_access_test values ({'items': [{'name': 'first'}, {'name': 'second'}]});
+
+query T
+select data['items'][1]['name'] from mixed_access_test;
+----
+first
+
+query T
+select data['items'][2]['name'] from mixed_access_test;
+----
+second
+
+statement ok
+drop table mixed_access_test;
+
+# Nullable parent propagation: null parent should propagate
+statement ok
+create table nullable_parent_test (s STRUCT(a STRUCT(b INT)));
+
+statement ok
+insert into nullable_parent_test values ({'a': {'b': 1}}), (NULL);
+
+query I
+select s['a']['b'] from nullable_parent_test;
+----
+1
+NULL
+
+statement ok
+drop table nullable_parent_test;
+
+# Test struct casting with field reordering - string fields
+query ?
+SELECT CAST({b: 'b_value', a: 'a_value'} AS STRUCT(a VARCHAR, b VARCHAR));
+----
+{a: a_value, b: b_value}
+
+# Test struct casting with field reordering - integer fields
+query ?
+SELECT CAST({b: 3, a: 4} AS STRUCT(a INT, b INT));
+----
+{a: 4, b: 3}
+
+# Test with type casting AND field reordering
+query ?
+SELECT CAST({b: 3, a: 4} AS STRUCT(a BIGINT, b INT));
+----
+{a: 4, b: 3}
+
+# Test casting with explicit field names
+query ?
+SELECT CAST({a: 1, b: 'x'} AS STRUCT(a INT, b VARCHAR));
+----
+{a: 1, b: x}
+
+# Test with missing field - should insert nulls
+query ?
+SELECT CAST({a: 1} AS STRUCT(a INT, b INT));
+----
+{a: 1, b: NULL}
+
+# Test with extra source field - should be ignored
+query ?
+SELECT CAST({a: 1, b: 2, extra: 3} AS STRUCT(a INT, b INT));
+----
+{a: 1, b: 2}
+
+# Test no overlap with mismatched field count - should fail because no field names match
+statement error DataFusion error: (Plan error|Error during planning|This feature is not implemented): (Cannot cast struct: at least one field name must match between source and target|Cannot cast struct with 3 fields to 2 fields without name overlap|Unsupported CAST from Struct)
+SELECT CAST(struct(1, 'x', 'y') AS STRUCT(a INT, b VARCHAR));
+
+# Test nested struct with field reordering
+query ?
+SELECT CAST(
+  {inner: {y: 2, x: 1}}
+  AS STRUCT(inner STRUCT(x INT, y INT))
+);
+----
+{inner: {x: 1, y: 2}}
+
+# Test field reordering with table data
+statement ok
+CREATE TABLE struct_reorder_test (
+  data STRUCT(b INT, a VARCHAR)
+) AS VALUES
+  ({b: 100, a: 'first'}),
+  ({b: 200, a: 'second'}),
+  ({b: 300, a: 'third'})
+;
+
+query ?
+SELECT CAST(data AS STRUCT(a VARCHAR, b INT)) AS casted_data FROM struct_reorder_test ORDER BY data['b'];
+----
+{a: first, b: 100}
+{a: second, b: 200}
+{a: third, b: 300}
+
+statement ok
+drop table struct_reorder_test;
+
+# Test casting struct with multiple levels of nesting and reordering
+query ?
+SELECT CAST(
+  {level1: {z: 100, y: 'inner', x: 1}}
+  AS STRUCT(level1 STRUCT(x INT, y VARCHAR, z INT))
+);
+----
+{level1: {x: 1, y: inner, z: 100}}
+
+# Test field reordering with nulls in source
+query ?
+SELECT CAST(
+  {b: NULL::INT, a: 42}
+  AS STRUCT(a INT, b INT)
+);
+----
+{a: 42, b: NULL}
+
+# Test casting preserves struct-level nulls
+query ?
+SELECT CAST(NULL::STRUCT(b INT, a INT) AS STRUCT(a INT, b INT));
+----
+NULL
+
+############################
+# Implicit Coercion Tests with CREATE TABLE AS VALUES
+############################
+
+# Test implicit coercion with same field order, different types
+statement ok
+create table t as values({r: 'a', c: 1}), ({r: 'b', c: 2.3});
+
+query T
+select arrow_typeof(column1) from t limit 1;
+----
+Struct("r": Utf8, "c": Float64)
+
+query ?
+select * from t order by column1.r;
+----
+{r: a, c: 1.0}
+{r: b, c: 2.3}
+
+statement ok
+drop table t;
+
+# Test implicit coercion with nullable fields (same order)
+statement ok
+create table t as values({a: 1, b: 'x'}), ({a: 2, b: 'y'});
+
+query T
+select arrow_typeof(column1) from t limit 1;
+----
+Struct("a": Int64, "b": Utf8)
+
+query ?
+select * from t order by column1.a;
+----
+{a: 1, b: x}
+{a: 2, b: y}
+
+statement ok
+drop table t;
+
+# Test implicit coercion with nested structs (same field order)
+statement ok
+create table t as 
+  select {outer: {x: 1, y: 2}} as column1
+  union all
+  select {outer: {x: 3, y: 4}};
+
+query T
+select arrow_typeof(column1) from t limit 1;
+----
+Struct("outer": Struct("x": Int64, "y": Int64))
+
+query ?
+select column1 from t order by column1.outer.x;
+----
+{outer: {x: 1, y: 2}}
+{outer: {x: 3, y: 4}}
+
+statement ok
+drop table t;
+
+# Test implicit coercion with type widening (Int32 -> Int64)
+statement ok
+create table t as values({id: 1, val: 100}), ({id: 2, val: 9223372036854775807});
+
+query T
+select arrow_typeof(column1) from t limit 1;
+----
+Struct("id": Int64, "val": Int64)
+
+query ?
+select * from t order by column1.id;
+----
+{id: 1, val: 100}
+{id: 2, val: 9223372036854775807}
+
+statement ok
+drop table t;
+
+# Test implicit coercion with nested struct and type coercion
+statement ok
+create table t as 
+  select {name: 'Alice', data: {score: 100, active: true}} as column1
+  union all
+  select {name: 'Bob', data: {score: 200, active: false}};
+
+query T
+select arrow_typeof(column1) from t limit 1;
+----
+Struct("name": Utf8, "data": Struct("score": Int64, "active": Boolean))
+
+query ?
+select column1 from t order by column1.name;
+----
+{name: Alice, data: {score: 100, active: true}}
+{name: Bob, data: {score: 200, active: false}}
+
+statement ok
+drop table t;
+
+############################
+# Field Reordering Tests (using explicit CAST)
+############################
+
+# Test explicit cast with field reordering in VALUES - basic case
+query ?
+select CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT));
+----
+{r: b, c: 2.3}
+
+# Test explicit cast with field reordering - multiple rows
+query ?
+select * from (values 
+  (CAST({c: 1, r: 'a'} AS STRUCT(r VARCHAR, c FLOAT))),
+  (CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT)))
+) order by column1.r;
+----
+{r: a, c: 1.0}
+{r: b, c: 2.3}
+
+# Test table with explicit cast for field reordering
+statement ok
+create table t as select CAST({c: 1, r: 'a'} AS STRUCT(r VARCHAR, c FLOAT)) as s
+union all
+select CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT));
+
+query T
+select arrow_typeof(s) from t limit 1;
+----
+Struct("r": Utf8View, "c": Float32)
+
+query ?
+select * from t order by s.r;
+----
+{r: a, c: 1.0}
+{r: b, c: 2.3}
+
+statement ok
+drop table t;
+
+# Test field reordering with nullable fields using CAST
+query ?
+select CAST({b: NULL, a: 42} AS STRUCT(a INT, b INT));
+----
+{a: 42, b: NULL}
+
+# Test field reordering with nested structs using CAST
+query ?
+select CAST({outer: {y: 4, x: 3}} AS STRUCT(outer STRUCT(x INT, y INT)));
+----
+{outer: {x: 3, y: 4}}
+
+# Test complex nested field reordering
+query ?
+select CAST(
+  {data: {active: false, score: 200}, name: 'Bob'}
+  AS STRUCT(name VARCHAR, data STRUCT(score INT, active BOOLEAN))
+);
+----
+{name: Bob, data: {score: 200, active: false}}
+
+############################
+# Array Literal Tests with Struct Field Reordering (Implicit Coercion)
+############################
+
+# Test array literal with reordered struct fields - implicit coercion by name
+# Field order in unified schema is determined during type coercion
+query ?
+select [{r: 'a', c: 1}, {c: 2.3, r: 'b'}];
+----
+[{c: 1.0, r: a}, {c: 2.3, r: b}]
+
+# Test array literal with same-named fields but different order
+# Fields are reordered during coercion
+query ?
+select [{a: 1, b: 2}, {b: 3, a: 4}];
+----
+[{b: 2, a: 1}, {b: 3, a: 4}]
+
+# Test array literal with explicit cast to unify struct schemas with partial overlap
+# Use CAST to explicitly unify schemas when fields don't match completely
+query ?
+select [
+  CAST({a: 1, b: 2} AS STRUCT(a INT, b INT, c INT)),
+  CAST({b: 3, c: 4} AS STRUCT(a INT, b INT, c INT))
+];
+----
+[{a: 1, b: 2, c: NULL}, {a: NULL, b: 3, c: 4}]
+
+# Test NULL handling in array literals with reordered but matching fields
+query ?
+select [{a: NULL, b: 1}, {b: 2, a: NULL}];
+----
+[{b: 1, a: NULL}, {b: 2, a: NULL}]
+
+# Verify arrow_typeof for array with reordered struct fields
+# The unified schema type follows the coercion order
+query T
+select arrow_typeof([{x: 1, y: 2}, {y: 3, x: 4}]);
+----
+List(Struct("y": Int64, "x": Int64))
+
+# Test array of structs with matching nested fields in different order
+# Inner nested fields are also reordered during coercion
+query ?
+select [
+  {id: 1, info: {name: 'Alice', age: 30}},
+  {info: {age: 25, name: 'Bob'}, id: 2}
+];
+----
+[{info: {age: 30, name: Alice}, id: 1}, {info: {age: 25, name: Bob}, id: 2}]
+
+# Test nested arrays with matching struct fields (different order)
+query ?
+select [[{x: 1, y: 2}], [{y: 4, x: 3}]];
+----
+[[{x: 1, y: 2}], [{x: 3, y: 4}]]
+
+# Test array literal with float type coercion across elements
+query ?
+select [{val: 1}, {val: 2.5}];
+----
+[{val: 1.0}, {val: 2.5}]
+
+############################
+# Dynamic Array Construction Tests (from Table Columns)
+############################
+
+# Setup test table with struct columns for dynamic array construction
+statement ok
+create table t_complete_overlap (
+  s1 struct(x int, y int),
+  s2 struct(y int, x int)
+) as values
+  ({x: 1, y: 2}, {y: 3, x: 4}),
+  ({x: 5, y: 6}, {y: 7, x: 8});
+
+# Test 1: Complete overlap - same fields, different order
+# Verify arrow_typeof for dynamically created array
+query T
+select arrow_typeof([s1, s2]) from t_complete_overlap limit 1;
+----
+List(Struct("y": Int32, "x": Int32))
+
+# Verify values are correctly mapped by name in the array
+# Field order follows the second column's field order
+query ?
+select [s1, s2] from t_complete_overlap order by s1.x;
+----
+[{y: 2, x: 1}, {y: 3, x: 4}]
+[{y: 6, x: 5}, {y: 7, x: 8}]
+
+statement ok
+drop table t_complete_overlap;
+
+# Test 2: Partial overlap - some shared fields between columns
+# Note: Columns must have the exact same field set for array construction to work
+# Test with identical field set (all fields present in both columns)
+statement ok
+create table t_partial_overlap (
+  col_a struct(name VARCHAR, age int, active boolean),
+  col_b struct(age int, name VARCHAR, active boolean)
+) as values
+  ({name: 'Alice', age: 30, active: true}, {age: 25, name: 'Bob', active: false}),
+  ({name: 'Charlie', age: 35, active: true}, {age: 40, name: 'Diana', active: false});
+
+# Verify unified type includes all fields from both structs
+query T
+select arrow_typeof([col_a, col_b]) from t_partial_overlap limit 1;
+----
+List(Struct("age": Int32, "name": Utf8View, "active": Boolean))
+
+# Verify values are correctly mapped by name in the array
+# Field order follows the second column's field order
+query ?
+select [col_a, col_b] from t_partial_overlap order by col_a.name;
+----
+[{age: 30, name: Alice, active: true}, {age: 25, name: Bob, active: false}]
+[{age: 35, name: Charlie, active: true}, {age: 40, name: Diana, active: false}]
+
+statement ok
+drop table t_partial_overlap;
+
+# Test 3: Complete field set matching (no CAST needed)
+# Schemas already align; confirm unified type and values
+statement ok
+create table t_with_cast (
+  col_x struct(id int, description VARCHAR),
+  col_y struct(id int, description VARCHAR)
+) as values
+  ({id: 1, description: 'First'}, {id: 10, description: 'First Value'}),
+  ({id: 2, description: 'Second'}, {id: 20, description: 'Second Value'});
+
+# Verify type unification with all fields
+query T
+select arrow_typeof([col_x, col_y]) from t_with_cast limit 1;
+----
+List(Struct("id": Int32, "description": Utf8View))
+
+# Verify values remain aligned by name
+query ?
+select [col_x, col_y] from t_with_cast order by col_x.id;
+----
+[{id: 1, description: First}, {id: 10, description: First Value}]
+[{id: 2, description: Second}, {id: 20, description: Second Value}]
+
+statement ok
+drop table t_with_cast;
+
+# Test 4: Explicit CAST for partial field overlap scenarios
+# When columns have different field sets, use explicit CAST to unify schemas
+query ?
+select [
+  CAST({id: 1} AS STRUCT(id INT, description VARCHAR)),
+  CAST({id: 10, description: 'Value'} AS STRUCT(id INT, description VARCHAR))
+];
+----
+[{id: 1, description: NULL}, {id: 10, description: Value}]
+
+# Test 5: Complex nested structs with field reordering
+# Nested fields must have the exact same field set for array construction
+statement ok
+create table t_nested (
+  col_1 struct(id int, outer struct(x int, y int)),
+  col_2 struct(id int, outer struct(x int, y int))
+) as values
+  ({id: 100, outer: {x: 1, y: 2}}, {id: 101, outer: {x: 4, y: 3}}),
+  ({id: 200, outer: {x: 5, y: 6}}, {id: 201, outer: {x: 8, y: 7}});
+
+# Verify nested struct in unified schema
+query T
+select arrow_typeof([col_1, col_2]) from t_nested limit 1;
+----
+List(Struct("id": Int32, "outer": Struct("x": Int32, "y": Int32)))
+
+# Verify nested field values are correctly mapped
+query ?
+select [col_1, col_2] from t_nested order by col_1.id;
+----
+[{id: 100, outer: {x: 1, y: 2}}, {id: 101, outer: {x: 4, y: 3}}]
+[{id: 200, outer: {x: 5, y: 6}}, {id: 201, outer: {x: 8, y: 7}}]
+
+statement ok
+drop table t_nested;
+
+# Test 6: NULL handling with matching field sets
+statement ok
+create table t_nulls (
+  col_a struct(val int, flag boolean),
+  col_b struct(val int, flag boolean)
+) as values
+  ({val: 1, flag: true}, {val: 10, flag: false}),
+  ({val: NULL, flag: false}, {val: NULL, flag: true});
+
+# Verify NULL values are preserved
+query ?
+select [col_a, col_b] from t_nulls order by col_a.val;
+----
+[{val: 1, flag: true}, {val: 10, flag: false}]
+[{val: NULL, flag: false}, {val: NULL, flag: true}]
+
+statement ok
+drop table t_nulls;
+
+# Test 7: Multiple columns with complete field matching
+statement ok
+create table t_multi (
+  col1 struct(a int, b int, c int),
+  col2 struct(a int, b int, c int)
+) as values
+  ({a: 1, b: 2, c: 3}, {a: 10, b: 20, c: 30}),
+  ({a: 4, b: 5, c: 6}, {a: 40, b: 50, c: 60});
+
+# Verify array with complete field matching
+query T
+select arrow_typeof([col1, col2]) from t_multi limit 1;
+----
+List(Struct("a": Int32, "b": Int32, "c": Int32))
+
+# Verify values are correctly unified
+query ?
+select [col1, col2] from t_multi order by col1.a;
+----
+[{a: 1, b: 2, c: 3}, {a: 10, b: 20, c: 30}]
+[{a: 4, b: 5, c: 6}, {a: 40, b: 50, c: 60}]
+
+statement ok
+drop table t_multi;
+
+############################
+# Comprehensive Implicit Struct Coercion Suite
+############################
+
+# Test 1: VALUES clause with field reordering coerced by name into declared schema
+statement ok
+create table implicit_values_reorder (
+  s struct(a int, b int)
+) as values
+  ({a: 1, b: 2}),
+  ({b: 3, a: 4});
+
+query T
+select arrow_typeof(s) from implicit_values_reorder limit 1;
+----
+Struct("a": Int32, "b": Int32)
+
+query ?
+select s from implicit_values_reorder order by s.a;
+----
+{a: 1, b: 2}
+{a: 4, b: 3}
+
+statement ok
+drop table implicit_values_reorder;
+
+# Test 2: Array literal coercion with reordered struct fields
+query IIII
+select 
+  [{a: 1, b: 2}, {b: 3, a: 4}][1]['a'],
+  [{a: 1, b: 2}, {b: 3, a: 4}][1]['b'],
+  [{a: 1, b: 2}, {b: 3, a: 4}][2]['a'],
+  [{a: 1, b: 2}, {b: 3, a: 4}][2]['b'];
+----
+1 2 4 3
+
+# Test 3: Array construction from columns with reordered struct fields
+statement ok
+create table struct_columns_order (
+  s1 struct(a int, b int),
+  s2 struct(b int, a int)
+) as values
+  ({a: 1, b: 2}, {b: 3, a: 4}),
+  ({a: 5, b: 6}, {b: 7, a: 8});
+
+query IIII
+select 
+  [s1, s2][1]['a'],
+  [s1, s2][1]['b'],
+  [s1, s2][2]['a'],
+  [s1, s2][2]['b']
+from struct_columns_order
+order by s1['a'];
+----
+1 2 4 3
+5 6 8 7
+
+statement ok
+drop table struct_columns_order;
+
+# Test 4: UNION with struct field reordering
+query II
+select s['a'], s['b']
+from (
+  select {a: 1, b: 2} as s
+  union all
+  select {b: 3, a: 4} as s
+) t
+order by s['a'];
+----
+1 2
+4 3
+
+# Test 5: CTE with struct coercion across branches
+query II
+with 
+  t1 as (select {a: 1, b: 2} as s),
+  t2 as (select {b: 3, a: 4} as s)
+select t1.s['a'], t1.s['b'] from t1
+union all
+select t2.s['a'], t2.s['b'] from t2
+order by 1;
+----
+1 2
+4 3
+
+# Test 6: Struct aggregation retains name-based mapping
+statement ok
+create table agg_structs_reorder (
+  k int,
+  s struct(x int, y int)
+) as values
+  (1, {x: 1, y: 2}),
+  (1, {y: 3, x: 4}),
+  (2, {x: 5, y: 6});
+
+query I?
+select k, array_agg(s) from agg_structs_reorder group by k order by k;
+----
+1 [{x: 1, y: 2}, {x: 4, y: 3}]
+2 [{x: 5, y: 6}]
+
+statement ok
+drop table agg_structs_reorder;
+
+# Test 7: Nested struct coercion with reordered inner fields
+query IIII
+with nested as (
+  select [{outer: {inner: 1, value: 2}}, {outer: {value: 3, inner: 4}}] as arr
+)
+select 
+  arr[1]['outer']['inner'],
+  arr[1]['outer']['value'],
+  arr[2]['outer']['inner'],
+  arr[2]['outer']['value']
+from nested;
+----
+1 2 4 3
+
+# Test 8: Partial name overlap - currently errors (field count mismatch detected)
+# This is a documented limitation: structs must have exactly same field set for coercion
+query error DataFusion error: Error during planning: Inconsistent data type across values list
+select column1 from (values ({a: 1, b: 2}), ({b: 3, c: 4})) order by column1['a'];
+
+# Negative test: mismatched struct field counts are rejected (documented limitation)
+query error DataFusion error: .*
+select [{a: 1}, {a: 2, b: 3}];
+
+# Test 9: INSERT with name-based struct coercion into target schema
+statement ok
+create table target_struct_insert (s struct(a int, b int));
+
+statement ok
+insert into target_struct_insert values ({b: 1, a: 2});
+
+query ?
+select s from target_struct_insert;
+----
+{a: 2, b: 1}
+
+statement ok
+drop table target_struct_insert;
+
+# Test 10: CASE expression with different struct field orders
+query II
+select 
+  (case when true then {a: 1, b: 2} else {b: 3, a: 4} end)['a'] as a_val,
+  (case when true then {a: 1, b: 2} else {b: 3, a: 4} end)['b'] as b_val;
+----
+1 2
+
+############################
+# JOIN Coercion Tests
+############################
+
+# Test: Struct coercion in JOIN ON condition
+statement ok
+create table t_left (
+  id int,
+  s struct(x int, y int)
+) as values
+  (1, {x: 1, y: 2}),
+  (2, {x: 3, y: 4});
+
+statement ok
+create table t_right (
+  id int,
+  s struct(y int, x int)
+) as values
+  (1, {y: 2, x: 1}),
+  (2, {y: 4, x: 3});
+
+# JOIN on reordered struct fields - matched by name
+query IIII
+select t_left.id, t_left.s['x'], t_left.s['y'], t_right.id
+from t_left
+join t_right on t_left.s = t_right.s
+order by t_left.id;
+----
+1 1 2 1
+2 3 4 2
+
+statement ok
+drop table t_left;
+
+statement ok
+drop table t_right;
+
+# Test: Struct coercion with filtered JOIN
+statement ok
+create table orders (
+  order_id int,
+  customer struct(name varchar, id int)
+) as values
+  (1, {name: 'Alice', id: 100}),
+  (2, {name: 'Bob', id: 101}),
+  (3, {name: 'Charlie', id: 102});
+
+statement ok
+create table customers (
+  customer_id int,
+  info struct(id int, name varchar)
+) as values
+  (100, {id: 100, name: 'Alice'}),
+  (101, {id: 101, name: 'Bob'}),
+  (103, {id: 103, name: 'Diana'});
+
+# Join with struct field reordering - names matched, not positions
+query I
+select count(*) from orders
+join customers on orders.customer = customers.info
+where orders.order_id <= 2;
+----
+2
+
+statement ok
+drop table orders;
+
+statement ok
+drop table customers;
+
+############################
+# WHERE Predicate Coercion Tests
+############################
+
+# Test: Struct equality in WHERE clause with field reordering
+statement ok
+create table t_where (
+  id int,
+  s struct(x int, y int)
+) as values
+  (1, {x: 1, y: 2}),
+  (2, {x: 3, y: 4}),
+  (3, {x: 1, y: 2});
+
+# WHERE clause with struct comparison - coerced by name
+query I
+select id from t_where
+where s = {y: 2, x: 1}
+order by id;
+----
+1
+3
+
+statement ok
+drop table t_where;
+
+# Test: Struct IN clause with reordering
+statement ok
+create table t_in (
+  id int,
+  s struct(a int, b varchar)
+) as values
+  (1, {a: 1, b: 'x'}),
+  (2, {a: 2, b: 'y'}),
+  (3, {a: 1, b: 'x'});
+
+# IN clause with reordered struct literals
+query I
+select id from t_in
+where s in ({b: 'x', a: 1}, {b: 'y', a: 2})
+order by id;
+----
+1
+2
+3
+
+statement ok
+drop table t_in;
+
+# Test: Struct BETWEEN (not supported, but documents limitation)
+# Structs don't support BETWEEN, but can use comparison operators
+
+statement ok
+create table t_between (
+  id int,
+  s struct(val int)
+) as values
+  (1, {val: 10}),
+  (2, {val: 20}),
+  (3, {val: 30});
+
+# Comparison via field extraction works
+query I
+select id from t_between
+where s['val'] >= 20
+order by id;
+----
+2
+3
+
+statement ok
+drop table t_between;
+
+############################
+# Window Function Coercion Tests
+############################
+
+# Test: Struct in window function PARTITION BY
+statement ok
+create table t_window (
+  id int,
+  s struct(category int, value int)
+) as values
+  (1, {category: 1, value: 10}),
+  (2, {category: 1, value: 20}),
+  (3, {category: 2, value: 30}),
+  (4, {category: 2, value: 40});
+
+# Window partition on struct field via extraction
+query III
+select 
+  id,
+  s['value'],
+  row_number() over (partition by s['category'] order by s['value'])
+from t_window
+order by id;
+----
+1 10 1
+2 20 2
+3 30 1
+4 40 2
+
+statement ok
+drop table t_window;
+
+# Test: Struct in window function ORDER BY with coercion
+statement ok
+create table t_rank (
+  id int,
+  s struct(rank_val int, group_id int)
+) as values
+  (1, {rank_val: 10, group_id: 1}),
+  (2, {rank_val: 20, group_id: 1}),
+  (3, {rank_val: 15, group_id: 2});
+
+# Window ranking with struct field extraction
+query III
+select 
+  id,
+  s['rank_val'],
+  rank() over (partition by s['group_id'] order by s['rank_val'])
+from t_rank
+order by id;
+----
+1 10 1
+2 20 2
+3 15 1
+
+statement ok
+drop table t_rank;
+
+# Test: Aggregate function with struct coercion across window partitions
+statement ok
+create table t_agg_window (
+  id int,
+  partition_id int,
+  s struct(amount int)
+) as values
+  (1, 1, {amount: 100}),
+  (2, 1, {amount: 200}),
+  (3, 2, {amount: 150});
+
+# Running sum via extracted struct field
+query III
+select 
+  id,
+  partition_id,
+  sum(s['amount']) over (partition by partition_id order by id)
+from t_agg_window
+order by id;
+----
+1 1 100
+2 1 300
+3 2 150
+
+statement ok
+drop table t_agg_window;
diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt
index 796570633f67c..9c7c2ddb5d85c 100644
--- a/datafusion/sqllogictest/test_files/subquery.slt
+++ b/datafusion/sqllogictest/test_files/subquery.slt
@@ -201,18 +201,16 @@ logical_plan
 07)----------TableScan: t2 projection=[t2_id, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-07)------------CoalesceBatchesExec: target_batch_size=2
-08)--------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
-09)----------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+06)----------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[2]
+10)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query II rowsort
 SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1
@@ -236,18 +234,16 @@ logical_plan
 07)----------TableScan: t2 projection=[t2_id, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int * Float64(1)) + Int64(1)@0 as t2_sum]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int * Float64(1)) + Int64(1)@0, t1_id@2]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[sum(t2.t2_int * Float64(1))@1 + 1 as sum(t2.t2_int * Float64(1)) + Int64(1), t2_id@0 as t2_id]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int * Float64(1))]
-07)------------CoalesceBatchesExec: target_batch_size=2
-08)--------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
-09)----------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int * Float64(1))]
-10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int * Float64(1)) + Int64(1)@0, t1_id@2]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[sum(t2.t2_int * Float64(1))@1 + 1 as sum(t2.t2_int * Float64(1)) + Int64(1), t2_id@0 as t2_id]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int * Float64(1))]
+06)----------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int * Float64(1))]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[2]
+10)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query IR rowsort
 SELECT t1_id, (SELECT sum(t2_int * 1.0) + 1 FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1
@@ -271,18 +267,16 @@ logical_plan
 07)----------TableScan: t2 projection=[t2_id, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
-06)----------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-07)------------CoalesceBatchesExec: target_batch_size=2
-08)--------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
-09)----------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
-12)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
+05)--------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+06)----------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[2]
+10)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query II rowsort
 SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id group by t2_id, 'a') as t2_sum from t1
@@ -307,20 +301,17 @@ logical_plan
 08)------------TableScan: t2 projection=[t2_id, t2_int]
 physical_plan
 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum]
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
-04)------CoalescePartitionsExec
-05)--------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
-06)----------CoalesceBatchesExec: target_batch_size=2
-07)------------FilterExec: sum(t2.t2_int)@1 < 3
-08)--------------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-09)----------------CoalesceBatchesExec: target_batch_size=2
-10)------------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
-11)--------------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
-12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)------------------------DataSourceExec: partitions=1, partition_sizes=[1]
-14)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-15)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(t2_id@1, t1_id@0)], projection=[sum(t2.t2_int)@0, t1_id@2]
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[sum(t2.t2_int)@1 as sum(t2.t2_int), t2_id@0 as t2_id]
+05)--------FilterExec: sum(t2.t2_int)@1 < 3
+06)----------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+07)------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4
+08)--------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: partitions=1, partition_sizes=[2]
+11)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)------DataSourceExec: partitions=1, partition_sizes=[2]
 
 query II rowsort
 SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id having sum(t2_int) < 3) as t2_sum from t1
@@ -439,7 +430,7 @@ SELECT t1_id, t1_name, t1_int, (select t2_id, t2_name FROM t2 WHERE t2.t2_id = t
 
 #subquery_not_allowed
 #In/Exist Subquery is not allowed in ORDER BY clause.
-statement error DataFusion error: Invalid \(non-executable\) plan after Analyzer\ncaused by\nError during planning: In/Exist subquery can only be used in Projection, Filter, TableScan, Window functions, Aggregate and Join plan nodes, but was used in \[Sort: t1.t1_int IN \(<subquery>\) ASC NULLS LAST\]
+statement error DataFusion error: Invalid \(non-executable\) plan after Analyzer\ncaused by\nError during planning: In/Exist/SetComparison subquery can only be used in Projection, Filter, TableScan, Window functions, Aggregate and Join plan nodes, but was used in \[Sort: t1.t1_int IN \(<subquery>\) ASC NULLS LAST\]
 SELECT t1_id, t1_name, t1_int FROM t1 order by t1_int in (SELECT t2_int FROM t2 WHERE t1.t1_id > t1.t1_int)
 
 #non_aggregated_correlated_scalar_subquery
@@ -499,7 +490,7 @@ logical_plan
 03)----TableScan: t1 projection=[t1_id, t1_int]
 04)----SubqueryAlias: __scalar_sq_1
 05)------Projection: Int64(1) AS a
-06)--------EmptyRelation
+06)--------EmptyRelation: rows=1
 
 query II rowsort
 SELECT t1_id, (SELECT a FROM (select 1 as a) WHERE a = t1.t1_int) as t2_int from t1
@@ -619,7 +610,7 @@ logical_plan
 01)LeftSemi Join: 
 02)--TableScan: t1 projection=[t1_id, t1_name]
 03)--SubqueryAlias: __correlated_sq_1
-04)----EmptyRelation
+04)----EmptyRelation: rows=1
 
 #exists_subquery_with_limit
 #de-correlated, limit is removed
@@ -644,7 +635,7 @@ SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id
 query TT
 explain SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0)
 ----
-logical_plan EmptyRelation
+logical_plan EmptyRelation: rows=0
 
 query IT rowsort
 SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0)
@@ -701,7 +692,7 @@ logical_plan
 01)Projection: t1.t1_id, __scalar_sq_1.t2_id AS t2_id
 02)--Left Join: 
 03)----TableScan: t1 projection=[t1_id]
-04)----EmptyRelation
+04)----EmptyRelation: rows=0
 
 query II rowsort
 SELECT t1_id, (SELECT t2_id FROM t2 limit 0) FROM t1
@@ -1189,13 +1180,11 @@ logical_plan
 05)------SubqueryAlias: __correlated_sq_1
 06)--------TableScan: t2 projection=[t2_id]
 physical_plan
-01)CoalesceBatchesExec: target_batch_size=2
-02)--FilterExec: t1_id@0 > 40 OR NOT mark@3, projection=[t1_id@0, t1_name@1, t1_int@2]
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(t1_id@0, t2_id@0)]
-05)--------DataSourceExec: partitions=1, partition_sizes=[1]
-06)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-07)----------DataSourceExec: partitions=1, partition_sizes=[1]
+01)FilterExec: t1_id@0 > 40 OR NOT mark@3, projection=[t1_id@0, t1_name@1, t1_int@2]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightMark, on=[(t2_id@0, t1_id@0)]
+03)----DataSourceExec: partitions=1, partition_sizes=[2]
+04)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)------DataSourceExec: partitions=1, partition_sizes=[2]
 
 statement ok
 set datafusion.explain.logical_plan_only = true;
@@ -1453,9 +1442,7 @@ logical_plan
 01)LeftSemi Join: 
 02)--TableScan: t1 projection=[a]
 03)--SubqueryAlias: __correlated_sq_1
-04)----Projection:
-05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
-06)--------TableScan: t2 projection=[]
+04)----EmptyRelation: rows=1
 
 statement count 0
 drop table t1;
@@ -1482,3 +1469,198 @@ logical_plan
 
 statement count 0
 drop table person;
+
+# Set comparison subqueries (ANY/ALL)
+statement ok
+create table set_cmp_t(v int) as values (1), (6), (10);
+
+statement ok
+create table set_cmp_s(v int) as values (5), (null);
+
+statement ok
+create table set_cmp_empty(v int);
+
+query I rowsort
+select v from set_cmp_t where v > any(select v from set_cmp_s);
+----
+10
+6
+
+query I rowsort
+select v from set_cmp_t where v < all(select v from set_cmp_empty);
+----
+1
+10
+6
+
+statement count 0
+drop table set_cmp_t;
+
+statement count 0
+drop table set_cmp_s;
+
+statement count 0
+drop table set_cmp_empty;
+
+query TT
+explain select v from (values (1), (6), (10)) set_cmp_t(v) where v > any(select v from (values (5), (null)) set_cmp_s(v));
+----
+logical_plan
+01)Projection: set_cmp_t.v
+02)--Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND NOT __correlated_sq_3.mark AND Boolean(NULL)
+03)----LeftMark Join:  Filter: set_cmp_t.v > __correlated_sq_3.v IS TRUE
+04)------Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND Boolean(NULL)
+05)--------LeftMark Join:  Filter: set_cmp_t.v > __correlated_sq_2.v IS NULL
+06)----------Filter: __correlated_sq_1.mark OR Boolean(NULL)
+07)------------LeftMark Join:  Filter: set_cmp_t.v > __correlated_sq_1.v IS TRUE
+08)--------------SubqueryAlias: set_cmp_t
+09)----------------Projection: column1 AS v
+10)------------------Values: (Int64(1)), (Int64(6)), (Int64(10))
+11)--------------SubqueryAlias: __correlated_sq_1
+12)----------------SubqueryAlias: set_cmp_s
+13)------------------Projection: column1 AS v
+14)--------------------Values: (Int64(5)), (Int64(NULL))
+15)----------SubqueryAlias: __correlated_sq_2
+16)------------SubqueryAlias: set_cmp_s
+17)--------------Projection: column1 AS v
+18)----------------Values: (Int64(5)), (Int64(NULL))
+19)------SubqueryAlias: __correlated_sq_3
+20)--------SubqueryAlias: set_cmp_s
+21)----------Projection: column1 AS v
+22)------------Values: (Int64(5)), (Int64(NULL))
+
+# correlated_recursive_scalar_subquery_with_level_3_exists_subquery_referencing_level1_relation
+query TT
+explain select c_custkey from customer
+where c_acctbal < (
+    select sum(o_totalprice) from orders
+    where o_custkey = c_custkey
+    and exists (
+        select * from lineitem where l_orderkey = o_orderkey
+        and l_extendedprice < c_acctbal
+    )
+) order by c_custkey;
+----
+logical_plan
+01)Sort: customer.c_custkey ASC NULLS LAST
+02)--Projection: customer.c_custkey
+03)----Inner Join: customer.c_custkey = __scalar_sq_2.o_custkey Filter: CAST(customer.c_acctbal AS Decimal128(25, 2)) < __scalar_sq_2.sum(orders.o_totalprice)
+04)------TableScan: customer projection=[c_custkey, c_acctbal]
+05)------SubqueryAlias: __scalar_sq_2
+06)--------Projection: sum(orders.o_totalprice), orders.o_custkey
+07)----------Aggregate: groupBy=[[orders.o_custkey]], aggr=[[sum(orders.o_totalprice)]]
+08)------------Projection: orders.o_custkey, orders.o_totalprice
+09)--------------LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_extendedprice < customer.c_acctbal
+10)----------------TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice]
+11)----------------SubqueryAlias: __correlated_sq_1
+12)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice]
+
+# correlated_recursive_scalar_subquery_with_level_3_in_subquery_referencing_level1_relation
+query TT
+explain select c_custkey from customer
+where c_acctbal < (
+    select sum(o_totalprice) from orders
+    where o_custkey = c_custkey
+    and o_totalprice in (
+        select l_extendedprice as price from lineitem where l_orderkey = o_orderkey
+        and l_extendedprice < c_acctbal
+    )
+) order by c_custkey;
+----
+logical_plan
+01)Sort: customer.c_custkey ASC NULLS LAST
+02)--Projection: customer.c_custkey
+03)----Inner Join: customer.c_custkey = __scalar_sq_2.o_custkey Filter: CAST(customer.c_acctbal AS Decimal128(25, 2)) < __scalar_sq_2.sum(orders.o_totalprice)
+04)------TableScan: customer projection=[c_custkey, c_acctbal]
+05)------SubqueryAlias: __scalar_sq_2
+06)--------Projection: sum(orders.o_totalprice), orders.o_custkey
+07)----------Aggregate: groupBy=[[orders.o_custkey]], aggr=[[sum(orders.o_totalprice)]]
+08)------------Projection: orders.o_custkey, orders.o_totalprice
+09)--------------LeftSemi Join: orders.o_totalprice = __correlated_sq_1.price, orders.o_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_extendedprice < customer.c_acctbal
+10)----------------TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice]
+11)----------------SubqueryAlias: __correlated_sq_1
+12)------------------Projection: lineitem.l_extendedprice AS price, lineitem.l_extendedprice, lineitem.l_orderkey
+13)--------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice]
+
+# Setup tables for recursive correlation tests
+statement ok
+CREATE TABLE employees (
+    employee_id INTEGER,
+    employee_name VARCHAR,
+    dept_id INTEGER,
+    salary DECIMAL
+);
+
+statement ok
+CREATE TABLE project_assignments (
+    project_id INTEGER,
+    employee_id INTEGER,
+    priority INTEGER
+);
+
+# Provided recursive scalar subquery explain case
+query TT
+EXPLAIN SELECT e1.employee_name, e1.salary
+FROM employees e1
+WHERE e1.salary > (
+    SELECT AVG(e2.salary)
+    FROM employees e2
+    WHERE e2.dept_id = e1.dept_id
+    AND e2.salary > (
+        SELECT AVG(e3.salary)
+        FROM employees e3
+        WHERE e3.dept_id = e1.dept_id
+    )
+);
+----
+logical_plan
+01)Projection: e1.employee_name, e1.salary
+02)--Inner Join: e1.dept_id = __scalar_sq_1.dept_id Filter: CAST(e1.salary AS Decimal128(38, 14)) > __scalar_sq_1.avg(e2.salary)
+03)----SubqueryAlias: e1
+04)------TableScan: employees projection=[employee_name, dept_id, salary]
+05)----SubqueryAlias: __scalar_sq_1
+06)------Projection: avg(e2.salary), e2.dept_id
+07)--------Aggregate: groupBy=[[e2.dept_id]], aggr=[[avg(e2.salary)]]
+08)----------Projection: e2.dept_id, e2.salary
+09)------------Inner Join:  Filter: CAST(e2.salary AS Decimal128(38, 14)) > __scalar_sq_2.avg(e3.salary) AND __scalar_sq_2.dept_id = e1.dept_id
+10)--------------SubqueryAlias: e2
+11)----------------TableScan: employees projection=[dept_id, salary]
+12)--------------SubqueryAlias: __scalar_sq_2
+13)----------------Projection: avg(e3.salary), e3.dept_id
+14)------------------Aggregate: groupBy=[[e3.dept_id]], aggr=[[avg(e3.salary)]]
+15)--------------------SubqueryAlias: e3
+16)----------------------TableScan: employees projection=[dept_id, salary]
+
+# Check shadowing: `dept_id` should resolve to the nearest outer relation (`e2`)
+# in the innermost subquery rather than the outermost
+query TT
+EXPLAIN SELECT e1.employee_id
+FROM employees e1
+WHERE EXISTS (
+    SELECT 1
+    FROM employees e2
+    WHERE EXISTS (
+        SELECT 1
+        FROM project_assignments p
+        WHERE p.project_id = dept_id
+    )
+);
+----
+logical_plan
+01)LeftSemi Join:
+02)--SubqueryAlias: e1
+03)----TableScan: employees projection=[employee_id]
+04)--SubqueryAlias: __correlated_sq_2
+05)----Projection:
+06)------LeftSemi Join: e2.dept_id = __correlated_sq_1.project_id
+07)--------SubqueryAlias: e2
+08)----------TableScan: employees projection=[dept_id]
+09)--------SubqueryAlias: __correlated_sq_1
+10)----------SubqueryAlias: p
+11)------------TableScan: project_assignments projection=[project_id]
+
+statement count 0
+drop table employees;
+
+statement count 0
+drop table project_assignments;
diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt
index d993515f4de99..dd474f3fa1054 100644
--- a/datafusion/sqllogictest/test_files/subquery_sort.slt
+++ b/datafusion/sqllogictest/test_files/subquery_sort.slt
@@ -100,7 +100,7 @@ physical_plan
 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r]
 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9]
-04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true
 
@@ -126,7 +126,7 @@ physical_plan
 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r]
 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9]
-04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
@@ -150,11 +150,10 @@ physical_plan
 03)----SortExec: expr=[c1@0 ASC NULLS LAST, c3@2 DESC, c9@3 ASC NULLS LAST], preserve_partitioning=[true]
 04)------ProjectionExec: expr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]@1 as c1, first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]@2 as c2, first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]@3 as c3, first_value(sink_table.c9) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]@4 as c9]
 05)--------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c9) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
-08)--------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c9) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true
+06)----------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(sink_table.c1) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c2) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c3) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST], first_value(sink_table.c9) ORDER BY [sink_table.c1 ASC NULLS LAST, sink_table.c3 DESC NULLS FIRST, sink_table.c9 ASC NULLS LAST]]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c9], file_type=csv, has_header=true
 
 
 query TI
diff --git a/datafusion/sqllogictest/test_files/table_functions.slt b/datafusion/sqllogictest/test_files/table_functions.slt
index 7d318c50bacf4..f0e00ffc69233 100644
--- a/datafusion/sqllogictest/test_files/table_functions.slt
+++ b/datafusion/sqllogictest/test_files/table_functions.slt
@@ -153,31 +153,34 @@ SELECT * FROM generate_series(1, 5, NULL)
 query TT
 EXPLAIN SELECT * FROM generate_series(1, 5)
 ----
-logical_plan TableScan: tmp_table projection=[value]
+logical_plan TableScan: generate_series() projection=[value]
 physical_plan LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=5, batch_size=8192]
 
 #
 # Test generate_series with invalid arguments
 #
 
-query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series
+query I
 SELECT * FROM generate_series(5, 1)
+----
 
-query error DataFusion error: Error during planning: start is smaller than end, but increment is negative: cannot generate infinite series
+query I
 SELECT * FROM generate_series(-6, 6, -1)
+----
 
-query error DataFusion error: Error during planning: step cannot be zero
+query error DataFusion error: Error during planning: Step cannot be zero
 SELECT * FROM generate_series(-6, 6, 0)
 
-query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series
+query I
 SELECT * FROM generate_series(6, -6, 1)
+----
 
 
 statement error DataFusion error: Error during planning: generate_series function requires 1 to 3 arguments
 SELECT * FROM generate_series(1, 2, 3, 4)
 
 
-statement error DataFusion error: Error during planning: First argument must be an integer literal
+statement error DataFusion error: Error during planning: Argument \#1 must be an INTEGER, TIMESTAMP, DATE or NULL, got Utf8
 SELECT * FROM generate_series('foo', 'bar')
 
 # UDF and UDTF `generate_series` can be used simultaneously
@@ -188,6 +191,21 @@ SELECT generate_series(1, t1.end) FROM generate_series(3, 5) as t1(end)
 [1, 2, 3, 4]
 [1, 2, 3]
 
+# join with projection on generate_series
+query I
+select g1.value from generate_series(1, 3) g1 CROSS JOIN generate_series(1, 3) g2;
+----
+1
+1
+1
+2
+2
+2
+3
+3
+3
+
+
 # Test range table function
 query I
 SELECT * FROM range(6)
@@ -220,6 +238,12 @@ SELECT * FROM range(3, 6)
 4
 5
 
+query I rowsort
+SELECT * FROM range(1, 1+2)
+----
+1
+2
+
 # #generated_data > batch_size
 query I
 SELECT count(v1) FROM range(-66666,66666) t1(v1)
@@ -270,36 +294,252 @@ SELECT * FROM range(1, 5, NULL)
 query TT
 EXPLAIN SELECT * FROM range(1, 5)
 ----
-logical_plan TableScan: tmp_table projection=[value]
+logical_plan TableScan: range() projection=[value]
 physical_plan LazyMemoryExec: partitions=1, batch_generators=[range: start=1, end=5, batch_size=8192]
 
 #
 # Test range with invalid arguments
 #
 
-query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series
+query I
 SELECT * FROM range(5, 1)
+----
 
-query error DataFusion error: Error during planning: start is smaller than end, but increment is negative: cannot generate infinite series
+query I
 SELECT * FROM range(-6, 6, -1)
+----
 
-query error DataFusion error: Error during planning: step cannot be zero
+query error DataFusion error: Error during planning: Step cannot be zero
 SELECT * FROM range(-6, 6, 0)
 
-query error DataFusion error: Error during planning: start is bigger than end, but increment is positive: cannot generate infinite series
+query I
 SELECT * FROM range(6, -6, 1)
+----
 
 
 statement error DataFusion error: Error during planning: range function requires 1 to 3 arguments
 SELECT * FROM range(1, 2, 3, 4)
 
 
-statement error DataFusion error: Error during planning: First argument must be an integer literal
+statement error DataFusion error: Error during planning: Argument \#1 must be an INTEGER, TIMESTAMP, DATE or NULL, got Utf8
 SELECT * FROM range('foo', 'bar')
 
+statement error DataFusion error: Error during planning: Argument #2 must be an INTEGER or NULL, got Literal\(Utf8\("bar"\), None\)
+SELECT * FROM range(1, 'bar')
+
 # UDF and UDTF `range` can be used simultaneously
 query ? rowsort
 SELECT range(1, t1.end) FROM range(3, 5) as t1(end)
 ----
 [1, 2, 3]
 [1, 2]
+
+#
+# Test timestamp ranges
+#
+
+# Basic timestamp range with 1 day interval
+query P rowsort
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-04T00:00:00', INTERVAL '1' DAY)
+----
+2023-01-01T00:00:00
+2023-01-02T00:00:00
+2023-01-03T00:00:00
+
+# Timestamp range with hour interval
+query P rowsort
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-01T03:00:00', INTERVAL '1' HOUR)
+----
+2023-01-01T00:00:00
+2023-01-01T01:00:00
+2023-01-01T02:00:00
+
+# Timestamp range with month interval
+query P rowsort
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-04-01T00:00:00', INTERVAL '1' MONTH)
+----
+2023-01-01T00:00:00
+2023-02-01T00:00:00
+2023-03-01T00:00:00
+
+# Timestamp generate_series (includes end)
+query P rowsort
+SELECT * FROM generate_series(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-03T00:00:00', INTERVAL '1' DAY)
+----
+2023-01-01T00:00:00
+2023-01-02T00:00:00
+2023-01-03T00:00:00
+
+# Timestamp range with timezone
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00+00:00', TIMESTAMP '2023-01-03T00:00:00+00:00', INTERVAL '1' DAY)
+----
+2023-01-01T00:00:00
+2023-01-02T00:00:00
+
+# Negative timestamp range (going backwards)
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-03T00:00:00', TIMESTAMP '2023-01-01T00:00:00', INTERVAL '-1' DAY)
+----
+2023-01-03T00:00:00
+2023-01-02T00:00:00
+
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-03T00:00:00', TIMESTAMP '2023-01-01T00:00:00', INTERVAL '1' DAY)
+----
+
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-02T00:00:00', INTERVAL '-1' DAY)
+----
+
+query error DataFusion error: Error during planning: range function with timestamps requires exactly 3 arguments
+SELECT * FROM range(TIMESTAMP '2023-01-03T00:00:00', TIMESTAMP '2023-01-01T00:00:00')
+
+# Single timestamp (start == end)
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-01T00:00:00', INTERVAL '1' DAY)  
+----
+
+# Timestamp range with NULL values
+query P
+SELECT * FROM range(NULL::TIMESTAMP, TIMESTAMP '2023-01-03T00:00:00', INTERVAL '1' DAY)
+----
+
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', NULL::TIMESTAMP, INTERVAL '1' DAY)
+----
+
+# No interval gives no rows
+query P
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-03T00:00:00', NULL::INTERVAL)
+----
+
+# Zero-length interval gives error
+query error DataFusion error: Error during planning: Step interval cannot be zero
+SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-01-03T00:00:00', INTERVAL '0' DAY)
+
+# Timezone-aware
+query P
+SELECT * FROM range(TIMESTAMPTZ '2023-02-01T00:00:00-07:00', TIMESTAMPTZ '2023-02-01T09:00:00+01:00', INTERVAL '1' HOUR);
+----
+2023-02-01T07:00:00
+
+# Basic date range with hour interval
+query P
+SELECT * FROM range(DATE '1992-01-01', DATE '1992-01-03', INTERVAL '6' HOUR);
+----
+1992-01-01T00:00:00
+1992-01-01T06:00:00
+1992-01-01T12:00:00
+1992-01-01T18:00:00
+1992-01-02T00:00:00
+1992-01-02T06:00:00
+1992-01-02T12:00:00
+1992-01-02T18:00:00
+
+# Date range with day interval
+query P
+SELECT * FROM range(DATE '1992-09-01', DATE '1992-09-05', INTERVAL '1' DAY);
+----
+1992-09-01T00:00:00
+1992-09-02T00:00:00
+1992-09-03T00:00:00
+1992-09-04T00:00:00
+
+# Date range with month interval
+query P
+SELECT * FROM range(DATE '1992-09-01', DATE '1993-01-01', INTERVAL '1' MONTH);
+----
+1992-09-01T00:00:00
+1992-10-01T00:00:00
+1992-11-01T00:00:00
+1992-12-01T00:00:00
+
+# Date range generate_series includes end
+query P
+SELECT * FROM generate_series(DATE '1992-09-01', DATE '1992-09-03', INTERVAL '1' DAY);
+----
+1992-09-01T00:00:00
+1992-09-02T00:00:00
+1992-09-03T00:00:00
+
+query TT
+EXPLAIN SELECT * FROM generate_series(DATE '1992-09-01', DATE '1992-09-03', INTERVAL '1' DAY);
+----
+logical_plan TableScan: generate_series() projection=[value]
+physical_plan LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=715305600000000000, end=715478400000000000, batch_size=8192]
+
+# Backwards date range
+query P
+SELECT * FROM range(DATE '1992-09-05', DATE '1992-09-01', INTERVAL '-1' DAY);
+----
+1992-09-05T00:00:00
+1992-09-04T00:00:00
+1992-09-03T00:00:00
+1992-09-02T00:00:00
+
+# NULL handling for dates
+query P
+SELECT * FROM range(DATE '1992-09-01', NULL::DATE, INTERVAL '1' MONTH)
+----
+
+query TT
+EXPLAIN SELECT * FROM range(DATE '1992-09-01', NULL::DATE, INTERVAL '1' MONTH)
+----
+logical_plan TableScan: range() projection=[value]
+physical_plan LazyMemoryExec: partitions=1, batch_generators=[range: empty]
+
+query P
+SELECT * FROM range(NULL::DATE, DATE '1992-09-01', INTERVAL '1' MONTH)
+----
+
+query P
+SELECT * FROM range(DATE '1992-09-01', DATE '1992-10-01', NULL::INTERVAL)
+----
+
+query P
+SELECT * FROM range(DATE '2023-01-03', DATE '2023-01-01', INTERVAL '1' DAY)
+----
+
+query P
+SELECT * FROM range(DATE '2023-01-01', DATE '2023-01-02', INTERVAL '-1' DAY)
+----
+
+query error DataFusion error: Error during planning: range function with dates requires exactly 3 arguments
+SELECT * FROM range(DATE '2023-01-01', DATE '2023-01-03')
+
+# Table function as relation
+statement ok
+CREATE OR REPLACE TABLE json_table (c INT) AS VALUES (1), (2);
+
+query II
+SELECT c, f.*  FROM json_table, LATERAL generate_series(1,2) f;
+----
+1 1
+1 2
+2 1
+2 2
+
+
+# Test generate_series in a recursive CTE to ensure the state is correctly reset
+query I rowsort
+WITH RECURSIVE t AS (
+    SELECT 1 i
+    UNION ALL
+    SELECT g.i
+    FROM generate_series(1, 1) g(i), t
+)
+SELECT *
+FROM t
+LIMIT 10;
+----
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
diff --git a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt
new file mode 100644
index 0000000000000..d48e41d1204de
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## to_timestamp timezone tests
+##########
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 1: Default timezone (None) with naive timestamp
+## Naive timestamps (without explicit timezone) should be interpreted as UTC by default
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+## Test 2: Explicit UTC timezone ('Z' suffix)
+## Explicit timezone should be respected regardless of session timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+## Test 3: Explicit timezone offset (+05:00)
+## Explicit offset should be respected - this is 13:42:29+05:00 which is 08:42:29 UTC
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+## Test 4: Explicit timezone offset without colon (+0500)
+## Should handle offset formats without colons
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+0500');
+----
+2020-09-08T08:42:29
+
+## Test 5: Negative timezone offset
+query P
+SELECT to_timestamp('2020-09-08T13:42:29-03:30');
+----
+2020-09-08T17:12:29
+
+## Test 6: Configure session timezone to America/New_York
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 7: Naive timestamp with configured timezone
+## '2020-09-08T13:42:29' in America/New_York is EDT (UTC-4)
+## So this should become '2020-09-08T13:42:29-04:00'
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29-04:00
+
+## Test 8: Explicit UTC should be transformed to configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T09:42:29-04:00
+
+## Test 9: Explicit offset should be transformed to configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T04:42:29-04:00
+
+## Test 10: Check arrow_typeof returns timstamp in configured timezone
+## Result should be Timestamp(Nanosecond, "America/New_York") regardless of input timezone
+query T
+SELECT arrow_typeof(to_timestamp('2020-09-08T13:42:29Z'));
+----
+Timestamp(ns, "America/New_York")
+
+## Test 11: Configure to offset-based timezone
+statement ok
+SET datafusion.execution.time_zone = '+05:30';
+
+## Test 12: Naive timestamp with offset-based configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29+05:30
+
+## Test 13: Reset to None
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 14: Naive timestamp
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 15: to_timestamp with format string - naive timestamp with session timezone
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29', '%Y-%m-%d %H:%M:%S');
+----
+2020-09-08T13:42:29-04:00
+
+## Test 16: to_timestamp with format string - explicit timezone should be respected
+statement ok
+SET datafusion.execution.time_zone = 'UTC';
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29 +0000', '%Y-%m-%d %H:%M:%S %z');
+----
+2020-09-08T13:42:29Z
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29 America/Toronto', '%Y-%m-%d %H:%M:%S %Z');
+----
+2020-09-08T17:42:29Z
+
+query error Error parsing timestamp from '2020-09-08 13:42:29America/Toronto' using format '%Y-%m-%d %H:%M:%S%Z': '%Z' is only supported at the end of the format string preceded by a space
+SELECT to_timestamp('2020-09-08 13:42:29America/Toronto', '%Y-%m-%d %H:%M:%S%Z');
+
+## Test 17: Test all precision variants respect timezone
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## to_timestamp_seconds
+query P
+SELECT to_timestamp_seconds('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29-04:00
+
+## to_timestamp_millis
+query P
+SELECT to_timestamp_millis('2020-09-08T13:42:29.123');
+----
+2020-09-08T13:42:29.123-04:00
+
+## to_timestamp_micros
+query P
+SELECT to_timestamp_micros('2020-09-08T13:42:29.123456');
+----
+2020-09-08T13:42:29.123456-04:00
+
+## to_timestamp_nanos
+query P
+SELECT to_timestamp_nanos('2020-09-08T13:42:29.123456789');
+----
+2020-09-08T13:42:29.123456789-04:00
+
+## test integers
+query T
+select arrow_typeof(to_timestamp_seconds(61))
+----
+Timestamp(s, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_millis(61))
+----
+Timestamp(ms, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_micros(61))
+----
+Timestamp(µs, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_nanos(61))
+----
+Timestamp(ns, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp(61))
+----
+Timestamp(ns, "America/New_York")
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt
index ce23fe26528c3..8a1fef0722297 100644
--- a/datafusion/sqllogictest/test_files/topk.slt
+++ b/datafusion/sqllogictest/test_files/topk.slt
@@ -53,7 +53,7 @@ query I
 select * from (select * from topk limit 8) order by x limit 3;
 ----
 0
-1
+2
 2
 
 
@@ -316,7 +316,7 @@ explain select number, letter, age from partial_sorted order by number desc, let
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 
 # Explain variations of the above query with different orderings, and different sort prefixes.
@@ -326,28 +326,28 @@ explain select number, letter, age from partial_sorted order by age desc limit 3
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[age@2 DESC], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 query TT
 explain select number, letter, age from partial_sorted order by number desc, letter desc limit 3;
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 query TT
 explain select number, letter, age from partial_sorted order by number asc limit 3;
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], file_type=parquet, predicate=DynamicFilter [ empty ], reverse_row_groups=true
 
 query TT
 explain select number, letter, age from partial_sorted order by letter asc, number desc limit 3;
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[letter@1 ASC NULLS LAST, number@0 DESC], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 # Explicit NULLS ordering cases (reversing the order of the NULLS on the number and letter orderings)
 query TT
@@ -355,14 +355,14 @@ explain select number, letter, age from partial_sorted order by number desc, let
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC], preserve_partitioning=[false], sort_prefix=[number@0 DESC]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 query TT
 explain select number, letter, age from partial_sorted order by number desc NULLS LAST, letter asc limit 3;
 ----
 physical_plan
 01)SortExec: TopK(fetch=3), expr=[number@0 DESC NULLS LAST, letter@1 ASC NULLS LAST], preserve_partitioning=[false]
-02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 
 # Verify that the sort prefix is correctly computed on the normalized ordering (removing redundant aliased columns)
@@ -370,9 +370,8 @@ query TT
 explain select number, letter, age, number as column4, letter as column5 from partial_sorted order by number desc, column4 desc, letter asc, column5 asc, age desc limit 3;
 ----
 physical_plan
-01)SortExec: TopK(fetch=3), expr=[number@0 DESC, column4@3 DESC, letter@1 ASC NULLS LAST, column5@4 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST]
-02)--ProjectionExec: expr=[number@0 as number, letter@1 as letter, age@2 as age, number@0 as column4, letter@1 as column5]
-03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet
+01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age, number@0 as column4, letter@1 as column5], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 # Verify that the sort prefix is correctly computed over normalized, order-maintaining projections (number + 1, number, number + 1, age)
 query TT
@@ -380,11 +379,11 @@ explain select number + 1 as number_plus, number, number + 1 as other_number_plu
 ----
 physical_plan
 01)SortPreservingMergeExec: [number_plus@0 DESC, number@1 DESC, other_number_plus@2 DESC, age@3 ASC NULLS LAST], fetch=3
-02)--SortExec: TopK(fetch=3), expr=[number_plus@0 DESC, number@1 DESC, other_number_plus@2 DESC, age@3 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[number_plus@0 DESC, number@1 DESC]
+02)--SortExec: TopK(fetch=3), expr=[number_plus@0 DESC, number@1 DESC, age@3 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[number_plus@0 DESC, number@1 DESC]
 03)----ProjectionExec: expr=[__common_expr_1@0 as number_plus, number@1 as number, __common_expr_1@0 as other_number_plus, age@2 as age]
 04)------ProjectionExec: expr=[CAST(number@0 AS Int64) + 1 as __common_expr_1, number@0 as number, age@1 as age]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, age], output_ordering=[number@0 DESC], file_type=parquet
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, age], output_ordering=[number@0 DESC], file_type=parquet, predicate=DynamicFilter [ empty ]
 
 # Cleanup
 statement ok
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part
index 4a6ad5eddfb79..84237e4393377 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q1.slt.part
@@ -51,10 +51,8 @@ physical_plan
 02)--SortExec: expr=[l_returnflag@0 ASC NULLS LAST, l_linestatus@1 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus, sum(lineitem.l_quantity)@2 as sum_qty, sum(lineitem.l_extendedprice)@3 as sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@4 as sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax)@5 as sum_charge, avg(lineitem.l_quantity)@6 as avg_qty, avg(lineitem.l_extendedprice)@7 as avg_price, avg(lineitem.l_discount)@8 as avg_disc, count(Int64(1))@9 as count_order]
 04)------AggregateExec: mode=FinalPartitioned, gby=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(Int64(1))]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([l_returnflag@0, l_linestatus@1], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(Int64(1))]
-08)--------------ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_1, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------FilterExec: l_shipdate@6 <= 1998-09-02, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5]
-11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([l_returnflag@0, l_linestatus@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(Int64(1))]
+07)------------ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_1, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
+08)--------------FilterExec: l_shipdate@6 <= 1998-09-02, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5]
+09)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q10.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q10.slt.part
index 04de9153a0474..62649148bf058 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q10.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q10.slt.part
@@ -73,34 +73,20 @@ physical_plan
 02)--SortExec: TopK(fetch=10), expr=[revenue@2 DESC], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment]
 04)------AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@7, l_discount@8, n_name@10]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([o_orderkey@7], 4), input_partitions=4
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
-18)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-19)------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-20)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-21)----------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], file_type=csv, has_header=false
-22)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-23)------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
-24)--------------------------------------CoalesceBatchesExec: target_batch_size=8192
-25)----------------------------------------FilterExec: o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1]
-26)------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
-27)--------------------------CoalesceBatchesExec: target_batch_size=8192
-28)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-29)------------------------------CoalesceBatchesExec: target_batch_size=8192
-30)--------------------------------FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2]
-31)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], file_type=csv, has_header=false
-32)------------------CoalesceBatchesExec: target_batch_size=8192
-33)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-34)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-35)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@7, l_discount@8, n_name@10]
+08)--------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4
+09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
+10)------------------RepartitionExec: partitioning=Hash([o_orderkey@7], 4), input_partitions=4
+11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
+12)----------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], file_type=csv, has_header=false
+14)----------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
+15)------------------------FilterExec: o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1]
+16)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
+17)------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+18)--------------------FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2]
+19)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], file_type=csv, has_header=false
+20)--------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+21)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part
index a6225daae4362..a31579eb1e09d 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q11.slt.part
@@ -75,51 +75,35 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=10), expr=[value@1 DESC], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[ps_partkey@0 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value]
-03)----NestedLoopJoinExec: join_type=Inner, filter=CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Decimal128(38, 15)) > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@1, projection=[ps_partkey@0, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1]
-04)------CoalescePartitionsExec
-05)--------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
-06)----------CoalesceBatchesExec: target_batch_size=8192
+03)----NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@1 > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@0, projection=[ps_partkey@0, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1, sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@3]
+04)------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as sum(partsupp.ps_supplycost * partsupp.ps_availqty), CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@1 AS Decimal128(38, 15)) as join_proj_push_down_1]
+05)--------CoalescePartitionsExec
+06)----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
 07)------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
 08)--------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2]
-11)--------------------CoalesceBatchesExec: target_batch_size=8192
-12)----------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5]
-15)----------------------------CoalesceBatchesExec: target_batch_size=8192
-16)------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4
-17)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false
-18)----------------------------CoalesceBatchesExec: target_batch_size=8192
-19)------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-20)--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-21)----------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-22)--------------------CoalesceBatchesExec: target_batch_size=8192
-23)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-24)------------------------CoalesceBatchesExec: target_batch_size=8192
-25)--------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0]
-26)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-27)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
-28)------ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)]
-29)--------AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
-30)----------CoalescePartitionsExec
-31)------------AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
-32)--------------CoalesceBatchesExec: target_batch_size=8192
-33)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_availqty@0, ps_supplycost@1]
-34)------------------CoalesceBatchesExec: target_batch_size=8192
-35)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
-36)----------------------CoalesceBatchesExec: target_batch_size=8192
-37)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4]
-38)--------------------------CoalesceBatchesExec: target_batch_size=8192
-39)----------------------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4
-40)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false
-41)--------------------------CoalesceBatchesExec: target_batch_size=8192
-42)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-43)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-44)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-45)------------------CoalesceBatchesExec: target_batch_size=8192
-46)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-47)----------------------CoalesceBatchesExec: target_batch_size=8192
-48)------------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0]
-49)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-50)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[ps_partkey@0, ps_availqty@1, ps_supplycost@2]
+10)------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4
+11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_availqty@2, ps_supplycost@3, s_nationkey@5]
+12)----------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4
+13)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false
+14)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+15)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+16)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+17)--------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0]
+18)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+19)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+20)------ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)]
+21)--------AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
+22)----------CoalescePartitionsExec
+23)------------AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)]
+24)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_availqty@0, ps_supplycost@1]
+25)----------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
+26)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@0, s_suppkey@0)], projection=[ps_availqty@1, ps_supplycost@2, s_nationkey@4]
+27)--------------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4
+28)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], file_type=csv, has_header=false
+29)--------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+30)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+31)----------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+32)------------------FilterExec: n_name@1 = GERMANY, projection=[n_nationkey@0]
+33)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+34)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q12.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q12.slt.part
index f7344daed8c7a..b152fde02f060 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q12.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q12.slt.part
@@ -63,16 +63,11 @@ physical_plan
 02)--SortExec: expr=[l_shipmode@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[l_shipmode@0 as l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count]
 04)------AggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([l_shipmode@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------FilterExec: (l_shipmode@4 = MAIL OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1994-01-01 AND l_receiptdate@3 < 1995-01-01, projection=[l_orderkey@0, l_shipmode@4]
-14)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], file_type=csv, has_header=false
-15)------------------CoalesceBatchesExec: target_batch_size=8192
-16)--------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-17)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderpriority], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([l_shipmode@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]
+07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3]
+08)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+09)----------------FilterExec: (l_shipmode@4 = MAIL OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1994-01-01 AND l_receiptdate@3 < 1995-01-01, projection=[l_orderkey@0, l_shipmode@4]
+10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], file_type=csv, has_header=false
+11)--------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+12)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderpriority], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part
index 96f3bd6edf324..94e0848bfcce1 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q13.slt.part
@@ -57,19 +57,13 @@ physical_plan
 02)--SortExec: TopK(fetch=10), expr=[custdist@1 DESC, c_count@0 DESC], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[c_count@0 as c_count, count(Int64(1))@1 as custdist]
 04)------AggregateExec: mode=FinalPartitioned, gby=[c_count@0 as c_count], aggr=[count(Int64(1))]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([c_count@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[c_count@0 as c_count], aggr=[count(Int64(1))]
-08)--------------ProjectionExec: expr=[count(orders.o_orderkey)@1 as c_count]
-09)----------------AggregateExec: mode=SinglePartitioned, gby=[c_custkey@0 as c_custkey], aggr=[count(orders.o_orderkey)]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------HashJoinExec: mode=Partitioned, join_type=Left, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, o_orderkey@1]
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-14)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey], file_type=csv, has_header=false
-16)----------------------CoalesceBatchesExec: target_batch_size=8192
-17)------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
-18)--------------------------CoalesceBatchesExec: target_batch_size=8192
-19)----------------------------FilterExec: o_comment@2 NOT LIKE %special%requests%, projection=[o_orderkey@0, o_custkey@1]
-20)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_comment], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([c_count@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c_count@0 as c_count], aggr=[count(Int64(1))]
+07)------------ProjectionExec: expr=[count(orders.o_orderkey)@1 as c_count]
+08)--------------AggregateExec: mode=SinglePartitioned, gby=[c_custkey@0 as c_custkey], aggr=[count(orders.o_orderkey)]
+09)----------------HashJoinExec: mode=Partitioned, join_type=Left, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, o_orderkey@1]
+10)------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey], file_type=csv, has_header=false
+12)------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
+13)--------------------FilterExec: o_comment@2 NOT LIKE %special%requests%, projection=[o_orderkey@0, o_custkey@1]
+14)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_comment], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q14.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q14.slt.part
index 8d8dd68c3d7bd..a9ac517f287d0 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q14.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q14.slt.part
@@ -46,14 +46,9 @@ physical_plan
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
 05)--------ProjectionExec: expr=[l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as __common_expr_1, p_type@2 as p_type]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2]
-12)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
-13)--------------CoalesceBatchesExec: target_batch_size=8192
-14)----------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-15)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-16)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], file_type=csv, has_header=false
+06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4]
+07)------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
+08)--------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2]
+09)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+10)------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=1
+11)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q15.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q15.slt.part
index 0636a033b25a3..ae0c0a93a3552 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q15.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q15.slt.part
@@ -73,30 +73,22 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST]
 02)--SortExec: expr=[s_suppkey@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(max(revenue0.total_revenue)@0, total_revenue@4)], projection=[s_suppkey@1, s_name@2, s_address@3, s_phone@4, total_revenue@5]
-05)--------AggregateExec: mode=Final, gby=[], aggr=[max(revenue0.total_revenue)]
-06)----------CoalescePartitionsExec
-07)------------AggregateExec: mode=Partial, gby=[], aggr=[max(revenue0.total_revenue)]
-08)--------------ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue]
-09)----------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4
-12)----------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2]
-15)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
-16)--------CoalesceBatchesExec: target_batch_size=8192
-17)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, supplier_no@0)], projection=[s_suppkey@0, s_name@1, s_address@2, s_phone@3, total_revenue@5]
-18)------------CoalesceBatchesExec: target_batch_size=8192
-19)--------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-20)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-21)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_phone], file_type=csv, has_header=false
-22)------------ProjectionExec: expr=[l_suppkey@0 as supplier_no, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue]
-23)--------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-24)----------------CoalesceBatchesExec: target_batch_size=8192
-25)------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4
-26)--------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-27)----------------------CoalesceBatchesExec: target_batch_size=8192
-28)------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2]
-29)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(max(revenue0.total_revenue)@0, total_revenue@4)], projection=[s_suppkey@1, s_name@2, s_address@3, s_phone@4, total_revenue@5]
+04)------AggregateExec: mode=Final, gby=[], aggr=[max(revenue0.total_revenue)]
+05)--------CoalescePartitionsExec
+06)----------AggregateExec: mode=Partial, gby=[], aggr=[max(revenue0.total_revenue)]
+07)------------ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue]
+08)--------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+09)----------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4
+10)------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+11)--------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2]
+12)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+13)------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, supplier_no@0)], projection=[s_suppkey@0, s_name@1, s_address@2, s_phone@3, total_revenue@5]
+14)--------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+15)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_phone], file_type=csv, has_header=false
+16)--------ProjectionExec: expr=[l_suppkey@0 as supplier_no, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue]
+17)----------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+18)------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4
+19)--------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+20)----------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2]
+21)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q16.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q16.slt.part
index edc452284cf99..b01110b567ca8 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q16.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q16.slt.part
@@ -69,31 +69,20 @@ physical_plan
 02)--SortExec: TopK(fetch=10), expr=[supplier_cnt@3 DESC, p_brand@0 ASC NULLS LAST, p_type@1 ASC NULLS LAST, p_size@2 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, count(alias1)@3 as supplier_cnt]
 04)------AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)]
-08)--------------AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2, alias1@3], 4), input_partitions=4
-11)--------------------AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[]
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(ps_suppkey@0, s_suppkey@0)]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, p_partkey@0)], projection=[ps_suppkey@1, p_brand@3, p_type@4, p_size@5]
-18)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-19)------------------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
-20)--------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey], file_type=csv, has_header=false
-21)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-22)------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-23)--------------------------------------CoalesceBatchesExec: target_batch_size=8192
-24)----------------------------------------FilterExec: p_brand@1 != Brand#45 AND p_type@2 NOT LIKE MEDIUM POLISHED% AND Use p_size@3 IN (SET) ([Literal { value: Int32(49) }, Literal { value: Int32(14) }, Literal { value: Int32(23) }, Literal { value: Int32(45) }, Literal { value: Int32(19) }, Literal { value: Int32(3) }, Literal { value: Int32(36) }, Literal { value: Int32(9) }])
-25)------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-26)--------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_type, p_size], file_type=csv, has_header=false
-27)--------------------------CoalesceBatchesExec: target_batch_size=8192
-28)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-29)------------------------------CoalesceBatchesExec: target_batch_size=8192
-30)--------------------------------FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0]
-31)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-32)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_comment], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)]
+07)------------AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[]
+08)--------------RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2, alias1@3], 4), input_partitions=4
+09)----------------AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[]
+10)------------------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(ps_suppkey@0, s_suppkey@0)]
+11)--------------------CoalescePartitionsExec
+12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, p_partkey@0)], projection=[ps_suppkey@1, p_brand@3, p_type@4, p_size@5]
+13)------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
+14)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey], file_type=csv, has_header=false
+15)------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+16)--------------------------FilterExec: p_brand@1 != Brand#45 AND p_type@2 NOT LIKE MEDIUM POLISHED% AND p_size@3 IN (SET) ([49, 14, 23, 45, 19, 3, 36, 9])
+17)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+18)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_type, p_size], file_type=csv, has_header=false
+19)--------------------FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0]
+20)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+21)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_comment], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q17.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q17.slt.part
index 51a0d096428c0..83294d61a1698 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q17.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q17.slt.part
@@ -55,22 +55,16 @@ physical_plan
 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice)]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@2, l_partkey@1)], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < Float64(0.2) * avg(lineitem.l_quantity)@1, projection=[l_extendedprice@1]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_quantity@1, l_extendedprice@2, p_partkey@3]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice], file_type=csv, has_header=false
-12)----------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-14)--------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------FilterExec: p_brand@1 = Brand#23 AND p_container@2 = MED BOX, projection=[p_partkey@0]
-16)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-17)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_container], file_type=csv, has_header=false
-18)------------ProjectionExec: expr=[CAST(0.2 * CAST(avg(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as Float64(0.2) * avg(lineitem.l_quantity), l_partkey@0 as l_partkey]
-19)--------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)]
-20)----------------CoalesceBatchesExec: target_batch_size=8192
-21)------------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-22)--------------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)]
-23)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity], file_type=csv, has_header=false
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@2, l_partkey@1)], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < Float64(0.2) * avg(lineitem.l_quantity)@1, projection=[l_extendedprice@1]
+06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_quantity@1, l_extendedprice@2, p_partkey@3]
+07)------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
+08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice], file_type=csv, has_header=false
+09)------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+10)--------------FilterExec: p_brand@1 = Brand#23 AND p_container@2 = MED BOX, projection=[p_partkey@0]
+11)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_container], file_type=csv, has_header=false
+13)----------ProjectionExec: expr=[CAST(0.2 * CAST(avg(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as Float64(0.2) * avg(lineitem.l_quantity), l_partkey@0 as l_partkey]
+14)------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)]
+15)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
+16)----------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)]
+17)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part
index 55da5371671e8..617051d602bd6 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q18.slt.part
@@ -69,32 +69,19 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST]
 02)--SortExec: expr=[o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true]
-03)----AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name, c_custkey@1 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@3 as o_orderdate, o_totalprice@4 as o_totalprice], aggr=[sum(lineitem.l_quantity)]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------RepartitionExec: partitioning=Hash([c_name@0, c_custkey@1, o_orderkey@2, o_orderdate@3, o_totalprice@4], 4), input_partitions=4
-06)----------AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@2, l_orderkey@0)]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6]
-11)--------------------CoalesceBatchesExec: target_batch_size=8192
-12)----------------------RepartitionExec: partitioning=Hash([o_orderkey@2], 4), input_partitions=4
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5]
-15)----------------------------CoalesceBatchesExec: target_batch_size=8192
-16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-17)--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-18)----------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name], file_type=csv, has_header=false
-19)----------------------------CoalesceBatchesExec: target_batch_size=8192
-20)------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
-21)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate], file_type=csv, has_header=false
-22)--------------------CoalesceBatchesExec: target_batch_size=8192
-23)----------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-24)------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], file_type=csv, has_header=false
-25)----------------CoalesceBatchesExec: target_batch_size=8192
-26)------------------FilterExec: sum(lineitem.l_quantity)@1 > Some(30000),25,2, projection=[l_orderkey@0]
-27)--------------------AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)]
-28)----------------------CoalesceBatchesExec: target_batch_size=8192
-29)------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-30)--------------------------AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)]
-31)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], file_type=csv, has_header=false
+03)----AggregateExec: mode=SinglePartitioned, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)]
+04)------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@2, l_orderkey@0)]
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6]
+06)----------RepartitionExec: partitioning=Hash([o_orderkey@2], 4), input_partitions=4
+07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5]
+08)--------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name], file_type=csv, has_header=false
+10)--------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
+11)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate], file_type=csv, has_header=false
+12)----------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+13)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], file_type=csv, has_header=false
+14)--------FilterExec: sum(lineitem.l_quantity)@1 > Some(30000),25,2, projection=[l_orderkey@0]
+15)----------AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)]
+16)------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+17)--------------AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)]
+18)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_quantity], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part
index 3b15fb3d8e533..72c21e060fa66 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q19.slt.part
@@ -68,16 +68,11 @@ physical_plan
 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3]
-11)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false
-12)------------CoalesceBatchesExec: target_batch_size=8192
-13)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-14)----------------CoalesceBatchesExec: target_batch_size=8192
-15)------------------FilterExec: (p_brand@1 = Brand#12 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND p_size@2 <= 15) AND p_size@2 >= 1
-16)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-17)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_extendedprice@2, l_discount@3]
+06)----------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4
+07)------------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3]
+08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], file_type=csv, has_header=false
+09)----------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+10)------------FilterExec: (p_brand@1 = Brand#12 AND p_container@3 IN (SET) ([SM CASE, SM BOX, SM PACK, SM PKG]) AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_container@3 IN (SET) ([MED BAG, MED BOX, MED PKG, MED PACK]) AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_container@3 IN (SET) ([LG CASE, LG BOX, LG PACK, LG PKG]) AND p_size@2 <= 15) AND p_size@2 >= 1
+11)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+12)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_size, p_container], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part
index b2e0fb0cd1cc0..d5ff6724402ad 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q2.slt.part
@@ -102,79 +102,47 @@ physical_plan
 01)SortPreservingMergeExec: [s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], fetch=10
 02)--SortExec: TopK(fetch=10), expr=[s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@7 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment]
-04)------CoalesceBatchesExec: target_batch_size=8192
-05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8]
-06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 4), input_partitions=4
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@9, r_regionkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, ps_supplycost@7, n_name@8]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([n_regionkey@9], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@4, n_nationkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@10, n_regionkey@11]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([s_nationkey@4], 4), input_partitions=4
-16)------------------------------ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_name@3 as s_name, s_address@4 as s_address, s_nationkey@5 as s_nationkey, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@2 as ps_supplycost]
-17)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@2, s_suppkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_supplycost@3, s_name@5, s_address@6, s_nationkey@7, s_phone@8, s_acctbal@9, s_comment@10]
-19)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-20)--------------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@2], 4), input_partitions=4
-21)----------------------------------------CoalesceBatchesExec: target_batch_size=8192
-22)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4]
-23)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-24)----------------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-25)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-26)--------------------------------------------------FilterExec: p_size@3 = 15 AND p_type@2 LIKE %BRASS, projection=[p_partkey@0, p_mfgr@1]
-27)----------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-28)------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_mfgr, p_type, p_size], file_type=csv, has_header=false
-29)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-30)----------------------------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
-31)------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
-32)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-33)--------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-34)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-35)------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment], file_type=csv, has_header=false
-36)--------------------------CoalesceBatchesExec: target_batch_size=8192
-37)----------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-38)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-39)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], file_type=csv, has_header=false
-40)------------------CoalesceBatchesExec: target_batch_size=8192
-41)--------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
-42)----------------------CoalesceBatchesExec: target_batch_size=8192
-43)------------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0]
-44)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-45)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
-46)----------CoalesceBatchesExec: target_batch_size=8192
-47)------------RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 4), input_partitions=4
-48)--------------ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey]
-49)----------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)]
-50)------------------CoalesceBatchesExec: target_batch_size=8192
-51)--------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
-52)----------------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)]
-53)------------------------CoalesceBatchesExec: target_batch_size=8192
-54)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@2, r_regionkey@0)], projection=[ps_partkey@0, ps_supplycost@1]
-55)----------------------------CoalesceBatchesExec: target_batch_size=8192
-56)------------------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4
-57)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-58)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4]
-59)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-60)--------------------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
-61)----------------------------------------CoalesceBatchesExec: target_batch_size=8192
-62)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4]
-63)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-64)----------------------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4
-65)------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
-66)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-67)----------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-68)------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-69)--------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-70)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-71)--------------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-72)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-73)------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], file_type=csv, has_header=false
-74)----------------------------CoalesceBatchesExec: target_batch_size=8192
-75)------------------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
-76)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-77)----------------------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0]
-78)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-79)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
+04)------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8]
+05)--------RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 4), input_partitions=4
+06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@9, r_regionkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, ps_supplycost@7, n_name@8]
+07)------------RepartitionExec: partitioning=Hash([n_regionkey@9], 4), input_partitions=4
+08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@4, n_nationkey@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@10, n_regionkey@11]
+09)----------------RepartitionExec: partitioning=Hash([s_nationkey@4], 4), input_partitions=4
+10)------------------ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_name@3 as s_name, s_address@4 as s_address, s_nationkey@5 as s_nationkey, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@2 as ps_supplycost]
+11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@2, s_suppkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_supplycost@3, s_name@5, s_address@6, s_nationkey@7, s_phone@8, s_acctbal@9, s_comment@10]
+12)----------------------RepartitionExec: partitioning=Hash([ps_suppkey@2], 4), input_partitions=4
+13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4]
+14)--------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+15)----------------------------FilterExec: p_size@3 = 15 AND p_type@2 LIKE %BRASS, projection=[p_partkey@0, p_mfgr@1]
+16)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+17)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_mfgr, p_type, p_size], file_type=csv, has_header=false
+18)--------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
+19)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
+20)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+21)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment], file_type=csv, has_header=false
+22)----------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+23)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], file_type=csv, has_header=false
+24)------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
+25)--------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0]
+26)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+27)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
+28)--------RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 4), input_partitions=4
+29)----------ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey]
+30)------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)]
+31)--------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
+32)----------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)]
+33)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@2, r_regionkey@0)], projection=[ps_partkey@0, ps_supplycost@1]
+34)--------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4
+35)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4]
+36)------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
+37)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4]
+38)----------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4
+39)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
+40)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+41)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+42)------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+43)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], file_type=csv, has_header=false
+44)--------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
+45)----------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0]
+46)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+47)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part
index 0b994de411ea3..426a1cbaa4e22 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q20.slt.part
@@ -83,44 +83,28 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [s_name@0 ASC NULLS LAST]
 02)--SortExec: expr=[s_name@0 ASC NULLS LAST], preserve_partitioning=[true]
-03)----CoalesceBatchesExec: target_batch_size=8192
-04)------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_name@1, s_address@2]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[s_suppkey@0, s_name@1, s_address@2]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4
-11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey], file_type=csv, has_header=false
-13)----------------CoalesceBatchesExec: target_batch_size=8192
-14)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-15)--------------------CoalesceBatchesExec: target_batch_size=8192
-16)----------------------FilterExec: n_name@1 = CANADA, projection=[n_nationkey@0]
-17)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-18)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
-19)--------CoalesceBatchesExec: target_batch_size=8192
-20)----------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4
-21)------------CoalesceBatchesExec: target_batch_size=8192
-22)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, l_partkey@1), (ps_suppkey@1, l_suppkey@2)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1]
-23)----------------CoalesceBatchesExec: target_batch_size=8192
-24)------------------RepartitionExec: partitioning=Hash([ps_partkey@0, ps_suppkey@1], 4), input_partitions=4
-25)--------------------CoalesceBatchesExec: target_batch_size=8192
-26)----------------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(ps_partkey@0, p_partkey@0)]
-27)------------------------CoalesceBatchesExec: target_batch_size=8192
-28)--------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
-29)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty], file_type=csv, has_header=false
-30)------------------------CoalesceBatchesExec: target_batch_size=8192
-31)--------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-32)----------------------------CoalesceBatchesExec: target_batch_size=8192
-33)------------------------------FilterExec: p_name@1 LIKE forest%, projection=[p_partkey@0]
-34)--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-35)----------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_name], file_type=csv, has_header=false
-36)----------------ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey]
-37)------------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)]
-38)--------------------CoalesceBatchesExec: target_batch_size=8192
-39)----------------------RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 4), input_partitions=4
-40)------------------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)]
-41)--------------------------CoalesceBatchesExec: target_batch_size=8192
-42)----------------------------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2]
-43)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], file_type=csv, has_header=false
+03)----HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_name@1, s_address@2]
+04)------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[s_suppkey@0, s_name@1, s_address@2]
+06)----------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_nationkey], file_type=csv, has_header=false
+08)----------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+09)------------FilterExec: n_name@1 = CANADA, projection=[n_nationkey@0]
+10)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+12)------RepartitionExec: partitioning=Hash([ps_suppkey@0], 4), input_partitions=4
+13)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, l_partkey@1), (ps_suppkey@1, l_suppkey@2)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1]
+14)----------RepartitionExec: partitioning=Hash([ps_partkey@0, ps_suppkey@1], 4), input_partitions=4
+15)------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(ps_partkey@0, p_partkey@0)]
+16)--------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4
+17)----------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_availqty], file_type=csv, has_header=false
+18)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+19)----------------FilterExec: p_name@1 LIKE forest%, projection=[p_partkey@0]
+20)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+21)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_name], file_type=csv, has_header=false
+22)----------ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey]
+23)------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)]
+24)--------------RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 4), input_partitions=4
+25)----------------AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)]
+26)------------------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2]
+27)--------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part
index e52171524007e..5e9192d677532 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q21.slt.part
@@ -94,50 +94,30 @@ physical_plan
 02)--SortExec: expr=[numwait@1 DESC, s_name@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[s_name@0 as s_name, count(Int64(1))@1 as numwait]
 04)------AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(Int64(1))]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([s_name@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(Int64(1))]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@1, n_nationkey@0)], projection=[s_name@0, l_orderkey@2, l_suppkey@3]
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------RepartitionExec: partitioning=Hash([s_nationkey@1], 4), input_partitions=4
-18)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-19)------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@2, o_orderkey@0)], projection=[s_name@0, s_nationkey@1, l_orderkey@2, l_suppkey@3]
-20)--------------------------------------CoalesceBatchesExec: target_batch_size=8192
-21)----------------------------------------RepartitionExec: partitioning=Hash([l_orderkey@2], 4), input_partitions=4
-22)------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-23)--------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4]
-24)----------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-25)------------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-26)--------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-27)----------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_nationkey], file_type=csv, has_header=false
-28)----------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-29)------------------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
-30)--------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-31)----------------------------------------------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1]
-32)------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
-33)--------------------------------------CoalesceBatchesExec: target_batch_size=8192
-34)----------------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-35)------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-36)--------------------------------------------FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0]
-37)----------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderstatus], file_type=csv, has_header=false
-38)------------------------------CoalesceBatchesExec: target_batch_size=8192
-39)--------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-40)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-41)------------------------------------FilterExec: n_name@1 = SAUDI ARABIA, projection=[n_nationkey@0]
-42)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-43)----------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
-44)----------------------CoalesceBatchesExec: target_batch_size=8192
-45)------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-46)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey], file_type=csv, has_header=false
-47)------------------CoalesceBatchesExec: target_batch_size=8192
-48)--------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-49)----------------------CoalesceBatchesExec: target_batch_size=8192
-50)------------------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1]
-51)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([s_name@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(Int64(1))]
+07)------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0]
+08)--------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0
+09)----------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4
+10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@1, n_nationkey@0)], projection=[s_name@0, l_orderkey@2, l_suppkey@3]
+11)--------------------RepartitionExec: partitioning=Hash([s_nationkey@1], 4), input_partitions=4
+12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@2, o_orderkey@0)], projection=[s_name@0, s_nationkey@1, l_orderkey@2, l_suppkey@3]
+13)------------------------RepartitionExec: partitioning=Hash([l_orderkey@2], 4), input_partitions=4
+14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4]
+15)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+16)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_nationkey], file_type=csv, has_header=false
+17)----------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
+18)------------------------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1]
+19)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
+20)------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+21)--------------------------FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0]
+22)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderstatus], file_type=csv, has_header=false
+23)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+24)----------------------FilterExec: n_name@1 = SAUDI ARABIA, projection=[n_nationkey@0]
+25)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+26)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+27)----------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+28)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey], file_type=csv, has_header=false
+29)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+30)----------------FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1]
+31)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
index 828bf967d8f4a..add578c3b079d 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q22.slt.part
@@ -78,28 +78,23 @@ physical_plan
 02)--SortExec: expr=[cntrycode@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[cntrycode@0 as cntrycode, count(Int64(1))@1 as numcust, sum(custsale.c_acctbal)@2 as totacctbal]
 04)------AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[count(Int64(1)), sum(custsale.c_acctbal)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([cntrycode@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(Int64(1)), sum(custsale.c_acctbal)]
-08)--------------ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)------------------NestedLoopJoinExec: join_type=Inner, filter=CAST(c_acctbal@0 AS Decimal128(19, 6)) > avg(customer.c_acctbal)@1
+05)--------RepartitionExec: partitioning=Hash([cntrycode@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(Int64(1)), sum(custsale.c_acctbal)]
+07)------------ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal]
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------NestedLoopJoinExec: join_type=Inner, filter=join_proj_push_down_1@1 > avg(customer.c_acctbal)@0, projection=[c_phone@0, c_acctbal@1, avg(customer.c_acctbal)@3]
+10)------------------ProjectionExec: expr=[c_phone@0 as c_phone, c_acctbal@1 as c_acctbal, CAST(c_acctbal@1 AS Decimal128(19, 6)) as join_proj_push_down_1]
 11)--------------------CoalescePartitionsExec
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------FilterExec: substr(c_phone@1, 1, 2) IN ([Literal { value: Utf8View("13") }, Literal { value: Utf8View("31") }, Literal { value: Utf8View("23") }, Literal { value: Utf8View("29") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("18") }, Literal { value: Utf8View("17") }])
-18)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-19)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false
-20)--------------------------CoalesceBatchesExec: target_batch_size=8192
-21)----------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4
-22)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false
-23)--------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)]
-24)----------------------CoalescePartitionsExec
-25)------------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)]
-26)--------------------------CoalesceBatchesExec: target_batch_size=8192
-27)----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN ([Literal { value: Utf8View("13") }, Literal { value: Utf8View("31") }, Literal { value: Utf8View("23") }, Literal { value: Utf8View("29") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("18") }, Literal { value: Utf8View("17") }]), projection=[c_acctbal@1]
-28)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-29)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false
+12)----------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2]
+13)------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
+14)--------------------------FilterExec: substr(c_phone@1, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17])
+15)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+16)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], file_type=csv, has_header=false
+17)------------------------RepartitionExec: partitioning=Hash([o_custkey@0], 4), input_partitions=4
+18)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_custkey], file_type=csv, has_header=false
+19)------------------AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)]
+20)--------------------CoalescePartitionsExec
+21)----------------------AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)]
+22)------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND substr(c_phone@0, 1, 2) IN (SET) ([13, 31, 23, 29, 30, 18, 17]), projection=[c_acctbal@1]
+23)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+24)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q3.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q3.slt.part
index d982ec32e9547..7fec4e5f5d624 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q3.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q3.slt.part
@@ -61,29 +61,17 @@ physical_plan
 01)SortPreservingMergeExec: [revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], fetch=10
 02)--SortExec: TopK(fetch=10), expr=[revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[l_orderkey@0 as l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@3 as revenue, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority]
-04)------AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([l_orderkey@0, o_orderdate@1, o_shippriority@2], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0]
-18)----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-19)------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_mktsegment], file_type=csv, has_header=false
-20)--------------------------CoalesceBatchesExec: target_batch_size=8192
-21)----------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
-22)------------------------------CoalesceBatchesExec: target_batch_size=8192
-23)--------------------------------FilterExec: o_orderdate@2 < 1995-03-15
-24)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], file_type=csv, has_header=false
-25)------------------CoalesceBatchesExec: target_batch_size=8192
-26)--------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-27)----------------------CoalesceBatchesExec: target_batch_size=8192
-28)------------------------FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2]
-29)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+04)------AggregateExec: mode=SinglePartitioned, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5]
+06)----------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4]
+08)--------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
+09)----------------FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0]
+10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_mktsegment], file_type=csv, has_header=false
+12)--------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
+13)----------------FilterExec: o_orderdate@2 < 1995-03-15
+14)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], file_type=csv, has_header=false
+15)----------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+16)------------FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2]
+17)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part
index f7de3cd3c967c..0007666f15365 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q4.slt.part
@@ -57,18 +57,12 @@ physical_plan
 02)--SortExec: expr=[o_orderpriority@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[o_orderpriority@0 as o_orderpriority, count(Int64(1))@1 as order_count]
 04)------AggregateExec: mode=FinalPartitioned, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(Int64(1))]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([o_orderpriority@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(Int64(1))]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------FilterExec: o_orderdate@1 >= 1993-07-01 AND o_orderdate@1 < 1993-10-01, projection=[o_orderkey@0, o_orderpriority@2]
-14)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate, o_orderpriority], file_type=csv, has_header=false
-15)------------------CoalesceBatchesExec: target_batch_size=8192
-16)--------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-17)----------------------CoalesceBatchesExec: target_batch_size=8192
-18)------------------------FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0]
-19)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([o_orderpriority@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(Int64(1))]
+07)------------HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1]
+08)--------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+09)----------------FilterExec: o_orderdate@1 >= 1993-07-01 AND o_orderdate@1 < 1993-10-01, projection=[o_orderkey@0, o_orderpriority@2]
+10)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate, o_orderpriority], file_type=csv, has_header=false
+11)--------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+12)----------------FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0]
+13)------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_commitdate, l_receiptdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part
index 15636056b8714..d854001f3cc4c 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q5.slt.part
@@ -71,50 +71,29 @@ physical_plan
 02)--SortExec: expr=[revenue@1 DESC], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[n_name@0 as n_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as revenue]
 04)------AggregateExec: mode=FinalPartitioned, gby=[n_name@0 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([n_name@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[n_name@2 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
-08)--------------CoalesceBatchesExec: target_batch_size=8192
-09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@2]
-10)------------------CoalesceBatchesExec: target_batch_size=8192
-11)--------------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4
-12)----------------------CoalesceBatchesExec: target_batch_size=8192
-13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@4, n_regionkey@5]
-14)--------------------------CoalesceBatchesExec: target_batch_size=8192
-15)----------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
-16)------------------------------CoalesceBatchesExec: target_batch_size=8192
-17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0), (c_nationkey@0, s_nationkey@1)], projection=[l_extendedprice@2, l_discount@3, s_nationkey@5]
-18)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-19)------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1, c_nationkey@0], 4), input_partitions=4
-20)--------------------------------------CoalesceBatchesExec: target_batch_size=8192
-21)----------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5]
-22)------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-23)--------------------------------------------RepartitionExec: partitioning=Hash([o_orderkey@1], 4), input_partitions=4
-24)----------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-25)------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2]
-26)--------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-27)----------------------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-28)------------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-29)--------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
-30)--------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-31)----------------------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
-32)------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-33)--------------------------------------------------------FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1]
-34)----------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
-35)------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-36)--------------------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-37)----------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount], file_type=csv, has_header=false
-38)----------------------------------CoalesceBatchesExec: target_batch_size=8192
-39)------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0, s_nationkey@1], 4), input_partitions=4
-40)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-41)----------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-42)--------------------------CoalesceBatchesExec: target_batch_size=8192
-43)----------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-44)------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-45)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], file_type=csv, has_header=false
-46)------------------CoalesceBatchesExec: target_batch_size=8192
-47)--------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
-48)----------------------CoalesceBatchesExec: target_batch_size=8192
-49)------------------------FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0]
-50)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-51)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([n_name@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[n_name@2 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]
+07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@2]
+08)--------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4
+09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, n_name@4, n_regionkey@5]
+10)------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
+11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0), (c_nationkey@0, s_nationkey@1)], projection=[l_extendedprice@2, l_discount@3, s_nationkey@5]
+12)----------------------RepartitionExec: partitioning=Hash([l_suppkey@1, c_nationkey@0], 4), input_partitions=4
+13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5]
+14)--------------------------RepartitionExec: partitioning=Hash([o_orderkey@1], 4), input_partitions=4
+15)----------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2]
+16)------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+17)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
+18)------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4
+19)--------------------------------FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1]
+20)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
+21)--------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+22)----------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount], file_type=csv, has_header=false
+23)----------------------RepartitionExec: partitioning=Hash([s_suppkey@0, s_nationkey@1], 4), input_partitions=1
+24)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+25)------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+26)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name, n_regionkey], file_type=csv, has_header=false
+27)--------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
+28)----------------FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0]
+29)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+30)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q6.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q6.slt.part
index b1e5d2869a8c5..eb9063d691712 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q6.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q6.slt.part
@@ -38,6 +38,5 @@ physical_plan
 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(5),15,2 AND l_discount@2 <= Some(7),15,2 AND l_quantity@0 < Some(2400),15,2, projection=[l_extendedprice@1, l_discount@2]
-07)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+05)--------FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(5),15,2 AND l_discount@2 <= Some(7),15,2 AND l_quantity@0 < Some(2400),15,2, projection=[l_extendedprice@1, l_discount@2]
+06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part
index 291d56e43f2df..b4e70993396e6 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part
@@ -88,53 +88,32 @@ physical_plan
 02)--SortExec: expr=[supp_nation@0 ASC NULLS LAST, cust_nation@1 ASC NULLS LAST, l_year@2 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year, sum(shipping.volume)@3 as revenue]
 04)------AggregateExec: mode=FinalPartitioned, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)]
-08)--------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, date_part(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], filter=n_name@0 = FRANCE AND n_name@1 = GERMANY OR n_name@0 = GERMANY AND n_name@1 = FRANCE, projection=[l_extendedprice@0, l_discount@1, l_shipdate@2, n_name@4, n_name@6]
-11)--------------------CoalesceBatchesExec: target_batch_size=8192
-12)----------------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@0, n_nationkey@0)], projection=[l_extendedprice@1, l_discount@2, l_shipdate@3, c_nationkey@4, n_name@6]
-15)----------------------------CoalesceBatchesExec: target_batch_size=8192
-16)------------------------------RepartitionExec: partitioning=Hash([s_nationkey@0], 4), input_partitions=4
-17)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@4, c_custkey@0)], projection=[s_nationkey@0, l_extendedprice@1, l_discount@2, l_shipdate@3, c_nationkey@6]
-19)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-20)--------------------------------------RepartitionExec: partitioning=Hash([o_custkey@4], 4), input_partitions=4
-21)----------------------------------------CoalesceBatchesExec: target_batch_size=8192
-22)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6]
-23)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-24)----------------------------------------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4
-25)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-26)--------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6]
-27)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-28)------------------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-29)--------------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-30)----------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-31)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-32)------------------------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
-33)--------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-34)----------------------------------------------------------FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31
-35)------------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
-36)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-37)----------------------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-38)------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey], file_type=csv, has_header=false
-39)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-40)--------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-41)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-42)------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
-43)----------------------------CoalesceBatchesExec: target_batch_size=8192
-44)------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-45)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-46)----------------------------------FilterExec: n_name@1 = FRANCE OR n_name@1 = GERMANY
-47)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-48)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
-49)--------------------CoalesceBatchesExec: target_batch_size=8192
-50)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-51)------------------------CoalesceBatchesExec: target_batch_size=8192
-52)--------------------------FilterExec: n_name@1 = GERMANY OR n_name@1 = FRANCE
-53)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-54)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)]
+07)------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, date_part(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume]
+08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], filter=n_name@0 = FRANCE AND n_name@1 = GERMANY OR n_name@0 = GERMANY AND n_name@1 = FRANCE, projection=[l_extendedprice@0, l_discount@1, l_shipdate@2, n_name@4, n_name@6]
+09)----------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4
+10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@0, n_nationkey@0)], projection=[l_extendedprice@1, l_discount@2, l_shipdate@3, c_nationkey@4, n_name@6]
+11)--------------------RepartitionExec: partitioning=Hash([s_nationkey@0], 4), input_partitions=4
+12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@4, c_custkey@0)], projection=[s_nationkey@0, l_extendedprice@1, l_discount@2, l_shipdate@3, c_nationkey@6]
+13)------------------------RepartitionExec: partitioning=Hash([o_custkey@4], 4), input_partitions=4
+14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6]
+15)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@1], 4), input_partitions=4
+16)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6]
+17)--------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+18)----------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+19)--------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
+20)----------------------------------FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31
+21)------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], file_type=csv, has_header=false
+22)----------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+23)------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey], file_type=csv, has_header=false
+24)------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+25)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
+26)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+27)----------------------FilterExec: n_name@1 = FRANCE OR n_name@1 = GERMANY
+28)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+29)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+30)----------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
+31)------------------FilterExec: n_name@1 = GERMANY OR n_name@1 = FRANCE
+32)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+33)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part
index 50171c528db6d..12f19d43d40e7 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part
@@ -94,69 +94,40 @@ physical_plan
 02)--SortExec: expr=[o_year@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[o_year@0 as o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END)@1 AS Decimal128(12, 2)) / CAST(sum(all_nations.volume)@2 AS Decimal128(12, 2)) AS Decimal128(15, 2)) as mkt_share]
 04)------AggregateExec: mode=FinalPartitioned, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([o_year@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]
-08)--------------ProjectionExec: expr=[date_part(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@2, n_name@4]
-11)--------------------CoalesceBatchesExec: target_batch_size=8192
-12)----------------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@3, n_regionkey@4, n_name@6]
-15)----------------------------CoalesceBatchesExec: target_batch_size=8192
-16)------------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
-17)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@4, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@3, n_regionkey@6]
-19)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-20)--------------------------------------RepartitionExec: partitioning=Hash([c_nationkey@4], 4), input_partitions=4
-21)----------------------------------------CoalesceBatchesExec: target_batch_size=8192
-22)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@3, c_custkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@4, c_nationkey@6]
-23)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-24)----------------------------------------------RepartitionExec: partitioning=Hash([o_custkey@3], 4), input_partitions=4
-25)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-26)--------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_extendedprice@1, l_discount@2, s_nationkey@3, o_custkey@5, o_orderdate@6]
-27)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-28)------------------------------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-29)--------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-30)----------------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0)], projection=[l_orderkey@0, l_extendedprice@2, l_discount@3, s_nationkey@5]
-31)------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-32)--------------------------------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
-33)----------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-34)------------------------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_suppkey@3, l_extendedprice@4, l_discount@5]
-35)--------------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-36)----------------------------------------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-37)------------------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-38)--------------------------------------------------------------------------FilterExec: p_type@1 = ECONOMY ANODIZED STEEL, projection=[p_partkey@0]
-39)----------------------------------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-40)------------------------------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], file_type=csv, has_header=false
-41)--------------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-42)----------------------------------------------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4
-43)------------------------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount], file_type=csv, has_header=false
-44)------------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-45)--------------------------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-46)----------------------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-47)------------------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-48)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-49)------------------------------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-50)--------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-51)----------------------------------------------------------FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31
-52)------------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
-53)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-54)----------------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4
-55)------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-56)--------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
-57)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-58)--------------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-59)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-60)------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], file_type=csv, has_header=false
-61)----------------------------CoalesceBatchesExec: target_batch_size=8192
-62)------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-63)--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-64)----------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
-65)--------------------CoalesceBatchesExec: target_batch_size=8192
-66)----------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
-67)------------------------CoalesceBatchesExec: target_batch_size=8192
-68)--------------------------FilterExec: r_name@1 = AMERICA, projection=[r_regionkey@0]
-69)----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-70)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([o_year@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]
+07)------------ProjectionExec: expr=[date_part(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation]
+08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@2, n_name@4]
+09)----------------RepartitionExec: partitioning=Hash([n_regionkey@3], 4), input_partitions=4
+10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@3, n_regionkey@4, n_name@6]
+11)--------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4
+12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@4, n_nationkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@3, n_regionkey@6]
+13)------------------------RepartitionExec: partitioning=Hash([c_nationkey@4], 4), input_partitions=4
+14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_custkey@3, c_custkey@0)], projection=[l_extendedprice@0, l_discount@1, s_nationkey@2, o_orderdate@4, c_nationkey@6]
+15)----------------------------RepartitionExec: partitioning=Hash([o_custkey@3], 4), input_partitions=4
+16)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_extendedprice@1, l_discount@2, s_nationkey@3, o_custkey@5, o_orderdate@6]
+17)--------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@1, s_suppkey@0)], projection=[l_orderkey@0, l_extendedprice@2, l_discount@3, s_nationkey@5]
+19)------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@1], 4), input_partitions=4
+20)--------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_suppkey@3, l_extendedprice@4, l_discount@5]
+21)----------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+22)------------------------------------------FilterExec: p_type@1 = ECONOMY ANODIZED STEEL, projection=[p_partkey@0]
+23)--------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+24)----------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], file_type=csv, has_header=false
+25)----------------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4
+26)------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount], file_type=csv, has_header=false
+27)------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+28)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+29)--------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+30)----------------------------------FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31
+31)------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], file_type=csv, has_header=false
+32)----------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=1
+33)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_nationkey], file_type=csv, has_header=false
+34)------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+35)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], file_type=csv, has_header=false
+36)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+37)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+38)----------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4
+39)------------------FilterExec: r_name@1 = AMERICA, projection=[r_regionkey@0]
+40)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+41)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part
index 3b31c1bc2e8e3..4ec434c90368f 100644
--- a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part
@@ -79,48 +79,29 @@ physical_plan
 02)--SortExec: TopK(fetch=10), expr=[nation@0 ASC NULLS LAST, o_year@1 DESC], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[nation@0 as nation, o_year@1 as o_year, sum(profit.amount)@2 as sum_profit]
 04)------AggregateExec: mode=FinalPartitioned, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([nation@0, o_year@1], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)]
-08)--------------ProjectionExec: expr=[n_name@5 as nation, date_part(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount]
-09)----------------CoalesceBatchesExec: target_batch_size=8192
-10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[l_quantity@0, l_extendedprice@1, l_discount@2, ps_supplycost@4, o_orderdate@5, n_name@7]
-11)--------------------CoalesceBatchesExec: target_batch_size=8192
-12)----------------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4
-13)------------------------CoalesceBatchesExec: target_batch_size=8192
-14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_quantity@1, l_extendedprice@2, l_discount@3, s_nationkey@4, ps_supplycost@5, o_orderdate@7]
-15)----------------------------CoalesceBatchesExec: target_batch_size=8192
-16)------------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
-17)--------------------------------CoalesceBatchesExec: target_batch_size=8192
-18)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@2, ps_suppkey@1), (l_partkey@1, ps_partkey@0)], projection=[l_orderkey@0, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@6, ps_supplycost@9]
-19)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-20)--------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@2, l_partkey@1], 4), input_partitions=4
-21)----------------------------------------CoalesceBatchesExec: target_batch_size=8192
-22)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@2, s_suppkey@0)], projection=[l_orderkey@0, l_partkey@1, l_suppkey@2, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@7]
-23)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-24)----------------------------------------------RepartitionExec: partitioning=Hash([l_suppkey@2], 4), input_partitions=4
-25)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-26)--------------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_partkey@2, l_suppkey@3, l_quantity@4, l_extendedprice@5, l_discount@6]
-27)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-28)------------------------------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
-29)--------------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-30)----------------------------------------------------------FilterExec: p_name@1 LIKE %green%, projection=[p_partkey@0]
-31)------------------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-32)--------------------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_name], file_type=csv, has_header=false
-33)----------------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-34)------------------------------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4
-35)--------------------------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount], file_type=csv, has_header=false
-36)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192
-37)----------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4
-38)------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-39)--------------------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
-40)------------------------------------CoalesceBatchesExec: target_batch_size=8192
-41)--------------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1, ps_partkey@0], 4), input_partitions=4
-42)----------------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
-43)----------------------------CoalesceBatchesExec: target_batch_size=8192
-44)------------------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
-45)--------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate], file_type=csv, has_header=false
-46)--------------------CoalesceBatchesExec: target_batch_size=8192
-47)----------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4
-48)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-49)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
+05)--------RepartitionExec: partitioning=Hash([nation@0, o_year@1], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)]
+07)------------ProjectionExec: expr=[n_name@5 as nation, date_part(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount]
+08)--------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[l_quantity@0, l_extendedprice@1, l_discount@2, ps_supplycost@4, o_orderdate@5, n_name@7]
+09)----------------RepartitionExec: partitioning=Hash([s_nationkey@3], 4), input_partitions=4
+10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_quantity@1, l_extendedprice@2, l_discount@3, s_nationkey@4, ps_supplycost@5, o_orderdate@7]
+11)--------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4
+12)----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@2, ps_suppkey@1), (l_partkey@1, ps_partkey@0)], projection=[l_orderkey@0, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@6, ps_supplycost@9]
+13)------------------------RepartitionExec: partitioning=Hash([l_suppkey@2, l_partkey@1], 4), input_partitions=4
+14)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_suppkey@2, s_suppkey@0)], projection=[l_orderkey@0, l_partkey@1, l_suppkey@2, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@7]
+15)----------------------------RepartitionExec: partitioning=Hash([l_suppkey@2], 4), input_partitions=4
+16)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_partkey@2, l_suppkey@3, l_quantity@4, l_extendedprice@5, l_discount@6]
+17)--------------------------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4
+18)----------------------------------FilterExec: p_name@1 LIKE %green%, projection=[p_partkey@0]
+19)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+20)--------------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_name], file_type=csv, has_header=false
+21)--------------------------------RepartitionExec: partitioning=Hash([l_partkey@1], 4), input_partitions=4
+22)----------------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount], file_type=csv, has_header=false
+23)----------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=1
+24)------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], file_type=csv, has_header=false
+25)------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1, ps_partkey@0], 4), input_partitions=4
+26)--------------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], file_type=csv, has_header=false
+27)--------------------RepartitionExec: partitioning=Hash([o_orderkey@0], 4), input_partitions=4
+28)----------------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_orderdate], file_type=csv, has_header=false
+29)----------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=1
+30)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], file_type=csv, has_header=false
diff --git a/datafusion/sqllogictest/test_files/truncate.slt b/datafusion/sqllogictest/test_files/truncate.slt
new file mode 100644
index 0000000000000..ad3ccbb1a7cf4
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/truncate.slt
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## Truncate Tests
+##########
+
+statement ok
+create table t1(a int, b varchar, c double, d int);
+
+statement ok
+insert into t1 values (1, 'abc', 3.14, 4), (2, 'def', 2.71, 5);
+
+# Truncate all rows from table
+query TT
+explain truncate table t1;
+----
+logical_plan
+01)Dml: op=[Truncate] table=[t1]
+02)--EmptyRelation: rows=0
+physical_plan_error
+01)TRUNCATE operation on table 't1'
+02)caused by
+03)This feature is not implemented: TRUNCATE not supported for Base table
+
+# Test TRUNCATE with fully qualified table name
+statement ok
+create schema test_schema;
+
+statement ok
+create table test_schema.t5(a int);
+
+query TT
+explain truncate table test_schema.t5;
+----
+logical_plan
+01)Dml: op=[Truncate] table=[test_schema.t5]
+02)--EmptyRelation: rows=0
+physical_plan_error
+01)TRUNCATE operation on table 'test_schema.t5'
+02)caused by
+03)This feature is not implemented: TRUNCATE not supported for Base table
+
+# Test TRUNCATE with CASCADE option
+statement error TRUNCATE with CASCADE/RESTRICT is not supported
+TRUNCATE TABLE t1 CASCADE;
+
+# Test TRUNCATE with multiple tables
+statement error TRUNCATE with multiple tables is not supported
+TRUNCATE TABLE t1, t2;
+
+statement error TRUNCATE with PARTITION is not supported
+TRUNCATE TABLE t1 PARTITION (p1);
+
+statement error TRUNCATE with ONLY is not supported
+TRUNCATE ONLY t1;
+
+statement error TRUNCATE with RESTART/CONTINUE IDENTITY is not supported
+TRUNCATE TABLE t1 RESTART IDENTITY;
+
+# Test TRUNCATE without TABLE keyword
+query TT
+explain truncate t1;
+----
+logical_plan
+01)Dml: op=[Truncate] table=[t1]
+02)--EmptyRelation: rows=0
+physical_plan_error
+01)TRUNCATE operation on table 't1'
+02)caused by
+03)This feature is not implemented: TRUNCATE not supported for Base table
diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt
index 2c6079bc7039d..7039e66b38b15 100644
--- a/datafusion/sqllogictest/test_files/type_coercion.slt
+++ b/datafusion/sqllogictest/test_files/type_coercion.slt
@@ -47,7 +47,7 @@ query error DataFusion error: Error during planning: Cannot coerce arithmetic ex
 select interval '1 month' - '2023-05-01'::date;
 
 # interval - timestamp
-query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types
+query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types
 SELECT interval '1 month' - '2023-05-01 12:30:00'::timestamp;
 
 # dictionary(int32, utf8) -> utf8
@@ -128,9 +128,9 @@ EXPLAIN SELECT 1, 2 UNION ALL SELECT 3, 4
 logical_plan
 01)Union
 02)--Projection: Int64(1) AS Int64(1), Int64(2) AS Int64(2)
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 04)--Projection: Int64(3) AS Int64(1), Int64(4) AS Int64(2)
-05)----EmptyRelation
+05)----EmptyRelation: rows=1
 
 # union_with_incompatible_data_type()
 query error Incompatible inputs for Union: Previous inputs were of type Interval\(MonthDayNano\), but got incompatible type Int64 on column 'Int64\(1\)'
@@ -143,9 +143,9 @@ EXPLAIN SELECT 1 a UNION ALL SELECT 1.1 a
 logical_plan
 01)Union
 02)--Projection: CAST(Int64(1) AS Float64) AS a
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 04)--Projection: Float64(1.1) AS a
-05)----EmptyRelation
+05)----EmptyRelation: rows=1
 
 # union_with_null()
 query TT
@@ -154,9 +154,9 @@ EXPLAIN SELECT NULL a UNION ALL SELECT 1.1 a
 logical_plan
 01)Union
 02)--Projection: CAST(NULL AS Float64) AS a
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 04)--Projection: Float64(1.1) AS a
-05)----EmptyRelation
+05)----EmptyRelation: rows=1
 
 # union_with_float_and_string()
 query TT
@@ -165,9 +165,9 @@ EXPLAIN SELECT 'a' a UNION ALL SELECT 1.1 a
 logical_plan
 01)Union
 02)--Projection: Utf8("a") AS a
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 04)--Projection: CAST(Float64(1.1) AS Utf8) AS a
-05)----EmptyRelation
+05)----EmptyRelation: rows=1
 
 # union_with_multiply_cols()
 query TT
@@ -176,9 +176,9 @@ EXPLAIN SELECT 'a' a, 1 b UNION ALL SELECT 1.1 a, 1.1 b
 logical_plan
 01)Union
 02)--Projection: Utf8("a") AS a, CAST(Int64(1) AS Float64) AS b
-03)----EmptyRelation
+03)----EmptyRelation: rows=1
 04)--Projection: CAST(Float64(1.1) AS Utf8) AS a, Float64(1.1) AS b
-05)----EmptyRelation
+05)----EmptyRelation: rows=1
 
 # sorted_union_with_different_types_and_group_by()
 query TT
@@ -193,12 +193,12 @@ logical_plan
 04)------Aggregate: groupBy=[[x.a]], aggr=[[]]
 05)--------SubqueryAlias: x
 06)----------Projection: Int64(1) AS a
-07)------------EmptyRelation
+07)------------EmptyRelation: rows=1
 08)----Projection: x.a
 09)------Aggregate: groupBy=[[x.a]], aggr=[[]]
 10)--------SubqueryAlias: x
 11)----------Projection: Float64(1.1) AS a
-12)------------EmptyRelation
+12)------------EmptyRelation: rows=1
 
 # union_with_binary_expr_and_cast()
 query TT
@@ -212,12 +212,12 @@ logical_plan
 03)----Aggregate: groupBy=[[CAST(Float64(0) + CAST(x.a AS Float64) AS Int32)]], aggr=[[]]
 04)------SubqueryAlias: x
 05)--------Projection: Int64(1) AS a
-06)----------EmptyRelation
+06)----------EmptyRelation: rows=1
 07)--Projection: Float64(2.1) + x.a AS Float64(0) + x.a
 08)----Aggregate: groupBy=[[Float64(2.1) + CAST(x.a AS Float64)]], aggr=[[]]
 09)------SubqueryAlias: x
 10)--------Projection: Int64(1) AS a
-11)----------EmptyRelation
+11)----------EmptyRelation: rows=1
 
 # union_with_aliases()
 query TT
@@ -231,12 +231,12 @@ logical_plan
 03)----Aggregate: groupBy=[[x.a]], aggr=[[]]
 04)------SubqueryAlias: x
 05)--------Projection: Int64(1) AS a
-06)----------EmptyRelation
+06)----------EmptyRelation: rows=1
 07)--Projection: x.a AS a1
 08)----Aggregate: groupBy=[[x.a]], aggr=[[]]
 09)------SubqueryAlias: x
 10)--------Projection: Float64(1.1) AS a
-11)----------EmptyRelation
+11)----------EmptyRelation: rows=1
 
 # union_with_incompatible_data_types()
 query error Incompatible inputs for Union: Previous inputs were of type Utf8, but got incompatible type Boolean on column 'a'
@@ -254,3 +254,51 @@ DROP TABLE orders;
 ########################################
 ## Test type coercion with UNIONs end ##
 ########################################
+
+# https://github.com/apache/datafusion/issues/15661
+# LIKE is a string pattern matching operator and is not supported for nested types.
+
+statement ok
+CREATE TABLE t0(v0 BIGINT, v1 STRING, v2 BOOLEAN);
+
+statement ok
+INSERT INTO t0(v0, v2) VALUES (123, true);
+
+query error There isn't a common type to coerce .* in .* expression
+SELECT true FROM t0 WHERE ((REGEXP_MATCH(t0.v1, t0.v1)) NOT LIKE (REGEXP_MATCH(t0.v1, t0.v1, 'jH')));
+
+query error There isn't a common type to coerce .* in .* expression
+SELECT true FROM t0 WHERE (REGEXP_MATCH(t0.v1, t0.v1)) NOT LIKE [];
+
+query error There isn't a common type to coerce .* in .* expression
+SELECT true FROM t0 WHERE (REGEXP_MATCH(t0.v1, t0.v1)) LIKE [];
+
+query error There isn't a common type to coerce .* in .* expression
+SELECT true FROM t0 WHERE (REGEXP_MATCH(t0.v1, t0.v1)) ILIKE [];
+
+query error There isn't a common type to coerce .* in .* expression
+SELECT true FROM t0 WHERE (REGEXP_MATCH(t0.v1, t0.v1)) NOT ILIKE [];
+
+statement ok
+DROP TABLE t0;
+
+#############################################################
+## Test validation for functions with empty argument lists ##
+#############################################################
+
+# https://github.com/apache/datafusion/issues/20201
+
+query error does not support zero arguments
+SELECT * FROM (SELECT 1) WHERE (STARTS_WITH() IS NULL);
+
+query error does not support zero arguments
+SELECT * FROM (SELECT 1) WHERE (STARTS_WITH() IS NOT NULL);
+
+query error does not support zero arguments
+SELECT * FROM (SELECT 'a') WHERE (STARTS_WITH() SIMILAR TO 'abc%');
+
+query error does not support zero arguments
+SELECT * FROM (SELECT 1) WHERE CAST(STARTS_WITH() AS STRING) = 'x';
+
+query error does not support zero arguments
+SELECT * FROM (SELECT 1) WHERE TRY_CAST(STARTS_WITH() AS INT) = 1;
\ No newline at end of file
diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt
index d549f555f9d8b..d858d0ae3ea4e 100644
--- a/datafusion/sqllogictest/test_files/union.slt
+++ b/datafusion/sqllogictest/test_files/union.slt
@@ -230,19 +230,17 @@ logical_plan
 02)--Union
 03)----TableScan: t1 projection=[name]
 04)----TableScan: t2 projection=[name]
-05)----Projection: t2.name || Utf8("_new") AS name
+05)----Projection: t2.name || Utf8View("_new") AS name
 06)------TableScan: t2 projection=[name]
 physical_plan
 01)AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
-02)--CoalesceBatchesExec: target_batch_size=8192
-03)----RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
-05)--------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
-06)----------UnionExec
-07)------------DataSourceExec: partitions=1, partition_sizes=[1]
-08)------------DataSourceExec: partitions=1, partition_sizes=[1]
-09)------------ProjectionExec: expr=[name@0 || _new as name]
-10)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=3
+03)----AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
+04)------UnionExec
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
+06)--------DataSourceExec: partitions=1, partition_sizes=[1]
+07)--------ProjectionExec: expr=[name@0 || _new as name]
+08)----------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # nested_union_all
 query T rowsort
@@ -266,7 +264,7 @@ logical_plan
 01)Union
 02)--TableScan: t1 projection=[name]
 03)--TableScan: t2 projection=[name]
-04)--Projection: t2.name || Utf8("_new") AS name
+04)--Projection: t2.name || Utf8View("_new") AS name
 05)----TableScan: t2 projection=[name]
 physical_plan
 01)UnionExec
@@ -307,31 +305,27 @@ logical_plan
 10)------TableScan: t1 projection=[id, name]
 physical_plan
 01)UnionExec
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0, CAST(t2.id AS Int32)@2), (name@1, name@1)]
-04)------CoalescePartitionsExec
-05)--------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[]
-06)----------CoalesceBatchesExec: target_batch_size=2
-07)------------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4
-08)--------------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
-11)------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)]
-12)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-13)----------DataSourceExec: partitions=1, partition_sizes=[1]
-14)--ProjectionExec: expr=[CAST(id@0 AS Int32) as id, name@1 as name]
-15)----CoalesceBatchesExec: target_batch_size=2
-16)------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1]
-17)--------CoalescePartitionsExec
-18)----------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)]
-19)------------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[]
-20)--------------CoalesceBatchesExec: target_batch_size=2
-21)----------------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4
-22)------------------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[]
-23)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-24)----------------------DataSourceExec: partitions=1, partition_sizes=[1]
-25)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-26)----------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], NullsEqual: true
+03)----CoalescePartitionsExec
+04)------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)]
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+07)----AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[]
+08)------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4
+09)--------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[]
+10)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+11)------------DataSourceExec: partitions=1, partition_sizes=[1]
+12)--ProjectionExec: expr=[CAST(id@0 AS Int32) as id, name@1 as name]
+13)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1], NullsEqual: true
+14)------CoalescePartitionsExec
+15)--------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS Int32) as CAST(t2.id AS Int32)]
+16)----------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as name], aggr=[]
+17)------------RepartitionExec: partitioning=Hash([id@0, name@1], 4), input_partitions=4
+18)--------------AggregateExec: mode=Partial, gby=[id@0 as id, name@1 as name], aggr=[]
+19)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+20)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+21)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+22)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 
 query IT rowsort
@@ -377,28 +371,24 @@ logical_plan
 09)----TableScan: t1 projection=[name]
 physical_plan
 01)UnionExec
-02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)]
-04)------CoalescePartitionsExec
-05)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
-06)----------CoalesceBatchesExec: target_batch_size=2
-07)------------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
-08)--------------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
-11)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-12)--------DataSourceExec: partitions=1, partition_sizes=[1]
-13)--CoalesceBatchesExec: target_batch_size=2
-14)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)]
-15)------CoalescePartitionsExec
-16)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
-17)----------CoalesceBatchesExec: target_batch_size=2
-18)------------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
-19)--------------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
-20)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-21)------------------DataSourceExec: partitions=1, partition_sizes=[1]
-22)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-23)--------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true
+03)----CoalescePartitionsExec
+04)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
+05)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+09)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------DataSourceExec: partitions=1, partition_sizes=[1]
+11)--HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0, name@0)], NullsEqual: true
+12)----CoalescePartitionsExec
+13)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
+14)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
+15)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
+16)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+17)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+18)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+19)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # union_upcast_types
 query TT
@@ -413,15 +403,14 @@ logical_plan
 06)------TableScan: aggregate_test_100 projection=[c1, c3]
 physical_plan
 01)SortPreservingMergeExec: [c9@1 DESC], fetch=5
-02)--UnionExec
-03)----SortExec: TopK(fetch=5), expr=[c9@1 DESC], preserve_partitioning=[true]
+02)--SortExec: TopK(fetch=5), expr=[c9@1 DESC], preserve_partitioning=[true]
+03)----UnionExec
 04)------ProjectionExec: expr=[c1@0 as c1, CAST(c9@1 AS Decimal128(20, 0)) as c9]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true
-07)----SortExec: TopK(fetch=5), expr=[c9@1 DESC], preserve_partitioning=[true]
-08)------ProjectionExec: expr=[c1@0 as c1, CAST(c3@1 AS Decimal128(20, 0)) as c9]
-09)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], file_type=csv, has_header=true
+07)------ProjectionExec: expr=[c1@0 as c1, CAST(c3@1 AS Decimal128(20, 0)) as c9]
+08)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], file_type=csv, has_header=true
 
 query TR
 SELECT c1, c9 FROM aggregate_test_100 UNION ALL SELECT c1, c3 FROM aggregate_test_100 ORDER BY c9 DESC LIMIT 5
@@ -454,17 +443,15 @@ physical_plan
 02)--AggregateExec: mode=SinglePartitioned, gby=[name@0 as name], aggr=[count(Int64(1))]
 03)----InterleaveExec
 04)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
-05)--------CoalesceBatchesExec: target_batch_size=2
-06)----------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
-10)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
-11)--------CoalesceBatchesExec: target_batch_size=2
-12)----------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
-13)------------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
-14)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-15)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+09)------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
+10)--------RepartitionExec: partitioning=Hash([name@0], 4), input_partitions=4
+11)----------AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[]
+12)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+13)--------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # Union with limit push down 3 children test case
 query TT
@@ -496,35 +483,36 @@ logical_plan
 11)--------------------TableScan: aggregate_test_100 projection=[c1, c13], partial_filters=[aggregate_test_100.c13 != Utf8View("C2GT5KVyOPZpgKVl110TyZO0NcJ434")]
 12)----Projection: Int64(1) AS cnt
 13)------Limit: skip=0, fetch=3
-14)--------EmptyRelation
+14)--------EmptyRelation: rows=1
 15)----Projection: lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS cnt
 16)------Limit: skip=0, fetch=3
 17)--------WindowAggr: windowExpr=[[lead(b.c1, Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 18)----------SubqueryAlias: b
 19)------------Projection: Int64(1) AS c1
-20)--------------EmptyRelation
+20)--------------EmptyRelation: rows=1
 physical_plan
 01)CoalescePartitionsExec: fetch=3
 02)--UnionExec
 03)----ProjectionExec: expr=[count(Int64(1))@0 as cnt]
-04)------AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
-05)--------CoalescePartitionsExec
-06)----------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
-07)------------ProjectionExec: expr=[]
-08)--------------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[]
-09)----------------CoalesceBatchesExec: target_batch_size=2
+04)------GlobalLimitExec: skip=0, fetch=3
+05)--------AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
+06)----------CoalescePartitionsExec
+07)------------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
+08)--------------ProjectionExec: expr=[]
+09)----------------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[]
 10)------------------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
 11)--------------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[]
-12)----------------------CoalesceBatchesExec: target_batch_size=2
-13)------------------------FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434, projection=[c1@0]
-14)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c13], file_type=csv, has_header=true
-16)----ProjectionExec: expr=[1 as cnt]
-17)------PlaceholderRowExec
+12)----------------------FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434, projection=[c1@0]
+13)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+14)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c13], file_type=csv, has_header=true
+15)----ProjectionExec: expr=[1 as cnt]
+16)------GlobalLimitExec: skip=0, fetch=3
+17)--------PlaceholderRowExec
 18)----ProjectionExec: expr=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as cnt]
-19)------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-20)--------ProjectionExec: expr=[1 as c1]
-21)----------PlaceholderRowExec
+19)------GlobalLimitExec: skip=0, fetch=3
+20)--------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+21)----------ProjectionExec: expr=[1 as c1]
+22)------------PlaceholderRowExec
 
 
 ########
@@ -604,8 +592,7 @@ physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST]
 02)--UnionExec
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true
-04)----ProjectionExec: expr=[c1a@0 as c1]
-05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1a], output_ordering=[c1a@0 ASC NULLS LAST], file_type=csv, has_header=true
+04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1a@0 as c1], file_type=csv, has_header=true
 
 statement ok
 drop table t1
@@ -621,11 +608,11 @@ logical_plan
 01)Union
 02)--Projection: Int64(1) AS a
 03)----Aggregate: groupBy=[[Int64(1)]], aggr=[[]]
-04)------EmptyRelation
+04)------EmptyRelation: rows=1
 05)--Projection: Int64(2) AS a
-06)----EmptyRelation
+06)----EmptyRelation: rows=1
 07)--Projection: Int64(3) AS a
-08)----EmptyRelation
+08)----EmptyRelation: rows=1
 physical_plan
 01)UnionExec
 02)--ProjectionExec: expr=[Int64(1)@0 as a]
@@ -648,12 +635,12 @@ logical_plan
 03)----Aggregate: groupBy=[[a.n]], aggr=[[count(Int64(1))]]
 04)------SubqueryAlias: a
 05)--------Projection: Int64(5) AS n
-06)----------EmptyRelation
+06)----------EmptyRelation: rows=1
 07)--Projection: b.x AS count, b.y AS n
 08)----SubqueryAlias: b
 09)------Projection: Int64(1) AS x, max(Int64(10)) AS y
 10)--------Aggregate: groupBy=[[]], aggr=[[max(Int64(10))]]
-11)----------EmptyRelation
+11)----------EmptyRelation: rows=1
 physical_plan
 01)UnionExec
 02)--ProjectionExec: expr=[count(Int64(1))@1 as count, n@0 as n]
@@ -836,14 +823,12 @@ logical_plan
 physical_plan
 01)CoalescePartitionsExec
 02)--UnionExec
-03)----CoalesceBatchesExec: target_batch_size=2
-04)------FilterExec: c1@0 = a
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], file_type=csv, has_header=true
-07)----CoalesceBatchesExec: target_batch_size=2
-08)------FilterExec: c1@0 = a
-09)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-10)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], file_type=csv, has_header=true
+03)----FilterExec: c1@0 = a
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], file_type=csv, has_header=true
+06)----FilterExec: c1@0 = a
+07)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], file_type=csv, has_header=true
 
 # Clean up after the test
 statement ok
@@ -916,19 +901,19 @@ physical_plan
 03)----SortExec: expr=[y@0 ASC NULLS LAST], preserve_partitioning=[true]
 04)------ProjectionExec: expr=[CAST(y@0 AS Int64) as y]
 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------DataSourceExec: partitions=1, partition_sizes=[2]
 07)----SortExec: expr=[y@0 ASC NULLS LAST], preserve_partitioning=[false]
 08)------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # optimize_subquery_sort in create_relation removes Sort so the result is not sorted.
 query I
-SELECT * FROM v1;
+SELECT * FROM v1 ORDER BY 1;
 ----
-20
-40
+1
 3
 3
-1
+20
+40
 
 query TT
 explain SELECT * FROM v1;
@@ -943,7 +928,7 @@ physical_plan
 01)UnionExec
 02)--ProjectionExec: expr=[CAST(y@0 AS Int64) as y]
 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------DataSourceExec: partitions=1, partition_sizes=[1]
+04)------DataSourceExec: partitions=1, partition_sizes=[2]
 05)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement count 0
@@ -954,3 +939,40 @@ drop table u1;
 
 statement count 0
 drop table u2;
+
+# repro for https://github.com/apache/datafusion/issues/18327
+# should not error
+query TITT
+  WITH typ(oid, typnamespace, typname, typtype) AS (
+                      SELECT * FROM (VALUES (1, 10, 't1', 'b'))
+            UNION ALL SELECT * FROM (VALUES (2, NULL, 't2', 'b'))
+            UNION ALL SELECT * FROM (VALUES (3, 12, 't3', NULL))
+       )
+       , ns(oid, nspname) AS (VALUES (1, 'ns1'), (2, 'ns2'))
+    SELECT ns.nspname, typ.oid, typ.typname, typ.typtype
+      FROM typ JOIN ns ON (ns.oid = typ.typnamespace)
+     WHERE typ.typtype IN ('b','r','m','e','d')
+     ORDER BY CASE WHEN typ.typtype IN ('b','e','p') THEN 0
+                   WHEN typ.typtype = 'r' THEN 1
+              END
+----
+
+# Add another row with a non-NULL value `m` which is retained by the
+# filter but not matching any WHEN branch m?
+query TITT
+  WITH typ(oid, typnamespace, typname, typtype) AS (
+                      SELECT * FROM (VALUES (1, 10, 't1', 'b'))
+            UNION ALL SELECT * FROM (VALUES (2, NULL, 't2', 'b'))
+            UNION ALL SELECT * FROM (VALUES (3, 12, 't3', NULL))
+            UNION ALL SELECT * FROM (VALUES (4, 40, 't3', 'm'))
+       ), ns(oid, nspname) AS (
+         VALUES (1, 'ns1'), (2, 'ns2'), (40, 'ns3')
+       )
+    SELECT ns.nspname, typ.oid, typ.typname, typ.typtype
+      FROM typ JOIN ns ON (ns.oid = typ.typnamespace)
+     WHERE typ.typtype IN ('b','r','m','e','d')
+     ORDER BY CASE WHEN typ.typtype IN ('b','e','p') THEN 0
+                   WHEN typ.typtype = 'r' THEN 1
+              END
+----
+ns3 4 t3 m
diff --git a/datafusion/sqllogictest/test_files/union_by_name.slt b/datafusion/sqllogictest/test_files/union_by_name.slt
index 233885618f832..6a1608d5d1348 100644
--- a/datafusion/sqllogictest/test_files/union_by_name.slt
+++ b/datafusion/sqllogictest/test_files/union_by_name.slt
@@ -334,90 +334,8 @@ select x, y, z from t3 union all by name select z, y, x from t4 order by x;
 a b c
 a b c
 
-
-# FIXME: The following should pass without error, but currently it is failing
-# due to differing record batch schemas when the SLT runner collects results.
-# This is due to the following issue: https://github.com/apache/datafusion/issues/15394#issue-2943811768
-#
-# More context can be found here: https://github.com/apache/datafusion/pull/15242#issuecomment-2746563234
-query error
+query TTTT rowsort
 select x, y, z from t3 union all by name select z, y, x, 'd' as zz from t3;
 ----
-DataFusion error: Internal error: Schema mismatch. Previously had
-Schema {
-    fields: [
-        Field {
-            name: "x",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "y",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "z",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "zz",
-            data_type: Utf8,
-            nullable: false,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-    ],
-    metadata: {},
-}
-
-Got:
-Schema {
-    fields: [
-        Field {
-            name: "x",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "y",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "z",
-            data_type: Utf8View,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-        Field {
-            name: "zz",
-            data_type: Utf8,
-            nullable: true,
-            dict_id: 0,
-            dict_is_ordered: false,
-            metadata: {},
-        },
-    ],
-    metadata: {},
-}.
-This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker
+a b c NULL
+a b c d
diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt
index 92e6f9995ae36..ba499679a9a80 100644
--- a/datafusion/sqllogictest/test_files/unnest.slt
+++ b/datafusion/sqllogictest/test_files/unnest.slt
@@ -58,6 +58,20 @@ select unnest(struct(1,2,3));
 ----
 1 2 3
 
+## Basic unnest expression in select struct with alias (alias is ignored for struct unnest)
+query III
+select unnest(struct(1,2,3)) as ignored_alias;
+----
+1 2 3
+
+## Verify schema output for struct unnest with alias (alias is ignored)
+query TTT
+describe select unnest(struct(1,2,3)) as ignored_alias;
+----
+__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c0 Int64 YES
+__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c1 Int64 YES
+__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c2 Int64 YES
+
 ## Basic unnest list expression in from clause
 query I
 select * from unnest([1,2,3]);
@@ -511,7 +525,7 @@ x y [30, 40, 50]
 query error DataFusion error: type_coercion\ncaused by\nThis feature is not implemented: Unnest should be rewritten to LogicalPlan::Unnest before type coercion
 select sum(unnest(generate_series(1,10)));
 
-query error DataFusion error: Internal error: unnest on struct can only be applied at the root level of select expression
+query error DataFusion error: Internal error: Assertion failed: struct_allowed: unnest on struct can only be applied at the root level of select expression
 select arrow_typeof(unnest(column5)) from unnest_table;
 
 query T
@@ -652,15 +666,15 @@ explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unn
 logical_plan
 01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3
 02)--Unnest: lists[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1])|depth=2] structs[]
-03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3
+03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3
 04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[]
 05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3
 06)----------TableScan: recursive_unnest_table projection=[column3]
 physical_plan
 01)ProjectionExec: expr=[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2)@0 as UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), column3@1 as column3]
 02)--UnnestExec
-03)----ProjectionExec: expr=[get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1)@0, c1) as __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), column3@1 as column3]
-04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------ProjectionExec: expr=[get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1)@0, c1) as __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), column3@1 as column3]
 05)--------UnnestExec
 06)----------ProjectionExec: expr=[column3@0 as __unnest_placeholder(recursive_unnest_table.column3), column3@0 as column3]
 07)------------DataSourceExec: partitions=1, partition_sizes=[1]
@@ -798,9 +812,21 @@ NULL 1
 query error DataFusion error: Error during planning: Column in SELECT must be in GROUP BY or an aggregate function: While expanding wildcard, column "nested_unnest_table\.column1" must appear in the GROUP BY clause or must be part of an aggregate function, currently only "UNNEST\(nested_unnest_table\.column1\)\[c0\]" appears in the SELECT clause satisfies this requirement
 select unnest(column1) c1 from nested_unnest_table group by c1.c0;
 
-# TODO: this query should work. see issue: https://github.com/apache/datafusion/issues/12794
-query error DataFusion error: Internal error: unnest on struct can only be applied at the root level of select expression
+## Unnest struct with alias - alias is ignored (same as DuckDB behavior)
+## See: https://github.com/apache/datafusion/issues/12794
+query TT?
 select unnest(column1) c1 from nested_unnest_table
+----
+a b {c0: c}
+d e {c0: f}
+
+## Verify schema output for struct unnest with alias (alias is ignored)
+query TTT
+describe select unnest(column1) c1 from nested_unnest_table;
+----
+__unnest_placeholder(nested_unnest_table.column1).c0 Utf8 YES
+__unnest_placeholder(nested_unnest_table.column1).c1 Utf8 YES
+__unnest_placeholder(nested_unnest_table.column1).c2 Struct("c0": Utf8) YES
 
 query II??I??
 select unnest(column5), * from unnest_table;
@@ -863,11 +889,11 @@ select count(*) from (select unnest(range(0, 100000)) id) t inner join (select u
 # Test implicit LATERAL support for UNNEST
 # Issue: https://github.com/apache/datafusion/issues/13659
 # TODO: https://github.com/apache/datafusion/issues/10048
-query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\)
+query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\)
 select * from unnest_table u, unnest(u.column1);
 
 # Test implicit LATERAL support for UNNEST (INNER JOIN)
-query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\)
+query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\)
 select * from unnest_table u INNER JOIN unnest(u.column1) AS t(column1) ON u.column3 = t.column1;
 
 # Test implicit LATERAL planning for UNNEST
@@ -875,15 +901,15 @@ query TT
 explain select * from unnest_table u, unnest(u.column1);
 ----
 logical_plan
-01)Cross Join:
+01)Cross Join: 
 02)--SubqueryAlias: u
 03)----TableScan: unnest_table projection=[column1, column2, column3, column4, column5]
 04)--Subquery:
 05)----Projection: __unnest_placeholder(outer_ref(u.column1),depth=1) AS UNNEST(outer_ref(u.column1))
 06)------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[]
 07)--------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1))
-08)----------EmptyRelation
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" })
+08)----------EmptyRelation: rows=1
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" })
 
 # Test implicit LATERAL planning for UNNEST (INNER JOIN)
 query TT
@@ -898,8 +924,8 @@ logical_plan
 06)------Projection: __unnest_placeholder(outer_ref(u.column1),depth=1) AS column1
 07)--------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[]
 08)----------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1))
-09)------------EmptyRelation
-physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" })
+09)------------EmptyRelation: rows=1
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" })
 
 # uncorrelated EXISTS with unnest
 query I
@@ -941,3 +967,365 @@ where min_height * width1 = (
 )
 ----
 4 7 4 28
+
+## Unnest with ordering on unrelated column is preserved
+query TT
+EXPLAIN WITH unnested AS (SELECT
+    ROW_NUMBER() OVER () AS generated_id,
+    unnest(array[value]) as ar
+  FROM range(1,5)) SELECT array_agg(ar) FROM unnested group by generated_id;
+----
+logical_plan
+01)Projection: array_agg(unnested.ar)
+02)--Aggregate: groupBy=[[unnested.generated_id]], aggr=[[array_agg(unnested.ar)]]
+03)----SubqueryAlias: unnested
+04)------Projection: generated_id, __unnest_placeholder(make_array(range().value),depth=1) AS UNNEST(make_array(range().value)) AS ar
+05)--------Unnest: lists[__unnest_placeholder(make_array(range().value))|depth=1] structs[]
+06)----------Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS generated_id, make_array(range().value) AS __unnest_placeholder(make_array(range().value))
+07)------------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+08)--------------TableScan: range() projection=[value]
+physical_plan
+01)ProjectionExec: expr=[array_agg(unnested.ar)@1 as array_agg(unnested.ar)]
+02)--AggregateExec: mode=FinalPartitioned, gby=[generated_id@0 as generated_id], aggr=[array_agg(unnested.ar)], ordering_mode=Sorted
+03)----SortExec: expr=[generated_id@0 ASC NULLS LAST], preserve_partitioning=[true]
+04)------RepartitionExec: partitioning=Hash([generated_id@0], 4), input_partitions=4
+05)--------AggregateExec: mode=Partial, gby=[generated_id@0 as generated_id], aggr=[array_agg(unnested.ar)], ordering_mode=Sorted
+06)----------ProjectionExec: expr=[generated_id@0 as generated_id, __unnest_placeholder(make_array(range().value),depth=1)@1 as ar]
+07)------------UnnestExec
+08)--------------ProjectionExec: expr=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as generated_id, make_array(value@0) as __unnest_placeholder(make_array(range().value))]
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, maintains_sort_order=true
+10)------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+11)--------------------LazyMemoryExec: partitions=1, batch_generators=[range: start=1, end=5, batch_size=8192]
+
+# Unnest array where data is already ordered by column2 (100, 200, 300, 400)
+statement ok
+COPY (
+  SELECT * FROM VALUES
+    ([1,2,3], 100),
+    ([3],     200),
+    ([],      300),
+    ([3,1],   400)
+  ORDER BY column2
+ ) TO 'test_files/scratch/unnest/ordered_array.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/unnest/ordered_array.parquet'
+WITH ORDER (column2)
+
+query ?I
+SELECT * FROM t;
+----
+[1, 2, 3] 100
+[3] 200
+[] 300
+[3, 1] 400
+
+# Data is sorted on column2 already, so no need to sort again
+query II
+SELECT UNNEST(column1), column2 FROM t ORDER BY column2;
+----
+1 100
+2 100
+3 100
+3 200
+3 400
+1 400
+
+# Explain should not have a SortExec
+query TT
+EXPLAIN SELECT UNNEST(column1), column2 FROM t ORDER BY column2;
+----
+logical_plan
+01)Sort: t.column2 ASC NULLS LAST
+02)--Projection: __unnest_placeholder(t.column1,depth=1) AS UNNEST(t.column1), t.column2
+03)----Unnest: lists[__unnest_placeholder(t.column1)|depth=1] structs[]
+04)------Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2
+05)--------TableScan: t projection=[column1, column2]
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=1)@0 as UNNEST(t.column1), column2@1 as column2]
+02)--UnnestExec
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet
+
+# Explain should have a SortExec at the top because we order by the output of the unnest (i.e. discarding the ordering)
+query TT
+EXPLAIN SELECT UNNEST(column1) as unnested, column2 FROM t ORDER BY 1;
+----
+logical_plan
+01)Sort: unnested ASC NULLS LAST
+02)--Projection: __unnest_placeholder(t.column1,depth=1) AS UNNEST(t.column1) AS unnested, t.column2
+03)----Unnest: lists[__unnest_placeholder(t.column1)|depth=1] structs[]
+04)------Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2
+05)--------TableScan: t projection=[column1, column2]
+physical_plan
+01)SortExec: expr=[unnested@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=1)@0 as unnested, column2@1 as column2]
+03)----UnnestExec
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet
+
+# cleanup
+statement ok
+drop table t;
+
+# Unnest tuple where the data is already sorted by column 1
+statement ok
+COPY (
+  SELECT * FROM VALUES
+    (100, [3,2,1], 'a'),
+    (200, [1,2,3], 'b'),
+    (300, [3,1,2], 'c')
+  ORDER BY column1
+ ) TO 'test_files/scratch/unnest/ordered_tuples.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/unnest/ordered_tuples.parquet'
+WITH ORDER (column1)
+
+query I?T
+SELECT * FROM t;
+----
+100 [3, 2, 1] a
+200 [1, 2, 3] b
+300 [3, 1, 2] c
+
+# Put the columns in a tuple and unnest, we need to sort because we discard ordering of unnested columns
+query TT
+EXPLAIN WITH unnested AS (
+  SELECT unnest((column1, column2, column3))
+  FROM t
+) SELECT * FROM unnested order by 1;
+----
+logical_plan
+01)Sort: unnested.__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0 ASC NULLS LAST
+02)--SubqueryAlias: unnested
+03)----Unnest: lists[] structs[__unnest_placeholder(struct(t.column1,t.column2,t.column3))]
+04)------Projection: struct(t.column1, t.column2, t.column3) AS __unnest_placeholder(struct(t.column1,t.column2,t.column3))
+05)--------TableScan: t projection=[column1, column2, column3]
+physical_plan
+01)SortExec: expr=[__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--UnnestExec
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_tuples.parquet]]}, projection=[struct(column1@0, column2@1, column3@2) as __unnest_placeholder(struct(t.column1,t.column2,t.column3))], file_type=parquet
+
+# cleanup
+statement ok
+drop table t;
+
+# Unnest struct where data is already ordered by column2 (100, 200, 300, 400)
+statement ok
+COPY (
+  SELECT * FROM VALUES
+    (named_struct('s1', 1, 's2', 2, 's3', 3), 100),
+    (named_struct('s1', 1, 's2', 3, 's3', 2), 200),
+    (named_struct('s1', 2, 's2', 1, 's3', 3), 300),
+    (named_struct('s1', 3, 's2', 2, 's3', 1), 400)
+  ORDER BY column2
+ ) TO 'test_files/scratch/unnest/ordered_struct.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/unnest/ordered_struct.parquet'
+WITH ORDER (column2)
+
+query ?I
+SELECT * FROM t;
+----
+{s1: 1, s2: 2, s3: 3} 100
+{s1: 1, s2: 3, s3: 2} 200
+{s1: 2, s2: 1, s3: 3} 300
+{s1: 3, s2: 2, s3: 1} 400
+
+# data is sorted on column2 already, so no need to sort again
+query IIII
+SELECT UNNEST(column1), column2 FROM t ORDER BY column2;
+----
+1 2 3 100
+1 3 2 200
+2 1 3 300
+3 2 1 400
+
+# Explain should not have a SortExec
+query TT
+EXPLAIN SELECT UNNEST(column1), column2 FROM t ORDER BY column2;
+----
+logical_plan
+01)Sort: t.column2 ASC NULLS LAST
+02)--Unnest: lists[] structs[__unnest_placeholder(t.column1)]
+03)----Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2
+04)------TableScan: t projection=[column1, column2]
+physical_plan
+01)UnnestExec
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet
+
+# cleanup
+statement ok
+drop table t;
+
+# Unnest nested array (unnesting twice), struct, and array, where data is already ordered by column4 (100, 200, 300, 400)
+statement ok
+COPY (
+  SELECT * FROM VALUES
+    ([[1],[2],[3]], [1,2,3], named_struct('s1', 1, 's2', 2, 's3', 3),  100),
+    ([[1],[3],[2]], [3],     named_struct('s1', 1, 's2', 3, 's3', 2),  200),
+    ([[2],[1],[3]], [],      named_struct('s1', 2, 's2', 1, 's3', 3),  300),
+    ([[3],[2],[1]], [3,1],   named_struct('s1', 3, 's2', 2, 's3', 1),  400)
+  ORDER BY column4
+ ) TO 'test_files/scratch/unnest/ordered_struct_arrays.parquet';
+
+statement ok
+CREATE EXTERNAL TABLE t
+STORED AS PARQUET
+LOCATION 'test_files/scratch/unnest/ordered_struct_arrays.parquet'
+WITH ORDER (column4)
+
+query ???I
+SELECT * FROM t;
+----
+[[1], [2], [3]] [1, 2, 3] {s1: 1, s2: 2, s3: 3} 100
+[[1], [3], [2]] [3] {s1: 1, s2: 3, s3: 2} 200
+[[2], [1], [3]] [] {s1: 2, s2: 1, s3: 3} 300
+[[3], [2], [1]] [3, 1] {s1: 3, s2: 2, s3: 1} 400
+
+# data is sorted on column4 already, so no need to sort again
+query IIIIII
+SELECT UNNEST(UNNEST(column1)), UNNEST(column2), UNNEST(column3), column4 FROM t ORDER BY column4;
+----
+1 1 1 2 3 100
+NULL 2 1 2 3 100
+NULL 3 1 2 3 100
+2 1 1 2 3 100
+NULL 2 1 2 3 100
+NULL 3 1 2 3 100
+3 1 1 2 3 100
+NULL 2 1 2 3 100
+NULL 3 1 2 3 100
+1 3 1 3 2 200
+3 3 1 3 2 200
+2 3 1 3 2 200
+2 NULL 2 1 3 300
+1 NULL 2 1 3 300
+3 NULL 2 1 3 300
+3 3 3 2 1 400
+NULL 1 3 2 1 400
+2 3 3 2 1 400
+NULL 1 3 2 1 400
+1 3 3 2 1 400
+NULL 1 3 2 1 400
+
+# Explain should not have a SortExec
+query TT
+EXPLAIN SELECT UNNEST(UNNEST(column1)), UNNEST(column2), UNNEST(column3), column4 FROM t ORDER BY column4;
+----
+logical_plan
+01)Sort: t.column4 ASC NULLS LAST
+02)--Projection: __unnest_placeholder(t.column1,depth=2) AS UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1) AS UNNEST(t.column2), __unnest_placeholder(t.column3).s1, __unnest_placeholder(t.column3).s2, __unnest_placeholder(t.column3).s3, t.column4
+03)----Unnest: lists[__unnest_placeholder(t.column1)|depth=2, __unnest_placeholder(t.column2)|depth=1] structs[__unnest_placeholder(t.column3)]
+04)------Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2 AS __unnest_placeholder(t.column2), t.column3 AS __unnest_placeholder(t.column3), t.column4
+05)--------TableScan: t projection=[column1, column2, column3, column4]
+physical_plan
+01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=2)@0 as UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1)@1 as UNNEST(t.column2), __unnest_placeholder(t.column3).s1@2 as __unnest_placeholder(t.column3).s1, __unnest_placeholder(t.column3).s2@3 as __unnest_placeholder(t.column3).s2, __unnest_placeholder(t.column3).s3@4 as __unnest_placeholder(t.column3).s3, column4@5 as column4]
+02)--UnnestExec
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct_arrays.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2@1 as __unnest_placeholder(t.column2), column3@2 as __unnest_placeholder(t.column3), column4], output_ordering=[column4@3 ASC NULLS LAST], file_type=parquet
+
+# cleanup
+statement ok
+drop table t;
+
+########################################
+# Unnest ListView / LargeListView Tests #
+########################################
+
+## Basic unnest ListView in select list
+query I
+select unnest(arrow_cast([1,2,3], 'ListView(Int64)'));
+----
+1
+2
+3
+
+## Basic unnest ListView in from clause
+query I
+select * from unnest(arrow_cast([1,2,3], 'ListView(Int64)'));
+----
+1
+2
+3
+
+## Basic unnest LargeListView in select list
+query I
+select unnest(arrow_cast([1,2,3], 'LargeListView(Int64)'));
+----
+1
+2
+3
+
+## Basic unnest LargeListView in from clause
+query I
+select * from unnest(arrow_cast([1,2,3], 'LargeListView(Int64)'));
+----
+1
+2
+3
+
+## Unnest ListView with range
+query I
+select unnest(arrow_cast(range(1, 3), 'ListView(Int64)'));
+----
+1
+2
+
+## Unnest LargeListView with range
+query I
+select * from unnest(arrow_cast(range(1, 3), 'LargeListView(Int64)'));
+----
+1
+2
+
+## Multiple unnest with ListView columns from a table
+query III
+select
+    unnest(column1),
+    unnest(arrow_cast(column2, 'ListView(Int64)')),
+    unnest(arrow_cast(column4, 'LargeListView(Int64)'))
+from unnest_table where column4 is not null;
+----
+1 7 13
+2 NULL 14
+3 NULL NULL
+4 8 15
+5 9 16
+NULL 10 NULL
+NULL NULL 17
+NULL NULL 18
+
+## Unnest ListView with null elements
+query I
+select unnest(arrow_cast([1, null, 3], 'ListView(Int64)'));
+----
+1
+NULL
+3
+
+## Unnest empty ListView
+query I
+select unnest(arrow_cast([], 'ListView(Int64)'));
+----
+
+## Unnest ListView of strings
+query T
+select unnest(arrow_cast(['a','b','c'], 'ListView(Utf8)'));
+----
+a
+b
+c
+
+## Unnest LargeListView of strings
+query T
+select unnest(arrow_cast(['a','b','c'], 'LargeListView(Utf8)'));
+----
+a
+b
+c
diff --git a/datafusion/sqllogictest/test_files/update.slt b/datafusion/sqllogictest/test_files/update.slt
index 9f2c16b21106f..1cd2b626e3b8e 100644
--- a/datafusion/sqllogictest/test_files/update.slt
+++ b/datafusion/sqllogictest/test_files/update.slt
@@ -33,7 +33,9 @@ logical_plan
 01)Dml: op=[Update] table=[t1]
 02)--Projection: CAST(Int64(1) AS Int32) AS a, CAST(Int64(2) AS Utf8View) AS b, Float64(3) AS c, CAST(NULL AS Int32) AS d
 03)----TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Update)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 query TT
 explain update t1 set a=c+1, b=a, c=c+1.0, d=b;
@@ -42,7 +44,9 @@ logical_plan
 01)Dml: op=[Update] table=[t1]
 02)--Projection: CAST(t1.c + CAST(Int64(1) AS Float64) AS Int32) AS a, CAST(t1.a AS Utf8View) AS b, t1.c + Float64(1) AS c, CAST(t1.b AS Int32) AS d
 03)----TableScan: t1
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Update)
+physical_plan
+01)CooperativeExec
+02)--DmlResultExec: rows_affected=0
 
 statement ok
 create table t2(a int, b varchar, c double, d int);
@@ -63,35 +67,48 @@ logical_plan
 physical_plan_error This feature is not implemented: Physical plan does not support logical expression ScalarSubquery(<subquery>)
 
 # set from other table
-query TT
+# UPDATE ... FROM is currently unsupported
+# TODO fix https://github.com/apache/datafusion/issues/19950
+query error DataFusion error: This feature is not implemented: UPDATE ... FROM is not supported
 explain update t1 set b = t2.b, c = t2.a, d = 1 from t2 where t1.a = t2.a and t1.b > 'foo' and t2.c > 1.0;
-----
-logical_plan
-01)Dml: op=[Update] table=[t1]
-02)--Projection: t1.a AS a, t2.b AS b, CAST(t2.a AS Float64) AS c, CAST(Int64(1) AS Int32) AS d
-03)----Filter: t1.a = t2.a AND t1.b > CAST(Utf8("foo") AS Utf8View) AND t2.c > Float64(1)
-04)------Cross Join: 
-05)--------TableScan: t1
-06)--------TableScan: t2
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Update)
+
+# test update from other table with actual data
+statement ok
+insert into t1 values (1, 'zoo', 2.0, 10), (2, 'qux', 3.0, 20), (3, 'bar', 4.0, 30);
 
 statement ok
-create table t3(a int, b varchar, c double, d int);
+insert into t2 values (1, 'updated_b', 5.0, 40), (2, 'updated_b2', 2.5, 50), (4, 'updated_b3', 1.5, 60);
+
+# UPDATE ... FROM is currently unsupported - qualifier stripping breaks source column references
+# causing assignments like 'b = t2.b' to resolve to target table's 'b' instead of source table's 'b'
+# TODO fix https://github.com/apache/datafusion/issues/19950
+statement error DataFusion error: This feature is not implemented: UPDATE ... FROM is not supported
+update t1 set b = t2.b, c = t2.a, d = 1 from t2 where t1.a = t2.a and t1.b > 'foo' and t2.c > 1.0;
 
 # set from multiple tables, DataFusion only supports from one table
-query error DataFusion error: Error during planning: Multiple tables in UPDATE SET FROM not yet supported
+statement error DataFusion error: This feature is not implemented: Multiple tables in UPDATE SET FROM not yet supported
 explain update t1 set b = t2.b, c = t3.a, d = 1 from t2, t3 where t1.a = t2.a and t1.a = t3.a;
 
 # test table alias
-query TT
+# UPDATE ... FROM is currently unsupported
+# TODO fix https://github.com/apache/datafusion/issues/19950
+statement error DataFusion error: This feature is not implemented: UPDATE ... FROM is not supported
 explain update t1 as T set b = t2.b, c = t.a, d = 1 from t2 where t.a = t2.a and t.b > 'foo' and t2.c > 1.0;
-----
-logical_plan
-01)Dml: op=[Update] table=[t1]
-02)--Projection: t.a AS a, t2.b AS b, CAST(t.a AS Float64) AS c, CAST(Int64(1) AS Int32) AS d
-03)----Filter: t.a = t2.a AND t.b > CAST(Utf8("foo") AS Utf8View) AND t2.c > Float64(1)
-04)------Cross Join: 
-05)--------SubqueryAlias: t
-06)----------TableScan: t1
-07)--------TableScan: t2
-physical_plan_error This feature is not implemented: Unsupported logical plan: Dml(Update)
+
+# test update with table alias with actual data
+statement ok
+delete from t1;
+
+statement ok
+delete from t2;
+
+statement ok
+insert into t1 values (1, 'zebra', 1.5, 5), (2, 'wolf', 2.0, 10), (3, 'apple', 3.5, 15);
+
+statement ok
+insert into t2 values (1, 'new_val', 2.0, 100), (2, 'new_val2', 1.5, 200);
+
+# UPDATE ... FROM is currently unsupported
+# TODO fix https://github.com/apache/datafusion/issues/19950
+statement error DataFusion error: This feature is not implemented: UPDATE ... FROM is not supported
+update t1 as T set b = t2.b, c = t.a, d = 1 from t2 where t.a = t2.a and t.b > 'foo' and t2.c > 1.0;
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index c86921012f9bc..61faf4dc9650f 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -263,30 +263,29 @@ logical_plan
 07)------------SubqueryAlias: _sample_data
 08)--------------Union
 09)----------------Projection: Int64(1) AS a, Utf8("aa") AS b
-10)------------------EmptyRelation
+10)------------------EmptyRelation: rows=1
 11)----------------Projection: Int64(3) AS a, Utf8("aa") AS b
-12)------------------EmptyRelation
+12)------------------EmptyRelation: rows=1
 13)----------------Projection: Int64(5) AS a, Utf8("bb") AS b
-14)------------------EmptyRelation
+14)------------------EmptyRelation: rows=1
 15)----------------Projection: Int64(7) AS a, Utf8("bb") AS b
-16)------------------EmptyRelation
+16)------------------EmptyRelation: rows=1
 physical_plan
 01)SortPreservingMergeExec: [b@0 ASC NULLS LAST]
 02)--SortExec: expr=[b@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[b@0 as b, max(d.a)@1 as max_a]
 04)------AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[max(d.a)]
-05)--------CoalesceBatchesExec: target_batch_size=8192
-06)----------RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4
-07)------------AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[max(d.a)], ordering_mode=Sorted
-08)--------------UnionExec
-09)----------------ProjectionExec: expr=[1 as a, aa as b]
-10)------------------PlaceholderRowExec
-11)----------------ProjectionExec: expr=[3 as a, aa as b]
-12)------------------PlaceholderRowExec
-13)----------------ProjectionExec: expr=[5 as a, bb as b]
-14)------------------PlaceholderRowExec
-15)----------------ProjectionExec: expr=[7 as a, bb as b]
-16)------------------PlaceholderRowExec
+05)--------RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[max(d.a)], ordering_mode=Sorted
+07)------------UnionExec
+08)--------------ProjectionExec: expr=[1 as a, aa as b]
+09)----------------PlaceholderRowExec
+10)--------------ProjectionExec: expr=[3 as a, aa as b]
+11)----------------PlaceholderRowExec
+12)--------------ProjectionExec: expr=[5 as a, bb as b]
+13)----------------PlaceholderRowExec
+14)--------------ProjectionExec: expr=[7 as a, bb as b]
+15)----------------PlaceholderRowExec
 
 # Check actual result:
 query TI
@@ -348,31 +347,30 @@ logical_plan
 09)----------------SubqueryAlias: _sample_data
 10)------------------Union
 11)--------------------Projection: Int64(1) AS a, Utf8("aa") AS b
-12)----------------------EmptyRelation
+12)----------------------EmptyRelation: rows=1
 13)--------------------Projection: Int64(3) AS a, Utf8("aa") AS b
-14)----------------------EmptyRelation
+14)----------------------EmptyRelation: rows=1
 15)--------------------Projection: Int64(5) AS a, Utf8("bb") AS b
-16)----------------------EmptyRelation
+16)----------------------EmptyRelation: rows=1
 17)--------------------Projection: Int64(7) AS a, Utf8("bb") AS b
-18)----------------------EmptyRelation
+18)----------------------EmptyRelation: rows=1
 physical_plan
 01)SortPreservingMergeExec: [b@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[b@0 as b, max(d.a)@1 as max_a, max(d.seq)@2 as max(d.seq)]
 03)----AggregateExec: mode=SinglePartitioned, gby=[b@2 as b], aggr=[max(d.a), max(d.seq)], ordering_mode=Sorted
 04)------ProjectionExec: expr=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as seq, a@0 as a, b@1 as b]
-05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 06)----------SortExec: expr=[b@1 ASC NULLS LAST, a@0 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=8192
-08)--------------RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=4
-09)----------------UnionExec
-10)------------------ProjectionExec: expr=[1 as a, aa as b]
-11)--------------------PlaceholderRowExec
-12)------------------ProjectionExec: expr=[3 as a, aa as b]
-13)--------------------PlaceholderRowExec
-14)------------------ProjectionExec: expr=[5 as a, bb as b]
-15)--------------------PlaceholderRowExec
-16)------------------ProjectionExec: expr=[7 as a, bb as b]
-17)--------------------PlaceholderRowExec
+07)------------RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=4
+08)--------------UnionExec
+09)----------------ProjectionExec: expr=[1 as a, aa as b]
+10)------------------PlaceholderRowExec
+11)----------------ProjectionExec: expr=[3 as a, aa as b]
+12)------------------PlaceholderRowExec
+13)----------------ProjectionExec: expr=[5 as a, bb as b]
+14)------------------PlaceholderRowExec
+15)----------------ProjectionExec: expr=[7 as a, bb as b]
+16)------------------PlaceholderRowExec
 
 
 # check actual result
@@ -942,22 +940,22 @@ CREATE TABLE table1 (
 
 statement ok
 INSERT INTO table1 (bar, foo, time) VALUES
-(200.0, 'me', '1970-01-01T00:00:00.000000010Z'),
-(1.0, 'me', '1970-01-01T00:00:00.000000030Z'),
-(1.0, 'me', '1970-01-01T00:00:00.000000040Z'),
-(2.0, 'you', '1970-01-01T00:00:00.000000020Z');
+(200.0, 'me', '1970-01-01T00:00:00.000000010'),
+(1.0, 'me', '1970-01-01T00:00:00.000000030'),
+(1.0, 'me', '1970-01-01T00:00:00.000000040'),
+(2.0, 'you', '1970-01-01T00:00:00.000000020');
 
 query TP
 SELECT foo, first_value(time ORDER BY time DESC NULLS LAST) AS time FROM table1 GROUP BY foo ORDER BY foo;
 ----
-me 1970-01-01T00:00:00.000000040Z
-you 1970-01-01T00:00:00.000000020Z
+me 1970-01-01T00:00:00.000000040
+you 1970-01-01T00:00:00.000000020
 
 query TP
 SELECT foo, last_value(time ORDER BY time DESC NULLS LAST) AS time FROM table1 GROUP BY foo ORDER BY foo;
 ----
-me 1970-01-01T00:00:00.000000010Z
-you 1970-01-01T00:00:00.000000020Z
+me 1970-01-01T00:00:00.000000010
+you 1970-01-01T00:00:00.000000020
 
 statement ok
 drop table table1;
@@ -1241,9 +1239,9 @@ logical_plan
 05)--------TableScan: aggregate_test_100 projection=[c8, c9]
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum2]
-02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 03)----ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c8@0 ASC NULLS LAST], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c8, c9], file_type=csv, has_header=true
 
@@ -1262,9 +1260,9 @@ logical_plan
 05)--------TableScan: aggregate_test_100 projection=[c2, c9]
 physical_plan
 01)ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true
 
@@ -1286,10 +1284,10 @@ logical_plan
 physical_plan
 01)SortExec: expr=[c2@0 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[false]
-06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int8(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 07)------------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false]
 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true
 
@@ -1311,17 +1309,14 @@ logical_plan
 05)--------TableScan: aggregate_test_100 projection=[c1, c2, c4]
 physical_plan
 01)ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@2 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]
-02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 03)----SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-04)------CoalesceBatchesExec: target_batch_size=4096
-05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-06)----------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]
-07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-09)----------------CoalesceBatchesExec: target_batch_size=4096
-10)------------------RepartitionExec: partitioning=Hash([c1@0, c2@1], 2), input_partitions=2
-11)--------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-12)----------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c4], file_type=csv, has_header=true
+04)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
+05)--------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]
+06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+08)--------------RepartitionExec: partitioning=Hash([c1@0, c2@1], 2), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c4], file_type=csv, has_header=true
 
 
 # test_window_agg_sort_reversed_plan
@@ -1343,9 +1338,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-05)--------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
 query III
@@ -1362,6 +1357,110 @@ SELECT
 4144173353 20935849039 28472563256
 4076864659 24997484146 28118515915
 
+# Only 1 SortExec was added, and limit 100 was turned into limit 10
+query TT
+EXPLAIN SELECT
+    c9,
+    SUM(c9) OVER(ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum1,
+    SUM(c9) OVER(ORDER BY c9 DESC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum2
+    FROM (
+        SELECT c9,
+        FROM aggregate_test_100
+        ORDER BY c9 DESC
+        LIMIT 100
+    )
+    LIMIT 5
+----
+logical_plan
+01)Projection: aggregate_test_100.c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING AS sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING AS sum2
+02)--Limit: skip=0, fetch=5
+03)----WindowAggr: windowExpr=[[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING]]
+04)------WindowAggr: windowExpr=[[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING]]
+05)--------Sort: aggregate_test_100.c9 DESC NULLS FIRST, fetch=100
+06)----------TableScan: aggregate_test_100 projection=[c9]
+physical_plan
+01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
+
+# ensure limit pushdown can handle bigger preceding instead of following
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query III
+SELECT
+    c9,
+    SUM(c9) OVER(ORDER BY c9 ASC ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING) as sum1,
+    SUM(c9) OVER(ORDER BY c9 DESC ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING) as sum2
+    FROM aggregate_test_100
+    LIMIT 5
+----
+4268716378 24997484146 8498370520
+4229654142 29012926487 12714811027
+4216440507 28743001064 16858984380
+4144173353 28472563256 20935849039
+4076864659 28118515915 24997484146
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query III
+SELECT
+    c9,
+    SUM(c9) OVER(ORDER BY c9 ASC ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING) as sum1,
+    SUM(c9) OVER(ORDER BY c9 DESC ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING) as sum2
+    FROM aggregate_test_100
+    LIMIT 5
+----
+4268716378 24997484146 8498370520
+4229654142 29012926487 12714811027
+4216440507 28743001064 16858984380
+4144173353 28472563256 20935849039
+4076864659 28118515915 24997484146
+
+# test_window_agg_sort_reversed_plan
+# Only 1 SortExec was added, limit & skip are pushed down
+query TT
+EXPLAIN SELECT
+    c9,
+    SUM(c9) OVER(ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum1,
+    SUM(c9) OVER(ORDER BY c9 DESC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum2
+    FROM aggregate_test_100
+    LIMIT 5
+    OFFSET 5
+----
+logical_plan
+01)Projection: aggregate_test_100.c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING AS sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING AS sum2
+02)--Limit: skip=5, fetch=5
+03)----WindowAggr: windowExpr=[[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING]]
+04)------WindowAggr: windowExpr=[[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING]]
+05)--------TableScan: aggregate_test_100 projection=[c9]
+physical_plan
+01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2]
+02)--GlobalLimitExec: skip=5, fetch=5
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------SortExec: TopK(fetch=15), expr=[c9@0 DESC], preserve_partitioning=[false]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
+
+query III
+SELECT
+    c9,
+    SUM(c9) OVER(ORDER BY c9 ASC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum1,
+    SUM(c9) OVER(ORDER BY c9 DESC ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING) as sum2
+    FROM aggregate_test_100
+    LIMIT 5
+    OFFSET 5
+----
+4061635107 29012926487 27741341640
+4015442341 28743001064 27423817254
+3998790955 28472563256 27079733310
+3959216334 28118515915 26689577379
+3862393166 27741341640 26284746231
+
 # test_window_agg_sort_reversed_plan_builtin
 query TT
 EXPLAIN SELECT
@@ -1384,8 +1483,8 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as fv1, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as fv2, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as lag1, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as lag2, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as lead1, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as lead2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)), is_causal: false }, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 05)--------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -1427,9 +1526,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as rn1, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as rn2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-04)------SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]
-05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+04)------SortExec: TopK(fetch=10), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 06)----------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -1469,10 +1568,10 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as rn2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-04)------SortExec: expr=[c9@2 ASC NULLS LAST, c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false]
-05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+04)------SortExec: TopK(fetch=10), expr=[c9@2 ASC NULLS LAST, c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 07)------------SortExec: expr=[c9@2 DESC, c1@0 DESC], preserve_partitioning=[false]
 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9], file_type=csv, has_header=true
 
@@ -1551,19 +1650,19 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as a, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as b, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as c, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as d, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@7 as e, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as f, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as g, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as i, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as j, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as l, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as m, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@15 as n, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as o, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as p, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as a1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as b1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as c1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as d1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@9 as e1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as f1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as g1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as j1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as l1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as m1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as n1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as o1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as h11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as j11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as k11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as l11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@10 as m11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as n11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as o11]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
+03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
 04)------ProjectionExec: expr=[c1@0 as c1, c3@2 as c3, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@4 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@6 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@7 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@8 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@9 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@10 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@11 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@12 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@14 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@15 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@18 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 06)----------SortExec: expr=[c3@2 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false]
-07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 08)--------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 ASC], preserve_partitioning=[false]
-09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 10)------------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 DESC], preserve_partitioning=[false]
-11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }]
-12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
+11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }]
+12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
 13)------------------------SortExec: expr=[c3@2 DESC NULLS LAST], preserve_partitioning=[false]
-14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
-15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }]
+15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 16)------------------------------SortExec: expr=[c3@2 DESC, c1@0 ASC NULLS LAST], preserve_partitioning=[false]
 17)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/null_cases.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
 
@@ -1637,9 +1736,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true
 
 
@@ -1681,9 +1780,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true
 
 query III
@@ -1727,9 +1826,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c3@1 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }]
+03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }]
 04)------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, c3@2 as c3, c9@3 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 06)----------SortPreservingMergeExec: [__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST]
 07)------------SortExec: expr=[__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
 08)--------------ProjectionExec: expr=[c3@1 + c4@2 as __common_expr_1, c2@0 as c2, c3@1 as c3, c9@3 as c9]
@@ -1779,13 +1878,11 @@ physical_plan
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
 05)--------ProjectionExec: expr=[]
 06)----------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[]
-07)------------CoalesceBatchesExec: target_batch_size=4096
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-09)----------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[]
-10)------------------CoalesceBatchesExec: target_batch_size=4096
-11)--------------------FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434, projection=[c1@0]
-12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-13)------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c13], file_type=csv, has_header=true
+07)------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
+08)--------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[]
+09)----------------FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434, projection=[c1@0]
+10)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c13], file_type=csv, has_header=true
 
 
 query I
@@ -1821,16 +1918,14 @@ logical_plan
 06)----------TableScan: aggregate_test_100 projection=[c2, c3, c9]
 physical_plan
 01)SortPreservingMergeExec: [c3@0 ASC NULLS LAST], fetch=5
-02)--ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum2]
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-04)------SortExec: expr=[c3@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[true]
-05)--------CoalesceBatchesExec: target_batch_size=4096
-06)----------RepartitionExec: partitioning=Hash([c3@0], 2), input_partitions=2
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------ProjectionExec: expr=[c3@1 as c3, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-09)----------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-10)------------------SortExec: expr=[c3@1 DESC, c9@2 DESC, c2@0 ASC NULLS LAST], preserve_partitioning=[false]
-11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3, c9], file_type=csv, has_header=true
+02)--SortExec: TopK(fetch=5), expr=[c3@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum2]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------RepartitionExec: partitioning=Hash([c3@0], 2), input_partitions=1, maintains_sort_order=true
+06)----------ProjectionExec: expr=[c3@1 as c3, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+08)--------------SortExec: expr=[c3@1 DESC, c9@2 DESC, c2@0 ASC NULLS LAST], preserve_partitioning=[false]
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3, c9], file_type=csv, has_header=true
 
 
 
@@ -1839,7 +1934,7 @@ SELECT c3,
     SUM(c9) OVER(ORDER BY c3 DESC, c9 DESC, c2 ASC) as sum1,
     SUM(c9) OVER(PARTITION BY c3 ORDER BY c9 DESC ) as sum2
     FROM aggregate_test_100
-    ORDER BY c3
+    ORDER BY c3, c9 DESC
     LIMIT 5
 ----
 -117 219796664156 3023531799
@@ -1864,12 +1959,10 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]
-03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
 04)------SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true]
-05)--------CoalesceBatchesExec: target_batch_size=4096
-06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
+05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
 
 query TI
 SELECT c1, ROW_NUMBER() OVER (PARTITION BY c1) as rn1 FROM aggregate_test_100 ORDER BY c1 ASC
@@ -1993,12 +2086,10 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, rn1@1 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]
-03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
 04)------SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true]
-05)--------CoalesceBatchesExec: target_batch_size=4096
-06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
+05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
 
 statement ok
 set datafusion.optimizer.repartition_sorts = true;
@@ -2019,15 +2110,13 @@ logical_plan
 physical_plan
 01)SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2]
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 04)------SortPreservingMergeExec: [c9@1 ASC NULLS LAST]
 05)--------SortExec: expr=[c9@1 ASC NULLS LAST], preserve_partitioning=[true]
-06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(3)), is_causal: false }], mode=[Sorted]
+06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING], mode=[Sorted]
 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[true]
-08)--------------CoalesceBatchesExec: target_batch_size=4096
-09)----------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-10)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true
+08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true
 
 # test_window_agg_with_global_limit
 statement ok
@@ -2046,7 +2135,7 @@ physical_plan
 02)--AggregateExec: mode=Final, gby=[], aggr=[array_agg(aggregate_test_100.c13)]
 03)----CoalescePartitionsExec
 04)------AggregateExec: mode=Partial, gby=[], aggr=[array_agg(aggregate_test_100.c13)]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
 06)----------SortExec: TopK(fetch=1), expr=[c13@0 ASC NULLS LAST], preserve_partitioning=[false]
 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c13], file_type=csv, has_header=true
 
@@ -2107,11 +2196,11 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4]
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 04)------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c9@3 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]
-05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false]
 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true
 
@@ -2162,15 +2251,14 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@1 as c9, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 04)------ProjectionExec: expr=[c2@0 as c2, c9@2 as c9, c1_alias@3 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]
-05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 06)----------ProjectionExec: expr=[c2@1 as c2, c8@2 as c8, c9@3 as c9, c1_alias@4 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING]
-07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false]
-10)------------------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c8@2 as c8, c9@3 as c9, c1@0 as c1_alias]
-11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9, c1@0 as c1_alias], file_type=csv, has_header=true
 
 query IIIII
 SELECT c9,
@@ -2208,9 +2296,9 @@ physical_plan
 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2]
 02)--SortExec: TopK(fetch=5), expr=[c9@2 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum1, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING@4 as sum2, c9@1 as c9]
-04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING: Ok(Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Groups, start_bound: Preceding(UInt64(5)), end_bound: Preceding(UInt64(3)), is_causal: true }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING": nullable Float64 }, frame: GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING], mode=[Sorted]
 05)--------ProjectionExec: expr=[c1@0 as c1, c9@2 as c9, c12@3 as c12, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING]
-06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Groups, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false]
 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9, c12], file_type=csv, has_header=true
 
@@ -2244,7 +2332,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2281,7 +2369,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2318,7 +2406,7 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[rn1@1 DESC], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2358,7 +2446,7 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[rn1@1 ASC NULLS LAST, c9@0 ASC NULLS LAST], preserve_partitioning=[false], sort_prefix=[rn1@1 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2433,7 +2521,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2455,7 +2543,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c5@0 as c5, c9@1 as c9, row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rn1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Decimal128(None,21,0)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[CAST(c9@1 AS Decimal128(20, 0)) + CAST(c5@0 AS Decimal128(20, 0)) DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5, c9], file_type=csv, has_header=true
 
@@ -2476,7 +2564,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, CAST(row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 AS Int64) as rn1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -2492,16 +2580,16 @@ statement ok
 set datafusion.optimizer.skip_failed_rules = true
 
 # Error is returned from the physical plan.
-query error Cannot cast Utf8\("1 DAY"\) to Int8
+query error Cannot cast 1 DAY to Int8
 SELECT
   COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN '1 DAY' PRECEDING AND '2 DAY' FOLLOWING)
   FROM aggregate_test_100;
 
 statement ok
-set datafusion.optimizer.skip_failed_rules = true
+set datafusion.optimizer.skip_failed_rules = false
 
 # Error is returned from the logical plan.
-query error Cannot cast Utf8\("1 DAY"\) to Int8
+query error Cannot cast 1 DAY to Int8
 SELECT
   COUNT(c1) OVER (ORDER BY c2 RANGE BETWEEN '1 DAY' PRECEDING AND '2 DAY' FOLLOWING)
   FROM aggregate_test_100;
@@ -2581,12 +2669,11 @@ physical_plan
 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, sum3@2 as sum3, min1@3 as min1, min2@4 as min2, min3@5 as min3, max1@6 as max1, max2@7 as max2, max3@8 as max3, cnt1@9 as cnt1, cnt2@10 as cnt2, sumr1@11 as sumr1, sumr2@12 as sumr2, sumr3@13 as sumr3, minr1@14 as minr1, minr2@15 as minr2, minr3@16 as minr3, maxr1@17 as maxr1, maxr2@18 as maxr2, maxr3@19 as maxr3, cntr1@20 as cntr1, cntr2@21 as cntr2, sum4@22 as sum4, cnt3@23 as cnt3]
 02)--SortExec: TopK(fetch=5), expr=[inc_col@24 DESC], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as sum1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@14 as sum2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@15 as sum3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as min1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as min2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as min3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as max1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as max2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as max3, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@22 as cnt1, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@23 as cnt2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@2 as sumr1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@3 as sumr2, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sumr3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as minr1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@6 as minr2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@7 as minr3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as maxr1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as maxr2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as maxr3, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@11 as cntr1, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@12 as cntr2, sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@24 as sum4, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@25 as cnt3, inc_col@1 as inc_col]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, inc_col@3 as inc_col, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@5 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@6 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@7 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@12 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@13 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@14 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@15 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@22 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@23 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@25 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING]
-06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)), is_causal: false }, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(5)), end_bound: Following(Int32(1)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(4)), end_bound: Following(Int32(8)), is_causal: false }, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(8)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(4)), end_bound: Following(Int32(1)), is_causal: false }, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(8)), end_bound: Following(Int32(1)), is_causal: false }, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(1)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(5)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(5)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(2)), end_bound: Following(Int32(6)), is_causal: false }, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(8)), is_causal: false }], mode=[Sorted]
-08)--------------ProjectionExec: expr=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col, desc_col@2 as desc_col]
-09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col, desc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true
+06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING": Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts, inc_col, desc_col], output_ordering=[ts@2 ASC NULLS LAST], file_type=csv, has_header=true
 
 query IIIIIIIIIIIIIIIIIIIIIIII
 SELECT
@@ -2667,8 +2754,8 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[ts@0 DESC], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[ts@0 as ts, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2]
-03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)), is_causal: false }, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING], mode=[Sorted]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query IIIIIIIIIIIIIIIIIIIIIIIII
@@ -2739,10 +2826,9 @@ physical_plan
 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, min1@2 as min1, min2@3 as min2, max1@4 as max1, max2@5 as max2, count1@6 as count1, count2@7 as count2, avg1@8 as avg1, avg2@9 as avg2]
 02)--SortExec: TopK(fetch=5), expr=[inc_col@10 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as sum1, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as min1, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as min2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as max1, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as max2, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@12 as count1, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@7 as count2, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@13 as avg1, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@8 as avg2, inc_col@3 as inc_col]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)), is_causal: false }, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Ok(Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)), is_causal: false }, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Ok(Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(5)), is_causal: false }], mode=[Sorted]
-05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)), is_causal: false }, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)), is_causal: false }, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)), is_causal: false }, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)), is_causal: false }, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: Following(Int32(3)), is_causal: false }], mode=[Sorted]
-06)----------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col]
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted]
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts, inc_col], output_ordering=[ts@2 ASC NULLS LAST], file_type=csv, has_header=true
 
 query IIIIIIIIRR
 SELECT
@@ -2791,8 +2877,8 @@ physical_plan
 01)ProjectionExec: expr=[first_value1@0 as first_value1, first_value2@1 as first_value2, last_value1@2 as last_value1, last_value2@3 as last_value2, nth_value1@4 as nth_value1]
 02)--SortExec: TopK(fetch=5), expr=[inc_col@5 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@4 as first_value1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@2 as first_value2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as last_value1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as last_value2, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as nth_value1, inc_col@1 as inc_col]
-04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-05)--------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query IIIII
@@ -2835,8 +2921,8 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted]
 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col]
 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST]
 
@@ -2880,8 +2966,8 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Ok(Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(3)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted]
 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col]
 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST]
 
@@ -2980,12 +3066,12 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Preceding(UInt64(1)), is_causal: true }], mode=[Linear]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)), is_causal: false }], mode=[PartiallySorted([1, 0])]
-05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[PartiallySorted([0])]
-07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow, is_causal: true }], mode=[PartiallySorted([0, 1])]
-08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Linear]
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[PartiallySorted([1, 0])]
+05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[PartiallySorted([0])]
+07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0, 1])]
+08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 09)----------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d]
 10)------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST]
 
@@ -3048,19 +3134,18 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[c@2 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12]
-03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Preceding(UInt64(1)), is_causal: true }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Sorted]
 04)------SortExec: expr=[d@4 ASC NULLS LAST, a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false]
-05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)), is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Sorted]
 06)----------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false]
-07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
 08)--------------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false]
-09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
+09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[Sorted]
 10)------------------SortExec: expr=[a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false]
-11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]
+11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted]
 12)----------------------SortExec: expr=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false]
-13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)), is_causal: false }, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)), is_causal: false }], mode=[Sorted]
-14)--------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d]
-15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true
+13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted]
+14)--------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[CAST(c@3 AS Int64) as __common_expr_1, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true
 
 query IIIIIIIIIIIIIII
 SELECT a, b, c,
@@ -3120,10 +3205,9 @@ logical_plan
 06)----------TableScan: annotated_data_infinite2 projection=[a0, a, b, c, d]
 physical_plan
 01)ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as rn1]
-02)--CoalesceBatchesExec: target_batch_size=4096, fetch=5
-03)----FilterExec: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 < 50
-04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-05)--------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST]
+02)--FilterExec: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 < 50, fetch=5
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST]
 
 # Top level sort is pushed down through BoundedWindowAggExec as its SUM result does already satisfy the required
 # global order. The existing sort is for the second-term lexicographical ordering requirement, which is being
@@ -3144,7 +3228,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
@@ -3229,11 +3313,11 @@ logical_plan
 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d]
 physical_plan
 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4]
-02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]
+02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear]
 03)----ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[PartiallySorted([0])]
-06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])]
+06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 07)------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d]
 08)--------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST]
 
@@ -3260,22 +3344,18 @@ logical_plan
 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d]
 physical_plan
 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4]
-02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]
-03)----CoalesceBatchesExec: target_batch_size=4096
-04)------RepartitionExec: partitioning=Hash([d@2], 2), input_partitions=2, preserve_order=true, sort_exprs=__common_expr_1@0 ASC NULLS LAST, a@1 ASC NULLS LAST
-05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-07)------------CoalesceBatchesExec: target_batch_size=4096
-08)--------------RepartitionExec: partitioning=Hash([b@2, a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
-09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[PartiallySorted([0])]
-10)------------------CoalesceBatchesExec: target_batch_size=4096
-11)--------------------RepartitionExec: partitioning=Hash([a@1, d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
-12)----------------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-13)------------------------CoalesceBatchesExec: target_batch_size=4096
-14)--------------------------RepartitionExec: partitioning=Hash([a@1, b@2], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
-15)----------------------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d]
-16)------------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-17)--------------------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST]
+02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear]
+03)----RepartitionExec: partitioning=Hash([d@2], 2), input_partitions=2, preserve_order=true, sort_exprs=__common_expr_1@0 ASC NULLS LAST, a@1 ASC NULLS LAST
+04)------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+06)----------RepartitionExec: partitioning=Hash([b@2, a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
+07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])]
+08)--------------RepartitionExec: partitioning=Hash([a@1, d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
+09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+10)------------------RepartitionExec: partitioning=Hash([a@1, b@2], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST
+11)--------------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d]
+12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
+13)------------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST]
 
 # reset the partition number 1 again
 statement ok
@@ -3329,10 +3409,10 @@ logical_plan
 physical_plan
 01)SortExec: TopK(fetch=5), expr=[c3@0 ASC NULLS LAST], preserve_partitioning=[false]
 02)--ProjectionExec: expr=[c3@0 as c3, max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as max1]
-03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false]
 05)--------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
-06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 07)------------SortExec: expr=[c11@1 ASC NULLS LAST], preserve_partitioning=[false]
 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], file_type=csv, has_header=true
 
@@ -3373,7 +3453,7 @@ physical_plan
 01)ProjectionExec: expr=[min1@0 as min1, max1@1 as max1]
 02)--SortExec: TopK(fetch=5), expr=[c3@2 ASC NULLS LAST], preserve_partitioning=[false]
 03)----ProjectionExec: expr=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min1, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max1, c3@0 as c3]
-04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow, is_causal: false }, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false]
 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c12], file_type=csv, has_header=true
 
@@ -3425,10 +3505,9 @@ logical_plan
 02)--Filter: multiple_ordered_table.b = Int32(0)
 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)]
 physical_plan
-01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-02)--CoalesceBatchesExec: target_batch_size=4096
-03)----FilterExec: b@2 = 0
-04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true
+01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+02)--FilterExec: b@2 = 0
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true
 
 # Since column b is constant after filter b=0,
 # window requirement b ASC, d ASC can be satisfied
@@ -3443,11 +3522,10 @@ logical_plan
 02)--Filter: multiple_ordered_table.b = Int32(0)
 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)]
 physical_plan
-01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 02)--SortExec: expr=[d@4 ASC NULLS LAST], preserve_partitioning=[false]
-03)----CoalesceBatchesExec: target_batch_size=4096
-04)------FilterExec: b@2 = 0
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true
+03)----FilterExec: b@2 = 0
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true
 
 
 # Create an unbounded source where there is multiple orderings.
@@ -3480,9 +3558,9 @@ logical_plan
 05)--------TableScan: multiple_ordered_table projection=[a, b, c, d]
 physical_plan
 01)ProjectionExec: expr=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max1]
-02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 03)----ProjectionExec: expr=[c@2 as c, d@3 as d, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query TT
@@ -3499,10 +3577,9 @@ logical_plan
 04)------TableScan: multiple_ordered_table projection=[c, d], partial_filters=[multiple_ordered_table.d = Int32(0)]
 physical_plan
 01)ProjectionExec: expr=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max_c]
-02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-03)----CoalesceBatchesExec: target_batch_size=4096
-04)------FilterExec: d@1 = 0
-05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
+02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----FilterExec: d@1 = 0
+04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 explain SELECT SUM(d) OVER(PARTITION BY c ORDER BY a ASC)
@@ -3514,7 +3591,7 @@ logical_plan
 03)----TableScan: multiple_ordered_table projection=[a, c, d]
 physical_plan
 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query TT
@@ -3527,7 +3604,7 @@ logical_plan
 03)----TableScan: multiple_ordered_table projection=[a, b, c, d]
 physical_plan
 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true
 
 query I
@@ -3569,7 +3646,7 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c@0 as c, nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nv1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }]
+03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }]
 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query II
@@ -3620,11 +3697,9 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c@3 ASC NULLS LAST]
 02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW@5 as avg_d]
-03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Ok(Field { name: "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: CurrentRow, is_causal: false }], mode=[Linear]
-04)------CoalesceBatchesExec: target_batch_size=4096
-05)--------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]]
+03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Field { "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN 10 PRECEDING AND CURRENT ROW], mode=[Linear]
+04)------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=1, maintains_sort_order=true
+05)--------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]]
 
 # CTAS with NTILE function
 statement ok
@@ -3955,7 +4030,7 @@ logical_plan
 03)----TableScan: table_with_pk projection=[sn, ts, currency, amount]
 physical_plan
 01)ProjectionExec: expr=[sn@0 as sn, ts@1 as ts, currency@2 as currency, amount@3 as amount, sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1]
-02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]
+02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 03)----SortExec: expr=[sn@0 ASC NULLS LAST], preserve_partitioning=[false]
 04)------DataSourceExec: partitions=1, partition_sizes=[1]
 
@@ -4074,9 +4149,9 @@ logical_plan
 physical_plan
 01)ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2, sum1@3 as sum1]
 02)--GlobalLimitExec: skip=0, fetch=5
-03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }]
+03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }]
 04)------ProjectionExec: expr=[c3@0 as c3, c4@1 as c4, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1]
-05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int16(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 06)----------SortExec: expr=[c3@0 + c4@1 DESC], preserve_partitioning=[false]
 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c4, c9], file_type=csv, has_header=true
 
@@ -4115,13 +4190,10 @@ logical_plan
 04)------TableScan: a projection=[a]
 physical_plan
 01)ProjectionExec: expr=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
-02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-03)----CoalesceBatchesExec: target_batch_size=4096
-04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------CoalesceBatchesExec: target_batch_size=4096
-07)------------FilterExec: a@0 = 1
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+03)----RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=1
+04)------FilterExec: a@0 = 1
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query I
 select ROW_NUMBER() over (partition by a) from (select * from a where a = 1);
@@ -4138,13 +4210,10 @@ logical_plan
 04)------TableScan: a projection=[a]
 physical_plan
 01)ProjectionExec: expr=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
-02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted]
-03)----CoalesceBatchesExec: target_batch_size=4096
-04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2
-05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-06)----------CoalesceBatchesExec: target_batch_size=4096
-07)------------FilterExec: a@0 = 1
-08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
+02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted]
+03)----RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=1
+04)------FilterExec: a@0 = 1
+05)--------DataSourceExec: partitions=1, partition_sizes=[1]
 
 # LAG window function IGNORE/RESPECT NULLS support with ascending order and default offset 1
 query TTTTTT
@@ -4318,9 +4387,9 @@ LIMIT 5;
 ----
 78 50
 63 38
-3 53
+NULL 19
 24 31
-14 94
+24 56
 
 # result should be same with above, when LAG/LEAD algorithm work with pruned data.
 # decreasing batch size, causes data to be produced in smaller chunks at the source.
@@ -4337,9 +4406,12 @@ LIMIT 5;
 ----
 78 50
 63 38
-3 53
+NULL 19
 24 31
-14 94
+24 56
+
+statement ok
+set datafusion.execution.batch_size = 100;
 
 # Tests schema and data are in sync for mixed nulls and not nulls values for builtin window function
 query T
@@ -4938,11 +5010,11 @@ FROM (SELECT c1, c2, ROW_NUMBER() OVER() as rn
     FROM t
     LIMIT 5)
 GROUP BY rn
-ORDER BY rn;
+ORDER BY 1, 2, 3
 ----
 1 a 1
-2 b 2
 1 a 3
+2 b 2
 3 NULL 4
 NULL a4 5
 
@@ -5181,6 +5253,10 @@ order by c1;
 3 1 1
 3 10 2
 
+
+statement ok
+set datafusion.execution.batch_size = 1;
+
 # push filter since it uses a partition column
 query TT
 explain select c1, c2, rank
@@ -5200,14 +5276,12 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank]
-03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 04)------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-05)--------CoalesceBatchesExec: target_batch_size=1
-06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-07)------------CoalesceBatchesExec: target_batch_size=1
-08)--------------FilterExec: c1@0 = 2 OR c1@0 = 3
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
+06)----------FilterExec: c1@0 = 2 OR c1@0 = 3
+07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+08)--------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query III
 select c1, c2, rank
@@ -5242,14 +5316,11 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank]
-03)----CoalesceBatchesExec: target_batch_size=1
-04)------FilterExec: c2@1 >= 10
-05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=1
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----FilterExec: c2@1 >= 10
+04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query III
 select c1, c2, rank
@@ -5284,16 +5355,13 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank]
-03)----CoalesceBatchesExec: target_batch_size=1
-04)------FilterExec: c2@1 = 10
-05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-06)----------SortExec: expr=[c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=1
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-09)----------------CoalesceBatchesExec: target_batch_size=1
-10)------------------FilterExec: c1@0 = 1
-11)--------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-12)----------------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----FilterExec: c2@1 = 10
+04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------SortExec: expr=[c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
+07)------------FilterExec: c1@0 = 1
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query III
 select c1, c2, rank
@@ -5325,14 +5393,11 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank]
-03)----CoalesceBatchesExec: target_batch_size=1
-04)------FilterExec: c1@0 = 1 OR c2@1 = 10
-05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-07)------------CoalesceBatchesExec: target_batch_size=1
-08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-10)------------------DataSourceExec: partitions=1, partition_sizes=[1]
+03)----FilterExec: c1@0 = 1 OR c2@1 = 10
+04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+07)------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query III
 select c1, c2, rank
@@ -5370,18 +5435,15 @@ physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST]
 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rank2]
-04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)----------CoalesceBatchesExec: target_batch_size=1
-07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2
-08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-10)------------------CoalesceBatchesExec: target_batch_size=1
-11)--------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-12)----------------------CoalesceBatchesExec: target_batch_size=1
-13)------------------------FilterExec: c1@0 > 1
-14)--------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-15)----------------------------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2
+07)------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+09)----------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
+10)------------------FilterExec: c1@0 > 1
+11)--------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+12)----------------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IIII
 select c1, c2, rank1, rank2
@@ -5421,18 +5483,14 @@ physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST]
 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true]
 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rank2]
-04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true]
-06)----------CoalesceBatchesExec: target_batch_size=1
-07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2
-08)--------------CoalesceBatchesExec: target_batch_size=1
-09)----------------FilterExec: c2@1 > 1
-10)------------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
-11)--------------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
-12)----------------------CoalesceBatchesExec: target_batch_size=1
-13)------------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2
-14)--------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-15)----------------------------DataSourceExec: partitions=1, partition_sizes=[1]
+06)----------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2
+07)------------FilterExec: c2@1 > 1
+08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true]
+10)------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1
+11)--------------------DataSourceExec: partitions=1, partition_sizes=[1]
 
 query IIII
 select c1, c2, rank1, rank2
@@ -5488,11 +5546,9 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, sum_c9@1 ASC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as sum_c9]
-03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-04)------CoalesceBatchesExec: target_batch_size=1
-05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true
+03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+04)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1, maintains_sort_order=true
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT SUM(c9) OVER() as sum_c9 FROM aggregate_test_100_ordered ORDER BY sum_c9;
@@ -5504,9 +5560,10 @@ logical_plan
 04)------TableScan: aggregate_test_100_ordered projection=[c9]
 physical_plan
 01)ProjectionExec: expr=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as sum_c9]
-02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true
 
+
 query TT
 EXPLAIN SELECT c1, MIN(c5) OVER(PARTITION BY c1) as min_c5 FROM aggregate_test_100_ordered ORDER BY c1, min_c5 DESC NULLS LAST;
 ----
@@ -5518,11 +5575,9 @@ logical_plan
 physical_plan
 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, min_c5@1 DESC NULLS LAST]
 02)--ProjectionExec: expr=[c1@0 as c1, min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as min_c5]
-03)----WindowAggExec: wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-04)------CoalesceBatchesExec: target_batch_size=1
-05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST
-06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
-07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c5], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true
+03)----WindowAggExec: wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+04)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=1, maintains_sort_order=true
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c5], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true
 
 query TT
 EXPLAIN SELECT MAX(c5) OVER() as max_c5 FROM aggregate_test_100_ordered ORDER BY max_c5;
@@ -5534,7 +5589,7 @@ logical_plan
 04)------TableScan: aggregate_test_100_ordered projection=[c5]
 physical_plan
 01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5]
-02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true
 
 query II rowsort
@@ -5642,3 +5697,433 @@ WINDOW
 3 7
 4 11
 5 16
+
+
+# window with distinct operation
+statement ok
+CREATE TABLE table_test_distinct_count (
+    k VARCHAR,
+    v Int,
+    time TIMESTAMP WITH TIME ZONE
+);
+
+statement ok
+INSERT INTO table_test_distinct_count (k, v, time) VALUES
+    ('a', 1, '1970-01-01T00:01:00.00'),
+    ('a', 1, '1970-01-01T00:02:00.00'),
+    ('a', 1, '1970-01-01T00:03:00.00'),
+    ('a', 2, '1970-01-01T00:03:00.00'),
+    ('a', 1, '1970-01-01T00:04:00.00'),
+    ('b', 3, '1970-01-01T00:01:00.00'),
+    ('b', 3, '1970-01-01T00:02:00.00'),
+    ('b', 4, '1970-01-01T00:03:00.00'),
+    ('b', 4, '1970-01-01T00:03:00.00');
+
+query TPII
+SELECT
+    k,
+    time,
+    COUNT(v) OVER (
+        PARTITION BY k
+        ORDER BY time
+        RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+    ) AS normal_count,
+    COUNT(DISTINCT v) OVER (
+        PARTITION BY k
+        ORDER BY time
+        RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+    ) AS distinct_count
+FROM table_test_distinct_count
+ORDER BY k, time;
+----
+a 1970-01-01T00:01:00 1 1
+a 1970-01-01T00:02:00 2 1
+a 1970-01-01T00:03:00 4 2
+a 1970-01-01T00:03:00 4 2
+a 1970-01-01T00:04:00 4 2
+b 1970-01-01T00:01:00 1 1
+b 1970-01-01T00:02:00 2 1
+b 1970-01-01T00:03:00 4 2
+b 1970-01-01T00:03:00 4 2
+
+
+query TT
+EXPLAIN SELECT
+    k,
+    time,
+    COUNT(v) OVER (
+        PARTITION BY k
+        ORDER BY time
+        RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+    ) AS normal_count,
+    COUNT(DISTINCT v) OVER (
+        PARTITION BY k
+        ORDER BY time
+        RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+    ) AS distinct_count
+FROM table_test_distinct_count
+ORDER BY k, time;
+----
+logical_plan
+01)Sort: table_test_distinct_count.k ASC NULLS LAST, table_test_distinct_count.time ASC NULLS LAST
+02)--Projection: table_test_distinct_count.k, table_test_distinct_count.time, count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW AS normal_count, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW AS distinct_count
+03)----WindowAggr: windowExpr=[[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW AS count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW AS count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW]]
+04)------TableScan: table_test_distinct_count projection=[k, v, time]
+physical_plan
+01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST]
+02)--ProjectionExec: expr=[k@0 as k, time@2 as time, count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@3 as normal_count, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as distinct_count]
+03)----BoundedWindowAggExec: wdw=[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: expr=[k@0 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([k@0], 2), input_partitions=2
+06)----------DataSourceExec: partitions=2, partition_sizes=[5, 4]
+
+
+# Add testing for distinct sum
+query TPII
+SELECT
+  k,
+  time,
+  SUM(v) OVER (
+    PARTITION BY k
+    ORDER BY time
+    RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+  )               AS sum_v,
+  SUM(DISTINCT v) OVER (
+    PARTITION BY k
+    ORDER BY time
+    RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+  )       AS sum_distinct_v
+FROM table_test_distinct_count
+ORDER BY k, time;
+----
+a 1970-01-01T00:01:00 1 1
+a 1970-01-01T00:02:00 2 1
+a 1970-01-01T00:03:00 5 3
+a 1970-01-01T00:03:00 5 3
+a 1970-01-01T00:04:00 5 3
+b 1970-01-01T00:01:00 3 3
+b 1970-01-01T00:02:00 6 3
+b 1970-01-01T00:03:00 14 7
+b 1970-01-01T00:03:00 14 7
+
+
+
+query TT
+EXPLAIN SELECT
+  k,
+  time,
+  SUM(v) OVER (
+    PARTITION BY k
+    ORDER BY time
+    RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+  )               AS sum_v,
+  SUM(DISTINCT v) OVER (
+    PARTITION BY k
+    ORDER BY time
+    RANGE BETWEEN INTERVAL '2 minutes' PRECEDING AND CURRENT ROW
+  )       AS sum_distinct_v
+FROM table_test_distinct_count
+ORDER BY k, time;
+----
+logical_plan
+01)Sort: table_test_distinct_count.k ASC NULLS LAST, table_test_distinct_count.time ASC NULLS LAST
+02)--Projection: table_test_distinct_count.k, table_test_distinct_count.time, sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW AS sum_v, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW AS sum_distinct_v
+03)----WindowAggr: windowExpr=[[sum(__common_expr_1) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW AS sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW, sum(DISTINCT __common_expr_1) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW AS sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW]]
+04)------Projection: CAST(table_test_distinct_count.v AS Int64) AS __common_expr_1, table_test_distinct_count.k, table_test_distinct_count.time
+05)--------TableScan: table_test_distinct_count projection=[k, v, time]
+physical_plan
+01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST]
+02)--ProjectionExec: expr=[k@1 as k, time@2 as time, sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@3 as sum_v, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as sum_distinct_v]
+03)----BoundedWindowAggExec: wdw=[sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: expr=[k@1 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([k@1], 2), input_partitions=2
+06)----------ProjectionExec: expr=[CAST(v@1 AS Int64) as __common_expr_1, k@0 as k, time@2 as time]
+07)------------DataSourceExec: partitions=2, partition_sizes=[5, 4]
+
+
+# FILTER clause with window functions
+
+# Verify FILTER clause with non-aggregate window functions fails with a clear message
+query error DataFusion error: Error during planning: FILTER clause can only be used with aggregate window functions\. Found in 'row_number\(\) FILTER \(WHERE test\.c1 > Int64\(0\)\) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING'
+SELECT
+c1,
+ROW_NUMBER() FILTER(WHERE c1 > 0) OVER () as rn1
+FROM test
+LIMIT 5
+
+
+query error DataFusion error: Error during planning: FILTER clause can only be used with aggregate window functions\. Found in 'first_value\(test\.c1\) FILTER \(WHERE test\.c1 > Int64\(0\)\) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING'
+SELECT
+c1,
+FIRST_VALUE(c1) FILTER(WHERE c1 > 0) OVER () as rn1
+FROM test
+LIMIT 5
+
+
+query error DataFusion error: Error during planning: FILTER clause can only be used with aggregate window functions\. Found in 'lag\(test\.c1\) FILTER \(WHERE test\.c1 > Int64\(0\)\) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING'
+SELECT
+c1,
+LAG(c1) FILTER(WHERE c1 > 0) OVER () as rn1
+FROM test
+LIMIT 5
+
+
+# Check error propagation from filter to window function
+query error
+SELECT
+c1,
+SUM(c2) FILTER (WHERE c2 >= []) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum1,
+FROM test
+LIMIT 5
+----
+DataFusion error: type_coercion
+caused by
+Error during planning: Cannot infer common argument type for comparison operation Int64 >= List(Null)
+
+
+
+# EXPLAIN should display the filters
+query TT
+EXPLAIN SELECT
+c1,
+c2,
+SUM(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum1,
+SUM(c2) FILTER (WHERE c2 >= 2 AND c2 < 4 AND c1 > 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum2,
+COUNT(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as count1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 2 AND c2 < 4 AND c1 > 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg2,
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+logical_plan
+01)Sort: test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST, fetch=5
+02)--Projection: test.c1, test.c2, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS sum1, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS sum2, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS count1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS array_agg1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS array_agg2
+03)----WindowAggr: windowExpr=[[sum(test.c2) FILTER (WHERE __common_expr_1 AS test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE __common_expr_2) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE __common_expr_1 AS test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE __common_expr_1 AS test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE __common_expr_2) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------Projection: __common_expr_3 AS __common_expr_1, __common_expr_3 AND test.c2 < Int64(4) AND test.c1 > Int32(0) AS __common_expr_2, test.c1, test.c2
+05)--------Projection: test.c2 >= Int64(2) AS __common_expr_3, test.c1, test.c2
+06)----------TableScan: test projection=[c1, c2]
+physical_plan
+01)ProjectionExec: expr=[c1@2 as c1, c2@3 as c2, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as count1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as array_agg1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as array_agg2]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortPreservingMergeExec: [c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], fetch=5
+05)--------SortExec: TopK(fetch=5), expr=[c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-0.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-1.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-2.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-3.csv]]}, projection=[c2@1 >= 2 as __common_expr_1, c2@1 >= 2 AND c2@1 < 4 AND c1@0 > 0 as __common_expr_2, c1, c2], file_type=csv, has_header=false
+
+
+# FILTER filters out some rows
+query IIIII??
+SELECT
+c1,
+c2,
+SUM(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum1,
+SUM(c2) FILTER (WHERE c2 >= 2 AND c2 < 4 AND c1 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum2,
+COUNT(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as count1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 2) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 2 AND c2 < 4 AND c1 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg2,
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 NULL NULL 0 NULL NULL
+0 1 NULL NULL 0 NULL NULL
+0 2 2 2 1 [2] [2]
+0 3 5 5 2 [2, 3] [2, 3]
+0 4 9 5 3 [2, 3, 4] [2, 3]
+
+
+# FILTER filters out no rows
+query IIIII??
+SELECT
+c1,
+c2,
+SUM(c2) FILTER (WHERE c2 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum1,
+SUM(c2) FILTER (WHERE c2 >= 0 AND c2 < 1000 AND c1 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum2,
+COUNT(c2) FILTER (WHERE c2 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as count1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 0 AND c2 < 1000 AND c1 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg2,
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 0 0 1 [0] [0]
+0 1 1 1 2 [0, 1] [0, 1]
+0 2 3 3 3 [0, 1, 2] [0, 1, 2]
+0 3 6 6 4 [0, 1, 2, 3] [0, 1, 2, 3]
+0 4 10 10 5 [0, 1, 2, 3, 4] [0, 1, 2, 3, 4]
+
+
+# FILTER filters out every row
+query IIIII??
+SELECT
+c1,
+c2,
+SUM(c2) FILTER (WHERE c2 == -1) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum1,
+SUM(c2) FILTER (WHERE c2 >= 0 AND c2 < 0 AND c1 >= 0) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sum2,
+COUNT(c2) FILTER (WHERE c2 == -1) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as count1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 1000) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg1,
+ARRAY_AGG(c2) FILTER (WHERE c2 >= 0 AND c2 < 1000 AND c1 >= 1000) OVER (ORDER BY c1, c2 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as array_agg2,
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 NULL NULL 0 NULL NULL
+0 1 NULL NULL 0 NULL NULL
+0 2 NULL NULL 0 NULL NULL
+0 3 NULL NULL 0 NULL NULL
+0 4 NULL NULL 0 NULL NULL
+
+# regression test for https://github.com/apache/datafusion/issues/17401
+query I
+WITH source AS (
+    SELECT
+        1 AS n,
+        '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8,
+        '' AS a9, '' AS a10, '' AS a11, '' AS a12
+)
+SELECT
+    sum(n) OVER (PARTITION BY
+        a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12
+    )
+FROM source;
+----
+1
+
+# regression test for https://github.com/apache/datafusion/issues/17401
+query I
+WITH source AS (
+    SELECT
+        1 AS n,
+        '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8,
+        '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16,
+        '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS a22, '' AS a23, '' AS a24,
+        '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32,
+        '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40
+)
+SELECT
+    sum(n) OVER (PARTITION BY
+        a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20,
+        a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40
+    )
+FROM source;
+----
+1
+
+# regression test for https://github.com/apache/datafusion/issues/17401
+query I
+WITH source AS (
+    SELECT
+        1 AS n,
+        '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8,
+        '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16,
+        '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS a22, '' AS a23, '' AS a24,
+        '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32,
+        '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40
+)
+SELECT
+    sum(n) OVER (PARTITION BY
+        a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20,
+        a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40
+    )
+FROM (
+    SELECT * FROM source
+    ORDER BY a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20,
+        a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40
+);
+----
+1
+
+# regression test for https://github.com/apache/datafusion/issues/17401
+query I
+WITH source AS (
+    SELECT
+        1 AS n,
+        '' AS a1, '' AS a2, '' AS a3, '' AS a4, '' AS a5, '' AS a6, '' AS a7, '' AS a8,
+        '' AS a9, '' AS a10, '' AS a11, '' AS a12, '' AS a13, '' AS a14, '' AS a15, '' AS a16,
+        '' AS a17, '' AS a18, '' AS a19, '' AS a20, '' AS a21, '' AS a22, '' AS a23, '' AS a24,
+        '' AS a25, '' AS a26, '' AS a27, '' AS a28, '' AS a29, '' AS a30, '' AS a31, '' AS a32,
+        '' AS a33, '' AS a34, '' AS a35, '' AS a36, '' AS a37, '' AS a38, '' AS a39, '' AS a40
+)
+SELECT
+    sum(n) OVER (PARTITION BY
+        a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20,
+        a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40
+    )
+FROM (
+    SELECT * FROM source
+    WHERE a1 = '' AND a2 = '' AND a3 = '' AND a4 = '' AND a5 = '' AND a6 = '' AND a7 = '' AND a8 = ''
+        AND a9 = '' AND a10 = '' AND a11 = '' AND a12 = '' AND a13 = '' AND a14 = '' AND a15 = '' AND a16 = ''
+        AND a17 = '' AND a18 = '' AND a19 = '' AND a20 = '' AND a21 = '' AND a22 = '' AND a23 = '' AND a24 = ''
+        AND a25 = '' AND a26 = '' AND a27 = '' AND a28 = '' AND a29 = '' AND a30 = '' AND a31 = '' AND a32 = ''
+        AND a33 = '' AND a34 = '' AND a35 = '' AND a36 = '' AND a37 = '' AND a38 = '' AND a39 = '' AND a40 = ''
+    ORDER BY a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20,
+        a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, a40
+);
+----
+1
+
+# window_with_subquery_rewritten_to_join
+# the optimizer `scalar_subquery_to_join` rewrites
+# `WHERE acctbal > ( SELECT AVG(acctbal) FROM suppliers)` into a Join,
+# breaking the input schema passed to the window function above.
+# See: https://github.com/apache/datafusion/issues/17770
+query I
+WITH suppliers AS (
+  SELECT *
+  FROM (VALUES (1, 10.0), (1, 20.0)) AS t(nation, acctbal)
+)
+SELECT
+  ROW_NUMBER() OVER (PARTITION BY nation ORDER BY acctbal DESC) AS rn
+FROM suppliers AS s
+WHERE acctbal > (
+  SELECT AVG(acctbal) FROM suppliers
+);
+----
+1
+
+# Regression test for https://github.com/apache/datafusion/issues/20194
+# Window function with CASE WHEN in ORDER BY combined with NVL filter
+# should not trigger SanityCheckPlan error from equivalence normalization
+# replacing literals in sort expressions with complex filter expressions.
+statement ok
+CREATE TABLE issue_20194_t1 (
+  value_1_1 decimal(25) NULL,
+  value_1_2 int NULL,
+  value_1_3 bigint NULL
+);
+
+statement ok
+CREATE TABLE issue_20194_t2 (
+  value_2_1 bigint NULL,
+  value_2_2 varchar(140) NULL,
+  value_2_3 varchar(140) NULL
+);
+
+statement ok
+INSERT INTO issue_20194_t1 (value_1_1, value_1_2, value_1_3) VALUES (6774502793, 10040029, 1120);
+
+statement ok
+INSERT INTO issue_20194_t2 (value_2_1, value_2_2, value_2_3) VALUES (1120, '0', '0');
+
+query RII
+SELECT
+  t1.value_1_1, t1.value_1_2,
+  ROW_NUMBER() OVER (
+    PARTITION BY t1.value_1_1, t1.value_1_2
+    ORDER BY
+      CASE WHEN t2.value_2_2 = '0' THEN 1 ELSE 0 END ASC,
+      CASE WHEN t2.value_2_3 = '0' THEN 1 ELSE 0 END ASC
+  ) AS ord
+FROM issue_20194_t1 t1
+INNER JOIN issue_20194_t2 t2
+  ON t1.value_1_3 = t2.value_2_1
+  AND nvl(t2.value_2_3, '0') = '0';
+----
+6774502793 10040029 1
+
+statement ok
+DROP TABLE issue_20194_t1;
+
+statement ok
+DROP TABLE issue_20194_t2;
diff --git a/datafusion/sqllogictest/test_files/window_limits.slt b/datafusion/sqllogictest/test_files/window_limits.slt
new file mode 100644
index 0000000000000..5c06e7f04ec1c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/window_limits.slt
@@ -0,0 +1,766 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# see https://datafusion.apache.org/user-guide/sql/window_functions.html#syntax for field names & examples
+statement ok
+CREATE EXTERNAL TABLE employees (
+  depname VARCHAR NOT NULL,
+  c2  TINYINT NOT NULL,
+  c3  SMALLINT NOT NULL,
+  c4  SMALLINT,
+  c5  INT,
+  c6  BIGINT NOT NULL,
+  c7  SMALLINT NOT NULL,
+  empno INT NOT NULL,
+  salary BIGINT UNSIGNED NOT NULL,
+  c10 VARCHAR NOT NULL,
+  c11 FLOAT NOT NULL,
+  c12 DOUBLE NOT NULL,
+  c13 VARCHAR NOT NULL,
+  hire_date DATE NOT NULL,
+  c15 TIMESTAMP NOT NULL,
+)
+STORED AS CSV
+LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv'
+OPTIONS ('format.has_header' 'true');
+
+# lead defaults to 1 and should grow limit
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT LEAD(empno) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+299
+363
+417
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT LEAD(empno) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+299
+363
+417
+
+query TT
+EXPLAIN
+SELECT LEAD(empno) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+logical_plan
+01)Projection: lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+02)--Limit: skip=0, fetch=3
+03)----WindowAggr: windowExpr=[[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----BoundedWindowAggExec: wdw=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=4), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# lead defaults can lookahead by any amount and should grow limit
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT LEAD(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+363
+417
+794
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT LEAD(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+363
+417
+794
+
+query TT
+EXPLAIN
+SELECT LEAD(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+logical_plan
+01)Projection: lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+02)--Limit: skip=0, fetch=3
+03)----WindowAggr: windowExpr=[[lead(employees.empno, Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----BoundedWindowAggExec: wdw=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# Should use the max of leads
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query IIII
+SELECT
+  empno,
+  LEAD(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead1,
+  LEAD(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead3,
+  LEAD(salary, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead5
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 28774375 557517119 4015442341
+299 1865307672 4061635107 3542840110
+363 557517119 4015442341 1088543984
+417 4061635107 3542840110 1362369177
+794 4015442341 1088543984 145294611
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query IIII
+SELECT
+  empno,
+  LEAD(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead1,
+  LEAD(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead3,
+  LEAD(salary, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead5
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 28774375 557517119 4015442341
+299 1865307672 4061635107 3542840110
+363 557517119 4015442341 1088543984
+417 4061635107 3542840110 1362369177
+794 4015442341 1088543984 145294611
+
+query TT
+EXPLAIN
+SELECT
+  empno,
+  LEAD(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead1,
+  LEAD(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead3,
+  LEAD(salary, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead5
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+logical_plan
+01)Sort: employees.empno ASC NULLS LAST, fetch=5
+02)--Projection: employees.empno, lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS lead1, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS lead3, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS lead5
+03)----WindowAggr: windowExpr=[[lead(employees.salary, Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary, Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary, Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno, salary]
+physical_plan
+01)ProjectionExec: expr=[empno@0 as empno, lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead1, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as lead3, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lead5]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=10), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true
+
+# 2 < 3... nth_value should not grow the limit
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT NTH_VALUE(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+NULL
+299
+299
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT NTH_VALUE(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+NULL
+299
+299
+
+query TT
+EXPLAIN
+SELECT NTH_VALUE(empno, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+logical_plan
+01)Projection: nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+02)--Limit: skip=0, fetch=3
+03)----WindowAggr: windowExpr=[[nth_value(employees.empno, Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# 5 > 3... nth_value still won't grow the limit - it's causal
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT NTH_VALUE(empno, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+NULL
+NULL
+NULL
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT NTH_VALUE(empno, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+NULL
+NULL
+NULL
+
+query TT
+EXPLAIN
+SELECT NTH_VALUE(empno, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM employees LIMIT 3
+----
+logical_plan
+01)Projection: nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+02)--Limit: skip=0, fetch=3
+03)----WindowAggr: windowExpr=[[nth_value(employees.empno, Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# aggregate functions shouldn't affect the window
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query TIIRII
+SELECT
+  depname,
+  empno,
+  SUM(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum,
+  AVG(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_avg,
+  MIN(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_min,
+  MAX(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_max
+FROM employees
+LIMIT 5;
+----
+a 102 3276123488 3276123488 3276123488 3276123488
+e 299 3304897863 1652448931.5 28774375 3276123488
+a 363 5170205535 1723401845 28774375 3276123488
+e 417 5727722654 1431930663.5 28774375 3276123488
+d 794 9789357761 1957871552.2 28774375 4061635107
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query TIIRII
+SELECT
+  depname,
+  empno,
+  SUM(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum,
+  AVG(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_avg,
+  MIN(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_min,
+  MAX(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_max
+FROM employees
+LIMIT 5;
+----
+a 102 3276123488 3276123488 3276123488 3276123488
+e 299 3304897863 1652448931.5 28774375 3276123488
+a 363 5170205535 1723401845 28774375 3276123488
+e 417 5727722654 1431930663.5 28774375 3276123488
+d 794 9789357761 1957871552.2 28774375 4061635107
+
+query TT
+EXPLAIN
+SELECT
+  depname,
+  empno,
+  SUM(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum,
+  AVG(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_avg,
+  MIN(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_min,
+  MAX(salary)       OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_max
+FROM employees
+LIMIT 5;
+----
+logical_plan
+01)Projection: employees.depname, employees.empno, sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_sum, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_avg, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_min, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_max
+02)--Limit: skip=0, fetch=5
+03)----WindowAggr: windowExpr=[[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(CAST(employees.salary AS Float64)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[depname, empno, salary]
+physical_plan
+01)ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as running_sum, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as running_avg, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as running_min, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as running_max]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[empno@1 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno, salary], file_type=csv, has_header=true
+
+# ranking functions that don't affect the limit
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query IIII
+SELECT
+  empno,
+  row_number() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rn,
+  rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk,
+  dense_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS drnk
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 1 1 1
+299 2 2 2
+363 3 3 3
+417 4 4 4
+794 5 5 5
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query IIII
+SELECT
+  empno,
+  row_number() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rn,
+  rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk,
+  dense_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS drnk
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 1 1 1
+299 2 2 2
+363 3 3 3
+417 4 4 4
+794 5 5 5
+
+query TT
+EXPLAIN
+SELECT
+  empno,
+  row_number() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rn,
+  rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk,
+  dense_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS drnk
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+logical_plan
+01)Sort: employees.empno ASC NULLS LAST, fetch=5
+02)--Projection: employees.empno, row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rnk, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS drnk
+03)----WindowAggr: windowExpr=[[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[empno@0 as empno, row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rnk, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as drnk]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# Unoptimizable global ranking functions
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query IRRI
+SELECT
+  empno,
+  percent_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pr,
+  cume_dist() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cd,
+  ntile(4) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nt
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 0 0.01 1
+299 0.010101010101 0.02 1
+363 0.020202020202 0.03 1
+417 0.030303030303 0.04 1
+794 0.040404040404 0.05 1
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query IRRI
+SELECT
+  empno,
+  percent_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pr,
+  cume_dist() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cd,
+  ntile(4) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nt
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 0 0.01 1
+299 0.010101010101 0.02 1
+363 0.020202020202 0.03 1
+417 0.030303030303 0.04 1
+794 0.040404040404 0.05 1
+
+query TT
+EXPLAIN
+SELECT
+  empno,
+  percent_rank() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pr,
+  cume_dist() OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cd,
+  ntile(4) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS nt
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+logical_plan
+01)Sort: employees.empno ASC NULLS LAST, fetch=5
+02)--Projection: employees.empno, percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS pr, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS cd, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS nt
+03)----WindowAggr: windowExpr=[[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno]
+physical_plan
+01)ProjectionExec: expr=[empno@0 as empno, percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as pr, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as cd, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as nt]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----WindowAggExec: wdw=[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }]
+04)------SortExec: expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true
+
+# Analytical functions that don't lookahead
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query IIIII
+SELECT
+  empno,
+  first_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS fv,
+  lag(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS l1,
+  last_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lv,
+  nth_value(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS n3
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 3276123488 NULL 3276123488 NULL
+299 3276123488 3276123488 28774375 NULL
+363 3276123488 28774375 1865307672 1865307672
+417 3276123488 1865307672 557517119 1865307672
+794 3276123488 557517119 4061635107 1865307672
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query IIIII
+SELECT
+  empno,
+  first_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS fv,
+  lag(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS l1,
+  last_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lv,
+  nth_value(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS n3
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+102 3276123488 NULL 3276123488 NULL
+299 3276123488 3276123488 28774375 NULL
+363 3276123488 28774375 1865307672 1865307672
+417 3276123488 1865307672 557517119 1865307672
+794 3276123488 557517119 4061635107 1865307672
+
+query TT
+EXPLAIN
+SELECT
+  empno,
+  first_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS fv,
+  lag(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS l1,
+  last_value(salary) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lv,
+  nth_value(salary, 3) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS n3
+FROM employees
+ORDER BY empno
+LIMIT 5;
+----
+logical_plan
+01)Sort: employees.empno ASC NULLS LAST, fetch=5
+02)--Projection: employees.empno, first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS fv, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS l1, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS lv, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS n3
+03)----WindowAggr: windowExpr=[[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary, Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary, Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno, salary]
+physical_plan
+01)ProjectionExec: expr=[empno@0 as empno, first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as fv, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as l1, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lv, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as n3]
+02)--GlobalLimitExec: skip=0, fetch=5
+03)----BoundedWindowAggExec: wdw=[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true
+
+# should handle partition by unoptimized
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query TIII
+SELECT depname, empno, salary, SUM(salary) OVER (
+        PARTITION BY depname
+        ORDER BY empno
+        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+    ) AS running_sum
+FROM employees
+ORDER BY depname
+LIMIT 5
+----
+a 102 3276123488 3276123488
+a 363 1865307672 5141431160
+a 829 4015442341 5880750013
+a 2555 145294611 4160736952
+a 2809 754775609 900070220
+
+query TT
+EXPLAIN
+SELECT depname, empno, salary, SUM(salary) OVER (
+        PARTITION BY depname
+        ORDER BY empno
+        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+    ) AS running_sum
+FROM employees
+ORDER BY depname
+LIMIT 5
+----
+logical_plan
+01)Sort: employees.depname ASC NULLS LAST, fetch=5
+02)--Projection: employees.depname, employees.empno, employees.salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW AS running_sum
+03)----WindowAggr: windowExpr=[[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[depname, empno, salary]
+physical_plan
+01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5
+02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum]
+03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1
+06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno, salary], file_type=csv, has_header=true
+
+# should handle partition by optimized
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query TIII
+SELECT depname, empno, salary, SUM(salary) OVER (
+        PARTITION BY depname
+        ORDER BY empno
+        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+    ) AS running_sum
+FROM employees
+ORDER BY depname
+LIMIT 5
+----
+a 102 3276123488 3276123488
+a 363 1865307672 5141431160
+a 829 4015442341 5880750013
+a 2555 145294611 4160736952
+a 2809 754775609 900070220
+
+query TT
+EXPLAIN
+SELECT depname, empno, salary, SUM(salary) OVER (
+        PARTITION BY depname
+        ORDER BY empno
+        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+    ) AS running_sum
+FROM employees
+ORDER BY depname
+LIMIT 5
+----
+logical_plan
+01)Sort: employees.depname ASC NULLS LAST, fetch=5
+02)--Projection: employees.depname, employees.empno, employees.salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW AS running_sum
+03)----WindowAggr: windowExpr=[[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[depname, empno, salary]
+physical_plan
+01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5
+02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum]
+03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST]
+05)--------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1, maintains_sort_order=true
+06)----------SortExec: TopK(fetch=5), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true]
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno, salary], file_type=csv, has_header=true
+
+# unbounded following
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT LEAD(salary) OVER (ORDER BY empno ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)
+FROM employees
+LIMIT 5;
+----
+28774375
+1865307672
+557517119
+4061635107
+4015442341
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT LEAD(salary) OVER (ORDER BY empno ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)
+FROM employees
+LIMIT 5;
+----
+28774375
+1865307672
+557517119
+4061635107
+4015442341
+
+# RANGE
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query I
+SELECT LEAD(salary) OVER (ORDER BY empno RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+FROM employees
+LIMIT 5;
+----
+28774375
+1865307672
+557517119
+4061635107
+4015442341
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query I
+SELECT LEAD(salary) OVER (ORDER BY empno RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+FROM employees
+LIMIT 5;
+----
+28774375
+1865307672
+557517119
+4061635107
+4015442341
+
+# multiple windows
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query II
+SELECT
+    LEAD(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
+    LEAD(salary, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+FROM employees
+LIMIT 5;
+----
+28774375 4015442341
+1865307672 3542840110
+557517119 1088543984
+4061635107 1362369177
+4015442341 145294611
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query II
+SELECT
+    LEAD(salary, 1) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
+    LEAD(salary, 5) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+FROM employees
+LIMIT 5;
+----
+28774375 4015442341
+1865307672 3542840110
+557517119 1088543984
+4061635107 1362369177
+4015442341 145294611
+
+# sliding
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query III
+SELECT
+    empno,
+    salary,
+    SUM(salary) OVER (ORDER BY empno ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS sliding_sum
+FROM employees
+LIMIT 3;
+----
+102 3276123488 3276123488
+299 28774375 3304897863
+363 1865307672 5170205535
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query III
+SELECT
+    empno,
+    salary,
+    SUM(salary) OVER (ORDER BY empno ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS sliding_sum
+FROM employees
+LIMIT 3;
+----
+102 3276123488 3276123488
+299 28774375 3304897863
+363 1865307672 5170205535
+
+# sliding lead
+statement ok
+set datafusion.optimizer.enable_window_limits = false;
+
+query III
+SELECT
+    empno,
+    salary,
+    LEAD(salary, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead2
+FROM employees
+LIMIT 3;
+----
+102 3276123488 1865307672
+299 28774375 557517119
+363 1865307672 4061635107
+
+statement ok
+set datafusion.optimizer.enable_window_limits = true;
+
+query III
+SELECT
+    empno,
+    salary,
+    LEAD(salary, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead2
+FROM employees
+LIMIT 3;
+----
+102 3276123488 1865307672
+299 28774375 557517119
+363 1865307672 4061635107
+
+query TT
+EXPLAIN
+SELECT
+    empno,
+    salary,
+    LEAD(salary, 2) OVER (ORDER BY empno ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lead2
+FROM employees
+LIMIT 3;
+----
+logical_plan
+01)Projection: employees.empno, employees.salary, lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS lead2
+02)--Limit: skip=0, fetch=3
+03)----WindowAggr: windowExpr=[[lead(employees.salary, Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+04)------TableScan: employees projection=[empno, salary]
+physical_plan
+01)ProjectionExec: expr=[empno@0 as empno, salary@1 as salary, lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead2]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false]
+05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true
diff --git a/datafusion/sqllogictest/test_files/window_topk_pushdown.slt b/datafusion/sqllogictest/test_files/window_topk_pushdown.slt
new file mode 100644
index 0000000000000..2c33566736745
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/window_topk_pushdown.slt
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for the TopKRepartition optimizer rule.
+#
+# When a partitioned window function has ORDER BY + LIMIT, the optimizer
+# can push a TopK (Sort with fetch) below the hash repartition to reduce
+# the volume of data flowing through the shuffle.
+#
+# The optimization is correct when the hash partition key is a prefix of
+# the sort key, because all rows with the same partition key land in the
+# same output partition.
+
+statement ok
+CREATE EXTERNAL TABLE employees (
+  depname VARCHAR NOT NULL,
+  c2  TINYINT NOT NULL,
+  c3  SMALLINT NOT NULL,
+  c4  SMALLINT,
+  c5  INT,
+  c6  BIGINT NOT NULL,
+  c7  SMALLINT NOT NULL,
+  empno INT NOT NULL,
+  salary BIGINT UNSIGNED NOT NULL,
+  c10 VARCHAR NOT NULL,
+  c11 FLOAT NOT NULL,
+  c12 DOUBLE NOT NULL,
+  c13 VARCHAR NOT NULL,
+  hire_date DATE NOT NULL,
+  c15 TIMESTAMP NOT NULL
+)
+STORED AS CSV
+LOCATION '../../testing/data/csv/aggregate_test_100_with_dates.csv'
+OPTIONS ('format.has_header' 'true');
+
+# Use multiple partitions to trigger hash repartitioning for the window function
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+###
+### Results correctness: both enabled and disabled must produce the same output
+###
+
+# Disabled: baseline results without the optimization
+statement ok
+SET datafusion.optimizer.enable_topk_repartition = false;
+
+query TI
+SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total
+FROM employees
+ORDER BY depname, empno
+LIMIT 3;
+----
+a 1
+a 2
+a 3
+
+# Enabled: results must match baseline
+statement ok
+SET datafusion.optimizer.enable_topk_repartition = true;
+
+query TI
+SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total
+FROM employees
+ORDER BY depname, empno
+LIMIT 3;
+----
+a 1
+a 2
+a 3
+
+###
+### Plan shape: disabled should have TopK only above repartition
+###
+
+statement ok
+SET datafusion.optimizer.enable_topk_repartition = false;
+
+query TT
+EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total
+FROM employees
+ORDER BY depname, empno
+LIMIT 3;
+----
+logical_plan
+01)Projection: employees.depname, running_total
+02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3
+03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno
+04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+05)--------TableScan: employees projection=[depname, empno]
+physical_plan
+01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total]
+02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3
+03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno]
+04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1
+07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true
+
+###
+### Plan shape: enabled should have TopK on BOTH sides of the repartition
+###
+
+statement ok
+SET datafusion.optimizer.enable_topk_repartition = true;
+
+query TT
+EXPLAIN SELECT depname, SUM(1) OVER (PARTITION BY depname ORDER BY empno ASC ROWS UNBOUNDED PRECEDING) as running_total
+FROM employees
+ORDER BY depname, empno
+LIMIT 3;
+----
+logical_plan
+01)Projection: employees.depname, running_total
+02)--Sort: employees.depname ASC NULLS LAST, employees.empno ASC NULLS LAST, fetch=3
+03)----Projection: employees.depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS running_total, employees.empno
+04)------WindowAggr: windowExpr=[[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+05)--------TableScan: employees projection=[depname, empno]
+physical_plan
+01)ProjectionExec: expr=[depname@0 as depname, running_total@1 as running_total]
+02)--SortPreservingMergeExec: [depname@0 ASC NULLS LAST, empno@2 ASC NULLS LAST], fetch=3
+03)----ProjectionExec: expr=[depname@0 as depname, sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as running_total, empno@1 as empno]
+04)------BoundedWindowAggExec: wdw=[sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(Int64(1)) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted]
+05)--------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true], sort_prefix=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST]
+06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=1, maintains_sort_order=true
+07)------------SortExec: TopK(fetch=3), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true]
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno], file_type=csv, has_header=true
diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml
index e4ca7bc46c804..a0f203cec8db6 100644
--- a/datafusion/substrait/Cargo.toml
+++ b/datafusion/substrait/Cargo.toml
@@ -27,6 +27,9 @@ license = { workspace = true }
 authors = { workspace = true }
 rust-version = { workspace = true }
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -34,19 +37,21 @@ workspace = true
 async-recursion = "1.0"
 async-trait = { workspace = true }
 chrono = { workspace = true }
-datafusion = { workspace = true }
+datafusion = { workspace = true, features = ["sql"] }
+half = { workspace = true }
 itertools = { workspace = true }
 object_store = { workspace = true }
-pbjson-types = { workspace = true }
+# We need to match the version in substrait, so we don't use the workspace version here
+pbjson-types = { version = "0.8.0" }
 prost = { workspace = true }
-substrait = { version = "0.56", features = ["serde"] }
+substrait = { version = "0.63.0", features = ["serde"] }
 url = { workspace = true }
 tokio = { workspace = true, features = ["fs"] }
 
 [dev-dependencies]
-datafusion = { workspace = true, features = ["nested_expressions"] }
+datafusion = { workspace = true, features = ["nested_expressions", "unicode_expressions"] }
 datafusion-functions-aggregate = { workspace = true }
-serde_json = "1.0"
+serde_json = { workspace = true }
 tokio = { workspace = true }
 insta = { workspace = true }
 
diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md
index 8e7f99b7df380..d18d7bda5e3b0 100644
--- a/datafusion/substrait/README.md
+++ b/datafusion/substrait/README.md
@@ -19,9 +19,12 @@
 
 # Apache DataFusion Substrait
 
-This crate contains a [Substrait] producer and consumer for [Apache DataFusion]
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format.
+
+This crate is a submodule of DataFusion that provides a [Substrait] producer and consumer for DataFusion
 plans. See [API Docs] for details and examples.
 
+[apache arrow]: https://arrow.apache.org/
+[apache datafusion]: https://datafusion.apache.org/
 [substrait]: https://substrait.io
-[apache datafusion]: https://datafusion.apache.org
 [api docs]: https://docs.rs/datafusion-substrait/latest
diff --git a/datafusion/substrait/src/extensions.rs b/datafusion/substrait/src/extensions.rs
index c74061f2c9f3c..78c357f3b8886 100644
--- a/datafusion/substrait/src/extensions.rs
+++ b/datafusion/substrait/src/extensions.rs
@@ -15,11 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::common::{plan_err, DataFusionError, HashMap};
+use datafusion::common::{DataFusionError, HashMap, plan_err};
+use substrait::proto::extensions::SimpleExtensionDeclaration;
 use substrait::proto::extensions::simple_extension_declaration::{
     ExtensionFunction, ExtensionType, ExtensionTypeVariation, MappingType,
 };
-use substrait::proto::extensions::SimpleExtensionDeclaration;
 
 /// Substrait uses [SimpleExtensions](https://substrait.io/extensions/#simple-extensions) to define
 /// behavior of plans in addition to what's supported directly by the protobuf definitions.
@@ -38,13 +38,15 @@ impl Extensions {
     /// Registers a function and returns the anchor (reference) to it. If the function has already
     /// been registered, it returns the existing anchor.
     /// Function names are case-insensitive (converted to lowercase).
-    pub fn register_function(&mut self, function_name: String) -> u32 {
+    pub fn register_function(&mut self, function_name: &str) -> u32 {
         let function_name = function_name.to_lowercase();
 
         // Some functions are named differently in Substrait default extensions than in DF
         // Rename those to match the Substrait extensions for interoperability
         let function_name = match function_name.as_str() {
             "substr" => "substring".to_string(),
+            "log" => "logb".to_string(),
+            "isnan" => "is_nan".to_string(),
             _ => function_name,
         };
 
@@ -62,7 +64,7 @@ impl Extensions {
 
     /// Registers a type and returns the anchor (reference) to it. If the type has already
     /// been registered, it returns the existing anchor.
-    pub fn register_type(&mut self, type_name: String) -> u32 {
+    pub fn register_type(&mut self, type_name: &str) -> u32 {
         let type_name = type_name.to_lowercase();
         match self.types.iter().find(|(_, t)| *t == &type_name) {
             Some((type_anchor, _)) => *type_anchor, // Type has been registered
@@ -115,7 +117,7 @@ impl From<Extensions> for Vec<SimpleExtensionDeclaration> {
         let mut extensions = vec![];
         for (f_anchor, f_name) in val.functions {
             let function_extension = ExtensionFunction {
-                extension_uri_reference: u32::MAX,
+                extension_urn_reference: u32::MAX,
                 function_anchor: f_anchor,
                 name: f_name,
             };
@@ -127,7 +129,7 @@ impl From<Extensions> for Vec<SimpleExtensionDeclaration> {
 
         for (t_anchor, t_name) in val.types {
             let type_extension = ExtensionType {
-                extension_uri_reference: u32::MAX, // https://github.com/apache/datafusion/issues/11545
+                extension_urn_reference: u32::MAX, // https://github.com/apache/datafusion/issues/11545
                 type_anchor: t_anchor,
                 name: t_name,
             };
@@ -139,7 +141,7 @@ impl From<Extensions> for Vec<SimpleExtensionDeclaration> {
 
         for (tv_anchor, tv_name) in val.type_variations {
             let type_variation_extension = ExtensionTypeVariation {
-                extension_uri_reference: u32::MAX, // We don't register proper extension URIs yet
+                extension_urn_reference: u32::MAX, // We don't register proper extension URNs yet
                 type_variation_anchor: tv_anchor,
                 name: tv_name,
             };
diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs
index 0f2fbf199be35..0819fd3a592f9 100644
--- a/datafusion/substrait/src/lib.rs
+++ b/datafusion/substrait/src/lib.rs
@@ -19,10 +19,11 @@
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 // Make sure fast / cheap clones on Arc are explicit:
 // https://github.com/apache/datafusion/issues/11143
 #![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 
 //! Serialize / Deserialize DataFusion Plans to [Substrait.io]
 //!
@@ -66,19 +67,24 @@
 //! # use datafusion::arrow::array::{Int32Array, RecordBatch};
 //! # use datafusion_substrait::logical_plan;
 //! // Create a plan that scans table 't'
-//!  let ctx = SessionContext::new();
-//!  let batch = RecordBatch::try_from_iter(vec![("x", Arc::new(Int32Array::from(vec![42])) as _)])?;
-//!  ctx.register_batch("t", batch)?;
-//!  let df = ctx.sql("SELECT x from t").await?;
-//!  let plan = df.into_optimized_plan()?;
+//! let ctx = SessionContext::new();
+//! let batch = RecordBatch::try_from_iter(vec![(
+//!     "x",
+//!     Arc::new(Int32Array::from(vec![42])) as _,
+//! )])?;
+//! ctx.register_batch("t", batch)?;
+//! let df = ctx.sql("SELECT x from t").await?;
+//! let plan = df.into_optimized_plan()?;
 //!
-//!  // Convert the plan into a substrait (protobuf) Plan
-//!  let substrait_plan = logical_plan::producer::to_substrait_plan(&plan, &ctx.state())?;
+//! // Convert the plan into a substrait (protobuf) Plan
+//! let substrait_plan = logical_plan::producer::to_substrait_plan(&plan, &ctx.state())?;
 //!
-//!  // Receive a substrait protobuf from somewhere, and turn it into a LogicalPlan
-//!  let logical_round_trip = logical_plan::consumer::from_substrait_plan(&ctx.state(), &substrait_plan).await?;
-//!  let logical_round_trip = ctx.state().optimize(&logical_round_trip)?;
-//!  assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip));
+//! // Receive a substrait protobuf from somewhere, and turn it into a LogicalPlan
+//! let logical_round_trip =
+//!     logical_plan::consumer::from_substrait_plan(&ctx.state(), &substrait_plan)
+//!         .await?;
+//! let logical_round_trip = ctx.state().optimize(&logical_round_trip)?;
+//! assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip));
 //! # Ok(())
 //! # }
 //! ```
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/aggregate_function.rs b/datafusion/substrait/src/logical_plan/consumer/expr/aggregate_function.rs
index 7687d9f7642ab..096eef7ae3b0e 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/aggregate_function.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/aggregate_function.rs
@@ -16,11 +16,11 @@
 // under the License.
 
 use crate::logical_plan::consumer::{
-    from_substrait_func_args, substrait_fun_name, SubstraitConsumer,
+    SubstraitConsumer, from_substrait_func_args, substrait_fun_name,
 };
-use datafusion::common::{not_impl_datafusion_err, plan_err, DFSchema, ScalarValue};
+use datafusion::common::{DFSchema, ScalarValue, not_impl_datafusion_err, plan_err};
 use datafusion::execution::FunctionRegistry;
-use datafusion::logical_expr::{expr, Expr, SortExpr};
+use datafusion::logical_expr::{Expr, SortExpr, expr};
 use std::sync::Arc;
 use substrait::proto::AggregateFunction;
 
@@ -30,7 +30,7 @@ pub async fn from_substrait_agg_func(
     f: &AggregateFunction,
     input_schema: &DFSchema,
     filter: Option<Box<Expr>>,
-    order_by: Option<Vec<SortExpr>>,
+    order_by: Vec<SortExpr>,
     distinct: bool,
 ) -> datafusion::common::Result<Arc<Expr>> {
     let Some(fn_signature) = consumer
@@ -60,7 +60,7 @@ pub async fn from_substrait_agg_func(
     // we inject a dummy argument that does not affect the query, but allows
     // us to bypass this limitation.
     let args = if udaf.name() == "count" && args.is_empty() {
-        vec![Expr::Literal(ScalarValue::Int64(Some(1)))]
+        vec![Expr::Literal(ScalarValue::Int64(Some(1)), None)]
     } else {
         args
     };
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/cast.rs b/datafusion/substrait/src/logical_plan/consumer/expr/cast.rs
index 5e8d3d93065f4..3dd62afe8f193 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/cast.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/cast.rs
@@ -15,9 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::types::from_substrait_type_without_names;
-use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::common::{substrait_err, DFSchema};
+use crate::logical_plan::consumer::{
+    SubstraitConsumer, field_from_substrait_type_without_names,
+};
+use datafusion::common::{DFSchema, substrait_err};
 use datafusion::logical_expr::{Cast, Expr, TryCast};
 use substrait::proto::expression as substrait_expression;
 use substrait::proto::expression::cast::FailureBehavior::ReturnNull;
@@ -37,11 +38,11 @@ pub async fn from_cast(
                     )
                     .await?,
             );
-            let data_type = from_substrait_type_without_names(consumer, output_type)?;
+            let field = field_from_substrait_type_without_names(consumer, output_type)?;
             if cast.failure_behavior() == ReturnNull {
-                Ok(Expr::TryCast(TryCast::new(input_expr, data_type)))
+                Ok(Expr::TryCast(TryCast::new_from_field(input_expr, field)))
             } else {
-                Ok(Expr::Cast(Cast::new(input_expr, data_type)))
+                Ok(Expr::Cast(Cast::new_from_field(input_expr, field)))
             }
         }
         None => substrait_err!("Cast expression without output type is not allowed"),
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/field_reference.rs b/datafusion/substrait/src/logical_plan/consumer/expr/field_reference.rs
index 90b5b6418149b..dae6c625ef55b 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/field_reference.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/field_reference.rs
@@ -16,34 +16,51 @@
 // under the License.
 
 use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::common::{not_impl_err, Column, DFSchema};
+use datafusion::common::{Column, DFSchema, not_impl_err, substrait_err};
 use datafusion::logical_expr::Expr;
+use std::sync::Arc;
+use substrait::proto::expression::FieldReference;
 use substrait::proto::expression::field_reference::ReferenceType::DirectReference;
+use substrait::proto::expression::field_reference::RootType;
 use substrait::proto::expression::reference_segment::ReferenceType::StructField;
-use substrait::proto::expression::FieldReference;
 
 pub async fn from_field_reference(
-    _consumer: &impl SubstraitConsumer,
+    consumer: &impl SubstraitConsumer,
     field_ref: &FieldReference,
     input_schema: &DFSchema,
 ) -> datafusion::common::Result<Expr> {
-    from_substrait_field_reference(field_ref, input_schema)
+    from_substrait_field_reference(consumer, field_ref, input_schema)
 }
 
 pub(crate) fn from_substrait_field_reference(
+    consumer: &impl SubstraitConsumer,
     field_ref: &FieldReference,
     input_schema: &DFSchema,
 ) -> datafusion::common::Result<Expr> {
     match &field_ref.reference_type {
         Some(DirectReference(direct)) => match &direct.reference_type.as_ref() {
-            Some(StructField(x)) => match &x.child.as_ref() {
-                Some(_) => not_impl_err!(
-                    "Direct reference StructField with child is not supported"
-                ),
-                None => Ok(Expr::Column(Column::from(
-                    input_schema.qualified_field(x.field as usize),
-                ))),
-            },
+            Some(StructField(struct_field)) => {
+                if struct_field.child.is_some() {
+                    return not_impl_err!(
+                        "Direct reference StructField with child is not supported"
+                    );
+                }
+                let field_idx = struct_field.field as usize;
+                match &field_ref.root_type {
+                    Some(RootType::RootReference(_)) | None => Ok(Expr::Column(
+                        Column::from(input_schema.qualified_field(field_idx)),
+                    )),
+                    Some(RootType::OuterReference(outer_ref)) => {
+                        resolve_outer_reference(consumer, outer_ref, field_idx)
+                    }
+                    Some(RootType::Expression(_)) => not_impl_err!(
+                        "Expression root type in field reference is not supported"
+                    ),
+                    Some(RootType::LambdaParameterReference(_)) => not_impl_err!(
+                        "Lambda parameter reference in field reference is not yet supported"
+                    ),
+                }
+            }
             _ => not_impl_err!(
                 "Direct reference with types other than StructField is not supported"
             ),
@@ -51,3 +68,20 @@ pub(crate) fn from_substrait_field_reference(
         _ => not_impl_err!("unsupported field ref type"),
     }
 }
+
+fn resolve_outer_reference(
+    consumer: &impl SubstraitConsumer,
+    outer_ref: &substrait::proto::expression::field_reference::OuterReference,
+    field_idx: usize,
+) -> datafusion::common::Result<Expr> {
+    let steps_out = outer_ref.steps_out as usize;
+    let Some(outer_schema) = consumer.get_outer_schema(steps_out) else {
+        return substrait_err!(
+            "OuterReference with steps_out={steps_out} \
+             but no outer schema is available"
+        );
+    };
+    let (qualifier, field) = outer_schema.qualified_field(field_idx);
+    let col = Column::from((qualifier, field));
+    Ok(Expr::OuterReferenceColumn(Arc::clone(field), col))
+}
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/function_arguments.rs b/datafusion/substrait/src/logical_plan/consumer/expr/function_arguments.rs
index 0b610b61b1dea..cae5ecb6e5a8b 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/function_arguments.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/function_arguments.rs
@@ -16,10 +16,10 @@
 // under the License.
 
 use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::common::{not_impl_err, DFSchema};
+use datafusion::common::{DFSchema, not_impl_err};
 use datafusion::logical_expr::Expr;
-use substrait::proto::function_argument::ArgType;
 use substrait::proto::FunctionArgument;
+use substrait::proto::function_argument::ArgType;
 
 /// Convert Substrait FunctionArguments to DataFusion Exprs
 pub async fn from_substrait_func_args(
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs b/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs
index 5adc137d9a43a..d7d7a69581f05 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/literal.rs
@@ -15,43 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::types::from_substrait_type;
-use crate::logical_plan::consumer::utils::{next_struct_field_name, DEFAULT_TIMEZONE};
 use crate::logical_plan::consumer::SubstraitConsumer;
-#[allow(deprecated)]
+use crate::logical_plan::consumer::types::from_substrait_type;
+use crate::logical_plan::consumer::utils::{DEFAULT_TIMEZONE, next_struct_field_name};
+use crate::variation_const::FLOAT_16_TYPE_NAME;
+#[expect(deprecated)]
 use crate::variation_const::{
     DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
     INTERVAL_DAY_TIME_TYPE_REF, INTERVAL_MONTH_DAY_NANO_TYPE_NAME,
     INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_REF,
-    LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
+    LARGE_CONTAINER_TYPE_VARIATION_REF, TIME_32_TYPE_VARIATION_REF,
+    TIME_64_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
     TIMESTAMP_MILLI_TYPE_VARIATION_REF, TIMESTAMP_NANO_TYPE_VARIATION_REF,
     TIMESTAMP_SECOND_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
     VIEW_CONTAINER_TYPE_VARIATION_REF,
 };
-use datafusion::arrow::array::{new_empty_array, AsArray, MapArray};
+use datafusion::arrow::array::{AsArray, MapArray, new_empty_array};
 use datafusion::arrow::buffer::OffsetBuffer;
 use datafusion::arrow::datatypes::{Field, IntervalDayTime, IntervalMonthDayNano};
 use datafusion::arrow::temporal_conversions::NANOSECONDS;
 use datafusion::common::scalar::ScalarStructBuilder;
 use datafusion::common::{
-    not_impl_err, plan_err, substrait_datafusion_err, substrait_err, ScalarValue,
+    ScalarValue, not_impl_err, plan_err, substrait_datafusion_err, substrait_err,
 };
 use datafusion::logical_expr::Expr;
+use prost::Message;
 use std::sync::Arc;
 use substrait::proto;
-use substrait::proto::expression::literal::user_defined::Val;
+use substrait::proto::expression::Literal;
+use substrait::proto::expression::literal::user_defined::{TypeAnchorType, Val};
 use substrait::proto::expression::literal::{
-    interval_day_to_second, IntervalCompound, IntervalDayToSecond, IntervalYearToMonth,
-    LiteralType,
+    IntervalCompound, IntervalDayToSecond, IntervalYearToMonth, LiteralType,
+    interval_day_to_second,
 };
-use substrait::proto::expression::Literal;
 
 pub async fn from_literal(
     consumer: &impl SubstraitConsumer,
     expr: &Literal,
 ) -> datafusion::common::Result<Expr> {
     let scalar_value = from_substrait_literal_without_names(consumer, expr)?;
-    Ok(Expr::Literal(scalar_value))
+    Ok(Expr::Literal(scalar_value, None))
 }
 
 pub(crate) fn from_substrait_literal_without_names(
@@ -99,9 +102,10 @@ pub(crate) fn from_substrait_literal(
         },
         Some(LiteralType::Fp32(f)) => ScalarValue::Float32(Some(*f)),
         Some(LiteralType::Fp64(f)) => ScalarValue::Float64(Some(*f)),
+        #[expect(deprecated)]
         Some(LiteralType::Timestamp(t)) => {
             // Kept for backwards compatibility, new plans should use PrecisionTimestamp(Tz) instead
-            #[allow(deprecated)]
+            #[expect(deprecated)]
             match lit.type_variation_reference {
                 TIMESTAMP_SECOND_TYPE_VARIATION_REF => {
                     ScalarValue::TimestampSecond(Some(*t), None)
@@ -155,6 +159,45 @@ pub(crate) fn from_substrait_literal(
             }
         },
         Some(LiteralType::Date(d)) => ScalarValue::Date32(Some(*d)),
+        Some(LiteralType::PrecisionTime(pt)) => match pt.precision {
+            0 => match lit.type_variation_reference {
+                TIME_32_TYPE_VARIATION_REF => {
+                    ScalarValue::Time32Second(Some(pt.value as i32))
+                }
+                others => {
+                    return substrait_err!("Unknown type variation reference {others}");
+                }
+            },
+            3 => match lit.type_variation_reference {
+                TIME_32_TYPE_VARIATION_REF => {
+                    ScalarValue::Time32Millisecond(Some(pt.value as i32))
+                }
+                others => {
+                    return substrait_err!("Unknown type variation reference {others}");
+                }
+            },
+            6 => match lit.type_variation_reference {
+                TIME_64_TYPE_VARIATION_REF => {
+                    ScalarValue::Time64Microsecond(Some(pt.value))
+                }
+                others => {
+                    return substrait_err!("Unknown type variation reference {others}");
+                }
+            },
+            9 => match lit.type_variation_reference {
+                TIME_64_TYPE_VARIATION_REF => {
+                    ScalarValue::Time64Nanosecond(Some(pt.value))
+                }
+                others => {
+                    return substrait_err!("Unknown type variation reference {others}");
+                }
+            },
+            p => {
+                return not_impl_err!(
+                    "Unsupported Substrait precision {p} for PrecisionTime"
+                );
+            }
+        },
         Some(LiteralType::String(s)) => match lit.type_variation_reference {
             DEFAULT_CONTAINER_TYPE_VARIATION_REF => ScalarValue::Utf8(Some(s.clone())),
             LARGE_CONTAINER_TYPE_VARIATION_REF => ScalarValue::LargeUtf8(Some(s.clone())),
@@ -343,20 +386,25 @@ pub(crate) fn from_substrait_literal(
             use interval_day_to_second::PrecisionMode;
             // DF only supports millisecond precision, so for any more granular type we lose precision
             let milliseconds = match precision_mode {
+                #[expect(deprecated)]
                 Some(PrecisionMode::Microseconds(ms)) => ms / 1000,
-                None =>
+                None => {
                     if *subseconds != 0 {
-                        return substrait_err!("Cannot set subseconds field of IntervalDayToSecond without setting precision");
+                        return substrait_err!(
+                            "Cannot set subseconds field of IntervalDayToSecond without setting precision"
+                        );
                     } else {
                         0_i32
                     }
+                }
                 Some(PrecisionMode::Precision(0)) => *subseconds as i32 * 1000,
                 Some(PrecisionMode::Precision(3)) => *subseconds as i32,
                 Some(PrecisionMode::Precision(6)) => (subseconds / 1000) as i32,
                 Some(PrecisionMode::Precision(9)) => (subseconds / 1000 / 1000) as i32,
                 _ => {
                     return not_impl_err!(
-                    "Unsupported Substrait interval day to second precision mode: {precision_mode:?}")
+                        "Unsupported Substrait interval day to second precision mode: {precision_mode:?}"
+                    );
                 }
             };
 
@@ -400,8 +448,6 @@ pub(crate) fn from_substrait_literal(
                 return Ok(value);
             }
 
-            // TODO: remove the code below once the producer has been updated
-
             // Helper function to prevent duplicating this code - can be inlined once the non-extension path is removed
             let interval_month_day_nano =
                 |user_defined: &proto::expression::literal::UserDefined| -> datafusion::common::Result<ScalarValue> {
@@ -428,28 +474,64 @@ pub(crate) fn from_substrait_literal(
                     )))
                 };
 
-            if let Some(name) = consumer
-                .get_extensions()
-                .types
-                .get(&user_defined.type_reference)
-            {
+            let type_ref = match user_defined.type_anchor_type {
+                Some(TypeAnchorType::TypeReference(ref_val)) => ref_val,
+                Some(TypeAnchorType::TypeAliasReference(_)) => {
+                    return not_impl_err!(
+                        "Type alias references in user-defined literals are not yet supported"
+                    );
+                }
+                None => 0,
+            };
+
+            if let Some(name) = consumer.get_extensions().types.get(&type_ref) {
                 match name.as_ref() {
+                    FLOAT_16_TYPE_NAME => {
+                        // Rules for encoding fp16 Substrait literals are defined as part of Arrow here:
+                        //
+                        // https://github.com/apache/arrow/blame/bab558061696ddc1841148d6210424b12923d48e/format/substrait/extension_types.yaml#L112
+
+                        let Some(value) = user_defined.val.as_ref() else {
+                            return substrait_err!("Float16 value is empty");
+                        };
+                        let Val::Value(value_any) = value else {
+                            return substrait_err!(
+                                "Float16 value is not a value type literal"
+                            );
+                        };
+                        if value_any.type_url != "google.protobuf.UInt32Value" {
+                            return substrait_err!(
+                                "Float16 value is not a google.protobuf.UInt32Value"
+                            );
+                        }
+                        let decoded_value =
+                            pbjson_types::UInt32Value::decode(value_any.value.clone())
+                                .map_err(|err| {
+                                    substrait_datafusion_err!(
+                                        "Failed to decode float16 value: {err}"
+                                    )
+                                })?;
+                        let u32_bytes = decoded_value.value.to_le_bytes();
+                        let f16_val =
+                            half::f16::from_le_bytes(u32_bytes[0..2].try_into().unwrap());
+                        return Ok(ScalarValue::Float16(Some(f16_val)));
+                    }
                     // Kept for backwards compatibility - producers should use IntervalCompound instead
-                    #[allow(deprecated)]
+                    #[expect(deprecated)]
                     INTERVAL_MONTH_DAY_NANO_TYPE_NAME => {
                         interval_month_day_nano(user_defined)?
                     }
                     _ => {
                         return not_impl_err!(
-                        "Unsupported Substrait user defined type with ref {} and name {}",
-                        user_defined.type_reference,
-                        name
-                    )
+                            "Unsupported Substrait user defined type with ref {} and name {}",
+                            type_ref,
+                            name
+                        );
                     }
                 }
             } else {
-                #[allow(deprecated)]
-                match user_defined.type_reference {
+                #[expect(deprecated)]
+                match type_ref {
                     // Kept for backwards compatibility, producers should useIntervalYearToMonth instead
                     INTERVAL_YEAR_MONTH_TYPE_REF => {
                         let Some(Val::Value(raw_val)) = user_defined.val.as_ref() else {
@@ -492,8 +574,8 @@ pub(crate) fn from_substrait_literal(
                     _ => {
                         return not_impl_err!(
                             "Unsupported Substrait user defined type literal with ref {}",
-                            user_defined.type_reference
-                        )
+                            type_ref
+                        );
                     }
                 }
             }
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs b/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs
index b3ec2e37811fb..295456e95f9f3 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/mod.rs
@@ -21,6 +21,7 @@ mod field_reference;
 mod function_arguments;
 mod if_then;
 mod literal;
+mod nested;
 mod scalar_function;
 mod singular_or_list;
 mod subquery;
@@ -32,18 +33,19 @@ pub use field_reference::*;
 pub use function_arguments::*;
 pub use if_then::*;
 pub use literal::*;
+pub use nested::*;
 pub use scalar_function::*;
 pub use singular_or_list::*;
 pub use subquery::*;
 pub use window_function::*;
 
 use crate::extensions::Extensions;
-use crate::logical_plan::consumer::utils::rename_field;
 use crate::logical_plan::consumer::{
-    from_substrait_named_struct, DefaultSubstraitConsumer, SubstraitConsumer,
+    DefaultSubstraitConsumer, SubstraitConsumer, from_substrait_named_struct,
+    rename_field,
 };
 use datafusion::arrow::datatypes::Field;
-use datafusion::common::{not_impl_err, plan_err, substrait_err, DFSchema, DFSchemaRef};
+use datafusion::common::{DFSchema, DFSchemaRef, not_impl_err, plan_err, substrait_err};
 use datafusion::execution::SessionState;
 use datafusion::logical_expr::{Expr, ExprSchemable};
 use substrait::proto::expression::RexType;
@@ -88,12 +90,16 @@ pub async fn from_substrait_rex(
                 consumer.consume_subquery(expr.as_ref(), input_schema).await
             }
             RexType::Nested(expr) => consumer.consume_nested(expr, input_schema).await,
+            #[expect(deprecated)]
             RexType::Enum(expr) => consumer.consume_enum(expr, input_schema).await,
             RexType::DynamicParameter(expr) => {
                 consumer.consume_dynamic_parameter(expr, input_schema).await
             }
+            RexType::Lambda(_) | RexType::LambdaInvocation(_) => {
+                not_impl_err!("Lambda expressions are not yet supported")
+            }
         },
-        None => substrait_err!("Expression must set rex_type: {:?}", expression),
+        None => substrait_err!("Expression must set rex_type: {expression:?}"),
     }
 }
 
@@ -116,15 +122,14 @@ pub async fn from_substrait_extended_expr(
         return not_impl_err!("Type variation extensions are not supported");
     }
 
-    let consumer = DefaultSubstraitConsumer {
-        extensions: &extensions,
-        state,
-    };
+    let consumer = DefaultSubstraitConsumer::new(&extensions, state);
 
     let input_schema = DFSchemaRef::new(match &extended_expr.base_schema {
         Some(base_schema) => from_substrait_named_struct(&consumer, base_schema),
         None => {
-            plan_err!("required property `base_schema` missing from Substrait ExtendedExpression message")
+            plan_err!(
+                "required property `base_schema` missing from Substrait ExtendedExpression message"
+            )
         }
     }?);
 
@@ -137,22 +142,21 @@ pub async fn from_substrait_extended_expr(
                 not_impl_err!("Measure expressions are not yet supported")
             }
             None => {
-                plan_err!("required property `expr_type` missing from Substrait ExpressionReference message")
+                plan_err!(
+                    "required property `expr_type` missing from Substrait ExpressionReference message"
+                )
             }
         }?;
         let expr = consumer
             .consume_expression(scalar_expr, &input_schema)
             .await?;
-        let (output_type, expected_nullability) =
-            expr.data_type_and_nullable(&input_schema)?;
-        let output_field = Field::new("", output_type, expected_nullability);
+        let output_field = expr.to_field(&input_schema)?.1;
         let mut names_idx = 0;
         let output_field = rename_field(
             &output_field,
             &substrait_expr.output_names,
             expr_idx,
             &mut names_idx,
-            /*rename_self=*/ true,
         )?;
         exprs.push((expr, output_field));
     }
@@ -199,13 +203,13 @@ mod tests {
     use crate::logical_plan::consumer::*;
     use datafusion::common::DFSchema;
     use datafusion::logical_expr::Expr;
-    use substrait::proto::expression::window_function::BoundsType;
-    use substrait::proto::expression::RexType;
     use substrait::proto::Expression;
+    use substrait::proto::expression::RexType;
+    use substrait::proto::expression::window_function::BoundsType;
 
     #[tokio::test]
-    async fn window_function_with_range_unit_and_no_order_by(
-    ) -> datafusion::common::Result<()> {
+    async fn window_function_with_range_unit_and_no_order_by()
+    -> datafusion::common::Result<()> {
         let substrait = Expression {
             rex_type: Some(RexType::WindowFunction(
                 substrait::proto::expression::WindowFunction {
@@ -222,7 +226,7 @@ mod tests {
         // Just registering a single function (index 0) so that the plan
         // does not throw a "function not found" error.
         let mut extensions = Extensions::default();
-        extensions.register_function("count".to_string());
+        extensions.register_function("count");
         consumer.extensions = &extensions;
 
         match from_substrait_rex(&consumer, &substrait, &DFSchema::empty()).await? {
@@ -249,7 +253,7 @@ mod tests {
         let mut consumer = test_consumer();
 
         let mut extensions = Extensions::default();
-        extensions.register_function("count".to_string());
+        extensions.register_function("count");
         consumer.extensions = &extensions;
 
         match from_substrait_rex(&consumer, &substrait, &DFSchema::empty()).await? {
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/nested.rs b/datafusion/substrait/src/logical_plan/consumer/expr/nested.rs
new file mode 100644
index 0000000000000..f94a701342826
--- /dev/null
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/nested.rs
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::logical_plan::consumer::SubstraitConsumer;
+use datafusion::common::{DFSchema, not_impl_err, substrait_err};
+use datafusion::execution::FunctionRegistry;
+use datafusion::logical_expr::Expr;
+use substrait::proto::expression::Nested;
+use substrait::proto::expression::nested::NestedType;
+
+/// Converts a Substrait [Nested] expression into a DataFusion [Expr].
+///
+/// Substrait Nested expressions represent complex type constructors (list, struct, map)
+/// where elements are full expressions rather than just literals. This is used by
+/// producers that emit `Nested { list: ... }` for array construction, as opposed to
+/// `Literal { list: ... }` which only supports scalar values.
+pub async fn from_nested(
+    consumer: &impl SubstraitConsumer,
+    nested: &Nested,
+    input_schema: &DFSchema,
+) -> datafusion::common::Result<Expr> {
+    let Some(nested_type) = &nested.nested_type else {
+        return substrait_err!("Nested expression requires a nested_type");
+    };
+
+    match nested_type {
+        NestedType::List(list) => {
+            if list.values.is_empty() {
+                return substrait_err!(
+                    "Empty Nested lists are not supported; use Literal.empty_list instead"
+                );
+            }
+
+            let mut args = Vec::with_capacity(list.values.len());
+            for value in &list.values {
+                args.push(consumer.consume_expression(value, input_schema).await?);
+            }
+
+            let make_array_udf = consumer.get_function_registry().udf("make_array")?;
+            Ok(Expr::ScalarFunction(
+                datafusion::logical_expr::expr::ScalarFunction::new_udf(
+                    make_array_udf,
+                    args,
+                ),
+            ))
+        }
+        NestedType::Struct(_) => {
+            not_impl_err!("Nested struct expressions are not yet supported")
+        }
+        NestedType::Map(_) => {
+            not_impl_err!("Nested map expressions are not yet supported")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::logical_plan::consumer::utils::tests::test_consumer;
+    use substrait::proto::expression::Literal;
+    use substrait::proto::expression::nested::List;
+    use substrait::proto::{self, Expression};
+
+    fn make_i64_literal(value: i64) -> Expression {
+        Expression {
+            rex_type: Some(proto::expression::RexType::Literal(Literal {
+                nullable: false,
+                type_variation_reference: 0,
+                literal_type: Some(proto::expression::literal::LiteralType::I64(value)),
+            })),
+        }
+    }
+
+    #[tokio::test]
+    async fn nested_list_with_literals() -> datafusion::common::Result<()> {
+        let consumer = test_consumer();
+        let schema = DFSchema::empty();
+        let nested = Nested {
+            nullable: false,
+            type_variation_reference: 0,
+            nested_type: Some(NestedType::List(List {
+                values: vec![
+                    make_i64_literal(1),
+                    make_i64_literal(2),
+                    make_i64_literal(3),
+                ],
+            })),
+        };
+
+        let expr = from_nested(&consumer, &nested, &schema).await?;
+        assert_eq!(
+            format!("{expr}"),
+            "make_array(Int64(1), Int64(2), Int64(3))"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn nested_list_empty_rejected() -> datafusion::common::Result<()> {
+        let consumer = test_consumer();
+        let schema = DFSchema::empty();
+        let nested = Nested {
+            nullable: true,
+            type_variation_reference: 0,
+            nested_type: Some(NestedType::List(List { values: vec![] })),
+        };
+
+        let result = from_nested(&consumer, &nested, &schema).await;
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Empty Nested lists are not supported")
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn nested_missing_type() -> datafusion::common::Result<()> {
+        let consumer = test_consumer();
+        let schema = DFSchema::empty();
+        let nested = Nested {
+            nullable: false,
+            type_variation_reference: 0,
+            nested_type: None,
+        };
+
+        let result = from_nested(&consumer, &nested, &schema).await;
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("nested_type"));
+
+        Ok(())
+    }
+}
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/scalar_function.rs b/datafusion/substrait/src/logical_plan/consumer/expr/scalar_function.rs
index 027b61124ead0..10fe58862e021 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/scalar_function.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/scalar_function.rs
@@ -15,16 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::{from_substrait_func_args, SubstraitConsumer};
+use crate::logical_plan::consumer::{SubstraitConsumer, from_substrait_func_args};
 use datafusion::common::Result;
 use datafusion::common::{
-    not_impl_err, plan_err, substrait_err, DFSchema, DataFusionError, ScalarValue,
+    DFSchema, DataFusionError, ScalarValue, not_impl_err, plan_err, substrait_err,
 };
 use datafusion::execution::FunctionRegistry;
-use datafusion::logical_expr::{expr, BinaryExpr, Expr, Like, Operator};
+use datafusion::logical_expr::{Between, BinaryExpr, Expr, Like, Operator, expr};
 use std::vec::Drain;
 use substrait::proto::expression::ScalarFunction;
-use substrait::proto::function_argument::ArgType;
 
 pub async fn from_scalar_function(
     consumer: &impl SubstraitConsumer,
@@ -41,12 +40,21 @@ pub async fn from_scalar_function(
             f.function_reference
         );
     };
+
     let fn_name = substrait_fun_name(fn_signature);
     let args = from_substrait_func_args(consumer, &f.arguments, input_schema).await?;
 
+    let udf_func = consumer.get_function_registry().udf(fn_name).or_else(|e| {
+        if let Some(alt_name) = substrait_to_df_name(fn_name) {
+            consumer.get_function_registry().udf(alt_name).or(Err(e))
+        } else {
+            Err(e)
+        }
+    });
+
     // try to first match the requested function into registered udfs, then built-in ops
     // and finally built-in expressions
-    if let Ok(func) = consumer.get_function_registry().udf(fn_name) {
+    if let Ok(func) = udf_func {
         Ok(Expr::ScalarFunction(expr::ScalarFunction::new_udf(
             func.to_owned(),
             args,
@@ -54,29 +62,28 @@ pub async fn from_scalar_function(
     } else if let Some(op) = name_to_op(fn_name) {
         if args.len() < 2 {
             return not_impl_err!(
-                        "Expect at least two arguments for binary operator {op:?}, the provided number of operators is {:?}",
-                       f.arguments.len()
-                    );
+                "Expect at least two arguments for binary operator {op:?}, the provided number of operators is {:?}",
+                f.arguments.len()
+            );
         }
         // In those cases we build a balanced tree of BinaryExprs
         arg_list_to_binary_op_tree(op, args)
     } else if let Some(builder) = BuiltinExprBuilder::try_from_name(fn_name) {
-        builder.build(consumer, f, input_schema).await
+        builder.build(consumer, f, args).await
     } else {
         not_impl_err!("Unsupported function name: {fn_name:?}")
     }
 }
 
 pub fn substrait_fun_name(name: &str) -> &str {
-    let name = match name.rsplit_once(':') {
+    (match name.rsplit_once(':') {
         // Since 0.32.0, Substrait requires the function names to be in a compound format
         // https://substrait.io/extensions/#function-signature-compound-names
         // for example, `add:i8_i8`.
         // On the consumer side, we don't really care about the signature though, just the name.
         Some((name, _)) => name,
         None => name,
-    };
-    name
+    }) as _
 }
 
 pub fn name_to_op(name: &str) -> Option<Operator> {
@@ -113,6 +120,13 @@ pub fn name_to_op(name: &str) -> Option<Operator> {
     }
 }
 
+pub fn substrait_to_df_name(name: &str) -> Option<&str> {
+    match name {
+        "is_nan" => Some("isnan"),
+        _ => None,
+    }
+}
+
 /// Build a balanced tree of binary operations from a binary operator and a list of arguments.
 ///
 /// For example, `OR` `(a, b, c, d, e)` will be converted to: `OR(OR(a, OR(b, c)), OR(d, e))`.
@@ -162,9 +176,11 @@ struct BuiltinExprBuilder {
 impl BuiltinExprBuilder {
     pub fn try_from_name(name: &str) -> Option<Self> {
         match name {
-            "not" | "like" | "ilike" | "is_null" | "is_not_null" | "is_true"
-            | "is_false" | "is_not_true" | "is_not_false" | "is_unknown"
-            | "is_not_unknown" | "negative" | "negate" => Some(Self {
+            "not" | "like" | "ilike" | "like_match" | "like_imatch"
+            | "like_not_match" | "like_not_imatch" | "is_null" | "is_not_null"
+            | "is_true" | "is_false" | "is_not_true" | "is_not_false" | "is_unknown"
+            | "is_not_unknown" | "negative" | "negate" | "and_not" | "xor"
+            | "between" | "logb" => Some(Self {
                 expr_name: name.to_string(),
             }),
             _ => None,
@@ -175,15 +191,22 @@ impl BuiltinExprBuilder {
         self,
         consumer: &impl SubstraitConsumer,
         f: &ScalarFunction,
-        input_schema: &DFSchema,
+        args: Vec<Expr>,
     ) -> Result<Expr> {
         match self.expr_name.as_str() {
-            "like" => Self::build_like_expr(consumer, false, f, input_schema).await,
-            "ilike" => Self::build_like_expr(consumer, true, f, input_schema).await,
+            "like" => Self::build_like_expr(false, false, f, args).await,
+            "ilike" => Self::build_like_expr(true, false, f, args).await,
+            "like_match" => Self::build_like_expr(false, false, f, args).await,
+            "like_imatch" => Self::build_like_expr(true, false, f, args).await,
+            "like_not_match" => Self::build_like_expr(false, true, f, args).await,
+            "like_not_imatch" => Self::build_like_expr(true, true, f, args).await,
             "not" | "negative" | "negate" | "is_null" | "is_not_null" | "is_true"
             | "is_false" | "is_not_true" | "is_not_false" | "is_unknown"
-            | "is_not_unknown" => {
-                Self::build_unary_expr(consumer, &self.expr_name, f, input_schema).await
+            | "is_not_unknown" => Self::build_unary_expr(&self.expr_name, args).await,
+            "and_not" | "xor" => Self::build_binary_expr(&self.expr_name, args).await,
+            "between" => Self::build_between_expr(&self.expr_name, args).await,
+            "logb" => {
+                Self::build_custom_handling_expr(consumer, &self.expr_name, args).await
             }
             _ => {
                 not_impl_err!("Unsupported builtin expression: {}", self.expr_name)
@@ -191,21 +214,11 @@ impl BuiltinExprBuilder {
         }
     }
 
-    async fn build_unary_expr(
-        consumer: &impl SubstraitConsumer,
-        fn_name: &str,
-        f: &ScalarFunction,
-        input_schema: &DFSchema,
-    ) -> Result<Expr> {
-        if f.arguments.len() != 1 {
-            return substrait_err!("Expect one argument for {fn_name} expr");
-        }
-        let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else {
-            return substrait_err!("Invalid arguments type for {fn_name} expr");
+    async fn build_unary_expr(fn_name: &str, args: Vec<Expr>) -> Result<Expr> {
+        let [arg] = match args.try_into() {
+            Ok(args_arr) => args_arr,
+            Err(_) => return substrait_err!("Expected one argument for {fn_name} expr"),
         };
-        let arg = consumer
-            .consume_expression(expr_substrait, input_schema)
-            .await?;
         let arg = Box::new(arg);
 
         let expr = match fn_name {
@@ -226,49 +239,39 @@ impl BuiltinExprBuilder {
     }
 
     async fn build_like_expr(
-        consumer: &impl SubstraitConsumer,
         case_insensitive: bool,
+        negated: bool,
         f: &ScalarFunction,
-        input_schema: &DFSchema,
+        args: Vec<Expr>,
     ) -> Result<Expr> {
         let fn_name = if case_insensitive { "ILIKE" } else { "LIKE" };
-        if f.arguments.len() != 2 && f.arguments.len() != 3 {
+        if args.len() != 2 && args.len() != 3 {
             return substrait_err!("Expect two or three arguments for `{fn_name}` expr");
         }
 
-        let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else {
-            return substrait_err!("Invalid arguments type for `{fn_name}` expr");
+        let mut args_iter = args.into_iter();
+        let Some(expr) = args_iter.next() else {
+            return substrait_err!("Missing first argument for {fn_name} expression");
         };
-        let expr = consumer
-            .consume_expression(expr_substrait, input_schema)
-            .await?;
-        let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else {
-            return substrait_err!("Invalid arguments type for `{fn_name}` expr");
+        let Some(pattern) = args_iter.next() else {
+            return substrait_err!("Missing second argument for {fn_name} expression");
         };
-        let pattern = consumer
-            .consume_expression(pattern_substrait, input_schema)
-            .await?;
 
         // Default case: escape character is Literal(Utf8(None))
         let escape_char = if f.arguments.len() == 3 {
-            let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type
-            else {
-                return substrait_err!("Invalid arguments type for `{fn_name}` expr");
+            let Some(escape_char_expr) = args_iter.next() else {
+                return substrait_err!("Missing third argument for {fn_name} expression");
             };
 
-            let escape_char_expr = consumer
-                .consume_expression(escape_char_substrait, input_schema)
-                .await?;
-
             match escape_char_expr {
-                Expr::Literal(ScalarValue::Utf8(escape_char_string)) => {
+                Expr::Literal(ScalarValue::Utf8(escape_char_string), _) => {
                     // Convert Option<String> to Option<char>
                     escape_char_string.and_then(|s| s.chars().next())
                 }
                 _ => {
                     return substrait_err!(
-                    "Expect Utf8 literal for escape char, but found {escape_char_expr:?}"
-                )
+                        "Expect Utf8 literal for escape char, but found {escape_char_expr:?}"
+                    );
                 }
             }
         } else {
@@ -276,13 +279,87 @@ impl BuiltinExprBuilder {
         };
 
         Ok(Expr::Like(Like {
-            negated: false,
+            negated,
             expr: Box::new(expr),
             pattern: Box::new(pattern),
             escape_char,
             case_insensitive,
         }))
     }
+
+    async fn build_binary_expr(fn_name: &str, args: Vec<Expr>) -> Result<Expr> {
+        let [a, b] = match args.try_into() {
+            Ok(args_arr) => args_arr,
+            Err(_) => {
+                return substrait_err!("Expected two arguments for `{fn_name}` expr");
+            }
+        };
+        match fn_name {
+            "and_not" => Ok(Self::build_and_not_expr(a, b)),
+            "xor" => Ok(Self::build_xor_expr(a, b)),
+            _ => not_impl_err!("Unsupported builtin expression: {}", fn_name),
+        }
+    }
+
+    fn build_and_not_expr(a: Expr, b: Expr) -> Expr {
+        a.and(Expr::Not(Box::new(b)))
+    }
+
+    fn build_xor_expr(a: Expr, b: Expr) -> Expr {
+        let or_expr = a.clone().or(b.clone());
+        let and_expr = a.and(b);
+        Self::build_and_not_expr(or_expr, and_expr)
+    }
+
+    async fn build_between_expr(fn_name: &str, args: Vec<Expr>) -> Result<Expr> {
+        let [expression, low, high] = match args.try_into() {
+            Ok(args_arr) => args_arr,
+            Err(_) => {
+                return substrait_err!("Expected three arguments for `{fn_name}` expr");
+            }
+        };
+
+        Ok(Expr::Between(Between {
+            expr: Box::new(expression),
+            negated: false,
+            low: Box::new(low),
+            high: Box::new(high),
+        }))
+    }
+
+    //This handles any functions that require custom handling
+    async fn build_custom_handling_expr(
+        consumer: &impl SubstraitConsumer,
+        fn_name: &str,
+        args: Vec<Expr>,
+    ) -> Result<Expr> {
+        match fn_name {
+            "logb" => Self::build_logb_expr(consumer, args).await,
+            _ => not_impl_err!("Unsupported custom handled expression: {}", fn_name),
+        }
+    }
+
+    async fn build_logb_expr(
+        consumer: &impl SubstraitConsumer,
+        args: Vec<Expr>,
+    ) -> Result<Expr> {
+        if args.len() != 2 {
+            return substrait_err!("Expect two arguments for logb function");
+        }
+
+        let mut args = args;
+        args.swap(0, 1);
+
+        //The equivalent of logb in DataFusion is the log function (which has its arguments in reverse order)
+        if let Ok(func) = consumer.get_function_registry().udf("log") {
+            Ok(Expr::ScalarFunction(expr::ScalarFunction::new_udf(
+                func.to_owned(),
+                args,
+            )))
+        } else {
+            not_impl_err!("Unsupported function name: logb")
+        }
+    }
 }
 
 #[cfg(test)]
@@ -337,7 +414,7 @@ mod tests {
     fn int64_literals(integers: &[i64]) -> Vec<Expr> {
         integers
             .iter()
-            .map(|value| Expr::Literal(ScalarValue::Int64(Some(*value))))
+            .map(|value| Expr::Literal(ScalarValue::Int64(Some(*value)), None))
             .collect()
     }
 
@@ -369,4 +446,131 @@ mod tests {
         assert_snapshot!(expr.to_string(), @"Int64(1) OR Int64(2) OR Int64(3) OR Int64(4)");
         Ok(())
     }
+
+    //Test that DataFusion can consume scalar functions that have a different name in Substrait
+    #[tokio::test]
+    async fn test_substrait_to_df_name_mapping() -> Result<()> {
+        // Build substrait extensions (we are using only one function)
+        let mut extensions = Extensions::default();
+        //is_nan is one of the functions that has a different name in Substrait (mapping is in substrait_to_df_name())
+        extensions.functions.insert(0, String::from("is_nan:fp32"));
+        // Build substrait consumer
+        let consumer = DefaultSubstraitConsumer::new(&extensions, &TEST_SESSION_STATE);
+
+        // Build arguments for the function call
+        let arg = FunctionArgument {
+            arg_type: Some(ArgType::Value(Expression {
+                rex_type: Some(RexType::Literal(Literal {
+                    nullable: false,
+                    type_variation_reference: 0,
+                    literal_type: Some(LiteralType::Fp32(1.0)),
+                })),
+            })),
+        };
+        let arguments = vec![arg];
+        let func = ScalarFunction {
+            function_reference: 0,
+            arguments,
+            ..Default::default()
+        };
+        // Trivial input schema
+        let schema = Schema::new(vec![Field::new("a", DataType::Float32, false)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+
+        // Consume the expression and ensure we don't get an error
+        let _ = consumer.consume_scalar_function(&func, &df_schema).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_like_match_conversion() -> Result<()> {
+        // 1. Setup the consumer with the "like_match" function registered
+        let mut extensions = Extensions::default();
+        extensions
+            .functions
+            .insert(0, "like_match:str_str".to_string());
+        extensions
+            .functions
+            .insert(1, "like_not_match:str_str".to_string());
+        extensions
+            .functions
+            .insert(2, "like_imatch:str_str".to_string());
+
+        let consumer = DefaultSubstraitConsumer::new(&extensions, &TEST_SESSION_STATE);
+
+        // 2. Create the arguments (column "a" and pattern "%foo%")
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+
+        let col_arg = FunctionArgument {
+            arg_type: Some(ArgType::Value(Expression {
+                rex_type: Some(RexType::Selection(Box::new(
+                    substrait::proto::expression::FieldReference {
+                        reference_type: Some(substrait::proto::expression::field_reference::ReferenceType::DirectReference(
+                            substrait::proto::expression::ReferenceSegment {
+                                reference_type: Some(substrait::proto::expression::reference_segment::ReferenceType::StructField(
+                                    Box::new(substrait::proto::expression::reference_segment::StructField {
+                                        field: 0,
+                                        child: None,
+                                    })
+                                )),
+                            }
+                        )),
+                        root_type: Some(substrait::proto::expression::field_reference::RootType::RootReference(
+                            substrait::proto::expression::field_reference::RootReference {}
+                        )),
+                    }
+                ))),
+            })),
+        };
+
+        let pattern_arg = FunctionArgument {
+            arg_type: Some(ArgType::Value(Expression {
+                rex_type: Some(RexType::Literal(Literal {
+                    nullable: false,
+                    type_variation_reference: 0,
+                    literal_type: Some(LiteralType::String("foo".to_string())),
+                })),
+            })),
+        };
+
+        // 3. Test "like_match" (Standard LIKE)
+        let func_like = ScalarFunction {
+            function_reference: 0,
+            arguments: vec![col_arg.clone(), pattern_arg.clone()],
+            ..Default::default()
+        };
+
+        let result = consumer
+            .consume_scalar_function(&func_like, &df_schema)
+            .await?;
+
+        if let Expr::Like(like) = result {
+            assert!(!like.negated);
+            assert!(!like.case_insensitive);
+            assert_eq!(format!("{}", like.pattern), "Utf8(\"foo\")");
+        } else {
+            panic!("Expected Expr::Like, got {result:?}");
+        }
+
+        // 4. Test "like_not_match" (NOT LIKE)
+        let func_not_like = ScalarFunction {
+            function_reference: 1,
+            arguments: vec![col_arg.clone(), pattern_arg.clone()],
+            ..Default::default()
+        };
+
+        let result = consumer
+            .consume_scalar_function(&func_not_like, &df_schema)
+            .await?;
+
+        if let Expr::Like(like) = result {
+            assert!(like.negated);
+            assert!(!like.case_insensitive);
+        } else {
+            panic!("Expected Expr::Like (negated), got {result:?}");
+        }
+
+        Ok(())
+    }
 }
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/singular_or_list.rs b/datafusion/substrait/src/logical_plan/consumer/expr/singular_or_list.rs
index 6d44ebcce5908..3937ee7b15fde 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/singular_or_list.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/singular_or_list.rs
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::{from_substrait_rex_vec, SubstraitConsumer};
+use crate::logical_plan::consumer::{SubstraitConsumer, from_substrait_rex_vec};
 use datafusion::common::DFSchema;
-use datafusion::logical_expr::expr::InList;
 use datafusion::logical_expr::Expr;
+use datafusion::logical_expr::expr::InList;
 use substrait::proto::expression::SingularOrList;
 
 pub async fn from_singular_or_list(
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs b/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs
index f7e4c2bb0fbd1..83cf8400eebfc 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/subquery.rs
@@ -16,13 +16,32 @@
 // under the License.
 
 use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::common::{substrait_err, DFSchema, Spans};
-use datafusion::logical_expr::expr::{Exists, InSubquery};
-use datafusion::logical_expr::{Expr, Subquery};
+use datafusion::common::{DFSchema, Spans, substrait_datafusion_err, substrait_err};
+use datafusion::logical_expr::expr::{Exists, InSubquery, SetComparison, SetQuantifier};
+use datafusion::logical_expr::{Expr, LogicalPlan, Operator, Subquery};
 use std::sync::Arc;
+use substrait::proto::Rel;
 use substrait::proto::expression as substrait_expression;
-use substrait::proto::expression::subquery::set_predicate::PredicateOp;
 use substrait::proto::expression::subquery::SubqueryType;
+use substrait::proto::expression::subquery::set_comparison::{ComparisonOp, ReductionOp};
+use substrait::proto::expression::subquery::set_predicate::PredicateOp;
+
+/// Consume a subquery relation, making the enclosing query's schema
+/// available for resolving correlated column references.
+///
+/// Substrait represents correlated references using `OuterReference`
+/// field references with a `steps_out` depth. To resolve these,
+/// the consumer maintains a stack of outer schemas.
+async fn consume_subquery_rel(
+    consumer: &impl SubstraitConsumer,
+    rel: &Rel,
+    outer_schema: &DFSchema,
+) -> datafusion::common::Result<LogicalPlan> {
+    consumer.push_outer_schema(Arc::new(outer_schema.clone()));
+    let result = consumer.consume_rel(rel).await;
+    consumer.pop_outer_schema();
+    result
+}
 
 pub async fn from_subquery(
     consumer: &impl SubstraitConsumer,
@@ -33,12 +52,16 @@ pub async fn from_subquery(
         Some(subquery_type) => match subquery_type {
             SubqueryType::InPredicate(in_predicate) => {
                 if in_predicate.needles.len() != 1 {
-                    substrait_err!("InPredicate Subquery type must have exactly one Needle expression")
+                    substrait_err!(
+                        "InPredicate Subquery type must have exactly one Needle expression"
+                    )
                 } else {
                     let needle_expr = &in_predicate.needles[0];
                     let haystack_expr = &in_predicate.haystack;
                     if let Some(haystack_expr) = haystack_expr {
-                        let haystack_expr = consumer.consume_rel(haystack_expr).await?;
+                        let haystack_expr =
+                            consume_subquery_rel(consumer, haystack_expr, input_schema)
+                                .await?;
                         let outer_refs = haystack_expr.all_out_ref_exprs();
                         Ok(Expr::InSubquery(InSubquery {
                             expr: Box::new(
@@ -61,9 +84,12 @@ pub async fn from_subquery(
                 }
             }
             SubqueryType::Scalar(query) => {
-                let plan = consumer
-                    .consume_rel(&(query.input.clone()).unwrap_or_default())
-                    .await?;
+                let plan = consume_subquery_rel(
+                    consumer,
+                    &(query.input.clone()).unwrap_or_default(),
+                    input_schema,
+                )
+                .await?;
                 let outer_ref_columns = plan.all_out_ref_exprs();
                 Ok(Expr::ScalarSubquery(Subquery {
                     subquery: Arc::new(plan),
@@ -76,9 +102,12 @@ pub async fn from_subquery(
                     // exist
                     PredicateOp::Exists => {
                         let relation = &predicate.tuples;
-                        let plan = consumer
-                            .consume_rel(&relation.clone().unwrap_or_default())
-                            .await?;
+                        let plan = consume_subquery_rel(
+                            consumer,
+                            &relation.clone().unwrap_or_default(),
+                            input_schema,
+                        )
+                        .await?;
                         let outer_ref_columns = plan.all_out_ref_exprs();
                         Ok(Expr::Exists(Exists::new(
                             Subquery {
@@ -90,13 +119,57 @@ pub async fn from_subquery(
                         )))
                     }
                     other_type => substrait_err!(
-                        "unimplemented type {:?} for set predicate",
-                        other_type
+                        "unimplemented type {other_type:?} for set predicate"
                     ),
                 }
             }
-            other_type => {
-                substrait_err!("Subquery type {:?} not implemented", other_type)
+            SubqueryType::SetComparison(comparison) => {
+                let left = comparison.left.as_ref().ok_or_else(|| {
+                    substrait_datafusion_err!("SetComparison requires a left expression")
+                })?;
+                let right = comparison.right.as_ref().ok_or_else(|| {
+                    substrait_datafusion_err!("SetComparison requires a right relation")
+                })?;
+                let reduction_op = match ReductionOp::try_from(comparison.reduction_op) {
+                    Ok(ReductionOp::Any) => SetQuantifier::Any,
+                    Ok(ReductionOp::All) => SetQuantifier::All,
+                    _ => {
+                        return substrait_err!(
+                            "Unsupported reduction op for SetComparison: {}",
+                            comparison.reduction_op
+                        );
+                    }
+                };
+                let comparison_op = match ComparisonOp::try_from(comparison.comparison_op)
+                {
+                    Ok(ComparisonOp::Eq) => Operator::Eq,
+                    Ok(ComparisonOp::Ne) => Operator::NotEq,
+                    Ok(ComparisonOp::Lt) => Operator::Lt,
+                    Ok(ComparisonOp::Gt) => Operator::Gt,
+                    Ok(ComparisonOp::Le) => Operator::LtEq,
+                    Ok(ComparisonOp::Ge) => Operator::GtEq,
+                    _ => {
+                        return substrait_err!(
+                            "Unsupported comparison op for SetComparison: {}",
+                            comparison.comparison_op
+                        );
+                    }
+                };
+
+                let left_expr = consumer.consume_expression(left, input_schema).await?;
+                let plan = consume_subquery_rel(consumer, right, input_schema).await?;
+                let outer_ref_columns = plan.all_out_ref_exprs();
+
+                Ok(Expr::SetComparison(SetComparison::new(
+                    Box::new(left_expr),
+                    Subquery {
+                        subquery: Arc::new(plan),
+                        outer_ref_columns,
+                        spans: Spans::new(),
+                    },
+                    comparison_op,
+                    reduction_op,
+                )))
             }
         },
         None => {
diff --git a/datafusion/substrait/src/logical_plan/consumer/expr/window_function.rs b/datafusion/substrait/src/logical_plan/consumer/expr/window_function.rs
index 10a92a686b599..1f6f602a2ab73 100644
--- a/datafusion/substrait/src/logical_plan/consumer/expr/window_function.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/expr/window_function.rs
@@ -16,19 +16,19 @@
 // under the License.
 
 use crate::logical_plan::consumer::{
-    from_substrait_func_args, from_substrait_rex_vec, from_substrait_sorts,
-    substrait_fun_name, SubstraitConsumer,
+    SubstraitConsumer, from_substrait_func_args, from_substrait_rex_vec,
+    from_substrait_sorts, substrait_fun_name,
 };
 use datafusion::common::{
-    not_impl_err, plan_datafusion_err, plan_err, substrait_err, DFSchema, ScalarValue,
+    DFSchema, ScalarValue, not_impl_err, plan_datafusion_err, plan_err, substrait_err,
 };
 use datafusion::execution::FunctionRegistry;
 use datafusion::logical_expr::expr::WindowFunctionParams;
 use datafusion::logical_expr::{
-    expr, Expr, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
+    Expr, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, expr,
 };
-use substrait::proto::expression::window_function::{Bound, BoundsType};
 use substrait::proto::expression::WindowFunction;
+use substrait::proto::expression::window_function::{Bound, BoundsType};
 use substrait::proto::expression::{
     window_function::bound as SubstraitBound, window_function::bound::Kind as BoundKind,
 };
@@ -94,12 +94,12 @@ pub async fn from_window_function(
     // we inject a dummy argument that does not affect the query, but allows
     // us to bypass this limitation.
     let args = if fun.name() == "count" && window.arguments.is_empty() {
-        vec![Expr::Literal(ScalarValue::Int64(Some(1)))]
+        vec![Expr::Literal(ScalarValue::Int64(Some(1)), None)]
     } else {
         from_substrait_func_args(consumer, &window.arguments, input_schema).await?
     };
 
-    Ok(Expr::WindowFunction(expr::WindowFunction {
+    Ok(Expr::from(expr::WindowFunction {
         fun,
         params: WindowFunctionParams {
             args,
@@ -111,7 +111,9 @@ pub async fn from_window_function(
             .await?,
             order_by,
             window_frame,
+            filter: None,
             null_treatment: None,
+            distinct: false,
         },
     }))
 }
diff --git a/datafusion/substrait/src/logical_plan/consumer/plan.rs b/datafusion/substrait/src/logical_plan/consumer/plan.rs
index f994f792a17ea..407980c4a7f4b 100644
--- a/datafusion/substrait/src/logical_plan/consumer/plan.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/plan.rs
@@ -20,9 +20,9 @@ use super::{DefaultSubstraitConsumer, SubstraitConsumer};
 use crate::extensions::Extensions;
 use datafusion::common::{not_impl_err, plan_err};
 use datafusion::execution::SessionState;
-use datafusion::logical_expr::{col, Aggregate, LogicalPlan, Projection};
+use datafusion::logical_expr::{Aggregate, LogicalPlan, Projection, col};
 use std::sync::Arc;
-use substrait::proto::{plan_rel, Plan};
+use substrait::proto::{Plan, plan_rel};
 
 /// Convert Substrait Plan to DataFusion LogicalPlan
 pub async fn from_substrait_plan(
@@ -35,10 +35,7 @@ pub async fn from_substrait_plan(
         return not_impl_err!("Type variation extensions are not supported");
     }
 
-    let consumer = DefaultSubstraitConsumer {
-        extensions: &extensions,
-        state,
-    };
+    let consumer = DefaultSubstraitConsumer::new(&extensions, state);
     from_substrait_plan_with_consumer(&consumer, plan).await
 }
 
@@ -53,38 +50,75 @@ pub async fn from_substrait_plan_with_consumer(
                 Some(rt) => match rt {
                     plan_rel::RelType::Rel(rel) => Ok(consumer.consume_rel(rel).await?),
                     plan_rel::RelType::Root(root) => {
-                        let plan = consumer.consume_rel(root.input.as_ref().unwrap()).await?;
+                        let plan =
+                            consumer.consume_rel(root.input.as_ref().unwrap()).await?;
                         if root.names.is_empty() {
                             // Backwards compatibility for plans missing names
                             return Ok(plan);
                         }
-                        let renamed_schema = make_renamed_schema(plan.schema(), &root.names)?;
-                        if renamed_schema.has_equivalent_names_and_types(plan.schema()).is_ok() {
+                        let renamed_schema =
+                            make_renamed_schema(plan.schema(), &root.names)?;
+                        if renamed_schema
+                            .has_equivalent_names_and_types(plan.schema())
+                            .is_ok()
+                        {
                             // Nothing to do if the schema is already equivalent
                             return Ok(plan);
                         }
                         match plan {
                             // If the last node of the plan produces expressions, bake the renames into those expressions.
                             // This isn't necessary for correctness, but helps with roundtrip tests.
-                            LogicalPlan::Projection(p) => Ok(LogicalPlan::Projection(Projection::try_new(rename_expressions(p.expr, p.input.schema(), renamed_schema.fields())?, p.input)?)),
+                            LogicalPlan::Projection(p) => {
+                                Ok(LogicalPlan::Projection(Projection::try_new(
+                                    rename_expressions(
+                                        p.expr,
+                                        p.input.schema(),
+                                        renamed_schema.fields(),
+                                    )?,
+                                    p.input,
+                                )?))
+                            }
                             LogicalPlan::Aggregate(a) => {
-                                let (group_fields, expr_fields) = renamed_schema.fields().split_at(a.group_expr.len());
-                                let new_group_exprs = rename_expressions(a.group_expr, a.input.schema(), group_fields)?;
-                                let new_aggr_exprs = rename_expressions(a.aggr_expr, a.input.schema(), expr_fields)?;
-                                Ok(LogicalPlan::Aggregate(Aggregate::try_new(a.input, new_group_exprs, new_aggr_exprs)?))
-                            },
+                                let (group_fields, expr_fields) =
+                                    renamed_schema.fields().split_at(a.group_expr.len());
+                                let new_group_exprs = rename_expressions(
+                                    a.group_expr,
+                                    a.input.schema(),
+                                    group_fields,
+                                )?;
+                                let new_aggr_exprs = rename_expressions(
+                                    a.aggr_expr,
+                                    a.input.schema(),
+                                    expr_fields,
+                                )?;
+                                Ok(LogicalPlan::Aggregate(Aggregate::try_new(
+                                    a.input,
+                                    new_group_exprs,
+                                    new_aggr_exprs,
+                                )?))
+                            }
                             // There are probably more plans where we could bake things in, can add them later as needed.
                             // Otherwise, add a new Project to handle the renaming.
-                            _ => Ok(LogicalPlan::Projection(Projection::try_new(rename_expressions(plan.schema().columns().iter().map(|c| col(c.to_owned())), plan.schema(), renamed_schema.fields())?, Arc::new(plan))?))
+                            _ => Ok(LogicalPlan::Projection(Projection::try_new(
+                                rename_expressions(
+                                    plan.schema()
+                                        .columns()
+                                        .iter()
+                                        .map(|c| col(c.to_owned())),
+                                    plan.schema(),
+                                    renamed_schema.fields(),
+                                )?,
+                                Arc::new(plan),
+                            )?)),
                         }
                     }
                 },
-                None => plan_err!("Cannot parse plan relation: None")
+                None => plan_err!("Cannot parse plan relation: None"),
             }
-        },
+        }
         _ => not_impl_err!(
             "Substrait plan with more than 1 relation trees not supported. Number of relation trees: {:?}",
             plan.relations.len()
-        )
+        ),
     }
 }
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/aggregate_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/aggregate_rel.rs
index 9421bb17c1628..da57751f6ad84 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/aggregate_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/aggregate_rel.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::{from_substrait_agg_func, from_substrait_sorts};
 use crate::logical_plan::consumer::{NameTracker, SubstraitConsumer};
-use datafusion::common::{not_impl_err, DFSchemaRef};
+use crate::logical_plan::consumer::{from_substrait_agg_func, from_substrait_sorts};
+use datafusion::common::{DFSchemaRef, not_impl_err};
 use datafusion::logical_expr::{Expr, GroupingSet, LogicalPlan, LogicalPlanBuilder};
+use substrait::proto::AggregateRel;
 use substrait::proto::aggregate_function::AggregationInvocation;
 use substrait::proto::aggregate_rel::Grouping;
-use substrait::proto::AggregateRel;
 
 pub async fn from_aggregate_rel(
     consumer: &impl SubstraitConsumer,
@@ -40,6 +40,7 @@ pub async fn from_aggregate_rel(
         let mut aggr_exprs = vec![];
 
         match agg.groupings.len() {
+            0 => {}
             1 => {
                 group_exprs.extend_from_slice(
                     &from_substrait_grouping(
@@ -88,14 +89,8 @@ pub async fn from_aggregate_rel(
                         _ if f.invocation == AggregationInvocation::All as i32 => false,
                         _ => false,
                     };
-                    let order_by = if !f.sorts.is_empty() {
-                        Some(
-                            from_substrait_sorts(consumer, &f.sorts, input.schema())
-                                .await?,
-                        )
-                    } else {
-                        None
-                    };
+                    let order_by =
+                        from_substrait_sorts(consumer, &f.sorts, input.schema()).await?;
 
                     from_substrait_agg_func(
                         consumer,
@@ -127,7 +122,7 @@ pub async fn from_aggregate_rel(
     }
 }
 
-#[allow(deprecated)]
+#[expect(deprecated)]
 async fn from_substrait_grouping(
     consumer: &impl SubstraitConsumer,
     grouping: &Grouping,
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/cross_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/cross_rel.rs
index a91366e47742d..25c66a8e22972 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/cross_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/cross_rel.rs
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::utils::requalify_sides_if_needed;
 use crate::logical_plan::consumer::SubstraitConsumer;
 use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder};
+
+use datafusion::logical_expr::requalify_sides_if_needed;
+
 use substrait::proto::CrossRel;
 
 pub async fn from_cross_rel(
@@ -30,6 +32,6 @@ pub async fn from_cross_rel(
     let right = LogicalPlanBuilder::from(
         consumer.consume_rel(cross.right.as_ref().unwrap()).await?,
     );
-    let (left, right) = requalify_sides_if_needed(left, right)?;
+    let (left, right, _requalified) = requalify_sides_if_needed(left, right)?;
     left.cross_join(right.build()?)?.build()
 }
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/exchange_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/exchange_rel.rs
index d326fff44bbbd..b275e523f5861 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/exchange_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/exchange_rel.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::from_substrait_field_reference;
 use crate::logical_plan::consumer::SubstraitConsumer;
+use crate::logical_plan::consumer::from_substrait_field_reference;
 use datafusion::common::{not_impl_err, substrait_err};
 use datafusion::logical_expr::{LogicalPlan, Partitioning, Repartition};
 use std::sync::Arc;
-use substrait::proto::exchange_rel::ExchangeKind;
 use substrait::proto::ExchangeRel;
+use substrait::proto::exchange_rel::ExchangeKind;
 
 pub async fn from_exchange_rel(
     consumer: &impl SubstraitConsumer,
@@ -42,7 +42,8 @@ pub async fn from_exchange_rel(
             let mut partition_columns = vec![];
             let input_schema = input.schema();
             for field_ref in &scatter_fields.fields {
-                let column = from_substrait_field_reference(field_ref, input_schema)?;
+                let column =
+                    from_substrait_field_reference(consumer, field_ref, input_schema)?;
                 partition_columns.push(column);
             }
             Partitioning::Hash(partition_columns, exchange.partition_count as usize)
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/fetch_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/fetch_rel.rs
index 74161d8600ea6..12a8a77199b1a 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/fetch_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/fetch_rel.rs
@@ -17,9 +17,9 @@
 
 use crate::logical_plan::consumer::SubstraitConsumer;
 use async_recursion::async_recursion;
-use datafusion::common::{not_impl_err, DFSchema, DFSchemaRef};
-use datafusion::logical_expr::{lit, LogicalPlan, LogicalPlanBuilder};
-use substrait::proto::{fetch_rel, FetchRel};
+use datafusion::common::{DFSchema, DFSchemaRef, not_impl_err};
+use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder, lit};
+use substrait::proto::{FetchRel, fetch_rel};
 
 #[async_recursion]
 pub async fn from_fetch_rel(
@@ -30,6 +30,7 @@ pub async fn from_fetch_rel(
         let input = LogicalPlanBuilder::from(consumer.consume_rel(input).await?);
         let empty_schema = DFSchemaRef::new(DFSchema::empty());
         let offset = match &fetch.offset_mode {
+            #[expect(deprecated)]
             Some(fetch_rel::OffsetMode::Offset(offset)) => Some(lit(*offset)),
             Some(fetch_rel::OffsetMode::OffsetExpr(expr)) => {
                 Some(consumer.consume_expression(expr, &empty_schema).await?)
@@ -37,6 +38,7 @@ pub async fn from_fetch_rel(
             None => None,
         };
         let count = match &fetch.count_mode {
+            #[expect(deprecated)]
             Some(fetch_rel::CountMode::Count(count)) => {
                 // -1 means that ALL records should be returned, equivalent to None
                 (*count != -1).then(|| lit(*count))
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs
index 881157dcfa662..3604630d6f0bb 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs
@@ -15,14 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::utils::requalify_sides_if_needed;
 use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::common::{not_impl_err, plan_err, Column, JoinType};
+use datafusion::common::{Column, JoinType, NullEquality, not_impl_err, plan_err};
+use datafusion::logical_expr::requalify_sides_if_needed;
 use datafusion::logical_expr::utils::split_conjunction;
 use datafusion::logical_expr::{
     BinaryExpr, Expr, LogicalPlan, LogicalPlanBuilder, Operator,
 };
-use substrait::proto::{join_rel, JoinRel};
+
+use substrait::proto::{JoinRel, join_rel};
 
 pub async fn from_join_rel(
     consumer: &impl SubstraitConsumer,
@@ -38,7 +39,7 @@ pub async fn from_join_rel(
     let right = LogicalPlanBuilder::from(
         consumer.consume_rel(join.right.as_ref().unwrap()).await?,
     );
-    let (left, right) = requalify_sides_if_needed(left, right)?;
+    let (left, right, _requalified) = requalify_sides_if_needed(left, right)?;
 
     let join_type = from_substrait_jointype(join.r#type)?;
     // The join condition expression needs full input schema and not the output schema from join since we lose columns from
@@ -59,19 +60,30 @@ pub async fn from_join_rel(
                 split_eq_and_noneq_join_predicate_with_nulls_equality(&on);
             let (left_cols, right_cols): (Vec<_>, Vec<_>) =
                 itertools::multiunzip(join_ons);
+            let null_equality = if nulls_equal_nulls {
+                NullEquality::NullEqualsNull
+            } else {
+                NullEquality::NullEqualsNothing
+            };
             left.join_detailed(
                 right.build()?,
                 join_type,
                 (left_cols, right_cols),
                 join_filter,
-                nulls_equal_nulls,
+                null_equality,
             )?
             .build()
         }
         None => {
             let on: Vec<String> = vec![];
-            left.join_detailed(right.build()?, join_type, (on.clone(), on), None, false)?
-                .build()
+            left.join_detailed(
+                right.build()?,
+                join_type,
+                (on.clone(), on),
+                None,
+                NullEquality::NullEqualsNothing,
+            )?
+            .build()
         }
     }
 }
@@ -86,7 +98,7 @@ fn split_eq_and_noneq_join_predicate_with_nulls_equality(
     let mut nulls_equal_nulls = false;
 
     for expr in exprs {
-        #[allow(clippy::collapsible_match)]
+        #[expect(clippy::collapsible_match)]
         match expr {
             Expr::BinaryExpr(binary_expr) => match binary_expr {
                 x @ (BinaryExpr {
@@ -132,9 +144,12 @@ fn from_substrait_jointype(join_type: i32) -> datafusion::common::Result<JoinTyp
             join_rel::JoinType::LeftAnti => Ok(JoinType::LeftAnti),
             join_rel::JoinType::LeftSemi => Ok(JoinType::LeftSemi),
             join_rel::JoinType::LeftMark => Ok(JoinType::LeftMark),
+            join_rel::JoinType::RightMark => Ok(JoinType::RightMark),
+            join_rel::JoinType::RightAnti => Ok(JoinType::RightAnti),
+            join_rel::JoinType::RightSemi => Ok(JoinType::RightSemi),
             _ => plan_err!("unsupported join type {substrait_join_type:?}"),
         }
     } else {
-        plan_err!("invalid join type variant {join_type:?}")
+        plan_err!("invalid join type variant {join_type}")
     }
 }
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/mod.rs b/datafusion/substrait/src/logical_plan/consumer/rel/mod.rs
index a83ddd8997b29..038ada115b9d8 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/mod.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/mod.rs
@@ -37,16 +37,16 @@ pub use read_rel::*;
 pub use set_rel::*;
 pub use sort_rel::*;
 
-use crate::logical_plan::consumer::utils::NameTracker;
 use crate::logical_plan::consumer::SubstraitConsumer;
+use crate::logical_plan::consumer::utils::NameTracker;
 use async_recursion::async_recursion;
-use datafusion::common::{not_impl_err, substrait_datafusion_err, substrait_err, Column};
+use datafusion::common::{Column, not_impl_err, substrait_datafusion_err, substrait_err};
 use datafusion::logical_expr::builder::project;
 use datafusion::logical_expr::{Expr, LogicalPlan, Projection};
 use std::sync::Arc;
 use substrait::proto::rel::RelType;
 use substrait::proto::rel_common::{Emit, EmitKind};
-use substrait::proto::{rel_common, Rel, RelCommon};
+use substrait::proto::{Rel, RelCommon, rel_common};
 
 /// Convert Substrait Rel to DataFusion DataFrame
 #[async_recursion]
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
index 8ece6392974ef..d216d4ecf3188 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::utils::NameTracker;
 use crate::logical_plan::consumer::SubstraitConsumer;
+use crate::logical_plan::consumer::utils::NameTracker;
 use async_recursion::async_recursion;
-use datafusion::common::{not_impl_err, Column};
+use datafusion::common::{Column, not_impl_err};
 use datafusion::logical_expr::builder::project;
 use datafusion::logical_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
 use std::collections::HashSet;
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs
index 47af44c692aeb..832110e11131c 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/read_rel.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::logical_plan::consumer::SubstraitConsumer;
 use crate::logical_plan::consumer::from_substrait_literal;
 use crate::logical_plan::consumer::from_substrait_named_struct;
 use crate::logical_plan::consumer::utils::ensure_schema_compatibility;
-use crate::logical_plan::consumer::SubstraitConsumer;
 use datafusion::common::{
-    not_impl_err, plan_err, substrait_datafusion_err, substrait_err, DFSchema,
-    DFSchemaRef, TableReference,
+    DFSchema, DFSchemaRef, TableReference, not_impl_err, plan_err,
+    substrait_datafusion_err, substrait_err,
 };
 use datafusion::datasource::provider_as_source;
 use datafusion::logical_expr::utils::split_conjunction_owned;
@@ -30,12 +30,12 @@ use datafusion::logical_expr::{
 };
 use std::sync::Arc;
 use substrait::proto::expression::MaskExpression;
-use substrait::proto::read_rel::local_files::file_or_files::PathType::UriFile;
 use substrait::proto::read_rel::ReadType;
+use substrait::proto::read_rel::local_files::file_or_files::PathType::UriFile;
 use substrait::proto::{Expression, ReadRel};
 use url::Url;
 
-#[allow(deprecated)]
+#[expect(deprecated)]
 pub async fn from_read_rel(
     consumer: &impl SubstraitConsumer,
     read: &ReadRel,
@@ -114,41 +114,61 @@ pub async fn from_read_rel(
             .await
         }
         Some(ReadType::VirtualTable(vt)) => {
-            if vt.values.is_empty() {
+            if vt.values.is_empty() && vt.expressions.is_empty() {
                 return Ok(LogicalPlan::EmptyRelation(EmptyRelation {
                     produce_one_row: false,
                     schema: DFSchemaRef::new(substrait_schema),
                 }));
             }
 
-            let values = vt
-                .values
-                .iter()
-                .map(|row| {
-                    let mut name_idx = 0;
-                    let lits = row
-                        .fields
-                        .iter()
-                        .map(|lit| {
-                            name_idx += 1; // top-level names are provided through schema
-                            Ok(Expr::Literal(from_substrait_literal(
-                                consumer,
-                                lit,
-                                &named_struct.names,
-                                &mut name_idx,
-                            )?))
-                        })
-                        .collect::<datafusion::common::Result<_>>()?;
-                    if name_idx != named_struct.names.len() {
+            // Check for produce_one_row pattern in both old (values) and new (expressions) formats.
+            // A VirtualTable with exactly one row containing only empty/default fields represents
+            // an EmptyRelation with produce_one_row=true. This pattern is used for queries without
+            // a FROM clause (e.g., "SELECT 1 AS one") where a single phantom row is needed to
+            // provide a context for evaluating scalar expressions. This is conceptually similar to
+            // the SQL "DUAL" table (see: https://en.wikipedia.org/wiki/DUAL_table) which some
+            // databases provide as a single-row source for selecting constant expressions when no
+            // real table is present.
+            let is_produce_one_row = (vt.values.len() == 1
+                && vt.expressions.is_empty()
+                && substrait_schema.fields().is_empty()
+                && vt.values[0].fields.is_empty())
+                || (vt.expressions.len() == 1
+                    && vt.values.is_empty()
+                    && substrait_schema.fields().is_empty()
+                    && vt.expressions[0].fields.is_empty());
+
+            if is_produce_one_row {
+                return Ok(LogicalPlan::EmptyRelation(EmptyRelation {
+                    produce_one_row: true,
+                    schema: DFSchemaRef::new(substrait_schema),
+                }));
+            }
+
+            let values = if !vt.expressions.is_empty() {
+                let mut exprs = vec![];
+                for row in &vt.expressions {
+                    let mut row_exprs = vec![];
+                    for expression in &row.fields {
+                        let expr = consumer
+                            .consume_expression(expression, &substrait_schema)
+                            .await?;
+                        row_exprs.push(expr);
+                    }
+                    // For expressions, validate against top-level schema fields, not nested names
+                    if row_exprs.len() != substrait_schema.fields().len() {
                         return substrait_err!(
-                                "Names list must match exactly to nested schema, but found {} uses for {} names",
-                                name_idx,
-                                named_struct.names.len()
-                            );
+                            "Field count mismatch: expected {} fields but found {} in virtual table row",
+                            substrait_schema.fields().len(),
+                            row_exprs.len()
+                        );
                     }
-                    Ok(lits)
-                })
-                .collect::<datafusion::common::Result<_>>()?;
+                    exprs.push(row_exprs);
+                }
+                exprs
+            } else {
+                convert_literal_rows(consumer, vt, named_struct)?
+            };
 
             Ok(LogicalPlan::Values(Values {
                 schema: DFSchemaRef::new(substrait_schema),
@@ -197,11 +217,51 @@ pub async fn from_read_rel(
             .await
         }
         _ => {
-            not_impl_err!("Unsupported ReadType: {:?}", read.read_type)
+            not_impl_err!("Unsupported Readtype: {:?}", read.read_type)
         }
     }
 }
 
+/// Converts Substrait literal rows from a VirtualTable into DataFusion expressions.
+///
+/// This function processes the deprecated `values` field of VirtualTable, converting
+/// each literal value into a `Expr::Literal` while tracking and validating the name
+/// indices against the provided named struct schema.
+fn convert_literal_rows(
+    consumer: &impl SubstraitConsumer,
+    vt: &substrait::proto::read_rel::VirtualTable,
+    named_struct: &substrait::proto::NamedStruct,
+) -> datafusion::common::Result<Vec<Vec<Expr>>> {
+    #[expect(deprecated)]
+    vt.values
+        .iter()
+        .map(|row| {
+            let mut name_idx = 0;
+            let lits = row
+                .fields
+                .iter()
+                .map(|lit| {
+                    name_idx += 1; // top-level names are provided through schema
+                    Ok(Expr::Literal(from_substrait_literal(
+                        consumer,
+                        lit,
+                        &named_struct.names,
+                        &mut name_idx,
+                    )?, None))
+                })
+                .collect::<datafusion::common::Result<_>>()?;
+            if name_idx != named_struct.names.len() {
+                return substrait_err!(
+                    "Names list must match exactly to nested schema, but found {} uses for {} names",
+                    name_idx,
+                    named_struct.names.len()
+                );
+            }
+            Ok(lits)
+        })
+        .collect::<datafusion::common::Result<_>>()
+}
+
 pub fn apply_masking(
     schema: DFSchema,
     mask_expression: &::core::option::Option<MaskExpression>,
@@ -218,9 +278,7 @@ pub fn apply_masking(
                 let fields = column_indices
                     .iter()
                     .map(|i| schema.qualified_field(*i))
-                    .map(|(qualifier, field)| {
-                        (qualifier.cloned(), Arc::new(field.clone()))
-                    })
+                    .map(|(qualifier, field)| (qualifier.cloned(), Arc::clone(field)))
                     .collect();
 
                 Ok(DFSchema::new_with_metadata(
@@ -264,7 +322,7 @@ fn apply_projection(
             let fields = column_indices
                 .iter()
                 .map(|i| df_schema.qualified_field(*i))
-                .map(|(qualifier, field)| (qualifier.cloned(), Arc::new(field.clone())))
+                .map(|(qualifier, field)| (qualifier.cloned(), Arc::clone(field)))
                 .collect();
 
             scan.projected_schema = DFSchemaRef::new(DFSchema::new_with_metadata(
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/set_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/set_rel.rs
index 6688a80f52746..36bf8dbae4a92 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/set_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/set_rel.rs
@@ -81,7 +81,7 @@ async fn intersect_rels(
             rel,
             consumer.consume_rel(input).await?,
             is_all,
-        )?
+        )?;
     }
 
     Ok(rel)
@@ -95,7 +95,8 @@ async fn except_rels(
     let mut rel = consumer.consume_rel(&rels[0]).await?;
 
     for input in &rels[1..] {
-        rel = LogicalPlanBuilder::except(rel, consumer.consume_rel(input).await?, is_all)?
+        rel =
+            LogicalPlanBuilder::except(rel, consumer.consume_rel(input).await?, is_all)?;
     }
 
     Ok(rel)
diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/sort_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/sort_rel.rs
index 56ca0ba03857d..24f6829c20394 100644
--- a/datafusion/substrait/src/logical_plan/consumer/rel/sort_rel.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/rel/sort_rel.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::consumer::{from_substrait_sorts, SubstraitConsumer};
+use crate::logical_plan::consumer::{SubstraitConsumer, from_substrait_sorts};
 use datafusion::common::not_impl_err;
 use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder};
 use substrait::proto::SortRel;
diff --git a/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs b/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs
index 5392dd77b576b..14385888a8de4 100644
--- a/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/substrait_consumer.rs
@@ -18,7 +18,7 @@
 use super::{
     from_aggregate_rel, from_cast, from_cross_rel, from_exchange_rel, from_fetch_rel,
     from_field_reference, from_filter_rel, from_if_then, from_join_rel, from_literal,
-    from_project_rel, from_read_rel, from_scalar_function, from_set_rel,
+    from_nested, from_project_rel, from_read_rel, from_scalar_function, from_set_rel,
     from_singular_or_list, from_sort_rel, from_subquery, from_substrait_rel,
     from_substrait_rex, from_window_function,
 };
@@ -27,11 +27,11 @@ use async_trait::async_trait;
 use datafusion::arrow::datatypes::DataType;
 use datafusion::catalog::TableProvider;
 use datafusion::common::{
-    not_impl_err, substrait_err, DFSchema, ScalarValue, TableReference,
+    DFSchema, ScalarValue, TableReference, not_impl_err, substrait_err,
 };
 use datafusion::execution::{FunctionRegistry, SessionState};
 use datafusion::logical_expr::{Expr, Extension, LogicalPlan};
-use std::sync::Arc;
+use std::sync::{Arc, RwLock};
 use substrait::proto;
 use substrait::proto::expression as substrait_expression;
 use substrait::proto::expression::{
@@ -39,9 +39,9 @@ use substrait::proto::expression::{
     SingularOrList, SwitchExpression, WindowFunction,
 };
 use substrait::proto::{
-    r#type, AggregateRel, ConsistentPartitionWindowRel, CrossRel, DynamicParameter,
-    ExchangeRel, Expression, ExtensionLeafRel, ExtensionMultiRel, ExtensionSingleRel,
-    FetchRel, FilterRel, JoinRel, ProjectRel, ReadRel, Rel, SetRel, SortRel,
+    AggregateRel, ConsistentPartitionWindowRel, CrossRel, DynamicParameter, ExchangeRel,
+    Expression, ExtensionLeafRel, ExtensionMultiRel, ExtensionSingleRel, FetchRel,
+    FilterRel, JoinRel, ProjectRel, ReadRel, Rel, SetRel, SortRel, r#type,
 };
 
 #[async_trait]
@@ -141,7 +141,15 @@ use substrait::proto::{
 ///
 ///     // and user-defined literals
 ///     fn consume_user_defined_literal(&self, literal: &proto::expression::literal::UserDefined) -> Result<ScalarValue> {
-///         let type_string = self.extensions.types.get(&literal.type_reference).unwrap();
+///         // extract type_reference from the new TypeAnchorType oneof
+///         let type_ref = match literal.type_anchor_type {
+///             Some(proto::expression::literal::user_defined::TypeAnchorType::TypeReference(r)) => r,
+///             Some(proto::expression::literal::user_defined::TypeAnchorType::TypeAliasReference(_)) => {
+///                 return not_impl_err!("Type alias references are not yet supported")
+///             }
+///             None => 0,
+///         };
+///         let type_string = self.extensions.types.get(&type_ref).unwrap();
 ///         match type_string.as_str() {
 ///             "u!foo" => not_impl_err!("handle foo conversion"),
 ///             "u!bar" => not_impl_err!("handle bar conversion"),
@@ -150,7 +158,6 @@ use substrait::proto::{
 ///     }
 /// }
 /// ```
-///
 pub trait SubstraitConsumer: Send + Sync + Sized {
     async fn resolve_table_ref(
         &self,
@@ -343,10 +350,10 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
 
     async fn consume_nested(
         &self,
-        _expr: &Nested,
-        _input_schema: &DFSchema,
+        expr: &Nested,
+        input_schema: &DFSchema,
     ) -> datafusion::common::Result<Expr> {
-        not_impl_err!("Nested expression not supported")
+        from_nested(self, expr, input_schema).await
     }
 
     async fn consume_enum(
@@ -365,6 +372,26 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
         not_impl_err!("Dynamic Parameter expression not supported")
     }
 
+    // Outer Schema Stack
+    // These methods manage a stack of outer schemas for correlated subquery support.
+    // When entering a subquery, the enclosing query's schema is pushed onto the stack.
+    // Field references with OuterReference root_type use these to resolve columns.
+
+    /// Push an outer schema onto the stack when entering a subquery.
+    fn push_outer_schema(&self, _schema: Arc<DFSchema>) {}
+
+    /// Pop an outer schema from the stack when leaving a subquery.
+    fn pop_outer_schema(&self) {}
+
+    /// Get the outer schema at the given nesting depth.
+    /// `steps_out = 1` is the immediately enclosing query, `steps_out = 2`
+    /// is two levels out, etc. Returns `None` if `steps_out` is 0 or
+    /// exceeds the current nesting depth (the caller should treat this as
+    /// an error in the Substrait plan).
+    fn get_outer_schema(&self, _steps_out: usize) -> Option<Arc<DFSchema>> {
+        None
+    }
+
     // User-Defined Functionality
 
     // The details of extension relations, and how to handle them, are fully up to users to specify.
@@ -425,10 +452,22 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
         &self,
         user_defined_literal: &proto::expression::literal::UserDefined,
     ) -> datafusion::common::Result<ScalarValue> {
-        substrait_err!(
-            "Missing handler for user-defined literals {}",
-            user_defined_literal.type_reference
-        )
+        let type_ref = match user_defined_literal.type_anchor_type {
+            Some(
+                proto::expression::literal::user_defined::TypeAnchorType::TypeReference(
+                    ref_val,
+                ),
+            ) => ref_val,
+            Some(
+                proto::expression::literal::user_defined::TypeAnchorType::TypeAliasReference(_),
+            ) => {
+                return not_impl_err!(
+                    "Type alias references in user-defined literals are not yet supported"
+                )
+            }
+            None => 0,
+        };
+        substrait_err!("Missing handler for user-defined literals {}", type_ref)
     }
 }
 
@@ -438,11 +477,16 @@ pub trait SubstraitConsumer: Send + Sync + Sized {
 pub struct DefaultSubstraitConsumer<'a> {
     pub(super) extensions: &'a Extensions,
     pub(super) state: &'a SessionState,
+    outer_schemas: RwLock<Vec<Arc<DFSchema>>>,
 }
 
 impl<'a> DefaultSubstraitConsumer<'a> {
     pub fn new(extensions: &'a Extensions, state: &'a SessionState) -> Self {
-        DefaultSubstraitConsumer { extensions, state }
+        DefaultSubstraitConsumer {
+            extensions,
+            state,
+            outer_schemas: RwLock::new(Vec::new()),
+        }
     }
 }
 
@@ -466,6 +510,24 @@ impl SubstraitConsumer for DefaultSubstraitConsumer<'_> {
         self.state
     }
 
+    fn push_outer_schema(&self, schema: Arc<DFSchema>) {
+        self.outer_schemas.write().unwrap().push(schema);
+    }
+
+    fn pop_outer_schema(&self) {
+        self.outer_schemas.write().unwrap().pop();
+    }
+
+    fn get_outer_schema(&self, steps_out: usize) -> Option<Arc<DFSchema>> {
+        let schemas = self.outer_schemas.read().unwrap();
+        // steps_out=1 → last element, steps_out=2 → second-to-last, etc.
+        // Returns None for steps_out=0 or steps_out > stack depth.
+        schemas
+            .len()
+            .checked_sub(steps_out)
+            .and_then(|idx| schemas.get(idx).cloned())
+    }
+
     async fn consume_extension_leaf(
         &self,
         rel: &ExtensionLeafRel,
@@ -493,8 +555,8 @@ impl SubstraitConsumer for DefaultSubstraitConsumer<'_> {
             .deserialize_logical_plan(&ext_detail.type_url, &ext_detail.value)?;
         let Some(input_rel) = &rel.input else {
             return substrait_err!(
-                    "ExtensionSingleRel missing input rel, try using ExtensionLeafRel instead"
-                );
+                "ExtensionSingleRel missing input rel, try using ExtensionLeafRel instead"
+            );
         };
         let input_plan = self.consume_rel(input_rel).await?;
         let plan = plan.with_exprs_and_inputs(plan.expressions(), vec![input_plan])?;
@@ -521,3 +583,79 @@ impl SubstraitConsumer for DefaultSubstraitConsumer<'_> {
         Ok(LogicalPlan::Extension(Extension { node: plan }))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::logical_plan::consumer::utils::tests::test_consumer;
+    use datafusion::arrow::datatypes::{DataType, Field, Schema};
+
+    fn make_schema(fields: &[(&str, DataType)]) -> Arc<DFSchema> {
+        let arrow_fields: Vec<Field> = fields
+            .iter()
+            .map(|(name, dt)| Field::new(*name, dt.clone(), true))
+            .collect();
+        Arc::new(
+            DFSchema::try_from(Schema::new(arrow_fields))
+                .expect("failed to create schema"),
+        )
+    }
+
+    #[test]
+    fn test_get_outer_schema_empty_stack() {
+        let consumer = test_consumer();
+
+        // No schemas pushed — any steps_out should return None
+        assert!(consumer.get_outer_schema(0).is_none());
+        assert!(consumer.get_outer_schema(1).is_none());
+        assert!(consumer.get_outer_schema(2).is_none());
+    }
+
+    #[test]
+    fn test_get_outer_schema_single_level() {
+        let consumer = test_consumer();
+
+        let schema_a = make_schema(&[("a", DataType::Int64)]);
+        consumer.push_outer_schema(Arc::clone(&schema_a));
+
+        // steps_out=1 returns the one pushed schema
+        let result = consumer.get_outer_schema(1).unwrap();
+        assert_eq!(result.fields().len(), 1);
+        assert_eq!(result.fields()[0].name(), "a");
+
+        // steps_out=0 and steps_out=2 are out of range
+        assert!(consumer.get_outer_schema(0).is_none());
+        assert!(consumer.get_outer_schema(2).is_none());
+
+        consumer.pop_outer_schema();
+        assert!(consumer.get_outer_schema(1).is_none());
+    }
+
+    #[test]
+    fn test_get_outer_schema_nested() {
+        let consumer = test_consumer();
+
+        let schema_a = make_schema(&[("a", DataType::Int64)]);
+        let schema_b = make_schema(&[("b", DataType::Utf8)]);
+
+        consumer.push_outer_schema(Arc::clone(&schema_a));
+        consumer.push_outer_schema(Arc::clone(&schema_b));
+
+        // steps_out=1 returns the most recent (schema_b)
+        let result = consumer.get_outer_schema(1).unwrap();
+        assert_eq!(result.fields()[0].name(), "b");
+
+        // steps_out=2 returns the grandparent (schema_a)
+        let result = consumer.get_outer_schema(2).unwrap();
+        assert_eq!(result.fields()[0].name(), "a");
+
+        // steps_out=3 exceeds depth
+        assert!(consumer.get_outer_schema(3).is_none());
+
+        // Pop one level — now steps_out=1 returns schema_a
+        consumer.pop_outer_schema();
+        let result = consumer.get_outer_schema(1).unwrap();
+        assert_eq!(result.fields()[0].name(), "a");
+        assert!(consumer.get_outer_schema(2).is_none());
+    }
+}
diff --git a/datafusion/substrait/src/logical_plan/consumer/types.rs b/datafusion/substrait/src/logical_plan/consumer/types.rs
index 7bc30e433d868..2493ac1e5ad57 100644
--- a/datafusion/substrait/src/logical_plan/consumer/types.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/types.rs
@@ -15,28 +15,40 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use super::utils::{next_struct_field_name, DEFAULT_TIMEZONE};
 use super::SubstraitConsumer;
-#[allow(deprecated)]
+use super::utils::{DEFAULT_TIMEZONE, from_substrait_precision, next_struct_field_name};
+#[expect(deprecated)]
 use crate::variation_const::{
     DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF,
     DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
-    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
+    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_INTERVAL_DAY_TYPE_VARIATION_REF,
+    DEFAULT_MAP_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
+    DICTIONARY_MAP_TYPE_VARIATION_REF, DURATION_INTERVAL_DAY_TYPE_VARIATION_REF,
     INTERVAL_DAY_TIME_TYPE_REF, INTERVAL_MONTH_DAY_NANO_TYPE_NAME,
     INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_REF,
-    LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
+    LARGE_CONTAINER_TYPE_VARIATION_REF, TIME_32_TYPE_VARIATION_REF,
+    TIME_64_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
     TIMESTAMP_MILLI_TYPE_VARIATION_REF, TIMESTAMP_NANO_TYPE_VARIATION_REF,
     TIMESTAMP_SECOND_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
     VIEW_CONTAINER_TYPE_VARIATION_REF,
 };
+use crate::variation_const::{FLOAT_16_TYPE_NAME, NULL_TYPE_NAME};
 use datafusion::arrow::datatypes::{
-    DataType, Field, Fields, IntervalUnit, Schema, TimeUnit,
+    DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit,
 };
+use datafusion::common::datatype::DataTypeExt;
 use datafusion::common::{
-    not_impl_err, substrait_datafusion_err, substrait_err, DFSchema,
+    DFSchema, not_impl_err, substrait_datafusion_err, substrait_err,
 };
 use std::sync::Arc;
-use substrait::proto::{r#type, NamedStruct, Type};
+use substrait::proto::{NamedStruct, Type, r#type};
+
+pub(crate) fn field_from_substrait_type_without_names(
+    consumer: &impl SubstraitConsumer,
+    dt: &Type,
+) -> datafusion::common::Result<FieldRef> {
+    Ok(from_substrait_type_without_names(consumer, dt)?.into_nullable_field_ref())
+}
 
 pub(crate) fn from_substrait_type_without_names(
     consumer: &impl SubstraitConsumer,
@@ -45,6 +57,16 @@ pub(crate) fn from_substrait_type_without_names(
     from_substrait_type(consumer, dt, &[], &mut 0)
 }
 
+pub fn field_from_substrait_type(
+    consumer: &impl SubstraitConsumer,
+    dt: &Type,
+    dfs_names: &[String],
+    name_idx: &mut usize,
+) -> datafusion::common::Result<FieldRef> {
+    // We could add nullability here now that we are returning a Field
+    Ok(from_substrait_type(consumer, dt, dfs_names, name_idx)?.into_nullable_field_ref())
+}
+
 pub fn from_substrait_type(
     consumer: &impl SubstraitConsumer,
     dt: &Type,
@@ -84,9 +106,10 @@ pub fn from_substrait_type(
             },
             r#type::Kind::Fp32(_) => Ok(DataType::Float32),
             r#type::Kind::Fp64(_) => Ok(DataType::Float64),
+            #[expect(deprecated)]
             r#type::Kind::Timestamp(ts) => {
                 // Kept for backwards compatibility, new plans should use PrecisionTimestamp(Tz) instead
-                #[allow(deprecated)]
+                #[expect(deprecated)]
                 match ts.type_variation_reference {
                     TIMESTAMP_SECOND_TYPE_VARIATION_REF => {
                         Ok(DataType::Timestamp(TimeUnit::Second, None))
@@ -106,29 +129,24 @@ pub fn from_substrait_type(
                 }
             }
             r#type::Kind::PrecisionTimestamp(pts) => {
-                let unit = match pts.precision {
-                    0 => Ok(TimeUnit::Second),
-                    3 => Ok(TimeUnit::Millisecond),
-                    6 => Ok(TimeUnit::Microsecond),
-                    9 => Ok(TimeUnit::Nanosecond),
-                    p => not_impl_err!(
-                        "Unsupported Substrait precision {p} for PrecisionTimestamp"
-                    ),
-                }?;
+                let unit = from_substrait_precision(pts.precision, "PrecisionTimestamp")?;
                 Ok(DataType::Timestamp(unit, None))
             }
             r#type::Kind::PrecisionTimestampTz(pts) => {
-                let unit = match pts.precision {
-                    0 => Ok(TimeUnit::Second),
-                    3 => Ok(TimeUnit::Millisecond),
-                    6 => Ok(TimeUnit::Microsecond),
-                    9 => Ok(TimeUnit::Nanosecond),
-                    p => not_impl_err!(
-                        "Unsupported Substrait precision {p} for PrecisionTimestampTz"
-                    ),
-                }?;
+                let unit =
+                    from_substrait_precision(pts.precision, "PrecisionTimestampTz")?;
                 Ok(DataType::Timestamp(unit, Some(DEFAULT_TIMEZONE.into())))
             }
+            r#type::Kind::PrecisionTime(pt) => {
+                let time_unit = from_substrait_precision(pt.precision, "PrecisionTime")?;
+                match pt.type_variation_reference {
+                    TIME_32_TYPE_VARIATION_REF => Ok(DataType::Time32(time_unit)),
+                    TIME_64_TYPE_VARIATION_REF => Ok(DataType::Time64(time_unit)),
+                    v => not_impl_err!(
+                        "Unsupported Substrait type variation {v} of type {s_kind:?}"
+                    ),
+                }
+            }
             r#type::Kind::Date(date) => match date.type_variation_reference {
                 DATE_32_TYPE_VARIATION_REF => Ok(DataType::Date32),
                 DATE_64_TYPE_VARIATION_REF => Ok(DataType::Date64),
@@ -180,24 +198,32 @@ pub fn from_substrait_type(
                 let value_type = map.value.as_ref().ok_or_else(|| {
                     substrait_datafusion_err!("Map type must have value type")
                 })?;
-                let key_field = Arc::new(Field::new(
-                    "key",
-                    from_substrait_type(consumer, key_type, dfs_names, name_idx)?,
-                    false,
-                ));
-                let value_field = Arc::new(Field::new(
-                    "value",
-                    from_substrait_type(consumer, value_type, dfs_names, name_idx)?,
-                    true,
-                ));
-                Ok(DataType::Map(
-                    Arc::new(Field::new_struct(
-                        "entries",
-                        [key_field, value_field],
-                        false, // The inner map field is always non-nullable (Arrow #1697),
+                let key_type =
+                    from_substrait_type(consumer, key_type, dfs_names, name_idx)?;
+                let value_type =
+                    from_substrait_type(consumer, value_type, dfs_names, name_idx)?;
+
+                match map.type_variation_reference {
+                    DEFAULT_MAP_TYPE_VARIATION_REF => {
+                        let key_field = Arc::new(Field::new("key", key_type, false));
+                        let value_field = Arc::new(Field::new("value", value_type, true));
+                        Ok(DataType::Map(
+                            Arc::new(Field::new_struct(
+                                "entries",
+                                [key_field, value_field],
+                                false, // The inner map field is always non-nullable (Arrow #1697),
+                            )),
+                            false, // whether keys are sorted
+                        ))
+                    }
+                    DICTIONARY_MAP_TYPE_VARIATION_REF => Ok(DataType::Dictionary(
+                        Box::new(key_type),
+                        Box::new(value_type),
                     )),
-                    false, // whether keys are sorted
-                ))
+                    v => not_impl_err!(
+                        "Unsupported Substrait type variation {v} of type {s_kind:?}"
+                    ),
+                }
             }
             r#type::Kind::Decimal(d) => match d.type_variation_reference {
                 DECIMAL_128_TYPE_VARIATION_REF => {
@@ -213,7 +239,23 @@ pub fn from_substrait_type(
             r#type::Kind::IntervalYear(_) => {
                 Ok(DataType::Interval(IntervalUnit::YearMonth))
             }
-            r#type::Kind::IntervalDay(_) => Ok(DataType::Interval(IntervalUnit::DayTime)),
+            r#type::Kind::IntervalDay(i) => match i.type_variation_reference {
+                DEFAULT_INTERVAL_DAY_TYPE_VARIATION_REF => {
+                    Ok(DataType::Interval(IntervalUnit::DayTime))
+                }
+                DURATION_INTERVAL_DAY_TYPE_VARIATION_REF => {
+                    let duration_unit = match i.precision {
+                        Some(p) => from_substrait_precision(p, "Duration"),
+                        None => {
+                            not_impl_err!("Missing Substrait precision for Duration")
+                        }
+                    }?;
+                    Ok(DataType::Duration(duration_unit))
+                }
+                v => not_impl_err!(
+                    "Unsupported Substrait type variation {v} of type {s_kind:?}"
+                ),
+            },
             r#type::Kind::IntervalCompound(_) => {
                 Ok(DataType::Interval(IntervalUnit::MonthDayNano))
             }
@@ -225,18 +267,22 @@ pub fn from_substrait_type(
                 // TODO: remove the code below once the producer has been updated
                 if let Some(name) = consumer.get_extensions().types.get(&u.type_reference)
                 {
-                    #[allow(deprecated)]
+                    #[expect(deprecated)]
                     match name.as_ref() {
                         // Kept for backwards compatibility, producers should use IntervalCompound instead
-                        INTERVAL_MONTH_DAY_NANO_TYPE_NAME => Ok(DataType::Interval(IntervalUnit::MonthDayNano)),
+                        INTERVAL_MONTH_DAY_NANO_TYPE_NAME => {
+                            Ok(DataType::Interval(IntervalUnit::MonthDayNano))
+                        }
+                        FLOAT_16_TYPE_NAME => Ok(DataType::Float16),
+                        NULL_TYPE_NAME => Ok(DataType::Null),
                         _ => not_impl_err!(
-                                "Unsupported Substrait user defined type with ref {} and variation {}",
-                                u.type_reference,
-                                u.type_variation_reference
-                            ),
+                            "Unsupported Substrait user defined type with ref {} and variation {}",
+                            u.type_reference,
+                            u.type_variation_reference
+                        ),
                     }
                 } else {
-                    #[allow(deprecated)]
+                    #[expect(deprecated)]
                     match u.type_reference {
                         // Kept for backwards compatibility, producers should use IntervalYear instead
                         INTERVAL_YEAR_MONTH_TYPE_REF => {
@@ -251,10 +297,10 @@ pub fn from_substrait_type(
                             Ok(DataType::Interval(IntervalUnit::MonthDayNano))
                         }
                         _ => not_impl_err!(
-                        "Unsupported Substrait user defined type with ref {} and variation {}",
-                        u.type_reference,
-                        u.type_variation_reference
-                    ),
+                            "Unsupported Substrait user defined type with ref {} and variation {}",
+                            u.type_reference,
+                            u.type_variation_reference
+                        ),
                     }
                 }
             }
@@ -282,7 +328,7 @@ pub fn from_substrait_named_struct(
         })?,
         &base_schema.names,
         &mut name_idx,
-    );
+    )?;
     if name_idx != base_schema.names.len() {
         return substrait_err!(
             "Names list must match exactly to nested schema, but found {} uses for {} names",
@@ -290,7 +336,7 @@ pub fn from_substrait_named_struct(
             base_schema.names.len()
         );
     }
-    DFSchema::try_from(Schema::new(fields?))
+    DFSchema::try_from(Schema::new(fields))
 }
 
 fn from_substrait_struct_type(
diff --git a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs
index a267971ff8d3c..59cdf4a8fc93f 100644
--- a/datafusion/substrait/src/logical_plan/consumer/utils.rs
+++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs
@@ -16,18 +16,19 @@
 // under the License.
 
 use crate::logical_plan::consumer::SubstraitConsumer;
-use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema};
+use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit, UnionFields};
 use datafusion::common::{
-    not_impl_err, substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef,
-    TableReference,
+    DFSchema, DFSchemaRef, exec_err, not_impl_err, substrait_datafusion_err,
+    substrait_err,
 };
 use datafusion::logical_expr::expr::Sort;
-use datafusion::logical_expr::{Cast, Expr, ExprSchemable, LogicalPlanBuilder};
+use datafusion::logical_expr::{Cast, Expr, ExprSchemable};
+use datafusion::sql::TableReference;
 use std::collections::HashSet;
 use std::sync::Arc;
+use substrait::proto::SortField;
 use substrait::proto::sort_field::SortDirection;
 use substrait::proto::sort_field::SortKind::{ComparisonFunctionReference, Direction};
-use substrait::proto::SortField;
 
 // Substrait PrecisionTimestampTz indicates that the timestamp is relative to UTC, which
 // is the same as the expectation for any non-empty timezone in DF, so any non-empty timezone
@@ -36,33 +37,6 @@ use substrait::proto::SortField;
 // https://github.com/apache/arrow-rs/blob/ee5694078c86c8201549654246900a4232d531a9/arrow-cast/src/cast/mod.rs#L1749).
 pub(super) const DEFAULT_TIMEZONE: &str = "UTC";
 
-/// (Re)qualify the sides of a join if needed, i.e. if the columns from one side would otherwise
-/// conflict with the columns from the other.
-/// Substrait doesn't currently allow specifying aliases, neither for columns nor for tables. For
-/// Substrait the names don't matter since it only refers to columns by indices, however DataFusion
-/// requires columns to be uniquely identifiable, in some places (see e.g. DFSchema::check_names).
-pub(super) fn requalify_sides_if_needed(
-    left: LogicalPlanBuilder,
-    right: LogicalPlanBuilder,
-) -> datafusion::common::Result<(LogicalPlanBuilder, LogicalPlanBuilder)> {
-    let left_cols = left.schema().columns();
-    let right_cols = right.schema().columns();
-    if left_cols.iter().any(|l| {
-        right_cols.iter().any(|r| {
-            l == r || (l.name == r.name && (l.relation.is_none() || r.relation.is_none()))
-        })
-    }) {
-        // These names have no connection to the original plan, but they'll make the columns
-        // (mostly) unique.
-        Ok((
-            left.alias(TableReference::bare("left"))?,
-            right.alias(TableReference::bare("right"))?,
-        ))
-    } else {
-        Ok((left, right))
-    }
-}
-
 pub(super) fn next_struct_field_name(
     column_idx: usize,
     dfs_names: &[String],
@@ -81,98 +55,169 @@ pub(super) fn next_struct_field_name(
     }
 }
 
-pub(super) fn rename_field(
+/// Traverse through the field, renaming the provided field itself and all its inner struct fields.
+pub fn rename_field(
     field: &Field,
     dfs_names: &Vec<String>,
     unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}"
     name_idx: &mut usize,        // Index into dfs_names
-    rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name
 ) -> datafusion::common::Result<Field> {
-    let name = if rename_self {
-        next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?
-    } else {
-        field.name().to_string()
-    };
-    match field.data_type() {
+    let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?;
+    rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx)
+}
+
+/// Rename the field's data type but not the field itself.
+pub fn rename_fields_data_type(
+    field: Field,
+    dfs_names: &Vec<String>,
+    name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<Field> {
+    let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?;
+    Ok(field.with_data_type(dt))
+}
+
+/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields.
+pub fn rename_data_type(
+    data_type: &DataType,
+    dfs_names: &Vec<String>,
+    name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<DataType> {
+    match data_type {
         DataType::Struct(children) => {
             let children = children
                 .iter()
                 .enumerate()
-                .map(|(child_idx, f)| {
-                    rename_field(
-                        f.as_ref(),
-                        dfs_names,
-                        child_idx,
-                        name_idx,
-                        /*rename_self=*/ true,
-                    )
+                .map(|(field_idx, f)| {
+                    rename_field(f.as_ref(), dfs_names, field_idx, name_idx)
                 })
                 .collect::<datafusion::common::Result<_>>()?;
-            Ok(field
-                .to_owned()
-                .with_name(name)
-                .with_data_type(DataType::Struct(children)))
+            Ok(DataType::Struct(children))
         }
-        DataType::List(inner) => {
-            let renamed_inner = rename_field(
-                inner.as_ref(),
+        DataType::List(inner) => Ok(DataType::List(Arc::new(rename_fields_data_type(
+            inner.as_ref().to_owned(),
+            dfs_names,
+            name_idx,
+        )?))),
+        DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new(
+            rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?,
+        ))),
+        DataType::ListView(inner) => Ok(DataType::ListView(Arc::new(
+            rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?,
+        ))),
+        DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new(
+            rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?,
+        ))),
+        DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList(
+            Arc::new(rename_fields_data_type(
+                inner.as_ref().to_owned(),
                 dfs_names,
-                0,
                 name_idx,
-                /*rename_self=*/ false,
-            )?;
-            Ok(field
-                .to_owned()
-                .with_data_type(DataType::List(FieldRef::new(renamed_inner)))
-                .with_name(name))
+            )?),
+            *len,
+        )),
+        DataType::Map(entries, sorted) => {
+            let entries_data_type = match entries.data_type() {
+                DataType::Struct(fields) => {
+                    // This should be two fields, normally "key" and "value", but not guaranteed
+                    let fields = fields
+                        .iter()
+                        .map(|f| {
+                            rename_fields_data_type(
+                                f.as_ref().to_owned(),
+                                dfs_names,
+                                name_idx,
+                            )
+                        })
+                        .collect::<datafusion::common::Result<_>>()?;
+                    Ok(DataType::Struct(fields))
+                }
+                _ => exec_err!("Expected map type to contain an inner struct type"),
+            }?;
+            Ok(DataType::Map(
+                Arc::new(
+                    entries
+                        .as_ref()
+                        .to_owned()
+                        .with_data_type(entries_data_type),
+                ),
+                *sorted,
+            ))
+        }
+        DataType::Dictionary(key_type, value_type) => {
+            // Dicts probably shouldn't contain structs, but support them just in case one does
+            Ok(DataType::Dictionary(
+                Box::new(rename_data_type(key_type, dfs_names, name_idx)?),
+                Box::new(rename_data_type(value_type, dfs_names, name_idx)?),
+            ))
         }
-        DataType::LargeList(inner) => {
-            let renamed_inner = rename_field(
-                inner.as_ref(),
+        DataType::RunEndEncoded(run_ends_field, values_field) => {
+            // At least the run_ends_field shouldn't contain names (since it should be i16/i32/i64),
+            // but we'll try renaming its datatype just in case.
+            let run_ends_field = rename_fields_data_type(
+                run_ends_field.as_ref().clone(),
                 dfs_names,
-                0,
                 name_idx,
-                /*rename_self= */ false,
             )?;
-            Ok(field
-                .to_owned()
-                .with_data_type(DataType::LargeList(FieldRef::new(renamed_inner)))
-                .with_name(name))
+            let values_field = rename_fields_data_type(
+                values_field.as_ref().clone(),
+                dfs_names,
+                name_idx,
+            )?;
+
+            Ok(DataType::RunEndEncoded(
+                Arc::new(run_ends_field),
+                Arc::new(values_field),
+            ))
         }
-        DataType::Map(inner, sorted) => match inner.data_type() {
-            DataType::Struct(key_and_value) if key_and_value.len() == 2 => {
-                let renamed_keys = rename_field(
-                    key_and_value[0].as_ref(),
-                    dfs_names,
-                    0,
-                    name_idx,
-                    /*rename_self=*/ false,
-                )?;
-                let renamed_values = rename_field(
-                    key_and_value[1].as_ref(),
-                    dfs_names,
-                    0,
-                    name_idx,
-                    /*rename_self=*/ false,
-                )?;
-                Ok(field
-                    .to_owned()
-                    .with_data_type(DataType::Map(
-                        Arc::new(Field::new(
-                            inner.name(),
-                            DataType::Struct(Fields::from(vec![
-                                renamed_keys,
-                                renamed_values,
-                            ])),
-                            inner.is_nullable(),
-                        )),
-                        *sorted,
+        DataType::Union(fields, mode) => {
+            let fields = fields
+                .iter()
+                .map(|(i, f)| {
+                    Ok((
+                        i,
+                        Arc::new(rename_fields_data_type(
+                            f.as_ref().clone(),
+                            dfs_names,
+                            name_idx,
+                        )?),
                     ))
-                    .with_name(name))
-            }
-            _ => substrait_err!("Map fields must contain a Struct with exactly 2 fields"),
-        },
-        _ => Ok(field.to_owned().with_name(name)),
+                })
+                .collect::<datafusion::common::Result<UnionFields>>()?;
+            Ok(DataType::Union(fields, *mode))
+        }
+        // Explicitly listing the rest (which can not contain inner fields needing renaming)
+        // to ensure we're exhaustive
+        DataType::Null
+        | DataType::Boolean
+        | DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Float16
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Timestamp(_, _)
+        | DataType::Date32
+        | DataType::Date64
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Duration(_)
+        | DataType::Interval(_)
+        | DataType::Binary
+        | DataType::FixedSizeBinary(_)
+        | DataType::LargeBinary
+        | DataType::BinaryView
+        | DataType::Utf8
+        | DataType::LargeUtf8
+        | DataType::Utf8View
+        | DataType::Decimal32(_, _)
+        | DataType::Decimal64(_, _)
+        | DataType::Decimal128(_, _)
+        | DataType::Decimal256(_, _) => Ok(data_type.clone()),
     }
 }
 
@@ -190,13 +235,8 @@ pub(super) fn make_renamed_schema(
         .iter()
         .enumerate()
         .map(|(field_idx, (q, f))| {
-            let renamed_f = rename_field(
-                f.as_ref(),
-                dfs_names,
-                field_idx,
-                &mut name_idx,
-                /*rename_self=*/ true,
-            )?;
+            let renamed_f =
+                rename_field(f.as_ref(), dfs_names, field_idx, &mut name_idx)?;
             Ok((q.cloned(), renamed_f))
         })
         .collect::<datafusion::common::Result<Vec<_>>>()?
@@ -207,7 +247,8 @@ pub(super) fn make_renamed_schema(
         return substrait_err!(
             "Names list must match exactly to nested schema, but found {} uses for {} names",
             name_idx,
-            dfs_names.len());
+            dfs_names.len()
+        );
     }
 
     DFSchema::from_field_specific_qualified_schema(
@@ -319,35 +360,71 @@ fn compatible_nullabilities(
 }
 
 pub(super) struct NameTracker {
-    seen_names: HashSet<String>,
-}
-
-pub(super) enum NameTrackerStatus {
-    NeverSeen,
-    SeenBefore,
+    /// Tracks seen schema names (from expr.schema_name()).
+    /// Used to detect duplicates that would fail validate_unique_names.
+    seen_schema_names: HashSet<String>,
+    /// Tracks column names that have been seen with a qualifier.
+    /// Used to detect ambiguous references (qualified + unqualified with same name).
+    qualified_names: HashSet<String>,
+    /// Tracks column names that have been seen without a qualifier.
+    /// Used to detect ambiguous references.
+    unqualified_names: HashSet<String>,
 }
 
 impl NameTracker {
     pub(super) fn new() -> Self {
         NameTracker {
-            seen_names: HashSet::default(),
+            seen_schema_names: HashSet::default(),
+            qualified_names: HashSet::default(),
+            unqualified_names: HashSet::default(),
         }
     }
-    pub(super) fn get_unique_name(
-        &mut self,
-        name: String,
-    ) -> (String, NameTrackerStatus) {
-        match self.seen_names.insert(name.clone()) {
-            true => (name, NameTrackerStatus::NeverSeen),
-            false => {
-                let mut counter = 0;
-                loop {
-                    let candidate_name = format!("{name}__temp__{counter}");
-                    if self.seen_names.insert(candidate_name.clone()) {
-                        return (candidate_name, NameTrackerStatus::SeenBefore);
-                    }
-                    counter += 1;
-                }
+
+    /// Check if the expression would cause a conflict either in:
+    /// 1. validate_unique_names (duplicate schema_name)
+    /// 2. DFSchema::check_names (ambiguous reference)
+    fn would_conflict(&self, expr: &Expr) -> bool {
+        let (qualifier, name) = expr.qualified_name();
+        let schema_name = expr.schema_name().to_string();
+        self.would_conflict_inner((qualifier, &name), &schema_name)
+    }
+
+    fn would_conflict_inner(
+        &self,
+        qualified_name: (Option<TableReference>, &str),
+        schema_name: &str,
+    ) -> bool {
+        // Check for duplicate schema_name (would fail validate_unique_names)
+        if self.seen_schema_names.contains(schema_name) {
+            return true;
+        }
+
+        // Check for ambiguous reference (would fail DFSchema::check_names)
+        // This happens when a qualified field and unqualified field have the same name
+        let (qualifier, name) = qualified_name;
+        match qualifier {
+            Some(_) => {
+                // Adding a qualified name - conflicts if unqualified version exists
+                self.unqualified_names.contains(name)
+            }
+            None => {
+                // Adding an unqualified name - conflicts if qualified version exists
+                self.qualified_names.contains(name)
+            }
+        }
+    }
+
+    fn insert(&mut self, expr: &Expr) {
+        let schema_name = expr.schema_name().to_string();
+        self.seen_schema_names.insert(schema_name);
+
+        let (qualifier, name) = expr.qualified_name();
+        match qualifier {
+            Some(_) => {
+                self.qualified_names.insert(name);
+            }
+            None => {
+                self.unqualified_names.insert(name);
             }
         }
     }
@@ -356,10 +433,25 @@ impl NameTracker {
         &mut self,
         expr: Expr,
     ) -> datafusion::common::Result<Expr> {
-        match self.get_unique_name(expr.name_for_alias()?) {
-            (_, NameTrackerStatus::NeverSeen) => Ok(expr),
-            (name, NameTrackerStatus::SeenBefore) => Ok(expr.alias(name)),
+        if !self.would_conflict(&expr) {
+            self.insert(&expr);
+            return Ok(expr);
         }
+
+        // Name collision - need to generate a unique alias
+        let schema_name = expr.schema_name().to_string();
+        let mut counter = 0;
+        let candidate_name = loop {
+            let candidate_name = format!("{schema_name}__temp__{counter}");
+            // .alias always produces an unqualified name so check for conflicts accordingly.
+            if !self.would_conflict_inner((None, &candidate_name), &candidate_name) {
+                break candidate_name;
+            }
+            counter += 1;
+        };
+        let candidate_expr = expr.alias(&candidate_name);
+        self.insert(&candidate_expr);
+        Ok(candidate_expr)
     }
 }
 
@@ -412,15 +504,31 @@ pub async fn from_substrait_sorts(
     Ok(sorts)
 }
 
+pub(crate) fn from_substrait_precision(
+    precision: i32,
+    type_name: &str,
+) -> datafusion::common::Result<TimeUnit> {
+    match precision {
+        0 => Ok(TimeUnit::Second),
+        3 => Ok(TimeUnit::Millisecond),
+        6 => Ok(TimeUnit::Microsecond),
+        9 => Ok(TimeUnit::Nanosecond),
+        precision => {
+            not_impl_err!("Unsupported Substrait precision {precision}, for {type_name}")
+        }
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
-    use super::make_renamed_schema;
+    use super::{NameTracker, make_renamed_schema};
     use crate::extensions::Extensions;
     use crate::logical_plan::consumer::DefaultSubstraitConsumer;
     use datafusion::arrow::datatypes::{DataType, Field};
     use datafusion::common::DFSchema;
     use datafusion::error::Result;
     use datafusion::execution::SessionState;
+    use datafusion::logical_expr::{Expr, col};
     use datafusion::prelude::SessionContext;
     use datafusion::sql::TableReference;
     use std::collections::HashMap;
@@ -473,17 +581,29 @@ pub(crate) mod tests {
             ),
             (
                 Some(table_ref.clone()),
-                Arc::new(Field::new_map(
+                Arc::new(Field::new_large_list(
                     "7",
+                    Arc::new(Field::new_struct(
+                        "item",
+                        vec![Field::new("8", DataType::Int32, false)],
+                        false,
+                    )),
+                    false,
+                )),
+            ),
+            (
+                Some(table_ref.clone()),
+                Arc::new(Field::new_map(
+                    "9",
                     "entries",
                     Arc::new(Field::new_struct(
                         "keys",
-                        vec![Field::new("8", DataType::Int32, false)],
+                        vec![Field::new("10", DataType::Int32, false)],
                         false,
                     )),
                     Arc::new(Field::new_struct(
                         "values",
-                        vec![Field::new("9", DataType::Int32, false)],
+                        vec![Field::new("11", DataType::Int32, false)],
                         false,
                     )),
                     false,
@@ -504,17 +624,19 @@ pub(crate) mod tests {
             "h".to_string(),
             "i".to_string(),
             "j".to_string(),
+            "k".to_string(),
+            "l".to_string(),
         ];
         let renamed_schema = make_renamed_schema(&schema, &dfs_names)?;
 
-        assert_eq!(renamed_schema.fields().len(), 4);
+        assert_eq!(renamed_schema.fields().len(), 5);
         assert_eq!(
-            *renamed_schema.field(0),
-            Field::new("a", DataType::Int32, false)
+            renamed_schema.field(0),
+            &Arc::new(Field::new("a", DataType::Int32, false))
         );
         assert_eq!(
-            *renamed_schema.field(1),
-            Field::new_struct(
+            renamed_schema.field(1),
+            &Arc::new(Field::new_struct(
                 "b",
                 vec![
                     Field::new("c", DataType::Int32, false),
@@ -525,11 +647,11 @@ pub(crate) mod tests {
                     )
                 ],
                 false,
-            )
+            ))
         );
         assert_eq!(
-            *renamed_schema.field(2),
-            Field::new_list(
+            renamed_schema.field(2),
+            &Arc::new(Field::new_list(
                 "f",
                 Arc::new(Field::new_struct(
                     "item",
@@ -537,27 +659,158 @@ pub(crate) mod tests {
                     false,
                 )),
                 false,
-            )
+            ))
         );
         assert_eq!(
-            *renamed_schema.field(3),
-            Field::new_map(
+            renamed_schema.field(3),
+            &Arc::new(Field::new_large_list(
                 "h",
+                Arc::new(Field::new_struct(
+                    "item",
+                    vec![Field::new("i", DataType::Int32, false)],
+                    false,
+                )),
+                false,
+            ))
+        );
+        assert_eq!(
+            renamed_schema.field(4),
+            &Arc::new(Field::new_map(
+                "j",
                 "entries",
                 Arc::new(Field::new_struct(
                     "keys",
-                    vec![Field::new("i", DataType::Int32, false)],
+                    vec![Field::new("k", DataType::Int32, false)],
                     false,
                 )),
                 Arc::new(Field::new_struct(
                     "values",
-                    vec![Field::new("j", DataType::Int32, false)],
+                    vec![Field::new("l", DataType::Int32, false)],
                     false,
                 )),
                 false,
                 false,
-            )
+            ))
         );
         Ok(())
     }
+
+    #[test]
+    fn name_tracker_unique_names_pass_through() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First expression should pass through unchanged
+        let expr1 = col("a");
+        let result1 = tracker.get_uniquely_named_expr(expr1.clone())?;
+        assert_eq!(result1, col("a"));
+
+        // Different name should also pass through unchanged
+        let expr2 = col("b");
+        let result2 = tracker.get_uniquely_named_expr(expr2)?;
+        assert_eq!(result2, col("b"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn name_tracker_duplicate_schema_name_gets_alias() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First expression with name "a"
+        let expr1 = col("a");
+        let result1 = tracker.get_uniquely_named_expr(expr1)?;
+        assert_eq!(result1, col("a"));
+
+        // Second expression with same name "a" should get aliased
+        let expr2 = col("a");
+        let result2 = tracker.get_uniquely_named_expr(expr2)?;
+        assert_eq!(result2, col("a").alias("a__temp__0"));
+
+        // Third expression with same name "a" should get a different alias
+        let expr3 = col("a");
+        let result3 = tracker.get_uniquely_named_expr(expr3)?;
+        assert_eq!(result3, col("a").alias("a__temp__1"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn name_tracker_qualified_then_unqualified_conflicts() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First: qualified column "table.a"
+        let qualified_col =
+            Expr::Column(datafusion::common::Column::new(Some("table"), "a"));
+        let result1 = tracker.get_uniquely_named_expr(qualified_col)?;
+        assert_eq!(
+            result1,
+            Expr::Column(datafusion::common::Column::new(Some("table"), "a"))
+        );
+
+        // Second: unqualified column "a" - should conflict (ambiguous reference)
+        let unqualified_col = col("a");
+        let result2 = tracker.get_uniquely_named_expr(unqualified_col)?;
+        // Should be aliased to avoid ambiguous reference
+        assert_eq!(result2, col("a").alias("a__temp__0"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn name_tracker_unqualified_then_qualified_conflicts() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First: unqualified column "a"
+        let unqualified_col = col("a");
+        let result1 = tracker.get_uniquely_named_expr(unqualified_col)?;
+        assert_eq!(result1, col("a"));
+
+        // Second: qualified column "table.a" - should conflict (ambiguous reference)
+        let qualified_col =
+            Expr::Column(datafusion::common::Column::new(Some("table"), "a"));
+        let result2 = tracker.get_uniquely_named_expr(qualified_col)?;
+        // Should be aliased to avoid ambiguous reference
+        assert_eq!(
+            result2,
+            Expr::Column(datafusion::common::Column::new(Some("table"), "a"))
+                .alias("table.a__temp__0")
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn name_tracker_different_qualifiers_no_conflict() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First: qualified column "table1.a"
+        let col1 = Expr::Column(datafusion::common::Column::new(Some("table1"), "a"));
+        let result1 = tracker.get_uniquely_named_expr(col1.clone())?;
+        assert_eq!(result1, col1);
+
+        // Second: qualified column "table2.a" - different qualifier, different schema_name
+        // so should NOT conflict
+        let col2 = Expr::Column(datafusion::common::Column::new(Some("table2"), "a"));
+        let result2 = tracker.get_uniquely_named_expr(col2.clone())?;
+        assert_eq!(result2, col2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn name_tracker_aliased_expressions() -> Result<()> {
+        let mut tracker = NameTracker::new();
+
+        // First: col("x").alias("result")
+        let expr1 = col("x").alias("result");
+        let result1 = tracker.get_uniquely_named_expr(expr1.clone())?;
+        assert_eq!(result1, col("x").alias("result"));
+
+        // Second: col("y").alias("result") - same alias name, should conflict
+        let expr2 = col("y").alias("result");
+        let result2 = tracker.get_uniquely_named_expr(expr2)?;
+        assert_eq!(result2, col("y").alias("result").alias("result__temp__0"));
+
+        Ok(())
+    }
 }
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs b/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs
index 0619b497532d8..3713f8934f19f 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/aggregate_function.rs
@@ -43,14 +43,10 @@ pub fn from_aggregate_function(
                 null_treatment: _null_treatment,
             },
     } = agg_fn;
-    let sorts = if let Some(order_by) = order_by {
-        order_by
-            .iter()
-            .map(|expr| to_substrait_sort_field(producer, expr, schema))
-            .collect::<datafusion::common::Result<Vec<_>>>()?
-    } else {
-        vec![]
-    };
+    let sorts = order_by
+        .iter()
+        .map(|expr| to_substrait_sort_field(producer, expr, schema))
+        .collect::<datafusion::common::Result<Vec<_>>>()?;
     let mut arguments: Vec<FunctionArgument> = vec![];
     for arg in args {
         arguments.push(FunctionArgument {
@@ -58,7 +54,7 @@ pub fn from_aggregate_function(
         });
     }
     let function_anchor = producer.register_function(func.name().to_string());
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     Ok(Measure {
         measure: Some(AggregateFunction {
             function_reference: function_anchor,
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/cast.rs b/datafusion/substrait/src/logical_plan/producer/expr/cast.rs
index b69474f09ee43..2a5a6fe5c3758 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/cast.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/cast.rs
@@ -15,32 +15,32 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{to_substrait_type, SubstraitProducer};
+use crate::logical_plan::producer::{SubstraitProducer, to_substrait_type_from_field};
 use crate::variation_const::DEFAULT_TYPE_VARIATION_REF;
 use datafusion::common::{DFSchemaRef, ScalarValue};
 use datafusion::logical_expr::{Cast, Expr, TryCast};
+use substrait::proto::Expression;
 use substrait::proto::expression::cast::FailureBehavior;
 use substrait::proto::expression::literal::LiteralType;
 use substrait::proto::expression::{Literal, RexType};
-use substrait::proto::Expression;
 
 pub fn from_cast(
     producer: &mut impl SubstraitProducer,
     cast: &Cast,
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<Expression> {
-    let Cast { expr, data_type } = cast;
+    let Cast { expr, field } = cast;
     // since substrait Null must be typed, so if we see a cast(null, dt), we make it a typed null
-    if let Expr::Literal(lit) = expr.as_ref() {
+    if let Expr::Literal(lit, _) = expr.as_ref() {
         // only the untyped(a null scalar value) null literal need this special handling
         // since all other kind of nulls are already typed and can be handled by substrait
         // e.g. null::<Int32Type> or null::<Utf8Type>
-        if matches!(lit, ScalarValue::Null) {
+        if *lit == ScalarValue::Null {
             let lit = Literal {
                 nullable: true,
                 type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
-                literal_type: Some(LiteralType::Null(to_substrait_type(
-                    data_type, true,
+                literal_type: Some(LiteralType::Null(to_substrait_type_from_field(
+                    producer, field,
                 )?)),
             };
             return Ok(Expression {
@@ -51,7 +51,7 @@ pub fn from_cast(
     Ok(Expression {
         rex_type: Some(RexType::Cast(Box::new(
             substrait::proto::expression::Cast {
-                r#type: Some(to_substrait_type(data_type, true)?),
+                r#type: Some(to_substrait_type_from_field(producer, field)?),
                 input: Some(Box::new(producer.handle_expr(expr, schema)?)),
                 failure_behavior: FailureBehavior::ThrowException.into(),
             },
@@ -64,11 +64,11 @@ pub fn from_try_cast(
     cast: &TryCast,
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<Expression> {
-    let TryCast { expr, data_type } = cast;
+    let TryCast { expr, field } = cast;
     Ok(Expression {
         rex_type: Some(RexType::Cast(Box::new(
             substrait::proto::expression::Cast {
-                r#type: Some(to_substrait_type(data_type, true)?),
+                r#type: Some(to_substrait_type_from_field(producer, field)?),
                 input: Some(Box::new(producer.handle_expr(expr, schema)?)),
                 failure_behavior: FailureBehavior::ReturnNull.into(),
             },
@@ -79,7 +79,9 @@ pub fn from_try_cast(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::logical_plan::producer::to_substrait_extended_expr;
+    use crate::logical_plan::producer::{
+        DefaultSubstraitProducer, to_substrait_extended_expr, to_substrait_type,
+    };
     use datafusion::arrow::datatypes::{DataType, Field};
     use datafusion::common::DFSchema;
     use datafusion::execution::SessionStateBuilder;
@@ -92,7 +94,9 @@ mod tests {
         let empty_schema = DFSchemaRef::new(DFSchema::empty());
         let field = Field::new("out", DataType::Int32, false);
 
-        let expr = Expr::Literal(ScalarValue::Null)
+        let mut producer = DefaultSubstraitProducer::new(&state);
+
+        let expr = Expr::Literal(ScalarValue::Null, None)
             .cast_to(&DataType::Int32, &empty_schema)
             .unwrap();
 
@@ -107,7 +111,7 @@ mod tests {
                 nullable: true,
                 type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
                 literal_type: Some(LiteralType::Null(
-                    to_substrait_type(&DataType::Int32, true).unwrap(),
+                    to_substrait_type(&mut producer, &DataType::Int32, true).unwrap(),
                 )),
             };
             let expected = Expression {
@@ -119,7 +123,7 @@ mod tests {
         }
 
         // a typed null should not be folded
-        let expr = Expr::Literal(ScalarValue::Int64(None))
+        let expr = Expr::Literal(ScalarValue::Int64(None), None)
             .cast_to(&DataType::Int32, &empty_schema)
             .unwrap();
 
@@ -131,13 +135,16 @@ mod tests {
             typed_null.referred_expr[0].expr_type.as_ref().unwrap()
         {
             let cast_expr = substrait::proto::expression::Cast {
-                r#type: Some(to_substrait_type(&DataType::Int32, true).unwrap()),
+                r#type: Some(
+                    to_substrait_type(&mut producer, &DataType::Int32, true).unwrap(),
+                ),
                 input: Some(Box::new(Expression {
                     rex_type: Some(RexType::Literal(Literal {
                         nullable: true,
                         type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
                         literal_type: Some(LiteralType::Null(
-                            to_substrait_type(&DataType::Int64, true).unwrap(),
+                            to_substrait_type(&mut producer, &DataType::Int64, true)
+                                .unwrap(),
                         )),
                     })),
                 })),
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/field_reference.rs b/datafusion/substrait/src/logical_plan/producer/expr/field_reference.rs
index d1d80ca545ff2..aa34317a6e292 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/field_reference.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/field_reference.rs
@@ -15,15 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use datafusion::common::{substrait_err, Column, DFSchemaRef};
+use datafusion::common::{Column, DFSchemaRef, substrait_err};
 use datafusion::logical_expr::Expr;
+use substrait::proto::Expression;
 use substrait::proto::expression::field_reference::{
     ReferenceType, RootReference, RootType,
 };
 use substrait::proto::expression::{
-    reference_segment, FieldReference, ReferenceSegment, RexType,
+    FieldReference, ReferenceSegment, RexType, reference_segment,
 };
-use substrait::proto::Expression;
 
 pub fn from_column(
     col: &Column,
@@ -76,6 +76,22 @@ pub(crate) fn try_to_substrait_field_reference(
     }
 }
 
+/// Convert an outer reference column to a Substrait field reference.
+/// Outer reference columns reference columns from an outer query scope in correlated subqueries.
+/// We convert them the same way as regular columns since the subquery plan will be
+/// reconstructed with the proper schema context during consumption.
+pub fn from_outer_reference_column(
+    col: &Column,
+    schema: &DFSchemaRef,
+) -> datafusion::common::Result<Expression> {
+    // OuterReferenceColumn is converted similarly to a regular column reference.
+    // The schema provided should be the schema context in which the outer reference
+    // column appears. During Substrait round-trip, the consumer will reconstruct
+    // the outer reference based on the subquery context.
+    let index = schema.index_of_column(col)?;
+    substrait_field_ref(index)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/if_then.rs b/datafusion/substrait/src/logical_plan/producer/expr/if_then.rs
index a34959ead76de..2c10b26436f50 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/if_then.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/if_then.rs
@@ -18,9 +18,9 @@
 use crate::logical_plan::producer::SubstraitProducer;
 use datafusion::common::DFSchemaRef;
 use datafusion::logical_expr::Case;
+use substrait::proto::Expression;
 use substrait::proto::expression::if_then::IfClause;
 use substrait::proto::expression::{IfThen, RexType};
-use substrait::proto::Expression;
 
 pub fn from_case(
     producer: &mut impl SubstraitProducer,
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/literal.rs b/datafusion/substrait/src/logical_plan/producer/expr/literal.rs
index 31f4866bdc851..bbed7ee9be417 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/literal.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/literal.rs
@@ -15,24 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{to_substrait_type, SubstraitProducer};
+use crate::logical_plan::producer::{SubstraitProducer, to_substrait_type};
 use crate::variation_const::{
     DATE_32_TYPE_VARIATION_REF, DECIMAL_128_TYPE_VARIATION_REF,
-    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
-    LARGE_CONTAINER_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
+    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF, FLOAT_16_TYPE_NAME,
+    LARGE_CONTAINER_TYPE_VARIATION_REF, TIME_32_TYPE_VARIATION_REF,
+    TIME_64_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
     VIEW_CONTAINER_TYPE_VARIATION_REF,
 };
 use datafusion::arrow::array::{Array, GenericListArray, OffsetSizeTrait};
 use datafusion::arrow::temporal_conversions::NANOSECONDS;
-use datafusion::common::{exec_err, not_impl_err, ScalarValue};
+use datafusion::common::{ScalarValue, exec_err, not_impl_err};
 use substrait::proto::expression::literal::interval_day_to_second::PrecisionMode;
 use substrait::proto::expression::literal::map::KeyValue;
 use substrait::proto::expression::literal::{
     Decimal, IntervalCompound, IntervalDayToSecond, IntervalYearToMonth, List,
-    LiteralType, Map, PrecisionTimestamp, Struct,
+    LiteralType, Map, PrecisionTime, PrecisionTimestamp, Struct,
 };
 use substrait::proto::expression::{Literal, RexType};
-use substrait::proto::{r#type, Expression};
+use substrait::proto::{Expression, r#type};
 
 pub fn from_literal(
     producer: &mut impl SubstraitProducer,
@@ -60,6 +61,7 @@ pub(crate) fn to_substrait_literal(
             nullable: true,
             type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
             literal_type: Some(LiteralType::Null(to_substrait_type(
+                producer,
                 &value.data_type(),
                 true,
             )?)),
@@ -93,6 +95,41 @@ pub(crate) fn to_substrait_literal(
             LiteralType::I64(*n as i64),
             UNSIGNED_INTEGER_TYPE_VARIATION_REF,
         ),
+        ScalarValue::Float16(Some(f)) => {
+            // Rules for encoding fp16 Substrait literals are defined as part of Arrow here:
+            //
+            // https://github.com/apache/arrow/blame/bab558061696ddc1841148d6210424b12923d48e/format/substrait/extension_types.yaml#L112
+            //
+            // fp16 literals are encoded as user defined literals with
+            // a google.protobuf.UInt32Value message where the lower 16 bits are
+            // the fp16 value.
+            let type_anchor = producer.register_type(FLOAT_16_TYPE_NAME.to_string());
+
+            // The spec says "lower 16 bits" but neglects to mention the endianness.
+            // Let's just use little-endian for now.
+            //
+            // See https://github.com/apache/arrow/issues/47846
+            let f_bytes = f.to_le_bytes();
+            let value = u32::from_le_bytes([f_bytes[0], f_bytes[1], 0, 0]);
+
+            let value = pbjson_types::UInt32Value { value };
+            let encoded_value = prost::Message::encode_to_vec(&value);
+            (
+                LiteralType::UserDefined(
+                    substrait::proto::expression::literal::UserDefined {
+                        type_anchor_type: Some(substrait::proto::expression::literal::user_defined::TypeAnchorType::TypeReference(type_anchor)),
+                        type_parameters: vec![],
+                        val: Some(substrait::proto::expression::literal::user_defined::Val::Value(
+                            pbjson_types::Any {
+                                type_url: "google.protobuf.UInt32Value".to_string(),
+                                value: encoded_value.into(),
+                            },
+                        )),
+                    },
+                ),
+                DEFAULT_TYPE_VARIATION_REF,
+            )
+        }
         ScalarValue::Float32(Some(f)) => {
             (LiteralType::Fp32(*f), DEFAULT_TYPE_VARIATION_REF)
         }
@@ -240,7 +277,7 @@ pub(crate) fn to_substrait_literal(
         ),
         ScalarValue::Map(m) => {
             let map = if m.is_empty() || m.value(0).is_empty() {
-                let mt = to_substrait_type(m.data_type(), m.is_nullable())?;
+                let mt = to_substrait_type(producer, m.data_type(), m.is_nullable())?;
                 let mt = match mt {
                     substrait::proto::Type {
                         kind: Some(r#type::Kind::Map(mt)),
@@ -280,6 +317,34 @@ pub(crate) fn to_substrait_literal(
             };
             (map, DEFAULT_CONTAINER_TYPE_VARIATION_REF)
         }
+        ScalarValue::Time32Second(Some(t)) => (
+            LiteralType::PrecisionTime(PrecisionTime {
+                precision: 0,
+                value: *t as i64,
+            }),
+            TIME_32_TYPE_VARIATION_REF,
+        ),
+        ScalarValue::Time32Millisecond(Some(t)) => (
+            LiteralType::PrecisionTime(PrecisionTime {
+                precision: 3,
+                value: *t as i64,
+            }),
+            TIME_32_TYPE_VARIATION_REF,
+        ),
+        ScalarValue::Time64Microsecond(Some(t)) => (
+            LiteralType::PrecisionTime(PrecisionTime {
+                precision: 6,
+                value: *t,
+            }),
+            TIME_64_TYPE_VARIATION_REF,
+        ),
+        ScalarValue::Time64Nanosecond(Some(t)) => (
+            LiteralType::PrecisionTime(PrecisionTime {
+                precision: 9,
+                value: *t,
+            }),
+            TIME_64_TYPE_VARIATION_REF,
+        ),
         ScalarValue::Struct(s) => (
             LiteralType::Struct(Struct {
                 fields: s
@@ -325,12 +390,13 @@ fn convert_array_to_literal_list<T: OffsetSizeTrait>(
         .collect::<datafusion::common::Result<Vec<_>>>()?;
 
     if values.is_empty() {
-        let lt = match to_substrait_type(array.data_type(), array.is_nullable())? {
-            substrait::proto::Type {
-                kind: Some(r#type::Kind::List(lt)),
-            } => lt.as_ref().to_owned(),
-            _ => unreachable!(),
-        };
+        let lt =
+            match to_substrait_type(producer, array.data_type(), array.is_nullable())? {
+                substrait::proto::Type {
+                    kind: Some(r#type::Kind::List(lt)),
+                } => lt.as_ref().to_owned(),
+                _ => unreachable!(),
+            };
         Ok(LiteralType::EmptyList(lt))
     } else {
         Ok(LiteralType::List(List { values }))
@@ -347,8 +413,8 @@ mod tests {
     use datafusion::arrow::datatypes::{
         DataType, Field, IntervalDayTime, IntervalMonthDayNano,
     };
-    use datafusion::common::scalar::ScalarStructBuilder;
     use datafusion::common::Result;
+    use datafusion::common::scalar::ScalarStructBuilder;
     use datafusion::prelude::SessionContext;
     use std::sync::Arc;
 
@@ -398,6 +464,18 @@ mod tests {
             round_trip_literal(ScalarValue::TimestampNanosecond(ts, tz))?;
         }
 
+        // Test Time32 literals
+        round_trip_literal(ScalarValue::Time32Second(Some(45296)))?;
+        round_trip_literal(ScalarValue::Time32Second(None))?;
+        round_trip_literal(ScalarValue::Time32Millisecond(Some(45296789)))?;
+        round_trip_literal(ScalarValue::Time32Millisecond(None))?;
+
+        // Test Time64 literals
+        round_trip_literal(ScalarValue::Time64Microsecond(Some(45296789123)))?;
+        round_trip_literal(ScalarValue::Time64Microsecond(None))?;
+        round_trip_literal(ScalarValue::Time64Nanosecond(Some(45296789123000)))?;
+        round_trip_literal(ScalarValue::Time64Nanosecond(None))?;
+
         round_trip_literal(ScalarValue::List(ScalarValue::new_list_nullable(
             &[ScalarValue::Float32(Some(1.0))],
             &DataType::Float32,
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/mod.rs b/datafusion/substrait/src/logical_plan/producer/expr/mod.rs
index fbc4d3754df0b..d130961596dc9 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/mod.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/mod.rs
@@ -37,13 +37,13 @@ pub use window_function::*;
 
 use crate::logical_plan::producer::utils::flatten_names;
 use crate::logical_plan::producer::{
-    to_substrait_named_struct, DefaultSubstraitProducer, SubstraitProducer,
+    DefaultSubstraitProducer, SubstraitProducer, to_substrait_named_struct,
 };
 use datafusion::arrow::datatypes::Field;
-use datafusion::common::{internal_err, not_impl_err, DFSchemaRef};
+use datafusion::common::{DFSchemaRef, internal_err, not_impl_err};
 use datafusion::execution::SessionState;
-use datafusion::logical_expr::expr::Alias;
 use datafusion::logical_expr::Expr;
+use datafusion::logical_expr::expr::Alias;
 use substrait::proto::expression_reference::ExprType;
 use substrait::proto::{Expression, ExpressionReference, ExtendedExpression};
 use substrait::version;
@@ -78,13 +78,13 @@ pub fn to_substrait_extended_expr(
             })
         })
         .collect::<datafusion::common::Result<Vec<_>>>()?;
-    let substrait_schema = to_substrait_named_struct(schema)?;
+    let substrait_schema = to_substrait_named_struct(&mut producer, schema)?;
 
     let extensions = producer.get_extensions();
     Ok(Box::new(ExtendedExpression {
         advanced_extensions: None,
         expected_type_urls: vec![],
-        extension_uris: vec![],
+        extension_urns: vec![],
         extensions: extensions.into(),
         version: Some(version::version_with_producer("datafusion")),
         referred_expr: substrait_exprs,
@@ -109,7 +109,7 @@ pub fn to_substrait_rex(
         Expr::ScalarVariable(_, _) => {
             not_impl_err!("Cannot convert {expr:?} to Substrait")
         }
-        Expr::Literal(expr) => producer.handle_literal(expr),
+        Expr::Literal(expr, _) => producer.handle_literal(expr),
         Expr::BinaryExpr(expr) => producer.handle_binary_expr(expr, schema),
         Expr::Like(expr) => producer.handle_like(expr, schema),
         Expr::SimilarTo(_) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
@@ -135,16 +135,17 @@ pub fn to_substrait_rex(
         }
         Expr::WindowFunction(expr) => producer.handle_window_function(expr, schema),
         Expr::InList(expr) => producer.handle_in_list(expr, schema),
-        Expr::Exists(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
+        Expr::Exists(expr) => producer.handle_exists(expr, schema),
         Expr::InSubquery(expr) => producer.handle_in_subquery(expr, schema),
-        Expr::ScalarSubquery(expr) => {
-            not_impl_err!("Cannot convert {expr:?} to Substrait")
-        }
+        Expr::SetComparison(expr) => producer.handle_set_comparison(expr, schema),
+        Expr::ScalarSubquery(expr) => producer.handle_scalar_subquery(expr, schema),
         #[expect(deprecated)]
         Expr::Wildcard { .. } => not_impl_err!("Cannot convert {expr:?} to Substrait"),
         Expr::GroupingSet(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
         Expr::Placeholder(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
         Expr::OuterReferenceColumn(_, _) => {
+            // OuterReferenceColumn requires tracking outer query schema context for correlated
+            // subqueries. This is a complex feature that is not yet implemented.
             not_impl_err!("Cannot convert {expr:?} to Substrait")
         }
         Expr::Unnest(expr) => not_impl_err!("Cannot convert {expr:?} to Substrait"),
@@ -172,7 +173,7 @@ mod tests {
         let state = SessionStateBuilder::default().build();
 
         // One expression, empty input schema
-        let expr = Expr::Literal(ScalarValue::Int32(Some(42)));
+        let expr = Expr::Literal(ScalarValue::Int32(Some(42)), None);
         let field = Field::new("out", DataType::Int32, false);
         let empty_schema = DFSchemaRef::new(DFSchema::empty());
         let substrait =
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs b/datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs
index 1172c43319c66..9f70e903a0bd9 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{to_substrait_literal_expr, SubstraitProducer};
-use datafusion::common::{not_impl_err, DFSchemaRef, ScalarValue};
-use datafusion::logical_expr::{expr, Between, BinaryExpr, Expr, Like, Operator};
+use crate::logical_plan::producer::{SubstraitProducer, to_substrait_literal_expr};
+use datafusion::common::{DFSchemaRef, ScalarValue, not_impl_err};
+use datafusion::logical_expr::{Between, BinaryExpr, Expr, Like, Operator, expr};
 use substrait::proto::expression::{RexType, ScalarFunction};
 use substrait::proto::function_argument::ArgType;
 use substrait::proto::{Expression, FunctionArgument};
@@ -34,8 +34,10 @@ pub fn from_scalar_function(
         });
     }
 
+    let arguments = custom_argument_handler(fun.name(), arguments);
+
     let function_anchor = producer.register_function(fun.name().to_string());
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     Ok(Expression {
         rex_type: Some(RexType::ScalarFunction(ScalarFunction {
             function_reference: function_anchor,
@@ -47,6 +49,25 @@ pub fn from_scalar_function(
     })
 }
 
+// Handle functions that require custom handling for their arguments (e.g. log)
+pub fn custom_argument_handler(
+    name: &str,
+    args: Vec<FunctionArgument>,
+) -> Vec<FunctionArgument> {
+    match name {
+        "log" => {
+            if args.len() == 2 {
+                let mut args = args;
+                args.swap(0, 1);
+                args
+            } else {
+                args
+            }
+        }
+        _ => args,
+    }
+}
+
 pub fn from_unary_expr(
     producer: &mut impl SubstraitProducer,
     expr: &Expr,
@@ -134,7 +155,7 @@ fn make_substrait_like_expr(
         },
     ];
 
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     let substrait_like = Expression {
         rex_type: Some(RexType::ScalarFunction(ScalarFunction {
             function_reference: function_anchor,
@@ -148,7 +169,7 @@ fn make_substrait_like_expr(
     if negated {
         let function_anchor = producer.register_function("not".to_string());
 
-        #[allow(deprecated)]
+        #[expect(deprecated)]
         Ok(Expression {
             rex_type: Some(RexType::ScalarFunction(ScalarFunction {
                 function_reference: function_anchor,
@@ -196,7 +217,7 @@ pub fn make_binary_op_scalar_func(
     op: Operator,
 ) -> Expression {
     let function_anchor = producer.register_function(operator_to_name(op).to_string());
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     Expression {
         rex_type: Some(RexType::ScalarFunction(ScalarFunction {
             function_reference: function_anchor,
@@ -323,5 +344,6 @@ pub fn operator_to_name(op: Operator) -> &'static str {
         Operator::BitwiseXor => "bitwise_xor",
         Operator::BitwiseShiftRight => "bitwise_shift_right",
         Operator::BitwiseShiftLeft => "bitwise_shift_left",
+        Operator::Colon => "colon",
     }
 }
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/singular_or_list.rs b/datafusion/substrait/src/logical_plan/producer/expr/singular_or_list.rs
index 1c0b6dcc154bc..fd09a60d5eadc 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/singular_or_list.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/singular_or_list.rs
@@ -15,12 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::SubstraitProducer;
+use crate::logical_plan::producer::{SubstraitProducer, negate};
 use datafusion::common::DFSchemaRef;
 use datafusion::logical_expr::expr::InList;
-use substrait::proto::expression::{RexType, ScalarFunction, SingularOrList};
-use substrait::proto::function_argument::ArgType;
-use substrait::proto::{Expression, FunctionArgument};
+use substrait::proto::Expression;
+use substrait::proto::expression::{RexType, SingularOrList};
 
 pub fn from_in_list(
     producer: &mut impl SubstraitProducer,
@@ -46,20 +45,7 @@ pub fn from_in_list(
     };
 
     if *negated {
-        let function_anchor = producer.register_function("not".to_string());
-
-        #[allow(deprecated)]
-        Ok(Expression {
-            rex_type: Some(RexType::ScalarFunction(ScalarFunction {
-                function_reference: function_anchor,
-                arguments: vec![FunctionArgument {
-                    arg_type: Some(ArgType::Value(substrait_or_list)),
-                }],
-                output_type: None,
-                args: vec![],
-                options: vec![],
-            })),
-        })
+        Ok(negate(producer, substrait_or_list))
     } else {
         Ok(substrait_or_list)
     }
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/subquery.rs b/datafusion/substrait/src/logical_plan/producer/expr/subquery.rs
index c1ee78c68c258..97699c2132781 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/subquery.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/subquery.rs
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::SubstraitProducer;
-use datafusion::common::DFSchemaRef;
-use datafusion::logical_expr::expr::InSubquery;
-use substrait::proto::expression::subquery::InPredicate;
-use substrait::proto::expression::{RexType, ScalarFunction};
-use substrait::proto::function_argument::ArgType;
-use substrait::proto::{Expression, FunctionArgument};
+use crate::logical_plan::producer::{SubstraitProducer, negate};
+use datafusion::common::{DFSchemaRef, substrait_err};
+use datafusion::logical_expr::expr::{Exists, InSubquery, SetComparison, SetQuantifier};
+use datafusion::logical_expr::{Operator, Subquery};
+use substrait::proto::Expression;
+use substrait::proto::expression::RexType;
+use substrait::proto::expression::subquery::set_comparison::{ComparisonOp, ReductionOp};
+use substrait::proto::expression::subquery::{InPredicate, Scalar, SetPredicate};
 
 pub fn from_in_subquery(
     producer: &mut impl SubstraitProducer,
@@ -52,21 +53,111 @@ pub fn from_in_subquery(
         ))),
     };
     if *negated {
-        let function_anchor = producer.register_function("not".to_string());
-
-        #[allow(deprecated)]
-        Ok(Expression {
-            rex_type: Some(RexType::ScalarFunction(ScalarFunction {
-                function_reference: function_anchor,
-                arguments: vec![FunctionArgument {
-                    arg_type: Some(ArgType::Value(substrait_subquery)),
-                }],
-                output_type: None,
-                args: vec![],
-                options: vec![],
-            })),
-        })
+        Ok(negate(producer, substrait_subquery))
     } else {
         Ok(substrait_subquery)
     }
 }
+
+fn comparison_op_to_proto(op: &Operator) -> datafusion::common::Result<ComparisonOp> {
+    match op {
+        Operator::Eq => Ok(ComparisonOp::Eq),
+        Operator::NotEq => Ok(ComparisonOp::Ne),
+        Operator::Lt => Ok(ComparisonOp::Lt),
+        Operator::Gt => Ok(ComparisonOp::Gt),
+        Operator::LtEq => Ok(ComparisonOp::Le),
+        Operator::GtEq => Ok(ComparisonOp::Ge),
+        _ => substrait_err!("Unsupported operator {op:?} for SetComparison subquery"),
+    }
+}
+
+fn reduction_op_to_proto(
+    quantifier: &SetQuantifier,
+) -> datafusion::common::Result<ReductionOp> {
+    match quantifier {
+        SetQuantifier::Any => Ok(ReductionOp::Any),
+        SetQuantifier::All => Ok(ReductionOp::All),
+    }
+}
+
+pub fn from_set_comparison(
+    producer: &mut impl SubstraitProducer,
+    set_comparison: &SetComparison,
+    schema: &DFSchemaRef,
+) -> datafusion::common::Result<Expression> {
+    let comparison_op = comparison_op_to_proto(&set_comparison.op)? as i32;
+    let reduction_op = reduction_op_to_proto(&set_comparison.quantifier)? as i32;
+    let left = producer.handle_expr(set_comparison.expr.as_ref(), schema)?;
+    let subquery_plan =
+        producer.handle_plan(set_comparison.subquery.subquery.as_ref())?;
+
+    Ok(Expression {
+        rex_type: Some(RexType::Subquery(Box::new(
+            substrait::proto::expression::Subquery {
+                subquery_type: Some(
+                    substrait::proto::expression::subquery::SubqueryType::SetComparison(
+                        Box::new(substrait::proto::expression::subquery::SetComparison {
+                            reduction_op,
+                            comparison_op,
+                            left: Some(Box::new(left)),
+                            right: Some(subquery_plan),
+                        }),
+                    ),
+                ),
+            },
+        ))),
+    })
+}
+
+/// Convert DataFusion ScalarSubquery to Substrait Scalar subquery type
+pub fn from_scalar_subquery(
+    producer: &mut impl SubstraitProducer,
+    subquery: &Subquery,
+    _schema: &DFSchemaRef,
+) -> datafusion::common::Result<Expression> {
+    let subquery_plan = producer.handle_plan(subquery.subquery.as_ref())?;
+
+    Ok(Expression {
+        rex_type: Some(RexType::Subquery(Box::new(
+            substrait::proto::expression::Subquery {
+                subquery_type: Some(
+                    substrait::proto::expression::subquery::SubqueryType::Scalar(
+                        Box::new(Scalar {
+                            input: Some(subquery_plan),
+                        }),
+                    ),
+                ),
+            },
+        ))),
+    })
+}
+
+/// Convert DataFusion Exists expression to Substrait SetPredicate subquery type
+pub fn from_exists(
+    producer: &mut impl SubstraitProducer,
+    exists: &Exists,
+    _schema: &DFSchemaRef,
+) -> datafusion::common::Result<Expression> {
+    let subquery_plan = producer.handle_plan(exists.subquery.subquery.as_ref())?;
+
+    let substrait_exists = Expression {
+        rex_type: Some(RexType::Subquery(Box::new(
+            substrait::proto::expression::Subquery {
+                subquery_type: Some(
+                    substrait::proto::expression::subquery::SubqueryType::SetPredicate(
+                        Box::new(SetPredicate {
+                            predicate_op: substrait::proto::expression::subquery::set_predicate::PredicateOp::Exists as i32,
+                            tuples: Some(subquery_plan),
+                        }),
+                    ),
+                ),
+            },
+        ))),
+    };
+
+    if exists.negated {
+        Ok(negate(producer, substrait_exists))
+    } else {
+        Ok(substrait_exists)
+    }
+}
diff --git a/datafusion/substrait/src/logical_plan/producer/expr/window_function.rs b/datafusion/substrait/src/logical_plan/producer/expr/window_function.rs
index 17e71f2d7c147..5d5f31cf116b0 100644
--- a/datafusion/substrait/src/logical_plan/producer/expr/window_function.rs
+++ b/datafusion/substrait/src/logical_plan/producer/expr/window_function.rs
@@ -15,16 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::utils::substrait_sort_field;
 use crate::logical_plan::producer::SubstraitProducer;
-use datafusion::common::{not_impl_err, DFSchemaRef, ScalarValue};
+use crate::logical_plan::producer::utils::substrait_sort_field;
+use datafusion::common::{DFSchemaRef, ScalarValue, not_impl_err};
 use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams};
 use datafusion::logical_expr::{WindowFrame, WindowFrameBound, WindowFrameUnits};
+use substrait::proto::expression::RexType;
+use substrait::proto::expression::WindowFunction as SubstraitWindowFunction;
 use substrait::proto::expression::window_function::bound as SubstraitBound;
 use substrait::proto::expression::window_function::bound::Kind as BoundKind;
 use substrait::proto::expression::window_function::{Bound, BoundsType};
-use substrait::proto::expression::RexType;
-use substrait::proto::expression::WindowFunction as SubstraitWindowFunction;
 use substrait::proto::function_argument::ArgType;
 use substrait::proto::{Expression, FunctionArgument, SortField};
 
@@ -42,6 +42,8 @@ pub fn from_window_function(
                 order_by,
                 window_frame,
                 null_treatment: _,
+                distinct: _,
+                filter: _,
             },
     } = window_fn;
     // function reference
@@ -84,7 +86,7 @@ fn make_substrait_window_function(
     bounds: (Bound, Bound),
     bounds_type: BoundsType,
 ) -> Expression {
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     Expression {
         rex_type: Some(RexType::WindowFunction(SubstraitWindowFunction {
             function_reference,
diff --git a/datafusion/substrait/src/logical_plan/producer/plan.rs b/datafusion/substrait/src/logical_plan/producer/plan.rs
index 7d5b7754122d6..3b58720dba832 100644
--- a/datafusion/substrait/src/logical_plan/producer/plan.rs
+++ b/datafusion/substrait/src/logical_plan/producer/plan.rs
@@ -16,11 +16,11 @@
 // under the License.
 
 use crate::logical_plan::producer::{
-    to_substrait_named_struct, DefaultSubstraitProducer, SubstraitProducer,
+    DefaultSubstraitProducer, SubstraitProducer, to_substrait_named_struct,
 };
 use datafusion::execution::SessionState;
 use datafusion::logical_expr::{LogicalPlan, SubqueryAlias};
-use substrait::proto::{plan_rel, Plan, PlanRel, Rel, RelRoot};
+use substrait::proto::{Plan, PlanRel, Rel, RelRoot, plan_rel};
 use substrait::version;
 
 /// Convert DataFusion LogicalPlan to Substrait Plan
@@ -36,7 +36,7 @@ pub fn to_substrait_plan(
     let plan_rels = vec![PlanRel {
         rel_type: Some(plan_rel::RelType::Root(RelRoot {
             input: Some(*producer.handle_plan(plan)?),
-            names: to_substrait_named_struct(plan.schema())?.names,
+            names: to_substrait_named_struct(&mut producer, plan.schema())?.names,
         })),
     }];
 
@@ -44,12 +44,13 @@ pub fn to_substrait_plan(
     let extensions = producer.get_extensions();
     Ok(Box::new(Plan {
         version: Some(version::version_with_producer("datafusion")),
-        extension_uris: vec![],
+        extension_urns: vec![],
         extensions: extensions.into(),
         relations: plan_rels,
         advanced_extensions: None,
         expected_type_urls: vec![],
         parameter_bindings: vec![],
+        type_aliases: vec![],
     }))
 }
 
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
index 4abd283a7ee0b..dec94b0422257 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs
@@ -16,10 +16,11 @@
 // under the License.
 
 use crate::logical_plan::producer::{
-    from_aggregate_function, substrait_field_ref, SubstraitProducer,
+    SubstraitProducer, from_aggregate_function, substrait_field_ref,
 };
-use datafusion::common::{internal_err, not_impl_err, DFSchemaRef, DataFusionError};
+use datafusion::common::{DFSchemaRef, internal_err, not_impl_err};
 use datafusion::logical_expr::expr::Alias;
+use datafusion::logical_expr::utils::powerset;
 use datafusion::logical_expr::{Aggregate, Distinct, Expr, GroupingSet};
 use substrait::proto::aggregate_rel::{Grouping, Measure};
 use substrait::proto::rel::RelType;
@@ -63,7 +64,7 @@ pub fn from_distinct(
                 .map(substrait_field_ref)
                 .collect::<datafusion::common::Result<Vec<_>>>()?;
 
-            #[allow(deprecated)]
+            #[expect(deprecated)]
             Ok(Box::new(Rel {
                 rel_type: Some(RelType::Aggregate(Box::new(AggregateRel {
                     common: None,
@@ -91,10 +92,22 @@ pub fn to_substrait_groupings(
     let groupings = match exprs.len() {
         1 => match &exprs[0] {
             Expr::GroupingSet(gs) => match gs {
-                GroupingSet::Cube(_) => Err(DataFusionError::NotImplemented(
-                    "GroupingSet CUBE is not yet supported".to_string(),
-                )),
-                GroupingSet::GroupingSets(sets) => Ok(sets
+                GroupingSet::Cube(set) => {
+                    // Generate power set of grouping expressions
+                    let cube_sets = powerset(set)?;
+                    cube_sets
+                        .iter()
+                        .map(|set| {
+                            parse_flat_grouping_exprs(
+                                producer,
+                                &set.iter().map(|v| (*v).clone()).collect::<Vec<_>>(),
+                                schema,
+                                &mut ref_group_exprs,
+                            )
+                        })
+                        .collect::<datafusion::common::Result<Vec<_>>>()
+                }
+                GroupingSet::GroupingSets(sets) => sets
                     .iter()
                     .map(|set| {
                         parse_flat_grouping_exprs(
@@ -104,14 +117,13 @@ pub fn to_substrait_groupings(
                             &mut ref_group_exprs,
                         )
                     })
-                    .collect::<datafusion::common::Result<Vec<_>>>()?),
+                    .collect::<datafusion::common::Result<Vec<_>>>(),
                 GroupingSet::Rollup(set) => {
                     let mut sets: Vec<Vec<Expr>> = vec![vec![]];
                     for i in 0..set.len() {
                         sets.push(set[..=i].to_vec());
                     }
-                    Ok(sets
-                        .iter()
+                    sets.iter()
                         .rev()
                         .map(|set| {
                             parse_flat_grouping_exprs(
@@ -121,7 +133,7 @@ pub fn to_substrait_groupings(
                                 &mut ref_group_exprs,
                             )
                         })
-                        .collect::<datafusion::common::Result<Vec<_>>>()?)
+                        .collect::<datafusion::common::Result<Vec<_>>>()
                 }
             },
             _ => Ok(vec![parse_flat_grouping_exprs(
@@ -156,7 +168,7 @@ pub fn parse_flat_grouping_exprs(
         ref_group_exprs.push(rex);
         expression_references.push((ref_group_exprs.len() - 1) as u32);
     }
-    #[allow(deprecated)]
+    #[expect(deprecated)]
     Ok(Grouping {
         grouping_expressions,
         expression_references,
@@ -169,12 +181,14 @@ pub fn to_substrait_agg_measure(
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<Measure> {
     match expr {
-        Expr::AggregateFunction(agg_fn) => from_aggregate_function(producer, agg_fn, schema),
+        Expr::AggregateFunction(agg_fn) => {
+            from_aggregate_function(producer, agg_fn, schema)
+        }
         Expr::Alias(Alias { expr, .. }) => {
             to_substrait_agg_measure(producer, expr, schema)
         }
         _ => internal_err!(
-            "Expression must be compatible with aggregation. Unsupported expression: {:?}. ExpressionType: {:?}",
+            "Expression must be compatible with aggregation. Unsupported expression: {:?}. Expressiontype: {}",
             expr,
             expr.variant_name()
         ),
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/exchange_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/exchange_rel.rs
index 9e0ef8905f432..50c4b3da86cbe 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/exchange_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/exchange_rel.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::logical_plan::producer::{
-    try_to_substrait_field_reference, SubstraitProducer,
+    SubstraitProducer, try_to_substrait_field_reference,
 };
 use datafusion::common::not_impl_err;
 use datafusion::logical_expr::{Partitioning, Repartition};
@@ -35,7 +35,7 @@ pub fn from_repartition(
         Partitioning::DistributeBy(_) => {
             return not_impl_err!(
                 "Physical plan does not support DistributeBy partitioning"
-            )
+            );
         }
     };
     // ref: https://substrait.io/relations/physical_relations/#exchange-types
@@ -53,7 +53,7 @@ pub fn from_repartition(
         Partitioning::DistributeBy(_) => {
             return not_impl_err!(
                 "Physical plan does not support DistributeBy partitioning"
-            )
+            );
         }
     };
     let exchange_rel = ExchangeRel {
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/fetch_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/fetch_rel.rs
index 4706401d558ec..e878b3816ff42 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/fetch_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/fetch_rel.rs
@@ -20,7 +20,7 @@ use datafusion::common::DFSchema;
 use datafusion::logical_expr::Limit;
 use std::sync::Arc;
 use substrait::proto::rel::RelType;
-use substrait::proto::{fetch_rel, FetchRel, Rel};
+use substrait::proto::{FetchRel, Rel, fetch_rel};
 
 pub fn from_limit(
     producer: &mut impl SubstraitProducer,
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/join.rs b/datafusion/substrait/src/logical_plan/producer/rel/join.rs
index 79564ad5daf1e..cbf5593ffc86c 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/join.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/join.rs
@@ -15,12 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{make_binary_op_scalar_func, SubstraitProducer};
-use datafusion::common::{not_impl_err, DFSchemaRef, JoinConstraint, JoinType};
+use crate::logical_plan::producer::{SubstraitProducer, make_binary_op_scalar_func};
+use datafusion::common::{
+    DFSchemaRef, JoinConstraint, JoinType, NullEquality, not_impl_err,
+};
 use datafusion::logical_expr::{Expr, Join, Operator};
 use std::sync::Arc;
 use substrait::proto::rel::RelType;
-use substrait::proto::{join_rel, Expression, JoinRel, Rel};
+use substrait::proto::{Expression, JoinRel, Rel, join_rel};
 
 pub fn from_join(
     producer: &mut impl SubstraitProducer,
@@ -44,10 +46,9 @@ pub fn from_join(
 
     // map the left and right columns to binary expressions in the form `l = r`
     // build a single expression for the ON condition, such as `l.a = r.a AND l.b = r.b`
-    let eq_op = if join.null_equals_null {
-        Operator::IsNotDistinctFrom
-    } else {
-        Operator::Eq
+    let eq_op = match join.null_equality {
+        NullEquality::NullEqualsNothing => Operator::Eq,
+        NullEquality::NullEqualsNull => Operator::IsNotDistinctFrom,
     };
     let join_on = to_substrait_join_expr(producer, &join.on, eq_op, &in_join_schema)?;
 
@@ -113,8 +114,8 @@ fn to_substrait_jointype(join_type: JoinType) -> join_rel::JoinType {
         JoinType::LeftAnti => join_rel::JoinType::LeftAnti,
         JoinType::LeftSemi => join_rel::JoinType::LeftSemi,
         JoinType::LeftMark => join_rel::JoinType::LeftMark,
-        JoinType::RightAnti | JoinType::RightSemi => {
-            unimplemented!()
-        }
+        JoinType::RightMark => join_rel::JoinType::RightMark,
+        JoinType::RightAnti => join_rel::JoinType::RightAnti,
+        JoinType::RightSemi => join_rel::JoinType::RightSemi,
     }
 }
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/project_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/project_rel.rs
index 0190dca12bf53..33920cdf86f7a 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/project_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/project_rel.rs
@@ -15,12 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{substrait_field_ref, SubstraitProducer};
+use crate::logical_plan::producer::{SubstraitProducer, substrait_field_ref};
 use datafusion::logical_expr::{Projection, Window};
 use substrait::proto::rel::RelType;
 use substrait::proto::rel_common::EmitKind;
 use substrait::proto::rel_common::EmitKind::Emit;
-use substrait::proto::{rel_common, ProjectRel, Rel, RelCommon};
+use substrait::proto::{ProjectRel, Rel, RelCommon, rel_common};
 
 pub fn from_projection(
     producer: &mut impl SubstraitProducer,
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs
index e4e0ab11c65ac..8dfbb36d3767d 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/read_rel.rs
@@ -16,19 +16,83 @@
 // under the License.
 
 use crate::logical_plan::producer::{
-    to_substrait_literal, to_substrait_named_struct, SubstraitProducer,
+    SubstraitProducer, to_substrait_literal, to_substrait_named_struct,
 };
-use datafusion::common::{not_impl_err, substrait_datafusion_err, DFSchema, ToDFSchema};
+use datafusion::common::{DFSchema, ToDFSchema, substrait_datafusion_err};
 use datafusion::logical_expr::utils::conjunction;
 use datafusion::logical_expr::{EmptyRelation, Expr, TableScan, Values};
+use datafusion::scalar::ScalarValue;
 use std::sync::Arc;
-use substrait::proto::expression::literal::Struct;
-use substrait::proto::expression::mask_expression::{StructItem, StructSelect};
 use substrait::proto::expression::MaskExpression;
+use substrait::proto::expression::literal::Struct as LiteralStruct;
+use substrait::proto::expression::mask_expression::{StructItem, StructSelect};
+use substrait::proto::expression::nested::Struct as NestedStruct;
 use substrait::proto::read_rel::{NamedTable, ReadType, VirtualTable};
 use substrait::proto::rel::RelType;
 use substrait::proto::{ReadRel, Rel};
 
+/// Converts rows of literal expressions into Substrait literal structs.
+///
+/// Each row is expected to contain only `Expr::Literal` or `Expr::Alias` wrapping literals.
+/// Aliases are unwrapped and the underlying literal is converted.
+fn convert_literal_rows(
+    producer: &mut impl SubstraitProducer,
+    rows: &[Vec<Expr>],
+) -> datafusion::common::Result<Vec<LiteralStruct>> {
+    rows.iter()
+        .map(|row| {
+            let fields = row
+                .iter()
+                .map(|expr| match expr {
+                    Expr::Literal(sv, _) => to_substrait_literal(producer, sv),
+                    Expr::Alias(alias) => match alias.expr.as_ref() {
+                        // The schema gives us the names, so we can skip aliases
+                        Expr::Literal(sv, _) => to_substrait_literal(producer, sv),
+                        _ => Err(substrait_datafusion_err!(
+                            "Only literal types can be aliased in Virtual Tables, got: {}",
+                            alias.expr.variant_name()
+                        )),
+                    },
+                    _ => Err(substrait_datafusion_err!(
+                        "Only literal types and aliases are supported in Virtual Tables, got: {}",
+                        expr.variant_name()
+                    )),
+                })
+                .collect::<datafusion::common::Result<_>>()?;
+            Ok(LiteralStruct { fields })
+        })
+        .collect()
+}
+
+/// Converts rows of arbitrary expressions into Substrait nested structs.
+///
+/// Validates that each row has the expected schema length and converts each expression
+/// using the producer's expression handler.
+fn convert_expression_rows(
+    producer: &mut impl SubstraitProducer,
+    rows: &[Vec<Expr>],
+    schema_len: usize,
+    empty_schema: &Arc<DFSchema>,
+) -> datafusion::common::Result<Vec<NestedStruct>> {
+    rows.iter()
+        .map(|row| {
+            if row.len() != schema_len {
+                return Err(substrait_datafusion_err!(
+                    "Names list must match exactly to nested schema, but found {} uses for {} names",
+                    row.len(),
+                    schema_len
+                ));
+            }
+
+            let fields = row
+                .iter()
+                .map(|expr| producer.handle_expr(expr, empty_schema))
+                .collect::<datafusion::common::Result<_>>()?;
+            Ok(NestedStruct { fields })
+        })
+        .collect()
+}
+
 pub fn from_table_scan(
     producer: &mut impl SubstraitProducer,
     scan: &TableScan,
@@ -48,7 +112,7 @@ pub fn from_table_scan(
     });
 
     let table_schema = scan.source.schema().to_dfschema_ref()?;
-    let base_schema = to_substrait_named_struct(&table_schema)?;
+    let base_schema = to_substrait_named_struct(producer, &table_schema)?;
 
     let filter_option = if scan.filters.is_empty() {
         None
@@ -83,23 +147,61 @@ pub fn from_table_scan(
     }))
 }
 
-pub fn from_empty_relation(e: &EmptyRelation) -> datafusion::common::Result<Box<Rel>> {
-    if e.produce_one_row {
-        return not_impl_err!("Producing a row from empty relation is unsupported");
-    }
-    #[allow(deprecated)]
+/// Encodes an EmptyRelation as a Substrait VirtualTable.
+///
+/// EmptyRelation represents a relation with no input data. When `produce_one_row` is true,
+/// it generates a single row with all fields set to their default values (typically NULL).
+/// This is used for queries without a FROM clause, such as "SELECT 1 AS one" or
+/// "SELECT current_timestamp()".
+///
+/// When `produce_one_row` is false, it represents a truly empty relation with no rows,
+/// used in optimizations or as a placeholder.
+pub fn from_empty_relation(
+    producer: &mut impl SubstraitProducer,
+    e: &EmptyRelation,
+) -> datafusion::common::Result<Box<Rel>> {
+    let base_schema = to_substrait_named_struct(producer, &e.schema)?;
+
+    let read_type = if e.produce_one_row {
+        // Create one row with default scalar values for each field in the schema.
+        // For example, an Int32 field gets Int32(NULL), a Utf8 field gets Utf8(NULL), etc.
+        // This represents the "phantom row" that provides a context for evaluating
+        // scalar expressions in queries without a FROM clause.
+        let fields = e
+            .schema
+            .fields()
+            .iter()
+            .map(|f| {
+                let scalar = ScalarValue::try_from(f.data_type())?;
+                to_substrait_literal(producer, &scalar)
+            })
+            .collect::<datafusion::common::Result<_>>()?;
+
+        ReadType::VirtualTable(VirtualTable {
+            // Use deprecated 'values' field instead of 'expressions' because the consumer's
+            // nested expression support (RexType::Nested) is not yet implemented.
+            // The 'values' field uses literal::Struct which the consumer can properly
+            // deserialize with field name preservation.
+            #[expect(deprecated)]
+            values: vec![LiteralStruct { fields }],
+            expressions: vec![],
+        })
+    } else {
+        ReadType::VirtualTable(VirtualTable {
+            #[expect(deprecated)]
+            values: vec![],
+            expressions: vec![],
+        })
+    };
     Ok(Box::new(Rel {
         rel_type: Some(RelType::Read(Box::new(ReadRel {
             common: None,
-            base_schema: Some(to_substrait_named_struct(&e.schema)?),
+            base_schema: Some(base_schema),
             filter: None,
             best_effort_filter: None,
             projection: None,
             advanced_extension: None,
-            read_type: Some(ReadType::VirtualTable(VirtualTable {
-                values: vec![],
-                expressions: vec![],
-            })),
+            read_type: Some(read_type),
         }))),
     }))
 }
@@ -108,41 +210,37 @@ pub fn from_values(
     producer: &mut impl SubstraitProducer,
     v: &Values,
 ) -> datafusion::common::Result<Box<Rel>> {
-    let values = v
-        .values
-        .iter()
-        .map(|row| {
-            let fields = row
-                .iter()
-                .map(|v| match v {
-                    Expr::Literal(sv) => to_substrait_literal(producer, sv),
-                    Expr::Alias(alias) => match alias.expr.as_ref() {
-                        // The schema gives us the names, so we can skip aliases
-                        Expr::Literal(sv) => to_substrait_literal(producer, sv),
-                        _ => Err(substrait_datafusion_err!(
-                                    "Only literal types can be aliased in Virtual Tables, got: {}", alias.expr.variant_name()
-                                )),
-                    },
-                    _ => Err(substrait_datafusion_err!(
-                                "Only literal types and aliases are supported in Virtual Tables, got: {}", v.variant_name()
-                            )),
-                })
-                .collect::<datafusion::common::Result<_>>()?;
-            Ok(Struct { fields })
+    let schema_len = v.schema.fields().len();
+    let empty_schema = Arc::new(DFSchema::empty());
+
+    let use_literals = v.values.iter().all(|row| {
+        row.iter().all(|expr| match expr {
+            Expr::Literal(_, _) => true,
+            Expr::Alias(alias) => matches!(alias.expr.as_ref(), Expr::Literal(_, _)),
+            _ => false,
         })
-        .collect::<datafusion::common::Result<_>>()?;
-    #[allow(deprecated)]
+    });
+
+    let (values, expressions) = if use_literals {
+        let values = convert_literal_rows(producer, &v.values)?;
+        (values, vec![])
+    } else {
+        let expressions =
+            convert_expression_rows(producer, &v.values, schema_len, &empty_schema)?;
+        (vec![], expressions)
+    };
     Ok(Box::new(Rel {
         rel_type: Some(RelType::Read(Box::new(ReadRel {
             common: None,
-            base_schema: Some(to_substrait_named_struct(&v.schema)?),
+            base_schema: Some(to_substrait_named_struct(producer, &v.schema)?),
             filter: None,
             best_effort_filter: None,
             projection: None,
             advanced_extension: None,
+            #[expect(deprecated)]
             read_type: Some(ReadType::VirtualTable(VirtualTable {
                 values,
-                expressions: vec![],
+                expressions,
             })),
         }))),
     }))
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/set_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/set_rel.rs
index 58ddfca3617ae..41482c11854bb 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/set_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/set_rel.rs
@@ -18,7 +18,7 @@
 use crate::logical_plan::producer::SubstraitProducer;
 use datafusion::logical_expr::Union;
 use substrait::proto::rel::RelType;
-use substrait::proto::{set_rel, Rel, SetRel};
+use substrait::proto::{Rel, SetRel, set_rel};
 
 pub fn from_union(
     producer: &mut impl SubstraitProducer,
diff --git a/datafusion/substrait/src/logical_plan/producer/rel/sort_rel.rs b/datafusion/substrait/src/logical_plan/producer/rel/sort_rel.rs
index aaa8be1635600..d4520a4c37b14 100644
--- a/datafusion/substrait/src/logical_plan/producer/rel/sort_rel.rs
+++ b/datafusion/substrait/src/logical_plan/producer/rel/sort_rel.rs
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{substrait_sort_field, SubstraitProducer};
+use crate::logical_plan::producer::{SubstraitProducer, substrait_sort_field};
 use crate::variation_const::DEFAULT_TYPE_VARIATION_REF;
 use datafusion::logical_expr::Sort;
 use substrait::proto::expression::literal::LiteralType;
 use substrait::proto::expression::{Literal, RexType};
 use substrait::proto::rel::RelType;
-use substrait::proto::{fetch_rel, Expression, FetchRel, Rel, SortRel};
+use substrait::proto::{Expression, FetchRel, Rel, SortRel, fetch_rel};
 
 pub fn from_sort(
     producer: &mut impl SubstraitProducer,
diff --git a/datafusion/substrait/src/logical_plan/producer/substrait_producer.rs b/datafusion/substrait/src/logical_plan/producer/substrait_producer.rs
index 56edfac5769cf..51d2c0ca8e783 100644
--- a/datafusion/substrait/src/logical_plan/producer/substrait_producer.rs
+++ b/datafusion/substrait/src/logical_plan/producer/substrait_producer.rs
@@ -18,20 +18,24 @@
 use crate::extensions::Extensions;
 use crate::logical_plan::producer::{
     from_aggregate, from_aggregate_function, from_alias, from_between, from_binary_expr,
-    from_case, from_cast, from_column, from_distinct, from_empty_relation, from_filter,
-    from_in_list, from_in_subquery, from_join, from_like, from_limit, from_literal,
-    from_projection, from_repartition, from_scalar_function, from_sort,
-    from_subquery_alias, from_table_scan, from_try_cast, from_unary_expr, from_union,
-    from_values, from_window, from_window_function, to_substrait_rel, to_substrait_rex,
+    from_case, from_cast, from_column, from_distinct, from_empty_relation, from_exists,
+    from_filter, from_in_list, from_in_subquery, from_join, from_like, from_limit,
+    from_literal, from_projection, from_repartition, from_scalar_function,
+    from_scalar_subquery, from_set_comparison, from_sort, from_subquery_alias,
+    from_table_scan, from_try_cast, from_unary_expr, from_union, from_values,
+    from_window, from_window_function, to_substrait_rel, to_substrait_rex,
 };
-use datafusion::common::{substrait_err, Column, DFSchemaRef, ScalarValue};
-use datafusion::execution::registry::SerializerRegistry;
+use datafusion::common::{Column, DFSchemaRef, ScalarValue, substrait_err};
 use datafusion::execution::SessionState;
-use datafusion::logical_expr::expr::{Alias, InList, InSubquery, WindowFunction};
+use datafusion::execution::registry::SerializerRegistry;
+use datafusion::logical_expr::Subquery;
+use datafusion::logical_expr::expr::{
+    Alias, Exists, InList, InSubquery, SetComparison, WindowFunction,
+};
 use datafusion::logical_expr::{
-    expr, Aggregate, Between, BinaryExpr, Case, Cast, Distinct, EmptyRelation, Expr,
-    Extension, Filter, Join, Like, Limit, LogicalPlan, Projection, Repartition, Sort,
-    SubqueryAlias, TableScan, TryCast, Union, Values, Window,
+    Aggregate, Between, BinaryExpr, Case, Cast, Distinct, EmptyRelation, Expr, Extension,
+    Filter, Join, Like, Limit, LogicalPlan, Projection, Repartition, Sort, SubqueryAlias,
+    TableScan, TryCast, Union, Values, Window, expr,
 };
 use pbjson_types::Any as ProtoAny;
 use substrait::proto::aggregate_rel::Measure;
@@ -67,7 +71,11 @@ use substrait::proto::{
 /// impl SubstraitProducer for CustomSubstraitProducer {
 ///
 ///     fn register_function(&mut self, signature: String) -> u32 {
-///        self.extensions.register_function(signature)
+///        self.extensions.register_function(&signature)
+///     }
+///
+///     fn register_type(&mut self, type_name: String) -> u32 {
+///         self.extensions.register_type(&type_name)
 ///     }
 ///
 ///     fn get_extensions(self) -> Extensions {
@@ -114,6 +122,15 @@ pub trait SubstraitProducer: Send + Sync + Sized {
     /// there is one. Otherwise, it should generate a new anchor.
     fn register_function(&mut self, signature: String) -> u32;
 
+    /// Within a Substrait plan, user defined types are referenced using type anchors that are stored at
+    /// the top level of the [Plan](substrait::proto::Plan) within
+    /// [ExtensionType](substrait::proto::extensions::simple_extension_declaration::ExtensionType)
+    /// messages.
+    ///
+    /// When given a type name, this method should return the existing anchor for it if
+    /// there is one. Otherwise, it should generate a new anchor.
+    fn register_type(&mut self, name: String) -> u32;
+
     /// Consume the producer to generate the [Extensions] for the Substrait plan based on the
     /// functions that have been registered
     fn get_extensions(self) -> Extensions;
@@ -182,7 +199,7 @@ pub trait SubstraitProducer: Send + Sync + Sized {
         &mut self,
         plan: &EmptyRelation,
     ) -> datafusion::common::Result<Box<Rel>> {
-        from_empty_relation(plan)
+        from_empty_relation(self, plan)
     }
 
     fn handle_subquery_alias(
@@ -211,7 +228,9 @@ pub trait SubstraitProducer: Send + Sync + Sized {
         &mut self,
         _plan: &Extension,
     ) -> datafusion::common::Result<Box<Rel>> {
-        substrait_err!("Specify handling for LogicalPlan::Extension by implementing the SubstraitProducer trait")
+        substrait_err!(
+            "Specify handling for LogicalPlan::Extension by implementing the SubstraitProducer trait"
+        )
     }
 
     // Expression Methods
@@ -346,6 +365,29 @@ pub trait SubstraitProducer: Send + Sync + Sized {
     ) -> datafusion::common::Result<Expression> {
         from_in_subquery(self, in_subquery, schema)
     }
+
+    fn handle_set_comparison(
+        &mut self,
+        set_comparison: &SetComparison,
+        schema: &DFSchemaRef,
+    ) -> datafusion::common::Result<Expression> {
+        from_set_comparison(self, set_comparison, schema)
+    }
+    fn handle_scalar_subquery(
+        &mut self,
+        subquery: &Subquery,
+        schema: &DFSchemaRef,
+    ) -> datafusion::common::Result<Expression> {
+        from_scalar_subquery(self, subquery, schema)
+    }
+
+    fn handle_exists(
+        &mut self,
+        exists: &Exists,
+        schema: &DFSchemaRef,
+    ) -> datafusion::common::Result<Expression> {
+        from_exists(self, exists, schema)
+    }
 }
 
 pub struct DefaultSubstraitProducer<'a> {
@@ -364,7 +406,11 @@ impl<'a> DefaultSubstraitProducer<'a> {
 
 impl SubstraitProducer for DefaultSubstraitProducer<'_> {
     fn register_function(&mut self, fn_name: String) -> u32 {
-        self.extensions.register_function(fn_name)
+        self.extensions.register_function(&fn_name)
+    }
+
+    fn register_type(&mut self, type_name: String) -> u32 {
+        self.extensions.register_type(&type_name)
     }
 
     fn get_extensions(self) -> Extensions {
diff --git a/datafusion/substrait/src/logical_plan/producer/types.rs b/datafusion/substrait/src/logical_plan/producer/types.rs
index 61b7a79095d57..fa58949e6ecd2 100644
--- a/datafusion/substrait/src/logical_plan/producer/types.rs
+++ b/datafusion/substrait/src/logical_plan/producer/types.rs
@@ -16,28 +16,50 @@
 // under the License.
 
 use crate::logical_plan::producer::utils::flatten_names;
+use crate::logical_plan::producer::{SubstraitProducer, to_substrait_precision};
 use crate::variation_const::{
     DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF,
     DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
-    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
-    LARGE_CONTAINER_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
-    VIEW_CONTAINER_TYPE_VARIATION_REF,
+    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_INTERVAL_DAY_TYPE_VARIATION_REF,
+    DEFAULT_MAP_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
+    DICTIONARY_MAP_TYPE_VARIATION_REF, DURATION_INTERVAL_DAY_TYPE_VARIATION_REF,
+    FLOAT_16_TYPE_NAME, LARGE_CONTAINER_TYPE_VARIATION_REF, NULL_TYPE_NAME,
+    TIME_32_TYPE_VARIATION_REF, TIME_64_TYPE_VARIATION_REF,
+    UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF,
 };
-use datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit};
-use datafusion::common::{internal_err, not_impl_err, plan_err, DFSchemaRef};
-use substrait::proto::{r#type, NamedStruct};
+use datafusion::arrow::datatypes::{DataType, Field, FieldRef, IntervalUnit};
+use datafusion::common::{DFSchemaRef, not_impl_err, plan_err};
+use substrait::proto::{NamedStruct, r#type};
 
 pub(crate) fn to_substrait_type(
+    producer: &mut impl SubstraitProducer,
     dt: &DataType,
     nullable: bool,
 ) -> datafusion::common::Result<substrait::proto::Type> {
-    let nullability = if nullable {
+    to_substrait_type_from_field(producer, &Field::new("", dt.clone(), nullable).into())
+}
+
+pub(crate) fn to_substrait_type_from_field(
+    producer: &mut impl SubstraitProducer,
+    field: &FieldRef,
+) -> datafusion::common::Result<substrait::proto::Type> {
+    let nullability = if field.is_nullable() {
         r#type::Nullability::Nullable as i32
     } else {
         r#type::Nullability::Required as i32
     };
-    match dt {
-        DataType::Null => internal_err!("Null cast is not valid"),
+    match field.data_type() {
+        DataType::Null => {
+            let type_anchor = producer.register_type(NULL_TYPE_NAME.to_string());
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::UserDefined(r#type::UserDefined {
+                    type_reference: type_anchor,
+                    type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
+                    nullability,
+                    type_parameters: vec![],
+                })),
+            })
+        }
         DataType::Boolean => Ok(substrait::proto::Type {
             kind: Some(r#type::Kind::Bool(r#type::Boolean {
                 type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
@@ -92,7 +114,17 @@ pub(crate) fn to_substrait_type(
                 nullability,
             })),
         }),
-        // Float16 is not supported in Substrait
+        DataType::Float16 => {
+            let type_anchor = producer.register_type(FLOAT_16_TYPE_NAME.to_string());
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::UserDefined(r#type::UserDefined {
+                    type_reference: type_anchor,
+                    type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
+                    nullability,
+                    type_parameters: vec![],
+                })),
+            })
+        }
         DataType::Float32 => Ok(substrait::proto::Type {
             kind: Some(r#type::Kind::Fp32(r#type::Fp32 {
                 type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
@@ -106,12 +138,7 @@ pub(crate) fn to_substrait_type(
             })),
         }),
         DataType::Timestamp(unit, tz) => {
-            let precision = match unit {
-                TimeUnit::Second => 0,
-                TimeUnit::Millisecond => 3,
-                TimeUnit::Microsecond => 6,
-                TimeUnit::Nanosecond => 9,
-            };
+            let precision = to_substrait_precision(unit);
             let kind = match tz {
                 None => r#type::Kind::PrecisionTimestamp(r#type::PrecisionTimestamp {
                     type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
@@ -131,6 +158,26 @@ pub(crate) fn to_substrait_type(
             };
             Ok(substrait::proto::Type { kind: Some(kind) })
         }
+        DataType::Time32(unit) => {
+            let precision = to_substrait_precision(unit);
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::PrecisionTime(r#type::PrecisionTime {
+                    precision,
+                    type_variation_reference: TIME_32_TYPE_VARIATION_REF,
+                    nullability,
+                })),
+            })
+        }
+        DataType::Time64(unit) => {
+            let precision = to_substrait_precision(unit);
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::PrecisionTime(r#type::PrecisionTime {
+                    precision,
+                    type_variation_reference: TIME_64_TYPE_VARIATION_REF,
+                    nullability,
+                })),
+            })
+        }
         DataType::Date32 => Ok(substrait::proto::Type {
             kind: Some(r#type::Kind::Date(r#type::Date {
                 type_variation_reference: DATE_32_TYPE_VARIATION_REF,
@@ -153,7 +200,7 @@ pub(crate) fn to_substrait_type(
                 }),
                 IntervalUnit::DayTime => Ok(substrait::proto::Type {
                     kind: Some(r#type::Kind::IntervalDay(r#type::IntervalDay {
-                        type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
+                        type_variation_reference: DEFAULT_INTERVAL_DAY_TYPE_VARIATION_REF,
                         nullability,
                         precision: Some(3), // DayTime precision is always milliseconds
                     })),
@@ -171,6 +218,16 @@ pub(crate) fn to_substrait_type(
                 }
             }
         }
+        DataType::Duration(duration_unit) => {
+            let precision = to_substrait_precision(duration_unit);
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::IntervalDay(r#type::IntervalDay {
+                    type_variation_reference: DURATION_INTERVAL_DAY_TYPE_VARIATION_REF,
+                    nullability,
+                    precision: Some(precision),
+                })),
+            })
+        }
         DataType::Binary => Ok(substrait::proto::Type {
             kind: Some(r#type::Kind::Binary(r#type::Binary {
                 type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF,
@@ -215,7 +272,8 @@ pub(crate) fn to_substrait_type(
             })),
         }),
         DataType::List(inner) => {
-            let inner_type = to_substrait_type(inner.data_type(), inner.is_nullable())?;
+            let inner_type =
+                to_substrait_type(producer, inner.data_type(), inner.is_nullable())?;
             Ok(substrait::proto::Type {
                 kind: Some(r#type::Kind::List(Box::new(r#type::List {
                     r#type: Some(Box::new(inner_type)),
@@ -225,7 +283,8 @@ pub(crate) fn to_substrait_type(
             })
         }
         DataType::LargeList(inner) => {
-            let inner_type = to_substrait_type(inner.data_type(), inner.is_nullable())?;
+            let inner_type =
+                to_substrait_type(producer, inner.data_type(), inner.is_nullable())?;
             Ok(substrait::proto::Type {
                 kind: Some(r#type::Kind::List(Box::new(r#type::List {
                     r#type: Some(Box::new(inner_type)),
@@ -236,29 +295,42 @@ pub(crate) fn to_substrait_type(
         }
         DataType::Map(inner, _) => match inner.data_type() {
             DataType::Struct(key_and_value) if key_and_value.len() == 2 => {
-                let key_type = to_substrait_type(
-                    key_and_value[0].data_type(),
-                    key_and_value[0].is_nullable(),
-                )?;
-                let value_type = to_substrait_type(
-                    key_and_value[1].data_type(),
-                    key_and_value[1].is_nullable(),
-                )?;
+                let key_type = to_substrait_type_from_field(producer, &key_and_value[0])?;
+                let value_type =
+                    to_substrait_type_from_field(producer, &key_and_value[1])?;
                 Ok(substrait::proto::Type {
                     kind: Some(r#type::Kind::Map(Box::new(r#type::Map {
                         key: Some(Box::new(key_type)),
                         value: Some(Box::new(value_type)),
-                        type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF,
+                        type_variation_reference: DEFAULT_MAP_TYPE_VARIATION_REF,
                         nullability,
                     }))),
                 })
             }
             _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"),
         },
+        DataType::Dictionary(key_type, value_type) => {
+            let key_type = to_substrait_type_from_field(
+                producer,
+                &Field::new("", key_type.as_ref().clone(), field.is_nullable()).into(),
+            )?;
+            let value_type = to_substrait_type_from_field(
+                producer,
+                &Field::new("", value_type.as_ref().clone(), field.is_nullable()).into(),
+            )?;
+            Ok(substrait::proto::Type {
+                kind: Some(r#type::Kind::Map(Box::new(r#type::Map {
+                    key: Some(Box::new(key_type)),
+                    value: Some(Box::new(value_type)),
+                    type_variation_reference: DICTIONARY_MAP_TYPE_VARIATION_REF,
+                    nullability,
+                }))),
+            })
+        }
         DataType::Struct(fields) => {
             let field_types = fields
                 .iter()
-                .map(|field| to_substrait_type(field.data_type(), field.is_nullable()))
+                .map(|field| to_substrait_type_from_field(producer, field))
                 .collect::<datafusion::common::Result<Vec<_>>>()?;
             Ok(substrait::proto::Type {
                 kind: Some(r#type::Kind::Struct(r#type::Struct {
@@ -284,11 +356,12 @@ pub(crate) fn to_substrait_type(
                 precision: *p as i32,
             })),
         }),
-        _ => not_impl_err!("Unsupported cast type: {dt:?}"),
+        _ => not_impl_err!("Unsupported cast type: {field}"),
     }
 }
 
 pub(crate) fn to_substrait_named_struct(
+    producer: &mut impl SubstraitProducer,
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<NamedStruct> {
     let mut names = Vec::with_capacity(schema.fields().len());
@@ -300,7 +373,7 @@ pub(crate) fn to_substrait_named_struct(
         types: schema
             .fields()
             .iter()
-            .map(|f| to_substrait_type(f.data_type(), f.is_nullable()))
+            .map(|f| to_substrait_type_from_field(producer, f))
             .collect::<datafusion::common::Result<_>>()?,
         type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
         nullability: r#type::Nullability::Required as i32,
@@ -317,14 +390,18 @@ mod tests {
     use super::*;
     use crate::logical_plan::consumer::tests::test_consumer;
     use crate::logical_plan::consumer::{
-        from_substrait_named_struct, from_substrait_type_without_names,
+        DefaultSubstraitConsumer, from_substrait_named_struct,
+        from_substrait_type_without_names,
     };
-    use datafusion::arrow::datatypes::{Field, Fields, Schema};
+    use crate::logical_plan::producer::DefaultSubstraitProducer;
+    use datafusion::arrow::datatypes::{Field, Fields, Schema, TimeUnit};
     use datafusion::common::{DFSchema, Result};
+    use datafusion::prelude::SessionContext;
     use std::sync::Arc;
 
     #[test]
     fn round_trip_types() -> Result<()> {
+        round_trip_type(DataType::Null)?;
         round_trip_type(DataType::Boolean)?;
         round_trip_type(DataType::Int8)?;
         round_trip_type(DataType::UInt8)?;
@@ -344,6 +421,10 @@ mod tests {
             round_trip_type(DataType::Timestamp(TimeUnit::Nanosecond, tz))?;
         }
 
+        round_trip_type(DataType::Time32(TimeUnit::Second))?;
+        round_trip_type(DataType::Time32(TimeUnit::Millisecond))?;
+        round_trip_type(DataType::Time64(TimeUnit::Microsecond))?;
+        round_trip_type(DataType::Time64(TimeUnit::Nanosecond))?;
         round_trip_type(DataType::Date32)?;
         round_trip_type(DataType::Date64)?;
         round_trip_type(DataType::Binary)?;
@@ -375,6 +456,10 @@ mod tests {
             .into(),
             false,
         ))?;
+        round_trip_type(DataType::Dictionary(
+            Box::new(DataType::Utf8),
+            Box::new(DataType::Int32),
+        ))?;
 
         round_trip_type(DataType::Struct(
             vec![
@@ -388,16 +473,29 @@ mod tests {
         round_trip_type(DataType::Interval(IntervalUnit::MonthDayNano))?;
         round_trip_type(DataType::Interval(IntervalUnit::DayTime))?;
 
+        round_trip_type(DataType::Duration(TimeUnit::Second))?;
+        round_trip_type(DataType::Duration(TimeUnit::Millisecond))?;
+        round_trip_type(DataType::Duration(TimeUnit::Microsecond))?;
+        round_trip_type(DataType::Duration(TimeUnit::Nanosecond))?;
+
         Ok(())
     }
 
     fn round_trip_type(dt: DataType) -> Result<()> {
-        println!("Checking round trip of {dt:?}");
+        println!("Checking round trip of {dt}");
+
+        let state = SessionContext::default().state();
+        let mut producer = DefaultSubstraitProducer::new(&state);
 
         // As DataFusion doesn't consider nullability as a property of the type, but field,
         // it doesn't matter if we set nullability to true or false here.
-        let substrait = to_substrait_type(&dt, true)?;
-        let consumer = test_consumer();
+        let substrait = to_substrait_type(&mut producer, &dt, true)?;
+
+        // Get the extensions from the producer so the consumer can look up
+        // any registered user-defined types (like "null" or "f16")
+        let extensions = producer.get_extensions();
+        let consumer = DefaultSubstraitConsumer::new(&extensions, &state);
+
         let roundtrip_dt = from_substrait_type_without_names(&consumer, &substrait)?;
         assert_eq!(dt, roundtrip_dt);
         Ok(())
@@ -419,7 +517,10 @@ mod tests {
             Field::new("trailer", DataType::Float64, true),
         ]))?);
 
-        let named_struct = to_substrait_named_struct(&schema)?;
+        let state = SessionContext::default().state();
+        let mut producer = DefaultSubstraitProducer::new(&state);
+
+        let named_struct = to_substrait_named_struct(&mut producer, &schema)?;
 
         // Struct field names should be flattened DFS style
         // List field names should be omitted
diff --git a/datafusion/substrait/src/logical_plan/producer/utils.rs b/datafusion/substrait/src/logical_plan/producer/utils.rs
index 5429e4a1ad889..e8310f4acd31e 100644
--- a/datafusion/substrait/src/logical_plan/producer/utils.rs
+++ b/datafusion/substrait/src/logical_plan/producer/utils.rs
@@ -16,11 +16,11 @@
 // under the License.
 
 use crate::logical_plan::producer::SubstraitProducer;
-use datafusion::arrow::datatypes::{DataType, Field};
-use datafusion::common::{plan_err, DFSchemaRef};
+use datafusion::arrow::datatypes::{DataType, Field, TimeUnit};
+use datafusion::common::{DFSchemaRef, plan_err};
 use datafusion::logical_expr::SortExpr;
 use substrait::proto::sort_field::{SortDirection, SortKind};
-use substrait::proto::SortField;
+use substrait::proto::{Expression, SortField};
 
 // Substrait wants a list of all field names, including nested fields from structs,
 // also from within e.g. lists and maps. However, it does not want the list and map field names
@@ -76,3 +76,37 @@ pub(crate) fn substrait_sort_field(
         sort_kind: Some(SortKind::Direction(d as i32)),
     })
 }
+
+pub(crate) fn to_substrait_precision(time_unit: &TimeUnit) -> i32 {
+    match time_unit {
+        TimeUnit::Second => 0,
+        TimeUnit::Millisecond => 3,
+        TimeUnit::Microsecond => 6,
+        TimeUnit::Nanosecond => 9,
+    }
+}
+
+/// Wraps an expression with a `not()` function.
+pub(crate) fn negate(
+    producer: &mut impl SubstraitProducer,
+    expr: Expression,
+) -> Expression {
+    let function_anchor = producer.register_function("not".to_string());
+
+    #[expect(deprecated)]
+    Expression {
+        rex_type: Some(substrait::proto::expression::RexType::ScalarFunction(
+            substrait::proto::expression::ScalarFunction {
+                function_reference: function_anchor,
+                arguments: vec![substrait::proto::FunctionArgument {
+                    arg_type: Some(substrait::proto::function_argument::ArgType::Value(
+                        expr,
+                    )),
+                }],
+                output_type: None,
+                args: vec![],
+                options: vec![],
+            },
+        )),
+    }
+}
diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs
index 4990054ac7fc7..ccaf1abec4245 100644
--- a/datafusion/substrait/src/physical_plan/consumer.rs
+++ b/datafusion/substrait/src/physical_plan/consumer.rs
@@ -37,11 +37,11 @@ use async_recursion::async_recursion;
 use chrono::DateTime;
 use datafusion::datasource::memory::DataSourceExec;
 use object_store::ObjectMeta;
-use substrait::proto::r#type::{Kind, Nullability};
-use substrait::proto::read_rel::local_files::file_or_files::PathType;
 use substrait::proto::Type;
+use substrait::proto::read_rel::local_files::file_or_files::PathType;
+use substrait::proto::r#type::{Kind, Nullability};
 use substrait::proto::{
-    expression::MaskExpression, read_rel::ReadType, rel::RelType, Rel,
+    Rel, expression::MaskExpression, read_rel::ReadType, rel::RelType,
 };
 
 /// Convert Substrait Rel to DataFusion ExecutionPlan
@@ -53,7 +53,6 @@ pub async fn from_substrait_rel(
 ) -> Result<Arc<dyn ExecutionPlan>> {
     let mut base_config_builder;
 
-    let source = Arc::new(ParquetSource::default());
     match &rel.rel_type {
         Some(RelType::Read(read)) => {
             if read.filter.is_some() || read.best_effort_filter.is_some() {
@@ -80,9 +79,10 @@ pub async fn from_substrait_rel(
                 .collect::<Result<Vec<Field>>>()
             {
                 Ok(fields) => {
+                    let schema = Arc::new(Schema::new(fields));
+                    let source = Arc::new(ParquetSource::new(Arc::clone(&schema)));
                     base_config_builder = FileScanConfigBuilder::new(
                         ObjectStoreUrl::local_filesystem(),
-                        Arc::new(Schema::new(fields)),
                         source,
                     );
                 }
@@ -119,20 +119,14 @@ pub async fn from_substrait_rel(
                         .unwrap();
                         let size = 0;
 
-                        let partitioned_file = PartitionedFile {
-                            object_meta: ObjectMeta {
+                        let partitioned_file =
+                            PartitionedFile::new_from_meta(ObjectMeta {
                                 last_modified: last_modified.into(),
                                 location: path.into(),
                                 size,
                                 e_tag: None,
                                 version: None,
-                            },
-                            partition_values: vec![],
-                            range: None,
-                            statistics: None,
-                            extensions: None,
-                            metadata_size_hint: None,
-                        };
+                            });
 
                         let part_index = file.partition_index as usize;
                         while part_index >= file_groups.len() {
@@ -144,16 +138,16 @@ pub async fn from_substrait_rel(
                     base_config_builder =
                         base_config_builder.with_file_groups(file_groups);
 
-                    if let Some(MaskExpression { select, .. }) = &read.projection {
-                        if let Some(projection) = &select.as_ref() {
-                            let column_indices: Vec<usize> = projection
-                                .struct_items
-                                .iter()
-                                .map(|item| item.field as usize)
-                                .collect();
-                            base_config_builder =
-                                base_config_builder.with_projection(Some(column_indices));
-                        }
+                    if let Some(MaskExpression { select, .. }) = &read.projection
+                        && let Some(projection) = &select.as_ref()
+                    {
+                        let column_indices: Vec<usize> = projection
+                            .struct_items
+                            .iter()
+                            .map(|item| item.field as usize)
+                            .collect();
+                        base_config_builder = base_config_builder
+                            .with_projection_indices(Some(column_indices))?;
                     }
 
                     Ok(
@@ -166,7 +160,7 @@ pub async fn from_substrait_rel(
                 ),
             }
         }
-        _ => not_impl_err!("Unsupported RelType: {:?}", rel.rel_type),
+        _ => not_impl_err!("Unsupported Reltype: {:?}", rel.rel_type),
     }
 }
 
diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs
index cb725a7277fd3..7a2da70352b00 100644
--- a/datafusion/substrait/src/physical_plan/producer.rs
+++ b/datafusion/substrait/src/physical_plan/producer.rs
@@ -25,23 +25,23 @@ use crate::variation_const::{
 use datafusion::arrow::datatypes::DataType;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::error::{DataFusionError, Result};
-use datafusion::physical_plan::{displayable, ExecutionPlan};
+use datafusion::physical_plan::{ExecutionPlan, displayable};
 
 use datafusion::datasource::physical_plan::ParquetSource;
-use substrait::proto::expression::mask_expression::{StructItem, StructSelect};
+use substrait::proto::ReadRel;
+use substrait::proto::Rel;
 use substrait::proto::expression::MaskExpression;
-use substrait::proto::r#type::{
-    Binary, Boolean, Fp64, Kind, Nullability, String as SubstraitString, Struct, I64,
-};
-use substrait::proto::read_rel::local_files::file_or_files::ParquetReadOptions;
-use substrait::proto::read_rel::local_files::file_or_files::{FileFormat, PathType};
-use substrait::proto::read_rel::local_files::FileOrFiles;
+use substrait::proto::expression::mask_expression::{StructItem, StructSelect};
 use substrait::proto::read_rel::LocalFiles;
 use substrait::proto::read_rel::ReadType;
+use substrait::proto::read_rel::local_files::FileOrFiles;
+use substrait::proto::read_rel::local_files::file_or_files::ParquetReadOptions;
+use substrait::proto::read_rel::local_files::file_or_files::{FileFormat, PathType};
 use substrait::proto::rel::RelType;
-use substrait::proto::ReadRel;
-use substrait::proto::Rel;
-use substrait::proto::{extensions, NamedStruct, Type};
+use substrait::proto::r#type::{
+    Binary, Boolean, Fp64, I64, Kind, Nullability, String as SubstraitString, Struct,
+};
+use substrait::proto::{NamedStruct, Type, extensions};
 
 /// Convert DataFusion ExecutionPlan to Substrait Rel
 pub fn to_substrait_rel(
@@ -51,84 +51,84 @@ pub fn to_substrait_rel(
         HashMap<String, u32>,
     ),
 ) -> Result<Box<Rel>> {
-    if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() {
-        if let Some((file_config, _)) =
+    if let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>()
+        && let Some((file_config, _)) =
             data_source_exec.downcast_to_file_source::<ParquetSource>()
-        {
-            let mut substrait_files = vec![];
-            for (partition_index, files) in file_config.file_groups.iter().enumerate() {
-                for file in files.iter() {
-                    substrait_files.push(FileOrFiles {
-                        partition_index: partition_index.try_into().unwrap(),
-                        start: 0,
-                        length: file.object_meta.size,
-                        path_type: Some(PathType::UriPath(
-                            file.object_meta.location.as_ref().to_string(),
-                        )),
-                        file_format: Some(FileFormat::Parquet(ParquetReadOptions {})),
-                    });
-                }
+    {
+        let mut substrait_files = vec![];
+        for (partition_index, files) in file_config.file_groups.iter().enumerate() {
+            for file in files.iter() {
+                substrait_files.push(FileOrFiles {
+                    partition_index: partition_index.try_into().unwrap(),
+                    start: 0,
+                    length: file.object_meta.size,
+                    path_type: Some(PathType::UriPath(
+                        file.object_meta.location.as_ref().to_string(),
+                    )),
+                    file_format: Some(FileFormat::Parquet(ParquetReadOptions {})),
+                });
             }
+        }
 
-            let mut names = vec![];
-            let mut types = vec![];
+        let mut names = vec![];
+        let mut types = vec![];
 
-            for field in file_config.file_schema.fields.iter() {
-                match to_substrait_type(field.data_type(), field.is_nullable()) {
-                    Ok(t) => {
-                        names.push(field.name().clone());
-                        types.push(t);
-                    }
-                    Err(e) => return Err(e),
+        for field in file_config.file_schema().fields.iter() {
+            match to_substrait_type(field.data_type(), field.is_nullable()) {
+                Ok(t) => {
+                    names.push(field.name().clone());
+                    types.push(t);
                 }
+                Err(e) => return Err(e),
             }
+        }
 
-            let type_info = Struct {
-                types,
-                // FIXME: duckdb doesn't set this field, keep it as default variant 0.
-                // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1106-L1127
-                type_variation_reference: 0,
-                nullability: Nullability::Required.into(),
-            };
+        let type_info = Struct {
+            types,
+            // FIXME: duckdb doesn't set this field, keep it as default variant 0.
+            // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1106-L1127
+            type_variation_reference: 0,
+            nullability: Nullability::Required.into(),
+        };
 
-            let mut select_struct = None;
-            if let Some(projection) = file_config.projection.as_ref() {
-                let struct_items = projection
-                    .iter()
-                    .map(|index| StructItem {
-                        field: *index as i32,
-                        // FIXME: duckdb sets this to None, but it's not clear why.
-                        // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1191
-                        child: None,
-                    })
-                    .collect();
+        let mut select_struct = None;
+        if let Some(projection) = file_config.file_source().projection().as_ref() {
+            let struct_items = projection
+                .column_indices()
+                .into_iter()
+                .map(|index| StructItem {
+                    field: index as i32,
+                    // FIXME: duckdb sets this to None, but it's not clear why.
+                    // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1191
+                    child: None,
+                })
+                .collect();
 
-                select_struct = Some(StructSelect { struct_items });
-            }
+            select_struct = Some(StructSelect { struct_items });
+        }
 
-            return Ok(Box::new(Rel {
-                rel_type: Some(RelType::Read(Box::new(ReadRel {
-                    common: None,
-                    base_schema: Some(NamedStruct {
-                        names,
-                        r#struct: Some(type_info),
-                    }),
-                    filter: None,
-                    best_effort_filter: None,
-                    projection: Some(MaskExpression {
-                        select: select_struct,
-                        // FIXME: duckdb set this to true, but it's not clear why.
-                        // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1186.
-                        maintain_singular_struct: true,
-                    }),
+        return Ok(Box::new(Rel {
+            rel_type: Some(RelType::Read(Box::new(ReadRel {
+                common: None,
+                base_schema: Some(NamedStruct {
+                    names,
+                    r#struct: Some(type_info),
+                }),
+                filter: None,
+                best_effort_filter: None,
+                projection: Some(MaskExpression {
+                    select: select_struct,
+                    // FIXME: duckdb set this to true, but it's not clear why.
+                    // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1186.
+                    maintain_singular_struct: true,
+                }),
+                advanced_extension: None,
+                read_type: Some(ReadType::LocalFiles(LocalFiles {
+                    items: substrait_files,
                     advanced_extension: None,
-                    read_type: Some(ReadType::LocalFiles(LocalFiles {
-                        items: substrait_files,
-                        advanced_extension: None,
-                    })),
-                }))),
-            }));
-        }
+                })),
+            }))),
+        }));
     }
     Err(DataFusionError::Substrait(format!(
         "Unsupported plan in Substrait physical plan producer: {}",
diff --git a/datafusion/substrait/src/serializer.rs b/datafusion/substrait/src/serializer.rs
index 4a9e5d55ce055..ee71bc3121afe 100644
--- a/datafusion/substrait/src/serializer.rs
+++ b/datafusion/substrait/src/serializer.rs
@@ -46,6 +46,7 @@ pub async fn serialize(
         .open(path)
         .await?;
     file.write_all(&protobuf_out).await?;
+    file.flush().await?;
     Ok(())
 }
 
diff --git a/datafusion/substrait/src/variation_const.rs b/datafusion/substrait/src/variation_const.rs
index e5bebf8e11819..b1a922899e976 100644
--- a/datafusion/substrait/src/variation_const.rs
+++ b/datafusion/substrait/src/variation_const.rs
@@ -50,11 +50,24 @@ pub const TIMESTAMP_NANO_TYPE_VARIATION_REF: u32 = 3;
 
 pub const DATE_32_TYPE_VARIATION_REF: u32 = 0;
 pub const DATE_64_TYPE_VARIATION_REF: u32 = 1;
+pub const TIME_32_TYPE_VARIATION_REF: u32 = 0;
+pub const TIME_64_TYPE_VARIATION_REF: u32 = 1;
 pub const DEFAULT_CONTAINER_TYPE_VARIATION_REF: u32 = 0;
 pub const LARGE_CONTAINER_TYPE_VARIATION_REF: u32 = 1;
 pub const VIEW_CONTAINER_TYPE_VARIATION_REF: u32 = 2;
+pub const DEFAULT_MAP_TYPE_VARIATION_REF: u32 = 0;
+pub const DICTIONARY_MAP_TYPE_VARIATION_REF: u32 = 1;
 pub const DECIMAL_128_TYPE_VARIATION_REF: u32 = 0;
 pub const DECIMAL_256_TYPE_VARIATION_REF: u32 = 1;
+/// Used for the arrow type [`DataType::Interval`] with [`IntervalUnit::DayTime`].
+///
+/// [`DataType::Interval`]: datafusion::arrow::datatypes::DataType::Interval
+/// [`IntervalUnit::DayTime`]: datafusion::arrow::datatypes::IntervalUnit::DayTime
+pub const DEFAULT_INTERVAL_DAY_TYPE_VARIATION_REF: u32 = 0;
+/// Used for the arrow type [`DataType::Duration`].
+///
+/// [`DataType::Duration`]: datafusion::arrow::datatypes::DataType::Duration
+pub const DURATION_INTERVAL_DAY_TYPE_VARIATION_REF: u32 = 1;
 
 // For [user-defined types](https://substrait.io/types/type_classes/#user-defined-types).
 /// For [`DataType::Interval`] with [`IntervalUnit::YearMonth`].
@@ -96,7 +109,7 @@ pub const INTERVAL_DAY_TIME_TYPE_REF: u32 = 2;
 /// [`ScalarValue::IntervalMonthDayNano`]: datafusion::common::ScalarValue::IntervalMonthDayNano
 #[deprecated(
     since = "41.0.0",
-    note = "Use Substrait `IntervalCompund` type instead"
+    note = "Use Substrait `IntervalCompound` type instead"
 )]
 pub const INTERVAL_MONTH_DAY_NANO_TYPE_REF: u32 = 3;
 
@@ -106,6 +119,14 @@ pub const INTERVAL_MONTH_DAY_NANO_TYPE_REF: u32 = 3;
 /// [`IntervalUnit::MonthDayNano`]: datafusion::arrow::datatypes::IntervalUnit::MonthDayNano
 #[deprecated(
     since = "43.0.0",
-    note = "Use Substrait `IntervalCompund` type instead"
+    note = "Use Substrait `IntervalCompound` type instead"
 )]
 pub const INTERVAL_MONTH_DAY_NANO_TYPE_NAME: &str = "interval-month-day-nano";
+
+/// Defined in <https://github.com/apache/arrow/blame/main/format/substrait/extension_types.yaml>
+pub const FLOAT_16_TYPE_NAME: &str = "fp16";
+
+/// For [`DataType::Null`]
+///
+/// [`DataType::Null`]: datafusion::arrow::datatypes::DataType::Null
+pub const NULL_TYPE_NAME: &str = "null";
diff --git a/datafusion/substrait/tests/cases/aggregation_tests.rs b/datafusion/substrait/tests/cases/aggregation_tests.rs
new file mode 100644
index 0000000000000..92a41850b208d
--- /dev/null
+++ b/datafusion/substrait/tests/cases/aggregation_tests.rs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Tests to verify aggregation relation handling in Substrait
+
+#[cfg(test)]
+mod tests {
+    use crate::utils::test::{add_plan_schemas_to_ctx, read_json};
+    use datafusion::common::Result;
+    use datafusion::dataframe::DataFrame;
+    use datafusion::prelude::SessionContext;
+    use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
+    use insta::assert_snapshot;
+
+    #[tokio::test]
+    async fn no_grouping_set() -> Result<()> {
+        let proto_plan =
+            read_json("tests/testdata/test_plans/aggregate_groupings/no_groupings.json");
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        assert_snapshot!(
+            plan,
+            @r"
+        Aggregate: groupBy=[[]], aggr=[[sum(c0) AS summation]]
+          EmptyRelation: rows=0
+        "
+        );
+
+        // Trigger execution to ensure plan validity
+        DataFrame::new(ctx.state(), plan).show().await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn one_grouping_set() -> Result<()> {
+        let proto_plan = read_json(
+            "tests/testdata/test_plans/aggregate_groupings/single_grouping.json",
+        );
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        assert_snapshot!(
+            plan,
+            @r"
+        Aggregate: groupBy=[[c0]], aggr=[[sum(c0) AS summation]]
+          EmptyRelation: rows=0
+        "
+        );
+
+        // Trigger execution to ensure plan validity
+        DataFrame::new(ctx.state(), plan).show().await?;
+
+        Ok(())
+    }
+}
diff --git a/datafusion/substrait/tests/cases/builtin_expr_semantics_tests.rs b/datafusion/substrait/tests/cases/builtin_expr_semantics_tests.rs
new file mode 100644
index 0000000000000..c7ca669b27c84
--- /dev/null
+++ b/datafusion/substrait/tests/cases/builtin_expr_semantics_tests.rs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! There are some Substrait functions that are semantically equivalent to nested built-in expressions, such as xor:bool_bool and and_not:bool_bool
+//! This module tests that the semantics of these functions are correct roundtripped
+
+#[cfg(test)]
+mod tests {
+    use crate::utils::test::add_plan_schemas_to_ctx;
+    use datafusion::arrow::util::pretty;
+    use datafusion::common::Result;
+    use datafusion::prelude::DataFrame;
+    use datafusion::prelude::SessionContext;
+    use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
+    use datafusion_substrait::logical_plan::producer::to_substrait_plan;
+    use std::fs::File;
+    use std::io::BufReader;
+    use substrait::proto::Plan;
+
+    // Helper function to test scalar function semantics and roundtrip conversion
+    async fn test_scalar_fn_semantics(
+        file_path: &str,
+        expected_results: Vec<&str>,
+    ) -> Result<()> {
+        let path = format!("tests/testdata/test_plans/{file_path}");
+        let proto = serde_json::from_reader::<_, Plan>(BufReader::new(
+            File::open(path).expect("file not found"),
+        ))
+        .expect("failed to parse json");
+
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto).await?;
+
+        // Test correct semantics of function
+        let df = DataFrame::new(ctx.state().clone(), plan.clone());
+        let results = df.collect().await?;
+        let pretty_results = pretty::pretty_format_batches(&results)?.to_string();
+        assert_eq!(
+            pretty_results.trim().lines().collect::<Vec<_>>(),
+            expected_results
+        );
+
+        // Test roundtrip semantics
+        let proto = to_substrait_plan(&plan, &ctx.state())?;
+        let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
+        let df2 = DataFrame::new(ctx.state().clone(), plan2.clone());
+        let results2 = df2.collect().await?;
+        let pretty_results2 = pretty::pretty_format_batches(&results2)?.to_string();
+        assert_eq!(
+            pretty_results2.trim().lines().collect::<Vec<_>>(),
+            expected_results
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_xor_semantics() -> Result<()> {
+        let expected = vec![
+            "+-------+-------+--------+",
+            "| a     | b     | result |",
+            "+-------+-------+--------+",
+            "| true  | true  | false  |",
+            "| true  | false | true   |",
+            "| false | true  | true   |",
+            "| false | false | false  |",
+            "+-------+-------+--------+",
+        ];
+
+        test_scalar_fn_semantics(
+            "scalar_fn_to_built_in_binary_expr_xor.substrait.json",
+            expected,
+        )
+        .await
+    }
+
+    #[tokio::test]
+    async fn test_and_not_semantics() -> Result<()> {
+        let expected = vec![
+            "+-------+-------+--------+",
+            "| a     | b     | result |",
+            "+-------+-------+--------+",
+            "| true  | true  | false  |",
+            "| true  | false | true   |",
+            "| false | true  | false  |",
+            "| false | false | false  |",
+            "+-------+-------+--------+",
+        ];
+
+        test_scalar_fn_semantics(
+            "scalar_fn_to_built_in_binary_expr_and_not.substrait.json",
+            expected,
+        )
+        .await
+    }
+
+    #[tokio::test]
+    async fn test_logb_semantics() -> Result<()> {
+        let expected = vec![
+            "+-------+------+--------+",
+            "| x     | base | result |",
+            "+-------+------+--------+",
+            "| 1.0   | 10.0 | 0.0    |",
+            "| 100.0 | 10.0 | 2.0    |",
+            "+-------+------+--------+",
+        ];
+
+        test_scalar_fn_semantics("scalar_fn_logb_expr.substrait.json", expected).await
+    }
+}
diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs
index 4a121e41d27e7..88c4eb6678feb 100644
--- a/datafusion/substrait/tests/cases/consumer_integration.rs
+++ b/datafusion/substrait/tests/cases/consumer_integration.rs
@@ -53,13 +53,13 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, sum(LINEITEM.L_QUANTITY) AS SUM_QTY, sum(LINEITEM.L_EXTENDEDPRICE) AS SUM_BASE_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS SUM_DISC_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX) AS SUM_CHARGE, avg(LINEITEM.L_QUANTITY) AS AVG_QTY, avg(LINEITEM.L_EXTENDEDPRICE) AS AVG_PRICE, avg(LINEITEM.L_DISCOUNT) AS AVG_DISC, count(Int64(1)) AS COUNT_ORDER
-              Sort: LINEITEM.L_RETURNFLAG ASC NULLS LAST, LINEITEM.L_LINESTATUS ASC NULLS LAST
-                Aggregate: groupBy=[[LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS]], aggr=[[sum(LINEITEM.L_QUANTITY), sum(LINEITEM.L_EXTENDEDPRICE), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX), avg(LINEITEM.L_QUANTITY), avg(LINEITEM.L_EXTENDEDPRICE), avg(LINEITEM.L_DISCOUNT), count(Int64(1))]]
-                  Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, LINEITEM.L_QUANTITY, LINEITEM.L_EXTENDEDPRICE, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT), LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT) * (CAST(Int32(1) AS Decimal128(15, 2)) + LINEITEM.L_TAX), LINEITEM.L_DISCOUNT
-                    Filter: LINEITEM.L_SHIPDATE <= Date32("1998-12-01") - IntervalDayTime("IntervalDayTime { days: 0, milliseconds: 10368000 }")
-                      TableScan: LINEITEM
-            "#
+        Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, sum(LINEITEM.L_QUANTITY) AS SUM_QTY, sum(LINEITEM.L_EXTENDEDPRICE) AS SUM_BASE_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS SUM_DISC_PRICE, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX) AS SUM_CHARGE, avg(LINEITEM.L_QUANTITY) AS AVG_QTY, avg(LINEITEM.L_EXTENDEDPRICE) AS AVG_PRICE, avg(LINEITEM.L_DISCOUNT) AS AVG_DISC, count(Int64(1)) AS COUNT_ORDER
+          Sort: LINEITEM.L_RETURNFLAG ASC NULLS LAST, LINEITEM.L_LINESTATUS ASC NULLS LAST
+            Aggregate: groupBy=[[LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS]], aggr=[[sum(LINEITEM.L_QUANTITY), sum(LINEITEM.L_EXTENDEDPRICE), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT * Int32(1) + LINEITEM.L_TAX), avg(LINEITEM.L_QUANTITY), avg(LINEITEM.L_EXTENDEDPRICE), avg(LINEITEM.L_DISCOUNT), count(Int64(1))]]
+              Projection: LINEITEM.L_RETURNFLAG, LINEITEM.L_LINESTATUS, LINEITEM.L_QUANTITY, LINEITEM.L_EXTENDEDPRICE, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT), LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT) * (CAST(Int32(1) AS Decimal128(15, 2)) + LINEITEM.L_TAX), LINEITEM.L_DISCOUNT
+                Filter: LINEITEM.L_SHIPDATE <= Date32("1998-12-01") - IntervalDayTime("IntervalDayTime { days: 0, milliseconds: 10368000 }")
+                  TableScan: LINEITEM
+        "#
                 );
         Ok(())
     }
@@ -70,31 +70,31 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Limit: skip=0, fetch=100
-              Sort: SUPPLIER.S_ACCTBAL DESC NULLS FIRST, NATION.N_NAME ASC NULLS LAST, SUPPLIER.S_NAME ASC NULLS LAST, PART.P_PARTKEY ASC NULLS LAST
-                Projection: SUPPLIER.S_ACCTBAL, SUPPLIER.S_NAME, NATION.N_NAME, PART.P_PARTKEY, PART.P_MFGR, SUPPLIER.S_ADDRESS, SUPPLIER.S_PHONE, SUPPLIER.S_COMMENT
-                  Filter: PART.P_PARTKEY = PARTSUPP.PS_PARTKEY AND SUPPLIER.S_SUPPKEY = PARTSUPP.PS_SUPPKEY AND PART.P_SIZE = Int32(15) AND PART.P_TYPE LIKE CAST(Utf8("%BRASS") AS Utf8) AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("EUROPE") AND PARTSUPP.PS_SUPPLYCOST = (<subquery>)
-                    Subquery:
-                      Aggregate: groupBy=[[]], aggr=[[min(PARTSUPP.PS_SUPPLYCOST)]]
-                        Projection: PARTSUPP.PS_SUPPLYCOST
-                          Filter: PARTSUPP.PS_PARTKEY = PARTSUPP.PS_PARTKEY AND SUPPLIER.S_SUPPKEY = PARTSUPP.PS_SUPPKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("EUROPE")
-                            Cross Join: 
-                              Cross Join: 
-                                Cross Join: 
-                                  TableScan: PARTSUPP
-                                  TableScan: SUPPLIER
-                                TableScan: NATION
-                              TableScan: REGION
-                    Cross Join: 
-                      Cross Join: 
-                        Cross Join: 
-                          Cross Join: 
-                            TableScan: PART
-                            TableScan: SUPPLIER
-                          TableScan: PARTSUPP
-                        TableScan: NATION
-                      TableScan: REGION
-            "#
+        Limit: skip=0, fetch=100
+          Sort: SUPPLIER.S_ACCTBAL DESC NULLS FIRST, NATION.N_NAME ASC NULLS LAST, SUPPLIER.S_NAME ASC NULLS LAST, PART.P_PARTKEY ASC NULLS LAST
+            Projection: SUPPLIER.S_ACCTBAL, SUPPLIER.S_NAME, NATION.N_NAME, PART.P_PARTKEY, PART.P_MFGR, SUPPLIER.S_ADDRESS, SUPPLIER.S_PHONE, SUPPLIER.S_COMMENT
+              Filter: PART.P_PARTKEY = PARTSUPP.PS_PARTKEY AND SUPPLIER.S_SUPPKEY = PARTSUPP.PS_SUPPKEY AND PART.P_SIZE = Int32(15) AND PART.P_TYPE LIKE CAST(Utf8("%BRASS") AS Utf8) AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("EUROPE") AND PARTSUPP.PS_SUPPLYCOST = (<subquery>)
+                Subquery:
+                  Aggregate: groupBy=[[]], aggr=[[min(PARTSUPP.PS_SUPPLYCOST)]]
+                    Projection: PARTSUPP.PS_SUPPLYCOST
+                      Filter: outer_ref(PART.P_PARTKEY) = PARTSUPP.PS_PARTKEY AND SUPPLIER.S_SUPPKEY = PARTSUPP.PS_SUPPKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("EUROPE")
+                        Cross Join:
+                          Cross Join:
+                            Cross Join:
+                              TableScan: PARTSUPP
+                              TableScan: SUPPLIER
+                            TableScan: NATION
+                          TableScan: REGION
+                Cross Join:
+                  Cross Join:
+                    Cross Join:
+                      Cross Join:
+                        TableScan: PART
+                        TableScan: SUPPLIER
+                      TableScan: PARTSUPP
+                    TableScan: NATION
+                  TableScan: REGION
+        "#
                 );
         Ok(())
     }
@@ -105,19 +105,19 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: LINEITEM.L_ORDERKEY, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY
-              Limit: skip=0, fetch=10
-                Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST, ORDERS.O_ORDERDATE ASC NULLS LAST
-                  Projection: LINEITEM.L_ORDERKEY, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY
-                    Aggregate: groupBy=[[LINEITEM.L_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
-                      Projection: LINEITEM.L_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
-                        Filter: CUSTOMER.C_MKTSEGMENT = Utf8("BUILDING") AND CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND ORDERS.O_ORDERDATE < CAST(Utf8("1995-03-15") AS Date32) AND LINEITEM.L_SHIPDATE > CAST(Utf8("1995-03-15") AS Date32)
-                          Cross Join: 
-                            Cross Join: 
-                              TableScan: LINEITEM
-                              TableScan: CUSTOMER
-                            TableScan: ORDERS
-            "#
+        Projection: LINEITEM.L_ORDERKEY, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY
+          Limit: skip=0, fetch=10
+            Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST, ORDERS.O_ORDERDATE ASC NULLS LAST
+              Projection: LINEITEM.L_ORDERKEY, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY
+                Aggregate: groupBy=[[LINEITEM.L_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
+                  Projection: LINEITEM.L_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_SHIPPRIORITY, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
+                    Filter: CUSTOMER.C_MKTSEGMENT = Utf8("BUILDING") AND CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND ORDERS.O_ORDERDATE < CAST(Utf8("1995-03-15") AS Date32) AND LINEITEM.L_SHIPDATE > CAST(Utf8("1995-03-15") AS Date32)
+                      Cross Join:
+                        Cross Join:
+                          TableScan: LINEITEM
+                          TableScan: CUSTOMER
+                        TableScan: ORDERS
+        "#
                 );
         Ok(())
     }
@@ -128,16 +128,16 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: ORDERS.O_ORDERPRIORITY, count(Int64(1)) AS ORDER_COUNT
-              Sort: ORDERS.O_ORDERPRIORITY ASC NULLS LAST
-                Aggregate: groupBy=[[ORDERS.O_ORDERPRIORITY]], aggr=[[count(Int64(1))]]
-                  Projection: ORDERS.O_ORDERPRIORITY
-                    Filter: ORDERS.O_ORDERDATE >= CAST(Utf8("1993-07-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1993-10-01") AS Date32) AND EXISTS (<subquery>)
-                      Subquery:
-                        Filter: LINEITEM.L_ORDERKEY = LINEITEM.L_ORDERKEY AND LINEITEM.L_COMMITDATE < LINEITEM.L_RECEIPTDATE
-                          TableScan: LINEITEM
-                      TableScan: ORDERS
-            "#
+        Projection: ORDERS.O_ORDERPRIORITY, count(Int64(1)) AS ORDER_COUNT
+          Sort: ORDERS.O_ORDERPRIORITY ASC NULLS LAST
+            Aggregate: groupBy=[[ORDERS.O_ORDERPRIORITY]], aggr=[[count(Int64(1))]]
+              Projection: ORDERS.O_ORDERPRIORITY
+                Filter: ORDERS.O_ORDERDATE >= CAST(Utf8("1993-07-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1993-10-01") AS Date32) AND EXISTS (<subquery>)
+                  Subquery:
+                    Filter: LINEITEM.L_ORDERKEY = outer_ref(ORDERS.O_ORDERKEY) AND LINEITEM.L_COMMITDATE < LINEITEM.L_RECEIPTDATE
+                      TableScan: LINEITEM
+                  TableScan: ORDERS
+        "#
                 );
         Ok(())
     }
@@ -148,23 +148,23 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: NATION.N_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE
-              Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST
-                Aggregate: groupBy=[[NATION.N_NAME]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
-                  Projection: NATION.N_NAME, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
-                    Filter: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND LINEITEM.L_SUPPKEY = SUPPLIER.S_SUPPKEY AND CUSTOMER.C_NATIONKEY = SUPPLIER.S_NATIONKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("ASIA") AND ORDERS.O_ORDERDATE >= CAST(Utf8("1994-01-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1995-01-01") AS Date32)
-                      Cross Join: 
-                        Cross Join: 
-                          Cross Join: 
-                            Cross Join: 
-                              Cross Join: 
-                                TableScan: CUSTOMER
-                                TableScan: ORDERS
-                              TableScan: LINEITEM
-                            TableScan: SUPPLIER
-                          TableScan: NATION
-                        TableScan: REGION
-            "#
+        Projection: NATION.N_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE
+          Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST
+            Aggregate: groupBy=[[NATION.N_NAME]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
+              Projection: NATION.N_NAME, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
+                Filter: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND LINEITEM.L_SUPPKEY = SUPPLIER.S_SUPPKEY AND CUSTOMER.C_NATIONKEY = SUPPLIER.S_NATIONKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_REGIONKEY = REGION.R_REGIONKEY AND REGION.R_NAME = Utf8("ASIA") AND ORDERS.O_ORDERDATE >= CAST(Utf8("1994-01-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1995-01-01") AS Date32)
+                  Cross Join:
+                    Cross Join:
+                      Cross Join:
+                        Cross Join:
+                          Cross Join:
+                            TableScan: CUSTOMER
+                            TableScan: ORDERS
+                          TableScan: LINEITEM
+                        TableScan: SUPPLIER
+                      TableScan: NATION
+                    TableScan: REGION
+        "#
                 );
         Ok(())
     }
@@ -175,11 +175,11 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * LINEITEM.L_DISCOUNT) AS REVENUE]]
-              Projection: LINEITEM.L_EXTENDEDPRICE * LINEITEM.L_DISCOUNT
-                Filter: LINEITEM.L_SHIPDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-01-01") AS Date32) AND LINEITEM.L_DISCOUNT >= Decimal128(Some(5),3,2) AND LINEITEM.L_DISCOUNT <= Decimal128(Some(7),3,2) AND LINEITEM.L_QUANTITY < CAST(Int32(24) AS Decimal128(15, 2))
-                  TableScan: LINEITEM
-            "#
+        Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * LINEITEM.L_DISCOUNT) AS REVENUE]]
+          Projection: LINEITEM.L_EXTENDEDPRICE * LINEITEM.L_DISCOUNT
+            Filter: LINEITEM.L_SHIPDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-01-01") AS Date32) AND LINEITEM.L_DISCOUNT >= Decimal128(Some(5),3,2) AND LINEITEM.L_DISCOUNT <= Decimal128(Some(7),3,2) AND LINEITEM.L_QUANTITY < CAST(Int32(24) AS Decimal128(15, 2))
+              TableScan: LINEITEM
+        "#
                 );
         Ok(())
     }
@@ -214,21 +214,21 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE, CUSTOMER.C_ACCTBAL, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_PHONE, CUSTOMER.C_COMMENT
-              Limit: skip=0, fetch=20
-                Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST
-                  Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), CUSTOMER.C_ACCTBAL, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_PHONE, CUSTOMER.C_COMMENT
-                    Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, CUSTOMER.C_ACCTBAL, CUSTOMER.C_PHONE, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_COMMENT]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
-                      Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, CUSTOMER.C_ACCTBAL, CUSTOMER.C_PHONE, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_COMMENT, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
-                        Filter: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND ORDERS.O_ORDERDATE >= CAST(Utf8("1993-10-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_RETURNFLAG = Utf8("R") AND CUSTOMER.C_NATIONKEY = NATION.N_NATIONKEY
-                          Cross Join: 
-                            Cross Join: 
-                              Cross Join: 
-                                TableScan: CUSTOMER
-                                TableScan: ORDERS
-                              TableScan: LINEITEM
-                            TableScan: NATION
-            "#
+        Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE, CUSTOMER.C_ACCTBAL, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_PHONE, CUSTOMER.C_COMMENT
+          Limit: skip=0, fetch=20
+            Sort: sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) DESC NULLS FIRST
+              Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT), CUSTOMER.C_ACCTBAL, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_PHONE, CUSTOMER.C_COMMENT
+                Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, CUSTOMER.C_ACCTBAL, CUSTOMER.C_PHONE, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_COMMENT]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
+                  Projection: CUSTOMER.C_CUSTKEY, CUSTOMER.C_NAME, CUSTOMER.C_ACCTBAL, CUSTOMER.C_PHONE, NATION.N_NAME, CUSTOMER.C_ADDRESS, CUSTOMER.C_COMMENT, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
+                    Filter: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND LINEITEM.L_ORDERKEY = ORDERS.O_ORDERKEY AND ORDERS.O_ORDERDATE >= CAST(Utf8("1993-10-01") AS Date32) AND ORDERS.O_ORDERDATE < CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_RETURNFLAG = Utf8("R") AND CUSTOMER.C_NATIONKEY = NATION.N_NATIONKEY
+                      Cross Join:
+                        Cross Join:
+                          Cross Join:
+                            TableScan: CUSTOMER
+                            TableScan: ORDERS
+                          TableScan: LINEITEM
+                        TableScan: NATION
+        "#
                 );
         Ok(())
     }
@@ -239,28 +239,28 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: PARTSUPP.PS_PARTKEY, sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) AS value
-              Sort: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) DESC NULLS FIRST
-                Filter: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) > (<subquery>)
-                  Subquery:
-                    Projection: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) * Decimal128(Some(1000000),11,10)
-                      Aggregate: groupBy=[[]], aggr=[[sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY)]]
-                        Projection: PARTSUPP.PS_SUPPLYCOST * CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0))
-                          Filter: PARTSUPP.PS_SUPPKEY = SUPPLIER.S_SUPPKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("JAPAN")
-                            Cross Join: 
-                              Cross Join: 
-                                TableScan: PARTSUPP
-                                TableScan: SUPPLIER
-                              TableScan: NATION
-                  Aggregate: groupBy=[[PARTSUPP.PS_PARTKEY]], aggr=[[sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY)]]
-                    Projection: PARTSUPP.PS_PARTKEY, PARTSUPP.PS_SUPPLYCOST * CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0))
+        Projection: PARTSUPP.PS_PARTKEY, sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) AS value
+          Sort: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) DESC NULLS FIRST
+            Filter: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) > (<subquery>)
+              Subquery:
+                Projection: sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY) * Decimal128(Some(1000000),11,10)
+                  Aggregate: groupBy=[[]], aggr=[[sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY)]]
+                    Projection: PARTSUPP.PS_SUPPLYCOST * CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0))
                       Filter: PARTSUPP.PS_SUPPKEY = SUPPLIER.S_SUPPKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("JAPAN")
-                        Cross Join: 
-                          Cross Join: 
+                        Cross Join:
+                          Cross Join:
                             TableScan: PARTSUPP
                             TableScan: SUPPLIER
                           TableScan: NATION
-            "#
+              Aggregate: groupBy=[[PARTSUPP.PS_PARTKEY]], aggr=[[sum(PARTSUPP.PS_SUPPLYCOST * PARTSUPP.PS_AVAILQTY)]]
+                Projection: PARTSUPP.PS_PARTKEY, PARTSUPP.PS_SUPPLYCOST * CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0))
+                  Filter: PARTSUPP.PS_SUPPKEY = SUPPLIER.S_SUPPKEY AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("JAPAN")
+                    Cross Join:
+                      Cross Join:
+                        TableScan: PARTSUPP
+                        TableScan: SUPPLIER
+                      TableScan: NATION
+        "#
                 );
         Ok(())
     }
@@ -271,15 +271,15 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: LINEITEM.L_SHIPMODE, sum(CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END) AS HIGH_LINE_COUNT, sum(CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END) AS LOW_LINE_COUNT
-              Sort: LINEITEM.L_SHIPMODE ASC NULLS LAST
-                Aggregate: groupBy=[[LINEITEM.L_SHIPMODE]], aggr=[[sum(CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END), sum(CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END)]]
-                  Projection: LINEITEM.L_SHIPMODE, CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END, CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END
-                    Filter: ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY AND (LINEITEM.L_SHIPMODE = CAST(Utf8("MAIL") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("SHIP") AS Utf8)) AND LINEITEM.L_COMMITDATE < LINEITEM.L_RECEIPTDATE AND LINEITEM.L_SHIPDATE < LINEITEM.L_COMMITDATE AND LINEITEM.L_RECEIPTDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_RECEIPTDATE < CAST(Utf8("1995-01-01") AS Date32)
-                      Cross Join: 
-                        TableScan: ORDERS
-                        TableScan: LINEITEM
-            "#
+        Projection: LINEITEM.L_SHIPMODE, sum(CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END) AS HIGH_LINE_COUNT, sum(CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END) AS LOW_LINE_COUNT
+          Sort: LINEITEM.L_SHIPMODE ASC NULLS LAST
+            Aggregate: groupBy=[[LINEITEM.L_SHIPMODE]], aggr=[[sum(CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END), sum(CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END)]]
+              Projection: LINEITEM.L_SHIPMODE, CASE WHEN ORDERS.O_ORDERPRIORITY = Utf8("1-URGENT") OR ORDERS.O_ORDERPRIORITY = Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END, CASE WHEN ORDERS.O_ORDERPRIORITY != Utf8("1-URGENT") AND ORDERS.O_ORDERPRIORITY != Utf8("2-HIGH") THEN Int32(1) ELSE Int32(0) END
+                Filter: ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY AND (LINEITEM.L_SHIPMODE = CAST(Utf8("MAIL") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("SHIP") AS Utf8)) AND LINEITEM.L_COMMITDATE < LINEITEM.L_RECEIPTDATE AND LINEITEM.L_SHIPDATE < LINEITEM.L_COMMITDATE AND LINEITEM.L_RECEIPTDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_RECEIPTDATE < CAST(Utf8("1995-01-01") AS Date32)
+                  Cross Join:
+                    TableScan: ORDERS
+                    TableScan: LINEITEM
+        "#
                 );
         Ok(())
     }
@@ -290,17 +290,17 @@ mod tests {
         assert_snapshot!(
             plan_str,
             @r#"
-            Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count(Int64(1)) AS CUSTDIST
-              Sort: count(Int64(1)) DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST
-                Projection: count(ORDERS.O_ORDERKEY), count(Int64(1))
-                  Aggregate: groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count(Int64(1))]]
-                    Projection: count(ORDERS.O_ORDERKEY)
-                      Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY]], aggr=[[count(ORDERS.O_ORDERKEY)]]
-                        Projection: CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY
-                          Left Join: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY Filter: NOT ORDERS.O_COMMENT LIKE CAST(Utf8("%special%requests%") AS Utf8)
-                            TableScan: CUSTOMER
-                            TableScan: ORDERS
-            "#        );
+        Projection: count(ORDERS.O_ORDERKEY) AS C_COUNT, count(Int64(1)) AS CUSTDIST
+          Sort: count(Int64(1)) DESC NULLS FIRST, count(ORDERS.O_ORDERKEY) DESC NULLS FIRST
+            Projection: count(ORDERS.O_ORDERKEY), count(Int64(1))
+              Aggregate: groupBy=[[count(ORDERS.O_ORDERKEY)]], aggr=[[count(Int64(1))]]
+                Projection: count(ORDERS.O_ORDERKEY)
+                  Aggregate: groupBy=[[CUSTOMER.C_CUSTKEY]], aggr=[[count(ORDERS.O_ORDERKEY)]]
+                    Projection: CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY
+                      Left Join: CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY Filter: NOT ORDERS.O_COMMENT LIKE CAST(Utf8("%special%requests%") AS Utf8)
+                        TableScan: CUSTOMER
+                        TableScan: ORDERS
+        "#        );
         Ok(())
     }
 
@@ -310,14 +310,14 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN PART.P_TYPE LIKE Utf8("PROMO%") THEN LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT ELSE Decimal128(Some(0),19,4) END) / sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS PROMO_REVENUE
-              Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN PART.P_TYPE LIKE Utf8("PROMO%") THEN LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT ELSE Decimal128(Some(0),19,4) END), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
-                Projection: CASE WHEN PART.P_TYPE LIKE CAST(Utf8("PROMO%") AS Utf8) THEN LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT) ELSE Decimal128(Some(0),19,4) END, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
-                  Filter: LINEITEM.L_PARTKEY = PART.P_PARTKEY AND LINEITEM.L_SHIPDATE >= Date32("1995-09-01") AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-10-01") AS Date32)
-                    Cross Join: 
-                      TableScan: LINEITEM
-                      TableScan: PART
-            "#
+        Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN PART.P_TYPE LIKE Utf8("PROMO%") THEN LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT ELSE Decimal128(Some(0),19,4) END) / sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS PROMO_REVENUE
+          Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN PART.P_TYPE LIKE Utf8("PROMO%") THEN LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT ELSE Decimal128(Some(0),19,4) END), sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT)]]
+            Projection: CASE WHEN PART.P_TYPE LIKE CAST(Utf8("PROMO%") AS Utf8) THEN LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT) ELSE Decimal128(Some(0),19,4) END, LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
+              Filter: LINEITEM.L_PARTKEY = PART.P_PARTKEY AND LINEITEM.L_SHIPDATE >= Date32("1995-09-01") AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-10-01") AS Date32)
+                Cross Join:
+                  TableScan: LINEITEM
+                  TableScan: PART
+        "#
                 );
         Ok(())
     }
@@ -336,28 +336,44 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Projection: PART.P_BRAND, PART.P_TYPE, PART.P_SIZE, count(DISTINCT PARTSUPP.PS_SUPPKEY) AS SUPPLIER_CNT
-              Sort: count(DISTINCT PARTSUPP.PS_SUPPKEY) DESC NULLS FIRST, PART.P_BRAND ASC NULLS LAST, PART.P_TYPE ASC NULLS LAST, PART.P_SIZE ASC NULLS LAST
-                Aggregate: groupBy=[[PART.P_BRAND, PART.P_TYPE, PART.P_SIZE]], aggr=[[count(DISTINCT PARTSUPP.PS_SUPPKEY)]]
-                  Projection: PART.P_BRAND, PART.P_TYPE, PART.P_SIZE, PARTSUPP.PS_SUPPKEY
-                    Filter: PART.P_PARTKEY = PARTSUPP.PS_PARTKEY AND PART.P_BRAND != Utf8("Brand#45") AND NOT PART.P_TYPE LIKE CAST(Utf8("MEDIUM POLISHED%") AS Utf8) AND (PART.P_SIZE = Int32(49) OR PART.P_SIZE = Int32(14) OR PART.P_SIZE = Int32(23) OR PART.P_SIZE = Int32(45) OR PART.P_SIZE = Int32(19) OR PART.P_SIZE = Int32(3) OR PART.P_SIZE = Int32(36) OR PART.P_SIZE = Int32(9)) AND NOT PARTSUPP.PS_SUPPKEY IN (<subquery>)
-                      Subquery:
-                        Projection: SUPPLIER.S_SUPPKEY
-                          Filter: SUPPLIER.S_COMMENT LIKE CAST(Utf8("%Customer%Complaints%") AS Utf8)
-                            TableScan: SUPPLIER
-                      Cross Join: 
-                        TableScan: PARTSUPP
-                        TableScan: PART
-            "#
+        Projection: PART.P_BRAND, PART.P_TYPE, PART.P_SIZE, count(DISTINCT PARTSUPP.PS_SUPPKEY) AS SUPPLIER_CNT
+          Sort: count(DISTINCT PARTSUPP.PS_SUPPKEY) DESC NULLS FIRST, PART.P_BRAND ASC NULLS LAST, PART.P_TYPE ASC NULLS LAST, PART.P_SIZE ASC NULLS LAST
+            Aggregate: groupBy=[[PART.P_BRAND, PART.P_TYPE, PART.P_SIZE]], aggr=[[count(DISTINCT PARTSUPP.PS_SUPPKEY)]]
+              Projection: PART.P_BRAND, PART.P_TYPE, PART.P_SIZE, PARTSUPP.PS_SUPPKEY
+                Filter: PART.P_PARTKEY = PARTSUPP.PS_PARTKEY AND PART.P_BRAND != Utf8("Brand#45") AND NOT PART.P_TYPE LIKE CAST(Utf8("MEDIUM POLISHED%") AS Utf8) AND (PART.P_SIZE = Int32(49) OR PART.P_SIZE = Int32(14) OR PART.P_SIZE = Int32(23) OR PART.P_SIZE = Int32(45) OR PART.P_SIZE = Int32(19) OR PART.P_SIZE = Int32(3) OR PART.P_SIZE = Int32(36) OR PART.P_SIZE = Int32(9)) AND NOT PARTSUPP.PS_SUPPKEY IN (<subquery>)
+                  Subquery:
+                    Projection: SUPPLIER.S_SUPPKEY
+                      Filter: SUPPLIER.S_COMMENT LIKE CAST(Utf8("%Customer%Complaints%") AS Utf8)
+                        TableScan: SUPPLIER
+                  Cross Join:
+                    TableScan: PARTSUPP
+                    TableScan: PART
+        "#
                 );
         Ok(())
     }
 
-    #[ignore]
     #[tokio::test]
     async fn tpch_test_17() -> Result<()> {
         let plan_str = tpch_plan_to_string(17).await?;
-        assert_snapshot!(plan_str, "panics due to out of bounds field access");
+        assert_snapshot!(
+        plan_str,
+        @r#"
+        Projection: sum(LINEITEM.L_EXTENDEDPRICE) / Decimal128(Some(70),2,1) AS AVG_YEARLY
+          Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE)]]
+            Projection: LINEITEM.L_EXTENDEDPRICE
+              Filter: PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#23") AND PART.P_CONTAINER = Utf8("MED BOX") AND LINEITEM.L_QUANTITY < (<subquery>)
+                Subquery:
+                  Projection: Decimal128(Some(2),2,1) * avg(LINEITEM.L_QUANTITY)
+                    Aggregate: groupBy=[[]], aggr=[[avg(LINEITEM.L_QUANTITY)]]
+                      Projection: LINEITEM.L_QUANTITY
+                        Filter: LINEITEM.L_PARTKEY = outer_ref(PART.P_PARTKEY)
+                          TableScan: LINEITEM
+                Cross Join:
+                  TableScan: LINEITEM
+                  TableScan: PART
+        "#
+                );
         Ok(())
     }
 
@@ -366,25 +382,25 @@ mod tests {
         let plan_str = tpch_plan_to_string(18).await?;
         assert_snapshot!(
         plan_str,
-        @r#"
-            Projection: CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE, sum(LINEITEM.L_QUANTITY) AS EXPR$5
-              Limit: skip=0, fetch=100
-                Sort: ORDERS.O_TOTALPRICE DESC NULLS FIRST, ORDERS.O_ORDERDATE ASC NULLS LAST
-                  Aggregate: groupBy=[[CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
-                    Projection: CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE, LINEITEM.L_QUANTITY
-                      Filter: ORDERS.O_ORDERKEY IN (<subquery>) AND CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
-                        Subquery:
-                          Projection: LINEITEM.L_ORDERKEY
-                            Filter: sum(LINEITEM.L_QUANTITY) > CAST(Int32(300) AS Decimal128(15, 2))
-                              Aggregate: groupBy=[[LINEITEM.L_ORDERKEY]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
-                                Projection: LINEITEM.L_ORDERKEY, LINEITEM.L_QUANTITY
-                                  TableScan: LINEITEM
-                        Cross Join: 
-                          Cross Join: 
-                            TableScan: CUSTOMER
-                            TableScan: ORDERS
-                          TableScan: LINEITEM
-            "#
+        @r"
+        Projection: CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE, sum(LINEITEM.L_QUANTITY) AS EXPR$5
+          Limit: skip=0, fetch=100
+            Sort: ORDERS.O_TOTALPRICE DESC NULLS FIRST, ORDERS.O_ORDERDATE ASC NULLS LAST
+              Aggregate: groupBy=[[CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
+                Projection: CUSTOMER.C_NAME, CUSTOMER.C_CUSTKEY, ORDERS.O_ORDERKEY, ORDERS.O_ORDERDATE, ORDERS.O_TOTALPRICE, LINEITEM.L_QUANTITY
+                  Filter: ORDERS.O_ORDERKEY IN (<subquery>) AND CUSTOMER.C_CUSTKEY = ORDERS.O_CUSTKEY AND ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY
+                    Subquery:
+                      Projection: LINEITEM.L_ORDERKEY
+                        Filter: sum(LINEITEM.L_QUANTITY) > CAST(Int32(300) AS Decimal128(15, 2))
+                          Aggregate: groupBy=[[LINEITEM.L_ORDERKEY]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
+                            Projection: LINEITEM.L_ORDERKEY, LINEITEM.L_QUANTITY
+                              TableScan: LINEITEM
+                    Cross Join:
+                      Cross Join:
+                        TableScan: CUSTOMER
+                        TableScan: ORDERS
+                      TableScan: LINEITEM
+        "
                 );
         Ok(())
     }
@@ -394,13 +410,13 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE]]
-              Projection: LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
-                Filter: PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#12") AND (PART.P_CONTAINER = CAST(Utf8("SM CASE") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM PACK") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM PKG") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(1) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(1) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(5) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON") OR PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#23") AND (PART.P_CONTAINER = CAST(Utf8("MED BAG") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED PKG") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED PACK") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(10) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(10) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(10) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON") OR PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#34") AND (PART.P_CONTAINER = CAST(Utf8("LG CASE") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG PACK") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG PKG") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(20) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(20) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(15) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON")
-                  Cross Join: 
-                    TableScan: LINEITEM
-                    TableScan: PART
-            "#
+        Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_EXTENDEDPRICE * Int32(1) - LINEITEM.L_DISCOUNT) AS REVENUE]]
+          Projection: LINEITEM.L_EXTENDEDPRICE * (CAST(Int32(1) AS Decimal128(15, 2)) - LINEITEM.L_DISCOUNT)
+            Filter: PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#12") AND (PART.P_CONTAINER = CAST(Utf8("SM CASE") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM PACK") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("SM PKG") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(1) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(1) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(5) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON") OR PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#23") AND (PART.P_CONTAINER = CAST(Utf8("MED BAG") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED PKG") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("MED PACK") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(10) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(10) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(10) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON") OR PART.P_PARTKEY = LINEITEM.L_PARTKEY AND PART.P_BRAND = Utf8("Brand#34") AND (PART.P_CONTAINER = CAST(Utf8("LG CASE") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG BOX") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG PACK") AS Utf8) OR PART.P_CONTAINER = CAST(Utf8("LG PKG") AS Utf8)) AND LINEITEM.L_QUANTITY >= CAST(Int32(20) AS Decimal128(15, 2)) AND LINEITEM.L_QUANTITY <= CAST(Int32(20) + Int32(10) AS Decimal128(15, 2)) AND PART.P_SIZE >= Int32(1) AND PART.P_SIZE <= Int32(15) AND (LINEITEM.L_SHIPMODE = CAST(Utf8("AIR") AS Utf8) OR LINEITEM.L_SHIPMODE = CAST(Utf8("AIR REG") AS Utf8)) AND LINEITEM.L_SHIPINSTRUCT = Utf8("DELIVER IN PERSON")
+              Cross Join:
+                TableScan: LINEITEM
+                TableScan: PART
+        "#
                 );
         Ok(())
     }
@@ -411,27 +427,27 @@ mod tests {
         assert_snapshot!(
         plan_str,
         @r#"
-            Sort: SUPPLIER.S_NAME ASC NULLS LAST
-              Projection: SUPPLIER.S_NAME, SUPPLIER.S_ADDRESS
-                Filter: SUPPLIER.S_SUPPKEY IN (<subquery>) AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("CANADA")
-                  Subquery:
-                    Projection: PARTSUPP.PS_SUPPKEY
-                      Filter: PARTSUPP.PS_PARTKEY IN (<subquery>) AND CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0)) > (<subquery>)
-                        Subquery:
-                          Projection: PART.P_PARTKEY
-                            Filter: PART.P_NAME LIKE CAST(Utf8("forest%") AS Utf8)
-                              TableScan: PART
-                        Subquery:
-                          Projection: Decimal128(Some(5),2,1) * sum(LINEITEM.L_QUANTITY)
-                            Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
-                              Projection: LINEITEM.L_QUANTITY
-                                Filter: LINEITEM.L_PARTKEY = LINEITEM.L_ORDERKEY AND LINEITEM.L_SUPPKEY = LINEITEM.L_PARTKEY AND LINEITEM.L_SHIPDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-01-01") AS Date32)
-                                  TableScan: LINEITEM
-                        TableScan: PARTSUPP
-                  Cross Join: 
-                    TableScan: SUPPLIER
-                    TableScan: NATION
-            "#
+        Sort: SUPPLIER.S_NAME ASC NULLS LAST
+          Projection: SUPPLIER.S_NAME, SUPPLIER.S_ADDRESS
+            Filter: SUPPLIER.S_SUPPKEY IN (<subquery>) AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("CANADA")
+              Subquery:
+                Projection: PARTSUPP.PS_SUPPKEY
+                  Filter: PARTSUPP.PS_PARTKEY IN (<subquery>) AND CAST(PARTSUPP.PS_AVAILQTY AS Decimal128(19, 0)) > (<subquery>)
+                    Subquery:
+                      Projection: PART.P_PARTKEY
+                        Filter: PART.P_NAME LIKE CAST(Utf8("forest%") AS Utf8)
+                          TableScan: PART
+                    Subquery:
+                      Projection: Decimal128(Some(5),2,1) * sum(LINEITEM.L_QUANTITY)
+                        Aggregate: groupBy=[[]], aggr=[[sum(LINEITEM.L_QUANTITY)]]
+                          Projection: LINEITEM.L_QUANTITY
+                            Filter: LINEITEM.L_PARTKEY = outer_ref(PARTSUPP.PS_PARTKEY) AND LINEITEM.L_SUPPKEY = outer_ref(PARTSUPP.PS_SUPPKEY) AND LINEITEM.L_SHIPDATE >= CAST(Utf8("1994-01-01") AS Date32) AND LINEITEM.L_SHIPDATE < CAST(Utf8("1995-01-01") AS Date32)
+                              TableScan: LINEITEM
+                    TableScan: PARTSUPP
+              Cross Join:
+                TableScan: SUPPLIER
+                TableScan: NATION
+        "#
                 );
         Ok(())
     }
@@ -449,14 +465,14 @@ mod tests {
                 Projection: SUPPLIER.S_NAME
                   Filter: SUPPLIER.S_SUPPKEY = LINEITEM.L_SUPPKEY AND ORDERS.O_ORDERKEY = LINEITEM.L_ORDERKEY AND ORDERS.O_ORDERSTATUS = Utf8("F") AND LINEITEM.L_RECEIPTDATE > LINEITEM.L_COMMITDATE AND EXISTS (<subquery>) AND NOT EXISTS (<subquery>) AND SUPPLIER.S_NATIONKEY = NATION.N_NATIONKEY AND NATION.N_NAME = Utf8("SAUDI ARABIA")
                     Subquery:
-                      Filter: LINEITEM.L_ORDERKEY = LINEITEM.L_TAX AND LINEITEM.L_SUPPKEY != LINEITEM.L_LINESTATUS
+                      Filter: LINEITEM.L_ORDERKEY = outer_ref(LINEITEM.L_ORDERKEY) AND LINEITEM.L_SUPPKEY != outer_ref(LINEITEM.L_SUPPKEY)
                         TableScan: LINEITEM
                     Subquery:
-                      Filter: LINEITEM.L_ORDERKEY = LINEITEM.L_TAX AND LINEITEM.L_SUPPKEY != LINEITEM.L_LINESTATUS AND LINEITEM.L_RECEIPTDATE > LINEITEM.L_COMMITDATE
+                      Filter: LINEITEM.L_ORDERKEY = outer_ref(LINEITEM.L_ORDERKEY) AND LINEITEM.L_SUPPKEY != outer_ref(LINEITEM.L_SUPPKEY) AND LINEITEM.L_RECEIPTDATE > LINEITEM.L_COMMITDATE
                         TableScan: LINEITEM
-                    Cross Join: 
-                      Cross Join: 
-                        Cross Join: 
+                    Cross Join:
+                      Cross Join:
+                        Cross Join:
                           TableScan: SUPPLIER
                           TableScan: LINEITEM
                         TableScan: ORDERS
@@ -483,7 +499,7 @@ mod tests {
                         Filter: CUSTOMER.C_ACCTBAL > Decimal128(Some(0),3,2) AND (substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("13") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("31") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("23") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("29") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("30") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("18") AS Utf8) OR substr(CUSTOMER.C_PHONE, Int32(1), Int32(2)) = CAST(Utf8("17") AS Utf8))
                           TableScan: CUSTOMER
                   Subquery:
-                    Filter: ORDERS.O_CUSTKEY = ORDERS.O_ORDERKEY
+                    Filter: ORDERS.O_CUSTKEY = outer_ref(CUSTOMER.C_CUSTKEY)
                       TableScan: ORDERS
                   TableScan: CUSTOMER
         "#
@@ -491,6 +507,52 @@ mod tests {
         Ok(())
     }
 
+    /// Tests nested correlated subqueries where the innermost subquery
+    /// references the outermost query (steps_out=2).
+    ///
+    /// This tests the outer schema stack with depth > 1.
+    /// The plan represents:
+    /// ```sql
+    /// SELECT * FROM A
+    /// WHERE EXISTS (
+    ///     SELECT * FROM B
+    ///     WHERE B.b1 = A.a1              -- steps_out=1 (references immediate parent)
+    ///       AND EXISTS (
+    ///         SELECT * FROM C
+    ///         WHERE C.c1 = A.a1          -- steps_out=2 (references grandparent)
+    ///           AND C.c2 = B.b2          -- steps_out=1 (references immediate parent)
+    ///     )
+    /// )
+    /// ```
+    ///
+    #[tokio::test]
+    async fn test_nested_correlated_subquery() -> Result<()> {
+        let path = "tests/testdata/test_plans/nested_correlated_subquery.substrait.json";
+        let proto = serde_json::from_reader::<_, Plan>(BufReader::new(
+            File::open(path).expect("file not found"),
+        ))
+        .expect("failed to parse json");
+
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto).await?;
+        let plan_str = format!("{plan}");
+
+        assert_snapshot!(
+            plan_str,
+            @r#"
+        Filter: EXISTS (<subquery>)
+          Subquery:
+            Filter: B.b1 = outer_ref(A.a1) AND EXISTS (<subquery>)
+              Subquery:
+                Filter: C.c1 = outer_ref(A.a1) AND C.c2 = outer_ref(B.b2)
+                  TableScan: C
+              TableScan: B
+          TableScan: A
+        "#
+        );
+        Ok(())
+    }
+
     async fn test_plan_to_string(name: &str) -> Result<String> {
         let path = format!("tests/testdata/test_plans/{name}");
         let proto = serde_json::from_reader::<_, Plan>(BufReader::new(
@@ -509,39 +571,126 @@ mod tests {
         let plan_str =
             test_plan_to_string("select_count_from_select_1.substrait.json").await?;
 
+        assert_snapshot!(
+        plan_str,
+        @r"
+        Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]
+          Values: (Int64(0))
+        "
+                );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_expressions_in_virtual_table() -> Result<()> {
+        let plan_str =
+            test_plan_to_string("virtual_table_with_expressions.substrait.json").await?;
+
         assert_snapshot!(
         plan_str,
         @r#"
-            Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]]
-              Values: (Int64(0))
-            "#
+        Projection: dummy1 AS result1, dummy2 AS result2
+          Values: (Int64(0), Utf8("temp")), (Int64(1), Utf8("test"))
+        "#
                 );
         Ok(())
     }
 
+    #[tokio::test]
+    //There are some Substrait functions that can be represented with nested built-in expressions
+    //xor:bool_bool is implemented in the consumer with binary expressions
+    //This tests that the consumer correctly builds the nested expressions for this function
+    async fn test_built_in_binary_exprs_for_xor() -> Result<()> {
+        let plan_str =
+            test_plan_to_string("scalar_fn_to_built_in_binary_expr_xor.substrait.json")
+                .await?;
+
+        //Test correct plan structure
+        assert_snapshot!(plan_str,
+          @r"
+        Projection: a, b, (a OR b) AND NOT a AND b AS result
+          Values: (Boolean(true), Boolean(true)), (Boolean(true), Boolean(false)), (Boolean(false), Boolean(true)), (Boolean(false), Boolean(false))
+        "
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    //There are some Substrait functions that can be represented with nested built-in expressions
+    //and_not:bool_bool is implemented in the consumer as binary expressions
+    //This tests that the consumer correctly builds the nested expressions for this function
+    async fn test_built_in_binary_exprs_for_and_not() -> Result<()> {
+        let plan_str = test_plan_to_string(
+            "scalar_fn_to_built_in_binary_expr_and_not.substrait.json",
+        )
+        .await?;
+
+        //Test correct plan structure
+        assert_snapshot!(plan_str,
+          @r"
+        Projection: a, b, a AND NOT b AS result
+          Values: (Boolean(true), Boolean(true)), (Boolean(true), Boolean(false)), (Boolean(false), Boolean(true)), (Boolean(false), Boolean(false))
+        "
+        );
+
+        Ok(())
+    }
+
+    //The between:any_any_any function is implemented as Expr::Between in the Substrait consumer
+    //This test tests that the consumer correctly builds the Expr::Between expression for this function
+    #[tokio::test]
+    async fn test_between_expr() -> Result<()> {
+        let plan_str =
+            test_plan_to_string("scalar_fn_to_between_expr.substrait.json").await?;
+        assert_snapshot!(plan_str,
+          @r"
+        Projection: expr BETWEEN low AND high AS result
+          Values: (Int8(2), Int8(1), Int8(3)), (Int8(4), Int8(1), Int8(2))
+        "
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_logb_expr() -> Result<()> {
+        let plan_str = test_plan_to_string("scalar_fn_logb_expr.substrait.json").await?;
+        assert_snapshot!(plan_str,
+          @r"
+        Projection: x, base, log(base, x) AS result
+          Values: (Float32(1), Float32(10)), (Float32(100), Float32(10))
+        "
+        );
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_multiple_joins() -> Result<()> {
         let plan_str = test_plan_to_string("multiple_joins.json").await?;
-        assert_eq!(
+        assert_snapshot!(
             plan_str,
-            "Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third\
-            \n  Left Join: left.id = right.id\
-            \n    SubqueryAlias: left\
-            \n      Left Join: left.id = right.id\
-            \n        SubqueryAlias: left\
-            \n          Left Join: left.id = right.id\
-            \n            SubqueryAlias: left\
-            \n              Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\
-            \n                Values: (Int64(1)), (Int64(2))\
-            \n            SubqueryAlias: right\
-            \n              Aggregate: groupBy=[[id, category]], aggr=[[]]\
-            \n                Values: (Int64(1), Utf8(\"info\")), (Int64(2), Utf8(\"low\"))\
-            \n        SubqueryAlias: right\
-            \n          Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\
-            \n            Values: (Int64(1)), (Int64(2))\
-            \n    SubqueryAlias: right\
-            \n      Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]\
-            \n        Values: (Int64(1)), (Int64(2))"
+            @r#"
+        Projection: left.count(Int64(1)) AS count_first, left.category, left.count(Int64(1)):1 AS count_second, right.count(Int64(1)) AS count_third
+          Left Join: left.id = right.id
+            SubqueryAlias: left
+              Projection: left.id, left.count(Int64(1)), left.id:1, left.category, right.id AS id:2, right.count(Int64(1)) AS count(Int64(1)):1
+                Left Join: left.id = right.id
+                  SubqueryAlias: left
+                    Projection: left.id, left.count(Int64(1)), right.id AS id:1, right.category
+                      Left Join: left.id = right.id
+                        SubqueryAlias: left
+                          Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]
+                            Values: (Int64(1)), (Int64(2))
+                        SubqueryAlias: right
+                          Aggregate: groupBy=[[id, category]], aggr=[[]]
+                            Values: (Int64(1), Utf8("info")), (Int64(2), Utf8("low"))
+                  SubqueryAlias: right
+                    Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]
+                      Values: (Int64(1)), (Int64(2))
+            SubqueryAlias: right
+              Aggregate: groupBy=[[id]], aggr=[[count(Int64(1))]]
+                Values: (Int64(1)), (Int64(2))
+        "#
         );
         Ok(())
     }
@@ -552,11 +701,11 @@ mod tests {
 
         assert_snapshot!(
         plan_str,
-        @r#"
+        @r"
         Projection: count(Int64(1)) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING AS LEAD_EXPR
           WindowAggr: windowExpr=[[count(Int64(1)) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]
             TableScan: DATA
-        "#
+        "
                         );
         Ok(())
     }
@@ -584,4 +733,33 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_join_with_expression_key() -> Result<()> {
+        let plan_str = test_plan_to_string("join_with_expression_key.json").await?;
+        assert_snapshot!(
+        plan_str,
+        @r#"
+        Projection: left.index_name AS index, right.upper(host) AS host, left.max(size_bytes) AS idx_size, right.max(total_bytes) AS db_size, CAST(left.max(size_bytes) AS Float64) / CAST(right.max(total_bytes) AS Float64) * Float64(100) AS pct_of_db
+          Inner Join: left.upper(host) = right.upper(host)
+            SubqueryAlias: left
+              Aggregate: groupBy=[[index_name, upper(host)]], aggr=[[max(size_bytes)]]
+                Projection: size_bytes, index_name, upper(host)
+                  Filter: index_name = Utf8("aaa")
+                    Values: (Utf8("aaa"), Utf8("host-a"), Int64(128)), (Utf8("bbb"), Utf8("host-b"), Int64(256))
+            SubqueryAlias: right
+              Aggregate: groupBy=[[upper(host)]], aggr=[[max(total_bytes)]]
+                Projection: total_bytes, upper(host)
+                  Inner Join:  Filter: upper(host) = upper(host)
+                    Values: (Utf8("host-a"), Int64(107)), (Utf8("host-b"), Int64(214))
+                    Projection: upper(host)
+                      Aggregate: groupBy=[[index_name, upper(host)]], aggr=[[max(size_bytes)]]
+                        Projection: size_bytes, index_name, upper(host)
+                          Filter: index_name = Utf8("aaa")
+                            Values: (Utf8("aaa"), Utf8("host-a"), Int64(128)), (Utf8("bbb"), Utf8("host-b"), Int64(256))
+        "#
+        );
+
+        Ok(())
+    }
 }
diff --git a/datafusion/substrait/tests/cases/emit_kind_tests.rs b/datafusion/substrait/tests/cases/emit_kind_tests.rs
index e916b4cb0e1a9..24508fd054d97 100644
--- a/datafusion/substrait/tests/cases/emit_kind_tests.rs
+++ b/datafusion/substrait/tests/cases/emit_kind_tests.rs
@@ -38,10 +38,10 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: DATA.A AS a, DATA.B AS b, DATA.A + Int64(1) AS add1
-              TableScan: DATA
-            "#
+        @r"
+        Projection: DATA.A AS a, DATA.B AS b, DATA.A + Int64(1) AS add1
+          TableScan: DATA
+        "
                 );
         Ok(())
     }
@@ -57,11 +57,11 @@ mod tests {
         assert_snapshot!(
         plan,
         // Note that duplicate references in the remap are aliased
-        @r#"
-            Projection: DATA.B, DATA.A AS A1, DATA.A AS DATA.A__temp__0 AS A2
-              Filter: DATA.B = Int64(2)
-                TableScan: DATA
-            "#
+        @r"
+        Projection: DATA.B, DATA.A AS A1, DATA.A AS DATA.A__temp__0 AS A2
+          Filter: DATA.B = Int64(2)
+            TableScan: DATA
+        "
                 );
         Ok(())
     }
@@ -88,21 +88,21 @@ mod tests {
         let plan = df.into_unoptimized_plan();
         assert_snapshot!(
             plan,
-            @r#"
-            Projection: random() AS c1, data.a + Int64(1) AS c2
-              TableScan: data
-            "#        );
+            @r"
+        Projection: random() AS c1, data.a + Int64(1) AS c2
+          TableScan: data
+        "        );
 
         let proto = to_substrait_plan(&plan, &ctx.state())?;
         let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
         // note how the Projections are not flattened
         assert_snapshot!(
         plan2,
-        @r#"
-            Projection: random() AS c1, data.a + Int64(1) AS c2
-              Projection: data.a, data.b, data.c, data.d, data.e, data.f, random(), data.a + Int64(1)
-                TableScan: data
-            "#
+        @r"
+        Projection: random() AS c1, data.a + Int64(1) AS c2
+          Projection: data.a, data.b, data.c, data.d, data.e, data.f, random(), data.a + Int64(1)
+            TableScan: data
+        "
                 );
         Ok(())
     }
@@ -115,10 +115,10 @@ mod tests {
         let plan = df.into_unoptimized_plan();
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: data.a + Int64(1), data.b + Int64(2)
-              TableScan: data
-            "#
+        @r"
+        Projection: data.a + Int64(1), data.b + Int64(2)
+          TableScan: data
+        "
                 );
 
         let proto = to_substrait_plan(&plan, &ctx.state())?;
diff --git a/datafusion/substrait/tests/cases/function_test.rs b/datafusion/substrait/tests/cases/function_test.rs
index 1816c64d39212..d71c80678a091 100644
--- a/datafusion/substrait/tests/cases/function_test.rs
+++ b/datafusion/substrait/tests/cases/function_test.rs
@@ -35,10 +35,10 @@ mod tests {
         assert_snapshot!(
         plan,
         @r#"
-            Projection: nation.n_name
-              Filter: contains(nation.n_name, Utf8("IA"))
-                TableScan: nation
-            "#
+        Projection: nation.n_name
+          Filter: contains(nation.n_name, Utf8("IA"))
+            TableScan: nation
+        "#
                 );
         Ok(())
     }
diff --git a/datafusion/substrait/tests/cases/logical_plans.rs b/datafusion/substrait/tests/cases/logical_plans.rs
index 4dd97193034bd..663a372fe2e4f 100644
--- a/datafusion/substrait/tests/cases/logical_plans.rs
+++ b/datafusion/substrait/tests/cases/logical_plans.rs
@@ -20,6 +20,9 @@
 #[cfg(test)]
 mod tests {
     use crate::utils::test::{add_plan_schemas_to_ctx, read_json};
+    use datafusion::common::test_util::format_batches;
+    use std::collections::HashSet;
+
     use datafusion::common::Result;
     use datafusion::dataframe::DataFrame;
     use datafusion::prelude::SessionContext;
@@ -43,10 +46,10 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: NOT DATA.D AS EXPR$0
-              TableScan: DATA
-            "#
+        @r"
+        Projection: NOT DATA.D AS EXPR$0
+          TableScan: DATA
+        "
                 );
 
         // Trigger execution to ensure plan validity
@@ -74,11 +77,11 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING AS LEAD_EXPR
-              WindowAggr: windowExpr=[[sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]
-                TableScan: DATA
-            "#
+        @r"
+        Projection: sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING AS LEAD_EXPR
+          WindowAggr: windowExpr=[[sum(DATA.D) PARTITION BY [DATA.PART] ORDER BY [DATA.ORD ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING]]
+            TableScan: DATA
+        "
                 );
 
         // Trigger execution to ensure plan validity
@@ -101,11 +104,11 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW__temp__0 AS ALIASED
-              WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-                TableScan: DATA
-            "#
+        @r"
+        Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW__temp__0 AS ALIASED
+          WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            TableScan: DATA
+        "
                 );
 
         // Trigger execution to ensure plan validity
@@ -130,12 +133,12 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$1
-              WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-                WindowAggr: windowExpr=[[row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
-                  TableScan: DATA
-            "#
+        @r"
+        Projection: row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$0, row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS EXPR$1
+          WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+            WindowAggr: windowExpr=[[row_number() PARTITION BY [DATA.A] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+              TableScan: DATA
+        "
                 );
 
         // Trigger execution to ensure plan validity
@@ -144,6 +147,41 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn null_literal_before_and_after_joins() -> Result<()> {
+        // Confirms that literals used before and after a join but for different columns
+        // are correctly handled.
+
+        // File generated with substrait-java's Isthmus:
+        // ./isthmus-cli/build/graal/isthmus --create "create table A (a int); create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a"
+        let proto_plan = read_json(
+            "tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json",
+        );
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        assert_snapshot!(
+            plan,
+            @r"
+        Projection: left.A, left.Utf8(NULL) AS C, right.D, Utf8(NULL) AS Utf8(NULL)__temp__0 AS E
+          Left Join: left.A = right.A
+            SubqueryAlias: left
+              Union
+                Projection: A.A, Utf8(NULL)
+                  TableScan: A
+                Projection: B.A, CAST(B.C AS Utf8)
+                  TableScan: B
+            SubqueryAlias: right
+              TableScan: C
+        "
+        );
+
+        // Trigger execution to ensure plan validity
+        DataFrame::new(ctx.state(), plan).show().await?;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn non_nullable_lists() -> Result<()> {
         // DataFusion's Substrait consumer treats all lists as nullable, even if the Substrait plan specifies them as non-nullable.
@@ -156,9 +194,7 @@ mod tests {
 
         assert_snapshot!(
                 &plan,
-            @r#"
-        Values: (List([1, 2]))
-        "#
+            @"Values: (List([1, 2]))"
         );
 
         // Trigger execution to ensure plan validity
@@ -176,14 +212,82 @@ mod tests {
 
         assert_snapshot!(
         plan,
-        @r#"
-            Projection: lower(sales.product) AS lower(product), sum(count(sales.product)) AS product_count
-              Aggregate: groupBy=[[sales.product]], aggr=[[sum(count(sales.product))]]
-                Aggregate: groupBy=[[sales.product]], aggr=[[count(sales.product)]]
-                  TableScan: sales
-            "#
+        @r"
+        Projection: lower(sales.product) AS lower(product), sum(count(sales.product)) AS product_count
+          Aggregate: groupBy=[[sales.product]], aggr=[[sum(count(sales.product))]]
+            Aggregate: groupBy=[[sales.product]], aggr=[[count(sales.product)]]
+              TableScan: sales
+        "
+                );
+
+        // Trigger execution to ensure plan validity
+        DataFrame::new(ctx.state(), plan).show().await?;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn duplicate_name_in_union() -> Result<()> {
+        let proto_plan =
+            read_json("tests/testdata/test_plans/duplicate_name_in_union.substrait.json");
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        assert_snapshot!(
+        plan,
+        @r"
+        Projection: foo AS col1, bar AS col2
+          Union
+            Projection: foo, bar
+              Values: (Int64(100), Int64(200))
+            Projection: x, foo
+              Values: (Int32(300), Int64(400))
+        "
                 );
 
+        // Trigger execution to ensure plan validity
+        let results = DataFrame::new(ctx.state(), plan).collect().await?;
+
+        assert_snapshot!(
+            format_batches(&results)?,
+            @r"
+        +------+------+
+        | col1 | col2 |
+        +------+------+
+        | 100  | 200  |
+        | 300  | 400  |
+        +------+------+
+        ",
+        );
+
+        // also verify that the output schema has unique field names
+        let schema = results[0].schema();
+        for batch in &results {
+            assert_eq!(schema, batch.schema());
+        }
+        let field_names: HashSet<_> = schema.fields().iter().map(|f| f.name()).collect();
+        assert_eq!(field_names.len(), schema.fields().len());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn nested_list_expressions() -> Result<()> {
+        // Tests that a Substrait Nested list expression containing non-literal
+        // expressions (column references) uses the make_array UDF.
+        let proto_plan =
+            read_json("tests/testdata/test_plans/nested_list_expressions.substrait.json");
+        let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
+        let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
+
+        assert_snapshot!(
+            plan,
+            @r"
+        Projection: make_array(DATA.a, DATA.b) AS my_list
+          TableScan: DATA
+        "
+        );
+
         // Trigger execution to ensure plan validity
         DataFrame::new(ctx.state(), plan).show().await?;
 
diff --git a/datafusion/substrait/tests/cases/mod.rs b/datafusion/substrait/tests/cases/mod.rs
index 777246e4139bf..0870c56cd3ba2 100644
--- a/datafusion/substrait/tests/cases/mod.rs
+++ b/datafusion/substrait/tests/cases/mod.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod aggregation_tests;
+mod builtin_expr_semantics_tests;
 mod consumer_integration;
 mod emit_kind_tests;
 mod function_test;
diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
index 7a5cfeb398365..5dd4aa4e2be91 100644
--- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs
@@ -17,6 +17,8 @@
 
 use crate::utils::test::read_json;
 use datafusion::arrow::array::ArrayRef;
+use datafusion::functions_nested::map::map;
+use datafusion::logical_expr::LogicalPlanBuilder;
 use datafusion::physical_plan::Accumulator;
 use datafusion::scalar::ScalarValue;
 use datafusion_substrait::logical_plan::{
@@ -26,14 +28,16 @@ use std::cmp::Ordering;
 use std::mem::size_of_val;
 
 use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit};
-use datafusion::common::{not_impl_err, plan_err, DFSchema, DFSchemaRef};
+use datafusion::common::tree_node::Transformed;
+use datafusion::common::{DFSchema, DFSchemaRef, Spans, not_impl_err, plan_err};
 use datafusion::error::Result;
 use datafusion::execution::registry::SerializerRegistry;
 use datafusion::execution::runtime_env::RuntimeEnv;
 use datafusion::execution::session_state::SessionStateBuilder;
+use datafusion::logical_expr::expr::{Exists, SetComparison, SetQuantifier};
 use datafusion::logical_expr::{
-    Extension, InvariantLevel, LogicalPlan, PartitionEvaluator, Repartition,
-    UserDefinedLogicalNode, Values, Volatility,
+    EmptyRelation, Extension, InvariantLevel, LogicalPlan, Operator, PartitionEvaluator,
+    Repartition, Subquery, UserDefinedLogicalNode, Values, Volatility,
 };
 use datafusion::optimizer::simplify_expressions::expr_simplifier::THRESHOLD_INLINE_INLIST;
 use datafusion::prelude::*;
@@ -42,7 +46,7 @@ use std::hash::Hash;
 use std::sync::Arc;
 use substrait::proto::extensions::simple_extension_declaration::MappingType;
 use substrait::proto::rel::RelType;
-use substrait::proto::{plan_rel, Plan, Rel};
+use substrait::proto::{Plan, Rel, plan_rel};
 
 #[derive(Debug)]
 struct MockSerializerRegistry;
@@ -92,6 +96,8 @@ impl PartialOrd for MockUserDefinedLogicalPlan {
             Some(Ordering::Equal) => self.inputs.partial_cmp(&other.inputs),
             cmp => cmp,
         }
+        // TODO (https://github.com/apache/datafusion/issues/17477) avoid recomparing all fields
+        .filter(|cmp| *cmp != Ordering::Equal || self == other)
     }
 }
 
@@ -112,11 +118,7 @@ impl UserDefinedLogicalNode for MockUserDefinedLogicalPlan {
         &self.empty_schema
     }
 
-    fn check_invariants(
-        &self,
-        _check: InvariantLevel,
-        _plan: &LogicalPlan,
-    ) -> Result<()> {
+    fn check_invariants(&self, _check: InvariantLevel) -> Result<()> {
         Ok(())
     }
 
@@ -187,16 +189,56 @@ async fn simple_select() -> Result<()> {
     roundtrip("SELECT a, b FROM data").await
 }
 
+#[tokio::test]
+async fn roundtrip_literal_without_from() -> Result<()> {
+    roundtrip("SELECT 1 AS one").await
+}
+
+#[tokio::test]
+async fn roundtrip_empty_relation_with_schema() -> Result<()> {
+    // Test produce_one_row=true with multiple typed columns
+    roundtrip("SELECT 1::int as a, 'hello'::text as b, 3.14::double as c").await
+}
+
+#[tokio::test]
+async fn roundtrip_empty_relation_no_rows() -> Result<()> {
+    // Test produce_one_row=false
+    let ctx = create_context().await?;
+    let plan = LogicalPlan::EmptyRelation(EmptyRelation {
+        produce_one_row: false,
+        schema: DFSchemaRef::new(DFSchema::empty()),
+    });
+    roundtrip_logical_plan_with_ctx(plan, ctx).await?;
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_subquery_with_empty_relation() -> Result<()> {
+    // Test EmptyRelation in the context of scalar subqueries.
+    // The optimizer may simplify the subquery away, but we're testing that
+    // the EmptyRelation round-trips correctly when it appears in the plan.
+    let ctx = create_context().await?;
+    let df = ctx.sql("SELECT (SELECT 1) as nested").await?;
+    let plan = df.into_optimized_plan()?;
+
+    // Just verify the round-trip succeeds and produces valid results
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
+    let df2 = DataFrame::new(ctx.state(), plan2);
+    df2.show().await?;
+    Ok(())
+}
+
 #[tokio::test]
 async fn wildcard_select() -> Result<()> {
     let plan = generate_plan_from_sql("SELECT * FROM data", true, false).await?;
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: data.a, data.b, data.c, data.d, data.e, data.f
       TableScan: data
-    "#
+    "
     );
     Ok(())
 }
@@ -312,11 +354,31 @@ async fn aggregate_grouping_rollup() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
-        Projection: data.a, data.c, data.e, avg(data.b)
-          Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]
-            TableScan: data projection=[a, b, c, e]
-        "#
+    @r"
+    Projection: data.a, data.c, data.e, avg(data.b)
+      Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]
+        TableScan: data projection=[a, b, c, e]
+    "
+    );
+    Ok(())
+}
+
+#[tokio::test]
+async fn aggregate_grouping_cube() -> Result<()> {
+    let plan = generate_plan_from_sql(
+        "SELECT a, c, avg(b) FROM data GROUP BY CUBE (a, c)",
+        true,
+        true,
+    )
+    .await?;
+
+    assert_snapshot!(
+    plan,
+    @r"
+    Projection: data.a, data.c, avg(data.b)
+      Aggregate: groupBy=[[GROUPING SETS ((), (data.a), (data.c), (data.a, data.c))]], aggr=[[avg(data.b)]]
+        TableScan: data projection=[a, b, c]
+    "
     );
     Ok(())
 }
@@ -332,11 +394,11 @@ async fn multilayer_aggregate() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Aggregate: groupBy=[[data.a]], aggr=[[sum(count(data.b)) AS sum(partial_count_b)]]
       Aggregate: groupBy=[[data.a]], aggr=[[count(data.b)]]
         TableScan: data projection=[a, b]
-    "#
+    "
     );
     Ok(())
 }
@@ -348,7 +410,7 @@ async fn decimal_literal() -> Result<()> {
 
 #[tokio::test]
 async fn null_decimal_literal() -> Result<()> {
-    roundtrip("SELECT * FROM data WHERE b = NULL").await
+    roundtrip("SELECT *, CAST(NULL AS decimal(10, 2)) FROM data").await
 }
 
 #[tokio::test]
@@ -426,6 +488,41 @@ async fn simple_scalar_function_substr() -> Result<()> {
     roundtrip("SELECT SUBSTR(f, 1, 3) FROM data").await
 }
 
+// Test that DataFusion functions gets correctly mapped to Substrait names (when the names are different)
+// Follows the same structure as existing roundtrip tests, but more explicitly tests for name mappings
+async fn test_substrait_to_df_name_mapping(
+    substrait_name: &str,
+    sql: &str,
+) -> Result<()> {
+    let ctx = create_context().await?;
+    let df = ctx.sql(sql).await?;
+    let plan = df.into_optimized_plan()?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+
+    let function_name = match proto.extensions[0].mapping_type.as_ref().unwrap() {
+        MappingType::ExtensionFunction(ext_f) => &ext_f.name,
+        _ => unreachable!("Expected function extension"),
+    };
+
+    assert_eq!(function_name, substrait_name);
+
+    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
+    let plan2 = ctx.state().optimize(&plan2)?;
+
+    let plan1str = format!("{plan}");
+    let plan2str = format!("{plan2}");
+    assert_eq!(plan1str, plan2str);
+
+    assert_eq!(plan.schema(), plan2.schema());
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn scalar_function_is_nan_mapping() -> Result<()> {
+    test_substrait_to_df_name_mapping("is_nan", "SELECT ISNAN(a) FROM data").await
+}
+
 #[tokio::test]
 async fn simple_scalar_function_is_null() -> Result<()> {
     roundtrip("SELECT * FROM data WHERE a IS NULL").await
@@ -485,10 +582,10 @@ async fn aggregate_case() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN data.a > Int64(0) THEN Int64(1) ELSE Int64(NULL) END) AS sum(CASE WHEN data.a > Int64(0) THEN Int64(1) ELSE NULL END)]]
       TableScan: data projection=[a]
-    "#
+    "
     );
     Ok(())
 }
@@ -583,12 +680,126 @@ async fn roundtrip_exists_filter() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: data.b
       LeftSemi Join: data.a = data2.a Filter: data2.e != CAST(data.e AS Int64)
         TableScan: data projection=[a, b, e]
         TableScan: data2 projection=[a, e]
-    "#
+    "
+            );
+    Ok(())
+}
+
+// assemble logical plan manually to ensure SetComparison expr is present (not rewrite away)
+#[tokio::test]
+async fn roundtrip_set_comparison_any_substrait() -> Result<()> {
+    let ctx = create_context().await?;
+    let plan = build_set_comparison_plan(&ctx, SetQuantifier::Any, Operator::Gt).await?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    let roundtrip_plan = from_substrait_plan(&ctx.state(), &proto).await?;
+    assert_set_comparison_predicate(&roundtrip_plan, Operator::Gt, SetQuantifier::Any);
+    Ok(())
+}
+
+// assemble logical plan manually to ensure SetComparison expr is present (not rewrite away)
+#[tokio::test]
+async fn roundtrip_set_comparison_all_substrait() -> Result<()> {
+    let ctx = create_context().await?;
+    let plan =
+        build_set_comparison_plan(&ctx, SetQuantifier::All, Operator::NotEq).await?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    let roundtrip_plan = from_substrait_plan(&ctx.state(), &proto).await?;
+    assert_set_comparison_predicate(&roundtrip_plan, Operator::NotEq, SetQuantifier::All);
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_scalar_subquery_substrait() -> Result<()> {
+    let ctx = create_context().await?;
+    let plan = build_scalar_subquery_projection_plan(&ctx).await?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    assert_root_project_has_scalar_subquery(proto.as_ref());
+    let roundtrip_plan = from_substrait_plan(&ctx.state(), &proto).await?;
+    assert_projection_contains_scalar_subquery(&roundtrip_plan);
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_exists_substrait() -> Result<()> {
+    let ctx = create_context().await?;
+    let plan = build_exists_filter_plan(&ctx, false).await?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    let roundtrip_plan = from_substrait_plan(&ctx.state(), &proto).await?;
+    assert_exists_predicate(&roundtrip_plan, false);
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_not_exists_substrait() -> Result<()> {
+    let ctx = create_context().await?;
+    let plan = build_exists_filter_plan(&ctx, true).await?;
+    let proto = to_substrait_plan(&plan, &ctx.state())?;
+    let roundtrip_plan = from_substrait_plan(&ctx.state(), &proto).await?;
+    assert_exists_predicate(&roundtrip_plan, true);
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_not_exists_filter_left_anti_join() -> Result<()> {
+    let plan = generate_plan_from_sql(
+        "SELECT ba.isbn, ba.author FROM book_author ba WHERE NOT EXISTS (SELECT 1 FROM book b WHERE b.isbn = ba.isbn)",
+        false,
+        true,
+    )
+    .await?;
+
+    assert_snapshot!(
+    plan,
+    @r"
+    LeftAnti Join: book_author.isbn = book.isbn
+      TableScan: book_author projection=[isbn, author]
+      TableScan: book projection=[isbn]
+    "
+            );
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_right_anti_join() -> Result<()> {
+    let plan = generate_plan_from_sql(
+        "SELECT * FROM book b RIGHT ANTI JOIN book_author ba ON b.isbn = ba.isbn",
+        false,
+        true,
+    )
+    .await?;
+
+    assert_snapshot!(
+    plan,
+    @r"
+    RightAnti Join: book.isbn = book_author.isbn
+      TableScan: book projection=[isbn]
+      TableScan: book_author projection=[isbn, author]
+    "
+            );
+    Ok(())
+}
+
+#[tokio::test]
+async fn roundtrip_right_semi_join() -> Result<()> {
+    let plan = generate_plan_from_sql(
+        "SELECT * FROM book b RIGHT SEMI JOIN book_author ba ON b.isbn = ba.isbn",
+        false,
+        true,
+    )
+    .await?;
+
+    assert_snapshot!(
+    plan,
+    @r"
+    RightSemi Join: book.isbn = book_author.isbn
+      TableScan: book projection=[isbn]
+      TableScan: book_author projection=[isbn, author]
+    "
             );
     Ok(())
 }
@@ -604,12 +815,12 @@ async fn inner_join() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: data.a
       Inner Join: data.a = data2.a
         TableScan: data projection=[a]
         TableScan: data2 projection=[a]
-    "#
+    "
             );
     Ok(())
 }
@@ -633,17 +844,50 @@ async fn roundtrip_outer_join() -> Result<()> {
 async fn roundtrip_self_join() -> Result<()> {
     // Substrait does currently NOT maintain the alias of the tables.
     // Instead, when we consume Substrait, we add aliases before a join that'd otherwise collide.
-    // This roundtrip works because we set aliases to what the Substrait consumer will generate.
-    roundtrip("SELECT left.a as left_a, left.b, right.a as right_a, right.c FROM data AS left JOIN data AS right ON left.a = right.a").await?;
-    roundtrip("SELECT left.a as left_a, left.b, right.a as right_a, right.c FROM data AS left JOIN data AS right ON left.b = right.b").await
+    // The improved NameTracker now adds __temp__0 suffix to handle naming conflicts.
+    // We verify semantic equivalence rather than exact string match.
+    let ctx = create_context().await?;
+    let sql = "SELECT left.a as left_a, left.b, right.a as right_a, right.c FROM data AS left JOIN data AS right ON left.a = right.a";
+    let df = ctx.sql(sql).await?;
+    let plan = df.into_optimized_plan()?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
+
+    // Verify schemas are equivalent
+    assert_eq!(plan.schema(), plan2.schema());
+
+    // Execute to ensure plan validity
+    DataFrame::new(ctx.state(), plan2).show().await?;
+
+    // Test second variant
+    let sql2 = "SELECT left.a as left_a, left.b, right.a as right_a, right.c FROM data AS left JOIN data AS right ON left.b = right.b";
+    let df2 = ctx.sql(sql2).await?;
+    let plan3 = df2.into_optimized_plan()?;
+    let plan4 = substrait_roundtrip(&plan3, &ctx).await?;
+    assert_eq!(plan3.schema(), plan4.schema());
+    DataFrame::new(ctx.state(), plan4).show().await?;
+
+    Ok(())
 }
 
 #[tokio::test]
 async fn roundtrip_self_implicit_cross_join() -> Result<()> {
     // Substrait does currently NOT maintain the alias of the tables.
     // Instead, when we consume Substrait, we add aliases before a join that'd otherwise collide.
-    // This roundtrip works because we set aliases to what the Substrait consumer will generate.
-    roundtrip("SELECT left.a left_a, left.b, right.a right_a, right.c FROM data AS left, data AS right").await
+    // The improved NameTracker now adds __temp__0 suffix to handle naming conflicts.
+    // We verify semantic equivalence rather than exact string match.
+    let ctx = create_context().await?;
+    let sql = "SELECT left.a left_a, left.b, right.a right_a, right.c FROM data AS left, data AS right";
+    let df = ctx.sql(sql).await?;
+    let plan = df.into_optimized_plan()?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
+
+    // Verify schemas are equivalent
+    assert_eq!(plan.schema(), plan2.schema());
+
+    // Execute to ensure plan validity
+    DataFrame::new(ctx.state(), plan2).show().await?;
+
+    Ok(())
 }
 
 #[tokio::test]
@@ -657,14 +901,14 @@ async fn self_join_introduces_aliases() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: left.b, right.c
       Inner Join: left.b = right.b
         SubqueryAlias: left
           TableScan: data projection=[b]
         SubqueryAlias: right
           TableScan: data projection=[b, c]
-    "#
+    "
             );
     Ok(())
 }
@@ -814,26 +1058,27 @@ async fn aggregate_wo_projection_consume() -> Result<()> {
     let plan = generate_plan_from_substrait(proto_plan).await?;
     assert_snapshot!(
     plan,
-    @r#"
-            Aggregate: groupBy=[[data.a]], aggr=[[count(data.a) AS countA]]
-              TableScan: data projection=[a]
-            "#
+    @r"
+    Aggregate: groupBy=[[data.a]], aggr=[[count(data.a) AS countA]]
+      TableScan: data projection=[a]
+    "
         );
     Ok(())
 }
 
 #[tokio::test]
 async fn aggregate_wo_projection_group_expression_ref_consume() -> Result<()> {
-    let proto_plan =
-        read_json("tests/testdata/test_plans/aggregate_no_project_group_expression_ref.substrait.json");
+    let proto_plan = read_json(
+        "tests/testdata/test_plans/aggregate_no_project_group_expression_ref.substrait.json",
+    );
 
     let plan = generate_plan_from_substrait(proto_plan).await?;
     assert_snapshot!(
     plan,
-    @r#"
-            Aggregate: groupBy=[[data.a]], aggr=[[count(data.a) AS countA]]
-              TableScan: data projection=[a]
-            "#
+    @r"
+    Aggregate: groupBy=[[data.a]], aggr=[[count(data.a) AS countA]]
+      TableScan: data projection=[a]
+    "
         );
     Ok(())
 }
@@ -846,26 +1091,27 @@ async fn aggregate_wo_projection_sorted_consume() -> Result<()> {
     let plan = generate_plan_from_substrait(proto_plan).await?;
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Aggregate: groupBy=[[data.a]], aggr=[[count(data.a) ORDER BY [data.a DESC NULLS FIRST] AS countA]]
       TableScan: data projection=[a]
-    "#
+    "
             );
     Ok(())
 }
 
 #[tokio::test]
 async fn aggregate_identical_grouping_expressions() -> Result<()> {
-    let proto_plan =
-        read_json("tests/testdata/test_plans/aggregate_identical_grouping_expressions.substrait.json");
+    let proto_plan = read_json(
+        "tests/testdata/test_plans/aggregate_identical_grouping_expressions.substrait.json",
+    );
 
     let plan = generate_plan_from_substrait(proto_plan).await?;
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Aggregate: groupBy=[[Int32(1) AS grouping_col_1, Int32(1) AS grouping_col_2]], aggr=[[]]
       TableScan: data projection=[]
-    "#
+    "
             );
     Ok(())
 }
@@ -1006,6 +1252,96 @@ async fn simple_intersect_table_reuse() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn self_referential_intersect() -> Result<()> {
+    // Test INTERSECT with the same table on both sides
+    // This previously failed with "Schema contains duplicate qualified field name"
+    // The fix ensures requalify_sides_if_needed is called in intersect_or_except
+    // After roundtrip through Substrait, SubqueryAlias is lost and requalification
+    // produces "left" and "right" aliases
+    // Note: INTERSECT (without ALL) includes DISTINCT, but the outer Aggregate
+    // is optimized away, resulting in just the **LeftSemi** join
+    // (LeftSemi returns rows from left that exist in right)
+    assert_expected_plan(
+        "SELECT a FROM data WHERE a > 0 INTERSECT SELECT a FROM data WHERE a < 5",
+        "LeftSemi Join: left.a = right.a\
+        \n  SubqueryAlias: left\
+        \n    Aggregate: groupBy=[[data.a]], aggr=[[]]\
+        \n      Filter: data.a > Int64(0)\
+        \n        TableScan: data projection=[a], partial_filters=[data.a > Int64(0)]\
+        \n  SubqueryAlias: right\
+        \n    Filter: data.a < Int64(5)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a < Int64(5)]",
+        true,
+    )
+    .await
+}
+
+#[tokio::test]
+async fn self_referential_except() -> Result<()> {
+    // Test EXCEPT with the same table on both sides
+    // This previously failed with "Schema contains duplicate qualified field name"
+    // The fix ensures requalify_sides_if_needed is called in intersect_or_except
+    // After roundtrip through Substrait, SubqueryAlias is lost and requalification
+    // produces "left" and "right" aliases
+    // Note: EXCEPT (without ALL) includes DISTINCT, but the outer Aggregate
+    // is optimized away, resulting in just the **LeftAnti** join
+    // (LeftAnti returns rows from left that don't exist in right)
+    assert_expected_plan(
+        "SELECT a FROM data WHERE a > 0 EXCEPT SELECT a FROM data WHERE a < 5",
+        "LeftAnti Join: left.a = right.a\
+        \n  SubqueryAlias: left\
+        \n    Aggregate: groupBy=[[data.a]], aggr=[[]]\
+        \n      Filter: data.a > Int64(0)\
+        \n        TableScan: data projection=[a], partial_filters=[data.a > Int64(0)]\
+        \n  SubqueryAlias: right\
+        \n    Filter: data.a < Int64(5)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a < Int64(5)]",
+        true,
+    )
+    .await
+}
+
+#[tokio::test]
+async fn self_referential_intersect_all() -> Result<()> {
+    // Test INTERSECT ALL with the same table on both sides
+    // INTERSECT ALL preserves duplicates and does not include DISTINCT
+    // Uses **LeftSemi** join (returns rows from left that exist in right)
+    // The requalification ensures no duplicate field name errors
+    assert_expected_plan(
+        "SELECT a FROM data WHERE a > 0 INTERSECT ALL SELECT a FROM data WHERE a < 5",
+        "LeftSemi Join: left.a = right.a\
+        \n  SubqueryAlias: left\
+        \n    Filter: data.a > Int64(0)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a > Int64(0)]\
+        \n  SubqueryAlias: right\
+        \n    Filter: data.a < Int64(5)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a < Int64(5)]",
+        true,
+    )
+    .await
+}
+
+#[tokio::test]
+async fn self_referential_except_all() -> Result<()> {
+    // Test EXCEPT ALL with the same table on both sides
+    // EXCEPT ALL preserves duplicates and does not include DISTINCT
+    // Uses **LeftAnti** join (returns rows from left that don't exist in right)
+    // The requalification ensures no duplicate field name errors
+    assert_expected_plan(
+        "SELECT a FROM data WHERE a > 0 EXCEPT ALL SELECT a FROM data WHERE a < 5",
+        "LeftAnti Join: left.a = right.a\
+        \n  SubqueryAlias: left\
+        \n    Filter: data.a > Int64(0)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a > Int64(0)]\
+        \n  SubqueryAlias: right\
+        \n    Filter: data.a < Int64(5)\
+        \n      TableScan: data projection=[a], partial_filters=[data.a < Int64(5)]",
+        true,
+    )
+    .await
+}
+
 #[tokio::test]
 async fn simple_window_function() -> Result<()> {
     roundtrip("SELECT RANK() OVER (PARTITION BY a ORDER BY b), d, sum(b) OVER (PARTITION BY a) FROM data;").await
@@ -1051,6 +1387,7 @@ async fn all_type_literal() -> Result<()> {
             uint32_col = arrow_cast('0', 'UInt32') AND
             int64_col = arrow_cast('0', 'Int64') AND
             uint64_col = arrow_cast('0', 'UInt64') AND
+            float16_col = arrow_cast(0.0, 'Float16') AND
             float32_col = arrow_cast('0', 'Float32') AND
             float64_col = arrow_cast('0', 'Float64') AND
             sec_timestamp_col = arrow_cast('2020-01-01 00:00:00', 'Timestamp (Second, None)') AND
@@ -1084,10 +1421,10 @@ async fn roundtrip_literal_struct() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: Struct({c0:1,c1:true,c2:}) AS struct(Int64(1),Boolean(true),NULL)
       TableScan: data projection=[]
-    "#
+    "
             );
     Ok(())
 }
@@ -1104,7 +1441,7 @@ async fn roundtrip_literal_named_struct() -> Result<()> {
     assert_snapshot!(
     plan,
     @r#"
-    Projection: Struct({int_field:1,boolean_field:true,string_field:}) AS named_struct(Utf8("int_field"),Int64(1),Utf8("boolean_field"),Boolean(true),Utf8("string_field"),NULL)
+    Projection: CAST(Struct({c0:1,c1:true,c2:}) AS Struct("int_field": Int64, "boolean_field": Boolean, "string_field": Utf8View)) AS named_struct(Utf8("int_field"),Int64(1),Utf8("boolean_field"),Boolean(true),Utf8("string_field"),NULL)
       TableScan: data projection=[]
     "#
             );
@@ -1125,7 +1462,7 @@ async fn roundtrip_literal_renamed_struct() -> Result<()> {
     assert_snapshot!(
     plan,
     @r#"
-    Projection: Struct({int_field:1}) AS Struct({c0:1})
+    Projection: CAST(Struct({c0:1}) AS Struct("int_field": Int32))
       TableScan: data projection=[]
     "#
             );
@@ -1153,9 +1490,7 @@ async fn roundtrip_values() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
-    Values: (Int64(1), Utf8("a"), List([[-213.1, , 5.5, 2.0, 1.0], []]), LargeList([1, 2, 3]), Struct({c0:true,int_field:1,c2:}), List([{struct_field: {string_field: a}}, {struct_field: {string_field: b}}])), (Int64(NULL), Utf8(NULL), List(), LargeList(), Struct({c0:,int_field:,c2:}), List())
-    "#
+    @r#"Values: (Int64(1), Utf8("a"), List([[-213.1, , 5.5, 2.0, 1.0], []]), LargeList([1, 2, 3]), Struct({c0:true,int_field:1,c2:}), List([{struct_field: {string_field: a}}, {struct_field: {string_field: b}}])), (Int64(NULL), Utf8(NULL), List(), LargeList(), Struct({c0:,int_field:,c2:}), List())"#
             );
     Ok(())
 }
@@ -1172,6 +1507,34 @@ async fn roundtrip_values_no_columns() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn roundtrip_values_with_scalar_function() -> Result<()> {
+    let ctx = create_context().await?;
+    //  datafusion::functions_nested::map::map;
+    let expr = map(vec![lit("a")], vec![lit(1)]);
+    let plan = LogicalPlanBuilder::values(vec![vec![expr]])?.build()?;
+    let expected = ctx.state().optimize(&plan)?;
+
+    let actual = substrait_roundtrip(&plan, &ctx).await?;
+
+    let strip_aliases_from_values = |plan: &LogicalPlan| -> LogicalPlan {
+        plan.clone()
+            .map_expressions(|expr| Ok(Transformed::yes(expr.unalias())))
+            .map(|t| t.data)
+            .unwrap_or_else(|_| plan.clone())
+    };
+
+    let normalized_expected = strip_aliases_from_values(&expected);
+    let normalized_actual = strip_aliases_from_values(&actual);
+
+    assert_eq!(
+        format!("{normalized_expected}"),
+        format!("{normalized_actual}")
+    );
+    assert_eq!(normalized_expected.schema(), normalized_actual.schema());
+    Ok(())
+}
+
 #[tokio::test]
 async fn roundtrip_values_empty_relation() -> Result<()> {
     roundtrip("SELECT * FROM (VALUES ('a')) LIMIT 0").await
@@ -1181,16 +1544,26 @@ async fn roundtrip_values_empty_relation() -> Result<()> {
 async fn roundtrip_values_duplicate_column_join() -> Result<()> {
     // Substrait does currently NOT maintain the alias of the tables.
     // Instead, when we consume Substrait, we add aliases before a join that'd otherwise collide.
-    // This roundtrip works because we set aliases to what the Substrait consumer will generate.
-    roundtrip(
-        "SELECT left.column1 as c1, right.column1 as c2 \
+    // The improved NameTracker now adds __temp__0 suffix to handle naming conflicts.
+    // We verify semantic equivalence rather than exact string match.
+    let ctx = create_context().await?;
+    let sql = "SELECT left.column1 as c1, right.column1 as c2 \
     FROM \
         (VALUES (1)) AS left \
     JOIN \
         (VALUES (2)) AS right \
-    ON left.column1 == right.column1",
-    )
-    .await
+    ON left.column1 == right.column1";
+    let df = ctx.sql(sql).await?;
+    let plan = df.into_optimized_plan()?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
+
+    // Verify schemas are equivalent
+    assert_eq!(plan.schema(), plan2.schema());
+
+    // Execute to ensure plan validity
+    DataFrame::new(ctx.state(), plan2).show().await?;
+
+    Ok(())
 }
 
 #[tokio::test]
@@ -1208,11 +1581,11 @@ async fn duplicate_column() -> Result<()> {
 
     assert_snapshot!(
     plan,
-    @r#"
+    @r"
     Projection: data.a + Int64(1) AS sum_a, data.a + Int64(1) AS data.a + Int64(1)__temp__0 AS sum_a_2
       Projection: data.a + Int64(1)
         TableScan: data projection=[a]
-    "#
+    "
         );
     Ok(())
 }
@@ -1355,9 +1728,7 @@ async fn roundtrip_repartition_roundrobin() -> Result<()> {
         partitioning_scheme: Partitioning::RoundRobinBatch(8),
     });
 
-    let proto = to_substrait_plan(&plan, &ctx.state())?;
-    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
-    let plan2 = ctx.state().optimize(&plan2)?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
 
     assert_eq!(format!("{plan}"), format!("{plan2}"));
     Ok(())
@@ -1372,9 +1743,7 @@ async fn roundtrip_repartition_hash() -> Result<()> {
         partitioning_scheme: Partitioning::Hash(vec![col("data.a")], 8),
     });
 
-    let proto = to_substrait_plan(&plan, &ctx.state())?;
-    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
-    let plan2 = ctx.state().optimize(&plan2)?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
 
     assert_eq!(format!("{plan}"), format!("{plan2}"));
     Ok(())
@@ -1442,7 +1811,7 @@ fn check_post_join_filters(rel: &Rel) -> Result<()> {
         }
         Some(RelType::ExtensionLeaf(_)) | Some(RelType::Read(_)) => Ok(()),
         _ => not_impl_err!(
-            "Unsupported RelType: {:?} in post join filter check",
+            "Unsupported Reltype: {:?} in post join filter check",
             rel.rel_type
         ),
     }
@@ -1556,9 +1925,7 @@ async fn assert_expected_plan(
     let ctx = create_context().await?;
     let df = ctx.sql(sql).await?;
     let plan = df.into_optimized_plan()?;
-    let proto = to_substrait_plan(&plan, &ctx.state())?;
-    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
-    let plan2 = ctx.state().optimize(&plan2)?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
 
     if assert_schema {
         assert_eq!(plan.schema(), plan2.schema());
@@ -1596,13 +1963,193 @@ async fn assert_substrait_sql(substrait_plan: Plan, sql: &str) -> Result<()> {
     Ok(())
 }
 
+async fn build_set_comparison_plan(
+    ctx: &SessionContext,
+    quantifier: SetQuantifier,
+    op: Operator,
+) -> Result<LogicalPlan> {
+    let base_scan = ctx.table("data").await?.into_unoptimized_plan();
+    let subquery_scan = ctx.table("data2").await?.into_unoptimized_plan();
+    let subquery_plan = LogicalPlanBuilder::from(subquery_scan)
+        .project(vec![col("data2.a")])?
+        .build()?;
+    let predicate = Expr::SetComparison(SetComparison::new(
+        Box::new(col("data.a")),
+        Subquery {
+            subquery: Arc::new(subquery_plan),
+            outer_ref_columns: vec![],
+            spans: Spans::new(),
+        },
+        op,
+        quantifier,
+    ));
+
+    LogicalPlanBuilder::from(base_scan)
+        .filter(predicate)?
+        .project(vec![col("data.a")])?
+        .build()
+}
+
+async fn build_scalar_subquery_projection_plan(
+    ctx: &SessionContext,
+) -> Result<LogicalPlan> {
+    let subquery_scan = ctx.table("data2").await?.into_unoptimized_plan();
+    let subquery_plan = LogicalPlanBuilder::from(subquery_scan)
+        .project(vec![col("a")])?
+        .limit(0, Some(1))?
+        .build()?;
+
+    let scalar_subquery = Expr::ScalarSubquery(Subquery {
+        subquery: Arc::new(subquery_plan),
+        outer_ref_columns: vec![],
+        spans: Spans::new(),
+    });
+
+    let outer_empty_relation = LogicalPlan::EmptyRelation(EmptyRelation {
+        produce_one_row: true,
+        schema: DFSchemaRef::new(DFSchema::empty()),
+    });
+
+    LogicalPlanBuilder::from(outer_empty_relation)
+        .project(vec![scalar_subquery.alias("sq")])?
+        .build()
+}
+
+async fn build_exists_filter_plan(
+    ctx: &SessionContext,
+    negated: bool,
+) -> Result<LogicalPlan> {
+    let base_scan = ctx.table("data").await?.into_unoptimized_plan();
+    let subquery_scan = ctx.table("data2").await?.into_unoptimized_plan();
+    let subquery_plan = LogicalPlanBuilder::from(subquery_scan)
+        .project(vec![col("data2.a")])?
+        .build()?;
+
+    let predicate = Expr::Exists(Exists::new(
+        Subquery {
+            subquery: Arc::new(subquery_plan),
+            outer_ref_columns: vec![],
+            spans: Spans::new(),
+        },
+        negated,
+    ));
+
+    LogicalPlanBuilder::from(base_scan)
+        .filter(predicate)?
+        .project(vec![col("data.a")])?
+        .build()
+}
+
+fn assert_set_comparison_predicate(
+    plan: &LogicalPlan,
+    expected_op: Operator,
+    expected_quantifier: SetQuantifier,
+) {
+    let predicate = match plan {
+        LogicalPlan::Projection(p) => match p.input.as_ref() {
+            LogicalPlan::Filter(filter) => &filter.predicate,
+            other => panic!("expected Filter inside Projection, got {other:?}"),
+        },
+        LogicalPlan::Filter(filter) => &filter.predicate,
+        other => panic!("expected Filter plan, got {other:?}"),
+    };
+
+    match predicate {
+        Expr::SetComparison(set_comparison) => {
+            assert_eq!(set_comparison.op, expected_op);
+            assert_eq!(set_comparison.quantifier, expected_quantifier);
+        }
+        other => panic!("expected SetComparison predicate, got {other:?}"),
+    }
+}
+
+fn assert_root_project_has_scalar_subquery(proto: &Plan) {
+    let relation = proto
+        .relations
+        .first()
+        .expect("expected Substrait plan to have at least one relation");
+
+    let root = match relation.rel_type.as_ref() {
+        Some(plan_rel::RelType::Root(root)) => root,
+        other => panic!("expected root relation, got {other:?}"),
+    };
+
+    let input = root.input.as_ref().expect("expected root input relation");
+    let project = match input.rel_type.as_ref() {
+        Some(RelType::Project(project)) => project,
+        other => panic!("expected Project relation at root input, got {other:?}"),
+    };
+
+    let expr = project
+        .expressions
+        .first()
+        .expect("expected at least one project expression");
+    let subquery = match expr.rex_type.as_ref() {
+        Some(substrait::proto::expression::RexType::Subquery(subquery)) => subquery,
+        other => panic!("expected Subquery expression, got {other:?}"),
+    };
+
+    assert!(
+        matches!(
+            subquery.subquery_type.as_ref(),
+            Some(substrait::proto::expression::subquery::SubqueryType::Scalar(_))
+        ),
+        "expected scalar subquery type"
+    );
+}
+
+fn assert_projection_contains_scalar_subquery(plan: &LogicalPlan) {
+    let projection = match plan {
+        LogicalPlan::Projection(projection) => projection,
+        other => panic!("expected Projection plan, got {other:?}"),
+    };
+
+    let found_scalar_subquery = projection.expr.iter().any(expr_contains_scalar_subquery);
+    assert!(
+        found_scalar_subquery,
+        "expected Projection to contain ScalarSubquery expression"
+    );
+}
+
+fn expr_contains_scalar_subquery(expr: &Expr) -> bool {
+    match expr {
+        Expr::ScalarSubquery(_) => true,
+        Expr::Alias(alias) => expr_contains_scalar_subquery(alias.expr.as_ref()),
+        _ => false,
+    }
+}
+
+fn assert_exists_predicate(plan: &LogicalPlan, expected_negated: bool) {
+    let predicate = match plan {
+        LogicalPlan::Projection(projection) => match projection.input.as_ref() {
+            LogicalPlan::Filter(filter) => &filter.predicate,
+            other => panic!("expected Filter inside Projection, got {other:?}"),
+        },
+        LogicalPlan::Filter(filter) => &filter.predicate,
+        other => panic!("expected Filter plan, got {other:?}"),
+    };
+
+    if expected_negated {
+        match predicate {
+            Expr::Not(inner) => match inner.as_ref() {
+                Expr::Exists(exists) => assert!(!exists.negated),
+                other => panic!("expected Exists inside NOT, got {other:?}"),
+            },
+            other => panic!("expected NOT EXISTS predicate, got {other:?}"),
+        }
+    } else {
+        match predicate {
+            Expr::Exists(exists) => assert!(!exists.negated),
+            other => panic!("expected EXISTS predicate, got {other:?}"),
+        }
+    }
+}
+
 async fn roundtrip_fill_na(sql: &str) -> Result<()> {
     let ctx = create_context().await?;
     let df = ctx.sql(sql).await?;
     let plan = df.into_optimized_plan()?;
-    let proto = to_substrait_plan(&plan, &ctx.state())?;
-    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
-    let plan2 = ctx.state().optimize(&plan2)?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
 
     // Format plan string and replace all None's with 0
     let plan1str = format!("{plan}").replace("None", "0");
@@ -1614,6 +2161,18 @@ async fn roundtrip_fill_na(sql: &str) -> Result<()> {
     Ok(())
 }
 
+/// Converts a logical plan to Substrait and back, applying optimization.
+/// Returns the roundtripped and optimized logical plan.
+async fn substrait_roundtrip(
+    plan: &LogicalPlan,
+    ctx: &SessionContext,
+) -> Result<LogicalPlan> {
+    let proto = to_substrait_plan(plan, &ctx.state())?;
+    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
+    let plan2 = ctx.state().optimize(&plan2)?;
+    Ok(plan2)
+}
+
 async fn test_alias(sql_with_alias: &str, sql_no_alias: &str) -> Result<()> {
     // Since we ignore the SubqueryAlias in the producer, the result should be
     // the same as producing a Substrait plan from the same query without aliases
@@ -1641,8 +2200,7 @@ async fn roundtrip_logical_plan_with_ctx(
     ctx: SessionContext,
 ) -> Result<Box<Plan>> {
     let proto = to_substrait_plan(&plan, &ctx.state())?;
-    let plan2 = from_substrait_plan(&ctx.state(), &proto).await?;
-    let plan2 = ctx.state().optimize(&plan2)?;
+    let plan2 = substrait_roundtrip(&plan, &ctx).await?;
 
     let plan1str = format!("{plan}");
     let plan2str = format!("{plan2}");
@@ -1718,6 +2276,34 @@ async fn create_context() -> Result<SessionContext> {
     ctx.register_csv("data2", "tests/testdata/data.csv", CsvReadOptions::new())
         .await?;
 
+    // Register test tables for anti join tests
+    let book_fields = vec![
+        Field::new("isbn", DataType::Int64, false),
+        Field::new("title", DataType::Utf8, true),
+        Field::new("genre", DataType::Utf8, true),
+    ];
+    let book_schema = Schema::new(book_fields);
+    let mut book_options = CsvReadOptions::new();
+    book_options.schema = Some(&book_schema);
+    book_options.has_header = false;
+    ctx.register_csv("book", "tests/testdata/empty.csv", book_options)
+        .await?;
+
+    let book_author_fields = vec![
+        Field::new("isbn", DataType::Int64, true),
+        Field::new("author", DataType::Utf8, true),
+    ];
+    let book_author_schema = Schema::new(book_author_fields);
+    let mut book_author_options = CsvReadOptions::new();
+    book_author_options.schema = Some(&book_author_schema);
+    book_author_options.has_header = false;
+    ctx.register_csv(
+        "book_author",
+        "tests/testdata/empty.csv",
+        book_author_options,
+    )
+    .await?;
+
     Ok(ctx)
 }
 
@@ -1735,6 +2321,7 @@ async fn create_all_type_context() -> Result<SessionContext> {
         Field::new("uint32_col", DataType::UInt32, true),
         Field::new("int64_col", DataType::Int64, true),
         Field::new("uint64_col", DataType::UInt64, true),
+        Field::new("float16_col", DataType::Float16, true),
         Field::new("float32_col", DataType::Float32, true),
         Field::new("float64_col", DataType::Float64, true),
         Field::new(
diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
index 64599465f96f7..9773cf4aba10f 100644
--- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
@@ -26,7 +26,7 @@ use datafusion::datasource::physical_plan::{
     FileGroup, FileScanConfigBuilder, ParquetSource,
 };
 use datafusion::error::Result;
-use datafusion::physical_plan::{displayable, ExecutionPlan};
+use datafusion::physical_plan::{ExecutionPlan, displayable};
 use datafusion::prelude::{ParquetReadOptions, SessionContext};
 use datafusion_substrait::physical_plan::{consumer, producer};
 
@@ -35,24 +35,22 @@ use substrait::proto::extensions;
 
 #[tokio::test]
 async fn parquet_exec() -> Result<()> {
-    let source = Arc::new(ParquetSource::default());
-
-    let scan_config = FileScanConfigBuilder::new(
-        ObjectStoreUrl::local_filesystem(),
-        Arc::new(Schema::empty()),
-        source,
-    )
-    .with_file_groups(vec![
-        FileGroup::new(vec![PartitionedFile::new(
-            "file://foo/part-0.parquet".to_string(),
-            123,
-        )]),
-        FileGroup::new(vec![PartitionedFile::new(
-            "file://foo/part-1.parquet".to_string(),
-            123,
-        )]),
-    ])
-    .build();
+    let schema = Arc::new(Schema::empty());
+    let source = Arc::new(ParquetSource::new(schema.clone()));
+
+    let scan_config =
+        FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
+            .with_file_groups(vec![
+                FileGroup::new(vec![PartitionedFile::new(
+                    "file://foo/part-0.parquet".to_string(),
+                    123,
+                )]),
+                FileGroup::new(vec![PartitionedFile::new(
+                    "file://foo/part-1.parquet".to_string(),
+                    123,
+                )]),
+            ])
+            .build();
     let parquet_exec: Arc<dyn ExecutionPlan> =
         DataSourceExec::from_data_source(scan_config);
 
diff --git a/datafusion/substrait/tests/cases/serialize.rs b/datafusion/substrait/tests/cases/serialize.rs
index 39c0622e3ba39..2d7257fad3394 100644
--- a/datafusion/substrait/tests/cases/serialize.rs
+++ b/datafusion/substrait/tests/cases/serialize.rs
@@ -17,7 +17,6 @@
 
 #[cfg(test)]
 mod tests {
-    use datafusion::common::assert_contains;
     use datafusion::datasource::provider_as_source;
     use datafusion::logical_expr::LogicalPlanBuilder;
     use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
@@ -31,7 +30,7 @@ mod tests {
     use std::fs;
     use substrait::proto::plan_rel::RelType;
     use substrait::proto::rel_common::{Emit, EmitKind};
-    use substrait::proto::{rel, RelCommon};
+    use substrait::proto::{RelCommon, rel};
 
     #[tokio::test]
     async fn serialize_to_file() -> Result<()> {
@@ -44,8 +43,18 @@ mod tests {
         serializer::deserialize(path).await?;
 
         // Test case 2: serializing to an existing file should fail.
-        let got = serializer::serialize(sql, &ctx, path).await.unwrap_err();
-        assert_contains!(got.to_string(), "File exists");
+        let got = serializer::serialize(sql, &ctx, path)
+            .await
+            .unwrap_err()
+            .to_string();
+        assert!(
+            [
+                "File exists", // unix
+                "os error 80"  // windows
+            ]
+            .iter()
+            .any(|s| got.contains(s))
+        );
 
         fs::remove_file(path)?;
 
@@ -95,10 +104,10 @@ mod tests {
 
         assert_snapshot!(
                     format!("{}", datafusion_plan),
-                    @r#"
-Projection: data.b, data.a + data.a, data.a
-  TableScan: data projection=[a, b]
-"#
+                    @r"
+        Projection: data.b, data.a + data.a, data.a
+          TableScan: data projection=[a, b]
+        "
         ,
                 );
 
@@ -142,11 +151,11 @@ Projection: data.b, data.a + data.a, data.a
         let datafusion_plan = df.into_optimized_plan()?;
         assert_snapshot!(
                     datafusion_plan,
-                    @r#"
-Projection: data.b, rank() PARTITION BY [data.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, data.c
-  WindowAggr: windowExpr=[[rank() PARTITION BY [data.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-    TableScan: data projection=[a, b, c]
-"#
+                    @r"
+        Projection: data.b, rank() PARTITION BY [data.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, data.c
+          WindowAggr: windowExpr=[[rank() PARTITION BY [data.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+            TableScan: data projection=[a, b, c]
+        "
         ,
                 );
 
diff --git a/datafusion/substrait/tests/cases/substrait_validations.rs b/datafusion/substrait/tests/cases/substrait_validations.rs
index a31b3ca385e9c..9841c736da8c9 100644
--- a/datafusion/substrait/tests/cases/substrait_validations.rs
+++ b/datafusion/substrait/tests/cases/substrait_validations.rs
@@ -51,7 +51,7 @@ mod tests {
             let ctx = SessionContext::new();
             ctx.register_table(
                 table_ref,
-                Arc::new(EmptyTable::new(df_schema.inner().clone())),
+                Arc::new(EmptyTable::new(Arc::clone(df_schema.inner()))),
             )?;
             Ok(ctx)
         }
@@ -69,10 +69,10 @@ mod tests {
 
             assert_snapshot!(
             plan,
-            @r#"
-                Projection: DATA.a, DATA.b
-                  TableScan: DATA
-                "#
+            @r"
+            Projection: DATA.a, DATA.b
+              TableScan: DATA
+            "
                         );
             Ok(())
         }
@@ -92,10 +92,10 @@ mod tests {
 
             assert_snapshot!(
             plan,
-            @r#"
-                Projection: DATA.a, DATA.b
-                  TableScan: DATA projection=[a, b]
-                "#
+            @r"
+            Projection: DATA.a, DATA.b
+              TableScan: DATA projection=[a, b]
+            "
                         );
             Ok(())
         }
@@ -117,10 +117,10 @@ mod tests {
 
             assert_snapshot!(
             plan,
-            @r#"
-                Projection: DATA.a, DATA.b
-                  TableScan: DATA projection=[a, b]
-                "#
+            @r"
+            Projection: DATA.a, DATA.b
+              TableScan: DATA projection=[a, b]
+            "
                         );
             Ok(())
         }
diff --git a/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/no_groupings.json b/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/no_groupings.json
new file mode 100644
index 0000000000000..9305aa6461ab9
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/no_groupings.json
@@ -0,0 +1,92 @@
+{
+  "extensionUris": [
+    {
+      "extensionUriAnchor": 1,
+      "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
+    }
+  ],
+  "extensions": [
+    {
+      "extensionFunction": {
+        "extensionUriReference": 1,
+        "functionAnchor": 1,
+        "name": "sum:i8"
+      }
+    }
+  ],
+  "relations": [
+    {
+      "root": {
+        "input": {
+          "aggregate": {
+            "common": {
+              "direct": {}
+            },
+            "input": {
+              "read": {
+                "baseSchema": {
+                  "names": [
+                    "c0",
+                    "c1"
+                  ],
+                  "struct": {
+                    "nullability": "NULLABILITY_REQUIRED",
+                    "types": [
+                      {
+                        "i8": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      },
+                      {
+                        "i8": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }
+                    ]
+                  }
+                },
+                "common": {
+                  "direct": {}
+                },
+                "virtualTable": {}
+              }
+            },
+            "measures": [
+              {
+                "measure": {
+                  "arguments": [
+                    {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {}
+                          },
+                          "rootReference": {}
+                        }
+                      }
+                    }
+                  ],
+                  "functionReference": 1,
+                  "invocation": "AGGREGATION_INVOCATION_ALL",
+                  "outputType": {
+                    "i8": {
+                      "nullability": "NULLABILITY_NULLABLE"
+                    }
+                  },
+                  "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT"
+                }
+              }
+            ]
+          }
+        },
+        "names": [
+          "summation"
+        ]
+      }
+    }
+  ],
+  "version": {
+    "minorNumber": 29,
+    "producer": "substrait-go v4.2.0"
+  }
+}
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/single_grouping.json b/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/single_grouping.json
new file mode 100644
index 0000000000000..9535596a1e819
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/aggregate_groupings/single_grouping.json
@@ -0,0 +1,109 @@
+{
+  "extensionUris": [
+    {
+      "extensionUriAnchor": 1,
+      "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
+    }
+  ],
+  "extensions": [
+    {
+      "extensionFunction": {
+        "extensionUriReference": 1,
+        "functionAnchor": 1,
+        "name": "sum:i8"
+      }
+    }
+  ],
+  "relations": [
+    {
+      "root": {
+        "input": {
+          "aggregate": {
+            "common": {
+              "direct": {}
+            },
+            "input": {
+              "read": {
+                "baseSchema": {
+                  "names": [
+                    "c0",
+                    "c1"
+                  ],
+                  "struct": {
+                    "nullability": "NULLABILITY_REQUIRED",
+                    "types": [
+                      {
+                        "i8": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      },
+                      {
+                        "i8": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }
+                    ]
+                  }
+                },
+                "common": {
+                  "direct": {}
+                },
+                "virtualTable": {}
+              }
+            },
+            "groupingExpressions": [
+              {
+                "selection": {
+                  "directReference": {
+                    "structField": {}
+                  },
+                  "rootReference": {}
+                }
+              }
+            ],
+            "groupings": [
+              {
+                "expressionReferences": [0]
+              }
+
+            ],
+            "measures": [
+              {
+                "measure": {
+                  "arguments": [
+                    {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {}
+                          },
+                          "rootReference": {}
+                        }
+                      }
+                    }
+                  ],
+                  "functionReference": 1,
+                  "invocation": "AGGREGATION_INVOCATION_ALL",
+                  "outputType": {
+                    "i8": {
+                      "nullability": "NULLABILITY_NULLABLE"
+                    }
+                  },
+                  "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT"
+                }
+              }
+            ]
+          }
+        },
+        "names": [
+          "c0",
+          "summation"
+        ]
+      }
+    }
+  ],
+  "version": {
+    "minorNumber": 29,
+    "producer": "substrait-go v4.2.0"
+  }
+}
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json
new file mode 100644
index 0000000000000..d72830898f913
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json
@@ -0,0 +1,287 @@
+{
+  "extensionUris": [{
+    "extensionUriAnchor": 1,
+    "uri": "/functions_comparison.yaml"
+  }],
+  "extensions": [{
+    "extensionFunction": {
+      "extensionUriReference": 1,
+      "functionAnchor": 1,
+      "name": "equal:any_any"
+    }
+  }],
+  "relations": [{
+    "root": {
+      "input": {
+        "project": {
+          "common": {
+            "emit": {
+              "outputMapping": [4, 5, 6, 7]
+            }
+          },
+          "input": {
+            "join": {
+              "common": {
+                "direct": {
+                }
+              },
+              "left": {
+                "set": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "inputs": [{
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [1, 2]
+                        }
+                      },
+                      "input": {
+                        "read": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "baseSchema": {
+                            "names": ["A"],
+                            "struct": {
+                              "types": [{
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }],
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_REQUIRED"
+                            }
+                          },
+                          "namedTable": {
+                            "names": ["A"]
+                          }
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "literal": {
+                          "null": {
+                            "string": {
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "nullable": false,
+                          "typeVariationReference": 0
+                        }
+                      }]
+                    }
+                  }, {
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [2, 3]
+                        }
+                      },
+                      "input": {
+                        "read": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "baseSchema": {
+                            "names": ["A", "C"],
+                            "struct": {
+                              "types": [{
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }, {
+                                "i32": {
+                                  "typeVariationReference": 0,
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              }],
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_REQUIRED"
+                            }
+                          },
+                          "namedTable": {
+                            "names": ["B"]
+                          }
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "cast": {
+                          "type": {
+                            "string": {
+                              "typeVariationReference": 0,
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "input": {
+                            "selection": {
+                              "directReference": {
+                                "structField": {
+                                  "field": 1
+                                }
+                              },
+                              "rootReference": {
+                              }
+                            }
+                          },
+                          "failureBehavior": "FAILURE_BEHAVIOR_THROW_EXCEPTION"
+                        }
+                      }]
+                    }
+                  }],
+                  "op": "SET_OP_UNION_ALL"
+                }
+              },
+              "right": {
+                "read": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "baseSchema": {
+                    "names": ["A", "D"],
+                    "struct": {
+                      "types": [{
+                        "i32": {
+                          "typeVariationReference": 0,
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }, {
+                        "i32": {
+                          "typeVariationReference": 0,
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }],
+                      "typeVariationReference": 0,
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "namedTable": {
+                    "names": ["C"]
+                  }
+                }
+              },
+              "expression": {
+                "scalarFunction": {
+                  "functionReference": 1,
+                  "args": [],
+                  "outputType": {
+                    "bool": {
+                      "typeVariationReference": 0,
+                      "nullability": "NULLABILITY_NULLABLE"
+                    }
+                  },
+                  "arguments": [{
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 0
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }, {
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 2
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }],
+                  "options": []
+                }
+              },
+              "type": "JOIN_TYPE_LEFT"
+            }
+          },
+          "expressions": [{
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 0
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 1
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 3
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "literal": {
+              "null": {
+                "string": {
+                  "typeVariationReference": 0,
+                  "nullability": "NULLABILITY_NULLABLE"
+                }
+              },
+              "nullable": false,
+              "typeVariationReference": 0
+            }
+          }]
+        }
+      },
+      "names": ["A", "C", "D", "E"]
+    }
+  }],
+  "expectedTypeUrls": [],
+  "version": {
+    "majorNumber": 0,
+    "minorNumber": 74,
+    "patchNumber": 0,
+    "gitHash": "",
+    "producer": "isthmus"
+  },
+  "parameterBindings": []
+}
diff --git a/datafusion/substrait/tests/testdata/test_plans/duplicate_name_in_union.substrait.json b/datafusion/substrait/tests/testdata/test_plans/duplicate_name_in_union.substrait.json
new file mode 100644
index 0000000000000..1da2ff6131368
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/duplicate_name_in_union.substrait.json
@@ -0,0 +1,171 @@
+{
+  "version": {
+    "minorNumber": 54,
+    "producer": "datafusion-test"
+  },
+  "relations": [
+    {
+      "root": {
+        "input": {
+          "set": {
+            "common": {
+              "direct": {}
+            },
+            "inputs": [
+              {
+                "project": {
+                  "common": {
+                    "emit": {
+                      "outputMapping": [2, 3]
+                    }
+                  },
+                  "input": {
+                    "read": {
+                      "common": {
+                        "direct": {}
+                      },
+                      "baseSchema": {
+                        "names": ["foo", "bar"],
+                        "struct": {
+                          "types": [
+                            {
+                              "i64": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            },
+                            {
+                              "i64": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            }
+                          ],
+                          "nullability": "NULLABILITY_REQUIRED"
+                        }
+                      },
+                      "virtualTable": {
+                        "expressions": [
+                          {
+                            "fields": [
+                              {
+                                "literal": {
+                                  "i64": "100"
+                                }
+                              },
+                              {
+                                "literal": {
+                                  "i64": "200"
+                                }
+                              }
+                            ]
+                          }
+                        ]
+                      }
+                    }
+                  },
+                  "expressions": [
+                    {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 0
+                          }
+                        },
+                        "rootReference": {}
+                      }
+                    },
+                    {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 1
+                          }
+                        },
+                        "rootReference": {}
+                      }
+                    }
+                  ]
+                }
+              },
+              {
+                "project": {
+                  "common": {
+                    "emit": {
+                      "outputMapping": [2, 3]
+                    }
+                  },
+                  "input": {
+                    "read": {
+                      "common": {
+                        "direct": {}
+                      },
+                      "baseSchema": {
+                        "names": ["x", "foo"],
+                        "struct": {
+                          "types": [
+                            {
+                              "i32": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            },
+                            {
+                              "i64": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            }
+                          ],
+                          "nullability": "NULLABILITY_REQUIRED"
+                        }
+                      },
+                      "virtualTable": {
+                        "expressions": [
+                          {
+                            "fields": [
+                              {
+                                "literal": {
+                                  "i32": 300
+                                }
+                              },
+                              {
+                                "literal": {
+                                  "i64": "400"
+                                }
+                              }
+                            ]
+                          }
+                        ]
+                      }
+                    }
+                  },
+                  "expressions": [
+                    {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 0
+                          }
+                        },
+                        "rootReference": {}
+                      }
+                    },
+                    {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 1
+                          }
+                        },
+                        "rootReference": {}
+                      }
+                    }
+                  ]
+                }
+              }
+            ],
+            "op": "SET_OP_UNION_ALL"
+          }
+        },
+        "names": ["col1", "col2"]
+      }
+    }
+  ]
+}
diff --git a/datafusion/substrait/tests/testdata/test_plans/join_with_expression_key.json b/datafusion/substrait/tests/testdata/test_plans/join_with_expression_key.json
new file mode 100644
index 0000000000000..73fa06eea5f05
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/join_with_expression_key.json
@@ -0,0 +1,814 @@
+{
+  "extensionUris": [{
+    "extensionUriAnchor": 3,
+    "uri": "/functions_arithmetic.yaml"
+  }, {
+    "extensionUriAnchor": 2,
+    "uri": "/functions_string.yaml"
+  }, {
+    "extensionUriAnchor": 1,
+    "uri": "/functions_comparison.yaml"
+  }],
+  "extensions": [{
+    "extensionFunction": {
+      "extensionUriReference": 1,
+      "functionAnchor": 0,
+      "name": "equal:any_any"
+    }
+  }, {
+    "extensionFunction": {
+      "extensionUriReference": 2,
+      "functionAnchor": 1,
+      "name": "upper:str"
+    }
+  }, {
+    "extensionFunction": {
+      "extensionUriReference": 3,
+      "functionAnchor": 2,
+      "name": "max:i64"
+    }
+  }, {
+    "extensionFunction": {
+      "extensionUriReference": 3,
+      "functionAnchor": 3,
+      "name": "multiply:fp64_fp64"
+    }
+  }, {
+    "extensionFunction": {
+      "extensionUriReference": 3,
+      "functionAnchor": 4,
+      "name": "divide:fp64_fp64"
+    }
+  }],
+  "relations": [{
+    "root": {
+      "input": {
+        "project": {
+          "common": {
+            "emit": {
+              "outputMapping": [5, 6, 7, 8, 9]
+            }
+          },
+          "input": {
+            "join": {
+              "common": {
+                "direct": {
+                }
+              },
+              "left": {
+                "aggregate": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "input": {
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [3, 4, 5]
+                        }
+                      },
+                      "input": {
+                        "filter": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "input": {
+                            "read": {
+                              "common": {
+                                "direct": {
+                                }
+                              },
+                              "baseSchema": {
+                                "names": ["index_name", "host", "size_bytes"],
+                                "struct": {
+                                  "types": [{
+                                    "string": {
+                                      "nullability": "NULLABILITY_NULLABLE"
+                                    }
+                                  }, {
+                                    "string": {
+                                      "nullability": "NULLABILITY_NULLABLE"
+                                    }
+                                  }, {
+                                    "i64": {
+                                      "nullability": "NULLABILITY_NULLABLE"
+                                    }
+                                  }],
+                                  "nullability": "NULLABILITY_REQUIRED"
+                                }
+                              },
+                              "virtualTable": {
+                                "values": [{
+                                  "fields": [{
+                                    "string": "aaa",
+                                    "nullable": true
+                                  }, {
+                                    "string": "host-a",
+                                    "nullable": true
+                                  }, {
+                                    "i64": "128",
+                                    "nullable": true
+                                  }]
+                                }, {
+                                  "fields": [{
+                                    "string": "bbb",
+                                    "nullable": true
+                                  }, {
+                                    "string": "host-b",
+                                    "nullable": true
+                                  }, {
+                                    "i64": "256",
+                                    "nullable": true
+                                  }]
+                                }]
+                              }
+                            }
+                          },
+                          "condition": {
+                            "scalarFunction": {
+                              "functionReference": 0,
+                              "outputType": {
+                                "bool": {
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              },
+                              "arguments": [{
+                                "value": {
+                                  "selection": {
+                                    "directReference": {
+                                      "structField": {
+                                        "field": 0
+                                      }
+                                    },
+                                    "rootReference": {
+                                    }
+                                  }
+                                }
+                              }, {
+                                "value": {
+                                  "literal": {
+                                    "string": "aaa"
+                                  }
+                                }
+                              }]
+                            }
+                          }
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 2
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "scalarFunction": {
+                          "functionReference": 1,
+                          "outputType": {
+                            "string": {
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "arguments": [{
+                            "value": {
+                              "selection": {
+                                "directReference": {
+                                  "structField": {
+                                    "field": 1
+                                  }
+                                },
+                                "rootReference": {
+                                }
+                              }
+                            }
+                          }]
+                        }
+                      }]
+                    }
+                  },
+                  "groupings": [{
+                    "groupingExpressions": [{
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 1
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }, {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 2
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }]
+                  }],
+                  "measures": [{
+                    "measure": {
+                      "functionReference": 2,
+                      "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
+                      "outputType": {
+                        "i64": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      },
+                      "invocation": "AGGREGATION_INVOCATION_ALL",
+                      "arguments": [{
+                        "value": {
+                          "selection": {
+                            "directReference": {
+                              "structField": {
+                                "field": 0
+                              }
+                            },
+                            "rootReference": {
+                            }
+                          }
+                        }
+                      }]
+                    }
+                  }]
+                }
+              },
+              "right": {
+                "aggregate": {
+                  "common": {
+                    "direct": {
+                    }
+                  },
+                  "input": {
+                    "project": {
+                      "common": {
+                        "emit": {
+                          "outputMapping": [3, 4]
+                        }
+                      },
+                      "input": {
+                        "join": {
+                          "common": {
+                            "direct": {
+                            }
+                          },
+                          "left": {
+                            "read": {
+                              "common": {
+                                "direct": {
+                                }
+                              },
+                              "baseSchema": {
+                                "names": ["host", "total_bytes"],
+                                "struct": {
+                                  "types": [{
+                                    "string": {
+                                      "nullability": "NULLABILITY_NULLABLE"
+                                    }
+                                  }, {
+                                    "i64": {
+                                      "nullability": "NULLABILITY_NULLABLE"
+                                    }
+                                  }],
+                                  "nullability": "NULLABILITY_REQUIRED"
+                                }
+                              },
+                              "virtualTable": {
+                                "values": [{
+                                  "fields": [{
+                                    "string": "host-a",
+                                    "nullable": true
+                                  }, {
+                                    "i64": "107",
+                                    "nullable": true
+                                  }]
+                                }, {
+                                  "fields": [{
+                                    "string": "host-b",
+                                    "nullable": true
+                                  }, {
+                                    "i64": "214",
+                                    "nullable": true
+                                  }]
+                                }]
+                              }
+                            }
+                          },
+                          "right": {
+                            "project": {
+                              "common": {
+                                "emit": {
+                                  "outputMapping": [3]
+                                }
+                              },
+                              "input": {
+                                "aggregate": {
+                                  "common": {
+                                    "direct": {
+                                    }
+                                  },
+                                  "input": {
+                                    "project": {
+                                      "common": {
+                                        "emit": {
+                                          "outputMapping": [3, 4, 5]
+                                        }
+                                      },
+                                      "input": {
+                                        "filter": {
+                                          "common": {
+                                            "direct": {
+                                            }
+                                          },
+                                          "input": {
+                                            "read": {
+                                              "common": {
+                                                "direct": {
+                                                }
+                                              },
+                                              "baseSchema": {
+                                                "names": ["index_name", "host", "size_bytes"],
+                                                "struct": {
+                                                  "types": [{
+                                                    "string": {
+                                                      "nullability": "NULLABILITY_NULLABLE"
+                                                    }
+                                                  }, {
+                                                    "string": {
+                                                      "nullability": "NULLABILITY_NULLABLE"
+                                                    }
+                                                  }, {
+                                                    "i64": {
+                                                      "nullability": "NULLABILITY_NULLABLE"
+                                                    }
+                                                  }],
+                                                  "nullability": "NULLABILITY_REQUIRED"
+                                                }
+                                              },
+                                              "virtualTable": {
+                                                "values": [{
+                                                  "fields": [{
+                                                    "string": "aaa",
+                                                    "nullable": true
+                                                  }, {
+                                                    "string": "host-a",
+                                                    "nullable": true
+                                                  }, {
+                                                    "i64": "128",
+                                                    "nullable": true
+                                                  }]
+                                                }, {
+                                                  "fields": [{
+                                                    "string": "bbb",
+                                                    "nullable": true
+                                                  }, {
+                                                    "string": "host-b",
+                                                    "nullable": true
+                                                  }, {
+                                                    "i64": "256",
+                                                    "nullable": true
+                                                  }]
+                                                }]
+                                              }
+                                            }
+                                          },
+                                          "condition": {
+                                            "scalarFunction": {
+                                              "functionReference": 0,
+                                              "outputType": {
+                                                "bool": {
+                                                  "nullability": "NULLABILITY_NULLABLE"
+                                                }
+                                              },
+                                              "arguments": [{
+                                                "value": {
+                                                  "selection": {
+                                                    "directReference": {
+                                                      "structField": {
+                                                        "field": 0
+                                                      }
+                                                    },
+                                                    "rootReference": {
+                                                    }
+                                                  }
+                                                }
+                                              }, {
+                                                "value": {
+                                                  "literal": {
+                                                    "string": "aaa"
+                                                  }
+                                                }
+                                              }]
+                                            }
+                                          }
+                                        }
+                                      },
+                                      "expressions": [{
+                                        "selection": {
+                                          "directReference": {
+                                            "structField": {
+                                              "field": 2
+                                            }
+                                          },
+                                          "rootReference": {
+                                          }
+                                        }
+                                      }, {
+                                        "selection": {
+                                          "directReference": {
+                                            "structField": {
+                                              "field": 0
+                                            }
+                                          },
+                                          "rootReference": {
+                                          }
+                                        }
+                                      }, {
+                                        "scalarFunction": {
+                                          "functionReference": 1,
+                                          "outputType": {
+                                            "string": {
+                                              "nullability": "NULLABILITY_NULLABLE"
+                                            }
+                                          },
+                                          "arguments": [{
+                                            "value": {
+                                              "selection": {
+                                                "directReference": {
+                                                  "structField": {
+                                                    "field": 1
+                                                  }
+                                                },
+                                                "rootReference": {
+                                                }
+                                              }
+                                            }
+                                          }]
+                                        }
+                                      }]
+                                    }
+                                  },
+                                  "groupings": [{
+                                    "groupingExpressions": [{
+                                      "selection": {
+                                        "directReference": {
+                                          "structField": {
+                                            "field": 1
+                                          }
+                                        },
+                                        "rootReference": {
+                                        }
+                                      }
+                                    }, {
+                                      "selection": {
+                                        "directReference": {
+                                          "structField": {
+                                            "field": 2
+                                          }
+                                        },
+                                        "rootReference": {
+                                        }
+                                      }
+                                    }]
+                                  }],
+                                  "measures": [{
+                                    "measure": {
+                                      "functionReference": 2,
+                                      "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
+                                      "outputType": {
+                                        "i64": {
+                                          "nullability": "NULLABILITY_NULLABLE"
+                                        }
+                                      },
+                                      "invocation": "AGGREGATION_INVOCATION_ALL",
+                                      "arguments": [{
+                                        "value": {
+                                          "selection": {
+                                            "directReference": {
+                                              "structField": {
+                                                "field": 0
+                                              }
+                                            },
+                                            "rootReference": {
+                                            }
+                                          }
+                                        }
+                                      }]
+                                    }
+                                  }]
+                                }
+                              },
+                              "expressions": [{
+                                "selection": {
+                                  "directReference": {
+                                    "structField": {
+                                      "field": 1
+                                    }
+                                  },
+                                  "rootReference": {
+                                  }
+                                }
+                              }]
+                            }
+                          },
+                          "expression": {
+                            "scalarFunction": {
+                              "functionReference": 0,
+                              "outputType": {
+                                "bool": {
+                                  "nullability": "NULLABILITY_NULLABLE"
+                                }
+                              },
+                              "arguments": [{
+                                "value": {
+                                  "scalarFunction": {
+                                    "functionReference": 1,
+                                    "outputType": {
+                                      "string": {
+                                        "nullability": "NULLABILITY_NULLABLE"
+                                      }
+                                    },
+                                    "arguments": [{
+                                      "value": {
+                                        "selection": {
+                                          "directReference": {
+                                            "structField": {
+                                              "field": 0
+                                            }
+                                          },
+                                          "rootReference": {
+                                          }
+                                        }
+                                      }
+                                    }]
+                                  }
+                                }
+                              }, {
+                                "value": {
+                                  "selection": {
+                                    "directReference": {
+                                      "structField": {
+                                        "field": 2
+                                      }
+                                    },
+                                    "rootReference": {
+                                    }
+                                  }
+                                }
+                              }]
+                            }
+                          },
+                          "type": "JOIN_TYPE_INNER"
+                        }
+                      },
+                      "expressions": [{
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }, {
+                        "scalarFunction": {
+                          "functionReference": 1,
+                          "outputType": {
+                            "string": {
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "arguments": [{
+                            "value": {
+                              "selection": {
+                                "directReference": {
+                                  "structField": {
+                                    "field": 0
+                                  }
+                                },
+                                "rootReference": {
+                                }
+                              }
+                            }
+                          }]
+                        }
+                      }]
+                    }
+                  },
+                  "groupings": [{
+                    "groupingExpressions": [{
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 1
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }]
+                  }],
+                  "measures": [{
+                    "measure": {
+                      "functionReference": 2,
+                      "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT",
+                      "outputType": {
+                        "i64": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      },
+                      "invocation": "AGGREGATION_INVOCATION_ALL",
+                      "arguments": [{
+                        "value": {
+                          "selection": {
+                            "directReference": {
+                              "structField": {
+                                "field": 0
+                              }
+                            },
+                            "rootReference": {
+                            }
+                          }
+                        }
+                      }]
+                    }
+                  }]
+                }
+              },
+              "expression": {
+                "scalarFunction": {
+                  "functionReference": 0,
+                  "outputType": {
+                    "bool": {
+                      "nullability": "NULLABILITY_NULLABLE"
+                    }
+                  },
+                  "arguments": [{
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 1
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }, {
+                    "value": {
+                      "selection": {
+                        "directReference": {
+                          "structField": {
+                            "field": 3
+                          }
+                        },
+                        "rootReference": {
+                        }
+                      }
+                    }
+                  }]
+                }
+              },
+              "type": "JOIN_TYPE_INNER"
+            }
+          },
+          "expressions": [{
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 0
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 3
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 2
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "selection": {
+              "directReference": {
+                "structField": {
+                  "field": 4
+                }
+              },
+              "rootReference": {
+              }
+            }
+          }, {
+            "scalarFunction": {
+              "functionReference": 3,
+              "outputType": {
+                "fp64": {
+                  "nullability": "NULLABILITY_NULLABLE"
+                }
+              },
+              "arguments": [{
+                "value": {
+                  "scalarFunction": {
+                    "functionReference": 4,
+                    "outputType": {
+                      "fp64": {
+                        "nullability": "NULLABILITY_NULLABLE"
+                      }
+                    },
+                    "arguments": [{
+                      "value": {
+                        "cast": {
+                          "type": {
+                            "fp64": {
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "input": {
+                            "selection": {
+                              "directReference": {
+                                "structField": {
+                                  "field": 2
+                                }
+                              },
+                              "rootReference": {
+                              }
+                            }
+                          },
+                          "failureBehavior": "FAILURE_BEHAVIOR_THROW_EXCEPTION"
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "cast": {
+                          "type": {
+                            "fp64": {
+                              "nullability": "NULLABILITY_NULLABLE"
+                            }
+                          },
+                          "input": {
+                            "selection": {
+                              "directReference": {
+                                "structField": {
+                                  "field": 4
+                                }
+                              },
+                              "rootReference": {
+                              }
+                            }
+                          },
+                          "failureBehavior": "FAILURE_BEHAVIOR_THROW_EXCEPTION"
+                        }
+                      }
+                    }]
+                  }
+                }
+              }, {
+                "value": {
+                  "literal": {
+                    "fp64": 100.0
+                  }
+                }
+              }]
+            }
+          }]
+        }
+      },
+      "names": ["index", "host", "idx_size", "db_size", "pct_of_db"]
+    }
+  }]
+}
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/nested_correlated_subquery.substrait.json b/datafusion/substrait/tests/testdata/test_plans/nested_correlated_subquery.substrait.json
new file mode 100644
index 0000000000000..6c565a0f94e2f
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/nested_correlated_subquery.substrait.json
@@ -0,0 +1,265 @@
+{
+  "extensionUris": [{
+    "extensionUriAnchor": 1,
+    "uri": "/functions_boolean.yaml"
+  }, {
+    "extensionUriAnchor": 2,
+    "uri": "/functions_comparison.yaml"
+  }],
+  "extensions": [{
+    "extensionFunction": {
+      "extensionUriReference": 1,
+      "name": "and:bool"
+    }
+  }, {
+    "extensionFunction": {
+      "extensionUriReference": 2,
+      "functionAnchor": 1,
+      "name": "equal:any_any"
+    }
+  }],
+  "relations": [{
+    "root": {
+      "input": {
+        "filter": {
+          "common": {
+            "direct": {}
+          },
+          "input": {
+            "read": {
+              "common": {
+                "direct": {}
+              },
+              "baseSchema": {
+                "names": ["a1", "a2"],
+                "struct": {
+                  "types": [{
+                    "i64": {
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  }, {
+                    "i64": {
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  }],
+                  "nullability": "NULLABILITY_REQUIRED"
+                }
+              },
+              "namedTable": {
+                "names": ["A"]
+              }
+            }
+          },
+          "condition": {
+            "subquery": {
+              "setPredicate": {
+                "predicateOp": "PREDICATE_OP_EXISTS",
+                "tuples": {
+                  "filter": {
+                    "common": {
+                      "direct": {}
+                    },
+                    "input": {
+                      "read": {
+                        "common": {
+                          "direct": {}
+                        },
+                        "baseSchema": {
+                          "names": ["b1", "b2"],
+                          "struct": {
+                            "types": [{
+                              "i64": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            }, {
+                              "i64": {
+                                "nullability": "NULLABILITY_REQUIRED"
+                              }
+                            }],
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        "namedTable": {
+                          "names": ["B"]
+                        }
+                      }
+                    },
+                    "condition": {
+                      "scalarFunction": {
+                        "outputType": {
+                          "bool": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        "arguments": [{
+                          "value": {
+                            "scalarFunction": {
+                              "functionReference": 1,
+                              "outputType": {
+                                "bool": {
+                                  "nullability": "NULLABILITY_REQUIRED"
+                                }
+                              },
+                              "arguments": [{
+                                "value": {
+                                  "selection": {
+                                    "directReference": {
+                                      "structField": {
+                                        "field": 0
+                                      }
+                                    },
+                                    "rootReference": {}
+                                  }
+                                }
+                              }, {
+                                "value": {
+                                  "selection": {
+                                    "directReference": {
+                                      "structField": {
+                                        "field": 0
+                                      }
+                                    },
+                                    "outerReference": {
+                                      "stepsOut": 1
+                                    }
+                                  }
+                                }
+                              }]
+                            }
+                          }
+                        }, {
+                          "value": {
+                            "subquery": {
+                              "setPredicate": {
+                                "predicateOp": "PREDICATE_OP_EXISTS",
+                                "tuples": {
+                                  "filter": {
+                                    "common": {
+                                      "direct": {}
+                                    },
+                                    "input": {
+                                      "read": {
+                                        "common": {
+                                          "direct": {}
+                                        },
+                                        "baseSchema": {
+                                          "names": ["c1", "c2"],
+                                          "struct": {
+                                            "types": [{
+                                              "i64": {
+                                                "nullability": "NULLABILITY_REQUIRED"
+                                              }
+                                            }, {
+                                              "i64": {
+                                                "nullability": "NULLABILITY_REQUIRED"
+                                              }
+                                            }],
+                                            "nullability": "NULLABILITY_REQUIRED"
+                                          }
+                                        },
+                                        "namedTable": {
+                                          "names": ["C"]
+                                        }
+                                      }
+                                    },
+                                    "condition": {
+                                      "scalarFunction": {
+                                        "outputType": {
+                                          "bool": {
+                                            "nullability": "NULLABILITY_REQUIRED"
+                                          }
+                                        },
+                                        "arguments": [{
+                                          "value": {
+                                            "scalarFunction": {
+                                              "functionReference": 1,
+                                              "outputType": {
+                                                "bool": {
+                                                  "nullability": "NULLABILITY_REQUIRED"
+                                                }
+                                              },
+                                              "arguments": [{
+                                                "value": {
+                                                  "selection": {
+                                                    "directReference": {
+                                                      "structField": {
+                                                        "field": 0
+                                                      }
+                                                    },
+                                                    "rootReference": {}
+                                                  }
+                                                }
+                                              }, {
+                                                "value": {
+                                                  "selection": {
+                                                    "directReference": {
+                                                      "structField": {
+                                                        "field": 0
+                                                      }
+                                                    },
+                                                    "outerReference": {
+                                                      "stepsOut": 2
+                                                    }
+                                                  }
+                                                }
+                                              }]
+                                            }
+                                          }
+                                        }, {
+                                          "value": {
+                                            "scalarFunction": {
+                                              "functionReference": 1,
+                                              "outputType": {
+                                                "bool": {
+                                                  "nullability": "NULLABILITY_REQUIRED"
+                                                }
+                                              },
+                                              "arguments": [{
+                                                "value": {
+                                                  "selection": {
+                                                    "directReference": {
+                                                      "structField": {
+                                                        "field": 1
+                                                      }
+                                                    },
+                                                    "rootReference": {}
+                                                  }
+                                                }
+                                              }, {
+                                                "value": {
+                                                  "selection": {
+                                                    "directReference": {
+                                                      "structField": {
+                                                        "field": 1
+                                                      }
+                                                    },
+                                                    "outerReference": {
+                                                      "stepsOut": 1
+                                                    }
+                                                  }
+                                                }
+                                              }]
+                                            }
+                                          }
+                                        }]
+                                      }
+                                    }
+                                  }
+                                }
+                              }
+                            }
+                          }
+                        }]
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      },
+      "names": ["a1", "a2"]
+    }
+  }]
+}
diff --git a/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait.json b/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait.json
new file mode 100644
index 0000000000000..85a69c41c5eb1
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/nested_list_expressions.substrait.json
@@ -0,0 +1,77 @@
+{
+  "relations": [
+    {
+      "root": {
+        "input": {
+          "project": {
+            "common": {
+              "emit": {
+                "outputMapping": [2]
+              }
+            },
+            "input": {
+              "read": {
+                "common": {
+                  "direct": {}
+                },
+                "baseSchema": {
+                  "names": ["a", "b"],
+                  "struct": {
+                    "types": [
+                      {
+                        "i32": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      },
+                      {
+                        "i32": {
+                          "nullability": "NULLABILITY_NULLABLE"
+                        }
+                      }
+                    ],
+                    "nullability": "NULLABILITY_REQUIRED"
+                  }
+                },
+                "namedTable": {
+                  "names": ["DATA"]
+                }
+              }
+            },
+            "expressions": [
+              {
+                "nested": {
+                  "nullable": false,
+                  "list": {
+                    "values": [
+                      {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {}
+                        }
+                      },
+                      {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {}
+                        }
+                      }
+                    ]
+                  }
+                }
+              }
+            ]
+          }
+        },
+        "names": ["my_list"]
+      }
+    }
+  ]
+}
diff --git a/datafusion/substrait/tests/testdata/test_plans/scalar_fn_logb_expr.substrait.json b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_logb_expr.substrait.json
new file mode 100644
index 0000000000000..eeaf5a3dd8476
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_logb_expr.substrait.json
@@ -0,0 +1,116 @@
+{
+    "extensionUris": [
+      {
+        "extensionUriAnchor": 1,
+        "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml"
+      }
+    ],
+    "extensions": [
+      {
+        "extensionFunction": {
+          "extensionUriReference": 1,
+          "functionAnchor": 1,
+          "name": "logb:fp32_fp32"
+        }
+      }
+    ],
+    "relations": [
+      {
+        "root": {
+          "input": {
+            "project": {
+              "common": {
+                "direct": {}
+              },
+              "expressions": [
+                {
+                  "scalarFunction": {
+                    "arguments": [{
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }],
+                    "functionReference": 1,
+                    "outputType": {
+                      "bool": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    }
+                  }
+                }
+              ],
+              "input": {
+                "read": {
+                  "baseSchema": {
+                    "names": [
+                      "x", "base"
+                    ],
+                    "struct": {
+                      "types": [
+                        {
+                          "fp32": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        {
+                          "fp32": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        }
+                      ],
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "common": {
+                    "direct": {}
+                  },
+                  "virtualTable": {
+                    "values": [{
+                      "fields": [{
+                        "fp32": 1.0,
+                        "nullable": false
+                      }, {
+                        "fp32": 10.0,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "fp32": 100.0,
+                        "nullable": false
+                      }, {
+                        "fp32": 10.0,
+                        "nullable": false
+                      }]
+                    }]
+                  }
+                }
+              }
+            }
+          },
+          "names": [
+            "x", "base", "result"
+          ]
+        }
+      }
+    ]
+  }
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_between_expr.substrait.json b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_between_expr.substrait.json
new file mode 100644
index 0000000000000..6749a301b17df
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_between_expr.substrait.json
@@ -0,0 +1,143 @@
+{
+    "extensionUris": [
+      {
+        "extensionUriAnchor": 1,
+        "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_comparison.yaml"
+      }
+    ],
+    "extensions": [
+      {
+        "extensionFunction": {
+          "extensionUriReference": 1,
+          "functionAnchor": 1,
+          "name": "between:any_any_any"
+        }
+      }
+    ],
+    "relations": [
+      {
+        "root": {
+          "input": {
+            "project": {
+              "common": {
+                "emit": {
+                  "outputMapping": [
+                    3
+                  ]
+                }
+              },
+              "expressions": [
+                {
+                  "scalarFunction": {
+                    "arguments": [{
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 2
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }],
+                    "functionReference": 1,
+                    "outputType": {
+                      "bool": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    }
+                  }
+                }
+              ],
+              "input": {
+                "read": {
+                  "baseSchema": {
+                    "names": [
+                      "expr", "low", "high"
+                    ],
+                    "struct": {
+                      "types": [
+                        {
+                          "i8": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        {
+                          "i8": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        {
+                          "i8": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        }
+                      ],
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "common": {
+                    "direct": {}
+                  },
+                  "virtualTable": {
+                    "values": [{
+                      "fields": [{
+                        "i8": 2,
+                        "nullable": false
+                      }, {
+                        "i8": 1,
+                        "nullable": false
+                      }, {
+                        "i8": 3,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "i8": 4,
+                        "nullable": false
+                      }, {
+                        "i8": 1,
+                        "nullable": false
+                      }, {
+                        "i8": 2,
+                        "nullable": false
+                      }]
+                    }]
+                  }
+                }
+              }
+            }
+          },
+          "names": [
+            "result"
+          ]
+        }
+      }
+    ]
+  }
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_and_not.substrait.json b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_and_not.substrait.json
new file mode 100644
index 0000000000000..8365b1edfe250
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_and_not.substrait.json
@@ -0,0 +1,132 @@
+{
+    "extensionUris": [
+      {
+        "extensionUriAnchor": 1,
+        "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml"
+      }
+    ],
+    "extensions": [
+      {
+        "extensionFunction": {
+          "extensionUriReference": 1,
+          "functionAnchor": 1,
+          "name": "and_not:bool_bool"
+        }
+      }
+    ],
+    "relations": [
+      {
+        "root": {
+          "input": {
+            "project": {
+              "common": {
+                "direct": {}
+              },
+              "expressions": [
+                {
+                  "scalarFunction": {
+                    "arguments": [{
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }],
+                    "functionReference": 1,
+                    "outputType": {
+                      "bool": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    }
+                  }
+                }
+              ],
+              "input": {
+                "read": {
+                  "baseSchema": {
+                    "names": [
+                      "a", "b"
+                    ],
+                    "struct": {
+                      "types": [
+                        {
+                          "bool": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        {
+                          "bool": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        }
+                      ],
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "common": {
+                    "direct": {}
+                  },
+                  "virtualTable": {
+                    "values": [{
+                      "fields": [{
+                        "boolean": true,
+                        "nullable": false
+                      }, {
+                        "boolean": true,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": true,
+                        "nullable": false
+                      }, {
+                        "boolean": false,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": false,
+                        "nullable": false
+                      }, {
+                        "boolean": true,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": false,
+                        "nullable": false
+                      }, {
+                        "boolean": false,
+                        "nullable": false
+                      }]
+                    }]
+                  }
+                }
+              }
+            }
+          },
+          "names": [
+            "a", "b", "result"
+          ]
+        }
+      }
+    ]
+  }
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_xor.substrait.json b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_xor.substrait.json
new file mode 100644
index 0000000000000..cfd760de890c0
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/scalar_fn_to_built_in_binary_expr_xor.substrait.json
@@ -0,0 +1,132 @@
+{
+    "extensionUris": [
+      {
+        "extensionUriAnchor": 1,
+        "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml"
+      }
+    ],
+    "extensions": [
+      {
+        "extensionFunction": {
+          "extensionUriReference": 1,
+          "functionAnchor": 1,
+          "name": "xor:bool_bool"
+        }
+      }
+    ],
+    "relations": [
+      {
+        "root": {
+          "input": {
+            "project": {
+              "common": {
+                "direct": {}
+              },
+              "expressions": [
+                {
+                  "scalarFunction": {
+                    "arguments": [{
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 0
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }, {
+                      "value": {
+                        "selection": {
+                          "directReference": {
+                            "structField": {
+                              "field": 1
+                            }
+                          },
+                          "rootReference": {
+                          }
+                        }
+                      }
+                    }],
+                    "functionReference": 1,
+                    "outputType": {
+                      "bool": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    }
+                  }
+                }
+              ],
+              "input": {
+                "read": {
+                  "baseSchema": {
+                    "names": [
+                      "a", "b"
+                    ],
+                    "struct": {
+                      "types": [
+                        {
+                          "bool": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        },
+                        {
+                          "bool": {
+                            "nullability": "NULLABILITY_REQUIRED"
+                          }
+                        }
+                      ],
+                      "nullability": "NULLABILITY_REQUIRED"
+                    }
+                  },
+                  "common": {
+                    "direct": {}
+                  },
+                  "virtualTable": {
+                    "values": [{
+                      "fields": [{
+                        "boolean": true,
+                        "nullable": false
+                      }, {
+                        "boolean": true,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": true,
+                        "nullable": false
+                      }, {
+                        "boolean": false,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": false,
+                        "nullable": false
+                      }, {
+                        "boolean": true,
+                        "nullable": false
+                      }]
+                    }, {
+                      "fields": [{
+                        "boolean": false,
+                        "nullable": false
+                      }, {
+                        "boolean": false,
+                        "nullable": false
+                      }]
+                    }]
+                  }
+                }
+              }
+            }
+          },
+          "names": [
+            "a", "b", "result"
+          ]
+        }
+      }
+    ]
+  }
\ No newline at end of file
diff --git a/datafusion/substrait/tests/testdata/test_plans/virtual_table_with_expressions.substrait.json b/datafusion/substrait/tests/testdata/test_plans/virtual_table_with_expressions.substrait.json
new file mode 100644
index 0000000000000..2c634fa957579
--- /dev/null
+++ b/datafusion/substrait/tests/testdata/test_plans/virtual_table_with_expressions.substrait.json
@@ -0,0 +1,75 @@
+{
+    "relations": [
+      {
+        "root": {
+          "input": {
+            "read": {
+              "common": {
+                "direct": {
+                }
+              },
+              "baseSchema": {
+                "names": [
+                  "dummy1", "dummy2"
+                ],
+                "struct": {
+                  "types": [
+                    {
+                      "i64": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    },
+                    {
+                      "string": {
+                        "nullability": "NULLABILITY_REQUIRED"
+                      }
+                    }
+                  ],
+                  "nullability": "NULLABILITY_REQUIRED"
+                }
+              },
+              "virtualTable": {
+                "expressions": [
+                  {
+                    "fields": [
+                      {
+                          "literal": {
+                              "i64": "0",
+                              "nullable": false
+                          }
+                      },
+                      {
+                        "literal": {
+                            "string": "temp",
+                            "nullable": false
+                        }
+                      }
+                    ]
+                  },
+                  {
+                    "fields": [
+                      {
+                          "literal": {
+                              "i64": "1",
+                              "nullable": false
+                          }
+                      },
+                      {
+                        "literal": {
+                            "string": "test",
+                            "nullable": false
+                        }
+                      }
+                    ]
+                  }
+                ]
+              }
+            }
+          },
+          "names": [
+            "result1", "result2"
+          ]
+        }
+      }
+    ]
+  }
\ No newline at end of file
diff --git a/datafusion/substrait/tests/utils.rs b/datafusion/substrait/tests/utils.rs
index e3e3ec3fab018..4d9b5ca83e5e0 100644
--- a/datafusion/substrait/tests/utils.rs
+++ b/datafusion/substrait/tests/utils.rs
@@ -17,14 +17,14 @@
 
 #[cfg(test)]
 pub mod test {
-    use datafusion::common::{substrait_datafusion_err, substrait_err, TableReference};
-    use datafusion::datasource::empty::EmptyTable;
+    use datafusion::common::{TableReference, substrait_datafusion_err, substrait_err};
     use datafusion::datasource::TableProvider;
+    use datafusion::datasource::empty::EmptyTable;
     use datafusion::error::Result;
     use datafusion::prelude::SessionContext;
     use datafusion_substrait::extensions::Extensions;
     use datafusion_substrait::logical_plan::consumer::{
-        from_substrait_named_struct, DefaultSubstraitConsumer, SubstraitConsumer,
+        DefaultSubstraitConsumer, SubstraitConsumer, from_substrait_named_struct,
     };
     use std::collections::HashMap;
     use std::fs::File;
@@ -32,9 +32,9 @@ pub mod test {
     use std::sync::Arc;
     use substrait::proto::exchange_rel::ExchangeKind;
     use substrait::proto::expand_rel::expand_field::FieldType;
+    use substrait::proto::expression::RexType;
     use substrait::proto::expression::nested::NestedType;
     use substrait::proto::expression::subquery::SubqueryType;
-    use substrait::proto::expression::RexType;
     use substrait::proto::function_argument::ArgType;
     use substrait::proto::read_rel::{NamedTable, ReadType};
     use substrait::proto::rel::RelType;
@@ -69,12 +69,14 @@ pub mod test {
             let schema = table.schema();
             if let Some(existing_table) =
                 schema_map.insert(table_reference.clone(), table)
+                && existing_table.schema() != schema
             {
-                if existing_table.schema() != schema {
-                    return substrait_err!(
-                        "Substrait plan contained the same table {} with different schemas.\nSchema 1: {}\nSchema 2: {}",
-                        table_reference, existing_table.schema(), schema);
-                }
+                return substrait_err!(
+                    "Substrait plan contained the same table {} with different schemas.\nSchema 1: {}\nSchema 2: {}",
+                    table_reference,
+                    existing_table.schema(),
+                    schema
+                );
             }
         }
         for (table_reference, table) in schema_map.into_iter() {
@@ -150,12 +152,12 @@ pub mod test {
             let df_schema = from_substrait_named_struct(self.consumer, substrait_schema)?
                 .replace_qualifier(table_reference.clone());
 
-            let table = EmptyTable::new(df_schema.inner().clone());
+            let table = EmptyTable::new(Arc::clone(df_schema.inner()));
             self.schemas.push((table_reference, Arc::new(table)));
             Ok(())
         }
 
-        #[allow(deprecated)]
+        #[expect(deprecated)]
         fn collect_schemas_from_rel(&mut self, rel: &Rel) -> Result<()> {
             let rel_type = rel
                 .rel_type
@@ -482,7 +484,9 @@ pub mod test {
                 }
                 RexType::DynamicParameter(_) => {}
                 // Enum is deprecated
+                #[expect(deprecated)]
                 RexType::Enum(_) => {}
+                RexType::Lambda(_) | RexType::LambdaInvocation(_) => {}
             }
             Ok(())
         }
diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml
index b43c34f197601..e033056f99845 100644
--- a/datafusion/wasmtest/Cargo.toml
+++ b/datafusion/wasmtest/Cargo.toml
@@ -30,6 +30,9 @@ rust-version = { workspace = true }
 [package.metadata.docs.rs]
 all-features = true
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
@@ -39,26 +42,30 @@ crate-type = ["cdylib", "rlib"]
 [dependencies]
 # chrono must be compiled with wasmbind feature
 chrono = { version = "0.4", features = ["wasmbind"] }
-
 # The `console_error_panic_hook` crate provides better debugging of panics by
 # logging them with `console.error`. This is great for development, but requires
 # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for
 # code size when deploying.
 console_error_panic_hook = { version = "0.1.1", optional = true }
-datafusion = { workspace = true, features = ["parquet"] }
-datafusion-common = { workspace = true, default-features = true }
+datafusion = { workspace = true, features = ["compression", "parquet", "sql"] }
+datafusion-common = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-optimizer = { workspace = true, default-features = true }
 datafusion-physical-plan = { workspace = true }
 datafusion-sql = { workspace = true }
+# needs to be compiled
 getrandom = { version = "0.3", features = ["wasm_js"] }
 wasm-bindgen = "0.2.99"
 
 [dev-dependencies]
-insta = { workspace = true }
+bytes = { workspace = true }
+futures = { workspace = true }
 object_store = { workspace = true }
 # needs to be compiled
 tokio = { workspace = true }
 url = { workspace = true }
-wasm-bindgen-test = "0.3.49"
+wasm-bindgen-test = "0.3.62"
+
+[package.metadata.cargo-machete]
+ignored = ["chrono", "getrandom"]
diff --git a/datafusion/wasmtest/README.md b/datafusion/wasmtest/README.md
index 70f4daef91034..57a12ef8b8321 100644
--- a/datafusion/wasmtest/README.md
+++ b/datafusion/wasmtest/README.md
@@ -32,7 +32,7 @@ Some of DataFusion's downstream projects compile to WASM to run in the browser.
 
 ## Setup
 
-First, [install wasm-pack](https://rustwasm.github.io/wasm-pack/installer/)
+First, [install wasm-pack](https://drager.github.io/wasm-pack/installer/)
 
 Then use wasm-pack to compile the crate from within this directory
 
@@ -40,6 +40,20 @@ Then use wasm-pack to compile the crate from within this directory
 wasm-pack build
 ```
 
+### Apple silicon
+
+The default installation of Clang on Apple silicon does not support wasm, so you'll need to install LLVM Clang. For example via Homebrew:
+
+```sh
+brew install llvm
+# You will also need to install wasm-bindgen-cli separately, changing version as needed (0.3.53 = 0.2.103)
+cargo install wasm-bindgen-cli@0.2.103
+# Need to run commands like so, unless you edit your PATH to prepend the LLVM version of Clang
+PATH="/opt/homebrew/opt/llvm/bin:$PATH" RUSTFLAGS='--cfg getrandom_backend="wasm_js"' wasm-pack build
+```
+
+- For reference: https://github.com/briansmith/ring/issues/1824
+
 ## Try it out
 
 The `datafusion-wasm-app` directory contains a simple app (created with [`create-wasm-app`](https://github.com/rustwasm/create-wasm-app) and then manually updated to WebPack 5) that invokes DataFusion and writes results to the browser console.
diff --git a/datafusion/wasmtest/datafusion-wasm-app/package-lock.json b/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
index c018e779fcbf3..8f175b0001229 100644
--- a/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
+++ b/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
@@ -13,9 +13,9 @@
       },
       "devDependencies": {
         "copy-webpack-plugin": "12.0.2",
-        "webpack": "5.94.0",
+        "webpack": "5.105.0",
         "webpack-cli": "5.1.4",
-        "webpack-dev-server": "4.15.1"
+        "webpack-dev-server": "5.2.1"
       }
     },
     "../pkg": {
@@ -32,17 +32,13 @@
       }
     },
     "node_modules/@jridgewell/gen-mapping": {
-      "version": "0.3.5",
-      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz",
-      "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==",
+      "version": "0.3.13",
+      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz",
+      "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==",
       "dev": true,
       "dependencies": {
-        "@jridgewell/set-array": "^1.2.1",
-        "@jridgewell/sourcemap-codec": "^1.4.10",
+        "@jridgewell/sourcemap-codec": "^1.5.0",
         "@jridgewell/trace-mapping": "^0.3.24"
-      },
-      "engines": {
-        "node": ">=6.0.0"
       }
     },
     "node_modules/@jridgewell/resolve-uri": {
@@ -54,19 +50,10 @@
         "node": ">=6.0.0"
       }
     },
-    "node_modules/@jridgewell/set-array": {
-      "version": "1.2.1",
-      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
-      "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
-      "dev": true,
-      "engines": {
-        "node": ">=6.0.0"
-      }
-    },
     "node_modules/@jridgewell/source-map": {
-      "version": "0.3.6",
-      "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz",
-      "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==",
+      "version": "0.3.11",
+      "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.11.tgz",
+      "integrity": "sha512-ZMp1V8ZFcPG5dIWnQLr3NSI1MiCU7UETdS/A0G8V/XWHvJv3ZsFqutJn1Y5RPmAPX6F3BiE397OqveU/9NCuIA==",
       "dev": true,
       "dependencies": {
         "@jridgewell/gen-mapping": "^0.3.5",
@@ -74,15 +61,15 @@
       }
     },
     "node_modules/@jridgewell/sourcemap-codec": {
-      "version": "1.5.0",
-      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
-      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
+      "version": "1.5.5",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
+      "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==",
       "dev": true
     },
     "node_modules/@jridgewell/trace-mapping": {
-      "version": "0.3.25",
-      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
-      "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
+      "version": "0.3.31",
+      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz",
+      "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==",
       "dev": true,
       "dependencies": {
         "@jridgewell/resolve-uri": "^3.1.0",
@@ -90,10 +77,11 @@
       }
     },
     "node_modules/@leichtgewicht/ip-codec": {
-      "version": "2.0.4",
-      "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.4.tgz",
-      "integrity": "sha512-Hcv+nVC0kZnQ3tD9GVu5xSMR4VVYOteQIr/hwFPVEvPdlXqgGEuRjiheChHgdM+JyqdgNcmzZOX/tnl0JOiI7A==",
-      "dev": true
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.5.tgz",
+      "integrity": "sha512-Vo+PSpZG2/fmgmiNzYK9qWRh8h/CHrwD0mo1h1DzL4yzHNSfWYujGTYsWGreD000gcgmZ7K4Ys6Tx9TxtsKdDw==",
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/@nodelib/fs.scandir": {
       "version": "2.1.5",
@@ -157,10 +145,11 @@
       }
     },
     "node_modules/@types/bonjour": {
-      "version": "3.5.11",
-      "resolved": "https://registry.npmjs.org/@types/bonjour/-/bonjour-3.5.11.tgz",
-      "integrity": "sha512-isGhjmBtLIxdHBDl2xGwUzEM8AOyOvWsADWq7rqirdi/ZQoHnLWErHvsThcEzTX8juDRiZtzp2Qkv5bgNh6mAg==",
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@types/bonjour/-/bonjour-3.5.13.tgz",
+      "integrity": "sha512-z9fJ5Im06zvUL548KvYNecEVlA7cVDkGUi6kZusb04mpyEFKCIZJvloCcmpmLaIahDpOQGHaHmG6imtPMmPXGQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/node": "*"
       }
@@ -175,26 +164,48 @@
       }
     },
     "node_modules/@types/connect-history-api-fallback": {
-      "version": "1.5.1",
-      "resolved": "https://registry.npmjs.org/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.5.1.tgz",
-      "integrity": "sha512-iaQslNbARe8fctL5Lk+DsmgWOM83lM+7FzP0eQUJs1jd3kBE8NWqBTIT2S8SqQOJjxvt2eyIjpOuYeRXq2AdMw==",
+      "version": "1.5.4",
+      "resolved": "https://registry.npmjs.org/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.5.4.tgz",
+      "integrity": "sha512-n6Cr2xS1h4uAulPRdlw6Jl6s1oG8KrVilPN2yUITEs+K48EzMJJ3W1xy8K5eWuFvjp3R74AOIGSmp2UfBJ8HFw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/express-serve-static-core": "*",
         "@types/node": "*"
       }
     },
+    "node_modules/@types/eslint": {
+      "version": "9.6.1",
+      "resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-9.6.1.tgz",
+      "integrity": "sha512-FXx2pKgId/WyYo2jXw63kk7/+TY7u7AziEJxJAnSFzHlqTAS3Ync6SvgYAN/k4/PQpnnVuzoMuVnByKK2qp0ag==",
+      "dev": true,
+      "dependencies": {
+        "@types/estree": "*",
+        "@types/json-schema": "*"
+      }
+    },
+    "node_modules/@types/eslint-scope": {
+      "version": "3.7.7",
+      "resolved": "https://registry.npmjs.org/@types/eslint-scope/-/eslint-scope-3.7.7.tgz",
+      "integrity": "sha512-MzMFlSLBqNF2gcHWO0G1vP/YQyfvrxZ0bF+u7mzUdZ1/xK4A4sru+nraZz5i3iEIk1l1uyicaDVTB4QbbEkAYg==",
+      "dev": true,
+      "dependencies": {
+        "@types/eslint": "*",
+        "@types/estree": "*"
+      }
+    },
     "node_modules/@types/estree": {
-      "version": "1.0.5",
-      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz",
-      "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==",
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
+      "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
       "dev": true
     },
     "node_modules/@types/express": {
-      "version": "4.17.17",
-      "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.17.tgz",
-      "integrity": "sha512-Q4FmmuLGBG58btUnfS1c1r/NQdlp3DMfGDGig8WhfpA2YRUtEkxAjkZb0yvplJGYdF1fsQ81iMDcH24sSCNC/Q==",
+      "version": "4.17.22",
+      "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.22.tgz",
+      "integrity": "sha512-eZUmSnhRX9YRSkplpz0N+k6NljUUn5l3EWZIKZvYzhvMphEuNiyyy1viH/ejgt66JWgALwC/gtSUAeQKtSwW/w==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/body-parser": "*",
         "@types/express-serve-static-core": "^4.17.33",
@@ -230,9 +241,9 @@
       }
     },
     "node_modules/@types/json-schema": {
-      "version": "7.0.13",
-      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.13.tgz",
-      "integrity": "sha512-RbSSoHliUbnXj3ny0CNFOoxrIDV6SUGyStHsvDqosw6CkdPV8TtWGlfecuK4ToyMEAql6pzNxgCFKanovUzlgQ==",
+      "version": "7.0.15",
+      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
+      "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==",
       "dev": true
     },
     "node_modules/@types/mime": {
@@ -247,6 +258,16 @@
       "integrity": "sha512-HksnYH4Ljr4VQgEy2lTStbCKv/P590tmPe5HqOnv9Gprffgv5WXAY+Y5Gqniu0GGqeTCUdBnzC3QSrzPkBkAMA==",
       "dev": true
     },
+    "node_modules/@types/node-forge": {
+      "version": "1.3.11",
+      "resolved": "https://registry.npmjs.org/@types/node-forge/-/node-forge-1.3.11.tgz",
+      "integrity": "sha512-FQx220y22OKNTqaByeBGqHWYz4cl94tpcxeFdvBo3wjG6XPBuZ0BNgNZRV5J5TFmmcsJ4IzsLkmGRiQbnYsBEQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*"
+      }
+    },
     "node_modules/@types/qs": {
       "version": "6.9.8",
       "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.8.tgz",
@@ -260,10 +281,11 @@
       "dev": true
     },
     "node_modules/@types/retry": {
-      "version": "0.12.0",
-      "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz",
-      "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==",
-      "dev": true
+      "version": "0.12.2",
+      "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.2.tgz",
+      "integrity": "sha512-XISRgDJ2Tc5q4TRqvgJtzsRkFYNJzZrhTdtMoGVBttwzzQJkPnS3WWTFc7kuDRoPtPakl+T+OfdEUjYJj7Jbow==",
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/@types/send": {
       "version": "0.17.1",
@@ -276,186 +298,190 @@
       }
     },
     "node_modules/@types/serve-index": {
-      "version": "1.9.1",
-      "resolved": "https://registry.npmjs.org/@types/serve-index/-/serve-index-1.9.1.tgz",
-      "integrity": "sha512-d/Hs3nWDxNL2xAczmOVZNj92YZCS6RGxfBPjKzuu/XirCgXdpKEb88dYNbrYGint6IVWLNP+yonwVAuRC0T2Dg==",
+      "version": "1.9.4",
+      "resolved": "https://registry.npmjs.org/@types/serve-index/-/serve-index-1.9.4.tgz",
+      "integrity": "sha512-qLpGZ/c2fhSs5gnYsQxtDEq3Oy8SXPClIXkW5ghvAvsNuVSA8k+gCONcUCS/UjLEYvYps+e8uBtfgXgvhwfNug==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/express": "*"
       }
     },
     "node_modules/@types/serve-static": {
-      "version": "1.15.2",
-      "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.2.tgz",
-      "integrity": "sha512-J2LqtvFYCzaj8pVYKw8klQXrLLk7TBZmQ4ShlcdkELFKGwGMfevMLneMMRkMgZxotOD9wg497LpC7O8PcvAmfw==",
+      "version": "1.15.7",
+      "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.7.tgz",
+      "integrity": "sha512-W8Ym+h8nhuRwaKPaDw34QUkwsGi6Rc4yYqvKFo5rm2FUEhCFbzVWrxXUxuKK8TASjWsysJY0nsmNCGhCOIsrOw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/http-errors": "*",
-        "@types/mime": "*",
-        "@types/node": "*"
+        "@types/node": "*",
+        "@types/send": "*"
       }
     },
     "node_modules/@types/sockjs": {
-      "version": "0.3.33",
-      "resolved": "https://registry.npmjs.org/@types/sockjs/-/sockjs-0.3.33.tgz",
-      "integrity": "sha512-f0KEEe05NvUnat+boPTZ0dgaLZ4SfSouXUgv5noUiefG2ajgKjmETo9ZJyuqsl7dfl2aHlLJUiki6B4ZYldiiw==",
+      "version": "0.3.36",
+      "resolved": "https://registry.npmjs.org/@types/sockjs/-/sockjs-0.3.36.tgz",
+      "integrity": "sha512-MK9V6NzAS1+Ud7JV9lJLFqW85VbC9dq3LmwZCuBe4wBDgKC0Kj/jd8Xl+nSviU+Qc3+m7umHHyHg//2KSa0a0Q==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/node": "*"
       }
     },
     "node_modules/@types/ws": {
-      "version": "8.5.5",
-      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.5.5.tgz",
-      "integrity": "sha512-lwhs8hktwxSjf9UaZ9tG5M03PGogvFaH8gUgLNbN9HKIg0dvv6q+gkSuJ8HN4/VbyxkuLzCjlN7GquQ0gUJfIg==",
+      "version": "8.18.1",
+      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
+      "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@types/node": "*"
       }
     },
     "node_modules/@webassemblyjs/ast": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz",
-      "integrity": "sha512-EKfMUOPRRUTy5UII4qJDGPpqfwjOmZ5jeGFwid9mnoqIFK+e0vqoi1qH56JpmZSzEL53jKnNzScdmftJyG5xWg==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.14.1.tgz",
+      "integrity": "sha512-nuBEDgQfm1ccRp/8bCQrx1frohyufl4JlbMMZ4P1wpeOfDhF6FQkxZJ1b/e+PLwr6X1Nhw6OLme5usuBWYBvuQ==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/helper-numbers": "1.11.6",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6"
+        "@webassemblyjs/helper-numbers": "1.13.2",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2"
       }
     },
     "node_modules/@webassemblyjs/floating-point-hex-parser": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.6.tgz",
-      "integrity": "sha512-ejAj9hfRJ2XMsNHk/v6Fu2dGS+i4UaXBXGemOfQ/JfQ6mdQg/WXtwleQRLLS4OvfDhv8rYnVwH27YJLMyYsxhw==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.13.2.tgz",
+      "integrity": "sha512-6oXyTOzbKxGH4steLbLNOu71Oj+C8Lg34n6CqRvqfS2O71BxY6ByfMDRhBytzknj9yGUPVJ1qIKhRlAwO1AovA==",
       "dev": true
     },
     "node_modules/@webassemblyjs/helper-api-error": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.6.tgz",
-      "integrity": "sha512-o0YkoP4pVu4rN8aTJgAyj9hC2Sv5UlkzCHhxqWj8butaLvnpdc2jOwh4ewE6CX0txSfLn/UYaV/pheS2Txg//Q==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.13.2.tgz",
+      "integrity": "sha512-U56GMYxy4ZQCbDZd6JuvvNV/WFildOjsaWD3Tzzvmw/mas3cXzRJPMjP83JqEsgSbyrmaGjBfDtV7KDXV9UzFQ==",
       "dev": true
     },
     "node_modules/@webassemblyjs/helper-buffer": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.12.1.tgz",
-      "integrity": "sha512-nzJwQw99DNDKr9BVCOZcLuJJUlqkJh+kVzVl6Fmq/tI5ZtEyWT1KZMyOXltXLZJmDtvLCDgwsyrkohEtopTXCw==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.14.1.tgz",
+      "integrity": "sha512-jyH7wtcHiKssDtFPRB+iQdxlDf96m0E39yb0k5uJVhFGleZFoNw1c4aeIcVUPPbXUVJ94wwnMOAqUHyzoEPVMA==",
       "dev": true
     },
     "node_modules/@webassemblyjs/helper-numbers": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.6.tgz",
-      "integrity": "sha512-vUIhZ8LZoIWHBohiEObxVm6hwP034jwmc9kuq5GdHZH0wiLVLIPcMCdpJzG4C11cHoQ25TFIQj9kaVADVX7N3g==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.13.2.tgz",
+      "integrity": "sha512-FE8aCmS5Q6eQYcV3gI35O4J789wlQA+7JrqTTpJqn5emA4U2hvwJmvFRC0HODS+3Ye6WioDklgd6scJ3+PLnEA==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/floating-point-hex-parser": "1.11.6",
-        "@webassemblyjs/helper-api-error": "1.11.6",
+        "@webassemblyjs/floating-point-hex-parser": "1.13.2",
+        "@webassemblyjs/helper-api-error": "1.13.2",
         "@xtuc/long": "4.2.2"
       }
     },
     "node_modules/@webassemblyjs/helper-wasm-bytecode": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.6.tgz",
-      "integrity": "sha512-sFFHKwcmBprO9e7Icf0+gddyWYDViL8bpPjJJl0WHxCdETktXdmtWLGVzoHbqUcY4Be1LkNfwTmXOJUFZYSJdA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.13.2.tgz",
+      "integrity": "sha512-3QbLKy93F0EAIXLh0ogEVR6rOubA9AoZ+WRYhNbFyuB70j3dRdwH9g+qXhLAO0kiYGlg3TxDV+I4rQTr/YNXkA==",
       "dev": true
     },
     "node_modules/@webassemblyjs/helper-wasm-section": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.12.1.tgz",
-      "integrity": "sha512-Jif4vfB6FJlUlSbgEMHUyk1j234GTNG9dBJ4XJdOySoj518Xj0oGsNi59cUQF4RRMS9ouBUxDDdyBVfPTypa5g==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.14.1.tgz",
+      "integrity": "sha512-ds5mXEqTJ6oxRoqjhWDU83OgzAYjwsCV8Lo/N+oRsNDmx/ZDpqalmrtgOMkHwxsG0iI//3BwWAErYRHtgn0dZw==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/wasm-gen": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/wasm-gen": "1.14.1"
       }
     },
     "node_modules/@webassemblyjs/ieee754": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.11.6.tgz",
-      "integrity": "sha512-LM4p2csPNvbij6U1f19v6WR56QZ8JcHg3QIJTlSwzFcmx6WSORicYj6I63f9yU1kEUtrpG+kjkiIAkevHpDXrg==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.13.2.tgz",
+      "integrity": "sha512-4LtOzh58S/5lX4ITKxnAK2USuNEvpdVV9AlgGQb8rJDHaLeHciwG4zlGr0j/SNWlr7x3vO1lDEsuePvtcDNCkw==",
       "dev": true,
       "dependencies": {
         "@xtuc/ieee754": "^1.2.0"
       }
     },
     "node_modules/@webassemblyjs/leb128": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.11.6.tgz",
-      "integrity": "sha512-m7a0FhE67DQXgouf1tbN5XQcdWoNgaAuoULHIfGFIEVKA6tu/edls6XnIlkmS6FrXAquJRPni3ZZKjw6FSPjPQ==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.13.2.tgz",
+      "integrity": "sha512-Lde1oNoIdzVzdkNEAWZ1dZ5orIbff80YPdHx20mrHwHrVNNTjNr8E3xz9BdpcGqRQbAEa+fkrCb+fRFTl/6sQw==",
       "dev": true,
       "dependencies": {
         "@xtuc/long": "4.2.2"
       }
     },
     "node_modules/@webassemblyjs/utf8": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.11.6.tgz",
-      "integrity": "sha512-vtXf2wTQ3+up9Zsg8sa2yWiQpzSsMyXj0qViVP6xKGCUT8p8YJ6HqI7l5eCnWx1T/FYdsv07HQs2wTFbbof/RA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.13.2.tgz",
+      "integrity": "sha512-3NQWGjKTASY1xV5m7Hr0iPeXD9+RDobLll3T9d2AO+g3my8xy5peVyjSag4I50mR1bBSN/Ct12lo+R9tJk0NZQ==",
       "dev": true
     },
     "node_modules/@webassemblyjs/wasm-edit": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.12.1.tgz",
-      "integrity": "sha512-1DuwbVvADvS5mGnXbE+c9NfA8QRcZ6iKquqjjmR10k6o+zzsRVesil54DKexiowcFCPdr/Q0qaMgB01+SQ1u6g==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.14.1.tgz",
+      "integrity": "sha512-RNJUIQH/J8iA/1NzlE4N7KtyZNHi3w7at7hDjvRNm5rcUXa00z1vRz3glZoULfJ5mpvYhLybmVcwcjGrC1pRrQ==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/helper-wasm-section": "1.12.1",
-        "@webassemblyjs/wasm-gen": "1.12.1",
-        "@webassemblyjs/wasm-opt": "1.12.1",
-        "@webassemblyjs/wasm-parser": "1.12.1",
-        "@webassemblyjs/wast-printer": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/helper-wasm-section": "1.14.1",
+        "@webassemblyjs/wasm-gen": "1.14.1",
+        "@webassemblyjs/wasm-opt": "1.14.1",
+        "@webassemblyjs/wasm-parser": "1.14.1",
+        "@webassemblyjs/wast-printer": "1.14.1"
       }
     },
     "node_modules/@webassemblyjs/wasm-gen": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.12.1.tgz",
-      "integrity": "sha512-TDq4Ojh9fcohAw6OIMXqiIcTq5KUXTGRkVxbSo1hQnSy6lAM5GSdfwWeSxpAo0YzgsgF182E/U0mDNhuA0tW7w==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.14.1.tgz",
+      "integrity": "sha512-AmomSIjP8ZbfGQhumkNvgC33AY7qtMCXnN6bL2u2Js4gVCg8fp735aEiMSBbDR7UQIj90n4wKAFUSEd0QN2Ukg==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/ieee754": "1.11.6",
-        "@webassemblyjs/leb128": "1.11.6",
-        "@webassemblyjs/utf8": "1.11.6"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/ieee754": "1.13.2",
+        "@webassemblyjs/leb128": "1.13.2",
+        "@webassemblyjs/utf8": "1.13.2"
       }
     },
     "node_modules/@webassemblyjs/wasm-opt": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.12.1.tgz",
-      "integrity": "sha512-Jg99j/2gG2iaz3hijw857AVYekZe2SAskcqlWIZXjji5WStnOpVoat3gQfT/Q5tb2djnCjBtMocY/Su1GfxPBg==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.14.1.tgz",
+      "integrity": "sha512-PTcKLUNvBqnY2U6E5bdOQcSM+oVP/PmrDY9NzowJjislEjwP/C4an2303MCVS2Mg9d3AJpIGdUFIQQWbPds0Sw==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/wasm-gen": "1.12.1",
-        "@webassemblyjs/wasm-parser": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/wasm-gen": "1.14.1",
+        "@webassemblyjs/wasm-parser": "1.14.1"
       }
     },
     "node_modules/@webassemblyjs/wasm-parser": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.12.1.tgz",
-      "integrity": "sha512-xikIi7c2FHXysxXe3COrVUPSheuBtpcfhbpFj4gmu7KRLYOzANztwUU0IbsqvMqzuNK2+glRGWCEqZo1WCLyAQ==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.14.1.tgz",
+      "integrity": "sha512-JLBl+KZ0R5qB7mCnud/yyX08jWFw5MsoalJ1pQ4EdFlgj9VdXKGuENGsiCIjegI1W7p91rUlcB/LB5yRJKNTcQ==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-api-error": "1.11.6",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/ieee754": "1.11.6",
-        "@webassemblyjs/leb128": "1.11.6",
-        "@webassemblyjs/utf8": "1.11.6"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-api-error": "1.13.2",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/ieee754": "1.13.2",
+        "@webassemblyjs/leb128": "1.13.2",
+        "@webassemblyjs/utf8": "1.13.2"
       }
     },
     "node_modules/@webassemblyjs/wast-printer": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.12.1.tgz",
-      "integrity": "sha512-+X4WAlOisVWQMikjbcvY2e0rwPsKQ9F688lksZhBcPycBBuii3O7m8FACbDMWDojpAqvjIncrG8J0XHKyQfVeA==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.14.1.tgz",
+      "integrity": "sha512-kPSSXE6De1XOR820C90RIo2ogvZG+c3KiHzqUoO/F34Y2shGzesfqv7o57xrxovZJH/MetF5UjroJ/R/3isoiw==",
       "dev": true,
       "dependencies": {
-        "@webassemblyjs/ast": "1.12.1",
+        "@webassemblyjs/ast": "1.14.1",
         "@xtuc/long": "4.2.2"
       }
     },
@@ -529,9 +555,9 @@
       }
     },
     "node_modules/acorn": {
-      "version": "8.12.1",
-      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.12.1.tgz",
-      "integrity": "sha512-tcpGyI9zbizT9JbV6oYE477V6mTlXvvi0T0G3SNIYE2apm/G5huBa1+K89VGeovbg+jycCrfhl3ADxErOuO6Jg==",
+      "version": "8.15.0",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
+      "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "bin": {
         "acorn": "bin/acorn"
@@ -540,25 +566,28 @@
         "node": ">=0.4.0"
       }
     },
-    "node_modules/acorn-import-attributes": {
-      "version": "1.9.5",
-      "resolved": "https://registry.npmjs.org/acorn-import-attributes/-/acorn-import-attributes-1.9.5.tgz",
-      "integrity": "sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==",
+    "node_modules/acorn-import-phases": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/acorn-import-phases/-/acorn-import-phases-1.0.4.tgz",
+      "integrity": "sha512-wKmbr/DDiIXzEOiWrTTUcDm24kQ2vGfZQvM2fwg2vXqR5uW6aapr7ObPtj1th32b9u90/Pf4AItvdTh42fBmVQ==",
       "dev": true,
+      "engines": {
+        "node": ">=10.13.0"
+      },
       "peerDependencies": {
-        "acorn": "^8"
+        "acorn": "^8.14.0"
       }
     },
     "node_modules/ajv": {
-      "version": "6.12.6",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
-      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+      "version": "8.17.1",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
+      "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
       "dev": true,
       "dependencies": {
-        "fast-deep-equal": "^3.1.1",
-        "fast-json-stable-stringify": "^2.0.0",
-        "json-schema-traverse": "^0.4.1",
-        "uri-js": "^4.2.2"
+        "fast-deep-equal": "^3.1.3",
+        "fast-uri": "^3.0.1",
+        "json-schema-traverse": "^1.0.0",
+        "require-from-string": "^2.0.2"
       },
       "funding": {
         "type": "github",
@@ -582,35 +611,16 @@
         }
       }
     },
-    "node_modules/ajv-formats/node_modules/ajv": {
-      "version": "8.12.0",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-      "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
+    "node_modules/ajv-keywords": {
+      "version": "5.1.0",
+      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
+      "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
       "dev": true,
       "dependencies": {
-        "fast-deep-equal": "^3.1.1",
-        "json-schema-traverse": "^1.0.0",
-        "require-from-string": "^2.0.2",
-        "uri-js": "^4.2.2"
+        "fast-deep-equal": "^3.1.3"
       },
-      "funding": {
-        "type": "github",
-        "url": "https://github.com/sponsors/epoberezkin"
-      }
-    },
-    "node_modules/ajv-formats/node_modules/json-schema-traverse": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-      "dev": true
-    },
-    "node_modules/ajv-keywords": {
-      "version": "3.5.2",
-      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz",
-      "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==",
-      "dev": true,
       "peerDependencies": {
-        "ajv": "^6.9.1"
+        "ajv": "^8.8.2"
       }
     },
     "node_modules/ansi-html-community": {
@@ -630,6 +640,7 @@
       "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
       "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==",
       "dev": true,
+      "license": "ISC",
       "dependencies": {
         "normalize-path": "^3.0.0",
         "picomatch": "^2.0.4"
@@ -639,16 +650,20 @@
       }
     },
     "node_modules/array-flatten": {
-      "version": "2.1.2",
-      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-2.1.2.tgz",
-      "integrity": "sha512-hNfzcOV8W4NdualtqBFPyVO+54DSJuZGY9qT4pRroB6S9e3iiido2ISIC5h9R2sPJ8H3FHCIiEnsv1lPXO3KtQ==",
-      "dev": true
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
+      "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==",
+      "dev": true,
+      "license": "MIT"
     },
-    "node_modules/balanced-match": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
-      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
-      "dev": true
+    "node_modules/baseline-browser-mapping": {
+      "version": "2.9.19",
+      "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.19.tgz",
+      "integrity": "sha512-ipDqC8FrAl/76p2SSWKSI+H9tFwm7vYqXQrItCuiVPt26Km0jS+NzSsBWAaBusvSbQcfJG+JitdMm+wZAgTYqg==",
+      "dev": true,
+      "bin": {
+        "baseline-browser-mapping": "dist/cli.js"
+      }
     },
     "node_modules/batch": {
       "version": "0.6.1",
@@ -657,12 +672,16 @@
       "dev": true
     },
     "node_modules/binary-extensions": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz",
-      "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==",
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz",
+      "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/body-parser": {
@@ -670,6 +689,7 @@
       "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz",
       "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "bytes": "3.1.2",
         "content-type": "~1.0.5",
@@ -689,20 +709,12 @@
         "npm": "1.2.8000 || >= 1.4.16"
       }
     },
-    "node_modules/body-parser/node_modules/bytes": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
-      "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
-      "dev": true,
-      "engines": {
-        "node": ">= 0.8"
-      }
-    },
     "node_modules/body-parser/node_modules/debug": {
       "version": "2.6.9",
       "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
       "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "ms": "2.0.0"
       }
@@ -712,32 +724,22 @@
       "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
       "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
     },
     "node_modules/bonjour-service": {
-      "version": "1.1.1",
-      "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.1.1.tgz",
-      "integrity": "sha512-Z/5lQRMOG9k7W+FkeGTNjh7htqn/2LMnfOvBZ8pynNZCM9MwkQkI3zeI4oz09uWdcgmgHugVvBqxGg4VQJ5PCg==",
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.3.0.tgz",
+      "integrity": "sha512-3YuAUiSkWykd+2Azjgyxei8OWf8thdn8AITIog2M4UICzoqfjlqr64WIjEXZllf/W6vK1goqleSR6brGomxQqA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "array-flatten": "^2.1.2",
-        "dns-equal": "^1.0.0",
         "fast-deep-equal": "^3.1.3",
         "multicast-dns": "^7.2.5"
       }
     },
-    "node_modules/brace-expansion": {
-      "version": "1.1.11",
-      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
-      "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
-      "dev": true,
-      "dependencies": {
-        "balanced-match": "^1.0.0",
-        "concat-map": "0.0.1"
-      }
-    },
     "node_modules/braces": {
       "version": "3.0.3",
       "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
@@ -751,9 +753,9 @@
       }
     },
     "node_modules/browserslist": {
-      "version": "4.21.11",
-      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.21.11.tgz",
-      "integrity": "sha512-xn1UXOKUz7DjdGlg9RrUr0GGiWzI97UQJnugHtH0OLDfJB7jMgoIkYvRIEO1l9EeEERVqeqLYOcFBW9ldjypbQ==",
+      "version": "4.28.1",
+      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz",
+      "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==",
       "dev": true,
       "funding": [
         {
@@ -770,10 +772,11 @@
         }
       ],
       "dependencies": {
-        "caniuse-lite": "^1.0.30001538",
-        "electron-to-chromium": "^1.4.526",
-        "node-releases": "^2.0.13",
-        "update-browserslist-db": "^1.0.13"
+        "baseline-browser-mapping": "^2.9.0",
+        "caniuse-lite": "^1.0.30001759",
+        "electron-to-chromium": "^1.5.263",
+        "node-releases": "^2.0.27",
+        "update-browserslist-db": "^1.2.0"
       },
       "bin": {
         "browserslist": "cli.js"
@@ -788,26 +791,54 @@
       "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
       "dev": true
     },
+    "node_modules/bundle-name": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz",
+      "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "run-applescript": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/bytes": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
-      "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=",
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
+      "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
       "dev": true,
       "engines": {
         "node": ">= 0.8"
       }
     },
-    "node_modules/call-bind": {
-      "version": "1.0.7",
-      "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
-      "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==",
+    "node_modules/call-bind-apply-helpers": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+      "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "es-define-property": "^1.0.0",
         "es-errors": "^1.3.0",
-        "function-bind": "^1.1.2",
-        "get-intrinsic": "^1.2.4",
-        "set-function-length": "^1.2.1"
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/call-bound": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz",
+      "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "get-intrinsic": "^1.3.0"
       },
       "engines": {
         "node": ">= 0.4"
@@ -817,9 +848,9 @@
       }
     },
     "node_modules/caniuse-lite": {
-      "version": "1.0.30001538",
-      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001538.tgz",
-      "integrity": "sha512-HWJnhnID+0YMtGlzcp3T9drmBJUVDchPJ08tpUGFLs9CYlwWPH2uLgpHn8fND5pCgXVtnGS3H4QR9XLMHVNkHw==",
+      "version": "1.0.30001768",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001768.tgz",
+      "integrity": "sha512-qY3aDRZC5nWPgHUgIB84WL+nySuo19wk0VJpp/XI9T34lrvkyhRvNVOFJOp2kxClQhiFBu+TaUSudf6oa3vkSA==",
       "dev": true,
       "funding": [
         {
@@ -837,16 +868,11 @@
       ]
     },
     "node_modules/chokidar": {
-      "version": "3.5.3",
-      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz",
-      "integrity": "sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==",
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz",
+      "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==",
       "dev": true,
-      "funding": [
-        {
-          "type": "individual",
-          "url": "https://paulmillr.com/funding/"
-        }
-      ],
+      "license": "MIT",
       "dependencies": {
         "anymatch": "~3.1.2",
         "braces": "~3.0.2",
@@ -859,6 +885,9 @@
       "engines": {
         "node": ">= 8.10.0"
       },
+      "funding": {
+        "url": "https://paulmillr.com/funding/"
+      },
       "optionalDependencies": {
         "fsevents": "~2.3.2"
       }
@@ -914,17 +943,17 @@
       }
     },
     "node_modules/compression": {
-      "version": "1.7.4",
-      "resolved": "https://registry.npmjs.org/compression/-/compression-1.7.4.tgz",
-      "integrity": "sha512-jaSIDzP9pZVS4ZfQ+TzvtiWhdpFhE2RDHz8QJkpX9SIpLq88VueF5jJw6t+6CUQcAoA6t+x89MLrWAqpfDE8iQ==",
+      "version": "1.8.1",
+      "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz",
+      "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==",
       "dev": true,
       "dependencies": {
-        "accepts": "~1.3.5",
-        "bytes": "3.0.0",
-        "compressible": "~2.0.16",
+        "bytes": "3.1.2",
+        "compressible": "~2.0.18",
         "debug": "2.6.9",
-        "on-headers": "~1.0.2",
-        "safe-buffer": "5.1.2",
+        "negotiator": "~0.6.4",
+        "on-headers": "~1.1.0",
+        "safe-buffer": "5.2.1",
         "vary": "~1.1.2"
       },
       "engines": {
@@ -940,11 +969,34 @@
         "ms": "2.0.0"
       }
     },
-    "node_modules/concat-map": {
-      "version": "0.0.1",
-      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
-      "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
-      "dev": true
+    "node_modules/compression/node_modules/negotiator": {
+      "version": "0.6.4",
+      "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz",
+      "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==",
+      "dev": true,
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/compression/node_modules/safe-buffer": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+      "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ]
     },
     "node_modules/connect-history-api-fallback": {
       "version": "2.0.0",
@@ -960,6 +1012,7 @@
       "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz",
       "integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "safe-buffer": "5.2.1"
       },
@@ -985,13 +1038,15 @@
           "type": "consulting",
           "url": "https://feross.org/support"
         }
-      ]
+      ],
+      "license": "MIT"
     },
     "node_modules/content-type": {
       "version": "1.0.5",
       "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz",
       "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -1001,6 +1056,7 @@
       "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.1.tgz",
       "integrity": "sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -1009,7 +1065,8 @@
       "version": "1.0.6",
       "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
       "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/copy-webpack-plugin": {
       "version": "12.0.2",
@@ -1036,36 +1093,6 @@
         "webpack": "^5.1.0"
       }
     },
-    "node_modules/copy-webpack-plugin/node_modules/ajv": {
-      "version": "8.17.1",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
-      "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
-      "dev": true,
-      "license": "MIT",
-      "dependencies": {
-        "fast-deep-equal": "^3.1.3",
-        "fast-uri": "^3.0.1",
-        "json-schema-traverse": "^1.0.0",
-        "require-from-string": "^2.0.2"
-      },
-      "funding": {
-        "type": "github",
-        "url": "https://github.com/sponsors/epoberezkin"
-      }
-    },
-    "node_modules/copy-webpack-plugin/node_modules/ajv-keywords": {
-      "version": "5.1.0",
-      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-      "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-      "dev": true,
-      "license": "MIT",
-      "dependencies": {
-        "fast-deep-equal": "^3.1.3"
-      },
-      "peerDependencies": {
-        "ajv": "^8.8.2"
-      }
-    },
     "node_modules/copy-webpack-plugin/node_modules/glob-parent": {
       "version": "6.0.2",
       "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz",
@@ -1079,33 +1106,6 @@
         "node": ">=10.13.0"
       }
     },
-    "node_modules/copy-webpack-plugin/node_modules/json-schema-traverse": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-      "dev": true,
-      "license": "MIT"
-    },
-    "node_modules/copy-webpack-plugin/node_modules/schema-utils": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.0.tgz",
-      "integrity": "sha512-Gf9qqc58SpCA/xdziiHz35F4GNIWYWZrEshUc/G/r5BnLph6xpKuLeoJoQuj5WfBIx/eQLf+hmVPYHaxJu7V2g==",
-      "dev": true,
-      "license": "MIT",
-      "dependencies": {
-        "@types/json-schema": "^7.0.9",
-        "ajv": "^8.9.0",
-        "ajv-formats": "^2.1.1",
-        "ajv-keywords": "^5.1.0"
-      },
-      "engines": {
-        "node": ">= 10.13.0"
-      },
-      "funding": {
-        "type": "opencollective",
-        "url": "https://opencollective.com/webpack"
-      }
-    },
     "node_modules/core-util-is": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
@@ -1146,42 +1146,47 @@
       "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==",
       "dev": true
     },
-    "node_modules/default-gateway": {
-      "version": "6.0.3",
-      "resolved": "https://registry.npmjs.org/default-gateway/-/default-gateway-6.0.3.tgz",
-      "integrity": "sha512-fwSOJsbbNzZ/CUFpqFBqYfYNLj1NbMPm8MMCIzHjC83iSJRBEGmDUxU+WP661BaBQImeC2yHwXtz+P/O9o+XEg==",
+    "node_modules/default-browser": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.2.1.tgz",
+      "integrity": "sha512-WY/3TUME0x3KPYdRRxEJJvXRHV4PyPoUsxtZa78lwItwRQRHhd2U9xOscaT/YTf8uCXIAjeJOFBVEh/7FtD8Xg==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "execa": "^5.0.0"
+        "bundle-name": "^4.1.0",
+        "default-browser-id": "^5.0.0"
       },
       "engines": {
-        "node": ">= 10"
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
-    "node_modules/define-data-property": {
-      "version": "1.1.4",
-      "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
-      "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==",
+    "node_modules/default-browser-id": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.0.tgz",
+      "integrity": "sha512-A6p/pu/6fyBcA1TRz/GqWYPViplrftcW2gZC9q79ngNCKAeR/X3gcEdXQHl4KNXV+3wgIJ1CPkJQ3IHM6lcsyA==",
       "dev": true,
-      "dependencies": {
-        "es-define-property": "^1.0.0",
-        "es-errors": "^1.3.0",
-        "gopd": "^1.0.1"
-      },
+      "license": "MIT",
       "engines": {
-        "node": ">= 0.4"
+        "node": ">=18"
       },
       "funding": {
-        "url": "https://github.com/sponsors/ljharb"
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/define-lazy-prop": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz",
-      "integrity": "sha512-Ds09qNh8yw3khSjiJjiUInaGX9xlqZDY7JVryGxdxV7NPeuqQfplOpQ66yJFZut3jLa5zOwkXw1g9EI2uKh4Og==",
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz",
+      "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
-        "node": ">=8"
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/depd": {
@@ -1198,6 +1203,7 @@
       "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz",
       "integrity": "sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8",
         "npm": "1.2.8000 || >= 1.4.16"
@@ -1209,17 +1215,12 @@
       "integrity": "sha512-ZIzRpLJrOj7jjP2miAtgqIfmzbxa4ZOr5jJc601zklsfEx9oTzmmj2nVpIPRpNlRTIh8lc1kyViIY7BWSGNmKw==",
       "dev": true
     },
-    "node_modules/dns-equal": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/dns-equal/-/dns-equal-1.0.0.tgz",
-      "integrity": "sha512-z+paD6YUQsk+AbGCEM4PrOXSss5gd66QfcVBFTKR/HpFL9jCqikS94HYwKww6fQyO7IxrIIyUu+g0Ka9tUS2Cg==",
-      "dev": true
-    },
     "node_modules/dns-packet": {
       "version": "5.6.1",
       "resolved": "https://registry.npmjs.org/dns-packet/-/dns-packet-5.6.1.tgz",
       "integrity": "sha512-l4gcSouhcgIKRvyy99RNVOgxXiicE+2jZoNmaNmZ6JXiGajBOJAesk1OBlJuM5k2c+eudGdLxDqXuPCKIj6kpw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "@leichtgewicht/ip-codec": "^2.0.1"
       },
@@ -1227,35 +1228,52 @@
         "node": ">=6"
       }
     },
+    "node_modules/dunder-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+      "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "gopd": "^1.2.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
     "node_modules/ee-first": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
       "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/electron-to-chromium": {
-      "version": "1.4.528",
-      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.528.tgz",
-      "integrity": "sha512-UdREXMXzLkREF4jA8t89FQjA8WHI6ssP38PMY4/4KhXFQbtImnghh4GkCgrtiZwLKUKVD2iTVXvDVQjfomEQuA==",
+      "version": "1.5.286",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.286.tgz",
+      "integrity": "sha512-9tfDXhJ4RKFNerfjdCcZfufu49vg620741MNs26a9+bhLThdB+plgMeou98CAaHu/WATj2iHOOHTp1hWtABj2A==",
       "dev": true
     },
     "node_modules/encodeurl": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
-      "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
+      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
     },
     "node_modules/enhanced-resolve": {
-      "version": "5.17.1",
-      "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.17.1.tgz",
-      "integrity": "sha512-LMHl3dXhTcfv8gM4kEzIUeTQ+7fpdA0l2tUf34BddXPkz2A5xJ5L/Pchd5BL6rdccM9QGvu0sWZzK1Z1t4wwyg==",
+      "version": "5.19.0",
+      "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.19.0.tgz",
+      "integrity": "sha512-phv3E1Xl4tQOShqSte26C7Fl84EwUdZsyOuSSk9qtAGyyQs2s3jJzComh+Abf4g187lUUAvH+H26omrqia2aGg==",
       "dev": true,
       "dependencies": {
         "graceful-fs": "^4.2.4",
-        "tapable": "^2.2.0"
+        "tapable": "^2.3.0"
       },
       "engines": {
         "node": ">=10.13.0"
@@ -1274,13 +1292,11 @@
       }
     },
     "node_modules/es-define-property": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz",
-      "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==",
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+      "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
       "dev": true,
-      "dependencies": {
-        "get-intrinsic": "^1.2.4"
-      },
+      "license": "MIT",
       "engines": {
         "node": ">= 0.4"
       }
@@ -1290,20 +1306,34 @@
       "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
       "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.4"
       }
     },
     "node_modules/es-module-lexer": {
-      "version": "1.3.1",
-      "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.3.1.tgz",
-      "integrity": "sha512-JUFAyicQV9mXc3YRxPnDlrfBKpqt6hUYzz9/boprUJHs4e4KVr3XwOF70doO6gwXUor6EWZJAyWAfKki84t20Q==",
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.0.0.tgz",
+      "integrity": "sha512-5POEcUuZybH7IdmGsD8wlf0AI55wMecM9rVBTI/qEAy2c1kTOm3DjFYjrBdI2K3BaJjJYfYFeRtM0t9ssnRuxw==",
       "dev": true
     },
+    "node_modules/es-object-atoms": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+      "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
     "node_modules/escalade": {
-      "version": "3.1.1",
-      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
-      "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==",
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
       "dev": true,
       "engines": {
         "node": ">=6"
@@ -1363,6 +1393,7 @@
       "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
       "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -1382,82 +1413,58 @@
         "node": ">=0.8.x"
       }
     },
-    "node_modules/execa": {
-      "version": "5.1.1",
-      "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz",
-      "integrity": "sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==",
-      "dev": true,
-      "dependencies": {
-        "cross-spawn": "^7.0.3",
-        "get-stream": "^6.0.0",
-        "human-signals": "^2.1.0",
-        "is-stream": "^2.0.0",
-        "merge-stream": "^2.0.0",
-        "npm-run-path": "^4.0.1",
-        "onetime": "^5.1.2",
-        "signal-exit": "^3.0.3",
-        "strip-final-newline": "^2.0.0"
-      },
-      "engines": {
-        "node": ">=10"
-      },
-      "funding": {
-        "url": "https://github.com/sindresorhus/execa?sponsor=1"
-      }
-    },
     "node_modules/express": {
-      "version": "4.21.1",
-      "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz",
-      "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==",
+      "version": "4.22.1",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz",
+      "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==",
       "dev": true,
       "dependencies": {
         "accepts": "~1.3.8",
         "array-flatten": "1.1.1",
-        "body-parser": "1.20.3",
-        "content-disposition": "0.5.4",
+        "body-parser": "~1.20.3",
+        "content-disposition": "~0.5.4",
         "content-type": "~1.0.4",
-        "cookie": "0.7.1",
-        "cookie-signature": "1.0.6",
+        "cookie": "~0.7.1",
+        "cookie-signature": "~1.0.6",
         "debug": "2.6.9",
         "depd": "2.0.0",
         "encodeurl": "~2.0.0",
         "escape-html": "~1.0.3",
         "etag": "~1.8.1",
-        "finalhandler": "1.3.1",
-        "fresh": "0.5.2",
-        "http-errors": "2.0.0",
+        "finalhandler": "~1.3.1",
+        "fresh": "~0.5.2",
+        "http-errors": "~2.0.0",
         "merge-descriptors": "1.0.3",
         "methods": "~1.1.2",
-        "on-finished": "2.4.1",
+        "on-finished": "~2.4.1",
         "parseurl": "~1.3.3",
-        "path-to-regexp": "0.1.10",
+        "path-to-regexp": "~0.1.12",
         "proxy-addr": "~2.0.7",
-        "qs": "6.13.0",
+        "qs": "~6.14.0",
         "range-parser": "~1.2.1",
         "safe-buffer": "5.2.1",
-        "send": "0.19.0",
-        "serve-static": "1.16.2",
+        "send": "~0.19.0",
+        "serve-static": "~1.16.2",
         "setprototypeof": "1.2.0",
-        "statuses": "2.0.1",
+        "statuses": "~2.0.1",
         "type-is": "~1.6.18",
         "utils-merge": "1.0.1",
         "vary": "~1.1.2"
       },
       "engines": {
         "node": ">= 0.10.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
       }
     },
-    "node_modules/express/node_modules/array-flatten": {
-      "version": "1.1.1",
-      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
-      "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==",
-      "dev": true
-    },
     "node_modules/express/node_modules/debug": {
       "version": "2.6.9",
       "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
       "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "ms": "2.0.0"
       }
@@ -1467,17 +1474,24 @@
       "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
       "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
     },
-    "node_modules/express/node_modules/encodeurl": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
+    "node_modules/express/node_modules/qs": {
+      "version": "6.14.0",
+      "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.0.tgz",
+      "integrity": "sha512-YWWTjgABSKcvs/nWBi9PycY/JiPJqOD4JA6o9Sej2AtvSGarXxKC3OQSk4pAarbdQlKAh5D4FCQkJNkW+GAn3w==",
       "dev": true,
+      "dependencies": {
+        "side-channel": "^1.1.0"
+      },
       "engines": {
-        "node": ">= 0.8"
+        "node": ">=0.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
       }
     },
     "node_modules/express/node_modules/safe-buffer": {
@@ -1498,13 +1512,15 @@
           "type": "consulting",
           "url": "https://feross.org/support"
         }
-      ]
+      ],
+      "license": "MIT"
     },
     "node_modules/express/node_modules/statuses": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
       "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -1532,16 +1548,10 @@
         "node": ">=8.6.0"
       }
     },
-    "node_modules/fast-json-stable-stringify": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
-      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
-      "dev": true
-    },
     "node_modules/fast-uri": {
-      "version": "3.0.6",
-      "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.0.6.tgz",
-      "integrity": "sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
+      "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
       "dev": true,
       "funding": [
         {
@@ -1552,8 +1562,7 @@
           "type": "opencollective",
           "url": "https://opencollective.com/fastify"
         }
-      ],
-      "license": "BSD-3-Clause"
+      ]
     },
     "node_modules/fastest-levenshtein": {
       "version": "1.0.16",
@@ -1603,6 +1612,7 @@
       "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz",
       "integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "debug": "2.6.9",
         "encodeurl": "~2.0.0",
@@ -1621,24 +1631,17 @@
       "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
       "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "ms": "2.0.0"
       }
     },
-    "node_modules/finalhandler/node_modules/encodeurl": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
-      "dev": true,
-      "engines": {
-        "node": ">= 0.8"
-      }
-    },
     "node_modules/finalhandler/node_modules/statuses": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
       "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -1681,6 +1684,7 @@
       "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
       "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -1690,28 +1694,18 @@
       "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
       "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
     },
-    "node_modules/fs-monkey": {
-      "version": "1.0.4",
-      "resolved": "https://registry.npmjs.org/fs-monkey/-/fs-monkey-1.0.4.tgz",
-      "integrity": "sha512-INM/fWAxMICjttnD0DX1rBvinKskj5G1w+oy/pnm9u/tSlnBrzFonJMcalKJ30P8RRsPzKcCG7Q8l0jx5Fh9YQ==",
-      "dev": true
-    },
-    "node_modules/fs.realpath": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
-      "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
-      "dev": true
-    },
     "node_modules/fsevents": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
       "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
       "dev": true,
       "hasInstallScript": true,
+      "license": "MIT",
       "optional": true,
       "os": [
         "darwin"
@@ -1730,16 +1724,22 @@
       }
     },
     "node_modules/get-intrinsic": {
-      "version": "1.2.4",
-      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz",
-      "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==",
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+      "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "es-define-property": "^1.0.1",
         "es-errors": "^1.3.0",
+        "es-object-atoms": "^1.1.1",
         "function-bind": "^1.1.2",
-        "has-proto": "^1.0.1",
-        "has-symbols": "^1.0.3",
-        "hasown": "^2.0.0"
+        "get-proto": "^1.0.1",
+        "gopd": "^1.2.0",
+        "has-symbols": "^1.1.0",
+        "hasown": "^2.0.2",
+        "math-intrinsics": "^1.1.0"
       },
       "engines": {
         "node": ">= 0.4"
@@ -1748,36 +1748,18 @@
         "url": "https://github.com/sponsors/ljharb"
       }
     },
-    "node_modules/get-stream": {
-      "version": "6.0.1",
-      "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz",
-      "integrity": "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==",
-      "dev": true,
-      "engines": {
-        "node": ">=10"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
-    "node_modules/glob": {
-      "version": "7.2.3",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
-      "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
+    "node_modules/get-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+      "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "fs.realpath": "^1.0.0",
-        "inflight": "^1.0.4",
-        "inherits": "2",
-        "minimatch": "^3.1.1",
-        "once": "^1.3.0",
-        "path-is-absolute": "^1.0.0"
+        "dunder-proto": "^1.0.1",
+        "es-object-atoms": "^1.0.0"
       },
       "engines": {
-        "node": "*"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/isaacs"
+        "node": ">= 0.4"
       }
     },
     "node_modules/glob-parent": {
@@ -1820,12 +1802,13 @@
       }
     },
     "node_modules/gopd": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz",
-      "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+      "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
       "dev": true,
-      "dependencies": {
-        "get-intrinsic": "^1.1.3"
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
       },
       "funding": {
         "url": "https://github.com/sponsors/ljharb"
@@ -1864,35 +1847,12 @@
         "node": ">=8"
       }
     },
-    "node_modules/has-property-descriptors": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz",
-      "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==",
-      "dev": true,
-      "dependencies": {
-        "es-define-property": "^1.0.0"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/ljharb"
-      }
-    },
-    "node_modules/has-proto": {
-      "version": "1.0.3",
-      "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz",
-      "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==",
-      "dev": true,
-      "engines": {
-        "node": ">= 0.4"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/ljharb"
-      }
-    },
     "node_modules/has-symbols": {
-      "version": "1.0.3",
-      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz",
-      "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+      "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.4"
       },
@@ -1905,6 +1865,7 @@
       "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
       "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "function-bind": "^1.1.2"
       },
@@ -1924,22 +1885,6 @@
         "wbuf": "^1.1.0"
       }
     },
-    "node_modules/html-entities": {
-      "version": "2.4.0",
-      "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.4.0.tgz",
-      "integrity": "sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ==",
-      "dev": true,
-      "funding": [
-        {
-          "type": "github",
-          "url": "https://github.com/sponsors/mdevils"
-        },
-        {
-          "type": "patreon",
-          "url": "https://patreon.com/mdevils"
-        }
-      ]
-    },
     "node_modules/http-deceiver": {
       "version": "1.2.7",
       "resolved": "https://registry.npmjs.org/http-deceiver/-/http-deceiver-1.2.7.tgz",
@@ -1951,6 +1896,7 @@
       "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
       "integrity": "sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "depd": "2.0.0",
         "inherits": "2.0.4",
@@ -1967,6 +1913,7 @@
       "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
       "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -1975,13 +1922,15 @@
       "version": "2.0.4",
       "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
       "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
-      "dev": true
+      "dev": true,
+      "license": "ISC"
     },
     "node_modules/http-errors/node_modules/statuses": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
       "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -2031,13 +1980,14 @@
         }
       }
     },
-    "node_modules/human-signals": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz",
-      "integrity": "sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==",
+    "node_modules/hyperdyperid": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/hyperdyperid/-/hyperdyperid-1.2.0.tgz",
+      "integrity": "sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A==",
       "dev": true,
+      "license": "MIT",
       "engines": {
-        "node": ">=10.17.0"
+        "node": ">=10.18"
       }
     },
     "node_modules/iconv-lite": {
@@ -2045,6 +1995,7 @@
       "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
       "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "safer-buffer": ">= 2.1.2 < 3"
       },
@@ -2081,16 +2032,6 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
-    "node_modules/inflight": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
-      "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
-      "dev": true,
-      "dependencies": {
-        "once": "^1.3.0",
-        "wrappy": "1"
-      }
-    },
     "node_modules/inherits": {
       "version": "2.0.3",
       "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
@@ -2120,6 +2061,7 @@
       "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
       "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "binary-extensions": "^2.0.0"
       },
@@ -2140,15 +2082,16 @@
       }
     },
     "node_modules/is-docker": {
-      "version": "2.2.1",
-      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz",
-      "integrity": "sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==",
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz",
+      "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==",
       "dev": true,
+      "license": "MIT",
       "bin": {
         "is-docker": "cli.js"
       },
       "engines": {
-        "node": ">=8"
+        "node": "^12.20.0 || ^14.13.1 || >=16.0.0"
       },
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
@@ -2175,6 +2118,38 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/is-inside-container": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz",
+      "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-docker": "^3.0.0"
+      },
+      "bin": {
+        "is-inside-container": "cli.js"
+      },
+      "engines": {
+        "node": ">=14.16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/is-network-error": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/is-network-error/-/is-network-error-1.1.0.tgz",
+      "integrity": "sha512-tUdRRAnhT+OtCZR/LxZelH/C7QtjtFrTu5tXCA8pl55eTUElUHT+GPYV8MBMBvea/j+NxQqVt3LbWMRir7Gx9g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/is-number": {
       "version": "7.0.0",
       "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
@@ -2208,28 +2183,20 @@
         "node": ">=0.10.0"
       }
     },
-    "node_modules/is-stream": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
-      "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
-      "dev": true,
-      "engines": {
-        "node": ">=8"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
     "node_modules/is-wsl": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz",
-      "integrity": "sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz",
+      "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "is-docker": "^2.0.0"
+        "is-inside-container": "^1.0.0"
       },
       "engines": {
-        "node": ">=8"
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/isarray": {
@@ -2274,9 +2241,9 @@
       "dev": true
     },
     "node_modules/json-schema-traverse": {
-      "version": "0.4.1",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
-      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
       "dev": true
     },
     "node_modules/kind-of": {
@@ -2289,62 +2256,181 @@
       }
     },
     "node_modules/launch-editor": {
-      "version": "2.6.0",
-      "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.6.0.tgz",
-      "integrity": "sha512-JpDCcQnyAAzZZaZ7vEiSqL690w7dAEyLao+KC96zBplnYbJS7TYNjvM3M7y3dGz+v7aIsJk3hllWuc0kWAjyRQ==",
+      "version": "2.10.0",
+      "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.10.0.tgz",
+      "integrity": "sha512-D7dBRJo/qcGX9xlvt/6wUYzQxjh5G1RvZPgPv8vi4KRU99DVQL/oW7tnVOCCTm2HGeo3C5HvGE5Yrh6UBoZ0vA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "picocolors": "^1.0.0",
+        "shell-quote": "^1.8.1"
+      }
+    },
+    "node_modules/loader-runner": {
+      "version": "4.3.1",
+      "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.1.tgz",
+      "integrity": "sha512-IWqP2SCPhyVFTBtRcgMHdzlf9ul25NwaFx4wCEH/KjAXuuHY4yNjvPXsBokp8jCB936PyWRaPKUNh8NvylLp2Q==",
+      "dev": true,
+      "engines": {
+        "node": ">=6.11.5"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/webpack"
+      }
+    },
+    "node_modules/locate-path": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz",
+      "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==",
+      "dev": true,
+      "dependencies": {
+        "p-locate": "^4.1.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/math-intrinsics": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+      "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/media-typer": {
+      "version": "0.3.0",
+      "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
+      "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/memfs": {
+      "version": "4.17.2",
+      "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.17.2.tgz",
+      "integrity": "sha512-NgYhCOWgovOXSzvYgUW0LQ7Qy72rWQMGGFJDoWg4G30RHd3z77VbYdtJ4fembJXBy8pMIUA31XNAupobOQlwdg==",
       "dev": true,
+      "license": "Apache-2.0",
       "dependencies": {
-        "picocolors": "^1.0.0",
-        "shell-quote": "^1.7.3"
+        "@jsonjoy.com/json-pack": "^1.0.3",
+        "@jsonjoy.com/util": "^1.3.0",
+        "tree-dump": "^1.0.1",
+        "tslib": "^2.0.0"
+      },
+      "engines": {
+        "node": ">= 4.0.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/streamich"
       }
     },
-    "node_modules/loader-runner": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.0.tgz",
-      "integrity": "sha512-3R/1M+yS3j5ou80Me59j7F9IMs4PXs3VqRrm0TU3AbKPxlmpoY1TNscJV/oGJXo8qCatFGTfDbY6W6ipGOYXfg==",
+    "node_modules/memfs/node_modules/@jsonjoy.com/base64": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@jsonjoy.com/base64/-/base64-1.1.2.tgz",
+      "integrity": "sha512-q6XAnWQDIMA3+FTiOYajoYqySkO+JSat0ytXGSuRdq9uXE7o92gzuQwQM14xaCRlBLGq3v5miDGC4vkVTn54xA==",
       "dev": true,
+      "license": "Apache-2.0",
       "engines": {
-        "node": ">=6.11.5"
+        "node": ">=10.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/streamich"
+      },
+      "peerDependencies": {
+        "tslib": "2"
       }
     },
-    "node_modules/locate-path": {
-      "version": "5.0.0",
-      "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz",
-      "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==",
+    "node_modules/memfs/node_modules/@jsonjoy.com/json-pack": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@jsonjoy.com/json-pack/-/json-pack-1.2.0.tgz",
+      "integrity": "sha512-io1zEbbYcElht3tdlqEOFxZ0dMTYrHz9iMf0gqn1pPjZFTCgM5R4R5IMA20Chb2UPYYsxjzs8CgZ7Nb5n2K2rA==",
       "dev": true,
+      "license": "Apache-2.0",
       "dependencies": {
-        "p-locate": "^4.1.0"
+        "@jsonjoy.com/base64": "^1.1.1",
+        "@jsonjoy.com/util": "^1.1.2",
+        "hyperdyperid": "^1.2.0",
+        "thingies": "^1.20.0"
       },
       "engines": {
-        "node": ">=8"
+        "node": ">=10.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/streamich"
+      },
+      "peerDependencies": {
+        "tslib": "2"
       }
     },
-    "node_modules/media-typer": {
-      "version": "0.3.0",
-      "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
-      "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==",
+    "node_modules/memfs/node_modules/@jsonjoy.com/util": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@jsonjoy.com/util/-/util-1.6.0.tgz",
+      "integrity": "sha512-sw/RMbehRhN68WRtcKCpQOPfnH6lLP4GJfqzi3iYej8tnzpZUDr6UkZYJjcjjC0FWEJOJbyM3PTIwxucUmDG2A==",
       "dev": true,
+      "license": "Apache-2.0",
       "engines": {
-        "node": ">= 0.6"
+        "node": ">=10.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/streamich"
+      },
+      "peerDependencies": {
+        "tslib": "2"
       }
     },
-    "node_modules/memfs": {
-      "version": "3.5.3",
-      "resolved": "https://registry.npmjs.org/memfs/-/memfs-3.5.3.tgz",
-      "integrity": "sha512-UERzLsxzllchadvbPs5aolHh65ISpKpM+ccLbOJ8/vvpBKmAWf+la7dXFy7Mr0ySHbdHrFv5kGFCUHHe6GFEmw==",
+    "node_modules/memfs/node_modules/thingies": {
+      "version": "1.21.0",
+      "resolved": "https://registry.npmjs.org/thingies/-/thingies-1.21.0.tgz",
+      "integrity": "sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g==",
       "dev": true,
-      "dependencies": {
-        "fs-monkey": "^1.0.4"
+      "license": "Unlicense",
+      "engines": {
+        "node": ">=10.18"
       },
+      "peerDependencies": {
+        "tslib": "^2"
+      }
+    },
+    "node_modules/memfs/node_modules/tree-dump": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/tree-dump/-/tree-dump-1.0.3.tgz",
+      "integrity": "sha512-il+Cv80yVHFBwokQSfd4bldvr1Md951DpgAGfmhydt04L+YzHgubm2tQ7zueWDcGENKHq0ZvGFR/hjvNXilHEg==",
+      "dev": true,
+      "license": "Apache-2.0",
       "engines": {
-        "node": ">= 4.0.0"
+        "node": ">=10.0"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/streamich"
+      },
+      "peerDependencies": {
+        "tslib": "2"
       }
     },
+    "node_modules/memfs/node_modules/tslib": {
+      "version": "2.8.1",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+      "dev": true,
+      "license": "0BSD"
+    },
     "node_modules/merge-descriptors": {
       "version": "1.0.3",
       "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz",
       "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==",
       "dev": true,
+      "license": "MIT",
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
       }
@@ -2370,6 +2456,7 @@
       "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
       "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -2393,6 +2480,7 @@
       "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
       "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==",
       "dev": true,
+      "license": "MIT",
       "bin": {
         "mime": "cli.js"
       },
@@ -2421,33 +2509,12 @@
         "node": ">= 0.6"
       }
     },
-    "node_modules/mimic-fn": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz",
-      "integrity": "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==",
-      "dev": true,
-      "engines": {
-        "node": ">=6"
-      }
-    },
     "node_modules/minimalistic-assert": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz",
       "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==",
       "dev": true
     },
-    "node_modules/minimatch": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
-      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
-      "dev": true,
-      "dependencies": {
-        "brace-expansion": "^1.1.7"
-      },
-      "engines": {
-        "node": "*"
-      }
-    },
     "node_modules/ms": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
@@ -2459,6 +2526,7 @@
       "resolved": "https://registry.npmjs.org/multicast-dns/-/multicast-dns-7.2.5.tgz",
       "integrity": "sha512-2eznPJP8z2BFLX50tf0LuODrpINqP1RVIm/CObbTcBRITQgmC/TjcREF1NeTBzIcR5XO/ukWo+YHOjBbFwIupg==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "dns-packet": "^5.2.2",
         "thunky": "^1.0.2"
@@ -2483,18 +2551,18 @@
       "dev": true
     },
     "node_modules/node-forge": {
-      "version": "1.3.1",
-      "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-1.3.1.tgz",
-      "integrity": "sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==",
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-1.3.2.tgz",
+      "integrity": "sha512-6xKiQ+cph9KImrRh0VsjH2d8/GXA4FIMlgU4B757iI1ApvcyA9VlouP0yZJha01V+huImO+kKMU7ih+2+E14fw==",
       "dev": true,
       "engines": {
         "node": ">= 6.13.0"
       }
     },
     "node_modules/node-releases": {
-      "version": "2.0.13",
-      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.13.tgz",
-      "integrity": "sha512-uYr7J37ae/ORWdZeQ1xxMJe3NtdmqMC/JZK+geofDrkLUApKRHPd18/TxtBOJ4A0/+uUIliorNrfYV6s1b02eQ==",
+      "version": "2.0.27",
+      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
+      "integrity": "sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==",
       "dev": true
     },
     "node_modules/normalize-path": {
@@ -2506,23 +2574,12 @@
         "node": ">=0.10.0"
       }
     },
-    "node_modules/npm-run-path": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
-      "integrity": "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==",
-      "dev": true,
-      "dependencies": {
-        "path-key": "^3.0.0"
-      },
-      "engines": {
-        "node": ">=8"
-      }
-    },
     "node_modules/object-inspect": {
-      "version": "1.13.2",
-      "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz",
-      "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==",
+      "version": "1.13.4",
+      "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
+      "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.4"
       },
@@ -2541,6 +2598,7 @@
       "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz",
       "integrity": "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "ee-first": "1.1.1"
       },
@@ -2549,50 +2607,28 @@
       }
     },
     "node_modules/on-headers": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.0.2.tgz",
-      "integrity": "sha512-pZAE+FJLoyITytdqK0U5s+FIpjN0JP3OzFi/u8Rx+EV5/W+JTWGXG8xFzevE7AjBfDqHv/8vL8qQsIhHnqRkrA==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz",
+      "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==",
       "dev": true,
       "engines": {
         "node": ">= 0.8"
       }
     },
-    "node_modules/once": {
-      "version": "1.4.0",
-      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
-      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-      "dev": true,
-      "dependencies": {
-        "wrappy": "1"
-      }
-    },
-    "node_modules/onetime": {
-      "version": "5.1.2",
-      "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
-      "integrity": "sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==",
-      "dev": true,
-      "dependencies": {
-        "mimic-fn": "^2.1.0"
-      },
-      "engines": {
-        "node": ">=6"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
     "node_modules/open": {
-      "version": "8.4.2",
-      "resolved": "https://registry.npmjs.org/open/-/open-8.4.2.tgz",
-      "integrity": "sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==",
+      "version": "10.1.2",
+      "resolved": "https://registry.npmjs.org/open/-/open-10.1.2.tgz",
+      "integrity": "sha512-cxN6aIDPz6rm8hbebcP7vrQNhvRcveZoJU72Y7vskh4oIm+BZwBECnx5nTmrlres1Qapvx27Qo1Auukpf8PKXw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "define-lazy-prop": "^2.0.0",
-        "is-docker": "^2.1.1",
-        "is-wsl": "^2.2.0"
+        "default-browser": "^5.2.1",
+        "define-lazy-prop": "^3.0.0",
+        "is-inside-container": "^1.0.0",
+        "is-wsl": "^3.1.0"
       },
       "engines": {
-        "node": ">=12"
+        "node": ">=18"
       },
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
@@ -2626,16 +2662,21 @@
       }
     },
     "node_modules/p-retry": {
-      "version": "4.6.2",
-      "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz",
-      "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==",
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-6.2.1.tgz",
+      "integrity": "sha512-hEt02O4hUct5wtwg4H4KcWgDdm+l1bOaEy/hWzd8xtXB9BqxTWBBhb+2ImAtH4Cv4rPjV76xN3Zumqk3k3AhhQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "@types/retry": "0.12.0",
+        "@types/retry": "0.12.2",
+        "is-network-error": "^1.0.0",
         "retry": "^0.13.1"
       },
       "engines": {
-        "node": ">=8"
+        "node": ">=16.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/p-try": {
@@ -2665,15 +2706,6 @@
         "node": ">=8"
       }
     },
-    "node_modules/path-is-absolute": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
-      "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
-      "dev": true,
-      "engines": {
-        "node": ">=0.10.0"
-      }
-    },
     "node_modules/path-key": {
       "version": "3.1.1",
       "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
@@ -2690,10 +2722,11 @@
       "dev": true
     },
     "node_modules/path-to-regexp": {
-      "version": "0.1.10",
-      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz",
-      "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==",
-      "dev": true
+      "version": "0.1.12",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
+      "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==",
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/path-type": {
       "version": "6.0.0",
@@ -2709,9 +2742,9 @@
       }
     },
     "node_modules/picocolors": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz",
-      "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==",
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
+      "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
       "dev": true
     },
     "node_modules/picomatch": {
@@ -2749,6 +2782,7 @@
       "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz",
       "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "forwarded": "0.2.0",
         "ipaddr.js": "1.9.1"
@@ -2762,24 +2796,17 @@
       "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
       "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.10"
       }
     },
-    "node_modules/punycode": {
-      "version": "2.3.0",
-      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.0.tgz",
-      "integrity": "sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==",
-      "dev": true,
-      "engines": {
-        "node": ">=6"
-      }
-    },
     "node_modules/qs": {
       "version": "6.13.0",
       "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
       "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==",
       "dev": true,
+      "license": "BSD-3-Clause",
       "dependencies": {
         "side-channel": "^1.0.6"
       },
@@ -2825,6 +2852,7 @@
       "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
       "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.6"
       }
@@ -2834,6 +2862,7 @@
       "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz",
       "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "bytes": "3.1.2",
         "http-errors": "2.0.0",
@@ -2844,15 +2873,6 @@
         "node": ">= 0.8"
       }
     },
-    "node_modules/raw-body/node_modules/bytes": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
-      "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
-      "dev": true,
-      "engines": {
-        "node": ">= 0.8"
-      }
-    },
     "node_modules/readable-stream": {
       "version": "2.3.6",
       "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
@@ -2873,6 +2893,7 @@
       "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
       "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "picomatch": "^2.2.1"
       },
@@ -2950,6 +2971,7 @@
       "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
       "integrity": "sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 4"
       }
@@ -2965,19 +2987,17 @@
         "node": ">=0.10.0"
       }
     },
-    "node_modules/rimraf": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
-      "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==",
+    "node_modules/run-applescript": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.0.0.tgz",
+      "integrity": "sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==",
       "dev": true,
-      "dependencies": {
-        "glob": "^7.1.3"
-      },
-      "bin": {
-        "rimraf": "bin.js"
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
       },
       "funding": {
-        "url": "https://github.com/sponsors/isaacs"
+        "url": "https://github.com/sponsors/sindresorhus"
       }
     },
     "node_modules/run-parallel": {
@@ -3014,17 +3034,19 @@
       "version": "2.1.2",
       "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
       "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/schema-utils": {
-      "version": "3.3.0",
-      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz",
-      "integrity": "sha512-pN/yOAvcC+5rQ5nERGuwrjLlYvLTbCibnZ1I7B1LaiAz9BRBlE9GMgE/eqV30P7aJQUf7Ddimy/RsbYO/GrVGg==",
+      "version": "4.3.3",
+      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.3.tgz",
+      "integrity": "sha512-eflK8wEtyOE6+hsaRVPxvUKYCpRgzLqDTb8krvAsRIwOGlHoSgYLgBXoubGgLd2fT41/OUYdb48v4k4WWHQurA==",
       "dev": true,
       "dependencies": {
-        "@types/json-schema": "^7.0.8",
-        "ajv": "^6.12.5",
-        "ajv-keywords": "^3.5.2"
+        "@types/json-schema": "^7.0.9",
+        "ajv": "^8.9.0",
+        "ajv-formats": "^2.1.1",
+        "ajv-keywords": "^5.1.0"
       },
       "engines": {
         "node": ">= 10.13.0"
@@ -3041,11 +3063,13 @@
       "dev": true
     },
     "node_modules/selfsigned": {
-      "version": "2.1.1",
-      "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-2.1.1.tgz",
-      "integrity": "sha512-GSL3aowiF7wa/WtSFwnUrludWFoNhftq8bUkH9pkzjpN2XSPOAYEgg6e0sS9s0rZwgJzJiQRPU18A6clnoW5wQ==",
+      "version": "2.4.1",
+      "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-2.4.1.tgz",
+      "integrity": "sha512-th5B4L2U+eGLq1TVh7zNRGBapioSORUeymIydxgFpwww9d2qyKvtuPU2jJuHvYAwwqi2Y596QBL3eEqcPEYL8Q==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
+        "@types/node-forge": "^1.3.0",
         "node-forge": "^1"
       },
       "engines": {
@@ -3057,6 +3081,7 @@
       "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz",
       "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "debug": "2.6.9",
         "depd": "2.0.0",
@@ -3081,6 +3106,7 @@
       "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
       "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "ms": "2.0.0"
       }
@@ -3089,13 +3115,25 @@
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
       "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/send/node_modules/depd": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
       "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
       "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/send/node_modules/encodeurl": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
+      "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
+      "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -3104,13 +3142,15 @@
       "version": "2.1.3",
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
       "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/send/node_modules/statuses": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
       "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
@@ -3178,6 +3218,7 @@
       "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz",
       "integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "encodeurl": "~2.0.0",
         "escape-html": "~1.0.3",
@@ -3188,37 +3229,12 @@
         "node": ">= 0.8.0"
       }
     },
-    "node_modules/serve-static/node_modules/encodeurl": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
-      "dev": true,
-      "engines": {
-        "node": ">= 0.8"
-      }
-    },
-    "node_modules/set-function-length": {
-      "version": "1.2.2",
-      "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
-      "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==",
-      "dev": true,
-      "dependencies": {
-        "define-data-property": "^1.1.4",
-        "es-errors": "^1.3.0",
-        "function-bind": "^1.1.2",
-        "get-intrinsic": "^1.2.4",
-        "gopd": "^1.0.1",
-        "has-property-descriptors": "^1.0.2"
-      },
-      "engines": {
-        "node": ">= 0.4"
-      }
-    },
     "node_modules/setprototypeof": {
       "version": "1.2.0",
       "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz",
       "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==",
-      "dev": true
+      "dev": true,
+      "license": "ISC"
     },
     "node_modules/shallow-clone": {
       "version": "3.0.1",
@@ -3254,24 +3270,47 @@
       }
     },
     "node_modules/shell-quote": {
-      "version": "1.8.1",
-      "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.1.tgz",
-      "integrity": "sha512-6j1W9l1iAs/4xYBI1SYOVZyFcCis9b4KCLQ8fgAGG07QvzaRLVVRQvAy85yNmmZSjYjg4MWh4gNvlPujU/5LpA==",
+      "version": "1.8.3",
+      "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.3.tgz",
+      "integrity": "sha512-ObmnIF4hXNg1BqhnHmgbDETF8dLPCggZWBjkQfhZpbszZnYur5DUljTcCHii5LC3J5E0yeO/1LIMyH+UvHQgyw==",
       "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
       "funding": {
         "url": "https://github.com/sponsors/ljharb"
       }
     },
     "node_modules/side-channel": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz",
-      "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz",
+      "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "object-inspect": "^1.13.3",
+        "side-channel-list": "^1.0.0",
+        "side-channel-map": "^1.0.1",
+        "side-channel-weakmap": "^1.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/side-channel-list": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz",
+      "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "call-bind": "^1.0.7",
         "es-errors": "^1.3.0",
-        "get-intrinsic": "^1.2.4",
-        "object-inspect": "^1.13.1"
+        "object-inspect": "^1.13.3"
       },
       "engines": {
         "node": ">= 0.4"
@@ -3280,11 +3319,44 @@
         "url": "https://github.com/sponsors/ljharb"
       }
     },
-    "node_modules/signal-exit": {
-      "version": "3.0.7",
-      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
-      "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
-      "dev": true
+    "node_modules/side-channel-map": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz",
+      "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/side-channel-weakmap": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz",
+      "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3",
+        "side-channel-map": "^1.0.1"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
     },
     "node_modules/slash": {
       "version": "5.1.0",
@@ -3391,15 +3463,6 @@
         "safe-buffer": "~5.1.0"
       }
     },
-    "node_modules/strip-final-newline": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz",
-      "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==",
-      "dev": true,
-      "engines": {
-        "node": ">=6"
-      }
-    },
     "node_modules/supports-color": {
       "version": "8.1.1",
       "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
@@ -3428,22 +3491,26 @@
       }
     },
     "node_modules/tapable": {
-      "version": "2.2.1",
-      "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.1.tgz",
-      "integrity": "sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==",
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.0.tgz",
+      "integrity": "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==",
       "dev": true,
       "engines": {
         "node": ">=6"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/webpack"
       }
     },
     "node_modules/terser": {
-      "version": "5.31.6",
-      "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.6.tgz",
-      "integrity": "sha512-PQ4DAriWzKj+qgehQ7LK5bQqCFNMmlhjR2PFFLuqGCpuCAauxemVBWwWOxo3UIwWQx8+Pr61Df++r76wDmkQBg==",
+      "version": "5.46.0",
+      "resolved": "https://registry.npmjs.org/terser/-/terser-5.46.0.tgz",
+      "integrity": "sha512-jTwoImyr/QbOWFFso3YoU3ik0jBBDJ6JTOQiy/J2YxVJdZCc+5u7skhNwiOR3FQIygFqVUPHl7qbbxtjW2K3Qg==",
       "dev": true,
       "dependencies": {
         "@jridgewell/source-map": "^0.3.3",
-        "acorn": "^8.8.2",
+        "acorn": "^8.15.0",
         "commander": "^2.20.0",
         "source-map-support": "~0.5.20"
       },
@@ -3455,16 +3522,16 @@
       }
     },
     "node_modules/terser-webpack-plugin": {
-      "version": "5.3.10",
-      "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.10.tgz",
-      "integrity": "sha512-BKFPWlPDndPs+NGGCr1U59t0XScL5317Y0UReNrHaw9/FwhPENlq6bfgs+4yPfyP51vqC1bQ4rp1EfXW5ZSH9w==",
+      "version": "5.3.16",
+      "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.16.tgz",
+      "integrity": "sha512-h9oBFCWrq78NyWWVcSwZarJkZ01c2AyGrzs1crmHZO3QUg9D61Wu4NPjBy69n7JqylFF5y+CsUZYmYEIZ3mR+Q==",
       "dev": true,
       "dependencies": {
-        "@jridgewell/trace-mapping": "^0.3.20",
+        "@jridgewell/trace-mapping": "^0.3.25",
         "jest-worker": "^27.4.5",
-        "schema-utils": "^3.1.1",
-        "serialize-javascript": "^6.0.1",
-        "terser": "^5.26.0"
+        "schema-utils": "^4.3.0",
+        "serialize-javascript": "^6.0.2",
+        "terser": "^5.31.1"
       },
       "engines": {
         "node": ">= 10.13.0"
@@ -3492,7 +3559,8 @@
       "version": "1.1.0",
       "resolved": "https://registry.npmjs.org/thunky/-/thunky-1.1.0.tgz",
       "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==",
-      "dev": true
+      "dev": true,
+      "license": "MIT"
     },
     "node_modules/to-regex-range": {
       "version": "5.0.1",
@@ -3511,6 +3579,7 @@
       "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz",
       "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">=0.6"
       }
@@ -3526,6 +3595,7 @@
       "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
       "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "media-typer": "0.3.0",
         "mime-types": "~2.1.24"
@@ -3552,14 +3622,15 @@
       "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
       "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.8"
       }
     },
     "node_modules/update-browserslist-db": {
-      "version": "1.0.13",
-      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.13.tgz",
-      "integrity": "sha512-xebP81SNcPuNpPP3uzeW1NYXxI3rxyJzF3pD6sH4jE7o/IX+WtSpwnVU+qIsDPyk0d3hmFQ7mjqc6AtV604hbg==",
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz",
+      "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==",
       "dev": true,
       "funding": [
         {
@@ -3576,8 +3647,8 @@
         }
       ],
       "dependencies": {
-        "escalade": "^3.1.1",
-        "picocolors": "^1.0.0"
+        "escalade": "^3.2.0",
+        "picocolors": "^1.1.1"
       },
       "bin": {
         "update-browserslist-db": "cli.js"
@@ -3586,15 +3657,6 @@
         "browserslist": ">= 4.21.0"
       }
     },
-    "node_modules/uri-js": {
-      "version": "4.4.1",
-      "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
-      "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
-      "dev": true,
-      "dependencies": {
-        "punycode": "^2.1.0"
-      }
-    },
     "node_modules/util-deprecate": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
@@ -3606,6 +3668,7 @@
       "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
       "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">= 0.4.0"
       }
@@ -3629,9 +3692,9 @@
       }
     },
     "node_modules/watchpack": {
-      "version": "2.4.2",
-      "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.2.tgz",
-      "integrity": "sha512-TnbFSbcOCcDgjZ4piURLCbJ3nJhznVh9kw6F6iokjiFPl8ONxe9A6nMDVXDiNbrSfLILs6vB07F7wLBrwPYzJw==",
+      "version": "2.5.1",
+      "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
+      "integrity": "sha512-Zn5uXdcFNIA1+1Ei5McRd+iRzfhENPCe7LeABkJtNulSxjma+l7ltNx55BWZkRlwRnpOgHqxnjyaDgJnNXnqzg==",
       "dev": true,
       "dependencies": {
         "glob-to-regexp": "^0.4.1",
@@ -3651,34 +3714,36 @@
       }
     },
     "node_modules/webpack": {
-      "version": "5.94.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.94.0.tgz",
-      "integrity": "sha512-KcsGn50VT+06JH/iunZJedYGUJS5FGjow8wb9c0v5n1Om8O1g4L6LjtfxwlXIATopoQu+vOXXa7gYisWxCoPyg==",
-      "dev": true,
-      "dependencies": {
-        "@types/estree": "^1.0.5",
-        "@webassemblyjs/ast": "^1.12.1",
-        "@webassemblyjs/wasm-edit": "^1.12.1",
-        "@webassemblyjs/wasm-parser": "^1.12.1",
-        "acorn": "^8.7.1",
-        "acorn-import-attributes": "^1.9.5",
-        "browserslist": "^4.21.10",
+      "version": "5.105.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.105.0.tgz",
+      "integrity": "sha512-gX/dMkRQc7QOMzgTe6KsYFM7DxeIONQSui1s0n/0xht36HvrgbxtM1xBlgx596NbpHuQU8P7QpKwrZYwUX48nw==",
+      "dev": true,
+      "dependencies": {
+        "@types/eslint-scope": "^3.7.7",
+        "@types/estree": "^1.0.8",
+        "@types/json-schema": "^7.0.15",
+        "@webassemblyjs/ast": "^1.14.1",
+        "@webassemblyjs/wasm-edit": "^1.14.1",
+        "@webassemblyjs/wasm-parser": "^1.14.1",
+        "acorn": "^8.15.0",
+        "acorn-import-phases": "^1.0.3",
+        "browserslist": "^4.28.1",
         "chrome-trace-event": "^1.0.2",
-        "enhanced-resolve": "^5.17.1",
-        "es-module-lexer": "^1.2.1",
+        "enhanced-resolve": "^5.19.0",
+        "es-module-lexer": "^2.0.0",
         "eslint-scope": "5.1.1",
         "events": "^3.2.0",
         "glob-to-regexp": "^0.4.1",
         "graceful-fs": "^4.2.11",
         "json-parse-even-better-errors": "^2.3.1",
-        "loader-runner": "^4.2.0",
+        "loader-runner": "^4.3.1",
         "mime-types": "^2.1.27",
         "neo-async": "^2.6.2",
-        "schema-utils": "^3.2.0",
-        "tapable": "^2.1.1",
-        "terser-webpack-plugin": "^5.3.10",
-        "watchpack": "^2.4.1",
-        "webpack-sources": "^3.2.3"
+        "schema-utils": "^4.3.3",
+        "tapable": "^2.3.0",
+        "terser-webpack-plugin": "^5.3.16",
+        "watchpack": "^2.5.1",
+        "webpack-sources": "^3.3.3"
       },
       "bin": {
         "webpack": "bin/webpack.js"
@@ -3751,130 +3816,83 @@
       }
     },
     "node_modules/webpack-dev-middleware": {
-      "version": "5.3.4",
-      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz",
-      "integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==",
+      "version": "7.4.2",
+      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-7.4.2.tgz",
+      "integrity": "sha512-xOO8n6eggxnwYpy1NlzUKpvrjfJTvae5/D6WOK0S2LSo7vjmo5gCM1DbLUmFqrMTJP+W/0YZNctm7jasWvLuBA==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
         "colorette": "^2.0.10",
-        "memfs": "^3.4.3",
+        "memfs": "^4.6.0",
         "mime-types": "^2.1.31",
+        "on-finished": "^2.4.1",
         "range-parser": "^1.2.1",
         "schema-utils": "^4.0.0"
       },
       "engines": {
-        "node": ">= 12.13.0"
+        "node": ">= 18.12.0"
       },
       "funding": {
         "type": "opencollective",
         "url": "https://opencollective.com/webpack"
       },
       "peerDependencies": {
-        "webpack": "^4.0.0 || ^5.0.0"
-      }
-    },
-    "node_modules/webpack-dev-middleware/node_modules/ajv": {
-      "version": "8.12.0",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-      "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
-      "dev": true,
-      "dependencies": {
-        "fast-deep-equal": "^3.1.1",
-        "json-schema-traverse": "^1.0.0",
-        "require-from-string": "^2.0.2",
-        "uri-js": "^4.2.2"
-      },
-      "funding": {
-        "type": "github",
-        "url": "https://github.com/sponsors/epoberezkin"
-      }
-    },
-    "node_modules/webpack-dev-middleware/node_modules/ajv-keywords": {
-      "version": "5.1.0",
-      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-      "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-      "dev": true,
-      "dependencies": {
-        "fast-deep-equal": "^3.1.3"
+        "webpack": "^5.0.0"
       },
-      "peerDependencies": {
-        "ajv": "^8.8.2"
+      "peerDependenciesMeta": {
+        "webpack": {
+          "optional": true
+        }
       }
     },
-    "node_modules/webpack-dev-middleware/node_modules/json-schema-traverse": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-      "dev": true
-    },
-    "node_modules/webpack-dev-middleware/node_modules/schema-utils": {
-      "version": "4.2.0",
-      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.2.0.tgz",
-      "integrity": "sha512-L0jRsrPpjdckP3oPug3/VxNKt2trR8TcabrM6FOAAlvC/9Phcmm+cuAgTlxBqdBR1WJx7Naj9WHw+aOmheSVbw==",
+    "node_modules/webpack-dev-server": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-5.2.1.tgz",
+      "integrity": "sha512-ml/0HIj9NLpVKOMq+SuBPLHcmbG+TGIjXRHsYfZwocUBIqEvws8NnS/V9AFQ5FKP+tgn5adwVwRrTEpGL33QFQ==",
       "dev": true,
+      "license": "MIT",
       "dependencies": {
-        "@types/json-schema": "^7.0.9",
-        "ajv": "^8.9.0",
-        "ajv-formats": "^2.1.1",
-        "ajv-keywords": "^5.1.0"
-      },
-      "engines": {
-        "node": ">= 12.13.0"
-      },
-      "funding": {
-        "type": "opencollective",
-        "url": "https://opencollective.com/webpack"
-      }
-    },
-    "node_modules/webpack-dev-server": {
-      "version": "4.15.1",
-      "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-4.15.1.tgz",
-      "integrity": "sha512-5hbAst3h3C3L8w6W4P96L5vaV0PxSmJhxZvWKYIdgxOQm8pNZ5dEOmmSLBVpP85ReeyRt6AS1QJNyo/oFFPeVA==",
-      "dev": true,
-      "dependencies": {
-        "@types/bonjour": "^3.5.9",
-        "@types/connect-history-api-fallback": "^1.3.5",
-        "@types/express": "^4.17.13",
-        "@types/serve-index": "^1.9.1",
-        "@types/serve-static": "^1.13.10",
-        "@types/sockjs": "^0.3.33",
-        "@types/ws": "^8.5.5",
+        "@types/bonjour": "^3.5.13",
+        "@types/connect-history-api-fallback": "^1.5.4",
+        "@types/express": "^4.17.21",
+        "@types/express-serve-static-core": "^4.17.21",
+        "@types/serve-index": "^1.9.4",
+        "@types/serve-static": "^1.15.5",
+        "@types/sockjs": "^0.3.36",
+        "@types/ws": "^8.5.10",
         "ansi-html-community": "^0.0.8",
-        "bonjour-service": "^1.0.11",
-        "chokidar": "^3.5.3",
+        "bonjour-service": "^1.2.1",
+        "chokidar": "^3.6.0",
         "colorette": "^2.0.10",
         "compression": "^1.7.4",
         "connect-history-api-fallback": "^2.0.0",
-        "default-gateway": "^6.0.3",
-        "express": "^4.17.3",
+        "express": "^4.21.2",
         "graceful-fs": "^4.2.6",
-        "html-entities": "^2.3.2",
-        "http-proxy-middleware": "^2.0.3",
-        "ipaddr.js": "^2.0.1",
-        "launch-editor": "^2.6.0",
-        "open": "^8.0.9",
-        "p-retry": "^4.5.0",
-        "rimraf": "^3.0.2",
-        "schema-utils": "^4.0.0",
-        "selfsigned": "^2.1.1",
+        "http-proxy-middleware": "^2.0.7",
+        "ipaddr.js": "^2.1.0",
+        "launch-editor": "^2.6.1",
+        "open": "^10.0.3",
+        "p-retry": "^6.2.0",
+        "schema-utils": "^4.2.0",
+        "selfsigned": "^2.4.1",
         "serve-index": "^1.9.1",
         "sockjs": "^0.3.24",
         "spdy": "^4.0.2",
-        "webpack-dev-middleware": "^5.3.1",
-        "ws": "^8.13.0"
+        "webpack-dev-middleware": "^7.4.2",
+        "ws": "^8.18.0"
       },
       "bin": {
         "webpack-dev-server": "bin/webpack-dev-server.js"
       },
       "engines": {
-        "node": ">= 12.13.0"
+        "node": ">= 18.12.0"
       },
       "funding": {
         "type": "opencollective",
         "url": "https://opencollective.com/webpack"
       },
       "peerDependencies": {
-        "webpack": "^4.37.0 || ^5.0.0"
+        "webpack": "^5.0.0"
       },
       "peerDependenciesMeta": {
         "webpack": {
@@ -3885,59 +3903,6 @@
         }
       }
     },
-    "node_modules/webpack-dev-server/node_modules/ajv": {
-      "version": "8.12.0",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-      "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
-      "dev": true,
-      "dependencies": {
-        "fast-deep-equal": "^3.1.1",
-        "json-schema-traverse": "^1.0.0",
-        "require-from-string": "^2.0.2",
-        "uri-js": "^4.2.2"
-      },
-      "funding": {
-        "type": "github",
-        "url": "https://github.com/sponsors/epoberezkin"
-      }
-    },
-    "node_modules/webpack-dev-server/node_modules/ajv-keywords": {
-      "version": "5.1.0",
-      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-      "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-      "dev": true,
-      "dependencies": {
-        "fast-deep-equal": "^3.1.3"
-      },
-      "peerDependencies": {
-        "ajv": "^8.8.2"
-      }
-    },
-    "node_modules/webpack-dev-server/node_modules/json-schema-traverse": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-      "dev": true
-    },
-    "node_modules/webpack-dev-server/node_modules/schema-utils": {
-      "version": "4.2.0",
-      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.2.0.tgz",
-      "integrity": "sha512-L0jRsrPpjdckP3oPug3/VxNKt2trR8TcabrM6FOAAlvC/9Phcmm+cuAgTlxBqdBR1WJx7Naj9WHw+aOmheSVbw==",
-      "dev": true,
-      "dependencies": {
-        "@types/json-schema": "^7.0.9",
-        "ajv": "^8.9.0",
-        "ajv-formats": "^2.1.1",
-        "ajv-keywords": "^5.1.0"
-      },
-      "engines": {
-        "node": ">= 12.13.0"
-      },
-      "funding": {
-        "type": "opencollective",
-        "url": "https://opencollective.com/webpack"
-      }
-    },
     "node_modules/webpack-merge": {
       "version": "5.9.0",
       "resolved": "https://registry.npmjs.org/webpack-merge/-/webpack-merge-5.9.0.tgz",
@@ -3951,10 +3916,10 @@
         "node": ">=10.0.0"
       }
     },
-    "node_modules/webpack/node_modules/webpack-sources": {
-      "version": "3.2.3",
-      "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.2.3.tgz",
-      "integrity": "sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==",
+    "node_modules/webpack-sources": {
+      "version": "3.3.3",
+      "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.3.3.tgz",
+      "integrity": "sha512-yd1RBzSGanHkitROoPFd6qsrxt+oFhg/129YzheDGqeustzX0vTZJZsSsQjVQC4yzBQ56K55XU8gaNCtIzOnTg==",
       "dev": true,
       "engines": {
         "node": ">=10.13.0"
@@ -4004,17 +3969,12 @@
       "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==",
       "dev": true
     },
-    "node_modules/wrappy": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
-      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-      "dev": true
-    },
     "node_modules/ws": {
-      "version": "8.17.1",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz",
-      "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==",
+      "version": "8.18.2",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.2.tgz",
+      "integrity": "sha512-DMricUmwGZUVr++AEAe2uiVM7UoO9MAVZMDu05UQOaUII0lp+zOzLLU4Xqh/JvTqklB1T4uELaaPBKyjE1r4fQ==",
       "dev": true,
+      "license": "MIT",
       "engines": {
         "node": ">=10.0.0"
       },
@@ -4040,13 +4000,12 @@
       "dev": true
     },
     "@jridgewell/gen-mapping": {
-      "version": "0.3.5",
-      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz",
-      "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==",
+      "version": "0.3.13",
+      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz",
+      "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==",
       "dev": true,
       "requires": {
-        "@jridgewell/set-array": "^1.2.1",
-        "@jridgewell/sourcemap-codec": "^1.4.10",
+        "@jridgewell/sourcemap-codec": "^1.5.0",
         "@jridgewell/trace-mapping": "^0.3.24"
       }
     },
@@ -4056,16 +4015,10 @@
       "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
       "dev": true
     },
-    "@jridgewell/set-array": {
-      "version": "1.2.1",
-      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
-      "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
-      "dev": true
-    },
     "@jridgewell/source-map": {
-      "version": "0.3.6",
-      "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz",
-      "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==",
+      "version": "0.3.11",
+      "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.11.tgz",
+      "integrity": "sha512-ZMp1V8ZFcPG5dIWnQLr3NSI1MiCU7UETdS/A0G8V/XWHvJv3ZsFqutJn1Y5RPmAPX6F3BiE397OqveU/9NCuIA==",
       "dev": true,
       "requires": {
         "@jridgewell/gen-mapping": "^0.3.5",
@@ -4073,15 +4026,15 @@
       }
     },
     "@jridgewell/sourcemap-codec": {
-      "version": "1.5.0",
-      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
-      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
+      "version": "1.5.5",
+      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
+      "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==",
       "dev": true
     },
     "@jridgewell/trace-mapping": {
-      "version": "0.3.25",
-      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
-      "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
+      "version": "0.3.31",
+      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz",
+      "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==",
       "dev": true,
       "requires": {
         "@jridgewell/resolve-uri": "^3.1.0",
@@ -4089,9 +4042,9 @@
       }
     },
     "@leichtgewicht/ip-codec": {
-      "version": "2.0.4",
-      "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.4.tgz",
-      "integrity": "sha512-Hcv+nVC0kZnQ3tD9GVu5xSMR4VVYOteQIr/hwFPVEvPdlXqgGEuRjiheChHgdM+JyqdgNcmzZOX/tnl0JOiI7A==",
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.5.tgz",
+      "integrity": "sha512-Vo+PSpZG2/fmgmiNzYK9qWRh8h/CHrwD0mo1h1DzL4yzHNSfWYujGTYsWGreD000gcgmZ7K4Ys6Tx9TxtsKdDw==",
       "dev": true
     },
     "@nodelib/fs.scandir": {
@@ -4137,9 +4090,9 @@
       }
     },
     "@types/bonjour": {
-      "version": "3.5.11",
-      "resolved": "https://registry.npmjs.org/@types/bonjour/-/bonjour-3.5.11.tgz",
-      "integrity": "sha512-isGhjmBtLIxdHBDl2xGwUzEM8AOyOvWsADWq7rqirdi/ZQoHnLWErHvsThcEzTX8juDRiZtzp2Qkv5bgNh6mAg==",
+      "version": "3.5.13",
+      "resolved": "https://registry.npmjs.org/@types/bonjour/-/bonjour-3.5.13.tgz",
+      "integrity": "sha512-z9fJ5Im06zvUL548KvYNecEVlA7cVDkGUi6kZusb04mpyEFKCIZJvloCcmpmLaIahDpOQGHaHmG6imtPMmPXGQ==",
       "dev": true,
       "requires": {
         "@types/node": "*"
@@ -4155,25 +4108,45 @@
       }
     },
     "@types/connect-history-api-fallback": {
-      "version": "1.5.1",
-      "resolved": "https://registry.npmjs.org/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.5.1.tgz",
-      "integrity": "sha512-iaQslNbARe8fctL5Lk+DsmgWOM83lM+7FzP0eQUJs1jd3kBE8NWqBTIT2S8SqQOJjxvt2eyIjpOuYeRXq2AdMw==",
+      "version": "1.5.4",
+      "resolved": "https://registry.npmjs.org/@types/connect-history-api-fallback/-/connect-history-api-fallback-1.5.4.tgz",
+      "integrity": "sha512-n6Cr2xS1h4uAulPRdlw6Jl6s1oG8KrVilPN2yUITEs+K48EzMJJ3W1xy8K5eWuFvjp3R74AOIGSmp2UfBJ8HFw==",
       "dev": true,
       "requires": {
         "@types/express-serve-static-core": "*",
         "@types/node": "*"
       }
     },
+    "@types/eslint": {
+      "version": "9.6.1",
+      "resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-9.6.1.tgz",
+      "integrity": "sha512-FXx2pKgId/WyYo2jXw63kk7/+TY7u7AziEJxJAnSFzHlqTAS3Ync6SvgYAN/k4/PQpnnVuzoMuVnByKK2qp0ag==",
+      "dev": true,
+      "requires": {
+        "@types/estree": "*",
+        "@types/json-schema": "*"
+      }
+    },
+    "@types/eslint-scope": {
+      "version": "3.7.7",
+      "resolved": "https://registry.npmjs.org/@types/eslint-scope/-/eslint-scope-3.7.7.tgz",
+      "integrity": "sha512-MzMFlSLBqNF2gcHWO0G1vP/YQyfvrxZ0bF+u7mzUdZ1/xK4A4sru+nraZz5i3iEIk1l1uyicaDVTB4QbbEkAYg==",
+      "dev": true,
+      "requires": {
+        "@types/eslint": "*",
+        "@types/estree": "*"
+      }
+    },
     "@types/estree": {
-      "version": "1.0.5",
-      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz",
-      "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==",
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
+      "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==",
       "dev": true
     },
     "@types/express": {
-      "version": "4.17.17",
-      "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.17.tgz",
-      "integrity": "sha512-Q4FmmuLGBG58btUnfS1c1r/NQdlp3DMfGDGig8WhfpA2YRUtEkxAjkZb0yvplJGYdF1fsQ81iMDcH24sSCNC/Q==",
+      "version": "4.17.22",
+      "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.22.tgz",
+      "integrity": "sha512-eZUmSnhRX9YRSkplpz0N+k6NljUUn5l3EWZIKZvYzhvMphEuNiyyy1viH/ejgt66JWgALwC/gtSUAeQKtSwW/w==",
       "dev": true,
       "requires": {
         "@types/body-parser": "*",
@@ -4210,9 +4183,9 @@
       }
     },
     "@types/json-schema": {
-      "version": "7.0.13",
-      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.13.tgz",
-      "integrity": "sha512-RbSSoHliUbnXj3ny0CNFOoxrIDV6SUGyStHsvDqosw6CkdPV8TtWGlfecuK4ToyMEAql6pzNxgCFKanovUzlgQ==",
+      "version": "7.0.15",
+      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
+      "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==",
       "dev": true
     },
     "@types/mime": {
@@ -4227,6 +4200,15 @@
       "integrity": "sha512-HksnYH4Ljr4VQgEy2lTStbCKv/P590tmPe5HqOnv9Gprffgv5WXAY+Y5Gqniu0GGqeTCUdBnzC3QSrzPkBkAMA==",
       "dev": true
     },
+    "@types/node-forge": {
+      "version": "1.3.11",
+      "resolved": "https://registry.npmjs.org/@types/node-forge/-/node-forge-1.3.11.tgz",
+      "integrity": "sha512-FQx220y22OKNTqaByeBGqHWYz4cl94tpcxeFdvBo3wjG6XPBuZ0BNgNZRV5J5TFmmcsJ4IzsLkmGRiQbnYsBEQ==",
+      "dev": true,
+      "requires": {
+        "@types/node": "*"
+      }
+    },
     "@types/qs": {
       "version": "6.9.8",
       "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.8.tgz",
@@ -4240,9 +4222,9 @@
       "dev": true
     },
     "@types/retry": {
-      "version": "0.12.0",
-      "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz",
-      "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==",
+      "version": "0.12.2",
+      "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.2.tgz",
+      "integrity": "sha512-XISRgDJ2Tc5q4TRqvgJtzsRkFYNJzZrhTdtMoGVBttwzzQJkPnS3WWTFc7kuDRoPtPakl+T+OfdEUjYJj7Jbow==",
       "dev": true
     },
     "@types/send": {
@@ -4256,186 +4238,186 @@
       }
     },
     "@types/serve-index": {
-      "version": "1.9.1",
-      "resolved": "https://registry.npmjs.org/@types/serve-index/-/serve-index-1.9.1.tgz",
-      "integrity": "sha512-d/Hs3nWDxNL2xAczmOVZNj92YZCS6RGxfBPjKzuu/XirCgXdpKEb88dYNbrYGint6IVWLNP+yonwVAuRC0T2Dg==",
+      "version": "1.9.4",
+      "resolved": "https://registry.npmjs.org/@types/serve-index/-/serve-index-1.9.4.tgz",
+      "integrity": "sha512-qLpGZ/c2fhSs5gnYsQxtDEq3Oy8SXPClIXkW5ghvAvsNuVSA8k+gCONcUCS/UjLEYvYps+e8uBtfgXgvhwfNug==",
       "dev": true,
       "requires": {
         "@types/express": "*"
       }
     },
     "@types/serve-static": {
-      "version": "1.15.2",
-      "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.2.tgz",
-      "integrity": "sha512-J2LqtvFYCzaj8pVYKw8klQXrLLk7TBZmQ4ShlcdkELFKGwGMfevMLneMMRkMgZxotOD9wg497LpC7O8PcvAmfw==",
+      "version": "1.15.7",
+      "resolved": "https://registry.npmjs.org/@types/serve-static/-/serve-static-1.15.7.tgz",
+      "integrity": "sha512-W8Ym+h8nhuRwaKPaDw34QUkwsGi6Rc4yYqvKFo5rm2FUEhCFbzVWrxXUxuKK8TASjWsysJY0nsmNCGhCOIsrOw==",
       "dev": true,
       "requires": {
         "@types/http-errors": "*",
-        "@types/mime": "*",
-        "@types/node": "*"
+        "@types/node": "*",
+        "@types/send": "*"
       }
     },
     "@types/sockjs": {
-      "version": "0.3.33",
-      "resolved": "https://registry.npmjs.org/@types/sockjs/-/sockjs-0.3.33.tgz",
-      "integrity": "sha512-f0KEEe05NvUnat+boPTZ0dgaLZ4SfSouXUgv5noUiefG2ajgKjmETo9ZJyuqsl7dfl2aHlLJUiki6B4ZYldiiw==",
+      "version": "0.3.36",
+      "resolved": "https://registry.npmjs.org/@types/sockjs/-/sockjs-0.3.36.tgz",
+      "integrity": "sha512-MK9V6NzAS1+Ud7JV9lJLFqW85VbC9dq3LmwZCuBe4wBDgKC0Kj/jd8Xl+nSviU+Qc3+m7umHHyHg//2KSa0a0Q==",
       "dev": true,
       "requires": {
         "@types/node": "*"
       }
     },
     "@types/ws": {
-      "version": "8.5.5",
-      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.5.5.tgz",
-      "integrity": "sha512-lwhs8hktwxSjf9UaZ9tG5M03PGogvFaH8gUgLNbN9HKIg0dvv6q+gkSuJ8HN4/VbyxkuLzCjlN7GquQ0gUJfIg==",
+      "version": "8.18.1",
+      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
+      "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
       "dev": true,
       "requires": {
         "@types/node": "*"
       }
     },
     "@webassemblyjs/ast": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz",
-      "integrity": "sha512-EKfMUOPRRUTy5UII4qJDGPpqfwjOmZ5jeGFwid9mnoqIFK+e0vqoi1qH56JpmZSzEL53jKnNzScdmftJyG5xWg==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.14.1.tgz",
+      "integrity": "sha512-nuBEDgQfm1ccRp/8bCQrx1frohyufl4JlbMMZ4P1wpeOfDhF6FQkxZJ1b/e+PLwr6X1Nhw6OLme5usuBWYBvuQ==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/helper-numbers": "1.11.6",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6"
+        "@webassemblyjs/helper-numbers": "1.13.2",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2"
       }
     },
     "@webassemblyjs/floating-point-hex-parser": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.6.tgz",
-      "integrity": "sha512-ejAj9hfRJ2XMsNHk/v6Fu2dGS+i4UaXBXGemOfQ/JfQ6mdQg/WXtwleQRLLS4OvfDhv8rYnVwH27YJLMyYsxhw==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.13.2.tgz",
+      "integrity": "sha512-6oXyTOzbKxGH4steLbLNOu71Oj+C8Lg34n6CqRvqfS2O71BxY6ByfMDRhBytzknj9yGUPVJ1qIKhRlAwO1AovA==",
       "dev": true
     },
     "@webassemblyjs/helper-api-error": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.6.tgz",
-      "integrity": "sha512-o0YkoP4pVu4rN8aTJgAyj9hC2Sv5UlkzCHhxqWj8butaLvnpdc2jOwh4ewE6CX0txSfLn/UYaV/pheS2Txg//Q==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.13.2.tgz",
+      "integrity": "sha512-U56GMYxy4ZQCbDZd6JuvvNV/WFildOjsaWD3Tzzvmw/mas3cXzRJPMjP83JqEsgSbyrmaGjBfDtV7KDXV9UzFQ==",
       "dev": true
     },
     "@webassemblyjs/helper-buffer": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.12.1.tgz",
-      "integrity": "sha512-nzJwQw99DNDKr9BVCOZcLuJJUlqkJh+kVzVl6Fmq/tI5ZtEyWT1KZMyOXltXLZJmDtvLCDgwsyrkohEtopTXCw==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.14.1.tgz",
+      "integrity": "sha512-jyH7wtcHiKssDtFPRB+iQdxlDf96m0E39yb0k5uJVhFGleZFoNw1c4aeIcVUPPbXUVJ94wwnMOAqUHyzoEPVMA==",
       "dev": true
     },
     "@webassemblyjs/helper-numbers": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.6.tgz",
-      "integrity": "sha512-vUIhZ8LZoIWHBohiEObxVm6hwP034jwmc9kuq5GdHZH0wiLVLIPcMCdpJzG4C11cHoQ25TFIQj9kaVADVX7N3g==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.13.2.tgz",
+      "integrity": "sha512-FE8aCmS5Q6eQYcV3gI35O4J789wlQA+7JrqTTpJqn5emA4U2hvwJmvFRC0HODS+3Ye6WioDklgd6scJ3+PLnEA==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/floating-point-hex-parser": "1.11.6",
-        "@webassemblyjs/helper-api-error": "1.11.6",
+        "@webassemblyjs/floating-point-hex-parser": "1.13.2",
+        "@webassemblyjs/helper-api-error": "1.13.2",
         "@xtuc/long": "4.2.2"
       }
     },
     "@webassemblyjs/helper-wasm-bytecode": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.6.tgz",
-      "integrity": "sha512-sFFHKwcmBprO9e7Icf0+gddyWYDViL8bpPjJJl0WHxCdETktXdmtWLGVzoHbqUcY4Be1LkNfwTmXOJUFZYSJdA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.13.2.tgz",
+      "integrity": "sha512-3QbLKy93F0EAIXLh0ogEVR6rOubA9AoZ+WRYhNbFyuB70j3dRdwH9g+qXhLAO0kiYGlg3TxDV+I4rQTr/YNXkA==",
       "dev": true
     },
     "@webassemblyjs/helper-wasm-section": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.12.1.tgz",
-      "integrity": "sha512-Jif4vfB6FJlUlSbgEMHUyk1j234GTNG9dBJ4XJdOySoj518Xj0oGsNi59cUQF4RRMS9ouBUxDDdyBVfPTypa5g==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.14.1.tgz",
+      "integrity": "sha512-ds5mXEqTJ6oxRoqjhWDU83OgzAYjwsCV8Lo/N+oRsNDmx/ZDpqalmrtgOMkHwxsG0iI//3BwWAErYRHtgn0dZw==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/wasm-gen": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/wasm-gen": "1.14.1"
       }
     },
     "@webassemblyjs/ieee754": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.11.6.tgz",
-      "integrity": "sha512-LM4p2csPNvbij6U1f19v6WR56QZ8JcHg3QIJTlSwzFcmx6WSORicYj6I63f9yU1kEUtrpG+kjkiIAkevHpDXrg==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.13.2.tgz",
+      "integrity": "sha512-4LtOzh58S/5lX4ITKxnAK2USuNEvpdVV9AlgGQb8rJDHaLeHciwG4zlGr0j/SNWlr7x3vO1lDEsuePvtcDNCkw==",
       "dev": true,
       "requires": {
         "@xtuc/ieee754": "^1.2.0"
       }
     },
     "@webassemblyjs/leb128": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.11.6.tgz",
-      "integrity": "sha512-m7a0FhE67DQXgouf1tbN5XQcdWoNgaAuoULHIfGFIEVKA6tu/edls6XnIlkmS6FrXAquJRPni3ZZKjw6FSPjPQ==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.13.2.tgz",
+      "integrity": "sha512-Lde1oNoIdzVzdkNEAWZ1dZ5orIbff80YPdHx20mrHwHrVNNTjNr8E3xz9BdpcGqRQbAEa+fkrCb+fRFTl/6sQw==",
       "dev": true,
       "requires": {
         "@xtuc/long": "4.2.2"
       }
     },
     "@webassemblyjs/utf8": {
-      "version": "1.11.6",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.11.6.tgz",
-      "integrity": "sha512-vtXf2wTQ3+up9Zsg8sa2yWiQpzSsMyXj0qViVP6xKGCUT8p8YJ6HqI7l5eCnWx1T/FYdsv07HQs2wTFbbof/RA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.13.2.tgz",
+      "integrity": "sha512-3NQWGjKTASY1xV5m7Hr0iPeXD9+RDobLll3T9d2AO+g3my8xy5peVyjSag4I50mR1bBSN/Ct12lo+R9tJk0NZQ==",
       "dev": true
     },
     "@webassemblyjs/wasm-edit": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.12.1.tgz",
-      "integrity": "sha512-1DuwbVvADvS5mGnXbE+c9NfA8QRcZ6iKquqjjmR10k6o+zzsRVesil54DKexiowcFCPdr/Q0qaMgB01+SQ1u6g==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.14.1.tgz",
+      "integrity": "sha512-RNJUIQH/J8iA/1NzlE4N7KtyZNHi3w7at7hDjvRNm5rcUXa00z1vRz3glZoULfJ5mpvYhLybmVcwcjGrC1pRrQ==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/helper-wasm-section": "1.12.1",
-        "@webassemblyjs/wasm-gen": "1.12.1",
-        "@webassemblyjs/wasm-opt": "1.12.1",
-        "@webassemblyjs/wasm-parser": "1.12.1",
-        "@webassemblyjs/wast-printer": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/helper-wasm-section": "1.14.1",
+        "@webassemblyjs/wasm-gen": "1.14.1",
+        "@webassemblyjs/wasm-opt": "1.14.1",
+        "@webassemblyjs/wasm-parser": "1.14.1",
+        "@webassemblyjs/wast-printer": "1.14.1"
       }
     },
     "@webassemblyjs/wasm-gen": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.12.1.tgz",
-      "integrity": "sha512-TDq4Ojh9fcohAw6OIMXqiIcTq5KUXTGRkVxbSo1hQnSy6lAM5GSdfwWeSxpAo0YzgsgF182E/U0mDNhuA0tW7w==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.14.1.tgz",
+      "integrity": "sha512-AmomSIjP8ZbfGQhumkNvgC33AY7qtMCXnN6bL2u2Js4gVCg8fp735aEiMSBbDR7UQIj90n4wKAFUSEd0QN2Ukg==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/ieee754": "1.11.6",
-        "@webassemblyjs/leb128": "1.11.6",
-        "@webassemblyjs/utf8": "1.11.6"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/ieee754": "1.13.2",
+        "@webassemblyjs/leb128": "1.13.2",
+        "@webassemblyjs/utf8": "1.13.2"
       }
     },
     "@webassemblyjs/wasm-opt": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.12.1.tgz",
-      "integrity": "sha512-Jg99j/2gG2iaz3hijw857AVYekZe2SAskcqlWIZXjji5WStnOpVoat3gQfT/Q5tb2djnCjBtMocY/Su1GfxPBg==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.14.1.tgz",
+      "integrity": "sha512-PTcKLUNvBqnY2U6E5bdOQcSM+oVP/PmrDY9NzowJjislEjwP/C4an2303MCVS2Mg9d3AJpIGdUFIQQWbPds0Sw==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-buffer": "1.12.1",
-        "@webassemblyjs/wasm-gen": "1.12.1",
-        "@webassemblyjs/wasm-parser": "1.12.1"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-buffer": "1.14.1",
+        "@webassemblyjs/wasm-gen": "1.14.1",
+        "@webassemblyjs/wasm-parser": "1.14.1"
       }
     },
     "@webassemblyjs/wasm-parser": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.12.1.tgz",
-      "integrity": "sha512-xikIi7c2FHXysxXe3COrVUPSheuBtpcfhbpFj4gmu7KRLYOzANztwUU0IbsqvMqzuNK2+glRGWCEqZo1WCLyAQ==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.14.1.tgz",
+      "integrity": "sha512-JLBl+KZ0R5qB7mCnud/yyX08jWFw5MsoalJ1pQ4EdFlgj9VdXKGuENGsiCIjegI1W7p91rUlcB/LB5yRJKNTcQ==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
-        "@webassemblyjs/helper-api-error": "1.11.6",
-        "@webassemblyjs/helper-wasm-bytecode": "1.11.6",
-        "@webassemblyjs/ieee754": "1.11.6",
-        "@webassemblyjs/leb128": "1.11.6",
-        "@webassemblyjs/utf8": "1.11.6"
+        "@webassemblyjs/ast": "1.14.1",
+        "@webassemblyjs/helper-api-error": "1.13.2",
+        "@webassemblyjs/helper-wasm-bytecode": "1.13.2",
+        "@webassemblyjs/ieee754": "1.13.2",
+        "@webassemblyjs/leb128": "1.13.2",
+        "@webassemblyjs/utf8": "1.13.2"
       }
     },
     "@webassemblyjs/wast-printer": {
-      "version": "1.12.1",
-      "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.12.1.tgz",
-      "integrity": "sha512-+X4WAlOisVWQMikjbcvY2e0rwPsKQ9F688lksZhBcPycBBuii3O7m8FACbDMWDojpAqvjIncrG8J0XHKyQfVeA==",
+      "version": "1.14.1",
+      "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.14.1.tgz",
+      "integrity": "sha512-kPSSXE6De1XOR820C90RIo2ogvZG+c3KiHzqUoO/F34Y2shGzesfqv7o57xrxovZJH/MetF5UjroJ/R/3isoiw==",
       "dev": true,
       "requires": {
-        "@webassemblyjs/ast": "1.12.1",
+        "@webassemblyjs/ast": "1.14.1",
         "@xtuc/long": "4.2.2"
       }
     },
@@ -4483,28 +4465,28 @@
       }
     },
     "acorn": {
-      "version": "8.12.1",
-      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.12.1.tgz",
-      "integrity": "sha512-tcpGyI9zbizT9JbV6oYE477V6mTlXvvi0T0G3SNIYE2apm/G5huBa1+K89VGeovbg+jycCrfhl3ADxErOuO6Jg==",
+      "version": "8.15.0",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
+      "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true
     },
-    "acorn-import-attributes": {
-      "version": "1.9.5",
-      "resolved": "https://registry.npmjs.org/acorn-import-attributes/-/acorn-import-attributes-1.9.5.tgz",
-      "integrity": "sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==",
+    "acorn-import-phases": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/acorn-import-phases/-/acorn-import-phases-1.0.4.tgz",
+      "integrity": "sha512-wKmbr/DDiIXzEOiWrTTUcDm24kQ2vGfZQvM2fwg2vXqR5uW6aapr7ObPtj1th32b9u90/Pf4AItvdTh42fBmVQ==",
       "dev": true,
       "requires": {}
     },
     "ajv": {
-      "version": "6.12.6",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
-      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+      "version": "8.17.1",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
+      "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
       "dev": true,
       "requires": {
-        "fast-deep-equal": "^3.1.1",
-        "fast-json-stable-stringify": "^2.0.0",
-        "json-schema-traverse": "^0.4.1",
-        "uri-js": "^4.2.2"
+        "fast-deep-equal": "^3.1.3",
+        "fast-uri": "^3.0.1",
+        "json-schema-traverse": "^1.0.0",
+        "require-from-string": "^2.0.2"
       }
     },
     "ajv-formats": {
@@ -4514,34 +4496,16 @@
       "dev": true,
       "requires": {
         "ajv": "^8.0.0"
-      },
-      "dependencies": {
-        "ajv": {
-          "version": "8.12.0",
-          "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-          "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.1",
-            "json-schema-traverse": "^1.0.0",
-            "require-from-string": "^2.0.2",
-            "uri-js": "^4.2.2"
-          }
-        },
-        "json-schema-traverse": {
-          "version": "1.0.0",
-          "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-          "dev": true
-        }
       }
     },
     "ajv-keywords": {
-      "version": "3.5.2",
-      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz",
-      "integrity": "sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==",
+      "version": "5.1.0",
+      "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
+      "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
       "dev": true,
-      "requires": {}
+      "requires": {
+        "fast-deep-equal": "^3.1.3"
+      }
     },
     "ansi-html-community": {
       "version": "0.0.8",
@@ -4560,15 +4524,15 @@
       }
     },
     "array-flatten": {
-      "version": "2.1.2",
-      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-2.1.2.tgz",
-      "integrity": "sha512-hNfzcOV8W4NdualtqBFPyVO+54DSJuZGY9qT4pRroB6S9e3iiido2ISIC5h9R2sPJ8H3FHCIiEnsv1lPXO3KtQ==",
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
+      "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==",
       "dev": true
     },
-    "balanced-match": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
-      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+    "baseline-browser-mapping": {
+      "version": "2.9.19",
+      "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.19.tgz",
+      "integrity": "sha512-ipDqC8FrAl/76p2SSWKSI+H9tFwm7vYqXQrItCuiVPt26Km0jS+NzSsBWAaBusvSbQcfJG+JitdMm+wZAgTYqg==",
       "dev": true
     },
     "batch": {
@@ -4578,9 +4542,9 @@
       "dev": true
     },
     "binary-extensions": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz",
-      "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==",
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz",
+      "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
       "dev": true
     },
     "body-parser": {
@@ -4603,12 +4567,6 @@
         "unpipe": "1.0.0"
       },
       "dependencies": {
-        "bytes": {
-          "version": "3.1.2",
-          "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
-          "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
-          "dev": true
-        },
         "debug": {
           "version": "2.6.9",
           "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
@@ -4627,27 +4585,15 @@
       }
     },
     "bonjour-service": {
-      "version": "1.1.1",
-      "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.1.1.tgz",
-      "integrity": "sha512-Z/5lQRMOG9k7W+FkeGTNjh7htqn/2LMnfOvBZ8pynNZCM9MwkQkI3zeI4oz09uWdcgmgHugVvBqxGg4VQJ5PCg==",
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/bonjour-service/-/bonjour-service-1.3.0.tgz",
+      "integrity": "sha512-3YuAUiSkWykd+2Azjgyxei8OWf8thdn8AITIog2M4UICzoqfjlqr64WIjEXZllf/W6vK1goqleSR6brGomxQqA==",
       "dev": true,
       "requires": {
-        "array-flatten": "^2.1.2",
-        "dns-equal": "^1.0.0",
         "fast-deep-equal": "^3.1.3",
         "multicast-dns": "^7.2.5"
       }
     },
-    "brace-expansion": {
-      "version": "1.1.11",
-      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
-      "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
-      "dev": true,
-      "requires": {
-        "balanced-match": "^1.0.0",
-        "concat-map": "0.0.1"
-      }
-    },
     "braces": {
       "version": "3.0.3",
       "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
@@ -4658,15 +4604,16 @@
       }
     },
     "browserslist": {
-      "version": "4.21.11",
-      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.21.11.tgz",
-      "integrity": "sha512-xn1UXOKUz7DjdGlg9RrUr0GGiWzI97UQJnugHtH0OLDfJB7jMgoIkYvRIEO1l9EeEERVqeqLYOcFBW9ldjypbQ==",
+      "version": "4.28.1",
+      "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz",
+      "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==",
       "dev": true,
       "requires": {
-        "caniuse-lite": "^1.0.30001538",
-        "electron-to-chromium": "^1.4.526",
-        "node-releases": "^2.0.13",
-        "update-browserslist-db": "^1.0.13"
+        "baseline-browser-mapping": "^2.9.0",
+        "caniuse-lite": "^1.0.30001759",
+        "electron-to-chromium": "^1.5.263",
+        "node-releases": "^2.0.27",
+        "update-browserslist-db": "^1.2.0"
       }
     },
     "buffer-from": {
@@ -4675,35 +4622,51 @@
       "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
       "dev": true
     },
+    "bundle-name": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz",
+      "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==",
+      "dev": true,
+      "requires": {
+        "run-applescript": "^7.0.0"
+      }
+    },
     "bytes": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
-      "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg=",
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
+      "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
       "dev": true
     },
-    "call-bind": {
-      "version": "1.0.7",
-      "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
-      "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==",
+    "call-bind-apply-helpers": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+      "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
       "dev": true,
       "requires": {
-        "es-define-property": "^1.0.0",
         "es-errors": "^1.3.0",
-        "function-bind": "^1.1.2",
-        "get-intrinsic": "^1.2.4",
-        "set-function-length": "^1.2.1"
+        "function-bind": "^1.1.2"
+      }
+    },
+    "call-bound": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz",
+      "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==",
+      "dev": true,
+      "requires": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "get-intrinsic": "^1.3.0"
       }
     },
     "caniuse-lite": {
-      "version": "1.0.30001538",
-      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001538.tgz",
-      "integrity": "sha512-HWJnhnID+0YMtGlzcp3T9drmBJUVDchPJ08tpUGFLs9CYlwWPH2uLgpHn8fND5pCgXVtnGS3H4QR9XLMHVNkHw==",
+      "version": "1.0.30001768",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001768.tgz",
+      "integrity": "sha512-qY3aDRZC5nWPgHUgIB84WL+nySuo19wk0VJpp/XI9T34lrvkyhRvNVOFJOp2kxClQhiFBu+TaUSudf6oa3vkSA==",
       "dev": true
     },
     "chokidar": {
-      "version": "3.5.3",
-      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz",
-      "integrity": "sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==",
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz",
+      "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==",
       "dev": true,
       "requires": {
         "anymatch": "~3.1.2",
@@ -4758,17 +4721,17 @@
       }
     },
     "compression": {
-      "version": "1.7.4",
-      "resolved": "https://registry.npmjs.org/compression/-/compression-1.7.4.tgz",
-      "integrity": "sha512-jaSIDzP9pZVS4ZfQ+TzvtiWhdpFhE2RDHz8QJkpX9SIpLq88VueF5jJw6t+6CUQcAoA6t+x89MLrWAqpfDE8iQ==",
+      "version": "1.8.1",
+      "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz",
+      "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==",
       "dev": true,
       "requires": {
-        "accepts": "~1.3.5",
-        "bytes": "3.0.0",
-        "compressible": "~2.0.16",
+        "bytes": "3.1.2",
+        "compressible": "~2.0.18",
         "debug": "2.6.9",
-        "on-headers": "~1.0.2",
-        "safe-buffer": "5.1.2",
+        "negotiator": "~0.6.4",
+        "on-headers": "~1.1.0",
+        "safe-buffer": "5.2.1",
         "vary": "~1.1.2"
       },
       "dependencies": {
@@ -4780,15 +4743,21 @@
           "requires": {
             "ms": "2.0.0"
           }
+        },
+        "negotiator": {
+          "version": "0.6.4",
+          "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz",
+          "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==",
+          "dev": true
+        },
+        "safe-buffer": {
+          "version": "5.2.1",
+          "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+          "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+          "dev": true
         }
       }
     },
-    "concat-map": {
-      "version": "0.0.1",
-      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
-      "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
-      "dev": true
-    },
     "connect-history-api-fallback": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/connect-history-api-fallback/-/connect-history-api-fallback-2.0.0.tgz",
@@ -4844,27 +4813,6 @@
         "serialize-javascript": "^6.0.2"
       },
       "dependencies": {
-        "ajv": {
-          "version": "8.17.1",
-          "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
-          "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.3",
-            "fast-uri": "^3.0.1",
-            "json-schema-traverse": "^1.0.0",
-            "require-from-string": "^2.0.2"
-          }
-        },
-        "ajv-keywords": {
-          "version": "5.1.0",
-          "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-          "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.3"
-          }
-        },
         "glob-parent": {
           "version": "6.0.2",
           "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz",
@@ -4873,24 +4821,6 @@
           "requires": {
             "is-glob": "^4.0.3"
           }
-        },
-        "json-schema-traverse": {
-          "version": "1.0.0",
-          "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-          "dev": true
-        },
-        "schema-utils": {
-          "version": "4.3.0",
-          "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.0.tgz",
-          "integrity": "sha512-Gf9qqc58SpCA/xdziiHz35F4GNIWYWZrEshUc/G/r5BnLph6xpKuLeoJoQuj5WfBIx/eQLf+hmVPYHaxJu7V2g==",
-          "dev": true,
-          "requires": {
-            "@types/json-schema": "^7.0.9",
-            "ajv": "^8.9.0",
-            "ajv-formats": "^2.1.1",
-            "ajv-keywords": "^5.1.0"
-          }
         }
       }
     },
@@ -4931,30 +4861,26 @@
         }
       }
     },
-    "default-gateway": {
-      "version": "6.0.3",
-      "resolved": "https://registry.npmjs.org/default-gateway/-/default-gateway-6.0.3.tgz",
-      "integrity": "sha512-fwSOJsbbNzZ/CUFpqFBqYfYNLj1NbMPm8MMCIzHjC83iSJRBEGmDUxU+WP661BaBQImeC2yHwXtz+P/O9o+XEg==",
+    "default-browser": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.2.1.tgz",
+      "integrity": "sha512-WY/3TUME0x3KPYdRRxEJJvXRHV4PyPoUsxtZa78lwItwRQRHhd2U9xOscaT/YTf8uCXIAjeJOFBVEh/7FtD8Xg==",
       "dev": true,
       "requires": {
-        "execa": "^5.0.0"
+        "bundle-name": "^4.1.0",
+        "default-browser-id": "^5.0.0"
       }
     },
-    "define-data-property": {
-      "version": "1.1.4",
-      "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
-      "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==",
-      "dev": true,
-      "requires": {
-        "es-define-property": "^1.0.0",
-        "es-errors": "^1.3.0",
-        "gopd": "^1.0.1"
-      }
+    "default-browser-id": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.0.tgz",
+      "integrity": "sha512-A6p/pu/6fyBcA1TRz/GqWYPViplrftcW2gZC9q79ngNCKAeR/X3gcEdXQHl4KNXV+3wgIJ1CPkJQ3IHM6lcsyA==",
+      "dev": true
     },
     "define-lazy-prop": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-2.0.0.tgz",
-      "integrity": "sha512-Ds09qNh8yw3khSjiJjiUInaGX9xlqZDY7JVryGxdxV7NPeuqQfplOpQ66yJFZut3jLa5zOwkXw1g9EI2uKh4Og==",
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz",
+      "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==",
       "dev": true
     },
     "depd": {
@@ -4975,12 +4901,6 @@
       "integrity": "sha512-ZIzRpLJrOj7jjP2miAtgqIfmzbxa4ZOr5jJc601zklsfEx9oTzmmj2nVpIPRpNlRTIh8lc1kyViIY7BWSGNmKw==",
       "dev": true
     },
-    "dns-equal": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/dns-equal/-/dns-equal-1.0.0.tgz",
-      "integrity": "sha512-z+paD6YUQsk+AbGCEM4PrOXSss5gd66QfcVBFTKR/HpFL9jCqikS94HYwKww6fQyO7IxrIIyUu+g0Ka9tUS2Cg==",
-      "dev": true
-    },
     "dns-packet": {
       "version": "5.6.1",
       "resolved": "https://registry.npmjs.org/dns-packet/-/dns-packet-5.6.1.tgz",
@@ -4990,6 +4910,17 @@
         "@leichtgewicht/ip-codec": "^2.0.1"
       }
     },
+    "dunder-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+      "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+      "dev": true,
+      "requires": {
+        "call-bind-apply-helpers": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "gopd": "^1.2.0"
+      }
+    },
     "ee-first": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
@@ -4997,25 +4928,25 @@
       "dev": true
     },
     "electron-to-chromium": {
-      "version": "1.4.528",
-      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.528.tgz",
-      "integrity": "sha512-UdREXMXzLkREF4jA8t89FQjA8WHI6ssP38PMY4/4KhXFQbtImnghh4GkCgrtiZwLKUKVD2iTVXvDVQjfomEQuA==",
+      "version": "1.5.286",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.286.tgz",
+      "integrity": "sha512-9tfDXhJ4RKFNerfjdCcZfufu49vg620741MNs26a9+bhLThdB+plgMeou98CAaHu/WATj2iHOOHTp1hWtABj2A==",
       "dev": true
     },
     "encodeurl": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
-      "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
+      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
       "dev": true
     },
     "enhanced-resolve": {
-      "version": "5.17.1",
-      "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.17.1.tgz",
-      "integrity": "sha512-LMHl3dXhTcfv8gM4kEzIUeTQ+7fpdA0l2tUf34BddXPkz2A5xJ5L/Pchd5BL6rdccM9QGvu0sWZzK1Z1t4wwyg==",
+      "version": "5.19.0",
+      "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.19.0.tgz",
+      "integrity": "sha512-phv3E1Xl4tQOShqSte26C7Fl84EwUdZsyOuSSk9qtAGyyQs2s3jJzComh+Abf4g187lUUAvH+H26omrqia2aGg==",
       "dev": true,
       "requires": {
         "graceful-fs": "^4.2.4",
-        "tapable": "^2.2.0"
+        "tapable": "^2.3.0"
       }
     },
     "envinfo": {
@@ -5024,14 +4955,11 @@
       "integrity": "sha512-ZtUjZO6l5mwTHvc1L9+1q5p/R3wTopcfqMW8r5t8SJSKqeVI/LtajORwRFEKpEFuekjD0VBjwu1HMxL4UalIRw==",
       "dev": true
     },
-    "es-define-property": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz",
-      "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==",
-      "dev": true,
-      "requires": {
-        "get-intrinsic": "^1.2.4"
-      }
+    "es-define-property": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+      "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
+      "dev": true
     },
     "es-errors": {
       "version": "1.3.0",
@@ -5040,15 +4968,24 @@
       "dev": true
     },
     "es-module-lexer": {
-      "version": "1.3.1",
-      "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.3.1.tgz",
-      "integrity": "sha512-JUFAyicQV9mXc3YRxPnDlrfBKpqt6hUYzz9/boprUJHs4e4KVr3XwOF70doO6gwXUor6EWZJAyWAfKki84t20Q==",
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.0.0.tgz",
+      "integrity": "sha512-5POEcUuZybH7IdmGsD8wlf0AI55wMecM9rVBTI/qEAy2c1kTOm3DjFYjrBdI2K3BaJjJYfYFeRtM0t9ssnRuxw==",
       "dev": true
     },
+    "es-object-atoms": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+      "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+      "dev": true,
+      "requires": {
+        "es-errors": "^1.3.0"
+      }
+    },
     "escalade": {
-      "version": "3.1.1",
-      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
-      "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==",
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
       "dev": true
     },
     "escape-html": {
@@ -5108,68 +5045,45 @@
       "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==",
       "dev": true
     },
-    "execa": {
-      "version": "5.1.1",
-      "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz",
-      "integrity": "sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==",
-      "dev": true,
-      "requires": {
-        "cross-spawn": "^7.0.3",
-        "get-stream": "^6.0.0",
-        "human-signals": "^2.1.0",
-        "is-stream": "^2.0.0",
-        "merge-stream": "^2.0.0",
-        "npm-run-path": "^4.0.1",
-        "onetime": "^5.1.2",
-        "signal-exit": "^3.0.3",
-        "strip-final-newline": "^2.0.0"
-      }
-    },
     "express": {
-      "version": "4.21.1",
-      "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz",
-      "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==",
+      "version": "4.22.1",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz",
+      "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==",
       "dev": true,
       "requires": {
         "accepts": "~1.3.8",
         "array-flatten": "1.1.1",
-        "body-parser": "1.20.3",
-        "content-disposition": "0.5.4",
+        "body-parser": "~1.20.3",
+        "content-disposition": "~0.5.4",
         "content-type": "~1.0.4",
-        "cookie": "0.7.1",
-        "cookie-signature": "1.0.6",
+        "cookie": "~0.7.1",
+        "cookie-signature": "~1.0.6",
         "debug": "2.6.9",
         "depd": "2.0.0",
         "encodeurl": "~2.0.0",
         "escape-html": "~1.0.3",
         "etag": "~1.8.1",
-        "finalhandler": "1.3.1",
-        "fresh": "0.5.2",
-        "http-errors": "2.0.0",
+        "finalhandler": "~1.3.1",
+        "fresh": "~0.5.2",
+        "http-errors": "~2.0.0",
         "merge-descriptors": "1.0.3",
         "methods": "~1.1.2",
-        "on-finished": "2.4.1",
+        "on-finished": "~2.4.1",
         "parseurl": "~1.3.3",
-        "path-to-regexp": "0.1.10",
+        "path-to-regexp": "~0.1.12",
         "proxy-addr": "~2.0.7",
-        "qs": "6.13.0",
+        "qs": "~6.14.0",
         "range-parser": "~1.2.1",
         "safe-buffer": "5.2.1",
-        "send": "0.19.0",
-        "serve-static": "1.16.2",
+        "send": "~0.19.0",
+        "serve-static": "~1.16.2",
         "setprototypeof": "1.2.0",
-        "statuses": "2.0.1",
+        "statuses": "~2.0.1",
         "type-is": "~1.6.18",
         "utils-merge": "1.0.1",
         "vary": "~1.1.2"
       },
       "dependencies": {
-        "array-flatten": {
-          "version": "1.1.1",
-          "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
-          "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==",
-          "dev": true
-        },
         "debug": {
           "version": "2.6.9",
           "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
@@ -5185,11 +5099,14 @@
           "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
           "dev": true
         },
-        "encodeurl": {
-          "version": "2.0.0",
-          "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-          "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
-          "dev": true
+        "qs": {
+          "version": "6.14.0",
+          "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.0.tgz",
+          "integrity": "sha512-YWWTjgABSKcvs/nWBi9PycY/JiPJqOD4JA6o9Sej2AtvSGarXxKC3OQSk4pAarbdQlKAh5D4FCQkJNkW+GAn3w==",
+          "dev": true,
+          "requires": {
+            "side-channel": "^1.1.0"
+          }
         },
         "safe-buffer": {
           "version": "5.2.1",
@@ -5224,16 +5141,10 @@
         "micromatch": "^4.0.8"
       }
     },
-    "fast-json-stable-stringify": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
-      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
-      "dev": true
-    },
     "fast-uri": {
-      "version": "3.0.6",
-      "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.0.6.tgz",
-      "integrity": "sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
+      "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
       "dev": true
     },
     "fastest-levenshtein": {
@@ -5293,12 +5204,6 @@
             "ms": "2.0.0"
           }
         },
-        "encodeurl": {
-          "version": "2.0.0",
-          "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-          "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
-          "dev": true
-        },
         "statuses": {
           "version": "2.0.1",
           "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
@@ -5335,18 +5240,6 @@
       "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==",
       "dev": true
     },
-    "fs-monkey": {
-      "version": "1.0.4",
-      "resolved": "https://registry.npmjs.org/fs-monkey/-/fs-monkey-1.0.4.tgz",
-      "integrity": "sha512-INM/fWAxMICjttnD0DX1rBvinKskj5G1w+oy/pnm9u/tSlnBrzFonJMcalKJ30P8RRsPzKcCG7Q8l0jx5Fh9YQ==",
-      "dev": true
-    },
-    "fs.realpath": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
-      "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
-      "dev": true
-    },
     "fsevents": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
@@ -5361,36 +5254,31 @@
       "dev": true
     },
     "get-intrinsic": {
-      "version": "1.2.4",
-      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz",
-      "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==",
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+      "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
       "dev": true,
       "requires": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "es-define-property": "^1.0.1",
         "es-errors": "^1.3.0",
+        "es-object-atoms": "^1.1.1",
         "function-bind": "^1.1.2",
-        "has-proto": "^1.0.1",
-        "has-symbols": "^1.0.3",
-        "hasown": "^2.0.0"
+        "get-proto": "^1.0.1",
+        "gopd": "^1.2.0",
+        "has-symbols": "^1.1.0",
+        "hasown": "^2.0.2",
+        "math-intrinsics": "^1.1.0"
       }
     },
-    "get-stream": {
-      "version": "6.0.1",
-      "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz",
-      "integrity": "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==",
-      "dev": true
-    },
-    "glob": {
-      "version": "7.2.3",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
-      "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
+    "get-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+      "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
       "dev": true,
       "requires": {
-        "fs.realpath": "^1.0.0",
-        "inflight": "^1.0.4",
-        "inherits": "2",
-        "minimatch": "^3.1.1",
-        "once": "^1.3.0",
-        "path-is-absolute": "^1.0.0"
+        "dunder-proto": "^1.0.1",
+        "es-object-atoms": "^1.0.0"
       }
     },
     "glob-parent": {
@@ -5423,13 +5311,10 @@
       }
     },
     "gopd": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz",
-      "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==",
-      "dev": true,
-      "requires": {
-        "get-intrinsic": "^1.1.3"
-      }
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+      "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
+      "dev": true
     },
     "graceful-fs": {
       "version": "4.2.11",
@@ -5458,25 +5343,10 @@
       "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
       "dev": true
     },
-    "has-property-descriptors": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz",
-      "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==",
-      "dev": true,
-      "requires": {
-        "es-define-property": "^1.0.0"
-      }
-    },
-    "has-proto": {
-      "version": "1.0.3",
-      "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz",
-      "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==",
-      "dev": true
-    },
     "has-symbols": {
-      "version": "1.0.3",
-      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz",
-      "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+      "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
       "dev": true
     },
     "hasown": {
@@ -5500,12 +5370,6 @@
         "wbuf": "^1.1.0"
       }
     },
-    "html-entities": {
-      "version": "2.4.0",
-      "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.4.0.tgz",
-      "integrity": "sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ==",
-      "dev": true
-    },
     "http-deceiver": {
       "version": "1.2.7",
       "resolved": "https://registry.npmjs.org/http-deceiver/-/http-deceiver-1.2.7.tgz",
@@ -5575,10 +5439,10 @@
         "micromatch": "^4.0.2"
       }
     },
-    "human-signals": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz",
-      "integrity": "sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==",
+    "hyperdyperid": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/hyperdyperid/-/hyperdyperid-1.2.0.tgz",
+      "integrity": "sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A==",
       "dev": true
     },
     "iconv-lite": {
@@ -5606,16 +5470,6 @@
         "resolve-cwd": "^3.0.0"
       }
     },
-    "inflight": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
-      "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
-      "dev": true,
-      "requires": {
-        "once": "^1.3.0",
-        "wrappy": "1"
-      }
-    },
     "inherits": {
       "version": "2.0.3",
       "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
@@ -5653,9 +5507,9 @@
       }
     },
     "is-docker": {
-      "version": "2.2.1",
-      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz",
-      "integrity": "sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==",
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz",
+      "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==",
       "dev": true
     },
     "is-extglob": {
@@ -5673,6 +5527,21 @@
         "is-extglob": "^2.1.1"
       }
     },
+    "is-inside-container": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz",
+      "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==",
+      "dev": true,
+      "requires": {
+        "is-docker": "^3.0.0"
+      }
+    },
+    "is-network-error": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/is-network-error/-/is-network-error-1.1.0.tgz",
+      "integrity": "sha512-tUdRRAnhT+OtCZR/LxZelH/C7QtjtFrTu5tXCA8pl55eTUElUHT+GPYV8MBMBvea/j+NxQqVt3LbWMRir7Gx9g==",
+      "dev": true
+    },
     "is-number": {
       "version": "7.0.0",
       "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
@@ -5694,19 +5563,13 @@
         "isobject": "^3.0.1"
       }
     },
-    "is-stream": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
-      "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
-      "dev": true
-    },
     "is-wsl": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz",
-      "integrity": "sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==",
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz",
+      "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==",
       "dev": true,
       "requires": {
-        "is-docker": "^2.0.0"
+        "is-inside-container": "^1.0.0"
       }
     },
     "isarray": {
@@ -5745,9 +5608,9 @@
       "dev": true
     },
     "json-schema-traverse": {
-      "version": "0.4.1",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
-      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
       "dev": true
     },
     "kind-of": {
@@ -5757,19 +5620,19 @@
       "dev": true
     },
     "launch-editor": {
-      "version": "2.6.0",
-      "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.6.0.tgz",
-      "integrity": "sha512-JpDCcQnyAAzZZaZ7vEiSqL690w7dAEyLao+KC96zBplnYbJS7TYNjvM3M7y3dGz+v7aIsJk3hllWuc0kWAjyRQ==",
+      "version": "2.10.0",
+      "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.10.0.tgz",
+      "integrity": "sha512-D7dBRJo/qcGX9xlvt/6wUYzQxjh5G1RvZPgPv8vi4KRU99DVQL/oW7tnVOCCTm2HGeo3C5HvGE5Yrh6UBoZ0vA==",
       "dev": true,
       "requires": {
         "picocolors": "^1.0.0",
-        "shell-quote": "^1.7.3"
+        "shell-quote": "^1.8.1"
       }
     },
     "loader-runner": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.0.tgz",
-      "integrity": "sha512-3R/1M+yS3j5ou80Me59j7F9IMs4PXs3VqRrm0TU3AbKPxlmpoY1TNscJV/oGJXo8qCatFGTfDbY6W6ipGOYXfg==",
+      "version": "4.3.1",
+      "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.1.tgz",
+      "integrity": "sha512-IWqP2SCPhyVFTBtRcgMHdzlf9ul25NwaFx4wCEH/KjAXuuHY4yNjvPXsBokp8jCB936PyWRaPKUNh8NvylLp2Q==",
       "dev": true
     },
     "locate-path": {
@@ -5781,6 +5644,12 @@
         "p-locate": "^4.1.0"
       }
     },
+    "math-intrinsics": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+      "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+      "dev": true
+    },
     "media-typer": {
       "version": "0.3.0",
       "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
@@ -5788,12 +5657,63 @@
       "dev": true
     },
     "memfs": {
-      "version": "3.5.3",
-      "resolved": "https://registry.npmjs.org/memfs/-/memfs-3.5.3.tgz",
-      "integrity": "sha512-UERzLsxzllchadvbPs5aolHh65ISpKpM+ccLbOJ8/vvpBKmAWf+la7dXFy7Mr0ySHbdHrFv5kGFCUHHe6GFEmw==",
+      "version": "4.17.2",
+      "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.17.2.tgz",
+      "integrity": "sha512-NgYhCOWgovOXSzvYgUW0LQ7Qy72rWQMGGFJDoWg4G30RHd3z77VbYdtJ4fembJXBy8pMIUA31XNAupobOQlwdg==",
       "dev": true,
       "requires": {
-        "fs-monkey": "^1.0.4"
+        "@jsonjoy.com/json-pack": "^1.0.3",
+        "@jsonjoy.com/util": "^1.3.0",
+        "tree-dump": "^1.0.1",
+        "tslib": "^2.0.0"
+      },
+      "dependencies": {
+        "@jsonjoy.com/base64": {
+          "version": "1.1.2",
+          "resolved": "https://registry.npmjs.org/@jsonjoy.com/base64/-/base64-1.1.2.tgz",
+          "integrity": "sha512-q6XAnWQDIMA3+FTiOYajoYqySkO+JSat0ytXGSuRdq9uXE7o92gzuQwQM14xaCRlBLGq3v5miDGC4vkVTn54xA==",
+          "dev": true,
+          "requires": {}
+        },
+        "@jsonjoy.com/json-pack": {
+          "version": "1.2.0",
+          "resolved": "https://registry.npmjs.org/@jsonjoy.com/json-pack/-/json-pack-1.2.0.tgz",
+          "integrity": "sha512-io1zEbbYcElht3tdlqEOFxZ0dMTYrHz9iMf0gqn1pPjZFTCgM5R4R5IMA20Chb2UPYYsxjzs8CgZ7Nb5n2K2rA==",
+          "dev": true,
+          "requires": {
+            "@jsonjoy.com/base64": "^1.1.1",
+            "@jsonjoy.com/util": "^1.1.2",
+            "hyperdyperid": "^1.2.0",
+            "thingies": "^1.20.0"
+          }
+        },
+        "@jsonjoy.com/util": {
+          "version": "1.6.0",
+          "resolved": "https://registry.npmjs.org/@jsonjoy.com/util/-/util-1.6.0.tgz",
+          "integrity": "sha512-sw/RMbehRhN68WRtcKCpQOPfnH6lLP4GJfqzi3iYej8tnzpZUDr6UkZYJjcjjC0FWEJOJbyM3PTIwxucUmDG2A==",
+          "dev": true,
+          "requires": {}
+        },
+        "thingies": {
+          "version": "1.21.0",
+          "resolved": "https://registry.npmjs.org/thingies/-/thingies-1.21.0.tgz",
+          "integrity": "sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g==",
+          "dev": true,
+          "requires": {}
+        },
+        "tree-dump": {
+          "version": "1.0.3",
+          "resolved": "https://registry.npmjs.org/tree-dump/-/tree-dump-1.0.3.tgz",
+          "integrity": "sha512-il+Cv80yVHFBwokQSfd4bldvr1Md951DpgAGfmhydt04L+YzHgubm2tQ7zueWDcGENKHq0ZvGFR/hjvNXilHEg==",
+          "dev": true,
+          "requires": {}
+        },
+        "tslib": {
+          "version": "2.8.1",
+          "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
+          "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
+          "dev": true
+        }
       }
     },
     "merge-descriptors": {
@@ -5851,27 +5771,12 @@
         "mime-db": "1.52.0"
       }
     },
-    "mimic-fn": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz",
-      "integrity": "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==",
-      "dev": true
-    },
     "minimalistic-assert": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz",
       "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==",
       "dev": true
     },
-    "minimatch": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
-      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
-      "dev": true,
-      "requires": {
-        "brace-expansion": "^1.1.7"
-      }
-    },
     "ms": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
@@ -5901,15 +5806,15 @@
       "dev": true
     },
     "node-forge": {
-      "version": "1.3.1",
-      "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-1.3.1.tgz",
-      "integrity": "sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==",
+      "version": "1.3.2",
+      "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-1.3.2.tgz",
+      "integrity": "sha512-6xKiQ+cph9KImrRh0VsjH2d8/GXA4FIMlgU4B757iI1ApvcyA9VlouP0yZJha01V+huImO+kKMU7ih+2+E14fw==",
       "dev": true
     },
     "node-releases": {
-      "version": "2.0.13",
-      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.13.tgz",
-      "integrity": "sha512-uYr7J37ae/ORWdZeQ1xxMJe3NtdmqMC/JZK+geofDrkLUApKRHPd18/TxtBOJ4A0/+uUIliorNrfYV6s1b02eQ==",
+      "version": "2.0.27",
+      "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz",
+      "integrity": "sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==",
       "dev": true
     },
     "normalize-path": {
@@ -5918,19 +5823,10 @@
       "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
       "dev": true
     },
-    "npm-run-path": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
-      "integrity": "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==",
-      "dev": true,
-      "requires": {
-        "path-key": "^3.0.0"
-      }
-    },
     "object-inspect": {
-      "version": "1.13.2",
-      "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz",
-      "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==",
+      "version": "1.13.4",
+      "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
+      "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==",
       "dev": true
     },
     "obuf": {
@@ -5949,38 +5845,21 @@
       }
     },
     "on-headers": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.0.2.tgz",
-      "integrity": "sha512-pZAE+FJLoyITytdqK0U5s+FIpjN0JP3OzFi/u8Rx+EV5/W+JTWGXG8xFzevE7AjBfDqHv/8vL8qQsIhHnqRkrA==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz",
+      "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==",
       "dev": true
     },
-    "once": {
-      "version": "1.4.0",
-      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
-      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-      "dev": true,
-      "requires": {
-        "wrappy": "1"
-      }
-    },
-    "onetime": {
-      "version": "5.1.2",
-      "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
-      "integrity": "sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==",
-      "dev": true,
-      "requires": {
-        "mimic-fn": "^2.1.0"
-      }
-    },
     "open": {
-      "version": "8.4.2",
-      "resolved": "https://registry.npmjs.org/open/-/open-8.4.2.tgz",
-      "integrity": "sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==",
+      "version": "10.1.2",
+      "resolved": "https://registry.npmjs.org/open/-/open-10.1.2.tgz",
+      "integrity": "sha512-cxN6aIDPz6rm8hbebcP7vrQNhvRcveZoJU72Y7vskh4oIm+BZwBECnx5nTmrlres1Qapvx27Qo1Auukpf8PKXw==",
       "dev": true,
       "requires": {
-        "define-lazy-prop": "^2.0.0",
-        "is-docker": "^2.1.1",
-        "is-wsl": "^2.2.0"
+        "default-browser": "^5.2.1",
+        "define-lazy-prop": "^3.0.0",
+        "is-inside-container": "^1.0.0",
+        "is-wsl": "^3.1.0"
       }
     },
     "p-locate": {
@@ -6004,12 +5883,13 @@
       }
     },
     "p-retry": {
-      "version": "4.6.2",
-      "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz",
-      "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==",
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-6.2.1.tgz",
+      "integrity": "sha512-hEt02O4hUct5wtwg4H4KcWgDdm+l1bOaEy/hWzd8xtXB9BqxTWBBhb+2ImAtH4Cv4rPjV76xN3Zumqk3k3AhhQ==",
       "dev": true,
       "requires": {
-        "@types/retry": "0.12.0",
+        "@types/retry": "0.12.2",
+        "is-network-error": "^1.0.0",
         "retry": "^0.13.1"
       }
     },
@@ -6031,12 +5911,6 @@
       "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==",
       "dev": true
     },
-    "path-is-absolute": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
-      "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
-      "dev": true
-    },
     "path-key": {
       "version": "3.1.1",
       "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
@@ -6050,9 +5924,9 @@
       "dev": true
     },
     "path-to-regexp": {
-      "version": "0.1.10",
-      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz",
-      "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==",
+      "version": "0.1.12",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
+      "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==",
       "dev": true
     },
     "path-type": {
@@ -6062,9 +5936,9 @@
       "dev": true
     },
     "picocolors": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz",
-      "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==",
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
+      "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
       "dev": true
     },
     "picomatch": {
@@ -6106,12 +5980,6 @@
         }
       }
     },
-    "punycode": {
-      "version": "2.3.0",
-      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.0.tgz",
-      "integrity": "sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==",
-      "dev": true
-    },
     "qs": {
       "version": "6.13.0",
       "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
@@ -6152,14 +6020,6 @@
         "http-errors": "2.0.0",
         "iconv-lite": "0.4.24",
         "unpipe": "1.0.0"
-      },
-      "dependencies": {
-        "bytes": {
-          "version": "3.1.2",
-          "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
-          "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
-          "dev": true
-        }
       }
     },
     "readable-stream": {
@@ -6245,14 +6105,11 @@
       "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==",
       "dev": true
     },
-    "rimraf": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz",
-      "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==",
-      "dev": true,
-      "requires": {
-        "glob": "^7.1.3"
-      }
+    "run-applescript": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.0.0.tgz",
+      "integrity": "sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==",
+      "dev": true
     },
     "run-parallel": {
       "version": "1.2.0",
@@ -6276,14 +6133,15 @@
       "dev": true
     },
     "schema-utils": {
-      "version": "3.3.0",
-      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz",
-      "integrity": "sha512-pN/yOAvcC+5rQ5nERGuwrjLlYvLTbCibnZ1I7B1LaiAz9BRBlE9GMgE/eqV30P7aJQUf7Ddimy/RsbYO/GrVGg==",
+      "version": "4.3.3",
+      "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.3.tgz",
+      "integrity": "sha512-eflK8wEtyOE6+hsaRVPxvUKYCpRgzLqDTb8krvAsRIwOGlHoSgYLgBXoubGgLd2fT41/OUYdb48v4k4WWHQurA==",
       "dev": true,
       "requires": {
-        "@types/json-schema": "^7.0.8",
-        "ajv": "^6.12.5",
-        "ajv-keywords": "^3.5.2"
+        "@types/json-schema": "^7.0.9",
+        "ajv": "^8.9.0",
+        "ajv-formats": "^2.1.1",
+        "ajv-keywords": "^5.1.0"
       }
     },
     "select-hose": {
@@ -6293,11 +6151,12 @@
       "dev": true
     },
     "selfsigned": {
-      "version": "2.1.1",
-      "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-2.1.1.tgz",
-      "integrity": "sha512-GSL3aowiF7wa/WtSFwnUrludWFoNhftq8bUkH9pkzjpN2XSPOAYEgg6e0sS9s0rZwgJzJiQRPU18A6clnoW5wQ==",
+      "version": "2.4.1",
+      "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-2.4.1.tgz",
+      "integrity": "sha512-th5B4L2U+eGLq1TVh7zNRGBapioSORUeymIydxgFpwww9d2qyKvtuPU2jJuHvYAwwqi2Y596QBL3eEqcPEYL8Q==",
       "dev": true,
       "requires": {
+        "@types/node-forge": "^1.3.0",
         "node-forge": "^1"
       }
     },
@@ -6345,6 +6204,12 @@
           "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
           "dev": true
         },
+        "encodeurl": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
+          "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==",
+          "dev": true
+        },
         "ms": {
           "version": "2.1.3",
           "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
@@ -6422,28 +6287,6 @@
         "escape-html": "~1.0.3",
         "parseurl": "~1.3.3",
         "send": "0.19.0"
-      },
-      "dependencies": {
-        "encodeurl": {
-          "version": "2.0.0",
-          "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
-          "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
-          "dev": true
-        }
-      }
-    },
-    "set-function-length": {
-      "version": "1.2.2",
-      "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
-      "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==",
-      "dev": true,
-      "requires": {
-        "define-data-property": "^1.1.4",
-        "es-errors": "^1.3.0",
-        "function-bind": "^1.1.2",
-        "get-intrinsic": "^1.2.4",
-        "gopd": "^1.0.1",
-        "has-property-descriptors": "^1.0.2"
       }
     },
     "setprototypeof": {
@@ -6477,28 +6320,58 @@
       "dev": true
     },
     "shell-quote": {
-      "version": "1.8.1",
-      "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.1.tgz",
-      "integrity": "sha512-6j1W9l1iAs/4xYBI1SYOVZyFcCis9b4KCLQ8fgAGG07QvzaRLVVRQvAy85yNmmZSjYjg4MWh4gNvlPujU/5LpA==",
+      "version": "1.8.3",
+      "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.3.tgz",
+      "integrity": "sha512-ObmnIF4hXNg1BqhnHmgbDETF8dLPCggZWBjkQfhZpbszZnYur5DUljTcCHii5LC3J5E0yeO/1LIMyH+UvHQgyw==",
       "dev": true
     },
     "side-channel": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz",
-      "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==",
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz",
+      "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==",
       "dev": true,
       "requires": {
-        "call-bind": "^1.0.7",
         "es-errors": "^1.3.0",
-        "get-intrinsic": "^1.2.4",
-        "object-inspect": "^1.13.1"
+        "object-inspect": "^1.13.3",
+        "side-channel-list": "^1.0.0",
+        "side-channel-map": "^1.0.1",
+        "side-channel-weakmap": "^1.0.2"
       }
     },
-    "signal-exit": {
-      "version": "3.0.7",
-      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
-      "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
-      "dev": true
+    "side-channel-list": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz",
+      "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==",
+      "dev": true,
+      "requires": {
+        "es-errors": "^1.3.0",
+        "object-inspect": "^1.13.3"
+      }
+    },
+    "side-channel-map": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz",
+      "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==",
+      "dev": true,
+      "requires": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3"
+      }
+    },
+    "side-channel-weakmap": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz",
+      "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==",
+      "dev": true,
+      "requires": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3",
+        "side-channel-map": "^1.0.1"
+      }
     },
     "slash": {
       "version": "5.1.0",
@@ -6588,12 +6461,6 @@
         "safe-buffer": "~5.1.0"
       }
     },
-    "strip-final-newline": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz",
-      "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==",
-      "dev": true
-    },
     "supports-color": {
       "version": "8.1.1",
       "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz",
@@ -6610,34 +6477,34 @@
       "dev": true
     },
     "tapable": {
-      "version": "2.2.1",
-      "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.1.tgz",
-      "integrity": "sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==",
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.0.tgz",
+      "integrity": "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==",
       "dev": true
     },
     "terser": {
-      "version": "5.31.6",
-      "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.6.tgz",
-      "integrity": "sha512-PQ4DAriWzKj+qgehQ7LK5bQqCFNMmlhjR2PFFLuqGCpuCAauxemVBWwWOxo3UIwWQx8+Pr61Df++r76wDmkQBg==",
+      "version": "5.46.0",
+      "resolved": "https://registry.npmjs.org/terser/-/terser-5.46.0.tgz",
+      "integrity": "sha512-jTwoImyr/QbOWFFso3YoU3ik0jBBDJ6JTOQiy/J2YxVJdZCc+5u7skhNwiOR3FQIygFqVUPHl7qbbxtjW2K3Qg==",
       "dev": true,
       "requires": {
         "@jridgewell/source-map": "^0.3.3",
-        "acorn": "^8.8.2",
+        "acorn": "^8.15.0",
         "commander": "^2.20.0",
         "source-map-support": "~0.5.20"
       }
     },
     "terser-webpack-plugin": {
-      "version": "5.3.10",
-      "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.10.tgz",
-      "integrity": "sha512-BKFPWlPDndPs+NGGCr1U59t0XScL5317Y0UReNrHaw9/FwhPENlq6bfgs+4yPfyP51vqC1bQ4rp1EfXW5ZSH9w==",
+      "version": "5.3.16",
+      "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.16.tgz",
+      "integrity": "sha512-h9oBFCWrq78NyWWVcSwZarJkZ01c2AyGrzs1crmHZO3QUg9D61Wu4NPjBy69n7JqylFF5y+CsUZYmYEIZ3mR+Q==",
       "dev": true,
       "requires": {
-        "@jridgewell/trace-mapping": "^0.3.20",
+        "@jridgewell/trace-mapping": "^0.3.25",
         "jest-worker": "^27.4.5",
-        "schema-utils": "^3.1.1",
-        "serialize-javascript": "^6.0.1",
-        "terser": "^5.26.0"
+        "schema-utils": "^4.3.0",
+        "serialize-javascript": "^6.0.2",
+        "terser": "^5.31.1"
       }
     },
     "thunky": {
@@ -6690,22 +6557,13 @@
       "dev": true
     },
     "update-browserslist-db": {
-      "version": "1.0.13",
-      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.13.tgz",
-      "integrity": "sha512-xebP81SNcPuNpPP3uzeW1NYXxI3rxyJzF3pD6sH4jE7o/IX+WtSpwnVU+qIsDPyk0d3hmFQ7mjqc6AtV604hbg==",
-      "dev": true,
-      "requires": {
-        "escalade": "^3.1.1",
-        "picocolors": "^1.0.0"
-      }
-    },
-    "uri-js": {
-      "version": "4.4.1",
-      "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
-      "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz",
+      "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==",
       "dev": true,
       "requires": {
-        "punycode": "^2.1.0"
+        "escalade": "^3.2.0",
+        "picocolors": "^1.1.1"
       }
     },
     "util-deprecate": {
@@ -6733,9 +6591,9 @@
       "dev": true
     },
     "watchpack": {
-      "version": "2.4.2",
-      "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.4.2.tgz",
-      "integrity": "sha512-TnbFSbcOCcDgjZ4piURLCbJ3nJhznVh9kw6F6iokjiFPl8ONxe9A6nMDVXDiNbrSfLILs6vB07F7wLBrwPYzJw==",
+      "version": "2.5.1",
+      "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
+      "integrity": "sha512-Zn5uXdcFNIA1+1Ei5McRd+iRzfhENPCe7LeABkJtNulSxjma+l7ltNx55BWZkRlwRnpOgHqxnjyaDgJnNXnqzg==",
       "dev": true,
       "requires": {
         "glob-to-regexp": "^0.4.1",
@@ -6752,42 +6610,36 @@
       }
     },
     "webpack": {
-      "version": "5.94.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.94.0.tgz",
-      "integrity": "sha512-KcsGn50VT+06JH/iunZJedYGUJS5FGjow8wb9c0v5n1Om8O1g4L6LjtfxwlXIATopoQu+vOXXa7gYisWxCoPyg==",
-      "dev": true,
-      "requires": {
-        "@types/estree": "^1.0.5",
-        "@webassemblyjs/ast": "^1.12.1",
-        "@webassemblyjs/wasm-edit": "^1.12.1",
-        "@webassemblyjs/wasm-parser": "^1.12.1",
-        "acorn": "^8.7.1",
-        "acorn-import-attributes": "^1.9.5",
-        "browserslist": "^4.21.10",
+      "version": "5.105.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.105.0.tgz",
+      "integrity": "sha512-gX/dMkRQc7QOMzgTe6KsYFM7DxeIONQSui1s0n/0xht36HvrgbxtM1xBlgx596NbpHuQU8P7QpKwrZYwUX48nw==",
+      "dev": true,
+      "requires": {
+        "@types/eslint-scope": "^3.7.7",
+        "@types/estree": "^1.0.8",
+        "@types/json-schema": "^7.0.15",
+        "@webassemblyjs/ast": "^1.14.1",
+        "@webassemblyjs/wasm-edit": "^1.14.1",
+        "@webassemblyjs/wasm-parser": "^1.14.1",
+        "acorn": "^8.15.0",
+        "acorn-import-phases": "^1.0.3",
+        "browserslist": "^4.28.1",
         "chrome-trace-event": "^1.0.2",
-        "enhanced-resolve": "^5.17.1",
-        "es-module-lexer": "^1.2.1",
+        "enhanced-resolve": "^5.19.0",
+        "es-module-lexer": "^2.0.0",
         "eslint-scope": "5.1.1",
         "events": "^3.2.0",
         "glob-to-regexp": "^0.4.1",
         "graceful-fs": "^4.2.11",
         "json-parse-even-better-errors": "^2.3.1",
-        "loader-runner": "^4.2.0",
+        "loader-runner": "^4.3.1",
         "mime-types": "^2.1.27",
         "neo-async": "^2.6.2",
-        "schema-utils": "^3.2.0",
-        "tapable": "^2.1.1",
-        "terser-webpack-plugin": "^5.3.10",
-        "watchpack": "^2.4.1",
-        "webpack-sources": "^3.2.3"
-      },
-      "dependencies": {
-        "webpack-sources": {
-          "version": "3.2.3",
-          "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.2.3.tgz",
-          "integrity": "sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==",
-          "dev": true
-        }
+        "schema-utils": "^4.3.3",
+        "tapable": "^2.3.0",
+        "terser-webpack-plugin": "^5.3.16",
+        "watchpack": "^2.5.1",
+        "webpack-sources": "^3.3.3"
       }
     },
     "webpack-cli": {
@@ -6820,136 +6672,53 @@
       }
     },
     "webpack-dev-middleware": {
-      "version": "5.3.4",
-      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz",
-      "integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==",
+      "version": "7.4.2",
+      "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-7.4.2.tgz",
+      "integrity": "sha512-xOO8n6eggxnwYpy1NlzUKpvrjfJTvae5/D6WOK0S2LSo7vjmo5gCM1DbLUmFqrMTJP+W/0YZNctm7jasWvLuBA==",
       "dev": true,
       "requires": {
         "colorette": "^2.0.10",
-        "memfs": "^3.4.3",
+        "memfs": "^4.6.0",
         "mime-types": "^2.1.31",
+        "on-finished": "^2.4.1",
         "range-parser": "^1.2.1",
         "schema-utils": "^4.0.0"
-      },
-      "dependencies": {
-        "ajv": {
-          "version": "8.12.0",
-          "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-          "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.1",
-            "json-schema-traverse": "^1.0.0",
-            "require-from-string": "^2.0.2",
-            "uri-js": "^4.2.2"
-          }
-        },
-        "ajv-keywords": {
-          "version": "5.1.0",
-          "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-          "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.3"
-          }
-        },
-        "json-schema-traverse": {
-          "version": "1.0.0",
-          "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-          "dev": true
-        },
-        "schema-utils": {
-          "version": "4.2.0",
-          "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.2.0.tgz",
-          "integrity": "sha512-L0jRsrPpjdckP3oPug3/VxNKt2trR8TcabrM6FOAAlvC/9Phcmm+cuAgTlxBqdBR1WJx7Naj9WHw+aOmheSVbw==",
-          "dev": true,
-          "requires": {
-            "@types/json-schema": "^7.0.9",
-            "ajv": "^8.9.0",
-            "ajv-formats": "^2.1.1",
-            "ajv-keywords": "^5.1.0"
-          }
-        }
       }
     },
     "webpack-dev-server": {
-      "version": "4.15.1",
-      "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-4.15.1.tgz",
-      "integrity": "sha512-5hbAst3h3C3L8w6W4P96L5vaV0PxSmJhxZvWKYIdgxOQm8pNZ5dEOmmSLBVpP85ReeyRt6AS1QJNyo/oFFPeVA==",
-      "dev": true,
-      "requires": {
-        "@types/bonjour": "^3.5.9",
-        "@types/connect-history-api-fallback": "^1.3.5",
-        "@types/express": "^4.17.13",
-        "@types/serve-index": "^1.9.1",
-        "@types/serve-static": "^1.13.10",
-        "@types/sockjs": "^0.3.33",
-        "@types/ws": "^8.5.5",
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-5.2.1.tgz",
+      "integrity": "sha512-ml/0HIj9NLpVKOMq+SuBPLHcmbG+TGIjXRHsYfZwocUBIqEvws8NnS/V9AFQ5FKP+tgn5adwVwRrTEpGL33QFQ==",
+      "dev": true,
+      "requires": {
+        "@types/bonjour": "^3.5.13",
+        "@types/connect-history-api-fallback": "^1.5.4",
+        "@types/express": "^4.17.21",
+        "@types/express-serve-static-core": "^4.17.21",
+        "@types/serve-index": "^1.9.4",
+        "@types/serve-static": "^1.15.5",
+        "@types/sockjs": "^0.3.36",
+        "@types/ws": "^8.5.10",
         "ansi-html-community": "^0.0.8",
-        "bonjour-service": "^1.0.11",
-        "chokidar": "^3.5.3",
+        "bonjour-service": "^1.2.1",
+        "chokidar": "^3.6.0",
         "colorette": "^2.0.10",
         "compression": "^1.7.4",
         "connect-history-api-fallback": "^2.0.0",
-        "default-gateway": "^6.0.3",
-        "express": "^4.17.3",
+        "express": "^4.21.2",
         "graceful-fs": "^4.2.6",
-        "html-entities": "^2.3.2",
-        "http-proxy-middleware": "^2.0.3",
-        "ipaddr.js": "^2.0.1",
-        "launch-editor": "^2.6.0",
-        "open": "^8.0.9",
-        "p-retry": "^4.5.0",
-        "rimraf": "^3.0.2",
-        "schema-utils": "^4.0.0",
-        "selfsigned": "^2.1.1",
+        "http-proxy-middleware": "^2.0.7",
+        "ipaddr.js": "^2.1.0",
+        "launch-editor": "^2.6.1",
+        "open": "^10.0.3",
+        "p-retry": "^6.2.0",
+        "schema-utils": "^4.2.0",
+        "selfsigned": "^2.4.1",
         "serve-index": "^1.9.1",
         "sockjs": "^0.3.24",
         "spdy": "^4.0.2",
-        "webpack-dev-middleware": "^5.3.1",
-        "ws": "^8.13.0"
-      },
-      "dependencies": {
-        "ajv": {
-          "version": "8.12.0",
-          "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
-          "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.1",
-            "json-schema-traverse": "^1.0.0",
-            "require-from-string": "^2.0.2",
-            "uri-js": "^4.2.2"
-          }
-        },
-        "ajv-keywords": {
-          "version": "5.1.0",
-          "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz",
-          "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==",
-          "dev": true,
-          "requires": {
-            "fast-deep-equal": "^3.1.3"
-          }
-        },
-        "json-schema-traverse": {
-          "version": "1.0.0",
-          "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
-          "dev": true
-        },
-        "schema-utils": {
-          "version": "4.2.0",
-          "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.2.0.tgz",
-          "integrity": "sha512-L0jRsrPpjdckP3oPug3/VxNKt2trR8TcabrM6FOAAlvC/9Phcmm+cuAgTlxBqdBR1WJx7Naj9WHw+aOmheSVbw==",
-          "dev": true,
-          "requires": {
-            "@types/json-schema": "^7.0.9",
-            "ajv": "^8.9.0",
-            "ajv-formats": "^2.1.1",
-            "ajv-keywords": "^5.1.0"
-          }
-        }
+        "webpack-dev-middleware": "^7.4.2",
+        "ws": "^8.18.0"
       }
     },
     "webpack-merge": {
@@ -6962,6 +6731,12 @@
         "wildcard": "^2.0.0"
       }
     },
+    "webpack-sources": {
+      "version": "3.3.3",
+      "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.3.3.tgz",
+      "integrity": "sha512-yd1RBzSGanHkitROoPFd6qsrxt+oFhg/129YzheDGqeustzX0vTZJZsSsQjVQC4yzBQ56K55XU8gaNCtIzOnTg==",
+      "dev": true
+    },
     "websocket-driver": {
       "version": "0.7.4",
       "resolved": "https://registry.npmjs.org/websocket-driver/-/websocket-driver-0.7.4.tgz",
@@ -6994,16 +6769,10 @@
       "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==",
       "dev": true
     },
-    "wrappy": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
-      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-      "dev": true
-    },
     "ws": {
-      "version": "8.17.1",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz",
-      "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==",
+      "version": "8.18.2",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.2.tgz",
+      "integrity": "sha512-DMricUmwGZUVr++AEAe2uiVM7UoO9MAVZMDu05UQOaUII0lp+zOzLLU4Xqh/JvTqklB1T4uELaaPBKyjE1r4fQ==",
       "dev": true,
       "requires": {}
     }
diff --git a/datafusion/wasmtest/datafusion-wasm-app/package.json b/datafusion/wasmtest/datafusion-wasm-app/package.json
index 5a2262400cfd5..aecc5b689554e 100644
--- a/datafusion/wasmtest/datafusion-wasm-app/package.json
+++ b/datafusion/wasmtest/datafusion-wasm-app/package.json
@@ -27,9 +27,9 @@
     "datafusion-wasmtest": "../pkg"
   },
   "devDependencies": {
-    "webpack": "5.94.0",
+    "webpack": "5.105.0",
     "webpack-cli": "5.1.4",
-    "webpack-dev-server": "4.15.1",
+    "webpack-dev-server": "5.2.1",
     "copy-webpack-plugin": "12.0.2"
   }
 }
diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs
index e30a1046ab274..f545ccf19306a 100644
--- a/datafusion/wasmtest/src/lib.rs
+++ b/datafusion/wasmtest/src/lib.rs
@@ -15,22 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#![cfg_attr(test, allow(clippy::needless_pass_by_value))]
 #![doc(
     html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
     html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
 )]
-#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 
 extern crate wasm_bindgen;
 
-use datafusion_common::{DFSchema, ScalarValue};
-use datafusion_expr::execution_props::ExecutionProps;
+use datafusion_common::ScalarValue;
 use datafusion_expr::lit;
 use datafusion_expr::simplify::SimplifyContext;
 use datafusion_optimizer::simplify_expressions::ExprSimplifier;
 use datafusion_sql::sqlparser::dialect::GenericDialect;
 use datafusion_sql::sqlparser::parser::Parser;
-use std::sync::Arc;
 use wasm_bindgen::prelude::*;
 pub fn set_panic_hook() {
     // When the `console_error_panic_hook` feature is enabled, we can call the
@@ -62,10 +61,7 @@ pub fn basic_exprs() {
     log(&format!("Expr: {expr:?}"));
 
     // Simplify Expr (using datafusion-phys-expr and datafusion-optimizer)
-    let schema = Arc::new(DFSchema::empty());
-    let execution_props = ExecutionProps::new();
-    let simplifier =
-        ExprSimplifier::new(SimplifyContext::new(&execution_props).with_schema(schema));
+    let simplifier = ExprSimplifier::new(SimplifyContext::default());
     let simplified_expr = simplifier.simplify(expr).unwrap();
     log(&format!("Simplified Expr: {simplified_expr:?}"));
 }
@@ -81,7 +77,10 @@ pub fn basic_parse() {
 
 #[cfg(test)]
 mod test {
-    use super::*;
+    use std::sync::Arc;
+
+    use bytes::Bytes;
+    use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
     use datafusion::{
         arrow::{
             array::{ArrayRef, Int32Array, RecordBatch, StringArray},
@@ -89,8 +88,9 @@ mod test {
         },
         datasource::MemTable,
         execution::context::SessionContext,
+        prelude::CsvReadOptions,
     };
-    use datafusion_common::test_util::batches_to_string;
+    use datafusion_common::{DataFusionError, test_util::batches_to_string};
     use datafusion_execution::{
         config::SessionConfig,
         disk_manager::{DiskManagerBuilder, DiskManagerMode},
@@ -98,17 +98,18 @@ mod test {
     };
     use datafusion_physical_plan::collect;
     use datafusion_sql::parser::DFParser;
-    use object_store::{memory::InMemory, path::Path, ObjectStore};
+    use futures::{StreamExt, TryStreamExt, stream};
+    use object_store::{ObjectStoreExt, PutPayload, memory::InMemory, path::Path};
     use url::Url;
     use wasm_bindgen_test::wasm_bindgen_test;
 
     wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
 
+    #[cfg(target_arch = "wasm32")]
     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
-    #[cfg_attr(not(target_arch = "wasm32"), allow(dead_code))]
     fn datafusion_test() {
-        basic_exprs();
-        basic_parse();
+        super::basic_exprs();
+        super::basic_parse();
     }
 
     fn get_ctx() -> Arc<SessionContext> {
@@ -261,4 +262,55 @@ mod test {
              +----+-------+"
         );
     }
+
+    #[wasm_bindgen_test(unsupported = tokio::test)]
+    async fn test_csv_read_xz_compressed() {
+        let csv_data = "id,value\n1,a\n2,b\n3,c\n";
+        let input = Bytes::from(csv_data.as_bytes().to_vec());
+        let input_stream =
+            stream::iter(vec![Ok::<Bytes, DataFusionError>(input)]).boxed();
+
+        let compressed_stream = FileCompressionType::XZ
+            .convert_to_compress_stream(input_stream)
+            .unwrap();
+        let compressed_data: Vec<Bytes> = compressed_stream.try_collect().await.unwrap();
+
+        let store = InMemory::new();
+        let path = Path::from("data.csv.xz");
+        store
+            .put(&path, PutPayload::from_iter(compressed_data))
+            .await
+            .unwrap();
+
+        let url = Url::parse("memory://").unwrap();
+        let ctx = SessionContext::new();
+        ctx.register_object_store(&url, Arc::new(store));
+
+        let csv_options = CsvReadOptions::new()
+            .has_header(true)
+            .file_compression_type(FileCompressionType::XZ)
+            .file_extension("csv.xz");
+        ctx.register_csv("compressed", "memory:///data.csv.xz", csv_options)
+            .await
+            .unwrap();
+
+        let result = ctx
+            .sql("SELECT * FROM compressed")
+            .await
+            .unwrap()
+            .collect()
+            .await
+            .unwrap();
+
+        assert_eq!(
+            batches_to_string(&result),
+            "+----+-------+\n\
+             | id | value |\n\
+             +----+-------+\n\
+             | 1  | a     |\n\
+             | 2  | b     |\n\
+             | 3  | c     |\n\
+             +----+-------+"
+        );
+    }
 }
diff --git a/dev/changelog/48.0.0.md b/dev/changelog/48.0.0.md
new file mode 100644
index 0000000000000..9cf6c03b7acf0
--- /dev/null
+++ b/dev/changelog/48.0.0.md
@@ -0,0 +1,405 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 48.0.0 Changelog
+
+This release consists of 267 commits from 89 contributors. See credits at the end of this changelog for more information.
+
+**Breaking changes:**
+
+- Attach Diagnostic to syntax errors [#15680](https://github.com/apache/datafusion/pull/15680) (logan-keede)
+- Change `flatten` so it does only a level, not recursively [#15160](https://github.com/apache/datafusion/pull/15160) (delamarch3)
+- Improve `simplify_expressions` rule [#15735](https://github.com/apache/datafusion/pull/15735) (xudong963)
+- Support WITHIN GROUP syntax to standardize certain existing aggregate functions [#13511](https://github.com/apache/datafusion/pull/13511) (Garamda)
+- Add Extension Type / Metadata support for Scalar UDFs [#15646](https://github.com/apache/datafusion/pull/15646) (timsaucer)
+- chore: fix clippy::large_enum_variant for DataFusionError [#15861](https://github.com/apache/datafusion/pull/15861) (rroelke)
+- Feat: introduce `ExecutionPlan::partition_statistics` API [#15852](https://github.com/apache/datafusion/pull/15852) (xudong963)
+- refactor: remove deprecated `ParquetExec` [#15973](https://github.com/apache/datafusion/pull/15973) (miroim)
+- refactor: remove deprecated `ArrowExec` [#16006](https://github.com/apache/datafusion/pull/16006) (miroim)
+- refactor: remove deprecated `MemoryExec` [#16007](https://github.com/apache/datafusion/pull/16007) (miroim)
+- refactor: remove deprecated `JsonExec` [#16005](https://github.com/apache/datafusion/pull/16005) (miroim)
+- feat: metadata handling for aggregates and window functions [#15911](https://github.com/apache/datafusion/pull/15911) (timsaucer)
+- Remove `Filter::having` field [#16154](https://github.com/apache/datafusion/pull/16154) (findepi)
+- Shift from Field to FieldRef for all user defined functions [#16122](https://github.com/apache/datafusion/pull/16122) (timsaucer)
+- Change default SQL mapping for `VARCAHR` from `Utf8` to `Utf8View` [#16142](https://github.com/apache/datafusion/pull/16142) (zhuqi-lucas)
+- Minor: remove unused IPCWriter [#16215](https://github.com/apache/datafusion/pull/16215) (alamb)
+- Reduce size of `Expr` struct [#16207](https://github.com/apache/datafusion/pull/16207) (hendrikmakait)
+
+**Performance related:**
+
+- Apply pre-selection and computation skipping to short-circuit optimization [#15694](https://github.com/apache/datafusion/pull/15694) (acking-you)
+- Add a fast path for `optimize_projection` [#15746](https://github.com/apache/datafusion/pull/15746) (xudong963)
+- Speed up `optimize_projection` by improving `is_projection_unnecessary` [#15761](https://github.com/apache/datafusion/pull/15761) (xudong963)
+- Speed up `optimize_projection` [#15787](https://github.com/apache/datafusion/pull/15787) (xudong963)
+- Support `GroupsAccumulator` for Avg duration [#15748](https://github.com/apache/datafusion/pull/15748) (shruti2522)
+- Optimize performance of `string::ascii` function [#16087](https://github.com/apache/datafusion/pull/16087) (tlm365)
+
+**Implemented enhancements:**
+
+- Set DataFusion runtime configurations through SQL interface [#15594](https://github.com/apache/datafusion/pull/15594) (kumarlokesh)
+- feat: Add option to adjust writer buffer size for query output [#15747](https://github.com/apache/datafusion/pull/15747) (m09526)
+- feat: Add `datafusion-spark` crate [#15168](https://github.com/apache/datafusion/pull/15168) (shehabgamin)
+- feat: create helpers to set the max_temp_directory_size [#15919](https://github.com/apache/datafusion/pull/15919) (jdrouet)
+- feat: ORDER BY ALL [#15772](https://github.com/apache/datafusion/pull/15772) (PokIsemaine)
+- feat: support min/max for struct [#15667](https://github.com/apache/datafusion/pull/15667) (chenkovsky)
+- feat(proto): udf decoding fallback [#15997](https://github.com/apache/datafusion/pull/15997) (leoyvens)
+- feat: make error handling in indent explain consistent with that in tree [#16097](https://github.com/apache/datafusion/pull/16097) (chenkovsky)
+- feat: coerce to/from fixed size binary to binary view [#16110](https://github.com/apache/datafusion/pull/16110) (chenkovsky)
+- feat: array_length for fixed size list [#16167](https://github.com/apache/datafusion/pull/16167) (chenkovsky)
+- feat: ADD sha2 spark function [#16168](https://github.com/apache/datafusion/pull/16168) (getChan)
+- feat: create builder for disk manager [#16191](https://github.com/apache/datafusion/pull/16191) (jdrouet)
+- feat: Add Aggregate UDF to FFI crate [#14775](https://github.com/apache/datafusion/pull/14775) (timsaucer)
+- feat(small): Add `BaselineMetrics` to `generate_series()` table function [#16255](https://github.com/apache/datafusion/pull/16255) (2010YOUY01)
+- feat: Add Window UDFs to FFI Crate [#16261](https://github.com/apache/datafusion/pull/16261) (timsaucer)
+
+**Fixed bugs:**
+
+- fix: serialize listing table without partition column [#15737](https://github.com/apache/datafusion/pull/15737) (chenkovsky)
+- fix: describe Parquet schema with coerce_int96 [#15750](https://github.com/apache/datafusion/pull/15750) (chenkovsky)
+- fix: clickbench type err [#15773](https://github.com/apache/datafusion/pull/15773) (chenkovsky)
+- Fix: fetch is missing in `replace_order_preserving_variants` method during `EnforceDistribution` optimizer [#15808](https://github.com/apache/datafusion/pull/15808) (xudong963)
+- Fix: fetch is missing in `EnforceSorting` optimizer (two places) [#15822](https://github.com/apache/datafusion/pull/15822) (xudong963)
+- fix: Avoid mistaken ILike to string equality optimization [#15836](https://github.com/apache/datafusion/pull/15836) (srh)
+- Map file-level column statistics to the table-level [#15865](https://github.com/apache/datafusion/pull/15865) (xudong963)
+- fix(avro): Respect projection order in Avro reader [#15840](https://github.com/apache/datafusion/pull/15840) (nantunes)
+- fix: correctly specify the nullability of `map_values` return type [#15901](https://github.com/apache/datafusion/pull/15901) (rluvaton)
+- Fix CI in main [#15917](https://github.com/apache/datafusion/pull/15917) (blaginin)
+- fix: sqllogictest on Windows [#15932](https://github.com/apache/datafusion/pull/15932) (nuno-faria)
+- fix: fold cast null to substrait typed null [#15854](https://github.com/apache/datafusion/pull/15854) (discord9)
+- Fix: `build_predicate_expression` method doesn't process `false` expr correctly [#15995](https://github.com/apache/datafusion/pull/15995) (xudong963)
+- fix: add an "expr_planners" method to SessionState [#15119](https://github.com/apache/datafusion/pull/15119) (niebayes)
+- fix: overcounting of memory in first/last. [#15924](https://github.com/apache/datafusion/pull/15924) (ashdnazg)
+- fix: track timing for coalescer's in execution time [#16048](https://github.com/apache/datafusion/pull/16048) (waynexia)
+- fix: stack overflow for substrait functions with large argument lists that translate to DataFusion binary operators [#16031](https://github.com/apache/datafusion/pull/16031) (fmonjalet)
+- fix: coerce int96 resolution inside of list, struct, and map types [#16058](https://github.com/apache/datafusion/pull/16058) (mbutrovich)
+- fix: Add coercion rules for Float16 types [#15816](https://github.com/apache/datafusion/pull/15816) (etseidl)
+- fix: describe escaped quoted identifiers [#16082](https://github.com/apache/datafusion/pull/16082) (jfahne)
+- fix: Remove trailing whitespace in `Display` for `LogicalPlan::Projection` [#16164](https://github.com/apache/datafusion/pull/16164) (atahanyorganci)
+- fix: metadata of join schema [#16221](https://github.com/apache/datafusion/pull/16221) (chenkovsky)
+- fix: add missing row count limits to TPC-H queries [#16230](https://github.com/apache/datafusion/pull/16230) (0ax1)
+- fix: NaN semantics in GROUP BY [#16256](https://github.com/apache/datafusion/pull/16256) (chenkovsky)
+
+**Documentation updates:**
+
+- Add DataFusion 47.0.0 Upgrade Guide [#15749](https://github.com/apache/datafusion/pull/15749) (alamb)
+- Improve documentation for format `OPTIONS` clause [#15708](https://github.com/apache/datafusion/pull/15708) (marvelshan)
+- doc: Adding Feldera as known user [#15799](https://github.com/apache/datafusion/pull/15799) (comphead)
+- docs: add ArkFlow [#15826](https://github.com/apache/datafusion/pull/15826) (chenquan)
+- Fix `from_unixtime` function documentation [#15844](https://github.com/apache/datafusion/pull/15844) (Viicos)
+- Upgrade-guide: Downgrade "FileScanConfig –> FileScanConfigBuilder" headline [#15883](https://github.com/apache/datafusion/pull/15883) (simonvandel)
+- doc: Update known users docs [#15895](https://github.com/apache/datafusion/pull/15895) (comphead)
+- Add `union_tag` scalar function [#14687](https://github.com/apache/datafusion/pull/14687) (gstvg)
+- Fix typo in introduction.md [#15910](https://github.com/apache/datafusion/pull/15910) (tom-mont)
+- Add `FormatOptions` to Config [#15793](https://github.com/apache/datafusion/pull/15793) (blaginin)
+- docs: Label `bloom_filter_on_read` as a reading config [#15933](https://github.com/apache/datafusion/pull/15933) (nuno-faria)
+- Implement Parquet filter pushdown via new filter pushdown APIs [#15769](https://github.com/apache/datafusion/pull/15769) (adriangb)
+- Enable repartitioning on MemTable. [#15409](https://github.com/apache/datafusion/pull/15409) (wiedld)
+- Updated extending operators documentation [#15612](https://github.com/apache/datafusion/pull/15612) (the0ninjas)
+- chore: Replace MSRV link on main page with Github badge [#16020](https://github.com/apache/datafusion/pull/16020) (comphead)
+- Add note to upgrade guide for removal of `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` [#16034](https://github.com/apache/datafusion/pull/16034) (alamb)
+- docs: Clarify that it is only the name of the field that is ignored [#16052](https://github.com/apache/datafusion/pull/16052) (alamb)
+- [Docs]: Added SQL example for all window functions [#16074](https://github.com/apache/datafusion/pull/16074) (Adez017)
+- Fix CI on main: Add window function examples in code [#16102](https://github.com/apache/datafusion/pull/16102) (alamb)
+- chore: Remove SMJ experimental status in docs [#16072](https://github.com/apache/datafusion/pull/16072) (comphead)
+- doc: fix indent format explain [#16085](https://github.com/apache/datafusion/pull/16085) (chenkovsky)
+- Update documentation for `datafusion.execution.collect_statistics` [#16100](https://github.com/apache/datafusion/pull/16100) (alamb)
+- Make `SessionContext::register_parquet` obey `collect_statistics` config [#16080](https://github.com/apache/datafusion/pull/16080) (adriangb)
+- Improve the DML / DDL Documentation [#16115](https://github.com/apache/datafusion/pull/16115) (alamb)
+- docs: Fix typos and minor grammatical issues in Architecture docs [#16119](https://github.com/apache/datafusion/pull/16119) (patrickcsullivan)
+- Set `TrackConsumersPool` as default in datafusion-cli [#16081](https://github.com/apache/datafusion/pull/16081) (ding-young)
+- Minor: Fix links in substrait readme [#16156](https://github.com/apache/datafusion/pull/16156) (alamb)
+- Add macro for creating DataFrame (#16090) [#16104](https://github.com/apache/datafusion/pull/16104) (cj-zhukov)
+- doc: Move `dataframe!` example into dedicated example [#16197](https://github.com/apache/datafusion/pull/16197) (comphead)
+- doc: add diagram to describe how DataSource, FileSource, and DataSourceExec are related [#16181](https://github.com/apache/datafusion/pull/16181) (onlyjackfrost)
+- Clarify documentation about gathering statistics for parquet files [#16157](https://github.com/apache/datafusion/pull/16157) (alamb)
+- Add change to VARCHAR in the upgrade guide [#16216](https://github.com/apache/datafusion/pull/16216) (alamb)
+- Add iceberg-rust to user list [#16246](https://github.com/apache/datafusion/pull/16246) (jonathanc-n)
+- Prepare for 48.0.0 release: Version and Changelog [#16238](https://github.com/apache/datafusion/pull/16238) (xudong963)
+
+**Other:**
+
+- Enable setting default values for target_partitions and planning_concurrency [#15712](https://github.com/apache/datafusion/pull/15712) (nuno-faria)
+- minor: fix doc comment [#15733](https://github.com/apache/datafusion/pull/15733) (niebayes)
+- chore(deps-dev): bump http-proxy-middleware from 2.0.6 to 2.0.9 in /datafusion/wasmtest/datafusion-wasm-app [#15738](https://github.com/apache/datafusion/pull/15738) (dependabot[bot])
+- Avoid computing unnecessary statstics [#15729](https://github.com/apache/datafusion/pull/15729) (xudong963)
+- chore(deps): bump libc from 0.2.171 to 0.2.172 [#15745](https://github.com/apache/datafusion/pull/15745) (dependabot[bot])
+- Final release note touchups [#15741](https://github.com/apache/datafusion/pull/15741) (alamb)
+- Refactor regexp slt tests [#15709](https://github.com/apache/datafusion/pull/15709) (kumarlokesh)
+- ExecutionPlan: add APIs for filter pushdown & optimizer rule to apply them [#15566](https://github.com/apache/datafusion/pull/15566) (adriangb)
+- Coerce and simplify FixedSizeBinary equality to literal binary [#15726](https://github.com/apache/datafusion/pull/15726) (leoyvens)
+- Minor: simplify code in datafusion-proto [#15752](https://github.com/apache/datafusion/pull/15752) (alamb)
+- chore(deps): bump clap from 4.5.35 to 4.5.36 [#15759](https://github.com/apache/datafusion/pull/15759) (dependabot[bot])
+- Support `Accumulator` for avg duration [#15468](https://github.com/apache/datafusion/pull/15468) (shruti2522)
+- Show current SQL recursion limit in RecursionLimitExceeded error message [#15644](https://github.com/apache/datafusion/pull/15644) (kumarlokesh)
+- Minor: fix flaky test in `aggregate.slt` [#15786](https://github.com/apache/datafusion/pull/15786) (xudong963)
+- Minor: remove unused logic for limit pushdown [#15730](https://github.com/apache/datafusion/pull/15730) (zhuqi-lucas)
+- chore(deps): bump sqllogictest from 0.28.0 to 0.28.1 [#15788](https://github.com/apache/datafusion/pull/15788) (dependabot[bot])
+- Add try_new for LogicalPlan::Join [#15757](https://github.com/apache/datafusion/pull/15757) (kumarlokesh)
+- Minor: eliminate unnecessary struct creation in session state build [#15800](https://github.com/apache/datafusion/pull/15800) (Rachelint)
+- chore(deps): bump half from 2.5.0 to 2.6.0 [#15806](https://github.com/apache/datafusion/pull/15806) (dependabot[bot])
+- Add `or_fun_call` and `unnecessary_lazy_evaluations` lints on `core` [#15807](https://github.com/apache/datafusion/pull/15807) (Rachelint)
+- chore(deps): bump env_logger from 0.11.7 to 0.11.8 [#15823](https://github.com/apache/datafusion/pull/15823) (dependabot[bot])
+- Support unparsing `UNION` for distinct results [#15814](https://github.com/apache/datafusion/pull/15814) (phillipleblanc)
+- Add `MemoryPool::memory_limit` to expose setting memory usage limit [#15828](https://github.com/apache/datafusion/pull/15828) (Rachelint)
+- Preserve projection for inline scan [#15825](https://github.com/apache/datafusion/pull/15825) (jayzhan211)
+- Minor: cleanup hash table after emit all [#15834](https://github.com/apache/datafusion/pull/15834) (jayzhan211)
+- chore(deps): bump pyo3 from 0.24.1 to 0.24.2 [#15838](https://github.com/apache/datafusion/pull/15838) (dependabot[bot])
+- Minor: fix potential flaky test in aggregate.slt [#15829](https://github.com/apache/datafusion/pull/15829) (bikbov)
+- Fix `ILIKE` expression support in SQL unparser [#15820](https://github.com/apache/datafusion/pull/15820) (ewgenius)
+- Make `Diagnostic` easy/convinient to attach by using macro and avoiding `map_err` [#15796](https://github.com/apache/datafusion/pull/15796) (logan-keede)
+- Feature/benchmark config from env [#15782](https://github.com/apache/datafusion/pull/15782) (ctsk)
+- predicate pruning: support cast and try_cast for more types [#15764](https://github.com/apache/datafusion/pull/15764) (adriangb)
+- Fix: fetch is missing in `plan_with_order_breaking_variants` method [#15842](https://github.com/apache/datafusion/pull/15842) (xudong963)
+- Fix `CoalescePartitionsExec` proto serialization [#15824](https://github.com/apache/datafusion/pull/15824) (lewiszlw)
+- Fix build failure caused by new `CoalescePartitionsExec::with_fetch` method [#15849](https://github.com/apache/datafusion/pull/15849) (lewiszlw)
+- Fix ScalarValue::List comparison when the compared lists have different lengths [#15856](https://github.com/apache/datafusion/pull/15856) (gabotechs)
+- chore: More details to `No UDF registered` error [#15843](https://github.com/apache/datafusion/pull/15843) (comphead)
+- chore(deps): bump clap from 4.5.36 to 4.5.37 [#15853](https://github.com/apache/datafusion/pull/15853) (dependabot[bot])
+- Remove usage of `dbg!` [#15858](https://github.com/apache/datafusion/pull/15858) (phillipleblanc)
+- Minor: Interval singleton [#15859](https://github.com/apache/datafusion/pull/15859) (jayzhan211)
+- Make aggr fuzzer query builder more configurable [#15851](https://github.com/apache/datafusion/pull/15851) (Rachelint)
+- chore(deps): bump aws-config from 1.6.1 to 1.6.2 [#15874](https://github.com/apache/datafusion/pull/15874) (dependabot[bot])
+- Add slt tests for `datafusion.execution.parquet.coerce_int96` setting [#15723](https://github.com/apache/datafusion/pull/15723) (alamb)
+- Improve `ListingTable` / `ListingTableOptions` docs [#15767](https://github.com/apache/datafusion/pull/15767) (alamb)
+- Migrate Optimizer tests to insta, part2 [#15884](https://github.com/apache/datafusion/pull/15884) (qstommyshu)
+- Improve documentation for `FileSource`, `DataSource` and `DataSourceExec` [#15766](https://github.com/apache/datafusion/pull/15766) (alamb)
+- Implement min max for dictionary types [#15827](https://github.com/apache/datafusion/pull/15827) (XiangpengHao)
+- chore(deps): bump blake3 from 1.8.1 to 1.8.2 [#15890](https://github.com/apache/datafusion/pull/15890) (dependabot[bot])
+- Respect ignore_nulls in array_agg [#15544](https://github.com/apache/datafusion/pull/15544) (joroKr21)
+- Set HashJoin seed [#15783](https://github.com/apache/datafusion/pull/15783) (ctsk)
+- Saner handling of nulls inside arrays [#15149](https://github.com/apache/datafusion/pull/15149) (joroKr21)
+- Keeping pull request in sync with the base branch [#15894](https://github.com/apache/datafusion/pull/15894) (xudong963)
+- Fix `flatten` scalar function when inner list is `FixedSizeList` [#15898](https://github.com/apache/datafusion/pull/15898) (gstvg)
+- support OR operator in binary `evaluate_bounds` [#15716](https://github.com/apache/datafusion/pull/15716) (davidhewitt)
+- infer placeholder datatype for IN lists [#15864](https://github.com/apache/datafusion/pull/15864) (kczimm)
+- Fix allow_update_branch [#15904](https://github.com/apache/datafusion/pull/15904) (xudong963)
+- chore(deps): bump tokio from 1.44.1 to 1.44.2 [#15900](https://github.com/apache/datafusion/pull/15900) (dependabot[bot])
+- chore(deps): bump assert_cmd from 2.0.16 to 2.0.17 [#15909](https://github.com/apache/datafusion/pull/15909) (dependabot[bot])
+- Factor out Substrait consumers into separate files [#15794](https://github.com/apache/datafusion/pull/15794) (gabotechs)
+- Unparse `UNNEST` projection with the table column alias [#15879](https://github.com/apache/datafusion/pull/15879) (goldmedal)
+- Migrate Optimizer tests to insta, part3 [#15893](https://github.com/apache/datafusion/pull/15893) (qstommyshu)
+- Minor: cleanup datafusion-spark scalar functions [#15921](https://github.com/apache/datafusion/pull/15921) (alamb)
+- Fix ClickBench extended queries after update to APPROX_PERCENTILE_CONT [#15929](https://github.com/apache/datafusion/pull/15929) (alamb)
+- Add extended query for checking improvement for blocked groups optimization [#15936](https://github.com/apache/datafusion/pull/15936) (Rachelint)
+- Speedup `character_length` [#15931](https://github.com/apache/datafusion/pull/15931) (Dandandan)
+- chore(deps): bump tokio-util from 0.7.14 to 0.7.15 [#15918](https://github.com/apache/datafusion/pull/15918) (dependabot[bot])
+- Migrate Optimizer tests to insta, part4 [#15937](https://github.com/apache/datafusion/pull/15937) (qstommyshu)
+- fix query results for predicates referencing partition columns and data columns [#15935](https://github.com/apache/datafusion/pull/15935) (adriangb)
+- chore(deps): bump substrait from 0.55.0 to 0.55.1 [#15941](https://github.com/apache/datafusion/pull/15941) (dependabot[bot])
+- Fix main CI by adding `rowsort` to slt test [#15942](https://github.com/apache/datafusion/pull/15942) (xudong963)
+- Improve sqllogictest error reporting [#15905](https://github.com/apache/datafusion/pull/15905) (gabotechs)
+- refactor filter pushdown apis [#15801](https://github.com/apache/datafusion/pull/15801) (adriangb)
+- Add additional tests for filter pushdown apis [#15955](https://github.com/apache/datafusion/pull/15955) (adriangb)
+- Improve filter pushdown optimizer rule performance [#15959](https://github.com/apache/datafusion/pull/15959) (adriangb)
+- Reduce rehashing cost for primitive grouping by also reusing hash value [#15962](https://github.com/apache/datafusion/pull/15962) (Rachelint)
+- chore(deps): bump chrono from 0.4.40 to 0.4.41 [#15956](https://github.com/apache/datafusion/pull/15956) (dependabot[bot])
+- refactor: replace `unwrap_or` with `unwrap_or_else` for improved lazy… [#15841](https://github.com/apache/datafusion/pull/15841) (NevroHelios)
+- add benchmark code for `Reuse rows in row cursor stream` [#15913](https://github.com/apache/datafusion/pull/15913) (acking-you)
+- [Update] : Removal of duplicate CI jobs [#15966](https://github.com/apache/datafusion/pull/15966) (Adez017)
+- Segfault in ByteGroupValueBuilder [#15968](https://github.com/apache/datafusion/pull/15968) (thinkharderdev)
+- make can_expr_be_pushed_down_with_schemas public again [#15971](https://github.com/apache/datafusion/pull/15971) (adriangb)
+- re-export can_expr_be_pushed_down_with_schemas to be public [#15974](https://github.com/apache/datafusion/pull/15974) (adriangb)
+- Migrate Optimizer tests to insta, part5 [#15945](https://github.com/apache/datafusion/pull/15945) (qstommyshu)
+- Show LogicalType name for `INFORMATION_SCHEMA` [#15965](https://github.com/apache/datafusion/pull/15965) (goldmedal)
+- chore(deps): bump sha2 from 0.10.8 to 0.10.9 [#15970](https://github.com/apache/datafusion/pull/15970) (dependabot[bot])
+- chore(deps): bump insta from 1.42.2 to 1.43.1 [#15988](https://github.com/apache/datafusion/pull/15988) (dependabot[bot])
+- [datafusion-spark] Add Spark-compatible hex function [#15947](https://github.com/apache/datafusion/pull/15947) (andygrove)
+- refactor: remove deprecated `AvroExec` [#15987](https://github.com/apache/datafusion/pull/15987) (miroim)
+- Substrait: Handle inner map fields in schema renaming [#15869](https://github.com/apache/datafusion/pull/15869) (cht42)
+- refactor: remove deprecated `CsvExec` [#15991](https://github.com/apache/datafusion/pull/15991) (miroim)
+- Migrate Optimizer tests to insta, part6 [#15984](https://github.com/apache/datafusion/pull/15984) (qstommyshu)
+- chore(deps): bump nix from 0.29.0 to 0.30.1 [#16002](https://github.com/apache/datafusion/pull/16002) (dependabot[bot])
+- Implement RightSemi join for SortMergeJoin [#15972](https://github.com/apache/datafusion/pull/15972) (irenjj)
+- Migrate Optimizer tests to insta, part7 [#16010](https://github.com/apache/datafusion/pull/16010) (qstommyshu)
+- chore(deps): bump sysinfo from 0.34.2 to 0.35.1 [#16027](https://github.com/apache/datafusion/pull/16027) (dependabot[bot])
+- refactor: move `should_enable_page_index` from `mod.rs` to `opener.rs` [#16026](https://github.com/apache/datafusion/pull/16026) (miroim)
+- chore(deps): bump sqllogictest from 0.28.1 to 0.28.2 [#16037](https://github.com/apache/datafusion/pull/16037) (dependabot[bot])
+- chores: Add lint rule to enforce string formatting style [#16024](https://github.com/apache/datafusion/pull/16024) (Lordworms)
+- Use human-readable byte sizes in `EXPLAIN` [#16043](https://github.com/apache/datafusion/pull/16043) (tlm365)
+- Docs: Add example of creating a field in `return_field_from_args` [#16039](https://github.com/apache/datafusion/pull/16039) (alamb)
+- Support `MIN` and `MAX` for `DataType::List` [#16025](https://github.com/apache/datafusion/pull/16025) (gabotechs)
+- Improve docs for Exprs and scalar functions [#16036](https://github.com/apache/datafusion/pull/16036) (alamb)
+- Add h2o window benchmark [#16003](https://github.com/apache/datafusion/pull/16003) (2010YOUY01)
+- Fix Infer prepare statement type tests [#15743](https://github.com/apache/datafusion/pull/15743) (brayanjuls)
+- style: simplify some strings for readability [#15999](https://github.com/apache/datafusion/pull/15999) (hamirmahal)
+- support simple/cross lateral joins [#16015](https://github.com/apache/datafusion/pull/16015) (jayzhan211)
+- Improve error message on Out of Memory [#16050](https://github.com/apache/datafusion/pull/16050) (ding-young)
+- chore(deps): bump the arrow-parquet group with 7 updates [#16047](https://github.com/apache/datafusion/pull/16047) (dependabot[bot])
+- chore(deps): bump petgraph from 0.7.1 to 0.8.1 [#15669](https://github.com/apache/datafusion/pull/15669) (dependabot[bot])
+- [datafusion-spark] Add Spark-compatible `char` expression [#15994](https://github.com/apache/datafusion/pull/15994) (andygrove)
+- chore(deps): bump substrait from 0.55.1 to 0.56.0 [#16091](https://github.com/apache/datafusion/pull/16091) (dependabot[bot])
+- Add test that demonstrate behavior for `collect_statistics` [#16098](https://github.com/apache/datafusion/pull/16098) (alamb)
+- Refactor substrait producer into multiple files [#16089](https://github.com/apache/datafusion/pull/16089) (gabotechs)
+- Fix temp dir leak in tests [#16094](https://github.com/apache/datafusion/pull/16094) (findepi)
+- Label Spark functions PRs with spark label [#16095](https://github.com/apache/datafusion/pull/16095) (findepi)
+- Added SLT tests for IMDB benchmark queries [#16067](https://github.com/apache/datafusion/pull/16067) (kumarlokesh)
+- chore(CI) Upgrade toolchain to Rust-1.87 [#16068](https://github.com/apache/datafusion/pull/16068) (kadai0308)
+- minor: Add benchmark query and corresponding documentation for Average Duration [#16105](https://github.com/apache/datafusion/pull/16105) (logan-keede)
+- Use qualified names on DELETE selections [#16033](https://github.com/apache/datafusion/pull/16033) (nuno-faria)
+- chore(deps): bump testcontainers from 0.23.3 to 0.24.0 [#15989](https://github.com/apache/datafusion/pull/15989) (dependabot[bot])
+- Clean up ExternalSorter and use upstream kernel [#16109](https://github.com/apache/datafusion/pull/16109) (alamb)
+- Test Duration in aggregation `fuzz` tests [#16111](https://github.com/apache/datafusion/pull/16111) (alamb)
+- Move PruningStatistics into datafusion::common [#16069](https://github.com/apache/datafusion/pull/16069) (adriangb)
+- Revert use file schema in parquet pruning [#16086](https://github.com/apache/datafusion/pull/16086) (adriangb)
+- Minor: Add `ScalarFunctionArgs::return_type` method [#16113](https://github.com/apache/datafusion/pull/16113) (alamb)
+- Fix `contains` function expression [#16046](https://github.com/apache/datafusion/pull/16046) (liamzwbao)
+- chore: Use materialized data for filter pushdown tests [#16123](https://github.com/apache/datafusion/pull/16123) (comphead)
+- chore: Upgrade rand crate and some other minor crates [#16062](https://github.com/apache/datafusion/pull/16062) (comphead)
+- Include data types in logical plans of inferred prepare statements [#16019](https://github.com/apache/datafusion/pull/16019) (brayanjuls)
+- CI: Fix extended test failure [#16144](https://github.com/apache/datafusion/pull/16144) (2010YOUY01)
+- Fix: handle column name collisions when combining UNION logical inputs & nested Column expressions in maybe_fix_physical_column_name [#16064](https://github.com/apache/datafusion/pull/16064) (LiaCastaneda)
+- adding support for Min/Max over LargeList and FixedSizeList [#16071](https://github.com/apache/datafusion/pull/16071) (logan-keede)
+- Move prepare/parameter handling tests into `params.rs` [#16141](https://github.com/apache/datafusion/pull/16141) (liamzwbao)
+- Minor: Add `Accumulator::return_type` and `StateFieldsArgs::return_type` to help with upgrade to 48 [#16112](https://github.com/apache/datafusion/pull/16112) (alamb)
+- Support filtering specific sqllogictests identified by line number [#16029](https://github.com/apache/datafusion/pull/16029) (gabotechs)
+- Enrich GroupedHashAggregateStream name to ease debugging Resources exhausted errors [#16152](https://github.com/apache/datafusion/pull/16152) (ahmed-mez)
+- chore(deps): bump uuid from 1.16.0 to 1.17.0 [#16162](https://github.com/apache/datafusion/pull/16162) (dependabot[bot])
+- Clarify docs and names in parquet predicate pushdown tests [#16155](https://github.com/apache/datafusion/pull/16155) (alamb)
+- Minor: Fix name() for FilterPushdown physical optimizer rule [#16175](https://github.com/apache/datafusion/pull/16175) (adriangb)
+- migrate tests in `pool.rs` to use insta [#16145](https://github.com/apache/datafusion/pull/16145) (lifan-ake)
+- refactor(optimizer): Add support for dynamically adding test tables [#16138](https://github.com/apache/datafusion/pull/16138) (atahanyorganci)
+- [Minor] Speedup TPC-H benchmark run with memtable option [#16159](https://github.com/apache/datafusion/pull/16159) (Dandandan)
+- Fast path for joins with distinct values in build side [#16153](https://github.com/apache/datafusion/pull/16153) (Dandandan)
+- chore: Reduce repetition in the parameter type inference tests [#16079](https://github.com/apache/datafusion/pull/16079) (jsai28)
+- chore(deps): bump tokio from 1.45.0 to 1.45.1 [#16190](https://github.com/apache/datafusion/pull/16190) (dependabot[bot])
+- Improve `unproject_sort_expr` to handle arbitrary expressions [#16127](https://github.com/apache/datafusion/pull/16127) (phillipleblanc)
+- chore(deps): bump rustyline from 15.0.0 to 16.0.0 [#16194](https://github.com/apache/datafusion/pull/16194) (dependabot[bot])
+- migrate `logical_plan` tests to insta [#16184](https://github.com/apache/datafusion/pull/16184) (lifan-ake)
+- chore(deps): bump clap from 4.5.38 to 4.5.39 [#16204](https://github.com/apache/datafusion/pull/16204) (dependabot[bot])
+- implement `AggregateExec.partition_statistics` [#15954](https://github.com/apache/datafusion/pull/15954) (UBarney)
+- Propagate .execute() calls immediately in `RepartitionExec` [#16093](https://github.com/apache/datafusion/pull/16093) (gabotechs)
+- Set aggregation hash seed [#16165](https://github.com/apache/datafusion/pull/16165) (ctsk)
+- Fix ScalarStructBuilder::build() for an empty struct [#16205](https://github.com/apache/datafusion/pull/16205) (Blizzara)
+- Return an error on overflow in `do_append_val_inner` [#16201](https://github.com/apache/datafusion/pull/16201) (liamzwbao)
+- chore(deps): bump testcontainers-modules from 0.12.0 to 0.12.1 [#16212](https://github.com/apache/datafusion/pull/16212) (dependabot[bot])
+- Substrait: handle identical grouping expressions [#16189](https://github.com/apache/datafusion/pull/16189) (cht42)
+- Add new stats pruning helpers to allow combining partition values in file level stats [#16139](https://github.com/apache/datafusion/pull/16139) (adriangb)
+- Implement schema adapter support for FileSource and add integration tests [#16148](https://github.com/apache/datafusion/pull/16148) (kosiew)
+- Minor: update documentation for PrunableStatistics [#16213](https://github.com/apache/datafusion/pull/16213) (alamb)
+- Remove use of deprecated dict_ordered in datafusion-proto (#16218) [#16220](https://github.com/apache/datafusion/pull/16220) (cj-zhukov)
+- Minor: Print cargo command in bench script [#16236](https://github.com/apache/datafusion/pull/16236) (2010YOUY01)
+- Simplify FileSource / SchemaAdapterFactory API [#16214](https://github.com/apache/datafusion/pull/16214) (alamb)
+- Add dicts to aggregation fuzz testing [#16232](https://github.com/apache/datafusion/pull/16232) (blaginin)
+- chore(deps): bump sysinfo from 0.35.1 to 0.35.2 [#16247](https://github.com/apache/datafusion/pull/16247) (dependabot[bot])
+- Improve performance of constant aggregate window expression [#16234](https://github.com/apache/datafusion/pull/16234) (suibianwanwank)
+- Support compound identifier when parsing tuples [#16225](https://github.com/apache/datafusion/pull/16225) (hozan23)
+- Schema adapter helper [#16108](https://github.com/apache/datafusion/pull/16108) (kosiew)
+- Update tpch, clickbench, sort_tpch to mark failed queries [#16182](https://github.com/apache/datafusion/pull/16182) (ding-young)
+- Adjust slttest to pass without RUST_BACKTRACE enabled [#16251](https://github.com/apache/datafusion/pull/16251) (alamb)
+- Handle dicts for distinct count [#15871](https://github.com/apache/datafusion/pull/15871) (blaginin)
+- Add `--substrait-round-trip` option in sqllogictests [#16183](https://github.com/apache/datafusion/pull/16183) (gabotechs)
+- Minor: fix upgrade papercut `pub use PruningStatistics` [#16264](https://github.com/apache/datafusion/pull/16264) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    30	dependabot[bot]
+    29	Andrew Lamb
+    16	xudong.w
+    14	Adrian Garcia Badaracco
+    10	Chen Chongchen
+     8	Gabriel
+     8	Oleks V
+     7	miro
+     6	Tommy shu
+     6	kamille
+     5	Lokesh
+     5	Tim Saucer
+     4	Dmitrii Blaginin
+     4	Jay Zhan
+     4	Nuno Faria
+     4	Yongting You
+     4	logan-keede
+     3	Christian
+     3	Daniël Heres
+     3	Liam Bao
+     3	Phillip LeBlanc
+     3	Piotr Findeisen
+     3	ding-young
+     2	Andy Grove
+     2	Atahan Yorgancı
+     2	Brayan Jules
+     2	Georgi Krastev
+     2	Jax Liu
+     2	Jérémie Drouet
+     2	LB7666
+     2	Leonardo Yvens
+     2	Qi Zhu
+     2	Sergey Zhukov
+     2	Shruti Sharma
+     2	Tai Le Manh
+     2	aditya singh rathore
+     2	ake
+     2	cht42
+     2	gstvg
+     2	kosiew
+     2	niebayes
+     2	张林伟
+     1	Ahmed Mezghani
+     1	Alexander Droste
+     1	Andy Yen
+     1	Arka Dash
+     1	Arttu
+     1	Dan Harris
+     1	David Hewitt
+     1	Davy
+     1	Ed Seidl
+     1	Eshed Schacham
+     1	Evgenii Khramkov
+     1	Florent Monjalet
+     1	Galim Bikbov
+     1	Garam Choi
+     1	Hamir Mahal
+     1	Hendrik Makait
+     1	Jonathan Chen
+     1	Joseph Fahnestock
+     1	Kevin Zimmerman
+     1	Lordworms
+     1	Lía Adriana
+     1	Matt Butrovich
+     1	Namgung Chan
+     1	Nelson Antunes
+     1	Patrick Sullivan
+     1	Raz Luvaton
+     1	Ruihang Xia
+     1	Ryan Roelke
+     1	Sam Hughes
+     1	Shehab Amin
+     1	Sile Zhou
+     1	Simon Vandel Sillesen
+     1	Tom Montgomery
+     1	UBarney
+     1	Victorien
+     1	Xiangpeng Hao
+     1	Zaki
+     1	chen quan
+     1	delamarch3
+     1	discord9
+     1	hozan23
+     1	irenjj
+     1	jsai28
+     1	m09526
+     1	suibianwanwan
+     1	the0ninjas
+     1	wiedld
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/49.0.0.md b/dev/changelog/49.0.0.md
new file mode 100644
index 0000000000000..239c7c9dfc973
--- /dev/null
+++ b/dev/changelog/49.0.0.md
@@ -0,0 +1,387 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 49.0.0 Changelog
+
+This release consists of 253 commits from 71 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Breaking changes:**
+
+- feat: add metadata to literal expressions [#16170](https://github.com/apache/datafusion/pull/16170) (timsaucer)
+- [MAJOR] Equivalence System Overhaul [#16217](https://github.com/apache/datafusion/pull/16217) (ozankabak)
+- remove unused methods in SortExec [#16457](https://github.com/apache/datafusion/pull/16457) (adriangb)
+- Move Pruning Logic to a Dedicated datafusion-pruning Crate for Improved Modularity [#16549](https://github.com/apache/datafusion/pull/16549) (kosiew)
+- Fix type of ExecutionOptions::time_zone [#16569](https://github.com/apache/datafusion/pull/16569) (findepi)
+- Convert Option<Vec<sort expression>> to Vec<sort expression> [#16615](https://github.com/apache/datafusion/pull/16615) (ViggoC)
+- Refactor error handling to use boxed errors for DataFusionError variants [#16672](https://github.com/apache/datafusion/pull/16672) (kosiew)
+- Reuse Rows allocation in RowCursorStream [#16647](https://github.com/apache/datafusion/pull/16647) (Dandandan)
+- refactor: shrink `SchemaError` [#16653](https://github.com/apache/datafusion/pull/16653) (crepererum)
+- Remove unused AggregateUDF struct [#16683](https://github.com/apache/datafusion/pull/16683) (ViggoC)
+- Bump the MSRV to `1.85.1` due to transitive dependencies (`aws-sdk`) [#16728](https://github.com/apache/datafusion/pull/16728) (rtyler)
+
+**Performance related:**
+
+- Add late pruning of Parquet files based on file level statistics [#16014](https://github.com/apache/datafusion/pull/16014) (adriangb)
+- Add fast paths for try_process_unnest [#16389](https://github.com/apache/datafusion/pull/16389) (simonvandel)
+- Set the default value of `datafusion.execution.collect_statistics` to `true` [#16447](https://github.com/apache/datafusion/pull/16447) (AdamGS)
+- Perf: Optimize CursorValues compare performance for StringViewArray (1.4X faster for sort-tpch Q11) [#16509](https://github.com/apache/datafusion/pull/16509) (zhuqi-lucas)
+- Simplify predicates in `PushDownFilter` optimizer rule [#16362](https://github.com/apache/datafusion/pull/16362) (xudong963)
+- optimize `ScalarValue::to_array_of_size` for structural types [#16706](https://github.com/apache/datafusion/pull/16706) (ding-young)
+- Refactor filter pushdown APIs to enable joins to pass through filters [#16732](https://github.com/apache/datafusion/pull/16732) (adriangb)
+- perf: Optimize hash joins with an empty build side [#16716](https://github.com/apache/datafusion/pull/16716) (nuno-faria)
+- Per file filter evaluation [#15057](https://github.com/apache/datafusion/pull/15057) (adriangb)
+
+**Implemented enhancements:**
+
+- feat: Support defining custom MetricValues in PhysicalPlans [#16195](https://github.com/apache/datafusion/pull/16195) (sfluor)
+- feat: Allow cancelling of grouping operations which are CPU bound [#16196](https://github.com/apache/datafusion/pull/16196) (zhuqi-lucas)
+- feat: support FixedSizeList for array_has [#16333](https://github.com/apache/datafusion/pull/16333) (chenkovsky)
+- feat: Support tpch and tpch10 benchmark for csv format [#16373](https://github.com/apache/datafusion/pull/16373) (zhuqi-lucas)
+- feat: Support RightMark join for NestedLoop and Hash join [#16083](https://github.com/apache/datafusion/pull/16083) (jonathanc-n)
+- feat: mapping sql Char/Text/String default to Utf8View [#16290](https://github.com/apache/datafusion/pull/16290) (zhuqi-lucas)
+- feat: support fixed size list for array reverse [#16423](https://github.com/apache/datafusion/pull/16423) (chenkovsky)
+- feat: add SchemaProvider::table_type(table_name: &str) [#16401](https://github.com/apache/datafusion/pull/16401) (epgif)
+- feat: derive `Debug` and `Clone` for `ScalarFunctionArgs` [#16471](https://github.com/apache/datafusion/pull/16471) (crepererum)
+- feat: support `map_entries` builtin function [#16557](https://github.com/apache/datafusion/pull/16557) (comphead)
+- feat: add `array_min` scalar function and associated tests [#16574](https://github.com/apache/datafusion/pull/16574) (dharanad)
+- feat: Finalize support for `RightMark` join + `Mark` join swap [#16488](https://github.com/apache/datafusion/pull/16488) (jonathanc-n)
+- feat: Parquet modular encryption [#16351](https://github.com/apache/datafusion/pull/16351) (corwinjoy)
+- feat: Support `u32` indices for `HashJoinExec` [#16434](https://github.com/apache/datafusion/pull/16434) (jonathanc-n)
+- feat: expose intersect distinct/except distinct in dataframe api [#16578](https://github.com/apache/datafusion/pull/16578) (chenkovsky)
+- feat: Add a configuration to make parquet encryption optional [#16649](https://github.com/apache/datafusion/pull/16649) (corwinjoy)
+
+**Fixed bugs:**
+
+- fix: preserve null_equals_null flag in eliminate_cross_join rule [#16356](https://github.com/apache/datafusion/pull/16356) (waynexia)
+- fix: Fix SparkSha2 to be compliant with Spark response and add support for Int32 [#16350](https://github.com/apache/datafusion/pull/16350) (rishvin)
+- fix: Fixed error handling for `generate_series/range` [#16391](https://github.com/apache/datafusion/pull/16391) (jonathanc-n)
+- fix: Enable WASM compilation by making sqlparser's recursive-protection optional [#16418](https://github.com/apache/datafusion/pull/16418) (jonmmease)
+- fix: create file for empty stream [#16342](https://github.com/apache/datafusion/pull/16342) (chenkovsky)
+- fix: document and fix macro hygiene for `config_field!` [#16473](https://github.com/apache/datafusion/pull/16473) (crepererum)
+- fix: make `with_new_state` a trait method for `ExecutionPlan` [#16469](https://github.com/apache/datafusion/pull/16469) (geoffreyclaude)
+- fix: column indices in FFI partition evaluator [#16480](https://github.com/apache/datafusion/pull/16480) (timsaucer)
+- fix: support within_group [#16538](https://github.com/apache/datafusion/pull/16538) (chenkovsky)
+- fix: disallow specify both order_by and within_group [#16606](https://github.com/apache/datafusion/pull/16606) (watchingthewheelsgo)
+- fix: format within_group error message [#16613](https://github.com/apache/datafusion/pull/16613) (watchingthewheelsgo)
+- fix: reserved keywords in qualified column names [#16584](https://github.com/apache/datafusion/pull/16584) (crepererum)
+- fix: support scalar function nested in get_field in Unparser [#16610](https://github.com/apache/datafusion/pull/16610) (chenkovsky)
+- fix: sqllogictest runner label condition mismatch [#16633](https://github.com/apache/datafusion/pull/16633) (lliangyu-lin)
+- fix: port arrow inline fast key fix to datafusion [#16698](https://github.com/apache/datafusion/pull/16698) (zhuqi-lucas)
+- fix: try to lower plain reserved functions to columns as well [#16669](https://github.com/apache/datafusion/pull/16669) (crepererum)
+- fix: Fix CI failing due to #16686 [#16718](https://github.com/apache/datafusion/pull/16718) (jonathanc-n)
+- fix: return NULL if any of the param to make_date is NULL [#16759](https://github.com/apache/datafusion/pull/16759) (feniljain)
+- fix: add `order_requirement` & `dist_requirement` to `OutputRequirementExec` display [#16726](https://github.com/apache/datafusion/pull/16726) (Loaki07)
+- fix: support nullable columns in pre-sorted data sources [#16783](https://github.com/apache/datafusion/pull/16783) (crepererum)
+- fix: The inconsistency between scalar and array on the cast decimal to timestamp [#16539](https://github.com/apache/datafusion/pull/16539) (chenkovsky)
+- fix: unit test for object_storage [#16824](https://github.com/apache/datafusion/pull/16824) (chenkovsky)
+- fix(docs): Update broken links to `TableProvider` docs [#16830](https://github.com/apache/datafusion/pull/16830) (jcsherin)
+
+**Documentation updates:**
+
+- Minor: Add upgrade guide for `Expr::WindowFunction` [#16313](https://github.com/apache/datafusion/pull/16313) (alamb)
+- Fix `array_position` on empty list [#16292](https://github.com/apache/datafusion/pull/16292) (Blizzara)
+- Fix: mark "Spilling (to disk) Joins" as supported in features [#16343](https://github.com/apache/datafusion/pull/16343) (kosiew)
+- Fix cp_solver doc formatting [#16352](https://github.com/apache/datafusion/pull/16352) (xudong963)
+- docs: Expand `MemoryPool` docs with related structs [#16289](https://github.com/apache/datafusion/pull/16289) (2010YOUY01)
+- Support datafusion-cli access to public S3 buckets that do not require authentication [#16300](https://github.com/apache/datafusion/pull/16300) (alamb)
+- Document Table Constraint Enforcement Behavior in Custom Table Providers Guide [#16340](https://github.com/apache/datafusion/pull/16340) (kosiew)
+- doc: Add SQL examples for SEMI + ANTI Joins [#16316](https://github.com/apache/datafusion/pull/16316) (jonathanc-n)
+- [datafusion-spark] Example of using Spark compatible function library [#16384](https://github.com/apache/datafusion/pull/16384) (alamb)
+- Add note in upgrade guide about changes to `Expr::Scalar` in 48.0.0 [#16360](https://github.com/apache/datafusion/pull/16360) (alamb)
+- Update PMC management instructions to follow new ASF process [#16417](https://github.com/apache/datafusion/pull/16417) (alamb)
+- Add design process section to the docs [#16397](https://github.com/apache/datafusion/pull/16397) (alamb)
+- Unify Metadata Handing: use `FieldMetadata` in `Expr::Alias` and `ExprSchemable` [#16320](https://github.com/apache/datafusion/pull/16320) (alamb)
+- TopK dynamic filter pushdown attempt 2 [#15770](https://github.com/apache/datafusion/pull/15770) (adriangb)
+- Update Roadmap documentation [#16399](https://github.com/apache/datafusion/pull/16399) (alamb)
+- doc: Add comments to clarify algorithm for `MarkJoin`s [#16436](https://github.com/apache/datafusion/pull/16436) (jonathanc-n)
+- Add compression option to SpillManager [#16268](https://github.com/apache/datafusion/pull/16268) (ding-young)
+- Redirect user defined function webpage [#16475](https://github.com/apache/datafusion/pull/16475) (alamb)
+- Use Tokio's task budget consistently, better APIs to support task cancellation [#16398](https://github.com/apache/datafusion/pull/16398) (pepijnve)
+- doc: upgrade guide for new compression option for spill files [#16472](https://github.com/apache/datafusion/pull/16472) (2010YOUY01)
+- Introduce Async User Defined Functions [#14837](https://github.com/apache/datafusion/pull/14837) (goldmedal)
+- Minor: Add more links to cooperative / scheduling docs [#16484](https://github.com/apache/datafusion/pull/16484) (alamb)
+- doc: Document DESCRIBE comman in ddl.md [#16524](https://github.com/apache/datafusion/pull/16524) (krikera)
+- Add more doc for physical filter pushdown [#16504](https://github.com/apache/datafusion/pull/16504) (xudong963)
+- chore: fix CI failures on `ddl.md` [#16526](https://github.com/apache/datafusion/pull/16526) (comphead)
+- Add some comments about adding new dependencies in datafusion-sql [#16543](https://github.com/apache/datafusion/pull/16543) (alamb)
+- Add note for planning release in Upgrade Guides [#16534](https://github.com/apache/datafusion/pull/16534) (xudong963)
+- Consolidate configuration sections in docs [#16544](https://github.com/apache/datafusion/pull/16544) (alamb)
+- Minor: add clearer link to the main website from intro paragraph. [#16556](https://github.com/apache/datafusion/pull/16556) (alamb)
+- Simplify AsyncScalarUdfImpl so it extends ScalarUdfImpl [#16523](https://github.com/apache/datafusion/pull/16523) (alamb)
+- docs: Minor grammatical fixes for the scalar UDF docs [#16618](https://github.com/apache/datafusion/pull/16618) (ianthetechie)
+- Implementation for regex_instr [#15928](https://github.com/apache/datafusion/pull/15928) (nirnayroy)
+- Update Upgrade Guide for 48.0.1 [#16699](https://github.com/apache/datafusion/pull/16699) (alamb)
+- ensure MemTable has at least one partition [#16754](https://github.com/apache/datafusion/pull/16754) (waynexia)
+- Restore custom SchemaAdapter functionality for Parquet [#16791](https://github.com/apache/datafusion/pull/16791) (adriangb)
+- Update `upgrading.md` for new unified config for sql string mapping to utf8view [#16809](https://github.com/apache/datafusion/pull/16809) (zhuqi-lucas)
+- docs: Remove reference to forthcoming example (#16817) [#16818](https://github.com/apache/datafusion/pull/16818) (m09526)
+- docs: Fix broken links [#16839](https://github.com/apache/datafusion/pull/16839) (2010YOUY01)
+- Add note to upgrade guide about MSRV update [#16845](https://github.com/apache/datafusion/pull/16845) (alamb)
+
+**Other:**
+
+- chore(deps): bump sqllogictest from 0.28.2 to 0.28.3 [#16286](https://github.com/apache/datafusion/pull/16286) (dependabot[bot])
+- chore(deps-dev): bump webpack-dev-server from 4.15.1 to 5.2.1 in /datafusion/wasmtest/datafusion-wasm-app [#16253](https://github.com/apache/datafusion/pull/16253) (dependabot[bot])
+- Improve DataFusion subcrate readme files [#16263](https://github.com/apache/datafusion/pull/16263) (alamb)
+- Fix intermittent SQL logic test failure in limit.slt by adding ORDER BY clause [#16257](https://github.com/apache/datafusion/pull/16257) (kosiew)
+- Extend benchmark comparison script with more detailed statistics [#16262](https://github.com/apache/datafusion/pull/16262) (pepijnve)
+- chore(deps): bump flate2 from 1.1.1 to 1.1.2 [#16338](https://github.com/apache/datafusion/pull/16338) (dependabot[bot])
+- chore(deps): bump petgraph from 0.8.1 to 0.8.2 [#16337](https://github.com/apache/datafusion/pull/16337) (dependabot[bot])
+- chore(deps): bump substrait from 0.56.0 to 0.57.0 [#16143](https://github.com/apache/datafusion/pull/16143) (dependabot[bot])
+- Add test for ordering of predicate pushdown into parquet [#16169](https://github.com/apache/datafusion/pull/16169) (adriangb)
+- Fix distinct count for DictionaryArray to correctly account for nulls in values array [#16258](https://github.com/apache/datafusion/pull/16258) (kosiew)
+- Fix inconsistent schema projection in ListingTable even when schema is specified [#16305](https://github.com/apache/datafusion/pull/16305) (kosiew)
+- tpch: move reading of SQL queries out of timed span. [#16357](https://github.com/apache/datafusion/pull/16357) (pepijnve)
+- chore(deps): bump clap from 4.5.39 to 4.5.40 [#16354](https://github.com/apache/datafusion/pull/16354) (dependabot[bot])
+- chore(deps): bump syn from 2.0.101 to 2.0.102 [#16355](https://github.com/apache/datafusion/pull/16355) (dependabot[bot])
+- Encapsulate metadata for literals on to a `FieldMetadata` structure [#16317](https://github.com/apache/datafusion/pull/16317) (alamb)
+- Add support `UInt64` and other integer data types for `to_hex` [#16335](https://github.com/apache/datafusion/pull/16335) (tlm365)
+- Document `copy_array_data` function with example [#16361](https://github.com/apache/datafusion/pull/16361) (alamb)
+- Fix array_agg memory over use [#16346](https://github.com/apache/datafusion/pull/16346) (gabotechs)
+- Update publish command [#16377](https://github.com/apache/datafusion/pull/16377) (xudong963)
+- Add more context to error message for datafusion-cli config failure [#16379](https://github.com/apache/datafusion/pull/16379) (alamb)
+- Fix: datafusion-sqllogictest 48.0.0 can't be published [#16376](https://github.com/apache/datafusion/pull/16376) (xudong963)
+- bug: remove busy-wait while sort is ongoing [#16322](https://github.com/apache/datafusion/pull/16322) (pepijnve)
+- chore: refactor Substrait consumer's "rename_field" and implement the rest of types [#16345](https://github.com/apache/datafusion/pull/16345) (Blizzara)
+- chore(deps): bump object_store from 0.12.1 to 0.12.2 [#16368](https://github.com/apache/datafusion/pull/16368) (dependabot[bot])
+- Disable `datafusion-cli` tests for hash_collision tests, fix extended CI [#16382](https://github.com/apache/datafusion/pull/16382) (alamb)
+- Fix array_concat with NULL arrays [#16348](https://github.com/apache/datafusion/pull/16348) (alexanderbianchi)
+- Minor: add testing case for add YieldStreamExec and polish docs [#16369](https://github.com/apache/datafusion/pull/16369) (zhuqi-lucas)
+- chore(deps): bump aws-config from 1.6.3 to 1.8.0 [#16394](https://github.com/apache/datafusion/pull/16394) (dependabot[bot])
+- fix typo in test file name [#16403](https://github.com/apache/datafusion/pull/16403) (adriangb)
+- Add topk_tpch benchmark [#16410](https://github.com/apache/datafusion/pull/16410) (Dandandan)
+- Reduce some cloning [#16404](https://github.com/apache/datafusion/pull/16404) (simonvandel)
+- chore(deps): bump syn from 2.0.102 to 2.0.103 [#16393](https://github.com/apache/datafusion/pull/16393) (dependabot[bot])
+- Simplify expressions passed to table functions [#16388](https://github.com/apache/datafusion/pull/16388) (simonvandel)
+- Minor: Clean-up `bench.sh` usage message [#16416](https://github.com/apache/datafusion/pull/16416) (2010YOUY01)
+- chore(deps): bump rust_decimal from 1.37.1 to 1.37.2 [#16422](https://github.com/apache/datafusion/pull/16422) (dependabot[bot])
+- Migrate core test to insta, part1 [#16324](https://github.com/apache/datafusion/pull/16324) (Chen-Yuan-Lai)
+- chore(deps): bump mimalloc from 0.1.46 to 0.1.47 [#16426](https://github.com/apache/datafusion/pull/16426) (dependabot[bot])
+- chore(deps): bump libc from 0.2.172 to 0.2.173 [#16421](https://github.com/apache/datafusion/pull/16421) (dependabot[bot])
+- Use dedicated NullEquality enum instead of null_equals_null boolean [#16419](https://github.com/apache/datafusion/pull/16419) (tobixdev)
+- chore: generate basic spark function tests [#16409](https://github.com/apache/datafusion/pull/16409) (shehabgamin)
+- Fix CI Failure: replace false with NullEqualsNothing [#16437](https://github.com/apache/datafusion/pull/16437) (ding-young)
+- chore(deps): bump bzip2 from 0.5.2 to 0.6.0 [#16441](https://github.com/apache/datafusion/pull/16441) (dependabot[bot])
+- chore(deps): bump libc from 0.2.173 to 0.2.174 [#16440](https://github.com/apache/datafusion/pull/16440) (dependabot[bot])
+- Remove redundant license-header-check CI job [#16451](https://github.com/apache/datafusion/pull/16451) (alamb)
+- Remove unused feature in `physical-plan` and fix compilation error in benchmark [#16449](https://github.com/apache/datafusion/pull/16449) (AdamGS)
+- Temporarily fix bug in dynamic top-k optimization [#16465](https://github.com/apache/datafusion/pull/16465) (AdamGS)
+- Ignore `sort_query_fuzzer_runner` [#16462](https://github.com/apache/datafusion/pull/16462) (blaginin)
+- Revert "Ignore `sort_query_fuzzer_runner` (#16462)" [#16470](https://github.com/apache/datafusion/pull/16470) (2010YOUY01)
+- Reapply "Ignore `sort_query_fuzzer_runner` (#16462)" (#16470) [#16485](https://github.com/apache/datafusion/pull/16485) (alamb)
+- Fix constant window for evaluate stateful [#16430](https://github.com/apache/datafusion/pull/16430) (suibianwanwank)
+- Use UDTF name in logical plan table scan [#16468](https://github.com/apache/datafusion/pull/16468) (Jeadie)
+- refactor reassign_predicate_columns to accept an &Schema instead of &Arc<Schema> [#16499](https://github.com/apache/datafusion/pull/16499) (adriangb)
+- re-enable `sort_query_fuzzer_runner` [#16491](https://github.com/apache/datafusion/pull/16491) (adriangb)
+- Example for using a separate threadpool for CPU bound work (try 3) [#16331](https://github.com/apache/datafusion/pull/16331) (alamb)
+- chore(deps): bump syn from 2.0.103 to 2.0.104 [#16507](https://github.com/apache/datafusion/pull/16507) (dependabot[bot])
+- use 'lit' as the field name for literal values [#16498](https://github.com/apache/datafusion/pull/16498) (adriangb)
+- [datafusion-spark] Implement `factorical` function [#16125](https://github.com/apache/datafusion/pull/16125) (tlm365)
+- Add DESC alias for DESCRIBE command. [#16514](https://github.com/apache/datafusion/pull/16514) (lucqui)
+- Split clickbench query set into one file per query [#16476](https://github.com/apache/datafusion/pull/16476) (pepijnve)
+- Support query filter on all benchmarks [#16477](https://github.com/apache/datafusion/pull/16477) (pepijnve)
+- `TableProvider` to skip files in the folder which non relevant to selected reader [#16487](https://github.com/apache/datafusion/pull/16487) (comphead)
+- Reuse `BaselineMetrics` in `UnnestMetrics` [#16497](https://github.com/apache/datafusion/pull/16497) (hendrikmakait)
+- Fix array_has to return false for empty arrays instead of null [#16529](https://github.com/apache/datafusion/pull/16529) (kosiew)
+- Minor: Add documentation to `AggregateWindowExpr::get_result_column` [#16479](https://github.com/apache/datafusion/pull/16479) (alamb)
+- Fix WindowFrame::new with order_by [#16537](https://github.com/apache/datafusion/pull/16537) (findepi)
+- chore(deps): bump object_store from 0.12.1 to 0.12.2 [#16548](https://github.com/apache/datafusion/pull/16548) (dependabot[bot])
+- chore(deps): bump mimalloc from 0.1.46 to 0.1.47 [#16547](https://github.com/apache/datafusion/pull/16547) (dependabot[bot])
+- Add support for Arrow Duration type in Substrait [#16503](https://github.com/apache/datafusion/pull/16503) (jkosh44)
+- Allow unparser to override the alias name for the specific dialect [#16540](https://github.com/apache/datafusion/pull/16540) (goldmedal)
+- Avoid clones when calling find_window_exprs [#16551](https://github.com/apache/datafusion/pull/16551) (findepi)
+- Update `spilled_bytes` metric to reflect actual disk usage [#16535](https://github.com/apache/datafusion/pull/16535) (ding-young)
+- adapt filter expressions to file schema during parquet scan [#16461](https://github.com/apache/datafusion/pull/16461) (adriangb)
+- datafusion-cli: Use correct S3 region if it is not specified [#16502](https://github.com/apache/datafusion/pull/16502) (liamzwbao)
+- Add nested struct casting support and integrate into SchemaAdapter [#16371](https://github.com/apache/datafusion/pull/16371) (kosiew)
+- Improve err message grammar [#16566](https://github.com/apache/datafusion/pull/16566) (findepi)
+- refactor: move PruningPredicate into its own module [#16587](https://github.com/apache/datafusion/pull/16587) (adriangb)
+- chore(deps): bump indexmap from 2.9.0 to 2.10.0 [#16582](https://github.com/apache/datafusion/pull/16582) (dependabot[bot])
+- Skip re-pruning based on partition values and file level stats if there are no dynamic filters [#16424](https://github.com/apache/datafusion/pull/16424) (adriangb)
+- Support timestamp and date arguments for `range` and `generate_series` table functions [#16552](https://github.com/apache/datafusion/pull/16552) (simonvandel)
+- Fix normalization of columns in JOIN ... USING. [#16560](https://github.com/apache/datafusion/pull/16560) (brunal)
+- Revert Finalize support for `RightMark` join + `Mark` join [#16597](https://github.com/apache/datafusion/pull/16597) (comphead)
+- move min_batch/max_batch to functions-aggregate-common [#16593](https://github.com/apache/datafusion/pull/16593) (adriangb)
+- Allow usage of table functions in relations [#16571](https://github.com/apache/datafusion/pull/16571) (osipovartem)
+- Update to arrow/parquet 55.2.0 [#16575](https://github.com/apache/datafusion/pull/16575) (alamb)
+- Improve field naming in first_value, last_value implementation [#16631](https://github.com/apache/datafusion/pull/16631) (findepi)
+- Fix spurious failure in convert_batches test helper [#16627](https://github.com/apache/datafusion/pull/16627) (findepi)
+- Aggregate UDF cleanup [#16628](https://github.com/apache/datafusion/pull/16628) (findepi)
+- Avoid treating incomparable scalars as equal [#16624](https://github.com/apache/datafusion/pull/16624) (findepi)
+- restore topk pre-filtering of batches and make sort query fuzzer less sensitive to expected non determinism [#16501](https://github.com/apache/datafusion/pull/16501) (alamb)
+- Add support for Arrow Time types in Substrait [#16558](https://github.com/apache/datafusion/pull/16558) (jkosh44)
+- chore(deps): bump substrait from 0.57.0 to 0.58.0 [#16640](https://github.com/apache/datafusion/pull/16640) (dependabot[bot])
+- Support explain tree format debug for benchmark debug [#16604](https://github.com/apache/datafusion/pull/16604) (zhuqi-lucas)
+- Add microbenchmark for spilling with compression [#16512](https://github.com/apache/datafusion/pull/16512) (ding-young)
+- Fix parquet filter_pushdown: respect parquet filter pushdown config in scan [#16646](https://github.com/apache/datafusion/pull/16646) (adriangb)
+- chore(deps): bump aws-config from 1.8.0 to 1.8.1 [#16651](https://github.com/apache/datafusion/pull/16651) (dependabot[bot])
+- Migrate core test to insta, part 2 [#16617](https://github.com/apache/datafusion/pull/16617) (Chen-Yuan-Lai)
+- Update all spark SLT files [#16637](https://github.com/apache/datafusion/pull/16637) (findepi)
+- Add PhysicalExpr optimizer and cast unwrapping [#16530](https://github.com/apache/datafusion/pull/16530) (adriangb)
+- benchmark: Support sort_tpch10 for benchmark [#16671](https://github.com/apache/datafusion/pull/16671) (zhuqi-lucas)
+- chore(deps): bump tokio from 1.45.1 to 1.46.0 [#16666](https://github.com/apache/datafusion/pull/16666) (dependabot[bot])
+- Fix TopK Sort incorrectly pushed down past Join with anti join [#16641](https://github.com/apache/datafusion/pull/16641) (zhuqi-lucas)
+- Improve error message when ScalarValue fails to cast array [#16670](https://github.com/apache/datafusion/pull/16670) (findepi)
+- Add an example of embedding indexes inside a parquet file [#16395](https://github.com/apache/datafusion/pull/16395) (zhuqi-lucas)
+- `datafusion-cli`: Refactor statement execution logic [#16634](https://github.com/apache/datafusion/pull/16634) (liamzwbao)
+- Add SchemaAdapterFactory Support for ListingTable with Schema Evolution and Mapping [#16583](https://github.com/apache/datafusion/pull/16583) (kosiew)
+- Perf: fast CursorValues compare for StringViewArray using inline*key*… [#16630](https://github.com/apache/datafusion/pull/16630) (zhuqi-lucas)
+- Update to Rust 1.88 [#16663](https://github.com/apache/datafusion/pull/16663) (melroy12)
+- Refactor StreamJoinMetrics to reuse BaselineMetrics [#16674](https://github.com/apache/datafusion/pull/16674) (Standing-Man)
+- chore: refactor `BuildProbeJoinMetrics` to use `BaselineMetrics` [#16500](https://github.com/apache/datafusion/pull/16500) (Samyak2)
+- Use compression type in CSV file suffices [#16609](https://github.com/apache/datafusion/pull/16609) (theirix)
+- Clarify the generality of the embedded parquet index [#16692](https://github.com/apache/datafusion/pull/16692) (alamb)
+- Refactor SortMergeJoinMetrics to reuse BaselineMetrics [#16675](https://github.com/apache/datafusion/pull/16675) (Standing-Man)
+- Add support for Arrow Dictionary type in Substrait [#16608](https://github.com/apache/datafusion/pull/16608) (jkosh44)
+- Fix duplicate field name error in Join::try_new_with_project_input during physical planning [#16454](https://github.com/apache/datafusion/pull/16454) (LiaCastaneda)
+- chore(deps): bump tokio from 1.46.0 to 1.46.1 [#16700](https://github.com/apache/datafusion/pull/16700) (dependabot[bot])
+- Add reproducer for tpch Q16 deserialization bug [#16662](https://github.com/apache/datafusion/pull/16662) (NGA-TRAN)
+- Minor: Update release instructions [#16701](https://github.com/apache/datafusion/pull/16701) (alamb)
+- refactor filter pushdown APIs [#16642](https://github.com/apache/datafusion/pull/16642) (adriangb)
+- Add comments to ClickBench queries about setting binary_as_string [#16605](https://github.com/apache/datafusion/pull/16605) (alamb)
+- minor: improve display output for FFI execution plans [#16713](https://github.com/apache/datafusion/pull/16713) (timsaucer)
+- Revert "fix: create file for empty stream" [#16682](https://github.com/apache/datafusion/pull/16682) (brunal)
+- Add the missing equivalence info for filter pushdown [#16686](https://github.com/apache/datafusion/pull/16686) (liamzwbao)
+- Fix sqllogictests test running compatibility (ignore `--test-threads`) [#16694](https://github.com/apache/datafusion/pull/16694) (mjgarton)
+- Fix: Make `CopyTo` logical plan output schema consistent with physical schema [#16705](https://github.com/apache/datafusion/pull/16705) (bert-beyondloops)
+- chore(devcontainer): use debian's `protobuf-compiler` package [#16687](https://github.com/apache/datafusion/pull/16687) (fvj)
+- Add link to upgrade guide in changelog script [#16680](https://github.com/apache/datafusion/pull/16680) (alamb)
+- Improve display format of BoundedWindowAggExec [#16645](https://github.com/apache/datafusion/pull/16645) (geetanshjuneja)
+- Fix: optimize projections for unnest logical plan. [#16632](https://github.com/apache/datafusion/pull/16632) (bert-beyondloops)
+- Use the `test-threads` option in sqllogictests [#16722](https://github.com/apache/datafusion/pull/16722) (mjgarton)
+- chore(deps): bump clap from 4.5.40 to 4.5.41 [#16735](https://github.com/apache/datafusion/pull/16735) (dependabot[bot])
+- chore: make more clarity for internal errors [#16741](https://github.com/apache/datafusion/pull/16741) (comphead)
+- Remove parquet_filter and parquet `sort` benchmarks [#16730](https://github.com/apache/datafusion/pull/16730) (alamb)
+- Perform type coercion for corr aggregate function [#15776](https://github.com/apache/datafusion/pull/15776) (kumarlokesh)
+- Improve dictionary null handling in hashing and expand aggregate test coverage for nulls [#16466](https://github.com/apache/datafusion/pull/16466) (kosiew)
+- Improve Ci cache [#16709](https://github.com/apache/datafusion/pull/16709) (blaginin)
+- Fix in list round trip in df proto [#16744](https://github.com/apache/datafusion/pull/16744) (XiangpengHao)
+- chore: Make `GroupValues` and APIs on `PhysicalGroupBy` aggregation APIs public [#16733](https://github.com/apache/datafusion/pull/16733) (haohuaijin)
+- Extend binary coercion rules to support Decimal arithmetic operations with integer(signed and unsigned) types [#16668](https://github.com/apache/datafusion/pull/16668) (jatin510)
+- Support Type Coercion for NULL in Binary Arithmetic Expressions [#16761](https://github.com/apache/datafusion/pull/16761) (kosiew)
+- chore(deps): bump chrono-tz from 0.10.3 to 0.10.4 [#16769](https://github.com/apache/datafusion/pull/16769) (dependabot[bot])
+- limit intermediate batch size in nested_loop_join [#16443](https://github.com/apache/datafusion/pull/16443) (UBarney)
+- Add serialization/deserialization and round-trip tests for all tpc-h queries [#16742](https://github.com/apache/datafusion/pull/16742) (NGA-TRAN)
+- Auto start testcontainers for `datafusion-cli` [#16644](https://github.com/apache/datafusion/pull/16644) (blaginin)
+- Refactor BinaryTypeCoercer to Handle Null Coercion Early and Avoid Redundant Checks [#16768](https://github.com/apache/datafusion/pull/16768) (kosiew)
+- Remove fixed version from MSRV check [#16786](https://github.com/apache/datafusion/pull/16786) (findepi)
+- Add `clickbench_pushdown` benchmark [#16731](https://github.com/apache/datafusion/pull/16731) (alamb)
+- add filter to handle backtrace [#16752](https://github.com/apache/datafusion/pull/16752) (geetanshjuneja)
+- Support min/max aggregates for FixedSizeBinary type [#16765](https://github.com/apache/datafusion/pull/16765) (theirix)
+- fix tests in page_pruning when filter pushdown is enabled by default [#16794](https://github.com/apache/datafusion/pull/16794) (XiangpengHao)
+- Automatically split large single RecordBatches in `MemorySource` into smaller batches [#16734](https://github.com/apache/datafusion/pull/16734) (kosiew)
+- CI: Fix slow join test [#16796](https://github.com/apache/datafusion/pull/16796) (2010YOUY01)
+- Benchmark for char expression [#16743](https://github.com/apache/datafusion/pull/16743) (ajita-asthana)
+- Add example of custom file schema casting rules [#16803](https://github.com/apache/datafusion/pull/16803) (adriangb)
+- Fix discrepancy in Float64 to timestamp(9) casts for constants [#16639](https://github.com/apache/datafusion/pull/16639) (findepi)
+- Fix: Preserve sorting for the COPY TO plan [#16785](https://github.com/apache/datafusion/pull/16785) (bert-beyondloops)
+- chore(deps): bump object_store from 0.12.2 to 0.12.3 [#16807](https://github.com/apache/datafusion/pull/16807) (dependabot[bot])
+- Implement equals for stateful functions [#16781](https://github.com/apache/datafusion/pull/16781) (findepi)
+- benchmark: Add parquet h2o support [#16804](https://github.com/apache/datafusion/pull/16804) (zhuqi-lucas)
+- chore: use `equals_datatype` for `BinaryExpr` [#16813](https://github.com/apache/datafusion/pull/16813) (comphead)
+- chore: add tests for out of bounds for NullArray [#16802](https://github.com/apache/datafusion/pull/16802) (comphead)
+- Refactor binary.rs tests into modular submodules under `binary/tests` [#16782](https://github.com/apache/datafusion/pull/16782) (kosiew)
+- cache generation of dictionary keys and null arrays for ScalarValue [#16789](https://github.com/apache/datafusion/pull/16789) (adriangb)
+- refactor(examples): remove redundant call to create directory in `parquet_embedded_index.rs` [#16825](https://github.com/apache/datafusion/pull/16825) (jcsherin)
+- Add benchmark for ByteViewGroupValueBuilder [#16826](https://github.com/apache/datafusion/pull/16826) (zhuqi-lucas)
+- Simplify try cast expr evaluation [#16834](https://github.com/apache/datafusion/pull/16834) (lewiszlw)
+- Fix flaky test case in joins.slt [#16849](https://github.com/apache/datafusion/pull/16849) (findepi)
+- chore(deps): bump sysinfo from 0.35.2 to 0.36.1 [#16850](https://github.com/apache/datafusion/pull/16850) (dependabot[bot])
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    33	Andrew Lamb
+    26	dependabot[bot]
+    19	Adrian Garcia Badaracco
+    14	kosiew
+    13	Piotr Findeisen
+    13	Qi Zhu
+     7	Jonathan Chen
+     6	Chen Chongchen
+     6	Marco Neumann
+     6	Oleks V
+     6	Pepijn Van Eeckhoudt
+     6	xudong.w
+     5	Yongting You
+     5	ding-young
+     4	Simon Vandel Sillesen
+     3	Adam Gutglick
+     3	Bert Vermeiren
+     3	Dmitrii Blaginin
+     3	Joseph Koshakow
+     3	Liam Bao
+     3	Tim Saucer
+     2	Alan Tang
+     2	Arttu
+     2	Bruno
+     2	Corwin Joy
+     2	Daniël Heres
+     2	Geetansh Juneja
+     2	Ian Lai
+     2	Jax Liu
+     2	Martin Garton
+     2	Nga Tran
+     2	Ruihang Xia
+     2	Tai Le Manh
+     2	ViggoC
+     2	Xiangpeng Hao
+     2	haiywu
+     2	theirix
+     1	Ajeeta Asthana
+     1	Artem Osipov
+     1	Dharan Aditya
+     1	Gabriel
+     1	Geoffrey Claude
+     1	Hendrik Makait
+     1	Huaijin
+     1	Ian Wagner
+     1	Jack Eadie
+     1	Jagdish Parihar
+     1	Jon Mease
+     1	Julius von Froreich
+     1	K
+     1	Leon Lin
+     1	Loakesh Indiran
+     1	Lokesh
+     1	Lucas Earl
+     1	Lía Adriana
+     1	Mehmet Ozan Kabak
+     1	Melroy dsilva
+     1	Nirnay Roy
+     1	Nuno Faria
+     1	R. Tyler Croy
+     1	Rishab Joshi
+     1	Sami Tabet
+     1	Samyak Sarnayak
+     1	Shehab Amin
+     1	Tobias Schwarzinger
+     1	UBarney
+     1	alexanderbianchi
+     1	epgif
+     1	feniljain
+     1	m09526
+     1	suibianwanwan
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/49.0.1.md b/dev/changelog/49.0.1.md
new file mode 100644
index 0000000000000..06d7c1e2c77a6
--- /dev/null
+++ b/dev/changelog/49.0.1.md
@@ -0,0 +1,48 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 49.0.1 Changelog
+
+This release consists of 5 commits from 5 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Other:**
+
+- [branch-49] Final Changelog Tweaks [#16852](https://github.com/apache/datafusion/pull/16852) (alamb)
+- [branch-49] remove warning from every file open [#17059](https://github.com/apache/datafusion/pull/17059) (mbutrovich)
+- [branch-49] Backport PR #16995 to branch-49 [#17068](https://github.com/apache/datafusion/pull/17068) (pepijnve)
+- [branch-49] Backport "Add ExecutionPlan::reset_state (apache#17028)" to v49 [#17096](https://github.com/apache/datafusion/pull/17096) (adriangb)
+- [branch-49] Backport #17129 to branch 49 [#17143](https://github.com/apache/datafusion/pull/17143) (AdamGS)
+- [branch-49] Backport Pass the input schema to stats_projection for ProjectionExpr (#17123) [#17174](https://github.com/apache/datafusion/pull/17174) (alamb)
+- [branch-49] fix: string_agg not respecting ORDER BY [#17058](https://github.com/apache/datafusion/pull/17058) (nuno-faria)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     1	Adam Gutglick
+     1	Adrian Garcia Badaracco
+     1	Andrew Lamb
+     1	Matt Butrovich
+     1	Pepijn Van Eeckhoudt
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/49.0.2.md b/dev/changelog/49.0.2.md
new file mode 100644
index 0000000000000..7e6fc3e7eb487
--- /dev/null
+++ b/dev/changelog/49.0.2.md
@@ -0,0 +1,45 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 49.0.2 Changelog
+
+This release consists of 3 commits from 3 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Fixed bugs:**
+
+- fix: align `array_has` null buffer for scalar (#17272) [#17274](https://github.com/apache/datafusion/pull/17274) (comphead)
+
+**Other:**
+
+- [branch-49] Backport fix: deserialization error for FilterExec (predicates with inlist) [#17254](https://github.com/apache/datafusion/pull/17254) (haohuaijin)
+- [branch-49] FFI_RecordBatchStream was causing a memory leak (#17190) [#17270](https://github.com/apache/datafusion/pull/17270) (timsaucer)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     1	Huaijin
+     1	Oleks V
+     1	Tim Saucer
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/50.0.0.md b/dev/changelog/50.0.0.md
new file mode 100644
index 0000000000000..7563d57777d56
--- /dev/null
+++ b/dev/changelog/50.0.0.md
@@ -0,0 +1,445 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 50.0.0 Changelog
+
+This release consists of 315 commits from 79 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Breaking changes:**
+
+- Support multiple ordered `array_agg` aggregations [#16625](https://github.com/apache/datafusion/pull/16625) (findepi)
+- Make `AsyncScalarUDFImpl::invoke_async_with_args` consistent with `ScalarUDFImpl::invoke_with_args` [#16902](https://github.com/apache/datafusion/pull/16902) (geetanshjuneja)
+- Derive `WindowUDFImpl` equality, hash from `Eq`, `Hash` traits [#17081](https://github.com/apache/datafusion/pull/17081) (findepi)
+- Remove redundant `plan` from extension's check_invariants [#17199](https://github.com/apache/datafusion/pull/17199) (findepi)
+- feat: Make parquet_encryption a non-default feature [#17137](https://github.com/apache/datafusion/pull/17137) (miroim)
+- chore: fix typos [#17135](https://github.com/apache/datafusion/pull/17135) (waynexia)
+- Use a struct for ProjectionExpr [#17398](https://github.com/apache/datafusion/pull/17398) (adriangb)
+- Use DataFusionError instead of ArrowError in FileOpenFuture [#17397](https://github.com/apache/datafusion/pull/17397) (adriangb)
+- Use return_field instead of return_type for calling aggregates via FFI [#17407](https://github.com/apache/datafusion/pull/17407) (timsaucer)
+
+**Performance related:**
+
+- feat: improve LiteralGuarantee for the case like `(a=1 AND b=1) OR (a=2 AND b=3)` [#16762](https://github.com/apache/datafusion/pull/16762) (haohuaijin)
+- optimize `initcap` function by avoiding memory allocation [#16878](https://github.com/apache/datafusion/pull/16878) (waynexia)
+- speedup `date_trunc` (~7x faster) in some cases [#16859](https://github.com/apache/datafusion/pull/16859) (waynexia)
+- Feature: Improve hash Expr performance [#16977](https://github.com/apache/datafusion/pull/16977) (tobixdev)
+- Perf: Port arrow-rs optimization for get_buffer_memory_size and add fast path for no buffer for gc string view [#17008](https://github.com/apache/datafusion/pull/17008) (zhuqi-lucas-001)
+- Simplify comparisons and binary operations involving NULL [#17088](https://github.com/apache/datafusion/pull/17088) (findepi)
+- Eliminate all redundant aggregations [#17139](https://github.com/apache/datafusion/pull/17139) (findepi)
+
+**Implemented enhancements:**
+
+- feat: Allow tree explain format width to be customizable [#16827](https://github.com/apache/datafusion/pull/16827) (nuno-faria)
+- feat(spark): Implement Spark `string` function `luhn_check` [#16848](https://github.com/apache/datafusion/pull/16848) (Standing-Man)
+- feat(spark): implement Spark datetime function last_day [#16828](https://github.com/apache/datafusion/pull/16828) (Standing-Man)
+- feat: Add `ScalarValue::{new_one,new_zero,new_ten,distance}` support for `Decimal128` and `Decimal256` [#16831](https://github.com/apache/datafusion/pull/16831) (theirix)
+- feat: support distinct for window [#16925](https://github.com/apache/datafusion/pull/16925) (zhuqi-lucas-001)
+- feat: add multi level merge sort that will always fit in memory [#15700](https://github.com/apache/datafusion/pull/15700) (rluvaton)
+- feat: [datafusion-spark] Implement `next_day` function [#16780](https://github.com/apache/datafusion/pull/16780) (petern48)
+- feat: Support distinct window for sum [#16943](https://github.com/apache/datafusion/pull/16943) (zhuqi-lucas-001)
+- feat(spark): implement Spark math function rint [#16924](https://github.com/apache/datafusion/pull/16924) (chenkovsky)
+- feat(spark): implement Spark string function like/ilike [#16962](https://github.com/apache/datafusion/pull/16962) (chenkovsky)
+- feat: Cache Parquet metadata in built in parquet reader [#16971](https://github.com/apache/datafusion/pull/16971) (nuno-faria)
+- feat: Add `Arc<ConfigOptions>` to `ScalarFunctionArgs`, don't copy `ConfigOptions` on each query [#16970](https://github.com/apache/datafusion/pull/16970) (Omega359)
+- feat(spark): implement spark hash function crc32/sha1 [#17032](https://github.com/apache/datafusion/pull/17032) (chenkovsky)
+- feat: Limit the memory used in the file metadata cache [#17031](https://github.com/apache/datafusion/pull/17031) (nuno-faria)
+- feat: Dynamic Parquet encryption and decryption properties [#16779](https://github.com/apache/datafusion/pull/16779) (adamreeve)
+- feat: Use Cached Metadata for ListingTable Statistics [#17022](https://github.com/apache/datafusion/pull/17022) (shehabgamin)
+- feat(spark): implement Spark math function mod/pmod [#16829](https://github.com/apache/datafusion/pull/16829) (chenkovsky)
+- feat(spark): implement Spark math function bit_get/bit_count [#16942](https://github.com/apache/datafusion/pull/16942) (chenkovsky)
+- feat: add `isodow` (ISO day-of-week) support to date_part (Monday = 0) [#17112](https://github.com/apache/datafusion/pull/17112) (ayemjay)
+- feat(spark): implement spark datetime function date_add/date_sub [#17024](https://github.com/apache/datafusion/pull/17024) (chenkovsky)
+- feat: Add the ability to review the contents of the Metadata Cache [#17126](https://github.com/apache/datafusion/pull/17126) (nuno-faria)
+- feat: add `datafusion-physical-adapter`, implement predicate adaptation missing fields of structs [#16589](https://github.com/apache/datafusion/pull/16589) (adriangb)
+- feat: implement QUALIFY clause [#16933](https://github.com/apache/datafusion/pull/16933) (haohuaijin)
+- feat: allow to `spawn`/`spawn_blocking` on a provided runtime in `RecordBatchReceiverStreamBuilder` [#17239](https://github.com/apache/datafusion/pull/17239) (rluvaton)
+- feat: Support SortMergeJoin proto serde [#17296](https://github.com/apache/datafusion/pull/17296) (milenkovicm)
+- feat(spark): implement Spark `bitmap` function `bitmap_count` [#17179](https://github.com/apache/datafusion/pull/17179) (SparkApplicationMaster)
+- feat: Track peak value in tracked consumer [#17327](https://github.com/apache/datafusion/pull/17327) (wForget)
+- feat(spark): implement Spark conditional function if [#16946](https://github.com/apache/datafusion/pull/16946) (chenkovsky)
+- feat(spark): implement Spark `width_bucket` function [#17331](https://github.com/apache/datafusion/pull/17331) (davidlghellin)
+- feat: Make Parquet EncryptionFactory async [#17342](https://github.com/apache/datafusion/pull/17342) (adamreeve)
+- feat: Support `FILTER` clause in aggregate window functions [#17378](https://github.com/apache/datafusion/pull/17378) (geoffreyclaude)
+- feat: Support binary data types for `SortMergeJoin` `on` clause [#17431](https://github.com/apache/datafusion/pull/17431) (stuartcarnie)
+
+**Fixed bugs:**
+
+- fix: The inconsistency between scalar and array on the cast decimal to timestamp [#16539](https://github.com/apache/datafusion/pull/16539) (chenkovsky)
+- fix: unit test for object_storage [#16824](https://github.com/apache/datafusion/pull/16824) (chenkovsky)
+- fix(docs): Update broken links to `TableProvider` docs [#16830](https://github.com/apache/datafusion/pull/16830) (jcsherin)
+- fix: `PlaceholderRowExec::partition_statistics` [#16851](https://github.com/apache/datafusion/pull/16851) (crepererum)
+- fix: skip predicates on struct unnest in PushDownFilter [#16790](https://github.com/apache/datafusion/pull/16790) (akoshchiy)
+- fix: regex bench [#16890](https://github.com/apache/datafusion/pull/16890) (chenkovsky)
+- fix: `ComposedPhysicalExtensionCodec` does not use the same codec as encoding when decoding [#16986](https://github.com/apache/datafusion/pull/16986) (Thearas)
+- fix: Remove `datafusion.execution.parquet.cache_metadata` config [#17062](https://github.com/apache/datafusion/pull/17062) (jonathanc-n)
+- fix: Add missing member to visitor for ConfigFileEncryptionProperties [#17103](https://github.com/apache/datafusion/pull/17103) (corwinjoy)
+- fix(ci): update `datafusion-physical-expr-adapter` version to 49.0.1in Cargo.lock [#17209](https://github.com/apache/datafusion/pull/17209) (miroim)
+- fix: respect inexact flags in row group metadata [#16412](https://github.com/apache/datafusion/pull/16412) (CookiePieWw)
+- fix: deserialization error for `FilterExec` (predicates with inlist) [#17224](https://github.com/apache/datafusion/pull/17224) (haohuaijin)
+- FFI_RecordBatchStream was causing a memory leak [#17190](https://github.com/apache/datafusion/pull/17190) (timsaucer)
+- fix: Windows paths crashing core tests [#17231](https://github.com/apache/datafusion/pull/17231) (nuno-faria)
+- fix: sort should always output batches with `batch_size` rows [#17244](https://github.com/apache/datafusion/pull/17244) (rluvaton)
+- fix: align `array_has` null buffer for scalar [#17272](https://github.com/apache/datafusion/pull/17272) (comphead)
+- fix: dataframe function count_all with alias [#17282](https://github.com/apache/datafusion/pull/17282) (Loaki07)
+- fix: correct readme field in `Cargo.toml` [#17310](https://github.com/apache/datafusion/pull/17310) (Weijun-H)
+- fix(doc): update the link of deprecation guidelines (#17328) [#17329](https://github.com/apache/datafusion/pull/17329) (ivila)
+- fix: lazy case else evaluation [#17311](https://github.com/apache/datafusion/pull/17311) (chenkovsky)
+- fix: set distinct_count to Absent when merging statistics [#17385](https://github.com/apache/datafusion/pull/17385) (adriangb)
+- fix: Remove duplicate filter from `CrossJoin` unparsing [#17382](https://github.com/apache/datafusion/pull/17382) (jonathanc-n)
+- fix: set IPC alignment based on schema [#17363](https://github.com/apache/datafusion/pull/17363) (ding-young)
+- fix: return ALL constants in `EquivalenceProperties::constants` [#17404](https://github.com/apache/datafusion/pull/17404) (crepererum)
+- fix: align `map_keys` nullability flag [#17454](https://github.com/apache/datafusion/pull/17454) (comphead)
+
+**Documentation updates:**
+
+- docs: Fix broken links [#16839](https://github.com/apache/datafusion/pull/16839) (2010YOUY01)
+- Add note to upgrade guide about MSRV update [#16845](https://github.com/apache/datafusion/pull/16845) (alamb)
+- [main] Update version to 49.0.0, add 49.0.0 changelog [#16855](https://github.com/apache/datafusion/pull/16855) (alamb)
+- Improve async_udf example and docs [#16846](https://github.com/apache/datafusion/pull/16846) (alamb)
+- Docs: Update Upgrading.md to reflect 49.0.0 is released [#16853](https://github.com/apache/datafusion/pull/16853) (alamb)
+- docs: Remove references to DataFusion for Ray sub project [#16966](https://github.com/apache/datafusion/pull/16966) (andygrove)
+- Add `temp_directory` and `max_temp_directory_size` runtime config variables [#16934](https://github.com/apache/datafusion/pull/16934) (delamarch3)
+- Add `sql_parser.default_null_ordering` config option to customize the default null ordering [#16963](https://github.com/apache/datafusion/pull/16963) (goldmedal)
+- Added Example for `Statistical Functions` in Docs [#16927](https://github.com/apache/datafusion/pull/16927) (Adez017)
+- Fix window_functions docs formatting [#17005](https://github.com/apache/datafusion/pull/17005) (mattmatravers)
+- docs: Fix 'Analaysis' typo in query optimizer docs [#17015](https://github.com/apache/datafusion/pull/17015) (petern48)
+- docs: Fix random extra bullet for 'Analytical Functions' [#17014](https://github.com/apache/datafusion/pull/17014) (petern48)
+- docs: Fix failing documentation check in CI [#17026](https://github.com/apache/datafusion/pull/17026) (adamreeve)
+- Upgrade arrow/parquet to 56.0.0 [#16690](https://github.com/apache/datafusion/pull/16690) (alamb)
+- fix error result in execute&pre_selection [#16930](https://github.com/apache/datafusion/pull/16930) (acking-you)
+- docs: Fix failing CI [#17041](https://github.com/apache/datafusion/pull/17041) (liamzwbao)
+- Docs: Add Examples to Config Options page [#17039](https://github.com/apache/datafusion/pull/17039) (alamb)
+- Docs: Add Tuning Guide for small data / short queries [#17040](https://github.com/apache/datafusion/pull/17040) (alamb)
+- Docs: Update the crate configuration / build settings page [#17038](https://github.com/apache/datafusion/pull/17038) (alamb)
+- Support `centroids` config for `approx_percentile_cont_with_weight` [#17003](https://github.com/apache/datafusion/pull/17003) (liamzwbao)
+- Add ExecutionPlan::reset_state [#17028](https://github.com/apache/datafusion/pull/17028) (adriangb)
+- Docs: Add Tuning Guide for larger-than-memory queries [#17069](https://github.com/apache/datafusion/pull/17069) (2010YOUY01)
+- Link UdfEq and PtrEq to help understand relationship [#17082](https://github.com/apache/datafusion/pull/17082) (findepi)
+- Derive `AggregateUDFImpl` equality, hash from `Eq`, `Hash` traits [#17130](https://github.com/apache/datafusion/pull/17130) (findepi)
+- chore: Clarify `EmptyRelation` description [#17157](https://github.com/apache/datafusion/pull/17157) (comphead)
+- Update dev env documentation to reflect pinned rust version [#17107](https://github.com/apache/datafusion/pull/17107) (Jefffrey)
+- Differentiate 0-row and 1-row EmptyRelation in EXPLAIN [#17145](https://github.com/apache/datafusion/pull/17145) (findepi)
+- (Re)Support old syntax for `approx_percentile_cont` and `approx_percentile_cont_with_weight` [#16999](https://github.com/apache/datafusion/pull/16999) (alamb)
+- Derive `ScalarUDFImpl` equality, hash from `Eq`, `Hash` traits [#17164](https://github.com/apache/datafusion/pull/17164) (findepi)
+- #17128 Add support for chr(0) [#17131](https://github.com/apache/datafusion/pull/17131) (pepijnve)
+- [main] Update version to 49.0.1 and add changelog (#17175) [#17191](https://github.com/apache/datafusion/pull/17191) (alamb)
+- Docs: Consolidate feature proposal content into roadmap [#17156](https://github.com/apache/datafusion/pull/17156) (alamb)
+- Doc: Update upgrade guide for the rewritten NLJ operator [#17202](https://github.com/apache/datafusion/pull/17202) (2010YOUY01)
+- Support serializing `generate_series` in `datafusion-proto` [#17200](https://github.com/apache/datafusion/pull/17200) (cetra3)
+- Fix broken links in user docs [#17228](https://github.com/apache/datafusion/pull/17228) (AdamGS)
+- Format `Date32` to string given timestamp specifiers [#15361](https://github.com/apache/datafusion/pull/15361) (friendlymatthew)
+- Improve documentation for Signature, Volatility, and TypeSignature [#17264](https://github.com/apache/datafusion/pull/17264) (alamb)
+- [main] Forward port `49.0.2` version and changelog (#17277) [#17287](https://github.com/apache/datafusion/pull/17287) (alamb)
+- Document schema merging. [#17249](https://github.com/apache/datafusion/pull/17249) (wiedld)
+- Support from-first SQL syntax [#17295](https://github.com/apache/datafusion/pull/17295) (simonvandel)
+- Add `cfg(feature = "avro")` attribute to Avro example in SQL API docs [#17142](https://github.com/apache/datafusion/pull/17142) (kosiew)
+- Push the limits past window functions [#17347](https://github.com/apache/datafusion/pull/17347) (avantgardnerio)
+- Refactor DataSourceExec::try_swapping_with_projection to simplify and remove abstraction leakage [#17395](https://github.com/apache/datafusion/pull/17395) (adriangb)
+- doc: Document caveats of `swap_inputs()` interface in join executors [#17373](https://github.com/apache/datafusion/pull/17373) (2010YOUY01)
+- Fix syntax error in DDL documentation example [#17412](https://github.com/apache/datafusion/pull/17412) (pepijnve)
+- Add MSRV change to upgrade guide [#17406](https://github.com/apache/datafusion/pull/17406) (findepi)
+- Add PhysicalExpr::is_volatile_node to upgrade guide [#17443](https://github.com/apache/datafusion/pull/17443) (adriangb)
+- docs: Render `--` properly in profiling docs [#17430](https://github.com/apache/datafusion/pull/17430) (petern48)
+
+**Other:**
+
+- chore: use `equals_datatype` for `BinaryExpr` [#16813](https://github.com/apache/datafusion/pull/16813) (comphead)
+- chore: add tests for out of bounds for NullArray [#16802](https://github.com/apache/datafusion/pull/16802) (comphead)
+- Refactor binary.rs tests into modular submodules under `binary/tests` [#16782](https://github.com/apache/datafusion/pull/16782) (kosiew)
+- cache generation of dictionary keys and null arrays for ScalarValue [#16789](https://github.com/apache/datafusion/pull/16789) (adriangb)
+- refactor(examples): remove redundant call to create directory in `parquet_embedded_index.rs` [#16825](https://github.com/apache/datafusion/pull/16825) (jcsherin)
+- Add benchmark for ByteViewGroupValueBuilder [#16826](https://github.com/apache/datafusion/pull/16826) (zhuqi-lucas-001)
+- Simplify try cast expr evaluation [#16834](https://github.com/apache/datafusion/pull/16834) (lewiszlw)
+- Fix flaky test case in joins.slt [#16849](https://github.com/apache/datafusion/pull/16849) (findepi)
+- chore(deps): bump sysinfo from 0.35.2 to 0.36.1 [#16850](https://github.com/apache/datafusion/pull/16850) (dependabot[bot])
+- chore(deps): bump aws-credential-types from 1.2.3 to 1.2.4 [#16815](https://github.com/apache/datafusion/pull/16815) (dependabot[bot])
+- fix(build-wasm): put `arrow-ipc/zstd` dep under `compression` feature [#16844](https://github.com/apache/datafusion/pull/16844) (chrisvander)
+- chore(deps): bump serde_json from 1.0.140 to 1.0.141 [#16863](https://github.com/apache/datafusion/pull/16863) (dependabot[bot])
+- chore(deps): bump aws-config from 1.8.1 to 1.8.2 [#16864](https://github.com/apache/datafusion/pull/16864) (dependabot[bot])
+- test: Fix flaky join tests [#16860](https://github.com/apache/datafusion/pull/16860) (2010YOUY01)
+- chore(deps): bump rand from 0.9.1 to 0.9.2 [#16882](https://github.com/apache/datafusion/pull/16882) (dependabot[bot])
+- Report error when `SessionState::sql_to_expr_with_alias` does not consume all input [#16811](https://github.com/apache/datafusion/pull/16811) (pepijnve)
+- test: fix more flaky join tests [#16880](https://github.com/apache/datafusion/pull/16880) (2010YOUY01)
+- MINOR: add unit tests for chr function [#16856](https://github.com/apache/datafusion/pull/16856) (waynexia)
+- remove deprecated methods from FileScanConfig / DataSourceExec [#16901](https://github.com/apache/datafusion/pull/16901) (adriangb)
+- Support utf8view for spark hex [#16885](https://github.com/apache/datafusion/pull/16885) (xudong963)
+- Fixes 3 bugs during serialization and deserialization of physical plans [#16858](https://github.com/apache/datafusion/pull/16858) (NGA-TRAN)
+- chore(deps): bump aws-config from 1.8.2 to 1.8.3 [#16912](https://github.com/apache/datafusion/pull/16912) (dependabot[bot])
+- Derive UDF equality from PartialEq, Hash [#16842](https://github.com/apache/datafusion/pull/16842) (findepi)
+- Ensure Substrait consumer can handle expressions in VirtualTable [#16857](https://github.com/apache/datafusion/pull/16857) (lorenarosati)
+- Mutable Join Unwind [#16883](https://github.com/apache/datafusion/pull/16883) (berkaysynnada)
+- fix(datafusion-proto): support serializing/deserilizing ArrowFormat tables [#16875](https://github.com/apache/datafusion/pull/16875) (colinmarc)
+- ScalarValue Default + Min + Max [#16891](https://github.com/apache/datafusion/pull/16891) (berkaysynnada)
+- minor: add is_superset() method for Interval's [#16895](https://github.com/apache/datafusion/pull/16895) (berkaysynnada)
+- minor: implement with_new_expressions for AggregateFunctionExpr [#16897](https://github.com/apache/datafusion/pull/16897) (berkaysynnada)
+- minor: Rename add_spm_on_top as add_merge_on_top [#16913](https://github.com/apache/datafusion/pull/16913) (berkaysynnada)
+- Implement Helpers for ScopedTimerGuard and Time Structs [#16911](https://github.com/apache/datafusion/pull/16911) (berkaysynnada)
+- Fix Partial Sort Get Slice Point Between Batches [#16881](https://github.com/apache/datafusion/pull/16881) (berkaysynnada)
+- Fix `schema_adapter` integration tests not running [#16835](https://github.com/apache/datafusion/pull/16835) (kosiew)
+- Update release process [#16929](https://github.com/apache/datafusion/pull/16929) (xudong963)
+- Fix `next_up` and `next_down` behavior for zero float values [#16745](https://github.com/apache/datafusion/pull/16745) (liamzwbao)
+- Add Fetch Property to OutputRequirementExec [#16892](https://github.com/apache/datafusion/pull/16892) (berkaysynnada)
+- chore(deps): bump tokio from 1.46.1 to 1.47.0 [#16952](https://github.com/apache/datafusion/pull/16952) (dependabot[bot])
+- chore(deps): bump serde_json from 1.0.140 to 1.0.141 [#16951](https://github.com/apache/datafusion/pull/16951) (dependabot[bot])
+- chore: Remove attributes to allow dead_code that aren't relevant anymore [#16953](https://github.com/apache/datafusion/pull/16953) (AdamGS)
+- chore(deps): bump rand from 0.9.1 to 0.9.2 [#16960](https://github.com/apache/datafusion/pull/16960) (dependabot[bot])
+- chore(deps): bump ctor from 0.4.2 to 0.4.3 [#16961](https://github.com/apache/datafusion/pull/16961) (dependabot[bot])
+- disallow pushdown of volatile functions [#16861](https://github.com/apache/datafusion/pull/16861) (adriangb)
+- remove warning from every file open [#16968](https://github.com/apache/datafusion/pull/16968) (adriangb)
+- Pin github actions to commit sha [#16964](https://github.com/apache/datafusion/pull/16964) (gopidesupavan)
+- Enable physical filter pushdown for hash joins [#16954](https://github.com/apache/datafusion/pull/16954) (adriangb)
+- Fix [Bug] Aggregate + TopK fails when asc = false [#16972](https://github.com/apache/datafusion/pull/16972) (avantgardnerio)
+- Use tokio::task::coop::poll_proceed by default in CooperativeStream [#16748](https://github.com/apache/datafusion/pull/16748) (pepijnve)
+- Add benchmark utility to profile peak memory usage [#16814](https://github.com/apache/datafusion/pull/16814) (ding-young)
+- chore(deps): bump indicatif from 0.17.11 to 0.18.0 [#16992](https://github.com/apache/datafusion/pull/16992) (dependabot[bot])
+- test(datafusion-cli): migrate tests to `insta` in `print_format.rs` [#16993](https://github.com/apache/datafusion/pull/16993) (Thearas)
+- Chore: remove 'spill_record_batch_by_size' api [#16958](https://github.com/apache/datafusion/pull/16958) (ding-young)
+- chore(deps): bump serde_json from 1.0.141 to 1.0.142 [#17006](https://github.com/apache/datafusion/pull/17006) (dependabot[bot])
+- Add tests for yielding in `SpillManager::read_spill_as_stream` [#16616](https://github.com/apache/datafusion/pull/16616) (ding-young)
+- #16994 Ensure CooperativeExec#maintains_input_order returns a Vec of the correct size [#16995](https://github.com/apache/datafusion/pull/16995) (pepijnve)
+- test: Add logic tests for string_agg with order [#17033](https://github.com/apache/datafusion/pull/17033) (nuno-faria)
+- Implement `From<Option<String>>' for `ScalarValue` [#17043](https://github.com/apache/datafusion/pull/17043) (findepi)
+- chore(deps): bump tokio-util from 0.7.15 to 0.7.16 [#17030](https://github.com/apache/datafusion/pull/17030) (dependabot[bot])
+- Add missing Substrait to DataFusion function name mappings [#16950](https://github.com/apache/datafusion/pull/16950) (lorenarosati)
+- refactor: use upstream arrow-rs inline_key_fast [#17044](https://github.com/apache/datafusion/pull/17044) (zhuqi-lucas-001)
+- Implement spark `array` function `array` [#16936](https://github.com/apache/datafusion/pull/16936) (Standing-Man)
+- Address memory over-accounting in array_agg [#16816](https://github.com/apache/datafusion/pull/16816) (gabotechs)
+- chore(deps): bump aws-credential-types from 1.2.4 to 1.2.5 [#17053](https://github.com/apache/datafusion/pull/17053) (dependabot[bot])
+- Support Substrait functions and_not, xor, and between in consumer built-in expression builder [#16984](https://github.com/apache/datafusion/pull/16984) (lorenarosati)
+- Derive UDWF equality from PartialEq, Hash [#17057](https://github.com/apache/datafusion/pull/17057) (findepi)
+- fix return field for `is_not_null` expression [#17056](https://github.com/apache/datafusion/pull/17056) (davidhewitt)
+- chore(deps): bump tokio from 1.47.0 to 1.47.1 [#17063](https://github.com/apache/datafusion/pull/17063) (dependabot[bot])
+- Optimize char expression [#16076](https://github.com/apache/datafusion/pull/16076) (ajita-asthana)
+- Fix equality of parametrizable ArrayAgg function [#17065](https://github.com/apache/datafusion/pull/17065) (findepi)
+- Implement Spark `url` function `parse_url` [#16937](https://github.com/apache/datafusion/pull/16937) (Standing-Man)
+- Derive UDAF equality from Eq, Hash [#17067](https://github.com/apache/datafusion/pull/17067) (findepi)
+- Remove elements deprecated since v 45 [#17075](https://github.com/apache/datafusion/pull/17075) (findepi)
+- Deprecate ScalarUDF::is_nullable [#17074](https://github.com/apache/datafusion/pull/17074) (findepi)
+- Re-export `object_store` crate via DataFusion Core and Common [#17070](https://github.com/apache/datafusion/pull/17070) (kosiew)
+- Fix hash/equality issues for ScalarFunctionExpr [#17078](https://github.com/apache/datafusion/pull/17078) (findepi)
+- Fill missing methods in aliased UDF impls [#17080](https://github.com/apache/datafusion/pull/17080) (findepi)
+- Improve Hash speed for ScalarFunctionExpr [#17099](https://github.com/apache/datafusion/pull/17099) (findepi)
+- chore(deps): bump clap from 4.5.42 to 4.5.43 [#17079](https://github.com/apache/datafusion/pull/17079) (dependabot[bot])
+- minor: remove unused import in docstring of datafusion_common::record_batch [#17106](https://github.com/apache/datafusion/pull/17106) (Jefffrey)
+- Make macros in common::test_util hygenic and not dependent on user dependencies [#17102](https://github.com/apache/datafusion/pull/17102) (AdamGS)
+- minor: remove unnecessary clippy:large_enum_variant allows [#17108](https://github.com/apache/datafusion/pull/17108) (Jefffrey)
+- minor: Improve equivalence handling of joins [#16893](https://github.com/apache/datafusion/pull/16893) (berkaysynnada)
+- Fix incorrect `NULL IN ()` optimization [#17092](https://github.com/apache/datafusion/pull/17092) (findepi)
+- Add `prettier` to the devcontainer (GitHub codespaces) [#17019](https://github.com/apache/datafusion/pull/17019) (alamb)
+- Set a lower threshold for clippy to flag large error variants [#17109](https://github.com/apache/datafusion/pull/17109) (Jefffrey)
+- chore(deps): bump rustyline from 16.0.0 to 17.0.0 [#17116](https://github.com/apache/datafusion/pull/17116) (dependabot[bot])
+- Add dynamic filter (bounds) pushdown to HashJoinExec [#16445](https://github.com/apache/datafusion/pull/16445) (adriangb)
+- Remove the "run extended tests" github PR commend action [#17119](https://github.com/apache/datafusion/pull/17119) (alamb)
+- chore(deps): bump sysinfo from 0.36.1 to 0.37.0 [#17124](https://github.com/apache/datafusion/pull/17124) (dependabot[bot])
+- chore(deps): bump libc from 0.2.174 to 0.2.175 [#17121](https://github.com/apache/datafusion/pull/17121) (dependabot[bot])
+- ff: Preserve cached plan information when pushing projection [#17129](https://github.com/apache/datafusion/pull/17129) (friendlymatthew)
+- chore: Enforce checks for RC branches [#17132](https://github.com/apache/datafusion/pull/17132) (comphead)
+- chore(deps): bump actions/checkout from 4.2.2 to 5.0.0 [#17149](https://github.com/apache/datafusion/pull/17149) (dependabot[bot])
+- minor: enhance comment in SortPreservingMergeStream.abort [#17115](https://github.com/apache/datafusion/pull/17115) (mapleFU)
+- Update workspace to use Rust 1.89 [#17100](https://github.com/apache/datafusion/pull/17100) (shruti2522)
+- chore(deps): bump on-headers and compression in /datafusion/wasmtest/datafusion-wasm-app [#16812](https://github.com/apache/datafusion/pull/16812) (dependabot[bot])
+- chore(deps): bump slab from 0.4.10 to 0.4.11 [#17161](https://github.com/apache/datafusion/pull/17161) (dependabot[bot])
+- refactor `character_length` impl by unifying null handling logic [#16877](https://github.com/apache/datafusion/pull/16877) (waynexia)
+- chore(deps): bump clap from 4.5.43 to 4.5.44 [#17148](https://github.com/apache/datafusion/pull/17148) (dependabot[bot])
+- Pass the input schema to stats_projection for ProjectionExpr [#17123](https://github.com/apache/datafusion/pull/17123) (hareshkh)
+- Fix extended tests failure on main by updating `datafusion-testing` pin [#17176](https://github.com/apache/datafusion/pull/17176) (alamb)
+- Minor: display filter in HashJoin's tree explain [#17170](https://github.com/apache/datafusion/pull/17170) (2010YOUY01)
+- add test for multi-column topk dynamic filter pushdown [#17162](https://github.com/apache/datafusion/pull/17162) (adriangb)
+- Test: Add checks to sqllogictest temporary file creations [#17017](https://github.com/apache/datafusion/pull/17017) (2010YOUY01)
+- Deprecate unused `ScalarUDF::display_name` [#17168](https://github.com/apache/datafusion/pull/17168) (findepi)
+- CI: Fix extended test failure by updating `datafusion-testing` submodule [#17187](https://github.com/apache/datafusion/pull/17187) (2010YOUY01)
+- Normalize `NUL` to `\0` in sqllogictests [#17181](https://github.com/apache/datafusion/pull/17181) (Jefffrey)
+- Simplify `GetFieldFunc`'s `display_name`, `schema_name` [#17167](https://github.com/apache/datafusion/pull/17167) (findepi)
+- Rewrite Nested Loop Join executor for 5× speed and 1% memory usage [#16996](https://github.com/apache/datafusion/pull/16996) (2010YOUY01)
+- Minor: Fix compiler warning when compiling `datafusion-cli` [#17205](https://github.com/apache/datafusion/pull/17205) (2010YOUY01)
+- Refactor: Do not silently ignore errors in `stats_projection` [#17154](https://github.com/apache/datafusion/pull/17154) (alamb)
+- Miscellaneous cleanups [#17189](https://github.com/apache/datafusion/pull/17189) (findepi)
+- [Parquet Metadata Cache] Document the ListingTable cache [#17133](https://github.com/apache/datafusion/pull/17133) (alamb)
+- Fix: Show backtrace for ArrowError [#17204](https://github.com/apache/datafusion/pull/17204) (2010YOUY01)
+- minor: clean up distinct window code [#17215](https://github.com/apache/datafusion/pull/17215) (zhuqi-lucas-001)
+- chore: Add drop table test on create_drop.rs [#17219](https://github.com/apache/datafusion/pull/17219) (caicancai)
+- chore(deps): bump async-trait from 0.1.88 to 0.1.89 [#17203](https://github.com/apache/datafusion/pull/17203) (dependabot[bot])
+- Bump MSRV to 1.86.0 [#17230](https://github.com/apache/datafusion/pull/17230) (adriangb)
+- Minor: improve error message when file creation failed [#17217](https://github.com/apache/datafusion/pull/17217) (2010YOUY01)
+- Fix dynamic filter pushdown in HashJoinExec [#17201](https://github.com/apache/datafusion/pull/17201) (adriangb)
+- Fix Analyze Exec protobuf roundtrip [#17234](https://github.com/apache/datafusion/pull/17234) (cetra3)
+- Preserve `distinct` and `ignore_nulls` in window expressions during proto serde [#17235](https://github.com/apache/datafusion/pull/17235) (cetra3)
+- chore(deps): bump serde_json from 1.0.142 to 1.0.143 [#17240](https://github.com/apache/datafusion/pull/17240) (dependabot[bot])
+- chore(deps): bump syn from 2.0.105 to 2.0.106 [#17243](https://github.com/apache/datafusion/pull/17243) (dependabot[bot])
+- Push dynamic pushdown through cooperative and projection execs [#17238](https://github.com/apache/datafusion/pull/17238) (jackkleeman)
+- Configure cli test that requires backtrace to be optional [#17236](https://github.com/apache/datafusion/pull/17236) (Jefffrey)
+- chore(deps): Update sqlparser to 0.58 [#16456](https://github.com/apache/datafusion/pull/16456) (Dimchikkk)
+- chore(deps): bump rustyline from 17.0.0 to 17.0.1 [#17252](https://github.com/apache/datafusion/pull/17252) (dependabot[bot])
+- chore(deps): bump thiserror from 2.0.14 to 2.0.16 [#17257](https://github.com/apache/datafusion/pull/17257) (dependabot[bot])
+- Fix HashJoinExec sideways information passing for partitioned queries [#17197](https://github.com/apache/datafusion/pull/17197) (adriangb)
+- Fix HashJoinExec test snapshot under force_hash_collisions=true [#17265](https://github.com/apache/datafusion/pull/17265) (adriangb)
+- Deprecate confusingly named `UserDefinedFunctionPlanner` [#17247](https://github.com/apache/datafusion/pull/17247) (alamb)
+- Fix: ListingTableFactory paths with dots [#17233](https://github.com/apache/datafusion/pull/17233) (BlakeOrth)
+- chore(deps): bump tempfile from 3.20.0 to 3.21.0 [#17268](https://github.com/apache/datafusion/pull/17268) (dependabot[bot])
+- Fix PartialOrd for ScalarUDF [#17182](https://github.com/apache/datafusion/pull/17182) (findepi)
+- chore(deps): bump url from 2.5.4 to 2.5.6 [#17283](https://github.com/apache/datafusion/pull/17283) (dependabot[bot])
+- Make dynamic filter creation in HashJoinExec deterministic against partition evaluation order [#17280](https://github.com/apache/datafusion/pull/17280) (adriangb)
+- Consolidate Parquet Metadata handling into its own module and struct `DFParquetMetadata` [#17127](https://github.com/apache/datafusion/pull/17127) (alamb)
+- Only update TopK dynamic filters if the new ones are more selective [#16433](https://github.com/apache/datafusion/pull/16433) (adriangb)
+- Add documentation for UNION schema handling. [#17248](https://github.com/apache/datafusion/pull/17248) (wiedld)
+- Replace π-related bound constants with next_up/next_down [#16823](https://github.com/apache/datafusion/pull/16823) (rthummaluru)
+- chore: add example for how to use TrackConsumersPool [#17213](https://github.com/apache/datafusion/pull/17213) (wiedld)
+- minor: Remove extra line break in explain physical plan [#17303](https://github.com/apache/datafusion/pull/17303) (nuno-faria)
+- Support `avg(distinct)` for `float64` type [#17255](https://github.com/apache/datafusion/pull/17255) (Jefffrey)
+- chore: check the error message log [#17308](https://github.com/apache/datafusion/pull/17308) (caicancai)
+- Expand sql_planner benchmark for benchmarking physical and logical optimization. [#17276](https://github.com/apache/datafusion/pull/17276) (Omega359)
+- Encapsulate early File pruning in parquet opener in its own stream [#17293](https://github.com/apache/datafusion/pull/17293) (alamb)
+- Implement `partition_statistics` API for `RepartitionExec` [#17061](https://github.com/apache/datafusion/pull/17061) (liamzwbao)
+- chore: replace Schema with SchemaRef in PruningExpressionBuilder [#17216](https://github.com/apache/datafusion/pull/17216) (etolbakov)
+- chore(deps): bump regex-syntax from 0.8.5 to 0.8.6 [#17320](https://github.com/apache/datafusion/pull/17320) (dependabot[bot])
+- chore(deps): bump indexmap from 2.10.0 to 2.11.0 [#17316](https://github.com/apache/datafusion/pull/17316) (dependabot[bot])
+- refactor: Split `SortMergeJoin` into multiple modules [#17304](https://github.com/apache/datafusion/pull/17304) (jonathanc-n)
+- MINOR: add missing examples to example list [#17333](https://github.com/apache/datafusion/pull/17333) (waynexia)
+- chore: split hash join to smaller modules [#17300](https://github.com/apache/datafusion/pull/17300) (2010YOUY01)
+- chore(deps): bump url from 2.5.6 to 2.5.7 [#17324](https://github.com/apache/datafusion/pull/17324) (dependabot[bot])
+- chore(deps): bump regex from 1.11.1 to 1.11.2 [#17325](https://github.com/apache/datafusion/pull/17325) (dependabot[bot])
+- add a ci job for typo checking [#17339](https://github.com/apache/datafusion/pull/17339) (waynexia)
+- chore(deps): bump clap from 4.5.45 to 4.5.46 [#17338](https://github.com/apache/datafusion/pull/17338) (dependabot[bot])
+- chore(deps): bump korandoru/hawkeye from 6.1.1 to 6.2.0 [#17321](https://github.com/apache/datafusion/pull/17321) (dependabot[bot])
+- chore: avoid very cheap copy in `SchemaMapping` [#17344](https://github.com/apache/datafusion/pull/17344) (rluvaton)
+- chore(deps): bump actions/checkout from 4.2.2 to 5.0.0 [#17345](https://github.com/apache/datafusion/pull/17345) (dependabot[bot])
+- chore(deps): bump libmimalloc-sys from 0.1.43 to 0.1.44 [#17343](https://github.com/apache/datafusion/pull/17343) (dependabot[bot])
+- fix EquivalenceProperties calculation in DataSourceExec [#17323](https://github.com/apache/datafusion/pull/17323) (adriangb)
+- chore(deps): bump mimalloc from 0.1.47 to 0.1.48 [#17353](https://github.com/apache/datafusion/pull/17353) (dependabot[bot])
+- chore(deps): bump tracing-subscriber from 0.3.19 to 0.3.20 [#17355](https://github.com/apache/datafusion/pull/17355) (dependabot[bot])
+- refactor: simplify json_shredding example by using ListingTable [#17369](https://github.com/apache/datafusion/pull/17369) (waynexia)
+- Fix incorrect memory accounting for sliced `StringViewArray` [#17315](https://github.com/apache/datafusion/pull/17315) (ding-young)
+- chore(deps): bump aws-credential-types from 1.2.5 to 1.2.6 [#17368](https://github.com/apache/datafusion/pull/17368) (dependabot[bot])
+- minor: use debug level log for physical optimizer [#17383](https://github.com/apache/datafusion/pull/17383) (waynexia)
+- chore(deps): bump uuid from 1.18.0 to 1.18.1 [#17384](https://github.com/apache/datafusion/pull/17384) (dependabot[bot])
+- chore(deps): bump aws-config from 1.8.5 to 1.8.6 [#17386](https://github.com/apache/datafusion/pull/17386) (dependabot[bot])
+- minor: make dict_from_values public [#17376](https://github.com/apache/datafusion/pull/17376) (parthchandra)
+- chore: add memory catalog test to handle table removal before schema deregistration [#17307](https://github.com/apache/datafusion/pull/17307) (caicancai)
+- chore(deps): bump actions/setup-node from 4.4.0 to 5.0.0 [#17410](https://github.com/apache/datafusion/pull/17410) (dependabot[bot])
+- chore(deps): bump actions/stale from 9.1.0 to 10.0.0 [#17409](https://github.com/apache/datafusion/pull/17409) (dependabot[bot])
+- chore(deps): bump actions/labeler from 5.0.0 to 6.0.0 [#17408](https://github.com/apache/datafusion/pull/17408) (dependabot[bot])
+- Avoid panic when 'with order' expression could not be converted to a logical expression [#17394](https://github.com/apache/datafusion/pull/17394) (pepijnve)
+- chore(deps): bump apache-avro from 0.17.0 to 0.20.0 [#16092](https://github.com/apache/datafusion/pull/16092) (dependabot[bot])
+- chore(deps): bump actions/setup-python from 5.6.0 to 6.0.0 [#17413](https://github.com/apache/datafusion/pull/17413) (dependabot[bot])
+- Test grouping by FixedSizeList [#17415](https://github.com/apache/datafusion/pull/17415) (findepi)
+- re-export physical_expr_adapter [#17414](https://github.com/apache/datafusion/pull/17414) (adriangb)
+- Benchmark window function with multiple partitioning columns [#17402](https://github.com/apache/datafusion/pull/17402) (findepi)
+- Fix PartialOrd for Window [#17393](https://github.com/apache/datafusion/pull/17393) (findepi)
+- Memory datasource protobuf support [#17290](https://github.com/apache/datafusion/pull/17290) (lewiszlw)
+- fix bounds accumulator reset in HashJoinExec dynamic filter pushdown [#17371](https://github.com/apache/datafusion/pull/17371) (adriangb)
+- Unimplement `PartialOrd` for `TDigest`'s `Centroid` [#17440](https://github.com/apache/datafusion/pull/17440) (findepi)
+- Unimplement `PartialEq`, `PartialOrd` from `ToRepartition`, `RePartition` [#17441](https://github.com/apache/datafusion/pull/17441) (findepi)
+- chore(deps): bump insta from 1.43.1 to 1.43.2 [#17436](https://github.com/apache/datafusion/pull/17436) (dependabot[bot])
+- chore(deps): bump actions/labeler from 6.0.0 to 6.0.1 [#17433](https://github.com/apache/datafusion/pull/17433) (dependabot[bot])
+- chore(deps): bump clap from 4.5.46 to 4.5.47 [#17435](https://github.com/apache/datafusion/pull/17435) (dependabot[bot])
+- Add PhysicalExpr::is_volatile [#17351](https://github.com/apache/datafusion/pull/17351) (adriangb)
+- refactor: Use `BufferedBatchState` enum for SMJ spilling [#17429](https://github.com/apache/datafusion/pull/17429) (jonathanc-n)
+- Re-enable page index for encrypted Parquet [#17426](https://github.com/apache/datafusion/pull/17426) (adamreeve)
+- Re-export apache-avro when avro feature flag is set [#17388](https://github.com/apache/datafusion/pull/17388) (shivbhatia10)
+- Improved experience when remote object store URL does not end in / [#17364](https://github.com/apache/datafusion/pull/17364) (xiedeyantu)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    51	dependabot[bot]
+    31	Piotr Findeisen
+    24	Adrian Garcia Badaracco
+    21	Andrew Lamb
+    14	Yongting You
+    11	Chen Chongchen
+     9	Berkay Şahin
+     9	Ruihang Xia
+     7	Jeffrey Vo
+     7	Nuno Faria
+     6	Oleks V
+     6	Pepijn Van Eeckhoudt
+     6	Qi Zhu
+     5	ding-young
+     4	Adam Reeve
+     4	Alan Tang
+     4	Jonathan Chen
+     4	Liam Bao
+     4	Peter Nguyen
+     4	Raz Luvaton
+     4	kosiew
+     3	Adam Gutglick
+     3	Cancai Cai
+     3	Huaijin
+     3	Peter L
+     3	lorenarosati
+     3	wiedld
+     2	Brent Gardner
+     2	Bruce Ritchie
+     2	Marco Neumann
+     2	Matthew Kim
+     2	Sherin Jacob
+     2	Thearas
+     2	Tim Saucer
+     2	miro
+     2	xudong.w
+     2	张林伟
+     1	Ajeeta Asthana
+     1	Alex Huang
+     1	Andrey Koshchiy
+     1	Andy Grove
+     1	Blake Orth
+     1	Christian van der Loo
+     1	Colin Marc
+     1	Corwin Joy
+     1	David Hewitt
+     1	David López
+     1	Dima
+     1	Eugene Tolbakov
+     1	Evgenii Glotov
+     1	GPK
+     1	Gabriel
+     1	Geetansh Juneja
+     1	Geoffrey Claude
+     1	Haresh Khanna
+     1	Jack Kleeman
+     1	Jax Liu
+     1	Jensen
+     1	LB7666
+     1	Loakesh Indiran
+     1	Marko Milenković
+     1	Matt Matravers
+     1	Nga Tran
+     1	Parth Chandra
+     1	Ronit Thummaluru
+     1	Shehab Amin
+     1	Shiv Bhatia
+     1	Shruti Sharma
+     1	Simon Vandel Sillesen
+     1	Stuart Carnie
+     1	Tobias Schwarzinger
+     1	Yuhan Wang
+     1	ZC
+     1	Zhen Wang
+     1	aditya singh rathore
+     1	ayemjay
+     1	delamarch3
+     1	mwish
+     1	theirix
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/50.1.0.md b/dev/changelog/50.1.0.md
new file mode 100644
index 0000000000000..e4ead4cb456c3
--- /dev/null
+++ b/dev/changelog/50.1.0.md
@@ -0,0 +1,47 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 50.1.0 Changelog
+
+This release consists of 4 commits from 4 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Other:**
+
+- [branch-50] fix: ignore non-existent columns when adding filter equivalence info in FileScanConfig (#17546) [#17600](https://github.com/apache/datafusion/pull/17600) (rkrishn7)
+- [branch-50] fix: Ensure the CachedParquetFileReader respects the metadata prefetch hint (#17302) [#17613](https://github.com/apache/datafusion/pull/17613) (shehabgamin)
+- [branch-50] Partial AggregateMode will generate duplicate field names which will fail DFSchema construct to branch-50 [#17717](https://github.com/apache/datafusion/pull/17717) (zhuqi-lucas)
+- [branch-50]: feat: expose `udafs` and `udwfs` methods on `FunctionRegistry` (#17650) [#17725](https://github.com/apache/datafusion/pull/17725) (milenkovicm)
+- [branch-50] Backport change to avoid debug symbols in ci builds to 50.0.0 [#17795](https://github.com/apache/datafusion/pull/17795) (alamb)
+- [branch-50] Backport Prevent exponential planning time for Window functions - v2 #17684 [#17778](https://github.com/apache/datafusion/pull/17778) (alamb)
+- [branch-50] Fix potential overflow when we print verbose physical plan [#17804](https://github.com/apache/datafusion/pull/17804) (zhuqi-lucas)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     1	Marko Milenković
+     1	Qi Zhu
+     1	Rohan Krishnaswamy
+     1	Shehab Amin
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/50.2.0.md b/dev/changelog/50.2.0.md
new file mode 100644
index 0000000000000..6d16ace832ab7
--- /dev/null
+++ b/dev/changelog/50.2.0.md
@@ -0,0 +1,43 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 50.2.0 Changelog
+
+This release consists of 3 commits from 1 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Documentation updates:**
+
+- [branch-50] Backport: fix typos & pin action hashes (#17855) [#17892](https://github.com/apache/datafusion/pull/17892) (AdamGS)
+
+**Other:**
+
+- [branch-50] Backport: Fix docs.rs build: Replace auto_doc_cfg with doc_cfg [#17890](https://github.com/apache/datafusion/pull/17890) (AdamGS)
+- [branch-50] Backport: `avg(distinct)` support for decimal types (#17560) [#17885](https://github.com/apache/datafusion/pull/17885) (AdamGS)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     3	Adam Gutglick
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/50.3.0.md b/dev/changelog/50.3.0.md
new file mode 100644
index 0000000000000..49950e00c282d
--- /dev/null
+++ b/dev/changelog/50.3.0.md
@@ -0,0 +1,47 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 50.3.0 Changelog
+
+This release consists of 7 commits from 3 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Other:**
+
+- [branch-50] chore: Fix `no space left on device` [#18141](https://github.com/apache/datafusion/pull/18141) (comphead)
+- [branch-50]: chore: cherry pick `concat` to 50.3.0 [#18128](https://github.com/apache/datafusion/pull/18128) (comphead)
+- [branch-50] Backport Fix bug in LimitPushPastWindows (#18029) [#18107](https://github.com/apache/datafusion/pull/18107) (avantgardnerio)
+- [branch-50] fix(SubqueryAlias): use maybe_project_redundant_column (#17478) [#18130](https://github.com/apache/datafusion/pull/18130) (hareshkh)
+- [branch-50]: fix: Add overflow checks to SparkDateAdd/Sub to avoid panics (#18013) [#18131](https://github.com/apache/datafusion/pull/18131) (hareshkh)
+- [branch-50] Fix ambiguous column names in substrait conversion #17299 [#18077](https://github.com/apache/datafusion/pull/18077) (hareshkh)
+- [branch-50] Extend datatype semantic equality check to include timestamps (#17777) [#18129](https://github.com/apache/datafusion/pull/18129) (hareshkh)
+- [branch-50] perf: Fix NLJ slow join with condition `array_has` (#18161) [#18179](https://github.com/apache/datafusion/pull/18179) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     4	Haresh Khanna
+     2	Oleks V
+     1	Brent Gardner
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/51.0.0.md b/dev/changelog/51.0.0.md
new file mode 100644
index 0000000000000..60dd24cde5595
--- /dev/null
+++ b/dev/changelog/51.0.0.md
@@ -0,0 +1,717 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 51.0.0 Changelog
+
+This release consists of 537 commits from 129 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Breaking changes:**
+
+- Introduce `TypeSignatureClass::Binary` to allow accepting arbitrarily sized `FixedSizeBinary` arguments [#17531](https://github.com/apache/datafusion/pull/17531) (Jefffrey)
+- feat: change `datafusion-proto` to use `TaskContext` rather than`SessionContext` for physical plan serialization [#17601](https://github.com/apache/datafusion/pull/17601) (milenkovicm)
+- chore: refactor usage of `reassign_predicate_columns` [#17703](https://github.com/apache/datafusion/pull/17703) (rkrishn7)
+- fix: correct edge case where null haystack returns false instead of null [#17818](https://github.com/apache/datafusion/pull/17818) (Jefffrey)
+- clean up duplicate information in FileOpener trait [#17956](https://github.com/apache/datafusion/pull/17956) (adriangb)
+- refactor : deprecate `ParquetSource::predicate()` and merge into `FileSource::filter()` [#17971](https://github.com/apache/datafusion/pull/17971) (getChan)
+- feat: convert_array_to_scalar_vec respects null elements [#17891](https://github.com/apache/datafusion/pull/17891) (vegarsti)
+- make Union::try_new pub [#18125](https://github.com/apache/datafusion/pull/18125) (leoyvens)
+- refactor: remove unused `type_coercion/aggregate.rs` functions [#18091](https://github.com/apache/datafusion/pull/18091) (Jefffrey)
+- refactor: remove core crate from datafusion-proto [#18123](https://github.com/apache/datafusion/pull/18123) (timsaucer)
+- Use TableSchema in FileScanConfig [#18231](https://github.com/apache/datafusion/pull/18231) (adriangb)
+- Enable placeholders with extension types [#17986](https://github.com/apache/datafusion/pull/17986) (paleolimbot)
+- Implement `DESCRIBE SELECT` to show schema rather than `EXPLAIN` plan [#18238](https://github.com/apache/datafusion/pull/18238) (djanderson)
+- Push partition_statistics into DataSource [#18233](https://github.com/apache/datafusion/pull/18233) (adriangb)
+- Let `FileScanConfig` own a list of `ProjectionExpr`s [#18253](https://github.com/apache/datafusion/pull/18253) (friendlymatthew)
+- Introduce `expr_fields` to `AccumulatorArgs` to hold input argument fields [#18100](https://github.com/apache/datafusion/pull/18100) (Jefffrey)
+- Rename `is_ordered_set_aggregate` to `supports_within_group_clause` for UDAFs [#18397](https://github.com/apache/datafusion/pull/18397) (Jefffrey)
+- Move generate_series projection logic into LazyMemoryStream [#18373](https://github.com/apache/datafusion/pull/18373) (mkleen)
+
+**Performance related:**
+
+- Improve `Hash` and `Ord` speed for `dyn LogicalType` [#17437](https://github.com/apache/datafusion/pull/17437) (findepi)
+- Faster `&&String::to_string` [#17583](https://github.com/apache/datafusion/pull/17583) (findepi)
+- perf: Simplify CASE for any WHEN TRUE [#17602](https://github.com/apache/datafusion/pull/17602) (petern48)
+- perf: Improve the performance of WINDOW functions with many partitions [#17528](https://github.com/apache/datafusion/pull/17528) (nuno-faria)
+- Avoid redundant Schema clones [#17643](https://github.com/apache/datafusion/pull/17643) (findepi)
+- Prevent exponential planning time for Window functions - v2 [#17684](https://github.com/apache/datafusion/pull/17684) (berkaysynnada)
+- Add case expr simplifiers for literal comparisons [#17743](https://github.com/apache/datafusion/pull/17743) (jackkleeman)
+- Enable Projection Pushdown Optimization for Recursive CTEs [#16696](https://github.com/apache/datafusion/pull/16696) (kosiew)
+- perf: Optimize CASE for any WHEN false [#17835](https://github.com/apache/datafusion/pull/17835) (petern48)
+- feat: Simplify `NOT(IN ..)` to `NOT IN` and `NOT (EXISTS ..)` to `NOT EXISTS` [#17848](https://github.com/apache/datafusion/pull/17848) (Tpt)
+- perf: Faster `string_agg()` aggregate function (1000x speed for no DISTINCT and ORDER case) [#17837](https://github.com/apache/datafusion/pull/17837) (2010YOUY01)
+- optimizer: allow projection pushdown through aliased recursive CTE references [#17875](https://github.com/apache/datafusion/pull/17875) (kosiew)
+- perf: Implement boolean group values [#17726](https://github.com/apache/datafusion/pull/17726) (ashdnazg)
+- #17838 Rewrite `regexp_like` calls as `~` and `*~` operator expressions when possible [#17839](https://github.com/apache/datafusion/pull/17839) (pepijnve)
+- perf: add to `aggregate_vectorized` bench benchmark for `PrimitiveGroupValueBuilder` as well [#17930](https://github.com/apache/datafusion/pull/17930) (rluvaton)
+- #17972 Restore case expr/expr optimisation while ensuring lazy evaluation [#17973](https://github.com/apache/datafusion/pull/17973) (pepijnve)
+- chore: use `NullBuffer::union` for Spark `concat` [#18087](https://github.com/apache/datafusion/pull/18087) (comphead)
+- Short circuit complex case evaluation modes as soon as possible [#17898](https://github.com/apache/datafusion/pull/17898) (pepijnve)
+- perf: Fix NLJ slow join with condition `array_has` [#18161](https://github.com/apache/datafusion/pull/18161) (2010YOUY01)
+- perf: improve `ScalarValue::to_array_of_size` for Boolean and some null values [#18180](https://github.com/apache/datafusion/pull/18180) (rluvaton)
+- Allow filter pushdown through AggregateExec [#18404](https://github.com/apache/datafusion/pull/18404) (LiaCastaneda)
+- Avoid scatter operation in `ExpressionOrExpression` case evaluation method [#18444](https://github.com/apache/datafusion/pull/18444) (pepijnve)
+
+**Implemented enhancements:**
+
+- feat: Implement `DFSchema.print_schema_tree()` method [#17459](https://github.com/apache/datafusion/pull/17459) (comphead)
+- feat(spark): implement Spark `length` function [#17475](https://github.com/apache/datafusion/pull/17475) (wForget)
+- feat: Add binary to `join_fuzz` testing [#17497](https://github.com/apache/datafusion/pull/17497) (jonathanc-n)
+- feat: Support log for Decimal128 and Decimal256 [#17023](https://github.com/apache/datafusion/pull/17023) (theirix)
+- feat(spark): implement Spark bitwise function shiftleft/shiftright/shiftrightunsighed [#17013](https://github.com/apache/datafusion/pull/17013) (chenkovsky)
+- feat: Ensure explain format in config is valid [#17549](https://github.com/apache/datafusion/pull/17549) (Weijun-H)
+- feat: Simplify CASE WHEN true THEN expr to expr [#17450](https://github.com/apache/datafusion/pull/17450) (EeshanBembi)
+- feat: add `sql` feature to make sql planning optional [#17332](https://github.com/apache/datafusion/pull/17332) (timsaucer)
+- feat: Add `OR REPLACE` to creating external tables [#17580](https://github.com/apache/datafusion/pull/17580) (jonathanc-n)
+- feat(substrait): add support for RightAnti and RightSemi join types [#17604](https://github.com/apache/datafusion/pull/17604) (bvolpato)
+- feat(small): Display `NullEquality` in join executor's `EXPLAIN` output [#17664](https://github.com/apache/datafusion/pull/17664) (2010YOUY01)
+- feat(substrait): add time literal support [#17655](https://github.com/apache/datafusion/pull/17655) (bvolpato)
+- feat(spark): implement Spark `make_interval` function [#17424](https://github.com/apache/datafusion/pull/17424) (davidlghellin)
+- feat: expose `udafs` and `udwfs` methods on `FunctionRegistry` [#17650](https://github.com/apache/datafusion/pull/17650) (milenkovicm)
+- feat: Support Seconds and Milliseconds literals in substrait [#17707](https://github.com/apache/datafusion/pull/17707) (petern48)
+- feat: support for null, date, and timestamp types in approx_distinct [#17618](https://github.com/apache/datafusion/pull/17618) (killme2008)
+- feat: support `Utf8View` for more args of `regexp_replace` [#17195](https://github.com/apache/datafusion/pull/17195) (mbutrovich)
+- feat(spark): implement Spark `map` function `map_from_arrays` [#17456](https://github.com/apache/datafusion/pull/17456) (SparkApplicationMaster)
+- feat: Display window function's alias name in output column [#17788](https://github.com/apache/datafusion/pull/17788) (devampatel03)
+- feat(spark): implement Spark `make_dt_interval` function [#17728](https://github.com/apache/datafusion/pull/17728) (davidlghellin)
+- feat: support multi-threaded writing of Parquet files with modular encryption [#16738](https://github.com/apache/datafusion/pull/16738) (rok)
+- feat(spark): implement Spark `map` function `map_from_entries` [#17779](https://github.com/apache/datafusion/pull/17779) (SparkApplicationMaster)
+- feat: Add Hash Join benchmarks [#17636](https://github.com/apache/datafusion/pull/17636) (jonathanc-n)
+- feat: Support swap for `RightMark` Join [#17651](https://github.com/apache/datafusion/pull/17651) (jonathanc-n)
+- feat: support spark udf format_string [#17561](https://github.com/apache/datafusion/pull/17561) (chenkovsky)
+- feat(spark): implement Spark `try_parse_url` function [#17485](https://github.com/apache/datafusion/pull/17485) (rafafrdz)
+- feat: Support reading CSV files with inconsistent column counts [#17553](https://github.com/apache/datafusion/pull/17553) (EeshanBembi)
+- feat: Adds Instrumented Object Store Registry to datafusion-cli [#17953](https://github.com/apache/datafusion/pull/17953) (BlakeOrth)
+- feat: add cargo-machete in CI [#18030](https://github.com/apache/datafusion/pull/18030) (Weijun-H)
+- feat(spark): implement Spark `elt` function [#17729](https://github.com/apache/datafusion/pull/17729) (davidlghellin)
+- feat: support Spark `concat` string function [#18063](https://github.com/apache/datafusion/pull/18063) (comphead)
+- feat: support `null_treatment`, `distinct`, and `filter` for window functions in proto [#18024](https://github.com/apache/datafusion/pull/18024) (dqkqd)
+- feat: Add percentile_cont aggregate function [#17988](https://github.com/apache/datafusion/pull/17988) (adriangb)
+- feat: spark udf array shuffle [#17674](https://github.com/apache/datafusion/pull/17674) (chenkovsky)
+- feat: Support configurable `EXPLAIN ANALYZE` detail level [#18098](https://github.com/apache/datafusion/pull/18098) (2010YOUY01)
+- feat: add fp16 support to Substrait [#18086](https://github.com/apache/datafusion/pull/18086) (westonpace)
+- feat: `ClassicJoin` for PWMJ [#17482](https://github.com/apache/datafusion/pull/17482) (jonathanc-n)
+- feat(docs): display compatible logo for dark mode [#18197](https://github.com/apache/datafusion/pull/18197) (foskey51)
+- feat: Add `deregister_object_store` [#17999](https://github.com/apache/datafusion/pull/17999) (jonathanc-n)
+- feat: Add existence join to NestedLoopJoin benchmarks [#18005](https://github.com/apache/datafusion/pull/18005) (jonathanc-n)
+- feat(small): Set 'summary' level metrics for `DataSourceExec` with parquet source [#18196](https://github.com/apache/datafusion/pull/18196) (2010YOUY01)
+- feat: be indifferent to padding when decoding base64 [#18264](https://github.com/apache/datafusion/pull/18264) (colinmarc)
+- feat: Add `output_bytes` to baseline metrics [#18268](https://github.com/apache/datafusion/pull/18268) (2010YOUY01)
+- feat: Introduce `PruningMetrics` and use it in parquet file pruning metric [#18297](https://github.com/apache/datafusion/pull/18297) (2010YOUY01)
+- feat: Improve metrics for aggregate streams. [#18325](https://github.com/apache/datafusion/pull/18325) (EmilyMatt)
+- feat: allow pushdown of dynamic filters having partition cols [#18172](https://github.com/apache/datafusion/pull/18172) (feniljain)
+- feat: support temporary views in DataFrameTableProvider [#18158](https://github.com/apache/datafusion/pull/18158) (r1b)
+- feat: Better parquet row-group/page pruning metrics display [#18321](https://github.com/apache/datafusion/pull/18321) (2010YOUY01)
+- feat: Add Hash trait to StatsType enum [#18382](https://github.com/apache/datafusion/pull/18382) (rluvaton)
+- feat: support get_field for map literal [#18371](https://github.com/apache/datafusion/pull/18371) (chenkovsky)
+- feat(docs): enable navbar [#18324](https://github.com/apache/datafusion/pull/18324) (foskey51)
+- feat: Add `selectivity` metrics to `FilterExec` [#18406](https://github.com/apache/datafusion/pull/18406) (2010YOUY01)
+- feat: Add `reduction_factor` metric to `AggregateExec` for EXPLAIN ANALYZE [#18455](https://github.com/apache/datafusion/pull/18455) (petern48)
+- feat: support named arguments for aggregate and window udfs [#18389](https://github.com/apache/datafusion/pull/18389) (bubulalabu)
+- feat: Add selectivity metric to NestedLoopJoinExec for EXPLAIN ANALYZE [#18481](https://github.com/apache/datafusion/pull/18481) (petern48)
+
+**Fixed bugs:**
+
+- fix: lazy evaluation for coalesce [#17357](https://github.com/apache/datafusion/pull/17357) (chenkovsky)
+- fix: Implement AggregateUDFImpl::reverse_expr for StringAgg [#17165](https://github.com/apache/datafusion/pull/17165) (nuno-faria)
+- fix: Support aggregate expressions in `QUALIFY` [#17313](https://github.com/apache/datafusion/pull/17313) (rkrishn7)
+- fix: synchronize partition bounds reporting in HashJoin [#17452](https://github.com/apache/datafusion/pull/17452) (rkrishn7)
+- fix: correct typos in `CONTRIBUTING.md` [#17507](https://github.com/apache/datafusion/pull/17507) (Weijun-H)
+- fix: Add AWS environment variable checks for S3 tests [#17519](https://github.com/apache/datafusion/pull/17519) (Weijun-H)
+- fix: Ensure the CachedParquetFileReader respects the metadata prefetch hint [#17302](https://github.com/apache/datafusion/pull/17302) (nuno-faria)
+- fix: prevent UnionExec panic with empty inputs [#17449](https://github.com/apache/datafusion/pull/17449) (EeshanBembi)
+- fix: ignore non-existent columns when adding filter equivalence info in `FileScanConfig` [#17546](https://github.com/apache/datafusion/pull/17546) (rkrishn7)
+- fix: Prevent duplicate expressions in DynamicPhysicalExpr [#17551](https://github.com/apache/datafusion/pull/17551) (UBarney)
+- fix: `SortExec` `TopK` OOM [#17622](https://github.com/apache/datafusion/pull/17622) (nuno-faria)
+- fix: Change `OuterReferenceColumn` to contain the entire outer field to prevent metadata loss [#17524](https://github.com/apache/datafusion/pull/17524) (Kontinuation)
+- fix: Preserves field metadata when creating logical plan for VALUES expression [#17525](https://github.com/apache/datafusion/pull/17525) (Kontinuation)
+- fix: Ignore governance doc from typos [#17678](https://github.com/apache/datafusion/pull/17678) (rkrishn7)
+- fix: null padding for `array_reverse` on `FixedSizeList` [#17673](https://github.com/apache/datafusion/pull/17673) (chenkovsky)
+- fix: correct statistics for `NestedLoopJoinExec` [#17680](https://github.com/apache/datafusion/pull/17680) (duongcongtoai)
+- fix: Partial AggregateMode will generate duplicate field names which will fail DFSchema construct [#17706](https://github.com/apache/datafusion/pull/17706) (zhuqi-lucas)
+- fix: Remove parquet encryption feature from root deps [#17700](https://github.com/apache/datafusion/pull/17700) (Vyquos)
+- fix: Remove datafusion-macros's dependency on datafusion-expr [#17688](https://github.com/apache/datafusion/pull/17688) (yutannihilation)
+- fix: Filter out nulls properly in approx_percentile_cont_with_weight [#17780](https://github.com/apache/datafusion/pull/17780) (Jefffrey)
+- fix: ignore `DataType::Null` in possible types during csv type inference [#17796](https://github.com/apache/datafusion/pull/17796) (dqkqd)
+- fix: `ParquetSource` - `with_predicate()` don't have to reset metrics [#17858](https://github.com/apache/datafusion/pull/17858) (2010YOUY01)
+- fix: optimizer `common_sub_expression_eliminate` fails in a window function [#17852](https://github.com/apache/datafusion/pull/17852) (dqkqd)
+- fix: fix failing test compilation on main [#17955](https://github.com/apache/datafusion/pull/17955) (Jefffrey)
+- fix: update `PrimitiveGroupValueBuilder` to match NaN correctly in scalar `equal_to` [#17979](https://github.com/apache/datafusion/pull/17979) (rluvaton)
+- fix: Add overflow checks to SparkDateAdd/Sub to avoid panics [#18013](https://github.com/apache/datafusion/pull/18013) (andygrove)
+- fix: Ensure ListingTable partitions are pruned when filters are not used [#17958](https://github.com/apache/datafusion/pull/17958) (peasee)
+- fix: Improve null handling in array_to_string function [#18076](https://github.com/apache/datafusion/pull/18076) (Weijun-H)
+- fix: Re-bump latest datafusion-testing module so extended tests succeed [#18110](https://github.com/apache/datafusion/pull/18110) (Jefffrey)
+- fix: window unparsing [#17367](https://github.com/apache/datafusion/pull/17367) (chenkovsky)
+- fix: Add dictionary coercion support for numeric comparison operations [#18099](https://github.com/apache/datafusion/pull/18099) (ahmed-mez)
+- fix(substrait): schema errors for Aggregates with no groupings [#17909](https://github.com/apache/datafusion/pull/17909) (vbarua)
+- fix: `array_distinct` inner nullability causing type mismatch [#18104](https://github.com/apache/datafusion/pull/18104) (dqkqd)
+- fix: improve document ui [#18157](https://github.com/apache/datafusion/pull/18157) (getChan)
+- fix(docs): resolve extra outline on tables [#18193](https://github.com/apache/datafusion/pull/18193) (foskey51)
+- fix: Use dynamic timezone in now() function for accurate timestamp [#18017](https://github.com/apache/datafusion/pull/18017) (Weijun-H)
+- fix: UnnestExec preserves relevant equivalence properties of input [#16985](https://github.com/apache/datafusion/pull/16985) (vegarsti)
+- fix: wrong simplification for >= >, <= < [#18222](https://github.com/apache/datafusion/pull/18222) (chenkovsky)
+- fix: only fall back to listing prefixes on 404 errors [#18263](https://github.com/apache/datafusion/pull/18263) (colinmarc)
+- fix: Support Dictionary[Int32, Binary] for bitmap count spark function [#18273](https://github.com/apache/datafusion/pull/18273) (kazantsev-maksim)
+- fix: support float16 for `abs()` [#18304](https://github.com/apache/datafusion/pull/18304) (Jefffrey)
+- fix: Add WITH ORDER display in information_schema.views [#18282](https://github.com/apache/datafusion/pull/18282) (gene-bordegaray)
+- fix: correct date_trunc for times before the epoch [#18356](https://github.com/apache/datafusion/pull/18356) (mhilton)
+- fix: Preserve percent-encoding in `PartitionedFile` paths during deserialization [#18346](https://github.com/apache/datafusion/pull/18346) (lonless9)
+- fix: SortPreservingMerge sanity check rejects valid ORDER BY with CASE expression [#18342](https://github.com/apache/datafusion/pull/18342) (watford-ep)
+- fix: `DataFrame::select_columns` and `DataFrame::drop_columns` for qualified duplicated field names [#18236](https://github.com/apache/datafusion/pull/18236) (dqkqd)
+- fix(docs): remove navbar padding breaking ui on mobile [#18402](https://github.com/apache/datafusion/pull/18402) (foskey51)
+- fix: null cast not valid in substrait round trip [#18414](https://github.com/apache/datafusion/pull/18414) (gene-bordegaray)
+- fix: map benchmark failing [#18469](https://github.com/apache/datafusion/pull/18469) (randyli)
+- fix: eliminate warning when building without sql feature [#18480](https://github.com/apache/datafusion/pull/18480) (corasaurus-hex)
+- fix: spark array return type mismatch when inner data type is LargeList [#18485](https://github.com/apache/datafusion/pull/18485) (jizezhang)
+- fix: shuffle seed [#18518](https://github.com/apache/datafusion/pull/18518) (chenkovsky)
+
+**Documentation updates:**
+
+- Auto detect hive column partitioning with ListingTableFactory / `CREATE EXTERNAL TABLE` [#17232](https://github.com/apache/datafusion/pull/17232) (BlakeOrth)
+- Rename Blaze to Auron [#17532](https://github.com/apache/datafusion/pull/17532) (merrily01)
+- Revert #17295 (Support from-first SQL syntax) [#17520](https://github.com/apache/datafusion/pull/17520) (adriangb)
+- minor: Update doc comments on type signature [#17556](https://github.com/apache/datafusion/pull/17556) (Jefffrey)
+- docs: Update documentation on Epics and Supervising Maintainers [#17505](https://github.com/apache/datafusion/pull/17505) (alamb)
+- docs: Move Google Summer of Code 2025 pages to a section [#17504](https://github.com/apache/datafusion/pull/17504) (alamb)
+- Upgrade to arrow 56.1.0 [#17275](https://github.com/apache/datafusion/pull/17275) (alamb)
+- docs: add xorq to list of known users [#17668](https://github.com/apache/datafusion/pull/17668) (dlovell)
+- docs: deduplicate links in `introduction.md` [#17669](https://github.com/apache/datafusion/pull/17669) (Jefffrey)
+- Add explicit PMC/committers list to governance docs page [#17574](https://github.com/apache/datafusion/pull/17574) (alamb)
+- chore: Update READMEs of crates to be more consistent [#17691](https://github.com/apache/datafusion/pull/17691) (Jefffrey)
+- chore: fix wasm-pack installation link in wasmtest README [#17704](https://github.com/apache/datafusion/pull/17704) (Jefffrey)
+- docs: Remove disclaimer that `datafusion` 50.0.0 is not released [#17695](https://github.com/apache/datafusion/pull/17695) (nuno-faria)
+- Bump MSRV to 1.87.0 [#17724](https://github.com/apache/datafusion/pull/17724) (findepi)
+- docs: Fix 'Clicking a link in optimizer docs downloads the file instead of redirecting to github' [#17723](https://github.com/apache/datafusion/pull/17723) (petern48)
+- Move misplaced upgrading entry about MSRV [#17727](https://github.com/apache/datafusion/pull/17727) (findepi)
+- Introduce `avg_distinct()` and `sum_distinct()` functions to DataFrame API [#17536](https://github.com/apache/datafusion/pull/17536) (Jefffrey)
+- Support `WHERE`, `ORDER BY`, `LIMIT`, `SELECT`, `EXTEND` pipe operators [#17278](https://github.com/apache/datafusion/pull/17278) (simonvandel)
+- doc: add missing examples for multiple math functions [#17018](https://github.com/apache/datafusion/pull/17018) (Adez017)
+- chore: remove homebrew publish instructions from release steps [#17735](https://github.com/apache/datafusion/pull/17735) (Jefffrey)
+- Improve documentation for ordered set aggregate functions [#17744](https://github.com/apache/datafusion/pull/17744) (alamb)
+- docs: fix sidebar overlapping table on configuration page on website [#17738](https://github.com/apache/datafusion/pull/17738) (saimahendra282)
+- docs: add Ballista link to landing page (#17746) [#17775](https://github.com/apache/datafusion/pull/17775) (Nihallllll)
+- [DOCS] Add dbt Fusion engine and R2 Query Engine to "Known Users" [#17793](https://github.com/apache/datafusion/pull/17793) (dataders)
+- docs: update wasmtest README with instructions for Apple silicon [#17755](https://github.com/apache/datafusion/pull/17755) (Jefffrey)
+- docs: Add SedonaDB as known user of Apache DataFusion [#17806](https://github.com/apache/datafusion/pull/17806) (petern48)
+- minor: simplify docs build process & pin pip package versions [#17816](https://github.com/apache/datafusion/pull/17816) (Jefffrey)
+- Cleanup user guide known users section [#17834](https://github.com/apache/datafusion/pull/17834) (blaginin)
+- Fix the doc about row_groups pruning metrics in explain_usage.md [#17846](https://github.com/apache/datafusion/pull/17846) (xudong963)
+- Fix docs.rs build: Replace `auto_doc_cfg` with `doc_cfg` [#17845](https://github.com/apache/datafusion/pull/17845) (mbrobbel)
+- docs: Add rerun.io to known users guide [#17825](https://github.com/apache/datafusion/pull/17825) (alamb)
+- chore: fix typos & pin action hashes [#17855](https://github.com/apache/datafusion/pull/17855) (Jefffrey)
+- Clarify email reply instructions for invitations [#17851](https://github.com/apache/datafusion/pull/17851) (rluvaton)
+- Add missing parenthesis in features documentation [#17869](https://github.com/apache/datafusion/pull/17869) (Viicos)
+- Improve comments for DataSinkExec [#17873](https://github.com/apache/datafusion/pull/17873) (xudong963)
+- minor: Make `FunctionRegistry` `udafs` and `udwfs` methods mandatory [#17847](https://github.com/apache/datafusion/pull/17847) (milenkovicm)
+- docs: Improve documentation for FunctionFactory / CREATE FUNCTION [#17859](https://github.com/apache/datafusion/pull/17859) (alamb)
+- Support `AS`, `UNION`, `INTERSECTION`, `EXCEPT`, `AGGREGATE` pipe operators [#17312](https://github.com/apache/datafusion/pull/17312) (simonvandel)
+- [forward port] Change version to 50.1.0 and add changelog (#17748) [#17826](https://github.com/apache/datafusion/pull/17826) (alamb)
+- chore(deps): bump maturin from 1.9.4 to 1.9.5 in /docs [#17940](https://github.com/apache/datafusion/pull/17940) (dependabot[bot])
+- docs: `Window::try_new_with_schema` with a descriptive error message [#17926](https://github.com/apache/datafusion/pull/17926) (dqkqd)
+- Support `JOIN` pipe operator [#17969](https://github.com/apache/datafusion/pull/17969) (simonvandel)
+- Adds Object Store Profiling options/commands to CLI [#18004](https://github.com/apache/datafusion/pull/18004) (BlakeOrth)
+- docs: typo in `working-with-exprs.md` [#18033](https://github.com/apache/datafusion/pull/18033) (Weijun-H)
+- chore(deps): bump maturin from 1.9.5 to 1.9.6 in /docs [#18039](https://github.com/apache/datafusion/pull/18039) (dependabot[bot])
+- [forward port] Change version to 50.2.0 and add changelog [#18057](https://github.com/apache/datafusion/pull/18057) (xudong963)
+- Update committers on governance page [#18015](https://github.com/apache/datafusion/pull/18015) (alamb)
+- Feat: Make current_date aware of execution timezone. [#18034](https://github.com/apache/datafusion/pull/18034) (codetyri0n)
+- Add independent configs for topk/join dynamic filter [#18090](https://github.com/apache/datafusion/pull/18090) (xudong963)
+- Adds Trace and Summary to CLI instrumented stores [#18064](https://github.com/apache/datafusion/pull/18064) (BlakeOrth)
+- refactor: add dialect enum [#18043](https://github.com/apache/datafusion/pull/18043) (dariocurr)
+- #17982 Make `nvl` a thin wrapper for `coalesce` [#17991](https://github.com/apache/datafusion/pull/17991) (pepijnve)
+- minor: fix incorrect deprecation version & window docs [#18093](https://github.com/apache/datafusion/pull/18093) (Jefffrey)
+- Adding hiop as known user [#18114](https://github.com/apache/datafusion/pull/18114) (enryls)
+- Improve datafusion-cli object store profiling summary display [#18085](https://github.com/apache/datafusion/pull/18085) (alamb)
+- Feat: Make current_time aware of execution timezone. [#18040](https://github.com/apache/datafusion/pull/18040) (codetyri0n)
+- Docs: Update SQL example for current_time() and current_date(). [#18200](https://github.com/apache/datafusion/pull/18200) (codetyri0n)
+- doc: Add `Metrics` section to the user-guide [#18216](https://github.com/apache/datafusion/pull/18216) (2010YOUY01)
+- docs: Update HOWTOs for adding new functions [#18089](https://github.com/apache/datafusion/pull/18089) (Jefffrey)
+- docs: fix trim for `rust,ignore` blocks [#18239](https://github.com/apache/datafusion/pull/18239) (Jefffrey)
+- docs: refine `AggregateUDFImpl::is_ordered_set_aggregate` documentation [#17805](https://github.com/apache/datafusion/pull/17805) (Jefffrey)
+- docs: fix broken SQL & DataFrame links in root README (#18153) [#18274](https://github.com/apache/datafusion/pull/18274) (manasa-manoj-nbr)
+- doc: Contributor guide for AI-generated PRs [#18237](https://github.com/apache/datafusion/pull/18237) (2010YOUY01)
+- doc: Add Join Physical Plan documentation, and configuration flag to benchmarks [#18209](https://github.com/apache/datafusion/pull/18209) (jonathanc-n)
+- "Gentle Introduction to Arrow / Record Batches" #11336 [#18051](https://github.com/apache/datafusion/pull/18051) (sm4rtm4art)
+- Upgrade DataFusion to arrow/parquet 57.0.0 [#17888](https://github.com/apache/datafusion/pull/17888) (alamb)
+- Deduplicate range/gen_series nested functions code [#18198](https://github.com/apache/datafusion/pull/18198) (Jefffrey)
+- minor: doc fixes for timestamp output format [#18315](https://github.com/apache/datafusion/pull/18315) (Jefffrey)
+- Add PostgreSQL-style named arguments support for scalar functions [#18019](https://github.com/apache/datafusion/pull/18019) (bubulalabu)
+- Change default prefetch_hint to 512Kb to reduce number of object store requests when reading parquet files [#18160](https://github.com/apache/datafusion/pull/18160) (zhuqi-lucas)
+- Bump MSRV to 1.88.0 [#18403](https://github.com/apache/datafusion/pull/18403) (harshasiddartha)
+- Change default `time_zone` to `None` (was `"+00:00"`) [#18359](https://github.com/apache/datafusion/pull/18359) (Omega359)
+- Fix instances of "the the" to be "the" in comments/docs [#18478](https://github.com/apache/datafusion/pull/18478) (corasaurus-hex)
+- Update roadmap links for DataFusion Q1 2026 [#18495](https://github.com/apache/datafusion/pull/18495) (alamb)
+- Add a SpillingPool to manage collections of spill files [#18207](https://github.com/apache/datafusion/pull/18207) (adriangb)
+- [branch-51] Update version to 51.0.0, add Changelog [#18551](https://github.com/apache/datafusion/pull/18551) (alamb)
+- [branch-51] Revert rewrite for coalesce, `nvl` and `nvl2` simplification [#18567](https://github.com/apache/datafusion/pull/18567) (alamb)
+
+**Other:**
+
+- Extract complex default impls from AggregateUDFImpl trait [#17391](https://github.com/apache/datafusion/pull/17391) (findepi)
+- chore: make `TableFunction` clonable [#17457](https://github.com/apache/datafusion/pull/17457) (sunng87)
+- chore(deps): bump wasm-bindgen-test from 0.3.50 to 0.3.51 [#17470](https://github.com/apache/datafusion/pull/17470) (dependabot[bot])
+- chore(deps): bump log from 0.4.27 to 0.4.28 [#17471](https://github.com/apache/datafusion/pull/17471) (dependabot[bot])
+- Support csv truncated rows in datafusion [#17465](https://github.com/apache/datafusion/pull/17465) (zhuqi-lucas)
+- chore(deps): bump indexmap from 2.11.0 to 2.11.1 [#17484](https://github.com/apache/datafusion/pull/17484) (dependabot[bot])
+- chore(deps): bump chrono from 0.4.41 to 0.4.42 [#17483](https://github.com/apache/datafusion/pull/17483) (dependabot[bot])
+- Improve `PartialEq`, `Eq` speed for `LexOrdering`, make `PartialEq` and `PartialOrd` consistent [#17442](https://github.com/apache/datafusion/pull/17442) (findepi)
+- Fix array types coercion: preserve child element nullability for list types [#17306](https://github.com/apache/datafusion/pull/17306) (sgrebnov)
+- better preserve statistics when applying limits [#17381](https://github.com/apache/datafusion/pull/17381) (adriangb)
+- Refactor HashJoinExec to progressively accumulate dynamic filter bounds instead of computing them after data is accumulated [#17444](https://github.com/apache/datafusion/pull/17444) (adriangb)
+- Fix `PartialOrd` for logical plan nodes and expressions [#17438](https://github.com/apache/datafusion/pull/17438) (findepi)
+- chore(deps): bump sqllogictest from 0.28.3 to 0.28.4 [#17500](https://github.com/apache/datafusion/pull/17500) (dependabot[bot])
+- chore(deps): bump tempfile from 3.21.0 to 3.22.0 [#17499](https://github.com/apache/datafusion/pull/17499) (dependabot[bot])
+- refactor: Move `SMJ` tests into own file [#17495](https://github.com/apache/datafusion/pull/17495) (jonathanc-n)
+- move MinAggregator and MaxAggregator to functions-aggregate-common [#17492](https://github.com/apache/datafusion/pull/17492) (adriangb)
+- Update datafusion-testing pin to update expected output for extended tests [#17490](https://github.com/apache/datafusion/pull/17490) (alamb)
+- update physical-plan to use datafusion-functions-aggregate-common for Min/MaxAccumulator [#17502](https://github.com/apache/datafusion/pull/17502) (adriangb)
+- bug: Always use 'indent' format for explain verbose [#17481](https://github.com/apache/datafusion/pull/17481) (petern48)
+- Fix ambiguous column names in substrait conversion as a result of literals having the same name during conversion. [#17299](https://github.com/apache/datafusion/pull/17299) (xanderbailey)
+- Fix NULL Arithmetic Handling for Numerical Operators in Type Coercion [#17418](https://github.com/apache/datafusion/pull/17418) (etolbakov)
+- Prepare for Merge Queue [#17183](https://github.com/apache/datafusion/pull/17183) (blaginin)
+- bug: Support null as argument to to_local_time [#17491](https://github.com/apache/datafusion/pull/17491) (petern48)
+- Implement timestamp_cast_dtype for SqliteDialect [#17479](https://github.com/apache/datafusion/pull/17479) (krinart)
+- Disable `required_status_checks` for now [#17537](https://github.com/apache/datafusion/pull/17537) (blaginin)
+- Update Bug issue template to use Bug issue type [#17540](https://github.com/apache/datafusion/pull/17540) (findepi)
+- Fix predicate simplification for incompatible types in push_down_filter [#17521](https://github.com/apache/datafusion/pull/17521) (adriangb)
+- Add assertion that ScalarUDFImpl implementation is consistent with declared return type [#17515](https://github.com/apache/datafusion/pull/17515) (findepi)
+- Using `encode_arrow_schema` from arrow-rs. [#17543](https://github.com/apache/datafusion/pull/17543) (samueleresca)
+- Add test for decimal256 and float math [#17530](https://github.com/apache/datafusion/pull/17530) (Jefffrey)
+- Document how schema projection works. [#17250](https://github.com/apache/datafusion/pull/17250) (wiedld)
+- chore(deps): bump rust_decimal from 1.37.2 to 1.38.0 [#17564](https://github.com/apache/datafusion/pull/17564) (dependabot[bot])
+- chore(deps): bump semver from 1.0.26 to 1.0.27 [#17566](https://github.com/apache/datafusion/pull/17566) (dependabot[bot])
+- Generalize struct-to-struct casting with CastOptions and SchemaAdapter integration [#17468](https://github.com/apache/datafusion/pull/17468) (kosiew)
+- Add `TableProvider::scan_with_args` [#17336](https://github.com/apache/datafusion/pull/17336) (adriangb)
+- Use taiki-e/install-action and binstall in CI [#17573](https://github.com/apache/datafusion/pull/17573) (AdamGS)
+- Trying cargo machete to prune unused deps. [#17545](https://github.com/apache/datafusion/pull/17545) (samueleresca)
+- Fix typo in error message in `substring.rs` [#17570](https://github.com/apache/datafusion/pull/17570) (AdamGS)
+- chore(deps): bump taiki-e/install-action from 2.61.5 to 2.61.6 [#17586](https://github.com/apache/datafusion/pull/17586) (dependabot[bot])
+- datafusion/substrait: enable `unicode_expressions` in dev-dependencies to fix substring planning test [#17584](https://github.com/apache/datafusion/pull/17584) (kosiew)
+- chore: replace deprecated UnionExec API [#17588](https://github.com/apache/datafusion/pull/17588) (etolbakov)
+- minor: fix compilation issue for extended tests due to missing parquet encryption flag [#17579](https://github.com/apache/datafusion/pull/17579) (Jefffrey)
+- Update release README for new `datafusion/physical-expr-adapter` crate [#17591](https://github.com/apache/datafusion/pull/17591) (xudong963)
+- chore(deps): bump indexmap from 2.11.1 to 2.11.3 [#17587](https://github.com/apache/datafusion/pull/17587) (dependabot[bot])
+- chore(deps): bump serde_json from 1.0.143 to 1.0.145 [#17585](https://github.com/apache/datafusion/pull/17585) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.61.6 to 2.61.8 [#17615](https://github.com/apache/datafusion/pull/17615) (dependabot[bot])
+- Always run CI checks [#17538](https://github.com/apache/datafusion/pull/17538) (blaginin)
+- Revert "Always run CI checks" [#17629](https://github.com/apache/datafusion/pull/17629) (blaginin)
+- Bump datafusion-testing to latest [#17609](https://github.com/apache/datafusion/pull/17609) (Jefffrey)
+- Use `Display` formatting of `DataType`:s in error messages [#17565](https://github.com/apache/datafusion/pull/17565) (emilk)
+- `avg(distinct)` support for decimal types [#17560](https://github.com/apache/datafusion/pull/17560) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.61.8 to 2.61.9 [#17640](https://github.com/apache/datafusion/pull/17640) (dependabot[bot])
+- chore(deps): bump Swatinem/rust-cache from 2.8.0 to 2.8.1 [#17641](https://github.com/apache/datafusion/pull/17641) (dependabot[bot])
+- Validate the memory consumption in SPM created by multi level merge [#17029](https://github.com/apache/datafusion/pull/17029) (ding-young)
+- fix(SubqueryAlias): use maybe_project_redundant_column [#17478](https://github.com/apache/datafusion/pull/17478) (notfilippo)
+- minor: Ensure `datafusion-sql` package dependencies have `sql` flag [#17644](https://github.com/apache/datafusion/pull/17644) (Jefffrey)
+- optimizer: Rewrite `IS NOT DISTINCT FROM` joins as Hash Joins [#17319](https://github.com/apache/datafusion/pull/17319) (2010YOUY01)
+- chore(deps): bump serde from 1.0.223 to 1.0.225 [#17614](https://github.com/apache/datafusion/pull/17614) (dependabot[bot])
+- chore: Update dynamic filter formatting [#17647](https://github.com/apache/datafusion/pull/17647) (rkrishn7)
+- chore(deps): bump taiki-e/install-action from 2.61.9 to 2.61.10 [#17660](https://github.com/apache/datafusion/pull/17660) (dependabot[bot])
+- proto: don't include parquet feature by default [#17577](https://github.com/apache/datafusion/pull/17577) (jackkleeman)
+- minor: Ensure `proto` crate has datetime & unicode expr flags in datafusion dev dependency [#17656](https://github.com/apache/datafusion/pull/17656) (Jefffrey)
+- chore(deps): bump indexmap from 2.11.3 to 2.11.4 [#17661](https://github.com/apache/datafusion/pull/17661) (dependabot[bot])
+- Support Decimal32/64 types [#17501](https://github.com/apache/datafusion/pull/17501) (AdamGS)
+- minor: Improve hygiene for `datafusion-functions` macros [#17638](https://github.com/apache/datafusion/pull/17638) (Jefffrey)
+- [unparser] Custom timestamp format for DuckDB [#17653](https://github.com/apache/datafusion/pull/17653) (krinart)
+- Support LargeList for array_sort [#17657](https://github.com/apache/datafusion/pull/17657) (Jefffrey)
+- Support FixedSizeList for array_except [#17658](https://github.com/apache/datafusion/pull/17658) (Jefffrey)
+- chore: refactor array fn signatures & add more slt tests [#17672](https://github.com/apache/datafusion/pull/17672) (Jefffrey)
+- Support FixedSizeList for array_to_string [#17666](https://github.com/apache/datafusion/pull/17666) (Jefffrey)
+- minor: add SQLancer fuzzed SLT case for natural joins [#17683](https://github.com/apache/datafusion/pull/17683) (Jefffrey)
+- chore: Upgrade Rust version to 1.90.0 [#17677](https://github.com/apache/datafusion/pull/17677) (rkrishn7)
+- Support FixedSizeList for array_position [#17659](https://github.com/apache/datafusion/pull/17659) (Jefffrey)
+- chore(deps): bump the proto group with 2 updates [#16806](https://github.com/apache/datafusion/pull/16806) (dependabot[bot])
+- chore: update a bunch of dependencies [#17708](https://github.com/apache/datafusion/pull/17708) (Jefffrey)
+- Support FixedSizeList for array_slice via coercion to List [#17667](https://github.com/apache/datafusion/pull/17667) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.61.10 to 2.62.1 [#17710](https://github.com/apache/datafusion/pull/17710) (dependabot[bot])
+- fix(agg/corr): return NULL when variance is zero or samples < 2 [#17621](https://github.com/apache/datafusion/pull/17621) (killme2008)
+- chore(deps): bump taiki-e/install-action from 2.62.1 to 2.62.4 [#17739](https://github.com/apache/datafusion/pull/17739) (dependabot[bot])
+- chore(deps): bump tempfile from 3.22.0 to 3.23.0 [#17741](https://github.com/apache/datafusion/pull/17741) (dependabot[bot])
+- chore: make `LimitPushPastWindows` public [#17736](https://github.com/apache/datafusion/pull/17736) (linhr)
+- minor: create `OptimizerContext` with provided `ConfigOptions` [#17742](https://github.com/apache/datafusion/pull/17742) (MichaelScofield)
+- Add support for calling async UDF as aggregation expression [#17620](https://github.com/apache/datafusion/pull/17620) (simonvandel)
+- chore(deps): bump taiki-e/install-action from 2.62.4 to 2.62.5 [#17750](https://github.com/apache/datafusion/pull/17750) (dependabot[bot])
+- (fix): Lag function creates unwanted projection (#17630) [#17639](https://github.com/apache/datafusion/pull/17639) (renato2099)
+- Support `LargeList` in `array_has` simplification to `InList` [#17732](https://github.com/apache/datafusion/pull/17732) (Jefffrey)
+- chore(deps): bump wasm-bindgen-test from 0.3.51 to 0.3.53 [#17642](https://github.com/apache/datafusion/pull/17642) (dependabot[bot])
+- chore(deps): bump object_store from 0.12.3 to 0.12.4 [#17753](https://github.com/apache/datafusion/pull/17753) (dependabot[bot])
+- Update `arrow` / `parquet` to 56.2.0 [#17631](https://github.com/apache/datafusion/pull/17631) (alamb)
+- chore(deps): bump taiki-e/install-action from 2.62.5 to 2.62.6 [#17766](https://github.com/apache/datafusion/pull/17766) (dependabot[bot])
+- Keep aggregate udaf schema names unique when missing an order-by [#17731](https://github.com/apache/datafusion/pull/17731) (wiedld)
+- feat : Display function alias in output column name [#17690](https://github.com/apache/datafusion/pull/17690) (devampatel03)
+- Support join cardinality estimation less conservatively [#17476](https://github.com/apache/datafusion/pull/17476) (jackkleeman)
+- chore(deps): bump libc from 0.2.175 to 0.2.176 [#17767](https://github.com/apache/datafusion/pull/17767) (dependabot[bot])
+- chore(deps): bump postgres-types from 0.2.9 to 0.2.10 [#17768](https://github.com/apache/datafusion/pull/17768) (dependabot[bot])
+- Use `Expr::qualified_name()` and `Column::new()` to extract partition keys from window and aggregate operators [#17757](https://github.com/apache/datafusion/pull/17757) (masonh22)
+- chore(deps): bump taiki-e/install-action from 2.62.6 to 2.62.8 [#17781](https://github.com/apache/datafusion/pull/17781) (dependabot[bot])
+- chore(deps): bump wasm-bindgen-test from 0.3.53 to 0.3.54 [#17784](https://github.com/apache/datafusion/pull/17784) (dependabot[bot])
+- chore: Action some old TODOs in github actions [#17694](https://github.com/apache/datafusion/pull/17694) (Jefffrey)
+- dev: Add benchmark for compilation profiles [#17754](https://github.com/apache/datafusion/pull/17754) (2010YOUY01)
+- chore(deps): bump tokio-postgres from 0.7.13 to 0.7.14 [#17785](https://github.com/apache/datafusion/pull/17785) (dependabot[bot])
+- chore(deps): bump serde from 1.0.226 to 1.0.227 [#17783](https://github.com/apache/datafusion/pull/17783) (dependabot[bot])
+- chore(deps): bump regex from 1.11.2 to 1.11.3 [#17782](https://github.com/apache/datafusion/pull/17782) (dependabot[bot])
+- Test `CAST` from temporal to `Utf8View` [#17535](https://github.com/apache/datafusion/pull/17535) (findepi)
+- chore: dependabot to run weekly [#17797](https://github.com/apache/datafusion/pull/17797) (comphead)
+- chore(deps): bump sysinfo from 0.37.0 to 0.37.1 [#17800](https://github.com/apache/datafusion/pull/17800) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.8 to 2.62.9 [#17799](https://github.com/apache/datafusion/pull/17799) (dependabot[bot])
+- Fix potential overflow when we print verbose physical plan [#17798](https://github.com/apache/datafusion/pull/17798) (zhuqi-lucas)
+- Extend datatype semantic equality check to include timestamps [#17777](https://github.com/apache/datafusion/pull/17777) (shivbhatia10)
+- dev: Add Apache license check to the lint script [#17787](https://github.com/apache/datafusion/pull/17787) (2010YOUY01)
+- Fix: common_sub_expression_eliminate optimizer rule failed [#16066](https://github.com/apache/datafusion/pull/16066) (Col-Waltz)
+- chore: remove dialect fixes in SLT tests that are outdated [#17807](https://github.com/apache/datafusion/pull/17807) (Jefffrey)
+- chore(deps): bump thiserror from 2.0.16 to 2.0.17 [#17821](https://github.com/apache/datafusion/pull/17821) (dependabot[bot])
+- chore(deps): bump quote from 1.0.40 to 1.0.41 [#17822](https://github.com/apache/datafusion/pull/17822) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.9 to 2.62.12 [#17823](https://github.com/apache/datafusion/pull/17823) (dependabot[bot])
+- chore(deps): bump serde from 1.0.227 to 1.0.228 [#17827](https://github.com/apache/datafusion/pull/17827) (dependabot[bot])
+- Temporarily disable failing `sql_planner` benchmark query [#17809](https://github.com/apache/datafusion/pull/17809) (alamb)
+- chore(deps): bump taiki-e/install-action from 2.62.12 to 2.62.13 [#17836](https://github.com/apache/datafusion/pull/17836) (dependabot[bot])
+- More decimal 32/64 support - type coercsion and misc gaps [#17808](https://github.com/apache/datafusion/pull/17808) (AdamGS)
+- Implement `AsRef` for `Expr` [#17819](https://github.com/apache/datafusion/pull/17819) (findepi)
+- chore(deps): bump taiki-e/install-action from 2.62.13 to 2.62.14 [#17840](https://github.com/apache/datafusion/pull/17840) (dependabot[bot])
+- chore(deps): bump petgraph from 0.8.2 to 0.8.3 [#17842](https://github.com/apache/datafusion/pull/17842) (dependabot[bot])
+- Relax constraint that file sort order must only reference individual columns [#17419](https://github.com/apache/datafusion/pull/17419) (pepijnve)
+- minor: Include consumer name in OOM message [#17870](https://github.com/apache/datafusion/pull/17870) (andygrove)
+- Implement `partition_statistics` API for `InterleaveExec` [#17051](https://github.com/apache/datafusion/pull/17051) (liamzwbao)
+- Add `CastColumnExpr` for struct-aware column casting [#17773](https://github.com/apache/datafusion/pull/17773) (kosiew)
+- chore(deps): bump taiki-e/install-action from 2.62.14 to 2.62.16 [#17879](https://github.com/apache/datafusion/pull/17879) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.37.0 to 1.37.1 [#17878](https://github.com/apache/datafusion/pull/17878) (dependabot[bot])
+- Fix failing CI caused by hash collisions [#17886](https://github.com/apache/datafusion/pull/17886) (liamzwbao)
+- Minor: reuse test schemas in simplify tests [#17864](https://github.com/apache/datafusion/pull/17864) (alamb)
+- Make limit pushdown work for SortPreservingMergeExec [#17893](https://github.com/apache/datafusion/pull/17893) (Dandandan)
+- chore(deps): bump taiki-e/install-action from 2.62.16 to 2.62.17 [#17896](https://github.com/apache/datafusion/pull/17896) (dependabot[bot])
+- Consolidate `apply_schema_adapter_tests` [#17905](https://github.com/apache/datafusion/pull/17905) (alamb)
+- Improve `InListExpr` plan display [#17884](https://github.com/apache/datafusion/pull/17884) (pepijnve)
+- Export JoinSetTracerError from datafusion-common-runtime [#17877](https://github.com/apache/datafusion/pull/17877) (JanKaul)
+- Clippy to `extended_tests` [#17922](https://github.com/apache/datafusion/pull/17922) (blaginin)
+- chore: rename Schema `print_schema_tree` to `tree_string` [#17919](https://github.com/apache/datafusion/pull/17919) (comphead)
+- chore: utilize trait upcasting for AsyncScalarUDF PartialEq & Hash [#17872](https://github.com/apache/datafusion/pull/17872) (Jefffrey)
+- Refactor: Update enforce_sorting tests to use insta snapshots for easier updates [#17900](https://github.com/apache/datafusion/pull/17900) (alamb)
+- chore(deps): bump flate2 from 1.1.2 to 1.1.4 [#17938](https://github.com/apache/datafusion/pull/17938) (dependabot[bot])
+- chore(deps): bump actions/stale from 10.0.0 to 10.1.0 [#17937](https://github.com/apache/datafusion/pull/17937) (dependabot[bot])
+- chore(deps): bump aws-credential-types from 1.2.6 to 1.2.7 [#17936](https://github.com/apache/datafusion/pull/17936) (dependabot[bot])
+- chore(deps): bump rustyline from 17.0.1 to 17.0.2 [#17932](https://github.com/apache/datafusion/pull/17932) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.17 to 2.62.21 [#17934](https://github.com/apache/datafusion/pull/17934) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.37.1 to 1.37.2 [#17935](https://github.com/apache/datafusion/pull/17935) (dependabot[bot])
+- chore: upgrade sqlparser [#17925](https://github.com/apache/datafusion/pull/17925) (chenkovsky)
+- minor: impl Clone and Debug on CaseBuilder [#17927](https://github.com/apache/datafusion/pull/17927) (timsaucer)
+- chore: Extend backtrace coverage for `Execution` and `Internal` errors [#17921](https://github.com/apache/datafusion/pull/17921) (comphead)
+- chore(deps): bump taiki-e/install-action from 2.62.21 to 2.62.22 [#17949](https://github.com/apache/datafusion/pull/17949) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.37.2 to 1.38.0 [#17948](https://github.com/apache/datafusion/pull/17948) (dependabot[bot])
+- Feat: [datafusion-spark] Migrate avg from comet to datafusion-spark and add tests. [#17871](https://github.com/apache/datafusion/pull/17871) (codetyri0n)
+- Update tests to use insta / make them easier to update [#17945](https://github.com/apache/datafusion/pull/17945) (alamb)
+- Minor Test refactor: avoid creating the same SchemaRef [#17951](https://github.com/apache/datafusion/pull/17951) (alamb)
+- Precision::<usize>::{add, sub, multiply}: avoid overflows [#17929](https://github.com/apache/datafusion/pull/17929) (Tpt)
+- Resolve `ListingScan` projection against table schema including partition columns [#17911](https://github.com/apache/datafusion/pull/17911) (mach-kernel)
+- chore(deps): bump crate-ci/typos from 1.38.0 to 1.38.1 [#17960](https://github.com/apache/datafusion/pull/17960) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.22 to 2.62.23 [#17959](https://github.com/apache/datafusion/pull/17959) (dependabot[bot])
+- bench: fix `vectorized_equal_to` bench mutated between iterations [#17968](https://github.com/apache/datafusion/pull/17968) (rluvaton)
+- fix docs and broken example from #17956 [#17980](https://github.com/apache/datafusion/pull/17980) (adriangb)
+- Refactor: Update `replace_with_order_preserving_variants` tests to use insta snapshots for easier updates [#17962](https://github.com/apache/datafusion/pull/17962) (blaginin)
+- Support repartitioned() method in RepartitionExec [#17990](https://github.com/apache/datafusion/pull/17990) (gabotechs)
+- Adds Instrumented Object Store to CLI [#17984](https://github.com/apache/datafusion/pull/17984) (BlakeOrth)
+- Migrate `join_selection` tests to snapshot-based testing [#17974](https://github.com/apache/datafusion/pull/17974) (blaginin)
+- bench: fix actually generate a lot of unique values in benchmark table [#17967](https://github.com/apache/datafusion/pull/17967) (rluvaton)
+- Adds Instrument Mode for InstrumentedObjectStore in datafusion-cli [#18000](https://github.com/apache/datafusion/pull/18000) (BlakeOrth)
+- minor: refactor Spark ascii function to reuse DataFusion ascii function code [#17965](https://github.com/apache/datafusion/pull/17965) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.62.23 to 2.62.24 [#17989](https://github.com/apache/datafusion/pull/17989) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.24 to 2.62.25 [#18007](https://github.com/apache/datafusion/pull/18007) (dependabot[bot])
+- Clarify documentation that ScalarUDFImpl::simplity must not change the schema [#17981](https://github.com/apache/datafusion/pull/17981) (alamb)
+- Expose trace_future and trace_block outside of common-runtime [#17976](https://github.com/apache/datafusion/pull/17976) (AdamGS)
+- Adds instrumentation to get requests for datafusion-cli [#18016](https://github.com/apache/datafusion/pull/18016) (BlakeOrth)
+- chore(deps): bump half from 2.6.0 to 2.7.0 [#18036](https://github.com/apache/datafusion/pull/18036) (dependabot[bot])
+- chore(deps): bump aws-config from 1.8.6 to 1.8.7 [#18038](https://github.com/apache/datafusion/pull/18038) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.25 to 2.62.28 [#18037](https://github.com/apache/datafusion/pull/18037) (dependabot[bot])
+- refactor: cleanup naming and macro usages for binary operator [#17985](https://github.com/apache/datafusion/pull/17985) (sunng87)
+- Impl `gather_filters_for_pushdown` for `CoalescePartitionsExec` [#18046](https://github.com/apache/datafusion/pull/18046) (xudong963)
+- Fix bug in LimitPushPastWindows [#18029](https://github.com/apache/datafusion/pull/18029) (avantgardnerio)
+- Fix `SortPreservingMergeExec` tree formatting with limit [#18009](https://github.com/apache/datafusion/pull/18009) (AdamGS)
+- chore(deps): bump actions/setup-node from 5.0.0 to 6.0.0 [#18049](https://github.com/apache/datafusion/pull/18049) (dependabot[bot])
+- chore(deps): bump sysinfo from 0.37.1 to 0.37.2 [#18035](https://github.com/apache/datafusion/pull/18035) (dependabot[bot])
+- FileScanConfig: Preserve schema metadata across ser/de boundary [#17966](https://github.com/apache/datafusion/pull/17966) (mach-kernel)
+- physical-plan: push filters down to UnionExec children [#18054](https://github.com/apache/datafusion/pull/18054) (asubiotto)
+- Add `min_max_bytes` benchmark (Reproduce quadratic runtime in min_max_bytes) [#18041](https://github.com/apache/datafusion/pull/18041) (ctsk)
+- Adds summary output to CLI instrumented object stores [#18045](https://github.com/apache/datafusion/pull/18045) (BlakeOrth)
+- Impl spark bit not function [#18018](https://github.com/apache/datafusion/pull/18018) (kazantsev-maksim)
+- chore: revert tests [#18065](https://github.com/apache/datafusion/pull/18065) (comphead)
+- chore: Use an enum to express the different kinds of nullability in an array [#18048](https://github.com/apache/datafusion/pull/18048) (martin-g)
+- chore(deps): bump taiki-e/install-action from 2.62.28 to 2.62.29 [#18069](https://github.com/apache/datafusion/pull/18069) (dependabot[bot])
+- Split up monster test_window_partial_constant_and_set_monotonicity into smaller functions [#17952](https://github.com/apache/datafusion/pull/17952) (alamb)
+- Push Down Filter Subexpressions in Nested Loop Joins as Projections [#17906](https://github.com/apache/datafusion/pull/17906) (tobixdev)
+- ci: Use PR description for merge commit body in squash merges [#18027](https://github.com/apache/datafusion/pull/18027) (Weijun-H)
+- Fix extended tests on main to get CI green [#18096](https://github.com/apache/datafusion/pull/18096) (alamb)
+- chore(deps): bump taiki-e/install-action from 2.62.29 to 2.62.31 [#18094](https://github.com/apache/datafusion/pull/18094) (dependabot[bot])
+- chore: run extended suite on PRs for critical areas [#18088](https://github.com/apache/datafusion/pull/18088) (comphead)
+- chore(deps): bump taiki-e/install-action from 2.62.31 to 2.62.33 [#18113](https://github.com/apache/datafusion/pull/18113) (dependabot[bot])
+- chore: remove unnecessary `skip_failed_rules` config in slt [#18117](https://github.com/apache/datafusion/pull/18117) (Jefffrey)
+- Refactor repartition to use `insta` [#18106](https://github.com/apache/datafusion/pull/18106) (blaginin)
+- refactor: move ListingTable over to the catalog-listing-table crate [#18080](https://github.com/apache/datafusion/pull/18080) (timsaucer)
+- refactor: move arrow datasource to new `datafusion-datasource-arrow` crate [#18082](https://github.com/apache/datafusion/pull/18082) (timsaucer)
+- Adds instrumentation to LIST operations in CLI [#18103](https://github.com/apache/datafusion/pull/18103) (BlakeOrth)
+- Add extra case_when benchmarks [#18097](https://github.com/apache/datafusion/pull/18097) (pepijnve)
+- Adds instrumentation to delimited LIST operations in CLI [#18134](https://github.com/apache/datafusion/pull/18134) (BlakeOrth)
+- test: `to_timestamp(double)` for vectorized input [#18147](https://github.com/apache/datafusion/pull/18147) (dqkqd)
+- Fix `concat_elements_utf8view` capacity initialization. [#18003](https://github.com/apache/datafusion/pull/18003) (samueleresca)
+- Use < instead of = in case benchmark predicates, use Integers [#18144](https://github.com/apache/datafusion/pull/18144) (pepijnve)
+- Adds instrumentation to PUT ops in the CLI [#18139](https://github.com/apache/datafusion/pull/18139) (BlakeOrth)
+- [main] chore: Fix `no space left on device` (#18141) [#18151](https://github.com/apache/datafusion/pull/18151) (alamb)
+- Fix `DISTINCT ON` for tables with no columns (ReplaceDistinctWithAggregate: do not fail when on input without columns) [#18133](https://github.com/apache/datafusion/pull/18133) (Tpt)
+- Fix quadratic runtime in min_max_bytes [#18044](https://github.com/apache/datafusion/pull/18044) (ctsk)
+- chore(deps): bump getrandom from 0.3.3 to 0.3.4 [#18163](https://github.com/apache/datafusion/pull/18163) (dependabot[bot])
+- chore(deps): bump tokio from 1.47.1 to 1.48.0 [#18164](https://github.com/apache/datafusion/pull/18164) (dependabot[bot])
+- chore(deps): bump indexmap from 2.11.4 to 2.12.0 [#18162](https://github.com/apache/datafusion/pull/18162) (dependabot[bot])
+- chore(deps): bump bzip2 from 0.6.0 to 0.6.1 [#18165](https://github.com/apache/datafusion/pull/18165) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.33 to 2.62.34 [#18194](https://github.com/apache/datafusion/pull/18194) (dependabot[bot])
+- Fix COPY TO does not produce an output file for the empty set [#18074](https://github.com/apache/datafusion/pull/18074) (bert-beyondloops)
+- Add Projection struct w/ helper methods to manipulate projections [#18176](https://github.com/apache/datafusion/pull/18176) (adriangb)
+- Add TableSchema helper to encapsulate file schema + partition fields [#18178](https://github.com/apache/datafusion/pull/18178) (adriangb)
+- Add spilling to RepartitionExec [#18014](https://github.com/apache/datafusion/pull/18014) (adriangb)
+- Adds DELETE and HEAD instrumentation to CLI [#18206](https://github.com/apache/datafusion/pull/18206) (BlakeOrth)
+- [branch-50] Prepare 50.3.0 release version number and README (#18173) [#18182](https://github.com/apache/datafusion/pull/18182) (alamb)
+- Fix array_has simplification with null argument [#18186](https://github.com/apache/datafusion/pull/18186) (joroKr21)
+- chore(deps): bump taiki-e/install-action from 2.62.34 to 2.62.35 [#18215](https://github.com/apache/datafusion/pull/18215) (dependabot[bot])
+- bench: create benchmark for lookup table like `CASE WHEN` [#18203](https://github.com/apache/datafusion/pull/18203) (rluvaton)
+- Adds instrumentation to COPY operations in the CLI [#18227](https://github.com/apache/datafusion/pull/18227) (BlakeOrth)
+- Consolidate core_integration/datasource and rename parquet_source --> parquet_integration [#18226](https://github.com/apache/datafusion/pull/18226) (alamb)
+- CoalescePartitionsExec fetch is not consistent with one partition and more than one partition [#18245](https://github.com/apache/datafusion/pull/18245) (zhuqi-lucas)
+- Migrate core test to insta part 3 [#16978](https://github.com/apache/datafusion/pull/16978) (Chen-Yuan-Lai)
+- chore(deps): bump taiki-e/install-action from 2.62.35 to 2.62.36 [#18240](https://github.com/apache/datafusion/pull/18240) (dependabot[bot])
+- Fix: Do not normalize table names when deserializing from protobuf [#18187](https://github.com/apache/datafusion/pull/18187) (drin)
+- Revert "chore: revert tests (#18065)" [#18255](https://github.com/apache/datafusion/pull/18255) (dqkqd)
+- Refactor `nvl2` Function to Support Lazy Evaluation and Simplification via CASE Expression [#18191](https://github.com/apache/datafusion/pull/18191) (kosiew)
+- fix null count stats computation [#18276](https://github.com/apache/datafusion/pull/18276) (adriangb)
+- Improve docs and examples for `DataTypeExt` and `FieldExt` [#18271](https://github.com/apache/datafusion/pull/18271) (alamb)
+- Easier construction of ScalarAndMetadata [#18272](https://github.com/apache/datafusion/pull/18272) (alamb)
+- Add integration test for IO operations for listing tables queries [#18229](https://github.com/apache/datafusion/pull/18229) (alamb)
+- Fix: Error rather than silently ignore extra parameter passed to ceil/floor [#18265](https://github.com/apache/datafusion/pull/18265) (toxicteddy00077)
+- chore(deps): Update `half` to 2.7.1, ignore `RUSTSEC-2025-0111` [#18287](https://github.com/apache/datafusion/pull/18287) (alamb)
+- chore(deps): bump taiki-e/install-action from 2.62.36 to 2.62.38 [#18293](https://github.com/apache/datafusion/pull/18293) (dependabot[bot])
+- chore(deps): bump regex from 1.11.3 to 1.12.2 [#18294](https://github.com/apache/datafusion/pull/18294) (dependabot[bot])
+- chore(deps): bump clap from 4.5.48 to 4.5.50 [#18292](https://github.com/apache/datafusion/pull/18292) (dependabot[bot])
+- chore(deps): bump syn from 2.0.106 to 2.0.108 [#18291](https://github.com/apache/datafusion/pull/18291) (dependabot[bot])
+- Enforce unique names for `is_set` on `first_value` and `last_value` [#18303](https://github.com/apache/datafusion/pull/18303) (marc-pydantic)
+- chore(deps): update testcontainers to `0.25.2` and drop ignore of `RUSTSEC-2025-0111` [#18305](https://github.com/apache/datafusion/pull/18305) (DDtKey)
+- Using `try_append_value` from arrow-rs 57.0.0 [#18313](https://github.com/apache/datafusion/pull/18313) (samueleresca)
+- minor: Add documentation to function `concat_elements_utf8view` [#18316](https://github.com/apache/datafusion/pull/18316) (2010YOUY01)
+- chore(deps): bump taiki-e/install-action from 2.62.38 to 2.62.40 [#18318](https://github.com/apache/datafusion/pull/18318) (dependabot[bot])
+- Fix: Add projection to generate_series [#18298](https://github.com/apache/datafusion/pull/18298) (mkleen)
+- Do not accept null is_set for first_value/last_value [#18301](https://github.com/apache/datafusion/pull/18301) (marc-pydantic)
+- Optimize merging of partial case expression results [#18152](https://github.com/apache/datafusion/pull/18152) (pepijnve)
+- chore: Format examples in doc strings - execution [#18339](https://github.com/apache/datafusion/pull/18339) (CuteChuanChuan)
+- chore: Format examples in doc strings - common [#18336](https://github.com/apache/datafusion/pull/18336) (CuteChuanChuan)
+- chore: Format examples in doc strings - crate datafusion [#18333](https://github.com/apache/datafusion/pull/18333) (CuteChuanChuan)
+- chore: Format examples in doc strings - expr [#18340](https://github.com/apache/datafusion/pull/18340) (CuteChuanChuan)
+- chore: Format examples in doc strings - datasource crates [#18338](https://github.com/apache/datafusion/pull/18338) (CuteChuanChuan)
+- Insta for enforce_distrubution (easy ones) [#18248](https://github.com/apache/datafusion/pull/18248) (blaginin)
+- chore: Format examples in doc strings - macros and optmizer [#18354](https://github.com/apache/datafusion/pull/18354) (CuteChuanChuan)
+- chore: Format examples in doc strings - proto, pruning, and session [#18358](https://github.com/apache/datafusion/pull/18358) (CuteChuanChuan)
+- chore: Format examples in doc strings - catalog listing [#18335](https://github.com/apache/datafusion/pull/18335) (CuteChuanChuan)
+- ci: fix temporary file creation in tests and tighten CI check [#18374](https://github.com/apache/datafusion/pull/18374) (2010YOUY01)
+- Run extended tests when there are changes to datafusion-testing pin [#18310](https://github.com/apache/datafusion/pull/18310) (alamb)
+- Add simple unit test for `merge` in case expression [#18369](https://github.com/apache/datafusion/pull/18369) (pepijnve)
+- chore(deps): bump taiki-e/install-action from 2.62.40 to 2.62.41 [#18377](https://github.com/apache/datafusion/pull/18377) (dependabot[bot])
+- Refactor `range`/`gen_series` signature away from user defined [#18317](https://github.com/apache/datafusion/pull/18317) (Jefffrey)
+- Adds Partitioned CSV test to object store access tests [#18370](https://github.com/apache/datafusion/pull/18370) (BlakeOrth)
+- Add reproducer for consecutive RepartitionExec [#18343](https://github.com/apache/datafusion/pull/18343) (NGA-TRAN)
+- chore: bump substrait version to `0.60.0` to use substrait spec v0.75.0 [#17866](https://github.com/apache/datafusion/pull/17866) (benbellick)
+- Use the upstream arrow-rs coalesce kernel [#17193](https://github.com/apache/datafusion/pull/17193) (zhuqi-lucas)
+- Extract out super slow planning benchmark to it's own benchmark [#18388](https://github.com/apache/datafusion/pull/18388) (Omega359)
+- minor: Fix parquet pruning metrics display order [#18379](https://github.com/apache/datafusion/pull/18379) (2010YOUY01)
+- chore: use enum as `date_trunc` granularity [#18390](https://github.com/apache/datafusion/pull/18390) (comphead)
+- chore(deps): bump taiki-e/install-action from 2.62.41 to 2.62.43 [#18398](https://github.com/apache/datafusion/pull/18398) (dependabot[bot])
+- Project record batches to avoid filtering unused columns in `CASE` evaluation [#18329](https://github.com/apache/datafusion/pull/18329) (pepijnve)
+- catch errors when simplifying cast(lit(...), ...) and bubble those up [#18332](https://github.com/apache/datafusion/pull/18332) (adriangb)
+- Align `NowFunc::new()` with canonical `ConfigOptions` timezone and enhance documentation [#18347](https://github.com/apache/datafusion/pull/18347) (kosiew)
+- chore: Format examples in doc strings - physical expr, optimizer, and plan [#18357](https://github.com/apache/datafusion/pull/18357) (CuteChuanChuan)
+- Fix: spark bit_count function [#18322](https://github.com/apache/datafusion/pull/18322) (kazantsev-maksim)
+- chore: bump workspace rust version to 1.91.0 [#18422](https://github.com/apache/datafusion/pull/18422) (randyli)
+- Minor: Remove unneccessary vec! in SortMergeJoinStream initialization [#18430](https://github.com/apache/datafusion/pull/18430) (mapleFU)
+- minor: refactor array reverse internals [#18445](https://github.com/apache/datafusion/pull/18445) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.62.43 to 2.62.45 [#18465](https://github.com/apache/datafusion/pull/18465) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.38.1 to 1.39.0 [#18464](https://github.com/apache/datafusion/pull/18464) (dependabot[bot])
+- chore(deps): bump rstest from 0.25.0 to 0.26.1 [#18463](https://github.com/apache/datafusion/pull/18463) (dependabot[bot])
+- chore(deps): bump wasm-bindgen-test from 0.3.54 to 0.3.55 [#18462](https://github.com/apache/datafusion/pull/18462) (dependabot[bot])
+- chore(deps): bump postgres-types from 0.2.10 to 0.2.11 [#18461](https://github.com/apache/datafusion/pull/18461) (dependabot[bot])
+- chore(deps): bump ctor from 0.4.3 to 0.6.1 [#18460](https://github.com/apache/datafusion/pull/18460) (dependabot[bot])
+- chore(deps): bump libc from 0.2.176 to 0.2.177 [#18459](https://github.com/apache/datafusion/pull/18459) (dependabot[bot])
+- chore: Format examples in doc strings - functions [#18353](https://github.com/apache/datafusion/pull/18353) (CuteChuanChuan)
+- Feat: Support array flatten() on `List(LargeList(_))` types [#18363](https://github.com/apache/datafusion/pull/18363) (sdf-jkl)
+- Reproducer tests for #18380 (resorting sorted inputs) [#18352](https://github.com/apache/datafusion/pull/18352) (rgehan)
+- Update criterion to 0.7.\* [#18472](https://github.com/apache/datafusion/pull/18472) (Omega359)
+- chore(deps): bump taiki-e/install-action from 2.62.45 to 2.62.46 [#18484](https://github.com/apache/datafusion/pull/18484) (dependabot[bot])
+- Consolidate flight examples (#18142) [#18442](https://github.com/apache/datafusion/pull/18442) (cj-zhukov)
+- Support reverse for ListView [#18424](https://github.com/apache/datafusion/pull/18424) (vegarsti)
+- Complete migrating `enforce_distrubution` tests to insta [#18185](https://github.com/apache/datafusion/pull/18185) (blaginin)
+- Add benchmark for array_reverse [#18425](https://github.com/apache/datafusion/pull/18425) (vegarsti)
+- chore: simplify map const [#18440](https://github.com/apache/datafusion/pull/18440) (chenkovsky)
+- Fix an out of date comment for `snapshot_physical_expr` [#18498](https://github.com/apache/datafusion/pull/18498) (AdamGS)
+- Disable `parquet_encryption` by default in datafusion-sqllogictests [#18492](https://github.com/apache/datafusion/pull/18492) (zhuqi-lucas)
+- Make extended test to use optional parquet_encryption feature [#18507](https://github.com/apache/datafusion/pull/18507) (zhuqi-lucas)
+- Consolidate udf examples (#18142) [#18493](https://github.com/apache/datafusion/pull/18493) (cj-zhukov)
+- test: add prepare alias slt test [#18522](https://github.com/apache/datafusion/pull/18522) (dqkqd)
+- CI: add `clippy::needless_pass_by_value` rule [#18468](https://github.com/apache/datafusion/pull/18468) (2010YOUY01)
+- Refactor create_hashes to accept array references [#18448](https://github.com/apache/datafusion/pull/18448) (adriangb)
+- chore: Format examples in doc strings - spark, sql, sqllogictest, sibstrait [#18443](https://github.com/apache/datafusion/pull/18443) (CuteChuanChuan)
+- refactor: simplify `calculate_binary_math` in datafusion-functions [#18525](https://github.com/apache/datafusion/pull/18525) (Jefffrey)
+- ci: enforce needless_pass_by_value for datafusion-optimzer [#18533](https://github.com/apache/datafusion/pull/18533) (jizezhang)
+- Add comments to Cargo.toml about workspace overrides [#18526](https://github.com/apache/datafusion/pull/18526) (alamb)
+- minor: Remove inconsistent comment [#18539](https://github.com/apache/datafusion/pull/18539) (2010YOUY01)
+- Refactor `log()` signature to use coercion API + fixes [#18519](https://github.com/apache/datafusion/pull/18519) (Jefffrey)
+- [branch-51] Update Changelog [#18592](https://github.com/apache/datafusion/pull/18592) (alamb)
+- [branch-51] bugfix: correct regression on TableType in into_view in DF51 [#18618](https://github.com/apache/datafusion/pull/18618) (timsaucer)
+- [branch-51]: Add timezone to date_trunc fast path (#18596) [#18629](https://github.com/apache/datafusion/pull/18629) (hareshkh)
+- [branch-51] bugfix: select_columns should validate column names [#18624](https://github.com/apache/datafusion/pull/18624) (timsaucer)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    88	dependabot[bot]
+    49	Jeffrey Vo
+    35	Andrew Lamb
+    20	Yongting You
+    19	Adrian Garcia Badaracco
+    14	Blake Orth
+    12	Pepijn Van Eeckhoudt
+    12	Piotr Findeisen
+    11	Chen Chongchen
+    11	Dmitrii Blaginin
+    11	Yu-Chuan Hung
+     9	Jonathan Chen
+     9	Khanh Duong
+     9	Oleks V
+     9	Peter Nguyen
+     8	Alex Huang
+     8	Qi Zhu
+     8	Raz Luvaton
+     7	Adam Gutglick
+     7	Rohan Krishnaswamy
+     7	Tim Saucer
+     7	kosiew
+     6	xudong.w
+     5	Nuno Faria
+     4	Dhanush
+     4	Samuele Resca
+     4	Simon Vandel Sillesen
+     4	Sriram Sundar
+     4	Vegard Stikbakke
+     3	Bruce Ritchie
+     3	David López
+     3	EeshanBembi
+     3	Jack Kleeman
+     3	Kazantsev Maksim
+     3	Marko Milenković
+     3	Thomas Tanon
+     2	Andy Grove
+     2	Bruno Volpato
+     2	Christian
+     2	Colin Marc
+     2	Cora Sutton
+     2	David Stancu
+     2	Devam Patel
+     2	Eugene Tolbakov
+     2	Evgenii Glotov
+     2	Kristin Cowalcijk
+     2	Liam Bao
+     2	Marc Brinkmann
+     2	Michael Kleen
+     2	Namgung Chan
+     2	Ning Sun
+     2	Randy
+     2	Sergey Zhukov
+     2	Viktor Yershov
+     2	bubulalabu
+     2	dennis zhuang
+     2	jizezhang
+     2	wiedld
+     1	Ahmed Mezghani
+     1	Aldrin M
+     1	Alfonso Subiotto Marqués
+     1	Anders
+     1	Artem Medvedev
+     1	Aryamaan Singh
+     1	Ben Bellick
+     1	Berkay Şahin
+     1	Bert Vermeiren
+     1	Brent Gardner
+     1	Christopher Watford
+     1	Dan Lovell
+     1	Daniël Heres
+     1	Dewey Dunnington
+     1	Douglas Anderson
+     1	Duong Cong Toai
+     1	Emil Ernerfeldt
+     1	Emily Matheys
+     1	Enrico La Sala
+     1	Eshed Schacham
+     1	Filippo Rossi
+     1	Gabriel
+     1	Gene Bordegaray
+     1	Georgi Krastev
+     1	Haresh Khanna
+     1	Heran Lin
+     1	Hiroaki Yutani
+     1	Ian Lai
+     1	Ilya Ostanevich
+     1	JanKaul
+     1	Kosta Tarasov
+     1	LFC
+     1	Leonardo Yvens
+     1	Lía Adriana
+     1	Manasa Manoj
+     1	Martin
+     1	Martin Grigorov
+     1	Martin Hilton
+     1	Mason
+     1	Matt Butrovich
+     1	Matthew Kim
+     1	Matthijs Brobbel
+     1	Nga Tran
+     1	Nihal Rajak
+     1	Rafael Fernández
+     1	Renan GEHAN
+     1	Renato Marroquin
+     1	Rok Mihevc
+     1	Ruilei Ma
+     1	Sai Mahendra
+     1	Sergei Grebnov
+     1	Shiv Bhatia
+     1	Tobias Schwarzinger
+     1	UBarney
+     1	Victor Barua
+     1	Victorien
+     1	Vyquos
+     1	Weston Pace
+     1	XL Liang
+     1	Xander
+     1	Zhen Wang
+     1	aditya singh rathore
+     1	dario curreri
+     1	ding-young
+     1	feniljain
+     1	gene-bordegaray
+     1	harshasiddartha
+     1	mwish
+     1	peasee
+     1	r1b
+     1	theirix
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/52.0.0.md b/dev/changelog/52.0.0.md
new file mode 100644
index 0000000000000..4536fd5a06907
--- /dev/null
+++ b/dev/changelog/52.0.0.md
@@ -0,0 +1,745 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 52.0.0 Changelog
+
+This release consists of 549 commits from 121 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Breaking changes:**
+
+- Force `FileSource` to be constructed with a `Schema` [#18386](https://github.com/apache/datafusion/pull/18386) (adriangb)
+- Support Arrow IPC Stream Files [#18457](https://github.com/apache/datafusion/pull/18457) (corasaurus-hex)
+- Change default of `AggregateUDFImpl::supports_null_handling_clause` to `false` [#18441](https://github.com/apache/datafusion/pull/18441) (Jefffrey)
+- [Minor] Remove RawTableAllocExt [#18748](https://github.com/apache/datafusion/pull/18748) (Dandandan)
+- Change `CacheAccessor::remove` to take `&self` rather than `&mut self` [#18726](https://github.com/apache/datafusion/pull/18726) (alchemist51)
+- Move statistics handling into FileScanConfig [#18721](https://github.com/apache/datafusion/pull/18721) (adriangb)
+- chore: remove `pyarrow` feature [#18528](https://github.com/apache/datafusion/pull/18528) (timsaucer)
+- Limit visibility of internal impl functions in function crates [#18877](https://github.com/apache/datafusion/pull/18877) (Jefffrey)
+- FFI: return underlying trait type when converting from FFI structs [#18672](https://github.com/apache/datafusion/pull/18672) (timsaucer)
+- Refactor crypto functions code [#18664](https://github.com/apache/datafusion/pull/18664) (Jefffrey)
+- move projection handling into FileSource [#18627](https://github.com/apache/datafusion/pull/18627) (adriangb)
+- Add PhysicalOptimizerRule::optimize_plan to allow passing more context into optimizer rules [#18739](https://github.com/apache/datafusion/pull/18739) (adriangb)
+- Optimize planning / stop cloning Strings / Fields so much (2-3% faster planning time) [#18415](https://github.com/apache/datafusion/pull/18415) (alamb)
+- Adds memory-bound DefaultListFilesCache [#18855](https://github.com/apache/datafusion/pull/18855) (BlakeOrth)
+- Allow Logical expression ScalarVariable to represent an extension type or metadata [#18243](https://github.com/apache/datafusion/pull/18243) (batmnnn)
+- feat: Implement the `statistics_cache` function [#19054](https://github.com/apache/datafusion/pull/19054) (nuno-faria)
+- Move `newlines_in_values` from `FileScanConfig` to `CsvSource` [#19313](https://github.com/apache/datafusion/pull/19313) (adriangb)
+- Remove SchemaAdapter [#19345](https://github.com/apache/datafusion/pull/19345) (adriangb)
+- feat: hash partitioning satisfies subset [#19304](https://github.com/apache/datafusion/pull/19304) (gene-bordegaray)
+- feat: update FFI TableProvider and ExecutionPlan to use FFI Session and TaskContext [#19281](https://github.com/apache/datafusion/pull/19281) (timsaucer)
+- Allow logical optimizer to be run without evaluating now() & refactor SimplifyInfo [#19505](https://github.com/apache/datafusion/pull/19505) (adriangb)
+- Make default ListingFilesCache table scoped [#19616](https://github.com/apache/datafusion/pull/19616) (jizezhang)
+
+**Performance related:**
+
+- Normalize partitioned and flat object listing [#18146](https://github.com/apache/datafusion/pull/18146) (BlakeOrth)
+- perf: Improve NLJ for very small right side case [#17562](https://github.com/apache/datafusion/pull/17562) (2010YOUY01)
+- Consolidate `EliminateNestedUnion` and `EliminateOneUnion` optimizer rules' [#18678](https://github.com/apache/datafusion/pull/18678) (alamb)
+- perf: improve performance of `vectorized_equal_to` for `PrimitiveGroupValueBuilder` in multi group by aggregation [#17977](https://github.com/apache/datafusion/pull/17977) (rluvaton)
+- optimizer: Support dynamic filter in `MIN/MAX` aggregates [#18644](https://github.com/apache/datafusion/pull/18644) (2010YOUY01)
+- perf: use `new_repeated` when converting scalar to an array [#19018](https://github.com/apache/datafusion/pull/19018) (rluvaton)
+- perf: optimize CASE WHEN lookup table (2.5-22.5 times faster) [#18183](https://github.com/apache/datafusion/pull/18183) (rluvaton)
+- add specialized InList implementations for common scalar types [#18832](https://github.com/apache/datafusion/pull/18832) (adriangb)
+- Add hashing microbenchmark `with_hashes` [#19373](https://github.com/apache/datafusion/pull/19373) (alamb)
+- Optimize muti-column grouping with StringView/ByteView (option 2) - 25% faster [#19413](https://github.com/apache/datafusion/pull/19413) (alamb)
+- Optimize hashing for StringView and ByteView (15-70% faster) [#19374](https://github.com/apache/datafusion/pull/19374) (alamb)
+- perf: Improve performance of `to_hex` (> 2x) [#19503](https://github.com/apache/datafusion/pull/19503) (andygrove)
+- perf: improve performance of string repeat [#19502](https://github.com/apache/datafusion/pull/19502) (andygrove)
+- perf: Optimize `starts_with` and `ends_with` for scalar arguments [#19516](https://github.com/apache/datafusion/pull/19516) (andygrove)
+- perf: improve performance of string replace [#19530](https://github.com/apache/datafusion/pull/19530) (viirya)
+- perf: improve performance of levenshtein by reusing cache buffer [#19532](https://github.com/apache/datafusion/pull/19532) (viirya)
+- perf: improve performance of translate by reusing buffers [#19533](https://github.com/apache/datafusion/pull/19533) (viirya)
+- perf: Optimize `contains` for scalar search arg [#19529](https://github.com/apache/datafusion/pull/19529) (andygrove)
+- perf: improve performance of lpad/rpad by reusing buffers [#19558](https://github.com/apache/datafusion/pull/19558) (viirya)
+- perf: optimize regexp_count to avoid String allocation when start position is provided [#19553](https://github.com/apache/datafusion/pull/19553) (viirya)
+- perf: Improve performance of `md5` [#19568](https://github.com/apache/datafusion/pull/19568) (andygrove)
+- perf: optimize strpos by eliminating double iteration for UTF-8 [#19572](https://github.com/apache/datafusion/pull/19572) (viirya)
+- perf: optimize factorial function performance [#19575](https://github.com/apache/datafusion/pull/19575) (getChan)
+- perf: Improve performance of ltrim, rtrim, btrim [#19551](https://github.com/apache/datafusion/pull/19551) (andygrove)
+- perf: optimize `HashTableLookupExpr::evaluate` [#19602](https://github.com/apache/datafusion/pull/19602) (UBarney)
+- perf: Improve performance of `split_part` [#19570](https://github.com/apache/datafusion/pull/19570) (andygrove)
+- Optimize `Nullstate` / accumulators [#19625](https://github.com/apache/datafusion/pull/19625) (Dandandan)
+
+**Implemented enhancements:**
+
+- feat: Enhance `array_slice` functionality to support `ListView` and `LargeListView` types [#18432](https://github.com/apache/datafusion/pull/18432) (Weijun-H)
+- feat: support complex expr for prepared statement argument [#18383](https://github.com/apache/datafusion/pull/18383) (chenkovsky)
+- feat: Implement `SessionState::create_logical_expr_from_sql_expr` [#18423](https://github.com/apache/datafusion/pull/18423) (petern48)
+- feat: added clippy::needless_pass_by_value lint rule to datafusion/expr [#18532](https://github.com/apache/datafusion/pull/18532) (Gohlub)
+- feat: support nested key for get_field [#18394](https://github.com/apache/datafusion/pull/18394) (chenkovsky)
+- feat: Add `ansi` enable parameter for execution config [#18635](https://github.com/apache/datafusion/pull/18635) (comphead)
+- feat: Add evaluate_to_arrays function [#18446](https://github.com/apache/datafusion/pull/18446) (EmilyMatt)
+- feat: support named variables & defaults for `CREATE FUNCTION` [#18450](https://github.com/apache/datafusion/pull/18450) (r1b)
+- feat: Add new() constructor for CachedParquetFileReader [#18575](https://github.com/apache/datafusion/pull/18575) (petern48)
+- feat: support decimal for math functions: power [#18032](https://github.com/apache/datafusion/pull/18032) (theirix)
+- feat: selectivity metrics (for Explain Analyze) in Hash Join [#18488](https://github.com/apache/datafusion/pull/18488) (feniljain)
+- feat: Handle edge case with `corr` with single row and `NaN` [#18677](https://github.com/apache/datafusion/pull/18677) (comphead)
+- feat: support spark csc [#18642](https://github.com/apache/datafusion/pull/18642) (psvri)
+- feat: support spark sec [#18728](https://github.com/apache/datafusion/pull/18728) (psvri)
+- feat(parquet): Implement `scan_efficiency_ratio` metric for parquet reading [#18577](https://github.com/apache/datafusion/pull/18577) (petern48)
+- feat: Enhance map handling to support NULL map values [#18531](https://github.com/apache/datafusion/pull/18531) (Weijun-H)
+- feat: add RESET statement for configuration variabless [#18408](https://github.com/apache/datafusion/pull/18408) (Weijun-H)
+- feat: add human-readable formatting to EXPLAIN ANALYZE metrics #18689 [#18734](https://github.com/apache/datafusion/pull/18734) (T2MIX)
+- feat: support Spark-compatible `abs` math function part 1 - non-ANSI mode [#18205](https://github.com/apache/datafusion/pull/18205) (hsiang-c)
+- feat: Support Show runtime settings [#18564](https://github.com/apache/datafusion/pull/18564) (Weijun-H)
+- feat(small): Support `<slt:ignore>` marker in `sqllogictest` for non-deterministic expected parts [#18857](https://github.com/apache/datafusion/pull/18857) (2010YOUY01)
+- feat: allow custom caching via logical node [#18688](https://github.com/apache/datafusion/pull/18688) (jizezhang)
+- feat: add `array_slice` benchmark [#18879](https://github.com/apache/datafusion/pull/18879) (dqkqd)
+- feat: Support recursive queries with a distinct 'UNION' [#18254](https://github.com/apache/datafusion/pull/18254) (Tpt)
+- feat: Makes error macros hygienic [#18995](https://github.com/apache/datafusion/pull/18995) (Tpt)
+- feat: Add builder API for CreateExternalTable to reduce verbosity [#19066](https://github.com/apache/datafusion/pull/19066) (AryanBagade)
+- feat(spark): Implement Spark functions `url_encode`, `url_decode` and `try_url_decode` [#17399](https://github.com/apache/datafusion/pull/17399) (anhvdq)
+- feat: Move DefaultMetadataCache into its own module [#19125](https://github.com/apache/datafusion/pull/19125) (AryanBagade)
+- feat: Add `remove_optimizer_rule` to `SessionContext` [#19209](https://github.com/apache/datafusion/pull/19209) (nuno-faria)
+- feat: integrate batch coalescer with repartition exec [#19002](https://github.com/apache/datafusion/pull/19002) (jizezhang)
+- feat: Preserve File Partitioning From File Scans [#19124](https://github.com/apache/datafusion/pull/19124) (gene-bordegaray)
+- feat: Add constant column extraction and rewriting for projections in ParquetOpener [#19136](https://github.com/apache/datafusion/pull/19136) (Weijun-H)
+- feat: Support sliding window queries for MedianAccumulator by implementing `retract_batch` [#19278](https://github.com/apache/datafusion/pull/19278) (petern48)
+- feat: add compression level configuration for JSON/CSV writers [#18954](https://github.com/apache/datafusion/pull/18954) (Smotrov)
+- feat(spark): implement Spark `try_sum` function [#18569](https://github.com/apache/datafusion/pull/18569) (davidlghellin)
+- feat: Support log for Decimal32 and Decimal64 [#18999](https://github.com/apache/datafusion/pull/18999) (Mark1626)
+- feat(proto): Add protobuf serialization for HashExpr [#19379](https://github.com/apache/datafusion/pull/19379) (adriangb)
+- feat: Add decimal support for round [#19384](https://github.com/apache/datafusion/pull/19384) (kumarUjjawal)
+- Support nested field access in `get_field` with multiple path arguments [#19389](https://github.com/apache/datafusion/pull/19389) (adriangb)
+- feat: fix matching for named parameters with non-lowercase signatures [#19378](https://github.com/apache/datafusion/pull/19378) (bubulalabu)
+- feat: Add per-expression evaluation timing metrics to ProjectionExec [#19447](https://github.com/apache/datafusion/pull/19447) (2010YOUY01)
+- feat: Improve sort memory resilience [#19494](https://github.com/apache/datafusion/pull/19494) (EmilyMatt)
+- feat: Add DELETE/UPDATE hooks to TableProvider trait and to MemTable implementation [#19142](https://github.com/apache/datafusion/pull/19142) (ethan-tyler)
+- feat: implement partition_statistics for WindowAggExec [#18534](https://github.com/apache/datafusion/pull/18534) (0xPoe)
+- feat: integrate batch coalescer with async fn exec [#19342](https://github.com/apache/datafusion/pull/19342) (feniljain)
+- feat: output statistics for constant columns in projections [#19419](https://github.com/apache/datafusion/pull/19419) (shashidhar-bm)
+- feat: `to_time` function [#19540](https://github.com/apache/datafusion/pull/19540) (kumarUjjawal)
+- feat: Implement Spark functions hour, minute, second [#19512](https://github.com/apache/datafusion/pull/19512) (andygrove)
+- feat: plan-time SQL expression simplifying [#19311](https://github.com/apache/datafusion/pull/19311) (theirix)
+- feat: Implement Spark function `space` [#19610](https://github.com/apache/datafusion/pull/19610) (kazantsev-maksim)
+- feat: Implement `partition_statistics` API for `SortMergeJoinExec` [#19567](https://github.com/apache/datafusion/pull/19567) (kumarUjjawal)
+- feat: add list_files_cache table function for `datafusion-cli` [#19388](https://github.com/apache/datafusion/pull/19388) (jizezhang)
+- feat: implement metrics for AsyncFuncExec [#19626](https://github.com/apache/datafusion/pull/19626) (feniljain)
+- feat: split BatchPartitioner::try_new into hash and round-robin constructors [#19668](https://github.com/apache/datafusion/pull/19668) (mohit7705)
+- feat: add Time type support to date_trunc function [#19640](https://github.com/apache/datafusion/pull/19640) (kumarUjjawal)
+- feat: Allow log with non-integer base on decimals [#19372](https://github.com/apache/datafusion/pull/19372) (Yuvraj-cyborg)
+
+**Fixed bugs:**
+
+- fix: Eliminate consecutive repartitions [#18521](https://github.com/apache/datafusion/pull/18521) (gene-bordegaray)
+- fix: `with_param_values` on `LogicalPlan::EmptyRelation` returns incorrect schema [#18286](https://github.com/apache/datafusion/pull/18286) (dqkqd)
+- fix: Nested arrays should not get a field in lookup [#18745](https://github.com/apache/datafusion/pull/18745) (EmilyMatt)
+- fix: update schema's data type for `LogicalPlan::Values` after placeholder substitution [#18740](https://github.com/apache/datafusion/pull/18740) (dqkqd)
+- fix: Pick correct columns in Sort Merge Equijoin [#18772](https://github.com/apache/datafusion/pull/18772) (tglanz)
+- fix: remove `WorkTableExec` special case in `reset_plan_states` [#18803](https://github.com/apache/datafusion/pull/18803) (geoffreyclaude)
+- fix: display the failed sqllogictest file and query that failed in case of a panic [#18785](https://github.com/apache/datafusion/pull/18785) (rluvaton)
+- fix: preserve byte-size statistics in AggregateExec [#18885](https://github.com/apache/datafusion/pull/18885) (Tamar-Posen)
+- fix: Track elapsed_compute metric for CSV scans [#18901](https://github.com/apache/datafusion/pull/18901) (Nithurshen)
+- fix: Implement Substrait consumer support for like_match, like_imatch, and negated variants [#18929](https://github.com/apache/datafusion/pull/18929) (Nithurshen)
+- fix: Initialize CsvOptions::double_quote from proto_opts.double_quote [#18967](https://github.com/apache/datafusion/pull/18967) (martin-g)
+- fix: `rstest` is a DEV dependency [#19014](https://github.com/apache/datafusion/pull/19014) (crepererum)
+- fix: partition pruning stats pruning when multiple values are present [#18923](https://github.com/apache/datafusion/pull/18923) (Mark1626)
+- fix: deprecate data_type_and_nullable and simplify API usage [#18869](https://github.com/apache/datafusion/pull/18869) (BipulLamsal)
+- fix: pre-warm listing file statistics cache during listing table creation [#18971](https://github.com/apache/datafusion/pull/18971) (bharath-techie)
+- fix: log metadata differences when comparing physical and logical schema [#19070](https://github.com/apache/datafusion/pull/19070) (erratic-pattern)
+- fix: fix panic when lo is greater than hi [#19099](https://github.com/apache/datafusion/pull/19099) (tshauck)
+- fix: escape underscores when simplifying `starts_with` [#19077](https://github.com/apache/datafusion/pull/19077) (willemv)
+- fix: custom nullability for length (#19175) [#19182](https://github.com/apache/datafusion/pull/19182) (skushagra)
+- fix: inverted null_percent logic in in_list benchmark [#19204](https://github.com/apache/datafusion/pull/19204) (geoffreyclaude)
+- fix: Ensure column names do not change with `expand_views_at_output` [#19019](https://github.com/apache/datafusion/pull/19019) (nuno-faria)
+- fix: bitmap_count should report nullability correctly [#19195](https://github.com/apache/datafusion/pull/19195) (harshitsaini17)
+- fix: bit_count function to report nullability correctly [#19197](https://github.com/apache/datafusion/pull/19197) (harshitsaini17)
+- fix: derive custom nullability for spark `bit_shift` [#19222](https://github.com/apache/datafusion/pull/19222) (kumarUjjawal)
+- fix: spark elt custom nullability [#19207](https://github.com/apache/datafusion/pull/19207) (EeshanBembi)
+- fix: `array_remove`/`array_remove_n`/`array_remove_all` not using the same nullability as the input [#19259](https://github.com/apache/datafusion/pull/19259) (rluvaton)
+- fix: typo in sql/ddl [#19276](https://github.com/apache/datafusion/pull/19276) (mag1c1an1)
+- fix: flaky cache test [#19140](https://github.com/apache/datafusion/pull/19140) (xonx4l)
+- fix: Add custom nullability for Spark ILIKE function [#19206](https://github.com/apache/datafusion/pull/19206) (Eshaan-byte)
+- fix: derive custom nullability for spark `map_from_arrays` [#19275](https://github.com/apache/datafusion/pull/19275) (kumarUjjawal)
+- fix: derive custom nullability for spark map_from_entries [#19274](https://github.com/apache/datafusion/pull/19274) (kumarUjjawal)
+- fix: derive custom nullable for spark `make_dt_interval` [#19236](https://github.com/apache/datafusion/pull/19236) (kumarUjjawal)
+- fix: derive custome nullable for the spark last_day [#19232](https://github.com/apache/datafusion/pull/19232) (kumarUjjawal)
+- fix: derive custom nullable for spark `date_sub` [#19225](https://github.com/apache/datafusion/pull/19225) (kumarUjjawal)
+- fix: Fix a few minor issues with join metrics [#19283](https://github.com/apache/datafusion/pull/19283) (linhr)
+- fix: derive nullability for spark `bit_get` [#19220](https://github.com/apache/datafusion/pull/19220) (kumarUjjawal)
+- fix: pow() with integer base and negative float exponent returns error [#19303](https://github.com/apache/datafusion/pull/19303) (adriangb)
+- fix(concat): correct nullability inference (nullable only if all arguments nullable) [#19189](https://github.com/apache/datafusion/pull/19189) (ujjwaltwri)
+- fix: Added nullable return from date_add(#19151) [#19229](https://github.com/apache/datafusion/pull/19229) (manishkr)
+- fix: spark sha1 nullability reporting [#19242](https://github.com/apache/datafusion/pull/19242) (shashidhar-bm)
+- fix: derive custom nullability for the spark `next_day` [#19253](https://github.com/apache/datafusion/pull/19253) (kumarUjjawal)
+- fix: preserve ListFilesCache TTL when not set in config [#19401](https://github.com/apache/datafusion/pull/19401) (shashidhar-bm)
+- fix: projection for `CooperativeExec` and `CoalesceBatchesExec` [#19400](https://github.com/apache/datafusion/pull/19400) (haohuaijin)
+- fix: spark crc32 custom nullability [#19271](https://github.com/apache/datafusion/pull/19271) (watanaberin)
+- fix: Fix skip aggregate test to cover regression [#19461](https://github.com/apache/datafusion/pull/19461) (kumarUjjawal)
+- fix: [19450]Added flush for tokio file(substrait) write [#19456](https://github.com/apache/datafusion/pull/19456) (manishkr)
+- fix: csv schema_infer_max_records set to 0 return null datatype [#19432](https://github.com/apache/datafusion/pull/19432) (haohuaijin)
+- fix: Add custom nullability for Spark LIKE function [#19218](https://github.com/apache/datafusion/pull/19218) (KaranPradhan266)
+- fix: implement custom nullability for spark abs function [#19395](https://github.com/apache/datafusion/pull/19395) (batmnnn)
+- fix: custom nullability for format_string (#19173) [#19190](https://github.com/apache/datafusion/pull/19190) (skushagra)
+- fix: Implement `reset_state` for `LazyMemoryExec` [#19362](https://github.com/apache/datafusion/pull/19362) (nuno-faria)
+- fix: CteWorkTable: properly apply TableProvider::scan projection argument [#18993](https://github.com/apache/datafusion/pull/18993) (Tpt)
+- fix: Median() integer overflow [#19509](https://github.com/apache/datafusion/pull/19509) (kumarUjjawal)
+- fix: Reverse row selection should respect the row group index [#19557](https://github.com/apache/datafusion/pull/19557) (zhuqi-lucas)
+- fix: emit empty RecordBatch for empty file writes [#19370](https://github.com/apache/datafusion/pull/19370) (nlimpid)
+- fix: handle invalid byte ranges in calculate_range for single-line files [#19607](https://github.com/apache/datafusion/pull/19607) (vigimite)
+- fix: NULL handling in arrow_intersect and arrow_union [#19415](https://github.com/apache/datafusion/pull/19415) (feniljain)
+- fix(doc): close #19393, make upgrading guide match v51 api [#19648](https://github.com/apache/datafusion/pull/19648) (mag1c1an1)
+- fix(spark): Use wrapping addition/subtraction in `SparkDateAdd` and `SparkDateSub` [#19377](https://github.com/apache/datafusion/pull/19377) (mzabaluev)
+- fix(functions): Make translate function postgres compatible [#19630](https://github.com/apache/datafusion/pull/19630) (devanshu0987)
+- fix: Return Int for Date - Date instead of duration [#19563](https://github.com/apache/datafusion/pull/19563) (kumarUjjawal)
+- fix: DynamicFilterPhysicalExpr violates Hash/Eq contract [#19659](https://github.com/apache/datafusion/pull/19659) (kumarUjjawal)
+
+**Documentation updates:**
+
+- [main] Update version to 51.0.0, add Changelog (#18551) [#18565](https://github.com/apache/datafusion/pull/18565) (alamb)
+- refactor: include metric output_batches into BaselineMetrics [#18491](https://github.com/apache/datafusion/pull/18491) (nmbr7)
+- chore(deps): bump maturin from 1.9.6 to 1.10.0 in /docs [#18590](https://github.com/apache/datafusion/pull/18590) (dependabot[bot])
+- Update release download links on download page [#18550](https://github.com/apache/datafusion/pull/18550) (alamb)
+- docs: fix rustup cmd for adding rust-analyzer [#18605](https://github.com/apache/datafusion/pull/18605) (Jefffrey)
+- Enforce explicit opt-in for `WITHIN GROUP` syntax in aggregate UDAFs [#18607](https://github.com/apache/datafusion/pull/18607) (kosiew)
+- docs: fix broken catalog example links [#18765](https://github.com/apache/datafusion/pull/18765) (nlimpid)
+- doc: Add documentation for error handling [#18762](https://github.com/apache/datafusion/pull/18762) (2010YOUY01)
+- docs: Fix the examples for char_length() and character_length() [#18808](https://github.com/apache/datafusion/pull/18808) (martin-g)
+- chore: Support 'untake' for unassigning github issues [#18637](https://github.com/apache/datafusion/pull/18637) (petern48)
+- chore: Add filtered pending PRs link to main page [#18854](https://github.com/apache/datafusion/pull/18854) (comphead)
+- Docs: Enhance contributor guide with testing section [#18852](https://github.com/apache/datafusion/pull/18852) (alamb)
+- Docs: Enhance testing documentation with examples and links [#18851](https://github.com/apache/datafusion/pull/18851) (alamb)
+- chore(deps): bump maturin from 1.10.0 to 1.10.2 in /docs [#18905](https://github.com/apache/datafusion/pull/18905) (dependabot[bot])
+- Update links in documentation to point at new example locations [#18931](https://github.com/apache/datafusion/pull/18931) (alamb)
+- Add Kubeflow Trainer to known users [#18935](https://github.com/apache/datafusion/pull/18935) (andreyvelich)
+- Add PGO documentation section to crate configuration [#18959](https://github.com/apache/datafusion/pull/18959) (jatinkumarsingh)
+- Add upgrade guide for PhysicalOptimizerRule::optimize_plan [#19030](https://github.com/apache/datafusion/pull/19030) (adriangb)
+- doc: add `FilterExec` metrics to `user-guide/metrics.md` [#19043](https://github.com/apache/datafusion/pull/19043) (2010YOUY01)
+- Add `force_filter_selections` to restore `pushdown_filters` behavior prior to parquet 57.1.0 upgrade [#19003](https://github.com/apache/datafusion/pull/19003) (alamb)
+- Implement FFI task context and task context provider [#18918](https://github.com/apache/datafusion/pull/18918) (timsaucer)
+- Minor: fix link errors in docs [#19088](https://github.com/apache/datafusion/pull/19088) (alamb)
+- Cut `Parquet` over to PhysicalExprAdapter, remove `SchemaAdapter` [#18998](https://github.com/apache/datafusion/pull/18998) (adriangb)
+- Update Committer / PMC list [#19105](https://github.com/apache/datafusion/pull/19105) (alamb)
+- Revert adding PhysicalOptimizerRule::optimize_plan [#19186](https://github.com/apache/datafusion/pull/19186) (adriangb)
+- Push down InList or hash table references from HashJoinExec depending on the size of the build side [#18393](https://github.com/apache/datafusion/pull/18393) (adriangb)
+- Move partition handling out of PhysicalExprAdapter [#19128](https://github.com/apache/datafusion/pull/19128) (adriangb)
+- Push down projection expressions into ParquetOpener [#19111](https://github.com/apache/datafusion/pull/19111) (adriangb)
+- Track column sizes in Statistics; propagate through projections [#19113](https://github.com/apache/datafusion/pull/19113) (adriangb)
+- Improve ProjectionExpr documentation and comments [#19263](https://github.com/apache/datafusion/pull/19263) (alamb)
+- Update README occording to the new examples (#18529) [#19257](https://github.com/apache/datafusion/pull/19257) (cj-zhukov)
+- Add make_time function [#19183](https://github.com/apache/datafusion/pull/19183) (Omega359)
+- Update to_date udf function to support a consistent set of argument types [#19134](https://github.com/apache/datafusion/pull/19134) (Omega359)
+- Add library user guide for extending SQL syntax [#19265](https://github.com/apache/datafusion/pull/19265) (geoffreyclaude)
+- Add runtime config options for `list_files_cache_limit` and `list_files_cache_ttl` [#19108](https://github.com/apache/datafusion/pull/19108) (delamarch3)
+- Minor: clean up titles and links n extending operators and optimizer pages [#19317](https://github.com/apache/datafusion/pull/19317) (alamb)
+- Establish the high level API for sort pushdown and the optimizer rule and support reverse files and row groups [#19064](https://github.com/apache/datafusion/pull/19064) (zhuqi-lucas)
+- Add Decimal support to Ceil and Floor [#18979](https://github.com/apache/datafusion/pull/18979) (kumarUjjawal)
+- doc: add example for cache factory [#19139](https://github.com/apache/datafusion/pull/19139) (jizezhang)
+- chore(deps): bump sphinx-reredirects from 1.0.0 to 1.1.0 in /docs [#19455](https://github.com/apache/datafusion/pull/19455) (dependabot[bot])
+- Add:arrow_metadata() UDF [#19435](https://github.com/apache/datafusion/pull/19435) (xonx4l)
+- Update date_bin to support Time32 and Time64 data types [#19341](https://github.com/apache/datafusion/pull/19341) (Omega359)
+- Update `to_unixtime` udf function to support a consistent set of argument types [#19442](https://github.com/apache/datafusion/pull/19442) (kumarUjjawal)
+- docs: Improve config tables' readability [#19522](https://github.com/apache/datafusion/pull/19522) (nuno-faria)
+- Introduce `TypeSignatureClass::Any` [#19485](https://github.com/apache/datafusion/pull/19485) (Jefffrey)
+- Enables DefaultListFilesCache by default [#19366](https://github.com/apache/datafusion/pull/19366) (BlakeOrth)
+- Fix typo in contributor guide architecture section [#19613](https://github.com/apache/datafusion/pull/19613) (cdegroc)
+- docs: fix typos in PartitionEvaluator trait documentation [#19631](https://github.com/apache/datafusion/pull/19631) (SolariSystems)
+- Respect execution timezone in to_timestamp and related functions [#19078](https://github.com/apache/datafusion/pull/19078) (Omega359)
+- perfect hash join [#19411](https://github.com/apache/datafusion/pull/19411) (UBarney)
+
+**Other:**
+
+- chore(deps): bump taiki-e/install-action from 2.62.46 to 2.62.47 [#18508](https://github.com/apache/datafusion/pull/18508) (dependabot[bot])
+- Consolidate builtin functions examples (#18142) [#18523](https://github.com/apache/datafusion/pull/18523) (cj-zhukov)
+- refactor: update cmp and nested data in binary operator [#18256](https://github.com/apache/datafusion/pull/18256) (sunng87)
+- Fix: topk_aggregate benchmark failing [#18502](https://github.com/apache/datafusion/pull/18502) (randyli)
+- refactor: Add `assert_or_internal_err!` macro for more ergonomic internal invariant checks [#18511](https://github.com/apache/datafusion/pull/18511) (2010YOUY01)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-physical-optimizer [#18555](https://github.com/apache/datafusion/pull/18555) (foskey51)
+- chore: enforce clippy lint needless_pass_by_value for datafusion-sql [#18554](https://github.com/apache/datafusion/pull/18554) (foskey51)
+- chore: enforce clippy lint needless_pass_by_value to physical-expr-common [#18556](https://github.com/apache/datafusion/pull/18556) (foskey51)
+- chore: Enforce lint rule `clippy::needless_pass_by_value` to `datafusion-physical-expr` [#18557](https://github.com/apache/datafusion/pull/18557) (corasaurus-hex)
+- Fix out-of-bounds access in SLT runner [#18562](https://github.com/apache/datafusion/pull/18562) (theirix)
+- Make array_reverse faster for List and FixedSizeList [#18500](https://github.com/apache/datafusion/pull/18500) (vegarsti)
+- Consolidate custom data source examples (#18142) [#18553](https://github.com/apache/datafusion/pull/18553) (cj-zhukov)
+- chore(deps): bump taiki-e/install-action from 2.62.47 to 2.62.49 [#18581](https://github.com/apache/datafusion/pull/18581) (dependabot[bot])
+- chore: Remove unused `tokio` dependency and clippy [#18598](https://github.com/apache/datafusion/pull/18598) (comphead)
+- minor: enforce `clippy::needless_pass_by_value` for crates that don't require code changes. [#18586](https://github.com/apache/datafusion/pull/18586) (2010YOUY01)
+- refactor: merge CoalesceAsyncExecInput into CoalesceBatches [#18540](https://github.com/apache/datafusion/pull/18540) (Tim-53)
+- Enhance the help message for invalid command in datafusion-cli [#18603](https://github.com/apache/datafusion/pull/18603) (klion26)
+- Update Release README.md with latest process [#18549](https://github.com/apache/datafusion/pull/18549) (alamb)
+- Add timezone to date_trunc fast path [#18596](https://github.com/apache/datafusion/pull/18596) (hareshkh)
+- Coalesce batches inside FilterExec [#18604](https://github.com/apache/datafusion/pull/18604) (Dandandan)
+- Fix misleading boolean 'null' interval tests [#18620](https://github.com/apache/datafusion/pull/18620) (pepijnve)
+- Clarify tests for `Interval::and`, `Interval::not`, and add `Interval::or` tests [#18621](https://github.com/apache/datafusion/pull/18621) (pepijnve)
+- bugfix: correct regression on TableType for into_view [#18617](https://github.com/apache/datafusion/pull/18617) (timsaucer)
+- Separating Benchmarks for physical sorted union over large columns in SQL planner based on Datatype [#18599](https://github.com/apache/datafusion/pull/18599) (logan-keede)
+- Add RunEndEncoded type coercion [#18561](https://github.com/apache/datafusion/pull/18561) (vegarsti)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/sql` [#18614](https://github.com/apache/datafusion/pull/18614) (2010YOUY01)
+- chore: ASF tracking process on `.asf.yaml` [#18636](https://github.com/apache/datafusion/pull/18636) (comphead)
+- Refactor bit aggregate functions signature [#18593](https://github.com/apache/datafusion/pull/18593) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.62.49 to 2.62.50 [#18645](https://github.com/apache/datafusion/pull/18645) (dependabot[bot])
+- bugfix: select_columns should validate column names [#18623](https://github.com/apache/datafusion/pull/18623) (timsaucer)
+- Consolidate data io examples (#18142) [#18591](https://github.com/apache/datafusion/pull/18591) (cj-zhukov)
+- Correct implementations of `NullableInterval::and` and `NullableInterval::or`. [#18625](https://github.com/apache/datafusion/pull/18625) (pepijnve)
+- chore: ASF tracking process on `.asf.yaml` [#18652](https://github.com/apache/datafusion/pull/18652) (comphead)
+- Refactor Spark bitshift signature [#18649](https://github.com/apache/datafusion/pull/18649) (Jefffrey)
+- chore(deps): bump crate-ci/typos from 1.39.0 to 1.39.1 [#18667](https://github.com/apache/datafusion/pull/18667) (dependabot[bot])
+- Update docs for aggregate repartition test [#18650](https://github.com/apache/datafusion/pull/18650) (xanderbailey)
+- chore: Enforce lint rule `clippy::needless_pass_by_value` to `datafusion-catalog` [#18638](https://github.com/apache/datafusion/pull/18638) (Standing-Man)
+- [main] Update Changelog (#18592) [#18616](https://github.com/apache/datafusion/pull/18616) (alamb)
+- Refactor distinct aggregate implementations to use common buffer [#18348](https://github.com/apache/datafusion/pull/18348) (Jefffrey)
+- chore: enforce lint rule `clippy::needless_pass_by_value` to `datafusion-datasource-avro` [#18641](https://github.com/apache/datafusion/pull/18641) (Standing-Man)
+- Refactor Spark expm1 signature [#18655](https://github.com/apache/datafusion/pull/18655) (Jefffrey)
+- chore(core): Enforce lint rule `clippy::needless_pass_by_value` to `datafusion-core` [#18640](https://github.com/apache/datafusion/pull/18640) (Standing-Man)
+- Refactor substr signature [#18653](https://github.com/apache/datafusion/pull/18653) (Jefffrey)
+- minor: Use allow->expect to explicitly suppress Clippy lint checks [#18686](https://github.com/apache/datafusion/pull/18686) (2010YOUY01)
+- chore(deps): bump taiki-e/install-action from 2.62.50 to 2.62.51 [#18693](https://github.com/apache/datafusion/pull/18693) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.39.1 to 1.39.2 [#18694](https://github.com/apache/datafusion/pull/18694) (dependabot[bot])
+- Remove FilterExec from CoalesceBatches optimization rule, add fetch support [#18630](https://github.com/apache/datafusion/pull/18630) (Dandandan)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/datasource` [#18697](https://github.com/apache/datafusion/pull/18697) (kumarUjjawal)
+- chore: Enforce lint rule `clippy::needless_pass_by_value` to datafusion-datasource [#18682](https://github.com/apache/datafusion/pull/18682) (AryanBagade)
+- [main] Update changelog for 51.0.0 RC2 [#18710](https://github.com/apache/datafusion/pull/18710) (alamb)
+- Refactor Spark crc32/sha1 signatures [#18662](https://github.com/apache/datafusion/pull/18662) (Jefffrey)
+- CI: try free up space in `Rust / cargo test (amd64)` action [#18709](https://github.com/apache/datafusion/pull/18709) (Jefffrey)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-proto [#18715](https://github.com/apache/datafusion/pull/18715) (foskey51)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-spark [#18714](https://github.com/apache/datafusion/pull/18714) (foskey51)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/optimizer` [#18699](https://github.com/apache/datafusion/pull/18699) (kumarUjjawal)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/functions` [#18700](https://github.com/apache/datafusion/pull/18700) (kumarUjjawal)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/expr-common` [#18702](https://github.com/apache/datafusion/pull/18702) (kumarUjjawal)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/functions-aggregate` [#18716](https://github.com/apache/datafusion/pull/18716) (kumarUjjawal)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-execution [#18723](https://github.com/apache/datafusion/pull/18723) (foskey51)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/functions-nested` [#18724](https://github.com/apache/datafusion/pull/18724) (kumarUjjawal)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-substrait [#18703](https://github.com/apache/datafusion/pull/18703) (foskey51)
+- chore: Refactor with assert_or_internal_err!() in datafusion/spark. [#18674](https://github.com/apache/datafusion/pull/18674) (codetyri0n)
+- Minor: Add docs to release/README.md about rate limits [#18704](https://github.com/apache/datafusion/pull/18704) (alamb)
+- Consolidate query planning examples (#18142) [#18690](https://github.com/apache/datafusion/pull/18690) (cj-zhukov)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/physical-expr-common` [#18735](https://github.com/apache/datafusion/pull/18735) (kumarUjjawal)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/physical-expr` [#18736](https://github.com/apache/datafusion/pull/18736) (kumarUjjawal)
+- Consolidate ArrowFileSource and ArrowStreamFileSource [#18720](https://github.com/apache/datafusion/pull/18720) (adriangb)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/physical-optimizer` [#18732](https://github.com/apache/datafusion/pull/18732) (kumarUjjawal)
+- refactor: reduce duplication in make_udf_function macro [#18733](https://github.com/apache/datafusion/pull/18733) (shashidhar-bm)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/physical-plan` [#18730](https://github.com/apache/datafusion/pull/18730) (kumarUjjawal)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-functions-aggregate-common [#18741](https://github.com/apache/datafusion/pull/18741) (foskey51)
+- Optimize NullState::build [#18737](https://github.com/apache/datafusion/pull/18737) (Dandandan)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-datasource-parquet [#18695](https://github.com/apache/datafusion/pull/18695) (foskey51)
+- minor: refactor with `assert_or_internal_err!()` in `datafusion/expr` [#18731](https://github.com/apache/datafusion/pull/18731) (kumarUjjawal)
+- minor: Fix an example in the `PruningPredicate` documentation [#18742](https://github.com/apache/datafusion/pull/18742) (2010YOUY01)
+- chore(deps): bump indicatif from 0.18.2 to 0.18.3 [#18756](https://github.com/apache/datafusion/pull/18756) (dependabot[bot])
+- Fix map_query_sql benchmark duplicate key error [#18427](https://github.com/apache/datafusion/pull/18427) (atheendre130505)
+- minor: enforce lint rule clippy::needless_pass_by_value to datafusion-ffi [#18764](https://github.com/apache/datafusion/pull/18764) (Standing-Man)
+- Rename boolean `Interval` constants to match `NullableInterval` [#18654](https://github.com/apache/datafusion/pull/18654) (pepijnve)
+- chore(deps): bump bytes from 1.10.1 to 1.11.0 [#18755](https://github.com/apache/datafusion/pull/18755) (dependabot[bot])
+- CI: Fix `main` branch CI test failure [#18792](https://github.com/apache/datafusion/pull/18792) (2010YOUY01)
+- chore: Enforce 'clippy::needless_pass_by_value' to datafusion-expr-common [#18775](https://github.com/apache/datafusion/pull/18775) (petern48)
+- chore: Finish refactor with `assert_or_internal_err!()` [#18790](https://github.com/apache/datafusion/pull/18790) (2010YOUY01)
+- Switch from xz2 to liblzma to reduce duplicate dependencies [#17509](https://github.com/apache/datafusion/pull/17509) (timsaucer)
+- chore(deps): bump taiki-e/install-action from 2.62.51 to 2.62.53 [#18796](https://github.com/apache/datafusion/pull/18796) (dependabot[bot])
+- chore(deps): bump actions/checkout from 5.0.0 to 5.0.1 [#18797](https://github.com/apache/datafusion/pull/18797) (dependabot[bot])
+- Misc improvements to ProjectionExprs [#18719](https://github.com/apache/datafusion/pull/18719) (adriangb)
+- Fix incorrect link for sql_query.rs example in README [#18807](https://github.com/apache/datafusion/pull/18807) (kondamudikarthik)
+- Adds prefix filtering for table URLs [#18780](https://github.com/apache/datafusion/pull/18780) (BlakeOrth)
+- Refactor InListExpr to support structs by re-using existing hashing infrastructure [#18449](https://github.com/apache/datafusion/pull/18449) (adriangb)
+- chore: Add script to protect RC branches during the release [#18660](https://github.com/apache/datafusion/pull/18660) (comphead)
+- Prevent overflow and panics when casting DATE to TIMESTAMP by validating bounds [#18761](https://github.com/apache/datafusion/pull/18761) (kosiew)
+- chore(deps): bump taiki-e/install-action from 2.62.53 to 2.62.54 [#18815](https://github.com/apache/datafusion/pull/18815) (dependabot[bot])
+- CI : Enforce clippy: :needless_pass_by_value rule to datafusion-functions-aggregate [#18805](https://github.com/apache/datafusion/pull/18805) (codetyri0n)
+- Consolidate sql operations examples (#18142) [#18743](https://github.com/apache/datafusion/pull/18743) (cj-zhukov)
+- Move `GuaranteeRewriter` to datafusion_expr [#18821](https://github.com/apache/datafusion/pull/18821) (pepijnve)
+- Refactor state management in `HashJoinExec` and use CASE expressions for more precise filters [#18451](https://github.com/apache/datafusion/pull/18451) (adriangb)
+- Refactor avg & sum signatures away from user defined [#18769](https://github.com/apache/datafusion/pull/18769) (Jefffrey)
+- Hash UnionArrays [#18718](https://github.com/apache/datafusion/pull/18718) (friendlymatthew)
+- CI: add clippy::needless_pass_by_value rule to datafusion-functions-window crate [#18838](https://github.com/apache/datafusion/pull/18838) (codetyri0n)
+- Add field to DynamicPhysicalExpr to indicate when the filter is complete or updated [#18799](https://github.com/apache/datafusion/pull/18799) (LiaCastaneda)
+- #17801 Improve nullability reporting of case expressions [#17813](https://github.com/apache/datafusion/pull/17813) (pepijnve)
+- Consolidate execution monitoring examples (#18142) [#18846](https://github.com/apache/datafusion/pull/18846) (cj-zhukov)
+- Implement CatalogProviderList in FFI [#18657](https://github.com/apache/datafusion/pull/18657) (timsaucer)
+- Removed incorrect union check in enforce_sorting and updated tests [#18661](https://github.com/apache/datafusion/pull/18661) (gene-bordegaray)
+- chore(deps): bump actions/checkout from 5.0.1 to 6.0.0 [#18865](https://github.com/apache/datafusion/pull/18865) (dependabot[bot])
+- Remove unnecessary bit counting code from spark `bit_count` [#18841](https://github.com/apache/datafusion/pull/18841) (pepijnve)
+- Fix async_udf batch size behaviour [#18819](https://github.com/apache/datafusion/pull/18819) (shivbhatia10)
+- Fix Partial AggregateExec correctness issue dropping rows [#18712](https://github.com/apache/datafusion/pull/18712) (xanderbailey)
+- chore: Add missing boolean tests to `bit_count` Spark function [#18871](https://github.com/apache/datafusion/pull/18871) (comphead)
+- Consolidate proto examples (#18142) [#18861](https://github.com/apache/datafusion/pull/18861) (cj-zhukov)
+- Use logical null count in `case_when_with_expr` [#18872](https://github.com/apache/datafusion/pull/18872) (pepijnve)
+- chore: enforce `clippy::needless_pass_by_value` to `datafusion-physical-plan` [#18864](https://github.com/apache/datafusion/pull/18864) (2010YOUY01)
+- Refactor spark `bit_get()` signature away from user defined [#18836](https://github.com/apache/datafusion/pull/18836) (Jefffrey)
+- minor: enforce lint rule clippy::needless_pass_by_value to datafusion-functions [#18768](https://github.com/apache/datafusion/pull/18768) (Standing-Man)
+- chore: enforce clippy lint needless_pass_by_value to datafusion-functions-nested [#18839](https://github.com/apache/datafusion/pull/18839) (foskey51)
+- chore: fix CI on main [#18876](https://github.com/apache/datafusion/pull/18876) (Jefffrey)
+- chore: update Repartition DisplayAs to indicate maintained sort order [#18673](https://github.com/apache/datafusion/pull/18673) (ruchirK)
+- implement sum for durations [#18853](https://github.com/apache/datafusion/pull/18853) (logan-keede)
+- Consolidate dataframe examples (#18142) [#18862](https://github.com/apache/datafusion/pull/18862) (cj-zhukov)
+- Avoid the need to rewrite expressions when evaluating logical case nullability [#18849](https://github.com/apache/datafusion/pull/18849) (pepijnve)
+- Avoid skew in Roundrobin repartition [#18880](https://github.com/apache/datafusion/pull/18880) (Dandandan)
+- Add benchmark for array_has/array_has_all/array_has_any [#18729](https://github.com/apache/datafusion/pull/18729) (zhuqi-lucas)
+- chore(deps): bump taiki-e/install-action from 2.62.54 to 2.62.56 [#18899](https://github.com/apache/datafusion/pull/18899) (dependabot[bot])
+- chore(deps): bump indicatif from 0.18.0 to 0.18.3 [#18897](https://github.com/apache/datafusion/pull/18897) (dependabot[bot])
+- chore(deps): bump tokio-util from 0.7.16 to 0.7.17 [#18898](https://github.com/apache/datafusion/pull/18898) (dependabot[bot])
+- Support Non-Literal Expressions in Substrait VirtualTable Values and Improve Round-Trip Robustness [#18866](https://github.com/apache/datafusion/pull/18866) (kosiew)
+- chore(deps): bump indexmap from 2.12.0 to 2.12.1 [#18895](https://github.com/apache/datafusion/pull/18895) (dependabot[bot])
+- chore(deps): bump aws-config from 1.8.7 to 1.8.11 [#18896](https://github.com/apache/datafusion/pull/18896) (dependabot[bot])
+- chore(deps): bump flate2 from 1.1.4 to 1.1.5 [#18900](https://github.com/apache/datafusion/pull/18900) (dependabot[bot])
+- Add iter() method to `Extensions` [#18887](https://github.com/apache/datafusion/pull/18887) (gabotechs)
+- chore: Enforce `clippy::needless_pass_by_value` globally across the workspace [#18904](https://github.com/apache/datafusion/pull/18904) (2010YOUY01)
+- Consolidate external dependency examples (#18142) [#18747](https://github.com/apache/datafusion/pull/18747) (cj-zhukov)
+- Optimize planning for projected nested union [#18713](https://github.com/apache/datafusion/pull/18713) (logan-keede)
+- chore(deps): bump taiki-e/install-action from 2.62.56 to 2.62.57 [#18927](https://github.com/apache/datafusion/pull/18927) (dependabot[bot])
+- chore(deps): bump actions/setup-python from 6.0.0 to 6.1.0 [#18925](https://github.com/apache/datafusion/pull/18925) (dependabot[bot])
+- Fix `map` function alias handling in SQL planner [#18914](https://github.com/apache/datafusion/pull/18914) (friendlymatthew)
+- minor: add builder setting `NdJsonReadOptions::schema_infer_max_records` [#18920](https://github.com/apache/datafusion/pull/18920) (Jefffrey)
+- Implement Substrait Support for `GROUPING SET CUBE` [#18798](https://github.com/apache/datafusion/pull/18798) (kosiew)
+- chore: unify common dependencies as workspace dependencies [#18665](https://github.com/apache/datafusion/pull/18665) (Jefffrey)
+- Fix bug where binary types were incorrectly being casted for coercible signatures [#18750](https://github.com/apache/datafusion/pull/18750) (Jefffrey)
+- Refactor approx_median signature & support f16 [#18647](https://github.com/apache/datafusion/pull/18647) (Jefffrey)
+- Refactor `to_local_time()` signature away from user_defined [#18707](https://github.com/apache/datafusion/pull/18707) (Jefffrey)
+- chore(deps-dev): bump node-forge from 1.3.1 to 1.3.2 in /datafusion/wasmtest/datafusion-wasm-app [#18958](https://github.com/apache/datafusion/pull/18958) (dependabot[bot])
+- Support LikeMatch, ILikeMatch, NotLikeMatch, NotILikeMatch operators in protobuf serialization [#18961](https://github.com/apache/datafusion/pull/18961) (zhuqi-lucas)
+- chore: cargo fmt to fix CI [#18969](https://github.com/apache/datafusion/pull/18969) (Jefffrey)
+- chore(deps): bump Swatinem/rust-cache from 2.8.1 to 2.8.2 [#18963](https://github.com/apache/datafusion/pull/18963) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.57 to 2.62.58 [#18964](https://github.com/apache/datafusion/pull/18964) (dependabot[bot])
+- chore(deps): bump crate-ci/typos from 1.39.2 to 1.40.0 [#18965](https://github.com/apache/datafusion/pull/18965) (dependabot[bot])
+- [Minor] Refactor `traverse_chain` macro to function [#18951](https://github.com/apache/datafusion/pull/18951) (Dandandan)
+- Enable clippy::allow_attributes lint for datafusion-catalog [#18973](https://github.com/apache/datafusion/pull/18973) (chakkk309)
+- chore: update group of crates to rust 2024 edition [#18915](https://github.com/apache/datafusion/pull/18915) (timsaucer)
+- chore(deps): bump taiki-e/install-action from 2.62.58 to 2.62.59 [#18978](https://github.com/apache/datafusion/pull/18978) (dependabot[bot])
+- Simplify percentile_cont for 0/1 percentiles [#18837](https://github.com/apache/datafusion/pull/18837) (kumarUjjawal)
+- chore: enforce clippy::allow_attributes for functions-\* crates [#18986](https://github.com/apache/datafusion/pull/18986) (carlosahs)
+- chore: enforce clippy::allow_attributes for common crates [#18988](https://github.com/apache/datafusion/pull/18988) (chakkk309)
+- Fix predicate_rows_pruned & predicate_rows_matched metrics [#18980](https://github.com/apache/datafusion/pull/18980) (xudong963)
+- Allocate a buffer of the correct length for ScalarValue::FixedSizeBinary in ScalarValue::to_array_of_size [#18903](https://github.com/apache/datafusion/pull/18903) (tobixdev)
+- Fix error planning aggregates with duplicated names in select list [#18831](https://github.com/apache/datafusion/pull/18831) (tshauck)
+- chore: remove `deny`s of `needless_pass_by_value` in `lib.rs` files [#18996](https://github.com/apache/datafusion/pull/18996) (Jefffrey)
+- Add Explicit Error Handling for Unsupported SQL `FETCH` Clause in Planner and CLI [#18691](https://github.com/apache/datafusion/pull/18691) (kosiew)
+- chore(deps): bump criterion from 0.7.0 to 0.8.0 [#19009](https://github.com/apache/datafusion/pull/19009) (dependabot[bot])
+- chore(deps): bump syn from 2.0.108 to 2.0.111 [#19011](https://github.com/apache/datafusion/pull/19011) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.59 to 2.62.60 [#19012](https://github.com/apache/datafusion/pull/19012) (dependabot[bot])
+- chore: remove redundant clone code [#18997](https://github.com/apache/datafusion/pull/18997) (Smith-Cruise)
+- Update to `arrow`, `parquet` to `57.1.0` [#18820](https://github.com/apache/datafusion/pull/18820) (alamb)
+- deny on allow_attributes lint in physical-plan [#18983](https://github.com/apache/datafusion/pull/18983) (YuraLitvinov)
+- Add additional test coverage of multi-value PartitionPruningStats [#19021](https://github.com/apache/datafusion/pull/19021) (alamb)
+- Fix tpch benchmark harness [#19033](https://github.com/apache/datafusion/pull/19033) (alamb)
+- Fix data for tpch_csv and tpch_csv10 [#19034](https://github.com/apache/datafusion/pull/19034) (alamb)
+- chore: update group of 3 crates to rust 2024 edition [#19001](https://github.com/apache/datafusion/pull/19001) (timsaucer)
+- chore(deps-dev): bump express from 4.21.2 to 4.22.1 in /datafusion/wasmtest/datafusion-wasm-app [#19040](https://github.com/apache/datafusion/pull/19040) (dependabot[bot])
+- Allow repartitioning on files with ranges [#18948](https://github.com/apache/datafusion/pull/18948) (Samyak2)
+- Support simplify not for physical expr [#18970](https://github.com/apache/datafusion/pull/18970) (xudong963)
+- dev: Add typos check to the local `dev/rust_lint.sh` [#17863](https://github.com/apache/datafusion/pull/17863) (2010YOUY01)
+- Implement FFI_PhysicalExpr and the structs it needs to support it. [#18916](https://github.com/apache/datafusion/pull/18916) (timsaucer)
+- chore(deps): bump actions/setup-node from 6.0.0 to 6.1.0 [#19063](https://github.com/apache/datafusion/pull/19063) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.60 to 2.62.61 [#19062](https://github.com/apache/datafusion/pull/19062) (dependabot[bot])
+- chore(deps): bump actions/stale from 10.1.0 to 10.1.1 [#19061](https://github.com/apache/datafusion/pull/19061) (dependabot[bot])
+- chore: merge make_array and spark array [#19006](https://github.com/apache/datafusion/pull/19006) (jizezhang)
+- chore(deps): bump actions/checkout from 6.0.0 to 6.0.1 [#19060](https://github.com/apache/datafusion/pull/19060) (dependabot[bot])
+- Add documentation example for `PartitionPruningStatistics` [#19020](https://github.com/apache/datafusion/pull/19020) (alamb)
+- chore: upgrade expr and execution crates to rust 2024 edition [#19047](https://github.com/apache/datafusion/pull/19047) (timsaucer)
+- refactor: Refactor spark make_interval signature away from user defined [#19027](https://github.com/apache/datafusion/pull/19027) (kumarUjjawal)
+- Fix: Align sort_merge_join filter output with join schema to fix right-anti panic [#18800](https://github.com/apache/datafusion/pull/18800) (kumarUjjawal)
+- Support Substrait Round-Trip of `EmptyRelation` Including `produce_one_row` Semantics [#18842](https://github.com/apache/datafusion/pull/18842) (kosiew)
+- chore(deps): bump taiki-e/install-action from 2.62.61 to 2.62.62 [#19081](https://github.com/apache/datafusion/pull/19081) (dependabot[bot])
+- chore: enforce clippy::allow_attributes for datasource crates [#19068](https://github.com/apache/datafusion/pull/19068) (chakkk309)
+- common: Add hashing support for REE arrays [#18981](https://github.com/apache/datafusion/pull/18981) (brancz)
+- Use `tpchgen-cli` to generate tpch data in bench.sh [#19035](https://github.com/apache/datafusion/pull/19035) (alamb)
+- Update aggregate probe to be locked only if skipping aggregation [#18766](https://github.com/apache/datafusion/pull/18766) (hareshkh)
+- Fix function doc CI check [#19093](https://github.com/apache/datafusion/pull/19093) (alamb)
+- Fix Schema Duplication Errors in Self‑Referential INTERSECT/EXCEPT by Requalifying Input Sides [#18814](https://github.com/apache/datafusion/pull/18814) (kosiew)
+- run cargo fmt to fix after #18998 [#19102](https://github.com/apache/datafusion/pull/19102) (adriangb)
+- bench: set test_util as required feature for aggregate_vectorized [#19101](https://github.com/apache/datafusion/pull/19101) (rluvaton)
+- use ProjectionExprs:project_statistics in FileScanConfig [#19094](https://github.com/apache/datafusion/pull/19094) (adriangb)
+- Temporarily ignore test_cache_with_ttl_and_lru test [#19115](https://github.com/apache/datafusion/pull/19115) (alamb)
+- refactor: move human readable display utilities to `datafusion-common` crate [#19080](https://github.com/apache/datafusion/pull/19080) (2010YOUY01)
+- Always remove unecessary software from github runners for all jobs (fix intermittent out of space on runners) [#19122](https://github.com/apache/datafusion/pull/19122) (alamb)
+- [datafusion-spark]: Refactor make_dt_interval's signature away from user defined [#19083](https://github.com/apache/datafusion/pull/19083) (codetyri0n)
+- fix deprecation notes with incorrect versions from #13083 [#19135](https://github.com/apache/datafusion/pull/19135) (adriangb)
+- Run the examples in the new format [#18946](https://github.com/apache/datafusion/pull/18946) (cj-zhukov)
+- Add constant expression evaluator to physical expression simplifier [#19130](https://github.com/apache/datafusion/pull/19130) (adriangb)
+- Fix shuffle function to report nullability correctly [#19184](https://github.com/apache/datafusion/pull/19184) (harshitsaini17)
+- chore: enforce clippy::allow_attributes for physical crates [#19185](https://github.com/apache/datafusion/pull/19185) (carlosahs)
+- Update 5 crates to rust 2024 edition [#19091](https://github.com/apache/datafusion/pull/19091) (timsaucer)
+- Coalesce batches inside hash join, reuse indices buffer [#18972](https://github.com/apache/datafusion/pull/18972) (Dandandan)
+- slt test coverage for `CASE` exprs with constant value lookup tables [#19143](https://github.com/apache/datafusion/pull/19143) (alamb)
+- Fix fmt after logical conflict [#19208](https://github.com/apache/datafusion/pull/19208) (alamb)
+- chore: Add TPCDS benchmarks [#19138](https://github.com/apache/datafusion/pull/19138) (comphead)
+- Arc partition values in TableSchema [#19137](https://github.com/apache/datafusion/pull/19137) (adriangb)
+- Add sorted data benchmark. [#19042](https://github.com/apache/datafusion/pull/19042) (zhuqi-lucas)
+- Refactor PhysicalExprSimplfier to &self instead of &mut self [#19212](https://github.com/apache/datafusion/pull/19212) (adriangb)
+- chore(deps): bump uuid from 1.18.1 to 1.19.0 [#19199](https://github.com/apache/datafusion/pull/19199) (dependabot[bot])
+- chore(deps): bump async-compression from 0.4.34 to 0.4.35 [#19201](https://github.com/apache/datafusion/pull/19201) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.62.62 to 2.62.63 [#19198](https://github.com/apache/datafusion/pull/19198) (dependabot[bot])
+- chore(deps): bump tracing-subscriber from 0.3.20 to 0.3.22 [#19200](https://github.com/apache/datafusion/pull/19200) (dependabot[bot])
+- chore(deps): bump wasm-bindgen-test from 0.3.55 to 0.3.56 [#19202](https://github.com/apache/datafusion/pull/19202) (dependabot[bot])
+- bench: add dedicated Utf8View benchmarks for InList [#19211](https://github.com/apache/datafusion/pull/19211) (geoffreyclaude)
+- Fix PruningPredicate interaction with DynamicFilterPhysicalExpr that references partition columns [#19129](https://github.com/apache/datafusion/pull/19129) (adriangb)
+- Implement physical and logical codecs in FFI [#19079](https://github.com/apache/datafusion/pull/19079) (timsaucer)
+- refactor: Refactor spark width bucket signature away from user defined [#19065](https://github.com/apache/datafusion/pull/19065) (kumarUjjawal)
+- Sort Merge Join: Reduce batch concatenation, use `BatchCoalescer`, new benchmarks (TPC-H Q21 SMJ up to ~4000x faster) [#18875](https://github.com/apache/datafusion/pull/18875) (mbutrovich)
+- Add relation planner extension support to customize SQL planning [#17843](https://github.com/apache/datafusion/pull/17843) (geoffreyclaude)
+- Add additional tests for InListExpr [#19050](https://github.com/apache/datafusion/pull/19050) (adriangb)
+- chore(deps): bump taiki-e/install-action from 2.62.63 to 2.62.64 [#19226](https://github.com/apache/datafusion/pull/19226) (dependabot[bot])
+- Use strum in the examples (#19126) [#19205](https://github.com/apache/datafusion/pull/19205) (cj-zhukov)
+- [Proto]: Serialization support for `AsyncFuncExec` [#19118](https://github.com/apache/datafusion/pull/19118) (mach-kernel)
+- chore: add test case for decimal overflow [#19255](https://github.com/apache/datafusion/pull/19255) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.62.64 to 2.62.65 [#19251](https://github.com/apache/datafusion/pull/19251) (dependabot[bot])
+- chore: update 6 crates to rust edition 2024 [#19196](https://github.com/apache/datafusion/pull/19196) (timsaucer)
+- Implement FFI_Session [#19223](https://github.com/apache/datafusion/pull/19223) (timsaucer)
+- Feat: Add an option for fast tests by gating slow tests to extended_tests feature [#19237](https://github.com/apache/datafusion/pull/19237) (Yuvraj-cyborg)
+- chore: enforce clippy::allow_attributes for 7 crates [#19133](https://github.com/apache/datafusion/pull/19133) (chakkk309)
+- dev: Add CI doc prettier check to local `rust_lint.sh` [#19254](https://github.com/apache/datafusion/pull/19254) (2010YOUY01)
+- bug: Eliminate dead round-robin insertion in enforce distribution [#19132](https://github.com/apache/datafusion/pull/19132) (gene-bordegaray)
+- Automatically download tpcds benchmark data to the right place [#19244](https://github.com/apache/datafusion/pull/19244) (alamb)
+- [datafusion-spark]: Refactor hex's signature away from user_defined [#19235](https://github.com/apache/datafusion/pull/19235) (codetyri0n)
+- fix : correct nullability propagation for spark.bitwise_not [#19224](https://github.com/apache/datafusion/pull/19224) (shifluxxc)
+- added custom nullability for char [#19268](https://github.com/apache/datafusion/pull/19268) (skushagra)
+- replace HashTableLookupExpr with lit(true) in proto serialization [#19300](https://github.com/apache/datafusion/pull/19300) (adriangb)
+- chore: fix return_field_from_args doc [#19307](https://github.com/apache/datafusion/pull/19307) (xumingming)
+- chore: enforce clippy::allow_attributes for spark,sql,sustrait [#19309](https://github.com/apache/datafusion/pull/19309) (kumarUjjawal)
+- Simplify make_date & fix null handling [#19296](https://github.com/apache/datafusion/pull/19296) (Jefffrey)
+- Allow base64 encoding of fixedsizebinary arrays [#18950](https://github.com/apache/datafusion/pull/18950) (maxburke)
+- chore: update 11 crates to Rust 2024 edition [#19258](https://github.com/apache/datafusion/pull/19258) (timsaucer)
+- Minor: remove unnecessary unit tests for fixed size binary [#19318](https://github.com/apache/datafusion/pull/19318) (alamb)
+- Populate partition column statistics for PartitionedFile [#19284](https://github.com/apache/datafusion/pull/19284) (adriangb)
+- refactor: move metrics module to `datafusion-common` crate [#19247](https://github.com/apache/datafusion/pull/19247) (2010YOUY01)
+- chore(deps): bump taiki-e/install-action from 2.62.65 to 2.62.67 [#19295](https://github.com/apache/datafusion/pull/19295) (dependabot[bot])
+- chore(deps): bump ctor from 0.6.1 to 0.6.3 [#19328](https://github.com/apache/datafusion/pull/19328) (dependabot[bot])
+- Refactor `power()` signature away from user defined [#18968](https://github.com/apache/datafusion/pull/18968) (Jefffrey)
+- chore: enforce `clippy::allow_attributes` for optimizer and macros [#19310](https://github.com/apache/datafusion/pull/19310) (kumarUjjawal)
+- chore(deps): bump taiki-e/install-action from 2.62.67 to 2.63.3 [#19349](https://github.com/apache/datafusion/pull/19349) (dependabot[bot])
+- chore(deps): bump clap from 4.5.50 to 4.5.53 [#19326](https://github.com/apache/datafusion/pull/19326) (dependabot[bot])
+- chore(deps): bump insta from 1.43.2 to 1.44.3 [#19327](https://github.com/apache/datafusion/pull/19327) (dependabot[bot])
+- remove repartition exec from coalesce batches optimizer [#19239](https://github.com/apache/datafusion/pull/19239) (jizezhang)
+- minor: cleanup unnecessary config in `decimal.slt` [#19352](https://github.com/apache/datafusion/pull/19352) (Jefffrey)
+- Fix panic for `GROUPING SETS(())` and handle empty-grouping aggregates [#19252](https://github.com/apache/datafusion/pull/19252) (kosiew)
+- Update datafusion-core crate to Rust 2024 edition [#19332](https://github.com/apache/datafusion/pull/19332) (timsaucer)
+- Update 4 crates to rust 2024 edition [#19357](https://github.com/apache/datafusion/pull/19357) (timsaucer)
+- preserve Field metadata in first_value/last_value [#19335](https://github.com/apache/datafusion/pull/19335) (adriangb)
+- Fix flaky SpillPool channel test by synchronizing reader and writer tasks [#19110](https://github.com/apache/datafusion/pull/19110) (kosiew)
+- [minor] Upgrade rust version [#19363](https://github.com/apache/datafusion/pull/19363) (Dandandan)
+- Minor: fix cargo fmt [#19368](https://github.com/apache/datafusion/pull/19368) (zhuqi-lucas)
+- chore: enforce clippy::allow_attributes for proto, pruning, session [#19350](https://github.com/apache/datafusion/pull/19350) (kumarUjjawal)
+- Update remaining crates to rust 2024 edition [#19361](https://github.com/apache/datafusion/pull/19361) (timsaucer)
+- Minor: Make `ProjectionExpr::new` easier to use with constants [#19343](https://github.com/apache/datafusion/pull/19343) (alamb)
+- Feat: DefaultListFilesCache prefix-aware for partition pruning optimization [#19298](https://github.com/apache/datafusion/pull/19298) (Yuvraj-cyborg)
+- Extend in_list benchmark coverage [#19376](https://github.com/apache/datafusion/pull/19376) (geoffreyclaude)
+- [datafusion-cli] Implement average LIST duration for object store profiling [#19127](https://github.com/apache/datafusion/pull/19127) (peterxcli)
+- chore(deps): bump taiki-e/install-action from 2.63.3 to 2.64.0 [#19382](https://github.com/apache/datafusion/pull/19382) (dependabot[bot])
+- update insta snapshots [#19381](https://github.com/apache/datafusion/pull/19381) (kosiew)
+- Fix regression for negative-scale decimal128 in log [#19315](https://github.com/apache/datafusion/pull/19315) (shifluxxc)
+- Fix input handling for encoding functions & various refactors [#18754](https://github.com/apache/datafusion/pull/18754) (Jefffrey)
+- Fix ORDER BY positional reference regression with aliased aggregates [#19412](https://github.com/apache/datafusion/pull/19412) (adriangb)
+- Implement disk spilling for all grouping ordering modes in GroupedHashAggregateStream [#19287](https://github.com/apache/datafusion/pull/19287) (pepijnve)
+- refactor: add ParquetOpenerBuilder to reduce test code duplication [#19405](https://github.com/apache/datafusion/pull/19405) (shashidhar-bm)
+- bench: add `range_and_generate_series` [#19428](https://github.com/apache/datafusion/pull/19428) (rluvaton)
+- chore: use extend instead of manual loop in multi group by [#19429](https://github.com/apache/datafusion/pull/19429) (rluvaton)
+- chore(deps): bump taiki-e/install-action from 2.64.0 to 2.64.2 [#19399](https://github.com/apache/datafusion/pull/19399) (dependabot[bot])
+- Add recursive protection on planner's `create_physical_expr` [#19299](https://github.com/apache/datafusion/pull/19299) (rgehan)
+- chore(deps): bump aws-config from 1.8.11 to 1.8.12 [#19453](https://github.com/apache/datafusion/pull/19453) (dependabot[bot])
+- chore(deps): bump log from 0.4.28 to 0.4.29 [#19452](https://github.com/apache/datafusion/pull/19452) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.64.2 to 2.65.1 [#19451](https://github.com/apache/datafusion/pull/19451) (dependabot[bot])
+- chore(deps): bump insta from 1.44.3 to 1.45.0 [#19454](https://github.com/apache/datafusion/pull/19454) (dependabot[bot])
+- added support for negative scale for log decimal32/64 and power [#19409](https://github.com/apache/datafusion/pull/19409) (shifluxxc)
+- Remove core dependency from ffi [#19422](https://github.com/apache/datafusion/pull/19422) (timsaucer)
+- bench: increase in_list benchmark coverage [#19443](https://github.com/apache/datafusion/pull/19443) (geoffreyclaude)
+- Use SortMergeJoinExec name consistently in physical plan outputs [#19246](https://github.com/apache/datafusion/pull/19246) (xavlee)
+- Fix panic during spill to disk in clickbench query [#19421](https://github.com/apache/datafusion/pull/19421) (alamb)
+- Optimize memory footprint of view arrays from `ScalarValue::to_array_of_size` [#19441](https://github.com/apache/datafusion/pull/19441) (Jefffrey)
+- minor: refactoring of some `ScalarValue` code [#19439](https://github.com/apache/datafusion/pull/19439) (Jefffrey)
+- Refactor Spark crc32 & sha1 to remove unnecessary scalar argument check [#19466](https://github.com/apache/datafusion/pull/19466) (Jefffrey)
+- Add link to arrow-rs ticket in comments [#19479](https://github.com/apache/datafusion/pull/19479) (alamb)
+- chore(deps): bump taiki-e/install-action from 2.65.1 to 2.65.2 [#19474](https://github.com/apache/datafusion/pull/19474) (dependabot[bot])
+- Improve plan_to_sql handling of empty projections with dialect-specific SELECT list support [#19221](https://github.com/apache/datafusion/pull/19221) (kosiew)
+- examples: replace sql_dialect with custom_sql_parser example [#19383](https://github.com/apache/datafusion/pull/19383) (geoffreyclaude)
+- Replace custom merge operator with arrow-rs implementation [#19424](https://github.com/apache/datafusion/pull/19424) (pepijnve)
+- Implement nested recursive CTEs [#18956](https://github.com/apache/datafusion/pull/18956) (Tpt)
+- Add: PI upper/lower bound f16 constants to ScalarValue [#19497](https://github.com/apache/datafusion/pull/19497) (xonx4l)
+- chore: enforce clippy::allow_attributes for datafusion-ffi crate [#19480](https://github.com/apache/datafusion/pull/19480) (chakkk309)
+- Add CI check to ensure examples are documented in README [#19371](https://github.com/apache/datafusion/pull/19371) (cj-zhukov)
+- fix : snapshot to the modern multiline format [#19517](https://github.com/apache/datafusion/pull/19517) (Nachiket-Roy)
+- chore(deps): bump taiki-e/install-action from 2.65.2 to 2.65.3 [#19499](https://github.com/apache/datafusion/pull/19499) (dependabot[bot])
+- docs : clarify unused test utility [#19508](https://github.com/apache/datafusion/pull/19508) (Nachiket-Roy)
+- Date / time / interval arithmetic improvements [#19460](https://github.com/apache/datafusion/pull/19460) (Omega359)
+- Preserve ORDER BY in Unparser for projection -> order by pattern [#19483](https://github.com/apache/datafusion/pull/19483) (adriangb)
+- Redesign the try_reverse_output to support more cases [#19446](https://github.com/apache/datafusion/pull/19446) (zhuqi-lucas)
+- refactor: Spark `ascii` signature away from `user_defined` [#19513](https://github.com/apache/datafusion/pull/19513) (kumarUjjawal)
+- Fix: SparkAscii nullability to depend on input nullability [#19531](https://github.com/apache/datafusion/pull/19531) (Yuvraj-cyborg)
+- chore(deps): bump tracing from 0.1.41 to 0.1.43 [#19543](https://github.com/apache/datafusion/pull/19543) (dependabot[bot])
+- chore(deps): bump substrait from 0.62.0 to 0.62.2 [#19542](https://github.com/apache/datafusion/pull/19542) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.65.3 to 2.65.6 [#19541](https://github.com/apache/datafusion/pull/19541) (dependabot[bot])
+- minor: run all examples by default [#19506](https://github.com/apache/datafusion/pull/19506) (theirix)
+- Refactor TopKHashTable to use HashTable API [#19464](https://github.com/apache/datafusion/pull/19464) (Dandandan)
+- Revert Spark Elt nullability change [#19510](https://github.com/apache/datafusion/pull/19510) (Jefffrey)
+- minor: implement more arms for `get_data_types()` for `NativeType` [#19449](https://github.com/apache/datafusion/pull/19449) (Jefffrey)
+- Upgrade hashbrown to 0.16 [#19554](https://github.com/apache/datafusion/pull/19554) (Dandandan)
+- minor : add crypto function benchmark [#19539](https://github.com/apache/datafusion/pull/19539) (getChan)
+- chore(deps): bump taiki-e/install-action from 2.65.6 to 2.65.8 [#19559](https://github.com/apache/datafusion/pull/19559) (dependabot[bot])
+- bugfix: preserve schema metadata for record batch in FFI [#19293](https://github.com/apache/datafusion/pull/19293) (timsaucer)
+- refactor: extract the data generate out of aggregate_topk benchmark [#19523](https://github.com/apache/datafusion/pull/19523) (haohuaijin)
+- Compute Dynamic Filters only when a consumer supports them [#19546](https://github.com/apache/datafusion/pull/19546) (LiaCastaneda)
+- Various refactors to string functions [#19402](https://github.com/apache/datafusion/pull/19402) (Jefffrey)
+- Implement `partition_statistics` API for `NestedLoopJoinExec` [#19468](https://github.com/apache/datafusion/pull/19468) (kumarUjjawal)
+- Replace deprecated structopt with clap in datafusion-benchmarks [#19492](https://github.com/apache/datafusion/pull/19492) (Yuvraj-cyborg)
+- Refactor duplicate code in `type_coercion/functions.rs` [#19518](https://github.com/apache/datafusion/pull/19518) (Jefffrey)
+- chore(deps): bump taiki-e/install-action from 2.65.8 to 2.65.10 [#19578](https://github.com/apache/datafusion/pull/19578) (dependabot[bot])
+- perf: Improve performance of hex encoding in spark functions [#19586](https://github.com/apache/datafusion/pull/19586) (shashidhar-bm)
+- Add left function benchmark [#19600](https://github.com/apache/datafusion/pull/19600) (viirya)
+- chore: Add TPCDS benchmark comparison for PR [#19552](https://github.com/apache/datafusion/pull/19552) (comphead)
+- chore(deps): bump taiki-e/install-action from 2.65.10 to 2.65.11 [#19601](https://github.com/apache/datafusion/pull/19601) (dependabot[bot])
+- chore: bump testcontainers-modules to 0.14 and remove testcontainers dep [#19620](https://github.com/apache/datafusion/pull/19620) (Jefffrey)
+- Validate parquet writer version [#19515](https://github.com/apache/datafusion/pull/19515) (AlyAbdelmoneim)
+- chore(deps): bump insta from 1.45.0 to 1.46.0 [#19643](https://github.com/apache/datafusion/pull/19643) (dependabot[bot])
+- chore(deps): bump taiki-e/install-action from 2.65.11 to 2.65.13 [#19646](https://github.com/apache/datafusion/pull/19646) (dependabot[bot])
+- chore(deps): bump tracing from 0.1.43 to 0.1.44 [#19644](https://github.com/apache/datafusion/pull/19644) (dependabot[bot])
+- chore(deps): bump syn from 2.0.111 to 2.0.113 [#19645](https://github.com/apache/datafusion/pull/19645) (dependabot[bot])
+- Refactor `percentile_cont` to clarify support input types [#19611](https://github.com/apache/datafusion/pull/19611) (Jefffrey)
+- Add a protection to release candidate branch 52 [#19660](https://github.com/apache/datafusion/pull/19660) (xudong963)
+- Downgrade aws-smithy-runtime, update `rust_decimal`, ignore RUSTSEC-2026-0001 to get clean CI [#19657](https://github.com/apache/datafusion/pull/19657) (alamb)
+- Update dependencies [#19667](https://github.com/apache/datafusion/pull/19667) (alamb)
+- Refactor PartitionedFile: add ordering field and new_from_meta constructor [#19596](https://github.com/apache/datafusion/pull/19596) (adriangb)
+- Remove coalesce batches rule and deprecate CoalesceBatchesExec [#19622](https://github.com/apache/datafusion/pull/19622) (feniljain)
+- Perf: Optimize `substring_index` via single-byte fast path and direct indexing [#19590](https://github.com/apache/datafusion/pull/19590) (lyne7-sc)
+- refactor: Use `Signature::coercible` for isnan/iszero [#19604](https://github.com/apache/datafusion/pull/19604) (kumarUjjawal)
+- Parquet: Push down supported list predicates (array_has/any/all) during decoding [#19545](https://github.com/apache/datafusion/pull/19545) (kosiew)
+- Remove dependency on `rust_decimal`, remove ignore of `RUSTSEC-2026-0001` [#19666](https://github.com/apache/datafusion/pull/19666) (alamb)
+- Store example data directly inside the datafusion-examples (#19141) [#19319](https://github.com/apache/datafusion/pull/19319) (cj-zhukov)
+- minor: More comments to `ParquetOpener::open()` [#19677](https://github.com/apache/datafusion/pull/19677) (2010YOUY01)
+- Feat: Allow pow with negative & non-integer exponent on decimals [#19369](https://github.com/apache/datafusion/pull/19369) (Yuvraj-cyborg)
+- chore(deps): bump taiki-e/install-action from 2.65.13 to 2.65.15 [#19676](https://github.com/apache/datafusion/pull/19676) (dependabot[bot])
+- Refactor cache APIs to support ordering information [#19597](https://github.com/apache/datafusion/pull/19597) (adriangb)
+- Record sort order when writing Parquet with WITH ORDER [#19595](https://github.com/apache/datafusion/pull/19595) (adriangb)
+- implement var distinct [#19706](https://github.com/apache/datafusion/pull/19706) (thinh2)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    67	dependabot[bot]
+    38	Andrew Lamb
+    36	Jeffrey Vo
+    35	Kumar Ujjawal
+    34	Adrian Garcia Badaracco
+    22	Tim Saucer
+    19	Yongting You
+    13	Sergey Zhukov
+    11	Pepijn Van Eeckhoudt
+    11	kosiew
+    10	Daniël Heres
+    10	Dhanush
+    10	Oleks V
+     8	Geoffrey Claude
+     8	Raz Luvaton
+     7	Andy Grove
+     7	Liang-Chi Hsieh
+     7	Qi Zhu
+     6	Peter Nguyen
+     6	Shashidhar B M
+     5	Alan Tang
+     5	Alex Huang
+     5	Bruce Ritchie
+     5	Gene Bordegaray
+     5	Nuno Faria
+     5	Sriram Sundar
+     4	Blake Orth
+     4	Thomas Tanon
+     4	Yuvraj
+     4	theirix
+     3	Aryan Bagade
+     3	Chakkk
+     3	Emily Matheys
+     3	Huaijin
+     3	Khanh Duong
+     3	Kushagra S
+     3	Vedic Chawla
+     3	feniljain
+     3	harshit saini
+     3	jizezhang
+     3	shifluxxc
+     3	xonx
+     3	xudong.w
+     2	Carlos Hurtado
+     2	Chen Chongchen
+     2	Cora Sutton
+     2	Haresh Khanna
+     2	Lía Adriana
+     2	Manish Kumar
+     2	Martin Grigorov
+     2	Matthew Kim
+     2	Namgung Chan
+     2	Nimalan
+     2	Nithurshen
+     2	Rosai
+     2	Shubham Yadav
+     2	Trent Hauck
+     2	Vegard Stikbakke
+     2	Vrishabh
+     2	Xander
+     2	chakkk309
+     2	mag1c1an1
+     2	nlimpid
+     2	yqrz
+     1	Adam Curtis
+     1	Aly Abdelmoneim
+     1	Andrey Velichkevich
+     1	Arpit Bandejiya
+     1	Bharathwaj G
+     1	Bipul Lamsal
+     1	Clement de Groc
+     1	Congxian Qiu
+     1	David López
+     1	David Stancu
+     1	Devanshu
+     1	Dongpo Liu
+     1	EeshanBembi
+     1	Eshaan Gupta
+     1	Ethan Urbanski
+     1	Frederic Branczyk
+     1	Gabriel
+     1	Gohlub
+     1	Heran Lin
+     1	James Xu
+     1	Jatin Kumar singh
+     1	Karan Pradhan
+     1	Karthik Kondamudi
+     1	Kazantsev Maksim
+     1	Marco Neumann
+     1	Matt Butrovich
+     1	Max Burke
+     1	Michele Vigilante
+     1	Mikhail Zabaluev
+     1	Mohit rao
+     1	Ning Sun
+     1	Peter Lee
+     1	Quoc Anh
+     1	Ram
+     1	Randy
+     1	Renan GEHAN
+     1	Ruchir Khaitan
+     1	Samyak Sarnayak
+     1	Shiv Bhatia
+     1	Smith Cruise
+     1	Smotrov Oleksii
+     1	Solari Systems
+     1	Suhail
+     1	T2MIX
+     1	Tal Glanzman
+     1	Tamar
+     1	Tim-53
+     1	Tobias Schwarzinger
+     1	Ujjwal Kumar Tiwari
+     1	Willem Verstraeten
+     1	YuraLitvinov
+     1	bubulalabu
+     1	delamarch3
+     1	hsiang-c
+     1	r1b
+     1	rin
+     1	xavlee
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/52.1.0.md b/dev/changelog/52.1.0.md
new file mode 100644
index 0000000000000..97a1435c41a44
--- /dev/null
+++ b/dev/changelog/52.1.0.md
@@ -0,0 +1,46 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 52.1.0 Changelog
+
+This release consists of 3 commits from 3 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Documentation updates:**
+
+- [branch-52] Fix Internal error: Assertion failed: !self.finished: LimitedBatchCoalescer (#19785) [#19836](https://github.com/apache/datafusion/pull/19836) (alamb)
+
+**Other:**
+
+- [branch-52] fix: expose `ListFilesEntry` [#19818](https://github.com/apache/datafusion/pull/19818) (lonless9)
+- [branch 52] Fix grouping set subset satisfaction [#19855](https://github.com/apache/datafusion/pull/19855) (gabotechs)
+- Add BatchAdapter to simplify using PhysicalExprAdapter / Projector [#19877](https://github.com/apache/datafusion/pull/19877) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     1	Andrew Lamb
+     1	Gabriel
+     1	XL Liang
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/52.2.0.md b/dev/changelog/52.2.0.md
new file mode 100644
index 0000000000000..0801ec5e6a7ee
--- /dev/null
+++ b/dev/changelog/52.2.0.md
@@ -0,0 +1,47 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 52.2.0 Changelog
+
+This release consists of 5 commits from 3 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Other:**
+
+- [branch-52] fix: filter pushdown when merge filter (#20110) [#20289](https://github.com/apache/datafusion/pull/20289) (haohuaijin)
+- [branch-52] FilterExec should remap indices of parent dynamic filters (#20286) [#20347](https://github.com/apache/datafusion/pull/20347) (alamb)
+- [branch-52] fix: validate inter-file ordering in eq_properties() (#20329) [#20509](https://github.com/apache/datafusion/pull/20509) (alamb)
+- Fix name tracker (#19856) [#20539](https://github.com/apache/datafusion/pull/20539) (hareshkh)
+- [branch-52] fix: HashJoin panic with dictionary-encoded columns in multi-key joins (#20441) [#20512](https://github.com/apache/datafusion/pull/20512) (alamb)
+- [branch-52] Fix incorrect `SortExec` removal before `AggregateExec` (#20247) [#20507](https://github.com/apache/datafusion/pull/20507) (alamb)
+- [branch-52] Update aws-smithy, bytes and time for security audits [#20546](https://github.com/apache/datafusion/pull/20546) (alamb)
+- [branch-52] Clamp early aggregation emit to the sort boundary when using partial group ordering (#20446) [#20558](https://github.com/apache/datafusion/pull/20558) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     3	Andrew Lamb
+     1	Haresh Khanna
+     1	Huaijin
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/changelog/52.3.0.md b/dev/changelog/52.3.0.md
new file mode 100644
index 0000000000000..ed505b7fc2d0a
--- /dev/null
+++ b/dev/changelog/52.3.0.md
@@ -0,0 +1,50 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Apache DataFusion 52.3.0 Changelog
+
+This release consists of 7 commits from 4 contributors. See credits at the end of this changelog for more information.
+
+See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) for information on how to upgrade from previous versions.
+
+**Performance related:**
+
+- [branch-52] perf: sort replace free()->try_grow() pattern with try_resize() to reduce memory pool interactions [#20732](https://github.com/apache/datafusion/pull/20732) (mbutrovich)
+
+**Other:**
+
+- [branch-52] Backport fix: SortMergeJoin don't wait for all input before emitting #20482 [#20699](https://github.com/apache/datafusion/pull/20699) (mbutrovich)
+- [branch-52] Fix Arrow Spill Underrun (#20159) [#20684](https://github.com/apache/datafusion/pull/20684) (hareshkh)
+- [branch-52] Fix constant value from stats (#20042) [#20709](https://github.com/apache/datafusion/pull/20709) (alamb)
+- [branch-52] fix: `HashJoin` panic with String dictionary keys (don't flatten keys) (#20505) [#20708](https://github.com/apache/datafusion/pull/20708) (alamb)
+- [branch-52] FFI_TableOptions are using default values only [#20705](https://github.com/apache/datafusion/pull/20705) (timsaucer)
+- [branch-52] Fix repartition from dropping data when spilling (#20672) [#20777](https://github.com/apache/datafusion/pull/20777) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+     3	Andrew Lamb
+     2	Matt Butrovich
+     1	Haresh Khanna
+     1	Tim Saucer
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/depcheck/Cargo.toml b/dev/depcheck/Cargo.toml
index 23cefaec43be4..3e4bf39cced42 100644
--- a/dev/depcheck/Cargo.toml
+++ b/dev/depcheck/Cargo.toml
@@ -18,8 +18,9 @@
 # Circular dependency checker for DataFusion
 [package]
 name = "depcheck"
+edition = "2024"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-cargo = "0.81.0"
+cargo = "0.92.0"
diff --git a/dev/depcheck/rust-toolchain.toml b/dev/depcheck/rust-toolchain.toml
new file mode 100644
index 0000000000000..55d572362d142
--- /dev/null
+++ b/dev/depcheck/rust-toolchain.toml
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file specifies the default version of Rust used
+# to compile this workspace and run CI jobs.
+
+[toolchain]
+channel = "1.89.0"
+components = ["rustfmt", "clippy"]
diff --git a/dev/depcheck/src/main.rs b/dev/depcheck/src/main.rs
index 80feefcd1b1c5..ebd79faa6f465 100644
--- a/dev/depcheck/src/main.rs
+++ b/dev/depcheck/src/main.rs
@@ -48,7 +48,7 @@ fn main() -> CargoResult<()> {
         root_cargo_toml.display()
     );
     let workspace = cargo::core::Workspace::new(&root_cargo_toml, &gctx)?;
-    let (_, resolve) = cargo::ops::resolve_ws(&workspace)?;
+    let (_, resolve) = cargo::ops::resolve_ws(&workspace, false)?;
 
     let mut package_deps = HashMap::new();
     for package_id in resolve
diff --git a/dev/pyproject.toml b/dev/pyproject.toml
new file mode 100644
index 0000000000000..a2f5653d9d879
--- /dev/null
+++ b/dev/pyproject.toml
@@ -0,0 +1,5 @@
+[project]
+name = "datafusion-dev"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = ["tomlkit", "PyGithub", "requests"]
diff --git a/dev/release/README.md b/dev/release/README.md
index 6e4079de8f069..0ca13c175f230 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -23,55 +23,91 @@ DataFusion typically has major releases around once per month, including breakin
 
 Patch releases are made on an adhoc basis, but we try and avoid them given the frequent major releases.
 
-## Branching Policy
+## Release Process Overview
 
-- When we prepare a new release, we create a release branch, such as `branch-37` in the Apache repository (not in a fork)
-- We update the crate version and generate the changelog in this branch and create a PR against the main branch
-- Once the PR is approved and merged, we tag the rc in the release branch, and release from the release branch
-- Bug fixes can be merged to the release branch and patch releases can be created from the release branch
+New development happens on the `main` branch.
+Releases are made from branches, e.g. `branch-50` for the `50.x.y` release series.
 
-#### How to backport (add changes) to `branch-*` branch
+To prepare for a new release series, we:
 
-If you would like to propose your change for inclusion in a release branch for a
-patch release:
+- Create a new branch from `main`, such as `branch-50` in the Apache repository (not in a fork)
+- Continue merging new features changes to `main` branch
+- Prepare the release branch for release:
+  - Update version numbers in `Cargo.toml` files and create `CHANGELOG.md`
+  - Add additional changes to the release branch as needed
+- When the code is ready, create GitHub tags release candidate (rc) artifacts from the release branch.
+- After the release is approved, publish to [crates.io], the ASF distribution servers, and GitHub tags.
+
+To add changes to the release branch, depending on the change we either:
+
+- Fix the issue on `main` and then backport the change to the release branch (e.g. [#18129])
+- Fix the issue on the release branch and then forward-port the change back to `main` (e.g.[#18057])
+
+[crates.io]: https://crates.io/crates/datafusion
+[#18129]: https://github.com/apache/datafusion/pull/18129
+[#18057]: https://github.com/apache/datafusion/pull/18057
+
+## Backporting (add changes) to `branch-*` branch
+
+If you would like to propose your change for inclusion in a patch release, the
+change must be applied to the relevant release branch. To do so please follow
+these steps:
 
 1. Find (or create) the issue for the incremental release ([example release issue]) and discuss the proposed change there with the maintainers.
-1. Follow normal workflow to create PR to `main` branch and wait for its approval and merge.
-1. After PR is squash merged to `main`, branch from most recent release branch (e.g. `branch-37`), cherry-pick the commit and create a PR targeting the release branch [example backport PR].
+2. Follow normal workflow to create PR to `main` branch and wait for its approval and merge.
+3. After PR is squash merged to `main`, branch from most recent release branch (e.g. `branch-50`), cherry-pick the commit and create a PR targeting the release branch [example backport PR].
 
-For example, to backport commit `12345` from `main` to `branch-43`:
+For example, to backport commit `12345` from `main` to `branch-50`:
 
 ```shell
-git checkout branch-43
-git checkout -b backport_to_43
-git cherry-pick 12345
+git checkout branch-50
+git checkout -b backport_to_50
+git cherry-pick 12345 # your git commit hash
 git push -u <your fork>
-# make a PR as normal
+# make a PR as normal targeting branch-50, prefixed with [branch-50]
+```
+
+It is also acceptable to fix the issue directly on the release branch first
+and then cherry-pick the change back to `main` branch in a new PR.
+
+[example release issue]: https://github.com/apache/datafusion/issues/18072
+[example backport pr]: https://github.com/apache/datafusion/pull/18131
+
+## Release Prerequisites
+
+### Add git remote for `apache` repo
+
+The instructions below assume the upstream git repo `git@github.com:apache/datafusion.git` in remote `apache`.
+
+```shell
+git remote add apache git@github.com:apache/datafusion.git
 ```
 
-[example release issue]: https://github.com/apache/datafusion/issues/9904
-[example backport pr]: https://github.com/apache/datafusion/pull/10123
+### Create GitHub Personal Access Token (PAT)
 
-## Release Prerequisite
+A personal access token (PAT) is needed for changelog automation script. If you
+do not already have one, create a token with the `repo` access by navigating to
+[GitHub Developer Settings] page, and [follow these steps].
 
-- Have upstream git repo `git@github.com:apache/datafusion.git` add as git remote `apache`.
-- Created a personal access token in GitHub for changelog automation script.
-  - Github PAT should be created with `repo` access
-- Make sure your signing key is added to the following files in SVN:
-  - https://dist.apache.org/repos/dist/dev/datafusion/KEYS
-  - https://dist.apache.org/repos/dist/release/datafusion/KEYS
+[github developer settings]: https://github.com/settings/developers
+[follow these steps]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
 
-### How to add signing key
+### Add GPG Public Key to SVN `KEYS` file
+
+If you will be releasing the final tarball, your GPG public key must be present in the following SVN files:
+
+- https://dist.apache.org/repos/dist/dev/datafusion/KEYS
+- https://dist.apache.org/repos/dist/release/datafusion/KEYS
 
 See instructions at https://infra.apache.org/release-signing.html#generate for generating keys.
 
-Committers can add signing keys in Subversion client with their ASF account. e.g.:
+Committers can add signing keys using the Subversion client and their ASF account:
 
 ```shell
 $ svn co https://dist.apache.org/repos/dist/dev/datafusion
 $ cd datafusion
-$ editor KEYS
-$ svn ci KEYS
+$ editor KEYS # add your key here
+$ svn ci KEYS # commit changes
 ```
 
 Follow the instructions in the header of the KEYS file to append your key. Here is an example:
@@ -81,166 +117,228 @@ Follow the instructions in the header of the KEYS file to append your key. Here
 svn commit KEYS -m "Add key for John Doe"
 ```
 
-## Process Overview
+## Release Process: Step by Step
 
 As part of the Apache governance model, official releases consist of signed
 source tarballs approved by the PMC.
+We then publish the code in the approved artifacts to crates.io.
 
-We then use the code in the approved artifacts to release to crates.io and
-PyPI.
+### 1. Create Release Branch
 
-### Change Log
+First create a new release branch from `main` in the apache repository.
 
-We maintain a `CHANGELOG.md` so our users know what has been changed between releases.
+For example, to create the `branch-50` branch for the `50.x.y` release series:
 
-You will need a GitHub Personal Access Token for the following steps. Follow
-[these instructions](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
-to generate one if you do not already have one.
+```shell
+git fetch apache             # make sure we are up to date
+git checkout apache/main     # checkout current latest development branch
+git checkout -b branch-50    # create local branch
+git push -u apache branch-50 # push branch to apache remote
+```
 
-The changelog is generated using a Python script. There is a dependency on `PyGitHub`, which can be installed using pip:
+### 2. Add a protection to release candidate branch
+
+To protect a release candidate branch from accidental merges, run:
 
 ```shell
-pip3 install PyGitHub
+./dev/release/add-branch-protection.sh 50
 ```
 
-To generate the changelog, set the `GITHUB_TOKEN` environment variable to a valid token and then run the script
-providing two commit ids or tags followed by the version number of the release being created. The following
-example generates a change log of all changes between the first commit and the current HEAD revision.
+The script will modify `.asf.yaml` and add following block:
 
-```shell
-export GITHUB_TOKEN=<your-token-here>
-./dev/release/generate-changelog.py 24.0.0 HEAD 25.0.0 > dev/changelog/25.0.0.md
+```yaml
+branch-50:
+  required_pull_request_reviews:
+    required_approving_review_count: 1
 ```
 
-This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
-titles starting with `feat:`, `fix:`, or `docs:`.
+- Create a PR.
+- Merge to `main`.
 
-Once the change log is generated, run `prettier` to format the document:
+### 3. Prepare PR to Update Changelog and the Release Version
+
+First, prepare a PR to update the changelog and versions to reflect the planned
+release. See [#18173](https://github.com/apache/datafusion/pull/18173) for an example.
+
+#### Update Version Numbers
+
+Manually update the DataFusion version in the root `Cargo.toml` to reflect the new release version.
+
+Ensure Cargo.lock is updated accordingly by running:
 
 ```shell
-prettier -w dev/changelog/25.0.0md
+cargo check -p datafusion
 ```
 
-## Prepare release commits and PR
+#### Changelog Generation
+
+We maintain a [changelog] so our users know what has been changed between releases.
 
-Prepare a PR to update `CHANGELOG.md` and versions to reflect the planned
-release.
+[changelog]: ../changelog
 
-See [#9697](https://github.com/apache/datafusion/pull/9697) for an example.
+The changelog is generated using a Python script.
 
-Here are the commands that could be used to prepare the `38.0.0` release:
+To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the dev dependencies via `uv`:
 
-### Update Version
+```shell
+uv sync
+```
 
-Checkout the main commit to be released
+To generate the changelog, set the `GITHUB_TOKEN` environment variable and then run `./dev/release/generate-changelog.py`
+providing two commit ids or tags followed by the version number of the release being created. For example,
+to generate a change log of all changes between the `50.3.0` tag and `branch-51`, in preparation for release `51.0.0`:
+
+> [!NOTE]
+>
+> If you see errors such as the following, it is likely due to not setting
+> the `GITHUB_TOKEN` environment variable.
+>
+> ```
+> Request GET ... failed with 403: rate limit exceeded
+> ```
 
 ```shell
-git fetch apache
-git checkout apache/main
+export GITHUB_TOKEN=<your-token-here>
+uv run ./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
 ```
 
-Manually update the datafusion version in the root `Cargo.toml` to `38.0.0`.
+This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
+titles starting with `feat:`, `fix:`, or `docs:`.
 
-Run `cargo test` to re-generate some example files:
+Once the change log is generated, run `prettier` to format the document:
 
 ```shell
-cargo test
+prettier -w dev/changelog/51.0.0.md
 ```
 
-Lastly commit the version change:
+#### Commit and PR
+
+Then commit the changes and create a PR targeting the release branch.
 
 ```shell
 git commit -a -m 'Update version'
 ```
 
-## Prepare release candidate artifacts
+Remember to merge any fixes back to `main` branch as well.
+
+### 4. Prepare Release Candidate Artifacts
 
 After the PR gets merged, you are ready to create release artifacts based off the
 merged commit.
 
 (Note you need to be a committer to run these scripts as they upload to the apache svn distribution servers)
 
-### Pick a Release Candidate (RC) number
+#### Pick a Release Candidate (RC) number
 
-Pick numbers in sequential order, with `0` for `rc0`, `1` for `rc1`, etc.
+Pick numbers in sequential order, with `1` for `rc1`, `2` for `rc2`, etc.
 
-### Create git tag for the release:
+#### Create git Tag for the Release:
 
 While the official release artifacts are signed tarballs and zip files, we also
-tag the commit it was created for convenience and code archaeology.
+tag the commit it was created for convenience and code archaeology. Release tags
+have the format `<version>` (e.g. `38.0.0`), and release candidates have the
+format `<version>-rc<rc>` (e.g. `38.0.0-rc0`). See [the list of existing
+tags].
+
+[the list of existing tags]: https://github.com/apache/datafusion/tags
 
-Using a string such as `38.0.0` as the `<version>`, create and push the tag by running these commands:
+Using a string such as `38.0.0` as the `<version>`, create and push the rc tag by running these commands:
 
 ```shell
 git fetch apache
-git tag <version>-<rc> apache/main
-# push tag to Github remote
-git push apache <version>
+git tag <version>-<rc> apache/branch-X # create tag from the release branch
+git push apache <version>-<rc>         # push tag to Github remote
 ```
 
-### Create, sign, and upload artifacts
+For example, to create the `50.3.0-rc1 tag from `branch-50`:
 
-Run `create-tarball.sh` with the `<version>` tag and `<rc>` and you found in previous steps:
+```shell
+git fetch apache
+git tag 50.3.0-rc1 apache/branch-50
+git push apache 50.3.0-rc1
+```
+
+#### Create, Sign, and Upload Artifacts
+
+Run the `create-tarball.sh` script with the `<version>` tag and `<rc>` and you determined in previous steps:
+
+For example, to create the `50.3.0-rc1` artifacts:
 
 ```shell
-GH_TOKEN=<TOKEN> ./dev/release/create-tarball.sh 38.0.0 0
+GH_TOKEN=<TOKEN> ./dev/release/create-tarball.sh 50.3.0 1
 ```
 
 The `create-tarball.sh` script
 
-1. creates and uploads all release candidate artifacts to the [datafusion
+1. Creates and uploads all release candidate artifacts to the [datafusion
    dev](https://dist.apache.org/repos/dist/dev/datafusion) location on the
-   apache distribution svn server
+   apache distribution SVN server
 
-2. provide you an email template to
+2. Provides you an email template to
    send to dev@datafusion.apache.org for release voting.
 
-### Vote on Release Candidate artifacts
+### 5. Vote on Release Candidate Artifacts
 
 Send the email output from the script to dev@datafusion.apache.org.
 
-For the release to become "official" it needs at least three PMC members to vote +1 on it.
+In order to publish the release on crates.io, it must be "official". To become
+official it needs at least three PMC members to vote +1 on it.
 
-### Verifying Release Candidates
+#### Verifying Release Candidates
 
 The `dev/release/verify-release-candidate.sh` is a script in this repository that can assist in the verification process. Run it like:
 
 ```shell
-./dev/release/verify-release-candidate.sh 38.0.0 0
+./dev/release/verify-release-candidate.sh 50.3.0 1
 ```
 
-#### If the release is not approved
+#### If the Release is not Approved
 
 If the release is not approved, fix whatever the problem is, merge changelog
-changes into main if there is any and try again with the next RC number.
+changes into the release branch and try again with the next RC number.
 
-## Finalize the release
+Remember to merge any fixes back to `main` branch as well.
+
+#### If the Release is Approved: Call the Vote
+
+Call the vote on the Arrow dev list by replying to the RC voting thread. The
+reply should have a new subject constructed by adding `[RESULT]` prefix to the
+old subject line.
+
+Sample announcement template:
+
+```
+The vote has passed with <NUMBER> +1 votes. Thank you to all who helped
+with the release verification.
+```
+
+### 6. Finalize the Release
 
 NOTE: steps in this section can only be done by PMC members.
 
-### After the release is approved
+#### After the release is approved
 
 Move artifacts to the release location in SVN, e.g.
-https://dist.apache.org/repos/dist/release/datafusion/datafusion-38.0.0/, using
+https://dist.apache.org/repos/dist/release/datafusion/datafusion-50.3.0/, using
 the `release-tarball.sh` script:
 
 ```shell
-./dev/release/release-tarball.sh 38.0.0 0
+./dev/release/release-tarball.sh 50.3.0 1
 ```
 
 Congratulations! The release is now official!
 
-### Create release git tags
+### 7. Create Release git tags
 
 Tag the same release candidate commit with the final release tag
 
 ```shell
-git co apache/38.0.0-rc0
-git tag 38.0.0
-git push apache 38.0.0
+git co apache/50.3.0-rc1
+git tag 50.3.0
+git push apache 50.3.0
 ```
 
-### Publish on Crates.io
+### 8. Publish on Crates.io
 
 Only approved releases of the tarball should be published to
 crates.io, in order to conform to Apache Software Foundation
@@ -252,7 +350,7 @@ been made to crates.io using the following instructions.
 Follow [these
 instructions](https://doc.rust-lang.org/cargo/reference/publishing.html) to
 create an account and login to crates.io before asking to be added as an owner
-to all of the DataFusion crates.
+to all DataFusion crates.
 
 Download and unpack the official release tarball
 
@@ -271,6 +369,7 @@ Verify that the Cargo.toml in the tarball contains the correct version
 (cd datafusion/execution && cargo publish)
 (cd datafusion/functions && cargo publish)
 (cd datafusion/physical-expr && cargo publish)
+(cd datafusion/physical-expr-adapter && cargo publish)
 (cd datafusion/functions-aggregate && cargo publish)
 (cd datafusion/functions-window && cargo publish)
 (cd datafusion/functions-nested && cargo publish)
@@ -278,85 +377,45 @@ Verify that the Cargo.toml in the tarball contains the correct version
 (cd datafusion/optimizer && cargo publish)
 (cd datafusion/common-runtime && cargo publish)
 (cd datafusion/physical-plan && cargo publish)
+(cd datafusion/pruning && cargo publish)
 (cd datafusion/physical-optimizer && cargo publish)
-(cd datafusion/catalog && cargo publish)
+(cd datafusion/session && cargo publish)
 (cd datafusion/datasource && cargo publish)
+(cd datafusion/catalog && cargo publish)
 (cd datafusion/catalog-listing && cargo publish)
 (cd datafusion/functions-table && cargo publish)
+(cd datafusion/datasource-arrow && cargo publish)
+(cd datafusion/datasource-csv && cargo publish)
+(cd datafusion/datasource-json && cargo publish)
+(cd datafusion/datasource-parquet && cargo publish)
 (cd datafusion/core && cargo publish)
 (cd datafusion/proto-common && cargo publish)
 (cd datafusion/proto && cargo publish)
+(cd datafusion/datasource-avro && cargo publish)
 (cd datafusion/substrait && cargo publish)
 (cd datafusion/ffi && cargo publish)
 (cd datafusion-cli && cargo publish)
+(cd datafusion/spark && cargo publish)
 (cd datafusion/sqllogictest && cargo publish)
 ```
 
 ### Publish datafusion-cli on Homebrew
 
-Run `publish_homebrew.sh` to publish `datafusion-cli` on Homebrew. In order to do so it is necessary to
-fork the `homebrew-core` repo https://github.com/Homebrew/homebrew-core/, have Homebrew installed on your
-macOS/Linux/WSL2 and properly configured and have a Github Personal Access Token that has permission to file pull requests in the `homebrew-core` repo.
-
-#### Fork the `homebrew-core` repo
-
-Go to https://github.com/Homebrew/homebrew-core/ and fork the repo.
-
-#### Install and configure Homebrew
-
-Please visit https://brew.sh/ to obtain Homebrew. In addition to that please check out https://docs.brew.sh/Homebrew-on-Linux if you are on Linux or WSL2.
-
-Before running the script make sure that you can run the following command in your bash to make sure
-that `brew` has been installed and configured properly:
-
-```shell
-brew --version
-```
-
-#### Create a Github Personal Access Token
-
-To create a Github Personal Access Token, please visit https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token for instructions.
-
-- Make sure to select either **All repositories** or **Only selected repositories** so that you have access to **Repository permissions**.
-- If you only use the token for selected repos make sure you include your
-  fork of `homebrew-core` in the list of repos under **Selected repositories**.
-- Make sure to have **Read and write** access enabled for pull requests in your **Repository permissions**.
-
-After all of the above is complete execute the following command:
-
-```shell
-dev/release/publish_homebrew.sh <version> <github-user> <github-token> <homebrew-default-branch-name>
-```
-
-Note that sometimes someone else has already submitted a PR to update the datafusion formula in homebrew.
-In this case you will get an error with a message that your PR is a duplicate of an existing one. In this
-case no further action is required.
+Note: [`datafusion` formula](https://formulae.brew.sh/formula/datafusion) is [updated automatically](https://github.com/Homebrew/homebrew-core/pulls?q=is%3Apr+datafusion+is%3Aclosed),
+so no action is needed.
 
-Alternatively manually submit a simple PR to update tag and commit hash for the datafusion
-formula in homebrew-core. Here is an example PR:
-https://github.com/Homebrew/homebrew-core/pull/89562.
+### 9: Add the release to Apache Reporter
 
-### Call the vote
-
-Call the vote on the Arrow dev list by replying to the RC voting thread. The
-reply should have a new subject constructed by adding `[RESULT]` prefix to the
-old subject line.
-
-Sample announcement template:
-
-```
-The vote has passed with <NUMBER> +1 votes. Thank you to all who helped
-with the release verification.
-```
-
-### Add the release to Apache Reporter
-
-Add the release to https://reporter.apache.org/addrelease.html?datafusion using the version number e.g. 38.0.0.
+When you have published the release, please help the project by adding the release to
+[Apache Reporter](https://reporter.apache.org/). The reporter system should
+send you a reminder email, but in case you miss it, you can add
+the release to https://reporter.apache.org/addrelease.html?datafusion following
+the examples from previous releases.
 
 The release information is used to generate a template for a board report (see example from Apache Arrow project
 [here](https://github.com/apache/arrow/pull/14357)).
 
-### Delete old RCs and Releases
+### 10: Delete old RCs and Releases
 
 See the ASF documentation on [when to archive](https://www.apache.org/legal/release-policy.html#when-to-archive)
 for more information.
@@ -374,7 +433,7 @@ svn ls https://dist.apache.org/repos/dist/dev/datafusion
 Delete a release candidate:
 
 ```shell
-svn delete -m "delete old DataFusion RC" https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-38.0.0-rc1/
+svn delete -m "delete old DataFusion RC" https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-50.0.0-rc1/
 ```
 
 #### Deleting old releases from `release` svn
@@ -390,31 +449,5 @@ svn ls https://dist.apache.org/repos/dist/release/datafusion
 Delete a release:
 
 ```shell
-svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist/release/datafusion/datafusion-37.0.0
+svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist/release/datafusion/datafusion-50.0.0
 ```
-
-### Optional: Write a blog post announcing the release
-
-We typically crowd source release announcements by collaborating on a Google document, usually starting
-with a copy of the previous release announcement.
-
-Run the following commands to get the number of commits and number of unique contributors for inclusion in the blog post.
-
-```shell
-git log --pretty=oneline 37.0.0..38.0.0 datafusion datafusion-cli datafusion-examples | wc -l
-git shortlog -sn 37.0.0..38.0.0 datafusion datafusion-cli datafusion-examples | wc -l
-```
-
-Once there is consensus on the contents of the post, create a PR to add a blog post to the
-[arrow-site](https://github.com/apache/arrow-site) repository. Note that there is no need for a formal
-PMC vote on the blog post contents since this isn't considered to be a "release".
-
-Here is an example blog post PR:
-
-- https://github.com/apache/arrow-site/pull/217
-
-Once the PR is merged, a GitHub action will publish the new blog post to https://arrow.apache.org/blog/.
-
-### Update the version on the download page
-
-Update the version on the [download page](https://datafusion.apache.org/download) to point to the latest release [here](../../docs/source/download.md).
diff --git a/dev/release/add-branch-protection.sh b/dev/release/add-branch-protection.sh
new file mode 100755
index 0000000000000..735bae7f90fd9
--- /dev/null
+++ b/dev/release/add-branch-protection.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -eu
+
+# Script to add branch protection for a new release branch in .asf.yaml
+#
+# This script automates the process of adding branch protection rules to .asf.yaml
+# for new release branches. It ensures the branch protection block doesn't already
+# exist before adding it.
+#
+# Usage:
+#   ./dev/release/add-branch-protection.sh <release_number>
+#
+# Examples:
+#   ./dev/release/add-branch-protection.sh 52
+#   ./dev/release/add-branch-protection.sh 53
+#
+# The script will:
+#   1. Validate the release number is a positive integer
+#   2. Check if branch protection already exists for branch-<release_number>
+#   3. Add the branch protection block to .asf.yaml if it doesn't exist
+#   4. Error out if the block already exists
+
+# Check if release number is provided
+if [ $# -eq 0 ]; then
+    echo "Error: Release number is required"
+    echo "Usage: $0 <release_number>"
+    echo "Example: $0 52"
+    exit 1
+fi
+
+RELEASE_NUM=$1
+BRANCH_NAME="branch-${RELEASE_NUM}"
+ASF_YAML_FILE=".asf.yaml"
+
+# Validate release number is a positive integer
+if ! [[ "$RELEASE_NUM" =~ ^[0-9]+$ ]]; then
+    echo "Error: Release number must be a positive integer"
+    echo "Provided: $RELEASE_NUM"
+    echo "Example: ./dev/release/add-branch-protection.sh 52"
+    exit 1
+fi
+
+# Check if .asf.yaml exists
+if [ ! -f "$ASF_YAML_FILE" ]; then
+    echo "Error: $ASF_YAML_FILE not found in current directory"
+    echo "Please run this script from the repository root"
+    exit 1
+fi
+
+# Check if the branch exists in the official Apache DataFusion repository
+GITHUB_API_URL="https://api.github.com/repos/apache/datafusion/branches/${BRANCH_NAME}"
+HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$GITHUB_API_URL")
+
+if [ "$HTTP_STATUS" != "200" ]; then
+    echo "Error: Branch ${BRANCH_NAME} does not exist in the official Apache DataFusion repository"
+    echo "Please create the branch '${BRANCH_NAME}' first before adding branch protection"
+    echo ""
+    echo "To check existing branches, visit:"
+    echo "  https://github.com/apache/datafusion/branches"
+    exit 1
+fi
+
+# Check if branch protection already exists for this release
+if grep -q "^[[:space:]]*${BRANCH_NAME}:" "$ASF_YAML_FILE"; then
+    echo "Error: Branch protection for ${BRANCH_NAME} already exists in $ASF_YAML_FILE"
+    exit 1
+fi
+
+# Create a temporary file
+TEMP_FILE=$(mktemp)
+
+# Read the file and insert the new branch protection block
+# We'll insert it after the last branch-XX block
+awk -v branch="$BRANCH_NAME" '
+/^[[:space:]]*branch-[0-9]+:/ {
+    last_branch_line = NR
+    last_branch_content = $0
+}
+{
+    lines[NR] = $0
+}
+END {
+    if (last_branch_line == 0) {
+        print "Error: No existing branch protection blocks found" > "/dev/stderr"
+        exit 1
+    }
+    
+    # Print all lines up to and including the last branch block
+    for (i = 1; i <= last_branch_line; i++) {
+        print lines[i]
+    }
+    
+    # Print the required_pull_request_reviews lines after the last branch
+    for (i = last_branch_line + 1; i <= NR; i++) {
+        print lines[i]
+        # After printing the required_approving_review_count line, insert new branch
+        if (lines[i] ~ /required_approving_review_count:/) {
+            # Check if this belongs to the last branch block by looking ahead
+            next_non_empty = i + 1
+            while (next_non_empty <= NR && lines[next_non_empty] ~ /^[[:space:]]*$/) {
+                next_non_empty++
+            }
+            # If next non-empty line is not indented more than branch level, we found the end
+            if (next_non_empty > NR || lines[next_non_empty] !~ /^[[:space:]]{6,}/) {
+                print "    " branch ":"
+                print "      required_pull_request_reviews:"
+                print "        required_approving_review_count: 1"
+                # Skip to next iteration to avoid double printing
+                for (j = i + 1; j <= NR; j++) {
+                    i = j
+                    if (j <= NR) print lines[j]
+                }
+                break
+            }
+        }
+    }
+}
+' "$ASF_YAML_FILE" > "$TEMP_FILE"
+
+# Check if awk succeeded
+if [ $? -ne 0 ]; then
+    rm -f "$TEMP_FILE"
+    exit 1
+fi
+
+# Verify the new content was added
+if ! grep -q "^[[:space:]]*${BRANCH_NAME}:" "$TEMP_FILE"; then
+    echo "Error: Failed to add branch protection block"
+    rm -f "$TEMP_FILE"
+    exit 1
+fi
+
+# Replace the original file with the modified version
+mv "$TEMP_FILE" "$ASF_YAML_FILE"
+
+echo "Successfully added branch protection for ${BRANCH_NAME} to $ASF_YAML_FILE"
+echo ""
+echo "Added block:"
+echo "    ${BRANCH_NAME}:"
+echo "      required_pull_request_reviews:"
+echo "        required_approving_review_count: 1"
+echo ""
+echo "Please review the changes and commit them."
\ No newline at end of file
diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh
index 7d2b7d56bd25e..6671d2a930d7f 100755
--- a/dev/release/create-tarball.sh
+++ b/dev/release/create-tarball.sh
@@ -102,7 +102,7 @@ Here is my vote:
 
 [1]: https://github.com/apache/datafusion/tree/${release_hash}
 [2]: ${url}
-[3]: https://github.com/apache/datafusion/blob/${release_hash}/CHANGELOG.md
+[3]: https://github.com/apache/datafusion/blob/${release_hash}/dev/changelog/${version}.md
 MAIL
 echo "---------------------------------------------------------"
 
diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py
index 1349416bcaa59..830d329f73c4f 100755
--- a/dev/release/generate-changelog.py
+++ b/dev/release/generate-changelog.py
@@ -124,6 +124,9 @@ def generate_changelog(repo, repo_name, tag1, tag2, version):
     print(f"This release consists of {commit_count} commits from {contributor_count} contributors. "
           f"See credits at the end of this changelog for more information.\n")
 
+    print("See the [upgrade guide](https://datafusion.apache.org/library-user-guide/upgrading.html) "
+           "for information on how to upgrade from previous versions.\n")
+
     print_pulls(repo_name, "Breaking changes", breaking)
     print_pulls(repo_name, "Performance related", performance)
     print_pulls(repo_name, "Implemented enhancements", enhancements)
diff --git a/dev/release/publish_homebrew.sh b/dev/release/publish_homebrew.sh
deleted file mode 100644
index 20955953e85a7..0000000000000
--- a/dev/release/publish_homebrew.sh
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -ue
-
-if [ "$#" -ne 4 ]; then
-  echo "Usage: $0 <version> <github-user> <github-token> <homebrew-default-branch-name>"
-  exit 1
-fi
-
-version=$1
-github_user=$2
-github_token=$3
-# Prepare for possible renaming of the default branch on Homebrew
-homebrew_default_branch_name=$4
-
-# Git parallel fetch
-if sysctl -n hw.ncpu 2>/dev/null; then # macOS
-  num_processing_units=$(sysctl -n hw.ncpu)
-elif [ -x "$(command -v nproc)" ]; then # Linux
-  num_processing_units=$(nproc)
-else # Fallback
-  num_processing_units=1
-fi
-
-url="https://www.apache.org/dyn/closer.lua?path=datafusion/datafusion-${version}/apache-datafusion-${version}.tar.gz"
-sha256="$(curl https://dist.apache.org/repos/dist/release/datafusion/datafusion-${version}/apache-datafusion-${version}.tar.gz.sha256 | cut -d' ' -f1)"
-
-pushd "$(brew --repository homebrew/core)"
-
-if ! git remote | grep -q --fixed-strings ${github_user}; then
-  echo "Setting ''${github_user}' remote"
-  git remote add ${github_user} git@github.com:${github_user}/homebrew-core.git
-fi
-
-echo "Updating working copy"
-git fetch --all --prune --tags --force -j$num_processing_units
-
-branch=apache-datafusion-${version}
-echo "Creating branch: ${branch}"
-git branch -D ${branch} || :
-git checkout -b ${branch} origin/master
-
-echo "Updating datafusion formulae"
-brew bump-formula-pr \
-     --commit \
-     --no-audit \
-     --sha256="${sha256}" \
-     --url="${url}" \
-     --verbose \
-     --write-only \
-     datafusion
-
-echo "Testing datafusion formulae"
-brew uninstall datafusion || :
-brew install --build-from-source datafusion
-brew test datafusion
-brew audit --strict datafusion
-
-git push -u $github_user ${branch}
-
-git checkout -
-
-popd
-
-echo "Create the pull request"
-title="datafusion ${version}"
-body="Created using \`bump-formula-pr\`"
-data="{\"title\":\"$title\", \"body\":\"$body\", \"head\":\"$github_username:$branch\", \"base\":\"$homebrew_default_branch_name\"}"
-curl -X POST \
-    -H "Accept: application/vnd.github+json" \
-    -H "Authorization: Bearer $github_token" \
-    https://api.github.com/repos/Homebrew/homebrew-core/pulls \
-    -d "$data"
-
-echo "Complete!"
diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh
index bd858d23a767c..a284b6c4351f3 100755
--- a/dev/release/release-tarball.sh
+++ b/dev/release/release-tarball.sh
@@ -43,6 +43,13 @@ fi
 version=$1
 rc=$2
 
+read -r -p "Proceed to release tarball for ${version}-rc${rc}? [y/N]: " answer
+answer=${answer:-no}
+if [ "${answer}" != "y" ]; then
+  echo "Cancelled tarball release!"
+  exit 1
+fi
+
 tmp_dir=tmp-apache-datafusion-dist
 
 echo "Recreate temporary directory: ${tmp_dir}"
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index 9ecbe1bc1713c..9ddd1d3ba8553 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -22,7 +22,7 @@
 check_dependencies() {
   local missing_deps=0
   local required_deps=("curl" "git" "gpg" "cc" "protoc")
-  
+
   # Either shasum or sha256sum/sha512sum are required
   local has_sha_tools=0
 
@@ -32,7 +32,7 @@ check_dependencies() {
       missing_deps=1
     fi
   done
-  
+
   # Check for either shasum or sha256sum/sha512sum
   if command -v shasum &> /dev/null; then
     has_sha_tools=1
@@ -42,7 +42,7 @@ check_dependencies() {
     echo "Error: Neither shasum nor sha256sum/sha512sum are installed or in PATH"
     missing_deps=1
   fi
-  
+
   if [ $missing_deps -ne 0 ]; then
     echo "Please install missing dependencies and try again"
     exit 1
@@ -163,7 +163,7 @@ test_source_distribution() {
   git clone https://github.com/apache/parquet-testing.git parquet-testing
 
   cargo build
-  cargo test --all --features=avro
+  cargo test --profile=ci --all --features=avro
 
   if ( find -iname 'Cargo.toml' | xargs grep SNAPSHOT ); then
     echo "Cargo.toml version should not contain SNAPSHOT for releases"
diff --git a/dev/requirements.txt b/dev/requirements.txt
deleted file mode 100644
index 7fcba04931290..0000000000000
--- a/dev/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-tomlkit
-PyGitHub
\ No newline at end of file
diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh
index af0fce72ccfa5..43d29bd88166d 100755
--- a/dev/rust_lint.sh
+++ b/dev/rust_lint.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
@@ -19,14 +19,107 @@
 
 # This script runs all the Rust lints locally the same way the
 # DataFusion CI does
+#
+# Note: The installed checking tools (e.g., taplo) are not guaranteed to match
+# the CI versions for simplicity, there might be some minor differences. Check
+# `.github/workflows` for the CI versions.
+#
+#
+#
+# For each lint scripts:
+#
+# By default, they run in check mode:
+#     ./ci/scripts/rust_fmt.sh
+#
+# With `--write`, scripts perform best-effort auto fixes:
+#     ./ci/scripts/rust_fmt.sh --write
+#
+# The `--write` flag assumes a clean git repository (no uncommitted changes); to force
+# auto fixes even if there are unstaged changes, use `--allow-dirty`:
+#     ./ci/scripts/rust_fmt.sh --write --allow-dirty
+#
+# New scripts can use `rust_fmt.sh` as a reference.
+
+set -euo pipefail
+
+usage() {
+  cat >&2 <<EOF
+Usage: $0 [--write] [--allow-dirty]
+
+Runs the local Rust lint suite similar to CI.
+--write        Run formatters, clippy and other non-functional checks in best-effort write/fix mode (requires a clean git worktree, no uncommitted changes; some checks are test-only and ignore this flag).
+--allow-dirty  Allow \`--write\` to run even when the git worktree has uncommitted changes.
+EOF
+  exit 1
+}
+
+ensure_tool() {
+  local cmd="$1"
+  local install_cmd="$2"
+  if ! command -v "$cmd" &> /dev/null; then
+    echo "Installing $cmd using: $install_cmd"
+    eval "$install_cmd"
+  fi
+}
+
+MODE="check"
+ALLOW_DIRTY=0
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --write)
+      MODE="write"
+      ;;
+    --allow-dirty)
+      ALLOW_DIRTY=1
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      usage
+      ;;
+  esac
+  shift
+done
+
+SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+
+ensure_tool "taplo" "cargo install taplo-cli --locked"
+ensure_tool "hawkeye" "cargo install hawkeye --locked"
+ensure_tool "typos" "cargo install typos-cli --locked"
+
+run_step() {
+  local name="$1"
+  shift
+  echo "[${SCRIPT_NAME}] Running ${name}"
+  "$@"
+}
+
+declare -a WRITE_STEPS=(
+  "ci/scripts/rust_fmt.sh|true"
+  "ci/scripts/rust_clippy.sh|true"
+  "ci/scripts/rust_toml_fmt.sh|true"
+  "ci/scripts/license_header.sh|true"
+  "ci/scripts/typos_check.sh|true"
+  "ci/scripts/doc_prettier_check.sh|true"
+)
 
-set -e
-if ! command -v taplo &> /dev/null; then
-    echo "Installing taplo using cargo"
-    cargo install taplo-cli
-fi
+declare -a READONLY_STEPS=(
+  "ci/scripts/rust_docs.sh|false"
+)
 
-ci/scripts/rust_fmt.sh
-ci/scripts/rust_clippy.sh
-ci/scripts/rust_toml_fmt.sh
-ci/scripts/rust_docs.sh
+for entry in "${WRITE_STEPS[@]}" "${READONLY_STEPS[@]}"; do
+  IFS='|' read -r script_path supports_write <<<"$entry"
+  script_name="$(basename "$script_path")"
+  args=()
+  if [[ "$supports_write" == "true" && "$MODE" == "write" ]]; then
+    args+=(--write)
+    [[ $ALLOW_DIRTY -eq 1 ]] && args+=(--allow-dirty)
+  fi
+  if [[ ${#args[@]} -gt 0 ]]; then
+    run_step "$script_name" "$script_path" "${args[@]}"
+  else
+    run_step "$script_name" "$script_path"
+  fi
+done
diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py
index 6bd5d47ff0597..bdfdfe22eaeb6 100755
--- a/dev/update_arrow_deps.py
+++ b/dev/update_arrow_deps.py
@@ -19,7 +19,7 @@
 # Script that updates the arrow dependencies in datafusion locally
 #
 # installation:
-# pip install tomlkit requests
+# uv sync
 #
 # pin all arrow crates deps to a specific version:
 #
diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index 585cb77839f98..f39bdda3aee87 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
@@ -20,11 +20,15 @@
 
 set -e
 
-SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "${SOURCE_DIR}/../" && pwd
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+cd "${ROOT_DIR}"
+
+# Load centralized tool versions
+source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh"
 
 TARGET_FILE="docs/source/user-guide/configs.md"
 PRINT_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs"
+PRINT_RUNTIME_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_runtime_config_docs"
 
 echo "Inserting header"
 cat <<'EOF' > "$TARGET_FILE"
@@ -48,29 +52,195 @@ cat <<'EOF' > "$TARGET_FILE"
 -->
 
 <!---
-This file was generated by the dev/update_config_docs.sh script.
+NOTE: This file was generated by the dev/update_config_docs.sh script.
 Do not edit it manually as changes will be overwritten.
 Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src/config.rs.
 -->
 
 # Configuration Settings
 
-The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
+DataFusion configurations control various aspects of DataFusion planning and execution
+
+## Setting Configuration Options
+
+### Programmatically
+You can set the options programmatically via the [`ConfigOptions`] object. For
+example, to configure the `datafusion.execution.target_partitions` using the API:
+
+```rust
+use datafusion::common::config::ConfigOptions;
+let mut config = ConfigOptions::new();
+config.execution.target_partitions = 1;
+```
+
+### Via Environment Variables
+
+You can also set configuration options via environment variables using
+[`ConfigOptions::from_env`], for example
+
+```shell
+DATAFUSION_EXECUTION_TARGET_PARTITIONS=1 ./your_program
+```
+
+### Via SQL
 
-For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
-To construct a session with options from the environment, use `SessionConfig::from_env`.
-The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
-For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
-Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
-If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
-Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
+You can also set configuration options via SQL using the `SET` command. For
+example, to configure `datafusion.execution.target_partitions`:
+
+```sql
+SET datafusion.execution.target_partitions = '1';
+```
+
+[`ConfigOptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html
+[`ConfigOptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env
+
+The following configuration settings are available:
 
 EOF
 
 echo "Running CLI and inserting config docs table"
 $PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE"
 
-echo "Running prettier"
-npx prettier@2.3.2 --write "$TARGET_FILE"
+echo "Inserting runtime config header"
+cat <<'EOF' >> "$TARGET_FILE"
+
+# Runtime Configuration Settings
+
+DataFusion runtime configurations can be set via SQL using the `SET` command.
+
+For example, to configure `datafusion.runtime.memory_limit`:
+
+```sql
+SET datafusion.runtime.memory_limit = '2G';
+```
+
+The following runtime configuration settings are available:
+
+EOF
+
+echo "Running CLI and inserting runtime config docs table"
+$PRINT_RUNTIME_CONFIG_DOCS_COMMAND >> "$TARGET_FILE"
+
+cat <<'EOF' >> "$TARGET_FILE"
+
+# Tuning Guide
+
+## Short Queries
+
+By default DataFusion will attempt to maximize parallelism and use all cores --
+For example, if you have 32 cores, each plan will split the data into 32
+partitions. However, if your data is small, the overhead of splitting the data
+to enable parallelization can dominate the actual computation.
+
+You can find out how many cores are being used via the [`EXPLAIN`] command and look
+at the number of partitions in the plan.
+
+[`EXPLAIN`]: sql/explain.md
+
+The `datafusion.optimizer.repartition_file_min_size` option controls the minimum file size the
+[`ListingTable`] provider will attempt to repartition. However, this
+does not apply to user defined data sources and only works when DataFusion has accurate statistics.
+
+If you know your data is small, you can set the `datafusion.execution.target_partitions`
+option to a smaller number to reduce the overhead of repartitioning. For very small datasets (e.g. less
+than 1MB), we recommend setting `target_partitions` to 1 to avoid repartitioning altogether.
+
+```sql
+SET datafusion.execution.target_partitions = '1';
+```
+
+[`ListingTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
+
+## Memory-limited Queries
+
+When executing a memory-consuming query under a tight memory limit, DataFusion 
+will spill intermediate results to disk.
+
+When the [`FairSpillPool`] is used, memory is divided evenly among partitions. 
+The higher the value of `datafusion.execution.target_partitions`, the less memory 
+is allocated to each partition, and the out-of-core execution path may trigger 
+more frequently, possibly slowing down execution.
+
+Additionally, while spilling, data is read back in `datafusion.execution.batch_size` size batches.
+The larger this value, the fewer spilled sorted runs can be merged. Decreasing this setting
+can help reduce the number of subsequent spills required. 
+
+In conclusion, for queries under a very tight memory limit, it's recommended to
+set `target_partitions` and `batch_size` to smaller values.
+
+```sql
+-- Query still gets parallelized, but each partition will have more memory to use
+SET datafusion.execution.target_partitions = 4;
+-- Smaller than the default '8192', while still keep the benefit of vectorized execution
+SET datafusion.execution.batch_size = 1024;
+```
+
+[`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
+
+## Join Queries
+
+Currently Apache Datafusion supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics + join
+condition of the two tables.
+
+# Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+``` sql
+SET datafusion.optimizer.<configuration_name>;
+```
+
+Example
+
+``` sql
+SET datafusion.optimizer.prefer_hash_join = false;
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query:
+
+## Join Optimizer Configurations
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
+  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.
+
+EOF
+
+
+echo "Running prettier ${PRETTIER_VERSION}"
+npx "prettier@${PRETTIER_VERSION}" --write "$TARGET_FILE"
 
 echo "'$TARGET_FILE' successfully updated!"
diff --git a/dev/update_datafusion_versions.py b/dev/update_datafusion_versions.py
index cf72e5a4159dc..1edf0f127112f 100755
--- a/dev/update_datafusion_versions.py
+++ b/dev/update_datafusion_versions.py
@@ -19,7 +19,7 @@
 # Script that updates versions for datafusion crates, locally
 #
 # dependencies:
-# pip install tomlkit
+# uv sync
 
 import re
 import argparse
@@ -48,7 +48,6 @@
     'datafusion-benchmarks': 'benchmarks/Cargo.toml',
     'datafusion-cli': 'datafusion-cli/Cargo.toml',
     'datafusion-examples': 'datafusion-examples/Cargo.toml',
-    'datafusion-docs': 'docs/Cargo.toml',
 }
 
 def update_workspace_version(new_version: str):
@@ -116,7 +115,9 @@ def update_docs(path: str, new_version: str):
     with open(path, 'r+') as fd:
         content = fd.read()
         fd.seek(0)
-        content = re.sub(r'datafusion = "(.+)"', f'datafusion = "{new_version}"', content)
+        content = re.sub(r'datafusion\s*=\s*"(.+?)"', f'datafusion = "{new_version}"', content)
+        content = re.sub(r'datafusion\s*=\s*\{\s*version\s*=\s*"(.+?)"', f'datafusion = {{ version = "{new_version}"', content)
+        fd.truncate()
         fd.write(content)
 
 
@@ -144,6 +145,9 @@ def main():
         update_downstream_versions(cargo_toml, new_version)
 
     update_docs("README.md", new_version)
+    update_docs("docs/source/download.md", new_version)
+    update_docs("docs/source/user-guide/example-usage.md", new_version)
+    update_docs("docs/source/user-guide/crate-configuration.md", new_version)
 
 
 if __name__ == "__main__":
diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh
index a9e87aacf5ad1..86a272ae196c8 100755
--- a/dev/update_function_docs.sh
+++ b/dev/update_function_docs.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
@@ -20,9 +20,11 @@
 
 set -e
 
-SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "${SOURCE_DIR}/../" && pwd
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+cd "${ROOT_DIR}"
 
+# Load centralized tool versions
+source "${ROOT_DIR}/ci/scripts/utils/tool_versions.sh"
 
 TARGET_FILE="docs/source/user-guide/sql/aggregate_functions.md"
 PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- aggregate"
@@ -59,13 +61,62 @@ dev/update_function_docs.sh file for updating surrounding text.
 # Aggregate Functions
 
 Aggregate functions operate on a set of values to compute a single result.
+
+## Filter clause
+
+Aggregate functions support the SQL `FILTER (WHERE ...)` clause to restrict which input rows contribute to the aggregate result.
+
+```sql
+function([exprs]) FILTER (WHERE condition)
+```
+
+Example:
+
+```sql
+SELECT
+  sum(salary) FILTER (WHERE salary > 0) AS sum_positive_salaries,
+  count(*)    FILTER (WHERE active)     AS active_count
+FROM employees;
+```
+
+Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.
+
+## WITHIN GROUP / Ordered-set aggregates
+
+Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
+aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
+`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
+use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
+during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".
+
+Currently, the built-in aggregate functions that support `WITHIN GROUP` are:
+
+- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
+- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
+- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm
+
+Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
+use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.
+
+Example (ordered-set aggregate):
+
+```sql
+percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
+```
+
+Example (invalid usage — planner will error):
+
+```sql
+-- This will fail: SUM is not an ordered-set aggregate
+SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
+```
 EOF
 
 echo "Running CLI and inserting aggregate function docs table"
 $PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE"
 
-echo "Running prettier"
-npx prettier@2.3.2 --write "$TARGET_FILE"
+echo "Running prettier ${PRETTIER_VERSION}"
+npx "prettier@${PRETTIER_VERSION}" --write "$TARGET_FILE"
 
 echo "'$TARGET_FILE' successfully updated!"
 
@@ -108,8 +159,8 @@ EOF
 echo "Running CLI and inserting scalar function docs table"
 $PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE"
 
-echo "Running prettier"
-npx prettier@2.3.2 --write "$TARGET_FILE"
+echo "Running prettier ${PRETTIER_VERSION}"
+npx "prettier@${PRETTIER_VERSION}" --write "$TARGET_FILE"
 
 echo "'$TARGET_FILE' successfully updated!"
 
@@ -266,6 +317,17 @@ where **offset** is an non-negative integer.
 
 RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column).
 
+## Filter clause for aggregate window functions
+
+Aggregate window functions support the SQL `FILTER (WHERE ...)` clause to include only rows that satisfy the predicate from the window frame in the aggregation.
+
+```sql
+sum(salary) FILTER (WHERE salary > 0)
+  OVER (PARTITION BY depname ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+```
+
+If no rows in the frame satisfy the filter for a given output row, `COUNT` yields `0` while `SUM`/`AVG`/`MIN`/`MAX` yield `NULL`.
+
 ## Aggregate functions
 
 All [aggregate functions](aggregate_functions.md) can be used as window functions.
@@ -275,7 +337,7 @@ EOF
 echo "Running CLI and inserting window function docs table"
 $PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE"
 
-echo "Running prettier"
-npx prettier@2.3.2 --write "$TARGET_FILE"
+echo "Running prettier ${PRETTIER_VERSION}"
+npx "prettier@${PRETTIER_VERSION}" --write "$TARGET_FILE"
 
 echo "'$TARGET_FILE' successfully updated!"
diff --git a/dev/update_runtime_config_docs.sh b/dev/update_runtime_config_docs.sh
deleted file mode 100755
index 0d9d0f1033236..0000000000000
--- a/dev/update_runtime_config_docs.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-set -e
-
-SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "${SOURCE_DIR}/../" && pwd
-
-TARGET_FILE="docs/source/user-guide/runtime_configs.md"
-PRINT_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_runtime_config_docs"
-
-echo "Inserting header"
-cat <<'EOF' > "$TARGET_FILE"
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<!---
-This file was generated by the dev/update_runtime_config_docs.sh script.
-Do not edit it manually as changes will be overwritten.
-Instead, edit dev/update_runtime_config_docs.sh or the docstrings in datafusion/execution/src/runtime_env.rs.
--->
-
-# Runtime Environment Configurations
-
-DataFusion runtime configurations can be set via SQL using the `SET` command.
-
-For example, to configure `datafusion.runtime.memory_limit`:
-
-```sql
-SET datafusion.runtime.memory_limit = '2G';
-```
-
-The following runtime configuration settings are available:
-
-EOF
-
-echo "Running CLI and inserting runtime config docs table"
-$PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE"
-
-echo "Running prettier"
-npx prettier@2.3.2 --write "$TARGET_FILE"
-
-echo "'$TARGET_FILE' successfully updated!"
diff --git a/docs/.gitignore b/docs/.gitignore
index e2a54c053edf9..e73866cc0f359 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -15,7 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
-build
-temp
+temp/
+build/
 venv/
 .python-version
+__pycache__/
+
+# Generated dependency graph artifacts (produced during docs CI)
+source/_static/data/deps.dot
+source/_static/data/deps.svg
diff --git a/docs/Makefile b/docs/Makefile
index 6bce19911da5b..20ccd822f59c7 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -21,7 +21,7 @@
 
 # You can set these variables from the command line, and also
 # from the environment for the first two.
-SPHINXOPTS    ?=
+SPHINXOPTS    ?= -W
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = source
 BUILDDIR      = build
diff --git a/docs/README.md b/docs/README.md
index acf3cb754c008..48fdcefdeae1a 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -25,25 +25,37 @@ https://datafusion.apache.org/ as part of the release process.
 
 ## Dependencies
 
-It's recommended to install build dependencies and build the documentation
-inside a Python virtualenv.
+Install build dependencies and build the documentation using
+[uv](https://docs.astral.sh/uv/):
 
-- Python
-- `pip install -r requirements.txt`
+```sh
+uv sync
+uv run bash build.sh
+```
+
+The docs build regenerates the workspace dependency graph via
+`docs/scripts/generate_dependency_graph.sh`, so ensure `cargo`, `cargo-depgraph`
+(`cargo install cargo-depgraph --version ^1.6 --locked`), and Graphviz `dot`
+(`brew install graphviz` or `sudo apt-get install -y graphviz`) are available.
 
 ## Build & Preview
 
 Run the provided script to build the HTML pages.
 
 ```bash
+# If using venv, ensure you have activated it
 ./build.sh
 ```
 
-The HTML will be generated into a `build` directory.
+The HTML will be generated into a `build` directory. Open `build/html/index.html`
+in your preferred browser, e.g.
 
 Preview the site on Linux by running this command.
 
 ```bash
+# On macOS
+open build/html/index.html
+# On Linux with Firefox
 firefox build/html/index.html
 ```
 
diff --git a/docs/build.sh b/docs/build.sh
index 73516e8e9c68c..e12e3c1a5f202 100755
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -18,14 +18,14 @@
 # under the License.
 #
 
-set -e
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${SCRIPT_DIR}"
+
 rm -rf build 2> /dev/null
-rm -rf temp 2> /dev/null
-mkdir temp
-cp -rf source/* temp/
-# replace relative URLs with absolute URLs
-sed -i -e 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/blob\/main\//g' temp/contributor-guide/index.md
 
-python rustdoc_trim.py
+# Keep the workspace dependency graph in sync with the codebase.
+scripts/generate_dependency_graph.sh
 
-make SOURCEDIR=`pwd`/temp SPHINXOPTS=-W html
+make html
diff --git a/docs/make.bat b/docs/make.bat
index ded5b4a3e2b67..33e25e4ee4651 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -23,7 +23,8 @@ REM Command file for Sphinx documentation
 
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
-)
+)
+set SPHINXOPTS=-W
 set SOURCEDIR=source
 set BUILDDIR=build
 
diff --git a/docs/pyproject.toml b/docs/pyproject.toml
new file mode 100644
index 0000000000000..12eb6b1e4bfa6
--- /dev/null
+++ b/docs/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "datafusion-docs"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+  "sphinx>=9,<10",
+  "sphinx-reredirects>=1.1,<2",
+  "pydata-sphinx-theme>=0.16,<1",
+  "myst-parser>=5,<6",
+  "maturin>=1.11,<2",
+  "jinja2>=3.1,<4",
+  "setuptools>=82,<83",
+]
diff --git a/docs/rustdoc_trim.py b/docs/rustdoc_trim.py
index 7ea96dbb44a54..ab146c6a662fd 100644
--- a/docs/rustdoc_trim.py
+++ b/docs/rustdoc_trim.py
@@ -16,11 +16,10 @@
 # under the License.
 
 import re
-
-from pathlib import Path
+from sphinx.application import Sphinx
 
 # Regex pattern to match Rust code blocks in Markdown
-RUST_CODE_BLOCK_PATTERN = re.compile(r"```rust\s*(.*?)```", re.DOTALL)
+RUST_CODE_BLOCK_PATTERN = re.compile(r"```rust(?:,ignore)?\s*(.*?)```", re.DOTALL)
 
 
 def remove_hashtag_lines_in_rust_blocks(markdown_content):
@@ -46,30 +45,16 @@ def _process_code_block(match):
     return RUST_CODE_BLOCK_PATTERN.sub(_process_code_block, markdown_content)
 
 
-# Example usage
-def process_markdown_file(file_path):
-    # Read the Markdown file
-    with open(file_path, "r", encoding="utf-8") as file:
-        markdown_content = file.read()
-
+def process_source_file(app: Sphinx, docname: str, source: list[str]):
+    original_content = source[0]
     # Remove lines starting with '#' in Rust code blocks
-    updated_markdown_content = remove_hashtag_lines_in_rust_blocks(markdown_content)
-
-    # Write the updated content back to the Markdown file
-    with open(file_path, "w", encoding="utf-8") as file:
-        file.write(updated_markdown_content)
-
-    print(f"Done processing file: {file_path}")
-
-
-root_directory = Path("./temp/library-user-guide")
-for file_path in root_directory.rglob("*.md"):
-    print(f"Processing file: {file_path}")
-    process_markdown_file(file_path)
+    modified_content = remove_hashtag_lines_in_rust_blocks(original_content)
+    source[0] = modified_content
 
-root_directory = Path("./temp/user-guide")
-for file_path in root_directory.rglob("*.md"):
-    print(f"Processing file: {file_path}")
-    process_markdown_file(file_path)
 
-print("All Markdown files processed.")
+def setup(app: Sphinx):
+    app.connect("source-read", process_source_file)
+    return dict(
+        parallel_read_safe=True,
+        parallel_write_safe=True,
+    )
diff --git a/docs/scripts/generate_dependency_graph.sh b/docs/scripts/generate_dependency_graph.sh
new file mode 100755
index 0000000000000..771f6f1932c37
--- /dev/null
+++ b/docs/scripts/generate_dependency_graph.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# See `usage()` for details about this script.
+#
+# The key commands to generate the dependency graph SVG in this script are:
+#   cargo depgraph ... | dot -Tsvg > deps.svg
+# See below for the exact command used.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+OUTPUT_DIR="${REPO_DIR}/docs/source/_static/data"
+SVG_OUTPUT="${OUTPUT_DIR}/deps.svg"
+
+usage() {
+  cat <<EOF
+Generate the workspace dependency graph SVG for the docs.
+
+'deps.svg' is embedded in the DataFusion docs (Contributor Guide → Architecture → Workspace Dependency Graph).
+
+Output:
+  SVG: ${SVG_OUTPUT}
+
+Usage: $(basename "$0")
+
+Options:
+  -h, --help  Show this help message.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if ! command -v cargo >/dev/null 2>&1; then
+  echo "cargo is required to build the dependency graph." >&2
+  exit 1
+fi
+
+if ! command -v cargo-depgraph > /dev/null 2>&1; then
+  echo "cargo-depgraph is required (install with: cargo install cargo-depgraph)." >&2
+  exit 1
+fi
+
+if ! command -v dot >/dev/null 2>&1; then
+  echo "Graphviz 'dot' is required to render the SVG." >&2
+  exit 1
+fi
+
+mkdir -p "${OUTPUT_DIR}"
+
+(
+  cd "${REPO_DIR}"
+  # Ignore utility crates only used by internal scripts
+  cargo depgraph \
+    --workspace-only \
+    --all-deps \
+    --dedup-transitive-deps \
+    --exclude gen,gen-common \
+    | dot \
+      -Grankdir=TB \
+      -Gconcentrate=true \
+      -Goverlap=false \
+      -Tsvg \
+      > "${SVG_OUTPUT}"
+)
+
+echo "Wrote dependency graph SVG to ${SVG_OUTPUT}"
diff --git a/docs/scripts/update_committer_list.py b/docs/scripts/update_committer_list.py
new file mode 100755
index 0000000000000..c66eb52468523
--- /dev/null
+++ b/docs/scripts/update_committer_list.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+Utility for updating the committer list in the governance documentation
+by reading from the Apache DataFusion phonebook and combining with existing data.
+"""
+
+import re
+import requests
+import sys
+import os
+from typing import Dict, List, NamedTuple, Set
+
+
+class Committer(NamedTuple):
+    name: str
+    apache: str
+    github: str
+    affiliation: str
+    role: str
+
+
+# Return (pmc, committers) each a dictionary like
+# key: apache id
+# value: Real name
+
+def get_asf_roster():
+    """Get the current roster from Apache phonebook."""
+    # See https://home.apache.org/phonebook-about.html
+    committers_url = "https://whimsy.apache.org/public/public_ldap_projects.json"
+
+    # people https://whimsy.apache.org/public/public_ldap_people.json
+    people_url = "https://whimsy.apache.org/public/public_ldap_people.json"
+
+    try:
+        r = requests.get(committers_url)
+        r.raise_for_status()
+        j = r.json()
+        proj = j['projects']['datafusion']
+
+        # Get PMC members and committers
+        pmc_ids = set(proj['owners'])
+        committer_ids = set(proj['members']) - pmc_ids
+
+    except Exception as e:
+        print(f"Error fetching ASF roster: {e}")
+        return set(), set()
+
+    # Fetch people to get github handles and affiliations
+    #
+    # The data looks like this:
+    # {
+    #   "lastCreateTimestamp": "20250913131506Z",
+    #   "people_count": 9932,
+    #   "people": {
+    #     "a_budroni": {
+    #       "name": "Alessandro Budroni",
+    #       "createTimestamp": "20160720223917Z"
+    #     },
+    #   ...
+    #  }
+    try:
+        r = requests.get(people_url)
+        r.raise_for_status()
+        j = r.json()
+        people = j['people']
+
+        # make a dictionary with each pmc_id and value their real name
+        pmcs = {p: people[p]['name'] for p in pmc_ids}
+        committers = {c: people[c]['name'] for c in committer_ids}
+
+    except Exception as e:
+        print(f"Error fetching ASF people: {e}")
+
+
+    return pmcs, committers
+
+
+
+def parse_existing_table(content: str) -> List[Committer]:
+    """Parse the existing committer table from the markdown content."""
+    committers = []
+
+    # Find the table between the markers
+    start_marker = "<!-- Begin Auto-Generated Committer List -->"
+    end_marker = "<!-- End Auto-Generated Committer List -->"
+
+    start_idx = content.find(start_marker)
+    end_idx = content.find(end_marker)
+
+    if start_idx == -1 or end_idx == -1:
+        return committers
+
+    table_content = content[start_idx:end_idx]
+
+    # Parse table rows (skip header and separator)
+    lines = table_content.split('\n')
+    for line in lines:
+        line = line.strip()
+        if line.startswith('|')  and '---' not in line and line.count('|') >= 4:
+            # Split by | and clean up
+            parts = [part.strip() for part in line.split('|')]
+            if len(parts) >= 5:
+                name = parts[1].strip()
+                apache = parts[2].strip()
+                github = parts[3].strip()
+                affiliation = parts[4].strip()
+                role = parts[5].strip()
+
+                if name and name != 'Name' and (not '-----' in name):
+                    committers.append(Committer(name, apache, github, affiliation, role))
+
+    return committers
+
+
+def generate_table_row(committer: Committer) -> str:
+    """Generate a markdown table row for a committer."""
+    github_link = f"[{committer.github}](https://github.com/{committer.github})"
+    return f"| {committer.name:<23} | {committer.apache:<39} |{committer.github:<39} | {committer.affiliation:<11} | {committer.role:<9} |"
+
+
+def sort_committers(committers: List[Committer]) -> List[Committer]:
+    """Sort committers by role ('PMC Chair', PMC, Committer) then by apache id."""
+    role_order = {'PMC Chair': 0, 'PMC': 1, 'Committer': 2}
+
+    return sorted(committers, key=lambda c: (role_order.get(c.role, 3), c.apache.lower()))
+
+
+def update_governance_file(file_path: str):
+    """Update the governance file with the latest committer information."""
+    try:
+        with open(file_path, 'r') as f:
+            content = f.read()
+    except FileNotFoundError:
+        print(f"Error: File {file_path} not found")
+        return False
+
+    # Parse existing committers
+    existing_committers = parse_existing_table(content)
+    print(f"Found {len(existing_committers)} existing committers")
+
+    # Get ASF roster
+    asf_pmcs, asf_committers = get_asf_roster()
+    print(f"Found {len(asf_pmcs)} PMCs and {len(asf_committers)} committers in ASF roster")
+
+
+    # Create a map of existing committers by apache id
+    existing_by_apache = {c.apache: c for c in existing_committers}
+
+    # Update the entries based on the ASF roster
+    updated_committers = []
+    for apache_id, name in {**asf_pmcs, **asf_committers}.items():
+        role = 'PMC' if apache_id in asf_pmcs else 'Committer'
+        if apache_id in existing_by_apache:
+            existing = existing_by_apache[apache_id]
+            # Preserve PMC Chair role if already set
+            if existing.role == 'PMC Chair':
+                role = 'PMC Chair'
+            updated_committers.append(Committer(
+                name=existing.name,
+                apache=apache_id,
+                github=existing.github,
+                affiliation=existing.affiliation,
+                role=role
+            ))
+        # add a new entry for new committers with placeholder values
+        else:
+            print(f"New entry found: {name} ({apache_id})")
+            # Placeholder github and affiliation
+            updated_committers.append(Committer(
+                name=name,
+                apache=apache_id,
+                github="", # user should update
+                affiliation="",  # User should update
+                role=role
+            ))
+
+
+    # Sort the committers
+    sorted_committers = sort_committers(updated_committers)
+
+    # Generate new table
+    table_lines = [
+        "| Name                    | Apache ID | github                     | Affiliation | Role      |",
+        "|-------------------------|-----------|----------------------------|-------------|-----------|"
+    ]
+
+    for committer in sorted_committers:
+        table_lines.append(generate_table_row(committer))
+
+    new_table = '\n'.join(table_lines)
+
+    # Replace the table in the content
+    start_marker = "<!-- Begin Auto-Generated Committer List -->"
+    end_marker = "<!-- End Auto-Generated Committer List -->"
+
+    start_idx = content.find(start_marker)
+    end_idx = content.find(end_marker)
+
+    if start_idx == -1 or end_idx == -1:
+        print("Error: Could not find table markers in file")
+        return False
+
+    # Find the end of the start marker line
+    start_line_end = content.find('\n', start_idx) + 1
+
+    new_content = (
+        content[:start_line_end] +
+        new_table + '\n' +
+        content[end_idx:]
+    )
+
+    # Write back to file
+    try:
+        with open(file_path, 'w') as f:
+            f.write(new_content)
+        print(f"Successfully updated {file_path}")
+        return True
+    except Exception as e:
+        print(f"Error writing file: {e}")
+        return False
+
+
+def main():
+    """Main function."""
+    # Default path to governance file
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(script_dir)
+    governance_file = os.path.join(repo_root, "source", "contributor-guide", "governance.md")
+
+    if len(sys.argv) > 1:
+        governance_file = sys.argv[1]
+
+    if not os.path.exists(governance_file):
+        print(f"Error: Governance file not found at {governance_file}")
+        sys.exit(1)
+
+    print(f"Updating committer list in {governance_file}")
+
+    if update_governance_file(governance_file):
+        print("Committer list updated successfully")
+    else:
+        print("Failed to update committer list")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/_static/favicon.svg b/docs/source/_static/favicon.svg
new file mode 100644
index 0000000000000..bf174719bcf21
--- /dev/null
+++ b/docs/source/_static/favicon.svg
@@ -0,0 +1,10 @@
+<svg width="153" height="168" viewBox="0 0 153 168" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M76.134 168C88.4689 168 99.6146 158.249 107.584 142.553C127.231 144.598 143.098 139.996 149.76 128.509C156.288 117.253 152.646 101.683 141.581 86.0881C152.361 70.6665 155.851 55.3164 149.395 44.1852C143.015 33.1851 128.195 28.4991 109.697 29.9172C101.651 11.6232 89.6012 0 76.134 0C62.6024 0 50.5019 11.7346 42.4559 30.18C24.3535 28.9678 9.88686 33.6772 3.6047 44.5089C-2.8057 55.5617 0.590737 70.774 11.1922 86.0857C0.30461 101.573 -3.24333 117.007 3.24001 128.185C9.80558 139.506 25.3108 144.139 44.5631 142.314C52.5405 158.15 63.7366 168 76.134 168ZM52.723 141.192C59.5131 154.212 68.6244 162.178 78.6404 162.178C88.3909 162.178 97.6445 154.628 104.759 142.216C95.7498 141.011 86.0249 138.458 76.0813 134.579C68.0559 137.653 60.1825 139.861 52.723 141.192ZM65.066 129.772C59.6365 131.524 54.3493 132.843 49.3044 133.722C47.3604 128.877 45.6948 123.542 44.3578 117.82C47.6743 120.075 51.143 122.249 54.7512 124.322C58.1804 126.293 61.6254 128.11 65.066 129.772ZM76.1153 125.681C82.0554 123.201 88.0811 120.236 94.0712 116.793C102.753 111.803 110.601 106.248 117.406 100.431C118.007 95.1332 118.325 89.6363 118.325 84C118.325 76.7877 117.804 69.8034 116.837 63.1724C111.111 58.5306 104.726 54.1259 97.7783 50.1327C90.5496 45.9781 83.2382 42.5717 76.0688 39.9083C69.1761 42.5221 62.1608 45.8209 55.2217 49.809C50.8567 52.3176 46.714 54.9887 42.8173 57.7789C41.3987 65.9754 40.6264 74.8022 40.6264 84C40.6264 91.6514 41.1608 99.046 42.1572 106.035C47.3013 109.933 52.9148 113.66 58.9288 117.117C64.6605 120.411 70.4247 123.268 76.1153 125.681ZM87.1306 129.834C90.8382 128.069 94.5526 126.123 98.2488 123.999C104.42 120.452 110.183 116.612 115.475 112.577C113.788 120.729 111.414 128.213 108.516 134.761C101.819 133.947 94.6043 132.298 87.1306 129.834ZM117.71 110.839C116.16 119.645 113.869 127.798 110.979 135.025C127.491 136.554 140.523 132.79 145.949 123.434C151.173 114.426 148.569 101.574 140.004 88.2746C134.26 96.0014 126.717 103.697 117.71 110.839ZM136.558 79.6112C145.456 66.5866 148.333 53.995 143.077 44.9322C137.99 36.1605 126.021 31.9659 110.76 32.4296C114.303 41.1419 116.969 51.2264 118.504 62.197C125.46 67.7548 131.53 73.6409 136.558 79.6112ZM108.207 32.5487C111.854 40.6018 114.7 50.1134 116.432 60.5713C110.979 56.37 105.016 52.3749 98.6135 48.6953C91.9605 44.8716 85.2484 41.6273 78.6231 38.9688C89.0583 35.2503 99.1528 33.1155 108.207 32.5487ZM107.079 30.1561C97.4373 31.1742 86.8906 33.7897 76.078 37.9747C67.3015 34.6398 58.7118 32.342 50.652 31.0966C57.5997 15.5588 67.5671 5.82178 78.6404 5.82178C89.4885 5.82178 99.7214 15.1669 107.079 30.1561ZM73.5221 38.9912C65.2343 36.1013 57.1742 34.2138 49.6945 33.3208C46.9942 39.8377 44.805 47.278 43.2545 55.3791C46.7937 52.9437 50.5095 50.5998 54.3865 48.3716C60.7535 44.7123 67.1747 41.5836 73.5221 38.9912ZM33.7221 62.4957C35.2332 51.5364 37.8718 41.4543 41.3859 32.7321C26.5331 32.4484 14.915 36.6486 9.92297 45.2558C4.71482 54.2357 7.49152 66.68 16.1993 79.577C21.0882 73.7301 26.9781 67.96 33.7221 62.4957ZM12.7737 88.2981C18.3786 95.9001 25.7286 103.478 34.5053 110.536C36.0375 119.356 38.3126 127.526 41.1886 134.774C25.0714 136.101 12.386 132.308 7.05132 123.11C1.87209 114.18 4.38588 101.474 12.7737 88.2981ZM17.5655 81.5365C21.902 75.9826 27.2176 70.4453 33.3781 65.1689C32.6541 71.2227 32.2716 77.5258 32.2716 84C32.2716 88.9107 32.4917 93.723 32.9139 98.4027C26.8006 92.8905 21.6275 87.1849 17.5655 81.5365ZM135.191 81.5497C130.996 87.3221 125.646 93.1492 119.321 98.7635C119.765 93.9705 119.996 89.037 119.996 84C119.996 77.4071 119.6 70.9917 118.85 64.8363C125.223 70.211 130.718 75.8692 135.191 81.5497Z" fill="url(#paint0_linear_1_164)"/>
+<path d="M139.008 3.736H135.616V13H133.456V3.736H130.128V1.656H139.008V3.736ZM151.557 13H149.477V5.72L146.501 11.336H145.125L142.149 5.72L142.133 13H140.117V1.656H142.149L145.893 8.584L149.477 1.656H151.541L151.557 13Z" fill="#1F2937"/>
+<defs>
+<linearGradient id="paint0_linear_1_164" x1="30.4301" y1="141.748" x2="176.07" y2="60.7867" gradientUnits="userSpaceOnUse">
+<stop stop-color="#EF4136"/>
+<stop offset="1" stop-color="#FBB040"/>
+</linearGradient>
+</defs>
+</svg>
diff --git a/docs/source/_static/images/original_dark.svg b/docs/source/_static/images/original_dark.svg
new file mode 100644
index 0000000000000..532434f9214a6
--- /dev/null
+++ b/docs/source/_static/images/original_dark.svg
@@ -0,0 +1,31 @@
+<svg width="801" height="168" viewBox="0 0 801 168" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_1_181)">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M76.1297 168C88.4597 168 99.6097 158.25 107.58 142.55C127.23 144.6 143.09 139.99 149.76 128.51C156.29 117.25 152.65 101.68 141.58 86.09C152.36 70.67 155.85 55.32 149.39 44.19C143.01 33.19 128.19 28.5 109.69 29.92C101.65 11.62 89.5997 0 76.1297 0C62.6597 0 50.4997 11.73 42.4497 30.18C24.3497 28.97 9.87966 33.68 3.59966 44.51C-2.81034 55.56 0.589656 70.78 11.1897 86.09C0.299656 101.57 -3.24034 117.01 3.23966 128.18C9.80966 139.5 25.3097 144.13 44.5597 142.31C52.5397 158.15 63.7297 168 76.1297 168ZM52.7197 141.19C59.5097 154.21 68.6197 162.18 78.6397 162.18C88.6597 162.18 97.6397 154.63 104.76 142.22C95.7497 141.01 86.0297 138.46 76.0797 134.58C68.0497 137.65 60.1797 139.86 52.7197 141.19ZM65.0697 129.77C59.6397 131.52 54.3497 132.84 49.3097 133.72C47.3697 128.87 45.6997 123.54 44.3597 117.82C47.6797 120.07 51.1497 122.25 54.7497 124.32C58.1797 126.29 61.6197 128.11 65.0597 129.77H65.0697ZM76.1197 125.68C82.0597 123.2 88.0897 120.24 94.0797 116.79C102.76 111.8 110.61 106.25 117.41 100.43C118.01 95.13 118.33 89.64 118.33 84C118.33 76.79 117.81 69.8 116.84 63.17C111.11 58.53 104.73 54.12 97.7797 50.13C90.5497 45.98 83.2397 42.57 76.0697 39.91C69.1797 42.52 62.1597 45.82 55.2197 49.81C50.8597 52.32 46.7097 54.99 42.8197 57.78C41.3997 65.98 40.6297 74.8 40.6297 84C40.6297 91.65 41.1597 99.05 42.1597 106.04C47.2997 109.94 52.9197 113.66 58.9297 117.12C64.6597 120.41 70.4297 123.27 76.1197 125.68ZM87.1297 129.83C90.8397 128.07 94.5497 126.12 98.2497 124C104.42 120.45 110.18 116.61 115.48 112.58C113.79 120.73 111.42 128.22 108.52 134.76C101.82 133.95 94.6097 132.3 87.1297 129.83ZM117.71 110.84C116.16 119.65 113.87 127.8 110.98 135.03C127.49 136.56 140.52 132.79 145.95 123.44C151.17 114.43 148.57 101.58 140 88.28C134.26 96.01 126.71 103.7 117.71 110.84ZM136.56 79.61C145.46 66.59 148.33 53.99 143.08 44.93C137.99 36.16 126.02 31.96 110.76 32.43C114.3 41.14 116.97 51.23 118.5 62.2C125.46 67.76 131.53 73.64 136.55 79.61H136.56ZM108.21 32.55C111.86 40.6 114.7 50.11 116.43 60.57C110.98 56.37 105.01 52.37 98.6097 48.69C91.9597 44.87 85.2397 41.62 78.6197 38.96C89.0597 35.24 99.1497 33.11 108.2 32.54L108.21 32.55ZM107.08 30.16C97.4397 31.18 86.8897 33.79 76.0797 37.98C67.2997 34.65 58.7097 32.35 50.6497 31.1C57.5997 15.56 67.5697 5.83 78.6397 5.83C89.7097 5.83 99.7197 15.18 107.08 30.16ZM73.5197 38.99C65.2297 36.1 57.1697 34.21 49.6897 33.32C46.9897 39.84 44.7997 47.28 43.2497 55.38C46.7897 52.94 50.4997 50.6 54.3797 48.37C60.7497 44.71 67.1697 41.58 73.5197 38.99ZM33.7197 62.5C35.2297 51.54 37.8697 41.46 41.3797 32.74C26.5297 32.46 14.9097 36.66 9.91966 45.26C4.70966 54.24 7.48966 66.68 16.1997 79.58C21.0897 73.73 26.9797 67.96 33.7197 62.5ZM12.7697 88.3C18.3697 95.9 25.7197 103.48 34.4997 110.54C36.0297 119.36 38.3097 127.53 41.1797 134.78C25.0597 136.11 12.3797 132.31 7.03966 123.12C1.85966 114.19 4.36966 101.48 12.7597 88.31L12.7697 88.3ZM17.5697 81.54C21.9097 75.99 27.2197 70.45 33.3797 65.17C32.6597 71.22 32.2697 77.53 32.2697 84C32.2697 88.91 32.4897 93.72 32.9097 98.4C26.7997 92.89 21.6197 87.18 17.5597 81.53L17.5697 81.54ZM135.19 81.55C131 87.32 125.65 93.15 119.32 98.76C119.76 93.97 120 89.03 120 84C120 77.41 119.6 70.99 118.85 64.84C125.22 70.21 130.72 75.87 135.19 81.55Z" fill="url(#paint0_linear_1_181)"/>
+<path d="M204.34 45.97H192.83L190.47 52.2H185.2L196.07 23.88H201.47L212.54 52.2H206.78L204.34 45.97ZM202.74 41.74L198.54 30.99L194.46 41.74H202.73H202.74Z" fill="#FFF"/>
+<path d="M235.55 23.88H246.82C249.59 23.88 251.9 24.73 253.73 26.44C255.57 28.14 256.49 30.38 256.49 33.15C256.49 35.92 255.57 38.2 253.73 39.9C251.89 41.6 249.59 42.46 246.82 42.46H240.95V52.21H235.56V23.89L235.55 23.88ZM240.94 37.26H246.85C248 37.26 248.96 36.87 249.73 36.1C250.5 35.3 250.89 34.32 250.89 33.14C250.89 31.96 250.5 31 249.73 30.22C248.98 29.45 248.02 29.06 246.85 29.06H240.94V37.25V37.26Z" fill="#FFF"/>
+<path d="M293.67 45.97H282.16L279.8 52.2H274.53L285.4 23.88H290.79L301.86 52.2H296.1L293.66 45.97H293.67ZM292.07 41.74L287.87 30.99L283.79 41.74H292.06H292.07Z" fill="#FFF"/>
+<path d="M335.99 23C338.57 23 340.92 23.68 343.02 25.04C345.15 26.37 346.84 28.18 348.1 30.47L343.1 32.87C341.37 29.57 339 27.92 335.99 27.92C333.62 27.92 331.65 28.91 330.07 30.88C328.53 32.82 327.75 35.22 327.75 38.07C327.75 40.92 328.52 43.32 330.07 45.26C331.62 47.18 333.59 48.14 335.99 48.14C337.45 48.14 338.81 47.73 340.07 46.9C341.32 46.05 342.32 44.93 343.07 43.54L347.99 45.9C346.79 48.14 345.13 49.89 342.99 51.17C340.86 52.42 338.53 53.05 336 53.05C331.95 53.05 328.63 51.65 326.05 48.86C323.47 46.04 322.17 42.44 322.17 38.07C322.17 33.7 323.48 30.13 326.09 27.28C328.73 24.43 332.03 23.01 336 23.01L335.99 23Z" fill="#FFF"/>
+<path d="M393.49 52.2H388.09V40.61H377.1V52.2H371.71V23.88H377.1V35.43H388.09V23.88H393.49V52.2Z" fill="#FFF"/>
+<path d="M435.17 29.07H423.9V35.46H432.41V40.65H423.9V47H435.17V52.19H418.51V23.87H435.17V29.06V29.07Z" fill="#FFF"/>
+<path d="M185 76.1H207.25C218.63 76.1 227.58 79.3 234.1 85.69C240.69 92.02 243.98 100.14 243.98 110.04C243.98 119.94 240.69 128.1 234.1 134.49C227.51 140.88 218.53 144.08 207.15 144.08H184.99V76.1H185ZM199.96 90.49V129.7H206.67C209.55 129.7 212.2 129.38 214.63 128.74C217.12 128.1 219.43 127.08 221.54 125.67C223.71 124.26 225.41 122.25 226.62 119.63C227.83 116.95 228.44 113.75 228.44 110.04C228.44 106.33 227.83 103.17 226.62 100.55C225.4 97.93 223.71 95.92 221.54 94.51C219.43 93.1 217.16 92.08 214.73 91.44C212.3 90.8 209.61 90.48 206.67 90.48H199.96V90.49Z" fill="#FFF"/>
+<path d="M291.82 130.85H264.96L259.97 144.08H245.39L271.38 76.1H286.34L312.91 144.08H296.89L291.81 130.85H291.82ZM287.4 119.35L278.29 95.76L269.37 119.35H287.4Z" fill="#FFF"/>
+<path d="M362.7 90.49H342.94V144.09H327.98V90.49H308.51V76.11H362.7V90.49Z" fill="#FFF"/>
+<path d="M404.77 130.85H377.92L372.93 144.08H358.35L384.34 76.1H399.3L425.87 144.08H409.85L404.77 130.85ZM400.36 119.35L391.25 95.76L382.33 119.35H400.36Z" fill="#FFF"/>
+<path d="M474.73 90.49H448.45V102.95H468.4V117.33H448.45V144.08H433.49V76.1H474.73V90.48V90.49Z" fill="#FFF"/>
+<path d="M541.71 76.1V118.96C541.65 127.27 538.96 133.85 533.65 138.71C528.41 143.57 521.53 146 513.03 146C504.53 146 497.78 143.57 492.41 138.71C487.04 133.85 484.39 127.27 484.45 118.96V76.1H499.32V119.05C499.32 122.88 500.6 125.95 503.16 128.25C505.78 130.49 509.08 131.61 513.04 131.61C517 131.61 520.27 130.49 522.82 128.25C525.38 125.95 526.69 122.88 526.75 119.05V76.1H541.71Z" fill="#FFF"/>
+<path d="M578.69 73.99C585.72 73.99 591.96 75.59 597.39 78.78L590.77 91.24C585.98 89.19 581.43 88.17 577.15 88.17C570.95 88.17 567.85 90.25 567.85 94.4C567.85 96.45 569.03 98.17 571.4 99.58C573.83 100.92 577.28 102.39 581.76 103.99C586.3 105.52 589.72 106.96 592.02 108.3C598.22 111.88 601.32 117.25 601.32 124.41C601.32 131.12 598.86 136.4 593.93 140.23C589.01 144.07 582.87 145.98 575.52 145.98C571.36 145.98 567.05 145.21 562.57 143.68C558.09 142.15 554.61 140.39 552.12 138.41L560.18 126.43C567.02 130.46 573.22 132.21 578.79 131.7C580.96 131.51 582.76 130.74 584.16 129.4C585.63 128.06 586.24 126.43 585.98 124.51C585.72 122.72 584.22 121.25 581.47 120.1C578.72 118.89 575.14 117.48 570.73 115.88C566.38 114.28 562.83 112.4 560.08 110.22C555.03 106.13 552.5 101.05 552.5 94.98C552.5 91.98 553.01 89.26 554.03 86.83C555.05 84.4 556.4 82.42 558.06 80.89C559.79 79.29 561.8 77.98 564.1 76.96C566.4 75.87 568.77 75.11 571.2 74.66C573.63 74.21 576.12 73.99 578.68 73.99H578.69Z" fill="#FFF"/>
+<path d="M626.79 144.08H611.83V76.1H626.79V144.08Z" fill="#FFF"/>
+<path d="M647.04 84.16C653.88 77.38 662.48 74 672.84 74C683.2 74 691.73 77.39 698.45 84.16C705.23 90.87 708.62 99.4 708.62 109.76C708.62 120.12 705.33 128.68 698.74 135.45C692.15 142.16 683.52 145.68 672.84 146C662.48 146 653.88 142.58 647.04 135.74C640.26 128.84 636.87 120.21 636.87 109.85C636.87 99.49 640.26 90.93 647.04 84.15V84.16ZM672.84 87.61C666.64 87.61 661.68 89.69 657.97 93.84C654.26 97.99 652.41 103.3 652.41 109.76C652.41 116.22 654.26 121.62 657.97 125.96C661.74 130.24 666.7 132.38 672.84 132.38C678.98 132.38 683.96 130.24 687.61 125.96C691.25 121.68 693.08 116.31 693.08 109.85C693.08 103.39 691.26 98.12 687.61 94.03C684.03 89.88 679.11 87.73 672.84 87.61Z" fill="#FFF"/>
+<path d="M775 144.08H759.65L732.7 101.99V144.08H718.7V76.1H732.99L761 118.57V76.1H775V144.08Z" fill="#FFF"/>
+<path d="M788.01 78.18H784.62V87.44H782.46V78.18H779.13V76.1H788.01V78.18ZM800.56 87.45H798.48V80.17L795.5 85.79H794.12L791.14 80.17L791.12 87.45H789.1V76.11H791.13L794.87 83.04L798.45 76.11H800.51L800.53 87.45H800.56Z" fill="#FFF"/>
+</g>
+<defs>
+<linearGradient id="paint0_linear_1_181" x1="17.0897" y1="117.74" x2="162.73" y2="36.78" gradientUnits="userSpaceOnUse">
+<stop stop-color="#EF4137"/>
+<stop offset="1" stop-color="#FBB042"/>
+</linearGradient>
+<clipPath id="clip0_1_181">
+<rect width="800.56" height="168" fill="currentColor"/>
+</clipPath>
+</defs>
+</svg>
diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css
index 3b1b86daac6aa..91bc8359658f1 100644
--- a/docs/source/_static/theme_overrides.css
+++ b/docs/source/_static/theme_overrides.css
@@ -29,7 +29,6 @@
   --pst-color-h2: var(--color-text-base);
   /* Use softer blue from bootstrap's default info color */
   --pst-color-info: 23, 162, 184;
-  --pst-header-height: 0px;
 }
 
 code {
@@ -40,12 +39,34 @@ code {
   text-align: center;
 }
 
-/* Ensure the logo is properly displayed */
+/* Limit both light and dark mode logos in the navbar */
+.logo__image {
+  height: 32px;
+  width: auto;     
+  max-height: 2.5rem;
+}
+
+/* Display appropriate logo for dark and light mode */
+.light-logo {
+  display: inline;
+}
+
+.dark-logo {
+  display: none;
+}
+
+html[data-theme="dark"] .light-logo {
+  display: none;
+}
 
-.navbar-brand {
-  height: auto;
-  width: auto;
-  padding: 0 2em;
+html[data-theme="dark"] .dark-logo {
+  display: inline;
+  background-color: transparent !important;
+}
+
+/* Align search bar & theme switch right */
+.navbar-header-items__end {
+  margin-left: auto;
 }
 
 /* This is the bootstrap CSS style for "table-striped". Since the theme does
@@ -84,3 +105,54 @@ Details: 8rem for search box etc*/
         white-space: normal !important;
     }
 }
+
+/* Make wide tables scroll within the content area to avoid overlapping the
+   right sidebar. Prevents tables from bleeding underneath the sticky sidebar. */
+.bd-content table {
+  display: block;
+  overflow-x: auto;
+  -webkit-overflow-scrolling: touch;
+  max-width: 100%;
+}
+
+/* Make table container width fit content instead of spanning full width. */
+.pst-scrollable-table-container {
+  display: inline-block;
+  overflow-x: auto;
+  max-width: 100%;
+}
+
+/* Restore proper table display to maintain column alignment */
+.bd-content table thead,
+.bd-content table tbody { display: table-row-group; }
+
+.bd-content table tr { display: table-row; }
+
+.bd-content table th,
+.bd-content table td { 
+  display: table-cell; 
+  white-space: normal;
+}
+
+/* Maintain striped styling when table scrolls */
+.bd-content table tbody tr:nth-of-type(odd) {
+  background-color: rgba(0, 0, 0, 0.03);
+}
+
+
+/* Ensure the config tables are readable without having to scroll horizontally. */
+
+:is(#configuration-settings, #runtime-configuration-settings) table {
+  display: table;
+  table-layout: fixed;
+}
+
+:is(#configuration-settings, #runtime-configuration-settings) th,
+:is(#configuration-settings, #runtime-configuration-settings) td {
+  word-wrap: break-word;
+}
+
+:is(#configuration-settings, #runtime-configuration-settings) th:nth-child(2),
+:is(#configuration-settings, #runtime-configuration-settings) td:nth-child(2) {
+  width: 15%;
+}
diff --git a/docs/source/_templates/docs-sidebar.html b/docs/source/_templates/docs-sidebar.html
index 7c3ecc3d802e1..fa3cd96b13605 100644
--- a/docs/source/_templates/docs-sidebar.html
+++ b/docs/source/_templates/docs-sidebar.html
@@ -1,21 +1,10 @@
-
-
-<form class="bd-search d-flex align-items-center" action="{{ pathto('search') }}" method="get">
-  <i class="icon fas fa-search"></i>
-  <input type="search" class="form-control" name="q" id="search-input" placeholder="{{ theme_search_bar_text }}" aria-label="{{ theme_search_bar_text }}" autocomplete="off" >
-</form>
-
 <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
+
   <div class="bd-toc-item active">
     {% if "python/api" in pagename or "python/generated" in pagename %}
-    {{ generate_nav_html("sidebar", startdepth=0, maxdepth=3, collapse=False, includehidden=True, titles_only=True) }}
+    {{ generate_toctree_html("sidebar", startdepth=0, maxdepth=3, collapse=False, includehidden=True, titles_only=True) }}
     {% else %}
-    {{ generate_nav_html("sidebar", startdepth=0, maxdepth=4, collapse=False, includehidden=True, titles_only=True) }}
+    {{ generate_toctree_html("sidebar", startdepth=0, maxdepth=4, collapse=False, includehidden=True, titles_only=True) }}
     {% endif %}
   </div>
-
-  <a class="navbar-brand" href="{{ pathto(master_doc) }}">
-    <img src="{{ pathto('_static/images/2x_bgwhite_original.png', 1) }}" class="logo" alt="logo">
-  </a>
-</nav>
-
+</nav>
\ No newline at end of file
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
index d3163f1a81476..b7b135af5f0a9 100644
--- a/docs/source/_templates/layout.html
+++ b/docs/source/_templates/layout.html
@@ -1,9 +1,5 @@
 {% extends "pydata_sphinx_theme/layout.html" %}
 
-{# Silence the navbar #}
-{% block docs_navbar %}
-{% endblock %}
-
 <!--
     Custom footer
 -->
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 00037867a0923..03dcfb5bfa61b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,16 +26,17 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+import os
+import sys
+
+# To pickup rustdoc_trim.py
+sys.path.insert(0, os.path.abspath(".."))
 
 # -- Project information -----------------------------------------------------
 
-project = 'Apache DataFusion'
-copyright = '2019-2025, Apache Software Foundation'
-author = 'Apache Software Foundation'
+project = "Apache DataFusion"
+copyright = "2019-2025, Apache Software Foundation"
+author = "Apache Software Foundation"
 
 
 # -- General configuration ---------------------------------------------------
@@ -44,23 +45,25 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.doctest',
-    'sphinx.ext.ifconfig',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.napoleon',
-    'myst_parser',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.doctest",
+    "sphinx.ext.ifconfig",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.napoleon",
+    "myst_parser",
+    "sphinx_reredirects",
+    "rustdoc_trim",
 ]
 
 source_suffix = {
-    '.rst': 'restructuredtext',
-    '.md': 'markdown',
+    ".rst": "restructuredtext",
+    ".md": "markdown",
 }
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -82,10 +85,16 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'pydata_sphinx_theme'
+html_theme = "pydata_sphinx_theme"
 
 html_theme_options = {
+     "logo": {
+        "image_light": "_static/images/original.svg", 
+        "image_dark": "_static/images/original_dark.svg", 
+    },
     "use_edit_page_button": True,
+    "navbar_center": [],
+    "navbar_end": ["theme-switcher"],
 }
 
 html_context = {
@@ -98,18 +107,18 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 html_logo = "_static/images/2x_bgwhite_original.png"
 
-html_css_files = [
-    "theme_overrides.css"
-]
+html_css_files = ["theme_overrides.css"]
 
 html_sidebars = {
     "**": ["docs-sidebar.html"],
 }
 
+html_favicon = '_static/favicon.svg'
+
 # tell myst_parser to auto-generate anchor links for headers h1, h2, h3
 myst_heading_anchors = 3
 
@@ -120,4 +129,10 @@
 # presence of some special characters like: 🚀, å, {,... But this isn’t a major
 # issue for our documentation. So, suppress these warnings to keep our build
 # log cleaner.
-suppress_warnings = ['misc.highlighting_failure']
+suppress_warnings = ["misc.highlighting_failure"]
+
+redirects = {
+    "library-user-guide/adding-udfs": "functions/index.html",
+    "user-guide/runtime_configs": "configs.html",
+    "library-user-guide/upgrading": "/library-user-guide/upgrading/index.html",
+}
diff --git a/docs/source/contributor-guide/api-health.md b/docs/source/contributor-guide/api-health.md
index d811bc357445a..f950c7cc0b365 100644
--- a/docs/source/contributor-guide/api-health.md
+++ b/docs/source/contributor-guide/api-health.md
@@ -19,39 +19,68 @@
 
 # API health policy
 
-DataFusion is used extensively as a library and has a large public API, thus it
-is important that the API is well maintained. In general, we try to minimize
-breaking API changes, but they are sometimes necessary.
+DataFusion is used extensively as a library in other applications and has a
+large public API. We try to keep the API well maintained and minimize breaking
+changes to avoid issues for downstream users.
 
-When possible, rather than making breaking API changes, we prefer to deprecate
-APIs to give users time to adjust to the changes.
+## Breaking API Changes
 
-## Upgrade Guides
-
-When making changes that require DataFusion users to make changes to their code
-as part of an upgrade please consider adding documentation to the version
-specific [Upgrade Guide]
-
-[upgrade guide]: ../library-user-guide/upgrading.md
+### What is the public API and what is a breaking API change?
 
-## Breaking Changes
-
-In general, a function is part of the public API if it appears on the [docs.rs page]
+In general, an item is part of the public API if it appears on the [docs.rs page].
 
 Breaking public API changes are those that _require_ users to change their code
 for it to compile and execute, and are listed as "Major Changes" in the [SemVer
-Compatibility Section of the cargo book]. Common examples of breaking changes:
+Compatibility Section of the Cargo Book]. Common examples of breaking changes include:
 
 - Adding new required parameters to a function (`foo(a: i32, b: i32)` -> `foo(a: i32, b: i32, c: i32)`)
 - Removing a `pub` function
 - Changing the return type of a function
+- Adding a new function to a `trait` without a default implementation
+
+Examples of non-breaking changes include:
+
+- Marking a function as deprecated (`#[deprecated]`)
+- Adding a new function to a `trait` with a default implementation
+
+### When to make breaking API changes?
+
+When possible, we prefer to avoid making breaking API changes. One common way to
+avoid such changes is to deprecate the old API, as described in the [Deprecation
+Guidelines](#deprecation-guidelines) section below.
+
+If you do want to propose a breaking API change, we must weigh the benefits of the
+change with the cost (impact on downstream users). It is often frustrating for
+downstream users to change their applications, and it is even more so if they
+do not gain improved capabilities.
+
+Examples of good reasons for making a breaking API change include:
 
-When making breaking public API changes, please add the `api-change` label to
-the PR so we can highlight the changes in the release notes.
+- The change allows new use cases that were not possible before
+- The change significantly enables improved performance
+
+Examples of potentially weak reasons for making breaking API changes include:
+
+- The change is an internal refactor to make DataFusion more consistent
+- The change is to remove an API that is not widely used but has not been marked as deprecated
+
+### What to do when making breaking API changes?
+
+When making breaking public API changes, please:
+
+1. Add the `api-change` label to the PR so we can highlight the changes in the release notes.
+2. Consider adding documentation to the version-specific [Upgrade Guide] if the required changes are non-trivial.
 
 [docs.rs page]: https://docs.rs/datafusion/latest/datafusion/index.html
 [semver compatibility section of the cargo book]: https://doc.rust-lang.org/cargo/reference/semver.html#change-categories
 
+## Upgrade Guides
+
+When a change requires DataFusion users to modify their code as part of an
+upgrade, please consider documenting it in the version-specific [Upgrade Guide].
+
+[upgrade guide]: ../library-user-guide/upgrading/index.rst
+
 ## Deprecation Guidelines
 
 When deprecating a method:
@@ -59,8 +88,8 @@ When deprecating a method:
 - Mark the API as deprecated using `#[deprecated]` and specify the exact DataFusion version in which it was deprecated
 - Concisely describe the preferred API to help the user transition
 
-The deprecated version is the next version which contains the deprecation. For
-example, if the current version listed in [`Cargo.toml`] is `43.0.0` then the next
+The deprecated version is the next version that introduces the deprecation. For
+example, if the current version listed in [`Cargo.toml`] is `43.0.0`, then the next
 version will be `44.0.0`.
 
 [`cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml
@@ -76,4 +105,4 @@ pub fn api_to_deprecated(a: usize, b: usize) {}
 
 Deprecated methods will remain in the codebase for a period of 6 major versions or 6 months, whichever is longer, to provide users ample time to transition away from them.
 
-Please refer to [DataFusion releases](https://crates.io/crates/datafusion/versions) to plan ahead API migration
+Please refer to [DataFusion releases](https://crates.io/crates/datafusion/versions) to plan API migration ahead of time.
diff --git a/docs/source/contributor-guide/architecture.md b/docs/source/contributor-guide/architecture.md
index 1a094968a2742..8197e0cd00a08 100644
--- a/docs/source/contributor-guide/architecture.md
+++ b/docs/source/contributor-guide/architecture.md
@@ -55,7 +55,7 @@ contain features that are useful for a wide range of use cases. Use case specifi
 functionality (such as very specific time series or stream processing features)
 are typically implemented using the extension APIs.
 
-If have a use case that is not covered by the existing APIs, we would love to
+If you have a use case that is not covered by the existing APIs, we would love to
 work with you to design a new general purpose API. There are often others who are
 interested in similar extensions and the act of defining the API often improves
 the code overall for everyone.
diff --git a/docs/source/contributor-guide/architecture/dependency-graph.md b/docs/source/contributor-guide/architecture/dependency-graph.md
new file mode 100644
index 0000000000000..be3502f48beda
--- /dev/null
+++ b/docs/source/contributor-guide/architecture/dependency-graph.md
@@ -0,0 +1,180 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Workspace Dependency Graph
+
+This page shows the dependency relationships between DataFusion's workspace
+crates. This only includes internal dependencies, external crates like `Arrow` are not included
+
+The dependency graph is auto-generated by `docs/scripts/generate_dependency_graph.sh` to ensure it stays up-to-date, and the script now runs automatically as part of `docs/build.sh`.
+
+## Dependency Graph for Workspace Crates
+
+<!--
+  Below is an embedded .svg file, with interactive functionalities like drag/zoom-in/etc.
+  -->
+
+```{raw} html
+<div id="workspace-deps-wrapper" style="border:1px solid #d4d4d8;border-radius:10px;overflow:hidden;background:#fff;">
+  <div id="workspace-deps-inline" style="min-height:760px;width:100%;background:#f8fafc;overflow:hidden;padding:0;margin:0;">
+```
+
+```{eval-rst}
+.. raw:: html
+   :file: ../../_static/data/deps.svg
+```
+
+```{raw} html
+  </div>
+  <div style="padding:10px 12px;background:#f1f5f9;border-top:1px solid #e5e7eb;display:flex;justify-content:space-between;align-items:center;flex-wrap:wrap;gap:8px;">
+    <span style="color:#334155;font-size:0.95rem;">Interactive SVG (pan, zoom, search)</span>
+    <div style="display:flex;align-items:center;gap:6px;">
+      <button id="workspace-deps-zoom-out" type="button" style="padding:6px 10px;border:1px solid #cbd5e1;border-radius:6px;background:#fff;color:#334155;cursor:pointer;">−</button>
+      <button id="workspace-deps-zoom-in" type="button" style="padding:6px 10px;border:1px solid #cbd5e1;border-radius:6px;background:#fff;color:#334155;cursor:pointer;">+</button>
+    </div>
+    <a href="../../_static/data/deps.svg" target="_blank" rel="noopener"
+       style="font-weight:600;color:#2563eb;text-decoration:none;">Open SVG ↗</a>
+  </div>
+</div>
+<script>
+  (function () {
+    const host = document.getElementById("workspace-deps-inline");
+    if (!host) {
+      return;
+    }
+
+    const svg = host.querySelector("svg");
+      if (!svg) {
+        host.textContent = "Unable to load dependency graph.";
+        host.style.display = "flex";
+        host.style.alignItems = "center";
+        host.style.justifyContent = "center";
+      host.style.background = "#f8fafc";
+      return;
+    }
+
+    svg.removeAttribute("width");
+    svg.removeAttribute("height");
+    svg.style.width = "100%";
+    svg.style.height = "100%";
+    svg.style.cursor = "grab";
+    svg.style.touchAction = "none";
+
+    const rawViewBox = (svg.getAttribute("viewBox") || "").split(/\s+/).map(Number);
+    if (rawViewBox.length !== 4 || rawViewBox.some((v) => Number.isNaN(v))) {
+      return;
+    }
+
+    const initial = {
+      x: rawViewBox[0],
+      y: rawViewBox[1],
+      width: rawViewBox[2],
+      height: rawViewBox[3],
+    };
+
+    const state = { ...initial };
+    const applyViewBox = () => {
+      svg.setAttribute("viewBox", `${state.x} ${state.y} ${state.width} ${state.height}`);
+    };
+
+    let isPanning = false;
+    let last = { x: 0, y: 0 };
+
+    svg.addEventListener("pointerdown", (event) => {
+      isPanning = true;
+      last = { x: event.clientX, y: event.clientY };
+      svg.setPointerCapture(event.pointerId);
+      svg.style.cursor = "grabbing";
+    });
+
+    const endPan = (event) => {
+      if (event && svg.hasPointerCapture(event.pointerId)) {
+        svg.releasePointerCapture(event.pointerId);
+      }
+      isPanning = false;
+      svg.style.cursor = "grab";
+    };
+
+    svg.addEventListener("pointerup", endPan);
+    svg.addEventListener("pointerleave", endPan);
+    svg.addEventListener("pointercancel", endPan);
+
+    const zoomBy = (factor) => {
+      const targetWidth = state.width * factor;
+      const targetHeight = state.height * factor;
+      const minSize = Math.max(initial.width * 0.05, 10);
+      const maxSize = initial.width * 20;
+      const clampedWidth = Math.min(Math.max(targetWidth, minSize), maxSize);
+      const clampedHeight = Math.min(Math.max(targetHeight, minSize), maxSize);
+
+      state.x += (state.width - clampedWidth) / 2;
+      state.y += (state.height - clampedHeight) / 2;
+      state.width = clampedWidth;
+      state.height = clampedHeight;
+      applyViewBox();
+    };
+
+    const normalizeDelta = (deltaY, deltaMode) => {
+      // Make trackpad/wheel zoom feel smooth across devices.
+      const multiplier = deltaMode === 1 ? 16 : deltaMode === 2 ? window.innerHeight : 1;
+      return deltaY * multiplier;
+    };
+
+    svg.addEventListener("pointermove", (event) => {
+      if (!isPanning) {
+        return;
+      }
+      const scaleX = state.width / svg.clientWidth;
+      const scaleY = state.height / svg.clientHeight;
+      state.x -= (event.clientX - last.x) * scaleX;
+      state.y -= (event.clientY - last.y) * scaleY;
+      last = { x: event.clientX, y: event.clientY };
+      applyViewBox();
+    });
+
+    svg.addEventListener("wheel", (event) => {
+      event.preventDefault();
+
+      const delta = normalizeDelta(event.deltaY, event.deltaMode);
+      const factor = Math.exp(delta * 0.0015); // smaller magnitude for smoother scrolling
+      zoomBy(factor);
+    }, { passive: false });
+
+    const zoomIn = document.getElementById("workspace-deps-zoom-in");
+    const zoomOut = document.getElementById("workspace-deps-zoom-out");
+    if (zoomIn) {
+      zoomIn.addEventListener("click", () => zoomBy(0.9));
+    }
+    if (zoomOut) {
+      zoomOut.addEventListener("click", () => zoomBy(1.1));
+    }
+  })();
+</script>
+```
+
+### Legend
+
+- black lines: normal dependency
+- blue lines: dev-dependency
+- green lines: build-dependency
+- dotted lines: optional dependency (could be removed by disabling a cargo feature)
+
+Transitive dependencies are intentionally ignored to keep the graph readable.
+
+The dependency graph is generated through `cargo depgraph` by `docs/scripts/generate_dependency_graph.sh`.
diff --git a/docs/source/contributor-guide/communication.md b/docs/source/contributor-guide/communication.md
index 5d4561a3512c8..ad80ea498f501 100644
--- a/docs/source/contributor-guide/communication.md
+++ b/docs/source/contributor-guide/communication.md
@@ -17,56 +17,56 @@
   under the License.
 -->
 
-# Communication
+# Community Communication
 
 We welcome participation from everyone and encourage you to join us, ask
 questions, and get involved.
-
 All participation in the Apache DataFusion project is governed by the
 Apache Software Foundation's [code of
 conduct](https://www.apache.org/foundation/policies/conduct.html).
 
 ## GitHub
 
-The vast majority of communication occurs in the open on our
-[github repository](https://github.com/apache/datafusion) in the form of tickets, issues, discussions, and Pull Requests.
+The primary means of communication is the
+[GitHub repository](https://github.com/apache/datafusion) in the form of issues, discussions, and Pull Requests.
+Our repository is open to everyone. We encourage you to
+participate by reporting issues, asking questions, and contributing code.
 
-## Slack and Discord
+## Chat
 
-We use the Slack and Discord platforms for informal discussions and coordination. These are great places to
-meet other contributors and get guidance on where to contribute. It is important to note that any technical designs and
-decisions are made fully in the open, on GitHub.
+We also use the Discord and Slack platforms for lower latency, informal discussions and coordination.
+These are great places to
+meet other members of the community, ask questions, and brainstorm ideas.
+However, to ensure technical discussions are archived and accessible to everyone,
+all technical designs are recorded and formalized in GitHub issues.
 
-Most of us use the [ASF Slack
-workspace](https://s.apache.org/slack-invite) and the [Arrow Rust Discord
-server][discord-link] for discussions.
+### Discord
 
-There are specific channels for Arrow, DataFusion, and the DataFusion subprojects (Ballista, Comet, Python, etc).
+Historically, the most active discussion forum has been the [Arrow Rust Discord
+server][discord-link] which has specific channels for Arrow, DataFusion, and
+DataFusion subprojects such as Ballista, Comet, Python, etc.
+DataFusion specific channels are prefixed with the `#datafusion-` tag.
+We recommend new users join this server for real-time discussions with the community.
 
-In Slack we use these channels:
+### Slack
 
-- #arrow
-- #arrow-rust
-- #datafusion
-- #datafusion-ballista
-- #datafusion-comet
-- #datafusion-python
+Some of the community also uses the [ASF Slack workspace] for discussions. This
+has historically been much less active than the Discord server.
+Unfortunately, due to spammers, the ASF Slack workspace [requires an invitation]
+to join. We are happy to invite any community member -- please ask for an
+invitation in the Discord server.
 
-In Discord we use these channels:
+[asf slack workspace]: https://the-asf.slack.com/
+[requires an invitation]: https://s.apache.org/slack-invite
 
-- #ballista
-- #comet
-- #contrib-federation
-- #datafusion
-- #datafusion-python
-- #dolomite-optimizer
-- #general
-- #hiring
-- #incremental-materialized-views
+In Slack, we use these channels:
 
-Unfortunately, due to spammers, the ASF Slack workspace requires an invitation
-to join. We are happy to invite you -- please ask for an invitation in the
-Discord server.
+- `#arrow`
+- `#arrow-rust`
+- `#datafusion`
+- `#datafusion-ballista`
+- `#datafusion-comet`
+- `#datafusion-python`
 
 ### Job Board
 
@@ -77,8 +77,8 @@ Please feel free to post links to DataFusion related jobs there.
 ## Mailing Lists
 
 Like other Apache projects, we use [mailing lists] for certain purposes, most
-importantly release coordination. Other than the release process, most
-DataFusion mailing list traffic will simply link to a GitHub issue or PR where
+importantly release coordination and announcing new committers and PMC members.
+Other than these processes, most DataFusion mailing list traffic will link to a GitHub issue or PR where
 the actual discussion occurs. The project mailing lists are:
 
 - [`dev@datafusion.apache.org`](mailto:dev@datafusion.apache.org): the main
diff --git a/docs/source/contributor-guide/development_environment.md b/docs/source/contributor-guide/development_environment.md
index cd1b8ea356427..77910b3540dc1 100644
--- a/docs/source/contributor-guide/development_environment.md
+++ b/docs/source/contributor-guide/development_environment.md
@@ -75,18 +75,20 @@ Alternatively a binary release can be downloaded from the [Release Page](https:/
 
 DataFusion is written in Rust and it uses a standard rust toolkit:
 
+- `rustup update stable` DataFusion generally uses the latest stable release of Rust, though it may lag when new Rust toolchains release
+  - See which toolchain is currently pinned in the [`rust-toolchain.toml`](https://github.com/apache/datafusion/blob/main/rust-toolchain.toml) file
+  - This can cause issues such as not having the rust-analyzer component installed for the specified toolchain, in which case just install it manually, e.g. `rustup component add --toolchain 1.88.0 rust-analyzer`
 - `cargo build`
 - `cargo fmt` to format the code
-- `cargo test` to test
 - etc.
 
-Note that running `cargo test` requires significant memory resources, due to cargo running many tests in parallel by default. If you run into issues with slow tests or system lock ups, you can significantly reduce the memory required by instead running `cargo test -- --test-threads=1`. For more information see [this issue](https://github.com/apache/datafusion/issues/5347).
-
 Testing setup:
 
-- `rustup update stable` DataFusion uses the latest stable release of rust
 - `git submodule init`
 - `git submodule update --init --remote --recursive`
+- `cargo test` to run tests
+
+Note that running `cargo test` requires significant memory resources, due to cargo running many tests in parallel by default. If you run into issues with slow tests or system lock ups, you can significantly reduce the memory required by instead running `cargo test -- --test-threads=1`. For more information see [this issue](https://github.com/apache/datafusion/issues/5347).
 
 Formatting instructions:
 
diff --git a/docs/source/contributor-guide/governance.md b/docs/source/contributor-guide/governance.md
index 27ff90eb92c8d..e08308ad7a816 100644
--- a/docs/source/contributor-guide/governance.md
+++ b/docs/source/contributor-guide/governance.md
@@ -19,10 +19,6 @@
 
 # Governance
 
-The current PMC and committers are listed in the [Apache Phonebook].
-
-[apache phonebook]: https://projects.apache.org/committee.html?datafusion
-
 ## Overview
 
 DataFusion is part of the [Apache Software Foundation] and is governed following
@@ -38,6 +34,88 @@ As much as practicable, we strive to make decisions by consensus, and anyone in
 the community is encouraged to propose ideas, start discussions, and contribute
 to the project.
 
+## People
+
+DataFusion is currently governed by the following individuals
+
+<!--
+
+The following table can be updated by running the following script:
+
+docs/scripts/update_committer_list.py
+
+Notes:
+
+* The script only updates the Name and Apache ID columns. The rest of the data
+  is manually provided.
+
+-->
+
+<!-- Begin Auto-Generated Committer List -->
+
+| Name                    | Apache ID        | github                                                  | Affiliation    | Role      |
+| ----------------------- | ---------------- | ------------------------------------------------------- | -------------- | --------- |
+| Andrew Lamb             | alamb            | [alamb](https://github.com/alamb)                       | InfluxData     | PMC Chair |
+| Andrew Grove            | agrove           | [andygrove](https://github.com/andygrove)               | Apple          | PMC       |
+| Mustafa Akur            | akurmustafa      | [akurmustafa](https://github.com/akurmustafa)           | OHSU           | PMC       |
+| Berkay Şahin            | berkay           | [berkaysynnada](https://github.com/berkaysynnada)       | Synnada        | PMC       |
+| Oleksandr Voievodin     | comphead         | [comphead](https://github.com/comphead)                 | Apple          | PMC       |
+| Daniël Heres            | dheres           | [Dandandan](https://github.com/Dandandan)               |                | PMC       |
+| QP Hou                  | houqp            | [houqp](https://github.com/houqp)                       |                | PMC       |
+| Jie Wen                 | jakevin          | [jackwener](https://github.com/jackwener)               |                | PMC       |
+| Jay Zhan                | jayzhan          | [jayzhan211](https://github.com/jayzhan211)             |                | PMC       |
+| Jeffrey Vo              | jeffreyvo        | [Jefffrey](https://github.com/Jefffrey)                 |                | PMC       |
+| Jonah Gao               | jonah            | [jonahgao](https://github.com/jonahgao)                 |                | PMC       |
+| Kun Liu                 | liukun           | [liukun4515](https://github.com/liukun4515)             |                | PMC       |
+| Mehmet Ozan Kabak       | ozankabak        | [ozankabak](https://github.com/ozankabak)               | Synnada, Inc   | PMC       |
+| Tim Saucer              | timsaucer        | [timsaucer](https://github.com/timsaucer)               |                | PMC       |
+| L. C. Hsieh             | viirya           | [viirya](https://github.com/viirya)                     | Databricks     | PMC       |
+| Ruihang Xia             | wayne            | [waynexia](https://github.com/waynexia)                 | Greptime       | PMC       |
+| Wes McKinney            | wesm             | [wesm](https://github.com/wesm)                         | Posit          | PMC       |
+| Will Jones              | wjones127        | [wjones127](https://github.com/wjones127)               | LanceDB        | PMC       |
+| Xudong Wang             | xudong963        | [xudong963](https://github.com/xudong963)               | Polygon.io     | PMC       |
+| Yongting You            | ytyou            | [2010YOUY01](https://github.com/2010YOUY01)             | Independent    | PMC       |
+| Adrian Garcia Badaracco | adriangb         | [adriangb](https://github.com/adriangb)                 | Pydantic       | Committer |
+| Brent Gardner           | avantgardner     | [avantgardnerio](https://github.com/avantgardnerio)     | Coralogix      | Committer |
+| Dmitrii Blaginin        | blaginin         | [blaginin](https://github.com/blaginin)                 | SpiralDB       | Committer |
+| Piotr Findeisen         | findepi          | [findepi](https://github.com/findepi)                   | dbt Labs       | Committer |
+| Gabriel Musat           | gabotechs        | [gabotechs](https://github.com/gabotechs)               | DataDog        | Committer |
+| Jax Liu                 | goldmedal        | [goldmedal](https://github.com/goldmedal)               | Canner         | Committer |
+| Huaxin Gao              | huaxingao        | [huaxingao](https://github.com/huaxingao)               |                | Committer |
+| Ifeanyi Ubah            | iffyio           | [iffyio](https://github.com/iffyio)                     | Validio        | Committer |
+| Liu Jiayu               | jiayuliu         | [jimexist](https://github.com/jimexist)                 |                | Committer |
+| Ruiqiu Cao              | kamille          | [Rachelint](https://github.com/Rachelint)               | Tencent        | Committer |
+| Kazuyuki Tanimura       | kazuyukitanimura | [kazuyukitanimura](https://github.com/kazuyukitanimura) |                | Committer |
+| Eduard Karacharov       | korowa           | [korowa](https://github.com/korowa)                     |                | Committer |
+| Siew Kam Onn            | kosiew           | [kosiew](https://github.com/kosiew)                     |                | Committer |
+| Lewis Zhang             | linwei           | [lewiszlw](https://github.com/lewiszlw)                 | diit.cn        | Committer |
+| Matt Butrovich          | mbutrovich       | [mbutrovich](https://github.com/mbutrovich)             | Apple          | Committer |
+| Metehan Yildirim        | mete             | [metegenez](https://github.com/metegenez)               |                | Committer |
+| Marko Milenković        | milenkovicm      | [milenkovicm](https://github.com/milenkovicm)           |                | Committer |
+| Wang Mingming           | mingmwang        | [mingmwang](https://github.com/mingmwang)               |                | Committer |
+| Michael Ward            | mjward           | [Michael-J-Ward ](https://github.com/Michael-J-Ward)    |                | Committer |
+| Marco Neumann           | mneumann         | [crepererum](https://github.com/crepererum)             | InfluxData     | Committer |
+| Zhong Yanghong          | nju_yaho         | [yahoNanJing](https://github.com/yahoNanJing)           |                | Committer |
+| Paddy Horan             | paddyhoran       | [paddyhoran](https://github.com/paddyhoran)             | Assured Allies | Committer |
+| Parth Chandra           | parthc           | [parthchandra](https://github.com/parthchandra)         | Apple          | Committer |
+| Rémi Dettai             | rdettai          | [rdettai](https://github.com/rdettai)                   |                | Committer |
+| Raz Luvaton             | rluvaton         | [rluvaton](https://github.com/rluvaton)                 |                | Committer |
+| Chao Sun                | sunchao          | [sunchao](https://github.com/sunchao)                   | OpenAI         | Committer |
+| Daniel Harris           | thinkharderdev   | [thinkharderdev](https://github.com/thinkharderdev)     | Coralogix      | Committer |
+| Raphael Taylor-Davies   | tustvold         | [tustvold](https://github.com/tustvold)                 |                | Committer |
+| Zhen Wang               | wangzhen         | [wForget](https://github.com/wForget)                   |                | Committer |
+| Weijun Huang            | weijun           | [Weijun-H](https://github.com/Weijun-H)                 | OrbDB          | Committer |
+| Yang Jiang              | yangjiang        | [Ted-jiang](https://github.com/Ted-jiang)               | Ebay           | Committer |
+| Yoav Cohen              | ycohen           | [yoavcloud](https://github.com/yoavcloud)               |                | Committer |
+| Yijie Shen              | yjshen           | [yjshen](https://github.com/yjshen)                     | DataPelago     | Committer |
+| Qi Zhu                  | zhuqi            | [zhuqi-lucas](https://github.com/zhuqi-lucas)           | Polygon.io     | Committer |
+
+<!-- End Auto-Generated Committer List -->
+
+Note that the authoritative list of PMC and committers is the [Apache Phonebook]
+
+[apache phonebook]: https://projects.apache.org/committee.html?datafusion
+
 ## Roles
 
 - **Contributors**: Anyone who contributes to the project, whether it be code,
diff --git a/docs/source/contributor-guide/gsoc_application_guidelines.md b/docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md
similarity index 99%
rename from docs/source/contributor-guide/gsoc_application_guidelines.md
rename to docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md
index e8ca9703a5ddf..c127b4231b8e1 100644
--- a/docs/source/contributor-guide/gsoc_application_guidelines.md
+++ b/docs/source/contributor-guide/gsoc/gsoc_application_guidelines_2025.md
@@ -1,4 +1,4 @@
-# GSoC Application Guidelines
+# GSoC Application Guidelines (2025)
 
 ## Introduction
 
diff --git a/docs/source/contributor-guide/gsoc_project_ideas.md b/docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md
similarity index 99%
rename from docs/source/contributor-guide/gsoc_project_ideas.md
rename to docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md
index da6c24e2921b1..d81d9eb9adab5 100644
--- a/docs/source/contributor-guide/gsoc_project_ideas.md
+++ b/docs/source/contributor-guide/gsoc/gsoc_project_ideas_2025.md
@@ -1,4 +1,4 @@
-# GSoC Project Ideas
+# GSoC Project Ideas (2025)
 
 ## Introduction
 
diff --git a/docs/source/contributor-guide/gsoc/index.rst b/docs/source/contributor-guide/gsoc/index.rst
new file mode 100644
index 0000000000000..10b0013e9b169
--- /dev/null
+++ b/docs/source/contributor-guide/gsoc/index.rst
@@ -0,0 +1,36 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Google Summer of Code (GSOC)
+============================
+
+DataFusion has participated in
+`Google Summer of Code (GSOC) <https://summerofcode.withgoogle.com/>`_
+since 2025. GSOC is a global program that offers students stipends to
+write code for open source projects.
+
+If you are a interested in contributing to DataFusion, we encourage you
+to apply. You can find more information about the application process and
+project ideas in the sections below.
+
+
+.. toctree::
+   :maxdepth: 1
+
+   gsoc_application_guidelines_2025
+   gsoc_project_ideas_2025
+
diff --git a/docs/source/contributor-guide/howtos.md b/docs/source/contributor-guide/howtos.md
index 89a1bc7360a14..18d9391d24bbe 100644
--- a/docs/source/contributor-guide/howtos.md
+++ b/docs/source/contributor-guide/howtos.md
@@ -21,60 +21,86 @@
 
 ## How to update the version of Rust used in CI tests
 
-- Make a PR to update the [rust-toolchain] file in the root of the repository:
+Make a PR to update the [rust-toolchain] file in the root of the repository.
 
 [rust-toolchain]: https://github.com/apache/datafusion/blob/main/rust-toolchain.toml
 
-## How to add a new scalar function
-
-Below is a checklist of what you need to do to add a new scalar function to DataFusion:
-
-- Add the actual implementation of the function to a new module file within:
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions-nested) for arrays, maps and structs functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/crypto) for crypto functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/datetime) for datetime functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/encoding) for encoding functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/math) for math functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/regex) for regex functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/string) for string functions
-  - [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/unicode) for unicode functions
-  - create a new module [here](https://github.com/apache/datafusion/tree/main/datafusion/functions/src/) for other functions.
-- New function modules - for example a `vector` module, should use a [rust feature](https://doc.rust-lang.org/cargo/reference/features.html) (for example `vector_expressions`) to allow DataFusion
-  users to enable or disable the new module as desired.
-- The implementation of the function is done via implementing `ScalarUDFImpl` trait for the function struct.
-  - See the [advanced_udf.rs] example for an example implementation
-  - Add tests for the new function
-- To connect the implementation of the function add to the mod.rs file:
-  - a `mod xyz;` where xyz is the new module file
-  - a call to `make_udf_function!(..);`
-  - an item in `export_functions!(..);`
-- In [sqllogictest/test_files], add new `sqllogictest` integration tests where the function is called through SQL against well known data and returns the expected result.
-  - Documentation for `sqllogictest` [here](https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md)
-- Add SQL reference documentation [here](https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql/scalar_functions.md)
-  - An example of this being done can be seen [here](https://github.com/apache/datafusion/pull/12775)
-  - Run `./dev/update_function_docs.sh` to update docs
-
-[advanced_udf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
-[datafusion/expr/src]: https://github.com/apache/datafusion/tree/main/datafusion/expr/src
-[sqllogictest/test_files]: https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest/test_files
-
-## How to add a new aggregate function
-
-Below is a checklist of what you need to do to add a new aggregate function to DataFusion:
-
-- Add the actual implementation of an `Accumulator` and `AggregateExpr`:
-- In [datafusion/expr/src], add:
-  - a new variant to `AggregateFunction`
-  - a new entry to `FromStr` with the name of the function as called by SQL
-  - a new line in `return_type` with the expected return type of the function, given an incoming type
-  - a new line in `signature` with the signature of the function (number and types of its arguments)
-  - a new line in `create_aggregate_expr` mapping the built-in to the implementation
-  - tests to the function.
-- In [sqllogictest/test_files], add new `sqllogictest` integration tests where the function is called through SQL against well known data and returns the expected result.
-  - Documentation for `sqllogictest` [here](https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md)
-- Add SQL reference documentation [here](https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql/aggregate_functions.md)
-  - An example of this being done can be seen [here](https://github.com/apache/datafusion/pull/12775)
-  - Run `./dev/update_function_docs.sh` to update docs
+## Adding new functions
+
+**Implementation**
+
+| Function type | Location to implement     | Trait to implement                             | Macros to use                                    | Example              |
+| ------------- | ------------------------- | ---------------------------------------------- | ------------------------------------------------ | -------------------- |
+| Scalar        | [functions][df-functions] | [`ScalarUDFImpl`]                              | `make_udf_function!()` and `export_functions!()` | [`advanced_udf.rs`]  |
+| Nested        | [functions-nested]        | [`ScalarUDFImpl`]                              | `make_udf_expr_and_func!()`                      |                      |
+| Aggregate     | [functions-aggregate]     | [`AggregateUDFImpl`] and an [`Accumulator`]    | `make_udaf_expr_and_func!()`                     | [`advanced_udaf.rs`] |
+| Window        | [functions-window]        | [`WindowUDFImpl`] and a [`PartitionEvaluator`] | `define_udwf_and_expr!()`                        | [`advanced_udwf.rs`] |
+| Table         | [functions-table]         | [`TableFunctionImpl`] and a [`TableProvider`]  | `create_udtf_function!()`                        | [`simple_udtf.rs`]   |
+
+- The macros are to simplify some boilerplate such as ensuring a DataFrame API compatible function is also created
+- Ensure new functions are properly exported through the subproject
+  `mod.rs` or `lib.rs`.
+- Functions should preferably provide documentation via the `#[user_doc(...)]` attribute so their documentation
+  can be included in the SQL reference documentation (see below section)
+- Scalar functions are further grouped into modules for families of functions (e.g. string, math, datetime).
+  Functions should be added to the relevant module; if a new module needs to be created then a new [Rust feature]
+  should also be added to allow DataFusion users to conditionally compile the modules as needed
+- Aggregate functions can optionally implement a [`GroupsAccumulator`] for better performance
+
+Spark compatible functions are [located in separate crate][df-spark] but otherwise follow the same steps, though all
+function types (e.g. scalar, nested, aggregate) are grouped together in the single location.
+
+[df-functions]: https://github.com/apache/datafusion/tree/main/datafusion/functions
+[functions-nested]: https://github.com/apache/datafusion/tree/main/datafusion/functions-nested
+[functions-aggregate]: https://github.com/apache/datafusion/tree/main/datafusion/functions-aggregate
+[functions-window]: https://github.com/apache/datafusion/tree/main/datafusion/functions-window
+[functions-table]: https://github.com/apache/datafusion/tree/main/datafusion/functions-table
+[df-spark]: https://github.com/apache/datafusion/tree/main/datafusion/spark
+[`scalarudfimpl`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.ScalarUDFImpl.html
+[`aggregateudfimpl`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.AggregateUDFImpl.html
+[`accumulator`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.Accumulator.html
+[`groupsaccumulator`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.GroupsAccumulator.html
+[`windowudfimpl`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.WindowUDFImpl.html
+[`partitionevaluator`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.PartitionEvaluator.html
+[`tablefunctionimpl`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableFunctionImpl.html
+[`tableprovider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html
+[`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
+[`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
+[`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs
+[`simple_udtf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udtf.rs
+[rust feature]: https://doc.rust-lang.org/cargo/reference/features.html
+
+**Testing**
+
+Prefer adding `sqllogictest` integration tests where the function is called via SQL against
+well known data and returns an expected result. See the existing [test files][slt-test-files] if
+there is an appropriate file to add test cases to, otherwise create a new file. See the
+[`sqllogictest` documentation][slt-readme] for details on how to construct these tests.
+Ensure edge case, `null` input cases are considered in these tests.
+
+If a behaviour cannot be tested via `sqllogictest` (e.g. testing `simplify()`, needs to be
+tested in isolation from the optimizer, difficult to construct exact input via `sqllogictest`)
+then tests can be added as Rust unit tests in the implementation module, though these should be
+kept minimal where possible
+
+[slt-test-files]: https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest/test_files
+[slt-readme]: https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md
+
+**Documentation**
+
+Run documentation update script `./dev/update_function_docs.sh` which will update the relevant
+markdown document [here][fn-doc-home] (see the documents for [scalar][fn-doc-scalar],
+[aggregate][fn-doc-aggregate] and [window][fn-doc-window] functions)
+
+- You _should not_ manually update the markdown document after running the script as those manual
+  changes would be overwritten on next execution
+- Reference [GitHub issue] which introduced this behaviour
+
+[fn-doc-home]: https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql
+[fn-doc-scalar]: https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql/scalar_functions.md
+[fn-doc-aggregate]: https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql/aggregate_functions.md
+[fn-doc-window]: https://github.com/apache/datafusion/blob/main/docs/source/user-guide/sql/window_functions.md
+[github issue]: https://github.com/apache/datafusion/issues/12740
 
 ## How to display plans graphically
 
@@ -97,11 +123,13 @@ can be displayed. For example, the following command creates a
 dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf
 ```
 
-## How to format `.md` document
+## How to format `.md` documents
 
-We are using `prettier` to format `.md` files.
+We use [`prettier`] to format `.md` files.
 
-You can either use `npm i -g prettier` to install it globally or use `npx` to run it as a standalone binary. Using `npx` required a working node environment. Upgrading to the latest prettier is recommended (by adding `--upgrade` to the `npm` command).
+You can either use `npm i -g prettier` to install it globally or use `npx` to run it as a standalone binary.
+Using `npx` requires a working node environment. Upgrading to the latest prettier is recommended (by adding
+`--upgrade` to the `npm` command).
 
 ```bash
 $ prettier --version
@@ -114,19 +142,19 @@ After you've confirmed your prettier version, you can format all the `.md` files
 prettier -w {datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md
 ```
 
+[`prettier`]: https://prettier.io/
+
 ## How to format `.toml` files
 
-We use `taplo` to format `.toml` files.
+We use [`taplo`] to format `.toml` files.
 
-For Rust developers, you can install it via:
+To install via cargo:
 
 ```sh
 cargo install taplo-cli --locked
 ```
 
-> Refer to the [Installation section][doc] on other ways to install it.
->
-> [doc]: https://taplo.tamasfe.dev/cli/installation/binary.html
+> Refer to the [taplo installation documentation][taplo-install] for other ways to install it.
 
 ```bash
 $ taplo --version
@@ -139,28 +167,24 @@ After you've confirmed your `taplo` version, you can format all the `.toml` file
 taplo fmt
 ```
 
+[`taplo`]: https://taplo.tamasfe.dev/
+[taplo-install]: https://taplo.tamasfe.dev/cli/installation/binary.html
+
 ## How to update protobuf/gen dependencies
 
-The prost/tonic code can be generated by running `./regen.sh`, which in turn invokes the Rust binary located in `./gen`
+For the `proto` and `proto-common` crates, the prost/tonic code is generated by running their respective `./regen.sh` scripts,
+which in turn invokes the Rust binary located in `./gen`.
 
 This is necessary after modifying the protobuf definitions or altering the dependencies of `./gen`, and requires a
 valid installation of [protoc] (see [installation instructions] for details).
 
 ```bash
-./regen.sh
+# From repository root
+# proto-common
+./datafusion/proto-common/regen.sh
+# proto
+./datafusion/proto/regen.sh
 ```
 
 [protoc]: https://github.com/protocolbuffers/protobuf#protocol-compiler-installation
-[installation instructions]: https://datafusion.apache.org/contributor-guide/getting_started.html#protoc-installation
-
-## How to add/edit documentation for UDFs
-
-Documentations for the UDF documentations are generated from code (related [github issue]). To generate markdown run `./update_function_docs.sh`.
-
-This is necessary after adding new UDF implementation or modifying existing implementation which requires to update documentation.
-
-```bash
-./dev/update_function_docs.sh
-```
-
-[github issue]: https://github.com/apache/datafusion/issues/12740
+[installation instructions]: https://datafusion.apache.org/contributor-guide/development_environment.html#protoc-installation
diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
index e38898db5a92a..2ee8a2aaac6cc 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -32,19 +32,21 @@ community as well as get more familiar with Rust and the relevant codebases.
 
 ## Development Environment
 
-You can find how to setup build and testing environment [here](https://datafusion.apache.org/contributor-guide/development_environment.html)
+Setup your development environment [here](development_environment.md), and learn
+how to test the code [here](testing.md).
 
 ## Finding and Creating Issues to Work On
 
 You can find a curated [good-first-issue] list to help you get started.
+You can read about how we plan larger projects in the [Roadmap and Improvement Proposals](roadmap.md) section.
 
 [good-first-issue]: https://github.com/apache/datafusion/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22
 
 ### Open Contribution and Assigning tickets
 
 DataFusion is an open contribution project, and thus there is no particular
-project imposed deadline for completing any issue or any restriction on who can
-work on an issue, nor how many people can work on an issue at the same time.
+project imposed deadline for completing issues or restrictions on who can
+work on an issue, nor limits to how many people can work on an issue at the same time.
 
 Contributors drive the project forward based on their own priorities and
 interests and thus you are free to work on any issue that interests you.
@@ -58,55 +60,8 @@ If you want to work on an issue which is not already assigned to someone else
 and there are no comment indicating that someone is already working on that
 issue then you can assign the issue to yourself by submitting a single word
 comment `take`. This will assign the issue to yourself. However, if you are
-unable to make progress you should unassign the issue by using the `unassign me`
-link at the top of the issue page (and ask for help if are stuck) so that
-someone else can get involved in the work.
-
-### Discussing New Features
-
-If you plan to work on a new feature that doesn't have an existing ticket, it is
-a good idea to open a ticket to discuss the feature. Advanced discussion often
-helps avoid wasted effort by determining early if the feature is a good fit for
-DataFusion before too much time is invested. Discussion on a ticket can help
-gather feedback from the community and is likely easier to discuss than a 1000
-line PR.
-
-If you open a ticket and it doesn't get any response, you can try `@`-mentioning
-recently active community members in the ticket to get their attention.
-
-### What Contributions are Good Fits?
-
-DataFusion is designed to be highly extensible, and many features can be
-implemented as extensions without changes or additions to the core. Support for
-new functions, data formats, and similar functionality can be added using those
-extension APIs, and there are already many existing community supported
-extensions listed in the [extensions list].
-
-Query engines are complex pieces of software to develop and maintain. Given our
-limited maintenance bandwidth, we try to keep the DataFusion core as simple and
-focused as possible, while still satisfying the [design goal] of an easy to
-start initial experience.
-
-With that in mind, contributions that meet the following criteria are more likely
-to be accepted:
-
-1. Bug fixes for existing features
-2. Test coverage for existing features
-3. Documentation improvements / examples
-4. Performance improvements to existing features (with benchmarks)
-5. "Small" functional improvements to existing features (if they don't change existing behavior)
-6. Additional APIs for extending DataFusion's capabilities
-7. CI improvements
-
-Contributions that will likely involve more discussion (see Discussing New
-Features above) prior to acceptance include:
-
-1. Major new functionality (even if it is part of the "standard SQL")
-2. New functions, especially if they aren't part of "standard SQL"
-3. New data sources (e.g. support for Apache ORC)
-
-[extensions list]: ../library-user-guide/extensions.md
-[design goal]: https://docs.rs/datafusion/latest/datafusion/index.html#design-goals
+unable to make progress you should unassign the issue by commenting a single
+word `untake`.
 
 # Developer's guide
 
@@ -158,7 +113,7 @@ do take priority over the conventional commit approach, allowing maintainers to
 
 [conventional commits]: https://www.conventionalcommits.org/en/v1.0.0/
 
-# Reviewing Pull Requests
+## Reviewing Pull Requests
 
 Some helpful links:
 
@@ -220,3 +175,59 @@ The good thing about open code and open development is that any issues in one ch
 
 Pull requests will be marked with a `stale` label after 60 days of inactivity and then closed 7 days after that.
 Commenting on the PR will remove the `stale` label.
+
+## AI-Assisted contributions
+
+DataFusion has the following policy for AI-assisted PRs:
+
+- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
+- **Calls out unknowns and assumptions**. It's okay to not fully understand some bits of AI generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns. For example, you might comment "calling this function here seems to work but I'm not familiar with how it works internally, I wonder if there's a race condition if it is called concurrently".
+
+### Why fully AI-generated PRs without understanding are not helpful
+
+Today, AI tools cannot reliably make complex changes to DataFusion on their own, which is why we rely on pull requests and code review.
+
+The purposes of code review are:
+
+1. Finish the intended task.
+2. Share knowledge between authors and reviewers, as a long-term investment in the project. For this reason, even if someone familiar with the codebase can finish a task quickly, we're still happy to help a new contributor work on it even if it takes longer.
+
+An AI dump for an issue doesn’t meet these purposes. Maintainers could finish the task faster by using AI directly, and the submitters gain little knowledge if they act only as a pass through AI proxy without understanding.
+
+Please understand the reviewing capacity is **very limited** for the project, so large PRs which appear to not have the requisite understanding might not get reviewed, and eventually closed or redirected.
+
+### Better ways to contribute than an “AI dump”
+
+It's recommended to write a high-quality issue with a clear problem statement and a minimal, reproducible example. This can make it easier for others to contribute.
+
+### CI Runners
+
+#### Runs-On
+
+We use [Runs-On](https://runs-on.com/) for some actions in the main repository, which run in the ASF AWS account to speed up CI. In forks, these actions run on the default GitHub runners since forks do not have access to ASF infrastructure.
+
+To configure them, we use the following format:
+
+`runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}`
+
+This is a conditional expression that uses Runs-On custom runners for the main repository and falls back to the standard GitHub runners for forks. Runs-On configuration follows the [Runs-On pattern](https://runs-on.com/configuration/job-labels/).
+
+For those actions we also use the [Runs-On action](https://runs-on.com/caching/magic-cache/#how-to-use), which adds support for external caching and reports job metrics:
+
+`- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e`
+
+For the standard GitHub runners, this action will do nothing.
+
+##### Spot Instances
+
+By default, Runs-On actions run as [spot instances](https://runs-on.com/configuration/spot-instances/), which means they might occasionally be interrupted. In the CI you would see:
+
+```
+Error: The operation was canceled.
+```
+
+According to Runs-On, spot instance termination is extremely rare for instances running for less than 1h. Those actions will be restarted automatically.
+
+#### GitHub Runners
+
+We also use standard GitHub runners for some actions in the main repository; these are also runnable in forks.
diff --git a/docs/source/contributor-guide/inviting.md b/docs/source/contributor-guide/inviting.md
index c6ed2695cfc12..9696bd1238a4a 100644
--- a/docs/source/contributor-guide/inviting.md
+++ b/docs/source/contributor-guide/inviting.md
@@ -126,7 +126,7 @@ explicitly add them to the roster on the [Whimsy Roster Tool].
 ### Step 4: Announce and Celebrate the New Committer
 
 Email to Send an email such as the following to
-[dev@datafusion.apache.org](mailto:dev@datafusion.apache.org]) to celebrate and
+[dev@datafusion.apache.org](mailto:dev@datafusion.apache.org) to celebrate and
 acknowledge the new committer to the community.
 
 ```
@@ -175,8 +175,8 @@ Of course, you can decline and instead remain as a contributor,
 participating as you do now.
 
 A. This personal invitation is a chance for you to accept or decline
-in private. Either way, please let us know in reply to the
-private@datafusion.apache.org address only.
+in private. Either way, please let us know in reply to this email, make sure to reply-all (it should send a copy to
+private@datafusion.apache.org) for record keeping / log of the project.
 
 B. If you accept, the next step is to register an ICLA:
 
@@ -232,8 +232,8 @@ Of course, you can decline and instead remain as a contributor,
 participating as you do now.
 
 This personal invitation is a chance for you to accept or decline
-in private. Either way, please let us know in reply to the
-private@datafusion.apache.org address only. We will have to request an
+in private. Either way, please let us know in reply to this email, make sure to reply-all (it should send a copy to
+private@datafusion.apache.org) for record keeping / log of the project. We will have to request an
 Apache account be created for you, so please let us know what user id
 you would prefer.
 
@@ -275,14 +275,16 @@ probably find that you spend more time here.
 Of course, you can decline and instead remain as a contributor,
 participating as you do now.
 
-If you accept, please let us know by replying to private@datafusion.apache.org.
+If you accept, please let us know in reply to this email, make sure to reply-all (it should send a copy to
+private@datafusion.apache.org) for record keeping / log of the project.
 ```
 
 ## New PMC Members
 
-See also the ASF instructions on [how to add a PMC member].
+This is a DataFusion specific cookbook for the Apache Software Foundation
+instructions on [how to add a PMC member].
 
-[how to add a pmc member]: https://www.apache.org/dev/pmc.html#newpmc
+[how to add a pmc member]: https://www.apache.org/dev/pmc.html#pmcmembers
 
 ### Step 1: Start a Discussion Thread
 
@@ -333,29 +335,18 @@ Thanks,
 Your Name
 ```
 
-### Step 3: Send Notice to ASF Board
-
-The DataFusion PMC Chair then sends a NOTICE to `board@apache.org` (cc'ing
-`private@`) like this:
+If this vote succeeds, send a "RESULT" email to `private@` like this:
 
 ```
-To: board@apache.org
-Cc: private@datafusion.apache.org
-Subject: [NOTICE] $NEW_PMC_MEMBER to join DataFusion PMC
-
-DataFusion proposes to invite $NEW_PMC_MEMBER ($NEW_PMC_MEMBER_APACHE_ID) to join the PMC.
-
-The vote result is available here:
-$VOTE_RESULT_URL
+To: private@datafusion.apache.org
+Subject: [RESULT][VOTE] $NEW_PMC_MEMBER for PMC
 
-FYI: Full vote details:
-$VOTE_URL
+The vote carries with N +1 votes and no -1 votes. I will send an invitation
 ```
 
-### Step 4: Send invitation email
+### Step 3: Send invitation email
 
-Once, the PMC chair has confirmed that the email sent to `board@apache.org` has
-made it to the archives, the Chair sends an invitation e-mail to the new PMC
+Assuming the vote passes, the Chair sends an invitation e-mail to the new PMC
 member (cc'ing `private@`) like this:
 
 ```
@@ -405,11 +396,11 @@ With the expectation of your acceptance, welcome!
 The Apache DataFusion PMC
 ```
 
-### Step 5: Chair Promotes the Committer to PMC
+### Step 4: Chair Promotes the Committer to PMC
 
 The PMC chair adds the user to the PMC using the [Whimsy Roster Tool].
 
-### Step 6: Announce and Celebrate the New PMC Member
+### Step 5: Announce and Celebrate the New PMC Member
 
 Send an email such as the following to `dev@datafusion.apache.org` to celebrate:
 
diff --git a/docs/source/contributor-guide/roadmap.md b/docs/source/contributor-guide/roadmap.md
index 3d9c1ee371fe6..aac0710dadf77 100644
--- a/docs/source/contributor-guide/roadmap.md
+++ b/docs/source/contributor-guide/roadmap.md
@@ -17,7 +17,7 @@ specific language governing permissions and limitations
 under the License.
 -->
 
-# Roadmap
+# Roadmap and Improvement Proposals
 
 The [project introduction](../user-guide/introduction) explains the
 overview and goals of DataFusion, and our development efforts largely
@@ -25,102 +25,128 @@ align to that vision.
 
 ## Planning `EPIC`s
 
-DataFusion uses [GitHub
-issues](https://github.com/apache/datafusion/issues) to track
-planned work. We collect related tickets using tracking issues labeled
-with `[EPIC]` which contain discussion and links to more detailed items.
-
-Epics offer a high level roadmap of what the DataFusion
-community is thinking about. The epics are not meant to restrict
-possibilities, but rather help the community see where development is
-headed, align our work, and inspire additional contributions.
-
-As this project is entirely driven by volunteers, we welcome
-contributions for items not currently covered by epics. However,
-before submitting a large PR, we strongly suggest and request you
-start a conversation using a github issue or the
-[dev@arrow.apache.org](mailto:dev@arrow.apache.org) mailing list to
-make review efficient and avoid surprises.
-
-[The current list of `EPIC`s can be found here](https://github.com/apache/datafusion/issues?q=is%3Aissue+is%3Aopen+epic).
-
-# Quarterly Roadmap
-
-A quarterly roadmap will be published to give the DataFusion community
-visibility into the priorities of the projects contributors. This roadmap is not
-binding and we would welcome any/all contributions to help keep this list up to
-date.
-
-## 2023 Q4
-
-- Improve data output (`COPY`, `INSERT` and DataFrame) output capability [#6569](https://github.com/apache/datafusion/issues/6569)
-- Implementation of `ARRAY` types and related functions [#6980](https://github.com/apache/datafusion/issues/6980)
-- Write an industrial paper about DataFusion for SIGMOD [#6782](https://github.com/apache/datafusion/issues/6782)
-
-## 2022 Q2
-
-### DataFusion Core
-
-- IO Improvements
-  - Reading, registering, and writing more file formats from both DataFrame API and SQL
-  - Additional options for IO including partitioning and metadata support
-- Work Scheduling
-  - Improve predictability, observability and performance of IO and CPU-bound work
-  - Develop a more explicit story for managing parallelism during plan execution
-- Memory Management
-  - Add more operators for memory limited execution
-- Performance
-  - Incorporate row-format into operators such as aggregate
-  - Add row-format benchmarks
-  - Explore JIT-compiling complex expressions
-  - Explore LLVM for JIT, with inline Rust functions as the primary goal
-  - Improve performance of Sort and Merge using Row Format / JIT expressions
-- Documentation
-  - General improvements to DataFusion website
-  - Publish design documents
-- Streaming
-  - Create `StreamProvider` trait
-
-### Ballista
-
-- Make production ready
-  - Shuffle file cleanup
-  - Fill functional gaps between DataFusion and Ballista
-  - Improve task scheduling and data exchange efficiency
-  - Better error handling
-    - Task failure
-    - Executor lost
-    - Schedule restart
-  - Improve monitoring and logging
-  - Auto scaling support
-- Support for multi-scheduler deployments. Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching.
-- Executor deployment grouping based on resource allocation
-
-### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib))
-
-### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python)
-
-- Add missing functionality to DataFrame and SessionContext
-- Improve documentation
-
-### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3)
-
-- Create Python bindings to use with datafusion-python
-
-### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui)
-
-- Create multiple SQL editors
-- Expose more Context and query metadata
-- Support new data sources
-  - BigTable, HDFS, HTTP APIs
-
-### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable)
-
-- Python binding to use with datafusion-python
-- Timestamp range predicate pushdown
-- Multi-threaded partition aware execution
-- Production ready Rust SDK
-
-### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams)
-
-- Create experimental implementation of `StreamProvider` trait
+DataFusion uses [GitHub issues] to track planned work. We collect related
+tickets using tracking issues marked with the `EPIC` label, containing
+discussion and links to more detailed items:
+
+[github issues]: https://github.com/apache/datafusion/issues
+
+- [The current list of `EPIC`s can be found here.](https://github.com/apache/datafusion/issues?q=is%3Aissue%20state%3Aopen%20label%3AEPIC)
+
+- [The current list of `PROPOSAL EPIC` (that are not yet underway) can be found here.](https://github.com/apache/datafusion/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22PROPOSAL%20EPIC%22)
+
+Epics offer a high level roadmap of what the DataFusion community is thinking
+about. The epics are not meant to restrict possibilities, but rather help
+organize the community and make it easier to see where development is headed,
+align our work, and inspire additional contributions.
+
+We also welcome contributions for items not covered by epics. However, before
+submitting a large PR, we strongly suggest and request you start a conversation as described in [Discussing New Features](#discussing-new-features) below.
+
+[dev@arrow.apache.org]: mailto:dev@arrow.apache.org
+
+## Quarterly Roadmap
+
+The DataFusion roadmap is driven by the priorities of contributors rather than
+any single organization or coordinating committee. We typically discuss our
+roadmap using GitHub issues, approximately quarterly, and invite you to join the
+discussion.
+
+For more information:
+
+1. [Search for issues labeled `roadmap`](https://github.com/apache/datafusion/issues?q=is%3Aissue%20%20%20roadmap)
+2. [DataFusion Road Map: Q1 2026](https://github.com/apache/datafusion/issues/18494)
+3. [DataFusion Road Map: Q3-Q4 2025](https://github.com/apache/datafusion/issues/15878)
+4. [2024 Q4 / 2025 Q1 Roadmap](https://github.com/apache/datafusion/issues/13274)
+
+## Improvement Proposals
+
+### Discussing New Features
+
+If you plan to work on a new feature that doesn't have an existing ticket, it is
+a good idea to open one for discussion. Advanced discussion helps avoid wasted
+effort by determining if the feature is a good fit for DataFusion before too
+much time is invested. Discussion on a ticket can help gather feedback from the
+community and is likely easier to discuss than a 1000 line PR.
+
+Maintainers will mark major proposals as `PROPOSED EPIC` to make them more
+visible, but we are very limited on review bandwidth. If you open a ticket and it
+doesn't get any response, try `@`-mentioning recently active community members
+in the ticket, or [posting to the mailing list or Discord](communication.md).
+
+### Supervising Maintainers
+
+We have found that most successful epics have one or more "supervising
+maintainers", a committer ([see here for current list]) who take the lead on
+reviewing and committing PRs, helps with design, and coordinates and
+communicates with the community. If you want to ship a large feature, we
+recommend finding such maintainer upfront; otherwise, your PRs may
+remain unreviewed for a very long time.
+
+Supervising maintainers have no additional formal authority and there is
+currently no formal process for appointing, approving or tracking who has that
+role for a given epic. Instead, we rely on discussion on the ticket or PR.
+Helping complete an epic is a significant time commitment, so maintainers are
+more likely to help features they are particularly interested in or align with
+their own project's use of DataFusion.
+
+If you are willing to be a supervising maintainer for a feature, please say so
+explicitly. If you are unsure, we suggest asking directly who is willing to take
+the role, as it can be hard to tell sometimes whether a committer is simply
+participating and giving general feedback.
+
+[see here for current list]: governance.md
+
+### What Contributions are Good Fits?
+
+DataFusion is designed to be highly extensible, and many features can be
+implemented as extensions without changes or additions to the core. Support for
+new functions, data formats, and similar functionality can be added using those
+extension APIs, and there are already many existing community supported
+extensions listed in the [extensions list].
+
+Query engines are complex pieces of software to develop and maintain. Given our
+limited maintenance bandwidth, we try to keep the DataFusion core as simple and
+focused as possible, while still satisfying the [design goal] of an easy to
+start initial experience.
+
+With that in mind, contributions that meet the following criteria are more likely
+to be accepted:
+
+1. Bug fixes for existing features
+2. Test coverage for existing features
+3. Documentation improvements / examples
+4. Performance improvements to existing features (with benchmarks)
+5. "Small" functional improvements to existing features (if they don't change existing behavior)
+6. Additional APIs for extending DataFusion's capabilities
+7. CI improvements
+
+Contributions that will likely involve more discussion (see Discussing New
+Features above) prior to acceptance include:
+
+1. Major new functionality (even if it is part of the "standard SQL")
+2. New functions, especially if they aren't part of "standard SQL"
+3. New data sources (e.g. support for Apache ORC)
+
+[extensions list]: ../library-user-guide/extensions.md
+[design goal]: https://docs.rs/datafusion/latest/datafusion/index.html#design-goals
+
+### Design Build vs. Big Up Front Design
+
+Typically, the DataFusion community attacks large problems by solving them bit
+by bit and refining a solution iteratively on the `main` branch as a series of
+Pull Requests. This is different from projects which front-load the effort
+with a more comprehensive design process.
+
+By "advancing the front" the community always makes tangible progress, and the strategy is
+especially effective in a project that relies on individual contributors who may
+not have the time or resources to invest in a large upfront design effort.
+However, this "bit by bit approach" doesn't always succeed, and sometimes we get
+stuck or go down the wrong path and then change directions.
+
+Our process necessarily results in imperfect solutions being the "state of the
+code" in some cases, and larger visions are not yet fully realized. However, the
+community is good at driving things to completion in the long run. If you see
+something that needs improvement or an area that is not yet fully realized,
+please consider submitting an issue or PR to improve it. We are always looking
+for more contributions.
diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md
index eeed2a0c5d76c..43b727211de77 100644
--- a/docs/source/contributor-guide/testing.md
+++ b/docs/source/contributor-guide/testing.md
@@ -46,19 +46,43 @@ cargo nextest run
 ## Unit tests
 
 Tests for code in an individual module are defined in the same source file with a `test` module, following Rust convention.
-The [test_util](https://github.com/apache/datafusion/tree/main/datafusion/common/src/test_util.rs) module provides useful macros to write unit tests effectively, such as `assert_batches_sorted_eq` and `assert_batches_eq` for RecordBatches and `assert_contains` / `assert_not_contains` which are used extensively in the codebase.
+
+For example, to run tests in the `datafusion` crate:
+
+```shell
+cargo test -p datafusion
+```
+
+The [test_util] module provides useful macros to write unit tests effectively, such as [`assert_batches_sorted_eq`] and [`assert_batches_eq`] for RecordBatches and [`assert_contains`] / [`assert_not_contains`] which are used extensively in the codebase.
+
+[test_util]: https://github.com/apache/datafusion/tree/main/datafusion/common/src/test_util.rs
+[`assert_batches_sorted_eq`]: https://docs.rs/datafusion/latest/datafusion/macro.assert_batches_sorted_eq.html
+[`assert_batches_eq`]: https://docs.rs/datafusion/latest/datafusion/macro.assert_batches_eq.html
+[`assert_contains`]: https://docs.rs/datafusion/latest/datafusion/common/macro.assert_contains.html
+[`assert_not_contains`]: https://docs.rs/datafusion/latest/datafusion/common/macro.assert_not_contains.html
 
 ## sqllogictests Tests
 
-DataFusion's SQL implementation is tested using [sqllogictest](https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest) which are run like other tests using `cargo test --test sqllogictests`.
+DataFusion's SQL implementation is tested using [sqllogictest](https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest). You can run these tests with commands like:
 
-`sqllogictests` tests may be less convenient for new contributors who are familiar with writing `.rs` tests as they require learning another tool. However, `sqllogictest` based tests are much easier to develop and maintain as they 1) do not require a slow recompile/link cycle and 2) can be automatically updated via `cargo test --test sqllogictests -- --complete`.
+```shell
+# Run all tests
+cargo test --profile=ci --test sqllogictests
+# Run a specific test file
+cargo test --profile=ci --test sqllogictests -- aggregate.slt
+# Run a specific test file and update expected outputs
+cargo test --profile=ci --test sqllogictests -- aggregate.slt --complete
+# Run and update expected outputs for all test files
+cargo test --profile=ci --test sqllogictests -- --complete
+```
+
+`sqllogictests` may be less convenient for new contributors who are familiar with writing `.rs` tests as they require learning another tool. However, `sqllogictest` based tests are much easier to develop and maintain as they 1) do not require a slow recompile/link cycle and 2) can be automatically updated.
 
 Like similar systems such as [DuckDB](https://duckdb.org/dev/testing), DataFusion has chosen to trade off a slightly higher barrier to contribution for longer term maintainability.
 
 DataFusion has integrated [sqlite's test suite](https://sqlite.org/sqllogictest/doc/trunk/about.wiki) as a supplemental test suite that is run whenever a PR is merged into DataFusion. To run it manually please refer to the [README](https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md#running-tests-sqlite) file for instructions.
 
-## Snapshot testing
+## Snapshot testing (`cargo insta`)
 
 [Insta](https://github.com/mitsuhiko/insta) is used for snapshot testing. Snapshots are generated
 and compared on each test run. If the output changes, tests will fail.
@@ -75,28 +99,23 @@ cargo insta review
 In addition to the standard CI test suite that is run on all PRs prior to merge,
 DataFusion has "extended" tests (defined in [extended.yml]) that are run on each
 commit to `main`. These tests rarely fail but take significantly longer to run
-than the standard test suite and add important test coverage such as that the
-code works when there are hash collisions as well as running the relevant
-portions of the entire [sqlite test suite].
-
-You can run the extended tests on any PR by leaving the following comment (see [example here]):
+than the standard test suite and add important test coverage such as ensuring
+correctness when there are hash collisions and running the relevant portions of
+the entire [sqlite test suite]. You can run the extended tests
+locally by following the [instructions in the documentation].
 
-```
-Run extended tests
-```
-
-[extended.yml]: https://github.com/apache/datafusion/blob/main/.github/workflows/extended.yml
 [sqlite test suite]: https://www.sqlite.org/sqllogictest/dir?ci=tip
-[example here]: https://github.com/apache/datafusion/pull/15427#issuecomment-2759160812
+[instructions in the documentation]: https://github.com/apache/datafusion/tree/main/datafusion/sqllogictest#running-tests-sqlite
+[extended.yml]: https://github.com/apache/datafusion/blob/main/.github/workflows/extended.yml
 
 ## Rust Integration Tests
 
-There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/datafusion/tree/main/datafusion/core/tests) directory.
+There are several public interface tests for the DataFusion library in the [tests](https://github.com/apache/datafusion/tree/main/datafusion/core/tests) directory.
 
 You can run these tests individually using `cargo` as normal command such as
 
 ```shell
-cargo test -p datafusion --test parquet_exec
+cargo test -p datafusion --test parquet_integration
 ```
 
 ## SQL "Fuzz" testing
diff --git a/docs/source/download.md b/docs/source/download.md
index 33a6d70088779..3be76a6acf7b4 100644
--- a/docs/source/download.md
+++ b/docs/source/download.md
@@ -19,19 +19,25 @@
 
 # Download
 
-While DataFusion is also distributed via the Rust [crates.io] package manager as a convenience, the
+Most users use DataFusion as a library in their Rust projects by adding it as a dependency
+in their `Cargo.toml` file and downloading it from the Rust [crates.io] package registry.
+
+For example:
+
+```toml
+[dependencies]
+datafusion = "52.1.0"
+```
+
+While DataFusion is distributed via [crates.io] as a convenience, the
 official Apache DataFusion releases are provided as source artifacts.
 
 [crates.io]: https://crates.io/crates/datafusion
 
 ## Releases
 
-The latest source release is [41.0.0][source-link] ([asc][asc-link],
-[sha512][sha512-link]).
-
-[source-link]: https://www.apache.org/dyn/closer.lua/datafusion/datafusion-41.0.0/apache-datafusion-41.0.0.tar.gz?action=download
-[asc-link]: https://downloads.apache.org/datafusion/datafusion-41.0.0/apache-datafusion-41.0.0.tar.gz.asc
-[sha512-link]: https://downloads.apache.org/datafusion/datafusion-41.0.0/apache-datafusion-41.0.0.tar.gz.sha512
+You can find the latest releases, signatures and checksums on
+the [ASF Release Page](https://dist.apache.org/repos/dist/release/datafusion)
 
 For previous releases, please check the [archive](https://archive.apache.org/dist/datafusion/).
 
@@ -40,8 +46,10 @@ For releases earlier than 37.0.0, please check [Arrow's archive](https://archive
 ## Notes
 
 - When downloading a release, please verify the OpenPGP compatible signature (or failing that, check the SHA-512); these should be fetched from the main Apache site.
-- The KEYS file contains the public keys used for signing release. It is recommended that (when possible) a web of trust is used to confirm the identity of these keys.
-- Please download the [KEYS](https://downloads.apache.org/datafusion/KEYS) as well as the .asc signature files.
+- The [KEYS] file contains the public keys used for signing release. It is recommended that (when possible) a web of trust is used to confirm the identity of these keys.
+- Please download the [KEYS] file as well as the .asc signature files.
+
+[keys]: https://downloads.apache.org/datafusion/KEYS
 
 ### To verify the signature of the release artifact
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e920a0f036cbe..4d57faa0cbf73 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -15,8 +15,13 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
-.. image:: _static/images/2x_bgwhite_original.png
-  :alt: DataFusion Logo
+.. image:: _static/images/original.svg
+   :alt: DataFusion Logo
+   :class: light-logo
+
+.. image:: _static/images/original_dark.svg
+   :alt: DataFusion Logo
+   :class: dark-logo
 
 =================
 Apache DataFusion
@@ -49,16 +54,16 @@ The following related subprojects target end users and have separate documentati
 
 - `DataFusion Python <https://datafusion.apache.org/python/>`_ offers a Python interface for SQL and DataFrame
   queries.
-- `DataFusion Ray <https://github.com/apache/datafusion-ray/>`_ provides a distributed version of DataFusion
-  that scales out on `Ray <https://www.ray.io>`_ clusters.
 - `DataFusion Comet <https://datafusion.apache.org/comet/>`_ is an accelerator for Apache Spark based on
   DataFusion.
+- `DataFusion Ballista <https://datafusion.apache.org/ballista/>`_ is distributed processing extension for DataFusion.
 
 "Out of the box," DataFusion offers `SQL <https://datafusion.apache.org/user-guide/sql/index.html>`_
 and `Dataframe <https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html>`_ APIs,
 excellent `performance <https://benchmark.clickhouse.com/>`_, built-in support for CSV, Parquet, JSON, and Avro,
-extensive customization, and a great community.
+extensive customization, and a great `community`_.
 `Python Bindings <https://github.com/apache/datafusion-python>`_ are also available.
+`Ballista <https://datafusion.apache.org/ballista/>`_ is Apache DataFusion extension enabling the parallelized execution of workloads across multiple nodes in a distributed environment.
 
 DataFusion features a full query planner, a columnar, streaming, multi-threaded,
 vectorized execution engine, and partitioned data sources. You can
@@ -76,6 +81,7 @@ To get started, see
 .. _datafusion-examples: https://github.com/apache/datafusion/tree/main/datafusion-examples
 .. _developer’s guide: contributor-guide/index.html#developer-s-guide
 .. _library user guide: library-user-guide/index.html
+.. _community: contributor-guide/communication.html
 .. _communication: contributor-guide/communication.html
 
 .. _toc.asf-links:
@@ -113,11 +119,12 @@ To get started, see
    user-guide/crate-configuration
    user-guide/cli/index
    user-guide/dataframe
+   user-guide/arrow-introduction
    user-guide/expressions
    user-guide/sql/index
    user-guide/configs
-   user-guide/runtime_configs
    user-guide/explain-usage
+   user-guide/metrics
    user-guide/faq
 
 .. _toc.library-user-guide:
@@ -127,18 +134,20 @@ To get started, see
    :caption: Library User Guide
    
    library-user-guide/index
+   library-user-guide/upgrading/index
    library-user-guide/extensions
    library-user-guide/using-the-sql-api
+   library-user-guide/extending-sql
    library-user-guide/working-with-exprs
    library-user-guide/using-the-dataframe-api
    library-user-guide/building-logical-plans
    library-user-guide/catalogs
-   library-user-guide/adding-udfs
+   library-user-guide/functions/index
    library-user-guide/custom-table-providers
+   library-user-guide/table-constraints
    library-user-guide/extending-operators
    library-user-guide/profiling
    library-user-guide/query-optimizer
-   library-user-guide/upgrading
 
 .. .. _toc.contributor-guide:
 
@@ -150,6 +159,7 @@ To get started, see
    contributor-guide/communication
    contributor-guide/development_environment
    contributor-guide/architecture
+   contributor-guide/architecture/dependency-graph
    contributor-guide/testing
    contributor-guide/api-health
    contributor-guide/howtos
@@ -157,8 +167,7 @@ To get started, see
    contributor-guide/governance
    contributor-guide/inviting
    contributor-guide/specification/index
-   contributor-guide/gsoc_application_guidelines
-   contributor-guide/gsoc_project_ideas
+   contributor-guide/gsoc/index
 
 .. _toc.subprojects:
 
@@ -166,6 +175,6 @@ To get started, see
    :maxdepth: 1
    :caption: DataFusion Subprojects
 
-   DataFusion Ballista <https://arrow.apache.org/ballista/>
+   DataFusion Ballista <https://datafusion.apache.org/ballista/>
    DataFusion Comet <https://datafusion.apache.org/comet/>
    DataFusion Python <https://datafusion.apache.org/python/>
diff --git a/docs/source/library-user-guide/building-logical-plans.md b/docs/source/library-user-guide/building-logical-plans.md
index e1e75b3e4bdbd..9dc0fcbf31578 100644
--- a/docs/source/library-user-guide/building-logical-plans.md
+++ b/docs/source/library-user-guide/building-logical-plans.md
@@ -153,9 +153,9 @@ Filter: person.id > Int32(500) [id:Int32;N, name:Utf8;N]
 Logical plans can not be directly executed. They must be "compiled" into an
 [`ExecutionPlan`], which is often referred to as a "physical plan".
 
-Compared to `LogicalPlan`s `ExecutionPlans` have many more details such as
-specific algorithms and detailed optimizations compared to. Given a
-`LogicalPlan` the easiest way to create an `ExecutionPlan` is using
+Compared to `LogicalPlan`s, `ExecutionPlan`s have many more details such as
+specific algorithms and detailed optimizations. Given a
+`LogicalPlan`, the easiest way to create an `ExecutionPlan` is using
 [`SessionState::create_physical_plan`] as shown below
 
 ```rust
@@ -181,7 +181,7 @@ async fn main() -> Result<(), DataFusionError> {
     // TableProvider. For this example, we don't provide any data
     // but in production code, this would have `RecordBatch`es with
     // in memory data
-    let table_provider = Arc::new(MemTable::try_new(Arc::new(schema), vec![])?);
+    let table_provider = Arc::new(MemTable::try_new(Arc::new(schema), vec![vec![]])?);
     // Use the provider_as_source function to convert the TableProvider to a table source
     let table_source = provider_as_source(table_provider);
 
@@ -220,7 +220,7 @@ However, it is more common to use a [TableProvider]. To get a [TableSource] from
 [logicaltablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/builder/struct.LogicalTableSource.html
 [defaulttablesource]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html
 [provider_as_source]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/fn.provider_as_source.html
-[tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html
+[tableprovider]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html
 [tablesource]: https://docs.rs/datafusion-expr/latest/datafusion_expr/trait.TableSource.html
 [`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
 [`sessionstate::create_physical_plan`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionState.html#method.create_physical_plan
diff --git a/docs/source/library-user-guide/catalogs.md b/docs/source/library-user-guide/catalogs.md
index d4e6633d40ba7..daa329523afee 100644
--- a/docs/source/library-user-guide/catalogs.md
+++ b/docs/source/library-user-guide/catalogs.md
@@ -19,7 +19,7 @@
 
 # Catalogs, Schemas, and Tables
 
-This section describes how to create and manage catalogs, schemas, and tables in DataFusion. For those wanting to dive into the code quickly please see the [example](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/catalog.rs).
+This section describes how to create and manage catalogs, schemas, and tables in DataFusion. For those wanting to dive into the code quickly please see the [example](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/data_io/catalog.rs).
 
 ## General Concepts
 
diff --git a/docs/source/library-user-guide/custom-table-providers.md b/docs/source/library-user-guide/custom-table-providers.md
index 54f79a4218239..70b6be3ae2ab1 100644
--- a/docs/source/library-user-guide/custom-table-providers.md
+++ b/docs/source/library-user-guide/custom-table-providers.md
@@ -23,6 +23,9 @@ Like other areas of DataFusion, you extend DataFusion's functionality by impleme
 
 This section describes how to create a [`TableProvider`] and how to configure DataFusion to use it for reading.
 
+For details on how table constraints such as primary keys or unique
+constraints are handled, see [Table Constraint Enforcement](table-constraints.md).
+
 ## Table Provider and Scan
 
 The [`TableProvider::scan`] method reads data from the table and is likely the most important. It returns an [`ExecutionPlan`] that DataFusion will use to read the actual data during execution of the query. The [`TableProvider::insert_into`] method is used to `INSERT` data into the table.
@@ -49,11 +52,12 @@ use std::any::Any;
 use std::sync::{Arc, Mutex};
 use std::collections::{BTreeMap, HashMap};
 use datafusion::common::Result;
+use datafusion::common::tree_node::TreeNodeRecursion;
 use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use datafusion::physical_plan::expressions::PhysicalSortExpr;
 use datafusion::physical_plan::{
     ExecutionPlan, SendableRecordBatchStream, DisplayAs, DisplayFormatType,
-    Statistics, PlanProperties
+    Statistics, PlanProperties, PhysicalExpr
 };
 use datafusion::execution::context::TaskContext;
 use datafusion::arrow::array::{UInt64Builder, UInt8Builder};
@@ -105,7 +109,7 @@ impl ExecutionPlan for CustomExec {
     }
 
 
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
         unreachable!()
     }
 
@@ -150,6 +154,13 @@ impl ExecutionPlan for CustomExec {
             None,
         )?))
     }
+
+    fn apply_expressions(
+        &self,
+        _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+    ) -> Result<TreeNodeRecursion> {
+        Ok(TreeNodeRecursion::Continue)
+    }
 }
 ```
 
@@ -229,7 +240,7 @@ The `scan` method of the `TableProvider` returns a `Result<Arc<dyn ExecutionPlan
 #     }
 #
 #
-#     fn properties(&self) -> &PlanProperties {
+#     fn properties(&self) -> &Arc<PlanProperties> {
 #         unreachable!()
 #     }
 #
@@ -274,12 +285,20 @@ The `scan` method of the `TableProvider` returns a `Result<Arc<dyn ExecutionPlan
 #             None,
 #         )?))
 #     }
+#
+#     fn apply_expressions(
+#         &self,
+#         _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+#     ) -> Result<TreeNodeRecursion> {
+#         Ok(TreeNodeRecursion::Continue)
+#     }
 # }
 
 use async_trait::async_trait;
+use datafusion::common::tree_node::TreeNodeRecursion;
 use datafusion::logical_expr::expr::Expr;
 use datafusion::datasource::{TableProvider, TableType};
-use datafusion::physical_plan::project_schema;
+use datafusion::physical_plan::{project_schema, PhysicalExpr};
 use datafusion::catalog::Session;
 
 impl CustomExec {
@@ -421,7 +440,7 @@ This will allow you to use the custom table provider in DataFusion. For example,
 #     }
 #
 #
-#     fn properties(&self) -> &PlanProperties {
+#     fn properties(&self) -> &Arc<PlanProperties> {
 #         unreachable!()
 #     }
 #
@@ -466,12 +485,20 @@ This will allow you to use the custom table provider in DataFusion. For example,
 #             None,
 #         )?))
 #     }
+#
+#     fn apply_expressions(
+#         &self,
+#         _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+#     ) -> Result<TreeNodeRecursion> {
+#         Ok(TreeNodeRecursion::Continue)
+#     }
 # }
 
 # use async_trait::async_trait;
+# use datafusion::common::tree_node::TreeNodeRecursion;
 # use datafusion::logical_expr::expr::Expr;
 # use datafusion::datasource::{TableProvider, TableType};
-# use datafusion::physical_plan::project_schema;
+# use datafusion::physical_plan::{project_schema, PhysicalExpr};
 # use datafusion::catalog::Session;
 #
 # impl CustomExec {
@@ -566,6 +593,6 @@ More abstractly, see the following traits for more information on how to impleme
 - `FileFormat` - a trait for reading a file format
 - `ListingTableProvider` - a useful trait for implementing a `TableProvider` that lists files in a directory
 
-[ex]: https://github.com/apache/datafusion/blob/a5e86fae3baadbd99f8fd0df83f45fde22f7b0c6/datafusion-examples/examples/custom_datasource.rs#L214C1-L276
+[ex]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/custom_datasource.rs
 [csv]: https://github.com/apache/datafusion/blob/a5e86fae3baadbd99f8fd0df83f45fde22f7b0c6/datafusion/core/src/datasource/physical_plan/csv.rs#L57-L70
 [parquet]: https://github.com/apache/datafusion/blob/a5e86fae3baadbd99f8fd0df83f45fde22f7b0c6/datafusion/core/src/datasource/physical_plan/parquet.rs#L77-L104
diff --git a/docs/source/library-user-guide/extending-operators.md b/docs/source/library-user-guide/extending-operators.md
index 3d491806a4e6b..0a169531757c2 100644
--- a/docs/source/library-user-guide/extending-operators.md
+++ b/docs/source/library-user-guide/extending-operators.md
@@ -17,9 +17,12 @@
   under the License.
 -->
 
-# Extending DataFusion's operators: custom LogicalPlan and Execution Plans
+# Extending Operators
 
-DataFusion supports extension of operators by transforming logical plan and execution plan through customized [optimizer rules](https://docs.rs/datafusion/latest/datafusion/optimizer/trait.OptimizerRule.html). This section will use the µWheel project to illustrate such capabilities.
+DataFusion supports extending operators by transforming [`LogicalPlan`] and [`ExecutionPlan`] through customized [optimizer rules](https://docs.rs/datafusion/latest/datafusion/optimizer/trait.OptimizerRule.html). This section will use the µWheel project to illustrate such capabilities.
+
+[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html
+[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
 
 ## About DataFusion µWheel
 
@@ -35,7 +38,7 @@ fn rewrite(
   plan: LogicalPlan,
   _config: &dyn OptimizerConfig,
 ) -> Result<Transformed<LogicalPlan>> {
-    // Attemps to rewrite a logical plan to a uwheel-based plan that either provides
+    // Attempts to rewrite a logical plan to a uwheel-based plan that either provides
     // plan-time aggregates or skips execution based on min/max pruning.
     if let Some(rewritten) = self.try_rewrite(&plan) {
         Ok(Transformed::yes(rewritten))
diff --git a/docs/source/library-user-guide/extending-sql.md b/docs/source/library-user-guide/extending-sql.md
new file mode 100644
index 0000000000000..eea5b3b1acfc9
--- /dev/null
+++ b/docs/source/library-user-guide/extending-sql.md
@@ -0,0 +1,389 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Extending SQL Syntax
+
+DataFusion provides a flexible extension system that allows you to customize SQL
+parsing and planning without modifying the core codebase. This is useful when you
+need to:
+
+- Support custom operators from other SQL dialects (e.g., PostgreSQL's `->` for JSON)
+- Add custom data types not natively supported
+- Implement SQL constructs like `TABLESAMPLE`, `PIVOT`/`UNPIVOT`, or `MATCH_RECOGNIZE`
+
+You can read more about this topic in the [Extending SQL in DataFusion: from ->>
+to TABLESAMPLE] blog.
+
+[extending sql in datafusion: from ->> to tablesample]: https://datafusion.apache.org/blog/2026/01/12/extending-sql
+
+## Architecture Overview
+
+When DataFusion processes a SQL query, it goes through these stages:
+
+```text
+┌─────────────┐    ┌─────────┐    ┌──────────────────────┐    ┌─────────────┐
+│ SQL String  │───▶│ Parser  │───▶│      SqlToRel        │───▶│ LogicalPlan │
+└─────────────┘    └─────────┘    │ (SQL to LogicalPlan) │    └─────────────┘
+                                  └──────────────────────┘
+                                              │
+                                              │ uses
+                                              ▼
+                                  ┌───────────────────────┐
+                                  │  Extension Planners   │
+                                  │  • ExprPlanner        │
+                                  │  • TypePlanner        │
+                                  │  • RelationPlanner    │
+                                  └───────────────────────┘
+```
+
+The extension planners intercept specific parts of the SQL AST during the
+`SqlToRel` phase and allow you to customize how they are converted to DataFusion's
+logical plan.
+
+## Extension Points
+
+DataFusion provides three planner traits for extending SQL:
+
+| Trait               | Purpose                                 | Registration Method                        |
+| ------------------- | --------------------------------------- | ------------------------------------------ |
+| [`ExprPlanner`]     | Custom expressions and operators        | `ctx.register_expr_planner()`              |
+| [`TypePlanner`]     | Custom SQL data types                   | `SessionStateBuilder::with_type_planner()` |
+| [`RelationPlanner`] | Custom FROM clause elements (relations) | `ctx.register_relation_planner()`          |
+
+**Planner Precedence**: Multiple [`ExprPlanner`]s and [`RelationPlanner`]s can be
+registered; they are invoked in reverse registration order (last registered wins).
+Return `Original(...)` to delegate to the next planner. Only one `TypePlanner`
+can be active at a time.
+
+### ExprPlanner: Custom Expressions and Operators
+
+Use [`ExprPlanner`] to customize how SQL expressions are converted to DataFusion
+logical expressions. This is useful for:
+
+- Custom binary operators (e.g., `->`, `->>`, `@>`, `?`)
+- Custom field access patterns
+- Custom aggregate or window function handling
+
+#### Available Methods
+
+| Category           | Methods                                                                            |
+| ------------------ | ---------------------------------------------------------------------------------- |
+| Operators          | `plan_binary_op`, `plan_any`                                                       |
+| Literals           | `plan_array_literal`, `plan_dictionary_literal`, `plan_struct_literal`             |
+| Functions          | `plan_extract`, `plan_substring`, `plan_overlay`, `plan_position`, `plan_make_map` |
+| Identifiers        | `plan_field_access`, `plan_compound_identifier`                                    |
+| Aggregates/Windows | `plan_aggregate`, `plan_window`                                                    |
+
+See the [ExprPlanner API documentation] for full method signatures.
+
+#### Example: Custom Arrow Operator
+
+This example maps the `->` operator to string concatenation:
+
+```rust
+# use std::sync::Arc;
+# use datafusion::common::DFSchema;
+# use datafusion::error::Result;
+# use datafusion::logical_expr::Operator;
+# use datafusion::prelude::*;
+# use datafusion::sql::sqlparser::ast::BinaryOperator;
+use datafusion_expr::planner::{ExprPlanner, PlannerResult, RawBinaryExpr};
+# use datafusion_expr::BinaryExpr;
+
+#[derive(Debug)]
+struct MyCustomPlanner;
+
+impl ExprPlanner for MyCustomPlanner {
+    fn plan_binary_op(
+        &self,
+        expr: RawBinaryExpr,
+        _schema: &DFSchema,
+    ) -> Result<PlannerResult<RawBinaryExpr>> {
+        match &expr.op {
+            // Map `->` to string concatenation
+            BinaryOperator::Arrow => {
+                Ok(PlannerResult::Planned(Expr::BinaryExpr(BinaryExpr {
+                    left: Box::new(expr.left.clone()),
+                    right: Box::new(expr.right.clone()),
+                    op: Operator::StringConcat,
+                })))
+            }
+            _ => Ok(PlannerResult::Original(expr)),
+        }
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Use postgres dialect to enable `->` operator parsing
+    let config = SessionConfig::new()
+        .set_str("datafusion.sql_parser.dialect", "postgres");
+    let mut ctx = SessionContext::new_with_config(config);
+
+    // Register the custom planner
+    ctx.register_expr_planner(Arc::new(MyCustomPlanner))?;
+
+    // Now `->` works as string concatenation
+    let results = ctx.sql("SELECT 'hello'->'world'").await?.collect().await?;
+    // Returns: "helloworld"
+    Ok(())
+}
+```
+
+For more details, see the [ExprPlanner API documentation] and the
+[expr_planner test examples].
+
+### TypePlanner: Custom Data Types
+
+Use [`TypePlanner`] to map SQL data types to Arrow/DataFusion types. This is useful
+when you need to support SQL types that aren't natively recognized.
+
+#### Example: Custom DATETIME Type
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, FieldRef, TimeUnit};
+# use datafusion::error::Result;
+# use datafusion::prelude::*;
+# use datafusion::execution::SessionStateBuilder;
+use datafusion_expr::planner::TypePlanner;
+# use sqlparser::ast;
+
+#[derive(Debug)]
+struct MyTypePlanner;
+
+impl TypePlanner for MyTypePlanner {
+    fn plan_type_field(&self, sql_type: &ast::DataType) -> Result<Option<FieldRef>> {
+        match sql_type {
+            // Map DATETIME(precision) to Arrow Timestamp
+            ast::DataType::Datetime(precision) => {
+                let time_unit = match precision {
+                    Some(0) => TimeUnit::Second,
+                    Some(3) => TimeUnit::Millisecond,
+                    Some(6) => TimeUnit::Microsecond,
+                    None | Some(9) => TimeUnit::Nanosecond,
+                    _ => return Ok(None), // Let default handling take over
+                };
+                Ok(Some(
+                    DataType::Timestamp(time_unit, None).into_nullable_field_ref()
+                ))
+            }
+            _ => Ok(None), // Return None for types we don't handle
+        }
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let state = SessionStateBuilder::new()
+        .with_default_features()
+        .with_type_planner(Arc::new(MyTypePlanner))
+        .build();
+
+    let ctx = SessionContext::new_with_state(state);
+
+    // Now DATETIME type is recognized
+    ctx.sql("CREATE TABLE events (ts DATETIME(3))").await?;
+    Ok(())
+}
+```
+
+#### Example: Supporting the UUID Type
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, FieldRef, TimeUnit};
+# use datafusion::error::Result;
+# use datafusion::prelude::*;
+# use datafusion::execution::SessionStateBuilder;
+use datafusion_expr::planner::TypePlanner;
+# use sqlparser::ast;
+
+#[derive(Debug)]
+struct MyTypePlanner;
+
+impl TypePlanner for MyTypePlanner {
+    fn plan_type_field(&self, sql_type: &ast::DataType) -> Result<Option<FieldRef>> {
+        match sql_type {
+            sqlparser::ast::DataType::Uuid => Ok(Some(Arc::new(
+                Field::new("", DataType::FixedSizeBinary(16), true).with_metadata(
+                    [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())]
+                        .into(),
+                ),
+            ))),
+            _ => Ok(None),
+        }
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let state = SessionStateBuilder::new()
+        .with_default_features()
+        .with_type_planner(Arc::new(MyTypePlanner))
+        .build();
+
+    let ctx = SessionContext::new_with_state(state);
+
+    // Now UUID type is recognized
+    ctx.sql("CREATE TABLE idx (uuid UUID)").await?;
+    Ok(())
+}
+```
+
+For more details, see the [TypePlanner API documentation].
+
+### RelationPlanner: Custom FROM Clause Elements
+
+Use [`RelationPlanner`] to handle custom relations in the FROM clause. This
+enables you to implement SQL constructs like:
+
+- `TABLESAMPLE` for sampling data
+- `PIVOT` / `UNPIVOT` for data reshaping
+- `MATCH_RECOGNIZE` for pattern matching
+- Any custom relation syntax parsed by sqlparser
+
+#### The RelationPlannerContext
+
+When implementing [`RelationPlanner`], you receive a [`RelationPlannerContext`] that
+provides utilities for planning:
+
+| Method                      | Purpose                                         |
+| --------------------------- | ----------------------------------------------- |
+| `plan(relation)`            | Recursively plan a nested relation              |
+| `sql_to_expr(expr, schema)` | Convert SQL expression to DataFusion Expr       |
+| `context_provider()`        | Access session configuration, tables, functions |
+
+See the [RelationPlanner API documentation] for additional methods like
+`normalize_ident()` and `object_name_to_table_reference()`.
+
+#### Implementation Strategies
+
+There are two main approaches when implementing a [`RelationPlanner`]:
+
+1. **Rewrite to Standard SQL**: Transform custom syntax into equivalent standard
+   operations that DataFusion already knows how to execute (e.g., PIVOT → GROUP BY
+   with CASE expressions). This is the simplest approach when possible.
+
+2. **Custom Logical and Physical Nodes**: Create a [`UserDefinedLogicalNode`] to
+   represent the operation in the logical plan, along with a custom [`ExecutionPlan`]
+   to execute it. Both are required for end-to-end execution.
+
+#### Example: Basic RelationPlanner Structure
+
+```rust
+# use std::sync::Arc;
+# use datafusion::error::Result;
+# use datafusion::prelude::*;
+use datafusion_expr::planner::{
+    PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning,
+};
+use datafusion_sql::sqlparser::ast::TableFactor;
+
+#[derive(Debug)]
+struct MyRelationPlanner;
+
+impl RelationPlanner for MyRelationPlanner {
+    fn plan_relation(
+        &self,
+        relation: TableFactor,
+        ctx: &mut dyn RelationPlannerContext,
+    ) -> Result<RelationPlanning> {
+        match relation {
+            // Handle your custom relation
+            TableFactor::Pivot { table, alias, .. } => {
+                // Plan the input table
+                let input = ctx.plan(*table)?;
+
+                // Transform or wrap the plan as needed
+                // ...
+
+                Ok(RelationPlanning::Planned(PlannedRelation::new(input, alias)))
+            }
+
+            // Return Original for relations you don't handle
+            other => Ok(RelationPlanning::Original(other)),
+        }
+    }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // Register the custom planner
+    ctx.register_relation_planner(Arc::new(MyRelationPlanner))?;
+
+    Ok(())
+}
+```
+
+## Complete Examples
+
+The DataFusion repository includes comprehensive examples demonstrating each
+approach:
+
+### TABLESAMPLE (Custom Logical and Physical Nodes)
+
+The [table_sample.rs] example shows a complete end-to-end implementation of how to
+support queries such as:
+
+```sql
+SELECT * FROM table TABLESAMPLE BERNOULLI(10 PERCENT) REPEATABLE(42)
+```
+
+### PIVOT/UNPIVOT (Rewrite Strategy)
+
+The [pivot_unpivot.rs] example demonstrates rewriting custom syntax to standard SQL
+for queries such as:
+
+```sql
+SELECT * FROM sales
+  PIVOT (SUM(amount) FOR quarter IN ('Q1', 'Q2', 'Q3', 'Q4'))
+```
+
+## Recap
+
+1. Use [`ExprPlanner`] for custom operators and expression handling
+2. Use [`TypePlanner` for custom SQL data types
+3. Use [`RelationPlanner`] for custom FROM clause syntax (TABLESAMPLE, PIVOT, etc.)
+4. Register planners via [`SessionContext`] or [`SessionStateBuilder`]
+
+## See Also
+
+- API Documentation: [`ExprPlanner`], [`TypePlanner`], [`RelationPlanner`]
+- [relation_planner examples] - Complete TABLESAMPLE, PIVOT/UNPIVOT implementations
+- [expr_planner test examples] - Custom operator examples
+- [Custom Expression Planning](functions/adding-udfs.md#custom-expression-planning) in the UDF guide
+
+[`exprplanner`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.ExprPlanner.html
+[`typeplanner`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.TypePlanner.html
+[`relationplanner`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.RelationPlanner.html
+[`userdefinedlogicalnode`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/trait.UserDefinedLogicalNode.html
+[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
+[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+[`sessionstatebuilder`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionStateBuilder.html
+[`relationplannercontext`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.RelationPlannerContext.html
+[exprplanner api documentation]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.ExprPlanner.html
+[typeplanner api documentation]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.TypePlanner.html
+[relationplanner api documentation]: https://docs.rs/datafusion/latest/datafusion/logical_expr/planner/trait.RelationPlanner.html
+[expr_planner test examples]: https://github.com/apache/datafusion/blob/main/datafusion/core/tests/user_defined/expr_planner.rs
+[relation_planner examples]: https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/relation_planner
+[table_sample.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/relation_planner/table_sample.rs
+[pivot_unpivot.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/relation_planner/pivot_unpivot.rs
diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md
similarity index 76%
rename from docs/source/library-user-guide/adding-udfs.md
rename to docs/source/library-user-guide/functions/adding-udfs.md
index 8fb8a59fb8609..48162d6abcdfb 100644
--- a/docs/source/library-user-guide/adding-udfs.md
+++ b/docs/source/library-user-guide/functions/adding-udfs.md
@@ -23,19 +23,29 @@ User Defined Functions (UDFs) are functions that can be used in the context of D
 
 This page covers how to add UDFs to DataFusion. In particular, it covers how to add Scalar, Window, and Aggregate UDFs.
 
-| UDF Type  | Description                                                                                                | Example             |
-| --------- | ---------------------------------------------------------------------------------------------------------- | ------------------- |
-| Scalar    | A function that takes a row of data and returns a single value.                                            | [simple_udf.rs][1]  |
-| Window    | A function that takes a row of data and returns a single value, but also has access to the rows around it. | [simple_udwf.rs][2] |
-| Aggregate | A function that takes a group of rows and returns a single value.                                          | [simple_udaf.rs][3] |
-| Table     | A function that takes parameters and returns a `TableProvider` to be used in an query plan.                | [simple_udtf.rs][4] |
+| UDF Type       | Description                                                                                                | Example(s)                            |
+| -------------- | ---------------------------------------------------------------------------------------------------------- | ------------------------------------- |
+| Scalar         | A function that takes a row of data and returns a single value.                                            | [simple_udf.rs] / [advanced_udf.rs]   |
+| Window         | A function that takes a row of data and returns a single value, but also has access to the rows around it. | [simple_udwf.rs] / [advanced_udwf.rs] |
+| Aggregate      | A function that takes a group of rows and returns a single value.                                          | [simple_udaf.rs] / [advanced_udaf.rs] |
+| Table          | A function that takes parameters and returns a `TableProvider` to be used in an query plan.                | [simple_udtf.rs]                      |
+| Scalar (async) | A scalar function for performing `async` operations (such as network or I/O calls) within the UDF.         | [async_udf.rs]                        |
+
+[simple_udf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udf.rs
+[advanced_udf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
+[simple_udwf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udwf.rs
+[advanced_udwf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs
+[simple_udaf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udaf.rs
+[advanced_udaf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
+[simple_udtf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/simple_udtf.rs
+[async_udf.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/async_udf.rs
 
 First we'll talk about adding an Scalar UDF end-to-end, then we'll talk about the differences between the different
 types of UDFs.
 
 ## Adding a Scalar UDF
 
-A Scalar UDF is a function that takes a row of data and returns a single value. In order for good performance
+A Scalar UDF is a function that takes a row of data and returns a single value. To achieve good performance,
 such functions are "vectorized" in DataFusion, meaning they get one or more Arrow Arrays as input and produce
 an Arrow Array with the same number of rows as output.
 
@@ -47,8 +57,8 @@ To create a Scalar UDF, you
 
 In the following example, we will add a function takes a single i64 and returns a single i64 with 1 added to it:
 
-For brevity, we'll skipped some error handling, but e.g. you may want to check that `args.len()` is the expected number
-of arguments.
+For brevity, we'll skip some error handling.
+For production code, you may want to check, for example, that `args.len()` matches the expected number of arguments.
 
 ### Adding by `impl ScalarUDFImpl`
 
@@ -73,7 +83,7 @@ use datafusion_doc::Documentation;
     description = "Add one udf",
     syntax_example = "add_one(1)"
 )]
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq, Hash)]
 struct AddOne {
   signature: Signature,
 }
@@ -136,7 +146,7 @@ We now need to register the function with DataFusion so that it can be used in t
 #     description = "Add one udf",
 #     syntax_example = "add_one(1)"
 # )]
-# #[derive(Debug)]
+# #[derive(Debug, PartialEq, Eq, Hash)]
 # struct AddOne {
 #   signature: Signature,
 # }
@@ -344,16 +354,339 @@ async fn main() {
 }
 ```
 
+## Adding an Async Scalar UDF
+
+An Async Scalar UDF allows you to implement user-defined functions that support
+asynchronous execution, such as performing network or I/O operations within the
+UDF.
+
+To add a Scalar Async UDF, you need to:
+
+1. Implement the `AsyncScalarUDFImpl` trait to define your async function logic, signature, and types.
+2. Wrap your implementation with `AsyncScalarUDF::new` and register it with the `SessionContext`.
+
+### Adding by `impl AsyncScalarUDFImpl`
+
+```rust
+# use arrow::array::{ArrayIter, ArrayRef, AsArray, StringArray};
+# use arrow_schema::DataType;
+# use async_trait::async_trait;
+# use datafusion::common::error::Result;
+# use datafusion::common::{internal_err, not_impl_err};
+# use datafusion::common::types::logical_string;
+# use datafusion::config::ConfigOptions;
+# use datafusion_expr::ScalarUDFImpl;
+# use datafusion::logical_expr::async_udf::AsyncScalarUDFImpl;
+# use datafusion::logical_expr::{
+#     ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility, ScalarFunctionArgs
+# };
+# use datafusion::logical_expr_common::signature::Coercion;
+# use std::any::Any;
+# use std::sync::Arc;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct AsyncUpper {
+    signature: Signature,
+}
+
+impl Default for AsyncUpper {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AsyncUpper {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::new(
+                TypeSignature::Coercible(vec![Coercion::Exact {
+                    desired_type: TypeSignatureClass::Native(logical_string()),
+                }]),
+                Volatility::Volatile,
+            ),
+        }
+    }
+}
+
+/// Implement the normal ScalarUDFImpl trait for AsyncUpper
+#[async_trait]
+impl ScalarUDFImpl for AsyncUpper {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "async_upper"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Utf8)
+    }
+
+    // Note the normal invoke_with_args method is not called for Async UDFs
+    fn invoke_with_args(
+        &self,
+        _args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        not_impl_err!("AsyncUpper can only be called from async contexts")
+    }
+}
+
+/// The actual implementation of the async UDF
+#[async_trait]
+impl AsyncScalarUDFImpl for AsyncUpper {
+    fn ideal_batch_size(&self) -> Option<usize> {
+        Some(10)
+    }
+
+    /// This method is called to execute the async UDF and is similar
+    /// to the normal `invoke_with_args` except it is `async`.
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let value = &args.args[0];
+        // This function simply implements a simple string to uppercase conversion
+        // but can be used for any async operation such as network calls.
+        let result = match value {
+            ColumnarValue::Array(array) => {
+                let string_array = array.as_string::<i32>();
+                let iter = ArrayIter::new(string_array);
+                let result = iter
+                    .map(|string| string.map(|s| s.to_uppercase()))
+                    .collect::<StringArray>();
+                Arc::new(result) as ArrayRef
+            }
+            _ => return internal_err!("Expected a string argument, got {:?}", value),
+        };
+        Ok(ColumnarValue::from(result))
+    }
+}
+```
+
+We can now transfer the async UDF into the normal scalar using `into_scalar_udf` to register the function with DataFusion so that it can be used in the context of a query.
+
+```rust
+# use arrow::array::{ArrayIter, ArrayRef, AsArray, StringArray};
+# use arrow_schema::DataType;
+# use async_trait::async_trait;
+# use datafusion::common::error::Result;
+# use datafusion::common::{internal_err, not_impl_err};
+# use datafusion::common::types::logical_string;
+# use datafusion::config::ConfigOptions;
+# use datafusion_expr::ScalarUDFImpl;
+# use datafusion::logical_expr::async_udf::AsyncScalarUDFImpl;
+# use datafusion::logical_expr::{
+#     ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility, ScalarFunctionArgs
+# };
+# use datafusion::logical_expr_common::signature::Coercion;
+# use log::trace;
+# use std::any::Any;
+# use std::sync::Arc;
+#
+# #[derive(Debug, PartialEq, Eq, Hash)]
+# pub struct AsyncUpper {
+#     signature: Signature,
+# }
+#
+# impl Default for AsyncUpper {
+#     fn default() -> Self {
+#         Self::new()
+#     }
+# }
+#
+# impl AsyncUpper {
+#     pub fn new() -> Self {
+#         Self {
+#             signature: Signature::new(
+#                 TypeSignature::Coercible(vec![Coercion::Exact {
+#                     desired_type: TypeSignatureClass::Native(logical_string()),
+#                 }]),
+#                 Volatility::Volatile,
+#             ),
+#         }
+#     }
+# }
+#
+# #[async_trait]
+# impl ScalarUDFImpl for AsyncUpper {
+#     fn as_any(&self) -> &dyn Any {
+#         self
+#     }
+#
+#     fn name(&self) -> &str {
+#         "async_upper"
+#     }
+#
+#     fn signature(&self) -> &Signature {
+#         &self.signature
+#     }
+#
+#     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+#         Ok(DataType::Utf8)
+#     }
+#
+#     fn invoke_with_args(
+#        &self,
+#        _args: ScalarFunctionArgs,
+#     ) -> Result<ColumnarValue> {
+#         not_impl_err!("AsyncUpper can only be called from async contexts")
+#     }
+# }
+#
+# #[async_trait]
+# impl AsyncScalarUDFImpl for AsyncUpper {
+#     fn ideal_batch_size(&self) -> Option<usize> {
+#         Some(10)
+#     }
+#
+#     async fn invoke_async_with_args(
+#         &self,
+#         args: ScalarFunctionArgs,
+#     ) -> Result<ColumnarValue> {
+#         trace!("Invoking async_upper with args: {:?}", args);
+#         let value = &args.args[0];
+#         let result = match value {
+#             ColumnarValue::Array(array) => {
+#                 let string_array = array.as_string::<i32>();
+#                 let iter = ArrayIter::new(string_array);
+#                 let result = iter
+#                     .map(|string| string.map(|s| s.to_uppercase()))
+#                     .collect::<StringArray>();
+#                 Arc::new(result) as ArrayRef
+#             }
+#             _ => return internal_err!("Expected a string argument, got {:?}", value),
+#         };
+#         Ok(ColumnarValue::from(result))
+#     }
+# }
+use datafusion::execution::context::SessionContext;
+use datafusion::logical_expr::async_udf::AsyncScalarUDF;
+
+let async_upper = AsyncUpper::new();
+let udf = AsyncScalarUDF::new(Arc::new(async_upper));
+let mut ctx = SessionContext::new();
+ctx.register_udf(udf.into_scalar_udf());
+```
+
+After registration, you can use these async UDFs directly in SQL queries, for example:
+
+```sql
+SELECT async_upper('datafusion');
+```
+
+For async UDF implementation details, see [`async_udf.rs`](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/async_udf.rs).
+
 [`scalarudf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.ScalarUDF.html
 [`create_udf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/fn.create_udf.html
-[`process_scalar_func_inputs`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/functions/fn.process_scalar_func_inputs.html
-[`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
+[`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udf.rs
+
+## Named Arguments
+
+DataFusion supports named arguments for Scalar, Window, and Aggregate UDFs, allowing you to pass arguments by parameter name:
+
+```sql
+-- Scalar function
+SELECT substr(str => 'hello', start_pos => 2, length => 3);
+
+-- Window function
+SELECT lead(expr => value, offset => 1) OVER (ORDER BY id) FROM table;
+
+-- Aggregate function
+SELECT corr(y => col1, x => col2) FROM table;
+```
+
+Named arguments can be mixed with positional arguments, but positional arguments must come first:
+
+```sql
+SELECT substr('hello', start_pos => 2, length => 3);  -- Valid
+```
+
+### Implementing Functions with Named Arguments
+
+To support named arguments in your UDF, add parameter names to your function's signature using `.with_parameter_names()`. This works the same way for Scalar, Window, and Aggregate UDFs:
+
+```rust
+# use std::sync::Arc;
+# use std::any::Any;
+# use arrow::datatypes::DataType;
+# use datafusion_common::Result;
+# use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+# use datafusion_expr::ScalarUDFImpl;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct PowerFunction {
+    signature: Signature,
+}
+
+impl PowerFunction {
+    fn new() -> Self {
+        Self {
+            signature: Signature::uniform(
+                2,
+                vec![DataType::Float64],
+                Volatility::Immutable
+            )
+            .with_parameter_names(vec![
+                "base".to_string(),
+                "exponent".to_string()
+            ])
+            .expect("valid parameter names"),
+        }
+    }
+}
+
+impl ScalarUDFImpl for PowerFunction {
+    fn as_any(&self) -> &dyn Any { self }
+    fn name(&self) -> &str { "power" }
+    fn signature(&self) -> &Signature { &self.signature }
+
+    fn return_type(&self, _args: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // Your implementation - arguments are in correct positional order
+        unimplemented!()
+    }
+}
+```
+
+The parameter names should match the order of arguments in your function's signature. DataFusion automatically resolves named arguments to the correct positional order before invoking your function.
+
+Once registered, users can call your functions with named arguments in any order:
+
+```sql
+-- All equivalent
+SELECT power(base => 2.0, exponent => 3.0);
+SELECT power(exponent => 3.0, base => 2.0);
+SELECT power(2.0, exponent => 3.0);
+```
+
+### Error Messages
+
+When a function call fails due to incorrect arguments, DataFusion will show the parameter names in error messages to help users:
+
+```text
+No function matches the given name and argument types substr(Utf8).
+    Candidate functions:
+    substr(str: Any, start_pos: Any)
+    substr(str: Any, start_pos: Any, length: Any)
+```
 
 ## Adding a Window UDF
 
 Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have
 access to the rows around them. Access to the proximal rows is helpful, but adds some complexity to the implementation.
 
+For background and other considerations, see the [User defined Window Functions in DataFusion] blog.
+
+[user defined window functions in datafusion]: https://datafusion.apache.org/blog/2025/04/19/user-defined-window-functions
+
 For example, we will declare a user defined window function that computes a moving average.
 
 ```rust
@@ -490,7 +823,7 @@ let smooth_it = create_udwf(
 
 [`windowudf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.WindowUDF.html
 [`create_udwf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/fn.create_udwf.html
-[`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udwf.rs
+[`advanced_udwf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udwf.rs
 
 The `create_udwf` has five arguments to check:
 
@@ -1018,9 +1351,9 @@ async fn main() -> Result<()> {
 
 [`aggregateudf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.AggregateUDF.html
 [`create_udaf`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/fn.create_udaf.html
-[`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
+[`advanced_udaf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/udf/advanced_udaf.rs
 
-## Adding a User-Defined Table Function
+## Adding a Table UDF
 
 A User-Defined Table Function (UDTF) is a function that takes parameters and returns a `TableProvider`.
 
@@ -1029,8 +1362,8 @@ This is a simple struct that holds a set of RecordBatches in memory and treats t
 be replaced with your own struct that implements `TableProvider`.
 
 While this is a simple example for illustrative purposes, UDTFs have a lot of potential use cases. And can be
-particularly useful for reading data from external sources and interactive analysis. For example, see the [example][4]
-for a working example that reads from a CSV file. As another example, you could use the built-in UDTF `parquet_metadata`
+particularly useful for reading data from external sources and interactive analysis. See the [working example][simple_udtf.rs]
+which reads from a CSV file. As another example, you could use the built-in UDTF `parquet_metadata`
 in the CLI to read the metadata from a Parquet file.
 
 ```console
@@ -1076,7 +1409,7 @@ pub struct EchoFunction {}
 
 impl TableFunctionImpl for EchoFunction {
     fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
-        let Some(Expr::Literal(ScalarValue::Int64(Some(value)))) = exprs.get(0) else {
+        let Some(Expr::Literal(ScalarValue::Int64(Some(value)), _)) = exprs.get(0) else {
             return plan_err!("First argument must be an integer");
         };
 
@@ -1117,7 +1450,7 @@ With the UDTF implemented, you can register it with the `SessionContext`:
 #
 # impl TableFunctionImpl for EchoFunction {
 #     fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
-#         let Some(Expr::Literal(ScalarValue::Int64(Some(value)))) = exprs.get(0) else {
+#         let Some(Expr::Literal(ScalarValue::Int64(Some(value)), _)) = exprs.get(0) else {
 #             return plan_err!("First argument must be an integer");
 #         };
 #
@@ -1162,7 +1495,9 @@ async fn main() -> Result<()> {
 
 ## Custom Expression Planning
 
-DataFusion provides native support for common SQL operators by default such as `+`, `-`, `||`. However it does not provide support for other operators such as `@>`. To override DataFusion's default handling or support unsupported operators, developers can extend DataFusion by implementing custom expression planning, a core feature of DataFusion
+DataFusion provides native support for common SQL operators and constructs by default such as `+`, `-`, `||`. However it does not provide support for other operators such as `@>` or constructs like `TABLESAMPLE` which are less common or vary more between SQL dialects. To override DataFusion's default handling or support these unsupported features, developers can extend DataFusion by implementing custom expression planning, a core feature of DataFusion.
+
+For a comprehensive guide on extending SQL syntax including `ExprPlanner`, `TypePlanner`, and `RelationPlanner`, see [Extending DataFusion's SQL Syntax](../extending-sql.md)
 
 ### Implementing Custom Expression Planning
 
@@ -1244,8 +1579,3 @@ async fn main() -> Result<()> {
     Ok(())
 }
 ```
-
-[1]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udf.rs
-[2]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udwf.rs
-[3]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udaf.rs
-[4]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udtf.rs
diff --git a/docs/source/library-user-guide/functions/index.rst b/docs/source/library-user-guide/functions/index.rst
new file mode 100644
index 0000000000000..d6127446c2286
--- /dev/null
+++ b/docs/source/library-user-guide/functions/index.rst
@@ -0,0 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Functions
+=============
+
+.. toctree::
+   :maxdepth: 2
+
+   adding-udfs
+   spark
diff --git a/docs/source/library-user-guide/functions/spark.md b/docs/source/library-user-guide/functions/spark.md
new file mode 100644
index 0000000000000..c371ae1cb5a86
--- /dev/null
+++ b/docs/source/library-user-guide/functions/spark.md
@@ -0,0 +1,29 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Spark Compatible Functions
+
+The [`datafusion-spark`] crate provides Apache Spark-compatible expressions for
+use with DataFusion.
+
+[`datafusion-spark`]: https://crates.io/crates/datafusion-spark
+
+Please see the documentation for the [`datafusion-spark` crate] for more details.
+
+[`datafusion-spark` crate]: https://docs.rs/datafusion-spark/latest/datafusion_spark/
diff --git a/docs/source/library-user-guide/profiling.md b/docs/source/library-user-guide/profiling.md
index 61e848a2b7d9b..a2ea6723e55a7 100644
--- a/docs/source/library-user-guide/profiling.md
+++ b/docs/source/library-user-guide/profiling.md
@@ -48,7 +48,7 @@ Ensure that you're in the directory containing the necessary data files for your
 
 ### Step 3: Running the Flamegraph Tool
 
-To generate a flamegraph, you'll need to use the -- separator to pass arguments to the binary you're profiling. For datafusion-cli, you need to make sure to run the command with sudo permissions (especially on macOS, where DTrace requires elevated privileges).
+To generate a flamegraph, you'll need to use the `--` separator to pass arguments to the binary you're profiling. For datafusion-cli, you need to make sure to run the command with sudo permissions (especially on macOS, where DTrace requires elevated privileges).
 
 Here is a general example:
 
diff --git a/docs/source/library-user-guide/query-optimizer.md b/docs/source/library-user-guide/query-optimizer.md
index a1ccd0a15a7e7..2254776bf6e3c 100644
--- a/docs/source/library-user-guide/query-optimizer.md
+++ b/docs/source/library-user-guide/query-optimizer.md
@@ -17,7 +17,7 @@
   under the License.
 -->
 
-# DataFusion Query Optimizer
+# Query Optimizer
 
 [DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory
 format.
@@ -25,9 +25,22 @@ format.
 DataFusion has modular design, allowing individual crates to be re-used in other projects.
 
 This crate is a submodule of DataFusion that provides a query optimizer for logical plans, and
-contains an extensive set of OptimizerRules that may rewrite the plan and/or its expressions so
+contains an extensive set of [`OptimizerRule`]s and [`PhysicalOptimizerRule`]s that may rewrite the plan and/or its expressions so
 they execute more quickly while still computing the same result.
 
+For a deeper background on optimizer architecture and rule types and predicates, see
+[Optimizing SQL (and DataFrames) in DataFusion, Part 1], [Part 2],
+[Using Ordering for Better Plans in Apache DataFusion], and
+[Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries].
+
+[`optimizerrule`]: https://docs.rs/datafusion/latest/datafusion/optimizer/trait.OptimizerRule.html
+[`physicaloptimizerrule`]: https://docs.rs/datafusion/latest/datafusion/physical_optimizer/trait.PhysicalOptimizerRule.html
+[optimizing sql (and dataframes) in datafusion, part 1]: https://datafusion.apache.org/blog/2025/06/15/optimizing-sql-dataframes-part-one
+[part 2]: https://datafusion.apache.org/blog/2025/06/15/optimizing-sql-dataframes-part-two
+[using ordering for better plans in apache datafusion]: https://datafusion.apache.org/blog/2025/03/11/ordering-analysis
+[dynamic filters: passing information between operators during execution for 25x faster queries]: https://datafusion.apache.org/blog/2025/09/10/dynamic-filters
+[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html
+
 ## Running the Optimizer
 
 The following code demonstrates the basic flow of creating the optimizer with a default set of optimization rules
@@ -68,11 +81,11 @@ fn observer(plan: &LogicalPlan, rule: &dyn OptimizerRule) {
 ## Writing Optimization Rules
 
 Please refer to the
-[optimizer_rule.rs](../../../datafusion-examples/examples/optimizer_rule.rs)
+[optimizer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/optimizer_rule.rs)
 example to learn more about the general approach to writing optimizer rules and
 then move onto studying the existing rules.
 
-`OptimizerRule` transforms one ['LogicalPlan'] into another which
+`OptimizerRule` transforms one [`LogicalPlan`] into another which
 computes the same results, but in a potentially more efficient
 way. If there are no suitable transformations for the input plan,
 the optimizer can simply return it as is.
@@ -428,7 +441,7 @@ Each of these statistics is wrapped in a `Precision` type that indicates whether
 exact or estimated, allowing the optimizer to make informed decisions about the reliability
 of its cardinality estimates.
 
-### Boundary Analaysis Flow
+### Boundary Analysis Flow
 
 The boundary analysis process flows through several stages, with each stage building
 upon the information gathered in previous stages. The `AnalysisContext` is continuously
@@ -478,13 +491,10 @@ fn analyze_filter_example() -> Result<()> {
     let schema = Arc::new(Schema::new(vec![age]));
 
     // Define column statistics
-    let column_stats = ColumnStatistics {
-        null_count: Precision::Exact(0),
-        max_value: Precision::Exact(ScalarValue::Int64(Some(79))),
-        min_value: Precision::Exact(ScalarValue::Int64(Some(14))),
-        distinct_count: Precision::Absent,
-        sum_value: Precision::Absent,
-    };
+    let column_stats = ColumnStatistics::default()
+        .with_min_value(Precision::Exact(ScalarValue::Int64(Some(14))))
+        .with_max_value(Precision::Exact(ScalarValue::Int64(Some(79))))
+        .with_null_count(Precision::Exact(0));
 
     // Create expression: age > 18 AND age <= 25
     let expr = col("age")
@@ -504,3 +514,5 @@ fn analyze_filter_example() -> Result<()> {
     Ok(())
 }
 ```
+
+[treenode api]: https://docs.rs/datafusion/latest/datafusion/common/tree_node/trait.TreeNode.html
diff --git a/docs/source/library-user-guide/table-constraints.md b/docs/source/library-user-guide/table-constraints.md
new file mode 100644
index 0000000000000..252817822d990
--- /dev/null
+++ b/docs/source/library-user-guide/table-constraints.md
@@ -0,0 +1,42 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Table Constraint Enforcement
+
+Table providers can describe table constraints using the
+[`TableConstraint`] and [`Constraints`] APIs. These constraints include
+primary keys, unique keys, foreign keys and check constraints.
+
+DataFusion does **not** currently enforce these constraints at runtime.
+They are provided for informational purposes and can be used by custom
+`TableProvider` implementations or other parts of the system.
+
+- **Nullability**: The only property enforced by DataFusion is the
+  nullability of each [`Field`] in a schema. Returning data with null values
+  for Columns marked as not nullable will result in runtime errors during execution. DataFusion
+  does not check or enforce nullability when data is ingested.
+- **Primary and unique keys**: DataFusion does not verify that the data
+  satisfies primary or unique key constraints. Table providers that
+  require this behaviour must implement their own checks.
+- **Foreign keys and check constraints**: These constraints are parsed
+  but are not validated or used during query planning.
+
+[`tableconstraint`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/sqlparser/ast/enum.TableConstraint.html
+[`constraints`]: https://docs.rs/datafusion/latest/datafusion/common/struct.Constraints.html
+[`field`]: https://docs.rs/arrow/latest/arrow/datatypes/struct.Field.html
diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading/46.0.0.md
similarity index 52%
rename from docs/source/library-user-guide/upgrading.md
rename to docs/source/library-user-guide/upgrading/46.0.0.md
index 3922e0d45d884..e38d18c3d6609 100644
--- a/docs/source/library-user-guide/upgrading.md
+++ b/docs/source/library-user-guide/upgrading/46.0.0.md
@@ -19,250 +19,7 @@
 
 # Upgrade Guides
 
-## DataFusion `48.0.0`
-
-### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow.
-
-The mapping of the SQL `VARCHAR` type has been changed from `Utf8` to `Utf8View`
-which improves performance for many string operations. You can read more about
-`Utf8View` in the [DataFusion blog post on German-style strings]
-
-[datafusion blog post on german-style strings]: https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-1/
-
-This means that when you create a table with a `VARCHAR` column, it will now use
-`Utf8View` as the underlying data type. For example:
-
-```sql
-> CREATE TABLE my_table (my_column VARCHAR);
-0 row(s) fetched.
-Elapsed 0.001 seconds.
-
-> DESCRIBE my_table;
-+-------------+-----------+-------------+
-| column_name | data_type | is_nullable |
-+-------------+-----------+-------------+
-| my_column   | Utf8View  | YES         |
-+-------------+-----------+-------------+
-1 row(s) fetched.
-Elapsed 0.000 seconds.
-```
-
-You can restore the old behavior of using `Utf8` by changing the
-`datafusion.sql_parser.map_varchar_to_utf8view` configuration setting. For
-example
-
-```sql
-> set datafusion.sql_parser.map_varchar_to_utf8view = false;
-0 row(s) fetched.
-Elapsed 0.001 seconds.
-
-> CREATE TABLE my_table (my_column VARCHAR);
-0 row(s) fetched.
-Elapsed 0.014 seconds.
-
-> DESCRIBE my_table;
-+-------------+-----------+-------------+
-| column_name | data_type | is_nullable |
-+-------------+-----------+-------------+
-| my_column   | Utf8      | YES         |
-+-------------+-----------+-------------+
-1 row(s) fetched.
-Elapsed 0.004 seconds.
-```
-
-### `ListingOptions` default for `collect_stat` changed from `true` to `false`
-
-This makes it agree with the default for `SessionConfig`.
-Most users won't be impacted by this change but if you were using `ListingOptions` directly
-and relied on the default value of `collect_stat` being `true`, you will need to
-explicitly set it to `true` in your code.
-
-```rust
-# /* comment to avoid running
-ListingOptions::new(Arc::new(ParquetFormat::default()))
-    .with_collect_stat(true)
-    // other options
-# */
-```
-
-### Processing `FieldRef` instead of `DataType` for user defined functions
-
-In order to support metadata handling and extension types, user defined functions are
-now switching to traits which use `FieldRef` rather than a `DataType` and nullability.
-This gives a single interface to both of these parameters and additionally allows
-access to metadata fields, which can be used for extension types.
-
-To upgrade structs which implement `ScalarUDFImpl`, if you have implemented
-`return_type_from_args` you need instead to implement `return_field_from_args`.
-If your functions do not need to handle metadata, this should be straightforward
-repackaging of the output data into a `FieldRef`. The name you specify on the
-field is not important. It will be overwritten during planning. `ReturnInfo`
-has been removed, so you will need to remove all references to it.
-
-`ScalarFunctionArgs` now contains a field called `arg_fields`. You can use this
-to access the metadata associated with the columnar values during invocation.
-
-To upgrade user defined aggregate functions, there is now a function
-`return_field` that will allow you to specify both metadata and nullability of
-your function. You are not required to implement this if you do not need to
-handle metatdata.
-
-The largest change to aggregate functions happens in the accumulator arguments.
-Both the `AccumulatorArgs` and `StateFieldsArgs` now contain `FieldRef` rather
-than `DataType`.
-
-To upgrade window functions, `ExpressionArgs` now contains input fields instead
-of input data types. When setting these fields, the name of the field is
-not important since this gets overwritten during the planning stage. All you
-should need to do is wrap your existing data types in fields with nullability
-set depending on your use case.
-
-### Physical Expression return `Field`
-
-To support the changes to user defined functions processing metadata, the
-`PhysicalExpr` trait, which now must specify a return `Field` based on the input
-schema. To upgrade structs which implement `PhysicalExpr` you need to implement
-the `return_field` function. There are numerous examples in the `physical-expr`
-crate.
-
-### `FileFormat::supports_filters_pushdown` replaced with `FileSource::try_pushdown_filters`
-
-To support more general filter pushdown, the `FileFormat::supports_filters_pushdown` was replaced with
-`FileSource::try_pushdown_filters`.
-If you implemented a custom `FileFormat` that uses a custom `FileSource` you will need to implement
-`FileSource::try_pushdown_filters`.
-See `ParquetSource::try_pushdown_filters` for an example of how to implement this.
-
-`FileFormat::supports_filters_pushdown` has been removed.
-
-### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` Removed
-
-`ParquetExec`, `AvroExec`, `CsvExec`, and `JsonExec` were deprecated in
-DataFusion 46 and are removed in DataFusion 48. This is sooner than the normal
-process described in the [API Deprecation Guidelines] because all the tests
-cover the new `DataSourceExec` rather than the older structures. As we evolve
-`DataSource`, the old structures began to show signs of "bit rotting" (not
-working but no one knows due to lack of test coverage).
-
-[api deprecation guidelines]: https://datafusion.apache.org/contributor-guide/api-health.html#deprecation-guidelines
-
-## DataFusion `47.0.0`
-
-This section calls out some of the major changes in the `47.0.0` release of DataFusion.
-
-Here are some example upgrade PRs that demonstrate changes required when upgrading from DataFusion 46.0.0:
-
-- [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378)
-- [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563)
-- [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434)
-
-### Upgrades to `arrow-rs` and `arrow-parquet` 55.0.0 and `object_store` 0.12.0
-
-Several APIs are changed in the underlying arrow and parquet libraries to use a
-`u64` instead of `usize` to better support WASM (See [#7371] and [#6961])
-
-Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been changed to return `static` lifetimes (See [#6619])
-
-[#6619]: https://github.com/apache/arrow-rs/pull/6619
-[#7371]: https://github.com/apache/arrow-rs/pull/7371
-[#7328]: https://github.com/apache/arrow-rs/pull/6961
-
-This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as
-
-```rust
-# /* comment to avoid running
-impl Objectstore {
-    ...
-    // The range is now a u64 instead of usize
-    async fn get_range(&self, location: &Path, range: Range<u64>) -> ObjectStoreResult<Bytes> {
-        self.inner.get_range(location, range).await
-    }
-    ...
-    // the lifetime is now 'static instead of `_ (meaning the captured closure can't contain references)
-    // (this also applies to list_with_offset)
-    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, ObjectStoreResult<ObjectMeta>> {
-        self.inner.list(prefix)
-    }
-}
-# */
-```
-
-The `ParquetObjectReader` has been updated to no longer require the object size
-(it can be fetched using a single suffix request). See [#7334] for details
-
-[#7334]: https://github.com/apache/arrow-rs/pull/7334
-
-Pattern in DataFusion `46.0.0`:
-
-```rust
-# /* comment to avoid running
-let meta: ObjectMeta = ...;
-let reader = ParquetObjectReader::new(store, meta);
-# */
-```
-
-Pattern in DataFusion `47.0.0`:
-
-```rust
-# /* comment to avoid running
-let meta: ObjectMeta = ...;
-let reader = ParquetObjectReader::new(store, location)
-  .with_file_size(meta.size);
-# */
-```
-
-### `DisplayFormatType::TreeRender`
-
-DataFusion now supports [`tree` style explain plans]. Implementations of
-`Executionplan` must also provide a description in the
-`DisplayFormatType::TreeRender` format. This can be the same as the existing
-`DisplayFormatType::Default`.
-
-[`tree` style explain plans]: https://datafusion.apache.org/user-guide/sql/explain.html#tree-format-default
-
-### Removed Deprecated APIs
-
-Several APIs have been removed in this release. These were either deprecated
-previously or were hard to use correctly such as the multiple different
-`ScalarUDFImpl::invoke*` APIs. See [#15130], [#15123], and [#15027] for more
-details.
-
-[#15130]: https://github.com/apache/datafusion/pull/15130
-[#15123]: https://github.com/apache/datafusion/pull/15123
-[#15027]: https://github.com/apache/datafusion/pull/15027
-
-### `FileScanConfig` --> `FileScanConfigBuilder`
-
-Previously, `FileScanConfig::build()` directly created ExecutionPlans. In
-DataFusion 47.0.0 this has been changed to use `FileScanConfigBuilder`. See
-[#15352] for details.
-
-[#15352]: https://github.com/apache/datafusion/pull/15352
-
-Pattern in DataFusion `46.0.0`:
-
-```rust
-# /* comment to avoid running
-let plan = FileScanConfig::new(url, schema, Arc::new(file_source))
-  .with_statistics(stats)
-  ...
-  .build()
-# */
-```
-
-Pattern in DataFusion `47.0.0`:
-
-```rust
-# /* comment to avoid running
-let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source))
-  .with_statistics(stats)
-  ...
-  .build();
-let scan = DataSourceExec::from_data_source(config);
-# */
-```
-
-## DataFusion `46.0.0`
+## DataFusion 46.0.0
 
 ### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()`
 
@@ -460,8 +217,8 @@ Elapsed 0.005 seconds.
 DataFusion 46 has changed the way scalar array function signatures are
 declared. Previously, functions needed to select from a list of predefined
 signatures within the `ArrayFunctionSignature` enum. Now the signatures
-can be defined via a `Vec` of psuedo-types, which each correspond to a
-single argument. Those psuedo-types are the variants of the
+can be defined via a `Vec` of pseudo-types, which each correspond to a
+single argument. Those pseudo-types are the variants of the
 `ArrayFunctionArgument` enum and are as follows:
 
 - `Array`: An argument of type List/LargeList/FixedSizeList. All Array
diff --git a/docs/source/library-user-guide/upgrading/47.0.0.md b/docs/source/library-user-guide/upgrading/47.0.0.md
new file mode 100644
index 0000000000000..354b6740df02f
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/47.0.0.md
@@ -0,0 +1,135 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 47.0.0
+
+This section calls out some of the major changes in the `47.0.0` release of DataFusion.
+
+Here are some example upgrade PRs that demonstrate changes required when upgrading from DataFusion 46.0.0:
+
+- [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378)
+- [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563)
+- [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434)
+
+### Upgrades to `arrow-rs` and `arrow-parquet` 55.0.0 and `object_store` 0.12.0
+
+Several APIs are changed in the underlying arrow and parquet libraries to use a
+`u64` instead of `usize` to better support WASM (See [#7371] and [#6961])
+
+Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been changed to return `static` lifetimes (See [#6619])
+
+[#6619]: https://github.com/apache/arrow-rs/pull/6619
+[#7371]: https://github.com/apache/arrow-rs/pull/7371
+
+This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as
+
+```rust
+# /* comment to avoid running
+impl Objectstore {
+    ...
+    // The range is now a u64 instead of usize
+    async fn get_range(&self, location: &Path, range: Range<u64>) -> ObjectStoreResult<Bytes> {
+        self.inner.get_range(location, range).await
+    }
+    ...
+    // the lifetime is now 'static instead of `_ (meaning the captured closure can't contain references)
+    // (this also applies to list_with_offset)
+    fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, ObjectStoreResult<ObjectMeta>> {
+        self.inner.list(prefix)
+    }
+}
+# */
+```
+
+The `ParquetObjectReader` has been updated to no longer require the object size
+(it can be fetched using a single suffix request). See [#7334] for details
+
+[#7334]: https://github.com/apache/arrow-rs/pull/7334
+
+Pattern in DataFusion `46.0.0`:
+
+```rust
+# /* comment to avoid running
+let meta: ObjectMeta = ...;
+let reader = ParquetObjectReader::new(store, meta);
+# */
+```
+
+Pattern in DataFusion `47.0.0`:
+
+```rust
+# /* comment to avoid running
+let meta: ObjectMeta = ...;
+let reader = ParquetObjectReader::new(store, location)
+  .with_file_size(meta.size);
+# */
+```
+
+### `DisplayFormatType::TreeRender`
+
+DataFusion now supports [`tree` style explain plans]. Implementations of
+`Executionplan` must also provide a description in the
+`DisplayFormatType::TreeRender` format. This can be the same as the existing
+`DisplayFormatType::Default`.
+
+[`tree` style explain plans]: https://datafusion.apache.org/user-guide/sql/explain.html#tree-format-default
+
+### Removed Deprecated APIs
+
+Several APIs have been removed in this release. These were either deprecated
+previously or were hard to use correctly such as the multiple different
+`ScalarUDFImpl::invoke*` APIs. See [#15130], [#15123], and [#15027] for more
+details.
+
+[#15130]: https://github.com/apache/datafusion/pull/15130
+[#15123]: https://github.com/apache/datafusion/pull/15123
+[#15027]: https://github.com/apache/datafusion/pull/15027
+
+### `FileScanConfig` --> `FileScanConfigBuilder`
+
+Previously, `FileScanConfig::build()` directly created ExecutionPlans. In
+DataFusion 47.0.0 this has been changed to use `FileScanConfigBuilder`. See
+[#15352] for details.
+
+[#15352]: https://github.com/apache/datafusion/pull/15352
+
+Pattern in DataFusion `46.0.0`:
+
+```rust
+# /* comment to avoid running
+let plan = FileScanConfig::new(url, schema, Arc::new(file_source))
+  .with_statistics(stats)
+  ...
+  .build()
+# */
+```
+
+Pattern in DataFusion `47.0.0`:
+
+```rust
+# /* comment to avoid running
+let config = FileScanConfigBuilder::new(url, Arc::new(file_source))
+  .with_statistics(stats)
+  ...
+  .build();
+let scan = DataSourceExec::from_data_source(config);
+# */
+```
diff --git a/docs/source/library-user-guide/upgrading/48.0.0.md b/docs/source/library-user-guide/upgrading/48.0.0.md
new file mode 100644
index 0000000000000..7872a6f54f245
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/48.0.0.md
@@ -0,0 +1,244 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 48.0.0
+
+### `Expr::Literal` has optional metadata
+
+The [`Expr::Literal`] variant now includes optional metadata, which allows for
+carrying through Arrow field metadata to support extension types and other uses.
+
+This means code such as
+
+```rust
+# /* comment to avoid running
+match expr {
+...
+  Expr::Literal(scalar) => ...
+...
+}
+#  */
+```
+
+Should be updated to:
+
+```rust
+# /* comment to avoid running
+match expr {
+...
+  Expr::Literal(scalar, _metadata) => ...
+...
+}
+#  */
+```
+
+Likewise constructing `Expr::Literal` requires metadata as well. The [`lit`] function
+has not changed and returns an `Expr::Literal` with no metadata.
+
+[`expr::literal`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html#variant.Literal
+[`lit`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/fn.lit.html
+
+### `Expr::WindowFunction` is now `Box`ed
+
+`Expr::WindowFunction` is now a `Box<WindowFunction>` instead of a `WindowFunction` directly.
+This change was made to reduce the size of `Expr` and improve performance when
+planning queries (see [details on #16207]).
+
+This is a breaking change, so you will need to update your code if you match
+on `Expr::WindowFunction` directly. For example, if you have code like this:
+
+```rust
+# /* comment to avoid running
+match expr {
+  Expr::WindowFunction(WindowFunction {
+    params:
+      WindowFunctionParams {
+       partition_by,
+       order_by,
+      ..
+    }
+  }) => {
+    // Use partition_by and order_by as needed
+  }
+  _ => {
+    // other expr
+  }
+}
+# */
+```
+
+You will need to change it to:
+
+```rust
+# /* comment to avoid running
+match expr {
+  Expr::WindowFunction(window_fun) => {
+    let WindowFunction {
+      fun,
+      params: WindowFunctionParams {
+        args,
+        partition_by,
+        ..
+        },
+    } = window_fun.as_ref();
+    // Use partition_by and order_by as needed
+  }
+  _ => {
+    // other expr
+  }
+}
+#  */
+```
+
+[details on #16207]: https://github.com/apache/datafusion/pull/16207#issuecomment-2922659103
+
+### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow
+
+The mapping of the SQL `VARCHAR` type has been changed from `Utf8` to `Utf8View`
+which improves performance for many string operations. You can read more about
+`Utf8View` in the [DataFusion blog post on German-style strings]
+
+[datafusion blog post on german-style strings]: https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-1/
+
+This means that when you create a table with a `VARCHAR` column, it will now use
+`Utf8View` as the underlying data type. For example:
+
+```sql
+> CREATE TABLE my_table (my_column VARCHAR);
+0 row(s) fetched.
+Elapsed 0.001 seconds.
+
+> DESCRIBE my_table;
++-------------+-----------+-------------+
+| column_name | data_type | is_nullable |
++-------------+-----------+-------------+
+| my_column   | Utf8View  | YES         |
++-------------+-----------+-------------+
+1 row(s) fetched.
+Elapsed 0.000 seconds.
+```
+
+You can restore the old behavior of using `Utf8` by changing the
+`datafusion.sql_parser.map_varchar_to_utf8view` configuration setting. For
+example
+
+```sql
+> set datafusion.sql_parser.map_varchar_to_utf8view = false;
+0 row(s) fetched.
+Elapsed 0.001 seconds.
+
+> CREATE TABLE my_table (my_column VARCHAR);
+0 row(s) fetched.
+Elapsed 0.014 seconds.
+
+> DESCRIBE my_table;
++-------------+-----------+-------------+
+| column_name | data_type | is_nullable |
++-------------+-----------+-------------+
+| my_column   | Utf8      | YES         |
++-------------+-----------+-------------+
+1 row(s) fetched.
+Elapsed 0.004 seconds.
+```
+
+### `ListingOptions` default for `collect_stat` changed from `true` to `false`
+
+This makes it agree with the default for `SessionConfig`.
+Most users won't be impacted by this change but if you were using `ListingOptions` directly
+and relied on the default value of `collect_stat` being `true`, you will need to
+explicitly set it to `true` in your code.
+
+```rust
+# /* comment to avoid running
+ListingOptions::new(Arc::new(ParquetFormat::default()))
+    .with_collect_stat(true)
+    // other options
+# */
+```
+
+### Processing `FieldRef` instead of `DataType` for user defined functions
+
+In order to support metadata handling and extension types, user defined functions are
+now switching to traits which use `FieldRef` rather than a `DataType` and nullability.
+This gives a single interface to both of these parameters and additionally allows
+access to metadata fields, which can be used for extension types.
+
+To upgrade structs which implement `ScalarUDFImpl`, if you have implemented
+`return_type_from_args` you need instead to implement `return_field_from_args`.
+If your functions do not need to handle metadata, this should be straightforward
+repackaging of the output data into a `FieldRef`. The name you specify on the
+field is not important. It will be overwritten during planning. `ReturnInfo`
+has been removed, so you will need to remove all references to it.
+
+`ScalarFunctionArgs` now contains a field called `arg_fields`. You can use this
+to access the metadata associated with the columnar values during invocation.
+
+To upgrade user defined aggregate functions, there is now a function
+`return_field` that will allow you to specify both metadata and nullability of
+your function. You are not required to implement this if you do not need to
+handle metadata.
+
+The largest change to aggregate functions happens in the accumulator arguments.
+Both the `AccumulatorArgs` and `StateFieldsArgs` now contain `FieldRef` rather
+than `DataType`.
+
+To upgrade window functions, `ExpressionArgs` now contains input fields instead
+of input data types. When setting these fields, the name of the field is
+not important since this gets overwritten during the planning stage. All you
+should need to do is wrap your existing data types in fields with nullability
+set depending on your use case.
+
+### Physical Expression return `Field`
+
+To support the changes to user defined functions processing metadata, the
+`PhysicalExpr` trait, which now must specify a return `Field` based on the input
+schema. To upgrade structs which implement `PhysicalExpr` you need to implement
+the `return_field` function. There are numerous examples in the `physical-expr`
+crate.
+
+### `FileFormat::supports_filters_pushdown` replaced with `FileSource::try_pushdown_filters`
+
+To support more general filter pushdown, the `FileFormat::supports_filters_pushdown` was replaced with
+`FileSource::try_pushdown_filters`.
+If you implemented a custom `FileFormat` that uses a custom `FileSource` you will need to implement
+`FileSource::try_pushdown_filters`.
+See `ParquetSource::try_pushdown_filters` for an example of how to implement this.
+
+`FileFormat::supports_filters_pushdown` has been removed.
+
+### `ParquetExec`, `AvroExec`, `CsvExec`, `JsonExec` Removed
+
+`ParquetExec`, `AvroExec`, `CsvExec`, and `JsonExec` were deprecated in
+DataFusion 46 and are removed in DataFusion 48. This is sooner than the normal
+process described in the [API Deprecation Guidelines] because all the tests
+cover the new `DataSourceExec` rather than the older structures. As we evolve
+`DataSource`, the old structures began to show signs of "bit rotting" (not
+working but no one knows due to lack of test coverage).
+
+[api deprecation guidelines]: https://datafusion.apache.org/contributor-guide/api-health.html#deprecation-guidelines
+
+### `PartitionedFile` added as an argument to the `FileOpener` trait
+
+This is necessary to properly fix filter pushdown for filters that combine partition
+columns and file columns (e.g. `day = username['dob']`).
+
+If you implemented a custom `FileOpener` you will need to add the `PartitionedFile` argument
+but are not required to use it in any way.
diff --git a/docs/source/library-user-guide/upgrading/48.0.1.md b/docs/source/library-user-guide/upgrading/48.0.1.md
new file mode 100644
index 0000000000000..5dfb9e1e3d0b1
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/48.0.1.md
@@ -0,0 +1,39 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 48.0.1
+
+### `datafusion.execution.collect_statistics` now defaults to `true`
+
+The default value of the `datafusion.execution.collect_statistics` configuration
+setting is now true. This change impacts users that use that value directly and relied
+on its default value being `false`.
+
+This change also restores the default behavior of `ListingTable` to its previous. If you use it directly
+you can maintain the current behavior by overriding the default value in your code.
+
+```rust
+# /* comment to avoid running
+ListingOptions::new(Arc::new(ParquetFormat::default()))
+    .with_collect_stat(false)
+    // other options
+# */
+```
diff --git a/docs/source/library-user-guide/upgrading/49.0.0.md b/docs/source/library-user-guide/upgrading/49.0.0.md
new file mode 100644
index 0000000000000..92dee8135590a
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/49.0.0.md
@@ -0,0 +1,222 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 49.0.0
+
+### `MSRV` updated to 1.85.1
+
+The Minimum Supported Rust Version (MSRV) has been updated to [`1.85.1`]. See
+[#16728] for details.
+
+[`1.85.1`]: https://releases.rs/docs/1.85.1/
+[#16728]: https://github.com/apache/datafusion/pull/16728
+
+### `DataFusionError` variants are now `Box`ed
+
+To reduce the size of `DataFusionError`, several variants that were previously stored inline are now `Box`ed. This reduces the size of `Result<T, DataFusionError>` and thus stack usage and async state machine size. Please see [#16652] for more details.
+
+The following variants of `DataFusionError` are now boxed:
+
+- `ArrowError`
+- `SQL`
+- `SchemaError`
+
+This is a breaking change. Code that constructs or matches on these variants will need to be updated.
+
+For example, to create a `SchemaError`, instead of:
+
+```rust
+# /* comment to avoid running
+use datafusion_common::{DataFusionError, SchemaError};
+DataFusionError::SchemaError(
+  SchemaError::DuplicateUnqualifiedField { name: "foo".to_string() },
+  Box::new(None)
+)
+# */
+```
+
+You now need to `Box` the inner error:
+
+```rust
+# /* comment to avoid running
+use datafusion_common::{DataFusionError, SchemaError};
+DataFusionError::SchemaError(
+  Box::new(SchemaError::DuplicateUnqualifiedField { name: "foo".to_string() }),
+  Box::new(None)
+)
+# */
+```
+
+[#16652]: https://github.com/apache/datafusion/issues/16652
+
+### Metadata on Arrow Types is now represented by `FieldMetadata`
+
+Metadata from the Arrow `Field` is now stored using the `FieldMetadata`
+structure. In prior versions it was stored as both a `HashMap<String, String>`
+and a `BTreeMap<String, String>`. `FieldMetadata` is a easier to work with and
+is more efficient.
+
+To create `FieldMetadata` from a `Field`:
+
+```rust
+# /* comment to avoid running
+ let metadata = FieldMetadata::from(&field);
+# */
+```
+
+To add metadata to a `Field`, use the `add_to_field` method:
+
+```rust
+# /* comment to avoid running
+let updated_field = metadata.add_to_field(field);
+# */
+```
+
+See [#16317] for details.
+
+[#16317]: https://github.com/apache/datafusion/pull/16317
+
+### New `datafusion.execution.spill_compression` configuration option
+
+DataFusion 49.0.0 adds support for compressing spill files when data is written to disk during spilling query execution. A new configuration option `datafusion.execution.spill_compression` controls the compression codec used.
+
+**Configuration:**
+
+- **Key**: `datafusion.execution.spill_compression`
+- **Default**: `uncompressed`
+- **Valid values**: `uncompressed`, `lz4_frame`, `zstd`
+
+**Usage:**
+
+```rust
+# /* comment to avoid running
+use datafusion::prelude::*;
+use datafusion_common::config::SpillCompression;
+
+let config = SessionConfig::default()
+    .with_spill_compression(SpillCompression::Zstd);
+let ctx = SessionContext::new_with_config(config);
+# */
+```
+
+Or via SQL:
+
+```sql
+SET datafusion.execution.spill_compression = 'zstd';
+```
+
+For more details about this configuration option, including performance trade-offs between different compression codecs, see the [Configuration Settings](../../user-guide/configs) documentation.
+
+### Deprecated `map_varchar_to_utf8view` configuration option
+
+See [issue #16290](https://github.com/apache/datafusion/pull/16290) for more information
+The old configuration
+
+```text
+datafusion.sql_parser.map_varchar_to_utf8view
+```
+
+is now **deprecated** in favor of the unified option below.\
+If you previously used this to control only `VARCHAR`→`Utf8View` mapping, please migrate to `map_string_types_to_utf8view`.
+
+---
+
+### New `map_string_types_to_utf8view` configuration option
+
+To unify **all** SQL string types (`CHAR`, `VARCHAR`, `TEXT`, `STRING`) to Arrow’s zero‑copy `Utf8View`, DataFusion 49.0.0 introduces:
+
+- **Key**: `datafusion.sql_parser.map_string_types_to_utf8view`
+- **Default**: `true`
+
+**Description:**
+
+- When **true** (default), **all** SQL string types are mapped to `Utf8View`, avoiding full‑copy UTF‑8 allocations and improving performance.
+- When **false**, DataFusion falls back to the legacy `Utf8` mapping for **all** string types.
+
+#### Examples
+
+```rust
+# /* comment to avoid running
+// Disable Utf8View mapping for all SQL string types
+let opts = datafusion::sql::planner::ParserOptions::new()
+    .with_map_string_types_to_utf8view(false);
+
+// Verify the setting is applied
+assert!(!opts.map_string_types_to_utf8view);
+# */
+```
+
+---
+
+```sql
+-- Disable Utf8View mapping globally
+SET datafusion.sql_parser.map_string_types_to_utf8view = false;
+
+-- Now VARCHAR, CHAR, TEXT, STRING all use Utf8 rather than Utf8View
+CREATE TABLE my_table (a VARCHAR, b TEXT, c STRING);
+DESCRIBE my_table;
+```
+
+### Deprecating `SchemaAdapterFactory` and `SchemaAdapter`
+
+We are moving away from converting data (using `SchemaAdapter`) to converting the expressions themselves (which is more efficient and flexible).
+
+See [issue #16800](https://github.com/apache/datafusion/issues/16800) for more information
+The first place this change has taken place is in predicate pushdown for Parquet.
+By default if you do not use a custom `SchemaAdapterFactory` we will use expression conversion instead.
+If you do set a custom `SchemaAdapterFactory` we will continue to use it but emit a warning about that code path being deprecated.
+
+To resolve this you need to implement a custom `PhysicalExprAdapterFactory` and use that instead of a `SchemaAdapterFactory`.
+See the [default values](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/default_column_values.rs) for an example of how to do this.
+Opting into the new APIs will set you up for future changes since we plan to expand use of `PhysicalExprAdapterFactory` to other areas of DataFusion.
+
+See [#16800] for details.
+
+[#16800]: https://github.com/apache/datafusion/issues/16800
+
+### `TableParquetOptions` Updated
+
+The `TableParquetOptions` struct has a new `crypto` field to specify encryption
+options for Parquet files. The `ParquetEncryptionOptions` implements `Default`
+so you can upgrade your existing code like this:
+
+```rust
+# /* comment to avoid running
+TableParquetOptions {
+  global,
+  column_specific_options,
+  key_value_metadata,
+}
+# */
+```
+
+To this:
+
+```rust
+# /* comment to avoid running
+TableParquetOptions {
+  global,
+  column_specific_options,
+  key_value_metadata,
+  crypto: Default::default(), // New crypto field
+}
+# */
+```
diff --git a/docs/source/library-user-guide/upgrading/50.0.0.md b/docs/source/library-user-guide/upgrading/50.0.0.md
new file mode 100644
index 0000000000000..d8155dab58962
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/50.0.0.md
@@ -0,0 +1,330 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 50.0.0
+
+### ListingTable automatically detects Hive Partitioned tables
+
+DataFusion 50.0.0 automatically infers Hive partitions when using the `ListingTableFactory` and `CREATE EXTERNAL TABLE`. Previously,
+when creating a `ListingTable`, datasets that use Hive partitioning (e.g.
+`/table_root/column1=value1/column2=value2/data.parquet`) would not have the Hive columns reflected in
+the table's schema or data. The previous behavior can be
+restored by setting the `datafusion.execution.listing_table_factory_infer_partitions` configuration option to `false`.
+See [issue #17049] for more details.
+
+[issue #17049]: https://github.com/apache/datafusion/issues/17049
+
+### `MSRV` updated to 1.86.0
+
+The Minimum Supported Rust Version (MSRV) has been updated to [`1.86.0`].
+See [#17230] for details.
+
+[`1.86.0`]: https://releases.rs/docs/1.86.0/
+[#17230]: https://github.com/apache/datafusion/pull/17230
+
+### `ScalarUDFImpl`, `AggregateUDFImpl` and `WindowUDFImpl` traits now require `PartialEq`, `Eq`, and `Hash` traits
+
+To address error-proneness of `ScalarUDFImpl::equals`, `AggregateUDFImpl::equals`and
+`WindowUDFImpl::equals` methods and to make it easy to implement function equality correctly,
+the `equals` and `hash_value` methods have been removed from `ScalarUDFImpl`, `AggregateUDFImpl`
+and `WindowUDFImpl` traits. They are replaced the requirement to implement the `PartialEq`, `Eq`,
+and `Hash` traits on any type implementing `ScalarUDFImpl`, `AggregateUDFImpl` or `WindowUDFImpl`.
+Please see [issue #16677] for more details.
+
+Most of the scalar functions are stateless and have a `signature` field. These can be migrated
+using regular expressions
+
+- search for `\#\[derive\(Debug\)\](\n *(pub )?struct \w+ \{\n *signature\: Signature\,\n *\})`,
+- replace with `#[derive(Debug, PartialEq, Eq, Hash)]$1`,
+- review all the changes and make sure only function structs were changed.
+
+[issue #16677]: https://github.com/apache/datafusion/issues/16677
+
+### `AsyncScalarUDFImpl::invoke_async_with_args` returns `ColumnarValue`
+
+In order to enable single value optimizations and be consistent with other
+user defined function APIs, the `AsyncScalarUDFImpl::invoke_async_with_args` method now
+returns a `ColumnarValue` instead of a `ArrayRef`.
+
+To upgrade, change the return type of your implementation
+
+```rust
+# /* comment to avoid running
+impl AsyncScalarUDFImpl for AskLLM {
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+        _option: &ConfigOptions,
+    ) -> Result<ColumnarValue> {
+        ..
+      return array_ref; // old code
+    }
+}
+# */
+```
+
+To return a `ColumnarValue`
+
+```rust
+# /* comment to avoid running
+impl AsyncScalarUDFImpl for AskLLM {
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+        _option: &ConfigOptions,
+    ) -> Result<ColumnarValue> {
+        ..
+      return ColumnarValue::from(array_ref); // new code
+    }
+}
+# */
+```
+
+See [#16896](https://github.com/apache/datafusion/issues/16896) for more details.
+
+### `ProjectionExpr` changed from type alias to struct
+
+`ProjectionExpr` has been changed from a type alias to a struct with named fields to improve code clarity and maintainability.
+
+**Before:**
+
+```rust,ignore
+pub type ProjectionExpr = (Arc<dyn PhysicalExpr>, String);
+```
+
+**After:**
+
+```rust,ignore
+#[derive(Debug, Clone)]
+pub struct ProjectionExpr {
+    pub expr: Arc<dyn PhysicalExpr>,
+    pub alias: String,
+}
+```
+
+To upgrade your code:
+
+- Replace tuple construction `(expr, alias)` with `ProjectionExpr::new(expr, alias)` or `ProjectionExpr { expr, alias }`
+- Replace tuple field access `.0` and `.1` with `.expr` and `.alias`
+- Update pattern matching from `(expr, alias)` to `ProjectionExpr { expr, alias }`
+
+This mainly impacts use of `ProjectionExec`.
+
+This change was done in [#17398]
+
+[#17398]: https://github.com/apache/datafusion/pull/17398
+
+### `SessionState`, `SessionConfig`, and `OptimizerConfig` returns `&Arc<ConfigOptions>` instead of `&ConfigOptions`
+
+To provide broader access to `ConfigOptions` and reduce required clones, some
+APIs have been changed to return a `&Arc<ConfigOptions>` instead of a
+`&ConfigOptions`. This allows sharing the same `ConfigOptions` across multiple
+threads without needing to clone the entire `ConfigOptions` structure unless it
+is modified.
+
+Most users will not be impacted by this change since the Rust compiler typically
+automatically dereference the `Arc` when needed. However, in some cases you may
+have to change your code to explicitly call `as_ref()` for example, from
+
+```rust
+# /* comment to avoid running
+let optimizer_config: &ConfigOptions = state.options();
+#  */
+```
+
+To
+
+```rust
+# /* comment to avoid running
+let optimizer_config: &ConfigOptions = state.options().as_ref();
+#  */
+```
+
+See PR [#16970](https://github.com/apache/datafusion/pull/16970)
+
+### API Change to `AsyncScalarUDFImpl::invoke_async_with_args`
+
+The `invoke_async_with_args` method of the `AsyncScalarUDFImpl` trait has been
+updated to remove the `_option: &ConfigOptions` parameter to simplify the API
+now that the `ConfigOptions` can be accessed through the `ScalarFunctionArgs`
+parameter.
+
+You can change your code like this
+
+```rust
+# /* comment to avoid running
+impl AsyncScalarUDFImpl for AskLLM {
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+        _option: &ConfigOptions,
+    ) -> Result<ArrayRef> {
+        ..
+    }
+    ...
+}
+# */
+```
+
+To this:
+
+```rust
+# /* comment to avoid running
+
+impl AsyncScalarUDFImpl for AskLLM {
+    async fn invoke_async_with_args(
+        &self,
+        args: ScalarFunctionArgs,
+    ) -> Result<ArrayRef> {
+        let options = &args.config_options;
+        ..
+    }
+    ...
+}
+# */
+```
+
+### Schema Rewriter Module Moved to New Crate
+
+The `schema_rewriter` module and its associated symbols have been moved from `datafusion_physical_expr` to a new crate `datafusion_physical_expr_adapter`. This affects the following symbols:
+
+- `DefaultPhysicalExprAdapter`
+- `DefaultPhysicalExprAdapterFactory`
+- `PhysicalExprAdapter`
+- `PhysicalExprAdapterFactory`
+
+To upgrade, change your imports to:
+
+```rust
+use datafusion_physical_expr_adapter::{
+    DefaultPhysicalExprAdapter, DefaultPhysicalExprAdapterFactory,
+    PhysicalExprAdapter, PhysicalExprAdapterFactory
+};
+```
+
+### Upgrade to arrow `56.0.0` and parquet `56.0.0`
+
+This version of DataFusion upgrades the underlying Apache Arrow implementation
+to version `56.0.0`. See the [release notes](https://github.com/apache/arrow-rs/releases/tag/56.0.0)
+for more details.
+
+### Added `ExecutionPlan::reset_state`
+
+In order to fix a bug in DataFusion `49.0.0` where dynamic filters (currently only generated in the presence of a query such as `ORDER BY ... LIMIT ...`)
+produced incorrect results in recursive queries, a new method `reset_state` has been added to the `ExecutionPlan` trait.
+
+Any `ExecutionPlan` that needs to maintain internal state or references to other nodes in the execution plan tree should implement this method to reset that state.
+See [#17028] for more details and an example implementation for `SortExec`.
+
+[#17028]: https://github.com/apache/datafusion/pull/17028
+
+### Nested Loop Join input sort order cannot be preserved
+
+The Nested Loop Join operator has been rewritten from scratch to improve performance and memory efficiency. From the micro-benchmarks: this change introduces up to 5X speed-up and uses only 1% memory in extreme cases compared to the previous implementation.
+
+However, the new implementation cannot preserve input sort order like the old version could. This is a fundamental design trade-off that prioritizes performance and memory efficiency over sort order preservation.
+
+See [#16996] for details.
+
+[#16996]: https://github.com/apache/datafusion/pull/16996
+
+### Add `as_any()` method to `LazyBatchGenerator`
+
+To help with protobuf serialization, the `as_any()` method has been added to the `LazyBatchGenerator` trait. This means you will need to add `as_any()` to your implementation of `LazyBatchGenerator`:
+
+```rust
+# /* comment to avoid running
+
+impl LazyBatchGenerator for MyBatchGenerator {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    ...
+}
+
+# */
+```
+
+See [#17200](https://github.com/apache/datafusion/pull/17200) for details.
+
+### Refactored `DataSource::try_swapping_with_projection`
+
+We refactored `DataSource::try_swapping_with_projection` to simplify the method and minimize leakage across the ExecutionPlan <-> DataSource abstraction layer.
+Reimplementation for any custom `DataSource` should be relatively straightforward, see [#17395] for more details.
+
+[#17395]: https://github.com/apache/datafusion/pull/17395/
+
+### `FileOpenFuture` now uses `DataFusionError` instead of `ArrowError`
+
+The `FileOpenFuture` type alias has been updated to use `DataFusionError` instead of `ArrowError` for its error type. This change affects the `FileOpener` trait and any implementations that work with file streaming operations.
+
+**Before:**
+
+```rust,ignore
+pub type FileOpenFuture = BoxFuture<'static, Result<BoxStream<'static, Result<RecordBatch, ArrowError>>>>;
+```
+
+**After:**
+
+```rust,ignore
+pub type FileOpenFuture = BoxFuture<'static, Result<BoxStream<'static, Result<RecordBatch>>>>;
+```
+
+If you have custom implementations of `FileOpener` or work directly with `FileOpenFuture`, you'll need to update your error handling to use `DataFusionError` instead of `ArrowError`. The `FileStreamState` enum's `Open` variant has also been updated accordingly. See [#17397] for more details.
+
+[#17397]: https://github.com/apache/datafusion/pull/17397
+
+### FFI user defined aggregate function signature change
+
+The Foreign Function Interface (FFI) signature for user defined aggregate functions
+has been updated to call `return_field` instead of `return_type` on the underlying
+aggregate function. This is to support metadata handling with these aggregate functions.
+This change should be transparent to most users. If you have written unit tests to call
+`return_type` directly, you may need to change them to calling `return_field` instead.
+
+This update is a breaking change to the FFI API. The current best practice when using the
+FFI crate is to ensure that all libraries that are interacting are using the same
+underlying Rust version. Issue [#17374] has been opened to discuss stabilization of
+this interface so that these libraries can be used across different DataFusion versions.
+
+See [#17407] for details.
+
+[#17407]: https://github.com/apache/datafusion/pull/17407
+[#17374]: https://github.com/apache/datafusion/issues/17374
+
+### Added `PhysicalExpr::is_volatile_node`
+
+We added a method to `PhysicalExpr` to mark a `PhysicalExpr` as volatile:
+
+```rust,ignore
+impl PhysicalExpr for MyRandomExpr {
+  fn is_volatile_node(&self) -> bool {
+    true
+  }
+}
+```
+
+We've shipped this with a default value of `false` to minimize breakage but we highly recommend that implementers of `PhysicalExpr` opt into a behavior, even if it is returning `false`.
+
+You can see more discussion and example implementations in [#17351].
+
+[#17351]: https://github.com/apache/datafusion/pull/17351
diff --git a/docs/source/library-user-guide/upgrading/51.0.0.md b/docs/source/library-user-guide/upgrading/51.0.0.md
new file mode 100644
index 0000000000000..c3acfe15c493f
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/51.0.0.md
@@ -0,0 +1,272 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 51.0.0
+
+### `arrow` / `parquet` updated to 57.0.0
+
+### Upgrade to arrow `57.0.0` and parquet `57.0.0`
+
+This version of DataFusion upgrades the underlying Apache Arrow implementation
+to version `57.0.0`, including several dependent crates such as `prost`,
+`tonic`, `pyo3`, and `substrait`. . See the [release
+notes](https://github.com/apache/arrow-rs/releases/tag/57.0.0) for more details.
+
+### `MSRV` updated to 1.88.0
+
+The Minimum Supported Rust Version (MSRV) has been updated to [`1.88.0`].
+
+[`1.88.0`]: https://releases.rs/docs/1.88.0/
+
+### `FunctionRegistry` exposes two additional methods
+
+`FunctionRegistry` exposes two additional methods `udafs` and `udwfs` which expose set of registered user defined aggregation and window function names. To upgrade implement methods returning set of registered function names:
+
+```diff
+impl FunctionRegistry for FunctionRegistryImpl {
+      fn udfs(&self) -> HashSet<String> {
+         self.scalar_functions.keys().cloned().collect()
+     }
++    fn udafs(&self) -> HashSet<String> {
++        self.aggregate_functions.keys().cloned().collect()
++    }
++
++    fn udwfs(&self) -> HashSet<String> {
++        self.window_functions.keys().cloned().collect()
++    }
+}
+```
+
+### `datafusion-proto` use `TaskContext` rather than `SessionContext` in physical plan serde methods
+
+There have been changes in the public API methods of `datafusion-proto` which handle physical plan serde.
+
+Methods like `physical_plan_from_bytes`, `parse_physical_expr` and similar, expect `TaskContext` instead of `SessionContext`
+
+```diff
+- let plan2 = physical_plan_from_bytes(&bytes, &ctx)?;
++ let plan2 = physical_plan_from_bytes(&bytes, &ctx.task_ctx())?;
+```
+
+as `TaskContext` contains `RuntimeEnv` methods such as `try_into_physical_plan` will not have explicit `RuntimeEnv` parameter.
+
+```diff
+let result_exec_plan: Arc<dyn ExecutionPlan> = proto
+-   .try_into_physical_plan(&ctx, runtime.deref(), &composed_codec)
++.  .try_into_physical_plan(&ctx.task_ctx(), &composed_codec)
+```
+
+`PhysicalExtensionCodec::try_decode()` expects `TaskContext` instead of `FunctionRegistry`:
+
+```diff
+pub trait PhysicalExtensionCodec {
+    fn try_decode(
+        &self,
+        buf: &[u8],
+        inputs: &[Arc<dyn ExecutionPlan>],
+-        registry: &dyn FunctionRegistry,
++        ctx: &TaskContext,
+    ) -> Result<Arc<dyn ExecutionPlan>>;
+```
+
+See [issue #17601] for more details.
+
+[issue #17601]: https://github.com/apache/datafusion/issues/17601
+
+### `SessionState`'s `sql_to_statement` method takes `Dialect` rather than a `str`
+
+The `dialect` parameter of `sql_to_statement` method defined in `datafusion::execution::session_state::SessionState`
+has changed from `&str` to `&Dialect`.
+`Dialect` is an enum defined in the `datafusion-common`
+crate under the `config` module that provides type safety
+and better validation for SQL dialect selection
+
+### Reorganization of `ListingTable` into `datafusion-catalog-listing` crate
+
+There has been a long standing request to remove features such as `ListingTable`
+from the `datafusion` crate to support faster build times. The structs
+`ListingOptions`, `ListingTable`, and `ListingTableConfig` are now available
+within the `datafusion-catalog-listing` crate. These are re-exported in
+the `datafusion` crate, so this should be a minimal impact to existing users.
+
+See [issue #14462] and [issue #17713] for more details.
+
+[issue #14462]: https://github.com/apache/datafusion/issues/14462
+[issue #17713]: https://github.com/apache/datafusion/issues/17713
+
+### Reorganization of `ArrowSource` into `datafusion-datasource-arrow` crate
+
+To support [issue #17713] the `ArrowSource` code has been removed from
+the `datafusion` core crate into it's own crate, `datafusion-datasource-arrow`.
+This follows the pattern for the AVRO, CSV, JSON, and Parquet data sources.
+Users may need to update their paths to account for these changes.
+
+See [issue #17713] for more details.
+
+### `FileScanConfig::projection` renamed to `FileScanConfig::projection_exprs`
+
+The `projection` field in `FileScanConfig` has been renamed to `projection_exprs` and its type has changed from `Option<Vec<usize>>` to `Option<ProjectionExprs>`. This change enables more powerful projection pushdown capabilities by supporting arbitrary physical expressions rather than just column indices.
+
+**Impact on direct field access:**
+
+If you directly access the `projection` field:
+
+```rust,ignore
+let config: FileScanConfig = ...;
+let projection = config.projection;
+```
+
+You should update to:
+
+```rust,ignore
+let config: FileScanConfig = ...;
+let projection_exprs = config.projection_exprs;
+```
+
+**Impact on builders:**
+
+The `FileScanConfigBuilder::with_projection()` method has been deprecated in favor of `with_projection_indices()`:
+
+```diff
+let config = FileScanConfigBuilder::new(url, file_source)
+-   .with_projection(Some(vec![0, 2, 3]))
++   .with_projection_indices(Some(vec![0, 2, 3]))
+    .build();
+```
+
+Note: `with_projection()` still works but is deprecated and will be removed in a future release.
+
+**What is `ProjectionExprs`?**
+
+`ProjectionExprs` is a new type that represents a list of physical expressions for projection. While it can be constructed from column indices (which is what `with_projection_indices` does internally), it also supports arbitrary physical expressions, enabling advanced features like expression evaluation during scanning.
+
+You can access column indices from `ProjectionExprs` using its methods if needed:
+
+```rust,ignore
+let projection_exprs: ProjectionExprs = ...;
+// Get the column indices if the projection only contains simple column references
+let indices = projection_exprs.column_indices();
+```
+
+### `DESCRIBE query` support
+
+`DESCRIBE query` was previously an alias for `EXPLAIN query`, which outputs the
+_execution plan_ of the query. With this release, `DESCRIBE query` now outputs
+the computed _schema_ of the query, consistent with the behavior of `DESCRIBE table_name`.
+
+### `datafusion.execution.time_zone` default configuration changed
+
+The default value for `datafusion.execution.time_zone` previously was a string value of `+00:00` (GMT/Zulu time).
+This was changed to be an `Option<String>` with a default of `None`. If you want to change the timezone back
+to the previous value you can execute the sql:
+
+```sql
+SET
+TIMEZONE = '+00:00';
+```
+
+This change was made to better support using the default timezone in scalar UDF functions such as
+`now`, `current_date`, `current_time`, and `to_timestamp` among others.
+
+### Introduction of `TableSchema` and changes to `FileSource::with_schema()` method
+
+A new `TableSchema` struct has been introduced in the `datafusion-datasource` crate to better manage table schemas with partition columns. This struct helps distinguish between:
+
+- **File schema**: The schema of actual data files on disk
+- **Partition columns**: Columns derived from directory structure (e.g., Hive-style partitioning)
+- **Table schema**: The complete schema combining both file and partition columns
+
+As part of this change, the `FileSource::with_schema()` method signature has changed from accepting a `SchemaRef` to accepting a `TableSchema`.
+
+**Who is affected:**
+
+- Users who have implemented custom `FileSource` implementations will need to update their code
+- Users who only use built-in file sources (Parquet, CSV, JSON, AVRO, Arrow) are not affected
+
+**Migration guide for custom `FileSource` implementations:**
+
+```diff
+ use datafusion_datasource::file::FileSource;
+-use arrow::datatypes::SchemaRef;
++use datafusion_datasource::TableSchema;
+
+ impl FileSource for MyCustomSource {
+-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
++    fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource> {
+         Arc::new(Self {
+-            schema: Some(schema),
++            // Use schema.file_schema() to get the file schema without partition columns
++            schema: Some(Arc::clone(schema.file_schema())),
+             ..self.clone()
+         })
+     }
+ }
+```
+
+For implementations that need access to partition columns:
+
+```rust,ignore
+fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource> {
+    Arc::new(Self {
+        file_schema: Arc::clone(schema.file_schema()),
+        partition_cols: schema.table_partition_cols().clone(),
+        table_schema: Arc::clone(schema.table_schema()),
+        ..self.clone()
+    })
+}
+```
+
+**Note**: Most `FileSource` implementations only need to store the file schema (without partition columns), as shown in the first example. The second pattern of storing all three schema components is typically only needed for advanced use cases where you need access to different schema representations for different operations (e.g., ParquetSource uses the file schema for building pruning predicates but needs the table schema for filter pushdown logic).
+
+**Using `TableSchema` directly:**
+
+If you're constructing a `FileScanConfig` or working with table schemas and partition columns, you can now use `TableSchema`:
+
+```rust
+use datafusion_datasource::TableSchema;
+use arrow::datatypes::{Schema, Field, DataType};
+use std::sync::Arc;
+
+// Create a TableSchema with partition columns
+let file_schema = Arc::new(Schema::new(vec![
+    Field::new("user_id", DataType::Int64, false),
+    Field::new("amount", DataType::Float64, false),
+]));
+
+let partition_cols = vec![
+    Arc::new(Field::new("date", DataType::Utf8, false)),
+    Arc::new(Field::new("region", DataType::Utf8, false)),
+];
+
+let table_schema = TableSchema::new(file_schema, partition_cols);
+
+// Access different schema representations
+let file_schema_ref = table_schema.file_schema();      // Schema without partition columns
+let full_schema = table_schema.table_schema();          // Complete schema with partition columns
+let partition_cols_ref = table_schema.table_partition_cols(); // Just the partition columns
+```
+
+### `AggregateUDFImpl::is_ordered_set_aggregate` has been renamed to `AggregateUDFImpl::supports_within_group_clause`
+
+This method has been renamed to better reflect the actual impact it has for aggregate UDF implementations.
+The accompanying `AggregateUDF::is_ordered_set_aggregate` has also been renamed to `AggregateUDF::supports_within_group_clause`.
+No functionality has been changed with regards to this method; it still refers only to permitting use of `WITHIN GROUP`
+SQL syntax for the aggregate function.
diff --git a/docs/source/library-user-guide/upgrading/52.0.0.md b/docs/source/library-user-guide/upgrading/52.0.0.md
new file mode 100644
index 0000000000000..8bf2f803bede6
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/52.0.0.md
@@ -0,0 +1,669 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 52.0.0
+
+### Changes to DFSchema API
+
+To permit more efficient planning, several methods on `DFSchema` have been
+changed to return references to the underlying [`&FieldRef`] rather than
+[`&Field`]. This allows planners to more cheaply copy the references via
+`Arc::clone` rather than cloning the entire `Field` structure.
+
+You may need to change code to use `Arc::clone` instead of `.as_ref().clone()`
+directly on the `Field`. For example:
+
+```diff
+- let field = df_schema.field("my_column").as_ref().clone();
++ let field = Arc::clone(df_schema.field("my_column"));
+```
+
+### ListingTableProvider now caches `LIST` commands
+
+In prior versions, `ListingTableProvider` would issue `LIST` commands to
+the underlying object store each time it needed to list files for a query.
+To improve performance, `ListingTableProvider` now caches the results of
+`LIST` commands for the lifetime of the `ListingTableProvider` instance or
+until a cache entry expires.
+
+Note that by default the cache has no expiration time, so if files are added or removed
+from the underlying object store, the `ListingTableProvider` will not see
+those changes until the `ListingTableProvider` instance is dropped and recreated.
+
+You can configure the maximum cache size and cache entry expiration time via configuration options:
+
+- `datafusion.runtime.list_files_cache_limit` - Limits the size of the cache in bytes
+- `datafusion.runtime.list_files_cache_ttl` - Limits the TTL (time-to-live) of an entry in minutes and/or seconds
+
+Detailed configuration information can be found in the [DataFusion Runtime
+Configuration](https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings) user's guide.
+
+Caching can be disabled by setting the limit to 0:
+
+```sql
+SET datafusion.runtime.list_files_cache_limit TO "0K";
+```
+
+Note that the internal API has changed to use a trait `ListFilesCache` instead of a type alias.
+
+### `newlines_in_values` moved from `FileScanConfig` to `CsvOptions`
+
+The CSV-specific `newlines_in_values` configuration option has been moved from `FileScanConfig` to `CsvOptions`, as it only applies to CSV file parsing.
+
+**Who is affected:**
+
+- Users who set `newlines_in_values` via `FileScanConfigBuilder::with_newlines_in_values()`
+
+**Migration guide:**
+
+Set `newlines_in_values` in `CsvOptions` instead of on `FileScanConfigBuilder`:
+
+**Before:**
+
+```rust,ignore
+let source = Arc::new(CsvSource::new(file_schema.clone()));
+let config = FileScanConfigBuilder::new(object_store_url, source)
+    .with_newlines_in_values(true)
+    .build();
+```
+
+**After:**
+
+```rust,ignore
+let options = CsvOptions {
+    newlines_in_values: Some(true),
+    ..Default::default()
+};
+let source = Arc::new(CsvSource::new(file_schema.clone())
+    .with_csv_options(options));
+let config = FileScanConfigBuilder::new(object_store_url, source)
+    .build();
+```
+
+### Removal of `pyarrow` feature
+
+The `pyarrow` feature flag has been removed. This feature has been migrated to
+the `datafusion-python` repository since version `44.0.0`.
+
+### Refactoring of `FileSource` constructors and `FileScanConfigBuilder` to accept schemas upfront
+
+The way schemas are passed to file sources and scan configurations has been significantly refactored. File sources now require the schema (including partition columns) to be provided at construction time, and `FileScanConfigBuilder` no longer takes a separate schema parameter.
+
+**Who is affected:**
+
+- Users who create `FileScanConfig` or file sources (`ParquetSource`, `CsvSource`, `JsonSource`, `AvroSource`) directly
+- Users who implement custom `FileFormat` implementations
+
+**Key changes:**
+
+1. **FileSource constructors now require TableSchema**: All built-in file sources now take the schema in their constructor:
+
+   ```diff
+   - let source = ParquetSource::default();
+   + let source = ParquetSource::new(table_schema);
+   ```
+
+2. **FileScanConfigBuilder no longer takes schema as a parameter**: The schema is now passed via the FileSource:
+
+   ```diff
+   - FileScanConfigBuilder::new(url, schema, source)
+   + FileScanConfigBuilder::new(url, source)
+   ```
+
+3. **Partition columns are now part of TableSchema**: The `with_table_partition_cols()` method has been removed from `FileScanConfigBuilder`. Partition columns are now passed as part of the `TableSchema` to the FileSource constructor:
+
+   ```diff
+   + let table_schema = TableSchema::new(
+   +     file_schema,
+   +     vec![Arc::new(Field::new("date", DataType::Utf8, false))],
+   + );
+   + let source = ParquetSource::new(table_schema);
+     let config = FileScanConfigBuilder::new(url, source)
+   -     .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)])
+         .with_file(partitioned_file)
+         .build();
+   ```
+
+4. **FileFormat::file_source() now takes TableSchema parameter**: Custom `FileFormat` implementations must be updated:
+   ```diff
+   impl FileFormat for MyFileFormat {
+   -   fn file_source(&self) -> Arc<dyn FileSource> {
+   +   fn file_source(&self, table_schema: TableSchema) -> Arc<dyn FileSource> {
+   -       Arc::new(MyFileSource::default())
+   +       Arc::new(MyFileSource::new(table_schema))
+       }
+   }
+   ```
+
+**Migration examples:**
+
+For Parquet files:
+
+```diff
+- let source = Arc::new(ParquetSource::default());
+- let config = FileScanConfigBuilder::new(url, schema, source)
++ let table_schema = TableSchema::new(schema, vec![]);
++ let source = Arc::new(ParquetSource::new(table_schema));
++ let config = FileScanConfigBuilder::new(url, source)
+      .with_file(partitioned_file)
+      .build();
+```
+
+For CSV files with partition columns:
+
+```diff
+- let source = Arc::new(CsvSource::new(true, b',', b'"'));
+- let config = FileScanConfigBuilder::new(url, file_schema, source)
+-     .with_table_partition_cols(vec![Field::new("year", DataType::Int32, false)])
++ let options = CsvOptions {
++     has_header: Some(true),
++     delimiter: b',',
++     quote: b'"',
++     ..Default::default()
++ };
++ let table_schema = TableSchema::new(
++     file_schema,
++     vec![Arc::new(Field::new("year", DataType::Int32, false))],
++ );
++ let source = Arc::new(CsvSource::new(table_schema).with_csv_options(options));
++ let config = FileScanConfigBuilder::new(url, source)
+      .build();
+```
+
+### Adaptive filter representation in Parquet filter pushdown
+
+As of Arrow 57.1.0, DataFusion uses a new adaptive filter strategy when
+evaluating pushed down filters for Parquet files. This new strategy improves
+performance for certain types of queries where the results of filtering are
+more efficiently represented with a bitmask rather than a selection.
+See [arrow-rs #5523] for more details.
+
+This change only applies to the built-in Parquet data source with filter-pushdown enabled (
+which is [not yet the default behavior]).
+
+You can disable the new behavior by setting the
+`datafusion.execution.parquet.force_filter_selections` [configuration setting] to true.
+
+```sql
+> set datafusion.execution.parquet.force_filter_selections = true;
+```
+
+[arrow-rs #5523]: https://github.com/apache/arrow-rs/issues/5523
+[configuration setting]: https://datafusion.apache.org/user-guide/configs.html
+[not yet the default behavior]: https://github.com/apache/datafusion/issues/3463
+
+### Statistics handling moved from `FileSource` to `FileScanConfig`
+
+Statistics are now managed directly by `FileScanConfig` instead of being delegated to `FileSource` implementations. This simplifies the `FileSource` trait and provides more consistent statistics handling across all file formats.
+
+**Who is affected:**
+
+- Users who have implemented custom `FileSource` implementations
+
+**Breaking changes:**
+
+Two methods have been removed from the `FileSource` trait:
+
+- `with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource>`
+- `statistics(&self) -> Result<Statistics>`
+
+**Migration guide:**
+
+If you have a custom `FileSource` implementation, you need to:
+
+1. Remove the `with_statistics` method implementation
+2. Remove the `statistics` method implementation
+3. Remove any internal state that was storing statistics
+
+**Before:**
+
+```rust,ignore
+#[derive(Clone)]
+struct MyCustomSource {
+    table_schema: TableSchema,
+    projected_statistics: Option<Statistics>,
+    // other fields...
+}
+
+impl FileSource for MyCustomSource {
+    fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> {
+        Arc::new(Self {
+            table_schema: self.table_schema.clone(),
+            projected_statistics: Some(statistics),
+            // other fields...
+        })
+    }
+
+    fn statistics(&self) -> Result<Statistics> {
+        Ok(self.projected_statistics.clone().unwrap_or_else(||
+            Statistics::new_unknown(self.table_schema.file_schema())
+        ))
+    }
+
+    // other methods...
+}
+```
+
+**After:**
+
+```rust,ignore
+#[derive(Clone)]
+struct MyCustomSource {
+    table_schema: TableSchema,
+    // projected_statistics field removed
+    // other fields...
+}
+
+impl FileSource for MyCustomSource {
+    // with_statistics method removed
+    // statistics method removed
+
+    // other methods...
+}
+```
+
+**Accessing statistics:**
+
+Statistics are now accessed through `FileScanConfig` instead of `FileSource`:
+
+```diff
+- let stats = config.file_source.statistics()?;
++ let stats = config.statistics();
+```
+
+Note that `FileScanConfig::statistics()` automatically marks statistics as inexact when filters are present, ensuring correctness when filters are pushed down.
+
+### Partition column handling moved out of `PhysicalExprAdapter`
+
+Partition column replacement is now a separate preprocessing step performed before expression rewriting via `PhysicalExprAdapter`. This change provides better separation of concerns and makes the adapter more focused on schema differences rather than partition value substitution.
+
+**Who is affected:**
+
+- Users who have custom implementations of `PhysicalExprAdapterFactory` that handle partition columns
+- Users who directly use the `FilePruner` API
+
+**Breaking changes:**
+
+1. `FilePruner::try_new()` signature changed: the `partition_fields` parameter has been removed since partition column handling is now done separately
+2. Partition column replacement must now be done via `replace_columns_with_literals()` before expressions are passed to the adapter
+
+**Migration guide:**
+
+If you have code that creates a `FilePruner` with partition fields:
+
+**Before:**
+
+```rust,ignore
+use datafusion_pruning::FilePruner;
+
+let pruner = FilePruner::try_new(
+    predicate,
+    file_schema,
+    partition_fields,  // This parameter is removed
+    file_stats,
+)?;
+```
+
+**After:**
+
+```rust,ignore
+use datafusion_pruning::FilePruner;
+
+// Partition fields are no longer needed
+let pruner = FilePruner::try_new(
+    predicate,
+    file_schema,
+    file_stats,
+)?;
+```
+
+If you have custom code that relies on `PhysicalExprAdapter` to handle partition columns, you must now call `replace_columns_with_literals()` separately:
+
+**Before:**
+
+```rust,ignore
+// Adapter handled partition column replacement internally
+let adapted_expr = adapter.rewrite(expr)?;
+```
+
+**After:**
+
+```rust,ignore
+use datafusion_physical_expr_adapter::replace_columns_with_literals;
+
+// Replace partition columns first
+let expr_with_literals = replace_columns_with_literals(expr, &partition_values)?;
+// Then apply the adapter
+let adapted_expr = adapter.rewrite(expr_with_literals)?;
+```
+
+### `build_row_filter` signature simplified
+
+The `build_row_filter` function in `datafusion-datasource-parquet` has been simplified to take a single schema parameter instead of two.
+The expectation is now that the filter has been adapted to the physical file schema (the arrow representation of the parquet file's schema) before being passed to this function
+using a `PhysicalExprAdapter` for example.
+
+**Who is affected:**
+
+- Users who call `build_row_filter` directly
+
+**Breaking changes:**
+
+The function signature changed from:
+
+```rust,ignore
+pub fn build_row_filter(
+    expr: &Arc<dyn PhysicalExpr>,
+    physical_file_schema: &SchemaRef,
+    predicate_file_schema: &SchemaRef,  // removed
+    metadata: &ParquetMetaData,
+    reorder_predicates: bool,
+    file_metrics: &ParquetFileMetrics,
+) -> Result<Option<RowFilter>>
+```
+
+To:
+
+```rust,ignore
+pub fn build_row_filter(
+    expr: &Arc<dyn PhysicalExpr>,
+    file_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    reorder_predicates: bool,
+    file_metrics: &ParquetFileMetrics,
+) -> Result<Option<RowFilter>>
+```
+
+**Migration guide:**
+
+Remove the duplicate schema parameter from your call:
+
+```diff
+- build_row_filter(&predicate, &file_schema, &file_schema, metadata, reorder, metrics)
++ build_row_filter(&predicate, &file_schema, metadata, reorder, metrics)
+```
+
+### Planner now requires explicit opt-in for WITHIN GROUP syntax
+
+The SQL planner now enforces the aggregate UDF contract more strictly: the
+`WITHIN GROUP (ORDER BY ...)` syntax is accepted only if the aggregate UDAF
+explicitly advertises support by returning `true` from
+`AggregateUDFImpl::supports_within_group_clause()`.
+
+Previously the planner forwarded a `WITHIN GROUP` clause to order-sensitive
+aggregates even when they did not implement ordered-set semantics, which could
+cause queries such as `SUM(x) WITHIN GROUP (ORDER BY x)` to plan successfully.
+This behavior was too permissive and has been changed to match PostgreSQL and
+the documented semantics.
+
+Migration: If your UDAF intentionally implements ordered-set semantics and
+wants to accept the `WITHIN GROUP` SQL syntax, update your implementation to
+return `true` from `supports_within_group_clause()` and handle the ordering
+semantics in your accumulator implementation. If your UDAF is merely
+order-sensitive (but not an ordered-set aggregate), do not advertise
+`supports_within_group_clause()` and clients should use alternative function
+signatures (for example, explicit ordering as a function argument) instead.
+
+### `AggregateUDFImpl::supports_null_handling_clause` now defaults to `false`
+
+This method specifies whether an aggregate function allows `IGNORE NULLS`/`RESPECT NULLS`
+during SQL parsing, with the implication it respects these configs during computation.
+
+Most DataFusion aggregate functions silently ignored this syntax in prior versions
+as they did not make use of it and it was permitted by default. We change this so
+only the few functions which do respect this clause (e.g. `array_agg`, `first_value`,
+`last_value`) need to implement it.
+
+Custom user defined aggregate functions will also error if this syntax is used,
+unless they explicitly declare support by overriding the method.
+
+For example, SQL parsing will now fail for queries such as this:
+
+```sql
+SELECT median(c1) IGNORE NULLS FROM table
+```
+
+Instead of silently succeeding.
+
+### API change for `CacheAccessor` trait
+
+The remove API no longer requires a mutable instance
+
+### FFI crate updates
+
+Many of the structs in the `datafusion-ffi` crate have been updated to allow easier
+conversion to the underlying trait types they represent. This simplifies some code
+paths, but also provides an additional improvement in cases where library code goes
+through a round trip via the foreign function interface.
+
+To update your code, suppose you have a `FFI_SchemaProvider` called `ffi_provider`
+and you wish to use this as a `SchemaProvider`. In the old approach you would do
+something like:
+
+```rust,ignore
+    let foreign_provider: ForeignSchemaProvider = ffi_provider.into();
+    let foreign_provider = Arc::new(foreign_provider) as Arc<dyn SchemaProvider>;
+```
+
+This code should now be written as:
+
+```rust,ignore
+    let foreign_provider: Arc<dyn SchemaProvider + Send> = ffi_provider.into();
+    let foreign_provider = foreign_provider as Arc<dyn SchemaProvider>;
+```
+
+For the case of user defined functions, the updates are similar but you
+may need to change the way you call the creation of the `ScalarUDF`.
+Aggregate and window functions follow the same pattern.
+
+Previously you may write:
+
+```rust,ignore
+    let foreign_udf: ForeignScalarUDF = ffi_udf.try_into()?;
+    let foreign_udf: ScalarUDF = foreign_udf.into();
+```
+
+Instead this should now be:
+
+```rust,ignore
+    let foreign_udf: Arc<dyn ScalarUDFImpl> = ffi_udf.into();
+    let foreign_udf = ScalarUDF::new_from_shared_impl(foreign_udf);
+```
+
+When creating any of the following structs, we now require the user to
+provide a `TaskContextProvider` and optionally a `LogicalExtensionCodec`:
+
+- `FFI_CatalogListProvider`
+- `FFI_CatalogProvider`
+- `FFI_SchemaProvider`
+- `FFI_TableProvider`
+- `FFI_TableFunction`
+
+Each of these structs has a `new()` and a `new_with_ffi_codec()` method for
+instantiation. For example, when you previously would write
+
+```rust,ignore
+   let table = Arc::new(MyTableProvider::new());
+   let ffi_table = FFI_TableProvider::new(table, None);
+```
+
+Now you will need to provide a `TaskContextProvider`. The most common
+implementation of this trait is `SessionContext`.
+
+```rust,ignore
+   let ctx = Arc::new(SessionContext::default());
+   let table = Arc::new(MyTableProvider::new());
+   let ffi_table = FFI_TableProvider::new(table, None, ctx, None);
+```
+
+The alternative function to create these structures may be more convenient
+if you are doing many of these operations. A `FFI_LogicalExtensionCodec` will
+store the `TaskContextProvider` as well.
+
+```rust,ignore
+   let codec = Arc::new(DefaultLogicalExtensionCodec {});
+   let ctx = Arc::new(SessionContext::default());
+   let ffi_codec = FFI_LogicalExtensionCodec::new(codec, None, ctx);
+   let table = Arc::new(MyTableProvider::new());
+   let ffi_table = FFI_TableProvider::new_with_ffi_codec(table, None, ffi_codec);
+```
+
+Additional information about the usage of the `TaskContextProvider` can be
+found in the crate README.
+
+Additionally, the FFI structure for Scalar UDF's no longer contains a
+`return_type` call. This code was not used since the `ForeignScalarUDF`
+struct implements the `return_field_from_args` instead.
+
+### Projection handling moved from FileScanConfig to FileSource
+
+Projection handling has been moved from `FileScanConfig` into `FileSource` implementations. This enables format-specific projection pushdown (e.g., Parquet can push down struct field access, Vortex can push down computed expressions into un-decoded data).
+
+**Who is affected:**
+
+- Users who have implemented custom `FileSource` implementations
+- Users who use `FileScanConfigBuilder::with_projection_indices` directly
+
+**Breaking changes:**
+
+1. **`FileSource::with_projection` replaced with `try_pushdown_projection`:**
+
+   The `with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource>` method has been removed and replaced with `try_pushdown_projection(&self, projection: &ProjectionExprs) -> Result<Option<Arc<dyn FileSource>>>`.
+
+2. **`FileScanConfig.projection_exprs` field removed:**
+
+   Projections are now stored in the `FileSource` directly, not in `FileScanConfig`.
+   Various public helper methods that access projection information have been removed from `FileScanConfig`.
+
+3. **`FileScanConfigBuilder::with_projection_indices` now returns `Result<Self>`:**
+
+   This method can now fail if the projection pushdown fails.
+
+4. **`FileSource::create_file_opener` now returns `Result<Arc<dyn FileOpener>>`:**
+
+   Previously returned `Arc<dyn FileOpener>` directly.
+   Any `FileSource` implementation that may fail to create a `FileOpener` should now return an appropriate error.
+
+5. **`DataSource::try_swapping_with_projection` signature changed:**
+
+   Parameter changed from `&[ProjectionExpr]` to `&ProjectionExprs`.
+
+**Migration guide:**
+
+If you have a custom `FileSource` implementation:
+
+**Before:**
+
+```rust,ignore
+impl FileSource for MyCustomSource {
+    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
+        // Apply projection from config
+        Arc::new(Self { /* ... */ })
+    }
+
+    fn create_file_opener(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> Arc<dyn FileOpener> {
+        Arc::new(MyOpener { /* ... */ })
+    }
+}
+```
+
+**After:**
+
+```rust,ignore
+impl FileSource for MyCustomSource {
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        // Return None if projection cannot be pushed down
+        // Return Some(new_source) with projection applied if it can
+        Ok(Some(Arc::new(Self {
+            projection: Some(projection.clone()),
+            /* ... */
+        })))
+    }
+
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        self.projection.as_ref()
+    }
+
+    fn create_file_opener(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> Result<Arc<dyn FileOpener>> {
+        Ok(Arc::new(MyOpener { /* ... */ }))
+    }
+}
+```
+
+We recommend you look at [#18627](https://github.com/apache/datafusion/pull/18627)
+that introduced these changes for more examples for how this was handled for the various built in file sources.
+
+We have added [`SplitProjection`](https://docs.rs/datafusion-datasource/latest/datafusion_datasource/projection/struct.SplitProjection.html) and [`ProjectionOpener`](https://docs.rs/datafusion-datasource/latest/datafusion_datasource/projection/struct.ProjectionOpener.html) helpers to make it easier to handle projections in your `FileSource` implementations.
+
+For file sources that can only handle simple column selections (not computed expressions), use the `SplitProjection` and `ProjectionOpener` helpers to split the projection into pushdownable and non-pushdownable parts:
+
+```rust,ignore
+use datafusion_datasource::projection::{SplitProjection, ProjectionOpener};
+
+// In try_pushdown_projection:
+let split = SplitProjection::new(projection, self.table_schema())?;
+// Use split.file_projection() for what to push down to the file format
+// The ProjectionOpener wrapper will handle the rest
+```
+
+**For `FileScanConfigBuilder` users:**
+
+```diff
+let config = FileScanConfigBuilder::new(url, source)
+-   .with_projection_indices(Some(vec![0, 2, 3]))
++   .with_projection_indices(Some(vec![0, 2, 3]))?
+    .build();
+```
+
+### `SchemaAdapter` and `SchemaAdapterFactory` completely removed
+
+Following the deprecation announced in [DataFusion 49.0.0](49.0.0.md#deprecating-schemaadapterfactory-and-schemaadapter), `SchemaAdapterFactory` has been fully removed from Parquet scanning. This applies to both:
+
+The following symbols have been deprecated and will be removed in the next release:
+
+- `SchemaAdapter` trait
+- `SchemaAdapterFactory` trait
+- `SchemaMapper` trait
+- `SchemaMapping` struct
+- `DefaultSchemaAdapterFactory` struct
+
+These types were previously used to adapt record batch schemas during file reading.
+This functionality has been replaced by `PhysicalExprAdapterFactory`, which rewrites expressions at planning time rather than transforming batches at runtime.
+If you were using a custom `SchemaAdapterFactory` for schema adaptation (e.g., default column values, type coercion), you should now implement `PhysicalExprAdapterFactory` instead.
+See the [default column values example](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/default_column_values.rs) for how to implement a custom `PhysicalExprAdapterFactory`.
+
+**Migration guide:**
+
+If you implemented a custom `SchemaAdapterFactory`, migrate to `PhysicalExprAdapterFactory`.
+See the [default column values example](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/default_column_values.rs) for a complete implementation.
diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md
new file mode 100644
index 0000000000000..ef5f5743f5ea6
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/53.0.0.md
@@ -0,0 +1,474 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 53.0.0
+
+**Note:** DataFusion `53.0.0` has not been released yet. The information provided
+*in this section pertains to features and changes that have already been merged
+*to the main branch and are awaiting release in this version. See [#19692] for
+\*more details.
+
+[#19692]: https://github.com/apache/datafusion/issues/19692
+
+### Upgrade arrow/parquet to 58.0.0 and object_store to 0.13.0
+
+DataFusion 53.0.0 uses `arrow` and `parquet` 58.0.0, and `object_store` 0.13.0.
+This may require updates to your Cargo.toml if you have direct dependencies on
+these crates.
+
+See the [Arrow 58.0.0 release notes] and the [object_store 0.13.0 upgrade guide] for details on breaking changes in those versions.
+
+[arrow 58.0.0 release notes]: https://github.com/apache/arrow-rs/releases/tag/58.0.0
+[object_store 0.13.0 upgrade guide]: https://github.com/apache/arrow-rs/releases/tag/58.0.0
+
+### `ExecutionPlan::properties` now returns `&Arc<PlanProperties>`
+
+Now `ExecutionPlan::properties()` returns `&Arc<PlanProperties>` instead of a
+reference. This make it possible to cheaply clone properties and reuse them across multiple
+`ExecutionPlans`. It also makes it possible to optimize [`ExecutionPlan::with_new_children`]
+to reuse properties when the children plans have not changed, which can significantly reduce
+planning time for complex queries.
+
+[`ExecutionPlan::with_new_children`](https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#tymethod.with_new_children)
+
+To migrate, in all `ExecutionPlan` implementations, you will likely need to wrap
+stored `PlanProperties` in an `Arc`:
+
+```diff
+-    cache: PlanProperties,
++    cache: Arc<PlanProperties>,
+
+...
+
+-    fn properties(&self) -> &PlanProperties {
++    fn properties(&self) -> &Arc<PlanProperties> {
+         &self.cache
+     }
+```
+
+To improve performance of `with_new_children` for custom `ExecutionPlan`
+implementations, you can use the new macro: `check_if_same_properties`. For it
+to work, you need to implement the function:
+`with_new_children_and_same_properties` with semantics identical to
+`with_new_children`, but operating under the assumption that the properties of
+the children plans have not changed.
+
+An example of supporting this optimization for `ProjectionExec`:
+
+```diff
+     impl ProjectionExec {
++       fn with_new_children_and_same_properties(
++           &self,
++           mut children: Vec<Arc<dyn ExecutionPlan>>,
++       ) -> Self {
++           Self {
++               input: children.swap_remove(0),
++               metrics: ExecutionPlanMetricsSet::new(),
++               ..Self::clone(self)
++           }
++       }
+    }
+
+    impl ExecutionPlan for ProjectionExec {
+        fn with_new_children(
+            self: Arc<Self>,
+            mut children: Vec<Arc<dyn ExecutionPlan>>,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
++           check_if_same_properties!(self, children);
+            ProjectionExec::try_new(
+                self.projector.projection().into_iter().cloned(),
+                children.swap_remove(0),
+            )
+            .map(|p| Arc::new(p) as _)
+        }
+    }
+```
+
+### `PlannerContext` outer query schema API now uses a stack
+
+`PlannerContext` no longer stores a single `outer_query_schema`. It now tracks a
+stack of outer relation schemas so nested subqueries can access non-adjacent
+outer relations.
+
+**Before:**
+
+```rust,ignore
+let old_outer_query_schema =
+    planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
+let sub_plan = self.query_to_plan(subquery, planner_context)?;
+planner_context.set_outer_query_schema(old_outer_query_schema);
+```
+
+**After:**
+
+```rust,ignore
+planner_context.append_outer_query_schema(input_schema.clone().into());
+let sub_plan = self.query_to_plan(subquery, planner_context)?;
+planner_context.pop_outer_query_schema();
+```
+
+### `FileSinkConfig` adds `file_output_mode`
+
+`FileSinkConfig` now includes a `file_output_mode: FileOutputMode` field to control
+single-file vs directory output behavior. Any code constructing `FileSinkConfig` via struct
+literals must initialize this field.
+
+The `FileOutputMode` enum has three variants:
+
+- `Automatic` (default): Infer output mode from the URL (extension/trailing `/` heuristic)
+- `SingleFile`: Write to a single file at the exact output path
+- `Directory`: Write to a directory with generated filenames
+
+**Before:**
+
+```rust,ignore
+FileSinkConfig {
+    // ...
+    file_extension: "parquet".into(),
+}
+```
+
+**After:**
+
+```rust,ignore
+use datafusion_datasource::file_sink_config::FileOutputMode;
+
+FileSinkConfig {
+    // ...
+    file_extension: "parquet".into(),
+    file_output_mode: FileOutputMode::Automatic,
+}
+```
+
+### `SimplifyInfo` trait removed, `SimplifyContext` now uses builder-style API
+
+The `SimplifyInfo` trait has been removed and replaced with the concrete `SimplifyContext` struct. This simplifies the expression simplification API and removes the need for trait objects.
+
+**Who is affected:**
+
+- Users who implemented custom `SimplifyInfo` implementations
+- Users who implemented `ScalarUDFImpl::simplify()` for custom scalar functions
+- Users who directly use `SimplifyContext` or `ExprSimplifier`
+
+**Breaking changes:**
+
+1. The `SimplifyInfo` trait has been removed entirely
+2. `SimplifyContext` no longer takes `&ExecutionProps` - it now uses a builder-style API with direct fields
+3. `ScalarUDFImpl::simplify()` now takes `&SimplifyContext` instead of `&dyn SimplifyInfo`
+4. Time-dependent function simplification (e.g., `now()`) is now optional - if `query_execution_start_time` is `None`, these functions won't be simplified
+
+**Migration guide:**
+
+If you implemented a custom `SimplifyInfo`:
+
+**Before:**
+
+```rust,ignore
+impl SimplifyInfo for MySimplifyInfo {
+    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> { ... }
+    fn nullable(&self, expr: &Expr) -> Result<bool> { ... }
+    fn execution_props(&self) -> &ExecutionProps { ... }
+    fn get_data_type(&self, expr: &Expr) -> Result<DataType> { ... }
+}
+```
+
+**After:**
+
+Use `SimplifyContext` directly with the builder-style API:
+
+```rust,ignore
+let context = SimplifyContext::default()
+    .with_schema(schema)
+    .with_config_options(config_options)
+    .with_query_execution_start_time(Some(Utc::now())); // or use .with_current_time()
+```
+
+If you implemented `ScalarUDFImpl::simplify()`:
+
+**Before:**
+
+```rust,ignore
+fn simplify(
+    &self,
+    args: Vec<Expr>,
+    info: &dyn SimplifyInfo,
+) -> Result<ExprSimplifyResult> {
+    let now_ts = info.execution_props().query_execution_start_time;
+    // ...
+}
+```
+
+**After:**
+
+```rust,ignore
+fn simplify(
+    &self,
+    args: Vec<Expr>,
+    info: &SimplifyContext,
+) -> Result<ExprSimplifyResult> {
+    // query_execution_start_time is now Option<DateTime<Utc>>
+    // Return Original if time is not set (simplification skipped)
+    let Some(now_ts) = info.query_execution_start_time() else {
+        return Ok(ExprSimplifyResult::Original(args));
+    };
+    // ...
+}
+```
+
+If you created `SimplifyContext` from `ExecutionProps`:
+
+**Before:**
+
+```rust,ignore
+let props = ExecutionProps::new();
+let context = SimplifyContext::new(&props).with_schema(schema);
+```
+
+**After:**
+
+```rust,ignore
+let context = SimplifyContext::default()
+    .with_schema(schema)
+    .with_config_options(config_options)
+    .with_current_time(); // Sets query_execution_start_time to Utc::now()
+```
+
+See [`SimplifyContext` documentation](https://docs.rs/datafusion-expr/latest/datafusion_expr/simplify/struct.SimplifyContext.html) for more details.
+
+### Struct Casting Now Requires Field Name Overlap
+
+DataFusion's struct casting mechanism previously allowed casting between structs with differing field names if the field counts matched. This "positional fallback" behavior could silently misalign fields and cause data corruption.
+
+**Breaking Change:**
+
+Starting with DataFusion 53.0.0, struct casts now require **at least one overlapping field name** between the source and target structs. Casts without field name overlap are rejected at plan time with a clear error message.
+
+**Who is affected:**
+
+- Applications that cast between structs with no overlapping field names
+- Queries that rely on positional struct field mapping (e.g., casting `struct(x, y)` to `struct(a, b)` based solely on position)
+- Code that constructs or transforms struct columns programmatically
+
+**Migration guide:**
+
+If you encounter an error like:
+
+```text
+Cannot cast struct with 2 fields to 2 fields because there is no field name overlap
+```
+
+You must explicitly rename or map fields to ensure at least one field name matches. Here are common patterns:
+
+**Example 1: Source and target field names already match (Name-based casting)**
+
+**Success case (field names align):**
+
+```sql
+-- source_col has schema: STRUCT<x INT, y INT>
+-- Casting to the same field names succeeds (no-op or type validation only)
+SELECT CAST(source_col AS STRUCT<x INT, y INT>) FROM table1;
+```
+
+**Example 2: Source and target field names differ (Migration scenario)**
+
+**What fails now (no field name overlap):**
+
+```sql
+-- source_col has schema: STRUCT<a INT, b INT>
+-- This FAILS because there is no field name overlap:
+-- ❌ SELECT CAST(source_col AS STRUCT<x INT, y INT>) FROM table1;
+-- Error: Cannot cast struct with 2 fields to 2 fields because there is no field name overlap
+```
+
+**Migration options (must align names):**
+
+**Option A: Use struct constructor for explicit field mapping**
+
+```sql
+-- source_col has schema: STRUCT<a INT, b INT>
+-- Use STRUCT_CONSTRUCT with explicit field names
+SELECT STRUCT_CONSTRUCT(
+    'x', source_col.a,
+    'y', source_col.b
+) AS renamed_struct FROM table1;
+```
+
+**Option B: Rename in the cast target to match source names**
+
+```sql
+-- source_col has schema: STRUCT<a INT, b INT>
+-- Cast to target with matching field names
+SELECT CAST(source_col AS STRUCT<a INT, b INT>) FROM table1;
+```
+
+**Example 3: Using struct constructors in Rust API**
+
+If you need to map fields programmatically, build the target struct explicitly:
+
+```rust,ignore
+// Build the target struct with explicit field names
+let target_struct_type = DataType::Struct(vec![
+    FieldRef::new("x", DataType::Int32),
+    FieldRef::new("y", DataType::Utf8),
+]);
+
+// Use struct constructors rather than casting for field mapping
+// This makes the field mapping explicit and unambiguous
+// Use struct builders or row constructors that preserve your mapping logic
+```
+
+**Why this change:**
+
+1. **Safety:** Field names are now the primary contract for struct compatibility
+2. **Explicitness:** Prevents silent data misalignment caused by positional assumptions
+3. **Consistency:** Matches DuckDB's behavior and aligns with other SQL engines that enforce name-based matching
+4. **Debuggability:** Errors now appear at plan time rather than as silent data corruption
+
+See [Issue #19841](https://github.com/apache/datafusion/issues/19841) and [PR #19955](https://github.com/apache/datafusion/pull/19955) for more details.
+
+### `FilterExec` builder methods deprecated
+
+The following methods on `FilterExec` have been deprecated in favor of using `FilterExecBuilder`:
+
+- `with_projection()`
+- `with_batch_size()`
+
+**Who is affected:**
+
+- Users who create `FilterExec` instances and use these methods to configure them
+
+**Migration guide:**
+
+Use `FilterExecBuilder` instead of chaining method calls on `FilterExec`:
+
+**Before:**
+
+```rust,ignore
+let filter = FilterExec::try_new(predicate, input)?
+    .with_projection(Some(vec![0, 2]))?
+    .with_batch_size(8192)?;
+```
+
+**After:**
+
+```rust,ignore
+let filter = FilterExecBuilder::new(predicate, input)
+    .with_projection(Some(vec![0, 2]))
+    .with_batch_size(8192)
+    .build()?;
+```
+
+The builder pattern is more efficient as it computes properties once during `build()` rather than recomputing them for each method call.
+
+Note: `with_default_selectivity()` is not deprecated as it simply updates a field value and does not require the overhead of the builder pattern.
+
+### Protobuf conversion trait added
+
+A new trait, `PhysicalProtoConverterExtension`, has been added to the `datafusion-proto`
+crate. This is used for controlling the process of conversion of physical plans and
+expressions to and from their protobuf equivalents. The methods for conversion now
+require an additional parameter.
+
+The primary APIs for interacting with this crate have not been modified, so most users
+should not need to make any changes. If you do require this trait, you can use the
+`DefaultPhysicalProtoConverter` implementation.
+
+For example, to convert a sort expression protobuf node you can make the following
+updates:
+
+**Before:**
+
+```rust,ignore
+let sort_expr = parse_physical_sort_expr(
+    sort_proto,
+    ctx,
+    input_schema,
+    codec,
+);
+```
+
+**After:**
+
+```rust,ignore
+let converter = DefaultPhysicalProtoConverter {};
+let sort_expr = parse_physical_sort_expr(
+    sort_proto,
+    ctx,
+    input_schema,
+    codec,
+    &converter
+);
+```
+
+Similarly to convert from a physical sort expression into a protobuf node:
+
+**Before:**
+
+```rust,ignore
+let sort_proto = serialize_physical_sort_expr(
+    sort_expr,
+    codec,
+);
+```
+
+**After:**
+
+```rust,ignore
+let converter = DefaultPhysicalProtoConverter {};
+let sort_proto = serialize_physical_sort_expr(
+    sort_expr,
+    codec,
+    &converter,
+);
+```
+
+### `generate_series` and `range` table functions changed
+
+The `generate_series` and `range` table functions now return an empty set when the interval is invalid, instead of an error.
+This behavior is consistent with systems like PostgreSQL.
+
+Before:
+
+```sql
+> select * from generate_series(0, -1);
+Error during planning: Start is bigger than end, but increment is positive: Cannot generate infinite series
+
+> select * from range(0, -1);
+Error during planning: Start is bigger than end, but increment is positive: Cannot generate infinite series
+```
+
+Now:
+
+```sql
+> select * from generate_series(0, -1);
++-------+
+| value |
++-------+
++-------+
+0 row(s) fetched.
+
+> select * from range(0, -1);
++-------+
+| value |
++-------+
++-------+
+0 row(s) fetched.
+```
diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md
new file mode 100644
index 0000000000000..77b4fb6f71a35
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/54.0.0.md
@@ -0,0 +1,124 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Upgrade Guides
+
+## DataFusion 54.0.0
+
+**Note:** DataFusion `54.0.0` has not been released yet. The information provided
+in this section pertains to features and changes that have already been merged
+to the main branch and are awaiting release in this version.
+
+### `ExecutionPlan::apply_expressions` is now a required method
+
+`apply_expressions` has been added as a **required** method on the `ExecutionPlan` trait (no default implementation). The same applies to the `FileSource` and `DataSource` traits. Any custom implementation of these traits must now implement `apply_expressions`.
+
+**Who is affected:**
+
+- Users who implement custom `ExecutionPlan` nodes
+- Users who implement custom `FileSource` or `DataSource` sources
+
+**Migration guide:**
+
+Add `apply_expressions` to your implementation. Call `f` on each top-level `PhysicalExpr` your node owns, using `visit_sibling` to correctly propagate `TreeNodeRecursion`:
+
+**Node with no expressions:**
+
+```rust,ignore
+fn apply_expressions(
+    &self,
+    _f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+) -> Result<TreeNodeRecursion> {
+    Ok(TreeNodeRecursion::Continue)
+}
+```
+
+**Node with a single expression:**
+
+```rust,ignore
+fn apply_expressions(
+    &self,
+    f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+) -> Result<TreeNodeRecursion> {
+    f(self.predicate.as_ref())
+}
+```
+
+**Node with multiple expressions:**
+
+```rust,ignore
+fn apply_expressions(
+    &self,
+    f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+) -> Result<TreeNodeRecursion> {
+    let mut tnr = TreeNodeRecursion::Continue;
+    for expr in &self.expressions {
+        tnr = tnr.visit_sibling(|| f(expr.as_ref()))?;
+    }
+    Ok(tnr)
+}
+```
+
+**Node whose only expressions are in `output_ordering()` (e.g. a synthetic test node with no owned expression fields):**
+
+````rust,ignore
+fn apply_expressions(
+    &self,
+    f: &mut dyn FnMut(&dyn PhysicalExpr) -> Result<TreeNodeRecursion>,
+) -> Result<TreeNodeRecursion> {
+    let mut tnr = TreeNodeRecursion::Continue;
+    if let Some(ordering) = self.cache.output_ordering() {
+        for sort_expr in ordering {
+            tnr = tnr.visit_sibling(|| f(sort_expr.expr.as_ref()))?;
+        }
+    }
+    Ok(tnr)
+}
+
+### `ExecutionPlan::partition_statistics` now returns `Arc<Statistics>`
+
+`ExecutionPlan::partition_statistics` now returns `Result<Arc<Statistics>>` instead of `Result<Statistics>`. This avoids cloning `Statistics` when it is shared across multiple consumers.
+
+**Before:**
+
+```rust,ignore
+fn partition_statistics(&self, partition: Option<usize>) -> Result<Statistics> {
+    Ok(Statistics::new_unknown(&self.schema()))
+}
+````
+
+**After:**
+
+```rust,ignore
+fn partition_statistics(&self, partition: Option<usize>) -> Result<Arc<Statistics>> {
+    Ok(Arc::new(Statistics::new_unknown(&self.schema())))
+}
+```
+
+If you need an owned `Statistics` value (e.g. to mutate it), use `Arc::unwrap_or_clone`:
+
+```rust,ignore
+// If you previously consumed the Statistics directly:
+let stats = plan.partition_statistics(None)?;
+stats.column_statistics[0].min_value = ...;
+
+// Now unwrap the Arc first:
+let mut stats = Arc::unwrap_or_clone(plan.partition_statistics(None)?);
+stats.column_statistics[0].min_value = ...;
+```
diff --git a/docs/source/library-user-guide/upgrading/index.rst b/docs/source/library-user-guide/upgrading/index.rst
new file mode 100644
index 0000000000000..1ed5eca2a5d2a
--- /dev/null
+++ b/docs/source/library-user-guide/upgrading/index.rst
@@ -0,0 +1,33 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Upgrade Guides
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   DataFusion 54.0.0 <54.0.0>
+   DataFusion 53.0.0 <53.0.0>
+   DataFusion 52.0.0 <52.0.0>
+   DataFusion 51.0.0 <51.0.0>
+   DataFusion 50.0.0 <50.0.0>
+   DataFusion 49.0.0 <49.0.0>
+   DataFusion 48.0.1 <48.0.1>
+   DataFusion 48.0.0 <48.0.0>
+   DataFusion 47.0.0 <47.0.0>
+   DataFusion 46.0.0 <46.0.0>
diff --git a/docs/source/library-user-guide/using-the-dataframe-api.md b/docs/source/library-user-guide/using-the-dataframe-api.md
index 7f3e28c255c6e..024eff5d20834 100644
--- a/docs/source/library-user-guide/using-the-dataframe-api.md
+++ b/docs/source/library-user-guide/using-the-dataframe-api.md
@@ -198,7 +198,7 @@ async fn main() -> Result<()> {
 }
 ```
 
-[`custom_file_format.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_file_format.rs
+[`custom_file_format.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/custom_file_format.rs
 
 The output file will look like (Example Output):
 
diff --git a/docs/source/library-user-guide/using-the-sql-api.md b/docs/source/library-user-guide/using-the-sql-api.md
index f78cf16f4cb67..8b8ba2a3716a3 100644
--- a/docs/source/library-user-guide/using-the-sql-api.md
+++ b/docs/source/library-user-guide/using-the-sql-api.md
@@ -119,6 +119,8 @@ async fn main() -> Result<()> {
 DataFusion can also read Avro files using the `register_avro` method.
 
 ```rust
+# #[cfg(feature = "avro")]
+{
 use datafusion::arrow::util::pretty;
 use datafusion::error::Result;
 use datafusion::prelude::*;
@@ -154,6 +156,7 @@ async fn main() -> Result<()> {
     );
     Ok(())
 }
+}
 ```
 
 ## Reading Multiple Files as a table
diff --git a/docs/source/library-user-guide/working-with-exprs.md b/docs/source/library-user-guide/working-with-exprs.md
index df4e5e3940aa6..472ab2481360e 100644
--- a/docs/source/library-user-guide/working-with-exprs.md
+++ b/docs/source/library-user-guide/working-with-exprs.md
@@ -52,13 +52,13 @@ As the writer of a library, you can use `Expr`s to represent computations that y
 
 ## Arrow Schema and DataFusion DFSchema
 
-Apache Arrow `Schema` provides a lightweight structure for defining data, and Apache Datafusion`DFSchema` extends it with extra information such as column qualifiers and functional dependencies. Column qualifiers are multi part path to the table e.g table, schema, catalog. Functional Dependency is the relationship between attributes(characteristics) of a table related to each other.
+Apache Arrow `Schema` provides a lightweight structure for defining data, and Apache Datafusion `DFSchema` extends it with extra information such as column qualifiers and functional dependencies. Column qualifiers are multi part path to the table e.g table, schema, catalog. Functional Dependency is the relationship between attributes(characteristics) of a table related to each other.
 
 ### Difference between Schema and DFSchema
 
 - Schema: A fundamental component of Apache Arrow, `Schema` defines a dataset's structure, specifying column names and their data types.
 
-  > Please see [Struct Schema](https://docs.rs/arrow-schema/54.2.1/arrow_schema/struct.Schema.html) for a detailed document of Arrow Schema.
+  > Please see [Struct Schema](https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html) for a detailed document of Arrow Schema.
 
 - DFSchema: Extending `Schema`, `DFSchema` incorporates qualifiers such as table names, enabling it to carry additional context when required. This is particularly valuable for managing queries across multiple tables.
   > Please see [Struct DFSchema](https://docs.rs/datafusion/latest/datafusion/common/struct.DFSchema.html) for a detailed document of DFSchema.
@@ -71,11 +71,11 @@ From DFSchema to Schema: Since the `Into` trait has been implemented for DFSchem
 
 ## Creating and Evaluating `Expr`s
 
-Please see [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs) for well commented code for creating, evaluating, simplifying, and analyzing `Expr`s.
+Please see [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs) for well commented code for creating, evaluating, simplifying, and analyzing `Expr`s.
 
 ## A Scalar UDF Example
 
-We'll use a `ScalarUDF` expression as our example. This necessitates implementing an actual UDF, and for ease we'll use the same example from the [adding UDFs](./adding-udfs.md) guide.
+We'll use a `ScalarUDF` expression as our example. This necessitates implementing an actual UDF, and for ease we'll use the same example from the [adding UDFs](functions/adding-udfs.md) guide.
 
 So assuming you've written that function, you can use it to create an `Expr`:
 
@@ -121,11 +121,11 @@ If you'd like to learn more about `Expr`s, before we get into the details of cre
 
 ## Rewriting `Expr`s
 
-There are several examples of rewriting and working with `Exprs`:
+There are several examples of rewriting and working with `Expr`s:
 
-- [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs)
-- [analyzer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/analyzer_rule.rs)
-- [optimizer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/optimizer_rule.rs)
+- [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/expr_api.rs)
+- [analyzer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/analyzer_rule.rs)
+- [optimizer_rule.rs](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/query_planning/optimizer_rule.rs)
 
 Rewriting Expressions is the process of taking an `Expr` and transforming it into another `Expr`. This is useful for a number of reasons, including:
 
@@ -162,7 +162,7 @@ fn rewrite_add_one(expr: Expr) -> Result<Transformed<Expr>> {
 
 ### Creating an `OptimizerRule`
 
-In DataFusion, an `OptimizerRule` is a trait that supports rewriting`Expr`s that appear in various parts of the `LogicalPlan`. It follows DataFusion's general mantra of trait implementations to drive behavior.
+In DataFusion, an `OptimizerRule` is a trait that supports rewriting `Expr`s that appear in various parts of the `LogicalPlan`. It follows DataFusion's general mantra of trait implementations to drive behavior.
 
 We'll call our rule `AddOneInliner` and implement the `OptimizerRule` trait. The `OptimizerRule` trait has two methods:
 
@@ -322,7 +322,7 @@ async fn main() -> Result<()> {
     let plan = ctx.sql(sql).await?.into_optimized_plan()?.clone();
 
     let expected = r#"Projection: Int64(6) AS added_one
-  EmptyRelation"#;
+  EmptyRelation: rows=1"#;
 
     assert_eq!(plan.to_string(), expected);
 
diff --git a/docs/source/user-guide/arrow-introduction.md b/docs/source/user-guide/arrow-introduction.md
new file mode 100644
index 0000000000000..5a225782adfdb
--- /dev/null
+++ b/docs/source/user-guide/arrow-introduction.md
@@ -0,0 +1,256 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Gentle Arrow Introduction
+
+```{contents}
+:local:
+:depth: 2
+```
+
+## Overview
+
+DataFusion uses [Apache Arrow] as its native in-memory format, so anyone using DataFusion will likely interact with Arrow at some point. This guide introduces the key Arrow concepts you need to know to effectively use DataFusion.
+
+Apache Arrow defines a standardized columnar representation for in-memory data. This enables different systems and languages (e.g., Rust and Python) to share data with zero-copy interchange, avoiding serialization overhead. In addition to zero copy interchange, Arrow also standardizes best practice columnar data representation enabling high performance analytical processing through vectorized execution.
+
+## Columnar Layout
+
+Quick visual: row-major (left) vs Arrow's columnar layout (right). For a deeper primer, see the [arrow2 guide].
+
+```text
+Traditional Row Storage:          Arrow Columnar Storage:
+┌──────────────────┐              ┌─────────┬─────────┬──────────┐
+│ id │ name │ age  │              │   id    │  name   │   age    │
+├────┼──────┼──────┤              ├─────────┼─────────┼──────────┤
+│ 1  │  A   │  30  │              │ [1,2,3] │ [A,B,C] │[30,25,35]│
+│ 2  │  B   │  25  │              └─────────┴─────────┴──────────┘
+│ 3  │  C   │  35  │                   ↑          ↑         ↑
+└──────────────────┘              Int32Array StringArray Int32Array
+(read entire rows)                (process entire columns at once)
+```
+
+## `RecordBatch`
+
+Arrow's standard unit for packaging data is the **[`RecordBatch`]**.
+
+A **[`RecordBatch`]** represents a horizontal slice of a table—a collection of equal-length columnar arrays that conform to a defined schema. Each column within the slice is a contiguous Arrow array, and all columns have the same number of rows (length). This chunked, immutable unit enables efficient streaming and parallel execution.
+
+Think of it as having two perspectives:
+
+- **Columnar inside**: Each column (`id`, `name`, `age`) is a contiguous array optimized for vectorized operations
+- **Row-chunked externally**: The batch represents a chunk of rows (e.g., rows 1-1000), making it a manageable unit for streaming
+
+RecordBatches are **immutable snapshots**—once created, they cannot be modified. Any transformation produces a _new_ RecordBatch, enabling safe parallel processing without locks or coordination overhead.
+
+This design allows DataFusion to process streams of row-based chunks while gaining maximum performance from the columnar layout.
+
+## Streaming Through the Engine
+
+DataFusion processes queries as pull-based pipelines where operators request batches from their inputs. This streaming approach enables early result production, bounds memory usage (spilling to disk only when necessary), and naturally supports parallel execution across multiple CPU cores.
+
+For example, given the following query:
+
+```sql
+SELECT name FROM 'data.parquet' WHERE id > 10
+```
+
+The DataFusion Pipeline looks like this:
+
+```text
+
+┌─────────────┐    ┌──────────────┐    ┌────────────────┐    ┌──────────────────┐    ┌──────────┐
+│ Parquet     │───▶│ Scan         │───▶│ Filter         │───▶│ Projection       │───▶│ Results  │
+│ File        │    │ Operator     │    │ Operator       │    │ Operator         │    │          │
+└─────────────┘    └──────────────┘    └────────────────┘    └──────────────────┘    └──────────┘
+                   (reads data)        (id > 10)             (keeps "name" col)
+                   RecordBatch ───▶    RecordBatch ────▶     RecordBatch ────▶        RecordBatch
+```
+
+In this pipeline, [`RecordBatch`]es are the "packages" of columnar data that flow between the different stages of query execution. Each operator processes batches incrementally, enabling the system to produce results before reading the entire input.
+
+## Creating `ArrayRef` and `RecordBatch`es
+
+Sometimes you need to create Arrow data programmatically rather than reading from files.
+
+The first thing needed is creating an Arrow Array, for each column. [arrow-rs] provides array builders and `From` impls to create arrays from Rust vectors.
+
+```rust
+use arrow::array::{StringArray, Int32Array};
+// Create an Int32Array from a vector of i32 values
+let ids = Int32Array::from(vec![1, 2, 3]);
+// There are similar constructors for other array types, e.g., StringArray, Float64Array, etc.
+let names = StringArray::from(vec![Some("alice"), None, Some("carol")]);
+```
+
+Every element in an Arrow array can be "null" (aka missing). Often, arrays are
+created from `Option<T>` values to indicate nullability (e.g., `Some("alice")`
+vs `None` above).
+
+Note: You'll see [`Arc`] used frequently in the code—Arrow arrays are wrapped in
+[`Arc`] (atomically reference-counted pointers) to enable cheap, thread-safe
+sharing across operators and tasks. [`ArrayRef`] is simply a type alias for
+`Arc<dyn Array>`. To create an `ArrayRef`, wrap your array in `Arc::new(...)` as shown below.
+
+```rust
+use std::sync::Arc;
+# use arrow::array::{ArrayRef, Int32Array, StringArray};
+// To get an ArrayRef, wrap the Int32Array in an Arc.
+// (note you will often have to explicitly type annotate to ArrayRef)
+let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+
+// you can also store Strings and other types in ArrayRefs
+let arr: ArrayRef = Arc::new(
+  StringArray::from(vec![Some("alice"), None, Some("carol")])
+);
+```
+
+To create a [`RecordBatch`], you need to define its [`Schema`] (the column names and types) and provide the corresponding columns as [`ArrayRef`]s as shown below:
+
+```rust
+# use std::sync::Arc;
+# use arrow_schema::ArrowError;
+# use arrow::array::{ArrayRef, Int32Array, StringArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+
+// Create the columns as Arrow arrays
+let ids = Int32Array::from(vec![1, 2, 3]);
+let names = StringArray::from(vec![Some("alice"), None, Some("carol")]);
+// Create the schema
+let schema = Arc::new(Schema::new(vec![
+    Field::new("id", DataType::Int32, false), // false means non-nullable
+    Field::new("name", DataType::Utf8, true), // true means nullable
+]));
+// Assemble the columns
+let cols: Vec<ArrayRef> = vec![
+      Arc::new(ids),
+      Arc::new(names)
+];
+// Finally, create the RecordBatch
+RecordBatch::try_new(schema, cols).expect("Failed to create RecordBatch");
+```
+
+## Working with `ArrayRef` and `RecordBatch`
+
+Most DataFusion APIs are in terms of [`ArrayRef`] and [`RecordBatch`]. To work with the
+underlying data, you typically downcast the [`ArrayRef`] to its concrete type
+(e.g., [`Int32Array`]).
+
+To do so either use the `as_any().downcast_ref::<T>()` method or the
+`as_::<T>()` helper method from the [AsArray] trait.
+
+[asarray]: https://docs.rs/arrow-array/latest/arrow_array/cast/trait.AsArray.html
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, Int32Type};
+# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch};
+# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+// First check the data type of the array
+match arr.data_type() {
+   &DataType::Int32 => {
+         // Downcast to Int32Array
+         let int_array = arr.as_primitive::<Int32Type>();
+         // Now you can access Int32Array methods
+         for i in 0..int_array.len() {
+              println!("Value at index {}: {}", i, int_array.value(i));
+         }
+   }
+    _ => {
+        println ! ("Array is not of type Int32");
+    }
+}
+```
+
+The following two downcasting methods are equivalent:
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, Int32Type};
+# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch};
+# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+// Downcast to Int32Array using as_any
+let int_array1 = arr.as_any().downcast_ref::<Int32Array>().unwrap();
+// This is the same as using the as_::<T>() helper
+let int_array2 = arr.as_primitive::<Int32Type>();
+assert_eq!(int_array1, int_array2);
+```
+
+## Common Pitfalls
+
+When working with Arrow and RecordBatches, watch out for these common issues:
+
+- **Schema consistency**: All batches in a stream must share the exact same [`Schema`]. For example, you can't have one batch where a column is [`Int32`] and the next where it's [`Int64`], even if the values would fit
+- **Immutability**: Arrays are immutable—to "modify" data, you must build new arrays or new RecordBatches. For instance, to change a value in an array, you'd create a new array with the updated value
+- **Row by Row Processing**: Avoid iterating over Arrays element by element when possible, and use Arrow's built-in [compute kernels] instead
+- **Type mismatches**: Mixed input types across files may require explicit casts. For example, a string column `"123"` from a CSV file won't automatically join with an integer column `123` from a Parquet file—you'll need to cast one to match the other. Use Arrow's [`cast`] kernel where appropriate
+- **Batch size assumptions**: Don't assume a particular batch size; always iterate until the stream ends. One file might produce 8192-row batches while another produces 1024-row batches
+
+[compute kernels]: https://docs.rs/arrow/latest/arrow/compute/index.html
+
+## Further reading
+
+**Arrow Documentation:**
+
+- [Arrow Format Introduction](https://arrow.apache.org/docs/format/Intro.html) - Understand the Arrow specification and why it enables zero-copy data sharing
+- [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) - Deep dive into memory layout for performance optimization
+- [Arrow Rust Documentation](https://docs.rs/arrow/latest/arrow/) - Complete API reference for the Rust implementation
+
+**Key API References:**
+
+- [RecordBatch](https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html) - The fundamental data structure for columnar data (a table slice)
+- [ArrayRef](https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html) - Represents a reference-counted Arrow array (single column)
+- [DataType](https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html) - Enum of all supported Arrow data types (e.g., Int32, Utf8)
+- [Schema](https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html) - Describes the structure of a RecordBatch (column names and types)
+
+[apache arrow]: https://arrow.apache.org/docs/index.html
+[arrow-rs]: https://github.com/apache/arrow-rs
+[`arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html
+[`arrayref`]: https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html
+[`cast`]: https://docs.rs/arrow/latest/arrow/compute/fn.cast.html
+[`field`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Field.html
+[`schema`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html
+[`datatype`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html
+[`int32array`]: https://docs.rs/arrow/latest/arrow/array/type.Int32Array.html
+[`stringarray`]: https://docs.rs/arrow/latest/arrow/array/type.StringArray.html
+[`int32`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int32
+[`int64`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int64
+[extension points]: ../library-user-guide/extensions.md
+[`tableprovider`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html
+[custom table providers guide]: ../library-user-guide/custom-table-providers.md
+[user-defined functions (udfs)]: ../library-user-guide/functions/adding-udfs.md
+[custom optimizer rules and physical operators]: ../library-user-guide/extending-operators.md
+[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
+[`.register_table()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_table
+[`.sql()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql
+[`.show()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.show
+[`memtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/struct.MemTable.html
+[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+[`csvreadoptions`]: https://docs.rs/datafusion/latest/datafusion/datasource/file_format/options/struct.CsvReadOptions.html
+[`parquetreadoptions`]: https://docs.rs/datafusion/latest/datafusion/datasource/file_format/options/struct.ParquetReadOptions.html
+[`recordbatch`]: https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html
+[`read_csv`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_csv
+[`read_parquet`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_parquet
+[`read_json`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_json
+[`read_avro`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_avro
+[`dataframe`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html
+[`.collect()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.collect
+[arrow2 guide]: https://jorgecarleitao.github.io/arrow2/main/guide/arrow.html#what-is-apache-arrow
+[configuration settings]: configs.md
+[`datafusion.execution.batch_size`]: configs.md#setting-configuration-options
diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md
index 2e14f1f54c6c1..6b1a4887a8a0f 100644
--- a/docs/source/user-guide/cli/datasources.md
+++ b/docs/source/user-guide/cli/datasources.md
@@ -82,22 +82,29 @@ select count(*) from 'https://datasets.clickhouse.com/hits_compatible/athena_par
 To read from an AWS S3 or GCS, use `s3` or `gs` as a protocol prefix. For
 example, to read a file in an S3 bucket named `my-data-bucket` use the URL
 `s3://my-data-bucket`and set the relevant access credentials as environmental
-variables (e.g. for AWS S3 you need to at least `AWS_ACCESS_KEY_ID` and
+variables (e.g. for AWS S3 you can use `AWS_ACCESS_KEY_ID` and
 `AWS_SECRET_ACCESS_KEY`).
 
 ```sql
-select count(*) from 's3://my-data-bucket/athena_partitioned/hits.parquet'
+> select count(*) from 's3://altinity-clickhouse-data/nyc_taxi_rides/data/tripdata_parquet/';
++------------+
+| count(*)   |
++------------+
+| 1310903963 |
++------------+
 ```
 
-See the [`CREATE EXTERNAL TABLE`](#create-external-table) section for
+See the [`CREATE EXTERNAL TABLE`](#create-external-table) section below for
 additional configuration options.
 
 # `CREATE EXTERNAL TABLE`
 
 It is also possible to create a table backed by files or remote locations via
-`CREATE EXTERNAL TABLE` as shown below. Note that DataFusion does not support wildcards (e.g. `*`) in file paths; instead, specify the directory path directly to read all compatible files in that directory.
+`CREATE EXTERNAL TABLE` as shown below. Note that DataFusion does not support
+wildcards (e.g. `*`) in file paths; instead, specify the directory path directly
+to read all compatible files in that directory.
 
-For example, to create a table `hits` backed by a local parquet file, use:
+For example, to create a table `hits` backed by a local parquet file named `hits.parquet`:
 
 ```sql
 CREATE EXTERNAL TABLE hits
@@ -105,7 +112,7 @@ STORED AS PARQUET
 LOCATION 'hits.parquet';
 ```
 
-To create a table `hits` backed by a remote parquet file via HTTP(S), use
+To create a table `hits` backed by a remote parquet file via HTTP(S):
 
 ```sql
 CREATE EXTERNAL TABLE hits
@@ -127,7 +134,11 @@ select count(*) from hits;
 
 **Why Wildcards Are Not Supported**
 
-Although wildcards (e.g., _.parquet or \*\*/_.parquet) may work for local filesystems in some cases, they are not officially supported by DataFusion. This is because wildcards are not universally applicable across all storage backends (e.g., S3, GCS). Instead, DataFusion expects the user to specify the directory path, and it will automatically read all compatible files within that directory.
+Although wildcards (e.g., _.parquet or \*\*/_.parquet) may work for local
+filesystems in some cases, they are not supported by DataFusion CLI. This
+is because wildcards are not universally applicable across all storage backends
+(e.g., S3, GCS). Instead, DataFusion expects the user to specify the directory
+path, and it will automatically read all compatible files within that directory.
 
 For example, the following usage is not supported:
 
@@ -148,7 +159,31 @@ CREATE EXTERNAL TABLE test (
     day DATE
 )
 STORED AS PARQUET
-LOCATION 'gs://bucket/my_table';
+LOCATION 'gs://bucket/my_table/';
+```
+
+When specifying a directory path that has a Hive compliant partition structure, by default, DataFusion CLI will
+automatically parse and incorporate the Hive columns and their values into the table's schema and data. Given the
+following remote object paths:
+
+```console
+gs://bucket/my_table/a=1/b=100/file1.parquet
+gs://bucket/my_table/a=2/b=200/file2.parquet
+```
+
+`my_table` can be queried and filtered on the Hive columns:
+
+```sql
+CREATE EXTERNAL TABLE my_table
+STORED AS PARQUET
+LOCATION 'gs://bucket/my_table/';
+
+SELECT count(*) FROM my_table WHERE b=200;
++----------+
+| count(*) |
++----------+
+| 1        |
++----------+
 ```
 
 # Formats
@@ -168,17 +203,63 @@ LOCATION '/mnt/nyctaxi/tripdata.parquet';
 Register a single folder parquet datasource. Note: All files inside must be valid
 parquet files and have compatible schemas
 
+:::{note}
+Paths must end in Slash `/`
+: The path must end in `/` otherwise DataFusion will treat the path as a file and not a directory
+:::
+
 ```sql
 CREATE EXTERNAL TABLE taxi
 STORED AS PARQUET
 LOCATION '/mnt/nyctaxi/';
 ```
 
+### Parquet Specific Options
+
+You can specify additional options for parquet files using the `OPTIONS` clause.
+For example, to read and write a parquet directory with encryption settings you could use:
+
+```sql
+CREATE EXTERNAL TABLE encrypted_parquet_table
+(
+double_field double,
+float_field float
+)
+STORED AS PARQUET LOCATION 'pq/' OPTIONS (
+    -- encryption
+    'format.crypto.file_encryption.encrypt_footer' 'true',
+    'format.crypto.file_encryption.footer_key_as_hex' '30313233343536373839303132333435',  -- b"0123456789012345"
+    'format.crypto.file_encryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    'format.crypto.file_encryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+    -- decryption
+    'format.crypto.file_decryption.footer_key_as_hex' '30313233343536373839303132333435', -- b"0123456789012345"
+    'format.crypto.file_decryption.column_key_as_hex::double_field' '31323334353637383930313233343530', -- b"1234567890123450"
+    'format.crypto.file_decryption.column_key_as_hex::float_field' '31323334353637383930313233343531', -- b"1234567890123451"
+);
+```
+
+Here the keys are specified in hexadecimal format because they are binary data. These can be encoded in SQL using:
+
+```sql
+select encode('0123456789012345', 'hex');
+/*
++----------------------------------------------+
+| encode(Utf8("0123456789012345"),Utf8("hex")) |
++----------------------------------------------+
+| 30313233343536373839303132333435             |
++----------------------------------------------+
+*/
+```
+
+For more details on the available options, refer to the Rust
+[TableParquetOptions](https://docs.rs/datafusion/latest/datafusion/common/config/struct.TableParquetOptions.html)
+documentation in DataFusion.
+
 ## CSV
 
 DataFusion will infer the CSV schema automatically or you can provide it explicitly.
 
-Register a single file csv datasource with a header row.
+Register a single file csv datasource with a header row:
 
 ```sql
 CREATE EXTERNAL TABLE test
@@ -187,7 +268,7 @@ LOCATION '/path/to/aggregate_test_100.csv'
 OPTIONS ('has_header' 'true');
 ```
 
-Register a single file csv datasource with explicitly defined schema.
+Register a single file csv datasource with explicitly defined schema:
 
 ```sql
 CREATE EXTERNAL TABLE test (
@@ -213,7 +294,7 @@ LOCATION '/path/to/aggregate_test_100.csv';
 
 ## HTTP(s)
 
-To read from a remote parquet file via HTTP(S) you can use the following:
+To read from a remote parquet file via HTTP(S):
 
 ```sql
 CREATE EXTERNAL TABLE hits
@@ -223,9 +304,12 @@ LOCATION 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hit
 
 ## S3
 
-[AWS S3](https://aws.amazon.com/s3/) data sources must have connection credentials configured.
+DataFusion CLI supports configuring [AWS S3](https://aws.amazon.com/s3/) via the
+`CREATE EXTERNAL TABLE` statement and standard AWS configuration methods (via the
+[`aws-config`] AWS SDK crate).
 
-To create an external table from a file in an S3 bucket:
+To create an external table from a file in an S3 bucket with explicit
+credentials:
 
 ```sql
 CREATE EXTERNAL TABLE test
@@ -238,7 +322,7 @@ OPTIONS(
 LOCATION 's3://bucket/path/file.parquet';
 ```
 
-It is also possible to specify the access information using environment variables:
+To create an external table using environment variables:
 
 ```bash
 $ export AWS_DEFAULT_REGION=us-east-2
@@ -247,7 +331,7 @@ $ export AWS_ACCESS_KEY_ID=******
 
 $ datafusion-cli
 `datafusion-cli v21.0.0
-> create external table test stored as parquet location 's3://bucket/path/file.parquet';
+> create CREATE TABLE test STORED AS PARQUET LOCATION 's3://bucket/path/file.parquet';
 0 rows in set. Query took 0.374 seconds.
 > select * from test;
 +----------+----------+
@@ -258,19 +342,39 @@ $ datafusion-cli
 1 row in set. Query took 0.171 seconds.
 ```
 
+To read from a public S3 bucket without signatures, use the
+`aws.SKIP_SIGNATURE` option:
+
+```sql
+CREATE EXTERNAL TABLE nyc_taxi_rides
+STORED AS PARQUET LOCATION 's3://altinity-clickhouse-data/nyc_taxi_rides/data/tripdata_parquet/'
+OPTIONS(aws.SKIP_SIGNATURE true);
+```
+
+Credentials are taken in this order of precedence:
+
+1. Explicitly specified in the `OPTIONS` clause of the `CREATE EXTERNAL TABLE` statement.
+2. Determined by [`aws-config`] crate (standard environment variables such as `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` as well as other AWS specific features).
+
+If no credentials are specified, DataFusion CLI will use unsigned requests to S3,
+which allows reading from public buckets.
+
 Supported configuration options are:
 
-| Environment Variable                     | Configuration Option    | Description                                          |
-| ---------------------------------------- | ----------------------- | ---------------------------------------------------- |
-| `AWS_ACCESS_KEY_ID`                      | `aws.access_key_id`     |                                                      |
-| `AWS_SECRET_ACCESS_KEY`                  | `aws.secret_access_key` |                                                      |
-| `AWS_DEFAULT_REGION`                     | `aws.region`            |                                                      |
-| `AWS_ENDPOINT`                           | `aws.endpoint`          |                                                      |
-| `AWS_SESSION_TOKEN`                      | `aws.token`             |                                                      |
-| `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` |                         | See [IAM Roles]                                      |
-| `AWS_ALLOW_HTTP`                         |                         | set to "true" to permit HTTP connections without TLS |
+| Environment Variable                     | Configuration Option    | Description                                    |
+| ---------------------------------------- | ----------------------- | ---------------------------------------------- |
+| `AWS_ACCESS_KEY_ID`                      | `aws.access_key_id`     |                                                |
+| `AWS_SECRET_ACCESS_KEY`                  | `aws.secret_access_key` |                                                |
+| `AWS_DEFAULT_REGION`                     | `aws.region`            |                                                |
+| `AWS_ENDPOINT`                           | `aws.endpoint`          |                                                |
+| `AWS_SESSION_TOKEN`                      | `aws.token`             |                                                |
+| `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` |                         | See [IAM Roles]                                |
+| `AWS_ALLOW_HTTP`                         |                         | If "true", permit HTTP connections without TLS |
+| `AWS_SKIP_SIGNATURE`                     | `aws.skip_signature`    | If "true", does not sign requests              |
+|                                          | `aws.nosign`            | Alias for `skip_signature`                     |
 
 [iam roles]: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html
+[`aws-config`]: https://docs.rs/aws-config/latest/aws_config/
 
 ## OSS
 
diff --git a/docs/source/user-guide/cli/functions.md b/docs/source/user-guide/cli/functions.md
new file mode 100644
index 0000000000000..ea353d5c8dcc8
--- /dev/null
+++ b/docs/source/user-guide/cli/functions.md
@@ -0,0 +1,224 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# CLI Specific Functions
+
+`datafusion-cli` comes with build-in functions that are not included in the
+DataFusion SQL engine by default. These functions are:
+
+## `parquet_metadata`
+
+The `parquet_metadata` table function can be used to inspect detailed metadata
+about a parquet file such as statistics, sizes, and other information. This can
+be helpful to understand how parquet files are structured.
+
+For example, to see information about the `"WatchID"` column in the
+`hits.parquet` file, you can use:
+
+```sql
+SELECT path_in_schema, row_group_id, row_group_num_rows, stats_min, stats_max, total_compressed_size
+FROM parquet_metadata('hits.parquet')
+WHERE path_in_schema = '"WatchID"'
+LIMIT 3;
+
++----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
+| path_in_schema | row_group_id | row_group_num_rows | stats_min           | stats_max           | total_compressed_size |
++----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
+| "WatchID"      | 0            | 450560             | 4611687214012840539 | 9223369186199968220 | 3883759               |
+| "WatchID"      | 1            | 612174             | 4611689135232456464 | 9223371478009085789 | 5176803               |
+| "WatchID"      | 2            | 344064             | 4611692774829951781 | 9223363791697310021 | 3031680               |
++----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
+3 rows in set. Query took 0.053 seconds.
+```
+
+The returned table has the following columns for each row for each column chunk
+in the file. Please refer to the [Parquet Documentation] for more information in
+the meaning of these fields.
+
+[parquet documentation]: https://parquet.apache.org/
+
+| column_name             | data_type | Description                                                                                         |
+| ----------------------- | --------- | --------------------------------------------------------------------------------------------------- |
+| filename                | Utf8      | Name of the file                                                                                    |
+| row_group_id            | Int64     | Row group index the column chunk belongs to                                                         |
+| row_group_num_rows      | Int64     | Count of rows stored in the row group                                                               |
+| row_group_num_columns   | Int64     | Total number of columns in the row group (same for all row groups)                                  |
+| row_group_bytes         | Int64     | Number of bytes used to store the row group (not including metadata)                                |
+| column_id               | Int64     | ID of the column                                                                                    |
+| file_offset             | Int64     | Offset within the file that this column chunk's data begins                                         |
+| num_values              | Int64     | Total number of values in this column chunk                                                         |
+| path_in_schema          | Utf8      | "Path" (column name) of the column chunk in the schema                                              |
+| type                    | Utf8      | Parquet data type of the column chunk                                                               |
+| stats_min               | Utf8      | The minimum value for this column chunk, if stored in the statistics, cast to a string              |
+| stats_max               | Utf8      | The maximum value for this column chunk, if stored in the statistics, cast to a string              |
+| stats_null_count        | Int64     | Number of null values in this column chunk, if stored in the statistics                             |
+| stats_distinct_count    | Int64     | Number of distinct values in this column chunk, if stored in the statistics                         |
+| stats_min_value         | Utf8      | Same as `stats_min`                                                                                 |
+| stats_max_value         | Utf8      | Same as `stats_max`                                                                                 |
+| compression             | Utf8      | Block level compression (e.g. `SNAPPY`) used for this column chunk                                  |
+| encodings               | Utf8      | All block level encodings (e.g. `[PLAIN_DICTIONARY, PLAIN, RLE]`) used for this column chunk        |
+| index_page_offset       | Int64     | Offset in the file of the [`page index`], if any                                                    |
+| dictionary_page_offset  | Int64     | Offset in the file of the dictionary page, if any                                                   |
+| data_page_offset        | Int64     | Offset in the file of the first data page, if any                                                   |
+| total_compressed_size   | Int64     | Number of bytes the column chunk's data after encoding and compression (what is stored in the file) |
+| total_uncompressed_size | Int64     | Number of bytes the column chunk's data after encoding                                              |
+
+[`page index`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+
+## `metadata_cache`
+
+The `metadata_cache` function shows information about the default File Metadata Cache that is used by the
+[`ListingTable`] implementation in DataFusion. This cache is used to speed up
+reading metadata from files when scanning directories with many files.
+
+For example, after creating a table with the [CREATE EXTERNAL TABLE](../sql/ddl.md#create-external-table)
+command:
+
+```sql
+> create external table hits
+  stored as parquet
+  location 's3://clickhouse-public-datasets/hits_compatible/athena_partitioned/';
+```
+
+You can inspect the metadata cache by querying the `metadata_cache` function:
+
+```sql
+> select * from metadata_cache();
++----------------------------------------------------+---------------------+-----------------+---------------------------------------+---------+---------------------+------+------------------+
+| path                                               | file_modified       | file_size_bytes | e_tag                                 | version | metadata_size_bytes | hits | extra            |
++----------------------------------------------------+---------------------+-----------------+---------------------------------------+---------+---------------------+------+------------------+
+| hits_compatible/athena_partitioned/hits_61.parquet | 2022-07-03T15:40:34 | 117270944       | "5db11cad1ca0d80d748fc92c914b010a-6"  | NULL    | 212949              | 0    | page_index=false |
+| hits_compatible/athena_partitioned/hits_32.parquet | 2022-07-03T15:37:17 | 94506004        | "2f7db49a9fe242179590b615b94a39d2-5"  | NULL    | 278157              | 0    | page_index=false |
+| hits_compatible/athena_partitioned/hits_40.parquet | 2022-07-03T15:38:07 | 142508647       | "9e5852b45a469d5a05bf270a286eab8a-8"  | NULL    | 212917              | 0    | page_index=false |
+| hits_compatible/athena_partitioned/hits_93.parquet | 2022-07-03T15:44:07 | 127987774       | "751100bf0dac7d489b9836abf3108b99-7"  | NULL    | 278318              | 0    | page_index=false |
+| .                                                                                                                                                                                            |
++----------------------------------------------------+---------------------+-----------------+---------------------------------------+---------+---------------------+------+------------------+
+```
+
+Since `metadata_cache` is a normal table function, you can use it in most places you can use
+a table reference.
+
+For example, to get the total size consumed by the cached entries:
+
+```sql
+> select sum(metadata_size_bytes) from metadata_cache();
++-------------------------------------------+
+| sum(metadata_cache().metadata_size_bytes) |
++-------------------------------------------+
+| 22972345                                  |
++-------------------------------------------+
+```
+
+The columns of the returned table are:
+
+| column_name         | data_type | Description                                                                               |
+| ------------------- | --------- | ----------------------------------------------------------------------------------------- |
+| path                | Utf8      | File path relative to the object store / filesystem root                                  |
+| file_modified       | Timestamp | Last modified time of the file                                                            |
+| file_size_bytes     | UInt64    | Size of the file in bytes                                                                 |
+| e_tag               | Utf8      | [Entity Tag] (ETag) of the file if available                                              |
+| version             | Utf8      | Version of the file if available (for object stores that support versioning)              |
+| metadata_size_bytes | UInt64    | Size of the cached metadata in memory (not its thrift encoded form)                       |
+| hits                | UInt64    | Number of times the cached metadata has been accessed                                     |
+| extra               | Utf8      | Extra information about the cached metadata (e.g., if page index information is included) |
+
+## `statistics_cache`
+
+Similarly to the `metadata_cache`, the `statistics_cache` function can be used to show information
+about the File Statistics Cache that is used by the [`ListingTable`] implementation in DataFusion.
+For the statistics to be collected, the config `datafusion.execution.collect_statistics` must be
+enabled.
+
+You can inspect the statistics cache by querying the `statistics_cache` function. For example:
+
+```sql
+> select * from statistics_cache();
++------------------+---------------------+-----------------+------------------------+---------+-----------------+-------------+--------------------+-----------------------+
+| path             | file_modified       | file_size_bytes | e_tag                  | version | num_rows        | num_columns | table_size_bytes   | statistics_size_bytes |
++------------------+---------------------+-----------------+------------------------+---------+-----------------+-------------+--------------------+-----------------------+
+| .../hits.parquet | 2022-06-25T22:22:22 | 14779976446     | 0-5e24d1ee16380-370f48 | NULL    | Exact(99997497) | 105         | Exact(36445943240) | 0                     |
++------------------+---------------------+-----------------+------------------------+---------+-----------------+-------------+--------------------+-----------------------+
+```
+
+The columns of the returned table are:
+
+| column_name           | data_type | Description                                                                  |
+| --------------------- | --------- | ---------------------------------------------------------------------------- |
+| path                  | Utf8      | File path relative to the object store / filesystem root                     |
+| file_modified         | Timestamp | Last modified time of the file                                               |
+| file_size_bytes       | UInt64    | Size of the file in bytes                                                    |
+| e_tag                 | Utf8      | [Entity Tag] (ETag) of the file if available                                 |
+| version               | Utf8      | Version of the file if available (for object stores that support versioning) |
+| num_rows              | Utf8      | Number of rows in the table                                                  |
+| num_columns           | UInt64    | Number of columns in the table                                               |
+| table_size_bytes      | Utf8      | Size of the table, in bytes                                                  |
+| statistics_size_bytes | UInt64    | Size of the cached statistics in memory                                      |
+
+## `list_files_cache`
+
+The `list_files_cache` function shows information about the `ListFilesCache` that is used by the [`ListingTable`] implementation in DataFusion. When creating a [`ListingTable`], DataFusion lists the files in the table's location and caches results in the `ListFilesCache`. Subsequent queries against the same table can reuse this cached information instead of re-listing the files. Cache entries are scoped to tables.
+
+You can inspect the cache by querying the `list_files_cache` function. For example,
+
+```sql
+> set datafusion.runtime.list_files_cache_ttl = "30s";
+> create external table overturemaps
+stored as parquet
+location 's3://overturemaps-us-west-2/release/2025-12-17.0/theme=base/type=infrastructure';
+0 row(s) fetched.
+> select table, path, metadata_size_bytes, expires_in, unnest(metadata_list)['file_size_bytes'] as file_size_bytes, unnest(metadata_list)['e_tag'] as e_tag from list_files_cache() limit 10;
++--------------+-----------------------------------------------------+---------------------+-----------------------------------+-----------------+---------------------------------------+
+| table        | path                                                | metadata_size_bytes | expires_in                        | file_size_bytes | e_tag                                 |
++--------------+-----------------------------------------------------+---------------------+-----------------------------------+-----------------+---------------------------------------+
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 999055952       | "35fc8fbe8400960b54c66fbb408c48e8-60" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 975592768       | "8a16e10b722681cdc00242564b502965-59" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1082925747      | "24cd13ddb5e0e438952d2499f5dabe06-65" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1008425557      | "37663e31c7c64d4ef355882bcd47e361-61" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1065561905      | "4e7c50d2d1b3c5ed7b82b4898f5ac332-64" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1045655427      | "8fff7e6a72d375eba668727c55d4f103-63" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1086822683      | "b67167d8022d778936c330a52a5f1922-65" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1016732378      | "6d70857a0473ed9ed3fc6e149814168b-61" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 991363784       | "c9cafb42fcbb413f851691c895dd7c2b-60" |
+| overturemaps | release/2025-12-17.0/theme=base/type=infrastructure | 2750                | 0 days 0 hours 0 mins 25.264 secs | 1032469715      | "7540252d0d67158297a67038a3365e0f-62" |
++--------------+-----------------------------------------------------+---------------------+-----------------------------------+-----------------+---------------------------------------+
+```
+
+The columns of the returned table are:
+| column_name | data_type | Description |
+| ------------------- | ------------ | ----------------------------------------------------------------------------------------- |
+| table | Utf8 | Name of the table |
+| path | Utf8 | File path relative to the object store / filesystem root |
+| metadata_size_bytes | UInt64 | Size of the cached metadata in memory (not its thrift encoded form) |
+| expires_in | Duration(ms) | Last modified time of the file |
+| metadata_list | List(Struct) | List of metadatas, one for each file under the path. |
+
+A metadata struct in the metadata_list contains the following fields:
+
+```text
+{
+  "file_path": "release/2025-12-17.0/theme=base/type=infrastructure/part-00000-d556e455-e0c5-4940-b367-daff3287a952-c000.zstd.parquet",
+  "file_modified": "2025-12-17T22:20:29",
+  "file_size_bytes": 999055952,
+  "e_tag": "35fc8fbe8400960b54c66fbb408c48e8-60",
+  "version": null
+}
+```
+
+[`listingtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
+[entity tag]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
diff --git a/docs/source/user-guide/cli/index.rst b/docs/source/user-guide/cli/index.rst
index 874cfc0eae868..325b0dce3fb19 100644
--- a/docs/source/user-guide/cli/index.rst
+++ b/docs/source/user-guide/cli/index.rst
@@ -25,3 +25,4 @@ DataFusion CLI
    installation
    usage
    datasources
+   functions
diff --git a/docs/source/user-guide/cli/usage.md b/docs/source/user-guide/cli/usage.md
index 13f0e7cff175d..7c56b994fcc33 100644
--- a/docs/source/user-guide/cli/usage.md
+++ b/docs/source/user-guide/cli/usage.md
@@ -58,11 +58,17 @@ OPTIONS:
             Specify the memory pool type 'greedy' or 'fair', default to 'greedy'
 
         --top-memory-consumers <TOP_MEMORY_CONSUMERS>
-            The number of top memory consumers to display when query fails due to memory exhaustion. To disable memory consumer tracking, set this value to 0 [default: 3]
+            The number of top memory consumers to display when query fails due to memory exhaustion.
+            To disable memory consumer tracking, set this value to 0 [default: 3].
+            Please set one of the runtime configs: '--memory-limit' or '--mem-pool-type' to see 'top-memory-consumers' result when memory is exhausted.
 
     -d, --disk-limit <DISK_LIMIT>
             Available disk space for spilling queries (e.g. '10g'), default to None (uses DataFusion's default value of '100g')
 
+      --object-store-profiling <OBJECT_STORE_PROFILING>
+          Specify the default object_store_profiling mode, defaults to 'disabled'.
+          [possible values: disabled, summary, trace] [default: Disabled]
+
     -p, --data-path <DATA_PATH>
             Path to your data, default to current directory
 
@@ -122,6 +128,41 @@ Available commands inside DataFusion CLI are:
 > \h function
 ```
 
+- Object Store Profiling Mode
+
+```bash
+> \object_store_profiling [disabled|summary|trace]
+```
+
+When enabled, prints detailed information about object store (I/O) operations
+performed during query execution to STDOUT.
+
+```sql
+> \object_store_profiling trace
+ObjectStore Profile mode set to Trace
+> select count(*) from 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet';
++----------+
+| count(*) |
++----------+
+| 1000000  |
++----------+
+1 row(s) fetched.
+Elapsed 0.552 seconds.
+
+Object Store Profiling
+Instrumented Object Store: instrument_mode: Trace, inner: HttpStore
+2025-10-17T18:08:48.457992+00:00 operation=Get duration=0.043592s size=8 range: bytes=174965036-174965043 path=hits_compatible/athena_partitioned/hits_1.parquet
+2025-10-17T18:08:48.501878+00:00 operation=Get duration=0.031542s size=34322 range: bytes=174930714-174965035 path=hits_compatible/athena_partitioned/hits_1.parquet
+
+Summaries:
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Operation | Metric   | min       | max       | avg       | sum       | count |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+| Get       | duration | 0.031542s | 0.043592s | 0.037567s | 0.075133s | 2     |
+| Get       | size     | 8 B       | 34322 B   | 17165 B   | 34330 B   | 2     |
++-----------+----------+-----------+-----------+-----------+-----------+-------+
+```
+
 ## Supported SQL
 
 In addition to the normal [SQL supported in DataFusion], `datafusion-cli` also
@@ -231,64 +272,5 @@ DataFusion CLI v13.0.0
 ## Functions
 
 `datafusion-cli` comes with build-in functions that are not included in the
-DataFusion SQL engine. These functions are:
-
-### `parquet_metadata`
-
-The `parquet_metadata` table function can be used to inspect detailed metadata
-about a parquet file such as statistics, sizes, and other information. This can
-be helpful to understand how parquet files are structured.
-
-For example, to see information about the `"WatchID"` column in the
-`hits.parquet` file, you can use:
-
-```sql
-SELECT path_in_schema, row_group_id, row_group_num_rows, stats_min, stats_max, total_compressed_size
-FROM parquet_metadata('hits.parquet')
-WHERE path_in_schema = '"WatchID"'
-LIMIT 3;
-
-+----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
-| path_in_schema | row_group_id | row_group_num_rows | stats_min           | stats_max           | total_compressed_size |
-+----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
-| "WatchID"      | 0            | 450560             | 4611687214012840539 | 9223369186199968220 | 3883759               |
-| "WatchID"      | 1            | 612174             | 4611689135232456464 | 9223371478009085789 | 5176803               |
-| "WatchID"      | 2            | 344064             | 4611692774829951781 | 9223363791697310021 | 3031680               |
-+----------------+--------------+--------------------+---------------------+---------------------+-----------------------+
-3 rows in set. Query took 0.053 seconds.
-```
-
-The returned table has the following columns for each row for each column chunk
-in the file. Please refer to the [Parquet Documentation] for more information.
-
-[parquet documentation]: https://parquet.apache.org/
-
-| column_name             | data_type | Description                                                                                         |
-| ----------------------- | --------- | --------------------------------------------------------------------------------------------------- |
-| filename                | Utf8      | Name of the file                                                                                    |
-| row_group_id            | Int64     | Row group index the column chunk belongs to                                                         |
-| row_group_num_rows      | Int64     | Count of rows stored in the row group                                                               |
-| row_group_num_columns   | Int64     | Total number of columns in the row group (same for all row groups)                                  |
-| row_group_bytes         | Int64     | Number of bytes used to store the row group (not including metadata)                                |
-| column_id               | Int64     | ID of the column                                                                                    |
-| file_offset             | Int64     | Offset within the file that this column chunk's data begins                                         |
-| num_values              | Int64     | Total number of values in this column chunk                                                         |
-| path_in_schema          | Utf8      | "Path" (column name) of the column chunk in the schema                                              |
-| type                    | Utf8      | Parquet data type of the column chunk                                                               |
-| stats_min               | Utf8      | The minimum value for this column chunk, if stored in the statistics, cast to a string              |
-| stats_max               | Utf8      | The maximum value for this column chunk, if stored in the statistics, cast to a string              |
-| stats_null_count        | Int64     | Number of null values in this column chunk, if stored in the statistics                             |
-| stats_distinct_count    | Int64     | Number of distinct values in this column chunk, if stored in the statistics                         |
-| stats_min_value         | Utf8      | Same as `stats_min`                                                                                 |
-| stats_max_value         | Utf8      | Same as `stats_max`                                                                                 |
-| compression             | Utf8      | Block level compression (e.g. `SNAPPY`) used for this column chunk                                  |
-| encodings               | Utf8      | All block level encodings (e.g. `[PLAIN_DICTIONARY, PLAIN, RLE]`) used for this column chunk        |
-| index_page_offset       | Int64     | Offset in the file of the [`page index`], if any                                                    |
-| dictionary_page_offset  | Int64     | Offset in the file of the dictionary page, if any                                                   |
-| data_page_offset        | Int64     | Offset in the file of the first data page, if any                                                   |
-| total_compressed_size   | Int64     | Number of bytes the column chunk's data after encoding and compression (what is stored in the file) |
-| total_uncompressed_size | Int64     | Number of bytes the column chunk's data after encoding                                              |
-
-+-------------------------+-----------+-------------+
-
-[`page index`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+DataFusion SQL engine, see [DataFusion CLI specific functions](functions.md) section
+for details.
diff --git a/docs/source/user-guide/concepts-readings-events.md b/docs/source/user-guide/concepts-readings-events.md
index ad444ef91c474..355d622f77686 100644
--- a/docs/source/user-guide/concepts-readings-events.md
+++ b/docs/source/user-guide/concepts-readings-events.md
@@ -21,7 +21,7 @@
 
 ## 🧭 Background Concepts
 
-- **2024-06-13**: [2024 ACM SIGMOD International Conference on Management of Data: Apache Arrow DataFusion: A Fast, Embeddable, Modular Analytic Query Engine](https://dl.acm.org/doi/10.1145/3626246.3653368) - [Download](http://andrew.nerdnetworks.org/other/SIGMOD-2024-lamb.pdf), [Talk](https://youtu.be/-DpKcPfnNms), [Slides](https://docs.google.com/presentation/d/1gqcxSNLGVwaqN0_yJtCbNm19-w5pqPuktII5_EDA6_k/edit#slide=id.p), [Recording ](https://youtu.be/-DpKcPfnNms)
+- **2024-06-13**: [2024 ACM SIGMOD International Conference on Management of Data: Apache Arrow DataFusion: A Fast, Embeddable, Modular Analytic Query Engine](https://dl.acm.org/doi/10.1145/3626246.3653368) - [Download](https://andrew.nerdnetworks.org/pdf/SIGMOD-2024-lamb.pdf), [Talk](https://youtu.be/-DpKcPfnNms), [Slides](https://docs.google.com/presentation/d/1gqcxSNLGVwaqN0_yJtCbNm19-w5pqPuktII5_EDA6_k/edit#slide=id.p), [Recording ](https://youtu.be/-DpKcPfnNms)
 
 - **2024-06-07**: [Video: SIGMOD 2024 Practice: Apache Arrow DataFusion A Fast, Embeddable, Modular Analytic Query Engine](https://www.youtube.com/watch?v=-DpKcPfnNms&t=5s) - [Slides](https://docs.google.com/presentation/d/1gqcxSNLGVwaqN0_yJtCbNm19-w5pqPuktII5_EDA6_k/edit#slide=id.p)
 
@@ -37,6 +37,34 @@
 
 This is a list of DataFusion related blog posts, articles, and other resources. Please open a PR to add any new resources you create or find
 
+- **2026-01-12** [Blog: Extending SQL in DataFusion: from ->> to TABLESAMPLE](https://datafusion.apache.org/blog/2026/01/12/extending-sql)
+
+- **2025-12-15** [Blog: Optimizing Repartitions in DataFusion: How I Went From Database Noob to Core Contribution](https://datafusion.apache.org/blog/2025/12/15/avoid-consecutive-repartitions)
+
+- **2025-09-21** [Blog: Implementing User Defined Types and Custom Metadata in DataFusion](https://datafusion.apache.org/blog/2025/09/21/custom-types-using-metadata)
+
+- **2025-09-10** [Blog: Dynamic Filters: Passing Information Between Operators During Execution for 25x Faster Queries](https://datafusion.apache.org/blog/2025/09/10/dynamic-filters)
+
+- **2025-08-15** [Blog: Using External Indexes, Metadata Stores, Catalogs and Caches to Accelerate Queries on Apache Parquet](https://datafusion.apache.org/blog/2025/08/15/external-parquet-indexes)
+
+- **2025-07-14** [Blog: Embedding User-Defined Indexes in Apache Parquet Files](https://datafusion.apache.org/blog/2025/07/14/user-defined-parquet-indexes)
+
+- **2025-06-30** [Blog: Using Rust async for Query Execution and Cancelling Long-Running Queries](https://datafusion.apache.org/blog/2025/06/30/cancellation)
+
+- **2025-06-15** [Blog: Optimizing SQL (and DataFrames) in DataFusion, Part 1: Query Optimization Overview](https://datafusion.apache.org/blog/2025/06/15/optimizing-sql-dataframes-part-one)
+
+- **2025-06-15** [Blog: Optimizing SQL (and DataFrames) in DataFusion, Part 2: Optimizers in Apache DataFusion](https://datafusion.apache.org/blog/2025/06/15/optimizing-sql-dataframes-part-two)
+
+- **2025-04-19** [Blog: User defined Window Functions in DataFusion](https://datafusion.apache.org/blog/2025/04/19/user-defined-window-functions)
+
+- **2025-04-10** [Blog: tpchgen-rs World's fastest open source TPC-H data generator, written in Rust](https://datafusion.apache.org/blog/2025/04/10/fastest-tpch-generator)
+
+- **2025-03-11** [Blog: Using Ordering for Better Plans in Apache DataFusion](https://datafusion.apache.org/blog/2025/03/11/ordering-analysis)
+
+- **2024-05-07** [Blog: Announcing Apache Arrow DataFusion is now Apache DataFusion](https://datafusion.apache.org/blog/2024/05/07/datafusion-tlp)
+
+- **2024-03-06** [Blog: Announcing Apache Arrow DataFusion Comet](https://datafusion.apache.org/blog/2024/03/06/comet-donation)
+
 - **2025-03-21** [Blog: Efficient Filter Pushdown in Parquet](https://datafusion.apache.org/blog/2025/03/21/parquet-pushdown/)
 
 - **2025-03-20** [Blog: Parquet Pruning in DataFusion: Read Only What Matters](https://datafusion.apache.org/blog/2025/03/20/parquet-pruning/)
@@ -59,16 +87,14 @@ This is a list of DataFusion related blog posts, articles, and other resources.
 
 - **2024-10-29** [Video: MiDAS Seminar Fall 2024 on "Apache DataFusion" by Andrew Lamb](https://www.youtube.com/watch?v=CpnxuBwHbUc)
 
-- **2024-10-27** [Blog: Caching in DataFusion: Don't read twice](https://blog.haoxp.xyz/posts/caching-datafusion)
+- **2024-10-27** [Blog: Caching in DataFusion: Don't read twice](https://blog.xiangpeng.systems/posts/caching-datafusion/)
 
-- **2024-10-24** [Blog: Parquet pruning in DataFusion: Read no more than you need](https://blog.haoxp.xyz/posts/parquet-to-arrow/)
+- **2024-10-24** [Blog: Parquet pruning in DataFusion: Read no more than you need](https://blog.xiangpeng.systems/posts/parquet-to-arrow/)
 
 - **2024-09-13** [Blog: Using StringView / German Style Strings to make Queries Faster: Part 2 - String Operations](https://www.influxdata.com/blog/faster-queries-with-stringview-part-two-influxdb/) | [Reposted on DataFusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-2/)
 
 - **2024-09-13** [Blog: Using StringView / German Style Strings to Make Queries Faster: Part 1- Reading Parquet](https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/) | [Reposted on Datafusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-1/)
 
-- **2024-10-16** [Blog: Candle Image Segmentation](https://www.letsql.com/posts/candle-image-segmentation/)
-
 - **2024-09-23 → 2024-12-02** [Talks: Carnegie Mellon University: Database Building Blocks Seminar Series - Fall 2024](https://db.cs.cmu.edu/seminar2024/)
 
   - **2024-11-12** [Video: Building InfluxDB 3.0 with the FDAP Stack: Apache Flight, DataFusion, Arrow and Parquet (Paul Dix)](https://www.youtube.com/watch?v=AGS4GNGDK_4)
@@ -138,34 +164,35 @@ This is a list of DataFusion related blog posts, articles, and other resources.
 
 ## 📅 Release Notes & Updates
 
-- **2025-03-24** [Apache DataFusion 46.0.0 Released](https://datafusion.apache.org/blog/2025/03/24/datafusion-46.0.0/)
-
-- **2024-09-14** [Apache DataFusion Python 43.1.0 Released](https://datafusion.apache.org/blog/2024/12/14/datafusion-python-43.1.0/)
-
-- **2024-08-24** [Apache DataFusion Python 40.1.0 Released, Significant usability updates](https://datafusion.apache.org/blog/2024/08/20/python-datafusion-40.0.0/)
-
-- **2024-07-24** [DataFusion 40.0.0 Release](https://datafusion.apache.org/blog/2024/07/24/datafusion-40.0.0/)
-
-- **2024-01-19** [DataFusion 34.0.0 Release](https://datafusion.apache.org/blog/2024/01/19/datafusion-34.0.0/)
-
-- **2023-06-24** [DataFusion 25.0.0 Release](https://arrow.apache.org/blog/2023/06/24/datafusion-25.0.0/)
-
-- **2023-01-19** [DataFusion 16.0.0 Release](https://arrow.apache.org/blog/2023/01/19/datafusion-16.0.0/)
-
-- **2022-10-25** [DataFusion 13.0.0 Release](https://arrow.apache.org/blog/2022/10/25/datafusion-13.0.0/)
-
-- **2022-05-16** [DataFusion 8.0.0 Release](https://arrow.apache.org/blog/2022/05/16/datafusion-8.0.0/)
-
-- **2022-02-28** [DataFusion 7.0.0 Release](https://arrow.apache.org/blog/2022/02/28/datafusion-7.0.0/)
-
-- **2021-11-19** [DataFusion 6.0.0 Release](https://arrow.apache.org/blog/2021/11/19/datafusion-6.0.0/)
-
-- **2021-08-18** [DataFusion 5.0.0 Release](https://arrow.apache.org/blog/2021/08/18/datafusion-5.0.0/)
-
-- **2019-09-22** [DataFusion 0.15.0 Release Notes](https://andygrove.io/2019/09/datafusion-0.15.0-release-notes/)
+- **2026-01-30** [Apache DataFusion Comet 0.13.0 Release](https://datafusion.apache.org/blog/2026/01/30/datafusion-comet-0.13.0)
+- **2026-01-12** [Apache DataFusion 52.0.0 Released](https://datafusion.apache.org/blog/2026/01/12/datafusion-52.0.0)
+- **2025-12-04** [Apache DataFusion Comet 0.12.0 Release](https://datafusion.apache.org/blog/2025/12/04/datafusion-comet-0.12.0)
+- **2025-11-25** [Apache DataFusion 51.0.0 Released](https://datafusion.apache.org/blog/2025/11/25/datafusion-51.0.0)
+- **2025-10-21** [Apache DataFusion Comet 0.11.0 Release](https://datafusion.apache.org/blog/2025/10/21/datafusion-comet-0.11.0)
+- **2025-09-29** [Apache DataFusion 50.0.0 Released](https://datafusion.apache.org/blog/2025/09/29/datafusion-50.0.0)
+- **2025-09-16** [Apache DataFusion Comet 0.10.0 Release](https://datafusion.apache.org/blog/2025/09/16/datafusion-comet-0.10.0)
+- **2025-07-28** [Apache DataFusion 49.0.0 Released](https://datafusion.apache.org/blog/2025/07/28/datafusion-49.0.0)
+- **2025-07-16** [Apache DataFusion 48.0.0 Released](https://datafusion.apache.org/blog/2025/07/16/datafusion-48.0.0)
+- **2025-07-11** [Apache DataFusion 47.0.0 Released](https://datafusion.apache.org/blog/2025/07/11/datafusion-47.0.0)
+- **2025-07-01** [Apache DataFusion Comet 0.9.0 Release](https://datafusion.apache.org/blog/2025/07/01/datafusion-comet-0.9.0)
+- **2025-05-06** [Apache DataFusion Comet 0.8.0 Release](https://datafusion.apache.org/blog/2025/05/06/datafusion-comet-0.8.0)
+- **2025-03-30** [Apache DataFusion Python 46.0.0 Released](https://datafusion.apache.org/blog/2025/03/30/datafusion-python-46.0.0)
+- **2025-03-24** [Apache DataFusion 46.0.0 Released](https://datafusion.apache.org/blog/2025/03/24/datafusion-46.0.0)
+- **2025-03-20** [Apache DataFusion Comet 0.7.0 Release](https://datafusion.apache.org/blog/2025/03/20/datafusion-comet-0.7.0)
+- **2025-02-20** [Apache DataFusion 45.0.0 Released](https://datafusion.apache.org/blog/2025/02/20/datafusion-45.0.0)
+- **2025-02-17** [Apache DataFusion Comet 0.6.0 Release](https://datafusion.apache.org/blog/2025/02/17/datafusion-comet-0.6.0)
+- **2025-02-02** [Apache DataFusion Ballista 43.0.0 Released](https://datafusion.apache.org/blog/2025/02/02/datafusion-ballista-43.0.0)
+- **2025-01-17** [Apache DataFusion Comet 0.5.0 Release](https://datafusion.apache.org/blog/2025/01/17/datafusion-comet-0.5.0)
 
 # 🌎 Community Events
 
+- **2026-07-22** [Denver Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/18428) - [RSVP](https://luma.com/jsu6faie)
+- **2026-05-12** [New York City Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/20030) - [RSVP](https://luma.com/adhshv92)
+- **2026-04-23** [Seattle Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/13500) - [RSVP](https://luma.com/hxshbp0m)
+- **2026-03-05** [Stockholm Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/18429) - [RSVP](https://luma.com/ctqtiqap)
+- **2026-02-19** [San Francisco Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/19859) - [RSVP](https://luma.com/p7r6fp2z), [Recording](https://www.youtube.com/playlist?list=PL42Ljm2tTt5peGUWMBN7WFkASq73j8PoU)
+- **2025-11-12** [Boston Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/16703) - [Recording](https://youtu.be/wCAud478Dg8), [Slides](https://drive.google.com/file/d/18KGH_wGHkgdAfjy5sQVKFhnN1GyYXSzU)
+- **2025-09-15** [New York City Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/16265) - [RSVP](https://lu.ma/qkcyycg0), [Recording](https://youtu.be/ElAiN_1fX_4)
 - **2025-01-23** [Amsterdam Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/12988) - [Slides](https://github.com/apache/datafusion/discussions/12988)
 - **2025-01-22** [Datadog Apache DataFusion Community Meeting](https://www.linkedin.com/posts/seshendranalla_apache-datafusion-community-meeting-2025-activity-7290384383201435648-8tqv) - [Recording](https://www.youtube.com/watch?v=ceTo2vUyRI0)
 - **2025-01-15** [Boston Apache DataFusion Meetup](https://github.com/apache/datafusion/discussions/13165) - [Slides](https://docs.google.com/presentation/d/1_zBLHdqxPlhWuNK2oCA2d_hCpb6HWgHbVJBseiUXA80)
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index a794241dfc4fc..6f6d5b205877f 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -18,127 +18,314 @@
 -->
 
 <!---
-This file was generated by the dev/update_config_docs.sh script.
+NOTE: This file was generated by the dev/update_config_docs.sh script.
 Do not edit it manually as changes will be overwritten.
 Instead, edit dev/update_config_docs.sh or the docstrings in datafusion/core/src/config.rs.
 -->
 
 # Configuration Settings
 
-The following configuration options can be passed to `SessionConfig` to control various aspects of query execution.
-
-For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables.
-To construct a session with options from the environment, use `SessionConfig::from_env`.
-The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores.
-For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable.
-Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html).
-If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted.
-Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions.
-
-| key                                                                     | default                   | description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| ----------------------------------------------------------------------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| datafusion.catalog.create_default_catalog_and_schema                    | true                      | Whether the default catalog and schema should be created automatically.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| datafusion.catalog.default_catalog                                      | datafusion                | The default catalog name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| datafusion.catalog.default_schema                                       | public                    | The default schema name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| datafusion.catalog.information_schema                                   | false                     | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.catalog.location                                             | NULL                      | Location scanned to load tables for `default` schema                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.catalog.format                                               | NULL                      | Type of `TableProvider` to use when loading `default` schema                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| datafusion.catalog.has_header                                           | true                      | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.catalog.newlines_in_values                                   | false                     | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.execution.batch_size                                         | 8192                      | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| datafusion.execution.coalesce_batches                                   | true                      | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.execution.collect_statistics                                 | false                     | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.execution.target_partitions                                  | 0                         | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.execution.time_zone                                          | +00:00                    | The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.execution.parquet.enable_page_index                          | true                      | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.execution.parquet.pruning                                    | true                      | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.execution.parquet.skip_metadata                              | true                      | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.execution.parquet.metadata_size_hint                         | NULL                      | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.parquet.pushdown_filters                           | false                     | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.reorder_filters                            | false                     | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.execution.parquet.schema_force_view_types                    | true                      | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.parquet.binary_as_string                           | false                     | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.coerce_int96                               | NULL                      | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.parquet.bloom_filter_on_read                       | true                      | (reading) Use any available bloom filters when reading parquet files                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.parquet.data_pagesize_limit                        | 1048576                   | (writing) Sets best effort maximum size of data page in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| datafusion.execution.parquet.write_batch_size                           | 1024                      | (writing) Sets write_batch_size in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.execution.parquet.writer_version                             | 1.0                       | (writing) Sets parquet writer version valid values are "1.0" and "2.0"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| datafusion.execution.parquet.skip_arrow_metadata                        | false                     | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.execution.parquet.compression                                | zstd(3)                   | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.parquet.dictionary_enabled                         | true                      | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.dictionary_page_size_limit                 | 1048576                   | (writing) Sets best effort maximum dictionary page size, in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.parquet.statistics_enabled                         | page                      | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| datafusion.execution.parquet.max_statistics_size                        | 4096                      | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.parquet.max_row_group_size                         | 1048576                   | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.execution.parquet.created_by                                 | datafusion version 47.0.0 | (writing) Sets "created by" property                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.parquet.column_index_truncate_length               | 64                        | (writing) Sets column index truncate length                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.execution.parquet.statistics_truncate_length                 | NULL                      | (writing) Sets statictics truncate length. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| datafusion.execution.parquet.data_page_row_count_limit                  | 20000                     | (writing) Sets best effort maximum number of rows in data page                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.encoding                                   | NULL                      | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.parquet.bloom_filter_on_write                      | false                     | (writing) Write bloom filters for all columns when creating parquet files                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.execution.parquet.bloom_filter_fpp                           | NULL                      | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.parquet.bloom_filter_ndv                           | NULL                      | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| datafusion.execution.parquet.allow_single_file_parallelism              | true                      | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.maximum_parallel_row_group_writers         | 1                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.execution.planning_concurrency                               | 0                         | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.skip_physical_aggregate_schema_check               | false                     | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.sort_spill_reservation_bytes                       | 10485760                  | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.sort_in_place_threshold_bytes                      | 1048576                   | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| datafusion.execution.meta_fetch_concurrency                             | 32                        | Number of files to read in parallel when inferring schema and statistics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.execution.minimum_parallel_output_files                      | 4                         | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| datafusion.execution.soft_max_rows_per_output_file                      | 50000000                  | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-| datafusion.execution.max_buffered_batches_per_output_file               | 2                         | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.execution.listing_table_ignore_subdirectory                  | true                      | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.execution.enable_recursive_ctes                              | true                      | Should DataFusion support recursive CTEs                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.execution.split_file_groups_by_statistics                    | false                     | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| datafusion.execution.keep_partition_by_columns                          | false                     | Should DataFusion keep the columns used for partition_by in the output RecordBatches                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold     | 0.8                       | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.execution.skip_partial_aggregation_probe_rows_threshold      | 100000                    | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| datafusion.execution.use_row_number_estimates_to_optimize_partitioning  | false                     | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| datafusion.execution.enforce_batch_size_in_joins                        | false                     | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.execution.objectstore_writer_buffer_size                     | 10485760                  | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.optimizer.enable_distinct_aggregation_soft_limit             | true                      | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| datafusion.optimizer.enable_round_robin_repartition                     | true                      | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.optimizer.enable_topk_aggregation                            | true                      | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.optimizer.filter_null_join_keys                              | false                     | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.optimizer.repartition_aggregations                           | true                      | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.optimizer.repartition_file_min_size                          | 10485760                  | Minimum total files size in bytes to perform file scan repartitioning.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| datafusion.optimizer.repartition_joins                                  | true                      | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-| datafusion.optimizer.allow_symmetric_joins_without_pruning              | true                      | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors.                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.optimizer.repartition_file_scans                             | true                      | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation. |
-| datafusion.optimizer.repartition_windows                                | true                      | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-| datafusion.optimizer.repartition_sorts                                  | true                      | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", `                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.optimizer.prefer_existing_sort                               | false                     | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.optimizer.skip_failed_rules                                  | false                     | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.optimizer.max_passes                                         | 3                         | Number of times that the optimizer will attempt to optimize the plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.optimizer.top_down_join_key_reordering                       | true                      | When set to true, the physical plan optimizer will run a top down process to reorder the join keys                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| datafusion.optimizer.prefer_hash_join                                   | true                      | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.optimizer.hash_join_single_partition_threshold               | 1048576                   | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.optimizer.hash_join_single_partition_threshold_rows          | 131072                    | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| datafusion.optimizer.default_filter_selectivity                         | 20                        | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.optimizer.prefer_existing_union                              | false                     | When set to true, the optimizer will not attempt to convert Union to Interleave                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| datafusion.optimizer.expand_views_at_output                             | false                     | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.explain.logical_plan_only                                    | false                     | When set to true, the explain statement will only print logical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.explain.physical_plan_only                                   | false                     | When set to true, the explain statement will only print physical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| datafusion.explain.show_statistics                                      | false                     | When set to true, the explain statement will print operator statistics for physical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.explain.show_sizes                                           | true                      | When set to true, the explain statement will print the partition sizes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| datafusion.explain.show_schema                                          | false                     | When set to true, the explain statement will print schema information                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.explain.format                                               | indent                    | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.sql_parser.parse_float_as_decimal                            | false                     | When set to true, SQL parser will parse float as decimal type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| datafusion.sql_parser.enable_ident_normalization                        | true                      | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.sql_parser.enable_options_value_normalization                | false                     | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.sql_parser.dialect                                           | generic                   | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| datafusion.sql_parser.support_varchar_with_length                       | true                      | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.sql_parser.map_varchar_to_utf8view                           | true                      | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| datafusion.sql_parser.collect_spans                                     | false                     | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.sql_parser.recursion_limit                                   | 50                        | Specifies the recursion depth limit when parsing complex SQL Queries                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| datafusion.format.safe                                                  | true                      | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.format.null                                                  |                           | Format string for nulls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| datafusion.format.date_format                                           | %Y-%m-%d                  | Date format for date arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.format.datetime_format                                       | %Y-%m-%dT%H:%M:%S%.f      | Format for DateTime arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.format.timestamp_format                                      | %Y-%m-%dT%H:%M:%S%.f      | Timestamp format for timestamp arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| datafusion.format.timestamp_tz_format                                   | NULL                      | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| datafusion.format.time_format                                           | %H:%M:%S%.f               | Time format for time arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| datafusion.format.duration_format                                       | pretty                    | Duration format. Can be either `"pretty"` or `"ISO8601"`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| datafusion.format.types_info                                            | false                     | Show types in visual representation batches                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+DataFusion configurations control various aspects of DataFusion planning and execution
+
+## Setting Configuration Options
+
+### Programmatically
+
+You can set the options programmatically via the [`ConfigOptions`] object. For
+example, to configure the `datafusion.execution.target_partitions` using the API:
+
+```rust
+use datafusion::common::config::ConfigOptions;
+let mut config = ConfigOptions::new();
+config.execution.target_partitions = 1;
+```
+
+### Via Environment Variables
+
+You can also set configuration options via environment variables using
+[`ConfigOptions::from_env`], for example
+
+```shell
+DATAFUSION_EXECUTION_TARGET_PARTITIONS=1 ./your_program
+```
+
+### Via SQL
+
+You can also set configuration options via SQL using the `SET` command. For
+example, to configure `datafusion.execution.target_partitions`:
+
+```sql
+SET datafusion.execution.target_partitions = '1';
+```
+
+[`configoptions`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html
+[`configoptions::from_env`]: https://docs.rs/datafusion/latest/datafusion/common/config/struct.ConfigOptions.html#method.from_env
+
+The following configuration settings are available:
+
+| key                                                                     | default                   | description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| ----------------------------------------------------------------------- | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| datafusion.catalog.create_default_catalog_and_schema                    | true                      | Whether the default catalog and schema should be created automatically.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.catalog.default_catalog                                      | datafusion                | The default catalog name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.catalog.default_schema                                       | public                    | The default schema name - this impacts what SQL queries use if not specified                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| datafusion.catalog.information_schema                                   | false                     | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.catalog.location                                             | NULL                      | Location scanned to load tables for `default` schema                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.catalog.format                                               | NULL                      | Type of `TableProvider` to use when loading `default` schema                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| datafusion.catalog.has_header                                           | true                      | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.catalog.newlines_in_values                                   | false                     | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.execution.batch_size                                         | 8192                      | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.perfect_hash_join_small_build_threshold            | 1024                      | A perfect hash join (see `HashJoinExec` for more details) will be considered if the range of keys (max - min) on the build side is < this threshold. This provides a fast path for joins with very small key ranges, bypassing the density check. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| datafusion.execution.perfect_hash_join_min_key_density                  | 0.15                      | The minimum required density of join keys on the build side to consider a perfect hash join (see `HashJoinExec` for more details). Density is calculated as: `(number of rows) / (max_key - min_key + 1)`. A perfect hash join may be used if the actual key density > this value. Currently only supports cases where build_side.num_rows() < u32::MAX. Support for build_side.num_rows() >= u32::MAX will be added in the future.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.coalesce_batches                                   | true                      | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.execution.collect_statistics                                 | true                      | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.execution.target_partitions                                  | 0                         | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.execution.time_zone                                          | NULL                      | The default time zone Some functions, e.g. `now` return timestamps in this time zone                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.enable_page_index                          | true                      | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.execution.parquet.pruning                                    | true                      | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.execution.parquet.skip_metadata                              | true                      | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.execution.parquet.metadata_size_hint                         | 524288                    | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.pushdown_filters                           | false                     | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.reorder_filters                            | false                     | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.execution.parquet.force_filter_selections                    | false                     | (reading) Force the use of RowSelections for filter results, when pushdown_filters is enabled. If false, the reader will automatically choose between a RowSelection and a Bitmap based on the number and pattern of selected rows.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.parquet.schema_force_view_types                    | true                      | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.binary_as_string                           | false                     | (reading) If true, parquet reader will read columns of `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`. Parquet files generated by some legacy writers do not correctly set the UTF8 flag for strings, causing string columns to be loaded as BLOB instead.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.coerce_int96                               | NULL                      | (reading) If true, parquet reader will read columns of physical type int96 as originating from a different resolution than nanosecond. This is useful for reading data from systems like Spark which stores microsecond resolution timestamps in an int96 allowing it to write values with a larger date range than 64-bit timestamps with nanosecond resolution.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.execution.parquet.bloom_filter_on_read                       | true                      | (reading) Use any available bloom filters when reading parquet files                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.max_predicate_cache_size                   | NULL                      | (reading) The maximum predicate cache size, in bytes. When `pushdown_filters` is enabled, sets the maximum memory used to cache the results of predicate evaluation between filter evaluation and output generation. Decreasing this value will reduce memory usage, but may increase IO and CPU usage. None means use the default parquet reader setting. 0 means no caching.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.data_pagesize_limit                        | 1048576                   | (writing) Sets best effort maximum size of data page in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.execution.parquet.write_batch_size                           | 1024                      | (writing) Sets write_batch_size in rows                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.execution.parquet.writer_version                             | 1.0                       | (writing) Sets parquet writer version valid values are "1.0" and "2.0"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.execution.parquet.skip_arrow_metadata                        | false                     | (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.execution.parquet.compression                                | zstd(3)                   | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| datafusion.execution.parquet.dictionary_enabled                         | true                      | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.dictionary_page_size_limit                 | 1048576                   | (writing) Sets best effort maximum dictionary page size, in bytes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.execution.parquet.statistics_enabled                         | page                      | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.execution.parquet.max_row_group_size                         | 1048576                   | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.execution.parquet.created_by                                 | datafusion version 52.3.0 | (writing) Sets "created by" property                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.column_index_truncate_length               | 64                        | (writing) Sets column index truncate length                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.execution.parquet.statistics_truncate_length                 | 64                        | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.execution.parquet.data_page_row_count_limit                  | 20000                     | (writing) Sets best effort maximum number of rows in data page                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.encoding                                   | NULL                      | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.bloom_filter_on_write                      | false                     | (writing) Write bloom filters for all columns when creating parquet files                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.execution.parquet.bloom_filter_fpp                           | NULL                      | (writing) Sets bloom filter false positive probability. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.parquet.bloom_filter_ndv                           | NULL                      | (writing) Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.parquet.allow_single_file_parallelism              | true                      | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.maximum_parallel_row_group_writers         | 1                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.execution.planning_concurrency                               | 0                         | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.skip_physical_aggregate_schema_check               | false                     | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.execution.spill_compression                                  | uncompressed              | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.sort_spill_reservation_bytes                       | 10485760                  | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.execution.sort_in_place_threshold_bytes                      | 1048576                   | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.execution.max_spill_file_size_bytes                          | 134217728                 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.meta_fetch_concurrency                             | 32                        | Number of files to read in parallel when inferring schema and statistics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.execution.minimum_parallel_output_files                      | 4                         | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| datafusion.execution.soft_max_rows_per_output_file                      | 50000000                  | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| datafusion.execution.max_buffered_batches_per_output_file               | 2                         | This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.execution.listing_table_ignore_subdirectory                  | true                      | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.execution.listing_table_factory_infer_partitions             | true                      | Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.execution.enable_recursive_ctes                              | true                      | Should DataFusion support recursive CTEs                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.execution.split_file_groups_by_statistics                    | false                     | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.execution.keep_partition_by_columns                          | false                     | Should DataFusion keep the columns used for partition_by in the output RecordBatches                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.skip_partial_aggregation_probe_ratio_threshold     | 0.8                       | Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.execution.skip_partial_aggregation_probe_rows_threshold      | 100000                    | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| datafusion.execution.use_row_number_estimates_to_optimize_partitioning  | false                     | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| datafusion.execution.enforce_batch_size_in_joins                        | false                     | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.execution.objectstore_writer_buffer_size                     | 10485760                  | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.execution.enable_ansi_mode                                   | false                     | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default.                          |
+| datafusion.execution.hash_join_buffering_capacity                       | 0                         | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.optimizer.enable_distinct_aggregation_soft_limit             | true                      | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.optimizer.enable_round_robin_repartition                     | true                      | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.optimizer.enable_topk_aggregation                            | true                      | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.optimizer.enable_window_limits                               | true                      | When set to true, the optimizer will attempt to push limit operations past window functions, if possible                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.enable_topk_repartition                            | true                      | When set to true, the optimizer will push TopK (Sort with fetch) below hash repartition when the partition key is a prefix of the sort key, reducing data volume before the shuffle.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.optimizer.enable_topk_dynamic_filter_pushdown                | true                      | When set to true, the optimizer will attempt to push down TopK dynamic filters into the file scan phase.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.enable_join_dynamic_filter_pushdown                | true                      | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown           | true                      | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.optimizer.enable_dynamic_filter_pushdown                     | true                      | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.filter_null_join_keys                              | false                     | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.optimizer.repartition_aggregations                           | true                      | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.optimizer.repartition_file_min_size                          | 10485760                  | Minimum total files size in bytes to perform file scan repartitioning.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.optimizer.repartition_joins                                  | true                      | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| datafusion.optimizer.allow_symmetric_joins_without_pruning              | true                      | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.repartition_file_scans                             | true                      | When set to `true`, datasource partitions will be repartitioned to achieve maximum parallelism. This applies to both in-memory partitions and FileSource's file groups (1 group is 1 partition). For FileSources, only Parquet and CSV formats are currently supported. If set to `true` for a FileSource, all files will be repartitioned evenly (i.e., a single large file might be partitioned into smaller chunks) for parallel scanning. If set to `false` for a FileSource, different files will be read in parallel, but repartitioning won't happen within a single file. If set to `true` for an in-memory source, all memtable's partitions will have their batches repartitioned evenly to the desired number of `target_partitions`. Repartitioning can change the total number of partitions and batches per partition, but does not slice the initial record tables provided to the MemTable on creation.                                                                                                                                                                                                                                                                                                                      |
+| datafusion.optimizer.preserve_file_partitions                           | 0                         | Minimum number of distinct partition values required to group files by their Hive partition column values (enabling Hash partitioning declaration). How the option is used: - preserve_file_partitions=0: Disable it. - preserve_file_partitions=1: Always enable it. - preserve_file_partitions=N, actual file partitions=M: Only enable when M >= N. This threshold preserves I/O parallelism when file partitioning is below it. Note: This may reduce parallelism, rooting from the I/O level, if the number of distinct partitions is less than the target_partitions.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.optimizer.repartition_windows                                | true                      | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| datafusion.optimizer.repartition_sorts                                  | true                      | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", `                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.optimizer.subset_repartition_threshold                       | 4                         | Partition count threshold for subset satisfaction optimization. When the current partition count is >= this threshold, DataFusion will skip repartitioning if the required partitioning expression is a subset of the current partition expression such as Hash(a) satisfies Hash(a, b). When the current partition count is < this threshold, DataFusion will repartition to increase parallelism even when subset satisfaction applies. Set to 0 to always repartition (disable subset satisfaction optimization). Set to a high value to always use subset satisfaction. Example (subset_repartition_threshold = 4): `text Hash([a]) satisfies Hash([a, b]) because (Hash([a, b]) is subset of Hash([a]) If current partitions (3) < threshold (4), repartition: AggregateExec: mode=FinalPartitioned, gby=[a, b], aggr=[SUM(x)] RepartitionExec: partitioning=Hash([a, b], 8), input_partitions=3 AggregateExec: mode=Partial, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 3) If current partitions (8) >= threshold (4), use subset satisfaction: AggregateExec: mode=SinglePartitioned, gby=[a, b], aggr=[SUM(x)] DataSourceExec: file_groups={...}, output_partitioning=Hash([a], 8) ` |
+| datafusion.optimizer.prefer_existing_sort                               | false                     | When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.optimizer.skip_failed_rules                                  | false                     | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.optimizer.max_passes                                         | 3                         | Number of times that the optimizer will attempt to optimize the plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.optimizer.top_down_join_key_reordering                       | true                      | When set to true, the physical plan optimizer will run a top down process to reorder the join keys                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| datafusion.optimizer.prefer_hash_join                                   | true                      | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.optimizer.enable_piecewise_merge_join                        | false                     | When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.optimizer.hash_join_single_partition_threshold               | 1048576                   | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.optimizer.hash_join_single_partition_threshold_rows          | 131072                    | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.optimizer.hash_join_inlist_pushdown_max_size                 | 131072                    | Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides larger than this will use hash table lookups instead. Set to 0 to always use hash table lookups. InList pushdown can be more efficient for small build sides because it can result in better statistics pruning as well as use any bloom filters present on the scan side. InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion. On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory. This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` \* `target_partitions` memory. The default is 128kB per partition. This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases but avoids excessive memory usage or overhead for larger joins.                                                                                                                                                                                                             |
+| datafusion.optimizer.hash_join_inlist_pushdown_max_distinct_values      | 150                       | Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering. Build sides with more rows than this will use hash table lookups instead. Set to 0 to always use hash table lookups. This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent very large IN lists that might not provide much benefit over hash table lookups. This uses the deduplicated row count once the build side has been evaluated. The default is 150 values per partition. This is inspired by Trino's `max-filter-keys-per-column` setting. See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.optimizer.default_filter_selectivity                         | 20                        | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.optimizer.prefer_existing_union                              | false                     | When set to true, the optimizer will not attempt to convert Union to Interleave                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| datafusion.optimizer.expand_views_at_output                             | false                     | When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.optimizer.enable_sort_pushdown                               | true                      | Enable sort pushdown optimization. When enabled, attempts to push sort requirements down to data sources that can natively handle them (e.g., by reversing file/row group read order). Returns **inexact ordering**: Sort operator is kept for correctness, but optimized input enables early termination for TopK queries (ORDER BY ... LIMIT N), providing significant speedup. Memory: No additional overhead (only changes read order). Future: Will add option to detect perfectly sorted data and eliminate Sort completely. Default: true                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| datafusion.optimizer.enable_leaf_expression_pushdown                    | true                      | When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.explain.logical_plan_only                                    | false                     | When set to true, the explain statement will only print logical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.explain.physical_plan_only                                   | false                     | When set to true, the explain statement will only print physical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.explain.show_statistics                                      | false                     | When set to true, the explain statement will print operator statistics for physical plans                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.explain.show_sizes                                           | true                      | When set to true, the explain statement will print the partition sizes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| datafusion.explain.show_schema                                          | false                     | When set to true, the explain statement will print schema information                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.explain.format                                               | indent                    | Display format of explain. Default is "indent". When set to "tree", it will print the plan in a tree-rendered format.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.explain.tree_maximum_render_width                            | 240                       | (format=tree only) Maximum total width of the rendered tree. When set to 0, the tree will have no width limit.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.explain.analyze_level                                        | dev                       | Verbosity level for "EXPLAIN ANALYZE". Default is "dev" "summary" shows common metrics for high-level insights. "dev" provides deep operator-level introspection for developers.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| datafusion.sql_parser.parse_float_as_decimal                            | false                     | When set to true, SQL parser will parse float as decimal type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| datafusion.sql_parser.enable_ident_normalization                        | true                      | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| datafusion.sql_parser.enable_options_value_normalization                | false                     | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.sql_parser.dialect                                           | generic                   | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.sql_parser.support_varchar_with_length                       | true                      | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.sql_parser.map_string_types_to_utf8view                      | true                      | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.sql_parser.collect_spans                                     | false                     | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.sql_parser.recursion_limit                                   | 50                        | Specifies the recursion depth limit when parsing complex SQL Queries                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| datafusion.sql_parser.default_null_ordering                             | nulls_max                 | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: <https://www.postgresql.org/docs/current/queries-order.html>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| datafusion.format.safe                                                  | true                      | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.format.null                                                  |                           | Format string for nulls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| datafusion.format.date_format                                           | %Y-%m-%d                  | Date format for date arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.format.datetime_format                                       | %Y-%m-%dT%H:%M:%S%.f      | Format for DateTime arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.format.timestamp_format                                      | %Y-%m-%dT%H:%M:%S%.f      | Timestamp format for timestamp arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| datafusion.format.timestamp_tz_format                                   | NULL                      | Timestamp format for timestamp with timezone arrays. When `None`, ISO 8601 format is used.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.format.time_format                                           | %H:%M:%S%.f               | Time format for time arrays                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| datafusion.format.duration_format                                       | pretty                    | Duration format. Can be either `"pretty"` or `"ISO8601"`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+| datafusion.format.types_info                                            | false                     | Show types in visual representation batches                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+
+# Runtime Configuration Settings
+
+DataFusion runtime configurations can be set via SQL using the `SET` command.
+
+For example, to configure `datafusion.runtime.memory_limit`:
+
+```sql
+SET datafusion.runtime.memory_limit = '2G';
+```
+
+The following runtime configuration settings are available:
+
+| key                                        | default | description                                                                                                                                                               |
+| ------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| datafusion.runtime.list_files_cache_limit  | 1M      | Maximum memory to use for list files cache. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.                             |
+| datafusion.runtime.list_files_cache_ttl    | NULL    | TTL (time-to-live) of the entries in the list file cache. Supports units m (minutes), and s (seconds). Example: '2m' for 2 minutes.                                       |
+| datafusion.runtime.max_temp_directory_size | 100G    | Maximum temporary file directory size. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.                                  |
+| datafusion.runtime.memory_limit            | NULL    | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes.                               |
+| datafusion.runtime.metadata_cache_limit    | 50M     | Maximum memory to use for file metadata cache such as Parquet metadata. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes. |
+| datafusion.runtime.temp_directory          | NULL    | The path to the temporary file directory.                                                                                                                                 |
+
+# Tuning Guide
+
+## Short Queries
+
+By default DataFusion will attempt to maximize parallelism and use all cores --
+For example, if you have 32 cores, each plan will split the data into 32
+partitions. However, if your data is small, the overhead of splitting the data
+to enable parallelization can dominate the actual computation.
+
+You can find out how many cores are being used via the [`EXPLAIN`] command and look
+at the number of partitions in the plan.
+
+[`explain`]: sql/explain.md
+
+The `datafusion.optimizer.repartition_file_min_size` option controls the minimum file size the
+[`ListingTable`] provider will attempt to repartition. However, this
+does not apply to user defined data sources and only works when DataFusion has accurate statistics.
+
+If you know your data is small, you can set the `datafusion.execution.target_partitions`
+option to a smaller number to reduce the overhead of repartitioning. For very small datasets (e.g. less
+than 1MB), we recommend setting `target_partitions` to 1 to avoid repartitioning altogether.
+
+```sql
+SET datafusion.execution.target_partitions = '1';
+```
+
+[`listingtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTable.html
+
+## Memory-limited Queries
+
+When executing a memory-consuming query under a tight memory limit, DataFusion
+will spill intermediate results to disk.
+
+When the [`FairSpillPool`] is used, memory is divided evenly among partitions.
+The higher the value of `datafusion.execution.target_partitions`, the less memory
+is allocated to each partition, and the out-of-core execution path may trigger
+more frequently, possibly slowing down execution.
+
+Additionally, while spilling, data is read back in `datafusion.execution.batch_size` size batches.
+The larger this value, the fewer spilled sorted runs can be merged. Decreasing this setting
+can help reduce the number of subsequent spills required.
+
+In conclusion, for queries under a very tight memory limit, it's recommended to
+set `target_partitions` and `batch_size` to smaller values.
+
+```sql
+-- Query still gets parallelized, but each partition will have more memory to use
+SET datafusion.execution.target_partitions = 4;
+-- Smaller than the default '8192', while still keep the benefit of vectorized execution
+SET datafusion.execution.batch_size = 1024;
+```
+
+[`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
+
+## Join Queries
+
+Currently Apache Datafusion supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics + join
+condition of the two tables.
+
+# Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+```sql
+SET datafusion.optimizer.<configuration_name>;
+```
+
+Example
+
+```sql
+SET datafusion.optimizer.prefer_hash_join = false;
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query:
+
+## Join Optimizer Configurations
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers to be produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join performance wise for single range filter
+  except for cases where it is joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.
diff --git a/docs/source/user-guide/crate-configuration.md b/docs/source/user-guide/crate-configuration.md
index f4a1910f5f78f..44b4d39839c5b 100644
--- a/docs/source/user-guide/crate-configuration.md
+++ b/docs/source/user-guide/crate-configuration.md
@@ -19,18 +19,20 @@
 
 # Crate Configuration
 
-This section contains information on how to configure DataFusion in your Rust
-project. See the [Configuration Settings] section for a list of options that
-control DataFusion's behavior.
+This section contains information on how to configure builds of DataFusion in
+your Rust project. The [Configuration Settings] section lists options that
+control additional aspects DataFusion's runtime behavior.
 
 [configuration settings]: configs.md
+[support for adding dependencies]: https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies
 
-## Add latest non published DataFusion dependency
+## Using the nightly DataFusion builds
 
 DataFusion changes are published to `crates.io` according to the [release schedule](https://github.com/apache/datafusion/blob/main/dev/release/README.md#release-process)
 
-If you would like to test out DataFusion changes which are merged but not yet
-published, Cargo supports adding dependency directly to GitHub branch:
+If you would like to use or test versions of the DataFusion code which are
+merged but not yet published, you can use Cargo's [support for adding
+dependencies] directly to a GitHub branch:
 
 ```toml
 datafusion = { git = "https://github.com/apache/datafusion", branch = "main"}
@@ -50,22 +52,88 @@ datafusion = { git = "https://github.com/apache/datafusion", branch = "main", de
 
 More on [Cargo dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies)
 
-## Optimized Configuration
+## Optimizing Builds
 
-For an optimized build several steps are required. First, use the below in your `Cargo.toml`. It is
-worth noting that using the settings in the `[profile.release]` section will significantly increase the build time.
+Here are several suggestions to get the Rust compler to produce faster code when
+compiling DataFusion. Note that these changes may increase compile time and
+binary size.
 
-```toml
-[dependencies]
-datafusion = { version = "22.0" }
-tokio = { version = "^1.0", features = ["rt-multi-thread"] }
-snmalloc-rs = "0.3"
+### Generate Code with CPU Specific Instructions
+
+By default, the Rust compiler produces code that runs on a wide range of CPUs,
+but may not take advantage of all the features of your specific CPU (such as
+certain [SIMD instructions]). This is especially true for x86_64 CPUs, where the
+default target is `x86_64-unknown-linux-gnu`, which only guarantees support for
+the `SSE2` instruction set. DataFusion can benefit from the more advanced
+instructions in the `AVX2` and `AVX512` to speed up operations like filtering,
+aggregation, and joins. To tell the Rust compiler to use these instructions, set
+the `RUSTFLAGS` environment variable to specify a more specific target CPU.
+
+We recommend setting `target-cpu` or at least `avx2`, or preferably at least
+`native` (whatever the current CPU is). For example, to build and run DataFusion
+with optimizations for your current CPU:
+
+```shell
+RUSTFLAGS='-C target-cpu=native' cargo run --release
+```
+
+[simd instructions]: https://en.wikipedia.org/wiki/SIMD
+
+### Enable Link Time Optimization / Single Codegen Unit
 
+You can potentially improve your performance by compiling DataFusion into a
+single codegen unit which gives the Rust compiler more opportunity to optimize
+across crate boundaries. To do so, modify your projects' `Cargo.toml` to include
+`lto = true` and `codegen-units = 1` as shown below. Beware that using a single
+codegen unit _significantly_ increases `--release` build times.
+
+```toml
 [profile.release]
 lto = true
 codegen-units = 1
 ```
 
+### Profile Guided Optimization (PGO)
+
+Profile Guided Optimization can improve DataFusion performance by up to 25%. It works by compiling with instrumentation, running representative workloads to collect profile data, then recompiling with optimizations based on that data.
+
+Build with instrumentation:
+
+```shell
+RUSTFLAGS="-C profile-generate=/tmp/pgo-data" cargo build --release
+```
+
+Run your workloads to collect profile data. Use benchmarks like TPCH or Clickbench, or your actual production queries:
+
+```shell
+./target/release/your-datafusion-app --benchmark
+```
+
+Rebuild using the collected profile:
+
+```shell
+RUSTFLAGS="-C profile-use=/tmp/pgo-data" cargo build --release
+```
+
+Tips:
+
+- Use workloads that match your production patterns
+- Run multiple iterations during profiling for better coverage
+- Combine with LTO and CPU-specific optimizations for best results
+
+See the [Rust compiler guide](https://rustc-dev-guide.rust-lang.org/building/optimized-build.html#profile-guided-optimization) for more details. Discussion and results in [issue #9507](https://github.com/apache/datafusion/issues/9507).
+
+### Alternate Allocator: `snmalloc`
+
+You can also use [snmalloc-rs](https://crates.io/crates/snmalloc-rs) crate as
+the memory allocator for DataFusion to improve performance. To do so, add the
+dependency to your `Cargo.toml` as shown below.
+
+```toml
+[dependencies]
+snmalloc-rs = "0.3"
+```
+
 Then, in `main.rs.` update the memory allocator with the below after your imports:
 
 <!-- Note can't include snmalloc-rs in a runnable example, because it takes over the global allocator -->
@@ -82,20 +150,13 @@ async fn main() -> datafusion::error::Result<()> {
 }
 ```
 
-Based on the instruction set architecture you are building on you will want to configure the `target-cpu` as well, ideally
-with `native` or at least `avx2`.
-
-```shell
-RUSTFLAGS='-C target-cpu=native' cargo run --release
-```
-
-## Enable backtraces
+## Enable Backtraces
 
-By default Datafusion returns errors as a plain message. There is option to enable more verbose details about the error,
-like error backtrace. To enable a backtrace you need to add Datafusion `backtrace` feature to your `Cargo.toml` file:
+By default, Datafusion returns errors as a plain text message. You can enable more verbose details about the error,
+such as backtraces by enabling the `backtrace` feature to your `Cargo.toml` file like this:
 
 ```toml
-datafusion = { version = "31.0.0", features = ["backtrace"]}
+datafusion = { version = "52.1.0", features = ["backtrace"]}
 ```
 
 Set environment [variables](https://doc.rust-lang.org/std/backtrace/index.html#environment-variables)
diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md
index 82f1eeb2823dc..85724a72399ad 100644
--- a/docs/source/user-guide/dataframe.md
+++ b/docs/source/user-guide/dataframe.md
@@ -19,6 +19,8 @@
 
 # DataFrame API
 
+## DataFrame overview
+
 A DataFrame represents a logical set of rows with the same named columns,
 similar to a [Pandas DataFrame] or [Spark DataFrame].
 
diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md
index 6108315f398aa..83ba530d2b3b6 100644
--- a/docs/source/user-guide/example-usage.md
+++ b/docs/source/user-guide/example-usage.md
@@ -29,7 +29,7 @@ Find latest available Datafusion version on [DataFusion's
 crates.io] page. Add the dependency to your `Cargo.toml` file:
 
 ```toml
-datafusion = "latest_version"
+datafusion = "52.1.0"
 tokio = { version = "1.0", features = ["rt-multi-thread"] }
 ```
 
@@ -103,8 +103,8 @@ exported by DataFusion, for example:
 use datafusion::arrow::datatypes::Schema;
 ```
 
-For example, [DataFusion `25.0.0` dependencies] require `arrow`
-`39.0.0`. If instead you used `arrow` `40.0.0` in your project you may
+For example, [DataFusion `26.0.0` dependencies] require `arrow`
+`40.0.0`. If instead you used `arrow` `41.0.0` in your project you may
 see errors such as:
 
 ```text
diff --git a/docs/source/user-guide/explain-usage.md b/docs/source/user-guide/explain-usage.md
index 68712012f43fc..c047659e9940d 100644
--- a/docs/source/user-guide/explain-usage.md
+++ b/docs/source/user-guide/explain-usage.md
@@ -225,14 +225,13 @@ Again, reading from bottom up:
 
 When predicate pushdown is enabled, `DataSourceExec` with `ParquetSource` gains the following metrics:
 
-- `page_index_rows_matched`: number of rows in pages that were tested by a page index filter, and passed
-- `page_index_rows_pruned`: number of rows in pages that were tested by a page index filter, and did not pass
-- `row_groups_matched_bloom_filter`: number of rows in row groups that were tested by a Bloom Filter, and passed
-- `row_groups_pruned_bloom_filter`: number of rows in row groups that were tested by a Bloom Filter, and did not pass
-- `row_groups_matched_statistics`: number of rows in row groups that were tested by row group statistics (min and max value), and passed
-- `row_groups_pruned_statistics`: number of rows in row groups that were tested by row group statistics (min and max value), and did not pass
-- `pushdown_rows_matched`: rows that were tested by any of the above filtered, and passed all of them (this should be minimum of `page_index_rows_matched`, `row_groups_pruned_bloom_filter`, and `row_groups_pruned_statistics`)
-- `pushdown_rows_pruned`: rows that were tested by any of the above filtered, and did not pass one of them (this should be sum of `page_index_rows_matched`, `row_groups_pruned_bloom_filter`, and `row_groups_pruned_statistics`)
+- `page_index_rows_pruned`: number of rows evaluated by page index filters. The metric reports both how many rows were considered in total and how many matched (were not pruned).
+- `page_index_pages_pruned`: number of pages evaluated by page index filters. The metric reports both how many pages were considered in total and how many matched (were not pruned).
+- `row_groups_pruned_bloom_filter`: number of row groups evaluated by Bloom Filters, reporting both total checked groups and groups that matched.
+- `row_groups_pruned_statistics`: number of row groups evaluated by row-group statistics (min/max), reporting both total checked groups and groups that matched.
+- `limit_pruned_row_groups`: number of row groups pruned by the limit.
+- `pushdown_rows_matched`: rows that were tested by any of the above filters, and passed all of them.
+- `pushdown_rows_pruned`: rows that were tested by any of the above filters, and did not pass at least one of them.
 - `predicate_evaluation_errors`: number of times evaluating the filter expression failed (expected to be zero in normal operation)
 - `num_predicate_creation_errors`: number of errors creating predicates (expected to be zero in normal operation)
 - `bloom_filter_eval_time`: time spent parsing and evaluating Bloom Filters
@@ -249,7 +248,7 @@ a separate core. Data crosses between cores only within certain operators such a
 
 You can read more about this in the [Partitioning Docs].
 
-[partitoning docs]: https://docs.rs/datafusion/latest/datafusion/physical_expr/enum.Partitioning.html
+[partitioning docs]: https://docs.rs/datafusion/latest/datafusion/physical_expr/enum.Partitioning.html
 
 ## Example of an Aggregate Query
 
diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md
index 03ab86eeb813a..56d78ac473f14 100644
--- a/docs/source/user-guide/expressions.md
+++ b/docs/source/user-guide/expressions.md
@@ -179,8 +179,8 @@ select log(-1), log(0), sqrt(-1);
 | ascii(character)                               | Returns a numeric representation of the character (`character`). Example: `ascii('a') -> 97`                                                                                                                                             |
 | bit_length(text)                               | Returns the length of the string (`text`) in bits. Example: `bit_length('spider') -> 48`                                                                                                                                                 |
 | btrim(text, characters)                        | Removes all specified characters (`characters`) from both the beginning and the end of the string (`text`). Example: `btrim('aabchelloccb', 'abc') -> hello`                                                                             |
-| char_length(text)                              | Returns number of characters in the string (`text`). The same as `character_length` and `length`. Example: `character_length('lion') -> 4`                                                                                               |
-| character_length(text)                         | Returns number of characters in the string (`text`). The same as `char_length` and `length`. Example: `char_length('lion') -> 4`                                                                                                         |
+| char_length(text)                              | Returns number of characters in the string (`text`). The same as `character_length` and `length`. Example: `char_length('lion') -> 4`                                                                                                    |
+| character_length(text)                         | Returns number of characters in the string (`text`). The same as `char_length` and `length`. Example: `character_length('lion') -> 4`                                                                                                    |
 | concat(value1, [value2 [, ...]])               | Concatenates the text representations (`value1, [value2 [, ...]]`) of all the arguments. NULL arguments are ignored. Example: `concat('aaa', 'bbc', NULL, 321) -> aaabbc321`                                                             |
 | concat_ws(separator, value1, [value2 [, ...]]) | Concatenates the text representations (`value1, [value2 [, ...]]`) of all the arguments with the separator (`separator`). NULL arguments are ignored. `concat_ws('/', 'path', 'to', NULL, 'my', 'folder', 123) -> path/to/my/folder/123` |
 | chr(integer)                                   | Returns a character by its numeric representation (`integer`). Example: `chr(90) -> 8`                                                                                                                                                   |
@@ -285,33 +285,35 @@ select log(-1), log(0), sqrt(-1);
 
 ## Aggregate Functions
 
-| Syntax                                                            | Description                                                                             |
-| ----------------------------------------------------------------- | --------------------------------------------------------------------------------------- |
-| avg(expr)                                                         | Сalculates the average value for `expr`.                                                |
-| approx_distinct(expr)                                             | Calculates an approximate count of the number of distinct values for `expr`.            |
-| approx_median(expr)                                               | Calculates an approximation of the median for `expr`.                                   |
-| approx_percentile_cont(expr, percentile)                          | Calculates an approximation of the specified `percentile` for `expr`.                   |
-| approx_percentile_cont_with_weight(expr, weight_expr, percentile) | Calculates an approximation of the specified `percentile` for `expr` and `weight_expr`. |
-| bit_and(expr)                                                     | Computes the bitwise AND of all non-null input values for `expr`.                       |
-| bit_or(expr)                                                      | Computes the bitwise OR of all non-null input values for `expr`.                        |
-| bit_xor(expr)                                                     | Computes the bitwise exclusive OR of all non-null input values for `expr`.              |
-| bool_and(expr)                                                    | Returns true if all non-null input values (`expr`) are true, otherwise false.           |
-| bool_or(expr)                                                     | Returns true if any non-null input value (`expr`) is true, otherwise false.             |
-| count(expr)                                                       | Returns the number of rows for `expr`.                                                  |
-| count_distinct                                                    | Creates an expression to represent the count(distinct) aggregate function               |
-| cube(exprs)                                                       | Creates a grouping set for all combination of `exprs`                                   |
-| grouping_set(exprs)                                               | Create a grouping set.                                                                  |
-| max(expr)                                                         | Finds the maximum value of `expr`.                                                      |
-| median(expr)                                                      | Сalculates the median of `expr`.                                                        |
-| min(expr)                                                         | Finds the minimum value of `expr`.                                                      |
-| rollup(exprs)                                                     | Creates a grouping set for rollup sets.                                                 |
-| sum(expr)                                                         | Сalculates the sum of `expr`.                                                           |
+| Syntax                                                                          | Description                                                                                                                                              |
+| ------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| avg(expr)                                                                       | Сalculates the average value for `expr`.                                                                                                                 |
+| avg_distinct(expr)                                                              | Creates an expression to represent the avg(distinct) aggregate function                                                                                  |
+| approx_distinct(expr)                                                           | Calculates an approximate count of the number of distinct values for `expr`.                                                                             |
+| approx_median(expr)                                                             | Calculates an approximation of the median for `expr`.                                                                                                    |
+| approx_percentile_cont(expr, percentile [, centroids])                          | Calculates an approximation of the specified `percentile` for `expr`. Optional `centroids` parameter controls accuracy (default: 100).                   |
+| approx_percentile_cont_with_weight(expr, weight_expr, percentile [, centroids]) | Calculates an approximation of the specified `percentile` for `expr` and `weight_expr`. Optional `centroids` parameter controls accuracy (default: 100). |
+| bit_and(expr)                                                                   | Computes the bitwise AND of all non-null input values for `expr`.                                                                                        |
+| bit_or(expr)                                                                    | Computes the bitwise OR of all non-null input values for `expr`.                                                                                         |
+| bit_xor(expr)                                                                   | Computes the bitwise exclusive OR of all non-null input values for `expr`.                                                                               |
+| bool_and(expr)                                                                  | Returns true if all non-null input values (`expr`) are true, otherwise false.                                                                            |
+| bool_or(expr)                                                                   | Returns true if any non-null input value (`expr`) is true, otherwise false.                                                                              |
+| count(expr)                                                                     | Returns the number of rows for `expr`.                                                                                                                   |
+| count_distinct(expr)                                                            | Creates an expression to represent the count(distinct) aggregate function                                                                                |
+| cube(exprs)                                                                     | Creates a grouping set for all combination of `exprs`                                                                                                    |
+| grouping_set(exprs)                                                             | Create a grouping set.                                                                                                                                   |
+| max(expr)                                                                       | Finds the maximum value of `expr`.                                                                                                                       |
+| median(expr)                                                                    | Сalculates the median of `expr`.                                                                                                                         |
+| min(expr)                                                                       | Finds the minimum value of `expr`.                                                                                                                       |
+| rollup(exprs)                                                                   | Creates a grouping set for rollup sets.                                                                                                                  |
+| sum(expr)                                                                       | Сalculates the sum of `expr`.                                                                                                                            |
+| sum_distinct(expr)                                                              | Creates an expression to represent the sum(distinct) aggregate function                                                                                  |
 
 ## Aggregate Function Builder
 
 You can also use the `ExprFunctionExt` trait to more easily build Aggregate arguments `Expr`.
 
-See `datafusion-examples/examples/expr_api.rs` for example usage.
+See `datafusion-examples/examples/query_planning/expr_api.rs` for example usage.
 
 | Syntax                                                                  | Equivalent to                       |
 | ----------------------------------------------------------------------- | ----------------------------------- |
diff --git a/docs/source/user-guide/features.md b/docs/source/user-guide/features.md
index 1f73ce7eac113..967e81e681f50 100644
--- a/docs/source/user-guide/features.md
+++ b/docs/source/user-guide/features.md
@@ -43,7 +43,7 @@
 - [x] Filter (`WHERE`)
 - [x] Filter post-aggregate (`HAVING`)
 - [x] Sorting (`ORDER BY`)
-- [x] Limit (`LIMIT`
+- [x] Limit (`LIMIT`)
 - [x] Aggregate (`GROUP BY`)
 - [x] cast /try_cast
 - [x] [`VALUES` lists](https://www.postgresql.org/docs/current/queries-values.html)
@@ -93,7 +93,8 @@
 - [x] Memory limits enforced
 - [x] Spilling (to disk) Sort
 - [x] Spilling (to disk) Grouping
-- [ ] Spilling (to disk) Joins
+- [x] Spilling (to disk) Sort Merge Join
+- [ ] Spilling (to disk) Hash Join
 
 ## Data Sources
 
diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md
index 4151d7600b574..9ad42a2a1015b 100644
--- a/docs/source/user-guide/introduction.md
+++ b/docs/source/user-guide/introduction.md
@@ -78,15 +78,16 @@ Here are some example systems built using DataFusion:
 - Specialized Analytical Database systems such as [HoraeDB] and more general Apache Spark like system such as [Ballista]
 - New query language engines such as [prql-query] and accelerators such as [VegaFusion]
 - Research platform for new Database Systems, such as [Flock]
-- SQL support to another library, such as [dask sql]
+- SQL support to another library, such as [Vortex]
 - Streaming data platforms such as [Synnada]
 - Tools for reading / sorting / transcoding Parquet, CSV, AVRO, and JSON files such as [qv]
-- Native Spark runtime replacement such as [Blaze]
+- Native Spark runtime replacement such as [Auron]
+- Distributed data cache to boost GPU utilization of AI workloads with [Kubeflow Trainer](https://www.kubeflow.org/docs/components/trainer/user-guides/data-cache/)
 
 By using DataFusion, projects are freed to focus on their specific
 features, and avoid reimplementing general (but still necessary)
 features such as an expression representation, standard optimizations,
-parellelized streaming execution plans, file format support, etc.
+parallelized streaming execution plans, file format support, etc.
 
 ## Known Users
 
@@ -96,57 +97,70 @@ Here are some active projects using DataFusion:
 
 - [Arroyo](https://github.com/ArroyoSystems/arroyo) Distributed stream processing engine in Rust
 - [ArkFlow](https://github.com/arkflow-rs/arkflow) High-performance Rust stream processing engine
-- [Ballista](https://github.com/apache/datafusion-ballista) Distributed SQL Query Engine
-- [Blaze](https://github.com/kwai/blaze) The Blaze accelerator for Apache Spark leverages native vectorized execution to accelerate query processing
-- [CnosDB](https://github.com/cnosdb/cnosdb) Open Source Distributed Time Series Database
+- [Auron] The Auron accelerator for big data engine (e.g., Spark, Flink) leverages native vectorized execution to accelerate query processing
+- [Ballista] Distributed SQL Query Engine
+- [CnosDB] Open Source Distributed Time Series Database
 - [Comet](https://github.com/apache/datafusion-comet) Apache Spark native query execution plugin
-- [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) Cube’s universal semantic layer platform is the next evolution of OLAP technology for AI, BI, spreadsheets, and embedded analytics
-- [Dask SQL](https://github.com/dask-contrib/dask-sql) Distributed SQL query engine in Python
+- [Cube Store] Cube’s universal semantic layer platform is the next evolution of OLAP technology for AI, BI, spreadsheets, and embedded analytics
 - [datafusion-dft](https://github.com/datafusion-contrib/datafusion-dft) Batteries included CLI, TUI, and server implementations for DataFusion.
-- [delta-rs](https://github.com/delta-io/delta-rs) Native Rust implementation of Delta Lake
-- [Exon](https://github.com/wheretrue/exon) Analysis toolkit for life-science applications
+- [dbt Fusion engine](https://github.com/dbt-labs/dbt-fusion) The dbt Fusion engine, written in Rust, designed for speed and correctness with a native SQL understanding across DWH SQL dialects.
+- [delta-rs] Native Rust implementation of Delta Lake
+- [EDB Postgres Lakehouse] built with [Seafowl]
 - [Feldera](https://github.com/feldera/feldera) Fast query engine for incremental computation
 - [Funnel](https://funnel.io/) Data Platform powering Marketing Intelligence applications.
 - [GlareDB](https://github.com/GlareDB/glaredb) Fast SQL database for querying and analyzing distributed data.
-- [GreptimeDB](https://github.com/GreptimeTeam/greptimedb) Open Source & Cloud Native Distributed Time Series Database
-- [HoraeDB](https://github.com/apache/incubator-horaedb) Distributed Time-Series Database
-- [InfluxDB](https://github.com/influxdata/influxdb) Time Series Database
-- [Kamu](https://github.com/kamu-data/kamu-cli/) Planet-scale streaming data pipeline
+- [GreptimeDB] Open Source & Cloud Native Distributed Time Series Database
+- [hiop](https://hiop.io) Serverless Data Logistic Platform
+- [HoraeDB] Distributed Time-Series Database
+- [Iceberg-rust](https://github.com/apache/iceberg-rust) Rust implementation of Apache Iceberg
+- [InfluxDB] Time Series Database
+- [Kamu] Planet-scale streaming data pipeline
+- [Kubeflow Trainer](https://github.com/kubeflow/trainer) Kubernetes-native project designed for
+  scalable LLMs fine-tuning and distributed AI model training.
 - [LakeSoul](https://github.com/lakesoul-io/LakeSoul) Open source LakeHouse framework with native IO in Rust.
 - [Lance](https://github.com/lancedb/lance) Modern columnar data format for ML
-- [OpenObserve](https://github.com/openobserve/openobserve) Distributed cloud native observability platform
+- [OpenObserve] Distributed cloud native observability platform
 - [ParadeDB](https://github.com/paradedb/paradedb) PostgreSQL for Search & Analytics
-- [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform
+- [Parseable] Log storage and observability platform
 - [Polygon.io](https://polygon.io/) Stock Market API
-- [qv](https://github.com/timvw/qv) Quickly view your data
+- [qv] Quickly view your data
+- [R2 Query Engine](https://blog.cloudflare.com/r2-sql-deep-dive/) Cloudflare's distributed engine for querying data in Iceberg Catalogs
+- [rerun.io](https://rerun.io/) Visualize and query robotics logs and transform them into training data.
 - [Restate](https://github.com/restatedev) Easily build resilient applications using distributed durable async/await
-- [ROAPI](https://github.com/roapi/roapi) Create full-fledged APIs for slowly moving datasets without writing a single line of code
+- [ROAPI] Create full-fledged APIs for slowly moving datasets without writing a single line of code
 - [Sail](https://github.com/lakehq/sail) Unifying stream, batch and AI workloads with Apache Spark compatibility
-- [Seafowl](https://github.com/splitgraph/seafowl) CDN-friendly analytical database
+- [SedonaDB](https://github.com/apache/sedona-db) A single-node analytical database engine with geospatial as a first-class citizen
 - [Sleeper](https://github.com/gchq/sleeper) Serverless, cloud-native, log-structured merge tree based, scalable key-value store
-- [Spice.ai](https://github.com/spiceai/spiceai) Building blocks for data-driven AI applications
-- [Synnada](https://synnada.ai/) Streaming-first framework for data products
-- [VegaFusion](https://vegafusion.io/) Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar
+- [Spice.ai] Building blocks for data-driven AI applications
+- [Synnada] Streaming-first framework for data products
+- [VegaFusion] Server-side acceleration for the [Vega](https://vega.github.io/) visualization grammar
+- [Vortex] An extensible, state of the art columnar file format
 - [Telemetry](https://telemetry.sh/) Structured logging made easy
+- [Xorq](https://github.com/xorq-labs/xorq/) Xorq is a multi-engine batch transformation framework built on Ibis, DataFusion and Arrow
 
 Here are some less active projects that used DataFusion:
 
 - [bdt](https://github.com/datafusion-contrib/bdt) Boring Data Tool
-- [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust)
-- [Flock](https://github.com/flock-lab/flock)
-- [Tensorbase](https://github.com/tensorbase/tensorbase)
+- [Cloudfuse Buzz]
+- [Dask SQL] Distributed SQL query engine in Python
+- [Exon] Analysis toolkit for life-science applications
+- [Flock]
+- [Tensorbase]
+
+If you know of another project, please submit a PR to add a link!
 
 [ballista]: https://github.com/apache/datafusion-ballista
-[blaze]: https://github.com/blaze-init/blaze
+[auron]: https://github.com/apache/auron
 [cloudfuse buzz]: https://github.com/cloudfuse-io/buzz-rust
 [cnosdb]: https://github.com/cnosdb/cnosdb
 [cube store]: https://github.com/cube-js/cube.js/tree/master/rust
 [dask sql]: https://github.com/dask-contrib/dask-sql
-[datafusion-tui]: https://github.com/datafusion-contrib/datafusion-tui
 [delta-rs]: https://github.com/delta-io/delta-rs
+[edb postgres lakehouse]: https://www.enterprisedb.com/products/analytics
+[exon]: https://github.com/wheretrue/exon
 [flock]: https://github.com/flock-lab/flock
 [kamu]: https://github.com/kamu-data/kamu-cli
-[greptime db]: https://github.com/GreptimeTeam/greptimedb
+[greptimedb]: https://github.com/GreptimeTeam/greptimedb
 [horaedb]: https://github.com/apache/incubator-horaedb
 [influxdb]: https://github.com/influxdata/influxdb
 [openobserve]: https://github.com/openobserve/openobserve
@@ -158,7 +172,8 @@ Here are some less active projects that used DataFusion:
 [spice.ai]: https://github.com/spiceai/spiceai
 [synnada]: https://synnada.ai/
 [tensorbase]: https://github.com/tensorbase/tensorbase
-[vegafusion]: https://vegafusion.io/ "if you know of another project, please submit a PR to add a link!"
+[vegafusion]: https://vegafusion.io/
+[vortex]: https://vortex.dev/
 
 ## Integrations and Extensions
 
diff --git a/docs/source/user-guide/metrics.md b/docs/source/user-guide/metrics.md
new file mode 100644
index 0000000000000..7e0363f4ceb9b
--- /dev/null
+++ b/docs/source/user-guide/metrics.md
@@ -0,0 +1,47 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Metrics
+
+DataFusion operators expose runtime metrics so you can understand where time is spent and how much data flows through the pipeline. See more in [EXPLAIN ANALYZE](sql/explain.md#explain-analyze).
+
+## Common Metrics
+
+### BaselineMetrics
+
+`BaselineMetrics` are available in most physical operators to capture common measurements.
+
+| Metric          | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| elapsed_compute | CPU time the operator actively spends processing work.                                                                                                                                             |
+| output_rows     | Total number of rows the operator produces.                                                                                                                                                        |
+| output_bytes    | Memory usage of all output batches. Note: This value may be overestimated. If multiple output `RecordBatch` instances share underlying memory buffers, their sizes will be counted multiple times. |
+| output_batches  | Total number of output batches the operator produces.                                                                                                                                              |
+
+## Operator-specific Metrics
+
+### FilterExec
+
+| Metric      | Description                                                       |
+| ----------- | ----------------------------------------------------------------- |
+| selectivity | Selectivity of the filter, calculated as output_rows / input_rows |
+
+## TODO
+
+Add metrics for the remaining operators
diff --git a/docs/source/user-guide/runtime_configs.md b/docs/source/user-guide/runtime_configs.md
deleted file mode 100644
index feef709db9929..0000000000000
--- a/docs/source/user-guide/runtime_configs.md
+++ /dev/null
@@ -1,40 +0,0 @@
-<!---
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<!---
-This file was generated by the dev/update_runtime_config_docs.sh script.
-Do not edit it manually as changes will be overwritten.
-Instead, edit dev/update_runtime_config_docs.sh or the docstrings in datafusion/execution/src/runtime_env.rs.
--->
-
-# Runtime Environment Configurations
-
-DataFusion runtime configurations can be set via SQL using the `SET` command.
-
-For example, to configure `datafusion.runtime.memory_limit`:
-
-```sql
-SET datafusion.runtime.memory_limit = '2G';
-```
-
-The following runtime configuration settings are available:
-
-| key                             | default | description                                                                                                                                 |
-| ------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| datafusion.runtime.memory_limit | NULL    | Maximum memory limit for query execution. Supports suffixes K (kilobytes), M (megabytes), and G (gigabytes). Example: '2G' for 2 gigabytes. |
diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md
index 774a4fae6bf32..ba9c6ae12477b 100644
--- a/docs/source/user-guide/sql/aggregate_functions.md
+++ b/docs/source/user-guide/sql/aggregate_functions.md
@@ -29,6 +29,55 @@ dev/update_function_docs.sh file for updating surrounding text.
 
 Aggregate functions operate on a set of values to compute a single result.
 
+## Filter clause
+
+Aggregate functions support the SQL `FILTER (WHERE ...)` clause to restrict which input rows contribute to the aggregate result.
+
+```sql
+function([exprs]) FILTER (WHERE condition)
+```
+
+Example:
+
+```sql
+SELECT
+  sum(salary) FILTER (WHERE salary > 0) AS sum_positive_salaries,
+  count(*)    FILTER (WHERE active)     AS active_count
+FROM employees;
+```
+
+Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.
+
+## WITHIN GROUP / Ordered-set aggregates
+
+Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
+aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
+`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
+use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
+during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".
+
+Currently, the built-in aggregate functions that support `WITHIN GROUP` are:
+
+- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
+- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
+- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm
+
+Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
+use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.
+
+Example (ordered-set aggregate):
+
+```sql
+percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
+```
+
+Example (invalid usage — planner will error):
+
+```sql
+-- This will fail: SUM is not an ordered-set aggregate
+SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
+```
+
 ## General Functions
 
 - [array_agg](#array_agg)
@@ -46,6 +95,8 @@ Aggregate functions operate on a set of values to compute a single result.
 - [mean](#mean)
 - [median](#median)
 - [min](#min)
+- [percentile_cont](#percentile_cont)
+- [quantile_cont](#quantile_cont)
 - [string_agg](#string_agg)
 - [sum](#sum)
 - [var](#var)
@@ -369,6 +420,49 @@ min(expression)
 +----------------------+
 ```
 
+### `percentile_cont`
+
+Returns the exact percentile of input values, interpolating between values if needed.
+
+```sql
+percentile_cont(percentile) WITHIN GROUP (ORDER BY expression)
+```
+
+#### Arguments
+
+- **expression**: The expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive).
+
+#### Example
+
+```sql
+> SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++----------------------------------------------------------+
+| percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) |
++----------------------------------------------------------+
+| 45.5                                                     |
++----------------------------------------------------------+
+```
+
+An alternate syntax is also supported:
+
+```sql
+> SELECT percentile_cont(column_name, 0.75) FROM table_name;
++---------------------------------------+
+| percentile_cont(column_name, 0.75)    |
++---------------------------------------+
+| 45.5                                  |
++---------------------------------------+
+```
+
+#### Aliases
+
+- quantile_cont
+
+### `quantile_cont`
+
+_Alias of [percentile_cont](#percentile_cont)._
+
 ### `string_agg`
 
 Concatenates the values of string expressions and places separator values between them. If ordering is required, strings are concatenated in the specified order. This aggregation function can only mix DISTINCT and ORDER BY if the ordering expression is exactly the same as the first argument expression.
@@ -618,6 +712,29 @@ regr_avgx(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table daily_sales(day int, total_sales int) as values (1,100), (2,150), (3,200), (4,NULL), (5,250);
+select * from daily_sales;
++-----+-------------+
+| day | total_sales |
+| --- | ----------- |
+| 1   | 100         |
+| 2   | 150         |
+| 3   | 200         |
+| 4   | NULL        |
+| 5   | 250         |
++-----+-------------+
+
+SELECT regr_avgx(total_sales, day) AS avg_day FROM daily_sales;
++----------+
+| avg_day  |
++----------+
+|   2.75   |
++----------+
+```
+
 ### `regr_avgy`
 
 Computes the average of the dependent variable (output) expression_y for the non-null paired data points.
@@ -631,6 +748,30 @@ regr_avgy(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table daily_temperature(day int, temperature int) as values (1,30), (2,32), (3, NULL), (4,35), (5,36);
+select * from daily_temperature;
++-----+-------------+
+| day | temperature |
+| --- | ----------- |
+| 1   | 30          |
+| 2   | 32          |
+| 3   | NULL        |
+| 4   | 35          |
+| 5   | 36          |
++-----+-------------+
+
+-- temperature as Dependent Variable(Y), day as Independent Variable(X)
+SELECT regr_avgy(temperature, day) AS avg_temperature FROM daily_temperature;
++-----------------+
+| avg_temperature |
++-----------------+
+| 33.25           |
++-----------------+
+```
+
 ### `regr_count`
 
 Counts the number of non-null paired data points.
@@ -644,6 +785,29 @@ regr_count(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table daily_metrics(day int, user_signups int) as values (1,100), (2,120), (3, NULL), (4,110), (5,NULL);
+select * from daily_metrics;
++-----+---------------+
+| day | user_signups  |
+| --- | ------------- |
+| 1   | 100           |
+| 2   | 120           |
+| 3   | NULL          |
+| 4   | 110           |
+| 5   | NULL          |
++-----+---------------+
+
+SELECT regr_count(user_signups, day) AS valid_pairs FROM daily_metrics;
++-------------+
+| valid_pairs |
++-------------+
+| 3           |
++-------------+
+```
+
 ### `regr_intercept`
 
 Computes the y-intercept of the linear regression line. For the equation (y = kx + b), this function returns b.
@@ -657,6 +821,30 @@ regr_intercept(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table weekly_performance(week int, productivity_score int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++------+---------------------+
+| week | productivity_score  |
+| ---- | ------------------- |
+| 1    | 60                  |
+| 2    | 65                  |
+| 3    | 70                  |
+| 4    | 75                  |
+| 5    | 80                  |
++------+---------------------+
+
+SELECT regr_intercept(productivity_score, week) AS intercept FROM weekly_performance;
++----------+
+|intercept|
+|intercept |
++----------+
+|  55      |
++----------+
+```
+
 ### `regr_r2`
 
 Computes the square of the correlation coefficient between the independent and dependent variables.
@@ -670,6 +858,29 @@ regr_r2(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table weekly_performance(day int ,user_signups int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++-----+--------------+
+| day | user_signups |
++-----+--------------+
+| 1   | 60           |
+| 2   | 65           |
+| 3   | 70           |
+| 4   | 75           |
+| 5   | 80           |
++-----+--------------+
+
+SELECT regr_r2(user_signups, day) AS r_squared FROM weekly_performance;
++---------+
+|r_squared|
++---------+
+| 1.0     |
++---------+
+```
+
 ### `regr_slope`
 
 Returns the slope of the linear regression line for non-null pairs in aggregate columns. Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k\*X + b) using minimal RSS fitting.
@@ -683,6 +894,29 @@ regr_slope(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table weekly_performance(day int, user_signups int) as values (1,60), (2,65), (3, 70), (4,75), (5,80);
+select * from weekly_performance;
++-----+--------------+
+| day | user_signups |
++-----+--------------+
+| 1   | 60           |
+| 2   | 65           |
+| 3   | 70           |
+| 4   | 75           |
+| 5   | 80           |
++-----+--------------+
+
+SELECT regr_slope(user_signups, day) AS slope FROM weekly_performance;
++--------+
+| slope  |
++--------+
+| 5.0    |
++--------+
+```
+
 ### `regr_sxx`
 
 Computes the sum of squares of the independent variable.
@@ -696,6 +930,29 @@ regr_sxx(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table study_hours(student_id int, hours int, test_score int) as values (1,2,55), (2,4,65), (3,6,75), (4,8,85), (5,10,95);
+select * from study_hours;
++------------+-------+------------+
+| student_id | hours | test_score |
++------------+-------+------------+
+| 1          | 2     | 55         |
+| 2          | 4     | 65         |
+| 3          | 6     | 75         |
+| 4          | 8     | 85         |
+| 5          | 10    | 95         |
++------------+-------+------------+
+
+SELECT regr_sxx(test_score, hours) AS sxx FROM study_hours;
++------+
+| sxx  |
++------+
+| 40.0 |
++------+
+```
+
 ### `regr_sxy`
 
 Computes the sum of products of paired data points.
@@ -709,6 +966,27 @@ regr_sxy(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table employee_productivity(week int, productivity_score int) as values(1,60), (2,65), (3,70);
+select * from employee_productivity;
++------+--------------------+
+| week | productivity_score |
++------+--------------------+
+| 1    | 60                 |
+| 2    | 65                 |
+| 3    | 70                 |
++------+--------------------+
+
+SELECT regr_sxy(productivity_score, week) AS sum_product_deviations FROM employee_productivity;
++------------------------+
+| sum_product_deviations |
++------------------------+
+|       10.0             |
++------------------------+
+```
+
 ### `regr_syy`
 
 Computes the sum of squares of the dependent variable.
@@ -722,6 +1000,27 @@ regr_syy(expression_y, expression_x)
 - **expression_y**: Dependent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_x**: Independent variable expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+create table employee_productivity(week int, productivity_score int) as values (1,60), (2,65), (3,70);
+select * from employee_productivity;
++------+--------------------+
+| week | productivity_score |
++------+--------------------+
+| 1    | 60                 |
+| 2    | 65                 |
+| 3    | 70                 |
++------+--------------------+
+
+SELECT regr_syy(productivity_score, week) AS sum_squares_y FROM employee_productivity;
++---------------+
+| sum_squares_y |
++---------------+
+|    50.0       |
++---------------+
+```
+
 ### `stddev`
 
 Returns the standard deviation of a set of numbers.
@@ -834,7 +1133,7 @@ approx_median(expression)
 Returns the approximate percentile of input values using the t-digest algorithm.
 
 ```sql
-approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)
+approx_percentile_cont(percentile [, centroids]) WITHIN GROUP (ORDER BY expression)
 ```
 
 #### Arguments
@@ -846,6 +1145,12 @@ approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)
 #### Example
 
 ```sql
+> SELECT approx_percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++------------------------------------------------------------------+
+| approx_percentile_cont(0.75) WITHIN GROUP (ORDER BY column_name) |
++------------------------------------------------------------------+
+| 65.0                                                             |
++------------------------------------------------------------------+
 > SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
 +-----------------------------------------------------------------------+
 | approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) |
@@ -854,12 +1159,30 @@ approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)
 +-----------------------------------------------------------------------+
 ```
 
+An alternate syntax is also supported:
+
+```sql
+> SELECT approx_percentile_cont(column_name, 0.75) FROM table_name;
++-----------------------------------------------+
+| approx_percentile_cont(column_name, 0.75)     |
++-----------------------------------------------+
+| 65.0                                          |
++-----------------------------------------------+
+
+> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
++----------------------------------------------------------+
+| approx_percentile_cont(column_name, 0.75, 100)           |
++----------------------------------------------------------+
+| 65.0                                                     |
++----------------------------------------------------------+
+```
+
 ### `approx_percentile_cont_with_weight`
 
 Returns the weighted approximate percentile of input values using the t-digest algorithm.
 
 ```sql
-approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression)
+approx_percentile_cont_with_weight(weight, percentile [, centroids]) WITHIN GROUP (ORDER BY expression)
 ```
 
 #### Arguments
@@ -867,6 +1190,7 @@ approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY ex
 - **expression**: The expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **weight**: Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators.
 - **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive).
+- **centroids**: Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.
 
 #### Example
 
@@ -877,4 +1201,21 @@ approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY ex
 +---------------------------------------------------------------------------------------------+
 | 78.5                                                                                        |
 +---------------------------------------------------------------------------------------------+
+> SELECT approx_percentile_cont_with_weight(weight_column, 0.90, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++--------------------------------------------------------------------------------------------------+
+| approx_percentile_cont_with_weight(weight_column, 0.90, 100) WITHIN GROUP (ORDER BY column_name) |
++--------------------------------------------------------------------------------------------------+
+| 78.5                                                                                             |
++--------------------------------------------------------------------------------------------------+
+```
+
+An alternative syntax is also supported:
+
+```sql
+> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name;
++--------------------------------------------------+
+| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
++--------------------------------------------------+
+| 78.5                                             |
++--------------------------------------------------+
 ```
diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md
index d977a4396e40d..502193df41a64 100644
--- a/docs/source/user-guide/sql/data_types.md
+++ b/docs/source/user-guide/sql/data_types.md
@@ -25,6 +25,11 @@ execution. The SQL types from
 are mapped to [Arrow data types](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) according to the following table.
 This mapping occurs when defining the schema in a `CREATE EXTERNAL TABLE` command or when performing a SQL `CAST` operation.
 
+For background on extension types and custom metadata, see the
+[Implementing User Defined Types and Custom Metadata in DataFusion] blog.
+
+[implementing user defined types and custom metadata in datafusion]: https://datafusion.apache.org/blog/2025/09/21/custom-types-using-metadata
+
 You can see the corresponding Arrow type for any SQL expression using
 the `arrow_typeof` function. For example:
 
@@ -41,7 +46,18 @@ You can cast a SQL expression to a specific Arrow type using the `arrow_cast` fu
 For example, to cast the output of `now()` to a `Timestamp` with second precision:
 
 ```sql
-select arrow_cast(now(), 'Timestamp(Second, None)');
+select arrow_cast(now(), 'Timestamp(s)') as "now()";
++---------------------+
+| now()               |
++---------------------+
+| 2025-10-24T20:02:45 |
++---------------------+
+```
+
+The older syntax still works as well:
+
+```sql
+select arrow_cast(now(), 'Timestamp(Second, None)') as "now()";
 +---------------------+
 | now()               |
 +---------------------+
@@ -53,27 +69,32 @@ select arrow_cast(now(), 'Timestamp(Second, None)');
 
 | SQL DataType | Arrow DataType |
 | ------------ | -------------- |
-| `CHAR`       | `Utf8`         |
-| `VARCHAR`    | `Utf8`         |
-| `TEXT`       | `Utf8`         |
-| `STRING`     | `Utf8`         |
+| `CHAR`       | `Utf8View`     |
+| `VARCHAR`    | `Utf8View`     |
+| `TEXT`       | `Utf8View`     |
+| `STRING`     | `Utf8View`     |
+
+By default, string types are mapped to `Utf8View`. This can be configured using the `datafusion.sql_parser.map_string_types_to_utf8view` setting. When set to `false`, string types are mapped to `Utf8` instead.
 
 ## Numeric Types
 
-| SQL DataType                         | Arrow DataType                 |
-| ------------------------------------ | :----------------------------- |
-| `TINYINT`                            | `Int8`                         |
-| `SMALLINT`                           | `Int16`                        |
-| `INT` or `INTEGER`                   | `Int32`                        |
-| `BIGINT`                             | `Int64`                        |
-| `TINYINT UNSIGNED`                   | `UInt8`                        |
-| `SMALLINT UNSIGNED`                  | `UInt16`                       |
-| `INT UNSIGNED` or `INTEGER UNSIGNED` | `UInt32`                       |
-| `BIGINT UNSIGNED`                    | `UInt64`                       |
-| `FLOAT`                              | `Float32`                      |
-| `REAL`                               | `Float32`                      |
-| `DOUBLE`                             | `Float64`                      |
-| `DECIMAL(precision, scale)`          | `Decimal128(precision, scale)` |
+| SQL DataType                                     | Arrow DataType                 |
+| ------------------------------------------------ | :----------------------------- |
+| `TINYINT`                                        | `Int8`                         |
+| `SMALLINT`                                       | `Int16`                        |
+| `INT` or `INTEGER`                               | `Int32`                        |
+| `BIGINT`                                         | `Int64`                        |
+| `TINYINT UNSIGNED`                               | `UInt8`                        |
+| `SMALLINT UNSIGNED`                              | `UInt16`                       |
+| `INT UNSIGNED` or `INTEGER UNSIGNED`             | `UInt32`                       |
+| `BIGINT UNSIGNED`                                | `UInt64`                       |
+| `FLOAT`                                          | `Float32`                      |
+| `REAL`                                           | `Float32`                      |
+| `DOUBLE`                                         | `Float64`                      |
+| `DECIMAL(precision, scale)` where precision ≤ 38 | `Decimal128(precision, scale)` |
+| `DECIMAL(precision, scale)` where precision > 38 | `Decimal256(precision, scale)` |
+
+The maximum supported precision for `DECIMAL` types is 76.
 
 ## Date/Time Types
 
@@ -115,42 +136,3 @@ You can create binary literals using a hex string literal such as
 | `ENUM`        | _Not yet supported_ |
 | `SET`         | _Not yet supported_ |
 | `DATETIME`    | _Not yet supported_ |
-
-## Supported Arrow Types
-
-The following types are supported by the `arrow_typeof` function:
-
-| Arrow Type                                                  |
-| ----------------------------------------------------------- |
-| `Null`                                                      |
-| `Boolean`                                                   |
-| `Int8`                                                      |
-| `Int16`                                                     |
-| `Int32`                                                     |
-| `Int64`                                                     |
-| `UInt8`                                                     |
-| `UInt16`                                                    |
-| `UInt32`                                                    |
-| `UInt64`                                                    |
-| `Float16`                                                   |
-| `Float32`                                                   |
-| `Float64`                                                   |
-| `Utf8`                                                      |
-| `LargeUtf8`                                                 |
-| `Binary`                                                    |
-| `Timestamp(Second, None)`                                   |
-| `Timestamp(Millisecond, None)`                              |
-| `Timestamp(Microsecond, None)`                              |
-| `Timestamp(Nanosecond, None)`                               |
-| `Time32`                                                    |
-| `Time64`                                                    |
-| `Duration(Second)`                                          |
-| `Duration(Millisecond)`                                     |
-| `Duration(Microsecond)`                                     |
-| `Duration(Nanosecond)`                                      |
-| `Interval(YearMonth)`                                       |
-| `Interval(DayTime)`                                         |
-| `Interval(MonthDayNano)`                                    |
-| `FixedSizeBinary(<len>)` (e.g. `FixedSizeBinary(16)`)       |
-| `Decimal128(<precision>, <scale>)` e.g. `Decimal128(3, 10)` |
-| `Decimal256(<precision>, <scale>)` e.g. `Decimal256(3, 10)` |
diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md
index ff8fa9bac0017..3a5c934ae8156 100644
--- a/docs/source/user-guide/sql/ddl.md
+++ b/docs/source/user-guide/sql/ddl.md
@@ -71,7 +71,7 @@ LOCATION <literal>
 
 <ordered_column_list> := (<column_name> <sort_clause>, ...)
 
-<key_value_list> := (<literal> <literal, <literal> <literal>, ...)
+<key_value_list> := (<literal> <literal>, <literal> <literal>, ...)
 ```
 
 For a comprehensive list of format-specific options that can be specified in the `OPTIONS` clause, see [Format Options](format_options.md).
@@ -95,14 +95,14 @@ LOCATION '/mnt/nyctaxi/tripdata.parquet';
 
 :::{note}
 Statistics
-: By default, when a table is created, DataFusion will _NOT_ read the files
+: By default, when a table is created, DataFusion will read the files
 to gather statistics, which can be expensive but can accelerate subsequent
-queries substantially. If you want to gather statistics
+queries substantially. If you don't want to gather statistics
 when creating a table, set the `datafusion.execution.collect_statistics`
-configuration option to `true` before creating the table. For example:
+configuration option to `false` before creating the table. For example:
 
 ```sql
-SET datafusion.execution.collect_statistics = true;
+SET datafusion.execution.collect_statistics = false;
 ```
 
 See the [config settings docs](../configs.md) for more details.
@@ -169,6 +169,35 @@ LOCATION '/path/to/directory/of/files'
 OPTIONS ('has_header' 'true');
 ```
 
+Tables that are partitioned using a Hive compliant partitioning scheme will have their columns and values automatically
+detected and incorporated into the table's schema and data. Given the following example directory structure:
+
+```console
+hive_partitioned/
+├── a=1
+│   └── b=200
+│       └── file1.parquet
+└── a=2
+    └── b=100
+        └── file2.parquet
+```
+
+Users can specify the top level `hive_partitioned` directory as an `EXTERNAL TABLE` and leverage the Hive partitions to query
+and filter data.
+
+```sql
+CREATE EXTERNAL TABLE hive_partitioned
+STORED AS PARQUET
+LOCATION '/path/to/hive_partitioned/';
+
+SELECT count(*) FROM hive_partitioned WHERE b=100;
++------------------+
+| count(*)         |
++------------------+
+| 1                |
++------------------+
+```
+
 ### Example: Unbounded Data Sources
 
 We can create unbounded data sources using the `CREATE UNBOUNDED EXTERNAL TABLE` SQL statement.
@@ -207,7 +236,7 @@ CREATE EXTERNAL TABLE test (
     c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH ORDER (c2 ASC, c5 + c8 DESC NULL FIRST)
+WITH ORDER (c2 ASC, c5 + c8 DESC NULLS FIRST)
 LOCATION '/path/to/aggregate_test_100.csv'
 OPTIONS ('has_header' 'true');
 ```
@@ -316,3 +345,78 @@ DROP VIEW [ IF EXISTS ] <b><i>view_name</i></b>;
 -- drop users_v view from the customer_a schema
 DROP VIEW IF EXISTS customer_a.users_v;
 ```
+
+## DESCRIBE
+
+Displays the schema of a table, showing column names, data types, and nullable status. Both `DESCRIBE` and `DESC` are supported as aliases.
+
+<pre>
+{ DESCRIBE | DESC } <b><i>table_name</i></b>
+</pre>
+
+The output contains three columns:
+
+- `column_name`: The name of the column
+- `data_type`: The data type of the column (e.g., Int32, Utf8, Boolean)
+- `is_nullable`: Whether the column can contain null values (YES/NO)
+
+### Example: Basic table description
+
+```sql
+-- Create a table
+CREATE TABLE users AS VALUES (1, 'Alice', true), (2, 'Bob', false);
+
+-- Describe the table structure
+DESCRIBE users;
+```
+
+Output:
+
+```sql
++--------------+-----------+-------------+
+| column_name  | data_type | is_nullable |
++--------------+-----------+-------------+
+| column1      | Int64     | YES         |
+| column2      | Utf8      | YES         |
+| column3      | Boolean   | YES         |
++--------------+-----------+-------------+
+```
+
+### Example: Using DESC alias
+
+```sql
+-- DESC is an alias for DESCRIBE
+DESC users;
+```
+
+### Example: Describing external tables
+
+```sql
+-- Create an external table
+CREATE EXTERNAL TABLE taxi
+STORED AS PARQUET
+LOCATION '/mnt/nyctaxi/tripdata.parquet';
+
+-- Describe its schema
+DESCRIBE taxi;
+```
+
+Output might show:
+
+```sql
++--------------------+-----------------------------+-------------+
+| column_name        | data_type                   | is_nullable |
++--------------------+-----------------------------+-------------+
+| vendor_id          | Int32                       | YES         |
+| pickup_datetime    | Timestamp(Nanosecond, None) | NO          |
+| passenger_count    | Int32                       | YES         |
+| trip_distance      | Float64                     | YES         |
++--------------------+-----------------------------+-------------+
+```
+
+The `DESCRIBE` command works with all table types in DataFusion, including:
+
+- Regular tables created with `CREATE TABLE`
+- External tables created with `CREATE EXTERNAL TABLE`
+- Views created with `CREATE VIEW`
+- Tables in different schemas using qualified names (e.g., `DESCRIBE schema_name.table_name`)
diff --git a/docs/source/user-guide/sql/dml.md b/docs/source/user-guide/sql/dml.md
index c29447f23cd9c..4934bc2674375 100644
--- a/docs/source/user-guide/sql/dml.md
+++ b/docs/source/user-guide/sql/dml.md
@@ -88,7 +88,7 @@ of hive-style partitioned parquet files:
 +-------+
 ```
 
-If the the data contains values of `x` and `y` in column1 and only `a` in
+If the data contains values of `x` and `y` in column1 and only `a` in
 column2, output files will appear in the following directory structure:
 
 ```text
diff --git a/docs/source/user-guide/sql/explain.md b/docs/source/user-guide/sql/explain.md
index c5e2e215a6b66..23101632625b1 100644
--- a/docs/source/user-guide/sql/explain.md
+++ b/docs/source/user-guide/sql/explain.md
@@ -37,8 +37,6 @@ The optional `[FORMAT format]` clause controls how the plan is displayed as
 explained below. If this clause is not specified, the plan is displayed using
 the format from the [configuration value] `datafusion.explain.format`.
 
-[configuration value]: ../configs.md
-
 ### `tree` format (default)
 
 The `tree` format is modeled after [DuckDB plans] and is designed to be easier
@@ -72,19 +70,10 @@ to see the high level structure of the plan
 |               | │      RepartitionExec      │ |
 |               | │    --------------------   │ |
 |               | │   input_partition_count:  │ |
-|               | │             16            │ |
-|               | │                           │ |
-|               | │    partitioning_scheme:   │ |
-|               | │      Hash([b@0], 16)      │ |
-|               | └─────────────┬─────────────┘ |
-|               | ┌─────────────┴─────────────┐ |
-|               | │      RepartitionExec      │ |
-|               | │    --------------------   │ |
-|               | │   input_partition_count:  │ |
 |               | │             1             │ |
 |               | │                           │ |
 |               | │    partitioning_scheme:   │ |
-|               | │    RoundRobinBatch(16)    │ |
+|               | │      Hash([b@0], 16)      │ |
 |               | └─────────────┬─────────────┘ |
 |               | ┌─────────────┴─────────────┐ |
 |               | │       AggregateExec       │ |
@@ -128,10 +117,9 @@ Elapsed 0.004 seconds.
 | physical_plan | ProjectionExec: expr=[sum(t.x)@1 as sum(t.x)]                                 |
 |               |   AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[sum(t.x)]       |
 |               |     CoalesceBatchesExec: target_batch_size=8192                               |
-|               |       RepartitionExec: partitioning=Hash([b@0], 16), input_partitions=16      |
-|               |         RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1 |
-|               |           AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(t.x)]        |
-|               |             DataSourceExec: partitions=1, partition_sizes=[1]                 |
+|               |       RepartitionExec: partitioning=Hash([b@0], 16), input_partitions=1       |
+|               |         AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(t.x)]          |
+|               |           DataSourceExec: partitions=1, partition_sizes=[1]                   |
 |               |                                                                               |
 +---------------+-------------------------------------------------------------------------------+
 2 row(s) fetched.
@@ -239,8 +227,7 @@ Elapsed 0.010 seconds.
 
 ## `EXPLAIN ANALYZE`
 
-Shows the execution plan and metrics of a statement. If you need more
-information output, use `EXPLAIN ANALYZE VERBOSE`. Note that `EXPLAIN ANALYZE`
+Shows the execution plan and metrics of a statement. Note that `EXPLAIN ANALYZE`
 only supports the `indent` format.
 
 ```sql
@@ -259,3 +246,9 @@ EXPLAIN ANALYZE SELECT SUM(x) FROM table GROUP BY b;
 |                   |               DataSourceExec: file_groups={1 group: [[/tmp/table.csv]]}, has_header=false, metrics=[]                                                        |
 +-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
 ```
+
+By default `EXPLAIN ANALYZE` shows the aggregated metrics from all partitions for each operator. If you need to display per-partition metrics, use `EXPLAIN ANALYZE VERBOSE`.
+
+You can also set `datafusion.explain.analyze_level` from the [configuration value] to control the detail level for the metrics displayed.
+
+[configuration value]: ../configs.md
diff --git a/docs/source/user-guide/sql/format_options.md b/docs/source/user-guide/sql/format_options.md
index e8008eafb166c..338508031413c 100644
--- a/docs/source/user-guide/sql/format_options.md
+++ b/docs/source/user-guide/sql/format_options.md
@@ -99,25 +99,25 @@ OPTIONS('COMPRESSION' 'gzip');
 
 The following options are available when reading or writing CSV files. Note: If any unsupported option is specified, an error will be raised and the query will fail.
 
-| Option               | Description                                                                                                                       | Default Value      |
-| -------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
-| COMPRESSION          | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED. | UNCOMPRESSED       |
-| HAS_HEADER           | Sets if the CSV file should include column headers. If not set, uses session or system default.                                   | None               |
-| DELIMITER            | Sets the character which should be used as the column delimiter within the CSV file.                                              | `,` (comma)        |
-| QUOTE                | Sets the character which should be used for quoting values within the CSV file.                                                   | `"` (double quote) |
-| TERMINATOR           | Sets the character which should be used as the line terminator within the CSV file.                                               | None               |
-| ESCAPE               | Sets the character which should be used for escaping special characters within the CSV file.                                      | None               |
-| DOUBLE_QUOTE         | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`).                                      | None               |
-| NEWLINES_IN_VALUES   | Sets if newlines in quoted values are supported. If not set, uses session or system default.                                      | None               |
-| DATE_FORMAT          | Sets the format that dates should be encoded in within the CSV file.                                                              | None               |
-| DATETIME_FORMAT      | Sets the format that datetimes should be encoded in within the CSV file.                                                          | None               |
-| TIMESTAMP_FORMAT     | Sets the format that timestamps should be encoded in within the CSV file.                                                         | None               |
-| TIMESTAMP_TZ_FORMAT  | Sets the format that timestamps with timezone should be encoded in within the CSV file.                                           | None               |
-| TIME_FORMAT          | Sets the format that times should be encoded in within the CSV file.                                                              | None               |
-| NULL_VALUE           | Sets the string which should be used to indicate null values within the CSV file.                                                 | None               |
-| NULL_REGEX           | Sets the regex pattern to match null values when loading CSVs.                                                                    | None               |
-| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema.                                                                   | None               |
-| COMMENT              | Sets the character which should be used to indicate comment lines in the CSV file.                                                | None               |
+| Option               | Description                                                                                                                                                      | Default Value      |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
+| COMPRESSION          | Sets the compression that should be applied to the entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED.                                | UNCOMPRESSED       |
+| HAS_HEADER           | Sets if the CSV file should include column headers. If not set, uses session or system default.                                                                  | None               |
+| DELIMITER            | Sets the character which should be used as the column delimiter within the CSV file.                                                                             | `,` (comma)        |
+| QUOTE                | Sets the character which should be used for quoting values within the CSV file.                                                                                  | `"` (double quote) |
+| TERMINATOR           | Sets the character which should be used as the line terminator within the CSV file.                                                                              | None               |
+| ESCAPE               | Sets the character which should be used for escaping special characters within the CSV file.                                                                     | None               |
+| DOUBLE_QUOTE         | Sets if quotes within quoted fields should be escaped by doubling them (e.g., `"aaa""bbb"`).                                                                     | None               |
+| NEWLINES_IN_VALUES   | Sets if newlines in quoted values are supported. If not set, uses session or system default.                                                                     | None               |
+| DATE_FORMAT          | Sets the format that dates should be encoded in within the CSV file.                                                                                             | None               |
+| DATETIME_FORMAT      | Sets the format that datetimes should be encoded in within the CSV file.                                                                                         | None               |
+| TIMESTAMP_FORMAT     | Sets the format that timestamps should be encoded in within the CSV file.                                                                                        | None               |
+| TIMESTAMP_TZ_FORMAT  | Sets the format that timestamps with timezone should be encoded in within the CSV file.                                                                          | None               |
+| TIME_FORMAT          | Sets the format that times should be encoded in within the CSV file.                                                                                             | None               |
+| NULL_VALUE           | Sets the string which should be used to indicate null values within the CSV file.                                                                                | None               |
+| NULL_REGEX           | Sets the regex pattern to match null values when loading CSVs.                                                                                                   | None               |
+| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer the schema. If set to 0, schema inference is disabled and all fields will be inferred as Utf8 (string) type. | None               |
+| COMMENT              | Sets the character which should be used to indicate comment lines in the CSV file.                                                                               | None               |
 
 **Example:**
 
@@ -132,38 +132,38 @@ OPTIONS('DELIMITER' '|', 'HAS_HEADER' 'true', 'NEWLINES_IN_VALUES' 'true');
 
 The following options are available when reading or writing Parquet files. If any unsupported option is specified, an error will be raised and the query will fail. If a column-specific option is specified for a column that does not exist, the option will be ignored without error.
 
-| Option                                     | Can be Column Specific? | Description                                                                                                                                                                                                                                                                                                                                 | OPTIONS Key                                           | Default Value            |
-| ------------------------------------------ | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------------------------ |
-| COMPRESSION                                | Yes                     | Sets the internal Parquet **compression codec** for data pages, optionally including the compression level. Applies globally if set without `::col`, or specifically to a column if set using `'compression::column_name'`. Valid values: `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`, `lz4`, `zstd(level)`, `lz4_raw`. | `'compression'` or `'compression::col'`               | zstd(3)                  |
-| ENCODING                                   | Yes                     | Sets the **encoding** scheme for data pages. Valid values: `plain`, `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, `byte_stream_split`. Use key `'encoding'` or `'encoding::col'` in OPTIONS.                                                             | `'encoding'` or `'encoding::col'`                     | None                     |
-| DICTIONARY_ENABLED                         | Yes                     | Sets whether dictionary encoding should be enabled globally or for a specific column.                                                                                                                                                                                                                                                       | `'dictionary_enabled'` or `'dictionary_enabled::col'` | true                     |
-| STATISTICS_ENABLED                         | Yes                     | Sets the level of statistics to write (`none`, `chunk`, `page`).                                                                                                                                                                                                                                                                            | `'statistics_enabled'` or `'statistics_enabled::col'` | page                     |
-| BLOOM_FILTER_ENABLED                       | Yes                     | Sets whether a bloom filter should be written for a specific column.                                                                                                                                                                                                                                                                        | `'bloom_filter_enabled::column_name'`                 | None                     |
-| BLOOM_FILTER_FPP                           | Yes                     | Sets bloom filter false positive probability (global or per column).                                                                                                                                                                                                                                                                        | `'bloom_filter_fpp'` or `'bloom_filter_fpp::col'`     | None                     |
-| BLOOM_FILTER_NDV                           | Yes                     | Sets bloom filter number of distinct values (global or per column).                                                                                                                                                                                                                                                                         | `'bloom_filter_ndv'` or `'bloom_filter_ndv::col'`     | None                     |
-| MAX_ROW_GROUP_SIZE                         | No                      | Sets the maximum number of rows per row group. Larger groups require more memory but can improve compression and scan efficiency.                                                                                                                                                                                                           | `'max_row_group_size'`                                | 1048576                  |
-| ENABLE_PAGE_INDEX                          | No                      | If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce I/O and decoding.                                                                                                                                                                                                                               | `'enable_page_index'`                                 | true                     |
-| PRUNING                                    | No                      | If true, enables row group pruning based on min/max statistics.                                                                                                                                                                                                                                                                             | `'pruning'`                                           | true                     |
-| SKIP_METADATA                              | No                      | If true, skips optional embedded metadata in the file schema.                                                                                                                                                                                                                                                                               | `'skip_metadata'`                                     | true                     |
-| METADATA_SIZE_HINT                         | No                      | Sets the size hint (in bytes) for fetching Parquet file metadata.                                                                                                                                                                                                                                                                           | `'metadata_size_hint'`                                | None                     |
-| PUSHDOWN_FILTERS                           | No                      | If true, enables filter pushdown during Parquet decoding.                                                                                                                                                                                                                                                                                   | `'pushdown_filters'`                                  | false                    |
-| REORDER_FILTERS                            | No                      | If true, enables heuristic reordering of filters during Parquet decoding.                                                                                                                                                                                                                                                                   | `'reorder_filters'`                                   | false                    |
-| SCHEMA_FORCE_VIEW_TYPES                    | No                      | If true, reads Utf8/Binary columns as view types.                                                                                                                                                                                                                                                                                           | `'schema_force_view_types'`                           | true                     |
-| BINARY_AS_STRING                           | No                      | If true, reads Binary columns as strings.                                                                                                                                                                                                                                                                                                   | `'binary_as_string'`                                  | false                    |
-| DATA_PAGESIZE_LIMIT                        | No                      | Sets best effort maximum size of data page in bytes.                                                                                                                                                                                                                                                                                        | `'data_pagesize_limit'`                               | 1048576                  |
-| DATA_PAGE_ROW_COUNT_LIMIT                  | No                      | Sets best effort maximum number of rows in data page.                                                                                                                                                                                                                                                                                       | `'data_page_row_count_limit'`                         | 20000                    |
-| DICTIONARY_PAGE_SIZE_LIMIT                 | No                      | Sets best effort maximum dictionary page size, in bytes.                                                                                                                                                                                                                                                                                    | `'dictionary_page_size_limit'`                        | 1048576                  |
-| WRITE_BATCH_SIZE                           | No                      | Sets write_batch_size in bytes.                                                                                                                                                                                                                                                                                                             | `'write_batch_size'`                                  | 1024                     |
-| WRITER_VERSION                             | No                      | Sets the Parquet writer version (`1.0` or `2.0`).                                                                                                                                                                                                                                                                                           | `'writer_version'`                                    | 1.0                      |
-| SKIP_ARROW_METADATA                        | No                      | If true, skips writing Arrow schema information into the Parquet file metadata.                                                                                                                                                                                                                                                             | `'skip_arrow_metadata'`                               | false                    |
-| CREATED_BY                                 | No                      | Sets the "created by" string in the Parquet file metadata.                                                                                                                                                                                                                                                                                  | `'created_by'`                                        | datafusion version X.Y.Z |
-| COLUMN_INDEX_TRUNCATE_LENGTH               | No                      | Sets the length (in bytes) to truncate min/max values in column indexes.                                                                                                                                                                                                                                                                    | `'column_index_truncate_length'`                      | 64                       |
-| STATISTICS_TRUNCATE_LENGTH                 | No                      | Sets statistics truncate length.                                                                                                                                                                                                                                                                                                            | `'statistics_truncate_length'`                        | None                     |
-| BLOOM_FILTER_ON_WRITE                      | No                      | Sets whether bloom filters should be written for all columns by default (can be overridden per column).                                                                                                                                                                                                                                     | `'bloom_filter_on_write'`                             | false                    |
-| ALLOW_SINGLE_FILE_PARALLELISM              | No                      | Enables parallel serialization of columns in a single file.                                                                                                                                                                                                                                                                                 | `'allow_single_file_parallelism'`                     | true                     |
-| MAXIMUM_PARALLEL_ROW_GROUP_WRITERS         | No                      | Maximum number of parallel row group writers.                                                                                                                                                                                                                                                                                               | `'maximum_parallel_row_group_writers'`                | 1                        |
-| MAXIMUM_BUFFERED_RECORD_BATCHES_PER_STREAM | No                      | Maximum number of buffered record batches per stream.                                                                                                                                                                                                                                                                                       | `'maximum_buffered_record_batches_per_stream'`        | 2                        |
-| KEY_VALUE_METADATA                         | No (Key is specific)    | Adds custom key-value pairs to the file metadata. Use the format `'metadata::your_key_name' 'your_value'`. Multiple entries allowed.                                                                                                                                                                                                        | `'metadata::key_name'`                                | None                     |
+| Option                                     | Can be Column Specific? | Description                                                                                                                                                                                                                                                                                                                          | OPTIONS Key                                           | Default Value            |
+| ------------------------------------------ | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | ------------------------ |
+| COMPRESSION                                | Yes                     | Sets the internal Parquet **compression codec** for data pages, optionally including the compression level. Applies globally if set without `::col`, or specifically to a column if set using `'compression::column_name'`. Valid values: `uncompressed`, `snappy`, `gzip(level)`, `brotli(level)`, `lz4`, `zstd(level)`, `lz4_raw`. | `'compression'` or `'compression::col'`               | zstd(3)                  |
+| ENCODING                                   | Yes                     | Sets the **encoding** scheme for data pages. Valid values: `plain`, `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, `byte_stream_split`. Use key `'encoding'` or `'encoding::col'` in OPTIONS.                                                      | `'encoding'` or `'encoding::col'`                     | None                     |
+| DICTIONARY_ENABLED                         | Yes                     | Sets whether dictionary encoding should be enabled globally or for a specific column.                                                                                                                                                                                                                                                | `'dictionary_enabled'` or `'dictionary_enabled::col'` | true                     |
+| STATISTICS_ENABLED                         | Yes                     | Sets the level of statistics to write (`none`, `chunk`, `page`).                                                                                                                                                                                                                                                                     | `'statistics_enabled'` or `'statistics_enabled::col'` | page                     |
+| BLOOM_FILTER_ENABLED                       | Yes                     | Sets whether a bloom filter should be written for a specific column.                                                                                                                                                                                                                                                                 | `'bloom_filter_enabled::column_name'`                 | None                     |
+| BLOOM_FILTER_FPP                           | Yes                     | Sets bloom filter false positive probability (global or per column).                                                                                                                                                                                                                                                                 | `'bloom_filter_fpp'` or `'bloom_filter_fpp::col'`     | None                     |
+| BLOOM_FILTER_NDV                           | Yes                     | Sets bloom filter number of distinct values (global or per column).                                                                                                                                                                                                                                                                  | `'bloom_filter_ndv'` or `'bloom_filter_ndv::col'`     | None                     |
+| MAX_ROW_GROUP_SIZE                         | No                      | Sets the maximum number of rows per row group. Larger groups require more memory but can improve compression and scan efficiency.                                                                                                                                                                                                    | `'max_row_group_size'`                                | 1048576                  |
+| ENABLE_PAGE_INDEX                          | No                      | If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce I/O and decoding.                                                                                                                                                                                                                        | `'enable_page_index'`                                 | true                     |
+| PRUNING                                    | No                      | If true, enables row group pruning based on min/max statistics.                                                                                                                                                                                                                                                                      | `'pruning'`                                           | true                     |
+| SKIP_METADATA                              | No                      | If true, skips optional embedded metadata in the file schema.                                                                                                                                                                                                                                                                        | `'skip_metadata'`                                     | true                     |
+| METADATA_SIZE_HINT                         | No                      | Sets the size hint (in bytes) for fetching Parquet file metadata.                                                                                                                                                                                                                                                                    | `'metadata_size_hint'`                                | None                     |
+| PUSHDOWN_FILTERS                           | No                      | If true, enables filter pushdown during Parquet decoding.                                                                                                                                                                                                                                                                            | `'pushdown_filters'`                                  | false                    |
+| REORDER_FILTERS                            | No                      | If true, enables heuristic reordering of filters during Parquet decoding.                                                                                                                                                                                                                                                            | `'reorder_filters'`                                   | false                    |
+| SCHEMA_FORCE_VIEW_TYPES                    | No                      | If true, reads Utf8/Binary columns as view types.                                                                                                                                                                                                                                                                                    | `'schema_force_view_types'`                           | true                     |
+| BINARY_AS_STRING                           | No                      | If true, reads Binary columns as strings.                                                                                                                                                                                                                                                                                            | `'binary_as_string'`                                  | false                    |
+| DATA_PAGESIZE_LIMIT                        | No                      | Sets best effort maximum size of data page in bytes.                                                                                                                                                                                                                                                                                 | `'data_pagesize_limit'`                               | 1048576                  |
+| DATA_PAGE_ROW_COUNT_LIMIT                  | No                      | Sets best effort maximum number of rows in data page.                                                                                                                                                                                                                                                                                | `'data_page_row_count_limit'`                         | 20000                    |
+| DICTIONARY_PAGE_SIZE_LIMIT                 | No                      | Sets best effort maximum dictionary page size, in bytes.                                                                                                                                                                                                                                                                             | `'dictionary_page_size_limit'`                        | 1048576                  |
+| WRITE_BATCH_SIZE                           | No                      | Sets write_batch_size in rows.                                                                                                                                                                                                                                                                                                       | `'write_batch_size'`                                  | 1024                     |
+| WRITER_VERSION                             | No                      | Sets the Parquet writer version (`1.0` or `2.0`).                                                                                                                                                                                                                                                                                    | `'writer_version'`                                    | 1.0                      |
+| SKIP_ARROW_METADATA                        | No                      | If true, skips writing Arrow schema information into the Parquet file metadata.                                                                                                                                                                                                                                                      | `'skip_arrow_metadata'`                               | false                    |
+| CREATED_BY                                 | No                      | Sets the "created by" string in the Parquet file metadata.                                                                                                                                                                                                                                                                           | `'created_by'`                                        | datafusion version X.Y.Z |
+| COLUMN_INDEX_TRUNCATE_LENGTH               | No                      | Sets the length (in bytes) to truncate min/max values in column indexes.                                                                                                                                                                                                                                                             | `'column_index_truncate_length'`                      | 64                       |
+| STATISTICS_TRUNCATE_LENGTH                 | No                      | Sets statistics truncate length.                                                                                                                                                                                                                                                                                                     | `'statistics_truncate_length'`                        | None                     |
+| BLOOM_FILTER_ON_WRITE                      | No                      | Sets whether bloom filters should be written for all columns by default (can be overridden per column).                                                                                                                                                                                                                              | `'bloom_filter_on_write'`                             | false                    |
+| ALLOW_SINGLE_FILE_PARALLELISM              | No                      | Enables parallel serialization of columns in a single file.                                                                                                                                                                                                                                                                          | `'allow_single_file_parallelism'`                     | true                     |
+| MAXIMUM_PARALLEL_ROW_GROUP_WRITERS         | No                      | Maximum number of parallel row group writers.                                                                                                                                                                                                                                                                                        | `'maximum_parallel_row_group_writers'`                | 1                        |
+| MAXIMUM_BUFFERED_RECORD_BATCHES_PER_STREAM | No                      | Maximum number of buffered record batches per stream.                                                                                                                                                                                                                                                                                | `'maximum_buffered_record_batches_per_stream'`        | 2                        |
+| KEY_VALUE_METADATA                         | No (Key is specific)    | Adds custom key-value pairs to the file metadata. Use the format `'metadata::your_key_name' 'your_value'`. Multiple entries allowed.                                                                                                                                                                                                 | `'metadata::key_name'`                                | None                     |
 
 **Example:**
 
diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst
index a13d40334b639..f1fef45f705a8 100644
--- a/docs/source/user-guide/sql/index.rst
+++ b/docs/source/user-guide/sql/index.rst
@@ -22,6 +22,7 @@ SQL Reference
    :maxdepth: 2
 
    data_types
+   struct_coercion
    select
    subqueries
    ddl
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index cbcec710e267f..ae5dbd5bee757 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -81,6 +81,17 @@ abs(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT abs(-5);
++----------+
+| abs(-5)  |
++----------+
+| 5        |
++----------+
+```
+
 ### `acos`
 
 Returns the arc cosine or inverse cosine of a number.
@@ -93,6 +104,17 @@ acos(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT acos(1);
++----------+
+| acos(1)  |
++----------+
+| 0.0      |
++----------+
+```
+
 ### `acosh`
 
 Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number.
@@ -105,6 +127,17 @@ acosh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT acosh(2);
++------------+
+| acosh(2)   |
++------------+
+| 1.31696    |
++------------+
+```
+
 ### `asin`
 
 Returns the arc sine or inverse sine of a number.
@@ -117,6 +150,17 @@ asin(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT asin(0.5);
++------------+
+| asin(0.5)  |
++------------+
+| 0.5235988  |
++------------+
+```
+
 ### `asinh`
 
 Returns the area hyperbolic sine or inverse hyperbolic sine of a number.
@@ -129,6 +173,17 @@ asinh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT asinh(1);
++------------+
+| asinh(1)   |
++------------+
+| 0.8813736  |
++------------+
+```
+
 ### `atan`
 
 Returns the arc tangent or inverse tangent of a number.
@@ -141,6 +196,17 @@ atan(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+    > SELECT atan(1);
++-----------+
+| atan(1)   |
++-----------+
+| 0.7853982 |
++-----------+
+```
+
 ### `atan2`
 
 Returns the arc tangent or inverse tangent of `expression_y / expression_x`.
@@ -156,6 +222,17 @@ atan2(expression_y, expression_x)
 - **expression_x**: Second numeric expression to operate on.
   Can be a constant, column, or function, and any combination of arithmetic operators.
 
+#### Example
+
+```sql
+> SELECT atan2(1, 1);
++------------+
+| atan2(1,1) |
++------------+
+| 0.7853982  |
++------------+
+```
+
 ### `atanh`
 
 Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number.
@@ -168,6 +245,17 @@ atanh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+    > SELECT atanh(0.5);
++-------------+
+| atanh(0.5)  |
++-------------+
+| 0.5493061   |
++-------------+
+```
+
 ### `cbrt`
 
 Returns the cube root of a number.
@@ -180,6 +268,17 @@ cbrt(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT cbrt(27);
++-----------+
+| cbrt(27)  |
++-----------+
+| 3.0       |
++-----------+
+```
+
 ### `ceil`
 
 Returns the nearest integer greater than or equal to a number.
@@ -192,6 +291,17 @@ ceil(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT ceil(3.14);
++------------+
+| ceil(3.14) |
++------------+
+| 4.0        |
++------------+
+```
+
 ### `cos`
 
 Returns the cosine of a number.
@@ -204,6 +314,17 @@ cos(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT cos(0);
++--------+
+| cos(0) |
++--------+
+| 1.0    |
++--------+
+```
+
 ### `cosh`
 
 Returns the hyperbolic cosine of a number.
@@ -216,6 +337,17 @@ cosh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT cosh(1);
++-----------+
+| cosh(1)   |
++-----------+
+| 1.5430806 |
++-----------+
+```
+
 ### `cot`
 
 Returns the cotangent of a number.
@@ -228,6 +360,17 @@ cot(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT cot(1);
++---------+
+| cot(1)  |
++---------+
+| 0.64209 |
++---------+
+```
+
 ### `degrees`
 
 Converts radians to degrees.
@@ -240,6 +383,17 @@ degrees(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+    > SELECT degrees(pi());
++------------+
+| degrees(0) |
++------------+
+| 180.0      |
++------------+
+```
+
 ### `exp`
 
 Returns the base-e exponential of a number.
@@ -252,6 +406,17 @@ exp(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT exp(1);
++---------+
+| exp(1)  |
++---------+
+| 2.71828 |
++---------+
+```
+
 ### `factorial`
 
 Factorial. Returns 1 if value is less than 2.
@@ -264,6 +429,17 @@ factorial(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT factorial(5);
++---------------+
+| factorial(5)  |
++---------------+
+| 120           |
++---------------+
+```
+
 ### `floor`
 
 Returns the nearest integer less than or equal to a number.
@@ -276,6 +452,17 @@ floor(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT floor(3.14);
++-------------+
+| floor(3.14) |
++-------------+
+| 3.0         |
++-------------+
+```
+
 ### `gcd`
 
 Returns the greatest common divisor of `expression_x` and `expression_y`. Returns 0 if both inputs are zero.
@@ -289,6 +476,17 @@ gcd(expression_x, expression_y)
 - **expression_x**: First numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_y**: Second numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT gcd(48, 18);
++------------+
+| gcd(48,18) |
++------------+
+| 6          |
++------------+
+```
+
 ### `isnan`
 
 Returns true if a given number is +NaN or -NaN otherwise returns false.
@@ -301,6 +499,17 @@ isnan(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT isnan(1);
++----------+
+| isnan(1) |
++----------+
+| false    |
++----------+
+```
+
 ### `iszero`
 
 Returns true if a given number is +0.0 or -0.0 otherwise returns false.
@@ -313,6 +522,17 @@ iszero(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT iszero(0);
++------------+
+| iszero(0)  |
++------------+
+| true       |
++------------+
+```
+
 ### `lcm`
 
 Returns the least common multiple of `expression_x` and `expression_y`. Returns 0 if either input is zero.
@@ -326,6 +546,17 @@ lcm(expression_x, expression_y)
 - **expression_x**: First numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **expression_y**: Second numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT lcm(4, 5);
++----------+
+| lcm(4,5) |
++----------+
+| 20       |
++----------+
+```
+
 ### `ln`
 
 Returns the natural logarithm of a number.
@@ -338,6 +569,17 @@ ln(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT ln(2.71828);
++-------------+
+| ln(2.71828) |
++-------------+
+| 1.0         |
++-------------+
+```
+
 ### `log`
 
 Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.
@@ -352,6 +594,17 @@ log(numeric_expression)
 - **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT log(10);
++---------+
+| log(10) |
++---------+
+| 1.0     |
++---------+
+```
+
 ### `log10`
 
 Returns the base-10 logarithm of a number.
@@ -364,6 +617,17 @@ log10(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT log10(100);
++-------------+
+| log10(100)  |
++-------------+
+| 2.0         |
++-------------+
+```
+
 ### `log2`
 
 Returns the base-2 logarithm of a number.
@@ -376,6 +640,17 @@ log2(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT log2(8);
++-----------+
+| log2(8)   |
++-----------+
+| 3.0       |
++-----------+
+```
+
 ### `nanvl`
 
 Returns the first argument if it's not _NaN_.
@@ -390,6 +665,17 @@ nanvl(expression_x, expression_y)
 - **expression_x**: Numeric expression to return if it's not _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators.
 - **expression_y**: Numeric expression to return if the first expression is _NaN_. Can be a constant, column, or function, and any combination of arithmetic operators.
 
+#### Example
+
+```sql
+> SELECT nanvl(0, 5);
++------------+
+| nanvl(0,5) |
++------------+
+| 0          |
++------------+
+```
+
 ### `pi`
 
 Returns an approximate value of π.
@@ -415,6 +701,17 @@ power(base, exponent)
 - **base**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **exponent**: Exponent numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT power(2, 3);
++-------------+
+| power(2,3)  |
++-------------+
+| 8           |
++-------------+
+```
+
 #### Aliases
 
 - pow
@@ -431,6 +728,17 @@ radians(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT radians(180);
++----------------+
+| radians(180)   |
++----------------+
+| 3.14159265359  |
++----------------+
+```
+
 ### `random`
 
 Returns a random float value in the range [0, 1).
@@ -440,6 +748,17 @@ The random seed is unique to each row.
 random()
 ```
 
+#### Example
+
+```sql
+> SELECT random();
++------------------+
+| random()         |
++------------------+
+| 0.7389238902938  |
++------------------+
+```
+
 ### `round`
 
 Rounds a number to the nearest integer.
@@ -453,6 +772,17 @@ round(numeric_expression[, decimal_places])
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **decimal_places**: Optional. The number of decimal places to round to. Defaults to 0.
 
+#### Example
+
+```sql
+> SELECT round(3.14159);
++--------------+
+| round(3.14159)|
++--------------+
+| 3.0          |
++--------------+
+```
+
 ### `signum`
 
 Returns the sign of a number.
@@ -467,6 +797,17 @@ signum(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT signum(-42);
++-------------+
+| signum(-42) |
++-------------+
+| -1          |
++-------------+
+```
+
 ### `sin`
 
 Returns the sine of a number.
@@ -479,6 +820,17 @@ sin(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT sin(0);
++----------+
+| sin(0)   |
++----------+
+| 0.0      |
++----------+
+```
+
 ### `sinh`
 
 Returns the hyperbolic sine of a number.
@@ -491,6 +843,17 @@ sinh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT sinh(1);
++-----------+
+| sinh(1)   |
++-----------+
+| 1.1752012 |
++-----------+
+```
+
 ### `sqrt`
 
 Returns the square root of a number.
@@ -515,6 +878,17 @@ tan(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+> SELECT tan(pi()/4);
++--------------+
+| tan(PI()/4)  |
++--------------+
+| 1.0          |
++--------------+
+```
+
 ### `tanh`
 
 Returns the hyperbolic tangent of a number.
@@ -527,6 +901,17 @@ tanh(numeric_expression)
 
 - **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators.
 
+#### Example
+
+```sql
+  > SELECT tanh(20);
+  +----------+
+  | tanh(20) |
+  +----------+
+  | 1.0      |
+  +----------+
+```
+
 ### `trunc`
 
 Truncates a number to a whole number or truncated to the specified decimal places.
@@ -544,6 +929,17 @@ trunc(numeric_expression[, decimal_places])
   right of the decimal point. If `decimal_places` is a negative
   integer, replaces digits to the left of the decimal point with `0`.
 
+#### Example
+
+```sql
+> SELECT trunc(42.738);
++----------------+
+| trunc(42.738)  |
++----------------+
+| 42             |
++----------------+
+```
+
 ## Conditional Functions
 
 - [coalesce](#coalesce)
@@ -660,7 +1056,7 @@ nullif(expression1, expression2)
 
 ### `nvl`
 
-Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_.
+Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_ and _expression2_ is not evaluated. This function can be used to substitute a default value for NULL values.
 
 ```sql
 nvl(expression1, expression2)
@@ -768,7 +1164,7 @@ nvl2(expression1, expression2, expression3)
 
 ### `ascii`
 
-Returns the Unicode character code of the first character in a string.
+Returns the first Unicode scalar value of a string.
 
 ```sql
 ascii(str)
@@ -829,7 +1225,7 @@ bit_length(str)
 
 ### `btrim`
 
-Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.
+Trims the specified trim string from the start and end of a string. If no trim string is provided, all spaces are removed from the start and end of the input string.
 
 ```sql
 btrim(str[, trim_str])
@@ -838,7 +1234,7 @@ btrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **trim_str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._
+- **trim_str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is a space._
 
 #### Example
 
@@ -909,7 +1305,7 @@ character_length(str)
 
 ### `chr`
 
-Returns the character with the specified ASCII or Unicode code value.
+Returns a string containing the character with the specified Unicode scalar value.
 
 ```sql
 chr(expression)
@@ -1196,7 +1592,7 @@ lpad(str, n[, padding_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **n**: String length to pad to.
+- **n**: String length to pad to. If the input string is longer than this length, it is truncated (on the right).
 - **padding_str**: Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._
 
 #### Example
@@ -1216,7 +1612,7 @@ lpad(str, n[, padding_str])
 
 ### `ltrim`
 
-Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string.
+Trims the specified trim string from the beginning of a string. If no trim string is provided, spaces are removed from the start of the input string.
 
 ```sql
 ltrim(str[, trim_str])
@@ -1225,7 +1621,7 @@ ltrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **trim_str**: String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is a space._
 
 #### Example
 
@@ -1424,7 +1820,7 @@ rpad(str, n[, padding_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **n**: String length to pad to.
+- **n**: String length to pad to. If the input string is longer than this length, it is truncated.
 - **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._
 
 #### Example
@@ -1444,7 +1840,7 @@ rpad(str, n[, padding_str])
 
 ### `rtrim`
 
-Trims the specified trim string from the end of a string. If no trim string is provided, all whitespace is removed from the end of the input string.
+Trims the specified trim string from the end of a string. If no trim string is provided, all spaces are removed from the end of the input string.
 
 ```sql
 rtrim(str[, trim_str])
@@ -1453,7 +1849,7 @@ rtrim(str[, trim_str])
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **trim_str**: String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._
+- **trim_str**: String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is a space._
 
 #### Example
 
@@ -1495,7 +1891,7 @@ split_part(str, delimiter, pos)
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
 - **delimiter**: String or character to split on.
-- **pos**: Position of the part to return.
+- **pos**: Position of the part to return (counting from 1). Negative values count backward from the end of the string.
 
 #### Example
 
@@ -1672,17 +2068,17 @@ to_hex(int)
 
 ### `translate`
 
-Translates characters in a string to specified translation characters.
+Performs character-wise substitution based on a mapping.
 
 ```sql
-translate(str, chars, translation)
+translate(str, from, to)
 ```
 
 #### Arguments
 
 - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **chars**: Characters to translate.
-- **translation**: Translation characters. Translation characters replace only characters at the same position in the **chars** string.
+- **from**: The characters to be replaced.
+- **to**: The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed. If a character appears more than once in **from**, the first occurrence determines the mapping.
 
 #### Example
 
@@ -1729,7 +2125,7 @@ upper(str)
 
 ### `uuid`
 
-Returns [`UUID v4`](<https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_(random)>) string value which is unique per row.
+Returns [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_%28random%29) string value which is unique per row.
 
 ```sql
 uuid()
@@ -1779,7 +2175,7 @@ encode(expression, format)
 #### Arguments
 
 - **expression**: Expression containing string or binary data
-- **format**: Supported formats are: `base64`, `hex`
+- **format**: Supported formats are: `base64`, `base64pad`, `hex`
 
 **Related functions**:
 
@@ -1793,6 +2189,7 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
 The following regular expression functions are supported:
 
 - [regexp_count](#regexp_count)
+- [regexp_instr](#regexp_instr)
 - [regexp_like](#regexp_like)
 - [regexp_match](#regexp_match)
 - [regexp_replace](#regexp_replace)
@@ -1828,6 +2225,39 @@ regexp_count(str, regexp[, start, flags])
 +---------------------------------------------------------------+
 ```
 
+### `regexp_instr`
+
+Returns the position in a string where the specified occurrence of a POSIX regular expression is located.
+
+```sql
+regexp_instr(str, regexp[, start[, N[, flags[, subexpr]]]])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **regexp**: Regular expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **start**: - **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function. Defaults to 1
+- **N**: - **N**: Optional The N-th occurrence of pattern to find. Defaults to 1 (first match). Can be a constant, column, or function.
+- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?
+- **subexpr**: Optional Specifies which capture group (subexpression) to return the position for. Defaults to 0, which returns the position of the entire match.
+
+#### Example
+
+```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```
+
 ### `regexp_like`
 
 Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.
@@ -1864,7 +2294,7 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
 +--------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 
 ### `regexp_match`
 
@@ -1903,7 +2333,7 @@ regexp_match(str, regexp[, flags])
             +---------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 
 ### `regexp_replace`
 
@@ -1944,7 +2374,7 @@ SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
 +-------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
 
 ## Time and Date Functions
 
@@ -1959,10 +2389,12 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
 - [datetrunc](#datetrunc)
 - [from_unixtime](#from_unixtime)
 - [make_date](#make_date)
+- [make_time](#make_time)
 - [now](#now)
 - [to_char](#to_char)
 - [to_date](#to_date)
 - [to_local_time](#to_local_time)
+- [to_time](#to_time)
 - [to_timestamp](#to_timestamp)
 - [to_timestamp_micros](#to_timestamp_micros)
 - [to_timestamp_millis](#to_timestamp_millis)
@@ -1973,12 +2405,14 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
 
 ### `current_date`
 
-Returns the current UTC date.
+Returns the current date in the session time zone.
 
 The `current_date()` return value is determined at query time and will return the same date, no matter when in the query plan the function executes.
 
 ```sql
 current_date()
+    (optional) SET datafusion.execution.time_zone = '+00:00';
+    SELECT current_date();
 ```
 
 #### Aliases
@@ -1987,12 +2421,16 @@ current_date()
 
 ### `current_time`
 
-Returns the current UTC time.
+Returns the current time in the session time zone.
 
 The `current_time()` return value is determined at query time and will return the same time, no matter when in the query plan the function executes.
 
+The session time zone can be set using the statement 'SET datafusion.execution.time_zone = desired time zone'. The time zone can be a value like +00:00, 'Europe/London' etc.
+
 ```sql
 current_time()
+    (optional) SET datafusion.execution.time_zone = '+00:00';
+    SELECT current_time();
 ```
 
 ### `current_timestamp`
@@ -2051,6 +2489,17 @@ FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z')  t(time);
 | 2023-01-03T03:00:00 |
 +---------------------+
 2 row(s) fetched.
+
+-- Bin the time into 15 minute intervals starting at 1 min
+>  SELECT date_bin(interval '15 minutes', time, TIME '00:01:00') as bin
+FROM VALUES (TIME '02:18:18'), (TIME '19:00:03')  t(time);
++----------+
+| bin      |
++----------+
+| 02:16:00 |
+| 18:46:00 |
++----------+
+2 row(s) fetched.
 ```
 
 ### `date_format`
@@ -2070,6 +2519,7 @@ date_part(part, expression)
 - **part**: Part of the date to return. The following date parts are supported:
 
   - year
+  - isoyear (ISO 8601 week-numbering year)
   - quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in)
   - month
   - week (week of the year)
@@ -2080,9 +2530,10 @@ date_part(part, expression)
   - millisecond
   - microsecond
   - nanosecond
-  - dow (day of the week)
+  - dow (day of the week where Sunday is 0)
   - doy (day of the year)
-  - epoch (seconds since Unix epoch)
+  - epoch (seconds since Unix epoch for timestamps/dates, total seconds for intervals)
+  - isodow (day of the week where Monday is 0)
 
 - **expression**: Time expression to operate on. Can be a constant, column, or function.
 
@@ -2098,7 +2549,7 @@ extract(field FROM source)
 
 ### `date_trunc`
 
-Truncates a timestamp value to a specified precision.
+Truncates a timestamp or time value to a specified precision.
 
 ```sql
 date_trunc(precision, expression)
@@ -2108,6 +2559,8 @@ date_trunc(precision, expression)
 
 - **precision**: Time precision to truncate to. The following precisions are supported:
 
+  For Timestamp types:
+
   - year / YEAR
   - quarter / QUARTER
   - month / MONTH
@@ -2116,8 +2569,18 @@ date_trunc(precision, expression)
   - hour / HOUR
   - minute / MINUTE
   - second / SECOND
+  - millisecond / MILLISECOND
+  - microsecond / MICROSECOND
 
-- **expression**: Time expression to operate on. Can be a constant, column, or function.
+  For Time types (hour, minute, second, millisecond, microsecond only):
+
+  - hour / HOUR
+  - minute / MINUTE
+  - second / SECOND
+  - millisecond / MILLISECOND
+  - microsecond / MICROSECOND
+
+- **expression**: Timestamp or time expression to operate on. Can be a constant, column, or function.
 
 #### Aliases
 
@@ -2186,11 +2649,44 @@ make_date(year, month, day)
 +-----------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/make_date.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
+
+### `make_time`
+
+Make a time from hour/minute/second component parts.
+
+```sql
+make_time(hour, minute, second)
+```
+
+#### Arguments
+
+- **hour**: Hour to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators.
+- **minute**: Minute to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators.
+- **second**: Second to use when making the time. Can be a constant, column or function, and any combination of arithmetic operators.
+
+#### Example
+
+```sql
+> select make_time(13, 23, 1);
++-------------------------------------------+
+| make_time(Int64(13),Int64(23),Int64(1))   |
++-------------------------------------------+
+| 13:23:01                                  |
++-------------------------------------------+
+> select make_time('23', '01', '31');
++-----------------------------------------------+
+| make_time(Utf8("23"),Utf8("01"),Utf8("31"))   |
++-----------------------------------------------+
+| 23:01:31                                      |
++-----------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `now`
 
-Returns the current UTC timestamp.
+Returns the current timestamp in the system configured timezone (None by default).
 
 The `now()` return value is determined at query time and will return the same timestamp, no matter when in the query plan the function executes.
 
@@ -2227,7 +2723,7 @@ to_char(expression, format)
 +----------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 #### Aliases
 
@@ -2236,7 +2732,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo
 ### `to_date`
 
 Converts a value to a date (`YYYY-MM-DD`).
-Supports strings, integer and double types as input.
+Supports strings, numeric and timestamp types as input.
 Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided.
 Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`).
 Returns the corresponding date.
@@ -2271,7 +2767,7 @@ to_date('2017-05-31', '%Y-%m-%d')
 +---------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_local_time`
 
@@ -2310,11 +2806,11 @@ to_local_time(expression)
 FROM (
   SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time
 );
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
-| time                      | type                                           | to_local_time       | to_local_time_type          |
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
-| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) |
-+---------------------------+------------------------------------------------+---------------------+-----------------------------+
++---------------------------+----------------------------------+---------------------+--------------------+
+| time                      | type                             | to_local_time       | to_local_time_type |
++---------------------------+----------------------------------+---------------------+--------------------+
+| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns)      |
++---------------------------+----------------------------------+---------------------+--------------------+
 
 # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather
 # than UTC boundaries
@@ -2334,11 +2830,68 @@ FROM (
 +---------------------------+
 ```
 
+### `to_time`
+
+Converts a value to a time (`HH:MM:SS.nnnnnnnnn`).
+Supports strings and timestamps as input.
+Strings are parsed as `HH:MM:SS`, `HH:MM:SS.nnnnnnnnn`, or `HH:MM` if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided.
+Timestamps will have the time portion extracted.
+Returns the corresponding time.
+
+Note: `to_time` returns Time64(Nanosecond), which represents the time of day in nanoseconds since midnight.
+
+```sql
+to_time('12:30:45', '%H:%M:%S')
+```
+
+#### Arguments
+
+- **expression**: String or Timestamp expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order
+  they appear with the first successful one being returned. If none of the formats successfully parse the expression
+  an error will be returned.
+
+#### Example
+
+```sql
+> select to_time('12:30:45');
++---------------------------+
+| to_time(Utf8("12:30:45")) |
++---------------------------+
+| 12:30:45                  |
++---------------------------+
+> select to_time('12-30-45', '%H-%M-%S');
++--------------------------------------------+
+| to_time(Utf8("12-30-45"),Utf8("%H-%M-%S")) |
++--------------------------------------------+
+| 12:30:45                                   |
++--------------------------------------------+
+> select to_time('2024-01-15 14:30:45'::timestamp);
++--------------------------------------------------+
+| to_time(Utf8("2024-01-15 14:30:45"))             |
++--------------------------------------------------+
+| 14:30:45                                         |
++--------------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
+
 ### `to_timestamp`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).
 
-Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds.
+Note: `to_timestamp` returns `Timestamp(ns, TimeZone)` where the time zone is the session time zone. The supported range
+for integer input is between`-9223372037` and `9223372036`. Supported range for string input is between
+`1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds`
+for the input outside of supported bounds.
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 
 ```sql
 to_timestamp(expression[, ..., format_n])
@@ -2347,7 +2900,11 @@ to_timestamp(expression[, ..., format_n])
 #### Arguments
 
 - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
-- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned.
+- **format_n**:
+  Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+  Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+  parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+  only supported at the end of the string preceded by a space.
 
 #### Example
 
@@ -2366,11 +2923,19 @@ to_timestamp(expression[, ..., format_n])
 +--------------------------------------------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_timestamp_micros`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`) Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 
 ```sql
 to_timestamp_micros(expression[, ..., format_n])
@@ -2379,7 +2944,11 @@ to_timestamp_micros(expression[, ..., format_n])
 #### Arguments
 
 - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
-- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned.
+- **format_n**:
+  Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+  Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+  parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+  only supported at the end of the string preceded by a space.
 
 #### Example
 
@@ -2398,11 +2967,19 @@ to_timestamp_micros(expression[, ..., format_n])
 +---------------------------------------------------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_timestamp_millis`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. Integers and unsigned integers are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 
 ```sql
 to_timestamp_millis(expression[, ..., format_n])
@@ -2411,7 +2988,11 @@ to_timestamp_millis(expression[, ..., format_n])
 #### Arguments
 
 - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
-- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned.
+- **format_n**:
+  Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+  Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+  parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+  only supported at the end of the string preceded by a space.
 
 #### Example
 
@@ -2430,11 +3011,18 @@ to_timestamp_millis(expression[, ..., format_n])
 +---------------------------------------------------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_timestamp_nanos`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Strings that parse without a time zone are treated as if they are in the
+session time zone. Integers, unsigned integers, and doubles are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 
 ```sql
 to_timestamp_nanos(expression[, ..., format_n])
@@ -2443,7 +3031,11 @@ to_timestamp_nanos(expression[, ..., format_n])
 #### Arguments
 
 - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
-- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned.
+- **format_n**:
+  Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+  Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+  parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+  only supported at the end of the string preceded by a space.
 
 #### Example
 
@@ -2462,11 +3054,19 @@ to_timestamp_nanos(expression[, ..., format_n])
 +---------------------------------------------------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_timestamp_seconds`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and unsigned integers are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00<TZ>`) in the session time zone. Supports strings,
+integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Strings that parse without a time zone are treated as if they are in the
+session time zone, or UTC if no session time zone is set.
+Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).
+
+The session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.
+The time zone can be a value like +00:00, 'Europe/London' etc.
 
 ```sql
 to_timestamp_seconds(expression[, ..., format_n])
@@ -2475,7 +3075,11 @@ to_timestamp_seconds(expression[, ..., format_n])
 #### Arguments
 
 - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.
-- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned.
+- **format_n**:
+  Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression.
+  Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully
+  parse the expression an error will be returned. Note: parsing of named timezones (e.g. 'America/New_York') using %Z is
+  only supported at the end of the string preceded by a space.
 
 #### Example
 
@@ -2494,11 +3098,15 @@ to_timestamp_seconds(expression[, ..., format_n])
 +----------------------------------------------------------------------------------------------------------------+
 ```
 
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_timestamp.rs)
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/date_time.rs)
 
 ### `to_unixtime`
 
-Converts a value to seconds since the unix epoch (`1970-01-01T00:00:00Z`). Supports strings, dates, timestamps and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Converts a value to seconds since the unix epoch (`1970-01-01T00:00:00`).
+Supports strings, dates, timestamps, integer, unsigned integer, and float types as input.
+Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')
+if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided.
+Integers, unsigned integers, and floats are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00`).
 
 ```sql
 to_unixtime(expression[, ..., format_n])
@@ -2552,6 +3160,7 @@ _Alias of [current_date](#current_date)._
 - [array_join](#array_join)
 - [array_length](#array_length)
 - [array_max](#array_max)
+- [array_min](#array_min)
 - [array_ndims](#array_ndims)
 - [array_pop_back](#array_pop_back)
 - [array_pop_front](#array_pop_front)
@@ -2574,6 +3183,7 @@ _Alias of [current_date](#current_date)._
 - [array_to_string](#array_to_string)
 - [array_union](#array_union)
 - [arrays_overlap](#arrays_overlap)
+- [arrays_zip](#arrays_zip)
 - [cardinality](#cardinality)
 - [empty](#empty)
 - [flatten](#flatten)
@@ -2619,6 +3229,7 @@ _Alias of [current_date](#current_date)._
 - [list_sort](#list_sort)
 - [list_to_string](#list_to_string)
 - [list_union](#list_union)
+- [list_zip](#list_zip)
 - [make_array](#make_array)
 - [make_list](#make_list)
 - [range](#range)
@@ -2934,16 +3545,16 @@ array_has_all(array, sub-array)
 
 ### `array_has_any`
 
-Returns true if any elements exist in both arrays.
+Returns true if the arrays have any elements in common.
 
 ```sql
-array_has_any(array, sub-array)
+array_has_any(array1, array2)
 ```
 
 #### Arguments
 
-- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
-- **sub-array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
+- **array1**: Array expression. Can be a constant, column, or function, and any combination of array operators.
+- **array2**: Array expression. Can be a constant, column, or function, and any combination of array operators.
 
 #### Example
 
@@ -3058,6 +3669,29 @@ array_max(array)
 
 - list_max
 
+### `array_min`
+
+Returns the minimum value in the array.
+
+```sql
+array_min(array)
+```
+
+#### Arguments
+
+- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
+
+#### Example
+
+```sql
+> select array_min([3,1,4,2]);
++-----------------------------------------+
+| array_min(List([3,1,4,2]))              |
++-----------------------------------------+
+| 1                                       |
++-----------------------------------------+
+```
+
 ### `array_ndims`
 
 Returns the number of dimensions of the array.
@@ -3142,7 +3776,7 @@ array_pop_front(array)
 
 ### `array_position`
 
-Returns the position of the first occurrence of the specified element in the array.
+Returns the position of the first occurrence of the specified element in the array, or NULL if not found. Comparisons are done using `IS DISTINCT FROM` semantics, so NULL is considered to match NULL.
 
 ```sql
 array_position(array, element)
@@ -3152,8 +3786,8 @@ array_position(array, element, index)
 #### Arguments
 
 - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
-- **element**: Element to search for position in the array.
-- **index**: Index at which to start searching.
+- **element**: Element to search for in the array.
+- **index**: Index at which to start searching (1-indexed).
 
 #### Example
 
@@ -3189,7 +3823,7 @@ array_positions(array, element)
 #### Arguments
 
 - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
-- **element**: Element to search for position in the array.
+- **element**: Element to search for in the array.
 
 #### Example
 
@@ -3578,7 +4212,7 @@ array_to_string(array, delimiter[, null_string])
 
 - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators.
 - **delimiter**: Array element separator.
-- **null_string**: Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior.
+- **null_string**: Optional. String to use for null values in the output. If not provided, nulls will be omitted.
 
 #### Example
 
@@ -3599,7 +4233,7 @@ array_to_string(array, delimiter[, null_string])
 
 ### `array_union`
 
-Returns an array of elements that are present in both arrays (all elements from both arrays) with out duplicates.
+Returns an array of elements that are present in both arrays (all elements from both arrays) without duplicates.
 
 ```sql
 array_union(array1, array2)
@@ -3635,6 +4269,40 @@ array_union(array1, array2)
 
 _Alias of [array_has_any](#array_has_any)._
 
+### `arrays_zip`
+
+Returns an array of structs created by combining the elements of each input array at the same index. If the arrays have different lengths, shorter arrays are padded with NULLs.
+
+```sql
+arrays_zip(array1[, ..., array_n])
+```
+
+#### Arguments
+
+- **array1**: First array expression.
+- **array_n**: Optional additional array expressions.
+
+#### Example
+
+```sql
+> select arrays_zip([1, 2, 3]);
++---------------------------------------------------+
+| arrays_zip([1, 2, 3])                             |
++---------------------------------------------------+
+| [{1: 1}, {1: 2}, {1: 3}]                          |
++---------------------------------------------------+
+> select arrays_zip([1, 2], [3, 4, 5]);
++---------------------------------------------------+
+| arrays_zip([1, 2], [3, 4, 5])                     |
++---------------------------------------------------+
+| [{1: 1, 2: 3}, {1: 2, 2: 4}, {1: NULL, 2: 5}]     |
++---------------------------------------------------+
+```
+
+#### Aliases
+
+- list_zip
+
 ### `cardinality`
 
 Returns the total number of elements in the array.
@@ -3719,7 +4387,8 @@ flatten(array)
 Similar to the range function, but it includes the upper bound.
 
 ```sql
-generate_series(start, stop, step)
+generate_series(stop)
+generate_series(start, stop[, step])
 ```
 
 #### Arguments
@@ -3903,6 +4572,10 @@ _Alias of [array_to_string](#array_to_string)._
 
 _Alias of [array_union](#array_union)._
 
+### `list_zip`
+
+_Alias of [arrays_zip](#arrays_zip)._
+
 ### `make_array`
 
 Returns an array using the specified input expressions.
@@ -3939,7 +4612,8 @@ _Alias of [make_array](#make_array)._
 Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0.
 
 ```sql
-range(start, stop, step)
+range(stop)
+range(start, stop[, step])
 ```
 
 #### Arguments
@@ -3959,11 +4633,11 @@ range(start, stop, step)
 +-----------------------------------+
 
 > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH);
-+--------------------------------------------------------------+
-| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) |
-+--------------------------------------------------------------+
++--------------------------------------------------------------------------+
+| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH)          |
++--------------------------------------------------------------------------+
 | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] |
-+--------------------------------------------------------------+
++--------------------------------------------------------------------------+
 ```
 
 ### `string_to_array`
@@ -4105,6 +4779,7 @@ select struct(a as field_a, b) from t;
 
 - [element_at](#element_at)
 - [map](#map)
+- [map_entries](#map_entries)
 - [map_extract](#map_extract)
 - [map_keys](#map_keys)
 - [map_values](#map_values)
@@ -4162,6 +4837,30 @@ SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]);
 {key1: value1, key2: }
 ```
 
+### `map_entries`
+
+Returns a list of all entries in the map.
+
+```sql
+map_entries(map)
+```
+
+#### Arguments
+
+- **map**: Map expression. Can be a constant, column, or function, and any combination of map operators.
+
+#### Example
+
+```sql
+SELECT map_entries(MAP {'a': 1, 'b': NULL, 'c': 3});
+----
+[{'key': a, 'value': 1}, {'key': b, 'value': NULL}, {'key': c, 'value': 3}]
+
+SELECT map_entries(map([100, 5], [42, 43]));
+----
+[{'key': 100, 'value': 42}, {'key': 5, 'value': 43}]
+```
+
 ### `map_extract`
 
 Returns a list containing the value for the given key or an empty list if the key is not present in the map.
@@ -4277,11 +4976,11 @@ digest(expression, algorithm)
 
 ```sql
 > select digest('foo', 'sha256');
-+------------------------------------------+
-| digest(Utf8("foo"), Utf8("sha256"))      |
-+------------------------------------------+
-| <binary_hash_result>                     |
-+------------------------------------------+
++------------------------------------------------------------------+
+| digest(Utf8("foo"),Utf8("sha256"))                               |
++------------------------------------------------------------------+
+| 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae |
++------------------------------------------------------------------+
 ```
 
 ### `md5`
@@ -4300,11 +4999,11 @@ md5(expression)
 
 ```sql
 > select md5('foo');
-+-------------------------------------+
-| md5(Utf8("foo"))                    |
-+-------------------------------------+
-| <md5_checksum_result>               |
-+-------------------------------------+
++----------------------------------+
+| md5(Utf8("foo"))                 |
++----------------------------------+
+| acbd18db4cc2f85cedef654fccc4a4d8 |
++----------------------------------+
 ```
 
 ### `sha224`
@@ -4323,11 +5022,11 @@ sha224(expression)
 
 ```sql
 > select sha224('foo');
-+------------------------------------------+
-| sha224(Utf8("foo"))                      |
-+------------------------------------------+
-| <sha224_hash_result>                     |
-+------------------------------------------+
++----------------------------------------------------------+
+| sha224(Utf8("foo"))                                      |
++----------------------------------------------------------+
+| 0808f64e60d58979fcb676c96ec938270dea42445aeefcd3a4e6f8db |
++----------------------------------------------------------+
 ```
 
 ### `sha256`
@@ -4346,11 +5045,11 @@ sha256(expression)
 
 ```sql
 > select sha256('foo');
-+--------------------------------------+
-| sha256(Utf8("foo"))                  |
-+--------------------------------------+
-| <sha256_hash_result>                 |
-+--------------------------------------+
++------------------------------------------------------------------+
+| sha256(Utf8("foo"))                                              |
++------------------------------------------------------------------+
+| 2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae |
++------------------------------------------------------------------+
 ```
 
 ### `sha384`
@@ -4369,11 +5068,11 @@ sha384(expression)
 
 ```sql
 > select sha384('foo');
-+-----------------------------------------+
-| sha384(Utf8("foo"))                     |
-+-----------------------------------------+
-| <sha384_hash_result>                    |
-+-----------------------------------------+
++--------------------------------------------------------------------------------------------------+
+| sha384(Utf8("foo"))                                                                              |
++--------------------------------------------------------------------------------------------------+
+| 98c11ffdfdd540676b1a137cb1a22b2a70350c9a44171d6b1180c6be5cbb2ee3f79d532c8a1dd9ef2e8e08e752a3babb |
++--------------------------------------------------------------------------------------------------+
 ```
 
 ### `sha512`
@@ -4392,11 +5091,11 @@ sha512(expression)
 
 ```sql
 > select sha512('foo');
-+-------------------------------------------+
-| sha512(Utf8("foo"))                       |
-+-------------------------------------------+
-| <sha512_hash_result>                      |
-+-------------------------------------------+
++----------------------------------------------------------------------------------------------------------------------------------+
+| sha512(Utf8("foo"))                                                                                                              |
++----------------------------------------------------------------------------------------------------------------------------------+
+| f7fbba6e0636f890e56fbbf3283e524c6fa3204ae298382d624741d0dc6638326e282c41be5e4254d8820772c5518a2c5a8c0c7f7eda19594a7eb539453e1ed7 |
++----------------------------------------------------------------------------------------------------------------------------------+
 ```
 
 ## Union Functions
@@ -4464,6 +5163,7 @@ union_tag(union_expression)
 ## Other Functions
 
 - [arrow_cast](#arrow_cast)
+- [arrow_metadata](#arrow_metadata)
 - [arrow_typeof](#arrow_typeof)
 - [get_field](#get_field)
 - [version](#version)
@@ -4484,16 +5184,56 @@ arrow_cast(expression, datatype)
 #### Example
 
 ```sql
-> select arrow_cast(-5, 'Int8') as a,
+> select
+  arrow_cast(-5,    'Int8') as a,
   arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b,
-  arrow_cast('bar', 'LargeUtf8') as c,
-  arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d
-  ;
-+----+-----+-----+---------------------------+
-| a  | b   | c   | d                         |
-+----+-----+-----+---------------------------+
-| -5 | foo | bar | 2023-01-02T12:53:02+08:00 |
-+----+-----+-----+---------------------------+
+  arrow_cast('bar', 'LargeUtf8') as c;
+
++----+-----+-----+
+| a  | b   | c   |
++----+-----+-----+
+| -5 | foo | bar |
++----+-----+-----+
+
+> select
+  arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d,
+  arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e;
+
++---------------------------+---------------------+
+| d                         | e                   |
++---------------------------+---------------------+
+| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 |
++---------------------------+---------------------+
+```
+
+### `arrow_metadata`
+
+Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.
+
+```sql
+arrow_metadata(expression[, key])
+```
+
+#### Arguments
+
+- **expression**: The expression to retrieve metadata from. Can be a column or other expression.
+- **key**: Optional. The specific metadata key to retrieve.
+
+#### Example
+
+```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
 ```
 
 ### `arrow_typeof`
@@ -4522,44 +5262,53 @@ arrow_typeof(expression)
 ### `get_field`
 
 Returns a field within a map or a struct with the given key.
+Supports nested field access by providing multiple field names.
 Note: most users invoke `get_field` indirectly via field access
 syntax such as `my_struct_col['field_name']` which results in a call to
 `get_field(my_struct_col, 'field_name')`.
+Nested access like `my_struct['a']['b']` is optimized to a single call:
+`get_field(my_struct, 'a', 'b')`.
 
 ```sql
-get_field(expression1, expression2)
+get_field(expression, field_name[, field_name2, ...])
 ```
 
 #### Arguments
 
-- **expression1**: The map or struct to retrieve a field for.
-- **expression2**: The field name in the map or struct to retrieve data for. Must evaluate to a string.
+- **expression**: The map or struct to retrieve a field from.
+- **field_name**: The field name(s) to access, in order for nested access. Must evaluate to strings.
 
 #### Example
 
 ```sql
-> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow');
-> select struct(idx, v) from t as c;
-+-------------------------+
-| struct(c.idx,c.v)       |
-+-------------------------+
-| {c0: data, c1: fusion}  |
-| {c0: apache, c1: arrow} |
-+-------------------------+
-> select get_field((select struct(idx, v) from t), 'c0');
-+-----------------------+
-| struct(t.idx,t.v)[c0] |
-+-----------------------+
-| data                  |
-| apache                |
-+-----------------------+
-> select get_field((select struct(idx, v) from t), 'c1');
-+-----------------------+
-| struct(t.idx,t.v)[c1] |
-+-----------------------+
-| fusion                |
-| arrow                 |
-+-----------------------+
+> -- Access a field from a struct column
+> create table test( struct_col) as values
+    ({name: 'Alice', age: 30}),
+    ({name: 'Bob', age: 25});
+> select struct_col from test;
++-----------------------------+
+| struct_col                  |
++-----------------------------+
+| {name: Alice, age: 30}      |
+| {name: Bob, age: 25}        |
++-----------------------------+
+> select struct_col['name'] as name from test;
++-------+
+| name  |
++-------+
+| Alice |
+| Bob   |
++-------+
+
+> -- Nested field access with multiple arguments
+> create table test(struct_col) as values
+    ({outer: {inner_val: 42}});
+> select struct_col['outer']['inner_val'] as result from test;
++--------+
+| result |
++--------+
+| 42     |
++--------+
 ```
 
 ### `version`
diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md
index b2fa0a6305888..baacf432f5fde 100644
--- a/docs/source/user-guide/sql/select.md
+++ b/docs/source/user-guide/sql/select.md
@@ -35,10 +35,12 @@ DataFusion supports the following syntax for queries:
 [ [WHERE](#where-clause) condition ] <br/>
 [ [GROUP BY](#group-by-clause) grouping_element [, ...] ] <br/>
 [ [HAVING](#having-clause) condition] <br/>
+[ [QUALIFY](#qualify-clause) condition] <br/>
 [ [UNION](#union-clause) [ ALL | select ] <br/>
 [ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ] <br/>
 [ [LIMIT](#limit-clause) count ] <br/>
 [ [EXCLUDE | EXCEPT](#exclude-and-except-clause) ] <br/>
+[Pipe operators](#pipe-operators) <br/>
 
 </code>
 
@@ -84,7 +86,7 @@ SELECT a FROM table WHERE a > 10
 
 ## JOIN clause
 
-DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, `NATURAL JOIN` and `CROSS JOIN`.
+DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, `NATURAL JOIN`, `CROSS JOIN`, `LEFT SEMI JOIN`, `RIGHT SEMI JOIN`, `LEFT ANTI JOIN`, and `RIGHT ANTI JOIN`.
 
 The following examples are based on this table:
 
@@ -102,7 +104,7 @@ select * from x;
 The keywords `JOIN` or `INNER JOIN` define a join that only shows rows where there is a match in both tables.
 
 ```sql
-select * from x inner join x y ON x.column_1 = y.column_1;
+SELECT * FROM x INNER JOIN x y ON x.column_1 = y.column_1;
 +----------+----------+----------+----------+
 | column_1 | column_2 | column_1 | column_2 |
 +----------+----------+----------+----------+
@@ -116,7 +118,7 @@ The keywords `LEFT JOIN` or `LEFT OUTER JOIN` define a join that includes all ro
 is not a match in the right table. When there is no match, null values are produced for the right side of the join.
 
 ```sql
-select * from x left join x y ON x.column_1 = y.column_2;
+SELECT * FROM x LEFT JOIN x y ON x.column_1 = y.column_2;
 +----------+----------+----------+----------+
 | column_1 | column_2 | column_1 | column_2 |
 +----------+----------+----------+----------+
@@ -130,7 +132,7 @@ The keywords `RIGHT JOIN` or `RIGHT OUTER JOIN` define a join that includes all
 is not a match in the left table. When there is no match, null values are produced for the left side of the join.
 
 ```sql
-select * from x right join x y ON x.column_1 = y.column_2;
+SELECT * FROM x RIGHT JOIN x y ON x.column_1 = y.column_2;
 +----------+----------+----------+----------+
 | column_1 | column_2 | column_1 | column_2 |
 +----------+----------+----------+----------+
@@ -145,7 +147,7 @@ The keywords `FULL JOIN` or `FULL OUTER JOIN` define a join that is effectively
 either side of the join where there is not a match.
 
 ```sql
-select * from x full outer join x y ON x.column_1 = y.column_2;
+SELECT * FROM x FULL OUTER JOIN x y ON x.column_1 = y.column_2;
 +----------+----------+----------+----------+
 | column_1 | column_2 | column_1 | column_2 |
 +----------+----------+----------+----------+
@@ -156,11 +158,11 @@ select * from x full outer join x y ON x.column_1 = y.column_2;
 
 ### NATURAL JOIN
 
-A natural join defines an inner join based on common column names found between the input tables. When no common
-column names are found, it behaves like a cross join.
+A `NATURAL JOIN` defines an inner join based on common column names found between the input tables. When no common
+column names are found, it behaves like a `CROSS JOIN`.
 
 ```sql
-select * from x natural join x y;
+SELECT * FROM x NATURAL JOIN x y;
 +----------+----------+
 | column_1 | column_2 |
 +----------+----------+
@@ -170,11 +172,11 @@ select * from x natural join x y;
 
 ### CROSS JOIN
 
-A cross join produces a cartesian product that matches every row in the left side of the join with every row in the
+A `CROSS JOIN` produces a cartesian product that matches every row in the left side of the join with every row in the
 right side of the join.
 
 ```sql
-select * from x cross join x y;
+SELECT * FROM x CROSS JOIN x y;
 +----------+----------+----------+----------+
 | column_1 | column_2 | column_1 | column_2 |
 +----------+----------+----------+----------+
@@ -182,6 +184,60 @@ select * from x cross join x y;
 +----------+----------+----------+----------+
 ```
 
+### LEFT SEMI JOIN
+
+The `LEFT SEMI JOIN` returns all rows from the left table that have at least one matching row in the right table, and
+projects only the columns from the left table.
+
+```sql
+SELECT * FROM x LEFT SEMI JOIN x y ON x.column_1 = y.column_1;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
+| 1        | 2        |
++----------+----------+
+```
+
+### RIGHT SEMI JOIN
+
+The `RIGHT SEMI JOIN` returns all rows from the right table that have at least one matching row in the left table, and
+only projects the columns from the right table.
+
+```sql
+SELECT * FROM x RIGHT SEMI JOIN x y ON x.column_1 = y.column_1;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
+| 1        | 2        |
++----------+----------+
+```
+
+### LEFT ANTI JOIN
+
+The `LEFT ANTI JOIN` returns all rows from the left table that do not have any matching row in the right table, projecting
+only the left table’s columns.
+
+```sql
+SELECT * FROM x LEFT ANTI JOIN x y ON x.column_1 = y.column_1;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
++----------+----------+
+```
+
+### RIGHT ANTI JOIN
+
+The `RIGHT ANTI JOIN` returns all rows from the right table that do not have any matching row in the left table, projecting
+only the right table’s columns.
+
+```sql
+SELECT * FROM x RIGHT ANTI JOIN x y ON x.column_1 = y.column_1;
++----------+----------+
+| column_1 | column_2 |
++----------+----------+
++----------+----------+
+```
+
 ## GROUP BY clause
 
 Example:
@@ -207,6 +263,14 @@ Example:
 SELECT a, b, MAX(c) FROM table GROUP BY a, b HAVING MAX(c) > 10
 ```
 
+## QUALIFY clause
+
+Example:
+
+```sql
+SELECT ROW_NUMBER() OVER (PARTITION BY region) AS rk FROM table QUALIFY rk > 1;
+```
+
 ## UNION clause
 
 Example:
@@ -264,3 +328,215 @@ FROM table;
 SELECT * EXCLUDE(age, person)
 FROM table;
 ```
+
+## Pipe operators
+
+Some SQL dialects (e.g. BigQuery) support the pipe operator `|>`.
+The SQL dialect can be set like this:
+
+```sql
+set datafusion.sql_parser.dialect = 'BigQuery';
+```
+
+DataFusion currently supports the following pipe operators:
+
+- [WHERE](#pipe_where)
+- [ORDER BY](#pipe_order_by)
+- [LIMIT](#pipe_limit)
+- [SELECT](#pipe_select)
+- [EXTEND](#pipe_extend)
+- [AS](#pipe_as)
+- [UNION](#pipe_union)
+- [INTERSECT](#pipe_intersect)
+- [EXCEPT](#pipe_except)
+- [AGGREGATE](#pipe_aggregate)
+- [JOIN](#pipe_join)
+
+(pipe_where)=
+
+### WHERE
+
+```sql
+select * from range(0,10)
+|> where value < 2;
++-------+
+| value |
++-------+
+| 0     |
+| 1     |
++-------+
+```
+
+(pipe_order_by)=
+
+### ORDER BY
+
+```sql
+select * from range(0,3)
+|> order by value desc;
++-------+
+| value |
++-------+
+| 2     |
+| 1     |
+| 0     |
++-------+
+```
+
+(pipe_limit)=
+
+### LIMIT
+
+```sql
+select * from range(0,3)
+|> order by value desc
+|> limit 1;
++-------+
+| value |
++-------+
+| 2     |
++-------+
+```
+
+(pipe_select)=
+
+### SELECT
+
+```sql
+select * from range(0,3)
+|> select value + 10;
++---------------------------+
+| range().value + Int64(10) |
++---------------------------+
+| 10                        |
+| 11                        |
+| 12                        |
++---------------------------+
+```
+
+(pipe_extend)=
+
+### EXTEND
+
+```sql
+select * from range(0,3)
+|> extend -value AS minus_value;
++-------+-------------+
+| value | minus_value |
++-------+-------------+
+| 0     | 0           |
+| 1     | -1          |
+| 2     | -2          |
++-------+-------------+
+```
+
+(pipe_as)=
+
+### AS
+
+```sql
+select * from range(0,3)
+|> as my_range
+|> SELECT my_range.value;
++-------+
+| value |
++-------+
+| 0     |
+| 1     |
+| 2     |
++-------+
+```
+
+(pipe_union)=
+
+### UNION
+
+```sql
+select * from range(0,3)
+|> union all (
+  select * from range(3,6)
+);
++-------+
+| value |
++-------+
+| 0     |
+| 1     |
+| 2     |
+| 3     |
+| 4     |
+| 5     |
++-------+
+```
+
+(pipe_intersect)=
+
+### INTERSECT
+
+```sql
+select * from range(0,100)
+|> INTERSECT DISTINCT (
+  select 3
+);
++-------+
+| value |
++-------+
+| 3     |
++-------+
+```
+
+(pipe_except)=
+
+### EXCEPT
+
+```sql
+select * from range(0,10)
+|> EXCEPT DISTINCT (select * from range(5,10));
++-------+
+| value |
++-------+
+| 0     |
+| 1     |
+| 2     |
+| 3     |
+| 4     |
++-------+
+```
+
+(pipe_aggregate)=
+
+### AGGREGATE
+
+```sql
+select * from range(0,3)
+|> aggregate sum(value) AS total;
++-------+
+| total |
++-------+
+| 3     |
++-------+
+```
+
+(pipe_join)=
+
+### JOIN
+
+```sql
+(
+  SELECT 'apples' AS item, 2 AS sales
+  UNION ALL
+  SELECT 'bananas' AS item, 5 AS sales
+)
+|> AS produce_sales
+|> LEFT JOIN
+     (
+       SELECT 'apples' AS item, 123 AS id
+     ) AS produce_data
+   ON produce_sales.item = produce_data.item
+|> SELECT produce_sales.item, sales, id;
++--------+-------+------+
+| item   | sales | id   |
++--------+-------+------+
+| apples | 2     | 123  |
+| bananas| 5     | NULL |
++--------+-------+------+
+```
diff --git a/docs/source/user-guide/sql/special_functions.md b/docs/source/user-guide/sql/special_functions.md
index 7c9efbb66218f..4f2a39f642b06 100644
--- a/docs/source/user-guide/sql/special_functions.md
+++ b/docs/source/user-guide/sql/special_functions.md
@@ -69,6 +69,7 @@ Expands an array or map into rows.
 ### `unnest (struct)`
 
 Expand a struct fields into individual columns.
+Each field of the struct will be prefixed with `__unnest_placeholder` and could be accessed via `"__unnest_placeholder(<struct>).<field>"`.
 
 #### Arguments
 
@@ -91,10 +92,10 @@ Expand a struct fields into individual columns.
 +---------------------------+
 
 > select unnest(struct_column) from foov;
-+------------------------------------------+------------------------------------------+
-| unnest_placeholder(foov.struct_column).a | unnest_placeholder(foov.struct_column).b |
-+------------------------------------------+------------------------------------------+
-| 5                                        | a string                                 |
-| 6                                        | another string                           |
-+------------------------------------------+------------------------------------------+
++--------------------------------------------+--------------------------------------------+
+| __unnest_placeholder(foov.struct_column).a | __unnest_placeholder(foov.struct_column).b |
++--------------------------------------------+--------------------------------------------+
+| 5                                          | a string                                   |
+| 6                                          | another string                             |
++--------------------------------------------+--------------------------------------------+
 ```
diff --git a/docs/source/user-guide/sql/struct_coercion.md b/docs/source/user-guide/sql/struct_coercion.md
new file mode 100644
index 0000000000000..d2a32fcee2650
--- /dev/null
+++ b/docs/source/user-guide/sql/struct_coercion.md
@@ -0,0 +1,354 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Struct Type Coercion and Field Mapping
+
+DataFusion uses **name-based field mapping** when coercing struct types across different operations. This document explains how struct coercion works, when it applies, and how to handle NULL fields.
+
+## Overview: Name-Based vs Positional Mapping
+
+When combining structs from different sources (e.g., in UNION, array construction, or JOINs), DataFusion matches struct fields by **name** rather than by **position**. This provides more robust and predictable behavior compared to positional matching.
+
+### Example: Field Reordering is Handled Transparently
+
+```sql
+-- These two structs have the same fields in different order
+SELECT [{a: 1, b: 2}, {b: 3, a: 4}];
+
+-- Result: Field names matched, values unified
+-- [{"a": 1, "b": 2}, {"a": 4, "b": 3}]
+```
+
+## Coercion Paths Using Name-Based Matching
+
+The following query operations use name-based field mapping for struct coercion:
+
+### 1. Array Literal Construction
+
+When creating array literals with struct elements that have different field orders:
+
+```sql
+-- Structs with reordered fields in array literal
+SELECT [{x: 1, y: 2}, {y: 3, x: 4}];
+
+-- Unified type: List(Struct("x": Int32, "y": Int32))
+-- Values: [{"x": 1, "y": 2}, {"x": 4, "y": 3}]
+```
+
+**When it applies:**
+
+- Array literals with struct elements: `[{...}, {...}]`
+- Nested arrays with structs: `[[{x: 1}, {x: 2}]]`
+
+### 2. Array Construction from Columns
+
+When constructing arrays from table columns with different struct schemas:
+
+```sql
+CREATE TABLE t_left (s struct(x int, y int)) AS VALUES ({x: 1, y: 2});
+CREATE TABLE t_right (s struct(y int, x int)) AS VALUES ({y: 3, x: 4});
+
+-- Dynamically constructs unified array schema
+SELECT [t_left.s, t_right.s] FROM t_left JOIN t_right;
+
+-- Result: [{"x": 1, "y": 2}, {"x": 4, "y": 3}]
+```
+
+**When it applies:**
+
+- Array construction with column references: `[col1, col2]`
+- Array construction in joins with matching field names
+
+### 3. UNION Operations
+
+When combining query results with different struct field orders:
+
+```sql
+SELECT {a: 1, b: 2} as s
+UNION ALL
+SELECT {b: 3, a: 4} as s;
+
+-- Result: {"a": 1, "b": 2} and {"a": 4, "b": 3}
+```
+
+**When it applies:**
+
+- UNION ALL with structs: field names matched across branches
+- UNION (deduplicated) with structs
+
+### 4. Common Table Expressions (CTEs)
+
+When multiple CTEs produce structs with different field orders that are combined:
+
+```sql
+WITH
+  t1 AS (SELECT {a: 1, b: 2} as s),
+  t2 AS (SELECT {b: 3, a: 4} as s)
+SELECT s FROM t1
+UNION ALL
+SELECT s FROM t2;
+
+-- Result: Field names matched across CTEs
+```
+
+### 5. VALUES Clauses
+
+When creating tables or temporary results with struct values in different field orders:
+
+```sql
+CREATE TABLE t AS VALUES ({a: 1, b: 2}), ({b: 3, a: 4});
+
+-- Table schema unified: struct(a: int, b: int)
+-- Values: {a: 1, b: 2} and {a: 4, b: 3}
+```
+
+### 6. JOIN Operations
+
+When joining tables where the JOIN condition involves structs with different field orders:
+
+```sql
+CREATE TABLE orders (customer struct(name varchar, id int));
+CREATE TABLE customers (info struct(id int, name varchar));
+
+-- Join matches struct fields by name
+SELECT * FROM orders
+JOIN customers ON orders.customer = customers.info;
+```
+
+### 7. Aggregate Functions
+
+When collecting structs with different field orders using aggregate functions like `array_agg`:
+
+```sql
+SELECT array_agg(s) FROM (
+  SELECT {x: 1, y: 2} as s
+  UNION ALL
+  SELECT {y: 3, x: 4} as s
+) t
+GROUP BY category;
+
+-- Result: Array of structs with unified field order
+```
+
+### 8. Window Functions
+
+When using window functions with struct expressions having different field orders:
+
+```sql
+SELECT
+  id,
+  row_number() over (partition by s order by id) as rn
+FROM (
+  SELECT {category: 1, value: 10} as s, 1 as id
+  UNION ALL
+  SELECT {value: 20, category: 1} as s, 2 as id
+);
+
+-- Fields matched by name in PARTITION BY clause
+```
+
+## NULL Handling for Missing Fields
+
+When structs have different field sets, missing fields are filled with **NULL** values during coercion.
+
+### Example: Partial Field Overlap
+
+```sql
+-- Struct in first position has fields: a, b
+-- Struct in second position has fields: b, c
+-- Unified schema includes all fields: a, b, c
+
+SELECT [
+  CAST({a: 1, b: 2} AS STRUCT(a INT, b INT, c INT)),
+  CAST({b: 3, c: 4} AS STRUCT(a INT, b INT, c INT))
+];
+
+-- Result:
+-- [
+--   {"a": 1, "b": 2, "c": NULL},
+--   {"a": NULL, "b": 3, "c": 4}
+-- ]
+```
+
+### Limitations
+
+**Field count must match exactly.** If structs have different numbers of fields and their field names don't completely overlap, the query will fail:
+
+```sql
+-- This fails because field sets don't match:
+-- t_left has {x, y} but t_right has {x, y, z}
+SELECT [t_left.s, t_right.s] FROM t_left JOIN t_right;
+-- Error: Cannot coerce struct with mismatched field counts
+```
+
+**Workaround: Use explicit CAST**
+
+To handle partial field overlap, explicitly cast structs to a unified schema:
+
+```sql
+SELECT [
+  CAST(t_left.s AS STRUCT(x INT, y INT, z INT)),
+  CAST(t_right.s AS STRUCT(x INT, y INT, z INT))
+] FROM t_left JOIN t_right;
+```
+
+## Migration Guide: From Positional to Name-Based Matching
+
+If you have existing code that relied on **positional** struct field matching, you may need to update it.
+
+### Example: Query That Changes Behavior
+
+**Old behavior (positional):**
+
+```sql
+-- These would have been positionally mapped (left-to-right)
+SELECT [{x: 1, y: 2}, {y: 3, x: 4}];
+-- Old result (positional): [{"x": 1, "y": 2}, {"y": 3, "x": 4}]
+```
+
+**New behavior (name-based):**
+
+```sql
+-- Now uses name-based matching
+SELECT [{x: 1, y: 2}, {y: 3, x: 4}];
+-- New result (by name): [{"x": 1, "y": 2}, {"x": 4, "y": 3}]
+```
+
+### Migration Steps
+
+1. **Review struct operations** - Look for queries that combine structs from different sources
+2. **Check field names** - Verify that field names match as expected (not positions)
+3. **Test with new coercion** - Run queries and verify the results match your expectations
+4. **Handle field reordering** - If you need specific field orders, use explicit CAST operations
+
+### Using Explicit CAST for Compatibility
+
+If you need precise control over struct field order and types, use explicit `CAST`:
+
+```sql
+-- Guarantee specific field order and types
+SELECT CAST({b: 3, a: 4} AS STRUCT(a INT, b INT));
+-- Result: {"a": 4, "b": 3}
+```
+
+## Best Practices
+
+### 1. Be Explicit with Schema Definitions
+
+When joining or combining structs, define target schemas explicitly:
+
+```sql
+-- Good: explicit schema definition
+SELECT CAST(data AS STRUCT(id INT, name VARCHAR, active BOOLEAN))
+FROM external_source;
+```
+
+### 2. Use Named Struct Constructors
+
+Prefer named struct constructors for clarity:
+
+```sql
+-- Good: field names are explicit
+SELECT named_struct('id', 1, 'name', 'Alice', 'active', true);
+
+-- Or using struct literal syntax
+SELECT {id: 1, name: 'Alice', active: true};
+```
+
+### 3. Test Field Mappings
+
+Always verify that field mappings work as expected:
+
+```sql
+-- Use arrow_typeof to verify unified schema
+SELECT arrow_typeof([{x: 1, y: 2}, {y: 3, x: 4}]);
+-- Result: List(Struct("x": Int32, "y": Int32))
+```
+
+### 4. Handle Partial Field Overlap Explicitly
+
+When combining structs with partial field overlap, use explicit CAST:
+
+```sql
+-- Instead of relying on implicit coercion
+SELECT [
+  CAST(left_struct AS STRUCT(x INT, y INT, z INT)),
+  CAST(right_struct AS STRUCT(x INT, y INT, z INT))
+];
+```
+
+### 5. Document Struct Schemas
+
+In complex queries, document the expected struct schemas:
+
+```sql
+-- Expected schema: {customer_id: INT, name: VARCHAR, age: INT}
+SELECT {
+  customer_id: c.id,
+  name: c.name,
+  age: c.age
+} as customer_info
+FROM customers c;
+```
+
+## Error Messages and Troubleshooting
+
+### "Cannot coerce struct with different field counts"
+
+**Cause:** Trying to combine structs with different numbers of fields.
+
+**Solution:**
+
+```sql
+-- Use explicit CAST to handle missing fields
+SELECT [
+  CAST(struct1 AS STRUCT(a INT, b INT, c INT)),
+  CAST(struct2 AS STRUCT(a INT, b INT, c INT))
+];
+```
+
+### "Field X not found in struct"
+
+**Cause:** Referencing a field name that doesn't exist in the struct.
+
+**Solution:**
+
+```sql
+-- Verify field names match exactly (case-sensitive)
+SELECT s['field_name'] FROM my_table;  -- Use bracket notation for access
+-- Or use get_field function
+SELECT get_field(s, 'field_name') FROM my_table;
+```
+
+### Unexpected NULL values after coercion
+
+**Cause:** Struct coercion added NULL for missing fields.
+
+**Solution:** Check that all structs have the required fields, or explicitly handle NULLs:
+
+```sql
+SELECT COALESCE(s['field'], default_value) FROM my_table;
+```
+
+## Related Functions
+
+- `arrow_typeof()` - Returns the Arrow type of an expression
+- `struct()` / `named_struct()` - Creates struct values
+- `get_field()` - Extracts field values from structs
+- `CAST()` - Explicitly casts structs to specific schemas
diff --git a/docs/source/user-guide/sql/subqueries.md b/docs/source/user-guide/sql/subqueries.md
index ee75a6a1575c5..692d1c4020d74 100644
--- a/docs/source/user-guide/sql/subqueries.md
+++ b/docs/source/user-guide/sql/subqueries.md
@@ -183,7 +183,7 @@ FROM
 and return _true_ or _false_.
 Rows that evaluate to _false_ or NULL are filtered from results.
 The `WHERE` clause supports correlated and non-correlated subqueries
-as well as scalar and non-scalar subqueries (depending on the the operator used
+as well as scalar and non-scalar subqueries (depending on the operator used
 in the predicate expression).
 
 ```sql
@@ -293,7 +293,7 @@ returned by aggregate functions in the `SELECT` clause to the result of the
 subquery and return _true_ or _false_.
 Rows that evaluate to _false_ are filtered from results.
 The `HAVING` clause supports correlated and non-correlated subqueries
-as well as scalar and non-scalar subqueries (depending on the the operator used
+as well as scalar and non-scalar subqueries (depending on the operator used
 in the predicate expression).
 
 ```sql
diff --git a/docs/source/user-guide/sql/window_functions.md b/docs/source/user-guide/sql/window_functions.md
index bcb33bad7fb50..2c8050ce1f9ca 100644
--- a/docs/source/user-guide/sql/window_functions.md
+++ b/docs/source/user-guide/sql/window_functions.md
@@ -145,6 +145,17 @@ where **offset** is an non-negative integer.
 
 RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column).
 
+## Filter clause for aggregate window functions
+
+Aggregate window functions support the SQL `FILTER (WHERE ...)` clause to include only rows that satisfy the predicate from the window frame in the aggregation.
+
+```sql
+sum(salary) FILTER (WHERE salary > 0)
+  OVER (PARTITION BY depname ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+```
+
+If no rows in the frame satisfy the filter for a given output row, `COUNT` yields `0` while `SUM`/`AVG`/`MIN`/`MAX` yield `NULL`.
+
 ## Aggregate functions
 
 All [aggregate functions](aggregate_functions.md) can be used as window functions.
@@ -169,13 +180,11 @@ cume_dist()
 #### Example
 
 ```sql
-    --Example usage of the cume_dist window function:
-    SELECT salary,
-       cume_dist() OVER (ORDER BY salary) AS cume_dist
-    FROM employees;
-```
+-- Example usage of the cume_dist window function:
+SELECT salary,
+    cume_dist() OVER (ORDER BY salary) AS cume_dist
+FROM employees;
 
-```sql
 +--------+-----------+
 | salary | cume_dist |
 +--------+-----------+
@@ -196,14 +205,12 @@ dense_rank()
 #### Example
 
 ```sql
-    --Example usage of the dense_rank window function:
-    SELECT department,
-           salary,
-           dense_rank() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
-    FROM employees;
-```
+-- Example usage of the dense_rank window function:
+SELECT department,
+    salary,
+    dense_rank() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
+FROM employees;
 
-```sql
 +-------------+--------+------------+
 | department  | salary | dense_rank |
 +-------------+--------+------------+
@@ -231,14 +238,12 @@ ntile(expression)
 #### Example
 
 ```sql
-    --Example usage of the ntile window function:
-    SELECT employee_id,
-           salary,
-           ntile(4) OVER (ORDER BY salary DESC) AS quartile
-    FROM employees;
-```
+-- Example usage of the ntile window function:
+SELECT employee_id,
+    salary,
+    ntile(4) OVER (ORDER BY salary DESC) AS quartile
+FROM employees;
 
-```sql
 +-------------+--------+----------+
 | employee_id | salary | quartile |
 +-------------+--------+----------+
@@ -264,14 +269,12 @@ percent_rank()
 #### Example
 
 ```sql
-    --Example usage of the percent_rank window function:
-    SELECT employee_id,
-           salary,
-           percent_rank() OVER (ORDER BY salary) AS percent_rank
-    FROM employees;
-```
+    -- Example usage of the percent_rank window function:
+SELECT employee_id,
+    salary,
+    percent_rank() OVER (ORDER BY salary) AS percent_rank
+FROM employees;
 
-```sql
 +-------------+--------+---------------+
 | employee_id | salary | percent_rank  |
 +-------------+--------+---------------+
@@ -292,14 +295,12 @@ rank()
 #### Example
 
 ```sql
-    --Example usage of the rank window function:
-    SELECT department,
-           salary,
-           rank() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
-    FROM employees;
-```
+-- Example usage of the rank window function:
+SELECT department,
+    salary,
+    rank() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
+FROM employees;
 
-```sql
 +-------------+--------+------+
 | department  | salary | rank |
 +-------------+--------+------+
@@ -323,14 +324,12 @@ row_number()
 #### Example
 
 ```sql
-    --Example usage of the row_number window function:
-    SELECT department,
-           salary,
-           row_number() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
-    FROM employees;
-```
+-- Example usage of the row_number window function:
+SELECT department,
+  salary,
+  row_number() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
+FROM employees;
 
-````sql
 +-------------+--------+---------+
 | department  | salary | row_num |
 +-------------+--------+---------+
@@ -341,8 +340,7 @@ row_number()
 | Engineering | 90000  | 1       |
 | Engineering | 80000  | 2       |
 +-------------+--------+---------+
-```#
-
+```
 
 ## Analytical Functions
 
@@ -358,7 +356,7 @@ Returns value evaluated at the row that is the first row of the window frame.
 
 ```sql
 first_value(expression)
-````
+```
 
 #### Arguments
 
@@ -367,15 +365,13 @@ first_value(expression)
 #### Example
 
 ```sql
-    --Example usage of the first_value window function:
-    SELECT department,
-           employee_id,
-           salary,
-           first_value(salary) OVER (PARTITION BY department ORDER BY salary DESC) AS top_salary
-    FROM employees;
-```
+-- Example usage of the first_value window function:
+SELECT department,
+  employee_id,
+  salary,
+  first_value(salary) OVER (PARTITION BY department ORDER BY salary DESC) AS top_salary
+FROM employees;
 
-```sql
 +-------------+-------------+--------+------------+
 | department  | employee_id | salary | top_salary |
 +-------------+-------------+--------+------------+
@@ -404,14 +400,12 @@ lag(expression, offset, default)
 #### Example
 
 ```sql
-    --Example usage of the lag window function:
-    SELECT employee_id,
-           salary,
-           lag(salary, 1, 0) OVER (ORDER BY employee_id) AS prev_salary
-    FROM employees;
-```
+-- Example usage of the lag window function:
+SELECT employee_id,
+    salary,
+    lag(salary, 1, 0) OVER (ORDER BY employee_id) AS prev_salary
+FROM employees;
 
-```sql
 +-------------+--------+-------------+
 | employee_id | salary | prev_salary |
 +-------------+--------+-------------+
@@ -443,9 +437,7 @@ SELECT department,
        salary,
        last_value(salary) OVER (PARTITION BY department ORDER BY salary) AS running_last_salary
 FROM employees;
-```
 
-```sql
 +-------------+-------------+--------+---------------------+
 | department  | employee_id | salary | running_last_salary |
 +-------------+-------------+--------+---------------------+
@@ -474,16 +466,14 @@ lead(expression, offset, default)
 #### Example
 
 ```sql
--- Example usage of lead() :
+-- Example usage of lead window function:
 SELECT
     employee_id,
     department,
     salary,
     lead(salary, 1, 0) OVER (PARTITION BY department ORDER BY salary) AS next_salary
 FROM employees;
-```
 
-```sql
 +-------------+-------------+--------+--------------+
 | employee_id | department  | salary | next_salary  |
 +-------------+-------------+--------+--------------+
@@ -526,9 +516,7 @@ SELECT nth_value(salary, 2) OVER (
   ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
 ) AS nth_value
 FROM employees;
-```
 
-```text
 +-----------+
 | nth_value |
 +-----------+
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000..6fc7705d8536f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.uv.workspace]
+members = ["benchmarks", "dev", "docs"]
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index c52dd7322d9a3..c7d61a9e24f7f 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -19,5 +19,5 @@
 # to compile this workspace and run CI jobs.
 
 [toolchain]
-channel = "1.87.0"
+channel = "1.94.0"
 components = ["rustfmt", "clippy"]
diff --git a/rustfmt.toml b/rustfmt.toml
index 4522e520a469b..c680d9d068d5c 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-edition = "2021"
+edition = "2024"
 max_width = 90
 
 # ignore generated files
diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml
index 811102cf6dbdb..cdaee6f442bf2 100644
--- a/test-utils/Cargo.toml
+++ b/test-utils/Cargo.toml
@@ -22,12 +22,15 @@ edition = { workspace = true }
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+# Note: add additional linter rules in lib.rs.
+# Rust does not support workspace + new linter rules in subcrates yet
+# https://github.com/rust-lang/cargo/issues/13157
 [lints]
 workspace = true
 
 [dependencies]
 arrow = { workspace = true }
-chrono-tz = { version = "0.10.3", default-features = false }
+chrono-tz = { version = "0.10.4", default-features = false }
 datafusion-common = { workspace = true, default-features = true }
 env_logger = { workspace = true }
 rand = { workspace = true }
diff --git a/test-utils/src/array_gen/binary.rs b/test-utils/src/array_gen/binary.rs
index 9740eeae5e7fe..ab0530a9ab4e4 100644
--- a/test-utils/src/array_gen/binary.rs
+++ b/test-utils/src/array_gen/binary.rs
@@ -19,8 +19,8 @@ use arrow::array::{
     ArrayRef, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, UInt32Array,
 };
 use arrow::compute;
-use rand::rngs::StdRng;
 use rand::Rng;
+use rand::rngs::StdRng;
 
 /// Randomly generate binary arrays
 pub struct BinaryArrayGenerator {
diff --git a/test-utils/src/array_gen/boolean.rs b/test-utils/src/array_gen/boolean.rs
index 004d615b4caa4..c9104c0b8e788 100644
--- a/test-utils/src/array_gen/boolean.rs
+++ b/test-utils/src/array_gen/boolean.rs
@@ -17,8 +17,8 @@
 
 use arrow::array::{ArrayRef, BooleanArray, BooleanBuilder, UInt32Array};
 use arrow::compute::take;
-use rand::rngs::StdRng;
 use rand::Rng;
+use rand::rngs::StdRng;
 
 /// Randomly generate boolean arrays
 pub struct BooleanArrayGenerator {
diff --git a/test-utils/src/array_gen/decimal.rs b/test-utils/src/array_gen/decimal.rs
index c5ec8ac5e8938..54fa2269d6e4c 100644
--- a/test-utils/src/array_gen/decimal.rs
+++ b/test-utils/src/array_gen/decimal.rs
@@ -17,8 +17,8 @@
 
 use arrow::array::{ArrayRef, PrimitiveArray, PrimitiveBuilder, UInt32Array};
 use arrow::datatypes::DecimalType;
-use rand::rngs::StdRng;
 use rand::Rng;
+use rand::rngs::StdRng;
 
 use super::random_data::RandomNativeData;
 
diff --git a/test-utils/src/array_gen/primitive.rs b/test-utils/src/array_gen/primitive.rs
index 62a38a1b4ce1d..5944879600cb0 100644
--- a/test-utils/src/array_gen/primitive.rs
+++ b/test-utils/src/array_gen/primitive.rs
@@ -17,9 +17,9 @@
 
 use arrow::array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray, UInt32Array};
 use arrow::datatypes::DataType;
-use chrono_tz::{Tz, TZ_VARIANTS};
+use chrono_tz::{TZ_VARIANTS, Tz};
 use rand::prelude::IndexedRandom;
-use rand::{rng, rngs::StdRng, Rng};
+use rand::{Rng, rng, rngs::StdRng};
 use std::sync::Arc;
 
 use super::random_data::RandomNativeData;
diff --git a/test-utils/src/array_gen/random_data.rs b/test-utils/src/array_gen/random_data.rs
index 78518b7bf9dc1..f341d23417439 100644
--- a/test-utils/src/array_gen/random_data.rs
+++ b/test-utils/src/array_gen/random_data.rs
@@ -17,19 +17,19 @@
 
 use arrow::array::ArrowPrimitiveType;
 use arrow::datatypes::{
-    i256, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
+    Date32Type, Date64Type, Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type,
     DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
-    DurationSecondType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
-    Int8Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano,
+    DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type,
+    Int64Type, IntervalDayTime, IntervalDayTimeType, IntervalMonthDayNano,
     IntervalMonthDayNanoType, IntervalYearMonthType, Time32MillisecondType,
     Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
     TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
-    TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+    TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, i256,
 };
+use rand::Rng;
 use rand::distr::StandardUniform;
 use rand::prelude::Distribution;
 use rand::rngs::StdRng;
-use rand::Rng;
 
 /// Generate corresponding NativeType value randomly according to
 /// ArrowPrimitiveType.
@@ -67,6 +67,8 @@ basic_random_data!(Time32MillisecondType);
 basic_random_data!(Time64MicrosecondType);
 basic_random_data!(Time64NanosecondType);
 basic_random_data!(IntervalYearMonthType);
+basic_random_data!(Decimal32Type);
+basic_random_data!(Decimal64Type);
 basic_random_data!(Decimal128Type);
 basic_random_data!(TimestampSecondType);
 basic_random_data!(TimestampMillisecondType);
diff --git a/test-utils/src/array_gen/string.rs b/test-utils/src/array_gen/string.rs
index 546485fd8dc16..896182290ccca 100644
--- a/test-utils/src/array_gen/string.rs
+++ b/test-utils/src/array_gen/string.rs
@@ -18,9 +18,9 @@
 use arrow::array::{
     ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray, UInt32Array,
 };
+use rand::Rng;
 use rand::distr::StandardUniform;
 use rand::rngs::StdRng;
-use rand::Rng;
 
 /// Randomly generate string arrays
 pub struct StringArrayGenerator {
diff --git a/test-utils/src/data_gen.rs b/test-utils/src/data_gen.rs
index 2228010b28dd1..bb8fdad5a0f89 100644
--- a/test-utils/src/data_gen.rs
+++ b/test-utils/src/data_gen.rs
@@ -129,7 +129,7 @@ impl BatchBuilder {
         }
     }
 
-    #[allow(clippy::too_many_arguments)]
+    #[expect(clippy::too_many_arguments)]
     fn append_row(
         &mut self,
         rng: &mut StdRng,
diff --git a/test-utils/src/string_gen.rs b/test-utils/src/string_gen.rs
index 75ed03898a279..21eecc05b8ce9 100644
--- a/test-utils/src/string_gen.rs
+++ b/test-utils/src/string_gen.rs
@@ -19,7 +19,7 @@ use crate::array_gen::StringArrayGenerator;
 use crate::stagger_batch;
 use arrow::record_batch::RecordBatch;
 use rand::rngs::StdRng;
-use rand::{rng, Rng, SeedableRng};
+use rand::{Rng, SeedableRng, rng};
 
 /// Randomly generate strings
 pub struct StringBatchGenerator(StringArrayGenerator);
diff --git a/test-utils/src/tpcds.rs b/test-utils/src/tpcds.rs
index ce5bac5bfd83d..a12ae8ceaef9c 100644
--- a/test-utils/src/tpcds.rs
+++ b/test-utils/src/tpcds.rs
@@ -299,7 +299,7 @@ pub fn tpcds_schemas() -> Vec<TableDef> {
                 Field::new("c_birth_country", DataType::Utf8, false),
                 Field::new("c_login", DataType::Utf8, false),
                 Field::new("c_email_address", DataType::Utf8, false),
-                Field::new("c_last_review_date_sk", DataType::Utf8, false),
+                Field::new("c_last_review_date", DataType::Utf8, false),
             ]),
         ),
         TableDef::new(
diff --git a/testing b/testing
index d2a1371230349..0d60ccae40d0e 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit d2a13712303498963395318a4eb42872e66aead7
+Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6
diff --git a/typos.toml b/typos.toml
new file mode 100644
index 0000000000000..196766f12fbc0
--- /dev/null
+++ b/typos.toml
@@ -0,0 +1,54 @@
+[default.extend-words]
+# random words from unit tests
+Pn = "Pn"
+fo = "fo"
+nd = "nd"
+Nd = "Nd"
+ba = "ba"
+ECT = "ECT"
+Ue = "Ue"
+Iy = "Iy"
+hte = "hte"
+numer = "numer"
+abd = "abd"
+aroun = "aroun"
+abov = "abov"
+Ois = "Ois"
+alo = "alo"
+
+# abbreviations, common words, etc.
+typ = "typ"
+datas = "datas"
+YOUY = "YOUY"
+lits = "lits"
+
+# exposed to public API
+Serializeable = "Serializeable"
+
+# from test cases like TPC-* or ClickBench
+carefull = "carefull"
+precentage = "precentage"
+flate = "flate"
+hom = "hom"
+alph = "alph"
+wih = "wih"
+Ded = "Ded"
+
+# From SLT README
+nteger = "nteger"
+
+[files]
+extend-exclude = [
+    "*.slt",
+    "*.slt.part",
+    "*.svg",
+    "*.sql",
+    "dev/changelog/**",
+    "benchmarks/**",
+    "*.csv",
+    "docs/source/contributor-guide/governance.md",
+    # submodules
+    "parquet-testing/**",
+    "datafusion-testing/**",
+    "testing/**",
+]
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000000000..1354870f4a5ee
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,1149 @@
+version = 1
+revision = 3
+requires-python = ">=3.11"
+resolution-markers = [
+    "python_full_version >= '3.12'",
+    "python_full_version < '3.12'",
+]
+
+[manifest]
+members = [
+    "datafusion-benchmarks",
+    "datafusion-dev",
+    "datafusion-docs",
+]
+
+[[package]]
+name = "accessible-pygments"
+version = "0.0.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bc/c1/bbac6a50d02774f91572938964c582fff4270eee73ab822a4aeea4d8b11b/accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872", size = 1377899, upload-time = "2024-05-10T11:23:10.216Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/3f/95338030883d8c8b91223b4e21744b04d11b161a3ef117295d8241f50ab4/accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7", size = 1395903, upload-time = "2024-05-10T11:23:08.421Z" },
+]
+
+[[package]]
+name = "alabaster"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" },
+]
+
+[[package]]
+name = "annotated-doc"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" },
+]
+
+[[package]]
+name = "babel"
+version = "2.18.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" },
+]
+
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "soupsieve" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
+]
+
+[[package]]
+name = "certifi"
+version = "2026.1.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" },
+]
+
+[[package]]
+name = "cffi"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pycparser", marker = "implementation_name != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" },
+    { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" },
+    { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" },
+    { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
+    { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
+    { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
+    { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
+    { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
+    { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
+    { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
+    { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
+    { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
+    { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
+    { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
+    { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
+    { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
+    { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.4.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ed/27/c6491ff4954e58a10f69ad90aca8a1b6fe9c5d3c6f380907af3c37435b59/charset_normalizer-3.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8", size = 206988, upload-time = "2025-10-14T04:40:33.79Z" },
+    { url = "https://files.pythonhosted.org/packages/94/59/2e87300fe67ab820b5428580a53cad894272dbb97f38a7a814a2a1ac1011/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0", size = 147324, upload-time = "2025-10-14T04:40:34.961Z" },
+    { url = "https://files.pythonhosted.org/packages/07/fb/0cf61dc84b2b088391830f6274cb57c82e4da8bbc2efeac8c025edb88772/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3", size = 142742, upload-time = "2025-10-14T04:40:36.105Z" },
+    { url = "https://files.pythonhosted.org/packages/62/8b/171935adf2312cd745d290ed93cf16cf0dfe320863ab7cbeeae1dcd6535f/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc", size = 160863, upload-time = "2025-10-14T04:40:37.188Z" },
+    { url = "https://files.pythonhosted.org/packages/09/73/ad875b192bda14f2173bfc1bc9a55e009808484a4b256748d931b6948442/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897", size = 157837, upload-time = "2025-10-14T04:40:38.435Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/fc/de9cce525b2c5b94b47c70a4b4fb19f871b24995c728e957ee68ab1671ea/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381", size = 151550, upload-time = "2025-10-14T04:40:40.053Z" },
+    { url = "https://files.pythonhosted.org/packages/55/c2/43edd615fdfba8c6f2dfbd459b25a6b3b551f24ea21981e23fb768503ce1/charset_normalizer-3.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815", size = 149162, upload-time = "2025-10-14T04:40:41.163Z" },
+    { url = "https://files.pythonhosted.org/packages/03/86/bde4ad8b4d0e9429a4e82c1e8f5c659993a9a863ad62c7df05cf7b678d75/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0", size = 150019, upload-time = "2025-10-14T04:40:42.276Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/86/a151eb2af293a7e7bac3a739b81072585ce36ccfb4493039f49f1d3cae8c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161", size = 143310, upload-time = "2025-10-14T04:40:43.439Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/fe/43dae6144a7e07b87478fdfc4dbe9efd5defb0e7ec29f5f58a55aeef7bf7/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4", size = 162022, upload-time = "2025-10-14T04:40:44.547Z" },
+    { url = "https://files.pythonhosted.org/packages/80/e6/7aab83774f5d2bca81f42ac58d04caf44f0cc2b65fc6db2b3b2e8a05f3b3/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89", size = 149383, upload-time = "2025-10-14T04:40:46.018Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/e8/b289173b4edae05c0dde07f69f8db476a0b511eac556dfe0d6bda3c43384/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569", size = 159098, upload-time = "2025-10-14T04:40:47.081Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/df/fe699727754cae3f8478493c7f45f777b17c3ef0600e28abfec8619eb49c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224", size = 152991, upload-time = "2025-10-14T04:40:48.246Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/86/584869fe4ddb6ffa3bd9f491b87a01568797fb9bd8933f557dba9771beaf/charset_normalizer-3.4.4-cp311-cp311-win32.whl", hash = "sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a", size = 99456, upload-time = "2025-10-14T04:40:49.376Z" },
+    { url = "https://files.pythonhosted.org/packages/65/f6/62fdd5feb60530f50f7e38b4f6a1d5203f4d16ff4f9f0952962c044e919a/charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016", size = 106978, upload-time = "2025-10-14T04:40:50.844Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/9d/0710916e6c82948b3be62d9d398cb4fcf4e97b56d6a6aeccd66c4b2f2bd5/charset_normalizer-3.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1", size = 99969, upload-time = "2025-10-14T04:40:52.272Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/85/1637cd4af66fa687396e757dec650f28025f2a2f5a5531a3208dc0ec43f2/charset_normalizer-3.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394", size = 208425, upload-time = "2025-10-14T04:40:53.353Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/6a/04130023fef2a0d9c62d0bae2649b69f7b7d8d24ea5536feef50551029df/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25", size = 148162, upload-time = "2025-10-14T04:40:54.558Z" },
+    { url = "https://files.pythonhosted.org/packages/78/29/62328d79aa60da22c9e0b9a66539feae06ca0f5a4171ac4f7dc285b83688/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef", size = 144558, upload-time = "2025-10-14T04:40:55.677Z" },
+    { url = "https://files.pythonhosted.org/packages/86/bb/b32194a4bf15b88403537c2e120b817c61cd4ecffa9b6876e941c3ee38fe/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d", size = 161497, upload-time = "2025-10-14T04:40:57.217Z" },
+    { url = "https://files.pythonhosted.org/packages/19/89/a54c82b253d5b9b111dc74aca196ba5ccfcca8242d0fb64146d4d3183ff1/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8", size = 159240, upload-time = "2025-10-14T04:40:58.358Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/10/d20b513afe03acc89ec33948320a5544d31f21b05368436d580dec4e234d/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86", size = 153471, upload-time = "2025-10-14T04:40:59.468Z" },
+    { url = "https://files.pythonhosted.org/packages/61/fa/fbf177b55bdd727010f9c0a3c49eefa1d10f960e5f09d1d887bf93c2e698/charset_normalizer-3.4.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a", size = 150864, upload-time = "2025-10-14T04:41:00.623Z" },
+    { url = "https://files.pythonhosted.org/packages/05/12/9fbc6a4d39c0198adeebbde20b619790e9236557ca59fc40e0e3cebe6f40/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f", size = 150647, upload-time = "2025-10-14T04:41:01.754Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/1f/6a9a593d52e3e8c5d2b167daf8c6b968808efb57ef4c210acb907c365bc4/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc", size = 145110, upload-time = "2025-10-14T04:41:03.231Z" },
+    { url = "https://files.pythonhosted.org/packages/30/42/9a52c609e72471b0fc54386dc63c3781a387bb4fe61c20231a4ebcd58bdd/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf", size = 162839, upload-time = "2025-10-14T04:41:04.715Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/5b/c0682bbf9f11597073052628ddd38344a3d673fda35a36773f7d19344b23/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15", size = 150667, upload-time = "2025-10-14T04:41:05.827Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/24/a41afeab6f990cf2daf6cb8c67419b63b48cf518e4f56022230840c9bfb2/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9", size = 160535, upload-time = "2025-10-14T04:41:06.938Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/e5/6a4ce77ed243c4a50a1fecca6aaaab419628c818a49434be428fe24c9957/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0", size = 154816, upload-time = "2025-10-14T04:41:08.101Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/ef/89297262b8092b312d29cdb2517cb1237e51db8ecef2e9af5edbe7b683b1/charset_normalizer-3.4.4-cp312-cp312-win32.whl", hash = "sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26", size = 99694, upload-time = "2025-10-14T04:41:09.23Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/2d/1e5ed9dd3b3803994c155cd9aacb60c82c331bad84daf75bcb9c91b3295e/charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525", size = 107131, upload-time = "2025-10-14T04:41:10.467Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/d9/0ed4c7098a861482a7b6a95603edce4c0d9db2311af23da1fb2b75ec26fc/charset_normalizer-3.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3", size = 100390, upload-time = "2025-10-14T04:41:11.915Z" },
+    { url = "https://files.pythonhosted.org/packages/97/45/4b3a1239bbacd321068ea6e7ac28875b03ab8bc0aa0966452db17cd36714/charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794", size = 208091, upload-time = "2025-10-14T04:41:13.346Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/62/73a6d7450829655a35bb88a88fca7d736f9882a27eacdca2c6d505b57e2e/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed", size = 147936, upload-time = "2025-10-14T04:41:14.461Z" },
+    { url = "https://files.pythonhosted.org/packages/89/c5/adb8c8b3d6625bef6d88b251bbb0d95f8205831b987631ab0c8bb5d937c2/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72", size = 144180, upload-time = "2025-10-14T04:41:15.588Z" },
+    { url = "https://files.pythonhosted.org/packages/91/ed/9706e4070682d1cc219050b6048bfd293ccf67b3d4f5a4f39207453d4b99/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328", size = 161346, upload-time = "2025-10-14T04:41:16.738Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/0d/031f0d95e4972901a2f6f09ef055751805ff541511dc1252ba3ca1f80cf5/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede", size = 158874, upload-time = "2025-10-14T04:41:17.923Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894", size = 153076, upload-time = "2025-10-14T04:41:19.106Z" },
+    { url = "https://files.pythonhosted.org/packages/75/1e/5ff781ddf5260e387d6419959ee89ef13878229732732ee73cdae01800f2/charset_normalizer-3.4.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1", size = 150601, upload-time = "2025-10-14T04:41:20.245Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/57/71be810965493d3510a6ca79b90c19e48696fb1ff964da319334b12677f0/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490", size = 150376, upload-time = "2025-10-14T04:41:21.398Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/d5/c3d057a78c181d007014feb7e9f2e65905a6c4ef182c0ddf0de2924edd65/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44", size = 144825, upload-time = "2025-10-14T04:41:22.583Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/8c/d0406294828d4976f275ffbe66f00266c4b3136b7506941d87c00cab5272/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133", size = 162583, upload-time = "2025-10-14T04:41:23.754Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/24/e2aa1f18c8f15c4c0e932d9287b8609dd30ad56dbe41d926bd846e22fb8d/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3", size = 150366, upload-time = "2025-10-14T04:41:25.27Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/5b/1e6160c7739aad1e2df054300cc618b06bf784a7a164b0f238360721ab86/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e", size = 160300, upload-time = "2025-10-14T04:41:26.725Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/10/f882167cd207fbdd743e55534d5d9620e095089d176d55cb22d5322f2afd/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc", size = 154465, upload-time = "2025-10-14T04:41:28.322Z" },
+    { url = "https://files.pythonhosted.org/packages/89/66/c7a9e1b7429be72123441bfdbaf2bc13faab3f90b933f664db506dea5915/charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac", size = 99404, upload-time = "2025-10-14T04:41:29.95Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/26/b9924fa27db384bdcd97ab83b4f0a8058d96ad9626ead570674d5e737d90/charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14", size = 107092, upload-time = "2025-10-14T04:41:31.188Z" },
+    { url = "https://files.pythonhosted.org/packages/af/8f/3ed4bfa0c0c72a7ca17f0380cd9e4dd842b09f664e780c13cff1dcf2ef1b/charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2", size = 100408, upload-time = "2025-10-14T04:41:32.624Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/35/7051599bd493e62411d6ede36fd5af83a38f37c4767b92884df7301db25d/charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd", size = 207746, upload-time = "2025-10-14T04:41:33.773Z" },
+    { url = "https://files.pythonhosted.org/packages/10/9a/97c8d48ef10d6cd4fcead2415523221624bf58bcf68a802721a6bc807c8f/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb", size = 147889, upload-time = "2025-10-14T04:41:34.897Z" },
+    { url = "https://files.pythonhosted.org/packages/10/bf/979224a919a1b606c82bd2c5fa49b5c6d5727aa47b4312bb27b1734f53cd/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e", size = 143641, upload-time = "2025-10-14T04:41:36.116Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/33/0ad65587441fc730dc7bd90e9716b30b4702dc7b617e6ba4997dc8651495/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14", size = 160779, upload-time = "2025-10-14T04:41:37.229Z" },
+    { url = "https://files.pythonhosted.org/packages/67/ed/331d6b249259ee71ddea93f6f2f0a56cfebd46938bde6fcc6f7b9a3d0e09/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191", size = 159035, upload-time = "2025-10-14T04:41:38.368Z" },
+    { url = "https://files.pythonhosted.org/packages/67/ff/f6b948ca32e4f2a4576aa129d8bed61f2e0543bf9f5f2b7fc3758ed005c9/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838", size = 152542, upload-time = "2025-10-14T04:41:39.862Z" },
+    { url = "https://files.pythonhosted.org/packages/16/85/276033dcbcc369eb176594de22728541a925b2632f9716428c851b149e83/charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6", size = 149524, upload-time = "2025-10-14T04:41:41.319Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/f2/6a2a1f722b6aba37050e626530a46a68f74e63683947a8acff92569f979a/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e", size = 150395, upload-time = "2025-10-14T04:41:42.539Z" },
+    { url = "https://files.pythonhosted.org/packages/60/bb/2186cb2f2bbaea6338cad15ce23a67f9b0672929744381e28b0592676824/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c", size = 143680, upload-time = "2025-10-14T04:41:43.661Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/a5/bf6f13b772fbb2a90360eb620d52ed8f796f3c5caee8398c3b2eb7b1c60d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090", size = 162045, upload-time = "2025-10-14T04:41:44.821Z" },
+    { url = "https://files.pythonhosted.org/packages/df/c5/d1be898bf0dc3ef9030c3825e5d3b83f2c528d207d246cbabe245966808d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152", size = 149687, upload-time = "2025-10-14T04:41:46.442Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/42/90c1f7b9341eef50c8a1cb3f098ac43b0508413f33affd762855f67a410e/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828", size = 160014, upload-time = "2025-10-14T04:41:47.631Z" },
+    { url = "https://files.pythonhosted.org/packages/76/be/4d3ee471e8145d12795ab655ece37baed0929462a86e72372fd25859047c/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec", size = 154044, upload-time = "2025-10-14T04:41:48.81Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/6f/8f7af07237c34a1defe7defc565a9bc1807762f672c0fde711a4b22bf9c0/charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9", size = 99940, upload-time = "2025-10-14T04:41:49.946Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/51/8ade005e5ca5b0d80fb4aff72a3775b325bdc3d27408c8113811a7cbe640/charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c", size = 107104, upload-time = "2025-10-14T04:41:51.051Z" },
+    { url = "https://files.pythonhosted.org/packages/da/5f/6b8f83a55bb8278772c5ae54a577f3099025f9ade59d0136ac24a0df4bde/charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2", size = 100743, upload-time = "2025-10-14T04:41:52.122Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
+]
+
+[[package]]
+name = "click"
+version = "8.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "cryptography"
+version = "46.0.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/60/04/ee2a9e8542e4fa2773b81771ff8349ff19cdd56b7258a0cc442639052edb/cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d", size = 750064, upload-time = "2026-02-10T19:18:38.255Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f7/81/b0bb27f2ba931a65409c6b8a8b358a7f03c0e46eceacddff55f7c84b1f3b/cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad", size = 7176289, upload-time = "2026-02-10T19:17:08.274Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/9e/6b4397a3e3d15123de3b1806ef342522393d50736c13b20ec4c9ea6693a6/cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b", size = 4275637, upload-time = "2026-02-10T19:17:10.53Z" },
+    { url = "https://files.pythonhosted.org/packages/63/e7/471ab61099a3920b0c77852ea3f0ea611c9702f651600397ac567848b897/cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b", size = 4424742, upload-time = "2026-02-10T19:17:12.388Z" },
+    { url = "https://files.pythonhosted.org/packages/37/53/a18500f270342d66bf7e4d9f091114e31e5ee9e7375a5aba2e85a91e0044/cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263", size = 4277528, upload-time = "2026-02-10T19:17:13.853Z" },
+    { url = "https://files.pythonhosted.org/packages/22/29/c2e812ebc38c57b40e7c583895e73c8c5adb4d1e4a0cc4c5a4fdab2b1acc/cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d", size = 4947993, upload-time = "2026-02-10T19:17:15.618Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/e7/237155ae19a9023de7e30ec64e5d99a9431a567407ac21170a046d22a5a3/cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed", size = 4456855, upload-time = "2026-02-10T19:17:17.221Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/87/fc628a7ad85b81206738abbd213b07702bcbdada1dd43f72236ef3cffbb5/cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2", size = 3984635, upload-time = "2026-02-10T19:17:18.792Z" },
+    { url = "https://files.pythonhosted.org/packages/84/29/65b55622bde135aedf4565dc509d99b560ee4095e56989e815f8fd2aa910/cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2", size = 4277038, upload-time = "2026-02-10T19:17:20.256Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/36/45e76c68d7311432741faf1fbf7fac8a196a0a735ca21f504c75d37e2558/cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0", size = 4912181, upload-time = "2026-02-10T19:17:21.825Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/1a/c1ba8fead184d6e3d5afcf03d569acac5ad063f3ac9fb7258af158f7e378/cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731", size = 4456482, upload-time = "2026-02-10T19:17:25.133Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/e5/3fb22e37f66827ced3b902cf895e6a6bc1d095b5b26be26bd13c441fdf19/cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82", size = 4405497, upload-time = "2026-02-10T19:17:26.66Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/df/9d58bb32b1121a8a2f27383fabae4d63080c7ca60b9b5c88be742be04ee7/cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1", size = 4667819, upload-time = "2026-02-10T19:17:28.569Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/ed/325d2a490c5e94038cdb0117da9397ece1f11201f425c4e9c57fe5b9f08b/cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48", size = 3028230, upload-time = "2026-02-10T19:17:30.518Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/5a/ac0f49e48063ab4255d9e3b79f5def51697fce1a95ea1370f03dc9db76f6/cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4", size = 3480909, upload-time = "2026-02-10T19:17:32.083Z" },
+    { url = "https://files.pythonhosted.org/packages/00/13/3d278bfa7a15a96b9dc22db5a12ad1e48a9eb3d40e1827ef66a5df75d0d0/cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2", size = 7119287, upload-time = "2026-02-10T19:17:33.801Z" },
+    { url = "https://files.pythonhosted.org/packages/67/c8/581a6702e14f0898a0848105cbefd20c058099e2c2d22ef4e476dfec75d7/cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678", size = 4265728, upload-time = "2026-02-10T19:17:35.569Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/4a/ba1a65ce8fc65435e5a849558379896c957870dd64fecea97b1ad5f46a37/cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87", size = 4408287, upload-time = "2026-02-10T19:17:36.938Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/67/8ffdbf7b65ed1ac224d1c2df3943553766914a8ca718747ee3871da6107e/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee", size = 4270291, upload-time = "2026-02-10T19:17:38.748Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/e5/f52377ee93bc2f2bba55a41a886fd208c15276ffbd2569f2ddc89d50e2c5/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981", size = 4927539, upload-time = "2026-02-10T19:17:40.241Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/02/cfe39181b02419bbbbcf3abdd16c1c5c8541f03ca8bda240debc467d5a12/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9", size = 4442199, upload-time = "2026-02-10T19:17:41.789Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/96/2fcaeb4873e536cf71421a388a6c11b5bc846e986b2b069c79363dc1648e/cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648", size = 3960131, upload-time = "2026-02-10T19:17:43.379Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/d2/b27631f401ddd644e94c5cf33c9a4069f72011821cf3dc7309546b0642a0/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4", size = 4270072, upload-time = "2026-02-10T19:17:45.481Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/a7/60d32b0370dae0b4ebe55ffa10e8599a2a59935b5ece1b9f06edb73abdeb/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0", size = 4892170, upload-time = "2026-02-10T19:17:46.997Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/b9/cf73ddf8ef1164330eb0b199a589103c363afa0cf794218c24d524a58eab/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663", size = 4441741, upload-time = "2026-02-10T19:17:48.661Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/eb/eee00b28c84c726fe8fa0158c65afe312d9c3b78d9d01daf700f1f6e37ff/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826", size = 4396728, upload-time = "2026-02-10T19:17:50.058Z" },
+    { url = "https://files.pythonhosted.org/packages/65/f4/6bc1a9ed5aef7145045114b75b77c2a8261b4d38717bd8dea111a63c3442/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d", size = 4652001, upload-time = "2026-02-10T19:17:51.54Z" },
+    { url = "https://files.pythonhosted.org/packages/86/ef/5d00ef966ddd71ac2e6951d278884a84a40ffbd88948ef0e294b214ae9e4/cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a", size = 3003637, upload-time = "2026-02-10T19:17:52.997Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/57/f3f4160123da6d098db78350fdfd9705057aad21de7388eacb2401dceab9/cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4", size = 3469487, upload-time = "2026-02-10T19:17:54.549Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/fa/a66aa722105ad6a458bebd64086ca2b72cdd361fed31763d20390f6f1389/cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31", size = 7170514, upload-time = "2026-02-10T19:17:56.267Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/04/c85bdeab78c8bc77b701bf0d9bdcf514c044e18a46dcff330df5448631b0/cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18", size = 4275349, upload-time = "2026-02-10T19:17:58.419Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/32/9b87132a2f91ee7f5223b091dc963055503e9b442c98fc0b8a5ca765fab0/cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235", size = 4420667, upload-time = "2026-02-10T19:18:00.619Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/a6/a7cb7010bec4b7c5692ca6f024150371b295ee1c108bdc1c400e4c44562b/cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a", size = 4276980, upload-time = "2026-02-10T19:18:02.379Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/7c/c4f45e0eeff9b91e3f12dbd0e165fcf2a38847288fcfd889deea99fb7b6d/cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76", size = 4939143, upload-time = "2026-02-10T19:18:03.964Z" },
+    { url = "https://files.pythonhosted.org/packages/37/19/e1b8f964a834eddb44fa1b9a9976f4e414cbb7aa62809b6760c8803d22d1/cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614", size = 4453674, upload-time = "2026-02-10T19:18:05.588Z" },
+    { url = "https://files.pythonhosted.org/packages/db/ed/db15d3956f65264ca204625597c410d420e26530c4e2943e05a0d2f24d51/cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229", size = 3978801, upload-time = "2026-02-10T19:18:07.167Z" },
+    { url = "https://files.pythonhosted.org/packages/41/e2/df40a31d82df0a70a0daf69791f91dbb70e47644c58581d654879b382d11/cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1", size = 4276755, upload-time = "2026-02-10T19:18:09.813Z" },
+    { url = "https://files.pythonhosted.org/packages/33/45/726809d1176959f4a896b86907b98ff4391a8aa29c0aaaf9450a8a10630e/cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d", size = 4901539, upload-time = "2026-02-10T19:18:11.263Z" },
+    { url = "https://files.pythonhosted.org/packages/99/0f/a3076874e9c88ecb2ecc31382f6e7c21b428ede6f55aafa1aa272613e3cd/cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c", size = 4452794, upload-time = "2026-02-10T19:18:12.914Z" },
+    { url = "https://files.pythonhosted.org/packages/02/ef/ffeb542d3683d24194a38f66ca17c0a4b8bf10631feef44a7ef64e631b1a/cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4", size = 4404160, upload-time = "2026-02-10T19:18:14.375Z" },
+    { url = "https://files.pythonhosted.org/packages/96/93/682d2b43c1d5f1406ed048f377c0fc9fc8f7b0447a478d5c65ab3d3a66eb/cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9", size = 4667123, upload-time = "2026-02-10T19:18:15.886Z" },
+    { url = "https://files.pythonhosted.org/packages/45/2d/9c5f2926cb5300a8eefc3f4f0b3f3df39db7f7ce40c8365444c49363cbda/cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72", size = 3010220, upload-time = "2026-02-10T19:18:17.361Z" },
+    { url = "https://files.pythonhosted.org/packages/48/ef/0c2f4a8e31018a986949d34a01115dd057bf536905dca38897bacd21fac3/cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595", size = 3467050, upload-time = "2026-02-10T19:18:18.899Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/dd/2d9fdb07cebdf3d51179730afb7d5e576153c6744c3ff8fded23030c204e/cryptography-46.0.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c", size = 3476964, upload-time = "2026-02-10T19:18:20.687Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/6f/6cc6cc9955caa6eaf83660b0da2b077c7fe8ff9950a3c5e45d605038d439/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a", size = 4218321, upload-time = "2026-02-10T19:18:22.349Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/5d/c4da701939eeee699566a6c1367427ab91a8b7088cc2328c09dbee940415/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356", size = 4381786, upload-time = "2026-02-10T19:18:24.529Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/97/a538654732974a94ff96c1db621fa464f455c02d4bb7d2652f4edc21d600/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da", size = 4217990, upload-time = "2026-02-10T19:18:25.957Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/11/7e500d2dd3ba891197b9efd2da5454b74336d64a7cc419aa7327ab74e5f6/cryptography-46.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257", size = 4381252, upload-time = "2026-02-10T19:18:27.496Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605, upload-time = "2026-02-10T19:18:29.233Z" },
+]
+
+[[package]]
+name = "datafusion-benchmarks"
+version = "0.1.0"
+source = { virtual = "benchmarks" }
+dependencies = [
+    { name = "falsa" },
+    { name = "rich" },
+    { name = "typing-extensions" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "falsa" },
+    { name = "rich" },
+    { name = "typing-extensions" },
+]
+
+[[package]]
+name = "datafusion-dev"
+version = "0.1.0"
+source = { virtual = "dev" }
+dependencies = [
+    { name = "pygithub" },
+    { name = "requests" },
+    { name = "tomlkit" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "pygithub" },
+    { name = "requests" },
+    { name = "tomlkit" },
+]
+
+[[package]]
+name = "datafusion-docs"
+version = "0.1.0"
+source = { virtual = "docs" }
+dependencies = [
+    { name = "jinja2" },
+    { name = "maturin" },
+    { name = "myst-parser" },
+    { name = "pydata-sphinx-theme" },
+    { name = "setuptools" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "sphinx-reredirects" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "jinja2", specifier = ">=3.1,<4" },
+    { name = "maturin", specifier = ">=1.11,<2" },
+    { name = "myst-parser", specifier = ">=5,<6" },
+    { name = "pydata-sphinx-theme", specifier = ">=0.16,<1" },
+    { name = "setuptools", specifier = ">=82,<83" },
+    { name = "sphinx", specifier = ">=9,<10" },
+    { name = "sphinx-reredirects", specifier = ">=1.1,<2" },
+]
+
+[[package]]
+name = "docutils"
+version = "0.22.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
+]
+
+[[package]]
+name = "falsa"
+version = "0.0.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "pyarrow" },
+    { name = "typer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/36/65/0f51f3509cfe4f8cc5b9b1a7ba614a5c0ca0b7ada7a2f8de4275ddc5d979/falsa-0.0.6.tar.gz", hash = "sha256:1b037941886755a73a77f3c80ecb661ee4732085bd68947c0ec788f77b487b32", size = 524238, upload-time = "2025-09-20T07:35:15.162Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/46/61/9fb4f242b37ecf4b706703cdc1c8ca0e8333edab42172340d27680c19c86/falsa-0.0.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048d6b23fe7d2457761a406c667110904634685bac4816732455ee0c4f38ad0b", size = 437619, upload-time = "2025-09-20T07:33:31.806Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/cd/efb9c57f94d339a9dc7cf3ae555fa7dabcdf9c4c5d18bd1cf464b93e5457/falsa-0.0.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:85d96e0a0c481f50023ff5aa18b4dd663cdad7b778d2f98ca7d21e3fa132eef3", size = 435477, upload-time = "2025-09-20T07:33:43.118Z" },
+    { url = "https://files.pythonhosted.org/packages/17/85/814e049f046f25611be25352959be8a9a711ef384b46cba7c0797fe03882/falsa-0.0.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e44ecdff3361e4ecbfc67b84dc0ed04e3f73d37b20ebfb435c8d1ebca7b85bb9", size = 652226, upload-time = "2025-09-20T07:33:54.515Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/a3/0a064fedccc3462ea413c87d15b35da854878b300d432bd79a3404b4de36/falsa-0.0.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dc08fbb6833ead8bf63106837615236e259dd05fc4d1dd4b1b91b949ba632e2", size = 476290, upload-time = "2025-09-20T07:34:05.171Z" },
+    { url = "https://files.pythonhosted.org/packages/46/38/d7f9182a505439d893c9741acf12a9daa04ea2ae9c9afff01a65fc5619ef/falsa-0.0.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b57b6ef70842776c5698498d04c1c38602b255083ee6822fe6d8a67aa32b3260", size = 598436, upload-time = "2025-09-20T07:34:26.207Z" },
+    { url = "https://files.pythonhosted.org/packages/61/03/6199cc9011e8e708bef3e0420009b4e93be517f642184ee1f564b33b16d5/falsa-0.0.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9305aabafdf1be131b157d97ba7e105da115eef0e02af73f4716bcae64a18041", size = 461327, upload-time = "2025-09-20T07:34:16.337Z" },
+    { url = "https://files.pythonhosted.org/packages/85/58/8d72300acf63c671f4ed8fcf6d74312581e6ad72d530676ec4a8c30e2b06/falsa-0.0.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0ffaf1c24296b16320b11116420d221b4678f1c4942ecf88599b33b094e78c7", size = 616922, upload-time = "2025-09-20T07:34:34.73Z" },
+    { url = "https://files.pythonhosted.org/packages/31/09/da0a47ef5f56d3b9466f24b0451d6f326c6637da383b3b95b07ccd7be7c3/falsa-0.0.6-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:828f151c6737ed4d9051edbf695738e4d758815c316b58fa18166e0ab3d1fea7", size = 699657, upload-time = "2025-09-20T07:34:45.774Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/98/bc733bc0d88fb975577b530dca848cfcfbae20010af1884822d18fed634e/falsa-0.0.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:938f5170282f699638e0c7a941cc80235bd5ca8a8c5a19b65615aa0dc6fbf3f8", size = 632823, upload-time = "2025-09-20T07:34:56.436Z" },
+    { url = "https://files.pythonhosted.org/packages/42/8e/eb5a164f44dddf674c6c248da8d4f241dc8d2bf1fcff4db74bc00f9c0036/falsa-0.0.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56e500c635ad608fe3cf7d2634bd6e3d736aa432dfe00498af14e470eb354254", size = 605256, upload-time = "2025-09-20T07:35:06.564Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/20/3d74be0cc90d3d6d4edea625c5e57efa404a388428506c54f11cbd8413f0/falsa-0.0.6-cp311-cp311-win32.whl", hash = "sha256:fe0ff809e7246d1b06e03662c3a84f2e10d252590f62e06d0f937d498cda24d8", size = 253058, upload-time = "2025-09-20T07:35:21.813Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/f4/95c01bd3fda06fbe711e69252ba99a99484a701ca426481556cb362a7121/falsa-0.0.6-cp311-cp311-win_amd64.whl", hash = "sha256:050bc5eb7cbd1c0c6551851af0d3ef6a6db1794123c49718bdf2472103facf65", size = 276389, upload-time = "2025-09-20T07:35:17.047Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/f7/bce7df04f3ea86c88e6b2b82bd4cfce3d50b0057b68ae98fb1703730ad3e/falsa-0.0.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2a17bf26161fd5fdde8db3bcb0f290bbcad679ae231842d53bfebd506130faf", size = 436615, upload-time = "2025-09-20T07:33:32.811Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/34/e42d33525910f37b165ba765a8548eca8079ee94ec4ca4001a3f13e7eab1/falsa-0.0.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c685c34779b33e8db9d13517931d3ea6df785756fea26b7ac11a49059c1375ca", size = 435130, upload-time = "2025-09-20T07:33:44.498Z" },
+    { url = "https://files.pythonhosted.org/packages/53/dc/212f5b3b7e7a99a3867af1d49745e393d79610aa4c2218c72b6a4c9e9312/falsa-0.0.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6682631faa42ad303730872db6dce7b809da94842546fbd15431ebabba2b99bc", size = 651373, upload-time = "2025-09-20T07:33:55.721Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/e5/076c350bd7f6887463f28d7c49d97abb738daaeab356da5c5793720d32ba/falsa-0.0.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf5d69cce8670b8d8617daa0a874e5bcb0a3409d368bfb044354b0db9404ff72", size = 475126, upload-time = "2025-09-20T07:34:06.562Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/3c/44d9e23b01da33b094bd4ee4cdae4f667a1cf0e123413981d16509660609/falsa-0.0.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:818ed089f8088ff9d170f366ad3df07c1458581d864ec3153b48be5bf06fc6f3", size = 597193, upload-time = "2025-09-20T07:34:27.531Z" },
+    { url = "https://files.pythonhosted.org/packages/11/aa/70afcfbb1d76ccf275d7fb1cb6ee99720039a11b9d66ed23219f6cd4209a/falsa-0.0.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0e48df7acf762af490fcc3bfe9baeaeec82d151669e111c7630b37d38707bf73", size = 460932, upload-time = "2025-09-20T07:34:17.351Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/54/bd69faa0989fbbdf61793dedff7d953cd3832580ef35398f9f5a43443b29/falsa-0.0.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:eee10e87d74efe7a089db0a58c8cb6e02082b80618c8be70c75816e818d0194a", size = 616017, upload-time = "2025-09-20T07:34:36.222Z" },
+    { url = "https://files.pythonhosted.org/packages/26/29/06a92316c7799337a40c7e3d8737827ea3590b1bdc66fb8341c720d96e8e/falsa-0.0.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a97cc63f77f635e9ec738584565edf933d31078e94825788c236864488e7b062", size = 698946, upload-time = "2025-09-20T07:34:47.185Z" },
+    { url = "https://files.pythonhosted.org/packages/df/14/5081e53d8e2927f86af70007e7d424a8bc3992527f87db78d8f21541e89c/falsa-0.0.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4a3494b7c352e506c64c708b64e85afcb593419d541dbadf38405dc0fbc02f61", size = 632186, upload-time = "2025-09-20T07:34:57.499Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/fe/8d691ed9f2159726828cbe0765c579c032d35eb647ccfeb6ab10ffaa2f48/falsa-0.0.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:852d57713f169043d9ecbdb2ae6b8a93e87de68aa790e800f487fa61dfed1729", size = 603671, upload-time = "2025-09-20T07:35:07.65Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/70/425e1ad3b447a86c4f246433020d6c5ff359f278120e57e08e4b0b91cd16/falsa-0.0.6-cp312-cp312-win_amd64.whl", hash = "sha256:ea831bfdcbca03c2ca220dc61b2a8de14526af9a9a6a014f275299aace25f5c5", size = 275829, upload-time = "2025-09-20T07:35:18.074Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/8f/fb2e90057ae3f69b89f188c83dc4b930b34e6ecf89d7e5b7d99ae07e6b52/falsa-0.0.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7bb9884f8cf468e1de57f0fa59532ed99c8bfd41999cf85e57e78a9fb8fd0ca", size = 436591, upload-time = "2025-09-20T07:33:34.336Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/c2/57e1b88757e637865fb2390560f927fd9eb60e793d82bbcf18d411b36104/falsa-0.0.6-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bc80e361b29d19d5739a6cb1ace1e00765f139e1d065c70693a644f7c4375089", size = 434955, upload-time = "2025-09-20T07:33:45.802Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/29/79585d31bce867fa083d2ca11bb469a3530077407ea2549046d6e496df24/falsa-0.0.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca667084eb89a07893c373bbe05492235482a214b23b13da39626d71c9028ce7", size = 650688, upload-time = "2025-09-20T07:33:56.767Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/50/cda029ec50341601c283b040748172ba9cacc0a16880e93e4cb6239a715e/falsa-0.0.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4811ab6aa0b2a155180aac6b3800ae5ea800bf422bddf8fb11daa509908c793", size = 475074, upload-time = "2025-09-20T07:34:07.88Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/62/1272b0c50203d0be2df3253e237f1ddbadce1642117d9dab4fb658fd241a/falsa-0.0.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd0e7075aa22daaa970ca113502c51d1e0d89cf3322be116213099f61aa5fe", size = 597359, upload-time = "2025-09-20T07:34:28.566Z" },
+    { url = "https://files.pythonhosted.org/packages/72/c9/4cc472d2e734bd4788ff5ce43825aaeba4715fc70f4900f2bfd6099b809e/falsa-0.0.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:37882088385512187511311d56a26226d45fd4f53dad081e50fdb07f587e0201", size = 461025, upload-time = "2025-09-20T07:34:18.436Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/a3/32206b72a42c06d771cd18b1211321d2fa413695e4cc9616b72d80708252/falsa-0.0.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:22f0c8dd927e857480c83b4db1e4209021e0a301efb8e76b2d3a91ad747b3768", size = 616183, upload-time = "2025-09-20T07:34:37.526Z" },
+    { url = "https://files.pythonhosted.org/packages/54/57/244227fd859a5173938501a17bd2ec81c09ce25a60472dceb1f54dbb529b/falsa-0.0.6-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:fd06795b6873926a507f685eb147a06fb6c7282789ceb550558c42325bcbc637", size = 698951, upload-time = "2025-09-20T07:34:48.241Z" },
+    { url = "https://files.pythonhosted.org/packages/41/6f/57d82f555f288ea9106b7a7ffb1978d27f8ffc1bf52753b8c2c4298acc00/falsa-0.0.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5b6fd2c5cc4bbcae5b1a28f533705eb95ba0e220c8b70c67c830e86309477fb5", size = 632175, upload-time = "2025-09-20T07:34:58.664Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a4/64c6c7dfe0e73948ead7e19217e38116853fa49512ee91dfdf41e8f799ca/falsa-0.0.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ea73bd1b098198b0cabd94eec7952de37051024b26805a30906ed350d3b474a8", size = 604022, upload-time = "2025-09-20T07:35:08.71Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/e2/42d9b92f09671cacc629a000d08656fe4f0da4ec818f4841fa700a0651f0/falsa-0.0.6-cp313-cp313-win_amd64.whl", hash = "sha256:80908855b7e8144add3d5f9b1ff7ef58d2fc574a6e8f7ac755437a178058d2ac", size = 275625, upload-time = "2025-09-20T07:35:19.664Z" },
+    { url = "https://files.pythonhosted.org/packages/90/9e/304d3ce465ca33055ed22560e7694dd8418f200d1c6eaca16236aa24035e/falsa-0.0.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6177b18bb6e61f333cca5c73d1c60a809a688937090130f8baeea4363366b9e", size = 436505, upload-time = "2025-09-20T07:33:35.655Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/e8/0f51c6562ee4e0c572e3cac4c9ea338678a15e349351474e4f298184f8c0/falsa-0.0.6-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5df6bedb01acf73134f565b0352493b981aa3ea84d09fd4e8d6f2c618042a1f3", size = 433993, upload-time = "2025-09-20T07:33:47.056Z" },
+    { url = "https://files.pythonhosted.org/packages/46/6e/7a0a4acfc0bf397fd6f3c749040287c75e6fc9677d32ec20bca8e06ae4e0/falsa-0.0.6-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:731acd74b9b41e9bca388176c7e7be6ea48b5ba136f149f41bdfaaaaa53a40e4", size = 649979, upload-time = "2025-09-20T07:33:57.991Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/2a/19d66b0b38232d6230ed163e9c24c55683f38348930e25c7e36188b9e7a1/falsa-0.0.6-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d7aa02f407b473fe81a5e94d3cbaa5ba34e243da35593fbfb1b71351093eac8", size = 474443, upload-time = "2025-09-20T07:34:08.949Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/df/80bea42472af460b2b18c3bb547ae5eaf55bea9eff63f5abf266dca51b5a/falsa-0.0.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f0214f94434924e03308b48a81ddf246d0c8c9e1e4b323184bb417fe81df190e", size = 615972, upload-time = "2025-09-20T07:34:38.639Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/6d/449f03ad7b5c31f7cac1fc7177419a67d0c53b7733c83034772ca491b697/falsa-0.0.6-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:4e2982b9ef053fedca216f6abeb5d7325d73f4df24540dd9a0fe8463a9c80abd", size = 698052, upload-time = "2025-09-20T07:34:49.336Z" },
+    { url = "https://files.pythonhosted.org/packages/34/6f/723bed02c00e9b3741a2b8fdbbca1afb7ba3fc2ad398be85cd477408f611/falsa-0.0.6-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:4953ae9f87aefed8a3936562dbab20dd6b3a6cdadf32f009ef552e9e5df96a56", size = 631684, upload-time = "2025-09-20T07:34:59.715Z" },
+    { url = "https://files.pythonhosted.org/packages/54/70/a8a0bda4afa93bd602ce05efe3f615f25e2145880e5abb0f8138312fcaed/falsa-0.0.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fcf31b451835037ccdf6b9adb9353d99981178d6e96601b6b023fbac1db74342", size = 604314, upload-time = "2025-09-20T07:35:09.78Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/47/6e1a6a2cf730e7cf5b2a5159066590a5151867b0cf1c913386285b39d52c/falsa-0.0.6-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de0bd27c505b47c8870463ef9376e52e72d54a7f3bb7b393e6a0f5fe8227c95e", size = 597105, upload-time = "2025-09-20T07:34:29.668Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/a0/3d697341c44c238e635af6f4ccc87d1150edbb5374c67e6f7c86c9818336/falsa-0.0.6-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bf8f50d6f8f65009ae5b986f4220dd823cb22d704221e29ca91a06dd0c178599", size = 461233, upload-time = "2025-09-20T07:34:19.704Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/a6/a59e8d6f27c049a0955f3b7d7a229633213f485b0175d6a348fc66047bdd/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b8714397240eeb05f490b8e2c1ca6592edb2e6c5e6652baaf1d29ea4bd2c4a6", size = 438116, upload-time = "2025-09-20T07:33:39.668Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/e8/27f367c60dd662e009dd2945c1fdbc74fad277c6b668d02ee004ba41e2ee/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47a610301a11f1b53c12092d97b5dff80e576b1534883e62a02d019bc759d06f", size = 436210, upload-time = "2025-09-20T07:33:50.477Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/a4/6163320b1130da9333f851633a6f7b726ea42974bafc6db333fc3c0a69e0/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:13c98c49225232016dfd8bdd0e5f2e10649f9d0388fde9b1020b04d7409c9078", size = 651561, upload-time = "2025-09-20T07:34:01.522Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/5d/f06f625cb2e9af5769f0f755154469e9a280b9ce6bedfff15564bce9483a/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e8d63db146847709114032382c4cdaf7274654781d3a56732eb5e622350654f2", size = 476530, upload-time = "2025-09-20T07:34:12.248Z" },
+    { url = "https://files.pythonhosted.org/packages/54/cb/81fd6f2d542ef1833485d95f766c29bf5a9bf73213d4c6dad8b2c4541327/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b552b1525300b14abd2400dc692cfb79de6813cec725deca03aaf251ca94111", size = 598516, upload-time = "2025-09-20T07:34:31.807Z" },
+    { url = "https://files.pythonhosted.org/packages/97/33/07809af6ff17d1fc3e059ea1a73a76cc5593661832cf0c91498be9bc8172/falsa-0.0.6-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:535f9d6cc9a745d7aed0b108f8447de1780e548fc30fbeb0d360f8403ed86b6e", size = 461808, upload-time = "2025-09-20T07:34:24.119Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/6a/0b4a3903f7c8ed15e2f5c8b4d226e0cf214f7f32dca1b74a8064f6d27c47/falsa-0.0.6-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:04109d8e1c58cd8d87d513546fa945db4b5883e1ddc29a1dc14b9bb999991d6d", size = 617349, upload-time = "2025-09-20T07:34:42.168Z" },
+    { url = "https://files.pythonhosted.org/packages/08/cc/3a7d98bd4f8569c9ec683d358379b6167e19911007263fcc45e4f414f407/falsa-0.0.6-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:9623ada575625e65245488ec6ef7cf09e40e134245c5ab8a440267338212f73e", size = 700202, upload-time = "2025-09-20T07:34:52.724Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/5c/88e1a1d2c29b83e0c5da30960815f830dd79694c474f6b7ae2eb716a8e65/falsa-0.0.6-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:9a8e8cd40e0389f56c2fb41bd0a0c2472c2365265b78966c7f187aaf3409558a", size = 633105, upload-time = "2025-09-20T07:35:03.315Z" },
+    { url = "https://files.pythonhosted.org/packages/37/03/94f5e53369796b3e93c3d942d6c010f3215957330a697a2c715fe93f2ac6/falsa-0.0.6-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:afaadf6ac8599bbf2e42f54bccda76e9f0218f6d6429085186d38d243c6b28da", size = 605690, upload-time = "2025-09-20T07:35:13.015Z" },
+]
+
+[[package]]
+name = "idna"
+version = "3.11"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
+]
+
+[[package]]
+name = "imagesize"
+version = "1.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
+]
+
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" },
+    { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" },
+    { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" },
+    { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" },
+    { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" },
+    { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" },
+    { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" },
+    { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" },
+    { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
+    { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
+    { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
+    { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
+    { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
+    { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
+    { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
+    { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
+    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
+    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
+    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
+    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
+    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
+    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
+    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
+    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
+    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
+    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
+]
+
+[[package]]
+name = "maturin"
+version = "1.12.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/13/aeff8a21835ed0e40c329c286750fcdcdcbf231f1a5cb327378666c5def6/maturin-1.12.2.tar.gz", hash = "sha256:d6253079f53dbb692395a13abddc0f2d3d96af32f8c0b32e2912849713c55794", size = 257279, upload-time = "2026-02-16T13:56:20.221Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8a/9d/4811e1fcaa346a0b9fad6aee0ac0eec9eb376a24fe27c66d5d4fe975586e/maturin-1.12.2-py3-none-linux_armv6l.whl", hash = "sha256:0ed31b6a392928ad23645a470edc4f3814b952a416e41f8e5daac42d7bfbabc6", size = 9653200, upload-time = "2026-02-16T13:56:16.216Z" },
+    { url = "https://files.pythonhosted.org/packages/69/db/74d582af74c32bbda12e4d7e153b389884409a1c5cd31edc9d3194d515f7/maturin-1.12.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f1c2e4ee43bf286b052091a3b2356a157978985837c7aed42354deb2947a4006", size = 18870087, upload-time = "2026-02-16T13:56:18.463Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/6f/71be226c6780387f032c0b4ab791c390c7162ed62f93a11e600f9266dafd/maturin-1.12.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:04c9c4f9c9f904f007cbfcd4640c406e53f19d04c220f5940d1537edb914d325", size = 9762083, upload-time = "2026-02-16T13:56:27.853Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/cc/989dce6140227277b4184aab248d07fe67fa11f95411ccf90e272542287d/maturin-1.12.2-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:4bdc486b9ab80d8b50143ecc9a1924b890866fe95be150dd9a59fa22a6b37238", size = 9710711, upload-time = "2026-02-16T13:56:21.364Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/e8/02bb64f7150013d8af3ca622944e22f550beb312b6d5cf8760dc2896cce8/maturin-1.12.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:134e895578258a693ba1d55b166c2ba96e9f51067e106b8a74d422432653d45b", size = 10205015, upload-time = "2026-02-16T13:56:07.994Z" },
+    { url = "https://files.pythonhosted.org/packages/84/81/b603a74bef68fabd402d1e54f43560213ea69c3c01467610d0256eea013b/maturin-1.12.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:39665d622dcc950ab17b9569e8cab84a4d64eea6a18b540a8b49e00c0f7dda02", size = 9536887, upload-time = "2026-02-16T13:56:25.658Z" },
+    { url = "https://files.pythonhosted.org/packages/70/a5/387c7bced34f7fd8d08d399c6b1ac3d94d7ca50c9f87db9e1bc0dd8c8d08/maturin-1.12.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:ca3b20bcc3aff115c9eaf97340e78bff58829ea1efa16764940dd0d858dcf6af", size = 9487394, upload-time = "2026-02-16T13:56:29.875Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/30/d5ae812c54a70d5d3a5b67b073e92d1d14d36675242e2d00e6a175fa6117/maturin-1.12.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:d1617989b4a5dc543fea6d23c28b2f07fadb2c726ff00fe959538ee71a301384", size = 12577754, upload-time = "2026-02-16T13:56:31.902Z" },
+    { url = "https://files.pythonhosted.org/packages/84/f4/7baac2fa5324ccdc3f888ff5f6a793f3eb5a7805d89bc17a8bacbe9fc566/maturin-1.12.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6af778e7ee048612e55a1255488db7678741bea2ba881e66a19712f59f2534cb", size = 10375409, upload-time = "2026-02-16T13:56:23.316Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/ed/5680efbb1becb4f47da3ada8ea4eb6844d2fd91ae558287e1dd0871cb603/maturin-1.12.2-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:72aad9efe09a6392de9930f2bea80bfcc36fd98e18caa621f512571179c02d41", size = 10010584, upload-time = "2026-02-16T13:56:10.357Z" },
+    { url = "https://files.pythonhosted.org/packages/86/20/7e27e07dd2270b707dd0124256cd46bef7c8832476b0aefa2ecd74835365/maturin-1.12.2-py3-none-win32.whl", hash = "sha256:9763d277e143409cf0ce309eb1a493fc4e1e75777364d67ccac39a161b51b5b0", size = 8483122, upload-time = "2026-02-16T13:56:12.606Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/6e/9cc0e19c9a336fbc1b9664c1a7955caa6d8fd510c0047ace9be66a33704a/maturin-1.12.2-py3-none-win_amd64.whl", hash = "sha256:c06d218931985035d7ab4d0211ba96027e1bc7e4b01a87c8c4e30a57790403ec", size = 9825577, upload-time = "2026-02-16T13:56:34.193Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/67/07ea2c991ca1a55c6b08cd821710736276af7a3e160e1f869ea5c41c78c3/maturin-1.12.2-py3-none-win_arm64.whl", hash = "sha256:a882cc80c241b1e2c27bd1acd713b09e9ac9266a3159cc1e34e8c7b77f049bba", size = 8522702, upload-time = "2026-02-16T13:56:14.42Z" },
+]
+
+[[package]]
+name = "mdit-py-plugins"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b2/fd/a756d36c0bfba5f6e39a1cdbdbfdd448dc02692467d83816dff4592a1ebc/mdit_py_plugins-0.5.0.tar.gz", hash = "sha256:f4918cb50119f50446560513a8e311d574ff6aaed72606ddae6d35716fe809c6", size = 44655, upload-time = "2025-08-11T07:25:49.083Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fb/86/dd6e5db36df29e76c7a7699123569a4a18c1623ce68d826ed96c62643cae/mdit_py_plugins-0.5.0-py3-none-any.whl", hash = "sha256:07a08422fc1936a5d26d146759e9155ea466e842f5ab2f7d2266dd084c8dab1f", size = 57205, upload-time = "2025-08-11T07:25:47.597Z" },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
+]
+
+[[package]]
+name = "myst-parser"
+version = "5.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "docutils" },
+    { name = "jinja2" },
+    { name = "markdown-it-py" },
+    { name = "mdit-py-plugins" },
+    { name = "pyyaml" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/fa/7b45eef11b7971f0beb29d27b7bfe0d747d063aa29e170d9edd004733c8a/myst_parser-5.0.0.tar.gz", hash = "sha256:f6f231452c56e8baa662cc352c548158f6a16fcbd6e3800fc594978002b94f3a", size = 98535, upload-time = "2026-01-15T09:08:18.036Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/ac/686789b9145413f1a61878c407210e41bfdb097976864e0913078b24098c/myst_parser-5.0.0-py3-none-any.whl", hash = "sha256:ab31e516024918296e169139072b81592336f2fef55b8986aa31c9f04b5f7211", size = 84533, upload-time = "2026-01-15T09:08:16.788Z" },
+]
+
+[[package]]
+name = "numpy"
+version = "2.4.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/fd/0005efbd0af48e55eb3c7208af93f2862d4b1a56cd78e84309a2d959208d/numpy-2.4.2.tar.gz", hash = "sha256:659a6107e31a83c4e33f763942275fd278b21d095094044eb35569e86a21ddae", size = 20723651, upload-time = "2026-01-31T23:13:10.135Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/44/71852273146957899753e69986246d6a176061ea183407e95418c2aa4d9a/numpy-2.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7e88598032542bd49af7c4747541422884219056c268823ef6e5e89851c8825", size = 16955478, upload-time = "2026-01-31T23:10:25.623Z" },
+    { url = "https://files.pythonhosted.org/packages/74/41/5d17d4058bd0cd96bcbd4d9ff0fb2e21f52702aab9a72e4a594efa18692f/numpy-2.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7edc794af8b36ca37ef5fcb5e0d128c7e0595c7b96a2318d1badb6fcd8ee86b1", size = 14965467, upload-time = "2026-01-31T23:10:28.186Z" },
+    { url = "https://files.pythonhosted.org/packages/49/48/fb1ce8136c19452ed15f033f8aee91d5defe515094e330ce368a0647846f/numpy-2.4.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:6e9f61981ace1360e42737e2bae58b27bf28a1b27e781721047d84bd754d32e7", size = 5475172, upload-time = "2026-01-31T23:10:30.848Z" },
+    { url = "https://files.pythonhosted.org/packages/40/a9/3feb49f17bbd1300dd2570432961f5c8a4ffeff1db6f02c7273bd020a4c9/numpy-2.4.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:cb7bbb88aa74908950d979eeaa24dbdf1a865e3c7e45ff0121d8f70387b55f73", size = 6805145, upload-time = "2026-01-31T23:10:32.352Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/39/fdf35cbd6d6e2fcad42fcf85ac04a85a0d0fbfbf34b30721c98d602fd70a/numpy-2.4.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f069069931240b3fc703f1e23df63443dbd6390614c8c44a87d96cd0ec81eb1", size = 15966084, upload-time = "2026-01-31T23:10:34.502Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/46/6fa4ea94f1ddf969b2ee941290cca6f1bfac92b53c76ae5f44afe17ceb69/numpy-2.4.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c02ef4401a506fb60b411467ad501e1429a3487abca4664871d9ae0b46c8ba32", size = 16899477, upload-time = "2026-01-31T23:10:37.075Z" },
+    { url = "https://files.pythonhosted.org/packages/09/a1/2a424e162b1a14a5bd860a464ab4e07513916a64ab1683fae262f735ccd2/numpy-2.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2653de5c24910e49c2b106499803124dde62a5a1fe0eedeaecf4309a5f639390", size = 17323429, upload-time = "2026-01-31T23:10:39.704Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/a2/73014149ff250628df72c58204822ac01d768697913881aacf839ff78680/numpy-2.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1ae241bbfc6ae276f94a170b14785e561cb5e7f626b6688cf076af4110887413", size = 18635109, upload-time = "2026-01-31T23:10:41.924Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/0c/73e8be2f1accd56df74abc1c5e18527822067dced5ec0861b5bb882c2ce0/numpy-2.4.2-cp311-cp311-win32.whl", hash = "sha256:df1b10187212b198dd45fa943d8985a3c8cf854aed4923796e0e019e113a1bda", size = 6237915, upload-time = "2026-01-31T23:10:45.26Z" },
+    { url = "https://files.pythonhosted.org/packages/76/ae/e0265e0163cf127c24c3969d29f1c4c64551a1e375d95a13d32eab25d364/numpy-2.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:b9c618d56a29c9cb1c4da979e9899be7578d2e0b3c24d52079c166324c9e8695", size = 12607972, upload-time = "2026-01-31T23:10:47.021Z" },
+    { url = "https://files.pythonhosted.org/packages/29/a5/c43029af9b8014d6ea157f192652c50042e8911f4300f8f6ed3336bf437f/numpy-2.4.2-cp311-cp311-win_arm64.whl", hash = "sha256:47c5a6ed21d9452b10227e5e8a0e1c22979811cad7dcc19d8e3e2fb8fa03f1a3", size = 10485763, upload-time = "2026-01-31T23:10:50.087Z" },
+    { url = "https://files.pythonhosted.org/packages/51/6e/6f394c9c77668153e14d4da83bcc247beb5952f6ead7699a1a2992613bea/numpy-2.4.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:21982668592194c609de53ba4933a7471880ccbaadcc52352694a59ecc860b3a", size = 16667963, upload-time = "2026-01-31T23:10:52.147Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/f8/55483431f2b2fd015ae6ed4fe62288823ce908437ed49db5a03d15151678/numpy-2.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40397bda92382fcec844066efb11f13e1c9a3e2a8e8f318fb72ed8b6db9f60f1", size = 14693571, upload-time = "2026-01-31T23:10:54.789Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/20/18026832b1845cdc82248208dd929ca14c9d8f2bac391f67440707fff27c/numpy-2.4.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:b3a24467af63c67829bfaa61eecf18d5432d4f11992688537be59ecd6ad32f5e", size = 5203469, upload-time = "2026-01-31T23:10:57.343Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/33/2eb97c8a77daaba34eaa3fa7241a14ac5f51c46a6bd5911361b644c4a1e2/numpy-2.4.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:805cc8de9fd6e7a22da5aed858e0ab16be5a4db6c873dde1d7451c541553aa27", size = 6550820, upload-time = "2026-01-31T23:10:59.429Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/91/b97fdfd12dc75b02c44e26c6638241cc004d4079a0321a69c62f51470c4c/numpy-2.4.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d82351358ffbcdcd7b686b90742a9b86632d6c1c051016484fa0b326a0a1548", size = 15663067, upload-time = "2026-01-31T23:11:01.291Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/c6/a18e59f3f0b8071cc85cbc8d80cd02d68aa9710170b2553a117203d46936/numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e35d3e0144137d9fdae62912e869136164534d64a169f86438bc9561b6ad49f", size = 16619782, upload-time = "2026-01-31T23:11:03.669Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/83/9751502164601a79e18847309f5ceec0b1446d7b6aa12305759b72cf98b2/numpy-2.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adb6ed2ad29b9e15321d167d152ee909ec73395901b70936f029c3bc6d7f4460", size = 17013128, upload-time = "2026-01-31T23:11:05.913Z" },
+    { url = "https://files.pythonhosted.org/packages/61/c4/c4066322256ec740acc1c8923a10047818691d2f8aec254798f3dd90f5f2/numpy-2.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8906e71fd8afcb76580404e2a950caef2685df3d2a57fe82a86ac8d33cc007ba", size = 18345324, upload-time = "2026-01-31T23:11:08.248Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/af/6157aa6da728fa4525a755bfad486ae7e3f76d4c1864138003eb84328497/numpy-2.4.2-cp312-cp312-win32.whl", hash = "sha256:ec055f6dae239a6299cace477b479cca2fc125c5675482daf1dd886933a1076f", size = 5960282, upload-time = "2026-01-31T23:11:10.497Z" },
+    { url = "https://files.pythonhosted.org/packages/92/0f/7ceaaeaacb40567071e94dbf2c9480c0ae453d5bb4f52bea3892c39dc83c/numpy-2.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:209fae046e62d0ce6435fcfe3b1a10537e858249b3d9b05829e2a05218296a85", size = 12314210, upload-time = "2026-01-31T23:11:12.176Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/a3/56c5c604fae6dd40fa2ed3040d005fca97e91bd320d232ac9931d77ba13c/numpy-2.4.2-cp312-cp312-win_arm64.whl", hash = "sha256:fbde1b0c6e81d56f5dccd95dd4a711d9b95df1ae4009a60887e56b27e8d903fa", size = 10220171, upload-time = "2026-01-31T23:11:14.684Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/22/815b9fe25d1d7ae7d492152adbc7226d3eff731dffc38fe970589fcaaa38/numpy-2.4.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25f2059807faea4b077a2b6837391b5d830864b3543627f381821c646f31a63c", size = 16663696, upload-time = "2026-01-31T23:11:17.516Z" },
+    { url = "https://files.pythonhosted.org/packages/09/f0/817d03a03f93ba9c6c8993de509277d84e69f9453601915e4a69554102a1/numpy-2.4.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bd3a7a9f5847d2fb8c2c6d1c862fa109c31a9abeca1a3c2bd5a64572955b2979", size = 14688322, upload-time = "2026-01-31T23:11:19.883Z" },
+    { url = "https://files.pythonhosted.org/packages/da/b4/f805ab79293c728b9a99438775ce51885fd4f31b76178767cfc718701a39/numpy-2.4.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8e4549f8a3c6d13d55041925e912bfd834285ef1dd64d6bc7d542583355e2e98", size = 5198157, upload-time = "2026-01-31T23:11:22.375Z" },
+    { url = "https://files.pythonhosted.org/packages/74/09/826e4289844eccdcd64aac27d13b0fd3f32039915dd5b9ba01baae1f436c/numpy-2.4.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:aea4f66ff44dfddf8c2cffd66ba6538c5ec67d389285292fe428cb2c738c8aef", size = 6546330, upload-time = "2026-01-31T23:11:23.958Z" },
+    { url = "https://files.pythonhosted.org/packages/19/fb/cbfdbfa3057a10aea5422c558ac57538e6acc87ec1669e666d32ac198da7/numpy-2.4.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3cd545784805de05aafe1dde61752ea49a359ccba9760c1e5d1c88a93bbf2b7", size = 15660968, upload-time = "2026-01-31T23:11:25.713Z" },
+    { url = "https://files.pythonhosted.org/packages/04/dc/46066ce18d01645541f0186877377b9371b8fa8017fa8262002b4ef22612/numpy-2.4.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0d9b7c93578baafcbc5f0b83eaf17b79d345c6f36917ba0c67f45226911d499", size = 16607311, upload-time = "2026-01-31T23:11:28.117Z" },
+    { url = "https://files.pythonhosted.org/packages/14/d9/4b5adfc39a43fa6bf918c6d544bc60c05236cc2f6339847fc5b35e6cb5b0/numpy-2.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f74f0f7779cc7ae07d1810aab8ac6b1464c3eafb9e283a40da7309d5e6e48fbb", size = 17012850, upload-time = "2026-01-31T23:11:30.888Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/20/adb6e6adde6d0130046e6fdfb7675cc62bc2f6b7b02239a09eb58435753d/numpy-2.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c7ac672d699bf36275c035e16b65539931347d68b70667d28984c9fb34e07fa7", size = 18334210, upload-time = "2026-01-31T23:11:33.214Z" },
+    { url = "https://files.pythonhosted.org/packages/78/0e/0a73b3dff26803a8c02baa76398015ea2a5434d9b8265a7898a6028c1591/numpy-2.4.2-cp313-cp313-win32.whl", hash = "sha256:8e9afaeb0beff068b4d9cd20d322ba0ee1cecfb0b08db145e4ab4dd44a6b5110", size = 5958199, upload-time = "2026-01-31T23:11:35.385Z" },
+    { url = "https://files.pythonhosted.org/packages/43/bc/6352f343522fcb2c04dbaf94cb30cca6fd32c1a750c06ad6231b4293708c/numpy-2.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:7df2de1e4fba69a51c06c28f5a3de36731eb9639feb8e1cf7e4a7b0daf4cf622", size = 12310848, upload-time = "2026-01-31T23:11:38.001Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/8d/6da186483e308da5da1cc6918ce913dcfe14ffde98e710bfeff2a6158d4e/numpy-2.4.2-cp313-cp313-win_arm64.whl", hash = "sha256:0fece1d1f0a89c16b03442eae5c56dc0be0c7883b5d388e0c03f53019a4bfd71", size = 10221082, upload-time = "2026-01-31T23:11:40.392Z" },
+    { url = "https://files.pythonhosted.org/packages/25/a1/9510aa43555b44781968935c7548a8926274f815de42ad3997e9e83680dd/numpy-2.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5633c0da313330fd20c484c78cdd3f9b175b55e1a766c4a174230c6b70ad8262", size = 14815866, upload-time = "2026-01-31T23:11:42.495Z" },
+    { url = "https://files.pythonhosted.org/packages/36/30/6bbb5e76631a5ae46e7923dd16ca9d3f1c93cfa8d4ed79a129814a9d8db3/numpy-2.4.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d9f64d786b3b1dd742c946c42d15b07497ed14af1a1f3ce840cce27daa0ce913", size = 5325631, upload-time = "2026-01-31T23:11:44.7Z" },
+    { url = "https://files.pythonhosted.org/packages/46/00/3a490938800c1923b567b3a15cd17896e68052e2145d8662aaf3e1ffc58f/numpy-2.4.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:b21041e8cb6a1eb5312dd1d2f80a94d91efffb7a06b70597d44f1bd2dfc315ab", size = 6646254, upload-time = "2026-01-31T23:11:46.341Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/e9/fac0890149898a9b609caa5af7455a948b544746e4b8fe7c212c8edd71f8/numpy-2.4.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:00ab83c56211a1d7c07c25e3217ea6695e50a3e2f255053686b081dc0b091a82", size = 15720138, upload-time = "2026-01-31T23:11:48.082Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/5c/08887c54e68e1e28df53709f1893ce92932cc6f01f7c3d4dc952f61ffd4e/numpy-2.4.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fb882da679409066b4603579619341c6d6898fc83a8995199d5249f986e8e8f", size = 16655398, upload-time = "2026-01-31T23:11:50.293Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/89/253db0fa0e66e9129c745e4ef25631dc37d5f1314dad2b53e907b8538e6d/numpy-2.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:66cb9422236317f9d44b67b4d18f44efe6e9c7f8794ac0462978513359461554", size = 17079064, upload-time = "2026-01-31T23:11:52.927Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/d5/cbade46ce97c59c6c3da525e8d95b7abe8a42974a1dc5c1d489c10433e88/numpy-2.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0f01dcf33e73d80bd8dc0f20a71303abbafa26a19e23f6b68d1aa9990af90257", size = 18379680, upload-time = "2026-01-31T23:11:55.22Z" },
+    { url = "https://files.pythonhosted.org/packages/40/62/48f99ae172a4b63d981babe683685030e8a3df4f246c893ea5c6ef99f018/numpy-2.4.2-cp313-cp313t-win32.whl", hash = "sha256:52b913ec40ff7ae845687b0b34d8d93b60cb66dcee06996dd5c99f2fc9328657", size = 6082433, upload-time = "2026-01-31T23:11:58.096Z" },
+    { url = "https://files.pythonhosted.org/packages/07/38/e054a61cfe48ad9f1ed0d188e78b7e26859d0b60ef21cd9de4897cdb5326/numpy-2.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:5eea80d908b2c1f91486eb95b3fb6fab187e569ec9752ab7d9333d2e66bf2d6b", size = 12451181, upload-time = "2026-01-31T23:11:59.782Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/a4/a05c3a6418575e185dd84d0b9680b6bb2e2dc3e4202f036b7b4e22d6e9dc/numpy-2.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:fd49860271d52127d61197bb50b64f58454e9f578cb4b2c001a6de8b1f50b0b1", size = 10290756, upload-time = "2026-01-31T23:12:02.438Z" },
+    { url = "https://files.pythonhosted.org/packages/18/88/b7df6050bf18fdcfb7046286c6535cabbdd2064a3440fca3f069d319c16e/numpy-2.4.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:444be170853f1f9d528428eceb55f12918e4fda5d8805480f36a002f1415e09b", size = 16663092, upload-time = "2026-01-31T23:12:04.521Z" },
+    { url = "https://files.pythonhosted.org/packages/25/7a/1fee4329abc705a469a4afe6e69b1ef7e915117747886327104a8493a955/numpy-2.4.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d1240d50adff70c2a88217698ca844723068533f3f5c5fa6ee2e3220e3bdb000", size = 14698770, upload-time = "2026-01-31T23:12:06.96Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/0b/f9e49ba6c923678ad5bc38181c08ac5e53b7a5754dbca8e581aa1a56b1ff/numpy-2.4.2-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:7cdde6de52fb6664b00b056341265441192d1291c130e99183ec0d4b110ff8b1", size = 5208562, upload-time = "2026-01-31T23:12:09.632Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/12/d7de8f6f53f9bb76997e5e4c069eda2051e3fe134e9181671c4391677bb2/numpy-2.4.2-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:cda077c2e5b780200b6b3e09d0b42205a3d1c68f30c6dceb90401c13bff8fe74", size = 6543710, upload-time = "2026-01-31T23:12:11.969Z" },
+    { url = "https://files.pythonhosted.org/packages/09/63/c66418c2e0268a31a4cf8a8b512685748200f8e8e8ec6c507ce14e773529/numpy-2.4.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d30291931c915b2ab5717c2974bb95ee891a1cf22ebc16a8006bd59cd210d40a", size = 15677205, upload-time = "2026-01-31T23:12:14.33Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/6c/7f237821c9642fb2a04d2f1e88b4295677144ca93285fd76eff3bcba858d/numpy-2.4.2-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bba37bc29d4d85761deed3954a1bc62be7cf462b9510b51d367b769a8c8df325", size = 16611738, upload-time = "2026-01-31T23:12:16.525Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/a7/39c4cdda9f019b609b5c473899d87abff092fc908cfe4d1ecb2fcff453b0/numpy-2.4.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b2f0073ed0868db1dcd86e052d37279eef185b9c8db5bf61f30f46adac63c909", size = 17028888, upload-time = "2026-01-31T23:12:19.306Z" },
+    { url = "https://files.pythonhosted.org/packages/da/b3/e84bb64bdfea967cc10950d71090ec2d84b49bc691df0025dddb7c26e8e3/numpy-2.4.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7f54844851cdb630ceb623dcec4db3240d1ac13d4990532446761baede94996a", size = 18339556, upload-time = "2026-01-31T23:12:21.816Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f5/954a291bc1192a27081706862ac62bb5920fbecfbaa302f64682aa90beed/numpy-2.4.2-cp314-cp314-win32.whl", hash = "sha256:12e26134a0331d8dbd9351620f037ec470b7c75929cb8a1537f6bfe411152a1a", size = 6006899, upload-time = "2026-01-31T23:12:24.14Z" },
+    { url = "https://files.pythonhosted.org/packages/05/cb/eff72a91b2efdd1bc98b3b8759f6a1654aa87612fc86e3d87d6fe4f948c4/numpy-2.4.2-cp314-cp314-win_amd64.whl", hash = "sha256:068cdb2d0d644cdb45670810894f6a0600797a69c05f1ac478e8d31670b8ee75", size = 12443072, upload-time = "2026-01-31T23:12:26.33Z" },
+    { url = "https://files.pythonhosted.org/packages/37/75/62726948db36a56428fce4ba80a115716dc4fad6a3a4352487f8bb950966/numpy-2.4.2-cp314-cp314-win_arm64.whl", hash = "sha256:6ed0be1ee58eef41231a5c943d7d1375f093142702d5723ca2eb07db9b934b05", size = 10494886, upload-time = "2026-01-31T23:12:28.488Z" },
+    { url = "https://files.pythonhosted.org/packages/36/2f/ee93744f1e0661dc267e4b21940870cabfae187c092e1433b77b09b50ac4/numpy-2.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:98f16a80e917003a12c0580f97b5f875853ebc33e2eaa4bccfc8201ac6869308", size = 14818567, upload-time = "2026-01-31T23:12:30.709Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/24/6535212add7d76ff938d8bdc654f53f88d35cddedf807a599e180dcb8e66/numpy-2.4.2-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:20abd069b9cda45874498b245c8015b18ace6de8546bf50dfa8cea1696ed06ef", size = 5328372, upload-time = "2026-01-31T23:12:32.962Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/9d/c48f0a035725f925634bf6b8994253b43f2047f6778a54147d7e213bc5a7/numpy-2.4.2-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:e98c97502435b53741540a5717a6749ac2ada901056c7db951d33e11c885cc7d", size = 6649306, upload-time = "2026-01-31T23:12:34.797Z" },
+    { url = "https://files.pythonhosted.org/packages/81/05/7c73a9574cd4a53a25907bad38b59ac83919c0ddc8234ec157f344d57d9a/numpy-2.4.2-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da6cad4e82cb893db4b69105c604d805e0c3ce11501a55b5e9f9083b47d2ffe8", size = 15722394, upload-time = "2026-01-31T23:12:36.565Z" },
+    { url = "https://files.pythonhosted.org/packages/35/fa/4de10089f21fc7d18442c4a767ab156b25c2a6eaf187c0db6d9ecdaeb43f/numpy-2.4.2-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e4424677ce4b47fe73c8b5556d876571f7c6945d264201180db2dc34f676ab5", size = 16653343, upload-time = "2026-01-31T23:12:39.188Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/f9/d33e4ffc857f3763a57aa85650f2e82486832d7492280ac21ba9efda80da/numpy-2.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2b8f157c8a6f20eb657e240f8985cc135598b2b46985c5bccbde7616dc9c6b1e", size = 17078045, upload-time = "2026-01-31T23:12:42.041Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/b8/54bdb43b6225badbea6389fa038c4ef868c44f5890f95dd530a218706da3/numpy-2.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5daf6f3914a733336dab21a05cdec343144600e964d2fcdabaac0c0269874b2a", size = 18380024, upload-time = "2026-01-31T23:12:44.331Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/55/6e1a61ded7af8df04016d81b5b02daa59f2ea9252ee0397cb9f631efe9e5/numpy-2.4.2-cp314-cp314t-win32.whl", hash = "sha256:8c50dd1fc8826f5b26a5ee4d77ca55d88a895f4e4819c7ecc2a9f5905047a443", size = 6153937, upload-time = "2026-01-31T23:12:47.229Z" },
+    { url = "https://files.pythonhosted.org/packages/45/aa/fa6118d1ed6d776b0983f3ceac9b1a5558e80df9365b1c3aa6d42bf9eee4/numpy-2.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:fcf92bee92742edd401ba41135185866f7026c502617f422eb432cfeca4fe236", size = 12631844, upload-time = "2026-01-31T23:12:48.997Z" },
+    { url = "https://files.pythonhosted.org/packages/32/0a/2ec5deea6dcd158f254a7b372fb09cfba5719419c8d66343bab35237b3fb/numpy-2.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:1f92f53998a17265194018d1cc321b2e96e900ca52d54c7c77837b71b9465181", size = 10565379, upload-time = "2026-01-31T23:12:51.345Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f8/50e14d36d915ef64d8f8bc4a087fc8264d82c785eda6711f80ab7e620335/numpy-2.4.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:89f7268c009bc492f506abd6f5265defa7cb3f7487dc21d357c3d290add45082", size = 16833179, upload-time = "2026-01-31T23:12:53.5Z" },
+    { url = "https://files.pythonhosted.org/packages/17/17/809b5cad63812058a8189e91a1e2d55a5a18fd04611dbad244e8aeae465c/numpy-2.4.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6dee3bb76aa4009d5a912180bf5b2de012532998d094acee25d9cb8dee3e44a", size = 14889755, upload-time = "2026-01-31T23:12:55.933Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/ea/181b9bcf7627fc8371720316c24db888dcb9829b1c0270abf3d288b2e29b/numpy-2.4.2-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:cd2bd2bbed13e213d6b55dc1d035a4f91748a7d3edc9480c13898b0353708920", size = 5399500, upload-time = "2026-01-31T23:12:58.671Z" },
+    { url = "https://files.pythonhosted.org/packages/33/9f/413adf3fc955541ff5536b78fcf0754680b3c6d95103230252a2c9408d23/numpy-2.4.2-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:cf28c0c1d4c4bf00f509fa7eb02c58d7caf221b50b467bcb0d9bbf1584d5c821", size = 6714252, upload-time = "2026-01-31T23:13:00.518Z" },
+    { url = "https://files.pythonhosted.org/packages/91/da/643aad274e29ccbdf42ecd94dafe524b81c87bcb56b83872d54827f10543/numpy-2.4.2-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e04ae107ac591763a47398bb45b568fc38f02dbc4aa44c063f67a131f99346cb", size = 15797142, upload-time = "2026-01-31T23:13:02.219Z" },
+    { url = "https://files.pythonhosted.org/packages/66/27/965b8525e9cb5dc16481b30a1b3c21e50c7ebf6e9dbd48d0c4d0d5089c7e/numpy-2.4.2-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:602f65afdef699cda27ec0b9224ae5dc43e328f4c24c689deaf77133dbee74d0", size = 16727979, upload-time = "2026-01-31T23:13:04.62Z" },
+    { url = "https://files.pythonhosted.org/packages/de/e5/b7d20451657664b07986c2f6e3be564433f5dcaf3482d68eaecd79afaf03/numpy-2.4.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:be71bf1edb48ebbbf7f6337b5bfd2f895d1902f6335a5830b20141fc126ffba0", size = 12502577, upload-time = "2026-01-31T23:13:07.08Z" },
+]
+
+[[package]]
+name = "packaging"
+version = "26.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
+]
+
+[[package]]
+name = "pyarrow"
+version = "23.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" },
+    { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" },
+    { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" },
+    { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" },
+    { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" },
+    { url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" },
+    { url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" },
+    { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" },
+    { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" },
+    { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" },
+    { url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" },
+    { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" },
+    { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" },
+    { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" },
+    { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" },
+]
+
+[[package]]
+name = "pycparser"
+version = "3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
+]
+
+[[package]]
+name = "pydata-sphinx-theme"
+version = "0.16.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "accessible-pygments" },
+    { name = "babel" },
+    { name = "beautifulsoup4" },
+    { name = "docutils" },
+    { name = "pygments" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/00/20/bb50f9de3a6de69e6abd6b087b52fa2418a0418b19597601605f855ad044/pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7", size = 2412693, upload-time = "2024-12-17T10:53:39.537Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e2/0d/8ba33fa83a7dcde13eb3c1c2a0c1cc29950a048bfed6d9b0d8b6bd710b4c/pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde", size = 6723264, upload-time = "2024-12-17T10:53:35.645Z" },
+]
+
+[[package]]
+name = "pygithub"
+version = "2.8.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pyjwt", extra = ["crypto"] },
+    { name = "pynacl" },
+    { name = "requests" },
+    { name = "typing-extensions" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c1/74/e560bdeffea72ecb26cff27f0fad548bbff5ecc51d6a155311ea7f9e4c4c/pygithub-2.8.1.tar.gz", hash = "sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9", size = 2246994, upload-time = "2025-09-02T17:41:54.674Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/ba/7049ce39f653f6140aac4beb53a5aaf08b4407b6a3019aae394c1c5244ff/pygithub-2.8.1-py3-none-any.whl", hash = "sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0", size = 432709, upload-time = "2025-09-02T17:41:52.947Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.19.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
+]
+
+[[package]]
+name = "pyjwt"
+version = "2.12.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a8/10/e8192be5f38f3e8e7e046716de4cae33d56fd5ae08927a823bb916be36c1/pyjwt-2.12.0.tar.gz", hash = "sha256:2f62390b667cd8257de560b850bb5a883102a388829274147f1d724453f8fb02", size = 102511, upload-time = "2026-03-12T17:15:30.831Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/15/70/70f895f404d363d291dcf62c12c85fdd47619ad9674ac0f53364d035925a/pyjwt-2.12.0-py3-none-any.whl", hash = "sha256:9bb459d1bdd0387967d287f5656bf7ec2b9a26645d1961628cda1764e087fd6e", size = 29700, upload-time = "2026-03-12T17:15:29.257Z" },
+]
+
+[package.optional-dependencies]
+crypto = [
+    { name = "cryptography" },
+]
+
+[[package]]
+name = "pynacl"
+version = "1.6.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/9a/4019b524b03a13438637b11538c82781a5eda427394380381af8f04f467a/pynacl-1.6.2.tar.gz", hash = "sha256:018494d6d696ae03c7e656e5e74cdfd8ea1326962cc401bcf018f1ed8436811c", size = 3511692, upload-time = "2026-01-01T17:48:10.851Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4b/79/0e3c34dc3c4671f67d251c07aa8eb100916f250ee470df230b0ab89551b4/pynacl-1.6.2-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:622d7b07cc5c02c666795792931b50c91f3ce3c2649762efb1ef0d5684c81594", size = 390064, upload-time = "2026-01-01T17:31:57.264Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/1c/23a26e931736e13b16483795c8a6b2f641bf6a3d5238c22b070a5112722c/pynacl-1.6.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d071c6a9a4c94d79eb665db4ce5cedc537faf74f2355e4d502591d850d3913c0", size = 809370, upload-time = "2026-01-01T17:31:59.198Z" },
+    { url = "https://files.pythonhosted.org/packages/87/74/8d4b718f8a22aea9e8dcc8b95deb76d4aae380e2f5b570cc70b5fd0a852d/pynacl-1.6.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe9847ca47d287af41e82be1dd5e23023d3c31a951da134121ab02e42ac218c9", size = 1408304, upload-time = "2026-01-01T17:32:01.162Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/73/be4fdd3a6a87fe8a4553380c2b47fbd1f7f58292eb820902f5c8ac7de7b0/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:04316d1fc625d860b6c162fff704eb8426b1a8bcd3abacea11142cbd99a6b574", size = 844871, upload-time = "2026-01-01T17:32:02.824Z" },
+    { url = "https://files.pythonhosted.org/packages/55/ad/6efc57ab75ee4422e96b5f2697d51bbcf6cdcc091e66310df91fbdc144a8/pynacl-1.6.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44081faff368d6c5553ccf55322ef2819abb40e25afaec7e740f159f74813634", size = 1446356, upload-time = "2026-01-01T17:32:04.452Z" },
+    { url = "https://files.pythonhosted.org/packages/78/b7/928ee9c4779caa0a915844311ab9fb5f99585621c5d6e4574538a17dca07/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:a9f9932d8d2811ce1a8ffa79dcbdf3970e7355b5c8eb0c1a881a57e7f7d96e88", size = 826814, upload-time = "2026-01-01T17:32:06.078Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/a9/1bdba746a2be20f8809fee75c10e3159d75864ef69c6b0dd168fc60e485d/pynacl-1.6.2-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:bc4a36b28dd72fb4845e5d8f9760610588a96d5a51f01d84d8c6ff9849968c14", size = 1411742, upload-time = "2026-01-01T17:32:07.651Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/2f/5e7ea8d85f9f3ea5b6b87db1d8388daa3587eed181bdeb0306816fdbbe79/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bffb6d0f6becacb6526f8f42adfb5efb26337056ee0831fb9a7044d1a964444", size = 801714, upload-time = "2026-01-01T17:32:09.558Z" },
+    { url = "https://files.pythonhosted.org/packages/06/ea/43fe2f7eab5f200e40fb10d305bf6f87ea31b3bbc83443eac37cd34a9e1e/pynacl-1.6.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2fef529ef3ee487ad8113d287a593fa26f48ee3620d92ecc6f1d09ea38e0709b", size = 1372257, upload-time = "2026-01-01T17:32:11.026Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/54/c9ea116412788629b1347e415f72195c25eb2f3809b2d3e7b25f5c79f13a/pynacl-1.6.2-cp314-cp314t-win32.whl", hash = "sha256:a84bf1c20339d06dc0c85d9aea9637a24f718f375d861b2668b2f9f96fa51145", size = 231319, upload-time = "2026-01-01T17:32:12.46Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/04/64e9d76646abac2dccf904fccba352a86e7d172647557f35b9fe2a5ee4a1/pynacl-1.6.2-cp314-cp314t-win_amd64.whl", hash = "sha256:320ef68a41c87547c91a8b58903c9caa641ab01e8512ce291085b5fe2fcb7590", size = 244044, upload-time = "2026-01-01T17:32:13.781Z" },
+    { url = "https://files.pythonhosted.org/packages/33/33/7873dc161c6a06f43cda13dec67b6fe152cb2f982581151956fa5e5cdb47/pynacl-1.6.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d29bfe37e20e015a7d8b23cfc8bd6aa7909c92a1b8f41ee416bbb3e79ef182b2", size = 188740, upload-time = "2026-01-01T17:32:15.083Z" },
+    { url = "https://files.pythonhosted.org/packages/be/7b/4845bbf88e94586ec47a432da4e9107e3fc3ce37eb412b1398630a37f7dd/pynacl-1.6.2-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:c949ea47e4206af7c8f604b8278093b674f7c79ed0d4719cc836902bf4517465", size = 388458, upload-time = "2026-01-01T17:32:16.829Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/b4/e927e0653ba63b02a4ca5b4d852a8d1d678afbf69b3dbf9c4d0785ac905c/pynacl-1.6.2-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8845c0631c0be43abdd865511c41eab235e0be69c81dc66a50911594198679b0", size = 800020, upload-time = "2026-01-01T17:32:18.34Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/81/d60984052df5c97b1d24365bc1e30024379b42c4edcd79d2436b1b9806f2/pynacl-1.6.2-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:22de65bb9010a725b0dac248f353bb072969c94fa8d6b1f34b87d7953cf7bbe4", size = 1399174, upload-time = "2026-01-01T17:32:20.239Z" },
+    { url = "https://files.pythonhosted.org/packages/68/f7/322f2f9915c4ef27d140101dd0ed26b479f7e6f5f183590fd32dfc48c4d3/pynacl-1.6.2-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46065496ab748469cdd999246d17e301b2c24ae2fdf739132e580a0e94c94a87", size = 835085, upload-time = "2026-01-01T17:32:22.24Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/d0/f301f83ac8dbe53442c5a43f6a39016f94f754d7a9815a875b65e218a307/pynacl-1.6.2-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a66d6fb6ae7661c58995f9c6435bda2b1e68b54b598a6a10247bfcdadac996c", size = 1437614, upload-time = "2026-01-01T17:32:23.766Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/58/fc6e649762b029315325ace1a8c6be66125e42f67416d3dbd47b69563d61/pynacl-1.6.2-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:26bfcd00dcf2cf160f122186af731ae30ab120c18e8375684ec2670dccd28130", size = 818251, upload-time = "2026-01-01T17:32:25.69Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/a8/b917096b1accc9acd878819a49d3d84875731a41eb665f6ebc826b1af99e/pynacl-1.6.2-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:c8a231e36ec2cab018c4ad4358c386e36eede0319a0c41fed24f840b1dac59f6", size = 1402859, upload-time = "2026-01-01T17:32:27.215Z" },
+    { url = "https://files.pythonhosted.org/packages/85/42/fe60b5f4473e12c72f977548e4028156f4d340b884c635ec6b063fe7e9a5/pynacl-1.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:68be3a09455743ff9505491220b64440ced8973fe930f270c8e07ccfa25b1f9e", size = 791926, upload-time = "2026-01-01T17:32:29.314Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/f9/e40e318c604259301cc091a2a63f237d9e7b424c4851cafaea4ea7c4834e/pynacl-1.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8b097553b380236d51ed11356c953bf8ce36a29a3e596e934ecabe76c985a577", size = 1363101, upload-time = "2026-01-01T17:32:31.263Z" },
+    { url = "https://files.pythonhosted.org/packages/48/47/e761c254f410c023a469284a9bc210933e18588ca87706ae93002c05114c/pynacl-1.6.2-cp38-abi3-win32.whl", hash = "sha256:5811c72b473b2f38f7e2a3dc4f8642e3a3e9b5e7317266e4ced1fba85cae41aa", size = 227421, upload-time = "2026-01-01T17:32:33.076Z" },
+    { url = "https://files.pythonhosted.org/packages/41/ad/334600e8cacc7d86587fe5f565480fde569dfb487389c8e1be56ac21d8ac/pynacl-1.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:62985f233210dee6548c223301b6c25440852e13d59a8b81490203c3227c5ba0", size = 239754, upload-time = "2026-01-01T17:32:34.557Z" },
+    { url = "https://files.pythonhosted.org/packages/29/7d/5945b5af29534641820d3bd7b00962abbbdfee84ec7e19f0d5b3175f9a31/pynacl-1.6.2-cp38-abi3-win_arm64.whl", hash = "sha256:834a43af110f743a754448463e8fd61259cd4ab5bbedcf70f9dabad1d28a394c", size = 184801, upload-time = "2026-01-01T17:32:36.309Z" },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+    { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+    { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+    { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+    { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
+    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
+    { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" },
+    { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" },
+    { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" },
+    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" },
+    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
+[[package]]
+name = "requests"
+version = "2.32.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "idna" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" },
+]
+
+[[package]]
+name = "rich"
+version = "14.3.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/74/99/a4cab2acbb884f80e558b0771e97e21e939c5dfb460f488d19df485e8298/rich-14.3.2.tar.gz", hash = "sha256:e712f11c1a562a11843306f5ed999475f09ac31ffb64281f73ab29ffdda8b3b8", size = 230143, upload-time = "2026-02-01T16:20:47.908Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl", hash = "sha256:08e67c3e90884651da3239ea668222d19bea7b589149d8014a21c633420dbb69", size = 309963, upload-time = "2026-02-01T16:20:46.078Z" },
+]
+
+[[package]]
+name = "roman-numerals"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/41dc953bbeb056c17d5f7a519f50fdf010bd0553be2d630bc69d1e022703/roman_numerals-4.1.0.tar.gz", hash = "sha256:1af8b147eb1405d5839e78aeb93131690495fe9da5c91856cb33ad55a7f1e5b2", size = 9077, upload-time = "2025-12-17T18:25:34.381Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/54/6f679c435d28e0a568d8e8a7c0a93a09010818634c3c3907fc98d8983770/roman_numerals-4.1.0-py3-none-any.whl", hash = "sha256:647ba99caddc2cc1e55a51e4360689115551bf4476d90e8162cf8c345fe233c7", size = 7676, upload-time = "2025-12-17T18:25:33.098Z" },
+]
+
+[[package]]
+name = "setuptools"
+version = "82.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" },
+]
+
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
+]
+
+[[package]]
+name = "snowballstemmer"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" },
+]
+
+[[package]]
+name = "soupsieve"
+version = "2.8.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.0.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.12'",
+]
+dependencies = [
+    { name = "alabaster", marker = "python_full_version < '3.12'" },
+    { name = "babel", marker = "python_full_version < '3.12'" },
+    { name = "colorama", marker = "python_full_version < '3.12' and sys_platform == 'win32'" },
+    { name = "docutils", marker = "python_full_version < '3.12'" },
+    { name = "imagesize", marker = "python_full_version < '3.12'" },
+    { name = "jinja2", marker = "python_full_version < '3.12'" },
+    { name = "packaging", marker = "python_full_version < '3.12'" },
+    { name = "pygments", marker = "python_full_version < '3.12'" },
+    { name = "requests", marker = "python_full_version < '3.12'" },
+    { name = "roman-numerals", marker = "python_full_version < '3.12'" },
+    { name = "snowballstemmer", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.12'" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c6/3f/4bbd76424c393caead2e1eb89777f575dee5c8653e2d4b6afd7a564f5974/sphinx-9.0.4-py3-none-any.whl", hash = "sha256:5bebc595a5e943ea248b99c13814c1c5e10b3ece718976824ffa7959ff95fffb", size = 3917713, upload-time = "2025-12-04T07:45:24.944Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.1.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.12'",
+]
+dependencies = [
+    { name = "alabaster", marker = "python_full_version >= '3.12'" },
+    { name = "babel", marker = "python_full_version >= '3.12'" },
+    { name = "colorama", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" },
+    { name = "docutils", marker = "python_full_version >= '3.12'" },
+    { name = "imagesize", marker = "python_full_version >= '3.12'" },
+    { name = "jinja2", marker = "python_full_version >= '3.12'" },
+    { name = "packaging", marker = "python_full_version >= '3.12'" },
+    { name = "pygments", marker = "python_full_version >= '3.12'" },
+    { name = "requests", marker = "python_full_version >= '3.12'" },
+    { name = "roman-numerals", marker = "python_full_version >= '3.12'" },
+    { name = "snowballstemmer", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12'" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl", hash = "sha256:c84fdd4e782504495fe4f2c0b3413d6c2bf388589bb352d439b2a3bb99991978", size = 3921742, upload-time = "2025-12-31T15:09:25.561Z" },
+]
+
+[[package]]
+name = "sphinx-reredirects"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1b/8d/0e39fe2740d7d71417edf9a6424aa80ca2c27c17fc21282cdc39f90d5a40/sphinx_reredirects-1.1.0.tar.gz", hash = "sha256:fb9b195335ab14b43f8273287d0c7eeb637ba6c56c66581c11b47202f6718b29", size = 614624, upload-time = "2025-12-22T08:28:02.792Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/51/81/b5dd07067f3daac6d23687ec737b2d593740671ebcd145830c8f92d381c5/sphinx_reredirects-1.1.0-py3-none-any.whl", hash = "sha256:4b5692273c72cd2d4d917f4c6f87d5919e4d6114a752d4be033f7f5f6310efd9", size = 6351, upload-time = "2025-12-22T08:27:59.724Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-applehelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-devhelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-htmlhelp"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-jsmath"
+version = "1.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, upload-time = "2019-01-21T16:10:14.333Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-qthelp"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" },
+]
+
+[[package]]
+name = "sphinxcontrib-serializinghtml"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" },
+]
+
+[[package]]
+name = "tomlkit"
+version = "0.14.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
+]
+
+[[package]]
+name = "typer"
+version = "0.23.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-doc" },
+    { name = "click" },
+    { name = "rich" },
+    { name = "shellingham" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d3/ae/93d16574e66dfe4c2284ffdaca4b0320ade32858cb2cc586c8dd79f127c5/typer-0.23.2.tar.gz", hash = "sha256:a99706a08e54f1aef8bb6a8611503808188a4092808e86addff1828a208af0de", size = 120162, upload-time = "2026-02-16T18:52:40.354Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/2c/dee705c427875402200fe779eb8a3c00ccb349471172c41178336e9599cc/typer-0.23.2-py3-none-any.whl", hash = "sha256:e9c8dc380f82450b3c851a9b9d5a0edf95d1d6456ae70c517d8b06a50c7a9978", size = 56834, upload-time = "2026-02-16T18:52:39.308Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
+
+[[package]]
+name = "urllib3"
+version = "2.6.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
+]